From 46afaee1d54b15e084dfe90ffec3eea5ba7d65c8 Mon Sep 17 00:00:00 2001
From: Zhiping Du
Date: Tue, 12 Oct 2021 16:59:06 +0800
Subject: [PATCH 0001/2161] ice: update E810 100G NIC driver

OpenCloudOS commit: update the Intel ice driver to ice-1.6.7.1.1.

Signed-off-by: Zhiping Du
Signed-off-by: Xinghui Li
---
 arch/x86/configs/oc.config | 1 +
 drivers/net/ethernet/intel/ice/Makefile | 46 +-
 drivers/net/ethernet/intel/ice/ice.h | 1349 +-
 drivers/net/ethernet/intel/ice/ice_acl.c | 592 +
 drivers/net/ethernet/intel/ice/ice_acl.h | 202 +
 drivers/net/ethernet/intel/ice/ice_acl_ctrl.c | 1171 ++
 drivers/net/ethernet/intel/ice/ice_acl_main.c | 323 +
 .../net/ethernet/intel/ice/ice_adminq_cmd.h | 2900 ++++-
 drivers/net/ethernet/intel/ice/ice_arfs.c | 701 +
 drivers/net/ethernet/intel/ice/ice_arfs.h | 83 +
 drivers/net/ethernet/intel/ice/ice_base.c | 1045 ++
 drivers/net/ethernet/intel/ice/ice_base.h | 33 +
 drivers/net/ethernet/intel/ice/ice_cgu.h | 231 +
 drivers/net/ethernet/intel/ice/ice_cgu_ops.c | 248 +
 drivers/net/ethernet/intel/ice/ice_cgu_ops.h | 121 +
 drivers/net/ethernet/intel/ice/ice_cgu_regs.h | 941 ++
 drivers/net/ethernet/intel/ice/ice_cgu_util.c | 444 +
 drivers/net/ethernet/intel/ice/ice_cgu_util.h | 46 +
 drivers/net/ethernet/intel/ice/ice_common.c | 6840 +++++++---
 drivers/net/ethernet/intel/ice/ice_common.h | 212 +-
 drivers/net/ethernet/intel/ice/ice_controlq.c | 325 +-
 drivers/net/ethernet/intel/ice/ice_controlq.h | 15 +-
 drivers/net/ethernet/intel/ice/ice_dcb.c | 554 +-
 drivers/net/ethernet/intel/ice/ice_dcb.h | 53 +-
 drivers/net/ethernet/intel/ice/ice_dcb_lib.c | 859 +-
 drivers/net/ethernet/intel/ice/ice_dcb_lib.h | 109 +-
 drivers/net/ethernet/intel/ice/ice_dcb_nl.c | 1139 ++
 drivers/net/ethernet/intel/ice/ice_dcb_nl.h | 20 +
 drivers/net/ethernet/intel/ice/ice_dcf.c | 1137 ++
 drivers/net/ethernet/intel/ice/ice_dcf.h | 118 +
 drivers/net/ethernet/intel/ice/ice_debugfs.c | 672 +
 drivers/net/ethernet/intel/ice/ice_devids.h | 47 +-
 drivers/net/ethernet/intel/ice/ice_devlink.c | 1090 ++
 drivers/net/ethernet/intel/ice/ice_devlink.h | 46 +
 drivers/net/ethernet/intel/ice/ice_eswitch.c | 719 ++
 drivers/net/ethernet/intel/ice/ice_eswitch.h | 92 +
 drivers/net/ethernet/intel/ice/ice_ethtool.c | 5959 ++++++---
 drivers/net/ethernet/intel/ice/ice_ethtool.h | 196 +
 .../net/ethernet/intel/ice/ice_ethtool_fdir.c | 2180 ++++
 drivers/net/ethernet/intel/ice/ice_fdir.c | 2103 +++
 drivers/net/ethernet/intel/ice/ice_fdir.h | 310 +
 .../net/ethernet/intel/ice/ice_flex_pipe.c | 7118 +++++++++--
 .../net/ethernet/intel/ice/ice_flex_pipe.h | 85 +-
 .../net/ethernet/intel/ice/ice_flex_type.h | 662 +-
 drivers/net/ethernet/intel/ice/ice_flow.c | 4222 ++++++
 drivers/net/ethernet/intel/ice/ice_flow.h | 572 +
 drivers/net/ethernet/intel/ice/ice_fltr.c | 767 ++
 drivers/net/ethernet/intel/ice/ice_fltr.h | 60 +
 .../net/ethernet/intel/ice/ice_fw_update.c | 845 ++
 .../net/ethernet/intel/ice/ice_fw_update.h | 12 +
 drivers/net/ethernet/intel/ice/ice_fwlog.c | 477 +
 drivers/net/ethernet/intel/ice/ice_fwlog.h | 62 +
 .../net/ethernet/intel/ice/ice_hw_autogen.h | 9404 +++++++++++++-
 drivers/net/ethernet/intel/ice/ice_idc.c | 1488 +++
 drivers/net/ethernet/intel/ice/ice_idc.h | 422 +
 drivers/net/ethernet/intel/ice/ice_idc_int.h | 170 +
 drivers/net/ethernet/intel/ice/ice_lag.c | 438 +
 drivers/net/ethernet/intel/ice/ice_lag.h | 40 +
 .../net/ethernet/intel/ice/ice_lan_tx_rx.h | 963 +-
 drivers/net/ethernet/intel/ice/ice_lib.c | 4192 +++---
 drivers/net/ethernet/intel/ice/ice_lib.h | 142 +-
 drivers/net/ethernet/intel/ice/ice_main.c | 10391
++++++++++++---
 drivers/net/ethernet/intel/ice/ice_nvm.c | 1695 ++-
 drivers/net/ethernet/intel/ice/ice_nvm.h | 122 +
 drivers/net/ethernet/intel/ice/ice_osdep.h | 19 +-
 .../ethernet/intel/ice/ice_pf_vsi_vlan_ops.c | 38 +
 .../ethernet/intel/ice/ice_pf_vsi_vlan_ops.h | 13 +
 .../ethernet/intel/ice/ice_protocol_type.h | 386 +
 drivers/net/ethernet/intel/ice/ice_ptp.c | 3648 ++++++
 drivers/net/ethernet/intel/ice/ice_ptp.h | 235 +
 .../net/ethernet/intel/ice/ice_ptp_consts.h | 85 +
 drivers/net/ethernet/intel/ice/ice_ptp_hw.c | 1774 +++
 drivers/net/ethernet/intel/ice/ice_ptp_hw.h | 359 +
 drivers/net/ethernet/intel/ice/ice_repr.c | 467 +
 drivers/net/ethernet/intel/ice/ice_repr.h | 32 +
 drivers/net/ethernet/intel/ice/ice_sbq_cmd.h | 92 +
 drivers/net/ethernet/intel/ice/ice_sched.c | 4386 ++++++-
 drivers/net/ethernet/intel/ice/ice_sched.h | 177 +-
 drivers/net/ethernet/intel/ice/ice_sriov.c | 407 +-
 drivers/net/ethernet/intel/ice/ice_sriov.h | 22 +-
 drivers/net/ethernet/intel/ice/ice_status.h | 10 +-
 drivers/net/ethernet/intel/ice/ice_switch.c | 9067 ++++++++++---
 drivers/net/ethernet/intel/ice/ice_switch.h | 322 +-
 drivers/net/ethernet/intel/ice/ice_tc_lib.c | 2279 ++++
 drivers/net/ethernet/intel/ice/ice_tc_lib.h | 197 +
 drivers/net/ethernet/intel/ice/ice_trace.h | 276 +
 drivers/net/ethernet/intel/ice/ice_txrx.c | 2707 +++-
 drivers/net/ethernet/intel/ice/ice_txrx.h | 331 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c | 391 +
 drivers/net/ethernet/intel/ice/ice_txrx_lib.h | 85 +
 drivers/net/ethernet/intel/ice/ice_type.h | 1065 +-
 .../ethernet/intel/ice/ice_vf_vsi_vlan_ops.c | 255 +
 .../ethernet/intel/ice/ice_vf_vsi_vlan_ops.h | 21 +
 .../intel/ice/ice_virtchnl_allowlist.c | 227 +
 .../intel/ice/ice_virtchnl_allowlist.h | 13 +
 .../ethernet/intel/ice/ice_virtchnl_fdir.c | 2267 ++++
 .../ethernet/intel/ice/ice_virtchnl_fdir.h | 57 +
 .../net/ethernet/intel/ice/ice_virtchnl_pf.c | 10654 +++++++++++++---
 .../net/ethernet/intel/ice/ice_virtchnl_pf.h | 425 +-
 drivers/net/ethernet/intel/ice/ice_vlan.h | 17 +
 .../net/ethernet/intel/ice/ice_vlan_mode.c | 443 +
 .../net/ethernet/intel/ice/ice_vlan_mode.h | 15 +
 .../net/ethernet/intel/ice/ice_vsi_vlan_lib.c | 744 ++
 .../net/ethernet/intel/ice/ice_vsi_vlan_lib.h | 34 +
 .../net/ethernet/intel/ice/ice_vsi_vlan_ops.c | 104 +
 .../net/ethernet/intel/ice/ice_vsi_vlan_ops.h | 30 +
 drivers/net/ethernet/intel/ice/ice_xsk.c | 1393 ++
 drivers/net/ethernet/intel/ice/ice_xsk.h | 121 +
 drivers/net/ethernet/intel/ice/kcompat.c | 1360 ++
 drivers/net/ethernet/intel/ice/kcompat.h | 3558 ++++++
 drivers/net/ethernet/intel/ice/kcompat_dim.c | 82 +
 drivers/net/ethernet/intel/ice/kcompat_dim.h | 339 +
 drivers/net/ethernet/intel/ice/kcompat_impl.h | 498 +
 .../net/ethernet/intel/ice/kcompat_net_dim.c | 251 +
 .../net/ethernet/intel/ice/kcompat_overflow.h | 319 +
 .../net/ethernet/intel/ice/kcompat_pldmfw.c | 1117 ++
 .../net/ethernet/intel/ice/kcompat_pldmfw.h | 178 +
 .../ethernet/intel/ice/kcompat_rhel_defs.h | 85 +
 .../ethernet/intel/ice/kcompat_sles_defs.h | 152 +
 .../net/ethernet/intel/ice/kcompat_std_defs.h | 127 +
 .../ethernet/intel/ice/kcompat_ubuntu_defs.h | 28 +
 drivers/net/ethernet/intel/ice/virtchnl.h | 2232 ++++
 .../intel/ice/virtchnl_inline_ipsec.h | 548 +
 .../ethernet/intel/ice/virtchnl_lan_desc.h | 528 +
 package/default/config.default_kasan | 1 +
 125 files changed, 123154 insertions(+), 13560 deletions(-)
 create mode 100644 drivers/net/ethernet/intel/ice/ice_acl.c
 create mode 100644 drivers/net/ethernet/intel/ice/ice_acl.h
 create mode 100644
drivers/net/ethernet/intel/ice/ice_acl_ctrl.c create mode 100644 drivers/net/ethernet/intel/ice/ice_acl_main.c create mode 100644 drivers/net/ethernet/intel/ice/ice_arfs.c create mode 100644 drivers/net/ethernet/intel/ice/ice_arfs.h create mode 100644 drivers/net/ethernet/intel/ice/ice_base.c create mode 100644 drivers/net/ethernet/intel/ice/ice_base.h create mode 100644 drivers/net/ethernet/intel/ice/ice_cgu.h create mode 100644 drivers/net/ethernet/intel/ice/ice_cgu_ops.c create mode 100644 drivers/net/ethernet/intel/ice/ice_cgu_ops.h create mode 100644 drivers/net/ethernet/intel/ice/ice_cgu_regs.h create mode 100644 drivers/net/ethernet/intel/ice/ice_cgu_util.c create mode 100644 drivers/net/ethernet/intel/ice/ice_cgu_util.h create mode 100644 drivers/net/ethernet/intel/ice/ice_dcb_nl.c create mode 100644 drivers/net/ethernet/intel/ice/ice_dcb_nl.h create mode 100644 drivers/net/ethernet/intel/ice/ice_dcf.c create mode 100644 drivers/net/ethernet/intel/ice/ice_dcf.h create mode 100644 drivers/net/ethernet/intel/ice/ice_debugfs.c create mode 100644 drivers/net/ethernet/intel/ice/ice_devlink.c create mode 100644 drivers/net/ethernet/intel/ice/ice_devlink.h create mode 100644 drivers/net/ethernet/intel/ice/ice_eswitch.c create mode 100644 drivers/net/ethernet/intel/ice/ice_eswitch.h create mode 100644 drivers/net/ethernet/intel/ice/ice_ethtool.h create mode 100644 drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c create mode 100644 drivers/net/ethernet/intel/ice/ice_fdir.c create mode 100644 drivers/net/ethernet/intel/ice/ice_fdir.h create mode 100644 drivers/net/ethernet/intel/ice/ice_flow.c create mode 100644 drivers/net/ethernet/intel/ice/ice_flow.h create mode 100644 drivers/net/ethernet/intel/ice/ice_fltr.c create mode 100644 drivers/net/ethernet/intel/ice/ice_fltr.h create mode 100644 drivers/net/ethernet/intel/ice/ice_fw_update.c create mode 100644 drivers/net/ethernet/intel/ice/ice_fw_update.h create mode 100644 drivers/net/ethernet/intel/ice/ice_fwlog.c create mode 100644 drivers/net/ethernet/intel/ice/ice_fwlog.h create mode 100644 drivers/net/ethernet/intel/ice/ice_idc.c create mode 100644 drivers/net/ethernet/intel/ice/ice_idc.h create mode 100644 drivers/net/ethernet/intel/ice/ice_idc_int.h create mode 100644 drivers/net/ethernet/intel/ice/ice_lag.c create mode 100644 drivers/net/ethernet/intel/ice/ice_lag.h create mode 100644 drivers/net/ethernet/intel/ice/ice_nvm.h create mode 100644 drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.c create mode 100644 drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.h create mode 100644 drivers/net/ethernet/intel/ice/ice_protocol_type.h create mode 100644 drivers/net/ethernet/intel/ice/ice_ptp.c create mode 100644 drivers/net/ethernet/intel/ice/ice_ptp.h create mode 100644 drivers/net/ethernet/intel/ice/ice_ptp_consts.h create mode 100644 drivers/net/ethernet/intel/ice/ice_ptp_hw.c create mode 100644 drivers/net/ethernet/intel/ice/ice_ptp_hw.h create mode 100644 drivers/net/ethernet/intel/ice/ice_repr.c create mode 100644 drivers/net/ethernet/intel/ice/ice_repr.h create mode 100644 drivers/net/ethernet/intel/ice/ice_sbq_cmd.h create mode 100644 drivers/net/ethernet/intel/ice/ice_tc_lib.c create mode 100644 drivers/net/ethernet/intel/ice/ice_tc_lib.h create mode 100644 drivers/net/ethernet/intel/ice/ice_trace.h create mode 100644 drivers/net/ethernet/intel/ice/ice_txrx_lib.c create mode 100644 drivers/net/ethernet/intel/ice/ice_txrx_lib.h create mode 100644 drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c create mode 100644 
drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.h create mode 100644 drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c create mode 100644 drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.h create mode 100644 drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c create mode 100644 drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h create mode 100644 drivers/net/ethernet/intel/ice/ice_vlan.h create mode 100644 drivers/net/ethernet/intel/ice/ice_vlan_mode.c create mode 100644 drivers/net/ethernet/intel/ice/ice_vlan_mode.h create mode 100644 drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c create mode 100644 drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.h create mode 100644 drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.c create mode 100644 drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.h create mode 100644 drivers/net/ethernet/intel/ice/ice_xsk.c create mode 100644 drivers/net/ethernet/intel/ice/ice_xsk.h create mode 100644 drivers/net/ethernet/intel/ice/kcompat.c create mode 100644 drivers/net/ethernet/intel/ice/kcompat.h create mode 100644 drivers/net/ethernet/intel/ice/kcompat_dim.c create mode 100644 drivers/net/ethernet/intel/ice/kcompat_dim.h create mode 100644 drivers/net/ethernet/intel/ice/kcompat_impl.h create mode 100644 drivers/net/ethernet/intel/ice/kcompat_net_dim.c create mode 100644 drivers/net/ethernet/intel/ice/kcompat_overflow.h create mode 100644 drivers/net/ethernet/intel/ice/kcompat_pldmfw.c create mode 100644 drivers/net/ethernet/intel/ice/kcompat_pldmfw.h create mode 100644 drivers/net/ethernet/intel/ice/kcompat_rhel_defs.h create mode 100644 drivers/net/ethernet/intel/ice/kcompat_sles_defs.h create mode 100644 drivers/net/ethernet/intel/ice/kcompat_std_defs.h create mode 100644 drivers/net/ethernet/intel/ice/kcompat_ubuntu_defs.h create mode 100644 drivers/net/ethernet/intel/ice/virtchnl.h create mode 100644 drivers/net/ethernet/intel/ice/virtchnl_inline_ipsec.h create mode 100644 drivers/net/ethernet/intel/ice/virtchnl_lan_desc.h diff --git a/arch/x86/configs/oc.config b/arch/x86/configs/oc.config index d190a290bbf0..8c32b556976f 100644 --- a/arch/x86/configs/oc.config +++ b/arch/x86/configs/oc.config @@ -775,6 +775,7 @@ CONFIG_IXGBE_DCB=y CONFIG_IXGBEVF=m CONFIG_I40E=m CONFIG_I40EVF=m +CONFIG_ICE=m CONFIG_IGC=m CONFIG_JME=m CONFIG_MVMDIO=m diff --git a/drivers/net/ethernet/intel/ice/Makefile b/drivers/net/ethernet/intel/ice/Makefile index 9edde960b4f2..f45f573e6715 100644 --- a/drivers/net/ethernet/intel/ice/Makefile +++ b/drivers/net/ethernet/intel/ice/Makefile @@ -4,6 +4,9 @@ # # Makefile for the Intel(R) Ethernet Connection E800 Series Linux Driver # +ccflags-y += -I$(src) +subdir-ccflags-y += -I$(src) + obj-$(CONFIG_ICE) += ice.o @@ -13,9 +16,46 @@ ice-y := ice_main.o \ ice_nvm.o \ ice_switch.o \ ice_sched.o \ + ice_base.o \ ice_lib.o \ + ice_txrx_lib.o \ ice_txrx.o \ + ice_fltr.o \ + ice_pf_vsi_vlan_ops.o \ + ice_vsi_vlan_ops.o \ + ice_vsi_vlan_lib.o \ + ice_tc_lib.o \ + ice_fdir.o \ + ice_ethtool_fdir.o \ + ice_acl_main.o \ + ice_acl.o \ + ice_acl_ctrl.o \ + ice_vlan_mode.o \ ice_flex_pipe.o \ - ice_ethtool.o -ice-$(CONFIG_PCI_IOV) += ice_virtchnl_pf.o ice_sriov.o -ice-$(CONFIG_DCB) += ice_dcb.o ice_dcb_lib.o + ice_flow.o \ + ice_lag.o \ + ice_fwlog.o \ + ice_ethtool.o \ + kcompat.o + +ice-$(CONFIG_NET_DEVLINK:m=y) += ice_devlink.o ice_fw_update.o +ice-$(CONFIG_NET_DEVLINK:m=y) += ice_eswitch.o ice_repr.o +ice-$(CONFIG_MFD_CORE:m=y) += ice_idc.o +ice-$(CONFIG_DEBUG_FS) += ice_debugfs.o +ice-$(CONFIG_PCI_IOV) += ice_virtchnl_allowlist.o 
+ice-$(CONFIG_PCI_IOV) += ice_dcf.o +ice-$(CONFIG_PCI_IOV) += ice_virtchnl_fdir.o +ice-$(CONFIG_PCI_IOV) += ice_virtchnl_pf.o ice_sriov.o ice_vf_vsi_vlan_ops.o +ice-$(CONFIG_PTP_1588_CLOCK:m=y) += ice_ptp.o ice_ptp_hw.o +ice-$(CONFIG_PTP_1588_CLOCK:m=y) += ice_cgu_ops.o ice_cgu_util.o +ice-$(CONFIG_DCB) += ice_dcb.o ice_dcb_nl.o ice_dcb_lib.o +ice-$(CONFIG_RFS_ACCEL) += ice_arfs.o +ice-$(CONFIG_XDP_SOCKETS) += ice_xsk.o +# Use kcompat pldmfw.c if kernel does not provide CONFIG_PLDMFW +ifndef CONFIG_PLDMFW +ice-y += kcompat_pldmfw.o +endif +# Use kcompat DIMLIB if kernel doesn't provide it +ifndef CONFIG_DIMLIB +ice-y += kcompat_dim.o kcompat_net_dim.o +endif diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 45e100666049..204dc3ddfcc0 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -1,9 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_H_ #define _ICE_H_ +#include "kcompat.h" #include #include #include @@ -16,9 +17,13 @@ #include #include #include +#ifdef HAVE_NETDEV_SB_DEV +#include +#endif /* HAVE_NETDEV_SB_DEV */ #include #include #include +#include #include #include #include @@ -29,35 +34,120 @@ #include #include #include +#include #include +#include #include -#include +#ifdef HAVE_XDP_SUPPORT +#include +#include +#endif /* HAVE_XDP_SUPPORT */ +#ifdef HAVE_AF_XDP_ZC_SUPPORT +#include +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ #include +#if IS_ENABLED(CONFIG_NET_DEVLINK) +#include +#endif /* CONFIG_NET_DEVLINK */ +#if IS_ENABLED(CONFIG_DCB) +#include +#endif /* CONFIG_DCB */ +#ifdef HAVE_CONFIG_DIMLIB +#include +#else +#include "kcompat_dim.h" +#endif #include "ice_devids.h" #include "ice_type.h" #include "ice_txrx.h" #include "ice_dcb.h" #include "ice_switch.h" #include "ice_common.h" +#include "ice_flow.h" #include "ice_sched.h" +#include +#include +#include "ice_idc_int.h" +#include "virtchnl.h" #include "ice_virtchnl_pf.h" #include "ice_sriov.h" +#include "ice_ptp.h" +#include "ice_cgu.h" +#include "ice_cgu_ops.h" +#include "ice_cgu_util.h" +#include "ice_fdir.h" +#ifdef HAVE_AF_XDP_ZC_SUPPORT +#include "ice_xsk.h" +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ +#ifdef HAVE_NETDEV_UPPER_INFO +#include "ice_lag.h" +#endif /* HAVE_NETDEV_UPPER_INFO */ +#include "ice_trace.h" + +#if defined(HAVE_VXLAN_RX_OFFLOAD) || defined(HAVE_VXLAN_TYPE) +#if IS_ENABLED(CONFIG_VXLAN) +#include +#endif +#endif /* HAVE_VXLAN_RX_OFFLOAD || HAVE_VXLAN_TYPE */ +#ifdef HAVE_GRE_ENCAP_OFFLOAD +#include +#endif /* HAVE_GRE_ENCAP_OFFLOAD */ +#if defined(HAVE_GENEVE_RX_OFFLOAD) || defined(HAVE_GENEVE_TYPE) +#if IS_ENABLED(CONFIG_GENEVE) +#include +#endif +#endif /* HAVE_GENEVE_RX_OFFLOAD || HAVE_GENEVE_TYPE */ +#ifdef HAVE_UDP_ENC_RX_OFFLOAD +#include +#endif +#ifdef NETIF_F_HW_TC +#include +#include +#include +#endif /* NETIF_F_HW_TC */ +#include +#include +#include +#include +#include "ice_arfs.h" +#include "ice_repr.h" +#include "ice_eswitch.h" +#include "ice_vsi_vlan_ops.h" extern const char ice_drv_ver[]; #define ICE_BAR0 0 +#define ICE_BAR3 3 +#ifdef CONFIG_DEBUG_FS +#define ICE_MAX_CSR_SPACE (8 * 1024 * 1024 - 64 * 1024) +#endif /* CONFIG_DEBUG_FS */ #define ICE_REQ_DESC_MULTIPLE 32 #define ICE_MIN_NUM_DESC 64 #define ICE_MAX_NUM_DESC 8160 #define ICE_DFLT_MIN_RX_DESC 512 -#define ICE_DFLT_NUM_TX_DESC 256 #define ICE_DFLT_NUM_RX_DESC 2048 +#define ICE_DFLT_NUM_TX_DESC 256 +#define ICE_DFLT_TXQ_VMDQ_VSI 1 +#define 
ICE_DFLT_RXQ_VMDQ_VSI 1 +#define ICE_DFLT_VEC_VMDQ_VSI 1 +#define ICE_MAX_NUM_VMDQ_VSI 16 +#define ICE_MAX_TXQ_VMDQ_VSI 4 +#define ICE_MAX_RXQ_VMDQ_VSI 4 +#ifdef HAVE_NETDEV_SB_DEV +#define ICE_MAX_MACVLANS 64 +#endif #define ICE_DFLT_TRAFFIC_CLASS BIT(0) #define ICE_INT_NAME_STR_LEN (IFNAMSIZ + 16) -#define ICE_AQ_LEN 64 +#define ICE_AQ_LEN 192 #define ICE_MBXSQ_LEN 64 -#define ICE_MBXRQ_LEN 512 -#define ICE_MIN_MSIX 2 +#define ICE_SBQ_LEN 64 +#define ICE_FDIR_MSIX 2 +#define ICE_ESWITCH_MSIX 1 +#define ICE_MIN_LAN_MSIX 1 +#define ICE_OICR_MSIX 1 +#define ICE_RDMA_NUM_AEQ_MSIX 4 +#define ICE_MIN_RDMA_MSIX 2 +#define ICE_MIN_MSIX (ICE_MIN_LAN_MSIX + ICE_OICR_MSIX) #define ICE_NO_VSI 0xffff #define ICE_VSI_MAP_CONTIG 0 #define ICE_VSI_MAP_SCATTER 1 @@ -66,20 +156,32 @@ extern const char ice_drv_ver[]; #define ICE_Q_WAIT_RETRY_LIMIT 10 #define ICE_Q_WAIT_MAX_RETRY (5 * ICE_Q_WAIT_RETRY_LIMIT) #define ICE_MAX_LG_RSS_QS 256 -#define ICE_MAX_SMALL_RSS_QS 8 +#define ICE_MAX_MEDIUM_RSS_QS 64 +#define ICE_MAX_SMALL_RSS_QS 16 #define ICE_RES_VALID_BIT 0x8000 #define ICE_RES_MISC_VEC_ID (ICE_RES_VALID_BIT - 1) +#define ICE_RES_RDMA_VEC_ID (ICE_RES_MISC_VEC_ID - 1) +/* All VF control VSIs share the same irq, so assign a unique ID for them */ +#define ICE_RES_VF_CTRL_VEC_ID (ICE_RES_RDMA_VEC_ID - 1) #define ICE_INVAL_Q_INDEX 0xffff #define ICE_INVAL_VFID 256 +#define ICE_MAX_RXQS_PER_TC 256 /* Used when setting VSI context per TC Rx queues */ +#define ICE_MAX_TXQS_PER_TC 8 +#define ICE_MAX_RDMA_QSET_PER_TC 1 + +#define ICE_CHNL_START_TC 1 +#define ICE_CHNL_MAX_TC 16 + #define ICE_MAX_RESET_WAIT 20 +#define ICE_VF_CHNL_START_TC 1 + #define ICE_VSIQF_HKEY_ARRAY_SIZE ((VSIQF_HKEY_MAX_INDEX + 1) * 4) #define ICE_DFLT_NETIF_M (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK) -#define ICE_MAX_MTU (ICE_AQ_SET_MAC_FRAME_SIZE_MAX - \ - (ETH_HLEN + ETH_FCS_LEN + (VLAN_HLEN * 2))) +#define ICE_MAX_MTU (ICE_AQ_SET_MAC_FRAME_SIZE_MAX - ICE_ETH_PKT_HDR_PAD) #define ICE_UP_TABLE_TRANSLATE(val, i) \ (((val) << ICE_AQ_VSI_UP_TABLE_UP##i##_S) & \ @@ -88,6 +190,21 @@ extern const char ice_drv_ver[]; #define ICE_TX_DESC(R, i) (&(((struct ice_tx_desc *)((R)->desc))[i])) #define ICE_RX_DESC(R, i) (&(((union ice_32b_rx_flex_desc *)((R)->desc))[i])) #define ICE_TX_CTX_DESC(R, i) (&(((struct ice_tx_ctx_desc *)((R)->desc))[i])) +#define ICE_TX_FDIRDESC(R, i) (&(((struct ice_fltr_desc *)((R)->desc))[i])) + +#define ICE_ACL_ENTIRE_SLICE 1 +#define ICE_ACL_HALF_SLICE 2 + +/* Minimum BW limit is 500 Kbps for any scheduler node */ +#define ICE_MIN_BW_LIMIT 500 +/* User can specify BW in either Kbit/Mbit/Gbit and OS converts it in bytes. 
+ * use it to convert user specified BW limit into Kbps + */ +#define ICE_BW_KBPS_DIVISOR 125 + +#if defined(HAVE_TC_FLOWER_ENC) && defined(HAVE_TC_INDIR_BLOCK) +#define ICE_GTP_TNL_WELLKNOWN_PORT 2152 +#endif /* HAVE_TC_FLOWER_ENC && HAVE_TC_INDIR_BLOCK */ /* Macro for each VSI in a PF */ #define ice_for_each_vsi(pf, i) \ @@ -110,13 +227,13 @@ extern const char ice_drv_ver[]; #define ice_for_each_q_vector(vsi, i) \ for ((i) = 0; (i) < (vsi)->num_q_vectors; (i)++) -#define ICE_UCAST_PROMISC_BITS (ICE_PROMISC_UCAST_TX | ICE_PROMISC_MCAST_TX | \ - ICE_PROMISC_UCAST_RX | ICE_PROMISC_MCAST_RX) +#define ice_for_each_chnl_tc(i) \ + for ((i) = ICE_CHNL_START_TC; (i) < ICE_CHNL_MAX_TC; (i)++) + +#define ICE_UCAST_PROMISC_BITS (ICE_PROMISC_UCAST_TX | ICE_PROMISC_UCAST_RX) #define ICE_UCAST_VLAN_PROMISC_BITS (ICE_PROMISC_UCAST_TX | \ - ICE_PROMISC_MCAST_TX | \ ICE_PROMISC_UCAST_RX | \ - ICE_PROMISC_MCAST_RX | \ ICE_PROMISC_VLAN_TX | \ ICE_PROMISC_VLAN_RX) @@ -127,6 +244,72 @@ extern const char ice_drv_ver[]; ICE_PROMISC_VLAN_TX | \ ICE_PROMISC_VLAN_RX) +#define ice_pf_to_dev(pf) (&((pf)->pdev->dev)) + +struct ice_fwlog_user_input { + unsigned long events; + u8 log_level; +}; + +enum ice_feature { + ICE_F_DSCP, + ICE_F_PTP_EXTTS, + ICE_F_MAX +}; + + +enum ice_channel_fltr_type { + ICE_CHNL_FLTR_TYPE_INVALID, + ICE_CHNL_FLTR_TYPE_SRC_PORT, + ICE_CHNL_FLTR_TYPE_DEST_PORT, + ICE_CHNL_FLTR_TYPE_SRC_DEST_PORT, /* for future use cases */ + ICE_CHNL_FLTR_TYPE_TENANT_ID, + ICE_CHNL_FLTR_TYPE_SRC_IPV4, + ICE_CHNL_FLTR_TYPE_DEST_IPV4, + ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV4, + ICE_CHNL_FLTR_TYPE_SRC_IPV6, + ICE_CHNL_FLTR_TYPE_DEST_IPV6, + ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV6, + ICE_CHNL_FLTR_TYPE_LAST /* must be last */ +}; + +struct ice_channel { + struct list_head list; + u8 type; + u8 ch_type; /* NVMe over TCP, AF_XDP, UDP based, etc.. 
*/ + u16 sw_id; + u16 base_q; + u16 num_rxq; + u16 num_txq; + u16 vsi_num; + u8 ena_tc; + struct ice_aqc_vsi_props info; + u64 max_tx_rate; + u64 min_tx_rate; + atomic_t num_sb_fltr; + /* counter index when side-band FD is used */ + u32 fd_cnt_index; + /* queue used to setup inline-FD */ + atomic_t fd_queue; + /* packets services thru' inline-FD filter */ + u64 fd_pkt_cnt; + enum ice_channel_fltr_type fltr_type; + struct ice_vsi *ch_vsi; +}; + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +/* To convert BPS BW parameter into Mbps*/ +#define ICE_BW_MBIT_PS_DIVISOR 125000 /* rate / (1000000 / 8) Mbps */ +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + +struct ice_txq_meta { + u32 q_teid; /* Tx-scheduler element identifier */ + u16 q_id; /* Entry in VSI's txq_map bitmap */ + u16 q_handle; /* Relative index of Tx queue within TC */ + u16 vsi_idx; /* VSI index that Tx queue belongs to */ + u8 tc; /* TC number that Tx queue belongs to */ +}; + struct ice_tc_info { u16 qoffset; u16 qcount_tx; @@ -136,14 +319,14 @@ struct ice_tc_info { struct ice_tc_cfg { u8 numtc; /* Total number of enabled TCs */ - u8 ena_tc; /* Tx map */ + u16 ena_tc; /* Tx map */ struct ice_tc_info tc_info[ICE_MAX_TRAFFIC_CLASS]; }; struct ice_res_tracker { u16 num_entries; u16 end; - u16 list[1]; + u16 list[]; }; struct ice_qs_cfg { @@ -161,50 +344,100 @@ struct ice_sw { struct ice_pf *pf; u16 sw_id; /* switch ID for this switch */ u16 bridge_mode; /* VEB/VEPA/Port Virtualizer */ + struct ice_vsi *dflt_vsi; /* default VSI for this switch */ + u8 dflt_vsi_ena:1; /* true if above dflt_vsi is enabled */ }; -enum ice_state { - __ICE_TESTING, - __ICE_DOWN, - __ICE_NEEDS_RESTART, - __ICE_PREPARED_FOR_RESET, /* set by driver when prepared */ - __ICE_RESET_OICR_RECV, /* set by driver after rcv reset OICR */ - __ICE_PFR_REQ, /* set by driver and peers */ - __ICE_CORER_REQ, /* set by driver and peers */ - __ICE_GLOBR_REQ, /* set by driver and peers */ - __ICE_CORER_RECV, /* set by OICR handler */ - __ICE_GLOBR_RECV, /* set by OICR handler */ - __ICE_EMPR_RECV, /* set by OICR handler */ - __ICE_SUSPENDED, /* set on module remove path */ - __ICE_RESET_FAILED, /* set by reset/rebuild */ +enum ice_pf_state { + ICE_TESTING, + ICE_DOWN, + ICE_NEEDS_RESTART, + ICE_PREPARED_FOR_RESET, /* set by driver when prepared */ + ICE_RESET_OICR_RECV, /* set by driver after rcv reset OICR */ + ICE_PFR_REQ, /* set by driver and peers */ + ICE_CORER_REQ, /* set by driver and peers */ + ICE_GLOBR_REQ, /* set by driver and peers */ + ICE_CORER_RECV, /* set by OICR handler */ + ICE_GLOBR_RECV, /* set by OICR handler */ + ICE_EMPR_RECV, /* set by OICR handler */ + ICE_SUSPENDED, /* set on module remove path */ + ICE_RESET_FAILED, /* set by reset/rebuild */ + ICE_RECOVERY_MODE, /* set when recovery mode is detected */ + ICE_PREPPED_RECOVERY_MODE, /* set on recovery mode transition */ /* When checking for the PF to be in a nominal operating state, the * bits that are grouped at the beginning of the list need to be - * checked. Bits occurring before __ICE_STATE_NOMINAL_CHECK_BITS will + * checked. Bits occurring before ICE_STATE_NOMINAL_CHECK_BITS will * be checked. If you need to add a bit into consideration for nominal * operating state, it must be added before - * __ICE_STATE_NOMINAL_CHECK_BITS. Do not move this entry's position + * ICE_STATE_NOMINAL_CHECK_BITS. Do not move this entry's position * without appropriate consideration. 
*/ - __ICE_STATE_NOMINAL_CHECK_BITS, - __ICE_ADMINQ_EVENT_PENDING, - __ICE_MAILBOXQ_EVENT_PENDING, - __ICE_MDD_EVENT_PENDING, - __ICE_VFLR_EVENT_PENDING, - __ICE_FLTR_OVERFLOW_PROMISC, - __ICE_VF_DIS, - __ICE_CFG_BUSY, - __ICE_SERVICE_SCHED, - __ICE_SERVICE_DIS, - __ICE_OICR_INTR_DIS, /* Global OICR interrupt disabled */ - __ICE_STATE_NBITS /* must be last */ + ICE_STATE_NOMINAL_CHECK_BITS, + ICE_ADMINQ_EVENT_PENDING, + ICE_MAILBOXQ_EVENT_PENDING, + ICE_SIDEBANDQ_EVENT_PENDING, + ICE_MDD_EVENT_PENDING, + ICE_VFLR_EVENT_PENDING, + ICE_FLTR_OVERFLOW_PROMISC, + ICE_VF_DIS, + ICE_CFG_BUSY, + ICE_SERVICE_SCHED, + ICE_PTP_TX_TS_READY, + ICE_PTP_EXT_TS_READY, + ICE_SERVICE_DIS, + ICE_FD_FLUSH_REQ, + ICE_OICR_INTR_DIS, /* Global OICR interrupt disabled */ + ICE_BAD_EEPROM, + ICE_MDD_VF_PRINT_PENDING, /* set when MDD event handle */ + ICE_VF_RESETS_DISABLED, /* disable resets during ice_remove */ + ICE_LINK_DEFAULT_OVERRIDE_PENDING, + ICE_PHY_INIT_COMPLETE, + ICE_FD_VF_FLUSH_CTX, /* set at FD Rx IRQ or timeout */ + ICE_STATE_NBITS /* must be last */ +}; + +enum ice_vsi_state { + ICE_VSI_DOWN, + ICE_VSI_NEEDS_RESTART, + ICE_VSI_NETDEV_ALLOCD, + ICE_VSI_NETDEV_REGISTERED, + ICE_VSI_UMAC_FLTR_CHANGED, + ICE_VSI_MMAC_FLTR_CHANGED, + ICE_VSI_VLAN_FLTR_CHANGED, + ICE_VSI_PROMISC_CHANGED, + ICE_VSI_STATE_NBITS /* must be last */ +}; + +enum ice_chnl_feature { + ICE_CHNL_FEATURE_FD_ENA, /* for side-band flow-director */ + ICE_CHNL_FEATURE_INLINE_FD_ENA, /* for inline flow-director */ + /* using the SO_MARK socket option will trigger skb->mark to be set. + * Driver should act on skb->mark of not (to align flow to HW queue + * binding) is additionally controlled via ethtool private flag and + * when that private flag is tunred ON/OFF, this feature flags is + * set/reset. This feature flag is used to determine if driver should + * act or not when skb->mark is set. 
+ */ + ICE_CHNL_FEATURE_INLINE_FD_MARK_ENA, + /* for pkt based inspection optimization - related to SW triggered + * interrupt from napi_poll for channel enabled vector + */ + ICE_CHNL_FEATURE_PKT_INSPECT_OPT_ENA, + /* when set, allows cleaning of Rx queue(s) when napi_poll is invoked + * due to busy_poll_stop + */ + ICE_CHNL_FEATURE_PKT_CLEAN_BP_STOP_ENA, + ICE_CHNL_FEATURE_NBITS /* must be last */ }; -enum ice_vsi_flags { - ICE_VSI_FLAG_UMAC_FLTR_CHANGED, - ICE_VSI_FLAG_MMAC_FLTR_CHANGED, - ICE_VSI_FLAG_VLAN_FLTR_CHANGED, - ICE_VSI_FLAG_PROMISC_CHANGED, - ICE_VSI_FLAG_NBITS /* must be last */ +/* This is to be used only when channels are configured, to track state + * at PF level, whether it should use RSS or inline flow-director and this + * state gets set/reset appropriately as the HW flow-director table becomes + * full/not-full + */ +enum ice_advanced_state_t { + ICE_SWITCH_TO_RSS, + ICE_ADVANCED_STATE_LAST, /* this must be last */ }; /* struct that defines a VSI, associated with a dev */ @@ -215,20 +448,26 @@ struct ice_vsi { struct ice_port_info *port_info; /* back pointer to port_info */ struct ice_ring **rx_rings; /* Rx ring array */ struct ice_ring **tx_rings; /* Tx ring array */ +#ifdef HAVE_NETDEV_SB_DEV + /* Initial VSI tx_rings array when L2 offload is off */ + struct ice_ring **base_tx_rings; +#endif /* HAVE_NETDEV_SB_DEV */ struct ice_q_vector **q_vectors; /* q_vector array */ irqreturn_t (*irq_handler)(int irq, void *data); u64 tx_linearize; - DECLARE_BITMAP(state, __ICE_STATE_NBITS); - DECLARE_BITMAP(flags, ICE_VSI_FLAG_NBITS); + DECLARE_BITMAP(state, ICE_VSI_STATE_NBITS); unsigned int current_netdev_flags; u32 tx_restart; u32 tx_busy; u32 rx_buf_failed; u32 rx_page_failed; - int num_q_vectors; - int base_vector; /* IRQ base for OS reserved vectors */ +#ifdef ICE_ADD_PROBES + u32 rx_page_reuse; +#endif /* ICE_ADD_PROBES */ + u16 num_q_vectors; + u16 base_vector; /* IRQ base for OS reserved vectors */ enum ice_vsi_type type; u16 vsi_num; /* HW (absolute) index of this VSI */ u16 idx; /* software index in pf->vsi[] */ @@ -236,6 +475,10 @@ struct ice_vsi { s16 vf_id; /* VF ID for SR-IOV VSIs */ u16 ethtype; /* Ethernet protocol for pause frame */ + u16 num_gfltr; + u16 num_bfltr; + u16 cntr_gfltr; + u16 cntr_bfltr; /* RSS config */ u16 rss_table_size; /* HW RSS table size */ @@ -244,6 +487,15 @@ struct ice_vsi { u8 *rss_lut_user; /* User configured lookup table entries */ u8 rss_lut_type; /* used to configure Get/Set RSS LUT AQ call */ + /* aRFS members only allocated for the PF VSI */ +#define ICE_MAX_RFS_FILTERS 0xFFFF +#define ICE_MAX_ARFS_LIST 1024 +#define ICE_ARFS_LST_MASK (ICE_MAX_ARFS_LIST - 1) + struct hlist_head *arfs_fltr_list; + struct ice_arfs_active_fltr_cntrs *arfs_fltr_cntrs; + spinlock_t arfs_lock; /* protects aRFS hash table and filter state */ + atomic_t *arfs_last_fltr_id; + u16 max_frame; u16 rx_buf_len; @@ -260,7 +512,10 @@ struct ice_vsi { u8 irqs_ready:1; u8 current_isup:1; /* Sync 'link up' logging */ u8 stat_offsets_loaded:1; - u8 vlan_ena:1; + struct ice_vsi_vlan_ops inner_vlan_ops; + struct ice_vsi_vlan_ops outer_vlan_ops; + u16 num_vlan; + /* queue information */ u8 tx_mapping_mode; /* ICE_MAP_MODE_[CONTIG|SCATTER] */ @@ -271,11 +526,147 @@ struct ice_vsi { u16 num_txq; /* Used Tx queues */ u16 alloc_rxq; /* Allocated Rx queues */ u16 num_rxq; /* Used Rx queues */ + u16 req_txq; /* User requested Tx queues */ + u16 req_rxq; /* User requested Rx queues */ u16 num_rx_desc; u16 num_tx_desc; + u16 qset_handle[ICE_MAX_TRAFFIC_CLASS]; struct 
ice_tc_cfg tc_cfg; +#ifdef HAVE_XDP_SUPPORT + struct bpf_prog *xdp_prog; + struct ice_ring **xdp_rings; /* XDP ring array */ + u16 num_xdp_txq; /* Used XDP queues */ + u8 xdp_mapping_mode; /* ICE_MAP_MODE_[CONTIG|SCATTER] */ +#endif /* HAVE_XDP_SUPPORT */ +#ifdef HAVE_AF_XDP_ZC_SUPPORT +#ifndef HAVE_AF_XDP_NETDEV_UMEM + struct xdp_umem **xsk_umems; + u16 num_xsk_umems_used; + u16 num_xsk_umems; +#endif /* !HAVE_AF_XDP_NETDEV_UMEM */ +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + struct tc_mqprio_qopt_offload mqprio_qopt;/* queue parameters */ +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + DECLARE_BITMAP(ptp_tx_idx, INDEX_PER_QUAD); + struct sk_buff *ptp_tx_skb[INDEX_PER_QUAD]; + u32 tx_hwtstamp_skipped; + u8 ptp_tx:1; + + /* Channel Specific Fields */ + struct ice_vsi *tc_map_vsi[ICE_CHNL_MAX_TC]; + u16 cnt_q_avail; + u16 next_base_q; /* next queue to be used for channel setup */ + struct list_head ch_list; + u16 num_chnl_rxq; + u16 num_chnl_txq; + u16 ch_rss_size; + u16 num_chnl_fltr; + /* store away rss size info before configuring ADQ channels so that, + * it can be used after tc-qdisc delete, to get back RSS setting as + * they were before + */ + u16 orig_rss_size; + u8 vf_adq_tc; /* traffic class number for VF ADQ VSI */ + /* track various feature bits for channel VSI */ + DECLARE_BITMAP(features, ICE_CHNL_FEATURE_NBITS); +#define ICE_TBL_FULL_TIMES 5 + /* how many times transitioned into inline flow-director from RSS */ + u64 cnt_inline_fd_transition; + /* how many times HW table is flushed */ + u64 cnt_table_flushed; + /* keeps track, how many times SW detected that HW table remain full + * once SW state is SWITCHED_TO_RSS + */ + int cnt_tbl_full; + + /* inline_fd_active_cnt is SW based counter which keeps track of active + * inline-FD filter entries in table + */ + atomic_t inline_fd_active_cnt; + DECLARE_BITMAP(adv_state, ICE_ADVANCED_STATE_LAST); + + /* this keeps tracks of all enabled TC with and without DCB + * and inclusive of ADQ, vsi->mqprio_opt keeps track of queue + * information + */ + u8 all_numtc; + u16 all_enatc; + + /* store away TC info, to be used for rebuild logic */ + u8 old_numtc; + u16 old_ena_tc; + + struct ice_channel *ch; + struct net_device **target_netdevs; + + /* setup back reference, to which aggregator node this VSI + * corresponds to + */ + struct ice_agg_node *agg_node; + u16 *global_lut_id; } ____cacheline_internodealigned_in_smp; +enum ice_chnl_vector_state { + ICE_CHNL_VECTOR_IN_BP, + ICE_CHNL_VECTOR_PREV_IN_BP, + ICE_CHNL_VECTOR_ONCE_IN_BP, + ICE_CHNL_VECTOR_PREV_DATA_PKT_RECV, + ICE_CHNL_VECTOR_WD_EQUALS_BP, + ICE_CHNL_VECTOR_NBITS, /* This must be last */ +}; + +#ifdef ADQ_PERF_COUNTERS +struct ice_q_vector_ch_stats { + /* following are used as part of managing driver internal + * state machine. Only to be used for perf debugging and + * it is controlled by module_param : debug_mask + */ + u64 in_bp; + u64 in_int; + u64 real_int_to_bp; + u64 real_bp_to_int; + u64 real_int_to_int; + u64 real_bp_to_bp; + + /* These counter is used to track real transition of vector from + * BUSY_POLL to INTERRUPT based on enhanced logic (using state + * machine and control packets). 
+ */ + u64 unlikely_cb_to_bp; + /* This is used to keep track of enabling interrupt from napi_poll + * when state machine condition indicated once_in_bp is false + */ + u64 once_bp_false; + u64 num_need_resched_bp_stop; + u64 num_timeout_bp_stop; + u64 num_l_c_data_pkt; + u64 num_l_c_data_pkt1; + u64 num_sw_intr_timeout; /* track SW INTR from napi_poll */ + u64 num_sw_intr_serv_task; /* track SW INTR from service_task */ + u64 cleaned_any_data_pkt; + /* Tracking "unlikely_cb_bp and once_in_bp is true" */ + u64 ucb_o_bp; + /* This keeps track of how many times, bailout when once_in_bp is set, + * unlikely_cb_to_bp is set, but pkt based interrupt optimization + * is OFF + */ + u64 num_no_sw_intr_opt_off; + /* tracking, how many times WB_ON_ITR is set */ + u64 num_wb_on_itr_set; + + u64 pkt_bp_stop_napi_budget; + u64 pkt_bp_stop_bp_budget; + + u64 bp_wd_equals_budget64; + u64 bp_wd_equals_budget8; + + u64 keep_state_bp_budget64; + u64 keep_state_bp_budget8; +}; +#endif /* ADQ_PERF_COUNTERS */ + /* struct that defines an interrupt vector */ struct ice_q_vector { struct ice_vsi *vsi; @@ -284,7 +675,7 @@ struct ice_q_vector { u16 reg_idx; u8 num_ring_rx; /* total number of Rx rings in vector */ u8 num_ring_tx; /* total number of Tx rings in vector */ - u8 itr_countdown; /* when 0 should adjust adaptive ITR */ + u8 wb_on_itr:1; /* if true, WB on ITR is enabled */ /* in usecs, need to use ice_intrl_to_usecs_reg() before writing this * value to the device */ @@ -298,27 +689,185 @@ struct ice_q_vector { cpumask_t affinity_mask; struct irq_affinity_notify affinity_notify; + struct ice_channel *ch; + char name[ICE_INT_NAME_STR_LEN]; + + u16 total_events; /* net_dim(): number of interrupts processed */ + /* This tracks current state of vector, BUSY_POLL or INTR */ +#define ICE_CHNL_IN_BP BIT(ICE_CHNL_VECTOR_IN_BP) + /* This tracks prev state of vector, BUSY_POLL or INTR */ +#define ICE_CHNL_PREV_IN_BP BIT(ICE_CHNL_VECTOR_PREV_IN_BP) + /* This tracks state of vector, was the ever in BUSY_POLL. This + * state goes to INTT if interrupt are enabled or SW interrupts + * are triggered from either service_task or napi_poll + */ +#define ICE_CHNL_ONCE_IN_BP BIT(ICE_CHNL_VECTOR_ONCE_IN_BP) + /* Tracks if previously - were there any data packets received + * on per channel enabled vector or not + */ +#define ICE_CHNL_PREV_DATA_PKT_RECV BIT(ICE_CHNL_VECTOR_PREV_DATA_PKT_RECV) + /* tracks if number of Rx packets processed is equal to budget or not. + * It is set from napi_poll and used from ice_refresh_bp_state + * to determine if internal state of vector to be kept in BUSY_POLL + * or not + */ +#define ICE_CHNL_WD_EQUALS_BP BIT(ICE_CHNL_VECTOR_WD_EQUALS_BP) + /* it is used to keep track of various states as defined earlier + * and those states are used during ADQ performance optimization + */ + u8 state_flags; + /* Used in logic to determine if SW inter is needed or not. + * This is used only for channel enabled vector + */ + u64 jiffy; + /* Primarily used in decision making w.r.t using inline flow-director */ + atomic_t inline_fd_cnt; + + /* This is applicable only for ADQ enabled vectors and used to avoid + * situation of OS triggering ksoftirqd. + * + * Usually busy_poll_stop is followed by napi_schedule:napi_poll if + * driver returned "budget" as part of processing packets during + * busy_poll_stop. As long as driver continue to return "budget", + * OS keeps calling napi_schedule upto 10 times or 2msec and then + * arms the ksoftrqd. 
+ * + * As part of ADQ performance optimization, it is not preferable to + * let ksoftirqd run when there has been enough packets processed. + * To facilitate fairness to the consumer of those packets, + * do not process Rx queues after 8 times. + */ + /* following variable keeps track of how many times Rx queues were + * processed when napi_poll is invoked thru napi_schedule (as a result + * of returning "budget" from busy_poll_stop:napi_poll) and + * work_done == budget. + */ + u8 process_rx_queues; + + /* following is controlled thru' priv-flag, value of + * "max_limit_process_rx_queues" becomes 8 when priv-flag is set + * otherwise it is set to 4 (default) + */ +#define ICE_MAX_LIMIT_PROCESS_RX_PKTS_DFLT 4 +#define ICE_MAX_LIMIT_PROCESS_RX_PKTS 8 + u8 max_limit_process_rx_queues; + +#ifdef ADQ_PERF_COUNTERS + struct ice_q_vector_ch_stats ch_stats; +#endif /* ADQ_PERF_COUNTERS */ } ____cacheline_internodealigned_in_smp; enum ice_pf_flags { ICE_FLAG_FLTR_SYNC, + ICE_FLAG_VMDQ_ENA, +#ifdef HAVE_NETDEV_SB_DEV + ICE_FLAG_MACVLAN_ENA, +#endif /* HAVE_NETDEV_SB_DEV */ + ICE_FLAG_IWARP_ENA, ICE_FLAG_RSS_ENA, ICE_FLAG_SRIOV_ENA, ICE_FLAG_SRIOV_CAPABLE, ICE_FLAG_DCB_CAPABLE, ICE_FLAG_DCB_ENA, + ICE_FLAG_FD_ENA, + ICE_FLAG_PTP_ENA, /* NVM PTP support */ + ICE_FLAG_PTP, /* PTP successfully initialized */ + ICE_FLAG_PEER_ENA, ICE_FLAG_ADV_FEATURES, +#ifdef NETIF_F_HW_TC + ICE_FLAG_TC_MQPRIO, /* support for Multi queue TC */ + ICE_FLAG_CLS_FLOWER, /* support cls flower filters */ +#endif /* NETIF_F_HW_TC */ ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, + ICE_FLAG_TOTAL_PORT_SHUTDOWN_ENA, ICE_FLAG_NO_MEDIA, +#ifndef ETHTOOL_GFECPARAM + ICE_FLAG_RS_FEC, + ICE_FLAG_BASE_R_FEC, +#endif /* !ETHTOOL_GFECPARAM */ ICE_FLAG_FW_LLDP_AGENT, + ICE_FLAG_CHNL_INLINE_FD_ENA, + ICE_FLAG_CHNL_INLINE_FD_MARK_ENA, + ICE_FLAG_CHNL_PKT_INSPECT_OPT_ENA, + ICE_FLAG_CHNL_PKT_CLEAN_BP_STOP_ENA, + ICE_FLAG_CHNL_PKT_CLEAN_BP_STOP_CFG, + ICE_FLAG_MOD_POWER_UNSUPPORTED, ICE_FLAG_ETHTOOL_CTXT, /* set when ethtool holds RTNL lock */ + ICE_FLAG_LEGACY_RX, + ICE_FLAG_VF_TRUE_PROMISC_ENA, + ICE_FLAG_MDD_AUTO_RESET_VF, + ICE_FLAG_VF_VLAN_PRUNE_DIS, + ICE_FLAG_LINK_LENIENT_MODE_ENA, + ICE_FLAG_ESWITCH_CAPABLE, ICE_PF_FLAGS_NBITS /* must be last */ }; +#ifdef HAVE_NETDEV_SB_DEV +struct ice_macvlan { + struct list_head list; + int id; + struct net_device *vdev; + struct ice_vsi *parent_vsi; + struct ice_vsi *vsi; + u8 mac[ETH_ALEN]; +}; +#endif /* HAVE_NETDEV_SB_DEV */ + +struct ice_switchdev_info { + struct ice_vsi *control_vsi; + struct ice_vsi *uplink_vsi; + bool is_running; +}; + +enum ice_tnl_state { + ICE_TNL_SET_TO_ADD, + ICE_TNL_ACTIVE, + ICE_TNL_SET_TO_DEL, + ICE_TNL_LAST = 0xFF, /* must be last */ +}; + +struct ice_tnl_entry { + enum ice_tunnel_type type; + u16 port; +#define ICE_TNL_STATE_TO_ADD BIT(ICE_TNL_SET_TO_ADD) +#define ICE_TNL_STATE_ACTIVE BIT(ICE_TNL_ACTIVE) +#define ICE_TNL_STATE_TO_DEL BIT(ICE_TNL_SET_TO_DEL) + u8 state; + u8 ref_cnt; + struct list_head node; +}; + +struct ice_agg_node { + u32 agg_id; +#define ICE_MAX_VSIS_IN_AGG_NODE 64 + u32 num_vsis; + u8 valid; +}; + +enum ice_flash_update_preservation { + /* Preserve all settings and fields */ + ICE_FLASH_UPDATE_PRESERVE_ALL = 0, + /* Preserve limited fields, such as VPD, PCI serial ID, MACs, etc */ + ICE_FLASH_UPDATE_PRESERVE_LIMITED, + /* Return all fields to factory settings */ + ICE_FLASH_UPDATE_PRESERVE_FACTORY_SETTINGS, + /* Do not perform any preservation */ + ICE_FLASH_UPDATE_PRESERVE_NONE, +}; + struct ice_pf { struct pci_dev *pdev; +#if 
IS_ENABLED(CONFIG_NET_DEVLINK) +#ifdef HAVE_DEVLINK_REGIONS + struct devlink_region *nvm_region; + struct devlink_region *devcaps_region; +#endif /* HAVE_DEVLINK_REGIONS */ + /* devlink port data */ + struct devlink_port devlink_port; +#endif /* CONFIG_NET_DEVLINK */ + /* OS reserved IRQ details */ struct msix_entry *msix_entries; struct ice_res_tracker *irq_tracker; @@ -328,15 +877,25 @@ struct ice_pf { */ u16 sriov_base_vector; + u16 ctrl_vsi_idx; /* control VSI index in pf->vsi array */ + struct ice_vsi **vsi; /* VSIs created by the driver */ struct ice_sw *first_sw; /* first switch created by firmware */ + u16 eswitch_mode; /* current mode of eswitch */ +#ifdef CONFIG_DEBUG_FS + struct dentry *ice_debugfs_pf; +#endif /* CONFIG_DEBUG_FS */ /* Virtchnl/SR-IOV config info */ struct ice_vf *vf; - int num_alloc_vfs; /* actual number of VFs allocated */ + u16 num_alloc_vfs; /* actual number of VFs allocated */ u16 num_vfs_supported; /* num VFs supported for this PF */ - u16 num_vf_qps; /* num queue pairs per VF */ - u16 num_vf_msix; /* num vectors per VF */ - DECLARE_BITMAP(state, __ICE_STATE_NBITS); + u16 num_qps_per_vf; + u16 num_msix_per_vf; + /* used to ratelimit the MDD event logging */ + unsigned long last_printed_mdd_jiffies; + DECLARE_BITMAP(malvfs, ICE_MAX_VF_COUNT); + DECLARE_BITMAP(features, ICE_F_MAX); + DECLARE_BITMAP(state, ICE_STATE_NBITS); DECLARE_BITMAP(flags, ICE_PF_FLAGS_NBITS); unsigned long *avail_txqs; /* bitmap to track PF Tx queue usage */ unsigned long *avail_rxqs; /* bitmap to track PF Rx queue usage */ @@ -346,13 +905,34 @@ struct ice_pf { struct work_struct serv_task; struct mutex avail_q_mutex; /* protects access to avail_[rx|tx]qs */ struct mutex sw_mutex; /* lock for protecting VSI alloc flow */ + struct mutex tc_mutex; /* lock to protect TC changes */ u32 msg_enable; + struct ice_ptp ptp; + struct ice_cgu_info cgu_info; + u16 num_rdma_msix; /* Total MSIX vectors for RDMA driver */ + u16 rdma_base_vector; + struct ice_peer_obj *rdma_peer; +#ifdef HAVE_NETDEV_SB_DEV + /* MACVLAN specific variables */ + DECLARE_BITMAP(avail_macvlan, ICE_MAX_MACVLANS); + struct list_head macvlan_list; + u16 num_macvlan; + u16 max_num_macvlan; +#endif /* HAVE_NETDEV_SB_DEV */ + + /* spinlock to protect the AdminQ wait list */ + spinlock_t aq_wait_lock; + struct hlist_head aq_wait_list; + wait_queue_head_t aq_wait_queue; + + wait_queue_head_t reset_wait_queue; + u32 hw_csum_rx_error; - u32 oicr_idx; /* Other interrupt cause MSIX vector index */ - u32 num_avail_sw_msix; /* remaining MSIX SW vectors left unclaimed */ + u16 oicr_idx; /* Other interrupt cause MSIX vector index */ + u16 num_avail_sw_msix; /* remaining MSIX SW vectors left unclaimed */ u16 max_pf_txqs; /* Total Tx queues PF wide */ u16 max_pf_rxqs; /* Total Rx queues PF wide */ - u32 num_lan_msix; /* Total MSIX vectors for base driver */ + u16 num_lan_msix; /* Total MSIX vectors for base driver */ u16 num_lan_tx; /* num LAN Tx queues setup */ u16 num_lan_rx; /* num LAN Rx queues setup */ u16 next_vsi; /* Next free slot in pf->vsi[] - 0-based! 
*/ @@ -362,24 +942,242 @@ struct ice_pf { u16 empr_count; /* EMP reset count */ u16 pfr_count; /* PF reset count */ + u8 stat_prev_loaded : 1; /* has previous stats been loaded */ + u8 wol_ena : 1; /* software state of WoL */ + u32 wakeup_reason; /* last wakeup reason */ struct ice_hw_port_stats stats; struct ice_hw_port_stats stats_prev; struct ice_hw hw; - u8 stat_prev_loaded:1; /* has previous stats been loaded */ -#ifdef CONFIG_DCB +#ifdef ICE_ADD_PROBES + u64 tcp_segs; + u64 udp_segs; + u64 tx_tcp_cso; + u64 tx_udp_cso; + u64 tx_sctp_cso; + u64 tx_ip4_cso; + u64 tx_l3_cso_err; + u64 tx_l4_cso_err; + u64 rx_tcp_cso; + u64 rx_udp_cso; + u64 rx_sctp_cso; + u64 rx_ip4_cso; + u64 rx_ip4_cso_err; + u64 rx_tcp_cso_err; + u64 rx_udp_cso_err; + u64 rx_sctp_cso_err; + u64 tx_q_vlano; + u64 rx_q_vlano; + u64 tx_ad_vlano; + u64 rx_ad_vlano; +#endif u16 dcbx_cap; -#endif /* CONFIG_DCB */ u32 tx_timeout_count; unsigned long tx_timeout_last_recovery; u32 tx_timeout_recovery_level; char int_name[ICE_INT_NAME_STR_LEN]; + struct ice_peer_obj_int **peers; + int peer_idx; u32 sw_int_count; +#ifdef HAVE_TC_SETUP_CLSFLOWER + /* count of tc_flower filters specific to channel (aka where filter + * action is "hw_tc ") + */ + u16 num_dmac_chnl_fltrs; + struct hlist_head tc_flower_fltr_list; +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + + struct ice_dcf dcf; + __le64 nvm_phy_type_lo; /* NVM PHY type low */ + __le64 nvm_phy_type_hi; /* NVM PHY type high */ + struct ice_link_default_override_tlv link_dflt_override; + u64 supported_rxdids; /* bitmap for supported RXDID */ +#ifdef HAVE_NETDEV_UPPER_INFO + struct ice_lag *lag; /* Link Aggregation information */ +#endif /* HAVE_NETDEV_UPPER_INFO */ + + /* protects accesses to tunnel list, it is grabbed + * from ice_udp_tunnel_add/del and as well from service_task + */ + spinlock_t tnl_lock; + struct list_head tnl_list; + struct ice_switchdev_info switchdev; + +#define ICE_INVALID_AGG_NODE_ID 0 +#define ICE_PF_AGG_NODE_ID_START 1 +#define ICE_MAX_PF_AGG_NODES 32 + struct ice_agg_node pf_agg_node[ICE_MAX_PF_AGG_NODES]; +#ifdef HAVE_NETDEV_SB_DEV +#define ICE_MACVLAN_AGG_NODE_ID_START (ICE_PF_AGG_NODE_ID_START + \ + ICE_MAX_PF_AGG_NODES) +#define ICE_MAX_MACVLAN_AGG_NODES 32 + struct ice_agg_node macvlan_agg_node[ICE_MAX_MACVLAN_AGG_NODES]; +#endif +#define ICE_VF_AGG_NODE_ID_START 65 +#define ICE_MAX_VF_AGG_NODES 32 + struct ice_agg_node vf_agg_node[ICE_MAX_VF_AGG_NODES]; }; struct ice_netdev_priv { struct ice_vsi *vsi; +#ifdef HAVE_TC_INDIR_BLOCK + /* indirect block callbacks on registered higher level devices + * (e.g. 
tunnel devices) + * + * tc_indr_block_cb_priv_list is used to lookup indirect callback + * private data + * + * netdevice_nb is the netdev events notifier - used to register + * tunnel devices for block events + * + */ + struct list_head tc_indr_block_priv_list; +#ifndef HAVE_TC_FLOW_INDIR_DEV + struct notifier_block netdevice_nb; +#endif +#endif /* HAVE_TC_INDIR_BLOCK */ + struct ice_repr *repr; }; +extern struct ida ice_peer_index_ida; + + +/** + * ice_vector_ch_enabled + * @qv: pointer to q_vector, can be NULL + * + * This function returns true if vector is channel enabled otherwise false + */ +static inline bool ice_vector_ch_enabled(struct ice_q_vector *qv) +{ + return !!qv->ch; /* Enable it to run with TC */ +} + +/** + * ice_vector_busypoll_intr + * @qv: pointer to q_vector + * + * This function returns true if vector is transitioning from BUSY_POLL + * to INTERRUPT based on current and previous state of vector + */ +static inline bool ice_vector_busypoll_intr(struct ice_q_vector *qv) +{ + return (qv->state_flags & ICE_CHNL_PREV_IN_BP) && + !(qv->state_flags & ICE_CHNL_IN_BP); +} + +/** + * ice_vector_ever_in_busypoll + * @qv: pointer to q_vector + * + * This function returns true if vectors current OR previous state + * is BUSY_POLL + */ +static inline bool ice_vector_ever_in_busypoll(struct ice_q_vector *qv) +{ + return (qv->state_flags & ICE_CHNL_PREV_IN_BP) || + (qv->state_flags & ICE_CHNL_IN_BP); +} + +/** + * ice_vector_state_curr_prev_intr + * @qv: pointer to q_vector + * + * This function returns true if vectors current AND previous state + * is INTERRUPT + */ +static inline bool ice_vector_state_curr_prev_intr(struct ice_q_vector *qv) +{ + return !(qv->state_flags & ICE_CHNL_PREV_IN_BP) && + !(qv->state_flags & ICE_CHNL_IN_BP); +} + +/** + * ice_vector_intr_busypoll + * @qv: pointer to q_vector + * + * This function returns true if vector is transitioning from INTERRUPT + * to BUSY_POLL based on current and previous state of vector + */ +static inline bool ice_vector_intr_busypoll(struct ice_q_vector *qv) +{ + return !(qv->state_flags & ICE_CHNL_PREV_IN_BP) && + (qv->state_flags & ICE_CHNL_IN_BP); +} + +/** + * ice_adq_trigger_sw_intr + * @hw: ptr to HW + * @q_vector: pointer to q_vector + * + * This function triggers SW interrupt on specified vector and re-enables + * interrupt. This is for use with ADQ. + */ +static inline void +ice_adq_trigger_sw_intr(struct ice_hw *hw, struct ice_q_vector *q_vector) +{ + struct ice_ring_container *rx_rc = &q_vector->rx; + + q_vector->state_flags &= ~ICE_CHNL_ONCE_IN_BP; + + /* when entering into interrupt mode, use current value of Rx ITR + * hence rx_rc->itr_setting. This is needed to honor user setting + * for Rx ITR + */ + wr32(hw, + GLINT_DYN_CTL(q_vector->reg_idx), + (rx_rc->itr_idx << GLINT_DYN_CTL_ITR_INDX_S) | + (ITR_REG_ALIGN(rx_rc->itr_setting) >> ICE_ITR_GRAN_S) | + GLINT_DYN_CTL_SWINT_TRIG_M | + GLINT_DYN_CTL_INTENA_M); +} + +#ifdef ADQ_PERF_COUNTERS +/** + * ice_sw_intr_cntr + * @q_vector: pointer to q_vector + * @napi_codepath: codepath separator for stats purpose + * + * This function counts the trigger code path for sw_intr. Caller of this + * expected to call ice_adq_trigger_sw_intr or ice_trigger_sw_intr function to + * actually trigger SW intr. 
+ */ +static inline void +ice_sw_intr_cntr(struct ice_q_vector *q_vector, bool napi_codepath) +{ + if (napi_codepath) /* napi - detected timeout */ + q_vector->ch_stats.num_sw_intr_timeout++; + else + q_vector->ch_stats.num_sw_intr_serv_task++; +} +#endif /* ADQ_PERF_COUNTERS */ + +/** + * ice_force_wb - trigger force write-back by setting WB_ON_ITR bit + * @hw: ptr to HW + * @q_vector: pointer to q_vector + * + * This function is used to force write-backs by setting WB_ON_ITR bit + * in DYN_CTLN register. WB_ON_ITR and INTENA are mutually exclusive bits. + * Setting WB_ON_ITR bits means Tx and Rx descriptors are written back based + * on ITR expiration irrespective of INTENA setting + */ +static inline void +ice_force_wb(struct ice_hw *hw, struct ice_q_vector *q_vector) +{ + if (q_vector->num_ring_rx || q_vector->num_ring_tx) { +#ifdef ADQ_PERF_COUNTERS + q_vector->ch_stats.num_wb_on_itr_set++; +#endif /* ADQ_PERF_COUNTERS */ + wr32(hw, GLINT_DYN_CTL(q_vector->reg_idx), + ICE_GLINT_DYN_CTL_WB_ON_ITR(0, ICE_RX_ITR)); + } + + /* needed to avoid triggering WB_ON_ITR again which typically + * happens from ice_set_wb_on_itr function + */ + q_vector->wb_on_itr = true; +} + /** * ice_irq_dynamic_ena - Enable default interrupt generation settings * @hw: pointer to HW struct @@ -401,7 +1199,7 @@ ice_irq_dynamic_ena(struct ice_hw *hw, struct ice_vsi *vsi, val = GLINT_DYN_CTL_INTENA_M | GLINT_DYN_CTL_CLEARPBA_M | (itr << GLINT_DYN_CTL_ITR_INDX_S); if (vsi) - if (test_bit(__ICE_DOWN, vsi->state)) + if (test_bit(ICE_VSI_DOWN, vsi->state)) return; wr32(hw, GLINT_DYN_CTL(vector), val); } @@ -417,6 +1215,55 @@ static inline struct ice_pf *ice_netdev_to_pf(struct net_device *netdev) return np->vsi->back; } +#ifdef HAVE_XDP_SUPPORT +static inline bool ice_is_xdp_ena_vsi(struct ice_vsi *vsi) +{ + return !!vsi->xdp_prog; +} + +static inline void ice_set_ring_xdp(struct ice_ring *ring) +{ + ring->flags |= ICE_TX_FLAGS_RING_XDP; +} + +#endif /* HAVE_XDP_SUPPORT */ +#ifdef HAVE_AF_XDP_ZC_SUPPORT +/** + * ice_xsk_umem - get XDP UMEM bound to a ring + * @ring: ring to use + * + * Returns a pointer to xdp_umem structure if there is an UMEM present, + * NULL otherwise. + */ +#ifdef HAVE_NETDEV_BPF_XSK_POOL +static inline struct xsk_buff_pool *ice_xsk_umem(struct ice_ring *ring) +#else +static inline struct xdp_umem *ice_xsk_umem(struct ice_ring *ring) +#endif +{ +#ifndef HAVE_AF_XDP_NETDEV_UMEM + struct xdp_umem **umems = ring->vsi->xsk_umems; +#endif /* !HAVE_AF_XDP_NETDEV_UMEM */ + u16 qid = ring->q_index; + + if (ice_ring_is_xdp(ring)) + qid -= ring->vsi->num_xdp_txq; + +#ifndef HAVE_AF_XDP_NETDEV_UMEM + if (qid >= ring->vsi->num_xsk_umems || !umems || !umems[qid] || + !ice_is_xdp_ena_vsi(ring->vsi)) + return NULL; + + return umems[qid]; +#else + if (!ice_is_xdp_ena_vsi(ring->vsi)) + return NULL; + + return xsk_get_pool_from_qid(ring->vsi->netdev, qid); +#endif /* !HAVE_AF_XDP_NETDEV_UMEM */ +} +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + /** * ice_get_main_vsi - Get the PF VSI * @pf: PF instance @@ -431,27 +1278,381 @@ static inline struct ice_vsi *ice_get_main_vsi(struct ice_pf *pf) return NULL; } +/** + * ice_get_netdev_priv_vsi - return VSI associated with netdev priv. + * @np: private netdev structure + */ +static inline struct ice_vsi *ice_get_netdev_priv_vsi(struct ice_netdev_priv *np) +{ + /* In case of port representor return source port VSI. 
*/ + if (np->repr) + return np->repr->src_vsi; + else + return np->vsi; +} + +#if IS_ENABLED(CONFIG_NET_DEVLINK) +/** + * ice_is_switchdev_running - check if switchdev is configured + * @pf: pointer to PF structure + * + * Returns true if eswitch mode is set to DEVLINK_ESWITCH_MODE_SWITCHDEV + * and switchdev is configured, false otherwise. + */ +static inline bool ice_is_switchdev_running(struct ice_pf *pf) +{ + return pf->switchdev.is_running; +} + +#else +static inline bool +ice_is_switchdev_running(struct ice_pf __always_unused *pf) +{ + return false; +} +#endif /* IS_ENABLED(CONFIG_NET_DEVLINK) */ + +/** + * ice_get_ctrl_vsi - Get the control VSI + * @pf: PF instance + */ +static inline struct ice_vsi *ice_get_ctrl_vsi(struct ice_pf *pf) +{ + /* if pf->ctrl_vsi_idx is ICE_NO_VSI, control VSI was not set up */ + if (!pf->vsi || pf->ctrl_vsi_idx == ICE_NO_VSI) + return NULL; + + return pf->vsi[pf->ctrl_vsi_idx]; +} + +/** + * ice_find_first_vsi_by_type - Find and return first VSI of a given type + * @pf: PF to search for VSI + * @vsi_type: VSI type we are looking for + */ +static inline struct ice_vsi * +ice_find_first_vsi_by_type(struct ice_pf *pf, enum ice_vsi_type vsi_type) +{ + int i; + + ice_for_each_vsi(pf, i) { + struct ice_vsi *vsi = pf->vsi[i]; + + if (vsi && vsi->type == vsi_type) + return vsi; + } + + return NULL; +} + +enum ice_fd_stat_idx { + ICE_FD_STAT_SB, + ICE_FD_STAT_CH, +#ifdef ICE_ADD_PROBES + ICE_ARFS_STAT_TCPV4, + ICE_ARFS_STAT_TCPV6, + ICE_ARFS_STAT_UDPV4, + ICE_ARFS_STAT_UDPV6 +#endif /* ICE_ADD_PROBES */ +}; + +#define ICE_FD_STAT_CTR_BLOCK_COUNT 256 +#define ICE_FD_STAT_PF_IDX(base_idx) \ + ((base_idx) * ICE_FD_STAT_CTR_BLOCK_COUNT) +#define ICE_FD_SB_STAT_IDX(base_idx) \ + (ICE_FD_STAT_PF_IDX(base_idx) + ICE_FD_STAT_SB) +#ifdef ICE_ADD_PROBES +#define ICE_ARFS_STAT_TCPV4_IDX(base_idx) \ + (ICE_FD_STAT_PF_IDX(base_idx) + ICE_ARFS_STAT_TCPV4) +#define ICE_ARFS_STAT_TCPV6_IDX(base_idx) \ + (ICE_FD_STAT_PF_IDX(base_idx) + ICE_ARFS_STAT_TCPV6) +#define ICE_ARFS_STAT_UDPV4_IDX(base_idx) \ + (ICE_FD_STAT_PF_IDX(base_idx) + ICE_ARFS_STAT_UDPV4) +#define ICE_ARFS_STAT_UDPV6_IDX(base_idx) \ + (ICE_FD_STAT_PF_IDX(base_idx) + ICE_ARFS_STAT_UDPV6) +#endif /* ICE_ADD_PROBES */ + +#define ICE_FD_CH_STAT_IDX(base_idx) \ + (ICE_FD_STAT_PF_IDX(base_idx) + ICE_FD_STAT_CH) + +/** + * ice_vsi_fd_ena + * @vsi: pointer to VSI + * + * This function returns true if VSI is capable for usage of flow-director + * otherwise returns false + */ +static inline bool ice_vsi_fd_ena(struct ice_vsi *vsi) +{ + return !!test_bit(ICE_CHNL_FEATURE_FD_ENA, vsi->features); +} + +/** + * ice_vsi_inline_fd_ena + * @vsi: pointer to VSI + * + * This function returns true if VSI is enabled for usage of flow-director + * otherwise returns false. 
This is controlled thru' ethtool priv-flag + * 'channel-inline-flow-director' + */ +static inline bool ice_vsi_inline_fd_ena(struct ice_vsi *vsi) +{ + return !!test_bit(ICE_CHNL_FEATURE_INLINE_FD_ENA, vsi->features); +} + +static inline bool ice_vsi_inline_fd_mark_ena(struct ice_vsi *vsi) +{ + return !!test_bit(ICE_CHNL_FEATURE_INLINE_FD_MARK_ENA, vsi->features); +} + +/** + * ice_get_current_fd_cnt - Get total FD filters programmed for this VSI + * @vsi: ptr to VSI + */ +static inline u32 ice_get_current_fd_cnt(struct ice_vsi *vsi) +{ + u32 val; + + val = rd32(&vsi->back->hw, VSIQF_FD_CNT(vsi->vsi_num)); + + return (val & VSIQF_FD_CNT_FD_GCNT_M) + + ((val & VSIQF_FD_CNT_FD_BCNT_M) >> + VSIQF_FD_CNT_FD_BCNT_S); +} + +/** + * ice_read_cntr - read counter value using counter_index + * @pf: ptr to PF + * @counter_index: index of counter to be read + */ +static inline u64 ice_read_cntr(struct ice_pf *pf, u32 counter_index) +{ + /* Read the HW counter based on counter_index */ + return ((u64)rd32(&pf->hw, GLSTAT_FD_CNT0H(counter_index)) << 32) | + rd32(&pf->hw, GLSTAT_FD_CNT0L(counter_index)); +} + +/** + * ice_clear_cntr - initialize counter to zero + * @pf: ptr to PF + * @counter_index: index of counter to be initialized + */ +static inline void ice_clear_cntr(struct ice_pf *pf, u32 counter_index) +{ + /* Read the HW counter based on counter_index */ + wr32(&pf->hw, GLSTAT_FD_CNT0H(counter_index), 0); + wr32(&pf->hw, GLSTAT_FD_CNT0L(counter_index), 0); +} + +/** + * ice_is_vsi_fd_table_full - VSI specific FD table is full or not + * @vsi: ptr to VSI + * @cnt: fd count, specific to VSI + * + * Retutn true if HW FD table specific to VSI is full, otherwise false + */ +static inline bool ice_is_vsi_fd_table_full(struct ice_vsi *vsi, u32 cnt) +{ + u32 max_allowed_fltr_cnt; + + if (!cnt) + return false; + + if (!vsi->num_gfltr && !vsi->num_bfltr) + return false; + /* determine if 'cnt' reached max_allowed for specified VSI, + * if so, return HW table full for that specific VSI + */ + max_allowed_fltr_cnt = vsi->num_gfltr + vsi->num_bfltr - 1; + + return cnt >= max_allowed_fltr_cnt; +} + +#ifdef NETIF_F_HW_TC +/** + * ice_is_adq_active - any active ADQs + * @pf: pointer to PF + * + * This function returns true if there are any ADQs configured (which is + * determined by looking at VSI type (which should be VSI_PF), numtc, and + * TC_MQPRIO flag) otherwise return false + */ +static inline bool ice_is_adq_active(struct ice_pf *pf) +{ + struct ice_vsi *vsi; + + vsi = ice_get_main_vsi(pf); + if (!vsi) + return false; + + /* is ADQ configured */ + if (vsi->tc_cfg.numtc > ICE_CHNL_START_TC && + test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) + return true; + + return false; +} +#endif /* NETIF_F_HW_TC */ +/** + * ice_vsi_pkt_inspect_opt_ena - packet inspection based optimization is ON/OFF + * @vsi: pointer to VSI + * + * This function returns true if VSI is enabled for optimization based on + * control/data packet. By default, respective PF priv flags is ON (which user + * can change using ethtool if needed), hence by default VSI level feature + * flags is also ON. If user changes PF level priv flag after creating channel + * VSIs (aka ADQ VSI), those changes are not reflected in VSI level feature + * flag by design. 
+ */ +static inline bool ice_vsi_pkt_inspect_opt_ena(struct ice_vsi *vsi) +{ + return !!test_bit(ICE_CHNL_FEATURE_PKT_INSPECT_OPT_ENA, vsi->features); +} + +/** + * ice_vsi_pkt_process_bp_stop_ena - packet process ON/OFF from bp stop + * @vsi: pointer to VSI + * + * This function returns true if VSI is enabled for optimization to allow + * Tx/Rx cleanup from busy_poll_stop code path. There is an associated + * priv flag to control this feature and applicable only for channel (aka ADQ) + * specific vectors + */ +static inline bool ice_vsi_pkt_process_bp_stop_ena(struct ice_vsi *vsi) +{ + return !!test_bit(ICE_CHNL_FEATURE_PKT_CLEAN_BP_STOP_ENA, + vsi->features); +} + +static inline bool ice_active_vmdqs(struct ice_pf *pf) +{ + return !!ice_find_first_vsi_by_type(pf, ICE_VSI_VMDQ2); +} + +#ifdef HAVE_NETDEV_SB_DEV +static inline bool ice_is_offloaded_macvlan_ena(struct ice_pf *pf) +{ + return test_bit(ICE_FLAG_MACVLAN_ENA, pf->flags); +} +#endif /* HAVE_NETDEV_SB_DEV */ + +#ifdef CONFIG_DEBUG_FS +void ice_debugfs_pf_init(struct ice_pf *pf); +void ice_debugfs_pf_exit(struct ice_pf *pf); +void ice_debugfs_init(void); +void ice_debugfs_exit(void); +#else +static inline void ice_debugfs_pf_init(struct ice_pf *pf) { } +static inline void ice_debugfs_pf_exit(struct ice_pf *pf) { } +static inline void ice_debugfs_init(void) { } +static inline void ice_debugfs_exit(void) { } +#endif /* CONFIG_DEBUG_FS */ + +bool netif_is_ice(struct net_device *dev); int ice_vsi_setup_tx_rings(struct ice_vsi *vsi); int ice_vsi_setup_rx_rings(struct ice_vsi *vsi); +int ice_vsi_open_ctrl(struct ice_vsi *vsi); +int ice_vsi_open(struct ice_vsi *vsi); +void ice_set_ethtool_repr_ops(struct net_device *netdev); void ice_set_ethtool_ops(struct net_device *netdev); +void ice_set_ethtool_recovery_ops(struct net_device *netdev); void ice_set_ethtool_safe_mode_ops(struct net_device *netdev); u16 ice_get_avail_txq_count(struct ice_pf *pf); u16 ice_get_avail_rxq_count(struct ice_pf *pf); -void ice_update_vsi_stats(struct ice_vsi *vsi); +int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx); void ice_update_pf_stats(struct ice_pf *pf); +void ice_update_vsi_stats(struct ice_vsi *vsi); int ice_up(struct ice_vsi *vsi); int ice_down(struct ice_vsi *vsi); int ice_vsi_cfg(struct ice_vsi *vsi); struct ice_vsi *ice_lb_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi); -int ice_set_rss(struct ice_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size); -int ice_get_rss(struct ice_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size); +#ifdef HAVE_NETDEV_SB_DEV +int ice_vsi_cfg_netdev_tc0(struct ice_vsi *vsi); +#endif /* HAVE_NETDEV_SB_DEV */ +#ifdef HAVE_XDP_SUPPORT +int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog); +int ice_destroy_xdp_rings(struct ice_vsi *vsi); +#ifndef NO_NDO_XDP_FLUSH +void ice_xdp_flush(struct net_device *dev); +#endif /* NO_NDO_XDP_FLUSH */ +#ifdef HAVE_XDP_FRAME_STRUCT +int +ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, + u32 flags); +#else +int ice_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp); +#endif /* HAVE_XDP_FRAME_STRUCT */ +#endif /* HAVE_XDP_SUPPORT */ +int ice_set_rss_lut(struct ice_vsi *vsi, u8 *lut, u16 lut_size); +int ice_get_rss_lut(struct ice_vsi *vsi, u8 *lut, u16 lut_size); +int ice_set_rss_key(struct ice_vsi *vsi, u8 *seed); +int ice_get_rss_key(struct ice_vsi *vsi, u8 *seed); void ice_fill_rss_lut(u8 *lut, u16 rss_table_size, u16 rss_size); +int ice_schedule_reset(struct ice_pf *pf, enum ice_reset_req reset); void ice_print_link_msg(struct ice_vsi 
*vsi, bool isup); -#ifdef CONFIG_DCB -int ice_pf_ena_all_vsi(struct ice_pf *pf, bool locked); -void ice_pf_dis_all_vsi(struct ice_pf *pf, bool locked); -#endif /* CONFIG_DCB */ +#if IS_ENABLED(CONFIG_MFD_CORE) +int ice_init_peer_devices(struct ice_pf *pf); +int +ice_for_each_peer(struct ice_pf *pf, void *data, + int (*fn)(struct ice_peer_obj_int *, void *)); +#ifdef CONFIG_PM +void ice_peer_refresh_msix(struct ice_pf *pf); +#endif /* CONFIG_PM */ +#else /* !CONFIG_MFD_CORE */ +static inline int ice_init_peer_devices(struct ice_pf *pf) { return 0; } + +static inline int +ice_for_each_peer(struct ice_pf *pf, void *data, + int (*fn)(struct ice_peer_obj_int *, void *)) +{ + return 0; +} + +#ifdef CONFIG_PM +static inline void ice_peer_refresh_msix(struct ice_pf *pf) { } +#endif /* CONFIG_PM */ +#endif /* !CONFIG_MFD_CORE */ +const char *ice_stat_str(enum ice_status stat_err); +const char *ice_aq_str(enum ice_aq_err aq_err); +bool ice_is_wol_supported(struct ice_hw *hw); +int ice_aq_wait_for_event(struct ice_pf *pf, u16 opcode, unsigned long timeout, + struct ice_rq_event_info *event); +int +ice_fdir_write_fltr(struct ice_pf *pf, struct ice_fdir_fltr *input, bool add, + bool is_tun); +void ice_vsi_manage_fdir(struct ice_vsi *vsi, bool ena); +int ice_add_ntuple_ethtool(struct ice_vsi *vsi, struct ethtool_rxnfc *cmd); +int ice_del_ntuple_ethtool(struct ice_vsi *vsi, struct ethtool_rxnfc *cmd); +int ice_get_ethtool_fdir_entry(struct ice_hw *hw, struct ethtool_rxnfc *cmd); +u32 ice_ntuple_get_max_fltr_cnt(struct ice_hw *hw); +int +ice_ntuple_set_input_set(struct ice_vsi *vsi, enum ice_block blk, + struct ethtool_rx_flow_spec *fsp, + struct ice_fdir_fltr *input); +int +ice_ntuple_l4_proto_to_port(enum ice_flow_seg_hdr l4_proto, + enum ice_flow_field *src_port, + enum ice_flow_field *dst_port); +int ice_ntuple_check_ip4_seg(struct ethtool_tcpip4_spec *tcp_ip4_spec); +int ice_ntuple_check_ip4_usr_seg(struct ethtool_usrip4_spec *usr_ip4_spec); +int +ice_get_fdir_fltr_ids(struct ice_hw *hw, struct ethtool_rxnfc *cmd, + u32 *rule_locs); +void ice_fdir_rem_adq_chnl(struct ice_hw *hw, u16 vsi_idx); +void ice_fdir_release_flows(struct ice_hw *hw); +void ice_fdir_replay_flows(struct ice_hw *hw); +void ice_fdir_replay_fltrs(struct ice_pf *pf); +int ice_fdir_create_dflt_rules(struct ice_pf *pf); +enum ice_fltr_ptype ice_ethtool_flow_to_fltr(int eth); +int +ice_ntuple_update_list_entry(struct ice_pf *pf, struct ice_fdir_fltr *input, + int fltr_idx); +void ice_update_ring_dest_vsi(struct ice_vsi *vsi, u16 *dest_vsi, u32 *ring); int ice_open(struct net_device *netdev); +int ice_open_internal(struct net_device *netdev); int ice_stop(struct net_device *netdev); - +void ice_service_task_schedule(struct ice_pf *pf); +int +ice_acl_add_rule_ethtool(struct ice_vsi *vsi, struct ethtool_rxnfc *cmd); +int ice_init_acl(struct ice_pf *pf); #endif /* _ICE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_acl.c b/drivers/net/ethernet/intel/ice/ice_acl.c new file mode 100644 index 000000000000..fcf0fea30e17 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_acl.c @@ -0,0 +1,592 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice_acl.h" +#include "ice_adminq_cmd.h" + +/** + * ice_aq_alloc_acl_tbl - allocate ACL table + * @hw: pointer to the HW struct + * @tbl: pointer to ice_acl_alloc_tbl struct + * @cd: pointer to command details structure or NULL + * + * Allocate ACL table (indirect 0x0C10) + */ +enum ice_status +ice_aq_alloc_acl_tbl(struct ice_hw *hw, struct ice_acl_alloc_tbl *tbl, + struct ice_sq_cd *cd) +{ + struct ice_aqc_acl_alloc_table *cmd; + struct ice_aq_desc desc; + + if (!tbl->act_pairs_per_entry) + return ICE_ERR_PARAM; + + if (tbl->act_pairs_per_entry > ICE_AQC_MAX_ACTION_MEMORIES) + return ICE_ERR_MAX_LIMIT; + + /* If this is concurrent table, then buffer shall be valid and + * contain DependentAllocIDs, 'num_dependent_alloc_ids' should be valid + * and within limit + */ + if (tbl->concurr) { + if (!tbl->num_dependent_alloc_ids) + return ICE_ERR_PARAM; + if (tbl->num_dependent_alloc_ids > + ICE_AQC_MAX_CONCURRENT_ACL_TBL) + return ICE_ERR_INVAL_SIZE; + } + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_alloc_acl_tbl); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + cmd = &desc.params.alloc_table; + cmd->table_width = cpu_to_le16(tbl->width * BITS_PER_BYTE); + cmd->table_depth = cpu_to_le16(tbl->depth); + cmd->act_pairs_per_entry = tbl->act_pairs_per_entry; + if (tbl->concurr) + cmd->table_type = tbl->num_dependent_alloc_ids; + + return ice_aq_send_cmd(hw, &desc, &tbl->buf, sizeof(tbl->buf), cd); +} + +/** + * ice_aq_dealloc_acl_tbl - deallocate ACL table + * @hw: pointer to the HW struct + * @alloc_id: allocation ID of the table being released + * @buf: address of indirect data buffer + * @cd: pointer to command details structure or NULL + * + * Deallocate ACL table (indirect 0x0C11) + * + * NOTE: This command has no buffer format for command itself but response + * format is 'struct ice_aqc_acl_generic', pass ptr to that struct + * as 'buf' and its size as 'buf_size' + */ +enum ice_status +ice_aq_dealloc_acl_tbl(struct ice_hw *hw, u16 alloc_id, + struct ice_aqc_acl_generic *buf, struct ice_sq_cd *cd) +{ + struct ice_aqc_acl_tbl_actpair *cmd; + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_dealloc_acl_tbl); + cmd = &desc.params.tbl_actpair; + cmd->alloc_id = cpu_to_le16(alloc_id); + + return ice_aq_send_cmd(hw, &desc, buf, sizeof(*buf), cd); +} + +static enum ice_status +ice_aq_acl_entry(struct ice_hw *hw, u16 opcode, u8 tcam_idx, u16 entry_idx, + struct ice_aqc_acl_data *buf, struct ice_sq_cd *cd) +{ + struct ice_aqc_acl_entry *cmd; + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, opcode); + + if (opcode == ice_aqc_opc_program_acl_entry) + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + cmd = &desc.params.program_query_entry; + cmd->tcam_index = tcam_idx; + cmd->entry_index = cpu_to_le16(entry_idx); + + return ice_aq_send_cmd(hw, &desc, buf, sizeof(*buf), cd); +} + +/** + * ice_aq_program_acl_entry - program ACL entry + * @hw: pointer to the HW struct + * @tcam_idx: Updated TCAM block index + * @entry_idx: updated entry index + * @buf: address of indirect data buffer + * @cd: pointer to command details structure or NULL + * + * Program ACL entry (direct 0x0C20) + */ +enum ice_status +ice_aq_program_acl_entry(struct ice_hw *hw, u8 tcam_idx, u16 entry_idx, + struct ice_aqc_acl_data *buf, struct ice_sq_cd *cd) +{ + return ice_aq_acl_entry(hw, ice_aqc_opc_program_acl_entry, tcam_idx, + entry_idx, buf, cd); +} + +/** + * ice_aq_query_acl_entry - query ACL entry + * @hw: pointer to the HW struct + * @tcam_idx: Updated TCAM block 
index + * @entry_idx: updated entry index + * @buf: address of indirect data buffer + * @cd: pointer to command details structure or NULL + * + * Query ACL entry (direct 0x0C24) + * + * NOTE: Caller of this API to parse 'buf' appropriately since it contains + * response (key and key invert) + */ +enum ice_status +ice_aq_query_acl_entry(struct ice_hw *hw, u8 tcam_idx, u16 entry_idx, + struct ice_aqc_acl_data *buf, struct ice_sq_cd *cd) +{ + return ice_aq_acl_entry(hw, ice_aqc_opc_query_acl_entry, tcam_idx, + entry_idx, buf, cd); +} + +/* Helper function to alloc/dealloc ACL action pair */ +static enum ice_status +ice_aq_actpair_a_d(struct ice_hw *hw, u16 opcode, u16 alloc_id, + struct ice_aqc_acl_generic *buf, struct ice_sq_cd *cd) +{ + struct ice_aqc_acl_tbl_actpair *cmd; + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, opcode); + cmd = &desc.params.tbl_actpair; + cmd->alloc_id = cpu_to_le16(alloc_id); + + return ice_aq_send_cmd(hw, &desc, buf, sizeof(*buf), cd); +} + +/** + * ice_aq_alloc_actpair - allocate actionpair for specified ACL table + * @hw: pointer to the HW struct + * @alloc_id: allocation ID of the table being associated with the actionpair + * @buf: address of indirect data buffer + * @cd: pointer to command details structure or NULL + * + * Allocate ACL actionpair (direct 0x0C12) + * + * This command doesn't need and doesn't have its own command buffer + * but for response format is as specified in 'struct ice_aqc_acl_generic' + */ +enum ice_status +ice_aq_alloc_actpair(struct ice_hw *hw, u16 alloc_id, + struct ice_aqc_acl_generic *buf, struct ice_sq_cd *cd) +{ + return ice_aq_actpair_a_d(hw, ice_aqc_opc_alloc_acl_actpair, alloc_id, + buf, cd); +} + +/** + * ice_aq_dealloc_actpair - dealloc actionpair for specified ACL table + * @hw: pointer to the HW struct + * @alloc_id: allocation ID of the table being associated with the actionpair + * @buf: address of indirect data buffer + * @cd: pointer to command details structure or NULL + * + * Deallocate ACL actionpair (direct 0x0C13) + */ +enum ice_status +ice_aq_dealloc_actpair(struct ice_hw *hw, u16 alloc_id, + struct ice_aqc_acl_generic *buf, struct ice_sq_cd *cd) +{ + return ice_aq_actpair_a_d(hw, ice_aqc_opc_dealloc_acl_actpair, alloc_id, + buf, cd); +} + +/* Helper function to program/query ACL action pair */ +static enum ice_status +ice_aq_actpair_p_q(struct ice_hw *hw, u16 opcode, u8 act_mem_idx, + u16 act_entry_idx, struct ice_aqc_actpair *buf, + struct ice_sq_cd *cd) +{ + struct ice_aqc_acl_actpair *cmd; + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, opcode); + + if (opcode == ice_aqc_opc_program_acl_actpair) + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + cmd = &desc.params.program_query_actpair; + cmd->act_mem_index = act_mem_idx; + cmd->act_entry_index = cpu_to_le16(act_entry_idx); + + return ice_aq_send_cmd(hw, &desc, buf, sizeof(*buf), cd); +} + +/** + * ice_aq_program_actpair - program ACL actionpair + * @hw: pointer to the HW struct + * @act_mem_idx: action memory index to program/update/query + * @act_entry_idx: the entry index in action memory to be programmed/updated + * @buf: address of indirect data buffer + * @cd: pointer to command details structure or NULL + * + * Program action entries (indirect 0x0C1C) + */ +enum ice_status +ice_aq_program_actpair(struct ice_hw *hw, u8 act_mem_idx, u16 act_entry_idx, + struct ice_aqc_actpair *buf, struct ice_sq_cd *cd) +{ + return ice_aq_actpair_p_q(hw, ice_aqc_opc_program_acl_actpair, + act_mem_idx, act_entry_idx, buf, cd); +} + 
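The entry and actionpair wrappers above all funnel through a single opcode-parameterized helper, and ICE_AQ_FLAG_RD is set only for the "program" opcodes that actually carry a buffer to firmware. A hedged usage sketch of the entry wrappers (driver context assumed: an initialized struct ice_hw and a previously allocated TCAM slot; ice_acl_prog_and_verify() is an illustrative helper, not part of the patch; error handling abbreviated):

#include "ice_acl.h"

/* Illustrative only: program TCAM entry 'entry_idx' of TCAM 'tcam_idx'
 * with an all-zero key, then read it back for verification.
 */
static enum ice_status ice_acl_prog_and_verify(struct ice_hw *hw, u8 tcam_idx,
					       u16 entry_idx)
{
	struct ice_aqc_acl_data data = { 0 };	/* key and key invert */
	enum ice_status status;

	status = ice_aq_program_acl_entry(hw, tcam_idx, entry_idx, &data, NULL);
	if (status)
		return status;

	/* read back; the caller parses data.entry_key/entry_key_invert */
	return ice_aq_query_acl_entry(hw, tcam_idx, entry_idx, &data, NULL);
}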
+/** + * ice_aq_query_actpair - query ACL actionpair + * @hw: pointer to the HW struct + * @act_mem_idx: action memory index to program/update/query + * @act_entry_idx: the entry index in action memory to be programmed/updated + * @buf: address of indirect data buffer + * @cd: pointer to command details structure or NULL + * + * Query ACL actionpair (indirect 0x0C25) + */ +enum ice_status +ice_aq_query_actpair(struct ice_hw *hw, u8 act_mem_idx, u16 act_entry_idx, + struct ice_aqc_actpair *buf, struct ice_sq_cd *cd) +{ + return ice_aq_actpair_p_q(hw, ice_aqc_opc_query_acl_actpair, + act_mem_idx, act_entry_idx, buf, cd); +} + +/** + * ice_aq_dealloc_acl_res - deallocate ACL resources + * @hw: pointer to the HW struct + * @cd: pointer to command details structure or NULL + * + * De-allocate ACL resources (direct 0x0C1A). Used by SW to release all the + * resources allocated for it using a single command + */ +enum ice_status ice_aq_dealloc_acl_res(struct ice_hw *hw, struct ice_sq_cd *cd) +{ + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_dealloc_acl_res); + + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} + +/** + * ice_acl_prof_aq_send - sending ACL profile AQ commands + * @hw: pointer to the HW struct + * @opc: command opcode + * @prof_id: profile ID + * @buf: ptr to buffer + * @cd: pointer to command details structure or NULL + * + * This function sends ACL profile commands + */ +static enum ice_status +ice_acl_prof_aq_send(struct ice_hw *hw, u16 opc, u8 prof_id, + struct ice_aqc_acl_prof_generic_frmt *buf, + struct ice_sq_cd *cd) +{ + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, opc); + desc.params.profile.profile_id = prof_id; + if (opc == ice_aqc_opc_program_acl_prof_extraction || + opc == ice_aqc_opc_program_acl_prof_ranges) + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + return ice_aq_send_cmd(hw, &desc, buf, sizeof(*buf), cd); +} + +/** + * ice_prgm_acl_prof_xtrct - program ACL profile extraction sequence + * @hw: pointer to the HW struct + * @prof_id: profile ID + * @buf: ptr to buffer + * @cd: pointer to command details structure or NULL + * + * Program ACL profile extraction (indirect 0x0C1D) + */ +enum ice_status +ice_prgm_acl_prof_xtrct(struct ice_hw *hw, u8 prof_id, + struct ice_aqc_acl_prof_generic_frmt *buf, + struct ice_sq_cd *cd) +{ + return ice_acl_prof_aq_send(hw, ice_aqc_opc_program_acl_prof_extraction, + prof_id, buf, cd); +} + +/** + * ice_query_acl_prof - query ACL profile + * @hw: pointer to the HW struct + * @prof_id: profile ID + * @buf: ptr to buffer (which will contain response of this command) + * @cd: pointer to command details structure or NULL + * + * Query ACL profile (indirect 0x0C21) + */ +enum ice_status +ice_query_acl_prof(struct ice_hw *hw, u8 prof_id, + struct ice_aqc_acl_prof_generic_frmt *buf, + struct ice_sq_cd *cd) +{ + return ice_acl_prof_aq_send(hw, ice_aqc_opc_query_acl_prof, prof_id, + buf, cd); +} + +/** + * ice_aq_acl_cntrs_chk_params - Checks ACL counter parameters + * @cntrs: ptr to buffer describing input and output params + * + * This function checks the counter bank range for counter type and returns + * success or failure. 
+ */ +static enum ice_status ice_aq_acl_cntrs_chk_params(struct ice_acl_cntrs *cntrs) +{ + enum ice_status status = 0; + + if (!cntrs || !cntrs->amount) + return ICE_ERR_PARAM; + + switch (cntrs->type) { + case ICE_AQC_ACL_CNT_TYPE_SINGLE: + /* Single counter type - configured to count either bytes + * or packets, the valid values for byte or packet counters + * shall be 0-3. + */ + if (cntrs->bank > ICE_AQC_ACL_MAX_CNT_SINGLE) + status = ICE_ERR_OUT_OF_RANGE; + break; + case ICE_AQC_ACL_CNT_TYPE_DUAL: + /* Pair counter type - counts number of bytes and packets + * The valid values for byte/packet counter duals shall be 0-1 + */ + if (cntrs->bank > ICE_AQC_ACL_MAX_CNT_DUAL) + status = ICE_ERR_OUT_OF_RANGE; + break; + default: + /* Unspecified counter type - Invalid or error */ + status = ICE_ERR_PARAM; + } + + return status; +} + +/** + * ice_aq_alloc_acl_cntrs - allocate ACL counters + * @hw: pointer to the HW struct + * @cntrs: ptr to buffer describing input and output params + * @cd: pointer to command details structure or NULL + * + * Allocate ACL counters (indirect 0x0C16). This function attempts to + * allocate a contiguous block of counters. In case of failures, caller can + * attempt to allocate a smaller chunk. The allocation is considered + * unsuccessful if returned counter value is invalid. In this case it returns + * an error otherwise success. + */ +enum ice_status +ice_aq_alloc_acl_cntrs(struct ice_hw *hw, struct ice_acl_cntrs *cntrs, + struct ice_sq_cd *cd) +{ + struct ice_aqc_acl_alloc_counters *cmd; + u16 first_cntr, last_cntr; + struct ice_aq_desc desc; + enum ice_status status; + + /* check for invalid params */ + status = ice_aq_acl_cntrs_chk_params(cntrs); + if (status) + return status; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_alloc_acl_counters); + cmd = &desc.params.alloc_counters; + cmd->counter_amount = cntrs->amount; + cmd->counters_type = cntrs->type; + cmd->bank_alloc = cntrs->bank; + status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + if (!status) { + first_cntr = le16_to_cpu(cmd->ops.resp.first_counter); + last_cntr = le16_to_cpu(cmd->ops.resp.last_counter); + if (first_cntr == ICE_AQC_ACL_ALLOC_CNT_INVAL || + last_cntr == ICE_AQC_ACL_ALLOC_CNT_INVAL) + return ICE_ERR_OUT_OF_RANGE; + cntrs->first_cntr = first_cntr; + cntrs->last_cntr = last_cntr; + } + return status; +} + +/** + * ice_aq_dealloc_acl_cntrs - deallocate ACL counters + * @hw: pointer to the HW struct + * @cntrs: ptr to buffer describing input and output params + * @cd: pointer to command details structure or NULL + * + * De-allocate ACL counters (direct 0x0C17) + */ +enum ice_status +ice_aq_dealloc_acl_cntrs(struct ice_hw *hw, struct ice_acl_cntrs *cntrs, + struct ice_sq_cd *cd) +{ + struct ice_aqc_acl_dealloc_counters *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + /* check for invalid params */ + status = ice_aq_acl_cntrs_chk_params(cntrs); + if (status) + return status; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_dealloc_acl_counters); + cmd = &desc.params.dealloc_counters; + cmd->first_counter = cpu_to_le16(cntrs->first_cntr); + cmd->last_counter = cpu_to_le16(cntrs->last_cntr); + cmd->counters_type = cntrs->type; + cmd->bank_alloc = cntrs->bank; + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} + + +/** + * ice_prog_acl_prof_ranges - program ACL profile ranges + * @hw: pointer to the HW struct + * @prof_id: programmed or updated profile ID + * @buf: pointer to input buffer + * @cd: pointer to command details structure or NULL + * + * Program ACL profile 
ranges (indirect 0x0C1E) + */ +enum ice_status +ice_prog_acl_prof_ranges(struct ice_hw *hw, u8 prof_id, + struct ice_aqc_acl_profile_ranges *buf, + struct ice_sq_cd *cd) +{ + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, + ice_aqc_opc_program_acl_prof_ranges); + desc.params.profile.profile_id = prof_id; + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + return ice_aq_send_cmd(hw, &desc, buf, sizeof(*buf), cd); +} + +/** + * ice_query_acl_prof_ranges - query ACL profile ranges + * @hw: pointer to the HW struct + * @prof_id: programmed or updated profile ID + * @buf: pointer to response buffer + * @cd: pointer to command details structure or NULL + * + * Query ACL profile ranges (indirect 0x0C22) + */ +enum ice_status +ice_query_acl_prof_ranges(struct ice_hw *hw, u8 prof_id, + struct ice_aqc_acl_profile_ranges *buf, + struct ice_sq_cd *cd) +{ + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, + ice_aqc_opc_query_acl_prof_ranges); + desc.params.profile.profile_id = prof_id; + return ice_aq_send_cmd(hw, &desc, buf, sizeof(*buf), cd); +} + +/** + * ice_aq_alloc_acl_scen - allocate ACL scenario + * @hw: pointer to the HW struct + * @scen_id: memory location to receive allocated scenario ID + * @buf: address of indirect data buffer + * @cd: pointer to command details structure or NULL + * + * Allocate ACL scenario (indirect 0x0C14) + */ +enum ice_status +ice_aq_alloc_acl_scen(struct ice_hw *hw, u16 *scen_id, + struct ice_aqc_acl_scen *buf, struct ice_sq_cd *cd) +{ + struct ice_aqc_acl_alloc_scen *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + if (!scen_id) + return ICE_ERR_PARAM; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_alloc_acl_scen); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + cmd = &desc.params.alloc_scen; + + status = ice_aq_send_cmd(hw, &desc, buf, sizeof(*buf), cd); + if (!status) + *scen_id = le16_to_cpu(cmd->ops.resp.scen_id); + + return status; +} + +/** + * ice_aq_dealloc_acl_scen - deallocate ACL scenario + * @hw: pointer to the HW struct + * @scen_id: scen_id to be deallocated (input and output field) + * @cd: pointer to command details structure or NULL + * + * Deallocate ACL scenario (direct 0x0C15) + */ +enum ice_status +ice_aq_dealloc_acl_scen(struct ice_hw *hw, u16 scen_id, struct ice_sq_cd *cd) +{ + struct ice_aqc_acl_dealloc_scen *cmd; + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_dealloc_acl_scen); + cmd = &desc.params.dealloc_scen; + cmd->scen_id = cpu_to_le16(scen_id); + + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} + +/** + * ice_aq_update_query_scen - update or query ACL scenario + * @hw: pointer to the HW struct + * @opcode: AQ command opcode for either query or update scenario + * @scen_id: scen_id to be updated or queried + * @buf: address of indirect data buffer + * @cd: pointer to command details structure or NULL + * + * Calls update or query ACL scenario + */ +static enum ice_status +ice_aq_update_query_scen(struct ice_hw *hw, u16 opcode, u16 scen_id, + struct ice_aqc_acl_scen *buf, struct ice_sq_cd *cd) +{ + struct ice_aqc_acl_update_query_scen *cmd; + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, opcode); + if (opcode == ice_aqc_opc_update_acl_scen) + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + cmd = &desc.params.update_query_scen; + cmd->scen_id = cpu_to_le16(scen_id); + + return ice_aq_send_cmd(hw, &desc, buf, sizeof(*buf), cd); +} + +/** + * ice_aq_update_acl_scen - update ACL scenario + * @hw: pointer to the HW struct + * @scen_id: 
scen_id to be updated + * @buf: address of indirect data buffer + * @cd: pointer to command details structure or NULL + * + * Update ACL scenario (indirect 0x0C1B) + */ +enum ice_status +ice_aq_update_acl_scen(struct ice_hw *hw, u16 scen_id, + struct ice_aqc_acl_scen *buf, struct ice_sq_cd *cd) +{ + return ice_aq_update_query_scen(hw, ice_aqc_opc_update_acl_scen, + scen_id, buf, cd); +} + +/** + * ice_aq_query_acl_scen - query ACL scenario + * @hw: pointer to the HW struct + * @scen_id: scen_id to be queried + * @buf: address of indirect data buffer + * @cd: pointer to command details structure or NULL + * + * Query ACL scenario (indirect 0x0C23) + */ +enum ice_status +ice_aq_query_acl_scen(struct ice_hw *hw, u16 scen_id, + struct ice_aqc_acl_scen *buf, struct ice_sq_cd *cd) +{ + return ice_aq_update_query_scen(hw, ice_aqc_opc_query_acl_scen, + scen_id, buf, cd); +} diff --git a/drivers/net/ethernet/intel/ice/ice_acl.h b/drivers/net/ethernet/intel/ice/ice_acl.h new file mode 100644 index 000000000000..bdfe2681935c --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_acl.h @@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_ACL_H_ +#define _ICE_ACL_H_ + +#include "ice_common.h" +#include "ice_adminq_cmd.h" + +struct ice_acl_tbl_params { + u16 width; /* Select/match bytes */ + u16 depth; /* Number of entries */ + +#define ICE_ACL_TBL_MAX_DEP_TBLS 15 + u16 dep_tbls[ICE_ACL_TBL_MAX_DEP_TBLS]; + + u8 entry_act_pairs; /* Action pairs per entry */ + u8 concurr; /* Concurrent table lookup enable */ +}; + +struct ice_acl_act_mem { + u8 act_mem; +#define ICE_ACL_ACT_PAIR_MEM_INVAL 0xff + u8 member_of_tcam; +}; + +struct ice_acl_tbl { + /* TCAM configuration */ + u8 first_tcam; /* Index of the first TCAM block */ + u8 last_tcam; /* Index of the last TCAM block */ + /* Index of the first entry in the first TCAM */ + u16 first_entry; + /* Index of the last entry in the last TCAM */ + u16 last_entry; + + /* List of active scenarios */ + struct list_head scens; + + struct ice_acl_tbl_params info; + struct ice_acl_act_mem act_mems[ICE_AQC_MAX_ACTION_MEMORIES]; + + /* Keep track of available 64-entry chunks in TCAMs */ + DECLARE_BITMAP(avail, ICE_AQC_ACL_ALLOC_UNITS); + + u16 id; +}; + +#define ICE_MAX_ACL_TCAM_ENTRY (ICE_AQC_ACL_TCAM_DEPTH * ICE_AQC_ACL_SLICES) +enum ice_acl_entry_prio { + ICE_ACL_PRIO_LOW = 0, + ICE_ACL_PRIO_NORMAL, + ICE_ACL_PRIO_HIGH, + ICE_ACL_MAX_PRIO +}; + +/* Scenario structure + * A scenario is a logical partition within an ACL table. It can span more + * than one TCAM in cascade mode to support select/mask key widths larger. + * than the width of a TCAM. It can also span more than one TCAM in stacked + * mode to support larger number of entries than what a TCAM can hold. It is + * used to select values from selection bases (field vectors holding extract + * protocol header fields) to form lookup keys, and to associate action memory + * banks to the TCAMs used. 
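To make the table geometry above concrete: the width in ice_acl_tbl_params is rounded up to whole TCAM key widths and the depth to whole 64-entry allocation units before slices are claimed (see ice_acl_create_tbl() later in this patch). A standalone sketch of that sizing; the 5-byte key width and 512-entry TCAM depth are inferred from comments elsewhere in this patch (reserved byte indices 2/3/4, 8 chunks of 64 entries per TCAM), not quoted from a header:

#include <stdio.h>

#define ACL_KEY_WIDTH_BYTES	5	/* inferred: bytes 2/3/4 are "the last three" */
#define ACL_TCAM_DEPTH		512	/* inferred: 8 chunks of 64 entries */
#define ACL_ENTRY_ALLOC_UNIT	64

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define ROUND_UP(x, y)		(DIV_ROUND_UP(x, y) * (y))

int main(void)
{
	unsigned int req_width = 12;	/* bytes the caller wants to match */
	unsigned int req_depth = 1000;	/* entries requested */

	/* same rounding the driver applies before allocating slices */
	unsigned int width = ROUND_UP(req_width, ACL_KEY_WIDTH_BYTES);	/* 15 */
	unsigned int depth = ROUND_UP(req_depth, ACL_ENTRY_ALLOC_UNIT);	/* 1024 */

	unsigned int cascade = width / ACL_KEY_WIDTH_BYTES;		/* 3 */
	unsigned int stacked = DIV_ROUND_UP(depth, ACL_TCAM_DEPTH);	/* 2 */

	/* 3 x 2 = 6 TCAM slices would be consumed by this table */
	printf("width=%u depth=%u -> %u x %u TCAMs\n",
	       width, depth, cascade, stacked);
	return 0;
}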
+ */ +struct ice_acl_scen { + struct list_head list_entry; + /* If nth bit of act_mem_bitmap is set, then nth action memory will + * participate in this scenario + */ + DECLARE_BITMAP(act_mem_bitmap, ICE_AQC_MAX_ACTION_MEMORIES); + + /* If nth bit of entry_bitmap is set, then nth entry will + * be available in this scenario + */ + DECLARE_BITMAP(entry_bitmap, ICE_MAX_ACL_TCAM_ENTRY); + u16 first_idx[ICE_ACL_MAX_PRIO]; + u16 last_idx[ICE_ACL_MAX_PRIO]; + + u16 id; + u16 start; /* Number of entry from the start of the parent table */ +#define ICE_ACL_SCEN_MIN_WIDTH 0x3 + u16 width; /* Number of select/mask bytes */ + u16 num_entry; /* Number of scenario entry */ + u16 end; /* Last addressable entry from start of table */ + u8 eff_width; /* Available width in bytes to match */ +#define ICE_ACL_SCEN_PKT_DIR_IDX_IN_TCAM 0x2 +#define ICE_ACL_SCEN_PID_IDX_IN_TCAM 0x3 +#define ICE_ACL_SCEN_RNG_CHK_IDX_IN_TCAM 0x4 + u8 pid_idx; /* Byte index used to match profile ID */ + u8 rng_chk_idx; /* Byte index used to match range checkers result */ + u8 pkt_dir_idx; /* Byte index used to match packet direction */ +}; + +/* This structure represents input fields needed to allocate ACL table */ +struct ice_acl_alloc_tbl { + /* Table's width in number of bytes matched */ + u16 width; + /* Table's depth in number of entries. */ + u16 depth; + u8 num_dependent_alloc_ids; /* number of depdendent alloc IDs */ + u8 concurr; /* true for concurrent table type */ + + /* Amount of action pairs per table entry. Minimal valid + * value for this field is 1 (e.g. single pair of actions) + */ + u8 act_pairs_per_entry; + union { + struct ice_aqc_acl_alloc_table_data data_buf; + struct ice_aqc_acl_generic resp_buf; + } buf; +}; + +/* This structure is used to communicate input and output params for + * [de]allocate_acl_counters + */ +struct ice_acl_cntrs { + u8 amount; + u8 type; + u8 bank; + + /* Next 2 variables are used for output in case of alloc_acl_counters + * and input in case of deallocate_acl_counters + */ + u16 first_cntr; + u16 last_cntr; +}; + +enum ice_status +ice_acl_create_tbl(struct ice_hw *hw, struct ice_acl_tbl_params *params); +enum ice_status ice_acl_destroy_tbl(struct ice_hw *hw); +enum ice_status +ice_acl_create_scen(struct ice_hw *hw, u16 match_width, u16 num_entries, + u16 *scen_id); +enum ice_status +ice_aq_alloc_acl_tbl(struct ice_hw *hw, struct ice_acl_alloc_tbl *tbl, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_dealloc_acl_tbl(struct ice_hw *hw, u16 alloc_id, + struct ice_aqc_acl_generic *buf, struct ice_sq_cd *cd); +enum ice_status +ice_aq_program_acl_entry(struct ice_hw *hw, u8 tcam_idx, u16 entry_idx, + struct ice_aqc_acl_data *buf, struct ice_sq_cd *cd); +enum ice_status +ice_aq_query_acl_entry(struct ice_hw *hw, u8 tcam_idx, u16 entry_idx, + struct ice_aqc_acl_data *buf, struct ice_sq_cd *cd); +enum ice_status +ice_aq_alloc_actpair(struct ice_hw *hw, u16 alloc_id, + struct ice_aqc_acl_generic *buf, struct ice_sq_cd *cd); +enum ice_status +ice_aq_dealloc_actpair(struct ice_hw *hw, u16 alloc_id, + struct ice_aqc_acl_generic *buf, struct ice_sq_cd *cd); +enum ice_status +ice_aq_program_actpair(struct ice_hw *hw, u8 act_mem_idx, u16 act_entry_idx, + struct ice_aqc_actpair *buf, struct ice_sq_cd *cd); +enum ice_status +ice_aq_query_actpair(struct ice_hw *hw, u8 act_mem_idx, u16 act_entry_idx, + struct ice_aqc_actpair *buf, struct ice_sq_cd *cd); +enum ice_status ice_aq_dealloc_acl_res(struct ice_hw *hw, struct ice_sq_cd *cd); +enum ice_status +ice_prgm_acl_prof_xtrct(struct ice_hw *hw, u8 
prof_id, + struct ice_aqc_acl_prof_generic_frmt *buf, + struct ice_sq_cd *cd); +enum ice_status +ice_query_acl_prof(struct ice_hw *hw, u8 prof_id, + struct ice_aqc_acl_prof_generic_frmt *buf, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_alloc_acl_cntrs(struct ice_hw *hw, struct ice_acl_cntrs *cntrs, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_dealloc_acl_cntrs(struct ice_hw *hw, struct ice_acl_cntrs *cntrs, + struct ice_sq_cd *cd); +enum ice_status +ice_prog_acl_prof_ranges(struct ice_hw *hw, u8 prof_id, + struct ice_aqc_acl_profile_ranges *buf, + struct ice_sq_cd *cd); +enum ice_status +ice_query_acl_prof_ranges(struct ice_hw *hw, u8 prof_id, + struct ice_aqc_acl_profile_ranges *buf, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_alloc_acl_scen(struct ice_hw *hw, u16 *scen_id, + struct ice_aqc_acl_scen *buf, struct ice_sq_cd *cd); +enum ice_status +ice_aq_dealloc_acl_scen(struct ice_hw *hw, u16 scen_id, struct ice_sq_cd *cd); +enum ice_status +ice_aq_update_acl_scen(struct ice_hw *hw, u16 scen_id, + struct ice_aqc_acl_scen *buf, struct ice_sq_cd *cd); +enum ice_status +ice_aq_query_acl_scen(struct ice_hw *hw, u16 scen_id, + struct ice_aqc_acl_scen *buf, struct ice_sq_cd *cd); +enum ice_status +ice_acl_add_entry(struct ice_hw *hw, struct ice_acl_scen *scen, + enum ice_acl_entry_prio prio, u8 *keys, u8 *inverts, + struct ice_acl_act_entry *acts, u8 acts_cnt, u16 *entry_idx); +enum ice_status +ice_acl_prog_act(struct ice_hw *hw, struct ice_acl_scen *scen, + struct ice_acl_act_entry *acts, u8 acts_cnt, u16 entry_idx); +enum ice_status +ice_acl_rem_entry(struct ice_hw *hw, struct ice_acl_scen *scen, u16 entry_idx); +bool ice_is_acl_empty(struct ice_hw *hw); +#endif /* _ICE_ACL_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_acl_ctrl.c b/drivers/net/ethernet/intel/ice/ice_acl_ctrl.c new file mode 100644 index 000000000000..a777d215f1b9 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_acl_ctrl.c @@ -0,0 +1,1171 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice_acl.h" +#include "ice_flow.h" + + +/* Determine the TCAM index of entry 'e' within the ACL table */ +#define ICE_ACL_TBL_TCAM_IDX(e) ((e) / ICE_AQC_ACL_TCAM_DEPTH) + +/* Determine the entry index within the TCAM */ +#define ICE_ACL_TBL_TCAM_ENTRY_IDX(e) ((e) % ICE_AQC_ACL_TCAM_DEPTH) + +#define ICE_ACL_SCEN_ENTRY_INVAL 0xFFFF + +/** + * ice_acl_init_entry + * @scen: pointer to the scenario struct + * + * Initialize the scenario control structure. 
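ice_acl_init_entry() just below splits a scenario into three priority regions: high priority grows up from index 0 over the bottom 25%, low priority grows down from the top 25%, and normal priority takes the middle 50%. A standalone re-derivation of those boundaries for a 256-entry scenario, using the same arithmetic (all names local to the sketch):

#include <stdio.h>

int main(void)
{
	unsigned int num_entry = 256;

	/* low: top 25%, scanned downward from the highest index */
	unsigned int low_first  = num_entry - 1;			/* 255 */
	unsigned int low_last   = num_entry - num_entry / 4;		/* 192 */
	/* normal: middle 50%, also scanned downward */
	unsigned int norm_first = num_entry - num_entry / 4 - 1;	/* 191 */
	unsigned int norm_last  = num_entry / 4;			/*  64 */
	/* high: bottom 25%, scanned upward from index 0 */
	unsigned int high_first = 0;
	unsigned int high_last  = num_entry / 4 - 1;			/*  63 */

	printf("low %u..%u, normal %u..%u, high %u..%u\n",
	       low_first, low_last, norm_first, norm_last,
	       high_first, high_last);
	return 0;
}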
+ */ +static void ice_acl_init_entry(struct ice_acl_scen *scen) +{ + /* low priority: start from the highest index, 25% of total entries + * normal priority: start from the highest index, 50% of total entries + * high priority: start from the lowest index, 25% of total entries + */ + scen->first_idx[ICE_ACL_PRIO_LOW] = scen->num_entry - 1; + scen->first_idx[ICE_ACL_PRIO_NORMAL] = scen->num_entry - + scen->num_entry / 4 - 1; + scen->first_idx[ICE_ACL_PRIO_HIGH] = 0; + + scen->last_idx[ICE_ACL_PRIO_LOW] = scen->num_entry - + scen->num_entry / 4; + scen->last_idx[ICE_ACL_PRIO_NORMAL] = scen->num_entry / 4; + scen->last_idx[ICE_ACL_PRIO_HIGH] = scen->num_entry / 4 - 1; +} + +/** + * ice_acl_scen_assign_entry_idx + * @scen: pointer to the scenario struct + * @prio: the priority of the flow entry being allocated + * + * To find the index of an available entry in scenario + * + * Returns ICE_ACL_SCEN_ENTRY_INVAL if fails + * Returns index on success + */ +static u16 +ice_acl_scen_assign_entry_idx(struct ice_acl_scen *scen, + enum ice_acl_entry_prio prio) +{ + u16 first_idx, last_idx, i; + s8 step; + + if (prio >= ICE_ACL_MAX_PRIO) + return ICE_ACL_SCEN_ENTRY_INVAL; + + first_idx = scen->first_idx[prio]; + last_idx = scen->last_idx[prio]; + step = first_idx <= last_idx ? 1 : -1; + + for (i = first_idx; i != last_idx + step; i += step) + if (!test_and_set_bit(i, scen->entry_bitmap)) + return i; + + return ICE_ACL_SCEN_ENTRY_INVAL; +} + +/** + * ice_acl_scen_free_entry_idx + * @scen: pointer to the scenario struct + * @idx: the index of the flow entry being de-allocated + * + * To mark an entry available in scenario + */ +static enum ice_status +ice_acl_scen_free_entry_idx(struct ice_acl_scen *scen, u16 idx) +{ + if (idx >= scen->num_entry) + return ICE_ERR_MAX_LIMIT; + + if (!test_and_clear_bit(idx, scen->entry_bitmap)) + return ICE_ERR_DOES_NOT_EXIST; + + return 0; +} + +/** + * ice_acl_tbl_calc_end_idx + * @start: start index of the TCAM entry of this partition + * @num_entries: number of entries in this partition + * @width: width of a partition in number of TCAMs + * + * Calculate the end entry index for a partition with starting entry index + * 'start', entries 'num_entries', and width 'width'. + */ +static u16 ice_acl_tbl_calc_end_idx(u16 start, u16 num_entries, u16 width) +{ + u16 end_idx, add_entries = 0; + + end_idx = start + (num_entries - 1); + + /* In case that our ACL partition requires cascading TCAMs */ + if (width > 1) { + u16 num_stack_level; + + /* Figure out the TCAM stacked level in this ACL scenario */ + num_stack_level = (start % ICE_AQC_ACL_TCAM_DEPTH) + + num_entries; + num_stack_level = DIV_ROUND_UP(num_stack_level, + ICE_AQC_ACL_TCAM_DEPTH); + + /* In this case, each entries in our ACL partition span + * multiple TCAMs. Thus, we will need to add + * ((width - 1) * num_stack_level) TCAM's entries to + * end_idx. + * + * For example : In our case, our scenario is 2x2: + * [TCAM 0] [TCAM 1] + * [TCAM 2] [TCAM 3] + * Assuming that a TCAM will have 512 entries. If "start" + * is 500, "num_entries" is 3 and "width" = 2, then end_idx + * should be 1024 (belongs to TCAM 2). + * Before going to this if statement, end_idx will have the + * value of 512. If "width" is 1, then the final value of + * end_idx is 512. However, in our case, width is 2, then we + * will need add (2 - 1) * 1 * 512. As result, end_idx will + * have the value of 1024. 
+ */ + add_entries = (width - 1) * num_stack_level * + ICE_AQC_ACL_TCAM_DEPTH; + } + + return end_idx + add_entries; +} + +/** + * ice_acl_init_tbl + * @hw: pointer to the hardware structure + * + * Initialize the ACL table by invalidating TCAM entries and action pairs. + */ +static enum ice_status ice_acl_init_tbl(struct ice_hw *hw) +{ + struct ice_aqc_actpair act_buf; + struct ice_aqc_acl_data buf; + enum ice_status status = 0; + struct ice_acl_tbl *tbl; + u8 tcam_idx, i; + u16 idx; + + tbl = hw->acl_tbl; + if (!tbl) + return ICE_ERR_CFG; + + memset(&buf, 0, sizeof(buf)); + memset(&act_buf, 0, sizeof(act_buf)); + + tcam_idx = tbl->first_tcam; + idx = tbl->first_entry; + while (tcam_idx < tbl->last_tcam || + (tcam_idx == tbl->last_tcam && idx <= tbl->last_entry)) { + /* Use the same value for entry_key and entry_key_inv since + * we are initializing the fields to 0 + */ + status = ice_aq_program_acl_entry(hw, tcam_idx, idx, &buf, + NULL); + if (status) + return status; + + if (++idx > tbl->last_entry) { + tcam_idx++; + idx = tbl->first_entry; + } + } + + for (i = 0; i < ICE_AQC_MAX_ACTION_MEMORIES; i++) { + u16 act_entry_idx, start, end; + + if (tbl->act_mems[i].act_mem == ICE_ACL_ACT_PAIR_MEM_INVAL) + continue; + + start = tbl->first_entry; + end = tbl->last_entry; + + for (act_entry_idx = start; act_entry_idx <= end; + act_entry_idx++) { + /* Invalidate all allocated action pairs */ + status = ice_aq_program_actpair(hw, i, act_entry_idx, + &act_buf, NULL); + if (status) + return status; + } + } + + return status; +} + +/** + * ice_acl_assign_act_mems_to_tcam + * @tbl: pointer to ACL table structure + * @cur_tcam: Index of current TCAM. Value = 0 to (ICE_AQC_ACL_SLICES - 1) + * @cur_mem_idx: Index of current action memory bank. Value = 0 to + * (ICE_AQC_MAX_ACTION_MEMORIES - 1) + * @num_mem: Number of action memory banks for this TCAM + * + * Assign "num_mem" valid action memory banks from "curr_mem_idx" to + * "curr_tcam" TCAM. + */ +static void +ice_acl_assign_act_mems_to_tcam(struct ice_acl_tbl *tbl, u8 cur_tcam, + u8 *cur_mem_idx, u8 num_mem) +{ + u8 mem_cnt; + + for (mem_cnt = 0; + *cur_mem_idx < ICE_AQC_MAX_ACTION_MEMORIES && mem_cnt < num_mem; + (*cur_mem_idx)++) { + struct ice_acl_act_mem *p_mem = &tbl->act_mems[*cur_mem_idx]; + + if (p_mem->act_mem == ICE_ACL_ACT_PAIR_MEM_INVAL) + continue; + + p_mem->member_of_tcam = cur_tcam; + + mem_cnt++; + } +} + +/** + * ice_acl_divide_act_mems_to_tcams + * @tbl: pointer to ACL table structure + * + * Figure out how to divide given action memory banks to given TCAMs. This + * division is for SW book keeping. In the time when scenario is created, + * an action memory bank can be used for different TCAM. + * + * For example, given that we have 2x2 ACL table with each table entry has + * 2 action memory pairs. As the result, we will have 4 TCAMs (T1,T2,T3,T4) + * and 4 action memory banks (A1,A2,A3,A4) + * [T1 - T2] { A1 - A2 } + * [T3 - T4] { A3 - A4 } + * In the time when we need to create a scenario, for example, 2x1 scenario, + * we will use [T3,T4] in a cascaded layout. As it is a requirement that all + * action memory banks in a cascaded TCAM's row will need to associate with + * the last TCAM. Thus, we will associate action memory banks [A3] and [A4] + * for TCAM [T4]. + * For SW book-keeping purpose, we will keep theoretical maps between TCAM + * [Tn] to action memory bank [An]. 
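The fair division described above gives every TCAM in a cascade row the integer share of action memory banks and hands one extra bank to the leftmost TCAMs until the remainder runs out, which is how 7 banks across 3 TCAMs becomes 3/2/2. A minimal sketch of that bookkeeping (plain integers only; the real code walks tbl->act_mems[] and skips invalid banks):

#include <stdio.h>

int main(void)
{
	unsigned int act_pairs_per_entry = 7;	/* banks per table entry */
	unsigned int num_cscd = 3;		/* TCAMs cascaded per row */
	unsigned int min_mem = act_pairs_per_entry / num_cscd;	/* 2 */
	unsigned int extra   = act_pairs_per_entry % num_cscd;	/* 1 */
	unsigned int i;

	for (i = 0; i < num_cscd; i++) {
		unsigned int banks = min_mem + (i < extra ? 1 : 0);

		/* prints 3, 2, 2 - matching the comment's example */
		printf("TCAM %u gets %u action memory banks\n", i, banks);
	}
	return 0;
}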
+ */ +static void ice_acl_divide_act_mems_to_tcams(struct ice_acl_tbl *tbl) +{ + u16 num_cscd, stack_level, stack_idx, min_act_mem; + u8 tcam_idx = tbl->first_tcam; + u16 max_idx_to_get_extra; + u8 mem_idx = 0; + + /* Determine number of stacked TCAMs */ + stack_level = DIV_ROUND_UP(tbl->info.depth, ICE_AQC_ACL_TCAM_DEPTH); + + /* Determine number of cascaded TCAMs */ + num_cscd = DIV_ROUND_UP(tbl->info.width, ICE_AQC_ACL_KEY_WIDTH_BYTES); + + /* In a line of cascaded TCAM, given the number of action memory + * banks per ACL table entry, we want to fairly divide these action + * memory banks between these TCAMs. + * + * For example, there are 3 TCAMs (TCAM 3,4,5) in a line of + * cascaded TCAM, and there are 7 act_mems for each ACL table entry. + * The result is: + * [TCAM_3 will have 3 act_mems] + * [TCAM_4 will have 2 act_mems] + * [TCAM_5 will have 2 act_mems] + */ + min_act_mem = tbl->info.entry_act_pairs / num_cscd; + max_idx_to_get_extra = tbl->info.entry_act_pairs % num_cscd; + + for (stack_idx = 0; stack_idx < stack_level; stack_idx++) { + u16 i; + + for (i = 0; i < num_cscd; i++) { + u8 total_act_mem = min_act_mem; + + if (i < max_idx_to_get_extra) + total_act_mem++; + + ice_acl_assign_act_mems_to_tcam(tbl, tcam_idx, + &mem_idx, + total_act_mem); + + tcam_idx++; + } + } +} + +/** + * ice_acl_create_tbl + * @hw: pointer to the HW struct + * @params: parameters for the table to be created + * + * Create a LEM table for ACL usage. We are currently starting with some fixed + * values for the size of the table, but this will need to grow as more flow + * entries are added by the user level. + */ +enum ice_status +ice_acl_create_tbl(struct ice_hw *hw, struct ice_acl_tbl_params *params) +{ + u16 width, depth, first_e, last_e, i; + struct ice_aqc_acl_generic *resp_buf; + struct ice_acl_alloc_tbl tbl_alloc; + struct ice_acl_tbl *tbl; + enum ice_status status; + + if (hw->acl_tbl) + return ICE_ERR_ALREADY_EXISTS; + + if (!params) + return ICE_ERR_PARAM; + + /* round up the width to the next TCAM width boundary. */ + width = roundup(params->width, (u16)ICE_AQC_ACL_KEY_WIDTH_BYTES); + /* depth should be provided in chunk (64 entry) increments */ + depth = ALIGN(params->depth, ICE_ACL_ENTRY_ALLOC_UNIT); + + if (params->entry_act_pairs < width / ICE_AQC_ACL_KEY_WIDTH_BYTES) { + params->entry_act_pairs = width / ICE_AQC_ACL_KEY_WIDTH_BYTES; + + if (params->entry_act_pairs > ICE_AQC_TBL_MAX_ACTION_PAIRS) + params->entry_act_pairs = ICE_AQC_TBL_MAX_ACTION_PAIRS; + } + + /* Validate that width*depth will not exceed the TCAM limit */ + if ((DIV_ROUND_UP(depth, ICE_AQC_ACL_TCAM_DEPTH) * + (width / ICE_AQC_ACL_KEY_WIDTH_BYTES)) > ICE_AQC_ACL_SLICES) + return ICE_ERR_MAX_LIMIT; + + memset(&tbl_alloc, 0, sizeof(tbl_alloc)); + tbl_alloc.width = width; + tbl_alloc.depth = depth; + tbl_alloc.act_pairs_per_entry = params->entry_act_pairs; + tbl_alloc.concurr = params->concurr; + /* Set dependent_alloc_id only for concurrent table type */ + if (params->concurr) { + tbl_alloc.num_dependent_alloc_ids = + ICE_AQC_MAX_CONCURRENT_ACL_TBL; + + for (i = 0; i < ICE_AQC_MAX_CONCURRENT_ACL_TBL; i++) + tbl_alloc.buf.data_buf.alloc_ids[i] = + cpu_to_le16(params->dep_tbls[i]); + } + + /* call the AQ command to create the ACL table with these values */ + status = ice_aq_alloc_acl_tbl(hw, &tbl_alloc, NULL); + if (status) { + if (le16_to_cpu(tbl_alloc.buf.resp_buf.alloc_id) < + ICE_AQC_ALLOC_ID_LESS_THAN_4K) + ice_debug(hw, ICE_DBG_ACL, "Alloc ACL table failed. 
Unavailable resource.\n"); + else + ice_debug(hw, ICE_DBG_ACL, "AQ allocation of ACL failed with error. status: %d\n", + status); + return status; + } + + tbl = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*tbl), GFP_KERNEL); + if (!tbl) { + status = ICE_ERR_NO_MEMORY; + + goto out; + } + + resp_buf = &tbl_alloc.buf.resp_buf; + + /* Retrieve information of the allocated table */ + tbl->id = le16_to_cpu(resp_buf->alloc_id); + tbl->first_tcam = resp_buf->ops.table.first_tcam; + tbl->last_tcam = resp_buf->ops.table.last_tcam; + tbl->first_entry = le16_to_cpu(resp_buf->first_entry); + tbl->last_entry = le16_to_cpu(resp_buf->last_entry); + + tbl->info = *params; + tbl->info.width = width; + tbl->info.depth = depth; + hw->acl_tbl = tbl; + + for (i = 0; i < ICE_AQC_MAX_ACTION_MEMORIES; i++) + tbl->act_mems[i].act_mem = resp_buf->act_mem[i]; + + /* Figure out which TCAMs that these newly allocated action memories + * belong to. + */ + ice_acl_divide_act_mems_to_tcams(tbl); + + /* Initialize the resources allocated by invalidating all TCAM entries + * and all the action pairs + */ + status = ice_acl_init_tbl(hw); + if (status) { + devm_kfree(ice_hw_to_dev(hw), tbl); + hw->acl_tbl = NULL; + ice_debug(hw, ICE_DBG_ACL, "Initialization of TCAM entries failed. status: %d\n", + status); + goto out; + } + + first_e = (tbl->first_tcam * ICE_AQC_MAX_TCAM_ALLOC_UNITS) + + (tbl->first_entry / ICE_ACL_ENTRY_ALLOC_UNIT); + last_e = (tbl->last_tcam * ICE_AQC_MAX_TCAM_ALLOC_UNITS) + + (tbl->last_entry / ICE_ACL_ENTRY_ALLOC_UNIT); + + /* Indicate available entries in the table */ + bitmap_set(tbl->avail, first_e, last_e - first_e + 1); + + INIT_LIST_HEAD(&tbl->scens); +out: + + return status; +} + +/** + * ice_acl_alloc_partition - Allocate a partition from the ACL table + * @hw: pointer to the hardware structure + * @req: info of partition being allocated + */ +static enum ice_status +ice_acl_alloc_partition(struct ice_hw *hw, struct ice_acl_scen *req) +{ + u16 start = 0, cnt = 0, off = 0; + u16 width, r_entries, row; + bool done = false; + int dir; + + /* Determine the number of TCAMs each entry overlaps */ + width = DIV_ROUND_UP(req->width, ICE_AQC_ACL_KEY_WIDTH_BYTES); + + /* Check if we have enough TCAMs to accommodate the width */ + if (width > hw->acl_tbl->last_tcam - hw->acl_tbl->first_tcam + 1) + return ICE_ERR_MAX_LIMIT; + + /* Number of entries must be multiple of ICE_ACL_ENTRY_ALLOC_UNIT's */ + r_entries = ALIGN(req->num_entry, ICE_ACL_ENTRY_ALLOC_UNIT); + + /* To look for an available partition that can accommodate the request, + * the process first logically arranges available TCAMs in rows such + * that each row produces entries with the requested width. It then + * scans the TCAMs' available bitmap, one bit at a time, and + * accumulates contiguous available 64-entry chunks until there are + * enough of them or when all TCAM configurations have been checked. + * + * For width of 1 TCAM, the scanning process starts from the top most + * TCAM, and goes downward. Available bitmaps are examined from LSB + * to MSB. + * + * For width of multiple TCAMs, the process starts from the bottom-most + * row of TCAMs, and goes upward. Available bitmaps are examined from + * the MSB to the LSB. + * + * To make sure that adjacent TCAMs can be logically arranged in the + * same row, the scanning process may have multiple passes. In each + * pass, the first TCAM of the bottom-most row is displaced by one + * additional TCAM. The width of the row and the number of the TCAMs + * available determine the number of passes. 
When the displacement is + * more than the size of width, the TCAM row configurations will + * repeat. The process will terminate when the configurations repeat. + * + * Available partitions can span more than one row of TCAMs. + */ + if (width == 1) { + row = hw->acl_tbl->first_tcam; + dir = 1; + } else { + /* Start with the bottom-most row, and scan for available + * entries upward + */ + row = hw->acl_tbl->last_tcam + 1 - width; + dir = -1; + } + + do { + u16 i; + + /* Scan all 64-entry chunks, one chunk at a time, in the + * current TCAM row + */ + for (i = 0; + i < ICE_AQC_MAX_TCAM_ALLOC_UNITS && cnt < r_entries; + i++) { + bool avail = true; + u16 w, p; + + /* Compute the cumulative available mask across the + * TCAM row to determine if the current 64-entry chunk + * is available. + */ + p = dir > 0 ? i : ICE_AQC_MAX_TCAM_ALLOC_UNITS - i - 1; + for (w = row; w < row + width && avail; w++) { + u16 b; + + b = (w * ICE_AQC_MAX_TCAM_ALLOC_UNITS) + p; + avail &= test_bit(b, hw->acl_tbl->avail); + } + + if (!avail) { + cnt = 0; + } else { + /* Compute the starting index of the newly + * found partition. When 'dir' is negative, the + * scan processes is going upward. If so, the + * starting index needs to be updated for every + * available 64-entry chunk found. + */ + if (!cnt || dir < 0) + start = (row * ICE_AQC_ACL_TCAM_DEPTH) + + (p * ICE_ACL_ENTRY_ALLOC_UNIT); + cnt += ICE_ACL_ENTRY_ALLOC_UNIT; + } + } + + if (cnt >= r_entries) { + req->start = start; + req->num_entry = r_entries; + req->end = ice_acl_tbl_calc_end_idx(start, r_entries, + width); + break; + } + + row = dir > 0 ? row + width : row - width; + if (row > hw->acl_tbl->last_tcam || + row < hw->acl_tbl->first_tcam) { + /* All rows have been checked. Increment 'off' that + * will help yield a different TCAM configuration in + * which adjacent TCAMs can be alternatively in the + * same row. + */ + off++; + + /* However, if the new 'off' value yields previously + * checked configurations, then exit. + */ + if (off >= width) + done = true; + else + row = dir > 0 ? off : + hw->acl_tbl->last_tcam + 1 - off - + width; + } + } while (!done); + + return cnt >= r_entries ? ICE_SUCCESS : ICE_ERR_MAX_LIMIT; +} + +/** + * ice_acl_fill_tcam_select + * @scen_buf: Pointer to the scenario buffer that needs to be populated + * @scen: Pointer to the available space for the scenario + * @tcam_idx: Index of the TCAM used for this scenario + * @tcam_idx_in_cascade : Local index of the TCAM in the cascade scenario + * + * For all TCAM that participate in this scenario, fill out the tcam_select + * value. + */ +static void +ice_acl_fill_tcam_select(struct ice_aqc_acl_scen *scen_buf, + struct ice_acl_scen *scen, u16 tcam_idx, + u16 tcam_idx_in_cascade) +{ + u16 cascade_cnt, idx; + u8 j; + + idx = tcam_idx_in_cascade * ICE_AQC_ACL_KEY_WIDTH_BYTES; + cascade_cnt = DIV_ROUND_UP(scen->width, ICE_AQC_ACL_KEY_WIDTH_BYTES); + + /* For each scenario, we reserved last three bytes of scenario width for + * profile ID, range checker, and packet direction. Thus, the last three + * bytes of the last cascaded TCAMs will have value of 1st, 31st and + * 32nd byte location of BYTE selection base. 
+ * + * For other bytes in the TCAMs: + * For non-cascade mode (1 TCAM wide) scenario, TCAM[x]'s Select {0-1} + * select indices 0-1 of the Byte Selection Base + * For cascade mode, the leftmost TCAM of the first cascade row selects + * indices 0-4 of the Byte Selection Base; the second TCAM in the + * cascade row selects indices starting with 5-n + */ + for (j = 0; j < ICE_AQC_ACL_KEY_WIDTH_BYTES; j++) { + /* PKT DIR uses the 1st location of Byte Selection Base: + 1 */ + u8 val = ICE_AQC_ACL_BYTE_SEL_BASE + 1 + idx; + + if (tcam_idx_in_cascade == cascade_cnt - 1) { + if (j == ICE_ACL_SCEN_RNG_CHK_IDX_IN_TCAM) + val = ICE_AQC_ACL_BYTE_SEL_BASE_RNG_CHK; + else if (j == ICE_ACL_SCEN_PID_IDX_IN_TCAM) + val = ICE_AQC_ACL_BYTE_SEL_BASE_PID; + else if (j == ICE_ACL_SCEN_PKT_DIR_IDX_IN_TCAM) + val = ICE_AQC_ACL_BYTE_SEL_BASE_PKT_DIR; + } + + /* In case that scenario's width is greater than the width of + * the Byte selection base, we will not assign a value to the + * tcam_select[j]. As a result, the tcam_select[j] will have + * default value which is zero. + */ + if (val > ICE_AQC_ACL_BYTE_SEL_BASE_RNG_CHK) + continue; + + scen_buf->tcam_cfg[tcam_idx].tcam_select[j] = val; + + idx++; + } +} + +/** + * ice_acl_set_scen_chnk_msk + * @scen_buf: Pointer to the scenario buffer that needs to be populated + * @scen: pointer to the available space for the scenario + * + * Set the chunk mask for the entries that will be used by this scenario + */ +static void +ice_acl_set_scen_chnk_msk(struct ice_aqc_acl_scen *scen_buf, + struct ice_acl_scen *scen) +{ + u16 tcam_idx, num_cscd, units, cnt; + u8 chnk_offst; + + /* Determine the starting TCAM index and offset of the start entry */ + tcam_idx = ICE_ACL_TBL_TCAM_IDX(scen->start); + chnk_offst = (u8)((scen->start % ICE_AQC_ACL_TCAM_DEPTH) / + ICE_ACL_ENTRY_ALLOC_UNIT); + + /* Entries are allocated and tracked in multiple of 64's */ + units = scen->num_entry / ICE_ACL_ENTRY_ALLOC_UNIT; + + /* Determine number of cascaded TCAMs */ + num_cscd = scen->width / ICE_AQC_ACL_KEY_WIDTH_BYTES; + + for (cnt = 0; cnt < units; cnt++) { + u16 i; + + /* Set the corresponding bitmap of individual 64-entry + * chunk spans across a cascade of 1 or more TCAMs + * For each TCAM, there will be (ICE_AQC_ACL_TCAM_DEPTH + * / ICE_ACL_ENTRY_ALLOC_UNIT) or 8 chunks. + */ + for (i = tcam_idx; i < tcam_idx + num_cscd; i++) + scen_buf->tcam_cfg[i].chnk_msk |= BIT(chnk_offst); + + chnk_offst = (chnk_offst + 1) % ICE_AQC_MAX_TCAM_ALLOC_UNITS; + if (!chnk_offst) + tcam_idx += num_cscd; + } +} + +/** + * ice_acl_assign_act_mem_for_scen + * @tbl: pointer to ACL table structure + * @scen: pointer to the scenario struct + * @scen_buf: pointer to the available space for the scenario + * @current_tcam_idx: theoretical index of the TCAM that we associated those + * action memory banks with, at the table creation time. + * @target_tcam_idx: index of the TCAM that we want to associate those action + * memory banks with. 
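Both the allocation scan in ice_acl_alloc_partition() above and ice_acl_commit_partition() below track free space in 64-entry chunks: bit (tcam * chunks_per_tcam + chunk) of the table's 'avail' bitmap stands for one such chunk of one TCAM. A small worked example of that mapping, assuming the 512-entry TCAM depth and 8 chunks per TCAM described in this file's comments:

#include <stdio.h>

#define ACL_ENTRY_ALLOC_UNIT	64	/* entries are tracked in 64s */
#define CHUNKS_PER_TCAM		8	/* assumed: 512-entry TCAM / 64 */
#define TCAM_DEPTH		512	/* assumed, per the comments above */

int main(void)
{
	unsigned int entry = 1000;	/* absolute entry index in the table */

	unsigned int tcam  = entry / TCAM_DEPTH;			   /* 1 */
	unsigned int chunk = (entry % TCAM_DEPTH) / ACL_ENTRY_ALLOC_UNIT;  /* 7 */
	unsigned int bit   = tcam * CHUNKS_PER_TCAM + chunk;		   /* 15 */

	/* bit 15 of the 'avail' bitmap covers entries 960..1023 (TCAM 1) */
	printf("entry %u -> avail bit %u\n", entry, bit);
	return 0;
}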
+ */ +static void +ice_acl_assign_act_mem_for_scen(struct ice_acl_tbl *tbl, + struct ice_acl_scen *scen, + struct ice_aqc_acl_scen *scen_buf, + u8 current_tcam_idx, u8 target_tcam_idx) +{ + u8 i; + + for (i = 0; i < ICE_AQC_MAX_ACTION_MEMORIES; i++) { + struct ice_acl_act_mem *p_mem = &tbl->act_mems[i]; + + if (p_mem->act_mem == ICE_ACL_ACT_PAIR_MEM_INVAL || + p_mem->member_of_tcam != current_tcam_idx) + continue; + + scen_buf->act_mem_cfg[i] = target_tcam_idx; + scen_buf->act_mem_cfg[i] |= ICE_AQC_ACL_SCE_ACT_MEM_EN; + set_bit(i, scen->act_mem_bitmap); + } +} + +/** + * ice_acl_commit_partition - Indicate if the specified partition is active + * @hw: pointer to the hardware structure + * @scen: pointer to the scenario struct + * @commit: true if the partition is being commit + */ +static void +ice_acl_commit_partition(struct ice_hw *hw, struct ice_acl_scen *scen, + bool commit) +{ + u16 tcam_idx, off, num_cscd, units, cnt; + + /* Determine the starting TCAM index and offset of the start entry */ + tcam_idx = ICE_ACL_TBL_TCAM_IDX(scen->start); + off = (scen->start % ICE_AQC_ACL_TCAM_DEPTH) / + ICE_ACL_ENTRY_ALLOC_UNIT; + + /* Entries are allocated and tracked in multiple of 64's */ + units = scen->num_entry / ICE_ACL_ENTRY_ALLOC_UNIT; + + /* Determine number of cascaded TCAM */ + num_cscd = scen->width / ICE_AQC_ACL_KEY_WIDTH_BYTES; + + for (cnt = 0; cnt < units; cnt++) { + u16 w; + + /* Set/clear the corresponding bitmap of individual 64-entry + * chunk spans across a row of 1 or more TCAMs + */ + for (w = 0; w < num_cscd; w++) { + u16 b; + + b = ((tcam_idx + w) * ICE_AQC_MAX_TCAM_ALLOC_UNITS) + + off; + if (commit) + set_bit(b, hw->acl_tbl->avail); + else + clear_bit(b, hw->acl_tbl->avail); + } + + off = (off + 1) % ICE_AQC_MAX_TCAM_ALLOC_UNITS; + if (!off) + tcam_idx += num_cscd; + } +} + +/** + * ice_acl_create_scen + * @hw: pointer to the hardware structure + * @match_width: number of bytes to be matched in this scenario + * @num_entries: number of entries to be allocated for the scenario + * @scen_id: holds returned scenario ID if successful + */ +enum ice_status +ice_acl_create_scen(struct ice_hw *hw, u16 match_width, u16 num_entries, + u16 *scen_id) +{ + u8 cascade_cnt, first_tcam, last_tcam, i, k; + struct ice_aqc_acl_scen scen_buf; + struct ice_acl_scen *scen; + enum ice_status status; + + if (!hw->acl_tbl) + return ICE_ERR_DOES_NOT_EXIST; + + scen = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*scen), GFP_KERNEL); + if (!scen) + return ICE_ERR_NO_MEMORY; + + scen->start = hw->acl_tbl->first_entry; + scen->width = ICE_AQC_ACL_KEY_WIDTH_BYTES * + DIV_ROUND_UP(match_width, ICE_AQC_ACL_KEY_WIDTH_BYTES); + scen->num_entry = num_entries; + + status = ice_acl_alloc_partition(hw, scen); + if (status) + goto out; + + memset(&scen_buf, 0, sizeof(scen_buf)); + + /* Determine the number of cascade TCAMs, given the scenario's width */ + cascade_cnt = DIV_ROUND_UP(scen->width, ICE_AQC_ACL_KEY_WIDTH_BYTES); + first_tcam = ICE_ACL_TBL_TCAM_IDX(scen->start); + last_tcam = ICE_ACL_TBL_TCAM_IDX(scen->end); + + /* For each scenario, we reserved last three bytes of scenario width for + * packet direction flag, profile ID and range checker. Thus, we want to + * return back to the caller the eff_width, pkt_dir_idx, rng_chk_idx and + * pid_idx. 
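For a two-TCAM cascade, the reserved-byte bookkeeping computed just below works out as follows: 2 * 5 - 3 = 7 bytes stay matchable, and the packet-direction, profile-ID and range-checker bytes land at scenario offsets 7, 8 and 9 in the last TCAM of the row. A quick standalone re-derivation (the 5-byte key width and the in-TCAM indices 2/3/4 are taken from this file; all names are local to the sketch):

#include <stdio.h>

#define KEY_WIDTH_BYTES		5
#define SCEN_MIN_WIDTH		3
#define PKT_DIR_IDX_IN_TCAM	2
#define PID_IDX_IN_TCAM		3
#define RNG_CHK_IDX_IN_TCAM	4

int main(void)
{
	unsigned int cascade_cnt = 2;
	unsigned int base = (cascade_cnt - 1) * KEY_WIDTH_BYTES;

	printf("eff_width   = %u\n",
	       cascade_cnt * KEY_WIDTH_BYTES - SCEN_MIN_WIDTH);	/* 7 */
	printf("pkt_dir_idx = %u\n", base + PKT_DIR_IDX_IN_TCAM);	/* 7 */
	printf("pid_idx     = %u\n", base + PID_IDX_IN_TCAM);		/* 8 */
	printf("rng_chk_idx = %u\n", base + RNG_CHK_IDX_IN_TCAM);	/* 9 */
	return 0;
}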
+ */ + scen->eff_width = cascade_cnt * ICE_AQC_ACL_KEY_WIDTH_BYTES - + ICE_ACL_SCEN_MIN_WIDTH; + scen->rng_chk_idx = (cascade_cnt - 1) * ICE_AQC_ACL_KEY_WIDTH_BYTES + + ICE_ACL_SCEN_RNG_CHK_IDX_IN_TCAM; + scen->pid_idx = (cascade_cnt - 1) * ICE_AQC_ACL_KEY_WIDTH_BYTES + + ICE_ACL_SCEN_PID_IDX_IN_TCAM; + scen->pkt_dir_idx = (cascade_cnt - 1) * ICE_AQC_ACL_KEY_WIDTH_BYTES + + ICE_ACL_SCEN_PKT_DIR_IDX_IN_TCAM; + + /* set the chunk mask for the tcams */ + ice_acl_set_scen_chnk_msk(&scen_buf, scen); + + /* set the TCAM select and start_cmp and start_set bits */ + k = first_tcam; + /* set the START_SET bit at the beginning of the stack */ + scen_buf.tcam_cfg[k].start_cmp_set |= ICE_AQC_ACL_ALLOC_SCE_START_SET; + while (k <= last_tcam) { + u8 last_tcam_idx_cascade = cascade_cnt + k - 1; + + /* set start_cmp for the first cascaded TCAM */ + scen_buf.tcam_cfg[k].start_cmp_set |= + ICE_AQC_ACL_ALLOC_SCE_START_CMP; + + /* cascade TCAMs up to the width of the scenario */ + for (i = k; i < cascade_cnt + k; i++) { + ice_acl_fill_tcam_select(&scen_buf, scen, i, i - k); + ice_acl_assign_act_mem_for_scen(hw->acl_tbl, scen, + &scen_buf, + i, + last_tcam_idx_cascade); + } + + k = i; + } + + /* We need to set the start_cmp bit for the unused TCAMs. */ + i = 0; + while (i < first_tcam) + scen_buf.tcam_cfg[i++].start_cmp_set = + ICE_AQC_ACL_ALLOC_SCE_START_CMP; + + i = last_tcam + 1; + while (i < ICE_AQC_ACL_SLICES) + scen_buf.tcam_cfg[i++].start_cmp_set = + ICE_AQC_ACL_ALLOC_SCE_START_CMP; + + status = ice_aq_alloc_acl_scen(hw, scen_id, &scen_buf, NULL); + if (status) { + ice_debug(hw, ICE_DBG_ACL, "AQ allocation of ACL scenario failed. status: %d\n", + status); + goto out; + } + + scen->id = *scen_id; + ice_acl_commit_partition(hw, scen, false); + ice_acl_init_entry(scen); + list_add(&scen->list_entry, &hw->acl_tbl->scens); + +out: + if (status) + devm_kfree(ice_hw_to_dev(hw), scen); + + return status; +} + +/** + * ice_acl_destroy_scen - Destroy an ACL scenario + * @hw: pointer to the HW struct + * @scen_id: ID of the remove scenario + */ +static enum ice_status ice_acl_destroy_scen(struct ice_hw *hw, u16 scen_id) +{ + struct ice_acl_scen *scen, *tmp_scen; + struct ice_flow_prof *p, *tmp; + enum ice_status status; + + if (!hw->acl_tbl) + return ICE_ERR_DOES_NOT_EXIST; + + /* Remove profiles that use "scen_id" scenario */ + list_for_each_entry_safe(p, tmp, &hw->fl_profs[ICE_BLK_ACL], l_entry) + if (p->cfg.scen && p->cfg.scen->id == scen_id) { + status = ice_flow_rem_prof(hw, ICE_BLK_ACL, p->id); + if (status) { + ice_debug(hw, ICE_DBG_ACL, "ice_flow_rem_prof failed. status: %d\n", + status); + return status; + } + } + + /* Call the AQ command to destroy the targeted scenario */ + status = ice_aq_dealloc_acl_scen(hw, scen_id, NULL); + if (status) { + ice_debug(hw, ICE_DBG_ACL, "AQ de-allocation of scenario failed. 
status: %d\n", + status); + return status; + } + + /* Remove scenario from hw->acl_tbl->scens */ + list_for_each_entry_safe(scen, tmp_scen, &hw->acl_tbl->scens, + list_entry) + if (scen->id == scen_id) { + list_del(&scen->list_entry); + devm_kfree(ice_hw_to_dev(hw), scen); + } + + return 0; +} + +/** + * ice_acl_destroy_tbl - Destroy a previously created LEM table for ACL + * @hw: pointer to the HW struct + */ +enum ice_status ice_acl_destroy_tbl(struct ice_hw *hw) +{ + struct ice_acl_scen *pos_scen, *tmp_scen; + struct ice_aqc_acl_generic resp_buf; + struct ice_aqc_acl_scen buf; + enum ice_status status; + u8 i; + + if (!hw->acl_tbl) + return ICE_ERR_DOES_NOT_EXIST; + + /* Mark all the created scenario's TCAM to stop the packet lookup and + * delete them afterward + */ + list_for_each_entry_safe(pos_scen, tmp_scen, &hw->acl_tbl->scens, + list_entry) { + status = ice_aq_query_acl_scen(hw, pos_scen->id, &buf, NULL); + if (status) { + ice_debug(hw, ICE_DBG_ACL, "ice_aq_query_acl_scen() failed. status: %d\n", + status); + return status; + } + + for (i = 0; i < ICE_AQC_ACL_SLICES; i++) { + buf.tcam_cfg[i].chnk_msk = 0; + buf.tcam_cfg[i].start_cmp_set = + ICE_AQC_ACL_ALLOC_SCE_START_CMP; + } + + for (i = 0; i < ICE_AQC_MAX_ACTION_MEMORIES; i++) + buf.act_mem_cfg[i] = 0; + + status = ice_aq_update_acl_scen(hw, pos_scen->id, &buf, NULL); + if (status) { + ice_debug(hw, ICE_DBG_ACL, "ice_aq_update_acl_scen() failed. status: %d\n", + status); + return status; + } + + status = ice_acl_destroy_scen(hw, pos_scen->id); + if (status) { + ice_debug(hw, ICE_DBG_ACL, "deletion of scenario failed. status: %d\n", + status); + return status; + } + } + + /* call the AQ command to destroy the ACL table */ + status = ice_aq_dealloc_acl_tbl(hw, hw->acl_tbl->id, &resp_buf, NULL); + if (status) { + ice_debug(hw, ICE_DBG_ACL, "AQ de-allocation of ACL failed. status: %d\n", + status); + return status; + } + + devm_kfree(ice_hw_to_dev(hw), hw->acl_tbl); + hw->acl_tbl = NULL; + + return 0; +} + +/** + * ice_acl_add_entry - Add a flow entry to an ACL scenario + * @hw: pointer to the HW struct + * @scen: scenario to add the entry to + * @prio: priority level of the entry being added + * @keys: buffer of the value of the key to be programmed to the ACL entry + * @inverts: buffer of the value of the key inverts to be programmed + * @acts: pointer to a buffer containing formatted actions + * @acts_cnt: indicates the number of actions stored in "acts" + * @entry_idx: returned scenario relative index of the added flow entry + * + * Given an ACL table and a scenario, to add the specified key and key invert + * to an available entry in the specified scenario. 
+ * The "keys" and "inverts" buffers must be of the size which is the same as + * the scenario's width + */ +enum ice_status +ice_acl_add_entry(struct ice_hw *hw, struct ice_acl_scen *scen, + enum ice_acl_entry_prio prio, u8 *keys, u8 *inverts, + struct ice_acl_act_entry *acts, u8 acts_cnt, u16 *entry_idx) +{ + u8 i, entry_tcam, num_cscd, offset; + struct ice_aqc_acl_data buf; + enum ice_status status = 0; + u16 idx; + + if (!scen) + return ICE_ERR_DOES_NOT_EXIST; + + *entry_idx = ice_acl_scen_assign_entry_idx(scen, prio); + if (*entry_idx >= scen->num_entry) { + *entry_idx = 0; + return ICE_ERR_MAX_LIMIT; + } + + /* Determine number of cascaded TCAMs */ + num_cscd = DIV_ROUND_UP(scen->width, ICE_AQC_ACL_KEY_WIDTH_BYTES); + + entry_tcam = ICE_ACL_TBL_TCAM_IDX(scen->start); + idx = ICE_ACL_TBL_TCAM_ENTRY_IDX(scen->start + *entry_idx); + + memset(&buf, 0, sizeof(buf)); + for (i = 0; i < num_cscd; i++) { + /* If the key spans more than one TCAM in the case of cascaded + * TCAMs, the key and key inverts need to be properly split + * among TCAMs.E.g.bytes 0 - 4 go to an index in the first TCAM + * and bytes 5 - 9 go to the same index in the next TCAM, etc. + * If the entry spans more than one TCAM in a cascaded TCAM + * mode, the programming of the entries in the TCAMs must be in + * reversed order - the TCAM entry of the rightmost TCAM should + * be programmed first; the TCAM entry of the leftmost TCAM + * should be programmed last. + */ + offset = num_cscd - i - 1; + memcpy(&buf.entry_key.val, + &keys[offset * sizeof(buf.entry_key.val)], + sizeof(buf.entry_key.val)); + memcpy(&buf.entry_key_invert.val, + &inverts[offset * sizeof(buf.entry_key_invert.val)], + sizeof(buf.entry_key_invert.val)); + status = ice_aq_program_acl_entry(hw, entry_tcam + offset, idx, + &buf, NULL); + if (status) { + ice_debug(hw, ICE_DBG_ACL, "aq program acl entry failed status: %d\n", + status); + goto out; + } + } + + /* Program the action memory */ + status = ice_acl_prog_act(hw, scen, acts, acts_cnt, *entry_idx); + +out: + if (status) { + ice_acl_rem_entry(hw, scen, *entry_idx); + *entry_idx = 0; + } + + return status; +} + +/** + * ice_acl_prog_act - Program a scenario's action memory + * @hw: pointer to the HW struct + * @scen: scenario to add the entry to + * @acts: pointer to a buffer containing formatted actions + * @acts_cnt: indicates the number of actions stored in "acts" + * @entry_idx: scenario relative index of the added flow entry + * + * Program a scenario's action memory + */ +enum ice_status +ice_acl_prog_act(struct ice_hw *hw, struct ice_acl_scen *scen, + struct ice_acl_act_entry *acts, u8 acts_cnt, + u16 entry_idx) +{ + u8 entry_tcam, num_cscd, i, actx_idx = 0; + struct ice_aqc_actpair act_buf; + enum ice_status status = 0; + u16 idx; + + if (entry_idx >= scen->num_entry) + return ICE_ERR_MAX_LIMIT; + + memset(&act_buf, 0, sizeof(act_buf)); + + /* Determine number of cascaded TCAMs */ + num_cscd = DIV_ROUND_UP(scen->width, ICE_AQC_ACL_KEY_WIDTH_BYTES); + + entry_tcam = ICE_ACL_TBL_TCAM_IDX(scen->start); + idx = ICE_ACL_TBL_TCAM_ENTRY_IDX(scen->start + entry_idx); + + for_each_set_bit(i, scen->act_mem_bitmap, ICE_AQC_MAX_ACTION_MEMORIES) { + struct ice_acl_act_mem *mem = &hw->acl_tbl->act_mems[i]; + + if (actx_idx >= acts_cnt) + break; + if (mem->member_of_tcam >= entry_tcam && + mem->member_of_tcam < entry_tcam + num_cscd) { + memcpy(&act_buf.act[0], &acts[actx_idx], + sizeof(struct ice_acl_act_entry)); + + if (++actx_idx < acts_cnt) { + memcpy(&act_buf.act[1], &acts[actx_idx], + sizeof(struct 
ice_acl_act_entry)); + } + + status = ice_aq_program_actpair(hw, i, idx, &act_buf, + NULL); + if (status) { + ice_debug(hw, ICE_DBG_ACL, "program actpair failed status: %d\n", + status); + break; + } + actx_idx++; + } + } + + if (!status && actx_idx < acts_cnt) + status = ICE_ERR_MAX_LIMIT; + + return status; +} + +/** + * ice_acl_rem_entry - Remove a flow entry from an ACL scenario + * @hw: pointer to the HW struct + * @scen: scenario to remove the entry from + * @entry_idx: the scenario-relative index of the flow entry being removed + */ +enum ice_status +ice_acl_rem_entry(struct ice_hw *hw, struct ice_acl_scen *scen, u16 entry_idx) +{ + struct ice_aqc_actpair act_buf; + struct ice_aqc_acl_data buf; + u8 entry_tcam, num_cscd, i; + enum ice_status status = 0; + u16 idx; + + if (!scen) + return ICE_ERR_DOES_NOT_EXIST; + + if (entry_idx >= scen->num_entry) + return ICE_ERR_MAX_LIMIT; + + if (!test_bit(entry_idx, scen->entry_bitmap)) + return ICE_ERR_DOES_NOT_EXIST; + + /* Determine number of cascaded TCAMs */ + num_cscd = DIV_ROUND_UP(scen->width, ICE_AQC_ACL_KEY_WIDTH_BYTES); + + entry_tcam = ICE_ACL_TBL_TCAM_IDX(scen->start); + idx = ICE_ACL_TBL_TCAM_ENTRY_IDX(scen->start + entry_idx); + + /* invalidate the flow entry */ + memset(&buf, 0, sizeof(buf)); + for (i = 0; i < num_cscd; i++) { + status = ice_aq_program_acl_entry(hw, entry_tcam + i, idx, &buf, + NULL); + if (status) + ice_debug(hw, ICE_DBG_ACL, "AQ program ACL entry failed status: %d\n", + status); + } + + memset(&act_buf, 0, sizeof(act_buf)); + + for_each_set_bit(i, scen->act_mem_bitmap, ICE_AQC_MAX_ACTION_MEMORIES) { + struct ice_acl_act_mem *mem = &hw->acl_tbl->act_mems[i]; + + if (mem->member_of_tcam >= entry_tcam && + mem->member_of_tcam < entry_tcam + num_cscd) { + /* Invalidate allocated action pairs */ + status = ice_aq_program_actpair(hw, i, idx, &act_buf, + NULL); + if (status) + ice_debug(hw, ICE_DBG_ACL, "program actpair failed status: %d\n", + status); + } + } + + ice_acl_scen_free_entry_idx(scen, entry_idx); + + return status; +} + +/** + * ice_is_acl_empty - Check if any entry exists + * @hw: pointer to the HW struct + */ +bool ice_is_acl_empty(struct ice_hw *hw) +{ + struct ice_acl_scen *scen, *tmp_scen; + + if (!hw->acl_tbl) + return false; + + list_for_each_entry_safe(scen, tmp_scen, &hw->acl_tbl->scens, + list_entry) + if (!bitmap_empty(scen->entry_bitmap, ICE_MAX_ACL_TCAM_ENTRY)) + return false; + + return true; +} diff --git a/drivers/net/ethernet/intel/ice/ice_acl_main.c b/drivers/net/ethernet/intel/ice/ice_acl_main.c new file mode 100644 index 000000000000..81eb056f1b32 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_acl_main.c @@ -0,0 +1,323 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
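A hypothetical caller of the entry API defined above (ice_acl_add_entry()/ice_acl_rem_entry()), shown only as a sketch: it assumes the driver-internal headers are in scope, that the scenario was created with ice_acl_create_scen(), and that the caller supplies a valid priority and one formatted action. As the kernel-doc notes, the keys and inverts buffers must be exactly scen->width bytes.

#include <linux/slab.h>
#include "ice.h"	/* assumption: driver-internal header pulling in the ACL declarations */
#include "ice_lib.h"	/* for ice_status_to_errno(), as used by ice_acl_main.c */

static int example_acl_add_then_remove(struct ice_hw *hw,
					struct ice_acl_scen *scen,
					enum ice_acl_entry_prio prio,
					struct ice_acl_act_entry *act)
{
	enum ice_status status;
	u16 entry_idx;
	u8 *keys, *inverts;

	/* both buffers must be scen->width bytes long */
	keys = kzalloc(scen->width, GFP_KERNEL);
	inverts = kzalloc(scen->width, GFP_KERNEL);
	if (!keys || !inverts) {
		kfree(keys);
		kfree(inverts);
		return -ENOMEM;
	}

	/* ... fill keys[] with the match value and inverts[] with the
	 * corresponding key-invert pattern for the matched fields ...
	 */

	status = ice_acl_add_entry(hw, scen, prio, keys, inverts,
				   act, 1, &entry_idx);
	if (!status)
		/* entry_idx is scenario-relative; the same index undoes
		 * the programming
		 */
		status = ice_acl_rem_entry(hw, scen, entry_idx);

	kfree(keys);
	kfree(inverts);
	return status ? ice_status_to_errno(status) : 0;
}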
*/ + +/* ACL support for ice */ + + +#include "ice.h" +#include "ice_lib.h" +#include "ice_flow.h" +#include "ice_fdir.h" + +/* Default ACL Action priority */ +#define ICE_ACL_ACT_PRIO 3 + +/* Number of action */ +#define ICE_ACL_NUM_ACT 1 + +/** + * ice_acl_set_ip4_addr_seg + * @seg: flow segment for programming + * + * Set the IPv4 source and destination address mask for the given flow segment + */ +static void ice_acl_set_ip4_addr_seg(struct ice_flow_seg_info *seg) +{ + u16 val_loc, mask_loc; + + /* IP source address */ + val_loc = offsetof(struct ice_fdir_fltr, ip.v4.src_ip); + mask_loc = offsetof(struct ice_fdir_fltr, mask.v4.src_ip); + + ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV4_SA, val_loc, + mask_loc, ICE_FLOW_FLD_OFF_INVAL, false); + + /* IP destination address */ + val_loc = offsetof(struct ice_fdir_fltr, ip.v4.dst_ip); + mask_loc = offsetof(struct ice_fdir_fltr, mask.v4.dst_ip); + + ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV4_DA, val_loc, + mask_loc, ICE_FLOW_FLD_OFF_INVAL, false); +} + +/** + * ice_acl_set_ip4_port_seg + * @seg: flow segment for programming + * @l4_proto: Layer 4 protocol to program + * + * Set the source and destination port for the given flow segment based on the + * provided layer 4 protocol + */ +static int +ice_acl_set_ip4_port_seg(struct ice_flow_seg_info *seg, + enum ice_flow_seg_hdr l4_proto) +{ + enum ice_flow_field src_port, dst_port; + u16 val_loc, mask_loc; + int err; + + err = ice_ntuple_l4_proto_to_port(l4_proto, &src_port, &dst_port); + if (err) + return err; + + /* Layer 4 source port */ + val_loc = offsetof(struct ice_fdir_fltr, ip.v4.src_port); + mask_loc = offsetof(struct ice_fdir_fltr, mask.v4.src_port); + + ice_flow_set_fld(seg, src_port, val_loc, mask_loc, + ICE_FLOW_FLD_OFF_INVAL, false); + + /* Layer 4 destination port */ + val_loc = offsetof(struct ice_fdir_fltr, ip.v4.dst_port); + mask_loc = offsetof(struct ice_fdir_fltr, mask.v4.dst_port); + + ice_flow_set_fld(seg, dst_port, val_loc, mask_loc, + ICE_FLOW_FLD_OFF_INVAL, false); + + return 0; +} + +/** + * ice_acl_set_ip4_seg + * @seg: flow segment for programming + * @tcp_ip4_spec: mask data from ethtool + * @l4_proto: Layer 4 protocol to program + * + * Set the mask data into the flow segment to be used to program HW + * table based on provided L4 protocol for IPv4 + */ +static int +ice_acl_set_ip4_seg(struct ice_flow_seg_info *seg, + struct ethtool_tcpip4_spec *tcp_ip4_spec, + enum ice_flow_seg_hdr l4_proto) +{ + int err; + + err = ice_ntuple_check_ip4_seg(tcp_ip4_spec); + if (err) + return err; + + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV4 | l4_proto); + ice_acl_set_ip4_addr_seg(seg); + + return ice_acl_set_ip4_port_seg(seg, l4_proto); +} + +/** + * ice_acl_set_ip4_usr_seg + * @seg: flow segment for programming + * @usr_ip4_spec: ethtool userdef packet offset + * + * Set the offset data into the flow segment to be used to program HW + * table for IPv4 + */ +static int +ice_acl_set_ip4_usr_seg(struct ice_flow_seg_info *seg, + struct ethtool_usrip4_spec *usr_ip4_spec) +{ + int err; + + err = ice_ntuple_check_ip4_usr_seg(usr_ip4_spec); + if (err) + return err; + + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV4); + ice_acl_set_ip4_addr_seg(seg); + + return 0; +} + + +/** + * ice_acl_check_input_set - Checks that a given ACL input set is valid + * @pf: ice PF structure + * @fsp: pointer to ethtool Rx flow specification + * + * Returns 0 on success and negative values for failure + */ +static int +ice_acl_check_input_set(struct ice_pf *pf, struct ethtool_rx_flow_spec *fsp) +{ + 
struct ice_fd_hw_prof *hw_prof = NULL; + struct ice_flow_prof *prof = NULL; + struct ice_flow_seg_info *old_seg; + struct ice_flow_seg_info *seg; + enum ice_fltr_ptype fltr_type; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + struct device *dev; + int err; + + dev = ice_pf_to_dev(pf); + seg = devm_kzalloc(dev, sizeof(*seg), GFP_KERNEL); + if (!seg) + return -ENOMEM; + + switch (fsp->flow_type & ~FLOW_EXT) { + case TCP_V4_FLOW: + err = ice_acl_set_ip4_seg(seg, &fsp->m_u.tcp_ip4_spec, + ICE_FLOW_SEG_HDR_TCP); + break; + case UDP_V4_FLOW: + err = ice_acl_set_ip4_seg(seg, &fsp->m_u.tcp_ip4_spec, + ICE_FLOW_SEG_HDR_UDP); + break; + case SCTP_V4_FLOW: + err = ice_acl_set_ip4_seg(seg, &fsp->m_u.tcp_ip4_spec, + ICE_FLOW_SEG_HDR_SCTP); + break; + case IPV4_USER_FLOW: + err = ice_acl_set_ip4_usr_seg(seg, &fsp->m_u.usr_ip4_spec); + break; + default: + err = -EOPNOTSUPP; + } + if (err) + goto err_exit; + + fltr_type = ice_ethtool_flow_to_fltr(fsp->flow_type & ~FLOW_EXT); + + if (!hw->acl_prof) { + hw->acl_prof = devm_kcalloc(dev, ICE_FLTR_PTYPE_MAX, + sizeof(*hw->acl_prof), GFP_KERNEL); + if (!hw->acl_prof) { + err = -ENOMEM; + goto err_exit; + } + } + if (!hw->acl_prof[fltr_type]) { + hw->acl_prof[fltr_type] = devm_kzalloc(dev, + sizeof(**hw->acl_prof), + GFP_KERNEL); + if (!hw->acl_prof[fltr_type]) { + err = -ENOMEM; + goto err_acl_prof_exit; + } + hw->acl_prof[fltr_type]->cnt = 0; + } + + hw_prof = hw->acl_prof[fltr_type]; + old_seg = hw_prof->fdir_seg[0]; + if (old_seg) { + /* This flow_type already has an input set. + * If it matches the requested input set then we are + * done. If it's different then it's an error. + */ + if (!memcmp(old_seg, seg, sizeof(*seg))) { + devm_kfree(dev, seg); + return 0; + } + + err = -EINVAL; + goto err_acl_prof_flow_exit; + } + + /* Adding a profile for the given flow specification with no + * actions (NULL) and zero actions 0. 
+ */ + status = ice_flow_add_prof(hw, ICE_BLK_ACL, ICE_FLOW_RX, fltr_type, + seg, 1, NULL, 0, &prof); + if (status) { + err = ice_status_to_errno(status); + goto err_exit; + } + + hw_prof->fdir_seg[0] = seg; + return 0; + +err_acl_prof_flow_exit: + devm_kfree(dev, hw->acl_prof[fltr_type]); +err_acl_prof_exit: + devm_kfree(dev, hw->acl_prof); +err_exit: + devm_kfree(dev, seg); + + return err; +} + +/** + * ice_acl_add_rule_ethtool - Adds an ACL rule + * @vsi: pointer to target VSI + * @cmd: command to add or delete ACL rule + * + * Returns 0 on success and negative values for failure + */ +int ice_acl_add_rule_ethtool(struct ice_vsi *vsi, struct ethtool_rxnfc *cmd) +{ + struct ice_flow_action acts[ICE_ACL_NUM_ACT]; + struct ethtool_rx_flow_spec *fsp; + struct ice_fd_hw_prof *hw_prof; + struct ice_fdir_fltr *input; + enum ice_fltr_ptype flow; + enum ice_status status; + struct device *dev; + struct ice_pf *pf; + struct ice_hw *hw; + u64 entry_h = 0; + int act_cnt; + int ret; + + if (!vsi || !cmd) + return -EINVAL; + + pf = vsi->back; + hw = &pf->hw; + dev = ice_pf_to_dev(pf); + + fsp = (struct ethtool_rx_flow_spec *)&cmd->fs; + + ret = ice_acl_check_input_set(pf, fsp); + if (ret) + return ret; + + /* Add new rule */ + input = devm_kzalloc(dev, sizeof(*input), GFP_KERNEL); + if (!input) + return -ENOMEM; + + ret = ice_ntuple_set_input_set(vsi, ICE_BLK_ACL, fsp, input); + if (ret) + goto free_input; + + memset(&acts, 0, sizeof(acts)); + act_cnt = 1; + if (fsp->ring_cookie == RX_CLS_FLOW_DISC) { + acts[0].type = ICE_FLOW_ACT_DROP; + acts[0].data.acl_act.mdid = ICE_MDID_RX_PKT_DROP; + acts[0].data.acl_act.prio = ICE_ACL_ACT_PRIO; + acts[0].data.acl_act.value = cpu_to_le16(0x1); + } else { + acts[0].type = ICE_FLOW_ACT_FWD_QUEUE; + acts[0].data.acl_act.mdid = ICE_MDID_RX_DST_Q; + acts[0].data.acl_act.prio = ICE_ACL_ACT_PRIO; + acts[0].data.acl_act.value = cpu_to_le16(input->q_index); + } + + flow = ice_ethtool_flow_to_fltr(fsp->flow_type & ~FLOW_EXT); + hw_prof = hw->acl_prof[flow]; + + status = ice_flow_add_entry(hw, ICE_BLK_ACL, flow, fsp->location, + vsi->idx, ICE_FLOW_PRIO_NORMAL, input, acts, + act_cnt, &entry_h); + if (status) { + dev_err(dev, "Could not add flow entry %d\n", flow); + ret = ice_status_to_errno(status); + goto free_input; + } + + if (!hw_prof->cnt || vsi->idx != hw_prof->vsi_h[hw_prof->cnt - 1]) { + hw_prof->vsi_h[hw_prof->cnt] = vsi->idx; + hw_prof->entry_h[hw_prof->cnt++][0] = entry_h; + } + + input->acl_fltr = true; + /* input struct is added to the HW filter list */ + ice_ntuple_update_list_entry(pf, input, fsp->location); + + return 0; + +free_input: + devm_kfree(dev, input); + + return ret; +} diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index 023e3d2fee5f..c4fc3ee344d5 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_ADMINQ_CMD_H_ #define _ICE_ADMINQ_CMD_H_ @@ -8,10 +8,12 @@ * descriptor format. It is shared between Firmware and Software. 
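Returning to ice_acl_add_rule_ethtool() above: in practice the ethtool_rxnfc argument arrives from the ethtool ntuple ioctl path. The sketch below builds a comparable request by hand purely for illustration; the address, mask and rule location are arbitrary example values, and the mask encoding is simply whatever the standard ethtool flow-spec ioctl delivers to the driver.

#include <linux/ethtool.h>

/* Illustrative only: a TCP/IPv4 drop rule shaped the way the ethtool
 * ioctl path would hand it to ice_acl_add_rule_ethtool().
 */
static int example_acl_drop_rule(struct ice_vsi *vsi)
{
	struct ethtool_rxnfc cmd = { .cmd = ETHTOOL_SRXCLSRLINS };
	struct ethtool_rx_flow_spec *fsp = &cmd.fs;

	fsp->flow_type = TCP_V4_FLOW;
	fsp->h_u.tcp_ip4_spec.ip4src = cpu_to_be32(0xC0A80101);	/* 192.168.1.1 */
	fsp->m_u.tcp_ip4_spec.ip4src = cpu_to_be32(0xFFFFFFFF);	/* example mask */
	fsp->ring_cookie = RX_CLS_FLOW_DISC;	/* drop instead of forwarding */
	fsp->location = 0;			/* rule slot */

	return ice_acl_add_rule_ethtool(vsi, &cmd);
}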
*/ + #define ICE_MAX_VSI 768 #define ICE_AQC_TOPO_MAX_LEVEL_NUM 0x9 #define ICE_AQ_SET_MAC_FRAME_SIZE_MAX 9728 + struct ice_aqc_generic { __le32 param0; __le32 param1; @@ -19,6 +21,7 @@ struct ice_aqc_generic { __le32 addr_low; }; + /* Get version (direct 0x0001) */ struct ice_aqc_get_ver { __le32 rom_ver; @@ -33,6 +36,7 @@ struct ice_aqc_get_ver { u8 api_patch; }; + /* Send driver version (indirect 0x0002) */ struct ice_aqc_driver_ver { u8 major_ver; @@ -44,6 +48,7 @@ struct ice_aqc_driver_ver { __le32 addr_low; }; + /* Queue Shutdown (direct 0x0003) */ struct ice_aqc_q_shutdown { u8 driver_unloading; @@ -51,6 +56,17 @@ struct ice_aqc_q_shutdown { u8 reserved[15]; }; + + +/* Get Expanded Error Code (0x0005, direct) */ +struct ice_aqc_get_exp_err { + __le32 reason; +#define ICE_AQC_EXPANDED_ERROR_NOT_PROVIDED 0xFFFFFFFF + __le32 identifier; + u8 rsvd[8]; +}; + + /* Request resource ownership (direct 0x0008) * Release resource ownership (direct 0x0009) */ @@ -83,6 +99,7 @@ struct ice_aqc_req_res { u8 reserved[2]; }; + /* Get function capabilities (indirect 0x000A) * Get device capabilities (indirect 0x000B) */ @@ -95,19 +112,54 @@ struct ice_aqc_list_caps { __le32 addr_low; }; + /* Device/Function buffer entry, repeated per reported capability */ struct ice_aqc_list_caps_elem { __le16 cap; +#define ICE_AQC_CAPS_SWITCHING_MODE 0x0001 +#define ICE_AQC_CAPS_MANAGEABILITY_MODE 0x0002 +#define ICE_AQC_CAPS_OS2BMC 0x0004 #define ICE_AQC_CAPS_VALID_FUNCTIONS 0x0005 +#define ICE_AQC_MAX_VALID_FUNCTIONS 0x8 +#define ICE_AQC_CAPS_ALTERNATE_RAM 0x0006 +#define ICE_AQC_CAPS_WOL_PROXY 0x0008 #define ICE_AQC_CAPS_SRIOV 0x0012 #define ICE_AQC_CAPS_VF 0x0013 +#define ICE_AQC_CAPS_VMDQ 0x0014 +#define ICE_AQC_CAPS_802_1QBG 0x0015 +#define ICE_AQC_CAPS_802_1BR 0x0016 #define ICE_AQC_CAPS_VSI 0x0017 #define ICE_AQC_CAPS_DCB 0x0018 +#define ICE_AQC_CAPS_RSVD 0x0021 +#define ICE_AQC_CAPS_ISCSI 0x0022 #define ICE_AQC_CAPS_RSS 0x0040 #define ICE_AQC_CAPS_RXQS 0x0041 #define ICE_AQC_CAPS_TXQS 0x0042 #define ICE_AQC_CAPS_MSIX 0x0043 +#define ICE_AQC_CAPS_FD 0x0045 +#define ICE_AQC_CAPS_1588 0x0046 #define ICE_AQC_CAPS_MAX_MTU 0x0047 +#define ICE_AQC_CAPS_NVM_VER 0x0048 +#define ICE_AQC_CAPS_PENDING_NVM_VER 0x0049 +#define ICE_AQC_CAPS_OROM_VER 0x004A +#define ICE_AQC_CAPS_PENDING_OROM_VER 0x004B +#define ICE_AQC_CAPS_NET_VER 0x004C +#define ICE_AQC_CAPS_PENDING_NET_VER 0x004D +#define ICE_AQC_CAPS_CEM 0x00F2 +#define ICE_AQC_CAPS_IWARP 0x0051 +#define ICE_AQC_CAPS_LED 0x0061 +#define ICE_AQC_CAPS_SDP 0x0062 +#define ICE_AQC_CAPS_WR_CSR_PROT 0x0064 +#define ICE_AQC_CAPS_LOGI_TO_PHYSI_PORT_MAP 0x0073 +#define ICE_AQC_CAPS_SKU 0x0074 +#define ICE_AQC_CAPS_PORT_MAP 0x0075 +#define ICE_AQC_CAPS_PCIE_RESET_AVOIDANCE 0x0076 +#define ICE_AQC_CAPS_POST_UPDATE_RESET_RESTRICT 0x0077 +#define ICE_AQC_CAPS_NVM_MGMT 0x0080 +#define ICE_AQC_CAPS_EXT_TOPO_DEV_IMG0 0x0081 +#define ICE_AQC_CAPS_EXT_TOPO_DEV_IMG1 0x0082 +#define ICE_AQC_CAPS_EXT_TOPO_DEV_IMG2 0x0083 +#define ICE_AQC_CAPS_EXT_TOPO_DEV_IMG3 0x0084 u8 major_ver; u8 minor_ver; @@ -121,6 +173,7 @@ struct ice_aqc_list_caps_elem { __le64 rsvd2; }; + /* Manage MAC address, read command - indirect (0x0107) * This struct is also used for the response */ @@ -130,6 +183,8 @@ struct ice_aqc_manage_mac_read { #define ICE_AQC_MAN_MAC_SAN_ADDR_VALID BIT(5) #define ICE_AQC_MAN_MAC_PORT_ADDR_VALID BIT(6) #define ICE_AQC_MAN_MAC_WOL_ADDR_VALID BIT(7) +#define ICE_AQC_MAN_MAC_MC_MAG_EN BIT(8) +#define ICE_AQC_MAN_MAC_WOL_PRESERVE_ON_PFR BIT(9) #define ICE_AQC_MAN_MAC_READ_S 4 #define 
ICE_AQC_MAN_MAC_READ_M (0xF << ICE_AQC_MAN_MAC_READ_S) u8 rsvd[2]; @@ -139,6 +194,7 @@ struct ice_aqc_manage_mac_read { __le32 addr_low; }; + /* Response buffer format for manage MAC read command */ struct ice_aqc_manage_mac_read_resp { u8 lport_num; @@ -148,6 +204,7 @@ struct ice_aqc_manage_mac_read_resp { u8 mac_addr[ETH_ALEN]; }; + /* Manage MAC address, write command - direct (0x0108) */ struct ice_aqc_manage_mac_write { u8 rsvd; @@ -155,17 +212,16 @@ struct ice_aqc_manage_mac_write { #define ICE_AQC_MAN_MAC_WR_MC_MAG_EN BIT(0) #define ICE_AQC_MAN_MAC_WR_WOL_LAA_PFR_KEEP BIT(1) #define ICE_AQC_MAN_MAC_WR_S 6 -#define ICE_AQC_MAN_MAC_WR_M (3 << ICE_AQC_MAN_MAC_WR_S) +#define ICE_AQC_MAN_MAC_WR_M ICE_M(3, ICE_AQC_MAN_MAC_WR_S) #define ICE_AQC_MAN_MAC_UPDATE_LAA 0 -#define ICE_AQC_MAN_MAC_UPDATE_LAA_WOL (BIT(0) << ICE_AQC_MAN_MAC_WR_S) - /* High 16 bits of MAC address in big endian order */ - __be16 sah; - /* Low 32 bits of MAC address in big endian order */ - __be32 sal; +#define ICE_AQC_MAN_MAC_UPDATE_LAA_WOL BIT(ICE_AQC_MAN_MAC_WR_S) + /* byte stream in network order */ + u8 mac_addr[ETH_ALEN]; __le32 addr_high; __le32 addr_low; }; + /* Clear PXE Command and response (direct 0x0110) */ struct ice_aqc_clear_pxe { u8 rx_cnt; @@ -173,6 +229,15 @@ struct ice_aqc_clear_pxe { u8 reserved[15]; }; + +/* Configure No-Drop Policy Command (direct 0x0112) */ +struct ice_aqc_config_no_drop_policy { + u8 opts; +#define ICE_AQC_FORCE_NO_DROP BIT(0) + u8 rsvd[15]; +}; + + /* Get switch configuration (0x0200) */ struct ice_aqc_get_sw_cfg { /* Reserved for command and copy of request flags for response */ @@ -190,6 +255,7 @@ struct ice_aqc_get_sw_cfg { __le32 addr_low; }; + /* Each entry in the response buffer is of the following type: */ struct ice_aqc_get_sw_cfg_resp_elem { /* VSI/Port Number */ @@ -216,13 +282,30 @@ struct ice_aqc_get_sw_cfg_resp_elem { #define ICE_AQC_GET_SW_CONF_RESP_IS_VF BIT(15) }; -/* The response buffer is as follows. Note that the length of the - * elements array varies with the length of the command response. 
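Stepping back to the manage MAC write change above: the command now carries the MAC as a plain byte stream in network order instead of the old sah/sal split, so a caller can copy the address straight in. A minimal sketch, modeled on ice_aq_manage_mac_write() in ice_common.c (the "flags" field name is assumed from that layout, since the hunk context elides it):

#include <linux/etherdevice.h>

static void example_fill_mac_write(struct ice_aqc_manage_mac_write *cmd,
				   const u8 *new_mac)
{
	cmd->flags = ICE_AQC_MAN_MAC_UPDATE_LAA_WOL;
	/* byte stream in network order: a straight copy replaces the old
	 * big-endian sah/sal packing
	 */
	ether_addr_copy(cmd->mac_addr, new_mac);
}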
- */ -struct ice_aqc_get_sw_cfg_resp { - struct ice_aqc_get_sw_cfg_resp_elem elements[1]; + + +/* Set Port parameters, (direct, 0x0203) */ +struct ice_aqc_set_port_params { + __le16 cmd_flags; +#define ICE_AQC_SET_P_PARAMS_SAVE_BAD_PACKETS BIT(0) +#define ICE_AQC_SET_P_PARAMS_PAD_SHORT_PACKETS BIT(1) +#define ICE_AQC_SET_P_PARAMS_DOUBLE_VLAN_ENA BIT(2) + __le16 bad_frame_vsi; +#define ICE_AQC_SET_P_PARAMS_VSI_S 0 +#define ICE_AQC_SET_P_PARAMS_VSI_M (0x3FF << ICE_AQC_SET_P_PARAMS_VSI_S) +#define ICE_AQC_SET_P_PARAMS_VSI_VALID BIT(15) + __le16 swid; +#define ICE_AQC_SET_P_PARAMS_SWID_S 0 +#define ICE_AQC_SET_P_PARAMS_SWID_M (0xFF << ICE_AQC_SET_P_PARAMS_SWID_S) +#define ICE_AQC_SET_P_PARAMS_LOGI_PORT_ID_S 8 +#define ICE_AQC_SET_P_PARAMS_LOGI_PORT_ID_M \ + (0x3F << ICE_AQC_SET_P_PARAMS_LOGI_PORT_ID_S) +#define ICE_AQC_SET_P_PARAMS_IS_LOGI_PORT BIT(14) +#define ICE_AQC_SET_P_PARAMS_SWID_VALID BIT(15) + u8 reserved[10]; }; + /* These resource type defines are used for all switch resource * commands where a resource type is required, such as: * Get Resource Allocation command (indirect 0x0204) @@ -230,8 +313,64 @@ struct ice_aqc_get_sw_cfg_resp { * Free Resources command (indirect 0x0209) * Get Allocated Resource Descriptors Command (indirect 0x020A) */ +#define ICE_AQC_RES_TYPE_VEB_COUNTER 0x00 +#define ICE_AQC_RES_TYPE_VLAN_COUNTER 0x01 +#define ICE_AQC_RES_TYPE_MIRROR_RULE 0x02 #define ICE_AQC_RES_TYPE_VSI_LIST_REP 0x03 #define ICE_AQC_RES_TYPE_VSI_LIST_PRUNE 0x04 +#define ICE_AQC_RES_TYPE_RECIPE 0x05 +#define ICE_AQC_RES_TYPE_PROFILE 0x06 +#define ICE_AQC_RES_TYPE_SWID 0x07 +#define ICE_AQC_RES_TYPE_VSI 0x08 +#define ICE_AQC_RES_TYPE_FLU 0x09 +#define ICE_AQC_RES_TYPE_WIDE_TABLE_1 0x0A +#define ICE_AQC_RES_TYPE_WIDE_TABLE_2 0x0B +#define ICE_AQC_RES_TYPE_WIDE_TABLE_4 0x0C +#define ICE_AQC_RES_TYPE_GLOBAL_RSS_HASH 0x20 +#define ICE_AQC_RES_TYPE_FDIR_COUNTER_BLOCK 0x21 +#define ICE_AQC_RES_TYPE_FDIR_GUARANTEED_ENTRIES 0x22 +#define ICE_AQC_RES_TYPE_FDIR_SHARED_ENTRIES 0x23 +#define ICE_AQC_RES_TYPE_FLEX_DESC_PROG 0x30 +#define ICE_AQC_RES_TYPE_SWITCH_PROF_BLDR_PROFID 0x48 +#define ICE_AQC_RES_TYPE_SWITCH_PROF_BLDR_TCAM 0x49 +#define ICE_AQC_RES_TYPE_ACL_PROF_BLDR_PROFID 0x50 +#define ICE_AQC_RES_TYPE_ACL_PROF_BLDR_TCAM 0x51 +#define ICE_AQC_RES_TYPE_FD_PROF_BLDR_PROFID 0x58 +#define ICE_AQC_RES_TYPE_FD_PROF_BLDR_TCAM 0x59 +#define ICE_AQC_RES_TYPE_HASH_PROF_BLDR_PROFID 0x60 +#define ICE_AQC_RES_TYPE_HASH_PROF_BLDR_TCAM 0x61 +/* Resource types 0x62-67 are reserved for Hash profile builder */ +#define ICE_AQC_RES_TYPE_QHASH_PROF_BLDR_PROFID 0x68 +#define ICE_AQC_RES_TYPE_QHASH_PROF_BLDR_TCAM 0x69 + +#define ICE_AQC_RES_TYPE_FLAG_SHARED BIT(7) +#define ICE_AQC_RES_TYPE_FLAG_SCAN_BOTTOM BIT(12) +#define ICE_AQC_RES_TYPE_FLAG_IGNORE_INDEX BIT(13) + +#define ICE_AQC_RES_TYPE_FLAG_DEDICATED 0x00 + +#define ICE_AQC_RES_TYPE_S 0 +#define ICE_AQC_RES_TYPE_M (0x07F << ICE_AQC_RES_TYPE_S) + +/* Get Resource Allocation command (indirect 0x0204) */ +struct ice_aqc_get_res_alloc { + __le16 resp_elem_num; /* Used in response, reserved in command */ + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Get Resource Allocation Response Buffer per response */ +struct ice_aqc_get_res_resp_elem { + __le16 res_type; /* Types defined above cmd 0x0204 */ + __le16 total_capacity; /* Resources available to all PF's */ + __le16 total_function; /* Resources allocated for a PF */ + __le16 total_shared; /* Resources allocated as shared */ + __le16 total_free; /* Resources un-allocated/not reserved by any PF 
*/ +}; + + /* Allocate Resources command (indirect 0x0208) * Free Resources command (indirect 0x0209) @@ -243,6 +382,7 @@ struct ice_aqc_alloc_free_res_cmd { __le32 addr_low; }; + /* Resource descriptor */ struct ice_aqc_res_elem { union { @@ -251,18 +391,75 @@ struct ice_aqc_res_elem { } e; }; + /* Buffer for Allocate/Free Resources commands */ struct ice_aqc_alloc_free_res_elem { __le16 res_type; /* Types defined above cmd 0x0204 */ -#define ICE_AQC_RES_TYPE_SHARED_S 7 -#define ICE_AQC_RES_TYPE_SHARED_M (0x1 << ICE_AQC_RES_TYPE_SHARED_S) #define ICE_AQC_RES_TYPE_VSI_PRUNE_LIST_S 8 #define ICE_AQC_RES_TYPE_VSI_PRUNE_LIST_M \ (0xF << ICE_AQC_RES_TYPE_VSI_PRUNE_LIST_S) __le16 num_elems; - struct ice_aqc_res_elem elem[1]; + struct ice_aqc_res_elem elem[]; +}; + + +/* Get Allocated Resource Descriptors Command (indirect 0x020A) */ +struct ice_aqc_get_allocd_res_desc { + union { + struct { + __le16 res; /* Types defined above cmd 0x0204 */ + __le16 first_desc; + __le32 reserved; + } cmd; + struct { + __le16 res; + __le16 next_desc; + __le16 num_desc; + __le16 reserved; + } resp; + } ops; + __le32 addr_high; + __le32 addr_low; +}; + + + +/* Request buffer for Set VLAN Mode AQ command (indirect 0x020C) */ +struct ice_aqc_set_vlan_mode { + u8 reserved; + u8 l2tag_prio_tagging; +#define ICE_AQ_VLAN_PRIO_TAG_S 0 +#define ICE_AQ_VLAN_PRIO_TAG_M (0x7 << ICE_AQ_VLAN_PRIO_TAG_S) +#define ICE_AQ_VLAN_PRIO_TAG_NOT_SUPPORTED 0x0 +#define ICE_AQ_VLAN_PRIO_TAG_STAG 0x1 +#define ICE_AQ_VLAN_PRIO_TAG_OUTER_CTAG 0x2 +#define ICE_AQ_VLAN_PRIO_TAG_OUTER_VLAN 0x3 +#define ICE_AQ_VLAN_PRIO_TAG_INNER_CTAG 0x4 +#define ICE_AQ_VLAN_PRIO_TAG_MAX 0x4 +#define ICE_AQ_VLAN_PRIO_TAG_ERROR 0x7 + u8 l2tag_reserved[64]; + u8 rdma_packet; +#define ICE_AQ_VLAN_RDMA_TAG_S 0 +#define ICE_AQ_VLAN_RDMA_TAG_M (0x3F << ICE_AQ_VLAN_RDMA_TAG_S) +#define ICE_AQ_SVM_VLAN_RDMA_PKT_FLAG_SETTING 0x10 +#define ICE_AQ_DVM_VLAN_RDMA_PKT_FLAG_SETTING 0x1A + u8 rdma_reserved[2]; + u8 mng_vlan_prot_id; +#define ICE_AQ_VLAN_MNG_PROTOCOL_ID_OUTER 0x10 +#define ICE_AQ_VLAN_MNG_PROTOCOL_ID_INNER 0x11 + u8 prot_id_reserved[30]; +}; + + +/* Response buffer for Get VLAN Mode AQ command (indirect 0x020D) */ +struct ice_aqc_get_vlan_mode { + u8 vlan_mode; +#define ICE_AQ_VLAN_MODE_DVM_ENA BIT(0) + u8 l2tag_prio_tagging; + u8 reserved[98]; }; + /* Add VSI (indirect 0x0210) * Update VSI (indirect 0x0211) * Get VSI (indirect 0x0212) @@ -288,6 +485,7 @@ struct ice_aqc_add_get_update_free_vsi { __le32 addr_low; }; + /* Response descriptor for: * Add VSI (indirect 0x0210) * Update VSI (indirect 0x0211) @@ -302,6 +500,21 @@ struct ice_aqc_add_update_free_vsi_resp { __le32 addr_low; }; + +struct ice_aqc_get_vsi_resp { + __le16 vsi_num; + u8 vf_id; + /* The vsi_flags field uses the ICE_AQ_VSI_TYPE_* defines for values. + * These are found above in struct ice_aqc_add_get_update_free_vsi. 
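The elem[1] -> elem[] conversion above turns the trailing array into a true flexible-array member, which pairs naturally with struct_size() from <linux/overflow.h> when sizing the indirect buffer. A minimal sketch of an allocation helper (hypothetical, not part of the patch):

#include <linux/overflow.h>
#include <linux/slab.h>

static struct ice_aqc_alloc_free_res_elem *
example_alloc_res_buf(u16 num_elems, u16 *buf_len)
{
	struct ice_aqc_alloc_free_res_elem *buf;

	/* header plus num_elems trailing ice_aqc_res_elem entries; the
	 * overflow check is folded into struct_size()
	 */
	*buf_len = struct_size(buf, elem, num_elems);
	buf = kzalloc(*buf_len, GFP_KERNEL);
	if (buf)
		buf->num_elems = cpu_to_le16(num_elems);
	return buf;
}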
+ */ + u8 vsi_flags; + __le16 vsi_used; + __le16 vsi_free; + __le32 addr_high; + __le32 addr_low; +}; + + struct ice_aqc_vsi_props { __le16 valid_sections; #define ICE_AQ_VSI_PROP_SW_VALID BIT(0) @@ -313,6 +526,7 @@ struct ice_aqc_vsi_props { #define ICE_AQ_VSI_PROP_RXQ_MAP_VALID BIT(6) #define ICE_AQ_VSI_PROP_Q_OPT_VALID BIT(7) #define ICE_AQ_VSI_PROP_OUTER_UP_VALID BIT(8) +#define ICE_AQ_VSI_PROP_ACL_VALID BIT(10) #define ICE_AQ_VSI_PROP_FLOW_DIR_VALID BIT(11) #define ICE_AQ_VSI_PROP_PASID_VALID BIT(12) /* switch section */ @@ -323,141 +537,328 @@ struct ice_aqc_vsi_props { #define ICE_AQ_VSI_SW_FLAG_SRC_PRUNE BIT(7) u8 sw_flags2; #define ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_S 0 -#define ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_M \ - (0xF << ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_S) +#define ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_M (0xF << ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_S) #define ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA BIT(0) #define ICE_AQ_VSI_SW_FLAG_LAN_ENA BIT(4) u8 veb_stat_id; #define ICE_AQ_VSI_SW_VEB_STAT_ID_S 0 -#define ICE_AQ_VSI_SW_VEB_STAT_ID_M (0x1F << ICE_AQ_VSI_SW_VEB_STAT_ID_S) +#define ICE_AQ_VSI_SW_VEB_STAT_ID_M (0x1F << ICE_AQ_VSI_SW_VEB_STAT_ID_S) #define ICE_AQ_VSI_SW_VEB_STAT_ID_VALID BIT(5) /* security section */ u8 sec_flags; #define ICE_AQ_VSI_SEC_FLAG_ALLOW_DEST_OVRD BIT(0) #define ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF BIT(2) -#define ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S 4 -#define ICE_AQ_VSI_SEC_TX_PRUNE_ENA_M (0xF << ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S) +#define ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S 4 +#define ICE_AQ_VSI_SEC_TX_PRUNE_ENA_M (0xF << ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S) #define ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA BIT(0) u8 sec_reserved; /* VLAN section */ - __le16 pvid; /* VLANS include priority bits */ - u8 pvlan_reserved[2]; - u8 vlan_flags; -#define ICE_AQ_VSI_VLAN_MODE_S 0 -#define ICE_AQ_VSI_VLAN_MODE_M (0x3 << ICE_AQ_VSI_VLAN_MODE_S) -#define ICE_AQ_VSI_VLAN_MODE_UNTAGGED 0x1 -#define ICE_AQ_VSI_VLAN_MODE_TAGGED 0x2 -#define ICE_AQ_VSI_VLAN_MODE_ALL 0x3 -#define ICE_AQ_VSI_PVLAN_INSERT_PVID BIT(2) -#define ICE_AQ_VSI_VLAN_EMOD_S 3 -#define ICE_AQ_VSI_VLAN_EMOD_M (0x3 << ICE_AQ_VSI_VLAN_EMOD_S) -#define ICE_AQ_VSI_VLAN_EMOD_STR_BOTH (0x0 << ICE_AQ_VSI_VLAN_EMOD_S) -#define ICE_AQ_VSI_VLAN_EMOD_STR_UP (0x1 << ICE_AQ_VSI_VLAN_EMOD_S) -#define ICE_AQ_VSI_VLAN_EMOD_STR (0x2 << ICE_AQ_VSI_VLAN_EMOD_S) -#define ICE_AQ_VSI_VLAN_EMOD_NOTHING (0x3 << ICE_AQ_VSI_VLAN_EMOD_S) - u8 pvlan_reserved2[3]; + __le16 port_based_inner_vlan; /* VLANS include priority bits */ + u8 inner_vlan_reserved[2]; + u8 inner_vlan_flags; +#define ICE_AQ_VSI_INNER_VLAN_TX_MODE_S 0 +#define ICE_AQ_VSI_INNER_VLAN_TX_MODE_M (0x3 << ICE_AQ_VSI_INNER_VLAN_TX_MODE_S) +#define ICE_AQ_VSI_INNER_VLAN_TX_MODE_ACCEPTUNTAGGED 0x1 +#define ICE_AQ_VSI_INNER_VLAN_TX_MODE_ACCEPTTAGGED 0x2 +#define ICE_AQ_VSI_INNER_VLAN_TX_MODE_ALL 0x3 +#define ICE_AQ_VSI_INNER_VLAN_INSERT_PVID BIT(2) +#define ICE_AQ_VSI_INNER_VLAN_EMODE_S 3 +#define ICE_AQ_VSI_INNER_VLAN_EMODE_M (0x3 << ICE_AQ_VSI_INNER_VLAN_EMODE_S) +#define ICE_AQ_VSI_INNER_VLAN_EMODE_STR_BOTH (0x0 << ICE_AQ_VSI_INNER_VLAN_EMODE_S) +#define ICE_AQ_VSI_INNER_VLAN_EMODE_STR_UP (0x1 << ICE_AQ_VSI_INNER_VLAN_EMODE_S) +#define ICE_AQ_VSI_INNER_VLAN_EMODE_STR (0x2 << ICE_AQ_VSI_INNER_VLAN_EMODE_S) +#define ICE_AQ_VSI_INNER_VLAN_EMODE_NOTHING (0x3 << ICE_AQ_VSI_INNER_VLAN_EMODE_S) +#define ICE_AQ_VSI_INNER_VLAN_BLOCK_TX_DESC BIT(5) + u8 inner_vlan_reserved2[3]; /* ingress egress up sections */ __le32 ingress_table; /* bitmap, 3 bits per up */ -#define ICE_AQ_VSI_UP_TABLE_UP0_S 0 -#define 
ICE_AQ_VSI_UP_TABLE_UP0_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP0_S) -#define ICE_AQ_VSI_UP_TABLE_UP1_S 3 -#define ICE_AQ_VSI_UP_TABLE_UP1_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP1_S) -#define ICE_AQ_VSI_UP_TABLE_UP2_S 6 -#define ICE_AQ_VSI_UP_TABLE_UP2_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP2_S) -#define ICE_AQ_VSI_UP_TABLE_UP3_S 9 -#define ICE_AQ_VSI_UP_TABLE_UP3_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP3_S) -#define ICE_AQ_VSI_UP_TABLE_UP4_S 12 -#define ICE_AQ_VSI_UP_TABLE_UP4_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP4_S) -#define ICE_AQ_VSI_UP_TABLE_UP5_S 15 -#define ICE_AQ_VSI_UP_TABLE_UP5_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP5_S) -#define ICE_AQ_VSI_UP_TABLE_UP6_S 18 -#define ICE_AQ_VSI_UP_TABLE_UP6_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP6_S) -#define ICE_AQ_VSI_UP_TABLE_UP7_S 21 -#define ICE_AQ_VSI_UP_TABLE_UP7_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP7_S) +#define ICE_AQ_VSI_UP_TABLE_UP0_S 0 +#define ICE_AQ_VSI_UP_TABLE_UP0_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP0_S) +#define ICE_AQ_VSI_UP_TABLE_UP1_S 3 +#define ICE_AQ_VSI_UP_TABLE_UP1_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP1_S) +#define ICE_AQ_VSI_UP_TABLE_UP2_S 6 +#define ICE_AQ_VSI_UP_TABLE_UP2_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP2_S) +#define ICE_AQ_VSI_UP_TABLE_UP3_S 9 +#define ICE_AQ_VSI_UP_TABLE_UP3_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP3_S) +#define ICE_AQ_VSI_UP_TABLE_UP4_S 12 +#define ICE_AQ_VSI_UP_TABLE_UP4_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP4_S) +#define ICE_AQ_VSI_UP_TABLE_UP5_S 15 +#define ICE_AQ_VSI_UP_TABLE_UP5_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP5_S) +#define ICE_AQ_VSI_UP_TABLE_UP6_S 18 +#define ICE_AQ_VSI_UP_TABLE_UP6_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP6_S) +#define ICE_AQ_VSI_UP_TABLE_UP7_S 21 +#define ICE_AQ_VSI_UP_TABLE_UP7_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP7_S) __le32 egress_table; /* same defines as for ingress table */ /* outer tags section */ - __le16 outer_tag; - u8 outer_tag_flags; -#define ICE_AQ_VSI_OUTER_TAG_MODE_S 0 -#define ICE_AQ_VSI_OUTER_TAG_MODE_M (0x3 << ICE_AQ_VSI_OUTER_TAG_MODE_S) -#define ICE_AQ_VSI_OUTER_TAG_NOTHING 0x0 -#define ICE_AQ_VSI_OUTER_TAG_REMOVE 0x1 -#define ICE_AQ_VSI_OUTER_TAG_COPY 0x2 -#define ICE_AQ_VSI_OUTER_TAG_TYPE_S 2 -#define ICE_AQ_VSI_OUTER_TAG_TYPE_M (0x3 << ICE_AQ_VSI_OUTER_TAG_TYPE_S) -#define ICE_AQ_VSI_OUTER_TAG_NONE 0x0 -#define ICE_AQ_VSI_OUTER_TAG_STAG 0x1 -#define ICE_AQ_VSI_OUTER_TAG_VLAN_8100 0x2 -#define ICE_AQ_VSI_OUTER_TAG_VLAN_9100 0x3 -#define ICE_AQ_VSI_OUTER_TAG_INSERT BIT(4) -#define ICE_AQ_VSI_OUTER_TAG_ACCEPT_HOST BIT(6) - u8 outer_tag_reserved; + __le16 port_based_outer_vlan; + u8 outer_vlan_flags; +#define ICE_AQ_VSI_OUTER_VLAN_EMODE_S 0 +#define ICE_AQ_VSI_OUTER_VLAN_EMODE_M (0x3 << ICE_AQ_VSI_OUTER_VLAN_EMODE_S) +#define ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW_BOTH 0x0 +#define ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW_UP 0x1 +#define ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW 0x2 +#define ICE_AQ_VSI_OUTER_VLAN_EMODE_NOTHING 0x3 +#define ICE_AQ_VSI_OUTER_TAG_TYPE_S 2 +#define ICE_AQ_VSI_OUTER_TAG_TYPE_M (0x3 << ICE_AQ_VSI_OUTER_TAG_TYPE_S) +#define ICE_AQ_VSI_OUTER_TAG_NONE 0x0 +#define ICE_AQ_VSI_OUTER_TAG_STAG 0x1 +#define ICE_AQ_VSI_OUTER_TAG_VLAN_8100 0x2 +#define ICE_AQ_VSI_OUTER_TAG_VLAN_9100 0x3 +#define ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT BIT(4) +#define ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S 5 +#define ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M (0x3 << ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) +#define ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ACCEPTUNTAGGED 0x1 +#define ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ACCEPTTAGGED 0x2 +#define ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ALL 0x3 +#define ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC BIT(7) + u8 outer_vlan_reserved; /* queue mapping section */ __le16 mapping_flags; 
-#define ICE_AQ_VSI_Q_MAP_CONTIG 0x0 -#define ICE_AQ_VSI_Q_MAP_NONCONTIG BIT(0) +#define ICE_AQ_VSI_Q_MAP_CONTIG 0x0 +#define ICE_AQ_VSI_Q_MAP_NONCONTIG BIT(0) __le16 q_mapping[16]; -#define ICE_AQ_VSI_Q_S 0 -#define ICE_AQ_VSI_Q_M (0x7FF << ICE_AQ_VSI_Q_S) +#define ICE_AQ_VSI_Q_S 0 +#define ICE_AQ_VSI_Q_M (0x7FF << ICE_AQ_VSI_Q_S) __le16 tc_mapping[8]; -#define ICE_AQ_VSI_TC_Q_OFFSET_S 0 -#define ICE_AQ_VSI_TC_Q_OFFSET_M (0x7FF << ICE_AQ_VSI_TC_Q_OFFSET_S) -#define ICE_AQ_VSI_TC_Q_NUM_S 11 -#define ICE_AQ_VSI_TC_Q_NUM_M (0xF << ICE_AQ_VSI_TC_Q_NUM_S) +#define ICE_AQ_VSI_TC_Q_OFFSET_S 0 +#define ICE_AQ_VSI_TC_Q_OFFSET_M (0x7FF << ICE_AQ_VSI_TC_Q_OFFSET_S) +#define ICE_AQ_VSI_TC_Q_NUM_S 11 +#define ICE_AQ_VSI_TC_Q_NUM_M (0xF << ICE_AQ_VSI_TC_Q_NUM_S) /* queueing option section */ u8 q_opt_rss; -#define ICE_AQ_VSI_Q_OPT_RSS_LUT_S 0 -#define ICE_AQ_VSI_Q_OPT_RSS_LUT_M (0x3 << ICE_AQ_VSI_Q_OPT_RSS_LUT_S) -#define ICE_AQ_VSI_Q_OPT_RSS_LUT_VSI 0x0 -#define ICE_AQ_VSI_Q_OPT_RSS_LUT_PF 0x2 -#define ICE_AQ_VSI_Q_OPT_RSS_LUT_GBL 0x3 -#define ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_S 2 -#define ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_M (0xF << ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_S) -#define ICE_AQ_VSI_Q_OPT_RSS_HASH_S 6 -#define ICE_AQ_VSI_Q_OPT_RSS_HASH_M (0x3 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) -#define ICE_AQ_VSI_Q_OPT_RSS_TPLZ (0x0 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) -#define ICE_AQ_VSI_Q_OPT_RSS_SYM_TPLZ (0x1 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) -#define ICE_AQ_VSI_Q_OPT_RSS_XOR (0x2 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) -#define ICE_AQ_VSI_Q_OPT_RSS_JHASH (0x3 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) +#define ICE_AQ_VSI_Q_OPT_RSS_LUT_S 0 +#define ICE_AQ_VSI_Q_OPT_RSS_LUT_M (0x3 << ICE_AQ_VSI_Q_OPT_RSS_LUT_S) +#define ICE_AQ_VSI_Q_OPT_RSS_LUT_VSI 0x0 +#define ICE_AQ_VSI_Q_OPT_RSS_LUT_PF 0x2 +#define ICE_AQ_VSI_Q_OPT_RSS_LUT_GBL 0x3 +#define ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_S 2 +#define ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_M (0xF << ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_S) +#define ICE_AQ_VSI_Q_OPT_RSS_HASH_S 6 +#define ICE_AQ_VSI_Q_OPT_RSS_HASH_M (0x3 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) +#define ICE_AQ_VSI_Q_OPT_RSS_TPLZ (0x0 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) +#define ICE_AQ_VSI_Q_OPT_RSS_SYM_TPLZ (0x1 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) +#define ICE_AQ_VSI_Q_OPT_RSS_XOR (0x2 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) +#define ICE_AQ_VSI_Q_OPT_RSS_JHASH (0x3 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) u8 q_opt_tc; -#define ICE_AQ_VSI_Q_OPT_TC_OVR_S 0 -#define ICE_AQ_VSI_Q_OPT_TC_OVR_M (0x1F << ICE_AQ_VSI_Q_OPT_TC_OVR_S) -#define ICE_AQ_VSI_Q_OPT_PROF_TC_OVR BIT(7) +#define ICE_AQ_VSI_Q_OPT_TC_OVR_S 0 +#define ICE_AQ_VSI_Q_OPT_TC_OVR_M (0x1F << ICE_AQ_VSI_Q_OPT_TC_OVR_S) +#define ICE_AQ_VSI_Q_OPT_PROF_TC_OVR BIT(7) u8 q_opt_flags; -#define ICE_AQ_VSI_Q_OPT_PE_FLTR_EN BIT(0) +#define ICE_AQ_VSI_Q_OPT_PE_FLTR_EN BIT(0) u8 q_opt_reserved[3]; /* outer up section */ __le32 outer_up_table; /* same structure and defines as ingress tbl */ - /* section 10 */ - __le16 sect_10_reserved; + /* ACL section */ + __le16 acl_def_act; +#define ICE_AQ_VSI_ACL_DEF_RX_PROF_S 0 +#define ICE_AQ_VSI_ACL_DEF_RX_PROF_M (0xF << ICE_AQ_VSI_ACL_DEF_RX_PROF_S) +#define ICE_AQ_VSI_ACL_DEF_RX_TABLE_S 4 +#define ICE_AQ_VSI_ACL_DEF_RX_TABLE_M (0xF << ICE_AQ_VSI_ACL_DEF_RX_TABLE_S) +#define ICE_AQ_VSI_ACL_DEF_TX_PROF_S 8 +#define ICE_AQ_VSI_ACL_DEF_TX_PROF_M (0xF << ICE_AQ_VSI_ACL_DEF_TX_PROF_S) +#define ICE_AQ_VSI_ACL_DEF_TX_TABLE_S 12 +#define ICE_AQ_VSI_ACL_DEF_TX_TABLE_M (0xF << ICE_AQ_VSI_ACL_DEF_TX_TABLE_S) /* flow director section */ __le16 fd_options; -#define ICE_AQ_VSI_FD_ENABLE BIT(0) -#define ICE_AQ_VSI_FD_TX_AUTO_ENABLE BIT(1) 
-#define ICE_AQ_VSI_FD_PROG_ENABLE BIT(3) +#define ICE_AQ_VSI_FD_ENABLE BIT(0) +#define ICE_AQ_VSI_FD_TX_AUTO_ENABLE BIT(1) +#define ICE_AQ_VSI_FD_PROG_ENABLE BIT(3) __le16 max_fd_fltr_dedicated; __le16 max_fd_fltr_shared; __le16 fd_def_q; -#define ICE_AQ_VSI_FD_DEF_Q_S 0 -#define ICE_AQ_VSI_FD_DEF_Q_M (0x7FF << ICE_AQ_VSI_FD_DEF_Q_S) -#define ICE_AQ_VSI_FD_DEF_GRP_S 12 -#define ICE_AQ_VSI_FD_DEF_GRP_M (0x7 << ICE_AQ_VSI_FD_DEF_GRP_S) +#define ICE_AQ_VSI_FD_DEF_Q_S 0 +#define ICE_AQ_VSI_FD_DEF_Q_M (0x7FF << ICE_AQ_VSI_FD_DEF_Q_S) +#define ICE_AQ_VSI_FD_DEF_GRP_S 12 +#define ICE_AQ_VSI_FD_DEF_GRP_M (0x7 << ICE_AQ_VSI_FD_DEF_GRP_S) __le16 fd_report_opt; -#define ICE_AQ_VSI_FD_REPORT_Q_S 0 -#define ICE_AQ_VSI_FD_REPORT_Q_M (0x7FF << ICE_AQ_VSI_FD_REPORT_Q_S) -#define ICE_AQ_VSI_FD_DEF_PRIORITY_S 12 -#define ICE_AQ_VSI_FD_DEF_PRIORITY_M (0x7 << ICE_AQ_VSI_FD_DEF_PRIORITY_S) -#define ICE_AQ_VSI_FD_DEF_DROP BIT(15) +#define ICE_AQ_VSI_FD_REPORT_Q_S 0 +#define ICE_AQ_VSI_FD_REPORT_Q_M (0x7FF << ICE_AQ_VSI_FD_REPORT_Q_S) +#define ICE_AQ_VSI_FD_DEF_PRIORITY_S 12 +#define ICE_AQ_VSI_FD_DEF_PRIORITY_M (0x7 << ICE_AQ_VSI_FD_DEF_PRIORITY_S) +#define ICE_AQ_VSI_FD_DEF_DROP BIT(15) /* PASID section */ __le32 pasid_id; -#define ICE_AQ_VSI_PASID_ID_S 0 -#define ICE_AQ_VSI_PASID_ID_M (0xFFFFF << ICE_AQ_VSI_PASID_ID_S) -#define ICE_AQ_VSI_PASID_ID_VALID BIT(31) +#define ICE_AQ_VSI_PASID_ID_S 0 +#define ICE_AQ_VSI_PASID_ID_M (0xFFFFF << ICE_AQ_VSI_PASID_ID_S) +#define ICE_AQ_VSI_PASID_ID_VALID BIT(31) u8 reserved[24]; }; + +/* Add/update mirror rule - direct (0x0260) */ +#define ICE_AQC_RULE_ID_VALID_S 7 +#define ICE_AQC_RULE_ID_VALID_M (0x1 << ICE_AQC_RULE_ID_VALID_S) +#define ICE_AQC_RULE_ID_S 0 +#define ICE_AQC_RULE_ID_M (0x3F << ICE_AQC_RULE_ID_S) + +/* Following defines to be used while processing caller specified mirror list + * of VSI indexes. + */ +/* Action: Byte.bit (1.7) + * 0 = Remove VSI from mirror rule + * 1 = Add VSI to mirror rule + */ +#define ICE_AQC_RULE_ACT_S 15 +#define ICE_AQC_RULE_ACT_M (0x1 << ICE_AQC_RULE_ACT_S) +/* Action: 1.2:0.0 = Mirrored VSI */ +#define ICE_AQC_RULE_MIRRORED_VSI_S 0 +#define ICE_AQC_RULE_MIRRORED_VSI_M (0x7FF << ICE_AQC_RULE_MIRRORED_VSI_S) + +/* This is to be used by add/update mirror rule Admin Queue command. + * In case of add mirror rule - if rule ID is specified as + * INVAL_MIRROR_RULE_ID, new rule ID is allocated from shared pool. + * If specified rule_id is valid, then it is used. If specified rule_id + * is in use then new mirroring rule is added. + */ +#define ICE_INVAL_MIRROR_RULE_ID 0xFFFF + +struct ice_aqc_add_update_mir_rule { + __le16 rule_id; + + __le16 rule_type; +#define ICE_AQC_RULE_TYPE_S 0 +#define ICE_AQC_RULE_TYPE_M (0x7 << ICE_AQC_RULE_TYPE_S) + /* VPORT ingress/egress */ +#define ICE_AQC_RULE_TYPE_VPORT_INGRESS 0x1 +#define ICE_AQC_RULE_TYPE_VPORT_EGRESS 0x2 + /* Physical port ingress mirroring. + * All traffic received by this port + */ +#define ICE_AQC_RULE_TYPE_PPORT_INGRESS 0x6 + /* Physical port egress mirroring. All traffic sent by this port */ +#define ICE_AQC_RULE_TYPE_PPORT_EGRESS 0x7 + + /* Number of mirrored entries. + * The values are in the command buffer + */ + __le16 num_entries; + + /* Destination VSI */ + __le16 dest; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Delete mirror rule - direct(0x0261) */ +struct ice_aqc_delete_mir_rule { + __le16 rule_id; + __le16 rsvd; + + /* Byte.bit: 20.0 = Keep allocation. 
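The VSI context fields above, like most of the admin queue structures in this header, pack sub-fields with paired shift/mask defines ("_S" for the shift, "_M" for the mask) inside little-endian words. A short sketch of reading and writing the flow director default queue with that convention (hypothetical helpers, not part of the patch):

static u16 example_get_fd_def_q(const struct ice_aqc_vsi_props *ctxt)
{
	u16 val = le16_to_cpu(ctxt->fd_def_q);

	return (val & ICE_AQ_VSI_FD_DEF_Q_M) >> ICE_AQ_VSI_FD_DEF_Q_S;
}

static void example_set_fd_def_q(struct ice_aqc_vsi_props *ctxt, u16 q)
{
	u16 val = le16_to_cpu(ctxt->fd_def_q);

	val &= ~ICE_AQ_VSI_FD_DEF_Q_M;
	val |= (q << ICE_AQ_VSI_FD_DEF_Q_S) & ICE_AQ_VSI_FD_DEF_Q_M;
	ctxt->fd_def_q = cpu_to_le16(val);
}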
If set VSI stays part of + * the PF allocated resources, otherwise it is returned to the + * shared pool + */ +#define ICE_AQC_FLAG_KEEP_ALLOCD_S 0 +#define ICE_AQC_FLAG_KEEP_ALLOCD_M (0x1 << ICE_AQC_FLAG_KEEP_ALLOCD_S) + __le16 flags; + + u8 reserved[10]; +}; + + +/* Set/Get storm config - (direct 0x0280, 0x0281) */ +/* This structure holds get storm configuration response and same structure + * is used to perform set_storm_cfg + */ +struct ice_aqc_storm_cfg { + __le32 bcast_thresh_size; + __le32 mcast_thresh_size; + /* Bit 18:0 - Traffic upper threshold size + * Bit 31:19 - Reserved + */ +#define ICE_AQ_THRESHOLD_S 0 +#define ICE_AQ_THRESHOLD_M (0x7FFFF << ICE_AQ_THRESHOLD_S) + + __le32 storm_ctrl_ctrl; + /* Bit 0: MDIPW - Drop Multicast packets in previous window + * Bit 1: MDICW - Drop multicast packets in current window + * Bit 2: BDIPW - Drop broadcast packets in previous window + * Bit 3: BDICW - Drop broadcast packets in current window + */ +#define ICE_AQ_STORM_CTRL_MDIPW_DROP_MULTICAST BIT(0) +#define ICE_AQ_STORM_CTRL_MDICW_DROP_MULTICAST BIT(1) +#define ICE_AQ_STORM_CTRL_BDIPW_DROP_MULTICAST BIT(2) +#define ICE_AQ_STORM_CTRL_BDICW_DROP_MULTICAST BIT(3) + /* Bit 7:5 : Reserved */ + /* Bit 27:8 : Interval - BSC/MSC Time-interval specification: The + * interval size for applying ingress broadcast or multicast storm + * control. + */ +#define ICE_AQ_STORM_BSC_MSC_TIME_INTERVAL_S 8 +#define ICE_AQ_STORM_BSC_MSC_TIME_INTERVAL_M \ + (0xFFFFF << ICE_AQ_STORM_BSC_MSC_TIME_INTERVAL_S) + __le32 reserved; +}; + + #define ICE_MAX_NUM_RECIPES 64 +/* Add/Get Recipe (indirect 0x0290/0x0292) */ +struct ice_aqc_add_get_recipe { + __le16 num_sub_recipes; /* Input in Add cmd, Output in Get cmd */ + __le16 return_index; /* Input, used for Get cmd only */ + u8 reserved[4]; + __le32 addr_high; + __le32 addr_low; +}; + + +struct ice_aqc_recipe_content { + u8 rid; +#define ICE_AQ_RECIPE_ID_S 0 +#define ICE_AQ_RECIPE_ID_M (0x3F << ICE_AQ_RECIPE_ID_S) +#define ICE_AQ_RECIPE_ID_IS_ROOT BIT(7) +#define ICE_AQ_SW_ID_LKUP_IDX 0 + u8 lkup_indx[5]; +#define ICE_AQ_RECIPE_LKUP_DATA_S 0 +#define ICE_AQ_RECIPE_LKUP_DATA_M (0x3F << ICE_AQ_RECIPE_LKUP_DATA_S) +#define ICE_AQ_RECIPE_LKUP_IGNORE BIT(7) +#define ICE_AQ_SW_ID_LKUP_MASK 0x00FF + __le16 mask[5]; + u8 result_indx; +#define ICE_AQ_RECIPE_RESULT_DATA_S 0 +#define ICE_AQ_RECIPE_RESULT_DATA_M (0x3F << ICE_AQ_RECIPE_RESULT_DATA_S) +#define ICE_AQ_RECIPE_RESULT_EN BIT(7) + u8 rsvd0[3]; + u8 act_ctrl_join_priority; + u8 act_ctrl_fwd_priority; +#define ICE_AQ_RECIPE_FWD_PRIORITY_S 0 +#define ICE_AQ_RECIPE_FWD_PRIORITY_M (0xF << ICE_AQ_RECIPE_FWD_PRIORITY_S) + u8 act_ctrl; +#define ICE_AQ_RECIPE_ACT_NEED_PASS_L2 BIT(0) +#define ICE_AQ_RECIPE_ACT_ALLOW_PASS_L2 BIT(1) +#define ICE_AQ_RECIPE_ACT_INV_ACT BIT(2) +#define ICE_AQ_RECIPE_ACT_PRUNE_INDX_S 4 +#define ICE_AQ_RECIPE_ACT_PRUNE_INDX_M (0x3 << ICE_AQ_RECIPE_ACT_PRUNE_INDX_S) + u8 rsvd1; + __le32 dflt_act; +#define ICE_AQ_RECIPE_DFLT_ACT_S 0 +#define ICE_AQ_RECIPE_DFLT_ACT_M (0x7FFFF << ICE_AQ_RECIPE_DFLT_ACT_S) +#define ICE_AQ_RECIPE_DFLT_ACT_VALID BIT(31) +}; + + +struct ice_aqc_recipe_data_elem { + u8 recipe_indx; + u8 resp_bits; +#define ICE_AQ_RECIPE_WAS_UPDATED BIT(0) + u8 rsvd0[2]; + u8 recipe_bitmap[8]; + u8 rsvd1[4]; + struct ice_aqc_recipe_content content; + u8 rsvd2[20]; +}; + + +/* Set/Get Recipes to Profile Association (direct 0x0291/0x0293) */ +struct ice_aqc_recipe_to_profile { + __le16 profile_id; + u8 rsvd[6]; + DECLARE_BITMAP(recipe_assoc, ICE_MAX_NUM_RECIPES); +}; + + /* Add/Update/Remove/Get 
switch rules (indirect 0x02A0, 0x02A1, 0x02A2, 0x02A3) */ struct ice_aqc_sw_rules { @@ -472,6 +873,7 @@ struct ice_aqc_sw_rules { __le32 addr_low; }; + /* Add/Update/Get/Remove lookup Rx/Tx command/response entry * This structures describes the lookup rules and associated actions. "index" * is returned as part of a response to a successful Add command, and can be @@ -534,7 +936,7 @@ struct ice_sw_rule_lkup_rx_tx { #define ICE_SINGLE_ACT_OTHER_ACTS 0x3 #define ICE_SINGLE_OTHER_ACT_IDENTIFIER_S 17 #define ICE_SINGLE_OTHER_ACT_IDENTIFIER_M \ - (0x3 << \ ICE_SINGLE_OTHER_ACT_IDENTIFIER_S) + (0x3 << ICE_SINGLE_OTHER_ACT_IDENTIFIER_S) /* Bit 17:18 - Defines other actions */ /* Other action = 0 - Mirror VSI */ @@ -554,8 +956,9 @@ struct ice_sw_rule_lkup_rx_tx { * lookup-type */ __le16 hdr_len; - u8 hdr[1]; -} __packed; + u8 hdr[]; +}; + /* Add/Update/Remove large action command/response entry * "index" is returned as part of a response to a successful Add command, and @@ -564,7 +967,6 @@ struct ice_sw_rule_lkup_rx_tx { struct ice_sw_rule_lg_act { __le16 index; /* Index in large action table */ __le16 size; - __le32 act[1]; /* array of size for actions */ /* Max number of large actions */ #define ICE_MAX_LG_ACT 4 /* Bit 0:1 - Action type */ @@ -615,8 +1017,10 @@ struct ice_sw_rule_lg_act { #define ICE_LG_ACT_STAT_COUNT 0x7 #define ICE_LG_ACT_STAT_COUNT_S 3 #define ICE_LG_ACT_STAT_COUNT_M (0x7F << ICE_LG_ACT_STAT_COUNT_S) + __le32 act[]; /* array of size for actions */ }; + /* Add/Update/Remove VSI list command/response entry * "index" is returned as part of a response to a successful Add command, and * can be used to identify the VSI list for Update/Get/Remove commands. @@ -624,15 +1028,17 @@ struct ice_sw_rule_lg_act { struct ice_sw_rule_vsi_list { __le16 index; /* Index of VSI/Prune list */ __le16 number_vsi; - __le16 vsi[1]; /* Array of number_vsi VSI numbers */ + __le16 vsi[]; /* Array of number_vsi VSI numbers */ }; + /* Query VSI list command/response entry */ struct ice_sw_rule_vsi_list_query { __le16 index; DECLARE_BITMAP(vsi_list, ICE_MAX_VSI); } __packed; + /* Add switch rule response: * Content of return buffer is same as the input buffer. 
The status field and * LUT index are updated as part of the response @@ -655,6 +1061,47 @@ struct ice_aqc_sw_rules_elem { } __packed pdata; }; + + +/* PFC Ignore (direct 0x0301) + * The command and response use the same descriptor structure + */ +struct ice_aqc_pfc_ignore { + u8 tc_bitmap; + u8 cmd_flags; /* unused in response */ +#define ICE_AQC_PFC_IGNORE_SET BIT(7) +#define ICE_AQC_PFC_IGNORE_CLEAR 0 + u8 reserved[14]; +}; + + +/* Set PFC Mode (direct 0x0303) + * Query PFC Mode (direct 0x0302) + */ +struct ice_aqc_set_query_pfc_mode { + u8 pfc_mode; +/* For Set Command response, reserved in all other cases */ +#define ICE_AQC_PFC_NOT_CONFIGURED 0 +/* For Query Command response, reserved in all other cases */ +#define ICE_AQC_DCB_DIS 0 +#define ICE_AQC_PFC_VLAN_BASED_PFC 1 +#define ICE_AQC_PFC_DSCP_BASED_PFC 2 + u8 rsvd[15]; +}; + + +/* Set DCB Parameters (direct 0x0306) */ +struct ice_aqc_set_dcb_params { + u8 cmd_flags; /* unused in response */ +#define ICE_AQC_LINK_UP_DCB_CFG BIT(0) +#define ICE_AQC_PERSIST_DCB_CFG BIT(1) + u8 valid_flags; /* unused in response */ +#define ICE_AQC_LINK_UP_DCB_CFG_VALID BIT(0) +#define ICE_AQC_PERSIST_DCB_CFG_VALID BIT(1) + u8 rsvd[14]; +}; + + /* Get Default Topology (indirect 0x0400) */ struct ice_aqc_get_topo { u8 port_num; @@ -665,6 +1112,7 @@ struct ice_aqc_get_topo { __le32 addr_low; }; + /* Update TSE (indirect 0x0403) * Get TSE (indirect 0x0404) * Add TSE (indirect 0x0401) @@ -681,19 +1129,29 @@ struct ice_aqc_sched_elem_cmd { __le32 addr_low; }; -/* This is the buffer for: - * Suspend Nodes (indirect 0x0409) - * Resume Nodes (indirect 0x040A) - */ -struct ice_aqc_suspend_resume_elem { - __le32 teid[1]; + + +struct ice_aqc_txsched_move_grp_info_hdr { + __le32 src_parent_teid; + __le32 dest_parent_teid; + __le16 num_elems; + u8 flags; + u8 reserved; +}; + + +struct ice_aqc_move_elem { + struct ice_aqc_txsched_move_grp_info_hdr hdr; + __le32 teid[]; }; + struct ice_aqc_elem_info_bw { __le16 bw_profile_idx; __le16 bw_alloc; }; + struct ice_aqc_txsched_elem { u8 elem_type; /* Special field, reserved for some aq calls */ #define ICE_AQC_ELEM_TYPE_UNDEFINED 0x0 @@ -725,26 +1183,27 @@ struct ice_aqc_txsched_elem { __le16 reserved2; }; + struct ice_aqc_txsched_elem_data { __le32 parent_teid; __le32 node_teid; struct ice_aqc_txsched_elem data; }; + struct ice_aqc_txsched_topo_grp_info_hdr { __le32 parent_teid; __le16 num_elems; __le16 reserved2; }; + struct ice_aqc_add_elem { struct ice_aqc_txsched_topo_grp_info_hdr hdr; - struct ice_aqc_txsched_elem_data generic[1]; + struct ice_aqc_txsched_elem_data generic[]; }; -struct ice_aqc_get_elem { - struct ice_aqc_txsched_elem_data generic[1]; -}; + struct ice_aqc_get_topo_elem { struct ice_aqc_txsched_topo_grp_info_hdr hdr; @@ -752,11 +1211,13 @@ struct ice_aqc_get_topo_elem { generic[ICE_AQC_TOPO_MAX_LEVEL_NUM]; }; + struct ice_aqc_delete_elem { struct ice_aqc_txsched_topo_grp_info_hdr hdr; - __le32 teid[1]; + __le32 teid[]; }; + /* Query Port ETS (indirect 0x040E) * * This indirect command is used to query port TC node configuration. 
@@ -768,6 +1229,7 @@ struct ice_aqc_query_port_ets { __le32 addr_low; }; + struct ice_aqc_port_ets_elem { u8 tc_valid_bits; u8 reserved[3]; @@ -783,6 +1245,63 @@ struct ice_aqc_port_ets_elem { __le32 tc_node_teid[8]; /* Used for response, reserved in command */ }; + +/* Rate limiting profile for + * Add RL profile (indirect 0x0410) + * Query RL profile (indirect 0x0411) + * Remove RL profile (indirect 0x0415) + * These indirect commands acts on single or multiple + * RL profiles with specified data. + */ +struct ice_aqc_rl_profile { + __le16 num_profiles; + __le16 num_processed; /* Only for response. Reserved in Command. */ + u8 reserved[4]; + __le32 addr_high; + __le32 addr_low; +}; + + +struct ice_aqc_rl_profile_elem { + u8 level; + u8 flags; +#define ICE_AQC_RL_PROFILE_TYPE_S 0x0 +#define ICE_AQC_RL_PROFILE_TYPE_M (0x3 << ICE_AQC_RL_PROFILE_TYPE_S) +#define ICE_AQC_RL_PROFILE_TYPE_CIR 0 +#define ICE_AQC_RL_PROFILE_TYPE_EIR 1 +#define ICE_AQC_RL_PROFILE_TYPE_SRL 2 +/* The following flag is used for Query RL Profile Data */ +#define ICE_AQC_RL_PROFILE_INVAL_S 0x7 +#define ICE_AQC_RL_PROFILE_INVAL_M (0x1 << ICE_AQC_RL_PROFILE_INVAL_S) + + __le16 profile_id; + __le16 max_burst_size; + __le16 rl_multiply; + __le16 wake_up_calc; + __le16 rl_encode; +}; + + + +/* Configure L2 Node CGD (indirect 0x0414) + * This indirect command allows configuring a congestion domain for given L2 + * node TEIDs in the scheduler topology. + */ +struct ice_aqc_cfg_l2_node_cgd { + __le16 num_l2_nodes; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + + +struct ice_aqc_cfg_l2_node_cgd_elem { + __le32 node_teid; + u8 cgd; + u8 reserved[3]; +}; + + /* Query Scheduler Resource Allocation (indirect 0x0412) * This indirect command retrieves the scheduler resources allocated by * EMP Firmware to the given PF. @@ -793,6 +1312,7 @@ struct ice_aqc_query_txsched_res { __le32 addr_low; }; + struct ice_aqc_generic_sched_props { __le16 phys_levels; __le16 logical_levels; @@ -804,6 +1324,7 @@ struct ice_aqc_generic_sched_props { u8 rsvd1[22]; }; + struct ice_aqc_layer_props { u8 logical_layer; u8 chunk_size; @@ -817,11 +1338,24 @@ struct ice_aqc_layer_props { u8 rsvd1[14]; }; + struct ice_aqc_query_txsched_res_resp { struct ice_aqc_generic_sched_props sched_props; struct ice_aqc_layer_props layer_props[ICE_AQC_TOPO_MAX_LEVEL_NUM]; }; + +/* Query Node to Root Topology (indirect 0x0413) + * This command uses ice_aqc_get_elem as its data buffer. 
+ */ +struct ice_aqc_query_node_to_root { + __le32 teid; + __le32 num_nodes; /* Response only */ + __le32 addr_high; + __le32 addr_low; +}; + + /* Get PHY capabilities (indirect 0x0600) */ struct ice_aqc_get_phy_caps { u8 lport_num; @@ -829,21 +1363,24 @@ struct ice_aqc_get_phy_caps { __le16 param0; /* 18.0 - Report qualified modules */ #define ICE_AQC_GET_PHY_RQM BIT(0) - /* 18.1 - 18.2 : Report mode - * 00b - Report NVM capabilities - * 01b - Report topology capabilities - * 10b - Report SW configured + /* 18.1 - 18.3 : Report mode + * 000b - Report NVM capabilities + * 001b - Report topology capabilities + * 010b - Report SW configured + * 100b - Report default capabilities */ -#define ICE_AQC_REPORT_MODE_S 1 -#define ICE_AQC_REPORT_MODE_M (3 << ICE_AQC_REPORT_MODE_S) -#define ICE_AQC_REPORT_NVM_CAP 0 -#define ICE_AQC_REPORT_TOPO_CAP BIT(1) -#define ICE_AQC_REPORT_SW_CFG BIT(2) +#define ICE_AQC_REPORT_MODE_S 1 +#define ICE_AQC_REPORT_MODE_M (7 << ICE_AQC_REPORT_MODE_S) +#define ICE_AQC_REPORT_TOPO_CAP_NO_MEDIA 0 +#define ICE_AQC_REPORT_TOPO_CAP_MEDIA BIT(1) +#define ICE_AQC_REPORT_ACTIVE_CFG BIT(2) +#define ICE_AQC_REPORT_DFLT_CFG BIT(3) __le32 reserved1; __le32 addr_high; __le32 addr_low; }; + /* This is #define of PHY type (Extended): * The first set of defines is for phy_type_low. */ @@ -918,7 +1455,7 @@ struct ice_aqc_get_phy_caps { #define ICE_PHY_TYPE_HIGH_100G_CAUI2 BIT_ULL(2) #define ICE_PHY_TYPE_HIGH_100G_AUI2_AOC_ACC BIT_ULL(3) #define ICE_PHY_TYPE_HIGH_100G_AUI2 BIT_ULL(4) -#define ICE_PHY_TYPE_HIGH_MAX_INDEX 19 +#define ICE_PHY_TYPE_HIGH_MAX_INDEX 5 struct ice_aqc_get_phy_caps_data { __le64 phy_type_low; /* Use values from ICE_PHY_TYPE_LOW_* */ @@ -929,11 +1466,15 @@ struct ice_aqc_get_phy_caps_data { #define ICE_AQC_PHY_LOW_POWER_MODE BIT(2) #define ICE_AQC_PHY_EN_LINK BIT(3) #define ICE_AQC_PHY_AN_MODE BIT(4) -#define ICE_AQC_GET_PHY_EN_MOD_QUAL BIT(5) +#define ICE_AQC_PHY_EN_MOD_QUAL BIT(5) +#define ICE_AQC_PHY_EN_LESM BIT(6) #define ICE_AQC_PHY_EN_AUTO_FEC BIT(7) #define ICE_AQC_PHY_CAPS_MASK ICE_M(0xff, 0) - u8 low_power_ctrl; + u8 low_power_ctrl_an; #define ICE_AQC_PHY_EN_D3COLD_LOW_POWER_AUTONEG BIT(0) +#define ICE_AQC_PHY_AN_EN_CLAUSE28 BIT(1) +#define ICE_AQC_PHY_AN_EN_CLAUSE73 BIT(2) +#define ICE_AQC_PHY_AN_EN_CLAUSE37 BIT(3) __le16 eee_cap; #define ICE_AQC_PHY_EEE_EN_100BASE_TX BIT(0) #define ICE_AQC_PHY_EEE_EN_1000BASE_T BIT(1) @@ -942,6 +1483,10 @@ struct ice_aqc_get_phy_caps_data { #define ICE_AQC_PHY_EEE_EN_10GBASE_KR BIT(4) #define ICE_AQC_PHY_EEE_EN_25GBASE_KR BIT(5) #define ICE_AQC_PHY_EEE_EN_40GBASE_KR4 BIT(6) +#define ICE_AQC_PHY_EEE_EN_50GBASE_KR2 BIT(7) +#define ICE_AQC_PHY_EEE_EN_50GBASE_KR_PAM4 BIT(8) +#define ICE_AQC_PHY_EEE_EN_100GBASE_KR4 BIT(9) +#define ICE_AQC_PHY_EEE_EN_100GBASE_KR2_PAM4 BIT(10) __le16 eeer_value; u8 phy_id_oui[4]; /* PHY/Module ID connected on the port */ u8 phy_fw_ver[8]; @@ -954,12 +1499,14 @@ struct ice_aqc_get_phy_caps_data { #define ICE_AQC_PHY_FEC_25G_RS_CLAUSE91_EN BIT(6) #define ICE_AQC_PHY_FEC_25G_KR_CLAUSE74_EN BIT(7) #define ICE_AQC_PHY_FEC_MASK ICE_M(0xdf, 0) - u8 rsvd1; /* Byte 35 reserved */ + u8 module_compliance_enforcement; +#define ICE_AQC_MOD_ENFORCE_STRICT_MODE BIT(0) u8 extended_compliance_code; #define ICE_MODULE_TYPE_TOTAL_BYTE 3 u8 module_type[ICE_MODULE_TYPE_TOTAL_BYTE]; #define ICE_AQC_MOD_TYPE_BYTE0_SFP_PLUS 0xA0 #define ICE_AQC_MOD_TYPE_BYTE0_QSFP_PLUS 0x80 +#define ICE_AQC_MOD_TYPE_IDENT 1 #define ICE_AQC_MOD_TYPE_BYTE1_SFP_PLUS_CU_PASSIVE BIT(0) #define ICE_AQC_MOD_TYPE_BYTE1_SFP_PLUS_CU_ACTIVE 
BIT(1) #define ICE_AQC_MOD_TYPE_BYTE1_10G_BASE_SR BIT(4) @@ -980,6 +1527,7 @@ struct ice_aqc_get_phy_caps_data { } qual_modules[ICE_AQC_QUAL_MOD_COUNT_MAX]; }; + /* Set PHY capabilities (direct 0x0601) * NOTE: This command must be followed by setup link and restart auto-neg */ @@ -990,6 +1538,7 @@ struct ice_aqc_set_phy_cfg { __le32 addr_low; }; + /* Set PHY config command data structure */ struct ice_aqc_set_phy_cfg_data { __le64 phy_type_low; /* Use values from ICE_PHY_TYPE_LOW_* */ @@ -1003,13 +1552,34 @@ struct ice_aqc_set_phy_cfg_data { #define ICE_AQ_PHY_ENA_AUTO_LINK_UPDT BIT(5) #define ICE_AQ_PHY_ENA_LESM BIT(6) #define ICE_AQ_PHY_ENA_AUTO_FEC BIT(7) - u8 low_power_ctrl; + u8 low_power_ctrl_an; __le16 eee_cap; /* Value from ice_aqc_get_phy_caps */ __le16 eeer_value; u8 link_fec_opt; /* Use defines from ice_aqc_get_phy_caps */ - u8 rsvd1; + u8 module_compliance_enforcement; }; + +/* Set MAC Config command data structure (direct 0x0603) */ +struct ice_aqc_set_mac_cfg { + __le16 max_frame_size; + u8 params; +#define ICE_AQ_SET_MAC_PACE_S 3 +#define ICE_AQ_SET_MAC_PACE_M (0xF << ICE_AQ_SET_MAC_PACE_S) +#define ICE_AQ_SET_MAC_PACE_TYPE_M BIT(7) +#define ICE_AQ_SET_MAC_PACE_TYPE_RATE 0 +#define ICE_AQ_SET_MAC_PACE_TYPE_FIXED ICE_AQ_SET_MAC_PACE_TYPE_M + u8 tx_tmr_priority; + __le16 tx_tmr_value; + __le16 fc_refresh_threshold; + u8 drop_opts; +#define ICE_AQ_SET_MAC_AUTO_DROP_MASK BIT(0) +#define ICE_AQ_SET_MAC_AUTO_DROP_NONE 0 +#define ICE_AQ_SET_MAC_AUTO_DROP_BLOCKING_PKTS BIT(0) + u8 reserved[7]; +}; + + /* Restart AN command data structure (direct 0x0605) * Also used for response, with only the lport_num field present. */ @@ -1022,6 +1592,7 @@ struct ice_aqc_restart_an { u8 reserved2[13]; }; + /* Get link status (indirect 0x0607), also used for Link Status Event */ struct ice_aqc_get_link_status { u8 lport_num; @@ -1038,13 +1609,25 @@ struct ice_aqc_get_link_status { __le32 addr_low; }; + /* Get link status response data structure, also used for Link Status Event */ struct ice_aqc_get_link_status_data { u8 topo_media_conflict; #define ICE_AQ_LINK_TOPO_CONFLICT BIT(0) #define ICE_AQ_LINK_MEDIA_CONFLICT BIT(1) #define ICE_AQ_LINK_TOPO_CORRUPT BIT(2) - u8 reserved1; +#define ICE_AQ_LINK_TOPO_UNREACH_PRT BIT(4) +#define ICE_AQ_LINK_TOPO_UNDRUTIL_PRT BIT(5) +#define ICE_AQ_LINK_TOPO_UNDRUTIL_MEDIA BIT(6) +#define ICE_AQ_LINK_TOPO_UNSUPP_MEDIA BIT(7) + u8 link_cfg_err; +#define ICE_AQ_LINK_CFG_ERR BIT(0) +#define ICE_AQ_LINK_ACT_PORT_OPT_INVAL BIT(2) +#define ICE_AQ_LINK_FEAT_ID_OR_CONFIG_ID_INVAL BIT(3) +#define ICE_AQ_LINK_TOPO_CRITICAL_SDP_ERR BIT(4) +#define ICE_AQ_LINK_MODULE_POWER_UNSUPPORTED BIT(5) +#define ICE_AQ_LINK_EXTERNAL_PHY_LOAD_FAILURE BIT(6) +#define ICE_AQ_LINK_INVAL_MAX_POWER_LIMIT BIT(7) u8 link_info; #define ICE_AQ_LINK_UP BIT(0) /* Link Status */ #define ICE_AQ_LINK_FAULT BIT(1) @@ -1072,7 +1655,12 @@ struct ice_aqc_get_link_status_data { #define ICE_AQ_LINK_TX_ACTIVE 0 #define ICE_AQ_LINK_TX_DRAINED 1 #define ICE_AQ_LINK_TX_FLUSHED 3 - u8 reserved2; + u8 lb_status; +#define ICE_AQ_LINK_LB_PHY_LCL BIT(0) +#define ICE_AQ_LINK_LB_PHY_RMT BIT(1) +#define ICE_AQ_LINK_LB_MAC_LCL BIT(2) +#define ICE_AQ_LINK_LB_PHY_IDX_S 3 +#define ICE_AQ_LINK_LB_PHY_IDX_M (0x7 << ICE_AQ_LB_PHY_IDX_S) __le16 max_frame_size; u8 cfg; #define ICE_AQ_LINK_25G_KR_FEC_EN BIT(0) @@ -1087,7 +1675,7 @@ struct ice_aqc_get_link_status_data { #define ICE_AQ_CFG_PACING_TYPE_FIXED ICE_AQ_CFG_PACING_TYPE_M /* External Device Power Ability */ u8 power_desc; -#define ICE_AQ_PWR_CLASS_M 0x3 +#define 
ICE_AQ_PWR_CLASS_M 0x3F #define ICE_AQ_LINK_PWR_BASET_LOW_HIGH 0 #define ICE_AQ_LINK_PWR_BASET_HIGH 1 #define ICE_AQ_LINK_PWR_QSFP_CLASS_1 0 @@ -1095,6 +1683,7 @@ struct ice_aqc_get_link_status_data { #define ICE_AQ_LINK_PWR_QSFP_CLASS_3 2 #define ICE_AQ_LINK_PWR_QSFP_CLASS_4 3 __le16 link_speed; +#define ICE_AQ_LINK_SPEED_M 0x7FF #define ICE_AQ_LINK_SPEED_10MB BIT(0) #define ICE_AQ_LINK_SPEED_100MB BIT(1) #define ICE_AQ_LINK_SPEED_1000MB BIT(2) @@ -1112,6 +1701,7 @@ struct ice_aqc_get_link_status_data { __le64 phy_type_high; /* Use values from ICE_PHY_TYPE_HIGH_* */ }; + /* Set event mask command (direct 0x0613) */ struct ice_aqc_set_event_mask { u8 lport_num; @@ -1126,9 +1716,31 @@ struct ice_aqc_set_event_mask { #define ICE_AQ_LINK_EVENT_AN_COMPLETED BIT(7) #define ICE_AQ_LINK_EVENT_MODULE_QUAL_FAIL BIT(8) #define ICE_AQ_LINK_EVENT_PORT_TX_SUSPENDED BIT(9) +#define ICE_AQ_LINK_EVENT_TOPO_CONFLICT BIT(10) +#define ICE_AQ_LINK_EVENT_MEDIA_CONFLICT BIT(11) +#define ICE_AQ_LINK_EVENT_PHY_FW_LOAD_FAIL BIT(12) u8 reserved1[6]; }; + +/* Set PHY Loopback command (direct 0x0619) */ +struct ice_aqc_set_phy_lb { + u8 lport_num; + u8 lport_num_valid; +#define ICE_AQ_PHY_LB_PORT_NUM_VALID BIT(0) + u8 phy_index; + u8 lb_mode; +#define ICE_AQ_PHY_LB_EN BIT(0) +#define ICE_AQ_PHY_LB_TYPE_M BIT(1) +#define ICE_AQ_PHY_LB_TYPE_LOCAL 0 +#define ICE_AQ_PHY_LB_TYPE_REMOTE ICE_AQ_PHY_LB_TYPE_M +#define ICE_AQ_PHY_LB_LEVEL_M BIT(2) +#define ICE_AQ_PHY_LB_LEVEL_PMD 0 +#define ICE_AQ_PHY_LB_LEVEL_PCS ICE_AQ_PHY_LB_LEVEL_M + u8 reserved2[12]; +}; + + /* Set MAC Loopback command (direct 0x0620) */ struct ice_aqc_set_mac_lb { u8 lb_mode; @@ -1137,63 +1749,708 @@ struct ice_aqc_set_mac_lb { u8 reserved[15]; }; -/* Set Port Identification LED (direct, 0x06E9) */ -struct ice_aqc_set_port_id_led { - u8 lport_num; - u8 lport_num_valid; - u8 ident_mode; -#define ICE_AQC_PORT_IDENT_LED_BLINK BIT(0) -#define ICE_AQC_PORT_IDENT_LED_ORIG 0 - u8 rsvd[13]; -}; -/* NVM Read command (indirect 0x0701) - * NVM Erase commands (direct 0x0702) - * NVM Update commands (indirect 0x0703) + + +/* DNL Get Status command (indirect 0x0680) + * Structure used for the response, the command uses the generic + * ice_aqc_generic struct to pass a buffer address to the FW. 
*/ -struct ice_aqc_nvm { - __le16 offset_low; - u8 offset_high; - u8 cmd_flags; -#define ICE_AQC_NVM_LAST_CMD BIT(0) -#define ICE_AQC_NVM_PCIR_REQ BIT(0) /* Used by NVM Update reply */ -#define ICE_AQC_NVM_PRESERVATION_S 1 -#define ICE_AQC_NVM_PRESERVATION_M (3 << ICE_AQC_NVM_PRESERVATION_S) -#define ICE_AQC_NVM_NO_PRESERVATION (0 << ICE_AQC_NVM_PRESERVATION_S) -#define ICE_AQC_NVM_PRESERVE_ALL BIT(1) -#define ICE_AQC_NVM_PRESERVE_SELECTED (3 << ICE_AQC_NVM_PRESERVATION_S) -#define ICE_AQC_NVM_FLASH_ONLY BIT(7) - __le16 module_typeid; - __le16 length; -#define ICE_AQC_NVM_ERASE_LEN 0xFFFF +struct ice_aqc_dnl_get_status { + u8 ctx; + u8 status; +#define ICE_AQ_DNL_STATUS_IDLE 0x0 +#define ICE_AQ_DNL_STATUS_RESERVED 0x1 +#define ICE_AQ_DNL_STATUS_STOPPED 0x2 +#define ICE_AQ_DNL_STATUS_FATAL 0x3 /* Fatal DNL engine error */ +#define ICE_AQ_DNL_SRC_S 3 +#define ICE_AQ_DNL_SRC_M (0x3 << ICE_AQ_DNL_SRC_S) +#define ICE_AQ_DNL_SRC_NVM (0x0 << ICE_AQ_DNL_SRC_S) +#define ICE_AQ_DNL_SRC_NVM_SCRATCH (0x1 << ICE_AQ_DNL_SRC_S) + u8 stack_ptr; +#define ICE_AQ_DNL_ST_PTR_S 0x0 +#define ICE_AQ_DNL_ST_PTR_M (0x7 << ICE_AQ_DNL_ST_PTR_S) + u8 engine_flags; +#define ICE_AQ_DNL_FLAGS_ERROR BIT(2) +#define ICE_AQ_DNL_FLAGS_NEGATIVE BIT(3) +#define ICE_AQ_DNL_FLAGS_OVERFLOW BIT(4) +#define ICE_AQ_DNL_FLAGS_ZERO BIT(5) +#define ICE_AQ_DNL_FLAGS_CARRY BIT(6) +#define ICE_AQ_DNL_FLAGS_JUMP BIT(7) + __le16 pc; + __le16 activity_id; __le32 addr_high; __le32 addr_low; }; -/* NVM Checksum Command (direct, 0x0706) */ -struct ice_aqc_nvm_checksum { - u8 flags; -#define ICE_AQC_NVM_CHECKSUM_VERIFY BIT(0) -#define ICE_AQC_NVM_CHECKSUM_RECALC BIT(1) - u8 rsvd; - __le16 checksum; /* Used only by response */ -#define ICE_AQC_NVM_CHECKSUM_CORRECT 0xBABA - u8 rsvd2[12]; -}; -/** - * Send to PF command (indirect 0x0801) ID is only used by PF - * - * Send to VF command (indirect 0x0802) ID is only used by PF - * +struct ice_aqc_dnl_get_status_data { + __le16 activity_err_code; + __le16 act_err_code; +#define ICE_AQ_DNL_ACT_ERR_SUCCESS 0x0000 /* no error */ +#define ICE_AQ_DNL_ACT_ERR_PARSE 0x8001 /* NVM parse error */ +#define ICE_AQ_DNL_ACT_ERR_UNSUPPORTED 0x8002 /* unsupported action */ +#define ICE_AQ_DNL_ACT_ERR_NOT_FOUND 0x8003 /* activity not found */ +#define ICE_AQ_DNL_ACT_ERR_BAD_JUMP 0x8004 /* an illegal jump */ +#define ICE_AQ_DNL_ACT_ERR_PSTO_OVER 0x8005 /* persistent store overflow */ +#define ICE_AQ_DNL_ACT_ERR_ST_OVERFLOW 0x8006 /* stack overflow */ +#define ICE_AQ_DNL_ACT_ERR_TIMEOUT 0x8007 /* activity timeout */ +#define ICE_AQ_DNL_ACT_ERR_BREAK 0x0008 /* stopped at breakpoint */ +#define ICE_AQ_DNL_ACT_ERR_INVAL_ARG 0x0101 /* invalid action argument */ + __le32 execution_time; /* in nanoseconds */ + __le16 lib_ver; + u8 psto_local_sz; + u8 psto_global_sz; + u8 stack_sz; +#define ICE_AQ_DNL_STACK_SZ_S 0 +#define ICE_AQ_DNL_STACK_SZ_M (0xF << ICE_AQ_DNL_STACK_SZ_S) + u8 port_count; +#define ICE_AQ_DNL_PORT_CNT_S 0 +#define ICE_AQ_DNL_PORT_CNT_M (0x1F << ICE_AQ_DNL_PORT_CNT_S) + __le16 act_cache_cntr; + u32 i2c_clk_cntr; + u32 mdio_clk_cntr; + u32 sb_iosf_clk_cntr; +}; + + +/* DNL run command (direct 0x0681) */ +struct ice_aqc_dnl_run_command { + u8 reserved0; + u8 command; +#define ICE_AQ_DNL_CMD_S 0 +#define ICE_AQ_DNL_CMD_M (0x7 << ICE_AQ_DNL_CMD_S) +#define ICE_AQ_DNL_CMD_RESET 0x0 +#define ICE_AQ_DNL_CMD_RUN 0x1 +#define ICE_AQ_DNL_CMD_STEP 0x3 +#define ICE_AQ_DNL_CMD_ABORT 0x4 +#define ICE_AQ_DNL_CMD_SET_PC 0x7 +#define ICE_AQ_DNL_CMD_SRC_S 3 +#define ICE_AQ_DNL_CMD_SRC_M (0x3 << ICE_AQ_DNL_CMD_SRC_S) +#define 
ICE_AQ_DNL_CMD_SRC_DNL 0x0 +#define ICE_AQ_DNL_CMD_SRC_SCRATCH 0x1 + __le16 new_pc; + u8 reserved1[12]; +}; + + +/* DNL call command (indirect 0x0682) + * Struct is used for both command and response */ -struct ice_aqc_pf_vf_msg { - __le32 id; - u32 reserved; +struct ice_aqc_dnl_call_command { + u8 ctx; /* Used in command, reserved in response */ + u8 reserved; + __le16 activity_id; + __le32 reserved1; __le32 addr_high; __le32 addr_low; }; + +/* DNL call command/response buffer (indirect 0x0682) */ +struct ice_aqc_dnl_call { + __le32 stores[4]; +}; + + +/* Used for both commands: + * DNL read sto command (indirect 0x0683) + * DNL write sto command (indirect 0x0684) + */ +struct ice_aqc_dnl_read_write_command { + u8 ctx; + u8 sto_sel; /* STORE select */ +#define ICE_AQC_DNL_STORE_SELECT_STORE 0x0 +#define ICE_AQC_DNL_STORE_SELECT_PSTO 0x1 +#define ICE_AQC_DNL_STORE_SELECT_STACK 0x2 + __le16 offset; + __le32 data; /* Used for write sto only */ + __le32 addr_high; /* Used for read sto only */ + __le32 addr_low; /* Used for read sto only */ +}; + + +/* Used for both command responses: + * DNL read sto response (indirect 0x0683) + * DNL write sto response (indirect 0x0684) + */ +struct ice_aqc_dnl_read_write_response { + u8 reserved; + u8 status; /* Reserved for read command */ + __le16 size; /* Reserved for write command */ + __le32 data; /* Reserved for write command */ + __le32 addr_high; /* Reserved for write command */ + __le32 addr_low; /* Reserved for write command */ +}; + + +/* DNL set breakpoints command (indirect 0x0686) */ +struct ice_aqc_dnl_set_breakpoints_command { + __le32 reserved[2]; + __le32 addr_high; + __le32 addr_low; +}; + + +/* DNL set breakpoints data buffer structure (indirect 0x0686) */ +struct ice_aqc_dnl_set_breakpoints { + u8 ctx; + u8 ena; /* 0- disabled, 1- enabled */ + __le16 offset; + __le16 activity_id; +}; + + +/* DNL read log data command(indirect 0x0687) */ +struct ice_aqc_dnl_read_log_command { + __le16 reserved0; + __le16 offset; + __le32 reserved1; + __le32 addr_high; + __le32 addr_low; + +}; + + +/* DNL read log data response(indirect 0x0687) */ +struct ice_aqc_dnl_read_log_response { + __le16 reserved; + __le16 size; + __le32 data; + __le32 addr_high; + __le32 addr_low; + +}; + + +struct ice_aqc_link_topo_params { + u8 lport_num; + u8 lport_num_valid; +#define ICE_AQC_LINK_TOPO_PORT_NUM_VALID BIT(0) + u8 node_type_ctx; +#define ICE_AQC_LINK_TOPO_NODE_TYPE_S 0 +#define ICE_AQC_LINK_TOPO_NODE_TYPE_M (0xF << ICE_AQC_LINK_TOPO_NODE_TYPE_S) +#define ICE_AQC_LINK_TOPO_NODE_TYPE_PHY 0 +#define ICE_AQC_LINK_TOPO_NODE_TYPE_GPIO_CTRL 1 +#define ICE_AQC_LINK_TOPO_NODE_TYPE_MUX_CTRL 2 +#define ICE_AQC_LINK_TOPO_NODE_TYPE_LED_CTRL 3 +#define ICE_AQC_LINK_TOPO_NODE_TYPE_LED 4 +#define ICE_AQC_LINK_TOPO_NODE_TYPE_THERMAL 5 +#define ICE_AQC_LINK_TOPO_NODE_TYPE_CAGE 6 +#define ICE_AQC_LINK_TOPO_NODE_TYPE_MEZZ 7 +#define ICE_AQC_LINK_TOPO_NODE_TYPE_ID_EEPROM 8 +#define ICE_AQC_LINK_TOPO_NODE_CTX_S 4 +#define ICE_AQC_LINK_TOPO_NODE_CTX_M \ + (0xF << ICE_AQC_LINK_TOPO_NODE_CTX_S) +#define ICE_AQC_LINK_TOPO_NODE_CTX_GLOBAL 0 +#define ICE_AQC_LINK_TOPO_NODE_CTX_BOARD 1 +#define ICE_AQC_LINK_TOPO_NODE_CTX_PORT 2 +#define ICE_AQC_LINK_TOPO_NODE_CTX_NODE 3 +#define ICE_AQC_LINK_TOPO_NODE_CTX_PROVIDED 4 +#define ICE_AQC_LINK_TOPO_NODE_CTX_OVERRIDE 5 + u8 index; +}; + + +struct ice_aqc_link_topo_addr { + struct ice_aqc_link_topo_params topo_params; + __le16 handle; +#define ICE_AQC_LINK_TOPO_HANDLE_S 0 +#define ICE_AQC_LINK_TOPO_HANDLE_M (0x3FF << ICE_AQC_LINK_TOPO_HANDLE_S) +/* 
Used to decode the handle field */ +#define ICE_AQC_LINK_TOPO_HANDLE_BRD_TYPE_M BIT(9) +#define ICE_AQC_LINK_TOPO_HANDLE_BRD_TYPE_LOM BIT(9) +#define ICE_AQC_LINK_TOPO_HANDLE_BRD_TYPE_MEZZ 0 +#define ICE_AQC_LINK_TOPO_HANDLE_NODE_S 0 +/* In case of a Mezzanine type */ +#define ICE_AQC_LINK_TOPO_HANDLE_MEZZ_NODE_M \ + (0x3F << ICE_AQC_LINK_TOPO_HANDLE_NODE_S) +#define ICE_AQC_LINK_TOPO_HANDLE_MEZZ_S 6 +#define ICE_AQC_LINK_TOPO_HANDLE_MEZZ_M (0x7 << ICE_AQC_LINK_TOPO_HANDLE_MEZZ_S) +/* In case of a LOM type */ +#define ICE_AQC_LINK_TOPO_HANDLE_LOM_NODE_M \ + (0x1FF << ICE_AQC_LINK_TOPO_HANDLE_NODE_S) +}; + + +/* Get Link Topology Handle (direct, 0x06E0) */ +struct ice_aqc_get_link_topo { + struct ice_aqc_link_topo_addr addr; + u8 node_part_num; +#define ICE_ACQ_GET_LINK_TOPO_NODE_NR_PCA9575 0x21 + u8 rsvd[9]; +}; + + +/* Get Link Topology Pin (direct, 0x06E1) */ +struct ice_aqc_get_link_topo_pin { + struct ice_aqc_link_topo_addr addr; + u8 input_io_params; +#define ICE_AQC_LINK_TOPO_INPUT_IO_FUNC_S 0 +#define ICE_AQC_LINK_TOPO_INPUT_IO_FUNC_M \ + (0x1F << ICE_AQC_LINK_TOPO_INPUT_IO_FUNC_S) +#define ICE_AQC_LINK_TOPO_IO_FUNC_GPIO 0 +#define ICE_AQC_LINK_TOPO_IO_FUNC_RESET_N 1 +#define ICE_AQC_LINK_TOPO_IO_FUNC_INT_N 2 +#define ICE_AQC_LINK_TOPO_IO_FUNC_PRESENT_N 3 +#define ICE_AQC_LINK_TOPO_IO_FUNC_TX_DIS 4 +#define ICE_AQC_LINK_TOPO_IO_FUNC_MODSEL_N 5 +#define ICE_AQC_LINK_TOPO_IO_FUNC_LPMODE 6 +#define ICE_AQC_LINK_TOPO_IO_FUNC_TX_FAULT 7 +#define ICE_AQC_LINK_TOPO_IO_FUNC_RX_LOSS 8 +#define ICE_AQC_LINK_TOPO_IO_FUNC_RS0 9 +#define ICE_AQC_LINK_TOPO_IO_FUNC_RS1 10 +#define ICE_AQC_LINK_TOPO_IO_FUNC_EEPROM_WP 11 +/* 12 repeats intentionally due to two different uses depending on context */ +#define ICE_AQC_LINK_TOPO_IO_FUNC_LED 12 +#define ICE_AQC_LINK_TOPO_IO_FUNC_RED_LED 12 +#define ICE_AQC_LINK_TOPO_IO_FUNC_GREEN_LED 13 +#define ICE_AQC_LINK_TOPO_IO_FUNC_BLUE_LED 14 +#define ICE_AQC_LINK_TOPO_INPUT_IO_TYPE_S 5 +#define ICE_AQC_LINK_TOPO_INPUT_IO_TYPE_M \ + (0x7 << ICE_AQC_LINK_TOPO_INPUT_IO_TYPE_S) +#define ICE_AQC_LINK_TOPO_INPUT_IO_TYPE_GPIO 3 +/* Use ICE_AQC_LINK_TOPO_NODE_TYPE_* for the type values */ + u8 output_io_params; +#define ICE_AQC_LINK_TOPO_OUTPUT_IO_FUNC_S 0 +#define ICE_AQC_LINK_TOPO_OUTPUT_IO_FUNC_M \ + (0x1F << \ ICE_AQC_LINK_TOPO_INPUT_IO_FUNC_NUM_S) +/* Use ICE_AQC_LINK_TOPO_IO_FUNC_* for the non-numerical options */ +#define ICE_AQC_LINK_TOPO_OUTPUT_IO_TYPE_S 5 +#define ICE_AQC_LINK_TOPO_OUTPUT_IO_TYPE_M \ + (0x7 << ICE_AQC_LINK_TOPO_INPUT_IO_TYPE_S) +/* Use ICE_AQC_LINK_TOPO_NODE_TYPE_* for the type values */ + u8 output_io_flags; +#define ICE_AQC_LINK_TOPO_OUTPUT_SPEED_S 0 +#define ICE_AQC_LINK_TOPO_OUTPUT_SPEED_M \ + (0x7 << ICE_AQC_LINK_TOPO_OUTPUT_SPEED_S) +#define ICE_AQC_LINK_TOPO_OUTPUT_INT_S 3 +#define ICE_AQC_LINK_TOPO_OUTPUT_INT_M \ + (0x3 << ICE_AQC_LINK_TOPO_OUTPUT_INT_S) +#define ICE_AQC_LINK_TOPO_OUTPUT_POLARITY BIT(5) +#define ICE_AQC_LINK_TOPO_OUTPUT_VALUE BIT(6) +#define ICE_AQC_LINK_TOPO_OUTPUT_DRIVEN BIT(7) + u8 rsvd[7]; +}; + +/* Read/Write I2C (direct, 0x06E2/0x06E3) */ +struct ice_aqc_i2c { + struct ice_aqc_link_topo_addr topo_addr; + __le16 i2c_addr; + u8 i2c_params; +#define ICE_AQC_I2C_DATA_SIZE_S 0 +#define ICE_AQC_I2C_DATA_SIZE_M (0xF << ICE_AQC_I2C_DATA_SIZE_S) +#define ICE_AQC_I2C_ADDR_TYPE_M BIT(4) +#define ICE_AQC_I2C_ADDR_TYPE_7BIT 0 +#define ICE_AQC_I2C_ADDR_TYPE_10BIT ICE_AQC_I2C_ADDR_TYPE_M +#define ICE_AQC_I2C_DATA_OFFSET_S 5 +#define ICE_AQC_I2C_DATA_OFFSET_M (0x3 << ICE_AQC_I2C_DATA_OFFSET_S) +#define 
ICE_AQC_I2C_USE_REPEATED_START BIT(7) + u8 rsvd; + __le16 i2c_bus_addr; +#define ICE_AQC_I2C_ADDR_7BIT_MASK 0x7F +#define ICE_AQC_I2C_ADDR_10BIT_MASK 0x3FF + u8 i2c_data[4]; /* Used only by write command, reserved in read. */ +}; + + +/* Read I2C Response (direct, 0x06E2) */ +struct ice_aqc_read_i2c_resp { + u8 i2c_data[16]; +}; + + +/* Read/Write MDIO (direct, 0x06E4/0x06E5) */ +struct ice_aqc_mdio { + struct ice_aqc_link_topo_addr topo_addr; + u8 mdio_device_addr; +#define ICE_AQC_MDIO_DEV_S 0 +#define ICE_AQC_MDIO_DEV_M (0x1F << ICE_AQC_MDIO_DEV_S) +#define ICE_AQC_MDIO_CLAUSE_22 BIT(5) +#define ICE_AQC_MDIO_CLAUSE_45 BIT(6) + u8 mdio_bus_address; +#define ICE_AQC_MDIO_BUS_ADDR_S 0 +#define ICE_AQC_MDIO_BUS_ADDR_M (0x1F << ICE_AQC_MDIO_BUS_ADDR_S) + __le16 offset; + __le16 data; /* Input in write cmd, output in read cmd. */ + u8 rsvd1[4]; +}; + + +/* Set/Get GPIO By Function (direct, 0x06E6/0x06E7) */ +struct ice_aqc_gpio_by_func { + struct ice_aqc_link_topo_addr topo_addr; + u8 io_func_num; +#define ICE_AQC_GPIO_FUNC_S 0 +#define ICE_AQC_GPIO_FUNC_M (0x1F << ICE_AQC_GPIO_IO_FUNC_NUM_S) + u8 io_value; /* Input in write cmd, output in read cmd. */ +#define ICE_AQC_GPIO_ON BIT(0) +#define ICE_AQC_GPIO_OFF 0 + u8 rsvd[8]; +}; + + +/* Set LED (direct, 0x06E8) */ +struct ice_aqc_set_led { + struct ice_aqc_link_topo_addr topo_addr; + u8 color_and_blink; +#define ICE_AQC_LED_COLOR_S 0 +#define ICE_AQC_LED_COLOR_M (0x7 << ICE_AQC_LED_COLOR_S) +#define ICE_AQC_LED_COLOR_SKIP 0 +#define ICE_AQC_LED_COLOR_RED 1 +#define ICE_AQC_LED_COLOR_ORANGE 2 +#define ICE_AQC_LED_COLOR_YELLOW 3 +#define ICE_AQC_LED_COLOR_GREEN 4 +#define ICE_AQC_LED_COLOR_BLUE 5 +#define ICE_AQC_LED_COLOR_PURPLE 6 +#define ICE_AQC_LED_BLINK_S 3 +#define ICE_AQC_LED_BLINK_M (0x7 << ICE_AQC_LED_BLINK_S) +#define ICE_AQC_LED_BLINK_NONE 0 +#define ICE_AQC_LED_BLINK_SLOW 1 +#define ICE_AQC_LED_BLINK_SLOW_MAC 2 +#define ICE_AQC_LED_BLINK_SLOW_FLTR 3 +#define ICE_AQC_LED_BLINK_FAST 5 +#define ICE_AQC_LED_BLINK_FAST_MAC 6 +#define ICE_AQC_LED_BLINK_FAST_FLTR 7 + u8 rsvd[9]; +}; + + +/* Set Port Identification LED (direct, 0x06E9) */ +struct ice_aqc_set_port_id_led { + u8 lport_num; + u8 lport_num_valid; +#define ICE_AQC_PORT_ID_PORT_NUM_VALID BIT(0) + u8 ident_mode; +#define ICE_AQC_PORT_IDENT_LED_BLINK BIT(0) +#define ICE_AQC_PORT_IDENT_LED_ORIG 0 + u8 rsvd[13]; +}; + + +/* Get Port Options (indirect, 0x06EA) */ +struct ice_aqc_get_port_options { + u8 lport_num; + u8 lport_num_valid; +#define ICE_AQC_PORT_OPT_PORT_NUM_VALID BIT(0) + u8 port_options_count; +#define ICE_AQC_PORT_OPT_COUNT_S 0 +#define ICE_AQC_PORT_OPT_COUNT_M (0xF << ICE_AQC_PORT_OPT_COUNT_S) + u8 innermost_phy_index; + u8 port_options; +#define ICE_AQC_PORT_OPT_ACTIVE_S 0 +#define ICE_AQC_PORT_OPT_ACTIVE_M (0xF << ICE_AQC_PORT_OPT_ACTIVE_S) +#define ICE_AQC_PORT_OPT_FORCED BIT(6) +#define ICE_AQC_PORT_OPT_VALID BIT(7) + u8 pending_port_option_status; +#define ICE_AQC_PENDING_PORT_OPT_IDX_S 0 +#define ICE_AQC_PENDING_PORT_OPT_IDX_M (0xF << ICE_AQC_PENDING_PORT_OPT_IDX_S) +#define ICE_AQC_PENDING_PORT_OPT_VALID BIT(7) + u8 rsvd[2]; + __le32 addr_high; + __le32 addr_low; +}; + + +struct ice_aqc_get_port_options_elem { + u8 pmd; +#define ICE_AQC_PORT_INV_PORT_OPT 4 +#define ICE_AQC_PORT_OPT_PMD_COUNT_S 0 +#define ICE_AQC_PORT_OPT_PMD_COUNT_M (0xF << ICE_AQC_PORT_OPT_PMD_COUNT_S) +#define ICE_AQC_PORT_OPT_PMD_WIDTH_S 4 +#define ICE_AQC_PORT_OPT_PMD_WIDTH_M (0xF << ICE_AQC_PORT_OPT_PMD_WIDTH_S) + u8 max_lane_speed; +#define ICE_AQC_PORT_OPT_MAX_LANE_S 0 +#define 
ICE_AQC_PORT_OPT_MAX_LANE_M (0xF << ICE_AQC_PORT_OPT_MAX_LANE_S) +#define ICE_AQC_PORT_OPT_MAX_LANE_100M 0 +#define ICE_AQC_PORT_OPT_MAX_LANE_1G 1 +#define ICE_AQC_PORT_OPT_MAX_LANE_2500M 2 +#define ICE_AQC_PORT_OPT_MAX_LANE_5G 3 +#define ICE_AQC_PORT_OPT_MAX_LANE_10G 4 +#define ICE_AQC_PORT_OPT_MAX_LANE_25G 5 +#define ICE_AQC_PORT_OPT_MAX_LANE_50G 6 +#define ICE_AQC_PORT_OPT_MAX_LANE_100G 7 + u8 global_scid[2]; + u8 phy_scid[2]; + u8 pf2port_cid[2]; +}; + + +/* Set Port Option (direct, 0x06EB) */ +struct ice_aqc_set_port_option { + u8 lport_num; + u8 lport_num_valid; +#define ICE_AQC_SET_PORT_OPT_PORT_NUM_VALID BIT(0) + u8 selected_port_option; + u8 rsvd[13]; +}; + + +/* Set/Get GPIO (direct, 0x06EC/0x06ED) */ +struct ice_aqc_gpio { + __le16 gpio_ctrl_handle; +#define ICE_AQC_GPIO_HANDLE_S 0 +#define ICE_AQC_GPIO_HANDLE_M (0x3FF << ICE_AQC_GPIO_HANDLE_S) + u8 gpio_num; + u8 gpio_val; + u8 rsvd[12]; +}; + + +/* Read/Write SFF EEPROM command (indirect 0x06EE) */ +struct ice_aqc_sff_eeprom { + u8 lport_num; + u8 lport_num_valid; +#define ICE_AQC_SFF_PORT_NUM_VALID BIT(0) + __le16 i2c_bus_addr; +#define ICE_AQC_SFF_I2CBUS_7BIT_M 0x7F +#define ICE_AQC_SFF_I2CBUS_10BIT_M 0x3FF +#define ICE_AQC_SFF_I2CBUS_TYPE_M BIT(10) +#define ICE_AQC_SFF_I2CBUS_TYPE_7BIT 0 +#define ICE_AQC_SFF_I2CBUS_TYPE_10BIT ICE_AQC_SFF_I2CBUS_TYPE_M +#define ICE_AQC_SFF_SET_EEPROM_PAGE_S 11 +#define ICE_AQC_SFF_SET_EEPROM_PAGE_M (0x3 << ICE_AQC_SFF_SET_EEPROM_PAGE_S) +#define ICE_AQC_SFF_NO_PAGE_CHANGE 0 +#define ICE_AQC_SFF_SET_23_ON_MISMATCH 1 +#define ICE_AQC_SFF_SET_22_ON_MISMATCH 2 +#define ICE_AQC_SFF_IS_WRITE BIT(15) + __le16 i2c_mem_addr; + __le16 eeprom_page; +#define ICE_AQC_SFF_EEPROM_BANK_S 0 +#define ICE_AQC_SFF_EEPROM_BANK_M (0xFF << ICE_AQC_SFF_EEPROM_BANK_S) +#define ICE_AQC_SFF_EEPROM_PAGE_S 8 +#define ICE_AQC_SFF_EEPROM_PAGE_M (0xFF << ICE_AQC_SFF_EEPROM_PAGE_S) + __le32 addr_high; + __le32 addr_low; +}; + + +/* SW Set GPIO command (indirect 0x6EF) + * SW Get GPIO command (indirect 0x6F0) + */ +struct ice_aqc_sw_gpio { + __le16 gpio_ctrl_handle; +#define ICE_AQC_SW_GPIO_CONTROLLER_HANDLE_S 0 +#define ICE_AQC_SW_GPIO_CONTROLLER_HANDLE_M (0x3FF << ICE_AQC_SW_GPIO_CONTROLLER_HANDLE_S) + u8 gpio_num; +#define ICE_AQC_SW_GPIO_NUMBER_S 0 +#define ICE_AQC_SW_GPIO_NUMBER_M (0x1F << ICE_AQC_SW_GPIO_NUMBER_S) + u8 gpio_params; +#define ICE_AQC_SW_GPIO_PARAMS_DIRECTION BIT(1) +#define ICE_AQC_SW_GPIO_PARAMS_VALUE BIT(0) + u8 rsvd[12]; +}; + + +/* Program Topology Device NVM (direct, 0x06F2) */ +struct ice_aqc_prog_topo_dev_nvm { + struct ice_aqc_link_topo_params topo_params; + u8 rsvd[12]; +}; + + +/* Read Topology Device NVM (direct, 0x06F3) */ +struct ice_aqc_read_topo_dev_nvm { + struct ice_aqc_link_topo_params topo_params; + __le32 start_address; +#define ICE_AQC_READ_TOPO_DEV_NVM_DATA_READ_SIZE 8 + u8 data_read[ICE_AQC_READ_TOPO_DEV_NVM_DATA_READ_SIZE]; +}; + + +/* NVM Read command (indirect 0x0701) + * NVM Erase commands (direct 0x0702) + * NVM Write commands (indirect 0x0703) + * NVM Write Activate commands (direct 0x0707) + * NVM Shadow RAM Dump commands (direct 0x0707) + */ +struct ice_aqc_nvm { +#define ICE_AQC_NVM_MAX_OFFSET 0xFFFFFF + __le16 offset_low; + u8 offset_high; /* For Write Activate offset_high is used as flags2 */ + u8 cmd_flags; +#define ICE_AQC_NVM_LAST_CMD BIT(0) +#define ICE_AQC_NVM_PCIR_REQ BIT(0) /* Used by NVM Write reply */ +#define ICE_AQC_NVM_PRESERVATION_S 1 /* Used by NVM Write Activate only */ +#define ICE_AQC_NVM_PRESERVATION_M (3 << ICE_AQC_NVM_PRESERVATION_S) +#define 
ICE_AQC_NVM_NO_PRESERVATION (0 << ICE_AQC_NVM_PRESERVATION_S) +#define ICE_AQC_NVM_PRESERVE_ALL BIT(1) +#define ICE_AQC_NVM_FACTORY_DEFAULT (2 << ICE_AQC_NVM_PRESERVATION_S) +#define ICE_AQC_NVM_PRESERVE_SELECTED (3 << ICE_AQC_NVM_PRESERVATION_S) +#define ICE_AQC_NVM_ACTIV_SEL_NVM BIT(3) /* Write Activate/SR Dump only */ +#define ICE_AQC_NVM_ACTIV_SEL_OROM BIT(4) +#define ICE_AQC_NVM_ACTIV_SEL_NETLIST BIT(5) +#define ICE_AQC_NVM_SPECIAL_UPDATE BIT(6) +#define ICE_AQC_NVM_REVERT_LAST_ACTIV BIT(6) /* Write Activate only */ +#define ICE_AQC_NVM_ACTIV_SEL_MASK ICE_M(0x7, 3) +#define ICE_AQC_NVM_FLASH_ONLY BIT(7) +#define ICE_AQC_NVM_RESET_LVL_M ICE_M(0x3, 0) /* Write reply only */ +#define ICE_AQC_NVM_POR_FLAG 0 +#define ICE_AQC_NVM_PERST_FLAG 1 +#define ICE_AQC_NVM_EMPR_FLAG 2 +#define ICE_AQC_NVM_EMPR_ENA BIT(0) /* Write Activate reply only */ + __le16 module_typeid; + __le16 length; +#define ICE_AQC_NVM_ERASE_LEN 0xFFFF + __le32 addr_high; + __le32 addr_low; +}; + +/* NVM Module_Type ID, needed offset and read_len for struct ice_aqc_nvm. */ +#define ICE_AQC_NVM_SECTOR_UNIT 4096 /* In Bytes */ +#define ICE_AQC_NVM_WORD_UNIT 2 /* In Bytes */ + +#define ICE_AQC_NVM_START_POINT 0 +#define ICE_AQC_NVM_EMP_SR_PTR_OFFSET 0x90 +#define ICE_AQC_NVM_EMP_SR_PTR_RD_LEN 2 /* In Bytes */ +#define ICE_AQC_NVM_EMP_SR_PTR_M ICE_M(0x7FFF, 0) +#define ICE_AQC_NVM_EMP_SR_PTR_TYPE_S 15 +#define ICE_AQC_NVM_EMP_SR_PTR_TYPE_M BIT(15) +#define ICE_AQC_NVM_EMP_SR_PTR_TYPE_SECTOR 1 + +#define ICE_AQC_NVM_LLDP_CFG_PTR_OFFSET 0x46 +#define ICE_AQC_NVM_LLDP_CFG_HEADER_LEN 2 /* In Bytes */ +#define ICE_AQC_NVM_LLDP_CFG_PTR_RD_LEN 2 /* In Bytes */ + +#define ICE_AQC_NVM_LLDP_PRESERVED_MOD_ID 0x129 +#define ICE_AQC_NVM_CUR_LLDP_PERSIST_RD_OFFSET 2 /* In Bytes */ +#define ICE_AQC_NVM_LLDP_STATUS_M ICE_M(0xF, 0) +#define ICE_AQC_NVM_LLDP_STATUS_M_LEN 4 /* In Bits */ +#define ICE_AQC_NVM_LLDP_STATUS_RD_LEN 4 /* In Bytes */ + +#define ICE_AQC_NVM_MINSREV_MOD_ID 0x130 + + +/* Used for reading and writing MinSRev using 0x0701 and 0x0703. Note that the + * type field is excluded from the section when reading and writing from + * a module using the module_typeid field with these AQ commands. 
+ */ +struct ice_aqc_nvm_minsrev { + __le16 length; + __le16 validity; +#define ICE_AQC_NVM_MINSREV_NVM_VALID BIT(0) +#define ICE_AQC_NVM_MINSREV_OROM_VALID BIT(1) + __le16 nvm_minsrev_l; + __le16 nvm_minsrev_h; + __le16 orom_minsrev_l; + __le16 orom_minsrev_h; +}; + + +/* Used for 0x0704 as well as for 0x0705 commands */ +struct ice_aqc_nvm_cfg { + u8 cmd_flags; +#define ICE_AQC_ANVM_MULTIPLE_ELEMS BIT(0) +#define ICE_AQC_ANVM_IMMEDIATE_FIELD BIT(1) +#define ICE_AQC_ANVM_NEW_CFG BIT(2) + u8 reserved; + __le16 count; + __le16 id; + u8 reserved1[2]; + __le32 addr_high; + __le32 addr_low; +}; + + +struct ice_aqc_nvm_cfg_data { + __le16 field_id; + __le16 field_options; + __le16 field_value; +}; + + +/* NVM Checksum Command (direct, 0x0706) */ +struct ice_aqc_nvm_checksum { + u8 flags; +#define ICE_AQC_NVM_CHECKSUM_VERIFY BIT(0) +#define ICE_AQC_NVM_CHECKSUM_RECALC BIT(1) + u8 rsvd; + __le16 checksum; /* Used only by response */ +#define ICE_AQC_NVM_CHECKSUM_CORRECT 0xBABA + u8 rsvd2[12]; +}; + + +/* Used for NVM Set Package Data command - 0x070A */ +struct ice_aqc_nvm_pkg_data { + u8 reserved[3]; + u8 cmd_flags; +#define ICE_AQC_NVM_PKG_DELETE BIT(0) /* used for command call */ +#define ICE_AQC_NVM_PKG_SKIPPED BIT(0) /* used for command response */ + + u32 reserved1; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Used for Pass Component Table command - 0x070B */ +struct ice_aqc_nvm_pass_comp_tbl { + u8 component_response; /* Response only */ +#define ICE_AQ_NVM_PASS_COMP_CAN_BE_UPDATED 0x0 +#define ICE_AQ_NVM_PASS_COMP_CAN_MAY_BE_UPDATEABLE 0x1 +#define ICE_AQ_NVM_PASS_COMP_CAN_NOT_BE_UPDATED 0x2 +#define ICE_AQ_NVM_PASS_COMP_PARTIAL_CHECK 0x3 + u8 component_response_code; /* Response only */ +#define ICE_AQ_NVM_PASS_COMP_CAN_BE_UPDATED_CODE 0x0 +#define ICE_AQ_NVM_PASS_COMP_STAMP_IDENTICAL_CODE 0x1 +#define ICE_AQ_NVM_PASS_COMP_STAMP_LOWER 0x2 +#define ICE_AQ_NVM_PASS_COMP_INVALID_STAMP_CODE 0x3 +#define ICE_AQ_NVM_PASS_COMP_CONFLICT_CODE 0x4 +#define ICE_AQ_NVM_PASS_COMP_PRE_REQ_NOT_MET_CODE 0x5 +#define ICE_AQ_NVM_PASS_COMP_NOT_SUPPORTED_CODE 0x6 +#define ICE_AQ_NVM_PASS_COMP_CANNOT_DOWNGRADE_CODE 0x7 +#define ICE_AQ_NVM_PASS_COMP_INCOMPLETE_IMAGE_CODE 0x8 +#define ICE_AQ_NVM_PASS_COMP_VER_STR_IDENTICAL_CODE 0xA +#define ICE_AQ_NVM_PASS_COMP_VER_STR_LOWER_CODE 0xB + u8 reserved; + u8 transfer_flag; +#define ICE_AQ_NVM_PASS_COMP_TBL_START 0x1 +#define ICE_AQ_NVM_PASS_COMP_TBL_MIDDLE 0x2 +#define ICE_AQ_NVM_PASS_COMP_TBL_END 0x4 +#define ICE_AQ_NVM_PASS_COMP_TBL_START_AND_END 0x5 + __le32 reserved1; + __le32 addr_high; + __le32 addr_low; +}; + + +struct ice_aqc_nvm_comp_tbl { + __le16 comp_class; +#define NVM_COMP_CLASS_ALL_FW 0x000A + + __le16 comp_id; +#define NVM_COMP_ID_OROM 0x5 +#define NVM_COMP_ID_NVM 0x6 +#define NVM_COMP_ID_NETLIST 0x8 + + u8 comp_class_idx; +#define FWU_COMP_CLASS_IDX_NOT_USE 0x0 + + __le32 comp_cmp_stamp; + u8 cvs_type; +#define NVM_CVS_TYPE_ASCII 0x1 + + u8 cvs_len; + u8 cvs[]; /* Component Version String */ +} __packed; + + +/* + * Send to PF command (indirect 0x0801) ID is only used by PF + * + * Send to VF command (indirect 0x0802) ID is only used by PF + * + */ +struct ice_aqc_pf_vf_msg { + __le32 id; + u32 reserved; + __le32 addr_high; + __le32 addr_low; +}; + + + /* Get LLDP MIB (indirect 0x0A00) * Note: This is also used by the LLDP MIB Change Event (0x0A01) * as the format is the same. 
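The indirect descriptors above (for example Send to PF/VF 0x0801/0x0802 and Get LLDP MIB 0x0A00) all reference their external data buffer through the addr_high/addr_low pair. A minimal sketch of that convention, assuming the usual kernel byte-order and 32-bit-split helpers; the helper name here is hypothetical and illustrative only:

#include <linux/kernel.h>
#include <linux/types.h>

/* Split a DMA buffer address into the little-endian high/low words that
 * the indirect admin queue descriptors above carry.
 */
static void example_aq_set_buf_addr(__le32 *addr_high, __le32 *addr_low,
				    dma_addr_t buf)
{
	*addr_high = cpu_to_le32(upper_32_bits(buf));
	*addr_low = cpu_to_le32(lower_32_bits(buf));
}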
@@ -1227,6 +2484,7 @@ struct ice_aqc_lldp_get_mib { __le32 addr_low; }; + /* Configure LLDP MIB Change Event (direct 0x0A01) */ /* For MIB Change Event use ice_aqc_lldp_get_mib structure above */ struct ice_aqc_lldp_set_mib_change { @@ -1236,6 +2494,32 @@ struct ice_aqc_lldp_set_mib_change { u8 reserved[15]; }; + +/* Add LLDP TLV (indirect 0x0A02) + * Delete LLDP TLV (indirect 0x0A04) + */ +struct ice_aqc_lldp_add_delete_tlv { + u8 type; /* only nearest bridge and non-TPMR from 0x0A00 */ + u8 reserved1[1]; + __le16 len; + u8 reserved2[4]; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Update LLDP TLV (indirect 0x0A03) */ +struct ice_aqc_lldp_update_tlv { + u8 type; /* only nearest bridge and non-TPMR from 0x0A00 */ + u8 reserved; + __le16 old_len; + __le16 new_offset; + __le16 new_len; + __le32 addr_high; + __le32 addr_low; +}; + + /* Stop LLDP (direct 0x0A05) */ struct ice_aqc_lldp_stop { u8 command; @@ -1246,6 +2530,7 @@ struct ice_aqc_lldp_stop { u8 reserved[15]; }; + /* Start LLDP (direct 0x0A06) */ struct ice_aqc_lldp_start { u8 command; @@ -1254,6 +2539,7 @@ struct ice_aqc_lldp_start { u8 reserved[15]; }; + /* Get CEE DCBX Oper Config (0x0A07) * The command uses the generic descriptor struct and * returns the struct below as an indirect response. @@ -1284,6 +2570,7 @@ struct ice_aqc_get_cee_dcb_cfg_resp { u8 reserved[12]; }; + /* Set Local LLDP MIB (indirect 0x0A08) * Used to replace the local MIB of a given LLDP agent. e.g. DCBX */ @@ -1301,76 +2588,571 @@ struct ice_aqc_lldp_set_local_mib { __le32 addr_low; }; -/* Stop/Start LLDP Agent (direct 0x0A09) - * Used for stopping/starting specific LLDP agent. e.g. DCBX. - * The same structure is used for the response, with the command field - * being used as the status field. + +struct ice_aqc_lldp_set_local_mib_resp { + u8 status; +#define SET_LOCAL_MIB_RESP_EVENT_M BIT(0) +#define SET_LOCAL_MIB_RESP_MIB_CHANGE_SILENT 0 +#define SET_LOCAL_MIB_RESP_MIB_CHANGE_EVENT SET_LOCAL_MIB_RESP_EVENT_M + u8 reserved[15]; +}; + + +/* Stop/Start LLDP Agent (direct 0x0A09) + * Used for stopping/starting specific LLDP agent. e.g. DCBX. + * The same structure is used for the response, with the command field + * being used as the status field. 
+ */ +struct ice_aqc_lldp_stop_start_specific_agent { + u8 command; +#define ICE_AQC_START_STOP_AGENT_M BIT(0) +#define ICE_AQC_START_STOP_AGENT_STOP_DCBX 0 +#define ICE_AQC_START_STOP_AGENT_START_DCBX ICE_AQC_START_STOP_AGENT_M + u8 reserved[15]; +}; + + +/* LLDP Filter Control (direct 0x0A0A) */ +struct ice_aqc_lldp_filter_ctrl { + u8 cmd_flags; +#define ICE_AQC_LLDP_FILTER_ACTION_M ICE_M(3, 0) +#define ICE_AQC_LLDP_FILTER_ACTION_ADD 0x0 +#define ICE_AQC_LLDP_FILTER_ACTION_DELETE 0x1 +#define ICE_AQC_LLDP_FILTER_ACTION_UPDATE 0x2 + u8 reserved1; + __le16 vsi_num; + u8 reserved2[12]; +}; + + +/* Get/Set RSS key (indirect 0x0B04/0x0B02) */ +struct ice_aqc_get_set_rss_key { +#define ICE_AQC_GSET_RSS_KEY_VSI_VALID BIT(15) +#define ICE_AQC_GSET_RSS_KEY_VSI_ID_S 0 +#define ICE_AQC_GSET_RSS_KEY_VSI_ID_M (0x3FF << ICE_AQC_GSET_RSS_KEY_VSI_ID_S) + __le16 vsi_id; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + + +#define ICE_AQC_GET_SET_RSS_KEY_DATA_RSS_KEY_SIZE 0x28 +#define ICE_AQC_GET_SET_RSS_KEY_DATA_HASH_KEY_SIZE 0xC +#define ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE \ + (ICE_AQC_GET_SET_RSS_KEY_DATA_RSS_KEY_SIZE + \ + ICE_AQC_GET_SET_RSS_KEY_DATA_HASH_KEY_SIZE) + +/** + * struct ice_aqc_get_set_rss_keys - Get/Set RSS hash key command buffer + * @standard_rss_key: 40 most significant bytes of hash key + * @extended_hash_key: 12 least significant bytes of hash key + * + * Set/Get 40 byte hash key using standard_rss_key field, and set + * extended_hash_key field to zero. Set/Get 52 byte hash key using + * standard_rss_key field for 40 most significant bytes and the + * extended_hash_key field for the 12 least significant bytes of hash key. + */ +struct ice_aqc_get_set_rss_keys { + u8 standard_rss_key[ICE_AQC_GET_SET_RSS_KEY_DATA_RSS_KEY_SIZE]; + u8 extended_hash_key[ICE_AQC_GET_SET_RSS_KEY_DATA_HASH_KEY_SIZE]; +}; + + +/* Get/Set RSS LUT (indirect 0x0B05/0x0B03) */ +struct ice_aqc_get_set_rss_lut { +#define ICE_AQC_GSET_RSS_LUT_VSI_VALID BIT(15) +#define ICE_AQC_GSET_RSS_LUT_VSI_ID_S 0 +#define ICE_AQC_GSET_RSS_LUT_VSI_ID_M (0x3FF << ICE_AQC_GSET_RSS_LUT_VSI_ID_S) + __le16 vsi_id; +#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_S 0 +#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_M \ + (0x3 << ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_S) + +#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_VSI 0 +#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF 1 +#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_GLOBAL 2 + +#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_S 2 +#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_M \ + (0x3 << ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_S) + +#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_128 128 +#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_128_FLAG 0 +#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512 512 +#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512_FLAG 1 +#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_2K 2048 +#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_2K_FLAG 2 + +#define ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_S 4 +#define ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_M \ + (0xF << ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_S) + + __le16 flags; + __le32 reserved; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Clear FD Table Command (direct, 0x0B06) */ +struct ice_aqc_clear_fd_table { + u8 clear_type; +#define CL_FD_VM_VF_TYPE_VSI_IDX 1 +#define CL_FD_VM_VF_TYPE_PF_IDX 2 + u8 rsvd; + __le16 vsi_index; + u8 reserved[12]; +}; + + +/* Sideband Control Interface Commands */ +/* Neighbor Device Request (indirect 0x0C00); also used for the response. 
*/ +struct ice_aqc_neigh_dev_req { + __le16 sb_data_len; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Allocate ACL table (indirect 0x0C10) */ +#define ICE_AQC_ACL_KEY_WIDTH 40 +#define ICE_AQC_ACL_KEY_WIDTH_BYTES 5 +#define ICE_AQC_ACL_TCAM_DEPTH 512 +#define ICE_ACL_ENTRY_ALLOC_UNIT 64 +#define ICE_AQC_MAX_CONCURRENT_ACL_TBL 15 +#define ICE_AQC_MAX_ACTION_MEMORIES 20 +#define ICE_AQC_MAX_ACTION_ENTRIES 512 +#define ICE_AQC_ACL_SLICES 16 +#define ICE_AQC_ALLOC_ID_LESS_THAN_4K 0x1000 +/* The ACL block supports up to 8 actions per a single output. */ +#define ICE_AQC_TBL_MAX_ACTION_PAIRS 4 + +#define ICE_AQC_MAX_TCAM_ALLOC_UNITS (ICE_AQC_ACL_TCAM_DEPTH / \ + ICE_ACL_ENTRY_ALLOC_UNIT) +#define ICE_AQC_ACL_ALLOC_UNITS (ICE_AQC_ACL_SLICES * \ + ICE_AQC_MAX_TCAM_ALLOC_UNITS) + +struct ice_aqc_acl_alloc_table { + __le16 table_width; + __le16 table_depth; + u8 act_pairs_per_entry; + /* For non-concurrent table allocation, this field needs + * to be set to zero(0) otherwise it shall specify the + * amount of concurrent tables whose AllocIDs are + * specified in buffer. Thus the newly allocated table + * is concurrent with table IDs specified in AllocIDs. + */ +#define ICE_AQC_ACL_ALLOC_TABLE_TYPE_NONCONCURR 0 + u8 table_type; + __le16 reserved; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Allocate ACL table command buffer format */ +struct ice_aqc_acl_alloc_table_data { + /* Dependent table AllocIDs. Each word in this 15 word array specifies + * a dependent table AllocID according to the amount specified in the + * "table_type" field. All unused words shall be set to 0xFFFF + */ +#define ICE_AQC_CONCURR_ID_INVALID 0xffff + __le16 alloc_ids[ICE_AQC_MAX_CONCURRENT_ACL_TBL]; +}; + + +/* Deallocate ACL table (indirect 0x0C11) + * Allocate ACL action-pair (indirect 0x0C12) + * Deallocate ACL action-pair (indirect 0x0C13) + */ + +/* Following structure is common and used in case of deallocation + * of ACL table and action-pair + */ +struct ice_aqc_acl_tbl_actpair { + /* Alloc ID of the table being released */ + __le16 alloc_id; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + + +/* This response structure is same in case of alloc/dealloc table, + * alloc/dealloc action-pair + */ +struct ice_aqc_acl_generic { + /* if alloc_id is below 0x1000 then alllocation failed due to + * unavailable resources, else this is set by FW to identify + * table allocation + */ + __le16 alloc_id; + + union { + /* to be used only in case of alloc/dealloc table */ + struct { + /* Index of the first TCAM block, otherwise set to 0xFF + * for a failed allocation + */ + u8 first_tcam; + /* Index of the last TCAM block. This index shall be + * set to the value of first_tcam for single TCAM block + * allocation, otherwise set to 0xFF for a failed + * allocation + */ + u8 last_tcam; + } table; + /* reserved in case of alloc/dealloc action-pair */ + struct { + __le16 reserved; + } act_pair; + } ops; + + /* index of first entry (in both TCAM and action memories), + * otherwise set to 0xFF for a failed allocation + */ + __le16 first_entry; + /* index of last entry (in both TCAM and action memories), + * otherwise set to 0xFF for a failed allocation + */ + __le16 last_entry; + + /* Each act_mem element specifies the order of the memory + * otherwise 0xFF + */ + u8 act_mem[ICE_AQC_MAX_ACTION_MEMORIES]; +}; + + +/* Allocate ACL scenario (indirect 0x0C14). 
This command doesn't have separate + * response buffer since original command buffer gets updated with + * 'scen_id' in case of success + */ +struct ice_aqc_acl_alloc_scen { + union { + struct { + u8 reserved[8]; + } cmd; + struct { + __le16 scen_id; + u8 reserved[6]; + } resp; + } ops; + __le32 addr_high; + __le32 addr_low; +}; + + +/* De-allocate ACL scenario (direct 0x0C15). This command doesn't need + * separate response buffer since nothing to be returned as a response + * except status. + */ +struct ice_aqc_acl_dealloc_scen { + __le16 scen_id; + u8 reserved[14]; +}; + + +/* Update ACL scenario (direct 0x0C1B) + * Query ACL scenario (direct 0x0C23) + */ +struct ice_aqc_acl_update_query_scen { + __le16 scen_id; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Input buffer format in case allocate/update ACL scenario and same format + * is used for response buffer in case of query ACL scenario. + * NOTE: de-allocate ACL scenario is direct command and doesn't require + * "buffer", hence no buffer format. + */ +struct ice_aqc_acl_scen { + struct { + /* Byte [x] selection for the TCAM key. This value must be + * set to 0x0 for unusued TCAM. + * Only Bit 6..0 is used in each byte and MSB is reserved + */ +#define ICE_AQC_ACL_ALLOC_SCE_SELECT_M 0x7F +#define ICE_AQC_ACL_BYTE_SEL_BASE 0x20 +#define ICE_AQC_ACL_BYTE_SEL_BASE_PID 0x3E +#define ICE_AQC_ACL_BYTE_SEL_BASE_PKT_DIR ICE_AQC_ACL_BYTE_SEL_BASE +#define ICE_AQC_ACL_BYTE_SEL_BASE_RNG_CHK 0x3F + u8 tcam_select[5]; + /* TCAM Block entry masking. This value should be set to 0x0 for + * unused TCAM + */ + u8 chnk_msk; + /* Bit 0 : masks TCAM entries 0-63 + * Bit 1 : masks TCAM entries 64-127 + * Bit 2 to 7 : follow the pattern of bit 0 and 1 + */ +#define ICE_AQC_ACL_ALLOC_SCE_START_CMP BIT(0) +#define ICE_AQC_ACL_ALLOC_SCE_START_SET BIT(1) + u8 start_cmp_set; + + } tcam_cfg[ICE_AQC_ACL_SLICES]; + + /* Each byte, Bit 6..0: Action memory association to a TCAM block, + * otherwise it shall be set to 0x0 for disabled memory action. + * Bit 7 : Action memory enable for this scenario + */ +#define ICE_AQC_ACL_SCE_ACT_MEM_TCAM_ASSOC_M 0x7F +#define ICE_AQC_ACL_SCE_ACT_MEM_EN BIT(7) + u8 act_mem_cfg[ICE_AQC_MAX_ACTION_MEMORIES]; +}; + + +/* Allocate ACL counters (indirect 0x0C16) */ +struct ice_aqc_acl_alloc_counters { + /* Amount of contiguous counters requested. Min value is 1 and + * max value is 255 + */ +#define ICE_AQC_ACL_ALLOC_CNT_MIN_AMT 0x1 +#define ICE_AQC_ACL_ALLOC_CNT_MAX_AMT 0xFF + u8 counter_amount; + + /* Counter type: 'single counter' which can be configured to count + * either bytes or packets + */ +#define ICE_AQC_ACL_CNT_TYPE_SINGLE 0x0 + + /* Counter type: 'counter pair' which counts number of bytes and number + * of packets. + */ +#define ICE_AQC_ACL_CNT_TYPE_DUAL 0x1 + /* requested counter type, single/dual */ + u8 counters_type; + + /* counter bank allocation shall be 0-3 for 'byte or packet counter' */ +#define ICE_AQC_ACL_MAX_CNT_SINGLE 0x3 +/* counter bank allocation shall be 0-1 for 'byte and packet counter dual' */ +#define ICE_AQC_ACL_MAX_CNT_DUAL 0x1 + /* requested counter bank allocation */ + u8 bank_alloc; + + u8 reserved; + + union { + /* Applicable only in case of command */ + struct { + u8 reserved[12]; + } cmd; + /* Applicable only in case of response */ +#define ICE_AQC_ACL_ALLOC_CNT_INVAL 0xFFFF + struct { + /* Index of first allocated counter. 0xFFFF in case + * of unsuccessful allocation + */ + __le16 first_counter; + /* Index of last allocated counter. 
0xFFFF in case + * of unsuccessful allocation + */ + __le16 last_counter; + u8 rsvd[8]; + } resp; + } ops; +}; + + +/* De-allocate ACL counters (direct 0x0C17) */ +struct ice_aqc_acl_dealloc_counters { + /* first counter being released */ + __le16 first_counter; + /* last counter being released */ + __le16 last_counter; + /* requested counter type, single/dual */ + u8 counters_type; + /* requested counter bank allocation */ + u8 bank_alloc; + u8 reserved[10]; +}; + + +/* De-allocate ACL resources (direct 0x0C1A). Used by SW to release all the + * resources allocated for it using a single command + */ +struct ice_aqc_acl_dealloc_res { + u8 reserved[16]; +}; + + +/* Program ACL actionpair (indirect 0x0C1C) + * Query ACL actionpair (indirect 0x0C25) + */ +struct ice_aqc_acl_actpair { + /* action mem index to program/update */ + u8 act_mem_index; + u8 reserved; + /* The entry index in action memory to be programmed/updated */ + __le16 act_entry_index; + __le32 reserved2; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Input buffer format for program/query action-pair admin command */ +struct ice_acl_act_entry { + /* Action priority, values must be between 0..7 */ +#define ICE_AQC_ACT_PRIO_VALID_MAX 7 +#define ICE_AQC_ACT_PRIO_MSK ICE_M(0xff, 0) + u8 prio; + /* Action meta-data identifier. This field should be set to 0x0 + * for a NOP action + */ +#define ICE_AQC_ACT_MDID_S 8 +#define ICE_AQC_ACT_MDID_MSK ICE_M(0xff00, ICE_AQC_ACT_MDID_S) + u8 mdid; + /* Action value */ +#define ICE_AQC_ACT_VALUE_S 16 +#define ICE_AQC_ACT_VALUE_MSK ICE_M(0xffff0000, 16) + __le16 value; +}; + + +#define ICE_ACL_NUM_ACT_PER_ACT_PAIR 2 +struct ice_aqc_actpair { + struct ice_acl_act_entry act[ICE_ACL_NUM_ACT_PER_ACT_PAIR]; +}; + + +/* Generic format used to describe either input or response buffer + * for admin commands related to ACL profile + */ +struct ice_aqc_acl_prof_generic_frmt { + /* The first byte of the byte selection base is reserved to keep the + * first byte of the field vector where the packet direction info is + * available. Thus we should start at index 1 of the field vector to + * map its entries to the byte selection base. + */ +#define ICE_AQC_ACL_PROF_BYTE_SEL_START_IDX 1 + /* In each byte: + * Bit 0..5 = Byte selection for the byte selection base from the + * extracted fields (expressed as byte offset in extracted fields). + * Applicable values are 0..63 + * Bit 6..7 = Reserved + */ +#define ICE_AQC_ACL_PROF_BYTE_SEL_ELEMS 30 + u8 byte_selection[ICE_AQC_ACL_PROF_BYTE_SEL_ELEMS]; + /* In each byte: + * Bit 0..4 = Word selection for the word selection base from the + * extracted fields (expressed as word offset in extracted fields). + * Applicable values are 0..31 + * Bit 5..7 = Reserved + */ +#define ICE_AQC_ACL_PROF_WORD_SEL_ELEMS 32 + u8 word_selection[ICE_AQC_ACL_PROF_WORD_SEL_ELEMS]; + /* In each byte: + * Bit 0..3 = Double word selection for the double-word selection base + * from the extracted fields (expressed as double-word offset in + * extracted fields). 
+ * Applicable values are 0..15 + * Bit 4..7 = Reserved + */ +#define ICE_AQC_ACL_PROF_DWORD_SEL_ELEMS 15 + u8 dword_selection[ICE_AQC_ACL_PROF_DWORD_SEL_ELEMS]; + /* Scenario numbers for individual Physical Function's */ +#define ICE_AQC_ACL_PROF_PF_SCEN_NUM_ELEMS 8 + u8 pf_scenario_num[ICE_AQC_ACL_PROF_PF_SCEN_NUM_ELEMS]; +}; + + +/* Program ACL profile extraction (indirect 0x0C1D) + * Program ACL profile ranges (indirect 0x0C1E) + * Query ACL profile (indirect 0x0C21) + * Query ACL profile ranges (indirect 0x0C22) + */ +struct ice_aqc_acl_profile { + u8 profile_id; /* Programmed/Updated profile ID */ + u8 reserved[7]; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Input buffer format for program profile extraction admin command and + * response buffer format for query profile admin command is as defined + * in struct ice_aqc_acl_prof_generic_frmt */ -struct ice_aqc_lldp_stop_start_specific_agent { - u8 command; -#define ICE_AQC_START_STOP_AGENT_M BIT(0) -#define ICE_AQC_START_STOP_AGENT_STOP_DCBX 0 -#define ICE_AQC_START_STOP_AGENT_START_DCBX ICE_AQC_START_STOP_AGENT_M - u8 reserved[15]; -}; -/* Get/Set RSS key (indirect 0x0B04/0x0B02) */ -struct ice_aqc_get_set_rss_key { -#define ICE_AQC_GSET_RSS_KEY_VSI_VALID BIT(15) -#define ICE_AQC_GSET_RSS_KEY_VSI_ID_S 0 -#define ICE_AQC_GSET_RSS_KEY_VSI_ID_M (0x3FF << ICE_AQC_GSET_RSS_KEY_VSI_ID_S) - __le16 vsi_id; - u8 reserved[6]; - __le32 addr_high; - __le32 addr_low; +/* Input buffer format for program profile ranges and query profile ranges + * admin commands. Same format is used for response buffer in case of query + * profile ranges command + */ +struct ice_acl_rng_data { + /* The range checker output shall be sent when the value + * related to this range checker is lower than low boundary + */ + __be16 low_boundary; + /* The range checker output shall be sent when the value + * related to this range checker is higher than high boundary + */ + __be16 high_boundary; + /* A value of '0' in bit shall clear the relevant bit input + * to the range checker + */ + __be16 mask; }; -#define ICE_AQC_GET_SET_RSS_KEY_DATA_RSS_KEY_SIZE 0x28 -#define ICE_AQC_GET_SET_RSS_KEY_DATA_HASH_KEY_SIZE 0xC -#define ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE \ - (ICE_AQC_GET_SET_RSS_KEY_DATA_RSS_KEY_SIZE + \ - ICE_AQC_GET_SET_RSS_KEY_DATA_HASH_KEY_SIZE) -struct ice_aqc_get_set_rss_keys { - u8 standard_rss_key[ICE_AQC_GET_SET_RSS_KEY_DATA_RSS_KEY_SIZE]; - u8 extended_hash_key[ICE_AQC_GET_SET_RSS_KEY_DATA_HASH_KEY_SIZE]; +struct ice_aqc_acl_profile_ranges { +#define ICE_AQC_ACL_PROF_RANGES_NUM_CFG 8 + struct ice_acl_rng_data checker_cfg[ICE_AQC_ACL_PROF_RANGES_NUM_CFG]; }; -/* Get/Set RSS LUT (indirect 0x0B05/0x0B03) */ -struct ice_aqc_get_set_rss_lut { -#define ICE_AQC_GSET_RSS_LUT_VSI_VALID BIT(15) -#define ICE_AQC_GSET_RSS_LUT_VSI_ID_S 0 -#define ICE_AQC_GSET_RSS_LUT_VSI_ID_M (0x1FF << ICE_AQC_GSET_RSS_LUT_VSI_ID_S) - __le16 vsi_id; -#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_S 0 -#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_M \ - (0x3 << ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_S) -#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_VSI 0 -#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF 1 -#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_GLOBAL 2 +/* Program ACL entry (indirect 0x0C20) + * Query ACL entry (indirect 0x0C24) + */ +struct ice_aqc_acl_entry { + u8 tcam_index; /* Updated TCAM block index */ + u8 reserved; + __le16 entry_index; /* Updated entry index */ + __le32 reserved2; + __le32 addr_high; + __le32 addr_low; +}; -#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_S 2 -#define 
ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_M \ - (0x3 << ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_S) -#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_128 128 -#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_128_FLAG 0 -#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512 512 -#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512_FLAG 1 -#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_2K 2048 -#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_2K_FLAG 2 +/* Input buffer format in case of program ACL entry and response buffer format + * in case of query ACL entry + */ +struct ice_aqc_acl_data { + /* Entry key and entry key invert are 40 bits wide. + * Byte 0..4 : entry key and Byte 5..7 are reserved + * Byte 8..12: entry key invert and Byte 13..15 are reserved + */ + struct { + u8 val[5]; + u8 reserved[3]; + } entry_key, entry_key_invert; +}; -#define ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_S 4 -#define ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_M \ - (0xF << ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_S) - __le16 flags; - __le32 reserved; - __le32 addr_high; - __le32 addr_low; +/* Query ACL counter (direct 0x0C27) */ +struct ice_aqc_acl_query_counter { + /* Queried counter index */ + __le16 counter_index; + /* Queried counter bank */ + u8 counter_bank; + union { + struct { + u8 reserved[13]; + } cmd; + struct { + /* Holds counter value/packet counter value */ + u8 val[5]; + u8 reserved[8]; + } resp; + } ops; }; + /* Add Tx LAN Queues (indirect 0x0C30) */ struct ice_aqc_add_txqs { u8 num_qgrps; @@ -1380,6 +3162,7 @@ struct ice_aqc_add_txqs { __le32 addr_low; }; + /* This is the descriptor of each queue entry for the Add Tx LAN Queues * command (0x0C30). Only used within struct ice_aqc_add_tx_qgrp. */ @@ -1392,6 +3175,7 @@ struct ice_aqc_add_txqs_perq { struct ice_aqc_txsched_elem info; }; + /* The format of the command buffer for Add Tx LAN Queues (0x0C30) * is an array of the following structs. Please note that the length of * each struct ice_aqc_add_tx_qgrp is variable due @@ -1401,9 +3185,10 @@ struct ice_aqc_add_tx_qgrp { __le32 parent_teid; u8 num_txqs; u8 rsvd[3]; - struct ice_aqc_add_txqs_perq txqs[1]; + struct ice_aqc_add_txqs_perq txqs[]; }; + /* Disable Tx LAN Queues (indirect 0x0C31) */ struct ice_aqc_dis_txqs { u8 cmd_type; @@ -1426,6 +3211,7 @@ struct ice_aqc_dis_txqs { __le32 addr_low; }; + /* The buffer for Disable Tx LAN Queues (indirect 0x0C31) * contains the following structures, arrayed one after the * other. @@ -1439,99 +3225,122 @@ struct ice_aqc_dis_txq_item { u8 num_qs; u8 rsvd; /* The length of the q_id array varies according to num_qs */ - __le16 q_id[1]; - /* This only applies from F8 onward */ #define ICE_AQC_Q_DIS_BUF_ELEM_TYPE_S 15 #define ICE_AQC_Q_DIS_BUF_ELEM_TYPE_LAN_Q \ (0 << ICE_AQC_Q_DIS_BUF_ELEM_TYPE_S) #define ICE_AQC_Q_DIS_BUF_ELEM_TYPE_RDMA_QSET \ (1 << ICE_AQC_Q_DIS_BUF_ELEM_TYPE_S) -}; + __le16 q_id[]; +} __packed; + -struct ice_aqc_dis_txq { - struct ice_aqc_dis_txq_item qgrps[1]; + +/* Tx LAN Queues Cleanup Event (0x0C31) */ +struct ice_aqc_txqs_cleanup { + __le16 caller_opc; + __le16 cmd_tag; + u8 reserved[12]; }; -/* Configure Firmware Logging Command (indirect 0xFF09) - * Logging Information Read Response (indirect 0xFF10) - * Note: The 0xFF10 command has no input parameters. 
- */ -struct ice_aqc_fw_logging { - u8 log_ctrl; -#define ICE_AQC_FW_LOG_AQ_EN BIT(0) -#define ICE_AQC_FW_LOG_UART_EN BIT(1) - u8 rsvd0; - u8 log_ctrl_valid; /* Not used by 0xFF10 Response */ -#define ICE_AQC_FW_LOG_AQ_VALID BIT(0) -#define ICE_AQC_FW_LOG_UART_VALID BIT(1) - u8 rsvd1[5]; + +/* Move / Reconfigure Tx Queues (indirect 0x0C32) */ +struct ice_aqc_move_txqs { + u8 cmd_type; +#define ICE_AQC_Q_CMD_TYPE_S 0 +#define ICE_AQC_Q_CMD_TYPE_M (0x3 << ICE_AQC_Q_CMD_TYPE_S) +#define ICE_AQC_Q_CMD_TYPE_MOVE 1 +#define ICE_AQC_Q_CMD_TYPE_TC_CHANGE 2 +#define ICE_AQC_Q_CMD_TYPE_MOVE_AND_TC 3 +#define ICE_AQC_Q_CMD_SUBSEQ_CALL BIT(2) +#define ICE_AQC_Q_CMD_FLUSH_PIPE BIT(3) + u8 num_qs; + u8 rsvd; + u8 timeout; +#define ICE_AQC_Q_CMD_TIMEOUT_S 2 +#define ICE_AQC_Q_CMD_TIMEOUT_M (0x3F << ICE_AQC_Q_CMD_TIMEOUT_S) + __le32 blocked_cgds; __le32 addr_high; __le32 addr_low; }; -enum ice_aqc_fw_logging_mod { - ICE_AQC_FW_LOG_ID_GENERAL = 0, - ICE_AQC_FW_LOG_ID_CTRL, - ICE_AQC_FW_LOG_ID_LINK, - ICE_AQC_FW_LOG_ID_LINK_TOPO, - ICE_AQC_FW_LOG_ID_DNL, - ICE_AQC_FW_LOG_ID_I2C, - ICE_AQC_FW_LOG_ID_SDP, - ICE_AQC_FW_LOG_ID_MDIO, - ICE_AQC_FW_LOG_ID_ADMINQ, - ICE_AQC_FW_LOG_ID_HDMA, - ICE_AQC_FW_LOG_ID_LLDP, - ICE_AQC_FW_LOG_ID_DCBX, - ICE_AQC_FW_LOG_ID_DCB, - ICE_AQC_FW_LOG_ID_NETPROXY, - ICE_AQC_FW_LOG_ID_NVM, - ICE_AQC_FW_LOG_ID_AUTH, - ICE_AQC_FW_LOG_ID_VPD, - ICE_AQC_FW_LOG_ID_IOSF, - ICE_AQC_FW_LOG_ID_PARSER, - ICE_AQC_FW_LOG_ID_SW, - ICE_AQC_FW_LOG_ID_SCHEDULER, - ICE_AQC_FW_LOG_ID_TXQ, - ICE_AQC_FW_LOG_ID_RSVD, - ICE_AQC_FW_LOG_ID_POST, - ICE_AQC_FW_LOG_ID_WATCHDOG, - ICE_AQC_FW_LOG_ID_TASK_DISPATCH, - ICE_AQC_FW_LOG_ID_MNG, - ICE_AQC_FW_LOG_ID_MAX, + +/* Per-queue data buffer for the Move Tx LAN Queues command/response */ +struct ice_aqc_move_txqs_elem { + __le16 txq_id; + u8 q_cgd; + u8 rsvd; + __le32 q_teid; +}; + + +/* Indirect data buffer for the Move Tx LAN Queues command/response */ +struct ice_aqc_move_txqs_data { + __le32 src_teid; + __le32 dest_teid; + struct ice_aqc_move_txqs_elem txqs[]; +}; + + +/* Add Tx RDMA Queue Set (indirect 0x0C33) */ +struct ice_aqc_add_rdma_qset { + u8 num_qset_grps; + u8 reserved[7]; + __le32 addr_high; + __le32 addr_low; }; -/* This is the buffer for both of the logging commands. - * The entry array size depends on the datalen parameter in the descriptor. - * There will be a total of datalen / 2 entries. + +/* This is the descriptor of each qset entry for the Add Tx RDMA Queue Set + * command (0x0C33). Only used within struct ice_aqc_add_rdma_qset. */ -struct ice_aqc_fw_logging_data { - __le16 entry[1]; -#define ICE_AQC_FW_LOG_ID_S 0 -#define ICE_AQC_FW_LOG_ID_M (0xFFF << ICE_AQC_FW_LOG_ID_S) +struct ice_aqc_add_tx_rdma_qset_entry { + __le16 tx_qset_id; + u8 rsvd[2]; + __le32 qset_teid; + struct ice_aqc_txsched_elem info; +}; -#define ICE_AQC_FW_LOG_CONF_SUCCESS 0 /* Used by response */ -#define ICE_AQC_FW_LOG_CONF_BAD_INDX BIT(12) /* Used by response */ -#define ICE_AQC_FW_LOG_EN_S 12 -#define ICE_AQC_FW_LOG_EN_M (0xF << ICE_AQC_FW_LOG_EN_S) -#define ICE_AQC_FW_LOG_INFO_EN BIT(12) /* Used by command */ -#define ICE_AQC_FW_LOG_INIT_EN BIT(13) /* Used by command */ -#define ICE_AQC_FW_LOG_FLOW_EN BIT(14) /* Used by command */ -#define ICE_AQC_FW_LOG_ERR_EN BIT(15) /* Used by command */ +/* The format of the command buffer for Add Tx RDMA Queue Set(0x0C33) + * is an array of the following structs. Please note that the length of + * each struct ice_aqc_add_rdma_qset is variable due to the variable + * number of queues in each group! 
+ */ +struct ice_aqc_add_rdma_qset_data { + __le32 parent_teid; + __le16 num_qsets; + u8 rsvd[2]; + struct ice_aqc_add_tx_rdma_qset_entry rdma_qsets[]; }; -/* Get/Clear FW Log (indirect 0xFF11) */ -struct ice_aqc_get_clear_fw_log { + +/* Move RDMA Queue Set (indirect 0x0C34) */ +struct ice_aqc_move_rdma_qset_cmd { + u8 num_rdma_qset; /* Used by commands and response */ u8 flags; -#define ICE_AQC_FW_LOG_CLEAR BIT(0) -#define ICE_AQC_FW_LOG_MORE_DATA_AVAIL BIT(1) - u8 rsvd1[7]; + u8 reserved[6]; __le32 addr_high; __le32 addr_low; }; + +/* Buffer */ +struct ice_aqc_move_rdma_qset_buffer_desc { + __le16 tx_qset_id; + __le16 qset_teid; +}; + + +struct ice_aqc_move_rdma_qset_buffer { + __le32 src_parent_teid; + __le32 dest_parent_teid; + struct ice_aqc_move_rdma_qset_buffer_desc descs[]; +}; + + + /* Download Package (indirect 0x0C40) */ -/* Also used for Update Package (indirect 0x0C42) */ +/* Also used for Update Package (indirect 0x0C42 and 0x0C41) */ struct ice_aqc_download_pkg { u8 flags; #define ICE_AQC_DOWNLOAD_PKG_LAST_BUF 0x01 @@ -1541,6 +3350,7 @@ struct ice_aqc_download_pkg { __le32 addr_low; }; + struct ice_aqc_download_pkg_resp { __le32 error_offset; __le32 error_info; @@ -1548,6 +3358,7 @@ struct ice_aqc_download_pkg_resp { __le32 addr_low; }; + /* Get Package Info List (indirect 0x0C43) */ struct ice_aqc_get_pkg_info_list { __le32 reserved1; @@ -1556,6 +3367,7 @@ struct ice_aqc_get_pkg_info_list { __le32 addr_low; }; + /* Version format for packages */ struct ice_pkg_ver { u8 major; @@ -1564,30 +3376,238 @@ struct ice_pkg_ver { u8 draft; }; + #define ICE_PKG_NAME_SIZE 32 +#define ICE_SEG_ID_SIZE 28 +#define ICE_SEG_NAME_SIZE 28 struct ice_aqc_get_pkg_info { struct ice_pkg_ver ver; - char name[ICE_PKG_NAME_SIZE]; + char name[ICE_SEG_NAME_SIZE]; + __le32 track_id; u8 is_in_nvm; u8 is_active; u8 is_active_at_boot; u8 is_modified; }; + /* Get Package Info List response buffer format (0x0C43) */ struct ice_aqc_get_pkg_info_resp { __le32 count; - struct ice_aqc_get_pkg_info pkg_info[1]; + struct ice_aqc_get_pkg_info pkg_info[]; +}; + + + + +/* Driver Shared Parameters (direct, 0x0C90) */ +struct ice_aqc_driver_shared_params { + u8 set_or_get_op; +#define ICE_AQC_DRIVER_PARAM_OP_MASK BIT(0) +#define ICE_AQC_DRIVER_PARAM_SET 0 +#define ICE_AQC_DRIVER_PARAM_GET 1 + u8 param_indx; +#define ICE_AQC_DRIVER_PARAM_MAX_IDX 15 + u8 rsvd[2]; + __le32 param_val; + __le32 addr_high; + __le32 addr_low; +}; + +enum ice_aqc_driver_params { + /* OS clock index for PTP timer Domain 0 */ + ICE_AQC_DRIVER_PARAM_CLK_IDX_TMR0 = 0, + /* OS clock index for PTP timer Domain 1 */ + ICE_AQC_DRIVER_PARAM_CLK_IDX_TMR1, + + /* Add new parameters above */ + ICE_AQC_DRIVER_PARAM_MAX = 16, +}; + + +/* Lan Queue Overflow Event (direct, 0x1001) */ +struct ice_aqc_event_lan_overflow { + __le32 prtdcb_ruptq; + __le32 qtx_ctl; + u8 reserved[8]; +}; + + + + +enum ice_aqc_fw_logging_mod { + ICE_AQC_FW_LOG_ID_GENERAL = 0, + ICE_AQC_FW_LOG_ID_CTRL, + ICE_AQC_FW_LOG_ID_LINK, + ICE_AQC_FW_LOG_ID_LINK_TOPO, + ICE_AQC_FW_LOG_ID_DNL, + ICE_AQC_FW_LOG_ID_I2C, + ICE_AQC_FW_LOG_ID_SDP, + ICE_AQC_FW_LOG_ID_MDIO, + ICE_AQC_FW_LOG_ID_ADMINQ, + ICE_AQC_FW_LOG_ID_HDMA, + ICE_AQC_FW_LOG_ID_LLDP, + ICE_AQC_FW_LOG_ID_DCBX, + ICE_AQC_FW_LOG_ID_DCB, + ICE_AQC_FW_LOG_ID_XLR, + ICE_AQC_FW_LOG_ID_NVM, + ICE_AQC_FW_LOG_ID_AUTH, + ICE_AQC_FW_LOG_ID_VPD, + ICE_AQC_FW_LOG_ID_IOSF, + ICE_AQC_FW_LOG_ID_PARSER, + ICE_AQC_FW_LOG_ID_SW, + ICE_AQC_FW_LOG_ID_SCHEDULER, + ICE_AQC_FW_LOG_ID_TXQ, + ICE_AQC_FW_LOG_ID_ACL, + ICE_AQC_FW_LOG_ID_POST, + 
ICE_AQC_FW_LOG_ID_WATCHDOG, + ICE_AQC_FW_LOG_ID_TASK_DISPATCH, + ICE_AQC_FW_LOG_ID_MNG, + ICE_AQC_FW_LOG_ID_SYNCE, + ICE_AQC_FW_LOG_ID_HEALTH, + ICE_AQC_FW_LOG_ID_TSDRV, + ICE_AQC_FW_LOG_ID_PFREG, + ICE_AQC_FW_LOG_ID_MDLVER, + ICE_AQC_FW_LOG_ID_MAX, +}; + + + +/* Set Health Status (direct 0xFF20) */ +struct ice_aqc_set_health_status_config { + u8 event_source; +#define ICE_AQC_HEALTH_STATUS_SET_PF_SPECIFIC_MASK BIT(0) +#define ICE_AQC_HEALTH_STATUS_SET_ALL_PF_MASK BIT(1) +#define ICE_AQC_HEALTH_STATUS_SET_GLOBAL_MASK BIT(2) + u8 reserved[15]; +}; + + +#define ICE_AQC_HEALTH_STATUS_ERR_UNKNOWN_MOD_STRICT 0x101 +#define ICE_AQC_HEALTH_STATUS_ERR_MOD_TYPE 0x102 +#define ICE_AQC_HEALTH_STATUS_ERR_MOD_QUAL 0x103 +#define ICE_AQC_HEALTH_STATUS_ERR_MOD_COMM 0x104 +#define ICE_AQC_HEALTH_STATUS_ERR_MOD_CONFLICT 0x105 +#define ICE_AQC_HEALTH_STATUS_ERR_MOD_NOT_PRESENT 0x106 +#define ICE_AQC_HEALTH_STATUS_INFO_MOD_UNDERUTILIZED 0x107 +#define ICE_AQC_HEALTH_STATUS_ERR_UNKNOWN_MOD_LENIENT 0x108 +#define ICE_AQC_HEALTH_STATUS_ERR_INVALID_LINK_CFG 0x10B +#define ICE_AQC_HEALTH_STATUS_ERR_PORT_ACCESS 0x10C +#define ICE_AQC_HEALTH_STATUS_ERR_PORT_UNREACHABLE 0x10D +#define ICE_AQC_HEALTH_STATUS_INFO_PORT_SPEED_MOD_LIMITED 0x10F +#define ICE_AQC_HEALTH_STATUS_ERR_PARALLEL_FAULT 0x110 +#define ICE_AQC_HEALTH_STATUS_INFO_PORT_SPEED_PHY_LIMITED 0x111 +#define ICE_AQC_HEALTH_STATUS_ERR_NETLIST_TOPO 0x112 +#define ICE_AQC_HEALTH_STATUS_ERR_NETLIST 0x113 +#define ICE_AQC_HEALTH_STATUS_ERR_TOPO_CONFLICT 0x114 +#define ICE_AQC_HEALTH_STATUS_ERR_LINK_HW_ACCESS 0x115 +#define ICE_AQC_HEALTH_STATUS_ERR_LINK_RUNTIME 0x116 +#define ICE_AQC_HEALTH_STATUS_ERR_DNL_INIT 0x117 +#define ICE_AQC_HEALTH_STATUS_ERR_PHY_NVM_PROG 0x120 +#define ICE_AQC_HEALTH_STATUS_ERR_PHY_FW_LOAD 0x121 +#define ICE_AQC_HEALTH_STATUS_INFO_RECOVERY 0x500 +#define ICE_AQC_HEALTH_STATUS_ERR_FLASH_ACCESS 0x501 +#define ICE_AQC_HEALTH_STATUS_ERR_NVM_AUTH 0x502 +#define ICE_AQC_HEALTH_STATUS_ERR_OROM_AUTH 0x503 +#define ICE_AQC_HEALTH_STATUS_ERR_DDP_AUTH 0x504 +#define ICE_AQC_HEALTH_STATUS_ERR_NVM_COMPAT 0x505 +#define ICE_AQC_HEALTH_STATUS_ERR_OROM_COMPAT 0x506 +#define ICE_AQC_HEALTH_STATUS_ERR_DCB_MIB 0x509 + +/* Get Health Status codes (indirect 0xFF21) */ +struct ice_aqc_get_supported_health_status_codes { + __le16 health_code_count; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Get Health Status (indirect 0xFF22) */ +struct ice_aqc_get_health_status { + __le16 health_status_count; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Get Health Status event buffer entry, (0xFF22) + * repeated per reported health status + */ +struct ice_aqc_health_status_elem { + __le16 health_status_code; + __le16 event_source; +#define ICE_AQC_HEALTH_STATUS_PF (0x1) +#define ICE_AQC_HEALTH_STATUS_PORT (0x2) +#define ICE_AQC_HEALTH_STATUS_GLOBAL (0x3) + __le32 internal_data1; +#define ICE_AQC_HEALTH_STATUS_UNDEFINED_DATA (0xDEADBEEF) + __le32 internal_data2; +}; + + +/* Clear Health Status (direct 0xFF23) */ +struct ice_aqc_clear_health_status { + __le32 reserved[4]; +}; + + +/* Set FW Logging configuration (indirect 0xFF30) + * Register for FW Logging (indirect 0xFF31) + * Query FW Logging (indirect 0xFF32) + * FW Log Event (indirect 0xFF33) + * Get FW Log (indirect 0xFF34) + * Clear FW Log (indirect 0xFF35) + */ +struct ice_aqc_fw_log { + u8 cmd_flags; +#define ICE_AQC_FW_LOG_CONF_UART_EN BIT(0) +#define ICE_AQC_FW_LOG_CONF_AQ_EN BIT(1) +#define ICE_AQC_FW_LOG_QUERY_REGISTERED BIT(2) +#define 
ICE_AQC_FW_LOG_CONF_SET_VALID BIT(3) +#define ICE_AQC_FW_LOG_AQ_REGISTER BIT(0) +#define ICE_AQC_FW_LOG_AQ_QUERY BIT(2) +#define ICE_AQC_FW_LOG_PERSISTENT BIT(0) + u8 rsp_flag; +#define ICE_AQC_FW_LOG_MORE_DATA BIT(1) + __le16 fw_rt_msb; + union { + struct { + __le32 fw_rt_lsb; + } sync; + struct { + __le16 log_resolution; +#define ICE_AQC_FW_LOG_MIN_RESOLUTION (1) +#define ICE_AQC_FW_LOG_MAX_RESOLUTION (128) + __le16 mdl_cnt; + } cfg; + } ops; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Response Buffer for: + * Set Firmware Logging Configuration (0xFF30) + * Query FW Logging (0xFF32) + */ +struct ice_aqc_fw_log_cfg_resp { + __le16 module_identifier; + u8 log_level; + u8 rsvd0; }; + + /** * struct ice_aq_desc - Admin Queue (AQ) descriptor * @flags: ICE_AQ_FLAG_* flags * @opcode: AQ command opcode * @datalen: length in bytes of indirect/external data buffer * @retval: return value from firmware - * @cookie_h: opaque data high-half - * @cookie_l: opaque data low-half + * @cookie_high: opaque data high-half + * @cookie_low: opaque data low-half * @params: command-specific parameters * * Descriptor format for commands the driver posts on the Admin Transmit Queue @@ -1610,76 +3630,184 @@ struct ice_aq_desc { struct ice_aqc_get_ver get_ver; struct ice_aqc_driver_ver driver_ver; struct ice_aqc_q_shutdown q_shutdown; + struct ice_aqc_get_exp_err exp_err; struct ice_aqc_req_res res_owner; struct ice_aqc_manage_mac_read mac_read; struct ice_aqc_manage_mac_write mac_write; struct ice_aqc_clear_pxe clear_pxe; + struct ice_aqc_config_no_drop_policy no_drop; + struct ice_aqc_add_update_mir_rule add_update_rule; + struct ice_aqc_delete_mir_rule del_rule; struct ice_aqc_list_caps get_cap; struct ice_aqc_get_phy_caps get_phy; struct ice_aqc_set_phy_cfg set_phy; struct ice_aqc_restart_an restart_an; + struct ice_aqc_dnl_get_status get_status; + struct ice_aqc_dnl_run_command dnl_run; + struct ice_aqc_dnl_call_command dnl_call; + struct ice_aqc_dnl_read_write_command dnl_read_write; + struct ice_aqc_dnl_read_write_response dnl_read_write_resp; + struct ice_aqc_dnl_set_breakpoints_command dnl_set_brk; + struct ice_aqc_dnl_read_log_command dnl_read_log; + struct ice_aqc_dnl_read_log_response dnl_read_log_resp; + struct ice_aqc_i2c read_write_i2c; + struct ice_aqc_read_i2c_resp read_i2c_resp; + struct ice_aqc_mdio read_write_mdio; + struct ice_aqc_gpio_by_func read_write_gpio_by_func; + struct ice_aqc_gpio read_write_gpio; + struct ice_aqc_set_led set_led; + struct ice_aqc_mdio read_mdio; + struct ice_aqc_mdio write_mdio; + struct ice_aqc_sff_eeprom read_write_sff_param; struct ice_aqc_set_port_id_led set_port_id_led; + struct ice_aqc_get_port_options get_port_options; + struct ice_aqc_set_port_option set_port_option; struct ice_aqc_get_sw_cfg get_sw_conf; + struct ice_aqc_set_port_params set_port_params; struct ice_aqc_sw_rules sw_rules; + struct ice_aqc_storm_cfg storm_conf; + struct ice_aqc_add_get_recipe add_get_recipe; + struct ice_aqc_recipe_to_profile recipe_to_profile; struct ice_aqc_get_topo get_topo; struct ice_aqc_sched_elem_cmd sched_elem_cmd; struct ice_aqc_query_txsched_res query_sched_res; + struct ice_aqc_query_node_to_root query_node_to_root; + struct ice_aqc_cfg_l2_node_cgd cfg_l2_node_cgd; struct ice_aqc_query_port_ets port_ets; + struct ice_aqc_rl_profile rl_profile; struct ice_aqc_nvm nvm; + struct ice_aqc_nvm_cfg nvm_cfg; struct ice_aqc_nvm_checksum nvm_checksum; + struct ice_aqc_nvm_pkg_data pkg_data; + struct ice_aqc_nvm_pass_comp_tbl pass_comp_tbl; struct ice_aqc_pf_vf_msg virt; 
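The unified FW logging interface (0xFF30 through 0xFF35) replaces the old 0xFF09/0xFF10/0xFF11 commands: the indirect buffer is just an array of (module_identifier, log_level) pairs and ops.cfg.mdl_cnt tells firmware how many entries follow. A minimal sketch of filling that buffer for Set FW Logging Configuration, assuming the same entry layout is used for the set buffer as for the query response described above; sketch_fill_fwlog_cfg() and the chosen log level are made up for illustration.

/* Illustrative only: enable AQ-based logging for the first 'num_modules'
 * module IDs from enum ice_aqc_fw_logging_mod at an arbitrary verbosity.
 */
static void
sketch_fill_fwlog_cfg(struct ice_aqc_fw_log *cmd,
		      struct ice_aqc_fw_log_cfg_resp *buf, u16 num_modules)
{
	u16 i;

	for (i = 0; i < num_modules; i++) {
		buf[i].module_identifier = cpu_to_le16(i);
		buf[i].log_level = 1;		/* made-up verbosity */
	}

	cmd->cmd_flags = ICE_AQC_FW_LOG_CONF_SET_VALID |
			 ICE_AQC_FW_LOG_CONF_AQ_EN;
	cmd->ops.cfg.log_resolution = cpu_to_le16(ICE_AQC_FW_LOG_MIN_RESOLUTION);
	cmd->ops.cfg.mdl_cnt = cpu_to_le16(num_modules);
}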
+ struct ice_aqc_pfc_ignore pfc_ignore; + struct ice_aqc_set_query_pfc_mode set_query_pfc_mode; + struct ice_aqc_set_dcb_params set_dcb_params; struct ice_aqc_lldp_get_mib lldp_get_mib; struct ice_aqc_lldp_set_mib_change lldp_set_event; + struct ice_aqc_lldp_add_delete_tlv lldp_add_delete_tlv; + struct ice_aqc_lldp_update_tlv lldp_update_tlv; struct ice_aqc_lldp_stop lldp_stop; struct ice_aqc_lldp_start lldp_start; struct ice_aqc_lldp_set_local_mib lldp_set_mib; struct ice_aqc_lldp_stop_start_specific_agent lldp_agent_ctrl; + struct ice_aqc_lldp_filter_ctrl lldp_filter_ctrl; struct ice_aqc_get_set_rss_lut get_set_rss_lut; struct ice_aqc_get_set_rss_key get_set_rss_key; + struct ice_aqc_clear_fd_table clear_fd_table; + struct ice_aqc_neigh_dev_req neigh_dev; + struct ice_aqc_acl_alloc_table alloc_table; + struct ice_aqc_acl_tbl_actpair tbl_actpair; + struct ice_aqc_acl_alloc_scen alloc_scen; + struct ice_aqc_acl_dealloc_scen dealloc_scen; + struct ice_aqc_acl_update_query_scen update_query_scen; + struct ice_aqc_acl_alloc_counters alloc_counters; + struct ice_aqc_acl_dealloc_counters dealloc_counters; + struct ice_aqc_acl_dealloc_res dealloc_res; + struct ice_aqc_acl_entry program_query_entry; + struct ice_aqc_acl_actpair program_query_actpair; + struct ice_aqc_acl_profile profile; + struct ice_aqc_acl_query_counter query_counter; struct ice_aqc_add_txqs add_txqs; struct ice_aqc_dis_txqs dis_txqs; + struct ice_aqc_move_txqs move_txqs; + struct ice_aqc_add_rdma_qset add_rdma_qset; + struct ice_aqc_txqs_cleanup txqs_cleanup; struct ice_aqc_add_get_update_free_vsi vsi_cmd; struct ice_aqc_add_update_free_vsi_resp add_update_free_vsi_res; - struct ice_aqc_fw_logging fw_logging; - struct ice_aqc_get_clear_fw_log get_clear_fw_log; + struct ice_aqc_get_vsi_resp get_vsi_resp; struct ice_aqc_download_pkg download_pkg; + struct ice_aqc_get_pkg_info_list get_pkg_info_list; + struct ice_aqc_driver_shared_params drv_shared_params; + struct ice_aqc_fw_log fw_log; struct ice_aqc_set_mac_lb set_mac_lb; struct ice_aqc_alloc_free_res_cmd sw_res_ctrl; + struct ice_aqc_get_res_alloc get_res; + struct ice_aqc_get_allocd_res_desc get_res_desc; + struct ice_aqc_set_mac_cfg set_mac_cfg; struct ice_aqc_set_event_mask set_event_mask; struct ice_aqc_get_link_status get_link_status; + struct ice_aqc_event_lan_overflow lan_overflow; + struct ice_aqc_get_link_topo get_link_topo; + struct ice_aqc_get_link_topo_pin get_link_topo_pin; + struct ice_aqc_set_health_status_config + set_health_status_config; + struct ice_aqc_get_supported_health_status_codes + get_supported_health_status_codes; + struct ice_aqc_get_health_status get_health_status; + struct ice_aqc_clear_health_status clear_health_status; + struct ice_aqc_prog_topo_dev_nvm prog_topo_dev_nvm; + struct ice_aqc_read_topo_dev_nvm read_topo_dev_nvm; } params; }; + /* FW defined boundary for a large buffer, 4k >= Large buffer > 512 bytes */ #define ICE_AQ_LG_BUF 512 +/* Flags sub-structure + * |0 |1 |2 |3 |4 |5 |6 |7 |8 |9 |10 |11 |12 |13 |14 |15 | + * |DD |CMP|ERR|VFE| * * RESERVED * * |LB |RD |VFC|BUF|SI |EI |FE | + */ + +/* command flags and offsets */ +#define ICE_AQ_FLAG_DD_S 0 +#define ICE_AQ_FLAG_CMP_S 1 #define ICE_AQ_FLAG_ERR_S 2 +#define ICE_AQ_FLAG_VFE_S 3 #define ICE_AQ_FLAG_LB_S 9 #define ICE_AQ_FLAG_RD_S 10 +#define ICE_AQ_FLAG_VFC_S 11 #define ICE_AQ_FLAG_BUF_S 12 #define ICE_AQ_FLAG_SI_S 13 +#define ICE_AQ_FLAG_EI_S 14 +#define ICE_AQ_FLAG_FE_S 15 +#define ICE_AQ_FLAG_DD BIT(ICE_AQ_FLAG_DD_S) /* 0x1 */ +#define ICE_AQ_FLAG_CMP BIT(ICE_AQ_FLAG_CMP_S) /* 
0x2 */ #define ICE_AQ_FLAG_ERR BIT(ICE_AQ_FLAG_ERR_S) /* 0x4 */ +#define ICE_AQ_FLAG_VFE BIT(ICE_AQ_FLAG_VFE_S) /* 0x8 */ #define ICE_AQ_FLAG_LB BIT(ICE_AQ_FLAG_LB_S) /* 0x200 */ #define ICE_AQ_FLAG_RD BIT(ICE_AQ_FLAG_RD_S) /* 0x400 */ +#define ICE_AQ_FLAG_VFC BIT(ICE_AQ_FLAG_VFC_S) /* 0x800 */ #define ICE_AQ_FLAG_BUF BIT(ICE_AQ_FLAG_BUF_S) /* 0x1000 */ #define ICE_AQ_FLAG_SI BIT(ICE_AQ_FLAG_SI_S) /* 0x2000 */ +#define ICE_AQ_FLAG_EI BIT(ICE_AQ_FLAG_EI_S) /* 0x4000 */ +#define ICE_AQ_FLAG_FE BIT(ICE_AQ_FLAG_FE_S) /* 0x8000 */ /* error codes */ enum ice_aq_err { ICE_AQ_RC_OK = 0, /* Success */ ICE_AQ_RC_EPERM = 1, /* Operation not permitted */ ICE_AQ_RC_ENOENT = 2, /* No such element */ + ICE_AQ_RC_ESRCH = 3, /* Bad opcode */ + ICE_AQ_RC_EINTR = 4, /* Operation interrupted */ + ICE_AQ_RC_EIO = 5, /* I/O error */ + ICE_AQ_RC_ENXIO = 6, /* No such resource */ + ICE_AQ_RC_E2BIG = 7, /* Arg too long */ + ICE_AQ_RC_EAGAIN = 8, /* Try again */ ICE_AQ_RC_ENOMEM = 9, /* Out of memory */ + ICE_AQ_RC_EACCES = 10, /* Permission denied */ + ICE_AQ_RC_EFAULT = 11, /* Bad address */ ICE_AQ_RC_EBUSY = 12, /* Device or resource busy */ ICE_AQ_RC_EEXIST = 13, /* Object already exists */ + ICE_AQ_RC_EINVAL = 14, /* Invalid argument */ + ICE_AQ_RC_ENOTTY = 15, /* Not a typewriter */ ICE_AQ_RC_ENOSPC = 16, /* No space left or allocation failure */ ICE_AQ_RC_ENOSYS = 17, /* Function not implemented */ + ICE_AQ_RC_ERANGE = 18, /* Parameter out of range */ + ICE_AQ_RC_EFLUSHED = 19, /* Cmd flushed due to prev cmd error */ + ICE_AQ_RC_BAD_ADDR = 20, /* Descriptor contains a bad pointer */ + ICE_AQ_RC_EMODE = 21, /* Op not allowed in current dev mode */ + ICE_AQ_RC_EFBIG = 22, /* File too big */ + ICE_AQ_RC_ESBCOMP = 23, /* SB-IOSF completion unsuccessful */ ICE_AQ_RC_ENOSEC = 24, /* Missing security manifest */ ICE_AQ_RC_EBADSIG = 25, /* Bad RSA signature */ ICE_AQ_RC_ESVN = 26, /* SVN number prohibits this package */ ICE_AQ_RC_EBADMAN = 27, /* Manifest hash mismatch */ ICE_AQ_RC_EBADBUF = 28, /* Buffer hash mismatches manifest */ + ICE_AQ_RC_EACCES_BMCU = 29, /* BMC Update in progress */ }; /* Admin Queue command opcodes */ @@ -1688,6 +3816,7 @@ enum ice_adminq_opc { ice_aqc_opc_get_ver = 0x0001, ice_aqc_opc_driver_ver = 0x0002, ice_aqc_opc_q_shutdown = 0x0003, + ice_aqc_opc_get_exp_err = 0x0005, /* resource ownership */ ice_aqc_opc_req_res = 0x0008, @@ -1704,77 +3833,200 @@ enum ice_adminq_opc { /* PXE */ ice_aqc_opc_clear_pxe_mode = 0x0110, + ice_aqc_opc_config_no_drop_policy = 0x0112, + /* internal switch commands */ ice_aqc_opc_get_sw_cfg = 0x0200, + ice_aqc_opc_set_port_params = 0x0203, /* Alloc/Free/Get Resources */ + ice_aqc_opc_get_res_alloc = 0x0204, ice_aqc_opc_alloc_res = 0x0208, ice_aqc_opc_free_res = 0x0209, + ice_aqc_opc_get_allocd_res_desc = 0x020A, + ice_aqc_opc_set_vlan_mode_parameters = 0x020C, + ice_aqc_opc_get_vlan_mode_parameters = 0x020D, /* VSI commands */ ice_aqc_opc_add_vsi = 0x0210, ice_aqc_opc_update_vsi = 0x0211, + ice_aqc_opc_get_vsi_params = 0x0212, ice_aqc_opc_free_vsi = 0x0213, + /* Mirroring rules - add/update, delete */ + ice_aqc_opc_add_update_mir_rule = 0x0260, + ice_aqc_opc_del_mir_rule = 0x0261, + + /* storm configuration */ + ice_aqc_opc_set_storm_cfg = 0x0280, + ice_aqc_opc_get_storm_cfg = 0x0281, + + /* recipe commands */ + ice_aqc_opc_add_recipe = 0x0290, + ice_aqc_opc_recipe_to_profile = 0x0291, + ice_aqc_opc_get_recipe = 0x0292, + ice_aqc_opc_get_recipe_to_profile = 0x0293, + /* switch rules population commands */ ice_aqc_opc_add_sw_rules = 0x02A0, 
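These flag bits are what the control-queue code sets in ice_aq_desc.flags before posting a command. A minimal sketch of marking a command as indirect, assuming the little-endian descriptor fields documented above; ice_mark_indirect_cmd() is a hypothetical helper, the real send path lives in ice_controlq.c and is not part of this hunk.

/* Illustrative only: BUF says a data buffer is attached, RD says the buffer
 * carries data for firmware (rather than space for a response), and LB is
 * set for buffers larger than the 512-byte boundary noted above.
 */
static void
ice_mark_indirect_cmd(struct ice_aq_desc *desc, u16 buf_len, bool buf_has_data)
{
	u16 flags = le16_to_cpu(desc->flags) | ICE_AQ_FLAG_BUF;

	if (buf_has_data)
		flags |= ICE_AQ_FLAG_RD;
	if (buf_len > ICE_AQ_LG_BUF)
		flags |= ICE_AQ_FLAG_LB;

	desc->flags = cpu_to_le16(flags);
	desc->datalen = cpu_to_le16(buf_len);
}

On completion, a set ICE_AQ_FLAG_ERR together with the retval field (interpreted through enum ice_aq_err above) reports why firmware rejected the command.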
ice_aqc_opc_update_sw_rules = 0x02A1, ice_aqc_opc_remove_sw_rules = 0x02A2, - + ice_aqc_opc_get_sw_rules = 0x02A3, ice_aqc_opc_clear_pf_cfg = 0x02A4, + /* DCB commands */ + ice_aqc_opc_pfc_ignore = 0x0301, + ice_aqc_opc_query_pfc_mode = 0x0302, + ice_aqc_opc_set_pfc_mode = 0x0303, + ice_aqc_opc_set_dcb_params = 0x0306, + /* transmit scheduler commands */ ice_aqc_opc_get_dflt_topo = 0x0400, ice_aqc_opc_add_sched_elems = 0x0401, + ice_aqc_opc_cfg_sched_elems = 0x0403, ice_aqc_opc_get_sched_elems = 0x0404, + ice_aqc_opc_move_sched_elems = 0x0408, ice_aqc_opc_suspend_sched_elems = 0x0409, ice_aqc_opc_resume_sched_elems = 0x040A, ice_aqc_opc_query_port_ets = 0x040E, ice_aqc_opc_delete_sched_elems = 0x040F, + ice_aqc_opc_add_rl_profiles = 0x0410, + ice_aqc_opc_query_rl_profiles = 0x0411, ice_aqc_opc_query_sched_res = 0x0412, + ice_aqc_opc_query_node_to_root = 0x0413, + ice_aqc_opc_cfg_l2_node_cgd = 0x0414, + ice_aqc_opc_remove_rl_profiles = 0x0415, /* PHY commands */ ice_aqc_opc_get_phy_caps = 0x0600, ice_aqc_opc_set_phy_cfg = 0x0601, + ice_aqc_opc_set_mac_cfg = 0x0603, ice_aqc_opc_restart_an = 0x0605, ice_aqc_opc_get_link_status = 0x0607, ice_aqc_opc_set_event_mask = 0x0613, ice_aqc_opc_set_mac_lb = 0x0620, + ice_aqc_opc_dnl_get_status = 0x0680, + ice_aqc_opc_dnl_run = 0x0681, + ice_aqc_opc_dnl_call = 0x0682, + ice_aqc_opc_dnl_read_sto = 0x0683, + ice_aqc_opc_dnl_write_sto = 0x0684, + ice_aqc_opc_dnl_set_breakpoints = 0x0686, + ice_aqc_opc_dnl_read_log = 0x0687, + ice_aqc_opc_get_link_topo = 0x06E0, + ice_aqc_opc_get_link_topo_pin = 0x06E1, + ice_aqc_opc_read_i2c = 0x06E2, + ice_aqc_opc_write_i2c = 0x06E3, + ice_aqc_opc_read_mdio = 0x06E4, + ice_aqc_opc_write_mdio = 0x06E5, + ice_aqc_opc_set_gpio_by_func = 0x06E6, + ice_aqc_opc_get_gpio_by_func = 0x06E7, + ice_aqc_opc_set_led = 0x06E8, ice_aqc_opc_set_port_id_led = 0x06E9, + ice_aqc_opc_get_port_options = 0x06EA, + ice_aqc_opc_set_port_option = 0x06EB, + ice_aqc_opc_set_gpio = 0x06EC, + ice_aqc_opc_get_gpio = 0x06ED, + ice_aqc_opc_sff_eeprom = 0x06EE, + ice_aqc_opc_sw_set_gpio = 0x06EF, + ice_aqc_opc_sw_get_gpio = 0x06F0, + ice_aqc_opc_prog_topo_dev_nvm = 0x06F2, + ice_aqc_opc_read_topo_dev_nvm = 0x06F3, /* NVM commands */ ice_aqc_opc_nvm_read = 0x0701, + ice_aqc_opc_nvm_erase = 0x0702, + ice_aqc_opc_nvm_write = 0x0703, + ice_aqc_opc_nvm_cfg_read = 0x0704, + ice_aqc_opc_nvm_cfg_write = 0x0705, ice_aqc_opc_nvm_checksum = 0x0706, + ice_aqc_opc_nvm_write_activate = 0x0707, + ice_aqc_opc_nvm_sr_dump = 0x0707, + ice_aqc_opc_nvm_save_factory_settings = 0x0708, + ice_aqc_opc_nvm_update_empr = 0x0709, + ice_aqc_opc_nvm_pkg_data = 0x070A, + ice_aqc_opc_nvm_pass_component_tbl = 0x070B, /* PF/VF mailbox commands */ ice_mbx_opc_send_msg_to_pf = 0x0801, ice_mbx_opc_send_msg_to_vf = 0x0802, + /* Peer driver communication mailbox commands */ + ice_mbx_opc_send_to_peer_pf = 0x0803, + ice_mbx_opc_send_to_peer_drv = 0x0804, /* LLDP commands */ ice_aqc_opc_lldp_get_mib = 0x0A00, ice_aqc_opc_lldp_set_mib_change = 0x0A01, + ice_aqc_opc_lldp_add_tlv = 0x0A02, + ice_aqc_opc_lldp_update_tlv = 0x0A03, + ice_aqc_opc_lldp_delete_tlv = 0x0A04, ice_aqc_opc_lldp_stop = 0x0A05, ice_aqc_opc_lldp_start = 0x0A06, ice_aqc_opc_get_cee_dcb_cfg = 0x0A07, ice_aqc_opc_lldp_set_local_mib = 0x0A08, ice_aqc_opc_lldp_stop_start_specific_agent = 0x0A09, + ice_aqc_opc_lldp_filter_ctrl = 0x0A0A, /* RSS commands */ ice_aqc_opc_set_rss_key = 0x0B02, ice_aqc_opc_set_rss_lut = 0x0B03, ice_aqc_opc_get_rss_key = 0x0B04, ice_aqc_opc_get_rss_lut = 0x0B05, + ice_aqc_opc_clear_fd_table = 0x0B06, + /* 
Sideband Control Interface commands */ + ice_aqc_opc_neighbour_device_request = 0x0C00, + /* ACL commands */ + ice_aqc_opc_alloc_acl_tbl = 0x0C10, + ice_aqc_opc_dealloc_acl_tbl = 0x0C11, + ice_aqc_opc_alloc_acl_actpair = 0x0C12, + ice_aqc_opc_dealloc_acl_actpair = 0x0C13, + ice_aqc_opc_alloc_acl_scen = 0x0C14, + ice_aqc_opc_dealloc_acl_scen = 0x0C15, + ice_aqc_opc_alloc_acl_counters = 0x0C16, + ice_aqc_opc_dealloc_acl_counters = 0x0C17, + ice_aqc_opc_dealloc_acl_res = 0x0C1A, + ice_aqc_opc_update_acl_scen = 0x0C1B, + ice_aqc_opc_program_acl_actpair = 0x0C1C, + ice_aqc_opc_program_acl_prof_extraction = 0x0C1D, + ice_aqc_opc_program_acl_prof_ranges = 0x0C1E, + ice_aqc_opc_program_acl_entry = 0x0C20, + ice_aqc_opc_query_acl_prof = 0x0C21, + ice_aqc_opc_query_acl_prof_ranges = 0x0C22, + ice_aqc_opc_query_acl_scen = 0x0C23, + ice_aqc_opc_query_acl_entry = 0x0C24, + ice_aqc_opc_query_acl_actpair = 0x0C25, + ice_aqc_opc_query_acl_counter = 0x0C27, /* Tx queue handling commands/events */ ice_aqc_opc_add_txqs = 0x0C30, ice_aqc_opc_dis_txqs = 0x0C31, + ice_aqc_opc_txqs_cleanup = 0x0C31, + ice_aqc_opc_move_recfg_txqs = 0x0C32, + ice_aqc_opc_add_rdma_qset = 0x0C33, + ice_aqc_opc_move_rdma_qset = 0x0C34, /* package commands */ ice_aqc_opc_download_pkg = 0x0C40, + ice_aqc_opc_upload_section = 0x0C41, + ice_aqc_opc_update_pkg = 0x0C42, ice_aqc_opc_get_pkg_info_list = 0x0C43, - /* debug commands */ - ice_aqc_opc_fw_logging = 0xFF09, - ice_aqc_opc_fw_logging_info = 0xFF10, + + ice_aqc_opc_driver_shared_params = 0x0C90, + + /* Standalone Commands/Events */ + ice_aqc_opc_event_lan_overflow = 0x1001, + /* SystemDiagnostic commands */ + ice_aqc_opc_set_health_status_config = 0xFF20, + ice_aqc_opc_get_supported_health_status_codes = 0xFF21, + ice_aqc_opc_get_health_status = 0xFF22, + ice_aqc_opc_clear_health_status = 0xFF23, + + /* FW Logging Commands */ + ice_aqc_opc_fw_logs_config = 0xFF30, + ice_aqc_opc_fw_logs_register = 0xFF31, + ice_aqc_opc_fw_logs_query = 0xFF32, + ice_aqc_opc_fw_logs_event = 0xFF33, + ice_aqc_opc_fw_logs_get = 0xFF34, + ice_aqc_opc_fw_logs_clear = 0xFF35 }; #endif /* _ICE_ADMINQ_CMD_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_arfs.c b/drivers/net/ethernet/intel/ice/ice_arfs.c new file mode 100644 index 000000000000..b059cd91a7e0 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_arfs.c @@ -0,0 +1,701 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice.h" + +/** + * ice_is_arfs_active - helper to check is aRFS is active + * @vsi: VSI to check + */ +static bool ice_is_arfs_active(struct ice_vsi *vsi) +{ + return !!vsi->arfs_fltr_list; +} + +/** + * ice_is_arfs_using_perfect_flow - check if aRFS has active perfect filters + * @hw: pointer to the HW structure + * @flow_type: flow type as Flow Director understands it + * + * Flow Director will query this function to see if aRFS is currently using + * the specified flow_type for perfect (4-tuple) filters. 
+ */ +bool +ice_is_arfs_using_perfect_flow(struct ice_hw *hw, enum ice_fltr_ptype flow_type) +{ + struct ice_arfs_active_fltr_cntrs *arfs_fltr_cntrs; + struct ice_pf *pf = hw->back; + struct ice_vsi *vsi; + + vsi = ice_get_main_vsi(pf); + if (!vsi) + return false; + + arfs_fltr_cntrs = vsi->arfs_fltr_cntrs; + + /* active counters can be updated by multiple CPUs */ + smp_mb__before_atomic(); + switch (flow_type) { + case ICE_FLTR_PTYPE_NONF_IPV4_UDP: + return atomic_read(&arfs_fltr_cntrs->active_udpv4_cnt) > 0; + case ICE_FLTR_PTYPE_NONF_IPV6_UDP: + return atomic_read(&arfs_fltr_cntrs->active_udpv6_cnt) > 0; + case ICE_FLTR_PTYPE_NONF_IPV4_TCP: + return atomic_read(&arfs_fltr_cntrs->active_tcpv4_cnt) > 0; + case ICE_FLTR_PTYPE_NONF_IPV6_TCP: + return atomic_read(&arfs_fltr_cntrs->active_tcpv6_cnt) > 0; + default: + return false; + } +} + +/** + * ice_arfs_update_active_fltr_cntrs - update active filter counters for aRFS + * @vsi: VSI that aRFS is active on + * @entry: aRFS entry used to change counters + * @add: true to increment counter, false to decrement + */ +static void +ice_arfs_update_active_fltr_cntrs(struct ice_vsi *vsi, + struct ice_arfs_entry *entry, bool add) +{ + struct ice_arfs_active_fltr_cntrs *fltr_cntrs = vsi->arfs_fltr_cntrs; + + switch (entry->fltr_info.flow_type) { + case ICE_FLTR_PTYPE_NONF_IPV4_TCP: + if (add) + atomic_inc(&fltr_cntrs->active_tcpv4_cnt); + else + atomic_dec(&fltr_cntrs->active_tcpv4_cnt); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_TCP: + if (add) + atomic_inc(&fltr_cntrs->active_tcpv6_cnt); + else + atomic_dec(&fltr_cntrs->active_tcpv6_cnt); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_UDP: + if (add) + atomic_inc(&fltr_cntrs->active_udpv4_cnt); + else + atomic_dec(&fltr_cntrs->active_udpv4_cnt); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_UDP: + if (add) + atomic_inc(&fltr_cntrs->active_udpv6_cnt); + else + atomic_dec(&fltr_cntrs->active_udpv6_cnt); + break; + default: + dev_err(ice_pf_to_dev(vsi->back), "aRFS: Failed to update filter counters, invalid filter type %d\n", + entry->fltr_info.flow_type); + } +} + +/** + * ice_arfs_del_flow_rules - delete the rules passed in from HW + * @vsi: VSI for the flow rules that need to be deleted + * @del_list_head: head of the list of ice_arfs_entry(s) for rule deletion + * + * Loop through the delete list passed in and remove the rules from HW. After + * each rule is deleted, disconnect and free the ice_arfs_entry because it is no + * longer being referenced by the aRFS hash table. + */ +static void +ice_arfs_del_flow_rules(struct ice_vsi *vsi, struct hlist_head *del_list_head) +{ + struct ice_arfs_entry *e; + struct hlist_node *n; + struct device *dev; + + dev = ice_pf_to_dev(vsi->back); + + hlist_for_each_entry_safe(e, n, del_list_head, list_entry) { + int result; + + result = ice_fdir_write_fltr(vsi->back, &e->fltr_info, false, + false); + if (!result) + ice_arfs_update_active_fltr_cntrs(vsi, e, false); + else + dev_dbg(dev, "Unable to delete aRFS entry, err %d fltr_state %d fltr_id %d flow_id %d Q %d\n", + result, e->fltr_state, e->fltr_info.fltr_id, + e->flow_id, e->fltr_info.q_index); + + /* The aRFS hash table is no longer referencing this entry */ + hlist_del(&e->list_entry); + devm_kfree(dev, e); + } +} + +/** + * ice_arfs_add_flow_rules - add the rules passed in from HW + * @vsi: VSI for the flow rules that need to be added + * @add_list_head: head of the list of ice_arfs_entry_ptr(s) for rule addition + * + * Loop through the add list passed in and remove the rules from HW. 
After each + * rule is added, disconnect and free the ice_arfs_entry_ptr node. Don't free + * the ice_arfs_entry(s) because they are still being referenced in the aRFS + * hash table. + */ +static void +ice_arfs_add_flow_rules(struct ice_vsi *vsi, struct hlist_head *add_list_head) +{ + struct ice_arfs_entry_ptr *ep; + struct hlist_node *n; + struct device *dev; + + dev = ice_pf_to_dev(vsi->back); + + hlist_for_each_entry_safe(ep, n, add_list_head, list_entry) { + int result; + + result = ice_fdir_write_fltr(vsi->back, + &ep->arfs_entry->fltr_info, true, + false); + if (!result) + ice_arfs_update_active_fltr_cntrs(vsi, ep->arfs_entry, + true); + else + dev_dbg(dev, "Unable to add aRFS entry, err %d fltr_state %d fltr_id %d flow_id %d Q %d\n", + result, ep->arfs_entry->fltr_state, + ep->arfs_entry->fltr_info.fltr_id, + ep->arfs_entry->flow_id, + ep->arfs_entry->fltr_info.q_index); + + hlist_del(&ep->list_entry); + devm_kfree(dev, ep); + } +} + +/** + * ice_arfs_is_flow_expired - check if the aRFS entry has expired + * @vsi: VSI containing the aRFS entry + * @arfs_entry: aRFS entry that's being checked for expiration + * + * Return true if the flow has expired, else false. This function should be used + * to determine whether or not an aRFS entry should be removed from the hardware + * and software structures. + */ +static bool +ice_arfs_is_flow_expired(struct ice_vsi *vsi, struct ice_arfs_entry *arfs_entry) +{ +#define ICE_ARFS_TIME_DELTA_EXPIRATION msecs_to_jiffies(5000) + if (rps_may_expire_flow(vsi->netdev, arfs_entry->fltr_info.q_index, + arfs_entry->flow_id, + arfs_entry->fltr_info.fltr_id)) + return true; + + /* expiration timer only used for UDP filters */ + if (arfs_entry->fltr_info.flow_type != ICE_FLTR_PTYPE_NONF_IPV4_UDP && + arfs_entry->fltr_info.flow_type != ICE_FLTR_PTYPE_NONF_IPV6_UDP) + return false; + + return time_in_range64(arfs_entry->time_activated + + ICE_ARFS_TIME_DELTA_EXPIRATION, + arfs_entry->time_activated, get_jiffies_64()); +} + +/** + * ice_arfs_update_flow_rules - add/delete aRFS rules in HW + * @vsi: the VSI to be forwarded to + * @idx: index into the table of aRFS filter lists. Obtained from skb->hash + * @add_list: list to populate with filters to be added to Flow Director + * @del_list: list to populate with filters to be deleted from Flow Director + * + * Iterate over the hlist at the index given in the aRFS hash table and + * determine if there are any aRFS entries that need to be either added or + * deleted in the HW. If the aRFS entry is marked as ICE_ARFS_INACTIVE the + * filter needs to be added to HW, else if it's marked as ICE_ARFS_ACTIVE and + * the flow has expired delete the filter from HW. The caller of this function + * is expected to add/delete rules on the add_list/del_list respectively. 
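The time_in_range64() test in ice_arfs_is_flow_expired() above is a wrap-safe way of asking whether the 5-second UDP window has elapsed since the filter went active. An equivalent formulation, shown only for clarity and assuming the same ICE_ARFS_TIME_DELTA_EXPIRATION window; sketch_arfs_udp_flow_aged() is not a real driver function.

/* Illustrative only: true once at least ICE_ARFS_TIME_DELTA_EXPIRATION
 * jiffies have passed since the UDP filter was activated.
 */
static bool sketch_arfs_udp_flow_aged(u64 time_activated)
{
	return time_after_eq64(get_jiffies_64(),
			       time_activated + ICE_ARFS_TIME_DELTA_EXPIRATION);
}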
+ */ +static void +ice_arfs_update_flow_rules(struct ice_vsi *vsi, u16 idx, + struct hlist_head *add_list, + struct hlist_head *del_list) +{ + struct ice_arfs_entry *e; + struct hlist_node *n; + struct device *dev; + + dev = ice_pf_to_dev(vsi->back); + + /* go through the aRFS hlist at this idx and check for needed updates */ + hlist_for_each_entry_safe(e, n, &vsi->arfs_fltr_list[idx], list_entry) + /* check if filter needs to be added to HW */ + if (e->fltr_state == ICE_ARFS_INACTIVE) { + enum ice_fltr_ptype flow_type = e->fltr_info.flow_type; + struct ice_arfs_entry_ptr *ep = + devm_kzalloc(dev, sizeof(*ep), GFP_ATOMIC); + + if (!ep) + continue; + INIT_HLIST_NODE(&ep->list_entry); + /* reference aRFS entry to add HW filter */ + ep->arfs_entry = e; + hlist_add_head(&ep->list_entry, add_list); + e->fltr_state = ICE_ARFS_ACTIVE; + /* expiration timer only used for UDP flows */ + if (flow_type == ICE_FLTR_PTYPE_NONF_IPV4_UDP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV6_UDP) + e->time_activated = get_jiffies_64(); + } else if (e->fltr_state == ICE_ARFS_ACTIVE) { + /* check if filter needs to be removed from HW */ + if (ice_arfs_is_flow_expired(vsi, e)) { + /* remove aRFS entry from hash table for delete + * and to prevent referencing it the next time + * through this hlist index + */ + hlist_del(&e->list_entry); + e->fltr_state = ICE_ARFS_TODEL; + /* save reference to aRFS entry for delete */ + hlist_add_head(&e->list_entry, del_list); + } + } +} + +/** + * ice_sync_arfs_fltrs - update all aRFS filters + * @pf: board private structure + */ +void ice_sync_arfs_fltrs(struct ice_pf *pf) +{ + HLIST_HEAD(tmp_del_list); + HLIST_HEAD(tmp_add_list); + struct ice_vsi *pf_vsi; + unsigned int i; + + pf_vsi = ice_get_main_vsi(pf); + if (!pf_vsi) + return; + + if (!ice_is_arfs_active(pf_vsi)) + return; + + spin_lock_bh(&pf_vsi->arfs_lock); + /* Once we process aRFS for the PF VSI get out */ + for (i = 0; i < ICE_MAX_ARFS_LIST; i++) + ice_arfs_update_flow_rules(pf_vsi, i, &tmp_add_list, + &tmp_del_list); + spin_unlock_bh(&pf_vsi->arfs_lock); + + /* use list of ice_arfs_entry(s) for delete */ + ice_arfs_del_flow_rules(pf_vsi, &tmp_del_list); + + /* use list of ice_arfs_entry_ptr(s) for add */ + ice_arfs_add_flow_rules(pf_vsi, &tmp_add_list); +} + +#ifdef ICE_ADD_PROBES +static u16 +ice_arfs_get_cnt_index(struct ice_pf *pf, struct ice_arfs_entry *entry) +{ + struct ice_hw *hw = &pf->hw; + + switch (entry->fltr_info.flow_type) { + case ICE_FLTR_PTYPE_NONF_IPV4_TCP: + return ICE_ARFS_STAT_TCPV4_IDX(hw->fd_ctr_base); + case ICE_FLTR_PTYPE_NONF_IPV6_TCP: + return ICE_ARFS_STAT_TCPV6_IDX(hw->fd_ctr_base); + case ICE_FLTR_PTYPE_NONF_IPV4_UDP: + return ICE_ARFS_STAT_UDPV4_IDX(hw->fd_ctr_base); + case ICE_FLTR_PTYPE_NONF_IPV6_UDP: + return ICE_ARFS_STAT_UDPV6_IDX(hw->fd_ctr_base); + default: + dev_err(ice_pf_to_dev(pf), "aRFS: Invalid flow type %d\n", + entry->fltr_info.flow_type); + return ICE_FD_SB_STAT_IDX(hw->fd_ctr_base); + } +} +#endif /* ICE_ADD_PROBES */ + +/** + * ice_arfs_build_entry - builds an aRFS entry based on input + * @vsi: destination VSI for this flow + * @fk: flow dissector keys for creating the tuple + * @rxq_idx: Rx queue to steer this flow to + * @flow_id: passed down from the stack and saved for flow expiration + * + * returns an aRFS entry on success and NULL on failure + */ +static struct ice_arfs_entry * +ice_arfs_build_entry(struct ice_vsi *vsi, const struct flow_keys *fk, + u16 rxq_idx, u32 flow_id) +{ + struct ice_arfs_entry *arfs_entry; + struct ice_fdir_fltr *fltr_info; + u8 ip_proto; + 
+ arfs_entry = devm_kzalloc(ice_pf_to_dev(vsi->back), + sizeof(*arfs_entry), + GFP_ATOMIC | __GFP_NOWARN); + if (!arfs_entry) + return NULL; + + fltr_info = &arfs_entry->fltr_info; + fltr_info->q_index = rxq_idx; + fltr_info->dest_ctl = ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_QINDEX; + fltr_info->dest_vsi = vsi->idx; + fltr_info->fdid_prio = ICE_FXD_FLTR_QW1_FDID_PRI_THREE; + fltr_info->comp_report = ICE_FXD_FLTR_QW0_COMP_REPORT_SW_FAIL; + ip_proto = fk->basic.ip_proto; + + if (fk->basic.n_proto == htons(ETH_P_IP)) { + fltr_info->ip.v4.proto = ip_proto; + fltr_info->flow_type = (ip_proto == IPPROTO_TCP) ? + ICE_FLTR_PTYPE_NONF_IPV4_TCP : + ICE_FLTR_PTYPE_NONF_IPV4_UDP; + fltr_info->ip.v4.src_ip = fk->addrs.v4addrs.src; + fltr_info->ip.v4.dst_ip = fk->addrs.v4addrs.dst; + fltr_info->ip.v4.src_port = fk->ports.src; + fltr_info->ip.v4.dst_port = fk->ports.dst; + } else { /* ETH_P_IPV6 */ + fltr_info->ip.v6.proto = ip_proto; + fltr_info->flow_type = (ip_proto == IPPROTO_TCP) ? + ICE_FLTR_PTYPE_NONF_IPV6_TCP : + ICE_FLTR_PTYPE_NONF_IPV6_UDP; + memcpy(&fltr_info->ip.v6.src_ip, &fk->addrs.v6addrs.src, + sizeof(struct in6_addr)); + memcpy(&fltr_info->ip.v6.dst_ip, &fk->addrs.v6addrs.dst, + sizeof(struct in6_addr)); + fltr_info->ip.v6.src_port = fk->ports.src; + fltr_info->ip.v6.dst_port = fk->ports.dst; + } + +#ifdef ICE_ADD_PROBES + fltr_info->cnt_index = + ice_arfs_get_cnt_index(vsi->back, arfs_entry); + fltr_info->cnt_ena = ICE_FXD_FLTR_QW0_STAT_ENA_PKTS; +#endif /* ICE_ADD_PROBES */ + + arfs_entry->flow_id = flow_id; + fltr_info->fltr_id = + atomic_inc_return(vsi->arfs_last_fltr_id) % RPS_NO_FILTER; + + return arfs_entry; +} + +/** + * ice_arfs_is_perfect_flow_set - Check to see if perfect flow is set + * @hw: pointer to HW structure + * @l3_proto: ETH_P_IP or ETH_P_IPV6 in network order + * @l4_proto: IPPROTO_UDP or IPPROTO_TCP + * + * We only support perfect (4-tuple) filters for aRFS. This function allows aRFS + * to check if perfect (4-tuple) flow rules are currently in place by Flow + * Director. + */ +static bool +ice_arfs_is_perfect_flow_set(struct ice_hw *hw, __be16 l3_proto, u8 l4_proto) +{ + unsigned long *perfect_fltr = hw->fdir_perfect_fltr; + + /* advanced Flow Director disabled, perfect filters always supported */ + if (!perfect_fltr) + return true; + + if (l3_proto == htons(ETH_P_IP) && l4_proto == IPPROTO_UDP) + return test_bit(ICE_FLTR_PTYPE_NONF_IPV4_UDP, perfect_fltr); + else if (l3_proto == htons(ETH_P_IP) && l4_proto == IPPROTO_TCP) + return test_bit(ICE_FLTR_PTYPE_NONF_IPV4_TCP, perfect_fltr); + else if (l3_proto == htons(ETH_P_IPV6) && l4_proto == IPPROTO_UDP) + return test_bit(ICE_FLTR_PTYPE_NONF_IPV6_UDP, perfect_fltr); + else if (l3_proto == htons(ETH_P_IPV6) && l4_proto == IPPROTO_TCP) + return test_bit(ICE_FLTR_PTYPE_NONF_IPV6_TCP, perfect_fltr); + + return false; +} + +/** + * ice_rx_flow_steer - steer the Rx flow to where application is being run + * @netdev: ptr to the netdev being adjusted + * @skb: buffer with required header information + * @rxq_idx: queue to which the flow needs to move + * @flow_id: flow identifier provided by the netdev + * + * Based on the skb, rxq_idx, and flow_id passed in add/update an entry in the + * aRFS hash table. Iterate over one of the hlists in the aRFS hash table and + * if the flow_id already exists in the hash table but the rxq_idx has changed + * mark the entry as ICE_ARFS_INACTIVE so it can get updated in HW, else + * if the entry is marked as ICE_ARFS_TODEL delete it from the aRFS hash table. 
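ice_rx_flow_steer() is the callback the RFS core invokes when it wants a flow moved to the queue serving the consuming CPU. A minimal sketch of how such a callback is typically wired up, assuming CONFIG_RFS_ACCEL; the driver's real net_device_ops table lives in ice_main.c, outside this hunk, so the ops-struct name here is made up.

#ifdef CONFIG_RFS_ACCEL
/* Illustrative only: the stack reaches ice_rx_flow_steer() through the
 * ndo_rx_flow_steer hook, which only exists when CONFIG_RFS_ACCEL is set.
 */
static const struct net_device_ops ice_netdev_ops_sketch = {
	.ndo_rx_flow_steer	= ice_rx_flow_steer,
	/* remaining callbacks elided */
};
#endif /* CONFIG_RFS_ACCEL */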
+ * If neither of the previous conditions are true then add a new entry in the + * aRFS hash table, which gets set to ICE_ARFS_INACTIVE by default so it can be + * added to HW. + */ +int +ice_rx_flow_steer(struct net_device *netdev, const struct sk_buff *skb, + u16 rxq_idx, u32 flow_id) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_arfs_entry *arfs_entry; + struct ice_vsi *vsi = np->vsi; + struct flow_keys fk; + struct ice_pf *pf; + __be16 n_proto; + u8 ip_proto; + u16 idx; + int ret; + + /* failed to allocate memory for aRFS so don't crash */ + if (unlikely(!vsi->arfs_fltr_list)) + return -ENODEV; + + pf = vsi->back; + +#ifdef NETIF_F_HW_TC +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + /* aRFS only supported on Rx queues belonging to PF VSI */ + if (vsi->type == ICE_VSI_PF && ice_is_adq_active(pf) && + rxq_idx >= vsi->mqprio_qopt.qopt.count[0]) + return -EOPNOTSUPP; +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ +#endif /* NETIF_F_HW_TC */ + + if (skb->encapsulation) + return -EPROTONOSUPPORT; + + if (!skb_flow_dissect_flow_keys(skb, &fk, 0)) + return -EPROTONOSUPPORT; + + n_proto = fk.basic.n_proto; + /* Support only IPV4 and IPV6 */ + if ((n_proto == htons(ETH_P_IP) && !ip_is_fragment(ip_hdr(skb))) || + n_proto == htons(ETH_P_IPV6)) + ip_proto = fk.basic.ip_proto; + else + return -EPROTONOSUPPORT; + + /* Support only TCP and UDP */ + if (ip_proto != IPPROTO_TCP && ip_proto != IPPROTO_UDP) + return -EPROTONOSUPPORT; + + /* only support 4-tuple filters for aRFS */ + if (!ice_arfs_is_perfect_flow_set(&pf->hw, n_proto, ip_proto)) + return -EOPNOTSUPP; + + /* choose the aRFS list bucket based on skb hash */ + idx = skb_get_hash_raw(skb) & ICE_ARFS_LST_MASK; + /* search for entry in the bucket */ + spin_lock_bh(&vsi->arfs_lock); + hlist_for_each_entry(arfs_entry, &vsi->arfs_fltr_list[idx], + list_entry) { + struct ice_fdir_fltr *fltr_info; + + /* keep searching for the already existing arfs_entry flow */ + if (arfs_entry->flow_id != flow_id) + continue; + + fltr_info = &arfs_entry->fltr_info; + ret = fltr_info->fltr_id; + + if (fltr_info->q_index == rxq_idx || + arfs_entry->fltr_state != ICE_ARFS_ACTIVE) + goto out; + + /* update the queue to forward to on an already existing flow */ + fltr_info->q_index = rxq_idx; + arfs_entry->fltr_state = ICE_ARFS_INACTIVE; + ice_arfs_update_active_fltr_cntrs(vsi, arfs_entry, false); + goto out_schedule_service_task; + } + + arfs_entry = ice_arfs_build_entry(vsi, &fk, rxq_idx, flow_id); + if (!arfs_entry) { + ret = -ENOMEM; + goto out; + } + + ret = arfs_entry->fltr_info.fltr_id; + INIT_HLIST_NODE(&arfs_entry->list_entry); + hlist_add_head(&arfs_entry->list_entry, &vsi->arfs_fltr_list[idx]); +out_schedule_service_task: + ice_service_task_schedule(pf); +out: + spin_unlock_bh(&vsi->arfs_lock); + return ret; +} + +/** + * ice_init_arfs_cntrs - initialize aRFS counter values + * @vsi: VSI that aRFS counters need to be initialized on + */ +static int ice_init_arfs_cntrs(struct ice_vsi *vsi) +{ + if (!vsi || vsi->type != ICE_VSI_PF) + return -EINVAL; + + vsi->arfs_fltr_cntrs = kzalloc(sizeof(*vsi->arfs_fltr_cntrs), + GFP_KERNEL); + if (!vsi->arfs_fltr_cntrs) + return -ENOMEM; + + vsi->arfs_last_fltr_id = kzalloc(sizeof(*vsi->arfs_last_fltr_id), + GFP_KERNEL); + if (!vsi->arfs_last_fltr_id) { + kfree(vsi->arfs_fltr_cntrs); + vsi->arfs_fltr_cntrs = NULL; + return -ENOMEM; + } + + return 0; +} + +/** + * ice_init_arfs - initialize aRFS resources + * @vsi: the VSI to be forwarded to + */ +void ice_init_arfs(struct ice_vsi *vsi) +{ + struct 
hlist_head *arfs_fltr_list; + unsigned int i; + + if (!vsi || vsi->type != ICE_VSI_PF) + return; + + arfs_fltr_list = kzalloc(sizeof(*arfs_fltr_list) * ICE_MAX_ARFS_LIST, + GFP_KERNEL); + if (!arfs_fltr_list) + return; + + if (ice_init_arfs_cntrs(vsi)) + goto free_arfs_fltr_list; + + for (i = 0; i < ICE_MAX_ARFS_LIST; i++) + INIT_HLIST_HEAD(&arfs_fltr_list[i]); + + spin_lock_init(&vsi->arfs_lock); + + vsi->arfs_fltr_list = arfs_fltr_list; + + return; + +free_arfs_fltr_list: + kfree(arfs_fltr_list); +} + +/** + * ice_clear_arfs - clear the aRFS hash table and any memory used for aRFS + * @vsi: the VSI to be forwarded to + */ +void ice_clear_arfs(struct ice_vsi *vsi) +{ + struct device *dev; + unsigned int i; + + if (!vsi || vsi->type != ICE_VSI_PF || !vsi->back || + !vsi->arfs_fltr_list) + return; + + dev = ice_pf_to_dev(vsi->back); + for (i = 0; i < ICE_MAX_ARFS_LIST; i++) { + struct ice_arfs_entry *r; + struct hlist_node *n; + + spin_lock_bh(&vsi->arfs_lock); + hlist_for_each_entry_safe(r, n, &vsi->arfs_fltr_list[i], + list_entry) { + hlist_del(&r->list_entry); + devm_kfree(dev, r); + } + spin_unlock_bh(&vsi->arfs_lock); + } + + kfree(vsi->arfs_fltr_list); + vsi->arfs_fltr_list = NULL; + kfree(vsi->arfs_last_fltr_id); + vsi->arfs_last_fltr_id = NULL; + kfree(vsi->arfs_fltr_cntrs); + vsi->arfs_fltr_cntrs = NULL; +} + +/** + * ice_free_cpu_rx_rmap - free setup cpu reverse map + * @vsi: the VSI to be forwarded to + */ +static void ice_free_cpu_rx_rmap(struct ice_vsi *vsi) +{ + struct net_device *netdev; + + if (!vsi || vsi->type != ICE_VSI_PF || !vsi->arfs_fltr_list) + return; + + netdev = vsi->netdev; + if (!netdev || !netdev->rx_cpu_rmap) + return; + + free_irq_cpu_rmap(netdev->rx_cpu_rmap); + netdev->rx_cpu_rmap = NULL; +} + +/** + * ice_set_cpu_rx_rmap - setup cpu reverse map for each queue + * @vsi: the VSI to be forwarded to + */ +int ice_set_cpu_rx_rmap(struct ice_vsi *vsi) +{ + struct net_device *netdev; + struct ice_pf *pf; + int base_idx, i; + + if (!vsi || vsi->type != ICE_VSI_PF) + return -EINVAL; + + pf = vsi->back; + netdev = vsi->netdev; + if (!pf || !netdev || !vsi->num_q_vectors) + return -EINVAL; + + netdev_dbg(netdev, "Setup CPU RMAP: vsi type 0x%x, ifname %s, q_vectors %d\n", + vsi->type, netdev->name, vsi->num_q_vectors); + + netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(vsi->num_q_vectors); + if (unlikely(!netdev->rx_cpu_rmap)) + return -EINVAL; + + base_idx = vsi->base_vector; + for (i = 0; i < vsi->num_q_vectors; i++) + if (irq_cpu_rmap_add(netdev->rx_cpu_rmap, + pf->msix_entries[base_idx + i].vector)) { + ice_free_cpu_rx_rmap(vsi); + return -EINVAL; + } + + return 0; +} + +/** + * ice_remove_arfs - remove/clear all aRFS resources + * @pf: device private structure + */ +void ice_remove_arfs(struct ice_pf *pf) +{ + struct ice_vsi *pf_vsi; + + pf_vsi = ice_get_main_vsi(pf); + if (!pf_vsi) + return; + + ice_free_cpu_rx_rmap(pf_vsi); + ice_clear_arfs(pf_vsi); +} + +/** + * ice_rebuild_arfs - remove/clear all aRFS resources and rebuild after reset + * @pf: device private structure + */ +void ice_rebuild_arfs(struct ice_pf *pf) +{ + struct ice_vsi *pf_vsi; + + pf_vsi = ice_get_main_vsi(pf); + if (!pf_vsi) + return; + + ice_remove_arfs(pf); + if (ice_set_cpu_rx_rmap(pf_vsi)) { + dev_err(ice_pf_to_dev(pf), "Failed to rebuild aRFS\n"); + return; + } + ice_init_arfs(pf_vsi); +} diff --git a/drivers/net/ethernet/intel/ice/ice_arfs.h b/drivers/net/ethernet/intel/ice/ice_arfs.h new file mode 100644 index 000000000000..7b9346fa73c8 --- /dev/null +++ 
b/drivers/net/ethernet/intel/ice/ice_arfs.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_ARFS_H_ +#define _ICE_ARFS_H_ + +#include "ice.h" + +enum ice_arfs_fltr_state { + ICE_ARFS_INACTIVE, + ICE_ARFS_ACTIVE, + ICE_ARFS_TODEL, +}; + +struct ice_arfs_entry { + struct ice_fdir_fltr fltr_info; + struct hlist_node list_entry; + u64 time_activated; /* only valid for UDP flows */ + u32 flow_id; + /* fltr_state = 0 - ICE_ARFS_INACTIVE: + * filter needs to be updated or programmed in HW. + * fltr_state = 1 - ICE_ARFS_ACTIVE: + * filter is active and programmed in HW. + * fltr_state = 2 - ICE_ARFS_TODEL: + * filter has been deleted from HW and needs to be removed from + * the aRFS hash table. + */ + u8 fltr_state; +}; + +struct ice_arfs_entry_ptr { + struct ice_arfs_entry *arfs_entry; + struct hlist_node list_entry; +}; + +struct ice_arfs_active_fltr_cntrs { + atomic_t active_tcpv4_cnt; + atomic_t active_tcpv6_cnt; + atomic_t active_udpv4_cnt; + atomic_t active_udpv6_cnt; +}; + +#ifdef CONFIG_RFS_ACCEL +int +ice_rx_flow_steer(struct net_device *netdev, const struct sk_buff *skb, + u16 rxq_idx, u32 flow_id); +void ice_clear_arfs(struct ice_vsi *vsi); +void ice_init_arfs(struct ice_vsi *vsi); +void ice_sync_arfs_fltrs(struct ice_pf *pf); +int ice_set_cpu_rx_rmap(struct ice_vsi *vsi); +void ice_remove_arfs(struct ice_pf *pf); +void ice_rebuild_arfs(struct ice_pf *pf); +bool +ice_is_arfs_using_perfect_flow(struct ice_hw *hw, + enum ice_fltr_ptype flow_type); +#else +static inline void ice_clear_arfs(struct ice_vsi *vsi) { } +static inline void ice_init_arfs(struct ice_vsi *vsi) { } +static inline void ice_sync_arfs_fltrs(struct ice_pf *pf) { } +static inline void ice_remove_arfs(struct ice_pf *pf) { } +static inline void ice_rebuild_arfs(struct ice_pf *pf) { } + +static inline int ice_set_cpu_rx_rmap(struct ice_vsi __always_unused *vsi) +{ + return 0; +} + +static inline int +ice_rx_flow_steer(struct net_device __always_unused *netdev, + const struct sk_buff __always_unused *skb, + u16 __always_unused rxq_idx, u32 __always_unused flow_id) +{ + return -EOPNOTSUPP; +} + +static inline bool +ice_is_arfs_using_perfect_flow(struct ice_hw __always_unused *hw, + enum ice_fltr_ptype __always_unused flow_type) +{ + return false; +} +#endif /* CONFIG_RFS_ACCEL */ +#endif /* _ICE_ARFS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c new file mode 100644 index 000000000000..e2acab0c0777 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -0,0 +1,1045 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice_base.h" +#include "ice_lib.h" +#include "ice_dcb_lib.h" +#include "ice_virtchnl_pf.h" + +/** + * __ice_vsi_get_qs_contig - Assign a contiguous chunk of queues to VSI + * @qs_cfg: gathered variables needed for PF->VSI queues assignment + * + * Return 0 on success and -ENOMEM in case of no left space in PF queue bitmap + */ +static int __ice_vsi_get_qs_contig(struct ice_qs_cfg *qs_cfg) +{ + unsigned int offset, i; + + mutex_lock(qs_cfg->qs_mutex); + offset = bitmap_find_next_zero_area(qs_cfg->pf_map, qs_cfg->pf_map_size, + 0, qs_cfg->q_count, 0); + if (offset >= qs_cfg->pf_map_size) { + mutex_unlock(qs_cfg->qs_mutex); + return -ENOMEM; + } + + bitmap_set(qs_cfg->pf_map, offset, qs_cfg->q_count); + for (i = 0; i < qs_cfg->q_count; i++) + qs_cfg->vsi_map[i + qs_cfg->vsi_map_offset] = (u16)(i + offset); + mutex_unlock(qs_cfg->qs_mutex); + + return 0; +} + +/** + * __ice_vsi_get_qs_sc - Assign a scattered queues from PF to VSI + * @qs_cfg: gathered variables needed for pf->vsi queues assignment + * + * Return 0 on success and -ENOMEM in case of no left space in PF queue bitmap + */ +static int __ice_vsi_get_qs_sc(struct ice_qs_cfg *qs_cfg) +{ + unsigned int i, index = 0; + + mutex_lock(qs_cfg->qs_mutex); + for (i = 0; i < qs_cfg->q_count; i++) { + index = find_next_zero_bit(qs_cfg->pf_map, + qs_cfg->pf_map_size, index); + if (index >= qs_cfg->pf_map_size) + goto err_scatter; + set_bit(index, qs_cfg->pf_map); + qs_cfg->vsi_map[i + qs_cfg->vsi_map_offset] = (u16)index; + } + mutex_unlock(qs_cfg->qs_mutex); + + return 0; +err_scatter: + for (index = 0; index < i; index++) { + clear_bit(qs_cfg->vsi_map[index], qs_cfg->pf_map); + qs_cfg->vsi_map[index + qs_cfg->vsi_map_offset] = 0; + } + mutex_unlock(qs_cfg->qs_mutex); + + return -ENOMEM; +} + +/** + * ice_pf_rxq_wait - Wait for a PF's Rx queue to be enabled or disabled + * @pf: the PF being configured + * @pf_q: the PF queue + * @ena: enable or disable state of the queue + * + * This routine will wait for the given Rx queue of the PF to reach the + * enabled or disabled state. + * Returns -ETIMEDOUT in case of failing to reach the requested state after + * multiple retries; else will return 0 in case of success. + */ +static int ice_pf_rxq_wait(struct ice_pf *pf, int pf_q, bool ena) +{ + int i; + + for (i = 0; i < ICE_Q_WAIT_MAX_RETRY; i++) { + if (ena == !!(rd32(&pf->hw, QRX_CTRL(pf_q)) & + QRX_CTRL_QENA_STAT_M)) + return 0; + + usleep_range(20, 40); + } + + return -ETIMEDOUT; +} + +/** + * ice_vsi_alloc_q_vector - Allocate memory for a single interrupt vector + * @vsi: the VSI being configured + * @v_idx: index of the vector in the VSI struct + * + * We allocate one q_vector and set default value for ITR setting associated + * with this q_vector. If allocation fails we return -ENOMEM. 
+ */ +static int ice_vsi_alloc_q_vector(struct ice_vsi *vsi, u16 v_idx) +{ + struct ice_pf *pf = vsi->back; + struct ice_q_vector *q_vector; + + /* allocate q_vector */ + q_vector = devm_kzalloc(ice_pf_to_dev(pf), sizeof(*q_vector), + GFP_KERNEL); + if (!q_vector) + return -ENOMEM; + + q_vector->vsi = vsi; + q_vector->v_idx = v_idx; + q_vector->tx.itr_setting = ICE_DFLT_TX_ITR; + q_vector->rx.itr_setting = ICE_DFLT_RX_ITR; + q_vector->tx.itr_mode = ITR_DYNAMIC; + q_vector->rx.itr_mode = ITR_DYNAMIC; + + if (vsi->type == ICE_VSI_VF) + goto out; + /* only set affinity_mask if the CPU is online */ + if (cpu_online(v_idx)) + cpumask_set_cpu(v_idx, &q_vector->affinity_mask); + + /* This will not be called in the driver load path because the netdev + * will not be created yet. All other cases with register the NAPI + * handler here (i.e. resume, reset/rebuild, etc.) + */ + if (vsi->netdev) + netif_napi_add(vsi->netdev, &q_vector->napi, ice_napi_poll, + NAPI_POLL_WEIGHT); + +out: + /* tie q_vector and VSI together */ + vsi->q_vectors[v_idx] = q_vector; + + return 0; +} + +/** + * ice_free_q_vector - Free memory allocated for a specific interrupt vector + * @vsi: VSI having the memory freed + * @v_idx: index of the vector to be freed + */ +static void ice_free_q_vector(struct ice_vsi *vsi, int v_idx) +{ + struct ice_q_vector *q_vector; + struct ice_pf *pf = vsi->back; + struct ice_ring *ring; + struct device *dev; + + dev = ice_pf_to_dev(pf); + if (!vsi->q_vectors[v_idx]) { + dev_dbg(dev, "Queue vector at index %d not found\n", v_idx); + return; + } + q_vector = vsi->q_vectors[v_idx]; + + ice_for_each_ring(ring, q_vector->tx) + ring->q_vector = NULL; + ice_for_each_ring(ring, q_vector->rx) + ring->q_vector = NULL; + + /* only VSI with an associated netdev is set up with NAPI */ + if (vsi->netdev) + netif_napi_del(&q_vector->napi); + + devm_kfree(dev, q_vector); + vsi->q_vectors[v_idx] = NULL; +} + +/** + * ice_cfg_itr_gran - set the ITR granularity to 2 usecs if not already set + * @hw: board specific structure + */ +static void ice_cfg_itr_gran(struct ice_hw *hw) +{ + u32 regval = rd32(hw, GLINT_CTL); + + /* no need to update global register if ITR gran is already set */ + if (!(regval & GLINT_CTL_DIS_AUTOMASK_M) && + (((regval & GLINT_CTL_ITR_GRAN_200_M) >> + GLINT_CTL_ITR_GRAN_200_S) == ICE_ITR_GRAN_US) && + (((regval & GLINT_CTL_ITR_GRAN_100_M) >> + GLINT_CTL_ITR_GRAN_100_S) == ICE_ITR_GRAN_US) && + (((regval & GLINT_CTL_ITR_GRAN_50_M) >> + GLINT_CTL_ITR_GRAN_50_S) == ICE_ITR_GRAN_US) && + (((regval & GLINT_CTL_ITR_GRAN_25_M) >> + GLINT_CTL_ITR_GRAN_25_S) == ICE_ITR_GRAN_US)) + return; + + regval = ((ICE_ITR_GRAN_US << GLINT_CTL_ITR_GRAN_200_S) & + GLINT_CTL_ITR_GRAN_200_M) | + ((ICE_ITR_GRAN_US << GLINT_CTL_ITR_GRAN_100_S) & + GLINT_CTL_ITR_GRAN_100_M) | + ((ICE_ITR_GRAN_US << GLINT_CTL_ITR_GRAN_50_S) & + GLINT_CTL_ITR_GRAN_50_M) | + ((ICE_ITR_GRAN_US << GLINT_CTL_ITR_GRAN_25_S) & + GLINT_CTL_ITR_GRAN_25_M); + wr32(hw, GLINT_CTL, regval); +} + +/** + * ice_calc_q_handle - calculate the queue handle + * @vsi: VSI that ring belongs to + * @ring: ring to get the absolute queue index + * @tc: traffic class number + */ +static u16 ice_calc_q_handle(struct ice_vsi *vsi, struct ice_ring *ring, u8 tc) +{ +#ifdef HAVE_XDP_SUPPORT + WARN_ONCE(ice_ring_is_xdp(ring) && tc, "XDP ring can't belong to TC other than 0\n"); + +#endif /* HAVE_XDP_SUPPORT */ + if (ring->ch) + return ring->q_index - ring->ch->base_q; + + /* Idea here for calculation is that we subtract the number of queue + * count from TC that 
ring belongs to from it's absolute queue index + * and as a result we get the queue's index within TC. + */ + return ring->q_index - vsi->tc_cfg.tc_info[tc].qoffset; +} + +/** + * ice_eswitch_calc_q_handle + * @ring: pointer to ring which unique index is needed + * + * To correctly work with many netdevs + * ring->q_index of Tx rings on switchdev VSI can repeat. Hardware ring setup + * requires unique q_index. Calculate it here by finding index in vsi->tx_rings + * of this ring. + * + * Return -1 when index wasn't found. Should never happen, because vsi is get + * from ring->vsi, so it has to be present in this vsi. + */ +static u16 ice_eswitch_calc_q_handle(struct ice_ring *ring) +{ + struct ice_vsi *vsi = ring->vsi; + int i; + + ice_for_each_txq(vsi, i) { + if (vsi->tx_rings[i] == ring) + return i; + } + + return -1; +} + +/** + * ice_cfg_xps_tx_ring - Configure XPS for a Tx ring + * @ring: The Tx ring to configure + * + * This enables/disables XPS for a given Tx descriptor ring + * based on the TCs enabled for the VSI that ring belongs to. + */ +static void ice_cfg_xps_tx_ring(struct ice_ring *ring) +{ +#ifndef HAVE_XPS_QOS_SUPPORT + struct ice_vsi *vsi = ring->vsi; + +#endif /* !HAVE_XPS_QOS_SUPPORT */ + if (!ring->q_vector || !ring->netdev) + return; + +#ifndef HAVE_XPS_QOS_SUPPORT + /* Single TC mode enable XPS + * If there is more than 1 TC, netdev_set_num_tc() resets XPS settings + */ + if (vsi->tc_cfg.numtc > 1) + return; +#endif /* !HAVE_XPS_QOS_SUPPORT */ + + /* We only initialize XPS once, so as not to overwrite user settings */ + if (test_and_set_bit(ICE_TX_XPS_INIT_DONE, ring->xps_state)) + return; + + netif_set_xps_queue(ring->netdev, &ring->q_vector->affinity_mask, + ring->q_index); +} + +/** + * ice_setup_tx_ctx - setup a struct ice_tlan_ctx instance + * @ring: The Tx ring to configure + * @tlan_ctx: Pointer to the Tx LAN queue context structure to be initialized + * @pf_q: queue index in the PF space + * + * Configure the Tx descriptor ring in TLAN context. 
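The subtraction in ice_calc_q_handle() above is easier to see with numbers; a quick worked example with made-up values (sketch_q_handle() exists only for this illustration).

/* Illustrative only: TC 1 owns absolute queues 16..31 (qoffset = 16), so a
 * ring at absolute index 20 gets handle 20 - 16 = 4, the fifth queue of TC 1.
 */
static inline u16 sketch_q_handle(u16 q_index, u16 qoffset)
{
	return q_index - qoffset;
}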
+ */ +static void +ice_setup_tx_ctx(struct ice_ring *ring, struct ice_tlan_ctx *tlan_ctx, u16 pf_q) +{ + struct ice_vsi *vsi = ring->vsi; + struct ice_hw *hw = &vsi->back->hw; + + tlan_ctx->base = ring->dma >> ICE_TLAN_CTX_BASE_S; + + tlan_ctx->port_num = vsi->port_info->lport; + + /* Transmit Queue Length */ + tlan_ctx->qlen = ring->count; + + ice_set_cgd_num(tlan_ctx, ring); + + /* PF number */ + tlan_ctx->pf_num = hw->pf_id; + + /* queue belongs to a specific VSI type + * VF / VM index should be programmed per vmvf_type setting: + * for vmvf_type = VF, it is VF number between 0-256 + * for vmvf_type = VM, it is VM number between 0-767 + * for PF or EMP this field should be set to zero + */ + switch (vsi->type) { + case ICE_VSI_LB: + case ICE_VSI_CTRL: + case ICE_VSI_PF: + if (ring->ch) + tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VMQ; + else + tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_PF; + break; + case ICE_VSI_VF: + /* Firmware expects vmvf_num to be absolute VF ID */ + tlan_ctx->vmvf_num = hw->func_caps.vf_base_id + vsi->vf_id; + tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VF; + break; + case ICE_VSI_OFFLOAD_MACVLAN: + case ICE_VSI_VMDQ2: + case ICE_VSI_SWITCHDEV_CTRL: + tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VMQ; + break; + default: + return; + } + + /* make sure the context is associated with the right VSI */ + if (ring->ch) + tlan_ctx->src_vsi = ring->ch->vsi_num; + else + tlan_ctx->src_vsi = ice_get_hw_vsi_num(hw, vsi->idx); + + /* Restrict Tx timestamps to the PF VSI */ + switch (vsi->type) { + case ICE_VSI_PF: + tlan_ctx->tsyn_ena = 1; + break; + default: + break; + } + + tlan_ctx->tso_ena = ICE_TX_LEGACY; + tlan_ctx->tso_qnum = pf_q; + + /* Legacy or Advanced Host Interface: + * 0: Advanced Host Interface + * 1: Legacy Host Interface + */ + tlan_ctx->legacy_int = ICE_TX_LEGACY; +} + +/** + * ice_setup_rx_ctx - Configure a receive ring context + * @ring: The Rx ring to configure + * + * Configure the Rx descriptor ring in RLAN context. + */ +static int ice_setup_rx_ctx(struct ice_ring *ring) +{ + int chain_len = ICE_MAX_CHAINED_RX_BUFS; + struct ice_vsi *vsi = ring->vsi; + u32 rxdid = ICE_RXDID_FLEX_NIC; + struct ice_rlan_ctx rlan_ctx; + struct ice_hw *hw; + u16 pf_q; + int err; + + hw = &vsi->back->hw; + + /* what is Rx queue number in global space of 2K Rx queues */ + pf_q = vsi->rxq_map[ring->q_index]; + + /* clear the context structure first */ + memset(&rlan_ctx, 0, sizeof(rlan_ctx)); + + /* Receive Queue Base Address. + * Indicates the starting address of the descriptor queue defined in + * 128 Byte units. + */ + rlan_ctx.base = ring->dma >> 7; + + rlan_ctx.qlen = ring->count; + + /* Receive Packet Data Buffer Size. + * The Packet Data Buffer Size is defined in 128 byte units. + */ + rlan_ctx.dbuf = ring->rx_buf_len >> ICE_RLAN_CTX_DBUF_S; + + /* use 32 byte descriptors */ + rlan_ctx.dsize = 1; + + /* Strip the Ethernet CRC bytes before the packet is posted to host + * memory. + */ + rlan_ctx.crcstrip = ring->rx_crc_strip_dis ? 0 : 1; + + /* L2TSEL flag defines the reported L2 Tags in the receive descriptor + * and it needs to remain 1 for non-DVM capable configurations to not + * break backward compatibility for VF drivers. Setting this field to 0 + * will cause the single/outer VLAN tag to be stripped to the L2TAG2_2ND + * field in the Rx descriptor. 
Setting it to 1 allows the VLAN tag to + * be stripped in L2TAG1 of the Rx descriptor, which is where VFs will + * check for the tag + */ + if (ice_is_dvm_ena(hw)) + if (vsi->type == ICE_VSI_VF && + ice_vf_is_port_vlan_ena(&vsi->back->vf[vsi->vf_id])) + rlan_ctx.l2tsel = 1; + else + rlan_ctx.l2tsel = 0; + else + rlan_ctx.l2tsel = 1; + + rlan_ctx.dtype = ICE_RX_DTYPE_NO_SPLIT; + rlan_ctx.hsplit_0 = ICE_RLAN_RX_HSPLIT_0_NO_SPLIT; + rlan_ctx.hsplit_1 = ICE_RLAN_RX_HSPLIT_1_NO_SPLIT; + + /* This controls whether VLAN is stripped from inner headers + * The VLAN in the inner L2 header is stripped to the receive + * descriptor if enabled by this flag. + */ + rlan_ctx.showiv = 0; + +#ifdef HAVE_AF_XDP_ZC_SUPPORT + /* For AF_XDP ZC, we disallow packets to span on + * multiple buffers, thus letting us skip that + * handling in the fast-path. + */ + if (ring->xsk_pool) + chain_len = 1; +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + /* Max packet size for this queue - must not be set to a larger value + * than 5 x DBUF + */ + rlan_ctx.rxmax = min_t(u32, vsi->max_frame, + chain_len * ring->rx_buf_len); + + /* Rx queue threshold in units of 64 */ + rlan_ctx.lrxqthresh = 1; + + /* Enable Flexible Descriptors in the queue context which + * allows this driver to select a specific receive descriptor format + * increasing context priority to pick up profile ID; default is 0x01; + * setting to 0x03 to ensure profile is programming if prev context is + * of same priority + */ + if (vsi->type != ICE_VSI_VF) + ice_write_qrxflxp_cntxt(hw, pf_q, rxdid, 0x3, true); + else + ice_write_qrxflxp_cntxt(hw, pf_q, ICE_RXDID_LEGACY_1, 0x3, + false); + + /* Absolute queue number out of 2K needs to be passed */ + err = ice_write_rxq_ctx(hw, &rlan_ctx, pf_q); + if (err) { + dev_err(ice_pf_to_dev(vsi->back), + "Failed to set LAN Rx queue context for absolute Rx queue %d error: %d\n", + pf_q, err); + return -EIO; + } + + if (vsi->type == ICE_VSI_VF) + return 0; + + /* configure Rx buffer alignment */ + if (!vsi->netdev || test_bit(ICE_FLAG_LEGACY_RX, vsi->back->flags)) + ice_clear_ring_build_skb_ena(ring); + else + ice_set_ring_build_skb_ena(ring); + + /* init queue specific tail register */ + ring->tail = hw->hw_addr + QRX_TAIL(pf_q); + writel(0, ring->tail); + + return 0; +} + +/** + * ice_vsi_cfg_rxq - Configure an Rx queue + * @ring: the ring being configured + * + * Return 0 on success and a negative value on error. 
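The rxmax computation above caps the hardware's maximum receive frame at whichever is smaller, the VSI's configured max_frame or what one descriptor chain can hold (at most 5 data buffers, per the comment). A quick worked example with made-up sizes; sketch_rxmax() exists only for this illustration.

/* Illustrative only: 2 KB buffers and the 5-buffer chain limit give a
 * 10240-byte ceiling, so a 9216-byte jumbo max_frame is what lands in
 * rlan_ctx.rxmax.
 */
static inline u32 sketch_rxmax(void)
{
	u32 rx_buf_len = 2048, chain_len = 5, max_frame = 9216;

	return min_t(u32, max_frame, chain_len * rx_buf_len);	/* = 9216 */
}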
+ */
+int ice_vsi_cfg_rxq(struct ice_ring *ring)
+{
+	struct device *dev = ice_pf_to_dev(ring->vsi->back);
+	u16 num_bufs = ICE_DESC_UNUSED(ring);
+	int err;
+
+	ring->rx_buf_len = ring->vsi->rx_buf_len;
+
+#ifdef HAVE_AF_XDP_ZC_SUPPORT
+	if (ring->vsi->type == ICE_VSI_PF) {
+		if (!xdp_rxq_info_is_reg(&ring->xdp_rxq))
+			/* coverity[check_return] */
+			xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
+					 ring->q_index, ring->q_vector->napi.napi_id);
+
+		ring->xsk_pool = ice_xsk_umem(ring);
+		if (ring->xsk_pool) {
+			xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
+
+			ring->rx_buf_len =
+				xsk_pool_get_rx_frame_size(ring->xsk_pool);
+#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL
+			ring->zca.free = ice_zca_free;
+			err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+							 MEM_TYPE_ZERO_COPY,
+							 &ring->zca);
+			if (err)
+				return err;
+
+			dev_info(dev, "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n",
+				 ring->q_index);
+#else
+			err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+							 MEM_TYPE_XSK_BUFF_POOL,
+							 NULL);
+			if (err)
+				return err;
+			xsk_pool_set_rxq_info(ring->xsk_pool, &ring->xdp_rxq);
+
+			dev_info(dev, "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
+				 ring->q_index);
+#endif
+		} else {
+#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL
+			ring->zca.free = NULL;
+#endif
+			if (!xdp_rxq_info_is_reg(&ring->xdp_rxq))
+				/* coverity[check_return] */
+				xdp_rxq_info_reg(&ring->xdp_rxq,
+						 ring->netdev,
+						 ring->q_index, ring->q_vector->napi.napi_id);
+
+			err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+							 MEM_TYPE_PAGE_SHARED,
+							 NULL);
+			if (err)
+				return err;
+		}
+	}
+#elif defined(HAVE_XDP_FRAME_STRUCT)
+	if (ring->vsi->type == ICE_VSI_PF) {
+		if (!xdp_rxq_info_is_reg(&ring->xdp_rxq))
+			/* coverity[check_return] */
+			xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
+					 ring->q_index, ring->q_vector->napi.napi_id);
+
+		err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+						 MEM_TYPE_PAGE_SHARED, NULL);
+		if (err)
+			return err;
+	}
+#endif /* HAVE_AF_XDP_ZC_SUPPORT */
+
+	err = ice_setup_rx_ctx(ring);
+	if (err) {
+		dev_err(dev, "ice_setup_rx_ctx failed for RxQ %d, err %d\n",
+			ring->q_index, err);
+		return err;
+	}
+
+#ifdef HAVE_AF_XDP_ZC_SUPPORT
+	if (ring->xsk_pool) {
+#ifdef HAVE_XSK_UMEM_HAS_ADDRS
+		if (!xsk_umem_has_addrs_rq(ring->xsk_pool, num_bufs)) {
+			dev_warn(dev, "UMEM does not provide enough addresses to fill %d buffers on Rx ring %d\n",
+				 num_bufs, ring->q_index);
+			dev_warn(dev, "Change Rx ring/fill queue size to avoid performance issues\n");
+
+			return 0;
+		}
+#endif /* HAVE_XSK_UMEM_HAS_ADDRS */
+
+#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL
+		err = ice_alloc_rx_bufs_slow_zc(ring, num_bufs);
+#else
+		err = ice_alloc_rx_bufs_zc(ring, ICE_DESC_UNUSED(ring));
+#endif
+		if (err) {
+			u16 pf_q = ring->vsi->rxq_map[ring->q_index];
+
+			dev_info(dev, "Failed to allocate some buffers on UMEM enabled Rx ring %d (pf_q %d)\n",
+				 ring->q_index, pf_q);
+		}
+
+		return 0;
+	}
+#endif /* HAVE_AF_XDP_ZC_SUPPORT */
+
+	ice_alloc_rx_bufs(ring, num_bufs);
+
+	return 0;
+}
+
+/**
+ * __ice_vsi_get_qs - helper function for assigning queues from PF to VSI
+ * @qs_cfg: gathered variables needed for pf->vsi queues assignment
+ *
+ * This function first tries to find contiguous space. If it is not successful,
+ * it tries with the scatter approach.
+ *
+ * Return 0 on success and -ENOMEM if there is no space left in the PF queue bitmap
+ */
+int __ice_vsi_get_qs(struct ice_qs_cfg *qs_cfg)
+{
+	int ret = 0;
+
+	ret = __ice_vsi_get_qs_contig(qs_cfg);
+	if (ret) {
+		/* contig failed, so try with scatter approach */
+		qs_cfg->mapping_mode = ICE_VSI_MAP_SCATTER;
+		qs_cfg->q_count = min_t(unsigned int, qs_cfg->q_count,
+					qs_cfg->scatter_count);
+		ret = __ice_vsi_get_qs_sc(qs_cfg);
+	}
+	return ret;
+}
+
+/**
+ * ice_vsi_ctrl_one_rx_ring - start/stop VSI's Rx ring with no busy wait
+ * @vsi: the VSI being configured
+ * @ena: start or stop the Rx ring
+ * @rxq_idx: 0-based Rx queue index for the VSI passed in
+ * @wait: wait or don't wait for configuration to finish in hardware
+ *
+ * Return 0 on success and negative on error.
+ */
+int
+ice_vsi_ctrl_one_rx_ring(struct ice_vsi *vsi, bool ena, u16 rxq_idx, bool wait)
+{
+	int pf_q = vsi->rxq_map[rxq_idx];
+	struct ice_pf *pf = vsi->back;
+	struct ice_hw *hw = &pf->hw;
+	u32 rx_reg;
+
+	rx_reg = rd32(hw, QRX_CTRL(pf_q));
+
+	/* Skip if the queue is already in the requested state */
+	if (ena == !!(rx_reg & QRX_CTRL_QENA_STAT_M))
+		return 0;
+
+	/* turn on/off the queue */
+	if (ena)
+		rx_reg |= QRX_CTRL_QENA_REQ_M;
+	else
+		rx_reg &= ~QRX_CTRL_QENA_REQ_M;
+	wr32(hw, QRX_CTRL(pf_q), rx_reg);
+
+	if (!wait)
+		return 0;
+
+	ice_flush(hw);
+	return ice_pf_rxq_wait(pf, pf_q, ena);
+}
+
+/**
+ * ice_vsi_wait_one_rx_ring - wait for a VSI's Rx ring to be stopped/started
+ * @vsi: the VSI being configured
+ * @ena: true/false to verify Rx ring has been enabled/disabled respectively
+ * @rxq_idx: 0-based Rx queue index for the VSI passed in
+ *
+ * This routine will wait for the given Rx queue of the VSI to reach the
+ * enabled or disabled state. Returns -ETIMEDOUT in case of failing to reach
+ * the requested state after multiple retries; else will return 0 in case of
+ * success.
+ */
+int ice_vsi_wait_one_rx_ring(struct ice_vsi *vsi, bool ena, u16 rxq_idx)
+{
+	int pf_q = vsi->rxq_map[rxq_idx];
+	struct ice_pf *pf = vsi->back;
+
+	return ice_pf_rxq_wait(pf, pf_q, ena);
+}
+
+/**
+ * ice_vsi_alloc_q_vectors - Allocate memory for interrupt vectors
+ * @vsi: the VSI being configured
+ *
+ * We allocate one q_vector per queue interrupt. If allocation fails we
+ * return -ENOMEM.
+ */
+int ice_vsi_alloc_q_vectors(struct ice_vsi *vsi)
+{
+	struct device *dev = ice_pf_to_dev(vsi->back);
+	u16 v_idx;
+	int err;
+
+	if (vsi->q_vectors[0]) {
+		dev_dbg(dev, "VSI %d has existing q_vectors\n", vsi->vsi_num);
+		return -EEXIST;
+	}
+
+	for (v_idx = 0; v_idx < vsi->num_q_vectors; v_idx++) {
+		err = ice_vsi_alloc_q_vector(vsi, v_idx);
+		if (err)
+			goto err_out;
+	}
+
+	return 0;
+
+err_out:
+	while (v_idx--)
+		ice_free_q_vector(vsi, v_idx);
+
+	dev_err(dev, "Failed to allocate %d q_vector for VSI %d, ret=%d\n",
+		vsi->num_q_vectors, vsi->vsi_num, err);
+	vsi->num_q_vectors = 0;
+	return err;
+}
+
+/**
+ * ice_vsi_map_rings_to_vectors - Map VSI rings to interrupt vectors
+ * @vsi: the VSI being configured
+ *
+ * This function maps descriptor rings to the queue-specific vectors allotted
+ * through the MSI-X enabling code. On a constrained vector budget, we map Tx
+ * and Rx rings to the vector as "efficiently" as possible.
+ */ +void ice_vsi_map_rings_to_vectors(struct ice_vsi *vsi) +{ + int q_vectors = vsi->num_q_vectors; + u16 tx_rings_rem, rx_rings_rem; + int v_id; + + /* initially assigning remaining rings count to VSIs num queue value */ + tx_rings_rem = vsi->num_txq; + rx_rings_rem = vsi->num_rxq; + + for (v_id = 0; v_id < q_vectors; v_id++) { + struct ice_q_vector *q_vector = vsi->q_vectors[v_id]; + u8 tx_rings_per_v, rx_rings_per_v; + u16 q_id, q_base; + + /* Tx rings mapping to vector */ + tx_rings_per_v = (u8)DIV_ROUND_UP(tx_rings_rem, + q_vectors - v_id); + q_vector->num_ring_tx = tx_rings_per_v; + q_vector->tx.ring = NULL; + q_vector->tx.itr_idx = ICE_TX_ITR; + q_base = vsi->num_txq - tx_rings_rem; + + for (q_id = q_base; q_id < (q_base + tx_rings_per_v); q_id++) { + struct ice_ring *tx_ring = vsi->tx_rings[q_id]; + + tx_ring->q_vector = q_vector; + tx_ring->next = q_vector->tx.ring; + q_vector->tx.ring = tx_ring; + } + tx_rings_rem -= tx_rings_per_v; + + /* Rx rings mapping to vector */ + rx_rings_per_v = (u8)DIV_ROUND_UP(rx_rings_rem, + q_vectors - v_id); + q_vector->num_ring_rx = rx_rings_per_v; + q_vector->rx.ring = NULL; + q_vector->rx.itr_idx = ICE_RX_ITR; + q_base = vsi->num_rxq - rx_rings_rem; + + for (q_id = q_base; q_id < (q_base + rx_rings_per_v); q_id++) { + struct ice_ring *rx_ring = vsi->rx_rings[q_id]; + + rx_ring->q_vector = q_vector; + rx_ring->next = q_vector->rx.ring; + q_vector->rx.ring = rx_ring; + } + rx_rings_rem -= rx_rings_per_v; + } +} + +/** + * ice_vsi_free_q_vectors - Free memory allocated for interrupt vectors + * @vsi: the VSI having memory freed + */ +void ice_vsi_free_q_vectors(struct ice_vsi *vsi) +{ + int v_idx; + + ice_for_each_q_vector(vsi, v_idx) + ice_free_q_vector(vsi, v_idx); +} + +/** + * ice_vsi_cfg_txq - Configure single Tx queue + * @vsi: the VSI that queue belongs to + * @ring: Tx ring to be configured + * @qg_buf: queue group buffer + */ +int +ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_ring *ring, + struct ice_aqc_add_tx_qgrp *qg_buf) +{ + u8 buf_len = struct_size(qg_buf, txqs, 1); + struct ice_tlan_ctx tlan_ctx = { 0 }; + struct ice_aqc_add_txqs_perq *txq; + struct ice_channel *ch = ring->ch; + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u16 pf_q; + u8 tc; + + /* Configure XPS */ + ice_cfg_xps_tx_ring(ring); + + pf_q = ring->reg_idx; + ice_setup_tx_ctx(ring, &tlan_ctx, pf_q); + /* copy context contents into the qg_buf */ + qg_buf->txqs[0].txq_id = cpu_to_le16(pf_q); + ice_set_ctx(hw, (u8 *)&tlan_ctx, qg_buf->txqs[0].txq_ctx, + ice_tlan_ctx_info); + + /* init queue specific tail reg. It is referred as + * transmit comm scheduler queue doorbell. + */ + ring->tail = hw->hw_addr + QTX_COMM_DBELL(pf_q); + + if (IS_ENABLED(CONFIG_DCB)) + tc = ring->dcb_tc; + else + tc = 0; + + /* Add unique software queue handle of the Tx queue per + * TC into the VSI Tx ring + */ + if (vsi->type == ICE_VSI_SWITCHDEV_CTRL) + ring->q_handle = ice_eswitch_calc_q_handle(ring); + else + ring->q_handle = ice_calc_q_handle(vsi, ring, tc); + + status = (ch ? + ice_ena_vsi_txq(vsi->port_info, ch->ch_vsi->idx, 0, + ring->q_handle, 1, qg_buf, buf_len, NULL) : + ice_ena_vsi_txq(vsi->port_info, vsi->idx, tc, + ring->q_handle, 1, qg_buf, buf_len, NULL)); + if (status) { + dev_err(ice_pf_to_dev(pf), "Failed to set LAN Tx queue context, error: %s\n", + ice_stat_str(status)); + return -ENODEV; + } + + /* Add Tx Queue TEID into the VSI Tx ring from the + * response. This will complete configuring and + * enabling the queue. 
+ */ + txq = &qg_buf->txqs[0]; + if (pf_q == le16_to_cpu(txq->txq_id)) + ring->txq_teid = le32_to_cpu(txq->q_teid); + + return 0; +} + +/** + * ice_cfg_itr - configure the initial interrupt throttle values + * @hw: pointer to the HW structure + * @q_vector: interrupt vector that's being configured + * + * Configure interrupt throttling values for the ring containers that are + * associated with the interrupt vector passed in. + */ +void ice_cfg_itr(struct ice_hw *hw, struct ice_q_vector *q_vector) +{ + ice_cfg_itr_gran(hw); + + if (q_vector->num_ring_rx) + ice_write_itr(&q_vector->rx, q_vector->rx.itr_setting); + + if (q_vector->num_ring_tx) + ice_write_itr(&q_vector->tx, q_vector->tx.itr_setting); + + ice_write_intrl(q_vector, q_vector->intrl); +} + +/** + * ice_cfg_txq_interrupt - configure interrupt on Tx queue + * @vsi: the VSI being configured + * @txq: Tx queue being mapped to MSI-X vector + * @msix_idx: MSI-X vector index within the function + * @itr_idx: ITR index of the interrupt cause + * + * Configure interrupt on Tx queue by associating Tx queue to MSI-X vector + * within the function space. + */ +void +ice_cfg_txq_interrupt(struct ice_vsi *vsi, u16 txq, u16 msix_idx, u16 itr_idx) +{ + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + u32 val; + + itr_idx = (itr_idx << QINT_TQCTL_ITR_INDX_S) & QINT_TQCTL_ITR_INDX_M; + + val = QINT_TQCTL_CAUSE_ENA_M | itr_idx | + ((msix_idx << QINT_TQCTL_MSIX_INDX_S) & QINT_TQCTL_MSIX_INDX_M); + + wr32(hw, QINT_TQCTL(vsi->txq_map[txq]), val); +#ifdef HAVE_XDP_SUPPORT + if (ice_is_xdp_ena_vsi(vsi)) { + u32 xdp_txq = txq + vsi->num_xdp_txq; + + wr32(hw, QINT_TQCTL(vsi->txq_map[xdp_txq]), + val); + } + + ice_flush(hw); +#endif /* HAVE_XDP_SUPPORT */ +} + +/** + * ice_cfg_rxq_interrupt - configure interrupt on Rx queue + * @vsi: the VSI being configured + * @rxq: Rx queue being mapped to MSI-X vector + * @msix_idx: MSI-X vector index within the function + * @itr_idx: ITR index of the interrupt cause + * + * Configure interrupt on Rx queue by associating Rx queue to MSI-X vector + * within the function space. 
+ */ +void +ice_cfg_rxq_interrupt(struct ice_vsi *vsi, u16 rxq, u16 msix_idx, u16 itr_idx) +{ + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + u32 val; + + itr_idx = (itr_idx << QINT_RQCTL_ITR_INDX_S) & QINT_RQCTL_ITR_INDX_M; + + val = QINT_RQCTL_CAUSE_ENA_M | itr_idx | + ((msix_idx << QINT_RQCTL_MSIX_INDX_S) & QINT_RQCTL_MSIX_INDX_M); + + wr32(hw, QINT_RQCTL(vsi->rxq_map[rxq]), val); + + ice_flush(hw); +} + +/** + * ice_trigger_sw_intr - trigger a software interrupt + * @hw: pointer to the HW structure + * @q_vector: interrupt vector to trigger the software interrupt for + */ +void ice_trigger_sw_intr(struct ice_hw *hw, struct ice_q_vector *q_vector) +{ + wr32(hw, GLINT_DYN_CTL(q_vector->reg_idx), (ICE_ITR_NONE << GLINT_DYN_CTL_ITR_INDX_S) | + GLINT_DYN_CTL_SWINT_TRIG_M | GLINT_DYN_CTL_INTENA_M); +} + +/** + * ice_vsi_stop_tx_ring - Disable single Tx ring + * @vsi: the VSI being configured + * @rst_src: reset source + * @rel_vmvf_num: Relative ID of VF/VM + * @ring: Tx ring to be stopped + * @txq_meta: Meta data of Tx ring to be stopped + */ +int +ice_vsi_stop_tx_ring(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, + u16 rel_vmvf_num, struct ice_ring *ring, + struct ice_txq_meta *txq_meta) +{ + struct ice_pf *pf = vsi->back; + struct ice_q_vector *q_vector; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u32 val; + + /* clear cause_ena bit for disabled queues */ + val = rd32(hw, QINT_TQCTL(ring->reg_idx)); + val &= ~QINT_TQCTL_CAUSE_ENA_M; + wr32(hw, QINT_TQCTL(ring->reg_idx), val); + + /* software is expected to wait for 100 ns */ + ndelay(100); + + /* trigger a software interrupt for the vector + * associated to the queue to schedule NAPI handler + */ + q_vector = ring->q_vector; + if (q_vector) + ice_trigger_sw_intr(hw, q_vector); + + status = ice_dis_vsi_txq(vsi->port_info, txq_meta->vsi_idx, + txq_meta->tc, 1, &txq_meta->q_handle, + &txq_meta->q_id, &txq_meta->q_teid, rst_src, + rel_vmvf_num, NULL); + + /* if the disable queue command was exercised during an + * active reset flow, ICE_ERR_RESET_ONGOING is returned. + * This is not an error as the reset operation disables + * queues at the hardware level anyway. + */ + if (status == ICE_ERR_RESET_ONGOING) { + dev_dbg(ice_pf_to_dev(vsi->back), "Reset in progress. 
LAN Tx queues already disabled\n"); + } else if (status == ICE_ERR_DOES_NOT_EXIST) { + dev_dbg(ice_pf_to_dev(vsi->back), "LAN Tx queues do not exist, nothing to disable\n"); + } else if (status) { + dev_err(ice_pf_to_dev(vsi->back), "Failed to disable LAN Tx queues, error: %s\n", + ice_stat_str(status)); + return -ENODEV; + } + + return 0; +} + +/** + * ice_fill_txq_meta - Prepare the Tx queue's meta data + * @vsi: VSI that ring belongs to + * @ring: ring that txq_meta will be based on + * @txq_meta: a helper struct that wraps Tx queue's information + * + * Set up a helper struct that will contain all the necessary fields that + * are needed for stopping Tx queue + */ +void +ice_fill_txq_meta(struct ice_vsi *vsi, struct ice_ring *ring, + struct ice_txq_meta *txq_meta) +{ + struct ice_channel *ch = ring->ch; + u8 tc; + + if (IS_ENABLED(CONFIG_DCB)) + tc = ring->dcb_tc; + else + tc = 0; + txq_meta->q_id = ring->reg_idx; + txq_meta->q_teid = ring->txq_teid; + txq_meta->q_handle = ring->q_handle; + if (ch) { + txq_meta->vsi_idx = ch->ch_vsi->idx; + txq_meta->tc = 0; + } else { + txq_meta->vsi_idx = vsi->idx; + txq_meta->tc = tc; + } +} diff --git a/drivers/net/ethernet/intel/ice/ice_base.h b/drivers/net/ethernet/intel/ice/ice_base.h new file mode 100644 index 000000000000..5c83e555ef5c --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_base.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_BASE_H_ +#define _ICE_BASE_H_ + +#include "ice.h" + +int ice_vsi_cfg_rxq(struct ice_ring *ring); +int __ice_vsi_get_qs(struct ice_qs_cfg *qs_cfg); +int +ice_vsi_ctrl_one_rx_ring(struct ice_vsi *vsi, bool ena, u16 rxq_idx, bool wait); +int ice_vsi_wait_one_rx_ring(struct ice_vsi *vsi, bool ena, u16 rxq_idx); +int ice_vsi_alloc_q_vectors(struct ice_vsi *vsi); +void ice_vsi_map_rings_to_vectors(struct ice_vsi *vsi); +void ice_vsi_free_q_vectors(struct ice_vsi *vsi); +int +ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_ring *ring, + struct ice_aqc_add_tx_qgrp *qg_buf); +void ice_cfg_itr(struct ice_hw *hw, struct ice_q_vector *q_vector); +void +ice_cfg_txq_interrupt(struct ice_vsi *vsi, u16 txq, u16 msix_idx, u16 itr_idx); +void +ice_cfg_rxq_interrupt(struct ice_vsi *vsi, u16 rxq, u16 msix_idx, u16 itr_idx); +void ice_trigger_sw_intr(struct ice_hw *hw, struct ice_q_vector *q_vector); +int +ice_vsi_stop_tx_ring(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, + u16 rel_vmvf_num, struct ice_ring *ring, + struct ice_txq_meta *txq_meta); +void +ice_fill_txq_meta(struct ice_vsi *vsi, struct ice_ring *ring, + struct ice_txq_meta *txq_meta); +#endif /* _ICE_BASE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_cgu.h b/drivers/net/ethernet/intel/ice/ice_cgu.h new file mode 100644 index 000000000000..a9c215a9df92 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_cgu.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_CGU_H_ +#define _ICE_CGU_H_ + +#include +#include "ice_cgu_regs.h" + +/* CGU mux identifier + * Specifies a mux within the CGU block. 
+ */ +enum ice_cgu_mux_sel { + /* CGU reference clock source (DWORD10_SYNCE_S_REF_CLK) */ + ICE_CGU_MUX_SEL_REF_CLK, + /* CGU bypass clock source (DWORD11_SYNCE_S_BYP_CLK) */ + ICE_CGU_MUX_SEL_BYPASS_CLK, + /* CGU ETHCLKO pin source (DWORD10_SYNCE_ETHCLKO_SEL) */ + ICE_CGU_MUX_SEL_ETHCLKO, + /* CGU CLKO pin source (DWORD10_SYNCE_CLKO_SEL) */ + ICE_CGU_MUX_SEL_CLKO, + + NUM_ICE_CGU_MUX_SEL +}; + +/* CGU reference clock specification + * Specifies the source for the CGU reference/bypass clock. + */ +enum ice_cgu_clk_src { + /* network reference clock 0 */ + ICE_CGU_CLK_SRC_NET_REF_CLK0, + /* network reference clock 1 */ + ICE_CGU_CLK_SRC_NET_REF_CLK1, + /* 1588 recovered clock */ + ICE_CGU_CLK_SRC_1588_RECOVERED_CLK, + /* recovered clock from phys port 0 */ + ICE_CGU_CLK_SRC_SYNCE_CLK_0, + /* recovered clock from phys port 1 */ + ICE_CGU_CLK_SRC_SYNCE_CLK_1, + /* recovered clock from phys port 2 */ + ICE_CGU_CLK_SRC_SYNCE_CLK_2, + /* recovered clock from phys port 3 */ + ICE_CGU_CLK_SRC_SYNCE_CLK_3, + /* recovered clock from phys port 4 */ + ICE_CGU_CLK_SRC_SYNCE_CLK_4, + /* recovered clock from phys port 5 */ + ICE_CGU_CLK_SRC_SYNCE_CLK_5, + /* recovered clock from phys port 6 */ + ICE_CGU_CLK_SRC_SYNCE_CLK_6, + /* recovered clock from phys port 7 */ + ICE_CGU_CLK_SRC_SYNCE_CLK_7, + NUM_ICE_CGU_CLK_SRC +}; + +/* Sources for ETHCLKO pin */ +enum ice_cgu_ethclko_sel { + /* DPLL reference clock 0 input divided by ETHDIV */ + ICE_CGU_ETHCLKO_SEL_REF_CLK_BYP0_DIV, + /* DPLL reference clock 1 input divided by ETHDIV */ + ICE_CGU_ETHCLKO_SEL_REF_CLK_BYP1_DIV, + /* DPLL output clock divided by ETHDIV */ + ICE_CGU_ETHCLKO_SEL_CLK_PLL_25000_DIV, + /* JAPLL output clock divided by ETHDIV */ + ICE_CGU_ETHCLKO_SEL_CLK_JAPLL_625000_DIV, + /* DPLL reference clock 0 input */ + ICE_CGU_ETHCLKO_SEL_REF_CLK_BYP0, + /* DPLL reference clock 1 input */ + ICE_CGU_ETHCLKO_SEL_REF_CLK_BYP1, + /* DPLL output clock */ + ICE_CGU_ETHCLKO_SEL_CLK_PLL_25000, + ICE_CGU_ETHCLKO_SEL_CLK_JAPLL_625000, + + NUM_ICE_CGU_ETHCLKO_SEL +}; + +#define ICE_CGU_ETHCLKO_SEL_NRCKI ICE_CGU_ETHCLKO_SEL_REF_CLK_BYP1 + +/* Sources for CLKO pin */ +enum ice_cgu_clko_sel { + /* DPLL reference clock 0 input divided by CLKODIV */ + ICE_CGU_CLKO_SEL_REF_CLK_BYP0_DIV, + /* DPLL reference clock 1 input divided by CLKODIV */ + ICE_CGU_CLKO_SEL_REF_CLK_BYP1_DIV, + /* DPLL core clock divided by CLKODIV */ + ICE_CGU_CLKO_SEL_CLK_SYS_DIV, + /* JAPLL output clock divided by CLKODIV */ + ICE_CGU_CLKO_SEL_CLK_JAPLL_625000_DIV, + /* DPLL reference clock 0 input */ + ICE_CGU_CLKO_SEL_REF_CLK_BYP0, + /* DPLL reference clock 1 input */ + ICE_CGU_CLKO_SEL_REF_CLK_BYP1, + + /* 1.544 MHz, NRCP divider output */ + ICE_CGU_CLKO_SEL_CLK_1544 = 8, + /* 2.048 MHz, NRCP divider output */ + ICE_CGU_CLKO_SEL_CLK_2048 = 9, + + NUM_ICE_CGU_CLKO_SEL +}; + +#define ICE_CGU_CLKO_SEL_NRCKI ICE_CGU_CLKO_SEL_REF_CLK_BYP1 + +/* TIME_REF source selection */ +enum ice_cgu_time_ref_sel { + ICE_CGU_TIME_REF_SEL_TCXO, /* Use TCXO source */ + ICE_CGU_TIME_REF_SEL_TIME_REF, /* Use TIME_REF source */ + + NUM_ICE_CGU_TIME_REF_SEL +}; + +/* Macro to convert an enum ice_time_ref_freq to a string for printing */ +#define ICE_TIME_REF_FREQ_TO_STR(__trf) \ + ({ \ + enum ice_time_ref_freq _trf = (__trf); \ + (_trf) == ICE_TIME_REF_FREQ_25_000 ? "25 MHz" : \ + (_trf) == ICE_TIME_REF_FREQ_122_880 ? "122.88 MHz" : \ + (_trf) == ICE_TIME_REF_FREQ_125_000 ? "125 MHz" : \ + (_trf) == ICE_TIME_REF_FREQ_153_600 ? "153.6 MHz" : \ + (_trf) == ICE_TIME_REF_FREQ_156_250 ? 
"156.25 MHz" : \ + (_trf) == ICE_TIME_REF_FREQ_245_760 ? "245.76 MHz" : \ + "invalid"; \ + }) + +/* Macro to convert an enum ice_cgu_time_ref_sel to a string for printing */ +#define ICE_TIME_REF_SEL_TO_STR(__trs) \ + ({ \ + enum ice_cgu_time_ref_sel _trs = (__trs); \ + (_trs) == ICE_CGU_TIME_REF_SEL_TCXO ? "TCXO" : \ + (_trs) == ICE_CGU_TIME_REF_SEL_TIME_REF ? "TIME_REF" : \ + "invalid"; \ + }) +/* Macro to convert an enum ice_src_tmr_mode to a string for printing */ +#define ICE_SRC_TMR_MODE_TO_STR(__mtm) \ + ({ \ + enum ice_src_tmr_mode _mtm = (__mtm); \ + (_mtm) == ICE_SRC_TMR_MODE_NANOSECONDS ? "nanoseconds" : \ + (_mtm) == ICE_SRC_TMR_MODE_LOCKED ? "locked" : \ + "invalid"; \ + }) + +/* DPLL select */ +enum ice_cgu_dpll_select { + /* DPLL (DPLL1) */ + ICE_CGU_DPLL_SELECT_TRANSPORT, + /* EEC DPLL (DPLL2), 0x098 Hz BW */ + ICE_CGU_DPLL_SELECT_EEC_RELAXED_BW, + + NUM_ICE_CGU_DPLL_SELECT +}; + +/* DPLL holdover mode */ +enum ice_cgu_dpll_holdover_mode { + /* previous acquired frequency */ + ICE_CGU_DPLL_HOLDOVER_MODE_ACQUIRED_FREQ, + /* local frequency (free run) */ + ICE_CGU_DPLL_HOLDOVER_MODE_LOCAL_FREQ, + + NUM_ICE_CGU_DPLL_HOLDOVER_MODE +}; + +/* DPLL configuration parameters */ +struct ice_cgu_dpll_cfg { + /* CGU reference clock frequency */ + enum ice_time_ref_freq ref_freq; + /* select DPLL */ + enum ice_cgu_dpll_select dpll_sel; + /* enable holdover feature support */ + u32 holdover_support; + /* select holdover mode */ + enum ice_cgu_dpll_holdover_mode holdover_mode; +}; + +enum ice_japll_ref_freq { + ICE_CGU_JAPLL_REF_FREQ_25_000, /* 25 MHz */ + ICE_CGU_JAPLL_REF_FREQ_156_250, /* 156.25 MHz */ + + NUM_ICE_CGU_JAPLL_REF_FREQ +}; + +/* Mux configuration parameters */ +struct ice_cgu_mux_cfg { + /* reference clock source select */ + enum ice_cgu_clk_src ref_clk_src; + /* bypass clock source select */ + enum ice_cgu_clk_src byp_clk_src; + /* ETHCLKO pin source select */ + enum ice_cgu_ethclko_sel eth_clk_out; + /* CLKO pin source select */ + enum ice_cgu_clko_sel clk_out; + /* CLKO programmable divider */ + __u8 clk_out_div; + /* ETHCLKO programmable divider */ + __u8 eth_clk_out_div; + /* bypass DPLL */ + u32 bypass; + /* tie refClk to ground (force holdover mode) */ + u32 ref_clk_gnd_ena; +}; + +/* CGU event was triggered by SyncE loss of lock */ +#define ICE_CGU_EVENT_ERR_SYNCE_LOCK_LOSS 0x1 + +/* CGU event was triggered by SyncE holdover change */ +#define ICE_CGU_EVENT_ERR_HOLDOVER_CHNG 0x2 + +/* CGU event was triggered by timestamp PLL loss of lock */ +#define ICE_CGU_EVENT_ERR_TIMESYNC_LOCK_LOSS 0x4 + + +struct ice_cgu_info { + struct ice_cgu_dpll_cfg dpll_cfg; + struct ice_cgu_mux_cfg mux_cfg; + enum ice_japll_ref_freq japll_ref_freq; + wait_queue_head_t wq_head; + + /* used to synchronize waiters (only one at a time) */ + struct mutex event_mutex; + + u32 event_occurred; + u8 err_type; + u8 unlock_event; + + /* current state of 1588 output to CGU */ + u8 out_1588_enabled; + enum ice_time_ref_freq out_1588_ref_freq; + + enum ice_time_ref_freq time_ref_freq; + enum ice_src_tmr_mode src_tmr_mode; +}; + +#endif /* _ICE_CGU_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_cgu_ops.c b/drivers/net/ethernet/intel/ice/ice_cgu_ops.c new file mode 100644 index 000000000000..cb7a3ce8605b --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_cgu_ops.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice.h" + +/** + * ice_cgu_cfg_ts_pll - Configure the TS PLL + * @pf: Board private structure + * @enable: True to enable TS PLL + * @time_ref_freq: primary timer frequency + * @time_ref_sel: Time source + * @src_tmr_mode: primary timer mode + */ +int ice_cgu_cfg_ts_pll(struct ice_pf *pf, bool enable, enum ice_time_ref_freq time_ref_freq, + enum ice_cgu_time_ref_sel time_ref_sel, enum ice_src_tmr_mode src_tmr_mode) +{ + struct ice_cgu_info *cgu_info = &pf->cgu_info; + union tspll_ro_bwm_lf bwm_lf; + union nac_cgu_dword19 dw19; + union nac_cgu_dword22 dw22; + union nac_cgu_dword24 dw24; + union nac_cgu_dword9 dw9; + int err; + + dev_dbg(ice_pf_to_dev(pf), "Requested %s, time_ref_freq %s, time_ref_sel %s, src_tmr_mode %s\n", + enable ? "enable" : "disable", + ICE_TIME_REF_FREQ_TO_STR(time_ref_freq), + ICE_TIME_REF_SEL_TO_STR(time_ref_sel), + ICE_SRC_TMR_MODE_TO_STR(src_tmr_mode)); + + if (time_ref_freq >= NUM_ICE_TIME_REF_FREQ) { + dev_err(ice_pf_to_dev(pf), "Invalid TIME_REF freq %u\n", time_ref_freq); + return -EIO; + } + + if (time_ref_sel >= NUM_ICE_CGU_TIME_REF_SEL) { + dev_err(ice_pf_to_dev(pf), "Invalid TIME_REF sel %u\n", time_ref_sel); + return -EIO; + } + + if (src_tmr_mode >= NUM_ICE_SRC_TMR_MODE) { + dev_err(ice_pf_to_dev(pf), "Invalid src_tmr_mode %u\n", src_tmr_mode); + return -EIO; + } + + if (time_ref_sel == ICE_CGU_TIME_REF_SEL_TCXO && + time_ref_freq != ICE_TIME_REF_FREQ_25_000) { + dev_err(ice_pf_to_dev(pf), + "TS PLL source specified as TCXO but specified frequency is not 25 MHz\n"); + return -EIO; + } + + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD9, &dw9.val); + if (!err) + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD24, &dw24.val); + if (!err) + err = ice_cgu_reg_read(pf, TSPLL_RO_BWM_LF, &bwm_lf.val); + if (err) + return err; + + dev_dbg(ice_pf_to_dev(pf), + "Before change, %s, time_ref_freq %s, time_ref_sel %s, PLL %s\n", + dw24.field.ts_pll_enable ? "enabled" : "disabled", + ICE_TIME_REF_FREQ_TO_STR(dw9.field.time_ref_freq_sel), + ICE_TIME_REF_SEL_TO_STR(dw24.field.time_ref_sel), + bwm_lf.field.plllock_true_lock_cri ? 
"locked" : "unlocked"); + + if (!enable) { + if (dw24.field.ts_pll_enable) { + dw24.field.ts_pll_enable = 0; + err = ice_cgu_reg_write(pf, NAC_CGU_DWORD24, dw24.val); + if (!err) + ice_cgu_usleep(1); + } + /* don't need to update the freq, sel, or mode; that'll happen + * when the PLL is re-enabled + */ + return err; + } + + /* TS PLL must be disabled before changing freq or src */ + if (dw24.field.ts_pll_enable && (dw9.field.time_ref_freq_sel != time_ref_freq || + dw24.field.time_ref_sel != time_ref_sel)) { + dev_err(ice_pf_to_dev(pf), + "Can't adjust time_ref_freq or time_ref_sel while TS PLL is enabled\n"); + return -EIO; + } + + /* set frequency, configure TS PLL params, and enable the TS PLL */ + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD19, &dw19.val); + if (!err) + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD22, &dw22.val); + if (!err) { + dw9.field.time_ref_freq_sel = time_ref_freq; + dw19.field.tspll_fbdiv_intgr = tspll_per_rate_params[time_ref_freq].feedback_div; + dw19.field.tspll_ndivratio = 1; + dw22.field.time1588clk_div = tspll_per_rate_params[time_ref_freq].post_pll_div; + dw22.field.time1588clk_sel_div2 = 0; + dw24.field.ref1588_ck_div = tspll_per_rate_params[time_ref_freq].refclk_pre_div; + dw24.field.tspll_fbdiv_frac = tspll_per_rate_params[time_ref_freq].frac_n_div; + dw24.field.time_ref_sel = time_ref_sel; + err = ice_cgu_reg_write(pf, NAC_CGU_DWORD9, dw9.val); + } + if (!err) + err = ice_cgu_reg_write(pf, NAC_CGU_DWORD19, dw19.val); + if (!err) + err = ice_cgu_reg_write(pf, NAC_CGU_DWORD22, dw22.val); + /* first write dw24 with updated values but still not enabled */ + if (!err) + err = ice_cgu_reg_write(pf, NAC_CGU_DWORD24, dw24.val); + /* now enable the TS_PLL */ + if (!err) { + dw24.field.ts_pll_enable = 1; + err = ice_cgu_reg_write(pf, NAC_CGU_DWORD24, dw24.val); + } + + if (!err) { + cgu_info->time_ref_freq = time_ref_freq; + cgu_info->src_tmr_mode = src_tmr_mode; + err = ice_ptp_update_incval(pf, time_ref_freq, src_tmr_mode); + if (err) { + dev_err(ice_pf_to_dev(pf), "Failed to update INCVAL\n"); + return err; + } + } + + /* to check for lock, wait 1 ms; if it hasn't locked by then, it's not + * going to lock + */ + if (!err) { + ice_cgu_usleep(1000); + err = ice_cgu_reg_read(pf, TSPLL_RO_BWM_LF, &bwm_lf.val); + } + if (!err && bwm_lf.field.plllock_true_lock_cri) { + dev_dbg(ice_pf_to_dev(pf), + "TS PLL successfully locked, time_ref_freq %s, time_ref_sel %s\n", + ICE_TIME_REF_FREQ_TO_STR(time_ref_freq), + ICE_TIME_REF_SEL_TO_STR(time_ref_sel)); + + /* update state to indicate no unlock event since last lock */ + cgu_info->unlock_event = false; + } else { + dev_err(ice_pf_to_dev(pf), "TS PLL failed to lock\n"); + err = -EFAULT; + } + + return err; +} + +/** + * ice_cgu_init_state - Initialize CGU HW + * @pf: Board private structure + * + * Read CGU registers, initialize internal state, and lock the timestamp PLL using the parameters + * read from the soft straps. 
+ */ +void ice_cgu_init_state(struct ice_pf *pf) +{ + union tspll_cntr_bist_settings tspll_cntr_bist; + struct ice_cgu_info *cgu_info = &pf->cgu_info; + union nac_cgu_dword10 dw10; + union nac_cgu_dword11 dw11; + union nac_cgu_dword12 dw12; + union nac_cgu_dword14 dw14; + union nac_cgu_dword24 dw24; + union nac_cgu_dword9 dw9; + int err; + + init_waitqueue_head(&cgu_info->wq_head); + mutex_init(&cgu_info->event_mutex); + + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD9, &dw9.val); + if (!err) + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (!err) + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD11, &dw11.val); + if (!err) + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD12, &dw12.val); + if (!err) + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD14, &dw14.val); + if (!err) + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD24, &dw24.val); + if (!err) + err = ice_cgu_reg_read(pf, TSPLL_CNTR_BIST_SETTINGS, &tspll_cntr_bist.val); + if (err) + goto err; + + /* Note that the TIME_SYNC, TIME_REF, and ONE_PPS_OUT pins are enabled + * through soft straps. + */ + /* Mux config */ + cgu_info->mux_cfg.ref_clk_src = dw10.field.synce_s_ref_clk; + cgu_info->mux_cfg.byp_clk_src = dw11.field.synce_s_byp_clk; + cgu_info->mux_cfg.eth_clk_out = dw10.field.synce_ethclko_sel; + cgu_info->mux_cfg.clk_out = dw10.field.synce_clko_sel; + cgu_info->mux_cfg.clk_out_div = dw10.field.synce_clkodiv_m1; + cgu_info->mux_cfg.eth_clk_out_div = dw10.field.synce_ethdiv_m1; + cgu_info->mux_cfg.bypass = dw12.field.synce_dpll_byp; + cgu_info->mux_cfg.ref_clk_gnd_ena = dw10.field.synce_sel_gnd; + + /* Timestamp PLL config */ + /* Disable sticky lock detection so lock status reported is accurate */ + tspll_cntr_bist.field.i_plllock_sel_0 = 0; + tspll_cntr_bist.field.i_plllock_sel_1 = 0; + err = ice_cgu_reg_write(pf, TSPLL_CNTR_BIST_SETTINGS, tspll_cntr_bist.val); + + /* Assume the 1588 output to CGU isn't configured; require the app to reconfigure it before + * using it + */ + if (!err) + cgu_info->out_1588_enabled = false; + + /* first, try to lock the timestamp PLL with the parameters from the soft straps */ + /* disable first, then re-enable with correct parameters */ + err = ice_cgu_cfg_ts_pll(pf, false, dw9.field.time_ref_freq_sel, dw24.field.time_ref_sel, + ICE_SRC_TMR_MODE_NANOSECONDS); + if (err) + dev_err(ice_pf_to_dev(pf), "Failed to disable TS PLL\n"); + else + err = ice_cgu_cfg_ts_pll(pf, true, dw9.field.time_ref_freq_sel, + dw24.field.time_ref_sel, ICE_SRC_TMR_MODE_NANOSECONDS); + if (err) { + /* if that fails, try to lock the timestamp PLL with the TCXO + */ + dev_info(ice_pf_to_dev(pf), + "Unable to lock TS PLL with soft straps settings; trying TCXO\n"); + + /* disable first, then re-enable with correct parameters */ + err = ice_cgu_cfg_ts_pll(pf, false, ICE_TIME_REF_FREQ_25_000, + ICE_CGU_TIME_REF_SEL_TCXO, + ICE_SRC_TMR_MODE_NANOSECONDS); + if (err) + dev_err(ice_pf_to_dev(pf), "Failed to disable TS PLL with TCXO\n"); + else + err = ice_cgu_cfg_ts_pll(pf, true, ICE_TIME_REF_FREQ_25_000, + ICE_CGU_TIME_REF_SEL_TCXO, + ICE_SRC_TMR_MODE_NANOSECONDS); + if (err) { + dev_err(ice_pf_to_dev(pf), "Failed to lock TS PLL with TCXO\n"); + goto err; + } + } + + dev_info(ice_pf_to_dev(pf), "CGU init successful\n"); + return; +err: + dev_err(ice_pf_to_dev(pf), "CGU init failed, err=%d\n", err); +} diff --git a/drivers/net/ethernet/intel/ice/ice_cgu_ops.h b/drivers/net/ethernet/intel/ice/ice_cgu_ops.h new file mode 100644 index 000000000000..9ba1ad7d939f --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_cgu_ops.h @@ -0,0 +1,121 @@ +/* 
SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_CGU_OPS_H_ +#define _ICE_CGU_OPS_H_ + +#define ICE_CGU_LOCK_CHECK_DELAY_USEC 256000 /* 256 msec */ + +/* fast mode lock check settings */ +#define ICE_CGU_EDPLL_FAST_LOCK_DELAY_LOOPS 239 /* 60 seconds total */ +#define ICE_CGU_TDPLL_FAST_LOCK_DELAY_LOOPS 25 /* 5 seconds total */ + +/* normal mode lock check settings */ +#define ICE_CGU_EDPLL_NORMAL_LOCK_DELAY_LOOPS 52 /* 12 seconds total */ +#define ICE_CGU_TDPLL_NORMAL_LOCK_DELAY_LOOPS 13 /* 2 seconds total */ + +/* number of consecutive locks to declare DPLL lock */ +#define ICE_CGU_DPLL_LOCK_COUNT 5 + +#define ICE_CGU_CORE_CLOCK_MHZ 800 +#define ICE_CGU_DPLL_FREQ_MHZ 25 + +/* DPLL lock/unlock threshold */ +#define ICE_CGU_TRANSPORT_DPLL_LOCK_THRESHOLD_800MHZ 0x2D8 +#define ICE_CGU_TRANSPORT_DPLL_UNLOCK_THRESHOLD_800MHZ 0x3640 +#define ICE_CGU_ECC_DPLL_LOCK_THRESHOLD_800MHZ 0x5A +#define ICE_CGU_ECC_DPLL_UNLOCK_THRESHOLD_800MHZ 0x21E8 + +/* time to hold enable bits low to perform a JAPLL reset */ +#define ICE_CGU_JAPLL_RESET_TIME_USEC 1 + +/* LCPLL lock alone (FDPLL disabled) should take < 10 usec */ +#define ICE_CGU_LCPLL_LOCK_CHECK_DELAY_USEC 1 +#define ICE_CGU_LCPLL_LOCK_DELAY_LOOPS 10 + +/* FDPLL lock time in fast mode is around 500 msec; + * use poll interval of 100ms, max poll time 5 seconds + * (max poll time was originally 2 seconds, increased + * to 5 to avoid occasional poll timeouts.) + */ +#define ICE_CGU_FDPLL_LOCK_CHECK_DELAY_USEC 100000 +#define ICE_CGU_FDPLL_LOCK_DELAY_LOOPS 50 +#define ICE_CGU_FDPLL_ACQ_TOGGLE_LOOPS 2 + + +/* valid values for enum ice_cgu_clko_sel */ +#define ICE_CGU_CLKO_SEL_VALID_BITMAP \ + (BIT(ICE_CGU_CLKO_SEL_REF_CLK_BYP0_DIV) | \ + BIT(ICE_CGU_CLKO_SEL_REF_CLK_BYP1_DIV) | \ + BIT(ICE_CGU_CLKO_SEL_CLK_SYS_DIV) | \ + BIT(ICE_CGU_CLKO_SEL_CLK_JAPLL_625000_DIV) | \ + BIT(ICE_CGU_CLKO_SEL_REF_CLK_BYP0) | \ + BIT(ICE_CGU_CLKO_SEL_REF_CLK_BYP1) | \ + BIT(ICE_CGU_CLKO_SEL_CLK_1544) | \ + BIT(ICE_CGU_CLKO_SEL_CLK_2048)) + +/* Only FW can read NAC_CGU_DWORD8 where these are defined, so they are exposed + * to the driver stack via soft straps in the misc24 field of NAC_CGU_DWORD9. 
+ */ +#define MISC24_BIT_TCXO_FREQ_SEL_M BIT(0) +#define MISC24_BIT_TCXO_SEL_M BIT(4) + +/* internal structure definitions */ + +enum ice_cgu_sample_rate { + ICE_CGU_SAMPLE_RATE_8K = 0, /* 8 KHz sample rate */ + ICE_CGU_SAMPLE_RATE_10K, /* 10 KHz sample rate */ + ICE_CGU_SAMPLE_RATE_12K5, /* 12.5 KHz sample rate */ + + NUM_ICE_CGU_SAMPLE_RATE +}; + +struct ice_cgu_div_rat_m1 { + u32 ref_clk_rate; /* reference clock rate in kHz */ + u32 div_rat_m1; /* div_rat_m1 value */ +}; + +struct ice_cgu_dpll_params { + enum ice_cgu_dpll_select dpll_select; + enum ice_cgu_sample_rate sample_rate; + u32 mul_rat_m1; + u32 scale; + u32 gain; +}; + +struct ice_cgu_dpll_per_rate_params { + u32 rate_hz; + enum ice_cgu_sample_rate sample_rate; + u32 div_rat_m1; + u32 synce_rat_sel; +}; + +struct ice_cgu_lcpll_per_rate_params { + u32 refclk_pre_div; + u32 feedback_div; + u32 frac_n_div; + u32 post_pll_div; +}; + +#if IS_ENABLED(CONFIG_PTP_1588_CLOCK) +/* Function to init internal state */ +void ice_cgu_init_state(struct ice_pf *pf); +/* Function to configure TS PLL */ +int +ice_cgu_cfg_ts_pll(struct ice_pf *pf, bool enable, enum ice_time_ref_freq time_ref_freq, + enum ice_cgu_time_ref_sel time_ref_sel, + enum ice_src_tmr_mode src_tmr_mode); +#else /* IS_ENABLED(CONFIG_PTP_1588_CLOCK) */ +static inline void ice_cgu_init_state(struct ice_pf *pf) { } +#if IS_ENABLED(CONFIG_DEBUG_FS) +static inline int +ice_cgu_cfg_ts_pll(struct ice_pf __always_unused *pf, bool __always_unused enable, + enum ice_time_ref_freq __always_unused time_ref_freq, + enum ice_cgu_time_ref_sel __always_unused time_ref_sel, + enum ice_src_tmr_mode __always_unused src_tmr_mode) +{ + return 0; +} +#endif /* IS_ENABLED(CONFIG_DEBUG_FS) */ +#endif /* IS_ENABLED(CONFIG_PTP_1588_CLOCK) */ +#endif /* _ICE_CGU_OPS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_cgu_regs.h b/drivers/net/ethernet/intel/ice/ice_cgu_regs.h new file mode 100644 index 000000000000..a58fc697e6f5 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_cgu_regs.h @@ -0,0 +1,941 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#ifndef _ICE_CGU_REGS_H_ +#define _ICE_CGU_REGS_H_ + +#include "ice_osdep.h" + +#define NAC_CGU_DWORD8 0x20 +#define NAC_CGU_DWORD8_TCXO_FREQ_SEL_S 0 +#define NAC_CGU_DWORD8_TCXO_FREQ_SEL_M BIT(0) +#define NAC_CGU_DWORD8_MISC8_S 1 +#define NAC_CGU_DWORD8_MISC8_M ICE_M(0x7, 1) +#define NAC_CGU_DWORD8_HLP_SWITCH_FREQ_SEL_S 4 +#define NAC_CGU_DWORD8_HLP_SWITCH_FREQ_SEL_M ICE_M(0xf, 4) +#define NAC_CGU_DWORD8_CGUPLL_NDIVRATIO_S 8 +#define NAC_CGU_DWORD8_CGUPLL_NDIVRATIO_M ICE_M(0xf, 8) +#define NAC_CGU_DWORD8_CGUPLL_IREF_NDIVRATIO_S 12 +#define NAC_CGU_DWORD8_CGUPLL_IREF_NDIVRATIO_M ICE_M(0x7, 12) +#define NAC_CGU_DWORD8_MISC28_S 15 +#define NAC_CGU_DWORD8_MISC28_M BIT(15) +#define NAC_CGU_DWORD8_HLPPLL_NDIVRATIO_S 16 +#define NAC_CGU_DWORD8_HLPPLL_NDIVRATIO_M ICE_M(0xf, 16) +#define NAC_CGU_DWORD8_HLPPLL_IREF_NDIVRATIO_S 20 +#define NAC_CGU_DWORD8_HLPPLL_IREF_NDIVRATIO_M ICE_M(0x7, 20) +#define NAC_CGU_DWORD8_MISC29_S 23 +#define NAC_CGU_DWORD8_MISC29_M BIT(23) +#define NAC_CGU_DWORD8_CLK_EREF1_EN_SELFBIAS_S 24 +#define NAC_CGU_DWORD8_CLK_EREF1_EN_SELFBIAS_M BIT(24) +#define NAC_CGU_DWORD8_CLK_EREF0_EN_SELFBIAS_S 25 +#define NAC_CGU_DWORD8_CLK_EREF0_EN_SELFBIAS_M BIT(25) +#define NAC_CGU_DWORD8_TIME_REF_EN_SELFBIAS_S 26 +#define NAC_CGU_DWORD8_TIME_REF_EN_SELFBIAS_M BIT(26) +#define NAC_CGU_DWORD8_TIME_SYNC_EN_SELFBIAS_S 27 +#define NAC_CGU_DWORD8_TIME_SYNC_EN_SELFBIAS_M BIT(27) +#define NAC_CGU_DWORD8_CLK_REF_SYNC_E_EN_SELFBIAS_S 28 +#define NAC_CGU_DWORD8_CLK_REF_SYNC_E_EN_SELFBIAS_M BIT(28) +#define NAC_CGU_DWORD8_NET_CLK_REF1_EN_SELFBIAS_S 29 +#define NAC_CGU_DWORD8_NET_CLK_REF1_EN_SELFBIAS_M BIT(29) +#define NAC_CGU_DWORD8_NET_CLK_REF0_EN_SELFBIAS_S 30 +#define NAC_CGU_DWORD8_NET_CLK_REF0_EN_SELFBIAS_M BIT(30) +#define NAC_CGU_DWORD8_TCXO_SEL_S 31 +#define NAC_CGU_DWORD8_TCXO_SEL_M BIT(31) + +union nac_cgu_dword8 { + struct { + u32 tcxo_freq_sel : 1; + u32 misc8 : 3; + u32 hlp_switch_freq_sel : 4; + u32 cgupll_ndivratio : 4; + u32 cgupll_iref_ndivratio : 3; + u32 misc28 : 1; + u32 hlppll_ndivratio : 4; + u32 hlppll_iref_ndivratio : 3; + u32 misc29 : 1; + u32 clk_eref1_en_selfbias : 1; + u32 clk_eref0_en_selfbias : 1; + u32 time_ref_en_selfbias : 1; + u32 time_sync_en_selfbias : 1; + u32 clk_ref_sync_e_en_selfbias : 1; + u32 net_clk_ref1_en_selfbias : 1; + u32 net_clk_ref0_en_selfbias : 1; + u32 tcxo_sel : 1; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD9 0x24 +#define NAC_CGU_DWORD9_TIME_REF_FREQ_SEL_S 0 +#define NAC_CGU_DWORD9_TIME_REF_FREQ_SEL_M ICE_M(0x7, 0) +#define NAC_CGU_DWORD9_CLK_EREF1_EN_S 3 +#define NAC_CGU_DWORD9_CLK_EREF1_EN_M BIT(3) +#define NAC_CGU_DWORD9_CLK_EREF0_EN_S 4 +#define NAC_CGU_DWORD9_CLK_EREF0_EN_M BIT(4) +#define NAC_CGU_DWORD9_TIME_REF_EN_S 5 +#define NAC_CGU_DWORD9_TIME_REF_EN_M BIT(5) +#define NAC_CGU_DWORD9_TIME_SYNC_EN_S 6 +#define NAC_CGU_DWORD9_TIME_SYNC_EN_M BIT(6) +#define NAC_CGU_DWORD9_ONE_PPS_OUT_EN_S 7 +#define NAC_CGU_DWORD9_ONE_PPS_OUT_EN_M BIT(7) +#define NAC_CGU_DWORD9_CLK_REF_SYNCE_EN_S 8 +#define NAC_CGU_DWORD9_CLK_REF_SYNCE_EN_M BIT(8) +#define NAC_CGU_DWORD9_CLK_SYNCE1_EN_S 9 +#define NAC_CGU_DWORD9_CLK_SYNCE1_EN_M BIT(9) +#define NAC_CGU_DWORD9_CLK_SYNCE0_EN_S 10 +#define NAC_CGU_DWORD9_CLK_SYNCE0_EN_M BIT(10) +#define NAC_CGU_DWORD9_NET_CLK_REF1_EN_S 11 +#define NAC_CGU_DWORD9_NET_CLK_REF1_EN_M BIT(11) +#define NAC_CGU_DWORD9_NET_CLK_REF0_EN_S 12 +#define NAC_CGU_DWORD9_NET_CLK_REF0_EN_M BIT(12) +#define NAC_CGU_DWORD9_CLK_SYNCE1_AMP_S 13 +#define NAC_CGU_DWORD9_CLK_SYNCE1_AMP_M ICE_M(0x3, 13) +#define NAC_CGU_DWORD9_MISC6_S 
15 +#define NAC_CGU_DWORD9_MISC6_M BIT(15) +#define NAC_CGU_DWORD9_CLK_SYNCE0_AMP_S 16 +#define NAC_CGU_DWORD9_CLK_SYNCE0_AMP_M ICE_M(0x3, 16) +#define NAC_CGU_DWORD9_ONE_PPS_OUT_AMP_S 18 +#define NAC_CGU_DWORD9_ONE_PPS_OUT_AMP_M ICE_M(0x3, 18) +#define NAC_CGU_DWORD9_MISC24_S 20 +#define NAC_CGU_DWORD9_MISC24_M ICE_M(0xfff, 20) + +union nac_cgu_dword9 { + struct { + u32 time_ref_freq_sel : 3; + u32 clk_eref1_en : 1; + u32 clk_eref0_en : 1; + u32 time_ref_en : 1; + u32 time_sync_en : 1; + u32 one_pps_out_en : 1; + u32 clk_ref_synce_en : 1; + u32 clk_synce1_en : 1; + u32 clk_synce0_en : 1; + u32 net_clk_ref1_en : 1; + u32 net_clk_ref0_en : 1; + u32 clk_synce1_amp : 2; + u32 misc6 : 1; + u32 clk_synce0_amp : 2; + u32 one_pps_out_amp : 2; + u32 misc24 : 12; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD10 0x28 +#define NAC_CGU_DWORD10_JA_PLL_ENABLE_S 0 +#define NAC_CGU_DWORD10_JA_PLL_ENABLE_M BIT(0) +#define NAC_CGU_DWORD10_MISC11_S 1 +#define NAC_CGU_DWORD10_MISC11_M BIT(1) +#define NAC_CGU_DWORD10_FDPLL_ENABLE_S 2 +#define NAC_CGU_DWORD10_FDPLL_ENABLE_M BIT(2) +#define NAC_CGU_DWORD10_FDPLL_SLOW_S 3 +#define NAC_CGU_DWORD10_FDPLL_SLOW_M BIT(3) +#define NAC_CGU_DWORD10_FDPLL_LOCK_INT_ENB_S 4 +#define NAC_CGU_DWORD10_FDPLL_LOCK_INT_ENB_M BIT(4) +#define NAC_CGU_DWORD10_SYNCE_CLKO_SEL_S 5 +#define NAC_CGU_DWORD10_SYNCE_CLKO_SEL_M ICE_M(0xf, 5) +#define NAC_CGU_DWORD10_SYNCE_CLKODIV_M1_S 9 +#define NAC_CGU_DWORD10_SYNCE_CLKODIV_M1_M ICE_M(0x1f, 9) +#define NAC_CGU_DWORD10_SYNCE_CLKODIV_LOAD_S 14 +#define NAC_CGU_DWORD10_SYNCE_CLKODIV_LOAD_M BIT(14) +#define NAC_CGU_DWORD10_SYNCE_DCK_RST_S 15 +#define NAC_CGU_DWORD10_SYNCE_DCK_RST_M BIT(15) +#define NAC_CGU_DWORD10_SYNCE_ETHCLKO_SEL_S 16 +#define NAC_CGU_DWORD10_SYNCE_ETHCLKO_SEL_M ICE_M(0x7, 16) +#define NAC_CGU_DWORD10_SYNCE_ETHDIV_M1_S 19 +#define NAC_CGU_DWORD10_SYNCE_ETHDIV_M1_M ICE_M(0x1f, 19) +#define NAC_CGU_DWORD10_SYNCE_ETHDIV_LOAD_S 24 +#define NAC_CGU_DWORD10_SYNCE_ETHDIV_LOAD_M BIT(24) +#define NAC_CGU_DWORD10_SYNCE_DCK2_RST_S 25 +#define NAC_CGU_DWORD10_SYNCE_DCK2_RST_M BIT(25) +#define NAC_CGU_DWORD10_SYNCE_SEL_GND_S 26 +#define NAC_CGU_DWORD10_SYNCE_SEL_GND_M BIT(26) +#define NAC_CGU_DWORD10_SYNCE_S_REF_CLK_S 27 +#define NAC_CGU_DWORD10_SYNCE_S_REF_CLK_M ICE_M(0x1f, 27) + +union nac_cgu_dword10 { + struct { + u32 ja_pll_enable : 1; + u32 misc11 : 1; + u32 fdpll_enable : 1; + u32 fdpll_slow : 1; + u32 fdpll_lock_int_enb : 1; + u32 synce_clko_sel : 4; + u32 synce_clkodiv_m1 : 5; + u32 synce_clkodiv_load : 1; + u32 synce_dck_rst : 1; + u32 synce_ethclko_sel : 3; + u32 synce_ethdiv_m1 : 5; + u32 synce_ethdiv_load : 1; + u32 synce_dck2_rst : 1; + u32 synce_sel_gnd : 1; + u32 synce_s_ref_clk : 5; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD11 0x2c +#define NAC_CGU_DWORD11_MISC25_S 0 +#define NAC_CGU_DWORD11_MISC25_M BIT(0) +#define NAC_CGU_DWORD11_SYNCE_S_BYP_CLK_S 1 +#define NAC_CGU_DWORD11_SYNCE_S_BYP_CLK_M ICE_M(0x3f, 1) +#define NAC_CGU_DWORD11_SYNCE_HDOV_MODE_S 7 +#define NAC_CGU_DWORD11_SYNCE_HDOV_MODE_M BIT(7) +#define NAC_CGU_DWORD11_SYNCE_RAT_SEL_S 8 +#define NAC_CGU_DWORD11_SYNCE_RAT_SEL_M ICE_M(0x3, 8) +#define NAC_CGU_DWORD11_SYNCE_LINK_ENABLE_S 10 +#define NAC_CGU_DWORD11_SYNCE_LINK_ENABLE_M ICE_M(0xfffff, 10) +#define NAC_CGU_DWORD11_SYNCE_MISCLK_EN_S 30 +#define NAC_CGU_DWORD11_SYNCE_MISCLK_EN_M BIT(30) +#define NAC_CGU_DWORD11_SYNCE_MISCLK_RAT_M1_S 31 +#define NAC_CGU_DWORD11_SYNCE_MISCLK_RAT_M1_M BIT(31) + +union nac_cgu_dword11 { + struct { + u32 misc25 : 1; + u32 synce_s_byp_clk : 6; + u32 synce_hdov_mode : 
1; + u32 synce_rat_sel : 2; + u32 synce_link_enable : 20; + u32 synce_misclk_en : 1; + u32 synce_misclk_rat_m1 : 1; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD12 0x30 +#define NAC_CGU_DWORD12_SYNCE_MISCLK_RAT_M1_S 0 +#define NAC_CGU_DWORD12_SYNCE_MISCLK_RAT_M1_M ICE_M(0x3ff, 0) +#define NAC_CGU_DWORD12_SYNCE_MCK_RST_S 10 +#define NAC_CGU_DWORD12_SYNCE_MCK_RST_M BIT(10) +#define NAC_CGU_DWORD12_SYNCE_DPLL_BYP_S 11 +#define NAC_CGU_DWORD12_SYNCE_DPLL_BYP_M BIT(11) +#define NAC_CGU_DWORD12_SYNCE_DV_RAT_M1_S 12 +#define NAC_CGU_DWORD12_SYNCE_DV_RAT_M1_M ICE_M(0x1fff, 12) +#define NAC_CGU_DWORD12_SYNCE_ML_RAT_M1_S 25 +#define NAC_CGU_DWORD12_SYNCE_ML_RAT_M1_M ICE_M(0x7f, 25) + +union nac_cgu_dword12 { + struct { + u32 synce_misclk_rat_m1 : 10; + u32 synce_mck_rst : 1; + u32 synce_dpll_byp : 1; + u32 synce_dv_rat_m1 : 13; + u32 synce_ml_rat_m1 : 7; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD13 0x34 +#define NAC_CGU_DWORD13_SYNCE_ML_RAT_M1_S 0 +#define NAC_CGU_DWORD13_SYNCE_ML_RAT_M1_M ICE_M(0x1f, 0) +#define NAC_CGU_DWORD13_SYNCE_HDOV_CHANGED_S 5 +#define NAC_CGU_DWORD13_SYNCE_HDOV_CHANGED_M BIT(5) +#define NAC_CGU_DWORD13_SYNCE_LOCK_CHANGED_S 6 +#define NAC_CGU_DWORD13_SYNCE_LOCK_CHANGED_M BIT(6) +#define NAC_CGU_DWORD13_SYNCE_HDOV_S 7 +#define NAC_CGU_DWORD13_SYNCE_HDOV_M BIT(7) +#define NAC_CGU_DWORD13_SYNCE_HDOV_INT_ENB_S 8 +#define NAC_CGU_DWORD13_SYNCE_HDOV_INT_ENB_M BIT(8) +#define NAC_CGU_DWORD13_SYNCE_LOCK_INT_ENB_S 9 +#define NAC_CGU_DWORD13_SYNCE_LOCK_INT_ENB_M BIT(9) +#define NAC_CGU_DWORD13_SYNCE_LOCKED_NC_S 10 +#define NAC_CGU_DWORD13_SYNCE_LOCKED_NC_M BIT(10) +#define NAC_CGU_DWORD13_FDPLL_LOCKED_NC_S 11 +#define NAC_CGU_DWORD13_FDPLL_LOCKED_NC_M BIT(11) +#define NAC_CGU_DWORD13_SYNCE_LOCKED_CLEAR_S 12 +#define NAC_CGU_DWORD13_SYNCE_LOCKED_CLEAR_M BIT(12) +#define NAC_CGU_DWORD13_SYNCE_HDOV_CLEAR_S 13 +#define NAC_CGU_DWORD13_SYNCE_HDOV_CLEAR_M BIT(13) +#define NAC_CGU_DWORD13_FDPLL_LOCKED_CLEAR_S 14 +#define NAC_CGU_DWORD13_FDPLL_LOCKED_CLEAR_M BIT(14) +#define NAC_CGU_DWORD13_FDPLL_LOCK_CHANGED_S 15 +#define NAC_CGU_DWORD13_FDPLL_LOCK_CHANGED_M BIT(15) +#define NAC_CGU_DWORD13_RMNRXCLK_SEL_S 16 +#define NAC_CGU_DWORD13_RMNRXCLK_SEL_M ICE_M(0x1f, 16) +#define NAC_CGU_DWORD13_ENABLE_ETH_COUNT_S 21 +#define NAC_CGU_DWORD13_ENABLE_ETH_COUNT_M BIT(21) +#define NAC_CGU_DWORD13_ETH_COUNT_FAST_MODE_S 22 +#define NAC_CGU_DWORD13_ETH_COUNT_FAST_MODE_M BIT(22) +#define NAC_CGU_DWORD13_MISC12_S 23 +#define NAC_CGU_DWORD13_MISC12_M ICE_M(0x1ff, 23) + +union nac_cgu_dword13 { + struct { + u32 synce_ml_rat_m1 : 5; + u32 synce_hdov_changed : 1; + u32 synce_lock_changed : 1; + u32 synce_hdov : 1; + u32 synce_hdov_int_enb : 1; + u32 synce_lock_int_enb : 1; + u32 synce_locked_nc : 1; + u32 fdpll_locked_nc : 1; + u32 synce_locked_clear : 1; + u32 synce_hdov_clear : 1; + u32 fdpll_locked_clear : 1; + u32 fdpll_lock_changed : 1; + u32 rmnrxclk_sel : 5; + u32 enable_eth_count : 1; + u32 eth_count_fast_mode : 1; + u32 misc12 : 9; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD14 0x38 +#define NAC_CGU_DWORD14_SYNCE_LNK_UP_MD_S 0 +#define NAC_CGU_DWORD14_SYNCE_LNK_UP_MD_M BIT(0) +#define NAC_CGU_DWORD14_SYNCE_LNK_DN_MD_S 1 +#define NAC_CGU_DWORD14_SYNCE_LNK_DN_MD_M BIT(1) +#define NAC_CGU_DWORD14_SYNCE_FAST_MODE_S 2 +#define NAC_CGU_DWORD14_SYNCE_FAST_MODE_M BIT(2) +#define NAC_CGU_DWORD14_SYNCE_EEC_MODE_S 3 +#define NAC_CGU_DWORD14_SYNCE_EEC_MODE_M BIT(3) +#define NAC_CGU_DWORD14_SYNCE_NGAIN_S 4 +#define NAC_CGU_DWORD14_SYNCE_NGAIN_M ICE_M(0xff, 4) +#define 
NAC_CGU_DWORD14_SYNCE_NSCALE_S 12 +#define NAC_CGU_DWORD14_SYNCE_NSCALE_M ICE_M(0x3f, 12) +#define NAC_CGU_DWORD14_SYNCE_UNLCK_THR_S 18 +#define NAC_CGU_DWORD14_SYNCE_UNLCK_THR_M ICE_M(0x3fff, 18) + +union nac_cgu_dword14 { + struct { + u32 synce_lnk_up_md : 1; + u32 synce_lnk_dn_md : 1; + u32 synce_fast_mode : 1; + u32 synce_eec_mode : 1; + u32 synce_ngain : 8; + u32 synce_nscale : 6; + u32 synce_unlck_thr : 14; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD15 0x3c +#define NAC_CGU_DWORD15_SYNCE_UNLCK_THR_S 0 +#define NAC_CGU_DWORD15_SYNCE_UNLCK_THR_M ICE_M(0x7, 0) +#define NAC_CGU_DWORD15_SYNCE_LOCK_THR_S 3 +#define NAC_CGU_DWORD15_SYNCE_LOCK_THR_M ICE_M(0x1ffff, 3) +#define NAC_CGU_DWORD15_SYNCE_QUO_M1_S 20 +#define NAC_CGU_DWORD15_SYNCE_QUO_M1_M ICE_M(0x3f, 20) +#define NAC_CGU_DWORD15_SYNCE_REMNDR_S 26 +#define NAC_CGU_DWORD15_SYNCE_REMNDR_M ICE_M(0x3f, 26) + +union nac_cgu_dword15 { + struct { + u32 synce_unlck_thr : 3; + u32 synce_lock_thr : 17; + u32 synce_quo_m1 : 6; + u32 synce_remndr : 6; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD16 0x40 +#define NAC_CGU_DWORD16_SYNCE_REMNDR_S 0 +#define NAC_CGU_DWORD16_SYNCE_REMNDR_M ICE_M(0x3f, 0) +#define NAC_CGU_DWORD16_SYNCE_PHLMT_EN_S 6 +#define NAC_CGU_DWORD16_SYNCE_PHLMT_EN_M BIT(6) +#define NAC_CGU_DWORD16_MISC13_S 7 +#define NAC_CGU_DWORD16_MISC13_M ICE_M(0x1ffffff, 7) + +union nac_cgu_dword16 { + struct { + u32 synce_remndr : 6; + u32 synce_phlmt_en : 1; + u32 misc13 : 25; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD17 0x44 +#define NAC_CGU_DWORD17_FDPLL_GAIN_S 0 +#define NAC_CGU_DWORD17_FDPLL_GAIN_M ICE_M(0xf, 0) +#define NAC_CGU_DWORD17_FDPLL_SCALE_S 4 +#define NAC_CGU_DWORD17_FDPLL_SCALE_M ICE_M(0xf, 4) +#define NAC_CGU_DWORD17_FDPLL_FGAIN_SHIFT_F_S 8 +#define NAC_CGU_DWORD17_FDPLL_FGAIN_SHIFT_F_M ICE_M(0x3f, 8) +#define NAC_CGU_DWORD17_FDPLL_CLR_PHERR_S 14 +#define NAC_CGU_DWORD17_FDPLL_CLR_PHERR_M BIT(14) +#define NAC_CGU_DWORD17_FDPLL_BB_EN_S 15 +#define NAC_CGU_DWORD17_FDPLL_BB_EN_M BIT(15) +#define NAC_CGU_DWORD17_FDPLL_FGAIN_SHIFT_S 16 +#define NAC_CGU_DWORD17_FDPLL_FGAIN_SHIFT_M ICE_M(0x3f, 16) +#define NAC_CGU_DWORD17_FDPLL_FSCALE_SHIFT_S 22 +#define NAC_CGU_DWORD17_FDPLL_FSCALE_SHIFT_M ICE_M(0x1f, 22) +#define NAC_CGU_DWORD17_FDPLL_FSCALE_SHIFT_F_S 27 +#define NAC_CGU_DWORD17_FDPLL_FSCALE_SHIFT_F_M ICE_M(0x1f, 27) + +union nac_cgu_dword17 { + struct { + u32 fdpll_gain : 4; + u32 fdpll_scale : 4; + u32 fdpll_fgain_shift_f : 6; + u32 fdpll_clr_pherr : 1; + u32 fdpll_bb_en : 1; + u32 fdpll_fgain_shift : 6; + u32 fdpll_fscale_shift : 5; + u32 fdpll_fscale_shift_f : 5; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD18 0x48 +#define NAC_CGU_DWORD18_FDPLL_BYPASS_S 0 +#define NAC_CGU_DWORD18_FDPLL_BYPASS_M BIT(0) +#define NAC_CGU_DWORD18_FDPLL_INP_NCO_S 1 +#define NAC_CGU_DWORD18_FDPLL_INP_NCO_M ICE_M(0xff, 1) +#define NAC_CGU_DWORD18_FDPLL_AUTO_EN_S 9 +#define NAC_CGU_DWORD18_FDPLL_AUTO_EN_M BIT(9) +#define NAC_CGU_DWORD18_FDPLL_SAMP_CNT_S 10 +#define NAC_CGU_DWORD18_FDPLL_SAMP_CNT_M ICE_M(0xfff, 10) +#define NAC_CGU_DWORD18_FDPLL_LOCKCNT_S 22 +#define NAC_CGU_DWORD18_FDPLL_LOCKCNT_M ICE_M(0x1f, 22) +#define NAC_CGU_DWORD18_FDPLL_LOCK_THR_S 27 +#define NAC_CGU_DWORD18_FDPLL_LOCK_THR_M ICE_M(0x1f, 27) + +union nac_cgu_dword18 { + struct { + u32 fdpll_bypass : 1; + u32 fdpll_inp_nco : 8; + u32 fdpll_auto_en : 1; + u32 fdpll_samp_cnt : 12; + u32 fdpll_lockcnt : 5; + u32 fdpll_lock_thr : 5; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD19 0x4c +#define NAC_CGU_DWORD19_TSPLL_FBDIV_INTGR_S 0 +#define 
NAC_CGU_DWORD19_TSPLL_FBDIV_INTGR_M ICE_M(0xff, 0) +#define NAC_CGU_DWORD19_FDPLL_ULCK_THR_S 8 +#define NAC_CGU_DWORD19_FDPLL_ULCK_THR_M ICE_M(0x1f, 8) +#define NAC_CGU_DWORD19_MISC15_S 13 +#define NAC_CGU_DWORD19_MISC15_M ICE_M(0x7, 13) +#define NAC_CGU_DWORD19_TSPLL_NDIVRATIO_S 16 +#define NAC_CGU_DWORD19_TSPLL_NDIVRATIO_M ICE_M(0xf, 16) +#define NAC_CGU_DWORD19_TSPLL_IREF_NDIVRATIO_S 20 +#define NAC_CGU_DWORD19_TSPLL_IREF_NDIVRATIO_M ICE_M(0x7, 20) +#define NAC_CGU_DWORD19_MISC19_S 23 +#define NAC_CGU_DWORD19_MISC19_M BIT(23) +#define NAC_CGU_DWORD19_JAPLL_NDIVRATIO_S 24 +#define NAC_CGU_DWORD19_JAPLL_NDIVRATIO_M ICE_M(0xf, 24) +#define NAC_CGU_DWORD19_JAPLL_IREF_NDIVRATIO_S 28 +#define NAC_CGU_DWORD19_JAPLL_IREF_NDIVRATIO_M ICE_M(0x7, 28) +#define NAC_CGU_DWORD19_MISC27_S 31 +#define NAC_CGU_DWORD19_MISC27_M BIT(31) + +union nac_cgu_dword19 { + struct { + u32 tspll_fbdiv_intgr : 8; + u32 fdpll_ulck_thr : 5; + u32 misc15 : 3; + u32 tspll_ndivratio : 4; + u32 tspll_iref_ndivratio : 3; + u32 misc19 : 1; + u32 japll_ndivratio : 4; + u32 japll_iref_ndivratio : 3; + u32 misc27 : 1; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD20 0x50 +#define NAC_CGU_DWORD20_JAPLL_INT_DIV_S 0 +#define NAC_CGU_DWORD20_JAPLL_INT_DIV_M ICE_M(0xff, 0) +#define NAC_CGU_DWORD20_JAPLL_FRAC_DIV_S 8 +#define NAC_CGU_DWORD20_JAPLL_FRAC_DIV_M ICE_M(0x3fffff, 8) +#define NAC_CGU_DWORD20_MISC16_S 30 +#define NAC_CGU_DWORD20_MISC16_M ICE_M(0x3, 30) + +union nac_cgu_dword20 { + struct { + u32 japll_int_div : 8; + u32 japll_frac_div : 22; + u32 misc16 : 2; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD21 0x54 +#define NAC_CGU_DWORD21_MISC17_S 0 +#define NAC_CGU_DWORD21_MISC17_M ICE_M(0xf, 0) +#define NAC_CGU_DWORD21_FDPLL_INT_DIV_OUT_NC_S 4 +#define NAC_CGU_DWORD21_FDPLL_INT_DIV_OUT_NC_M ICE_M(0xff, 4) +#define NAC_CGU_DWORD21_FDPLL_FRAC_DIV_OUT_NC_S 12 +#define NAC_CGU_DWORD21_FDPLL_FRAC_DIV_OUT_NC_M ICE_M(0xfffff, 12) + +union nac_cgu_dword21 { + struct { + u32 misc17 : 4; + u32 fdpll_int_div_out_nc : 8; + u32 fdpll_frac_div_out_nc : 20; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD22 0x58 +#define NAC_CGU_DWORD22_FDPLL_FRAC_DIV_OUT_NC_S 0 +#define NAC_CGU_DWORD22_FDPLL_FRAC_DIV_OUT_NC_M ICE_M(0x3, 0) +#define NAC_CGU_DWORD22_FDPLL_LOCK_INT_FOR_S 2 +#define NAC_CGU_DWORD22_FDPLL_LOCK_INT_FOR_M BIT(2) +#define NAC_CGU_DWORD22_SYNCE_HDOV_INT_FOR_S 3 +#define NAC_CGU_DWORD22_SYNCE_HDOV_INT_FOR_M BIT(3) +#define NAC_CGU_DWORD22_SYNCE_LOCK_INT_FOR_S 4 +#define NAC_CGU_DWORD22_SYNCE_LOCK_INT_FOR_M BIT(4) +#define NAC_CGU_DWORD22_FDPLL_PHLEAD_SLIP_NC_S 5 +#define NAC_CGU_DWORD22_FDPLL_PHLEAD_SLIP_NC_M BIT(5) +#define NAC_CGU_DWORD22_FDPLL_ACC1_OVFL_NC_S 6 +#define NAC_CGU_DWORD22_FDPLL_ACC1_OVFL_NC_M BIT(6) +#define NAC_CGU_DWORD22_FDPLL_ACC2_OVFL_NC_S 7 +#define NAC_CGU_DWORD22_FDPLL_ACC2_OVFL_NC_M BIT(7) +#define NAC_CGU_DWORD22_SYNCE_STATUS_NC_S 8 +#define NAC_CGU_DWORD22_SYNCE_STATUS_NC_M ICE_M(0x3f, 8) +#define NAC_CGU_DWORD22_FDPLL_ACC1F_OVFL_S 14 +#define NAC_CGU_DWORD22_FDPLL_ACC1F_OVFL_M BIT(14) +#define NAC_CGU_DWORD22_MISC18_S 15 +#define NAC_CGU_DWORD22_MISC18_M BIT(15) +#define NAC_CGU_DWORD22_FDPLLCLK_DIV_S 16 +#define NAC_CGU_DWORD22_FDPLLCLK_DIV_M ICE_M(0xf, 16) +#define NAC_CGU_DWORD22_TIME1588CLK_DIV_S 20 +#define NAC_CGU_DWORD22_TIME1588CLK_DIV_M ICE_M(0xf, 20) +#define NAC_CGU_DWORD22_SYNCECLK_DIV_S 24 +#define NAC_CGU_DWORD22_SYNCECLK_DIV_M ICE_M(0xf, 24) +#define NAC_CGU_DWORD22_SYNCECLK_SEL_DIV2_S 28 +#define NAC_CGU_DWORD22_SYNCECLK_SEL_DIV2_M BIT(28) +#define 
NAC_CGU_DWORD22_FDPLLCLK_SEL_DIV2_S 29 +#define NAC_CGU_DWORD22_FDPLLCLK_SEL_DIV2_M BIT(29) +#define NAC_CGU_DWORD22_TIME1588CLK_SEL_DIV2_S 30 +#define NAC_CGU_DWORD22_TIME1588CLK_SEL_DIV2_M BIT(30) +#define NAC_CGU_DWORD22_MISC3_S 31 +#define NAC_CGU_DWORD22_MISC3_M BIT(31) + +union nac_cgu_dword22 { + struct { + u32 fdpll_frac_div_out_nc : 2; + u32 fdpll_lock_int_for : 1; + u32 synce_hdov_int_for : 1; + u32 synce_lock_int_for : 1; + u32 fdpll_phlead_slip_nc : 1; + u32 fdpll_acc1_ovfl_nc : 1; + u32 fdpll_acc2_ovfl_nc : 1; + u32 synce_status_nc : 6; + u32 fdpll_acc1f_ovfl : 1; + u32 misc18 : 1; + u32 fdpllclk_div : 4; + u32 time1588clk_div : 4; + u32 synceclk_div : 4; + u32 synceclk_sel_div2 : 1; + u32 fdpllclk_sel_div2 : 1; + u32 time1588clk_sel_div2 : 1; + u32 misc3 : 1; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD24 0x60 +#define NAC_CGU_DWORD24_TSPLL_FBDIV_FRAC_S 0 +#define NAC_CGU_DWORD24_TSPLL_FBDIV_FRAC_M ICE_M(0x3fffff, 0) +#define NAC_CGU_DWORD24_MISC20_S 22 +#define NAC_CGU_DWORD24_MISC20_M ICE_M(0x3, 22) +#define NAC_CGU_DWORD24_TS_PLL_ENABLE_S 24 +#define NAC_CGU_DWORD24_TS_PLL_ENABLE_M BIT(24) +#define NAC_CGU_DWORD24_TIME_SYNC_TSPLL_ALIGN_SEL_S 25 +#define NAC_CGU_DWORD24_TIME_SYNC_TSPLL_ALIGN_SEL_M BIT(25) +#define NAC_CGU_DWORD24_EXT_SYNCE_SEL_S 26 +#define NAC_CGU_DWORD24_EXT_SYNCE_SEL_M BIT(26) +#define NAC_CGU_DWORD24_REF1588_CK_DIV_S 27 +#define NAC_CGU_DWORD24_REF1588_CK_DIV_M ICE_M(0xf, 27) +#define NAC_CGU_DWORD24_TIME_REF_SEL_S 31 +#define NAC_CGU_DWORD24_TIME_REF_SEL_M BIT(31) + +union nac_cgu_dword24 { + struct { + u32 tspll_fbdiv_frac : 22; + u32 misc20 : 2; + u32 ts_pll_enable : 1; + u32 time_sync_tspll_align_sel : 1; + u32 ext_synce_sel : 1; + u32 ref1588_ck_div : 4; + u32 time_ref_sel : 1; + } field; + u32 val; +}; + +#define TSPLL_CNTR_BIST_SETTINGS 0x344 +#define TSPLL_CNTR_BIST_SETTINGS_I_IREFGEN_SETTLING_TIME_CNTR_7_0_S 0 +#define TSPLL_CNTR_BIST_SETTINGS_I_IREFGEN_SETTLING_TIME_CNTR_7_0_M \ + ICE_M(0xff, 0) +#define TSPLL_CNTR_BIST_SETTINGS_I_IREFGEN_SETTLING_TIME_RO_STANDBY_1_0_S 8 +#define TSPLL_CNTR_BIST_SETTINGS_I_IREFGEN_SETTLING_TIME_RO_STANDBY_1_0_M \ + ICE_M(0x3, 8) +#define TSPLL_CNTR_BIST_SETTINGS_RESERVED195_S 10 +#define TSPLL_CNTR_BIST_SETTINGS_RESERVED195_M ICE_M(0x1f, 10) +#define TSPLL_CNTR_BIST_SETTINGS_I_PLLLOCK_SEL_0_S 15 +#define TSPLL_CNTR_BIST_SETTINGS_I_PLLLOCK_SEL_0_M BIT(15) +#define TSPLL_CNTR_BIST_SETTINGS_I_PLLLOCK_SEL_1_S 16 +#define TSPLL_CNTR_BIST_SETTINGS_I_PLLLOCK_SEL_1_M BIT(16) +#define TSPLL_CNTR_BIST_SETTINGS_I_PLLLOCK_CNT_6_0_S 17 +#define TSPLL_CNTR_BIST_SETTINGS_I_PLLLOCK_CNT_6_0_M ICE_M(0x7f, 17) +#define TSPLL_CNTR_BIST_SETTINGS_I_PLLLOCK_CNT_10_7_S 24 +#define TSPLL_CNTR_BIST_SETTINGS_I_PLLLOCK_CNT_10_7_M ICE_M(0xf, 24) +#define TSPLL_CNTR_BIST_SETTINGS_RESERVED200_S 28 +#define TSPLL_CNTR_BIST_SETTINGS_RESERVED200_M ICE_M(0xf, 28) + +union tspll_cntr_bist_settings { + struct { + u32 i_irefgen_settling_time_cntr_7_0 : 8; + u32 i_irefgen_settling_time_ro_standby_1_0 : 2; + u32 reserved195 : 5; + u32 i_plllock_sel_0 : 1; + u32 i_plllock_sel_1 : 1; + u32 i_plllock_cnt_6_0 : 7; + u32 i_plllock_cnt_10_7 : 4; + u32 reserved200 : 4; + } field; + u32 val; +}; + +#define TSPLL_RO_BWM_LF 0x370 +#define TSPLL_RO_BWM_LF_BW_FREQOV_HIGH_CRI_7_0_S 0 +#define TSPLL_RO_BWM_LF_BW_FREQOV_HIGH_CRI_7_0_M ICE_M(0xff, 0) +#define TSPLL_RO_BWM_LF_BW_FREQOV_HIGH_CRI_9_8_S 8 +#define TSPLL_RO_BWM_LF_BW_FREQOV_HIGH_CRI_9_8_M ICE_M(0x3, 8) +#define TSPLL_RO_BWM_LF_BIASCALDONE_CRI_S 10 +#define TSPLL_RO_BWM_LF_BIASCALDONE_CRI_M BIT(10) 
+#define TSPLL_RO_BWM_LF_PLLLOCK_GAIN_TRAN_CRI_S 11 +#define TSPLL_RO_BWM_LF_PLLLOCK_GAIN_TRAN_CRI_M BIT(11) +#define TSPLL_RO_BWM_LF_PLLLOCK_TRUE_LOCK_CRI_S 12 +#define TSPLL_RO_BWM_LF_PLLLOCK_TRUE_LOCK_CRI_M BIT(12) +#define TSPLL_RO_BWM_LF_PLLUNLOCK_FLAG_CRI_S 13 +#define TSPLL_RO_BWM_LF_PLLUNLOCK_FLAG_CRI_M BIT(13) +#define TSPLL_RO_BWM_LF_AFCERR_CRI_S 14 +#define TSPLL_RO_BWM_LF_AFCERR_CRI_M BIT(14) +#define TSPLL_RO_BWM_LF_AFCDONE_CRI_S 15 +#define TSPLL_RO_BWM_LF_AFCDONE_CRI_M BIT(15) +#define TSPLL_RO_BWM_LF_FEEDFWRDGAIN_CAL_CRI_7_0_S 16 +#define TSPLL_RO_BWM_LF_FEEDFWRDGAIN_CAL_CRI_7_0_M ICE_M(0xff, 16) +#define TSPLL_RO_BWM_LF_M2FBDIVMOD_CRI_7_0_S 24 +#define TSPLL_RO_BWM_LF_M2FBDIVMOD_CRI_7_0_M ICE_M(0xff, 24) + +union tspll_ro_bwm_lf { + struct { + u32 bw_freqov_high_cri_7_0 : 8; + u32 bw_freqov_high_cri_9_8 : 2; + u32 biascaldone_cri : 1; + u32 plllock_gain_tran_cri : 1; + u32 plllock_true_lock_cri : 1; + u32 pllunlock_flag_cri : 1; + u32 afcerr_cri : 1; + u32 afcdone_cri : 1; + u32 feedfwrdgain_cal_cri_7_0 : 8; + u32 m2fbdivmod_cri_7_0 : 8; + } field; + u32 val; +}; + +#define JAPLL_DIV0 0x400 +#define JAPLL_DIV0_I_FBDIV_INTGR_7_0_S 0 +#define JAPLL_DIV0_I_FBDIV_INTGR_7_0_M ICE_M(0xff, 0) +#define JAPLL_DIV0_I_FBDIV_FRAC_7_0_S 8 +#define JAPLL_DIV0_I_FBDIV_FRAC_7_0_M ICE_M(0xff, 8) +#define JAPLL_DIV0_I_FBDIV_FRAC_15_8_S 16 +#define JAPLL_DIV0_I_FBDIV_FRAC_15_8_M ICE_M(0xff, 16) +#define JAPLL_DIV0_I_FBDIV_FRAC_21_16_S 24 +#define JAPLL_DIV0_I_FBDIV_FRAC_21_16_M ICE_M(0x3f, 24) +#define JAPLL_DIV0_I_FRACNEN_H_S 30 +#define JAPLL_DIV0_I_FRACNEN_H_M BIT(30) +#define JAPLL_DIV0_I_DIRECT_PIN_IF_EN_S 31 +#define JAPLL_DIV0_I_DIRECT_PIN_IF_EN_M BIT(31) + +union japll_div0 { + struct { + u32 i_fbdiv_intgr_7_0 : 8; + u32 i_fbdiv_frac_7_0 : 8; + u32 i_fbdiv_frac_15_8 : 8; + u32 i_fbdiv_frac_21_16 : 6; + u32 i_fracnen_h : 1; + u32 i_direct_pin_if_en : 1; + } field; + u32 val; +}; + +#define JAPLL_LF 0x408 +#define JAPLL_LF_I_PROP_COEFF_3_0_S 0 +#define JAPLL_LF_I_PROP_COEFF_3_0_M ICE_M(0xf, 0) +#define JAPLL_LF_I_FLL_INT_COEFF_3_0_S 4 +#define JAPLL_LF_I_FLL_INT_COEFF_3_0_M ICE_M(0xf, 4) +#define JAPLL_LF_I_INT_COEFF_4_0_S 8 +#define JAPLL_LF_I_INT_COEFF_4_0_M ICE_M(0x1f, 8) +#define JAPLL_LF_I_FLL_EN_H_S 13 +#define JAPLL_LF_I_FLL_EN_H_M BIT(13) +#define JAPLL_LF_I_TDC_FINE_RES_S 14 +#define JAPLL_LF_I_TDC_FINE_RES_M BIT(14) +#define JAPLL_LF_I_DCOFINE_RESOLUTION_S 15 +#define JAPLL_LF_I_DCOFINE_RESOLUTION_M BIT(15) +#define JAPLL_LF_I_GAINCTRL_2_0_S 16 +#define JAPLL_LF_I_GAINCTRL_2_0_M ICE_M(0x7, 16) +#define JAPLL_LF_I_AFC_DIVRATIO_S 19 +#define JAPLL_LF_I_AFC_DIVRATIO_M BIT(19) +#define JAPLL_LF_I_AFCCNTSEL_S 20 +#define JAPLL_LF_I_AFCCNTSEL_M BIT(20) +#define JAPLL_LF_I_AFC_STARTUP_1_0_S 21 +#define JAPLL_LF_I_AFC_STARTUP_1_0_M ICE_M(0x3, 21) +#define JAPLL_LF_RESERVED31_S 23 +#define JAPLL_LF_RESERVED31_M BIT(23) +#define JAPLL_LF_I_TDCTARGETCNT_7_0_S 24 +#define JAPLL_LF_I_TDCTARGETCNT_7_0_M ICE_M(0xff, 24) + +union japll_lf { + struct { + u32 i_prop_coeff_3_0 : 4; + u32 i_fll_int_coeff_3_0 : 4; + u32 i_int_coeff_4_0 : 5; + u32 i_fll_en_h : 1; + u32 i_tdc_fine_res : 1; + u32 i_dcofine_resolution : 1; + u32 i_gainctrl_2_0 : 3; + u32 i_afc_divratio : 1; + u32 i_afccntsel : 1; + u32 i_afc_startup_1_0 : 2; + u32 reserved31 : 1; + u32 i_tdctargetcnt_7_0 : 8; + } field; + u32 val; +}; + +#define JAPLL_FRAC_LOCK 0x40c +#define JAPLL_FRAC_LOCK_I_FEEDFWRDGAIN_7_0_S 0 +#define JAPLL_FRAC_LOCK_I_FEEDFWRDGAIN_7_0_M ICE_M(0xff, 0) +#define JAPLL_FRAC_LOCK_I_FEEDFWRDCAL_EN_H_S 8 +#define 
JAPLL_FRAC_LOCK_I_FEEDFWRDCAL_EN_H_M BIT(8) +#define JAPLL_FRAC_LOCK_I_FEEDFWRDCAL_PAUSE_H_S 9 +#define JAPLL_FRAC_LOCK_I_FEEDFWRDCAL_PAUSE_H_M BIT(9) +#define JAPLL_FRAC_LOCK_I_DCODITHEREN_H_S 10 +#define JAPLL_FRAC_LOCK_I_DCODITHEREN_H_M BIT(10) +#define JAPLL_FRAC_LOCK_I_LOCKTHRESH_3_0_S 11 +#define JAPLL_FRAC_LOCK_I_LOCKTHRESH_3_0_M ICE_M(0xf, 11) +#define JAPLL_FRAC_LOCK_I_DCODITHER_CONFIG_S 15 +#define JAPLL_FRAC_LOCK_I_DCODITHER_CONFIG_M BIT(15) +#define JAPLL_FRAC_LOCK_I_EARLYLOCK_CRITERIA_1_0_S 16 +#define JAPLL_FRAC_LOCK_I_EARLYLOCK_CRITERIA_1_0_M ICE_M(0x3, 16) +#define JAPLL_FRAC_LOCK_I_TRUELOCK_CRITERIA_1_0_S 18 +#define JAPLL_FRAC_LOCK_I_TRUELOCK_CRITERIA_1_0_M ICE_M(0x3, 18) +#define JAPLL_FRAC_LOCK_I_LF_HALF_CYC_EN_S 20 +#define JAPLL_FRAC_LOCK_I_LF_HALF_CYC_EN_M BIT(20) +#define JAPLL_FRAC_LOCK_I_DITHER_OVRD_S 21 +#define JAPLL_FRAC_LOCK_I_DITHER_OVRD_M BIT(21) +#define JAPLL_FRAC_LOCK_I_PLLLC_RESTORE_REG_S 22 +#define JAPLL_FRAC_LOCK_I_PLLLC_RESTORE_REG_M BIT(22) +#define JAPLL_FRAC_LOCK_I_PLLLC_RESTORE_MODE_CTRL_S 23 +#define JAPLL_FRAC_LOCK_I_PLLLC_RESTORE_MODE_CTRL_M BIT(23) +#define JAPLL_FRAC_LOCK_I_PLLRAMPEN_H_S 24 +#define JAPLL_FRAC_LOCK_I_PLLRAMPEN_H_M BIT(24) +#define JAPLL_FRAC_LOCK_I_FBDIV_STROBE_H_S 25 +#define JAPLL_FRAC_LOCK_I_FBDIV_STROBE_H_M BIT(25) +#define JAPLL_FRAC_LOCK_I_OVC_SNAPSHOT_H_S 26 +#define JAPLL_FRAC_LOCK_I_OVC_SNAPSHOT_H_M BIT(26) +#define JAPLL_FRAC_LOCK_I_DITHER_VALUE_4_0_S 27 +#define JAPLL_FRAC_LOCK_I_DITHER_VALUE_4_0_M ICE_M(0x1f, 27) + +union japll_frac_lock { + struct { + u32 i_feedfwrdgain_7_0 : 8; + u32 i_feedfwrdcal_en_h : 1; + u32 i_feedfwrdcal_pause_h : 1; + u32 i_dcoditheren_h : 1; + u32 i_lockthresh_3_0 : 4; + u32 i_dcodither_config : 1; + u32 i_earlylock_criteria_1_0 : 2; + u32 i_truelock_criteria_1_0 : 2; + u32 i_lf_half_cyc_en : 1; + u32 i_dither_ovrd : 1; + u32 i_plllc_restore_reg : 1; + u32 i_plllc_restore_mode_ctrl : 1; + u32 i_pllrampen_h : 1; + u32 i_fbdiv_strobe_h : 1; + u32 i_ovc_snapshot_h : 1; + u32 i_dither_value_4_0 : 5; + } field; + u32 val; +}; + +#define JAPLL_BIAS 0x414 +#define JAPLL_BIAS_I_IREFTRIM_4_0_S 0 +#define JAPLL_BIAS_I_IREFTRIM_4_0_M ICE_M(0x1f, 0) +#define JAPLL_BIAS_I_VREF_RDAC_2_0_S 5 +#define JAPLL_BIAS_I_VREF_RDAC_2_0_M ICE_M(0x7, 5) +#define JAPLL_BIAS_I_CTRIM_4_0_S 8 +#define JAPLL_BIAS_I_CTRIM_4_0_M ICE_M(0x1f, 8) +#define JAPLL_BIAS_I_IREF_REFCLK_MODE_1_0_S 13 +#define JAPLL_BIAS_I_IREF_REFCLK_MODE_1_0_M ICE_M(0x3, 13) +#define JAPLL_BIAS_I_BIASCAL_EN_H_S 15 +#define JAPLL_BIAS_I_BIASCAL_EN_H_M BIT(15) +#define JAPLL_BIAS_I_BIAS_BONUS_7_0_S 16 +#define JAPLL_BIAS_I_BIAS_BONUS_7_0_M ICE_M(0xff, 16) +#define JAPLL_BIAS_I_INIT_DCOAMP_5_0_S 24 +#define JAPLL_BIAS_I_INIT_DCOAMP_5_0_M ICE_M(0x3f, 24) +#define JAPLL_BIAS_I_BIAS_GB_SEL_1_0_S 30 +#define JAPLL_BIAS_I_BIAS_GB_SEL_1_0_M ICE_M(0x3, 30) + +union japll_bias { + struct { + u32 i_ireftrim_4_0 : 5; + u32 i_vref_rdac_2_0 : 3; + u32 i_ctrim_4_0 : 5; + u32 i_iref_refclk_mode_1_0 : 2; + u32 i_biascal_en_h : 1; + u32 i_bias_bonus_7_0 : 8; + u32 i_init_dcoamp_5_0 : 6; + u32 i_bias_gb_sel_1_0 : 2; + } field; + u32 val; +}; + +#define JAPLL_TDC_COLDST_BIAS 0x418 +#define JAPLL_TDC_COLDST_BIAS_I_TDCSEL_1_0_S 0 +#define JAPLL_TDC_COLDST_BIAS_I_TDCSEL_1_0_M ICE_M(0x3, 0) +#define JAPLL_TDC_COLDST_BIAS_I_TDCOVCCORR_EN_H_S 2 +#define JAPLL_TDC_COLDST_BIAS_I_TDCOVCCORR_EN_H_M BIT(2) +#define JAPLL_TDC_COLDST_BIAS_I_TDCDC_EN_H_S 3 +#define JAPLL_TDC_COLDST_BIAS_I_TDCDC_EN_H_M BIT(3) +#define JAPLL_TDC_COLDST_BIAS_I_TDC_OFFSET_LOCK_1_0_S 4 +#define 
JAPLL_TDC_COLDST_BIAS_I_TDC_OFFSET_LOCK_1_0_M ICE_M(0x3, 4) +#define JAPLL_TDC_COLDST_BIAS_I_SWCAP_IREFGEN_CLKMODE_1_0_S 6 +#define JAPLL_TDC_COLDST_BIAS_I_SWCAP_IREFGEN_CLKMODE_1_0_M ICE_M(0x3, 6) +#define JAPLL_TDC_COLDST_BIAS_I_BB_GAIN_2_0_S 8 +#define JAPLL_TDC_COLDST_BIAS_I_BB_GAIN_2_0_M ICE_M(0x7, 8) +#define JAPLL_TDC_COLDST_BIAS_I_BBTHRESH_3_0_S 11 +#define JAPLL_TDC_COLDST_BIAS_I_BBTHRESH_3_0_M ICE_M(0xf, 11) +#define JAPLL_TDC_COLDST_BIAS_I_BBINLOCK_H_S 15 +#define JAPLL_TDC_COLDST_BIAS_I_BBINLOCK_H_M BIT(15) +#define JAPLL_TDC_COLDST_BIAS_I_COLDSTART_S 16 +#define JAPLL_TDC_COLDST_BIAS_I_COLDSTART_M BIT(16) +#define JAPLL_TDC_COLDST_BIAS_I_IREFBIAS_STARTUP_PULSE_WIDTH_1_0_S 17 +#define JAPLL_TDC_COLDST_BIAS_I_IREFBIAS_STARTUP_PULSE_WIDTH_1_0_M \ + ICE_M(0x3, 17) +#define JAPLL_TDC_COLDST_BIAS_I_DCO_SETTLING_TIME_CNTR_3_0_S 19 +#define JAPLL_TDC_COLDST_BIAS_I_DCO_SETTLING_TIME_CNTR_3_0_M ICE_M(0xf, 19) +#define JAPLL_TDC_COLDST_BIAS_I_IREFBIAS_STARTUP_PULSE_BYPASS_S 23 +#define JAPLL_TDC_COLDST_BIAS_I_IREFBIAS_STARTUP_PULSE_BYPASS_M BIT(23) +#define JAPLL_TDC_COLDST_BIAS_I_BIAS_CALIB_STEPSIZE_1_0_S 24 +#define JAPLL_TDC_COLDST_BIAS_I_BIAS_CALIB_STEPSIZE_1_0_M ICE_M(0x3, 24) +#define JAPLL_TDC_COLDST_BIAS_RESERVED81_S 26 +#define JAPLL_TDC_COLDST_BIAS_RESERVED81_M BIT(26) +#define JAPLL_TDC_COLDST_BIAS_I_IREFINT_EN_S 27 +#define JAPLL_TDC_COLDST_BIAS_I_IREFINT_EN_M BIT(27) +#define JAPLL_TDC_COLDST_BIAS_I_VGSBUFEN_S 28 +#define JAPLL_TDC_COLDST_BIAS_I_VGSBUFEN_M BIT(28) +#define JAPLL_TDC_COLDST_BIAS_I_DIGDFTSWEP_S 29 +#define JAPLL_TDC_COLDST_BIAS_I_DIGDFTSWEP_M BIT(29) +#define JAPLL_TDC_COLDST_BIAS_I_IREFDIGDFTEN_S 30 +#define JAPLL_TDC_COLDST_BIAS_I_IREFDIGDFTEN_M BIT(30) +#define JAPLL_TDC_COLDST_BIAS_I_IREF_REFCLK_INV_EN_S 31 +#define JAPLL_TDC_COLDST_BIAS_I_IREF_REFCLK_INV_EN_M BIT(31) + +union japll_tdc_coldst_bias { + struct { + u32 i_tdcsel_1_0 : 2; + u32 i_tdcovccorr_en_h : 1; + u32 i_tdcdc_en_h : 1; + u32 i_tdc_offset_lock_1_0 : 2; + u32 i_swcap_irefgen_clkmode_1_0 : 2; + u32 i_bb_gain_2_0 : 3; + u32 i_bbthresh_3_0 : 4; + u32 i_bbinlock_h : 1; + u32 i_coldstart : 1; + u32 i_irefbias_startup_pulse_width_1_0 : 2; + u32 i_dco_settling_time_cntr_3_0 : 4; + u32 i_irefbias_startup_pulse_bypass : 1; + u32 i_bias_calib_stepsize_1_0 : 2; + u32 reserved81 : 1; + u32 i_irefint_en : 1; + u32 i_vgsbufen : 1; + u32 i_digdftswep : 1; + u32 i_irefdigdften : 1; + u32 i_iref_refclk_inv_en : 1; + } field; + u32 val; +}; + +#define JAPLL_DFX_DCO 0x424 +#define JAPLL_DFX_DCO_I_DCOFINEDFTSEL_1_0_S 0 +#define JAPLL_DFX_DCO_I_DCOFINEDFTSEL_1_0_M ICE_M(0x3, 0) +#define JAPLL_DFX_DCO_I_DCOCOARSE_OVRD_H_S 2 +#define JAPLL_DFX_DCO_I_DCOCOARSE_OVRD_H_M BIT(2) +#define JAPLL_DFX_DCO_I_BIAS_FILTER_EN_S 3 +#define JAPLL_DFX_DCO_I_BIAS_FILTER_EN_M BIT(3) +#define JAPLL_DFX_DCO_I_PLLPWRMODE_1_0_S 4 +#define JAPLL_DFX_DCO_I_PLLPWRMODE_1_0_M ICE_M(0x3, 4) +#define JAPLL_DFX_DCO_I_DCOAMP_STATICLEG_CFG_1_0_S 6 +#define JAPLL_DFX_DCO_I_DCOAMP_STATICLEG_CFG_1_0_M ICE_M(0x3, 6) +#define JAPLL_DFX_DCO_I_DCOFINE_7_0_S 8 +#define JAPLL_DFX_DCO_I_DCOFINE_7_0_M ICE_M(0xff, 8) +#define JAPLL_DFX_DCO_I_DCOFINE_9_8_S 16 +#define JAPLL_DFX_DCO_I_DCOFINE_9_8_M ICE_M(0x3, 16) +#define JAPLL_DFX_DCO_I_DCOAMPOVRDEN_H_S 18 +#define JAPLL_DFX_DCO_I_DCOAMPOVRDEN_H_M BIT(18) +#define JAPLL_DFX_DCO_I_DCOAMP_3_0_S 19 +#define JAPLL_DFX_DCO_I_DCOAMP_3_0_M ICE_M(0xf, 19) +#define JAPLL_DFX_DCO_I_BIASFILTER_EN_DELAY_S 23 +#define JAPLL_DFX_DCO_I_BIASFILTER_EN_DELAY_M BIT(23) +#define JAPLL_DFX_DCO_I_DCOCOARSE_7_0_S 24 +#define 
JAPLL_DFX_DCO_I_DCOCOARSE_7_0_M ICE_M(0xff, 24) + +union japll_dfx_dco { + struct { + u32 i_dcofinedftsel_1_0 : 2; + u32 i_dcocoarse_ovrd_h : 1; + u32 i_bias_filter_en : 1; + u32 i_pllpwrmode_1_0 : 2; + u32 i_dcoamp_staticleg_cfg_1_0 : 2; + u32 i_dcofine_7_0 : 8; + u32 i_dcofine_9_8 : 2; + u32 i_dcoampovrden_h : 1; + u32 i_dcoamp_3_0 : 4; + u32 i_biasfilter_en_delay : 1; + u32 i_dcocoarse_7_0 : 8; + } field; + u32 val; +}; + +#define JAPLL_RO_BWM_LF 0x470 +#define JAPLL_RO_BWM_LF_BW_FREQOV_HIGH_CRI_7_0_S 0 +#define JAPLL_RO_BWM_LF_BW_FREQOV_HIGH_CRI_7_0_M ICE_M(0xff, 0) +#define JAPLL_RO_BWM_LF_BW_FREQOV_HIGH_CRI_9_8_S 8 +#define JAPLL_RO_BWM_LF_BW_FREQOV_HIGH_CRI_9_8_M ICE_M(0x3, 8) +#define JAPLL_RO_BWM_LF_BIASCALDONE_CRI_S 10 +#define JAPLL_RO_BWM_LF_BIASCALDONE_CRI_M BIT(10) +#define JAPLL_RO_BWM_LF_PLLLOCK_GAIN_TRAN_CRI_S 11 +#define JAPLL_RO_BWM_LF_PLLLOCK_GAIN_TRAN_CRI_M BIT(11) +#define JAPLL_RO_BWM_LF_PLLLOCK_TRUE_LOCK_CRI_S 12 +#define JAPLL_RO_BWM_LF_PLLLOCK_TRUE_LOCK_CRI_M BIT(12) +#define JAPLL_RO_BWM_LF_PLLUNLOCK_FLAG_CRI_S 13 +#define JAPLL_RO_BWM_LF_PLLUNLOCK_FLAG_CRI_M BIT(13) +#define JAPLL_RO_BWM_LF_AFCERR_CRI_S 14 +#define JAPLL_RO_BWM_LF_AFCERR_CRI_M BIT(14) +#define JAPLL_RO_BWM_LF_AFCDONE_CRI_S 15 +#define JAPLL_RO_BWM_LF_AFCDONE_CRI_M BIT(15) +#define JAPLL_RO_BWM_LF_FEEDFWRDGAIN_CAL_CRI_7_0_S 16 +#define JAPLL_RO_BWM_LF_FEEDFWRDGAIN_CAL_CRI_7_0_M ICE_M(0xff, 16) +#define JAPLL_RO_BWM_LF_M2FBDIVMOD_CRI_7_0_S 24 +#define JAPLL_RO_BWM_LF_M2FBDIVMOD_CRI_7_0_M ICE_M(0xff, 24) + +union japll_ro_bwm_lf { + struct { + u32 bw_freqov_high_cri_7_0 : 8; + u32 bw_freqov_high_cri_9_8 : 2; + u32 biascaldone_cri : 1; + u32 plllock_gain_tran_cri : 1; + u32 plllock_true_lock_cri : 1; + u32 pllunlock_flag_cri : 1; + u32 afcerr_cri : 1; + u32 afcdone_cri : 1; + u32 feedfwrdgain_cal_cri_7_0 : 8; + u32 m2fbdivmod_cri_7_0 : 8; + } field; + u32 val; +}; + +#endif /* _ICE_CGU_REGS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_cgu_util.c b/drivers/net/ethernet/intel/ice/ice_cgu_util.c new file mode 100644 index 000000000000..75561d5c26fd --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_cgu_util.c @@ -0,0 +1,444 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice.h" + +/** + * ice_cgu_reg_read - Read a CGU register + * @pf: Board private structure + * @reg: Register to read from + * @val: Pointer to the value to read (out param) + */ +int ice_cgu_reg_read(struct ice_pf *pf, u32 reg, u32 *val) +{ + struct ice_sbq_msg_input cgu_msg; + int status; + + cgu_msg.opcode = ice_sbq_msg_rd; + cgu_msg.dest_dev = cgu; + cgu_msg.msg_addr_low = reg; + cgu_msg.msg_addr_high = 0x0; + + status = ice_sbq_rw_reg_lp(&pf->hw, &cgu_msg, true); + if (status) { + dev_dbg(ice_pf_to_dev(pf), "addr 0x%04x, val 0x%08x\n", reg, cgu_msg.data); + return -EIO; + } + + *val = cgu_msg.data; + + return 0; +} + +/** + * ice_cgu_reg_write - Write a CGU register + * @pf: Board private structure + * @reg: Register to write to + * @val: Value to write + */ +int ice_cgu_reg_write(struct ice_pf *pf, u32 reg, u32 val) +{ + struct ice_sbq_msg_input cgu_msg; + int status; + + cgu_msg.opcode = ice_sbq_msg_wr; + cgu_msg.dest_dev = cgu; + cgu_msg.msg_addr_low = reg; + cgu_msg.msg_addr_high = 0x0; + cgu_msg.data = val; + + dev_dbg(ice_pf_to_dev(pf), "addr 0x%04x, val 0x%08x\n", reg, val); + + status = ice_sbq_rw_reg_lp(&pf->hw, &cgu_msg, true); + if (status) + return -EIO; + + return 0; +} + +/** + * ice_cgu_set_gnd - Ground the refclk + * @pf: Board private structure + * @enable: True to ground the refclk + */ +int ice_cgu_set_gnd(struct ice_pf *pf, bool enable) +{ + int status = 0; + union nac_cgu_dword10 dw10; + int i; + + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (status) + goto err; + + if (enable) + dw10.field.synce_sel_gnd = 1; + else + dw10.field.synce_sel_gnd = 0; + + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD10, dw10.val); + if (status) + goto err; + + for (i = 0; i < 3; i++) + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (status) + goto err; + +err: + return status; +} + +/** + * ice_cgu_set_byp - Set the DPLL bypass + * @pf: Board private structure + * @enable: True to enable bypass + */ +int ice_cgu_set_byp(struct ice_pf *pf, bool enable) +{ + union nac_cgu_dword12 dw12; + int status = 0; + + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD12, &dw12.val); + if (status) + goto err; + + if (enable) + dw12.field.synce_dpll_byp = 1; + else + dw12.field.synce_dpll_byp = 0; + + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD12, dw12.val); + if (status) + goto err; + +err: + return status; +} + +/** + * ice_cgu_set_holdover_lock_irq - Set holdover/lock interrupt + * @pf: Board private structure + * @enable: True to enable the holdover/lock interrupts + */ +int ice_cgu_set_holdover_lock_irq(struct ice_pf *pf, bool enable) +{ + union nac_cgu_dword13 dw13; + int status; + + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD13, &dw13.val); + if (status) + goto err; + + /* the *_int_enb bits are defined opposite of what one would expect.
+ * 0 = enabled, 1 = disabled + */ + if (enable) { + dw13.field.synce_hdov_int_enb = 0; + dw13.field.synce_lock_int_enb = 0; + } else { + dw13.field.synce_hdov_int_enb = 1; + dw13.field.synce_lock_int_enb = 1; + } + + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD13, dw13.val); + if (status) + goto err; + +err: + return status; +} + +/** + * ice_cgu_mux_sel_set_reg - Write to selected mux register + * @pf: Board private structure + * @mux_sel: Target mux + * @val: Value to write + */ +int ice_cgu_mux_sel_set_reg(struct ice_pf *pf, enum ice_cgu_mux_sel mux_sel, u32 val) +{ + union nac_cgu_dword10 dw10; + union nac_cgu_dword11 dw11; + int status; + + switch (mux_sel) { + case ICE_CGU_MUX_SEL_REF_CLK: + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (status) + goto err; + dw10.field.synce_s_ref_clk = val; + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD10, dw10.val); + if (status) + goto err; + break; + + case ICE_CGU_MUX_SEL_BYPASS_CLK: + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD11, &dw11.val); + if (status) + goto err; + dw11.field.synce_s_byp_clk = val; + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD11, dw11.val); + if (status) + goto err; + break; + + case ICE_CGU_MUX_SEL_ETHCLKO: + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (status) + goto err; + dw10.field.synce_ethclko_sel = val; + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD10, dw10.val); + if (status) + goto err; + break; + + case ICE_CGU_MUX_SEL_CLKO: + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (status) + goto err; + dw10.field.synce_clko_sel = val; + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD10, dw10.val); + if (status) + goto err; + break; + + default: + dev_err(ice_pf_to_dev(pf), "internal error -- invalid mux!\n"); + return -EIO; + } + +err: + return status; +} + +/** + * ice_cgu_dck_rst_assert_release - Assert or release the dck reset + * @pf: Board private structure + * @assert: True to assert, false to release + */ +int ice_cgu_dck_rst_assert_release(struct ice_pf *pf, bool assert) +{ + union nac_cgu_dword10 dw10; + int status = 0; + int i; + + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (status) + goto err; + + if (assert) + dw10.field.synce_dck_rst = 1; + else + dw10.field.synce_dck_rst = 0; + + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD10, dw10.val); + if (status) + goto err; + + for (i = 0; i < 3; i++) + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (status) + goto err; + +err: + return status; +} + +/** + * ice_cgu_dck2_rst_assert_release - Assert or release the dck2 reset + * @pf: Board private structure + * @assert: True to assert, false to release + */ +int ice_cgu_dck2_rst_assert_release(struct ice_pf *pf, bool assert) +{ + union nac_cgu_dword10 dw10; + int status = 0; + int i; + + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (status) + goto err; + + if (assert) + dw10.field.synce_dck2_rst = 1; + else + dw10.field.synce_dck2_rst = 0; + + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD10, dw10.val); + if (status) + goto err; + + for (i = 0; i < 3; i++) + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (status) + goto err; + +err: + return status; +} + +/** + * ice_cgu_mck_rst_assert_release - Assert or release the mck reset + * @pf: Board private structure + * @assert: True to assert, false to release + */ +int ice_cgu_mck_rst_assert_release(struct ice_pf *pf, bool assert) +{ + union nac_cgu_dword12 dw12; + int status = 0; + int i; + + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD12, &dw12.val); + if (status) + goto err;
+ + if (assert) + dw12.field.synce_mck_rst = 1; + else + dw12.field.synce_mck_rst = 0; + + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD12, dw12.val); + if (status) + goto err; + + for (i = 0; i < 3; i++) + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD12, &dw12.val); + if (status) + goto err; + +err: + return status; +} + +/** + * ice_cgu_usleep - Sleep for a specified period of time + * @usec: Time to sleep in microseconds + */ +void ice_cgu_usleep(u64 usec) +{ + if (usec <= 10) { + udelay(usec); + } else if (usec <= 20000) { + usleep_range(usec, usec + 10); + } else { + int msec; + + msec = (usec + 999) / 1000; + msleep_interruptible(msec); + } +} + +/** + * ice_cgu_poll - Poll the specified CGU register for the specified value + * @pf: Board private structure + * @offset: Offset of the register + * @mask: Bitmask for testing the value + * @value: Value to poll for + * @delay_time: Delay between the register reads + * @delay_loops: Number of read loops + */ +int ice_cgu_poll(struct ice_pf *pf, u64 offset, u32 mask, u32 value, u32 delay_time, + u32 delay_loops) +{ + int status; + u32 reg, i; + + for (i = 0; i < delay_loops; i++) { + status = ice_cgu_reg_read(pf, offset, &reg); + if (status) + goto err; + + if ((reg & mask) == value) + return 0; + + /* delay for a bit */ + ice_cgu_usleep(delay_time); + } + + return -EBUSY; + +err: + return status; +} + +/** + * ice_cgu_npoll - Poll the specified CGU register for the specified value occurring n times + * @pf: Board private structure + * @offset: Offset of the register + * @mask: Bitmask for testing the value + * @value: Value to poll for + * @delay_time: Delay between the register reads + * @delay_loops: Number of read loops + * @poll_count: Number of the value matches to poll for + * @count_delay_time: Additional delay after the value match + */ +int ice_cgu_npoll(struct ice_pf *pf, u32 offset, u32 mask, u32 value, u32 delay_time, + u32 delay_loops, u32 poll_count, u32 count_delay_time) +{ + u32 reg, i, my_count = 0, complete = 0; + int status; + + for (i = 0; i < delay_loops; i++) { + status = ice_cgu_reg_read(pf, offset, &reg); + if (status) + goto err; + + dev_dbg(ice_pf_to_dev(pf), "count=%u, reg=%08x\n", my_count, reg); + + if ((reg & mask) == value) { + my_count++; + if (my_count < poll_count) { + ice_cgu_usleep(count_delay_time); + } else { + complete = 1; + break; + } + } else { + my_count = 0; + ice_cgu_usleep(delay_time); + } + } + + if (complete) + return 0; + else + return -EBUSY; + +err: + return status; +} + +struct ice_cgu_dpll_params dpll_params_table[ICE_NUM_DPLL_PARAMS] = { + /* {dpll select, sample rate, mul_rat_m1, scale, gain} */ + { ICE_CGU_DPLL_SELECT_TRANSPORT, ICE_CGU_SAMPLE_RATE_8K, 3124, 16, 42 }, + { ICE_CGU_DPLL_SELECT_EEC_RELAXED_BW, ICE_CGU_SAMPLE_RATE_8K, 3124, 7, 3 }, + { ICE_CGU_DPLL_SELECT_TRANSPORT, ICE_CGU_SAMPLE_RATE_10K, 2499, 20, 66 }, + { ICE_CGU_DPLL_SELECT_EEC_RELAXED_BW, ICE_CGU_SAMPLE_RATE_10K, 2499, 8, 4 }, + { ICE_CGU_DPLL_SELECT_TRANSPORT, ICE_CGU_SAMPLE_RATE_12K5, 1999, 25, 103 }, + { ICE_CGU_DPLL_SELECT_EEC_RELAXED_BW, ICE_CGU_SAMPLE_RATE_12K5, 1999, 10, 6 } +}; + +struct ice_cgu_dpll_per_rate_params dpll_per_rate_params[NUM_ICE_TIME_REF_FREQ] = { + /* {rate_hz, sample_rate, div_rat_m1, synce_rat_sel} */ + { 25000000, ICE_CGU_SAMPLE_RATE_10K, 2499, 0 }, /* 25 MHz */ + { 122880000, ICE_CGU_SAMPLE_RATE_8K, 3071, 1 }, /* 122.88 MHz */ + { 125000000, ICE_CGU_SAMPLE_RATE_10K, 2499, 1 }, /* 125 MHz */ + { 153600000, ICE_CGU_SAMPLE_RATE_10K, 3071, 1 }, /* 153.6 MHz */ + { 156250000,
ICE_CGU_SAMPLE_RATE_10K, 3124, 1 }, /* 156.25 MHz */ +}; + +struct ice_cgu_lcpll_per_rate_params tspll_per_rate_params[NUM_ICE_TIME_REF_FREQ] = { + /* {refclk_pre_div, feedback_div, frac_n_div, post_pll_div} */ + { 1, 197, 2621440, 6 }, /* 25 MHz */ + { 5, 223, 524288, 7 }, /* 122.88 MHz */ + { 5, 223, 524288, 7 }, /* 125 MHz */ + { 5, 159, 1572864, 6 }, /* 153.6 MHz */ + { 5, 159, 1572864, 6 }, /* 156.25 MHz */ + { 10, 223, 524288, 7 }, /* 245.76 MHz */ +}; + +struct ice_cgu_lcpll_per_rate_params japll_per_rate_params[NUM_ICE_CGU_JAPLL_REF_FREQ] = { + /* {refclk_pre_div, feedback_div, frac_n_div, post_pll_div} */ + { 1, 150, 0, 6 }, /* 25 MHz */ + { 1, 120, 0, 6 }, /* 156.25 MHz */ +}; diff --git a/drivers/net/ethernet/intel/ice/ice_cgu_util.h b/drivers/net/ethernet/intel/ice/ice_cgu_util.h new file mode 100644 index 000000000000..6a1566324cb1 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_cgu_util.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_CGU_UTIL_H_ +#define _ICE_CGU_UTIL_H_ + +/* offset of last valid CGU register */ +#define ICE_CGU_MAX_REG_OFFS 0x47c + +int ice_cgu_reg_read(struct ice_pf *pf, u32 reg, u32 *val); + +int ice_cgu_reg_write(struct ice_pf *pf, u32 reg, u32 val); + +int ice_cgu_set_gnd(struct ice_pf *pf, bool enable); + +int ice_cgu_set_byp(struct ice_pf *pf, bool enable); + +int ice_cgu_set_holdover_lock_irq(struct ice_pf *pf, bool enable); + +int ice_cgu_mux_sel_set_reg(struct ice_pf *pf, enum ice_cgu_mux_sel mux_sel, u32 val); + +int ice_cgu_dck_rst_assert_release(struct ice_pf *pf, bool assert); + +int ice_cgu_dck2_rst_assert_release(struct ice_pf *pf, bool assert); + +int ice_cgu_mck_rst_assert_release(struct ice_pf *pf, bool assert); + +void ice_cgu_usleep(u64 usec); + +int ice_cgu_poll(struct ice_pf *pf, u64 offset, u32 mask, u32 value, u32 delay_time, + u32 delay_loops); + +int ice_cgu_npoll(struct ice_pf *pf, u32 offset, u32 mask, u32 value, u32 delay_time, + u32 delay_loops, u32 poll_count, u32 count_delay_time); + +#define ICE_NUM_DPLL_PARAMS (NUM_ICE_CGU_SAMPLE_RATE * NUM_ICE_CGU_DPLL_SELECT) + +extern struct ice_cgu_dpll_params dpll_params_table[ICE_NUM_DPLL_PARAMS]; + +extern struct ice_cgu_dpll_per_rate_params dpll_per_rate_params[NUM_ICE_TIME_REF_FREQ]; + +extern struct ice_cgu_lcpll_per_rate_params tspll_per_rate_params[NUM_ICE_TIME_REF_FREQ]; + +extern struct ice_cgu_lcpll_per_rate_params japll_per_rate_params[NUM_ICE_CGU_JAPLL_REF_FREQ]; + +#endif /* _ICE_CGU_UTIL_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index d68b8aa31b19..32335405e484 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -1,30 +1,16 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ #include "ice_common.h" #include "ice_sched.h" #include "ice_adminq_cmd.h" -#define ICE_PF_RESET_WAIT_COUNT 200 +#include "ice_flow.h" +#include "ice_switch.h" -#define ICE_PROG_FLEX_ENTRY(hw, rxdid, mdid, idx) \ - wr32((hw), GLFLXP_RXDID_FLX_WRD_##idx(rxdid), \ - ((ICE_RX_OPC_MDID << \ - GLFLXP_RXDID_FLX_WRD_##idx##_RXDID_OPCODE_S) & \ - GLFLXP_RXDID_FLX_WRD_##idx##_RXDID_OPCODE_M) | \ - (((mdid) << GLFLXP_RXDID_FLX_WRD_##idx##_PROT_MDID_S) & \ - GLFLXP_RXDID_FLX_WRD_##idx##_PROT_MDID_M)) +#define ICE_PF_RESET_WAIT_COUNT 300 +#define ICE_SCHED_VALID_SEC_BITS 4 -#define ICE_PROG_FLG_ENTRY(hw, rxdid, flg_0, flg_1, flg_2, flg_3, idx) \ - wr32((hw), GLFLXP_RXDID_FLAGS(rxdid, idx), \ - (((flg_0) << GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_S) & \ - GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_M) | \ - (((flg_1) << GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_1_S) & \ - GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_1_M) | \ - (((flg_2) << GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_2_S) & \ - GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_2_M) | \ - (((flg_3) << GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_3_S) & \ - GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_3_M)) /** * ice_set_mac_type - Sets MAC type @@ -38,24 +24,68 @@ static enum ice_status ice_set_mac_type(struct ice_hw *hw) if (hw->vendor_id != PCI_VENDOR_ID_INTEL) return ICE_ERR_DEVICE_NOT_SUPPORTED; - hw->mac_type = ICE_MAC_GENERIC; + switch (hw->device_id) { + case ICE_DEV_ID_E810C_BACKPLANE: + case ICE_DEV_ID_E810C_QSFP: + case ICE_DEV_ID_E810C_SFP: + case ICE_DEV_ID_E810_XXV_BACKPLANE: + case ICE_DEV_ID_E810_XXV_QSFP: + case ICE_DEV_ID_E810_XXV_SFP: + hw->mac_type = ICE_MAC_E810; + break; + case ICE_DEV_ID_E822C_10G_BASE_T: + case ICE_DEV_ID_E822C_BACKPLANE: + case ICE_DEV_ID_E822C_QSFP: + case ICE_DEV_ID_E822C_SFP: + case ICE_DEV_ID_E822C_SGMII: + case ICE_DEV_ID_E822L_10G_BASE_T: + case ICE_DEV_ID_E822L_BACKPLANE: + case ICE_DEV_ID_E822L_SFP: + case ICE_DEV_ID_E822L_SGMII: + case ICE_DEV_ID_E823L_10G_BASE_T: + case ICE_DEV_ID_E823L_1GBE: + case ICE_DEV_ID_E823L_BACKPLANE: + case ICE_DEV_ID_E823L_QSFP: + case ICE_DEV_ID_E823L_SFP: + case ICE_DEV_ID_E823C_10G_BASE_T: + case ICE_DEV_ID_E823C_BACKPLANE: + case ICE_DEV_ID_E823C_QSFP: + case ICE_DEV_ID_E823C_SFP: + case ICE_DEV_ID_E823C_SGMII: + hw->mac_type = ICE_MAC_GENERIC; + break; + default: + hw->mac_type = ICE_MAC_UNKNOWN; + break; + } + + ice_debug(hw, ICE_DBG_INIT, "mac_type: %d\n", hw->mac_type); return 0; } /** - * ice_dev_onetime_setup - Temporary HW/FW workarounds - * @hw: pointer to the HW structure + * ice_is_generic_mac + * @hw: pointer to the hardware structure * - * This function provides temporary workarounds for certain issues - * that are expected to be fixed in the HW/FW. + * returns true if mac_type is ICE_MAC_GENERIC, false if not */ -void ice_dev_onetime_setup(struct ice_hw *hw) +bool ice_is_generic_mac(struct ice_hw *hw) { -#define MBX_PF_VT_PFALLOC 0x00231E80 - /* set VFs per PF */ - wr32(hw, MBX_PF_VT_PFALLOC, rd32(hw, PF_VT_PFALLOC_HIF)); + return hw->mac_type == ICE_MAC_GENERIC; } +/** + * ice_is_e810 + * @hw: pointer to the hardware structure + * + * returns true if the device is E810 based, false if not. + */ +bool ice_is_e810(struct ice_hw *hw) +{ + return hw->mac_type == ICE_MAC_E810; +} + + /** * ice_clear_pf_cfg - Clear PF configuration * @hw: pointer to the hardware structure @@ -84,7 +114,8 @@ enum ice_status ice_clear_pf_cfg(struct ice_hw *hw) * is returned in user specified buffer. Please interpret user specified * buffer as "manage_mac_read" response. 
* Response such as various MAC addresses are stored in HW struct (port.mac) - * ice_aq_discover_caps is expected to be called before this function is called. + * ice_discover_dev_caps is expected to be called before this function is + * called. */ static enum ice_status ice_aq_manage_mac_read(struct ice_hw *hw, void *buf, u16 buf_size, @@ -108,7 +139,7 @@ ice_aq_manage_mac_read(struct ice_hw *hw, void *buf, u16 buf_size, if (status) return status; - resp = (struct ice_aqc_manage_mac_read_resp *)buf; + resp = buf; flags = le16_to_cpu(cmd->flags) & ICE_AQC_MAN_MAC_READ_M; if (!(flags & ICE_AQC_MAN_MAC_LAN_ADDR_VALID)) { @@ -125,7 +156,6 @@ ice_aq_manage_mac_read(struct ice_hw *hw, void *buf, u16 buf_size, resp[i].mac_addr); break; } - return 0; } @@ -148,11 +178,17 @@ ice_aq_get_phy_caps(struct ice_port_info *pi, bool qual_mods, u8 report_mode, u16 pcaps_size = sizeof(*pcaps); struct ice_aq_desc desc; enum ice_status status; + struct ice_hw *hw; cmd = &desc.params.get_phy; if (!pcaps || (report_mode & ~ICE_AQC_REPORT_MODE_M) || !pi) return ICE_ERR_PARAM; + hw = pi->hw; + + if (report_mode == ICE_AQC_REPORT_DFLT_CFG && + !ice_fw_supports_report_dflt_cfg(hw)) + return ICE_ERR_PARAM; ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_phy_caps); @@ -160,16 +196,95 @@ ice_aq_get_phy_caps(struct ice_port_info *pi, bool qual_mods, u8 report_mode, cmd->param0 |= cpu_to_le16(ICE_AQC_GET_PHY_RQM); cmd->param0 |= cpu_to_le16(report_mode); - status = ice_aq_send_cmd(pi->hw, &desc, pcaps, pcaps_size, cd); - - if (!status && report_mode == ICE_AQC_REPORT_TOPO_CAP) { + status = ice_aq_send_cmd(hw, &desc, pcaps, pcaps_size, cd); + + ice_debug(hw, ICE_DBG_LINK, "get phy caps - report_mode = 0x%x\n", + report_mode); + ice_debug(hw, ICE_DBG_LINK, " phy_type_low = 0x%llx\n", + (unsigned long long)le64_to_cpu(pcaps->phy_type_low)); + ice_debug(hw, ICE_DBG_LINK, " phy_type_high = 0x%llx\n", + (unsigned long long)le64_to_cpu(pcaps->phy_type_high)); + ice_debug(hw, ICE_DBG_LINK, " caps = 0x%x\n", pcaps->caps); + ice_debug(hw, ICE_DBG_LINK, " low_power_ctrl_an = 0x%x\n", + pcaps->low_power_ctrl_an); + ice_debug(hw, ICE_DBG_LINK, " eee_cap = 0x%x\n", pcaps->eee_cap); + ice_debug(hw, ICE_DBG_LINK, " eeer_value = 0x%x\n", + pcaps->eeer_value); + ice_debug(hw, ICE_DBG_LINK, " link_fec_options = 0x%x\n", + pcaps->link_fec_options); + ice_debug(hw, ICE_DBG_LINK, " module_compliance_enforcement = 0x%x\n", + pcaps->module_compliance_enforcement); + ice_debug(hw, ICE_DBG_LINK, " extended_compliance_code = 0x%x\n", + pcaps->extended_compliance_code); + ice_debug(hw, ICE_DBG_LINK, " module_type[0] = 0x%x\n", + pcaps->module_type[0]); + ice_debug(hw, ICE_DBG_LINK, " module_type[1] = 0x%x\n", + pcaps->module_type[1]); + ice_debug(hw, ICE_DBG_LINK, " module_type[2] = 0x%x\n", + pcaps->module_type[2]); + + if (!status && report_mode == ICE_AQC_REPORT_TOPO_CAP_MEDIA) { pi->phy.phy_type_low = le64_to_cpu(pcaps->phy_type_low); pi->phy.phy_type_high = le64_to_cpu(pcaps->phy_type_high); + memcpy(pi->phy.link_info.module_type, &pcaps->module_type, + sizeof(pi->phy.link_info.module_type)); } return status; } +/** + * ice_aq_get_link_topo_handle - get link topology node return status + * @pi: port information structure + * @node_type: requested node type + * @cd: pointer to command details structure or NULL + * + * Get link topology node return status for specified node type (0x06E0) + * + * Node type cage can be used to determine if cage is present. If AQC + * returns error (ENOENT), then no cage present. 
If no cage present, then + * connection type is backplane or BASE-T. + */ +static enum ice_status +ice_aq_get_link_topo_handle(struct ice_port_info *pi, u8 node_type, + struct ice_sq_cd *cd) +{ + struct ice_aqc_get_link_topo *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.get_link_topo; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_link_topo); + + cmd->addr.topo_params.node_type_ctx = + (ICE_AQC_LINK_TOPO_NODE_CTX_PORT << + ICE_AQC_LINK_TOPO_NODE_CTX_S); + + /* set node type */ + cmd->addr.topo_params.node_type_ctx |= + (ICE_AQC_LINK_TOPO_NODE_TYPE_M & node_type); + + return ice_aq_send_cmd(pi->hw, &desc, NULL, 0, cd); +} + +/** + * ice_is_media_cage_present + * @pi: port information structure + * + * Returns true if media cage is present, else false. If no cage, then + * media type is backplane or BASE-T. + */ +static bool ice_is_media_cage_present(struct ice_port_info *pi) +{ + /* Node type cage can be used to determine if cage is present. If AQC + * returns error (ENOENT), then no cage present. If no cage present then + * connection type is backplane or BASE-T. + */ + return !ice_aq_get_link_topo_handle(pi, + ICE_AQC_LINK_TOPO_NODE_TYPE_CAGE, + NULL); +} + /** * ice_get_media_type - Gets media type * @pi: port information structure @@ -187,6 +302,18 @@ static enum ice_media_type ice_get_media_type(struct ice_port_info *pi) return ICE_MEDIA_UNKNOWN; if (hw_link_info->phy_type_low) { + /* 1G SGMII is a special case where some DA cable PHYs + * may show this as an option when it really shouldn't + * be since SGMII is meant to be between a MAC and a PHY + * in a backplane. Try to detect this case and handle it + */ + if (hw_link_info->phy_type_low == ICE_PHY_TYPE_LOW_1G_SGMII && + (hw_link_info->module_type[ICE_AQC_MOD_TYPE_IDENT] == + ICE_AQC_MOD_TYPE_BYTE1_SFP_PLUS_CU_ACTIVE || + hw_link_info->module_type[ICE_AQC_MOD_TYPE_IDENT] == + ICE_AQC_MOD_TYPE_BYTE1_SFP_PLUS_CU_PASSIVE)) + return ICE_MEDIA_DA; + switch (hw_link_info->phy_type_low) { case ICE_PHY_TYPE_LOW_1000BASE_SX: case ICE_PHY_TYPE_LOW_1000BASE_LX: @@ -195,7 +322,6 @@ static enum ice_media_type ice_get_media_type(struct ice_port_info *pi) case ICE_PHY_TYPE_LOW_10G_SFI_C2C: case ICE_PHY_TYPE_LOW_25GBASE_SR: case ICE_PHY_TYPE_LOW_25GBASE_LR: - case ICE_PHY_TYPE_LOW_25G_AUI_C2C: case ICE_PHY_TYPE_LOW_40GBASE_SR4: case ICE_PHY_TYPE_LOW_40GBASE_LR4: case ICE_PHY_TYPE_LOW_50GBASE_SR2: @@ -208,6 +334,15 @@ static enum ice_media_type ice_get_media_type(struct ice_port_info *pi) case ICE_PHY_TYPE_LOW_100GBASE_SR2: case ICE_PHY_TYPE_LOW_100GBASE_DR: return ICE_MEDIA_FIBER; + case ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC: + case ICE_PHY_TYPE_LOW_25G_AUI_AOC_ACC: + case ICE_PHY_TYPE_LOW_40G_XLAUI_AOC_ACC: + case ICE_PHY_TYPE_LOW_50G_LAUI2_AOC_ACC: + case ICE_PHY_TYPE_LOW_50G_AUI2_AOC_ACC: + case ICE_PHY_TYPE_LOW_50G_AUI1_AOC_ACC: + case ICE_PHY_TYPE_LOW_100G_CAUI4_AOC_ACC: + case ICE_PHY_TYPE_LOW_100G_AUI4_AOC_ACC: + return ICE_MEDIA_FIBER; case ICE_PHY_TYPE_LOW_100BASE_TX: case ICE_PHY_TYPE_LOW_1000BASE_T: case ICE_PHY_TYPE_LOW_2500BASE_T: @@ -226,6 +361,16 @@ static enum ice_media_type ice_get_media_type(struct ice_port_info *pi) case ICE_PHY_TYPE_LOW_100GBASE_CR_PAM4: case ICE_PHY_TYPE_LOW_100GBASE_CP2: return ICE_MEDIA_DA; + case ICE_PHY_TYPE_LOW_25G_AUI_C2C: + case ICE_PHY_TYPE_LOW_40G_XLAUI: + case ICE_PHY_TYPE_LOW_50G_LAUI2: + case ICE_PHY_TYPE_LOW_50G_AUI2: + case ICE_PHY_TYPE_LOW_50G_AUI1: + case ICE_PHY_TYPE_LOW_100G_AUI4: + case ICE_PHY_TYPE_LOW_100G_CAUI4: + if (ice_is_media_cage_present(pi)) + return ICE_MEDIA_AUI; + 
/* fall-through */ case ICE_PHY_TYPE_LOW_1000BASE_KX: case ICE_PHY_TYPE_LOW_2500BASE_KX: case ICE_PHY_TYPE_LOW_2500BASE_X: @@ -243,13 +388,22 @@ static enum ice_media_type ice_get_media_type(struct ice_port_info *pi) } } else { switch (hw_link_info->phy_type_high) { + case ICE_PHY_TYPE_HIGH_100G_AUI2: + case ICE_PHY_TYPE_HIGH_100G_CAUI2: + if (ice_is_media_cage_present(pi)) + return ICE_MEDIA_AUI; + /* fall-through */ case ICE_PHY_TYPE_HIGH_100GBASE_KR2_PAM4: return ICE_MEDIA_BACKPLANE; + case ICE_PHY_TYPE_HIGH_100G_CAUI2_AOC_ACC: + case ICE_PHY_TYPE_HIGH_100G_AUI2_AOC_ACC: + return ICE_MEDIA_FIBER; } } return ICE_MEDIA_UNKNOWN; } + /** * ice_aq_get_link_info * @pi: port information structure @@ -277,6 +431,8 @@ ice_aq_get_link_info(struct ice_port_info *pi, bool ena_lse, if (!pi) return ICE_ERR_PARAM; hw = pi->hw; + + li_old = &pi->phy.link_info_old; hw_media_type = &pi->phy.media_type; li = &pi->phy.link_info; @@ -302,6 +458,7 @@ ice_aq_get_link_info(struct ice_port_info *pi, bool ena_lse, li->phy_type_high = le64_to_cpu(link_data.phy_type_high); *hw_media_type = ice_get_media_type(pi); li->link_info = link_data.link_info; + li->link_cfg_err = link_data.link_cfg_err; li->an_info = link_data.an_info; li->ext_info = link_data.ext_info; li->max_frame_size = le16_to_cpu(link_data.max_frame_size); @@ -324,18 +481,22 @@ ice_aq_get_link_info(struct ice_port_info *pi, bool ena_lse, li->lse_ena = !!(resp->cmd_flags & cpu_to_le16(ICE_AQ_LSE_IS_ENABLED)); - ice_debug(hw, ICE_DBG_LINK, "link_speed = 0x%x\n", li->link_speed); - ice_debug(hw, ICE_DBG_LINK, "phy_type_low = 0x%llx\n", + ice_debug(hw, ICE_DBG_LINK, "get link info\n"); + ice_debug(hw, ICE_DBG_LINK, " link_speed = 0x%x\n", li->link_speed); + ice_debug(hw, ICE_DBG_LINK, " phy_type_low = 0x%llx\n", (unsigned long long)li->phy_type_low); - ice_debug(hw, ICE_DBG_LINK, "phy_type_high = 0x%llx\n", + ice_debug(hw, ICE_DBG_LINK, " phy_type_high = 0x%llx\n", (unsigned long long)li->phy_type_high); - ice_debug(hw, ICE_DBG_LINK, "media_type = 0x%x\n", *hw_media_type); - ice_debug(hw, ICE_DBG_LINK, "link_info = 0x%x\n", li->link_info); - ice_debug(hw, ICE_DBG_LINK, "an_info = 0x%x\n", li->an_info); - ice_debug(hw, ICE_DBG_LINK, "ext_info = 0x%x\n", li->ext_info); - ice_debug(hw, ICE_DBG_LINK, "lse_ena = 0x%x\n", li->lse_ena); - ice_debug(hw, ICE_DBG_LINK, "max_frame = 0x%x\n", li->max_frame_size); - ice_debug(hw, ICE_DBG_LINK, "pacing = 0x%x\n", li->pacing); + ice_debug(hw, ICE_DBG_LINK, " media_type = 0x%x\n", *hw_media_type); + ice_debug(hw, ICE_DBG_LINK, " link_info = 0x%x\n", li->link_info); + ice_debug(hw, ICE_DBG_LINK, " link_cfg_err = 0x%x\n", li->link_cfg_err); + ice_debug(hw, ICE_DBG_LINK, " an_info = 0x%x\n", li->an_info); + ice_debug(hw, ICE_DBG_LINK, " ext_info = 0x%x\n", li->ext_info); + ice_debug(hw, ICE_DBG_LINK, " fec_info = 0x%x\n", li->fec_info); + ice_debug(hw, ICE_DBG_LINK, " lse_ena = 0x%x\n", li->lse_ena); + ice_debug(hw, ICE_DBG_LINK, " max_frame = 0x%x\n", + li->max_frame_size); + ice_debug(hw, ICE_DBG_LINK, " pacing = 0x%x\n", li->pacing); /* save link status information */ if (link) @@ -348,85 +509,68 @@ ice_aq_get_link_info(struct ice_port_info *pi, bool ena_lse, } /** - * ice_init_flex_flags - * @hw: pointer to the hardware structure - * @prof_id: Rx Descriptor Builder profile ID + * ice_fill_tx_timer_and_fc_thresh + * @hw: pointer to the HW struct + * @cmd: pointer to MAC cfg structure * - * Function to initialize Rx flex flags + * Add Tx timer and FC refresh threshold info to Set MAC Config AQ command + * descriptor */ 
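For readers tracing the new Set MAC Config path, here is a minimal caller sketch. It is illustrative only: the wrapper name example_set_max_frame and its errno mapping are assumptions and are not added by this patch; ice_aq_set_mac_cfg() and the PRTMAC_HSEC_CTL_* register reads it relies on are introduced in the hunk below.

/* Illustrative sketch (assumed wrapper, not part of this patch). */
static int example_set_max_frame(struct ice_hw *hw, u16 max_frame)
{
	enum ice_status status;

	/* ice_aq_set_mac_cfg() rejects a zero max frame size */
	if (!max_frame)
		return -EINVAL;

	/* Fills the LFC Tx pause timer and FC refresh threshold from the
	 * PRTMAC_HSEC_CTL_* registers, then sends AQ opcode 0x0603.
	 */
	status = ice_aq_set_mac_cfg(hw, max_frame, NULL);

	return status ? -EIO : 0;
}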
-static void ice_init_flex_flags(struct ice_hw *hw, enum ice_rxdid prof_id) +static void +ice_fill_tx_timer_and_fc_thresh(struct ice_hw *hw, + struct ice_aqc_set_mac_cfg *cmd) { - u8 idx = 0; + u16 fc_thres_val, tx_timer_val; + u32 val; - /* Flex-flag fields (0-2) are programmed with FLG64 bits with layout: - * flexiflags0[5:0] - TCP flags, is_packet_fragmented, is_packet_UDP_GRE - * flexiflags1[3:0] - Not used for flag programming - * flexiflags2[7:0] - Tunnel and VLAN types - * 2 invalid fields in last index - */ - switch (prof_id) { - /* Rx flex flags are currently programmed for the NIC profiles only. - * Different flag bit programming configurations can be added per - * profile as needed. + /* We read back the transmit timer and fc threshold value of + * LFC. Thus, we will use index = + * PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_MAX_INDEX. + * + * Also, because we are operating on transmit timer and fc + * threshold of LFC, we don't turn on any bit in tx_tmr_priority */ - case ICE_RXDID_FLEX_NIC: - case ICE_RXDID_FLEX_NIC_2: - ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_PKT_FRG, - ICE_FLG_UDP_GRE, ICE_FLG_PKT_DSI, - ICE_FLG_FIN, idx++); - /* flex flag 1 is not used for flexi-flag programming, skipping - * these four FLG64 bits. - */ - ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_SYN, ICE_FLG_RST, - ICE_FLG_PKT_DSI, ICE_FLG_PKT_DSI, idx++); - ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_PKT_DSI, - ICE_FLG_PKT_DSI, ICE_FLG_EVLAN_x8100, - ICE_FLG_EVLAN_x9100, idx++); - ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_VLAN_x8100, - ICE_FLG_TNL_VLAN, ICE_FLG_TNL_MAC, - ICE_FLG_TNL0, idx++); - ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_TNL1, ICE_FLG_TNL2, - ICE_FLG_PKT_DSI, ICE_FLG_PKT_DSI, idx); - break; +#define IDX_OF_LFC PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_MAX_INDEX - default: - ice_debug(hw, ICE_DBG_INIT, - "Flag programming for profile ID %d not supported\n", - prof_id); - } + /* Retrieve the transmit timer */ + val = rd32(hw, PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA(IDX_OF_LFC)); + tx_timer_val = val & + PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_HSEC_CTL_TX_PAUSE_QUANTA_M; + cmd->tx_tmr_value = cpu_to_le16(tx_timer_val); + + /* Retrieve the fc threshold */ + val = rd32(hw, PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER(IDX_OF_LFC)); + fc_thres_val = val & PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_M; + + cmd->fc_refresh_threshold = cpu_to_le16(fc_thres_val); } /** - * ice_init_flex_flds - * @hw: pointer to the hardware structure - * @prof_id: Rx Descriptor Builder profile ID + * ice_aq_set_mac_cfg + * @hw: pointer to the HW struct + * @max_frame_size: Maximum Frame Size to be supported + * @cd: pointer to command details structure or NULL * - * Function to initialize flex descriptors + * Set MAC configuration (0x0603) */ -static void ice_init_flex_flds(struct ice_hw *hw, enum ice_rxdid prof_id) +enum ice_status +ice_aq_set_mac_cfg(struct ice_hw *hw, u16 max_frame_size, struct ice_sq_cd *cd) { - enum ice_flex_rx_mdid mdid; + struct ice_aqc_set_mac_cfg *cmd; + struct ice_aq_desc desc; - switch (prof_id) { - case ICE_RXDID_FLEX_NIC: - case ICE_RXDID_FLEX_NIC_2: - ICE_PROG_FLEX_ENTRY(hw, prof_id, ICE_RX_MDID_HASH_LOW, 0); - ICE_PROG_FLEX_ENTRY(hw, prof_id, ICE_RX_MDID_HASH_HIGH, 1); - ICE_PROG_FLEX_ENTRY(hw, prof_id, ICE_RX_MDID_FLOW_ID_LOWER, 2); + cmd = &desc.params.set_mac_cfg; - mdid = (prof_id == ICE_RXDID_FLEX_NIC_2) ?
- ICE_RX_MDID_SRC_VSI : ICE_RX_MDID_FLOW_ID_HIGH; + if (max_frame_size == 0) + return ICE_ERR_PARAM; - ICE_PROG_FLEX_ENTRY(hw, prof_id, mdid, 3); + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_mac_cfg); - ice_init_flex_flags(hw, prof_id); - break; + cmd->max_frame_size = cpu_to_le16(max_frame_size); - default: - ice_debug(hw, ICE_DBG_INIT, - "Field init for profile ID %d not supported\n", - prof_id); - } + ice_fill_tx_timer_and_fc_thresh(hw, cmd); + + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); } /** @@ -440,14 +584,16 @@ static enum ice_status ice_init_fltr_mgmt_struct(struct ice_hw *hw) hw->switch_info = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*hw->switch_info), GFP_KERNEL); + sw = hw->switch_info; if (!sw) return ICE_ERR_NO_MEMORY; INIT_LIST_HEAD(&sw->vsi_list_map_head); + sw->prof_res_bm_init = 0; - status = ice_init_def_sw_recp(hw); + status = ice_init_def_sw_recp(hw, &hw->switch_info->recp_list); if (status) { devm_kfree(ice_hw_to_dev(hw), hw->switch_info); return status; @@ -456,262 +602,83 @@ static enum ice_status ice_init_fltr_mgmt_struct(struct ice_hw *hw) } /** - * ice_cleanup_fltr_mgmt_struct - cleanup filter management list and locks + * ice_cleanup_fltr_mgmt_single - clears single filter mngt struct * @hw: pointer to the HW struct + * @sw: pointer to switch info struct for which function clears filters */ -static void ice_cleanup_fltr_mgmt_struct(struct ice_hw *hw) +static void +ice_cleanup_fltr_mgmt_single(struct ice_hw *hw, struct ice_switch_info *sw) { - struct ice_switch_info *sw = hw->switch_info; struct ice_vsi_list_map_info *v_pos_map; struct ice_vsi_list_map_info *v_tmp_map; struct ice_sw_recipe *recps; u8 i; + if (!sw) + return; + list_for_each_entry_safe(v_pos_map, v_tmp_map, &sw->vsi_list_map_head, list_entry) { list_del(&v_pos_map->list_entry); devm_kfree(ice_hw_to_dev(hw), v_pos_map); } - recps = hw->switch_info->recp_list; - for (i = 0; i < ICE_SW_LKUP_LAST; i++) { - struct ice_fltr_mgmt_list_entry *lst_itr, *tmp_entry; + recps = sw->recp_list; + for (i = 0; i < ICE_MAX_NUM_RECIPES; i++) { + struct ice_recp_grp_entry *rg_entry, *tmprg_entry; recps[i].root_rid = i; - mutex_destroy(&recps[i].filt_rule_lock); - list_for_each_entry_safe(lst_itr, tmp_entry, - &recps[i].filt_rules, list_entry) { - list_del(&lst_itr->list_entry); - devm_kfree(ice_hw_to_dev(hw), lst_itr); - } - } - ice_rm_all_sw_replay_rule_info(hw); - devm_kfree(ice_hw_to_dev(hw), sw->recp_list); - devm_kfree(ice_hw_to_dev(hw), sw); -} - -#define ICE_FW_LOG_DESC_SIZE(n) (sizeof(struct ice_aqc_fw_logging_data) + \ - (((n) - 1) * sizeof(((struct ice_aqc_fw_logging_data *)0)->entry))) -#define ICE_FW_LOG_DESC_SIZE_MAX \ - ICE_FW_LOG_DESC_SIZE(ICE_AQC_FW_LOG_ID_MAX) - -/** - * ice_get_fw_log_cfg - get FW logging configuration - * @hw: pointer to the HW struct - */ -static enum ice_status ice_get_fw_log_cfg(struct ice_hw *hw) -{ - struct ice_aqc_fw_logging_data *config; - struct ice_aq_desc desc; - enum ice_status status; - u16 size; - - size = ICE_FW_LOG_DESC_SIZE_MAX; - config = devm_kzalloc(ice_hw_to_dev(hw), size, GFP_KERNEL); - if (!config) - return ICE_ERR_NO_MEMORY; - - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logging_info); - - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_BUF); - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - - status = ice_aq_send_cmd(hw, &desc, config, size, NULL); - if (!status) { - u16 i; - - /* Save FW logging information into the HW structure */ - for (i = 0; i < ICE_AQC_FW_LOG_ID_MAX; i++) { - u16 v, m, flgs; - - v = le16_to_cpu(config->entry[i]); - m = (v & 
ICE_AQC_FW_LOG_ID_M) >> ICE_AQC_FW_LOG_ID_S; - flgs = (v & ICE_AQC_FW_LOG_EN_M) >> ICE_AQC_FW_LOG_EN_S; - - if (m < ICE_AQC_FW_LOG_ID_MAX) - hw->fw_log.evnts[m].cur = flgs; + list_for_each_entry_safe(rg_entry, tmprg_entry, + &recps[i].rg_list, l_entry) { + list_del(&rg_entry->l_entry); + devm_kfree(ice_hw_to_dev(hw), rg_entry); } - } - - devm_kfree(ice_hw_to_dev(hw), config); - - return status; -} - -/** - * ice_cfg_fw_log - configure FW logging - * @hw: pointer to the HW struct - * @enable: enable certain FW logging events if true, disable all if false - * - * This function enables/disables the FW logging via Rx CQ events and a UART - * port based on predetermined configurations. FW logging via the Rx CQ can be - * enabled/disabled for individual PF's. However, FW logging via the UART can - * only be enabled/disabled for all PFs on the same device. - * - * To enable overall FW logging, the "cq_en" and "uart_en" enable bits in - * hw->fw_log need to be set accordingly, e.g. based on user-provided input, - * before initializing the device. - * - * When re/configuring FW logging, callers need to update the "cfg" elements of - * the hw->fw_log.evnts array with the desired logging event configurations for - * modules of interest. When disabling FW logging completely, the callers can - * just pass false in the "enable" parameter. On completion, the function will - * update the "cur" element of the hw->fw_log.evnts array with the resulting - * logging event configurations of the modules that are being re/configured. FW - * logging modules that are not part of a reconfiguration operation retain their - * previous states. - * - * Before resetting the device, it is recommended that the driver disables FW - * logging before shutting down the control queue. When disabling FW logging - * ("enable" = false), the latest configurations of FW logging events stored in - * hw->fw_log.evnts[] are not overridden to allow them to be reconfigured after - * a device reset. - * - * When enabling FW logging to emit log messages via the Rx CQ during the - * device's initialization phase, a mechanism alternative to interrupt handlers - * needs to be used to extract FW log messages from the Rx CQ periodically and - * to prevent the Rx CQ from being full and stalling other types of control - * messages from FW to SW. Interrupts are typically disabled during the device's - * initialization phase. - */ -static enum ice_status ice_cfg_fw_log(struct ice_hw *hw, bool enable) -{ - struct ice_aqc_fw_logging_data *data = NULL; - struct ice_aqc_fw_logging *cmd; - enum ice_status status = 0; - u16 i, chgs = 0, len = 0; - struct ice_aq_desc desc; - u8 actv_evnts = 0; - void *buf = NULL; - - if (!hw->fw_log.cq_en && !hw->fw_log.uart_en) - return 0; - - /* Disable FW logging only when the control queue is still responsive */ - if (!enable && - (!hw->fw_log.actv_evnts || !ice_check_sq_alive(hw, &hw->adminq))) - return 0; - /* Get current FW log settings */ - status = ice_get_fw_log_cfg(hw); - if (status) - return status; - - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logging); - cmd = &desc.params.fw_logging; - - /* Indicate which controls are valid */ - if (hw->fw_log.cq_en) - cmd->log_ctrl_valid |= ICE_AQC_FW_LOG_AQ_VALID; - - if (hw->fw_log.uart_en) - cmd->log_ctrl_valid |= ICE_AQC_FW_LOG_UART_VALID; - - if (enable) { - /* Fill in an array of entries with FW logging modules and - * logging events being reconfigured. 
- */ - for (i = 0; i < ICE_AQC_FW_LOG_ID_MAX; i++) { - u16 val; - - /* Keep track of enabled event types */ - actv_evnts |= hw->fw_log.evnts[i].cfg; - - if (hw->fw_log.evnts[i].cfg == hw->fw_log.evnts[i].cur) - continue; - - if (!data) { - data = devm_kzalloc(ice_hw_to_dev(hw), - ICE_FW_LOG_DESC_SIZE_MAX, - GFP_KERNEL); - if (!data) - return ICE_ERR_NO_MEMORY; + if (recps[i].adv_rule) { + struct ice_adv_fltr_mgmt_list_entry *tmp_entry; + struct ice_adv_fltr_mgmt_list_entry *lst_itr; + + mutex_destroy(&recps[i].filt_rule_lock); + list_for_each_entry_safe(lst_itr, tmp_entry, + &recps[i].filt_rules, + list_entry) { + list_del(&lst_itr->list_entry); + devm_kfree(ice_hw_to_dev(hw), lst_itr->lkups); + devm_kfree(ice_hw_to_dev(hw), lst_itr); } - - val = i << ICE_AQC_FW_LOG_ID_S; - val |= hw->fw_log.evnts[i].cfg << ICE_AQC_FW_LOG_EN_S; - data->entry[chgs++] = cpu_to_le16(val); - } - - /* Only enable FW logging if at least one module is specified. - * If FW logging is currently enabled but all modules are not - * enabled to emit log messages, disable FW logging altogether. - */ - if (actv_evnts) { - /* Leave if there is effectively no change */ - if (!chgs) - goto out; - - if (hw->fw_log.cq_en) - cmd->log_ctrl |= ICE_AQC_FW_LOG_AQ_EN; - - if (hw->fw_log.uart_en) - cmd->log_ctrl |= ICE_AQC_FW_LOG_UART_EN; - - buf = data; - len = ICE_FW_LOG_DESC_SIZE(chgs); - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - } - } - - status = ice_aq_send_cmd(hw, &desc, buf, len, NULL); - if (!status) { - /* Update the current configuration to reflect events enabled. - * hw->fw_log.cq_en and hw->fw_log.uart_en indicate if the FW - * logging mode is enabled for the device. They do not reflect - * actual modules being enabled to emit log messages. So, their - * values remain unchanged even when all modules are disabled. - */ - u16 cnt = enable ? chgs : (u16)ICE_AQC_FW_LOG_ID_MAX; - - hw->fw_log.actv_evnts = actv_evnts; - for (i = 0; i < cnt; i++) { - u16 v, m; - - if (!enable) { - /* When disabling all FW logging events as part - * of device's de-initialization, the original - * configurations are retained, and can be used - * to reconfigure FW logging later if the device - * is re-initialized. - */ - hw->fw_log.evnts[i].cur = 0; - continue; + } else { + struct ice_fltr_mgmt_list_entry *lst_itr, *tmp_entry; + + mutex_destroy(&recps[i].filt_rule_lock); + list_for_each_entry_safe(lst_itr, tmp_entry, + &recps[i].filt_rules, + list_entry) { + list_del(&lst_itr->list_entry); + devm_kfree(ice_hw_to_dev(hw), lst_itr); } - - v = le16_to_cpu(data->entry[i]); - m = (v & ICE_AQC_FW_LOG_ID_M) >> ICE_AQC_FW_LOG_ID_S; - hw->fw_log.evnts[m].cur = hw->fw_log.evnts[m].cfg; } + if (recps[i].root_buf) + devm_kfree(ice_hw_to_dev(hw), recps[i].root_buf); } - -out: - if (data) - devm_kfree(ice_hw_to_dev(hw), data); - - return status; + ice_rm_sw_replay_rule_info(hw, sw); + devm_kfree(ice_hw_to_dev(hw), sw->recp_list); + devm_kfree(ice_hw_to_dev(hw), sw); } /** - * ice_output_fw_log + * ice_cleanup_fltr_mgmt_struct - cleanup filter management list and locks * @hw: pointer to the HW struct - * @desc: pointer to the AQ message descriptor - * @buf: pointer to the buffer accompanying the AQ message - * - * Formats a FW Log message and outputs it via the standard driver logs. 
*/ -void ice_output_fw_log(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf) +static void ice_cleanup_fltr_mgmt_struct(struct ice_hw *hw) { - ice_debug(hw, ICE_DBG_FW_LOG, "[ FW Log Msg Start ]\n"); - ice_debug_array(hw, ICE_DBG_FW_LOG, 16, 1, (u8 *)buf, - le16_to_cpu(desc->datalen)); - ice_debug(hw, ICE_DBG_FW_LOG, "[ FW Log Msg End ]\n"); + ice_cleanup_fltr_mgmt_single(hw, hw->switch_info); } + /** - * ice_get_itr_intrl_gran - determine int/intrl granularity + * ice_get_itr_intrl_gran * @hw: pointer to the HW struct * - * Determines the ITR/intrl granularities based on the maximum aggregate + * Determines the ITR/INTRL granularities based on the maximum aggregate * bandwidth according to the device's configuration during power-on. */ static void ice_get_itr_intrl_gran(struct ice_hw *hw) @@ -735,26 +702,36 @@ static void ice_get_itr_intrl_gran(struct ice_hw *hw) } /** - * ice_get_nvm_version - get cached NVM version data + * ice_print_rollback_msg - print FW rollback message * @hw: pointer to the hardware structure - * @oem_ver: 8 bit NVM version - * @oem_build: 16 bit NVM build number - * @oem_patch: 8 NVM patch number - * @ver_hi: high 16 bits of the NVM version - * @ver_lo: low 16 bits of the NVM version */ -void -ice_get_nvm_version(struct ice_hw *hw, u8 *oem_ver, u16 *oem_build, - u8 *oem_patch, u8 *ver_hi, u8 *ver_lo) +void ice_print_rollback_msg(struct ice_hw *hw) { - struct ice_nvm_info *nvm = &hw->nvm; + char nvm_str[ICE_NVM_VER_LEN] = { 0 }; + struct ice_orom_info *orom; + struct ice_nvm_info *nvm; + + orom = &hw->flash.orom; + nvm = &hw->flash.nvm; + + snprintf(nvm_str, sizeof(nvm_str), "%x.%02x 0x%x %d.%d.%d", + nvm->major, nvm->minor, nvm->eetrack, orom->major, + orom->build, orom->patch); + dev_warn(ice_hw_to_dev(hw), + "Firmware rollback mode detected. Current version is NVM: %s, FW: %d.%d. Device may exhibit limited functionality. Refer to the Intel(R) Ethernet Adapters and Devices User Guide for details on firmware rollback mode\n", + nvm_str, hw->fw_maj_ver, hw->fw_min_ver); +} + - *oem_ver = (u8)((nvm->oem_ver & ICE_OEM_VER_MASK) >> ICE_OEM_VER_SHIFT); - *oem_patch = (u8)(nvm->oem_ver & ICE_OEM_VER_PATCH_MASK); - *oem_build = (u16)((nvm->oem_ver & ICE_OEM_VER_BUILD_MASK) >> - ICE_OEM_VER_BUILD_SHIFT); - *ver_hi = (nvm->ver & ICE_NVM_VER_HI_MASK) >> ICE_NVM_VER_HI_SHIFT; - *ver_lo = (nvm->ver & ICE_NVM_VER_LO_MASK) >> ICE_NVM_VER_LO_SHIFT; +/** + * ice_set_umac_shared + * @hw: pointer to the hw struct + * + * Set boolean flag to allow unicast MAC sharing + */ +void ice_set_umac_shared(struct ice_hw *hw) +{ + hw->umac_shared = true; } /** @@ -774,34 +751,56 @@ enum ice_status ice_init_hw(struct ice_hw *hw) return status; hw->pf_id = (u8)(rd32(hw, PF_FUNC_RID) & - PF_FUNC_RID_FUNC_NUM_M) >> - PF_FUNC_RID_FUNC_NUM_S; + PF_FUNC_RID_FUNCTION_NUMBER_M) >> + PF_FUNC_RID_FUNCTION_NUMBER_S; + status = ice_reset(hw, ICE_RESET_PFR); if (status) return status; - ice_get_itr_intrl_gran(hw); + status = ice_create_all_ctrlq(hw); if (status) goto err_unroll_cqinit; - /* Enable FW logging. Not fatal if this fails. 
*/ - status = ice_cfg_fw_log(hw, true); - if (status) - ice_debug(hw, ICE_DBG_INIT, "Failed to enable FW logging.\n"); + ice_fwlog_set_support_ena(hw); + status = ice_fwlog_set(hw, &hw->fwlog_cfg); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to enable FW logging, status %d.\n", + status); + } else { + if (hw->fwlog_cfg.options & ICE_FWLOG_OPTION_REGISTER_ON_INIT) { + status = ice_fwlog_register(hw); + if (status) + ice_debug(hw, ICE_DBG_INIT, "Failed to register for FW logging events, status %d.\n", + status); + } else { + status = ice_fwlog_unregister(hw); + if (status) + ice_debug(hw, ICE_DBG_INIT, "Failed to unregister for FW logging events, status %d.\n", + status); + } + } - status = ice_clear_pf_cfg(hw); + status = ice_init_nvm(hw); if (status) goto err_unroll_cqinit; - ice_clear_pxe_mode(hw); + if (ice_get_fw_mode(hw) == ICE_FW_MODE_ROLLBACK) + ice_print_rollback_msg(hw); - status = ice_init_nvm(hw); + status = ice_clear_pf_cfg(hw); if (status) goto err_unroll_cqinit; + /* Set bit to enable Flow Director filters */ + wr32(hw, PFQF_FD_ENA, PFQF_FD_ENA_FD_ENA_M); + INIT_LIST_HEAD(&hw->fdir_list_head); + + ice_clear_pxe_mode(hw); + status = ice_get_caps(hw); if (status) goto err_unroll_cqinit; @@ -822,20 +821,18 @@ enum ice_status ice_init_hw(struct ice_hw *hw) goto err_unroll_alloc; hw->evb_veb = true; - /* Query the allocated resources for Tx scheduler */ status = ice_sched_query_res_alloc(hw); if (status) { - ice_debug(hw, ICE_DBG_SCHED, - "Failed to get scheduler allocated resources\n"); + ice_debug(hw, ICE_DBG_SCHED, "Failed to get scheduler allocated resources\n"); goto err_unroll_alloc; } + ice_sched_get_psm_clk_freq(hw); /* Initialize port_info struct with scheduler data */ status = ice_sched_init_port(hw->port_info); if (status) goto err_unroll_sched; - pcaps = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*pcaps), GFP_KERNEL); if (!pcaps) { status = ICE_ERR_NO_MEMORY; @@ -844,16 +841,17 @@ enum ice_status ice_init_hw(struct ice_hw *hw) /* Initialize port_info struct with PHY capabilities */ status = ice_aq_get_phy_caps(hw->port_info, false, - ICE_AQC_REPORT_TOPO_CAP, pcaps, NULL); + ICE_AQC_REPORT_TOPO_CAP_MEDIA, pcaps, NULL); devm_kfree(ice_hw_to_dev(hw), pcaps); if (status) - goto err_unroll_sched; + dev_warn(ice_hw_to_dev(hw), + "Get PHY capabilities failed status = %d, continuing anyway\n", + status); /* Initialize port_info struct with link information */ status = ice_aq_get_link_info(hw->port_info, false, NULL, NULL); if (status) goto err_unroll_sched; - /* need a valid SW entry point to build a Tx tree */ if (!hw->sw_entry_point_layer) { ice_debug(hw, ICE_DBG_SCHED, "invalid sw entry point\n"); @@ -861,14 +859,16 @@ enum ice_status ice_init_hw(struct ice_hw *hw) goto err_unroll_sched; } INIT_LIST_HEAD(&hw->agg_list); - + /* Initialize max burst size */ + if (!hw->max_burst_size) + ice_cfg_rl_burst_size(hw, ICE_SCHED_DFLT_BURST_SIZE); status = ice_init_fltr_mgmt_struct(hw); if (status) goto err_unroll_sched; - ice_dev_onetime_setup(hw); /* Get MAC information */ + /* A single port can report up to two (LAN and WoL) addresses */ mac_buf = devm_kcalloc(ice_hw_to_dev(hw), 2, sizeof(struct ice_aqc_manage_mac_read_resp), @@ -885,12 +885,15 @@ enum ice_status ice_init_hw(struct ice_hw *hw) if (status) goto err_unroll_fltr_mgmt_struct; - - ice_init_flex_flds(hw, ICE_RXDID_FLEX_NIC); - ice_init_flex_flds(hw, ICE_RXDID_FLEX_NIC_2); + /* Obtain counter base index which would be used by flow director */ + status = ice_alloc_fd_res_cntr(hw, &hw->fd_ctr_base); + if (status) + 
goto err_unroll_fltr_mgmt_struct; status = ice_init_hw_tbls(hw); if (status) goto err_unroll_fltr_mgmt_struct; + mutex_init(&hw->tnl_lock); + return 0; err_unroll_fltr_mgmt_struct: @@ -899,6 +902,7 @@ enum ice_status ice_init_hw(struct ice_hw *hw) ice_sched_cleanup_all(hw); err_unroll_alloc: devm_kfree(ice_hw_to_dev(hw), hw->port_info); + hw->port_info = NULL; err_unroll_cqinit: ice_destroy_all_ctrlq(hw); return status; @@ -914,20 +918,20 @@ enum ice_status ice_init_hw(struct ice_hw *hw) */ void ice_deinit_hw(struct ice_hw *hw) { + ice_free_fd_res_cntr(hw, hw->fd_ctr_base); ice_cleanup_fltr_mgmt_struct(hw); ice_sched_cleanup_all(hw); ice_sched_clear_agg(hw); ice_free_seg(hw); ice_free_hw_tbls(hw); + mutex_destroy(&hw->tnl_lock); if (hw->port_info) { devm_kfree(ice_hw_to_dev(hw), hw->port_info); hw->port_info = NULL; } - /* Attempt to disable FW logging before shutting down control queues */ - ice_cfg_fw_log(hw, false); ice_destroy_all_ctrlq(hw); /* Clear VSI contexts if not already cleared */ @@ -940,25 +944,24 @@ void ice_deinit_hw(struct ice_hw *hw) */ enum ice_status ice_check_reset(struct ice_hw *hw) { - u32 cnt, reg = 0, grst_delay, uld_mask; + u32 cnt, reg = 0, grst_timeout, uld_mask; /* Poll for Device Active state in case a recent CORER, GLOBR, * or EMPR has occurred. The grst delay value is in 100ms units. * Add 1sec for outstanding AQ commands that can take a long time. */ - grst_delay = ((rd32(hw, GLGEN_RSTCTL) & GLGEN_RSTCTL_GRSTDEL_M) >> - GLGEN_RSTCTL_GRSTDEL_S) + 10; + grst_timeout = ((rd32(hw, GLGEN_RSTCTL) & GLGEN_RSTCTL_GRSTDEL_M) >> + GLGEN_RSTCTL_GRSTDEL_S) + 10; - for (cnt = 0; cnt < grst_delay; cnt++) { - mdelay(100); + for (cnt = 0; cnt < grst_timeout; cnt++) { + msleep(100); reg = rd32(hw, GLGEN_RSTAT); if (!(reg & GLGEN_RSTAT_DEVSTATE_M)) break; } - if (cnt == grst_delay) { - ice_debug(hw, ICE_DBG_INIT, - "Global reset polling failed to complete.\n"); + if (cnt == grst_timeout) { + ice_debug(hw, ICE_DBG_INIT, "Global reset polling failed to complete.\n"); return ICE_ERR_RESET_FAILED; } @@ -970,22 +973,21 @@ enum ice_status ice_check_reset(struct ice_hw *hw) GLNVM_ULD_POR_DONE_1_M |\ GLNVM_ULD_PCIER_DONE_2_M) - uld_mask = ICE_RESET_DONE_MASK; + uld_mask = ICE_RESET_DONE_MASK | (hw->func_caps.common_cap.iwarp ? + GLNVM_ULD_PE_DONE_M : 0); /* Device is Active; check Global Reset processes are done */ for (cnt = 0; cnt < ICE_PF_RESET_WAIT_COUNT; cnt++) { reg = rd32(hw, GLNVM_ULD) & uld_mask; if (reg == uld_mask) { - ice_debug(hw, ICE_DBG_INIT, - "Global reset processes done. %d\n", cnt); + ice_debug(hw, ICE_DBG_INIT, "Global reset processes done. %d\n", cnt); break; } - mdelay(10); + msleep(10); } if (cnt == ICE_PF_RESET_WAIT_COUNT) { - ice_debug(hw, ICE_DBG_INIT, - "Wait for Reset Done timed out. GLNVM_ULD = 0x%x\n", + ice_debug(hw, ICE_DBG_INIT, "Wait for Reset Done timed out. GLNVM_ULD = 0x%x\n", reg); return ICE_ERR_RESET_FAILED; } @@ -1023,17 +1025,21 @@ static enum ice_status ice_pf_reset(struct ice_hw *hw) wr32(hw, PFGEN_CTRL, (reg | PFGEN_CTRL_PFSWR_M)); - for (cnt = 0; cnt < ICE_PF_RESET_WAIT_COUNT; cnt++) { + /* Wait for the PFR to complete. The wait time is the global config lock + * timeout plus the PFR timeout which will account for a possible reset + * that is occurring during a download package operation. 
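
/*
 * A minimal sketch of the reset-completion poll reworked in ice_check_reset()
 * above: the expected "done" mask is built up front (adding the PE_DONE bit
 * only when the function reports iWARP support), and the status register is
 * polled until every bit in the mask is set or the wait budget runs out. The
 * bit values, the wait count and fake_read_uld() are illustrative stand-ins
 * for the GLNVM_ULD register.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_ULD_PCIER_DONE	0x01u
#define SKETCH_ULD_CORER_DONE	0x02u
#define SKETCH_ULD_GLOBR_DONE	0x04u
#define SKETCH_ULD_PE_DONE	0x08u	/* only expected with iWARP enabled */
#define SKETCH_RESET_WAIT_COUNT	200

static uint32_t fake_read_uld(void)
{
	return SKETCH_ULD_PCIER_DONE | SKETCH_ULD_CORER_DONE |
	       SKETCH_ULD_GLOBR_DONE | SKETCH_ULD_PE_DONE;
}

static bool sketch_wait_reset_done(bool iwarp_ena)
{
	uint32_t uld_mask = SKETCH_ULD_PCIER_DONE | SKETCH_ULD_CORER_DONE |
			    SKETCH_ULD_GLOBR_DONE;
	int cnt;

	if (iwarp_ena)
		uld_mask |= SKETCH_ULD_PE_DONE;

	for (cnt = 0; cnt < SKETCH_RESET_WAIT_COUNT; cnt++) {
		if ((fake_read_uld() & uld_mask) == uld_mask)
			return true;	/* all reset stages reported done */
		/* the driver sleeps here (msleep(10)) instead of spinning */
	}
	return false;
}

int main(void)
{
	printf("reset done: %d\n", sketch_wait_reset_done(true));
	return 0;
}
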
+ */ + for (cnt = 0; cnt < ICE_GLOBAL_CFG_LOCK_TIMEOUT + + ICE_PF_RESET_WAIT_COUNT; cnt++) { reg = rd32(hw, PFGEN_CTRL); if (!(reg & PFGEN_CTRL_PFSWR_M)) break; - mdelay(1); + msleep(1); } if (cnt == ICE_PF_RESET_WAIT_COUNT) { - ice_debug(hw, ICE_DBG_INIT, - "PF reset polling failed to complete.\n"); + ice_debug(hw, ICE_DBG_INIT, "PF reset polling failed to complete.\n"); return ICE_ERR_RESET_FAILED; } @@ -1075,10 +1081,12 @@ enum ice_status ice_reset(struct ice_hw *hw, enum ice_reset_req req) wr32(hw, GLGEN_RTRIG, val); ice_flush(hw); + /* wait for the FW to be ready */ return ice_check_reset(hw); } + /** * ice_copy_rxq_ctx_to_hw * @hw: pointer to the hardware structure @@ -1157,10 +1165,31 @@ ice_write_rxq_ctx(struct ice_hw *hw, struct ice_rlan_ctx *rlan_ctx, rlan_ctx->prefena = 1; - ice_set_ctx((u8 *)rlan_ctx, ctx_buf, ice_rlan_ctx_info); + ice_set_ctx(hw, (u8 *)rlan_ctx, ctx_buf, ice_rlan_ctx_info); return ice_copy_rxq_ctx_to_hw(hw, ctx_buf, rxq_index); } +/** + * ice_clear_rxq_ctx + * @hw: pointer to the hardware structure + * @rxq_index: the index of the Rx queue to clear + * + * Clears rxq context in HW register space + */ +enum ice_status ice_clear_rxq_ctx(struct ice_hw *hw, u32 rxq_index) +{ + u8 i; + + if (rxq_index > QRX_CTRL_MAX_INDEX) + return ICE_ERR_PARAM; + + /* Clear each dword register separately */ + for (i = 0; i < ICE_RXQ_CTX_SIZE_DWORDS; i++) + wr32(hw, QRX_CONTEXT(i, rxq_index), 0); + + return 0; +} + /* LAN Tx Queue Context */ const struct ice_ctx_ele ice_tlan_ctx_info[] = { /* Field Width LSB */ @@ -1196,93 +1225,445 @@ const struct ice_ctx_ele ice_tlan_ctx_info[] = { }; /** - * ice_debug_cq + * ice_copy_tx_cmpltnq_ctx_to_hw * @hw: pointer to the hardware structure - * @mask: debug mask - * @desc: pointer to control queue descriptor - * @buf: pointer to command buffer - * @buf_len: max length of buf + * @ice_tx_cmpltnq_ctx: pointer to the Tx completion queue context + * @tx_cmpltnq_index: the index of the completion queue * - * Dumps debug log about control command with descriptor contents. 
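
/*
 * A minimal sketch of the per-dword queue-context accessors that appear in
 * this area (ice_copy_rxq_ctx_to_hw / ice_clear_rxq_ctx and the Tx variants
 * below): the packed context image is treated as an array of 32-bit words and
 * each word is written to, or zeroed in, one indexed register slot. The
 * regs[][] array stands in for the wr32() register space and the sizes are
 * illustrative.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SKETCH_CTX_SIZE_DWORDS	8
#define SKETCH_MAX_QUEUES	4

static uint32_t regs[SKETCH_MAX_QUEUES][SKETCH_CTX_SIZE_DWORDS]; /* fake HW */

static int sketch_copy_ctx_to_hw(const uint8_t *ctx_buf, unsigned int q_index)
{
	unsigned int i;

	if (q_index >= SKETCH_MAX_QUEUES)
		return -1;
	for (i = 0; i < SKETCH_CTX_SIZE_DWORDS; i++) {
		uint32_t dword;

		memcpy(&dword, ctx_buf + i * sizeof(dword), sizeof(dword));
		regs[q_index][i] = dword;	/* wr32(hw, REG(i, q_index), dword) */
	}
	return 0;
}

static int sketch_clear_ctx(unsigned int q_index)
{
	unsigned int i;

	if (q_index >= SKETCH_MAX_QUEUES)
		return -1;
	for (i = 0; i < SKETCH_CTX_SIZE_DWORDS; i++)
		regs[q_index][i] = 0;		/* wr32(hw, REG(i, q_index), 0) */
	return 0;
}

int main(void)
{
	uint8_t image[SKETCH_CTX_SIZE_DWORDS * sizeof(uint32_t)] = { 0xab, 0xcd };

	sketch_copy_ctx_to_hw(image, 0);
	printf("dword0 = 0x%08x\n", regs[0][0]);
	sketch_clear_ctx(0);
	return 0;
}
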
+ * Copies Tx completion queue context from dense structure to HW register space */ -void -ice_debug_cq(struct ice_hw *hw, u32 __maybe_unused mask, void *desc, void *buf, - u16 buf_len) +static enum ice_status +ice_copy_tx_cmpltnq_ctx_to_hw(struct ice_hw *hw, u8 *ice_tx_cmpltnq_ctx, + u32 tx_cmpltnq_index) { - struct ice_aq_desc *cq_desc = (struct ice_aq_desc *)desc; - u16 len; - -#ifndef CONFIG_DYNAMIC_DEBUG - if (!(mask & hw->debug_mask)) - return; -#endif + u8 i; - if (!desc) - return; + if (!ice_tx_cmpltnq_ctx) + return ICE_ERR_BAD_PTR; - len = le16_to_cpu(cq_desc->datalen); + if (tx_cmpltnq_index > GLTCLAN_CQ_CNTX0_MAX_INDEX) + return ICE_ERR_PARAM; - ice_debug(hw, mask, - "CQ CMD: opcode 0x%04X, flags 0x%04X, datalen 0x%04X, retval 0x%04X\n", - le16_to_cpu(cq_desc->opcode), - le16_to_cpu(cq_desc->flags), - le16_to_cpu(cq_desc->datalen), le16_to_cpu(cq_desc->retval)); - ice_debug(hw, mask, "\tcookie (h,l) 0x%08X 0x%08X\n", - le32_to_cpu(cq_desc->cookie_high), - le32_to_cpu(cq_desc->cookie_low)); - ice_debug(hw, mask, "\tparam (0,1) 0x%08X 0x%08X\n", - le32_to_cpu(cq_desc->params.generic.param0), - le32_to_cpu(cq_desc->params.generic.param1)); - ice_debug(hw, mask, "\taddr (h,l) 0x%08X 0x%08X\n", - le32_to_cpu(cq_desc->params.generic.addr_high), - le32_to_cpu(cq_desc->params.generic.addr_low)); - if (buf && cq_desc->datalen != 0) { - ice_debug(hw, mask, "Buffer:\n"); - if (buf_len < len) - len = buf_len; + /* Copy each dword separately to HW */ + for (i = 0; i < ICE_TX_CMPLTNQ_CTX_SIZE_DWORDS; i++) { + wr32(hw, GLTCLAN_CQ_CNTX(i, tx_cmpltnq_index), + *((u32 *)(ice_tx_cmpltnq_ctx + (i * sizeof(u32))))); - ice_debug_array(hw, mask, 16, 1, (u8 *)buf, len); + ice_debug(hw, ICE_DBG_QCTX, "cmpltnqdata[%d]: %08X\n", i, + *((u32 *)(ice_tx_cmpltnq_ctx + (i * sizeof(u32))))); } + + return 0; } -/* FW Admin Queue command wrappers */ +/* LAN Tx Completion Queue Context */ +static const struct ice_ctx_ele ice_tx_cmpltnq_ctx_info[] = { + /* Field Width LSB */ + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, base, 57, 0), + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, q_len, 18, 64), + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, generation, 1, 96), + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, wrt_ptr, 22, 97), + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, pf_num, 3, 128), + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, vmvf_num, 10, 131), + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, vmvf_type, 2, 141), + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, tph_desc_wr, 1, 160), + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, cpuid, 8, 161), + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, cmpltn_cache, 512, 192), + { 0 } +}; -/* Software lock/mutex that is meant to be held while the Global Config Lock - * in firmware is acquired by the software to prevent most (but not all) types - * of AQ commands from being sent to FW +/** + * ice_write_tx_cmpltnq_ctx + * @hw: pointer to the hardware structure + * @tx_cmpltnq_ctx: pointer to the completion queue context + * @tx_cmpltnq_index: the index of the completion queue + * + * Converts completion queue context from sparse to dense structure and then + * writes it to HW register space */ -DEFINE_MUTEX(ice_global_cfg_lock_sw); +enum ice_status +ice_write_tx_cmpltnq_ctx(struct ice_hw *hw, + struct ice_tx_cmpltnq_ctx *tx_cmpltnq_ctx, + u32 tx_cmpltnq_index) +{ + u8 ctx_buf[ICE_TX_CMPLTNQ_CTX_SIZE_DWORDS * sizeof(u32)] = { 0 }; + + ice_set_ctx(hw, (u8 *)tx_cmpltnq_ctx, ctx_buf, ice_tx_cmpltnq_ctx_info); + return ice_copy_tx_cmpltnq_ctx_to_hw(hw, ctx_buf, tx_cmpltnq_index); +} /** - * ice_aq_send_cmd - send FW Admin Queue command to FW Admin Queue - * @hw: pointer to the HW 
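
/*
 * A minimal sketch of what an ICE_CTX_STORE (field, width, LSB) entry in the
 * tables above means: ice_set_ctx() takes each field of the sparse context
 * struct and packs its low `width` bits into the dense little-endian context
 * image starting at bit `lsb`. The packer below is a simplified stand-in for
 * that logic and handles fields of up to 64 bits only; the example field
 * values are made up.
 */
#include <stdint.h>
#include <stdio.h>

/* Pack the low `width` bits of `val` at bit offset `lsb` of buf (LE bit order). */
static void sketch_pack_field(uint8_t *buf, uint64_t val, unsigned int width,
			      unsigned int lsb)
{
	unsigned int i;

	for (i = 0; i < width; i++) {
		unsigned int bit = lsb + i;

		if ((val >> i) & 1)
			buf[bit / 8] |= (uint8_t)(1u << (bit % 8));
		else
			buf[bit / 8] &= (uint8_t)~(1u << (bit % 8));
	}
}

int main(void)
{
	/* Mirrors the first completion-queue entries above: base is 57 bits at
	 * LSB 0, q_len is 18 bits at LSB 64, generation is 1 bit at LSB 96.
	 */
	uint8_t image[16] = { 0 };

	sketch_pack_field(image, 0x123456789abULL, 57, 0);	/* base */
	sketch_pack_field(image, 0x100, 18, 64);		/* q_len */
	sketch_pack_field(image, 1, 1, 96);			/* generation */

	printf("first bytes: %02x %02x %02x\n", image[0], image[1], image[2]);
	return 0;
}
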
struct - * @desc: descriptor describing the command - * @buf: buffer to use for indirect commands (NULL for direct commands) - * @buf_size: size of buffer for indirect commands (0 for direct commands) - * @cd: pointer to command details structure + * ice_clear_tx_cmpltnq_ctx + * @hw: pointer to the hardware structure + * @tx_cmpltnq_index: the index of the completion queue to clear * - * Helper function to send FW Admin Queue commands to the FW Admin Queue. + * Clears Tx completion queue context in HW register space */ enum ice_status -ice_aq_send_cmd(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf, - u16 buf_size, struct ice_sq_cd *cd) +ice_clear_tx_cmpltnq_ctx(struct ice_hw *hw, u32 tx_cmpltnq_index) { - struct ice_aqc_req_res *cmd = &desc->params.res_owner; - bool lock_acquired = false; - enum ice_status status; + u8 i; - /* When a package download is in process (i.e. when the firmware's + if (tx_cmpltnq_index > GLTCLAN_CQ_CNTX0_MAX_INDEX) + return ICE_ERR_PARAM; + + /* Clear each dword register separately */ + for (i = 0; i < ICE_TX_CMPLTNQ_CTX_SIZE_DWORDS; i++) + wr32(hw, GLTCLAN_CQ_CNTX(i, tx_cmpltnq_index), 0); + + return 0; +} + +/** + * ice_copy_tx_drbell_q_ctx_to_hw + * @hw: pointer to the hardware structure + * @ice_tx_drbell_q_ctx: pointer to the doorbell queue context + * @tx_drbell_q_index: the index of the doorbell queue + * + * Copies doorbell queue context from dense structure to HW register space + */ +static enum ice_status +ice_copy_tx_drbell_q_ctx_to_hw(struct ice_hw *hw, u8 *ice_tx_drbell_q_ctx, + u32 tx_drbell_q_index) +{ + u8 i; + + if (!ice_tx_drbell_q_ctx) + return ICE_ERR_BAD_PTR; + + if (tx_drbell_q_index > QTX_COMM_DBLQ_DBELL_MAX_INDEX) + return ICE_ERR_PARAM; + + /* Copy each dword separately to HW */ + for (i = 0; i < ICE_TX_DRBELL_Q_CTX_SIZE_DWORDS; i++) { + wr32(hw, QTX_COMM_DBLQ_CNTX(i, tx_drbell_q_index), + *((u32 *)(ice_tx_drbell_q_ctx + (i * sizeof(u32))))); + + ice_debug(hw, ICE_DBG_QCTX, "tx_drbell_qdata[%d]: %08X\n", i, + *((u32 *)(ice_tx_drbell_q_ctx + (i * sizeof(u32))))); + } + + return 0; +} + +/* LAN Tx Doorbell Queue Context info */ +static const struct ice_ctx_ele ice_tx_drbell_q_ctx_info[] = { + /* Field Width LSB */ + ICE_CTX_STORE(ice_tx_drbell_q_ctx, base, 57, 0), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, ring_len, 13, 64), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, pf_num, 3, 80), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, vf_num, 8, 84), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, vmvf_type, 2, 94), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, cpuid, 8, 96), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, tph_desc_rd, 1, 104), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, tph_desc_wr, 1, 108), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, db_q_en, 1, 112), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, rd_head, 13, 128), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, rd_tail, 13, 144), + { 0 } +}; + +/** + * ice_write_tx_drbell_q_ctx + * @hw: pointer to the hardware structure + * @tx_drbell_q_ctx: pointer to the doorbell queue context + * @tx_drbell_q_index: the index of the doorbell queue + * + * Converts doorbell queue context from sparse to dense structure and then + * writes it to HW register space + */ +enum ice_status +ice_write_tx_drbell_q_ctx(struct ice_hw *hw, + struct ice_tx_drbell_q_ctx *tx_drbell_q_ctx, + u32 tx_drbell_q_index) +{ + u8 ctx_buf[ICE_TX_DRBELL_Q_CTX_SIZE_DWORDS * sizeof(u32)] = { 0 }; + + ice_set_ctx(hw, (u8 *)tx_drbell_q_ctx, ctx_buf, + ice_tx_drbell_q_ctx_info); + return ice_copy_tx_drbell_q_ctx_to_hw(hw, ctx_buf, tx_drbell_q_index); +} + +/** + * ice_clear_tx_drbell_q_ctx + * @hw: 
pointer to the hardware structure + * @tx_drbell_q_index: the index of the doorbell queue to clear + * + * Clears doorbell queue context in HW register space + */ +enum ice_status +ice_clear_tx_drbell_q_ctx(struct ice_hw *hw, u32 tx_drbell_q_index) +{ + u8 i; + + if (tx_drbell_q_index > QTX_COMM_DBLQ_DBELL_MAX_INDEX) + return ICE_ERR_PARAM; + + /* Clear each dword register separately */ + for (i = 0; i < ICE_TX_DRBELL_Q_CTX_SIZE_DWORDS; i++) + wr32(hw, QTX_COMM_DBLQ_CNTX(i, tx_drbell_q_index), 0); + + return 0; +} + +/* Sideband Queue command wrappers */ + +/** + * ice_get_sbq - returns the right control queue to use for sideband + * @hw: pointer to the hardware structure + */ +static struct ice_ctl_q_info *ice_get_sbq(struct ice_hw *hw) +{ + if (!ice_is_generic_mac(hw)) + return &hw->adminq; + return &hw->sbq; +} + +/** + * ice_sbq_send_cmd - send Sideband Queue command to Sideband Queue + * @hw: pointer to the HW struct + * @desc: descriptor describing the command + * @buf: buffer to use for indirect commands (NULL for direct commands) + * @buf_size: size of buffer for indirect commands (0 for direct commands) + * @cd: pointer to command details structure + */ +static enum ice_status +ice_sbq_send_cmd(struct ice_hw *hw, struct ice_sbq_cmd_desc *desc, + void *buf, u16 buf_size, struct ice_sq_cd *cd) +{ + return ice_sq_send_cmd(hw, ice_get_sbq(hw), (struct ice_aq_desc *)desc, + buf, buf_size, cd); +} + +/** + * ice_sbq_send_cmd_nolock - send Sideband Queue command to Sideband Queue + * but do not lock sq_lock + * @hw: pointer to the HW struct + * @desc: descriptor describing the command + * @buf: buffer to use for indirect commands (NULL for direct commands) + * @buf_size: size of buffer for indirect commands (0 for direct commands) + * @cd: pointer to command details structure + */ +static enum ice_status +ice_sbq_send_cmd_nolock(struct ice_hw *hw, struct ice_sbq_cmd_desc *desc, + void *buf, u16 buf_size, struct ice_sq_cd *cd) +{ + return ice_sq_send_cmd_nolock(hw, ice_get_sbq(hw), + (struct ice_aq_desc *)desc, buf, + buf_size, cd); +} + +/** + * ice_sbq_rw_reg_lp - Fill Sideband Queue command, with lock parameter + * @hw: pointer to the HW struct + * @in: message info to be filled in descriptor + * @lock: true to lock the sq_lock (the usual case); false if the sq_lock has + * already been locked at a higher level + */ +enum ice_status ice_sbq_rw_reg_lp(struct ice_hw *hw, + struct ice_sbq_msg_input *in, bool lock) +{ + struct ice_sbq_cmd_desc desc = {0}; + struct ice_sbq_msg_req msg = {0}; + enum ice_status status; + u16 msg_len; + + msg_len = sizeof(msg); + + msg.dest_dev = in->dest_dev; + msg.opcode = in->opcode; + msg.flags = ICE_SBQ_MSG_FLAGS; + msg.sbe_fbe = ICE_SBQ_MSG_SBE_FBE; + msg.msg_addr_low = cpu_to_le16(in->msg_addr_low); + msg.msg_addr_high = cpu_to_le32(in->msg_addr_high); + + if (in->opcode) + msg.data = cpu_to_le32(in->data); + else + /* data read comes back in completion, so shorten the struct by + * sizeof(msg.data) + */ + msg_len -= sizeof(msg.data); + + desc.flags = cpu_to_le16(ICE_AQ_FLAG_RD); + desc.opcode = cpu_to_le16(ice_sbq_opc_neigh_dev_req); + desc.param0.cmd_len = cpu_to_le16(msg_len); + if (lock) + status = ice_sbq_send_cmd(hw, &desc, &msg, msg_len, NULL); + else + status = ice_sbq_send_cmd_nolock(hw, &desc, &msg, msg_len, + NULL); + if (!status && !in->opcode) + in->data = le32_to_cpu + (((struct ice_sbq_msg_cmpl *)&msg)->data); + return status; +} + +/** + * ice_sbq_rw_reg - Fill Sideband Queue command + * @hw: pointer to the HW struct + * @in: message 
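
/*
 * A minimal sketch of the request sizing done in ice_sbq_rw_reg_lp() above:
 * a write carries its data dword in the request, while a read sends a request
 * shortened by sizeof(data) and picks the value out of the completion. The
 * message layout below is a simplified stand-in for ice_sbq_msg_req/cmpl.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct sketch_sbq_msg {
	uint8_t  dest_dev;
	uint8_t  opcode;	/* 0 == read, non-zero == write */
	uint16_t addr_low;
	uint32_t addr_high;
	uint32_t data;		/* only sent for writes */
};

static size_t sketch_sbq_msg_len(const struct sketch_sbq_msg *msg)
{
	size_t len = sizeof(*msg);

	if (!msg->opcode)
		len -= sizeof(msg->data); /* read: data comes back in the completion */
	return len;
}

int main(void)
{
	struct sketch_sbq_msg rd = { .opcode = 0 };
	struct sketch_sbq_msg wr = { .opcode = 1, .data = 0xdeadbeef };

	printf("read len = %zu, write len = %zu\n",
	       sketch_sbq_msg_len(&rd), sketch_sbq_msg_len(&wr));
	return 0;
}
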
info to be filled in descriptor + */ +enum ice_status ice_sbq_rw_reg(struct ice_hw *hw, struct ice_sbq_msg_input *in) +{ + return ice_sbq_rw_reg_lp(hw, in, true); +} + +/** + * ice_sbq_lock - Lock the sideband queue's sq_lock + * @hw: pointer to the HW struct + */ +void ice_sbq_lock(struct ice_hw *hw) +{ + mutex_lock(&ice_get_sbq(hw)->sq_lock); +} + +/** + * ice_sbq_unlock - Unlock the sideband queue's sq_lock + * @hw: pointer to the HW struct + */ +void ice_sbq_unlock(struct ice_hw *hw) +{ + mutex_unlock(&ice_get_sbq(hw)->sq_lock); +} + +/* FW Admin Queue command wrappers */ + +/* Software lock/mutex that is meant to be held while the Global Config Lock + * in firmware is acquired by the software to prevent most (but not all) types + * of AQ commands from being sent to FW + */ +DEFINE_MUTEX(ice_global_cfg_lock_sw); + +/** + * ice_should_retry_sq_send_cmd + * @opcode: AQ opcode + * + * Decide if we should retry the send command routine for the ATQ, depending + * on the opcode. + */ +static bool ice_should_retry_sq_send_cmd(u16 opcode) +{ + switch (opcode) { + case ice_aqc_opc_dnl_get_status: + case ice_aqc_opc_dnl_run: + case ice_aqc_opc_dnl_call: + case ice_aqc_opc_dnl_read_sto: + case ice_aqc_opc_dnl_write_sto: + case ice_aqc_opc_dnl_set_breakpoints: + case ice_aqc_opc_dnl_read_log: + case ice_aqc_opc_get_link_topo: + case ice_aqc_opc_lldp_stop: + case ice_aqc_opc_lldp_start: + case ice_aqc_opc_lldp_filter_ctrl: + return true; + } + + return false; +} + +/** + * ice_sq_send_cmd_retry - send command to Control Queue (ATQ) + * @hw: pointer to the HW struct + * @cq: pointer to the specific Control queue + * @desc: prefilled descriptor describing the command + * @buf: buffer to use for indirect commands (or NULL for direct commands) + * @buf_size: size of buffer for indirect commands (or 0 for direct commands) + * @cd: pointer to command details structure + * + * Retry sending the FW Admin Queue command, multiple times, to the FW Admin + * Queue if the EBUSY AQ error is returned. + */ +static enum ice_status +ice_sq_send_cmd_retry(struct ice_hw *hw, struct ice_ctl_q_info *cq, + struct ice_aq_desc *desc, void *buf, u16 buf_size, + struct ice_sq_cd *cd) +{ + struct ice_aq_desc desc_cpy; + enum ice_status status; + bool is_cmd_for_retry; + u8 *buf_cpy = NULL; + u8 idx = 0; + u16 opcode; + + opcode = le16_to_cpu(desc->opcode); + is_cmd_for_retry = ice_should_retry_sq_send_cmd(opcode); + memset(&desc_cpy, 0, sizeof(desc_cpy)); + + if (is_cmd_for_retry) { + if (buf) { + buf_cpy = devm_kzalloc(ice_hw_to_dev(hw), buf_size, + GFP_KERNEL); + if (!buf_cpy) + return ICE_ERR_NO_MEMORY; + } + + memcpy(&desc_cpy, desc, sizeof(desc_cpy)); + } + + do { + status = ice_sq_send_cmd(hw, cq, desc, buf, buf_size, cd); + + if (!is_cmd_for_retry || !status || + hw->adminq.sq_last_status != ICE_AQ_RC_EBUSY) + break; + + if (buf_cpy) + memcpy(buf, buf_cpy, buf_size); + + memcpy(desc, &desc_cpy, sizeof(desc_cpy)); + + mdelay(ICE_SQ_SEND_DELAY_TIME_MS); + + } while (++idx < ICE_SQ_SEND_MAX_EXECUTE); + + if (buf_cpy) + devm_kfree(ice_hw_to_dev(hw), buf_cpy); + + return status; +} + +/** + * ice_aq_send_cmd - send FW Admin Queue command to FW Admin Queue + * @hw: pointer to the HW struct + * @desc: descriptor describing the command + * @buf: buffer to use for indirect commands (NULL for direct commands) + * @buf_size: size of buffer for indirect commands (0 for direct commands) + * @cd: pointer to command details structure + * + * Helper function to send FW Admin Queue commands to the FW Admin Queue. 
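
/*
 * A minimal sketch of the retry pattern introduced by ice_sq_send_cmd_retry()
 * above: for opcodes worth retrying, keep pristine copies of the descriptor
 * and the indirect buffer, and when the queue answers "busy", restore both
 * copies, wait, and resend, up to a fixed number of attempts. The descriptor
 * layout, error code and limits below are illustrative stand-ins, and
 * fake_send() simulates a queue that is busy for the first two attempts.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SKETCH_EBUSY		(-16)
#define SKETCH_MAX_EXECUTE	5

struct sketch_desc { uint16_t opcode; uint32_t param[4]; };

static int fake_send(struct sketch_desc *desc, void *buf, size_t buf_size)
{
	static int calls;

	(void)desc; (void)buf; (void)buf_size;
	return ++calls <= 2 ? SKETCH_EBUSY : 0;
}

static int sketch_send_retry(struct sketch_desc *desc, void *buf,
			     size_t buf_size, bool retryable)
{
	struct sketch_desc desc_cpy = *desc;
	uint8_t *buf_cpy = NULL;
	int status, attempt = 0;

	if (retryable && buf) {
		buf_cpy = malloc(buf_size);
		if (!buf_cpy)
			return -1;
		memcpy(buf_cpy, buf, buf_size);
	}

	do {
		status = fake_send(desc, buf, buf_size);
		if (!retryable || status != SKETCH_EBUSY)
			break;

		/* The send may have scribbled on desc/buf: restore and retry. */
		if (buf_cpy)
			memcpy(buf, buf_cpy, buf_size);
		*desc = desc_cpy;
		/* the driver delays ICE_SQ_SEND_DELAY_TIME_MS here */
	} while (++attempt < SKETCH_MAX_EXECUTE);

	free(buf_cpy);
	return status;
}

int main(void)
{
	struct sketch_desc desc = { .opcode = 0x0609 };
	uint32_t payload[4] = { 0 };

	printf("status = %d\n",
	       sketch_send_retry(&desc, payload, sizeof(payload), true));
	return 0;
}
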
+ */ +enum ice_status +ice_aq_send_cmd(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf, + u16 buf_size, struct ice_sq_cd *cd) +{ + struct ice_aqc_req_res *cmd = &desc->params.res_owner; + bool lock_acquired = false; + enum ice_status status; + + /* When a package download is in process (i.e. when the firmware's * Global Configuration Lock resource is held), only the Download - * Package, Get Version, Get Package Info List and Release Resource - * (with resource ID set to Global Config Lock) AdminQ commands are - * allowed; all others must block until the package download completes - * and the Global Config Lock is released. See also - * ice_acquire_global_cfg_lock(). + * Package, Get Version, Get Package Info List, Upload Section, + * Update Package, Set Port Parameters, Get/Set VLAN Mode Parameters, + * Add Recipe, Set Recipes to Profile Association, Get Recipe, and Get + * Recipes to Profile Association, and Release Resource (with resource + * ID set to Global Config Lock) AdminQ commands are allowed; all others + * must block until the package download completes and the Global Config + * Lock is released. See also ice_acquire_global_cfg_lock(). */ switch (le16_to_cpu(desc->opcode)) { case ice_aqc_opc_download_pkg: case ice_aqc_opc_get_pkg_info_list: case ice_aqc_opc_get_ver: + case ice_aqc_opc_upload_section: + case ice_aqc_opc_update_pkg: + case ice_aqc_opc_set_port_params: + case ice_aqc_opc_get_vlan_mode_parameters: + case ice_aqc_opc_set_vlan_mode_parameters: + case ice_aqc_opc_add_recipe: + case ice_aqc_opc_recipe_to_profile: + case ice_aqc_opc_get_recipe: + case ice_aqc_opc_get_recipe_to_profile: break; case ice_aqc_opc_release_res: if (le16_to_cpu(cmd->res_id) == ICE_AQC_RES_ID_GLBL_LOCK) @@ -1294,7 +1675,7 @@ ice_aq_send_cmd(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf, break; } - status = ice_sq_send_cmd(hw, &hw->adminq, desc, buf, buf_size, cd); + status = ice_sq_send_cmd_retry(hw, &hw->adminq, desc, buf, buf_size, cd); if (lock_acquired) mutex_unlock(&ice_global_cfg_lock_sw); @@ -1536,13 +1917,12 @@ ice_acquire_res(struct ice_hw *hw, enum ice_aq_res_ids res, goto ice_acquire_res_exit; if (status) - ice_debug(hw, ICE_DBG_RES, - "resource %d acquire type %d failed.\n", res, access); + ice_debug(hw, ICE_DBG_RES, "resource %d acquire type %d failed.\n", res, access); /* If necessary, poll until the current lock owner timeouts */ timeout = time_left; while (status && timeout && time_left) { - mdelay(delay); + msleep(delay); timeout = (timeout > delay) ? 
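
/*
 * A minimal sketch of the software gate described in the ice_aq_send_cmd()
 * comment above: while the firmware's Global Config Lock is held for a package
 * download, only a short allow-list of opcodes may be sent directly, and every
 * other command first takes a driver-side mutex so it blocks until the
 * download finishes. The opcode values, the pthread mutex and the stubbed-out
 * send are illustrative stand-ins.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t sketch_global_cfg_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in opcodes; the values are not the real AQ opcodes. */
enum { SK_OPC_DOWNLOAD_PKG = 1, SK_OPC_GET_VER, SK_OPC_ADD_RECIPE, SK_OPC_ADD_VSI };

static bool sketch_allowed_during_pkg_download(uint16_t opcode)
{
	switch (opcode) {
	case SK_OPC_DOWNLOAD_PKG:
	case SK_OPC_GET_VER:
	case SK_OPC_ADD_RECIPE:
		return true;
	default:
		return false;
	}
}

static int sketch_send(uint16_t opcode)
{
	bool locked = false;

	if (!sketch_allowed_during_pkg_download(opcode)) {
		pthread_mutex_lock(&sketch_global_cfg_lock);
		locked = true;
	}
	/* the driver calls ice_sq_send_cmd_retry() at this point */
	printf("sent opcode %d\n", (int)opcode);
	if (locked)
		pthread_mutex_unlock(&sketch_global_cfg_lock);
	return 0;
}

int main(void)
{
	sketch_send(SK_OPC_GET_VER);	/* allowed: no gating */
	sketch_send(SK_OPC_ADD_VSI);	/* gated behind the sw mutex */
	return 0;
}
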
timeout - delay : 0; status = ice_aq_req_res(hw, res, access, 0, &time_left, NULL); @@ -1560,11 +1940,9 @@ ice_acquire_res(struct ice_hw *hw, enum ice_aq_res_ids res, ice_acquire_res_exit: if (status == ICE_ERR_AQ_NO_WORK) { if (access == ICE_RES_WRITE) - ice_debug(hw, ICE_DBG_RES, - "resource indicates no work to do.\n"); + ice_debug(hw, ICE_DBG_RES, "resource indicates no work to do.\n"); else - ice_debug(hw, ICE_DBG_RES, - "Warning: ICE_ERR_AQ_NO_WORK not expected\n"); + ice_debug(hw, ICE_DBG_RES, "Warning: ICE_ERR_AQ_NO_WORK not expected\n"); } return status; } @@ -1588,1977 +1966,4503 @@ void ice_release_res(struct ice_hw *hw, enum ice_aq_res_ids res) */ while ((status == ICE_ERR_AQ_TIMEOUT) && (total_delay < hw->adminq.sq_cmd_timeout)) { - mdelay(1); + msleep(1); status = ice_aq_release_res(hw, res, 0, NULL); total_delay++; } } /** - * ice_get_num_per_func - determine number of resources per PF - * @hw: pointer to the HW structure - * @max: value to be evenly split between each PF + * ice_aq_alloc_free_res - command to allocate/free resources + * @hw: pointer to the HW struct + * @num_entries: number of resource entries in buffer + * @buf: Indirect buffer to hold data parameters and response + * @buf_size: size of buffer for indirect commands + * @opc: pass in the command opcode + * @cd: pointer to command details structure or NULL * - * Determine the number of valid functions by going through the bitmap returned - * from parsing capabilities and use this to calculate the number of resources - * per PF based on the max value passed in. + * Helper function to allocate/free resources using the admin queue commands */ -static u32 ice_get_num_per_func(struct ice_hw *hw, u32 max) +enum ice_status +ice_aq_alloc_free_res(struct ice_hw *hw, u16 num_entries, + struct ice_aqc_alloc_free_res_elem *buf, u16 buf_size, + enum ice_adminq_opc opc, struct ice_sq_cd *cd) { - u8 funcs; + struct ice_aqc_alloc_free_res_cmd *cmd; + struct ice_aq_desc desc; -#define ICE_CAPS_VALID_FUNCS_M 0xFF - funcs = hweight8(hw->dev_caps.common_cap.valid_functions & - ICE_CAPS_VALID_FUNCS_M); + cmd = &desc.params.sw_res_ctrl; - if (!funcs) - return 0; + if (!buf) + return ICE_ERR_PARAM; - return max / funcs; + if (buf_size < flex_array_size(buf, elem, num_entries)) + return ICE_ERR_PARAM; + + ice_fill_dflt_direct_cmd_desc(&desc, opc); + + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + cmd->num_entries = cpu_to_le16(num_entries); + + return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); } /** - * ice_parse_caps - parse function/device capabilities + * ice_alloc_hw_res - allocate resource * @hw: pointer to the HW struct - * @buf: pointer to a buffer containing function/device capability records - * @cap_count: number of capability records in the list - * @opc: type of capabilities list to parse - * - * Helper function to parse function(0x000a)/device(0x000b) capabilities list. 
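
/*
 * A minimal sketch of the buffer sizing used by the new resource helpers
 * above: the AQ buffer is a fixed header followed by a flexible array of
 * elements, so its size is header plus n * element (what struct_size()
 * computes), and a caller-supplied buffer smaller than n elements is rejected
 * (what the flex_array_size() check enforces). The types are simplified
 * stand-ins for ice_aqc_alloc_free_res_elem.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct sketch_res_elem { uint16_t e_handle; };

struct sketch_alloc_free_res {
	uint16_t num_elems;
	uint16_t res_type;
	struct sketch_res_elem elem[];	/* flexible array member */
};

/* Roughly what struct_size(buf, elem, n) evaluates to (minus overflow checks). */
static size_t sketch_struct_size(size_t n)
{
	return sizeof(struct sketch_alloc_free_res) +
	       n * sizeof(struct sketch_res_elem);
}

int main(void)
{
	size_t n = 4;
	size_t buf_len = sketch_struct_size(n);
	struct sketch_alloc_free_res *buf = calloc(1, buf_len);

	if (!buf)
		return 1;

	buf->num_elems = (uint16_t)n;
	printf("need %zu bytes for the header plus %zu elements\n", buf_len, n);
	free(buf);
	return 0;
}
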
+ * @type: type of resource + * @num: number of resources to allocate + * @btm: allocate from bottom + * @res: pointer to array that will receive the resources */ -static void -ice_parse_caps(struct ice_hw *hw, void *buf, u32 cap_count, - enum ice_adminq_opc opc) +enum ice_status +ice_alloc_hw_res(struct ice_hw *hw, u16 type, u16 num, bool btm, u16 *res) { - struct ice_aqc_list_caps_elem *cap_resp; - struct ice_hw_func_caps *func_p = NULL; - struct ice_hw_dev_caps *dev_p = NULL; - struct ice_hw_common_caps *caps; - char const *prefix; - u32 i; + struct ice_aqc_alloc_free_res_elem *buf; + enum ice_status status; + u16 buf_len; + buf_len = struct_size(buf, elem, num); + buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); if (!buf) - return; + return ICE_ERR_NO_MEMORY; - cap_resp = (struct ice_aqc_list_caps_elem *)buf; + /* Prepare buffer to allocate resource. */ + buf->num_elems = cpu_to_le16(num); + buf->res_type = cpu_to_le16(type | ICE_AQC_RES_TYPE_FLAG_DEDICATED | + ICE_AQC_RES_TYPE_FLAG_IGNORE_INDEX); + if (btm) + buf->res_type |= cpu_to_le16(ICE_AQC_RES_TYPE_FLAG_SCAN_BOTTOM); - if (opc == ice_aqc_opc_list_dev_caps) { - dev_p = &hw->dev_caps; - caps = &dev_p->common_cap; - prefix = "dev cap"; - } else if (opc == ice_aqc_opc_list_func_caps) { - func_p = &hw->func_caps; - caps = &func_p->common_cap; - prefix = "func cap"; - } else { - ice_debug(hw, ICE_DBG_INIT, "wrong opcode\n"); - return; - } + status = ice_aq_alloc_free_res(hw, 1, buf, buf_len, + ice_aqc_opc_alloc_res, NULL); + if (status) + goto ice_alloc_res_exit; - for (i = 0; caps && i < cap_count; i++, cap_resp++) { - u32 logical_id = le32_to_cpu(cap_resp->logical_id); - u32 phys_id = le32_to_cpu(cap_resp->phys_id); - u32 number = le32_to_cpu(cap_resp->number); - u16 cap = le16_to_cpu(cap_resp->cap); + memcpy(res, buf->elem, sizeof(*buf->elem) * num); - switch (cap) { - case ICE_AQC_CAPS_VALID_FUNCTIONS: - caps->valid_functions = number; - ice_debug(hw, ICE_DBG_INIT, - "%s: valid_functions (bitmap) = %d\n", prefix, - caps->valid_functions); - break; - case ICE_AQC_CAPS_SRIOV: - caps->sr_iov_1_1 = (number == 1); - ice_debug(hw, ICE_DBG_INIT, - "%s: sr_iov_1_1 = %d\n", prefix, - caps->sr_iov_1_1); - break; - case ICE_AQC_CAPS_VF: - if (dev_p) { - dev_p->num_vfs_exposed = number; - ice_debug(hw, ICE_DBG_INIT, - "%s: num_vfs_exposed = %d\n", prefix, - dev_p->num_vfs_exposed); - } else if (func_p) { - func_p->num_allocd_vfs = number; - func_p->vf_base_id = logical_id; - ice_debug(hw, ICE_DBG_INIT, - "%s: num_allocd_vfs = %d\n", prefix, - func_p->num_allocd_vfs); - ice_debug(hw, ICE_DBG_INIT, - "%s: vf_base_id = %d\n", prefix, - func_p->vf_base_id); - } - break; - case ICE_AQC_CAPS_VSI: - if (dev_p) { - dev_p->num_vsi_allocd_to_host = number; - ice_debug(hw, ICE_DBG_INIT, - "%s: num_vsi_allocd_to_host = %d\n", - prefix, - dev_p->num_vsi_allocd_to_host); - } else if (func_p) { - func_p->guar_num_vsi = - ice_get_num_per_func(hw, ICE_MAX_VSI); - ice_debug(hw, ICE_DBG_INIT, - "%s: guar_num_vsi (fw) = %d\n", - prefix, number); - ice_debug(hw, ICE_DBG_INIT, - "%s: guar_num_vsi = %d\n", - prefix, func_p->guar_num_vsi); - } - break; - case ICE_AQC_CAPS_DCB: - caps->dcb = (number == 1); - caps->active_tc_bitmap = logical_id; - caps->maxtc = phys_id; - ice_debug(hw, ICE_DBG_INIT, - "%s: dcb = %d\n", prefix, caps->dcb); - ice_debug(hw, ICE_DBG_INIT, - "%s: active_tc_bitmap = %d\n", prefix, - caps->active_tc_bitmap); - ice_debug(hw, ICE_DBG_INIT, - "%s: maxtc = %d\n", prefix, caps->maxtc); - break; - case ICE_AQC_CAPS_RSS: - 
caps->rss_table_size = number; - caps->rss_table_entry_width = logical_id; - ice_debug(hw, ICE_DBG_INIT, - "%s: rss_table_size = %d\n", prefix, - caps->rss_table_size); - ice_debug(hw, ICE_DBG_INIT, - "%s: rss_table_entry_width = %d\n", prefix, - caps->rss_table_entry_width); - break; - case ICE_AQC_CAPS_RXQS: - caps->num_rxq = number; - caps->rxq_first_id = phys_id; - ice_debug(hw, ICE_DBG_INIT, - "%s: num_rxq = %d\n", prefix, - caps->num_rxq); - ice_debug(hw, ICE_DBG_INIT, - "%s: rxq_first_id = %d\n", prefix, - caps->rxq_first_id); - break; - case ICE_AQC_CAPS_TXQS: - caps->num_txq = number; - caps->txq_first_id = phys_id; - ice_debug(hw, ICE_DBG_INIT, - "%s: num_txq = %d\n", prefix, - caps->num_txq); - ice_debug(hw, ICE_DBG_INIT, - "%s: txq_first_id = %d\n", prefix, - caps->txq_first_id); - break; - case ICE_AQC_CAPS_MSIX: - caps->num_msix_vectors = number; - caps->msix_vector_first_id = phys_id; - ice_debug(hw, ICE_DBG_INIT, - "%s: num_msix_vectors = %d\n", prefix, - caps->num_msix_vectors); - ice_debug(hw, ICE_DBG_INIT, - "%s: msix_vector_first_id = %d\n", prefix, - caps->msix_vector_first_id); - break; - case ICE_AQC_CAPS_MAX_MTU: - caps->max_mtu = number; - ice_debug(hw, ICE_DBG_INIT, "%s: max_mtu = %d\n", - prefix, caps->max_mtu); - break; - default: - ice_debug(hw, ICE_DBG_INIT, - "%s: unknown capability[%d]: 0x%x\n", prefix, - i, cap); - break; - } - } +ice_alloc_res_exit: + devm_kfree(ice_hw_to_dev(hw), buf); + return status; } /** - * ice_aq_discover_caps - query function/device capabilities + * ice_free_hw_res - free allocated HW resource * @hw: pointer to the HW struct - * @buf: a virtual buffer to hold the capabilities - * @buf_size: Size of the virtual buffer - * @cap_count: cap count needed if AQ err==ENOMEM - * @opc: capabilities type to discover - pass in the command opcode - * @cd: pointer to command details structure or NULL - * - * Get the function(0x000a)/device(0x000b) capabilities description from - * the firmware. + * @type: type of resource to free + * @num: number of resources + * @res: pointer to array that contains the resources to free */ -static enum ice_status -ice_aq_discover_caps(struct ice_hw *hw, void *buf, u16 buf_size, u32 *cap_count, - enum ice_adminq_opc opc, struct ice_sq_cd *cd) +enum ice_status ice_free_hw_res(struct ice_hw *hw, u16 type, u16 num, u16 *res) { - struct ice_aqc_list_caps *cmd; - struct ice_aq_desc desc; + struct ice_aqc_alloc_free_res_elem *buf; enum ice_status status; + u16 buf_len; - cmd = &desc.params.get_cap; + buf_len = struct_size(buf, elem, num); + buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!buf) + return ICE_ERR_NO_MEMORY; - if (opc != ice_aqc_opc_list_func_caps && - opc != ice_aqc_opc_list_dev_caps) - return ICE_ERR_PARAM; + /* Prepare buffer to free resource. 
*/ + buf->num_elems = cpu_to_le16(num); + buf->res_type = cpu_to_le16(type); + memcpy(buf->elem, res, sizeof(*buf->elem) * num); - ice_fill_dflt_direct_cmd_desc(&desc, opc); + status = ice_aq_alloc_free_res(hw, num, buf, buf_len, + ice_aqc_opc_free_res, NULL); + if (status) + ice_debug(hw, ICE_DBG_SW, "CQ CMD Buffer:\n"); - status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); - if (!status) - ice_parse_caps(hw, buf, le32_to_cpu(cmd->count), opc); - else if (hw->adminq.sq_last_status == ICE_AQ_RC_ENOMEM) - *cap_count = le32_to_cpu(cmd->count); + devm_kfree(ice_hw_to_dev(hw), buf); return status; } /** - * ice_discover_caps - get info about the HW - * @hw: pointer to the hardware structure - * @opc: capabilities type to discover - pass in the command opcode - */ -static enum ice_status -ice_discover_caps(struct ice_hw *hw, enum ice_adminq_opc opc) -{ - enum ice_status status; - u32 cap_count; - u16 cbuf_len; - u8 retries; - - /* The driver doesn't know how many capabilities the device will return - * so the buffer size required isn't known ahead of time. The driver - * starts with cbuf_len and if this turns out to be insufficient, the - * device returns ICE_AQ_RC_ENOMEM and also the cap_count it needs. - * The driver then allocates the buffer based on the count and retries - * the operation. So it follows that the retry count is 2. - */ -#define ICE_GET_CAP_BUF_COUNT 40 -#define ICE_GET_CAP_RETRY_COUNT 2 - - cap_count = ICE_GET_CAP_BUF_COUNT; - retries = ICE_GET_CAP_RETRY_COUNT; + * ice_get_num_per_func - determine number of resources per PF + * @hw: pointer to the HW structure + * @max: value to be evenly split between each PF + * + * Determine the number of valid functions by going through the bitmap returned + * from parsing capabilities and use this to calculate the number of resources + * per PF based on the max value passed in. 
+ */ +static u32 ice_get_num_per_func(struct ice_hw *hw, u32 max) +{ + u8 funcs; - do { - void *cbuf; +#define ICE_CAPS_VALID_FUNCS_M 0xFF + funcs = hweight8(hw->dev_caps.common_cap.valid_functions & ICE_CAPS_VALID_FUNCS_M); - cbuf_len = (u16)(cap_count * - sizeof(struct ice_aqc_list_caps_elem)); - cbuf = devm_kzalloc(ice_hw_to_dev(hw), cbuf_len, GFP_KERNEL); - if (!cbuf) - return ICE_ERR_NO_MEMORY; + if (!funcs) + return 0; - status = ice_aq_discover_caps(hw, cbuf, cbuf_len, &cap_count, - opc, NULL); - devm_kfree(ice_hw_to_dev(hw), cbuf); + return max / funcs; +} - if (!status || hw->adminq.sq_last_status != ICE_AQ_RC_ENOMEM) - break; +/** + * ice_print_led_caps - print LED capabilities + * @hw: pointer to the ice_hw instance + * @caps: pointer to common caps instance + * @prefix: string to prefix when printing + * @dbg: set to indicate debug print + */ +static void +ice_print_led_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps, + char const *prefix, bool dbg) +{ + u8 i; - /* If ENOMEM is returned, try again with bigger buffer */ - } while (--retries); + if (dbg) + ice_debug(hw, ICE_DBG_INIT, "%s: led_pin_num = %d\n", prefix, + caps->led_pin_num); + else + dev_info(ice_hw_to_dev(hw), "%s: led_pin_num = %d\n", prefix, + caps->led_pin_num); - return status; + for (i = 0; i < ICE_MAX_SUPPORTED_GPIO_LED; i++) { + if (!caps->led[i]) + continue; + + if (dbg) + ice_debug(hw, ICE_DBG_INIT, "%s: led[%d] = %d\n", + prefix, i, caps->led[i]); + else + dev_info(ice_hw_to_dev(hw), "%s: led[%d] = %d\n", + prefix, i, caps->led[i]); + } } /** - * ice_set_safe_mode_caps - Override dev/func capabilities when in safe mode - * @hw: pointer to the hardware structure + * ice_print_sdp_caps - print SDP capabilities + * @hw: pointer to the ice_hw instance + * @caps: pointer to common caps instance + * @prefix: string to prefix when printing + * @dbg: set to indicate debug print */ -void ice_set_safe_mode_caps(struct ice_hw *hw) +static void +ice_print_sdp_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps, + char const *prefix, bool dbg) { - struct ice_hw_func_caps *func_caps = &hw->func_caps; - struct ice_hw_dev_caps *dev_caps = &hw->dev_caps; - u32 valid_func, rxq_first_id, txq_first_id; - u32 msix_vector_first_id, max_mtu; - u32 num_func = 0; u8 i; - /* cache some func_caps values that should be restored after memset */ - valid_func = func_caps->common_cap.valid_functions; - txq_first_id = func_caps->common_cap.txq_first_id; - rxq_first_id = func_caps->common_cap.rxq_first_id; - msix_vector_first_id = func_caps->common_cap.msix_vector_first_id; - max_mtu = func_caps->common_cap.max_mtu; + if (dbg) + ice_debug(hw, ICE_DBG_INIT, "%s: sdp_pin_num = %d\n", prefix, + caps->sdp_pin_num); + else + dev_info(ice_hw_to_dev(hw), "%s: sdp_pin_num = %d\n", prefix, + caps->sdp_pin_num); - /* unset func capabilities */ - memset(func_caps, 0, sizeof(*func_caps)); + for (i = 0; i < ICE_MAX_SUPPORTED_GPIO_SDP; i++) { + if (!caps->sdp[i]) + continue; - /* restore cached values */ - func_caps->common_cap.valid_functions = valid_func; - func_caps->common_cap.txq_first_id = txq_first_id; - func_caps->common_cap.rxq_first_id = rxq_first_id; - func_caps->common_cap.msix_vector_first_id = msix_vector_first_id; - func_caps->common_cap.max_mtu = max_mtu; + if (dbg) + ice_debug(hw, ICE_DBG_INIT, "%s: sdp[%d] = %d\n", + prefix, i, caps->sdp[i]); + else + dev_info(ice_hw_to_dev(hw), "%s: sdp[%d] = %d\n", + prefix, i, caps->sdp[i]); + } +} - /* one Tx and one Rx queue in safe mode */ - func_caps->common_cap.num_rxq = 1; - 
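
/*
 * A minimal sketch of ice_get_num_per_func() above: count the set bits in the
 * low byte of the "valid functions" bitmap and split a device-wide maximum
 * evenly across that many PFs. __builtin_popcount stands in for hweight8, and
 * the example numbers are only illustrative.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t sketch_num_per_func(uint8_t valid_functions, uint32_t max)
{
	unsigned int funcs = (unsigned int)__builtin_popcount(valid_functions);

	return funcs ? max / funcs : 0;
}

int main(void)
{
	/* e.g. bitmap 0x0f -> 4 PFs, so a pool of 768 splits as 192 per PF */
	printf("%u per PF\n", sketch_num_per_func(0x0f, 768));
	return 0;
}
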
func_caps->common_cap.num_txq = 1; +/** + * ice_parse_common_caps - parse common device/function capabilities + * @hw: pointer to the HW struct + * @caps: pointer to common capabilities structure + * @elem: the capability element to parse + * @prefix: message prefix for tracing capabilities + * + * Given a capability element, extract relevant details into the common + * capability structure. + * + * Returns: true if the capability matches one of the common capability ids, + * false otherwise. + */ +static bool +ice_parse_common_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps, + struct ice_aqc_list_caps_elem *elem, const char *prefix) +{ + u32 logical_id = le32_to_cpu(elem->logical_id); + u32 phys_id = le32_to_cpu(elem->phys_id); + u32 number = le32_to_cpu(elem->number); + u16 cap = le16_to_cpu(elem->cap); + bool found = true; + + switch (cap) { + case ICE_AQC_CAPS_SWITCHING_MODE: + caps->switching_mode = number; + ice_debug(hw, ICE_DBG_INIT, "%s: switching_mode = %d\n", prefix, + caps->switching_mode); + break; + case ICE_AQC_CAPS_MANAGEABILITY_MODE: + caps->mgmt_mode = number; + caps->mgmt_protocols_mctp = logical_id; + ice_debug(hw, ICE_DBG_INIT, "%s: mgmt_mode = %d\n", prefix, + caps->mgmt_mode); + ice_debug(hw, ICE_DBG_INIT, "%s: mgmt_protocols_mctp = %d\n", prefix, + caps->mgmt_protocols_mctp); + break; + case ICE_AQC_CAPS_OS2BMC: + caps->os2bmc = number; + ice_debug(hw, ICE_DBG_INIT, "%s: os2bmc = %d\n", prefix, caps->os2bmc); + break; + case ICE_AQC_CAPS_VALID_FUNCTIONS: + caps->valid_functions = number; + ice_debug(hw, ICE_DBG_INIT, "%s: valid_functions (bitmap) = %d\n", prefix, + caps->valid_functions); + break; + case ICE_AQC_CAPS_SRIOV: + caps->sr_iov_1_1 = (number == 1); + ice_debug(hw, ICE_DBG_INIT, "%s: sr_iov_1_1 = %d\n", prefix, + caps->sr_iov_1_1); + break; + case ICE_AQC_CAPS_VMDQ: + caps->vmdq = (number == 1); + ice_debug(hw, ICE_DBG_INIT, "%s: vmdq = %d\n", prefix, caps->vmdq); + break; + case ICE_AQC_CAPS_802_1QBG: + caps->evb_802_1_qbg = (number == 1); + ice_debug(hw, ICE_DBG_INIT, "%s: evb_802_1_qbg = %d\n", prefix, number); + break; + case ICE_AQC_CAPS_802_1BR: + caps->evb_802_1_qbh = (number == 1); + ice_debug(hw, ICE_DBG_INIT, "%s: evb_802_1_qbh = %d\n", prefix, number); + break; + case ICE_AQC_CAPS_DCB: + caps->dcb = (number == 1); + caps->active_tc_bitmap = logical_id; + caps->maxtc = phys_id; + ice_debug(hw, ICE_DBG_INIT, "%s: dcb = %d\n", prefix, caps->dcb); + ice_debug(hw, ICE_DBG_INIT, "%s: active_tc_bitmap = %d\n", prefix, + caps->active_tc_bitmap); + ice_debug(hw, ICE_DBG_INIT, "%s: maxtc = %d\n", prefix, caps->maxtc); + break; + case ICE_AQC_CAPS_ISCSI: + caps->iscsi = (number == 1); + ice_debug(hw, ICE_DBG_INIT, "%s: iscsi = %d\n", prefix, caps->iscsi); + break; + case ICE_AQC_CAPS_RSS: + caps->rss_table_size = number; + caps->rss_table_entry_width = logical_id; + ice_debug(hw, ICE_DBG_INIT, "%s: rss_table_size = %d\n", prefix, + caps->rss_table_size); + ice_debug(hw, ICE_DBG_INIT, "%s: rss_table_entry_width = %d\n", prefix, + caps->rss_table_entry_width); + break; + case ICE_AQC_CAPS_RXQS: + caps->num_rxq = number; + caps->rxq_first_id = phys_id; + ice_debug(hw, ICE_DBG_INIT, "%s: num_rxq = %d\n", prefix, + caps->num_rxq); + ice_debug(hw, ICE_DBG_INIT, "%s: rxq_first_id = %d\n", prefix, + caps->rxq_first_id); + break; + case ICE_AQC_CAPS_TXQS: + caps->num_txq = number; + caps->txq_first_id = phys_id; + ice_debug(hw, ICE_DBG_INIT, "%s: num_txq = %d\n", prefix, + caps->num_txq); + ice_debug(hw, ICE_DBG_INIT, "%s: txq_first_id = %d\n", prefix, + 
caps->txq_first_id); + break; + case ICE_AQC_CAPS_MSIX: + caps->num_msix_vectors = number; + caps->msix_vector_first_id = phys_id; + ice_debug(hw, ICE_DBG_INIT, "%s: num_msix_vectors = %d\n", prefix, + caps->num_msix_vectors); + ice_debug(hw, ICE_DBG_INIT, "%s: msix_vector_first_id = %d\n", prefix, + caps->msix_vector_first_id); + break; + case ICE_AQC_CAPS_NVM_VER: + break; + case ICE_AQC_CAPS_PENDING_NVM_VER: + caps->nvm_update_pending_nvm = true; + ice_debug(hw, ICE_DBG_INIT, "%s: update_pending_nvm\n", prefix); + break; + case ICE_AQC_CAPS_PENDING_OROM_VER: + caps->nvm_update_pending_orom = true; + ice_debug(hw, ICE_DBG_INIT, "%s: update_pending_orom\n", prefix); + break; + case ICE_AQC_CAPS_PENDING_NET_VER: + caps->nvm_update_pending_netlist = true; + ice_debug(hw, ICE_DBG_INIT, "%s: update_pending_netlist\n", prefix); + break; + case ICE_AQC_CAPS_NVM_MGMT: + caps->sec_rev_disabled = + (number & ICE_NVM_MGMT_SEC_REV_DISABLED) ? + true : false; + ice_debug(hw, ICE_DBG_INIT, "%s: sec_rev_disabled = %d\n", prefix, + caps->sec_rev_disabled); + caps->update_disabled = + (number & ICE_NVM_MGMT_UPDATE_DISABLED) ? + true : false; + ice_debug(hw, ICE_DBG_INIT, "%s: update_disabled = %d\n", prefix, + caps->update_disabled); + caps->nvm_unified_update = + (number & ICE_NVM_MGMT_UNIFIED_UPD_SUPPORT) ? + true : false; + ice_debug(hw, ICE_DBG_INIT, "%s: nvm_unified_update = %d\n", prefix, + caps->nvm_unified_update); + break; + case ICE_AQC_CAPS_CEM: + caps->mgmt_cem = (number == 1); + ice_debug(hw, ICE_DBG_INIT, "%s: mgmt_cem = %d\n", prefix, + caps->mgmt_cem); + break; + case ICE_AQC_CAPS_IWARP: + caps->iwarp = (number == 1); + ice_debug(hw, ICE_DBG_INIT, "%s: iwarp = %d\n", prefix, caps->iwarp); + break; + case ICE_AQC_CAPS_LED: + if (phys_id < ICE_MAX_SUPPORTED_GPIO_LED) { + caps->led[phys_id] = true; + caps->led_pin_num++; + ice_debug(hw, ICE_DBG_INIT, "%s: led[%d] = 1\n", prefix, phys_id); + } + break; + case ICE_AQC_CAPS_SDP: + if (phys_id < ICE_MAX_SUPPORTED_GPIO_SDP) { + caps->sdp[phys_id] = true; + caps->sdp_pin_num++; + ice_debug(hw, ICE_DBG_INIT, "%s: sdp[%d] = 1\n", prefix, phys_id); + } + break; + case ICE_AQC_CAPS_WR_CSR_PROT: + caps->wr_csr_prot = number; + caps->wr_csr_prot |= (u64)logical_id << 32; + ice_debug(hw, ICE_DBG_INIT, "%s: wr_csr_prot = 0x%llX\n", prefix, + (unsigned long long)caps->wr_csr_prot); + break; + case ICE_AQC_CAPS_WOL_PROXY: + caps->num_wol_proxy_fltr = number; + caps->wol_proxy_vsi_seid = logical_id; + caps->apm_wol_support = !!(phys_id & ICE_WOL_SUPPORT_M); + caps->acpi_prog_mthd = !!(phys_id & + ICE_ACPI_PROG_MTHD_M); + caps->proxy_support = !!(phys_id & ICE_PROXY_SUPPORT_M); + ice_debug(hw, ICE_DBG_INIT, "%s: num_wol_proxy_fltr = %d\n", prefix, + caps->num_wol_proxy_fltr); + ice_debug(hw, ICE_DBG_INIT, "%s: wol_proxy_vsi_seid = %d\n", prefix, + caps->wol_proxy_vsi_seid); + break; + case ICE_AQC_CAPS_MAX_MTU: + caps->max_mtu = number; + ice_debug(hw, ICE_DBG_INIT, "%s: max_mtu = %d\n", + prefix, caps->max_mtu); + break; + case ICE_AQC_CAPS_EXT_TOPO_DEV_IMG0: + case ICE_AQC_CAPS_EXT_TOPO_DEV_IMG1: + case ICE_AQC_CAPS_EXT_TOPO_DEV_IMG2: + case ICE_AQC_CAPS_EXT_TOPO_DEV_IMG3: + { + u8 index = cap - ICE_AQC_CAPS_EXT_TOPO_DEV_IMG0; + + if (index >= ICE_EXT_TOPO_DEV_IMG_COUNT) + break; - /* two MSIX vectors, one for traffic and one for misc causes */ - func_caps->common_cap.num_msix_vectors = 2; - func_caps->guar_num_vsi = 1; + caps->ext_topo_dev_img_ver_high[index] = number; + caps->ext_topo_dev_img_ver_low[index] = logical_id; + 
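
/*
 * A minimal sketch of two recurring patterns in the capability switch above:
 * boolean feature flags tested out of the element's "number" word (the
 * NVM_MGMT case) and a 64-bit value assembled from two 32-bit capability
 * words (the WR_CSR_PROT case: low half in "number", high half in
 * "logical_id"). The bit positions here are illustrative stand-ins.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_NVM_SEC_REV_DISABLED	(1u << 0)
#define SKETCH_NVM_UPDATE_DISABLED	(1u << 1)
#define SKETCH_NVM_UNIFIED_UPD_SUPPORT	(1u << 3)

struct sketch_caps {
	bool sec_rev_disabled;
	bool update_disabled;
	bool nvm_unified_update;
	uint64_t wr_csr_prot;
};

static void sketch_parse(struct sketch_caps *caps, uint32_t nvm_mgmt_number,
			 uint32_t csr_number, uint32_t csr_logical_id)
{
	caps->sec_rev_disabled = nvm_mgmt_number & SKETCH_NVM_SEC_REV_DISABLED;
	caps->update_disabled = nvm_mgmt_number & SKETCH_NVM_UPDATE_DISABLED;
	caps->nvm_unified_update = nvm_mgmt_number & SKETCH_NVM_UNIFIED_UPD_SUPPORT;

	caps->wr_csr_prot = csr_number;
	caps->wr_csr_prot |= (uint64_t)csr_logical_id << 32;
}

int main(void)
{
	struct sketch_caps caps = { 0 };

	sketch_parse(&caps, SKETCH_NVM_UNIFIED_UPD_SUPPORT, 0x89abcdef, 0x01234567);
	printf("unified update: %d, wr_csr_prot: 0x%016llx\n",
	       caps.nvm_unified_update, (unsigned long long)caps.wr_csr_prot);
	return 0;
}
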
caps->ext_topo_dev_img_part_num[index] = + (phys_id & ICE_EXT_TOPO_DEV_IMG_PART_NUM_M) >> + ICE_EXT_TOPO_DEV_IMG_PART_NUM_S; + caps->ext_topo_dev_img_load_en[index] = + (phys_id & ICE_EXT_TOPO_DEV_IMG_LOAD_EN) != 0; + caps->ext_topo_dev_img_prog_en[index] = + (phys_id & ICE_EXT_TOPO_DEV_IMG_PROG_EN) != 0; + ice_debug(hw, ICE_DBG_INIT, + "%s: ext_topo_dev_img_ver_high[%d] = %d\n", + prefix, index, + caps->ext_topo_dev_img_ver_high[index]); + ice_debug(hw, ICE_DBG_INIT, + "%s: ext_topo_dev_img_ver_low[%d] = %d\n", + prefix, index, + caps->ext_topo_dev_img_ver_low[index]); + ice_debug(hw, ICE_DBG_INIT, + "%s: ext_topo_dev_img_part_num[%d] = %d\n", + prefix, index, + caps->ext_topo_dev_img_part_num[index]); + ice_debug(hw, ICE_DBG_INIT, + "%s: ext_topo_dev_img_load_en[%d] = %d\n", + prefix, index, + caps->ext_topo_dev_img_load_en[index]); + ice_debug(hw, ICE_DBG_INIT, + "%s: ext_topo_dev_img_prog_en[%d] = %d\n", + prefix, index, + caps->ext_topo_dev_img_prog_en[index]); + break; + } + default: + /* Not one of the recognized common capabilities */ + found = false; + } - /* cache some dev_caps values that should be restored after memset */ - valid_func = dev_caps->common_cap.valid_functions; - txq_first_id = dev_caps->common_cap.txq_first_id; - rxq_first_id = dev_caps->common_cap.rxq_first_id; - msix_vector_first_id = dev_caps->common_cap.msix_vector_first_id; - max_mtu = dev_caps->common_cap.max_mtu; + return found; +} - /* unset dev capabilities */ - memset(dev_caps, 0, sizeof(*dev_caps)); +/** + * ice_recalc_port_limited_caps - Recalculate port limited capabilities + * @hw: pointer to the HW structure + * @caps: pointer to capabilities structure to fix + * + * Re-calculate the capabilities that are dependent on the number of physical + * ports; i.e. some features are not supported or function differently on + * devices with more than 4 ports. + */ +static void +ice_recalc_port_limited_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps) +{ + /* This assumes device capabilities are always scanned before function + * capabilities during the initialization flow. + */ + if (hw->dev_caps.num_funcs > 4) { + /* Max 4 TCs per port */ + caps->maxtc = 4; + ice_debug(hw, ICE_DBG_INIT, "reducing maxtc to %d (based on #ports)\n", + caps->maxtc); + if (caps->iwarp) { + ice_debug(hw, ICE_DBG_INIT, "forcing RDMA off\n"); + caps->iwarp = 0; + } - /* restore cached values */ - dev_caps->common_cap.valid_functions = valid_func; - dev_caps->common_cap.txq_first_id = txq_first_id; - dev_caps->common_cap.rxq_first_id = rxq_first_id; - dev_caps->common_cap.msix_vector_first_id = msix_vector_first_id; - dev_caps->common_cap.max_mtu = max_mtu; - - /* valid_func is a bitmap. get number of functions */ -#define ICE_MAX_FUNCS 8 - for (i = 0; i < ICE_MAX_FUNCS; i++) - if (valid_func & BIT(i)) - num_func++; + /* print message only when processing device capabilities + * during initialization. + */ + if (caps == &hw->dev_caps.common_cap) + dev_info(ice_hw_to_dev(hw), + "RDMA functionality is not available with the current device configuration.\n"); + } +} - /* one Tx and one Rx queue per function in safe mode */ - dev_caps->common_cap.num_rxq = num_func; - dev_caps->common_cap.num_txq = num_func; +/** + * ice_parse_vf_func_caps - Parse ICE_AQC_CAPS_VF function caps + * @hw: pointer to the HW struct + * @func_p: pointer to function capabilities structure + * @cap: pointer to the capability element to parse + * + * Extract function capabilities for ICE_AQC_CAPS_VF. 
+ */ +static void +ice_parse_vf_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_p, + struct ice_aqc_list_caps_elem *cap) +{ + u32 number = le32_to_cpu(cap->number); + u32 logical_id = le32_to_cpu(cap->logical_id); + + func_p->num_allocd_vfs = number; + func_p->vf_base_id = logical_id; + ice_debug(hw, ICE_DBG_INIT, "func caps: num_allocd_vfs = %d\n", + func_p->num_allocd_vfs); + ice_debug(hw, ICE_DBG_INIT, "func caps: vf_base_id = %d\n", + func_p->vf_base_id); +} - /* two MSIX vectors per function */ - dev_caps->common_cap.num_msix_vectors = 2 * num_func; +/** + * ice_parse_vsi_func_caps - Parse ICE_AQC_CAPS_VSI function caps + * @hw: pointer to the HW struct + * @func_p: pointer to function capabilities structure + * @cap: pointer to the capability element to parse + * + * Extract function capabilities for ICE_AQC_CAPS_VSI. + */ +static void +ice_parse_vsi_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_p, + struct ice_aqc_list_caps_elem *cap) +{ + func_p->guar_num_vsi = ice_get_num_per_func(hw, ICE_MAX_VSI); + ice_debug(hw, ICE_DBG_INIT, "func caps: guar_num_vsi (fw) = %d\n", + le32_to_cpu(cap->number)); + ice_debug(hw, ICE_DBG_INIT, "func caps: guar_num_vsi = %d\n", + func_p->guar_num_vsi); } /** - * ice_get_caps - get info about the HW - * @hw: pointer to the hardware structure + * ice_parse_1588_func_caps - Parse ICE_AQC_CAPS_1588 function caps + * @hw: pointer to the HW struct + * @func_p: pointer to function capabilities structure + * @cap: pointer to the capability element to parse + * + * Extract function capabilities for ICE_AQC_CAPS_1588. */ -enum ice_status ice_get_caps(struct ice_hw *hw) +static void +ice_parse_1588_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_p, + struct ice_aqc_list_caps_elem *cap) { - enum ice_status status; + struct ice_ts_func_info *info = &func_p->ts_func_info; + u32 number = le32_to_cpu(cap->number); - status = ice_discover_caps(hw, ice_aqc_opc_list_dev_caps); - if (!status) - status = ice_discover_caps(hw, ice_aqc_opc_list_func_caps); + info->ena = ((number & ICE_TS_FUNC_ENA_M) != 0); + func_p->common_cap.ieee_1588 = info->ena; - return status; + info->src_tmr_owned = ((number & ICE_TS_SRC_TMR_OWND_M) != 0); + info->tmr_ena = ((number & ICE_TS_TMR_ENA_M) != 0); + info->tmr_index_owned = ((number & ICE_TS_TMR_IDX_OWND_M) != 0); + info->tmr_index_assoc = ((number & ICE_TS_TMR_IDX_ASSOC_M) != 0); + + info->clk_freq = (number & ICE_TS_CLK_FREQ_M) >> ICE_TS_CLK_FREQ_S; + info->clk_src = ((number & ICE_TS_CLK_SRC_M) != 0); + + if (info->clk_freq < NUM_ICE_TIME_REF_FREQ) { + info->time_ref = (enum ice_time_ref_freq)info->clk_freq; + } else { + /* Unknown clock frequency, so assume a (probably incorrect) + * default to avoid out-of-bounds look ups of frequency + * related information. 
+ */ + ice_debug(hw, ICE_DBG_INIT, "1588 func caps: unknown clock frequency %u\n", + info->clk_freq); + info->time_ref = ICE_TIME_REF_FREQ_25_000; + } + + ice_debug(hw, ICE_DBG_INIT, "func caps: ieee_1588 = %u\n", + func_p->common_cap.ieee_1588); + ice_debug(hw, ICE_DBG_INIT, "func caps: src_tmr_owned = %u\n", + info->src_tmr_owned); + ice_debug(hw, ICE_DBG_INIT, "func caps: tmr_ena = %u\n", + info->tmr_ena); + ice_debug(hw, ICE_DBG_INIT, "func caps: tmr_index_owned = %u\n", + info->tmr_index_owned); + ice_debug(hw, ICE_DBG_INIT, "func caps: tmr_index_assoc = %u\n", + info->tmr_index_assoc); + ice_debug(hw, ICE_DBG_INIT, "func caps: clk_freq = %u\n", + info->clk_freq); + ice_debug(hw, ICE_DBG_INIT, "func caps: clk_src = %u\n", + info->clk_src); } /** - * ice_aq_manage_mac_write - manage MAC address write command + * ice_parse_fdir_func_caps - Parse ICE_AQC_CAPS_FD function caps * @hw: pointer to the HW struct - * @mac_addr: MAC address to be written as LAA/LAA+WoL/Port address - * @flags: flags to control write behavior - * @cd: pointer to command details structure or NULL + * @func_p: pointer to function capabilities structure * - * This function is used to write MAC address to the NVM (0x0108). + * Extract function capabilities for ICE_AQC_CAPS_FD. */ -enum ice_status -ice_aq_manage_mac_write(struct ice_hw *hw, const u8 *mac_addr, u8 flags, - struct ice_sq_cd *cd) +static void +ice_parse_fdir_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_p) { - struct ice_aqc_manage_mac_write *cmd; - struct ice_aq_desc desc; + u32 reg_val, val; + + reg_val = rd32(hw, GLQF_FD_SIZE); + val = (reg_val & GLQF_FD_SIZE_FD_GSIZE_M) >> + GLQF_FD_SIZE_FD_GSIZE_S; + func_p->fd_fltr_guar = + ice_get_num_per_func(hw, val); + val = (reg_val & GLQF_FD_SIZE_FD_BSIZE_M) >> + GLQF_FD_SIZE_FD_BSIZE_S; + func_p->fd_fltr_best_effort = val; + + ice_debug(hw, ICE_DBG_INIT, "func caps: fd_fltr_guar = %d\n", + func_p->fd_fltr_guar); + ice_debug(hw, ICE_DBG_INIT, "func caps: fd_fltr_best_effort = %d\n", + func_p->fd_fltr_best_effort); +} - cmd = &desc.params.mac_write; - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_manage_mac_write); - cmd->flags = flags; +/** + * ice_parse_func_caps - Parse function capabilities + * @hw: pointer to the HW struct + * @func_p: pointer to function capabilities structure + * @buf: buffer containing the function capability records + * @cap_count: the number of capabilities + * + * Helper function to parse function (0x000A) capabilities list. For + * capabilities shared between device and function, this relies on + * ice_parse_common_caps. + * + * Loop through the list of provided capabilities and extract the relevant + * data into the function capabilities structured. 
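
/*
 * A minimal sketch of ice_parse_fdir_func_caps() above: the Flow Director
 * space register packs a "guaranteed" and a "best effort" pool size into one
 * word; the guaranteed pool is divided evenly among the PFs while the best
 * effort pool is reported as-is. The masks, shifts and register value are
 * illustrative stand-ins for the GLQF_FD_SIZE layout.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_FD_GSIZE_S	0
#define SKETCH_FD_GSIZE_M	(0x7fffu << SKETCH_FD_GSIZE_S)
#define SKETCH_FD_BSIZE_S	16
#define SKETCH_FD_BSIZE_M	(0x7fffu << SKETCH_FD_BSIZE_S)

static void sketch_parse_fd_size(uint32_t reg_val, unsigned int num_funcs,
				 uint32_t *guar, uint32_t *best_effort)
{
	uint32_t gsize = (reg_val & SKETCH_FD_GSIZE_M) >> SKETCH_FD_GSIZE_S;

	*guar = num_funcs ? gsize / num_funcs : 0;
	*best_effort = (reg_val & SKETCH_FD_BSIZE_M) >> SKETCH_FD_BSIZE_S;
}

int main(void)
{
	uint32_t guar, best;

	/* e.g. 2048 guaranteed entries across 4 PFs, 2048 best-effort shared */
	sketch_parse_fd_size((2048u << SKETCH_FD_BSIZE_S) | 2048u, 4, &guar, &best);
	printf("guaranteed per PF: %u, best effort: %u\n", guar, best);
	return 0;
}
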
+ */ +static void +ice_parse_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_p, + void *buf, u32 cap_count) +{ + struct ice_aqc_list_caps_elem *cap_resp; + u32 i; - /* Prep values for flags, sah, sal */ - cmd->sah = htons(*((const u16 *)mac_addr)); - cmd->sal = htonl(*((const u32 *)(mac_addr + 2))); + cap_resp = buf; - return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + memset(func_p, 0, sizeof(*func_p)); + + for (i = 0; i < cap_count; i++) { + u16 cap = le16_to_cpu(cap_resp[i].cap); + bool found; + + found = ice_parse_common_caps(hw, &func_p->common_cap, + &cap_resp[i], "func caps"); + + switch (cap) { + case ICE_AQC_CAPS_VF: + ice_parse_vf_func_caps(hw, func_p, &cap_resp[i]); + break; + case ICE_AQC_CAPS_VSI: + ice_parse_vsi_func_caps(hw, func_p, &cap_resp[i]); + break; + case ICE_AQC_CAPS_1588: + ice_parse_1588_func_caps(hw, func_p, &cap_resp[i]); + break; + case ICE_AQC_CAPS_FD: + ice_parse_fdir_func_caps(hw, func_p); + break; + default: + /* Don't list common capabilities as unknown */ + if (!found) + ice_debug(hw, ICE_DBG_INIT, "func caps: unknown capability[%d]: 0x%x\n", + i, cap); + break; + } + } + + ice_print_led_caps(hw, &func_p->common_cap, "func caps", true); + ice_print_sdp_caps(hw, &func_p->common_cap, "func caps", true); + + ice_recalc_port_limited_caps(hw, &func_p->common_cap); } /** - * ice_aq_clear_pxe_mode + * ice_parse_valid_functions_cap - Parse ICE_AQC_CAPS_VALID_FUNCTIONS caps * @hw: pointer to the HW struct + * @dev_p: pointer to device capabilities structure + * @cap: capability element to parse * - * Tell the firmware that the driver is taking over from PXE (0x0110). + * Parse ICE_AQC_CAPS_VALID_FUNCTIONS for device capabilities. */ -static enum ice_status ice_aq_clear_pxe_mode(struct ice_hw *hw) +static void +ice_parse_valid_functions_cap(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p, + struct ice_aqc_list_caps_elem *cap) { - struct ice_aq_desc desc; + u32 number = le32_to_cpu(cap->number); - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_clear_pxe_mode); - desc.params.clear_pxe.rx_cnt = ICE_AQC_CLEAR_PXE_RX_CNT; + dev_p->num_funcs = hweight32(number); + ice_debug(hw, ICE_DBG_INIT, "dev caps: num_funcs = %d\n", + dev_p->num_funcs); +} - return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); +/** + * ice_parse_vf_dev_caps - Parse ICE_AQC_CAPS_VF device caps + * @hw: pointer to the HW struct + * @dev_p: pointer to device capabilities structure + * @cap: capability element to parse + * + * Parse ICE_AQC_CAPS_VF for device capabilities. + */ +static void +ice_parse_vf_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p, + struct ice_aqc_list_caps_elem *cap) +{ + u32 number = le32_to_cpu(cap->number); + + dev_p->num_vfs_exposed = number; + ice_debug(hw, ICE_DBG_INIT, "dev_caps: num_vfs_exposed = %d\n", + dev_p->num_vfs_exposed); } /** - * ice_clear_pxe_mode - clear pxe operations mode + * ice_parse_vsi_dev_caps - Parse ICE_AQC_CAPS_VSI device caps * @hw: pointer to the HW struct + * @dev_p: pointer to device capabilities structure + * @cap: capability element to parse * - * Make sure all PXE mode settings are cleared, including things - * like descriptor fetch/write-back mode. + * Parse ICE_AQC_CAPS_VSI for device capabilities. 
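
/*
 * A minimal sketch of the dispatch used by ice_parse_func_caps() above (and by
 * the device-capability parser that follows): every element is first offered
 * to the common parser, then to the function-specific cases, and it is logged
 * as unknown only if neither recognized it. The element layout and capability
 * IDs below are illustrative stand-ins.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct sketch_caps_elem { uint16_t cap; uint32_t number; };

enum { SK_CAP_RSS = 0x0040, SK_CAP_VSI = 0x0017, SK_CAP_UNKNOWN = 0x7777 };

static bool sketch_parse_common(const struct sketch_caps_elem *elem)
{
	switch (elem->cap) {
	case SK_CAP_RSS:
		printf("common: rss_table_size = %u\n", elem->number);
		return true;
	default:
		return false;	/* not a common capability */
	}
}

static void sketch_parse_func(const struct sketch_caps_elem *elems, unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		bool found = sketch_parse_common(&elems[i]);

		switch (elems[i].cap) {
		case SK_CAP_VSI:
			printf("func: guar_num_vsi = %u\n", elems[i].number);
			break;
		default:
			if (!found)	/* don't list common caps as unknown */
				printf("func: unknown capability 0x%x\n",
				       elems[i].cap);
			break;
		}
	}
}

int main(void)
{
	struct sketch_caps_elem elems[] = {
		{ SK_CAP_RSS, 512 }, { SK_CAP_VSI, 192 }, { SK_CAP_UNKNOWN, 0 },
	};

	sketch_parse_func(elems, 3);
	return 0;
}
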
*/ -void ice_clear_pxe_mode(struct ice_hw *hw) +static void +ice_parse_vsi_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p, + struct ice_aqc_list_caps_elem *cap) { - if (ice_check_sq_alive(hw, &hw->adminq)) - ice_aq_clear_pxe_mode(hw); + u32 number = le32_to_cpu(cap->number); + + dev_p->num_vsi_allocd_to_host = number; + ice_debug(hw, ICE_DBG_INIT, "dev caps: num_vsi_allocd_to_host = %d\n", + dev_p->num_vsi_allocd_to_host); } /** - * ice_get_link_speed_based_on_phy_type - returns link speed - * @phy_type_low: lower part of phy_type - * @phy_type_high: higher part of phy_type + * ice_parse_1588_dev_caps - Parse ICE_AQC_CAPS_1588 device caps + * @hw: pointer to the HW struct + * @dev_p: pointer to device capabilities structure + * @cap: capability element to parse * - * This helper function will convert an entry in PHY type structure - * [phy_type_low, phy_type_high] to its corresponding link speed. - * Note: In the structure of [phy_type_low, phy_type_high], there should - * be one bit set, as this function will convert one PHY type to its - * speed. - * If no bit gets set, ICE_LINK_SPEED_UNKNOWN will be returned - * If more than one bit gets set, ICE_LINK_SPEED_UNKNOWN will be returned + * Parse ICE_AQC_CAPS_1588 for device capabilities. */ -static u16 -ice_get_link_speed_based_on_phy_type(u64 phy_type_low, u64 phy_type_high) +static void +ice_parse_1588_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p, + struct ice_aqc_list_caps_elem *cap) { - u16 speed_phy_type_high = ICE_AQ_LINK_SPEED_UNKNOWN; - u16 speed_phy_type_low = ICE_AQ_LINK_SPEED_UNKNOWN; + struct ice_ts_dev_info *info = &dev_p->ts_dev_info; + u32 logical_id = le32_to_cpu(cap->logical_id); + u32 phys_id = le32_to_cpu(cap->phys_id); + u32 number = le32_to_cpu(cap->number); + + info->ena = ((number & ICE_TS_DEV_ENA_M) != 0); + dev_p->common_cap.ieee_1588 = info->ena; + + info->tmr0_owner = number & ICE_TS_TMR0_OWNR_M; + info->tmr0_owned = ((number & ICE_TS_TMR0_OWND_M) != 0); + info->tmr0_ena = ((number & ICE_TS_TMR0_ENA_M) != 0); + + info->tmr1_owner = (number & ICE_TS_TMR1_OWNR_M) >> ICE_TS_TMR1_OWNR_S; + info->tmr1_owned = ((number & ICE_TS_TMR1_OWND_M) != 0); + info->tmr1_ena = ((number & ICE_TS_TMR1_ENA_M) != 0); + + info->ena_ports = logical_id; + info->tmr_own_map = phys_id; + + ice_debug(hw, ICE_DBG_INIT, "dev caps: ieee_1588 = %u\n", + dev_p->common_cap.ieee_1588); + ice_debug(hw, ICE_DBG_INIT, "dev caps: tmr0_owner = %u\n", + info->tmr0_owner); + ice_debug(hw, ICE_DBG_INIT, "dev caps: tmr0_owned = %u\n", + info->tmr0_owned); + ice_debug(hw, ICE_DBG_INIT, "dev caps: tmr0_ena = %u\n", + info->tmr0_ena); + ice_debug(hw, ICE_DBG_INIT, "dev caps: tmr1_owner = %u\n", + info->tmr1_owner); + ice_debug(hw, ICE_DBG_INIT, "dev caps: tmr1_owned = %u\n", + info->tmr1_owned); + ice_debug(hw, ICE_DBG_INIT, "dev caps: tmr1_ena = %u\n", + info->tmr1_ena); + ice_debug(hw, ICE_DBG_INIT, "dev caps: ieee_1588 ena_ports = %u\n", + info->ena_ports); + ice_debug(hw, ICE_DBG_INIT, "dev caps: tmr_own_map = %u\n", + info->tmr_own_map); +} - switch (phy_type_low) { - case ICE_PHY_TYPE_LOW_100BASE_TX: - case ICE_PHY_TYPE_LOW_100M_SGMII: - speed_phy_type_low = ICE_AQ_LINK_SPEED_100MB; - break; - case ICE_PHY_TYPE_LOW_1000BASE_T: - case ICE_PHY_TYPE_LOW_1000BASE_SX: - case ICE_PHY_TYPE_LOW_1000BASE_LX: - case ICE_PHY_TYPE_LOW_1000BASE_KX: - case ICE_PHY_TYPE_LOW_1G_SGMII: - speed_phy_type_low = ICE_AQ_LINK_SPEED_1000MB; - break; - case ICE_PHY_TYPE_LOW_2500BASE_T: - case ICE_PHY_TYPE_LOW_2500BASE_X: - case 
ICE_PHY_TYPE_LOW_2500BASE_KX: - speed_phy_type_low = ICE_AQ_LINK_SPEED_2500MB; - break; - case ICE_PHY_TYPE_LOW_5GBASE_T: - case ICE_PHY_TYPE_LOW_5GBASE_KR: - speed_phy_type_low = ICE_AQ_LINK_SPEED_5GB; - break; - case ICE_PHY_TYPE_LOW_10GBASE_T: - case ICE_PHY_TYPE_LOW_10G_SFI_DA: - case ICE_PHY_TYPE_LOW_10GBASE_SR: - case ICE_PHY_TYPE_LOW_10GBASE_LR: - case ICE_PHY_TYPE_LOW_10GBASE_KR_CR1: - case ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC: - case ICE_PHY_TYPE_LOW_10G_SFI_C2C: - speed_phy_type_low = ICE_AQ_LINK_SPEED_10GB; - break; - case ICE_PHY_TYPE_LOW_25GBASE_T: - case ICE_PHY_TYPE_LOW_25GBASE_CR: - case ICE_PHY_TYPE_LOW_25GBASE_CR_S: - case ICE_PHY_TYPE_LOW_25GBASE_CR1: - case ICE_PHY_TYPE_LOW_25GBASE_SR: - case ICE_PHY_TYPE_LOW_25GBASE_LR: - case ICE_PHY_TYPE_LOW_25GBASE_KR: - case ICE_PHY_TYPE_LOW_25GBASE_KR_S: - case ICE_PHY_TYPE_LOW_25GBASE_KR1: - case ICE_PHY_TYPE_LOW_25G_AUI_AOC_ACC: - case ICE_PHY_TYPE_LOW_25G_AUI_C2C: - speed_phy_type_low = ICE_AQ_LINK_SPEED_25GB; - break; - case ICE_PHY_TYPE_LOW_40GBASE_CR4: - case ICE_PHY_TYPE_LOW_40GBASE_SR4: - case ICE_PHY_TYPE_LOW_40GBASE_LR4: - case ICE_PHY_TYPE_LOW_40GBASE_KR4: - case ICE_PHY_TYPE_LOW_40G_XLAUI_AOC_ACC: - case ICE_PHY_TYPE_LOW_40G_XLAUI: - speed_phy_type_low = ICE_AQ_LINK_SPEED_40GB; - break; - case ICE_PHY_TYPE_LOW_50GBASE_CR2: - case ICE_PHY_TYPE_LOW_50GBASE_SR2: - case ICE_PHY_TYPE_LOW_50GBASE_LR2: - case ICE_PHY_TYPE_LOW_50GBASE_KR2: - case ICE_PHY_TYPE_LOW_50G_LAUI2_AOC_ACC: - case ICE_PHY_TYPE_LOW_50G_LAUI2: - case ICE_PHY_TYPE_LOW_50G_AUI2_AOC_ACC: - case ICE_PHY_TYPE_LOW_50G_AUI2: - case ICE_PHY_TYPE_LOW_50GBASE_CP: - case ICE_PHY_TYPE_LOW_50GBASE_SR: - case ICE_PHY_TYPE_LOW_50GBASE_FR: - case ICE_PHY_TYPE_LOW_50GBASE_LR: - case ICE_PHY_TYPE_LOW_50GBASE_KR_PAM4: - case ICE_PHY_TYPE_LOW_50G_AUI1_AOC_ACC: - case ICE_PHY_TYPE_LOW_50G_AUI1: - speed_phy_type_low = ICE_AQ_LINK_SPEED_50GB; - break; - case ICE_PHY_TYPE_LOW_100GBASE_CR4: - case ICE_PHY_TYPE_LOW_100GBASE_SR4: - case ICE_PHY_TYPE_LOW_100GBASE_LR4: - case ICE_PHY_TYPE_LOW_100GBASE_KR4: - case ICE_PHY_TYPE_LOW_100G_CAUI4_AOC_ACC: - case ICE_PHY_TYPE_LOW_100G_CAUI4: - case ICE_PHY_TYPE_LOW_100G_AUI4_AOC_ACC: - case ICE_PHY_TYPE_LOW_100G_AUI4: - case ICE_PHY_TYPE_LOW_100GBASE_CR_PAM4: - case ICE_PHY_TYPE_LOW_100GBASE_KR_PAM4: - case ICE_PHY_TYPE_LOW_100GBASE_CP2: - case ICE_PHY_TYPE_LOW_100GBASE_SR2: - case ICE_PHY_TYPE_LOW_100GBASE_DR: - speed_phy_type_low = ICE_AQ_LINK_SPEED_100GB; - break; - default: - speed_phy_type_low = ICE_AQ_LINK_SPEED_UNKNOWN; - break; - } - - switch (phy_type_high) { - case ICE_PHY_TYPE_HIGH_100GBASE_KR2_PAM4: - case ICE_PHY_TYPE_HIGH_100G_CAUI2_AOC_ACC: - case ICE_PHY_TYPE_HIGH_100G_CAUI2: - case ICE_PHY_TYPE_HIGH_100G_AUI2_AOC_ACC: - case ICE_PHY_TYPE_HIGH_100G_AUI2: - speed_phy_type_high = ICE_AQ_LINK_SPEED_100GB; - break; - default: - speed_phy_type_high = ICE_AQ_LINK_SPEED_UNKNOWN; - break; - } +/** + * ice_parse_fdir_dev_caps - Parse ICE_AQC_CAPS_FD device caps + * @hw: pointer to the HW struct + * @dev_p: pointer to device capabilities structure + * @cap: capability element to parse + * + * Parse ICE_AQC_CAPS_FD for device capabilities. 
+ */ +static void +ice_parse_fdir_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p, + struct ice_aqc_list_caps_elem *cap) +{ + u32 number = le32_to_cpu(cap->number); - if (speed_phy_type_low == ICE_AQ_LINK_SPEED_UNKNOWN && - speed_phy_type_high == ICE_AQ_LINK_SPEED_UNKNOWN) - return ICE_AQ_LINK_SPEED_UNKNOWN; - else if (speed_phy_type_low != ICE_AQ_LINK_SPEED_UNKNOWN && - speed_phy_type_high != ICE_AQ_LINK_SPEED_UNKNOWN) - return ICE_AQ_LINK_SPEED_UNKNOWN; - else if (speed_phy_type_low != ICE_AQ_LINK_SPEED_UNKNOWN && - speed_phy_type_high == ICE_AQ_LINK_SPEED_UNKNOWN) - return speed_phy_type_low; - else - return speed_phy_type_high; + dev_p->num_flow_director_fltr = number; + ice_debug(hw, ICE_DBG_INIT, "dev caps: num_flow_director_fltr = %d\n", + dev_p->num_flow_director_fltr); } + /** - * ice_update_phy_type - * @phy_type_low: pointer to the lower part of phy_type - * @phy_type_high: pointer to the higher part of phy_type - * @link_speeds_bitmap: targeted link speeds bitmap + * ice_parse_dev_caps - Parse device capabilities + * @hw: pointer to the HW struct + * @dev_p: pointer to device capabilities structure + * @buf: buffer containing the device capability records + * @cap_count: the number of capabilities * - * Note: For the link_speeds_bitmap structure, you can check it at - * [ice_aqc_get_link_status->link_speed]. Caller can pass in - * link_speeds_bitmap include multiple speeds. + * Helper function to parse the device (0x000B) capabilities list. For + * capabilities shared between device and function, this relies on + * ice_parse_common_caps. * - * Each entry in this [phy_type_low, phy_type_high] structure will - * present a certain link speed. This helper function will turn on bits - * in [phy_type_low, phy_type_high] structure based on the value of - * link_speeds_bitmap input parameter. + * Loop through the list of provided capabilities and extract the relevant + * data into the device capabilities structure.
*/ -void -ice_update_phy_type(u64 *phy_type_low, u64 *phy_type_high, - u16 link_speeds_bitmap) +static void +ice_parse_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p, + void *buf, u32 cap_count) { - u64 pt_high; - u64 pt_low; - int index; - u16 speed; + struct ice_aqc_list_caps_elem *cap_resp; + u32 i; - /* We first check with low part of phy_type */ - for (index = 0; index <= ICE_PHY_TYPE_LOW_MAX_INDEX; index++) { - pt_low = BIT_ULL(index); - speed = ice_get_link_speed_based_on_phy_type(pt_low, 0); + cap_resp = buf; - if (link_speeds_bitmap & speed) - *phy_type_low |= BIT_ULL(index); - } + memset(dev_p, 0, sizeof(*dev_p)); - /* We then check with high part of phy_type */ - for (index = 0; index <= ICE_PHY_TYPE_HIGH_MAX_INDEX; index++) { - pt_high = BIT_ULL(index); - speed = ice_get_link_speed_based_on_phy_type(0, pt_high); + for (i = 0; i < cap_count; i++) { + u16 cap = le16_to_cpu(cap_resp[i].cap); + bool found; - if (link_speeds_bitmap & speed) - *phy_type_high |= BIT_ULL(index); + found = ice_parse_common_caps(hw, &dev_p->common_cap, + &cap_resp[i], "dev caps"); + + switch (cap) { + case ICE_AQC_CAPS_VALID_FUNCTIONS: + ice_parse_valid_functions_cap(hw, dev_p, &cap_resp[i]); + break; + case ICE_AQC_CAPS_VF: + ice_parse_vf_dev_caps(hw, dev_p, &cap_resp[i]); + break; + case ICE_AQC_CAPS_VSI: + ice_parse_vsi_dev_caps(hw, dev_p, &cap_resp[i]); + break; + case ICE_AQC_CAPS_1588: + ice_parse_1588_dev_caps(hw, dev_p, &cap_resp[i]); + break; + case ICE_AQC_CAPS_FD: + ice_parse_fdir_dev_caps(hw, dev_p, &cap_resp[i]); + break; + default: + /* Don't list common capabilities as unknown */ + if (!found) + ice_debug(hw, ICE_DBG_INIT, "dev caps: unknown capability[%d]: 0x%x\n", + i, cap); + break; + } } + + ice_print_led_caps(hw, &dev_p->common_cap, "dev caps", true); + ice_print_sdp_caps(hw, &dev_p->common_cap, "dev caps", true); + + ice_recalc_port_limited_caps(hw, &dev_p->common_cap); } /** - * ice_aq_set_phy_cfg + * ice_aq_list_caps - query function/device capabilities * @hw: pointer to the HW struct - * @lport: logical port number - * @cfg: structure with PHY configuration data to be set + * @buf: a buffer to hold the capabilities + * @buf_size: size of the buffer + * @cap_count: if not NULL, set to the number of capabilities reported + * @opc: capabilities type to discover, device or function * @cd: pointer to command details structure or NULL * - * Set the various PHY configuration parameters supported on the Port. - * One or more of the Set PHY config parameters may be ignored in an MFP - * mode as the PF may not have the privilege to set some of the PHY Config - * parameters. This status will be indicated by the command response (0x0601). + * Get the function (0x000A) or device (0x000B) capabilities description from + * firmware and store it in the buffer. + * + * If the cap_count pointer is not NULL, then it is set to the number of + * capabilities firmware will report. Note that if the buffer size is too + * small, it is possible the command will return ICE_AQ_ERR_ENOMEM. The + * cap_count will still be updated in this case. It is recommended that the + * buffer size be set to ICE_AQ_MAX_BUF_LEN (the largest possible buffer that + * firmware could return) to avoid this. 
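/* A minimal usage sketch of ice_aq_list_caps() as documented above, following
 * the recommendation to pass a full ICE_AQ_MAX_BUF_LEN buffer so firmware
 * never returns ICE_AQ_ERR_ENOMEM. The ice_example_* helper name is
 * hypothetical and not part of the driver.
 */
static enum ice_status
ice_example_count_dev_caps(struct ice_hw *hw, u32 *count)
{
	enum ice_status status;
	void *cbuf;

	cbuf = devm_kzalloc(ice_hw_to_dev(hw), ICE_AQ_MAX_BUF_LEN, GFP_KERNEL);
	if (!cbuf)
		return ICE_ERR_NO_MEMORY;

	/* use ice_aqc_opc_list_func_caps instead to query function caps */
	status = ice_aq_list_caps(hw, cbuf, ICE_AQ_MAX_BUF_LEN, count,
				  ice_aqc_opc_list_dev_caps, NULL);

	devm_kfree(ice_hw_to_dev(hw), cbuf);
	return status;
}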
*/ enum ice_status -ice_aq_set_phy_cfg(struct ice_hw *hw, u8 lport, - struct ice_aqc_set_phy_cfg_data *cfg, struct ice_sq_cd *cd) +ice_aq_list_caps(struct ice_hw *hw, void *buf, u16 buf_size, u32 *cap_count, + enum ice_adminq_opc opc, struct ice_sq_cd *cd) { + struct ice_aqc_list_caps *cmd; struct ice_aq_desc desc; + enum ice_status status; - if (!cfg) - return ICE_ERR_PARAM; - - /* Ensure that only valid bits of cfg->caps can be turned on. */ - if (cfg->caps & ~ICE_AQ_PHY_ENA_VALID_MASK) { - ice_debug(hw, ICE_DBG_PHY, - "Invalid bit is set in ice_aqc_set_phy_cfg_data->caps : 0x%x\n", - cfg->caps); + cmd = &desc.params.get_cap; - cfg->caps &= ICE_AQ_PHY_ENA_VALID_MASK; - } + if (opc != ice_aqc_opc_list_func_caps && + opc != ice_aqc_opc_list_dev_caps) + return ICE_ERR_PARAM; - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_phy_cfg); - desc.params.set_phy.lport_num = lport; - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + ice_fill_dflt_direct_cmd_desc(&desc, opc); + status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); - ice_debug(hw, ICE_DBG_LINK, "phy_type_low = 0x%llx\n", - (unsigned long long)le64_to_cpu(cfg->phy_type_low)); - ice_debug(hw, ICE_DBG_LINK, "phy_type_high = 0x%llx\n", - (unsigned long long)le64_to_cpu(cfg->phy_type_high)); - ice_debug(hw, ICE_DBG_LINK, "caps = 0x%x\n", cfg->caps); - ice_debug(hw, ICE_DBG_LINK, "low_power_ctrl = 0x%x\n", - cfg->low_power_ctrl); - ice_debug(hw, ICE_DBG_LINK, "eee_cap = 0x%x\n", cfg->eee_cap); - ice_debug(hw, ICE_DBG_LINK, "eeer_value = 0x%x\n", cfg->eeer_value); - ice_debug(hw, ICE_DBG_LINK, "link_fec_opt = 0x%x\n", cfg->link_fec_opt); + if (cap_count) + *cap_count = le32_to_cpu(cmd->count); - return ice_aq_send_cmd(hw, &desc, cfg, sizeof(*cfg), cd); + return status; } /** - * ice_update_link_info - update status of the HW network link - * @pi: port info structure of the interested logical port + * ice_discover_dev_caps - Read and extract device capabilities + * @hw: pointer to the hardware structure + * @dev_caps: pointer to device capabilities structure + * + * Read the device capabilities and extract them into the dev_caps structure + * for later use. */ -enum ice_status ice_update_link_info(struct ice_port_info *pi) +enum ice_status +ice_discover_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_caps) { - struct ice_link_status *li; enum ice_status status; + u32 cap_count = 0; + void *cbuf; - if (!pi) - return ICE_ERR_PARAM; - - li = &pi->phy.link_info; - - status = ice_aq_get_link_info(pi, true, NULL, NULL); - if (status) - return status; - - if (li->link_info & ICE_AQ_MEDIA_AVAILABLE) { - struct ice_aqc_get_phy_caps_data *pcaps; - struct ice_hw *hw; - - hw = pi->hw; - pcaps = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*pcaps), - GFP_KERNEL); - if (!pcaps) - return ICE_ERR_NO_MEMORY; + cbuf = devm_kzalloc(ice_hw_to_dev(hw), ICE_AQ_MAX_BUF_LEN, GFP_KERNEL); + if (!cbuf) + return ICE_ERR_NO_MEMORY; - status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP, - pcaps, NULL); - if (!status) - memcpy(li->module_type, &pcaps->module_type, - sizeof(li->module_type)); + /* Although the driver doesn't know the number of capabilities the + * device will return, we can simply send a 4KB buffer, the maximum + * possible size that firmware can return. 
+ */ + cap_count = ICE_AQ_MAX_BUF_LEN / sizeof(struct ice_aqc_list_caps_elem); - devm_kfree(ice_hw_to_dev(hw), pcaps); - } + status = ice_aq_list_caps(hw, cbuf, ICE_AQ_MAX_BUF_LEN, &cap_count, + ice_aqc_opc_list_dev_caps, NULL); + if (!status) + ice_parse_dev_caps(hw, dev_caps, cbuf, cap_count); + devm_kfree(ice_hw_to_dev(hw), cbuf); return status; } /** - * ice_set_fc - * @pi: port information structure - * @aq_failures: pointer to status code, specific to ice_set_fc routine - * @ena_auto_link_update: enable automatic link update + * ice_discover_func_caps - Read and extract function capabilities + * @hw: pointer to the hardware structure + * @func_caps: pointer to function capabilities structure * - * Set the requested flow control mode. + * Read the function capabilities and extract them into the func_caps structure + * for later use. */ -enum ice_status -ice_set_fc(struct ice_port_info *pi, u8 *aq_failures, bool ena_auto_link_update) +static enum ice_status +ice_discover_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_caps) { - struct ice_aqc_set_phy_cfg_data cfg = { 0 }; - struct ice_aqc_get_phy_caps_data *pcaps; enum ice_status status; - u8 pause_mask = 0x0; - struct ice_hw *hw; + u32 cap_count = 0; + void *cbuf; - if (!pi) - return ICE_ERR_PARAM; - hw = pi->hw; - *aq_failures = ICE_SET_FC_AQ_FAIL_NONE; + cbuf = devm_kzalloc(ice_hw_to_dev(hw), ICE_AQ_MAX_BUF_LEN, GFP_KERNEL); + if (!cbuf) + return ICE_ERR_NO_MEMORY; - switch (pi->fc.req_mode) { - case ICE_FC_FULL: - pause_mask |= ICE_AQC_PHY_EN_TX_LINK_PAUSE; - pause_mask |= ICE_AQC_PHY_EN_RX_LINK_PAUSE; - break; - case ICE_FC_RX_PAUSE: - pause_mask |= ICE_AQC_PHY_EN_RX_LINK_PAUSE; - break; - case ICE_FC_TX_PAUSE: - pause_mask |= ICE_AQC_PHY_EN_TX_LINK_PAUSE; - break; - default: - break; - } + /* Although the driver doesn't know the number of capabilities the + * device will return, we can simply send a 4KB buffer, the maximum + * possible size that firmware can return. 
+ */ + cap_count = ICE_AQ_MAX_BUF_LEN / sizeof(struct ice_aqc_list_caps_elem); - pcaps = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*pcaps), GFP_KERNEL); - if (!pcaps) - return ICE_ERR_NO_MEMORY; + status = ice_aq_list_caps(hw, cbuf, ICE_AQ_MAX_BUF_LEN, &cap_count, + ice_aqc_opc_list_func_caps, NULL); + if (!status) + ice_parse_func_caps(hw, func_caps, cbuf, cap_count); + devm_kfree(ice_hw_to_dev(hw), cbuf); - /* Get the current PHY config */ - status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_SW_CFG, pcaps, - NULL); - if (status) { - *aq_failures = ICE_SET_FC_AQ_FAIL_GET; - goto out; - } + return status; +} - /* clear the old pause settings */ - cfg.caps = pcaps->caps & ~(ICE_AQC_PHY_EN_TX_LINK_PAUSE | - ICE_AQC_PHY_EN_RX_LINK_PAUSE); +/** + * ice_set_safe_mode_caps - Override dev/func capabilities when in safe mode + * @hw: pointer to the hardware structure + */ +void ice_set_safe_mode_caps(struct ice_hw *hw) +{ + struct ice_hw_func_caps *func_caps = &hw->func_caps; + struct ice_hw_dev_caps *dev_caps = &hw->dev_caps; + struct ice_hw_common_caps cached_caps; + u32 num_funcs; - /* set the new capabilities */ - cfg.caps |= pause_mask; + /* cache some func_caps values that should be restored after memset */ + cached_caps = func_caps->common_cap; - /* If the capabilities have changed, then set the new config */ - if (cfg.caps != pcaps->caps) { - int retry_count, retry_max = 10; + /* unset func capabilities */ + memset(func_caps, 0, sizeof(*func_caps)); - /* Auto restart link so settings take effect */ - if (ena_auto_link_update) - cfg.caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; - /* Copy over all the old settings */ - cfg.phy_type_high = pcaps->phy_type_high; - cfg.phy_type_low = pcaps->phy_type_low; - cfg.low_power_ctrl = pcaps->low_power_ctrl; - cfg.eee_cap = pcaps->eee_cap; - cfg.eeer_value = pcaps->eeer_value; - cfg.link_fec_opt = pcaps->link_fec_options; - - status = ice_aq_set_phy_cfg(hw, pi->lport, &cfg, NULL); - if (status) { - *aq_failures = ICE_SET_FC_AQ_FAIL_SET; - goto out; - } +#define ICE_RESTORE_FUNC_CAP(name) \ + func_caps->common_cap.name = cached_caps.name - /* Update the link info - * It sometimes takes a really long time for link to - * come back from the atomic reset. Thus, we wait a - * little bit. 
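/* For reference (illustrative only): ICE_RESTORE_FUNC_CAP() defined above is a
 * plain field-copy helper. For example, ICE_RESTORE_FUNC_CAP(max_mtu) expands to
 *
 *	func_caps->common_cap.max_mtu = cached_caps.max_mtu;
 *
 * restoring one cached value after func_caps has been zeroed.
 */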
- */ - for (retry_count = 0; retry_count < retry_max; retry_count++) { - status = ice_update_link_info(pi); + /* restore cached values */ + ICE_RESTORE_FUNC_CAP(valid_functions); + ICE_RESTORE_FUNC_CAP(txq_first_id); + ICE_RESTORE_FUNC_CAP(rxq_first_id); + ICE_RESTORE_FUNC_CAP(msix_vector_first_id); + ICE_RESTORE_FUNC_CAP(max_mtu); + ICE_RESTORE_FUNC_CAP(nvm_unified_update); + ICE_RESTORE_FUNC_CAP(nvm_update_pending_nvm); + ICE_RESTORE_FUNC_CAP(nvm_update_pending_orom); + ICE_RESTORE_FUNC_CAP(nvm_update_pending_netlist); - if (!status) - break; + /* one Tx and one Rx queue in safe mode */ + func_caps->common_cap.num_rxq = 1; + func_caps->common_cap.num_txq = 1; - mdelay(100); - } + /* two MSIX vectors, one for traffic and one for misc causes */ + func_caps->common_cap.num_msix_vectors = 2; + func_caps->guar_num_vsi = 1; - if (status) - *aq_failures = ICE_SET_FC_AQ_FAIL_UPDATE; - } + /* cache some dev_caps values that should be restored after memset */ + cached_caps = dev_caps->common_cap; + num_funcs = dev_caps->num_funcs; -out: - devm_kfree(ice_hw_to_dev(hw), pcaps); - return status; -} + /* unset dev capabilities */ + memset(dev_caps, 0, sizeof(*dev_caps)); -/** - * ice_copy_phy_caps_to_cfg - Copy PHY ability data to configuration data - * @caps: PHY ability structure to copy date from - * @cfg: PHY configuration structure to copy data to - * - * Helper function to copy AQC PHY get ability data to PHY set configuration - * data structure - */ -void -ice_copy_phy_caps_to_cfg(struct ice_aqc_get_phy_caps_data *caps, - struct ice_aqc_set_phy_cfg_data *cfg) -{ - if (!caps || !cfg) - return; +#define ICE_RESTORE_DEV_CAP(name) \ + dev_caps->common_cap.name = cached_caps.name - cfg->phy_type_low = caps->phy_type_low; - cfg->phy_type_high = caps->phy_type_high; - cfg->caps = caps->caps; - cfg->low_power_ctrl = caps->low_power_ctrl; - cfg->eee_cap = caps->eee_cap; - cfg->eeer_value = caps->eeer_value; - cfg->link_fec_opt = caps->link_fec_options; -} + /* restore cached values */ + ICE_RESTORE_DEV_CAP(valid_functions); + ICE_RESTORE_DEV_CAP(txq_first_id); + ICE_RESTORE_DEV_CAP(rxq_first_id); + ICE_RESTORE_DEV_CAP(msix_vector_first_id); + ICE_RESTORE_DEV_CAP(max_mtu); + ICE_RESTORE_DEV_CAP(nvm_unified_update); + ICE_RESTORE_DEV_CAP(nvm_update_pending_nvm); + ICE_RESTORE_DEV_CAP(nvm_update_pending_orom); + ICE_RESTORE_DEV_CAP(nvm_update_pending_netlist); + dev_caps->num_funcs = num_funcs; -/** - * ice_cfg_phy_fec - Configure PHY FEC data based on FEC mode - * @cfg: PHY configuration data to set FEC mode - * @fec: FEC mode to configure - * - * Caller should copy ice_aqc_get_phy_caps_data.caps ICE_AQC_PHY_EN_AUTO_FEC - * (bit 7) and ice_aqc_get_phy_caps_data.link_fec_options to cfg.caps - * ICE_AQ_PHY_ENA_AUTO_FEC (bit 7) and cfg.link_fec_options before calling. - */ -void -ice_cfg_phy_fec(struct ice_aqc_set_phy_cfg_data *cfg, enum ice_fec_mode fec) -{ - switch (fec) { - case ICE_FEC_BASER: - /* Clear RS bits, and AND BASE-R ability - * bits and OR request bits. - */ - cfg->link_fec_opt &= ICE_AQC_PHY_FEC_10G_KR_40G_KR4_EN | - ICE_AQC_PHY_FEC_25G_KR_CLAUSE74_EN; - cfg->link_fec_opt |= ICE_AQC_PHY_FEC_10G_KR_40G_KR4_REQ | - ICE_AQC_PHY_FEC_25G_KR_REQ; - break; - case ICE_FEC_RS: - /* Clear BASE-R bits, and AND RS ability - * bits and OR request bits. - */ - cfg->link_fec_opt &= ICE_AQC_PHY_FEC_25G_RS_CLAUSE91_EN; - cfg->link_fec_opt |= ICE_AQC_PHY_FEC_25G_RS_528_REQ | - ICE_AQC_PHY_FEC_25G_RS_544_REQ; - break; - case ICE_FEC_NONE: - /* Clear all FEC option bits. 
*/ - cfg->link_fec_opt &= ~ICE_AQC_PHY_FEC_MASK; - break; - case ICE_FEC_AUTO: - /* AND auto FEC bit, and all caps bits. */ - cfg->caps &= ICE_AQC_PHY_CAPS_MASK; - break; - } + /* one Tx and one Rx queue per function in safe mode */ + dev_caps->common_cap.num_rxq = num_funcs; + dev_caps->common_cap.num_txq = num_funcs; + + /* two MSIX vectors per function */ + dev_caps->common_cap.num_msix_vectors = 2 * num_funcs; } /** - * ice_get_link_status - get status of the HW network link - * @pi: port information structure - * @link_up: pointer to bool (true/false = linkup/linkdown) - * - * Variable link_up is true if link is up, false if link is down. - * The variable link_up is invalid if status is non zero. As a - * result of this call, link status reporting becomes enabled + * ice_get_caps - get info about the HW + * @hw: pointer to the hardware structure */ -enum ice_status ice_get_link_status(struct ice_port_info *pi, bool *link_up) +enum ice_status ice_get_caps(struct ice_hw *hw) { - struct ice_phy_info *phy_info; - enum ice_status status = 0; - - if (!pi || !link_up) - return ICE_ERR_PARAM; - - phy_info = &pi->phy; - - if (phy_info->get_link_info) { - status = ice_update_link_info(pi); - - if (status) - ice_debug(pi->hw, ICE_DBG_LINK, - "get link status error, status = %d\n", - status); - } + enum ice_status status; - *link_up = phy_info->link_info.link_info & ICE_AQ_LINK_UP; + status = ice_discover_dev_caps(hw, &hw->dev_caps); + if (status) + return status; - return status; + return ice_discover_func_caps(hw, &hw->func_caps); } /** - * ice_aq_set_link_restart_an - * @pi: pointer to the port information structure - * @ena_link: if true: enable link, if false: disable link + * ice_aq_manage_mac_write - manage MAC address write command + * @hw: pointer to the HW struct + * @mac_addr: MAC address to be written as LAA/LAA+WoL/Port address + * @flags: flags to control write behavior * @cd: pointer to command details structure or NULL * - * Sets up the link and restarts the Auto-Negotiation over the link. + * This function is used to write MAC address to the NVM (0x0108). */ enum ice_status -ice_aq_set_link_restart_an(struct ice_port_info *pi, bool ena_link, - struct ice_sq_cd *cd) +ice_aq_manage_mac_write(struct ice_hw *hw, const u8 *mac_addr, u8 flags, + struct ice_sq_cd *cd) { - struct ice_aqc_restart_an *cmd; + struct ice_aqc_manage_mac_write *cmd; struct ice_aq_desc desc; - cmd = &desc.params.restart_an; - - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_restart_an); + cmd = &desc.params.mac_write; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_manage_mac_write); - cmd->cmd_flags = ICE_AQC_RESTART_AN_LINK_RESTART; - cmd->lport_num = pi->lport; - if (ena_link) - cmd->cmd_flags |= ICE_AQC_RESTART_AN_LINK_ENABLE; - else - cmd->cmd_flags &= ~ICE_AQC_RESTART_AN_LINK_ENABLE; + cmd->flags = flags; + ether_addr_copy(cmd->mac_addr, mac_addr); - return ice_aq_send_cmd(pi->hw, &desc, NULL, 0, cd); + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); } /** - * ice_aq_set_event_mask + * ice_aq_clear_pxe_mode * @hw: pointer to the HW struct - * @port_num: port number of the physical function - * @mask: event mask to be set - * @cd: pointer to command details structure or NULL * - * Set event mask (0x0613) + * Tell the firmware that the driver is taking over from PXE (0x0110). 
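/* A minimal sketch of a typical init-time ordering of the helpers above:
 * discover capabilities, then apply the safe-mode overrides when needed.
 * Both the ice_example_* helper and the pkg_load_failed flag are hypothetical
 * stand-ins for whatever condition puts the driver into safe mode.
 */
static enum ice_status
ice_example_init_caps(struct ice_hw *hw, bool pkg_load_failed)
{
	enum ice_status status;

	/* fills hw->dev_caps and hw->func_caps */
	status = ice_get_caps(hw);
	if (status)
		return status;

	if (pkg_load_failed)
		ice_set_safe_mode_caps(hw);	/* 1 Tx/Rx queue, 2 MSI-X vectors */

	return 0;
}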
*/ -enum ice_status -ice_aq_set_event_mask(struct ice_hw *hw, u8 port_num, u16 mask, - struct ice_sq_cd *cd) +static enum ice_status ice_aq_clear_pxe_mode(struct ice_hw *hw) { - struct ice_aqc_set_event_mask *cmd; struct ice_aq_desc desc; - cmd = &desc.params.set_event_mask; - - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_event_mask); - - cmd->lport_num = port_num; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_clear_pxe_mode); + desc.params.clear_pxe.rx_cnt = ICE_AQC_CLEAR_PXE_RX_CNT; - cmd->event_mask = cpu_to_le16(mask); - return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); } /** - * ice_aq_set_mac_loopback + * ice_clear_pxe_mode - clear pxe operations mode * @hw: pointer to the HW struct - * @ena_lpbk: Enable or Disable loopback - * @cd: pointer to command details structure or NULL * - * Enable/disable loopback on a given port + * Make sure all PXE mode settings are cleared, including things + * like descriptor fetch/write-back mode. */ -enum ice_status -ice_aq_set_mac_loopback(struct ice_hw *hw, bool ena_lpbk, struct ice_sq_cd *cd) +void ice_clear_pxe_mode(struct ice_hw *hw) { - struct ice_aqc_set_mac_lb *cmd; - struct ice_aq_desc desc; - - cmd = &desc.params.set_mac_lb; - - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_mac_lb); - if (ena_lpbk) - cmd->lb_mode = ICE_AQ_MAC_LB_EN; - - return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + if (ice_check_sq_alive(hw, &hw->adminq)) + ice_aq_clear_pxe_mode(hw); } /** - * ice_aq_set_port_id_led - * @pi: pointer to the port information - * @is_orig_mode: is this LED set to original mode (by the net-list) + * ice_aq_set_port_params - set physical port parameters. + * @pi: pointer to the port info struct + * @bad_frame_vsi: defines the VSI to which bad frames are forwarded + * @save_bad_pac: if set packets with errors are forwarded to the bad frames VSI + * @pad_short_pac: if set transmit packets smaller than 60 bytes are padded + * @double_vlan: if set double VLAN is enabled * @cd: pointer to command details structure or NULL * - * Set LED value for the given port (0x06e9) + * Set Physical port parameters (0x0203) */ enum ice_status -ice_aq_set_port_id_led(struct ice_port_info *pi, bool is_orig_mode, +ice_aq_set_port_params(struct ice_port_info *pi, u16 bad_frame_vsi, + bool save_bad_pac, bool pad_short_pac, bool double_vlan, struct ice_sq_cd *cd) + { - struct ice_aqc_set_port_id_led *cmd; + struct ice_aqc_set_port_params *cmd; struct ice_hw *hw = pi->hw; struct ice_aq_desc desc; + u16 cmd_flags = 0; - cmd = &desc.params.set_port_id_led; - - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_port_id_led); + cmd = &desc.params.set_port_params; - if (is_orig_mode) - cmd->ident_mode = ICE_AQC_PORT_IDENT_LED_ORIG; - else - cmd->ident_mode = ICE_AQC_PORT_IDENT_LED_BLINK; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_port_params); + cmd->bad_frame_vsi = cpu_to_le16(bad_frame_vsi); + if (save_bad_pac) + cmd_flags |= ICE_AQC_SET_P_PARAMS_SAVE_BAD_PACKETS; + if (pad_short_pac) + cmd_flags |= ICE_AQC_SET_P_PARAMS_PAD_SHORT_PACKETS; + if (double_vlan) + cmd_flags |= ICE_AQC_SET_P_PARAMS_DOUBLE_VLAN_ENA; + cmd->cmd_flags = cpu_to_le16(cmd_flags); return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); } /** - * __ice_aq_get_set_rss_lut - * @hw: pointer to the hardware structure - * @vsi_id: VSI FW index - * @lut_type: LUT table type - * @lut: pointer to the LUT buffer provided by the caller - * @lut_size: size of the LUT buffer - * @glob_lut_idx: global LUT index - * @set: set true to set 
the table, false to get the table + * ice_get_link_speed_based_on_phy_type - returns link speed + * @phy_type_low: lower part of phy_type + * @phy_type_high: higher part of phy_type * - * Internal function to get (0x0B05) or set (0x0B03) RSS look up table + * This helper function will convert an entry in PHY type structure + * [phy_type_low, phy_type_high] to its corresponding link speed. + * Note: In the structure of [phy_type_low, phy_type_high], there should + * be one bit set, as this function will convert one PHY type to its + * speed. + * If no bit gets set, ICE_LINK_SPEED_UNKNOWN will be returned + * If more than one bit gets set, ICE_LINK_SPEED_UNKNOWN will be returned */ -static enum ice_status -__ice_aq_get_set_rss_lut(struct ice_hw *hw, u16 vsi_id, u8 lut_type, u8 *lut, - u16 lut_size, u8 glob_lut_idx, bool set) +static u16 +ice_get_link_speed_based_on_phy_type(u64 phy_type_low, u64 phy_type_high) { - struct ice_aqc_get_set_rss_lut *cmd_resp; - struct ice_aq_desc desc; - enum ice_status status; - u16 flags = 0; - - cmd_resp = &desc.params.get_set_rss_lut; - - if (set) { - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_rss_lut); - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - } else { - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_rss_lut); - } - - cmd_resp->vsi_id = cpu_to_le16(((vsi_id << - ICE_AQC_GSET_RSS_LUT_VSI_ID_S) & - ICE_AQC_GSET_RSS_LUT_VSI_ID_M) | - ICE_AQC_GSET_RSS_LUT_VSI_VALID); + u16 speed_phy_type_high = ICE_AQ_LINK_SPEED_UNKNOWN; + u16 speed_phy_type_low = ICE_AQ_LINK_SPEED_UNKNOWN; - switch (lut_type) { - case ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_VSI: - case ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF: - case ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_GLOBAL: - flags |= ((lut_type << ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_S) & - ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_M); + switch (phy_type_low) { + case ICE_PHY_TYPE_LOW_100BASE_TX: + case ICE_PHY_TYPE_LOW_100M_SGMII: + speed_phy_type_low = ICE_AQ_LINK_SPEED_100MB; break; - default: - status = ICE_ERR_PARAM; - goto ice_aq_get_set_rss_lut_exit; + case ICE_PHY_TYPE_LOW_1000BASE_T: + case ICE_PHY_TYPE_LOW_1000BASE_SX: + case ICE_PHY_TYPE_LOW_1000BASE_LX: + case ICE_PHY_TYPE_LOW_1000BASE_KX: + case ICE_PHY_TYPE_LOW_1G_SGMII: + speed_phy_type_low = ICE_AQ_LINK_SPEED_1000MB; + break; + case ICE_PHY_TYPE_LOW_2500BASE_T: + case ICE_PHY_TYPE_LOW_2500BASE_X: + case ICE_PHY_TYPE_LOW_2500BASE_KX: + speed_phy_type_low = ICE_AQ_LINK_SPEED_2500MB; + break; + case ICE_PHY_TYPE_LOW_5GBASE_T: + case ICE_PHY_TYPE_LOW_5GBASE_KR: + speed_phy_type_low = ICE_AQ_LINK_SPEED_5GB; + break; + case ICE_PHY_TYPE_LOW_10GBASE_T: + case ICE_PHY_TYPE_LOW_10G_SFI_DA: + case ICE_PHY_TYPE_LOW_10GBASE_SR: + case ICE_PHY_TYPE_LOW_10GBASE_LR: + case ICE_PHY_TYPE_LOW_10GBASE_KR_CR1: + case ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC: + case ICE_PHY_TYPE_LOW_10G_SFI_C2C: + speed_phy_type_low = ICE_AQ_LINK_SPEED_10GB; + break; + case ICE_PHY_TYPE_LOW_25GBASE_T: + case ICE_PHY_TYPE_LOW_25GBASE_CR: + case ICE_PHY_TYPE_LOW_25GBASE_CR_S: + case ICE_PHY_TYPE_LOW_25GBASE_CR1: + case ICE_PHY_TYPE_LOW_25GBASE_SR: + case ICE_PHY_TYPE_LOW_25GBASE_LR: + case ICE_PHY_TYPE_LOW_25GBASE_KR: + case ICE_PHY_TYPE_LOW_25GBASE_KR_S: + case ICE_PHY_TYPE_LOW_25GBASE_KR1: + case ICE_PHY_TYPE_LOW_25G_AUI_AOC_ACC: + case ICE_PHY_TYPE_LOW_25G_AUI_C2C: + speed_phy_type_low = ICE_AQ_LINK_SPEED_25GB; + break; + case ICE_PHY_TYPE_LOW_40GBASE_CR4: + case ICE_PHY_TYPE_LOW_40GBASE_SR4: + case ICE_PHY_TYPE_LOW_40GBASE_LR4: + case ICE_PHY_TYPE_LOW_40GBASE_KR4: + case ICE_PHY_TYPE_LOW_40G_XLAUI_AOC_ACC: + case 
ICE_PHY_TYPE_LOW_40G_XLAUI: + speed_phy_type_low = ICE_AQ_LINK_SPEED_40GB; + break; + case ICE_PHY_TYPE_LOW_50GBASE_CR2: + case ICE_PHY_TYPE_LOW_50GBASE_SR2: + case ICE_PHY_TYPE_LOW_50GBASE_LR2: + case ICE_PHY_TYPE_LOW_50GBASE_KR2: + case ICE_PHY_TYPE_LOW_50G_LAUI2_AOC_ACC: + case ICE_PHY_TYPE_LOW_50G_LAUI2: + case ICE_PHY_TYPE_LOW_50G_AUI2_AOC_ACC: + case ICE_PHY_TYPE_LOW_50G_AUI2: + case ICE_PHY_TYPE_LOW_50GBASE_CP: + case ICE_PHY_TYPE_LOW_50GBASE_SR: + case ICE_PHY_TYPE_LOW_50GBASE_FR: + case ICE_PHY_TYPE_LOW_50GBASE_LR: + case ICE_PHY_TYPE_LOW_50GBASE_KR_PAM4: + case ICE_PHY_TYPE_LOW_50G_AUI1_AOC_ACC: + case ICE_PHY_TYPE_LOW_50G_AUI1: + speed_phy_type_low = ICE_AQ_LINK_SPEED_50GB; + break; + case ICE_PHY_TYPE_LOW_100GBASE_CR4: + case ICE_PHY_TYPE_LOW_100GBASE_SR4: + case ICE_PHY_TYPE_LOW_100GBASE_LR4: + case ICE_PHY_TYPE_LOW_100GBASE_KR4: + case ICE_PHY_TYPE_LOW_100G_CAUI4_AOC_ACC: + case ICE_PHY_TYPE_LOW_100G_CAUI4: + case ICE_PHY_TYPE_LOW_100G_AUI4_AOC_ACC: + case ICE_PHY_TYPE_LOW_100G_AUI4: + case ICE_PHY_TYPE_LOW_100GBASE_CR_PAM4: + case ICE_PHY_TYPE_LOW_100GBASE_KR_PAM4: + case ICE_PHY_TYPE_LOW_100GBASE_CP2: + case ICE_PHY_TYPE_LOW_100GBASE_SR2: + case ICE_PHY_TYPE_LOW_100GBASE_DR: + speed_phy_type_low = ICE_AQ_LINK_SPEED_100GB; + break; + default: + speed_phy_type_low = ICE_AQ_LINK_SPEED_UNKNOWN; + break; + } + + switch (phy_type_high) { + case ICE_PHY_TYPE_HIGH_100GBASE_KR2_PAM4: + case ICE_PHY_TYPE_HIGH_100G_CAUI2_AOC_ACC: + case ICE_PHY_TYPE_HIGH_100G_CAUI2: + case ICE_PHY_TYPE_HIGH_100G_AUI2_AOC_ACC: + case ICE_PHY_TYPE_HIGH_100G_AUI2: + speed_phy_type_high = ICE_AQ_LINK_SPEED_100GB; + break; + default: + speed_phy_type_high = ICE_AQ_LINK_SPEED_UNKNOWN; + break; + } + + if (speed_phy_type_low == ICE_AQ_LINK_SPEED_UNKNOWN && + speed_phy_type_high == ICE_AQ_LINK_SPEED_UNKNOWN) + return ICE_AQ_LINK_SPEED_UNKNOWN; + else if (speed_phy_type_low != ICE_AQ_LINK_SPEED_UNKNOWN && + speed_phy_type_high != ICE_AQ_LINK_SPEED_UNKNOWN) + return ICE_AQ_LINK_SPEED_UNKNOWN; + else if (speed_phy_type_low != ICE_AQ_LINK_SPEED_UNKNOWN && + speed_phy_type_high == ICE_AQ_LINK_SPEED_UNKNOWN) + return speed_phy_type_low; + else + return speed_phy_type_high; +} + +/** + * ice_update_phy_type + * @phy_type_low: pointer to the lower part of phy_type + * @phy_type_high: pointer to the higher part of phy_type + * @link_speeds_bitmap: targeted link speeds bitmap + * + * Note: For the link_speeds_bitmap structure, you can check it at + * [ice_aqc_get_link_status->link_speed]. Caller can pass in + * link_speeds_bitmap include multiple speeds. + * + * Each entry in this [phy_type_low, phy_type_high] structure will + * present a certain link speed. This helper function will turn on bits + * in [phy_type_low, phy_type_high] structure based on the value of + * link_speeds_bitmap input parameter. 
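/* A minimal sketch of ice_update_phy_type() as described above: build PHY type
 * advertisement masks for a single 25G speed. The ice_example_* helper is
 * hypothetical.
 */
static void
ice_example_build_25g_mask(u64 *phy_low, u64 *phy_high)
{
	*phy_low = 0;
	*phy_high = 0;

	/* sets one bit per PHY type that the switch above maps to
	 * ICE_AQ_LINK_SPEED_25GB; the high word stays 0 because no 25G
	 * types live there
	 */
	ice_update_phy_type(phy_low, phy_high, ICE_AQ_LINK_SPEED_25GB);
}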
+ */ +void +ice_update_phy_type(u64 *phy_type_low, u64 *phy_type_high, + u16 link_speeds_bitmap) +{ + u64 pt_high; + u64 pt_low; + int index; + u16 speed; + + /* We first check with low part of phy_type */ + for (index = 0; index <= ICE_PHY_TYPE_LOW_MAX_INDEX; index++) { + pt_low = BIT_ULL(index); + speed = ice_get_link_speed_based_on_phy_type(pt_low, 0); + + if (link_speeds_bitmap & speed) + *phy_type_low |= BIT_ULL(index); + } + + /* We then check with high part of phy_type */ + for (index = 0; index <= ICE_PHY_TYPE_HIGH_MAX_INDEX; index++) { + pt_high = BIT_ULL(index); + speed = ice_get_link_speed_based_on_phy_type(0, pt_high); + + if (link_speeds_bitmap & speed) + *phy_type_high |= BIT_ULL(index); + } +} + +/** + * ice_aq_set_phy_cfg + * @hw: pointer to the HW struct + * @pi: port info structure of the interested logical port + * @cfg: structure with PHY configuration data to be set + * @cd: pointer to command details structure or NULL + * + * Set the various PHY configuration parameters supported on the Port. + * One or more of the Set PHY config parameters may be ignored in an MFP + * mode as the PF may not have the privilege to set some of the PHY Config + * parameters. This status will be indicated by the command response (0x0601). + */ +enum ice_status +ice_aq_set_phy_cfg(struct ice_hw *hw, struct ice_port_info *pi, + struct ice_aqc_set_phy_cfg_data *cfg, struct ice_sq_cd *cd) +{ + struct ice_aq_desc desc; + enum ice_status status; + + if (!cfg) + return ICE_ERR_PARAM; + + /* Ensure that only valid bits of cfg->caps can be turned on. */ + if (cfg->caps & ~ICE_AQ_PHY_ENA_VALID_MASK) { + ice_debug(hw, ICE_DBG_PHY, "Invalid bit is set in ice_aqc_set_phy_cfg_data->caps : 0x%x\n", + cfg->caps); + + cfg->caps &= ICE_AQ_PHY_ENA_VALID_MASK; + } + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_phy_cfg); + desc.params.set_phy.lport_num = pi->lport; + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + ice_debug(hw, ICE_DBG_LINK, "set phy cfg\n"); + ice_debug(hw, ICE_DBG_LINK, " phy_type_low = 0x%llx\n", + (unsigned long long)le64_to_cpu(cfg->phy_type_low)); + ice_debug(hw, ICE_DBG_LINK, " phy_type_high = 0x%llx\n", + (unsigned long long)le64_to_cpu(cfg->phy_type_high)); + ice_debug(hw, ICE_DBG_LINK, " caps = 0x%x\n", cfg->caps); + ice_debug(hw, ICE_DBG_LINK, " low_power_ctrl_an = 0x%x\n", + cfg->low_power_ctrl_an); + ice_debug(hw, ICE_DBG_LINK, " eee_cap = 0x%x\n", cfg->eee_cap); + ice_debug(hw, ICE_DBG_LINK, " eeer_value = 0x%x\n", cfg->eeer_value); + ice_debug(hw, ICE_DBG_LINK, " link_fec_opt = 0x%x\n", + cfg->link_fec_opt); + + status = ice_aq_send_cmd(hw, &desc, cfg, sizeof(*cfg), cd); + + if (hw->adminq.sq_last_status == ICE_AQ_RC_EMODE) + status = 0; + + if (!status) + pi->phy.curr_user_phy_cfg = *cfg; + + return status; +} + +/** + * ice_update_link_info - update status of the HW network link + * @pi: port info structure of the interested logical port + */ +enum ice_status ice_update_link_info(struct ice_port_info *pi) +{ + struct ice_link_status *li; + enum ice_status status; + + if (!pi) + return ICE_ERR_PARAM; + + li = &pi->phy.link_info; + + status = ice_aq_get_link_info(pi, true, NULL, NULL); + if (status) + return status; + + if (li->link_info & ICE_AQ_MEDIA_AVAILABLE) { + struct ice_aqc_get_phy_caps_data *pcaps; + struct ice_hw *hw; + + hw = pi->hw; + pcaps = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*pcaps), + GFP_KERNEL); + if (!pcaps) + return ICE_ERR_NO_MEMORY; + + status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP_MEDIA, + pcaps, NULL); + + if (!status) + 
memcpy(li->module_type, &pcaps->module_type, + sizeof(li->module_type)); + + devm_kfree(ice_hw_to_dev(hw), pcaps); + } + + return status; +} + +/** + * ice_cache_phy_user_req + * @pi: port information structure + * @cache_data: PHY logging data + * @cache_mode: PHY logging mode + * + * Log the user request on (FC, FEC, SPEED) for later user. + */ +static void +ice_cache_phy_user_req(struct ice_port_info *pi, + struct ice_phy_cache_mode_data cache_data, + enum ice_phy_cache_mode cache_mode) +{ + if (!pi) + return; + + switch (cache_mode) { + case ICE_FC_MODE: + pi->phy.curr_user_fc_req = cache_data.data.curr_user_fc_req; + break; + case ICE_SPEED_MODE: + pi->phy.curr_user_speed_req = + cache_data.data.curr_user_speed_req; + break; + case ICE_FEC_MODE: + pi->phy.curr_user_fec_req = cache_data.data.curr_user_fec_req; + break; + default: + break; + } +} + +/** + * ice_caps_to_fc_mode + * @caps: PHY capabilities + * + * Convert PHY FC capabilities to ice FC mode + */ +enum ice_fc_mode ice_caps_to_fc_mode(u8 caps) +{ + if (caps & ICE_AQC_PHY_EN_TX_LINK_PAUSE && + caps & ICE_AQC_PHY_EN_RX_LINK_PAUSE) + return ICE_FC_FULL; + + if (caps & ICE_AQC_PHY_EN_TX_LINK_PAUSE) + return ICE_FC_TX_PAUSE; + + if (caps & ICE_AQC_PHY_EN_RX_LINK_PAUSE) + return ICE_FC_RX_PAUSE; + + return ICE_FC_NONE; +} + +/** + * ice_caps_to_fec_mode + * @caps: PHY capabilities + * @fec_options: Link FEC options + * + * Convert PHY FEC capabilities to ice FEC mode + */ +enum ice_fec_mode ice_caps_to_fec_mode(u8 caps, u8 fec_options) +{ + if (caps & ICE_AQC_PHY_EN_AUTO_FEC) + return ICE_FEC_AUTO; + + if (fec_options & (ICE_AQC_PHY_FEC_10G_KR_40G_KR4_EN | + ICE_AQC_PHY_FEC_10G_KR_40G_KR4_REQ | + ICE_AQC_PHY_FEC_25G_KR_CLAUSE74_EN | + ICE_AQC_PHY_FEC_25G_KR_REQ)) + return ICE_FEC_BASER; + + if (fec_options & (ICE_AQC_PHY_FEC_25G_RS_528_REQ | + ICE_AQC_PHY_FEC_25G_RS_544_REQ | + ICE_AQC_PHY_FEC_25G_RS_CLAUSE91_EN)) + return ICE_FEC_RS; + + return ICE_FEC_NONE; +} + +/** + * ice_cfg_phy_fc - Configure PHY FC data based on FC mode + * @pi: port information structure + * @cfg: PHY configuration data to set FC mode + * @req_mode: FC mode to configure + */ +enum ice_status +ice_cfg_phy_fc(struct ice_port_info *pi, struct ice_aqc_set_phy_cfg_data *cfg, + enum ice_fc_mode req_mode) +{ + struct ice_phy_cache_mode_data cache_data; + u8 pause_mask = 0x0; + + if (!pi || !cfg) + return ICE_ERR_BAD_PTR; + switch (req_mode) { + case ICE_FC_FULL: + pause_mask |= ICE_AQC_PHY_EN_TX_LINK_PAUSE; + pause_mask |= ICE_AQC_PHY_EN_RX_LINK_PAUSE; + break; + case ICE_FC_RX_PAUSE: + pause_mask |= ICE_AQC_PHY_EN_RX_LINK_PAUSE; + break; + case ICE_FC_TX_PAUSE: + pause_mask |= ICE_AQC_PHY_EN_TX_LINK_PAUSE; + break; + default: + break; + } + + /* clear the old pause settings */ + cfg->caps &= ~(ICE_AQC_PHY_EN_TX_LINK_PAUSE | + ICE_AQC_PHY_EN_RX_LINK_PAUSE); + + /* set the new capabilities */ + cfg->caps |= pause_mask; + + /* Cache user FC request */ + cache_data.data.curr_user_fc_req = req_mode; + ice_cache_phy_user_req(pi, cache_data, ICE_FC_MODE); + + return 0; +} + +/** + * ice_set_fc + * @pi: port information structure + * @aq_failures: pointer to status code, specific to ice_set_fc routine + * @ena_auto_link_update: enable automatic link update + * + * Set the requested flow control mode. 
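/* A minimal sketch of the conversion helpers above: translate a PHY capability
 * report into the driver's FC/FEC enums. 'pcaps' is assumed to be an
 * ice_aqc_get_phy_caps_data already filled by ice_aq_get_phy_caps(); the
 * ice_example_* helper is hypothetical.
 */
static void
ice_example_report_modes(struct ice_hw *hw,
			 struct ice_aqc_get_phy_caps_data *pcaps)
{
	enum ice_fc_mode fc = ice_caps_to_fc_mode(pcaps->caps);
	enum ice_fec_mode fec = ice_caps_to_fec_mode(pcaps->caps,
						     pcaps->link_fec_options);

	ice_debug(hw, ICE_DBG_LINK, "fc mode %d, fec mode %d\n", fc, fec);
}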
+ */ +enum ice_status +ice_set_fc(struct ice_port_info *pi, u8 *aq_failures, bool ena_auto_link_update) +{ + struct ice_aqc_set_phy_cfg_data cfg = { 0 }; + struct ice_aqc_get_phy_caps_data *pcaps; + enum ice_status status; + struct ice_hw *hw; + + if (!pi || !aq_failures) + return ICE_ERR_BAD_PTR; + + *aq_failures = 0; + hw = pi->hw; + + pcaps = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*pcaps), GFP_KERNEL); + if (!pcaps) + return ICE_ERR_NO_MEMORY; + + /* Get the current PHY config */ + status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_ACTIVE_CFG, + pcaps, NULL); + + if (status) { + *aq_failures = ICE_SET_FC_AQ_FAIL_GET; + goto out; + } + + ice_copy_phy_caps_to_cfg(pi, pcaps, &cfg); + + /* Configure the set PHY data */ + status = ice_cfg_phy_fc(pi, &cfg, pi->fc.req_mode); + if (status) + goto out; + + /* If the capabilities have changed, then set the new config */ + if (cfg.caps != pcaps->caps) { + int retry_count, retry_max = 10; + + /* Auto restart link so settings take effect */ + if (ena_auto_link_update) + cfg.caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; + + status = ice_aq_set_phy_cfg(hw, pi, &cfg, NULL); + if (status) { + *aq_failures = ICE_SET_FC_AQ_FAIL_SET; + goto out; + } + + /* Update the link info + * It sometimes takes a really long time for link to + * come back from the atomic reset. Thus, we wait a + * little bit. + */ + for (retry_count = 0; retry_count < retry_max; retry_count++) { + status = ice_update_link_info(pi); + + if (!status) + break; + + msleep(100); + } + + if (status) + *aq_failures = ICE_SET_FC_AQ_FAIL_UPDATE; + } + +out: + devm_kfree(ice_hw_to_dev(hw), pcaps); + return status; +} + +/** + * ice_phy_caps_equals_cfg + * @phy_caps: PHY capabilities + * @phy_cfg: PHY configuration + * + * Helper function to determine if PHY capabilities matches PHY + * configuration + */ +bool +ice_phy_caps_equals_cfg(struct ice_aqc_get_phy_caps_data *phy_caps, + struct ice_aqc_set_phy_cfg_data *phy_cfg) +{ + u8 caps_mask, cfg_mask; + + if (!phy_caps || !phy_cfg) + return false; + + /* These bits are not common between capabilities and configuration. + * Do not use them to determine equality. 
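/* A minimal caller sketch for ice_set_fc() above: request Rx-pause flow
 * control and let the link auto-restart so the setting takes effect. The
 * ice_example_* helper is hypothetical.
 */
static enum ice_status
ice_example_request_rx_pause(struct ice_port_info *pi)
{
	enum ice_status status;
	u8 aq_failures = 0;

	pi->fc.req_mode = ICE_FC_RX_PAUSE;
	status = ice_set_fc(pi, &aq_failures, true);
	if (status)
		ice_debug(pi->hw, ICE_DBG_LINK, "set_fc failed, aq_failures 0x%x\n",
			  aq_failures);

	return status;
}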
+ */ + caps_mask = ICE_AQC_PHY_CAPS_MASK & ~(ICE_AQC_PHY_AN_MODE | + ICE_AQC_PHY_EN_MOD_QUAL); + cfg_mask = ICE_AQ_PHY_ENA_VALID_MASK & ~ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; + + if (phy_caps->phy_type_low != phy_cfg->phy_type_low || + phy_caps->phy_type_high != phy_cfg->phy_type_high || + ((phy_caps->caps & caps_mask) != (phy_cfg->caps & cfg_mask)) || + phy_caps->low_power_ctrl_an != phy_cfg->low_power_ctrl_an || + phy_caps->eee_cap != phy_cfg->eee_cap || + phy_caps->eeer_value != phy_cfg->eeer_value || + phy_caps->link_fec_options != phy_cfg->link_fec_opt) + return false; + + return true; +} + +/** + * ice_copy_phy_caps_to_cfg - Copy PHY ability data to configuration data + * @pi: port information structure + * @caps: PHY ability structure to copy date from + * @cfg: PHY configuration structure to copy data to + * + * Helper function to copy AQC PHY get ability data to PHY set configuration + * data structure + */ +void +ice_copy_phy_caps_to_cfg(struct ice_port_info *pi, + struct ice_aqc_get_phy_caps_data *caps, + struct ice_aqc_set_phy_cfg_data *cfg) +{ + if (!pi || !caps || !cfg) + return; + + memset(cfg, 0, sizeof(*cfg)); + cfg->phy_type_low = caps->phy_type_low; + cfg->phy_type_high = caps->phy_type_high; + cfg->caps = caps->caps; + cfg->low_power_ctrl_an = caps->low_power_ctrl_an; + cfg->eee_cap = caps->eee_cap; + cfg->eeer_value = caps->eeer_value; + cfg->link_fec_opt = caps->link_fec_options; + cfg->module_compliance_enforcement = + caps->module_compliance_enforcement; +} + +/** + * ice_cfg_phy_fec - Configure PHY FEC data based on FEC mode + * @pi: port information structure + * @cfg: PHY configuration data to set FEC mode + * @fec: FEC mode to configure + */ +enum ice_status +ice_cfg_phy_fec(struct ice_port_info *pi, struct ice_aqc_set_phy_cfg_data *cfg, + enum ice_fec_mode fec) +{ + struct ice_aqc_get_phy_caps_data *pcaps; + enum ice_status status = 0; + struct ice_hw *hw; + + if (!pi || !cfg) + return ICE_ERR_BAD_PTR; + + hw = pi->hw; + + pcaps = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*pcaps), GFP_KERNEL); + if (!pcaps) + return ICE_ERR_NO_MEMORY; + + status = ice_aq_get_phy_caps(pi, false, + (ice_fw_supports_report_dflt_cfg(hw) ? + ICE_AQC_REPORT_DFLT_CFG : + ICE_AQC_REPORT_TOPO_CAP_MEDIA), pcaps, NULL); + + if (status) + goto out; + + cfg->caps |= (pcaps->caps & ICE_AQC_PHY_EN_AUTO_FEC); + cfg->link_fec_opt = pcaps->link_fec_options; + + switch (fec) { + case ICE_FEC_BASER: + /* Clear RS bits, and AND BASE-R ability + * bits and OR request bits. + */ + cfg->link_fec_opt &= ICE_AQC_PHY_FEC_10G_KR_40G_KR4_EN | + ICE_AQC_PHY_FEC_25G_KR_CLAUSE74_EN; + cfg->link_fec_opt |= ICE_AQC_PHY_FEC_10G_KR_40G_KR4_REQ | + ICE_AQC_PHY_FEC_25G_KR_REQ; + break; + case ICE_FEC_RS: + /* Clear BASE-R bits, and AND RS ability + * bits and OR request bits. + */ + cfg->link_fec_opt &= ICE_AQC_PHY_FEC_25G_RS_CLAUSE91_EN; + cfg->link_fec_opt |= ICE_AQC_PHY_FEC_25G_RS_528_REQ | + ICE_AQC_PHY_FEC_25G_RS_544_REQ; + break; + case ICE_FEC_NONE: + /* Clear all FEC option bits. */ + cfg->link_fec_opt &= ~ICE_AQC_PHY_FEC_MASK; + break; + case ICE_FEC_AUTO: + /* AND auto FEC bit, and all caps bits. 
*/ + cfg->caps &= ICE_AQC_PHY_CAPS_MASK; + cfg->link_fec_opt |= pcaps->link_fec_options; + break; + default: + status = ICE_ERR_PARAM; + break; + } + + if (fec == ICE_FEC_AUTO && ice_fw_supports_link_override(pi->hw) && + !ice_fw_supports_report_dflt_cfg(pi->hw)) { + struct ice_link_default_override_tlv tlv; + + if (ice_get_link_default_override(&tlv, pi)) + goto out; + + if (!(tlv.options & ICE_LINK_OVERRIDE_STRICT_MODE) && + (tlv.options & ICE_LINK_OVERRIDE_EN)) + cfg->link_fec_opt = tlv.fec_options; + } + +out: + devm_kfree(ice_hw_to_dev(hw), pcaps); + + return status; +} + +/** + * ice_get_link_status - get status of the HW network link + * @pi: port information structure + * @link_up: pointer to bool (true/false = linkup/linkdown) + * + * Variable link_up is true if link is up, false if link is down. + * The variable link_up is invalid if status is non zero. As a + * result of this call, link status reporting becomes enabled + */ +enum ice_status ice_get_link_status(struct ice_port_info *pi, bool *link_up) +{ + struct ice_phy_info *phy_info; + enum ice_status status = 0; + + if (!pi || !link_up) + return ICE_ERR_PARAM; + + phy_info = &pi->phy; + + if (phy_info->get_link_info) { + status = ice_update_link_info(pi); + + if (status) + ice_debug(pi->hw, ICE_DBG_LINK, "get link status error, status = %d\n", + status); + } + + *link_up = phy_info->link_info.link_info & ICE_AQ_LINK_UP; + + return status; +} + +/** + * ice_aq_set_link_restart_an + * @pi: pointer to the port information structure + * @ena_link: if true: enable link, if false: disable link + * @cd: pointer to command details structure or NULL + * + * Sets up the link and restarts the Auto-Negotiation over the link. + */ +enum ice_status +ice_aq_set_link_restart_an(struct ice_port_info *pi, bool ena_link, + struct ice_sq_cd *cd) +{ + struct ice_aqc_restart_an *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.restart_an; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_restart_an); + + cmd->cmd_flags = ICE_AQC_RESTART_AN_LINK_RESTART; + cmd->lport_num = pi->lport; + if (ena_link) + cmd->cmd_flags |= ICE_AQC_RESTART_AN_LINK_ENABLE; + else + cmd->cmd_flags &= ~ICE_AQC_RESTART_AN_LINK_ENABLE; + + return ice_aq_send_cmd(pi->hw, &desc, NULL, 0, cd); +} + +/** + * ice_aq_set_event_mask + * @hw: pointer to the HW struct + * @port_num: port number of the physical function + * @mask: event mask to be set + * @cd: pointer to command details structure or NULL + * + * Set event mask (0x0613) + */ +enum ice_status +ice_aq_set_event_mask(struct ice_hw *hw, u8 port_num, u16 mask, + struct ice_sq_cd *cd) +{ + struct ice_aqc_set_event_mask *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.set_event_mask; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_event_mask); + + cmd->lport_num = port_num; + + cmd->event_mask = cpu_to_le16(mask); + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} + +/** + * ice_aq_set_mac_loopback + * @hw: pointer to the HW struct + * @ena_lpbk: Enable or Disable loopback + * @cd: pointer to command details structure or NULL + * + * Enable/disable loopback on a given port + */ +enum ice_status +ice_aq_set_mac_loopback(struct ice_hw *hw, bool ena_lpbk, struct ice_sq_cd *cd) +{ + struct ice_aqc_set_mac_lb *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.set_mac_lb; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_mac_lb); + if (ena_lpbk) + cmd->lb_mode = ICE_AQ_MAC_LB_EN; + + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} + + +/** + * ice_aq_set_port_id_led + * @pi: pointer to the 
port information + * @is_orig_mode: is this LED set to original mode (by the net-list) + * @cd: pointer to command details structure or NULL + * + * Set LED value for the given port (0x06e9) + */ +enum ice_status +ice_aq_set_port_id_led(struct ice_port_info *pi, bool is_orig_mode, + struct ice_sq_cd *cd) +{ + struct ice_aqc_set_port_id_led *cmd; + struct ice_hw *hw = pi->hw; + struct ice_aq_desc desc; + + cmd = &desc.params.set_port_id_led; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_port_id_led); + + + if (is_orig_mode) + cmd->ident_mode = ICE_AQC_PORT_IDENT_LED_ORIG; + else + cmd->ident_mode = ICE_AQC_PORT_IDENT_LED_BLINK; + + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} + +/** + * ice_aq_sff_eeprom + * @hw: pointer to the HW struct + * @lport: bits [7:0] = logical port, bit [8] = logical port valid + * @bus_addr: I2C bus address of the eeprom (typically 0xA0, 0=topo default) + * @mem_addr: I2C offset. lower 8 bits for address, 8 upper bits zero padding. + * @page: QSFP page + * @set_page: set or ignore the page + * @data: pointer to data buffer to be read/written to the I2C device. + * @length: 1-16 for read, 1 for write. + * @write: 0 read, 1 for write. + * @cd: pointer to command details structure or NULL + * + * Read/Write SFF EEPROM (0x06EE) + */ +enum ice_status +ice_aq_sff_eeprom(struct ice_hw *hw, u16 lport, u8 bus_addr, + u16 mem_addr, u8 page, u8 set_page, u8 *data, u8 length, + bool write, struct ice_sq_cd *cd) +{ + struct ice_aqc_sff_eeprom *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + if (!data || (mem_addr & 0xff00)) + return ICE_ERR_PARAM; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_sff_eeprom); + cmd = &desc.params.read_write_sff_param; + desc.flags = cpu_to_le16(ICE_AQ_FLAG_RD); + cmd->lport_num = (u8)(lport & 0xff); + cmd->lport_num_valid = (u8)((lport >> 8) & 0x01); + cmd->i2c_bus_addr = cpu_to_le16(((bus_addr >> 1) & + ICE_AQC_SFF_I2CBUS_7BIT_M) | + ((set_page << + ICE_AQC_SFF_SET_EEPROM_PAGE_S) & + ICE_AQC_SFF_SET_EEPROM_PAGE_M)); + cmd->i2c_mem_addr = cpu_to_le16(mem_addr & 0xff); + cmd->eeprom_page = cpu_to_le16((u16)page << ICE_AQC_SFF_EEPROM_PAGE_S); + if (write) + cmd->i2c_bus_addr |= cpu_to_le16(ICE_AQC_SFF_IS_WRITE); + + status = ice_aq_send_cmd(hw, &desc, data, length, cd); + return status; +} + +/** + * ice_aq_prog_topo_dev_nvm + * @hw: pointer to the hardware structure + * @topo_params: pointer to structure storing topology parameters for a device + * @cd: pointer to command details structure or NULL + * + * Program Topology Device NVM (0x06F2) + * + */ +enum ice_status +ice_aq_prog_topo_dev_nvm(struct ice_hw *hw, + struct ice_aqc_link_topo_params *topo_params, + struct ice_sq_cd *cd) +{ + struct ice_aqc_prog_topo_dev_nvm *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.prog_topo_dev_nvm; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_prog_topo_dev_nvm); + + memcpy(&cmd->topo_params, topo_params, sizeof(*topo_params)); + + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} + +/** + * ice_aq_read_topo_dev_nvm + * @hw: pointer to the hardware structure + * @topo_params: pointer to structure storing topology parameters for a device + * @start_address: byte offset in the topology device NVM + * @data: pointer to data buffer + * @data_size: number of bytes to be read from the topology device NVM + * @cd: pointer to command details structure or NULL + * Read Topology Device NVM (0x06F3) + * + */ +enum ice_status +ice_aq_read_topo_dev_nvm(struct ice_hw *hw, + struct ice_aqc_link_topo_params *topo_params, + 
u32 start_address, u8 *data, u8 data_size, + struct ice_sq_cd *cd) +{ + struct ice_aqc_read_topo_dev_nvm *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + if (!data || data_size == 0 || + data_size > ICE_AQC_READ_TOPO_DEV_NVM_DATA_READ_SIZE) + return ICE_ERR_PARAM; + + cmd = &desc.params.read_topo_dev_nvm; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_read_topo_dev_nvm); + + desc.datalen = data_size; + memcpy(&cmd->topo_params, topo_params, sizeof(*topo_params)); + cmd->start_address = cpu_to_le32(start_address); + + status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + if (status) + return status; + + memcpy(data, cmd->data_read, data_size); + + return 0; +} + +/** + * __ice_aq_get_set_rss_lut + * @hw: pointer to the hardware structure + * @params: RSS LUT parameters + * @set: set true to set the table, false to get the table + * + * Internal function to get (0x0B05) or set (0x0B03) RSS look up table + */ +static enum ice_status +__ice_aq_get_set_rss_lut(struct ice_hw *hw, struct ice_aq_get_set_rss_lut_params *params, bool set) +{ + u16 flags = 0, vsi_id, lut_type, lut_size, glob_lut_idx, vsi_handle; + struct ice_aqc_get_set_rss_lut *cmd_resp; + struct ice_aq_desc desc; + enum ice_status status; + u8 *lut; + + if (!params) + return ICE_ERR_PARAM; + + vsi_handle = params->vsi_handle; + lut = params->lut; + + if (!ice_is_vsi_valid(hw, vsi_handle) || !lut) + return ICE_ERR_PARAM; + + lut_size = params->lut_size; + lut_type = params->lut_type; + glob_lut_idx = params->global_lut_id; + vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); + + cmd_resp = &desc.params.get_set_rss_lut; + + if (set) { + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_rss_lut); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + } else { + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_rss_lut); + } + + cmd_resp->vsi_id = cpu_to_le16(((vsi_id << + ICE_AQC_GSET_RSS_LUT_VSI_ID_S) & + ICE_AQC_GSET_RSS_LUT_VSI_ID_M) | + ICE_AQC_GSET_RSS_LUT_VSI_VALID); + + switch (lut_type) { + case ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_VSI: + case ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF: + case ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_GLOBAL: + flags |= ((lut_type << ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_S) & + ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_M); + break; + default: + status = ICE_ERR_PARAM; + goto ice_aq_get_set_rss_lut_exit; + } + + if (lut_type == ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_GLOBAL) { + flags |= ((glob_lut_idx << ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_S) & + ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_M); + + if (!set) + goto ice_aq_get_set_rss_lut_send; + } else if (lut_type == ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF) { + if (!set) + goto ice_aq_get_set_rss_lut_send; + } else { + goto ice_aq_get_set_rss_lut_send; + } + + /* LUT size is only valid for Global and PF table types */ + switch (lut_size) { + case ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_128: + break; + case ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512: + flags |= (ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512_FLAG << + ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_S) & + ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_M; + break; + case ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_2K: + if (lut_type == ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF) { + flags |= (ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_2K_FLAG << + ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_S) & + ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_M; + break; + } + /* fall-through */ + default: + status = ICE_ERR_PARAM; + goto ice_aq_get_set_rss_lut_exit; + } + +ice_aq_get_set_rss_lut_send: + cmd_resp->flags = cpu_to_le16(flags); + status = ice_aq_send_cmd(hw, &desc, lut, lut_size, NULL); + +ice_aq_get_set_rss_lut_exit: + return status; +} + +/** + * 
ice_aq_get_rss_lut + * @hw: pointer to the hardware structure + * @get_params: RSS LUT parameters used to specify which RSS LUT to get + * + * get the RSS lookup table, PF or VSI type + */ +enum ice_status +ice_aq_get_rss_lut(struct ice_hw *hw, struct ice_aq_get_set_rss_lut_params *get_params) +{ + return __ice_aq_get_set_rss_lut(hw, get_params, false); +} + +/** + * ice_aq_set_rss_lut + * @hw: pointer to the hardware structure + * @set_params: RSS LUT parameters used to specify how to set the RSS LUT + * + * set the RSS lookup table, PF or VSI type + */ +enum ice_status +ice_aq_set_rss_lut(struct ice_hw *hw, struct ice_aq_get_set_rss_lut_params *set_params) +{ + return __ice_aq_get_set_rss_lut(hw, set_params, true); +} + +/** + * __ice_aq_get_set_rss_key + * @hw: pointer to the HW struct + * @vsi_id: VSI FW index + * @key: pointer to key info struct + * @set: set true to set the key, false to get the key + * + * get (0x0B04) or set (0x0B02) the RSS key per VSI + */ +static enum +ice_status __ice_aq_get_set_rss_key(struct ice_hw *hw, u16 vsi_id, + struct ice_aqc_get_set_rss_keys *key, + bool set) +{ + struct ice_aqc_get_set_rss_key *cmd_resp; + u16 key_size = sizeof(*key); + struct ice_aq_desc desc; + + cmd_resp = &desc.params.get_set_rss_key; + + if (set) { + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_rss_key); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + } else { + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_rss_key); + } + + cmd_resp->vsi_id = cpu_to_le16(((vsi_id << + ICE_AQC_GSET_RSS_KEY_VSI_ID_S) & + ICE_AQC_GSET_RSS_KEY_VSI_ID_M) | + ICE_AQC_GSET_RSS_KEY_VSI_VALID); + + return ice_aq_send_cmd(hw, &desc, key, key_size, NULL); +} + +/** + * ice_aq_get_rss_key + * @hw: pointer to the HW struct + * @vsi_handle: software VSI handle + * @key: pointer to key info struct + * + * get the RSS key per VSI + */ +enum ice_status +ice_aq_get_rss_key(struct ice_hw *hw, u16 vsi_handle, + struct ice_aqc_get_set_rss_keys *key) +{ + if (!ice_is_vsi_valid(hw, vsi_handle) || !key) + return ICE_ERR_PARAM; + + return __ice_aq_get_set_rss_key(hw, ice_get_hw_vsi_num(hw, vsi_handle), + key, false); +} + +/** + * ice_aq_set_rss_key + * @hw: pointer to the HW struct + * @vsi_handle: software VSI handle + * @keys: pointer to key info struct + * + * set the RSS key per VSI + */ +enum ice_status +ice_aq_set_rss_key(struct ice_hw *hw, u16 vsi_handle, + struct ice_aqc_get_set_rss_keys *keys) +{ + if (!ice_is_vsi_valid(hw, vsi_handle) || !keys) + return ICE_ERR_PARAM; + + return __ice_aq_get_set_rss_key(hw, ice_get_hw_vsi_num(hw, vsi_handle), + keys, true); +} + +/** + * ice_aq_add_lan_txq + * @hw: pointer to the hardware structure + * @num_qgrps: Number of added queue groups + * @qg_list: list of queue groups to be added + * @buf_size: size of buffer for indirect command + * @cd: pointer to command details structure or NULL + * + * Add Tx LAN queue (0x0C30) + * + * NOTE: + * Prior to calling add Tx LAN queue: + * Initialize the following as part of the Tx queue context: + * Completion queue ID if the queue uses Completion queue, Quanta profile, + * Cache profile and Packet shaper profile. + * + * After add Tx LAN queue AQ command is completed: + * Interrupts should be associated with specific queues, + * Association of Tx queue to Doorbell queue is not part of Add LAN Tx queue + * flow. 
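/* A minimal sketch of pushing a 512-entry PF LUT through ice_aq_set_rss_lut()
 * above. The ice_example_* helper is hypothetical; the parameter field names
 * mirror the accesses made in __ice_aq_get_set_rss_lut(), and 'lut' and
 * 'vsi_handle' are assumed to come from the caller.
 */
static enum ice_status
ice_example_push_pf_lut(struct ice_hw *hw, u16 vsi_handle, u8 *lut)
{
	struct ice_aq_get_set_rss_lut_params params = {
		.vsi_handle = vsi_handle,
		.lut = lut,
		.lut_size = ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512,
		.lut_type = ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF,
		.global_lut_id = 0,
	};

	return ice_aq_set_rss_lut(hw, &params);
}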
+ */ +static enum ice_status +ice_aq_add_lan_txq(struct ice_hw *hw, u8 num_qgrps, + struct ice_aqc_add_tx_qgrp *qg_list, u16 buf_size, + struct ice_sq_cd *cd) +{ + struct ice_aqc_add_tx_qgrp *list; + struct ice_aqc_add_txqs *cmd; + struct ice_aq_desc desc; + u16 i, sum_size = 0; + + cmd = &desc.params.add_txqs; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_txqs); + + if (!qg_list) + return ICE_ERR_PARAM; + + if (num_qgrps > ICE_LAN_TXQ_MAX_QGRPS) + return ICE_ERR_PARAM; + + for (i = 0, list = qg_list; i < num_qgrps; i++) { + sum_size += struct_size(list, txqs, list->num_txqs); + list = (struct ice_aqc_add_tx_qgrp *)(list->txqs + + list->num_txqs); + } + + if (buf_size != sum_size) + return ICE_ERR_PARAM; + + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + cmd->num_qgrps = num_qgrps; + + return ice_aq_send_cmd(hw, &desc, qg_list, buf_size, cd); +} + +/** + * ice_aq_dis_lan_txq + * @hw: pointer to the hardware structure + * @num_qgrps: number of groups in the list + * @qg_list: the list of groups to disable + * @buf_size: the total size of the qg_list buffer in bytes + * @rst_src: if called due to reset, specifies the reset source + * @vmvf_num: the relative VM or VF number that is undergoing the reset + * @cd: pointer to command details structure or NULL + * + * Disable LAN Tx queue (0x0C31) + */ +static enum ice_status +ice_aq_dis_lan_txq(struct ice_hw *hw, u8 num_qgrps, + struct ice_aqc_dis_txq_item *qg_list, u16 buf_size, + enum ice_disq_rst_src rst_src, u16 vmvf_num, + struct ice_sq_cd *cd) +{ + struct ice_aqc_dis_txq_item *item; + struct ice_aqc_dis_txqs *cmd; + struct ice_aq_desc desc; + enum ice_status status; + u16 i, sz = 0; + + cmd = &desc.params.dis_txqs; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_dis_txqs); + + /* qg_list can be NULL only in VM/VF reset flow */ + if (!qg_list && !rst_src) + return ICE_ERR_PARAM; + + if (num_qgrps > ICE_LAN_TXQ_MAX_QGRPS) + return ICE_ERR_PARAM; + + cmd->num_entries = num_qgrps; + + cmd->vmvf_and_timeout = cpu_to_le16((5 << ICE_AQC_Q_DIS_TIMEOUT_S) & + ICE_AQC_Q_DIS_TIMEOUT_M); + + switch (rst_src) { + case ICE_VM_RESET: + cmd->cmd_type = ICE_AQC_Q_DIS_CMD_VM_RESET; + cmd->vmvf_and_timeout |= + cpu_to_le16(vmvf_num & ICE_AQC_Q_DIS_VMVF_NUM_M); + break; + case ICE_VF_RESET: + cmd->cmd_type = ICE_AQC_Q_DIS_CMD_VF_RESET; + /* In this case, FW expects vmvf_num to be absolute VF ID */ + cmd->vmvf_and_timeout |= + cpu_to_le16((vmvf_num + hw->func_caps.vf_base_id) & + ICE_AQC_Q_DIS_VMVF_NUM_M); + break; + case ICE_NO_RESET: + default: + break; + } + + /* flush pipe on time out */ + cmd->cmd_type |= ICE_AQC_Q_DIS_CMD_FLUSH_PIPE; + /* If no queue group info, we are in a reset flow. 
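/*
 * Illustrative sketch, not part of this patch: computing the exact buf_size
 * that ice_aq_add_lan_txq() verifies, i.e. the sum of struct_size() over all
 * queue groups. The helper and its txqs_per_grp input are invented for the
 * example; qg is only used in sizeof context and is never dereferenced.
 */
static u16
ice_example_add_txq_buf_size(const u16 *txqs_per_grp, u8 num_qgrps)
{
	struct ice_aqc_add_tx_qgrp *qg = NULL;
	u16 size = 0;
	u8 i;

	for (i = 0; i < num_qgrps && i < ICE_LAN_TXQ_MAX_QGRPS; i++)
		size += struct_size(qg, txqs, txqs_per_grp[i]);

	return size;
}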
Issue the AQ */ + if (!qg_list) + goto do_aq; + + /* set RD bit to indicate that command buffer is provided by the driver + * and it needs to be read by the firmware + */ + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + for (i = 0, item = qg_list; i < num_qgrps; i++) { + u16 item_size = struct_size(item, q_id, item->num_qs); + + /* If the num of queues is even, add 2 bytes of padding */ + if ((item->num_qs % 2) == 0) + item_size += 2; + + sz += item_size; + + item = (struct ice_aqc_dis_txq_item *)((u8 *)item + item_size); + } + + if (buf_size != sz) + return ICE_ERR_PARAM; + +do_aq: + status = ice_aq_send_cmd(hw, &desc, qg_list, buf_size, cd); + if (status) { + if (!qg_list) + ice_debug(hw, ICE_DBG_SCHED, "VM%d disable failed %d\n", + vmvf_num, hw->adminq.sq_last_status); + else + ice_debug(hw, ICE_DBG_SCHED, "disable queue %d failed %d\n", + le16_to_cpu(qg_list[0].q_id[0]), + hw->adminq.sq_last_status); + } + return status; +} + +/** + * ice_aq_move_recfg_lan_txq + * @hw: pointer to the hardware structure + * @num_qs: number of queues to move/reconfigure + * @is_move: true if this operation involves node movement + * @is_tc_change: true if this operation involves a TC change + * @subseq_call: true if this operation is a subsequent call + * @flush_pipe: on timeout, true to flush pipe, false to return EAGAIN + * @timeout: timeout in units of 100 usec (valid values 0-50) + * @blocked_cgds: out param, bitmap of CGDs that timed out if returning EAGAIN + * @buf: struct containing src/dest TEID and per-queue info + * @buf_size: size of buffer for indirect command + * @txqs_moved: out param, number of queues successfully moved + * @cd: pointer to command details structure or NULL + * + * Move / Reconfigure Tx LAN queues (0x0C32) + */ +enum ice_status +ice_aq_move_recfg_lan_txq(struct ice_hw *hw, u8 num_qs, bool is_move, + bool is_tc_change, bool subseq_call, bool flush_pipe, + u8 timeout, u32 *blocked_cgds, + struct ice_aqc_move_txqs_data *buf, u16 buf_size, + u8 *txqs_moved, struct ice_sq_cd *cd) +{ + struct ice_aqc_move_txqs *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + cmd = &desc.params.move_txqs; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_move_recfg_txqs); + +#define ICE_LAN_TXQ_MOVE_TIMEOUT_MAX 50 + if (timeout > ICE_LAN_TXQ_MOVE_TIMEOUT_MAX) + return ICE_ERR_PARAM; + + if (is_tc_change && !flush_pipe && !blocked_cgds) + return ICE_ERR_PARAM; + + if (!is_move && !is_tc_change) + return ICE_ERR_PARAM; + + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + if (is_move) + cmd->cmd_type |= ICE_AQC_Q_CMD_TYPE_MOVE; + + if (is_tc_change) + cmd->cmd_type |= ICE_AQC_Q_CMD_TYPE_TC_CHANGE; + + if (subseq_call) + cmd->cmd_type |= ICE_AQC_Q_CMD_SUBSEQ_CALL; + + if (flush_pipe) + cmd->cmd_type |= ICE_AQC_Q_CMD_FLUSH_PIPE; + + cmd->num_qs = num_qs; + cmd->timeout = ((timeout << ICE_AQC_Q_CMD_TIMEOUT_S) & + ICE_AQC_Q_CMD_TIMEOUT_M); + + status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); + + if (!status && txqs_moved) + *txqs_moved = cmd->num_qs; + + if (hw->adminq.sq_last_status == ICE_AQ_RC_EAGAIN && + is_tc_change && !flush_pipe) + *blocked_cgds = le32_to_cpu(cmd->blocked_cgds); + + return status; +} + +/** + * ice_aq_add_rdma_qsets + * @hw: pointer to the hardware structure + * @num_qset_grps: Number of RDMA Qset groups + * @qset_list: list of qset groups to be added + * @buf_size: size of buffer for indirect command + * @cd: pointer to command details structure or NULL + * + * Add Tx RDMA Qsets (0x0C33) + */ +static enum ice_status +ice_aq_add_rdma_qsets(struct ice_hw *hw, u8 
num_qset_grps, + struct ice_aqc_add_rdma_qset_data *qset_list, + u16 buf_size, struct ice_sq_cd *cd) +{ + struct ice_aqc_add_rdma_qset_data *list; + struct ice_aqc_add_rdma_qset *cmd; + struct ice_aq_desc desc; + u16 i, sum_size = 0; + + cmd = &desc.params.add_rdma_qset; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_rdma_qset); + + if (!qset_list) + return ICE_ERR_PARAM; + + if (num_qset_grps > ICE_LAN_TXQ_MAX_QGRPS) + return ICE_ERR_PARAM; + + for (i = 0, list = qset_list; i < num_qset_grps; i++) { + u16 num_qsets = le16_to_cpu(list->num_qsets); + + sum_size += struct_size(list, rdma_qsets, num_qsets); + list = (struct ice_aqc_add_rdma_qset_data *)(list->rdma_qsets + + num_qsets); + } + + if (buf_size != sum_size) + return ICE_ERR_PARAM; + + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + cmd->num_qset_grps = num_qset_grps; + + return ice_aq_send_cmd(hw, &desc, qset_list, buf_size, cd); +} + +/* End of FW Admin Queue command wrappers */ + +/** + * ice_write_byte - write a byte to a packed context structure + * @src_ctx: the context structure to read from + * @dest_ctx: the context to be written to + * @ce_info: a description of the struct to be filled + */ +static void +ice_write_byte(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +{ + u8 src_byte, dest_byte, mask; + u8 *from, *dest; + u16 shift_width; + + /* copy from the next struct field */ + from = src_ctx + ce_info->offset; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + mask = (u8)(BIT(ce_info->width) - 1); + + src_byte = *from; + src_byte &= mask; + + /* shift to correct alignment */ + mask <<= shift_width; + src_byte <<= shift_width; + + /* get the current bits from the target bit string */ + dest = dest_ctx + (ce_info->lsb / 8); + + memcpy(&dest_byte, dest, sizeof(dest_byte)); + + dest_byte &= ~mask; /* get the bits not changing */ + dest_byte |= src_byte; /* add in the new bits */ + + /* put it all back */ + memcpy(dest, &dest_byte, sizeof(dest_byte)); +} + +/** + * ice_write_word - write a word to a packed context structure + * @src_ctx: the context structure to read from + * @dest_ctx: the context to be written to + * @ce_info: a description of the struct to be filled + */ +static void +ice_write_word(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +{ + u16 src_word, mask; + __le16 dest_word; + u8 *from, *dest; + u16 shift_width; + + /* copy from the next struct field */ + from = src_ctx + ce_info->offset; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + mask = BIT(ce_info->width) - 1; + + /* don't swizzle the bits until after the mask because the mask bits + * will be in a different bit position on big endian machines + */ + src_word = *(u16 *)from; + src_word &= mask; + + /* shift to correct alignment */ + mask <<= shift_width; + src_word <<= shift_width; + + /* get the current bits from the target bit string */ + dest = dest_ctx + (ce_info->lsb / 8); + + memcpy(&dest_word, dest, sizeof(dest_word)); + + dest_word &= ~(cpu_to_le16(mask)); /* get the bits not changing */ + dest_word |= cpu_to_le16(src_word); /* add in the new bits */ + + /* put it all back */ + memcpy(dest, &dest_word, sizeof(dest_word)); +} + +/** + * ice_write_dword - write a dword to a packed context structure + * @src_ctx: the context structure to read from + * @dest_ctx: the context to be written to + * @ce_info: a description of the struct to be filled + */ +static void +ice_write_dword(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +{ + u32 src_dword, 
mask; + __le32 dest_dword; + u8 *from, *dest; + u16 shift_width; + + /* copy from the next struct field */ + from = src_ctx + ce_info->offset; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + + /* if the field width is exactly 32 on an x86 machine, then the shift + * operation will not work because the SHL instructions count is masked + * to 5 bits so the shift will do nothing + */ + if (ce_info->width < 32) + mask = BIT(ce_info->width) - 1; + else + mask = (u32)~0; + + /* don't swizzle the bits until after the mask because the mask bits + * will be in a different bit position on big endian machines + */ + src_dword = *(u32 *)from; + src_dword &= mask; + + /* shift to correct alignment */ + mask <<= shift_width; + src_dword <<= shift_width; + + /* get the current bits from the target bit string */ + dest = dest_ctx + (ce_info->lsb / 8); + + memcpy(&dest_dword, dest, sizeof(dest_dword)); + + dest_dword &= ~(cpu_to_le32(mask)); /* get the bits not changing */ + dest_dword |= cpu_to_le32(src_dword); /* add in the new bits */ + + /* put it all back */ + memcpy(dest, &dest_dword, sizeof(dest_dword)); +} + +/** + * ice_write_qword - write a qword to a packed context structure + * @src_ctx: the context structure to read from + * @dest_ctx: the context to be written to + * @ce_info: a description of the struct to be filled + */ +static void +ice_write_qword(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +{ + u64 src_qword, mask; + __le64 dest_qword; + u8 *from, *dest; + u16 shift_width; + + /* copy from the next struct field */ + from = src_ctx + ce_info->offset; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + + /* if the field width is exactly 64 on an x86 machine, then the shift + * operation will not work because the SHL instructions count is masked + * to 6 bits so the shift will do nothing + */ + if (ce_info->width < 64) + mask = BIT_ULL(ce_info->width) - 1; + else + mask = (u64)~0; + + /* don't swizzle the bits until after the mask because the mask bits + * will be in a different bit position on big endian machines + */ + src_qword = *(u64 *)from; + src_qword &= mask; + + /* shift to correct alignment */ + mask <<= shift_width; + src_qword <<= shift_width; + + /* get the current bits from the target bit string */ + dest = dest_ctx + (ce_info->lsb / 8); + + memcpy(&dest_qword, dest, sizeof(dest_qword)); + + dest_qword &= ~(cpu_to_le64(mask)); /* get the bits not changing */ + dest_qword |= cpu_to_le64(src_qword); /* add in the new bits */ + + /* put it all back */ + memcpy(dest, &dest_qword, sizeof(dest_qword)); +} + +/** + * ice_set_ctx - set context bits in packed structure + * @hw: pointer to the hardware structure + * @src_ctx: pointer to a generic non-packed context structure + * @dest_ctx: pointer to memory for the packed structure + * @ce_info: a description of the structure to be transformed + */ +enum ice_status +ice_set_ctx(struct ice_hw *hw, u8 *src_ctx, u8 *dest_ctx, + const struct ice_ctx_ele *ce_info) +{ + int f; + + for (f = 0; ce_info[f].width; f++) { + /* We have to deal with each element of the FW response + * using the correct size so that we are correct regardless + * of the endianness of the machine. + */ + if (ce_info[f].width > (ce_info[f].size_of * BITS_PER_BYTE)) { + ice_debug(hw, ICE_DBG_QCTX, "Field %d width of %d bits larger than size of %d byte(s) ... 
skipping write\n", + f, ce_info[f].width, ce_info[f].size_of); + continue; + } + switch (ce_info[f].size_of) { + case sizeof(u8): + ice_write_byte(src_ctx, dest_ctx, &ce_info[f]); + break; + case sizeof(u16): + ice_write_word(src_ctx, dest_ctx, &ce_info[f]); + break; + case sizeof(u32): + ice_write_dword(src_ctx, dest_ctx, &ce_info[f]); + break; + case sizeof(u64): + ice_write_qword(src_ctx, dest_ctx, &ce_info[f]); + break; + default: + return ICE_ERR_INVAL_SIZE; + } + } + + return 0; +} + + +/** + * ice_print_sched_elem - parse through an element struct in a branch + * @hw: ice hardware struct + * @elem: element number in a branch + * @data: corresponding element info struct to extract data from + */ +static void +ice_print_sched_elem(struct ice_hw *hw, int elem, + struct ice_aqc_txsched_elem_data *data) +{ + struct ice_aqc_txsched_elem *d = &data->data; + unsigned long valid_sec = d->valid_sections; + char str[128]; + int i; + + dev_info(ice_hw_to_dev(hw), "\t\telement %d\n", elem); + dev_info(ice_hw_to_dev(hw), "\t\t\tparent TEID %d\n", + le32_to_cpu(data->parent_teid)); + dev_info(ice_hw_to_dev(hw), "\t\t\tnode TEID %d\n", + le32_to_cpu(data->node_teid)); + + switch (d->elem_type) { + case ICE_AQC_ELEM_TYPE_UNDEFINED: + snprintf(str, sizeof(str), "undefined"); + break; + case ICE_AQC_ELEM_TYPE_ROOT_PORT: + snprintf(str, sizeof(str), "root port"); + break; + case ICE_AQC_ELEM_TYPE_TC: + snprintf(str, sizeof(str), "tc"); + break; + case ICE_AQC_ELEM_TYPE_SE_GENERIC: + snprintf(str, sizeof(str), "se generic"); + break; + case ICE_AQC_ELEM_TYPE_ENTRY_POINT: + snprintf(str, sizeof(str), "sw entry point se"); + break; + case ICE_AQC_ELEM_TYPE_LEAF: + snprintf(str, sizeof(str), "leaf"); + break; + case ICE_AQC_ELEM_TYPE_SE_PADDED: + snprintf(str, sizeof(str), "se padded"); + break; + default: + snprintf(str, sizeof(str), "unknown"); + break; + } + dev_info(ice_hw_to_dev(hw), "\t\t\telement type %s\n", str); + + dev_info(ice_hw_to_dev(hw), "\t\t\tvalid sections\n"); + /* iterate through valid sections */ + for_each_set_bit(i, &valid_sec, ICE_SCHED_VALID_SEC_BITS) { + switch (BIT(i)) { + case ICE_AQC_ELEM_VALID_GENERIC: + snprintf(str, sizeof(str), "generic section"); + break; + case ICE_AQC_ELEM_VALID_CIR: + snprintf(str, sizeof(str), + "cir bw:profile id %d, weight %d", + le16_to_cpu(d->cir_bw.bw_profile_idx), + le16_to_cpu(d->cir_bw.bw_alloc)); + break; + case ICE_AQC_ELEM_VALID_EIR: + snprintf(str, sizeof(str), + "eir bw:profile id %d, weight %d", + le16_to_cpu(d->eir_bw.bw_profile_idx), + le16_to_cpu(d->eir_bw.bw_alloc)); + break; + case ICE_AQC_ELEM_VALID_SHARED: + snprintf(str, sizeof(str), + "shared bw: rl profile id %d", + le16_to_cpu(d->srl_id)); + break; + default: + snprintf(str, sizeof(str), "unknown"); + break; + } + dev_info(ice_hw_to_dev(hw), "\t\t\t\t%s\n", str); + } + + dev_info(ice_hw_to_dev(hw), "\t\t\tgeneric\n"); + snprintf(str, sizeof(str), "%s", + (d->generic & ICE_AQC_ELEM_GENERIC_MODE_M) ? 
"pss" : "bps"); + dev_info(ice_hw_to_dev(hw), "\t\t\t\tscheduling mode %s\n", str); + + dev_info(ice_hw_to_dev(hw), "\t\t\t\tpriority %d\n", + ((d->generic & ICE_AQC_ELEM_GENERIC_PRIO_M) >> ICE_AQC_ELEM_GENERIC_PRIO_S)); + + dev_info(ice_hw_to_dev(hw), "\t\t\t\tadjustment value %d\n", + (d->generic & ICE_AQC_ELEM_GENERIC_ADJUST_VAL_M) >> ICE_AQC_ELEM_GENERIC_ADJUST_VAL_S); +} + +/** + * ice_dump_port_dflt_topo - print scheduler tree topology for a port + * @pi: pointer to the port_info structure + */ +enum ice_status ice_dump_port_dflt_topo(struct ice_port_info *pi) +{ + struct ice_aqc_get_topo_elem *buf; + struct ice_hw *hw = pi->hw; + u16 j, buf_size, num_elem; + enum ice_status ret; + u8 i, num_branches; + + /* allocate memory for response buffer */ + buf_size = sizeof(*buf) * ICE_TXSCHED_MAX_BRANCHES; + buf = devm_kzalloc(ice_hw_to_dev(hw), buf_size, GFP_KERNEL); + if (!buf) + return ICE_ERR_NO_MEMORY; + ret = ice_aq_get_dflt_topo(hw, pi->lport, buf, buf_size, &num_branches, + NULL); + if (ret) { + ret = ICE_ERR_CFG; + goto err_exit; + } + + if (num_branches < 2 || num_branches > ICE_TXSCHED_MAX_BRANCHES) + dev_info(ice_hw_to_dev(hw), + "CHECK: num_branches unexpected %d\n", num_branches); + + dev_info(ice_hw_to_dev(hw), "scheduler tree topology for port %d\n", + pi->lport); + + /* iterate through all branches */ + for (i = 0; i < num_branches; i++) { + dev_info(ice_hw_to_dev(hw), "\tbranch %d\n", i); + num_elem = le16_to_cpu(buf[i].hdr.num_elems); + + if (num_elem < 2 || num_elem > ICE_AQC_TOPO_MAX_LEVEL_NUM) + dev_info(ice_hw_to_dev(hw), + "CHECK: num_elems unexpected %d\n", num_elem); + /* iterate through all elements in a branch */ + for (j = 0; j < num_elem; j++) + ice_print_sched_elem(hw, j, &buf[i].generic[j]); + } + +err_exit: + devm_kfree(ice_hw_to_dev(hw), buf); + return ret; +} + +/** + * ice_sched_print_tree - prints the node information + * @hw: pointer to the HW struct + * @node: pointer to the node + * + * This function prints the node and all its children information + */ +static void ice_sched_print_tree(struct ice_hw *hw, struct ice_sched_node *node) +{ + struct ice_aqc_txsched_elem_data buf; + struct ice_aqc_txsched_elem *data; + enum ice_status status; + u8 i; + + if (!node) + return; + for (i = 0; i < node->num_children; i++) + ice_sched_print_tree(hw, node->children[i]); + dev_info(ice_hw_to_dev(hw), + "Node Layer: 0x%x Node TEID: 0x%x Parent TEID: 0x%x num_children: %d tc_num :0x%x\n", + node->tx_sched_layer, le32_to_cpu(node->info.node_teid), + le32_to_cpu(node->info.parent_teid), node->num_children, + node->tc_num); + /* print the current RL values and suspend state */ + status = ice_sched_query_elem(hw, le32_to_cpu(node->info.node_teid), + &buf); + if (status) + return; + data = &buf.data; + dev_info(ice_hw_to_dev(hw), "elem type 0x%x valid sec 0x%x\n", + data->elem_type, data->valid_sections); + dev_info(ice_hw_to_dev(hw), "generic 0x%x Flags 0x%x\n", + data->generic, data->flags); + dev_info(ice_hw_to_dev(hw), + "BW Profile: CIR id 0x%x alloc 0x%x EIR id 0x%x alloc 0x%x SRL id alloc 0x%x\n", + le16_to_cpu(data->cir_bw.bw_profile_idx), + le16_to_cpu(data->cir_bw.bw_alloc), + le16_to_cpu(data->eir_bw.bw_profile_idx), + le16_to_cpu(data->eir_bw.bw_alloc), + le16_to_cpu(data->srl_id)); +} + +/** + * ice_dump_port_topo - prints the tree topology of the port + * @pi: port information structure + * + * This function prints the tree topology of the port + */ +void ice_dump_port_topo(struct ice_port_info *pi) +{ + if (!pi || pi->port_state != 
ICE_SCHED_PORT_STATE_READY) + return; + + mutex_lock(&pi->sched_lock); + ice_sched_print_tree(pi->hw, pi->root); + mutex_unlock(&pi->sched_lock); +} + +/** + * ice_dump_common_caps - print struct ice_hw_common_caps fields + * @hw: pointer to the ice_hw instance + * @caps: pointer to common caps instance + * @prefix: string to prefix when printing + */ +static void +ice_dump_common_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps, + char const *prefix) +{ + dev_info(ice_hw_to_dev(hw), "%s: switching_mode = %d\n", prefix, + caps->switching_mode); + dev_info(ice_hw_to_dev(hw), "%s: mgmt_mode = %d\n", prefix, + caps->mgmt_mode); + dev_info(ice_hw_to_dev(hw), "%s: mgmt_protocols_mctp = %d\n", prefix, + caps->mgmt_protocols_mctp); + dev_info(ice_hw_to_dev(hw), "%s: os2bmc = %d\n", prefix, caps->os2bmc); + dev_info(ice_hw_to_dev(hw), "%s: valid_functions (bitmap) = %d\n", + prefix, caps->valid_functions); + dev_info(ice_hw_to_dev(hw), "%s: sr_iov_1_1 = %d\n", prefix, + caps->sr_iov_1_1); + dev_info(ice_hw_to_dev(hw), "%s: vmdq = %d\n", prefix, caps->vmdq); + dev_info(ice_hw_to_dev(hw), "%s: evb_802_1_qbg = %d\n", prefix, + caps->evb_802_1_qbg); + dev_info(ice_hw_to_dev(hw), "%s: evb_802_1_qbh = %d\n", prefix, + caps->evb_802_1_qbh); + dev_info(ice_hw_to_dev(hw), "%s: dcb = %d\n", prefix, caps->dcb); + dev_info(ice_hw_to_dev(hw), "%s: active_tc_bitmap = %d\n", prefix, + caps->active_tc_bitmap); + dev_info(ice_hw_to_dev(hw), "%s: maxtc = %d\n", prefix, caps->maxtc); + dev_info(ice_hw_to_dev(hw), "%s: iscsi = %d\n", prefix, caps->iscsi); + dev_info(ice_hw_to_dev(hw), "%s: rss_table_size = %d\n", prefix, + caps->rss_table_size); + dev_info(ice_hw_to_dev(hw), "%s: rss_table_entry_width = %d\n", + prefix, caps->rss_table_entry_width); + dev_info(ice_hw_to_dev(hw), "%s: num_rxq = %d\n", prefix, + caps->num_rxq); + dev_info(ice_hw_to_dev(hw), "%s: rxq_first_id = %d\n", prefix, + caps->rxq_first_id); + dev_info(ice_hw_to_dev(hw), "%s: num_txq = %d\n", prefix, + caps->num_txq); + dev_info(ice_hw_to_dev(hw), "%s: txq_first_id = %d\n", prefix, + caps->txq_first_id); + dev_info(ice_hw_to_dev(hw), "%s: num_msix_vectors = %d\n", prefix, + caps->num_msix_vectors); + dev_info(ice_hw_to_dev(hw), "%s: msix_vector_first_id = %d\n", prefix, + caps->msix_vector_first_id); + dev_info(ice_hw_to_dev(hw), "%s: ieee_1588 = %d\n", prefix, + caps->ieee_1588); + dev_info(ice_hw_to_dev(hw), "%s: mgmt_cem = %d\n", prefix, + caps->mgmt_cem); + dev_info(ice_hw_to_dev(hw), "%s: iwarp = %d\n", prefix, caps->iwarp); + dev_info(ice_hw_to_dev(hw), "%s: wr_csr_prot = 0x%llX\n", prefix, + (unsigned long long)caps->wr_csr_prot); + dev_info(ice_hw_to_dev(hw), "%s: num_wol_proxy_fltr = %d\n", prefix, + caps->num_wol_proxy_fltr); + dev_info(ice_hw_to_dev(hw), "%s: wol_proxy_vsi_seid = %d\n", prefix, + caps->wol_proxy_vsi_seid); + dev_info(ice_hw_to_dev(hw), "%s: max_mtu = %d\n", prefix, + caps->max_mtu); + ice_print_led_caps(hw, caps, prefix, false); + ice_print_sdp_caps(hw, caps, prefix, false); +} + +/** + * ice_dump_func_caps - Dump function capabilities + * @hw: pointer to the ice_hw instance + * @func_caps: pointer to function capabilities struct + */ +static void +ice_dump_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_caps) +{ + char const *prefix = "func cap"; + + ice_dump_common_caps(hw, &func_caps->common_cap, prefix); + dev_info(ice_hw_to_dev(hw), "%s: num_allocd_vfs = %d\n", prefix, + func_caps->num_allocd_vfs); + dev_info(ice_hw_to_dev(hw), "%s: vf_base_id = %d\n", prefix, + func_caps->vf_base_id); + 
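/*
 * Illustrative sketch, not part of this patch: one place the dump helpers in
 * this block might be called from, e.g. a driver debug path once the
 * capabilities have been discovered. The helper name is made up.
 */
static void ice_example_dump_hw_state(struct ice_hw *hw)
{
	ice_dump_caps(hw);		/* device and function capabilities */
	ice_dump_ptp_dev_caps(hw);
	ice_dump_ptp_func_caps(hw);

	if (hw->port_info) {
		ice_dump_port_info(hw->port_info);
		ice_dump_port_topo(hw->port_info);	/* scheduler tree */
	}
}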
dev_info(ice_hw_to_dev(hw), "%s: guar_num_vsi = %d\n", prefix, + func_caps->guar_num_vsi); + dev_info(ice_hw_to_dev(hw), "%s: fd_fltr_guar = %d\n", prefix, + func_caps->fd_fltr_guar); + dev_info(ice_hw_to_dev(hw), "%s: fd_fltr_best_effort = %d\n", prefix, + func_caps->fd_fltr_best_effort); +} + +/** + * ice_dump_dev_caps - Dump device capabilities + * @hw: pointer to the ice_hw instance + * @dev_caps: pointer to device capabilities struct + */ +static void +ice_dump_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_caps) +{ + char const *prefix = "dev cap"; + + ice_dump_common_caps(hw, &dev_caps->common_cap, prefix); + dev_info(ice_hw_to_dev(hw), "%s: num_vfs_exposed = %d\n", prefix, + dev_caps->num_vfs_exposed); + dev_info(ice_hw_to_dev(hw), "%s: num_vsi_allocd_to_host = %d\n", + prefix, dev_caps->num_vsi_allocd_to_host); + dev_info(ice_hw_to_dev(hw), "%s: num_flow_director_fltr = %d\n", + prefix, dev_caps->num_flow_director_fltr); +} + +/** + * ice_dump_ptp_func_caps - Dump function PTP capabilities + * @hw: pointer to the ice_hw instance + */ +void ice_dump_ptp_func_caps(struct ice_hw *hw) +{ + struct ice_ts_func_info *ptpfunc = &hw->func_caps.ts_func_info; + + dev_info(ice_hw_to_dev(hw), "PTP func cap: ena = %d\n", ptpfunc->ena); + dev_info(ice_hw_to_dev(hw), "PTP func cap: src_tmr_owned = %d\n", + ptpfunc->src_tmr_owned); + dev_info(ice_hw_to_dev(hw), "PTP func cap: tmr_ena = %d\n", + ptpfunc->tmr_ena); + dev_info(ice_hw_to_dev(hw), "PTP func cap: tmr_index_owned = %d\n", + ptpfunc->tmr_index_owned); + dev_info(ice_hw_to_dev(hw), "PTP func cap: clk_freq = %d\n", + ptpfunc->clk_freq); + dev_info(ice_hw_to_dev(hw), "PTP func cap: clk_src = %d\n", + ptpfunc->clk_src); + dev_info(ice_hw_to_dev(hw), "PTP func cap: tmr_index_assoc = %d\n", + ptpfunc->tmr_index_assoc); +} + +/** + * ice_dump_ptp_dev_caps - Dump device PTP capabilities + * @hw: pointer to the ice_hw instance + */ +void ice_dump_ptp_dev_caps(struct ice_hw *hw) +{ + struct ice_ts_dev_info *ptpdev = &hw->dev_caps.ts_dev_info; + + dev_info(ice_hw_to_dev(hw), "PTP dev cap: tmr0_owner = %d\n", + ptpdev->tmr0_owner); + dev_info(ice_hw_to_dev(hw), "PTP dev cap: tmr0_owned = %d\n", + ptpdev->tmr0_owned); + dev_info(ice_hw_to_dev(hw), "PTP dev cap: tmr1_owner = %d\n", + ptpdev->tmr1_owner); + dev_info(ice_hw_to_dev(hw), "PTP dev cap: tmr1_owned = %d\n", + ptpdev->tmr1_owned); + dev_info(ice_hw_to_dev(hw), "PTP dev cap: ena = %d\n", ptpdev->ena); + dev_info(ice_hw_to_dev(hw), "PTP dev cap: tmr0_ena = %d\n", + ptpdev->tmr0_ena); + dev_info(ice_hw_to_dev(hw), "PTP dev cap: tmr1_ena = %d\n", + ptpdev->tmr1_ena); + dev_info(ice_hw_to_dev(hw), "PTP dev cap: ena_ports(bitmap) = %d\n", + ptpdev->ena_ports); + dev_info(ice_hw_to_dev(hw), "PTP dev cap: tmr_own_map = %d\n", + ptpdev->tmr_own_map); +} + +/** + * ice_dump_caps - Dump a list of capabilities + * @hw: pointer to the ice_hw instance + */ +void ice_dump_caps(struct ice_hw *hw) +{ + ice_dump_dev_caps(hw, &hw->dev_caps); + ice_dump_func_caps(hw, &hw->func_caps); +} + +/** + * ice_dump_port_info - Dump data from the port_info array + * @pi: pointer to the port_info structure + */ +void ice_dump_port_info(struct ice_port_info *pi) +{ + dev_info(ice_hw_to_dev(pi->hw), "\tvirt_port = %d\n", pi->lport); + + dev_info(ice_hw_to_dev(pi->hw), "\tswid = %d\n", pi->sw_id); + dev_info(ice_hw_to_dev(pi->hw), "\tdflt_tx_vsi = %d\n", + pi->dflt_tx_vsi_num); + dev_info(ice_hw_to_dev(pi->hw), "\tdflt_rx_vsi = %d\n", + pi->dflt_rx_vsi_num); + dev_info(ice_hw_to_dev(pi->hw), "\t%s_num = %d\n", + 
(pi->is_vf ? "vf" : "pf"), pi->pf_vf_num); + dev_info(ice_hw_to_dev(pi->hw), "\tlast_node_teid = %d\n", + pi->last_node_teid); + dev_info(ice_hw_to_dev(pi->hw), "\tmedia_type = %d\n", + pi->phy.media_type); + + dev_info(ice_hw_to_dev(pi->hw), "\tmac_addr: %pM\n", pi->mac.lan_addr); +} + + + +/** + * ice_get_lan_q_ctx - get the LAN queue context for the given VSI and TC + * @hw: pointer to the HW struct + * @vsi_handle: software VSI handle + * @tc: TC number + * @q_handle: software queue handle + */ +struct ice_q_ctx * +ice_get_lan_q_ctx(struct ice_hw *hw, u16 vsi_handle, u8 tc, u16 q_handle) +{ + struct ice_vsi_ctx *vsi; + struct ice_q_ctx *q_ctx; + + vsi = ice_get_vsi_ctx(hw, vsi_handle); + if (!vsi) + return NULL; + if (q_handle >= vsi->num_lan_q_entries[tc]) + return NULL; + if (!vsi->lan_q_ctx[tc]) + return NULL; + q_ctx = vsi->lan_q_ctx[tc]; + return &q_ctx[q_handle]; +} + +/** + * ice_ena_vsi_txq + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc: TC number + * @q_handle: software queue handle + * @num_qgrps: Number of added queue groups + * @buf: list of queue groups to be added + * @buf_size: size of buffer for indirect command + * @cd: pointer to command details structure or NULL + * + * This function adds one LAN queue + */ +enum ice_status +ice_ena_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 q_handle, + u8 num_qgrps, struct ice_aqc_add_tx_qgrp *buf, u16 buf_size, + struct ice_sq_cd *cd) +{ + struct ice_aqc_txsched_elem_data node = { 0 }; + struct ice_sched_node *parent; + struct ice_q_ctx *q_ctx; + enum ice_status status; + struct ice_hw *hw; + + if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY) + return ICE_ERR_CFG; + + if (num_qgrps > 1 || buf->num_txqs > 1) + return ICE_ERR_MAX_LIMIT; + + hw = pi->hw; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + mutex_lock(&pi->sched_lock); + + q_ctx = ice_get_lan_q_ctx(hw, vsi_handle, tc, q_handle); + if (!q_ctx) { + ice_debug(hw, ICE_DBG_SCHED, "Enaq: invalid queue handle %d\n", + q_handle); + status = ICE_ERR_PARAM; + goto ena_txq_exit; + } + + /* find a parent node */ + parent = ice_sched_get_free_qparent(pi, vsi_handle, tc, + ICE_SCHED_NODE_OWNER_LAN); + if (!parent) { + status = ICE_ERR_PARAM; + goto ena_txq_exit; + } + + buf->parent_teid = parent->info.node_teid; + node.parent_teid = parent->info.node_teid; + /* Mark that the values in the "generic" section as valid. The default + * value in the "generic" section is zero. This means that : + * - Scheduling mode is Bytes Per Second (BPS), indicated by Bit 0. + * - 0 priority among siblings, indicated by Bit 1-3. + * - WFQ, indicated by Bit 4. + * - 0 Adjustment value is used in PSM credit update flow, indicated by + * Bit 5-6. + * - Bit 7 is reserved. + * Without setting the generic section as valid in valid_sections, the + * Admin queue command will fail with error code ICE_AQ_RC_EINVAL. 
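/*
 * Illustrative sketch, not part of this patch: the single-group, single-queue
 * buffer shape that ice_ena_vsi_txq() accepts. The helper name is made up,
 * and filling of the hardware Tx queue context inside buf->txqs[0] is assumed
 * to happen before this call, as the NOTE above ice_aq_add_lan_txq() requires.
 */
static enum ice_status
ice_example_ena_one_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc,
			u16 q_handle, u16 hw_txq_id)
{
	struct ice_aqc_add_tx_qgrp *buf;
	struct ice_hw *hw = pi->hw;
	enum ice_status status;
	u16 buf_size;

	buf_size = struct_size(buf, txqs, 1);
	buf = devm_kzalloc(ice_hw_to_dev(hw), buf_size, GFP_KERNEL);
	if (!buf)
		return ICE_ERR_NO_MEMORY;

	buf->num_txqs = 1;
	buf->txqs[0].txq_id = cpu_to_le16(hw_txq_id);

	/* parent_teid and the rate-limit info are filled in by
	 * ice_ena_vsi_txq() itself
	 */
	status = ice_ena_vsi_txq(pi, vsi_handle, tc, q_handle, 1, buf,
				 buf_size, NULL);

	devm_kfree(ice_hw_to_dev(hw), buf);
	return status;
}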
+ */ + buf->txqs[0].info.valid_sections = + ICE_AQC_ELEM_VALID_GENERIC | ICE_AQC_ELEM_VALID_CIR | + ICE_AQC_ELEM_VALID_EIR; + buf->txqs[0].info.generic = 0; + buf->txqs[0].info.cir_bw.bw_profile_idx = + cpu_to_le16(ICE_SCHED_DFLT_RL_PROF_ID); + buf->txqs[0].info.cir_bw.bw_alloc = + cpu_to_le16(ICE_SCHED_DFLT_BW_WT); + buf->txqs[0].info.eir_bw.bw_profile_idx = + cpu_to_le16(ICE_SCHED_DFLT_RL_PROF_ID); + buf->txqs[0].info.eir_bw.bw_alloc = + cpu_to_le16(ICE_SCHED_DFLT_BW_WT); + + /* add the LAN queue */ + status = ice_aq_add_lan_txq(hw, num_qgrps, buf, buf_size, cd); + if (status) { + ice_debug(hw, ICE_DBG_SCHED, "enable queue %d failed %d\n", + le16_to_cpu(buf->txqs[0].txq_id), + hw->adminq.sq_last_status); + goto ena_txq_exit; } - if (lut_type == ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_GLOBAL) { - flags |= ((glob_lut_idx << ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_S) & - ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_M); + node.node_teid = buf->txqs[0].q_teid; + node.data.elem_type = ICE_AQC_ELEM_TYPE_LEAF; + q_ctx->q_handle = q_handle; + q_ctx->q_teid = le32_to_cpu(node.node_teid); + + /* add a leaf node into scheduler tree queue layer */ + status = ice_sched_add_node(pi, hw->num_tx_sched_layers - 1, &node); + if (!status) + status = ice_sched_replay_q_bw(pi, q_ctx); + +ena_txq_exit: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_dis_vsi_txq + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc: TC number + * @num_queues: number of queues + * @q_handles: pointer to software queue handle array + * @q_ids: pointer to the q_id array + * @q_teids: pointer to queue node teids + * @rst_src: if called due to reset, specifies the reset source + * @vmvf_num: the relative VM or VF number that is undergoing the reset + * @cd: pointer to command details structure or NULL + * + * This function removes queues and their corresponding nodes in SW DB + */ +enum ice_status +ice_dis_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u8 num_queues, + u16 *q_handles, u16 *q_ids, u32 *q_teids, + enum ice_disq_rst_src rst_src, u16 vmvf_num, + struct ice_sq_cd *cd) +{ + enum ice_status status = ICE_ERR_DOES_NOT_EXIST; + struct ice_aqc_dis_txq_item *qg_list; + struct ice_q_ctx *q_ctx; + struct ice_hw *hw; + u16 i, buf_size; + + if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY) + return ICE_ERR_CFG; + + hw = pi->hw; + + if (!num_queues) { + /* if queue is disabled already yet the disable queue command + * has to be sent to complete the VF reset, then call + * ice_aq_dis_lan_txq without any queue information + */ + if (rst_src) + return ice_aq_dis_lan_txq(hw, 0, NULL, 0, rst_src, + vmvf_num, NULL); + return ICE_ERR_CFG; + } + + buf_size = struct_size(qg_list, q_id, 1); + qg_list = devm_kzalloc(ice_hw_to_dev(hw), buf_size, GFP_KERNEL); + if (!qg_list) + return ICE_ERR_NO_MEMORY; + + mutex_lock(&pi->sched_lock); + + for (i = 0; i < num_queues; i++) { + struct ice_sched_node *node; + + node = ice_sched_find_node_by_teid(pi->root, q_teids[i]); + if (!node) + continue; + q_ctx = ice_get_lan_q_ctx(hw, vsi_handle, tc, q_handles[i]); + if (!q_ctx) { + ice_debug(hw, ICE_DBG_SCHED, "invalid queue handle%d\n", + q_handles[i]); + continue; + } + if (q_ctx->q_handle != q_handles[i]) { + ice_debug(hw, ICE_DBG_SCHED, "Err:handles %d %d\n", + q_ctx->q_handle, q_handles[i]); + continue; + } + qg_list->parent_teid = node->info.parent_teid; + qg_list->num_qs = 1; + qg_list->q_id[0] = cpu_to_le16(q_ids[i]); + status = ice_aq_dis_lan_txq(hw, 1, qg_list, buf_size, rst_src, + vmvf_num, cd); + + if (status) + 
break; + ice_free_sched_node(pi, node); + q_ctx->q_handle = ICE_INVAL_Q_HANDLE; + } + mutex_unlock(&pi->sched_lock); + devm_kfree(ice_hw_to_dev(hw), qg_list); + return status; +} + +/** + * ice_cfg_vsi_qs - configure the new/existing VSI queues + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc_bitmap: TC bitmap + * @maxqs: max queues array per TC + * @owner: LAN or RDMA + * + * This function adds/updates the VSI queues per TC. + */ +static enum ice_status +ice_cfg_vsi_qs(struct ice_port_info *pi, u16 vsi_handle, u16 tc_bitmap, + u16 *maxqs, u8 owner) +{ + enum ice_status status = 0; + u8 i; + + if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY) + return ICE_ERR_CFG; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + + mutex_lock(&pi->sched_lock); + + ice_for_each_traffic_class(i) { + /* configuration is possible only if TC node is present */ + if (!ice_sched_get_tc_node(pi, i)) + continue; + + status = ice_sched_cfg_vsi(pi, vsi_handle, i, maxqs[i], owner, + ice_is_tc_ena(tc_bitmap, i)); + if (status) + break; + } + + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_cfg_vsi_lan - configure VSI LAN queues + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc_bitmap: TC bitmap + * @max_lanqs: max LAN queues array per TC + * + * This function adds/updates the VSI LAN queues per TC. + */ +enum ice_status +ice_cfg_vsi_lan(struct ice_port_info *pi, u16 vsi_handle, u16 tc_bitmap, + u16 *max_lanqs) +{ + return ice_cfg_vsi_qs(pi, vsi_handle, tc_bitmap, max_lanqs, + ICE_SCHED_NODE_OWNER_LAN); +} + +/** + * ice_cfg_vsi_rdma - configure the VSI RDMA queues + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc_bitmap: TC bitmap + * @max_rdmaqs: max RDMA queues array per TC + * + * This function adds/updates the VSI RDMA queues per TC. 
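/*
 * Illustrative sketch, not part of this patch: building the per-TC maximum
 * queue array consumed by ice_cfg_vsi_lan(). The helper name is made up;
 * ICE_MAX_TRAFFIC_CLASS and the bit-per-TC layout of tc_bitmap are
 * assumptions borrowed from the rest of the driver.
 */
static enum ice_status
ice_example_cfg_single_tc(struct ice_port_info *pi, u16 vsi_handle,
			  u16 num_txq)
{
	u16 max_lanqs[ICE_MAX_TRAFFIC_CLASS] = { 0 };
	u16 tc_bitmap = BIT(0);		/* only TC 0 enabled */

	max_lanqs[0] = num_txq;		/* all LAN queues land on TC 0 */

	return ice_cfg_vsi_lan(pi, vsi_handle, tc_bitmap, max_lanqs);
}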
+ */ +enum ice_status +ice_cfg_vsi_rdma(struct ice_port_info *pi, u16 vsi_handle, u16 tc_bitmap, + u16 *max_rdmaqs) +{ + return ice_cfg_vsi_qs(pi, vsi_handle, tc_bitmap, max_rdmaqs, + ICE_SCHED_NODE_OWNER_RDMA); +} - if (!set) - goto ice_aq_get_set_rss_lut_send; - } else if (lut_type == ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF) { - if (!set) - goto ice_aq_get_set_rss_lut_send; - } else { - goto ice_aq_get_set_rss_lut_send; +/** + * ice_ena_vsi_rdma_qset + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc: TC number + * @rdma_qset: pointer to RDMA qset + * @num_qsets: number of RDMA qsets + * @qset_teid: pointer to qset node teids + * + * This function adds RDMA qset + */ +enum ice_status +ice_ena_vsi_rdma_qset(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + u16 *rdma_qset, u16 num_qsets, u32 *qset_teid) +{ + struct ice_aqc_txsched_elem_data node = { 0 }; + struct ice_aqc_add_rdma_qset_data *buf; + struct ice_sched_node *parent; + enum ice_status status; + struct ice_hw *hw; + u16 i, buf_size; + + if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY) + return ICE_ERR_CFG; + hw = pi->hw; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + buf_size = struct_size(buf, rdma_qsets, num_qsets); + buf = devm_kzalloc(ice_hw_to_dev(hw), buf_size, GFP_KERNEL); + if (!buf) + return ICE_ERR_NO_MEMORY; + mutex_lock(&pi->sched_lock); + + parent = ice_sched_get_free_qparent(pi, vsi_handle, tc, + ICE_SCHED_NODE_OWNER_RDMA); + if (!parent) { + status = ICE_ERR_PARAM; + goto rdma_error_exit; } + buf->parent_teid = parent->info.node_teid; + node.parent_teid = parent->info.node_teid; - /* LUT size is only valid for Global and PF table types */ - switch (lut_size) { - case ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_128: - break; - case ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512: - flags |= (ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512_FLAG << - ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_S) & - ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_M; - break; - case ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_2K: - if (lut_type == ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF) { - flags |= (ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_2K_FLAG << - ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_S) & - ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_M; + buf->num_qsets = cpu_to_le16(num_qsets); + for (i = 0; i < num_qsets; i++) { + buf->rdma_qsets[i].tx_qset_id = cpu_to_le16(rdma_qset[i]); + buf->rdma_qsets[i].info.valid_sections = + ICE_AQC_ELEM_VALID_GENERIC | ICE_AQC_ELEM_VALID_CIR | + ICE_AQC_ELEM_VALID_EIR; + buf->rdma_qsets[i].info.generic = 0; + buf->rdma_qsets[i].info.cir_bw.bw_profile_idx = + cpu_to_le16(ICE_SCHED_DFLT_RL_PROF_ID); + buf->rdma_qsets[i].info.cir_bw.bw_alloc = + cpu_to_le16(ICE_SCHED_DFLT_BW_WT); + buf->rdma_qsets[i].info.eir_bw.bw_profile_idx = + cpu_to_le16(ICE_SCHED_DFLT_RL_PROF_ID); + buf->rdma_qsets[i].info.eir_bw.bw_alloc = + cpu_to_le16(ICE_SCHED_DFLT_BW_WT); + } + status = ice_aq_add_rdma_qsets(hw, 1, buf, buf_size, NULL); + if (status) { + ice_debug(hw, ICE_DBG_RDMA, "add RDMA qset failed\n"); + goto rdma_error_exit; + } + node.data.elem_type = ICE_AQC_ELEM_TYPE_LEAF; + for (i = 0; i < num_qsets; i++) { + node.node_teid = buf->rdma_qsets[i].qset_teid; + status = ice_sched_add_node(pi, hw->num_tx_sched_layers - 1, + &node); + if (status) break; - } - /* fall-through */ - default: - status = ICE_ERR_PARAM; - goto ice_aq_get_set_rss_lut_exit; + qset_teid[i] = le32_to_cpu(node.node_teid); } +rdma_error_exit: + mutex_unlock(&pi->sched_lock); + devm_kfree(ice_hw_to_dev(hw), buf); + return status; +} -ice_aq_get_set_rss_lut_send: - cmd_resp->flags = 
cpu_to_le16(flags); - status = ice_aq_send_cmd(hw, &desc, lut, lut_size, NULL); +/** + * ice_dis_vsi_rdma_qset - free RDMA resources + * @pi: port_info struct + * @count: number of RDMA qsets to free + * @qset_teid: TEID of qset node + * @q_id: list of queue IDs being disabled + */ +enum ice_status +ice_dis_vsi_rdma_qset(struct ice_port_info *pi, u16 count, u32 *qset_teid, + u16 *q_id) +{ + struct ice_aqc_dis_txq_item *qg_list; + enum ice_status status = 0; + struct ice_hw *hw; + u16 qg_size; + int i; -ice_aq_get_set_rss_lut_exit: + if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY) + return ICE_ERR_CFG; + + hw = pi->hw; + + qg_size = struct_size(qg_list, q_id, 1); + qg_list = devm_kzalloc(ice_hw_to_dev(hw), qg_size, GFP_KERNEL); + if (!qg_list) + return ICE_ERR_NO_MEMORY; + + mutex_lock(&pi->sched_lock); + + for (i = 0; i < count; i++) { + struct ice_sched_node *node; + + node = ice_sched_find_node_by_teid(pi->root, qset_teid[i]); + if (!node) + continue; + + qg_list->parent_teid = node->info.parent_teid; + qg_list->num_qs = 1; + qg_list->q_id[0] = + cpu_to_le16(q_id[i] | + ICE_AQC_Q_DIS_BUF_ELEM_TYPE_RDMA_QSET); + + status = ice_aq_dis_lan_txq(hw, 1, qg_list, qg_size, + ICE_NO_RESET, 0, NULL); + if (status) + break; + + ice_free_sched_node(pi, node); + } + + mutex_unlock(&pi->sched_lock); + devm_kfree(ice_hw_to_dev(hw), qg_list); return status; } + /** - * ice_aq_get_rss_lut - * @hw: pointer to the hardware structure - * @vsi_handle: software VSI handle - * @lut_type: LUT table type - * @lut: pointer to the LUT buffer provided by the caller - * @lut_size: size of the LUT buffer + * ice_is_main_vsi - checks whether the VSI is main VSI + * @hw: pointer to the HW struct + * @vsi_handle: VSI handle * - * get the RSS lookup table, PF or VSI type + * Checks whether the VSI is the main VSI (the first PF VSI created on + * given PF). */ -enum ice_status -ice_aq_get_rss_lut(struct ice_hw *hw, u16 vsi_handle, u8 lut_type, - u8 *lut, u16 lut_size) +static bool ice_is_main_vsi(struct ice_hw *hw, u16 vsi_handle) { - if (!ice_is_vsi_valid(hw, vsi_handle) || !lut) - return ICE_ERR_PARAM; + return vsi_handle == ICE_MAIN_VSI_HANDLE && hw->vsi_ctx[vsi_handle]; +} + + +/** + * ice_replay_pre_init - replay pre initialization + * @hw: pointer to the HW struct + * @sw: pointer to switch info struct for which function initializes filters + * + * Initializes required config data for VSI, FD, ACL, and RSS before replay. + */ +static enum ice_status +ice_replay_pre_init(struct ice_hw *hw, struct ice_switch_info *sw) +{ + enum ice_status status; + u8 i; + + /* Delete old entries from replay filter list head if there is any */ + ice_rm_sw_replay_rule_info(hw, sw); + /* In start of replay, move entries into replay_rules list, it + * will allow adding rules entries back to filt_rules list, + * which is operational list. 
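/*
 * Illustrative sketch, not part of this patch: the enable/disable pairing for
 * a single RDMA qset on TC 0. In practice the qset ID comes from the RDMA
 * peer driver; the helper name and the round-trip shape are for illustration
 * only.
 */
static enum ice_status
ice_example_rdma_qset_roundtrip(struct ice_port_info *pi, u16 vsi_handle,
				u16 qset_id)
{
	enum ice_status status;
	u32 qset_teid = 0;

	status = ice_ena_vsi_rdma_qset(pi, vsi_handle, 0 /* tc */, &qset_id,
				       1, &qset_teid);
	if (status)
		return status;

	/* on teardown, release the HW qset and its scheduler node again */
	return ice_dis_vsi_rdma_qset(pi, 1, &qset_teid, &qset_id);
}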
+ */ + for (i = 0; i < ICE_MAX_NUM_RECIPES; i++) + list_replace_init(&sw->recp_list[i].filt_rules, + &sw->recp_list[i].filt_replay_rules); + ice_sched_replay_agg_vsi_preinit(hw); + + status = ice_sched_replay_root_node_bw(hw->port_info); + if (status) + return status; - return __ice_aq_get_set_rss_lut(hw, ice_get_hw_vsi_num(hw, vsi_handle), - lut_type, lut, lut_size, 0, false); + return ice_sched_replay_tc_node_bw(hw->port_info); } /** - * ice_aq_set_rss_lut - * @hw: pointer to the hardware structure - * @vsi_handle: software VSI handle - * @lut_type: LUT table type - * @lut: pointer to the LUT buffer provided by the caller - * @lut_size: size of the LUT buffer + * ice_replay_vsi - replay VSI configuration + * @hw: pointer to the HW struct + * @vsi_handle: driver VSI handle * - * set the RSS lookup table, PF or VSI type + * Restore all VSI configuration after reset. It is required to call this + * function with main VSI first. */ -enum ice_status -ice_aq_set_rss_lut(struct ice_hw *hw, u16 vsi_handle, u8 lut_type, - u8 *lut, u16 lut_size) +enum ice_status ice_replay_vsi(struct ice_hw *hw, u16 vsi_handle) { - if (!ice_is_vsi_valid(hw, vsi_handle) || !lut) + struct ice_switch_info *sw = hw->switch_info; + struct ice_port_info *pi = hw->port_info; + enum ice_status status; + + if (!ice_is_vsi_valid(hw, vsi_handle)) return ICE_ERR_PARAM; - return __ice_aq_get_set_rss_lut(hw, ice_get_hw_vsi_num(hw, vsi_handle), - lut_type, lut, lut_size, 0, true); + /* Replay pre-initialization if there is any */ + if (ice_is_main_vsi(hw, vsi_handle)) { + status = ice_replay_pre_init(hw, sw); + if (status) + return status; + } + /* Replay per VSI all RSS configurations */ + status = ice_replay_rss_cfg(hw, vsi_handle); + if (status) + return status; + /* Replay per VSI all filters */ + status = ice_replay_vsi_all_fltr(hw, pi, vsi_handle); + if (!status) + status = ice_replay_vsi_agg(hw, vsi_handle); + return status; } /** - * __ice_aq_get_set_rss_key + * ice_replay_post - post replay configuration cleanup * @hw: pointer to the HW struct - * @vsi_id: VSI FW index - * @key: pointer to key info struct - * @set: set true to set the key, false to get the key * - * get (0x0B04) or set (0x0B02) the RSS key per VSI + * Post replay cleanup. */ -static enum -ice_status __ice_aq_get_set_rss_key(struct ice_hw *hw, u16 vsi_id, - struct ice_aqc_get_set_rss_keys *key, - bool set) +void ice_replay_post(struct ice_hw *hw) { - struct ice_aqc_get_set_rss_key *cmd_resp; - u16 key_size = sizeof(*key); - struct ice_aq_desc desc; + /* Delete old entries from replay filter list head */ + ice_rm_all_sw_replay_rule_info(hw); + ice_sched_replay_agg(hw); +} - cmd_resp = &desc.params.get_set_rss_key; +/** + * ice_stat_update40 - read 40 bit stat from the chip and update stat values + * @hw: ptr to the hardware info + * @reg: offset of 64 bit HW register to read from + * @prev_stat_loaded: bool to specify if previous stats are loaded + * @prev_stat: ptr to previous loaded stat value + * @cur_stat: ptr to current stat value + */ +void +ice_stat_update40(struct ice_hw *hw, u32 reg, bool prev_stat_loaded, + u64 *prev_stat, u64 *cur_stat) +{ + u64 new_data = rd64(hw, reg) & (BIT_ULL(40) - 1); - if (set) { - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_rss_key); - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - } else { - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_rss_key); + /* device stats are not reset at PFR, they likely will not be zeroed + * when the driver starts. 
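/*
 * Illustrative sketch, not part of this patch: the replay ordering implied by
 * ice_replay_vsi(): the main VSI first, every other valid VSI afterwards,
 * then ice_replay_post() once all VSIs are done. The helper name and the
 * upper bound on the handle loop are assumptions.
 */
static void ice_example_replay_all(struct ice_hw *hw, u16 num_vsi_handles)
{
	u16 i;

	if (ice_replay_vsi(hw, ICE_MAIN_VSI_HANDLE))
		return;

	for (i = 0; i < num_vsi_handles; i++) {
		if (i == ICE_MAIN_VSI_HANDLE || !ice_is_vsi_valid(hw, i))
			continue;
		ice_replay_vsi(hw, i);
	}

	ice_replay_post(hw);
}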
Thus, save the value from the first read + * without adding to the statistic value so that we report stats which + * count up from zero. + */ + if (!prev_stat_loaded) { + *prev_stat = new_data; + return; + } + + /* Calculate the difference between the new and old values, and then + * add it to the software stat value. + */ + if (new_data >= *prev_stat) + *cur_stat += new_data - *prev_stat; + else + /* to manage the potential roll-over */ + *cur_stat += (new_data + BIT_ULL(40)) - *prev_stat; + + /* Update the previously stored value to prepare for next read */ + *prev_stat = new_data; +} + +/** + * ice_stat_update32 - read 32 bit stat from the chip and update stat values + * @hw: ptr to the hardware info + * @reg: offset of HW register to read from + * @prev_stat_loaded: bool to specify if previous stats are loaded + * @prev_stat: ptr to previous loaded stat value + * @cur_stat: ptr to current stat value + */ +void +ice_stat_update32(struct ice_hw *hw, u32 reg, bool prev_stat_loaded, + u64 *prev_stat, u64 *cur_stat) +{ + u32 new_data; + + new_data = rd32(hw, reg); + + /* device stats are not reset at PFR, they likely will not be zeroed + * when the driver starts. Thus, save the value from the first read + * without adding to the statistic value so that we report stats which + * count up from zero. + */ + if (!prev_stat_loaded) { + *prev_stat = new_data; + return; } - cmd_resp->vsi_id = cpu_to_le16(((vsi_id << - ICE_AQC_GSET_RSS_KEY_VSI_ID_S) & - ICE_AQC_GSET_RSS_KEY_VSI_ID_M) | - ICE_AQC_GSET_RSS_KEY_VSI_VALID); + /* Calculate the difference between the new and old values, and then + * add it to the software stat value. + */ + if (new_data >= *prev_stat) + *cur_stat += new_data - *prev_stat; + else + /* to manage the potential roll-over */ + *cur_stat += (new_data + BIT_ULL(32)) - *prev_stat; - return ice_aq_send_cmd(hw, &desc, key, key_size, NULL); + /* Update the previously stored value to prepare for next read */ + *prev_stat = new_data; } + + /** - * ice_aq_get_rss_key + * ice_sched_query_elem - query element information from HW * @hw: pointer to the HW struct - * @vsi_handle: software VSI handle - * @key: pointer to key info struct + * @node_teid: node TEID to be queried + * @buf: buffer to element information * - * get the RSS key per VSI + * This function queries HW element information */ enum ice_status -ice_aq_get_rss_key(struct ice_hw *hw, u16 vsi_handle, - struct ice_aqc_get_set_rss_keys *key) +ice_sched_query_elem(struct ice_hw *hw, u32 node_teid, + struct ice_aqc_txsched_elem_data *buf) { - if (!ice_is_vsi_valid(hw, vsi_handle) || !key) - return ICE_ERR_PARAM; + u16 buf_size, num_elem_ret = 0; + enum ice_status status; - return __ice_aq_get_set_rss_key(hw, ice_get_hw_vsi_num(hw, vsi_handle), - key, false); + buf_size = sizeof(*buf); + memset(buf, 0, buf_size); + buf->node_teid = cpu_to_le32(node_teid); + status = ice_aq_query_sched_elems(hw, 1, buf, buf_size, &num_elem_ret, + NULL); + if (status || num_elem_ret != 1) + ice_debug(hw, ICE_DBG_SCHED, "query element failed\n"); + return status; } + /** - * ice_aq_set_rss_key + * ice_get_fw_mode - returns FW mode * @hw: pointer to the HW struct - * @vsi_handle: software VSI handle - * @keys: pointer to key info struct - * - * set the RSS key per VSI */ -enum ice_status -ice_aq_set_rss_key(struct ice_hw *hw, u16 vsi_handle, - struct ice_aqc_get_set_rss_keys *keys) +enum ice_fw_modes ice_get_fw_mode(struct ice_hw *hw) { - if (!ice_is_vsi_valid(hw, vsi_handle) || !keys) - return ICE_ERR_PARAM; - - return __ice_aq_get_set_rss_key(hw, 
ice_get_hw_vsi_num(hw, vsi_handle), - keys, true); +#define ICE_FW_MODE_DBG_M BIT(0) +#define ICE_FW_MODE_REC_M BIT(1) +#define ICE_FW_MODE_ROLLBACK_M BIT(2) + u32 fw_mode; + + /* check the current FW mode */ + fw_mode = rd32(hw, GL_MNG_FWSM) & GL_MNG_FWSM_FW_MODES_M; + + if (fw_mode & ICE_FW_MODE_DBG_M) + return ICE_FW_MODE_DBG; + else if (fw_mode & ICE_FW_MODE_REC_M) + return ICE_FW_MODE_REC; + else if (fw_mode & ICE_FW_MODE_ROLLBACK_M) + return ICE_FW_MODE_ROLLBACK; + else + return ICE_FW_MODE_NORMAL; } + /** - * ice_aq_add_lan_txq - * @hw: pointer to the hardware structure - * @num_qgrps: Number of added queue groups - * @qg_list: list of queue groups to be added - * @buf_size: size of buffer for indirect command + * ice_aq_read_i2c + * @hw: pointer to the hw struct + * @topo_addr: topology address for a device to communicate with + * @bus_addr: 7-bit I2C bus address + * @addr: I2C memory address (I2C offset) with up to 16 bits + * @params: I2C parameters: bit [7] - Repeated start, bits [6:5] data offset size, + * bit [4] - I2C address type, bits [3:0] - data size to read (0-16 bytes) + * @data: pointer to data (0 to 16 bytes) to be read from the I2C device * @cd: pointer to command details structure or NULL * - * Add Tx LAN queue (0x0C30) - * - * NOTE: - * Prior to calling add Tx LAN queue: - * Initialize the following as part of the Tx queue context: - * Completion queue ID if the queue uses Completion queue, Quanta profile, - * Cache profile and Packet shaper profile. - * - * After add Tx LAN queue AQ command is completed: - * Interrupts should be associated with specific queues, - * Association of Tx queue to Doorbell queue is not part of Add LAN Tx queue - * flow. + * Read I2C (0x06E2) */ -static enum ice_status -ice_aq_add_lan_txq(struct ice_hw *hw, u8 num_qgrps, - struct ice_aqc_add_tx_qgrp *qg_list, u16 buf_size, - struct ice_sq_cd *cd) +enum ice_status +ice_aq_read_i2c(struct ice_hw *hw, struct ice_aqc_link_topo_addr topo_addr, + u16 bus_addr, __le16 addr, u8 params, u8 *data, + struct ice_sq_cd *cd) { - u16 i, sum_header_size, sum_q_size = 0; - struct ice_aqc_add_tx_qgrp *list; - struct ice_aqc_add_txqs *cmd; - struct ice_aq_desc desc; - - cmd = &desc.params.add_txqs; + struct ice_aq_desc desc = { 0 }; + struct ice_aqc_i2c *cmd; + enum ice_status status; + u8 data_size; - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_txqs); + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_read_i2c); + cmd = &desc.params.read_write_i2c; - if (!qg_list) + if (!data) return ICE_ERR_PARAM; - if (num_qgrps > ICE_LAN_TXQ_MAX_QGRPS) - return ICE_ERR_PARAM; + data_size = (params & ICE_AQC_I2C_DATA_SIZE_M) >> ICE_AQC_I2C_DATA_SIZE_S; - sum_header_size = num_qgrps * - (sizeof(*qg_list) - sizeof(*qg_list->txqs)); + cmd->i2c_bus_addr = cpu_to_le16(bus_addr); + cmd->topo_addr = topo_addr; + cmd->i2c_params = params; + cmd->i2c_addr = addr; - list = qg_list; - for (i = 0; i < num_qgrps; i++) { - struct ice_aqc_add_txqs_perq *q = list->txqs; + status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + if (!status) { + struct ice_aqc_read_i2c_resp *resp; + u8 i; - sum_q_size += list->num_txqs * sizeof(*q); - list = (struct ice_aqc_add_tx_qgrp *)(q + list->num_txqs); + resp = &desc.params.read_i2c_resp; + for (i = 0; i < data_size; i++) { + *data = resp->i2c_data[i]; + data++; + } } - if (buf_size != (sum_header_size + sum_q_size)) - return ICE_ERR_PARAM; - - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - - cmd->num_qgrps = num_qgrps; - - return ice_aq_send_cmd(hw, &desc, qg_list, buf_size, cd); + return 
status; } /** - * ice_aq_dis_lan_txq - * @hw: pointer to the hardware structure - * @num_qgrps: number of groups in the list - * @qg_list: the list of groups to disable - * @buf_size: the total size of the qg_list buffer in bytes - * @rst_src: if called due to reset, specifies the reset source - * @vmvf_num: the relative VM or VF number that is undergoing the reset + * ice_aq_write_i2c + * @hw: pointer to the hw struct + * @topo_addr: topology address for a device to communicate with + * @bus_addr: 7-bit I2C bus address + * @addr: I2C memory address (I2C offset) with up to 16 bits + * @params: I2C parameters: bit [4] - I2C address type, bits [3:0] - data size to write (0-7 bytes) + * @data: pointer to data (0 to 4 bytes) to be written to the I2C device * @cd: pointer to command details structure or NULL * - * Disable LAN Tx queue (0x0C31) + * Write I2C (0x06E3) */ -static enum ice_status -ice_aq_dis_lan_txq(struct ice_hw *hw, u8 num_qgrps, - struct ice_aqc_dis_txq_item *qg_list, u16 buf_size, - enum ice_disq_rst_src rst_src, u16 vmvf_num, - struct ice_sq_cd *cd) +enum ice_status +ice_aq_write_i2c(struct ice_hw *hw, struct ice_aqc_link_topo_addr topo_addr, + u16 bus_addr, __le16 addr, u8 params, u8 *data, + struct ice_sq_cd *cd) { - struct ice_aqc_dis_txqs *cmd; - struct ice_aq_desc desc; - enum ice_status status; - u16 i, sz = 0; + struct ice_aq_desc desc = { 0 }; + struct ice_aqc_i2c *cmd; + u8 i, data_size; - cmd = &desc.params.dis_txqs; - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_dis_txqs); + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_write_i2c); + cmd = &desc.params.read_write_i2c; - /* qg_list can be NULL only in VM/VF reset flow */ - if (!qg_list && !rst_src) - return ICE_ERR_PARAM; + data_size = (params & ICE_AQC_I2C_DATA_SIZE_M) >> ICE_AQC_I2C_DATA_SIZE_S; - if (num_qgrps > ICE_LAN_TXQ_MAX_QGRPS) + /* data_size limited to 4 */ + if (data_size > 4) return ICE_ERR_PARAM; - cmd->num_entries = num_qgrps; - - cmd->vmvf_and_timeout = cpu_to_le16((5 << ICE_AQC_Q_DIS_TIMEOUT_S) & - ICE_AQC_Q_DIS_TIMEOUT_M); - - switch (rst_src) { - case ICE_VM_RESET: - cmd->cmd_type = ICE_AQC_Q_DIS_CMD_VM_RESET; - cmd->vmvf_and_timeout |= - cpu_to_le16(vmvf_num & ICE_AQC_Q_DIS_VMVF_NUM_M); - break; - case ICE_VF_RESET: - cmd->cmd_type = ICE_AQC_Q_DIS_CMD_VF_RESET; - /* In this case, FW expects vmvf_num to be absolute VF ID */ - cmd->vmvf_and_timeout |= - cpu_to_le16((vmvf_num + hw->func_caps.vf_base_id) & - ICE_AQC_Q_DIS_VMVF_NUM_M); - break; - case ICE_NO_RESET: - default: - break; - } - - /* flush pipe on time out */ - cmd->cmd_type |= ICE_AQC_Q_DIS_CMD_FLUSH_PIPE; - /* If no queue group info, we are in a reset flow. 
Issue the AQ */ - if (!qg_list) - goto do_aq; - - /* set RD bit to indicate that command buffer is provided by the driver - * and it needs to be read by the firmware - */ - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - - for (i = 0; i < num_qgrps; ++i) { - /* Calculate the size taken up by the queue IDs in this group */ - sz += qg_list[i].num_qs * sizeof(qg_list[i].q_id); - - /* Add the size of the group header */ - sz += sizeof(qg_list[i]) - sizeof(qg_list[i].q_id); + cmd->i2c_bus_addr = cpu_to_le16(bus_addr); + cmd->topo_addr = topo_addr; + cmd->i2c_params = params; + cmd->i2c_addr = addr; - /* If the num of queues is even, add 2 bytes of padding */ - if ((qg_list[i].num_qs % 2) == 0) - sz += 2; + for (i = 0; i < data_size; i++) { + cmd->i2c_data[i] = *data; + data++; } - if (buf_size != sz) - return ICE_ERR_PARAM; - -do_aq: - status = ice_aq_send_cmd(hw, &desc, qg_list, buf_size, cd); - if (status) { - if (!qg_list) - ice_debug(hw, ICE_DBG_SCHED, "VM%d disable failed %d\n", - vmvf_num, hw->adminq.sq_last_status); - else - ice_debug(hw, ICE_DBG_SCHED, "disable queue %d failed %d\n", - le16_to_cpu(qg_list[0].q_id[0]), - hw->adminq.sq_last_status); - } - return status; + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); } -/* End of FW Admin Queue command wrappers */ - /** - * ice_write_byte - write a byte to a packed context structure - * @src_ctx: the context structure to read from - * @dest_ctx: the context to be written to - * @ce_info: a description of the struct to be filled + * ice_aq_set_driver_param - Set driver parameter to share via firmware + * @hw: pointer to the HW struct + * @idx: parameter index to set + * @value: the value to set the parameter to + * @cd: pointer to command details structure or NULL + * + * Set the value of one of the software defined parameters. All PFs connected + * to this device can read the value using ice_aq_get_driver_param. + * + * Note that firmware provides no synchronization or locking, and will not + * save the parameter value during a device reset. It is expected that + * a single PF will write the parameter value, while all other PFs will only + * read it. 
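/*
 * Illustrative sketch, not part of this patch: encoding the params byte for a
 * small I2C read with the same ICE_AQC_I2C_DATA_SIZE_* accessors the wrapper
 * uses to decode it. The helper name, the 0x50 module EEPROM bus address and
 * the zero offset are example values; buf must hold at least two bytes.
 */
static enum ice_status
ice_example_i2c_read_word(struct ice_hw *hw,
			  struct ice_aqc_link_topo_addr topo_addr, u8 *buf)
{
	/* request a 2-byte read; only the data-size field of params is set */
	u8 params = (2 << ICE_AQC_I2C_DATA_SIZE_S) & ICE_AQC_I2C_DATA_SIZE_M;

	return ice_aq_read_i2c(hw, topo_addr, 0x50, cpu_to_le16(0), params,
			       buf, NULL);
}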
*/ -static void -ice_write_byte(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +enum ice_status +ice_aq_set_driver_param(struct ice_hw *hw, enum ice_aqc_driver_params idx, + u32 value, struct ice_sq_cd *cd) { - u8 src_byte, dest_byte, mask; - u8 *from, *dest; - u16 shift_width; - - /* copy from the next struct field */ - from = src_ctx + ce_info->offset; - - /* prepare the bits and mask */ - shift_width = ce_info->lsb % 8; - mask = (u8)(BIT(ce_info->width) - 1); - - src_byte = *from; - src_byte &= mask; + struct ice_aqc_driver_shared_params *cmd; + struct ice_aq_desc desc; - /* shift to correct alignment */ - mask <<= shift_width; - src_byte <<= shift_width; + if (idx >= ICE_AQC_DRIVER_PARAM_MAX) + return ICE_ERR_OUT_OF_RANGE; - /* get the current bits from the target bit string */ - dest = dest_ctx + (ce_info->lsb / 8); + cmd = &desc.params.drv_shared_params; - memcpy(&dest_byte, dest, sizeof(dest_byte)); + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_driver_shared_params); - dest_byte &= ~mask; /* get the bits not changing */ - dest_byte |= src_byte; /* add in the new bits */ + cmd->set_or_get_op = ICE_AQC_DRIVER_PARAM_SET; + cmd->param_indx = idx; + cmd->param_val = cpu_to_le32(value); - /* put it all back */ - memcpy(dest, &dest_byte, sizeof(dest_byte)); + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); } /** - * ice_write_word - write a word to a packed context structure - * @src_ctx: the context structure to read from - * @dest_ctx: the context to be written to - * @ce_info: a description of the struct to be filled + * ice_aq_get_driver_param - Get driver parameter shared via firmware + * @hw: pointer to the HW struct + * @idx: parameter index to retrieve + * @value: storage to return the shared parameter + * @cd: pointer to command details structure or NULL + * + * Get the value of one of the software defined parameters. + * + * Note that firmware provides no synchronization or locking. It is expected + * that only a single PF will write a given parameter.
*/ -static void -ice_write_word(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +enum ice_status +ice_aq_get_driver_param(struct ice_hw *hw, enum ice_aqc_driver_params idx, + u32 *value, struct ice_sq_cd *cd) { - u16 src_word, mask; - __le16 dest_word; - u8 *from, *dest; - u16 shift_width; - - /* copy from the next struct field */ - from = src_ctx + ce_info->offset; + struct ice_aqc_driver_shared_params *cmd; + struct ice_aq_desc desc; + enum ice_status status; - /* prepare the bits and mask */ - shift_width = ce_info->lsb % 8; - mask = BIT(ce_info->width) - 1; + if (idx >= ICE_AQC_DRIVER_PARAM_MAX) + return ICE_ERR_OUT_OF_RANGE; - /* don't swizzle the bits until after the mask because the mask bits - * will be in a different bit position on big endian machines - */ - src_word = *(u16 *)from; - src_word &= mask; + cmd = &desc.params.drv_shared_params; - /* shift to correct alignment */ - mask <<= shift_width; - src_word <<= shift_width; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_driver_shared_params); - /* get the current bits from the target bit string */ - dest = dest_ctx + (ce_info->lsb / 8); + cmd->set_or_get_op = ICE_AQC_DRIVER_PARAM_GET; + cmd->param_indx = idx; - memcpy(&dest_word, dest, sizeof(dest_word)); + status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + if (status) + return status; - dest_word &= ~(cpu_to_le16(mask)); /* get the bits not changing */ - dest_word |= cpu_to_le16(src_word); /* add in the new bits */ + *value = le32_to_cpu(cmd->param_val); - /* put it all back */ - memcpy(dest, &dest_word, sizeof(dest_word)); + return 0; } /** - * ice_write_dword - write a dword to a packed context structure - * @src_ctx: the context structure to read from - * @dest_ctx: the context to be written to - * @ce_info: a description of the struct to be filled + * ice_aq_set_gpio + * @hw: pointer to the hw struct + * @gpio_ctrl_handle: GPIO controller node handle + * @pin_idx: IO Number of the GPIO that needs to be set + * @value: SW provide IO value to set in the LSB + * @cd: pointer to command details structure or NULL + * + * Sends 0x06EC AQ command to set the GPIO pin state that's part of the topology */ -static void -ice_write_dword(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +enum ice_status +ice_aq_set_gpio(struct ice_hw *hw, u16 gpio_ctrl_handle, u8 pin_idx, bool value, + struct ice_sq_cd *cd) { - u32 src_dword, mask; - __le32 dest_dword; - u8 *from, *dest; - u16 shift_width; - - /* copy from the next struct field */ - from = src_ctx + ce_info->offset; - - /* prepare the bits and mask */ - shift_width = ce_info->lsb % 8; - - /* if the field width is exactly 32 on an x86 machine, then the shift - * operation will not work because the SHL instructions count is masked - * to 5 bits so the shift will do nothing - */ - if (ce_info->width < 32) - mask = BIT(ce_info->width) - 1; - else - mask = (u32)~0; + struct ice_aqc_gpio *cmd; + struct ice_aq_desc desc; - /* don't swizzle the bits until after the mask because the mask bits - * will be in a different bit position on big endian machines - */ - src_dword = *(u32 *)from; - src_dword &= mask; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_gpio); + cmd = &desc.params.read_write_gpio; + cmd->gpio_ctrl_handle = gpio_ctrl_handle; + cmd->gpio_num = pin_idx; + cmd->gpio_val = value ? 
1 : 0; - /* shift to correct alignment */ - mask <<= shift_width; - src_dword <<= shift_width; + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} - /* get the current bits from the target bit string */ - dest = dest_ctx + (ce_info->lsb / 8); +/** + * ice_aq_get_gpio + * @hw: pointer to the hw struct + * @gpio_ctrl_handle: GPIO controller node handle + * @pin_idx: IO Number of the GPIO that needs to be set + * @value: IO value read + * @cd: pointer to command details structure or NULL + * + * Sends 0x06ED AQ command to get the value of a GPIO signal which is part of + * the topology + */ +enum ice_status +ice_aq_get_gpio(struct ice_hw *hw, u16 gpio_ctrl_handle, u8 pin_idx, + bool *value, struct ice_sq_cd *cd) +{ + struct ice_aqc_gpio *cmd; + struct ice_aq_desc desc; + enum ice_status status; - memcpy(&dest_dword, dest, sizeof(dest_dword)); + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_gpio); + cmd = &desc.params.read_write_gpio; + cmd->gpio_ctrl_handle = gpio_ctrl_handle; + cmd->gpio_num = pin_idx; - dest_dword &= ~(cpu_to_le32(mask)); /* get the bits not changing */ - dest_dword |= cpu_to_le32(src_dword); /* add in the new bits */ + status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + if (status) + return status; - /* put it all back */ - memcpy(dest, &dest_dword, sizeof(dest_dword)); + *value = !!cmd->gpio_val; + return 0; } /** - * ice_write_qword - write a qword to a packed context structure - * @src_ctx: the context structure to read from - * @dest_ctx: the context to be written to - * @ce_info: a description of the struct to be filled + * ice_fw_supports_link_override + * @hw: pointer to the hardware structure + * + * Checks if the firmware supports link override */ -static void -ice_write_qword(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +bool ice_fw_supports_link_override(struct ice_hw *hw) { - u64 src_qword, mask; - __le64 dest_qword; - u8 *from, *dest; - u16 shift_width; + if (hw->api_maj_ver == ICE_FW_API_LINK_OVERRIDE_MAJ) { + if (hw->api_min_ver > ICE_FW_API_LINK_OVERRIDE_MIN) + return true; + if (hw->api_min_ver == ICE_FW_API_LINK_OVERRIDE_MIN && + hw->api_patch >= ICE_FW_API_LINK_OVERRIDE_PATCH) + return true; + } else if (hw->api_maj_ver > ICE_FW_API_LINK_OVERRIDE_MAJ) { + return true; + } - /* copy from the next struct field */ - from = src_ctx + ce_info->offset; + return false; +} - /* prepare the bits and mask */ - shift_width = ce_info->lsb % 8; +/** + * ice_get_link_default_override + * @ldo: pointer to the link default override struct + * @pi: pointer to the port info struct + * + * Gets the link default override for a port + */ +enum ice_status +ice_get_link_default_override(struct ice_link_default_override_tlv *ldo, + struct ice_port_info *pi) +{ + u16 i, tlv, tlv_len, tlv_start, buf, offset; + struct ice_hw *hw = pi->hw; + enum ice_status status; - /* if the field width is exactly 64 on an x86 machine, then the shift - * operation will not work because the SHL instructions count is masked - * to 6 bits so the shift will do nothing - */ - if (ce_info->width < 64) - mask = BIT_ULL(ce_info->width) - 1; - else - mask = (u64)~0; + status = ice_get_pfa_module_tlv(hw, &tlv, &tlv_len, + ICE_SR_LINK_DEFAULT_OVERRIDE_PTR); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read link override TLV.\n"); + return status; + } - /* don't swizzle the bits until after the mask because the mask bits - * will be in a different bit position on big endian machines - */ - src_qword = *(u64 *)from; - src_qword &= mask; + /* Each port has its own config; 
calculate for our port */ + tlv_start = tlv + pi->lport * ICE_SR_PFA_LINK_OVERRIDE_WORDS + + ICE_SR_PFA_LINK_OVERRIDE_OFFSET; - /* shift to correct alignment */ - mask <<= shift_width; - src_qword <<= shift_width; + /* link options first */ + status = ice_read_sr_word(hw, tlv_start, &buf); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read override link options.\n"); + return status; + } + ldo->options = buf & ICE_LINK_OVERRIDE_OPT_M; + ldo->phy_config = (buf & ICE_LINK_OVERRIDE_PHY_CFG_M) >> + ICE_LINK_OVERRIDE_PHY_CFG_S; - /* get the current bits from the target bit string */ - dest = dest_ctx + (ce_info->lsb / 8); + /* link PHY config */ + offset = tlv_start + ICE_SR_PFA_LINK_OVERRIDE_FEC_OFFSET; + status = ice_read_sr_word(hw, offset, &buf); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read override phy config.\n"); + return status; + } + ldo->fec_options = buf & ICE_LINK_OVERRIDE_FEC_OPT_M; - memcpy(&dest_qword, dest, sizeof(dest_qword)); + /* PHY types low */ + offset = tlv_start + ICE_SR_PFA_LINK_OVERRIDE_PHY_OFFSET; + for (i = 0; i < ICE_SR_PFA_LINK_OVERRIDE_PHY_WORDS; i++) { + status = ice_read_sr_word(hw, (offset + i), &buf); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read override link options.\n"); + return status; + } + /* shift 16 bits at a time to fill 64 bits */ + ldo->phy_type_low |= ((u64)buf << (i * 16)); + } - dest_qword &= ~(cpu_to_le64(mask)); /* get the bits not changing */ - dest_qword |= cpu_to_le64(src_qword); /* add in the new bits */ + /* PHY types high */ + offset = tlv_start + ICE_SR_PFA_LINK_OVERRIDE_PHY_OFFSET + + ICE_SR_PFA_LINK_OVERRIDE_PHY_WORDS; + for (i = 0; i < ICE_SR_PFA_LINK_OVERRIDE_PHY_WORDS; i++) { + status = ice_read_sr_word(hw, (offset + i), &buf); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read override link options.\n"); + return status; + } + /* shift 16 bits at a time to fill 64 bits */ + ldo->phy_type_high |= ((u64)buf << (i * 16)); + } - /* put it all back */ - memcpy(dest, &dest_qword, sizeof(dest_qword)); + return status; } /** - * ice_set_ctx - set context bits in packed structure - * @src_ctx: pointer to a generic non-packed context structure - * @dest_ctx: pointer to memory for the packed structure - * @ce_info: a description of the structure to be transformed + * ice_is_phy_caps_an_enabled - check if PHY capabilities autoneg is enabled + * @caps: get PHY capability data */ -enum ice_status -ice_set_ctx(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +bool ice_is_phy_caps_an_enabled(struct ice_aqc_get_phy_caps_data *caps) { - int f; - - for (f = 0; ce_info[f].width; f++) { - /* We have to deal with each element of the FW response - * using the correct size so that we are correct regardless - * of the endianness of the machine. 
- */ - switch (ce_info[f].size_of) { - case sizeof(u8): - ice_write_byte(src_ctx, dest_ctx, &ce_info[f]); - break; - case sizeof(u16): - ice_write_word(src_ctx, dest_ctx, &ce_info[f]); - break; - case sizeof(u32): - ice_write_dword(src_ctx, dest_ctx, &ce_info[f]); - break; - case sizeof(u64): - ice_write_qword(src_ctx, dest_ctx, &ce_info[f]); - break; - default: - return ICE_ERR_INVAL_SIZE; - } - } + if (caps->caps & ICE_AQC_PHY_AN_MODE || + caps->low_power_ctrl_an & (ICE_AQC_PHY_AN_EN_CLAUSE28 | + ICE_AQC_PHY_AN_EN_CLAUSE73 | + ICE_AQC_PHY_AN_EN_CLAUSE37)) + return true; - return 0; + return false; } /** - * ice_get_lan_q_ctx - get the LAN queue context for the given VSI and TC - * @hw: pointer to the HW struct - * @vsi_handle: software VSI handle - * @tc: TC number - * @q_handle: software queue handle + * ice_is_fw_health_report_supported + * @hw: pointer to the hardware structure + * + * Return true if firmware supports health status reports, + * false otherwise */ -static struct ice_q_ctx * -ice_get_lan_q_ctx(struct ice_hw *hw, u16 vsi_handle, u8 tc, u16 q_handle) +bool ice_is_fw_health_report_supported(struct ice_hw *hw) { - struct ice_vsi_ctx *vsi; - struct ice_q_ctx *q_ctx; + if (hw->api_maj_ver > ICE_FW_API_HEALTH_REPORT_MAJ) + return true; + + if (hw->api_maj_ver == ICE_FW_API_HEALTH_REPORT_MAJ) { + if (hw->api_min_ver > ICE_FW_API_HEALTH_REPORT_MIN) + return true; + if (hw->api_min_ver == ICE_FW_API_HEALTH_REPORT_MIN && + hw->api_patch >= ICE_FW_API_HEALTH_REPORT_PATCH) + return true; + } - vsi = ice_get_vsi_ctx(hw, vsi_handle); - if (!vsi) - return NULL; - if (q_handle >= vsi->num_lan_q_entries[tc]) - return NULL; - if (!vsi->lan_q_ctx[tc]) - return NULL; - q_ctx = vsi->lan_q_ctx[tc]; - return &q_ctx[q_handle]; + return false; } /** - * ice_ena_vsi_txq - * @pi: port information structure - * @vsi_handle: software VSI handle - * @tc: TC number - * @q_handle: software queue handle - * @num_qgrps: Number of added queue groups - * @buf: list of queue groups to be added - * @buf_size: size of buffer for indirect command + * ice_aq_set_health_status_config - Configure FW health events + * @hw: pointer to the HW struct + * @event_source: type of diagnostic events to enable * @cd: pointer to command details structure or NULL * - * This function adds one LAN queue + * Configure the health status event types that the firmware will send to this + * PF. 
The supported event types are: PF-specific, all PFs, and global */ enum ice_status -ice_ena_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 q_handle, - u8 num_qgrps, struct ice_aqc_add_tx_qgrp *buf, u16 buf_size, - struct ice_sq_cd *cd) +ice_aq_set_health_status_config(struct ice_hw *hw, u8 event_source, + struct ice_sq_cd *cd) { - struct ice_aqc_txsched_elem_data node = { 0 }; - struct ice_sched_node *parent; - struct ice_q_ctx *q_ctx; - enum ice_status status; - struct ice_hw *hw; + struct ice_aqc_set_health_status_config *cmd; + struct ice_aq_desc desc; - if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY) - return ICE_ERR_CFG; + cmd = &desc.params.set_health_status_config; - if (num_qgrps > 1 || buf->num_txqs > 1) - return ICE_ERR_MAX_LIMIT; + ice_fill_dflt_direct_cmd_desc(&desc, + ice_aqc_opc_set_health_status_config); - hw = pi->hw; + cmd->event_source = event_source; - if (!ice_is_vsi_valid(hw, vsi_handle)) - return ICE_ERR_PARAM; + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} - mutex_lock(&pi->sched_lock); - q_ctx = ice_get_lan_q_ctx(hw, vsi_handle, tc, q_handle); - if (!q_ctx) { - ice_debug(hw, ICE_DBG_SCHED, "Enaq: invalid queue handle %d\n", - q_handle); - status = ICE_ERR_PARAM; - goto ena_txq_exit; - } +/** + * ice_aq_get_port_options + * @hw: pointer to the hw struct + * @options: buffer for the resultant port options + * @option_count: input - size of the buffer in port options structures, + * output - number of returned port options + * @lport: logical port to call the command with (optional) + * @lport_valid: when false, FW uses port owned by the PF instead of lport, + * when PF owns more than 1 port it must be true + * @active_option_idx: index of active port option in returned buffer + * @active_option_valid: active option in returned buffer is valid + * + * Calls Get Port Options AQC (0x06ea) and verifies result. + */ +enum ice_status +ice_aq_get_port_options(struct ice_hw *hw, + struct ice_aqc_get_port_options_elem *options, + u8 *option_count, u8 lport, bool lport_valid, + u8 *active_option_idx, bool *active_option_valid) +{ + struct ice_aqc_get_port_options *cmd; + struct ice_aq_desc desc; + enum ice_status status; + u8 pmd_count; + u8 max_speed; + u8 i; - /* find a parent node */ - parent = ice_sched_get_free_qparent(pi, vsi_handle, tc, - ICE_SCHED_NODE_OWNER_LAN); - if (!parent) { - status = ICE_ERR_PARAM; - goto ena_txq_exit; - } + /* options buffer shall be able to hold max returned options */ + if (*option_count < ICE_AQC_PORT_OPT_COUNT_M) + return ICE_ERR_PARAM; - buf->parent_teid = parent->info.node_teid; - node.parent_teid = parent->info.node_teid; - /* Mark that the values in the "generic" section as valid. The default - * value in the "generic" section is zero. This means that : - * - Scheduling mode is Bytes Per Second (BPS), indicated by Bit 0. - * - 0 priority among siblings, indicated by Bit 1-3. - * - WFQ, indicated by Bit 4. - * - 0 Adjustment value is used in PSM credit update flow, indicated by - * Bit 5-6. - * - Bit 7 is reserved. - * Without setting the generic section as valid in valid_sections, the - * Admin queue command will fail with error code ICE_AQ_RC_EINVAL. 
- */ - buf->txqs[0].info.valid_sections = ICE_AQC_ELEM_VALID_GENERIC; + cmd = &desc.params.get_port_options; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_port_options); - /* add the LAN queue */ - status = ice_aq_add_lan_txq(hw, num_qgrps, buf, buf_size, cd); - if (status) { - ice_debug(hw, ICE_DBG_SCHED, "enable queue %d failed %d\n", - le16_to_cpu(buf->txqs[0].txq_id), - hw->adminq.sq_last_status); - goto ena_txq_exit; - } + if (lport_valid) + cmd->lport_num = lport; + cmd->lport_num_valid = lport_valid; - node.node_teid = buf->txqs[0].q_teid; - node.data.elem_type = ICE_AQC_ELEM_TYPE_LEAF; - q_ctx->q_handle = q_handle; + status = ice_aq_send_cmd(hw, &desc, options, + *option_count * sizeof(*options), NULL); + if (status) + return status; - /* add a leaf node into schduler tree queue layer */ - status = ice_sched_add_node(pi, hw->num_tx_sched_layers - 1, &node); + /* verify direct FW response & set output parameters */ + *option_count = cmd->port_options_count & ICE_AQC_PORT_OPT_COUNT_M; + ice_debug(hw, ICE_DBG_PHY, "options: %x\n", *option_count); + *active_option_valid = cmd->port_options & ICE_AQC_PORT_OPT_VALID; + if (*active_option_valid) { + *active_option_idx = cmd->port_options & + ICE_AQC_PORT_OPT_ACTIVE_M; + if (*active_option_idx > (*option_count - 1)) + return ICE_ERR_OUT_OF_RANGE; + ice_debug(hw, ICE_DBG_PHY, "active idx: %x\n", + *active_option_idx); + } -ena_txq_exit: - mutex_unlock(&pi->sched_lock); - return status; + /* verify indirect FW response & mask output options fields */ + for (i = 0; i < *option_count; i++) { + options[i].pmd &= ICE_AQC_PORT_OPT_PMD_COUNT_M; + options[i].max_lane_speed &= ICE_AQC_PORT_OPT_MAX_LANE_M; + pmd_count = options[i].pmd; + max_speed = options[i].max_lane_speed; + ice_debug(hw, ICE_DBG_PHY, "pmds: %x max speed: %x\n", + pmd_count, max_speed); + + /* check only entries containing valid max pmd speed values, + * other reserved values may be returned, when logical port + * used is unrelated to specific option + */ + if (max_speed <= ICE_AQC_PORT_OPT_MAX_LANE_100G) { + if (pmd_count > ICE_MAX_PORT_PER_PCI_DEV) + return ICE_ERR_OUT_OF_RANGE; + if (pmd_count > 2 && + max_speed > ICE_AQC_PORT_OPT_MAX_LANE_25G) + return ICE_ERR_CFG; + if (pmd_count > 7 && + max_speed > ICE_AQC_PORT_OPT_MAX_LANE_10G) + return ICE_ERR_CFG; + } + } + + return 0; } /** - * ice_dis_vsi_txq - * @pi: port information structure - * @vsi_handle: software VSI handle - * @tc: TC number - * @num_queues: number of queues - * @q_handles: pointer to software queue handle array - * @q_ids: pointer to the q_id array - * @q_teids: pointer to queue node teids - * @rst_src: if called due to reset, specifies the reset source - * @vmvf_num: the relative VM or VF number that is undergoing the reset + * ice_aq_set_lldp_mib - Set the LLDP MIB + * @hw: pointer to the HW struct + * @mib_type: Local, Remote or both Local and Remote MIBs + * @buf: pointer to the caller-supplied buffer to store the MIB block + * @buf_size: size of the buffer (in bytes) * @cd: pointer to command details structure or NULL * - * This function removes queues and their corresponding nodes in SW DB + * Set the LLDP MIB. 
(0x0A08) */ enum ice_status -ice_dis_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u8 num_queues, - u16 *q_handles, u16 *q_ids, u32 *q_teids, - enum ice_disq_rst_src rst_src, u16 vmvf_num, - struct ice_sq_cd *cd) +ice_aq_set_lldp_mib(struct ice_hw *hw, u8 mib_type, void *buf, u16 buf_size, + struct ice_sq_cd *cd) { - enum ice_status status = ICE_ERR_DOES_NOT_EXIST; - struct ice_aqc_dis_txq_item qg_list; - struct ice_q_ctx *q_ctx; - u16 i; + struct ice_aqc_lldp_set_local_mib *cmd; + struct ice_aq_desc desc; - if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY) - return ICE_ERR_CFG; + cmd = &desc.params.lldp_set_mib; - if (!num_queues) { - /* if queue is disabled already yet the disable queue command - * has to be sent to complete the VF reset, then call - * ice_aq_dis_lan_txq without any queue information - */ - if (rst_src) - return ice_aq_dis_lan_txq(pi->hw, 0, NULL, 0, rst_src, - vmvf_num, NULL); - return ICE_ERR_CFG; - } + if (buf_size == 0 || !buf) + return ICE_ERR_PARAM; - mutex_lock(&pi->sched_lock); + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_set_local_mib); - for (i = 0; i < num_queues; i++) { - struct ice_sched_node *node; + desc.flags |= cpu_to_le16((u16)ICE_AQ_FLAG_RD); + desc.datalen = cpu_to_le16(buf_size); - node = ice_sched_find_node_by_teid(pi->root, q_teids[i]); - if (!node) - continue; - q_ctx = ice_get_lan_q_ctx(pi->hw, vsi_handle, tc, q_handles[i]); - if (!q_ctx) { - ice_debug(pi->hw, ICE_DBG_SCHED, "invalid queue handle%d\n", - q_handles[i]); - continue; - } - if (q_ctx->q_handle != q_handles[i]) { - ice_debug(pi->hw, ICE_DBG_SCHED, "Err:handles %d %d\n", - q_ctx->q_handle, q_handles[i]); - continue; - } - qg_list.parent_teid = node->info.parent_teid; - qg_list.num_qs = 1; - qg_list.q_id[0] = cpu_to_le16(q_ids[i]); - status = ice_aq_dis_lan_txq(pi->hw, 1, &qg_list, - sizeof(qg_list), rst_src, vmvf_num, - cd); + cmd->type = mib_type; + cmd->length = cpu_to_le16(buf_size); - if (status) - break; - ice_free_sched_node(pi, node); - q_ctx->q_handle = ICE_INVAL_Q_HANDLE; - } - mutex_unlock(&pi->sched_lock); - return status; + return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); } /** - * ice_cfg_vsi_qs - configure the new/existing VSI queues - * @pi: port information structure - * @vsi_handle: software VSI handle - * @tc_bitmap: TC bitmap - * @maxqs: max queues array per TC - * @owner: LAN or RDMA - * - * This function adds/updates the VSI queues per TC. 
+ * ice_fw_supports_lldp_fltr_ctrl - check NVM version supports lldp_fltr_ctrl + * @hw: pointer to HW struct */ -static enum ice_status -ice_cfg_vsi_qs(struct ice_port_info *pi, u16 vsi_handle, u8 tc_bitmap, - u16 *maxqs, u8 owner) +bool ice_fw_supports_lldp_fltr_ctrl(struct ice_hw *hw) { - enum ice_status status = 0; - u8 i; + if (hw->mac_type != ICE_MAC_E810) + return false; + + if (hw->api_maj_ver == ICE_FW_API_LLDP_FLTR_MAJ) { + if (hw->api_min_ver > ICE_FW_API_LLDP_FLTR_MIN) + return true; + if (hw->api_min_ver == ICE_FW_API_LLDP_FLTR_MIN && + hw->api_patch >= ICE_FW_API_LLDP_FLTR_PATCH) + return true; + } else if (hw->api_maj_ver > ICE_FW_API_LLDP_FLTR_MAJ) { + return true; + } + return false; +} - if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY) - return ICE_ERR_CFG; +/** + * ice_lldp_fltr_add_remove - add or remove a LLDP Rx switch filter + * @hw: pointer to HW struct + * @vsi_num: absolute HW index for VSI + * @add: boolean for if adding or removing a filter + */ +enum ice_status +ice_lldp_fltr_add_remove(struct ice_hw *hw, u16 vsi_num, bool add) +{ + struct ice_aqc_lldp_filter_ctrl *cmd; + struct ice_aq_desc desc; - if (!ice_is_vsi_valid(pi->hw, vsi_handle)) - return ICE_ERR_PARAM; + cmd = &desc.params.lldp_filter_ctrl; - mutex_lock(&pi->sched_lock); + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_filter_ctrl); - ice_for_each_traffic_class(i) { - /* configuration is possible only if TC node is present */ - if (!ice_sched_get_tc_node(pi, i)) - continue; + if (add) + cmd->cmd_flags = ICE_AQC_LLDP_FILTER_ACTION_ADD; + else + cmd->cmd_flags = ICE_AQC_LLDP_FILTER_ACTION_DELETE; - status = ice_sched_cfg_vsi(pi, vsi_handle, i, maxqs[i], owner, - ice_is_tc_ena(tc_bitmap, i)); - if (status) - break; - } + cmd->vsi_num = cpu_to_le16(vsi_num); - mutex_unlock(&pi->sched_lock); - return status; + return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); } /** - * ice_cfg_vsi_lan - configure VSI LAN queues - * @pi: port information structure - * @vsi_handle: software VSI handle - * @tc_bitmap: TC bitmap - * @max_lanqs: max LAN queues array per TC + * ice_fw_supports_report_dflt_cfg + * @hw: pointer to the hardware structure * - * This function adds/updates the VSI LAN queues per TC. + * Checks if the firmware supports report default configuration */ -enum ice_status -ice_cfg_vsi_lan(struct ice_port_info *pi, u16 vsi_handle, u8 tc_bitmap, - u16 *max_lanqs) +bool ice_fw_supports_report_dflt_cfg(struct ice_hw *hw) { - return ice_cfg_vsi_qs(pi, vsi_handle, tc_bitmap, max_lanqs, - ICE_SCHED_NODE_OWNER_LAN); + if (hw->api_maj_ver == ICE_FW_API_REPORT_DFLT_CFG_MAJ) { + if (hw->api_min_ver > ICE_FW_API_REPORT_DFLT_CFG_MIN) + return true; + if (hw->api_min_ver == ICE_FW_API_REPORT_DFLT_CFG_MIN && + hw->api_patch >= ICE_FW_API_REPORT_DFLT_CFG_PATCH) + return true; + } else if (hw->api_maj_ver > ICE_FW_API_REPORT_DFLT_CFG_MAJ) { + return true; + } + return false; } /** - * ice_replay_pre_init - replay pre initialization - * @hw: pointer to the HW struct + * ice_is_pca9575_sw_handle + * @hw: pointer to the hw struct + * @handle: GPIO controller's handle * - * Initializes required config data for VSI, FD, ACL, and RSS before replay. + * This command will check if the reset pin is present in the netlist for + * a given netlist handle. The SW controlled IO expander does not have this pin + * populated in the netlist. 
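+ *
+ * Returns true only when the reset-pin lookup below fails with
+ * ICE_AQ_RC_ENXIO, which identifies the SW controlled expander; any other
+ * outcome is treated as an expander that is not SW controlled.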
*/ -static enum ice_status ice_replay_pre_init(struct ice_hw *hw) +static bool +ice_is_pca9575_sw_handle(struct ice_hw *hw, u16 handle) { - struct ice_switch_info *sw = hw->switch_info; - u8 i; + struct ice_aqc_get_link_topo_pin *cmd; + struct ice_aq_desc desc; - /* Delete old entries from replay filter list head if there is any */ - ice_rm_all_sw_replay_rule_info(hw); - /* In start of replay, move entries into replay_rules list, it - * will allow adding rules entries back to filt_rules list, - * which is operational list. + cmd = &desc.params.get_link_topo_pin; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_link_topo_pin); + + /* set node context to the given GPIO controller */ + cmd->addr.topo_params.node_type_ctx = + (ICE_AQC_LINK_TOPO_NODE_CTX_PROVIDED << + ICE_AQC_LINK_TOPO_NODE_CTX_S); + cmd->addr.handle = handle; + + /* Try finding the reset pin in the GPIO context */ + cmd->input_io_params = (ICE_AQC_LINK_TOPO_INPUT_IO_TYPE_GPIO << + ICE_AQC_LINK_TOPO_INPUT_IO_TYPE_S) | + ICE_AQC_LINK_TOPO_IO_FUNC_RESET_N; + + /* If the expander is controlled by software the following command + * should return error ICE_AQ_RC_ENXIO */ - for (i = 0; i < ICE_SW_LKUP_LAST; i++) - list_replace_init(&sw->recp_list[i].filt_rules, - &sw->recp_list[i].filt_replay_rules); + if (ice_aq_send_cmd(hw, &desc, NULL, 0, NULL) && + hw->adminq.sq_last_status == ICE_AQ_RC_ENXIO) + return true; - return 0; + return false; } /** - * ice_replay_vsi - replay VSI configuration - * @hw: pointer to the HW struct - * @vsi_handle: driver VSI handle + * ice_get_pca9575_handle + * @hw: pointer to the hw struct + * @pca9575_handle: GPIO controller's handle * - * Restore all VSI configuration after reset. It is required to call this - * function with main VSI first. + * Find and return the GPIO controller's handle in the netlist. + * When found, the value is cached in the hw structure and subsequent calls + * will return the cached value */ -enum ice_status ice_replay_vsi(struct ice_hw *hw, u16 vsi_handle) +static enum ice_status +ice_get_pca9575_handle(struct ice_hw *hw, __le16 *pca9575_handle) { + struct ice_aqc_get_link_topo *cmd; + struct ice_aq_desc desc; enum ice_status status; + __le16 handle; + u8 idx; - if (!ice_is_vsi_valid(hw, vsi_handle)) + if (!hw || !pca9575_handle) return ICE_ERR_PARAM; - /* Replay pre-initialization if there is any */ - if (vsi_handle == ICE_MAIN_VSI_HANDLE) { - status = ice_replay_pre_init(hw); + /* If handle was read previously return cached value */ + if (hw->io_expander_handle) { + *pca9575_handle = hw->io_expander_handle; + return 0; + } + + /* If handle was not detected read it from the netlist */ + cmd = &desc.params.get_link_topo; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_link_topo); + + /* Set node type to GPIO controller */ + cmd->addr.topo_params.node_type_ctx = + (ICE_AQC_LINK_TOPO_NODE_TYPE_M & + ICE_AQC_LINK_TOPO_NODE_TYPE_GPIO_CTRL); + +#define SW_PCA9575_MAX_TOPO_IDX 2 + + /* SW IO expander is usually the last one in the netlist. Scan the + * netlist backward and see if we find it. Index 0 is assigned to + * the IO widget so we skip it.
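+ * With SW_PCA9575_MAX_TOPO_IDX set to 2 above, the loop below probes
+ * netlist indices 2 and 1 and stops at the first node that reports the
+ * PCA9575 part number and passes the SW-handle check.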
+ */ + for (idx = SW_PCA9575_MAX_TOPO_IDX; idx > 0; idx--) { + cmd->addr.topo_params.index = idx; + + status = ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); if (status) - return status; + continue; + + handle = desc.params.get_link_topo.addr.handle; + + /* Verify if we found the right IO expander type */ + if (desc.params.get_link_topo.node_part_num == + ICE_ACQ_GET_LINK_TOPO_NODE_NR_PCA9575 && + ice_is_pca9575_sw_handle(hw, handle)) + break; } - /* Replay per VSI all filters */ - status = ice_replay_vsi_all_fltr(hw, vsi_handle); - return status; + /* Expander not found */ + if (!cmd->addr.topo_params.index) + return ICE_ERR_NOT_SUPPORTED; + + /* If present save the handle and return it */ + hw->io_expander_handle = desc.params.get_link_topo.addr.handle; + *pca9575_handle = hw->io_expander_handle; + + return 0; } /** - * ice_replay_post - post replay configuration cleanup - * @hw: pointer to the HW struct + * ice_read_e810t_pca9575_reg + * @hw: pointer to the hw struct + * @offset: GPIO controller register offset + * @data: pointer to data to be read from the GPIO controller * - * Post replay cleanup. + * Read the register from the GPIO controller */ -void ice_replay_post(struct ice_hw *hw) +enum ice_status +ice_read_e810t_pca9575_reg(struct ice_hw *hw, u8 offset, u8 *data) { - /* Delete old entries from replay filter list head */ - ice_rm_all_sw_replay_rule_info(hw); -} + struct ice_aqc_link_topo_addr link_topo; + enum ice_status status; + __le16 addr; -/** - * ice_stat_update40 - read 40 bit stat from the chip and update stat values - * @hw: ptr to the hardware info - * @reg: offset of 64 bit HW register to read from - * @prev_stat_loaded: bool to specify if previous stats are loaded - * @prev_stat: ptr to previous loaded stat value - * @cur_stat: ptr to current stat value - */ -void -ice_stat_update40(struct ice_hw *hw, u32 reg, bool prev_stat_loaded, - u64 *prev_stat, u64 *cur_stat) -{ - u64 new_data = rd64(hw, reg) & (BIT_ULL(40) - 1); + memset(&link_topo, 0, sizeof(link_topo)); - /* device stats are not reset at PFR, they likely will not be zeroed - * when the driver starts. Thus, save the value from the first read - * without adding to the statistic value so that we report stats which - * count up from zero. - */ - if (!prev_stat_loaded) { - *prev_stat = new_data; - return; - } + status = ice_get_pca9575_handle(hw, &link_topo.handle); + if (status) + return status; - /* Calculate the difference between the new and old values, and then - * add it to the software stat value. 
- */ - if (new_data >= *prev_stat) - *cur_stat += new_data - *prev_stat; - else - /* to manage the potential roll-over */ - *cur_stat += (new_data + BIT_ULL(40)) - *prev_stat; + link_topo.topo_params.node_type_ctx = + (ICE_AQC_LINK_TOPO_NODE_CTX_PROVIDED << + ICE_AQC_LINK_TOPO_NODE_CTX_S); - /* Update the previously stored value to prepare for next read */ - *prev_stat = new_data; + addr = cpu_to_le16((u16)offset); + + return ice_aq_read_i2c(hw, link_topo, 0, addr, 1, data, NULL); } /** - * ice_stat_update32 - read 32 bit stat from the chip and update stat values - * @hw: ptr to the hardware info - * @reg: offset of HW register to read from - * @prev_stat_loaded: bool to specify if previous stats are loaded - * @prev_stat: ptr to previous loaded stat value - * @cur_stat: ptr to current stat value + * ice_write_e810t_pca9575_reg + * @hw: pointer to the hw struct + * @offset: GPIO controller register offset + * @data: data to be written to the GPIO controller + * + * Write the data to the GPIO controller register */ -void -ice_stat_update32(struct ice_hw *hw, u32 reg, bool prev_stat_loaded, - u64 *prev_stat, u64 *cur_stat) +enum ice_status +ice_write_e810t_pca9575_reg(struct ice_hw *hw, u8 offset, u8 data) { - u32 new_data; + struct ice_aqc_link_topo_addr link_topo; + enum ice_status status; + __le16 addr; - new_data = rd32(hw, reg); + memset(&link_topo, 0, sizeof(link_topo)); - /* device stats are not reset at PFR, they likely will not be zeroed - * when the driver starts. Thus, save the value from the first read - * without adding to the statistic value so that we report stats which - * count up from zero. - */ - if (!prev_stat_loaded) { - *prev_stat = new_data; - return; - } + status = ice_get_pca9575_handle(hw, &link_topo.handle); + if (status) + return status; - /* Calculate the difference between the new and old values, and then - * add it to the software stat value. 
- */ - if (new_data >= *prev_stat) - *cur_stat += new_data - *prev_stat; - else - /* to manage the potential roll-over */ - *cur_stat += (new_data + BIT_ULL(32)) - *prev_stat; + link_topo.topo_params.node_type_ctx = + (ICE_AQC_LINK_TOPO_NODE_CTX_PROVIDED << + ICE_AQC_LINK_TOPO_NODE_CTX_S); - /* Update the previously stored value to prepare for next read */ - *prev_stat = new_data; + addr = cpu_to_le16((u16)offset); + + return ice_aq_write_i2c(hw, link_topo, 0, addr, 1, &data, NULL); } /** - * ice_sched_query_elem - query element information from HW - * @hw: pointer to the HW struct - * @node_teid: node TEID to be queried - * @buf: buffer to element information + * ice_e810t_is_pca9575_present + * @hw: pointer to the hw struct * - * This function queries HW element information + * Check if the SW IO expander is present on the board */ -enum ice_status -ice_sched_query_elem(struct ice_hw *hw, u32 node_teid, - struct ice_aqc_get_elem *buf) +bool ice_e810t_is_pca9575_present(struct ice_hw *hw) { - u16 buf_size, num_elem_ret = 0; enum ice_status status; + u8 data; - buf_size = sizeof(*buf); - memset(buf, 0, buf_size); - buf->generic[0].node_teid = cpu_to_le32(node_teid); - status = ice_aq_query_sched_elems(hw, 1, buf, buf_size, &num_elem_ret, - NULL); - if (status || num_elem_ret != 1) - ice_debug(hw, ICE_DBG_SCHED, "query element failed\n"); - return status; + status = ice_read_e810t_pca9575_reg(hw, ICE_PCA9575_P0_IN, &data); + + if (status) + return false; + + return true; } diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h index c3df92f57777..e328501a2330 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.h +++ b/drivers/net/ethernet/intel/ice/ice_common.h @@ -1,23 +1,34 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ #ifndef _ICE_COMMON_H_ #define _ICE_COMMON_H_ #include "ice.h" #include "ice_type.h" +#include "ice_nvm.h" #include "ice_flex_pipe.h" +#include "virtchnl.h" #include "ice_switch.h" -#include +#include "ice_fdir.h" -enum ice_status ice_nvm_validate_checksum(struct ice_hw *hw); +#define ICE_SQ_SEND_DELAY_TIME_MS 10 +#define ICE_SQ_SEND_MAX_EXECUTE 3 -void -ice_debug_cq(struct ice_hw *hw, u32 mask, void *desc, void *buf, u16 buf_len); +enum ice_fw_modes { + ICE_FW_MODE_NORMAL, + ICE_FW_MODE_DBG, + ICE_FW_MODE_REC, + ICE_FW_MODE_ROLLBACK +}; + + +void ice_set_umac_shared(struct ice_hw *hw); enum ice_status ice_init_hw(struct ice_hw *hw); void ice_deinit_hw(struct ice_hw *hw); enum ice_status ice_check_reset(struct ice_hw *hw); enum ice_status ice_reset(struct ice_hw *hw, enum ice_reset_req req); + enum ice_status ice_create_all_ctrlq(struct ice_hw *hw); enum ice_status ice_init_all_ctrlq(struct ice_hw *hw); void ice_shutdown_all_ctrlq(struct ice_hw *hw); @@ -32,43 +43,73 @@ enum ice_status ice_acquire_res(struct ice_hw *hw, enum ice_aq_res_ids res, enum ice_aq_res_access_type access, u32 timeout); void ice_release_res(struct ice_hw *hw, enum ice_aq_res_ids res); -enum ice_status ice_init_nvm(struct ice_hw *hw); enum ice_status -ice_read_sr_buf(struct ice_hw *hw, u16 offset, u16 *words, u16 *data); +ice_alloc_hw_res(struct ice_hw *hw, u16 type, u16 num, bool btm, u16 *res); +enum ice_status +ice_free_hw_res(struct ice_hw *hw, u16 type, u16 num, u16 *res); +enum ice_status +ice_aq_alloc_free_res(struct ice_hw *hw, u16 num_entries, + struct ice_aqc_alloc_free_res_elem *buf, u16 buf_size, + enum ice_adminq_opc opc, struct ice_sq_cd *cd); +enum ice_status +ice_sq_send_cmd_nolock(struct ice_hw *hw, struct ice_ctl_q_info *cq, + struct ice_aq_desc *desc, void *buf, u16 buf_size, + struct ice_sq_cd *cd); enum ice_status ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, struct ice_aq_desc *desc, void *buf, u16 buf_size, struct ice_sq_cd *cd); void ice_clear_pxe_mode(struct ice_hw *hw); + enum ice_status ice_get_caps(struct ice_hw *hw); void ice_set_safe_mode_caps(struct ice_hw *hw); -void ice_dev_onetime_setup(struct ice_hw *hw); + + + enum ice_status ice_write_rxq_ctx(struct ice_hw *hw, struct ice_rlan_ctx *rlan_ctx, u32 rxq_index); +enum ice_status ice_clear_rxq_ctx(struct ice_hw *hw, u32 rxq_index); +enum ice_status +ice_clear_tx_cmpltnq_ctx(struct ice_hw *hw, u32 tx_cmpltnq_index); +enum ice_status +ice_write_tx_cmpltnq_ctx(struct ice_hw *hw, + struct ice_tx_cmpltnq_ctx *tx_cmpltnq_ctx, + u32 tx_cmpltnq_index); +enum ice_status +ice_clear_tx_drbell_q_ctx(struct ice_hw *hw, u32 tx_drbell_q_index); +enum ice_status +ice_write_tx_drbell_q_ctx(struct ice_hw *hw, + struct ice_tx_drbell_q_ctx *tx_drbell_q_ctx, + u32 tx_drbell_q_index); enum ice_status -ice_aq_get_rss_lut(struct ice_hw *hw, u16 vsi_handle, u8 lut_type, u8 *lut, - u16 lut_size); +ice_aq_get_rss_lut(struct ice_hw *hw, struct ice_aq_get_set_rss_lut_params *get_params); enum ice_status -ice_aq_set_rss_lut(struct ice_hw *hw, u16 vsi_handle, u8 lut_type, u8 *lut, - u16 lut_size); +ice_aq_set_rss_lut(struct ice_hw *hw, struct ice_aq_get_set_rss_lut_params *set_params); enum ice_status ice_aq_get_rss_key(struct ice_hw *hw, u16 vsi_handle, struct ice_aqc_get_set_rss_keys *keys); enum ice_status ice_aq_set_rss_key(struct ice_hw *hw, u16 vsi_handle, struct ice_aqc_get_set_rss_keys *keys); +enum ice_status +ice_aq_move_recfg_lan_txq(struct ice_hw *hw, u8 num_qs, bool is_move, + bool is_tc_change, bool subseq_call, bool flush_pipe, + u8 
timeout, u32 *blocked_cgds, + struct ice_aqc_move_txqs_data *buf, u16 buf_size, + u8 *txqs_moved, struct ice_sq_cd *cd); bool ice_check_sq_alive(struct ice_hw *hw, struct ice_ctl_q_info *cq); enum ice_status ice_aq_q_shutdown(struct ice_hw *hw, bool unloading); void ice_fill_dflt_direct_cmd_desc(struct ice_aq_desc *desc, u16 opcode); extern const struct ice_ctx_ele ice_tlan_ctx_info[]; enum ice_status -ice_set_ctx(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info); +ice_set_ctx(struct ice_hw *hw, u8 *src_ctx, u8 *dest_ctx, + const struct ice_ctx_ele *ce_info); extern struct mutex ice_global_cfg_lock_sw; @@ -81,31 +122,59 @@ enum ice_status ice_aq_send_driver_ver(struct ice_hw *hw, struct ice_driver_ver *dv, struct ice_sq_cd *cd); enum ice_status +ice_aq_set_port_params(struct ice_port_info *pi, u16 bad_frame_vsi, + bool save_bad_pac, bool pad_short_pac, bool double_vlan, + struct ice_sq_cd *cd); +enum ice_status ice_aq_get_phy_caps(struct ice_port_info *pi, bool qual_mods, u8 report_mode, struct ice_aqc_get_phy_caps_data *caps, struct ice_sq_cd *cd); +enum ice_status +ice_aq_list_caps(struct ice_hw *hw, void *buf, u16 buf_size, u32 *cap_count, + enum ice_adminq_opc opc, struct ice_sq_cd *cd); +enum ice_status +ice_discover_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_caps); void ice_update_phy_type(u64 *phy_type_low, u64 *phy_type_high, u16 link_speeds_bitmap); enum ice_status ice_aq_manage_mac_write(struct ice_hw *hw, const u8 *mac_addr, u8 flags, struct ice_sq_cd *cd); + enum ice_status ice_clear_pf_cfg(struct ice_hw *hw); enum ice_status -ice_aq_set_phy_cfg(struct ice_hw *hw, u8 lport, +ice_aq_set_phy_cfg(struct ice_hw *hw, struct ice_port_info *pi, struct ice_aqc_set_phy_cfg_data *cfg, struct ice_sq_cd *cd); +bool ice_fw_supports_link_override(struct ice_hw *hw); +enum ice_status +ice_get_link_default_override(struct ice_link_default_override_tlv *ldo, + struct ice_port_info *pi); +bool ice_is_phy_caps_an_enabled(struct ice_aqc_get_phy_caps_data *caps); + +enum ice_fc_mode ice_caps_to_fc_mode(u8 caps); +enum ice_fec_mode ice_caps_to_fec_mode(u8 caps, u8 fec_options); enum ice_status ice_set_fc(struct ice_port_info *pi, u8 *aq_failures, bool ena_auto_link_update); +enum ice_status +ice_cfg_phy_fc(struct ice_port_info *pi, struct ice_aqc_set_phy_cfg_data *cfg, + enum ice_fc_mode req_mode); +bool +ice_phy_caps_equals_cfg(struct ice_aqc_get_phy_caps_data *caps, + struct ice_aqc_set_phy_cfg_data *cfg); void -ice_cfg_phy_fec(struct ice_aqc_set_phy_cfg_data *cfg, enum ice_fec_mode fec); -void -ice_copy_phy_caps_to_cfg(struct ice_aqc_get_phy_caps_data *caps, +ice_copy_phy_caps_to_cfg(struct ice_port_info *pi, + struct ice_aqc_get_phy_caps_data *caps, struct ice_aqc_set_phy_cfg_data *cfg); enum ice_status +ice_cfg_phy_fec(struct ice_port_info *pi, struct ice_aqc_set_phy_cfg_data *cfg, + enum ice_fec_mode fec); +enum ice_status ice_aq_set_link_restart_an(struct ice_port_info *pi, bool ena_link, struct ice_sq_cd *cd); enum ice_status +ice_aq_set_mac_cfg(struct ice_hw *hw, u16 max_frame_size, struct ice_sq_cd *cd); +enum ice_status ice_aq_get_link_info(struct ice_port_info *pi, bool ena_lse, struct ice_link_status *link, struct ice_sq_cd *cd); enum ice_status @@ -114,17 +183,54 @@ ice_aq_set_event_mask(struct ice_hw *hw, u8 port_num, u16 mask, enum ice_status ice_aq_set_mac_loopback(struct ice_hw *hw, bool ena_lpbk, struct ice_sq_cd *cd); + enum ice_status ice_aq_set_port_id_led(struct ice_port_info *pi, bool is_orig_mode, struct ice_sq_cd *cd); +enum ice_status 
+ice_aq_sff_eeprom(struct ice_hw *hw, u16 lport, u8 bus_addr, + u16 mem_addr, u8 page, u8 set_page, u8 *data, u8 length, + bool write, struct ice_sq_cd *cd); + +enum ice_status +ice_aq_prog_topo_dev_nvm(struct ice_hw *hw, + struct ice_aqc_link_topo_params *topo_params, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_read_topo_dev_nvm(struct ice_hw *hw, + struct ice_aqc_link_topo_params *topo_params, + u32 start_address, u8 *buf, u8 buf_size, + struct ice_sq_cd *cd); + +void ice_dump_port_info(struct ice_port_info *pi); +void ice_dump_caps(struct ice_hw *hw); +void ice_dump_ptp_dev_caps(struct ice_hw *hw); +void ice_dump_ptp_func_caps(struct ice_hw *hw); +enum ice_status ice_dump_port_dflt_topo(struct ice_port_info *pi); +void ice_dump_port_topo(struct ice_port_info *pi); + +enum ice_status +ice_aq_get_port_options(struct ice_hw *hw, + struct ice_aqc_get_port_options_elem *options, + u8 *option_count, u8 lport, bool lport_valid, + u8 *active_option_idx, bool *active_option_valid); +enum ice_status +ice_cfg_vsi_rdma(struct ice_port_info *pi, u16 vsi_handle, u16 tc_bitmap, + u16 *max_rdmaqs); +enum ice_status +ice_ena_vsi_rdma_qset(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + u16 *rdma_qset, u16 num_qsets, u32 *qset_teid); +enum ice_status +ice_dis_vsi_rdma_qset(struct ice_port_info *pi, u16 count, u32 *qset_teid, + u16 *q_id); enum ice_status ice_dis_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u8 num_queues, u16 *q_handle, u16 *q_ids, u32 *q_teids, enum ice_disq_rst_src rst_src, u16 vmvf_num, struct ice_sq_cd *cd); enum ice_status -ice_cfg_vsi_lan(struct ice_port_info *pi, u16 vsi_handle, u8 tc_bitmap, +ice_cfg_vsi_lan(struct ice_port_info *pi, u16 vsi_handle, u16 tc_bitmap, u16 *max_lanqs); enum ice_status ice_ena_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 q_handle, @@ -132,17 +238,77 @@ ice_ena_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 q_handle, struct ice_sq_cd *cd); enum ice_status ice_replay_vsi(struct ice_hw *hw, u16 vsi_handle); void ice_replay_post(struct ice_hw *hw); -void ice_output_fw_log(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf); +struct ice_q_ctx * +ice_get_lan_q_ctx(struct ice_hw *hw, u16 vsi_handle, u8 tc, u16 q_handle); +enum ice_status ice_sbq_rw_reg_lp(struct ice_hw *hw, + struct ice_sbq_msg_input *in, bool lock); +void ice_sbq_lock(struct ice_hw *hw); +void ice_sbq_unlock(struct ice_hw *hw); +enum ice_status ice_sbq_rw_reg(struct ice_hw *hw, struct ice_sbq_msg_input *in); void ice_stat_update40(struct ice_hw *hw, u32 reg, bool prev_stat_loaded, u64 *prev_stat, u64 *cur_stat); void ice_stat_update32(struct ice_hw *hw, u32 reg, bool prev_stat_loaded, u64 *prev_stat, u64 *cur_stat); -void -ice_get_nvm_version(struct ice_hw *hw, u8 *oem_ver, u16 *oem_build, - u8 *oem_patch, u8 *ver_hi, u8 *ver_lo); +enum ice_fw_modes ice_get_fw_mode(struct ice_hw *hw); +void ice_print_rollback_msg(struct ice_hw *hw); +bool ice_is_generic_mac(struct ice_hw *hw); +bool ice_is_e810(struct ice_hw *hw); enum ice_status ice_sched_query_elem(struct ice_hw *hw, u32 node_teid, - struct ice_aqc_get_elem *buf); + struct ice_aqc_txsched_elem_data *buf); +enum ice_status +ice_aq_set_driver_param(struct ice_hw *hw, enum ice_aqc_driver_params idx, + u32 value, struct ice_sq_cd *cd); +enum ice_status +ice_aq_get_driver_param(struct ice_hw *hw, enum ice_aqc_driver_params idx, + u32 *value, struct ice_sq_cd *cd); +enum ice_status +ice_aq_set_gpio(struct ice_hw *hw, u16 gpio_ctrl_handle, u8 pin_idx, bool value, + struct ice_sq_cd *cd); +enum 
ice_status +ice_aq_get_gpio(struct ice_hw *hw, u16 gpio_ctrl_handle, u8 pin_idx, + bool *value, struct ice_sq_cd *cd); +enum ice_status +ice_aq_set_lldp_mib(struct ice_hw *hw, u8 mib_type, void *buf, u16 buf_size, + struct ice_sq_cd *cd); +bool ice_fw_supports_lldp_fltr_ctrl(struct ice_hw *hw); +enum ice_status +ice_lldp_fltr_add_remove(struct ice_hw *hw, u16 vsi_num, bool add); +enum ice_status +ice_aq_read_i2c(struct ice_hw *hw, struct ice_aqc_link_topo_addr topo_addr, + u16 bus_addr, __le16 addr, u8 params, u8 *data, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_write_i2c(struct ice_hw *hw, struct ice_aqc_link_topo_addr topo_addr, + u16 bus_addr, __le16 addr, u8 params, u8 *data, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_set_health_status_config(struct ice_hw *hw, u8 event_source, + struct ice_sq_cd *cd); +bool ice_is_fw_health_report_supported(struct ice_hw *hw); +bool ice_fw_supports_report_dflt_cfg(struct ice_hw *hw); + +/* E810T PCA9575 IO controller registers */ +#define ICE_PCA9575_P0_IN 0x0 +#define ICE_PCA9575_P1_IN 0x1 +#define ICE_PCA9575_P0_CFG 0x8 +#define ICE_PCA9575_P1_CFG 0x9 +#define ICE_PCA9575_P0_OUT 0xA +#define ICE_PCA9575_P1_OUT 0xB + +/* E810T PCA9575 IO controller pin control */ +#define ICE_E810T_P0_GNSS_PRSNT_N BIT(4) +#define ICE_E810T_P1_SMA1_DIR_EN BIT(4) +#define ICE_E810T_P1_SMA1_TX_EN BIT(5) +#define ICE_E810T_P1_SMA2_UFL2_RX_DIS BIT(3) +#define ICE_E810T_P1_SMA2_DIR_EN BIT(6) +#define ICE_E810T_P1_SMA2_TX_EN BIT(7) + +enum ice_status +ice_read_e810t_pca9575_reg(struct ice_hw *hw, u8 offset, u8 *data); +enum ice_status +ice_write_e810t_pca9575_reg(struct ice_hw *hw, u8 offset, u8 data); +bool ice_e810t_is_pca9575_present(struct ice_hw *hw); #endif /* _ICE_COMMON_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_controlq.c b/drivers/net/ethernet/intel/ice/ice_controlq.c index 2e9c97bad3c3..345ca841b429 100644 --- a/drivers/net/ethernet/intel/ice/ice_controlq.c +++ b/drivers/net/ethernet/intel/ice/ice_controlq.c @@ -1,8 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ #include "ice_common.h" + #define ICE_CQ_INIT_REGS(qinfo, prefix) \ do { \ (qinfo)->sq.head = prefix##_ATQH; \ @@ -12,6 +13,7 @@ do { \ (qinfo)->sq.bal = prefix##_ATQBAL; \ (qinfo)->sq.len_mask = prefix##_ATQLEN_ATQLEN_M; \ (qinfo)->sq.len_ena_mask = prefix##_ATQLEN_ATQENABLE_M; \ + (qinfo)->sq.len_crit_mask = prefix##_ATQLEN_ATQCRIT_M; \ (qinfo)->sq.head_mask = prefix##_ATQH_ATQH_M; \ (qinfo)->rq.head = prefix##_ARQH; \ (qinfo)->rq.tail = prefix##_ARQT; \ @@ -20,9 +22,11 @@ do { \ (qinfo)->rq.bal = prefix##_ARQBAL; \ (qinfo)->rq.len_mask = prefix##_ARQLEN_ARQLEN_M; \ (qinfo)->rq.len_ena_mask = prefix##_ARQLEN_ARQENABLE_M; \ + (qinfo)->rq.len_crit_mask = prefix##_ARQLEN_ARQCRIT_M; \ (qinfo)->rq.head_mask = prefix##_ARQH_ARQH_M; \ } while (0) + /** * ice_adminq_init_regs - Initialize AdminQ registers * @hw: pointer to the hardware structure @@ -36,6 +40,7 @@ static void ice_adminq_init_regs(struct ice_hw *hw) ICE_CQ_INIT_REGS(cq, PF_FW); } + /** * ice_mailbox_init_regs - Initialize Mailbox registers * @hw: pointer to the hardware structure @@ -49,6 +54,19 @@ static void ice_mailbox_init_regs(struct ice_hw *hw) ICE_CQ_INIT_REGS(cq, PF_MBX); } +/** + * ice_sb_init_regs - Initialize Sideband registers + * @hw: pointer to the hardware structure + * + * This assumes the alloc_sq and alloc_rq functions have already been called + */ +static void ice_sb_init_regs(struct ice_hw *hw) +{ + struct ice_ctl_q_info *cq = &hw->sbq; + + ICE_CQ_INIT_REGS(cq, PF_SB); +} + /** * ice_check_sq_alive * @hw: pointer to the HW struct @@ -310,9 +328,10 @@ ice_cfg_rq_regs(struct ice_hw *hw, struct ice_ctl_q_info *cq) #define ICE_FREE_CQ_BUFS(hw, qi, ring) \ do { \ - int i; \ /* free descriptors */ \ - if ((qi)->ring.r.ring##_bi) \ + if ((qi)->ring.r.ring##_bi) { \ + int i; \ + \ for (i = 0; i < (qi)->num_##ring##_entries; i++) \ if ((qi)->ring.r.ring##_bi[i].pa) { \ dmam_free_coherent(ice_hw_to_dev(hw), \ @@ -323,6 +342,7 @@ do { \ (qi)->ring.r.ring##_bi[i].pa = 0;\ (qi)->ring.r.ring##_bi[i].size = 0;\ } \ + } \ /* free the buffer info list */ \ if ((qi)->ring.cmd_buf) \ devm_kfree(ice_hw_to_dev(hw), (qi)->ring.cmd_buf); \ @@ -555,6 +575,7 @@ ice_shutdown_rq(struct ice_hw *hw, struct ice_ctl_q_info *cq) return ret_code; } + /** * ice_init_check_adminq - Check version for Admin Queue to know if its alive * @hw: pointer to the hardware structure @@ -568,6 +589,7 @@ static enum ice_status ice_init_check_adminq(struct ice_hw *hw) if (status) goto init_ctrlq_free_rq; + if (!ice_aq_ver_check(hw)) { status = ICE_ERR_FW_API_VER; goto init_ctrlq_free_rq; @@ -605,6 +627,10 @@ static enum ice_status ice_init_ctrlq(struct ice_hw *hw, enum ice_ctl_q q_type) ice_adminq_init_regs(hw); cq = &hw->adminq; break; + case ICE_CTL_Q_SB: + ice_sb_init_regs(hw); + cq = &hw->sbq; + break; case ICE_CTL_Q_MAILBOX: ice_mailbox_init_regs(hw); cq = &hw->mailboxq; @@ -641,6 +667,68 @@ static enum ice_status ice_init_ctrlq(struct ice_hw *hw, enum ice_ctl_q q_type) return ret_code; } +/** + * ice_is_sbq_supported - is the sideband queue supported + * @hw: pointer to the hardware structure + * + * Returns true if the sideband control queue interface is + * supported for the device, false otherwise + */ +static bool ice_is_sbq_supported(struct ice_hw *hw) +{ + return ice_is_generic_mac(hw); +} + +/** + * ice_shutdown_ctrlq - shutdown routine for any control queue + * @hw: pointer to the hardware structure + * @q_type: specific Control queue type + * + * NOTE: this function does not destroy the control queue locks. 
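+ * Callers that also need the queue locks released use
+ * ice_destroy_all_ctrlq(), which shuts down all control queues first and
+ * then destroys the per-queue locks.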
+ */ +static void ice_shutdown_ctrlq(struct ice_hw *hw, enum ice_ctl_q q_type) +{ + struct ice_ctl_q_info *cq; + + switch (q_type) { + case ICE_CTL_Q_ADMIN: + cq = &hw->adminq; + if (ice_check_sq_alive(hw, cq)) + ice_aq_q_shutdown(hw, true); + break; + case ICE_CTL_Q_SB: + cq = &hw->sbq; + break; + case ICE_CTL_Q_MAILBOX: + cq = &hw->mailboxq; + break; + default: + return; + } + + ice_shutdown_sq(hw, cq); + ice_shutdown_rq(hw, cq); +} + +/** + * ice_shutdown_all_ctrlq - shutdown routine for all control queues + * @hw: pointer to the hardware structure + * + * NOTE: this function does not destroy the control queue locks. The driver + * may call this at runtime to shutdown and later restart control queues, such + * as in response to a reset event. + */ +void ice_shutdown_all_ctrlq(struct ice_hw *hw) +{ + /* Shutdown FW admin queue */ + ice_shutdown_ctrlq(hw, ICE_CTL_Q_ADMIN); + /* Shutdown PHY Sideband */ + if (ice_is_sbq_supported(hw)) + ice_shutdown_ctrlq(hw, ICE_CTL_Q_SB); + /* Shutdown PF-VF Mailbox */ + ice_shutdown_ctrlq(hw, ICE_CTL_Q_MAILBOX); +} + /** * ice_init_all_ctrlq - main initialization routine for all control queues * @hw: pointer to the hardware structure @@ -656,17 +744,35 @@ static enum ice_status ice_init_ctrlq(struct ice_hw *hw, enum ice_ctl_q q_type) */ enum ice_status ice_init_all_ctrlq(struct ice_hw *hw) { - enum ice_status ret_code; + enum ice_status status; + u32 retry = 0; /* Init FW admin queue */ - ret_code = ice_init_ctrlq(hw, ICE_CTL_Q_ADMIN); - if (ret_code) - return ret_code; + do { + status = ice_init_ctrlq(hw, ICE_CTL_Q_ADMIN); + if (status) + return status; - ret_code = ice_init_check_adminq(hw); - if (ret_code) - return ret_code; + status = ice_init_check_adminq(hw); + if (status != ICE_ERR_AQ_FW_CRITICAL) + break; + + ice_debug(hw, ICE_DBG_AQ_MSG, "Retry Admin Queue init due to FW critical error\n"); + ice_shutdown_ctrlq(hw, ICE_CTL_Q_ADMIN); + msleep(ICE_CTL_Q_ADMIN_INIT_MSEC); + } while (retry++ < ICE_CTL_Q_ADMIN_INIT_TIMEOUT); + if (status) + return status; + /* sideband control queue (SBQ) interface is not supported on some + * devices. Initialize if supported, else fallback to the admin queue + * interface + */ + if (ice_is_sbq_supported(hw)) { + status = ice_init_ctrlq(hw, ICE_CTL_Q_SB); + if (status) + return status; + } /* Init Mailbox queue */ return ice_init_ctrlq(hw, ICE_CTL_Q_MAILBOX); } @@ -702,63 +808,20 @@ static void ice_init_ctrlq_locks(struct ice_ctl_q_info *cq) enum ice_status ice_create_all_ctrlq(struct ice_hw *hw) { ice_init_ctrlq_locks(&hw->adminq); + if (ice_is_sbq_supported(hw)) + ice_init_ctrlq_locks(&hw->sbq); ice_init_ctrlq_locks(&hw->mailboxq); return ice_init_all_ctrlq(hw); } -/** - * ice_shutdown_ctrlq - shutdown routine for any control queue - * @hw: pointer to the hardware structure - * @q_type: specific Control queue type - * - * NOTE: this function does not destroy the control queue locks. - */ -static void ice_shutdown_ctrlq(struct ice_hw *hw, enum ice_ctl_q q_type) -{ - struct ice_ctl_q_info *cq; - - switch (q_type) { - case ICE_CTL_Q_ADMIN: - cq = &hw->adminq; - if (ice_check_sq_alive(hw, cq)) - ice_aq_q_shutdown(hw, true); - break; - case ICE_CTL_Q_MAILBOX: - cq = &hw->mailboxq; - break; - default: - return; - } - - ice_shutdown_sq(hw, cq); - ice_shutdown_rq(hw, cq); -} - -/** - * ice_shutdown_all_ctrlq - shutdown routine for all control queues - * @hw: pointer to the hardware structure - * - * NOTE: this function does not destroy the control queue locks. 
The driver - * may call this at runtime to shutdown and later restart control queues, such - * as in response to a reset event. - */ -void ice_shutdown_all_ctrlq(struct ice_hw *hw) -{ - /* Shutdown FW admin queue */ - ice_shutdown_ctrlq(hw, ICE_CTL_Q_ADMIN); - /* Shutdown PF-VF Mailbox */ - ice_shutdown_ctrlq(hw, ICE_CTL_Q_MAILBOX); -} - /** * ice_destroy_ctrlq_locks - Destroy locks for a control queue * @cq: pointer to the control queue * * Destroys the send and receive queue locks for a given control queue. */ -static void -ice_destroy_ctrlq_locks(struct ice_ctl_q_info *cq) +static void ice_destroy_ctrlq_locks(struct ice_ctl_q_info *cq) { mutex_destroy(&cq->sq_lock); mutex_destroy(&cq->rq_lock); @@ -779,6 +842,8 @@ void ice_destroy_all_ctrlq(struct ice_hw *hw) ice_shutdown_all_ctrlq(hw); ice_destroy_ctrlq_locks(&hw->adminq); + if (ice_is_sbq_supported(hw)) + ice_destroy_ctrlq_locks(&hw->sbq); ice_destroy_ctrlq_locks(&hw->mailboxq); } @@ -800,8 +865,7 @@ static u16 ice_clean_sq(struct ice_hw *hw, struct ice_ctl_q_info *cq) details = ICE_CTL_Q_DETAILS(*sq, ntc); while (rd32(hw, cq->sq.head) != ntc) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "ntc %d head %d.\n", ntc, rd32(hw, cq->sq.head)); + ice_debug(hw, ICE_DBG_AQ_MSG, "ntc %d head %d.\n", ntc, rd32(hw, cq->sq.head)); memset(desc, 0, sizeof(*desc)); memset(details, 0, sizeof(*details)); ntc++; @@ -816,6 +880,54 @@ static u16 ice_clean_sq(struct ice_hw *hw, struct ice_ctl_q_info *cq) return ICE_CTL_Q_DESC_UNUSED(sq); } +/** + * ice_debug_cq + * @hw: pointer to the hardware structure + * @desc: pointer to control queue descriptor + * @buf: pointer to command buffer + * @buf_len: max length of buf + * + * Dumps debug log about control command with descriptor contents. + */ +static void ice_debug_cq(struct ice_hw *hw, void *desc, void *buf, u16 buf_len) +{ + struct ice_aq_desc *cq_desc = desc; + u16 datalen, flags; + + if (!IS_ENABLED(CONFIG_DYNAMIC_DEBUG) && + !((ICE_DBG_AQ_DESC | ICE_DBG_AQ_DESC_BUF) & hw->debug_mask)) + return; + + if (!desc) + return; + + datalen = le16_to_cpu(cq_desc->datalen); + flags = le16_to_cpu(cq_desc->flags); + + ice_debug(hw, ICE_DBG_AQ_DESC, "CQ CMD: opcode 0x%04X, flags 0x%04X, datalen 0x%04X, retval 0x%04X\n", + le16_to_cpu(cq_desc->opcode), flags, datalen, + le16_to_cpu(cq_desc->retval)); + ice_debug(hw, ICE_DBG_AQ_DESC, "\tcookie (h,l) 0x%08X 0x%08X\n", + le32_to_cpu(cq_desc->cookie_high), + le32_to_cpu(cq_desc->cookie_low)); + ice_debug(hw, ICE_DBG_AQ_DESC, "\tparam (0,1) 0x%08X 0x%08X\n", + le32_to_cpu(cq_desc->params.generic.param0), + le32_to_cpu(cq_desc->params.generic.param1)); + ice_debug(hw, ICE_DBG_AQ_DESC, "\taddr (h,l) 0x%08X 0x%08X\n", + le32_to_cpu(cq_desc->params.generic.addr_high), + le32_to_cpu(cq_desc->params.generic.addr_low)); + /* Dump buffer iff 1) one exists and 2) is either a response indicated + * by the DD and/or CMP flag set or a command with the RD flag set. 
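+ * The dump length is capped at the smaller of buf_len and the descriptor's
+ * datalen (see the min_t() below).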
+ */ + if (buf && cq_desc->datalen != 0 && + (flags & (ICE_AQ_FLAG_DD | ICE_AQ_FLAG_CMP) || + flags & ICE_AQ_FLAG_RD)) { + ice_debug(hw, ICE_DBG_AQ_DESC_BUF, "Buffer:\n"); + ice_debug_array(hw, ICE_DBG_AQ_DESC_BUF, 16, 1, buf, + min_t(u16, buf_len, datalen)); + } +} + /** * ice_sq_done - check if FW has processed the Admin Send Queue (ATQ) * @hw: pointer to the HW struct @@ -833,7 +945,7 @@ static bool ice_sq_done(struct ice_hw *hw, struct ice_ctl_q_info *cq) } /** - * ice_sq_send_cmd - send command to Control Queue (ATQ) + * ice_sq_send_cmd_nolock - send command to Control Queue (ATQ) * @hw: pointer to the HW struct * @cq: pointer to the specific Control queue * @desc: prefilled descriptor describing the command (non DMA mem) @@ -845,9 +957,9 @@ static bool ice_sq_done(struct ice_hw *hw, struct ice_ctl_q_info *cq) * cleans the queue, etc. */ enum ice_status -ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, - struct ice_aq_desc *desc, void *buf, u16 buf_size, - struct ice_sq_cd *cd) +ice_sq_send_cmd_nolock(struct ice_hw *hw, struct ice_ctl_q_info *cq, + struct ice_aq_desc *desc, void *buf, u16 buf_size, + struct ice_sq_cd *cd) { struct ice_dma_mem *dma_buf = NULL; struct ice_aq_desc *desc_on_ring; @@ -861,13 +973,11 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, /* if reset is in progress return a soft error */ if (hw->reset_ongoing) return ICE_ERR_RESET_ONGOING; - mutex_lock(&cq->sq_lock); cq->sq_last_status = ICE_AQ_RC_OK; if (!cq->sq.count) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "Control Send queue not initialized.\n"); + ice_debug(hw, ICE_DBG_AQ_MSG, "Control Send queue not initialized.\n"); status = ICE_ERR_AQ_EMPTY; goto sq_send_command_error; } @@ -879,8 +989,7 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, if (buf) { if (buf_size > cq->sq_buf_size) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "Invalid buffer size for Control Send queue: %d.\n", + ice_debug(hw, ICE_DBG_AQ_MSG, "Invalid buffer size for Control Send queue: %d.\n", buf_size); status = ICE_ERR_INVAL_SIZE; goto sq_send_command_error; @@ -893,8 +1002,7 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, val = rd32(hw, cq->sq.head); if (val >= cq->num_sq_entries) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "head overrun at %d in the Control Send Queue ring\n", + ice_debug(hw, ICE_DBG_AQ_MSG, "head overrun at %d in the Control Send Queue ring\n", val); status = ICE_ERR_AQ_EMPTY; goto sq_send_command_error; @@ -912,8 +1020,7 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, * called in a separate thread in case of asynchronous completions. 
*/ if (ice_clean_sq(hw, cq) == 0) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "Error: Control Send Queue is full.\n"); + ice_debug(hw, ICE_DBG_AQ_MSG, "Error: Control Send Queue is full.\n"); status = ICE_ERR_AQ_FULL; goto sq_send_command_error; } @@ -941,10 +1048,9 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, } /* Debug desc and buffer */ - ice_debug(hw, ICE_DBG_AQ_MSG, - "ATQ: Control Send queue desc and buffer:\n"); + ice_debug(hw, ICE_DBG_AQ_DESC, "ATQ: Control Send queue desc and buffer:\n"); - ice_debug_cq(hw, ICE_DBG_AQ_CMD, (void *)desc_on_ring, buf, buf_size); + ice_debug_cq(hw, (void *)desc_on_ring, buf, buf_size); (cq->sq.next_to_use)++; if (cq->sq.next_to_use == cq->sq.count) @@ -967,8 +1073,7 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, u16 copy_size = le16_to_cpu(desc->datalen); if (copy_size > buf_size) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "Return len %d > than buf len %d\n", + ice_debug(hw, ICE_DBG_AQ_MSG, "Return len %d > than buf len %d\n", copy_size, buf_size); status = ICE_ERR_AQ_ERROR; } else { @@ -977,8 +1082,8 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, } retval = le16_to_cpu(desc->retval); if (retval) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "Control Send Queue command completed with error 0x%x\n", + ice_debug(hw, ICE_DBG_AQ_MSG, "Control Send Queue command 0x%04X completed with error 0x%X\n", + le16_to_cpu(desc->opcode), retval); /* strip off FW internal code */ @@ -990,10 +1095,9 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, cq->sq_last_status = (enum ice_aq_err)retval; } - ice_debug(hw, ICE_DBG_AQ_MSG, - "ATQ: desc and buffer writeback:\n"); + ice_debug(hw, ICE_DBG_AQ_MSG, "ATQ: desc and buffer writeback:\n"); - ice_debug_cq(hw, ICE_DBG_AQ_CMD, (void *)desc, buf, buf_size); + ice_debug_cq(hw, (void *)desc, buf, buf_size); /* save writeback AQ if requested */ if (details->wb_desc) @@ -1002,13 +1106,47 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, /* update the error if time out occurred */ if (!cmd_completed) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "Control Send Queue Writeback timeout.\n"); - status = ICE_ERR_AQ_TIMEOUT; + if (rd32(hw, cq->rq.len) & cq->rq.len_crit_mask || + rd32(hw, cq->sq.len) & cq->sq.len_crit_mask) { + ice_debug(hw, ICE_DBG_AQ_MSG, "Critical FW error.\n"); + status = ICE_ERR_AQ_FW_CRITICAL; + } else { + ice_debug(hw, ICE_DBG_AQ_MSG, "Control Send Queue Writeback timeout.\n"); + status = ICE_ERR_AQ_TIMEOUT; + } } sq_send_command_error: + return status; +} + +/** + * ice_sq_send_cmd - send command to Control Queue (ATQ) + * @hw: pointer to the HW struct + * @cq: pointer to the specific Control queue + * @desc: prefilled descriptor describing the command + * @buf: buffer to use for indirect commands (or NULL for direct commands) + * @buf_size: size of buffer for indirect commands (or 0 for direct commands) + * @cd: pointer to command details structure + * + * This is the main send command routine for the ATQ. It runs the queue, + * cleans the queue, etc. 
+ */ +enum ice_status +ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, + struct ice_aq_desc *desc, void *buf, u16 buf_size, + struct ice_sq_cd *cd) +{ + enum ice_status status = 0; + + /* if reset is in progress return a soft error */ + if (hw->reset_ongoing) + return ICE_ERR_RESET_ONGOING; + + mutex_lock(&cq->sq_lock); + status = ice_sq_send_cmd_nolock(hw, cq, desc, buf, buf_size, cd); mutex_unlock(&cq->sq_lock); + return status; } @@ -1043,6 +1181,7 @@ ice_clean_rq_elem(struct ice_hw *hw, struct ice_ctl_q_info *cq, struct ice_rq_event_info *e, u16 *pending) { u16 ntc = cq->rq.next_to_clean; + enum ice_aq_err rq_last_status; enum ice_status ret_code = 0; struct ice_aq_desc *desc; struct ice_dma_mem *bi; @@ -1058,8 +1197,7 @@ ice_clean_rq_elem(struct ice_hw *hw, struct ice_ctl_q_info *cq, mutex_lock(&cq->rq_lock); if (!cq->rq.count) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "Control Receive queue not initialized.\n"); + ice_debug(hw, ICE_DBG_AQ_MSG, "Control Receive queue not initialized.\n"); ret_code = ICE_ERR_AQ_EMPTY; goto clean_rq_elem_err; } @@ -1077,24 +1215,23 @@ ice_clean_rq_elem(struct ice_hw *hw, struct ice_ctl_q_info *cq, desc = ICE_CTL_Q_DESC(cq->rq, ntc); desc_idx = ntc; - cq->rq_last_status = (enum ice_aq_err)le16_to_cpu(desc->retval); + rq_last_status = (enum ice_aq_err)le16_to_cpu(desc->retval); flags = le16_to_cpu(desc->flags); if (flags & ICE_AQ_FLAG_ERR) { ret_code = ICE_ERR_AQ_ERROR; - ice_debug(hw, ICE_DBG_AQ_MSG, - "Control Receive Queue Event received with error 0x%x\n", - cq->rq_last_status); + ice_debug(hw, ICE_DBG_AQ_MSG, "Control Receive Queue Event 0x%04X received with error 0x%X\n", + le16_to_cpu(desc->opcode), rq_last_status); } memcpy(&e->desc, desc, sizeof(e->desc)); datalen = le16_to_cpu(desc->datalen); - e->msg_len = min(datalen, e->buf_len); + e->msg_len = min_t(u16, datalen, e->buf_len); if (e->msg_buf && e->msg_len) memcpy(e->msg_buf, cq->rq.r.rq_bi[desc_idx].va, e->msg_len); - ice_debug(hw, ICE_DBG_AQ_MSG, "ARQ: desc and buffer:\n"); + ice_debug(hw, ICE_DBG_AQ_DESC, "ARQ: desc and buffer:\n"); + + ice_debug_cq(hw, (void *)desc, e->msg_buf, cq->rq_buf_size); - ice_debug_cq(hw, ICE_DBG_AQ_CMD, (void *)desc, e->msg_buf, - cq->rq_buf_size); /* Restore the original datalen and buffer address in the desc, * FW updates datalen to indicate the event message size diff --git a/drivers/net/ethernet/intel/ice/ice_controlq.h b/drivers/net/ethernet/intel/ice/ice_controlq.h index 3b1d35365ef0..4ab82a4cf43c 100644 --- a/drivers/net/ethernet/intel/ice/ice_controlq.h +++ b/drivers/net/ethernet/intel/ice/ice_controlq.h @@ -1,39 +1,44 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_CONTROLQ_H_ #define _ICE_CONTROLQ_H_ #include "ice_adminq_cmd.h" + /* Maximum buffer lengths for all control queue types */ #define ICE_AQ_MAX_BUF_LEN 4096 #define ICE_MBXQ_MAX_BUF_LEN 4096 +#define ICE_SBQ_MAX_BUF_LEN 512 #define ICE_CTL_Q_DESC(R, i) \ (&(((struct ice_aq_desc *)((R).desc_buf.va))[i])) #define ICE_CTL_Q_DESC_UNUSED(R) \ - (u16)((((R)->next_to_clean > (R)->next_to_use) ? 0 : (R)->count) + \ - (R)->next_to_clean - (R)->next_to_use - 1) + ((u16)((((R)->next_to_clean > (R)->next_to_use) ? 0 : (R)->count) + \ + (R)->next_to_clean - (R)->next_to_use - 1)) /* Defines that help manage the driver vs FW API checks. * Take a look at ice_aq_ver_check in ice_controlq.c for actual usage. 
*/ #define EXP_FW_API_VER_BRANCH 0x00 #define EXP_FW_API_VER_MAJOR 0x01 -#define EXP_FW_API_VER_MINOR 0x03 +#define EXP_FW_API_VER_MINOR 0x05 /* Different control queue types: These are mainly for SW consumption. */ enum ice_ctl_q { ICE_CTL_Q_UNKNOWN = 0, ICE_CTL_Q_ADMIN, ICE_CTL_Q_MAILBOX, + ICE_CTL_Q_SB, }; /* Control Queue timeout settings - max delay 1s */ #define ICE_CTL_Q_SQ_CMD_TIMEOUT 10000 /* Count 10000 times */ #define ICE_CTL_Q_SQ_CMD_USEC 100 /* Check every 100usec */ +#define ICE_CTL_Q_ADMIN_INIT_TIMEOUT 10 /* Count 10 times */ +#define ICE_CTL_Q_ADMIN_INIT_MSEC 100 /* Check every 100msec */ struct ice_ctl_q_ring { void *dma_head; /* Virtual address to DMA head */ @@ -59,6 +64,7 @@ struct ice_ctl_q_ring { u32 bal; u32 len_mask; u32 len_ena_mask; + u32 len_crit_mask; u32 head_mask; }; @@ -80,7 +86,6 @@ struct ice_rq_event_info { /* Control Queue information */ struct ice_ctl_q_info { enum ice_ctl_q qtype; - enum ice_aq_err rq_last_status; /* last status on receive queue */ struct ice_ctl_q_ring rq; /* receive queue */ struct ice_ctl_q_ring sq; /* send queue */ u32 sq_cmd_timeout; /* send queue cmd write back timeout */ diff --git a/drivers/net/ethernet/intel/ice/ice_dcb.c b/drivers/net/ethernet/intel/ice/ice_dcb.c index dd7efff121bd..35b2b6c60262 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2019, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #include "ice_common.h" #include "ice_sched.h" @@ -18,7 +18,7 @@ * * Requests the complete LLDP MIB (entire packet). (0x0A00) */ -static enum ice_status +enum ice_status ice_aq_get_lldp_mib(struct ice_hw *hw, u8 bridge_type, u8 mib_type, void *buf, u16 buf_size, u16 *local_len, u16 *remote_len, struct ice_sq_cd *cd) @@ -77,6 +77,108 @@ ice_aq_cfg_lldp_mib_change(struct ice_hw *hw, bool ena_update, return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); } +/** + * ice_aq_add_delete_lldp_tlv + * @hw: pointer to the HW struct + * @bridge_type: type of bridge + * @add_lldp_tlv: add (true) or delete (false) TLV + * @buf: buffer with TLV to add or delete + * @buf_size: length of the buffer + * @tlv_len: length of the TLV to be added/deleted + * @mib_len: length of the LLDP MIB returned in response + * @cd: pointer to command details structure or NULL + * + * (Add tlv) + * Add the specified TLV to LLDP Local MIB for the given bridge type, + * it is responsibility of the caller to make sure that the TLV is not + * already present in the LLDPDU. + * In return firmware will write the complete LLDP MIB with the newly + * added TLV in the response buffer. (0x0A02) + * + * (Delete tlv) + * Delete the specified TLV from LLDP Local MIB for the given bridge type. + * The firmware places the entire LLDP MIB in the response buffer. 
(0x0A04) + */ +enum ice_status +ice_aq_add_delete_lldp_tlv(struct ice_hw *hw, u8 bridge_type, bool add_lldp_tlv, + void *buf, u16 buf_size, u16 tlv_len, u16 *mib_len, + struct ice_sq_cd *cd) +{ + struct ice_aqc_lldp_add_delete_tlv *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + if (tlv_len == 0) + return ICE_ERR_PARAM; + + cmd = &desc.params.lldp_add_delete_tlv; + + if (add_lldp_tlv) + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_add_tlv); + else + ice_fill_dflt_direct_cmd_desc(&desc, + ice_aqc_opc_lldp_delete_tlv); + + desc.flags |= cpu_to_le16((u16)(ICE_AQ_FLAG_RD)); + + cmd->type = ((bridge_type << ICE_AQ_LLDP_BRID_TYPE_S) & + ICE_AQ_LLDP_BRID_TYPE_M); + cmd->len = cpu_to_le16(tlv_len); + + status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); + if (!status && mib_len) + *mib_len = le16_to_cpu(desc.datalen); + + return status; +} + +/** + * ice_aq_update_lldp_tlv + * @hw: pointer to the HW struct + * @bridge_type: type of bridge + * @buf: buffer with TLV to update + * @buf_size: size of the buffer holding original and updated TLVs + * @old_len: Length of the Original TLV + * @new_len: Length of the Updated TLV + * @offset: offset of the updated TLV in the buff + * @mib_len: length of the returned LLDP MIB + * @cd: pointer to command details structure or NULL + * + * Update the specified TLV to the LLDP Local MIB for the given bridge type. + * Firmware will place the complete LLDP MIB in response buffer with the + * updated TLV. (0x0A03) + */ +enum ice_status +ice_aq_update_lldp_tlv(struct ice_hw *hw, u8 bridge_type, void *buf, + u16 buf_size, u16 old_len, u16 new_len, u16 offset, + u16 *mib_len, struct ice_sq_cd *cd) +{ + struct ice_aqc_lldp_update_tlv *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + cmd = &desc.params.lldp_update_tlv; + + if (offset == 0 || old_len == 0 || new_len == 0) + return ICE_ERR_PARAM; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_update_tlv); + + desc.flags |= cpu_to_le16((u16)(ICE_AQ_FLAG_RD)); + + cmd->type = ((bridge_type << ICE_AQ_LLDP_BRID_TYPE_S) & + ICE_AQ_LLDP_BRID_TYPE_M); + cmd->old_len = cpu_to_le16(old_len); + cmd->new_offset = cpu_to_le16(offset); + cmd->new_len = cpu_to_le16(new_len); + + status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); + if (!status && mib_len) + *mib_len = le16_to_cpu(desc.datalen); + + return status; +} + /** * ice_aq_stop_lldp * @hw: pointer to the HW struct @@ -134,39 +236,6 @@ ice_aq_start_lldp(struct ice_hw *hw, bool persist, struct ice_sq_cd *cd) return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); } -/** - * ice_aq_set_lldp_mib - Set the LLDP MIB - * @hw: pointer to the HW struct - * @mib_type: Local, Remote or both Local and Remote MIBs - * @buf: pointer to the caller-supplied buffer to store the MIB block - * @buf_size: size of the buffer (in bytes) - * @cd: pointer to command details structure or NULL - * - * Set the LLDP MIB. 
(0x0A08) - */ -static enum ice_status -ice_aq_set_lldp_mib(struct ice_hw *hw, u8 mib_type, void *buf, u16 buf_size, - struct ice_sq_cd *cd) -{ - struct ice_aqc_lldp_set_local_mib *cmd; - struct ice_aq_desc desc; - - cmd = &desc.params.lldp_set_mib; - - if (buf_size == 0 || !buf) - return ICE_ERR_PARAM; - - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_set_local_mib); - - desc.flags |= cpu_to_le16((u16)ICE_AQ_FLAG_RD); - desc.datalen = cpu_to_le16(buf_size); - - cmd->type = mib_type; - cmd->length = cpu_to_le16(buf_size); - - return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); -} - /** * ice_get_dcbx_status * @hw: pointer to the HW struct @@ -705,6 +774,42 @@ ice_aq_get_dcb_cfg(struct ice_hw *hw, u8 mib_type, u8 bridgetype, return ret; } +/** + * ice_aq_dcb_ignore_pfc - Ignore PFC for given TCs + * @hw: pointer to the HW struct + * @tcmap: TC map for request/release any ignore PFC condition + * @request: request (true) or release (false) ignore PFC condition + * @tcmap_ret: return TCs for which PFC is currently ignored + * @cd: pointer to command details structure or NULL + * + * This sends out request/release to ignore PFC condition for a TC. + * It will return the TCs for which PFC is currently ignored. (0x0301) + */ +enum ice_status +ice_aq_dcb_ignore_pfc(struct ice_hw *hw, u8 tcmap, bool request, u8 *tcmap_ret, + struct ice_sq_cd *cd) +{ + struct ice_aqc_pfc_ignore *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + cmd = &desc.params.pfc_ignore; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_pfc_ignore); + + if (request) + cmd->cmd_flags = ICE_AQC_PFC_IGNORE_SET; + + cmd->tc_bitmap = tcmap; + + status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + + if (!status && tcmap_ret) + *tcmap_ret = cmd->tc_bitmap; + + return status; +} + /** * ice_aq_start_stop_dcbx - Start/Stop DCBX service in FW * @hw: pointer to the HW struct @@ -768,25 +873,126 @@ ice_aq_get_cee_dcb_cfg(struct ice_hw *hw, return ice_aq_send_cmd(hw, &desc, (void *)buff, sizeof(*buff), cd); } +/** + * ice_aq_query_pfc_mode - Query PFC mode + * @hw: pointer to the HW struct + * @pfcmode_ret: Return PFC mode + * @cd: pointer to command details structure or NULL + * + * This will return an indication if DSCP-based PFC or VLAN-based PFC + * is enabled. 
(0x0302) + */ +enum ice_status +ice_aq_query_pfc_mode(struct ice_hw *hw, u8 *pfcmode_ret, struct ice_sq_cd *cd) +{ + struct ice_aqc_set_query_pfc_mode *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + cmd = &desc.params.set_query_pfc_mode; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_query_pfc_mode); + + status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + + if (!status) + *pfcmode_ret = cmd->pfc_mode; + + return status; +} + +/** + * ice_aq_set_pfc_mode - Set PFC mode + * @hw: pointer to the HW struct + * @pfc_mode: value of PFC mode to set + * @cd: pointer to command details structure or NULL + * + * This AQ call configures the PFC mode to DSCP-based PFC mode or VLAN + * -based PFC (0x0303) + */ +enum ice_status +ice_aq_set_pfc_mode(struct ice_hw *hw, u8 pfc_mode, struct ice_sq_cd *cd) +{ + struct ice_aqc_set_query_pfc_mode *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + if (pfc_mode > ICE_AQC_PFC_DSCP_BASED_PFC) + return ICE_ERR_PARAM; + + cmd = &desc.params.set_query_pfc_mode; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_pfc_mode); + + cmd->pfc_mode = pfc_mode; + + status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + if (status) + return status; + + /* The spec isn't clear about whether the FW will return an error code + * if the PFC mode requested by the driver was not set. The spec just + * says that the FW will write the PFC mode set back into cmd->pfc_mode, + * so after the AQ has been executed, check if cmd->pfc_mode is what was + * requested. + */ + if (cmd->pfc_mode != pfc_mode) + return ICE_ERR_NOT_SUPPORTED; + + return 0; +} + +/** + * ice_aq_set_dcb_parameters - Set DCB parameters + * @hw: pointer to the HW struct + * @dcb_enable: True if DCB configuration needs to be applied + * @cd: pointer to command details structure or NULL + * + * This AQ command will tell FW if it will apply or not apply the default DCB + * configuration when link comes up (0x0306). 
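/*
 * Editor's illustration (not part of the patch): a minimal, hypothetical
 * caller sketch for the PFC-mode helpers added above. The errno mapping and
 * the ice_example_* name are assumptions for illustration only; the point is
 * the "query, set, and trust the value echoed back by firmware" contract
 * that ice_aq_set_pfc_mode() documents.
 */
static int ice_example_switch_to_dscp_pfc(struct ice_hw *hw)
{
	enum ice_status status;
	u8 cur_mode;

	/* read the PFC mode currently active in firmware */
	status = ice_aq_query_pfc_mode(hw, &cur_mode, NULL);
	if (status)
		return -EIO;

	if (cur_mode == ICE_AQC_PFC_DSCP_BASED_PFC)
		return 0;	/* already DSCP-based, nothing to do */

	/* ice_aq_set_pfc_mode() re-checks cmd->pfc_mode after the AQ call and
	 * returns ICE_ERR_NOT_SUPPORTED if firmware did not apply the request
	 */
	status = ice_aq_set_pfc_mode(hw, ICE_AQC_PFC_DSCP_BASED_PFC, NULL);
	if (status == ICE_ERR_NOT_SUPPORTED)
		return -EOPNOTSUPP;

	return status ? -EIO : 0;
}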
+ */ +enum ice_status +ice_aq_set_dcb_parameters(struct ice_hw *hw, bool dcb_enable, + struct ice_sq_cd *cd) +{ + struct ice_aqc_set_dcb_params *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.set_dcb_params; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_dcb_params); + + cmd->valid_flags = ICE_AQC_LINK_UP_DCB_CFG_VALID; + if (dcb_enable) + cmd->cmd_flags = ICE_AQC_LINK_UP_DCB_CFG; + + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} + /** * ice_cee_to_dcb_cfg * @cee_cfg: pointer to CEE configuration struct - * @dcbcfg: DCB configuration struct + * @pi: port information structure * * Convert CEE configuration from firmware to DCB configuration */ static void ice_cee_to_dcb_cfg(struct ice_aqc_get_cee_dcb_cfg_resp *cee_cfg, - struct ice_dcbx_cfg *dcbcfg) + struct ice_port_info *pi) { u32 status, tlv_status = le32_to_cpu(cee_cfg->tlv_status); u32 ice_aqc_cee_status_mask, ice_aqc_cee_status_shift; + u8 i, j, err, sync, oper, app_index, ice_app_sel_type; u16 app_prio = le16_to_cpu(cee_cfg->oper_app_prio); - u8 i, err, sync, oper, app_index, ice_app_sel_type; u16 ice_aqc_cee_app_mask, ice_aqc_cee_app_shift; + struct ice_dcbx_cfg *cmp_dcbcfg, *dcbcfg; u16 ice_app_prot_id_type; - /* CEE PG data to ETS config */ + dcbcfg = &pi->qos_cfg.local_dcbx_cfg; + dcbcfg->dcbx_mode = ICE_DCBX_MODE_CEE; + dcbcfg->tlv_status = tlv_status; + + /* CEE PG data */ dcbcfg->etscfg.maxtcs = cee_cfg->oper_num_tc; /* Note that the FW creates the oper_prio_tc nibbles reversed @@ -813,10 +1019,16 @@ ice_cee_to_dcb_cfg(struct ice_aqc_get_cee_dcb_cfg_resp *cee_cfg, } } - /* CEE PFC data to ETS config */ + /* CEE PFC data */ dcbcfg->pfc.pfcena = cee_cfg->oper_pfc_en; dcbcfg->pfc.pfccap = ICE_MAX_TRAFFIC_CLASS; + /* CEE APP TLV data */ + if (dcbcfg->app_mode == ICE_DCBX_APPS_NON_WILLING) + cmp_dcbcfg = &pi->qos_cfg.desired_dcbx_cfg; + else + cmp_dcbcfg = &pi->qos_cfg.remote_dcbx_cfg; + app_index = 0; for (i = 0; i < 3; i++) { if (i == 0) { @@ -826,7 +1038,7 @@ ice_cee_to_dcb_cfg(struct ice_aqc_get_cee_dcb_cfg_resp *cee_cfg, ice_aqc_cee_app_mask = ICE_AQC_CEE_APP_FCOE_M; ice_aqc_cee_app_shift = ICE_AQC_CEE_APP_FCOE_S; ice_app_sel_type = ICE_APP_SEL_ETHTYPE; - ice_app_prot_id_type = ICE_APP_PROT_ID_FCOE; + ice_app_prot_id_type = ETH_P_FCOE; } else if (i == 1) { /* iSCSI APP */ ice_aqc_cee_status_mask = ICE_AQC_CEE_ISCSI_STATUS_M; @@ -834,7 +1046,19 @@ ice_cee_to_dcb_cfg(struct ice_aqc_get_cee_dcb_cfg_resp *cee_cfg, ice_aqc_cee_app_mask = ICE_AQC_CEE_APP_ISCSI_M; ice_aqc_cee_app_shift = ICE_AQC_CEE_APP_ISCSI_S; ice_app_sel_type = ICE_APP_SEL_TCPIP; - ice_app_prot_id_type = ICE_APP_PROT_ID_ISCSI; + ice_app_prot_id_type = ISCSI_LISTEN_PORT; + + for (j = 0; j < cmp_dcbcfg->numapps; j++) { + u16 prot_id = cmp_dcbcfg->app[j].prot_id; + u8 sel = cmp_dcbcfg->app[j].selector; + + if (sel == ICE_APP_SEL_TCPIP && + (prot_id == ISCSI_LISTEN_PORT || + prot_id == ICE_APP_PROT_ID_ISCSI_860)) { + ice_app_prot_id_type = prot_id; + break; + } + } } else { /* FIP APP */ ice_aqc_cee_status_mask = ICE_AQC_CEE_FIP_STATUS_M; @@ -842,7 +1066,7 @@ ice_cee_to_dcb_cfg(struct ice_aqc_get_cee_dcb_cfg_resp *cee_cfg, ice_aqc_cee_app_mask = ICE_AQC_CEE_APP_FIP_M; ice_aqc_cee_app_shift = ICE_AQC_CEE_APP_FIP_S; ice_app_sel_type = ICE_APP_SEL_ETHTYPE; - ice_app_prot_id_type = ICE_APP_PROT_ID_FIP; + ice_app_prot_id_type = ETH_P_FIP; } status = (tlv_status & ice_aqc_cee_status_mask) >> @@ -867,7 +1091,7 @@ ice_cee_to_dcb_cfg(struct ice_aqc_get_cee_dcb_cfg_resp *cee_cfg, } /** - * ice_get_ieee_dcb_cfg + * ice_get_ieee_or_cee_dcb_cfg * @pi: port 
information structure * @dcbx_mode: mode of DCBX (IEEE or CEE) * @@ -883,9 +1107,9 @@ ice_get_ieee_or_cee_dcb_cfg(struct ice_port_info *pi, u8 dcbx_mode) return ICE_ERR_PARAM; if (dcbx_mode == ICE_DCBX_MODE_IEEE) - dcbx_cfg = &pi->local_dcbx_cfg; + dcbx_cfg = &pi->qos_cfg.local_dcbx_cfg; else if (dcbx_mode == ICE_DCBX_MODE_CEE) - dcbx_cfg = &pi->desired_dcbx_cfg; + dcbx_cfg = &pi->qos_cfg.desired_dcbx_cfg; /* Get Local DCB Config in case of ICE_DCBX_MODE_IEEE * or get CEE DCB Desired Config in case of ICE_DCBX_MODE_CEE @@ -896,7 +1120,7 @@ ice_get_ieee_or_cee_dcb_cfg(struct ice_port_info *pi, u8 dcbx_mode) goto out; /* Get Remote DCB Config */ - dcbx_cfg = &pi->remote_dcbx_cfg; + dcbx_cfg = &pi->qos_cfg.remote_dcbx_cfg; ret = ice_aq_get_dcb_cfg(pi->hw, ICE_AQ_LLDP_MIB_REMOTE, ICE_AQ_LLDP_BRID_TYPE_NEAREST_BRID, dcbx_cfg); /* Don't treat ENOENT as an error for Remote MIBs */ @@ -925,14 +1149,11 @@ enum ice_status ice_get_dcb_cfg(struct ice_port_info *pi) ret = ice_aq_get_cee_dcb_cfg(pi->hw, &cee_cfg, NULL); if (!ret) { /* CEE mode */ - dcbx_cfg = &pi->local_dcbx_cfg; - dcbx_cfg->dcbx_mode = ICE_DCBX_MODE_CEE; - dcbx_cfg->tlv_status = le32_to_cpu(cee_cfg.tlv_status); - ice_cee_to_dcb_cfg(&cee_cfg, dcbx_cfg); ret = ice_get_ieee_or_cee_dcb_cfg(pi, ICE_DCBX_MODE_CEE); + ice_cee_to_dcb_cfg(&cee_cfg, pi); } else if (pi->hw->adminq.sq_last_status == ICE_AQ_RC_ENOENT) { /* CEE mode not enabled try querying IEEE data */ - dcbx_cfg = &pi->local_dcbx_cfg; + dcbx_cfg = &pi->qos_cfg.local_dcbx_cfg; dcbx_cfg->dcbx_mode = ICE_DCBX_MODE_IEEE; ret = ice_get_ieee_or_cee_dcb_cfg(pi, ICE_DCBX_MODE_IEEE); } @@ -949,34 +1170,34 @@ enum ice_status ice_get_dcb_cfg(struct ice_port_info *pi) */ enum ice_status ice_init_dcb(struct ice_hw *hw, bool enable_mib_change) { - struct ice_port_info *pi = hw->port_info; + struct ice_qos_cfg *qos_cfg = &hw->port_info->qos_cfg; enum ice_status ret = 0; if (!hw->func_caps.common_cap.dcb) return ICE_ERR_NOT_SUPPORTED; - pi->is_sw_lldp = true; + qos_cfg->is_sw_lldp = true; /* Get DCBX status */ - pi->dcbx_status = ice_get_dcbx_status(hw); + qos_cfg->dcbx_status = ice_get_dcbx_status(hw); - if (pi->dcbx_status == ICE_DCBX_STATUS_DONE || - pi->dcbx_status == ICE_DCBX_STATUS_IN_PROGRESS || - pi->dcbx_status == ICE_DCBX_STATUS_NOT_STARTED) { + if (qos_cfg->dcbx_status == ICE_DCBX_STATUS_DONE || + qos_cfg->dcbx_status == ICE_DCBX_STATUS_IN_PROGRESS || + qos_cfg->dcbx_status == ICE_DCBX_STATUS_NOT_STARTED) { /* Get current DCBX configuration */ - ret = ice_get_dcb_cfg(pi); - pi->is_sw_lldp = (hw->adminq.sq_last_status == ICE_AQ_RC_EPERM); + ret = ice_get_dcb_cfg(hw->port_info); if (ret) return ret; - } else if (pi->dcbx_status == ICE_DCBX_STATUS_DIS) { + qos_cfg->is_sw_lldp = false; + } else if (qos_cfg->dcbx_status == ICE_DCBX_STATUS_DIS) { return ICE_ERR_NOT_READY; } /* Configure the LLDP MIB change event */ if (enable_mib_change) { ret = ice_aq_cfg_lldp_mib_change(hw, true, NULL); - if (!ret) - pi->is_sw_lldp = false; + if (ret) + qos_cfg->is_sw_lldp = true; } return ret; @@ -991,21 +1212,21 @@ enum ice_status ice_init_dcb(struct ice_hw *hw, bool enable_mib_change) */ enum ice_status ice_cfg_lldp_mib_change(struct ice_hw *hw, bool ena_mib) { - struct ice_port_info *pi = hw->port_info; + struct ice_qos_cfg *qos_cfg = &hw->port_info->qos_cfg; enum ice_status ret; if (!hw->func_caps.common_cap.dcb) return ICE_ERR_NOT_SUPPORTED; /* Get DCBX status */ - pi->dcbx_status = ice_get_dcbx_status(hw); + qos_cfg->dcbx_status = ice_get_dcbx_status(hw); - if (pi->dcbx_status == 
ICE_DCBX_STATUS_DIS) + if (qos_cfg->dcbx_status == ICE_DCBX_STATUS_DIS) return ICE_ERR_NOT_READY; ret = ice_aq_cfg_lldp_mib_change(hw, ena_mib, NULL); if (!ret) - pi->is_sw_lldp = !ena_mib; + qos_cfg->is_sw_lldp = !ena_mib; return ret; } @@ -1220,7 +1441,140 @@ ice_add_ieee_app_pri_tlv(struct ice_lldp_org_tlv *tlv, } /** - * ice_add_dcb_tlv - Add all IEEE TLVs + * ice_add_dscp_up_tlv - Prepare DSCP to UP TLV + * @tlv: location to build the TLV data + * @dcbcfg: location of data to convert to TLV + */ +static void +ice_add_dscp_up_tlv(struct ice_lldp_org_tlv *tlv, struct ice_dcbx_cfg *dcbcfg) +{ + u8 *buf = tlv->tlvinfo; + u32 ouisubtype; + u16 typelen; + int i; + + typelen = ((ICE_TLV_TYPE_ORG << ICE_LLDP_TLV_TYPE_S) | + ICE_DSCP_UP_TLV_LEN); + tlv->typelen = htons(typelen); + + ouisubtype = (u32)((ICE_DSCP_OUI << ICE_LLDP_TLV_OUI_S) | + ICE_DSCP_SUBTYPE_DSCP2UP); + tlv->ouisubtype = htonl(ouisubtype); + + /* bytes 0 - 63 - IPv4 DSCP2UP LUT */ + for (i = 0; i < ICE_DSCP_NUM_VAL; i++) { + /* IPv4 mapping */ + buf[i] = dcbcfg->dscp_map[i]; + /* IPv6 mapping */ + buf[i + ICE_DSCP_IPV6_OFFSET] = dcbcfg->dscp_map[i]; + } + + /* byte 64 - IPv4 untagged traffic */ + buf[i] = 0; + + /* byte 144 - IPv6 untagged traffic */ + buf[i + ICE_DSCP_IPV6_OFFSET] = 0; +} + +#define ICE_BYTES_PER_TC 8 +/** + * ice_add_dscp_enf_tlv - Prepare DSCP Enforcement TLV + * @tlv: location to build the TLV data + */ +static void +ice_add_dscp_enf_tlv(struct ice_lldp_org_tlv *tlv) +{ + u8 *buf = tlv->tlvinfo; + u32 ouisubtype; + u16 typelen; + + typelen = ((ICE_TLV_TYPE_ORG << ICE_LLDP_TLV_TYPE_S) | + ICE_DSCP_ENF_TLV_LEN); + tlv->typelen = htons(typelen); + + ouisubtype = (u32)((ICE_DSCP_OUI << ICE_LLDP_TLV_OUI_S) | + ICE_DSCP_SUBTYPE_ENFORCE); + tlv->ouisubtype = htonl(ouisubtype); + + /* Allow all DSCP values to be valid for all TC's (IPv4 and IPv6) */ + memset(buf, 0, 2 * (ICE_MAX_TRAFFIC_CLASS * ICE_BYTES_PER_TC)); +} + +/** + * ice_add_dscp_tc_bw_tlv - Prepare DSCP BW for TC TLV + * @tlv: location to build the TLV data + * @dcbcfg: location of the data to convert to TLV + */ +static void +ice_add_dscp_tc_bw_tlv(struct ice_lldp_org_tlv *tlv, + struct ice_dcbx_cfg *dcbcfg) +{ + struct ice_dcb_ets_cfg *etscfg; + u8 *buf = tlv->tlvinfo; + u32 ouisubtype; + u8 offset = 0; + u16 typelen; + int i; + + typelen = ((ICE_TLV_TYPE_ORG << ICE_LLDP_TLV_TYPE_S) | + ICE_DSCP_TC_BW_TLV_LEN); + tlv->typelen = htons(typelen); + + ouisubtype = (u32)((ICE_DSCP_OUI << ICE_LLDP_TLV_OUI_S) | + ICE_DSCP_SUBTYPE_TCBW); + tlv->ouisubtype = htonl(ouisubtype); + + /* First Octect after subtype + * ---------------------------- + * | RSV | CBS | RSV | Max TCs | + * | 1b | 1b | 3b | 3b | + * ---------------------------- + */ + etscfg = &dcbcfg->etscfg; + buf[0] = etscfg->maxtcs & ICE_IEEE_ETS_MAXTC_M; + + /* bytes 1 - 4 reserved */ + offset = 5; + + /* TC BW table + * bytes 0 - 7 for TC 0 - 7 + * + * TSA Assignment table + * bytes 8 - 15 for TC 0 - 7 + */ + for (i = 0; i < ICE_MAX_TRAFFIC_CLASS; i++) { + buf[offset] = etscfg->tcbwtable[i]; + buf[offset + ICE_MAX_TRAFFIC_CLASS] = etscfg->tsatable[i]; + offset++; + } +} + +/** + * ice_add_dscp_pfc_tlv - Prepare DSCP PFC TLV + * @tlv: Fill PFC TLV in IEEE format + * @dcbcfg: Local store which holds the PFC CFG data + */ +static void +ice_add_dscp_pfc_tlv(struct ice_lldp_org_tlv *tlv, struct ice_dcbx_cfg *dcbcfg) +{ + u8 *buf = tlv->tlvinfo; + u32 ouisubtype; + u16 typelen; + + typelen = ((ICE_TLV_TYPE_ORG << ICE_LLDP_TLV_TYPE_S) | + ICE_DSCP_PFC_TLV_LEN); + tlv->typelen = htons(typelen); + + 
ouisubtype = (u32)((ICE_DSCP_OUI << ICE_LLDP_TLV_OUI_S) | + ICE_DSCP_SUBTYPE_PFC); + tlv->ouisubtype = htonl(ouisubtype); + + buf[0] = dcbcfg->pfc.pfccap & 0xF; + buf[1] = dcbcfg->pfc.pfcena & 0xF; +} + +/** + * ice_add_dcb_tlv - Add all IEEE or DSCP TLVs * @tlv: Fill TLV data in IEEE format * @dcbcfg: Local store which holds the DCB Config * @tlvid: Type of IEEE TLV @@ -1231,21 +1585,41 @@ static void ice_add_dcb_tlv(struct ice_lldp_org_tlv *tlv, struct ice_dcbx_cfg *dcbcfg, u16 tlvid) { - switch (tlvid) { - case ICE_IEEE_TLV_ID_ETS_CFG: - ice_add_ieee_ets_tlv(tlv, dcbcfg); - break; - case ICE_IEEE_TLV_ID_ETS_REC: - ice_add_ieee_etsrec_tlv(tlv, dcbcfg); - break; - case ICE_IEEE_TLV_ID_PFC_CFG: - ice_add_ieee_pfc_tlv(tlv, dcbcfg); - break; - case ICE_IEEE_TLV_ID_APP_PRI: - ice_add_ieee_app_pri_tlv(tlv, dcbcfg); - break; - default: - break; + if (dcbcfg->pfc_mode == ICE_QOS_MODE_VLAN) { + switch (tlvid) { + case ICE_IEEE_TLV_ID_ETS_CFG: + ice_add_ieee_ets_tlv(tlv, dcbcfg); + break; + case ICE_IEEE_TLV_ID_ETS_REC: + ice_add_ieee_etsrec_tlv(tlv, dcbcfg); + break; + case ICE_IEEE_TLV_ID_PFC_CFG: + ice_add_ieee_pfc_tlv(tlv, dcbcfg); + break; + case ICE_IEEE_TLV_ID_APP_PRI: + ice_add_ieee_app_pri_tlv(tlv, dcbcfg); + break; + default: + break; + } + } else { + /* pfc_mode == ICE_QOS_MODE_DSCP */ + switch (tlvid) { + case ICE_TLV_ID_DSCP_UP: + ice_add_dscp_up_tlv(tlv, dcbcfg); + break; + case ICE_TLV_ID_DSCP_ENF: + ice_add_dscp_enf_tlv(tlv); + break; + case ICE_TLV_ID_DSCP_TC_BW: + ice_add_dscp_tc_bw_tlv(tlv, dcbcfg); + break; + case ICE_TLV_ID_DSCP_TO_PFC: + ice_add_dscp_pfc_tlv(tlv, dcbcfg); + break; + default: + break; + } } } @@ -1303,7 +1677,7 @@ enum ice_status ice_set_dcb_cfg(struct ice_port_info *pi) hw = pi->hw; /* update the HW local config */ - dcbcfg = &pi->local_dcbx_cfg; + dcbcfg = &pi->qos_cfg.local_dcbx_cfg; /* Allocate the LLDPDU */ lldpmib = devm_kzalloc(ice_hw_to_dev(hw), ICE_LLDPDU_SIZE, GFP_KERNEL); if (!lldpmib) @@ -1323,13 +1697,13 @@ enum ice_status ice_set_dcb_cfg(struct ice_port_info *pi) } /** - * ice_aq_query_port_ets - query port ets configuration + * ice_aq_query_port_ets - query port ETS configuration * @pi: port information structure * @buf: pointer to buffer * @buf_size: buffer size in bytes * @cd: pointer to command details structure or NULL * - * query current port ets configuration + * query current port ETS configuration */ static enum ice_status ice_aq_query_port_ets(struct ice_port_info *pi, @@ -1362,7 +1736,7 @@ ice_update_port_tc_tree_cfg(struct ice_port_info *pi, struct ice_aqc_port_ets_elem *buf) { struct ice_sched_node *node, *tc_node; - struct ice_aqc_get_elem elem; + struct ice_aqc_txsched_elem_data elem; enum ice_status status = 0; u32 teid1, teid2; u8 i, j; @@ -1404,7 +1778,7 @@ ice_update_port_tc_tree_cfg(struct ice_port_info *pi, /* new TC */ status = ice_sched_query_elem(pi->hw, teid2, &elem); if (!status) - status = ice_sched_add_node(pi, 1, &elem.generic[0]); + status = ice_sched_add_node(pi, 1, &elem); if (status) break; /* update the TC number */ @@ -1416,13 +1790,13 @@ ice_update_port_tc_tree_cfg(struct ice_port_info *pi, } /** - * ice_query_port_ets - query port ets configuration + * ice_query_port_ets - query port ETS configuration * @pi: port information structure * @buf: pointer to buffer * @buf_size: buffer size in bytes * @cd: pointer to command details structure or NULL * - * query current port ets configuration and update the + * query current port ETS configuration and update the * SW DB with the TC changes */ enum ice_status diff 
--git a/drivers/net/ethernet/intel/ice/ice_dcb.h b/drivers/net/ethernet/intel/ice/ice_dcb.h index ee138f9bdc7c..370f673c5746 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb.h +++ b/drivers/net/ethernet/intel/ice/ice_dcb.h @@ -1,14 +1,16 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2019, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_DCB_H_ #define _ICE_DCB_H_ #include "ice_type.h" +#include "ice_common.h" #define ICE_DCBX_STATUS_NOT_STARTED 0 #define ICE_DCBX_STATUS_IN_PROGRESS 1 #define ICE_DCBX_STATUS_DONE 2 +#define ICE_DCBX_STATUS_MULTIPLE_PEERS 3 #define ICE_DCBX_STATUS_DIS 7 #define ICE_TLV_TYPE_END 0 @@ -22,9 +24,19 @@ #define ICE_CEE_DCBX_OUI 0x001B21 #define ICE_CEE_DCBX_TYPE 2 + +#define ICE_DSCP_OUI 0xFFFFFF +#define ICE_DSCP_SUBTYPE_DSCP2UP 0x41 +#define ICE_DSCP_SUBTYPE_ENFORCE 0x42 +#define ICE_DSCP_SUBTYPE_TCBW 0x43 +#define ICE_DSCP_SUBTYPE_PFC 0x44 +#define ICE_DSCP_IPV6_OFFSET 80 + +#define ICE_CEE_SUBTYPE_CTRL 1 #define ICE_CEE_SUBTYPE_PG_CFG 2 #define ICE_CEE_SUBTYPE_PFC_CFG 3 #define ICE_CEE_SUBTYPE_APP_PRI 4 + #define ICE_CEE_MAX_FEAT_TYPE 3 /* Defines for LLDP TLV header */ #define ICE_LLDP_TLV_LEN_S 0 @@ -72,22 +84,34 @@ #define ICE_IEEE_APP_PRIO_M (0x7 << ICE_IEEE_APP_PRIO_S) /* TLV definitions for preparing MIB */ +#define ICE_TLV_ID_CHASSIS_ID 0 +#define ICE_TLV_ID_PORT_ID 1 +#define ICE_TLV_ID_TIME_TO_LIVE 2 #define ICE_IEEE_TLV_ID_ETS_CFG 3 #define ICE_IEEE_TLV_ID_ETS_REC 4 #define ICE_IEEE_TLV_ID_PFC_CFG 5 #define ICE_IEEE_TLV_ID_APP_PRI 6 #define ICE_TLV_ID_END_OF_LLDPPDU 7 #define ICE_TLV_ID_START ICE_IEEE_TLV_ID_ETS_CFG +#define ICE_TLV_ID_DSCP_UP 3 +#define ICE_TLV_ID_DSCP_ENF 4 +#define ICE_TLV_ID_DSCP_TC_BW 5 +#define ICE_TLV_ID_DSCP_TO_PFC 6 #define ICE_IEEE_ETS_TLV_LEN 25 #define ICE_IEEE_PFC_TLV_LEN 6 #define ICE_IEEE_APP_TLV_LEN 11 +#define ICE_DSCP_UP_TLV_LEN 148 +#define ICE_DSCP_ENF_TLV_LEN 132 +#define ICE_DSCP_TC_BW_TLV_LEN 25 +#define ICE_DSCP_PFC_TLV_LEN 6 + /* IEEE 802.1AB LLDP Organization specific TLV */ struct ice_lldp_org_tlv { __be16 typelen; __be32 ouisubtype; - u8 tlvinfo[1]; + u8 tlvinfo[]; } __packed; struct ice_cee_tlv_hdr { @@ -109,7 +133,7 @@ struct ice_cee_feat_tlv { #define ICE_CEE_FEAT_TLV_WILLING_M 0x40 #define ICE_CEE_FEAT_TLV_ERR_M 0x20 u8 subtype; - u8 tlvinfo[1]; + u8 tlvinfo[]; }; struct ice_cee_app_prio { @@ -120,6 +144,29 @@ struct ice_cee_app_prio { u8 prio_map; } __packed; + +enum ice_status +ice_aq_get_lldp_mib(struct ice_hw *hw, u8 bridge_type, u8 mib_type, void *buf, + u16 buf_size, u16 *local_len, u16 *remote_len, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_add_delete_lldp_tlv(struct ice_hw *hw, u8 bridge_type, bool add_lldp_tlv, + void *buf, u16 buf_size, u16 tlv_len, u16 *mib_len, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_update_lldp_tlv(struct ice_hw *hw, u8 bridge_type, void *buf, + u16 buf_size, u16 old_len, u16 new_len, u16 offset, + u16 *mib_len, struct ice_sq_cd *cd); +enum ice_status +ice_aq_dcb_ignore_pfc(struct ice_hw *hw, u8 tcmap, bool request, u8 *tcmap_ret, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_query_pfc_mode(struct ice_hw *hw, u8 *pfcmode_ret, struct ice_sq_cd *cd); +enum ice_status +ice_aq_set_dcb_parameters(struct ice_hw *hw, bool dcb_enable, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_set_pfc_mode(struct ice_hw *hw, u8 pfc_mode, struct ice_sq_cd *cd); enum ice_status ice_aq_get_dcb_cfg(struct ice_hw *hw, u8 mib_type, u8 bridgetype, struct ice_dcbx_cfg *dcbcfg); diff --git 
a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c index dd47869c4ad4..8475e0a52925 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c @@ -1,71 +1,92 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2019, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #include "ice_dcb_lib.h" +#include "ice_dcb_nl.h" /** - * ice_vsi_cfg_netdev_tc - Setup the netdev TC configuration - * @vsi: the VSI being configured - * @ena_tc: TC map to be enabled + * ice_is_pfc_causing_hung_q + * @pf: pointer to PF structure + * @txqueue: Tx queue which is supposedly hung queue + * + * find if PFC is causing the hung queue, if yes return true else false */ -void ice_vsi_cfg_netdev_tc(struct ice_vsi *vsi, u8 ena_tc) +bool ice_is_pfc_causing_hung_q(struct ice_pf *pf, unsigned int txqueue) { - struct net_device *netdev = vsi->netdev; - struct ice_pf *pf = vsi->back; - struct ice_dcbx_cfg *dcbcfg; - u8 netdev_tc; - int i; - - if (!netdev) - return; + u8 num_tcs = 0, i, tc, up_mapped_tc, up_in_tc = 0; + u64 ref_prio_xoff[ICE_MAX_UP]; + struct ice_vsi *vsi; + u32 up2tc; - if (!ena_tc) { - netdev_reset_tc(netdev); - return; - } - - if (netdev_set_num_tc(netdev, vsi->tc_cfg.numtc)) - return; - - dcbcfg = &pf->hw.port_info->local_dcbx_cfg; + vsi = ice_get_main_vsi(pf); + if (!vsi) + return false; ice_for_each_traffic_class(i) if (vsi->tc_cfg.ena_tc & BIT(i)) - netdev_set_tc_queue(netdev, - vsi->tc_cfg.tc_info[i].netdev_tc, - vsi->tc_cfg.tc_info[i].qcount_tx, - vsi->tc_cfg.tc_info[i].qoffset); + num_tcs++; - for (i = 0; i < ICE_MAX_USER_PRIORITY; i++) { - u8 ets_tc = dcbcfg->etscfg.prio_table[i]; + /* first find out the TC to which the hung queue belongs to */ + for (tc = 0; tc < num_tcs - 1; tc++) + if (ice_find_q_in_range(vsi->tc_cfg.tc_info[tc].qoffset, + vsi->tc_cfg.tc_info[tc + 1].qoffset, + txqueue)) + break; - /* Get the mapped netdev TC# for the UP */ - netdev_tc = vsi->tc_cfg.tc_info[ets_tc].netdev_tc; - netdev_set_prio_tc_map(netdev, i, netdev_tc); + /* Build a bit map of all UPs associated to the suspect hung queue TC, + * so that we check for its counter increment. + */ + up2tc = rd32(&pf->hw, PRTDCB_TUP2TC); + for (i = 0; i < ICE_MAX_UP; i++) { + up_mapped_tc = (up2tc >> (i * 3)) & 0x7; + if (up_mapped_tc == tc) + up_in_tc |= BIT(i); } + + /* Now that we figured out that hung queue is PFC enabled, still the + * Tx timeout can be legitimate. So to make sure Tx timeout is + * absolutely caused by PFC storm, check if the counters are + * incrementing. 
+ */ + for (i = 0; i < ICE_MAX_UP; i++) + if (up_in_tc & BIT(i)) + ref_prio_xoff[i] = pf->stats.priority_xoff_rx[i]; + + ice_update_dcb_stats(pf); + + for (i = 0; i < ICE_MAX_UP; i++) + if (up_in_tc & BIT(i)) + if (pf->stats.priority_xoff_rx[i] > ref_prio_xoff[i]) + return true; + + return false; } /** - * ice_dcb_get_ena_tc - return bitmap of enabled TCs - * @dcbcfg: DCB config to evaluate for enabled TCs + * ice_dcb_get_mode - gets the DCB mode + * @port_info: pointer to port info structure + * @host: if set it's HOST if not it's MANAGED */ -u8 ice_dcb_get_ena_tc(struct ice_dcbx_cfg *dcbcfg) +static u8 ice_dcb_get_mode(struct ice_port_info *port_info, bool host) { - u8 i, num_tc, ena_tc = 1; - - num_tc = ice_dcb_get_num_tc(dcbcfg); + u8 mode; - for (i = 0; i < num_tc; i++) - ena_tc |= BIT(i); + if (host) + mode = DCB_CAP_DCBX_HOST; + else + mode = DCB_CAP_DCBX_LLD_MANAGED; - return ena_tc; + if (port_info->qos_cfg.local_dcbx_cfg.dcbx_mode & ICE_DCBX_MODE_CEE) + return mode | DCB_CAP_DCBX_VER_CEE; + else + return mode | DCB_CAP_DCBX_VER_IEEE; } /** * ice_dcb_get_num_tc - Get the number of TCs from DCBX config * @dcbcfg: config to retrieve number of TCs from */ -u8 ice_dcb_get_num_tc(struct ice_dcbx_cfg *dcbcfg) +static u8 ice_dcb_get_num_tc(struct ice_dcbx_cfg *dcbcfg) { bool tc_unused = false; u8 num_tc = 0; @@ -99,26 +120,120 @@ u8 ice_dcb_get_num_tc(struct ice_dcbx_cfg *dcbcfg) return ret; } +/** + * ice_dcb_get_ena_tc - return bitmap of enabled TCs + * @dcbcfg: DCB config to evaluate for enabled TCs + */ +static u8 ice_dcb_get_ena_tc(struct ice_dcbx_cfg *dcbcfg) +{ + u8 i, num_tc, ena_tc = 1; + + num_tc = ice_dcb_get_num_tc(dcbcfg); + + for (i = 0; i < num_tc; i++) + ena_tc |= BIT(i); + + return ena_tc; +} + +/** + * ice_get_first_droptc - returns number of first droptc + * @vsi: used to find the first droptc + * + * This function returns the value of first_droptc. + * When DCB is enabled, first droptc information is derived from enabled_tc + * and PFC enabled bits. 
otherwise this function returns 0 as there is one + * TC without DCB (tc0) + */ +static u8 ice_get_first_droptc(struct ice_vsi *vsi) +{ + struct ice_dcbx_cfg *cfg = &vsi->port_info->qos_cfg.local_dcbx_cfg; + struct device *dev = ice_pf_to_dev(vsi->back); + u8 num_tc, ena_tc_map, pfc_ena_map; + u8 i; + + num_tc = ice_dcb_get_num_tc(cfg); + + /* get bitmap of enabled TCs */ + ena_tc_map = ice_dcb_get_ena_tc(cfg); + + /* get bitmap of PFC enabled TCs */ + pfc_ena_map = cfg->pfc.pfcena; + + /* get first TC that is not PFC enabled */ + for (i = 0; i < num_tc; i++) { + if ((ena_tc_map & BIT(i)) && (!(pfc_ena_map & BIT(i)))) { + dev_dbg(dev, "first drop tc = %d\n", i); + return i; + } + } + + dev_dbg(dev, "first drop tc = 0\n"); + return 0; +} + +/** + * ice_vsi_set_dcb_tc_cfg - Set VSI's TC based on DCB configuration + * @vsi: pointer to the VSI instance + */ +void ice_vsi_set_dcb_tc_cfg(struct ice_vsi *vsi) +{ + struct ice_dcbx_cfg *cfg = &vsi->port_info->qos_cfg.local_dcbx_cfg; + + switch (vsi->type) { + case ICE_VSI_PF: + vsi->tc_cfg.ena_tc = ice_dcb_get_ena_tc(cfg); + vsi->tc_cfg.numtc = ice_dcb_get_num_tc(cfg); + break; +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + case ICE_VSI_CHNL: +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ +#ifdef HAVE_NETDEV_SB_DEV + case ICE_VSI_OFFLOAD_MACVLAN: +#endif /* HAVE_NETDEV_SB_DEV */ + case ICE_VSI_VMDQ2: + case ICE_VSI_SWITCHDEV_CTRL: + vsi->tc_cfg.ena_tc = BIT(ice_get_first_droptc(vsi)); + vsi->tc_cfg.numtc = 1; + break; + case ICE_VSI_CTRL: + case ICE_VSI_LB: + default: + vsi->tc_cfg.ena_tc = ICE_DFLT_TRAFFIC_CLASS; + vsi->tc_cfg.numtc = 1; + } +} + +#ifdef HAVE_NDO_SET_TX_MAXRATE +/** + * ice_dcb_get_tc - Get the TC associated with the queue + * @vsi: ptr to the VSI + * @queue_index: queue number associated with VSI + */ +u8 ice_dcb_get_tc(struct ice_vsi *vsi, int queue_index) +{ + return vsi->tx_rings[queue_index]->dcb_tc; +} +#endif /* HAVE_NDO_SET_TX_MAXRATE */ + /** * ice_vsi_cfg_dcb_rings - Update rings to reflect DCB TC * @vsi: VSI owner of rings being updated */ void ice_vsi_cfg_dcb_rings(struct ice_vsi *vsi) { - struct ice_ring *tx_ring, *rx_ring; - u16 qoffset, qcount; + u16 qoffset; + u16 qcount; int i, n; if (!test_bit(ICE_FLAG_DCB_ENA, vsi->back->flags)) { /* Reset the TC information */ - for (i = 0; i < vsi->num_txq; i++) { - tx_ring = vsi->tx_rings[i]; - tx_ring->dcb_tc = 0; - } - for (i = 0; i < vsi->num_rxq; i++) { - rx_ring = vsi->rx_rings[i]; - rx_ring->dcb_tc = 0; - } + for (i = 0; i < vsi->num_txq; i++) + vsi->tx_rings[i]->dcb_tc = 0; + + for (i = 0; i < vsi->num_rxq; i++) + vsi->rx_rings[i]->dcb_tc = 0; + return; } @@ -127,50 +242,150 @@ void ice_vsi_cfg_dcb_rings(struct ice_vsi *vsi) break; qoffset = vsi->tc_cfg.tc_info[n].qoffset; + qcount = vsi->tc_cfg.tc_info[n].qcount_tx; - for (i = qoffset; i < (qoffset + qcount); i++) { - tx_ring = vsi->tx_rings[i]; - rx_ring = vsi->rx_rings[i]; - tx_ring->dcb_tc = n; - rx_ring->dcb_tc = n; + for (i = qoffset; i < (qoffset + qcount); i++) + vsi->tx_rings[i]->dcb_tc = n; + + qcount = vsi->tc_cfg.tc_info[n].qcount_rx; + for (i = qoffset; i < (qoffset + qcount); i++) + vsi->rx_rings[i]->dcb_tc = n; + } + +#ifdef HAVE_NETDEV_SB_DEV + /* when DCB is configured TC for MACVLAN queues should be + * the first drop TC of the main VSI + */ + if (vsi->type == ICE_VSI_OFFLOAD_MACVLAN) { + u8 first_droptc = ice_get_first_droptc(vsi); + + ice_for_each_alloc_txq(vsi, i) + vsi->tx_rings[i]->dcb_tc = first_droptc; + ice_for_each_alloc_rxq(vsi, i) + vsi->rx_rings[i]->dcb_tc = first_droptc; + } +#endif /* 
HAVE_NETDEV_SB_DEV */ + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + if (vsi->type == ICE_VSI_PF) { + u8 first_droptc = ice_get_first_droptc(vsi); + + /* When DCB is configured, TC for ADQ queues (which are really + * PF queues) should be the first drop TC of the main VSI + */ + ice_for_each_chnl_tc(n) { + if (!(vsi->all_enatc & BIT(n))) + break; + + qoffset = vsi->mqprio_qopt.qopt.offset[n]; + qcount = vsi->mqprio_qopt.qopt.count[n]; + for (i = qoffset; i < (qoffset + qcount); i++) { + vsi->tx_rings[i]->dcb_tc = first_droptc; + vsi->rx_rings[i]->dcb_tc = first_droptc; + } } } +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ } /** - * ice_pf_dcb_recfg - Reconfigure all VEBs and VSIs - * @pf: pointer to the PF struct + * ice_peer_prep_tc_change - Pre-notify RDMA Peer in blocking call of TC change + * @peer_obj_int: ptr to peer device internal struct + * @data: ptr to opaque data + */ +static int +ice_peer_prep_tc_change(struct ice_peer_obj_int *peer_obj_int, + void __always_unused *data) +{ + struct ice_peer_obj *peer_obj; + + peer_obj = ice_get_peer_obj(peer_obj_int); + if (!ice_validate_peer_obj(peer_obj)) + return 0; + + if (!test_bit(ICE_PEER_OBJ_STATE_OPENED, peer_obj_int->state)) + return 0; + + if (peer_obj->peer_ops && peer_obj->peer_ops->prep_tc_change) + peer_obj->peer_ops->prep_tc_change(peer_obj); + + return 0; +} + +/** + * ice_dcb_ena_dis_vsi - disable certain VSIs for DCB config/reconfig + * @pf: pointer to the PF instance + * @ena: true to enable VSIs, false to disable + * @locked: true if caller holds RTNL lock, false otherwise * - * Assumed caller has already disabled all VSIs before - * calling this function. Reconfiguring DCB based on - * local_dcbx_cfg. + * Before a new DCB configuration can be applied, VSIs of type PF, MACVLAN, + * and CHNL need to be brought down. Following completion of DCB configuration + * the VSIs that were downed need to be brought up again. This helper function + * does both. 
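/*
 * Editor's illustration (not part of the patch): a minimal, hypothetical
 * sketch of the bracketing pattern the ice_dcb_ena_dis_vsi() helper
 * documented above provides. The real caller, ice_pf_dcb_cfg(), additionally
 * takes the RTNL lock when needed, copies the new configuration into place
 * and, in SW LLDP mode, pushes it to firmware between these two calls.
 */
static void ice_example_apply_dcb_change(struct ice_pf *pf, bool rtnl_held)
{
	/* quiesce PF/CHNL/MACVLAN/switchdev VSIs before touching DCB */
	ice_dcb_ena_dis_vsi(pf, false, rtnl_held);

	/* ... commit the new DCB configuration to hardware here ... */

	/* bring the same VSIs back up once the new config is in place */
	ice_dcb_ena_dis_vsi(pf, true, rtnl_held);
}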
*/ -static void ice_pf_dcb_recfg(struct ice_pf *pf) +static void ice_dcb_ena_dis_vsi(struct ice_pf *pf, bool ena, bool locked) { - struct ice_dcbx_cfg *dcbcfg = &pf->hw.port_info->local_dcbx_cfg; - u8 tc_map = 0; - int v, ret; + int i; - /* Update each VSI */ - ice_for_each_vsi(pf, v) { - if (!pf->vsi[v]) - continue; + ice_for_each_vsi(pf, i) { + struct ice_vsi *vsi = pf->vsi[i]; - if (pf->vsi[v]->type == ICE_VSI_PF) - tc_map = ice_dcb_get_ena_tc(dcbcfg); - else - tc_map = ICE_DFLT_TRAFFIC_CLASS; + if (!vsi) + continue; - ret = ice_vsi_cfg_tc(pf->vsi[v], tc_map); - if (ret) { - dev_err(&pf->pdev->dev, - "Failed to config TC for VSI index: %d\n", - pf->vsi[v]->idx); + switch (vsi->type) { + case ICE_VSI_CHNL: + case ICE_VSI_OFFLOAD_MACVLAN: + case ICE_VSI_VMDQ2: + case ICE_VSI_SWITCHDEV_CTRL: + case ICE_VSI_PF: + if (ena) + ice_ena_vsi(vsi, locked); + else + ice_dis_vsi(vsi, locked); + break; + default: continue; } + } +} + +/** + * ice_dcb_bwchk - check if ETS bandwidth input parameters are correct + * @pf: pointer to PF struct + * @dcbcfg: pointer to DCB config structure + */ +int ice_dcb_bwchk(struct ice_pf *pf, struct ice_dcbx_cfg *dcbcfg) +{ + struct ice_dcb_ets_cfg *etscfg = &dcbcfg->etscfg; + u8 num_tc, total_bw = 0; + int i; + + /* returns number of contigous TCs and 1 TC for non-contigous TCs, + * since at least 1 TC has to be configured + */ + num_tc = ice_dcb_get_num_tc(dcbcfg); + + /* no bandwidth checks required if there's only one TC, so assign + * all bandwidth to TC0 and return + */ + if (num_tc == 1) { + etscfg->tcbwtable[0] = ICE_TC_MAX_BW; + return 0; + } - ice_vsi_map_rings_to_vectors(pf->vsi[v]); + for (i = 0; i < num_tc; i++) + total_bw += etscfg->tcbwtable[i]; + + if (!total_bw) { + etscfg->tcbwtable[0] = ICE_TC_MAX_BW; + } else if (total_bw != ICE_TC_MAX_BW) { + dev_err(ice_pf_to_dev(pf), + "Invalid config, total bandwidth must equal 100\n"); + return -EINVAL; } + + return 0; } /** @@ -179,50 +394,65 @@ static void ice_pf_dcb_recfg(struct ice_pf *pf) * @new_cfg: DCBX config to apply * @locked: is the RTNL held */ -static int ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked) { - struct ice_dcbx_cfg *old_cfg, *curr_cfg; struct ice_aqc_port_ets_elem buf = { 0 }; - int ret = 0; + struct ice_dcbx_cfg *old_cfg, *curr_cfg; + struct device *dev = ice_pf_to_dev(pf); + int ret = ICE_DCB_NO_HW_CHG; - curr_cfg = &pf->hw.port_info->local_dcbx_cfg; + curr_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; + + /* FW does not care if change happened */ + if (!pf->hw.port_info->qos_cfg.is_sw_lldp) + ret = ICE_DCB_HW_CHG_RST; /* Enable DCB tagging only when more than one TC */ if (ice_dcb_get_num_tc(new_cfg) > 1) { - dev_dbg(&pf->pdev->dev, "DCB tagging enabled (num TC > 1)\n"); + dev_dbg(dev, "DCB tagging enabled (num TC > 1)\n"); set_bit(ICE_FLAG_DCB_ENA, pf->flags); } else { - dev_dbg(&pf->pdev->dev, "DCB tagging disabled (num TC = 1)\n"); + dev_dbg(dev, "DCB tagging disabled (num TC = 1)\n"); clear_bit(ICE_FLAG_DCB_ENA, pf->flags); } if (!memcmp(new_cfg, curr_cfg, sizeof(*new_cfg))) { - dev_dbg(&pf->pdev->dev, "No change in DCB config required\n"); + dev_dbg(dev, "No change in DCB config required\n"); return ret; } + if (ice_dcb_bwchk(pf, new_cfg)) + return -EINVAL; + /* Store old config in case FW config fails */ - old_cfg = devm_kzalloc(&pf->pdev->dev, sizeof(*old_cfg), GFP_KERNEL); - memcpy(old_cfg, curr_cfg, sizeof(*old_cfg)); + old_cfg = kmemdup(curr_cfg, sizeof(*old_cfg), GFP_KERNEL); + if (!old_cfg) + return -ENOMEM; + + dev_info(dev, "Commit DCB 
Configuration to the hardware\n"); + /* Notify capable peers about impending change to TCs */ + ice_for_each_peer(pf, NULL, ice_peer_prep_tc_change); /* avoid race conditions by holding the lock while disabling and * re-enabling the VSI */ if (!locked) rtnl_lock(); - ice_pf_dis_all_vsi(pf, true); + + /* disable VSIs affected by DCB changes */ + ice_dcb_ena_dis_vsi(pf, false, true); memcpy(curr_cfg, new_cfg, sizeof(*curr_cfg)); memcpy(&curr_cfg->etsrec, &curr_cfg->etscfg, sizeof(curr_cfg->etsrec)); + memcpy(&new_cfg->etsrec, &curr_cfg->etscfg, sizeof(curr_cfg->etsrec)); /* Only send new config to HW if we are in SW LLDP mode. Otherwise, * the new config came from the HW in the first place. */ - if (pf->hw.port_info->is_sw_lldp) { + if (pf->hw.port_info->qos_cfg.is_sw_lldp) { ret = ice_set_dcb_cfg(pf->hw.port_info); if (ret) { - dev_err(&pf->pdev->dev, "Set DCB Config failed\n"); + dev_err(dev, "Set DCB Config failed\n"); /* Restore previous settings to local config */ memcpy(curr_cfg, old_cfg, sizeof(*curr_cfg)); goto out; @@ -231,17 +461,18 @@ int ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked) ret = ice_query_port_ets(pf->hw.port_info, &buf, sizeof(buf), NULL); if (ret) { - dev_err(&pf->pdev->dev, "Query Port ETS failed\n"); + dev_err(dev, "Query Port ETS failed\n"); goto out; } ice_pf_dcb_recfg(pf); out: - ice_pf_ena_all_vsi(pf, true); + /* enable previously downed VSIs */ + ice_dcb_ena_dis_vsi(pf, true, true); if (!locked) rtnl_unlock(); - devm_kfree(&pf->pdev->dev, old_cfg); + kfree(old_cfg); return ret; } @@ -251,7 +482,7 @@ int ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked) */ static void ice_cfg_etsrec_defaults(struct ice_port_info *pi) { - struct ice_dcbx_cfg *dcbcfg = &pi->local_dcbx_cfg; + struct ice_dcbx_cfg *dcbcfg = &pi->qos_cfg.local_dcbx_cfg; u8 i; /* Ensure ETS recommended DCB configuration is not already set */ @@ -277,6 +508,7 @@ static bool ice_dcb_need_recfg(struct ice_pf *pf, struct ice_dcbx_cfg *old_cfg, struct ice_dcbx_cfg *new_cfg) { + struct device *dev = ice_pf_to_dev(pf); bool need_reconfig = false; /* Check if ETS configuration has changed */ @@ -287,33 +519,33 @@ ice_dcb_need_recfg(struct ice_pf *pf, struct ice_dcbx_cfg *old_cfg, &old_cfg->etscfg.prio_table, sizeof(new_cfg->etscfg.prio_table))) { need_reconfig = true; - dev_dbg(&pf->pdev->dev, "ETS UP2TC changed.\n"); + dev_dbg(dev, "ETS UP2TC changed.\n"); } if (memcmp(&new_cfg->etscfg.tcbwtable, &old_cfg->etscfg.tcbwtable, sizeof(new_cfg->etscfg.tcbwtable))) - dev_dbg(&pf->pdev->dev, "ETS TC BW Table changed.\n"); + dev_dbg(dev, "ETS TC BW Table changed.\n"); if (memcmp(&new_cfg->etscfg.tsatable, &old_cfg->etscfg.tsatable, sizeof(new_cfg->etscfg.tsatable))) - dev_dbg(&pf->pdev->dev, "ETS TSA Table changed.\n"); + dev_dbg(dev, "ETS TSA Table changed.\n"); } /* Check if PFC configuration has changed */ if (memcmp(&new_cfg->pfc, &old_cfg->pfc, sizeof(new_cfg->pfc))) { need_reconfig = true; - dev_dbg(&pf->pdev->dev, "PFC config change detected.\n"); + dev_dbg(dev, "PFC config change detected.\n"); } /* Check if APP Table has changed */ if (memcmp(&new_cfg->app, &old_cfg->app, sizeof(new_cfg->app))) { need_reconfig = true; - dev_dbg(&pf->pdev->dev, "APP Table change detected.\n"); + dev_dbg(dev, "APP Table change detected.\n"); } - dev_dbg(&pf->pdev->dev, "dcb need_reconfig=%d\n", need_reconfig); + dev_dbg(dev, "dcb need_reconfig=%d\n", need_reconfig); return need_reconfig; } @@ -323,87 +555,69 @@ ice_dcb_need_recfg(struct ice_pf *pf, struct ice_dcbx_cfg 
*old_cfg, */ void ice_dcb_rebuild(struct ice_pf *pf) { - struct ice_dcbx_cfg *local_dcbx_cfg, *desired_dcbx_cfg, *prev_cfg; struct ice_aqc_port_ets_elem buf = { 0 }; + struct device *dev = ice_pf_to_dev(pf); + struct ice_dcbx_cfg *err_cfg; enum ice_status ret; ret = ice_query_port_ets(pf->hw.port_info, &buf, sizeof(buf), NULL); if (ret) { - dev_err(&pf->pdev->dev, "Query Port ETS failed\n"); + dev_err(dev, "Query Port ETS failed\n"); goto dcb_error; } - /* If DCB was not enabled previously, we are done */ - if (!test_bit(ICE_FLAG_DCB_ENA, pf->flags)) - return; - - local_dcbx_cfg = &pf->hw.port_info->local_dcbx_cfg; - desired_dcbx_cfg = &pf->hw.port_info->desired_dcbx_cfg; + mutex_lock(&pf->tc_mutex); - /* Save current willing state and force FW to unwilling */ - local_dcbx_cfg->etscfg.willing = 0x0; - local_dcbx_cfg->pfc.willing = 0x0; - local_dcbx_cfg->app_mode = ICE_DCBX_APPS_NON_WILLING; + if (!pf->hw.port_info->qos_cfg.is_sw_lldp) + ice_cfg_etsrec_defaults(pf->hw.port_info); - ice_cfg_etsrec_defaults(pf->hw.port_info); ret = ice_set_dcb_cfg(pf->hw.port_info); if (ret) { - dev_err(&pf->pdev->dev, "Failed to set DCB to unwilling\n"); + dev_err(dev, "Failed to set DCB config in rebuild\n"); goto dcb_error; } - /* Retrieve DCB config and ensure same as current in SW */ - prev_cfg = devm_kmemdup(&pf->pdev->dev, local_dcbx_cfg, - sizeof(*prev_cfg), GFP_KERNEL); - if (!prev_cfg) { - dev_err(&pf->pdev->dev, "Failed to alloc space for DCB cfg\n"); - goto dcb_error; - } - - ice_init_dcb(&pf->hw, true); - if (pf->hw.port_info->dcbx_status == ICE_DCBX_STATUS_DIS) - pf->hw.port_info->is_sw_lldp = true; - else - pf->hw.port_info->is_sw_lldp = false; - - if (ice_dcb_need_recfg(pf, prev_cfg, local_dcbx_cfg)) { - /* difference in cfg detected - disable DCB till next MIB */ - dev_err(&pf->pdev->dev, "Set local MIB not accurate\n"); - goto dcb_error; + if (!pf->hw.port_info->qos_cfg.is_sw_lldp) { + ret = ice_cfg_lldp_mib_change(&pf->hw, true); + if (ret && !pf->hw.port_info->qos_cfg.is_sw_lldp) { + dev_err(dev, "Failed to register for MIB changes\n"); + goto dcb_error; + } } - /* fetched config congruent to previous configuration */ - devm_kfree(&pf->pdev->dev, prev_cfg); - - /* Set the local desired config */ - if (local_dcbx_cfg->dcbx_mode == ICE_DCBX_MODE_CEE) - memcpy(local_dcbx_cfg, desired_dcbx_cfg, - sizeof(*local_dcbx_cfg)); - - ice_cfg_etsrec_defaults(pf->hw.port_info); - ret = ice_set_dcb_cfg(pf->hw.port_info); - if (ret) { - dev_err(&pf->pdev->dev, "Failed to set desired config\n"); - goto dcb_error; - } - dev_info(&pf->pdev->dev, "DCB restored after reset\n"); + dev_info(dev, "DCB info restored\n"); ret = ice_query_port_ets(pf->hw.port_info, &buf, sizeof(buf), NULL); if (ret) { - dev_err(&pf->pdev->dev, "Query Port ETS failed\n"); + dev_err(dev, "Query Port ETS failed\n"); goto dcb_error; } + mutex_unlock(&pf->tc_mutex); + return; dcb_error: - dev_err(&pf->pdev->dev, "Disabling DCB until new settings occur\n"); - prev_cfg = devm_kzalloc(&pf->pdev->dev, sizeof(*prev_cfg), GFP_KERNEL); - prev_cfg->etscfg.willing = true; - prev_cfg->etscfg.tcbwtable[0] = ICE_TC_MAX_BW; - prev_cfg->etscfg.tsatable[0] = ICE_IEEE_TSA_ETS; - memcpy(&prev_cfg->etsrec, &prev_cfg->etscfg, sizeof(prev_cfg->etsrec)); - ice_pf_dcb_cfg(pf, prev_cfg, false); - devm_kfree(&pf->pdev->dev, prev_cfg); + dev_err(dev, "Disabling DCB until new settings occur\n"); + err_cfg = kzalloc(sizeof(*err_cfg), GFP_KERNEL); + if (!err_cfg) { + mutex_unlock(&pf->tc_mutex); + return; + } + + err_cfg->etscfg.willing = true; + 
err_cfg->etscfg.tcbwtable[0] = ICE_TC_MAX_BW; + err_cfg->etscfg.tsatable[0] = ICE_IEEE_TSA_ETS; + memcpy(&err_cfg->etsrec, &err_cfg->etscfg, sizeof(err_cfg->etsrec)); + /* Coverity warns the return code of ice_pf_dcb_cfg() is not checked + * here as is done for other calls to that function. That check is + * not necessary since this is in this function's error cleanup path. + * Suppress the Coverity warning with the following comment... + */ + /* coverity[check_return] */ + ice_pf_dcb_cfg(pf, err_cfg, false); + kfree(err_cfg); + + mutex_unlock(&pf->tc_mutex); } /** @@ -418,28 +632,29 @@ static int ice_dcb_init_cfg(struct ice_pf *pf, bool locked) int ret = 0; pi = pf->hw.port_info; - newcfg = devm_kzalloc(&pf->pdev->dev, sizeof(*newcfg), GFP_KERNEL); + newcfg = kmemdup(&pi->qos_cfg.local_dcbx_cfg, sizeof(*newcfg), + GFP_KERNEL); if (!newcfg) return -ENOMEM; - memcpy(newcfg, &pi->local_dcbx_cfg, sizeof(*newcfg)); - memset(&pi->local_dcbx_cfg, 0, sizeof(*newcfg)); + memset(&pi->qos_cfg.local_dcbx_cfg, 0, sizeof(*newcfg)); - dev_info(&pf->pdev->dev, "Configuring initial DCB values\n"); + dev_info(ice_pf_to_dev(pf), "Configuring initial DCB values\n"); if (ice_pf_dcb_cfg(pf, newcfg, locked)) ret = -EINVAL; - devm_kfree(&pf->pdev->dev, newcfg); + kfree(newcfg); return ret; } /** - * ice_dcb_sw_default_config - Apply a default DCB config + * ice_dcb_sw_dflt_cfg - Apply a default DCB config * @pf: PF to apply config to + * @ets_willing: configure ETS willing * @locked: was this function called with RTNL held */ -static int ice_dcb_sw_dflt_cfg(struct ice_pf *pf, bool locked) +int ice_dcb_sw_dflt_cfg(struct ice_pf *pf, bool ets_willing, bool locked) { struct ice_aqc_port_ets_elem buf = { 0 }; struct ice_dcbx_cfg *dcbcfg; @@ -449,12 +664,13 @@ static int ice_dcb_sw_dflt_cfg(struct ice_pf *pf, bool locked) hw = &pf->hw; pi = hw->port_info; - dcbcfg = devm_kzalloc(&pf->pdev->dev, sizeof(*dcbcfg), GFP_KERNEL); + dcbcfg = kzalloc(sizeof(*dcbcfg), GFP_KERNEL); + if (!dcbcfg) + return -ENOMEM; - memset(dcbcfg, 0, sizeof(*dcbcfg)); - memset(&pi->local_dcbx_cfg, 0, sizeof(*dcbcfg)); + memset(&pi->qos_cfg.local_dcbx_cfg, 0, sizeof(*dcbcfg)); - dcbcfg->etscfg.willing = 1; + dcbcfg->etscfg.willing = ets_willing ? 
1 : 0; dcbcfg->etscfg.maxtcs = hw->func_caps.common_cap.maxtc; dcbcfg->etscfg.tcbwtable[0] = 100; dcbcfg->etscfg.tsatable[0] = ICE_IEEE_TSA_ETS; @@ -469,16 +685,135 @@ static int ice_dcb_sw_dflt_cfg(struct ice_pf *pf, bool locked) dcbcfg->numapps = 1; dcbcfg->app[0].selector = ICE_APP_SEL_ETHTYPE; dcbcfg->app[0].priority = 3; - dcbcfg->app[0].prot_id = ICE_APP_PROT_ID_FCOE; + dcbcfg->app[0].prot_id = ETH_P_FCOE; ret = ice_pf_dcb_cfg(pf, dcbcfg, locked); - devm_kfree(&pf->pdev->dev, dcbcfg); + kfree(dcbcfg); if (ret) return ret; return ice_query_port_ets(pi, &buf, sizeof(buf), NULL); } +/** + * ice_dcb_tc_contig - Check that TCs are contiguous + * @prio_table: pointer to priority table + * + * Check if TCs begin with TC0 and are contiguous + */ +static bool ice_dcb_tc_contig(u8 *prio_table) +{ + bool found_empty = false; + u8 used_tc = 0; + int i; + + /* Create a bitmap of used TCs */ + for (i = 0; i < CEE_DCBX_MAX_PRIO; i++) + used_tc |= BIT(prio_table[i]); + + for (i = 0; i < CEE_DCBX_MAX_PRIO; i++) { + if (used_tc & BIT(i)) { + if (found_empty) + return false; + } else { + found_empty = true; + } + } + + return true; +} + +/** + * ice_dcb_noncontig_cfg - Configure DCB for non-contiguous TCs + * @pf: pointer to the PF struct + * + * If non-contiguous TCs, then configure SW DCB with TC0 and ETS non-willing + */ +static int ice_dcb_noncontig_cfg(struct ice_pf *pf) +{ + struct ice_dcbx_cfg *dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; + struct device *dev = ice_pf_to_dev(pf); + int ret; + + /* Configure SW DCB default with ETS non-willing */ + ret = ice_dcb_sw_dflt_cfg(pf, false, true); + if (ret) { + dev_err(dev, "Failed to set local DCB config %d\n", ret); + return ret; + } + + /* Reconfigure with ETS willing so that FW will send LLDP MIB event */ + dcbcfg->etscfg.willing = 1; + ret = ice_set_dcb_cfg(pf->hw.port_info); + if (ret) + dev_err(dev, "Failed to set DCB to unwilling\n"); + + return ret; +} + +/** + * ice_pf_dcb_recfg - Reconfigure all VEBs and VSIs + * @pf: pointer to the PF struct + * + * Assumed caller has already disabled all VSIs before + * calling this function. Reconfiguring DCB based on + * local_dcbx_cfg. + */ +void ice_pf_dcb_recfg(struct ice_pf *pf) +{ + struct ice_dcbx_cfg *dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; + u8 tc_map = 0; + int v, ret; + + + /* Update each VSI */ + ice_for_each_vsi(pf, v) { + struct ice_vsi *vsi = pf->vsi[v]; + + if (!vsi) + continue; + + if (vsi->type == ICE_VSI_PF) { + tc_map = ice_dcb_get_ena_tc(dcbcfg); + + /* If DCBX request non-contiguous TC, then configure + * default TC + */ + if (!ice_dcb_tc_contig(dcbcfg->etscfg.prio_table)) { + tc_map = ICE_DFLT_TRAFFIC_CLASS; + ice_dcb_noncontig_cfg(pf); + } +#if defined(HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO) && defined(HAVE_NETDEV_SB_DEV) + } else if (vsi->type == ICE_VSI_CHNL || + vsi->type == ICE_VSI_OFFLOAD_MACVLAN) { + tc_map = BIT(ice_get_first_droptc(vsi)); +# endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO && HAVE_NETDEV_SB_DEV */ + } else { + tc_map = ICE_DFLT_TRAFFIC_CLASS; + } + + ret = ice_vsi_cfg_tc(vsi, tc_map); + if (ret) { + dev_err(ice_pf_to_dev(pf), "Failed to config TC for VSI index: %d\n", + vsi->idx); + continue; + } + /* no need to proceed with remaining cfg if it is CHNL VSI */ + if (vsi->type == ICE_VSI_CHNL) + continue; + + ice_vsi_map_rings_to_vectors(vsi); + if (vsi->type == ICE_VSI_PF) + ice_dcbnl_set_all(vsi); + } + /* If the RDMA peer is registered, update that peer's initial_qos_info struct. 
+ * The peer is closed during this process, so when it is opened, it will access + * the initial_qos_info element to configure itself. + */ + if (pf->rdma_peer) + ice_setup_dcb_qos_info(pf, &pf->rdma_peer->initial_qos_info); +} + /** * ice_init_pf_dcb - initialize DCB for a PF * @pf: PF to initialize DCB for @@ -486,43 +821,63 @@ static int ice_dcb_sw_dflt_cfg(struct ice_pf *pf, bool locked) */ int ice_init_pf_dcb(struct ice_pf *pf, bool locked) { - struct device *dev = &pf->pdev->dev; + struct device *dev = ice_pf_to_dev(pf); struct ice_port_info *port_info; struct ice_hw *hw = &pf->hw; + enum ice_status status; int err; port_info = hw->port_info; - err = ice_init_dcb(hw, false); - if (err && !port_info->is_sw_lldp) { - dev_err(&pf->pdev->dev, "Error initializing DCB %d\n", err); + status = ice_init_dcb(hw, false); + if (status && !port_info->qos_cfg.is_sw_lldp) { + dev_err(dev, "Error initializing DCB %s\n", + ice_stat_str(status)); + err = ice_status_to_errno(status); goto dcb_init_err; } - dev_info(&pf->pdev->dev, - "DCB is enabled in the hardware, max number of TCs supported on this port are %d\n", + dev_info(dev, "DCB is enabled in the hardware, max number of TCs supported on this port are %d\n", pf->hw.func_caps.common_cap.maxtc); - if (err) { + if (port_info->qos_cfg.is_sw_lldp) { + struct ice_vsi *pf_vsi; + /* FW LLDP is disabled, activate SW DCBX/LLDP mode */ - dev_info(&pf->pdev->dev, - "FW LLDP is disabled, DCBx/LLDP in SW mode.\n"); + dev_info(dev, "FW LLDP is disabled, DCBx/LLDP in SW mode.\n"); clear_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags); - err = ice_dcb_sw_dflt_cfg(pf, locked); + err = ice_aq_set_pfc_mode(&pf->hw, ICE_AQC_PFC_VLAN_BASED_PFC, + NULL); + if (err) + dev_info(dev, "Fail to set VLAN PFC mode\n"); + + err = ice_dcb_sw_dflt_cfg(pf, true, locked); if (err) { - dev_err(&pf->pdev->dev, - "Failed to set local DCB config %d\n", err); + dev_err(dev, "Failed to set local DCB config %d\n", + err); + err = -EIO; + goto dcb_init_err; + } + + /* If the FW DCBX engine is not running then Rx LLDP packets + * need to be redirected up the stack. 
+ */ + pf_vsi = ice_get_main_vsi(pf); + if (!pf_vsi) { + dev_err(dev, "Failed to set local DCB config\n"); err = -EIO; goto dcb_init_err; } - pf->dcbx_cap = DCB_CAP_DCBX_HOST | DCB_CAP_DCBX_VER_IEEE; + ice_cfg_sw_lldp(pf_vsi, false, true); + + pf->dcbx_cap = ice_dcb_get_mode(port_info, true); return 0; } set_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags); - /* DCBX in FW and LLDP enabled in FW */ - pf->dcbx_cap = DCB_CAP_DCBX_LLD_MANAGED | DCB_CAP_DCBX_VER_IEEE; + /* DCBX/LLDP enabled in FW, set DCBNL mode advertisement */ + pf->dcbx_cap = ice_dcb_get_mode(port_info, false); err = ice_dcb_init_cfg(pf, locked); if (err) @@ -578,39 +933,70 @@ void ice_update_dcb_stats(struct ice_pf *pf) * ice_tx_prepare_vlan_flags_dcb - prepare VLAN tagging for DCB * @tx_ring: ring to send buffer on * @first: pointer to struct ice_tx_buf + * + * This should not be called if the outer VLAN is software offloaded as the VLAN + * tag will already be configured with the correct ID and priority bits */ -int +void ice_tx_prepare_vlan_flags_dcb(struct ice_ring *tx_ring, struct ice_tx_buf *first) { struct sk_buff *skb = first->skb; if (!test_bit(ICE_FLAG_DCB_ENA, tx_ring->vsi->back->flags)) - return 0; + return; /* Insert 802.1p priority into VLAN header */ - if ((first->tx_flags & (ICE_TX_FLAGS_HW_VLAN | ICE_TX_FLAGS_SW_VLAN)) || + if ((first->tx_flags & ICE_TX_FLAGS_HW_VLAN || + first->tx_flags & ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN) || skb->priority != TC_PRIO_CONTROL) { first->tx_flags &= ~ICE_TX_FLAGS_VLAN_PR_M; /* Mask the lower 3 bits to set the 802.1p priority */ first->tx_flags |= (skb->priority & 0x7) << ICE_TX_FLAGS_VLAN_PR_S; - if (first->tx_flags & ICE_TX_FLAGS_SW_VLAN) { - struct vlan_ethhdr *vhdr; - int rc; - - rc = skb_cow_head(skb, 0); - if (rc < 0) - return rc; - vhdr = (struct vlan_ethhdr *)skb->data; - vhdr->h_vlan_TCI = htons(first->tx_flags >> - ICE_TX_FLAGS_VLAN_S); - } else { + /* if this is not already set it means a VLAN 0 + priority needs + * to be offloaded + */ + if (tx_ring->flags & ICE_TX_FLAGS_VLAN_TAG_LOC_L2TAG2) + first->tx_flags |= ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN; + else first->tx_flags |= ICE_TX_FLAGS_HW_VLAN; - } } +} - return 0; +/** + * ice_setup_dcb_qos_info - Setup DCB QoS information + * @pf: ptr to ice_pf + * @qos_info: QoS param instance + */ +void ice_setup_dcb_qos_info(struct ice_pf *pf, struct ice_qos_params *qos_info) +{ + struct ice_dcbx_cfg *dcbx_cfg; + unsigned int i; + u32 up2tc; + + dcbx_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; + up2tc = rd32(&pf->hw, PRTDCB_TUP2TC); + qos_info->num_apps = dcbx_cfg->numapps; + + qos_info->num_tc = ice_dcb_get_num_tc(dcbx_cfg); + + for (i = 0; i < ICE_IDC_MAX_USER_PRIORITY; i++) + qos_info->up2tc[i] = (up2tc >> (i * 3)) & 0x7; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) + qos_info->tc_info[i].rel_bw = + dcbx_cfg->etscfg.tcbwtable[i]; + + for (i = 0; i < qos_info->num_apps; i++) { + qos_info->apps[i].priority = dcbx_cfg->app[i].priority; + qos_info->apps[i].prot_id = dcbx_cfg->app[i].prot_id; + qos_info->apps[i].selector = dcbx_cfg->app[i].selector; + } + + qos_info->pfc_mode = dcbx_cfg->pfc_mode; + for (i = 0; i < ICE_IDC_DSCP_NUM_VAL; i++) + qos_info->dscp_map[i] = dcbx_cfg->dscp_map[i]; } /** @@ -623,11 +1009,12 @@ ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, struct ice_rq_event_info *event) { struct ice_aqc_port_ets_elem buf = { 0 }; + struct device *dev = ice_pf_to_dev(pf); struct ice_aqc_lldp_get_mib *mib; struct ice_dcbx_cfg tmp_dcbx_cfg; bool need_reconfig = false; struct ice_port_info *pi; - u8 type; + u8 
mib_type; int ret; /* Not DCB capable or capability disabled */ @@ -635,82 +1022,90 @@ ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, return; if (pf->dcbx_cap & DCB_CAP_DCBX_HOST) { - dev_dbg(&pf->pdev->dev, - "MIB Change Event in HOST mode\n"); + dev_dbg(dev, "MIB Change Event in HOST mode\n"); return; } pi = pf->hw.port_info; mib = (struct ice_aqc_lldp_get_mib *)&event->desc.params.raw; /* Ignore if event is not for Nearest Bridge */ - type = ((mib->type >> ICE_AQ_LLDP_BRID_TYPE_S) & - ICE_AQ_LLDP_BRID_TYPE_M); - dev_dbg(&pf->pdev->dev, "LLDP event MIB bridge type 0x%x\n", type); - if (type != ICE_AQ_LLDP_BRID_TYPE_NEAREST_BRID) + mib_type = ((mib->type >> ICE_AQ_LLDP_BRID_TYPE_S) & + ICE_AQ_LLDP_BRID_TYPE_M); + dev_dbg(dev, "LLDP event MIB bridge type 0x%x\n", mib_type); + if (mib_type != ICE_AQ_LLDP_BRID_TYPE_NEAREST_BRID) return; /* Check MIB Type and return if event for Remote MIB update */ - type = mib->type & ICE_AQ_LLDP_MIB_TYPE_M; - dev_dbg(&pf->pdev->dev, - "LLDP event mib type %s\n", type ? "remote" : "local"); - if (type == ICE_AQ_LLDP_MIB_REMOTE) { + mib_type = mib->type & ICE_AQ_LLDP_MIB_TYPE_M; + dev_dbg(dev, "LLDP event mib type %s\n", mib_type ? "remote" : "local"); + if (mib_type == ICE_AQ_LLDP_MIB_REMOTE) { /* Update the remote cached instance and return */ ret = ice_aq_get_dcb_cfg(pi->hw, ICE_AQ_LLDP_MIB_REMOTE, ICE_AQ_LLDP_BRID_TYPE_NEAREST_BRID, - &pi->remote_dcbx_cfg); + &pi->qos_cfg.remote_dcbx_cfg); if (ret) { - dev_err(&pf->pdev->dev, "Failed to get remote DCB config\n"); + dev_err(dev, "Failed to get remote DCB config\n"); return; } } + mutex_lock(&pf->tc_mutex); + /* store the old configuration */ - tmp_dcbx_cfg = pf->hw.port_info->local_dcbx_cfg; + tmp_dcbx_cfg = pf->hw.port_info->qos_cfg.local_dcbx_cfg; /* Reset the old DCBX configuration data */ - memset(&pi->local_dcbx_cfg, 0, sizeof(pi->local_dcbx_cfg)); + memset(&pi->qos_cfg.local_dcbx_cfg, 0, + sizeof(pi->qos_cfg.local_dcbx_cfg)); /* Get updated DCBX data from firmware */ ret = ice_get_dcb_cfg(pf->hw.port_info); if (ret) { - dev_err(&pf->pdev->dev, "Failed to get DCB config\n"); - return; + dev_err(dev, "Failed to get DCB config\n"); + goto out; } /* No change detected in DCBX configs */ - if (!memcmp(&tmp_dcbx_cfg, &pi->local_dcbx_cfg, sizeof(tmp_dcbx_cfg))) { - dev_dbg(&pf->pdev->dev, - "No change detected in DCBX configuration.\n"); - return; + if (!memcmp(&tmp_dcbx_cfg, &pi->qos_cfg.local_dcbx_cfg, + sizeof(tmp_dcbx_cfg))) { + dev_dbg(dev, "No change detected in DCBX configuration.\n"); + goto out; } + pf->dcbx_cap = ice_dcb_get_mode(pi, false); + need_reconfig = ice_dcb_need_recfg(pf, &tmp_dcbx_cfg, - &pi->local_dcbx_cfg); + &pi->qos_cfg.local_dcbx_cfg); + ice_dcbnl_flush_apps(pf, &tmp_dcbx_cfg, &pi->qos_cfg.local_dcbx_cfg); if (!need_reconfig) - return; + goto out; /* Enable DCB tagging only when more than one TC */ - if (ice_dcb_get_num_tc(&pi->local_dcbx_cfg) > 1) { - dev_dbg(&pf->pdev->dev, "DCB tagging enabled (num TC > 1)\n"); + if (ice_dcb_get_num_tc(&pi->qos_cfg.local_dcbx_cfg) > 1) { + dev_dbg(dev, "DCB tagging enabled (num TC > 1)\n"); set_bit(ICE_FLAG_DCB_ENA, pf->flags); } else { - dev_dbg(&pf->pdev->dev, "DCB tagging disabled (num TC = 1)\n"); + dev_dbg(dev, "DCB tagging disabled (num TC = 1)\n"); clear_bit(ICE_FLAG_DCB_ENA, pf->flags); } rtnl_lock(); - ice_pf_dis_all_vsi(pf, true); + /* disable VSIs affected by DCB changes */ + ice_dcb_ena_dis_vsi(pf, false, true); ret = ice_query_port_ets(pf->hw.port_info, &buf, sizeof(buf), NULL); if (ret) { - dev_err(&pf->pdev->dev, "Query 
Port ETS failed\n"); - rtnl_unlock(); - return; + dev_err(dev, "Query Port ETS failed\n"); + goto unlock_rtnl; } /* changes in configuration update VSI */ ice_pf_dcb_recfg(pf); - ice_pf_ena_all_vsi(pf, true); + /* enable previously downed VSIs */ + ice_dcb_ena_dis_vsi(pf, true, true); +unlock_rtnl: rtnl_unlock(); +out: + mutex_unlock(&pf->tc_mutex); } diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.h b/drivers/net/ethernet/intel/ice/ice_dcb_lib.h index 661a6f7bca64..971a8561415a 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.h @@ -1,50 +1,105 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2019, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_DCB_LIB_H_ #define _ICE_DCB_LIB_H_ #include "ice.h" +#include "ice_base.h" #include "ice_lib.h" #ifdef CONFIG_DCB -#define ICE_TC_MAX_BW 100 /* Default Max BW percentage */ +#define ICE_TC_MAX_BW 100 /* Default Max BW percentage */ +#define ICE_DCB_HW_CHG_RST 0 /* DCB configuration changed with reset */ +#define ICE_DCB_NO_HW_CHG 1 /* DCB configuration did not change */ +#define ICE_DCB_HW_CHG 2 /* DCB configuration changed, no reset */ void ice_dcb_rebuild(struct ice_pf *pf); -u8 ice_dcb_get_ena_tc(struct ice_dcbx_cfg *dcbcfg); -u8 ice_dcb_get_num_tc(struct ice_dcbx_cfg *dcbcfg); +int ice_dcb_sw_dflt_cfg(struct ice_pf *pf, bool ets_willing, bool locked); +void ice_vsi_set_dcb_tc_cfg(struct ice_vsi *vsi); +bool ice_is_pfc_causing_hung_q(struct ice_pf *pf, unsigned int txqueue); +#ifdef HAVE_NDO_SET_TX_MAXRATE +u8 ice_dcb_get_tc(struct ice_vsi *vsi, int queue_index); +#endif /* HAVE_NDO_SET_TX_MAXRATE */ +int +ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked); +int ice_dcb_bwchk(struct ice_pf *pf, struct ice_dcbx_cfg *dcbcfg); +void ice_pf_dcb_recfg(struct ice_pf *pf); void ice_vsi_cfg_dcb_rings(struct ice_vsi *vsi); int ice_init_pf_dcb(struct ice_pf *pf, bool locked); void ice_update_dcb_stats(struct ice_pf *pf); -int +void ice_tx_prepare_vlan_flags_dcb(struct ice_ring *tx_ring, struct ice_tx_buf *first); +void ice_setup_dcb_qos_info(struct ice_pf *pf, struct ice_qos_params *qos_info); void ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, struct ice_rq_event_info *event); -void ice_vsi_cfg_netdev_tc(struct ice_vsi *vsi, u8 ena_tc); + +/** + * ice_find_q_in_range + * @low: start of queue range for a TC i.e. 
offset of TC + * @high: start of queue for next TC + * @tx_q: hung_queue/tx_queue + * + * finds if queue 'tx_q' falls between the two offsets of any given TC + */ +static inline bool ice_find_q_in_range(u16 low, u16 high, unsigned int tx_q) +{ + return (tx_q >= low) && (tx_q < high); +} + static inline void ice_set_cgd_num(struct ice_tlan_ctx *tlan_ctx, struct ice_ring *ring) { tlan_ctx->cgd_num = ring->dcb_tc; } + +static inline bool ice_is_dcb_active(struct ice_pf *pf) +{ + return (test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags) || + test_bit(ICE_FLAG_DCB_ENA, pf->flags)); +} + +static inline u8 ice_get_pfc_mode(struct ice_pf *pf) +{ + return pf->hw.port_info->qos_cfg.local_dcbx_cfg.pfc_mode; +} + #else -#define ice_dcb_rebuild(pf) do {} while (0) +static inline void ice_dcb_rebuild(struct ice_pf *pf) { } +static inline void ice_vsi_set_dcb_tc_cfg(struct ice_vsi *vsi) +{ + vsi->tc_cfg.ena_tc = ICE_DFLT_TRAFFIC_CLASS; + vsi->tc_cfg.numtc = 1; +} -static inline u8 ice_dcb_get_ena_tc(struct ice_dcbx_cfg __always_unused *dcbcfg) +static inline u8 ice_get_first_droptc(struct ice_vsi __always_unused *vsi) { - return ICE_DFLT_TRAFFIC_CLASS; + return 0; } -static inline u8 ice_dcb_get_num_tc(struct ice_dcbx_cfg __always_unused *dcbcfg) +#ifdef HAVE_NDO_SET_TX_MAXRATE +static inline u8 +ice_dcb_get_tc(struct ice_vsi __always_unused *vsi, + int __always_unused queue_index) { - return 1; + return 0; } +#endif /* HAVE_NDO_SET_TX_MAXRATE */ static inline int ice_init_pf_dcb(struct ice_pf *pf, bool __always_unused locked) { - dev_dbg(&pf->pdev->dev, "DCB not supported\n"); + dev_dbg(ice_pf_to_dev(pf), "DCB not supported\n"); + return -EOPNOTSUPP; +} + +static inline int +ice_pf_dcb_cfg(struct ice_pf __always_unused *pf, + struct ice_dcbx_cfg __always_unused *new_cfg, + bool __always_unused locked) +{ return -EOPNOTSUPP; } @@ -55,10 +110,30 @@ ice_tx_prepare_vlan_flags_dcb(struct ice_ring __always_unused *tx_ring, return 0; } -#define ice_update_dcb_stats(pf) do {} while (0) -#define ice_vsi_cfg_dcb_rings(vsi) do {} while (0) -#define ice_dcb_process_lldp_set_mib_change(pf, event) do {} while (0) -#define ice_set_cgd_num(tlan_ctx, ring) do {} while (0) -#define ice_vsi_cfg_netdev_tc(vsi, ena_tc) do {} while (0) +static inline bool ice_is_dcb_active(struct ice_pf __always_unused *pf) +{ + return false; +} + +static inline bool +ice_is_pfc_causing_hung_q(struct ice_pf __always_unused *pf, + unsigned int __always_unused txqueue) +{ + return false; +} + +static inline u8 ice_get_pfc_mode(struct ice_pf *pf) +{ + return -EOPNOTSUPP; +} + +static inline void ice_pf_dcb_recfg(struct ice_pf *pf) { } +static inline void ice_vsi_cfg_dcb_rings(struct ice_vsi *vsi) { } +static inline void ice_update_dcb_stats(struct ice_pf *pf) { } +static inline void ice_setup_dcb_qos_info(struct ice_pf *pf, struct ice_qos_params *qos_info) { } +static inline +void ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, struct ice_rq_event_info *event) { } +static inline void ice_set_cgd_num(struct ice_tlan_ctx *tlan_ctx, struct ice_ring *ring) { } #endif /* CONFIG_DCB */ + #endif /* _ICE_DCB_LIB_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c new file mode 100644 index 000000000000..4e7acd02f446 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c @@ -0,0 +1,1139 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice.h" +#include "ice_dcb.h" +#include "ice_dcb_lib.h" +#include "ice_dcb_nl.h" +#include + +/** + * ice_dcbnl_devreset - perform enough of a ifdown/ifup to sync DCBNL info + * @netdev: device associated with interface that needs reset + */ +static void ice_dcbnl_devreset(struct net_device *netdev) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + + while (ice_is_reset_in_progress(pf->state)) + usleep_range(1000, 2000); + + dev_close(netdev); + netdev_state_change(netdev); + dev_open(netdev, NULL); + netdev_state_change(netdev); +} + +/** + * ice_dcbnl_getets - retrieve local ETS configuration + * @netdev: the relevant netdev + * @ets: struct to hold ETS configuration + */ +static int ice_dcbnl_getets(struct net_device *netdev, struct ieee_ets *ets) +{ + struct ice_dcbx_cfg *dcbxcfg; + struct ice_pf *pf; + + pf = ice_netdev_to_pf(netdev); + dcbxcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; + + ets->willing = dcbxcfg->etscfg.willing; + ets->ets_cap = dcbxcfg->etscfg.maxtcs; + ets->cbs = dcbxcfg->etscfg.cbs; + memcpy(ets->tc_tx_bw, dcbxcfg->etscfg.tcbwtable, sizeof(ets->tc_tx_bw)); + memcpy(ets->tc_rx_bw, dcbxcfg->etscfg.tcbwtable, sizeof(ets->tc_rx_bw)); + memcpy(ets->tc_tsa, dcbxcfg->etscfg.tsatable, sizeof(ets->tc_tsa)); + memcpy(ets->prio_tc, dcbxcfg->etscfg.prio_table, sizeof(ets->prio_tc)); + memcpy(ets->tc_reco_bw, dcbxcfg->etsrec.tcbwtable, + sizeof(ets->tc_reco_bw)); + memcpy(ets->tc_reco_tsa, dcbxcfg->etsrec.tsatable, + sizeof(ets->tc_reco_tsa)); + memcpy(ets->reco_prio_tc, dcbxcfg->etscfg.prio_table, + sizeof(ets->reco_prio_tc)); + + return 0; +} + +/** + * ice_dcbnl_setets - set IEEE ETS configuration + * @netdev: pointer to relevant netdev + * @ets: struct to hold ETS configuration + */ +static int ice_dcbnl_setets(struct net_device *netdev, struct ieee_ets *ets) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_dcbx_cfg *new_cfg; + int bwcfg = 0, bwrec = 0; + int err, i; + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_IEEE)) + return -EINVAL; + + new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg; + + mutex_lock(&pf->tc_mutex); +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + netdev_err(netdev, "can't set DCB configuration when ADQ is active\n"); + err = ICE_DCB_NO_HW_CHG; + goto ets_out; + } +#endif /* NETIF_F_HW_TC */ + + new_cfg->etscfg.willing = ets->willing; + new_cfg->etscfg.cbs = ets->cbs; + ice_for_each_traffic_class(i) { + new_cfg->etscfg.tcbwtable[i] = ets->tc_tx_bw[i]; + bwcfg += ets->tc_tx_bw[i]; + new_cfg->etscfg.tsatable[i] = ets->tc_tsa[i]; + if (new_cfg->pfc_mode == ICE_QOS_MODE_VLAN) { + /* in DSCP mode up->tc mapping cannot change */ + new_cfg->etscfg.prio_table[i] = ets->prio_tc[i]; + new_cfg->etsrec.prio_table[i] = ets->reco_prio_tc[i]; + } + new_cfg->etsrec.tcbwtable[i] = ets->tc_reco_bw[i]; + bwrec += ets->tc_reco_bw[i]; + new_cfg->etsrec.tsatable[i] = ets->tc_reco_tsa[i]; + } + + if (ice_dcb_bwchk(pf, new_cfg)) { + err = -EINVAL; + goto ets_out; + } + + new_cfg->etscfg.maxtcs = pf->hw.func_caps.common_cap.maxtc; + + if (!bwrec) + new_cfg->etsrec.tcbwtable[0] = 100; + + err = ice_pf_dcb_cfg(pf, new_cfg, true); + /* return of zero indicates new cfg applied */ + if (err == ICE_DCB_HW_CHG_RST) + ice_dcbnl_devreset(netdev); + if (err == ICE_DCB_NO_HW_CHG) + err = ICE_DCB_HW_CHG_RST; + +ets_out: + mutex_unlock(&pf->tc_mutex); + return err; +} + +/** + * ice_dcbnl_getnumtcs - Get max number of traffic classes supported + * @dev: pointer to netdev struct + * @tcid: TC ID + * 
@num: total number of TCs supported by the adapter + * + * Return the total number of TCs supported + */ +static int +ice_dcbnl_getnumtcs(struct net_device *dev, int __always_unused tcid, u8 *num) +{ + struct ice_pf *pf = ice_netdev_to_pf(dev); + + if (!test_bit(ICE_FLAG_DCB_CAPABLE, pf->flags)) + return -EINVAL; + + *num = pf->hw.func_caps.common_cap.maxtc; + return 0; +} + +/** + * ice_dcbnl_getdcbx - retrieve current DCBX capability + * @netdev: pointer to the netdev struct + */ +static u8 ice_dcbnl_getdcbx(struct net_device *netdev) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + + return pf->dcbx_cap; +} + +/** + * ice_dcbnl_setdcbx - set required DCBX capability + * @netdev: the corresponding netdev + * @mode: required mode + */ +static u8 ice_dcbnl_setdcbx(struct net_device *netdev, u8 mode) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_qos_cfg *qos_cfg; + + /* if FW LLDP agent is running, DCBNL not allowed to change mode */ + if (test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags)) + return ICE_DCB_NO_HW_CHG; + + /* No support for LLD_MANAGED modes or CEE+IEEE */ + if ((mode & DCB_CAP_DCBX_LLD_MANAGED) || + ((mode & DCB_CAP_DCBX_VER_IEEE) && (mode & DCB_CAP_DCBX_VER_CEE)) || + !(mode & DCB_CAP_DCBX_HOST)) + return ICE_DCB_NO_HW_CHG; + + /* Already set to the given mode no change */ + if (mode == pf->dcbx_cap) + return ICE_DCB_NO_HW_CHG; + + pf->dcbx_cap = mode; + qos_cfg = &pf->hw.port_info->qos_cfg; + if (mode & DCB_CAP_DCBX_VER_CEE) { + if (qos_cfg->local_dcbx_cfg.pfc_mode == ICE_QOS_MODE_DSCP) + return ICE_DCB_NO_HW_CHG; + qos_cfg->local_dcbx_cfg.dcbx_mode = ICE_DCBX_MODE_CEE; + } else { + qos_cfg->local_dcbx_cfg.dcbx_mode = ICE_DCBX_MODE_IEEE; + } + + dev_info(ice_pf_to_dev(pf), "DCBx mode = 0x%x\n", mode); + return ICE_DCB_HW_CHG_RST; +} + +/** + * ice_dcbnl_get_perm_hw_addr - MAC address used by DCBX + * @netdev: pointer to netdev struct + * @perm_addr: buffer to return permanent MAC address + */ +static void ice_dcbnl_get_perm_hw_addr(struct net_device *netdev, u8 *perm_addr) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_port_info *pi = pf->hw.port_info; + int i, j; + + memset(perm_addr, 0xff, MAX_ADDR_LEN); + + for (i = 0; i < netdev->addr_len; i++) + perm_addr[i] = pi->mac.perm_addr[i]; + + for (j = 0; j < netdev->addr_len; j++, i++) + perm_addr[i] = pi->mac.perm_addr[j]; +} + +/** + * ice_get_pfc_delay - Retrieve PFC Link Delay + * @hw: pointer to HW struct + * @delay: holds the PFC Link Delay value + */ +static void ice_get_pfc_delay(struct ice_hw *hw, u16 *delay) +{ + u32 val; + + val = rd32(hw, PRTDCB_GENC); + *delay = (u16)((val & PRTDCB_GENC_PFCLDA_M) >> PRTDCB_GENC_PFCLDA_S); +} + +/** + * ice_dcbnl_getpfc - retrieve local IEEE PFC config + * @netdev: pointer to netdev struct + * @pfc: struct to hold PFC info + */ +static int ice_dcbnl_getpfc(struct net_device *netdev, struct ieee_pfc *pfc) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_port_info *pi = pf->hw.port_info; + struct ice_dcbx_cfg *dcbxcfg; + int i; + + dcbxcfg = &pi->qos_cfg.local_dcbx_cfg; + pfc->pfc_cap = dcbxcfg->pfc.pfccap; + pfc->pfc_en = dcbxcfg->pfc.pfcena; + pfc->mbc = dcbxcfg->pfc.mbc; + ice_get_pfc_delay(&pf->hw, &pfc->delay); + + ice_for_each_traffic_class(i) { + pfc->requests[i] = pf->stats.priority_xoff_tx[i]; + pfc->indications[i] = pf->stats.priority_xoff_rx[i]; + } + + return 0; +} + +/** + * ice_dcbnl_setpfc - set local IEEE PFC config + * @netdev: pointer to relevant netdev + * @pfc: pointer to struct holding PFC config + */ +static int 
ice_dcbnl_setpfc(struct net_device *netdev, struct ieee_pfc *pfc) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_dcbx_cfg *new_cfg; + int err; + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_IEEE)) + return -EINVAL; + + mutex_lock(&pf->tc_mutex); +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + netdev_err(netdev, "can't set DCB configuration when ADQ is active\n"); + err = ICE_DCB_NO_HW_CHG; + goto pfc_out; + } +#endif /* NETIF_F_HW_TC */ + + new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg; + + if (pfc->pfc_cap) + new_cfg->pfc.pfccap = pfc->pfc_cap; + else + new_cfg->pfc.pfccap = pf->hw.func_caps.common_cap.maxtc; + + new_cfg->pfc.pfcena = pfc->pfc_en; + + err = ice_pf_dcb_cfg(pf, new_cfg, true); + if (err == ICE_DCB_HW_CHG_RST) + ice_dcbnl_devreset(netdev); + if (err == ICE_DCB_NO_HW_CHG) + err = ICE_DCB_HW_CHG_RST; +#ifdef NETIF_F_HW_TC +pfc_out: +#endif /* NETIF_F_HW_TC */ + mutex_unlock(&pf->tc_mutex); + return err; +} + +/** + * ice_dcbnl_get_pfc_cfg - Get CEE PFC config + * @netdev: pointer to netdev struct + * @prio: corresponding user priority + * @setting: the PFC setting for given priority + */ +static void +ice_dcbnl_get_pfc_cfg(struct net_device *netdev, int prio, u8 *setting) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_port_info *pi = pf->hw.port_info; + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return; + + if (prio >= ICE_MAX_USER_PRIORITY) + return; + + *setting = (pi->qos_cfg.local_dcbx_cfg.pfc.pfcena >> prio) & 0x1; + dev_dbg(ice_pf_to_dev(pf), "Get PFC Config up=%d, setting=%d, pfcenable=0x%x\n", + prio, *setting, pi->qos_cfg.local_dcbx_cfg.pfc.pfcena); +} + +/** + * ice_dcbnl_set_pfc_cfg - Set CEE PFC config + * @netdev: the corresponding netdev + * @prio: User Priority + * @set: PFC setting to apply + */ +static void ice_dcbnl_set_pfc_cfg(struct net_device *netdev, int prio, u8 set) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_dcbx_cfg *new_cfg; + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return; + + if (prio >= ICE_MAX_USER_PRIORITY) + return; + + new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg; + + new_cfg->pfc.pfccap = pf->hw.func_caps.common_cap.maxtc; + if (set) + new_cfg->pfc.pfcena |= BIT(prio); + else + new_cfg->pfc.pfcena &= ~BIT(prio); + + dev_dbg(ice_pf_to_dev(pf), "Set PFC config UP:%d set:%d pfcena:0x%x\n", + prio, set, new_cfg->pfc.pfcena); +} + +/** + * ice_dcbnl_getpfcstate - get CEE PFC mode + * @netdev: pointer to netdev struct + */ +static u8 ice_dcbnl_getpfcstate(struct net_device *netdev) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_port_info *pi = pf->hw.port_info; + + /* Return enabled if any UP enabled for PFC */ + if (pi->qos_cfg.local_dcbx_cfg.pfc.pfcena) + return 1; + + return 0; +} + +/** + * ice_dcbnl_getstate - get DCB enabled state + * @netdev: pointer to netdev struct + */ +static u8 ice_dcbnl_getstate(struct net_device *netdev) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + u8 state = 0; + + state = test_bit(ICE_FLAG_DCB_CAPABLE, pf->flags); + + dev_dbg(ice_pf_to_dev(pf), "DCB enabled state = %d\n", state); + return state; +} + +/** + * ice_dcbnl_setstate - Set CEE DCB state + * @netdev: pointer to relevant netdev + * @state: state value to set + */ +static u8 ice_dcbnl_setstate(struct net_device *netdev, u8 state) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + + if ((pf->dcbx_cap & 
DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return ICE_DCB_NO_HW_CHG; + + /* Nothing to do */ + if (!!state == test_bit(ICE_FLAG_DCB_ENA, pf->flags)) + return ICE_DCB_NO_HW_CHG; + + if (state) { + set_bit(ICE_FLAG_DCB_ENA, pf->flags); + memcpy(&pf->hw.port_info->qos_cfg.desired_dcbx_cfg, + &pf->hw.port_info->qos_cfg.local_dcbx_cfg, + sizeof(struct ice_dcbx_cfg)); + } else { + clear_bit(ICE_FLAG_DCB_ENA, pf->flags); + } + + return ICE_DCB_HW_CHG; +} + +/** + * ice_dcbnl_get_pg_tc_cfg_tx - get CEE PG Tx config + * @netdev: pointer to netdev struct + * @prio: the corresponding user priority + * @prio_type: traffic priority type + * @pgid: the BW group ID the traffic class belongs to + * @bw_pct: BW percentage for the corresponding BWG + * @up_map: prio mapped to corresponding TC + */ +static void +ice_dcbnl_get_pg_tc_cfg_tx(struct net_device *netdev, int prio, + u8 __always_unused *prio_type, u8 *pgid, + u8 __always_unused *bw_pct, + u8 __always_unused *up_map) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_port_info *pi = pf->hw.port_info; + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return; + + if (prio >= ICE_MAX_USER_PRIORITY) + return; + + *pgid = pi->qos_cfg.local_dcbx_cfg.etscfg.prio_table[prio]; + dev_dbg(ice_pf_to_dev(pf), "Get PG config prio=%d tc=%d\n", prio, + *pgid); +} + +/** + * ice_dcbnl_set_pg_tc_cfg_tx - set CEE PG Tx config + * @netdev: pointer to relevant netdev + * @tc: the corresponding traffic class + * @prio_type: the traffic priority type + * @bwg_id: the BW group ID the TC belongs to + * @bw_pct: the BW perventage for the BWG + * @up_map: prio mapped to corresponding TC + */ +static void +ice_dcbnl_set_pg_tc_cfg_tx(struct net_device *netdev, int tc, + u8 __always_unused prio_type, + u8 __always_unused bwg_id, + u8 __always_unused bw_pct, u8 up_map) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_dcbx_cfg *new_cfg; + int i; + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return; + + if (tc >= ICE_MAX_TRAFFIC_CLASS) + return; + + new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg; + + /* prio_type, bwg_id and bw_pct per UP are not supported */ + + ice_for_each_traffic_class(i) { + if (up_map & BIT(i)) + new_cfg->etscfg.prio_table[i] = tc; + } + new_cfg->etscfg.tsatable[tc] = ICE_IEEE_TSA_ETS; +} + +/** + * ice_dcbnl_get_pg_bwg_cfg_tx - Get CEE PGBW config + * @netdev: pointer to the netdev struct + * @pgid: corresponding traffic class + * @bw_pct: the BW percentage for the corresponding TC + */ +static void +ice_dcbnl_get_pg_bwg_cfg_tx(struct net_device *netdev, int pgid, u8 *bw_pct) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_port_info *pi = pf->hw.port_info; + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return; + + if (pgid >= ICE_MAX_TRAFFIC_CLASS) + return; + + *bw_pct = pi->qos_cfg.local_dcbx_cfg.etscfg.tcbwtable[pgid]; + dev_dbg(ice_pf_to_dev(pf), "Get PG BW config tc=%d bw_pct=%d\n", + pgid, *bw_pct); +} + +/** + * ice_dcbnl_set_pg_bwg_cfg_tx - set CEE PG Tx BW config + * @netdev: the corresponding netdev + * @pgid: Correspongind traffic class + * @bw_pct: the BW percentage for the specified TC + */ +static void +ice_dcbnl_set_pg_bwg_cfg_tx(struct net_device *netdev, int pgid, u8 bw_pct) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_dcbx_cfg *new_cfg; + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + 
!(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return; + + if (pgid >= ICE_MAX_TRAFFIC_CLASS) + return; + + new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg; + + new_cfg->etscfg.tcbwtable[pgid] = bw_pct; +} + +/** + * ice_dcbnl_get_pg_tc_cfg_rx - Get CEE PG Rx config + * @netdev: pointer to netdev struct + * @prio: the corresponding user priority + * @prio_type: the traffic priority type + * @pgid: the PG ID + * @bw_pct: the BW percentage for the corresponding BWG + * @up_map: prio mapped to corresponding TC + */ +static void +ice_dcbnl_get_pg_tc_cfg_rx(struct net_device *netdev, int prio, + u8 __always_unused *prio_type, u8 *pgid, + u8 __always_unused *bw_pct, + u8 __always_unused *up_map) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_port_info *pi = pf->hw.port_info; + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return; + + if (prio >= ICE_MAX_USER_PRIORITY) + return; + + *pgid = pi->qos_cfg.local_dcbx_cfg.etscfg.prio_table[prio]; +} + +/** + * ice_dcbnl_set_pg_tc_cfg_rx + * @netdev: relevant netdev struct + * @prio: corresponding user priority + * @prio_type: the traffic priority type + * @pgid: the PG ID + * @bw_pct: BW percentage for corresponding BWG + * @up_map: prio mapped to corresponding TC + * + * lldpad requires this function pointer to be non-NULL to complete CEE config. + */ +static void +ice_dcbnl_set_pg_tc_cfg_rx(struct net_device *netdev, + int __always_unused prio, + u8 __always_unused prio_type, + u8 __always_unused pgid, + u8 __always_unused bw_pct, + u8 __always_unused up_map) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + + dev_dbg(ice_pf_to_dev(pf), "Rx TC PG Config Not Supported.\n"); +} + +/** + * ice_dcbnl_get_pg_bwg_cfg_rx - Get CEE PG BW Rx config + * @netdev: pointer to netdev struct + * @pgid: the corresponding traffic class + * @bw_pct: the BW percentage for the corresponding TC + */ +static void +ice_dcbnl_get_pg_bwg_cfg_rx(struct net_device *netdev, int __always_unused pgid, + u8 *bw_pct) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return; + + *bw_pct = 0; +} + +/** + * ice_dcbnl_set_pg_bwg_cfg_rx + * @netdev: the corresponding netdev + * @pgid: corresponding TC + * @bw_pct: BW percentage for given TC + * + * lldpad requires this function pointer to be non-NULL to complete CEE config. 
+ */ +static void +ice_dcbnl_set_pg_bwg_cfg_rx(struct net_device *netdev, int __always_unused pgid, + u8 __always_unused bw_pct) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + + dev_dbg(ice_pf_to_dev(pf), "Rx BWG PG Config Not Supported.\n"); +} + +/** + * ice_dcbnl_get_cap - Get DCBX capabilities of adapter + * @netdev: pointer to netdev struct + * @capid: the capability type + * @cap: the capability value + */ +static u8 ice_dcbnl_get_cap(struct net_device *netdev, int capid, u8 *cap) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + + if (!(test_bit(ICE_FLAG_DCB_CAPABLE, pf->flags))) + return ICE_DCB_NO_HW_CHG; + + switch (capid) { + case DCB_CAP_ATTR_PG: + *cap = true; + break; + case DCB_CAP_ATTR_PFC: + *cap = true; + break; + case DCB_CAP_ATTR_UP2TC: + *cap = false; + break; + case DCB_CAP_ATTR_PG_TCS: + *cap = 0x80; + break; + case DCB_CAP_ATTR_PFC_TCS: + *cap = 0x80; + break; + case DCB_CAP_ATTR_GSP: + *cap = false; + break; + case DCB_CAP_ATTR_BCN: + *cap = false; + break; + case DCB_CAP_ATTR_DCBX: + *cap = pf->dcbx_cap; + break; + default: + *cap = false; + break; + } + + dev_dbg(ice_pf_to_dev(pf), "DCBX Get Capability cap=%d capval=0x%x\n", + capid, *cap); + return 0; +} + +/** + * ice_dcbnl_getapp - get CEE APP + * @netdev: pointer to netdev struct + * @idtype: the App selector + * @id: the App ethtype or port number + */ +#ifdef HAVE_DCBNL_OPS_SETAPP_RETURN_INT +static int ice_dcbnl_getapp(struct net_device *netdev, u8 idtype, u16 id) +#else +static u8 ice_dcbnl_getapp(struct net_device *netdev, u8 idtype, u16 id) +#endif /* HAVE_DCBNL_OPS_SETAPP_RETURN_INT */ +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct dcb_app app = { + .selector = idtype, + .protocol = id, + }; + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return -EINVAL; + + return dcb_getapp(netdev, &app); +} + +/** + * ice_dcbnl_find_app - Search for APP in given DCB config + * @cfg: struct to hold DCBX config + * @app: struct to hold app data to look for + */ +static bool +ice_dcbnl_find_app(struct ice_dcbx_cfg *cfg, + struct ice_dcb_app_priority_table *app) +{ + unsigned int i; + + for (i = 0; i < cfg->numapps; i++) { + if (app->selector == cfg->app[i].selector && + app->prot_id == cfg->app[i].prot_id && + app->priority == cfg->app[i].priority) + return true; + } + + return false; +} + +#define ICE_BYTES_PER_DSCP_VAL 8 + +/** + * ice_dcbnl_setapp - set local IEEE App config + * @netdev: relevant netdev struct + * @app: struct to hold app config info + */ +static int ice_dcbnl_setapp(struct net_device *netdev, struct dcb_app *app) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_dcb_app_priority_table new_app; + struct ice_dcbx_cfg *old_cfg, *new_cfg; + u8 max_tc; + int ret; + + /* ONLY DSCP APP TLVs have operational significance */ + if (app->selector != IEEE_8021QAZ_APP_SEL_DSCP) + return -EINVAL; + + /* only allow APP TLVs in SW Mode */ + if (pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) { + netdev_err(netdev, "can't do DSCP QoS when FW DCB agent active\n"); + return -EINVAL; + } + + if (!(pf->dcbx_cap & DCB_CAP_DCBX_VER_IEEE)) + return -EINVAL; + + if (!ice_is_feature_supported(pf, ICE_F_DSCP)) + return -EOPNOTSUPP; + +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + netdev_err(netdev, + "can't set DCB configuration when ADQ is active\n"); + return ICE_DCB_NO_HW_CHG; + } +#endif /* NETIF_F_HW_TC */ + if (app->protocol >= ICE_DSCP_NUM_VAL) { + netdev_err(netdev, "DSCP value 0x%04X out of range\n", + app->protocol); + return 
-EINVAL; + } + + max_tc = pf->hw.func_caps.common_cap.maxtc; + if (app->priority >= max_tc) { + netdev_err(netdev, "TC %d out of range, max TC %d\n", + app->priority, max_tc); + return -EINVAL; + } + + + /* grab TC mutex */ + mutex_lock(&pf->tc_mutex); + + new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg; + old_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; + + ret = dcb_ieee_setapp(netdev, app); + if (ret) + goto setapp_out; + + if (test_and_set_bit(app->protocol, new_cfg->dscp_mapped)) { + netdev_err(netdev, "DSCP value 0x%04X already user mapped\n", + app->protocol); + ret = dcb_ieee_delapp(netdev, app); + if (ret) + netdev_err(netdev, "Failed to delete re-mapping TLV\n"); + ret = -EINVAL; + goto setapp_out; + } + + new_app.selector = app->selector; + new_app.prot_id = app->protocol; + new_app.priority = app->priority; + + /* If port is not in DSCP mode, need to set */ + if (old_cfg->pfc_mode == ICE_QOS_MODE_VLAN) { + int i, j; + + /* set DSCP mode */ + ret = ice_aq_set_pfc_mode(&pf->hw, ICE_AQC_PFC_DSCP_BASED_PFC, + NULL); + if (ret) { + netdev_err(netdev, "Failed to set DSCP PFC mode %d\n", + ret); + goto setapp_out; + } + netdev_info(netdev, "Switched QoS to L3 DSCP mode\n"); + + new_cfg->pfc_mode = ICE_QOS_MODE_DSCP; + + /* set default DSCP QoS values */ + new_cfg->etscfg.willing = 0; + new_cfg->pfc.pfccap = max_tc; + new_cfg->pfc.willing = 0; + + for (i = 0; i < max_tc; i++) + for (j = 0; j < ICE_BYTES_PER_DSCP_VAL; j++) { + int dscp, offset; + + dscp = (i * max_tc) + j; + offset = max_tc * ICE_BYTES_PER_DSCP_VAL; + + new_cfg->dscp_map[dscp] = i; + /* if less that 8 TCs supported */ + if (max_tc < ICE_MAX_TRAFFIC_CLASS) + new_cfg->dscp_map[dscp + offset] = i; + } + + new_cfg->etscfg.tcbwtable[0] = 100; + new_cfg->etscfg.tsatable[0] = ICE_IEEE_TSA_ETS; + new_cfg->etscfg.prio_table[0] = 0; + + for (i = 1; i < max_tc; i++) { + new_cfg->etscfg.tcbwtable[i] = 0; + new_cfg->etscfg.tsatable[i] = ICE_IEEE_TSA_ETS; + new_cfg->etscfg.prio_table[i] = i; + } + } /* end of switching to DSCP mode */ + + /* apply new mapping for this DSCP value */ + new_cfg->dscp_map[app->protocol] = app->priority; + new_cfg->app[new_cfg->numapps++] = new_app; + + ret = ice_pf_dcb_cfg(pf, new_cfg, true); + /* return of zero indicates new cfg applied */ + if (ret == ICE_DCB_HW_CHG_RST) + ice_dcbnl_devreset(netdev); + else + ret = ICE_DCB_NO_HW_CHG; + +setapp_out: + mutex_unlock(&pf->tc_mutex); + return ret; +} + +/** + * ice_dcbnl_delapp - Delete local IEEE App config + * @netdev: relevant netdev + * @app: struct to hold app too delete + * + * Will not delete first application required by the FW + */ +static int ice_dcbnl_delapp(struct net_device *netdev, struct dcb_app *app) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_dcbx_cfg *old_cfg, *new_cfg; + unsigned int i, j; + int ret = 0; + + if (pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) { + netdev_err(netdev, "can't delete DSCP netlink app when FW DCB agent is active\n"); + return -EINVAL; + } + + mutex_lock(&pf->tc_mutex); +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + netdev_err(netdev, "can't set DCB configuration when ADQ is active\n"); + ret = ICE_DCB_NO_HW_CHG; + goto delapp_out; + } +#endif /* NETIF_F_HW_TC */ + old_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; + + ret = dcb_ieee_delapp(netdev, app); + if (ret) + goto delapp_out; + + new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg; + + for (i = 0; i < new_cfg->numapps; i++) { + if (app->selector == new_cfg->app[i].selector && + app->protocol == new_cfg->app[i].prot_id 
&& + app->priority == new_cfg->app[i].priority) { + new_cfg->app[i].selector = 0; + new_cfg->app[i].prot_id = 0; + new_cfg->app[i].priority = 0; + break; + } + } + + /* Did not find DCB App */ + if (i == new_cfg->numapps) { + ret = -EINVAL; + goto delapp_out; + } + + new_cfg->numapps--; + + for (j = i; j < new_cfg->numapps; j++) { + new_cfg->app[j].selector = old_cfg->app[j + 1].selector; + new_cfg->app[j].prot_id = old_cfg->app[j + 1].prot_id; + new_cfg->app[j].priority = old_cfg->app[j + 1].priority; + } + + /* if not a DSCP APP TLV or DSCP is not supported, we are done */ + if (app->selector != IEEE_8021QAZ_APP_SEL_DSCP || + !ice_is_feature_supported(pf, ICE_F_DSCP)) { + ret = ICE_DCB_HW_CHG; + goto delapp_out; + } + + /* if DSCP TLV, then need to address change in mapping */ + clear_bit(app->protocol, new_cfg->dscp_mapped); + /* remap this DSCP value to default value */ + new_cfg->dscp_map[app->protocol] = app->protocol % + ICE_BYTES_PER_DSCP_VAL; + + /* if the last DSCP mapping just got deleted, need to switch + * to L2 VLAN QoS mode + */ + if (bitmap_empty(new_cfg->dscp_mapped, ICE_DSCP_NUM_VAL) && + new_cfg->pfc_mode == ICE_QOS_MODE_DSCP) { + + ret = ice_aq_set_pfc_mode(&pf->hw, + ICE_AQC_PFC_VLAN_BASED_PFC, + NULL); + if (ret) { + netdev_info(netdev, "Failed to set VLAN PFC mode %d\n", + ret); + goto delapp_out; + } + netdev_info(netdev, "Switched QoS to L2 VLAN mode\n"); + + new_cfg->pfc_mode = ICE_QOS_MODE_VLAN; + + ret = ice_dcb_sw_dflt_cfg(pf, true, true); + } else { + ret = ice_pf_dcb_cfg(pf, new_cfg, true); + } + + /* return of ICE_DCB_HW_CHG_RST indicates new cfg applied + * and reset needs to be performed + */ + if (ret == ICE_DCB_HW_CHG_RST) + ice_dcbnl_devreset(netdev); + + /* if the change was not siginificant enough to actually call + * the reconfiguration flow, we still need to tell caller that + * their request was successfully handled + */ + if (ret == ICE_DCB_NO_HW_CHG) + ret = ICE_DCB_HW_CHG; + +delapp_out: + mutex_unlock(&pf->tc_mutex); + return ret; +} + +/** + * ice_dcbnl_cee_set_all - Commit CEE DCB settings to HW + * @netdev: the corresponding netdev + */ +static u8 ice_dcbnl_cee_set_all(struct net_device *netdev) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_dcbx_cfg *new_cfg; + int err; + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return ICE_DCB_NO_HW_CHG; + + new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg; + + mutex_lock(&pf->tc_mutex); +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + netdev_err(netdev, "can't set DCB configuration when ADQ is active\n"); + err = ICE_DCB_NO_HW_CHG; + goto out; + } +#endif /* NETIF_F_HW_TC */ + + err = ice_pf_dcb_cfg(pf, new_cfg, true); + +#ifdef NETIF_F_HW_TC +out: +#endif /* NETIF_F_HW_TC */ + mutex_unlock(&pf->tc_mutex); + return (err != ICE_DCB_HW_CHG_RST) ? 
ICE_DCB_NO_HW_CHG : err; +} + +static const struct dcbnl_rtnl_ops dcbnl_ops = { + /* IEEE 802.1Qaz std */ + .ieee_getets = ice_dcbnl_getets, + .ieee_setets = ice_dcbnl_setets, + .ieee_getpfc = ice_dcbnl_getpfc, + .ieee_setpfc = ice_dcbnl_setpfc, + .ieee_setapp = ice_dcbnl_setapp, + .ieee_delapp = ice_dcbnl_delapp, + + /* CEE std */ + .getstate = ice_dcbnl_getstate, + .setstate = ice_dcbnl_setstate, + .getpermhwaddr = ice_dcbnl_get_perm_hw_addr, + .setpgtccfgtx = ice_dcbnl_set_pg_tc_cfg_tx, + .setpgbwgcfgtx = ice_dcbnl_set_pg_bwg_cfg_tx, + .setpgtccfgrx = ice_dcbnl_set_pg_tc_cfg_rx, + .setpgbwgcfgrx = ice_dcbnl_set_pg_bwg_cfg_rx, + .getpgtccfgtx = ice_dcbnl_get_pg_tc_cfg_tx, + .getpgbwgcfgtx = ice_dcbnl_get_pg_bwg_cfg_tx, + .getpgtccfgrx = ice_dcbnl_get_pg_tc_cfg_rx, + .getpgbwgcfgrx = ice_dcbnl_get_pg_bwg_cfg_rx, + .setpfccfg = ice_dcbnl_set_pfc_cfg, + .getpfccfg = ice_dcbnl_get_pfc_cfg, + .setall = ice_dcbnl_cee_set_all, + .getcap = ice_dcbnl_get_cap, + .getnumtcs = ice_dcbnl_getnumtcs, + .getpfcstate = ice_dcbnl_getpfcstate, + .getapp = ice_dcbnl_getapp, + + /* DCBX configuration */ + .getdcbx = ice_dcbnl_getdcbx, + .setdcbx = ice_dcbnl_setdcbx, +}; + +/** + * ice_dcbnl_set_all - set all the apps and ieee data from DCBX config + * @vsi: pointer to VSI struct + */ +void ice_dcbnl_set_all(struct ice_vsi *vsi) +{ + struct net_device *netdev = vsi->netdev; + struct ice_dcbx_cfg *dcbxcfg; + struct ice_port_info *pi; + struct dcb_app sapp; + struct ice_pf *pf; + unsigned int i; + + if (!netdev) + return; + + pf = ice_netdev_to_pf(netdev); + pi = pf->hw.port_info; + + /* SW DCB taken care of by SW Default Config */ + if (pf->dcbx_cap & DCB_CAP_DCBX_HOST) + return; + + /* DCB not enabled */ + if (!test_bit(ICE_FLAG_DCB_ENA, pf->flags)) + return; + + dcbxcfg = &pi->qos_cfg.local_dcbx_cfg; + + for (i = 0; i < dcbxcfg->numapps; i++) { + u8 prio, tc_map; + + prio = dcbxcfg->app[i].priority; + tc_map = BIT(dcbxcfg->etscfg.prio_table[prio]); + + /* Add APP only if the TC is enabled for this VSI */ + if (tc_map & vsi->tc_cfg.ena_tc) { + sapp.selector = dcbxcfg->app[i].selector; + sapp.protocol = dcbxcfg->app[i].prot_id; + sapp.priority = prio; + dcb_ieee_setapp(netdev, &sapp); + } + } +#ifdef HAVE_DCBNL_IEEE_DELAPP + /* Notify user-space of the changes */ + dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_SET, 0, 0); +#endif /* HAVE_DCBNL_IEEE_DELAPP */ +} + +/** + * ice_dcbnl_vsi_del_app - Delete APP on all VSIs + * @vsi: pointer to the main VSI + * @app: APP to delete + * + * Delete given APP from all the VSIs for given PF + */ +static void +ice_dcbnl_vsi_del_app(struct ice_vsi *vsi, + struct ice_dcb_app_priority_table *app) +{ + struct dcb_app sapp; + int err; + + sapp.selector = app->selector; + sapp.protocol = app->prot_id; + sapp.priority = app->priority; + err = ice_dcbnl_delapp(vsi->netdev, &sapp); + dev_dbg(ice_pf_to_dev(vsi->back), "Deleting app for VSI idx=%d err=%d sel=%d proto=0x%x, prio=%d\n", + vsi->idx, err, app->selector, app->prot_id, app->priority); +} + +/** + * ice_dcbnl_flush_apps - Delete all removed APPs + * @pf: the corresponding PF + * @old_cfg: old DCBX configuration data + * @new_cfg: new DCBX configuration data + * + * Find and delete all APPS that are not present in the passed + * DCB configuration + */ +void +ice_dcbnl_flush_apps(struct ice_pf *pf, struct ice_dcbx_cfg *old_cfg, + struct ice_dcbx_cfg *new_cfg) +{ + struct ice_vsi *main_vsi = ice_get_main_vsi(pf); + unsigned int i; + + if (!main_vsi) + return; + + for (i = 0; i < old_cfg->numapps; i++) { + struct 
ice_dcb_app_priority_table app = old_cfg->app[i]; + + /* The APP is not available anymore delete it */ + if (!ice_dcbnl_find_app(new_cfg, &app)) + ice_dcbnl_vsi_del_app(main_vsi, &app); + } +} + +/** + * ice_dcbnl_setup - setup DCBNL + * @vsi: VSI to get associated netdev from + */ +void ice_dcbnl_setup(struct ice_vsi *vsi) +{ + struct net_device *netdev = vsi->netdev; + struct ice_pf *pf; + + pf = ice_netdev_to_pf(netdev); + if (!test_bit(ICE_FLAG_DCB_CAPABLE, pf->flags)) + return; + + netdev->dcbnl_ops = &dcbnl_ops; + ice_dcbnl_set_all(vsi); +} diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_nl.h b/drivers/net/ethernet/intel/ice/ice_dcb_nl.h new file mode 100644 index 000000000000..69bea270e372 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_dcb_nl.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_DCB_NL_H_ +#define _ICE_DCB_NL_H_ + +#ifdef CONFIG_DCB +void ice_dcbnl_setup(struct ice_vsi *vsi); +void ice_dcbnl_set_all(struct ice_vsi *vsi); +void +ice_dcbnl_flush_apps(struct ice_pf *pf, struct ice_dcbx_cfg *old_cfg, + struct ice_dcbx_cfg *new_cfg); +#else +static inline void ice_dcbnl_setup(struct ice_vsi *vsi) { } +static inline void ice_dcbnl_set_all(struct ice_vsi *vsi) { } +static inline void ice_dcbnl_flush_apps(struct ice_pf *pf, + struct ice_dcbx_cfg *old_cfg, + struct ice_dcbx_cfg *new_cfg) { } +#endif /* CONFIG_DCB */ +#endif /* _ICE_DCB_NL_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_dcf.c b/drivers/net/ethernet/intel/ice/ice_dcf.c new file mode 100644 index 000000000000..7810116d9d3b --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_dcf.c @@ -0,0 +1,1137 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice.h" + +static const enum ice_adminq_opc aqc_permitted_tbl[] = { + /* Generic Firmware Admin commands */ + ice_aqc_opc_get_ver, + ice_aqc_opc_req_res, + ice_aqc_opc_release_res, + ice_aqc_opc_list_func_caps, + ice_aqc_opc_list_dev_caps, + + ice_aqc_opc_get_vlan_mode_parameters, + + /* Package Configuration Admin Commands */ + ice_aqc_opc_update_pkg, + ice_aqc_opc_get_pkg_info_list, + + /* PHY commands */ + ice_aqc_opc_get_phy_caps, + ice_aqc_opc_get_link_status, + + /* Switch Block */ + ice_aqc_opc_get_sw_cfg, + ice_aqc_opc_alloc_res, + ice_aqc_opc_free_res, + ice_aqc_opc_add_recipe, + ice_aqc_opc_recipe_to_profile, + ice_aqc_opc_get_recipe, + ice_aqc_opc_get_recipe_to_profile, + ice_aqc_opc_add_sw_rules, + ice_aqc_opc_update_sw_rules, + ice_aqc_opc_remove_sw_rules, + + /* ACL commands */ + ice_aqc_opc_alloc_acl_tbl, + ice_aqc_opc_dealloc_acl_tbl, + ice_aqc_opc_alloc_acl_actpair, + ice_aqc_opc_dealloc_acl_actpair, + ice_aqc_opc_alloc_acl_scen, + ice_aqc_opc_dealloc_acl_scen, + ice_aqc_opc_alloc_acl_counters, + ice_aqc_opc_dealloc_acl_counters, + ice_aqc_opc_dealloc_acl_res, + ice_aqc_opc_update_acl_scen, + ice_aqc_opc_program_acl_actpair, + ice_aqc_opc_program_acl_prof_extraction, + ice_aqc_opc_program_acl_prof_ranges, + ice_aqc_opc_program_acl_entry, + ice_aqc_opc_query_acl_prof, + ice_aqc_opc_query_acl_prof_ranges, + ice_aqc_opc_query_acl_scen, + ice_aqc_opc_query_acl_entry, + ice_aqc_opc_query_acl_actpair, + ice_aqc_opc_query_acl_counter, +}; + +/** + * ice_dcf_aq_cmd_permitted - validate the AdminQ command permitted or not + * @desc: descriptor describing the command + */ +bool ice_dcf_aq_cmd_permitted(struct ice_aq_desc *desc) +{ + u16 opc = le16_to_cpu(desc->opcode); + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(aqc_permitted_tbl); i++) + if (opc == aqc_permitted_tbl[i]) + return true; + + return false; +} + +/** + * ice_dcf_is_acl_aq_cmd - check if the AdminQ command is ACL command + * @desc: descriptor describing the command + */ +bool ice_dcf_is_acl_aq_cmd(struct ice_aq_desc *desc) +{ + u16 opc = le16_to_cpu(desc->opcode); + + if (opc >= ice_aqc_opc_alloc_acl_tbl && + opc <= ice_aqc_opc_query_acl_counter) + return true; + + return false; +} + +/** + * ice_dcf_is_udp_tunnel_aq_cmd - check if the AdminQ command is UDP tunnel + * command + * @desc: descriptor describing the command + * @aq_buf: AdminQ buffer + */ +bool ice_dcf_is_udp_tunnel_aq_cmd(struct ice_aq_desc *desc, u8 *aq_buf) +{ + struct ice_buf_hdr *pkg_buf; + + if (!aq_buf) + return false; + + if (le16_to_cpu(desc->opcode) != ice_aqc_opc_update_pkg) + return false; + + pkg_buf = (struct ice_buf_hdr *)aq_buf; + /* section count for udp tunnel command is always 2 */ + if (le16_to_cpu(pkg_buf->section_count) != 2) + return false; + + if (le32_to_cpu(pkg_buf->section_entry[0].type) == + ICE_SID_RXPARSER_BOOST_TCAM || + le32_to_cpu(pkg_buf->section_entry[0].type) == + ICE_SID_TXPARSER_BOOST_TCAM) + return true; + + return false; +} + +/** + * ice_check_dcf_allowed - check if DCF is allowed based on various checks + * @vf: pointer to the VF to check + */ +bool ice_check_dcf_allowed(struct ice_vf *vf) +{ + struct ice_switch_info *sw; + struct ice_pf *pf = vf->pf; + struct device *dev; + u16 i; + + dev = ice_pf_to_dev(pf); + + if (vf->vf_id != ICE_DCF_VFID0 && vf->vf_id != ICE_DCF_VFID1) { + dev_err(dev, "VF %d requested DCF capability, but only VF %d and %d are allowed to request DCF capability\n", + vf->vf_id, ICE_DCF_VFID0, ICE_DCF_VFID1); + return false; + } + + if (!vf->trusted) { +#ifdef 
HAVE_NDO_SET_VF_TRUST + dev_err(dev, "VF needs to be trusted to configure DCF capability\n"); + return false; +#else + + int ret; + ret = ice_set_vf_trust(ice_get_main_vsi(pf)->netdev, vf->vf_id, true); + if (ret) { + dev_err(dev, "Failed to set trusted VF to configure DCF capability.\n"); + return false; + } +#endif /* HAVE_NDO_SET_VF_TRUST */ + } + + /* DCF and ADQ are mutually exclusive. */ +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + dev_err(dev, "ADQ on PF is currently enabled. Device Control Functionality cannot be enabled.\n"); + return false; + } +#endif /* NETIF_F_HW_TC */ + ice_for_each_vf(pf, i) { + if (pf->vf[i].adq_enabled) { + dev_err(dev, "ADQ on VF %d is currently enabled. Device Control Functionality cannot be enabled.\n", + pf->vf[i].vf_id); + return false; + } + } + +#ifdef HAVE_TC_SETUP_CLSFLOWER + if (!hlist_empty(&pf->tc_flower_fltr_list)) { + dev_err(dev, "TC filters on PF are currently in use. Device Control Functionality cannot be enabled.\n"); + return false; + } +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + ice_for_each_vf(pf, i) { + if (pf->vf[i].num_dmac_chnl_fltrs) { + dev_err(dev, "TC filters on VF %d are currently in use. Device Control Functionality cannot be enabled.\n", + pf->vf[i].vf_id); + return false; + } + } + +#ifdef HAVE_NETDEV_SB_DEV + if (ice_is_offloaded_macvlan_ena(pf)) { + dev_err(dev, "L2 Forwarding Offload is currently enabled. Device Control Functionality cannot be enabled.\n"); + return false; + } +#endif /* HAVE_NETDEV_SB_DEV */ + + sw = pf->hw.switch_info; + for (i = 0; i < ICE_MAX_NUM_RECIPES; i++) { + if (sw->recp_list[i].adv_rule) { + dev_err(dev, "Advanced switch filters are currently in use. Device Control Functionality cannot be enabled.\n"); + return false; + } + } + return true; +} + +/** + * ice_is_dcf_enabled - Check the DCF enabled status of the associated PF + * @pf: PF instance + */ +bool ice_is_dcf_enabled(struct ice_pf *pf) +{ + return !!pf->dcf.vf; +} + +/** + * ice_is_vf_dcf - helper to check if the assigned VF is a DCF + * @vf: the assigned VF to be checked + */ +bool ice_is_vf_dcf(struct ice_vf *vf) +{ + return vf == vf->pf->dcf.vf; +} + +/** + * ice_dcf_get_state - Get DCF state of the associated PF + * @pf: PF instance + */ +enum ice_dcf_state ice_dcf_get_state(struct ice_pf *pf) +{ + return pf->dcf.vf ? 
pf->dcf.state : ICE_DCF_STATE_OFF; +} + +/** + * ice_dcf_state_str - convert DCF state code to a string + * @state: the DCF state code to convert + */ +static const char *ice_dcf_state_str(enum ice_dcf_state state) +{ + switch (state) { + case ICE_DCF_STATE_OFF: + return "ICE_DCF_STATE_OFF"; + case ICE_DCF_STATE_ON: + return "ICE_DCF_STATE_ON"; + case ICE_DCF_STATE_BUSY: + return "ICE_DCF_STATE_BUSY"; + case ICE_DCF_STATE_PAUSE: + return "ICE_DCF_STATE_PAUSE"; + } + + return "ICE_DCF_STATE_UNKNOWN"; +} + +/** + * ice_dcf_set_state - Set DCF state for the associated PF + * @pf: PF instance + * @state: new DCF state + */ +void ice_dcf_set_state(struct ice_pf *pf, enum ice_dcf_state state) +{ + dev_dbg(ice_pf_to_dev(pf), "DCF state is changing from %s to %s\n", + ice_dcf_state_str(pf->dcf.state), + ice_dcf_state_str(state)); + + pf->dcf.state = state; +} + +/** + * ice_dcf_rm_sw_rule_to_vsi - remove switch rule of "forward to VSI" + * @pf: pointer to the PF struct + * @s_entry: pointer to switch rule entry to remove + */ +static int +ice_dcf_rm_sw_rule_to_vsi(struct ice_pf *pf, + struct ice_dcf_sw_rule_entry *s_entry) +{ + struct ice_aqc_sw_rules_elem *s_rule; + enum ice_status status; + + s_rule = kzalloc(ICE_SW_RULE_RX_TX_NO_HDR_SIZE, GFP_KERNEL); + if (!s_rule) + return -ENOMEM; + + s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_RX); + s_rule->pdata.lkup_tx_rx.act = 0; + s_rule->pdata.lkup_tx_rx.hdr_len = 0; + s_rule->pdata.lkup_tx_rx.index = cpu_to_le16(s_entry->rule_id); + status = ice_aq_sw_rules(&pf->hw, s_rule, ICE_SW_RULE_RX_TX_NO_HDR_SIZE, + 1, ice_aqc_opc_remove_sw_rules, NULL); + kfree(s_rule); + if (status) + return -EIO; + + list_del(&s_entry->list_entry); + kfree(s_entry); + return 0; +} + +/** + * ice_dcf_rm_sw_rule_to_vsi_list - remove switch rule of "forward to VSI list" + * @pf: pointer to the PF struct + * @s_entry: pointer to switch rule entry to remove + */ +static int +ice_dcf_rm_sw_rule_to_vsi_list(struct ice_pf *pf, + struct ice_dcf_sw_rule_entry *s_entry) +{ + struct ice_dcf_vsi_list_info *vsi_list_info = s_entry->vsi_list_info; + struct ice_aqc_alloc_free_res_elem *res_buf; + struct ice_aqc_sw_rules_elem *s_rule; + enum ice_status status; + u16 rule_sz; + u16 vsi_id; + int i = 0; + + if (!vsi_list_info) + return -EINVAL; + + /* The VSI list is empty, it can be freed immediately */ + if (!vsi_list_info->vsi_count) + goto free_vsi_list; + + rule_sz = ICE_SW_RULE_VSI_LIST_SIZE(vsi_list_info->vsi_count); + s_rule = kzalloc(rule_sz, GFP_KERNEL); + if (!s_rule) + return -ENOMEM; + + s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR); + s_rule->pdata.vsi_list.index = cpu_to_le16(vsi_list_info->list_id); + s_rule->pdata.vsi_list.number_vsi = + cpu_to_le16(vsi_list_info->vsi_count); + for_each_set_bit(vsi_id, vsi_list_info->hw_vsi_map, ICE_HW_VSI_ID_MAX) + s_rule->pdata.vsi_list.vsi[i++] = cpu_to_le16(vsi_id); + + bitmap_zero(vsi_list_info->hw_vsi_map, ICE_HW_VSI_ID_MAX); + vsi_list_info->vsi_count = 0; + + status = ice_aq_sw_rules(&pf->hw, s_rule, rule_sz, 1, + ice_aqc_opc_update_sw_rules, NULL); + kfree(s_rule); + if (status) + return -EIO; + +free_vsi_list: + res_buf = kzalloc(sizeof(*res_buf), GFP_KERNEL); + if (!res_buf) + return -ENOMEM; + + res_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_VSI_LIST_REP); + res_buf->num_elems = cpu_to_le16(1); + res_buf->elem[0].e.sw_resp = cpu_to_le16(vsi_list_info->list_id); + status = ice_aq_alloc_free_res(&pf->hw, 1, res_buf, sizeof(*res_buf), + ice_aqc_opc_free_res, NULL); + kfree(res_buf); + if (status) + return 
-EIO; + + list_del(&vsi_list_info->list_entry); + kfree(vsi_list_info); + s_entry->vsi_list_info = NULL; + + return ice_dcf_rm_sw_rule_to_vsi(pf, s_entry); +} + +/** + * ice_dcf_rm_vsi_from_list - remove VSI from switch rule forward VSI list + * @pf: pointer to the PF struct + * @vsi_list_info: pointer to the VSI list info + * @hw_vsi_id: the Hardware VSI number + */ +static int +ice_dcf_rm_vsi_from_list(struct ice_pf *pf, + struct ice_dcf_vsi_list_info *vsi_list_info, + u16 hw_vsi_id) +{ + struct ice_aqc_sw_rules_elem *s_rule; + enum ice_status status; + + if (!vsi_list_info || !vsi_list_info->vsi_count || + !test_bit(hw_vsi_id, vsi_list_info->hw_vsi_map)) + return -ENOENT; + + s_rule = kzalloc(ICE_SW_RULE_VSI_LIST_SIZE(1), GFP_KERNEL); + if (!s_rule) + return -ENOMEM; + + s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR); + s_rule->pdata.vsi_list.index = cpu_to_le16(vsi_list_info->list_id); + s_rule->pdata.vsi_list.number_vsi = cpu_to_le16(1); + s_rule->pdata.vsi_list.vsi[0] = cpu_to_le16(hw_vsi_id); + status = ice_aq_sw_rules(&pf->hw, s_rule, + ICE_SW_RULE_VSI_LIST_SIZE(1), 1, + ice_aqc_opc_update_sw_rules, NULL); + kfree(s_rule); + if (status) + return -EIO; + + /* When the VF resets gracefully, it should keep the VSI list and its + * rule, just clears the VSI from list, so that the DCF can replay the + * rule by updating this VF to list successfully. + */ + vsi_list_info->vsi_count--; + clear_bit(hw_vsi_id, vsi_list_info->hw_vsi_map); + + return 0; +} + +/** + * ice_rm_dcf_sw_vsi_rule - remove switch rules added by DCF to VSI + * @pf: pointer to the PF struct + * @hw_vsi_id: hardware VSI ID of the VF + */ +void ice_rm_dcf_sw_vsi_rule(struct ice_pf *pf, u16 hw_vsi_id) +{ + struct ice_dcf_sw_rule_entry *s_entry, *tmp; + int ret; + + list_for_each_entry_safe(s_entry, tmp, &pf->dcf.sw_rule_head, + list_entry) + if (s_entry->fltr_act == ICE_FWD_TO_VSI_LIST) { + ret = ice_dcf_rm_vsi_from_list(pf, + s_entry->vsi_list_info, + hw_vsi_id); + if (ret && ret != -ENOENT) + dev_err(ice_pf_to_dev(pf), + "Failed to remove VSI %u from VSI list : %d\n", + hw_vsi_id, ret); + } else if (s_entry->fwd_id.hw_vsi_id == hw_vsi_id) { + ret = ice_dcf_rm_sw_rule_to_vsi(pf, s_entry); + if (ret) + dev_err(ice_pf_to_dev(pf), + "Failed to remove VSI %u switch rule : %d\n", + hw_vsi_id, ret); + } +} + +/** + * ice_dcf_init_sw_rule_mgmt - initializes DCF rule filter mngt struct + * @pf: pointer to the PF struct + */ +void ice_dcf_init_sw_rule_mgmt(struct ice_pf *pf) +{ + INIT_LIST_HEAD(&pf->dcf.sw_rule_head); + INIT_LIST_HEAD(&pf->dcf.vsi_list_info_head); +} + +/** + * ice_rm_all_dcf_sw_rules - remove switch rules configured by DCF + * @pf: pointer to the PF struct + */ +void ice_rm_all_dcf_sw_rules(struct ice_pf *pf) +{ + struct ice_dcf_vsi_list_info *vsi_list_info, *list_info_tmp; + struct ice_dcf_sw_rule_entry *sw_rule, *rule_tmp; + u16 rule_id, list_id; + int ret; + + list_for_each_entry_safe(sw_rule, rule_tmp, &pf->dcf.sw_rule_head, + list_entry) + if (sw_rule->fltr_act == ICE_FWD_TO_VSI_LIST) { + list_id = sw_rule->fwd_id.vsi_list_id; + rule_id = sw_rule->rule_id; + ret = ice_dcf_rm_sw_rule_to_vsi_list(pf, sw_rule); + if (ret) + dev_err(ice_pf_to_dev(pf), + "Failed to remove switch rule 0x%04x with list id %u : %d\n", + rule_id, list_id, ret); + } else { + rule_id = sw_rule->rule_id; + ret = ice_dcf_rm_sw_rule_to_vsi(pf, sw_rule); + if (ret) + dev_err(ice_pf_to_dev(pf), + "Failed to remove switch rule 0x%04x : %d\n", + rule_id, ret); + } + + /* clears rule filter management data if AdminQ command has 
error */
+	list_for_each_entry_safe(vsi_list_info, list_info_tmp,
+				 &pf->dcf.vsi_list_info_head,
+				 list_entry) {
+		list_del(&vsi_list_info->list_entry);
+		kfree(vsi_list_info);
+	}
+
+	list_for_each_entry_safe(sw_rule, rule_tmp, &pf->dcf.sw_rule_head,
+				 list_entry) {
+		list_del(&sw_rule->list_entry);
+		kfree(sw_rule);
+	}
+}
+
+/**
+ * ice_clear_dcf_acl_cfg - clear the DCF ACL configuration for the PF
+ * @pf: pointer to the PF info
+ */
+void ice_clear_dcf_acl_cfg(struct ice_pf *pf)
+{
+	if (pf->hw.dcf_caps & DCF_ACL_CAP) {
+		ice_acl_destroy_tbl(&pf->hw);
+		ice_init_acl(pf);
+	}
+}
+
+/**
+ * ice_dcf_is_acl_capable - check whether the DCF ACL capability is enabled
+ * @hw: pointer to the hardware info
+ */
+bool ice_dcf_is_acl_capable(struct ice_hw *hw)
+{
+	return hw->dcf_caps & DCF_ACL_CAP;
+}
+
+/**
+ * ice_clear_dcf_udp_tunnel_cfg - clear the DCF UDP tunnel configuration for the PF
+ * @pf: pointer to the PF info
+ */
+void ice_clear_dcf_udp_tunnel_cfg(struct ice_pf *pf)
+{
+	if (pf->hw.dcf_caps & DCF_UDP_TUNNEL_CAP)
+		ice_destroy_tunnel(&pf->hw, 0, true);
+}
+
+/**
+ * ice_dcf_is_udp_tunnel_capable - check whether the DCF UDP tunnel capability is enabled
+ * @hw: pointer to the hardware info
+ */
+bool ice_dcf_is_udp_tunnel_capable(struct ice_hw *hw)
+{
+	return hw->dcf_caps & DCF_UDP_TUNNEL_CAP;
+}
+
+/**
+ * ice_dcf_find_vsi_list_info - find the VSI list by ID
+ * @pf: pointer to the PF info
+ * @vsi_list_id: VSI list ID
+ */
+static struct ice_dcf_vsi_list_info *
+ice_dcf_find_vsi_list_info(struct ice_pf *pf, u16 vsi_list_id)
+{
+	struct ice_dcf_vsi_list_info *list_info;
+
+	list_for_each_entry(list_info, &pf->dcf.vsi_list_info_head, list_entry)
+		if (list_info->list_id == vsi_list_id)
+			return list_info;
+
+	return NULL;
+}
+
+/**
+ * ice_dcf_add_vsi_id - add a new VSI ID into the list
+ * @vsi_list_info: pointer to the VSI list info
+ * @hw_vsi_id: the VSI ID
+ */
+static void
+ice_dcf_add_vsi_id(struct ice_dcf_vsi_list_info *vsi_list_info, u16 hw_vsi_id)
+{
+	if (!test_and_set_bit(hw_vsi_id, vsi_list_info->hw_vsi_map))
+		vsi_list_info->vsi_count++;
+}
+
+/**
+ * ice_dcf_del_vsi_id - delete the VSI ID from the list
+ * @vsi_list_info: pointer to the VSI list info + * @hw_vsi_id: the VSI ID + */ +static void +ice_dcf_del_vsi_id(struct ice_dcf_vsi_list_info *vsi_list_info, u16 hw_vsi_id) +{ + if (test_and_clear_bit(hw_vsi_id, vsi_list_info->hw_vsi_map)) + vsi_list_info->vsi_count--; +} + +/** + * ice_dcf_parse_alloc_vsi_list_res - parse the allocate VSI list resource + * @pf: pointer to the PF info + * @res: pointer to the VSI list resource + */ +static enum virtchnl_status_code +ice_dcf_parse_alloc_vsi_list_res(struct ice_pf *pf, + struct ice_aqc_res_elem *res) +{ + struct ice_dcf_vsi_list_info *vsi_list_info; + u16 list_id = le16_to_cpu(res->e.sw_resp); + + vsi_list_info = ice_dcf_find_vsi_list_info(pf, list_id); + if (vsi_list_info) + return VIRTCHNL_STATUS_SUCCESS; + + vsi_list_info = kzalloc(sizeof(*vsi_list_info), GFP_KERNEL); + if (!vsi_list_info) + return VIRTCHNL_STATUS_ERR_NO_MEMORY; + + vsi_list_info->list_id = list_id; + list_add(&vsi_list_info->list_entry, &pf->dcf.vsi_list_info_head); + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_dcf_parse_free_vsi_list_res - parse the free VSI list resource + * @pf: pointer to the PF info + * @res: pointer to the VSI list resource + */ +static enum virtchnl_status_code +ice_dcf_parse_free_vsi_list_res(struct ice_pf *pf, + struct ice_aqc_res_elem *res) +{ + struct ice_dcf_vsi_list_info *vsi_list_info; + u16 list_id = le16_to_cpu(res->e.sw_resp); + + vsi_list_info = ice_dcf_find_vsi_list_info(pf, list_id); + if (!vsi_list_info) + return VIRTCHNL_STATUS_ERR_PARAM; + + if (vsi_list_info->vsi_count) + dev_warn(ice_pf_to_dev(pf), + "VSI list %u still has %u VSIs to be removed!\n", + list_id, vsi_list_info->vsi_count); + + if (vsi_list_info->sw_rule) + vsi_list_info->sw_rule->vsi_list_info = NULL; + + list_del(&vsi_list_info->list_entry); + kfree(vsi_list_info); + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_dcf_set_vsi_list - set the VSI to VSI list + * @pf: pointer to the PF info + * @vsi_list: pointer to the VSI ID list to be set + */ +static enum virtchnl_status_code +ice_dcf_set_vsi_list(struct ice_pf *pf, struct ice_aqc_sw_rules_elem *vsi_list) +{ + struct ice_dcf_vsi_list_info *vsi_list_info; + int i; + + vsi_list_info = ice_dcf_find_vsi_list_info(pf, + le16_to_cpu(vsi_list->pdata.vsi_list.index)); + if (!vsi_list_info) + return VIRTCHNL_STATUS_ERR_PARAM; + + for (i = 0; i < le16_to_cpu(vsi_list->pdata.vsi_list.number_vsi); i++) + ice_dcf_add_vsi_id(vsi_list_info, + le16_to_cpu(vsi_list->pdata.vsi_list.vsi[i])); + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_dcf_clear_vsi_list - clear the VSI from VSI list + * @pf: pointer to the PF info + * @vsi_list: pointer to the VSI ID list to be cleared + */ +static enum virtchnl_status_code +ice_dcf_clear_vsi_list(struct ice_pf *pf, struct ice_aqc_sw_rules_elem *vsi_list) +{ + struct ice_dcf_vsi_list_info *vsi_list_info; + int i; + + vsi_list_info = ice_dcf_find_vsi_list_info(pf, + le16_to_cpu(vsi_list->pdata.vsi_list.index)); + if (!vsi_list_info) + return VIRTCHNL_STATUS_ERR_PARAM; + + for (i = 0; i < le16_to_cpu(vsi_list->pdata.vsi_list.number_vsi); i++) + ice_dcf_del_vsi_id(vsi_list_info, + le16_to_cpu(vsi_list->pdata.vsi_list.vsi[i])); + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_dcf_find_sw_rule - find the switch rule by ID. 
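For reference, the VSI list set/clear parsing above operates on the same struct ice_aqc_sw_rules_elem layout that ice_dcf_rm_vsi_from_list() fills in when it clears a single VSI from a list. A minimal sketch of that element construction is shown here; demo_build_vsi_list_clear() is a hypothetical helper, not part of this patch.

/* Hypothetical helper: build a one-entry VSI list clear element using the
 * same fields the DCF parsing code reads back (type, index, number_vsi,
 * vsi[]).  The caller would send it with ice_aq_sw_rules() and kfree() it.
 */
static struct ice_aqc_sw_rules_elem *
demo_build_vsi_list_clear(u16 list_id, u16 hw_vsi_id)
{
	struct ice_aqc_sw_rules_elem *s_rule;

	s_rule = kzalloc(ICE_SW_RULE_VSI_LIST_SIZE(1), GFP_KERNEL);
	if (!s_rule)
		return NULL;

	s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR);
	s_rule->pdata.vsi_list.index = cpu_to_le16(list_id);
	s_rule->pdata.vsi_list.number_vsi = cpu_to_le16(1);
	s_rule->pdata.vsi_list.vsi[0] = cpu_to_le16(hw_vsi_id);

	return s_rule;
}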
+ * @pf: pointer to the PF info + * @rule_id: switch rule ID + */ +static struct ice_dcf_sw_rule_entry * +ice_dcf_find_sw_rule(struct ice_pf *pf, u16 rule_id) +{ + struct ice_dcf_sw_rule_entry *sw_rule; + + list_for_each_entry(sw_rule, &pf->dcf.sw_rule_head, list_entry) + if (sw_rule->rule_id == rule_id) + return sw_rule; + + return NULL; +} + +/** + * ice_dcf_parse_add_sw_rule_data - parse the add switch rule data + * @pf: pointer to the PF info + * @lkup: pointer to the add switch rule data + */ +static enum virtchnl_status_code +ice_dcf_parse_add_sw_rule_data(struct ice_pf *pf, struct ice_aqc_sw_rules_elem *lkup) +{ + struct ice_dcf_sw_rule_entry *sw_rule; + u32 act; + + sw_rule = kzalloc(sizeof(*sw_rule), GFP_KERNEL); + if (!sw_rule) + return VIRTCHNL_STATUS_ERR_NO_MEMORY; + + act = le32_to_cpu(lkup->pdata.lkup_tx_rx.act); + sw_rule->fltr_act = ICE_FWD_TO_VSI; + sw_rule->fwd_id.hw_vsi_id = (act & ICE_SINGLE_ACT_VSI_ID_M) >> + ICE_SINGLE_ACT_VSI_ID_S; + sw_rule->rule_id = le16_to_cpu(lkup->pdata.lkup_tx_rx.index); + + list_add(&sw_rule->list_entry, &pf->dcf.sw_rule_head); + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_dcf_parse_updt_sw_rule_data - parse the update switch rule data + * @pf: pointer to the PF info + * @lkup: pointer to the update switch rule data + */ +static enum virtchnl_status_code +ice_dcf_parse_updt_sw_rule_data(struct ice_pf *pf, struct ice_aqc_sw_rules_elem *lkup) +{ + struct ice_dcf_vsi_list_info *vsi_list_info; + struct ice_dcf_sw_rule_entry *sw_rule; + u16 vsi_list_id, rule_id; + u32 act; + + rule_id = le16_to_cpu(lkup->pdata.lkup_tx_rx.index); + sw_rule = ice_dcf_find_sw_rule(pf, rule_id); + if (!sw_rule) + return VIRTCHNL_STATUS_ERR_PARAM; + + act = le32_to_cpu(lkup->pdata.lkup_tx_rx.act); + if (!(act & ICE_SINGLE_ACT_VSI_LIST)) { + u16 vsi_hw_id = (act & ICE_SINGLE_ACT_VSI_ID_M) >> + ICE_SINGLE_ACT_VSI_ID_S; + + sw_rule->fltr_act = ICE_FWD_TO_VSI; + sw_rule->fwd_id.hw_vsi_id = vsi_hw_id; + + return VIRTCHNL_STATUS_SUCCESS; + } + + vsi_list_id = (act & ICE_SINGLE_ACT_VSI_LIST_ID_M) >> + ICE_SINGLE_ACT_VSI_LIST_ID_S; + if (sw_rule->vsi_list_info) { + if (sw_rule->vsi_list_info->list_id == vsi_list_id) + return VIRTCHNL_STATUS_SUCCESS; + + dev_err(ice_pf_to_dev(pf), + "The switch rule 0x%04x is running on VSI list %u\n", + rule_id, sw_rule->vsi_list_info->list_id); + return VIRTCHNL_STATUS_ERR_PARAM; + } + + vsi_list_info = ice_dcf_find_vsi_list_info(pf, vsi_list_id); + if (!vsi_list_info) { + dev_err(ice_pf_to_dev(pf), + "No VSI list %u found to bind the switch rule 0x%04x\n", + vsi_list_id, rule_id); + return VIRTCHNL_STATUS_ERR_PARAM; + } + + if (vsi_list_info->sw_rule) { + if (vsi_list_info->sw_rule->rule_id == rule_id) + return VIRTCHNL_STATUS_SUCCESS; + + dev_err(ice_pf_to_dev(pf), + "The VSI list %u is running on switch rule 0x%04x\n", + vsi_list_id, vsi_list_info->sw_rule->rule_id); + return VIRTCHNL_STATUS_ERR_PARAM; + } + + vsi_list_info->sw_rule = sw_rule; + + sw_rule->fltr_act = ICE_FWD_TO_VSI_LIST; + sw_rule->fwd_id.vsi_list_id = vsi_list_id; + sw_rule->vsi_list_info = vsi_list_info; + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_dcf_parse_rm_sw_rule_data - parse the remove switch rule data + * @pf: pointer to the PF info + * @lkup: pointer to the remove switch rule data + */ +static enum virtchnl_status_code +ice_dcf_parse_rm_sw_rule_data(struct ice_pf *pf, struct ice_aqc_sw_rules_elem *lkup) +{ + u16 rule_id = le16_to_cpu(lkup->pdata.lkup_tx_rx.index); + struct ice_dcf_sw_rule_entry *sw_rule, *tmp; + + 
list_for_each_entry_safe(sw_rule, tmp, &pf->dcf.sw_rule_head, + list_entry) + if (sw_rule->rule_id == rule_id) { + list_del(&sw_rule->list_entry); + kfree(sw_rule); + } + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_dcf_handle_add_sw_rule_rsp - handle the add switch rule response + * @pf: pointer to the PF info + * @aq_buf: pointer to the add switch rule command buffer + */ +static enum virtchnl_status_code +ice_dcf_handle_add_sw_rule_rsp(struct ice_pf *pf, u8 *aq_buf) +{ + enum virtchnl_status_code status = VIRTCHNL_STATUS_SUCCESS; + struct ice_aqc_sw_rules_elem *em = + (struct ice_aqc_sw_rules_elem *)aq_buf; + u16 type = le16_to_cpu(em->type); + + if (type == ICE_AQC_SW_RULES_T_VSI_LIST_SET) + status = ice_dcf_set_vsi_list(pf, em); + else if (type == ICE_AQC_SW_RULES_T_LKUP_RX) + status = ice_dcf_parse_add_sw_rule_data(pf, em); + + return status; +} + +/** + * ice_dcf_handle_updt_sw_rule_rsp - handle the update switch rule response + * @pf: pointer to the PF info + * @aq_buf: pointer to the update switch rule command buffer + */ +static enum virtchnl_status_code +ice_dcf_handle_updt_sw_rule_rsp(struct ice_pf *pf, u8 *aq_buf) +{ + enum virtchnl_status_code status = VIRTCHNL_STATUS_SUCCESS; + struct ice_aqc_sw_rules_elem *em = + (struct ice_aqc_sw_rules_elem *)aq_buf; + u16 type = le16_to_cpu(em->type); + + if (type == ICE_AQC_SW_RULES_T_VSI_LIST_SET) + status = ice_dcf_set_vsi_list(pf, em); + else if (type == ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR) + status = ice_dcf_clear_vsi_list(pf, em); + else if (type == ICE_AQC_SW_RULES_T_LKUP_RX) + status = ice_dcf_parse_updt_sw_rule_data(pf, em); + + return status; +} + +/** + * ice_dcf_handle_rm_sw_rule_rsp - handle the remove switch rule response + * @pf: pointer to the PF info + * @aq_buf: pointer to the remove switch rule command buffer + */ +static enum virtchnl_status_code +ice_dcf_handle_rm_sw_rule_rsp(struct ice_pf *pf, u8 *aq_buf) +{ + enum virtchnl_status_code status = VIRTCHNL_STATUS_SUCCESS; + struct ice_aqc_sw_rules_elem *em = + (struct ice_aqc_sw_rules_elem *)aq_buf; + u16 type = le16_to_cpu(em->type); + + if (type == ICE_AQC_SW_RULES_T_LKUP_RX) + status = ice_dcf_parse_rm_sw_rule_data(pf, em); + + return status; +} + +/** + * ice_dcf_handle_alloc_res_rsp - handle the allocate resource response + * @pf: pointer to the PF info + * @aq_buf: pointer to the allocate resource command buffer + */ +static enum virtchnl_status_code +ice_dcf_handle_alloc_res_rsp(struct ice_pf *pf, u8 *aq_buf) +{ + enum virtchnl_status_code status = VIRTCHNL_STATUS_SUCCESS; + struct ice_aqc_alloc_free_res_elem *res_buf = + (struct ice_aqc_alloc_free_res_elem *)aq_buf; + u16 type = (le16_to_cpu(res_buf->res_type) & + ICE_AQC_RES_TYPE_M) >> ICE_AQC_RES_TYPE_S; + + if (type == ICE_AQC_RES_TYPE_VSI_LIST_REP) + status = ice_dcf_parse_alloc_vsi_list_res(pf, + &res_buf->elem[0]); + + return status; +} + +/** + * ice_dcf_handle_free_res_rsp - handle the free resource response + * @pf: pointer to the PF info + * @aq_buf: pointer to the free resource command buffer + */ +static enum virtchnl_status_code +ice_dcf_handle_free_res_rsp(struct ice_pf *pf, u8 *aq_buf) +{ + enum virtchnl_status_code status = VIRTCHNL_STATUS_SUCCESS; + struct ice_aqc_alloc_free_res_elem *res_buf = + (struct ice_aqc_alloc_free_res_elem *)aq_buf; + u16 type = (le16_to_cpu(res_buf->res_type) & + ICE_AQC_RES_TYPE_M) >> ICE_AQC_RES_TYPE_S; + + if (type == ICE_AQC_RES_TYPE_VSI_LIST_REP) + status = ice_dcf_parse_free_vsi_list_res(pf, + &res_buf->elem[0]); + + return status; +} + +/** + * 
ice_dcf_handle_udp_tunnel_rsp - handle the update package response + * @pf: pointer to the PF info + * @aq_desc: descriptor describing the command + * @aq_buf: pointer to the package update command buffer + */ +static enum virtchnl_status_code +ice_dcf_handle_udp_tunnel_rsp(struct ice_pf *pf, struct ice_aq_desc *aq_desc, + u8 *aq_buf) +{ + struct ice_boost_tcam_section *sect; + struct ice_buf_hdr *pkg_buf; + struct ice_hw *hw = &pf->hw; + u16 port_key, inv_port_key; + u16 offset; + u16 addr; + u8 count; + u16 i, j; + + mutex_lock(&hw->tnl_lock); + pkg_buf = (struct ice_buf_hdr *)aq_buf; + offset = le16_to_cpu(pkg_buf->section_entry[0].offset); + sect = (struct ice_boost_tcam_section *)(((u8 *)pkg_buf) + offset); + count = le16_to_cpu(sect->count); + + for (i = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++) + for (j = 0; j < count; j++) { + addr = le16_to_cpu(sect->tcam[j].addr); + inv_port_key = + le16_to_cpu(sect->tcam[j].key.key.hv_dst_port_key); + port_key = + le16_to_cpu(sect->tcam[j].key.key2.hv_dst_port_key); + if (hw->tnl.tbl[i].valid && + hw->tnl.tbl[i].boost_addr == addr) { + /* It's tunnel destroy command if the key and + * inverse key is the same. + */ + if (port_key == inv_port_key) { + hw->tnl.tbl[i].in_use = false; + hw->tnl.tbl[i].port = 0; + hw->tnl.tbl[i].ref = 0; + } else { + hw->tnl.tbl[i].port = port_key; + hw->tnl.tbl[i].in_use = true; + hw->tnl.tbl[i].ref = 1; + } + } + } + + if (ice_is_tunnel_empty(&pf->hw)) + hw->dcf_caps &= ~DCF_UDP_TUNNEL_CAP; + else + hw->dcf_caps |= DCF_UDP_TUNNEL_CAP; + mutex_unlock(&hw->tnl_lock); + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_dcf_post_aq_send_cmd - get the data from firmware successful response + * @pf: pointer to the PF info + * @aq_desc: descriptor describing the command + * @aq_buf: the AdminQ command buffer + */ +enum virtchnl_status_code +ice_dcf_post_aq_send_cmd(struct ice_pf *pf, struct ice_aq_desc *aq_desc, + u8 *aq_buf) +{ + enum virtchnl_status_code status = VIRTCHNL_STATUS_SUCCESS; + u16 opc = le16_to_cpu(aq_desc->opcode); + + if (!aq_buf) + return VIRTCHNL_STATUS_SUCCESS; + + switch (opc) { + case ice_aqc_opc_add_sw_rules: + status = ice_dcf_handle_add_sw_rule_rsp(pf, aq_buf); + break; + case ice_aqc_opc_update_sw_rules: + status = ice_dcf_handle_updt_sw_rule_rsp(pf, aq_buf); + break; + case ice_aqc_opc_remove_sw_rules: + status = ice_dcf_handle_rm_sw_rule_rsp(pf, aq_buf); + break; + case ice_aqc_opc_alloc_res: + status = ice_dcf_handle_alloc_res_rsp(pf, aq_buf); + break; + case ice_aqc_opc_free_res: + status = ice_dcf_handle_free_res_rsp(pf, aq_buf); + break; + case ice_aqc_opc_update_pkg: + if (ice_dcf_is_udp_tunnel_aq_cmd(aq_desc, aq_buf)) + status = ice_dcf_handle_udp_tunnel_rsp(pf, aq_desc, + aq_buf); + break; + } + + return status; +} + +/** + * ice_dcf_update_acl_rule_info - update DCF ACL rule info + * @pf: pointer to the PF info + * @desc: descriptor describing the command + * @aq_buf: the AdminQ command buffer + */ +enum virtchnl_status_code +ice_dcf_update_acl_rule_info(struct ice_pf *pf, struct ice_aq_desc *desc, + u8 *aq_buf) +{ + struct ice_acl_scen *scen, *tmp; + struct ice_acl_tbl *tbl; + u16 scen_id; + + switch (le16_to_cpu(desc->opcode)) { + case ice_aqc_opc_alloc_acl_tbl: + if (pf->hw.acl_tbl) + return VIRTCHNL_STATUS_ERR_PARAM; + tbl = devm_kzalloc(ice_pf_to_dev(pf), sizeof(*tbl), + GFP_ATOMIC); + if (!tbl) + return VIRTCHNL_STATUS_ERR_PARAM; + tbl->id = le16_to_cpu(((struct ice_aqc_acl_generic *) + aq_buf)->alloc_id); + INIT_LIST_HEAD(&tbl->scens); + pf->hw.acl_tbl = tbl; + 
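	/* This switch mirrors the DCF's ACL AdminQ activity in PF software:
	 * a shadow ice_acl_tbl is created when the DCF allocates the
	 * hardware table, and shadow scenario entries are added or removed
	 * as the DCF allocates or deallocates them.  This bookkeeping is
	 * what lets ice_clear_dcf_acl_cfg() destroy and re-initialize the
	 * ACL table later.
	 */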
break; + case ice_aqc_opc_dealloc_acl_tbl: + list_for_each_entry_safe(scen, tmp, &pf->hw.acl_tbl->scens, + list_entry) { + list_del(&scen->list_entry); + devm_kfree(ice_pf_to_dev(pf), scen); + } + devm_kfree(ice_pf_to_dev(pf), pf->hw.acl_tbl); + pf->hw.acl_tbl = NULL; + break; + case ice_aqc_opc_alloc_acl_scen: + scen = devm_kzalloc(ice_pf_to_dev(pf), sizeof(*scen), + GFP_ATOMIC); + if (!scen) + return VIRTCHNL_STATUS_ERR_PARAM; + INIT_LIST_HEAD(&scen->list_entry); + scen_id = le16_to_cpu(desc->params.alloc_scen.ops.resp.scen_id); + scen->id = scen_id; + list_add(&scen->list_entry, &pf->hw.acl_tbl->scens); + break; + case ice_aqc_opc_dealloc_acl_scen: + list_for_each_entry_safe(scen, tmp, &pf->hw.acl_tbl->scens, + list_entry) { + if (le16_to_cpu(desc->params.dealloc_scen.scen_id) == + scen->id) { + list_del(&scen->list_entry); + devm_kfree(ice_pf_to_dev(pf), scen); + } + } + break; + } + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_dcf_pre_aq_send_cmd - check if it needs to send the command to firmware + * @vf: pointer to the VF info + * @aq_desc: descriptor describing the command + * @aq_buf: the AdminQ command buffer + * @aq_buf_size: the AdminQ command buffer size + */ +bool +ice_dcf_pre_aq_send_cmd(struct ice_vf *vf, struct ice_aq_desc *aq_desc, + u8 *aq_buf, u16 aq_buf_size) +{ + struct ice_pf *pf = vf->pf; + + switch (le16_to_cpu(aq_desc->opcode)) { + case ice_aqc_opc_update_sw_rules: + { + struct ice_dcf_vsi_list_info *vsi_list_info; + struct ice_aqc_sw_rules_elem *s_rule; + u16 list_id, vsi_id; + + if (aq_buf_size < ICE_SW_RULE_VSI_LIST_SIZE(1)) + break; + + s_rule = (struct ice_aqc_sw_rules_elem *)aq_buf; + if (le16_to_cpu(s_rule->type) != + ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR || + le16_to_cpu(s_rule->pdata.vsi_list.number_vsi) != 1) + break; + + list_id = le16_to_cpu(s_rule->pdata.vsi_list.index); + vsi_list_info = ice_dcf_find_vsi_list_info(pf, list_id); + if (!vsi_list_info) + break; + + vsi_id = le16_to_cpu(s_rule->pdata.vsi_list.vsi[0]); + if (vsi_id >= ICE_HW_VSI_ID_MAX || + test_bit(vsi_id, vsi_list_info->hw_vsi_map)) + break; + + /* The VSI is removed from list already, no need to send the + * command to firmware. + */ + return true; + } + case ice_aqc_opc_remove_sw_rules: + { + struct ice_aqc_sw_rules_elem *s_rule; + u16 rule_id; + + if (aq_buf_size < ICE_SW_RULE_RX_TX_NO_HDR_SIZE) + break; + + s_rule = (struct ice_aqc_sw_rules_elem *)aq_buf; + if (le16_to_cpu(s_rule->type) != ICE_AQC_SW_RULES_T_LKUP_RX) + break; + + rule_id = le16_to_cpu(s_rule->pdata.lkup_tx_rx.index); + if (ice_dcf_find_sw_rule(pf, rule_id)) + break; + + /* The switch rule is removed already, no need to send the + * command to firmware. + */ + return true; + } + + default: + break; + } + + return false; +} diff --git a/drivers/net/ethernet/intel/ice/ice_dcf.h b/drivers/net/ethernet/intel/ice/ice_dcf.h new file mode 100644 index 000000000000..7d95ec56f5b3 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_dcf.h @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
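The helpers above give the PF side what it needs to relay an AdminQ command on behalf of the DCF. A rough sketch of such a relay path follows, written as a hypothetical caller: demo_relay_dcf_aq_cmd() is not part of this patch (the real handling sits on the virtchnl path elsewhere in the driver), and ice_aq_send_cmd() is the driver's existing AdminQ send helper.

/* Hypothetical relay path for a DCF-originated AdminQ command. */
static enum virtchnl_status_code
demo_relay_dcf_aq_cmd(struct ice_vf *vf, struct ice_aq_desc *desc,
		      u8 *aq_buf, u16 buf_size)
{
	struct ice_pf *pf = vf->pf;

	if (ice_dcf_get_state(pf) != ICE_DCF_STATE_ON)
		return VIRTCHNL_STATUS_ERR_PARAM;

	/* only opcodes on the allow-list may be relayed */
	if (!ice_dcf_aq_cmd_permitted(desc))
		return VIRTCHNL_STATUS_ERR_NOT_SUPPORTED;

	/* some commands are already satisfied by the cached rule state */
	if (ice_dcf_pre_aq_send_cmd(vf, desc, aq_buf, buf_size))
		return VIRTCHNL_STATUS_SUCCESS;

	if (ice_aq_send_cmd(&pf->hw, desc, aq_buf, buf_size, NULL))
		return VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR;

	/* mirror switch rule / VSI list state from the response */
	return ice_dcf_post_aq_send_cmd(pf, desc, aq_buf);
}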
*/ + +#ifndef _ICE_DCF_H_ +#define _ICE_DCF_H_ + +struct ice_vf; +struct ice_pf; + +#define ICE_DCF_VFID0 0 +#define ICE_DCF_VFID1 1 + +/* DCF mode states */ +enum ice_dcf_state { + /* DCF mode is fully off */ + ICE_DCF_STATE_OFF = 0, + /* Process is live, acquired capability to send DCF CMD */ + ICE_DCF_STATE_ON, + /* Kernel is busy, deny DCF CMD */ + ICE_DCF_STATE_BUSY, + /* Kernel is ready for Process to Re-establish, deny DCF CMD */ + ICE_DCF_STATE_PAUSE, +}; + +struct ice_dcf_sw_rule_entry; + +#define ICE_HW_VSI_ID_MAX BIT(10) /* The AQ VSI number uses 10 bits */ + +struct ice_dcf_vsi_list_info { + struct list_head list_entry; + struct ice_dcf_sw_rule_entry *sw_rule; + u16 list_id; + + u16 vsi_count; + DECLARE_BITMAP(hw_vsi_map, ICE_HW_VSI_ID_MAX); +}; + +struct ice_dcf_sw_rule_entry { + struct list_head list_entry; + u16 rule_id; + + /* Only support ICE_FWD_TO_VSI and ICE_FWD_TO_VSI_LIST */ + enum ice_sw_fwd_act_type fltr_act; + /* Depending on filter action */ + union { + u16 hw_vsi_id:10; + u16 vsi_list_id:10; + } fwd_id; + + struct ice_dcf_vsi_list_info *vsi_list_info; +}; + +struct ice_dcf { + struct ice_vf *vf; + enum ice_dcf_state state; + + /* Trace the switch rules added/removed by DCF */ + struct list_head sw_rule_head; + struct list_head vsi_list_info_head; + + /* Handle the AdminQ command between the DCF (Device Config Function) + * and the firmware. + */ +#define ICE_DCF_AQ_DESC_TIMEOUT (HZ / 10) + struct ice_aq_desc aq_desc; + u8 aq_desc_received; + unsigned long aq_desc_expires; + + /* Save the current Device Serial Number when searching the package + * path for later query. + */ +#define ICE_DSN_NUM_LEN 8 + u8 dsn[ICE_DSN_NUM_LEN]; +}; + +#ifdef CONFIG_PCI_IOV +bool ice_dcf_aq_cmd_permitted(struct ice_aq_desc *desc); +bool ice_check_dcf_allowed(struct ice_vf *vf); +bool ice_is_dcf_enabled(struct ice_pf *pf); +bool ice_is_vf_dcf(struct ice_vf *vf); +enum ice_dcf_state ice_dcf_get_state(struct ice_pf *pf); +void ice_dcf_set_state(struct ice_pf *pf, enum ice_dcf_state state); +void ice_dcf_init_sw_rule_mgmt(struct ice_pf *pf); +void ice_rm_all_dcf_sw_rules(struct ice_pf *pf); +void ice_rm_dcf_sw_vsi_rule(struct ice_pf *pf, u16 hw_vsi_id); +bool +ice_dcf_pre_aq_send_cmd(struct ice_vf *vf, struct ice_aq_desc *aq_desc, + u8 *aq_buf, u16 aq_buf_size); +enum virtchnl_status_code +ice_dcf_post_aq_send_cmd(struct ice_pf *pf, struct ice_aq_desc *aq_desc, + u8 *aq_buf); +bool ice_dcf_is_acl_aq_cmd(struct ice_aq_desc *desc); +bool ice_dcf_is_udp_tunnel_aq_cmd(struct ice_aq_desc *desc, u8 *aq_buf); +void ice_clear_dcf_acl_cfg(struct ice_pf *pf); +bool ice_dcf_is_acl_capable(struct ice_hw *hw); +void ice_clear_dcf_udp_tunnel_cfg(struct ice_pf *pf); +bool ice_dcf_is_udp_tunnel_capable(struct ice_hw *hw); +enum virtchnl_status_code +ice_dcf_update_acl_rule_info(struct ice_pf *pf, struct ice_aq_desc *desc, + u8 *aq_buf); +#else +static inline bool ice_is_dcf_enabled(struct ice_pf __always_unused *pf) +{ + return false; +} + +static inline bool +ice_dcf_is_udp_tunnel_capable(struct ice_hw __always_unused *hw) +{ + return false; +} + +static inline bool ice_dcf_is_acl_capable(struct ice_hw __always_unused *hw) +{ + return false; +} +#endif /* CONFIG_PCI_IOV */ +#endif /* _ICE_DCF_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_debugfs.c b/drivers/net/ethernet/intel/ice/ice_debugfs.c new file mode 100644 index 000000000000..09b403d5ff48 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_debugfs.c @@ -0,0 +1,672 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, 
Intel Corporation. */ + +#include +#include +#include +#include "ice.h" +#include "ice_lib.h" +#include "ice_fltr.h" + + +static struct dentry *ice_debugfs_root; + + +static void ice_dump_pf(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + + dev_info(dev, "pf struct:\n"); + dev_info(dev, "\tmax_pf_txqs = %d\n", pf->max_pf_txqs); + dev_info(dev, "\tmax_pf_rxqs = %d\n", pf->max_pf_rxqs); + dev_info(dev, "\tnum_alloc_vsi = %d\n", pf->num_alloc_vsi); + dev_info(dev, "\tnum_lan_tx = %d\n", pf->num_lan_tx); + dev_info(dev, "\tnum_lan_rx = %d\n", pf->num_lan_rx); + dev_info(dev, "\tnum_avail_tx = %d\n", ice_get_avail_txq_count(pf)); + dev_info(dev, "\tnum_avail_rx = %d\n", ice_get_avail_rxq_count(pf)); + dev_info(dev, "\tnum_lan_msix = %d\n", pf->num_lan_msix); + dev_info(dev, "\tnum_rdma_msix = %d\n", pf->num_rdma_msix); + dev_info(dev, "\trdma_base_vector = %d\n", pf->rdma_base_vector); +#ifdef HAVE_NETDEV_SB_DEV + dev_info(dev, "\tnum_macvlan = %d\n", pf->num_macvlan); + dev_info(dev, "\tmax_num_macvlan = %d\n", pf->max_num_macvlan); +#endif /* HAVE_NETDEV_SB_DEV */ + dev_info(dev, "\tirq_tracker->num_entries = %d\n", + pf->irq_tracker->num_entries); + dev_info(dev, "\tirq_tracker->end = %d\n", pf->irq_tracker->end); + dev_info(dev, "\tirq_tracker valid count = %d\n", + ice_get_valid_res_count(pf->irq_tracker)); + dev_info(dev, "\tnum_avail_sw_msix = %d\n", pf->num_avail_sw_msix); + dev_info(dev, "\tsriov_base_vector = %d\n", pf->sriov_base_vector); + dev_info(dev, "\tnum_alloc_vfs = %d\n", pf->num_alloc_vfs); + dev_info(dev, "\tnum_qps_per_vf = %d\n", pf->num_qps_per_vf); + dev_info(dev, "\tnum_msix_per_vf = %d\n", pf->num_msix_per_vf); +} + +static void ice_dump_pf_vsi_list(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + u16 i; + + ice_for_each_vsi(pf, i) { + struct ice_vsi *vsi = pf->vsi[i]; + + if (!vsi) + continue; + + dev_info(dev, "vsi[%d]:\n", i); + dev_info(dev, "\tvsi = %pK\n", vsi); + dev_info(dev, "\tvsi_num = %d\n", vsi->vsi_num); + dev_info(dev, "\ttype = %s\n", ice_vsi_type_str(vsi->type)); + if (vsi->type == ICE_VSI_VF) + dev_info(dev, "\tvf_id = %d\n", vsi->vf_id); + dev_info(dev, "\tback = %pK\n", vsi->back); + dev_info(dev, "\tnetdev = %pK\n", vsi->netdev); + dev_info(dev, "\tmax_frame = %d\n", vsi->max_frame); + dev_info(dev, "\trx_buf_len = %d\n", vsi->rx_buf_len); + dev_info(dev, "\tnum_txq = %d\n", vsi->num_txq); + dev_info(dev, "\tnum_rxq = %d\n", vsi->num_rxq); + dev_info(dev, "\treq_txq = %d\n", vsi->req_txq); + dev_info(dev, "\treq_rxq = %d\n", vsi->req_rxq); + dev_info(dev, "\talloc_txq = %d\n", vsi->alloc_txq); + dev_info(dev, "\talloc_rxq = %d\n", vsi->alloc_rxq); + dev_info(dev, "\tnum_rx_desc = %d\n", vsi->num_rx_desc); + dev_info(dev, "\tnum_tx_desc = %d\n", vsi->num_tx_desc); + dev_info(dev, "\tnum_vlan = %d\n", vsi->num_vlan); + } +} + +/** + * ice_dump_pf_fdir - output Flow Director stats to dmesg log + * @pf: pointer to PF to get Flow Director HW stats for. 
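 *
 * The "guaranteed" numbers below are the per-function reserved filter
 * space (the function sums each VSI's num_gfltr for the PF total), while
 * the "best_effort" numbers come from the device-wide shared pool; all of
 * them are read from the PFQF_FD_CNT, GLQF_FD_CNT and GLQF_FD_SIZE
 * registers.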
+ */ +static void ice_dump_pf_fdir(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + u16 pf_guar_pool = 0; + u32 dev_fltr_size; + u32 dev_fltr_cnt; + u32 pf_fltr_cnt; + u16 i; + + pf_fltr_cnt = rd32(hw, PFQF_FD_CNT); + dev_fltr_cnt = rd32(hw, GLQF_FD_CNT); + dev_fltr_size = rd32(hw, GLQF_FD_SIZE); + + ice_for_each_vsi(pf, i) { + struct ice_vsi *vsi = pf->vsi[i]; + + if (!vsi) + continue; + + pf_guar_pool += vsi->num_gfltr; + } + + dev_info(dev, "Flow Director filter usage:\n"); + dev_info(dev, "\tPF guaranteed used = %d\n", + (pf_fltr_cnt & PFQF_FD_CNT_FD_GCNT_M) >> + PFQF_FD_CNT_FD_GCNT_S); + dev_info(dev, "\tPF best_effort used = %d\n", + (pf_fltr_cnt & PFQF_FD_CNT_FD_BCNT_M) >> + PFQF_FD_CNT_FD_BCNT_S); + dev_info(dev, "\tdevice guaranteed used = %d\n", + (dev_fltr_cnt & GLQF_FD_CNT_FD_GCNT_M) >> + GLQF_FD_CNT_FD_GCNT_S); + dev_info(dev, "\tdevice best_effort used = %d\n", + (dev_fltr_cnt & GLQF_FD_CNT_FD_BCNT_M) >> + GLQF_FD_CNT_FD_BCNT_S); + dev_info(dev, "\tPF guaranteed pool = %d\n", pf_guar_pool); + dev_info(dev, "\tdevice guaranteed pool = %d\n", + (dev_fltr_size & GLQF_FD_SIZE_FD_GSIZE_M) >> + GLQF_FD_SIZE_FD_GSIZE_S); + dev_info(dev, "\tdevice best_effort pool = %d\n", + hw->func_caps.fd_fltr_best_effort); +} + +/** + * ice_vsi_dump_ctxt - print the passed in VSI context structure + * @dev: Device used for dev_info prints + * @ctxt: VSI context structure to print + */ +static void ice_vsi_dump_ctxt(struct device *dev, struct ice_vsi_ctx *ctxt) +{ + struct ice_aqc_vsi_props *info; + + if (!ctxt) + return; + + info = &ctxt->info; + dev_info(dev, "Get VSI Parameters:\n"); + dev_info(dev, "\tVSI Number: %d Valid sections: 0x%04x\n", + ctxt->vsi_num, le16_to_cpu(info->valid_sections)); + + dev_info(dev, "========================\n"); + dev_info(dev, "| Category - Switching |"); + dev_info(dev, "========================\n"); + dev_info(dev, "\tSwitch ID: %u\n", info->sw_id); + dev_info(dev, "\tAllow Loopback: %s\n", (info->sw_flags & + ICE_AQ_VSI_SW_FLAG_ALLOW_LB) ? "enabled" : "disabled"); + dev_info(dev, "\tAllow Local Loopback: %s\n", (info->sw_flags & + ICE_AQ_VSI_SW_FLAG_LOCAL_LB) ? "enabled" : "disabled"); + dev_info(dev, "\tApply source VSI pruning: %s\n", (info->sw_flags & + ICE_AQ_VSI_SW_FLAG_SRC_PRUNE) ? "enabled" : "disabled"); + dev_info(dev, "\tEgress (Rx VLAN) pruning: %s\n", + (info->sw_flags2 & ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_M) ? + "enabled" : "disabled"); + dev_info(dev, "\tLAN enable: %s\n", (info->sw_flags2 & + ICE_AQ_VSI_SW_FLAG_LAN_ENA) ? "enabled" : "disabled"); + dev_info(dev, "\tVEB statistic block ID: %u\n", info->veb_stat_id & + ICE_AQ_VSI_SW_VEB_STAT_ID_M); + dev_info(dev, "\tVEB statistic block ID valid: %d\n", + (info->veb_stat_id & ICE_AQ_VSI_SW_VEB_STAT_ID_VALID) ? 1 : 0); + + dev_info(dev, "=======================\n"); + dev_info(dev, "| Category - Security |\n"); + dev_info(dev, "=======================\n"); + dev_info(dev, "\tAllow destination override: %s\n", (info->sec_flags & + ICE_AQ_VSI_SEC_FLAG_ALLOW_DEST_OVRD) ? "enabled" : "disabled"); + dev_info(dev, "\tEnable MAC anti-spoof: %s\n", (info->sec_flags & + ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF) ? "enabled" : "disabled"); + dev_info(dev, "\tIngress (Tx VLAN) pruning enables: %s\n", + (info->sec_flags & ICE_AQ_VSI_SEC_TX_PRUNE_ENA_M) ? 
+ "enabled" : "disabled"); + + dev_info(dev, "=================================\n"); + dev_info(dev, "| Category: Inner VLAN Handling |\n"); + dev_info(dev, "=================================\n"); + dev_info(dev, "\tPort Based Inner VLAN Insertion: PVLAN ID: %d PRIO: %d\n", + le16_to_cpu(info->port_based_inner_vlan) & VLAN_VID_MASK, + (le16_to_cpu(info->port_based_inner_vlan) & VLAN_PRIO_MASK) >> + VLAN_PRIO_SHIFT); + dev_info(dev, "\tInner VLAN TX Mode: 0x%02x\n", + (info->inner_vlan_flags & ICE_AQ_VSI_INNER_VLAN_TX_MODE_M) >> + ICE_AQ_VSI_INNER_VLAN_TX_MODE_S); + dev_info(dev, "\tInsert PVID: %s\n", (info->inner_vlan_flags & + ICE_AQ_VSI_INNER_VLAN_INSERT_PVID) ? "enabled" : "disabled"); + dev_info(dev, "\tInner VLAN and UP expose mode (RX): 0x%02x\n", + (info->inner_vlan_flags & ICE_AQ_VSI_INNER_VLAN_EMODE_M) >> + ICE_AQ_VSI_INNER_VLAN_EMODE_S); + dev_info(dev, "\tBlock Inner VLAN from TX Descriptor: %s\n", + (info->inner_vlan_flags & ICE_AQ_VSI_INNER_VLAN_BLOCK_TX_DESC) ? + "enabled" : "disabled"); + + dev_info(dev, "=================================\n"); + dev_info(dev, "| Category: Outer VLAN Handling |\n"); + dev_info(dev, "=================================\n"); + dev_info(dev, "\tPort Based Outer VLAN Insertion: PVID: %d PRIO: %d\n", + le16_to_cpu(info->port_based_outer_vlan) & VLAN_VID_MASK, + (le16_to_cpu(info->port_based_outer_vlan) & VLAN_PRIO_MASK) >> + VLAN_PRIO_SHIFT); + dev_info(dev, "\tOuter VLAN and UP expose mode (RX): 0x%02x\n", + (info->outer_vlan_flags & ICE_AQ_VSI_OUTER_VLAN_EMODE_M) >> + ICE_AQ_VSI_OUTER_VLAN_EMODE_S); + dev_info(dev, "\tOuter Tag type (Tx and Rx): 0x%02x\n", + (info->outer_vlan_flags & ICE_AQ_VSI_OUTER_TAG_TYPE_M) >> + ICE_AQ_VSI_OUTER_TAG_TYPE_S); + dev_info(dev, "\tPort Based Outer VLAN Insert Enable: %s\n", + (info->outer_vlan_flags & + ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT) ? + "enabled" : "disabled"); + dev_info(dev, "\tOuter VLAN TX Mode: 0x%02x\n", + (info->outer_vlan_flags & ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M) >> + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S); + dev_info(dev, "\tBlock Outer VLAN from TX Descriptor: %s\n", + (info->outer_vlan_flags & ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC) ? 
+ "enabled" : "disabled"); +} + +static const char *module_id_to_name(u16 module_id) +{ + switch (module_id) { + case ICE_AQC_FW_LOG_ID_GENERAL: + return "General"; + case ICE_AQC_FW_LOG_ID_CTRL: + return "Control (Resets + Autoload)"; + case ICE_AQC_FW_LOG_ID_LINK: + return "Link Management"; + case ICE_AQC_FW_LOG_ID_LINK_TOPO: + return "Link Topology Detection"; + case ICE_AQC_FW_LOG_ID_DNL: + return "DNL"; + case ICE_AQC_FW_LOG_ID_I2C: + return "I2C"; + case ICE_AQC_FW_LOG_ID_SDP: + return "SDP"; + case ICE_AQC_FW_LOG_ID_MDIO: + return "MDIO"; + case ICE_AQC_FW_LOG_ID_ADMINQ: + return "Admin Queue"; + case ICE_AQC_FW_LOG_ID_HDMA: + return "HDMA"; + case ICE_AQC_FW_LOG_ID_LLDP: + return "LLDP"; + case ICE_AQC_FW_LOG_ID_DCBX: + return "DCBX"; + case ICE_AQC_FW_LOG_ID_DCB: + return "DCB"; + case ICE_AQC_FW_LOG_ID_XLR: + return "XLR"; + case ICE_AQC_FW_LOG_ID_NVM: + return "NVM"; + case ICE_AQC_FW_LOG_ID_AUTH: + return "Authentication"; + case ICE_AQC_FW_LOG_ID_VPD: + return "VPD"; + case ICE_AQC_FW_LOG_ID_IOSF: + return "IOSF"; + case ICE_AQC_FW_LOG_ID_PARSER: + return "Parser"; + case ICE_AQC_FW_LOG_ID_SW: + return "Switch"; + case ICE_AQC_FW_LOG_ID_SCHEDULER: + return "Scheduler"; + case ICE_AQC_FW_LOG_ID_TXQ: + return "Tx Queue Management"; + case ICE_AQC_FW_LOG_ID_ACL: + return "ACL"; + case ICE_AQC_FW_LOG_ID_POST: + return "Post"; + case ICE_AQC_FW_LOG_ID_WATCHDOG: + return "Watchdog"; + case ICE_AQC_FW_LOG_ID_TASK_DISPATCH: + return "Task Dispatcher"; + case ICE_AQC_FW_LOG_ID_MNG: + return "Manageability"; + case ICE_AQC_FW_LOG_ID_SYNCE: + return "Synce"; + case ICE_AQC_FW_LOG_ID_HEALTH: + return "Health"; + case ICE_AQC_FW_LOG_ID_TSDRV: + return "Time Sync"; + case ICE_AQC_FW_LOG_ID_PFREG: + return "PF Registration"; + case ICE_AQC_FW_LOG_ID_MDLVER: + return "Module Version"; + default: + return "Unsupported"; + } +} + +static const char *log_level_to_name(u8 log_level) +{ + switch (log_level) { + case ICE_FWLOG_LEVEL_NONE: + return "None"; + case ICE_FWLOG_LEVEL_ERROR: + return "Error"; + case ICE_FWLOG_LEVEL_WARNING: + return "Warning"; + case ICE_FWLOG_LEVEL_NORMAL: + return "Normal"; + case ICE_FWLOG_LEVEL_VERBOSE: + return "Verbose"; + default: + return "Unsupported"; + } +} + +/** + * ice_fwlog_dump_cfg - Dump current FW logging configuration + * @hw: pointer to the HW structure + */ +static void ice_fwlog_dump_cfg(struct ice_hw *hw) +{ + struct device *dev = ice_pf_to_dev((struct ice_pf *)(hw->back)); + struct ice_fwlog_cfg *cfg; + enum ice_status status; + u16 i; + + cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); + if (!cfg) + return; + + status = ice_fwlog_get(hw, cfg); + if (status) { + kfree(cfg); + return; + } + + dev_info(dev, "FWLOG Configuration:\n"); + dev_info(dev, "Options: 0x%04x\n", cfg->options); + dev_info(dev, "\tarq_ena: %s\n", + (cfg->options & + ICE_FWLOG_OPTION_ARQ_ENA) ? "true" : "false"); + dev_info(dev, "\tuart_ena: %s\n", + (cfg->options & + ICE_FWLOG_OPTION_UART_ENA) ? "true" : "false"); + dev_info(dev, "\tPF registered: %s\n", + (cfg->options & + ICE_FWLOG_OPTION_IS_REGISTERED) ? 
"true" : "false"); + + dev_info(dev, "Module Entries:\n"); + for (i = 0; i < ICE_AQC_FW_LOG_ID_MAX; i++) { + struct ice_fwlog_module_entry *entry = + &cfg->module_entries[i]; + + dev_info(dev, "\tModule ID %d (%s) Log Level %d (%s)\n", + entry->module_id, module_id_to_name(entry->module_id), + entry->log_level, log_level_to_name(entry->log_level)); + } + + kfree(cfg); +} + +/** + * ice_debugfs_command_write - write into command datum + * @filp: the opened file + * @buf: where to find the user's data + * @count: the length of the user's data + * @ppos: file position offset + */ +static ssize_t +ice_debugfs_command_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct ice_pf *pf = filp->private_data; + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + char *cmd_buf, *cmd_buf_tmp; + ssize_t ret; + char **argv; + int argc; + + /* don't allow partial writes and writes when reset is in progress*/ + if (*ppos != 0 || ice_is_reset_in_progress(pf->state)) + return 0; + + cmd_buf = memdup_user(buf, count + 1); + if (IS_ERR(cmd_buf)) + return PTR_ERR(cmd_buf); + cmd_buf[count] = '\0'; + + cmd_buf_tmp = strchr(cmd_buf, '\n'); + if (cmd_buf_tmp) { + *cmd_buf_tmp = '\0'; + count = (size_t)cmd_buf_tmp - (size_t)cmd_buf + 1; + } + + argv = argv_split(GFP_KERNEL, cmd_buf, &argc); + if (!argv) { + ret = -ENOMEM; + goto err_copy_from_user; + } + + if (argc > 1 && !strncmp(argv[1], "vsi", 3)) { + if (argc == 3 && !strncmp(argv[0], "get", 3)) { + struct ice_vsi_ctx *vsi_ctx; + + vsi_ctx = devm_kzalloc(dev, sizeof(*vsi_ctx), + GFP_KERNEL); + if (!vsi_ctx) { + ret = -ENOMEM; + goto command_write_done; + } + ret = kstrtou16(argv[2], 0, &vsi_ctx->vsi_num); + if (ret) { + devm_kfree(dev, vsi_ctx); + goto command_help; + } + ret = ice_aq_get_vsi_params(hw, vsi_ctx, NULL); + if (ret) { + ret = -EINVAL; + devm_kfree(dev, vsi_ctx); + goto command_help; + } + + ice_vsi_dump_ctxt(dev, vsi_ctx); + devm_kfree(dev, vsi_ctx); + } else { + goto command_help; + } + } else if (argc == 2 && !strncmp(argv[0], "dump", 4) && + !strncmp(argv[1], "switch", 6)) { + ret = ice_dump_sw_cfg(hw); + if (ret) { + ret = -EINVAL; + dev_err(dev, "dump switch failed\n"); + goto command_write_done; + } + } else if (argc == 2 && !strncmp(argv[0], "dump", 4) && + !strncmp(argv[1], "capabilities", 11)) { + ice_dump_caps(hw); + } else if (argc == 2 && !strncmp(argv[0], "dump", 4) && + !strncmp(argv[1], "fwlog_cfg", 9)) { + ice_fwlog_dump_cfg(&pf->hw); + } else if (argc == 4 && !strncmp(argv[0], "dump", 4) && + !strncmp(argv[1], "ptp", 3) && + !strncmp(argv[2], "func", 4) && + !strncmp(argv[3], "capabilities", 11)) { + ice_dump_ptp_func_caps(hw); + } else if (argc == 4 && !strncmp(argv[0], "dump", 4) && + !strncmp(argv[1], "ptp", 3) && + !strncmp(argv[2], "dev", 3) && + !strncmp(argv[3], "capabilities", 11)) { + ice_dump_ptp_dev_caps(hw); + } else if (argc == 2 && !strncmp(argv[0], "dump", 4) && + !strncmp(argv[1], "ports", 5)) { + dev_info(dev, "port_info:\n"); + ice_dump_port_info(hw->port_info); +#ifdef ICE_ADD_PROBES + } else if (argc == 2 && !strncmp(argv[0], "dump", 4) && + !strncmp(argv[1], "arfs_stats", 10)) { + struct ice_vsi *vsi = ice_get_main_vsi(pf); + + if (!vsi) { + dev_err(dev, "Failed to find PF VSI\n"); + } else if (vsi->netdev->features & NETIF_F_NTUPLE) { + struct ice_arfs_active_fltr_cntrs *fltr_cntrs; + + fltr_cntrs = vsi->arfs_fltr_cntrs; + + /* active counters can be updated by multiple CPUs */ + smp_mb__before_atomic(); + dev_info(dev, "arfs_active_tcpv4_filters: 
%d\n", + atomic_read(&fltr_cntrs->active_tcpv4_cnt)); + dev_info(dev, "arfs_active_tcpv6_filters: %d\n", + atomic_read(&fltr_cntrs->active_tcpv6_cnt)); + dev_info(dev, "arfs_active_udpv4_filters: %d\n", + atomic_read(&fltr_cntrs->active_udpv4_cnt)); + dev_info(dev, "arfs_active_udpv6_filters: %d\n", + atomic_read(&fltr_cntrs->active_udpv6_cnt)); + } +#endif /* ICE_ADD_PROBES */ + } else if (argc == 2 && !strncmp(argv[0], "dump", 4)) { + if (!strncmp(argv[1], "mmcast", 6)) { + ice_dump_sw_rules(hw, ICE_SW_LKUP_MAC); + } else if (!strncmp(argv[1], "vlan", 4)) { + ice_dump_sw_rules(hw, ICE_SW_LKUP_VLAN); + } else if (!strncmp(argv[1], "eth", 3)) { + ice_dump_sw_rules(hw, ICE_SW_LKUP_ETHERTYPE); + } else if (!strncmp(argv[1], "pf_vsi", 6)) { + ice_dump_pf_vsi_list(pf); + } else if (!strncmp(argv[1], "pf_port_num", 11)) { + dev_info(dev, "pf_id = %d, port_num = %d\n", + hw->pf_id, hw->port_info->lport); + } else if (!strncmp(argv[1], "pf", 2)) { + ice_dump_pf(pf); + } else if (!strncmp(argv[1], "vfs", 3)) { + ice_dump_all_vfs(pf); + } else if (!strncmp(argv[1], "fdir_stats", 10)) { + ice_dump_pf_fdir(pf); + } else if (!strncmp(argv[1], "reset_stats", 11)) { + dev_info(dev, "core reset count: %d\n", + pf->corer_count); + dev_info(dev, "global reset count: %d\n", + pf->globr_count); + dev_info(dev, "emp reset count: %d\n", pf->empr_count); + dev_info(dev, "pf reset count: %d\n", pf->pfr_count); + } + +#ifdef CONFIG_DCB + } else if (argc == 3 && !strncmp(argv[0], "lldp", 4) && + !strncmp(argv[1], "get", 3)) { + u8 mibtype; + u16 llen, rlen; + u8 *buff; + + if (!strncmp(argv[2], "local", 5)) + mibtype = ICE_AQ_LLDP_MIB_LOCAL; + else if (!strncmp(argv[2], "remote", 6)) + mibtype = ICE_AQ_LLDP_MIB_REMOTE; + else + goto command_help; + + buff = devm_kzalloc(dev, ICE_LLDPDU_SIZE, GFP_KERNEL); + if (!buff) { + ret = -ENOMEM; + goto command_write_done; + } + + ret = ice_aq_get_lldp_mib(hw, + ICE_AQ_LLDP_BRID_TYPE_NEAREST_BRID, + mibtype, (void *)buff, + ICE_LLDPDU_SIZE, + &llen, &rlen, NULL); + + if (!ret) { + if (mibtype == ICE_AQ_LLDP_MIB_LOCAL) { + dev_info(dev, "LLDP MIB (local)\n"); + print_hex_dump(KERN_INFO, "LLDP MIB (local): ", + DUMP_PREFIX_OFFSET, 16, 1, + buff, llen, true); + } else if (mibtype == ICE_AQ_LLDP_MIB_REMOTE) { + dev_info(dev, "LLDP MIB (remote)\n"); + print_hex_dump(KERN_INFO, "LLDP MIB (remote): ", + DUMP_PREFIX_OFFSET, 16, 1, + buff, rlen, true); + } + } else { + dev_err(dev, "GET LLDP MIB failed. 
Status: %ld\n", ret); + } + devm_kfree(dev, buff); +#endif /* CONFIG_DCB */ + } else if ((argc > 1) && !strncmp(argv[1], "scheduling", 10)) { + if (argc == 4 && !strncmp(argv[0], "get", 3) && + !strncmp(argv[2], "tree", 4) && + !strncmp(argv[3], "topology", 8)) { + ice_dump_port_topo(hw->port_info); + } + } else if (argc == 4 && !strncmp(argv[0], "set_ts_pll", 10)) { + u8 time_ref_freq; + u8 time_ref_sel; + u8 src_tmr_mode; + + ret = kstrtou8(argv[1], 0, &time_ref_freq); + if (ret) + goto command_help; + ret = kstrtou8(argv[2], 0, &time_ref_sel); + if (ret) + goto command_help; + ret = kstrtou8(argv[3], 0, &src_tmr_mode); + if (ret) + goto command_help; + + ice_cgu_cfg_ts_pll(pf, false, (enum ice_time_ref_freq)time_ref_freq, + (enum ice_cgu_time_ref_sel)time_ref_sel, + (enum ice_src_tmr_mode)src_tmr_mode); + ice_cgu_cfg_ts_pll(pf, true, (enum ice_time_ref_freq)time_ref_freq, + (enum ice_cgu_time_ref_sel)time_ref_sel, + (enum ice_src_tmr_mode)src_tmr_mode); + } else { +command_help: + dev_info(dev, "unknown or invalid command '%s'\n", cmd_buf); + dev_info(dev, "available commands\n"); + dev_info(dev, "\t get vsi \n"); + dev_info(dev, "\t dump switch\n"); + dev_info(dev, "\t dump ports\n"); + dev_info(dev, "\t dump capabilities\n"); + dev_info(dev, "\t dump fwlog_cfg\n"); + dev_info(dev, "\t dump ptp func capabilities\n"); + dev_info(dev, "\t dump ptp dev capabilities\n"); + dev_info(dev, "\t dump mmcast\n"); + dev_info(dev, "\t dump vlan\n"); + dev_info(dev, "\t dump eth\n"); + dev_info(dev, "\t dump pf_vsi\n"); + dev_info(dev, "\t dump pf\n"); + dev_info(dev, "\t dump pf_port_num\n"); + dev_info(dev, "\t dump vfs\n"); + dev_info(dev, "\t dump reset_stats\n"); + dev_info(dev, "\t dump fdir_stats\n"); + dev_info(dev, "\t get scheduling tree topology\n"); + dev_info(dev, "\t get scheduling tree topology portnum \n"); +#ifdef CONFIG_DCB + dev_info(dev, "\t lldp get local\n"); + dev_info(dev, "\t lldp get remote\n"); +#endif /* CONFIG_DCB */ +#ifdef ICE_ADD_PROBES + dev_info(dev, "\t dump arfs_stats\n"); +#endif /* ICE_ADD_PROBES */ + ret = -EINVAL; + goto command_write_done; + } + + /* if we get here, nothing went wrong; return bytes copied */ + ret = (ssize_t)count; + +command_write_done: + argv_free(argv); +err_copy_from_user: + kfree(cmd_buf); + return ret; +} + +static const struct file_operations ice_debugfs_command_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = ice_debugfs_command_write, +}; + +/** + * ice_debugfs_pf_init - setup the debugfs directory + * @pf: the ice that is starting up + */ +void ice_debugfs_pf_init(struct ice_pf *pf) +{ + const char *name = pci_name(pf->pdev); + struct dentry *pfile; + + pf->ice_debugfs_pf = debugfs_create_dir(name, ice_debugfs_root); + if (IS_ERR(pf->ice_debugfs_pf)) + return; + + pfile = debugfs_create_file("command", 0600, pf->ice_debugfs_pf, pf, + &ice_debugfs_command_fops); + if (!pfile) + goto create_failed; + + return; + +create_failed: + dev_err(ice_pf_to_dev(pf), "debugfs dir/file for %s failed\n", name); + debugfs_remove_recursive(pf->ice_debugfs_pf); +} + +/** + * ice_debugfs_pf_exit - clear out the ices debugfs entries + * @pf: the ice that is stopping + */ +void ice_debugfs_pf_exit(struct ice_pf *pf) +{ + debugfs_remove_recursive(pf->ice_debugfs_pf); + pf->ice_debugfs_pf = NULL; +} + +/** + * ice_debugfs_init - create root directory for debugfs entries + */ +void ice_debugfs_init(void) +{ + ice_debugfs_root = debugfs_create_dir(KBUILD_MODNAME, NULL); + if (IS_ERR(ice_debugfs_root)) + pr_info("init of debugfs 
failed\n"); +} + +/** + * ice_debugfs_exit - remove debugfs entries + */ +void ice_debugfs_exit(void) +{ + debugfs_remove_recursive(ice_debugfs_root); + ice_debugfs_root = NULL; +} diff --git a/drivers/net/ethernet/intel/ice/ice_devids.h b/drivers/net/ethernet/intel/ice/ice_devids.h index f8d5c661d0ba..a9c1d294def5 100644 --- a/drivers/net/ethernet/intel/ice/ice_devids.h +++ b/drivers/net/ethernet/intel/ice/ice_devids.h @@ -1,15 +1,60 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_DEVIDS_H_ #define _ICE_DEVIDS_H_ + /* Device IDs */ +/* Intel(R) Ethernet Connection E823-L for backplane */ +#define ICE_DEV_ID_E823L_BACKPLANE 0x124C +/* Intel(R) Ethernet Connection E823-L for SFP */ +#define ICE_DEV_ID_E823L_SFP 0x124D +/* Intel(R) Ethernet Connection E823-L/X557-AT 10GBASE-T */ +#define ICE_DEV_ID_E823L_10G_BASE_T 0x124E +/* Intel(R) Ethernet Connection E823-L 1GbE */ +#define ICE_DEV_ID_E823L_1GBE 0x124F +/* Intel(R) Ethernet Connection E823-L for QSFP */ +#define ICE_DEV_ID_E823L_QSFP 0x151D /* Intel(R) Ethernet Controller E810-C for backplane */ #define ICE_DEV_ID_E810C_BACKPLANE 0x1591 /* Intel(R) Ethernet Controller E810-C for QSFP */ #define ICE_DEV_ID_E810C_QSFP 0x1592 /* Intel(R) Ethernet Controller E810-C for SFP */ #define ICE_DEV_ID_E810C_SFP 0x1593 +/* Intel(R) Ethernet Controller E810-XXV for backplane */ +#define ICE_DEV_ID_E810_XXV_BACKPLANE 0x1599 +/* Intel(R) Ethernet Controller E810-XXV for QSFP */ +#define ICE_DEV_ID_E810_XXV_QSFP 0x159A +/* Intel(R) Ethernet Controller E810-XXV for SFP */ +#define ICE_DEV_ID_E810_XXV_SFP 0x159B +/* Intel(R) Ethernet Connection E823-C for backplane */ +#define ICE_DEV_ID_E823C_BACKPLANE 0x188A +/* Intel(R) Ethernet Connection E823-C for QSFP */ +#define ICE_DEV_ID_E823C_QSFP 0x188B +/* Intel(R) Ethernet Connection E823-C for SFP */ +#define ICE_DEV_ID_E823C_SFP 0x188C +/* Intel(R) Ethernet Connection E823-C/X557-AT 10GBASE-T */ +#define ICE_DEV_ID_E823C_10G_BASE_T 0x188D +/* Intel(R) Ethernet Connection E823-C 1GbE */ +#define ICE_DEV_ID_E823C_SGMII 0x188E +/* Intel(R) Ethernet Connection E822-C for backplane */ +#define ICE_DEV_ID_E822C_BACKPLANE 0x1890 +/* Intel(R) Ethernet Connection E822-C for QSFP */ +#define ICE_DEV_ID_E822C_QSFP 0x1891 +/* Intel(R) Ethernet Connection E822-C for SFP */ +#define ICE_DEV_ID_E822C_SFP 0x1892 +/* Intel(R) Ethernet Connection E822-C/X557-AT 10GBASE-T */ +#define ICE_DEV_ID_E822C_10G_BASE_T 0x1893 +/* Intel(R) Ethernet Connection E822-C 1GbE */ +#define ICE_DEV_ID_E822C_SGMII 0x1894 +/* Intel(R) Ethernet Connection E822-L for backplane */ +#define ICE_DEV_ID_E822L_BACKPLANE 0x1897 +/* Intel(R) Ethernet Connection E822-L for SFP */ +#define ICE_DEV_ID_E822L_SFP 0x1898 +/* Intel(R) Ethernet Connection E822-L/X557-AT 10GBASE-T */ +#define ICE_DEV_ID_E822L_10G_BASE_T 0x1899 +/* Intel(R) Ethernet Connection E822-L 1GbE */ +#define ICE_DEV_ID_E822L_SGMII 0x189A #endif /* _ICE_DEVIDS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_devlink.c b/drivers/net/ethernet/intel/ice/ice_devlink.c new file mode 100644 index 000000000000..8b7fcbc0a32d --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_devlink.c @@ -0,0 +1,1090 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice.h" +#include "ice_lib.h" +#include "ice_devlink.h" +#include "ice_eswitch.h" +#include "ice_fw_update.h" + +#ifdef HAVE_DEVLINK_INFO_GET +/* context for devlink info version reporting */ +struct ice_info_ctx { + char buf[128]; + struct ice_orom_info pending_orom; + struct ice_nvm_info pending_nvm; + struct ice_netlist_info pending_netlist; + struct ice_hw_dev_caps dev_caps; +}; + +/* + * The following functions are used to format specific strings for various + * devlink info versions. The ctx parameter is used to provide the storage + * buffer, as well as any ancillary information calculated when the info + * request was made. + * + * If a version does not exist, for example a "stored" version that does not + * exist because no update is pending, the function should leave the buffer in + * the ctx structure empty and return 0. + */ + +static void ice_info_get_dsn(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + u8 dsn[8]; + + /* Copy the DSN into an array in Big Endian format */ + put_unaligned_be64(pci_get_dsn(pf->pdev), dsn); + + snprintf(ctx->buf, sizeof(ctx->buf), "%8phD", dsn); +} + +static void ice_info_pba(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_hw *hw = &pf->hw; + enum ice_status status; + + status = ice_read_pba_string(hw, (u8 *)ctx->buf, sizeof(ctx->buf)); + if (status) + /* We failed to locate the PBA, so just skip this entry */ + dev_dbg(ice_pf_to_dev(pf), "Failed to read Product Board Assembly string, status %s\n", + ice_stat_str(status)); +} + +static void ice_info_fw_mgmt(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_hw *hw = &pf->hw; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u.%u.%u", hw->fw_maj_ver, hw->fw_min_ver, + hw->fw_patch); +} + +static void ice_info_fw_api(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_hw *hw = &pf->hw; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u.%u", hw->api_maj_ver, hw->api_min_ver); +} + +static void ice_info_fw_build(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_hw *hw = &pf->hw; + + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", hw->fw_build); +} + +static void ice_info_fw_srev(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_nvm_info *nvm = &pf->hw.flash.nvm; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u", nvm->srev); +} + +static void ice_info_pending_fw_srev(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_nvm_info *nvm = &ctx->pending_nvm; + + if (ctx->dev_caps.common_cap.nvm_update_pending_nvm) + snprintf(ctx->buf, sizeof(ctx->buf), "%u", nvm->srev); +} + +static void ice_info_orom_ver(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_orom_info *orom = &pf->hw.flash.orom; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u.%u.%u", orom->major, orom->build, orom->patch); +} + +static void ice_info_pending_orom_ver(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_orom_info *orom = &ctx->pending_orom; + + if (ctx->dev_caps.common_cap.nvm_update_pending_orom) + snprintf(ctx->buf, sizeof(ctx->buf), "%u.%u.%u", + orom->major, orom->build, orom->patch); +} + +static void ice_info_orom_srev(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_orom_info *orom = &pf->hw.flash.orom; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u", orom->srev); +} + +static void ice_info_pending_orom_srev(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_orom_info *orom = &ctx->pending_orom; + + if (ctx->dev_caps.common_cap.nvm_update_pending_orom) + snprintf(ctx->buf, sizeof(ctx->buf), "%u", orom->srev); +} + +static void 
ice_info_nvm_ver(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_nvm_info *nvm = &pf->hw.flash.nvm; + + snprintf(ctx->buf, sizeof(ctx->buf), "%x.%02x", nvm->major, nvm->minor); +} + +static void ice_info_pending_nvm_ver(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_nvm_info *nvm = &ctx->pending_nvm; + + if (ctx->dev_caps.common_cap.nvm_update_pending_nvm) + snprintf(ctx->buf, sizeof(ctx->buf), "%x.%02x", nvm->major, nvm->minor); +} + +static void ice_info_eetrack(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_nvm_info *nvm = &pf->hw.flash.nvm; + + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", nvm->eetrack); +} + +static void ice_info_pending_eetrack(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_nvm_info *nvm = &ctx->pending_nvm; + + if (ctx->dev_caps.common_cap.nvm_update_pending_nvm) + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", nvm->eetrack); +} + +static void ice_info_ddp_pkg_name(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_hw *hw = &pf->hw; + + snprintf(ctx->buf, sizeof(ctx->buf), "%s", hw->active_pkg_name); +} + +static void ice_info_ddp_pkg_version(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_pkg_ver *pkg = &pf->hw.active_pkg_ver; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u.%u.%u.%u", pkg->major, pkg->minor, pkg->update, + pkg->draft); +} + +static void ice_info_ddp_pkg_bundle_id(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", pf->hw.active_track_id); +} + +static void ice_info_netlist_ver(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_netlist_info *netlist = &pf->hw.flash.netlist; + + /* The netlist versions are BCD formatted */ + snprintf(ctx->buf, sizeof(ctx->buf), "%x.%x.%x-%x.%x.%x", netlist->major, netlist->minor, + netlist->type >> 16, netlist->type & 0xFFFF, netlist->rev, + netlist->cust_ver); +} + +static void ice_info_netlist_build(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_netlist_info *netlist = &pf->hw.flash.netlist; + + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", netlist->hash); +} + +static void ice_info_pending_netlist_ver(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_netlist_info *netlist = &ctx->pending_netlist; + + /* The netlist versions are BCD formatted */ + if (ctx->dev_caps.common_cap.nvm_update_pending_netlist) + snprintf(ctx->buf, sizeof(ctx->buf), "%x.%x.%x-%x.%x.%x", + netlist->major, netlist->minor, + netlist->type >> 16, netlist->type & 0xFFFF, netlist->rev, + netlist->cust_ver); +} + +static void ice_info_pending_netlist_build(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_netlist_info *netlist = &ctx->pending_netlist; + + if (ctx->dev_caps.common_cap.nvm_update_pending_netlist) + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", netlist->hash); +} + +#define fixed(key, getter) { ICE_VERSION_FIXED, key, getter } +#define running(key, getter) { ICE_VERSION_RUNNING, key, getter } +#define stored(key, getter) { ICE_VERSION_STORED, key, getter } + +enum ice_version_type { + ICE_VERSION_FIXED, + ICE_VERSION_RUNNING, + ICE_VERSION_STORED, +}; + +static const struct ice_devlink_version { + enum ice_version_type type; + const char *key; + void (*getter)(struct ice_pf *pf, struct ice_info_ctx *ctx); +} ice_devlink_versions[] = { + fixed(DEVLINK_INFO_VERSION_GENERIC_BOARD_ID, ice_info_pba), + running(DEVLINK_INFO_VERSION_GENERIC_FW_MGMT, ice_info_fw_mgmt), + running("fw.mgmt.api", ice_info_fw_api), + running("fw.mgmt.build", ice_info_fw_build), + 
running("fw.mgmt.srev", ice_info_fw_srev), + stored("fw.mgmt.srev", ice_info_pending_fw_srev), + running(DEVLINK_INFO_VERSION_GENERIC_FW_UNDI, ice_info_orom_ver), + stored(DEVLINK_INFO_VERSION_GENERIC_FW_UNDI, ice_info_pending_orom_ver), + running("fw.undi.srev", ice_info_orom_srev), + stored("fw.undi.srev", ice_info_pending_orom_srev), + running("fw.psid.api", ice_info_nvm_ver), + stored("fw.psid.api", ice_info_pending_nvm_ver), + running(DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID, ice_info_eetrack), + stored(DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID, ice_info_pending_eetrack), + running("fw.app.name", ice_info_ddp_pkg_name), + running(DEVLINK_INFO_VERSION_GENERIC_FW_APP, ice_info_ddp_pkg_version), + running("fw.app.bundle_id", ice_info_ddp_pkg_bundle_id), + running("fw.netlist", ice_info_netlist_ver), + stored("fw.netlist", ice_info_pending_netlist_ver), + running("fw.netlist.build", ice_info_netlist_build), + stored("fw.netlist.build", ice_info_pending_netlist_build), +}; + +/** + * ice_devlink_info_get - .info_get devlink handler + * @devlink: devlink instance structure + * @req: the devlink info request + * @extack: extended netdev ack structure + * + * Callback for the devlink .info_get operation. Reports information about the + * device. + * + * Return: zero on success or an error code on failure. + */ +static int ice_devlink_info_get(struct devlink *devlink, + struct devlink_info_req *req, + struct netlink_ext_ack *extack) +{ + struct ice_pf *pf = devlink_priv(devlink); + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + struct ice_info_ctx *ctx; + enum ice_status status; + size_t i; + int err; + + err = ice_wait_for_reset(pf, 10 * HZ); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Device is busy resetting"); + return err; + } + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + /* discover capabilities first */ + status = ice_discover_dev_caps(hw, &ctx->dev_caps); + if (status) { + dev_dbg(dev, "Failed to discover device capabilities, status %s aq_err %s\n", + ice_stat_str(status), ice_aq_str(hw->adminq.sq_last_status)); + NL_SET_ERR_MSG_MOD(extack, "Unable to discover device capabilities"); + err = -EIO; + goto out_free_ctx; + } + + if (ctx->dev_caps.common_cap.nvm_update_pending_orom) { + status = ice_get_inactive_orom_ver(hw, &ctx->pending_orom); + if (status) { + dev_dbg(dev, "Unable to read inactive Option ROM version data, status %s aq_err %s\n", + ice_stat_str(status), ice_aq_str(hw->adminq.sq_last_status)); + + /* disable display of pending Option ROM */ + ctx->dev_caps.common_cap.nvm_update_pending_orom = false; + } + } + + if (ctx->dev_caps.common_cap.nvm_update_pending_nvm) { + status = ice_get_inactive_nvm_ver(hw, &ctx->pending_nvm); + if (status) { + dev_dbg(dev, "Unable to read inactive NVM version data, status %s aq_err %s\n", + ice_stat_str(status), ice_aq_str(hw->adminq.sq_last_status)); + + /* disable display of pending Option ROM */ + ctx->dev_caps.common_cap.nvm_update_pending_nvm = false; + } + } + + if (ctx->dev_caps.common_cap.nvm_update_pending_netlist) { + status = ice_get_inactive_netlist_ver(hw, &ctx->pending_netlist); + if (status) { + dev_dbg(dev, "Unable to read inactive Netlist version data, status %s aq_err %s\n", + ice_stat_str(status), ice_aq_str(hw->adminq.sq_last_status)); + + /* disable display of pending Option ROM */ + ctx->dev_caps.common_cap.nvm_update_pending_netlist = false; + } + } + + err = devlink_info_driver_name_put(req, KBUILD_MODNAME); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Unable to 
set driver name"); + goto out_free_ctx; + } + + ice_info_get_dsn(pf, ctx); + + err = devlink_info_serial_number_put(req, ctx->buf); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Unable to set serial number"); + goto out_free_ctx; + } + + for (i = 0; i < ARRAY_SIZE(ice_devlink_versions); i++) { + enum ice_version_type type = ice_devlink_versions[i].type; + const char *key = ice_devlink_versions[i].key; + + memset(ctx->buf, 0, sizeof(ctx->buf)); + + ice_devlink_versions[i].getter(pf, ctx); + + /* Do not report missing versions */ + if (ctx->buf[0] == '\0') + continue; + + switch (type) { + case ICE_VERSION_FIXED: + err = devlink_info_version_fixed_put(req, key, ctx->buf); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Unable to set fixed version"); + goto out_free_ctx; + } + break; + case ICE_VERSION_RUNNING: + err = devlink_info_version_running_put(req, key, ctx->buf); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Unable to set running version"); + goto out_free_ctx; + } + break; + case ICE_VERSION_STORED: + err = devlink_info_version_stored_put(req, key, ctx->buf); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Unable to set stored version"); + goto out_free_ctx; + } + break; + } + } + +out_free_ctx: + kfree(ctx); + return err; +} +#endif /* HAVE_DEVLINK_INFO_GET */ + +#ifdef HAVE_DEVLINK_PARAMS +enum ice_devlink_param_id { + ICE_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX, + ICE_DEVLINK_PARAM_ID_FW_MGMT_MINSREV, + ICE_DEVLINK_PARAM_ID_FW_UNDI_MINSREV, +}; + +/** + * ice_devlink_minsrev_get - Get the current minimum security revision + * @devlink: pointer to the devlink instance + * @id: the parameter ID to get + * @ctx: context to return the parameter value + * + * Returns: zero on success, or an error code on failure. + */ +static int +ice_devlink_minsrev_get(struct devlink *devlink, u32 id, struct devlink_param_gset_ctx *ctx) +{ + struct ice_pf *pf = devlink_priv(devlink); + struct device *dev = ice_pf_to_dev(pf); + struct ice_minsrev_info minsrevs = {}; + enum ice_status status; + + if (id != ICE_DEVLINK_PARAM_ID_FW_MGMT_MINSREV && + id != ICE_DEVLINK_PARAM_ID_FW_UNDI_MINSREV) + return -EINVAL; + + status = ice_get_nvm_minsrevs(&pf->hw, &minsrevs); + if (status) { + dev_warn(dev, "Failed to read minimum security revision data from flash\n"); + return -EIO; + } + + /* We report zero if the device has not yet had a valid minimum + * security revision programmed for the associated module. This makes + * sense because it is not possible to have a security revision of + * less than zero. Thus, all images will be able to load if the + * minimum security revision is zero, the same as the case where the + * minimum value is indicated as invalid. + */ + switch (id) { + case ICE_DEVLINK_PARAM_ID_FW_MGMT_MINSREV: + if (minsrevs.nvm_valid) + ctx->val.vu32 = minsrevs.nvm; + else + ctx->val.vu32 = 0; + break; + case ICE_DEVLINK_PARAM_ID_FW_UNDI_MINSREV: + if (minsrevs.orom_valid) + ctx->val.vu32 = minsrevs.orom; + else + ctx->val.vu32 = 0; + break; + } + + return 0; +} + +/** + * ice_devlink_minsrev_set - Set the minimum security revision + * @devlink: pointer to the devlink instance + * @id: the parameter ID to set + * @ctx: context to return the parameter value + * + * Set the minimum security revision value for fw.mgmt or fw.undi. The kernel + * calls the validate handler before calling this, so we do not need to + * duplicate those checks here. + * + * Returns: zero on success, or an error code on failure. 
+ */ +static int +ice_devlink_minsrev_set(struct devlink *devlink, u32 id, struct devlink_param_gset_ctx *ctx) +{ + struct ice_pf *pf = devlink_priv(devlink); + struct device *dev = ice_pf_to_dev(pf); + struct ice_minsrev_info minsrevs = {}; + enum ice_status status; + + switch (id) { + case ICE_DEVLINK_PARAM_ID_FW_MGMT_MINSREV: + minsrevs.nvm_valid = true; + minsrevs.nvm = ctx->val.vu32; + break; + case ICE_DEVLINK_PARAM_ID_FW_UNDI_MINSREV: + minsrevs.orom_valid = true; + minsrevs.orom = ctx->val.vu32; + break; + default: + return -EINVAL; + } + + status = ice_update_nvm_minsrevs(&pf->hw, &minsrevs); + if (status) { + dev_warn(dev, "Failed to update minimum security revision data\n"); + return -EIO; + } + + return 0; +} + +/** + * ice_devlink_minsrev_validate - Validate a minimum security revision update + * @devlink: unused pointer to devlink instance + * @id: the parameter ID to validate + * @val: value to validate + * @extack: netlink extended ACK structure + * + * Check that a proposed update to a minimum security revision field is valid. + * Each minimum security revision can only be increased, not decreased. + * Additionally, we verify that the value is never set higher than the + * security revision of the active flash component. + * + * Returns: zero if the value is valid, -ERANGE if it is out of range, and + * -EINVAL if this function is called with the wrong ID. + */ +static int +ice_devlink_minsrev_validate(struct devlink *devlink, u32 id, union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + struct ice_pf *pf = devlink_priv(devlink); + struct device *dev = ice_pf_to_dev(pf); + struct ice_minsrev_info minsrevs = {}; + enum ice_status status; + + if (id != ICE_DEVLINK_PARAM_ID_FW_MGMT_MINSREV && + id != ICE_DEVLINK_PARAM_ID_FW_UNDI_MINSREV) + return -EINVAL; + + status = ice_get_nvm_minsrevs(&pf->hw, &minsrevs); + if (status) { + NL_SET_ERR_MSG_MOD(extack, "Failed to read minimum security revision data from flash"); + return -EIO; + } + + switch (id) { + case ICE_DEVLINK_PARAM_ID_FW_MGMT_MINSREV: + if (val.vu32 > pf->hw.flash.nvm.srev) { + NL_SET_ERR_MSG_MOD(extack, "Cannot update fw.mgmt minimum security revision higher than the currently running firmware"); + dev_dbg(dev, "Attempted to set fw.mgmt.minsrev to %u, but running firmware has srev %u\n", + val.vu32, pf->hw.flash.nvm.srev); + return -EPERM; + } + + if (minsrevs.nvm_valid && val.vu32 < minsrevs.nvm) { + NL_SET_ERR_MSG_MOD(extack, "Cannot lower the minimum security revision for fw.mgmt flash section"); + dev_dbg(dev, "Attempted to set fw.mgmt.minsrev to %u, but current minsrev is %u\n", + val.vu32, minsrevs.nvm); + return -EPERM; + } + break; + case ICE_DEVLINK_PARAM_ID_FW_UNDI_MINSREV: + if (val.vu32 > pf->hw.flash.orom.srev) { + NL_SET_ERR_MSG_MOD(extack, "Cannot update fw.undi minimum security revision higher than the currently running firmware"); + dev_dbg(dev, "Attempted to set fw.undi.minsrev to %u, but running firmware has srev %u\n", + val.vu32, pf->hw.flash.orom.srev); + return -EPERM; + } + + if (minsrevs.orom_valid && val.vu32 < minsrevs.orom) { + NL_SET_ERR_MSG_MOD(extack, "Cannot lower the minimum security revision for fw.undi flash section"); + dev_dbg(dev, "Attempted to set fw.undi.minsrev to %u, but current minsrev is %u\n", + val.vu32, minsrevs.orom); + return -EPERM; + } + break; + } + + return 0; +} + +/* devlink parameters for the ice driver */ +static const struct devlink_param ice_devlink_params[] = { + DEVLINK_PARAM_DRIVER(ICE_DEVLINK_PARAM_ID_FW_MGMT_MINSREV, + "fw.mgmt.minsrev", 
+ DEVLINK_PARAM_TYPE_U32, + BIT(DEVLINK_PARAM_CMODE_PERMANENT), + ice_devlink_minsrev_get, + ice_devlink_minsrev_set, + ice_devlink_minsrev_validate), + DEVLINK_PARAM_DRIVER(ICE_DEVLINK_PARAM_ID_FW_UNDI_MINSREV, + "fw.undi.minsrev", + DEVLINK_PARAM_TYPE_U32, + BIT(DEVLINK_PARAM_CMODE_PERMANENT), + ice_devlink_minsrev_get, + ice_devlink_minsrev_set, + ice_devlink_minsrev_validate), +}; +#endif /* HAVE_DEVLINK_PARAMS */ + +#ifdef HAVE_DEVLINK_FLASH_UPDATE +/** + * ice_devlink_flash_update - Update firmware stored in flash on the device + * @devlink: pointer to devlink associated with device to update + * @params: flash update parameters + * @extack: netlink extended ACK structure + * + * Perform a device flash update. The bulk of the update logic is contained + * within the ice_flash_pldm_image function. + * + * Returns: zero on success, or an error code on failure. + */ +static int +ice_devlink_flash_update(struct devlink *devlink, + struct devlink_flash_update_params *params, + struct netlink_ext_ack *extack) +{ + struct ice_pf *pf = devlink_priv(devlink); +#ifndef HAVE_DEVLINK_FLASH_UPDATE_PARAMS_FW + struct device *dev = &pf->pdev->dev; +#endif + struct ice_hw *hw = &pf->hw; +#ifndef HAVE_DEVLINK_FLASH_UPDATE_PARAMS_FW + const struct firmware *fw; +#endif + u8 preservation; + int err; + + if (!params->overwrite_mask) { + /* preserve all settings and identifiers */ + preservation = ICE_AQC_NVM_PRESERVE_ALL; + } else if (params->overwrite_mask == DEVLINK_FLASH_OVERWRITE_SETTINGS) { + /* overwrite settings, but preserve the vital device identifiers */ + preservation = ICE_AQC_NVM_PRESERVE_SELECTED; + } else if (params->overwrite_mask == (DEVLINK_FLASH_OVERWRITE_SETTINGS | + DEVLINK_FLASH_OVERWRITE_IDENTIFIERS)) { + /* overwrite both settings and identifiers, preserve nothing */ + preservation = ICE_AQC_NVM_NO_PRESERVATION; + } else { + NL_SET_ERR_MSG_MOD(extack, "Requested overwrite mask is not supported"); + return -EOPNOTSUPP; + } + + if (!hw->dev_caps.common_cap.nvm_unified_update) { + NL_SET_ERR_MSG_MOD(extack, "Current firmware does not support unified update"); + return -EOPNOTSUPP; + } + + err = ice_check_for_pending_update(pf, NULL, extack); + if (err) + return err; + + devlink_flash_update_status_notify(devlink, "Preparing to flash", NULL, 0, 0); + +#ifndef HAVE_DEVLINK_FLASH_UPDATE_PARAMS_FW + err = request_firmware(&fw, params->file_name, dev); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Unable to read file from disk"); + return err; + } + + err = ice_flash_pldm_image(pf, fw, preservation, extack); + + release_firmware(fw); + + return err; +#else + return ice_flash_pldm_image(pf, params->fw, preservation, extack); +#endif +} + +#ifdef HAVE_DEVLINK_FLASH_UPDATE_BEGIN_END_NOTIFY +static int +ice_devlink_flash_update_notify_compat(struct devlink *devlink, + struct devlink_flash_update_params *params, + struct netlink_ext_ack *extack) +{ + int err; + + devlink_flash_update_begin_notify(devlink); + err = ice_devlink_flash_update(devlink, params, extack); + devlink_flash_update_end_notify(devlink); + + return err; +} +#endif + +#ifndef HAVE_DEVLINK_FLASH_UPDATE_PARAMS +static int +ice_devlink_flash_update_params_compat(struct devlink *devlink, const char *file_name, + const char *component, struct netlink_ext_ack *extack) +{ + struct devlink_flash_update_params params = {}; + + /* individual component update is not yet supported, and older kernels + * did not check this for us. 
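Earlier in this hunk, ice_devlink_flash_update() collapses the devlink overwrite mask into one of three firmware preservation levels and rejects every other bit combination. The mapping is easy to see in isolation; the constants below are stand-ins for the real DEVLINK_FLASH_OVERWRITE_* and ICE_AQC_NVM_* values:

        #include <stdio.h>

        /* Stand-ins for the devlink overwrite bits and the AQ preservation levels. */
        #define OVERWRITE_SETTINGS     0x1
        #define OVERWRITE_IDENTIFIERS  0x2

        enum preservation { PRESERVE_ALL, PRESERVE_SELECTED, PRESERVE_NONE, PRESERVE_INVALID };

        static enum preservation pick_preservation(unsigned int overwrite_mask)
        {
                switch (overwrite_mask) {
                case 0:
                        return PRESERVE_ALL;       /* keep settings and identifiers */
                case OVERWRITE_SETTINGS:
                        return PRESERVE_SELECTED;  /* reset settings, keep identifiers */
                case OVERWRITE_SETTINGS | OVERWRITE_IDENTIFIERS:
                        return PRESERVE_NONE;      /* overwrite everything */
                default:
                        return PRESERVE_INVALID;   /* e.g. identifiers without settings */
                }
        }

        int main(void)
        {
                unsigned int mask;

                for (mask = 0; mask < 4; mask++)
                        printf("mask 0x%x -> %d\n", mask, pick_preservation(mask));
                return 0;
        }

Note that requesting identifier overwrite without settings overwrite falls through to the invalid case, matching the -EOPNOTSUPP path in the handler.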
+ */ + if (component) + return -EOPNOTSUPP; + + params.file_name = file_name; + +#ifdef HAVE_DEVLINK_FLASH_UPDATE_BEGIN_END_NOTIFY + return ice_devlink_flash_update_notify_compat(devlink, ¶ms, extack); +#else + return ice_devlink_flash_update(devlink, ¶ms, extack); +#endif +} +#endif /* !HAVE_DEVLINK_FLASH_UPDATE_PARAMS */ +#endif /* HAVE_DEVLINK_FLASH_UPDATE */ + +static const struct devlink_ops ice_devlink_ops = { +#ifdef HAVE_DEVLINK_FLASH_UPDATE_PARAMS + .supported_flash_update_params = DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK, +#endif /* HAVE_DEVLINK_FLASH_UPDATE_PARAMS */ + .eswitch_mode_get = ice_eswitch_mode_get, + .eswitch_mode_set = ice_eswitch_mode_set, +#ifdef HAVE_DEVLINK_INFO_GET + .info_get = ice_devlink_info_get, +#endif /* HAVE_DEVLINK_INFO_GET */ +#ifdef HAVE_DEVLINK_FLASH_UPDATE +#if !defined(HAVE_DEVLINK_FLASH_UPDATE_PARAMS) + .flash_update = ice_devlink_flash_update_params_compat, +#elif defined(HAVE_DEVLINK_FLASH_UPDATE_BEGIN_END_NOTIFY) + .flash_update = ice_devlink_flash_update_notify_compat, +#else + .flash_update = ice_devlink_flash_update, +#endif +#endif /* HAVE_DEVLINK_FLASH_UPDATE */ +}; + +static void ice_devlink_free(void *devlink_ptr) +{ + devlink_free((struct devlink *)devlink_ptr); +} + +/** + * ice_allocate_pf - Allocate devlink and return PF structure pointer + * @dev: the device to allocate for + * + * Allocate a devlink instance for this device and return the private area as + * the PF structure. The devlink memory is kept track of through devres by + * adding an action to remove it when unwinding. + */ +struct ice_pf *ice_allocate_pf(struct device *dev) +{ + struct devlink *devlink; + + devlink = devlink_alloc(&ice_devlink_ops, sizeof(struct ice_pf)); + if (!devlink) + return NULL; + + /* Add an action to teardown the devlink when unwinding the driver */ + if (devm_add_action(dev, ice_devlink_free, devlink)) { + devlink_free(devlink); + return NULL; + } + + return devlink_priv(devlink); +} + +/** + * ice_devlink_register - Register devlink interface for this PF + * @pf: the PF to register the devlink for. + * + * Register the devlink instance associated with this physical function. + * + * Return: zero on success or an error code on failure. + */ +int ice_devlink_register(struct ice_pf *pf) +{ + struct devlink *devlink = priv_to_devlink(pf); + struct device *dev = ice_pf_to_dev(pf); + int err; + + err = devlink_register(devlink, dev); + if (err) { + dev_err(dev, "devlink registration failed: %d\n", err); + return err; + } + +#ifdef HAVE_DEVLINK_PARAMS + err = devlink_params_register(devlink, ice_devlink_params, + ARRAY_SIZE(ice_devlink_params)); + if (err) { + dev_err(dev, "devlink params registration failed: %d\n", err); + return err; + } +#endif /* HAVE_DEVLINK_PARAMS */ + + return 0; +} + +/** + * ice_devlink_unregister - Unregister devlink resources for this PF. + * @pf: the PF structure to cleanup + * + * Releases resources used by devlink and cleans up associated memory. + */ +void ice_devlink_unregister(struct ice_pf *pf) +{ + struct devlink *devlink = priv_to_devlink(pf); + +#ifdef HAVE_DEVLINK_PARAMS + devlink_params_unregister(devlink, ice_devlink_params, + ARRAY_SIZE(ice_devlink_params)); +#endif /* HAVE_DEVLINK_PARAMS */ + devlink_unregister(devlink); +} + +/** + * ice_devlink_params_publish - Publish parameters to allow user access. 
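ice_allocate_pf() above ties the devlink structure's lifetime to the device by registering a teardown action rather than freeing it explicitly on every exit path. Outside the kernel's devres framework the same idea reduces to a small cleanup stack; everything in this sketch (names, sizes, the toy stack) is invented for illustration:

        #include <stdio.h>
        #include <stdlib.h>

        /* A toy devres-style cleanup stack: actions run in reverse order on unwind. */
        typedef void (*cleanup_fn)(void *);

        struct cleanup_entry { cleanup_fn fn; void *arg; };

        static struct cleanup_entry stack[16];
        static int depth;

        static int add_action(cleanup_fn fn, void *arg)
        {
                if (depth >= 16)
                        return -1;
                stack[depth].fn = fn;
                stack[depth].arg = arg;
                depth++;
                return 0;
        }

        static void unwind(void)
        {
                while (depth > 0) {
                        depth--;
                        stack[depth].fn(stack[depth].arg);
                }
        }

        static void free_priv(void *p)
        {
                printf("freeing %p\n", p);
                free(p);
        }

        int main(void)
        {
                void *priv = malloc(128);

                if (!priv)
                        return 1;
                /* If registering the action fails, free immediately -- the same
                 * fallback the allocation helper above uses.
                 */
                if (add_action(free_priv, priv)) {
                        free(priv);
                        return 1;
                }

                /* ... setup work would happen here ... */

                unwind();       /* unbind or error unwind releases everything */
                return 0;
        }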
+ * @pf: the PF structure pointer + */ +void ice_devlink_params_publish(struct ice_pf __maybe_unused *pf) +{ +#ifdef HAVE_DEVLINK_PARAMS + devlink_params_publish(priv_to_devlink(pf)); +#endif +} + +/** + * ice_devlink_params_unpublish - Unpublish parameters to prevent user access. + * @pf: the PF structure pointer + */ +void ice_devlink_params_unpublish(struct ice_pf __maybe_unused *pf) +{ +#ifdef HAVE_DEVLINK_PARAMS + devlink_params_unpublish(priv_to_devlink(pf)); +#endif +} + +/** + * ice_devlink_create_pf_port - Create a devlink port for this PF + * @pf: the PF to create a devlink port for + * + * Create and register a devlink_port for this PF. + * + * Return: zero on success or an error code on failure. + */ +int ice_devlink_create_pf_port(struct ice_pf *pf) +{ + struct devlink_port_attrs attrs = {}; + struct devlink_port *devlink_port; + struct devlink *devlink; + struct ice_vsi *vsi; + struct device *dev; + int err; + + dev = ice_pf_to_dev(pf); + + devlink_port = &pf->devlink_port; + + vsi = ice_get_main_vsi(pf); + if (!vsi) + return -EIO; + + attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; + attrs.phys.port_number = vsi->port_info->lport; + + devlink_port_attrs_set(devlink_port, &attrs); + devlink = priv_to_devlink(pf); + + err = devlink_port_register(devlink, devlink_port, vsi->idx); + if (err) { + dev_err(dev, "Failed to create devlink port for PF %d, error %d\n", + pf->hw.pf_id, err); + return err; + } + + return 0; +} + +/** + * ice_devlink_destroy_pf_port - Destroy the devlink_port for this PF + * @pf: the PF to cleanup + * + * Unregisters the devlink_port structure associated with this PF. + */ +void ice_devlink_destroy_pf_port(struct ice_pf *pf) +{ + struct devlink_port *devlink_port; + + devlink_port = &pf->devlink_port; + + devlink_port_type_clear(devlink_port); + devlink_port_unregister(devlink_port); +} + +#ifdef HAVE_DEVLINK_PORT_ATTR_PCI_VF +/** + * ice_devlink_create_vf_port - Create a devlink port for this VF + * @vf: the VF to create a port for + * + * Create and register a devlink_port for this VF. + * + * Return: zero on success or an error code on failure. + */ +int ice_devlink_create_vf_port(struct ice_vf *vf) +{ + struct devlink_port_attrs attrs = {}; + struct devlink_port *devlink_port; + struct devlink *devlink; + struct ice_vsi *vsi; + struct device *dev; + struct ice_pf *pf; + int err; + + pf = vf->pf; + dev = ice_pf_to_dev(pf); + vsi = ice_get_vf_vsi(vf); + devlink_port = &vf->devlink_port; + + attrs.flavour = DEVLINK_PORT_FLAVOUR_PCI_VF; + attrs.pci_vf.pf = pf->hw.bus.func; + attrs.pci_vf.vf = vf->vf_id; + + devlink_port_attrs_set(devlink_port, &attrs); + devlink = priv_to_devlink(pf); + + err = devlink_port_register(devlink, devlink_port, vsi->idx); + if (err) { + dev_err(dev, "Failed to create devlink port for VF %d, error %d\n", + vf->vf_id, err); + return err; + } + + return 0; +} + +/** + * ice_devlink_destroy_vf_port - Destroy the devlink_port for this VF + * @vf: the VF to cleanup + * + * Unregisters the devlink_port structure associated with this VF. 
+ */ +void ice_devlink_destroy_vf_port(struct ice_vf *vf) +{ + struct devlink_port *devlink_port; + + devlink_port = &vf->devlink_port; + + devlink_port_type_clear(devlink_port); + devlink_port_unregister(devlink_port); +} +#endif /* HAVE_DEVLINK_PORT_ATTR_PCI_VF */ + +#ifdef HAVE_DEVLINK_REGIONS +#ifdef HAVE_DEVLINK_REGION_OPS_SNAPSHOT +#ifdef HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS +/** + * ice_devlink_nvm_snapshot - Capture a snapshot of the Shadow RAM contents + * @devlink: the devlink instance + * @ops: the devlink region being snapshotted + * @extack: extended ACK response structure + * @data: on exit points to snapshot data buffer + * + * This function is called in response to the DEVLINK_CMD_REGION_TRIGGER for + * the shadow-ram devlink region. It captures a snapshot of the shadow ram + * contents. This snapshot can later be viewed via the devlink-region + * interface. + * + * @returns zero on success, and updates the data pointer. Returns a non-zero + * error code on failure. + */ +#endif /* HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS */ +static int +ice_devlink_nvm_snapshot(struct devlink *devlink, +#ifdef HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS + const struct devlink_region_ops __always_unused *ops, +#endif /* HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS */ + struct netlink_ext_ack *extack, u8 **data) +{ + struct ice_pf *pf = devlink_priv(devlink); + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u8 *nvm_data; + u32 nvm_size; + + nvm_size = hw->flash.flash_size; + nvm_data = vzalloc(nvm_size); + if (!nvm_data) + return -ENOMEM; + + status = ice_acquire_nvm(hw, ICE_RES_READ); + if (status) { + dev_dbg(dev, "ice_acquire_nvm failed, err %d aq_err %d\n", + status, hw->adminq.sq_last_status); + NL_SET_ERR_MSG_MOD(extack, "Failed to acquire NVM semaphore"); + vfree(nvm_data); + return -EIO; + } + + status = ice_read_flat_nvm(hw, 0, &nvm_size, nvm_data, false); + if (status) { + dev_dbg(dev, "ice_read_flat_nvm failed after reading %u bytes, err %d aq_err %d\n", + nvm_size, status, hw->adminq.sq_last_status); + NL_SET_ERR_MSG_MOD(extack, "Failed to read NVM contents"); + ice_release_nvm(hw); + vfree(nvm_data); + return -EIO; + } + + ice_release_nvm(hw); + + *data = nvm_data; + + return 0; +} + +#ifdef HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS +/** + * ice_devlink_devcaps_snapshot - Capture snapshot of device capabilities + * @devlink: the devlink instance + * @ops: the devlink region being snapshotted + * @extack: extended ACK response structure + * @data: on exit points to snapshot data buffer + * + * This function is called in response to the DEVLINK_CMD_REGION_TRIGGER for + * the device-caps devlink region. It captures a snapshot of the device + * capabilities reported by firmware. + * + * @returns zero on success, and updates the data pointer. Returns a non-zero + * error code on failure. 
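The NVM snapshot above and the device-caps snapshot that follows share one contract: the callback allocates a buffer, fills it, and hands ownership back through the data pointer, and the region's .destructor (vfree here) later releases it once userspace has read the snapshot. A stripped-down userspace model of that hand-off, with invented names and a fixed buffer instead of a flash read:

        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>

        /* On success the caller owns the buffer and must release it with the
         * paired destructor (free() in this model, vfree() in the driver).
         */
        static int take_snapshot(unsigned char **data, size_t *len)
        {
                size_t size = 64;
                unsigned char *buf = calloc(1, size);

                if (!buf)
                        return -1;

                memset(buf, 0xab, size);        /* stand-in for reading the device */

                *data = buf;
                *len = size;
                return 0;
        }

        int main(void)
        {
                unsigned char *snap;
                size_t len;

                if (take_snapshot(&snap, &len))
                        return 1;
                printf("captured %zu bytes, first byte 0x%02x\n", len, snap[0]);
                free(snap);     /* the "destructor" side of the contract */
                return 0;
        }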
+ */ +#endif /* HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS */ +static int +ice_devlink_devcaps_snapshot(struct devlink *devlink, +#ifdef HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS + const struct devlink_region_ops __always_unused *ops, +#endif /* HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS */ + struct netlink_ext_ack *extack, u8 **data) +{ + struct ice_pf *pf = devlink_priv(devlink); + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + void *devcaps; + + devcaps = vzalloc(ICE_AQ_MAX_BUF_LEN); + if (!devcaps) + return -ENOMEM; + + status = ice_aq_list_caps(hw, devcaps, ICE_AQ_MAX_BUF_LEN, NULL, + ice_aqc_opc_list_dev_caps, NULL); + if (status) { + dev_dbg(dev, "ice_aq_list_caps: failed to read device capabilities, err %d aq_err %d\n", + status, hw->adminq.sq_last_status); + NL_SET_ERR_MSG_MOD(extack, "Failed to read device capabilities"); + vfree(devcaps); + return -EIO; + } + + *data = (u8 *)devcaps; + + return 0; +} +#endif /* HAVE_DEVLINK_REGION_OPS_SNAPSHOT */ + +static const struct devlink_region_ops ice_nvm_region_ops = { + .name = "nvm-flash", + .destructor = vfree, +#ifdef HAVE_DEVLINK_REGION_OPS_SNAPSHOT + .snapshot = ice_devlink_nvm_snapshot, +#endif +}; + +static const struct devlink_region_ops ice_devcaps_region_ops = { + .name = "device-caps", + .destructor = vfree, +#ifdef HAVE_DEVLINK_REGION_OPS_SNAPSHOT + .snapshot = ice_devlink_devcaps_snapshot, +#endif +}; + +/** + * ice_devlink_init_regions - Initialize devlink regions + * @pf: the PF device structure + * + * Create devlink regions used to enable access to dump the contents of the + * flash memory on the device. + */ +void ice_devlink_init_regions(struct ice_pf *pf) +{ + struct devlink *devlink = priv_to_devlink(pf); + struct device *dev = ice_pf_to_dev(pf); + u64 nvm_size; + + nvm_size = pf->hw.flash.flash_size; + pf->nvm_region = devlink_region_create(devlink, &ice_nvm_region_ops, 1, + nvm_size); + if (IS_ERR(pf->nvm_region)) { + dev_err(dev, "failed to create NVM devlink region, err %ld\n", + PTR_ERR(pf->nvm_region)); + pf->nvm_region = NULL; + } + + pf->devcaps_region = devlink_region_create(devlink, + &ice_devcaps_region_ops, 10, + ICE_AQ_MAX_BUF_LEN); + if (IS_ERR(pf->devcaps_region)) { + dev_err(dev, "failed to create device-caps devlink region, err %ld\n", + PTR_ERR(pf->devcaps_region)); + pf->devcaps_region = NULL; + } +} + +/** + * ice_devlink_destroy_regions - Destroy devlink regions + * @pf: the PF device structure + * + * Remove previously created regions for this PF. + */ +void ice_devlink_destroy_regions(struct ice_pf *pf) +{ + if (pf->nvm_region) + devlink_region_destroy(pf->nvm_region); + + if (pf->devcaps_region) + devlink_region_destroy(pf->devcaps_region); +} +#endif /* HAVE_DEVLINK_REGIONS */ diff --git a/drivers/net/ethernet/intel/ice/ice_devlink.h b/drivers/net/ethernet/intel/ice/ice_devlink.h new file mode 100644 index 000000000000..33464adf4ee6 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_devlink.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#ifndef _ICE_DEVLINK_H_ +#define _ICE_DEVLINK_H_ + +#if IS_ENABLED(CONFIG_NET_DEVLINK) +struct ice_pf *ice_allocate_pf(struct device *dev); + +int ice_devlink_register(struct ice_pf *pf); +void ice_devlink_unregister(struct ice_pf *pf); +void ice_devlink_params_publish(struct ice_pf *pf); +void ice_devlink_params_unpublish(struct ice_pf *pf); +int ice_devlink_create_pf_port(struct ice_pf *pf); +void ice_devlink_destroy_pf_port(struct ice_pf *pf); +#ifdef HAVE_DEVLINK_PORT_ATTR_PCI_VF +int ice_devlink_create_vf_port(struct ice_vf *vf); +void ice_devlink_destroy_vf_port(struct ice_vf *vf); +#endif /* HAVE_DEVLINK_PORT_ATTR_PCI_VF */ +#else /* CONFIG_NET_DEVLINK */ +static inline struct ice_pf *ice_allocate_pf(struct device *dev) +{ + return devm_kzalloc(dev, sizeof(struct ice_pf), GFP_KERNEL); +} + +static inline int ice_devlink_register(struct ice_pf *pf) { return 0; } +static inline void ice_devlink_unregister(struct ice_pf *pf) { } +static inline void ice_devlink_params_publish(struct ice_pf *pf) { } +static inline void ice_devlink_params_unpublish(struct ice_pf *pf) { } +static inline int ice_devlink_create_pf_port(struct ice_pf *pf) { return 0; } +static inline void ice_devlink_destroy_pf_port(struct ice_pf *pf) { } +#ifdef HAVE_DEVLINK_PORT_ATTR_PCI_VF +static inline int ice_devlink_create_vf_port(struct ice_vf *vf) { return 0; } +static inline void ice_devlink_destroy_vf_port(struct ice_vf *vf) { } +#endif /* HAVE_DEVLINK_PORT_ATTR_PCI_VF */ +#endif /* !CONFIG_NET_DEVLINK */ + +#if IS_ENABLED(CONFIG_NET_DEVLINK) && defined(HAVE_DEVLINK_REGIONS) +void ice_devlink_init_regions(struct ice_pf *pf); +void ice_devlink_destroy_regions(struct ice_pf *pf); +#else +static inline void ice_devlink_init_regions(struct ice_pf *pf) { } +static inline void ice_devlink_destroy_regions(struct ice_pf *pf) { } +#endif + +#endif /* _ICE_DEVLINK_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c new file mode 100644 index 000000000000..d520934b420f --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c @@ -0,0 +1,719 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#if IS_ENABLED(CONFIG_NET_DEVLINK) +#include "ice.h" +#include "ice_lib.h" +#include "ice_eswitch.h" +#include "ice_fltr.h" +#include "ice_repr.h" +#include "ice_devlink.h" +#include "ice_pf_vsi_vlan_ops.h" +#include "ice_tc_lib.h" + +/** + * ice_eswitch_setup_env - configure switchdev HW filters + * @pf: pointer to PF struct + * + * This function adds HW filters configuration specific for switchdev + * mode. 
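The compiled-out branches of ice_devlink.h above (and of ice_eswitch.h later in this patch) rely on the same idiom: when a feature is configured out, every entry point collapses to a static inline stub so callers need no #ifdefs of their own. Reduced to a self-contained sketch (FEATURE_X and the function names are placeholders, not anything from the driver):

        #include <stdio.h>

        /* Flip this to 0 to compile the feature out; callers stay unchanged. */
        #define FEATURE_X 1

        #if FEATURE_X
        static int feature_register(void)
        {
                printf("feature registered\n");
                return 0;
        }

        static void feature_unregister(void)
        {
                printf("feature unregistered\n");
        }
        #else
        /* Stubs: same signatures, no behaviour, no runtime cost. */
        static inline int feature_register(void) { return 0; }
        static inline void feature_unregister(void) { }
        #endif

        int main(void)
        {
                if (feature_register())
                        return 1;
                feature_unregister();
                return 0;
        }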
+ */ +static int ice_eswitch_setup_env(struct ice_pf *pf) +{ + struct ice_vsi *uplink_vsi = pf->switchdev.uplink_vsi; + struct ice_vsi *ctrl_vsi = pf->switchdev.control_vsi; + struct ice_port_info *pi = pf->hw.port_info; + struct ice_vsi_vlan_ops *vlan_ops; + bool rule_added = false; + + vlan_ops = ice_get_compat_vsi_vlan_ops(ctrl_vsi); + if (vlan_ops->dis_stripping(ctrl_vsi) || + vlan_ops->dis_insertion(ctrl_vsi)) + return -ENODEV; + + ice_remove_vsi_fltr(&pf->hw, uplink_vsi->idx); + + if (ice_vsi_add_vlan_zero(uplink_vsi)) + goto err_def_rx; + + if (!ice_is_dflt_vsi_in_use(uplink_vsi->vsw)) { + if (ice_set_dflt_vsi(uplink_vsi->vsw, uplink_vsi)) + goto err_def_rx; + rule_added = true; + } + + if (ice_cfg_dflt_vsi(pi, ctrl_vsi->idx, true, ICE_FLTR_TX)) + goto err_def_tx; + + if (ice_vsi_update_security(uplink_vsi, ice_vsi_ctx_set_allow_override)) + goto err_override_uplink; + + if (ice_vsi_update_security(ctrl_vsi, ice_vsi_ctx_set_allow_override)) + goto err_override_control; + + if (ice_fltr_update_flags_dflt_rule(ctrl_vsi, pi->dflt_tx_vsi_rule_id, + ICE_FLTR_TX, + ICE_SINGLE_ACT_LB_ENABLE)) + goto err_update_action; + + return 0; + +err_update_action: + ice_vsi_update_security(ctrl_vsi, ice_vsi_ctx_clear_allow_override); +err_override_control: + ice_vsi_update_security(uplink_vsi, ice_vsi_ctx_clear_allow_override); +err_override_uplink: + ice_cfg_dflt_vsi(pi, ctrl_vsi->idx, false, ICE_FLTR_TX); +err_def_tx: + if (rule_added) + ice_clear_dflt_vsi(uplink_vsi->vsw); +err_def_rx: + ice_fltr_add_mac_and_broadcast(uplink_vsi, + uplink_vsi->port_info->mac.perm_addr, + ICE_FWD_TO_VSI); + return -ENODEV; +} + +/** + * ice_eswitch_release_env - clear switchdev HW filters + * @pf: pointer to PF struct + * + * This function removes HW filters configuration specific for switchdev + * mode and restores default legacy mode settings. + */ +static void +ice_eswitch_release_env(struct ice_pf *pf) +{ + struct ice_vsi *uplink_vsi = pf->switchdev.uplink_vsi; + struct ice_vsi *ctrl_vsi = pf->switchdev.control_vsi; + struct ice_port_info *pi = pf->hw.port_info; + + ice_vsi_update_security(ctrl_vsi, ice_vsi_ctx_clear_allow_override); + ice_vsi_update_security(uplink_vsi, ice_vsi_ctx_clear_allow_override); + ice_cfg_dflt_vsi(pi, ctrl_vsi->idx, false, ICE_FLTR_TX); + ice_clear_dflt_vsi(uplink_vsi->vsw); + ice_fltr_add_mac_and_broadcast(uplink_vsi, + uplink_vsi->port_info->mac.perm_addr, + ICE_FWD_TO_VSI); +} + +#ifdef HAVE_METADATA_PORT_INFO +/** + * ice_eswitch_remap_ring - reconfigure ring of switchdev ctrl VSI + * @ring: pointer to ring + * @q_vector: pointer of q_vector which is connected with this ring + * @netdev: netdevice connected with this ring + */ +static void +ice_eswitch_remap_ring(struct ice_ring *ring, struct ice_q_vector *q_vector, + struct net_device *netdev) +{ + ring->q_vector = q_vector; + ring->next = NULL; + ring->netdev = netdev; +} + +/** + * ice_eswitch_remap_rings_to_vectors - reconfigure rings of switchdev ctrl VSI + * @pf: pointer to PF struct + * + * In switchdev number of allocated Tx/Rx rings is equal. + * + * This function fills q_vectors structures associated with representator and + * move each ring pairs to port representator netdevs. Each port representor + * will have dedicated 1 Tx/Rx ring pair, so number of rings pair is equal to + * number of VFs. 
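ice_eswitch_setup_env() above is a textbook goto unwind ladder: each setup step that succeeds gains a matching label, and a later failure jumps to the label that undoes everything done so far, in reverse order. The skeleton of the idiom, with illustrative step names rather than the driver's filters and VSIs:

        #include <stdio.h>

        static int step_a(void) { printf("a set up\n"); return 0; }
        static int step_b(void) { printf("b set up\n"); return 0; }
        static int step_c(void) { printf("c failed\n"); return -1; }

        static void undo_a(void) { printf("a undone\n"); }
        static void undo_b(void) { printf("b undone\n"); }

        static int setup(void)
        {
                if (step_a())
                        goto err_a;
                if (step_b())
                        goto err_b;
                if (step_c())
                        goto err_c;

                return 0;

        err_c:
                undo_b();       /* c failed: unwind b, then fall through to a */
        err_b:
                undo_a();
        err_a:
                return -1;
        }

        int main(void)
        {
                return setup() ? 1 : 0;
        }

Because the labels fall through, a failure at any step releases exactly the resources acquired before it and nothing more.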
+ */ +static void +ice_eswitch_remap_rings_to_vectors(struct ice_pf *pf) +{ + struct ice_vsi *vsi = pf->switchdev.control_vsi; + int q_id; + + ice_for_each_txq(vsi, q_id) { + struct ice_repr *repr = pf->vf[q_id].repr; + struct ice_q_vector *q_vector = repr->q_vector; + struct ice_ring *tx_ring = vsi->tx_rings[q_id]; + struct ice_ring *rx_ring = vsi->rx_rings[q_id]; + + q_vector->vsi = vsi; + q_vector->reg_idx = vsi->q_vectors[0]->reg_idx; + + q_vector->num_ring_tx = 1; + q_vector->tx.ring = tx_ring; + ice_eswitch_remap_ring(tx_ring, q_vector, repr->netdev); + /* In switchdev mode, from OS stack perspective, there is only + * one queue for given netdev, so it needs to be indexed as 0. + */ + tx_ring->q_index = 0; + + q_vector->num_ring_rx = 1; + q_vector->rx.ring = rx_ring; + ice_eswitch_remap_ring(rx_ring, q_vector, repr->netdev); + } +} + +/** + * ice_eswitch_setup_reprs - configure port reprs to run in switchdev mode + * @pf: pointer to PF struct + */ +static int ice_eswitch_setup_reprs(struct ice_pf *pf) +{ + struct ice_vsi *ctrl_vsi = pf->switchdev.control_vsi; + int max_vsi_num = 0; + int i; + + ice_for_each_vf(pf, i) { + struct ice_vsi *vsi = pf->vf[i].repr->src_vsi; + struct ice_vf *vf = &pf->vf[i]; + + ice_remove_vsi_fltr(&pf->hw, vsi->idx); + vf->repr->dst = metadata_dst_alloc(0, METADATA_HW_PORT_MUX, + GFP_KERNEL); + if (!vf->repr->dst) { + ice_fltr_add_mac_and_broadcast(vsi, + vf->hw_lan_addr.addr, + ICE_FWD_TO_VSI); + goto err; + } + + if (ice_vsi_update_security(vsi, ice_vsi_ctx_clear_antispoof)) { + ice_fltr_add_mac_and_broadcast(vsi, + vf->hw_lan_addr.addr, + ICE_FWD_TO_VSI); + metadata_dst_free(vf->repr->dst); + goto err; + } + + if (ice_vsi_add_vlan_zero(vsi)) { + ice_fltr_add_mac_and_broadcast(vsi, + vf->hw_lan_addr.addr, + ICE_FWD_TO_VSI); + metadata_dst_free(vf->repr->dst); + ice_vsi_update_security(vsi, ice_vsi_ctx_set_antispoof); + goto err; + } + + if (max_vsi_num < vsi->vsi_num) + max_vsi_num = vsi->vsi_num; + + netif_napi_add(vf->repr->netdev, &vf->repr->q_vector->napi, ice_napi_poll, + NAPI_POLL_WEIGHT); + + netif_keep_dst(vf->repr->netdev); + } + + kfree(ctrl_vsi->target_netdevs); + + ctrl_vsi->target_netdevs = kcalloc(max_vsi_num + 1, + sizeof(*ctrl_vsi->target_netdevs), + GFP_KERNEL); + if (!ctrl_vsi->target_netdevs) + goto err; + + ice_for_each_vf(pf, i) { + struct ice_repr *repr = pf->vf[i].repr; + struct ice_vsi *vsi = repr->src_vsi; + struct metadata_dst *dst; + + ctrl_vsi->target_netdevs[vsi->vsi_num] = repr->netdev; + + dst = repr->dst; + dst->u.port_info.port_id = vsi->vsi_num; + dst->u.port_info.lower_dev = repr->netdev; + ice_repr_set_traffic_vsi(repr, ctrl_vsi); + } + + return 0; + +err: + for (i = i - 1; i >= 0; i--) { + struct ice_vsi *vsi = pf->vf[i].repr->src_vsi; + struct ice_vf *vf = &pf->vf[i]; + + ice_vsi_update_security(vsi, ice_vsi_ctx_set_antispoof); + metadata_dst_free(vf->repr->dst); + ice_fltr_add_mac_and_broadcast(vsi, vf->hw_lan_addr.addr, + ICE_FWD_TO_VSI); + } + + return -ENODEV; +} + +/** + * ice_eswitch_release_reprs - clear PR VSIs configuration + * @pf: poiner to PF struct + * @ctrl_vsi: pointer to switchdev control VSI + */ +static void ice_eswitch_release_reprs(struct ice_pf *pf, + struct ice_vsi *ctrl_vsi) +{ + int i; + + kfree(ctrl_vsi->target_netdevs); + ice_for_each_vf(pf, i) { + struct ice_vsi *vsi = pf->vf[i].repr->src_vsi; + struct ice_vf *vf = &pf->vf[i]; + + ice_vsi_update_security(vsi, ice_vsi_ctx_set_antispoof); + metadata_dst_free(vf->repr->dst); + ice_fltr_add_mac_and_broadcast(vsi, vf->hw_lan_addr.addr, + 
ICE_FWD_TO_VSI); + + netif_napi_del(&vf->repr->q_vector->napi); + } +} + +/** + * ice_eswitch_update_repr - reconfigure VF port representor + * @vsi: VF VSI for which port representor is configured + */ +void ice_eswitch_update_repr(struct ice_vsi *vsi) +{ + struct ice_pf *pf = vsi->back; + struct ice_repr *repr; + struct ice_vf *vf; + int ret; + + if (!ice_is_switchdev_running(pf)) + return; + + vf = &pf->vf[vsi->vf_id]; + repr = vf->repr; + repr->src_vsi = vsi; + repr->dst->u.port_info.port_id = vsi->vsi_num; + + ret = ice_vsi_update_security(vsi, ice_vsi_ctx_clear_antispoof); + if (ret) { + ice_fltr_add_mac_and_broadcast(vsi, vf->hw_lan_addr.addr, ICE_FWD_TO_VSI); + dev_err(ice_pf_to_dev(pf), "Failed to update VF %d port representor", vsi->vf_id); + return; + } +} + +/** + * ice_eswitch_port_start_xmit - callback for packets transmit + * @skb: send buffer + * @netdev: network interface device structure + * + * Returns NETDEV_TX_OK if sent, else an error code + */ +netdev_tx_t +ice_eswitch_port_start_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + struct ice_netdev_priv *np; + struct ice_repr *repr; + struct ice_vsi *vsi; + + np = netdev_priv(netdev); + vsi = np->vsi; + + if (ice_is_reset_in_progress(vsi->back->state)) + return NETDEV_TX_BUSY; + + repr = ice_netdev_to_repr(netdev); + skb_dst_drop(skb); + dst_hold((struct dst_entry *)repr->dst); + skb_dst_set(skb, (struct dst_entry *)repr->dst); + skb->queue_mapping = repr->vf->vf_id; + + return ice_start_xmit(skb, netdev); +} + +/** + * ice_eswitch_set_target_vsi - set switchdev context in Tx context descriptor + * @skb: pointer to send buffer + * @off: pointer to offload struct + */ +void ice_eswitch_set_target_vsi(struct sk_buff *skb, + struct ice_tx_offload_params *off) +{ + struct metadata_dst *dst = skb_metadata_dst(skb); + u64 cd_cmd, dst_vsi; + + if (!dst) { + cd_cmd = ICE_TX_CTX_DESC_SWTCH_UPLINK << ICE_TXD_CTX_QW1_CMD_S; + off->cd_qw1 |= (cd_cmd | ICE_TX_DESC_DTYPE_CTX); + } else { + cd_cmd = ICE_TX_CTX_DESC_SWTCH_VSI << ICE_TXD_CTX_QW1_CMD_S; + dst_vsi = ((u64)dst->u.port_info.port_id << + ICE_TXD_CTX_QW1_VSI_S) & ICE_TXD_CTX_QW1_VSI_M; + off->cd_qw1 = cd_cmd | dst_vsi | ICE_TX_DESC_DTYPE_CTX; + } +} +#else +static void +ice_eswitch_release_reprs(struct ice_pf __always_unused *pf, + struct ice_vsi __always_unused *ctrl_vsi) +{ +} + +static void +ice_eswitch_remap_rings_to_vectors(struct ice_pf *pf) +{ +} + +static int ice_eswitch_setup_reprs(struct ice_pf __always_unused *pf) +{ + return -ENODEV; +} + +netdev_tx_t +ice_eswitch_port_start_xmit(struct sk_buff __always_unused *skb, + struct net_device __always_unused *netdev) +{ + return -EOPNOTSUPP; +} +#endif /* HAVE_METADATA_PORT_INFO */ + +/** + * ice_eswitch_vsi_setup - configure switchdev control VSI + * @pf: pointer to PF structure + * @pi: pointer to port_info structure + */ +static struct ice_vsi * +ice_eswitch_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi) +{ + return ice_vsi_setup(pf, pi, ICE_VSI_SWITCHDEV_CTRL, ICE_INVAL_VFID, NULL, 0); +} + + +/** + * ice_eswitch_napi_del - remove NAPI handle for all port representors + * @pf: pointer to PF structure + */ +static void ice_eswitch_napi_del(struct ice_pf *pf) +{ + int i; + + ice_for_each_vf(pf, i) + netif_napi_del(&pf->vf[i].repr->q_vector->napi); +} + +/** + * ice_eswitch_napi_enable - enable NAPI for all port representors + * @pf: pointer to PF structure + */ +static void ice_eswitch_napi_enable(struct ice_pf *pf) +{ + int i; + + ice_for_each_vf(pf, i) + napi_enable(&pf->vf[i].repr->q_vector->napi); 
+} + +/** + * ice_eswitch_napi_disable - disable NAPI for all port representors + * @pf: pointer to PF structure + */ +static void ice_eswitch_napi_disable(struct ice_pf *pf) +{ + int i; + + ice_for_each_vf(pf, i) + napi_disable(&pf->vf[i].repr->q_vector->napi); +} + +/** + * ice_eswitch_set_rxdid - configure rxdid on all rx queues from VSI + * @vsi: vsi to setup rxdid on + * @rxdid: flex descriptor id + */ +static void ice_eswitch_set_rxdid(struct ice_vsi *vsi, u32 rxdid) +{ + struct ice_hw *hw = &vsi->back->hw; + int i; + + ice_for_each_rxq(vsi, i) { + struct ice_ring *ring = vsi->rx_rings[i]; + u16 pf_q = vsi->rxq_map[ring->q_index]; + + ice_write_qrxflxp_cntxt(hw, pf_q, rxdid, 0x3, true); + } +} + +/** + * ice_eswitch_enable_switchdev - configure eswitch in switchdev mode + * @pf: pointer to PF structure + */ +static int +ice_eswitch_enable_switchdev(struct ice_pf *pf) +{ + struct ice_vsi *ctrl_vsi; + + pf->switchdev.control_vsi = ice_eswitch_vsi_setup(pf, pf->hw.port_info); + if (!pf->switchdev.control_vsi) + return -ENODEV; + + ctrl_vsi = pf->switchdev.control_vsi; + pf->switchdev.uplink_vsi = ice_get_main_vsi(pf); + if (!pf->switchdev.uplink_vsi) + goto err_vsi; + + if (ice_eswitch_setup_env(pf)) + goto err_vsi; + + if (ice_repr_add_for_all_vfs(pf)) + goto err_repr_add; + + if (ice_eswitch_setup_reprs(pf)) + goto err_setup_reprs; + + ice_eswitch_remap_rings_to_vectors(pf); + + if (ice_vsi_open(ctrl_vsi)) + goto err_setup_reprs; + + ice_eswitch_napi_enable(pf); + + ice_eswitch_set_rxdid(ctrl_vsi, ICE_RXDID_FLEX_NIC_2); + + return 0; + +err_setup_reprs: + ice_repr_rem_from_all_vfs(pf); +err_repr_add: + ice_eswitch_release_env(pf); +err_vsi: + ice_vsi_release(ctrl_vsi); + return -ENODEV; +} + +/** + * ice_eswitch_disable_switchdev - disable switchdev resources + * @pf: pointer to PF structure + */ +static void ice_eswitch_disable_switchdev(struct ice_pf *pf) +{ + struct ice_vsi *ctrl_vsi = pf->switchdev.control_vsi; + + ice_eswitch_napi_disable(pf); + ice_eswitch_release_env(pf); + ice_vsi_release(ctrl_vsi); + ice_eswitch_release_reprs(pf, ctrl_vsi); + ice_repr_rem_from_all_vfs(pf); +} + +#ifdef HAVE_METADATA_PORT_INFO +#ifdef HAVE_DEVLINK_ESWITCH_OPS_EXTACK +/** + * ice_eswitch_mode_set - set new eswitch mode + * @devlink: pointer to devlink structure + * @mode: eswitch mode to switch to + * @extack: pointer to extack structure + */ +int ice_eswitch_mode_set(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack) +#else +int ice_eswitch_mode_set(struct devlink *devlink, u16 mode) +#endif /* HAVE_DEVLINK_ESWITCH_OPS_EXTACK */ +{ + struct ice_pf *pf = devlink_priv(devlink); + + if (pf->eswitch_mode == mode) + return 0; + + if (pf->num_alloc_vfs) { + dev_info(ice_pf_to_dev(pf), + "Changing eswitch mode is allowed only if there is no VFs created"); + return -EOPNOTSUPP; + } + + switch (mode) { + case DEVLINK_ESWITCH_MODE_LEGACY: + dev_info(ice_pf_to_dev(pf), "PF %d changed eswitch mode to legacy", pf->hw.pf_id); + break; + case DEVLINK_ESWITCH_MODE_SWITCHDEV: + { +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + dev_err(ice_pf_to_dev(pf), "switchdev cannot be configured - ADQ is active. 
Delete ADQ configs using TC and try again\n"); + return -EOPNOTSUPP; + } +#endif /* NETIF_F_HW_TC */ + +#ifdef HAVE_NETDEV_SB_DEV + if (ice_is_offloaded_macvlan_ena(pf)) { + dev_err(ice_pf_to_dev(pf), "switchdev cannot be configured - L2 Forwarding Offload is currently enabled.\n"); + return -EOPNOTSUPP; + } +#endif /* HAVE_NETDEV_SB_DEV */ + + dev_info(ice_pf_to_dev(pf), + "PF %d changed eswitch mode to switchdev", pf->hw.pf_id); + break; + } + default: +#ifdef HAVE_DEVLINK_ESWITCH_OPS_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Unknown eswitch mode"); +#else + dev_err(ice_pf_to_dev(pf), "Unknown eswitch mode"); +#endif /* HAVE_DEVLINK_ESWITCH_OPS_EXTACK */ + return -EINVAL; + } + + pf->eswitch_mode = mode; + return 0; +} +#endif /* HAVE_METADATA_PORT_INFO */ + +/** + * ice_eswitch_get_target_netdev - return port representor netdev + * @rx_ring: pointer to rx ring + * @rx_desc: pointer to rx descriptor + * + * When working in switchdev mode context (when control vsi is used), this + * function returns netdev of appropriate port representor. For non-switchdev + * context, regular netdev associated with rx ring is returned. + */ +struct net_device * +ice_eswitch_get_target_netdev(struct ice_ring *rx_ring, + union ice_32b_rx_flex_desc *rx_desc) +{ + struct ice_32b_rx_flex_desc_nic_2 *desc; + struct ice_vsi *vsi = rx_ring->vsi; + struct ice_vsi *control_vsi; + u16 target_vsi_id; + + control_vsi = vsi->back->switchdev.control_vsi; + if (vsi != control_vsi) + return rx_ring->netdev; + + desc = (struct ice_32b_rx_flex_desc_nic_2 *)rx_desc; + target_vsi_id = le16_to_cpu(desc->src_vsi); + + return vsi->target_netdevs[target_vsi_id]; +} + +/** + * ice_eswitch_mode_get - get current eswitch mode + * @devlink: pointer to devlink structure + * @mode: output parameter for current eswitch mode + */ +int ice_eswitch_mode_get(struct devlink *devlink, u16 *mode) +{ + struct ice_pf *pf = devlink_priv(devlink); + + *mode = pf->eswitch_mode; + return 0; +} + +/** + * ice_is_eswitch_mode_switchdev - check if eswitch mode is set to switchdev + * @pf: pointer to PF structure + * + * Returns true if eswitch mode is set to DEVLINK_ESWITCH_MODE_SWITCHDEV, + * false otherwise. 
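ice_eswitch_get_target_netdev() just above reduces to a table lookup: the Rx flex descriptor carries the source VSI number, and that number indexes the target_netdevs[] array populated in ice_eswitch_setup_reprs(). A userspace model of the lookup, decoding the little-endian field by hand and using invented types and netdev names:

        #include <stdint.h>
        #include <stdio.h>

        /* Stand-in for the wire-format descriptor: src_vsi is little endian. */
        struct rx_desc_model {
                uint8_t src_vsi_le[2];
        };

        static uint16_t le16_decode(const uint8_t b[2])
        {
                return (uint16_t)(b[0] | (b[1] << 8));
        }

        /* Table indexed by VSI number, one slot per port representor netdev. */
        static const char *target_netdevs[8] = {
                [3] = "rep_vf0",
                [5] = "rep_vf1",
        };

        static const char *lookup_target(const struct rx_desc_model *desc)
        {
                uint16_t vsi = le16_decode(desc->src_vsi_le);

                if (vsi >= 8 || !target_netdevs[vsi])
                        return NULL;    /* this sketch bounds-checks; the driver sizes the table to max_vsi_num */
                return target_netdevs[vsi];
        }

        int main(void)
        {
                struct rx_desc_model desc = { .src_vsi_le = { 0x05, 0x00 } };

                printf("packet from VSI 5 goes to %s\n", lookup_target(&desc));
                return 0;
        }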
+ */ +bool ice_is_eswitch_mode_switchdev(struct ice_pf *pf) +{ + return pf->eswitch_mode == DEVLINK_ESWITCH_MODE_SWITCHDEV; +} + +/** + * ice_eswitch_release - cleanup eswitch + * @pf: pointer to PF structure + */ +void ice_eswitch_release(struct ice_pf *pf) +{ + if (pf->eswitch_mode == DEVLINK_ESWITCH_MODE_LEGACY) + return; + + ice_eswitch_disable_switchdev(pf); + pf->switchdev.is_running = false; +} + +/** + * ice_eswitch_configure - configure eswitch + * @pf: pointer to PF structure + */ +int ice_eswitch_configure(struct ice_pf *pf) +{ + int status; + + if (pf->eswitch_mode == DEVLINK_ESWITCH_MODE_LEGACY || pf->switchdev.is_running) + return 0; + + status = ice_eswitch_enable_switchdev(pf); + if (status) + return status; + + pf->switchdev.is_running = true; + return 0; +} + +/** + * ice_eswitch_start_all_tx_queues - start Tx queues of all port representors + * @pf: pointer to PF structure + */ +static void ice_eswitch_start_all_tx_queues(struct ice_pf *pf) +{ + struct ice_repr *repr; + int i; + + if (test_bit(ICE_DOWN, pf->state)) + return; + + ice_for_each_vf(pf, i) { + repr = pf->vf[i].repr; + if (repr) + ice_repr_start_tx_queues(repr); + } +} + +/** + * ice_eswitch_stop_all_tx_queues - stop Tx queues of all port representors + * @pf: pointer to PF structure + */ +void ice_eswitch_stop_all_tx_queues(struct ice_pf *pf) +{ + struct ice_repr *repr; + int i; + + if (test_bit(ICE_DOWN, pf->state)) + return; + + ice_for_each_vf(pf, i) { + repr = pf->vf[i].repr; + if (repr) + ice_repr_stop_tx_queues(repr); + } +} + +/** + * ice_eswitch_rebuild - rebuild eswitch + * @pf: pointer to PF structure + */ +int ice_eswitch_rebuild(struct ice_pf *pf) +{ + struct ice_vsi *ctrl_vsi = pf->switchdev.control_vsi; + int status; + + ice_eswitch_napi_disable(pf); + ice_eswitch_napi_del(pf); + + status = ice_eswitch_setup_env(pf); + if (status) + return status; + + status = ice_eswitch_setup_reprs(pf); + if (status) + return status; + + ice_eswitch_remap_rings_to_vectors(pf); + +#ifdef HAVE_TC_SETUP_CLSFLOWER + ice_replay_tc_fltrs(pf); +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + + status = ice_vsi_open(ctrl_vsi); + if (status) + return status; + + ice_eswitch_napi_enable(pf); + ice_eswitch_set_rxdid(ctrl_vsi, ICE_RXDID_FLEX_NIC_2); + ice_eswitch_start_all_tx_queues(pf); + + return 0; +} +#endif /* CONFIG_NET_DEVLINK */ diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.h b/drivers/net/ethernet/intel/ice/ice_eswitch.h new file mode 100644 index 000000000000..1f8a39493cb7 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#ifndef _ICE_ESWITCH_H_ +#define _ICE_ESWITCH_H_ +#if IS_ENABLED(CONFIG_NET_DEVLINK) +#include + +void ice_eswitch_release(struct ice_pf *pf); +int ice_eswitch_configure(struct ice_pf *pf); +int ice_eswitch_rebuild(struct ice_pf *pf); +int ice_eswitch_mode_get(struct devlink *devlink, u16 *mode); +void ice_eswitch_stop_all_tx_queues(struct ice_pf *pf); + +struct net_device * +ice_eswitch_get_target_netdev(struct ice_ring *rx_ring, + union ice_32b_rx_flex_desc *rx_desc); +#ifdef HAVE_METADATA_PORT_INFO +void ice_eswitch_set_target_vsi(struct sk_buff *skb, + struct ice_tx_offload_params *off); +void ice_eswitch_update_repr(struct ice_vsi *vsi); +#else +static inline +void ice_eswitch_set_target_vsi(struct sk_buff *skb, struct ice_tx_offload_params *off) { } +static inline void ice_eswitch_update_repr(struct ice_vsi *vsi) { } +#endif /* HAVE_METADATA_PORT_INFO */ +netdev_tx_t +ice_eswitch_port_start_xmit(struct sk_buff *skb, struct net_device *netdev); +#ifdef HAVE_DEVLINK_ESWITCH_OPS_EXTACK +#ifdef HAVE_METADATA_PORT_INFO +int ice_eswitch_mode_set(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack); +#else +static inline int +ice_eswitch_mode_set(struct devlink __always_unused *devlink, + u16 __always_unused mode, + struct netlink_ext_ack __always_unused *extack) +{ + return -EOPNOTSUPP; +} +#endif /* HAVE_METADATA_PORT_INFO */ +#else +#ifdef HAVE_METADATA_PORT_INFO +int ice_eswitch_mode_set(struct devlink *devlink, u16 mode); +#else +static inline int ice_eswitch_mode_set(struct devlink __always_unused *devlink, + u16 __always_unused mode) +{ + return -EOPNOTSUPP; +} +#endif /* HAVE_METADATA_PORT_INFO */ +#endif /* HAVE_DEVLINK_ESWITCH_OPS_EXTACK */ +bool ice_is_eswitch_mode_switchdev(struct ice_pf *pf); +#else /* !CONFIG_NET_DEVLINK */ +static inline void ice_eswitch_release(struct ice_pf *pf) { } +static inline +void ice_eswitch_set_target_vsi(struct sk_buff *skb, struct ice_tx_offload_params *off) { } +static inline void ice_eswitch_update_repr(struct ice_vsi *vsi) { } +static inline void ice_eswitch_stop_all_tx_queues(struct ice_pf *pf) { } + +static inline int +ice_eswitch_configure(struct ice_pf *pf) +{ + return 0; +} + +static inline bool +ice_is_eswitch_mode_switchdev(struct ice_pf __always_unused *pf) +{ + return false; +} + +static inline int +ice_eswitch_rebuild(struct ice_pf __always_unused *pf) +{ + return 0; +} + +static inline netdev_tx_t +ice_eswitch_port_start_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + return 0; +} + +static inline struct net_device * +ice_eswitch_get_target_netdev(struct ice_ring *rx_ring, + union ice_32b_rx_flex_desc *rx_desc) +{ + return NULL; +} +#endif /* CONFIG_NET_DEVLINK */ +#endif diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index fc9ff985a62b..9d9adef809dd 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -1,64 +1,111 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ /* ethtool support for ice */ #include "ice.h" +#include "ice_ethtool.h" #include "ice_lib.h" +#include "ice_fltr.h" #include "ice_dcb_lib.h" - -struct ice_stats { - char stat_string[ETH_GSTRING_LEN]; - int sizeof_stat; - int stat_offset; -}; - -#define ICE_STAT(_type, _name, _stat) { \ - .stat_string = _name, \ - .sizeof_stat = FIELD_SIZEOF(_type, _stat), \ - .stat_offset = offsetof(_type, _stat) \ -} - -#define ICE_VSI_STAT(_name, _stat) \ - ICE_STAT(struct ice_vsi, _name, _stat) -#define ICE_PF_STAT(_name, _stat) \ - ICE_STAT(struct ice_pf, _name, _stat) +#include "ice_dcb_nl.h" static int ice_q_stats_len(struct net_device *netdev) { struct ice_netdev_priv *np = netdev_priv(netdev); - - return ((np->vsi->alloc_txq + np->vsi->alloc_rxq) * - (sizeof(struct ice_q_stats) / sizeof(u64))); + int stats_size, total_slen = 0; + +#ifdef ADQ_PERF_COUNTERS + /* Tx stats */ + stats_size = sizeof(struct ice_q_stats) + + sizeof(struct ice_ch_q_poll_stats) + + sizeof(struct ice_ch_tx_q_stats); + total_slen += np->vsi->alloc_txq * (stats_size / sizeof(u64)); + + /* Rx stats */ + stats_size = sizeof(struct ice_q_stats) + + sizeof(struct ice_ch_q_poll_stats) + + sizeof(struct ice_ch_rx_q_stats); + total_slen += np->vsi->alloc_rxq * (stats_size / sizeof(u64)); + + stats_size = sizeof(struct ice_q_vector_ch_stats); + total_slen += np->vsi->alloc_rxq * (stats_size / sizeof(u64)); +#ifdef ICE_ADD_PROBES +#ifdef HAVE_NETDEV_SB_DEV + + stats_size = sizeof(struct ice_q_stats); + total_slen += (ICE_MAX_MACVLANS * 2) * (stats_size / sizeof(u64)); + /* the napi_poll_cnt isn't included in the MACVLAN stats so reduce + * the count by that many so the stats get printed correctly + */ + total_slen -= ICE_MAX_MACVLANS * 2; +#endif /* HAVE_NETDEV_SB_DEV */ +#endif /* ICE_ADD_PROBES */ +#else + stats_size = sizeof(struct ice_q_stats); + + total_slen += np->vsi->alloc_txq * (stats_size / sizeof(u64)); + total_slen += np->vsi->alloc_rxq * (stats_size / sizeof(u64)); +#ifdef ICE_ADD_PROBES +#ifdef HAVE_NETDEV_SB_DEV + total_slen += (ICE_MAX_MACVLANS * 2) * (stats_size / sizeof(u64)); + /* the napi_poll_cnt isn't included in the MACVLAN stats so reduce + * the count by that many so the stats get printed correctly + */ + total_slen -= ICE_MAX_MACVLANS * 2; +#endif /* HAVE_NETDEV_SB_DEV */ +#endif /* ICE_ADD_PROBES */ +#endif /* ADQ_PERF_COUNTERS */ +#ifdef HAVE_XDP_SUPPORT +#ifdef ICE_ADD_PROBES + stats_size = sizeof(struct ice_q_stats); + total_slen += np->vsi->num_xdp_txq * (stats_size / sizeof(u64)); + + stats_size = sizeof(struct ice_xdp_stats); + total_slen += np->vsi->alloc_rxq * (stats_size / sizeof(u64)); +#endif +#endif + + return total_slen; } #define ICE_PF_STATS_LEN ARRAY_SIZE(ice_gstrings_pf_stats) #define ICE_VSI_STATS_LEN ARRAY_SIZE(ice_gstrings_vsi_stats) #define ICE_PFC_STATS_LEN ( \ - (FIELD_SIZEOF(struct ice_pf, stats.priority_xoff_rx) + \ - FIELD_SIZEOF(struct ice_pf, stats.priority_xon_rx) + \ - FIELD_SIZEOF(struct ice_pf, stats.priority_xoff_tx) + \ - FIELD_SIZEOF(struct ice_pf, stats.priority_xon_tx)) \ + (sizeof_field(struct ice_pf, stats.priority_xoff_rx) + \ + sizeof_field(struct ice_pf, stats.priority_xon_rx) + \ + sizeof_field(struct ice_pf, stats.priority_xoff_tx) + \ + sizeof_field(struct ice_pf, stats.priority_xon_tx)) \ / sizeof(u64)) #define ICE_ALL_STATS_LEN(n) (ICE_PF_STATS_LEN + ICE_PFC_STATS_LEN + \ ICE_VSI_STATS_LEN + ice_q_stats_len(n)) static const struct ice_stats ice_gstrings_vsi_stats[] = { - ICE_VSI_STAT("rx_unicast", eth_stats.rx_unicast), - ICE_VSI_STAT("tx_unicast", 
eth_stats.tx_unicast), - ICE_VSI_STAT("rx_multicast", eth_stats.rx_multicast), - ICE_VSI_STAT("tx_multicast", eth_stats.tx_multicast), - ICE_VSI_STAT("rx_broadcast", eth_stats.rx_broadcast), - ICE_VSI_STAT("tx_broadcast", eth_stats.tx_broadcast), - ICE_VSI_STAT("rx_bytes", eth_stats.rx_bytes), - ICE_VSI_STAT("tx_bytes", eth_stats.tx_bytes), - ICE_VSI_STAT("rx_dropped", eth_stats.rx_discards), - ICE_VSI_STAT("rx_unknown_protocol", eth_stats.rx_unknown_protocol), - ICE_VSI_STAT("rx_alloc_fail", rx_buf_failed), - ICE_VSI_STAT("rx_pg_alloc_fail", rx_page_failed), - ICE_VSI_STAT("tx_errors", eth_stats.tx_errors), - ICE_VSI_STAT("tx_linearize", tx_linearize), + ICE_VSI_STAT(ICE_RX_UNICAST, eth_stats.rx_unicast), + ICE_VSI_STAT(ICE_TX_UNICAST, eth_stats.tx_unicast), + ICE_VSI_STAT(ICE_RX_MULTICAST, eth_stats.rx_multicast), + ICE_VSI_STAT(ICE_TX_MULTICAST, eth_stats.tx_multicast), + ICE_VSI_STAT(ICE_RX_BROADCAST, eth_stats.rx_broadcast), + ICE_VSI_STAT(ICE_TX_BROADCAST, eth_stats.tx_broadcast), + ICE_VSI_STAT(ICE_RX_BYTES, eth_stats.rx_bytes), + ICE_VSI_STAT(ICE_TX_BYTES, eth_stats.tx_bytes), + ICE_VSI_STAT(ICE_RX_DROPPED, eth_stats.rx_discards), + ICE_VSI_STAT(ICE_RX_UNKNOWN_PROTO, eth_stats.rx_unknown_protocol), + ICE_VSI_STAT(ICE_RX_ALLOC_FAIL, rx_buf_failed), + ICE_VSI_STAT(ICE_RX_PAGE_ALLOC_FAIL, rx_page_failed), +#ifdef ICE_ADD_PROBES + ICE_VSI_STAT(ICE_RX_PAGE_REUSE, rx_page_reuse), +#endif /* ICE_ADD_PROBES */ + ICE_VSI_STAT(ICE_TX_ERRORS, eth_stats.tx_errors), + ICE_VSI_STAT(ICE_TX_LINEARIZE, tx_linearize), + ICE_VSI_STAT(ICE_TX_BUSY, tx_busy), + ICE_VSI_STAT(ICE_TX_RESTART, tx_restart), +#ifdef ADQ_PERF_COUNTERS + ICE_VSI_STAT("chnl_trans_inline_fd", cnt_inline_fd_transition), + ICE_VSI_STAT("chnl_fd_table_flushed", cnt_table_flushed), + ICE_VSI_STAT("chnl_fd_table_full", cnt_tbl_full), +#endif /* ADQ_PERF_COUNTERS */ }; enum ice_ethtool_test_id { @@ -89,45 +136,77 @@ static const char ice_gstrings_test[][ETH_GSTRING_LEN] = { * is queried on the base PF netdev. 
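ice_q_stats_len() above derives the per-queue counter count from struct sizes divided by sizeof(u64), so the reported length tracks the statistics structures automatically. A small stand-alone sketch of that arithmetic, where struct q_stats is a stand-in rather than the driver's layout:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

struct q_stats {                /* stand-in for a per-queue stats struct */
        u64 pkts;
        u64 bytes;
};

int main(void)
{
        unsigned int txq = 8, rxq = 8;
        size_t per_q = sizeof(struct q_stats) / sizeof(u64);    /* = 2 */
        size_t total = (txq + rxq) * per_q;

        printf("u64 slots per queue: %zu, total: %zu\n", per_q, total);
        return 0;
}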
*/ static const struct ice_stats ice_gstrings_pf_stats[] = { - ICE_PF_STAT("rx_bytes.nic", stats.eth.rx_bytes), - ICE_PF_STAT("tx_bytes.nic", stats.eth.tx_bytes), - ICE_PF_STAT("rx_unicast.nic", stats.eth.rx_unicast), - ICE_PF_STAT("tx_unicast.nic", stats.eth.tx_unicast), - ICE_PF_STAT("rx_multicast.nic", stats.eth.rx_multicast), - ICE_PF_STAT("tx_multicast.nic", stats.eth.tx_multicast), - ICE_PF_STAT("rx_broadcast.nic", stats.eth.rx_broadcast), - ICE_PF_STAT("tx_broadcast.nic", stats.eth.tx_broadcast), - ICE_PF_STAT("tx_errors.nic", stats.eth.tx_errors), - ICE_PF_STAT("rx_size_64.nic", stats.rx_size_64), - ICE_PF_STAT("tx_size_64.nic", stats.tx_size_64), - ICE_PF_STAT("rx_size_127.nic", stats.rx_size_127), - ICE_PF_STAT("tx_size_127.nic", stats.tx_size_127), - ICE_PF_STAT("rx_size_255.nic", stats.rx_size_255), - ICE_PF_STAT("tx_size_255.nic", stats.tx_size_255), - ICE_PF_STAT("rx_size_511.nic", stats.rx_size_511), - ICE_PF_STAT("tx_size_511.nic", stats.tx_size_511), - ICE_PF_STAT("rx_size_1023.nic", stats.rx_size_1023), - ICE_PF_STAT("tx_size_1023.nic", stats.tx_size_1023), - ICE_PF_STAT("rx_size_1522.nic", stats.rx_size_1522), - ICE_PF_STAT("tx_size_1522.nic", stats.tx_size_1522), - ICE_PF_STAT("rx_size_big.nic", stats.rx_size_big), - ICE_PF_STAT("tx_size_big.nic", stats.tx_size_big), - ICE_PF_STAT("link_xon_rx.nic", stats.link_xon_rx), - ICE_PF_STAT("link_xon_tx.nic", stats.link_xon_tx), - ICE_PF_STAT("link_xoff_rx.nic", stats.link_xoff_rx), - ICE_PF_STAT("link_xoff_tx.nic", stats.link_xoff_tx), - ICE_PF_STAT("tx_dropped_link_down.nic", stats.tx_dropped_link_down), - ICE_PF_STAT("rx_undersize.nic", stats.rx_undersize), - ICE_PF_STAT("rx_fragments.nic", stats.rx_fragments), - ICE_PF_STAT("rx_oversize.nic", stats.rx_oversize), - ICE_PF_STAT("rx_jabber.nic", stats.rx_jabber), - ICE_PF_STAT("rx_csum_bad.nic", hw_csum_rx_error), - ICE_PF_STAT("rx_length_errors.nic", stats.rx_len_errors), - ICE_PF_STAT("rx_dropped.nic", stats.eth.rx_discards), - ICE_PF_STAT("rx_crc_errors.nic", stats.crc_errors), - ICE_PF_STAT("illegal_bytes.nic", stats.illegal_bytes), - ICE_PF_STAT("mac_local_faults.nic", stats.mac_local_faults), - ICE_PF_STAT("mac_remote_faults.nic", stats.mac_remote_faults), + ICE_PF_STAT(ICE_PORT_RX_BYTES, stats.eth.rx_bytes), + ICE_PF_STAT(ICE_PORT_TX_BYTES, stats.eth.tx_bytes), + ICE_PF_STAT(ICE_PORT_RX_UNICAST, stats.eth.rx_unicast), + ICE_PF_STAT(ICE_PORT_TX_UNICAST, stats.eth.tx_unicast), + ICE_PF_STAT(ICE_PORT_RX_MULTICAST, stats.eth.rx_multicast), + ICE_PF_STAT(ICE_PORT_TX_MULTICAST, stats.eth.tx_multicast), + ICE_PF_STAT(ICE_PORT_RX_BROADCAST, stats.eth.rx_broadcast), + ICE_PF_STAT(ICE_PORT_TX_BROADCAST, stats.eth.tx_broadcast), + ICE_PF_STAT(ICE_PORT_TX_ERRORS, stats.eth.tx_errors), + ICE_PF_STAT(ICE_PORT_TX_TIMEOUT, tx_timeout_count), + ICE_PF_STAT(ICE_PORT_RX_SIZE_64, stats.rx_size_64), + ICE_PF_STAT(ICE_PORT_TX_SIZE_64, stats.tx_size_64), + ICE_PF_STAT(ICE_PORT_RX_SIZE_127, stats.rx_size_127), + ICE_PF_STAT(ICE_PORT_TX_SIZE_127, stats.tx_size_127), + ICE_PF_STAT(ICE_PORT_RX_SIZE_255, stats.rx_size_255), + ICE_PF_STAT(ICE_PORT_TX_SIZE_255, stats.tx_size_255), + ICE_PF_STAT(ICE_PORT_RX_SIZE_511, stats.rx_size_511), + ICE_PF_STAT(ICE_PORT_TX_SIZE_511, stats.tx_size_511), + ICE_PF_STAT(ICE_PORT_RX_SIZE_1023, stats.rx_size_1023), + ICE_PF_STAT(ICE_PORT_TX_SIZE_1023, stats.tx_size_1023), + ICE_PF_STAT(ICE_PORT_RX_SIZE_1522, stats.rx_size_1522), + ICE_PF_STAT(ICE_PORT_TX_SIZE_1522, stats.tx_size_1522), + ICE_PF_STAT(ICE_PORT_RX_SIZE_JUMBO, stats.rx_size_big), + 
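The ICE_VSI_STAT/ICE_PF_STAT entries above record a display name together with the field's offset and size, letting one generic loop copy every counter out of the owning structure (as the ethtool stats callback does further down). A self-contained illustration of that table-driven pattern; struct pf_counters and its field names are hypothetical:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct pf_counters {            /* hypothetical stats container */
        uint64_t rx_bytes;
        uint64_t tx_bytes;
        uint32_t crc_errors;
};

struct stat_desc {
        const char *name;
        size_t size;
        size_t offset;
};

#define STAT(_name, _field) \
        { _name, sizeof(((struct pf_counters *)0)->_field), \
          offsetof(struct pf_counters, _field) }

static const struct stat_desc stats[] = {
        STAT("rx_bytes", rx_bytes),
        STAT("tx_bytes", tx_bytes),
        STAT("rx_crc_errors", crc_errors),
};

int main(void)
{
        struct pf_counters pf = { 1000, 2000, 3 };
        size_t i;

        for (i = 0; i < sizeof(stats) / sizeof(stats[0]); i++) {
                const char *p = (const char *)&pf + stats[i].offset;
                uint64_t val = stats[i].size == sizeof(uint64_t) ?
                               *(const uint64_t *)p : *(const uint32_t *)p;

                printf("%-16s %llu\n", stats[i].name,
                       (unsigned long long)val);
        }
        return 0;
}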
ICE_PF_STAT(ICE_PORT_TX_SIZE_JUMBO, stats.tx_size_big), + ICE_PF_STAT(ICE_PORT_RX_LINK_XON, stats.link_xon_rx), + ICE_PF_STAT(ICE_PORT_TX_LINK_XON, stats.link_xon_tx), + ICE_PF_STAT(ICE_PORT_RX_LINK_XOFF, stats.link_xoff_rx), + ICE_PF_STAT(ICE_PORT_TX_LINK_XOFF, stats.link_xoff_tx), + ICE_PF_STAT(ICE_PORT_TX_DROP_LINK_DOWN, stats.tx_dropped_link_down), + ICE_PF_STAT(ICE_PORT_RX_UNDERSIZE, stats.rx_undersize), + ICE_PF_STAT(ICE_PORT_RX_FRAGMENTS, stats.rx_fragments), + ICE_PF_STAT(ICE_PORT_RX_OVERSIZE, stats.rx_oversize), + ICE_PF_STAT(ICE_PORT_RX_JABBER, stats.rx_jabber), + ICE_PF_STAT(ICE_PORT_RX_CSUM_BAD, hw_csum_rx_error), + ICE_PF_STAT(ICE_PORT_RX_LEN_ERRORS, stats.rx_len_errors), + ICE_PF_STAT(ICE_PORT_RX_DROPPED, stats.eth.rx_discards), + ICE_PF_STAT(ICE_PORT_RX_CRC_ERRORS, stats.crc_errors), + ICE_PF_STAT(ICE_PORT_ILLEGAL_BYTES, stats.illegal_bytes), + ICE_PF_STAT(ICE_PORT_MAC_LOCAL_FAULTS, stats.mac_local_faults), + ICE_PF_STAT(ICE_PORT_MAC_REMOTE_FAULTS, stats.mac_remote_faults), +#ifdef ICE_ADD_PROBES + ICE_PF_STAT(ICE_PORT_TX_TCP_SEGMENTS, tcp_segs), + ICE_PF_STAT(ICE_PORT_TX_UDP_SEGMENTS, udp_segs), + ICE_PF_STAT(ICE_PORT_RX_TCP_CSO, rx_tcp_cso), + ICE_PF_STAT(ICE_PORT_TX_TCP_CSO, tx_tcp_cso), + ICE_PF_STAT(ICE_PORT_RX_UDP_CSO, rx_udp_cso), + ICE_PF_STAT(ICE_PORT_TX_UDP_CSO, tx_udp_cso), + ICE_PF_STAT(ICE_PORT_RX_SCTP_CSO, rx_sctp_cso), + ICE_PF_STAT(ICE_PORT_TX_SCTP_CSO, tx_sctp_cso), + ICE_PF_STAT(ICE_PORT_RX_IP4_CSO, rx_ip4_cso), + ICE_PF_STAT(ICE_PORT_TX_IP4_CSO, tx_ip4_cso), + ICE_PF_STAT(ICE_PORT_RX_IP4_CSO_ERROR, rx_ip4_cso_err), + ICE_PF_STAT(ICE_PORT_RX_TCP_CSO_ERROR, rx_tcp_cso_err), + ICE_PF_STAT(ICE_PORT_RX_UDP_CSO_ERROR, rx_udp_cso_err), + ICE_PF_STAT(ICE_PORT_RX_SCTP_CSO_ERROR, rx_sctp_cso_err), + ICE_PF_STAT(ICE_PORT_TX_L3_CSO_ERROR, tx_l3_cso_err), + ICE_PF_STAT(ICE_PORT_TX_L4_CSO_ERROR, tx_l4_cso_err), + ICE_PF_STAT(ICE_PORT_RX_Q_VLANO, rx_q_vlano), + ICE_PF_STAT(ICE_PORT_TX_Q_VLANO, tx_q_vlano), + ICE_PF_STAT(ICE_PORT_RX_AD_VLANO, rx_ad_vlano), + ICE_PF_STAT(ICE_PORT_TX_AD_VLANO, tx_ad_vlano), +#endif + ICE_PF_STAT(ICE_PORT_FDIR_SB_MATCH, stats.fd_sb_match), + ICE_PF_STAT(ICE_PORT_FDIR_SB_STATUS, stats.fd_sb_status), + ICE_PF_STAT("chnl_inline_fd_match", stats.ch_atr_match), +#ifdef ICE_ADD_PROBES + ICE_PF_STAT(ICE_PORT_ARFS_TCPV4_MATCH, stats.arfs_tcpv4_match), + ICE_PF_STAT(ICE_PORT_ARFS_TCPV6_MATCH, stats.arfs_tcpv6_match), + ICE_PF_STAT(ICE_PORT_ARFS_UDP4_MATCH, stats.arfs_udpv4_match), + ICE_PF_STAT(ICE_PORT_ARFS_UDP6_MATCH, stats.arfs_udpv6_match), +#endif /* ICE_ADD_PROBES */ }; static const u32 ice_regs_dump_list[] = { @@ -138,9 +217,6 @@ static const u32 ice_regs_dump_list[] = { QINT_RQCTL(0), PFINT_OICR_ENA, QRX_ITR(0), - PF0INT_ITR_0(0), - PF0INT_ITR_1(0), - PF0INT_ITR_2(0), }; struct ice_priv_flag { @@ -155,27 +231,98 @@ struct ice_priv_flag { static const struct ice_priv_flag ice_gstrings_priv_flags[] = { ICE_PRIV_FLAG("link-down-on-close", ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA), +#ifndef ETHTOOL_GFECPARAM + ICE_PRIV_FLAG("rs-fec", ICE_FLAG_RS_FEC), + ICE_PRIV_FLAG("base-r-fec", ICE_FLAG_BASE_R_FEC), +#endif /* !ETHTOOL_GFECPARAM */ ICE_PRIV_FLAG("fw-lldp-agent", ICE_FLAG_FW_LLDP_AGENT), +#ifdef NETIF_F_HW_TC + ICE_PRIV_FLAG("channel-inline-flow-director", + ICE_FLAG_CHNL_INLINE_FD_ENA), + ICE_PRIV_FLAG("channel-inline-fd-mark", + ICE_FLAG_CHNL_INLINE_FD_MARK_ENA), + ICE_PRIV_FLAG("channel-pkt-inspect-optimize", + ICE_FLAG_CHNL_PKT_INSPECT_OPT_ENA), + ICE_PRIV_FLAG("channel-pkt-clean-bp-stop", + ICE_FLAG_CHNL_PKT_CLEAN_BP_STOP_ENA), + 
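Each ICE_PRIV_FLAG entry in the table below simply binds a user-visible name to a feature bit, and the get path walks that table to build the u32 ethtool bitmap. A sketch of the same translation; the flag names are taken from the table, but the bit positions used here are examples only:

#include <stdint.h>
#include <stdio.h>

struct priv_flag {              /* mirrors the {name, bit} pairing */
        const char *name;
        unsigned int bitno;
};

/* Bit numbers are illustrative; the driver's enum is the authority. */
static const struct priv_flag flags_tbl[] = {
        { "link-down-on-close", 0 },
        { "fw-lldp-agent",      1 },
        { "legacy-rx",          2 },
};

#define N_FLAGS (sizeof(flags_tbl) / sizeof(flags_tbl[0]))

/* Build the u32 ethtool bitmap from an internal feature bitmap. */
static uint32_t get_priv_flags(uint64_t pf_flags)
{
        uint32_t ret = 0;
        size_t i;

        for (i = 0; i < N_FLAGS; i++)
                if (pf_flags & (1ULL << flags_tbl[i].bitno))
                        ret |= 1U << i;
        return ret;
}

int main(void)
{
        uint64_t pf_flags = (1ULL << 1) | (1ULL << 2);  /* lldp + legacy-rx */

        printf("ethtool priv flags bitmap: 0x%x\n",
               (unsigned int)get_priv_flags(pf_flags));
        return 0;
}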
ICE_PRIV_FLAG("channel-pkt-clean-bp-stop-cfg", + ICE_FLAG_CHNL_PKT_CLEAN_BP_STOP_CFG), +#endif /* NETIF_F_HW_TC */ + ICE_PRIV_FLAG("vf-true-promisc-support", + ICE_FLAG_VF_TRUE_PROMISC_ENA), + ICE_PRIV_FLAG("mdd-auto-reset-vf", ICE_FLAG_MDD_AUTO_RESET_VF), + ICE_PRIV_FLAG("vf-vlan-prune-disable", ICE_FLAG_VF_VLAN_PRUNE_DIS), + ICE_PRIV_FLAG("legacy-rx", ICE_FLAG_LEGACY_RX), }; #define ICE_PRIV_FLAG_ARRAY_SIZE ARRAY_SIZE(ice_gstrings_priv_flags) static void -ice_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) +__ice_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo, + struct ice_vsi *vsi) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + struct ice_orom_info *orom; + struct ice_nvm_info *nvm; + + nvm = &hw->flash.nvm; + orom = &hw->flash.orom; + + strscpy(drvinfo->driver, KBUILD_MODNAME, sizeof(drvinfo->driver)); + strscpy(drvinfo->version, ice_drv_ver, sizeof(drvinfo->version)); + + /* Display NVM version (from which the firmware version can be + * determined) which contains more pertinent information. + */ + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%x.%02x 0x%x %d.%d.%d", nvm->major, nvm->minor, + nvm->eetrack, orom->major, orom->build, orom->patch); + + /* When called via 'ethtool -i|--driver ', log the above with + * additional Netlist version information as a kernel message since it + * will not all fit in the 32-byte fixed-length buffer. + */ + if (!strncmp(current->comm, "ethtool", 7)) { + struct ice_netlist_info *netlist = &hw->flash.netlist; + + /* The netlist versions are stored in packed BCD format */ + netdev_info(netdev, "NVM version details - %x.%02x, 0x%x, %x.%x.%x-%x.%x.%x.%08x, %d.%d.%d\n", + nvm->major, nvm->minor, nvm->eetrack, + netlist->major, netlist->minor, + netlist->type >> 16, netlist->type & 0xffff, + netlist->rev, netlist->cust_ver, netlist->hash, + orom->major, orom->build, orom->patch); + } - strlcpy(drvinfo->driver, KBUILD_MODNAME, sizeof(drvinfo->driver)); - strlcpy(drvinfo->version, ice_drv_ver, sizeof(drvinfo->version)); - strlcpy(drvinfo->fw_version, ice_nvm_version_str(&pf->hw), - sizeof(drvinfo->fw_version)); - strlcpy(drvinfo->bus_info, pci_name(pf->pdev), + strscpy(drvinfo->bus_info, pci_name(pf->pdev), sizeof(drvinfo->bus_info)); + + if (test_bit(ICE_RECOVERY_MODE, pf->state)) + return; + drvinfo->n_priv_flags = ICE_PRIV_FLAG_ARRAY_SIZE; } +static void +ice_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + + __ice_get_drvinfo(netdev, drvinfo, np->vsi); +} + +static void +ice_repr_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) +{ + struct ice_repr *repr = ice_netdev_to_repr(netdev); + + if (ice_check_vf_ready_for_cfg(repr->vf)) + return; + + __ice_get_drvinfo(netdev, drvinfo, repr->src_vsi); +} + static int ice_get_regs_len(struct net_device __always_unused *netdev) { return sizeof(ice_regs_dump_list); @@ -188,7 +335,7 @@ ice_get_regs(struct net_device *netdev, struct ethtool_regs *regs, void *p) struct ice_pf *pf = np->vsi->back; struct ice_hw *hw = &pf->hw; u32 *regs_buf = (u32 *)p; - int i; + unsigned int i; regs->version = 1; @@ -230,7 +377,8 @@ static int ice_get_eeprom_len(struct net_device *netdev) struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_pf *pf = np->vsi->back; - return (int)(pf->hw.nvm.sr_words * sizeof(u16)); + /* Report the flash size, or at least 10MB */ 
+ return max_t(int, pf->hw.flash.flash_size, 10 * 1024 * 1024); } static int @@ -238,42 +386,121 @@ ice_get_eeprom(struct net_device *netdev, struct ethtool_eeprom *eeprom, u8 *bytes) { struct ice_netdev_priv *np = netdev_priv(netdev); - u16 first_word, last_word, nwords; struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; struct ice_hw *hw = &pf->hw; enum ice_status status; struct device *dev; int ret = 0; - u16 *buf; + u32 magic; + u8 *buf; + + dev = ice_pf_to_dev(pf); - dev = &pf->pdev->dev; + magic = hw->vendor_id | (hw->device_id << 16); + if (eeprom->magic && eeprom->magic != magic) { + struct ice_nvm_access_cmd *nvm; + union ice_nvm_access_data *data; + + nvm = (struct ice_nvm_access_cmd *)eeprom; + data = (union ice_nvm_access_data *)bytes; + + netdev_dbg(netdev, "GEEPROM config 0x%08x, offset 0x%08x, data_size 0x%08x\n", + nvm->config, nvm->offset, nvm->data_size); + + status = ice_handle_nvm_access(hw, nvm, data); + + ice_debug_array(hw, ICE_DBG_NVM, 16, 1, (u8 *)data, + nvm->data_size); + + if (status) { + int err = ice_status_to_errno(status); + + netdev_err(netdev, "NVM read offset 0x%x failed with status %s, error %d\n", + nvm->offset, ice_stat_str(status), err); + + return err; + } - eeprom->magic = hw->vendor_id | (hw->device_id << 16); + return 0; + } - first_word = eeprom->offset >> 1; - last_word = (eeprom->offset + eeprom->len - 1) >> 1; - nwords = last_word - first_word + 1; + eeprom->magic = magic; + netdev_dbg(netdev, "GEEPROM offset 0x%08x, len 0x%08x\n", + eeprom->offset, eeprom->len); - buf = devm_kcalloc(dev, nwords, sizeof(u16), GFP_KERNEL); + buf = kzalloc(eeprom->len, GFP_KERNEL); if (!buf) return -ENOMEM; - status = ice_read_sr_buf(hw, first_word, &nwords, buf); + status = ice_acquire_nvm(hw, ICE_RES_READ); if (status) { - dev_err(dev, "ice_read_sr_buf failed, err %d aq_err %d\n", - status, hw->adminq.sq_last_status); - eeprom->len = sizeof(u16) * nwords; + dev_err(dev, "ice_acquire_nvm failed: %s %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); ret = -EIO; goto out; } - memcpy(bytes, (u8 *)buf + (eeprom->offset & 1), eeprom->len); + status = ice_read_flat_nvm(hw, eeprom->offset, &eeprom->len, buf, + false); + if (status) { + dev_err(dev, "ice_read_flat_nvm failed: %s %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + ret = -EIO; + goto release; + } + + memcpy(bytes, buf, eeprom->len); +release: + ice_release_nvm(hw); out: - devm_kfree(dev, buf); + kfree(buf); return ret; } +static int +ice_set_eeprom(struct net_device *netdev, struct ethtool_eeprom *eeprom, + u8 *bytes) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_hw *hw = &np->vsi->back->hw; + struct ice_pf *pf = np->vsi->back; + union ice_nvm_access_data *data; + struct ice_nvm_access_cmd *nvm; + enum ice_status status = 0; + int err = 0; + u32 magic; + + /* normal ethtool set_eeprom is not supported */ + nvm = (struct ice_nvm_access_cmd *)eeprom; + data = (union ice_nvm_access_data *)bytes; + magic = hw->vendor_id | (hw->device_id << 16); + + netdev_dbg(netdev, "SEEPROM cmd 0x%08x, config 0x%08x, offset 0x%08x, data_size 0x%08x\n", + nvm->command, nvm->config, nvm->offset, nvm->data_size); + ice_debug_array(hw, ICE_DBG_NVM, 16, 1, (u8 *)data, nvm->data_size); + + if (eeprom->magic == magic) + err = -EOPNOTSUPP; + /* check for NVM access method */ + else if (!eeprom->magic || (eeprom->magic >> 16) != hw->device_id) + err = -EINVAL; + else if (ice_is_reset_in_progress(pf->state)) + err = -EBUSY; + else + status = 
ice_handle_nvm_access(hw, nvm, data); + + if (status) { + err = ice_status_to_errno(status); + netdev_err(netdev, "NVM write offset 0x%x failed with status %s, error %d\n", + nvm->offset, ice_stat_str(status), err); + } + + return err; +} + /** * ice_active_vfs - check if there are any active VFs * @pf: board private structure @@ -282,12 +509,15 @@ ice_get_eeprom(struct net_device *netdev, struct ethtool_eeprom *eeprom, */ static bool ice_active_vfs(struct ice_pf *pf) { - struct ice_vf *vf = pf->vf; - int i; + unsigned int i; + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; - for (i = 0; i < pf->num_alloc_vfs; i++, vf++) if (test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) return true; + } + return false; } @@ -307,7 +537,8 @@ static u64 ice_link_test(struct net_device *netdev) netdev_info(netdev, "link test\n"); status = ice_get_link_status(np->vsi->port_info, &link_up); if (status) { - netdev_err(netdev, "link query error, status = %d\n", status); + netdev_err(netdev, "link query error, status = %s\n", + ice_stat_str(status)); return 1; } @@ -341,14 +572,16 @@ static u64 ice_eeprom_test(struct net_device *netdev) */ static int ice_reg_pattern_test(struct ice_hw *hw, u32 reg, u32 mask) { - struct ice_pf *pf = (struct ice_pf *)hw->back; static const u32 patterns[] = { 0x5A5A5A5A, 0xA5A5A5A5, 0x00000000, 0xFFFFFFFF }; + struct ice_pf *pf = hw->back; + struct device *dev; u32 val, orig_val; - int i; + unsigned int i; + dev = ice_pf_to_dev(pf); orig_val = rd32(hw, reg); for (i = 0; i < ARRAY_SIZE(patterns); ++i) { u32 pattern = patterns[i] & mask; @@ -357,8 +590,7 @@ static int ice_reg_pattern_test(struct ice_hw *hw, u32 reg, u32 mask) val = rd32(hw, reg); if (val == pattern) continue; - dev_err(&pf->pdev->dev, - "%s: reg pattern test failed - reg 0x%08x pat 0x%08x val 0x%08x\n" + dev_err(dev, "%s: reg pattern test failed - reg 0x%08x pat 0x%08x val 0x%08x\n" , __func__, reg, pattern, val); return 1; } @@ -366,8 +598,7 @@ static int ice_reg_pattern_test(struct ice_hw *hw, u32 reg, u32 mask) wr32(hw, reg, orig_val); val = rd32(hw, reg); if (val != orig_val) { - dev_err(&pf->pdev->dev, - "%s: reg restore test failed - reg 0x%08x orig 0x%08x val 0x%08x\n" + dev_err(dev, "%s: reg restore test failed - reg 0x%08x orig 0x%08x val 0x%08x\n" , __func__, reg, orig_val, val); return 1; } @@ -402,7 +633,7 @@ static u64 ice_reg_test(struct net_device *netdev) GLINT_ITR(2, 1) - GLINT_ITR(2, 0)}, {GLINT_CTL, 0xffff0001, 1, 0} }; - int i; + unsigned int i; netdev_dbg(netdev, "Register test\n"); for (i = 0; i < ARRAY_SIZE(ice_reg_list); ++i) { @@ -447,7 +678,7 @@ static int ice_lbtest_prepare_rings(struct ice_vsi *vsi) if (status) goto err_setup_rx_ring; - status = ice_vsi_start_rx_rings(vsi); + status = ice_vsi_start_all_rx_rings(vsi); if (status) goto err_start_rx_ring; @@ -479,7 +710,7 @@ static int ice_lbtest_disable_rings(struct ice_vsi *vsi) netdev_err(vsi->netdev, "Failed to stop Tx rings, VSI %d error %d\n", vsi->vsi_num, status); - status = ice_vsi_stop_rx_rings(vsi); + status = ice_vsi_stop_all_rx_rings(vsi); if (status) netdev_err(vsi->netdev, "Failed to stop Rx rings, VSI %d error %d\n", vsi->vsi_num, status); @@ -506,7 +737,7 @@ static int ice_lbtest_create_frame(struct ice_pf *pf, u8 **ret_data, u16 size) if (!pf) return -EINVAL; - data = devm_kzalloc(&pf->pdev->dev, size, GFP_KERNEL); + data = devm_kzalloc(ice_pf_to_dev(pf), size, GFP_KERNEL); if (!data) return -ENOMEM; @@ -623,7 +854,7 @@ static int ice_lbtest_receive_frames(struct ice_ring *rx_ring) continue; rx_buf = 
&rx_ring->rx_buf[i]; - received_buf = page_address(rx_buf->page); + received_buf = page_address(rx_buf->page) + rx_buf->page_offset; if (ice_lbtest_check_frame(received_buf)) valid_frames++; @@ -647,15 +878,16 @@ static u64 ice_loopback_test(struct net_device *netdev) struct ice_ring *tx_ring, *rx_ring; u8 broadcast[ETH_ALEN], ret = 0; int num_frames, valid_frames; - LIST_HEAD(tmp_list); + struct device *dev; u8 *tx_frame; int i; + dev = ice_pf_to_dev(pf); netdev_info(netdev, "loopback test\n"); test_vsi = ice_lb_vsi_setup(pf, pf->hw.port_info); if (!test_vsi) { - netdev_err(netdev, "Failed to create a VSI for the loopback test"); + netdev_err(netdev, "Failed to create a VSI for the loopback test\n"); return 1; } @@ -681,16 +913,11 @@ static u64 ice_loopback_test(struct net_device *netdev) /* Test VSI needs to receive broadcast packets */ eth_broadcast_addr(broadcast); - if (ice_add_mac_to_list(test_vsi, &tmp_list, broadcast)) { + if (ice_fltr_add_mac(test_vsi, broadcast, ICE_FWD_TO_VSI)) { ret = 5; goto lbtest_mac_dis; } - if (ice_add_mac(&pf->hw, &tmp_list)) { - ret = 6; - goto free_mac_list; - } - if (ice_lbtest_create_frame(pf, &tx_frame, ICE_LB_FRAME_SIZE)) { ret = 7; goto remove_mac_filters; @@ -711,12 +938,10 @@ static u64 ice_loopback_test(struct net_device *netdev) ret = 10; lbtest_free_frame: - devm_kfree(&pf->pdev->dev, tx_frame); + devm_kfree(dev, tx_frame); remove_mac_filters: - if (ice_remove_mac(&pf->hw, &tmp_list)) - netdev_err(netdev, "Could not remove MAC filter for the test VSI"); -free_mac_list: - ice_free_fltr_list(&pf->pdev->dev, &tmp_list); + if (ice_fltr_remove_mac(test_vsi, broadcast, ICE_FWD_TO_VSI)) + netdev_err(netdev, "Could not remove MAC filter for the test VSI\n"); lbtest_mac_dis: /* Disable MAC loopback after the test is completed. 
*/ if (ice_aq_set_mac_loopback(&pf->hw, false, NULL)) @@ -727,7 +952,7 @@ static u64 ice_loopback_test(struct net_device *netdev) lbtest_vsi_close: test_vsi->netdev = NULL; if (ice_vsi_release(test_vsi)) - netdev_err(netdev, "Failed to remove the test VSI"); + netdev_err(netdev, "Failed to remove the test VSI\n"); return ret; } @@ -773,22 +998,24 @@ ice_self_test(struct net_device *netdev, struct ethtool_test *eth_test, struct ice_netdev_priv *np = netdev_priv(netdev); bool if_running = netif_running(netdev); struct ice_pf *pf = np->vsi->back; + struct device *dev; + + dev = ice_pf_to_dev(pf); if (eth_test->flags == ETH_TEST_FL_OFFLINE) { netdev_info(netdev, "offline testing starting\n"); - set_bit(__ICE_TESTING, pf->state); + set_bit(ICE_TESTING, pf->state); - if (ice_active_vfs(pf)) { - dev_warn(&pf->pdev->dev, - "Please take active VFs and Netqueues offline and restart the adapter before running NIC diagnostics\n"); + if (ice_active_vfs(pf) || ice_active_vmdqs(pf)) { + dev_warn(dev, "Please take active VFs and Netqueues offline and restart the adapter before running NIC diagnostics\n"); data[ICE_ETH_TEST_REG] = 1; data[ICE_ETH_TEST_EEPROM] = 1; data[ICE_ETH_TEST_INTR] = 1; data[ICE_ETH_TEST_LOOP] = 1; data[ICE_ETH_TEST_LINK] = 1; eth_test->flags |= ETH_TEST_FL_FAILED; - clear_bit(__ICE_TESTING, pf->state); + clear_bit(ICE_TESTING, pf->state); goto skip_ol_tests; } /* If the device is online then take it offline */ @@ -809,14 +1036,13 @@ ice_self_test(struct net_device *netdev, struct ethtool_test *eth_test, data[ICE_ETH_TEST_REG]) eth_test->flags |= ETH_TEST_FL_FAILED; - clear_bit(__ICE_TESTING, pf->state); + clear_bit(ICE_TESTING, pf->state); if (if_running) { int status = ice_open(netdev); if (status) { - dev_err(&pf->pdev->dev, - "Could not open device %s, err %d", + dev_err(dev, "Could not open device %s, err %d\n", pf->int_name, status); } } @@ -839,1441 +1065,2806 @@ ice_self_test(struct net_device *netdev, struct ethtool_test *eth_test, netdev_info(netdev, "testing finished\n"); } -static void ice_get_strings(struct net_device *netdev, u32 stringset, u8 *data) +#ifdef HAVE_XDP_SUPPORT +#ifdef ICE_ADD_PROBES +/** + * ice_get_xdp_rx_strings + * @q: queue index + * @num_rxq: number of rx queue + * @loc_in_buf: ptr to ptr to location in buffer (input and output param) + * + * This function returns Rx queue XDP related counters strings + */ +static void +ice_get_xdp_rx_strings(unsigned int q, u16 num_rxq, char **loc_in_buf) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - char *p = (char *)data; - unsigned int i; + char *p; - switch (stringset) { - case ETH_SS_STATS: - for (i = 0; i < ICE_VSI_STATS_LEN; i++) { - snprintf(p, ETH_GSTRING_LEN, "%s", - ice_gstrings_vsi_stats[i].stat_string); - p += ETH_GSTRING_LEN; - } + if (!loc_in_buf || q >= num_rxq) + return; - ice_for_each_alloc_txq(vsi, i) { - snprintf(p, ETH_GSTRING_LEN, - "tx_queue_%u_packets", i); - p += ETH_GSTRING_LEN; - snprintf(p, ETH_GSTRING_LEN, "tx_queue_%u_bytes", i); - p += ETH_GSTRING_LEN; - } + p = *loc_in_buf; + + snprintf(p, ETH_GSTRING_LEN, "xdp-rx_q-%u_pkts", q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "xdp-rx_q-%u_bytes", q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "xdp-rx-passed_q-%u_pkts", q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "xdp-rx-dropped_q-%u_pkts", q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "xdp-rx-tx_q-%u_pkts", q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "xdp-tx-fail_q-%u_pkts", q); + 
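The string helpers in this file emit one fixed-size ETH_GSTRING_LEN (32-byte) slot per counter and advance the caller's buffer pointer as they go. A compact stand-alone version of that slot-filling loop, with illustrative counter names:

#include <stdio.h>

#define GSTRING_LEN 32          /* ETH_GSTRING_LEN is 32 bytes */
#define NUM_RXQ     4
#define STATS_PER_Q 2           /* pkts + bytes in this sketch */

int main(void)
{
        static char strings[NUM_RXQ * STATS_PER_Q][GSTRING_LEN];
        char *p = strings[0];
        unsigned int q;

        /* One fixed-size slot per counter; names are illustrative. */
        for (q = 0; q < NUM_RXQ; q++) {
                snprintf(p, GSTRING_LEN, "rx_q-%u_pkts", q);
                p += GSTRING_LEN;
                snprintf(p, GSTRING_LEN, "rx_q-%u_bytes", q);
                p += GSTRING_LEN;
        }

        for (q = 0; q < NUM_RXQ * STATS_PER_Q; q++)
                printf("%s\n", strings[q]);
        return 0;
}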
p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "xdp-rx-unknown_q-%u_pkts", q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "xdp-rx-redirected_q-%u_pkts", q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "xdp-rx-redir-fail_q-%u_pkts", q); + p += ETH_GSTRING_LEN; + + /* copy back updated length */ + *loc_in_buf = p; +} - ice_for_each_alloc_rxq(vsi, i) { - snprintf(p, ETH_GSTRING_LEN, - "rx_queue_%u_packets", i); - p += ETH_GSTRING_LEN; - snprintf(p, ETH_GSTRING_LEN, "rx_queue_%u_bytes", i); - p += ETH_GSTRING_LEN; - } +/** + * ice_get_xdp_tx_strings + * @vsi: pointer to the VSI structure + * @loc_in_buf: ptr to ptr to location in buffer (input and output param) + * + * This function returns Tx queue XDP related counters strings + */ +static void +ice_get_xdp_tx_strings(struct ice_vsi *vsi, char **loc_in_buf) +{ + char *p; + u16 q; - if (vsi->type != ICE_VSI_PF) - return; + if (!vsi || !loc_in_buf) + return; - for (i = 0; i < ICE_PF_STATS_LEN; i++) { - snprintf(p, ETH_GSTRING_LEN, "%s", - ice_gstrings_pf_stats[i].stat_string); - p += ETH_GSTRING_LEN; - } + for (q = 0; q < vsi->num_xdp_txq; ++q) { + p = *loc_in_buf; - for (i = 0; i < ICE_MAX_USER_PRIORITY; i++) { - snprintf(p, ETH_GSTRING_LEN, - "tx_priority_%u_xon.nic", i); - p += ETH_GSTRING_LEN; - snprintf(p, ETH_GSTRING_LEN, - "tx_priority_%u_xoff.nic", i); - p += ETH_GSTRING_LEN; - } - for (i = 0; i < ICE_MAX_USER_PRIORITY; i++) { - snprintf(p, ETH_GSTRING_LEN, - "rx_priority_%u_xon.nic", i); - p += ETH_GSTRING_LEN; - snprintf(p, ETH_GSTRING_LEN, - "rx_priority_%u_xoff.nic", i); - p += ETH_GSTRING_LEN; - } - break; - case ETH_SS_TEST: - memcpy(data, ice_gstrings_test, ICE_TEST_LEN * ETH_GSTRING_LEN); - break; - case ETH_SS_PRIV_FLAGS: - for (i = 0; i < ICE_PRIV_FLAG_ARRAY_SIZE; i++) { - snprintf(p, ETH_GSTRING_LEN, "%s", - ice_gstrings_priv_flags[i].name); - p += ETH_GSTRING_LEN; - } - break; - default: - break; + snprintf(p, ETH_GSTRING_LEN, "xdp-tx_q-%u_pkts", q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "xdp-tx_q-%u_bytes", q); + p += ETH_GSTRING_LEN; + + /* copy back updated length */ + *loc_in_buf = p; } } -static int -ice_set_phys_id(struct net_device *netdev, enum ethtool_phys_id_state state) +/** + * ice_get_xdp_rx_stats - get stats for Rx rings if XDP is enabled + * @xdp_stats: ptr to stats being updated + * @data: ptr to data + * @idx: ptr to idx in data buffer (input/output param) + * + * This function reads XDP per-action Rx statistics. + */ +static void +ice_get_xdp_rx_stats(struct ice_xdp_stats *xdp_stats, u64 *data, int *idx) { - struct ice_netdev_priv *np = netdev_priv(netdev); - bool led_active; + bool set = !!xdp_stats; + int i; - switch (state) { - case ETHTOOL_ID_ACTIVE: - led_active = true; - break; - case ETHTOOL_ID_INACTIVE: - led_active = false; - break; - default: - return -EINVAL; - } + if (!idx) + return; - if (ice_aq_set_port_id_led(np->vsi->port_info, !led_active, NULL)) - return -EIO; + i = *idx; /* start index in data buffer */ - return 0; + data[i++] = set ? xdp_stats->xdp_rx_pkts : 0; + data[i++] = set ? xdp_stats->xdp_rx_bytes : 0; + data[i++] = set ? xdp_stats->xdp_pass : 0; + data[i++] = set ? xdp_stats->xdp_drop : 0; + data[i++] = set ? xdp_stats->xdp_tx : 0; + data[i++] = set ? xdp_stats->xdp_tx_fail : 0; + data[i++] = set ? xdp_stats->xdp_unknown : 0; + data[i++] = set ? xdp_stats->xdp_redirect : 0; + data[i++] = set ? 
xdp_stats->xdp_redirect_fail : 0; + + /* copy back updated index */ + *idx = i; } /** - * ice_set_fec_cfg - Set link FEC options - * @netdev: network interface device structure - * @req_fec: FEC mode to configure + * ice_get_xdp_tx_stats - get stats for XDP Tx rings if XDP is enabled + * @vsi: pointer to the VSI structure + * @data: ptr to data + * @idx: ptr to idx in data buffer (input/output param) + * + * This function reads XDP per-action Tx statistics. */ -static int ice_set_fec_cfg(struct net_device *netdev, enum ice_fec_mode req_fec) +static void +ice_get_xdp_tx_stats(struct ice_vsi *vsi, u64 *data, int *idx) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_aqc_set_phy_cfg_data config = { 0 }; - struct ice_aqc_get_phy_caps_data *caps; - struct ice_vsi *vsi = np->vsi; - u8 sw_cfg_caps, sw_cfg_fec; - struct ice_port_info *pi; - enum ice_status status; - int err = 0; - - pi = vsi->port_info; - if (!pi) - return -EOPNOTSUPP; + int i, q; - /* Changing the FEC parameters is not supported if not the PF VSI */ - if (vsi->type != ICE_VSI_PF) { - netdev_info(netdev, "Changing FEC parameters only supported for PF VSI\n"); - return -EOPNOTSUPP; - } + if (!idx || !vsi || !vsi->xdp_rings) + return; - /* Get last SW configuration */ - caps = devm_kzalloc(&vsi->back->pdev->dev, sizeof(*caps), GFP_KERNEL); - if (!caps) - return -ENOMEM; + i = *idx; /* start index in data buffer */ - status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_SW_CFG, - caps, NULL); - if (status) { - err = -EAGAIN; - goto done; - } + for (q = 0; q < vsi->num_xdp_txq; ++q) { + struct ice_q_stats *stats; + struct ice_ring *ring; - /* Copy SW configuration returned from PHY caps to PHY config */ - ice_copy_phy_caps_to_cfg(caps, &config); - sw_cfg_caps = caps->caps; - sw_cfg_fec = caps->link_fec_options; + ring = READ_ONCE(vsi->xdp_rings[q]); - /* Get toloplogy caps, then copy PHY FEC topoloy caps to PHY config */ - memset(caps, 0, sizeof(*caps)); + stats = !!ring ? &ring->stats : NULL; - status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP, - caps, NULL); - if (status) { - err = -EAGAIN; - goto done; + data[i++] = !!ring ? stats->pkts : 0; + data[i++] = !!ring ? stats->bytes : 0; } - config.caps |= (caps->caps & ICE_AQC_PHY_EN_AUTO_FEC); - config.link_fec_opt = caps->link_fec_options; - - ice_cfg_phy_fec(&config, req_fec); - - /* If FEC mode has changed, then set PHY configuration and enable AN. 
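ice_get_xdp_rx_stats() above always writes the same number of u64 slots, substituting zeros when the backing counters are absent, so the value array never drifts out of step with the string table. A brief sketch of that fixed-layout fill; struct xdp_counters is a stand-in for the driver's stats block:

#include <stdint.h>
#include <stdio.h>

struct xdp_counters {           /* stand-in, not the driver's struct */
        uint64_t rx_pkts;
        uint64_t rx_bytes;
        uint64_t redirects;
};

/* Always emit the same number of slots, zero-filled when the source
 * counters are absent, so values stay aligned with the string table.
 */
static int fill_xdp_stats(const struct xdp_counters *c, uint64_t *data, int idx)
{
        int set = c != NULL;

        data[idx++] = set ? c->rx_pkts : 0;
        data[idx++] = set ? c->rx_bytes : 0;
        data[idx++] = set ? c->redirects : 0;
        return idx;
}

int main(void)
{
        struct xdp_counters c = { 10, 2048, 1 };
        uint64_t data[6];
        int idx = 0;

        idx = fill_xdp_stats(&c, data, idx);    /* ring present */
        idx = fill_xdp_stats(NULL, data, idx);  /* ring absent: zeros */

        for (int i = 0; i < idx; i++)
                printf("data[%d] = %llu\n", i, (unsigned long long)data[i]);
        return 0;
}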
*/ - if ((config.caps & ICE_AQ_PHY_ENA_AUTO_FEC) != - (sw_cfg_caps & ICE_AQC_PHY_EN_AUTO_FEC) || - config.link_fec_opt != sw_cfg_fec) { - if (caps->caps & ICE_AQC_PHY_AN_MODE) - config.caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; + /* copy back updated index */ + *idx = i; +} +#endif /* ICE_ADD_PROBES */ +#endif /* HAVE_XDP_SUPPORT */ - status = ice_aq_set_phy_cfg(pi->hw, pi->lport, &config, NULL); +#ifdef ADQ_PERF_COUNTERS +/** + * ice_get_chnl_tx_strings + * @vsi: ptr to VSI + * @q: queue index + * @loc_in_buf: ptr to ptr to location in buffer (input and output param) + * + * This function returns Tx queue related strings for ADQ performance counters + */ +static void +ice_get_chnl_tx_strings(struct ice_vsi *vsi, unsigned int q, char **loc_in_buf) +{ + char *p; - if (status) - err = -EAGAIN; - } + if (!loc_in_buf) + return; + if (q >= vsi->num_txq) + return; -done: - devm_kfree(&vsi->back->pdev->dev, caps); - return err; + p = *loc_in_buf; + + /* Tx queue specific extra counters */ + snprintf(p, ETH_GSTRING_LEN, ICE_TXQ_BUSY_POLL, q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_TXQ_NOT_BUSY_POLL, q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_TXQ_ATR_SETUP, q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_TXQ_MARK_ATR_SETUP, q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_TXQ_ATR_TEARDOWN, q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_TXQ_ATR_BAIL, q); + p += ETH_GSTRING_LEN; + + /* copy back updated length */ + *loc_in_buf = p; } /** - * ice_set_fecparam - Set FEC link options - * @netdev: network interface device structure - * @fecparam: Ethtool structure to retrieve FEC parameters + * ice_get_chnl_tx_stats - get stats for Tx rings if channel enabled + * @vsi: ptr to VSI + * @q: queue index + * @data: ptr to data + * @idx: ptr to idx in data buffer (input/output param) + * @set: copy counters if true otherwise copy zero + * + * This function is used to collect performance counters for specific Tx ring. */ -static int -ice_set_fecparam(struct net_device *netdev, struct ethtool_fecparam *fecparam) +static void +ice_get_chnl_tx_stats(struct ice_vsi *vsi, int q, u64 *data, int *idx, bool set) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - enum ice_fec_mode fec; + struct ice_ch_q_stats *ch_stats; + struct ice_ring *tx_ring; + int i; - switch (fecparam->fec) { - case ETHTOOL_FEC_AUTO: - fec = ICE_FEC_AUTO; - break; - case ETHTOOL_FEC_RS: - fec = ICE_FEC_RS; - break; - case ETHTOOL_FEC_BASER: - fec = ICE_FEC_BASER; - break; - case ETHTOOL_FEC_OFF: - case ETHTOOL_FEC_NONE: - fec = ICE_FEC_NONE; - break; - default: - dev_warn(&vsi->back->pdev->dev, "Unsupported FEC mode: %d\n", - fecparam->fec); - return -EINVAL; - } + if (!idx) + return; + if (q >= vsi->num_txq) + return; - return ice_set_fec_cfg(netdev, fec); + tx_ring = vsi->tx_rings[q]; + ch_stats = &tx_ring->ch_q_stats; + set = set && ch_stats; + + i = *idx; /* start index in data buffer */ + + /* Tx queue specific extra counters */ + data[i++] = set ? ch_stats->poll.bp_packets : 0; + data[i++] = set ? ch_stats->poll.np_packets : 0; + data[i++] = set ? ch_stats->tx.num_atr_setup : 0; + data[i++] = set ? ch_stats->tx.num_mark_atr_setup : 0; + data[i++] = set ? ch_stats->tx.num_atr_evict : 0; + data[i++] = set ? 
ch_stats->tx.num_atr_bailouts : 0; + + /* copy back updated index */ + *idx = i; } /** - * ice_get_fecparam - Get link FEC options - * @netdev: network interface device structure - * @fecparam: Ethtool structure to retrieve FEC parameters + * ice_get_chnl_rx_stats - get stats for Rx rings if channel enabled + * @vsi: ptr to VSI + * @q: queue index + * @data: ptr to data + * @idx: ptr to idx in data buffer (input/output param) + * @set: copy counters if true otherwise copy zero + * + * This function is used to collect performance counters for specific Rx ring + * and related vector. All these counters are related to ADQ. */ -static int -ice_get_fecparam(struct net_device *netdev, struct ethtool_fecparam *fecparam) +static void +ice_get_chnl_rx_stats(struct ice_vsi *vsi, int q, u64 *data, int *idx, bool set) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_aqc_get_phy_caps_data *caps; - struct ice_link_status *link_info; - struct ice_vsi *vsi = np->vsi; - struct ice_port_info *pi; - enum ice_status status; - int err = 0; + struct ice_q_vector_ch_stats *vector_ch_stats = NULL; + struct ice_ch_q_stats *ch_stats; + struct ice_ring *rx_ring; + bool orig_set = set; + int i; - pi = vsi->port_info; + if (!idx) + return; + if (q >= vsi->num_rxq) + return; - if (!pi) - return -EOPNOTSUPP; - link_info = &pi->phy.link_info; + rx_ring = vsi->rx_rings[q]; + ch_stats = &rx_ring->ch_q_stats; + if (rx_ring->q_vector) + vector_ch_stats = &rx_ring->q_vector->ch_stats; + + i = *idx; /* start index in data buffer */ + + /* Rx queue specific extra counters */ + set = orig_set && ch_stats; + /* busy_poll and not busy_poll packets */ + data[i++] = set ? ch_stats->poll.bp_packets : 0; + data[i++] = set ? ch_stats->poll.np_packets : 0; + /* Rx queue set/bailout from override */ + data[i++] = set ? ch_stats->rx.num_rx_queue_set : 0; + data[i++] = set ? ch_stats->rx.num_rx_queue_bailouts : 0; + /* ctrl pkts, only ctrl_pkts,, FIN/RST/SYN */ + data[i++] = set ? ch_stats->rx.num_tcp_ctrl_pkts : 0; + data[i++] = set ? ch_stats->rx.num_only_ctrl_pkts : 0; + data[i++] = set ? ch_stats->rx.num_tcp_flags_fin : 0; + data[i++] = set ? ch_stats->rx.num_tcp_flags_rst : 0; + data[i++] = set ? ch_stats->rx.num_tcp_flags_syn : 0; + /* BP: no data packets cleaned */ + data[i++] = set ? ch_stats->rx.num_no_data_pkt_bp : 0; + + /* vector specific extra counters */ + set = orig_set && vector_ch_stats; + /* state machine */ + data[i++] = set ? vector_ch_stats->in_bp : 0; + data[i++] = set ? vector_ch_stats->real_int_to_bp : 0; + data[i++] = set ? vector_ch_stats->real_bp_to_bp : 0; + data[i++] = set ? vector_ch_stats->in_int : 0; + data[i++] = set ? vector_ch_stats->real_bp_to_int : 0; + data[i++] = set ? vector_ch_stats->real_int_to_int : 0; + /* unlikely_cb_to_bp, once_in_bp */ + data[i++] = set ? vector_ch_stats->unlikely_cb_to_bp : 0; + data[i++] = set ? vector_ch_stats->ucb_o_bp : 0; + data[i++] = set ? vector_ch_stats->once_bp_false : 0; + /* Busypoll stop due to either need_resched() or possible timeout */ + data[i++] = set ? vector_ch_stats->num_need_resched_bp_stop : 0; + data[i++] = set ? vector_ch_stats->num_timeout_bp_stop : 0; + /* Busypoll->Interrupt, last time "cleaned data packets" */ + data[i++] = set ? vector_ch_stats->cleaned_any_data_pkt : 0; + /* need_resched() and !cleaned data packets */ + data[i++] = set ? vector_ch_stats->num_l_c_data_pkt : 0; + /* possible timeout and !cleaned data packets */ + data[i++] = set ? 
vector_ch_stats->num_l_c_data_pkt1 : 0; + /* software triggered omterrupt either from napi_poll based + * on channel specific heuristic or from service_task + */ + data[i++] = set ? vector_ch_stats->num_sw_intr_timeout : 0; + data[i++] = set ? vector_ch_stats->num_sw_intr_serv_task : 0; + /* times, SW triggered interrupt were not fired */ + data[i++] = set ? vector_ch_stats->num_no_sw_intr_opt_off : 0; + /* number of times WB_ON_ITR is set */ + data[i++] = set ? vector_ch_stats->num_wb_on_itr_set : 0; + + /* number of Rx packets processed when busy_poll_stop is invoked */ + data[i++] = set ? vector_ch_stats->pkt_bp_stop_bp_budget : 0; + + /* number of Rx packets processed when napi_schedule is invoked because + * busy_poll_stop:napi_poll returned budget + */ + data[i++] = set ? vector_ch_stats->pkt_bp_stop_napi_budget : 0; - /* Set FEC mode based on negotiated link info */ - switch (link_info->fec_info) { - case ICE_AQ_LINK_25G_KR_FEC_EN: - fecparam->active_fec = ETHTOOL_FEC_BASER; - break; - case ICE_AQ_LINK_25G_RS_528_FEC_EN: - /* fall through */ - case ICE_AQ_LINK_25G_RS_544_FEC_EN: - fecparam->active_fec = ETHTOOL_FEC_RS; - break; - default: - fecparam->active_fec = ETHTOOL_FEC_OFF; - break; - } + /* num of times work_done == budget from busy_poll_stop code path */ + data[i++] = set ? vector_ch_stats->bp_wd_equals_budget8 : 0; - caps = devm_kzalloc(&vsi->back->pdev->dev, sizeof(*caps), GFP_KERNEL); - if (!caps) - return -ENOMEM; + /* num of times work_done == budget from napi_shedule which gets invoked + * if busy_poll_stop:napi_poll returned "budget" + */ + data[i++] = set ? vector_ch_stats->bp_wd_equals_budget64 : 0; - status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP, - caps, NULL); - if (status) { - err = -EAGAIN; - goto done; - } + /* how many times, kept internal state to be in BUSY_POLL + * when napi_poll is invoked due to busy_poll_stop + */ + data[i++] = set ? vector_ch_stats->keep_state_bp_budget8 : 0; - /* Set supported/configured FEC modes based on PHY capability */ - if (caps->caps & ICE_AQC_PHY_EN_AUTO_FEC) - fecparam->fec |= ETHTOOL_FEC_AUTO; - if (caps->link_fec_options & ICE_AQC_PHY_FEC_10G_KR_40G_KR4_EN || - caps->link_fec_options & ICE_AQC_PHY_FEC_10G_KR_40G_KR4_REQ || - caps->link_fec_options & ICE_AQC_PHY_FEC_25G_KR_CLAUSE74_EN || - caps->link_fec_options & ICE_AQC_PHY_FEC_25G_KR_REQ) - fecparam->fec |= ETHTOOL_FEC_BASER; - if (caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_528_REQ || - caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_544_REQ || - caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_CLAUSE91_EN) - fecparam->fec |= ETHTOOL_FEC_RS; - if (caps->link_fec_options == 0) - fecparam->fec |= ETHTOOL_FEC_OFF; + /* how many times, kept internal state to be in BUSY_POLL + * when napi_poll is invoked due to napi_schedule. + */ + data[i++] = set ? vector_ch_stats->keep_state_bp_budget64 : 0; -done: - devm_kfree(&vsi->back->pdev->dev, caps); - return err; + /* copy back updated index */ + *idx = i; } /** - * ice_get_priv_flags - report device private flags - * @netdev: network interface device structure + * ice_get_chnl_rx_strings + * @vsi: ptr to VSI + * @q: queue index + * @loc_in_buf: ptr to ptr to location in buffer (input and output param) * - * The get string set count and the string set should be matched for each - * flag returned. Add new strings for each flag to the ice_gstrings_priv_flags - * array. - * - * Returns a u32 bitmap of flags. 
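The FEC helpers being reworked above translate between the ethtool FEC constants and the driver's internal FEC mode with plain switch statements. The sketch below shows that shape of mapping using illustrative enum values rather than the real ETHTOOL_FEC_*/ICE_FEC_* constants:

#include <stdio.h>

/* Stand-ins for the ethtool FEC request values and the internal enum;
 * the names and numeric values here are illustrative only.
 */
enum fec_req { FEC_REQ_AUTO, FEC_REQ_RS, FEC_REQ_BASER, FEC_REQ_OFF };
enum fec_mode { FEC_AUTO, FEC_RS, FEC_BASER, FEC_NONE };

static int map_fec(enum fec_req req, enum fec_mode *mode)
{
        switch (req) {
        case FEC_REQ_AUTO:
                *mode = FEC_AUTO;
                break;
        case FEC_REQ_RS:
                *mode = FEC_RS;
                break;
        case FEC_REQ_BASER:
                *mode = FEC_BASER;
                break;
        case FEC_REQ_OFF:
                *mode = FEC_NONE;
                break;
        default:
                return -1;      /* unsupported request */
        }
        return 0;
}

int main(void)
{
        enum fec_mode mode;

        if (!map_fec(FEC_REQ_RS, &mode))
                printf("requested RS-FEC -> internal mode %d\n", mode);
        return 0;
}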
+ * This function returns Rx queue and vector related strings for + * ADQ performance counters */ -static u32 ice_get_priv_flags(struct net_device *netdev) +static void +ice_get_chnl_rx_strings(struct ice_vsi *vsi, unsigned int q, char **loc_in_buf) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - struct ice_pf *pf = vsi->back; - u32 i, ret_flags = 0; + char *p; - for (i = 0; i < ICE_PRIV_FLAG_ARRAY_SIZE; i++) { - const struct ice_priv_flag *priv_flag; + if (!loc_in_buf) + return; + if (q >= vsi->num_rxq) + return; - priv_flag = &ice_gstrings_priv_flags[i]; + p = *loc_in_buf; + + /* Rx queue specific extra counters */ + + /* busy and non-busy poll packets */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BUSY_POLL, q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_NOT_BUSY_POLL, q); + p += ETH_GSTRING_LEN; + /* number of times Rx queue was set thru' Rx queue override logic */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_SET, q); + p += ETH_GSTRING_LEN; + /* number of times Rx queue was not set thru' Rx queue override logic */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BAIL, q); + p += ETH_GSTRING_LEN; + /* total TCP ctrl pkts */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_TCP_CTRL_PKTS, q); + p += ETH_GSTRING_LEN; + /* total "only ctrl pkts" */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_ONLY_CTRL_PKTS, q); + p += ETH_GSTRING_LEN; + /* number of FIN recv */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_TCP_FIN_RECV, q); + p += ETH_GSTRING_LEN; + /* number of RST recv */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_TCP_RST_RECV, q); + p += ETH_GSTRING_LEN; + /* number of SYN recv */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_TCP_SYN_RECV, q); + p += ETH_GSTRING_LEN; + /* BP, but didn't clean any data packets */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BP_NO_DATA_PKT, q); + p += ETH_GSTRING_LEN; + + /* Vector specific extra counters */ + + /* tracking BP, INT, BP->INT, INT->BP */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_IN_BP, q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_INTR_TO_BP, q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BP_TO_BP, q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_IN_INTR, q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BP_TO_INTR, q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_INTR_TO_INTR, q); + p += ETH_GSTRING_LEN; + /* unlikely comeback to busy_poll */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_UNLIKELY_CB_TO_BP, q); + p += ETH_GSTRING_LEN; + /* unlikely comeback to busy_poll and once_in_bp is true */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_UCB_ONCE_IN_BP, q); + p += ETH_GSTRING_LEN; + /* once_in_bp is false */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_INTR_ONCE_IN_BP_FALSE, q); + p += ETH_GSTRING_LEN; + /* busy_poll stop due to need_resched() */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BP_STOP_NEED_RESCHED, q); + p += ETH_GSTRING_LEN; + /* busy_poll stop due to possible due to timeout */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BP_STOP_TIMEOUT, q); + p += ETH_GSTRING_LEN; + /* Transition: BP->INT: previously cleaned data packets */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_CLEANED_ANY_DATA_PKT, q); + p += ETH_GSTRING_LEN; + /* need_resched(), but didn't clean any data packets */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_NEED_RESCHED_NO_DATA, q); + p += ETH_GSTRING_LEN; + /* possible timeout(), but didn't clean any data packets */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_TIMEOUT_NO_DATA, q); + p += ETH_GSTRING_LEN; + /* number of SW triggered interrupt from napi_poll due to + * 
possible timeout detected + */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_SW_INTR_TIMEOUT, q); + p += ETH_GSTRING_LEN; + /* number of SW triggered interrupt from service_task */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_SW_INTR_SERV_TASK, q); + p += ETH_GSTRING_LEN; + /* number of times, SW triggered interrupt is not triggered from + * napi_poll even when unlikely_cb_to_bp is set, once_in_bp is set + * but ethtool private featute flag is off (for interrupt optimization + * strategy + */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_NO_SW_INTR_OPT_OFF, q); + p += ETH_GSTRING_LEN; + /* number of times WB_ON_ITR is set */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_WB_ON_ITR_SET, q); + p += ETH_GSTRING_LEN; + + /* number of Rx packet processed due busy_poll_stop */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_PKTS_BP_STOP_BUDGET8, q); + p += ETH_GSTRING_LEN; + + /* number of Rx packet processed due to napi_schedule which gets invoked + * if busy_poll_stop returned budget + */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_PKTS_BP_STOP_BUDGET64, q); + p += ETH_GSTRING_LEN; - if (test_bit(priv_flag->bitno, pf->flags)) - ret_flags |= BIT(i); - } + /* num of times work_done == budget condition met from + * busy_poll_stop:napi_poll code path + */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BP_WD_EQUAL_BUDGET8, q); + p += ETH_GSTRING_LEN; - return ret_flags; + /* num of times work_done == budget condition met from + * napi_schedule:napi_poll code path (this happens if busy_poll_stop + * returned "budget") + */ + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BP_WD_EQUAL_BUDGET64, q); + p += ETH_GSTRING_LEN; + + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_KEEP_STATE_BP_BUDGET8, q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_KEEP_STATE_BP_BUDGET64, q); + p += ETH_GSTRING_LEN; + + /* copy back updated length */ + *loc_in_buf = p; } +#endif /* ADQ_PERF_COUNTERS */ +#ifdef ICE_ADD_PROBES +#ifdef HAVE_NETDEV_SB_DEV /** - * ice_set_priv_flags - set private flags - * @netdev: network interface device structure - * @flags: bit flags to be set + * ice_get_macvlan + * @id: macvlan ID + * @pf: pointer to the PF structure + * + * Returns the MACVLAN matching the provided ID */ -static int ice_set_priv_flags(struct net_device *netdev, u32 flags) +static struct ice_macvlan *ice_get_macvlan(int id, struct ice_pf *pf) { - struct ice_netdev_priv *np = netdev_priv(netdev); - DECLARE_BITMAP(change_flags, ICE_PF_FLAGS_NBITS); - DECLARE_BITMAP(orig_flags, ICE_PF_FLAGS_NBITS); - struct ice_vsi *vsi = np->vsi; - struct ice_pf *pf = vsi->back; - int ret = 0; - u32 i; + struct ice_macvlan *mv; - if (flags > BIT(ICE_PRIV_FLAG_ARRAY_SIZE)) - return -EINVAL; + /* If the ID is not marked as in use, no need to search */ + if (!(test_bit(id, pf->avail_macvlan))) + return NULL; - set_bit(ICE_FLAG_ETHTOOL_CTXT, pf->flags); + list_for_each_entry(mv, &pf->macvlan_list, list) { + if (id == mv->id) + return mv; + } - bitmap_copy(orig_flags, pf->flags, ICE_PF_FLAGS_NBITS); - for (i = 0; i < ICE_PRIV_FLAG_ARRAY_SIZE; i++) { - const struct ice_priv_flag *priv_flag; + return NULL; +} - priv_flag = &ice_gstrings_priv_flags[i]; +/** + * ice_get_macvlan_tx_strings + * @pf: pointer to the PF structure + * @loc_in_buf: ptr to ptr to location in buffer (input and output param) + * + * This function returns Tx related strings for MACVLAN offload + */ +static void ice_get_macvlan_tx_strings(struct ice_pf *pf, char **loc_in_buf) +{ + char *p; + int i; - if (flags & BIT(i)) - set_bit(priv_flag->bitno, pf->flags); - else - clear_bit(priv_flag->bitno, pf->flags); - } + if (!loc_in_buf) 
+ return; + p = *loc_in_buf; - bitmap_xor(change_flags, pf->flags, orig_flags, ICE_PF_FLAGS_NBITS); + for (i = 0; i < ICE_MAX_MACVLANS; i++) { + struct ice_macvlan *mv = ice_get_macvlan(i, pf); - if (test_bit(ICE_FLAG_FW_LLDP_AGENT, change_flags)) { - if (!test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags)) { - enum ice_status status; + if (mv) { + snprintf(p, ETH_GSTRING_LEN, L2_FWD_TX_PKTS1, + mv->vdev->name); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, L2_FWD_TX_BYTES1, + mv->vdev->name); + p += ETH_GSTRING_LEN; + } else { + snprintf(p, ETH_GSTRING_LEN, L2_FWD_TX_PKTS2, i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, L2_FWD_TX_BYTES2, i); + p += ETH_GSTRING_LEN; + } + } - /* Disable FW LLDP engine */ - status = ice_cfg_lldp_mib_change(&pf->hw, false); + /* copy back updated length */ + *loc_in_buf = p; +} - /* If unregistering for LLDP events fails, this is - * not an error state, as there shouldn't be any - * events to respond to. - */ - if (status) - dev_info(&pf->pdev->dev, - "Failed to unreg for LLDP events\n"); +/** + * ice_get_macvlan_tx_stats + * @pf: pointer to the PF structure + * @data: ptr to data + * @idx: ptr to idx in data buffer (input/output param) + * + * This function is used to collect Tx statistics for MACVLAN offload + */ +static void ice_get_macvlan_tx_stats(struct ice_pf *pf, u64 *data, int *idx) +{ + int i, j; - /* The AQ call to stop the FW LLDP agent will generate - * an error if the agent is already stopped. - */ - status = ice_aq_stop_lldp(&pf->hw, true, true, NULL); - if (status) - dev_warn(&pf->pdev->dev, - "Fail to stop LLDP agent\n"); - /* Use case for having the FW LLDP agent stopped - * will likely not need DCB, so failure to init is - * not a concern of ethtool - */ - status = ice_init_pf_dcb(pf, true); - if (status) - dev_warn(&pf->pdev->dev, "Fail to init DCB\n"); + if (!idx) + return; + j = *idx; - /* Forward LLDP packets to default VSI so that they - * are passed up the stack - */ - ice_cfg_sw_lldp(vsi, false, true); - } else { - enum ice_status status; - bool dcbx_agent_status; + for (i = 0; i < ICE_MAX_MACVLANS; i++) { + struct ice_macvlan *mv; - /* AQ command to start FW LLDP agent will return an - * error if the agent is already started - */ - status = ice_aq_start_lldp(&pf->hw, true, NULL); - if (status) - dev_warn(&pf->pdev->dev, - "Fail to start LLDP Agent\n"); + mv = ice_get_macvlan(i, pf); + if (mv) { + data[j++] = mv->vsi->net_stats.tx_packets; + data[j++] = mv->vsi->net_stats.tx_bytes; + } else { + data[j++] = 0; + data[j++] = 0; + } + } - /* AQ command to start FW DCBX agent will fail if - * the agent is already started - */ - status = ice_aq_start_stop_dcbx(&pf->hw, true, - &dcbx_agent_status, - NULL); - if (status) - dev_dbg(&pf->pdev->dev, - "Failed to start FW DCBX\n"); + /* copy back updated index */ + *idx = j; +} - dev_info(&pf->pdev->dev, "FW DCBX agent is %s\n", - dcbx_agent_status ? "ACTIVE" : "DISABLED"); +/** + * ice_get_macvlan_rx_strings + * @pf: pointer to the PF structure + * @loc_in_buf: ptr to ptr to location in buffer (input and output param) + * + * This function returns Rx related strings for MACVLAN offload + */ +static void ice_get_macvlan_rx_strings(struct ice_pf *pf, char **loc_in_buf) +{ + char *p; + int i; - /* Failure to configure MIB change or init DCB is not - * relevant to ethtool. 
Print notification that - * registration/init failed but do not return error - * state to ethtool - */ - status = ice_init_pf_dcb(pf, true); - if (status) - dev_dbg(&pf->pdev->dev, "Fail to init DCB\n"); + if (!loc_in_buf) + return; + p = *loc_in_buf; - /* Remove rule to direct LLDP packets to default VSI. - * The FW LLDP engine will now be consuming them. - */ - ice_cfg_sw_lldp(vsi, false, false); + for (i = 0; i < ICE_MAX_MACVLANS; i++) { + struct ice_macvlan *mv = ice_get_macvlan(i, pf); - /* Register for MIB change events */ - status = ice_cfg_lldp_mib_change(&pf->hw, true); - if (status) - dev_dbg(&pf->pdev->dev, - "Fail to enable MIB change events\n"); + if (mv) { + snprintf(p, ETH_GSTRING_LEN, L2_FWD_RX_PKTS1, + mv->vdev->name); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, L2_FWD_RX_BYTES1, + mv->vdev->name); + p += ETH_GSTRING_LEN; + } else { + snprintf(p, ETH_GSTRING_LEN, L2_FWD_RX_PKTS2, i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, L2_FWD_RX_BYTES2, i); + p += ETH_GSTRING_LEN; } } - clear_bit(ICE_FLAG_ETHTOOL_CTXT, pf->flags); - return ret; + + /* copy back updated length */ + *loc_in_buf = p; } -static int ice_get_sset_count(struct net_device *netdev, int sset) +/** + * ice_get_macvlan_rx_stats + * @pf: private board structure + * @data: ptr to data + * @idx: ptr to idx in data buffer (input/output param) + * + * This function is used to collect statistics for MACVLAN offload + */ +static void +ice_get_macvlan_rx_stats(struct ice_pf *pf, u64 *data, int *idx) { - switch (sset) { - case ETH_SS_STATS: - /* The number (and order) of strings reported *must* remain - * constant for a given netdevice. This function must not - * report a different number based on run time parameters - * (such as the number of queues in use, or the setting of - * a private ethtool flag). This is due to the nature of the - * ethtool stats API. - * - * Userspace programs such as ethtool must make 3 separate - * ioctl requests, one for size, one for the strings, and - * finally one for the stats. Since these cross into - * userspace, changes to the number or size could result in - * undefined memory access or incorrect string<->value - * correlations for statistics. - * - * Even if it appears to be safe, changes to the size or - * order of strings will suffer from race conditions and are - * not safe. 
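The comment above stresses that the ETH_SS_STATS count, the strings and the values are fetched by three separate ioctls and therefore must never disagree at runtime. One way to see the constraint is to drive all three from a single table, as in this small sketch with example data:

#include <stdio.h>

/* One table drives the count, the names and the values, so the three
 * ethtool requests (count, strings, stats) cannot get out of sync.
 */
struct stat_entry {
        const char *name;
        unsigned long long value;
};

static const struct stat_entry tbl[] = {        /* example data only */
        { "rx_packets", 100 },
        { "tx_packets", 90 },
        { "rx_dropped", 1 },
};

#define N_STATS (sizeof(tbl) / sizeof(tbl[0]))

static unsigned int get_sset_count(void) { return N_STATS; }

int main(void)
{
        unsigned int n = get_sset_count(), i;

        for (i = 0; i < n; i++)                 /* strings pass */
                printf("%s\n", tbl[i].name);
        for (i = 0; i < n; i++)                 /* values pass */
                printf("%llu\n", tbl[i].value);
        return 0;
}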
- */ - return ICE_ALL_STATS_LEN(netdev); - case ETH_SS_TEST: - return ICE_TEST_LEN; - case ETH_SS_PRIV_FLAGS: - return ICE_PRIV_FLAG_ARRAY_SIZE; - default: - return -EOPNOTSUPP; + int i, j; + + if (!idx) + return; + j = *idx; + + for (i = 0; i < ICE_MAX_MACVLANS; i++) { + struct ice_macvlan *mv; + + mv = ice_get_macvlan(i, pf); + if (mv) { + data[j++] = mv->vsi->net_stats.rx_packets; + data[j++] = mv->vsi->net_stats.rx_bytes; + } else { + data[j++] = 0; + data[j++] = 0; + } } + + /* copy back updated index */ + *idx = j; } +#endif /* HAVE_NETDEV_SB_DEV */ +#endif /* ICE_ADD_PROBES */ -static void -ice_get_ethtool_stats(struct net_device *netdev, - struct ethtool_stats __always_unused *stats, u64 *data) +static void ice_get_strings(struct net_device *netdev, u32 stringset, u8 *data) { struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - struct ice_pf *pf = vsi->back; - struct ice_ring *ring; - unsigned int j; - int i = 0; - char *p; + struct ice_vsi *vsi = ice_get_netdev_priv_vsi(np); + char *p = (char *)data; + unsigned int i; - ice_update_pf_stats(pf); - ice_update_vsi_stats(vsi); + switch (stringset) { + case ETH_SS_STATS: + for (i = 0; i < ICE_VSI_STATS_LEN; i++) { + snprintf(p, ETH_GSTRING_LEN, "%s", + ice_gstrings_vsi_stats[i].stat_string); + p += ETH_GSTRING_LEN; + } + if (ice_is_port_repr_netdev(netdev)) + return; - for (j = 0; j < ICE_VSI_STATS_LEN; j++) { - p = (char *)vsi + ice_gstrings_vsi_stats[j].stat_offset; - data[i++] = (ice_gstrings_vsi_stats[j].sizeof_stat == - sizeof(u64)) ? *(u64 *)p : *(u32 *)p; - } + ice_for_each_alloc_txq(vsi, i) { + snprintf(p, ETH_GSTRING_LEN, ICE_TXQ_PACKETS, i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_TXQ_BYTES, i); + p += ETH_GSTRING_LEN; +#ifdef ICE_ADD_PROBES + snprintf(p, ETH_GSTRING_LEN, ICE_TXQ_NAPI_POLL, i); + p += ETH_GSTRING_LEN; +#endif /* ICE_ADD_PROBES */ +#ifdef ADQ_PERF_COUNTERS + ice_get_chnl_tx_strings(vsi, i, &p); +#endif /* ADQ_PERF_COUNTERS */ + } +#ifdef HAVE_XDP_SUPPORT +#ifdef ICE_ADD_PROBES + ice_get_xdp_tx_strings(vsi, &p); +#endif /* ICE_ADD_PROBES */ +#endif /* HAVE_XDP_SUPPORT */ +#ifdef ICE_ADD_PROBES +#ifdef HAVE_NETDEV_SB_DEV + ice_get_macvlan_tx_strings(vsi->back, &p); +#endif /* HAVE_NETDEV_SB_DEV */ +#endif /* ICE_ADD_PROBES */ - /* populate per queue stats */ - rcu_read_lock(); + ice_for_each_alloc_rxq(vsi, i) { + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_PACKETS, i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BYTES, i); + p += ETH_GSTRING_LEN; +#ifdef ICE_ADD_PROBES + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_NAPI_POLL, i); + p += ETH_GSTRING_LEN; +#endif /* ICE_ADD_PROBES */ +#ifdef ADQ_PERF_COUNTERS + ice_get_chnl_rx_strings(vsi, i, &p); +#endif /* ADQ_PERF_COUNTERS */ +#ifdef HAVE_XDP_SUPPORT +#ifdef ICE_ADD_PROBES + ice_get_xdp_rx_strings(i, vsi->num_rxq, &p); +#endif /* ICE_ADD_PROBES */ +#endif /* HAVE_XDP_SUPPORT */ + } +#ifdef ICE_ADD_PROBES +#ifdef HAVE_NETDEV_SB_DEV + ice_get_macvlan_rx_strings(vsi->back, &p); +#endif /* HAVE_NETDEV_SB_DEV */ +#endif /* ICE_ADD_PROBES */ - ice_for_each_alloc_txq(vsi, j) { - ring = READ_ONCE(vsi->tx_rings[j]); - if (ring) { - data[i++] = ring->stats.pkts; - data[i++] = ring->stats.bytes; - } else { - data[i++] = 0; - data[i++] = 0; + if (vsi->type != ICE_VSI_PF) + return; + + for (i = 0; i < ICE_PF_STATS_LEN; i++) { + snprintf(p, ETH_GSTRING_LEN, "%s", + ice_gstrings_pf_stats[i].stat_string); + p += ETH_GSTRING_LEN; } - } - ice_for_each_alloc_rxq(vsi, j) { - ring = READ_ONCE(vsi->rx_rings[j]); - if (ring) { - 
data[i++] = ring->stats.pkts; - data[i++] = ring->stats.bytes; - } else { - data[i++] = 0; - data[i++] = 0; + for (i = 0; i < ICE_MAX_USER_PRIORITY; i++) { + snprintf(p, ETH_GSTRING_LEN, PORT_TX_PRIO_XON, i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, PORT_TX_PRIO_XOFF, i); + p += ETH_GSTRING_LEN; + } + for (i = 0; i < ICE_MAX_USER_PRIORITY; i++) { + snprintf(p, ETH_GSTRING_LEN, PORT_RX_PRIO_XON, i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, PORT_RX_PRIO_XOFF, i); + p += ETH_GSTRING_LEN; + } + break; + case ETH_SS_TEST: + memcpy(data, ice_gstrings_test, ICE_TEST_LEN * ETH_GSTRING_LEN); + break; + case ETH_SS_PRIV_FLAGS: + for (i = 0; i < ICE_PRIV_FLAG_ARRAY_SIZE; i++) { + snprintf(p, ETH_GSTRING_LEN, "%s", + ice_gstrings_priv_flags[i].name); + p += ETH_GSTRING_LEN; } + break; + default: + break; } +} - rcu_read_unlock(); - - if (vsi->type != ICE_VSI_PF) - return; +static int +ice_set_phys_id(struct net_device *netdev, enum ethtool_phys_id_state state) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + bool led_active; - for (j = 0; j < ICE_PF_STATS_LEN; j++) { - p = (char *)pf + ice_gstrings_pf_stats[j].stat_offset; - data[i++] = (ice_gstrings_pf_stats[j].sizeof_stat == - sizeof(u64)) ? *(u64 *)p : *(u32 *)p; + switch (state) { + case ETHTOOL_ID_ACTIVE: + led_active = true; + break; + case ETHTOOL_ID_INACTIVE: + led_active = false; + break; + default: + return -EINVAL; } - for (j = 0; j < ICE_MAX_USER_PRIORITY; j++) { - data[i++] = pf->stats.priority_xon_tx[j]; - data[i++] = pf->stats.priority_xoff_tx[j]; - } + if (ice_aq_set_port_id_led(np->vsi->port_info, !led_active, NULL)) + return -EIO; - for (j = 0; j < ICE_MAX_USER_PRIORITY; j++) { - data[i++] = pf->stats.priority_xon_rx[j]; - data[i++] = pf->stats.priority_xoff_rx[j]; - } + return 0; } /** - * ice_phy_type_to_ethtool - convert the phy_types to ethtool link modes + * ice_set_fec_cfg - Set link FEC options * @netdev: network interface device structure - * @ks: ethtool link ksettings struct to fill out + * @req_fec: FEC mode to configure */ -static void -ice_phy_type_to_ethtool(struct net_device *netdev, - struct ethtool_link_ksettings *ks) +static int ice_set_fec_cfg(struct net_device *netdev, enum ice_fec_mode req_fec) { struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_link_status *hw_link_info; - bool need_add_adv_mode = false; + struct ice_aqc_set_phy_cfg_data config = { 0 }; struct ice_vsi *vsi = np->vsi; - u64 phy_types_high; - u64 phy_types_low; - - hw_link_info = &vsi->port_info->phy.link_info; - phy_types_low = vsi->port_info->phy.phy_type_low; - phy_types_high = vsi->port_info->phy.phy_type_high; + struct ice_port_info *pi; - ethtool_link_ksettings_zero_link_mode(ks, supported); - ethtool_link_ksettings_zero_link_mode(ks, advertising); + pi = vsi->port_info; + if (!pi) + return -EOPNOTSUPP; - if (phy_types_low & ICE_PHY_TYPE_LOW_100BASE_TX || - phy_types_low & ICE_PHY_TYPE_LOW_100M_SGMII) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 100baseT_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_100MB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100baseT_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_1000BASE_T || - phy_types_low & ICE_PHY_TYPE_LOW_1G_SGMII) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 1000baseT_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_1000MB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 1000baseT_Full); - } - if (phy_types_low 
& ICE_PHY_TYPE_LOW_1000BASE_KX) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 1000baseKX_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_1000MB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 1000baseKX_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_1000BASE_SX || - phy_types_low & ICE_PHY_TYPE_LOW_1000BASE_LX) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 1000baseX_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_1000MB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 1000baseX_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_2500BASE_T) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 2500baseT_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_2500MB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 2500baseT_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_2500BASE_X || - phy_types_low & ICE_PHY_TYPE_LOW_2500BASE_KX) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 2500baseX_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_2500MB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 2500baseX_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_5GBASE_T || - phy_types_low & ICE_PHY_TYPE_LOW_5GBASE_KR) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 5000baseT_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_5GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 5000baseT_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_10GBASE_T || - phy_types_low & ICE_PHY_TYPE_LOW_10G_SFI_DA || - phy_types_low & ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC || - phy_types_low & ICE_PHY_TYPE_LOW_10G_SFI_C2C) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseT_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_10GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 10000baseT_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_10GBASE_KR_CR1) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseKR_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_10GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 10000baseKR_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_10GBASE_SR) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseSR_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_10GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 10000baseSR_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_10GBASE_LR) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseLR_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_10GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 10000baseLR_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_T || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_CR || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_CR_S || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_CR1 || - phy_types_low & ICE_PHY_TYPE_LOW_25G_AUI_AOC_ACC || - phy_types_low & ICE_PHY_TYPE_LOW_25G_AUI_C2C) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 25000baseCR_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_25GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 25000baseCR_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_SR || - 
phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_LR) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 25000baseSR_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_25GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 25000baseSR_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_KR || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_KR_S || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_KR1) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 25000baseKR_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_25GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 25000baseKR_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_40GBASE_KR4) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseKR4_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_40GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 40000baseKR4_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_40GBASE_CR4 || - phy_types_low & ICE_PHY_TYPE_LOW_40G_XLAUI_AOC_ACC || - phy_types_low & ICE_PHY_TYPE_LOW_40G_XLAUI) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseCR4_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_40GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 40000baseCR4_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_40GBASE_SR4) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseSR4_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_40GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 40000baseSR4_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_40GBASE_LR4) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseLR4_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_40GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 40000baseLR4_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_CR2 || - phy_types_low & ICE_PHY_TYPE_LOW_50G_LAUI2_AOC_ACC || - phy_types_low & ICE_PHY_TYPE_LOW_50G_LAUI2 || - phy_types_low & ICE_PHY_TYPE_LOW_50G_AUI2_AOC_ACC || - phy_types_low & ICE_PHY_TYPE_LOW_50G_AUI2 || - phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_CP || - phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_SR || - phy_types_low & ICE_PHY_TYPE_LOW_50G_AUI1_AOC_ACC || - phy_types_low & ICE_PHY_TYPE_LOW_50G_AUI1) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 50000baseCR2_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_50GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 50000baseCR2_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_KR2 || - phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_KR_PAM4) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 50000baseKR2_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_50GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 50000baseKR2_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_SR2 || - phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_LR2 || - phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_FR || - phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_LR) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 50000baseSR2_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_50GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 50000baseSR2_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_CR4 
|| - phy_types_low & ICE_PHY_TYPE_LOW_100G_CAUI4_AOC_ACC || - phy_types_low & ICE_PHY_TYPE_LOW_100G_CAUI4 || - phy_types_low & ICE_PHY_TYPE_LOW_100G_AUI4_AOC_ACC || - phy_types_low & ICE_PHY_TYPE_LOW_100G_AUI4 || - phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_CR_PAM4 || - phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_CP2 || - phy_types_high & ICE_PHY_TYPE_HIGH_100G_CAUI2_AOC_ACC || - phy_types_high & ICE_PHY_TYPE_HIGH_100G_CAUI2 || - phy_types_high & ICE_PHY_TYPE_HIGH_100G_AUI2_AOC_ACC || - phy_types_high & ICE_PHY_TYPE_HIGH_100G_AUI2) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseCR4_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_100GB) - need_add_adv_mode = true; - } - if (need_add_adv_mode) { - need_add_adv_mode = false; - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100000baseCR4_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_SR4 || - phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_SR2) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseSR4_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_100GB) - need_add_adv_mode = true; - } - if (need_add_adv_mode) { - need_add_adv_mode = false; - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100000baseSR4_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_LR4 || - phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_DR) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseLR4_ER4_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_100GB) - need_add_adv_mode = true; - } - if (need_add_adv_mode) { - need_add_adv_mode = false; - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100000baseLR4_ER4_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_KR4 || - phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_KR_PAM4 || - phy_types_high & ICE_PHY_TYPE_HIGH_100GBASE_KR2_PAM4) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseKR4_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_100GB) - need_add_adv_mode = true; + /* Changing the FEC parameters is not supported if not the PF VSI */ + if (vsi->type != ICE_VSI_PF) { + netdev_info(netdev, "Changing FEC parameters only supported for PF VSI\n"); + return -EOPNOTSUPP; } - if (need_add_adv_mode) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100000baseKR4_Full); - /* Autoneg PHY types */ - if (phy_types_low & ICE_PHY_TYPE_LOW_100BASE_TX || - phy_types_low & ICE_PHY_TYPE_LOW_1000BASE_T || - phy_types_low & ICE_PHY_TYPE_LOW_1000BASE_KX || - phy_types_low & ICE_PHY_TYPE_LOW_2500BASE_T || - phy_types_low & ICE_PHY_TYPE_LOW_2500BASE_KX || - phy_types_low & ICE_PHY_TYPE_LOW_5GBASE_T || - phy_types_low & ICE_PHY_TYPE_LOW_5GBASE_KR || - phy_types_low & ICE_PHY_TYPE_LOW_10GBASE_T || - phy_types_low & ICE_PHY_TYPE_LOW_10GBASE_KR_CR1 || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_T || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_CR || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_CR_S || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_CR1 || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_KR || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_KR_S || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_KR1 || - phy_types_low & ICE_PHY_TYPE_LOW_40GBASE_CR4 || - phy_types_low & ICE_PHY_TYPE_LOW_40GBASE_KR4) { - ethtool_link_ksettings_add_link_mode(ks, supported, - Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - Autoneg); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_CR2 || - 
phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_KR2 || - phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_CP || - phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_KR_PAM4) { - ethtool_link_ksettings_add_link_mode(ks, supported, - Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - Autoneg); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_CR4 || - phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_KR4 || - phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_KR_PAM4 || - phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_CP2) { - ethtool_link_ksettings_add_link_mode(ks, supported, - Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - Autoneg); - } -} + /* Proceed only if requesting different FEC mode */ + if (pi->phy.curr_user_fec_req == req_fec) + return 0; -#define TEST_SET_BITS_TIMEOUT 50 -#define TEST_SET_BITS_SLEEP_MAX 2000 -#define TEST_SET_BITS_SLEEP_MIN 1000 + /* Copy the current user PHY configuration. The current user PHY + * configuration is initialized during probe from PHY capabilities + * software mode, and updated on set PHY configuration. + */ + memcpy(&config, &pi->phy.curr_user_phy_cfg, sizeof(config)); + + ice_cfg_phy_fec(pi, &config, req_fec); + config.caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; + + if (ice_aq_set_phy_cfg(pi->hw, pi, &config, NULL)) + return -EAGAIN; + + /* Save requested FEC config */ + pi->phy.curr_user_fec_req = req_fec; + + return 0; +} +#ifdef ETHTOOL_GFECPARAM /** - * ice_get_settings_link_up - Get Link settings for when link is up - * @ks: ethtool ksettings to fill in + * ice_set_fecparam - Set FEC link options * @netdev: network interface device structure + * @fecparam: Ethtool structure to retrieve FEC parameters */ -static void -ice_get_settings_link_up(struct ethtool_link_ksettings *ks, - struct net_device *netdev) +static int +ice_set_fecparam(struct net_device *netdev, struct ethtool_fecparam *fecparam) { struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_port_info *pi = np->vsi->port_info; - struct ethtool_link_ksettings cap_ksettings; - struct ice_link_status *link_info; struct ice_vsi *vsi = np->vsi; - bool unrecog_phy_high = false; - bool unrecog_phy_low = false; - - link_info = &vsi->port_info->phy.link_info; + enum ice_fec_mode fec; - /* Initialize supported and advertised settings based on PHY settings */ - switch (link_info->phy_type_low) { - case ICE_PHY_TYPE_LOW_100BASE_TX: + switch (fecparam->fec) { + case ETHTOOL_FEC_AUTO: + fec = ICE_FEC_AUTO; + break; + case ETHTOOL_FEC_RS: + fec = ICE_FEC_RS; + break; + case ETHTOOL_FEC_BASER: + fec = ICE_FEC_BASER; + break; + case ETHTOOL_FEC_OFF: + case ETHTOOL_FEC_NONE: + fec = ICE_FEC_NONE; + break; + default: + dev_warn(ice_pf_to_dev(vsi->back), "Unsupported FEC mode: %d\n", + fecparam->fec); + return -EINVAL; + } + + return ice_set_fec_cfg(netdev, fec); +} + +/** + * ice_get_fecparam - Get link FEC options + * @netdev: network interface device structure + * @fecparam: Ethtool structure to retrieve FEC parameters + */ +static int +ice_get_fecparam(struct net_device *netdev, struct ethtool_fecparam *fecparam) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_aqc_get_phy_caps_data *caps; + struct ice_link_status *link_info; + struct ice_vsi *vsi = np->vsi; + struct ice_port_info *pi; + enum ice_status status; + int err = 0; + + pi = vsi->port_info; + + if (!pi) + return -EOPNOTSUPP; + link_info = &pi->phy.link_info; + + /* Set FEC mode based on negotiated link info */ + switch (link_info->fec_info) { + case ICE_AQ_LINK_25G_KR_FEC_EN: + fecparam->active_fec = ETHTOOL_FEC_BASER; + 
break; + case ICE_AQ_LINK_25G_RS_528_FEC_EN: + case ICE_AQ_LINK_25G_RS_544_FEC_EN: + fecparam->active_fec = ETHTOOL_FEC_RS; + break; + default: + fecparam->active_fec = ETHTOOL_FEC_OFF; + break; + } + + caps = kzalloc(sizeof(*caps), GFP_KERNEL); + if (!caps) + return -ENOMEM; + + status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP_MEDIA, + caps, NULL); + if (status) { + err = -EAGAIN; + goto done; + } + + /* Set supported/configured FEC modes based on PHY capability */ + if (caps->caps & ICE_AQC_PHY_EN_AUTO_FEC) + fecparam->fec |= ETHTOOL_FEC_AUTO; + if (caps->link_fec_options & ICE_AQC_PHY_FEC_10G_KR_40G_KR4_EN || + caps->link_fec_options & ICE_AQC_PHY_FEC_10G_KR_40G_KR4_REQ || + caps->link_fec_options & ICE_AQC_PHY_FEC_25G_KR_CLAUSE74_EN || + caps->link_fec_options & ICE_AQC_PHY_FEC_25G_KR_REQ) + fecparam->fec |= ETHTOOL_FEC_BASER; + if (caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_528_REQ || + caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_544_REQ || + caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_CLAUSE91_EN) + fecparam->fec |= ETHTOOL_FEC_RS; + if (caps->link_fec_options == 0) + fecparam->fec |= ETHTOOL_FEC_OFF; + +done: + kfree(caps); + return err; +} +#endif /* ETHTOOL_GFECPARAM */ + +/** + * ice_nway_reset - restart autonegotiation + * @netdev: network interface device structure + */ +static int ice_nway_reset(struct net_device *netdev) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + int err; + + /* If VSI state is up, then restart autoneg with link up */ + if (!test_bit(ICE_DOWN, vsi->back->state)) + err = ice_set_link(vsi, true); + else + err = ice_set_link(vsi, false); + + return err; +} + +/** + * ice_get_priv_flags - report device private flags + * @netdev: network interface device structure + * + * The get string set count and the string set should be matched for each + * flag returned. Add new strings for each flag to the ice_gstrings_priv_flags + * array. + * + * Returns a u32 bitmap of flags. + */ +static u32 ice_get_priv_flags(struct net_device *netdev) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + u32 i, ret_flags = 0; + + for (i = 0; i < ICE_PRIV_FLAG_ARRAY_SIZE; i++) { + const struct ice_priv_flag *priv_flag; + + priv_flag = &ice_gstrings_priv_flags[i]; + + if (test_bit(priv_flag->bitno, pf->flags)) + ret_flags |= BIT(i); + } + + return ret_flags; +} + +#ifdef NETIF_F_HW_TC +/** + * ice_recfg_chnl_vsis - reconfig channel VSIs + * @pf: ptr to PF + * @vsi: ptr to main VSI + * + * This function adjust ADQ VSI's feature flags based on changes + * in private flag setting - to avoid stale bits in per ADQ VSI's + * feature flag. 
+ */ +static void ice_recfg_chnl_vsis(struct ice_pf *pf, struct ice_vsi *vsi) +{ + struct ice_channel *ch; + + /* Nothing to be done if there is no active ADQ config */ + if (!ice_is_adq_active(pf)) + return; + + list_for_each_entry(ch, &vsi->ch_list, list) { + struct ice_vsi *ch_vsi; + + ch_vsi = ch->ch_vsi; + if (!ch_vsi) + continue; + /* set/clear VSI level feature flag for ADQ (aka channel) VSIs + * based on PF level private flags + */ + if (test_bit(ICE_FLAG_CHNL_PKT_INSPECT_OPT_ENA, pf->flags)) + set_bit(ICE_CHNL_FEATURE_PKT_INSPECT_OPT_ENA, + ch_vsi->features); + else + clear_bit(ICE_CHNL_FEATURE_PKT_INSPECT_OPT_ENA, + ch_vsi->features); + + /* set/clear VSI level feature flag for ADQ (aka channel) VSIs + * based on PF level private flags: this flag meant to harvest + * clean of Rx queue upon busy_poll stop and after that clean + * once only. + */ + if (test_bit(ICE_FLAG_CHNL_PKT_CLEAN_BP_STOP_ENA, pf->flags)) + set_bit(ICE_CHNL_FEATURE_PKT_CLEAN_BP_STOP_ENA, + ch_vsi->features); + else + clear_bit(ICE_CHNL_FEATURE_PKT_CLEAN_BP_STOP_ENA, + ch_vsi->features); + + /* set/clear inline flow-director bits for ADQ (aka channel) + * VSIs based on PF level private flags + */ + if (test_bit(ICE_FLAG_CHNL_INLINE_FD_ENA, pf->flags)) + set_bit(ICE_CHNL_FEATURE_INLINE_FD_ENA, + ch_vsi->features); + else + clear_bit(ICE_CHNL_FEATURE_INLINE_FD_ENA, + ch_vsi->features); + if (test_bit(ICE_FLAG_CHNL_INLINE_FD_MARK_ENA, pf->flags)) + set_bit(ICE_CHNL_FEATURE_INLINE_FD_MARK_ENA, + ch_vsi->features); + else + clear_bit(ICE_CHNL_FEATURE_INLINE_FD_MARK_ENA, + ch_vsi->features); + } +} + +/** + * ice_recfg_vsi - reconfig specified VSI + * @pf: ptr to PF + * @vsi: ptr to main VSI + * + * Set up per vector configurable param which allows cleanup of Tx and + * Rx packets upto that many time if napi_schedule is invoked after + * busy_poll_stop (where driver returned "budget") based on driver maintained + * state for ADQ specific vector. 
+ */ +static void ice_recfg_vsi(struct ice_pf *pf, struct ice_vsi *vsi) +{ + int q_vectors = vsi->num_q_vectors; + int vector; + + if (!q_vectors) + return; + + for (vector = 0; vector < q_vectors; vector++) { + struct ice_q_vector *qv = vsi->q_vectors[vector]; + + if (!qv) + continue; + if (test_bit(ICE_FLAG_CHNL_PKT_CLEAN_BP_STOP_CFG, pf->flags)) + qv->max_limit_process_rx_queues = + ICE_MAX_LIMIT_PROCESS_RX_PKTS; + else + qv->max_limit_process_rx_queues = + ICE_MAX_LIMIT_PROCESS_RX_PKTS_DFLT; + } +} +#endif /* ADQ_SUPPORT */ + +/** + * ice_set_priv_flags - set private flags + * @netdev: network interface device structure + * @flags: bit flags to be set + */ +static int ice_set_priv_flags(struct net_device *netdev, u32 flags) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + DECLARE_BITMAP(change_flags, ICE_PF_FLAGS_NBITS); + DECLARE_BITMAP(orig_flags, ICE_PF_FLAGS_NBITS); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + struct device *dev; + int ret = 0; + u32 i; + + if (flags > BIT(ICE_PRIV_FLAG_ARRAY_SIZE)) + return -EINVAL; + + dev = ice_pf_to_dev(pf); + set_bit(ICE_FLAG_ETHTOOL_CTXT, pf->flags); + + bitmap_copy(orig_flags, pf->flags, ICE_PF_FLAGS_NBITS); + for (i = 0; i < ICE_PRIV_FLAG_ARRAY_SIZE; i++) { + const struct ice_priv_flag *priv_flag; + + priv_flag = &ice_gstrings_priv_flags[i]; + + if (flags & BIT(i)) + set_bit(priv_flag->bitno, pf->flags); + else + clear_bit(priv_flag->bitno, pf->flags); + } + +#ifdef NETIF_F_HW_TC + ice_recfg_chnl_vsis(pf, vsi); + ice_recfg_vsi(pf, vsi); +#endif /* ADQ_SUPPORT */ + + bitmap_xor(change_flags, pf->flags, orig_flags, ICE_PF_FLAGS_NBITS); + +#ifndef ETHTOOL_GFECPARAM + if (test_bit(ICE_FLAG_RS_FEC, change_flags) || + test_bit(ICE_FLAG_BASE_R_FEC, change_flags)) { + enum ice_fec_mode fec = ICE_FEC_NONE; + int err; + + /* Check if FEC is supported */ + if (pf->hw.device_id != ICE_DEV_ID_E810C_BACKPLANE && + pf->hw.device_id != ICE_DEV_ID_E810C_QSFP && + pf->hw.device_id != ICE_DEV_ID_E810C_SFP) { + dev_warn(dev, "Device does not support changing FEC configuration\n"); + ret = -EOPNOTSUPP; + goto ethtool_exit; + } + + /* Set FEC configuration */ + if (test_bit(ICE_FLAG_RS_FEC, pf->flags) && + test_bit(ICE_FLAG_BASE_R_FEC, pf->flags)) + fec = ICE_FEC_AUTO; + else if (test_bit(ICE_FLAG_RS_FEC, pf->flags)) + fec = ICE_FEC_RS; + else if (test_bit(ICE_FLAG_BASE_R_FEC, pf->flags)) + fec = ICE_FEC_BASER; + + err = ice_set_fec_cfg(netdev, fec); + + /* If FEC configuration fails, restore original FEC flags */ + if (err) { + if (test_bit(ICE_FLAG_BASE_R_FEC, orig_flags)) + set_bit(ICE_FLAG_BASE_R_FEC, pf->flags); + else + clear_bit(ICE_FLAG_BASE_R_FEC, pf->flags); + + if (test_bit(ICE_FLAG_RS_FEC, orig_flags)) + set_bit(ICE_FLAG_RS_FEC, pf->flags); + else + clear_bit(ICE_FLAG_RS_FEC, pf->flags); + + ret = err; + goto ethtool_exit; + } + } +#endif /* !ETHTOOL_GFECPARAM */ + + /* Do not allow change to link-down-on-close when Total Port Shutdown + * is enabled. 
+ */ + if (test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, change_flags) && + test_bit(ICE_FLAG_TOTAL_PORT_SHUTDOWN_ENA, pf->flags)) { + dev_err(dev, "Setting link-down-on-close not supported on this port\n"); + set_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, pf->flags); + ret = -EINVAL; + goto ethtool_exit; + } + + if (test_bit(ICE_FLAG_FW_LLDP_AGENT, change_flags)) { + if (!test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags)) { + enum ice_status status; + + /* Disable FW LLDP engine */ + status = ice_cfg_lldp_mib_change(&pf->hw, false); + + /* If unregistering for LLDP events fails, this is + * not an error state, as there shouldn't be any + * events to respond to. + */ + if (status) + dev_info(dev, "Failed to unreg for LLDP events\n"); + + /* The AQ call to stop the FW LLDP agent will generate + * an error if the agent is already stopped. + */ + status = ice_aq_stop_lldp(&pf->hw, true, true, NULL); + if (status) + dev_warn(dev, "Fail to stop LLDP agent\n"); + /* Use case for having the FW LLDP agent stopped + * will likely not need DCB, so failure to init is + * not a concern of ethtool + */ + status = ice_init_pf_dcb(pf, true); + if (status) + dev_warn(dev, "Fail to init DCB\n"); + + pf->dcbx_cap &= ~DCB_CAP_DCBX_LLD_MANAGED; + pf->dcbx_cap |= DCB_CAP_DCBX_HOST; + } else { + enum ice_status status; + bool dcbx_agent_status; + +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + dev_err(dev, "Disable ADQ and try again ex:'tc qdisc del dev root'\n"); + /* fw-lldp flag is set without checking if + * the operation is successful or not, so + * clear this flag when it fails + */ + clear_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags); + ret = -EOPNOTSUPP; + goto ethtool_exit; + } +#endif /* NETIF_F_HW_TC */ + if (ice_get_pfc_mode(pf) == ICE_QOS_MODE_DSCP) { + clear_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags); + dev_err(dev, "QoS in L3 DSCP mode, FW Agent not allowed to start\n"); + ret = -EOPNOTSUPP; + goto ethtool_exit; + } + + /* Remove rule to direct LLDP packets to default VSI. + * The FW LLDP engine will now be consuming them. + */ + ice_cfg_sw_lldp(vsi, false, false); + + /* AQ command to start FW LLDP agent will return an + * error if the agent is already started + */ + status = ice_aq_start_lldp(&pf->hw, true, NULL); + if (status) + dev_warn(dev, "Fail to start LLDP Agent\n"); + + /* AQ command to start FW DCBX agent will fail if + * the agent is already started + */ + status = ice_aq_start_stop_dcbx(&pf->hw, true, + &dcbx_agent_status, + NULL); + if (status) + dev_dbg(dev, "Failed to start FW DCBX\n"); + + dev_info(dev, "FW DCBX agent is %s\n", + dcbx_agent_status ? "ACTIVE" : "DISABLED"); + + /* Failure to configure MIB change or init DCB is not + * relevant to ethtool. Print notification that + * registration/init failed but do not return error + * state to ethtool + */ + status = ice_init_pf_dcb(pf, true); + if (status) + dev_dbg(dev, "Fail to init DCB\n"); + + /* Register for MIB change events */ + status = ice_cfg_lldp_mib_change(&pf->hw, true); + if (status) + dev_dbg(dev, "Fail to enable MIB change events\n"); + + pf->dcbx_cap &= ~DCB_CAP_DCBX_HOST; + pf->dcbx_cap |= DCB_CAP_DCBX_LLD_MANAGED; + + ice_nway_reset(netdev); + } + } + if (test_bit(ICE_FLAG_LEGACY_RX, change_flags)) { + /* down and up VSI so that changes of Rx cfg are reflected. 
*/ + ice_down(vsi); + ice_up(vsi); + } + /* don't allow modification of this flag when a single VF is in + * promiscuous mode because it's not supported + */ + if (test_bit(ICE_FLAG_VF_TRUE_PROMISC_ENA, change_flags) && + ice_is_any_vf_in_promisc(pf)) { + dev_err(dev, "Changing vf-true-promisc-support flag while VF(s) are in promiscuous mode not supported\n"); + /* toggle bit back to previous state */ + change_bit(ICE_FLAG_VF_TRUE_PROMISC_ENA, pf->flags); + ret = -EAGAIN; + } + + if (test_bit(ICE_FLAG_VF_VLAN_PRUNE_DIS, change_flags) && + pf->num_alloc_vfs) { + dev_err(dev, "Changing vf-vlan-prune-disable flag while VF(s) are active is not supported\n"); + /* toggle bit back to previous state */ + change_bit(ICE_FLAG_VF_VLAN_PRUNE_DIS, change_flags); + ret = -EOPNOTSUPP; + } +ethtool_exit: + clear_bit(ICE_FLAG_ETHTOOL_CTXT, pf->flags); + return ret; +} + +static int ice_get_sset_count(struct net_device *netdev, int sset) +{ + switch (sset) { + case ETH_SS_STATS: + /* The number (and order) of strings reported *must* remain + * constant for a given netdevice. This function must not + * report a different number based on run time parameters + * (such as the number of queues in use, or the setting of + * a private ethtool flag). This is due to the nature of the + * ethtool stats API. + * + * Userspace programs such as ethtool must make 3 separate + * ioctl requests, one for size, one for the strings, and + * finally one for the stats. Since these cross into + * userspace, changes to the number or size could result in + * undefined memory access or incorrect string<->value + * correlations for statistics. + * + * Even if it appears to be safe, changes to the size or + * order of strings will suffer from race conditions and are + * not safe. + */ + if (ice_is_port_repr_netdev(netdev)) + return ICE_VSI_STATS_LEN; + else + return ICE_ALL_STATS_LEN(netdev); + case ETH_SS_TEST: + return ICE_TEST_LEN; + case ETH_SS_PRIV_FLAGS: + return ICE_PRIV_FLAG_ARRAY_SIZE; + default: + return -EOPNOTSUPP; + } +} + +static void +ice_get_ethtool_stats(struct net_device *netdev, + struct ethtool_stats __always_unused *stats, u64 *data) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = ice_get_netdev_priv_vsi(np); + struct ice_pf *pf = vsi->back; + struct ice_ring *ring; + unsigned int j; + int i = 0; + char *p; + + ice_update_pf_stats(pf); + ice_update_vsi_stats(vsi); + + for (j = 0; j < ICE_VSI_STATS_LEN; j++) { + p = (char *)vsi + ice_gstrings_vsi_stats[j].stat_offset; + data[i++] = (ice_gstrings_vsi_stats[j].sizeof_stat == + sizeof(u64)) ? 
*(u64 *)p : *(u32 *)p; + } + if (ice_is_port_repr_netdev(netdev)) + return; + /* populate per queue stats */ + rcu_read_lock(); + + ice_for_each_alloc_txq(vsi, j) { + ring = READ_ONCE(vsi->tx_rings[j]); + if (ring) { + data[i++] = ring->stats.pkts; + data[i++] = ring->stats.bytes; +#ifdef ICE_ADD_PROBES + data[i++] = ring->stats.napi_poll_cnt; +#endif /* ICE_ADD_PROBES */ +#ifdef ADQ_PERF_COUNTERS + ice_get_chnl_tx_stats(vsi, j, data, &i, true); +#endif /* ADQ_PERF_COUNTERS */ + } else { + data[i++] = 0; + data[i++] = 0; +#ifdef ADQ_PERF_COUNTERS + ice_get_chnl_tx_stats(vsi, j, data, &i, false); +#endif /* ADQ_PERF_COUNTERS */ + } + } +#ifdef HAVE_XDP_SUPPORT +#ifdef ICE_ADD_PROBES + ice_get_xdp_tx_stats(vsi, data, &i); +#endif /* ICE_ADD_PROBES */ +#endif /* HAVE_XDP_SUPPORT */ +#ifdef ICE_ADD_PROBES +#ifdef HAVE_NETDEV_SB_DEV + ice_get_macvlan_tx_stats(vsi->back, data, &i); +#endif /* HAVE_NETDEV_SB_DEV */ +#endif /* ICE_ADD_PROBES */ + + ice_for_each_alloc_rxq(vsi, j) { + ring = READ_ONCE(vsi->rx_rings[j]); + if (ring) { + data[i++] = ring->stats.pkts; + data[i++] = ring->stats.bytes; +#ifdef ICE_ADD_PROBES + data[i++] = ring->stats.napi_poll_cnt; +#endif /* ICE_ADD_PROBES */ +#ifdef ADQ_PERF_COUNTERS + ice_get_chnl_rx_stats(vsi, j, data, &i, true); +#endif /* ADQ_PERF_COUNTERS */ +#ifdef HAVE_XDP_SUPPORT +#ifdef ICE_ADD_PROBES + ice_get_xdp_rx_stats(&ring->xdp_stats, data, &i); +#endif /* ICE_ADD_PROBES */ +#endif /* HAVE_XDP_SUPPORT */ + } else { + data[i++] = 0; + data[i++] = 0; +#ifdef ADQ_PERF_COUNTERS + ice_get_chnl_rx_stats(vsi, j, data, &i, false); +#endif /* ADQ_PERF_COUNTERS */ +#ifdef HAVE_XDP_SUPPORT +#ifdef ICE_ADD_PROBES + ice_get_xdp_rx_stats(NULL, data, &i); +#endif /* ICE_ADD_PROBES */ +#endif /* HAVE_XDP_SUPPORT */ + } + } +#ifdef ICE_ADD_PROBES +#ifdef HAVE_NETDEV_SB_DEV + ice_get_macvlan_rx_stats(vsi->back, data, &i); +#endif /* HAVE_NETDEV_SB_DEV */ +#endif /* ICE_ADD_PROBES */ + + rcu_read_unlock(); + + if (vsi->type != ICE_VSI_PF) + return; + + for (j = 0; j < ICE_PF_STATS_LEN; j++) { + p = (char *)pf + ice_gstrings_pf_stats[j].stat_offset; + data[i++] = (ice_gstrings_pf_stats[j].sizeof_stat == + sizeof(u64)) ? 
*(u64 *)p : *(u32 *)p; + } + + for (j = 0; j < ICE_MAX_USER_PRIORITY; j++) { + data[i++] = pf->stats.priority_xon_tx[j]; + data[i++] = pf->stats.priority_xoff_tx[j]; + } + + for (j = 0; j < ICE_MAX_USER_PRIORITY; j++) { + data[i++] = pf->stats.priority_xon_rx[j]; + data[i++] = pf->stats.priority_xoff_rx[j]; + } +} + +#define ICE_PHY_TYPE_LOW_MASK_MIN_1G (ICE_PHY_TYPE_LOW_100BASE_TX | \ + ICE_PHY_TYPE_LOW_100M_SGMII) + +#define ICE_PHY_TYPE_LOW_MASK_MIN_25G (ICE_PHY_TYPE_LOW_MASK_MIN_1G | \ + ICE_PHY_TYPE_LOW_1000BASE_T | \ + ICE_PHY_TYPE_LOW_1000BASE_SX | \ + ICE_PHY_TYPE_LOW_1000BASE_LX | \ + ICE_PHY_TYPE_LOW_1000BASE_KX | \ + ICE_PHY_TYPE_LOW_1G_SGMII | \ + ICE_PHY_TYPE_LOW_2500BASE_T | \ + ICE_PHY_TYPE_LOW_2500BASE_X | \ + ICE_PHY_TYPE_LOW_2500BASE_KX | \ + ICE_PHY_TYPE_LOW_5GBASE_T | \ + ICE_PHY_TYPE_LOW_5GBASE_KR | \ + ICE_PHY_TYPE_LOW_10GBASE_T | \ + ICE_PHY_TYPE_LOW_10G_SFI_DA | \ + ICE_PHY_TYPE_LOW_10GBASE_SR | \ + ICE_PHY_TYPE_LOW_10GBASE_LR | \ + ICE_PHY_TYPE_LOW_10GBASE_KR_CR1 | \ + ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC | \ + ICE_PHY_TYPE_LOW_10G_SFI_C2C) + +#define ICE_PHY_TYPE_LOW_MASK_100G (ICE_PHY_TYPE_LOW_100GBASE_CR4 | \ + ICE_PHY_TYPE_LOW_100GBASE_SR4 | \ + ICE_PHY_TYPE_LOW_100GBASE_LR4 | \ + ICE_PHY_TYPE_LOW_100GBASE_KR4 | \ + ICE_PHY_TYPE_LOW_100G_CAUI4_AOC_ACC | \ + ICE_PHY_TYPE_LOW_100G_CAUI4 | \ + ICE_PHY_TYPE_LOW_100G_AUI4_AOC_ACC | \ + ICE_PHY_TYPE_LOW_100G_AUI4 | \ + ICE_PHY_TYPE_LOW_100GBASE_CR_PAM4 | \ + ICE_PHY_TYPE_LOW_100GBASE_KR_PAM4 | \ + ICE_PHY_TYPE_LOW_100GBASE_CP2 | \ + ICE_PHY_TYPE_LOW_100GBASE_SR2 | \ + ICE_PHY_TYPE_LOW_100GBASE_DR) + +#define ICE_PHY_TYPE_HIGH_MASK_100G (ICE_PHY_TYPE_HIGH_100GBASE_KR2_PAM4 | \ + ICE_PHY_TYPE_HIGH_100G_CAUI2_AOC_ACC |\ + ICE_PHY_TYPE_HIGH_100G_CAUI2 | \ + ICE_PHY_TYPE_HIGH_100G_AUI2_AOC_ACC | \ + ICE_PHY_TYPE_HIGH_100G_AUI2) + +#ifdef HAVE_ETHTOOL_100G_BITS +/** + * ice_mask_min_supported_speeds + * @phy_types_high: PHY type high + * @phy_types_low: PHY type low to apply minimum supported speeds mask + * + * Apply minimum supported speeds mask to PHY type low. These are the speeds + * for ethtool supported link mode. + */ +static +void ice_mask_min_supported_speeds(u64 phy_types_high, u64 *phy_types_low) +#else +static void ice_mask_min_supported_speeds(u64 *phy_types_low) +#endif /* !HAVE_ETHTOOL_100G_BITS */ +{ + /* if QSFP connection with 100G speed, minimum supported speed is 25G */ +#ifdef HAVE_ETHTOOL_100G_BITS + if (*phy_types_low & ICE_PHY_TYPE_LOW_MASK_100G || + phy_types_high & ICE_PHY_TYPE_HIGH_MASK_100G) +#else /* HAVE_ETHTOOL_100G_BITS */ + if (*phy_types_low & ICE_PHY_TYPE_LOW_MASK_100G) +#endif /* !HAVE_ETHTOOL_100G_BITS */ + *phy_types_low &= ~ICE_PHY_TYPE_LOW_MASK_MIN_25G; + else + *phy_types_low &= ~ICE_PHY_TYPE_LOW_MASK_MIN_1G; +} + +#ifdef HAVE_ETHTOOL_100G_BITS +#define ice_ethtool_advertise_link_mode(aq_link_speed, ethtool_link_mode) \ + do { \ + if (req_speeds & (aq_link_speed) || \ + (!req_speeds && \ + (advert_phy_type_lo & phy_type_mask_lo || \ + advert_phy_type_hi & phy_type_mask_hi))) \ + ethtool_link_ksettings_add_link_mode(ks, advertising,\ + ethtool_link_mode); \ + } while (0) +#else /* HAVE_ETHTOOL_100G_BITS */ +#define ice_ethtool_advertise_link_mode(aq_link_speed, ethtool_link_mode) \ + do { \ + if (req_speeds & (aq_link_speed) || \ + (req_speeds && advert_phy_type_lo & phy_type_mask_lo)) \ + ethtool_link_ksettings_add_link_mode(ks, advertising,\ + ethtool_link_mode); \ + } while (0) +#endif /* ! 
HAVE_ETHTOOL_100G_BITS */ + +/** + * ice_phy_type_to_ethtool - convert the phy_types to ethtool link modes + * @netdev: network interface device structure + * @ks: ethtool link ksettings struct to fill out + */ +static void +ice_phy_type_to_ethtool(struct net_device *netdev, + struct ethtool_link_ksettings *ks) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + u64 phy_type_mask_lo = 0; +#ifdef HAVE_ETHTOOL_100G_BITS + u64 phy_type_mask_hi = 0; +#endif /* HAVE_ETHTOOL_100G_BITS */ + u64 advert_phy_type_lo = 0; +#ifdef HAVE_ETHTOOL_100G_BITS + u64 advert_phy_type_hi = 0; + u64 phy_types_high = 0; +#endif /* HAVE_ETHTOOL_100G_BITS */ + u64 phy_types_low = 0; + u16 req_speeds; + + req_speeds = vsi->port_info->phy.link_info.req_speeds; + + /* Check if lenient mode is supported and enabled, or in strict mode. + * + * In lenient mode the Supported link modes are the PHY types without + * media. The Advertising link mode is either 1. the user requested + * speed, 2. the override PHY mask, or 3. the PHY types with media. + * + * In strict mode Supported link mode are the PHY type with media, + * and Advertising link modes are the media PHY type or the speed + * requested by user. + */ + if (test_bit(ICE_FLAG_LINK_LENIENT_MODE_ENA, pf->flags)) { + phy_types_low = le64_to_cpu(pf->nvm_phy_type_lo); +#ifdef HAVE_ETHTOOL_100G_BITS + phy_types_high = le64_to_cpu(pf->nvm_phy_type_hi); + + ice_mask_min_supported_speeds(phy_types_high, &phy_types_low); +#else /* HAVE_ETHTOOL_100G_BITS */ + ice_mask_min_supported_speeds(&phy_types_low); +#endif /* !HAVE_ETHTOOL_100G_BITS */ + /* determine advertised modes based on link override only + * if it's supported and if the FW doesn't abstract the + * driver from having to account for link overrides + */ + if (ice_fw_supports_link_override(&pf->hw) && + !ice_fw_supports_report_dflt_cfg(&pf->hw)) { + struct ice_link_default_override_tlv *ldo; + + ldo = &pf->link_dflt_override; + /* If override enabled and PHY mask set, then + * Advertising link mode is the intersection of the PHY + * types without media and the override PHY mask. + */ + if (ldo->options & ICE_LINK_OVERRIDE_EN && + (ldo->phy_type_low || ldo->phy_type_high)) { + advert_phy_type_lo = + le64_to_cpu(pf->nvm_phy_type_lo) & + ldo->phy_type_low; +#ifdef HAVE_ETHTOOL_100G_BITS + advert_phy_type_hi = + le64_to_cpu(pf->nvm_phy_type_hi) & + ldo->phy_type_high; +#endif /* HAVE_ETHTOOL_100G_BITS */ + } + } + } else { + /* strict mode */ + phy_types_low = vsi->port_info->phy.phy_type_low; +#ifdef HAVE_ETHTOOL_100G_BITS + phy_types_high = vsi->port_info->phy.phy_type_high; +#endif /* HAVE_ETHTOOL_100G_BITS */ + } + + /* If Advertising link mode PHY type is not using override PHY type, + * then use PHY type with media. 
+ */ +#ifdef HAVE_ETHTOOL_100G_BITS + if (!advert_phy_type_lo && !advert_phy_type_hi) { + advert_phy_type_lo = vsi->port_info->phy.phy_type_low; + advert_phy_type_hi = vsi->port_info->phy.phy_type_high; + } +#else + if (!advert_phy_type_lo) + advert_phy_type_lo = vsi->port_info->phy.phy_type_low; +#endif /* !HAVE_ETHTOOL_100G_BITS */ + + ethtool_link_ksettings_zero_link_mode(ks, supported); + ethtool_link_ksettings_zero_link_mode(ks, advertising); + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_100BASE_TX | + ICE_PHY_TYPE_LOW_100M_SGMII; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 100baseT_Full); + + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_100MB, + 100baseT_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_1000BASE_T | + ICE_PHY_TYPE_LOW_1G_SGMII; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 1000baseT_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_1000MB, + 1000baseT_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_1000BASE_KX; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 1000baseKX_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_1000MB, + 1000baseKX_Full); + } +#ifdef HAVE_ETHTOOL_NEW_1G_BITS + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_1000BASE_SX | + ICE_PHY_TYPE_LOW_1000BASE_LX; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 1000baseX_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_1000MB, + 1000baseX_Full); + } +#else + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_1000BASE_SX | + ICE_PHY_TYPE_LOW_1000BASE_LX; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 1000baseT_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_1000MB, + 1000baseT_Full); + } +#endif /* HAVE_ETHTOOL_NEW_1G_BITS */ +#ifdef HAVE_ETHTOOL_NEW_2500MB_BITS + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_2500BASE_T; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 2500baseT_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_2500MB, + 2500baseT_Full); + } +#else + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_2500BASE_T; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 2500baseX_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_2500MB, + 2500baseX_Full); + } +#endif /* HAVE_ETHTOOL_NEW_2500MB_BITS */ + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_2500BASE_X | + ICE_PHY_TYPE_LOW_2500BASE_KX; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 2500baseX_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_2500MB, + 2500baseX_Full); + } +#ifdef HAVE_ETHTOOL_5G_BITS + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_5GBASE_T | + ICE_PHY_TYPE_LOW_5GBASE_KR; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 5000baseT_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_5GB, + 5000baseT_Full); + } +#endif /* HAVE_ETHTOOL_5G_BITS */ + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_10GBASE_T | + ICE_PHY_TYPE_LOW_10G_SFI_DA | + ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseT_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_10GB, + 10000baseT_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_10GBASE_KR_CR1 | + ICE_PHY_TYPE_LOW_10G_SFI_C2C; + if (phy_types_low & 
phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseKR_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_10GB, + 10000baseKR_Full); + } +#ifdef HAVE_ETHTOOL_NEW_10G_BITS + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_10GBASE_SR; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseSR_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_10GB, + 10000baseSR_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_10GBASE_LR; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseLR_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_10GB, + 10000baseLR_Full); + } +#else + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_10GBASE_SR | + ICE_PHY_TYPE_LOW_10GBASE_LR; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseT_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_10GB, + 10000baseT_Full); + } +#endif /* HAVE_ETHTOOL_NEW_10G_BITS */ +#ifdef HAVE_ETHTOOL_25G_BITS + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_25GBASE_T | + ICE_PHY_TYPE_LOW_25GBASE_CR | + ICE_PHY_TYPE_LOW_25GBASE_CR_S | + ICE_PHY_TYPE_LOW_25GBASE_CR1 | + ICE_PHY_TYPE_LOW_25G_AUI_AOC_ACC; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 25000baseCR_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_25GB, + 25000baseCR_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_25GBASE_SR | + ICE_PHY_TYPE_LOW_25GBASE_LR; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 25000baseSR_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_25GB, + 25000baseSR_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_25GBASE_KR | + ICE_PHY_TYPE_LOW_25GBASE_KR_S | + ICE_PHY_TYPE_LOW_25GBASE_KR1 | + ICE_PHY_TYPE_LOW_25G_AUI_C2C; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 25000baseKR_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_25GB, + 25000baseKR_Full); + } +#endif /* HAVE_ETHTOOL_25G_BITS */ + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_40GBASE_KR4; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 40000baseKR4_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_40GB, + 40000baseKR4_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_40GBASE_CR4 | + ICE_PHY_TYPE_LOW_40G_XLAUI_AOC_ACC | + ICE_PHY_TYPE_LOW_40G_XLAUI; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 40000baseCR4_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_40GB, + 40000baseCR4_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_40GBASE_SR4; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 40000baseSR4_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_40GB, + 40000baseSR4_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_40GBASE_LR4; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 40000baseLR4_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_40GB, + 40000baseLR4_Full); + } +#ifdef HAVE_ETHTOOL_50G_BITS + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_50GBASE_CR2 | + ICE_PHY_TYPE_LOW_50G_LAUI2_AOC_ACC | + ICE_PHY_TYPE_LOW_50G_LAUI2 | + ICE_PHY_TYPE_LOW_50G_AUI2_AOC_ACC | + ICE_PHY_TYPE_LOW_50G_AUI2 | + ICE_PHY_TYPE_LOW_50GBASE_CP | + ICE_PHY_TYPE_LOW_50GBASE_SR | + ICE_PHY_TYPE_LOW_50G_AUI1_AOC_ACC | + 
ICE_PHY_TYPE_LOW_50G_AUI1; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 50000baseCR2_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_50GB, + 50000baseCR2_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_50GBASE_KR2 | + ICE_PHY_TYPE_LOW_50GBASE_KR_PAM4; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 50000baseKR2_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_50GB, + 50000baseKR2_Full); + } +#endif /* HAVE_ETHTOOL_50G_BITS */ +#ifdef HAVE_ETHTOOL_NEW_50G_BITS + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_50GBASE_SR2 | + ICE_PHY_TYPE_LOW_50GBASE_LR2 | + ICE_PHY_TYPE_LOW_50GBASE_FR | + ICE_PHY_TYPE_LOW_50GBASE_LR; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 50000baseSR2_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_50GB, + 50000baseSR2_Full); + } +#endif /* HAVE_ETHTOOL_NEW_50G_BITS */ +#ifdef HAVE_ETHTOOL_100G_BITS + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_100GBASE_CR4 | + ICE_PHY_TYPE_LOW_100G_CAUI4_AOC_ACC | + ICE_PHY_TYPE_LOW_100G_CAUI4 | + ICE_PHY_TYPE_LOW_100G_AUI4_AOC_ACC | + ICE_PHY_TYPE_LOW_100G_AUI4 | + ICE_PHY_TYPE_LOW_100GBASE_CR_PAM4 | + ICE_PHY_TYPE_LOW_100GBASE_CP2; + phy_type_mask_hi = ICE_PHY_TYPE_HIGH_100G_CAUI2_AOC_ACC | + ICE_PHY_TYPE_HIGH_100G_CAUI2 | + ICE_PHY_TYPE_HIGH_100G_AUI2_AOC_ACC | + ICE_PHY_TYPE_HIGH_100G_AUI2; + if (phy_types_low & phy_type_mask_lo || + phy_types_high & phy_type_mask_hi) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 100000baseCR4_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_100GB, + 100000baseCR4_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_100GBASE_SR4 | + ICE_PHY_TYPE_LOW_100GBASE_SR2; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 100000baseSR4_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_100GB, + 100000baseSR4_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_100GBASE_LR4 | + ICE_PHY_TYPE_LOW_100GBASE_DR; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 100000baseLR4_ER4_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_100GB, + 100000baseLR4_ER4_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_100GBASE_KR4 | + ICE_PHY_TYPE_LOW_100GBASE_KR_PAM4; + phy_type_mask_hi = ICE_PHY_TYPE_HIGH_100GBASE_KR2_PAM4; + if (phy_types_low & phy_type_mask_lo || + phy_types_high & phy_type_mask_hi) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 100000baseKR4_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_100GB, + 100000baseKR4_Full); + } +#endif /* HAVE_ETHTOOL_100G_BITS */ +} + +#define TEST_SET_BITS_TIMEOUT 50 +#define TEST_SET_BITS_SLEEP_MAX 2000 +#define TEST_SET_BITS_SLEEP_MIN 1000 + +#ifdef ETHTOOL_GLINKSETTINGS +/** + * ice_get_settings_link_up - Get Link settings for when link is up + * @ks: ethtool ksettings to fill in + * @netdev: network interface device structure + */ +static void +ice_get_settings_link_up(struct ethtool_link_ksettings *ks, + struct net_device *netdev) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_port_info *pi = np->vsi->port_info; + struct ice_link_status *link_info; + struct ice_vsi *vsi = np->vsi; + + link_info = &vsi->port_info->phy.link_info; + + /* Get supported and advertised settings from PHY ability with media */ + ice_phy_type_to_ethtool(netdev, ks); + + switch (link_info->link_speed) { +#ifdef HAVE_ETHTOOL_100G_BITS + case 
ICE_AQ_LINK_SPEED_100GB: + ks->base.speed = SPEED_100000; + break; +#endif /* HAVE_ETHTOOL_100G_BITS */ +#if defined(HAVE_ETHTOOL_50G_BITS) || defined(HAVE_ETHTOOL_NEW_50G_BITS) + case ICE_AQ_LINK_SPEED_50GB: + ks->base.speed = SPEED_50000; + break; +#endif /* HAVE_ETHTOOL_50G_BITS || HAVE_ETHTOOL_NEW_50G_BITS */ + case ICE_AQ_LINK_SPEED_40GB: + ks->base.speed = SPEED_40000; + break; +#ifdef HAVE_ETHTOOL_25G_BITS + case ICE_AQ_LINK_SPEED_25GB: + ks->base.speed = SPEED_25000; + break; +#endif /* HAVE_ETHTOOL_25G_BITS */ + case ICE_AQ_LINK_SPEED_20GB: + ks->base.speed = SPEED_20000; + break; + case ICE_AQ_LINK_SPEED_10GB: + ks->base.speed = SPEED_10000; + break; +#ifdef HAVE_ETHTOOL_5G_BITS + case ICE_AQ_LINK_SPEED_5GB: + ks->base.speed = SPEED_5000; + break; +#endif /* HAVE_ETHTOOL_5G_BITS */ + case ICE_AQ_LINK_SPEED_2500MB: + ks->base.speed = SPEED_2500; + break; + case ICE_AQ_LINK_SPEED_1000MB: + ks->base.speed = SPEED_1000; + break; + case ICE_AQ_LINK_SPEED_100MB: + ks->base.speed = SPEED_100; + break; + default: + netdev_info(netdev, "WARNING: Unrecognized link_speed (0x%x).\n", + link_info->link_speed); + break; + } + ks->base.duplex = DUPLEX_FULL; + + if (link_info->an_info & ICE_AQ_AN_COMPLETED) + ethtool_link_ksettings_add_link_mode(ks, lp_advertising, + Autoneg); + + /* Set flow control negotiated Rx/Tx pause */ + switch (pi->fc.current_mode) { + case ICE_FC_FULL: + ethtool_link_ksettings_add_link_mode(ks, lp_advertising, Pause); + break; + case ICE_FC_TX_PAUSE: + ethtool_link_ksettings_add_link_mode(ks, lp_advertising, Pause); + ethtool_link_ksettings_add_link_mode(ks, lp_advertising, + Asym_Pause); + break; + case ICE_FC_RX_PAUSE: + ethtool_link_ksettings_add_link_mode(ks, lp_advertising, + Asym_Pause); + break; + case ICE_FC_PFC: + default: + ethtool_link_ksettings_del_link_mode(ks, lp_advertising, Pause); + ethtool_link_ksettings_del_link_mode(ks, lp_advertising, + Asym_Pause); + break; + } +} + +/** + * ice_get_settings_link_down - Get the Link settings when link is down + * @ks: ethtool ksettings to fill in + * @netdev: network interface device structure + * + * Reports link settings that can be determined when link is down + */ +static void +ice_get_settings_link_down(struct ethtool_link_ksettings *ks, + struct net_device *netdev) +{ + /* link is down and the driver needs to fall back on + * supported PHY types to figure out what info to display + */ + ice_phy_type_to_ethtool(netdev, ks); + + /* With no link, speed and duplex are unknown */ + ks->base.speed = SPEED_UNKNOWN; + ks->base.duplex = DUPLEX_UNKNOWN; +} + +/** + * ice_get_link_ksettings - Get Link Speed and Duplex settings + * @netdev: network interface device structure + * @ks: ethtool ksettings + * + * Reports speed/duplex settings based on media_type + */ +static int +ice_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *ks) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_aqc_get_phy_caps_data *caps; + struct ice_link_status *hw_link_info; + struct ice_vsi *vsi = np->vsi; + enum ice_status status; + int err = 0; + + ethtool_link_ksettings_zero_link_mode(ks, supported); + ethtool_link_ksettings_zero_link_mode(ks, advertising); + ethtool_link_ksettings_zero_link_mode(ks, lp_advertising); + hw_link_info = &vsi->port_info->phy.link_info; + + + /* set speed and duplex */ + if (hw_link_info->link_info & ICE_AQ_LINK_UP) + ice_get_settings_link_up(ks, netdev); + else + ice_get_settings_link_down(ks, netdev); + + /* set autoneg settings */ + ks->base.autoneg = 
(hw_link_info->an_info & ICE_AQ_AN_COMPLETED) ? + AUTONEG_ENABLE : AUTONEG_DISABLE; + + /* set media type settings */ + switch (vsi->port_info->phy.media_type) { + case ICE_MEDIA_FIBER: + ethtool_link_ksettings_add_link_mode(ks, supported, FIBRE); + ks->base.port = PORT_FIBRE; + break; + case ICE_MEDIA_BASET: + ethtool_link_ksettings_add_link_mode(ks, supported, TP); + ethtool_link_ksettings_add_link_mode(ks, advertising, TP); + ks->base.port = PORT_TP; + break; + case ICE_MEDIA_BACKPLANE: + ethtool_link_ksettings_add_link_mode(ks, supported, Backplane); + ethtool_link_ksettings_add_link_mode(ks, advertising, + Backplane); + ks->base.port = PORT_NONE; + break; + case ICE_MEDIA_DA: + ethtool_link_ksettings_add_link_mode(ks, supported, FIBRE); + ethtool_link_ksettings_add_link_mode(ks, advertising, FIBRE); + ks->base.port = PORT_DA; + break; + default: + ks->base.port = PORT_OTHER; + break; + } + + /* flow control is symmetric and always supported */ + ethtool_link_ksettings_add_link_mode(ks, supported, Pause); + + caps = kzalloc(sizeof(*caps), GFP_KERNEL); + if (!caps) + return -ENOMEM; + + status = ice_aq_get_phy_caps(vsi->port_info, false, + ICE_AQC_REPORT_ACTIVE_CFG, caps, NULL); + if (status) { + err = -EIO; + goto done; + } + + /* Set the advertised flow control based on the PHY capability */ + if ((caps->caps & ICE_AQC_PHY_EN_TX_LINK_PAUSE) && + (caps->caps & ICE_AQC_PHY_EN_RX_LINK_PAUSE)) { + ethtool_link_ksettings_add_link_mode(ks, advertising, Pause); + ethtool_link_ksettings_add_link_mode(ks, advertising, + Asym_Pause); + } else if (caps->caps & ICE_AQC_PHY_EN_TX_LINK_PAUSE) { + ethtool_link_ksettings_add_link_mode(ks, advertising, + Asym_Pause); + } else if (caps->caps & ICE_AQC_PHY_EN_RX_LINK_PAUSE) { + ethtool_link_ksettings_add_link_mode(ks, advertising, Pause); + ethtool_link_ksettings_add_link_mode(ks, advertising, + Asym_Pause); + } else { + ethtool_link_ksettings_del_link_mode(ks, advertising, Pause); + ethtool_link_ksettings_del_link_mode(ks, advertising, + Asym_Pause); + } + +#ifdef ETHTOOL_GFECPARAM + /* Set advertised FEC modes based on PHY capability */ + ethtool_link_ksettings_add_link_mode(ks, advertising, FEC_NONE); + + if (caps->link_fec_options & ICE_AQC_PHY_FEC_10G_KR_40G_KR4_REQ || + caps->link_fec_options & ICE_AQC_PHY_FEC_25G_KR_REQ) + ethtool_link_ksettings_add_link_mode(ks, advertising, + FEC_BASER); + if (caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_528_REQ || + caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_544_REQ) + ethtool_link_ksettings_add_link_mode(ks, advertising, FEC_RS); +#endif /* ETHTOOL_GFECPARAM */ + + status = ice_aq_get_phy_caps(vsi->port_info, false, + ICE_AQC_REPORT_TOPO_CAP_MEDIA, caps, NULL); + if (status) { + err = -EIO; + goto done; + } + +#ifdef ETHTOOL_GFECPARAM + /* Set supported FEC modes based on PHY capability */ + ethtool_link_ksettings_add_link_mode(ks, supported, FEC_NONE); + + if (caps->link_fec_options & ICE_AQC_PHY_FEC_10G_KR_40G_KR4_EN || + caps->link_fec_options & ICE_AQC_PHY_FEC_25G_KR_CLAUSE74_EN) + ethtool_link_ksettings_add_link_mode(ks, supported, FEC_BASER); + if (caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_CLAUSE91_EN) + ethtool_link_ksettings_add_link_mode(ks, supported, FEC_RS); +#endif /* ETHTOOL_GFECPARAM */ + + /* Set supported and advertised autoneg */ + if (ice_is_phy_caps_an_enabled(caps)) { ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 100baseT_Full); ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - 
ethtool_link_ksettings_add_link_mode(ks, advertising, - 100baseT_Full); - break; + } + +done: + kfree(caps); + return err; +} + +/** + * ice_ksettings_find_adv_link_speed - Find advertising link speed + * @ks: ethtool ksettings + */ +static u16 +ice_ksettings_find_adv_link_speed(const struct ethtool_link_ksettings *ks) +{ + u16 adv_link_speed = 0; + + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 100baseT_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_100MB; +#ifdef HAVE_ETHTOOL_NEW_1G_BITS + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 1000baseX_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_1000MB; +#endif /* HAVE_ETHTOOL_NEW_1G_BITS */ + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 1000baseT_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 1000baseKX_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_1000MB; +#ifdef HAVE_ETHTOOL_NEW_2500MB_BITS + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 2500baseT_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_2500MB; +#endif /* HAVE_ETHTOOL_NEW_2500MB_BITS */ + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 2500baseX_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_2500MB; +#ifdef HAVE_ETHTOOL_5G_BITS + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 5000baseT_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_5GB; +#endif /* HAVE_ETHTOOL_5G_BITS */ + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 10000baseT_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 10000baseKR_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_10GB; +#ifdef HAVE_ETHTOOL_NEW_10G_BITS + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 10000baseSR_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 10000baseLR_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_10GB; +#endif /* HAVE_ETHTOOL_NEW_10G_BITS */ +#ifdef HAVE_ETHTOOL_25G_BITS + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 25000baseCR_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 25000baseSR_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 25000baseKR_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_25GB; +#endif /* HAVE_ETHTOOL_25G_BITS */ + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 40000baseCR4_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 40000baseSR4_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 40000baseLR4_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 40000baseKR4_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_40GB; +#ifdef HAVE_ETHTOOL_50G_BITS + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 50000baseCR2_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 50000baseKR2_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_50GB; +#endif /* HAVE_ETHTOOL_50G_BITS */ +#ifdef HAVE_ETHTOOL_NEW_50G_BITS + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 50000baseSR2_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_50GB; +#endif /* HAVE_ETHTOOL_NEW_50G_BITS */ +#ifdef HAVE_ETHTOOL_100G_BITS + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 100000baseCR4_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 100000baseSR4_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 100000baseLR4_ER4_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 100000baseKR4_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_100GB; +#endif /* HAVE_ETHTOOL_100G_BITS */ + + return adv_link_speed; 
+} + +/** + * ice_setup_autoneg + * @p: port info + * @ks: ethtool_link_ksettings + * @config: configuration that will be sent down to FW + * @autoneg_enabled: autonegotiation is enabled or not + * @autoneg_changed: will there a change in autonegotiation + * @netdev: network interface device structure + * + * Setup PHY autonegotiation feature + */ +static int +ice_setup_autoneg(struct ice_port_info *p, struct ethtool_link_ksettings *ks, + struct ice_aqc_set_phy_cfg_data *config, + u8 autoneg_enabled, u8 *autoneg_changed, + struct net_device *netdev) +{ + int err = 0; + + *autoneg_changed = 0; + + /* Check autoneg */ + if (autoneg_enabled == AUTONEG_ENABLE) { + /* If autoneg was not already enabled */ + if (!(p->phy.link_info.an_info & ICE_AQ_AN_COMPLETED)) { + /* If autoneg is not supported, return error */ + if (!ethtool_link_ksettings_test_link_mode(ks, + supported, + Autoneg)) { + netdev_info(netdev, "Autoneg not supported on this phy.\n"); + err = -EINVAL; + } else { + /* Autoneg is allowed to change */ + config->caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; + *autoneg_changed = 1; + } + } + } else { + /* If autoneg is currently enabled */ + if (p->phy.link_info.an_info & ICE_AQ_AN_COMPLETED) { + /* If autoneg is supported 10GBASE_T is the only PHY + * that can disable it, so otherwise return error + */ + if (ethtool_link_ksettings_test_link_mode(ks, + supported, + Autoneg)) { + netdev_info(netdev, "Autoneg cannot be disabled on this phy\n"); + err = -EINVAL; + } else { + /* Autoneg is allowed to change */ + config->caps &= ~ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; + *autoneg_changed = 1; + } + } + } + + return err; +} + +/** + * ice_set_link_ksettings - Set Speed and Duplex + * @netdev: network interface device structure + * @ks: ethtool ksettings + * + * Set speed/duplex per media_types advertised/forced + */ +static int +ice_set_link_ksettings(struct net_device *netdev, + const struct ethtool_link_ksettings *ks) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + u8 autoneg, timeout = TEST_SET_BITS_TIMEOUT; + struct ethtool_link_ksettings copy_ks = *ks; + struct ethtool_link_ksettings safe_ks = {}; + struct ice_aqc_get_phy_caps_data *phy_caps; + struct ice_aqc_set_phy_cfg_data config; + u16 adv_link_speed, curr_link_speed; + struct ice_pf *pf = np->vsi->back; + struct ice_port_info *pi; + u8 autoneg_changed = 0; + enum ice_status status; + u64 phy_type_high = 0; + u64 phy_type_low = 0; + int err = 0; + bool linkup; + + pi = np->vsi->port_info; + + if (!pi) + return -EIO; + + if (pi->phy.media_type != ICE_MEDIA_BASET && + pi->phy.media_type != ICE_MEDIA_FIBER && + pi->phy.media_type != ICE_MEDIA_BACKPLANE && + pi->phy.media_type != ICE_MEDIA_DA && + pi->phy.link_info.link_info & ICE_AQ_LINK_UP) + return -EOPNOTSUPP; + + phy_caps = kzalloc(sizeof(*phy_caps), GFP_KERNEL); + if (!phy_caps) + return -ENOMEM; + + /* Get the PHY capabilities based on media */ + if (ice_fw_supports_report_dflt_cfg(pi->hw)) + status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_DFLT_CFG, + phy_caps, NULL); + else + status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP_MEDIA, + phy_caps, NULL); + if (status) { + err = -EIO; + goto done; + } + + /* save autoneg out of ksettings */ + autoneg = copy_ks.base.autoneg; + + /* Get link modes supported by hardware.*/ + ice_phy_type_to_ethtool(netdev, &safe_ks); + + /* and check against modes requested by user. + * Return an error if unsupported mode was set. 
+ */ + if (!bitmap_subset(copy_ks.link_modes.advertising, + safe_ks.link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS)) { + if (!test_bit(ICE_FLAG_LINK_LENIENT_MODE_ENA, pf->flags)) + netdev_info(netdev, "The selected speed is not supported by the current media. Please select a link speed that is supported by the current media.\n"); + err = -EOPNOTSUPP; + goto done; + } + + /* get our own copy of the bits to check against */ + memset(&safe_ks, 0, sizeof(safe_ks)); + safe_ks.base.cmd = copy_ks.base.cmd; + safe_ks.base.link_mode_masks_nwords = + copy_ks.base.link_mode_masks_nwords; + ice_get_link_ksettings(netdev, &safe_ks); + + /* set autoneg back to what it currently is */ + copy_ks.base.autoneg = safe_ks.base.autoneg; + /* we don't compare the speed */ + copy_ks.base.speed = safe_ks.base.speed; + + /* If copy_ks.base and safe_ks.base are not the same now, then they are + * trying to set something that we do not support. + */ + if (memcmp(&copy_ks.base, &safe_ks.base, sizeof(copy_ks.base))) { + err = -EOPNOTSUPP; + goto done; + } + + while (test_and_set_bit(ICE_CFG_BUSY, pf->state)) { + timeout--; + if (!timeout) { + err = -EBUSY; + goto done; + } + usleep_range(TEST_SET_BITS_SLEEP_MIN, TEST_SET_BITS_SLEEP_MAX); + } + + /* Copy the current user PHY configuration. The current user PHY + * configuration is initialized during probe from PHY capabilities + * software mode, and updated on set PHY configuration. + */ + config = pi->phy.curr_user_phy_cfg; + + config.caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; + + /* Check autoneg */ + err = ice_setup_autoneg(pi, &safe_ks, &config, autoneg, &autoneg_changed, + netdev); + + if (err) + goto done; + + /* Call to get the current link speed */ + pi->phy.get_link_info = true; + status = ice_get_link_status(pi, &linkup); + if (status) { + err = -EIO; + goto done; + } + + curr_link_speed = pi->phy.link_info.link_speed; + adv_link_speed = ice_ksettings_find_adv_link_speed(ks); + + /* If speed didn't get set, set it to what it currently is. + * This is needed because if advertise is 0 (as it is when autoneg + * is disabled) then speed won't get set. + */ + if (!adv_link_speed) + adv_link_speed = curr_link_speed; + + /* Convert the advertise link speeds to their corresponded PHY_TYPE */ + ice_update_phy_type(&phy_type_low, &phy_type_high, adv_link_speed); + + if (!autoneg_changed && adv_link_speed == curr_link_speed) { + netdev_info(netdev, "Nothing changed, exiting without setting anything.\n"); + goto done; + } + + /* save the requested speeds */ + pi->phy.link_info.req_speeds = adv_link_speed; + + /* set link and auto negotiation so changes take effect */ + config.caps |= ICE_AQ_PHY_ENA_LINK; + + /* check if there is a PHY type for the requested advertised speed */ + if (!(phy_type_low || phy_type_high)) { + netdev_info(netdev, "The selected speed is not supported by the current media. Please select a link speed that is supported by the current media.\n"); + err = -EOPNOTSUPP; + goto done; + } + + /* intersect requested advertised speed PHY types with media PHY types + * for set PHY configuration + */ + config.phy_type_high = cpu_to_le64(phy_type_high) & + phy_caps->phy_type_high; + config.phy_type_low = cpu_to_le64(phy_type_low) & + phy_caps->phy_type_low; + + if (!(config.phy_type_high || config.phy_type_low)) { + /* If there is no intersection and lenient mode is enabled, then + * intersect the requested advertised speed with NVM media type + * PHY types.
+ */ + if (test_bit(ICE_FLAG_LINK_LENIENT_MODE_ENA, pf->flags)) { + config.phy_type_high = cpu_to_le64(phy_type_high) & + pf->nvm_phy_type_hi; + config.phy_type_low = cpu_to_le64(phy_type_low) & + pf->nvm_phy_type_lo; + } else { + netdev_info(netdev, "The selected speed is not supported by the current media. Please select a link speed that is supported by the current media.\n"); + err = -EOPNOTSUPP; + goto done; + } + } + + /* If link is up put link down */ + if (pi->phy.link_info.link_info & ICE_AQ_LINK_UP) { + /* Tell the OS link is going down, the link will go + * back up when fw says it is ready asynchronously + */ + ice_print_link_msg(np->vsi, false); + netif_carrier_off(netdev); + netif_tx_stop_all_queues(netdev); + } + + /* make the aq call */ + status = ice_aq_set_phy_cfg(&pf->hw, pi, &config, NULL); + if (status) { + netdev_info(netdev, "Set phy config failed,\n"); + err = -EIO; + goto done; + } + + /* Save speed request */ + pi->phy.curr_user_speed_req = adv_link_speed; +done: + kfree(phy_caps); + clear_bit(ICE_CFG_BUSY, pf->state); + + return err; +} +#else /* ETHTOOL_GLINKSETTINGS */ + +/** + * ice_get_legacy_settings_link_up - Get the Link settings for when link is up + * @ecmd: ethtool command to fill in + * @netdev: network interface device structure + * + * Reports link settings that can be determined when link is up + */ +static void +ice_get_legacy_settings_link_up(struct ethtool_cmd *ecmd, + struct net_device *netdev) +{ + struct ethtool_link_ksettings ks, cap_ks; + struct ice_link_status *hw_link_info; + struct ice_netdev_priv *np; + struct ice_vsi *vsi; + u64 phy_types_low; + + np = netdev_priv(netdev); + vsi = np->vsi; + hw_link_info = &vsi->port_info->phy.link_info; + phy_types_low = hw_link_info->phy_type_low; + + /* Initialize supported and advertised settings based on PHY settings */ + switch (phy_types_low) { + case ICE_PHY_TYPE_LOW_100BASE_TX: case ICE_PHY_TYPE_LOW_100M_SGMII: - ethtool_link_ksettings_add_link_mode(ks, supported, - 100baseT_Full); + ecmd->supported = SUPPORTED_100baseT_Full; + if (phy_types_low == ICE_PHY_TYPE_LOW_100M_SGMII) + ecmd->advertising = ADVERTISED_100baseT_Full; break; case ICE_PHY_TYPE_LOW_1000BASE_T: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 1000baseT_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 1000baseT_Full); - break; - case ICE_PHY_TYPE_LOW_1G_SGMII: - ethtool_link_ksettings_add_link_mode(ks, supported, - 1000baseT_Full); - break; case ICE_PHY_TYPE_LOW_1000BASE_SX: case ICE_PHY_TYPE_LOW_1000BASE_LX: - ethtool_link_ksettings_add_link_mode(ks, supported, - 1000baseX_Full); + case ICE_PHY_TYPE_LOW_1G_SGMII: + ecmd->supported = SUPPORTED_1000baseT_Full; + if (phy_types_low == ICE_PHY_TYPE_LOW_1000BASE_T) + ecmd->advertising = ADVERTISED_1000baseT_Full; break; case ICE_PHY_TYPE_LOW_1000BASE_KX: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 1000baseKX_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 1000baseKX_Full); - break; - case ICE_PHY_TYPE_LOW_2500BASE_T: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 2500baseT_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 
2500baseT_Full); - break; - case ICE_PHY_TYPE_LOW_2500BASE_X: - ethtool_link_ksettings_add_link_mode(ks, supported, - 2500baseX_Full); - break; - case ICE_PHY_TYPE_LOW_2500BASE_KX: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 2500baseX_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 2500baseX_Full); - break; - case ICE_PHY_TYPE_LOW_5GBASE_T: - case ICE_PHY_TYPE_LOW_5GBASE_KR: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 5000baseT_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 5000baseT_Full); - break; - case ICE_PHY_TYPE_LOW_10GBASE_T: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseT_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 10000baseT_Full); - break; - case ICE_PHY_TYPE_LOW_10G_SFI_DA: - case ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC: - case ICE_PHY_TYPE_LOW_10G_SFI_C2C: - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseT_Full); - break; - case ICE_PHY_TYPE_LOW_10GBASE_SR: - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseSR_Full); - break; - case ICE_PHY_TYPE_LOW_10GBASE_LR: - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseLR_Full); - break; - case ICE_PHY_TYPE_LOW_10GBASE_KR_CR1: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseKR_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 10000baseKR_Full); - break; - case ICE_PHY_TYPE_LOW_25GBASE_T: - case ICE_PHY_TYPE_LOW_25GBASE_CR: - case ICE_PHY_TYPE_LOW_25GBASE_CR_S: - case ICE_PHY_TYPE_LOW_25GBASE_CR1: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 25000baseCR_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 25000baseCR_Full); - break; - case ICE_PHY_TYPE_LOW_25G_AUI_AOC_ACC: - case ICE_PHY_TYPE_LOW_25G_AUI_C2C: - ethtool_link_ksettings_add_link_mode(ks, supported, - 25000baseCR_Full); - break; - case ICE_PHY_TYPE_LOW_25GBASE_SR: - case ICE_PHY_TYPE_LOW_25GBASE_LR: - ethtool_link_ksettings_add_link_mode(ks, supported, - 25000baseSR_Full); - break; - case ICE_PHY_TYPE_LOW_25GBASE_KR: - case ICE_PHY_TYPE_LOW_25GBASE_KR1: - case ICE_PHY_TYPE_LOW_25GBASE_KR_S: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 25000baseKR_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 25000baseKR_Full); - break; - case ICE_PHY_TYPE_LOW_40GBASE_CR4: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseCR4_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 40000baseCR4_Full); - break; - case ICE_PHY_TYPE_LOW_40G_XLAUI_AOC_ACC: - case ICE_PHY_TYPE_LOW_40G_XLAUI: - ethtool_link_ksettings_add_link_mode(ks, supported, - 
40000baseCR4_Full); - break; - case ICE_PHY_TYPE_LOW_40GBASE_SR4: - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseSR4_Full); - break; - case ICE_PHY_TYPE_LOW_40GBASE_LR4: - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseLR4_Full); - break; - case ICE_PHY_TYPE_LOW_40GBASE_KR4: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseKR4_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 40000baseKR4_Full); - break; - case ICE_PHY_TYPE_LOW_50GBASE_CR2: - case ICE_PHY_TYPE_LOW_50GBASE_CP: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 50000baseCR2_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 50000baseCR2_Full); - break; - case ICE_PHY_TYPE_LOW_50G_LAUI2_AOC_ACC: - case ICE_PHY_TYPE_LOW_50G_LAUI2: - case ICE_PHY_TYPE_LOW_50G_AUI2_AOC_ACC: - case ICE_PHY_TYPE_LOW_50G_AUI2: - case ICE_PHY_TYPE_LOW_50GBASE_SR: - case ICE_PHY_TYPE_LOW_50G_AUI1_AOC_ACC: - case ICE_PHY_TYPE_LOW_50G_AUI1: - ethtool_link_ksettings_add_link_mode(ks, supported, - 50000baseCR2_Full); - break; - case ICE_PHY_TYPE_LOW_50GBASE_KR2: - case ICE_PHY_TYPE_LOW_50GBASE_KR_PAM4: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 50000baseKR2_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 50000baseKR2_Full); - break; - case ICE_PHY_TYPE_LOW_50GBASE_SR2: - case ICE_PHY_TYPE_LOW_50GBASE_LR2: - case ICE_PHY_TYPE_LOW_50GBASE_FR: - case ICE_PHY_TYPE_LOW_50GBASE_LR: - ethtool_link_ksettings_add_link_mode(ks, supported, - 50000baseSR2_Full); - break; - case ICE_PHY_TYPE_LOW_100GBASE_CR4: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseCR4_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100000baseCR4_Full); - break; - case ICE_PHY_TYPE_LOW_100G_CAUI4_AOC_ACC: - case ICE_PHY_TYPE_LOW_100G_CAUI4: - case ICE_PHY_TYPE_LOW_100G_AUI4_AOC_ACC: - case ICE_PHY_TYPE_LOW_100G_AUI4: - case ICE_PHY_TYPE_LOW_100GBASE_CR_PAM4: - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseCR4_Full); - break; - case ICE_PHY_TYPE_LOW_100GBASE_CP2: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseCR4_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100000baseCR4_Full); + ecmd->supported = SUPPORTED_1000baseKX_Full; + ecmd->advertising = ADVERTISED_1000baseKX_Full; break; - case ICE_PHY_TYPE_LOW_100GBASE_SR4: - case ICE_PHY_TYPE_LOW_100GBASE_SR2: - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseSR4_Full); + case ICE_PHY_TYPE_LOW_2500BASE_T: + case ICE_PHY_TYPE_LOW_2500BASE_X: + case ICE_PHY_TYPE_LOW_2500BASE_KX: + ecmd->supported = SUPPORTED_2500baseX_Full; + if (phy_types_low == ICE_PHY_TYPE_LOW_2500BASE_T || + phy_types_low == ICE_PHY_TYPE_LOW_2500BASE_KX) + ecmd->advertising = ADVERTISED_2500baseX_Full; break; - case ICE_PHY_TYPE_LOW_100GBASE_LR4: - case ICE_PHY_TYPE_LOW_100GBASE_DR: - ethtool_link_ksettings_add_link_mode(ks, 
supported, - 100000baseLR4_ER4_Full); + case ICE_PHY_TYPE_LOW_10GBASE_T: + case ICE_PHY_TYPE_LOW_10G_SFI_DA: + case ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC: + case ICE_PHY_TYPE_LOW_10GBASE_SR: + case ICE_PHY_TYPE_LOW_10GBASE_LR: + ecmd->supported = SUPPORTED_10000baseT_Full; + if (phy_types_low == ICE_PHY_TYPE_LOW_10GBASE_T) + ecmd->advertising = ADVERTISED_10000baseT_Full; break; - case ICE_PHY_TYPE_LOW_100GBASE_KR4: - case ICE_PHY_TYPE_LOW_100GBASE_KR_PAM4: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseKR4_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100000baseKR4_Full); + case ICE_PHY_TYPE_LOW_10GBASE_KR_CR1: + case ICE_PHY_TYPE_LOW_10G_SFI_C2C: + ecmd->supported = SUPPORTED_10000baseKR_Full; + ecmd->advertising = ADVERTISED_10000baseKR_Full; break; - default: - unrecog_phy_low = true; - } - - switch (link_info->phy_type_high) { - case ICE_PHY_TYPE_HIGH_100GBASE_KR2_PAM4: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseKR4_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100000baseKR4_Full); + case ICE_PHY_TYPE_LOW_40GBASE_CR4: + case ICE_PHY_TYPE_LOW_40G_XLAUI_AOC_ACC: + case ICE_PHY_TYPE_LOW_40G_XLAUI: + ecmd->supported = SUPPORTED_40000baseCR4_Full; + if (phy_types_low == ICE_PHY_TYPE_LOW_40GBASE_CR4) + ecmd->advertising = ADVERTISED_40000baseCR4_Full; break; - case ICE_PHY_TYPE_HIGH_100G_CAUI2_AOC_ACC: - case ICE_PHY_TYPE_HIGH_100G_CAUI2: - case ICE_PHY_TYPE_HIGH_100G_AUI2_AOC_ACC: - case ICE_PHY_TYPE_HIGH_100G_AUI2: - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseCR4_Full); + case ICE_PHY_TYPE_LOW_40GBASE_SR4: + ecmd->supported = SUPPORTED_40000baseSR4_Full; + break; + case ICE_PHY_TYPE_LOW_40GBASE_LR4: + ecmd->supported = SUPPORTED_40000baseLR4_Full; + break; + case ICE_PHY_TYPE_LOW_40GBASE_KR4: + ecmd->supported = SUPPORTED_40000baseKR4_Full; + ecmd->advertising = ADVERTISED_40000baseKR4_Full; break; default: - unrecog_phy_high = true; - } - - if (unrecog_phy_low && unrecog_phy_high) { /* if we got here and link is up something bad is afoot */ - netdev_info(netdev, - "WARNING: Unrecognized PHY_Low (0x%llx).\n", - (u64)link_info->phy_type_low); - netdev_info(netdev, - "WARNING: Unrecognized PHY_High (0x%llx).\n", - (u64)link_info->phy_type_high); + netdev_info(netdev, "WARNING: Link up but PhyType isn't recognized.\n"); + netdev_info(netdev, "WARNING: Unrecognized PHY_Low (0x%llx).\n", + (u64)phy_types_low); } /* Now that we've worked out everything that could be supported by the - * current PHY type, get what is supported by the NVM and intersect - * them to get what is truly supported + * current PHY type, get what is supported by the NVM and and them to + * get what is truly supported + * + * ice_phy_type_to_ethtool uses the new API ethtool_link_ksettings + * struct, so we need to convert ecmd to a ksettings to intersect them, + * then convert back to legacy ethtool_cmd. 
*/ - memset(&cap_ksettings, 0, sizeof(cap_ksettings)); - ice_phy_type_to_ethtool(netdev, &cap_ksettings); - ethtool_intersect_link_masks(ks, &cap_ksettings); - - switch (link_info->link_speed) { - case ICE_AQ_LINK_SPEED_100GB: - ks->base.speed = SPEED_100000; - break; - case ICE_AQ_LINK_SPEED_50GB: - ks->base.speed = SPEED_50000; - break; + ks.link_modes.supported[0] = ecmd->supported; + ks.link_modes.advertising[0] = ecmd->advertising; + ice_phy_type_to_ethtool(netdev, &cap_ks); + ethtool_intersect_link_masks(&ks, &cap_ks); + ecmd->supported = (u32)ks.link_modes.supported[0]; + ecmd->advertising = (u32)ks.link_modes.advertising[0]; + + /* Set speed and duplex */ + switch (hw_link_info->link_speed) { case ICE_AQ_LINK_SPEED_40GB: - ks->base.speed = SPEED_40000; - break; - case ICE_AQ_LINK_SPEED_25GB: - ks->base.speed = SPEED_25000; - break; - case ICE_AQ_LINK_SPEED_20GB: - ks->base.speed = SPEED_20000; + ethtool_cmd_speed_set(ecmd, SPEED_40000); break; case ICE_AQ_LINK_SPEED_10GB: - ks->base.speed = SPEED_10000; - break; - case ICE_AQ_LINK_SPEED_5GB: - ks->base.speed = SPEED_5000; + ethtool_cmd_speed_set(ecmd, SPEED_10000); break; case ICE_AQ_LINK_SPEED_2500MB: - ks->base.speed = SPEED_2500; + ethtool_cmd_speed_set(ecmd, SPEED_2500); break; case ICE_AQ_LINK_SPEED_1000MB: - ks->base.speed = SPEED_1000; + ethtool_cmd_speed_set(ecmd, SPEED_1000); break; case ICE_AQ_LINK_SPEED_100MB: - ks->base.speed = SPEED_100; - break; - default: - netdev_info(netdev, - "WARNING: Unrecognized link_speed (0x%x).\n", - link_info->link_speed); - break; - } - ks->base.duplex = DUPLEX_FULL; - - if (link_info->an_info & ICE_AQ_AN_COMPLETED) - ethtool_link_ksettings_add_link_mode(ks, lp_advertising, - Autoneg); - - /* Set flow control negotiated Rx/Tx pause */ - switch (pi->fc.current_mode) { - case ICE_FC_FULL: - ethtool_link_ksettings_add_link_mode(ks, lp_advertising, Pause); - break; - case ICE_FC_TX_PAUSE: - ethtool_link_ksettings_add_link_mode(ks, lp_advertising, Pause); - ethtool_link_ksettings_add_link_mode(ks, lp_advertising, - Asym_Pause); - break; - case ICE_FC_RX_PAUSE: - ethtool_link_ksettings_add_link_mode(ks, lp_advertising, - Asym_Pause); + ethtool_cmd_speed_set(ecmd, SPEED_100); break; - case ICE_FC_PFC: - /* fall through */ default: - ethtool_link_ksettings_del_link_mode(ks, lp_advertising, Pause); - ethtool_link_ksettings_del_link_mode(ks, lp_advertising, - Asym_Pause); + netdev_info(netdev, "WARNING: Unrecognized link_speed (0x%x).\n", + hw_link_info->link_speed); break; } + ecmd->duplex = DUPLEX_FULL; } /** - * ice_get_settings_link_down - Get the Link settings when link is down - * @ks: ethtool ksettings to fill in + * ice_get_legacy_settings_link_down - Get the Link settings when link is down + * @ecmd: ethtool command to fill in * @netdev: network interface device structure * * Reports link settings that can be determined when link is down */ static void -ice_get_settings_link_down(struct ethtool_link_ksettings *ks, - struct net_device *netdev) +ice_get_legacy_settings_link_down(struct ethtool_cmd *ecmd, + struct net_device *netdev) { + struct ethtool_link_ksettings ks; + /* link is down and the driver needs to fall back on * supported PHY types to figure out what info to display */ - ice_phy_type_to_ethtool(netdev, ks); + ice_phy_type_to_ethtool(netdev, &ks); + ecmd->supported = (u32)ks.link_modes.supported[0]; + ecmd->advertising = (u32)ks.link_modes.advertising[0]; - /* With no link, speed and duplex are unknown */ - ks->base.speed = SPEED_UNKNOWN; - ks->base.duplex = DUPLEX_UNKNOWN; + 
ethtool_cmd_speed_set(ecmd, SPEED_UNKNOWN); + ecmd->duplex = DUPLEX_UNKNOWN; } /** - * ice_get_link_ksettings - Get Link Speed and Duplex settings + * ice_get_settings - Get Link Speed and Duplex settings * @netdev: network interface device structure - * @ks: ethtool ksettings + * @ecmd: ethtool command * * Reports speed/duplex settings based on media_type */ -static int -ice_get_link_ksettings(struct net_device *netdev, - struct ethtool_link_ksettings *ks) +static int ice_get_settings(struct net_device *netdev, struct ethtool_cmd *ecmd) { struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_aqc_get_phy_caps_data *caps; struct ice_link_status *hw_link_info; struct ice_vsi *vsi = np->vsi; enum ice_status status; - int err = 0; + bool link_up; - ethtool_link_ksettings_zero_link_mode(ks, supported); - ethtool_link_ksettings_zero_link_mode(ks, advertising); - ethtool_link_ksettings_zero_link_mode(ks, lp_advertising); hw_link_info = &vsi->port_info->phy.link_info; + link_up = hw_link_info->link_info & ICE_AQ_LINK_UP; + /* set speed and duplex */ - if (hw_link_info->link_info & ICE_AQ_LINK_UP) - ice_get_settings_link_up(ks, netdev); + if (link_up) + ice_get_legacy_settings_link_up(ecmd, netdev); else - ice_get_settings_link_down(ks, netdev); + ice_get_legacy_settings_link_down(ecmd, netdev); /* set autoneg settings */ - ks->base.autoneg = (hw_link_info->an_info & ICE_AQ_AN_COMPLETED) ? + ecmd->autoneg = (hw_link_info->an_info & ICE_AQ_AN_COMPLETED) ? AUTONEG_ENABLE : AUTONEG_DISABLE; - /* set media type settings */ + /* Set media type settings */ switch (vsi->port_info->phy.media_type) { case ICE_MEDIA_FIBER: - ethtool_link_ksettings_add_link_mode(ks, supported, FIBRE); - ks->base.port = PORT_FIBRE; + ecmd->supported |= SUPPORTED_FIBRE; + ecmd->port = PORT_FIBRE; break; case ICE_MEDIA_BASET: - ethtool_link_ksettings_add_link_mode(ks, supported, TP); - ethtool_link_ksettings_add_link_mode(ks, advertising, TP); - ks->base.port = PORT_TP; + ecmd->supported |= SUPPORTED_TP; + ecmd->advertising |= ADVERTISED_TP; + ecmd->port = PORT_TP; break; case ICE_MEDIA_BACKPLANE: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, Backplane); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - Backplane); - ks->base.port = PORT_NONE; + ecmd->supported |= SUPPORTED_Backplane; + ecmd->advertising |= ADVERTISED_Backplane; + ecmd->port = PORT_NONE; break; case ICE_MEDIA_DA: - ethtool_link_ksettings_add_link_mode(ks, supported, FIBRE); - ethtool_link_ksettings_add_link_mode(ks, advertising, FIBRE); - ks->base.port = PORT_DA; + ecmd->supported |= SUPPORTED_FIBRE; + ecmd->advertising |= ADVERTISED_FIBRE; + ecmd->port = PORT_DA; break; default: - ks->base.port = PORT_OTHER; + ecmd->port = PORT_OTHER; break; } - /* flow control is symmetric and always supported */ - ethtool_link_ksettings_add_link_mode(ks, supported, Pause); - - caps = devm_kzalloc(&vsi->back->pdev->dev, sizeof(*caps), GFP_KERNEL); + caps = kzalloc(sizeof(*caps), GFP_KERNEL); if (!caps) return -ENOMEM; status = ice_aq_get_phy_caps(vsi->port_info, false, - ICE_AQC_REPORT_SW_CFG, caps, NULL); + ICE_AQC_REPORT_TOPO_CAP_MEDIA, caps, NULL); if (status) { - err = -EIO; - goto done; + dev_dbg(ice_pf_to_dev(vsi->back), "get PHY caps failed, status %s\n", + ice_stat_str(status)); + kfree(caps); + return -EIO; } - /* Set the advertised flow control based on the PHY capability */ - if ((caps->caps & 
ICE_AQC_PHY_EN_TX_LINK_PAUSE) && - (caps->caps & ICE_AQC_PHY_EN_RX_LINK_PAUSE)) { - ethtool_link_ksettings_add_link_mode(ks, advertising, Pause); - ethtool_link_ksettings_add_link_mode(ks, advertising, - Asym_Pause); - } else if (caps->caps & ICE_AQC_PHY_EN_TX_LINK_PAUSE) { - ethtool_link_ksettings_add_link_mode(ks, advertising, - Asym_Pause); - } else if (caps->caps & ICE_AQC_PHY_EN_RX_LINK_PAUSE) { - ethtool_link_ksettings_add_link_mode(ks, advertising, Pause); - ethtool_link_ksettings_add_link_mode(ks, advertising, - Asym_Pause); - } else { - ethtool_link_ksettings_del_link_mode(ks, advertising, Pause); - ethtool_link_ksettings_del_link_mode(ks, advertising, - Asym_Pause); + /* Set supported and advertised autoneg */ + if (ice_is_phy_caps_an_enabled(caps)) { + ecmd->supported |= SUPPORTED_Autoneg; + ecmd->advertising |= ADVERTISED_Autoneg; } - /* Set advertised FEC modes based on PHY capability */ - ethtool_link_ksettings_add_link_mode(ks, advertising, FEC_NONE); + ecmd->transceiver = XCVR_EXTERNAL; - if (caps->link_fec_options & ICE_AQC_PHY_FEC_10G_KR_40G_KR4_REQ || - caps->link_fec_options & ICE_AQC_PHY_FEC_25G_KR_REQ) - ethtool_link_ksettings_add_link_mode(ks, advertising, - FEC_BASER); - if (caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_528_REQ || - caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_544_REQ) - ethtool_link_ksettings_add_link_mode(ks, advertising, FEC_RS); + /* flow control is symmetric and always supported */ + ecmd->supported |= SUPPORTED_Pause; - status = ice_aq_get_phy_caps(vsi->port_info, false, - ICE_AQC_REPORT_TOPO_CAP, caps, NULL); - if (status) { - err = -EIO; - goto done; + switch (vsi->port_info->fc.req_mode) { + case ICE_FC_RX_PAUSE: + ecmd->advertising |= ADVERTISED_Pause | ADVERTISED_Asym_Pause; + break; + case ICE_FC_TX_PAUSE: + ecmd->advertising |= ADVERTISED_Asym_Pause; + break; + case ICE_FC_FULL: + ecmd->advertising |= ADVERTISED_Pause; + break; + case ICE_FC_PFC: + default: + ecmd->advertising &= ~(ADVERTISED_Pause | + ADVERTISED_Asym_Pause); + break; } - /* Set supported FEC modes based on PHY capability */ - ethtool_link_ksettings_add_link_mode(ks, supported, FEC_NONE); - - if (caps->link_fec_options & ICE_AQC_PHY_FEC_10G_KR_40G_KR4_EN || - caps->link_fec_options & ICE_AQC_PHY_FEC_25G_KR_CLAUSE74_EN) - ethtool_link_ksettings_add_link_mode(ks, supported, FEC_BASER); - if (caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_CLAUSE91_EN) - ethtool_link_ksettings_add_link_mode(ks, supported, FEC_RS); - -done: - devm_kfree(&vsi->back->pdev->dev, caps); - return err; + kfree(caps); + return 0; } /** - * ice_ksettings_find_adv_link_speed - Find advertising link speed - * @ks: ethtool ksettings + * ice_legacy_find_adv_link_speed + * @advertise_phy: advertisement PHY value + * + * Find advertising link speed */ static u16 -ice_ksettings_find_adv_link_speed(const struct ethtool_link_ksettings *ks) +ice_legacy_find_adv_link_speed(u32 advertise_phy) { u16 adv_link_speed = 0; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 100baseT_Full)) + if (advertise_phy & ADVERTISED_100baseT_Full) adv_link_speed |= ICE_AQ_LINK_SPEED_100MB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 1000baseX_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_1000MB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 1000baseT_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 1000baseKX_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_1000MB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 2500baseT_Full)) - adv_link_speed 
|= ICE_AQ_LINK_SPEED_2500MB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 2500baseX_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_2500MB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 5000baseT_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_5GB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 10000baseT_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 10000baseKR_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_10GB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 10000baseSR_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 10000baseLR_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_10GB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 25000baseCR_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 25000baseSR_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 25000baseKR_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_25GB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 40000baseCR4_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 40000baseSR4_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 40000baseLR4_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 40000baseKR4_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_40GB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 50000baseCR2_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 50000baseKR2_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_50GB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 50000baseSR2_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_50GB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 100000baseCR4_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 100000baseSR4_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 100000baseLR4_ER4_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 100000baseKR4_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_100GB; + if (advertise_phy & ADVERTISED_1000baseT_Full || + advertise_phy & ADVERTISED_1000baseKX_Full) + adv_link_speed |= ICE_AQ_LINK_SPEED_1000MB; + if (advertise_phy & ADVERTISED_2500baseX_Full) + adv_link_speed |= ICE_AQ_LINK_SPEED_2500MB; + if (advertise_phy & ADVERTISED_10000baseT_Full || + advertise_phy & ADVERTISED_10000baseKR_Full) + adv_link_speed |= ICE_AQ_LINK_SPEED_10GB; + if (advertise_phy & ADVERTISED_40000baseKR4_Full || + advertise_phy & ADVERTISED_40000baseCR4_Full || + advertise_phy & ADVERTISED_40000baseSR4_Full || + advertise_phy & ADVERTISED_40000baseLR4_Full) + adv_link_speed |= ICE_AQ_LINK_SPEED_40GB; return adv_link_speed; } @@ -2281,7 +3872,7 @@ ice_ksettings_find_adv_link_speed(const struct ethtool_link_ksettings *ks) /** * ice_setup_autoneg * @p: port info - * @ks: ethtool_link_ksettings + * @ecmd: ethtool command * @config: configuration that will be sent down to FW * @autoneg_enabled: autonegotiation is enabled or not * @autoneg_changed: will there a change in autonegotiation @@ -2290,7 +3881,7 @@ ice_ksettings_find_adv_link_speed(const struct ethtool_link_ksettings *ks) * Setup PHY autonegotiation feature */ static int -ice_setup_autoneg(struct ice_port_info *p, struct ethtool_link_ksettings *ks, +ice_setup_autoneg(struct ice_port_info *p, struct ethtool_cmd *ecmd, struct ice_aqc_set_phy_cfg_data *config, u8 autoneg_enabled, u8 *autoneg_changed, struct net_device *netdev) @@ -2299,21 +3890,18 @@ ice_setup_autoneg(struct ice_port_info *p, struct 
ethtool_link_ksettings *ks, *autoneg_changed = 0; - /* Check autoneg */ if (autoneg_enabled == AUTONEG_ENABLE) { /* If autoneg was not already enabled */ if (!(p->phy.link_info.an_info & ICE_AQ_AN_COMPLETED)) { /* If autoneg is not supported, return error */ - if (!ethtool_link_ksettings_test_link_mode(ks, - supported, - Autoneg)) { + if (!(ecmd->supported & SUPPORTED_Autoneg)) { netdev_info(netdev, "Autoneg not supported on this phy.\n"); - err = -EINVAL; - } else { - /* Autoneg is allowed to change */ - config->caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; - *autoneg_changed = 1; + return -EINVAL; } + + /* Autoneg is allowed to change */ + config->caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; + *autoneg_changed = 1; } } else { /* If autoneg is currently enabled */ @@ -2321,16 +3909,14 @@ ice_setup_autoneg(struct ice_port_info *p, struct ethtool_link_ksettings *ks, /* If autoneg is supported 10GBASE_T is the only PHY * that can disable it, so otherwise return error */ - if (ethtool_link_ksettings_test_link_mode(ks, - supported, - Autoneg)) { - netdev_info(netdev, "Autoneg cannot be disabled on this phy\n"); - err = -EINVAL; - } else { - /* Autoneg is allowed to change */ - config->caps &= ~ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; - *autoneg_changed = 1; + if (ecmd->supported & SUPPORTED_Autoneg) { + netdev_info(netdev, "Autoneg cannot be disabled.\n"); + return -EINVAL; } + + /* Autoneg is allowed to change */ + config->caps &= ~ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; + *autoneg_changed = 1; } } @@ -2338,28 +3924,27 @@ ice_setup_autoneg(struct ice_port_info *p, struct ethtool_link_ksettings *ks, } /** - * ice_set_link_ksettings - Set Speed and Duplex + * ice_set_settings - Set Speed and Duplex * @netdev: network interface device structure - * @ks: ethtool ksettings + * @ecmd: ethtool command * * Set speed/duplex per media_types advertised/forced */ -static int -ice_set_link_ksettings(struct net_device *netdev, - const struct ethtool_link_ksettings *ks) +static int ice_set_settings(struct net_device *netdev, struct ethtool_cmd *ecmd) { - u8 autoneg, timeout = TEST_SET_BITS_TIMEOUT, lport = 0; struct ice_netdev_priv *np = netdev_priv(netdev); - struct ethtool_link_ksettings safe_ks, copy_ks; struct ice_aqc_get_phy_caps_data *abilities; + u8 autoneg, timeout = TEST_SET_BITS_TIMEOUT; u16 adv_link_speed, curr_link_speed, idx; struct ice_aqc_set_phy_cfg_data config; struct ice_pf *pf = np->vsi->back; + struct ethtool_cmd safe_ecmd; struct ice_port_info *p; u8 autoneg_changed = 0; enum ice_status status; - u64 phy_type_high; - u64 phy_type_low; + u64 phy_type_high = 0; + u64 phy_type_low = 0; + u32 advertise; int err = 0; bool linkup; @@ -2383,76 +3968,64 @@ ice_set_link_ksettings(struct net_device *netdev, p->phy.link_info.link_info & ICE_AQ_LINK_UP) return -EOPNOTSUPP; - /* copy the ksettings to copy_ks to avoid modifying the original */ - memcpy(©_ks, ks, sizeof(copy_ks)); - - /* save autoneg out of ksettings */ - autoneg = copy_ks.base.autoneg; - - memset(&safe_ks, 0, sizeof(safe_ks)); - - /* Get link modes supported by hardware.*/ - ice_phy_type_to_ethtool(netdev, &safe_ks); - - /* and check against modes requested by user. - * Return an error if unsupported mode was set. 
- */ - if (!bitmap_subset(copy_ks.link_modes.advertising, - safe_ks.link_modes.supported, - __ETHTOOL_LINK_MODE_MASK_NBITS)) - return -EINVAL; - /* get our own copy of the bits to check against */ - memset(&safe_ks, 0, sizeof(safe_ks)); - safe_ks.base.cmd = copy_ks.base.cmd; - safe_ks.base.link_mode_masks_nwords = - copy_ks.base.link_mode_masks_nwords; - ice_get_link_ksettings(netdev, &safe_ks); + memset(&safe_ecmd, 0, sizeof(safe_ecmd)); + ice_get_settings(netdev, &safe_ecmd); - /* set autoneg back to what it currently is */ - copy_ks.base.autoneg = safe_ks.base.autoneg; - /* we don't compare the speed */ - copy_ks.base.speed = safe_ks.base.speed; + /* save autoneg and speed out of ecmd */ + autoneg = ecmd->autoneg; + advertise = ecmd->advertising; - /* If copy_ks.base and safe_ks.base are not the same now, then they are - * trying to set something that we do not support. + /* set autoneg and speed back to what they currently are */ + ecmd->autoneg = safe_ecmd.autoneg; + ecmd->speed = safe_ecmd.speed; + ecmd->advertising = safe_ecmd.advertising; + ecmd->cmd = safe_ecmd.cmd; + + /* If ecmd and safe_ecmd are not the same now, then they are + * trying to set something that we do not support */ - if (memcmp(&copy_ks.base, &safe_ks.base, sizeof(copy_ks.base))) + if (memcmp(ecmd, &safe_ecmd, sizeof(*ecmd))) return -EOPNOTSUPP; - while (test_and_set_bit(__ICE_CFG_BUSY, pf->state)) { + while (test_and_set_bit(ICE_CFG_BUSY, pf->state)) { timeout--; if (!timeout) return -EBUSY; usleep_range(TEST_SET_BITS_SLEEP_MIN, TEST_SET_BITS_SLEEP_MAX); } - abilities = devm_kzalloc(&pf->pdev->dev, sizeof(*abilities), - GFP_KERNEL); + abilities = kzalloc(sizeof(*abilities), GFP_KERNEL); if (!abilities) return -ENOMEM; /* Get the current PHY config */ - status = ice_aq_get_phy_caps(p, false, ICE_AQC_REPORT_SW_CFG, abilities, - NULL); + status = ice_aq_get_phy_caps(p, false, ICE_AQC_REPORT_TOPO_CAP_MEDIA, + abilities, NULL); if (status) { err = -EAGAIN; goto done; } - /* Copy abilities to config in case autoneg is not set below */ - memset(&config, 0, sizeof(config)); - config.caps = abilities->caps & ~ICE_AQC_PHY_AN_MODE; - if (abilities->caps & ICE_AQC_PHY_AN_MODE) - config.caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; + /* Copy the current user PHY configuration. The current user PHY + * configuration is initialized during probe from PHY capabilities + * software mode, and updated on set PHY configuration. + */ + memcpy(&config, &p->phy.curr_user_phy_cfg, sizeof(config)); - /* Check autoneg */ - err = ice_setup_autoneg(p, &safe_ks, &config, autoneg, &autoneg_changed, - netdev); + config.caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; + /* Check autoneg */ + err = ice_setup_autoneg(p, &safe_ecmd, &config, autoneg, + &autoneg_changed, netdev); if (err) goto done; + if (advertise & ~safe_ecmd.supported) { + err = -EINVAL; + goto done; + } + /* Call to get the current link speed */ p->phy.get_link_info = true; status = ice_get_link_status(p, &linkup); @@ -2462,7 +4035,7 @@ ice_set_link_ksettings(struct net_device *netdev, } curr_link_speed = p->phy.link_info.link_speed; - adv_link_speed = ice_ksettings_find_adv_link_speed(ks); + adv_link_speed = ice_legacy_find_adv_link_speed(advertise); /* If speed didn't get set, set it to what it currently is.
* This is needed because if advertise is 0 (as it is when autoneg @@ -2479,12 +4052,6 @@ ice_set_link_ksettings(struct net_device *netdev, goto done; } - /* copy over the rest of the abilities */ - config.low_power_ctrl = abilities->low_power_ctrl; - config.eee_cap = abilities->eee_cap; - config.eeer_value = abilities->eeer_value; - config.link_fec_opt = abilities->link_fec_options; - /* save the requested speeds */ p->phy.link_info.req_speeds = adv_link_speed; @@ -2497,34 +4064,283 @@ ice_set_link_ksettings(struct net_device *netdev, config.phy_type_low = cpu_to_le64(phy_type_low) & abilities->phy_type_low; } else { - err = -EAGAIN; netdev_info(netdev, "Nothing changed. No PHY_TYPE is corresponded to advertised link speed.\n"); goto done; } - /* If link is up put link down */ + /* If link is up, put link down */ if (p->phy.link_info.link_info & ICE_AQ_LINK_UP) { /* Tell the OS link is going down, the link will go - * back up when fw says it is ready asynchronously + * back up when FW says it is ready asynchronously */ ice_print_link_msg(np->vsi, false); netif_carrier_off(netdev); netif_tx_stop_all_queues(netdev); } - /* make the aq call */ - status = ice_aq_set_phy_cfg(&pf->hw, lport, &config, NULL); + /* make the AQ call */ + status = ice_aq_set_phy_cfg(&pf->hw, p, &config, NULL); if (status) { netdev_info(netdev, "Set phy config failed,\n"); err = -EAGAIN; + goto done; } + /* Save speed request */ + p->phy.curr_user_speed_req = adv_link_speed; done: - devm_kfree(&pf->pdev->dev, abilities); - clear_bit(__ICE_CFG_BUSY, pf->state); + kfree(abilities); + clear_bit(ICE_CFG_BUSY, pf->state); return err; } +#endif /* ETHTOOL_GLINKSETTINGS */ + +/** + * ice_parse_hdrs - parses headers from RSS hash input + * @nfc: ethtool rxnfc command + * + * This function parses the rxnfc command and returns intended + * header types for RSS configuration + */ +static u32 ice_parse_hdrs(struct ethtool_rxnfc *nfc) +{ + u32 hdrs = ICE_FLOW_SEG_HDR_NONE; + + switch (nfc->flow_type) { + case TCP_V4_FLOW: + hdrs |= ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_IPV4; + break; + case UDP_V4_FLOW: + hdrs |= ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_IPV4; + break; + case SCTP_V4_FLOW: + hdrs |= ICE_FLOW_SEG_HDR_SCTP | ICE_FLOW_SEG_HDR_IPV4; + break; + case TCP_V6_FLOW: + hdrs |= ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_IPV6; + break; + case UDP_V6_FLOW: + hdrs |= ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_IPV6; + break; + case SCTP_V6_FLOW: + hdrs |= ICE_FLOW_SEG_HDR_SCTP | ICE_FLOW_SEG_HDR_IPV6; + break; + default: + break; + } + return hdrs; +} + +#define ICE_FLOW_HASH_FLD_IPV4_SA BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) +#define ICE_FLOW_HASH_FLD_IPV6_SA BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA) +#define ICE_FLOW_HASH_FLD_IPV4_DA BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) +#define ICE_FLOW_HASH_FLD_IPV6_DA BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_DA) +#define ICE_FLOW_HASH_FLD_TCP_SRC_PORT BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_SRC_PORT) +#define ICE_FLOW_HASH_FLD_TCP_DST_PORT BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_DST_PORT) +#define ICE_FLOW_HASH_FLD_UDP_SRC_PORT BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_SRC_PORT) +#define ICE_FLOW_HASH_FLD_UDP_DST_PORT BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_DST_PORT) +#define ICE_FLOW_HASH_FLD_SCTP_SRC_PORT \ + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT) +#define ICE_FLOW_HASH_FLD_SCTP_DST_PORT \ + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_DST_PORT) + +/** + * ice_parse_hash_flds - parses hash fields from RSS hash input + * @nfc: ethtool rxnfc command + * + * This function parses the rxnfc command and returns intended + * hash fields for RSS configuration + */ 
+static u64 ice_parse_hash_flds(struct ethtool_rxnfc *nfc) +{ + u64 hfld = ICE_HASH_INVALID; + + if (nfc->data & RXH_IP_SRC || nfc->data & RXH_IP_DST) { + switch (nfc->flow_type) { + case TCP_V4_FLOW: + case UDP_V4_FLOW: + case SCTP_V4_FLOW: + if (nfc->data & RXH_IP_SRC) + hfld |= ICE_FLOW_HASH_FLD_IPV4_SA; + if (nfc->data & RXH_IP_DST) + hfld |= ICE_FLOW_HASH_FLD_IPV4_DA; + break; + case TCP_V6_FLOW: + case UDP_V6_FLOW: + case SCTP_V6_FLOW: + if (nfc->data & RXH_IP_SRC) + hfld |= ICE_FLOW_HASH_FLD_IPV6_SA; + if (nfc->data & RXH_IP_DST) + hfld |= ICE_FLOW_HASH_FLD_IPV6_DA; + break; + default: + break; + } + } + + if (nfc->data & RXH_L4_B_0_1 || nfc->data & RXH_L4_B_2_3) { + switch (nfc->flow_type) { + case TCP_V4_FLOW: + case TCP_V6_FLOW: + if (nfc->data & RXH_L4_B_0_1) + hfld |= ICE_FLOW_HASH_FLD_TCP_SRC_PORT; + if (nfc->data & RXH_L4_B_2_3) + hfld |= ICE_FLOW_HASH_FLD_TCP_DST_PORT; + break; + case UDP_V4_FLOW: + case UDP_V6_FLOW: + if (nfc->data & RXH_L4_B_0_1) + hfld |= ICE_FLOW_HASH_FLD_UDP_SRC_PORT; + if (nfc->data & RXH_L4_B_2_3) + hfld |= ICE_FLOW_HASH_FLD_UDP_DST_PORT; + break; + case SCTP_V4_FLOW: + case SCTP_V6_FLOW: + if (nfc->data & RXH_L4_B_0_1) + hfld |= ICE_FLOW_HASH_FLD_SCTP_SRC_PORT; + if (nfc->data & RXH_L4_B_2_3) + hfld |= ICE_FLOW_HASH_FLD_SCTP_DST_PORT; + break; + default: + break; + } + } + + return hfld; +} + +/** + * ice_set_rss_hash_opt - Enable/Disable flow types for RSS hash + * @vsi: the VSI being configured + * @nfc: ethtool rxnfc command + * + * Returns Success if the flow input set is supported. + */ +static int +ice_set_rss_hash_opt(struct ice_vsi *vsi, struct ethtool_rxnfc *nfc) +{ + struct ice_pf *pf = vsi->back; + struct ice_rss_hash_cfg cfg; + enum ice_status status; + struct device *dev; + u64 hashed_flds; + u32 hdrs; + + dev = ice_pf_to_dev(pf); + if (ice_is_safe_mode(pf)) { + dev_dbg(dev, "Advanced RSS disabled. Package download failed, vsi num = %d\n", + vsi->vsi_num); + return -EINVAL; + } + + hashed_flds = ice_parse_hash_flds(nfc); + if (hashed_flds == ICE_HASH_INVALID) { + dev_dbg(dev, "Invalid hash fields, vsi num = %d\n", + vsi->vsi_num); + return -EINVAL; + } + + hdrs = ice_parse_hdrs(nfc); + if (hdrs == ICE_FLOW_SEG_HDR_NONE) { + dev_dbg(dev, "Header type is not valid, vsi num = %d\n", + vsi->vsi_num); + return -EINVAL; + } + + cfg.hash_flds = hashed_flds; + cfg.addl_hdrs = hdrs; + cfg.hdr_type = ICE_RSS_ANY_HEADERS; + cfg.symm = false; + status = ice_add_rss_cfg(&pf->hw, vsi->idx, &cfg); + if (status) { + dev_dbg(dev, "ice_add_rss_cfg failed, vsi num = %d, error = %s\n", + vsi->vsi_num, ice_stat_str(status)); + return -EINVAL; + } + + return 0; +} + +/** + * ice_get_rss_hash_opt - Retrieve hash fields for a given flow-type + * @vsi: the VSI being configured + * @nfc: ethtool rxnfc command + */ +static void +ice_get_rss_hash_opt(struct ice_vsi *vsi, struct ethtool_rxnfc *nfc) +{ + struct ice_pf *pf = vsi->back; + struct device *dev; + u64 hash_flds; + u32 hdrs; + + dev = ice_pf_to_dev(pf); + + nfc->data = 0; + if (ice_is_safe_mode(pf)) { + dev_dbg(dev, "Advanced RSS disabled. 
Package download failed, vsi num = %d\n", + vsi->vsi_num); + return; + } + + hdrs = ice_parse_hdrs(nfc); + if (hdrs == ICE_FLOW_SEG_HDR_NONE) { + dev_dbg(dev, "Header type is not valid, vsi num = %d\n", + vsi->vsi_num); + return; + } + + hash_flds = ice_get_rss_cfg(&pf->hw, vsi->idx, hdrs); + if (hash_flds == ICE_HASH_INVALID) { + dev_dbg(dev, "No hash fields found for the given header type, vsi num = %d\n", + vsi->vsi_num); + return; + } + + if (hash_flds & ICE_FLOW_HASH_FLD_IPV4_SA || + hash_flds & ICE_FLOW_HASH_FLD_IPV6_SA) + nfc->data |= (u64)RXH_IP_SRC; + + if (hash_flds & ICE_FLOW_HASH_FLD_IPV4_DA || + hash_flds & ICE_FLOW_HASH_FLD_IPV6_DA) + nfc->data |= (u64)RXH_IP_DST; + + if (hash_flds & ICE_FLOW_HASH_FLD_TCP_SRC_PORT || + hash_flds & ICE_FLOW_HASH_FLD_UDP_SRC_PORT || + hash_flds & ICE_FLOW_HASH_FLD_SCTP_SRC_PORT) + nfc->data |= (u64)RXH_L4_B_0_1; + + if (hash_flds & ICE_FLOW_HASH_FLD_TCP_DST_PORT || + hash_flds & ICE_FLOW_HASH_FLD_UDP_DST_PORT || + hash_flds & ICE_FLOW_HASH_FLD_SCTP_DST_PORT) + nfc->data |= (u64)RXH_L4_B_2_3; +} + +/** + * ice_set_rxnfc - command to set Rx flow rules. + * @netdev: network interface device structure + * @cmd: ethtool rxnfc command + * + * Returns 0 for success and negative values for errors + */ +static int ice_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + + switch (cmd->cmd) { + case ETHTOOL_SRXCLSRLINS: + return ice_add_ntuple_ethtool(vsi, cmd); + case ETHTOOL_SRXCLSRLDEL: + return ice_del_ntuple_ethtool(vsi, cmd); + case ETHTOOL_SRXFH: + return ice_set_rss_hash_opt(vsi, cmd); + default: + break; + } + return -EOPNOTSUPP; +} /** * ice_get_rxnfc - command to get Rx flow classification rules @@ -2541,12 +4357,31 @@ ice_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd, struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; int ret = -EOPNOTSUPP; + struct ice_hw *hw; + + hw = &vsi->back->hw; switch (cmd->cmd) { case ETHTOOL_GRXRINGS: cmd->data = vsi->rss_size; ret = 0; break; + case ETHTOOL_GRXCLSRLCNT: + cmd->rule_cnt = hw->fdir_active_fltr; + /* report max rule count */ + cmd->data = ice_ntuple_get_max_fltr_cnt(hw); + ret = 0; + break; + case ETHTOOL_GRXCLSRULE: + ret = ice_get_ethtool_fdir_entry(hw, cmd); + break; + case ETHTOOL_GRXCLSRLALL: + ret = ice_get_fdir_fltr_ids(hw, cmd, (u32 *)rule_locs); + break; + case ETHTOOL_GRXFH: + ice_get_rss_hash_opt(vsi, cmd); + ret = 0; + break; default: break; } @@ -2577,10 +4412,13 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) { struct ice_ring *tx_rings = NULL, *rx_rings = NULL; struct ice_netdev_priv *np = netdev_priv(netdev); +#ifdef HAVE_XDP_SUPPORT + struct ice_ring *xdp_rings = NULL; +#endif /* HAVE_XDP_SUPPORT */ struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; int i, timeout = 50, err = 0; - u32 new_rx_cnt, new_tx_cnt; + u16 new_rx_cnt, new_tx_cnt; if (ring->tx_pending > ICE_MAX_NUM_DESC || ring->tx_pending < ICE_MIN_NUM_DESC || @@ -2595,13 +4433,11 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) new_tx_cnt = ALIGN(ring->tx_pending, ICE_REQ_DESC_MULTIPLE); if (new_tx_cnt != ring->tx_pending) - netdev_info(netdev, - "Requested Tx descriptor count rounded up to %d\n", + netdev_info(netdev, "Requested Tx descriptor count rounded up to %d\n", new_tx_cnt); new_rx_cnt = ALIGN(ring->rx_pending, ICE_REQ_DESC_MULTIPLE); if (new_rx_cnt != ring->rx_pending) - netdev_info(netdev, - 
"Requested Rx descriptor count rounded up to %d\n", + netdev_info(netdev, "Requested Rx descriptor count rounded up to %d\n", new_rx_cnt); /* if nothing to do return success */ @@ -2611,7 +4447,16 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) return 0; } - while (test_and_set_bit(__ICE_CFG_BUSY, pf->state)) { +#ifdef HAVE_AF_XDP_ZC_SUPPORT + /* If there is a AF_XDP UMEM attached to any of Rx rings, + * disallow changing the number of descriptors -- regardless + * if the netdev is running or not. + */ + if (ice_xsk_any_rx_ring_ena(vsi)) + return -EBUSY; + +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + while (test_and_set_bit(ICE_CFG_BUSY, pf->state)) { timeout--; if (!timeout) return -EBUSY; @@ -2624,6 +4469,13 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) vsi->tx_rings[i]->count = new_tx_cnt; for (i = 0; i < vsi->alloc_rxq; i++) vsi->rx_rings[i]->count = new_rx_cnt; +#ifdef HAVE_XDP_SUPPORT + if (ice_is_xdp_ena_vsi(vsi)) + for (i = 0; i < vsi->num_xdp_txq; i++) + vsi->xdp_rings[i]->count = new_tx_cnt; +#endif /* HAVE_XDP_SUPPORT */ + vsi->num_tx_desc = (u16)new_tx_cnt; + vsi->num_rx_desc = (u16)new_rx_cnt; netdev_dbg(netdev, "Link is down, descriptor count change happens when link is brought up\n"); goto done; } @@ -2635,8 +4487,7 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) netdev_info(netdev, "Changing Tx descriptor count from %d to %d\n", vsi->tx_rings[0]->count, new_tx_cnt); - tx_rings = devm_kcalloc(&pf->pdev->dev, vsi->num_txq, - sizeof(*tx_rings), GFP_KERNEL); + tx_rings = kcalloc(vsi->num_txq, sizeof(*tx_rings), GFP_KERNEL); if (!tx_rings) { err = -ENOMEM; goto done; @@ -2650,15 +4501,44 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) tx_rings[i].tx_buf = NULL; err = ice_setup_tx_ring(&tx_rings[i]); if (err) { - while (i) { - i--; + while (i--) ice_clean_tx_ring(&tx_rings[i]); - } - devm_kfree(&pf->pdev->dev, tx_rings); + kfree(tx_rings); goto done; } } +#ifdef HAVE_XDP_SUPPORT + if (!ice_is_xdp_ena_vsi(vsi)) + goto process_rx; + + /* alloc updated XDP resources */ + netdev_info(netdev, "Changing XDP descriptor count from %d to %d\n", + vsi->xdp_rings[0]->count, new_tx_cnt); + + xdp_rings = kcalloc(vsi->num_xdp_txq, sizeof(*xdp_rings), GFP_KERNEL); + if (!xdp_rings) { + err = -ENOMEM; + goto free_tx; + } + + for (i = 0; i < vsi->num_xdp_txq; i++) { + /* clone ring and setup updated count */ + xdp_rings[i] = *vsi->xdp_rings[i]; + xdp_rings[i].count = new_tx_cnt; + xdp_rings[i].desc = NULL; + xdp_rings[i].tx_buf = NULL; + err = ice_setup_tx_ring(&xdp_rings[i]); + if (err) { + while (i--) + ice_clean_tx_ring(&xdp_rings[i]); + kfree(xdp_rings); + goto free_tx; + } + ice_set_ring_xdp(&xdp_rings[i]); + } +#endif /* HAVE_XDP_SUPPORT */ + process_rx: if (new_rx_cnt == vsi->rx_rings[0]->count) goto process_link; @@ -2667,8 +4547,7 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) netdev_info(netdev, "Changing Rx descriptor count from %d to %d\n", vsi->rx_rings[0]->count, new_rx_cnt); - rx_rings = devm_kcalloc(&pf->pdev->dev, vsi->num_rxq, - sizeof(*rx_rings), GFP_KERNEL); + rx_rings = kcalloc(vsi->num_rxq, sizeof(*rx_rings), GFP_KERNEL); if (!rx_rings) { err = -ENOMEM; goto done; @@ -2698,7 +4577,7 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) i--; ice_free_rx_ring(&rx_rings[i]); } - devm_kfree(&pf->pdev->dev, rx_rings); + kfree(rx_rings); err = -ENOMEM; goto free_tx; } @@ -2708,7 +4587,7 @@ 
ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) /* Bring interface down, copy in the new ring info, then restore the * interface. if VSI is up, bring it down and then back up */ - if (!test_and_set_bit(__ICE_DOWN, vsi->state)) { + if (!test_and_set_bit(ICE_VSI_DOWN, vsi->state)) { ice_down(vsi); if (tx_rings) { @@ -2716,7 +4595,7 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) ice_free_tx_ring(vsi->tx_rings[i]); *vsi->tx_rings[i] = tx_rings[i]; } - devm_kfree(&pf->pdev->dev, tx_rings); + kfree(tx_rings); } if (rx_rings) { @@ -2734,9 +4613,21 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) rx_rings[i].next_to_alloc = 0; *vsi->rx_rings[i] = rx_rings[i]; } - devm_kfree(&pf->pdev->dev, rx_rings); + kfree(rx_rings); + } + +#ifdef HAVE_XDP_SUPPORT + if (xdp_rings) { + for (i = 0; i < vsi->num_xdp_txq; i++) { + ice_free_tx_ring(vsi->xdp_rings[i]); + *vsi->xdp_rings[i] = xdp_rings[i]; + } + kfree(xdp_rings); } +#endif /* HAVE_XDP_SUPPORT */ + vsi->num_tx_desc = new_tx_cnt; + vsi->num_rx_desc = new_rx_cnt; ice_up(vsi); } goto done; @@ -2746,38 +4637,15 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) if (tx_rings) { ice_for_each_txq(vsi, i) ice_free_tx_ring(&tx_rings[i]); - devm_kfree(&pf->pdev->dev, tx_rings); + kfree(tx_rings); } done: - clear_bit(__ICE_CFG_BUSY, pf->state); + clear_bit(ICE_CFG_BUSY, pf->state); return err; } -static int ice_nway_reset(struct net_device *netdev) -{ - /* restart autonegotiation */ - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - struct ice_port_info *pi; - enum ice_status status; - - pi = vsi->port_info; - /* If VSI state is up, then restart autoneg with link up */ - if (!test_bit(__ICE_DOWN, vsi->back->state)) - status = ice_aq_set_link_restart_an(pi, true, NULL); - else - status = ice_aq_set_link_restart_an(pi, false, NULL); - - if (status) { - netdev_info(netdev, "link restart failed, err %d aq_err %d\n", - status, pi->hw->adminq.sq_last_status); - return -EIO; - } - - return 0; -} - +#ifdef ETHTOOL_GLINKSETTINGS /** * ice_get_pauseparam - Get Flow Control status * @netdev: network interface device structure @@ -2794,7 +4662,6 @@ ice_get_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_port_info *pi = np->vsi->port_info; struct ice_aqc_get_phy_caps_data *pcaps; - struct ice_vsi *vsi = np->vsi; struct ice_dcbx_cfg *dcbx_cfg; enum ice_status status; @@ -2802,34 +4669,82 @@ ice_get_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) pause->rx_pause = 0; pause->tx_pause = 0; - dcbx_cfg = &pi->local_dcbx_cfg; + dcbx_cfg = &pi->qos_cfg.local_dcbx_cfg; - pcaps = devm_kzalloc(&vsi->back->pdev->dev, sizeof(*pcaps), - GFP_KERNEL); + pcaps = kzalloc(sizeof(*pcaps), GFP_KERNEL); if (!pcaps) return; /* Get current PHY config */ - status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_SW_CFG, pcaps, + status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_ACTIVE_CFG, pcaps, NULL); if (status) goto out; - pause->autoneg = ((pcaps->caps & ICE_AQC_PHY_AN_MODE) ? - AUTONEG_ENABLE : AUTONEG_DISABLE); + pause->autoneg = ice_is_phy_caps_an_enabled(pcaps) ? 
AUTONEG_ENABLE : + AUTONEG_DISABLE; + + if (dcbx_cfg->pfc.pfcena) + /* PFC enabled so report LFC as off */ + goto out; + + if (pcaps->caps & ICE_AQC_PHY_EN_TX_LINK_PAUSE) + pause->tx_pause = 1; + if (pcaps->caps & ICE_AQC_PHY_EN_RX_LINK_PAUSE) + pause->rx_pause = 1; + +out: + kfree(pcaps); +} +#else /* ETHTOOL_GLINKSETTINGS */ + +/** + * ice_get_pauseparam - Get Flow Control status + * @netdev: network interface device structure + * @pause: ethernet pause (flow control) parameters + * + * Get autonegotiated flow control status from link status. + */ +static void +ice_get_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_port_info *pi = np->vsi->port_info; + struct ice_link_status *hw_link_info; + struct ice_dcbx_cfg *dcbx_cfg; + + hw_link_info = &pi->phy.link_info; + + /* Initialize pause params */ + pause->rx_pause = 0; + pause->tx_pause = 0; + + pause->autoneg = ((hw_link_info->an_info & ICE_AQ_AN_COMPLETED) ? + AUTONEG_ENABLE : AUTONEG_DISABLE); + + dcbx_cfg = &pi->qos_cfg.local_dcbx_cfg; if (dcbx_cfg->pfc.pfcena) /* PFC enabled so report LFC as off */ - goto out; + return; - if (pcaps->caps & ICE_AQC_PHY_EN_TX_LINK_PAUSE) + /* Get flow control status based on autonegotiation */ + switch (pi->fc.current_mode) { + case ICE_FC_TX_PAUSE: pause->tx_pause = 1; - if (pcaps->caps & ICE_AQC_PHY_EN_RX_LINK_PAUSE) + break; + case ICE_FC_RX_PAUSE: pause->rx_pause = 1; - -out: - devm_kfree(&vsi->back->pdev->dev, pcaps); + break; + case ICE_FC_FULL: + pause->tx_pause = 1; + pause->rx_pause = 1; + break; + default: + break; + } } +#endif /* ETHTOOL_GLINKSETTINGS */ /** * ice_set_pauseparam - Set Flow Control parameter @@ -2840,7 +4755,9 @@ static int ice_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) { struct ice_netdev_priv *np = netdev_priv(netdev); +#ifdef ETHTOOL_GLINKSETTINGS struct ice_aqc_get_phy_caps_data *pcaps; +#endif struct ice_link_status *hw_link_info; struct ice_pf *pf = np->vsi->back; struct ice_dcbx_cfg *dcbx_cfg; @@ -2855,7 +4772,7 @@ ice_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) pi = vsi->port_info; hw_link_info = &pi->phy.link_info; - dcbx_cfg = &pi->local_dcbx_cfg; + dcbx_cfg = &pi->qos_cfg.local_dcbx_cfg; link_up = hw_link_info->link_info & ICE_AQ_LINK_UP; /* Changing the port's flow control is not supported if this isn't the @@ -2866,6 +4783,7 @@ ice_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) return -EOPNOTSUPP; } +#ifdef ETHTOOL_GLINKSETTINGS /* Get pause param reports configured and negotiated flow control pause * when ETHTOOL_GLINKSETTINGS is defined. Since ETHTOOL_GLINKSETTINGS is * defined get pause param pause->autoneg reports SW configured setting, @@ -2877,17 +4795,23 @@ ice_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) return -ENOMEM; /* Get current PHY config */ - status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_SW_CFG, pcaps, + status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_ACTIVE_CFG, pcaps, NULL); if (status) { kfree(pcaps); return -EIO; } - is_an = ((pcaps->caps & ICE_AQC_PHY_AN_MODE) ? + is_an = ice_is_phy_caps_an_enabled(pcaps) ? AUTONEG_ENABLE : + AUTONEG_DISABLE; +#else /* ETHTOOL_GLINKSETTINGS */ + is_an = ((hw_link_info->an_info & ICE_AQ_AN_COMPLETED) ? 
AUTONEG_ENABLE : AUTONEG_DISABLE); +#endif /* ETHTOOL_GLINKSETTINGS */ +#ifdef ETHTOOL_GLINKSETTINGS kfree(pcaps); +#endif if (pause->autoneg != is_an) { netdev_info(netdev, "To change autoneg please use: ethtool -s autoneg \n"); @@ -2895,7 +4819,7 @@ ice_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) } /* If we have link and don't have autoneg */ - if (!test_bit(__ICE_DOWN, pf->state) && + if (!test_bit(ICE_DOWN, pf->state) && !(hw_link_info->an_info & ICE_AQ_AN_COMPLETED)) { /* Send message that it might not necessarily work*/ netdev_info(netdev, "Autoneg did not complete so changing settings may not result in an actual change.\n"); @@ -2920,34 +4844,26 @@ ice_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) status = ice_set_fc(pi, &aq_failures, link_up); if (aq_failures & ICE_SET_FC_AQ_FAIL_GET) { - netdev_info(netdev, "Set fc failed on the get_phy_capabilities call with err %d aq_err %d\n", - status, hw->adminq.sq_last_status); + netdev_info(netdev, "Set fc failed on the get_phy_capabilities call with err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); err = -EAGAIN; } else if (aq_failures & ICE_SET_FC_AQ_FAIL_SET) { - netdev_info(netdev, "Set fc failed on the set_phy_config call with err %d aq_err %d\n", - status, hw->adminq.sq_last_status); + netdev_info(netdev, "Set fc failed on the set_phy_config call with err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); err = -EAGAIN; } else if (aq_failures & ICE_SET_FC_AQ_FAIL_UPDATE) { - netdev_info(netdev, "Set fc failed on the get_link_info call with err %d aq_err %d\n", - status, hw->adminq.sq_last_status); + netdev_info(netdev, "Set fc failed on the get_link_info call with err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); err = -EAGAIN; } - if (!test_bit(__ICE_DOWN, pf->state)) { - /* Give it a little more time to try to come back. If still - * down, restart autoneg link or reinitialize the interface. - */ - msleep(75); - if (!test_bit(__ICE_DOWN, pf->state)) - return ice_nway_reset(netdev); - - ice_down(vsi); - ice_up(vsi); - } - return err; } +#if defined(ETHTOOL_GRSSH) && defined(ETHTOOL_SRSSH) /** * ice_get_rxfh_key_size - get the RSS hash key size * @netdev: network interface device structure @@ -2972,116 +4888,463 @@ static u32 ice_get_rxfh_indir_size(struct net_device *netdev) return np->vsi->rss_table_size; } +#ifdef HAVE_RXFH_HASHFUNC +/** + * ice_get_rxfh - get the Rx flow hash indirection table + * @netdev: network interface device structure + * @indir: indirection table + * @key: hash key + * @hfunc: hash function + * + * Reads the indirection table directly from the hardware. 
+ */ +static int +ice_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, u8 *hfunc) +#else +static int ice_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key) +#endif +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + int err, i; + u8 *lut; + +#ifdef HAVE_RXFH_HASHFUNC + if (hfunc) + *hfunc = ETH_RSS_HASH_TOP; +#endif + + if (!indir) + return 0; + + if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { + /* RSS not supported return error here */ + netdev_warn(netdev, "RSS is not configured on this VSI!\n"); + return -EIO; + } + + lut = kzalloc(vsi->rss_table_size, GFP_KERNEL); + if (!lut) + return -ENOMEM; + + err = ice_get_rss_key(vsi, key); + if (err) + goto out; + + err = ice_get_rss_lut(vsi, lut, vsi->rss_table_size); + if (err) + goto out; + + for (i = 0; i < vsi->rss_table_size; i++) + indir[i] = (u32)(lut[i]); + +out: + kfree(lut); + return err; +} + +#ifdef HAVE_RXFH_HASHFUNC +/** + * ice_set_rxfh - set the Rx flow hash indirection table + * @netdev: network interface device structure + * @indir: indirection table + * @key: hash key + * @hfunc: hash function + * + * Returns -EINVAL if the table specifies an invalid queue ID, otherwise + * returns 0 after programming the table. + */ +static int +ice_set_rxfh(struct net_device *netdev, const u32 *indir, const u8 *key, + const u8 hfunc) +#elif defined(HAVE_RXFH_NONCONST) +static int ice_set_rxfh(struct net_device *netdev, u32 *indir, u8 *key) +#else +static int +ice_set_rxfh(struct net_device *netdev, const u32 *indir, const u8 *key) +#endif /* HAVE_RXFH_HASHFUNC */ +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + struct device *dev; + int err; + + dev = ice_pf_to_dev(pf); +#ifdef HAVE_RXFH_HASHFUNC + if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) + return -EOPNOTSUPP; +#endif + + if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { + /* RSS not supported return error here */ + netdev_warn(netdev, "RSS is not configured on this VSI!\n"); + return -EIO; + } + + /* Verify user input. 
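
The check that follows rejects any table entry that names a nonexistent queue, and when no table is supplied ice_set_rxfh falls back to ice_fill_rss_lut further down, while ice_get_rxfh above widens the hardware's byte-per-entry LUT back into ethtool's u32 view. A minimal userspace sketch of that indirection-table handling follows; the round-robin fill is an assumption about ice_fill_rss_lut (its body is not part of this hunk) and every name here is illustrative.

#include <stdio.h>
#include <stdint.h>

#define LUT_SIZE 64u    /* stands in for vsi->rss_table_size */

/* Assumed default fill: spread LUT entries round-robin over the active queues. */
static void fill_rss_lut(uint8_t *lut, unsigned int lut_size, unsigned int rss_size)
{
        unsigned int i;

        for (i = 0; i < lut_size; i++)
                lut[i] = i % rss_size;
}

/* User-supplied table: every entry must point at an existing queue. */
static int validate_lut(const uint32_t *indir, unsigned int lut_size, unsigned int rss_size)
{
        unsigned int i;

        for (i = 0; i < lut_size; i++)
                if (indir[i] >= rss_size)
                        return -1;      /* the driver returns -EINVAL here */
        return 0;
}

int main(void)
{
        unsigned int rss_size = 8, hash = 0xdeadbeef, i;
        uint32_t indir[LUT_SIZE];
        uint8_t lut[LUT_SIZE];

        fill_rss_lut(lut, LUT_SIZE, rss_size);
        for (i = 0; i < LUT_SIZE; i++)
                indir[i] = lut[i];      /* the u8 LUT is widened to u32 for ethtool */

        printf("user table valid: %s\n",
               validate_lut(indir, LUT_SIZE, rss_size) ? "no" : "yes");
        /* On receive, low bits of the packet hash index the LUT to pick a queue. */
        printf("hash 0x%x lands on queue %u\n", hash, (unsigned int)lut[hash % LUT_SIZE]);
        return 0;
}
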
*/ + if (indir) { + int i; + + for (i = 0; i < vsi->rss_table_size; i++) + if (indir[i] >= vsi->rss_size) + return -EINVAL; + } + +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + netdev_err(netdev, + "Cannot change RSS params with ADQ configured.\n"); + return -EOPNOTSUPP; + } + +#endif /* NETIF_F_HW_TC */ + if (key) { + if (!vsi->rss_hkey_user) { + vsi->rss_hkey_user = + devm_kzalloc(dev, ICE_VSIQF_HKEY_ARRAY_SIZE, + GFP_KERNEL); + if (!vsi->rss_hkey_user) + return -ENOMEM; + } + memcpy(vsi->rss_hkey_user, key, ICE_VSIQF_HKEY_ARRAY_SIZE); + + err = ice_set_rss_key(vsi, vsi->rss_hkey_user); + if (err) + return err; + } + + if (!vsi->rss_lut_user) { + vsi->rss_lut_user = devm_kzalloc(dev, vsi->rss_table_size, + GFP_KERNEL); + if (!vsi->rss_lut_user) + return -ENOMEM; + } + + /* Each 32 bits pointed by 'indir' is stored with a lut entry */ + if (indir) { + int i; + + for (i = 0; i < vsi->rss_table_size; i++) + vsi->rss_lut_user[i] = (u8)(indir[i]); + } else { + ice_fill_rss_lut(vsi->rss_lut_user, vsi->rss_table_size, + vsi->rss_size); + } + + err = ice_set_rss_lut(vsi, vsi->rss_lut_user, vsi->rss_table_size); + if (err) + return err; + + return 0; +} +#endif /* ETHTOOL_GRSSH && ETHTOOL_SRSSH */ + +static int +ice_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) +{ + struct ice_pf *pf = ice_netdev_to_pf(dev); + + /* only report timestamping if PTP is enabled */ + if (!test_bit(ICE_FLAG_PTP, pf->flags)) + return ethtool_op_get_ts_info(dev, info); + + info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_TX_HARDWARE | + SOF_TIMESTAMPING_RX_HARDWARE | + SOF_TIMESTAMPING_RAW_HARDWARE; + + info->phc_index = ice_get_ptp_clock_index(pf); + + info->tx_types = BIT(HWTSTAMP_TX_OFF) | BIT(HWTSTAMP_TX_ON); + + info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) | BIT(HWTSTAMP_FILTER_ALL); + + return 0; +} + +/** + * ice_get_max_txq - return the maximum number of Tx queues for in a PF + * @pf: PF structure + */ +static int ice_get_max_txq(struct ice_pf *pf) +{ + return min3(pf->num_lan_msix, (u16)num_online_cpus(), + (u16)pf->hw.func_caps.common_cap.num_txq); +} + +/** + * ice_get_max_rxq - return the maximum number of Rx queues for in a PF + * @pf: PF structure + */ +static int ice_get_max_rxq(struct ice_pf *pf) +{ + return min3(pf->num_lan_msix, (u16)num_online_cpus(), + (u16)pf->hw.func_caps.common_cap.num_rxq); +} + +/** + * ice_get_combined_cnt - return the current number of combined channels + * @vsi: PF VSI pointer + * + * Go through all queue vectors and count ones that have both Rx and Tx ring + * attached + */ +static u32 ice_get_combined_cnt(struct ice_vsi *vsi) +{ + u32 combined = 0; + int q_idx; + + ice_for_each_q_vector(vsi, q_idx) { + struct ice_q_vector *q_vector = vsi->q_vectors[q_idx]; + + if (q_vector->rx.ring && q_vector->tx.ring) + combined++; + } + + return combined; +} + +/** + * ice_get_channels - get the current and max supported channels + * @dev: network interface device structure + * @ch: ethtool channel data structure + */ +static void +ice_get_channels(struct net_device *dev, struct ethtool_channels *ch) +{ + struct ice_netdev_priv *np = netdev_priv(dev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + + /* report maximum channels */ + ch->max_rx = ice_get_max_rxq(pf); + ch->max_tx = ice_get_max_txq(pf); + ch->max_combined = min_t(int, ch->max_rx, ch->max_tx); + + /* report current channels */ + ch->combined_count = ice_get_combined_cnt(vsi); + ch->rx_count = 
vsi->num_rxq - ch->combined_count; + ch->tx_count = vsi->num_txq - ch->combined_count; +#ifdef HAVE_NETDEV_SB_DEV + + if (test_bit(ICE_FLAG_MACVLAN_ENA, pf->flags)) { + /* L2 forwarding devices are single queue so we infer one + * device is one channel + */ + ch->max_combined += pf->max_num_macvlan; + ch->combined_count += pf->num_macvlan; + } +#endif /* HAVE_NETDEV_SB_DEV */ + + /* report other queues */ + ch->other_count = test_bit(ICE_FLAG_FD_ENA, pf->flags) ? 1 : 0; + ch->max_other = ch->other_count; +} + +/** + * ice_get_valid_rss_size - return valid number of RSS queues + * @hw: pointer to the HW structure + * @new_size: requested RSS queues + */ +static int ice_get_valid_rss_size(struct ice_hw *hw, int new_size) +{ + struct ice_hw_common_caps *caps = &hw->func_caps.common_cap; + + return min_t(int, new_size, BIT(caps->rss_table_entry_width)); +} + +/** + * ice_vsi_set_dflt_rss_lut - set default RSS LUT with requested RSS size + * @vsi: VSI to reconfigure RSS LUT on + * @req_rss_size: requested range of queue numbers for hashing + * + * Set the VSI's RSS parameters, configure the RSS LUT based on these. + */ +static int ice_vsi_set_dflt_rss_lut(struct ice_vsi *vsi, int req_rss_size) +{ + struct ice_pf *pf = vsi->back; + struct device *dev; + struct ice_hw *hw; + int err; + u8 *lut; + + dev = ice_pf_to_dev(pf); + hw = &pf->hw; + + if (!req_rss_size) + return -EINVAL; + + lut = kzalloc(vsi->rss_table_size, GFP_KERNEL); + if (!lut) + return -ENOMEM; + + /* set RSS LUT parameters */ + if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) + vsi->rss_size = 1; + else + vsi->rss_size = ice_get_valid_rss_size(hw, req_rss_size); + + /* create/set RSS LUT */ + ice_fill_rss_lut(lut, vsi->rss_table_size, vsi->rss_size); + err = ice_set_rss_lut(vsi, lut, vsi->rss_table_size); + if (err) + dev_err(dev, "Cannot set RSS lut, err %d aq_err %s\n", err, + ice_aq_str(hw->adminq.sq_last_status)); + + kfree(lut); + return err; +} + +/** + * ice_set_channels - set the number channels + * @dev: network interface device structure + * @ch: ethtool channel data structure + */ +static int ice_set_channels(struct net_device *dev, struct ethtool_channels *ch) +{ + struct ice_netdev_priv *np = netdev_priv(dev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + int new_rx = 0, new_tx = 0; + u32 curr_combined; + + /* do not support changing channels in Safe Mode */ + if (ice_is_safe_mode(pf)) { + netdev_err(dev, "Changing channel in Safe Mode is not supported\n"); + return -EOPNOTSUPP; + } + /* do not support changing other_count */ + if (ch->other_count != (test_bit(ICE_FLAG_FD_ENA, pf->flags) ? 
1U : 0U)) + return -EINVAL; + +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + netdev_err(dev, "Cannot set channels with ADQ configured.\n"); + return -EOPNOTSUPP; + } +#endif /* NETIF_F_HW_TC */ +#ifdef HAVE_NETDEV_SB_DEV + if (test_bit(ICE_FLAG_MACVLAN_ENA, pf->flags)) { + netdev_err(dev, "Cannot set channels when L2 forwarding enabled\n"); + return -EOPNOTSUPP; + } +#endif /* HAVE_NETDEV_SB_DEV */ + + if (test_bit(ICE_FLAG_FD_ENA, pf->flags) && pf->hw.fdir_active_fltr) { + netdev_err(dev, "Cannot set channels when Flow Director filters are active\n"); + return -EOPNOTSUPP; + } + + curr_combined = ice_get_combined_cnt(vsi); + + /* these checks are for cases where user didn't specify a particular + * value on cmd line but we get non-zero value anyway via + * get_channels(); look at ethtool.c in ethtool repository (the user + * space part), particularly, do_schannels() routine + */ + if (ch->rx_count == vsi->num_rxq - curr_combined) + ch->rx_count = 0; + if (ch->tx_count == vsi->num_txq - curr_combined) + ch->tx_count = 0; + if (ch->combined_count == curr_combined) + ch->combined_count = 0; + + if (!(ch->combined_count || (ch->rx_count && ch->tx_count))) { + netdev_err(dev, "Please specify at least 1 Rx and 1 Tx channel\n"); + return -EINVAL; + } + + new_rx = ch->combined_count + ch->rx_count; + new_tx = ch->combined_count + ch->tx_count; + + if (new_rx > ice_get_max_rxq(pf)) { + netdev_err(dev, "Maximum allowed Rx channels is %d\n", + ice_get_max_rxq(pf)); + return -EINVAL; + } + if (new_tx > ice_get_max_txq(pf)) { + netdev_err(dev, "Maximum allowed Tx channels is %d\n", + ice_get_max_txq(pf)); + return -EINVAL; + } + + ice_vsi_recfg_qs(vsi, new_rx, new_tx); + +#ifdef IFF_RXFH_CONFIGURED + if (!netif_is_rxfh_configured(dev)) + return ice_vsi_set_dflt_rss_lut(vsi, new_rx); + + /* Update rss_size due to change in Rx queues */ + vsi->rss_size = ice_get_valid_rss_size(&pf->hw, new_rx); + + return 0; +#else + /* Clear the previous vsi->rss_lut_user because it is assumed to + * be invalid at this point. + */ + if (vsi->rss_lut_user) { + netdev_info(vsi->netdev, "Rx queue count changed, clearing user modified RSS LUT, re-run ethtool [-x|-X] to [check|set] settings if needed\n"); + devm_kfree(ice_pf_to_dev(pf), vsi->rss_lut_user); + vsi->rss_lut_user = NULL; + } + + return ice_vsi_set_dflt_rss_lut(vsi, new_rx); +#endif /* IFF_RXFH_CONFIGURED */ +} + /** - * ice_get_rxfh - get the Rx flow hash indirection table + * ice_get_wol - get current Wake on LAN configuration * @netdev: network interface device structure - * @indir: indirection table - * @key: hash key - * @hfunc: hash function - * - * Reads the indirection table directly from the hardware. 
+ * @wol: Ethtool structure to retrieve WoL settings */ -static int -ice_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, u8 *hfunc) +static void ice_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) { struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - struct ice_pf *pf = vsi->back; - int ret = 0, i; - u8 *lut; - - if (hfunc) - *hfunc = ETH_RSS_HASH_TOP; - - if (!indir) - return 0; - - if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { - /* RSS not supported return error here */ - netdev_warn(netdev, "RSS is not configured on this VSI!\n"); - return -EIO; - } + struct ice_pf *pf = np->vsi->back; - lut = devm_kzalloc(&pf->pdev->dev, vsi->rss_table_size, GFP_KERNEL); - if (!lut) - return -ENOMEM; + if (np->vsi->type != ICE_VSI_PF) + netdev_warn(netdev, "Wake on LAN is not supported on this interface!\n"); - if (ice_get_rss(vsi, key, lut, vsi->rss_table_size)) { - ret = -EIO; - goto out; + /* Get WoL settings based on the HW capability */ + if (ice_is_wol_supported(&pf->hw)) { + wol->supported = WAKE_MAGIC; + wol->wolopts = pf->wol_ena ? WAKE_MAGIC : 0; + } else { + wol->supported = 0; + wol->wolopts = 0; } - - for (i = 0; i < vsi->rss_table_size; i++) - indir[i] = (u32)(lut[i]); - -out: - devm_kfree(&pf->pdev->dev, lut); - return ret; } /** - * ice_set_rxfh - set the Rx flow hash indirection table + * ice_set_wol - set Wake on LAN on supported device * @netdev: network interface device structure - * @indir: indirection table - * @key: hash key - * @hfunc: hash function - * - * Returns -EINVAL if the table specifies an invalid queue ID, otherwise - * returns 0 after programming the table. + * @wol: Ethtool structure to set WoL */ -static int -ice_set_rxfh(struct net_device *netdev, const u32 *indir, const u8 *key, - const u8 hfunc) +static int ice_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) { struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; - u8 *seed = NULL; - if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) + if (vsi->type != ICE_VSI_PF || !ice_is_wol_supported(&pf->hw)) return -EOPNOTSUPP; - if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { - /* RSS not supported return error here */ - netdev_warn(netdev, "RSS is not configured on this VSI!\n"); - return -EIO; - } - - if (key) { - if (!vsi->rss_hkey_user) { - vsi->rss_hkey_user = - devm_kzalloc(&pf->pdev->dev, - ICE_VSIQF_HKEY_ARRAY_SIZE, - GFP_KERNEL); - if (!vsi->rss_hkey_user) - return -ENOMEM; - } - memcpy(vsi->rss_hkey_user, key, ICE_VSIQF_HKEY_ARRAY_SIZE); - seed = vsi->rss_hkey_user; - } - - if (!vsi->rss_lut_user) { - vsi->rss_lut_user = devm_kzalloc(&pf->pdev->dev, - vsi->rss_table_size, - GFP_KERNEL); - if (!vsi->rss_lut_user) - return -ENOMEM; - } - - /* Each 32 bits pointed by 'indir' is stored with a lut entry */ - if (indir) { - int i; + /* only magic packet is supported */ + if (wol->wolopts && wol->wolopts != WAKE_MAGIC) + return -EOPNOTSUPP; - for (i = 0; i < vsi->rss_table_size; i++) - vsi->rss_lut_user[i] = (u8)(indir[i]); - } else { - ice_fill_rss_lut(vsi->rss_lut_user, vsi->rss_table_size, - vsi->rss_size); + /* Set WoL only if there is a new value */ + if (pf->wol_ena != !!wol->wolopts) { + pf->wol_ena = !!wol->wolopts; + device_set_wakeup_enable(ice_pf_to_dev(pf), pf->wol_ena); + netdev_dbg(netdev, "WoL magic packet %sabled\n", + pf->wol_ena ? 
"en" : "dis"); } - if (ice_set_rss(vsi, seed, vsi->rss_lut_user, vsi->rss_table_size)) - return -EIO; - return 0; } @@ -3106,25 +5369,21 @@ static int ice_get_rc_coalesce(struct ethtool_coalesce *ec, enum ice_container_type c_type, struct ice_ring_container *rc) { - struct ice_pf *pf; - if (!rc->ring) return -EINVAL; - pf = rc->ring->vsi->back; - switch (c_type) { case ICE_RX_CONTAINER: - ec->use_adaptive_rx_coalesce = ITR_IS_DYNAMIC(rc->itr_setting); - ec->rx_coalesce_usecs = rc->itr_setting & ~ICE_ITR_DYNAMIC; + ec->use_adaptive_rx_coalesce = ITR_IS_DYNAMIC(rc); + ec->rx_coalesce_usecs = rc->itr_setting; ec->rx_coalesce_usecs_high = rc->ring->q_vector->intrl; break; case ICE_TX_CONTAINER: - ec->use_adaptive_tx_coalesce = ITR_IS_DYNAMIC(rc->itr_setting); - ec->tx_coalesce_usecs = rc->itr_setting & ~ICE_ITR_DYNAMIC; + ec->use_adaptive_tx_coalesce = ITR_IS_DYNAMIC(rc); + ec->tx_coalesce_usecs = rc->itr_setting; break; default: - dev_dbg(&pf->pdev->dev, "Invalid c_type %d\n", c_type); + dev_dbg(ice_pf_to_dev(rc->ring->vsi->back), "Invalid c_type %d\n", c_type); return -EINVAL; } @@ -3197,12 +5456,14 @@ ice_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec) return __ice_get_coalesce(netdev, ec, -1); } +#ifdef ETHTOOL_PERQUEUE static int ice_get_per_q_coalesce(struct net_device *netdev, u32 q_num, struct ethtool_coalesce *ec) { return __ice_get_coalesce(netdev, ec, q_num); } +#endif /* ETHTOOL_PERQUEUE */ /** * ice_set_rc_coalesce - set ITR values for specific ring container @@ -3234,17 +5495,21 @@ ice_set_rc_coalesce(enum ice_container_type c_type, struct ethtool_coalesce *ec, if (ec->rx_coalesce_usecs_high > ICE_MAX_INTRL || (ec->rx_coalesce_usecs_high && ec->rx_coalesce_usecs_high < pf->hw.intrl_gran)) { - netdev_info(vsi->netdev, - "Invalid value, %s-usecs-high valid values are 0 (disabled), %d-%d\n", + netdev_info(vsi->netdev, "Invalid value, %s-usecs-high valid values are 0 (disabled), %d-%d\n", c_type_str, pf->hw.intrl_gran, ICE_MAX_INTRL); return -EINVAL; } + if (ec->rx_coalesce_usecs_high != rc->ring->q_vector->intrl && + (ec->use_adaptive_rx_coalesce || ec->use_adaptive_tx_coalesce)) { + netdev_info(vsi->netdev, "Invalid value, %s-usecs-high cannot be changed if adaptive-tx or adaptive-rx is enabled\n", + c_type_str); + return -EINVAL; + } if (ec->rx_coalesce_usecs_high != rc->ring->q_vector->intrl) { rc->ring->q_vector->intrl = ec->rx_coalesce_usecs_high; - wr32(&pf->hw, GLINT_RATE(rc->ring->q_vector->reg_idx), - ice_intrl_usec_to_reg(ec->rx_coalesce_usecs_high, - pf->hw.intrl_gran)); + ice_write_intrl(rc->ring->q_vector, + ec->rx_coalesce_usecs_high); } use_adaptive_coalesce = ec->use_adaptive_rx_coalesce; @@ -3252,53 +5517,47 @@ ice_set_rc_coalesce(enum ice_container_type c_type, struct ethtool_coalesce *ec, break; case ICE_TX_CONTAINER: - if (ec->tx_coalesce_usecs_high) { - netdev_info(vsi->netdev, - "setting %s-usecs-high is not supported\n", - c_type_str); - return -EINVAL; - } +#ifndef ETHTOOL_COALESCE_USECS + if (ec->tx_coalesce_usecs_high) + return -EOPNOTSUPP; +#endif /* !ETHTOOL_COALESCE_USECS */ use_adaptive_coalesce = ec->use_adaptive_tx_coalesce; coalesce_usecs = ec->tx_coalesce_usecs; break; default: - dev_dbg(&pf->pdev->dev, "Invalid container type %d\n", c_type); + dev_dbg(ice_pf_to_dev(pf), "Invalid container type %d\n", + c_type); return -EINVAL; } - itr_setting = rc->itr_setting & ~ICE_ITR_DYNAMIC; + itr_setting = rc->itr_setting; if (coalesce_usecs != itr_setting && use_adaptive_coalesce) { - netdev_info(vsi->netdev, - "%s interrupt throttling 
cannot be changed if adaptive-%s is enabled\n", + netdev_info(vsi->netdev, "%s interrupt throttling cannot be changed if adaptive-%s is enabled\n", c_type_str, c_type_str); return -EINVAL; } if (coalesce_usecs > ICE_ITR_MAX) { - netdev_info(vsi->netdev, - "Invalid value, %s-usecs range is 0-%d\n", + netdev_info(vsi->netdev, "Invalid value, %s-usecs range is 0-%d\n", c_type_str, ICE_ITR_MAX); return -EINVAL; } - /* hardware only supports an ITR granularity of 2us */ - if (coalesce_usecs % 2 != 0) { - netdev_info(vsi->netdev, - "Invalid value, %s-usecs must be even\n", - c_type_str); - return -EINVAL; - } - if (use_adaptive_coalesce) { - rc->itr_setting |= ICE_ITR_DYNAMIC; + rc->itr_mode = ITR_DYNAMIC; } else { + rc->itr_mode = ITR_STATIC; /* store user facing value how it was set */ rc->itr_setting = coalesce_usecs; - /* set to static and convert to value HW understands */ - rc->target_itr = - ITR_TO_REG(ITR_REG_ALIGN(rc->itr_setting)); + /* write the change to the register */ + ice_write_itr(rc, coalesce_usecs); + /* force writes to take effect immediately, the flush shouldn't + * be done in the functions above because the intent is for + * them to do lazy writes. + */ + ice_flush(&pf->hw); } return 0; @@ -3344,6 +5603,50 @@ ice_set_q_coalesce(struct ice_vsi *vsi, struct ethtool_coalesce *ec, int q_num) return 0; } +#ifndef ETHTOOL_COALESCE_USECS +/** + * ice_is_coalesce_param_invalid - check for unsupported coalesce parameters + * @ec: ethtool structure to fill with driver's coalesce settings + */ +static bool ice_is_coalesce_param_invalid(struct ethtool_coalesce *ec) +{ + if (ec->stats_block_coalesce_usecs || ec->rate_sample_interval || + ec->pkt_rate_low || ec->pkt_rate_high || + ec->rx_max_coalesced_frames || ec->rx_coalesce_usecs_irq || + ec->rx_max_coalesced_frames_irq || ec->tx_max_coalesced_frames || + ec->tx_coalesce_usecs_irq || ec->tx_max_coalesced_frames_irq || + ec->rx_coalesce_usecs_low || ec->rx_max_coalesced_frames_low || + ec->tx_coalesce_usecs_low || ec->tx_max_coalesced_frames_low || + ec->rx_max_coalesced_frames_high || + ec->tx_max_coalesced_frames_high) + return true; + + return false; +} +#endif /* !ETHTOOL_COALESCE_USECS */ + +/** + * ice_print_if_odd_usecs - print message if user tries to set odd [tx|rx]-usecs + * @netdev: netdev used for print + * @itr_setting: previous user setting + * @use_adaptive_coalesce: if adaptive coalesce is enabled or being enabled + * @coalesce_usecs: requested value of [tx|rx]-usecs + * @c_type_str: either "rx" or "tx" to match user set field of [tx|rx]-usecs + */ +static void +ice_print_if_odd_usecs(struct net_device *netdev, u16 itr_setting, + u32 use_adaptive_coalesce, u32 coalesce_usecs, + const char *c_type_str) +{ + if (use_adaptive_coalesce) + return; + + if (itr_setting != coalesce_usecs && (coalesce_usecs % 2)) + netdev_info(netdev, "User set %s-usecs to %d, device only supports even values. 
Rounding down and attempting to set %s-usecs to %d\n", + c_type_str, coalesce_usecs, c_type_str, + ITR_REG_ALIGN(coalesce_usecs)); +} + /** * __ice_set_coalesce - set ITR/INTRL values for the device * @netdev: pointer to the netdev associated with this query @@ -3360,9 +5663,25 @@ __ice_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec, struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; +#ifndef ETHTOOL_COALESCE_USECS + if (ice_is_coalesce_param_invalid(ec)) + return -EOPNOTSUPP; +#endif /* !ETHTOOL_COALESCE_USECS */ + if (q_num < 0) { + struct ice_q_vector *q_vector = vsi->q_vectors[0]; int v_idx; + if (q_vector) { + ice_print_if_odd_usecs(netdev, q_vector->rx.itr_setting, + ec->use_adaptive_rx_coalesce, + ec->rx_coalesce_usecs, "rx"); + + ice_print_if_odd_usecs(netdev, q_vector->tx.itr_setting, + ec->use_adaptive_tx_coalesce, + ec->tx_coalesce_usecs, "tx"); + } + ice_for_each_q_vector(vsi, v_idx) { /* In some cases if DCB is configured the num_[rx|tx]q * can be less than vsi->num_q_vectors. This check @@ -3381,7 +5700,6 @@ __ice_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec, return -EINVAL; set_complete: - return 0; } @@ -3391,25 +5709,306 @@ ice_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec) return __ice_set_coalesce(netdev, ec, -1); } +#ifdef ETHTOOL_PERQUEUE static int ice_set_per_q_coalesce(struct net_device *netdev, u32 q_num, struct ethtool_coalesce *ec) { return __ice_set_coalesce(netdev, ec, q_num); } +#endif /* ETHTOOL_PERQUEUE */ + +#ifndef ETHTOOL_COALESCE_USECS +/** + * ice_repr_is_coalesce_param_invalid - check for unsupported coalesce params + * @ec: ethtool structure to fill with driver's coalesce settings + * + * Returns true if anything but ec->rx_coalesce_usecs_high is set, + * returns false otherwise. + * + */ +static bool +ice_repr_is_coalesce_param_invalid(struct ethtool_coalesce *ec) +{ + if (ec->rx_coalesce_usecs || ec->rx_max_coalesced_frames || + ec->rx_coalesce_usecs_irq || ec->rx_max_coalesced_frames_irq || + ec->tx_coalesce_usecs || ec->tx_max_coalesced_frames || + ec->tx_coalesce_usecs_irq || ec->tx_max_coalesced_frames_irq || + ec->stats_block_coalesce_usecs || ec->use_adaptive_rx_coalesce || + ec->use_adaptive_tx_coalesce || ec->pkt_rate_low || + ec->rx_coalesce_usecs_low || ec->rx_max_coalesced_frames_low || + ec->tx_coalesce_usecs_low || ec->tx_max_coalesced_frames_low || + ec->pkt_rate_high || ec->rx_max_coalesced_frames_high || + ec->tx_coalesce_usecs_high || ec->tx_max_coalesced_frames_high || + ec->rate_sample_interval) + return true; + + return false; +} +#endif /* !ETHTOOL_COALESCE_USECS */ + +/** + * ice_repr_set_coalesce - set coalesce settings for all queues + * @netdev: pointer to the netdev associated with this query + * @ec: ethtool structure to read the requested coalesce settings + * + * Return 0 on success, negative otherwise. 
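
ice_print_if_odd_usecs exists because the ITR register only holds multiples of 2 microseconds, so an odd rx-usecs or tx-usecs request is now rounded down by ITR_REG_ALIGN instead of being rejected the way the removed check above did. A standalone sketch of that clamp-and-round step, assuming the 2 us granularity stated in that removed comment and an illustrative ceiling in place of ICE_ITR_MAX:

#include <stdio.h>

#define ITR_GRAN_US     2u      /* hardware granularity, per the "2us" note above */
#define ITR_MAX_US      8160u   /* illustrative ceiling standing in for ICE_ITR_MAX */

/* Round a requested throttle value down to what the register can hold. */
static unsigned int itr_reg_align(unsigned int usecs)
{
        return usecs - (usecs % ITR_GRAN_US);
}

static int set_itr(unsigned int requested, unsigned int *programmed)
{
        if (requested > ITR_MAX_US)
                return -1;      /* the driver rejects this with -EINVAL */

        if (requested % ITR_GRAN_US)
                printf("odd value %u rounded down to %u\n",
                       requested, itr_reg_align(requested));

        *programmed = itr_reg_align(requested);
        return 0;
}

int main(void)
{
        unsigned int itr;

        if (set_itr(25, &itr) == 0)     /* e.g. ethtool -C <dev> rx-usecs 25 */
                printf("ITR programmed to %u us\n", itr);
        return 0;
}
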
+ */ +static int +ice_repr_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + struct ice_ring *rx_ring; + int v_idx; + +#ifndef ETHTOOL_COALESCE_USECS + if (ice_repr_is_coalesce_param_invalid(ec)) + return -EOPNOTSUPP; +#endif + + if (ec->rx_coalesce_usecs_high > ICE_MAX_INTRL || + (ec->rx_coalesce_usecs_high && + ec->rx_coalesce_usecs_high < pf->hw.intrl_gran)) { + netdev_info(vsi->netdev, "Invalid value, rx-usecs-high valid values are 0 (disabled), %d-%d\n", + pf->hw.intrl_gran, ICE_MAX_INTRL); + return -EINVAL; + } + + ice_for_each_q_vector(vsi, v_idx) { + rx_ring = vsi->rx_rings[v_idx]; + ice_write_intrl(rx_ring->q_vector, ec->rx_coalesce_usecs_high); + rx_ring->q_vector->intrl = ec->rx_coalesce_usecs_high; + } + + return 0; +} + +/** + * ice_repr_get_coalesce - get coalesce settings + * @netdev: pointer to the netdev associated with this query + * @ec: ethtool structure to read the requested coalesce settings + * + * Since all queues have the same Rx coalesce high settings, + * read the value from te first queue. + * + * Return 0 on success, negative otherwise. + */ +static int +ice_repr_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + + if (!vsi->rx_rings || !vsi->rx_rings[0]->q_vector->rx.ring) + return -EINVAL; + + ec->rx_coalesce_usecs_high = vsi->rx_rings[0]->q_vector->intrl; + + return 0; +} + +#ifdef ETHTOOL_GMODULEINFO +#define ICE_I2C_EEPROM_DEV_ADDR 0xA0 +#define ICE_I2C_EEPROM_DEV_ADDR2 0xA2 +#define ICE_MODULE_TYPE_SFP 0x03 +#define ICE_MODULE_TYPE_QSFP_PLUS 0x0D +#define ICE_MODULE_TYPE_QSFP28 0x11 +#define ICE_MODULE_SFF_ADDR_MODE 0x04 +#define ICE_MODULE_SFF_DIAG_CAPAB 0x40 +#define ICE_MODULE_REVISION_ADDR 0x01 +#define ICE_MODULE_SFF_8472_COMP 0x5E +#define ICE_MODULE_SFF_8472_SWAP 0x5C +#define ICE_MODULE_QSFP_MAX_LEN 640 + +/** + * ice_get_module_info - get SFF module type and revision information + * @netdev: network interface device structure + * @modinfo: module EEPROM size and layout information structure + */ +static int +ice_get_module_info(struct net_device *netdev, + struct ethtool_modinfo *modinfo) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u8 sff8472_comp = 0; + u8 sff8472_swap = 0; + u8 sff8636_rev = 0; + u8 value = 0; + + status = ice_aq_sff_eeprom(hw, 0, ICE_I2C_EEPROM_DEV_ADDR, 0x00, 0x00, + 0, &value, 1, 0, NULL); + if (status) + return -EIO; + + switch (value) { + case ICE_MODULE_TYPE_SFP: + status = ice_aq_sff_eeprom(hw, 0, ICE_I2C_EEPROM_DEV_ADDR, + ICE_MODULE_SFF_8472_COMP, 0x00, 0, + &sff8472_comp, 1, 0, NULL); + if (status) + return -EIO; + status = ice_aq_sff_eeprom(hw, 0, ICE_I2C_EEPROM_DEV_ADDR, + ICE_MODULE_SFF_8472_SWAP, 0x00, 0, + &sff8472_swap, 1, 0, NULL); + if (status) + return -EIO; + + if (sff8472_swap & ICE_MODULE_SFF_ADDR_MODE) { + modinfo->type = ETH_MODULE_SFF_8079; + modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; + } else if (sff8472_comp && + (sff8472_swap & ICE_MODULE_SFF_DIAG_CAPAB)) { + modinfo->type = ETH_MODULE_SFF_8472; + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8079; + modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; + } + break; + case ICE_MODULE_TYPE_QSFP_PLUS: + case ICE_MODULE_TYPE_QSFP28: 
+ status = ice_aq_sff_eeprom(hw, 0, ICE_I2C_EEPROM_DEV_ADDR, + ICE_MODULE_REVISION_ADDR, 0x00, 0, + &sff8636_rev, 1, 0, NULL); + if (status) + return -EIO; + /* Check revision compliance */ + if (sff8636_rev > 0x02) { + /* Module is SFF-8636 compliant */ + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = ICE_MODULE_QSFP_MAX_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = ICE_MODULE_QSFP_MAX_LEN; + } + break; + default: + netdev_warn(netdev, "SFF Module Type not recognized.\n"); + return -EINVAL; + } + return 0; +} + +/** + * ice_get_module_eeprom - fill buffer with SFF EEPROM contents + * @netdev: network interface device structure + * @ee: EEPROM dump request structure + * @data: buffer to be filled with EEPROM contents + */ +static int +ice_get_module_eeprom(struct net_device *netdev, + struct ethtool_eeprom *ee, u8 *data) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); +#define SFF_READ_BLOCK_SIZE 8 + u8 value[SFF_READ_BLOCK_SIZE] = {0}; + u8 addr = ICE_I2C_EEPROM_DEV_ADDR; + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + bool is_sfp = false; + unsigned int i, j; + u16 offset = 0; + u8 page = 0; + + if (!ee || !ee->len || !data) + return -EINVAL; + + status = ice_aq_sff_eeprom(hw, 0, addr, offset, page, 0, value, 1, 0, + NULL); + if (status) + return -EIO; + + if (value[0] == ICE_MODULE_TYPE_SFP) + is_sfp = true; + + memset(data, 0, ee->len); + for (i = 0; i < ee->len; i += SFF_READ_BLOCK_SIZE) { + offset = i + ee->offset; + page = 0; + + /* Check if we need to access the other memory page */ + if (is_sfp) { + if (offset >= ETH_MODULE_SFF_8079_LEN) { + offset -= ETH_MODULE_SFF_8079_LEN; + addr = ICE_I2C_EEPROM_DEV_ADDR2; + } + } else { + while (offset >= ETH_MODULE_SFF_8436_LEN) { + /* Compute memory page number and offset. */ + offset -= ETH_MODULE_SFF_8436_LEN / 2; + page++; + } + } + + /* Bit 2 of eeprom address 0x02 declares upper + * pages are disabled on QSFP modules. + * SFP modules only ever use page 0. + */ + if (page == 0 || !(data[0x2] & 0x4)) { + /* If i2c bus is busy due to slow page change or + * link management access, call can fail. This is normal. + * So we retry this a few times. 
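
The read loop above translates the flat offset ethtool passes in into what the firmware read command needs: SFP modules switch from the A0h device to the A2h diagnostics device past the first 256 bytes, while QSFP modules stay on A0h and map each further 128-byte chunk to a higher upper page. A standalone sketch of that address/page arithmetic; struct eeprom_loc and locate() are illustrative names, not driver code.

#include <stdio.h>
#include <stdbool.h>

#define SFF8079_LEN     256u    /* ETH_MODULE_SFF_8079_LEN: SFP A0h space */
#define SFF8436_LEN     256u    /* ETH_MODULE_SFF_8436_LEN: QSFP lower + upper page 0 */
#define I2C_ADDR_A0     0xA0u
#define I2C_ADDR_A2     0xA2u

struct eeprom_loc {
        unsigned int i2c_addr;  /* device address on the module's I2C bus */
        unsigned int page;      /* upper page number (QSFP only) */
        unsigned int offset;    /* offset handed to the read command */
};

/* Translate a flat ethtool EEPROM offset into an address/page/offset triple. */
static struct eeprom_loc locate(unsigned int flat, bool is_sfp)
{
        struct eeprom_loc loc = { .i2c_addr = I2C_ADDR_A0, .page = 0, .offset = flat };

        if (is_sfp) {
                /* SFP: bytes 256..511 live on the A2h diagnostics device */
                if (loc.offset >= SFF8079_LEN) {
                        loc.offset -= SFF8079_LEN;
                        loc.i2c_addr = I2C_ADDR_A2;
                }
        } else {
                /* QSFP: upper pages are 128 bytes each, mapped at 128..255 */
                while (loc.offset >= SFF8436_LEN) {
                        loc.offset -= SFF8436_LEN / 2;
                        loc.page++;
                }
        }
        return loc;
}

int main(void)
{
        struct eeprom_loc loc = locate(384, false);

        printf("flat offset 384 -> addr 0x%02X, page %u, offset %u\n",
               loc.i2c_addr, loc.page, loc.offset);
        return 0;
}
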
+ */ + for (j = 0; j < 4; j++) { + status = ice_aq_sff_eeprom(hw, 0, addr, offset, page, + !is_sfp, value, + SFF_READ_BLOCK_SIZE, + 0, NULL); + netdev_dbg(netdev, "SFF %02X %02X %02X %X = %02X%02X%02X%02X.%02X%02X%02X%02X (%X)\n", + addr, offset, page, is_sfp, + value[0], value[1], value[2], value[3], + value[4], value[5], value[6], value[7], + status); + if (status) { + usleep_range(1500, 2500); + memset(value, 0, SFF_READ_BLOCK_SIZE); + continue; + } + break; + } + + /* Make sure we have enough room for the new block */ + if ((i + SFF_READ_BLOCK_SIZE) < ee->len) + memcpy(data + i, value, SFF_READ_BLOCK_SIZE); + } + } + return 0; +} +#endif /* ETHTOOL_GMODULEINFO */ static const struct ethtool_ops ice_ethtool_ops = { +#ifdef ETHTOOL_COALESCE_USECS + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_USE_ADAPTIVE | + ETHTOOL_COALESCE_RX_USECS_HIGH, +#endif /* ETHTOOL_COALESCE_USECS */ +#ifdef ETHTOOL_GLINKSETTINGS .get_link_ksettings = ice_get_link_ksettings, .set_link_ksettings = ice_set_link_ksettings, - .get_drvinfo = ice_get_drvinfo, - .get_regs_len = ice_get_regs_len, - .get_regs = ice_get_regs, - .get_msglevel = ice_get_msglevel, - .set_msglevel = ice_set_msglevel, +#else + .get_settings = ice_get_settings, + .set_settings = ice_set_settings, +#endif /* ETHTOOL_GLINKSETTINGS */ + .get_drvinfo = ice_get_drvinfo, + .get_regs_len = ice_get_regs_len, + .get_regs = ice_get_regs, + .get_wol = ice_get_wol, + .set_wol = ice_set_wol, + .get_msglevel = ice_get_msglevel, + .set_msglevel = ice_set_msglevel, .self_test = ice_self_test, .get_link = ethtool_op_get_link, .get_eeprom_len = ice_get_eeprom_len, .get_eeprom = ice_get_eeprom, + .set_eeprom = ice_set_eeprom, .get_coalesce = ice_get_coalesce, .set_coalesce = ice_set_coalesce, .get_strings = ice_get_strings, @@ -3419,38 +6018,68 @@ static const struct ethtool_ops ice_ethtool_ops = { .set_priv_flags = ice_set_priv_flags, .get_sset_count = ice_get_sset_count, .get_rxnfc = ice_get_rxnfc, + .set_rxnfc = ice_set_rxnfc, .get_ringparam = ice_get_ringparam, .set_ringparam = ice_set_ringparam, .nway_reset = ice_nway_reset, .get_pauseparam = ice_get_pauseparam, .set_pauseparam = ice_set_pauseparam, +#if defined(ETHTOOL_GRSSH) && defined(ETHTOOL_SRSSH) .get_rxfh_key_size = ice_get_rxfh_key_size, .get_rxfh_indir_size = ice_get_rxfh_indir_size, .get_rxfh = ice_get_rxfh, .set_rxfh = ice_set_rxfh, - .get_ts_info = ethtool_op_get_ts_info, - .get_per_queue_coalesce = ice_get_per_q_coalesce, - .set_per_queue_coalesce = ice_set_per_q_coalesce, +#endif /* ETHTOOL_GRSSH && ETHTOOL_SRSSH */ + .get_channels = ice_get_channels, + .set_channels = ice_set_channels, + .get_ts_info = ice_get_ts_info, +#ifdef ETHTOOL_PERQUEUE + .get_per_queue_coalesce = ice_get_per_q_coalesce, + .set_per_queue_coalesce = ice_set_per_q_coalesce, +#endif /* ETHTOOL_PERQUEUE */ +#ifdef ETHTOOL_GFECPARAM .get_fecparam = ice_get_fecparam, .set_fecparam = ice_set_fecparam, +#endif /* ETHTOOL_GFECPARAM */ +#ifdef ETHTOOL_GMODULEINFO + .get_module_info = ice_get_module_info, + .get_module_eeprom = ice_get_module_eeprom, +#endif /* ETHTOOL_GMODULEINFO */ +}; + +static const struct ethtool_ops ice_ethtool_recovery_ops = { + .get_drvinfo = ice_get_drvinfo, + .get_eeprom_len = ice_get_eeprom_len, + .get_eeprom = ice_get_eeprom, + .set_eeprom = ice_set_eeprom, }; static const struct ethtool_ops ice_ethtool_safe_mode_ops = { +#ifdef ETHTOOL_GLINKSETTINGS .get_link_ksettings = ice_get_link_ksettings, .set_link_ksettings = ice_set_link_ksettings, +#else + .get_settings = 
ice_get_settings, + .set_settings = ice_set_settings, +#endif /* ETHTOOL_GLINKSETTINGS */ .get_drvinfo = ice_get_drvinfo, .get_regs_len = ice_get_regs_len, .get_regs = ice_get_regs, + .get_wol = ice_get_wol, + .set_wol = ice_set_wol, .get_msglevel = ice_get_msglevel, .set_msglevel = ice_set_msglevel, + .get_link = ethtool_op_get_link, .get_eeprom_len = ice_get_eeprom_len, .get_eeprom = ice_get_eeprom, + .set_eeprom = ice_set_eeprom, .get_strings = ice_get_strings, - .get_ethtool_stats = ice_get_ethtool_stats, + .get_ethtool_stats = ice_get_ethtool_stats, .get_sset_count = ice_get_sset_count, .get_ringparam = ice_get_ringparam, .set_ringparam = ice_set_ringparam, .nway_reset = ice_nway_reset, + .get_channels = ice_get_channels, }; /** @@ -3462,6 +6091,38 @@ void ice_set_ethtool_safe_mode_ops(struct net_device *netdev) netdev->ethtool_ops = &ice_ethtool_safe_mode_ops; } + +static const struct ethtool_ops ice_ethtool_repr_ops = { +#ifdef ETHTOOL_COALESCE_USECS + .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS_HIGH, +#endif + .get_coalesce = ice_repr_get_coalesce, + .set_coalesce = ice_repr_set_coalesce, + .get_drvinfo = ice_repr_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_strings = ice_get_strings, + .get_ethtool_stats = ice_get_ethtool_stats, + .get_sset_count = ice_get_sset_count, +}; + +/** + * ice_set_ethtool_repr_ops - setup VF's port representor ethtool ops + * @netdev: network interface device structure + */ +void ice_set_ethtool_repr_ops(struct net_device *netdev) +{ + netdev->ethtool_ops = &ice_ethtool_repr_ops; +} + +/** + * ice_set_ethtool_recovery_ops - setup FW recovery ethtool ops + * @netdev: network interface device structure + */ +void ice_set_ethtool_recovery_ops(struct net_device *netdev) +{ + netdev->ethtool_ops = &ice_ethtool_recovery_ops; +} + /** * ice_set_ethtool_ops - setup netdev ethtool ops * @netdev: network interface device structure diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.h b/drivers/net/ethernet/intel/ice/ice_ethtool.h new file mode 100644 index 000000000000..3b167694aa18 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.h @@ -0,0 +1,196 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#ifndef _ICE_ETHTOOL_H_ +#define _ICE_ETHTOOL_H_ + +struct ice_stats { + char stat_string[ETH_GSTRING_LEN]; + int sizeof_stat; + int stat_offset; +}; + +#define ICE_STAT(_type, _name, _stat) { \ + .stat_string = _name, \ + .sizeof_stat = sizeof_field(_type, _stat), \ + .stat_offset = offsetof(_type, _stat) \ +} + +#define ICE_VSI_STAT(_name, _stat) \ + ICE_STAT(struct ice_vsi, _name, _stat) +#define ICE_PF_STAT(_name, _stat) \ + ICE_STAT(struct ice_pf, _name, _stat) + +#ifdef UNIFIED_STATS +#define PICK(legacy_stat, unified_stat) unified_stat +#else +#define PICK(legacy_stat, unified_stat) legacy_stat +#endif + +/* VSI stats */ +#define ICE_RX_UNICAST "rx_unicast" +#define ICE_TX_UNICAST "tx_unicast" +#define ICE_RX_MULTICAST "rx_multicast" +#define ICE_TX_MULTICAST "tx_multicast" +#define ICE_RX_BROADCAST "rx_broadcast" +#define ICE_TX_BROADCAST "tx_broadcast" +#define ICE_RX_BYTES "rx_bytes" +#define ICE_TX_BYTES "tx_bytes" +#define ICE_RX_DROPPED PICK("rx_dropped", "rx-dropped_pkts") +#define ICE_RX_UNKNOWN_PROTO PICK("rx_unknown_protocol", "rx-unknown-protocol_pkts") +#define ICE_RX_ALLOC_FAIL PICK("rx_alloc_fail", "rx-buf-alloc-fail_events") +#define ICE_RX_PAGE_ALLOC_FAIL PICK("rx_pg_alloc_fail", "rx-page-alloc-fail_events") +#define ICE_TX_ERRORS "tx_errors" +#define ICE_TX_LINEARIZE PICK("tx_linearized", "tx-linearized_pkts") +#define ICE_TX_BUSY PICK("tx_busy", "tx-busy_events") +#define ICE_TX_RESTART "tx_restart" +#ifdef ICE_ADD_PROBES +#define ICE_RX_PAGE_REUSE "rx_page_reuse" +#endif +#ifdef ADQ_PERF_COUNTERS +#endif /* ADQ_PERF_COUNTERS */ + +/* port stats */ +#define ICE_PORT_RX_BYTES PICK("rx_bytes.nic", "port-rx_bytes") +#define ICE_PORT_TX_BYTES PICK("tx_bytes.nic", "port-tx_bytes") +#define ICE_PORT_RX_UNICAST PICK("rx_unicast.nic", "port-rx-unicast_pkts") +#define ICE_PORT_TX_UNICAST PICK("tx_unicast.nic", "port-tx-unicast_pkts") +#define ICE_PORT_RX_MULTICAST PICK("rx_multicast.nic", "port-rx-multicast_pkts") +#define ICE_PORT_TX_MULTICAST PICK("tx_multicast.nic", "port-tx-multicast_pkts") +#define ICE_PORT_RX_BROADCAST PICK("rx_broadcast.nic", "port-rx-broadcast_pkts") +#define ICE_PORT_TX_BROADCAST PICK("tx_broadcast.nic", "port-tx-broadcast_pkts") +#define ICE_PORT_TX_ERRORS PICK("tx_errors.nic", "port-tx_errors") +#define ICE_PORT_TX_TIMEOUT PICK("tx_timeout.nic", "port-tx-timeout_events") +#define ICE_PORT_RX_SIZE_64 PICK("rx_size_64.nic", "port-rx_size-64_pkts") +#define ICE_PORT_TX_SIZE_64 PICK("tx_size_64.nic", "port-tx_size-64_pkts") +#define ICE_PORT_RX_SIZE_127 PICK("rx_size_127.nic", "port-rx_size-127_pkts") +#define ICE_PORT_TX_SIZE_127 PICK("tx_size_127.nic", "port-tx_size-127_pkts") +#define ICE_PORT_RX_SIZE_255 PICK("rx_size_255.nic", "port-rx_size-255_pkts") +#define ICE_PORT_TX_SIZE_255 PICK("tx_size_255.nic", "port-tx_size-255_pkts") +#define ICE_PORT_RX_SIZE_511 PICK("rx_size_511.nic", "port-rx_size-511_pkts") +#define ICE_PORT_TX_SIZE_511 PICK("tx_size_511.nic", "port-tx_size-511_pkts") +#define ICE_PORT_RX_SIZE_1023 PICK("rx_size_1023.nic", "port-rx_size-1023_pkts") +#define ICE_PORT_TX_SIZE_1023 PICK("tx_size_1023.nic", "port-tx_size-1023_pkts") +#define ICE_PORT_RX_SIZE_1522 PICK("rx_size_1522.nic", "port-rx_size-1522_pkts") +#define ICE_PORT_TX_SIZE_1522 PICK("tx_size_1522.nic", "port-tx_size-1522_pkts") +#define ICE_PORT_RX_SIZE_JUMBO PICK("rx_size_big.nic", "port-rx_size-jumbo_pkts") +#define ICE_PORT_TX_SIZE_JUMBO PICK("tx_size_big.nic", "port-tx_size-jumbo_pkts") +#define ICE_PORT_RX_LINK_XON PICK("link_xon_rx.nic", 
"port-rx-xon_events") +#define ICE_PORT_TX_LINK_XON PICK("link_xon_tx.nic", "port-tx-xon_events") +#define ICE_PORT_RX_LINK_XOFF PICK("link_xoff_rx.nic", "port-rx-xoff_events") +#define ICE_PORT_TX_LINK_XOFF PICK("link_xoff_tx.nic", "port-tx-xoff_events") +#define ICE_PORT_TX_DROP_LINK_DOWN PICK("tx_dropped_link_down.nic", "port-tx-dropped_link-down_pkts") +#define ICE_PORT_RX_UNDERSIZE PICK("rx_undersize.nic", "port-rx-undersized_pkts") +#define ICE_PORT_RX_FRAGMENTS PICK("rx_fragments.nic", "port-rx-fragmented_pkts") +#define ICE_PORT_RX_OVERSIZE PICK("rx_oversize.nic", "port-rx-oversized_pkts") +#define ICE_PORT_RX_JABBER PICK("rx_jabber.nic", "port-rx-jabber_pkts") +#define ICE_PORT_RX_CSUM_BAD PICK("rx_csum_bad.nic", "port-rx-csum_errors") +#define ICE_PORT_RX_LEN_ERRORS PICK("rx_length_errors.nic", "port-rx-length_errors") +#define ICE_PORT_RX_DROPPED PICK("rx_dropped.nic", "port-rx-dropped_pkts") +#define ICE_PORT_RX_CRC_ERRORS PICK("rx_crc_errors.nic", "port-rx-crc_errors") +#define ICE_PORT_ILLEGAL_BYTES PICK("illegal_bytes.nic", "port-rx-illegal_bytes") +#define ICE_PORT_MAC_LOCAL_FAULTS PICK("mac_local_faults.nic", "port-mac-local_faults") +#define ICE_PORT_MAC_REMOTE_FAULTS PICK("mac_remote_faults.nic", "port-mac-remote_faults") +#ifdef ICE_ADD_PROBES +#define ICE_PORT_TX_TCP_SEGMENTS PICK("tx_tcp_segments.nic", "port-tx-tcp-segments_count") +#define ICE_PORT_TX_UDP_SEGMENTS PICK("tx_udp_segments.nic", "port-tx-udp-segments_count") +#define ICE_PORT_RX_TCP_CSO PICK("rx_tcp_cso.nic", "port-rx-tcp-csum-offload_count") +#define ICE_PORT_TX_TCP_CSO PICK("tx_tcp_cso.nic", "port-tx-tcp-csum-offload_count") +#define ICE_PORT_RX_UDP_CSO PICK("rx_udp_cso.nic", "port-rx-udp-csum-offload_count") +#define ICE_PORT_TX_UDP_CSO PICK("tx_udp_cso.nic", "port-tx-udp-csum-offload_count") +#define ICE_PORT_RX_SCTP_CSO PICK("rx_sctp_cso.nic", "port-rx-sctp-csum-offload_count") +#define ICE_PORT_TX_SCTP_CSO PICK("tx_sctp_cso.nic", "port-tx-sctp-csum-offload_count") +#define ICE_PORT_RX_IP4_CSO PICK("rx_ip4_cso.nic", "port-rx-ipv4-csum-offload_count") +#define ICE_PORT_TX_IP4_CSO PICK("tx_ip4_cso.nic", "port-tx-ipv4-csum-offload_count") +#define ICE_PORT_RX_IP4_CSO_ERROR PICK("rx_ip4_cso_error.nic", "port-rx-ipv4-csum_errors") +#define ICE_PORT_RX_TCP_CSO_ERROR PICK("rx_tcp_cso_error.nic", "port-rx-tcp-csum_errors") +#define ICE_PORT_RX_UDP_CSO_ERROR PICK("rx_udp_cso_error.nic", "port-rx-udp-csum_errors") +#define ICE_PORT_RX_SCTP_CSO_ERROR PICK("rx_sctp_cso_error.nic", "port-rx-sctp-csum_errors") +#define ICE_PORT_TX_L3_CSO_ERROR PICK("tx_l3_cso_err.nic", "port-tx-layer-3-csum_errors") +#define ICE_PORT_TX_L4_CSO_ERROR PICK("tx_l4_cso_err.nic", "port-tx-layer-4-csum_errors") +#define ICE_PORT_RX_Q_VLANO PICK("rx_vlano.nic", "port-rx-q-vlan-offload_pkts") +#define ICE_PORT_TX_Q_VLANO PICK("tx_vlano.nic", "port-tx-q-vlan-offload_pkts") +#define ICE_PORT_RX_AD_VLANO PICK("rx_ad_vlano.nic", "port-rx-ad-vlan-offload_pkts") +#define ICE_PORT_TX_AD_VLANO PICK("tx_ad_vlano.nic", "port-tx-ad-vlan-offload_pkts") +#endif /* ICE_ADD_PROBES */ +#define ICE_PORT_FDIR_SB_MATCH PICK("fdir_sb_match.nic", "port-rx-fdir-sideband") +#define ICE_PORT_FDIR_SB_STATUS PICK("fdir_sb_status.nic", "port-rx-fdir-sideband-status") +#ifdef ICE_ADD_PROBES +#define ICE_PORT_ARFS_TCPV4_MATCH PICK("arfs_tcpv4_match.nic", "port-rx-arfs-tcpv4-pkts") +#define ICE_PORT_ARFS_TCPV6_MATCH PICK("arfs_tcpv6_match.nic", "port-rx-arfs-tcpv6-pkts") +#define ICE_PORT_ARFS_UDP4_MATCH PICK("arfs_udpv4_match.nic", "port-rx-arfs-udpv4-pkts") 
+#define ICE_PORT_ARFS_UDP6_MATCH PICK("arfs_udpv6_match.nic", "port-rx-arfs-udpv6-pkts") +#endif /* ICE_ADD_PROBES */ +#define PORT_TX_PRIO_XON PICK("tx_priority_%u_xon.nic", "port-tx-xon_prio-%u_events") +#define PORT_TX_PRIO_XOFF PICK("tx_priority_%u_xoff.nic", "port-tx-xoff_prio-%u_events") +#define PORT_RX_PRIO_XON PICK("rx_priority_%u_xon.nic", "port-rx-xon_prio-%u_events") +#define PORT_RX_PRIO_XOFF PICK("rx_priority_%u_xoff.nic", "port-rx-xoff_prio-%u_events") + +/* per-queue stats */ +#define ICE_TXQ_PACKETS PICK("tx_queue_%u_packets", "tx_q-%u_pkts") +#define ICE_TXQ_BYTES PICK("tx_queue_%u_bytes", "tx_q-%u_bytes") +#define ICE_RXQ_PACKETS PICK("rx_queue_%u_packets", "rx_q-%u_pkts") +#define ICE_RXQ_BYTES PICK("rx_queue_%u_bytes", "rx_q-%u_bytes") +#ifdef ICE_ADD_PROBES +#define ICE_TXQ_NAPI_POLL PICK("tx_queue_%u_napi_poll_cnt", "tx_q-%u_napi_poll_count") +#define ICE_RXQ_NAPI_POLL PICK("rx_queue_%u_napi_poll_cnt", "rx_q-%u_napi_poll_count") +#endif /* ICE_ADD_PROBES */ + +#ifdef HAVE_NETDEV_SB_DEV +#ifdef ICE_ADD_PROBES +/* macvlan stats */ +#define L2_FWD_TX_PKTS1 PICK("l2-fwd-%s-tx_pkts", "tx-l2-forward_q-%s_pkts") +#define L2_FWD_TX_BYTES1 PICK("l2-fwd-%s-tx_bytes", "tx-l2-forward_q-%s_bytes") +#define L2_FWD_TX_PKTS2 PICK("l2-fwd-%i-tx_pkts", "tx-l2-forward_q-%i_pkts") +#define L2_FWD_TX_BYTES2 PICK("l2-fwd-%i-tx_bytes", "tx-l2-forward_q-%i_bytes") +#define L2_FWD_RX_PKTS1 PICK("l2-fwd-%s-rx_pkts", "rx-l2-forward_q-%s_pkts") +#define L2_FWD_RX_BYTES1 PICK("l2-fwd-%s-rx_bytes", "rx-l2-forward_q-%s_bytes") +#define L2_FWD_RX_PKTS2 PICK("l2-fwd-%i-rx_pkts", "rx-l2-forward_q-%i_pkts") +#define L2_FWD_RX_BYTES2 PICK("l2-fwd-%i-rx_bytes", "rx-l2-forward_q-%i_bytes") +#endif /* ICE_ADD_PROBES */ +#endif /* HAVE_NETDEV_SB_DEV */ + +#ifdef ADQ_PERF_COUNTERS +/* ADQ stats */ +#define ICE_TXQ_BUSY_POLL PICK("tx_%u.pkt_busy_poll", "tx_q-%u_pkt_busy_poll") +#define ICE_TXQ_NOT_BUSY_POLL PICK("tx_%u.pkt_not_busy_poll", "tx_q-%u_pkt_not_busy_poll") +#define ICE_TXQ_ATR_SETUP PICK("tx_%u.atr_setup", "tx_q-%u_atr_setup") +#define ICE_TXQ_MARK_ATR_SETUP PICK("tx_%u.mark_atr_setup", "tx_q-%u_mark_atr_setup") +#define ICE_TXQ_ATR_TEARDOWN PICK("tx_%u.atr_teardown", "tx_q-%u_atr_teardown") +#define ICE_TXQ_ATR_BAIL PICK("tx_%u.atr_bailouts", "tx_q-%u_atr_bailouts") +#define ICE_RXQ_BUSY_POLL PICK("rx_%u.pkt_busy_poll", "rx_q-%u_pkt_busy_poll") +#define ICE_RXQ_NOT_BUSY_POLL PICK("rx_%u.pkt_not_busy_poll", "rx_q-%u_pkt_not_busy_poll") +#define ICE_RXQ_SET PICK("rx_%u.queue_set", "rx_q-%u_queue_set") +#define ICE_RXQ_BAIL PICK("rx_%u.queue_bailouts", "rx_q-%u_queue_bailouts") +#define ICE_RXQ_TCP_CTRL_PKTS PICK("rx_%u.tcp_ctrl_pkts", "rx_q-%u_tcp_ctrl_pkts") +#define ICE_RXQ_ONLY_CTRL_PKTS PICK("rx_%u.only_ctrl_pkts", "rx_q-%u_only_ctrl_pkts") +#define ICE_RXQ_TCP_FIN_RECV PICK("rx_%u.tcp_fin_recv", "rx_q-%u_tcp_fin_recv") +#define ICE_RXQ_TCP_RST_RECV PICK("rx_%u.tcp_rst_recv", "rx_q-%u_tcp_rst_recv") +#define ICE_RXQ_TCP_SYN_RECV PICK("rx_%u.tcp_syn_recv", "rx_q-%u_tcp_syn_recv") +#define ICE_RXQ_BP_NO_DATA_PKT PICK("rx_%u.bp_no_data_pkt", "rx_q-%u_bp_no_data_pkt") +#define ICE_RXQ_IN_BP PICK("rx_%u.in_bp", "rx_q-%u_in_bp") +#define ICE_RXQ_INTR_TO_BP PICK("rx_%u.intr_to_bp", "rx_q-%u_intr_to_bp") +#define ICE_RXQ_BP_TO_BP PICK("rx_%u.bp_to_bp", "rx_q-%u_bp_to_bp") +#define ICE_RXQ_IN_INTR PICK("rx_%u.in_intr", "rx_q-%u_in_intr") +#define ICE_RXQ_BP_TO_INTR PICK("rx_%u.bp_to_intr", "rx_q-%u_bp_to_intr") +#define ICE_RXQ_INTR_TO_INTR PICK("rx_%u.intr_to_intr", "rx_q-%u_intr_to_intr") +#define 
ICE_RXQ_UNLIKELY_CB_TO_BP PICK("rx_%u.unlikely_cb_to_bp", "rx_q-%u_unlikely_cb_to_bp") +#define ICE_RXQ_UCB_ONCE_IN_BP PICK("rx_%u.ucb_once_in_bp", "rx_q-%u_ucb_once_in_bp") +#define ICE_RXQ_INTR_ONCE_IN_BP_FALSE PICK("rx_%u.intr_once_in_bp_false", "rx_q-%u_intr_once_in_bp_false") +#define ICE_RXQ_BP_STOP_NEED_RESCHED PICK("rx_%u.bp_stop_need_resched", "rx_q-%u_bp_stop_need_resched") +#define ICE_RXQ_BP_STOP_TIMEOUT PICK("rx_%u.bp_stop_timeout", "rx_q-%u_bp_stop_timeout") +#define ICE_RXQ_CLEANED_ANY_DATA_PKT PICK("rx_%u.cleaned_any_data_pkt", "rx_q-%u_cleaned_any_data_pkt") +#define ICE_RXQ_NEED_RESCHED_NO_DATA PICK("rx_%u.need_resched_no_data", "rx_q-%u_need_resched_no_data") +#define ICE_RXQ_TIMEOUT_NO_DATA PICK("rx_%u.timeout_no_data", "rx_q-%u_timeout_no_data") +#define ICE_RXQ_SW_INTR_TIMEOUT PICK("rx_%u.sw_intr_timeout", "rx_q-%u_sw_intr_timeout") +#define ICE_RXQ_SW_INTR_SERV_TASK PICK("rx_%u.sw_intr_service_task", "rx_q-%u_sw_intr_service_task") +#define ICE_RXQ_NO_SW_INTR_OPT_OFF PICK("rx_%u.no_sw_intr_opt_off", "rx_q-%u_no_sw_intr_opt_off") +#define ICE_RXQ_WB_ON_ITR_SET PICK("rx_%u.wb_on_itr_set", "rx_q-%u_wb_on_itr_set") +#define ICE_RXQ_PKTS_BP_STOP_BUDGET8 PICK("rx_%u.pkts_bp_stop_budget8", "rx_q-%u_pkts_bp_stop_budget8") +#define ICE_RXQ_PKTS_BP_STOP_BUDGET64 PICK("rx_%u.pkts_bp_stop_budget64", "rx_q-%u_pkts_bp_stop_budget64") +#define ICE_RXQ_BP_WD_EQUAL_BUDGET8 PICK("rx_%u.bp_wd_equal_budget8", "rx_q-%u_bp_wd_equal_budget8") +#define ICE_RXQ_BP_WD_EQUAL_BUDGET64 PICK("rx_%u.bp_wd_equal_budget64", "rx_q-%u_bp_wd_equal_b") +#define ICE_RXQ_KEEP_STATE_BP_BUDGET8 PICK("rx_%u.keep_state_bp_budget8", "rx_q-%u_keep_state_bp_budget8") +#define ICE_RXQ_KEEP_STATE_BP_BUDGET64 PICK("rx_%u.keep_state_bp_budget64", "rx_q-%u_keep_state_bp_budget64") +#endif /* ADQ_PERF_COUNTERS */ +#endif /* !_ICE_ETHTOOL_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c b/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c new file mode 100644 index 000000000000..87d09c332eac --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c @@ -0,0 +1,2180 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +/* flow director ethtool support for ice */ + +#include "ice.h" +#include "ice_lib.h" +#include "ice_fdir.h" +#include "ice_flow.h" + +#ifdef HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC +static struct in6_addr full_ipv6_addr_mask = { + .in6_u = { + .u6_addr8 = { + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + } + } +}; + +static struct in6_addr zero_ipv6_addr_mask = { + .in6_u = { + .u6_addr8 = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + } + } +}; +#endif + +/* calls to ice_flow_add_prof require the number of segments in the array + * for segs_cnt. In this code that is one more than the index. + */ +#define TNL_SEG_CNT(_TNL_) ((_TNL_) + 1) + +/** + * ice_fltr_to_ethtool_flow - convert filter type values to ethtool + * flow type values + * @flow: filter type to be converted + * + * Returns the corresponding ethtool flow type. 
+ */ +static int ice_fltr_to_ethtool_flow(enum ice_fltr_ptype flow) +{ + switch (flow) { + case ICE_FLTR_PTYPE_NONF_IPV4_TCP: + return TCP_V4_FLOW; + case ICE_FLTR_PTYPE_NONF_IPV4_UDP: + return UDP_V4_FLOW; + case ICE_FLTR_PTYPE_NONF_IPV4_SCTP: + return SCTP_V4_FLOW; + case ICE_FLTR_PTYPE_NONF_IPV4_OTHER: + return IPV4_USER_FLOW; + case ICE_FLTR_PTYPE_NONF_IPV6_TCP: + return TCP_V6_FLOW; + case ICE_FLTR_PTYPE_NONF_IPV6_UDP: + return UDP_V6_FLOW; + case ICE_FLTR_PTYPE_NONF_IPV6_SCTP: + return SCTP_V6_FLOW; +#ifdef HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC + case ICE_FLTR_PTYPE_NONF_IPV6_OTHER: + return IPV6_USER_FLOW; +#endif + default: + /* 0 is undefined ethtool flow */ + return 0; + } +} + +/** + * ice_ethtool_flow_to_fltr - convert ethtool flow type to filter enum + * @eth: Ethtool flow type to be converted + * + * Returns flow enum + */ +enum ice_fltr_ptype ice_ethtool_flow_to_fltr(int eth) +{ + switch (eth) { + case TCP_V4_FLOW: + return ICE_FLTR_PTYPE_NONF_IPV4_TCP; + case UDP_V4_FLOW: + return ICE_FLTR_PTYPE_NONF_IPV4_UDP; + case SCTP_V4_FLOW: + return ICE_FLTR_PTYPE_NONF_IPV4_SCTP; + case IPV4_USER_FLOW: + return ICE_FLTR_PTYPE_NONF_IPV4_OTHER; + case TCP_V6_FLOW: + return ICE_FLTR_PTYPE_NONF_IPV6_TCP; + case UDP_V6_FLOW: + return ICE_FLTR_PTYPE_NONF_IPV6_UDP; + case SCTP_V6_FLOW: + return ICE_FLTR_PTYPE_NONF_IPV6_SCTP; +#ifdef HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC + case IPV6_USER_FLOW: + return ICE_FLTR_PTYPE_NONF_IPV6_OTHER; +#endif + default: + return ICE_FLTR_PTYPE_NONF_NONE; + } +} + +/** + * ice_is_mask_valid - check mask field set + * @mask: full mask to check + * @field: field for which mask should be valid + * + * If the mask is fully set return true. If it is not valid for field return + * false. + */ +static bool ice_is_mask_valid(u64 mask, u64 field) +{ + return (mask & field) == field; +} + +/** + * ice_get_ethtool_fdir_entry - fill ethtool structure with fdir filter data + * @hw: hardware structure that contains filter list + * @cmd: ethtool command data structure to receive the filter data + * + * Returns 0 on success and -EINVAL on failure + */ +int ice_get_ethtool_fdir_entry(struct ice_hw *hw, struct ethtool_rxnfc *cmd) +{ + struct ethtool_rx_flow_spec *fsp; + struct ice_fdir_fltr *rule; + int ret = 0; + u16 idx; + + fsp = (struct ethtool_rx_flow_spec *)&cmd->fs; + + mutex_lock(&hw->fdir_fltr_lock); + + rule = ice_fdir_find_fltr_by_idx(hw, fsp->location); + + if (!rule || fsp->location != rule->fltr_id) { + ret = -EINVAL; + goto release_lock; + } + + fsp->flow_type = ice_fltr_to_ethtool_flow(rule->flow_type); + + memset(&fsp->m_u, 0, sizeof(fsp->m_u)); + memset(&fsp->m_ext, 0, sizeof(fsp->m_ext)); + + switch (fsp->flow_type) { + case IPV4_USER_FLOW: + fsp->h_u.usr_ip4_spec.ip_ver = ETH_RX_NFC_IP4; + fsp->h_u.usr_ip4_spec.proto = 0; + fsp->h_u.usr_ip4_spec.l4_4_bytes = rule->ip.v4.l4_header; + fsp->h_u.usr_ip4_spec.tos = rule->ip.v4.tos; + fsp->h_u.usr_ip4_spec.ip4src = rule->ip.v4.src_ip; + fsp->h_u.usr_ip4_spec.ip4dst = rule->ip.v4.dst_ip; + fsp->m_u.usr_ip4_spec.ip4src = rule->mask.v4.src_ip; + fsp->m_u.usr_ip4_spec.ip4dst = rule->mask.v4.dst_ip; + fsp->m_u.usr_ip4_spec.ip_ver = 0xFF; + fsp->m_u.usr_ip4_spec.proto = 0; + fsp->m_u.usr_ip4_spec.l4_4_bytes = rule->mask.v4.l4_header; + fsp->m_u.usr_ip4_spec.tos = rule->mask.v4.tos; + break; + case TCP_V4_FLOW: + case UDP_V4_FLOW: + case SCTP_V4_FLOW: + fsp->h_u.tcp_ip4_spec.psrc = rule->ip.v4.src_port; + fsp->h_u.tcp_ip4_spec.pdst = rule->ip.v4.dst_port; + fsp->h_u.tcp_ip4_spec.ip4src = rule->ip.v4.src_ip; + 
fsp->h_u.tcp_ip4_spec.ip4dst = rule->ip.v4.dst_ip; + fsp->m_u.tcp_ip4_spec.psrc = rule->mask.v4.src_port; + fsp->m_u.tcp_ip4_spec.pdst = rule->mask.v4.dst_port; + fsp->m_u.tcp_ip4_spec.ip4src = rule->mask.v4.src_ip; + fsp->m_u.tcp_ip4_spec.ip4dst = rule->mask.v4.dst_ip; + break; +#ifdef HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC + case IPV6_USER_FLOW: + fsp->h_u.usr_ip6_spec.l4_4_bytes = rule->ip.v6.l4_header; + fsp->h_u.usr_ip6_spec.tclass = rule->ip.v6.tc; + fsp->h_u.usr_ip6_spec.l4_proto = rule->ip.v6.proto; + memcpy(fsp->h_u.tcp_ip6_spec.ip6src, rule->ip.v6.src_ip, + sizeof(struct in6_addr)); + memcpy(fsp->h_u.tcp_ip6_spec.ip6dst, rule->ip.v6.dst_ip, + sizeof(struct in6_addr)); + memcpy(fsp->m_u.tcp_ip6_spec.ip6src, rule->mask.v6.src_ip, + sizeof(struct in6_addr)); + memcpy(fsp->m_u.tcp_ip6_spec.ip6dst, rule->mask.v6.dst_ip, + sizeof(struct in6_addr)); + fsp->m_u.usr_ip6_spec.l4_4_bytes = rule->mask.v6.l4_header; + fsp->m_u.usr_ip6_spec.tclass = rule->mask.v6.tc; + fsp->m_u.usr_ip6_spec.l4_proto = rule->mask.v6.proto; + break; + case TCP_V6_FLOW: + case UDP_V6_FLOW: + case SCTP_V6_FLOW: + memcpy(fsp->h_u.tcp_ip6_spec.ip6src, rule->ip.v6.src_ip, + sizeof(struct in6_addr)); + memcpy(fsp->h_u.tcp_ip6_spec.ip6dst, rule->ip.v6.dst_ip, + sizeof(struct in6_addr)); + fsp->h_u.tcp_ip6_spec.psrc = rule->ip.v6.src_port; + fsp->h_u.tcp_ip6_spec.pdst = rule->ip.v6.dst_port; + memcpy(fsp->m_u.tcp_ip6_spec.ip6src, + rule->mask.v6.src_ip, + sizeof(struct in6_addr)); + memcpy(fsp->m_u.tcp_ip6_spec.ip6dst, + rule->mask.v6.dst_ip, + sizeof(struct in6_addr)); + fsp->m_u.tcp_ip6_spec.psrc = rule->mask.v6.src_port; + fsp->m_u.tcp_ip6_spec.pdst = rule->mask.v6.dst_port; + fsp->h_u.tcp_ip6_spec.tclass = rule->ip.v6.tc; + fsp->m_u.tcp_ip6_spec.tclass = rule->mask.v6.tc; + break; +#endif /* HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC */ + default: + break; + } + + if (rule->dest_ctl == ICE_FLTR_PRGM_DESC_DEST_DROP_PKT) + fsp->ring_cookie = RX_CLS_FLOW_DISC; + else + fsp->ring_cookie = rule->orig_q_index; + + + idx = ice_ethtool_flow_to_fltr(fsp->flow_type); + if (idx == ICE_FLTR_PTYPE_NONF_NONE) { + dev_err(ice_hw_to_dev(hw), "Missing input index for flow_type %d\n", + rule->flow_type); + ret = -EINVAL; + } + +release_lock: + mutex_unlock(&hw->fdir_fltr_lock); + return ret; +} + +/** + * ice_ntuple_get_max_fltr_cnt - return the maximum number of allowed filters + * @hw: hardware structure containing filter information + */ +u32 ice_ntuple_get_max_fltr_cnt(struct ice_hw *hw) +{ + int acl_cnt; + + if (hw->dev_caps.num_funcs < 8) + acl_cnt = ICE_AQC_ACL_TCAM_DEPTH / ICE_ACL_ENTIRE_SLICE; + else + acl_cnt = ICE_AQC_ACL_TCAM_DEPTH / ICE_ACL_HALF_SLICE; + + return ice_get_fdir_cnt_all(hw) + acl_cnt; +} + +/** + * ice_get_fdir_fltr_ids - fill buffer with filter IDs of active filters + * @hw: hardware structure containing the filter list + * @cmd: ethtool command data structure + * @rule_locs: ethtool array passed in from OS to receive filter IDs + * + * Returns 0 as expected for success by ethtool + */ +int +ice_get_fdir_fltr_ids(struct ice_hw *hw, struct ethtool_rxnfc *cmd, + u32 *rule_locs) +{ + struct ice_fdir_fltr *f_rule; + unsigned int cnt = 0; + int val = 0; + + /* report max rule count */ + cmd->data = ice_ntuple_get_max_fltr_cnt(hw); + + mutex_lock(&hw->fdir_fltr_lock); + + list_for_each_entry(f_rule, &hw->fdir_list_head, fltr_node) { + if (cnt == cmd->rule_cnt) { + val = -EMSGSIZE; + goto release_lock; + } + rule_locs[cnt] = f_rule->fltr_id; + cnt++; + } + +release_lock: + mutex_unlock(&hw->fdir_fltr_lock); + if (!val) + 
cmd->rule_cnt = cnt; + return val; +} + +/** + * ice_fdir_remap_entries - update the FDir entries in profile + * @prof: FDir structure pointer + * @tun: tunneled or non-tunneled packet + * @idx: FDir entry index + */ +static void +ice_fdir_remap_entries(struct ice_fd_hw_prof *prof, int tun, int idx) +{ + if (idx != prof->cnt && tun < ICE_FD_HW_SEG_MAX) { + int i; + + for (i = idx; i < (prof->cnt - 1); i++) { + u64 old_entry_h; + + old_entry_h = prof->entry_h[i + 1][tun]; + prof->entry_h[i][tun] = old_entry_h; + prof->vsi_h[i] = prof->vsi_h[i + 1]; + } + + prof->entry_h[i][tun] = 0; + prof->vsi_h[i] = 0; + } +} + +/** + * ice_fdir_rem_adq_chnl - remove a ADQ channel from HW filter rules + * @hw: hardware structure containing filter list + * @vsi_idx: VSI handle + */ +void ice_fdir_rem_adq_chnl(struct ice_hw *hw, u16 vsi_idx) +{ + enum ice_status status; + int flow; + + if (!hw->fdir_prof) + return; + + for (flow = 0; flow < ICE_FLTR_PTYPE_MAX; flow++) { + struct ice_fd_hw_prof *prof = hw->fdir_prof[flow]; + int tun, i; + + if (!prof) + continue; + + for (tun = 0; tun < ICE_FD_HW_SEG_MAX; tun++) { + u64 prof_id; + + prof_id = flow + tun * ICE_FLTR_PTYPE_MAX; + + for (i = 0; i < prof->cnt; i++) { + if (prof->vsi_h[i] != vsi_idx) + continue; + + prof->entry_h[i][tun] = 0; + prof->vsi_h[i] = 0; + break; + } + + /* after clearing FDir entries update the remaining */ + ice_fdir_remap_entries(prof, tun, i); + + /* find flow profile corresponding to prof_id and clear + * vsi_idx from bitmap. + */ + status = ice_flow_rem_vsi_prof(hw, ICE_BLK_FD, vsi_idx, prof_id); + if (status) { + dev_err(ice_hw_to_dev(hw), + "ice_flow_rem_vsi_prof() failed status=%d\n", + status); + } + } + prof->cnt--; + } +} + +/** + * ice_fdir_get_hw_prof - return the ice_fd_hw_proc associated with a flow + * @hw: hardware structure containing the filter list + * @blk: hardware block + * @flow: FDir flow type to release + */ +static struct ice_fd_hw_prof * +ice_fdir_get_hw_prof(struct ice_hw *hw, enum ice_block blk, int flow) +{ + if (blk == ICE_BLK_ACL && hw->acl_prof) + return hw->acl_prof[flow]; + + if (blk == ICE_BLK_FD && hw->fdir_prof) + return hw->fdir_prof[flow]; + + return NULL; +} + +/** + * ice_fdir_erase_flow_from_hw - remove a flow from the HW profile tables + * @hw: hardware structure containing the filter list + * @blk: hardware block + * @flow: FDir flow type to release + */ +static void +ice_fdir_erase_flow_from_hw(struct ice_hw *hw, enum ice_block blk, int flow) +{ + struct ice_fd_hw_prof *prof = ice_fdir_get_hw_prof(hw, blk, flow); + int tun; + + if (!prof) + return; + + for (tun = 0; tun < ICE_FD_HW_SEG_MAX; tun++) { + u64 prof_id; + int j; + + prof_id = flow + tun * ICE_FLTR_PTYPE_MAX; + + for (j = 0; j < prof->cnt; j++) { + u16 vsi_num; + + vsi_num = ice_get_hw_vsi_num(hw, prof->vsi_h[j]); + + if (!prof->entry_h[j][tun] || !prof->vsi_h[j]) + continue; + + ice_rem_prof_id_flow(hw, blk, vsi_num, prof_id); + prof->entry_h[j][tun] = 0; + } + + ice_flow_rem_prof(hw, blk, prof_id); + } +} + +/** + * ice_fdir_rem_flow - release the ice_flow structures for a filter type + * @hw: hardware structure containing the filter list + * @blk: hardware block + * @flow_type: FDir flow type to release + */ +static void +ice_fdir_rem_flow(struct ice_hw *hw, enum ice_block blk, + enum ice_fltr_ptype flow_type) +{ + int flow = (int)flow_type & ~FLOW_EXT; + struct ice_fd_hw_prof *prof; + int tun, i; + + prof = ice_fdir_get_hw_prof(hw, blk, flow); + if (!prof) + return; + + ice_fdir_erase_flow_from_hw(hw, blk, flow); + for (i = 0; 
i < prof->cnt; i++) + prof->vsi_h[i] = 0; + for (tun = 0; tun < ICE_FD_HW_SEG_MAX; tun++) { + if (!prof->fdir_seg[tun]) + continue; + devm_kfree(ice_hw_to_dev(hw), prof->fdir_seg[tun]); + prof->fdir_seg[tun] = NULL; + } + prof->cnt = 0; +} + +/** + * ice_fdir_release_flows - release all flows in use for later replay + * @hw: pointer to HW instance + */ +void ice_fdir_release_flows(struct ice_hw *hw) +{ + int flow; + + /* release Flow Director HW table entries */ + for (flow = 0; flow < ICE_FLTR_PTYPE_MAX; flow++) + ice_fdir_erase_flow_from_hw(hw, ICE_BLK_FD, flow); +} + +/** + * ice_fdir_replay_flows - replay HW Flow Director filter info + * @hw: pointer to HW instance + */ +void ice_fdir_replay_flows(struct ice_hw *hw) +{ + int flow; + + for (flow = 0; flow < ICE_FLTR_PTYPE_MAX; flow++) { + int tun; + + if (!hw->fdir_prof[flow] || !hw->fdir_prof[flow]->cnt) + continue; + for (tun = 0; tun < ICE_FD_HW_SEG_MAX; tun++) { + struct ice_flow_prof *hw_prof; + struct ice_fd_hw_prof *prof; + u64 prof_id; + int j; + + prof = hw->fdir_prof[flow]; + prof_id = flow + tun * ICE_FLTR_PTYPE_MAX; + ice_flow_add_prof(hw, ICE_BLK_FD, ICE_FLOW_RX, prof_id, + prof->fdir_seg[tun], TNL_SEG_CNT(tun), + NULL, 0, &hw_prof); + for (j = 0; j < prof->cnt; j++) { + enum ice_flow_priority prio; + u64 entry_h = 0; + int err; + + prio = ICE_FLOW_PRIO_NORMAL; + err = ice_flow_add_entry(hw, ICE_BLK_FD, + prof_id, + prof->vsi_h[0], + prof->vsi_h[j], + prio, prof->fdir_seg, + NULL, 0, &entry_h); + if (err) { + dev_err(ice_hw_to_dev(hw), "Could not replay Flow Director, flow type %d\n", + flow); + continue; + } + prof->entry_h[j][tun] = entry_h; + } + } + } +} + +/** + * ice_parse_rx_flow_user_data - deconstruct user-defined data + * @fsp: pointer to ethtool Rx flow specification + * @data: pointer to userdef data structure for storage + * + * Returns 0 on success, negative error value on failure + */ +static int +ice_parse_rx_flow_user_data(struct ethtool_rx_flow_spec *fsp, + struct ice_rx_flow_userdef *data) +{ + u64 value, mask; + + memset(data, 0, sizeof(*data)); + if (!(fsp->flow_type & FLOW_EXT)) + return 0; + + value = be64_to_cpu(*((__force __be64 *)fsp->h_ext.data)); + mask = be64_to_cpu(*((__force __be64 *)fsp->m_ext.data)); + if (!mask) + return 0; + +#define ICE_USERDEF_FLEX_WORD_M GENMASK_ULL(15, 0) +#define ICE_USERDEF_FLEX_OFFS_S 16 +#define ICE_USERDEF_FLEX_OFFS_M GENMASK_ULL(31, ICE_USERDEF_FLEX_OFFS_S) +#define ICE_USERDEF_FLEX_FLTR_M GENMASK_ULL(31, 0) + + /* 0x1fe is the maximum value for offsets stored in the internal + * filtering tables. + */ +#define ICE_USERDEF_FLEX_MAX_OFFS_VAL 0x1fe + + if (!ice_is_mask_valid(mask, ICE_USERDEF_FLEX_FLTR_M) || + value > ICE_USERDEF_FLEX_FLTR_M) + return -EINVAL; + + data->flex_word = value & ICE_USERDEF_FLEX_WORD_M; + data->flex_offset = (value & ICE_USERDEF_FLEX_OFFS_M) >> + ICE_USERDEF_FLEX_OFFS_S; + if (data->flex_offset > ICE_USERDEF_FLEX_MAX_OFFS_VAL) + return -EINVAL; + + data->flex_fltr = true; + + return 0; +} + +/** + * ice_fdir_num_avail_fltr - return the number of unused flow director filters + * @hw: pointer to hardware structure + * @vsi: software VSI structure + * + * There are 2 filter pools: guaranteed and best effort(shared). Each VSI can + * use filters from either pool. The guaranteed pool is divided between VSIs. + * The best effort filter pool is common to all VSIs and is a device shared + * resource pool. 
The number of filters available to this VSI is the sum of + * the VSIs guaranteed filter pool and the global available best effort + * filter pool. + * + * Returns the number of available flow director filters to this VSI + */ +static int ice_fdir_num_avail_fltr(struct ice_hw *hw, struct ice_vsi *vsi) +{ + u16 vsi_num = ice_get_hw_vsi_num(hw, vsi->idx); + u16 num_guar; + u16 num_be; + + /* total guaranteed filters assigned to this VSI */ + num_guar = vsi->num_gfltr; + + /* minus the guaranteed filters programed by this VSI */ + num_guar -= (rd32(hw, VSIQF_FD_CNT(vsi_num)) & + VSIQF_FD_CNT_FD_GCNT_M) >> VSIQF_FD_CNT_FD_GCNT_S; + + /* total global best effort filters */ + num_be = hw->func_caps.fd_fltr_best_effort; + + /* minus the global best effort filters programmed */ + num_be -= (rd32(hw, GLQF_FD_CNT) & GLQF_FD_CNT_FD_BCNT_M) >> + GLQF_FD_CNT_FD_BCNT_S; + + return num_guar + num_be; +} + +/** + * ice_fdir_alloc_flow_prof - allocate FDir flow profile structure(s) + * @hw: HW structure containing the FDir flow profile structure(s) + * @flow: flow type to allocate the flow profile for + * + * Allocate the fdir_prof and fdir_prof[flow] if not already created. Return 0 + * on success and negative on error. + */ +static int +ice_fdir_alloc_flow_prof(struct ice_hw *hw, enum ice_fltr_ptype flow) +{ + if (!hw) + return -EINVAL; + + if (!hw->fdir_prof) { + hw->fdir_prof = devm_kcalloc(ice_hw_to_dev(hw), + ICE_FLTR_PTYPE_MAX, + sizeof(*hw->fdir_prof), + GFP_KERNEL); + if (!hw->fdir_prof) + return -ENOMEM; + } + + if (!hw->fdir_prof[flow]) { + hw->fdir_prof[flow] = devm_kzalloc(ice_hw_to_dev(hw), + sizeof(**hw->fdir_prof), + GFP_KERNEL); + if (!hw->fdir_prof[flow]) + return -ENOMEM; + } + + return 0; +} + +#ifdef NETIF_F_HW_TC +/** + * ice_fdir_prof_vsi_idx - find or insert a vsi_idx in structure + * @prof: pointer to flow director HW profile + * @vsi_idx: vsi_idx to locate + * + * return the index of the vsi_idx. if vsi_idx is not found insert it + * into the vsi_h table. + */ +static u16 +ice_fdir_prof_vsi_idx(struct ice_fd_hw_prof *prof, int vsi_idx) +{ + u16 idx = 0; + + for (idx = 0; idx < prof->cnt; idx++) + if (prof->vsi_h[idx] == vsi_idx) + return idx; + + if (idx == prof->cnt) + prof->vsi_h[prof->cnt++] = vsi_idx; + return idx; +} +#endif /* NETIF_F_HW_TC */ + +/** + * ice_fdir_set_hw_fltr_rule - Configure HW tables to generate a FDir rule + * @pf: pointer to the PF structure + * @seg: protocol header description pointer + * @flow: filter enum + * @tun: FDir segment to program + */ +static int +ice_fdir_set_hw_fltr_rule(struct ice_pf *pf, struct ice_flow_seg_info *seg, + enum ice_fltr_ptype flow, enum ice_fd_hw_seg tun) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_vsi *main_vsi, *ctrl_vsi; + struct ice_flow_seg_info *old_seg; + struct ice_flow_prof *prof = NULL; + struct ice_fd_hw_prof *hw_prof; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u64 entry1_h = 0; + u64 entry2_h = 0; +#ifdef NETIF_F_HW_TC + bool del_last; +#endif /* NETIF_F_HW_TC */ + u64 prof_id; + int err; +#ifdef NETIF_F_HW_TC + int idx; +#endif /* NETIF_F_HW_TC */ + + main_vsi = ice_get_main_vsi(pf); + if (!main_vsi) + return -EINVAL; + + ctrl_vsi = ice_get_ctrl_vsi(pf); + if (!ctrl_vsi) + return -EINVAL; + + err = ice_fdir_alloc_flow_prof(hw, flow); + if (err) + return err; + + hw_prof = hw->fdir_prof[flow]; + old_seg = hw_prof->fdir_seg[tun]; + if (old_seg) { + /* This flow_type already has a changed input set. + * If it matches the requested input set then we are + * done. 
Or, if it's different then it's an error. + */ + if (!memcmp(old_seg, seg, sizeof(*seg))) + return -EEXIST; + + /* if there are FDir filters using this flow, + * then return error. + */ + if (hw->fdir_fltr_cnt[flow]) { + dev_err(dev, "Failed to add filter. Flow director filters on each port must have the same input set.\n"); + return -EINVAL; + } + if (ice_is_arfs_using_perfect_flow(hw, flow)) { + dev_err(dev, "aRFS using perfect flow type %d, cannot change input set\n", + flow); + return -EINVAL; + } + + /* remove HW filter definition */ + ice_fdir_rem_flow(hw, ICE_BLK_FD, flow); + } + + /* Adding a profile, but there is only one header supported. + * That is the final parameters are 1 header (segment), no + * actions (NULL) and zero actions 0. + */ + prof_id = flow + tun * ICE_FLTR_PTYPE_MAX; + status = ice_flow_add_prof(hw, ICE_BLK_FD, ICE_FLOW_RX, prof_id, seg, + TNL_SEG_CNT(tun), NULL, 0, &prof); + if (status) + return ice_status_to_errno(status); + status = ice_flow_add_entry(hw, ICE_BLK_FD, prof_id, main_vsi->idx, + main_vsi->idx, ICE_FLOW_PRIO_NORMAL, + seg, NULL, 0, &entry1_h); + if (status) { + err = ice_status_to_errno(status); + goto err_prof; + } + status = ice_flow_add_entry(hw, ICE_BLK_FD, prof_id, main_vsi->idx, + ctrl_vsi->idx, ICE_FLOW_PRIO_NORMAL, + seg, NULL, 0, &entry2_h); + if (status) { + err = ice_status_to_errno(status); + goto err_entry; + } + + hw_prof->fdir_seg[tun] = seg; + hw_prof->entry_h[0][tun] = entry1_h; + hw_prof->entry_h[1][tun] = entry2_h; + hw_prof->vsi_h[0] = main_vsi->idx; + hw_prof->vsi_h[1] = ctrl_vsi->idx; + if (!hw_prof->cnt) + hw_prof->cnt = 2; + +#ifdef NETIF_F_HW_TC + for (idx = 1; idx < ICE_CHNL_MAX_TC; idx++) { + u16 vsi_idx; + u16 vsi_h; + + if (!ice_is_adq_active(pf) || !main_vsi->tc_map_vsi[idx]) + continue; + + entry1_h = 0; + vsi_h = main_vsi->tc_map_vsi[idx]->idx; + status = ice_flow_add_entry(hw, ICE_BLK_FD, prof_id, + main_vsi->idx, vsi_h, + ICE_FLOW_PRIO_NORMAL, seg, NULL, 0, + &entry1_h); + if (status) { + dev_err(dev, "Could not add Channel VSI %d to flow group\n", + idx); + goto err_unroll; + } + + vsi_idx = ice_fdir_prof_vsi_idx(hw_prof, + main_vsi->tc_map_vsi[idx]->idx); + hw_prof->entry_h[vsi_idx][tun] = entry1_h; + } +#endif /* NETIF_F_HW_TC */ + + return 0; + +#ifdef NETIF_F_HW_TC +err_unroll: + entry1_h = 0; + hw_prof->fdir_seg[tun] = NULL; + + /* The variable del_last will be used to determine when to clean up + * the VSI group data. The VSI data is not needed if there are no + * segments. + */ + del_last = true; + for (idx = 0; idx < ICE_FD_HW_SEG_MAX; idx++) + if (hw_prof->fdir_seg[idx]) { + del_last = false; + break; + } + + for (idx = 0; idx < hw_prof->cnt; idx++) { + u16 vsi_num = ice_get_hw_vsi_num(hw, hw_prof->vsi_h[idx]); + + if (!hw_prof->entry_h[idx][tun]) + continue; + ice_rem_prof_id_flow(hw, ICE_BLK_FD, vsi_num, prof_id); + ice_flow_rem_entry(hw, ICE_BLK_FD, hw_prof->entry_h[idx][tun]); + hw_prof->entry_h[idx][tun] = 0; + if (del_last) + hw_prof->vsi_h[idx] = 0; + } + if (del_last) + hw_prof->cnt = 0; +#endif /* NETIF_F_HW_TC */ +err_entry: + ice_rem_prof_id_flow(hw, ICE_BLK_FD, + ice_get_hw_vsi_num(hw, main_vsi->idx), prof_id); + ice_flow_rem_entry(hw, ICE_BLK_FD, entry1_h); +err_prof: + ice_flow_rem_prof(hw, ICE_BLK_FD, prof_id); +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) + dev_err(dev, "Failed to add filter. Flow director filters must have the same input set as ADQ filters.\n"); + else + dev_err(dev, "Failed to add filter. 
Flow director filters on each port must have the same input set.\n"); +#else /* !NETIF_F_HW_TC */ + dev_err(dev, "Failed to add filter. Flow director filters on each port must have the same input set.\n"); +#endif /* !NETIF_F_HW_TC */ + + return err; +} + +/** + * ice_set_init_fdir_seg + * @seg: flow segment for programming + * @l3_proto: ICE_FLOW_SEG_HDR_IPV4 or ICE_FLOW_SEG_HDR_IPV6 + * @l4_proto: ICE_FLOW_SEG_HDR_TCP or ICE_FLOW_SEG_HDR_UDP + * + * Set the configuration for perfect filters to the provided flow segment for + * programming the HW filter. This is to be called only when initializing + * filters as this function it assumes no filters exist. + */ +static int +ice_set_init_fdir_seg(struct ice_flow_seg_info *seg, + enum ice_flow_seg_hdr l3_proto, + enum ice_flow_seg_hdr l4_proto) +{ + enum ice_flow_field src_addr, dst_addr, src_port, dst_port; + + if (!seg) + return -EINVAL; + + if (l3_proto == ICE_FLOW_SEG_HDR_IPV4) { + src_addr = ICE_FLOW_FIELD_IDX_IPV4_SA; + dst_addr = ICE_FLOW_FIELD_IDX_IPV4_DA; + } else if (l3_proto == ICE_FLOW_SEG_HDR_IPV6) { + src_addr = ICE_FLOW_FIELD_IDX_IPV6_SA; + dst_addr = ICE_FLOW_FIELD_IDX_IPV6_DA; + } else { + return -EINVAL; + } + + if (l4_proto == ICE_FLOW_SEG_HDR_TCP) { + src_port = ICE_FLOW_FIELD_IDX_TCP_SRC_PORT; + dst_port = ICE_FLOW_FIELD_IDX_TCP_DST_PORT; + } else if (l4_proto == ICE_FLOW_SEG_HDR_UDP) { + src_port = ICE_FLOW_FIELD_IDX_UDP_SRC_PORT; + dst_port = ICE_FLOW_FIELD_IDX_UDP_DST_PORT; + } else { + return -EINVAL; + } + + ICE_FLOW_SET_HDRS(seg, l3_proto | l4_proto); + + /* IP source address */ + ice_flow_set_fld(seg, src_addr, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, false); + + /* IP destination address */ + ice_flow_set_fld(seg, dst_addr, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, false); + + /* Layer 4 source port */ + ice_flow_set_fld(seg, src_port, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, false); + + /* Layer 4 destination port */ + ice_flow_set_fld(seg, dst_port, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, false); + + return 0; +} + +/** + * ice_create_init_fdir_rule + * @pf: PF structure + * @flow: filter enum + * + * Return error value or 0 on success. 
+ */ +static int +ice_create_init_fdir_rule(struct ice_pf *pf, enum ice_fltr_ptype flow) +{ + struct ice_flow_seg_info *seg, *tun_seg; + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + int ret; + + /* if there is already a filter rule for kind return -EINVAL */ + if (hw->fdir_prof && hw->fdir_prof[flow] && + hw->fdir_prof[flow]->fdir_seg[0]) + return -EINVAL; + + seg = devm_kzalloc(dev, sizeof(*seg), GFP_KERNEL); + if (!seg) + return -ENOMEM; + + tun_seg = devm_kzalloc(dev, sizeof(*seg) * ICE_FD_HW_SEG_MAX, + GFP_KERNEL); + if (!tun_seg) { + devm_kfree(dev, seg); + return -ENOMEM; + } + + if (flow == ICE_FLTR_PTYPE_NONF_IPV4_TCP) + ret = ice_set_init_fdir_seg(seg, ICE_FLOW_SEG_HDR_IPV4, + ICE_FLOW_SEG_HDR_TCP); + else if (flow == ICE_FLTR_PTYPE_NONF_IPV4_UDP) + ret = ice_set_init_fdir_seg(seg, ICE_FLOW_SEG_HDR_IPV4, + ICE_FLOW_SEG_HDR_UDP); + else if (flow == ICE_FLTR_PTYPE_NONF_IPV6_TCP) + ret = ice_set_init_fdir_seg(seg, ICE_FLOW_SEG_HDR_IPV6, + ICE_FLOW_SEG_HDR_TCP); + else if (flow == ICE_FLTR_PTYPE_NONF_IPV6_UDP) + ret = ice_set_init_fdir_seg(seg, ICE_FLOW_SEG_HDR_IPV6, + ICE_FLOW_SEG_HDR_UDP); + else + ret = -EINVAL; + if (ret) + goto err_exit; + + /* add filter for outer headers */ + ret = ice_fdir_set_hw_fltr_rule(pf, seg, flow, ICE_FD_HW_SEG_NON_TUN); + if (ret) + /* could not write filter, free memory */ + goto err_exit; + + /* make tunneled filter HW entries if possible */ + memcpy(&tun_seg[1], seg, sizeof(*seg)); + ret = ice_fdir_set_hw_fltr_rule(pf, tun_seg, flow, ICE_FD_HW_SEG_TUN); + if (ret) + /* could not write tunnel filter, but outer header filter + * exists + */ + devm_kfree(dev, tun_seg); + + set_bit(flow, hw->fdir_perfect_fltr); + return ret; +err_exit: + devm_kfree(dev, tun_seg); + devm_kfree(dev, seg); + + return -EOPNOTSUPP; +} + +/** + * ice_ntuple_check_ip4_seg - Check valid fields are provided for filter + * @tcp_ip4_spec: mask data from ethtool + */ +int ice_ntuple_check_ip4_seg(struct ethtool_tcpip4_spec *tcp_ip4_spec) +{ + /* make sure we don't have any empty rule */ + if (!tcp_ip4_spec->psrc && !tcp_ip4_spec->ip4src && + !tcp_ip4_spec->pdst && !tcp_ip4_spec->ip4dst) + return -EINVAL; + + /* filtering on TOS not supported */ + if (tcp_ip4_spec->tos) + return -EOPNOTSUPP; + + return 0; +} + +/** + * ice_ntuple_l4_proto_to_port + * @l4_proto: Layer 4 protocol to program + * @src_port: source flow field value for provided l4 protocol + * @dst_port: destination flow field value for provided l4 protocol + * + * Set associated src and dst port for given l4 protocol + */ +int +ice_ntuple_l4_proto_to_port(enum ice_flow_seg_hdr l4_proto, + enum ice_flow_field *src_port, + enum ice_flow_field *dst_port) +{ + if (l4_proto == ICE_FLOW_SEG_HDR_TCP) { + *src_port = ICE_FLOW_FIELD_IDX_TCP_SRC_PORT; + *dst_port = ICE_FLOW_FIELD_IDX_TCP_DST_PORT; + } else if (l4_proto == ICE_FLOW_SEG_HDR_UDP) { + *src_port = ICE_FLOW_FIELD_IDX_UDP_SRC_PORT; + *dst_port = ICE_FLOW_FIELD_IDX_UDP_DST_PORT; + } else if (l4_proto == ICE_FLOW_SEG_HDR_SCTP) { + *src_port = ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT; + *dst_port = ICE_FLOW_FIELD_IDX_SCTP_DST_PORT; + } else { + return -EOPNOTSUPP; + } + + return 0; +} + +/** + * ice_set_fdir_ip4_seg + * @seg: flow segment for programming + * @tcp_ip4_spec: mask data from ethtool + * @l4_proto: Layer 4 protocol to program + * @perfect_fltr: only valid on success; returns true if perfect filter, + * false if not + * + * Set the mask data into the flow segment to be used to program HW + * table based on provided L4 protocol for IPv4 + */ 
+static int +ice_set_fdir_ip4_seg(struct ice_flow_seg_info *seg, + struct ethtool_tcpip4_spec *tcp_ip4_spec, + enum ice_flow_seg_hdr l4_proto, bool *perfect_fltr) +{ + enum ice_flow_field src_port, dst_port; + int ret; + + ret = ice_ntuple_check_ip4_seg(tcp_ip4_spec); + if (ret) + return ret; + + ret = ice_ntuple_l4_proto_to_port(l4_proto, &src_port, &dst_port); + if (ret) + return ret; + + *perfect_fltr = true; + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV4 | l4_proto); + + /* IP source address */ + if (tcp_ip4_spec->ip4src == htonl(0xFFFFFFFF)) + ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV4_SA, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + else if (!tcp_ip4_spec->ip4src) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + /* IP destination address */ + if (tcp_ip4_spec->ip4dst == htonl(0xFFFFFFFF)) + ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV4_DA, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + else if (!tcp_ip4_spec->ip4dst) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + /* Layer 4 source port */ + if (tcp_ip4_spec->psrc == htons(0xFFFF)) + ice_flow_set_fld(seg, src_port, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + false); + else if (!tcp_ip4_spec->psrc) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + /* Layer 4 destination port */ + if (tcp_ip4_spec->pdst == htons(0xFFFF)) + ice_flow_set_fld(seg, dst_port, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + false); + else if (!tcp_ip4_spec->pdst) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + return 0; +} + +/** + * ice_ntuple_check_ip4_usr_seg - Check valid fields are provided for filter + * @usr_ip4_spec: ethtool userdef packet offset + */ +int ice_ntuple_check_ip4_usr_seg(struct ethtool_usrip4_spec *usr_ip4_spec) +{ + /* first 4 bytes of Layer 4 header */ + if (usr_ip4_spec->l4_4_bytes) + return -EINVAL; + if (usr_ip4_spec->tos) + return -EINVAL; + if (usr_ip4_spec->ip_ver) + return -EINVAL; + /* Filtering on Layer 4 protocol not supported */ + if (usr_ip4_spec->proto) + return -EOPNOTSUPP; + /* empty rules are not valid */ + if (!usr_ip4_spec->ip4src && !usr_ip4_spec->ip4dst) + return -EINVAL; + + return 0; +} + +/** + * ice_set_fdir_ip4_usr_seg + * @seg: flow segment for programming + * @usr_ip4_spec: ethtool userdef packet offset + * @perfect_fltr: only set on success; returns true if perfect filter, false if + * not + * + * Set the offset data into the flow segment to be used to program HW + * table for IPv4 + */ +static int +ice_set_fdir_ip4_usr_seg(struct ice_flow_seg_info *seg, + struct ethtool_usrip4_spec *usr_ip4_spec, + bool *perfect_fltr) +{ + int ret; + + ret = ice_ntuple_check_ip4_usr_seg(usr_ip4_spec); + if (ret) + return ret; + + *perfect_fltr = true; + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV4); + + /* IP source address */ + if (usr_ip4_spec->ip4src == htonl(0xFFFFFFFF)) + ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV4_SA, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + else if (!usr_ip4_spec->ip4src) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + /* IP destination address */ + if (usr_ip4_spec->ip4dst == htonl(0xFFFFFFFF)) + ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV4_DA, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + else if (!usr_ip4_spec->ip4dst) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + return 0; +} + +#ifdef 
HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC +/** + * ice_ntuple_check_ip6_seg - Check valid fields are provided for filter + * @tcp_ip6_spec: mask data from ethtool + */ +static int ice_ntuple_check_ip6_seg(struct ethtool_tcpip6_spec *tcp_ip6_spec) +{ + /* make sure we don't have any empty rule */ + if (!memcmp(tcp_ip6_spec->ip6src, &zero_ipv6_addr_mask, + sizeof(struct in6_addr)) && + !memcmp(tcp_ip6_spec->ip6dst, &zero_ipv6_addr_mask, + sizeof(struct in6_addr)) && + !tcp_ip6_spec->psrc && !tcp_ip6_spec->pdst) + return -EINVAL; + + /* filtering on TC not supported */ + if (tcp_ip6_spec->tclass) + return -EOPNOTSUPP; + + return 0; +} + +/** + * ice_set_fdir_ip6_seg + * @seg: flow segment for programming + * @tcp_ip6_spec: mask data from ethtool + * @l4_proto: Layer 4 protocol to program + * @perfect_fltr: only valid on success; returns true if perfect filter, + * false if not + * + * Set the mask data into the flow segment to be used to program HW + * table based on provided L4 protocol for IPv6 + */ +static int +ice_set_fdir_ip6_seg(struct ice_flow_seg_info *seg, + struct ethtool_tcpip6_spec *tcp_ip6_spec, + enum ice_flow_seg_hdr l4_proto, bool *perfect_fltr) +{ + enum ice_flow_field src_port, dst_port; + int ret; + + ret = ice_ntuple_check_ip6_seg(tcp_ip6_spec); + if (ret) + return ret; + + ret = ice_ntuple_l4_proto_to_port(l4_proto, &src_port, &dst_port); + if (ret) + return ret; + + *perfect_fltr = true; + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV6 | l4_proto); + + if (!memcmp(tcp_ip6_spec->ip6src, &full_ipv6_addr_mask, + sizeof(struct in6_addr))) + ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV6_SA, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + else if (!memcmp(tcp_ip6_spec->ip6src, &zero_ipv6_addr_mask, + sizeof(struct in6_addr))) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + if (!memcmp(tcp_ip6_spec->ip6dst, &full_ipv6_addr_mask, + sizeof(struct in6_addr))) + ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV6_DA, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + else if (!memcmp(tcp_ip6_spec->ip6dst, &zero_ipv6_addr_mask, + sizeof(struct in6_addr))) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + /* Layer 4 source port */ + if (tcp_ip6_spec->psrc == htons(0xFFFF)) + ice_flow_set_fld(seg, src_port, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + false); + else if (!tcp_ip6_spec->psrc) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + /* Layer 4 destination port */ + if (tcp_ip6_spec->pdst == htons(0xFFFF)) + ice_flow_set_fld(seg, dst_port, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + false); + else if (!tcp_ip6_spec->pdst) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + return 0; +} + +/** + * ice_ntuple_check_ip6_usr_seg - Check valid fields are provided for filter + * @usr_ip6_spec: ethtool userdef packet offset + */ +static int +ice_ntuple_check_ip6_usr_seg(struct ethtool_usrip6_spec *usr_ip6_spec) +{ + /* filtering on Layer 4 bytes not supported */ + if (usr_ip6_spec->l4_4_bytes) + return -EOPNOTSUPP; + /* filtering on TC not supported */ + if (usr_ip6_spec->tclass) + return -EOPNOTSUPP; + /* filtering on Layer 4 protocol not supported */ + if (usr_ip6_spec->l4_proto) + return -EOPNOTSUPP; + /* empty rules are not valid */ + if (!memcmp(usr_ip6_spec->ip6src, &zero_ipv6_addr_mask, + sizeof(struct in6_addr)) && + !memcmp(usr_ip6_spec->ip6dst, &zero_ipv6_addr_mask, + sizeof(struct in6_addr))) + return -EINVAL; + + return 0; +} 
+ +/** + * ice_set_fdir_ip6_usr_seg + * @seg: flow segment for programming + * @usr_ip6_spec: ethtool userdef packet offset + * @perfect_fltr: only set on success; returns true if perfect filter, false if + * not + * + * Set the offset data into the flow segment to be used to program HW + * table for IPv6 + */ +static int +ice_set_fdir_ip6_usr_seg(struct ice_flow_seg_info *seg, + struct ethtool_usrip6_spec *usr_ip6_spec, + bool *perfect_fltr) +{ + int ret; + + ret = ice_ntuple_check_ip6_usr_seg(usr_ip6_spec); + if (ret) + return ret; + + *perfect_fltr = true; + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV6); + + if (!memcmp(usr_ip6_spec->ip6src, &full_ipv6_addr_mask, + sizeof(struct in6_addr))) + ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV6_SA, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + else if (!memcmp(usr_ip6_spec->ip6src, &zero_ipv6_addr_mask, + sizeof(struct in6_addr))) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + if (!memcmp(usr_ip6_spec->ip6dst, &full_ipv6_addr_mask, + sizeof(struct in6_addr))) + ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV6_DA, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + else if (!memcmp(usr_ip6_spec->ip6dst, &zero_ipv6_addr_mask, + sizeof(struct in6_addr))) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + return 0; +} +#endif /* HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC */ + +/** + * ice_cfg_fdir_xtrct_seq - Configure extraction sequence for the given filter + * @pf: PF structure + * @fsp: pointer to ethtool Rx flow specification + * @user: user defined data from flow specification + * + * Returns 0 on success. + */ +static int +ice_cfg_fdir_xtrct_seq(struct ice_pf *pf, struct ethtool_rx_flow_spec *fsp, + struct ice_rx_flow_userdef *user) +{ + struct ice_flow_seg_info *seg, *tun_seg; + struct device *dev = ice_pf_to_dev(pf); + enum ice_fltr_ptype fltr_idx; + struct ice_hw *hw = &pf->hw; + bool perfect_filter; + int ret; + + seg = devm_kzalloc(dev, sizeof(*seg), GFP_KERNEL); + if (!seg) + return -ENOMEM; + + tun_seg = devm_kzalloc(dev, sizeof(*seg) * ICE_FD_HW_SEG_MAX, + GFP_KERNEL); + if (!tun_seg) { + devm_kfree(dev, seg); + return -ENOMEM; + } + + switch (fsp->flow_type & ~FLOW_EXT) { + case TCP_V4_FLOW: + ret = ice_set_fdir_ip4_seg(seg, &fsp->m_u.tcp_ip4_spec, + ICE_FLOW_SEG_HDR_TCP, + &perfect_filter); + break; + case UDP_V4_FLOW: + ret = ice_set_fdir_ip4_seg(seg, &fsp->m_u.tcp_ip4_spec, + ICE_FLOW_SEG_HDR_UDP, + &perfect_filter); + break; + case SCTP_V4_FLOW: + ret = ice_set_fdir_ip4_seg(seg, &fsp->m_u.tcp_ip4_spec, + ICE_FLOW_SEG_HDR_SCTP, + &perfect_filter); + break; + case IPV4_USER_FLOW: + ret = ice_set_fdir_ip4_usr_seg(seg, &fsp->m_u.usr_ip4_spec, + &perfect_filter); + break; +#ifdef HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC + case TCP_V6_FLOW: + ret = ice_set_fdir_ip6_seg(seg, &fsp->m_u.tcp_ip6_spec, + ICE_FLOW_SEG_HDR_TCP, + &perfect_filter); + break; + case UDP_V6_FLOW: + ret = ice_set_fdir_ip6_seg(seg, &fsp->m_u.tcp_ip6_spec, + ICE_FLOW_SEG_HDR_UDP, + &perfect_filter); + break; + case SCTP_V6_FLOW: + ret = ice_set_fdir_ip6_seg(seg, &fsp->m_u.tcp_ip6_spec, + ICE_FLOW_SEG_HDR_SCTP, + &perfect_filter); + break; + case IPV6_USER_FLOW: + ret = ice_set_fdir_ip6_usr_seg(seg, &fsp->m_u.usr_ip6_spec, + &perfect_filter); + break; +#endif /* HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC */ + default: + ret = -EINVAL; + } + if (ret) + goto err_exit; + + /* tunnel segments are shifted up one. 
*/ + memcpy(&tun_seg[1], seg, sizeof(*seg)); + + if (user && user->flex_fltr) { + perfect_filter = false; + ice_flow_add_fld_raw(seg, user->flex_offset, + ICE_FLTR_PRGM_FLEX_WORD_SIZE, + ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL); + ice_flow_add_fld_raw(&tun_seg[1], user->flex_offset, + ICE_FLTR_PRGM_FLEX_WORD_SIZE, + ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL); + } + + /* add filter for outer headers */ + fltr_idx = ice_ethtool_flow_to_fltr(fsp->flow_type & ~FLOW_EXT); + ret = ice_fdir_set_hw_fltr_rule(pf, seg, fltr_idx, + ICE_FD_HW_SEG_NON_TUN); + if (ret == -EEXIST) + /* Rule already exists, free memory and continue */ + devm_kfree(dev, seg); + else if (ret) + /* could not write filter, free memory */ + goto err_exit; + + /* make tunneled filter HW entries if possible */ + memcpy(&tun_seg[1], seg, sizeof(*seg)); + ret = ice_fdir_set_hw_fltr_rule(pf, tun_seg, fltr_idx, + ICE_FD_HW_SEG_TUN); + if (ret == -EEXIST) { + /* Rule already exists, free memory and count as success */ + devm_kfree(dev, tun_seg); + ret = 0; + } else if (ret) { + /* could not write tunnel filter, but outer filter exists */ + devm_kfree(dev, tun_seg); + } + + if (perfect_filter) + set_bit(fltr_idx, hw->fdir_perfect_fltr); + else + clear_bit(fltr_idx, hw->fdir_perfect_fltr); + + return ret; + +err_exit: + devm_kfree(dev, tun_seg); + devm_kfree(dev, seg); + + return -EOPNOTSUPP; +} + +/** + * ice_update_per_q_fltr + * @vsi: ptr to VSI + * @q_index: queue index + * @inc: true to increment or false to decrement per queue filter count + * + * This function is used to keep track of per queue sideband filters + */ +static void ice_update_per_q_fltr(struct ice_vsi *vsi, u32 q_index, bool inc) +{ + struct ice_ring *rx_ring; + + if (!vsi->num_rxq || q_index >= vsi->num_rxq) + return; + + rx_ring = vsi->rx_rings[q_index]; + if (!rx_ring || !rx_ring->ch) + return; + + if (inc) + atomic_inc(&rx_ring->ch->num_sb_fltr); + else + atomic_dec_if_positive(&rx_ring->ch->num_sb_fltr); +} + +/** + * ice_fdir_write_fltr - send a flow director filter to the hardware + * @pf: PF data structure + * @input: filter structure + * @add: true adds filter and false removed filter + * @is_tun: true adds inner filter on tunnel and false outer headers + * + * returns 0 on success and negative value on error + */ +int +ice_fdir_write_fltr(struct ice_pf *pf, struct ice_fdir_fltr *input, bool add, + bool is_tun) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + struct ice_fltr_desc desc; + struct ice_vsi *ctrl_vsi; + enum ice_status status; + u8 *pkt, *frag_pkt; + bool has_frag; + int err; + + ctrl_vsi = ice_get_ctrl_vsi(pf); + if (!ctrl_vsi) + return -EINVAL; + + pkt = devm_kzalloc(dev, ICE_FDIR_MAX_RAW_PKT_SIZE, GFP_KERNEL); + if (!pkt) + return -ENOMEM; + frag_pkt = devm_kzalloc(dev, ICE_FDIR_MAX_RAW_PKT_SIZE, GFP_KERNEL); + if (!frag_pkt) { + err = -ENOMEM; + goto err_free; + } + + ice_fdir_get_prgm_desc(hw, input, &desc, add); + status = ice_fdir_get_gen_prgm_pkt(hw, input, pkt, false, is_tun); + if (status) { + err = ice_status_to_errno(status); + goto err_free_all; + } + err = ice_prgm_fdir_fltr(ctrl_vsi, &desc, pkt); + if (err) + goto err_free_all; + + /* repeat for fragment packet */ + has_frag = ice_fdir_has_frag(input->flow_type); + if (has_frag) { + /* does not return error */ + ice_fdir_get_prgm_desc(hw, input, &desc, add); + status = ice_fdir_get_gen_prgm_pkt(hw, input, frag_pkt, true, + is_tun); + if (status) { + err = ice_status_to_errno(status); + goto err_frag; + } + err = 
ice_prgm_fdir_fltr(ctrl_vsi, &desc, frag_pkt); + if (err) + goto err_frag; + } else { + devm_kfree(dev, frag_pkt); + } + + return 0; + +err_free_all: + devm_kfree(dev, frag_pkt); +err_free: + devm_kfree(dev, pkt); + return err; + +err_frag: + devm_kfree(dev, frag_pkt); + return err; +} + +/** + * ice_fdir_write_all_fltr - send a flow director filter to the hardware + * @pf: PF data structure + * @input: filter structure + * @add: true adds filter and false removed filter + * + * returns 0 on success and negative value on error + */ +static int +ice_fdir_write_all_fltr(struct ice_pf *pf, struct ice_fdir_fltr *input, + bool add) +{ + u16 port_num; + int tun; + + for (tun = 0; tun < ICE_FD_HW_SEG_MAX; tun++) { + bool is_tun = tun == ICE_FD_HW_SEG_TUN; + int err; + + if (is_tun && !ice_get_open_tunnel_port(&pf->hw, TNL_ALL, + &port_num)) + continue; + err = ice_fdir_write_fltr(pf, input, add, is_tun); + if (err) + return err; + } + return 0; +} + +/** + * ice_fdir_replay_fltrs - replay filters from the HW filter list + * @pf: board private structure + */ +void ice_fdir_replay_fltrs(struct ice_pf *pf) +{ + struct ice_fdir_fltr *f_rule; + struct ice_hw *hw = &pf->hw; + + list_for_each_entry(f_rule, &hw->fdir_list_head, fltr_node) { + int err = ice_fdir_write_all_fltr(pf, f_rule, true); + + if (err) + dev_dbg(ice_pf_to_dev(pf), "Flow Director error %d, could not reprogram filter %d\n", + err, f_rule->fltr_id); + } +} + +/** + * ice_fdir_create_dflt_rules - create default perfect filters + * @pf: PF data structure + * + * Returns 0 for success or error. + */ +int ice_fdir_create_dflt_rules(struct ice_pf *pf) +{ + int err; + + /* Create perfect TCP and UDP rules in hardware. */ + err = ice_create_init_fdir_rule(pf, ICE_FLTR_PTYPE_NONF_IPV4_TCP); + if (err) + return err; + + err = ice_create_init_fdir_rule(pf, ICE_FLTR_PTYPE_NONF_IPV4_UDP); + if (err) + return err; + + err = ice_create_init_fdir_rule(pf, ICE_FLTR_PTYPE_NONF_IPV6_TCP); + if (err) + return err; + + err = ice_create_init_fdir_rule(pf, ICE_FLTR_PTYPE_NONF_IPV6_UDP); + + return err; +} + +/** + * ice_vsi_manage_fdir - turn on/off flow director + * @vsi: the VSI being changed + * @ena: boolean value indicating if this is an enable or disable request + */ +void ice_vsi_manage_fdir(struct ice_vsi *vsi, bool ena) +{ + struct ice_fdir_fltr *f_rule, *tmp; + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + enum ice_fltr_ptype flow; + + if (ena) { + set_bit(ICE_FLAG_FD_ENA, pf->flags); + ice_fdir_create_dflt_rules(pf); + return; + } + + mutex_lock(&hw->fdir_fltr_lock); + if (!test_and_clear_bit(ICE_FLAG_FD_ENA, pf->flags)) + goto release_lock; + list_for_each_entry_safe(f_rule, tmp, &hw->fdir_list_head, fltr_node) { + if (!f_rule->acl_fltr) + ice_fdir_write_all_fltr(pf, f_rule, false); + ice_fdir_update_cntrs(hw, f_rule->flow_type, f_rule->acl_fltr, + false); + list_del(&f_rule->fltr_node); + devm_kfree(ice_pf_to_dev(pf), f_rule); + } + + if (hw->fdir_prof) + for (flow = ICE_FLTR_PTYPE_NONF_NONE; flow < ICE_FLTR_PTYPE_MAX; + flow++) + if (hw->fdir_prof[flow]) + ice_fdir_rem_flow(hw, ICE_BLK_FD, flow); + + if (hw->acl_prof) + for (flow = ICE_FLTR_PTYPE_NONF_NONE; flow < ICE_FLTR_PTYPE_MAX; + flow++) + if (hw->acl_prof[flow]) + ice_fdir_rem_flow(hw, ICE_BLK_ACL, flow); + +release_lock: + mutex_unlock(&hw->fdir_fltr_lock); +} + +/** + * ice_del_acl_ethtool - Deletes an ACL rule entry. 
+ * @hw: pointer to HW instance + * @fltr: filter structure + * + * returns 0 on success and negative value on error + */ +static int +ice_del_acl_ethtool(struct ice_hw *hw, struct ice_fdir_fltr *fltr) +{ + u64 entry; + + entry = ice_flow_find_entry(hw, ICE_BLK_ACL, fltr->fltr_id); + return ice_status_to_errno(ice_flow_rem_entry(hw, ICE_BLK_ACL, entry)); +} + +/** + * ice_fdir_do_rem_flow - delete flow and possibly add perfect flow + * @pf: PF structure + * @flow_type: FDir flow type to release + */ +static void +ice_fdir_do_rem_flow(struct ice_pf *pf, enum ice_fltr_ptype flow_type) +{ + struct ice_hw *hw = &pf->hw; + bool need_perfect = false; + + if (flow_type == ICE_FLTR_PTYPE_NONF_IPV4_TCP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV4_UDP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV6_TCP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV6_UDP) + need_perfect = true; + + if (need_perfect && test_bit(flow_type, hw->fdir_perfect_fltr)) + return; + + ice_fdir_rem_flow(hw, ICE_BLK_FD, flow_type); + if (need_perfect) + ice_create_init_fdir_rule(pf, flow_type); +} + +/** + * ice_ntuple_update_list_entry - add or delete a filter from the filter list + * @pf: PF structure + * @input: filter structure + * @fltr_idx: ethtool index of filter to modify + * + * returns 0 on success and negative on errors + */ +int +ice_ntuple_update_list_entry(struct ice_pf *pf, struct ice_fdir_fltr *input, + int fltr_idx) +{ + struct ice_fdir_fltr *old_fltr; + struct ice_hw *hw = &pf->hw; + struct ice_vsi *vsi; + int err = -ENOENT; + + /* Do not update filters during reset */ + if (ice_is_reset_in_progress(pf->state)) + return -EBUSY; + + vsi = ice_get_main_vsi(pf); + if (!vsi) + return -EINVAL; + + old_fltr = ice_fdir_find_fltr_by_idx(hw, fltr_idx); + if (old_fltr) { + if (!old_fltr->acl_fltr) { + /* FD filter */ + err = ice_fdir_write_all_fltr(pf, old_fltr, false); + if (err) + return err; + } else { + /* ACL filter - if the input buffer is present + * then this is an update and we don't want to + * delete the filter from the HW. we've already + * written the change to the HW at this point, so + * just update the SW structures to make sure + * everything is hunky-dory. if no input then this + * is a delete so we should delete the filter from + * the HW and clean up our SW structures. + */ + if (!input) { + err = ice_del_acl_ethtool(hw, old_fltr); + if (err) + return err; + } + } + ice_fdir_update_cntrs(hw, old_fltr->flow_type, + old_fltr->acl_fltr, false); + /* update sb-filters count, specific to ring->channel */ + ice_update_per_q_fltr(vsi, old_fltr->orig_q_index, false); + /* Also delete the HW filter info if we have just deleted the + * last filter of flow_type. 
+ */ + if (!old_fltr->acl_fltr && !input && + !hw->fdir_fltr_cnt[old_fltr->flow_type]) + ice_fdir_do_rem_flow(pf, old_fltr->flow_type); + else if (old_fltr->acl_fltr && !input && + !hw->acl_fltr_cnt[old_fltr->flow_type]) + ice_fdir_rem_flow(hw, ICE_BLK_ACL, old_fltr->flow_type); + list_del(&old_fltr->fltr_node); + devm_kfree(ice_pf_to_dev(pf), old_fltr); + } + if (!input) + return err; + ice_fdir_list_add_fltr(hw, input); + /* update sb-filters count, specific to ring->channel */ + ice_update_per_q_fltr(vsi, input->orig_q_index, true); + ice_fdir_update_cntrs(hw, input->flow_type, input->acl_fltr, true); + return 0; +} + +/** + * ice_del_ntuple_ethtool - delete Flow Director or ACL filter + * @vsi: pointer to target VSI + * @cmd: command to add or delete the filter + * + * Returns 0 on success and negative values for failure + */ +int ice_del_ntuple_ethtool(struct ice_vsi *vsi, struct ethtool_rxnfc *cmd) +{ + struct ethtool_rx_flow_spec *fsp = + (struct ethtool_rx_flow_spec *)&cmd->fs; + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + int val; + + if (!test_bit(ICE_FLAG_FD_ENA, pf->flags)) + return -EOPNOTSUPP; + + /* Do not delete filters during reset */ + if (ice_is_reset_in_progress(pf->state)) { + dev_err(ice_pf_to_dev(pf), "Device is resetting - deleting Flow Director filters not supported during reset\n"); + return -EBUSY; + } + + if (test_bit(ICE_FD_FLUSH_REQ, pf->state)) + return -EBUSY; + + mutex_lock(&hw->fdir_fltr_lock); + val = ice_ntuple_update_list_entry(pf, NULL, fsp->location); + mutex_unlock(&hw->fdir_fltr_lock); + + return val; +} + +/** + * ice_update_ring_dest_vsi - update dest ring and dest VSI + * @vsi: pointer to target VSI + * @dest_vsi: ptr to dest VSI index + * @ring: ptr to dest ring + * + * This function updates destination VSI and queue if user specifies + * target queue which falls in channel's (aka ADQ) queue region + */ +void +ice_update_ring_dest_vsi(struct ice_vsi *vsi, u16 *dest_vsi, u32 *ring) +{ + struct ice_channel *ch; + + if (!ring || !dest_vsi) + return; + + list_for_each_entry(ch, &vsi->ch_list, list) { + if (!ch->ch_vsi) + continue; + + /* make sure to locate corresponding channel based on "queue" + * specified + */ + if ((*ring < ch->base_q) || + (*ring > (ch->base_q + ch->num_rxq))) + continue; + + /* update the dest_vsi based on channel */ + *dest_vsi = ch->ch_vsi->idx; + + /* update the "ring" to be correct based on channel */ + *ring -= ch->base_q; + } +} + +/** + * ice_is_acl_filter - Checks if it's a FD or ACL filter + * @fsp: pointer to ethtool Rx flow specification + * + * If any field of the provided filter is using a partial mask then this is + * an ACL filter. + * + * Returns true if ACL filter otherwise false. 
+ */ +static bool ice_is_acl_filter(struct ethtool_rx_flow_spec *fsp) +{ + struct ethtool_tcpip4_spec *tcp_ip4_spec; + struct ethtool_usrip4_spec *usr_ip4_spec; + + switch (fsp->flow_type & ~FLOW_EXT) { + case TCP_V4_FLOW: + case UDP_V4_FLOW: + case SCTP_V4_FLOW: + tcp_ip4_spec = &fsp->m_u.tcp_ip4_spec; + + /* IP source address */ + if (tcp_ip4_spec->ip4src && + tcp_ip4_spec->ip4src != htonl(0xFFFFFFFF)) + return true; + + /* IP destination address */ + if (tcp_ip4_spec->ip4dst && + tcp_ip4_spec->ip4dst != htonl(0xFFFFFFFF)) + return true; + + /* Layer 4 source port */ + if (tcp_ip4_spec->psrc && tcp_ip4_spec->psrc != htons(0xFFFF)) + return true; + + /* Layer 4 destination port */ + if (tcp_ip4_spec->pdst && tcp_ip4_spec->pdst != htons(0xFFFF)) + return true; + + break; + case IPV4_USER_FLOW: + usr_ip4_spec = &fsp->m_u.usr_ip4_spec; + + /* IP source address */ + if (usr_ip4_spec->ip4src && + usr_ip4_spec->ip4src != htonl(0xFFFFFFFF)) + return true; + + /* IP destination address */ + if (usr_ip4_spec->ip4dst && + usr_ip4_spec->ip4dst != htonl(0xFFFFFFFF)) + return true; + + break; + } + + return false; +} + +/** + * ice_ntuple_set_input_set - Set the input set for specified block + * @blk: filter block to configure + * @vsi: pointer to target VSI + * @fsp: pointer to ethtool Rx flow specification + * @input: filter structure + */ +int +ice_ntuple_set_input_set(struct ice_vsi *vsi, enum ice_block blk, + struct ethtool_rx_flow_spec *fsp, + struct ice_fdir_fltr *input) +{ + u16 dest_vsi, q_index = 0; + int flow_type, flow_mask; + u16 orig_q_index = 0; + struct ice_pf *pf; + struct ice_hw *hw; + u8 dest_ctl; + + if (blk == ICE_BLK_FD) + flow_mask = FLOW_EXT; + else if (blk == ICE_BLK_ACL) + flow_mask = FLOW_MAC_EXT; + else + return -EINVAL; + + pf = vsi->back; + hw = &pf->hw; + + dest_vsi = vsi->idx; + if (fsp->ring_cookie == RX_CLS_FLOW_DISC) { + dest_ctl = ICE_FLTR_PRGM_DESC_DEST_DROP_PKT; + } else { + u32 ring = ethtool_get_flow_spec_ring(fsp->ring_cookie); + u8 vf = ethtool_get_flow_spec_ring_vf(fsp->ring_cookie); + + if (!vf) { + if (ring >= vsi->num_rxq) + return -EINVAL; + orig_q_index = ring; + ice_update_ring_dest_vsi(vsi, &dest_vsi, &ring); + } else { + dev_err(ice_pf_to_dev(pf), "Failed to add filter. %s filters are not supported on VF queues.\n", + blk == ICE_BLK_FD ? 
"Flow Director" : "ACL"); + return -EINVAL; + } + dest_ctl = ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_QINDEX; + q_index = ring; + } + + input->fltr_id = fsp->location; + input->q_index = q_index; + flow_type = fsp->flow_type & ~flow_mask; + + /* Record the original queue as specified by user, because + * due to channel, configuration 'q_index' gets adjusted + * accordingly, but to keep user experience same - queue of + * flow-director filter shall report original queue number + * as specified by user, hence record it and use it later + */ + input->orig_q_index = orig_q_index; + input->dest_vsi = dest_vsi; + input->dest_ctl = dest_ctl; + input->fltr_status = ICE_FLTR_PRGM_DESC_FD_STATUS_FD_ID; + input->cnt_index = ICE_FD_SB_STAT_IDX(hw->fd_ctr_base); + input->flow_type = ice_ethtool_flow_to_fltr(flow_type); + + if (fsp->flow_type & FLOW_EXT) { + memcpy(input->ext_data.usr_def, fsp->h_ext.data, + sizeof(input->ext_data.usr_def)); + input->ext_data.vlan_type = fsp->h_ext.vlan_etype; + input->ext_data.vlan_tag = fsp->h_ext.vlan_tci; + memcpy(input->ext_mask.usr_def, fsp->m_ext.data, + sizeof(input->ext_mask.usr_def)); + input->ext_mask.vlan_type = fsp->m_ext.vlan_etype; + input->ext_mask.vlan_tag = fsp->m_ext.vlan_tci; + } + + switch (flow_type) { + case TCP_V4_FLOW: + case UDP_V4_FLOW: + case SCTP_V4_FLOW: + input->ip.v4.dst_port = fsp->h_u.tcp_ip4_spec.pdst; + input->ip.v4.src_port = fsp->h_u.tcp_ip4_spec.psrc; + input->ip.v4.dst_ip = fsp->h_u.tcp_ip4_spec.ip4dst; + input->ip.v4.src_ip = fsp->h_u.tcp_ip4_spec.ip4src; + input->mask.v4.dst_port = fsp->m_u.tcp_ip4_spec.pdst; + input->mask.v4.src_port = fsp->m_u.tcp_ip4_spec.psrc; + input->mask.v4.dst_ip = fsp->m_u.tcp_ip4_spec.ip4dst; + input->mask.v4.src_ip = fsp->m_u.tcp_ip4_spec.ip4src; + break; + case IPV4_USER_FLOW: + input->ip.v4.dst_ip = fsp->h_u.usr_ip4_spec.ip4dst; + input->ip.v4.src_ip = fsp->h_u.usr_ip4_spec.ip4src; + input->ip.v4.l4_header = fsp->h_u.usr_ip4_spec.l4_4_bytes; + input->ip.v4.proto = fsp->h_u.usr_ip4_spec.proto; + input->ip.v4.ip_ver = fsp->h_u.usr_ip4_spec.ip_ver; + input->ip.v4.tos = fsp->h_u.usr_ip4_spec.tos; + input->mask.v4.dst_ip = fsp->m_u.usr_ip4_spec.ip4dst; + input->mask.v4.src_ip = fsp->m_u.usr_ip4_spec.ip4src; + input->mask.v4.l4_header = fsp->m_u.usr_ip4_spec.l4_4_bytes; + input->mask.v4.proto = fsp->m_u.usr_ip4_spec.proto; + input->mask.v4.ip_ver = fsp->m_u.usr_ip4_spec.ip_ver; + input->mask.v4.tos = fsp->m_u.usr_ip4_spec.tos; + break; +#ifdef HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC + case TCP_V6_FLOW: + case UDP_V6_FLOW: + case SCTP_V6_FLOW: + memcpy(input->ip.v6.dst_ip, fsp->h_u.tcp_ip6_spec.ip6dst, + sizeof(struct in6_addr)); + memcpy(input->ip.v6.src_ip, fsp->h_u.tcp_ip6_spec.ip6src, + sizeof(struct in6_addr)); + input->ip.v6.dst_port = fsp->h_u.tcp_ip6_spec.pdst; + input->ip.v6.src_port = fsp->h_u.tcp_ip6_spec.psrc; + input->ip.v6.tc = fsp->h_u.tcp_ip6_spec.tclass; + memcpy(input->mask.v6.dst_ip, fsp->m_u.tcp_ip6_spec.ip6dst, + sizeof(struct in6_addr)); + memcpy(input->mask.v6.src_ip, fsp->m_u.tcp_ip6_spec.ip6src, + sizeof(struct in6_addr)); + input->mask.v6.dst_port = fsp->m_u.tcp_ip6_spec.pdst; + input->mask.v6.src_port = fsp->m_u.tcp_ip6_spec.psrc; + input->mask.v6.tc = fsp->m_u.tcp_ip6_spec.tclass; + break; + case IPV6_USER_FLOW: + memcpy(input->ip.v6.dst_ip, fsp->h_u.usr_ip6_spec.ip6dst, + sizeof(struct in6_addr)); + memcpy(input->ip.v6.src_ip, fsp->h_u.usr_ip6_spec.ip6src, + sizeof(struct in6_addr)); + input->ip.v6.l4_header = fsp->h_u.usr_ip6_spec.l4_4_bytes; + input->ip.v6.tc = 
fsp->h_u.usr_ip6_spec.tclass; + + /* if no protocol requested, use IPPROTO_NONE */ + if (!fsp->m_u.usr_ip6_spec.l4_proto) + input->ip.v6.proto = IPPROTO_NONE; + else + input->ip.v6.proto = fsp->h_u.usr_ip6_spec.l4_proto; + + memcpy(input->mask.v6.dst_ip, fsp->m_u.usr_ip6_spec.ip6dst, + sizeof(struct in6_addr)); + memcpy(input->mask.v6.src_ip, fsp->m_u.usr_ip6_spec.ip6src, + sizeof(struct in6_addr)); + input->mask.v6.l4_header = fsp->m_u.usr_ip6_spec.l4_4_bytes; + input->mask.v6.tc = fsp->m_u.usr_ip6_spec.tclass; + input->mask.v6.proto = fsp->m_u.usr_ip6_spec.l4_proto; + break; +#endif /* HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC */ + default: + /* not doing un-parsed flow types */ + return -EINVAL; + } + + return 0; +} + +/** + * ice_add_ntuple_ethtool - Add/Remove Flow Director or ACL filter + * @vsi: pointer to target VSI + * @cmd: command to add or delete the filter + * + * Returns 0 on success and negative values for failure + */ +int ice_add_ntuple_ethtool(struct ice_vsi *vsi, struct ethtool_rxnfc *cmd) +{ + struct ice_rx_flow_userdef userdata; + struct ethtool_rx_flow_spec *fsp; + struct ice_fdir_fltr *input; + struct device *dev; + struct ice_pf *pf; + struct ice_hw *hw; + int fltrs_needed; + u16 tunnel_port; + int ret; + + if (!vsi) + return -EINVAL; + + pf = vsi->back; + hw = &pf->hw; + dev = ice_pf_to_dev(pf); + + if (!test_bit(ICE_FLAG_FD_ENA, pf->flags)) + return -EOPNOTSUPP; + + + /* Do not program filters during reset */ + if (ice_is_reset_in_progress(pf->state)) { + dev_err(dev, "Device is resetting - adding ntuple filters not supported during reset\n"); + return -EBUSY; + } + + fsp = (struct ethtool_rx_flow_spec *)&cmd->fs; + + if (ice_parse_rx_flow_user_data(fsp, &userdata)) + return -EINVAL; + + if (fsp->flow_type & FLOW_MAC_EXT) + return -EINVAL; + + if (fsp->location >= ice_ntuple_get_max_fltr_cnt(hw)) { + dev_err(dev, "Failed to add filter. The maximum number of ntuple filters has been reached.\n"); + return -ENOSPC; + } + + /* ACL filter */ + if (pf->hw.acl_tbl && ice_is_acl_filter(fsp)) + return ice_acl_add_rule_ethtool(vsi, cmd); + + ret = ice_cfg_fdir_xtrct_seq(pf, fsp, &userdata); + if (ret) + return ret; + + /* return error if not an update and no available filters */ + fltrs_needed = ice_get_open_tunnel_port(hw, TNL_ALL, &tunnel_port) ? + 2 : 1; + if (!ice_fdir_find_fltr_by_idx(hw, fsp->location) && + ice_fdir_num_avail_fltr(hw, pf->vsi[vsi->idx]) < fltrs_needed) { + dev_err(dev, "Failed to add filter. 
The maximum number of flow director filters has been reached.\n"); + return -ENOSPC; + } + + input = devm_kzalloc(dev, sizeof(*input), GFP_KERNEL); + if (!input) + return -ENOMEM; + + ret = ice_ntuple_set_input_set(vsi, ICE_BLK_FD, fsp, input); + if (ret) + goto free_input; + + mutex_lock(&hw->fdir_fltr_lock); + if (ice_fdir_is_dup_fltr(hw, input)) { + ret = -EINVAL; + goto release_lock; + } + + if (userdata.flex_fltr) { + input->flex_fltr = true; + input->flex_word = cpu_to_be16(userdata.flex_word); + input->flex_offset = userdata.flex_offset; + } + + input->fdid_prio = ICE_FXD_FLTR_QW1_FDID_PRI_THREE; + input->comp_report = ICE_FXD_FLTR_QW0_COMP_REPORT_SW_FAIL; + input->cnt_ena = ICE_FXD_FLTR_QW0_STAT_ENA_PKTS; + + /* input struct is added to the HW filter list */ + ice_ntuple_update_list_entry(pf, input, fsp->location); + + ret = ice_fdir_write_all_fltr(pf, input, true); + if (ret) + goto remove_sw_rule; + + goto release_lock; + +remove_sw_rule: + ice_fdir_update_cntrs(hw, input->flow_type, false, false); + /* update sb-filters count, specific to ring->channel */ + ice_update_per_q_fltr(vsi, input->orig_q_index, false); + list_del(&input->fltr_node); +release_lock: + mutex_unlock(&hw->fdir_fltr_lock); +free_input: + if (ret) + devm_kfree(dev, input); + + return ret; +} diff --git a/drivers/net/ethernet/intel/ice/ice_fdir.c b/drivers/net/ethernet/intel/ice/ice_fdir.c new file mode 100644 index 000000000000..d63f5d855903 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_fdir.c @@ -0,0 +1,2103 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice_common.h" +#include "ice_fdir.h" + +/* These are training packet headers used to program flow director filters. */ +static const u8 ice_fdir_tcpv4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x28, 0x00, 0x01, 0x00, 0x00, 0x40, 0x06, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static const u8 ice_fdir_udpv4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x1C, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, +}; + +static const u8 ice_fdir_sctpv4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x20, 0x00, 0x00, 0x40, 0x00, 0x40, 0x84, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x40, 0x00, 0x40, 0x10, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00 +}; + +static const u8 ice_fdir_udp4_vxlan_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x4e, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x45, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x40, 0x00, + 0x40, 0x11, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_gtpu4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x38, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0xb2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x08, 0x68, 0x08, 0x68, 0x00, 0x24, + 0xbf, 0xc0, 0x30, 0xff, 0x00, 0x14, 0x00, 0x00, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x14, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x00, 0x3a, 0x3d, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_udp4_gtpu4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x40, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x08, 0x68, 0x08, 0x68, 0x00, 0x2c, + 0x00, 0x6f, 0x30, 0xff, 0x00, 0x1c, 0x00, 0x00, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x1c, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x11, 0x3a, 0x24, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x08, 0xbe, 0xc7, 0x00, 0x00, +}; + +static const u8 ice_fdir_tcp4_gtpu4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x4c, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0x9e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x08, 0x68, 0x08, 0x68, 0x00, 0x38, + 0x00, 0x4c, 0x30, 0xff, 0x00, 0x28, 0x00, 0x00, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x28, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x06, 0x3a, 0x23, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x50, 0x02, 0x20, 0x00, 0x4e, 0xd2, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_gtpu4_eh_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x42, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0xa8, 0x7f, 0x00, 0x00, 0x01, 0x7f, 0x00, + 0x00, 0x01, 0x08, 0x68, 0x08, 0x68, 0x00, 0x2e, + 0xba, 0x1d, 0x34, 0xff, 0x00, 0x1e, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x01, 0x00, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x16, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x00, 0x7c, 0xe5, 0x7f, 0x00, + 0x00, 0x01, 0x7f, 0x00, 0x00, 0x01, 0x00, 0x00, +}; + +static const u8 ice_fdir_udp4_gtpu4_eh_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x4a, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0xa0, 0x7f, 0x00, 0x00, 0x01, 0x7f, 0x00, + 0x00, 0x01, 0x08, 0x68, 0x08, 0x68, 0x00, 0x36, + 0xb8, 0x23, 0x34, 0xff, 0x00, 0x26, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x01, 0x00, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x1e, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x11, 0x7c, 0xcc, 0x7f, 0x00, + 0x00, 0x01, 0x7f, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x0a, 0x01, 0xd8, 0x00, 0x00, +}; + +static const u8 ice_fdir_tcp4_gtpu4_eh_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x56, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0x94, 0x7f, 0x00, 0x00, 0x01, 0x7f, 0x00, + 0x00, 0x01, 0x08, 0x68, 0x08, 0x68, 0x00, 0x42, + 0xb8, 0x00, 0x34, 0xff, 0x00, 0x32, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x01, 0x00, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x2a, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x06, 0x7c, 0xcb, 0x7f, 0x00, + 0x00, 0x01, 0x7f, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, + 0x00, 0x00, 0x50, 0x02, 0x20, 0x00, 0x91, 0xde, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_gtpu4_eh_dw_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x42, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0xa8, 0x7f, 0x00, 0x00, 0x01, 0x7f, 0x00, + 0x00, 0x01, 0x08, 0x68, 0x08, 0x68, 0x00, 0x2e, + 0xba, 0x1d, 0x34, 0xff, 0x00, 0x1e, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x01, 0x00, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x16, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x00, 0x7c, 0xe5, 0x7f, 0x00, + 0x00, 0x01, 0x7f, 0x00, 0x00, 0x01, 0x00, 0x00, +}; + +static const u8 ice_fdir_udp4_gtpu4_eh_dw_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x4a, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0xa0, 0x7f, 0x00, 0x00, 0x01, 0x7f, 0x00, + 0x00, 0x01, 0x08, 0x68, 0x08, 0x68, 0x00, 0x36, + 0xb8, 0x23, 0x34, 0xff, 0x00, 0x26, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x01, 0x00, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x1e, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x11, 0x7c, 0xcc, 0x7f, 0x00, + 0x00, 0x01, 0x7f, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x0a, 0x01, 0xd8, 0x00, 0x00, +}; + +static const u8 ice_fdir_tcp4_gtpu4_eh_dw_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x56, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0x94, 0x7f, 0x00, 0x00, 0x01, 0x7f, 0x00, + 0x00, 0x01, 0x08, 0x68, 0x08, 0x68, 0x00, 0x42, + 0xb8, 0x00, 0x34, 0xff, 0x00, 0x32, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x01, 0x00, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x2a, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x06, 0x7c, 0xcb, 0x7f, 0x00, + 0x00, 0x01, 0x7f, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x50, 0x02, 0x20, 0x00, 0x91, 0xde, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_gtpu4_eh_up_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x42, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0xa8, 0x7f, 0x00, 0x00, 0x01, 0x7f, 0x00, + 0x00, 0x01, 0x08, 0x68, 0x08, 0x68, 0x00, 0x2e, + 0xba, 0x0d, 0x34, 0xff, 0x00, 0x1e, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x01, 0x10, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x16, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x00, 0x7c, 0xe5, 0x7f, 0x00, + 0x00, 0x01, 0x7f, 0x00, 0x00, 0x01, 0x00, 0x00, +}; + +static const u8 ice_fdir_udp4_gtpu4_eh_up_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x4a, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0xa0, 0x7f, 0x00, 0x00, 0x01, 0x7f, 0x00, + 0x00, 0x01, 0x08, 0x68, 0x08, 0x68, 0x00, 0x36, + 0xb8, 0x13, 0x34, 0xff, 0x00, 0x26, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x01, 0x10, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x1e, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x11, 0x7c, 0xcc, 0x7f, 0x00, + 0x00, 0x01, 0x7f, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x0a, 0x01, 0xd8, 0x00, 0x00, +}; + +static const u8 ice_fdir_tcp4_gtpu4_eh_up_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x56, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0x94, 0x7f, 0x00, 0x00, 0x01, 0x7f, 0x00, + 0x00, 0x01, 0x08, 0x68, 0x08, 0x68, 0x00, 0x42, + 0xb7, 0xf0, 0x34, 0xff, 0x00, 0x32, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x01, 0x10, + 0x00, 0x00, 0x45, 0x00, 0x00, 
0x2a, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x06, 0x7c, 0xcb, 0x7f, 0x00, + 0x00, 0x01, 0x7f, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x50, 0x02, 0x20, 0x00, 0x91, 0xde, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_icmp4_gtpu4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x4c, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x08, 0x68, 0x08, 0x68, 0x00, 0x00, + 0x00, 0x00, 0x34, 0xff, 0x00, 0x28, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x45, 0x00, + 0x00, 0x1c, 0x00, 0x00, 0x40, 0x00, 0x40, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_gtpu4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x4c, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0x9e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x08, 0x68, 0x08, 0x68, 0x00, 0x38, + 0x24, 0x42, 0x30, 0xff, 0x00, 0x28, 0x00, 0x00, + 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3b, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_udp6_gtpu4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x54, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x08, 0x68, 0x08, 0x68, 0x00, 0x40, + 0x4e, 0x3d, 0x30, 0xff, 0x00, 0x30, 0x00, 0x00, + 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x08, + 0x11, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, + 0xff, 0xdc, 0x00, 0x00, +}; + +static const u8 ice_fdir_tcp6_gtpu4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x62, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0x88, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x08, 0x68, 0x08, 0x68, 0x00, 0x4e, + 0x59, 0x08, 0x30, 0xff, 0x00, 0x3e, 0x00, 0x00, + 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x16, + 0x06, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x14, 0x00, 0x50, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50, 0x02, + 0x20, 0x00, 0x8f, 0x7b, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_gtpu6_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xdd, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x38, 0x11, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x68, + 0x08, 0x68, 0x00, 0x38, 0x22, 0x43, 0x30, 0xff, + 0x00, 0x28, 0x00, 0x00, 0x00, 0x00, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x3b, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_gtpu6_eh_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xdd, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x44, 0x11, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x68, + 0x08, 0x68, 0x00, 0x44, 0x1b, 0x9a, 0x34, 0xff, + 0x00, 0x34, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x85, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3b, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_gtpu6_eh_dw_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xdd, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x44, 0x11, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x68, + 0x08, 0x68, 0x00, 0x44, 0x1b, 0x9a, 0x34, 0xff, + 0x00, 0x34, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x85, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3b, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_gtpu6_eh_up_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xdd, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x44, 0x11, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x68, + 0x08, 0x68, 0x00, 0x44, 0x1b, 0x8a, 0x34, 0xff, + 0x00, 0x34, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x85, 0x02, 0x10, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3b, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_l2tpv3_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x40, 0x00, 0x40, 0x73, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_l2tpv3_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x73, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_esp_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 
0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x40, 0x00, 0x40, 0x32, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00 +}; + +static const u8 ice_fdir_ipv6_esp_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x32, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_ah_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x40, 0x00, 0x40, 0x33, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00 +}; + +static const u8 ice_fdir_ipv6_ah_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x33, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_nat_t_esp_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x1C, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x11, 0x94, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_nat_t_esp_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x08, 0x11, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x11, 0x94, 0x00, 0x00, 0x00, 0x08, +}; + +static const u8 ice_fdir_ipv4_pfcp_node_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x2C, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x22, 0x65, 0x22, 0x65, 0x00, 0x00, + 0x00, 0x00, 0x20, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_pfcp_session_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x2C, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x22, 0x65, 0x22, 0x65, 0x00, 0x00, + 0x00, 0x00, 0x21, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_pfcp_node_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x18, 0x11, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x22, 
0x65, + 0x22, 0x65, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, + 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_pfcp_session_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x18, 0x11, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x22, 0x65, + 0x22, 0x65, 0x00, 0x00, 0x00, 0x00, 0x21, 0x00, + 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_non_ip_l2_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ecpri_tp0_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xAE, 0xFE, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_udp_ecpri_tp0_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x1C, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_frag_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2C, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3B, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_frag_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x20, 0x00, 0x40, 0x10, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00 +}; + +static const u8 ice_fdir_tcpv6_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x14, 0x06, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x50, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, +}; + +static const u8 ice_fdir_udpv6_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x08, 0x11, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, +}; + +static const u8 ice_fdir_sctpv6_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x0C, 0x84, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x3B, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_tcp4_tun_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x5a, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x45, 0x00, 0x00, 0x28, 0x00, 0x00, 0x40, 0x00, + 0x40, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_udp4_tun_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x4e, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x45, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x40, 0x00, + 0x40, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_sctp4_tun_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x52, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x45, 0x00, 0x00, 0x20, 0x00, 0x01, 0x00, 0x00, + 0x40, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ip4_tun_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x46, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x45, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_tcp6_tun_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x6e, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86, 0xdd, + 0x60, 0x00, 0x00, 0x00, 0x00, 0x14, 0x06, 0x40, + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x50, 0x00, 0x20, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_udp6_tun_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x62, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86, 0xdd, + 0x60, 0x00, 0x00, 0x00, 0x00, 0x08, 0x11, 0x40, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_sctp6_tun_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x66, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86, 0xdd, + 0x60, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x84, 0x40, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ip6_tun_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x5a, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86, 0xdd, + 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3b, 0x40, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +/* Flow Director no-op training packet table */ +static const struct ice_fdir_base_pkt ice_fdir_pkt[] = { + { + ICE_FLTR_PTYPE_NONF_IPV4_TCP, + sizeof(ice_fdir_tcpv4_pkt), ice_fdir_tcpv4_pkt, + sizeof(ice_fdir_tcp4_tun_pkt), ice_fdir_tcp4_tun_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_UDP, + sizeof(ice_fdir_udpv4_pkt), ice_fdir_udpv4_pkt, + sizeof(ice_fdir_udp4_tun_pkt), ice_fdir_udp4_tun_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_SCTP, + sizeof(ice_fdir_sctpv4_pkt), ice_fdir_sctpv4_pkt, + sizeof(ice_fdir_sctp4_tun_pkt), ice_fdir_sctp4_tun_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_OTHER, + sizeof(ice_fdir_ipv4_pkt), ice_fdir_ipv4_pkt, + sizeof(ice_fdir_ip4_tun_pkt), ice_fdir_ip4_tun_pkt, + }, + { + ICE_FLTR_PTYPE_FRAG_IPV4, + sizeof(ice_fdir_ipv4_frag_pkt), ice_fdir_ipv4_frag_pkt, + sizeof(ice_fdir_ipv4_frag_pkt), ice_fdir_ipv4_frag_pkt, + }, + { + ICE_FLTR_PTYPE_FRAG_IPV6, + sizeof(ice_fdir_ipv6_frag_pkt), ice_fdir_ipv6_frag_pkt, + sizeof(ice_fdir_ipv6_frag_pkt), 
ice_fdir_ipv6_frag_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU, + sizeof(ice_fdir_ipv4_gtpu4_pkt), + ice_fdir_ipv4_gtpu4_pkt, + sizeof(ice_fdir_ipv4_gtpu4_pkt), + ice_fdir_ipv4_gtpu4_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH, + sizeof(ice_fdir_ipv4_gtpu4_eh_pkt), + ice_fdir_ipv4_gtpu4_eh_pkt, + sizeof(ice_fdir_ipv4_gtpu4_eh_pkt), + ice_fdir_ipv4_gtpu4_eh_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW, + sizeof(ice_fdir_ipv4_gtpu4_eh_dw_pkt), + ice_fdir_ipv4_gtpu4_eh_dw_pkt, + sizeof(ice_fdir_ipv4_gtpu4_eh_dw_pkt), + ice_fdir_ipv4_gtpu4_eh_dw_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP, + sizeof(ice_fdir_ipv4_gtpu4_eh_up_pkt), + ice_fdir_ipv4_gtpu4_eh_up_pkt, + sizeof(ice_fdir_ipv4_gtpu4_eh_up_pkt), + ice_fdir_ipv4_gtpu4_eh_up_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4, + sizeof(ice_fdir_ipv4_gtpu4_pkt), + ice_fdir_ipv4_gtpu4_pkt, + sizeof(ice_fdir_ipv4_gtpu4_pkt), + ice_fdir_ipv4_gtpu4_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_UDP, + sizeof(ice_fdir_udp4_gtpu4_pkt), + ice_fdir_udp4_gtpu4_pkt, + sizeof(ice_fdir_udp4_gtpu4_pkt), + ice_fdir_udp4_gtpu4_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_TCP, + sizeof(ice_fdir_tcp4_gtpu4_pkt), + ice_fdir_tcp4_gtpu4_pkt, + sizeof(ice_fdir_tcp4_gtpu4_pkt), + ice_fdir_tcp4_gtpu4_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4, + sizeof(ice_fdir_ipv4_gtpu4_eh_pkt), + ice_fdir_ipv4_gtpu4_eh_pkt, + sizeof(ice_fdir_ipv4_gtpu4_eh_pkt), + ice_fdir_ipv4_gtpu4_eh_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_UDP, + sizeof(ice_fdir_udp4_gtpu4_eh_pkt), + ice_fdir_udp4_gtpu4_eh_pkt, + sizeof(ice_fdir_udp4_gtpu4_eh_pkt), + ice_fdir_udp4_gtpu4_eh_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_TCP, + sizeof(ice_fdir_tcp4_gtpu4_eh_pkt), + ice_fdir_tcp4_gtpu4_eh_pkt, + sizeof(ice_fdir_tcp4_gtpu4_eh_pkt), + ice_fdir_tcp4_gtpu4_eh_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4, + sizeof(ice_fdir_ipv4_gtpu4_eh_dw_pkt), + ice_fdir_ipv4_gtpu4_eh_dw_pkt, + sizeof(ice_fdir_ipv4_gtpu4_eh_dw_pkt), + ice_fdir_ipv4_gtpu4_eh_dw_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4_UDP, + sizeof(ice_fdir_udp4_gtpu4_eh_dw_pkt), + ice_fdir_udp4_gtpu4_eh_dw_pkt, + sizeof(ice_fdir_udp4_gtpu4_eh_dw_pkt), + ice_fdir_udp4_gtpu4_eh_dw_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4_TCP, + sizeof(ice_fdir_tcp4_gtpu4_eh_dw_pkt), + ice_fdir_tcp4_gtpu4_eh_dw_pkt, + sizeof(ice_fdir_tcp4_gtpu4_eh_dw_pkt), + ice_fdir_tcp4_gtpu4_eh_dw_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4, + sizeof(ice_fdir_ipv4_gtpu4_eh_up_pkt), + ice_fdir_ipv4_gtpu4_eh_up_pkt, + sizeof(ice_fdir_ipv4_gtpu4_eh_up_pkt), + ice_fdir_ipv4_gtpu4_eh_up_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4_UDP, + sizeof(ice_fdir_udp4_gtpu4_eh_up_pkt), + ice_fdir_udp4_gtpu4_eh_up_pkt, + sizeof(ice_fdir_udp4_gtpu4_eh_up_pkt), + ice_fdir_udp4_gtpu4_eh_up_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4_TCP, + sizeof(ice_fdir_tcp4_gtpu4_eh_up_pkt), + ice_fdir_tcp4_gtpu4_eh_up_pkt, + sizeof(ice_fdir_tcp4_gtpu4_eh_up_pkt), + ice_fdir_tcp4_gtpu4_eh_up_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_ICMP, + sizeof(ice_fdir_icmp4_gtpu4_pkt), + ice_fdir_icmp4_gtpu4_pkt, + sizeof(ice_fdir_icmp4_gtpu4_pkt), + ice_fdir_icmp4_gtpu4_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_OTHER, + sizeof(ice_fdir_ipv4_gtpu4_pkt), + ice_fdir_ipv4_gtpu4_pkt, + sizeof(ice_fdir_ipv4_gtpu4_pkt), + ice_fdir_ipv4_gtpu4_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6, + sizeof(ice_fdir_ipv6_gtpu4_pkt), + ice_fdir_ipv6_gtpu4_pkt, + 
sizeof(ice_fdir_ipv6_gtpu4_pkt), + ice_fdir_ipv6_gtpu4_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6_UDP, + sizeof(ice_fdir_udp6_gtpu4_pkt), + ice_fdir_udp6_gtpu4_pkt, + sizeof(ice_fdir_udp6_gtpu4_pkt), + ice_fdir_udp6_gtpu4_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6_TCP, + sizeof(ice_fdir_tcp6_gtpu4_pkt), + ice_fdir_tcp6_gtpu4_pkt, + sizeof(ice_fdir_tcp6_gtpu4_pkt), + ice_fdir_tcp6_gtpu4_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_GTPU, + sizeof(ice_fdir_ipv6_gtpu6_pkt), + ice_fdir_ipv6_gtpu6_pkt, + sizeof(ice_fdir_ipv6_gtpu6_pkt), + ice_fdir_ipv6_gtpu6_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH, + sizeof(ice_fdir_ipv6_gtpu6_eh_pkt), + ice_fdir_ipv6_gtpu6_eh_pkt, + sizeof(ice_fdir_ipv6_gtpu6_eh_pkt), + ice_fdir_ipv6_gtpu6_eh_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_DW, + sizeof(ice_fdir_ipv6_gtpu6_eh_dw_pkt), + ice_fdir_ipv6_gtpu6_eh_dw_pkt, + sizeof(ice_fdir_ipv6_gtpu6_eh_dw_pkt), + ice_fdir_ipv6_gtpu6_eh_dw_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_UP, + sizeof(ice_fdir_ipv6_gtpu6_eh_up_pkt), + ice_fdir_ipv6_gtpu6_eh_up_pkt, + sizeof(ice_fdir_ipv6_gtpu6_eh_up_pkt), + ice_fdir_ipv6_gtpu6_eh_up_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_IPV6_OTHER, + sizeof(ice_fdir_ipv6_gtpu6_pkt), + ice_fdir_ipv6_gtpu6_pkt, + sizeof(ice_fdir_ipv6_gtpu6_pkt), + ice_fdir_ipv6_gtpu6_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_OTHER, + sizeof(ice_fdir_ipv4_gtpu4_eh_pkt), + ice_fdir_ipv4_gtpu4_eh_pkt, + sizeof(ice_fdir_ipv4_gtpu4_eh_pkt), + ice_fdir_ipv4_gtpu4_eh_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_IPV6_OTHER, + sizeof(ice_fdir_ipv6_gtpu6_eh_pkt), + ice_fdir_ipv6_gtpu6_eh_pkt, + sizeof(ice_fdir_ipv6_gtpu6_eh_pkt), + ice_fdir_ipv6_gtpu6_eh_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_L2TPV3, + sizeof(ice_fdir_ipv4_l2tpv3_pkt), ice_fdir_ipv4_l2tpv3_pkt, + sizeof(ice_fdir_ipv4_l2tpv3_pkt), ice_fdir_ipv4_l2tpv3_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_L2TPV3, + sizeof(ice_fdir_ipv6_l2tpv3_pkt), ice_fdir_ipv6_l2tpv3_pkt, + sizeof(ice_fdir_ipv6_l2tpv3_pkt), ice_fdir_ipv6_l2tpv3_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_ESP, + sizeof(ice_fdir_ipv4_esp_pkt), ice_fdir_ipv4_esp_pkt, + sizeof(ice_fdir_ipv4_esp_pkt), ice_fdir_ipv4_esp_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_ESP, + sizeof(ice_fdir_ipv6_esp_pkt), ice_fdir_ipv6_esp_pkt, + sizeof(ice_fdir_ipv6_esp_pkt), ice_fdir_ipv6_esp_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_AH, + sizeof(ice_fdir_ipv4_ah_pkt), ice_fdir_ipv4_ah_pkt, + sizeof(ice_fdir_ipv4_ah_pkt), ice_fdir_ipv4_ah_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_AH, + sizeof(ice_fdir_ipv6_ah_pkt), ice_fdir_ipv6_ah_pkt, + sizeof(ice_fdir_ipv6_ah_pkt), ice_fdir_ipv6_ah_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_NAT_T_ESP, + sizeof(ice_fdir_ipv4_nat_t_esp_pkt), + ice_fdir_ipv4_nat_t_esp_pkt, + sizeof(ice_fdir_ipv4_nat_t_esp_pkt), + ice_fdir_ipv4_nat_t_esp_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_NAT_T_ESP, + sizeof(ice_fdir_ipv6_nat_t_esp_pkt), + ice_fdir_ipv6_nat_t_esp_pkt, + sizeof(ice_fdir_ipv6_nat_t_esp_pkt), + ice_fdir_ipv6_nat_t_esp_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_PFCP_NODE, + sizeof(ice_fdir_ipv4_pfcp_node_pkt), + ice_fdir_ipv4_pfcp_node_pkt, + sizeof(ice_fdir_ipv4_pfcp_node_pkt), + ice_fdir_ipv4_pfcp_node_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_PFCP_SESSION, + sizeof(ice_fdir_ipv4_pfcp_session_pkt), + ice_fdir_ipv4_pfcp_session_pkt, + sizeof(ice_fdir_ipv4_pfcp_session_pkt), + ice_fdir_ipv4_pfcp_session_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_PFCP_NODE, + sizeof(ice_fdir_ipv6_pfcp_node_pkt), + ice_fdir_ipv6_pfcp_node_pkt, + 
sizeof(ice_fdir_ipv6_pfcp_node_pkt), + ice_fdir_ipv6_pfcp_node_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_PFCP_SESSION, + sizeof(ice_fdir_ipv6_pfcp_session_pkt), + ice_fdir_ipv6_pfcp_session_pkt, + sizeof(ice_fdir_ipv6_pfcp_session_pkt), + ice_fdir_ipv6_pfcp_session_pkt, + }, + { + ICE_FLTR_PTYPE_NON_IP_L2, + sizeof(ice_fdir_non_ip_l2_pkt), ice_fdir_non_ip_l2_pkt, + sizeof(ice_fdir_non_ip_l2_pkt), ice_fdir_non_ip_l2_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_UDP_VXLAN, + sizeof(ice_fdir_udp4_vxlan_pkt), ice_fdir_udp4_vxlan_pkt, + sizeof(ice_fdir_udp4_vxlan_pkt), ice_fdir_udp4_vxlan_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_ECPRI_TP0, + sizeof(ice_fdir_ecpri_tp0_pkt), ice_fdir_ecpri_tp0_pkt, + sizeof(ice_fdir_ecpri_tp0_pkt), ice_fdir_ecpri_tp0_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_UDP_ECPRI_TP0, + sizeof(ice_fdir_ipv4_udp_ecpri_tp0_pkt), + ice_fdir_ipv4_udp_ecpri_tp0_pkt, + sizeof(ice_fdir_ipv4_udp_ecpri_tp0_pkt), + ice_fdir_ipv4_udp_ecpri_tp0_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_TCP, + sizeof(ice_fdir_tcpv6_pkt), ice_fdir_tcpv6_pkt, + sizeof(ice_fdir_tcp6_tun_pkt), ice_fdir_tcp6_tun_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_UDP, + sizeof(ice_fdir_udpv6_pkt), ice_fdir_udpv6_pkt, + sizeof(ice_fdir_udp6_tun_pkt), ice_fdir_udp6_tun_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_SCTP, + sizeof(ice_fdir_sctpv6_pkt), ice_fdir_sctpv6_pkt, + sizeof(ice_fdir_sctp6_tun_pkt), ice_fdir_sctp6_tun_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_OTHER, + sizeof(ice_fdir_ipv6_pkt), ice_fdir_ipv6_pkt, + sizeof(ice_fdir_ip6_tun_pkt), ice_fdir_ip6_tun_pkt, + }, +}; + +#define ICE_FDIR_NUM_PKT ARRAY_SIZE(ice_fdir_pkt) + +/** + * ice_set_dflt_val_fd_desc + * @fd_fltr_ctx: pointer to fd filter descriptor + */ +void ice_set_dflt_val_fd_desc(struct ice_fd_fltr_desc_ctx *fd_fltr_ctx) +{ + fd_fltr_ctx->comp_q = ICE_FXD_FLTR_QW0_COMP_Q_ZERO; + fd_fltr_ctx->comp_report = ICE_FXD_FLTR_QW0_COMP_REPORT_SW_FAIL; + fd_fltr_ctx->fd_space = ICE_FXD_FLTR_QW0_FD_SPACE_GUAR_BEST; + fd_fltr_ctx->cnt_ena = ICE_FXD_FLTR_QW0_STAT_ENA_PKTS; + fd_fltr_ctx->evict_ena = ICE_FXD_FLTR_QW0_EVICT_ENA_TRUE; + fd_fltr_ctx->toq = ICE_FXD_FLTR_QW0_TO_Q_EQUALS_QINDEX; + fd_fltr_ctx->toq_prio = ICE_FXD_FLTR_QW0_TO_Q_PRIO1; + fd_fltr_ctx->dpu_recipe = ICE_FXD_FLTR_QW0_DPU_RECIPE_DFLT; + fd_fltr_ctx->drop = ICE_FXD_FLTR_QW0_DROP_NO; + fd_fltr_ctx->flex_prio = ICE_FXD_FLTR_QW0_FLEX_PRI_NONE; + fd_fltr_ctx->flex_mdid = ICE_FXD_FLTR_QW0_FLEX_MDID0; + fd_fltr_ctx->flex_val = ICE_FXD_FLTR_QW0_FLEX_VAL0; + fd_fltr_ctx->dtype = ICE_TX_DESC_DTYPE_FLTR_PROG; + fd_fltr_ctx->desc_prof_prio = ICE_FXD_FLTR_QW1_PROF_PRIO_ZERO; + fd_fltr_ctx->desc_prof = ICE_FXD_FLTR_QW1_PROF_ZERO; + fd_fltr_ctx->swap = ICE_FXD_FLTR_QW1_SWAP_SET; + fd_fltr_ctx->fdid_prio = ICE_FXD_FLTR_QW1_FDID_PRI_ONE; + fd_fltr_ctx->fdid_mdid = ICE_FXD_FLTR_QW1_FDID_MDID_FD; + fd_fltr_ctx->fdid = ICE_FXD_FLTR_QW1_FDID_ZERO; +} + +/** + * ice_set_fd_desc_val + * @ctx: pointer to fd filter descriptor context + * @fdir_desc: populated with fd filter descriptor values + */ +void +ice_set_fd_desc_val(struct ice_fd_fltr_desc_ctx *ctx, + struct ice_fltr_desc *fdir_desc) +{ + u64 qword; + + /* prep QW0 of FD filter programming desc */ + qword = ((u64)ctx->qindex << ICE_FXD_FLTR_QW0_QINDEX_S) & + ICE_FXD_FLTR_QW0_QINDEX_M; + qword |= ((u64)ctx->comp_q << ICE_FXD_FLTR_QW0_COMP_Q_S) & + ICE_FXD_FLTR_QW0_COMP_Q_M; + qword |= ((u64)ctx->comp_report << ICE_FXD_FLTR_QW0_COMP_REPORT_S) & + ICE_FXD_FLTR_QW0_COMP_REPORT_M; + qword |= ((u64)ctx->fd_space << ICE_FXD_FLTR_QW0_FD_SPACE_S) & + ICE_FXD_FLTR_QW0_FD_SPACE_M; + qword |= 
((u64)ctx->cnt_index << ICE_FXD_FLTR_QW0_STAT_CNT_S) & + ICE_FXD_FLTR_QW0_STAT_CNT_M; + qword |= ((u64)ctx->cnt_ena << ICE_FXD_FLTR_QW0_STAT_ENA_S) & + ICE_FXD_FLTR_QW0_STAT_ENA_M; + qword |= ((u64)ctx->evict_ena << ICE_FXD_FLTR_QW0_EVICT_ENA_S) & + ICE_FXD_FLTR_QW0_EVICT_ENA_M; + qword |= ((u64)ctx->toq << ICE_FXD_FLTR_QW0_TO_Q_S) & + ICE_FXD_FLTR_QW0_TO_Q_M; + qword |= ((u64)ctx->toq_prio << ICE_FXD_FLTR_QW0_TO_Q_PRI_S) & + ICE_FXD_FLTR_QW0_TO_Q_PRI_M; + qword |= ((u64)ctx->dpu_recipe << ICE_FXD_FLTR_QW0_DPU_RECIPE_S) & + ICE_FXD_FLTR_QW0_DPU_RECIPE_M; + qword |= ((u64)ctx->drop << ICE_FXD_FLTR_QW0_DROP_S) & + ICE_FXD_FLTR_QW0_DROP_M; + qword |= ((u64)ctx->flex_prio << ICE_FXD_FLTR_QW0_FLEX_PRI_S) & + ICE_FXD_FLTR_QW0_FLEX_PRI_M; + qword |= ((u64)ctx->flex_mdid << ICE_FXD_FLTR_QW0_FLEX_MDID_S) & + ICE_FXD_FLTR_QW0_FLEX_MDID_M; + qword |= ((u64)ctx->flex_val << ICE_FXD_FLTR_QW0_FLEX_VAL_S) & + ICE_FXD_FLTR_QW0_FLEX_VAL_M; + fdir_desc->qidx_compq_space_stat = cpu_to_le64(qword); + + /* prep QW1 of FD filter programming desc */ + qword = ((u64)ctx->dtype << ICE_FXD_FLTR_QW1_DTYPE_S) & + ICE_FXD_FLTR_QW1_DTYPE_M; + qword |= ((u64)ctx->pcmd << ICE_FXD_FLTR_QW1_PCMD_S) & + ICE_FXD_FLTR_QW1_PCMD_M; + qword |= ((u64)ctx->desc_prof_prio << ICE_FXD_FLTR_QW1_PROF_PRI_S) & + ICE_FXD_FLTR_QW1_PROF_PRI_M; + qword |= ((u64)ctx->desc_prof << ICE_FXD_FLTR_QW1_PROF_S) & + ICE_FXD_FLTR_QW1_PROF_M; + qword |= ((u64)ctx->fd_vsi << ICE_FXD_FLTR_QW1_FD_VSI_S) & + ICE_FXD_FLTR_QW1_FD_VSI_M; + qword |= ((u64)ctx->swap << ICE_FXD_FLTR_QW1_SWAP_S) & + ICE_FXD_FLTR_QW1_SWAP_M; + qword |= ((u64)ctx->fdid_prio << ICE_FXD_FLTR_QW1_FDID_PRI_S) & + ICE_FXD_FLTR_QW1_FDID_PRI_M; + qword |= ((u64)ctx->fdid_mdid << ICE_FXD_FLTR_QW1_FDID_MDID_S) & + ICE_FXD_FLTR_QW1_FDID_MDID_M; + qword |= ((u64)ctx->fdid << ICE_FXD_FLTR_QW1_FDID_S) & + ICE_FXD_FLTR_QW1_FDID_M; + fdir_desc->dtype_cmd_vsi_fdid = cpu_to_le64(qword); +} + +/** + * ice_fdir_get_prgm_desc - set a fdir descriptor from a fdir filter struct + * @hw: pointer to the hardware structure + * @input: filter + * @fdesc: filter descriptor + * @add: if add is true, this is an add operation, false implies delete + */ +void +ice_fdir_get_prgm_desc(struct ice_hw *hw, struct ice_fdir_fltr *input, + struct ice_fltr_desc *fdesc, bool add) +{ + struct ice_fd_fltr_desc_ctx fdir_fltr_ctx = { 0 }; + + /* set default context info */ + ice_set_dflt_val_fd_desc(&fdir_fltr_ctx); + + /* change sideband filtering values */ + fdir_fltr_ctx.fdid = input->fltr_id; + if (input->dest_ctl == ICE_FLTR_PRGM_DESC_DEST_DROP_PKT) { + fdir_fltr_ctx.drop = ICE_FXD_FLTR_QW0_DROP_YES; + fdir_fltr_ctx.qindex = 0; + } else if (input->dest_ctl == + ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_OTHER) { + fdir_fltr_ctx.drop = ICE_FXD_FLTR_QW0_DROP_NO; + fdir_fltr_ctx.qindex = 0; + } else { + if (input->dest_ctl == + ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_QGROUP) + fdir_fltr_ctx.toq = input->q_region; + fdir_fltr_ctx.drop = ICE_FXD_FLTR_QW0_DROP_NO; + fdir_fltr_ctx.qindex = input->q_index; + } + fdir_fltr_ctx.cnt_ena = input->cnt_ena; + fdir_fltr_ctx.cnt_index = input->cnt_index; + fdir_fltr_ctx.fd_vsi = ice_get_hw_vsi_num(hw, input->dest_vsi); + fdir_fltr_ctx.evict_ena = ICE_FXD_FLTR_QW0_EVICT_ENA_FALSE; + if (input->dest_ctl == ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_OTHER) + fdir_fltr_ctx.toq_prio = 0; + else + fdir_fltr_ctx.toq_prio = 3; + fdir_fltr_ctx.pcmd = add ? 
ICE_FXD_FLTR_QW1_PCMD_ADD : + ICE_FXD_FLTR_QW1_PCMD_REMOVE; + fdir_fltr_ctx.swap = ICE_FXD_FLTR_QW1_SWAP_NOT_SET; + fdir_fltr_ctx.comp_q = ICE_FXD_FLTR_QW0_COMP_Q_ZERO; + fdir_fltr_ctx.comp_report = input->comp_report; + fdir_fltr_ctx.fdid_prio = input->fdid_prio; + fdir_fltr_ctx.desc_prof = 1; + fdir_fltr_ctx.desc_prof_prio = 3; + ice_set_fd_desc_val(&fdir_fltr_ctx, fdesc); +} + +/** + * ice_alloc_fd_res_cntr - obtain counter resource for FD type + * @hw: pointer to the hardware structure + * @cntr_id: returns counter index + */ +enum ice_status ice_alloc_fd_res_cntr(struct ice_hw *hw, u16 *cntr_id) +{ + return ice_alloc_res_cntr(hw, ICE_AQC_RES_TYPE_FDIR_COUNTER_BLOCK, + ICE_AQC_RES_TYPE_FLAG_DEDICATED, 1, cntr_id); +} + +/** + * ice_free_fd_res_cntr - Free counter resource for FD type + * @hw: pointer to the hardware structure + * @cntr_id: counter index to be freed + */ +enum ice_status ice_free_fd_res_cntr(struct ice_hw *hw, u16 cntr_id) +{ + return ice_free_res_cntr(hw, ICE_AQC_RES_TYPE_FDIR_COUNTER_BLOCK, + ICE_AQC_RES_TYPE_FLAG_DEDICATED, 1, cntr_id); +} + +/** + * ice_alloc_fd_guar_item - allocate resource for FD guaranteed entries + * @hw: pointer to the hardware structure + * @cntr_id: returns counter index + * @num_fltr: number of filter entries to be allocated + */ +enum ice_status +ice_alloc_fd_guar_item(struct ice_hw *hw, u16 *cntr_id, u16 num_fltr) +{ + return ice_alloc_res_cntr(hw, ICE_AQC_RES_TYPE_FDIR_GUARANTEED_ENTRIES, + ICE_AQC_RES_TYPE_FLAG_DEDICATED, num_fltr, + cntr_id); +} + +/** + * ice_free_fd_guar_item - Free flow director guaranteed entries + * @hw: pointer to the hardware structure + * @cntr_id: counter index that needs to be freed + * @num_fltr: number of filters to be freed + */ +enum ice_status +ice_free_fd_guar_item(struct ice_hw *hw, u16 cntr_id, u16 num_fltr) +{ + return ice_free_res_cntr(hw, ICE_AQC_RES_TYPE_FDIR_GUARANTEED_ENTRIES, + ICE_AQC_RES_TYPE_FLAG_DEDICATED, num_fltr, + cntr_id); +} + +/** + * ice_alloc_fd_shrd_item - allocate resource for flow director shared entries + * @hw: pointer to the hardware structure + * @cntr_id: returns counter index + * @num_fltr: number of filter entries to be allocated + */ +enum ice_status +ice_alloc_fd_shrd_item(struct ice_hw *hw, u16 *cntr_id, u16 num_fltr) +{ + return ice_alloc_res_cntr(hw, ICE_AQC_RES_TYPE_FDIR_SHARED_ENTRIES, + ICE_AQC_RES_TYPE_FLAG_DEDICATED, num_fltr, + cntr_id); +} + +/** + * ice_free_fd_shrd_item - Free flow director shared entries + * @hw: pointer to the hardware structure + * @cntr_id: counter index that needs to be freed + * @num_fltr: number of filters to be freed + */ +enum ice_status +ice_free_fd_shrd_item(struct ice_hw *hw, u16 cntr_id, u16 num_fltr) +{ + return ice_free_res_cntr(hw, ICE_AQC_RES_TYPE_FDIR_SHARED_ENTRIES, + ICE_AQC_RES_TYPE_FLAG_DEDICATED, num_fltr, + cntr_id); +} + +/** + * ice_get_fdir_cnt_all - get the number of Flow Director filters + * @hw: hardware data structure + * + * Returns the number of filters available on device + */ +int ice_get_fdir_cnt_all(struct ice_hw *hw) +{ + return hw->func_caps.fd_fltr_guar + hw->func_caps.fd_fltr_best_effort; +} + +/** + * ice_pkt_insert_ipv6_addr - insert a be32 IPv6 address into a memory buffer. 
+ * @pkt: packet buffer + * @offset: offset into buffer + * @addr: IPv6 address to convert and insert into pkt at offset + */ +static void ice_pkt_insert_ipv6_addr(u8 *pkt, int offset, __be32 *addr) +{ + int idx; + + for (idx = 0; idx < ICE_IPV6_ADDR_LEN_AS_U32; idx++) + memcpy(pkt + offset + idx * sizeof(*addr), &addr[idx], + sizeof(*addr)); +} + +/** + * ice_pkt_insert_u6_qfi - insert a u6 value qfi into a memory buffer for gtpu + * @pkt: packet buffer + * @offset: offset into buffer + * @data: 8 bit value to convert and insert into pkt at offset + * + * This function is designed for inserting qfi (6 bits) for gtpu. + */ +static void ice_pkt_insert_u6_qfi(u8 *pkt, int offset, u8 data) +{ + u8 ret; + + ret = (data & 0x3F) + (*(pkt + offset) & 0xC0); + memcpy(pkt + offset, &ret, sizeof(ret)); +} + +/** + * ice_pkt_insert_u8 - insert a u8 value into a memory buffer. + * @pkt: packet buffer + * @offset: offset into buffer + * @data: 8 bit value to convert and insert into pkt at offset + */ +static void ice_pkt_insert_u8(u8 *pkt, int offset, u8 data) +{ + memcpy(pkt + offset, &data, sizeof(data)); +} + +/** + * ice_pkt_insert_u8_tc - insert a u8 value into a memory buffer for TC ipv6. + * @pkt: packet buffer + * @offset: offset into buffer + * @data: 8 bit value to convert and insert into pkt at offset + * + * This function is designed for inserting Traffic Class (TC) for IPv6, + * since that TC is not aligned in number of bytes. Here we split it out + * into two part and fill each byte with data copy from pkt, then insert + * the two bytes data one by one. + */ +static void ice_pkt_insert_u8_tc(u8 *pkt, int offset, u8 data) +{ + u8 high, low; + + high = (data >> 4) + (*(pkt + offset) & 0xF0); + memcpy(pkt + offset, &high, sizeof(high)); + + low = (*(pkt + offset + 1) & 0x0F) + ((data & 0x0F) << 4); + memcpy(pkt + offset + 1, &low, sizeof(low)); +} + +/** + * ice_pkt_insert_u16 - insert a be16 value into a memory buffer. + * @pkt: packet buffer + * @offset: offset into buffer + * @data: 16 bit value to convert and insert into pkt at offset + */ +static void ice_pkt_insert_u16(u8 *pkt, int offset, __be16 data) +{ + memcpy(pkt + offset, &data, sizeof(data)); +} + +/** + * ice_pkt_insert_u32 - insert a be32 value into a memory buffer. + * @pkt: packet buffer + * @offset: offset into buffer + * @data: 32 bit value to convert and insert into pkt at offset + */ +static void ice_pkt_insert_u32(u8 *pkt, int offset, __be32 data) +{ + memcpy(pkt + offset, &data, sizeof(data)); +} + +/** + * ice_pkt_insert_mac_addr - insert a MAC addr into a memory buffer. 
+ * @pkt: packet buffer + * @addr: MAC address to convert and insert into pkt at offset + */ +static void ice_pkt_insert_mac_addr(u8 *pkt, u8 *addr) +{ + ether_addr_copy(pkt, addr); +} + +/** + * ice_fdir_get_open_tunnel_port + * @hw: pointer to the hardware structure + * @flow: flow ptype + * @port: returns open port + * + * returns an open tunnel port specified for this flow type + */ +static enum ice_status +ice_fdir_get_open_tunnel_port(struct ice_hw *hw, enum ice_fltr_ptype flow, + u16 *port) +{ + switch (flow) { + case ICE_FLTR_PTYPE_NONF_IPV4_UDP_ECPRI_TP0: + /* eCPRI tunnel */ + if (!ice_get_open_tunnel_port(hw, TNL_ECPRI, port)) + return ICE_ERR_DOES_NOT_EXIST; + break; + default: + if (!ice_get_open_tunnel_port(hw, TNL_VXLAN, port) && + !ice_get_open_tunnel_port(hw, TNL_GENEVE, port)) + return ICE_ERR_DOES_NOT_EXIST; + } + + return 0; +} + +/** + * ice_fdir_get_gen_prgm_pkt - generate a training packet + * @hw: pointer to the hardware structure + * @input: flow director filter data structure + * @pkt: pointer to return filter packet + * @frag: generate a fragment packet + * @tun: true implies generate a tunnel packet + */ +enum ice_status +ice_fdir_get_gen_prgm_pkt(struct ice_hw *hw, struct ice_fdir_fltr *input, + u8 *pkt, bool frag, bool tun) +{ + enum ice_fltr_ptype flow; + u16 tnl_port; + u8 *loc; + u16 idx; + + if (input->flow_type == ICE_FLTR_PTYPE_NONF_IPV4_OTHER) { + switch (input->ip.v4.proto) { + case IPPROTO_TCP: + flow = ICE_FLTR_PTYPE_NONF_IPV4_TCP; + break; + case IPPROTO_UDP: + flow = ICE_FLTR_PTYPE_NONF_IPV4_UDP; + break; + case IPPROTO_SCTP: + flow = ICE_FLTR_PTYPE_NONF_IPV4_SCTP; + break; + default: + flow = ICE_FLTR_PTYPE_NONF_IPV4_OTHER; + break; + } + } else if (input->flow_type == ICE_FLTR_PTYPE_NONF_IPV6_OTHER) { + switch (input->ip.v6.proto) { + case IPPROTO_TCP: + flow = ICE_FLTR_PTYPE_NONF_IPV6_TCP; + break; + case IPPROTO_UDP: + flow = ICE_FLTR_PTYPE_NONF_IPV6_UDP; + break; + case IPPROTO_SCTP: + flow = ICE_FLTR_PTYPE_NONF_IPV6_SCTP; + break; + default: + flow = ICE_FLTR_PTYPE_NONF_IPV6_OTHER; + break; + } + } else { + flow = input->flow_type; + } + + for (idx = 0; idx < ICE_FDIR_NUM_PKT; idx++) + if (ice_fdir_pkt[idx].flow == flow) + break; + if (idx == ICE_FDIR_NUM_PKT) + return ICE_ERR_PARAM; + if (!tun) { + memcpy(pkt, ice_fdir_pkt[idx].pkt, ice_fdir_pkt[idx].pkt_len); + loc = pkt; + } else { + if (!ice_fdir_pkt[idx].tun_pkt) + return ICE_ERR_PARAM; + + switch (flow) { + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_TCP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6_TCP: + memcpy(pkt, ice_fdir_pkt[idx].tun_pkt, + ice_fdir_pkt[idx].tun_pkt_len); + loc = &pkt[ICE_FDIR_GTPU_IP_INNER_PKT_OFF]; + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_TCP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4_TCP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4_TCP: + memcpy(pkt, ice_fdir_pkt[idx].tun_pkt, + ice_fdir_pkt[idx].tun_pkt_len); + loc = &pkt[ICE_FDIR_GTPU_EH_INNER_PKT_OFF]; + break; + default: + if (ice_fdir_get_open_tunnel_port(hw, flow, &tnl_port)) + return ICE_ERR_DOES_NOT_EXIST; + + 
memcpy(pkt, ice_fdir_pkt[idx].tun_pkt, + ice_fdir_pkt[idx].tun_pkt_len); + ice_pkt_insert_u16(pkt, ICE_IPV4_UDP_DST_PORT_OFFSET, + htons(tnl_port)); + loc = &pkt[ICE_FDIR_TUN_PKT_OFF]; + break; + } + } + + /* Reverse the src and dst, since the HW expects them to be from Tx + * perspective. The input from user is from Rx filter perspective. + */ + switch (flow) { + case ICE_FLTR_PTYPE_NONF_IPV4_TCP: + ice_pkt_insert_u32(loc, ICE_IPV4_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u16(loc, ICE_IPV4_TCP_DST_PORT_OFFSET, + input->ip.v4.src_port); + ice_pkt_insert_u32(loc, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u16(loc, ICE_IPV4_TCP_SRC_PORT_OFFSET, + input->ip.v4.dst_port); + ice_pkt_insert_u8(loc, ICE_IPV4_TOS_OFFSET, input->ip.v4.tos); + ice_pkt_insert_u8(loc, ICE_IPV4_TTL_OFFSET, input->ip.v4.ttl); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + if (frag) + loc[20] = ICE_FDIR_IPV4_PKT_FLAG_MF; + break; + case ICE_FLTR_PTYPE_NONF_IPV4_UDP: + ice_pkt_insert_mac_addr(pkt, input->ext_data_outer.dst_mac); + ice_pkt_insert_mac_addr(pkt + ETH_ALEN, + input->ext_data_outer.src_mac); + ice_pkt_insert_u32(pkt, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip_outer.v4.dst_ip); + ice_pkt_insert_u32(pkt, ICE_IPV4_DST_ADDR_OFFSET, + input->ip_outer.v4.src_ip); + ice_pkt_insert_u8(pkt, ICE_IPV4_TOS_OFFSET, input->ip_outer.v4.tos); + ice_pkt_insert_u32(loc, ICE_IPV4_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u16(loc, ICE_IPV4_UDP_DST_PORT_OFFSET, + input->ip.v4.src_port); + ice_pkt_insert_u32(loc, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u16(loc, ICE_IPV4_UDP_SRC_PORT_OFFSET, + input->ip.v4.dst_port); + ice_pkt_insert_u8(loc, ICE_IPV4_TOS_OFFSET, input->ip.v4.tos); + ice_pkt_insert_u8(loc, ICE_IPV4_TTL_OFFSET, input->ip.v4.ttl); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + ice_pkt_insert_mac_addr(loc + ETH_ALEN, + input->ext_data.src_mac); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_SCTP: + ice_pkt_insert_u32(loc, ICE_IPV4_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u16(loc, ICE_IPV4_SCTP_DST_PORT_OFFSET, + input->ip.v4.src_port); + ice_pkt_insert_u32(loc, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u16(loc, ICE_IPV4_SCTP_SRC_PORT_OFFSET, + input->ip.v4.dst_port); + ice_pkt_insert_u8(loc, ICE_IPV4_TOS_OFFSET, input->ip.v4.tos); + ice_pkt_insert_u8(loc, ICE_IPV4_TTL_OFFSET, input->ip.v4.ttl); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_OTHER: + ice_pkt_insert_u32(loc, ICE_IPV4_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u32(loc, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u8(loc, ICE_IPV4_TOS_OFFSET, input->ip.v4.tos); + ice_pkt_insert_u8(loc, ICE_IPV4_TTL_OFFSET, input->ip.v4.ttl); + ice_pkt_insert_u8(loc, ICE_IPV4_PROTO_OFFSET, + input->ip.v4.proto); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_UDP_VXLAN: + ice_pkt_insert_mac_addr(pkt, input->ext_data_outer.dst_mac); + ice_pkt_insert_mac_addr(pkt + ETH_ALEN, input->ext_data_outer.src_mac); + ice_pkt_insert_u32(pkt, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip_outer.v4.dst_ip); + ice_pkt_insert_u32(pkt, ICE_IPV4_DST_ADDR_OFFSET, + input->ip_outer.v4.src_ip); + ice_pkt_insert_u8(pkt, ICE_IPV4_TOS_OFFSET, input->ip_outer.v4.tos); + ice_pkt_insert_u32(pkt, ICE_IPV4_VXLAN_VNI_OFFSET, + input->vxlan_data.vni); + ice_pkt_insert_u32(loc, ICE_IPV4_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + 
ice_pkt_insert_u16(loc, ICE_IPV4_UDP_DST_PORT_OFFSET, + input->ip.v4.src_port); + ice_pkt_insert_u32(loc, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u16(loc, ICE_IPV4_UDP_SRC_PORT_OFFSET, + input->ip.v4.dst_port); + ice_pkt_insert_u8(loc, ICE_IPV4_TOS_OFFSET, input->ip.v4.tos); + ice_pkt_insert_u8(loc, ICE_IPV4_TTL_OFFSET, input->ip.v4.ttl); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + ice_pkt_insert_mac_addr(loc + ETH_ALEN, input->ext_data.src_mac); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU: + ice_pkt_insert_u32(loc, ICE_IPV4_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u32(loc, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u32(loc, ICE_IPV4_GTPU_TEID_OFFSET, + input->gtpu_data.teid); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4: + ice_pkt_insert_u32(loc, ICE_IPV4_NO_MAC_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u32(loc, ICE_IPV4_NO_MAC_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u8(loc, ICE_IPV4_NO_MAC_TOS_OFFSET, input->ip.v4.tos); + ice_pkt_insert_u8(loc, ICE_IPV4_NO_MAC_TTL_OFFSET, input->ip.v4.ttl); + ice_pkt_insert_u8(loc, ICE_IPV4_NO_MAC_PROTO_OFFSET, + input->ip.v4.proto); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP: + ice_pkt_insert_u32(loc, ICE_IPV4_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u32(loc, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u32(loc, ICE_IPV4_GTPU_TEID_OFFSET, + input->gtpu_data.teid); + ice_pkt_insert_u6_qfi(loc, ICE_IPV4_GTPU_QFI_OFFSET, + input->gtpu_data.qfi); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4_UDP: + ice_pkt_insert_u32(loc, ICE_IPV4_NO_MAC_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u16(loc, ICE_UDP4_NO_MAC_DST_PORT_OFFSET, + input->ip.v4.src_port); + ice_pkt_insert_u32(loc, ICE_IPV4_NO_MAC_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u16(loc, ICE_UDP4_NO_MAC_SRC_PORT_OFFSET, + input->ip.v4.dst_port); + ice_pkt_insert_u8(loc, ICE_IPV4_NO_MAC_TOS_OFFSET, input->ip.v4.tos); + ice_pkt_insert_u8(loc, ICE_IPV4_NO_MAC_TTL_OFFSET, input->ip.v4.ttl); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_TCP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_TCP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4_TCP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4_TCP: + ice_pkt_insert_u32(loc, ICE_IPV4_NO_MAC_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u16(loc, ICE_TCP4_NO_MAC_DST_PORT_OFFSET, + input->ip.v4.src_port); + ice_pkt_insert_u32(loc, ICE_IPV4_NO_MAC_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u16(loc, ICE_TCP4_NO_MAC_SRC_PORT_OFFSET, + input->ip.v4.dst_port); + ice_pkt_insert_u8(loc, ICE_IPV4_NO_MAC_TOS_OFFSET, input->ip.v4.tos); + ice_pkt_insert_u8(loc, ICE_IPV4_NO_MAC_TTL_OFFSET, input->ip.v4.ttl); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_NO_MAC_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_NO_MAC_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u8_tc(loc, ICE_IPV6_NO_MAC_TC_OFFSET, input->ip.v6.tc); + ice_pkt_insert_u8(loc, ICE_IPV6_NO_MAC_HLIM_OFFSET, 
input->ip.v6.hlim); + ice_pkt_insert_u8(loc, ICE_IPV6_NO_MAC_PROTO_OFFSET, + input->ip.v6.proto); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6_UDP: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_NO_MAC_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_NO_MAC_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u16(loc, ICE_UDP6_NO_MAC_DST_PORT_OFFSET, + input->ip.v6.src_port); + ice_pkt_insert_u16(loc, ICE_UDP6_NO_MAC_SRC_PORT_OFFSET, + input->ip.v6.dst_port); + ice_pkt_insert_u8_tc(loc, ICE_IPV6_NO_MAC_TC_OFFSET, input->ip.v6.tc); + ice_pkt_insert_u8(loc, ICE_IPV6_NO_MAC_HLIM_OFFSET, input->ip.v6.hlim); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6_TCP: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_NO_MAC_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_NO_MAC_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u16(loc, ICE_TCP6_NO_MAC_DST_PORT_OFFSET, + input->ip.v6.src_port); + ice_pkt_insert_u16(loc, ICE_TCP6_NO_MAC_SRC_PORT_OFFSET, + input->ip.v6.dst_port); + ice_pkt_insert_u8_tc(loc, ICE_IPV6_NO_MAC_TC_OFFSET, input->ip.v6.tc); + ice_pkt_insert_u8(loc, ICE_IPV6_NO_MAC_HLIM_OFFSET, input->ip.v6.hlim); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU: + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_IPV6_OTHER: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u32(loc, ICE_IPV6_GTPU_TEID_OFFSET, + input->gtpu_data.teid); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH: + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_DW: + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_UP: + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_IPV6_OTHER: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u32(loc, ICE_IPV6_GTPU_TEID_OFFSET, + input->gtpu_data.teid); + ice_pkt_insert_u6_qfi(loc, ICE_IPV6_GTPU_QFI_OFFSET, + input->gtpu_data.qfi); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_L2TPV3: + ice_pkt_insert_u32(loc, ICE_IPV4_L2TPV3_SESS_ID_OFFSET, + input->l2tpv3_data.session_id); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_L2TPV3: + ice_pkt_insert_u32(loc, ICE_IPV6_L2TPV3_SESS_ID_OFFSET, + input->l2tpv3_data.session_id); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_ESP: + ice_pkt_insert_u32(loc, ICE_IPV4_ESP_SPI_OFFSET, + input->ip.v4.sec_parm_idx); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_ESP: + ice_pkt_insert_u32(loc, ICE_IPV6_ESP_SPI_OFFSET, + input->ip.v6.sec_parm_idx); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_AH: + ice_pkt_insert_u32(loc, ICE_IPV4_AH_SPI_OFFSET, + input->ip.v4.sec_parm_idx); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_AH: + ice_pkt_insert_u32(loc, ICE_IPV6_AH_SPI_OFFSET, + input->ip.v6.sec_parm_idx); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_NAT_T_ESP: + ice_pkt_insert_u32(loc, ICE_IPV4_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u32(loc, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u32(loc, ICE_IPV4_NAT_T_ESP_SPI_OFFSET, + input->ip.v4.sec_parm_idx); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_NAT_T_ESP: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u32(loc, ICE_IPV6_NAT_T_ESP_SPI_OFFSET, + input->ip.v6.sec_parm_idx); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_PFCP_NODE: + case ICE_FLTR_PTYPE_NONF_IPV4_PFCP_SESSION: + 
ice_pkt_insert_u16(loc, ICE_IPV4_UDP_SRC_PORT_OFFSET, + input->ip.v4.dst_port); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_PFCP_NODE: + case ICE_FLTR_PTYPE_NONF_IPV6_PFCP_SESSION: + ice_pkt_insert_u16(loc, ICE_IPV6_UDP_SRC_PORT_OFFSET, + input->ip.v6.dst_port); + break; + case ICE_FLTR_PTYPE_NON_IP_L2: + ice_pkt_insert_u16(loc, ICE_MAC_ETHTYPE_OFFSET, + input->ext_data.ether_type); + break; + case ICE_FLTR_PTYPE_NONF_ECPRI_TP0: + ice_pkt_insert_u16(loc, ICE_ECPRI_TP0_PC_ID_OFFSET, + input->ecpri_data.pc_id); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_UDP_ECPRI_TP0: + /* Use pkt instead of loc, since PC_ID is in outter part */ + ice_pkt_insert_u16(pkt, ICE_IPV4_UDP_ECPRI_TP0_PC_ID_OFFSET, + input->ecpri_data.pc_id); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_TCP: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u16(loc, ICE_IPV6_TCP_DST_PORT_OFFSET, + input->ip.v6.src_port); + ice_pkt_insert_u16(loc, ICE_IPV6_TCP_SRC_PORT_OFFSET, + input->ip.v6.dst_port); + ice_pkt_insert_u8_tc(loc, ICE_IPV6_TC_OFFSET, input->ip.v6.tc); + ice_pkt_insert_u8(loc, ICE_IPV6_HLIM_OFFSET, input->ip.v6.hlim); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_UDP: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u16(loc, ICE_IPV6_UDP_DST_PORT_OFFSET, + input->ip.v6.src_port); + ice_pkt_insert_u16(loc, ICE_IPV6_UDP_SRC_PORT_OFFSET, + input->ip.v6.dst_port); + ice_pkt_insert_u8_tc(loc, ICE_IPV6_TC_OFFSET, input->ip.v6.tc); + ice_pkt_insert_u8(loc, ICE_IPV6_HLIM_OFFSET, input->ip.v6.hlim); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_SCTP: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u16(loc, ICE_IPV6_SCTP_DST_PORT_OFFSET, + input->ip.v6.src_port); + ice_pkt_insert_u16(loc, ICE_IPV6_SCTP_SRC_PORT_OFFSET, + input->ip.v6.dst_port); + ice_pkt_insert_u8_tc(loc, ICE_IPV6_TC_OFFSET, input->ip.v6.tc); + ice_pkt_insert_u8(loc, ICE_IPV6_HLIM_OFFSET, input->ip.v6.hlim); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_OTHER: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u8_tc(loc, ICE_IPV6_TC_OFFSET, input->ip.v6.tc); + ice_pkt_insert_u8(loc, ICE_IPV6_HLIM_OFFSET, input->ip.v6.hlim); + ice_pkt_insert_u8(loc, ICE_IPV6_PROTO_OFFSET, + input->ip.v6.proto); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + break; + case ICE_FLTR_PTYPE_FRAG_IPV4: + ice_pkt_insert_u32(loc, ICE_IPV4_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u32(loc, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u8(loc, ICE_IPV4_TOS_OFFSET, input->ip.v4.tos); + ice_pkt_insert_u16(loc, ICE_IPV4_ID_OFFSET, + input->ip.v4.packet_id); + ice_pkt_insert_u8(loc, ICE_IPV4_TTL_OFFSET, input->ip.v4.ttl); + ice_pkt_insert_u8(loc, ICE_IPV4_PROTO_OFFSET, + input->ip.v4.proto); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + break; + case ICE_FLTR_PTYPE_FRAG_IPV6: + ice_pkt_insert_u32(loc, ICE_IPV6_ID_OFFSET, + input->ip.v6.packet_id); + break; + default: + 
return ICE_ERR_PARAM; + } + + if (input->flex_fltr) + ice_pkt_insert_u16(loc, input->flex_offset, input->flex_word); + + return 0; +} + +/** + * ice_fdir_get_prgm_pkt - generate a training packet + * @input: flow director filter data structure + * @pkt: pointer to return filter packet + * @frag: generate a fragment packet + */ +enum ice_status +ice_fdir_get_prgm_pkt(struct ice_fdir_fltr *input, u8 *pkt, bool frag) +{ + return ice_fdir_get_gen_prgm_pkt(NULL, input, pkt, frag, false); +} + +/** + * ice_fdir_has_frag - does flow type have 2 ptypes + * @flow: flow ptype + * + * returns true is there is a fragment packet for this ptype + */ +bool ice_fdir_has_frag(enum ice_fltr_ptype flow) +{ + if (flow == ICE_FLTR_PTYPE_FRAG_IPV4 || + flow == ICE_FLTR_PTYPE_FRAG_IPV6) + return true; + else + return false; +} + +/** + * ice_fdir_find_fltr_by_idx - find filter with idx + * @hw: pointer to hardware structure + * @fltr_idx: index to find. + * + * Returns pointer to filter if found or null + */ +struct ice_fdir_fltr * +ice_fdir_find_fltr_by_idx(struct ice_hw *hw, u32 fltr_idx) +{ + struct ice_fdir_fltr *rule; + + list_for_each_entry(rule, &hw->fdir_list_head, fltr_node) { + /* rule ID found in the list */ + if (fltr_idx == rule->fltr_id) + return rule; + if (fltr_idx < rule->fltr_id) + break; + } + return NULL; +} + +/** + * ice_fdir_list_add_fltr - add a new node to the flow director filter list + * @hw: hardware structure + * @fltr: filter node to add to structure + */ +void ice_fdir_list_add_fltr(struct ice_hw *hw, struct ice_fdir_fltr *fltr) +{ + struct ice_fdir_fltr *rule, *parent = NULL; + + list_for_each_entry(rule, &hw->fdir_list_head, fltr_node) { + /* rule ID found or pass its spot in the list */ + if (rule->fltr_id >= fltr->fltr_id) + break; + parent = rule; + } + + if (parent) + list_add(&fltr->fltr_node, &parent->fltr_node); + else + list_add(&fltr->fltr_node, &hw->fdir_list_head); +} + +/** + * ice_fdir_update_cntrs - increment / decrement filter counter + * @hw: pointer to hardware structure + * @flow: filter flow type + * @acl_fltr: true indicates an ACL filter + * @add: true implies filters added + */ +void +ice_fdir_update_cntrs(struct ice_hw *hw, enum ice_fltr_ptype flow, + bool acl_fltr, bool add) +{ + int incr; + + incr = add ? 1 : -1; + hw->fdir_active_fltr += incr; + if (flow == ICE_FLTR_PTYPE_NONF_NONE || flow >= ICE_FLTR_PTYPE_MAX) { + ice_debug(hw, ICE_DBG_SW, "Unknown filter type %d\n", flow); + } else { + if (acl_fltr) + hw->acl_fltr_cnt[flow] += incr; + else + hw->fdir_fltr_cnt[flow] += incr; + } +} + +/** + * ice_cmp_ipv6_addr - compare 2 IP v6 addresses + * @a: IP v6 address + * @b: IP v6 address + * + * Returns 0 on equal, returns non-0 if different + */ +static int ice_cmp_ipv6_addr(__be32 *a, __be32 *b) +{ + return memcmp(a, b, 4 * sizeof(__be32)); +} + +/** + * ice_fdir_comp_rules - compare 2 filters + * @a: a Flow Director filter data structure + * @b: a Flow Director filter data structure + * @v6: bool true if v6 filter + * + * Returns true if the filters match + */ +static bool +ice_fdir_comp_rules(struct ice_fdir_fltr *a, struct ice_fdir_fltr *b, bool v6) +{ + enum ice_fltr_ptype flow_type = a->flow_type; + + /* The calling function already checks that the two filters have the + * same flow_type. 
+ */ + if (!v6) { + if (flow_type == ICE_FLTR_PTYPE_NONF_IPV4_TCP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV4_UDP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV4_SCTP) { + if (a->ip.v4.dst_ip == b->ip.v4.dst_ip && + a->ip.v4.src_ip == b->ip.v4.src_ip && + a->ip.v4.dst_port == b->ip.v4.dst_port && + a->ip.v4.src_port == b->ip.v4.src_port) + return true; + } else if (flow_type == ICE_FLTR_PTYPE_NONF_IPV4_OTHER) { + if (a->ip.v4.dst_ip == b->ip.v4.dst_ip && + a->ip.v4.src_ip == b->ip.v4.src_ip && + a->ip.v4.l4_header == b->ip.v4.l4_header && + a->ip.v4.proto == b->ip.v4.proto && + a->ip.v4.ip_ver == b->ip.v4.ip_ver && + a->ip.v4.tos == b->ip.v4.tos) + return true; + } + } else { + if (flow_type == ICE_FLTR_PTYPE_NONF_IPV6_UDP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV6_TCP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV6_SCTP) { + if (a->ip.v6.dst_port == b->ip.v6.dst_port && + a->ip.v6.src_port == b->ip.v6.src_port && + !ice_cmp_ipv6_addr(a->ip.v6.dst_ip, + b->ip.v6.dst_ip) && + !ice_cmp_ipv6_addr(a->ip.v6.src_ip, + b->ip.v6.src_ip)) + return true; + } else if (flow_type == ICE_FLTR_PTYPE_NONF_IPV6_OTHER) { + if (a->ip.v6.dst_port == b->ip.v6.dst_port && + a->ip.v6.src_port == b->ip.v6.src_port) + return true; + } + } + + return false; +} + +/** + * ice_fdir_is_dup_fltr - test if filter is already in list for PF + * @hw: hardware data structure + * @input: Flow Director filter data structure + * + * Returns true if the filter is found in the list + */ +bool ice_fdir_is_dup_fltr(struct ice_hw *hw, struct ice_fdir_fltr *input) +{ + struct ice_fdir_fltr *rule; + bool ret = false; + + list_for_each_entry(rule, &hw->fdir_list_head, fltr_node) { + enum ice_fltr_ptype flow_type; + + if (rule->flow_type != input->flow_type) + continue; + + flow_type = input->flow_type; + if (flow_type == ICE_FLTR_PTYPE_NONF_IPV4_TCP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV4_UDP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV4_SCTP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV4_OTHER) + ret = ice_fdir_comp_rules(rule, input, false); + else + ret = ice_fdir_comp_rules(rule, input, true); + if (ret) { + if (rule->fltr_id == input->fltr_id && + rule->q_index != input->q_index) + ret = false; + else + break; + } + } + + return ret; +} + +/** + * ice_clear_vsi_fd_table - admin command to clear FD table for a VSI + * @hw: hardware data structure + * @vsi_num: vsi_num (HW VSI num) + * + * Clears FD table entries by issuing admin command (direct, 0x0B06) + * Must to pass valid vsi_num as returned by "AddVSI". 
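Taken together, ice_fdir_comp_rules() and ice_fdir_is_dup_fltr() let a caller refuse a rule that would program the same match twice. A hedged sketch of how a caller might combine that check with the list and counter helpers earlier in this file; the wrapper name and the -EEXIST errno are illustrative and not taken from the patch:

/*
 * Sketch only: reject duplicates, then insert the node in fltr_id order
 * and bump the per-flow counters.
 */
static int ice_fdir_sketch_add_rule(struct ice_hw *hw,
				    struct ice_fdir_fltr *input)
{
	if (ice_fdir_is_dup_fltr(hw, input))
		return -EEXIST;

	/* keeps hw->fdir_list_head sorted by fltr_id */
	ice_fdir_list_add_fltr(hw, input);
	ice_fdir_update_cntrs(hw, input->flow_type, false, true);

	return 0;
}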
+ */ +enum ice_status ice_clear_vsi_fd_table(struct ice_hw *hw, u16 vsi_num) +{ + struct ice_aqc_clear_fd_table *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.clear_fd_table; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_clear_fd_table); + cmd->clear_type = CL_FD_VM_VF_TYPE_VSI_IDX; + + cmd->vsi_index = cpu_to_le16(vsi_num); + return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); +} + +/** + * ice_clear_pf_fd_table - admin command to clear FD table for PF + * @hw: hardware data structure + * + * Clears FD table entries for a PF by issuing admin command (direct, 0x0B06) + */ +enum ice_status ice_clear_pf_fd_table(struct ice_hw *hw) +{ + struct ice_aqc_clear_fd_table *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.clear_fd_table; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_clear_fd_table); + cmd->clear_type = CL_FD_VM_VF_TYPE_PF_IDX; + /* vsi_index must be 0 to clear FD table for a PF */ + cmd->vsi_index = cpu_to_le16(0); + + return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); +} diff --git a/drivers/net/ethernet/intel/ice/ice_fdir.h b/drivers/net/ethernet/intel/ice/ice_fdir.h new file mode 100644 index 000000000000..7bdf45458fbb --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_fdir.h @@ -0,0 +1,310 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_FDIR_H_ +#define _ICE_FDIR_H_ + +#include "ice_common.h" + + +#define ICE_FDIR_GTPU_IP_INNER_PKT_OFF 50 +#define ICE_FDIR_GTPU_EH_INNER_PKT_OFF 58 + +#define ICE_FDIR_TUN_PKT_OFF 50 +#define ICE_FDIR_MAX_RAW_PKT_SIZE (512 + ICE_FDIR_TUN_PKT_OFF) +#define ICE_FDIR_BUF_FULL_MARGIN 10 + +/* macros for offsets into packets for flow director programming */ +#define ICE_IPV4_SRC_ADDR_OFFSET 26 +#define ICE_IPV4_DST_ADDR_OFFSET 30 +#define ICE_IPV4_TCP_SRC_PORT_OFFSET 34 +#define ICE_IPV4_TCP_DST_PORT_OFFSET 36 +#define ICE_IPV4_UDP_SRC_PORT_OFFSET 34 +#define ICE_IPV4_UDP_DST_PORT_OFFSET 36 +#define ICE_IPV4_SCTP_SRC_PORT_OFFSET 34 +#define ICE_IPV4_SCTP_DST_PORT_OFFSET 36 +#define ICE_IPV4_PROTO_OFFSET 23 +#define ICE_IPV6_SRC_ADDR_OFFSET 22 +#define ICE_IPV6_DST_ADDR_OFFSET 38 +#define ICE_IPV6_TCP_SRC_PORT_OFFSET 54 +#define ICE_IPV6_TCP_DST_PORT_OFFSET 56 +#define ICE_IPV6_UDP_SRC_PORT_OFFSET 54 +#define ICE_IPV6_UDP_DST_PORT_OFFSET 56 +#define ICE_IPV6_SCTP_SRC_PORT_OFFSET 54 +#define ICE_IPV6_SCTP_DST_PORT_OFFSET 56 + +#define ICE_MAC_ETHTYPE_OFFSET 12 +#define ICE_IPV4_TOS_OFFSET 15 +#define ICE_IPV4_ID_OFFSET 18 +#define ICE_IPV4_TTL_OFFSET 22 +#define ICE_IPV6_TC_OFFSET 14 +#define ICE_IPV6_HLIM_OFFSET 21 +#define ICE_IPV6_PROTO_OFFSET 20 +#define ICE_IPV6_ID_OFFSET 58 +/* For TUN inner (without inner MAC) */ +#define ICE_IPV4_NO_MAC_TOS_OFFSET 1 +#define ICE_IPV4_NO_MAC_TTL_OFFSET 8 +#define ICE_IPV4_NO_MAC_PROTO_OFFSET 9 +#define ICE_IPV4_NO_MAC_SRC_ADDR_OFFSET 12 +#define ICE_IPV4_NO_MAC_DST_ADDR_OFFSET 16 +#define ICE_TCP4_NO_MAC_SRC_PORT_OFFSET 20 +#define ICE_TCP4_NO_MAC_DST_PORT_OFFSET 22 +#define ICE_UDP4_NO_MAC_SRC_PORT_OFFSET 20 +#define ICE_UDP4_NO_MAC_DST_PORT_OFFSET 22 +#define ICE_IPV6_NO_MAC_TC_OFFSET 0 +#define ICE_IPV6_NO_MAC_HLIM_OFFSET 7 +#define ICE_IPV6_NO_MAC_PROTO_OFFSET 6 +#define ICE_IPV6_NO_MAC_SRC_ADDR_OFFSET 8 +#define ICE_IPV6_NO_MAC_DST_ADDR_OFFSET 24 +#define ICE_TCP6_NO_MAC_SRC_PORT_OFFSET 40 +#define ICE_TCP6_NO_MAC_DST_PORT_OFFSET 42 +#define ICE_UDP6_NO_MAC_SRC_PORT_OFFSET 40 +#define ICE_UDP6_NO_MAC_DST_PORT_OFFSET 42 +#define ICE_IPV4_GTPU_TEID_OFFSET 46 +#define ICE_IPV4_GTPU_QFI_OFFSET 56 +#define 
ICE_IPV6_GTPU_TEID_OFFSET 66 +#define ICE_IPV6_GTPU_QFI_OFFSET 76 +#define ICE_IPV4_L2TPV3_SESS_ID_OFFSET 34 +#define ICE_IPV6_L2TPV3_SESS_ID_OFFSET 54 +#define ICE_IPV4_ESP_SPI_OFFSET 34 +#define ICE_IPV6_ESP_SPI_OFFSET 54 +#define ICE_IPV4_AH_SPI_OFFSET 38 +#define ICE_IPV6_AH_SPI_OFFSET 58 +#define ICE_IPV4_NAT_T_ESP_SPI_OFFSET 42 +#define ICE_IPV6_NAT_T_ESP_SPI_OFFSET 62 +#define ICE_IPV4_VXLAN_VNI_OFFSET 45 +#define ICE_ECPRI_TP0_PC_ID_OFFSET 18 +#define ICE_IPV4_UDP_ECPRI_TP0_PC_ID_OFFSET 46 + +#define ICE_FDIR_MAX_FLTRS 16384 + +/* IPv4 has 2 flag bits that enable fragment processing: DF and MF. DF + * requests that the packet not be fragmented. MF indicates that a packet has + * been fragmented, except that for the last fragment has a non-zero + * Fragment Offset field with zero MF. + */ +#define ICE_FDIR_IPV4_PKT_FLAG_MF 0x20 +#define ICE_FDIR_IPV4_PKT_FLAG_MF_SHIFT 8 +#define ICE_FDIR_IPV4_PKT_FLAG_DF 0x40 + +/* For IPv6 fragmented packets, all fragments except the last have + * the MF flag set. + */ +#define ICE_FDIR_IPV6_PKT_FLAG_MF 0x100 +#define ICE_FDIR_IPV6_PKT_FLAG_MF_SHIFT 8 + +enum ice_fltr_prgm_desc_dest { + ICE_FLTR_PRGM_DESC_DEST_DROP_PKT, + ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_QINDEX, + ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_QGROUP, + ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_OTHER, +}; + +enum ice_fltr_prgm_desc_fd_status { + ICE_FLTR_PRGM_DESC_FD_STATUS_NONE, + ICE_FLTR_PRGM_DESC_FD_STATUS_FD_ID, + ICE_FLTR_PRGM_DESC_FD_STATUS_FD_ID_4FLEX_BYTES, + ICE_FLTR_PRGM_DESC_FD_STATUS_8FLEX_BYTES, +}; + +/* Flow Director (FD) Filter Programming descriptor */ +struct ice_fd_fltr_desc_ctx { + u32 fdid; + u16 qindex; + u16 cnt_index; + u16 fd_vsi; + u16 flex_val; + u8 comp_q; + u8 comp_report; + u8 fd_space; + u8 cnt_ena; + u8 evict_ena; + u8 toq; + u8 toq_prio; + u8 dpu_recipe; + u8 drop; + u8 flex_prio; + u8 flex_mdid; + u8 dtype; + u8 pcmd; + u8 desc_prof_prio; + u8 desc_prof; + u8 swap; + u8 fdid_prio; + u8 fdid_mdid; +}; + +#define ICE_FLTR_PRGM_FLEX_WORD_SIZE sizeof(__be16) + +struct ice_rx_flow_userdef { + u16 flex_word; + u16 flex_offset; + u16 flex_fltr; +}; + +struct ice_fdir_v4 { + __be32 dst_ip; + __be32 src_ip; + __be16 dst_port; + __be16 src_port; + __be32 l4_header; + __be32 sec_parm_idx; /* security parameter index */ + u8 tos; + u8 ip_ver; + u8 proto; + u8 ttl; + __be16 packet_id; +}; + +#define ICE_IPV6_ADDR_LEN_AS_U32 4 + +struct ice_fdir_v6 { + __be32 dst_ip[ICE_IPV6_ADDR_LEN_AS_U32]; + __be32 src_ip[ICE_IPV6_ADDR_LEN_AS_U32]; + __be16 dst_port; + __be16 src_port; + __be32 l4_header; /* next header */ + __be32 sec_parm_idx; /* security parameter index */ + u8 tc; + u8 proto; + u8 hlim; + __be32 packet_id; +}; + +struct ice_fdir_udp_gtp { + u8 flags; + u8 msg_type; + __be16 rsrvd_len; + __be32 teid; + __be16 rsrvd_seq_nbr; + u8 rsrvd_n_pdu_nbr; + u8 rsrvd_next_ext_type; + u8 rsvrd_ext_len; + u8 pdu_type:4, + spare:4; + u8 ppp:1, + rqi:1, + qfi:6; + u32 rsvrd; + u8 next_ext; +}; + +struct ice_fdir_l2tpv3 { + __be32 session_id; +}; + +struct ice_fdir_udp_vxlan { + __be32 vni; /* 8 bits reserved, always be zero */ +}; + +struct ice_fdir_ecpri { + __be16 pc_id; +}; + +struct ice_fdir_extra { + u8 dst_mac[ETH_ALEN]; /* dest MAC address */ + u8 src_mac[ETH_ALEN]; /* src MAC address */ + __be16 ether_type; /* for NON_IP_L2 */ + u32 usr_def[2]; /* user data */ + __be16 vlan_type; /* VLAN ethertype */ + __be16 vlan_tag; /* VLAN tag info */ +}; + +struct ice_fdir_fltr { + struct list_head fltr_node; + enum ice_fltr_ptype flow_type; + + union { + struct ice_fdir_v4 v4; + struct 
ice_fdir_v6 v6; + } ip, mask; + + /* for tunnel outer part */ + union { + struct ice_fdir_v4 v4; + struct ice_fdir_v6 v6; + } ip_outer, mask_outer; + + struct ice_fdir_extra ext_data_outer; + struct ice_fdir_extra ext_mask_outer; + + struct ice_fdir_udp_vxlan vxlan_data; + struct ice_fdir_udp_vxlan vxlan_mask; + + struct ice_fdir_udp_gtp gtpu_data; + struct ice_fdir_udp_gtp gtpu_mask; + + struct ice_fdir_l2tpv3 l2tpv3_data; + struct ice_fdir_l2tpv3 l2tpv3_mask; + + struct ice_fdir_ecpri ecpri_data; + struct ice_fdir_ecpri ecpri_mask; + + struct ice_fdir_extra ext_data; + struct ice_fdir_extra ext_mask; + + /* flex byte filter data */ + __be16 flex_word; + /* queue region size (=2^q_region) */ + u8 q_region; + u16 flex_offset; + u16 flex_fltr; + + /* filter control */ + u16 q_index; + u16 orig_q_index; + u16 dest_vsi; + u8 dest_ctl; + u8 cnt_ena; + u8 fltr_status; + u16 cnt_index; + u32 fltr_id; + u8 fdid_prio; + u8 comp_report; + /* Set to true for an ACL filter */ + bool acl_fltr; +}; + +/* Dummy packet filter definition structure */ +struct ice_fdir_base_pkt { + enum ice_fltr_ptype flow; + u16 pkt_len; + const u8 *pkt; + u16 tun_pkt_len; + const u8 *tun_pkt; +}; + +enum ice_status ice_alloc_fd_res_cntr(struct ice_hw *hw, u16 *cntr_id); +enum ice_status ice_free_fd_res_cntr(struct ice_hw *hw, u16 cntr_id); +void +ice_set_fd_desc_val(struct ice_fd_fltr_desc_ctx *fd_fltr_ctx, + struct ice_fltr_desc *fdir_desc); +void ice_set_dflt_val_fd_desc(struct ice_fd_fltr_desc_ctx *fd_fltr_ctx); +enum ice_status +ice_alloc_fd_guar_item(struct ice_hw *hw, u16 *cntr_id, u16 num_fltr); +enum ice_status +ice_free_fd_guar_item(struct ice_hw *hw, u16 cntr_id, u16 num_fltr); +enum ice_status +ice_alloc_fd_shrd_item(struct ice_hw *hw, u16 *cntr_id, u16 num_fltr); +enum ice_status +ice_free_fd_shrd_item(struct ice_hw *hw, u16 cntr_id, u16 num_fltr); +enum ice_status ice_clear_vsi_fd_table(struct ice_hw *hw, u16 vsi_num); +enum ice_status ice_clear_pf_fd_table(struct ice_hw *hw); +void +ice_fdir_get_prgm_desc(struct ice_hw *hw, struct ice_fdir_fltr *input, + struct ice_fltr_desc *fdesc, bool add); +enum ice_status +ice_fdir_get_gen_prgm_pkt(struct ice_hw *hw, struct ice_fdir_fltr *input, + u8 *pkt, bool frag, bool tun); +enum ice_status +ice_fdir_get_prgm_pkt(struct ice_fdir_fltr *input, u8 *pkt, bool frag); +int ice_get_fdir_cnt_all(struct ice_hw *hw); +bool ice_fdir_is_dup_fltr(struct ice_hw *hw, struct ice_fdir_fltr *input); +bool ice_fdir_has_frag(enum ice_fltr_ptype flow); +struct ice_fdir_fltr * +ice_fdir_find_fltr_by_idx(struct ice_hw *hw, u32 fltr_idx); +void +ice_fdir_update_cntrs(struct ice_hw *hw, enum ice_fltr_ptype flow, + bool acl_fltr, bool add); +void ice_fdir_list_add_fltr(struct ice_hw *hw, struct ice_fdir_fltr *input); +#endif /* _ICE_FDIR_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_flex_pipe.c b/drivers/net/ethernet/intel/ice/ice_flex_pipe.c index 6cfe8eb7f47d..b98e4984f7ff 100644 --- a/drivers/net/ethernet/intel/ice/ice_flex_pipe.c +++ b/drivers/net/ethernet/intel/ice/ice_flex_pipe.c @@ -1,8 +1,110 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2019, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #include "ice_common.h" #include "ice_flex_pipe.h" +#include "ice_protocol_type.h" +#include "ice_flow.h" + + + +/* For supporting double VLAN mode, it is necessary to enable or disable certain + * boost tcam entries. The metadata labels names that match the following + * prefixes will be saved to allow enabling double VLAN mode. 
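The programming offsets at the top of ice_fdir.h above are byte offsets from the start of the untagged Ethernet frame of the dummy packet, so each value is simply the 14-byte Ethernet header plus the field's position inside the IP/L4 headers. A small standalone sketch of that arithmetic; the SKETCH_* constants are redefined locally (they are not driver symbols) so the asserts compile on their own:

/* Sketch: how a few of the ice_fdir.h offsets decompose. */
#define SKETCH_ETH_HLEN		14	/* untagged Ethernet header */
#define SKETCH_IP4_SADDR_OFF	12	/* saddr offset inside the IPv4 header */
#define SKETCH_IP4_DADDR_OFF	16	/* daddr offset inside the IPv4 header */
#define SKETCH_IP4_HLEN		20	/* dummy packets carry no IPv4 options */

_Static_assert(SKETCH_ETH_HLEN + SKETCH_IP4_SADDR_OFF == 26,	/* ICE_IPV4_SRC_ADDR_OFFSET */
	       "IPv4 source address offset");
_Static_assert(SKETCH_ETH_HLEN + SKETCH_IP4_DADDR_OFF == 30,	/* ICE_IPV4_DST_ADDR_OFFSET */
	       "IPv4 destination address offset");
_Static_assert(SKETCH_ETH_HLEN + SKETCH_IP4_HLEN + 0 == 34,	/* ICE_IPV4_TCP_SRC_PORT_OFFSET */
	       "TCP source port offset");
_Static_assert(SKETCH_ETH_HLEN + SKETCH_IP4_HLEN + 2 == 36,	/* ICE_IPV4_TCP_DST_PORT_OFFSET */
	       "TCP destination port offset");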
+ */ +#define ICE_DVM_PRE "BOOST_MAC_VLAN_DVM" /* enable these entries */ +#define ICE_SVM_PRE "BOOST_MAC_VLAN_SVM" /* disable these entries */ + +/* To support tunneling entries by PF, the package will append the PF number to + * the label; for example TNL_VXLAN_PF0, TNL_VXLAN_PF1, TNL_VXLAN_PF2, etc. + */ +#define ICE_TNL_PRE "TNL_" +static const struct ice_tunnel_type_scan tnls[] = { + { TNL_VXLAN, "TNL_VXLAN_PF" }, + { TNL_GENEVE, "TNL_GENEVE_PF" }, + { TNL_ECPRI, "TNL_UDP_ECPRI_PF" }, + { TNL_LAST, "" } +}; + +static const u32 ice_sect_lkup[ICE_BLK_COUNT][ICE_SECT_COUNT] = { + /* SWITCH */ + { + ICE_SID_XLT0_SW, + ICE_SID_XLT_KEY_BUILDER_SW, + ICE_SID_XLT1_SW, + ICE_SID_XLT2_SW, + ICE_SID_PROFID_TCAM_SW, + ICE_SID_PROFID_REDIR_SW, + ICE_SID_FLD_VEC_SW, + ICE_SID_CDID_KEY_BUILDER_SW, + ICE_SID_CDID_REDIR_SW + }, + + /* ACL */ + { + ICE_SID_XLT0_ACL, + ICE_SID_XLT_KEY_BUILDER_ACL, + ICE_SID_XLT1_ACL, + ICE_SID_XLT2_ACL, + ICE_SID_PROFID_TCAM_ACL, + ICE_SID_PROFID_REDIR_ACL, + ICE_SID_FLD_VEC_ACL, + ICE_SID_CDID_KEY_BUILDER_ACL, + ICE_SID_CDID_REDIR_ACL + }, + + /* FD */ + { + ICE_SID_XLT0_FD, + ICE_SID_XLT_KEY_BUILDER_FD, + ICE_SID_XLT1_FD, + ICE_SID_XLT2_FD, + ICE_SID_PROFID_TCAM_FD, + ICE_SID_PROFID_REDIR_FD, + ICE_SID_FLD_VEC_FD, + ICE_SID_CDID_KEY_BUILDER_FD, + ICE_SID_CDID_REDIR_FD + }, + + /* RSS */ + { + ICE_SID_XLT0_RSS, + ICE_SID_XLT_KEY_BUILDER_RSS, + ICE_SID_XLT1_RSS, + ICE_SID_XLT2_RSS, + ICE_SID_PROFID_TCAM_RSS, + ICE_SID_PROFID_REDIR_RSS, + ICE_SID_FLD_VEC_RSS, + ICE_SID_CDID_KEY_BUILDER_RSS, + ICE_SID_CDID_REDIR_RSS + }, + + /* PE */ + { + ICE_SID_XLT0_PE, + ICE_SID_XLT_KEY_BUILDER_PE, + ICE_SID_XLT1_PE, + ICE_SID_XLT2_PE, + ICE_SID_PROFID_TCAM_PE, + ICE_SID_PROFID_REDIR_PE, + ICE_SID_FLD_VEC_PE, + ICE_SID_CDID_KEY_BUILDER_PE, + ICE_SID_CDID_REDIR_PE + } +}; + +/** + * ice_sect_id - returns section ID + * @blk: block type + * @sect: section type + * + * This helper function returns the proper section ID given a block type and a + * section type. + */ +static u32 ice_sect_id(enum ice_block blk, enum ice_sect sect) +{ + return ice_sect_lkup[blk][sect]; +} /** * ice_pkg_val_buf @@ -159,1101 +261,3869 @@ ice_pkg_enum_section(struct ice_seg *ice_seg, struct ice_pkg_enum *state, } /** - * ice_acquire_global_cfg_lock - * @hw: pointer to the HW structure - * @access: access type (read or write) + * ice_pkg_enum_entry + * @ice_seg: pointer to the ice segment (or NULL on subsequent calls) + * @state: pointer to the enum state + * @sect_type: section type to enumerate + * @offset: pointer to variable that receives the offset in the table (optional) + * @handler: function that handles access to the entries into the section type * - * This function will request ownership of the global config lock for reading - * or writing of the package. When attempting to obtain write access, the - * caller must check for the following two return values: + * This function will enumerate all the entries in particular section type in + * the ice segment. The first call is made with the ice_seg parameter non-NULL; + * on subsequent calls, ice_seg is set to NULL which continues the enumeration. + * When the function returns a NULL pointer, then the end of the entries has + * been reached. * - * ICE_SUCCESS - Means the caller has acquired the global config lock - * and can perform writing of the package. 
- * ICE_ERR_AQ_NO_WORK - Indicates another driver has already written the - * package or has found that no update was necessary; in - * this case, the caller can just skip performing any - * update of the package. + * Since each section may have a different header and entry size, the handler + * function is needed to determine the number and location entries in each + * section. + * + * The offset parameter is optional, but should be used for sections that + * contain an offset for each section table. For such cases, the section handler + * function must return the appropriate offset + index to give the absolution + * offset for each entry. For example, if the base for a section's header + * indicates a base offset of 10, and the index for the entry is 2, then + * section handler function should set the offset to 10 + 2 = 12. */ -static enum ice_status -ice_acquire_global_cfg_lock(struct ice_hw *hw, - enum ice_aq_res_access_type access) +static void * +ice_pkg_enum_entry(struct ice_seg *ice_seg, struct ice_pkg_enum *state, + u32 sect_type, u32 *offset, + void *(*handler)(u32 sect_type, void *section, + u32 index, u32 *offset)) { - enum ice_status status; + void *entry; - status = ice_acquire_res(hw, ICE_GLOBAL_CFG_LOCK_RES_ID, access, - ICE_GLOBAL_CFG_LOCK_TIMEOUT); + if (ice_seg) { + if (!handler) + return NULL; - if (!status) - mutex_lock(&ice_global_cfg_lock_sw); - else if (status == ICE_ERR_AQ_NO_WORK) - ice_debug(hw, ICE_DBG_PKG, - "Global config lock: No work to do\n"); + if (!ice_pkg_enum_section(ice_seg, state, sect_type)) + return NULL; - return status; + state->entry_idx = 0; + state->handler = handler; + } else { + state->entry_idx++; + } + + if (!state->handler) + return NULL; + + /* get entry */ + entry = state->handler(state->sect_type, state->sect, state->entry_idx, + offset); + if (!entry) { + /* end of a section, look for another section of this type */ + if (!ice_pkg_enum_section(NULL, state, 0)) + return NULL; + + state->entry_idx = 0; + entry = state->handler(state->sect_type, state->sect, + state->entry_idx, offset); + } + + return entry; } /** - * ice_release_global_cfg_lock + * ice_hw_ptype_ena - check if the PTYPE is enabled or not * @hw: pointer to the HW structure - * - * This function will release the global config lock. + * @ptype: the hardware PTYPE */ -static void ice_release_global_cfg_lock(struct ice_hw *hw) +bool ice_hw_ptype_ena(struct ice_hw *hw, u16 ptype) { - mutex_unlock(&ice_global_cfg_lock_sw); - ice_release_res(hw, ICE_GLOBAL_CFG_LOCK_RES_ID); + return ptype < ICE_FLOW_PTYPE_MAX && + test_bit(ptype, hw->hw_ptype); } /** - * ice_aq_download_pkg - * @hw: pointer to the hardware structure - * @pkg_buf: the package buffer to transfer - * @buf_size: the size of the package buffer - * @last_buf: last buffer indicator - * @error_offset: returns error offset - * @error_info: returns error information - * @cd: pointer to command details structure or NULL + * ice_marker_ptype_tcam_handler + * @sect_type: section type + * @section: pointer to section + * @index: index of the Marker PType TCAM entry to be returned + * @offset: pointer to receive absolute offset, always 0 for ptype TCAM sections * - * Download Package (0x0C40) + * This is a callback function that can be passed to ice_pkg_enum_entry. + * Handles enumeration of individual Marker PType TCAM entries. 
*/ -static enum ice_status -ice_aq_download_pkg(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf, - u16 buf_size, bool last_buf, u32 *error_offset, - u32 *error_info, struct ice_sq_cd *cd) +static void * +ice_marker_ptype_tcam_handler(u32 sect_type, void *section, u32 index, + u32 *offset) { - struct ice_aqc_download_pkg *cmd; - struct ice_aq_desc desc; - enum ice_status status; + struct ice_marker_ptype_tcam_section *marker_ptype; - if (error_offset) - *error_offset = 0; - if (error_info) - *error_info = 0; + if (!section) + return NULL; - cmd = &desc.params.download_pkg; - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_download_pkg); - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + if (sect_type != ICE_SID_RXPARSER_MARKER_PTYPE) + return NULL; - if (last_buf) - cmd->flags |= ICE_AQC_DOWNLOAD_PKG_LAST_BUF; + if (index > ICE_MAX_MARKER_PTYPE_TCAMS_IN_BUF) + return NULL; - status = ice_aq_send_cmd(hw, &desc, pkg_buf, buf_size, cd); - if (status == ICE_ERR_AQ_ERROR) { - /* Read error from buffer only when the FW returned an error */ - struct ice_aqc_download_pkg_resp *resp; + if (offset) + *offset = 0; - resp = (struct ice_aqc_download_pkg_resp *)pkg_buf; - if (error_offset) - *error_offset = le32_to_cpu(resp->error_offset); - if (error_info) - *error_info = le32_to_cpu(resp->error_info); - } + marker_ptype = section; + if (index >= le16_to_cpu(marker_ptype->count)) + return NULL; - return status; + return marker_ptype->tcam + index; } /** - * ice_find_seg_in_pkg - * @hw: pointer to the hardware structure - * @seg_type: the segment type to search for (i.e., SEGMENT_TYPE_CPK) - * @pkg_hdr: pointer to the package header to be searched - * - * This function searches a package file for a particular segment type. On - * success it returns a pointer to the segment header, otherwise it will - * return NULL. 
+ * ice_fill_hw_ptype - fill the enabled PTYPE bit information + * @hw: pointer to the HW structure */ -static struct ice_generic_seg_hdr * -ice_find_seg_in_pkg(struct ice_hw *hw, u32 seg_type, - struct ice_pkg_hdr *pkg_hdr) +static void +ice_fill_hw_ptype(struct ice_hw *hw) { - u32 i; - - ice_debug(hw, ICE_DBG_PKG, "Package format version: %d.%d.%d.%d\n", - pkg_hdr->format_ver.major, pkg_hdr->format_ver.minor, - pkg_hdr->format_ver.update, pkg_hdr->format_ver.draft); - - /* Search all package segments for the requested segment type */ - for (i = 0; i < le32_to_cpu(pkg_hdr->seg_count); i++) { - struct ice_generic_seg_hdr *seg; + struct ice_marker_ptype_tcam_entry *tcam; + struct ice_seg *seg = hw->seg; + struct ice_pkg_enum state; - seg = (struct ice_generic_seg_hdr *) - ((u8 *)pkg_hdr + le32_to_cpu(pkg_hdr->seg_offset[i])); + bitmap_zero(hw->hw_ptype, ICE_FLOW_PTYPE_MAX); + if (!seg) + return; - if (le32_to_cpu(seg->seg_type) == seg_type) - return seg; - } + memset(&state, 0, sizeof(state)); - return NULL; + do { + tcam = ice_pkg_enum_entry(seg, &state, + ICE_SID_RXPARSER_MARKER_PTYPE, NULL, + ice_marker_ptype_tcam_handler); + if (tcam && + le16_to_cpu(tcam->addr) < ICE_MARKER_PTYPE_TCAM_ADDR_MAX && + le16_to_cpu(tcam->ptype) < ICE_FLOW_PTYPE_MAX) + set_bit(le16_to_cpu(tcam->ptype), hw->hw_ptype); + + seg = NULL; + } while (tcam); } /** - * ice_dwnld_cfg_bufs - * @hw: pointer to the hardware structure - * @bufs: pointer to an array of buffers - * @count: the number of buffers in the array + * ice_boost_tcam_handler + * @sect_type: section type + * @section: pointer to section + * @index: index of the boost TCAM entry to be returned + * @offset: pointer to receive absolute offset, always 0 for boost TCAM sections * - * Obtains global config lock and downloads the package configuration buffers - * to the firmware. Metadata buffers are skipped, and the first metadata buffer - * found indicates that the rest of the buffers are all metadata buffers. + * This is a callback function that can be passed to ice_pkg_enum_entry. + * Handles enumeration of individual boost TCAM entries. */ -static enum ice_status -ice_dwnld_cfg_bufs(struct ice_hw *hw, struct ice_buf *bufs, u32 count) +static void * +ice_boost_tcam_handler(u32 sect_type, void *section, u32 index, u32 *offset) { - enum ice_status status; - struct ice_buf_hdr *bh; - u32 offset, info, i; + struct ice_boost_tcam_section *boost; - if (!bufs || !count) - return ICE_ERR_PARAM; - - /* If the first buffer's first section has its metadata bit set - * then there are no buffers to be downloaded, and the operation is - * considered a success. 
- */ - bh = (struct ice_buf_hdr *)bufs; - if (le32_to_cpu(bh->section_entry[0].type) & ICE_METADATA_BUF) - return 0; + if (!section) + return NULL; - /* reset pkg_dwnld_status in case this function is called in the - * reset/rebuild flow - */ - hw->pkg_dwnld_status = ICE_AQ_RC_OK; + if (sect_type != ICE_SID_RXPARSER_BOOST_TCAM) + return NULL; - status = ice_acquire_global_cfg_lock(hw, ICE_RES_WRITE); - if (status) { - if (status == ICE_ERR_AQ_NO_WORK) - hw->pkg_dwnld_status = ICE_AQ_RC_EEXIST; - else - hw->pkg_dwnld_status = hw->adminq.sq_last_status; - return status; - } + if (index > ICE_MAX_BST_TCAMS_IN_BUF) + return NULL; - for (i = 0; i < count; i++) { - bool last = ((i + 1) == count); + if (offset) + *offset = 0; - if (!last) { - /* check next buffer for metadata flag */ - bh = (struct ice_buf_hdr *)(bufs + i + 1); + boost = section; + if (index >= le16_to_cpu(boost->count)) + return NULL; - /* A set metadata flag in the next buffer will signal - * that the current buffer will be the last buffer - * downloaded - */ - if (le16_to_cpu(bh->section_count)) - if (le32_to_cpu(bh->section_entry[0].type) & - ICE_METADATA_BUF) - last = true; - } + return boost->tcam + index; +} - bh = (struct ice_buf_hdr *)(bufs + i); +/** + * ice_find_boost_entry + * @ice_seg: pointer to the ice segment (non-NULL) + * @addr: Boost TCAM address of entry to search for + * @entry: returns pointer to the entry + * + * Finds a particular Boost TCAM entry and returns a pointer to that entry + * if it is found. The ice_seg parameter must not be NULL since the first call + * to ice_pkg_enum_entry requires a pointer to an actual ice_segment structure. + */ +static enum ice_status +ice_find_boost_entry(struct ice_seg *ice_seg, u16 addr, + struct ice_boost_tcam_entry **entry) +{ + struct ice_boost_tcam_entry *tcam; + struct ice_pkg_enum state; - status = ice_aq_download_pkg(hw, bh, ICE_PKG_BUF_SIZE, last, - &offset, &info, NULL); + memset(&state, 0, sizeof(state)); - /* Save AQ status from download package */ - hw->pkg_dwnld_status = hw->adminq.sq_last_status; - if (status) { - ice_debug(hw, ICE_DBG_PKG, - "Pkg download failed: err %d off %d inf %d\n", - status, offset, info); + if (!ice_seg) + return ICE_ERR_PARAM; - break; + do { + tcam = ice_pkg_enum_entry(ice_seg, &state, + ICE_SID_RXPARSER_BOOST_TCAM, NULL, + ice_boost_tcam_handler); + if (tcam && le16_to_cpu(tcam->addr) == addr) { + *entry = tcam; + return 0; } - if (last) - break; - } - - ice_release_global_cfg_lock(hw); + ice_seg = NULL; + } while (tcam); - return status; + *entry = NULL; + return ICE_ERR_CFG; } /** - * ice_aq_get_pkg_info_list - * @hw: pointer to the hardware structure - * @pkg_info: the buffer which will receive the information list - * @buf_size: the size of the pkg_info information buffer - * @cd: pointer to command details structure or NULL + * ice_label_enum_handler + * @sect_type: section type + * @section: pointer to section + * @index: index of the label entry to be returned + * @offset: pointer to receive absolute offset, always zero for label sections * - * Get Package Info List (0x0C43) + * This is a callback function that can be passed to ice_pkg_enum_entry. + * Handles enumeration of individual label entries. 
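The package readers above all follow the same first-call/continuation convention: pass the ice segment on the first call to ice_pkg_enum_entry() and NULL on subsequent calls, and let a per-section handler pull individual entries out of each matching section. A hedged sketch that reuses that shape, modelled on ice_find_boost_entry(); the counting helper itself is illustrative and not part of the patch:

/* Sketch: count boost TCAM entries using the enumeration convention above. */
static u32 ice_sketch_count_boost_entries(struct ice_seg *ice_seg)
{
	struct ice_boost_tcam_entry *tcam;
	struct ice_pkg_enum state;
	u32 count = 0;

	memset(&state, 0, sizeof(state));

	do {
		tcam = ice_pkg_enum_entry(ice_seg, &state,
					  ICE_SID_RXPARSER_BOOST_TCAM, NULL,
					  ice_boost_tcam_handler);
		if (tcam)
			count++;
		ice_seg = NULL;	/* NULL continues the enumeration */
	} while (tcam);

	return count;
}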
*/ -static enum ice_status -ice_aq_get_pkg_info_list(struct ice_hw *hw, - struct ice_aqc_get_pkg_info_resp *pkg_info, - u16 buf_size, struct ice_sq_cd *cd) +static void * +ice_label_enum_handler(u32 __always_unused sect_type, void *section, u32 index, + u32 *offset) { - struct ice_aq_desc desc; + struct ice_label_section *labels; - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_pkg_info_list); + if (!section) + return NULL; - return ice_aq_send_cmd(hw, &desc, pkg_info, buf_size, cd); + if (index > ICE_MAX_LABELS_IN_BUF) + return NULL; + + if (offset) + *offset = 0; + + labels = section; + if (index >= le16_to_cpu(labels->count)) + return NULL; + + return labels->label + index; } /** - * ice_download_pkg - * @hw: pointer to the hardware structure - * @ice_seg: pointer to the segment of the package to be downloaded + * ice_enum_labels + * @ice_seg: pointer to the ice segment (NULL on subsequent calls) + * @type: the section type that will contain the label (0 on subsequent calls) + * @state: ice_pkg_enum structure that will hold the state of the enumeration + * @value: pointer to a value that will return the label's value if found * - * Handles the download of a complete package. + * Enumerates a list of labels in the package. The caller will call + * ice_enum_labels(ice_seg, type, ...) to start the enumeration, then call + * ice_enum_labels(NULL, 0, ...) to continue. When the function returns a NULL + * the end of the list has been reached. */ -static enum ice_status -ice_download_pkg(struct ice_hw *hw, struct ice_seg *ice_seg) +static char * +ice_enum_labels(struct ice_seg *ice_seg, u32 type, struct ice_pkg_enum *state, + u16 *value) { - struct ice_buf_table *ice_buf_tbl; + struct ice_label *label; - ice_debug(hw, ICE_DBG_PKG, "Segment version: %d.%d.%d.%d\n", - ice_seg->hdr.seg_ver.major, ice_seg->hdr.seg_ver.minor, - ice_seg->hdr.seg_ver.update, ice_seg->hdr.seg_ver.draft); + /* Check for valid label section on first call */ + if (type && !(type >= ICE_SID_LBL_FIRST && type <= ICE_SID_LBL_LAST)) + return NULL; - ice_debug(hw, ICE_DBG_PKG, "Seg: type 0x%X, size %d, name %s\n", - le32_to_cpu(ice_seg->hdr.seg_type), - le32_to_cpu(ice_seg->hdr.seg_size), ice_seg->hdr.seg_name); + label = ice_pkg_enum_entry(ice_seg, state, type, NULL, + ice_label_enum_handler); + if (!label) + return NULL; - ice_buf_tbl = ice_find_buf_table(ice_seg); + *value = le16_to_cpu(label->value); + return label->name; +} - ice_debug(hw, ICE_DBG_PKG, "Seg buf count: %d\n", - le32_to_cpu(ice_buf_tbl->buf_count)); +/** + * ice_add_tunnel_hint + * @hw: pointer to the HW structure + * @label_name: label text + * @val: value of the tunnel port boost entry + */ +static void ice_add_tunnel_hint(struct ice_hw *hw, char *label_name, u16 val) +{ + if (hw->tnl.count < ICE_TUNNEL_MAX_ENTRIES) { + u16 i; + + for (i = 0; tnls[i].type != TNL_LAST; i++) { + size_t len = strlen(tnls[i].label_prefix); + + /* Look for matching label start, before continuing */ + if (strncmp(label_name, tnls[i].label_prefix, len)) + continue; - return ice_dwnld_cfg_bufs(hw, ice_buf_tbl->buf_array, - le32_to_cpu(ice_buf_tbl->buf_count)); + /* Make sure this label matches our PF. Note that the PF + * character ('0' - '7') will be located where our + * prefix string's null terminator is located. 
+ */ + if ((label_name[len] - '0') == hw->pf_id) { + hw->tnl.tbl[hw->tnl.count].type = tnls[i].type; + hw->tnl.tbl[hw->tnl.count].valid = false; + hw->tnl.tbl[hw->tnl.count].in_use = false; + hw->tnl.tbl[hw->tnl.count].marked = false; + hw->tnl.tbl[hw->tnl.count].boost_addr = val; + hw->tnl.tbl[hw->tnl.count].port = 0; + hw->tnl.count++; + break; + } + } + } } /** - * ice_init_pkg_info - * @hw: pointer to the hardware structure - * @pkg_hdr: pointer to the driver's package hdr + * ice_add_dvm_hint + * @hw: pointer to the HW structure + * @val: value of the boost entry + * @enable: true if entry needs to be enabled, or false if needs to be disabled + */ +static void ice_add_dvm_hint(struct ice_hw *hw, u16 val, bool enable) +{ + if (hw->dvm_upd.count < ICE_DVM_MAX_ENTRIES) { + hw->dvm_upd.tbl[hw->dvm_upd.count].boost_addr = val; + hw->dvm_upd.tbl[hw->dvm_upd.count].enable = enable; + hw->dvm_upd.count++; + } +} + +/** + * ice_init_pkg_hints + * @hw: pointer to the HW structure + * @ice_seg: pointer to the segment of the package scan (non-NULL) * - * Saves off the package details into the HW structure. + * This function will scan the package and save off relevant information + * (hints or metadata) for driver use. The ice_seg parameter must not be NULL + * since the first call to ice_enum_labels requires a pointer to an actual + * ice_seg structure. */ -static enum ice_status -ice_init_pkg_info(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr) +static void ice_init_pkg_hints(struct ice_hw *hw, struct ice_seg *ice_seg) { - struct ice_global_metadata_seg *meta_seg; - struct ice_generic_seg_hdr *seg_hdr; + struct ice_pkg_enum state; + char *label_name; + u16 val; + int i; - if (!pkg_hdr) - return ICE_ERR_PARAM; + memset(&hw->tnl, 0, sizeof(hw->tnl)); + memset(&state, 0, sizeof(state)); - meta_seg = (struct ice_global_metadata_seg *) - ice_find_seg_in_pkg(hw, SEGMENT_TYPE_METADATA, pkg_hdr); - if (meta_seg) { - hw->pkg_ver = meta_seg->pkg_ver; - memcpy(hw->pkg_name, meta_seg->pkg_name, sizeof(hw->pkg_name)); + if (!ice_seg) + return; - ice_debug(hw, ICE_DBG_PKG, "Pkg: %d.%d.%d.%d, %s\n", - meta_seg->pkg_ver.major, meta_seg->pkg_ver.minor, - meta_seg->pkg_ver.update, meta_seg->pkg_ver.draft, - meta_seg->pkg_name); - } else { - ice_debug(hw, ICE_DBG_INIT, - "Did not find metadata segment in driver package\n"); - return ICE_ERR_CFG; + label_name = ice_enum_labels(ice_seg, ICE_SID_LBL_RXPARSER_TMEM, &state, + &val); + + while (label_name) { + if (!strncmp(label_name, ICE_TNL_PRE, strlen(ICE_TNL_PRE))) + /* check for a tunnel entry */ + ice_add_tunnel_hint(hw, label_name, val); + + /* check for a dvm mode entry */ + else if (!strncmp(label_name, ICE_DVM_PRE, strlen(ICE_DVM_PRE))) + ice_add_dvm_hint(hw, val, true); + + /* check for a svm mode entry */ + else if (!strncmp(label_name, ICE_SVM_PRE, strlen(ICE_SVM_PRE))) + ice_add_dvm_hint(hw, val, false); + + label_name = ice_enum_labels(NULL, 0, &state, &val); } - seg_hdr = ice_find_seg_in_pkg(hw, SEGMENT_TYPE_ICE, pkg_hdr); - if (seg_hdr) { - hw->ice_pkg_ver = seg_hdr->seg_ver; - memcpy(hw->ice_pkg_name, seg_hdr->seg_name, - sizeof(hw->ice_pkg_name)); - - ice_debug(hw, ICE_DBG_PKG, "Ice Pkg: %d.%d.%d.%d, %s\n", - seg_hdr->seg_ver.major, seg_hdr->seg_ver.minor, - seg_hdr->seg_ver.update, seg_hdr->seg_ver.draft, - seg_hdr->seg_name); - } else { - ice_debug(hw, ICE_DBG_INIT, - "Did not find ice segment in driver package\n"); - return ICE_ERR_CFG; + /* Cache the appropriate boost TCAM entry pointers for tunnels */ + for (i = 0; i < hw->tnl.count; i++) { + 
ice_find_boost_entry(ice_seg, hw->tnl.tbl[i].boost_addr, + &hw->tnl.tbl[i].boost_entry); + if (hw->tnl.tbl[i].boost_entry) + hw->tnl.tbl[i].valid = true; } - return 0; + /* Cache the appropriate boost TCAM entry pointers for DVM and SVM */ + for (i = 0; i < hw->dvm_upd.count; i++) + ice_find_boost_entry(ice_seg, hw->dvm_upd.tbl[i].boost_addr, + &hw->dvm_upd.tbl[i].boost_entry); } +/* Key creation */ + +#define ICE_DC_KEY 0x1 /* don't care */ +#define ICE_DC_KEYINV 0x1 +#define ICE_NM_KEY 0x0 /* never match */ +#define ICE_NM_KEYINV 0x0 +#define ICE_0_KEY 0x1 /* match 0 */ +#define ICE_0_KEYINV 0x0 +#define ICE_1_KEY 0x0 /* match 1 */ +#define ICE_1_KEYINV 0x1 + /** - * ice_get_pkg_info - * @hw: pointer to the hardware structure + * ice_gen_key_word - generate 16-bits of a key/mask word + * @val: the value + * @valid: valid bits mask (change only the valid bits) + * @dont_care: don't care mask + * @nvr_mtch: never match mask + * @key: pointer to an array of where the resulting key portion + * @key_inv: pointer to an array of where the resulting key invert portion * - * Store details of the package currently loaded in HW into the HW structure. + * This function generates 16-bits from a 8-bit value, an 8-bit don't care mask + * and an 8-bit never match mask. The 16-bits of output are divided into 8 bits + * of key and 8 bits of key invert. + * + * '0' = b01, always match a 0 bit + * '1' = b10, always match a 1 bit + * '?' = b11, don't care bit (always matches) + * '~' = b00, never match bit + * + * Input: + * val: b0 1 0 1 0 1 + * dont_care: b0 0 1 1 0 0 + * never_mtch: b0 0 0 0 1 1 + * ------------------------------ + * Result: key: b01 10 11 11 00 00 */ -static enum ice_status ice_get_pkg_info(struct ice_hw *hw) +static enum ice_status +ice_gen_key_word(u8 val, u8 valid, u8 dont_care, u8 nvr_mtch, u8 *key, + u8 *key_inv) { - struct ice_aqc_get_pkg_info_resp *pkg_info; - enum ice_status status; - u16 size; - u32 i; - - size = sizeof(*pkg_info) + (sizeof(pkg_info->pkg_info[0]) * - (ICE_PKG_CNT - 1)); - pkg_info = kzalloc(size, GFP_KERNEL); - if (!pkg_info) - return ICE_ERR_NO_MEMORY; - - status = ice_aq_get_pkg_info_list(hw, pkg_info, size, NULL); - if (status) - goto init_pkg_free_alloc; + u8 in_key = *key, in_key_inv = *key_inv; + u8 i; - for (i = 0; i < le32_to_cpu(pkg_info->count); i++) { -#define ICE_PKG_FLAG_COUNT 4 - char flags[ICE_PKG_FLAG_COUNT + 1] = { 0 }; - u8 place = 0; + /* 'dont_care' and 'nvr_mtch' masks cannot overlap */ + if ((dont_care ^ nvr_mtch) != (dont_care | nvr_mtch)) + return ICE_ERR_CFG; - if (pkg_info->pkg_info[i].is_active) { - flags[place++] = 'A'; - hw->active_pkg_ver = pkg_info->pkg_info[i].ver; - memcpy(hw->active_pkg_name, - pkg_info->pkg_info[i].name, - sizeof(hw->active_pkg_name)); - hw->active_pkg_in_nvm = pkg_info->pkg_info[i].is_in_nvm; + *key = 0; + *key_inv = 0; + + /* encode the 8 bits into 8-bit key and 8-bit key invert */ + for (i = 0; i < 8; i++) { + *key >>= 1; + *key_inv >>= 1; + + if (!(valid & 0x1)) { /* change only valid bits */ + *key |= (in_key & 0x1) << 7; + *key_inv |= (in_key_inv & 0x1) << 7; + } else if (dont_care & 0x1) { /* don't care bit */ + *key |= ICE_DC_KEY << 7; + *key_inv |= ICE_DC_KEYINV << 7; + } else if (nvr_mtch & 0x1) { /* never match bit */ + *key |= ICE_NM_KEY << 7; + *key_inv |= ICE_NM_KEYINV << 7; + } else if (val & 0x01) { /* exact 1 match */ + *key |= ICE_1_KEY << 7; + *key_inv |= ICE_1_KEYINV << 7; + } else { /* exact 0 match */ + *key |= ICE_0_KEY << 7; + *key_inv |= ICE_0_KEYINV << 7; } - if 
(pkg_info->pkg_info[i].is_active_at_boot) - flags[place++] = 'B'; - if (pkg_info->pkg_info[i].is_modified) - flags[place++] = 'M'; - if (pkg_info->pkg_info[i].is_in_nvm) - flags[place++] = 'N'; - ice_debug(hw, ICE_DBG_PKG, "Pkg[%d]: %d.%d.%d.%d,%s,%s\n", - i, pkg_info->pkg_info[i].ver.major, - pkg_info->pkg_info[i].ver.minor, - pkg_info->pkg_info[i].ver.update, - pkg_info->pkg_info[i].ver.draft, - pkg_info->pkg_info[i].name, flags); + dont_care >>= 1; + nvr_mtch >>= 1; + valid >>= 1; + val >>= 1; + in_key >>= 1; + in_key_inv >>= 1; } -init_pkg_free_alloc: - kfree(pkg_info); + return 0; +} - return status; +/** + * ice_bits_max_set - determine if the number of bits set is within a maximum + * @mask: pointer to the byte array which is the mask + * @size: the number of bytes in the mask + * @max: the max number of set bits + * + * This function determines if there are at most 'max' number of bits set in an + * array. Returns true if the number for bits set is <= max or will return false + * otherwise. + */ +static bool ice_bits_max_set(const u8 *mask, u16 size, u16 max) +{ + u16 count = 0; + u16 i; + + /* check each byte */ + for (i = 0; i < size; i++) { + /* if 0, go to next byte */ + if (!mask[i]) + continue; + + /* We know there is at least one set bit in this byte because of + * the above check; if we already have found 'max' number of + * bits set, then we can return failure now. + */ + if (count == max) + return false; + + /* count the bits in this byte, checking threshold */ + count += hweight8(mask[i]); + if (count > max) + return false; + } + + return true; } /** - * ice_verify_pkg - verify package - * @pkg: pointer to the package buffer - * @len: size of the package buffer + * ice_set_key - generate a variable sized key with multiples of 16-bits + * @key: pointer to where the key will be stored + * @size: the size of the complete key in bytes (must be even) + * @val: array of 8-bit values that makes up the value portion of the key + * @upd: array of 8-bit masks that determine what key portion to update + * @dc: array of 8-bit masks that make up the don't care mask + * @nm: array of 8-bit masks that make up the never match mask + * @off: the offset of the first byte in the key to update + * @len: the number of bytes in the key update * - * Verifies various attributes of the package file, including length, format - * version, and the requirement of at least one segment. + * This function generates a key from a value, a don't care mask and a never + * match mask. + * upd, dc, and nm are optional parameters, and can be NULL: + * upd == NULL --> upd mask is all 1's (update all bits) + * dc == NULL --> dc mask is all 0's (no don't care bits) + * nm == NULL --> nm mask is all 0's (no never match bits) */ -static enum ice_status ice_verify_pkg(struct ice_pkg_hdr *pkg, u32 len) +enum ice_status +ice_set_key(u8 *key, u16 size, u8 *val, u8 *upd, u8 *dc, u8 *nm, u16 off, + u16 len) { - u32 seg_count; - u32 i; + u16 half_size; + u16 i; - if (len < sizeof(*pkg)) - return ICE_ERR_BUF_TOO_SHORT; + /* size must be a multiple of 2 bytes. 
*/ + if (size % 2) + return ICE_ERR_CFG; + half_size = size / 2; - if (pkg->format_ver.major != ICE_PKG_FMT_VER_MAJ || - pkg->format_ver.minor != ICE_PKG_FMT_VER_MNR || - pkg->format_ver.update != ICE_PKG_FMT_VER_UPD || - pkg->format_ver.draft != ICE_PKG_FMT_VER_DFT) + if (off + len > half_size) return ICE_ERR_CFG; - /* pkg must have at least one segment */ - seg_count = le32_to_cpu(pkg->seg_count); - if (seg_count < 1) + /* Make sure at most one bit is set in the never match mask. Having more + * than one never match mask bit set will cause HW to consume excessive + * power otherwise; this is a power management efficiency check. + */ +#define ICE_NVR_MTCH_BITS_MAX 1 + if (nm && !ice_bits_max_set(nm, len, ICE_NVR_MTCH_BITS_MAX)) return ICE_ERR_CFG; - /* make sure segment array fits in package length */ - if (len < sizeof(*pkg) + ((seg_count - 1) * sizeof(pkg->seg_offset))) - return ICE_ERR_BUF_TOO_SHORT; + for (i = 0; i < len; i++) + if (ice_gen_key_word(val[i], upd ? upd[i] : 0xff, + dc ? dc[i] : 0, nm ? nm[i] : 0, + key + off + i, key + half_size + off + i)) + return ICE_ERR_CFG; - /* all segments must fit within length */ - for (i = 0; i < seg_count; i++) { - u32 off = le32_to_cpu(pkg->seg_offset[i]); - struct ice_generic_seg_hdr *seg; + return 0; +} - /* segment header must fit */ - if (len < off + sizeof(*seg)) - return ICE_ERR_BUF_TOO_SHORT; +/** + * ice_acquire_global_cfg_lock + * @hw: pointer to the HW structure + * @access: access type (read or write) + * + * This function will request ownership of the global config lock for reading + * or writing of the package. When attempting to obtain write access, the + * caller must check for the following two return values: + * + * ICE_SUCCESS - Means the caller has acquired the global config lock + * and can perform writing of the package. + * ICE_ERR_AQ_NO_WORK - Indicates another driver has already written the + * package or has found that no update was necessary; in + * this case, the caller can just skip performing any + * update of the package. + */ +static enum ice_status +ice_acquire_global_cfg_lock(struct ice_hw *hw, + enum ice_aq_res_access_type access) +{ + enum ice_status status; - seg = (struct ice_generic_seg_hdr *)((u8 *)pkg + off); + status = ice_acquire_res(hw, ICE_GLOBAL_CFG_LOCK_RES_ID, access, + ICE_GLOBAL_CFG_LOCK_TIMEOUT); - /* segment body must fit */ - if (len < off + le32_to_cpu(seg->seg_size)) - return ICE_ERR_BUF_TOO_SHORT; - } + if (!status) + mutex_lock(&ice_global_cfg_lock_sw); + else if (status == ICE_ERR_AQ_NO_WORK) + ice_debug(hw, ICE_DBG_PKG, "Global config lock: No work to do\n"); - return 0; + return status; } /** - * ice_free_seg - free package segment pointer - * @hw: pointer to the hardware structure + * ice_release_global_cfg_lock + * @hw: pointer to the HW structure * - * Frees the package segment pointer in the proper manner, depending on if the - * segment was allocated or just the passed in pointer was stored. + * This function will release the global config lock. 
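ice_set_key() above writes the generated key bytes into the first half of the caller's buffer and the key-invert bytes into the second half, so the buffer must be twice as large as the number of value bytes being encoded. A minimal hedged sketch of a call that matches one byte exactly and leaves a second byte as don't care; the wrapper and the values are illustrative only:

/* Sketch: build a 4-byte key (2 key bytes followed by 2 key-invert bytes). */
static enum ice_status ice_sketch_build_key(u8 *key /* at least 4 bytes */)
{
	u8 val[2] = { 0xAB, 0x00 };	/* byte 0 must match 0xAB exactly */
	u8 dc[2] = { 0x00, 0xFF };	/* byte 1 is a don't care */

	return ice_set_key(key, 4, val, NULL, dc, NULL, 0, sizeof(val));
}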
*/ -void ice_free_seg(struct ice_hw *hw) +static void ice_release_global_cfg_lock(struct ice_hw *hw) { - if (hw->pkg_copy) { - devm_kfree(ice_hw_to_dev(hw), hw->pkg_copy); - hw->pkg_copy = NULL; - hw->pkg_size = 0; - } - hw->seg = NULL; + mutex_unlock(&ice_global_cfg_lock_sw); + ice_release_res(hw, ICE_GLOBAL_CFG_LOCK_RES_ID); } /** - * ice_init_pkg_regs - initialize additional package registers - * @hw: pointer to the hardware structure + * ice_acquire_change_lock + * @hw: pointer to the HW structure + * @access: access type (read or write) + * + * This function will request ownership of the change lock. */ -static void ice_init_pkg_regs(struct ice_hw *hw) +enum ice_status +ice_acquire_change_lock(struct ice_hw *hw, enum ice_aq_res_access_type access) { -#define ICE_SW_BLK_INP_MASK_L 0xFFFFFFFF -#define ICE_SW_BLK_INP_MASK_H 0x0000FFFF -#define ICE_SW_BLK_IDX 0 - - /* setup Switch block input mask, which is 48-bits in two parts */ - wr32(hw, GL_PREEXT_L2_PMASK0(ICE_SW_BLK_IDX), ICE_SW_BLK_INP_MASK_L); - wr32(hw, GL_PREEXT_L2_PMASK1(ICE_SW_BLK_IDX), ICE_SW_BLK_INP_MASK_H); + return ice_acquire_res(hw, ICE_CHANGE_LOCK_RES_ID, access, + ICE_CHANGE_LOCK_TIMEOUT); } /** - * ice_chk_pkg_version - check package version for compatibility with driver - * @pkg_ver: pointer to a version structure to check + * ice_release_change_lock + * @hw: pointer to the HW structure * - * Check to make sure that the package about to be downloaded is compatible with - * the driver. To be compatible, the major and minor components of the package - * version must match our ICE_PKG_SUPP_VER_MAJ and ICE_PKG_SUPP_VER_MNR - * definitions. + * This function will release the change lock using the proper Admin Command. */ -static enum ice_status ice_chk_pkg_version(struct ice_pkg_ver *pkg_ver) +void ice_release_change_lock(struct ice_hw *hw) { - if (pkg_ver->major != ICE_PKG_SUPP_VER_MAJ || - pkg_ver->minor != ICE_PKG_SUPP_VER_MNR) - return ICE_ERR_NOT_SUPPORTED; - - return 0; + ice_release_res(hw, ICE_CHANGE_LOCK_RES_ID); } /** - * ice_init_pkg - initialize/download package + * ice_aq_download_pkg * @hw: pointer to the hardware structure - * @buf: pointer to the package buffer - * @len: size of the package buffer - * - * This function initializes a package. The package contains HW tables - * required to do packet processing. First, the function extracts package - * information such as version. Then it finds the ice configuration segment - * within the package; this function then saves a copy of the segment pointer - * within the supplied package buffer. Next, the function will cache any hints - * from the package, followed by downloading the package itself. Note, that if - * a previous PF driver has already downloaded the package successfully, then - * the current driver will not have to download the package again. - * - * The local package contents will be used to query default behavior and to - * update specific sections of the HW's version of the package (e.g. to update - * the parse graph to understand new protocols). + * @pkg_buf: the package buffer to transfer + * @buf_size: the size of the package buffer + * @last_buf: last buffer indicator + * @error_offset: returns error offset + * @error_info: returns error information + * @cd: pointer to command details structure or NULL * - * This function stores a pointer to the package buffer memory, and it is - * expected that the supplied buffer will not be freed immediately. 
If the - * package buffer needs to be freed, such as when read from a file, use - * ice_copy_and_init_pkg() instead of directly calling ice_init_pkg() in this - * case. + * Download Package (0x0C40) */ -enum ice_status ice_init_pkg(struct ice_hw *hw, u8 *buf, u32 len) +static enum ice_status +ice_aq_download_pkg(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf, + u16 buf_size, bool last_buf, u32 *error_offset, + u32 *error_info, struct ice_sq_cd *cd) { - struct ice_pkg_hdr *pkg; + struct ice_aqc_download_pkg *cmd; + struct ice_aq_desc desc; enum ice_status status; - struct ice_seg *seg; - if (!buf || !len) - return ICE_ERR_PARAM; - - pkg = (struct ice_pkg_hdr *)buf; - status = ice_verify_pkg(pkg, len); - if (status) { - ice_debug(hw, ICE_DBG_INIT, "failed to verify pkg (err: %d)\n", - status); - return status; - } - - /* initialize package info */ - status = ice_init_pkg_info(hw, pkg); - if (status) - return status; + if (error_offset) + *error_offset = 0; + if (error_info) + *error_info = 0; - /* before downloading the package, check package version for - * compatibility with driver - */ - status = ice_chk_pkg_version(&hw->pkg_ver); - if (status) - return status; + cmd = &desc.params.download_pkg; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_download_pkg); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - /* find segment in given package */ - seg = (struct ice_seg *)ice_find_seg_in_pkg(hw, SEGMENT_TYPE_ICE, pkg); - if (!seg) { - ice_debug(hw, ICE_DBG_INIT, "no ice segment in package.\n"); - return ICE_ERR_CFG; - } - /* download package */ - status = ice_download_pkg(hw, seg); - if (status == ICE_ERR_AQ_NO_WORK) { - ice_debug(hw, ICE_DBG_INIT, - "package previously loaded - no work.\n"); - status = 0; - } + if (last_buf) + cmd->flags |= ICE_AQC_DOWNLOAD_PKG_LAST_BUF; - /* Get information on the package currently loaded in HW, then make sure - * the driver is compatible with this version. - */ - if (!status) { - status = ice_get_pkg_info(hw); - if (!status) - status = ice_chk_pkg_version(&hw->active_pkg_ver); - } + status = ice_aq_send_cmd(hw, &desc, pkg_buf, buf_size, cd); + if (status == ICE_ERR_AQ_ERROR) { + /* Read error from buffer only when the FW returned an error */ + struct ice_aqc_download_pkg_resp *resp; - if (!status) { - hw->seg = seg; - /* on successful package download update other required - * registers to support the package and fill HW tables - * with package content. - */ - ice_init_pkg_regs(hw); - ice_fill_blk_tbls(hw); - } else { - ice_debug(hw, ICE_DBG_INIT, "package load failed, %d\n", - status); + resp = (struct ice_aqc_download_pkg_resp *)pkg_buf; + if (error_offset) + *error_offset = le32_to_cpu(resp->error_offset); + if (error_info) + *error_info = le32_to_cpu(resp->error_info); } return status; } /** - * ice_copy_and_init_pkg - initialize/download a copy of the package + * ice_aq_upload_section * @hw: pointer to the hardware structure - * @buf: pointer to the package buffer - * @len: size of the package buffer - * - * This function copies the package buffer, and then calls ice_init_pkg() to - * initialize the copied package contents. - * - * The copying is necessary if the package buffer supplied is constant, or if - * the memory may disappear shortly after calling this function. - * - * If the package buffer resides in the data segment and can be modified, the - * caller is free to use ice_init_pkg() instead of ice_copy_and_init_pkg(). 
- * - * However, if the package buffer needs to be copied first, such as when being - * read from a file, the caller should use ice_copy_and_init_pkg(). + * @pkg_buf: the package buffer which will receive the section + * @buf_size: the size of the package buffer + * @cd: pointer to command details structure or NULL * - * This function will first copy the package buffer, before calling - * ice_init_pkg(). The caller is free to immediately destroy the original - * package buffer, as the new copy will be managed by this function and - * related routines. + * Upload Section (0x0C41) */ -enum ice_status ice_copy_and_init_pkg(struct ice_hw *hw, const u8 *buf, u32 len) +enum ice_status +ice_aq_upload_section(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf, + u16 buf_size, struct ice_sq_cd *cd) { - enum ice_status status; - u8 *buf_copy; - - if (!buf || !len) - return ICE_ERR_PARAM; + struct ice_aq_desc desc; - buf_copy = devm_kmemdup(ice_hw_to_dev(hw), buf, len, GFP_KERNEL); + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_upload_section); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - status = ice_init_pkg(hw, buf_copy, len); - if (status) { - /* Free the copy, since we failed to initialize the package */ - devm_kfree(ice_hw_to_dev(hw), buf_copy); - } else { - /* Track the copied pkg so we can free it later */ - hw->pkg_copy = buf_copy; - hw->pkg_size = len; - } - return status; + return ice_aq_send_cmd(hw, &desc, pkg_buf, buf_size, cd); } -/* PTG Management */ - /** - * ice_ptg_find_ptype - Search for packet type group using packet type (ptype) + * ice_aq_update_pkg * @hw: pointer to the hardware structure - * @blk: HW block - * @ptype: the ptype to search for - * @ptg: pointer to variable that receives the PTG + * @pkg_buf: the package cmd buffer + * @buf_size: the size of the package cmd buffer + * @last_buf: last buffer indicator + * @error_offset: returns error offset + * @error_info: returns error information + * @cd: pointer to command details structure or NULL * - * This function will search the PTGs for a particular ptype, returning the - * PTG ID that contains it through the PTG parameter, with the value of - * ICE_DEFAULT_PTG (0) meaning it is part the default PTG. 
+ * Update Package (0x0C42) */ static enum ice_status -ice_ptg_find_ptype(struct ice_hw *hw, enum ice_block blk, u16 ptype, u8 *ptg) +ice_aq_update_pkg(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf, u16 buf_size, + bool last_buf, u32 *error_offset, u32 *error_info, + struct ice_sq_cd *cd) { - if (ptype >= ICE_XLT1_CNT || !ptg) - return ICE_ERR_PARAM; + struct ice_aqc_download_pkg *cmd; + struct ice_aq_desc desc; + enum ice_status status; - *ptg = hw->blk[blk].xlt1.ptypes[ptype].ptg; - return 0; + if (error_offset) + *error_offset = 0; + if (error_info) + *error_info = 0; + + cmd = &desc.params.download_pkg; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_update_pkg); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + + if (last_buf) + cmd->flags |= ICE_AQC_DOWNLOAD_PKG_LAST_BUF; + + status = ice_aq_send_cmd(hw, &desc, pkg_buf, buf_size, cd); + if (status == ICE_ERR_AQ_ERROR) { + /* Read error from buffer only when the FW returned an error */ + struct ice_aqc_download_pkg_resp *resp; + + resp = (struct ice_aqc_download_pkg_resp *)pkg_buf; + if (error_offset) + *error_offset = le32_to_cpu(resp->error_offset); + if (error_info) + *error_info = le32_to_cpu(resp->error_info); + } + + return status; } /** - * ice_ptg_alloc_val - Allocates a new packet type group ID by value + * ice_find_seg_in_pkg * @hw: pointer to the hardware structure - * @blk: HW block - * @ptg: the PTG to allocate + * @seg_type: the segment type to search for (i.e., SEGMENT_TYPE_CPK) + * @pkg_hdr: pointer to the package header to be searched * - * This function allocates a given packet type group ID specified by the PTG - * parameter. + * This function searches a package file for a particular segment type. On + * success it returns a pointer to the segment header, otherwise it will + * return NULL. */ -static void ice_ptg_alloc_val(struct ice_hw *hw, enum ice_block blk, u8 ptg) +static struct ice_generic_seg_hdr * +ice_find_seg_in_pkg(struct ice_hw *hw, u32 seg_type, + struct ice_pkg_hdr *pkg_hdr) { - hw->blk[blk].xlt1.ptg_tbl[ptg].in_use = true; + u32 i; + + ice_debug(hw, ICE_DBG_PKG, "Package format version: %d.%d.%d.%d\n", + pkg_hdr->pkg_format_ver.major, pkg_hdr->pkg_format_ver.minor, + pkg_hdr->pkg_format_ver.update, + pkg_hdr->pkg_format_ver.draft); + + /* Search all package segments for the requested segment type */ + for (i = 0; i < le32_to_cpu(pkg_hdr->seg_count); i++) { + struct ice_generic_seg_hdr *seg; + + seg = (struct ice_generic_seg_hdr *) + ((u8 *)pkg_hdr + le32_to_cpu(pkg_hdr->seg_offset[i])); + + if (le32_to_cpu(seg->seg_type) == seg_type) + return seg; + } + + return NULL; } /** - * ice_ptg_remove_ptype - Removes ptype from a particular packet type group + * ice_update_pkg_no_lock * @hw: pointer to the hardware structure - * @blk: HW block - * @ptype: the ptype to remove - * @ptg: the PTG to remove the ptype from - * - * This function will remove the ptype from the specific PTG, and move it to - * the default PTG (ICE_DEFAULT_PTG). 
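ice_find_seg_in_pkg() walks the package header's offset table and compares each generic segment header's type. A standalone sketch of that walk over simplified, host-endian structures (the driver's fields are __le32 and carry more members):

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the package layout. */
struct seg_hdr {
	uint32_t seg_type;
	uint32_t seg_size;
};

struct pkg_hdr {
	uint32_t seg_count;
	uint32_t seg_offset[];	/* byte offsets from the start of the package */
};

/* Walk the offset table and return the first segment of the requested type. */
static struct seg_hdr *find_seg(struct pkg_hdr *pkg, uint32_t seg_type)
{
	uint32_t i;

	for (i = 0; i < pkg->seg_count; i++) {
		struct seg_hdr *seg = (struct seg_hdr *)
			((uint8_t *)pkg + pkg->seg_offset[i]);

		if (seg->seg_type == seg_type)
			return seg;
	}
	return NULL;
}

int main(void)
{
	uint32_t buf[16] = { 0 };	/* 64-byte, 4-byte-aligned fake package */
	struct pkg_hdr *pkg = (struct pkg_hdr *)buf;
	struct seg_hdr *seg = (struct seg_hdr *)&buf[4];	/* byte offset 16 */

	pkg->seg_count = 1;
	pkg->seg_offset[0] = 16;
	seg->seg_type = 0x10;
	seg->seg_size = sizeof(*seg);

	printf("found segment: %s\n", find_seg(pkg, 0x10) ? "yes" : "no");
	return 0;
}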
+ * @bufs: pointer to an array of buffers + * @count: the number of buffers in the array */ static enum ice_status -ice_ptg_remove_ptype(struct ice_hw *hw, enum ice_block blk, u16 ptype, u8 ptg) +ice_update_pkg_no_lock(struct ice_hw *hw, struct ice_buf *bufs, u32 count) { - struct ice_ptg_ptype **ch; - struct ice_ptg_ptype *p; - - if (ptype > ICE_XLT1_CNT - 1) - return ICE_ERR_PARAM; + enum ice_status status = 0; + u32 i; - if (!hw->blk[blk].xlt1.ptg_tbl[ptg].in_use) - return ICE_ERR_DOES_NOT_EXIST; + for (i = 0; i < count; i++) { + struct ice_buf_hdr *bh = (struct ice_buf_hdr *)(bufs + i); + bool last = ((i + 1) == count); + u32 offset, info; - /* Should not happen if .in_use is set, bad config */ - if (!hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype) - return ICE_ERR_CFG; + status = ice_aq_update_pkg(hw, bh, le16_to_cpu(bh->data_end), + last, &offset, &info, NULL); - /* find the ptype within this PTG, and bypass the link over it */ - p = hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype; - ch = &hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype; - while (p) { - if (ptype == (p - hw->blk[blk].xlt1.ptypes)) { - *ch = p->next_ptype; + if (status) { + ice_debug(hw, ICE_DBG_PKG, "Update pkg failed: err %d off %d inf %d\n", + status, offset, info); break; } - - ch = &p->next_ptype; - p = p->next_ptype; } - hw->blk[blk].xlt1.ptypes[ptype].ptg = ICE_DEFAULT_PTG; - hw->blk[blk].xlt1.ptypes[ptype].next_ptype = NULL; - - return 0; + return status; } /** - * ice_ptg_add_mv_ptype - Adds/moves ptype to a particular packet type group + * ice_update_pkg * @hw: pointer to the hardware structure - * @blk: HW block - * @ptype: the ptype to add or move - * @ptg: the PTG to add or move the ptype to + * @bufs: pointer to an array of buffers + * @count: the number of buffers in the array * - * This function will either add or move a ptype to a particular PTG depending - * on if the ptype is already part of another group. Note that using a - * a destination PTG ID of ICE_DEFAULT_PTG (0) will move the ptype to the - * default PTG. + * Obtains change lock and updates package. */ static enum ice_status -ice_ptg_add_mv_ptype(struct ice_hw *hw, enum ice_block blk, u16 ptype, u8 ptg) +ice_update_pkg(struct ice_hw *hw, struct ice_buf *bufs, u32 count) { enum ice_status status; - u8 original_ptg; - - if (ptype > ICE_XLT1_CNT - 1) - return ICE_ERR_PARAM; - if (!hw->blk[blk].xlt1.ptg_tbl[ptg].in_use && ptg != ICE_DEFAULT_PTG) - return ICE_ERR_DOES_NOT_EXIST; - - status = ice_ptg_find_ptype(hw, blk, ptype, &original_ptg); + status = ice_acquire_change_lock(hw, ICE_RES_WRITE); if (status) return status; - /* Is ptype already in the correct PTG? */ - if (original_ptg == ptg) - return 0; + status = ice_update_pkg_no_lock(hw, bufs, count); - /* Remove from original PTG and move back to the default PTG */ - if (original_ptg != ICE_DEFAULT_PTG) - ice_ptg_remove_ptype(hw, blk, ptype, original_ptg); + ice_release_change_lock(hw); - /* Moving to default PTG? 
Then we're done with this request */ - if (ptg == ICE_DEFAULT_PTG) - return 0; + return status; +} - /* Add ptype to PTG at beginning of list */ - hw->blk[blk].xlt1.ptypes[ptype].next_ptype = - hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype; - hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype = - &hw->blk[blk].xlt1.ptypes[ptype]; - - hw->blk[blk].xlt1.ptypes[ptype].ptg = ptg; - hw->blk[blk].xlt1.t[ptype] = ptg; +/** + * ice_dwnld_cfg_bufs + * @hw: pointer to the hardware structure + * @bufs: pointer to an array of buffers + * @count: the number of buffers in the array + * + * Obtains global config lock and downloads the package configuration buffers + * to the firmware. Metadata buffers are skipped, and the first metadata buffer + * found indicates that the rest of the buffers are all metadata buffers. + */ +static enum ice_status +ice_dwnld_cfg_bufs(struct ice_hw *hw, struct ice_buf *bufs, u32 count) +{ + enum ice_status status; + struct ice_buf_hdr *bh; + u32 offset, info, i; - return 0; -} + if (!bufs || !count) + return ICE_ERR_PARAM; -/* Block / table size info */ -struct ice_blk_size_details { - u16 xlt1; /* # XLT1 entries */ - u16 xlt2; /* # XLT2 entries */ - u16 prof_tcam; /* # profile ID TCAM entries */ - u16 prof_id; /* # profile IDs */ - u8 prof_cdid_bits; /* # CDID one-hot bits used in key */ - u16 prof_redir; /* # profile redirection entries */ - u16 es; /* # extraction sequence entries */ - u16 fvw; /* # field vector words */ - u8 overwrite; /* overwrite existing entries allowed */ - u8 reverse; /* reverse FV order */ -}; + /* If the first buffer's first section has its metadata bit set + * then there are no buffers to be downloaded, and the operation is + * considered a success. + */ + bh = (struct ice_buf_hdr *)bufs; + if (le32_to_cpu(bh->section_entry[0].type) & ICE_METADATA_BUF) + return 0; -static const struct ice_blk_size_details blk_sizes[ICE_BLK_COUNT] = { - /** - * Table Definitions - * XLT1 - Number of entries in XLT1 table - * XLT2 - Number of entries in XLT2 table - * TCAM - Number of entries Profile ID TCAM table - * CDID - Control Domain ID of the hardware block - * PRED - Number of entries in the Profile Redirection Table - * FV - Number of entries in the Field Vector - * FVW - Width (in WORDs) of the Field Vector - * OVR - Overwrite existing table entries - * REV - Reverse FV + /* reset pkg_dwnld_status in case this function is called in the + * reset/rebuild flow */ - /* XLT1 , XLT2 ,TCAM, PID,CDID,PRED, FV, FVW */ - /* Overwrite , Reverse FV */ - /* SW */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 512, 256, 0, 256, 256, 48, - false, false }, - /* ACL */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 512, 128, 0, 128, 128, 32, - false, false }, - /* FD */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 512, 128, 0, 128, 128, 24, - false, true }, - /* RSS */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 512, 128, 0, 128, 128, 24, - true, true }, - /* PE */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 64, 32, 0, 32, 32, 24, - false, false }, -}; + hw->pkg_dwnld_status = ICE_AQ_RC_OK; -enum ice_sid_all { - ICE_SID_XLT1_OFF = 0, - ICE_SID_XLT2_OFF, - ICE_SID_PR_OFF, - ICE_SID_PR_REDIR_OFF, - ICE_SID_ES_OFF, - ICE_SID_OFF_COUNT, -}; + status = ice_acquire_global_cfg_lock(hw, ICE_RES_WRITE); + if (status) { + if (status == ICE_ERR_AQ_NO_WORK) + hw->pkg_dwnld_status = ICE_AQ_RC_EEXIST; + else + hw->pkg_dwnld_status = hw->adminq.sq_last_status; + return status; + } -/* VSIG Management */ + for (i = 0; i < count; i++) { + bool last = ((i + 1) == count); + + if (!last) { + /* check next buffer for metadata flag */ + bh = (struct ice_buf_hdr *)(bufs + i + 
1); + + /* A set metadata flag in the next buffer will signal + * that the current buffer will be the last buffer + * downloaded + */ + if (le16_to_cpu(bh->section_count)) + if (le32_to_cpu(bh->section_entry[0].type) & + ICE_METADATA_BUF) + last = true; + } + + bh = (struct ice_buf_hdr *)(bufs + i); + + status = ice_aq_download_pkg(hw, bh, ICE_PKG_BUF_SIZE, last, + &offset, &info, NULL); + + /* Save AQ status from download package */ + hw->pkg_dwnld_status = hw->adminq.sq_last_status; + if (status) { + ice_debug(hw, ICE_DBG_PKG, "Pkg download failed: err %d off %d inf %d\n", + status, offset, info); + + break; + } + + if (last) + break; + } + + if (!status) { + status = ice_set_vlan_mode(hw); + if (status) + ice_debug(hw, ICE_DBG_PKG, "Failed to set VLAN mode: err %d\n", + status); + } + + ice_release_global_cfg_lock(hw); + + return status; +} /** - * ice_vsig_find_vsi - find a VSIG that contains a specified VSI + * ice_aq_get_pkg_info_list * @hw: pointer to the hardware structure - * @blk: HW block - * @vsi: VSI of interest - * @vsig: pointer to receive the VSI group + * @pkg_info: the buffer which will receive the information list + * @buf_size: the size of the pkg_info information buffer + * @cd: pointer to command details structure or NULL * - * This function will lookup the VSI entry in the XLT2 list and return - * the VSI group its associated with. + * Get Package Info List (0x0C43) */ static enum ice_status -ice_vsig_find_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 *vsig) +ice_aq_get_pkg_info_list(struct ice_hw *hw, + struct ice_aqc_get_pkg_info_resp *pkg_info, + u16 buf_size, struct ice_sq_cd *cd) { - if (!vsig || vsi >= ICE_MAX_VSI) - return ICE_ERR_PARAM; + struct ice_aq_desc desc; - /* As long as there's a default or valid VSIG associated with the input - * VSI, the functions returns a success. Any handling of VSIG will be - * done by the following add, update or remove functions. - */ - *vsig = hw->blk[blk].xlt2.vsis[vsi].vsig; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_pkg_info_list); - return 0; + + return ice_aq_send_cmd(hw, &desc, pkg_info, buf_size, cd); } /** - * ice_vsig_alloc_val - allocate a new VSIG by value + * ice_download_pkg * @hw: pointer to the hardware structure - * @blk: HW block - * @vsig: the VSIG to allocate + * @ice_seg: pointer to the segment of the package to be downloaded * - * This function will allocate a given VSIG specified by the VSIG parameter. + * Handles the download of a complete package. 
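ice_dwnld_cfg_bufs() stops at the first metadata buffer: a metadata buffer up front means there is nothing to download, and a metadata buffer coming next marks the current buffer as the final one sent with the "last" flag. A small sketch of that loop, with send_buf() standing in for the AQ download command and METADATA_FLAG for ICE_METADATA_BUF (both illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define METADATA_FLAG 0x80000000u	/* illustrative stand-in for ICE_METADATA_BUF */

/* Simplified buffer header: just enough to show the "last buffer" logic. */
struct buf_hdr {
	uint16_t section_count;
	uint32_t first_section_type;
};

/* Stand-in for the AQ download-package command. */
static int send_buf(const struct buf_hdr *bh, bool last)
{
	printf("download buf type=0x%08x last=%d\n", bh->first_section_type, last);
	return 0;
}

static int download_bufs(const struct buf_hdr *bufs, uint32_t count)
{
	uint32_t i;

	/* A metadata buffer up front means there is nothing to download. */
	if (count && (bufs[0].first_section_type & METADATA_FLAG))
		return 0;

	for (i = 0; i < count; i++) {
		bool last = (i + 1 == count);
		int err;

		/* A metadata buffer *next* means this one is the last real one. */
		if (!last && bufs[i + 1].section_count &&
		    (bufs[i + 1].first_section_type & METADATA_FLAG))
			last = true;

		err = send_buf(&bufs[i], last);
		if (err || last)
			return err;
	}
	return 0;
}

int main(void)
{
	struct buf_hdr bufs[3] = {
		{ 1, 0x10 }, { 1, 0x11 }, { 1, METADATA_FLAG | 0x1 },
	};

	return download_bufs(bufs, 3);	/* metadata buffer itself is never sent */
}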
*/ -static u16 ice_vsig_alloc_val(struct ice_hw *hw, enum ice_block blk, u16 vsig) +static enum ice_status +ice_download_pkg(struct ice_hw *hw, struct ice_seg *ice_seg) { - u16 idx = vsig & ICE_VSIG_IDX_M; + struct ice_buf_table *ice_buf_tbl; + enum ice_status status; - if (!hw->blk[blk].xlt2.vsig_tbl[idx].in_use) { - INIT_LIST_HEAD(&hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst); - hw->blk[blk].xlt2.vsig_tbl[idx].in_use = true; - } + ice_debug(hw, ICE_DBG_PKG, "Segment format version: %d.%d.%d.%d\n", + ice_seg->hdr.seg_format_ver.major, + ice_seg->hdr.seg_format_ver.minor, + ice_seg->hdr.seg_format_ver.update, + ice_seg->hdr.seg_format_ver.draft); - return ICE_VSIG_VALUE(idx, hw->pf_id); + ice_debug(hw, ICE_DBG_PKG, "Seg: type 0x%X, size %d, name %s\n", + le32_to_cpu(ice_seg->hdr.seg_type), + le32_to_cpu(ice_seg->hdr.seg_size), ice_seg->hdr.seg_id); + + ice_buf_tbl = ice_find_buf_table(ice_seg); + + ice_debug(hw, ICE_DBG_PKG, "Seg buf count: %d\n", + le32_to_cpu(ice_buf_tbl->buf_count)); + + status = ice_dwnld_cfg_bufs(hw, ice_buf_tbl->buf_array, + le32_to_cpu(ice_buf_tbl->buf_count)); + + ice_post_pkg_dwnld_vlan_mode_cfg(hw); + + return status; } /** - * ice_vsig_remove_vsi - remove VSI from VSIG + * ice_init_pkg_info * @hw: pointer to the hardware structure - * @blk: HW block - * @vsi: VSI to remove - * @vsig: VSI group to remove from + * @pkg_hdr: pointer to the driver's package hdr * - * The function will remove the input VSI from its VSI group and move it - * to the DEFAULT_VSIG. + * Saves off the package details into the HW structure. */ static enum ice_status -ice_vsig_remove_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 vsig) +ice_init_pkg_info(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr) { - struct ice_vsig_vsi **vsi_head, *vsi_cur, *vsi_tgt; - u16 idx; - - idx = vsig & ICE_VSIG_IDX_M; + struct ice_generic_seg_hdr *seg_hdr; - if (vsi >= ICE_MAX_VSI || idx >= ICE_MAX_VSIGS) + if (!pkg_hdr) return ICE_ERR_PARAM; - if (!hw->blk[blk].xlt2.vsig_tbl[idx].in_use) - return ICE_ERR_DOES_NOT_EXIST; - - /* entry already in default VSIG, don't have to remove */ - if (idx == ICE_DEFAULT_VSIG) - return 0; - - vsi_head = &hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi; - if (!(*vsi_head)) - return ICE_ERR_CFG; + seg_hdr = (struct ice_generic_seg_hdr *) + ice_find_seg_in_pkg(hw, SEGMENT_TYPE_ICE, pkg_hdr); + if (seg_hdr) { + struct ice_meta_sect *meta; + struct ice_pkg_enum state; - vsi_tgt = &hw->blk[blk].xlt2.vsis[vsi]; - vsi_cur = (*vsi_head); + memset(&state, 0, sizeof(state)); - /* iterate the VSI list, skip over the entry to be removed */ - while (vsi_cur) { - if (vsi_tgt == vsi_cur) { - (*vsi_head) = vsi_cur->next_vsi; - break; + /* Get package information from the Metadata Section */ + meta = ice_pkg_enum_section((struct ice_seg *)seg_hdr, &state, + ICE_SID_METADATA); + if (!meta) { + ice_debug(hw, ICE_DBG_INIT, "Did not find ice metadata section in package\n"); + return ICE_ERR_CFG; } - vsi_head = &vsi_cur->next_vsi; - vsi_cur = vsi_cur->next_vsi; - } - /* verify if VSI was removed from group list */ - if (!vsi_cur) - return ICE_ERR_DOES_NOT_EXIST; + hw->pkg_ver = meta->ver; + memcpy(hw->pkg_name, meta->name, sizeof(meta->name)); - vsi_cur->vsig = ICE_DEFAULT_VSIG; - vsi_cur->changed = 1; - vsi_cur->next_vsi = NULL; + ice_debug(hw, ICE_DBG_PKG, "Pkg: %d.%d.%d.%d, %s\n", + meta->ver.major, meta->ver.minor, meta->ver.update, + meta->ver.draft, meta->name); + + hw->ice_seg_fmt_ver = seg_hdr->seg_format_ver; + memcpy(hw->ice_seg_id, seg_hdr->seg_id, + sizeof(hw->ice_seg_id)); + + 
ice_debug(hw, ICE_DBG_PKG, "Ice Seg: %d.%d.%d.%d, %s\n", + seg_hdr->seg_format_ver.major, + seg_hdr->seg_format_ver.minor, + seg_hdr->seg_format_ver.update, + seg_hdr->seg_format_ver.draft, + seg_hdr->seg_id); + } else { + ice_debug(hw, ICE_DBG_INIT, "Did not find ice segment in driver package\n"); + return ICE_ERR_CFG; + } return 0; } /** - * ice_vsig_add_mv_vsi - add or move a VSI to a VSI group + * ice_get_pkg_info * @hw: pointer to the hardware structure - * @blk: HW block - * @vsi: VSI to move - * @vsig: destination VSI group * - * This function will move or add the input VSI to the target VSIG. - * The function will find the original VSIG the VSI belongs to and - * move the entry to the DEFAULT_VSIG, update the original VSIG and - * then move entry to the new VSIG. + * Store details of the package currently loaded in HW into the HW structure. */ -static enum ice_status -ice_vsig_add_mv_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 vsig) +static enum ice_status ice_get_pkg_info(struct ice_hw *hw) { - struct ice_vsig_vsi *tmp; + struct ice_aqc_get_pkg_info_resp *pkg_info; enum ice_status status; - u16 orig_vsig, idx; + u16 size; + u32 i; - idx = vsig & ICE_VSIG_IDX_M; + size = struct_size(pkg_info, pkg_info, ICE_PKG_CNT); + pkg_info = devm_kzalloc(ice_hw_to_dev(hw), size, GFP_KERNEL); + if (!pkg_info) + return ICE_ERR_NO_MEMORY; - if (vsi >= ICE_MAX_VSI || idx >= ICE_MAX_VSIGS) - return ICE_ERR_PARAM; - - /* if VSIG not in use and VSIG is not default type this VSIG - * doesn't exist. - */ - if (!hw->blk[blk].xlt2.vsig_tbl[idx].in_use && - vsig != ICE_DEFAULT_VSIG) - return ICE_ERR_DOES_NOT_EXIST; - - status = ice_vsig_find_vsi(hw, blk, vsi, &orig_vsig); + status = ice_aq_get_pkg_info_list(hw, pkg_info, size, NULL); if (status) - return status; + goto init_pkg_free_alloc; - /* no update required if vsigs match */ - if (orig_vsig == vsig) - return 0; + for (i = 0; i < le32_to_cpu(pkg_info->count); i++) { +#define ICE_PKG_FLAG_COUNT 4 + char flags[ICE_PKG_FLAG_COUNT + 1] = { 0 }; + u8 place = 0; - if (orig_vsig != ICE_DEFAULT_VSIG) { - /* remove entry from orig_vsig and add to default VSIG */ - status = ice_vsig_remove_vsi(hw, blk, vsi, orig_vsig); - if (status) - return status; + if (pkg_info->pkg_info[i].is_active) { + flags[place++] = 'A'; + hw->active_pkg_ver = pkg_info->pkg_info[i].ver; + hw->active_track_id = + le32_to_cpu(pkg_info->pkg_info[i].track_id); + memcpy(hw->active_pkg_name, + pkg_info->pkg_info[i].name, + sizeof(pkg_info->pkg_info[i].name)); + hw->active_pkg_in_nvm = pkg_info->pkg_info[i].is_in_nvm; + } + if (pkg_info->pkg_info[i].is_active_at_boot) + flags[place++] = 'B'; + if (pkg_info->pkg_info[i].is_modified) + flags[place++] = 'M'; + if (pkg_info->pkg_info[i].is_in_nvm) + flags[place++] = 'N'; + + ice_debug(hw, ICE_DBG_PKG, "Pkg[%d]: %d.%d.%d.%d,%s,%s\n", + i, pkg_info->pkg_info[i].ver.major, + pkg_info->pkg_info[i].ver.minor, + pkg_info->pkg_info[i].ver.update, + pkg_info->pkg_info[i].ver.draft, + pkg_info->pkg_info[i].name, flags); } - if (idx == ICE_DEFAULT_VSIG) - return 0; +init_pkg_free_alloc: + devm_kfree(ice_hw_to_dev(hw), pkg_info); - /* Create VSI entry and add VSIG and prop_mask values */ - hw->blk[blk].xlt2.vsis[vsi].vsig = vsig; - hw->blk[blk].xlt2.vsis[vsi].changed = 1; + return status; +} - /* Add new entry to the head of the VSIG list */ - tmp = hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi; - hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi = - &hw->blk[blk].xlt2.vsis[vsi]; - hw->blk[blk].xlt2.vsis[vsi].next_vsi = tmp; - hw->blk[blk].xlt2.t[vsi] = 
vsig; +/** + * ice_find_label_value + * @ice_seg: pointer to the ice segment (non-NULL) + * @name: name of the label to search for + * @type: the section type that will contain the label + * @value: pointer to a value that will return the label's value if found + * + * Finds a label's value given the label name and the section type to search. + * The ice_seg parameter must not be NULL since the first call to + * ice_enum_labels requires a pointer to an actual ice_seg structure. + */ +enum ice_status +ice_find_label_value(struct ice_seg *ice_seg, char const *name, u32 type, + u16 *value) +{ + struct ice_pkg_enum state; + char *label_name; + u16 val; - return 0; + memset(&state, 0, sizeof(state)); + + if (!ice_seg) + return ICE_ERR_PARAM; + + do { + label_name = ice_enum_labels(ice_seg, type, &state, &val); + if (label_name && !strcmp(label_name, name)) { + *value = val; + return 0; + } + + ice_seg = NULL; + } while (label_name); + + return ICE_ERR_CFG; } -/* Block / table section IDs */ -static const u32 ice_blk_sids[ICE_BLK_COUNT][ICE_SID_OFF_COUNT] = { - /* SWITCH */ - { ICE_SID_XLT1_SW, - ICE_SID_XLT2_SW, - ICE_SID_PROFID_TCAM_SW, - ICE_SID_PROFID_REDIR_SW, - ICE_SID_FLD_VEC_SW - }, +/** + * ice_verify_pkg - verify package + * @pkg: pointer to the package buffer + * @len: size of the package buffer + * + * Verifies various attributes of the package file, including length, format + * version, and the requirement of at least one segment. + */ +static enum ice_status ice_verify_pkg(struct ice_pkg_hdr *pkg, u32 len) +{ + u32 seg_count; + u32 i; - /* ACL */ - { ICE_SID_XLT1_ACL, - ICE_SID_XLT2_ACL, - ICE_SID_PROFID_TCAM_ACL, - ICE_SID_PROFID_REDIR_ACL, - ICE_SID_FLD_VEC_ACL - }, + if (len < struct_size(pkg, seg_offset, 1)) + return ICE_ERR_BUF_TOO_SHORT; - /* FD */ - { ICE_SID_XLT1_FD, - ICE_SID_XLT2_FD, - ICE_SID_PROFID_TCAM_FD, - ICE_SID_PROFID_REDIR_FD, - ICE_SID_FLD_VEC_FD - }, + if (pkg->pkg_format_ver.major != ICE_PKG_FMT_VER_MAJ || + pkg->pkg_format_ver.minor != ICE_PKG_FMT_VER_MNR || + pkg->pkg_format_ver.update != ICE_PKG_FMT_VER_UPD || + pkg->pkg_format_ver.draft != ICE_PKG_FMT_VER_DFT) + return ICE_ERR_CFG; - /* RSS */ - { ICE_SID_XLT1_RSS, - ICE_SID_XLT2_RSS, - ICE_SID_PROFID_TCAM_RSS, - ICE_SID_PROFID_REDIR_RSS, - ICE_SID_FLD_VEC_RSS - }, + /* pkg must have at least one segment */ + seg_count = le32_to_cpu(pkg->seg_count); + if (seg_count < 1) + return ICE_ERR_CFG; - /* PE */ - { ICE_SID_XLT1_PE, - ICE_SID_XLT2_PE, - ICE_SID_PROFID_TCAM_PE, - ICE_SID_PROFID_REDIR_PE, - ICE_SID_FLD_VEC_PE + /* make sure segment array fits in package length */ + if (len < struct_size(pkg, seg_offset, seg_count)) + return ICE_ERR_BUF_TOO_SHORT; + + /* all segments must fit within length */ + for (i = 0; i < seg_count; i++) { + u32 off = le32_to_cpu(pkg->seg_offset[i]); + struct ice_generic_seg_hdr *seg; + + /* segment header must fit */ + if (len < off + sizeof(*seg)) + return ICE_ERR_BUF_TOO_SHORT; + + seg = (struct ice_generic_seg_hdr *)((u8 *)pkg + off); + + /* segment body must fit */ + if (len < off + le32_to_cpu(seg->seg_size)) + return ICE_ERR_BUF_TOO_SHORT; } -}; + + return 0; +} /** - * ice_init_sw_xlt1_db - init software XLT1 database from HW tables + * ice_free_seg - free package segment pointer * @hw: pointer to the hardware structure - * @blk: the HW block to initialize + * + * Frees the package segment pointer in the proper manner, depending on if the + * segment was allocated or just the passed in pointer was stored. 
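ice_verify_pkg() is a chain of bounds checks: the fixed header plus its offset table must fit in the supplied length, and every segment header and body must lie entirely inside it. A simplified sketch, with pkg_hdr_size() playing the role of struct_size() and the version check omitted:

#include <stddef.h>
#include <stdint.h>

struct seg_hdr {
	uint32_t seg_type;
	uint32_t seg_size;
};

struct pkg_hdr {
	uint32_t seg_count;
	uint32_t seg_offset[];
};

/* Rough equivalent of struct_size(pkg, seg_offset, n) for this layout. */
static size_t pkg_hdr_size(uint32_t n)
{
	return offsetof(struct pkg_hdr, seg_offset) + (size_t)n * sizeof(uint32_t);
}

/* 0 on success, -1 on any bounds/consistency failure. */
static int verify_pkg(const void *buf, uint32_t len)
{
	const struct pkg_hdr *pkg = buf;
	uint32_t i;

	if (len < pkg_hdr_size(1))
		return -1;
	if (pkg->seg_count < 1 || len < pkg_hdr_size(pkg->seg_count))
		return -1;

	for (i = 0; i < pkg->seg_count; i++) {
		uint32_t off = pkg->seg_offset[i];
		const struct seg_hdr *seg;

		if (len < off + sizeof(*seg))		/* header must fit */
			return -1;
		seg = (const struct seg_hdr *)((const uint8_t *)buf + off);
		if (len < off + seg->seg_size)		/* body must fit */
			return -1;
	}
	return 0;
}

int main(void)
{
	uint32_t buf[8] = { 0 };			/* 32-byte fake package */
	struct pkg_hdr *pkg = (struct pkg_hdr *)buf;
	struct seg_hdr *seg = (struct seg_hdr *)&buf[4];

	pkg->seg_count = 1;
	pkg->seg_offset[0] = 16;
	seg->seg_type = 0x10;
	seg->seg_size = sizeof(*seg);

	return verify_pkg(buf, sizeof(buf));		/* 0 == valid */
}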
*/ -static void ice_init_sw_xlt1_db(struct ice_hw *hw, enum ice_block blk) +void ice_free_seg(struct ice_hw *hw) { - u16 pt; - - for (pt = 0; pt < hw->blk[blk].xlt1.count; pt++) { - u8 ptg; - - ptg = hw->blk[blk].xlt1.t[pt]; - if (ptg != ICE_DEFAULT_PTG) { - ice_ptg_alloc_val(hw, blk, ptg); - ice_ptg_add_mv_ptype(hw, blk, pt, ptg); - } + if (hw->pkg_copy) { + devm_kfree(ice_hw_to_dev(hw), hw->pkg_copy); + hw->pkg_copy = NULL; + hw->pkg_size = 0; } + hw->seg = NULL; } /** - * ice_init_sw_xlt2_db - init software XLT2 database from HW tables + * ice_init_pkg_regs - initialize additional package registers * @hw: pointer to the hardware structure - * @blk: the HW block to initialize */ -static void ice_init_sw_xlt2_db(struct ice_hw *hw, enum ice_block blk) +static void ice_init_pkg_regs(struct ice_hw *hw) { - u16 vsi; - - for (vsi = 0; vsi < hw->blk[blk].xlt2.count; vsi++) { - u16 vsig; +#define ICE_SW_BLK_INP_MASK_L 0xFFFFFFFF +#define ICE_SW_BLK_INP_MASK_H 0x0000FFFF +#define ICE_SW_BLK_IDX 0 - vsig = hw->blk[blk].xlt2.t[vsi]; - if (vsig) { - ice_vsig_alloc_val(hw, blk, vsig); - ice_vsig_add_mv_vsi(hw, blk, vsi, vsig); - /* no changes at this time, since this has been - * initialized from the original package - */ - hw->blk[blk].xlt2.vsis[vsi].changed = 0; - } - } + /* setup Switch block input mask, which is 48-bits in two parts */ + wr32(hw, GL_PREEXT_L2_PMASK0(ICE_SW_BLK_IDX), ICE_SW_BLK_INP_MASK_L); + wr32(hw, GL_PREEXT_L2_PMASK1(ICE_SW_BLK_IDX), ICE_SW_BLK_INP_MASK_H); } /** - * ice_init_sw_db - init software database from HW tables - * @hw: pointer to the hardware structure + * ice_chk_pkg_version - check package version for compatibility with driver + * @pkg_ver: pointer to a version structure to check + * + * Check to make sure that the package about to be downloaded is compatible with + * the driver. To be compatible, the major and minor components of the package + * version must match our ICE_PKG_SUPP_VER_MAJ and ICE_PKG_SUPP_VER_MNR + * definitions. */ -static void ice_init_sw_db(struct ice_hw *hw) +static enum ice_status ice_chk_pkg_version(struct ice_pkg_ver *pkg_ver) { - u16 i; + /* The major is 0xFF indicates that it is a custom DDP */ + if (pkg_ver->major == 0xFF) + return 0; - for (i = 0; i < ICE_BLK_COUNT; i++) { - ice_init_sw_xlt1_db(hw, (enum ice_block)i); - ice_init_sw_xlt2_db(hw, (enum ice_block)i); - } + if (pkg_ver->major != ICE_PKG_SUPP_VER_MAJ || + pkg_ver->minor != ICE_PKG_SUPP_VER_MNR) + return ICE_ERR_NOT_SUPPORTED; + + return 0; } /** - * ice_fill_tbl - Reads content of a single table type into database + * ice_chk_pkg_compat * @hw: pointer to the hardware structure - * @block_id: Block ID of the table to copy - * @sid: Section ID of the table to copy + * @ospkg: pointer to the package hdr + * @seg: pointer to the package segment hdr * - * Will attempt to read the entire content of a given table of a single block - * into the driver database. We assume that the buffer will always - * be as large or larger than the data contained in the package. If - * this condition is not met, there is most likely an error in the package - * contents. 
+ * This function checks the package version compatibility with driver and NVM */ -static void ice_fill_tbl(struct ice_hw *hw, enum ice_block block_id, u32 sid) +static enum ice_status +ice_chk_pkg_compat(struct ice_hw *hw, struct ice_pkg_hdr *ospkg, + struct ice_seg **seg) { - u32 dst_len, sect_len, offset = 0; - struct ice_prof_redir_section *pr; - struct ice_prof_id_section *pid; - struct ice_xlt1_section *xlt1; - struct ice_xlt2_section *xlt2; - struct ice_sw_fv_section *es; - struct ice_pkg_enum state; - u8 *src, *dst; - void *sect; + struct ice_aqc_get_pkg_info_resp *pkg; + enum ice_status status; + u16 size; + u32 i; - /* if the HW segment pointer is null then the first iteration of - * ice_pkg_enum_section() will fail. In this case the HW tables will - * not be filled and return success. - */ - if (!hw->seg) { - ice_debug(hw, ICE_DBG_PKG, "hw->seg is NULL, tables are not filled\n"); - return; + /* Check package version compatibility */ + status = ice_chk_pkg_version(&hw->pkg_ver); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Package version check failed.\n"); + return status; } - memset(&state, 0, sizeof(state)); + /* find ICE segment in given package */ + *seg = (struct ice_seg *)ice_find_seg_in_pkg(hw, SEGMENT_TYPE_ICE, + ospkg); + if (!*seg) { + ice_debug(hw, ICE_DBG_INIT, "no ice segment in package.\n"); + return ICE_ERR_CFG; + } - sect = ice_pkg_enum_section(hw->seg, &state, sid); + /* Check if FW is compatible with the OS package */ + size = struct_size(pkg, pkg_info, ICE_PKG_CNT); + pkg = devm_kzalloc(ice_hw_to_dev(hw), size, GFP_KERNEL); + if (!pkg) + return ICE_ERR_NO_MEMORY; + + status = ice_aq_get_pkg_info_list(hw, pkg, size, NULL); + if (status) + goto fw_ddp_compat_free_alloc; + + for (i = 0; i < le32_to_cpu(pkg->count); i++) { + /* loop till we find the NVM package */ + if (!pkg->pkg_info[i].is_in_nvm) + continue; + if ((*seg)->hdr.seg_format_ver.major != + pkg->pkg_info[i].ver.major || + (*seg)->hdr.seg_format_ver.minor > + pkg->pkg_info[i].ver.minor) { + status = ICE_ERR_FW_DDP_MISMATCH; + ice_debug(hw, ICE_DBG_INIT, "OS package is not compatible with NVM.\n"); + } + /* done processing NVM package so break */ + break; + } +fw_ddp_compat_free_alloc: + devm_kfree(ice_hw_to_dev(hw), pkg); + return status; +} + +/** + * ice_sw_fv_handler + * @sect_type: section type + * @section: pointer to section + * @index: index of the field vector entry to be returned + * @offset: ptr to variable that receives the offset in the field vector table + * + * This is a callback function that can be passed to ice_pkg_enum_entry. + * This function treats the given section as of type ice_sw_fv_section and + * enumerates offset field. "offset" is an index into the field vector table. + */ +static void * +ice_sw_fv_handler(u32 sect_type, void *section, u32 index, u32 *offset) +{ + struct ice_sw_fv_section *fv_section = section; + + if (!section || sect_type != ICE_SID_FLD_VEC_SW) + return NULL; + if (index >= le16_to_cpu(fv_section->count)) + return NULL; + if (offset) + /* "index" passed in to this function is relative to a given + * 4k block. 
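The NVM compatibility rule in ice_chk_pkg_compat() reduces to: the ICE segment's major format version must equal the NVM package's major, and its minor must not be newer than the NVM package's minor. A tiny sketch of that rule:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * OS-package vs. NVM-package check: major must match exactly and the
 * segment's minor must not be newer than what the NVM package reports.
 */
static bool ddp_compatible(uint8_t seg_major, uint8_t seg_minor,
			   uint8_t nvm_major, uint8_t nvm_minor)
{
	if (seg_major != nvm_major)
		return false;
	if (seg_minor > nvm_minor)
		return false;
	return true;
}

int main(void)
{
	printf("%d %d %d\n",
	       ddp_compatible(1, 3, 1, 3),	/* same version: compatible */
	       ddp_compatible(1, 4, 1, 3),	/* newer minor: mismatch */
	       ddp_compatible(2, 0, 1, 3));	/* different major: mismatch */
	return 0;
}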
To get to the true index into the field vector + * table need to add the relative index to the base_offset + * field of this section + */ + *offset = le16_to_cpu(fv_section->base_offset) + index; + return fv_section->fv + index; +} + +/** + * ice_get_prof_index_max - get the max profile index for used profile + * @hw: pointer to the HW struct + * + * Calling this function will get the max profile index for used profile + * and store the index number in struct ice_switch_info *switch_info + * in hw for following use. + */ +static int ice_get_prof_index_max(struct ice_hw *hw) +{ + u16 prof_index = 0, j, max_prof_index = 0; + struct ice_pkg_enum state; + struct ice_seg *ice_seg; + bool flag = false; + struct ice_fv *fv; + u32 offset; + + memset(&state, 0, sizeof(state)); + + if (!hw->seg) + return ICE_ERR_PARAM; + + ice_seg = hw->seg; + + do { + fv = ice_pkg_enum_entry(ice_seg, &state, ICE_SID_FLD_VEC_SW, + &offset, ice_sw_fv_handler); + if (!fv) + break; + ice_seg = NULL; + + /* in the profile that not be used, the prot_id is set to 0xff + * and the off is set to 0x1ff for all the field vectors. + */ + for (j = 0; j < hw->blk[ICE_BLK_SW].es.fvw; j++) + if (fv->ew[j].prot_id != ICE_PROT_INVALID || + fv->ew[j].off != ICE_FV_OFFSET_INVAL) + flag = true; + if (flag && prof_index > max_prof_index) + max_prof_index = prof_index; + + prof_index++; + flag = false; + } while (fv); + + hw->switch_info->max_used_prof_index = max_prof_index; + + return 0; +} + +/** + * ice_init_pkg - initialize/download package + * @hw: pointer to the hardware structure + * @buf: pointer to the package buffer + * @len: size of the package buffer + * + * This function initializes a package. The package contains HW tables + * required to do packet processing. First, the function extracts package + * information such as version. Then it finds the ice configuration segment + * within the package; this function then saves a copy of the segment pointer + * within the supplied package buffer. Next, the function will cache any hints + * from the package, followed by downloading the package itself. Note, that if + * a previous PF driver has already downloaded the package successfully, then + * the current driver will not have to download the package again. + * + * The local package contents will be used to query default behavior and to + * update specific sections of the HW's version of the package (e.g. to update + * the parse graph to understand new protocols). + * + * This function stores a pointer to the package buffer memory, and it is + * expected that the supplied buffer will not be freed immediately. If the + * package buffer needs to be freed, such as when read from a file, use + * ice_copy_and_init_pkg() instead of directly calling ice_init_pkg() in this + * case. 
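ice_get_prof_index_max() treats a profile as unused only when every field-vector word carries the invalid markers (prot_id 0xff, off 0x1ff) and records the highest index that is populated. A sketch of that scan over an illustrative fixed-width field vector (FVW below is an example value, not the driver's per-block width):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PROT_INVALID	0xff	/* matches the 0xff marker described above */
#define OFF_INVALID	0x1ff	/* matches the 0x1ff marker described above */
#define FVW		4	/* field-vector words per profile (example) */

struct fv_word {
	uint8_t prot_id;
	uint16_t off;
};

/* A profile is "used" if any of its field-vector words is populated. */
static bool profile_used(const struct fv_word *fv)
{
	uint16_t j;

	for (j = 0; j < FVW; j++)
		if (fv[j].prot_id != PROT_INVALID || fv[j].off != OFF_INVALID)
			return true;
	return false;
}

/* Return the highest profile index that is in use, or -1 if none are. */
static int max_used_profile(struct fv_word (*profiles)[FVW], int count)
{
	int i, max = -1;

	for (i = 0; i < count; i++)
		if (profile_used(profiles[i]))
			max = i;
	return max;
}

int main(void)
{
	struct fv_word profiles[3][FVW] = {
		{ { 0x10, 0x00 } },			/* used */
		{ { PROT_INVALID, OFF_INVALID },
		  { PROT_INVALID, OFF_INVALID },
		  { PROT_INVALID, OFF_INVALID },
		  { PROT_INVALID, OFF_INVALID } },	/* unused */
		{ { 0x20, 0x04 } },			/* used */
	};

	printf("max used profile index: %d\n", max_used_profile(profiles, 3));
	return 0;
}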
+ */ +enum ice_status ice_init_pkg(struct ice_hw *hw, u8 *buf, u32 len) +{ + struct ice_pkg_hdr *pkg; + enum ice_status status; + struct ice_seg *seg; + + if (!buf || !len) + return ICE_ERR_PARAM; + + pkg = (struct ice_pkg_hdr *)buf; + status = ice_verify_pkg(pkg, len); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "failed to verify pkg (err: %d)\n", + status); + return status; + } + + /* initialize package info */ + status = ice_init_pkg_info(hw, pkg); + if (status) + return status; + + /* before downloading the package, check package version for + * compatibility with driver + */ + status = ice_chk_pkg_compat(hw, pkg, &seg); + if (status) + return status; + + /* initialize package hints and then download package */ + ice_init_pkg_hints(hw, seg); + status = ice_download_pkg(hw, seg); + if (status == ICE_ERR_AQ_NO_WORK) { + ice_debug(hw, ICE_DBG_INIT, "package previously loaded - no work.\n"); + status = 0; + } + + /* Get information on the package currently loaded in HW, then make sure + * the driver is compatible with this version. + */ + if (!status) { + status = ice_get_pkg_info(hw); + if (!status) + status = ice_chk_pkg_version(&hw->active_pkg_ver); + } + + if (!status) { + hw->seg = seg; + /* on successful package download update other required + * registers to support the package and fill HW tables + * with package content. + */ + ice_init_pkg_regs(hw); + ice_fill_blk_tbls(hw); + ice_fill_hw_ptype(hw); + ice_get_prof_index_max(hw); + } else { + ice_debug(hw, ICE_DBG_INIT, "package load failed, %d\n", + status); + } + + return status; +} + +/** + * ice_copy_and_init_pkg - initialize/download a copy of the package + * @hw: pointer to the hardware structure + * @buf: pointer to the package buffer + * @len: size of the package buffer + * + * This function copies the package buffer, and then calls ice_init_pkg() to + * initialize the copied package contents. + * + * The copying is necessary if the package buffer supplied is constant, or if + * the memory may disappear shortly after calling this function. + * + * If the package buffer resides in the data segment and can be modified, the + * caller is free to use ice_init_pkg() instead of ice_copy_and_init_pkg(). + * + * However, if the package buffer needs to be copied first, such as when being + * read from a file, the caller should use ice_copy_and_init_pkg(). + * + * This function will first copy the package buffer, before calling + * ice_init_pkg(). The caller is free to immediately destroy the original + * package buffer, as the new copy will be managed by this function and + * related routines. + */ +enum ice_status ice_copy_and_init_pkg(struct ice_hw *hw, const u8 *buf, u32 len) +{ + enum ice_status status; + u8 *buf_copy; + + if (!buf || !len) + return ICE_ERR_PARAM; + + buf_copy = devm_kmemdup(ice_hw_to_dev(hw), buf, len, GFP_KERNEL); + + status = ice_init_pkg(hw, buf_copy, len); + if (status) { + /* Free the copy, since we failed to initialize the package */ + devm_kfree(ice_hw_to_dev(hw), buf_copy); + } else { + /* Track the copied pkg so we can free it later */ + hw->pkg_copy = buf_copy; + hw->pkg_size = len; + } + + return status; +} + +/** + * ice_pkg_buf_alloc + * @hw: pointer to the HW structure + * + * Allocates a package buffer and returns a pointer to the buffer header. + * Note: all package contents must be in Little Endian form. 
+ */ +static struct ice_buf_build *ice_pkg_buf_alloc(struct ice_hw *hw) +{ + struct ice_buf_build *bld; + struct ice_buf_hdr *buf; + + bld = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*bld), GFP_KERNEL); + if (!bld) + return NULL; + + buf = (struct ice_buf_hdr *)bld; + buf->data_end = cpu_to_le16(offsetof(struct ice_buf_hdr, + section_entry)); + return bld; +} + +/** + * ice_get_sw_prof_type - determine switch profile type + * @hw: pointer to the HW structure + * @fv: pointer to the switch field vector + */ +static enum ice_prof_type +ice_get_sw_prof_type(struct ice_hw *hw, struct ice_fv *fv) +{ + u16 i; + + for (i = 0; i < hw->blk[ICE_BLK_SW].es.fvw; i++) { + /* UDP tunnel will have UDP_OF protocol ID and VNI offset */ + if (fv->ew[i].prot_id == (u8)ICE_PROT_UDP_OF && + fv->ew[i].off == ICE_VNI_OFFSET) + return ICE_PROF_TUN_UDP; + + /* GRE tunnel will have GRE protocol */ + if (fv->ew[i].prot_id == (u8)ICE_PROT_GRE_OF) + return ICE_PROF_TUN_GRE; + } + + return ICE_PROF_NON_TUN; +} + +/** + * ice_get_sw_fv_bitmap - Get switch field vector bitmap based on profile type + * @hw: pointer to hardware structure + * @req_profs: type of profiles requested + * @bm: pointer to memory for returning the bitmap of field vectors + */ +void +ice_get_sw_fv_bitmap(struct ice_hw *hw, enum ice_prof_type req_profs, + unsigned long *bm) +{ + struct ice_pkg_enum state; + struct ice_seg *ice_seg; + struct ice_fv *fv; + + if (req_profs == ICE_PROF_ALL) { + bitmap_set(bm, 0, ICE_MAX_NUM_PROFILES); + return; + } + + memset(&state, 0, sizeof(state)); + bitmap_zero(bm, ICE_MAX_NUM_PROFILES); + ice_seg = hw->seg; + do { + enum ice_prof_type prof_type; + u32 offset; + + fv = ice_pkg_enum_entry(ice_seg, &state, ICE_SID_FLD_VEC_SW, + &offset, ice_sw_fv_handler); + ice_seg = NULL; + + if (fv) { + /* Determine field vector type */ + prof_type = ice_get_sw_prof_type(hw, fv); + + if (req_profs & prof_type) + set_bit((u16)offset, bm); + } + } while (fv); +} + +/** + * ice_get_sw_fv_list + * @hw: pointer to the HW structure + * @prot_ids: field vector to search for with a given protocol ID + * @ids_cnt: lookup/protocol count + * @bm: bitmap of field vectors to consider + * @fv_list: Head of a list + * + * Finds all the field vector entries from switch block that contain + * a given protocol ID and returns a list of structures of type + * "ice_sw_fv_list_entry". Every structure in the list has a field vector + * definition and profile ID information + * NOTE: The caller of the function is responsible for freeing the memory + * allocated for every list entry. + */ +enum ice_status +ice_get_sw_fv_list(struct ice_hw *hw, u8 *prot_ids, u16 ids_cnt, + unsigned long *bm, struct list_head *fv_list) +{ + struct ice_sw_fv_list_entry *fvl; + struct ice_sw_fv_list_entry *tmp; + struct ice_pkg_enum state; + struct ice_seg *ice_seg; + struct ice_fv *fv; + u32 offset; + + memset(&state, 0, sizeof(state)); + + if (!ids_cnt || !hw->seg) + return ICE_ERR_PARAM; + + ice_seg = hw->seg; + do { + u16 i; + + fv = ice_pkg_enum_entry(ice_seg, &state, ICE_SID_FLD_VEC_SW, + &offset, ice_sw_fv_handler); + if (!fv) + break; + ice_seg = NULL; + + /* If field vector is not in the bitmap list, then skip this + * profile. + */ + if (!test_bit((u16)offset, bm)) + continue; + + for (i = 0; i < ids_cnt; i++) { + int j; + + /* This code assumes that if a switch field vector line + * has a matching protocol, then this line will contain + * the entries necessary to represent every field in + * that protocol header. 
+ */ + for (j = 0; j < hw->blk[ICE_BLK_SW].es.fvw; j++) + if (fv->ew[j].prot_id == prot_ids[i]) + break; + if (j >= hw->blk[ICE_BLK_SW].es.fvw) + break; + if (i + 1 == ids_cnt) { + fvl = devm_kzalloc(ice_hw_to_dev(hw), + sizeof(*fvl), GFP_KERNEL); + if (!fvl) + goto err; + fvl->fv_ptr = fv; + fvl->profile_id = offset; + list_add(&fvl->list_entry, fv_list); + break; + } + } + } while (fv); + if (list_empty(fv_list)) + return ICE_ERR_CFG; + return 0; + +err: + list_for_each_entry_safe(fvl, tmp, fv_list, list_entry) { + list_del(&fvl->list_entry); + devm_kfree(ice_hw_to_dev(hw), fvl); + } + + return ICE_ERR_NO_MEMORY; +} + +/** + * ice_init_prof_result_bm - Initialize the profile result index bitmap + * @hw: pointer to hardware structure + */ +void ice_init_prof_result_bm(struct ice_hw *hw) +{ + struct ice_pkg_enum state; + struct ice_seg *ice_seg; + struct ice_fv *fv; + + memset(&state, 0, sizeof(state)); + + if (!hw->seg) + return; + + ice_seg = hw->seg; + do { + u32 off; + u16 i; + + fv = ice_pkg_enum_entry(ice_seg, &state, ICE_SID_FLD_VEC_SW, + &off, ice_sw_fv_handler); + ice_seg = NULL; + if (!fv) + break; + + bitmap_zero(hw->switch_info->prof_res_bm[off], + ICE_MAX_FV_WORDS); + + /* Determine empty field vector indices, these can be + * used for recipe results. Skip index 0, since it is + * always used for Switch ID. + */ + for (i = 1; i < ICE_MAX_FV_WORDS; i++) + if (fv->ew[i].prot_id == ICE_PROT_INVALID && + fv->ew[i].off == ICE_FV_OFFSET_INVAL) + set_bit(i, hw->switch_info->prof_res_bm[off]); + } while (fv); +} + +/** + * ice_pkg_buf_free + * @hw: pointer to the HW structure + * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc()) + * + * Frees a package buffer + */ +void ice_pkg_buf_free(struct ice_hw *hw, struct ice_buf_build *bld) +{ + devm_kfree(ice_hw_to_dev(hw), bld); +} + +/** + * ice_pkg_buf_reserve_section + * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc()) + * @count: the number of sections to reserve + * + * Reserves one or more section table entries in a package buffer. This routine + * can be called multiple times as long as they are made before calling + * ice_pkg_buf_alloc_section(). Once ice_pkg_buf_alloc_section() + * is called once, the number of sections that can be allocated will not be able + * to be increased; not using all reserved sections is fine, but this will + * result in some wasted space in the buffer. + * Note: all package contents must be in Little Endian form. + */ +static enum ice_status +ice_pkg_buf_reserve_section(struct ice_buf_build *bld, u16 count) +{ + struct ice_buf_hdr *buf; + u16 section_count; + u16 data_end; + + if (!bld) + return ICE_ERR_PARAM; + + buf = (struct ice_buf_hdr *)&bld->buf; + + /* already an active section, can't increase table size */ + section_count = le16_to_cpu(buf->section_count); + if (section_count > 0) + return ICE_ERR_CFG; + + if (bld->reserved_section_table_entries + count > ICE_MAX_S_COUNT) + return ICE_ERR_CFG; + bld->reserved_section_table_entries += count; + + data_end = le16_to_cpu(buf->data_end) + + flex_array_size(buf, section_entry, count); + buf->data_end = cpu_to_le16(data_end); + + return 0; +} + +/** + * ice_pkg_buf_alloc_section + * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc()) + * @type: the section type value + * @size: the size of the section to reserve (in bytes) + * + * Reserves memory in the buffer for a section's content and updates the + * buffers' status accordingly. 
This routine returns a pointer to the first + * byte of the section start within the buffer, which is used to fill in the + * section contents. + * Note: all package contents must be in Little Endian form. + */ +static void * +ice_pkg_buf_alloc_section(struct ice_buf_build *bld, u32 type, u16 size) +{ + struct ice_buf_hdr *buf; + u16 sect_count; + u16 data_end; + + if (!bld || !type || !size) + return NULL; + + buf = (struct ice_buf_hdr *)&bld->buf; + + /* check for enough space left in buffer */ + data_end = le16_to_cpu(buf->data_end); + + /* section start must align on 4 byte boundary */ + data_end = ALIGN(data_end, 4); + + if ((data_end + size) > ICE_MAX_S_DATA_END) + return NULL; + + /* check for more available section table entries */ + sect_count = le16_to_cpu(buf->section_count); + if (sect_count < bld->reserved_section_table_entries) { + void *section_ptr = ((u8 *)buf) + data_end; + + buf->section_entry[sect_count].offset = cpu_to_le16(data_end); + buf->section_entry[sect_count].size = cpu_to_le16(size); + buf->section_entry[sect_count].type = cpu_to_le32(type); + + data_end += size; + buf->data_end = cpu_to_le16(data_end); + + buf->section_count = cpu_to_le16(sect_count + 1); + return section_ptr; + } + + /* no free section table entries */ + return NULL; +} + +/** + * ice_pkg_buf_alloc_single_section + * @hw: pointer to the HW structure + * @type: the section type value + * @size: the size of the section to reserve (in bytes) + * @section: returns pointer to the section + * + * Allocates a package buffer with a single section. + * Note: all package contents must be in Little Endian form. + */ +struct ice_buf_build * +ice_pkg_buf_alloc_single_section(struct ice_hw *hw, u32 type, u16 size, + void **section) +{ + struct ice_buf_build *buf; + + if (!section) + return NULL; + + buf = ice_pkg_buf_alloc(hw); + if (!buf) + return NULL; + + if (ice_pkg_buf_reserve_section(buf, 1)) + goto ice_pkg_buf_alloc_single_section_err; + + *section = ice_pkg_buf_alloc_section(buf, type, size); + if (!*section) + goto ice_pkg_buf_alloc_single_section_err; + + return buf; + +ice_pkg_buf_alloc_single_section_err: + ice_pkg_buf_free(hw, buf); + return NULL; +} + +/** + * ice_pkg_buf_unreserve_section + * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc()) + * @count: the number of sections to unreserve + * + * Unreserves one or more section table entries in a package buffer, releasing + * space that can be used for section data. This routine can be called + * multiple times as long as they are made before calling + * ice_pkg_buf_alloc_section(). Once ice_pkg_buf_alloc_section() + * is called once, the number of sections that can be allocated will not be able + * to be increased; not using all reserved sections is fine, but this will + * result in some wasted space in the buffer. + * Note: all package contents must be in Little Endian form. 
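The build-buffer helpers keep one running data_end offset: reserving table entries moves it past the entry table, and each allocated section is 4-byte aligned, recorded in the next free table entry, and appended at data_end. A flat userspace sketch of that bookkeeping (BUF_SIZE, MAX_DATA_END and MAX_ENTRIES are illustrative limits, not the driver's constants):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BUF_SIZE	4096		/* stands in for ICE_PKG_BUF_SIZE */
#define MAX_DATA_END	(BUF_SIZE - 8)	/* illustrative end-of-data limit */
#define MAX_ENTRIES	32		/* illustrative section table limit */

struct section_entry {
	uint32_t type;
	uint16_t offset;		/* byte offset from the buffer start */
	uint16_t size;
};

/* Section entry table at the front of raw[], section data behind it. */
struct buf_build {
	uint16_t section_count;
	uint16_t reserved_entries;
	uint16_t data_end;		/* next free byte in raw[] */
	uint16_t pad;			/* keep raw[] 4-byte aligned */
	uint32_t raw[BUF_SIZE / 4];
};

static void buf_init(struct buf_build *bld)
{
	memset(bld, 0, sizeof(*bld));	/* data_end grows as entries are reserved */
}

/* Reserve table entries; only allowed before any section has been added. */
static int buf_reserve(struct buf_build *bld, uint16_t count)
{
	if (bld->section_count || bld->reserved_entries + count > MAX_ENTRIES)
		return -1;
	bld->reserved_entries += count;
	bld->data_end += count * sizeof(struct section_entry);
	return 0;
}

/* Carve 'size' bytes for a section, 4-byte aligned, and record its entry. */
static void *buf_alloc_section(struct buf_build *bld, uint32_t type, uint16_t size)
{
	struct section_entry *entry = (struct section_entry *)bld->raw;
	uint16_t data_end = (bld->data_end + 3) & ~3;	/* 4-byte alignment */
	uint16_t idx = bld->section_count;

	if (!size || data_end + size > MAX_DATA_END)
		return NULL;
	if (idx >= bld->reserved_entries)	/* no free table entries left */
		return NULL;

	entry[idx].type = type;
	entry[idx].offset = data_end;
	entry[idx].size = size;
	bld->data_end = data_end + size;
	bld->section_count = idx + 1;

	return (uint8_t *)bld->raw + data_end;
}

int main(void)
{
	struct buf_build bld;
	void *sect;

	buf_init(&bld);
	if (buf_reserve(&bld, 2))
		return 1;
	sect = buf_alloc_section(&bld, 0x42, 100);
	printf("section data starts at offset %u\n", bld.data_end - 100);
	return sect ? 0 : 1;
}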
+ */ +enum ice_status +ice_pkg_buf_unreserve_section(struct ice_buf_build *bld, u16 count) +{ + struct ice_buf_hdr *buf; + u16 section_count; + u16 data_end; + + if (!bld) + return ICE_ERR_PARAM; + + buf = (struct ice_buf_hdr *)&bld->buf; + + /* already an active section, can't decrease table size */ + section_count = le16_to_cpu(buf->section_count); + if (section_count > 0) + return ICE_ERR_CFG; + + if (count > bld->reserved_section_table_entries) + return ICE_ERR_CFG; + bld->reserved_section_table_entries -= count; + + data_end = le16_to_cpu(buf->data_end) - + flex_array_size(buf, section_entry, count); + buf->data_end = cpu_to_le16(data_end); + + return 0; +} + +/** + * ice_pkg_buf_get_free_space + * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc()) + * + * Returns the number of free bytes remaining in the buffer. + * Note: all package contents must be in Little Endian form. + */ +u16 ice_pkg_buf_get_free_space(struct ice_buf_build *bld) +{ + struct ice_buf_hdr *buf; + + if (!bld) + return 0; + + buf = (struct ice_buf_hdr *)&bld->buf; + return ICE_MAX_S_DATA_END - le16_to_cpu(buf->data_end); +} + +/** + * ice_pkg_buf_get_active_sections + * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc()) + * + * Returns the number of active sections. Before using the package buffer + * in an update package command, the caller should make sure that there is at + * least one active section - otherwise, the buffer is not legal and should + * not be used. + * Note: all package contents must be in Little Endian form. + */ +static u16 ice_pkg_buf_get_active_sections(struct ice_buf_build *bld) +{ + struct ice_buf_hdr *buf; + + if (!bld) + return 0; + + buf = (struct ice_buf_hdr *)&bld->buf; + return le16_to_cpu(buf->section_count); +} + +/** + * ice_pkg_buf + * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc()) + * + * Return a pointer to the buffer's header + */ +struct ice_buf *ice_pkg_buf(struct ice_buf_build *bld) +{ + if (!bld) + return NULL; + + return &bld->buf; +} + +/** + * ice_tunnel_port_in_use_hlpr - helper function to determine tunnel usage + * @hw: pointer to the HW structure + * @port: port to search for + * @index: optionally returns index + * + * Returns whether a port is already in use as a tunnel, and optionally its + * index + */ +static bool ice_tunnel_port_in_use_hlpr(struct ice_hw *hw, u16 port, u16 *index) +{ + u16 i; + + for (i = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++) + if (hw->tnl.tbl[i].in_use && hw->tnl.tbl[i].port == port) { + if (index) + *index = i; + return true; + } + + return false; +} + +/** + * ice_tunnel_port_in_use + * @hw: pointer to the HW structure + * @port: port to search for + * @index: optionally returns index + * + * Returns whether a port is already in use as a tunnel, and optionally its + * index + */ +bool ice_tunnel_port_in_use(struct ice_hw *hw, u16 port, u16 *index) +{ + bool res; + + mutex_lock(&hw->tnl_lock); + res = ice_tunnel_port_in_use_hlpr(hw, port, index); + mutex_unlock(&hw->tnl_lock); + + return res; +} + +/** + * ice_tunnel_get_type + * @hw: pointer to the HW structure + * @port: port to search for + * @type: returns tunnel index + * + * For a given port number, will return the type of tunnel. 
+ */ +bool +ice_tunnel_get_type(struct ice_hw *hw, u16 port, enum ice_tunnel_type *type) +{ + bool res = false; + u16 i; + + mutex_lock(&hw->tnl_lock); + + for (i = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++) + if (hw->tnl.tbl[i].in_use && hw->tnl.tbl[i].port == port) { + *type = hw->tnl.tbl[i].type; + res = true; + break; + } + + mutex_unlock(&hw->tnl_lock); + + return res; +} + +/** + * ice_find_free_tunnel_entry + * @hw: pointer to the HW structure + * @type: tunnel type + * @index: optionally returns index + * + * Returns whether there is a free tunnel entry, and optionally its index + */ +static bool +ice_find_free_tunnel_entry(struct ice_hw *hw, enum ice_tunnel_type type, + u16 *index) +{ + u16 i; + + for (i = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++) + if (hw->tnl.tbl[i].valid && !hw->tnl.tbl[i].in_use && + hw->tnl.tbl[i].type == type) { + if (index) + *index = i; + return true; + } + + return false; +} + +/** + * ice_get_open_tunnel_port - retrieve an open tunnel port + * @hw: pointer to the HW structure + * @type: tunnel type (TNL_ALL will return any open port) + * @port: returns open port + */ +bool +ice_get_open_tunnel_port(struct ice_hw *hw, enum ice_tunnel_type type, + u16 *port) +{ + bool res = false; + u16 i; + + mutex_lock(&hw->tnl_lock); + + for (i = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++) + if (hw->tnl.tbl[i].valid && hw->tnl.tbl[i].in_use && + (type == TNL_ALL || hw->tnl.tbl[i].type == type)) { + *port = hw->tnl.tbl[i].port; + res = true; + break; + } + + mutex_unlock(&hw->tnl_lock); + + return res; +} + +/** + * ice_is_create_tunnel_possible + * @hw: pointer to the HW structure + * @type: type of tunnel + * @port: port of tunnel to create + * + * Function returns ICE_SUCCESS if a tunnel can be created using specified + * tunnel type and port. If the tunnel is already present in hardware then + * ICE_ERR_ALREADY_EXISTS is returned, or if there's no space, then + * ICE_ERR_OUT_OF_RANGE. 
+ */ +enum ice_status +ice_is_create_tunnel_possible(struct ice_hw *hw, enum ice_tunnel_type type, + u16 port) +{ + u16 index; + + if (ice_tunnel_port_in_use_hlpr(hw, port, &index)) + return ICE_ERR_ALREADY_EXISTS; + + if (!ice_find_free_tunnel_entry(hw, type, &index)) + return ICE_ERR_OUT_OF_RANGE; + + return 0; +} + +/** + * ice_is_tunnel_empty - check if udp tunnel is empty + * @hw: pointer to the HW structure + */ +bool ice_is_tunnel_empty(struct ice_hw *hw) +{ + int i; + + for (i = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++) + if (hw->tnl.tbl[i].valid && hw->tnl.tbl[i].in_use) + return false; + return true; +} + +/** + * ice_upd_dvm_boost_entry + * @hw: pointer to the HW structure + * @entry: pointer to double vlan boost entry info + */ +static enum ice_status +ice_upd_dvm_boost_entry(struct ice_hw *hw, struct ice_dvm_entry *entry) +{ + struct ice_boost_tcam_section *sect_rx, *sect_tx; + enum ice_status status = ICE_ERR_MAX_LIMIT; + struct ice_buf_build *bld; + u8 val, dc, nm; + + bld = ice_pkg_buf_alloc(hw); + if (!bld) + return ICE_ERR_NO_MEMORY; + + /* allocate 2 sections, one for Rx parser, one for Tx parser */ + if (ice_pkg_buf_reserve_section(bld, 2)) + goto ice_upd_dvm_boost_entry_err; + + sect_rx = ice_pkg_buf_alloc_section(bld, ICE_SID_RXPARSER_BOOST_TCAM, + struct_size(sect_rx, tcam, 1)); + if (!sect_rx) + goto ice_upd_dvm_boost_entry_err; + sect_rx->count = cpu_to_le16(1); + + sect_tx = ice_pkg_buf_alloc_section(bld, ICE_SID_TXPARSER_BOOST_TCAM, + struct_size(sect_tx, tcam, 1)); + if (!sect_tx) + goto ice_upd_dvm_boost_entry_err; + sect_tx->count = cpu_to_le16(1); + + /* copy original boost entry to update package buffer */ + memcpy(sect_rx->tcam, entry->boost_entry, sizeof(*sect_rx->tcam)); + + /* re-write the don't care and never match bits accordingly */ + if (entry->enable) { + /* all bits are don't care */ + val = 0x00; + dc = 0xFF; + nm = 0x00; + } else { + /* disable, one never match bit, the rest are don't care */ + val = 0x00; + dc = 0xF7; + nm = 0x08; + } + + ice_set_key((u8 *)§_rx->tcam[0].key, sizeof(sect_rx->tcam[0].key), + &val, NULL, &dc, &nm, 0, sizeof(u8)); + + /* exact copy of entry to Tx section entry */ + memcpy(sect_tx->tcam, sect_rx->tcam, sizeof(*sect_tx->tcam)); + + status = ice_update_pkg_no_lock(hw, ice_pkg_buf(bld), 1); + +ice_upd_dvm_boost_entry_err: + ice_pkg_buf_free(hw, bld); + + return status; +} + +/** + * ice_set_dvm_boost_entries + * @hw: pointer to the HW structure + * + * Enable double vlan by updating the appropriate boost tcam entries. + */ +enum ice_status ice_set_dvm_boost_entries(struct ice_hw *hw) +{ + enum ice_status status; + u16 i; + + for (i = 0; i < hw->dvm_upd.count; i++) { + status = ice_upd_dvm_boost_entry(hw, &hw->dvm_upd.tbl[i]); + if (status) + return status; + } + + return 0; +} + +/** + * ice_create_tunnel + * @hw: pointer to the HW structure + * @type: type of tunnel + * @port: port of tunnel to create + * + * Create a tunnel by updating the parse graph in the parser. We do that by + * creating a package buffer with the tunnel info and issuing an update package + * command. 
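+ *
+ * A minimal usage sketch (illustrative; TNL_VXLAN and the IANA VXLAN port
+ * 4789 are example inputs, not requirements):
+ *
+ *	status = ice_create_tunnel(hw, TNL_VXLAN, 4789);
+ *
+ * On success the matching hw->tnl.tbl[] entry is marked in_use with a
+ * reference count of 1; creating the same port again only increments that
+ * reference count.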
+ */ +enum ice_status +ice_create_tunnel(struct ice_hw *hw, enum ice_tunnel_type type, u16 port) +{ + struct ice_boost_tcam_section *sect_rx, *sect_tx; + enum ice_status status = ICE_ERR_MAX_LIMIT; + struct ice_buf_build *bld; + u16 index; + + mutex_lock(&hw->tnl_lock); + + if (ice_tunnel_port_in_use_hlpr(hw, port, &index)) { + hw->tnl.tbl[index].ref++; + status = 0; + goto ice_create_tunnel_end; + } + + if (!ice_find_free_tunnel_entry(hw, type, &index)) { + status = ICE_ERR_OUT_OF_RANGE; + goto ice_create_tunnel_end; + } + + bld = ice_pkg_buf_alloc(hw); + if (!bld) { + status = ICE_ERR_NO_MEMORY; + goto ice_create_tunnel_end; + } + + /* allocate 2 sections, one for Rx parser, one for Tx parser */ + if (ice_pkg_buf_reserve_section(bld, 2)) + goto ice_create_tunnel_err; + + sect_rx = ice_pkg_buf_alloc_section(bld, ICE_SID_RXPARSER_BOOST_TCAM, + struct_size(sect_rx, tcam, 1)); + if (!sect_rx) + goto ice_create_tunnel_err; + sect_rx->count = cpu_to_le16(1); + + sect_tx = ice_pkg_buf_alloc_section(bld, ICE_SID_TXPARSER_BOOST_TCAM, + struct_size(sect_tx, tcam, 1)); + if (!sect_tx) + goto ice_create_tunnel_err; + sect_tx->count = cpu_to_le16(1); + + /* copy original boost entry to update package buffer */ + memcpy(sect_rx->tcam, hw->tnl.tbl[index].boost_entry, + sizeof(*sect_rx->tcam)); + + /* over-write the never-match dest port key bits with the encoded port + * bits + */ + ice_set_key((u8 *)§_rx->tcam[0].key, sizeof(sect_rx->tcam[0].key), + (u8 *)&port, NULL, NULL, NULL, + (u16)offsetof(struct ice_boost_key_value, hv_dst_port_key), + sizeof(sect_rx->tcam[0].key.key.hv_dst_port_key)); + + /* exact copy of entry to Tx section entry */ + memcpy(sect_tx->tcam, sect_rx->tcam, sizeof(*sect_tx->tcam)); + + status = ice_update_pkg(hw, ice_pkg_buf(bld), 1); + if (!status) { + hw->tnl.tbl[index].port = port; + hw->tnl.tbl[index].in_use = true; + hw->tnl.tbl[index].ref = 1; + } + +ice_create_tunnel_err: + ice_pkg_buf_free(hw, bld); + +ice_create_tunnel_end: + mutex_unlock(&hw->tnl_lock); + + return status; +} + +/** + * ice_destroy_tunnel + * @hw: pointer to the HW structure + * @port: port of tunnel to destroy (ignored if the all parameter is true) + * @all: flag that states to destroy all tunnels + * + * Destroys a tunnel or all tunnels by creating an update package buffer + * targeting the specific updates requested and then performing an update + * package. 
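+ *
+ * Teardown mirrors creation (illustrative; 4789 is just the port used in the
+ * sketch above):
+ *
+ *	status = ice_destroy_tunnel(hw, 4789, false);
+ *
+ * If the entry's reference count is still greater than one, only the count is
+ * decremented; the parser entry itself is removed once the last user is gone
+ * or when @all is true.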
+ */ +enum ice_status ice_destroy_tunnel(struct ice_hw *hw, u16 port, bool all) +{ + struct ice_boost_tcam_section *sect_rx, *sect_tx; + enum ice_status status = ICE_ERR_MAX_LIMIT; + struct ice_buf_build *bld; + u16 count = 0; + u16 index; + u16 size; + u16 i, j; + + mutex_lock(&hw->tnl_lock); + + if (!all && ice_tunnel_port_in_use_hlpr(hw, port, &index)) + if (hw->tnl.tbl[index].ref > 1) { + hw->tnl.tbl[index].ref--; + status = 0; + goto ice_destroy_tunnel_end; + } + + /* determine count */ + for (i = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++) + if (hw->tnl.tbl[i].valid && hw->tnl.tbl[i].in_use && + (all || hw->tnl.tbl[i].port == port)) + count++; + + if (!count) { + status = ICE_ERR_PARAM; + goto ice_destroy_tunnel_end; + } + + /* size of section - there is at least one entry */ + size = struct_size(sect_rx, tcam, count); + + bld = ice_pkg_buf_alloc(hw); + if (!bld) { + status = ICE_ERR_NO_MEMORY; + goto ice_destroy_tunnel_end; + } + + /* allocate 2 sections, one for Rx parser, one for Tx parser */ + if (ice_pkg_buf_reserve_section(bld, 2)) + goto ice_destroy_tunnel_err; + + sect_rx = ice_pkg_buf_alloc_section(bld, ICE_SID_RXPARSER_BOOST_TCAM, + size); + if (!sect_rx) + goto ice_destroy_tunnel_err; + sect_rx->count = cpu_to_le16(count); + + sect_tx = ice_pkg_buf_alloc_section(bld, ICE_SID_TXPARSER_BOOST_TCAM, + size); + if (!sect_tx) + goto ice_destroy_tunnel_err; + sect_tx->count = cpu_to_le16(count); + + /* copy original boost entry to update package buffer, one copy to Rx + * section, another copy to the Tx section + */ + for (i = 0, j = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++) + if (hw->tnl.tbl[i].valid && hw->tnl.tbl[i].in_use && + (all || hw->tnl.tbl[i].port == port)) { + memcpy(sect_rx->tcam + j, hw->tnl.tbl[i].boost_entry, + sizeof(*sect_rx->tcam)); + memcpy(sect_tx->tcam + j, hw->tnl.tbl[i].boost_entry, + sizeof(*sect_tx->tcam)); + hw->tnl.tbl[i].marked = true; + j++; + } + + status = ice_update_pkg(hw, ice_pkg_buf(bld), 1); + if (!status) + for (i = 0; i < hw->tnl.count && + i < ICE_TUNNEL_MAX_ENTRIES; i++) + if (hw->tnl.tbl[i].marked) { + hw->tnl.tbl[i].ref = 0; + hw->tnl.tbl[i].port = 0; + hw->tnl.tbl[i].in_use = false; + hw->tnl.tbl[i].marked = false; + } + +ice_destroy_tunnel_err: + ice_pkg_buf_free(hw, bld); + +ice_destroy_tunnel_end: + mutex_unlock(&hw->tnl_lock); + + return status; +} + +/** + * ice_replay_tunnels + * @hw: pointer to the HW structure + * + * Replays all tunnels + */ +enum ice_status ice_replay_tunnels(struct ice_hw *hw) +{ + enum ice_status status = 0; + u16 i; + + for (i = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++) { + enum ice_tunnel_type type = hw->tnl.tbl[i].type; + u16 refs = hw->tnl.tbl[i].ref; + u16 port = hw->tnl.tbl[i].port; + + if (!hw->tnl.tbl[i].in_use) + continue; + + /* Replay tunnels one at a time by destroying them, then + * recreating them + */ + hw->tnl.tbl[i].ref = 1; /* make sure to destroy in one call */ + status = ice_destroy_tunnel(hw, port, false); + if (status) { + ice_debug(hw, ICE_DBG_PKG, "ERR: 0x%x - destroy tunnel port 0x%x\n", + status, port); + break; + } + + status = ice_create_tunnel(hw, type, port); + if (status) { + ice_debug(hw, ICE_DBG_PKG, "ERR: 0x%x - create tunnel port 0x%x\n", + status, port); + break; + } + + /* reset to original ref count */ + hw->tnl.tbl[i].ref = refs; + } + + return status; +} + +/** + * ice_find_prot_off - find prot ID and offset pair, based on prof and FV index + * @hw: pointer to the hardware structure + * @blk: hardware block + * @prof: profile 
ID + * @fv_idx: field vector word index + * @prot: variable to receive the protocol ID + * @off: variable to receive the protocol offset + */ +enum ice_status +ice_find_prot_off(struct ice_hw *hw, enum ice_block blk, u8 prof, u16 fv_idx, + u8 *prot, u16 *off) +{ + struct ice_fv_word *fv_ext; + + if (prof >= hw->blk[blk].es.count) + return ICE_ERR_PARAM; + + if (fv_idx >= hw->blk[blk].es.fvw) + return ICE_ERR_PARAM; + + fv_ext = hw->blk[blk].es.t + (prof * hw->blk[blk].es.fvw); + + *prot = fv_ext[fv_idx].prot_id; + *off = fv_ext[fv_idx].off; + + return 0; +} + +/* PTG Management */ + +/** + * ice_ptg_update_xlt1 - Updates packet type groups in HW via XLT1 table + * @hw: pointer to the hardware structure + * @blk: HW block + * + * This function will update the XLT1 hardware table to reflect the new + * packet type group configuration. + */ +enum ice_status ice_ptg_update_xlt1(struct ice_hw *hw, enum ice_block blk) +{ + struct ice_xlt1_section *sect; + struct ice_buf_build *bld; + enum ice_status status; + u16 index; + + bld = ice_pkg_buf_alloc_single_section(hw, ice_sect_id(blk, ICE_XLT1), + struct_size(sect, value, ICE_XLT1_CNT), + (void **)§); + if (!bld) + return ICE_ERR_NO_MEMORY; + + sect->count = cpu_to_le16(ICE_XLT1_CNT); + sect->offset = cpu_to_le16(0); + for (index = 0; index < ICE_XLT1_CNT; index++) + sect->value[index] = hw->blk[blk].xlt1.ptypes[index].ptg; + + status = ice_update_pkg(hw, ice_pkg_buf(bld), 1); + + ice_pkg_buf_free(hw, bld); + + return status; +} + +/** + * ice_ptg_find_ptype - Search for packet type group using packet type (ptype) + * @hw: pointer to the hardware structure + * @blk: HW block + * @ptype: the ptype to search for + * @ptg: pointer to variable that receives the PTG + * + * This function will search the PTGs for a particular ptype, returning the + * PTG ID that contains it through the PTG parameter, with the value of + * ICE_DEFAULT_PTG (0) meaning it is part the default PTG. + */ +static enum ice_status +ice_ptg_find_ptype(struct ice_hw *hw, enum ice_block blk, u16 ptype, u8 *ptg) +{ + if (ptype >= ICE_XLT1_CNT || !ptg) + return ICE_ERR_PARAM; + + *ptg = hw->blk[blk].xlt1.ptypes[ptype].ptg; + return 0; +} + +/** + * ice_ptg_alloc_val - Allocates a new packet type group ID by value + * @hw: pointer to the hardware structure + * @blk: HW block + * @ptg: the PTG to allocate + * + * This function allocates a given packet type group ID specified by the PTG + * parameter. + */ +static void ice_ptg_alloc_val(struct ice_hw *hw, enum ice_block blk, u8 ptg) +{ + hw->blk[blk].xlt1.ptg_tbl[ptg].in_use = true; +} + +/** + * ice_ptg_free - Frees a packet type group + * @hw: pointer to the hardware structure + * @blk: HW block + * @ptg: the PTG ID to free + * + * This function frees a packet type group, and returns all the current ptypes + * within it to the default PTG. 
+ */
+void ice_ptg_free(struct ice_hw *hw, enum ice_block blk, u8 ptg)
+{
+	struct ice_ptg_ptype *p, *temp;
+
+	hw->blk[blk].xlt1.ptg_tbl[ptg].in_use = false;
+	p = hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype;
+	while (p) {
+		p->ptg = ICE_DEFAULT_PTG;
+		temp = p->next_ptype;
+		p->next_ptype = NULL;
+		p = temp;
+	}
+
+	hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype = NULL;
+}
+
+/**
+ * ice_ptg_remove_ptype - Removes ptype from a particular packet type group
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ * @ptype: the ptype to remove
+ * @ptg: the PTG to remove the ptype from
+ *
+ * This function will remove the ptype from the specific PTG, and move it to
+ * the default PTG (ICE_DEFAULT_PTG).
+ */
+static enum ice_status
+ice_ptg_remove_ptype(struct ice_hw *hw, enum ice_block blk, u16 ptype, u8 ptg)
+{
+	struct ice_ptg_ptype **ch;
+	struct ice_ptg_ptype *p;
+
+	if (ptype > ICE_XLT1_CNT - 1)
+		return ICE_ERR_PARAM;
+
+	if (!hw->blk[blk].xlt1.ptg_tbl[ptg].in_use)
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	/* Should not happen if .in_use is set, bad config */
+	if (!hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype)
+		return ICE_ERR_CFG;
+
+	/* find the ptype within this PTG, and bypass the link over it */
+	p = hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype;
+	ch = &hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype;
+	while (p) {
+		if (ptype == (p - hw->blk[blk].xlt1.ptypes)) {
+			*ch = p->next_ptype;
+			break;
+		}
+
+		ch = &p->next_ptype;
+		p = p->next_ptype;
+	}
+
+	hw->blk[blk].xlt1.ptypes[ptype].ptg = ICE_DEFAULT_PTG;
+	hw->blk[blk].xlt1.ptypes[ptype].next_ptype = NULL;
+
+	return 0;
+}
+
+/**
+ * ice_ptg_add_mv_ptype - Adds/moves ptype to a particular packet type group
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ * @ptype: the ptype to add or move
+ * @ptg: the PTG to add or move the ptype to
+ *
+ * This function will either add or move a ptype to a particular PTG depending
+ * on whether the ptype is already part of another group. Note that using a
+ * destination PTG ID of ICE_DEFAULT_PTG (0) will move the ptype to the
+ * default PTG.
+ */
+static enum ice_status
+ice_ptg_add_mv_ptype(struct ice_hw *hw, enum ice_block blk, u16 ptype, u8 ptg)
+{
+	enum ice_status status;
+	u8 original_ptg;
+
+	if (ptype > ICE_XLT1_CNT - 1)
+		return ICE_ERR_PARAM;
+
+	if (!hw->blk[blk].xlt1.ptg_tbl[ptg].in_use && ptg != ICE_DEFAULT_PTG)
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	status = ice_ptg_find_ptype(hw, blk, ptype, &original_ptg);
+	if (status)
+		return status;
+
+	/* Is ptype already in the correct PTG? */
+	if (original_ptg == ptg)
+		return 0;
+
+	/* Remove from original PTG and move back to the default PTG */
+	if (original_ptg != ICE_DEFAULT_PTG)
+		ice_ptg_remove_ptype(hw, blk, ptype, original_ptg);
+
+	/* Moving to default PTG?
Then we're done with this request */ + if (ptg == ICE_DEFAULT_PTG) + return 0; + + /* Add ptype to PTG at beginning of list */ + hw->blk[blk].xlt1.ptypes[ptype].next_ptype = + hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype; + hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype = + &hw->blk[blk].xlt1.ptypes[ptype]; + + hw->blk[blk].xlt1.ptypes[ptype].ptg = ptg; + hw->blk[blk].xlt1.t[ptype] = ptg; + + return 0; +} + +/* Block / table size info */ +struct ice_blk_size_details { + u16 xlt1; /* # XLT1 entries */ + u16 xlt2; /* # XLT2 entries */ + u16 prof_tcam; /* # profile ID TCAM entries */ + u16 prof_id; /* # profile IDs */ + u8 prof_cdid_bits; /* # CDID one-hot bits used in key */ + u16 prof_redir; /* # profile redirection entries */ + u16 es; /* # extraction sequence entries */ + u16 fvw; /* # field vector words */ + u8 overwrite; /* overwrite existing entries allowed */ + u8 reverse; /* reverse FV order */ +}; + +static const struct ice_blk_size_details blk_sizes[ICE_BLK_COUNT] = { + /** + * Table Definitions + * XLT1 - Number of entries in XLT1 table + * XLT2 - Number of entries in XLT2 table + * TCAM - Number of entries Profile ID TCAM table + * CDID - Control Domain ID of the hardware block + * PRED - Number of entries in the Profile Redirection Table + * FV - Number of entries in the Field Vector + * FVW - Width (in WORDs) of the Field Vector + * OVR - Overwrite existing table entries + * REV - Reverse FV + */ + /* XLT1 , XLT2 ,TCAM, PID,CDID,PRED, FV, FVW */ + /* Overwrite , Reverse FV */ + /* SW */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 512, 256, 0, 256, 256, 48, + false, false }, + /* ACL */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 512, 128, 0, 128, 128, 32, + false, false }, + /* FD */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 512, 128, 0, 128, 128, 24, + false, true }, + /* RSS */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 512, 128, 0, 128, 128, 24, + true, true }, + /* PE */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 64, 32, 0, 32, 32, 24, + false, false }, +}; + +enum ice_sid_all { + ICE_SID_XLT1_OFF = 0, + ICE_SID_XLT2_OFF, + ICE_SID_PR_OFF, + ICE_SID_PR_REDIR_OFF, + ICE_SID_ES_OFF, + ICE_SID_OFF_COUNT, +}; + +/* Characteristic handling */ + +/** + * ice_match_prop_lst - determine if properties of two lists match + * @list1: first properties list + * @list2: second properties list + * + * Count, cookies and the order must match in order to be considered equivalent. + */ +static bool +ice_match_prop_lst(struct list_head *list1, struct list_head *list2) +{ + struct ice_vsig_prof *tmp1; + struct ice_vsig_prof *tmp2; + u16 chk_count = 0; + u16 count = 0; + + /* compare counts */ + list_for_each_entry(tmp1, list1, list) + count++; + list_for_each_entry(tmp2, list2, list) + chk_count++; + /* cppcheck-suppress knownConditionTrueFalse */ + if (!count || count != chk_count) + return false; + + tmp1 = list_first_entry(list1, struct ice_vsig_prof, list); + tmp2 = list_first_entry(list2, struct ice_vsig_prof, list); + + /* profile cookies must compare, and in the exact same order to take + * into account priority + */ + while (count--) { + if (tmp2->profile_cookie != tmp1->profile_cookie) + return false; + + tmp1 = list_next_entry(tmp1, list); + tmp2 = list_next_entry(tmp2, list); + } + + return true; +} + +/* VSIG Management */ + +/** + * ice_vsig_update_xlt2_sect - update one section of XLT2 table + * @hw: pointer to the hardware structure + * @blk: HW block + * @vsi: HW VSI number to program + * @vsig: VSIG for the VSI + * + * This function will update the XLT2 hardware table with the input VSI + * group configuration. 
+ */ +static enum ice_status +ice_vsig_update_xlt2_sect(struct ice_hw *hw, enum ice_block blk, u16 vsi, + u16 vsig) +{ + struct ice_xlt2_section *sect; + struct ice_buf_build *bld; + enum ice_status status; + + bld = ice_pkg_buf_alloc_single_section(hw, ice_sect_id(blk, ICE_XLT2), + struct_size(sect, value, 1), + (void **)§); + if (!bld) + return ICE_ERR_NO_MEMORY; + + sect->count = cpu_to_le16(1); + sect->offset = cpu_to_le16(vsi); + sect->value[0] = cpu_to_le16(vsig); + + status = ice_update_pkg(hw, ice_pkg_buf(bld), 1); + + ice_pkg_buf_free(hw, bld); + + return status; +} + +/** + * ice_vsig_update_xlt2 - update XLT2 table with VSIG configuration + * @hw: pointer to the hardware structure + * @blk: HW block + * + * This function will update the XLT2 hardware table with the input VSI + * group configuration of used vsis. + */ +enum ice_status ice_vsig_update_xlt2(struct ice_hw *hw, enum ice_block blk) +{ + u16 vsi; + + for (vsi = 0; vsi < ICE_MAX_VSI; vsi++) { + /* update only vsis that have been changed */ + if (hw->blk[blk].xlt2.vsis[vsi].changed) { + enum ice_status status; + u16 vsig; + + vsig = hw->blk[blk].xlt2.vsis[vsi].vsig; + status = ice_vsig_update_xlt2_sect(hw, blk, vsi, vsig); + if (status) + return status; + + hw->blk[blk].xlt2.vsis[vsi].changed = 0; + } + } + + return 0; +} + +/** + * ice_vsig_find_vsi - find a VSIG that contains a specified VSI + * @hw: pointer to the hardware structure + * @blk: HW block + * @vsi: VSI of interest + * @vsig: pointer to receive the VSI group + * + * This function will lookup the VSI entry in the XLT2 list and return + * the VSI group its associated with. + */ +static enum ice_status +ice_vsig_find_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 *vsig) +{ + if (!vsig || vsi >= ICE_MAX_VSI) + return ICE_ERR_PARAM; + + /* As long as there's a default or valid VSIG associated with the input + * VSI, the functions returns a success. Any handling of VSIG will be + * done by the following add, update or remove functions. + */ + *vsig = hw->blk[blk].xlt2.vsis[vsi].vsig; + + return 0; +} + +/** + * ice_vsig_alloc_val - allocate a new VSIG by value + * @hw: pointer to the hardware structure + * @blk: HW block + * @vsig: the VSIG to allocate + * + * This function will allocate a given VSIG specified by the VSIG parameter. + */ +static u16 ice_vsig_alloc_val(struct ice_hw *hw, enum ice_block blk, u16 vsig) +{ + u16 idx = vsig & ICE_VSIG_IDX_M; + + if (!hw->blk[blk].xlt2.vsig_tbl[idx].in_use) { + INIT_LIST_HEAD(&hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst); + hw->blk[blk].xlt2.vsig_tbl[idx].in_use = true; + } + + return ICE_VSIG_VALUE(idx, hw->pf_id); +} + +/** + * ice_vsig_alloc - Finds a free entry and allocates a new VSIG + * @hw: pointer to the hardware structure + * @blk: HW block + * + * This function will iterate through the VSIG list and mark the first + * unused entry for the new VSIG entry as used and return that value. + */ +static u16 ice_vsig_alloc(struct ice_hw *hw, enum ice_block blk) +{ + u16 i; + + for (i = 1; i < ICE_MAX_VSIGS; i++) + if (!hw->blk[blk].xlt2.vsig_tbl[i].in_use) + return ice_vsig_alloc_val(hw, blk, i); + + return ICE_DEFAULT_VSIG; +} + +/** + * ice_find_dup_props_vsig - find VSI group with a specified set of properties + * @hw: pointer to the hardware structure + * @blk: HW block + * @chs: characteristic list + * @vsig: returns the VSIG with the matching profiles, if found + * + * Each VSIG is associated with a characteristic set; i.e. all VSIs under + * a group have the same characteristic set. 
To check if there exists a VSIG + * which has the same characteristics as the input characteristics; this + * function will iterate through the XLT2 list and return the VSIG that has a + * matching configuration. In order to make sure that priorities are accounted + * for, the list must match exactly, including the order in which the + * characteristics are listed. + */ +static enum ice_status +ice_find_dup_props_vsig(struct ice_hw *hw, enum ice_block blk, + struct list_head *chs, u16 *vsig) +{ + struct ice_xlt2 *xlt2 = &hw->blk[blk].xlt2; + u16 i; + + for (i = 0; i < xlt2->count; i++) + if (xlt2->vsig_tbl[i].in_use && + ice_match_prop_lst(chs, &xlt2->vsig_tbl[i].prop_lst)) { + *vsig = ICE_VSIG_VALUE(i, hw->pf_id); + return 0; + } + + return ICE_ERR_DOES_NOT_EXIST; +} + +/** + * ice_vsig_free - free VSI group + * @hw: pointer to the hardware structure + * @blk: HW block + * @vsig: VSIG to remove + * + * The function will remove all VSIs associated with the input VSIG and move + * them to the DEFAULT_VSIG and mark the VSIG available. + */ +static enum ice_status +ice_vsig_free(struct ice_hw *hw, enum ice_block blk, u16 vsig) +{ + struct ice_vsig_prof *dtmp, *del; + struct ice_vsig_vsi *vsi_cur; + u16 idx; + + idx = vsig & ICE_VSIG_IDX_M; + if (idx >= ICE_MAX_VSIGS) + return ICE_ERR_PARAM; + + if (!hw->blk[blk].xlt2.vsig_tbl[idx].in_use) + return ICE_ERR_DOES_NOT_EXIST; + + hw->blk[blk].xlt2.vsig_tbl[idx].in_use = false; + + vsi_cur = hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi; + /* If the VSIG has at least 1 VSI then iterate through the + * list and remove the VSIs before deleting the group. + */ + if (vsi_cur) { + /* remove all vsis associated with this VSIG XLT2 entry */ + do { + struct ice_vsig_vsi *tmp = vsi_cur->next_vsi; + + vsi_cur->vsig = ICE_DEFAULT_VSIG; + vsi_cur->changed = 1; + vsi_cur->next_vsi = NULL; + vsi_cur = tmp; + } while (vsi_cur); + + /* NULL terminate head of VSI list */ + hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi = NULL; + } + + /* free characteristic list */ + list_for_each_entry_safe(del, dtmp, + &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) { + list_del(&del->list); + devm_kfree(ice_hw_to_dev(hw), del); + } + + /* if VSIG characteristic list was cleared for reset + * re-initialize the list head + */ + INIT_LIST_HEAD(&hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst); + + return 0; +} + +/** + * ice_vsig_remove_vsi - remove VSI from VSIG + * @hw: pointer to the hardware structure + * @blk: HW block + * @vsi: VSI to remove + * @vsig: VSI group to remove from + * + * The function will remove the input VSI from its VSI group and move it + * to the DEFAULT_VSIG. 
+ */ +static enum ice_status +ice_vsig_remove_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 vsig) +{ + struct ice_vsig_vsi **vsi_head, *vsi_cur, *vsi_tgt; + u16 idx; + + idx = vsig & ICE_VSIG_IDX_M; + + if (vsi >= ICE_MAX_VSI || idx >= ICE_MAX_VSIGS) + return ICE_ERR_PARAM; + + if (!hw->blk[blk].xlt2.vsig_tbl[idx].in_use) + return ICE_ERR_DOES_NOT_EXIST; + + /* entry already in default VSIG, don't have to remove */ + if (idx == ICE_DEFAULT_VSIG) + return 0; + + vsi_head = &hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi; + if (!(*vsi_head)) + return ICE_ERR_CFG; + + vsi_tgt = &hw->blk[blk].xlt2.vsis[vsi]; + vsi_cur = (*vsi_head); + + /* iterate the VSI list, skip over the entry to be removed */ + while (vsi_cur) { + if (vsi_tgt == vsi_cur) { + (*vsi_head) = vsi_cur->next_vsi; + break; + } + vsi_head = &vsi_cur->next_vsi; + vsi_cur = vsi_cur->next_vsi; + } + + /* verify if VSI was removed from group list */ + if (!vsi_cur) + return ICE_ERR_DOES_NOT_EXIST; + + vsi_cur->vsig = ICE_DEFAULT_VSIG; + vsi_cur->changed = 1; + vsi_cur->next_vsi = NULL; + + return 0; +} + +/** + * ice_vsig_add_mv_vsi - add or move a VSI to a VSI group + * @hw: pointer to the hardware structure + * @blk: HW block + * @vsi: VSI to move + * @vsig: destination VSI group + * + * This function will move or add the input VSI to the target VSIG. + * The function will find the original VSIG the VSI belongs to and + * move the entry to the DEFAULT_VSIG, update the original VSIG and + * then move entry to the new VSIG. + */ +static enum ice_status +ice_vsig_add_mv_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 vsig) +{ + struct ice_vsig_vsi *tmp; + enum ice_status status; + u16 orig_vsig, idx; + + idx = vsig & ICE_VSIG_IDX_M; + + if (vsi >= ICE_MAX_VSI || idx >= ICE_MAX_VSIGS) + return ICE_ERR_PARAM; + + /* if VSIG not in use and VSIG is not default type this VSIG + * doesn't exist. 
+	 */
+	if (!hw->blk[blk].xlt2.vsig_tbl[idx].in_use &&
+	    vsig != ICE_DEFAULT_VSIG)
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	status = ice_vsig_find_vsi(hw, blk, vsi, &orig_vsig);
+	if (status)
+		return status;
+
+	/* no update required if vsigs match */
+	if (orig_vsig == vsig)
+		return 0;
+
+	if (orig_vsig != ICE_DEFAULT_VSIG) {
+		/* remove entry from orig_vsig and add to default VSIG */
+		status = ice_vsig_remove_vsi(hw, blk, vsi, orig_vsig);
+		if (status)
+			return status;
+	}
+
+	if (idx == ICE_DEFAULT_VSIG)
+		return 0;
+
+	/* Create VSI entry and add VSIG and prop_mask values */
+	hw->blk[blk].xlt2.vsis[vsi].vsig = vsig;
+	hw->blk[blk].xlt2.vsis[vsi].changed = 1;
+
+	/* Add new entry to the head of the VSIG list */
+	tmp = hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi;
+	hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi =
+		&hw->blk[blk].xlt2.vsis[vsi];
+	hw->blk[blk].xlt2.vsis[vsi].next_vsi = tmp;
+	hw->blk[blk].xlt2.t[vsi] = vsig;
+
+	return 0;
+}
+
+/**
+ * ice_prof_has_mask_idx - determine if profile index masking is identical
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ * @prof: profile to check
+ * @idx: profile index to check
+ * @mask: mask to match
+ */
+static bool
+ice_prof_has_mask_idx(struct ice_hw *hw, enum ice_block blk, u8 prof, u16 idx,
+		      u16 mask)
+{
+	bool expect_no_mask = false;
+	bool found = false;
+	bool match = false;
+	u16 i;
+
+	/* If mask is 0x0000 or 0xffff, then there is no masking */
+	if (mask == 0 || mask == 0xffff)
+		expect_no_mask = true;
+
+	/* Scan the enabled masks on this profile, for the specified idx */
+	for (i = hw->blk[blk].masks.first; i < hw->blk[blk].masks.first +
+	     hw->blk[blk].masks.count; i++)
+		if (hw->blk[blk].es.mask_ena[prof] & BIT(i))
+			if (hw->blk[blk].masks.masks[i].in_use &&
+			    hw->blk[blk].masks.masks[i].idx == idx) {
+				found = true;
+				if (hw->blk[blk].masks.masks[i].mask == mask)
+					match = true;
+				break;
+			}
+
+	if (expect_no_mask) {
+		if (found)
+			return false;
+	} else {
+		if (!match)
+			return false;
+	}
+
+	return true;
+}
+
+/**
+ * ice_prof_has_mask - determine if profile masking is identical
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ * @prof: profile to check
+ * @masks: masks to match
+ */
+static bool
+ice_prof_has_mask(struct ice_hw *hw, enum ice_block blk, u8 prof, u16 *masks)
+{
+	u16 i;
+
+	/* es->mask_ena[prof] will have the mask */
+	for (i = 0; i < hw->blk[blk].es.fvw; i++)
+		if (!ice_prof_has_mask_idx(hw, blk, prof, i, masks[i]))
+			return false;
+
+	return true;
+}
+
+/**
+ * ice_find_prof_id_with_mask - find profile ID for a given field vector
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ * @fv: field vector to search for
+ * @masks: masks for fv
+ * @prof_id: receives the profile ID
+ */
+static enum ice_status
+ice_find_prof_id_with_mask(struct ice_hw *hw, enum ice_block blk,
+			   struct ice_fv_word *fv, u16 *masks, u8 *prof_id)
+{
+	struct ice_es *es = &hw->blk[blk].es;
+	u8 i;
+
+	/* For FD and RSS we don't want to reuse an existing profile with the
+	 * same field vector and mask, since that would cause rule interference.
+ */ + if (blk == ICE_BLK_FD || blk == ICE_BLK_RSS) + return ICE_ERR_DOES_NOT_EXIST; + + for (i = 0; i < (u8)es->count; i++) { + u16 off = i * es->fvw; + + if (memcmp(&es->t[off], fv, es->fvw * sizeof(*fv))) + continue; + + /* check if masks settings are the same for this profile */ + if (masks && !ice_prof_has_mask(hw, blk, i, masks)) + continue; + + *prof_id = i; + return 0; + } + + return ICE_ERR_DOES_NOT_EXIST; +} + +/** + * ice_prof_id_rsrc_type - get profile ID resource type for a block type + * @blk: the block type + * @rsrc_type: pointer to variable to receive the resource type + */ +static bool ice_prof_id_rsrc_type(enum ice_block blk, u16 *rsrc_type) +{ + switch (blk) { + case ICE_BLK_SW: + *rsrc_type = ICE_AQC_RES_TYPE_SWITCH_PROF_BLDR_PROFID; + break; + case ICE_BLK_ACL: + *rsrc_type = ICE_AQC_RES_TYPE_ACL_PROF_BLDR_PROFID; + break; + case ICE_BLK_FD: + *rsrc_type = ICE_AQC_RES_TYPE_FD_PROF_BLDR_PROFID; + break; + case ICE_BLK_RSS: + *rsrc_type = ICE_AQC_RES_TYPE_HASH_PROF_BLDR_PROFID; + break; + case ICE_BLK_PE: + *rsrc_type = ICE_AQC_RES_TYPE_QHASH_PROF_BLDR_PROFID; + break; + default: + return false; + } + return true; +} + +/** + * ice_tcam_ent_rsrc_type - get TCAM entry resource type for a block type + * @blk: the block type + * @rsrc_type: pointer to variable to receive the resource type + */ +static bool ice_tcam_ent_rsrc_type(enum ice_block blk, u16 *rsrc_type) +{ + switch (blk) { + case ICE_BLK_SW: + *rsrc_type = ICE_AQC_RES_TYPE_SWITCH_PROF_BLDR_TCAM; + break; + case ICE_BLK_ACL: + *rsrc_type = ICE_AQC_RES_TYPE_ACL_PROF_BLDR_TCAM; + break; + case ICE_BLK_FD: + *rsrc_type = ICE_AQC_RES_TYPE_FD_PROF_BLDR_TCAM; + break; + case ICE_BLK_RSS: + *rsrc_type = ICE_AQC_RES_TYPE_HASH_PROF_BLDR_TCAM; + break; + case ICE_BLK_PE: + *rsrc_type = ICE_AQC_RES_TYPE_QHASH_PROF_BLDR_TCAM; + break; + default: + return false; + } + return true; +} + +/** + * ice_alloc_tcam_ent - allocate hardware TCAM entry + * @hw: pointer to the HW struct + * @blk: the block to allocate the TCAM for + * @btm: true to allocate from bottom of table, false to allocate from top + * @tcam_idx: pointer to variable to receive the TCAM entry + * + * This function allocates a new entry in a Profile ID TCAM for a specific + * block. + */ +static enum ice_status +ice_alloc_tcam_ent(struct ice_hw *hw, enum ice_block blk, bool btm, + u16 *tcam_idx) +{ + u16 res_type; + + if (!ice_tcam_ent_rsrc_type(blk, &res_type)) + return ICE_ERR_PARAM; + + return ice_alloc_hw_res(hw, res_type, 1, btm, tcam_idx); +} + +/** + * ice_free_tcam_ent - free hardware TCAM entry + * @hw: pointer to the HW struct + * @blk: the block from which to free the TCAM entry + * @tcam_idx: the TCAM entry to free + * + * This function frees an entry in a Profile ID TCAM for a specific block. + */ +static enum ice_status +ice_free_tcam_ent(struct ice_hw *hw, enum ice_block blk, u16 tcam_idx) +{ + u16 res_type; + + if (!ice_tcam_ent_rsrc_type(blk, &res_type)) + return ICE_ERR_PARAM; + + return ice_free_hw_res(hw, res_type, 1, &tcam_idx); +} + +/** + * ice_alloc_prof_id - allocate profile ID + * @hw: pointer to the HW struct + * @blk: the block to allocate the profile ID for + * @prof_id: pointer to variable to receive the profile ID + * + * This function allocates a new profile ID, which also corresponds to a Field + * Vector (Extraction Sequence) entry. 
+ */ +static enum ice_status +ice_alloc_prof_id(struct ice_hw *hw, enum ice_block blk, u8 *prof_id) +{ + enum ice_status status; + u16 res_type; + u16 get_prof; + + if (!ice_prof_id_rsrc_type(blk, &res_type)) + return ICE_ERR_PARAM; + + status = ice_alloc_hw_res(hw, res_type, 1, false, &get_prof); + if (!status) + *prof_id = (u8)get_prof; + + return status; +} + +/** + * ice_free_prof_id - free profile ID + * @hw: pointer to the HW struct + * @blk: the block from which to free the profile ID + * @prof_id: the profile ID to free + * + * This function frees a profile ID, which also corresponds to a Field Vector. + */ +static enum ice_status +ice_free_prof_id(struct ice_hw *hw, enum ice_block blk, u8 prof_id) +{ + u16 tmp_prof_id = (u16)prof_id; + u16 res_type; + + if (!ice_prof_id_rsrc_type(blk, &res_type)) + return ICE_ERR_PARAM; + + return ice_free_hw_res(hw, res_type, 1, &tmp_prof_id); +} + +/** + * ice_prof_inc_ref - increment reference count for profile + * @hw: pointer to the HW struct + * @blk: the block from which to free the profile ID + * @prof_id: the profile ID for which to increment the reference count + */ +static enum ice_status +ice_prof_inc_ref(struct ice_hw *hw, enum ice_block blk, u8 prof_id) +{ + if (prof_id > hw->blk[blk].es.count) + return ICE_ERR_PARAM; + + hw->blk[blk].es.ref_count[prof_id]++; + + return 0; +} + +/** + * ice_write_prof_mask_reg - write profile mask register + * @hw: pointer to the HW struct + * @blk: hardware block + * @mask_idx: mask index + * @idx: index of the FV which will use the mask + * @mask: the 16-bit mask + */ +static void +ice_write_prof_mask_reg(struct ice_hw *hw, enum ice_block blk, u16 mask_idx, + u16 idx, u16 mask) +{ + u32 offset; + u32 val; + + switch (blk) { + case ICE_BLK_RSS: + offset = GLQF_HMASK(mask_idx); + val = (idx << GLQF_HMASK_MSK_INDEX_S) & + GLQF_HMASK_MSK_INDEX_M; + val |= (mask << GLQF_HMASK_MASK_S) & GLQF_HMASK_MASK_M; + break; + case ICE_BLK_FD: + offset = GLQF_FDMASK(mask_idx); + val = (idx << GLQF_FDMASK_MSK_INDEX_S) & + GLQF_FDMASK_MSK_INDEX_M; + val |= (mask << GLQF_FDMASK_MASK_S) & + GLQF_FDMASK_MASK_M; + break; + default: + ice_debug(hw, ICE_DBG_PKG, "No profile masks for block %d\n", + blk); + return; + } + + wr32(hw, offset, val); + ice_debug(hw, ICE_DBG_PKG, "write mask, blk %d (%d): %x = %x\n", + blk, idx, offset, val); +} + +/** + * ice_write_prof_mask_enable_res - write profile mask enable register + * @hw: pointer to the HW struct + * @blk: hardware block + * @prof_id: profile ID + * @enable_mask: enable mask + */ +static void +ice_write_prof_mask_enable_res(struct ice_hw *hw, enum ice_block blk, + u16 prof_id, u32 enable_mask) +{ + u32 offset; + + switch (blk) { + case ICE_BLK_RSS: + offset = GLQF_HMASK_SEL(prof_id); + break; + case ICE_BLK_FD: + offset = GLQF_FDMASK_SEL(prof_id); + break; + default: + ice_debug(hw, ICE_DBG_PKG, "No profile masks for block %d\n", + blk); + return; + } + + wr32(hw, offset, enable_mask); + ice_debug(hw, ICE_DBG_PKG, "write mask enable, blk %d (%d): %x = %x\n", + blk, prof_id, offset, enable_mask); +} + +/** + * ice_init_prof_masks - initial prof masks + * @hw: pointer to the HW struct + * @blk: hardware block + */ +static void ice_init_prof_masks(struct ice_hw *hw, enum ice_block blk) +{ + u16 per_pf; + u16 i; + + mutex_init(&hw->blk[blk].masks.lock); + + per_pf = ICE_PROF_MASK_COUNT / hw->dev_caps.num_funcs; + + hw->blk[blk].masks.count = per_pf; + hw->blk[blk].masks.first = hw->pf_id * per_pf; + + memset(hw->blk[blk].masks.masks, 0, sizeof(hw->blk[blk].masks.masks)); + 
+ for (i = hw->blk[blk].masks.first; + i < hw->blk[blk].masks.first + hw->blk[blk].masks.count; i++) + ice_write_prof_mask_reg(hw, blk, i, 0, 0); +} + +/** + * ice_init_all_prof_masks - initial all prof masks + * @hw: pointer to the HW struct + */ +void ice_init_all_prof_masks(struct ice_hw *hw) +{ + ice_init_prof_masks(hw, ICE_BLK_RSS); + ice_init_prof_masks(hw, ICE_BLK_FD); +} + +/** + * ice_alloc_prof_mask - allocate profile mask + * @hw: pointer to the HW struct + * @blk: hardware block + * @idx: index of FV which will use the mask + * @mask: the 16-bit mask + * @mask_idx: variable to receive the mask index + */ +static enum ice_status +ice_alloc_prof_mask(struct ice_hw *hw, enum ice_block blk, u16 idx, u16 mask, + u16 *mask_idx) +{ + bool found_unused = false, found_copy = false; + enum ice_status status = ICE_ERR_MAX_LIMIT; + u16 unused_idx = 0, copy_idx = 0; + u16 i; + + if (blk != ICE_BLK_RSS && blk != ICE_BLK_FD) + return ICE_ERR_PARAM; + + mutex_lock(&hw->blk[blk].masks.lock); + + for (i = hw->blk[blk].masks.first; + i < hw->blk[blk].masks.first + hw->blk[blk].masks.count; i++) + if (hw->blk[blk].masks.masks[i].in_use) { + /* if mask is in use and it exactly duplicates the + * desired mask and index, then in can be reused + */ + if (hw->blk[blk].masks.masks[i].mask == mask && + hw->blk[blk].masks.masks[i].idx == idx) { + found_copy = true; + copy_idx = i; + break; + } + } else { + /* save off unused index, but keep searching in case + * there is an exact match later on + */ + if (!found_unused) { + found_unused = true; + unused_idx = i; + } + } + + if (found_copy) + i = copy_idx; + else if (found_unused) + i = unused_idx; + else + goto err_ice_alloc_prof_mask; + + /* update mask for a new entry */ + if (found_unused) { + hw->blk[blk].masks.masks[i].in_use = true; + hw->blk[blk].masks.masks[i].mask = mask; + hw->blk[blk].masks.masks[i].idx = idx; + hw->blk[blk].masks.masks[i].ref = 0; + ice_write_prof_mask_reg(hw, blk, i, idx, mask); + } + + hw->blk[blk].masks.masks[i].ref++; + *mask_idx = i; + status = 0; + +err_ice_alloc_prof_mask: + mutex_unlock(&hw->blk[blk].masks.lock); + + return status; +} + +/** + * ice_free_prof_mask - free profile mask + * @hw: pointer to the HW struct + * @blk: hardware block + * @mask_idx: index of mask + */ +static enum ice_status +ice_free_prof_mask(struct ice_hw *hw, enum ice_block blk, u16 mask_idx) +{ + if (blk != ICE_BLK_RSS && blk != ICE_BLK_FD) + return ICE_ERR_PARAM; + + if (!(mask_idx >= hw->blk[blk].masks.first && + mask_idx < hw->blk[blk].masks.first + hw->blk[blk].masks.count)) + return ICE_ERR_DOES_NOT_EXIST; + + mutex_lock(&hw->blk[blk].masks.lock); + + if (!hw->blk[blk].masks.masks[mask_idx].in_use) + goto exit_ice_free_prof_mask; + + if (hw->blk[blk].masks.masks[mask_idx].ref > 1) { + hw->blk[blk].masks.masks[mask_idx].ref--; + goto exit_ice_free_prof_mask; + } + + /* remove mask */ + hw->blk[blk].masks.masks[mask_idx].in_use = false; + hw->blk[blk].masks.masks[mask_idx].mask = 0; + hw->blk[blk].masks.masks[mask_idx].idx = 0; + + /* update mask as unused entry */ + ice_debug(hw, ICE_DBG_PKG, "Free mask, blk %d, mask %d\n", blk, + mask_idx); + ice_write_prof_mask_reg(hw, blk, mask_idx, 0, 0); + +exit_ice_free_prof_mask: + mutex_unlock(&hw->blk[blk].masks.lock); + + return 0; +} + +/** + * ice_free_prof_masks - free all profile masks for a profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @prof_id: profile ID + */ +static enum ice_status +ice_free_prof_masks(struct ice_hw *hw, enum ice_block blk, u16 prof_id) +{ + 
u32 mask_bm; + u16 i; + + if (blk != ICE_BLK_RSS && blk != ICE_BLK_FD) + return ICE_ERR_PARAM; + + mask_bm = hw->blk[blk].es.mask_ena[prof_id]; + for (i = 0; i < BITS_PER_BYTE * sizeof(mask_bm); i++) + if (mask_bm & BIT(i)) + ice_free_prof_mask(hw, blk, i); + + return 0; +} + +/** + * ice_shutdown_prof_masks - releases lock for masking + * @hw: pointer to the HW struct + * @blk: hardware block + * + * This should be called before unloading the driver + */ +static void ice_shutdown_prof_masks(struct ice_hw *hw, enum ice_block blk) +{ + u16 i; + + mutex_lock(&hw->blk[blk].masks.lock); + + for (i = hw->blk[blk].masks.first; + i < hw->blk[blk].masks.first + hw->blk[blk].masks.count; i++) { + ice_write_prof_mask_reg(hw, blk, i, 0, 0); + + hw->blk[blk].masks.masks[i].in_use = false; + hw->blk[blk].masks.masks[i].idx = 0; + hw->blk[blk].masks.masks[i].mask = 0; + } + + mutex_unlock(&hw->blk[blk].masks.lock); + mutex_destroy(&hw->blk[blk].masks.lock); +} + +/** + * ice_shutdown_all_prof_masks - releases all locks for masking + * @hw: pointer to the HW struct + * + * This should be called before unloading the driver + */ +void ice_shutdown_all_prof_masks(struct ice_hw *hw) +{ + ice_shutdown_prof_masks(hw, ICE_BLK_RSS); + ice_shutdown_prof_masks(hw, ICE_BLK_FD); +} + +/** + * ice_update_prof_masking - set registers according to masking + * @hw: pointer to the HW struct + * @blk: hardware block + * @prof_id: profile ID + * @masks: masks + */ +static enum ice_status +ice_update_prof_masking(struct ice_hw *hw, enum ice_block blk, u16 prof_id, + u16 *masks) +{ + bool err = false; + u32 ena_mask = 0; + u16 idx; + u16 i; + + /* Only support FD and RSS masking, otherwise nothing to be done */ + if (blk != ICE_BLK_RSS && blk != ICE_BLK_FD) + return 0; + + for (i = 0; i < hw->blk[blk].es.fvw; i++) + if (masks[i] && masks[i] != 0xFFFF) { + if (!ice_alloc_prof_mask(hw, blk, i, masks[i], &idx)) { + ena_mask |= BIT(idx); + } else { + /* not enough bitmaps */ + err = true; + break; + } + } + + if (err) { + /* free any bitmaps we have allocated */ + for (i = 0; i < BITS_PER_BYTE * sizeof(ena_mask); i++) + if (ena_mask & BIT(i)) + ice_free_prof_mask(hw, blk, i); + + return ICE_ERR_OUT_OF_RANGE; + } + + /* enable the masks for this profile */ + ice_write_prof_mask_enable_res(hw, blk, prof_id, ena_mask); + + /* store enabled masks with profile so that they can be freed later */ + hw->blk[blk].es.mask_ena[prof_id] = ena_mask; + + return 0; +} + +/** + * ice_write_es - write an extraction sequence to hardware + * @hw: pointer to the HW struct + * @blk: the block in which to write the extraction sequence + * @prof_id: the profile ID to write + * @fv: pointer to the extraction sequence to write - NULL to clear extraction + */ +static void +ice_write_es(struct ice_hw *hw, enum ice_block blk, u8 prof_id, + struct ice_fv_word *fv) +{ + u16 off; + + off = prof_id * hw->blk[blk].es.fvw; + if (!fv) { + memset(&hw->blk[blk].es.t[off], 0, + hw->blk[blk].es.fvw * sizeof(*fv)); + hw->blk[blk].es.written[prof_id] = false; + } else { + memcpy(&hw->blk[blk].es.t[off], fv, + hw->blk[blk].es.fvw * sizeof(*fv)); + } +} + +/** + * ice_prof_dec_ref - decrement reference count for profile + * @hw: pointer to the HW struct + * @blk: the block from which to free the profile ID + * @prof_id: the profile ID for which to decrement the reference count + */ +static enum ice_status +ice_prof_dec_ref(struct ice_hw *hw, enum ice_block blk, u8 prof_id) +{ + if (prof_id > hw->blk[blk].es.count) + return ICE_ERR_PARAM; + + if 
(hw->blk[blk].es.ref_count[prof_id] > 0) { + if (!--hw->blk[blk].es.ref_count[prof_id]) { + ice_write_es(hw, blk, prof_id, NULL); + ice_free_prof_masks(hw, blk, prof_id); + return ice_free_prof_id(hw, blk, prof_id); + } + } + + return 0; +} + +/* Block / table section IDs */ +static const u32 ice_blk_sids[ICE_BLK_COUNT][ICE_SID_OFF_COUNT] = { + /* SWITCH */ + { ICE_SID_XLT1_SW, + ICE_SID_XLT2_SW, + ICE_SID_PROFID_TCAM_SW, + ICE_SID_PROFID_REDIR_SW, + ICE_SID_FLD_VEC_SW + }, + + /* ACL */ + { ICE_SID_XLT1_ACL, + ICE_SID_XLT2_ACL, + ICE_SID_PROFID_TCAM_ACL, + ICE_SID_PROFID_REDIR_ACL, + ICE_SID_FLD_VEC_ACL + }, + + /* FD */ + { ICE_SID_XLT1_FD, + ICE_SID_XLT2_FD, + ICE_SID_PROFID_TCAM_FD, + ICE_SID_PROFID_REDIR_FD, + ICE_SID_FLD_VEC_FD + }, + + /* RSS */ + { ICE_SID_XLT1_RSS, + ICE_SID_XLT2_RSS, + ICE_SID_PROFID_TCAM_RSS, + ICE_SID_PROFID_REDIR_RSS, + ICE_SID_FLD_VEC_RSS + }, + + /* PE */ + { ICE_SID_XLT1_PE, + ICE_SID_XLT2_PE, + ICE_SID_PROFID_TCAM_PE, + ICE_SID_PROFID_REDIR_PE, + ICE_SID_FLD_VEC_PE + } +}; + +/** + * ice_init_sw_xlt1_db - init software XLT1 database from HW tables + * @hw: pointer to the hardware structure + * @blk: the HW block to initialize + */ +static void ice_init_sw_xlt1_db(struct ice_hw *hw, enum ice_block blk) +{ + u16 pt; + + for (pt = 0; pt < hw->blk[blk].xlt1.count; pt++) { + u8 ptg; + + ptg = hw->blk[blk].xlt1.t[pt]; + if (ptg != ICE_DEFAULT_PTG) { + ice_ptg_alloc_val(hw, blk, ptg); + ice_ptg_add_mv_ptype(hw, blk, pt, ptg); + } + } +} + +/** + * ice_init_sw_xlt2_db - init software XLT2 database from HW tables + * @hw: pointer to the hardware structure + * @blk: the HW block to initialize + */ +static void ice_init_sw_xlt2_db(struct ice_hw *hw, enum ice_block blk) +{ + u16 vsi; + + for (vsi = 0; vsi < hw->blk[blk].xlt2.count; vsi++) { + u16 vsig; + + vsig = hw->blk[blk].xlt2.t[vsi]; + if (vsig) { + ice_vsig_alloc_val(hw, blk, vsig); + ice_vsig_add_mv_vsi(hw, blk, vsi, vsig); + /* no changes at this time, since this has been + * initialized from the original package + */ + hw->blk[blk].xlt2.vsis[vsi].changed = 0; + } + } +} + +/** + * ice_init_sw_db - init software database from HW tables + * @hw: pointer to the hardware structure + */ +static void ice_init_sw_db(struct ice_hw *hw) +{ + u16 i; + + for (i = 0; i < ICE_BLK_COUNT; i++) { + ice_init_sw_xlt1_db(hw, (enum ice_block)i); + ice_init_sw_xlt2_db(hw, (enum ice_block)i); + } +} + +/** + * ice_fill_tbl - Reads content of a single table type into database + * @hw: pointer to the hardware structure + * @block_id: Block ID of the table to copy + * @sid: Section ID of the table to copy + * + * Will attempt to read the entire content of a given table of a single block + * into the driver database. We assume that the buffer will always + * be as large or larger than the data contained in the package. If + * this condition is not met, there is most likely an error in the package + * contents. + */ +static void ice_fill_tbl(struct ice_hw *hw, enum ice_block block_id, u32 sid) +{ + u32 dst_len, sect_len, offset = 0; + struct ice_prof_redir_section *pr; + struct ice_prof_id_section *pid; + struct ice_xlt1_section *xlt1; + struct ice_xlt2_section *xlt2; + struct ice_sw_fv_section *es; + struct ice_pkg_enum state; + u8 *src, *dst; + void *sect; + + /* if the HW segment pointer is null then the first iteration of + * ice_pkg_enum_section() will fail. In this case the HW tables will + * not be filled and return success. 
+ */ + if (!hw->seg) { + ice_debug(hw, ICE_DBG_PKG, "hw->seg is NULL, tables are not filled\n"); + return; + } + + memset(&state, 0, sizeof(state)); + + sect = ice_pkg_enum_section(hw->seg, &state, sid); while (sect) { switch (sid) { @@ -1262,7 +4132,7 @@ static void ice_fill_tbl(struct ice_hw *hw, enum ice_block block_id, u32 sid) case ICE_SID_XLT1_RSS: case ICE_SID_XLT1_ACL: case ICE_SID_XLT1_PE: - xlt1 = (struct ice_xlt1_section *)sect; + xlt1 = sect; src = xlt1->value; sect_len = le16_to_cpu(xlt1->count) * sizeof(*hw->blk[block_id].xlt1.t); @@ -1270,282 +4140,2522 @@ static void ice_fill_tbl(struct ice_hw *hw, enum ice_block block_id, u32 sid) dst_len = hw->blk[block_id].xlt1.count * sizeof(*hw->blk[block_id].xlt1.t); break; - case ICE_SID_XLT2_SW: - case ICE_SID_XLT2_FD: - case ICE_SID_XLT2_RSS: - case ICE_SID_XLT2_ACL: - case ICE_SID_XLT2_PE: - xlt2 = (struct ice_xlt2_section *)sect; - src = (__force u8 *)xlt2->value; - sect_len = le16_to_cpu(xlt2->count) * - sizeof(*hw->blk[block_id].xlt2.t); - dst = (u8 *)hw->blk[block_id].xlt2.t; - dst_len = hw->blk[block_id].xlt2.count * - sizeof(*hw->blk[block_id].xlt2.t); + case ICE_SID_XLT2_SW: + case ICE_SID_XLT2_FD: + case ICE_SID_XLT2_RSS: + case ICE_SID_XLT2_ACL: + case ICE_SID_XLT2_PE: + xlt2 = sect; + src = (__force u8 *)xlt2->value; + sect_len = le16_to_cpu(xlt2->count) * + sizeof(*hw->blk[block_id].xlt2.t); + dst = (u8 *)hw->blk[block_id].xlt2.t; + dst_len = hw->blk[block_id].xlt2.count * + sizeof(*hw->blk[block_id].xlt2.t); + break; + case ICE_SID_PROFID_TCAM_SW: + case ICE_SID_PROFID_TCAM_FD: + case ICE_SID_PROFID_TCAM_RSS: + case ICE_SID_PROFID_TCAM_ACL: + case ICE_SID_PROFID_TCAM_PE: + pid = sect; + src = (u8 *)pid->entry; + sect_len = le16_to_cpu(pid->count) * + sizeof(*hw->blk[block_id].prof.t); + dst = (u8 *)hw->blk[block_id].prof.t; + dst_len = hw->blk[block_id].prof.count * + sizeof(*hw->blk[block_id].prof.t); + break; + case ICE_SID_PROFID_REDIR_SW: + case ICE_SID_PROFID_REDIR_FD: + case ICE_SID_PROFID_REDIR_RSS: + case ICE_SID_PROFID_REDIR_ACL: + case ICE_SID_PROFID_REDIR_PE: + pr = sect; + src = pr->redir_value; + sect_len = le16_to_cpu(pr->count) * + sizeof(*hw->blk[block_id].prof_redir.t); + dst = hw->blk[block_id].prof_redir.t; + dst_len = hw->blk[block_id].prof_redir.count * + sizeof(*hw->blk[block_id].prof_redir.t); + break; + case ICE_SID_FLD_VEC_SW: + case ICE_SID_FLD_VEC_FD: + case ICE_SID_FLD_VEC_RSS: + case ICE_SID_FLD_VEC_ACL: + case ICE_SID_FLD_VEC_PE: + es = sect; + src = (u8 *)es->fv; + sect_len = (u32)(le16_to_cpu(es->count) * + hw->blk[block_id].es.fvw) * + sizeof(*hw->blk[block_id].es.t); + dst = (u8 *)hw->blk[block_id].es.t; + dst_len = (u32)(hw->blk[block_id].es.count * + hw->blk[block_id].es.fvw) * + sizeof(*hw->blk[block_id].es.t); + break; + default: + return; + } + + /* if the section offset exceeds destination length, terminate + * table fill. + */ + if (offset > dst_len) + return; + + /* if the sum of section size and offset exceed destination size + * then we are out of bounds of the HW table size for that PF. + * Changing section length to fill the remaining table space + * of that PF. 
+ */ + if ((offset + sect_len) > dst_len) + sect_len = dst_len - offset; + + memcpy(dst + offset, src, sect_len); + offset += sect_len; + sect = ice_pkg_enum_section(NULL, &state, sid); + } +} + +/** + * ice_fill_blk_tbls - Read package context for tables + * @hw: pointer to the hardware structure + * + * Reads the current package contents and populates the driver + * database with the data iteratively for all advanced feature + * blocks. Assume that the HW tables have been allocated. + */ +void ice_fill_blk_tbls(struct ice_hw *hw) +{ + u8 i; + + for (i = 0; i < ICE_BLK_COUNT; i++) { + enum ice_block blk_id = (enum ice_block)i; + + ice_fill_tbl(hw, blk_id, hw->blk[blk_id].xlt1.sid); + ice_fill_tbl(hw, blk_id, hw->blk[blk_id].xlt2.sid); + ice_fill_tbl(hw, blk_id, hw->blk[blk_id].prof.sid); + ice_fill_tbl(hw, blk_id, hw->blk[blk_id].prof_redir.sid); + ice_fill_tbl(hw, blk_id, hw->blk[blk_id].es.sid); + } + + ice_init_sw_db(hw); +} + +/** + * ice_free_prof_map - free profile map + * @hw: pointer to the hardware structure + * @blk_idx: HW block index + */ +static void ice_free_prof_map(struct ice_hw *hw, u8 blk_idx) +{ + struct ice_es *es = &hw->blk[blk_idx].es; + struct ice_prof_map *del, *tmp; + + mutex_lock(&es->prof_map_lock); + list_for_each_entry_safe(del, tmp, &es->prof_map, list) { + list_del(&del->list); + devm_kfree(ice_hw_to_dev(hw), del); + } + INIT_LIST_HEAD(&es->prof_map); + mutex_unlock(&es->prof_map_lock); +} + +/** + * ice_free_flow_profs - free flow profile entries + * @hw: pointer to the hardware structure + * @blk_idx: HW block index + */ +static void ice_free_flow_profs(struct ice_hw *hw, u8 blk_idx) +{ + struct ice_flow_prof *p, *tmp; + + mutex_lock(&hw->fl_profs_locks[blk_idx]); + list_for_each_entry_safe(p, tmp, &hw->fl_profs[blk_idx], l_entry) { + struct ice_flow_entry *e, *t; + + list_for_each_entry_safe(e, t, &p->entries, l_entry) + ice_flow_rem_entry(hw, (enum ice_block)blk_idx, + ICE_FLOW_ENTRY_HNDL(e)); + + list_del(&p->l_entry); + if (p->acts) + devm_kfree(ice_hw_to_dev(hw), p->acts); + + mutex_destroy(&p->entries_lock); + devm_kfree(ice_hw_to_dev(hw), p); + } + mutex_unlock(&hw->fl_profs_locks[blk_idx]); + + /* if driver is in reset and tables are being cleared + * re-initialize the flow profile list heads + */ + INIT_LIST_HEAD(&hw->fl_profs[blk_idx]); +} + +/** + * ice_free_vsig_tbl - free complete VSIG table entries + * @hw: pointer to the hardware structure + * @blk: the HW block on which to free the VSIG table entries + */ +static void ice_free_vsig_tbl(struct ice_hw *hw, enum ice_block blk) +{ + u16 i; + + if (!hw->blk[blk].xlt2.vsig_tbl) + return; + + for (i = 1; i < ICE_MAX_VSIGS; i++) + if (hw->blk[blk].xlt2.vsig_tbl[i].in_use) + ice_vsig_free(hw, blk, i); +} + +/** + * ice_free_hw_tbls - free hardware table memory + * @hw: pointer to the hardware structure + */ +void ice_free_hw_tbls(struct ice_hw *hw) +{ + struct ice_rss_cfg *r, *rt; + u8 i; + + for (i = 0; i < ICE_BLK_COUNT; i++) { + if (hw->blk[i].is_list_init) { + struct ice_es *es = &hw->blk[i].es; + + ice_free_prof_map(hw, i); + mutex_destroy(&es->prof_map_lock); + + ice_free_flow_profs(hw, i); + mutex_destroy(&hw->fl_profs_locks[i]); + + hw->blk[i].is_list_init = false; + } + ice_free_vsig_tbl(hw, (enum ice_block)i); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt1.ptypes); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt1.ptg_tbl); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt1.t); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt2.t); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt2.vsig_tbl); + 
devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt2.vsis); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].prof.t); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].prof_redir.t); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].es.t); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].es.ref_count); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].es.written); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].es.mask_ena); + } + + list_for_each_entry_safe(r, rt, &hw->rss_list_head, l_entry) { + list_del(&r->l_entry); + devm_kfree(ice_hw_to_dev(hw), r); + } + mutex_destroy(&hw->rss_locks); + ice_shutdown_all_prof_masks(hw); + memset(hw->blk, 0, sizeof(hw->blk)); +} + +/** + * ice_init_flow_profs - init flow profile locks and list heads + * @hw: pointer to the hardware structure + * @blk_idx: HW block index + */ +static void ice_init_flow_profs(struct ice_hw *hw, u8 blk_idx) +{ + mutex_init(&hw->fl_profs_locks[blk_idx]); + INIT_LIST_HEAD(&hw->fl_profs[blk_idx]); +} + +/** + * ice_clear_hw_tbls - clear HW tables and flow profiles + * @hw: pointer to the hardware structure + */ +void ice_clear_hw_tbls(struct ice_hw *hw) +{ + u8 i; + + for (i = 0; i < ICE_BLK_COUNT; i++) { + struct ice_prof_redir *prof_redir = &hw->blk[i].prof_redir; + struct ice_prof_tcam *prof = &hw->blk[i].prof; + struct ice_xlt1 *xlt1 = &hw->blk[i].xlt1; + struct ice_xlt2 *xlt2 = &hw->blk[i].xlt2; + struct ice_es *es = &hw->blk[i].es; + + if (hw->blk[i].is_list_init) { + ice_free_prof_map(hw, i); + ice_free_flow_profs(hw, i); + } + + ice_free_vsig_tbl(hw, (enum ice_block)i); + + memset(xlt1->ptypes, 0, xlt1->count * sizeof(*xlt1->ptypes)); + memset(xlt1->ptg_tbl, 0, + ICE_MAX_PTGS * sizeof(*xlt1->ptg_tbl)); + memset(xlt1->t, 0, xlt1->count * sizeof(*xlt1->t)); + + memset(xlt2->vsis, 0, xlt2->count * sizeof(*xlt2->vsis)); + memset(xlt2->vsig_tbl, 0, + xlt2->count * sizeof(*xlt2->vsig_tbl)); + memset(xlt2->t, 0, xlt2->count * sizeof(*xlt2->t)); + + memset(prof->t, 0, prof->count * sizeof(*prof->t)); + memset(prof_redir->t, 0, + prof_redir->count * sizeof(*prof_redir->t)); + + memset(es->t, 0, es->count * sizeof(*es->t) * es->fvw); + memset(es->ref_count, 0, es->count * sizeof(*es->ref_count)); + memset(es->written, 0, es->count * sizeof(*es->written)); + memset(es->mask_ena, 0, es->count * sizeof(*es->mask_ena)); + } +} + +/** + * ice_init_hw_tbls - init hardware table memory + * @hw: pointer to the hardware structure + */ +enum ice_status ice_init_hw_tbls(struct ice_hw *hw) +{ + u8 i; + + mutex_init(&hw->rss_locks); + INIT_LIST_HEAD(&hw->rss_list_head); + ice_init_all_prof_masks(hw); + for (i = 0; i < ICE_BLK_COUNT; i++) { + struct ice_prof_redir *prof_redir = &hw->blk[i].prof_redir; + struct ice_prof_tcam *prof = &hw->blk[i].prof; + struct ice_xlt1 *xlt1 = &hw->blk[i].xlt1; + struct ice_xlt2 *xlt2 = &hw->blk[i].xlt2; + struct ice_es *es = &hw->blk[i].es; + u16 j; + + if (hw->blk[i].is_list_init) + continue; + + ice_init_flow_profs(hw, i); + mutex_init(&es->prof_map_lock); + INIT_LIST_HEAD(&es->prof_map); + hw->blk[i].is_list_init = true; + + hw->blk[i].overwrite = blk_sizes[i].overwrite; + es->reverse = blk_sizes[i].reverse; + + xlt1->sid = ice_blk_sids[i][ICE_SID_XLT1_OFF]; + xlt1->count = blk_sizes[i].xlt1; + + xlt1->ptypes = devm_kcalloc(ice_hw_to_dev(hw), xlt1->count, + sizeof(*xlt1->ptypes), GFP_KERNEL); + + if (!xlt1->ptypes) + goto err; + + xlt1->ptg_tbl = devm_kcalloc(ice_hw_to_dev(hw), ICE_MAX_PTGS, + sizeof(*xlt1->ptg_tbl), + GFP_KERNEL); + + if (!xlt1->ptg_tbl) + goto err; + + xlt1->t = devm_kcalloc(ice_hw_to_dev(hw), xlt1->count, + sizeof(*xlt1->t), 
GFP_KERNEL); + if (!xlt1->t) + goto err; + + xlt2->sid = ice_blk_sids[i][ICE_SID_XLT2_OFF]; + xlt2->count = blk_sizes[i].xlt2; + + xlt2->vsis = devm_kcalloc(ice_hw_to_dev(hw), xlt2->count, + sizeof(*xlt2->vsis), GFP_KERNEL); + + if (!xlt2->vsis) + goto err; + + xlt2->vsig_tbl = devm_kcalloc(ice_hw_to_dev(hw), xlt2->count, + sizeof(*xlt2->vsig_tbl), + GFP_KERNEL); + if (!xlt2->vsig_tbl) + goto err; + + for (j = 0; j < xlt2->count; j++) + INIT_LIST_HEAD(&xlt2->vsig_tbl[j].prop_lst); + + xlt2->t = devm_kcalloc(ice_hw_to_dev(hw), xlt2->count, + sizeof(*xlt2->t), GFP_KERNEL); + if (!xlt2->t) + goto err; + + prof->sid = ice_blk_sids[i][ICE_SID_PR_OFF]; + prof->count = blk_sizes[i].prof_tcam; + prof->max_prof_id = blk_sizes[i].prof_id; + prof->cdid_bits = blk_sizes[i].prof_cdid_bits; + prof->t = devm_kcalloc(ice_hw_to_dev(hw), prof->count, + sizeof(*prof->t), GFP_KERNEL); + + if (!prof->t) + goto err; + + prof_redir->sid = ice_blk_sids[i][ICE_SID_PR_REDIR_OFF]; + prof_redir->count = blk_sizes[i].prof_redir; + prof_redir->t = devm_kcalloc(ice_hw_to_dev(hw), + prof_redir->count, + sizeof(*prof_redir->t), + GFP_KERNEL); + + if (!prof_redir->t) + goto err; + + es->sid = ice_blk_sids[i][ICE_SID_ES_OFF]; + es->count = blk_sizes[i].es; + es->fvw = blk_sizes[i].fvw; + es->t = devm_kcalloc(ice_hw_to_dev(hw), + (u32)(es->count * es->fvw), + sizeof(*es->t), GFP_KERNEL); + if (!es->t) + goto err; + + es->ref_count = devm_kcalloc(ice_hw_to_dev(hw), es->count, + sizeof(*es->ref_count), + GFP_KERNEL); + + if (!es->ref_count) + goto err; + + es->written = devm_kcalloc(ice_hw_to_dev(hw), es->count, + sizeof(*es->written), GFP_KERNEL); + + if (!es->written) + goto err; + + es->mask_ena = devm_kcalloc(ice_hw_to_dev(hw), es->count, + sizeof(*es->mask_ena), GFP_KERNEL); + + if (!es->mask_ena) + goto err; + } + return 0; + +err: + ice_free_hw_tbls(hw); + return ICE_ERR_NO_MEMORY; +} + +/** + * ice_prof_gen_key - generate profile ID key + * @hw: pointer to the HW struct + * @blk: the block in which to write profile ID to + * @ptg: packet type group (PTG) portion of key + * @vsig: VSIG portion of key + * @cdid: CDID portion of key + * @flags: flag portion of key + * @vl_msk: valid mask + * @dc_msk: don't care mask + * @nm_msk: never match mask + * @key: output of profile ID key + */ +static enum ice_status +ice_prof_gen_key(struct ice_hw *hw, enum ice_block blk, u8 ptg, u16 vsig, + u8 cdid, u16 flags, u8 vl_msk[ICE_TCAM_KEY_VAL_SZ], + u8 dc_msk[ICE_TCAM_KEY_VAL_SZ], u8 nm_msk[ICE_TCAM_KEY_VAL_SZ], + u8 key[ICE_TCAM_KEY_SZ]) +{ + struct ice_prof_id_key inkey; + + inkey.xlt1 = ptg; + inkey.xlt2_cdid = cpu_to_le16(vsig); + inkey.flags = cpu_to_le16(flags); + + switch (hw->blk[blk].prof.cdid_bits) { + case 0: + break; + case 2: +#define ICE_CD_2_M 0xC000U +#define ICE_CD_2_S 14 + inkey.xlt2_cdid &= ~cpu_to_le16(ICE_CD_2_M); + inkey.xlt2_cdid |= cpu_to_le16(BIT(cdid) << ICE_CD_2_S); + break; + case 4: +#define ICE_CD_4_M 0xF000U +#define ICE_CD_4_S 12 + inkey.xlt2_cdid &= ~cpu_to_le16(ICE_CD_4_M); + inkey.xlt2_cdid |= cpu_to_le16(BIT(cdid) << ICE_CD_4_S); + break; + case 8: +#define ICE_CD_8_M 0xFF00U +#define ICE_CD_8_S 16 + inkey.xlt2_cdid &= ~cpu_to_le16(ICE_CD_8_M); + inkey.xlt2_cdid |= cpu_to_le16(BIT(cdid) << ICE_CD_8_S); + break; + default: + ice_debug(hw, ICE_DBG_PKG, "Error in profile config\n"); + break; + } + + return ice_set_key(key, ICE_TCAM_KEY_SZ, (u8 *)&inkey, vl_msk, dc_msk, + nm_msk, 0, ICE_TCAM_KEY_SZ / 2); +} + +/** + * ice_tcam_write_entry - write TCAM entry + * @hw: pointer to the HW struct + * @blk: 
the block in which to write profile ID to + * @idx: the entry index to write to + * @prof_id: profile ID + * @ptg: packet type group (PTG) portion of key + * @vsig: VSIG portion of key + * @cdid: CDID portion of key + * @flags: flag portion of key + * @vl_msk: valid mask + * @dc_msk: don't care mask + * @nm_msk: never match mask + */ +static enum ice_status +ice_tcam_write_entry(struct ice_hw *hw, enum ice_block blk, u16 idx, + u8 prof_id, u8 ptg, u16 vsig, u8 cdid, u16 flags, + u8 vl_msk[ICE_TCAM_KEY_VAL_SZ], + u8 dc_msk[ICE_TCAM_KEY_VAL_SZ], + u8 nm_msk[ICE_TCAM_KEY_VAL_SZ]) +{ + struct ice_prof_tcam_entry; + enum ice_status status; + + status = ice_prof_gen_key(hw, blk, ptg, vsig, cdid, flags, vl_msk, + dc_msk, nm_msk, hw->blk[blk].prof.t[idx].key); + if (!status) { + hw->blk[blk].prof.t[idx].addr = cpu_to_le16(idx); + hw->blk[blk].prof.t[idx].prof_id = prof_id; + } + + return status; +} + +/** + * ice_vsig_get_ref - returns number of VSIs belong to a VSIG + * @hw: pointer to the hardware structure + * @blk: HW block + * @vsig: VSIG to query + * @refs: pointer to variable to receive the reference count + */ +static enum ice_status +ice_vsig_get_ref(struct ice_hw *hw, enum ice_block blk, u16 vsig, u16 *refs) +{ + u16 idx = vsig & ICE_VSIG_IDX_M; + struct ice_vsig_vsi *ptr; + + *refs = 0; + + if (!hw->blk[blk].xlt2.vsig_tbl[idx].in_use) + return ICE_ERR_DOES_NOT_EXIST; + + ptr = hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi; + while (ptr) { + (*refs)++; + ptr = ptr->next_vsi; + } + + return 0; +} + +/** + * ice_has_prof_vsig - check to see if VSIG has a specific profile + * @hw: pointer to the hardware structure + * @blk: HW block + * @vsig: VSIG to check against + * @hdl: profile handle + */ +static bool +ice_has_prof_vsig(struct ice_hw *hw, enum ice_block blk, u16 vsig, u64 hdl) +{ + u16 idx = vsig & ICE_VSIG_IDX_M; + struct ice_vsig_prof *ent; + + list_for_each_entry(ent, &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) + if (ent->profile_cookie == hdl) + return true; + + ice_debug(hw, ICE_DBG_INIT, "Characteristic list for VSI group %d not found.\n", + vsig); + return false; +} + +/** + * ice_prof_bld_es - build profile ID extraction sequence changes + * @hw: pointer to the HW struct + * @blk: hardware block + * @bld: the update package buffer build to add to + * @chgs: the list of changes to make in hardware + */ +static enum ice_status +ice_prof_bld_es(struct ice_hw *hw, enum ice_block blk, + struct ice_buf_build *bld, struct list_head *chgs) +{ + u16 vec_size = hw->blk[blk].es.fvw * sizeof(struct ice_fv_word); + struct ice_chs_chg *tmp; + + list_for_each_entry(tmp, chgs, list_entry) + if (tmp->type == ICE_PTG_ES_ADD && tmp->add_prof) { + u16 off = tmp->prof_id * hw->blk[blk].es.fvw; + struct ice_pkg_es *p; + u32 id; + + id = ice_sect_id(blk, ICE_VEC_TBL); + p = ice_pkg_buf_alloc_section(bld, id, + struct_size(p, es, 1) + vec_size - sizeof(p->es[0])); + + if (!p) + return ICE_ERR_MAX_LIMIT; + + p->count = cpu_to_le16(1); + p->offset = cpu_to_le16(tmp->prof_id); + + memcpy(p->es, &hw->blk[blk].es.t[off], vec_size); + } + + return 0; +} + +/** + * ice_prof_bld_tcam - build profile ID TCAM changes + * @hw: pointer to the HW struct + * @blk: hardware block + * @bld: the update package buffer build to add to + * @chgs: the list of changes to make in hardware + */ +static enum ice_status +ice_prof_bld_tcam(struct ice_hw *hw, enum ice_block blk, + struct ice_buf_build *bld, struct list_head *chgs) +{ + struct ice_chs_chg *tmp; + + list_for_each_entry(tmp, chgs, list_entry) + if (tmp->type == 
ICE_TCAM_ADD && tmp->add_tcam_idx) { + struct ice_prof_id_section *p; + u32 id; + + id = ice_sect_id(blk, ICE_PROF_TCAM); + p = ice_pkg_buf_alloc_section(bld, id, + struct_size(p, entry, 1)); + + if (!p) + return ICE_ERR_MAX_LIMIT; + + p->count = cpu_to_le16(1); + p->entry[0].addr = cpu_to_le16(tmp->tcam_idx); + p->entry[0].prof_id = tmp->prof_id; + + memcpy(p->entry[0].key, + &hw->blk[blk].prof.t[tmp->tcam_idx].key, + sizeof(hw->blk[blk].prof.t->key)); + } + + return 0; +} + +/** + * ice_prof_bld_xlt1 - build XLT1 changes + * @blk: hardware block + * @bld: the update package buffer build to add to + * @chgs: the list of changes to make in hardware + */ +static enum ice_status +ice_prof_bld_xlt1(enum ice_block blk, struct ice_buf_build *bld, + struct list_head *chgs) +{ + struct ice_chs_chg *tmp; + + list_for_each_entry(tmp, chgs, list_entry) + if (tmp->type == ICE_PTG_ES_ADD && tmp->add_ptg) { + struct ice_xlt1_section *p; + u32 id; + + id = ice_sect_id(blk, ICE_XLT1); + p = ice_pkg_buf_alloc_section(bld, id, + struct_size(p, value, 1)); + + if (!p) + return ICE_ERR_MAX_LIMIT; + + p->count = cpu_to_le16(1); + p->offset = cpu_to_le16(tmp->ptype); + p->value[0] = tmp->ptg; + } + + return 0; +} + +/** + * ice_prof_bld_xlt2 - build XLT2 changes + * @blk: hardware block + * @bld: the update package buffer build to add to + * @chgs: the list of changes to make in hardware + */ +static enum ice_status +ice_prof_bld_xlt2(enum ice_block blk, struct ice_buf_build *bld, + struct list_head *chgs) +{ + struct ice_chs_chg *tmp; + + list_for_each_entry(tmp, chgs, list_entry) { + struct ice_xlt2_section *p; + u32 id; + + switch (tmp->type) { + case ICE_VSIG_ADD: + case ICE_VSI_MOVE: + case ICE_VSIG_REM: + id = ice_sect_id(blk, ICE_XLT2); + p = ice_pkg_buf_alloc_section(bld, id, + struct_size(p, value, 1)); + + if (!p) + return ICE_ERR_MAX_LIMIT; + + p->count = cpu_to_le16(1); + p->offset = cpu_to_le16(tmp->vsi); + p->value[0] = cpu_to_le16(tmp->vsig); + break; + default: break; - case ICE_SID_PROFID_TCAM_SW: - case ICE_SID_PROFID_TCAM_FD: - case ICE_SID_PROFID_TCAM_RSS: - case ICE_SID_PROFID_TCAM_ACL: - case ICE_SID_PROFID_TCAM_PE: - pid = (struct ice_prof_id_section *)sect; - src = (u8 *)pid->entry; - sect_len = le16_to_cpu(pid->count) * - sizeof(*hw->blk[block_id].prof.t); - dst = (u8 *)hw->blk[block_id].prof.t; - dst_len = hw->blk[block_id].prof.count * - sizeof(*hw->blk[block_id].prof.t); + } + } + + return 0; +} + +/** + * ice_upd_prof_hw - update hardware using the change list + * @hw: pointer to the HW struct + * @blk: hardware block + * @chgs: the list of changes to make in hardware + */ +static enum ice_status +ice_upd_prof_hw(struct ice_hw *hw, enum ice_block blk, + struct list_head *chgs) +{ + struct ice_buf_build *b; + struct ice_chs_chg *tmp; + enum ice_status status; + u16 pkg_sects; + u16 xlt1 = 0; + u16 xlt2 = 0; + u16 tcam = 0; + u16 es = 0; + u16 sects; + + /* count number of sections we need */ + list_for_each_entry(tmp, chgs, list_entry) { + switch (tmp->type) { + case ICE_PTG_ES_ADD: + if (tmp->add_ptg) + xlt1++; + if (tmp->add_prof) + es++; break; - case ICE_SID_PROFID_REDIR_SW: - case ICE_SID_PROFID_REDIR_FD: - case ICE_SID_PROFID_REDIR_RSS: - case ICE_SID_PROFID_REDIR_ACL: - case ICE_SID_PROFID_REDIR_PE: - pr = (struct ice_prof_redir_section *)sect; - src = pr->redir_value; - sect_len = le16_to_cpu(pr->count) * - sizeof(*hw->blk[block_id].prof_redir.t); - dst = hw->blk[block_id].prof_redir.t; - dst_len = hw->blk[block_id].prof_redir.count * - 
sizeof(*hw->blk[block_id].prof_redir.t); + case ICE_TCAM_ADD: + tcam++; break; - case ICE_SID_FLD_VEC_SW: - case ICE_SID_FLD_VEC_FD: - case ICE_SID_FLD_VEC_RSS: - case ICE_SID_FLD_VEC_ACL: - case ICE_SID_FLD_VEC_PE: - es = (struct ice_sw_fv_section *)sect; - src = (u8 *)es->fv; - sect_len = (u32)(le16_to_cpu(es->count) * - hw->blk[block_id].es.fvw) * - sizeof(*hw->blk[block_id].es.t); - dst = (u8 *)hw->blk[block_id].es.t; - dst_len = (u32)(hw->blk[block_id].es.count * - hw->blk[block_id].es.fvw) * - sizeof(*hw->blk[block_id].es.t); + case ICE_VSIG_ADD: + case ICE_VSI_MOVE: + case ICE_VSIG_REM: + xlt2++; break; default: - return; + break; + } + } + sects = xlt1 + xlt2 + tcam + es; + + if (!sects) + return 0; + + /* Build update package buffer */ + b = ice_pkg_buf_alloc(hw); + if (!b) + return ICE_ERR_NO_MEMORY; + + status = ice_pkg_buf_reserve_section(b, sects); + if (status) + goto error_tmp; + + /* Preserve order of table update: ES, TCAM, PTG, VSIG */ + if (es) { + status = ice_prof_bld_es(hw, blk, b, chgs); + if (status) + goto error_tmp; + } + + if (tcam) { + status = ice_prof_bld_tcam(hw, blk, b, chgs); + if (status) + goto error_tmp; + } + + if (xlt1) { + status = ice_prof_bld_xlt1(blk, b, chgs); + if (status) + goto error_tmp; + } + + if (xlt2) { + status = ice_prof_bld_xlt2(blk, b, chgs); + if (status) + goto error_tmp; + } + + /* After package buffer build check if the section count in buffer is + * non-zero and matches the number of sections detected for package + * update. + */ + pkg_sects = ice_pkg_buf_get_active_sections(b); + if (!pkg_sects || pkg_sects != sects) { + status = ICE_ERR_INVAL_SIZE; + goto error_tmp; + } + + /* update package */ + status = ice_update_pkg(hw, ice_pkg_buf(b), 1); + if (status == ICE_ERR_AQ_ERROR) + ice_debug(hw, ICE_DBG_INIT, "Unable to update HW profile\n"); + +error_tmp: + ice_pkg_buf_free(hw, b); + return status; +} + +/** + * ice_update_fd_mask - set Flow Director Field Vector mask for a profile + * @hw: pointer to the HW struct + * @prof_id: profile ID + * @mask_sel: mask select + * + * This function enable any of the masks selected by the mask select parameter + * for the profile specified. 
+ */ +static void ice_update_fd_mask(struct ice_hw *hw, u16 prof_id, u32 mask_sel) +{ + wr32(hw, GLQF_FDMASK_SEL(prof_id), mask_sel); + + ice_debug(hw, ICE_DBG_INIT, "fd mask(%d): %x = %x\n", prof_id, + GLQF_FDMASK_SEL(prof_id), mask_sel); +} + +struct ice_fd_src_dst_pair { + u8 prot_id; + u8 count; + u16 off; +}; + +static const struct ice_fd_src_dst_pair ice_fd_pairs[] = { + /* These are defined in pairs */ + { ICE_PROT_IPV4_OF_OR_S, 2, 12 }, + { ICE_PROT_IPV4_OF_OR_S, 2, 16 }, + + { ICE_PROT_IPV4_IL, 2, 12 }, + { ICE_PROT_IPV4_IL, 2, 16 }, + + { ICE_PROT_IPV6_OF_OR_S, 8, 8 }, + { ICE_PROT_IPV6_OF_OR_S, 8, 24 }, + + { ICE_PROT_IPV6_IL, 8, 8 }, + { ICE_PROT_IPV6_IL, 8, 24 }, + + { ICE_PROT_TCP_IL, 1, 0 }, + { ICE_PROT_TCP_IL, 1, 2 }, + + { ICE_PROT_UDP_OF, 1, 0 }, + { ICE_PROT_UDP_OF, 1, 2 }, + + { ICE_PROT_UDP_IL_OR_S, 1, 0 }, + { ICE_PROT_UDP_IL_OR_S, 1, 2 }, + + { ICE_PROT_SCTP_IL, 1, 0 }, + { ICE_PROT_SCTP_IL, 1, 2 } +}; + +#define ICE_FD_SRC_DST_PAIR_COUNT ARRAY_SIZE(ice_fd_pairs) + +/** + * ice_update_fd_swap - set register appropriately for a FD FV extraction + * @hw: pointer to the HW struct + * @prof_id: profile ID + * @es: extraction sequence (length of array is determined by the block) + */ +static enum ice_status +ice_update_fd_swap(struct ice_hw *hw, u16 prof_id, struct ice_fv_word *es) +{ + DECLARE_BITMAP(pair_list, ICE_FD_SRC_DST_PAIR_COUNT); + u8 pair_start[ICE_FD_SRC_DST_PAIR_COUNT] = { 0 }; +#define ICE_FD_FV_NOT_FOUND (-2) + s8 first_free = ICE_FD_FV_NOT_FOUND; + u8 used[ICE_MAX_FV_WORDS] = { 0 }; + s8 orig_free, si; + u32 mask_sel = 0; + u8 i, j, k; + + bitmap_zero(pair_list, ICE_FD_SRC_DST_PAIR_COUNT); + + /* This code assumes that the Flow Director field vectors are assigned + * from the end of the FV indexes working towards the zero index, that + * only complete fields will be included and will be consecutive, and + * that there are no gaps between valid indexes. + */ + + /* Determine swap fields present */ + for (i = 0; i < hw->blk[ICE_BLK_FD].es.fvw; i++) { + /* Find the first free entry, assuming right to left population. + * This is where we can start adding additional pairs if needed. 
+ */ + if (first_free == ICE_FD_FV_NOT_FOUND && es[i].prot_id != + ICE_PROT_INVALID) + first_free = i - 1; + + for (j = 0; j < ICE_FD_SRC_DST_PAIR_COUNT; j++) + if (es[i].prot_id == ice_fd_pairs[j].prot_id && + es[i].off == ice_fd_pairs[j].off) { + set_bit(j, pair_list); + pair_start[j] = i; + } + } + + orig_free = first_free; + + /* determine missing swap fields that need to be added */ + for (i = 0; i < ICE_FD_SRC_DST_PAIR_COUNT; i += 2) { + u8 bit1 = test_bit(i + 1, pair_list); + u8 bit0 = test_bit(i, pair_list); + + if (bit0 ^ bit1) { + u8 index; + + /* add the appropriate 'paired' entry */ + if (!bit0) + index = i; + else + index = i + 1; + + /* check for room */ + if (first_free + 1 < (s8)ice_fd_pairs[index].count) + return ICE_ERR_MAX_LIMIT; + + /* place in extraction sequence */ + for (k = 0; k < ice_fd_pairs[index].count; k++) { + es[first_free - k].prot_id = + ice_fd_pairs[index].prot_id; + es[first_free - k].off = + ice_fd_pairs[index].off + (k * 2); + + if (k > first_free) + return ICE_ERR_OUT_OF_RANGE; + + /* keep track of non-relevant fields */ + mask_sel |= BIT(first_free - k); + } + + pair_start[index] = first_free; + first_free -= ice_fd_pairs[index].count; + } + } + + /* fill in the swap array */ + si = hw->blk[ICE_BLK_FD].es.fvw - 1; + while (si >= 0) { + u8 indexes_used = 1; + + /* assume flat at this index */ +#define ICE_SWAP_VALID 0x80 + used[si] = si | ICE_SWAP_VALID; + + if (orig_free == ICE_FD_FV_NOT_FOUND || si <= orig_free) { + si -= indexes_used; + continue; + } + + /* check for a swap location */ + for (j = 0; j < ICE_FD_SRC_DST_PAIR_COUNT; j++) + if (es[si].prot_id == ice_fd_pairs[j].prot_id && + es[si].off == ice_fd_pairs[j].off) { + u8 idx; + + /* determine the appropriate matching field */ + idx = j + ((j % 2) ? -1 : 1); + + indexes_used = ice_fd_pairs[idx].count; + for (k = 0; k < indexes_used; k++) { + used[si - k] = (pair_start[idx] - k) | + ICE_SWAP_VALID; + } + + break; + } + + si -= indexes_used; + } + + /* for each set of 4 swap and 4 inset indexes, write the appropriate + * register + */ + for (j = 0; j < hw->blk[ICE_BLK_FD].es.fvw / 4; j++) { + u32 raw_swap = 0; + u32 raw_in = 0; + + for (k = 0; k < 4; k++) { + u8 idx; + + idx = (j * 4) + k; + if (used[idx] && !(mask_sel & BIT(idx))) { + raw_swap |= used[idx] << (k * BITS_PER_BYTE); +#define ICE_INSET_DFLT 0x9f + raw_in |= ICE_INSET_DFLT << (k * BITS_PER_BYTE); + } + } + + /* write the appropriate swap register set */ + wr32(hw, GLQF_FDSWAP(prof_id, j), raw_swap); + + ice_debug(hw, ICE_DBG_INIT, "swap wr(%d, %d): %x = %08x\n", + prof_id, j, GLQF_FDSWAP(prof_id, j), raw_swap); + + /* write the appropriate inset register set */ + wr32(hw, GLQF_FDINSET(prof_id, j), raw_in); + + ice_debug(hw, ICE_DBG_INIT, "inset wr(%d, %d): %x = %08x\n", + prof_id, j, GLQF_FDINSET(prof_id, j), raw_in); + } + + /* initially clear the mask select for this profile */ + ice_update_fd_mask(hw, prof_id, 0); + + return 0; +} + +/* The entries here needs to match the order of enum ice_ptype_attrib */ +static const struct ice_ptype_attrib_info ice_ptype_attributes[] = { + { ICE_GTP_PDU_EH, ICE_GTP_PDU_FLAG_MASK }, + { ICE_GTP_SESSION, ICE_GTP_FLAGS_MASK }, + { ICE_GTP_DOWNLINK, ICE_GTP_FLAGS_MASK }, + { ICE_GTP_UPLINK, ICE_GTP_FLAGS_MASK }, +}; + +/** + * ice_get_ptype_attrib_info - get ptype attribute information + * @type: attribute type + * @info: pointer to variable to the attribute information + */ +static void +ice_get_ptype_attrib_info(enum ice_ptype_attrib_type type, + struct ice_ptype_attrib_info *info) +{ + *info = 
ice_ptype_attributes[type]; +} + +/** + * ice_add_prof_attrib - add any PTG with attributes to profile + * @prof: pointer to the profile to which PTG entries will be added + * @ptg: PTG to be added + * @ptype: PTYPE that needs to be looked up + * @attr: array of attributes that will be considered + * @attr_cnt: number of elements in the attribute array + */ +static enum ice_status +ice_add_prof_attrib(struct ice_prof_map *prof, u8 ptg, u16 ptype, + const struct ice_ptype_attributes *attr, u16 attr_cnt) +{ + bool found = false; + u16 i; + + for (i = 0; i < attr_cnt; i++) { + if (attr[i].ptype == ptype) { + found = true; + + prof->ptg[prof->ptg_cnt] = ptg; + ice_get_ptype_attrib_info(attr[i].attrib, + &prof->attr[prof->ptg_cnt]); + + if (++prof->ptg_cnt >= ICE_MAX_PTG_PER_PROFILE) + return ICE_ERR_MAX_LIMIT; + } + } + + if (!found) + return ICE_ERR_DOES_NOT_EXIST; + + return 0; +} + +/** + * ice_add_prof - add profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @id: profile tracking ID + * @ptypes: array of bitmaps indicating ptypes (ICE_FLOW_PTYPE_MAX bits) + * @attr: array of attributes + * @attr_cnt: number of elements in attrib array + * @es: extraction sequence (length of array is determined by the block) + * @masks: mask for extraction sequence + * + * This function registers a profile, which matches a set of PTYPES with a + * particular extraction sequence. While the hardware profile is allocated + * it will not be written until the first call to ice_add_flow that specifies + * the ID value used here. + */ +enum ice_status +ice_add_prof(struct ice_hw *hw, enum ice_block blk, u64 id, u8 ptypes[], + const struct ice_ptype_attributes *attr, u16 attr_cnt, + struct ice_fv_word *es, u16 *masks) +{ + u32 bytes = DIV_ROUND_UP(ICE_FLOW_PTYPE_MAX, BITS_PER_BYTE); + DECLARE_BITMAP(ptgs_used, ICE_XLT1_CNT); + struct ice_prof_map *prof; + enum ice_status status; + u8 byte = 0; + u8 prof_id; + + bitmap_zero(ptgs_used, ICE_XLT1_CNT); + + mutex_lock(&hw->blk[blk].es.prof_map_lock); + + /* search for existing profile */ + status = ice_find_prof_id_with_mask(hw, blk, es, masks, &prof_id); + if (status) { + /* allocate profile ID */ + status = ice_alloc_prof_id(hw, blk, &prof_id); + if (status) + goto err_ice_add_prof; + if (blk == ICE_BLK_FD) { + /* For Flow Director block, the extraction sequence may + * need to be altered in the case where there are paired + * fields that have no match. This is necessary because + * for Flow Director, src and dest fields need to paired + * for filter programming and these values are swapped + * during Tx. 
+ */ + status = ice_update_fd_swap(hw, prof_id, es); + if (status) + goto err_ice_add_prof; + } + status = ice_update_prof_masking(hw, blk, prof_id, masks); + if (status) + goto err_ice_add_prof; + + /* and write new es */ + ice_write_es(hw, blk, prof_id, es); + } + + ice_prof_inc_ref(hw, blk, prof_id); + + /* add profile info */ + + prof = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*prof), GFP_KERNEL); + if (!prof) + goto err_ice_add_prof; + + prof->profile_cookie = id; + prof->prof_id = prof_id; + prof->ptg_cnt = 0; + prof->context = 0; + + /* build list of ptgs */ + while (bytes && prof->ptg_cnt < ICE_MAX_PTG_PER_PROFILE) { + u8 bit; + + if (!ptypes[byte]) { + bytes--; + byte++; + continue; + } + + /* Examine 8 bits per byte */ + for_each_set_bit(bit, (unsigned long *)&ptypes[byte], + BITS_PER_BYTE) { + u16 ptype; + u8 ptg; + + ptype = byte * BITS_PER_BYTE + bit; + + /* The package should place all ptypes in a non-zero + * PTG, so the following call should never fail. + */ + if (ice_ptg_find_ptype(hw, blk, ptype, &ptg)) + continue; + + /* If PTG is already added, skip and continue */ + if (test_bit(ptg, ptgs_used)) + continue; + + set_bit(ptg, ptgs_used); + /* Check to see there are any attributes for this + * ptype, and add them if found. + */ + status = ice_add_prof_attrib(prof, ptg, ptype, attr, + attr_cnt); + if (status == ICE_ERR_MAX_LIMIT) + break; + if (status) { + /* This is simple a ptype/PTG with no + * attribute + */ + prof->ptg[prof->ptg_cnt] = ptg; + prof->attr[prof->ptg_cnt].flags = 0; + prof->attr[prof->ptg_cnt].mask = 0; + + if (++prof->ptg_cnt >= ICE_MAX_PTG_PER_PROFILE) + break; + } + } + + bytes--; + byte++; + } + + list_add(&prof->list, &hw->blk[blk].es.prof_map); + status = 0; + +err_ice_add_prof: + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + return status; +} + +/** + * ice_search_prof_id - Search for a profile tracking ID + * @hw: pointer to the HW struct + * @blk: hardware block + * @id: profile tracking ID + * + * This will search for a profile tracking ID which was previously added. + * The profile map lock should be held before calling this function. 
+ */ +struct ice_prof_map * +ice_search_prof_id(struct ice_hw *hw, enum ice_block blk, u64 id) +{ + struct ice_prof_map *entry = NULL; + struct ice_prof_map *map; + + list_for_each_entry(map, &hw->blk[blk].es.prof_map, list) + if (map->profile_cookie == id) { + entry = map; + break; + } + + return entry; +} + +/** + * ice_set_prof_context - Set context for a given profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @id: profile tracking ID + * @cntxt: context + */ +enum ice_status +ice_set_prof_context(struct ice_hw *hw, enum ice_block blk, u64 id, u64 cntxt) +{ + enum ice_status status = ICE_ERR_DOES_NOT_EXIST; + struct ice_prof_map *entry; + + mutex_lock(&hw->blk[blk].es.prof_map_lock); + entry = ice_search_prof_id(hw, blk, id); + if (entry) { + entry->context = cntxt; + status = 0; + } + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + return status; +} + +/** + * ice_get_prof_context - Get context for a given profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @id: profile tracking ID + * @cntxt: pointer to variable to receive the context + */ +enum ice_status +ice_get_prof_context(struct ice_hw *hw, enum ice_block blk, u64 id, u64 *cntxt) +{ + enum ice_status status = ICE_ERR_DOES_NOT_EXIST; + struct ice_prof_map *entry; + + mutex_lock(&hw->blk[blk].es.prof_map_lock); + entry = ice_search_prof_id(hw, blk, id); + if (entry) { + *cntxt = entry->context; + status = 0; + } + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + return status; +} + +/** + * ice_vsig_prof_id_count - count profiles in a VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsig: VSIG to remove the profile from + */ +static u16 +ice_vsig_prof_id_count(struct ice_hw *hw, enum ice_block blk, u16 vsig) +{ + u16 idx = vsig & ICE_VSIG_IDX_M, count = 0; + struct ice_vsig_prof *p; + + list_for_each_entry(p, &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) + count++; + + return count; +} + +/** + * ice_rel_tcam_idx - release a TCAM index + * @hw: pointer to the HW struct + * @blk: hardware block + * @idx: the index to release + */ +static enum ice_status +ice_rel_tcam_idx(struct ice_hw *hw, enum ice_block blk, u16 idx) +{ + /* Masks to invoke a never match entry */ + u8 vl_msk[ICE_TCAM_KEY_VAL_SZ] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; + u8 dc_msk[ICE_TCAM_KEY_VAL_SZ] = { 0xFE, 0xFF, 0xFF, 0xFF, 0xFF }; + u8 nm_msk[ICE_TCAM_KEY_VAL_SZ] = { 0x01, 0x00, 0x00, 0x00, 0x00 }; + enum ice_status status; + + /* write the TCAM entry */ + status = ice_tcam_write_entry(hw, blk, idx, 0, 0, 0, 0, 0, vl_msk, + dc_msk, nm_msk); + if (status) + return status; + + /* release the TCAM entry */ + status = ice_free_tcam_ent(hw, blk, idx); + + return status; +} + +/** + * ice_rem_prof_id - remove one profile from a VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @prof: pointer to profile structure to remove + */ +static enum ice_status +ice_rem_prof_id(struct ice_hw *hw, enum ice_block blk, + struct ice_vsig_prof *prof) +{ + enum ice_status status; + u16 i; + + for (i = 0; i < prof->tcam_count; i++) + if (prof->tcam[i].in_use) { + prof->tcam[i].in_use = false; + status = ice_rel_tcam_idx(hw, blk, + prof->tcam[i].tcam_idx); + if (status) + return ICE_ERR_HW_TABLE; } - /* if the section offset exceeds destination length, terminate - * table fill. 
+ return 0; +} + +/** + * ice_rem_vsig - remove VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsig: the VSIG to remove + * @chg: the change list + */ +static enum ice_status +ice_rem_vsig(struct ice_hw *hw, enum ice_block blk, u16 vsig, + struct list_head *chg) +{ + u16 idx = vsig & ICE_VSIG_IDX_M; + struct ice_vsig_vsi *vsi_cur; + struct ice_vsig_prof *d, *t; + enum ice_status status; + + /* remove TCAM entries */ + list_for_each_entry_safe(d, t, + &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) { + status = ice_rem_prof_id(hw, blk, d); + if (status) + return status; + + list_del(&d->list); + devm_kfree(ice_hw_to_dev(hw), d); + } + + /* Move all VSIS associated with this VSIG to the default VSIG */ + vsi_cur = hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi; + /* If the VSIG has at least 1 VSI then iterate through the list + * and remove the VSIs before deleting the group. + */ + if (vsi_cur) + do { + struct ice_vsig_vsi *tmp = vsi_cur->next_vsi; + struct ice_chs_chg *p; + + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), + GFP_KERNEL); + if (!p) + return ICE_ERR_NO_MEMORY; + + p->type = ICE_VSIG_REM; + p->orig_vsig = vsig; + p->vsig = ICE_DEFAULT_VSIG; + p->vsi = vsi_cur - hw->blk[blk].xlt2.vsis; + + list_add(&p->list_entry, chg); + + vsi_cur = tmp; + } while (vsi_cur); + + return ice_vsig_free(hw, blk, vsig); +} + +/** + * ice_rem_prof_id_vsig - remove a specific profile from a VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsig: VSIG to remove the profile from + * @hdl: profile handle indicating which profile to remove + * @chg: list to receive a record of changes + */ +static enum ice_status +ice_rem_prof_id_vsig(struct ice_hw *hw, enum ice_block blk, u16 vsig, u64 hdl, + struct list_head *chg) +{ + u16 idx = vsig & ICE_VSIG_IDX_M; + struct ice_vsig_prof *p, *t; + enum ice_status status; + + list_for_each_entry_safe(p, t, + &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) + if (p->profile_cookie == hdl) { + if (ice_vsig_prof_id_count(hw, blk, vsig) == 1) + /* this is the last profile, remove the VSIG */ + return ice_rem_vsig(hw, blk, vsig, chg); + + status = ice_rem_prof_id(hw, blk, p); + if (!status) { + list_del(&p->list); + devm_kfree(ice_hw_to_dev(hw), p); + } + return status; + } + + return ICE_ERR_DOES_NOT_EXIST; +} + +/** + * ice_rem_flow_all - remove all flows with a particular profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @id: profile tracking ID + */ +static enum ice_status +ice_rem_flow_all(struct ice_hw *hw, enum ice_block blk, u64 id) +{ + struct ice_chs_chg *del, *tmp; + enum ice_status status; + struct list_head chg; + u16 i; + + INIT_LIST_HEAD(&chg); + + for (i = 1; i < ICE_MAX_VSIGS; i++) + if (hw->blk[blk].xlt2.vsig_tbl[i].in_use) { + if (ice_has_prof_vsig(hw, blk, i, id)) { + status = ice_rem_prof_id_vsig(hw, blk, i, id, + &chg); + if (status) + goto err_ice_rem_flow_all; + } + } + + status = ice_upd_prof_hw(hw, blk, &chg); + +err_ice_rem_flow_all: + list_for_each_entry_safe(del, tmp, &chg, list_entry) { + list_del(&del->list_entry); + devm_kfree(ice_hw_to_dev(hw), del); + } + + return status; +} + +/** + * ice_rem_prof - remove profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @id: profile tracking ID + * + * This will remove the profile specified by the ID parameter, which was + * previously created through ice_add_prof. If any existing entries + * are associated with this profile, they will be removed as well. 
+ */ +enum ice_status ice_rem_prof(struct ice_hw *hw, enum ice_block blk, u64 id) +{ + struct ice_prof_map *pmap; + enum ice_status status; + + mutex_lock(&hw->blk[blk].es.prof_map_lock); + + pmap = ice_search_prof_id(hw, blk, id); + if (!pmap) { + status = ICE_ERR_DOES_NOT_EXIST; + goto err_ice_rem_prof; + } + + /* remove all flows with this profile */ + status = ice_rem_flow_all(hw, blk, pmap->profile_cookie); + if (status) + goto err_ice_rem_prof; + + /* dereference profile, and possibly remove */ + ice_prof_dec_ref(hw, blk, pmap->prof_id); + + list_del(&pmap->list); + devm_kfree(ice_hw_to_dev(hw), pmap); + +err_ice_rem_prof: + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + return status; +} + +/** + * ice_get_prof - get profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @hdl: profile handle + * @chg: change list + */ +static enum ice_status +ice_get_prof(struct ice_hw *hw, enum ice_block blk, u64 hdl, + struct list_head *chg) +{ + enum ice_status status = 0; + struct ice_prof_map *map; + struct ice_chs_chg *p; + u16 i; + + mutex_lock(&hw->blk[blk].es.prof_map_lock); + /* Get the details on the profile specified by the handle ID */ + map = ice_search_prof_id(hw, blk, hdl); + if (!map) { + status = ICE_ERR_DOES_NOT_EXIST; + goto err_ice_get_prof; + } + + for (i = 0; i < map->ptg_cnt; i++) + if (!hw->blk[blk].es.written[map->prof_id]) { + /* add ES to change list */ + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), + GFP_KERNEL); + if (!p) { + status = ICE_ERR_NO_MEMORY; + goto err_ice_get_prof; + } + + p->type = ICE_PTG_ES_ADD; + p->ptype = 0; + p->ptg = map->ptg[i]; + p->attr = map->attr[i]; + p->add_ptg = 0; + + p->add_prof = 1; + p->prof_id = map->prof_id; + + hw->blk[blk].es.written[map->prof_id] = true; + + list_add(&p->list_entry, chg); + } + +err_ice_get_prof: + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + /* let caller clean up the change list */ + return status; +} + +/** + * ice_get_profs_vsig - get a copy of the list of profiles from a VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsig: VSIG from which to copy the list + * @lst: output list + * + * This routine makes a copy of the list of profiles in the specified VSIG. 
+ */ +static enum ice_status +ice_get_profs_vsig(struct ice_hw *hw, enum ice_block blk, u16 vsig, + struct list_head *lst) +{ + struct ice_vsig_prof *ent1, *ent2; + u16 idx = vsig & ICE_VSIG_IDX_M; + + list_for_each_entry(ent1, &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) { + struct ice_vsig_prof *p; + + /* copy to the input list */ + p = devm_kmemdup(ice_hw_to_dev(hw), ent1, sizeof(*p), + GFP_KERNEL); + if (!p) + goto err_ice_get_profs_vsig; + + list_add_tail(&p->list, lst); + } + + return 0; + +err_ice_get_profs_vsig: + list_for_each_entry_safe(ent1, ent2, lst, list) { + list_del(&ent1->list); + devm_kfree(ice_hw_to_dev(hw), ent1); + } + + return ICE_ERR_NO_MEMORY; +} + +/** + * ice_add_prof_to_lst - add profile entry to a list + * @hw: pointer to the HW struct + * @blk: hardware block + * @lst: the list to be added to + * @hdl: profile handle of entry to add + */ +static enum ice_status +ice_add_prof_to_lst(struct ice_hw *hw, enum ice_block blk, + struct list_head *lst, u64 hdl) +{ + enum ice_status status = 0; + struct ice_prof_map *map; + struct ice_vsig_prof *p; + u16 i; + + mutex_lock(&hw->blk[blk].es.prof_map_lock); + map = ice_search_prof_id(hw, blk, hdl); + if (!map) { + status = ICE_ERR_DOES_NOT_EXIST; + goto err_ice_add_prof_to_lst; + } + + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), GFP_KERNEL); + if (!p) { + status = ICE_ERR_NO_MEMORY; + goto err_ice_add_prof_to_lst; + } + + p->profile_cookie = map->profile_cookie; + p->prof_id = map->prof_id; + p->tcam_count = map->ptg_cnt; + + for (i = 0; i < map->ptg_cnt; i++) { + p->tcam[i].prof_id = map->prof_id; + p->tcam[i].tcam_idx = ICE_INVALID_TCAM; + p->tcam[i].ptg = map->ptg[i]; + p->tcam[i].attr = map->attr[i]; + } + + list_add(&p->list, lst); + +err_ice_add_prof_to_lst: + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + return status; +} + +/** + * ice_move_vsi - move VSI to another VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: the VSI to move + * @vsig: the VSIG to move the VSI to + * @chg: the change list + */ +static enum ice_status +ice_move_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 vsig, + struct list_head *chg) +{ + enum ice_status status; + struct ice_chs_chg *p; + u16 orig_vsig; + + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), GFP_KERNEL); + if (!p) + return ICE_ERR_NO_MEMORY; + + status = ice_vsig_find_vsi(hw, blk, vsi, &orig_vsig); + if (!status) + status = ice_vsig_add_mv_vsi(hw, blk, vsi, vsig); + + if (status) { + devm_kfree(ice_hw_to_dev(hw), p); + return status; + } + + p->type = ICE_VSI_MOVE; + p->vsi = vsi; + p->orig_vsig = orig_vsig; + p->vsig = vsig; + + list_add(&p->list_entry, chg); + + return 0; +} + +/** + * ice_set_tcam_flags - set TCAM flag don't care mask + * @mask: mask for flags + * @dc_mask: pointer to the don't care mask + */ +static void ice_set_tcam_flags(u16 mask, u8 dc_mask[ICE_TCAM_KEY_VAL_SZ]) +{ + u16 *flag_word; + + /* flags are lowest u16 */ + flag_word = (u16 *)dc_mask; + *flag_word = ~mask; +} + +/** + * ice_rem_chg_tcam_ent - remove a specific TCAM entry from change list + * @hw: pointer to the HW struct + * @idx: the index of the TCAM entry to remove + * @chg: the list of change structures to search + */ +static void +ice_rem_chg_tcam_ent(struct ice_hw *hw, u16 idx, struct list_head *chg) +{ + struct ice_chs_chg *pos, *tmp; + + list_for_each_entry_safe(tmp, pos, chg, list_entry) + if (tmp->type == ICE_TCAM_ADD && tmp->tcam_idx == idx) { + list_del(&tmp->list_entry); + devm_kfree(ice_hw_to_dev(hw), tmp); + } +} + +/** + * 
ice_prof_tcam_ena_dis - add enable or disable TCAM change + * @hw: pointer to the HW struct + * @blk: hardware block + * @enable: true to enable, false to disable + * @vsig: the VSIG of the TCAM entry + * @tcam: pointer the TCAM info structure of the TCAM to disable + * @chg: the change list + * + * This function appends an enable or disable TCAM entry in the change log + */ +static enum ice_status +ice_prof_tcam_ena_dis(struct ice_hw *hw, enum ice_block blk, bool enable, + u16 vsig, struct ice_tcam_inf *tcam, + struct list_head *chg) +{ + enum ice_status status; + struct ice_chs_chg *p; + + u8 vl_msk[ICE_TCAM_KEY_VAL_SZ] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; + u8 dc_msk[ICE_TCAM_KEY_VAL_SZ] = { 0xFF, 0xFF, 0x00, 0x00, 0x00 }; + u8 nm_msk[ICE_TCAM_KEY_VAL_SZ] = { 0x00, 0x00, 0x00, 0x00, 0x00 }; + + /* if disabling, free the TCAM */ + if (!enable) { + status = ice_rel_tcam_idx(hw, blk, tcam->tcam_idx); + + /* if we have already created a change for this TCAM entry, then + * we need to remove that entry, in order to prevent writing to + * a TCAM entry we no longer will have ownership of. */ - if (offset > dst_len) - return; + ice_rem_chg_tcam_ent(hw, tcam->tcam_idx, chg); + tcam->tcam_idx = 0; + tcam->in_use = 0; + return status; + } - /* if the sum of section size and offset exceed destination size - * then we are out of bounds of the HW table size for that PF. - * Changing section length to fill the remaining table space - * of that PF. + /* for re-enabling, reallocate a TCAM */ + /* for entries with empty attribute masks, allocate entry from + * the bottom of the TCAM table; otherwise, allocate from the + * top of the table in order to give it higher priority + */ + status = ice_alloc_tcam_ent(hw, blk, tcam->attr.mask == 0, + &tcam->tcam_idx); + if (status) + return status; + + /* add TCAM to change list */ + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), GFP_KERNEL); + if (!p) + return ICE_ERR_NO_MEMORY; + + /* set don't care masks for TCAM flags */ + ice_set_tcam_flags(tcam->attr.mask, dc_msk); + + status = ice_tcam_write_entry(hw, blk, tcam->tcam_idx, tcam->prof_id, + tcam->ptg, vsig, 0, tcam->attr.flags, + vl_msk, dc_msk, nm_msk); + if (status) + goto err_ice_prof_tcam_ena_dis; + + tcam->in_use = 1; + + p->type = ICE_TCAM_ADD; + p->add_tcam_idx = true; + p->prof_id = tcam->prof_id; + p->ptg = tcam->ptg; + p->vsig = 0; + p->tcam_idx = tcam->tcam_idx; + + /* log change */ + list_add(&p->list_entry, chg); + + return 0; + +err_ice_prof_tcam_ena_dis: + devm_kfree(ice_hw_to_dev(hw), p); + return status; +} + +/** + * ice_ptg_attr_in_use - determine if PTG and attribute pair is in use + * @ptg_attr: pointer to the PTG and attribute pair to check + * @ptgs_used: bitmap that denotes which PTGs are in use + * @attr_used: array of PTG and attributes pairs already used + * @attr_cnt: count of entries in the attr_used array + */ +static bool +ice_ptg_attr_in_use(struct ice_tcam_inf *ptg_attr, unsigned long *ptgs_used, + struct ice_tcam_inf *attr_used[], u16 attr_cnt) +{ + u16 i; + + if (!test_bit(ptg_attr->ptg, ptgs_used)) + return false; + + /* the PTG is used, so now look for correct attributes */ + for (i = 0; i < attr_cnt; i++) + if (attr_used[i]->ptg == ptg_attr->ptg && + attr_used[i]->attr.flags == ptg_attr->attr.flags && + attr_used[i]->attr.mask == ptg_attr->attr.mask) + return true; + + return false; +} + +/** + * ice_adj_prof_priorities - adjust profile based on priorities + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsig: the VSIG for which to adjust profile priorities 
+ * @chg: the change list + */ +static enum ice_status +ice_adj_prof_priorities(struct ice_hw *hw, enum ice_block blk, u16 vsig, + struct list_head *chg) +{ + DECLARE_BITMAP(ptgs_used, ICE_XLT1_CNT); + struct ice_tcam_inf **attr_used; + enum ice_status status = 0; + struct ice_vsig_prof *t; + u16 attr_used_cnt = 0; + u16 idx; + +#define ICE_MAX_PTG_ATTRS 1024 + attr_used = devm_kcalloc(ice_hw_to_dev(hw), ICE_MAX_PTG_ATTRS, + sizeof(*attr_used), GFP_KERNEL); + if (!attr_used) + return ICE_ERR_NO_MEMORY; + + bitmap_zero(ptgs_used, ICE_XLT1_CNT); + idx = vsig & ICE_VSIG_IDX_M; + + /* Priority is based on the order in which the profiles are added. The + * newest added profile has highest priority and the oldest added + * profile has the lowest priority. Since the profile property list for + * a VSIG is sorted from newest to oldest, this code traverses the list + * in order and enables the first of each PTG that it finds (that is not + * already enabled); it also disables any duplicate PTGs that it finds + * in the older profiles (that are currently enabled). + */ + + list_for_each_entry(t, &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) { + u16 i; + + for (i = 0; i < t->tcam_count; i++) { + bool used; + + /* Scan the priorities from newest to oldest. + * Make sure that the newest profiles take priority. + */ + used = ice_ptg_attr_in_use(&t->tcam[i], ptgs_used, + attr_used, attr_used_cnt); + + if (used && t->tcam[i].in_use) { + /* need to mark this PTG as never match, as it + * was already in use and therefore duplicate + * (and lower priority) + */ + status = ice_prof_tcam_ena_dis(hw, blk, false, + vsig, + &t->tcam[i], + chg); + if (status) + goto err_ice_adj_prof_priorities; + } else if (!used && !t->tcam[i].in_use) { + /* need to enable this PTG, as it in not in use + * and not enabled (highest priority) + */ + status = ice_prof_tcam_ena_dis(hw, blk, true, + vsig, + &t->tcam[i], + chg); + if (status) + goto err_ice_adj_prof_priorities; + } + + /* keep track of used ptgs */ + set_bit(t->tcam[i].ptg, ptgs_used); + if (attr_used_cnt < ICE_MAX_PTG_ATTRS) + attr_used[attr_used_cnt++] = &t->tcam[i]; + else + ice_debug(hw, ICE_DBG_INIT, "Warn: ICE_MAX_PTG_ATTRS exceeded\n"); + } + } + +err_ice_adj_prof_priorities: + devm_kfree(ice_hw_to_dev(hw), attr_used); + return status; +} + +/** + * ice_add_prof_id_vsig - add profile to VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsig: the VSIG to which this profile is to be added + * @hdl: the profile handle indicating the profile to add + * @rev: true to add entries to the end of the list + * @chg: the change list + */ +static enum ice_status +ice_add_prof_id_vsig(struct ice_hw *hw, enum ice_block blk, u16 vsig, u64 hdl, + bool rev, struct list_head *chg) +{ + /* Masks that ignore flags */ + u8 vl_msk[ICE_TCAM_KEY_VAL_SZ] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; + u8 dc_msk[ICE_TCAM_KEY_VAL_SZ] = { 0xFF, 0xFF, 0x00, 0x00, 0x00 }; + u8 nm_msk[ICE_TCAM_KEY_VAL_SZ] = { 0x00, 0x00, 0x00, 0x00, 0x00 }; + enum ice_status status = 0; + struct ice_prof_map *map; + struct ice_vsig_prof *t; + struct ice_chs_chg *p; + u16 vsig_idx, i; + + /* Error, if this VSIG already has this profile */ + if (ice_has_prof_vsig(hw, blk, vsig, hdl)) + return ICE_ERR_ALREADY_EXISTS; + + /* new VSIG profile structure */ + t = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*t), GFP_KERNEL); + if (!t) + return ICE_ERR_NO_MEMORY; + + mutex_lock(&hw->blk[blk].es.prof_map_lock); + /* Get the details on the profile specified by the handle ID */ + map = ice_search_prof_id(hw, 
blk, hdl); + if (!map) { + status = ICE_ERR_DOES_NOT_EXIST; + goto err_ice_add_prof_id_vsig; + } + + t->profile_cookie = map->profile_cookie; + t->prof_id = map->prof_id; + t->tcam_count = map->ptg_cnt; + + /* create TCAM entries */ + for (i = 0; i < map->ptg_cnt; i++) { + u16 tcam_idx; + + /* add TCAM to change list */ + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), GFP_KERNEL); + if (!p) { + status = ICE_ERR_NO_MEMORY; + goto err_ice_add_prof_id_vsig; + } + + /* allocate the TCAM entry index */ + /* for entries with empty attribute masks, allocate entry from + * the bottom of the TCAM table; otherwise, allocate from the + * top of the table in order to give it higher priority */ - if ((offset + sect_len) > dst_len) - sect_len = dst_len - offset; + status = ice_alloc_tcam_ent(hw, blk, map->attr[i].mask == 0, + &tcam_idx); + if (status) { + devm_kfree(ice_hw_to_dev(hw), p); + goto err_ice_add_prof_id_vsig; + } + + t->tcam[i].ptg = map->ptg[i]; + t->tcam[i].prof_id = map->prof_id; + t->tcam[i].tcam_idx = tcam_idx; + t->tcam[i].attr = map->attr[i]; + t->tcam[i].in_use = true; + + p->type = ICE_TCAM_ADD; + p->add_tcam_idx = true; + p->prof_id = t->tcam[i].prof_id; + p->ptg = t->tcam[i].ptg; + p->vsig = vsig; + p->tcam_idx = t->tcam[i].tcam_idx; + + /* set don't care masks for TCAM flags */ + ice_set_tcam_flags(t->tcam[i].attr.mask, dc_msk); + + /* write the TCAM entry */ + status = ice_tcam_write_entry(hw, blk, t->tcam[i].tcam_idx, + t->tcam[i].prof_id, + t->tcam[i].ptg, vsig, 0, + t->tcam[i].attr.flags, vl_msk, + dc_msk, nm_msk); + if (status) { + devm_kfree(ice_hw_to_dev(hw), p); + goto err_ice_add_prof_id_vsig; + } + + /* log change */ + list_add(&p->list_entry, chg); + } + + /* add profile to VSIG */ + vsig_idx = vsig & ICE_VSIG_IDX_M; + if (rev) + list_add_tail(&t->list, + &hw->blk[blk].xlt2.vsig_tbl[vsig_idx].prop_lst); + else + list_add(&t->list, + &hw->blk[blk].xlt2.vsig_tbl[vsig_idx].prop_lst); + + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + return status; - memcpy(dst + offset, src, sect_len); - offset += sect_len; - sect = ice_pkg_enum_section(NULL, &state, sid); - } +err_ice_add_prof_id_vsig: + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + /* let caller clean up the change list */ + devm_kfree(ice_hw_to_dev(hw), t); + return status; } /** - * ice_fill_blk_tbls - Read package context for tables - * @hw: pointer to the hardware structure - * - * Reads the current package contents and populates the driver - * database with the data iteratively for all advanced feature - * blocks. Assume that the HW tables have been allocated. 
+ * ice_create_prof_id_vsig - add a new VSIG with a single profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: the initial VSI that will be in VSIG + * @hdl: the profile handle of the profile that will be added to the VSIG + * @chg: the change list */ -void ice_fill_blk_tbls(struct ice_hw *hw) +static enum ice_status +ice_create_prof_id_vsig(struct ice_hw *hw, enum ice_block blk, u16 vsi, u64 hdl, + struct list_head *chg) { - u8 i; + enum ice_status status; + struct ice_chs_chg *p; + u16 new_vsig; - for (i = 0; i < ICE_BLK_COUNT; i++) { - enum ice_block blk_id = (enum ice_block)i; + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), GFP_KERNEL); + if (!p) + return ICE_ERR_NO_MEMORY; - ice_fill_tbl(hw, blk_id, hw->blk[blk_id].xlt1.sid); - ice_fill_tbl(hw, blk_id, hw->blk[blk_id].xlt2.sid); - ice_fill_tbl(hw, blk_id, hw->blk[blk_id].prof.sid); - ice_fill_tbl(hw, blk_id, hw->blk[blk_id].prof_redir.sid); - ice_fill_tbl(hw, blk_id, hw->blk[blk_id].es.sid); + new_vsig = ice_vsig_alloc(hw, blk); + if (!new_vsig) { + status = ICE_ERR_HW_TABLE; + goto err_ice_create_prof_id_vsig; } - ice_init_sw_db(hw); + status = ice_move_vsi(hw, blk, vsi, new_vsig, chg); + if (status) + goto err_ice_create_prof_id_vsig; + + status = ice_add_prof_id_vsig(hw, blk, new_vsig, hdl, false, chg); + if (status) + goto err_ice_create_prof_id_vsig; + + p->type = ICE_VSIG_ADD; + p->vsi = vsi; + p->orig_vsig = ICE_DEFAULT_VSIG; + p->vsig = new_vsig; + + list_add(&p->list_entry, chg); + + return 0; + +err_ice_create_prof_id_vsig: + /* let caller clean up the change list */ + devm_kfree(ice_hw_to_dev(hw), p); + return status; } /** - * ice_free_hw_tbls - free hardware table memory - * @hw: pointer to the hardware structure + * ice_create_vsig_from_lst - create a new VSIG with a list of profiles + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: the initial VSI that will be in VSIG + * @lst: the list of profile that will be added to the VSIG + * @new_vsig: return of new VSIG + * @chg: the change list */ -void ice_free_hw_tbls(struct ice_hw *hw) +static enum ice_status +ice_create_vsig_from_lst(struct ice_hw *hw, enum ice_block blk, u16 vsi, + struct list_head *lst, u16 *new_vsig, + struct list_head *chg) { - u8 i; + struct ice_vsig_prof *t; + enum ice_status status; + u16 vsig; - for (i = 0; i < ICE_BLK_COUNT; i++) { - hw->blk[i].is_list_init = false; + vsig = ice_vsig_alloc(hw, blk); + if (!vsig) + return ICE_ERR_HW_TABLE; - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt1.ptypes); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt1.ptg_tbl); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt1.t); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt2.t); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt2.vsig_tbl); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt2.vsis); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].prof.t); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].prof_redir.t); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].es.t); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].es.ref_count); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].es.written); + status = ice_move_vsi(hw, blk, vsi, vsig, chg); + if (status) + return status; + + list_for_each_entry(t, lst, list) { + /* Reverse the order here since we are copying the list */ + status = ice_add_prof_id_vsig(hw, blk, vsig, t->profile_cookie, + true, chg); + if (status) + return status; } - memset(hw->blk, 0, sizeof(hw->blk)); + *new_vsig = vsig; + + return 0; } /** - * ice_clear_hw_tbls - clear HW tables and flow profiles - * @hw: pointer to the hardware structure + * 
ice_find_prof_vsig - find a VSIG with a specific profile handle + * @hw: pointer to the HW struct + * @blk: hardware block + * @hdl: the profile handle of the profile to search for + * @vsig: returns the VSIG with the matching profile */ -void ice_clear_hw_tbls(struct ice_hw *hw) +static bool +ice_find_prof_vsig(struct ice_hw *hw, enum ice_block blk, u64 hdl, u16 *vsig) { - u8 i; + struct ice_vsig_prof *t; + enum ice_status status; + struct list_head lst; - for (i = 0; i < ICE_BLK_COUNT; i++) { - struct ice_prof_redir *prof_redir = &hw->blk[i].prof_redir; - struct ice_prof_tcam *prof = &hw->blk[i].prof; - struct ice_xlt1 *xlt1 = &hw->blk[i].xlt1; - struct ice_xlt2 *xlt2 = &hw->blk[i].xlt2; - struct ice_es *es = &hw->blk[i].es; + INIT_LIST_HEAD(&lst); - memset(xlt1->ptypes, 0, xlt1->count * sizeof(*xlt1->ptypes)); - memset(xlt1->ptg_tbl, 0, - ICE_MAX_PTGS * sizeof(*xlt1->ptg_tbl)); - memset(xlt1->t, 0, xlt1->count * sizeof(*xlt1->t)); + t = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*t), GFP_KERNEL); + if (!t) + return false; - memset(xlt2->vsis, 0, xlt2->count * sizeof(*xlt2->vsis)); - memset(xlt2->vsig_tbl, 0, - xlt2->count * sizeof(*xlt2->vsig_tbl)); - memset(xlt2->t, 0, xlt2->count * sizeof(*xlt2->t)); + t->profile_cookie = hdl; + list_add(&t->list, &lst); - memset(prof->t, 0, prof->count * sizeof(*prof->t)); - memset(prof_redir->t, 0, - prof_redir->count * sizeof(*prof_redir->t)); + status = ice_find_dup_props_vsig(hw, blk, &lst, vsig); - memset(es->t, 0, es->count * sizeof(*es->t)); - memset(es->ref_count, 0, es->count * sizeof(*es->ref_count)); - memset(es->written, 0, es->count * sizeof(*es->written)); + list_del(&t->list); + devm_kfree(ice_hw_to_dev(hw), t); + + return !status; +} + +/** + * ice_add_vsi_flow - add VSI flow + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: input VSI + * @vsig: target VSIG to include the input VSI + * + * Calling this function will add the VSI to a given VSIG and + * update the HW tables accordingly. This call can be used to + * add multiple VSIs to a VSIG if we know beforehand that those + * VSIs have the same characteristics of the VSIG. This will + * save time in generating a new VSIG and TCAMs till a match is + * found and subsequent rollback when a matching VSIG is found. + */ +enum ice_status +ice_add_vsi_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 vsig) +{ + struct ice_chs_chg *tmp, *del; + struct list_head chg; + enum ice_status status; + + /* if target VSIG is default the move is invalid */ + if ((vsig & ICE_VSIG_IDX_M) == ICE_DEFAULT_VSIG) + return ICE_ERR_PARAM; + + INIT_LIST_HEAD(&chg); + + /* move VSI to the VSIG that matches */ + status = ice_move_vsi(hw, blk, vsi, vsig, &chg); + /* update hardware if success */ + if (!status) + status = ice_upd_prof_hw(hw, blk, &chg); + + list_for_each_entry_safe(del, tmp, &chg, list_entry) { + list_del(&del->list_entry); + devm_kfree(ice_hw_to_dev(hw), del); } + + return status; } /** - * ice_init_hw_tbls - init hardware table memory - * @hw: pointer to the hardware structure + * ice_add_prof_id_flow - add profile flow + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: the VSI to enable with the profile specified by ID + * @hdl: profile handle + * + * Calling this function will update the hardware tables to enable the + * profile indicated by the ID parameter for the VSIs specified in the VSI + * array. Once successfully called, the flow will be enabled. 
*/ -enum ice_status ice_init_hw_tbls(struct ice_hw *hw) +enum ice_status +ice_add_prof_id_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi, u64 hdl) { - u8 i; + struct ice_vsig_prof *tmp1, *del1; + struct ice_chs_chg *tmp, *del; + struct list_head union_lst; + enum ice_status status; + struct list_head chg; + u16 vsig; - for (i = 0; i < ICE_BLK_COUNT; i++) { - struct ice_prof_redir *prof_redir = &hw->blk[i].prof_redir; - struct ice_prof_tcam *prof = &hw->blk[i].prof; - struct ice_xlt1 *xlt1 = &hw->blk[i].xlt1; - struct ice_xlt2 *xlt2 = &hw->blk[i].xlt2; - struct ice_es *es = &hw->blk[i].es; - u16 j; + INIT_LIST_HEAD(&union_lst); + INIT_LIST_HEAD(&chg); - if (hw->blk[i].is_list_init) - continue; + /* Get profile */ + status = ice_get_prof(hw, blk, hdl, &chg); + if (status) + return status; - hw->blk[i].is_list_init = true; + /* determine if VSI is already part of a VSIG */ + status = ice_vsig_find_vsi(hw, blk, vsi, &vsig); + if (!status && vsig) { + bool only_vsi; + u16 or_vsig; + u16 ref; - hw->blk[i].overwrite = blk_sizes[i].overwrite; - es->reverse = blk_sizes[i].reverse; + /* found in VSIG */ + or_vsig = vsig; - xlt1->sid = ice_blk_sids[i][ICE_SID_XLT1_OFF]; - xlt1->count = blk_sizes[i].xlt1; + /* make sure that there is no overlap/conflict between the new + * characteristics and the existing ones; we don't support that + * scenario + */ + if (ice_has_prof_vsig(hw, blk, vsig, hdl)) { + status = ICE_ERR_ALREADY_EXISTS; + goto err_ice_add_prof_id_flow; + } - xlt1->ptypes = devm_kcalloc(ice_hw_to_dev(hw), xlt1->count, - sizeof(*xlt1->ptypes), GFP_KERNEL); + /* last VSI in the VSIG? */ + status = ice_vsig_get_ref(hw, blk, vsig, &ref); + if (status) + goto err_ice_add_prof_id_flow; + only_vsi = (ref == 1); - if (!xlt1->ptypes) - goto err; + /* create a union of the current profiles and the one being + * added + */ + status = ice_get_profs_vsig(hw, blk, vsig, &union_lst); + if (status) + goto err_ice_add_prof_id_flow; - xlt1->ptg_tbl = devm_kcalloc(ice_hw_to_dev(hw), ICE_MAX_PTGS, - sizeof(*xlt1->ptg_tbl), - GFP_KERNEL); + status = ice_add_prof_to_lst(hw, blk, &union_lst, hdl); + if (status) + goto err_ice_add_prof_id_flow; + + /* search for an existing VSIG with an exact charc match */ + status = ice_find_dup_props_vsig(hw, blk, &union_lst, &vsig); + if (!status) { + /* move VSI to the VSIG that matches */ + status = ice_move_vsi(hw, blk, vsi, vsig, &chg); + if (status) + goto err_ice_add_prof_id_flow; + + /* VSI has been moved out of or_vsig. If the or_vsig had + * only that VSI it is now empty and can be removed. + */ + if (only_vsi) { + status = ice_rem_vsig(hw, blk, or_vsig, &chg); + if (status) + goto err_ice_add_prof_id_flow; + } + } else if (only_vsi) { + /* If the original VSIG only contains one VSI, then it + * will be the requesting VSI. In this case the VSI is + * not sharing entries and we can simply add the new + * profile to the VSIG. 
+ */ + status = ice_add_prof_id_vsig(hw, blk, vsig, hdl, false, + &chg); + if (status) + goto err_ice_add_prof_id_flow; + + /* Adjust priorities */ + status = ice_adj_prof_priorities(hw, blk, vsig, &chg); + if (status) + goto err_ice_add_prof_id_flow; + } else { + /* No match, so we need a new VSIG */ + status = ice_create_vsig_from_lst(hw, blk, vsi, + &union_lst, &vsig, + &chg); + if (status) + goto err_ice_add_prof_id_flow; + + /* Adjust priorities */ + status = ice_adj_prof_priorities(hw, blk, vsig, &chg); + if (status) + goto err_ice_add_prof_id_flow; + } + } else { + /* need to find or add a VSIG */ + /* search for an existing VSIG with an exact charc match */ + if (ice_find_prof_vsig(hw, blk, hdl, &vsig)) { + /* found an exact match */ + /* add or move VSI to the VSIG that matches */ + status = ice_move_vsi(hw, blk, vsi, vsig, &chg); + if (status) + goto err_ice_add_prof_id_flow; + } else { + /* we did not find an exact match */ + /* we need to add a VSIG */ + status = ice_create_prof_id_vsig(hw, blk, vsi, hdl, + &chg); + if (status) + goto err_ice_add_prof_id_flow; + } + } - if (!xlt1->ptg_tbl) - goto err; + /* update hardware */ + if (!status) + status = ice_upd_prof_hw(hw, blk, &chg); - xlt1->t = devm_kcalloc(ice_hw_to_dev(hw), xlt1->count, - sizeof(*xlt1->t), GFP_KERNEL); - if (!xlt1->t) - goto err; +err_ice_add_prof_id_flow: + list_for_each_entry_safe(del, tmp, &chg, list_entry) { + list_del(&del->list_entry); + devm_kfree(ice_hw_to_dev(hw), del); + } - xlt2->sid = ice_blk_sids[i][ICE_SID_XLT2_OFF]; - xlt2->count = blk_sizes[i].xlt2; + list_for_each_entry_safe(del1, tmp1, &union_lst, list) { + list_del(&del1->list); + devm_kfree(ice_hw_to_dev(hw), del1); + } - xlt2->vsis = devm_kcalloc(ice_hw_to_dev(hw), xlt2->count, - sizeof(*xlt2->vsis), GFP_KERNEL); + return status; +} - if (!xlt2->vsis) - goto err; +/** + * ice_add_flow - add flow + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: array of VSIs to enable with the profile specified by ID + * @count: number of elements in the VSI array + * @id: profile tracking ID + * + * Calling this function will update the hardware tables to enable the + * profile indicated by the ID parameter for the VSIs specified in the VSI + * array. Once successfully called, the flow will be enabled. 
+ */ +enum ice_status +ice_add_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi[], u8 count, + u64 id) +{ + enum ice_status status; + u16 i; - xlt2->vsig_tbl = devm_kcalloc(ice_hw_to_dev(hw), xlt2->count, - sizeof(*xlt2->vsig_tbl), - GFP_KERNEL); - if (!xlt2->vsig_tbl) - goto err; + for (i = 0; i < count; i++) { + status = ice_add_prof_id_flow(hw, blk, vsi[i], id); + if (status) + return status; + } - for (j = 0; j < xlt2->count; j++) - INIT_LIST_HEAD(&xlt2->vsig_tbl[j].prop_lst); + return 0; +} - xlt2->t = devm_kcalloc(ice_hw_to_dev(hw), xlt2->count, - sizeof(*xlt2->t), GFP_KERNEL); - if (!xlt2->t) - goto err; +/** + * ice_rem_prof_from_list - remove a profile from list + * @hw: pointer to the HW struct + * @lst: list to remove the profile from + * @hdl: the profile handle indicating the profile to remove + */ +static enum ice_status +ice_rem_prof_from_list(struct ice_hw *hw, struct list_head *lst, u64 hdl) +{ + struct ice_vsig_prof *ent, *tmp; - prof->sid = ice_blk_sids[i][ICE_SID_PR_OFF]; - prof->count = blk_sizes[i].prof_tcam; - prof->max_prof_id = blk_sizes[i].prof_id; - prof->cdid_bits = blk_sizes[i].prof_cdid_bits; - prof->t = devm_kcalloc(ice_hw_to_dev(hw), prof->count, - sizeof(*prof->t), GFP_KERNEL); + list_for_each_entry_safe(ent, tmp, lst, list) + if (ent->profile_cookie == hdl) { + list_del(&ent->list); + devm_kfree(ice_hw_to_dev(hw), ent); + return 0; + } - if (!prof->t) - goto err; + return ICE_ERR_DOES_NOT_EXIST; +} - prof_redir->sid = ice_blk_sids[i][ICE_SID_PR_REDIR_OFF]; - prof_redir->count = blk_sizes[i].prof_redir; - prof_redir->t = devm_kcalloc(ice_hw_to_dev(hw), - prof_redir->count, - sizeof(*prof_redir->t), - GFP_KERNEL); +/** + * ice_rem_prof_id_flow - remove flow + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: the VSI from which to remove the profile specified by ID + * @hdl: profile tracking handle + * + * Calling this function will update the hardware tables to remove the + * profile indicated by the ID parameter for the VSIs specified in the VSI + * array. Once successfully called, the flow will be disabled. + */ +enum ice_status +ice_rem_prof_id_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi, u64 hdl) +{ + struct ice_vsig_prof *tmp1, *del1; + struct ice_chs_chg *tmp, *del; + struct list_head chg, copy; + enum ice_status status; + u16 vsig; - if (!prof_redir->t) - goto err; + INIT_LIST_HEAD(&copy); + INIT_LIST_HEAD(&chg); - es->sid = ice_blk_sids[i][ICE_SID_ES_OFF]; - es->count = blk_sizes[i].es; - es->fvw = blk_sizes[i].fvw; - es->t = devm_kcalloc(ice_hw_to_dev(hw), - (u32)(es->count * es->fvw), - sizeof(*es->t), GFP_KERNEL); - if (!es->t) - goto err; + /* determine if VSI is already part of a VSIG */ + status = ice_vsig_find_vsi(hw, blk, vsi, &vsig); + if (!status && vsig) { + bool last_profile; + bool only_vsi; + u16 ref; - es->ref_count = devm_kcalloc(ice_hw_to_dev(hw), es->count, - sizeof(*es->ref_count), - GFP_KERNEL); - if (!es->ref_count) - goto err; + /* found in VSIG */ + last_profile = ice_vsig_prof_id_count(hw, blk, vsig) == 1; + status = ice_vsig_get_ref(hw, blk, vsig, &ref); + if (status) + goto err_ice_rem_prof_id_flow; + only_vsi = (ref == 1); + + if (only_vsi) { + /* If the original VSIG only contains one reference, + * which will be the requesting VSI, then the VSI is not + * sharing entries and we can simply remove the specific + * characteristics from the VSIG.
+ */ - es->written = devm_kcalloc(ice_hw_to_dev(hw), es->count, - sizeof(*es->written), GFP_KERNEL); - if (!es->written) - goto err; + if (last_profile) { + /* If there are no profiles left for this VSIG, + * then simply remove the VSIG. + */ + status = ice_rem_vsig(hw, blk, vsig, &chg); + if (status) + goto err_ice_rem_prof_id_flow; + } else { + status = ice_rem_prof_id_vsig(hw, blk, vsig, + hdl, &chg); + if (status) + goto err_ice_rem_prof_id_flow; + + /* Adjust priorities */ + status = ice_adj_prof_priorities(hw, blk, vsig, + &chg); + if (status) + goto err_ice_rem_prof_id_flow; + } + + } else { + /* Make a copy of the VSIG's list of Profiles */ + status = ice_get_profs_vsig(hw, blk, vsig, &copy); + if (status) + goto err_ice_rem_prof_id_flow; + + /* Remove specified profile entry from the list */ + status = ice_rem_prof_from_list(hw, &copy, hdl); + if (status) + goto err_ice_rem_prof_id_flow; + + if (list_empty(&copy)) { + status = ice_move_vsi(hw, blk, vsi, + ICE_DEFAULT_VSIG, &chg); + if (status) + goto err_ice_rem_prof_id_flow; + + } else if (!ice_find_dup_props_vsig(hw, blk, &copy, + &vsig)) { + /* found an exact match */ + /* add or move VSI to the VSIG that matches */ + /* Search for a VSIG with a matching profile + * list + */ + + /* Found match, move VSI to the matching VSIG */ + status = ice_move_vsi(hw, blk, vsi, vsig, &chg); + if (status) + goto err_ice_rem_prof_id_flow; + } else { + /* since no existing VSIG supports this + * characteristic pattern, we need to create a + * new VSIG and TCAM entries + */ + status = ice_create_vsig_from_lst(hw, blk, vsi, + &copy, &vsig, + &chg); + if (status) + goto err_ice_rem_prof_id_flow; + + /* Adjust priorities */ + status = ice_adj_prof_priorities(hw, blk, vsig, + &chg); + if (status) + goto err_ice_rem_prof_id_flow; + } + } + } else { + status = ICE_ERR_DOES_NOT_EXIST; } - return 0; -err: - ice_free_hw_tbls(hw); - return ICE_ERR_NO_MEMORY; + /* update hardware tables */ + if (!status) + status = ice_upd_prof_hw(hw, blk, &chg); + +err_ice_rem_prof_id_flow: + list_for_each_entry_safe(del, tmp, &chg, list_entry) { + list_del(&del->list_entry); + devm_kfree(ice_hw_to_dev(hw), del); + } + + list_for_each_entry_safe(del1, tmp1, &copy, list) { + list_del(&del1->list); + devm_kfree(ice_hw_to_dev(hw), del1); + } + + return status; +} + +/** + * ice_rem_flow - remove flow + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: array of VSIs from which to remove the profile specified by ID + * @count: number of elements in the VSI array + * @id: profile tracking ID + * + * The function will remove flows from the specified VSIs that were enabled + * using ice_add_flow. The ID value will indicate which profile will be + * removed. Once successfully called, the flow will be disabled. + */ +enum ice_status +ice_rem_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi[], u8 count, + u64 id) +{ + enum ice_status status; + u16 i; + + for (i = 0; i < count; i++) { + status = ice_rem_prof_id_flow(hw, blk, vsi[i], id); + if (status) + return status; + } + + return 0; } diff --git a/drivers/net/ethernet/intel/ice/ice_flex_pipe.h b/drivers/net/ethernet/intel/ice/ice_flex_pipe.h index 37eb282742d1..356d9f46f554 100644 --- a/drivers/net/ethernet/intel/ice/ice_flex_pipe.h +++ b/drivers/net/ethernet/intel/ice/ice_flex_pipe.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2019, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation.
*/ #ifndef _ICE_FLEX_PIPE_H_ #define _ICE_FLEX_PIPE_H_ @@ -18,6 +18,72 @@ #define ICE_PKG_CNT 4 +enum ice_status +ice_acquire_change_lock(struct ice_hw *hw, enum ice_aq_res_access_type access); +void ice_release_change_lock(struct ice_hw *hw); +enum ice_status +ice_find_prot_off(struct ice_hw *hw, enum ice_block blk, u8 prof, u16 fv_idx, + u8 *prot, u16 *off); +enum ice_status +ice_find_label_value(struct ice_seg *ice_seg, char const *name, u32 type, + u16 *value); +void +ice_get_sw_fv_bitmap(struct ice_hw *hw, enum ice_prof_type type, + unsigned long *bm); +void +ice_init_prof_result_bm(struct ice_hw *hw); +enum ice_status +ice_get_sw_fv_list(struct ice_hw *hw, u8 *prot_ids, u16 ids_cnt, + unsigned long *bm, struct list_head *fv_list); +enum ice_status +ice_pkg_buf_unreserve_section(struct ice_buf_build *bld, u16 count); +u16 ice_pkg_buf_get_free_space(struct ice_buf_build *bld); +enum ice_status +ice_aq_upload_section(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf, + u16 buf_size, struct ice_sq_cd *cd); +bool +ice_get_open_tunnel_port(struct ice_hw *hw, enum ice_tunnel_type type, + u16 *port); +enum ice_status +ice_is_create_tunnel_possible(struct ice_hw *hw, enum ice_tunnel_type type, + u16 port); +bool ice_is_tunnel_empty(struct ice_hw *hw); +enum ice_status +ice_create_tunnel(struct ice_hw *hw, enum ice_tunnel_type type, u16 port); +enum ice_status ice_set_dvm_boost_entries(struct ice_hw *hw); +enum ice_status ice_destroy_tunnel(struct ice_hw *hw, u16 port, bool all); +bool ice_tunnel_port_in_use(struct ice_hw *hw, u16 port, u16 *index); +bool +ice_tunnel_get_type(struct ice_hw *hw, u16 port, enum ice_tunnel_type *type); +enum ice_status ice_replay_tunnels(struct ice_hw *hw); + +/* RX parser PType functions */ +bool ice_hw_ptype_ena(struct ice_hw *hw, u16 ptype); + +/* XLT1/PType group functions */ +enum ice_status ice_ptg_update_xlt1(struct ice_hw *hw, enum ice_block blk); +void ice_ptg_free(struct ice_hw *hw, enum ice_block blk, u8 ptg); + +/* XLT2/VSI group functions */ +enum ice_status ice_vsig_update_xlt2(struct ice_hw *hw, enum ice_block blk); +enum ice_status +ice_add_prof(struct ice_hw *hw, enum ice_block blk, u64 id, u8 ptypes[], + const struct ice_ptype_attributes *attr, u16 attr_cnt, + struct ice_fv_word *es, u16 *masks); +void ice_init_all_prof_masks(struct ice_hw *hw); +void ice_shutdown_all_prof_masks(struct ice_hw *hw); +struct ice_prof_map * +ice_search_prof_id(struct ice_hw *hw, enum ice_block blk, u64 id); +enum ice_status +ice_add_vsi_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 vsig); +enum ice_status +ice_add_prof_id_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi, u64 hdl); +enum ice_status +ice_rem_prof_id_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi, u64 hdl); +enum ice_status +ice_set_prof_context(struct ice_hw *hw, enum ice_block blk, u64 id, u64 cntxt); +enum ice_status +ice_get_prof_context(struct ice_hw *hw, enum ice_block blk, u64 id, u64 *cntxt); enum ice_status ice_init_pkg(struct ice_hw *hw, u8 *buff, u32 len); enum ice_status ice_copy_and_init_pkg(struct ice_hw *hw, const u8 *buf, u32 len); @@ -26,4 +92,21 @@ void ice_free_seg(struct ice_hw *hw); void ice_fill_blk_tbls(struct ice_hw *hw); void ice_clear_hw_tbls(struct ice_hw *hw); void ice_free_hw_tbls(struct ice_hw *hw); +enum ice_status +ice_add_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi[], u8 count, + u64 id); +enum ice_status +ice_rem_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi[], u8 count, + u64 id); +enum ice_status +ice_rem_prof(struct ice_hw *hw, enum 
ice_block blk, u64 id); +struct ice_buf_build * +ice_pkg_buf_alloc_single_section(struct ice_hw *hw, u32 type, u16 size, + void **section); +struct ice_buf *ice_pkg_buf(struct ice_buf_build *bld); +void ice_pkg_buf_free(struct ice_hw *hw, struct ice_buf_build *bld); + +enum ice_status +ice_set_key(u8 *key, u16 size, u8 *val, u8 *upd, u8 *dc, u8 *nm, u16 off, + u16 len); #endif /* _ICE_FLEX_PIPE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_flex_type.h b/drivers/net/ethernet/intel/ice/ice_flex_type.h index 5d5a7eaffa30..32bcb7704048 100644 --- a/drivers/net/ethernet/intel/ice/ice_flex_type.h +++ b/drivers/net/ethernet/intel/ice/ice_flex_type.h @@ -1,8 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2019, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_FLEX_TYPE_H_ #define _ICE_FLEX_TYPE_H_ + +#define ICE_FV_OFFSET_INVAL 0x1FF + /* Extraction Sequence (Field Vector) Table */ struct ice_fv_word { u8 prot_id; @@ -10,6 +13,8 @@ struct ice_fv_word { u8 resvrd; } __packed; +#define ICE_MAX_NUM_PROFILES 256 + #define ICE_MAX_FV_WORDS 48 struct ice_fv { struct ice_fv_word ew[ICE_MAX_FV_WORDS]; @@ -17,9 +22,9 @@ struct ice_fv { /* Package and segment headers and tables */ struct ice_pkg_hdr { - struct ice_pkg_ver format_ver; + struct ice_pkg_ver pkg_format_ver; __le32 seg_count; - __le32 seg_offset[1]; + __le32 seg_offset[]; }; /* generic segment */ @@ -27,9 +32,9 @@ struct ice_generic_seg_hdr { #define SEGMENT_TYPE_METADATA 0x00000001 #define SEGMENT_TYPE_ICE 0x00000010 __le32 seg_type; - struct ice_pkg_ver seg_ver; + struct ice_pkg_ver seg_format_ver; __le32 seg_size; - char seg_name[ICE_PKG_NAME_SIZE]; + char seg_id[ICE_PKG_NAME_SIZE]; }; /* ice specific segment */ @@ -50,12 +55,12 @@ struct ice_device_id_entry { struct ice_seg { struct ice_generic_seg_hdr hdr; __le32 device_table_count; - struct ice_device_id_entry device_table[1]; + struct ice_device_id_entry device_table[]; }; struct ice_nvm_table { __le32 table_count; - __le32 vers[1]; + __le32 vers[]; }; struct ice_buf { @@ -65,14 +70,14 @@ struct ice_buf { struct ice_buf_table { __le32 buf_count; - struct ice_buf buf_array[1]; + struct ice_buf buf_array[]; }; /* global metadata specific segment */ struct ice_global_metadata_seg { struct ice_generic_seg_hdr hdr; struct ice_pkg_ver pkg_ver; - __le32 track_id; + __le32 rsvd; char pkg_name[ICE_PKG_NAME_SIZE]; }; @@ -98,48 +103,141 @@ struct ice_section_entry { struct ice_buf_hdr { __le16 section_count; __le16 data_end; - struct ice_section_entry section_entry[1]; + struct ice_section_entry section_entry[]; }; #define ICE_MAX_ENTRIES_IN_BUF(hd_sz, ent_sz) ((ICE_PKG_BUF_SIZE - \ - sizeof(struct ice_buf_hdr) - (hd_sz)) / (ent_sz)) + struct_size((struct ice_buf_hdr *)0, section_entry, 1) - (hd_sz)) /\ + (ent_sz)) /* ice package section IDs */ +#define ICE_SID_METADATA 1 +#define ICE_SID_XLT0_SW 10 +#define ICE_SID_XLT_KEY_BUILDER_SW 11 #define ICE_SID_XLT1_SW 12 #define ICE_SID_XLT2_SW 13 #define ICE_SID_PROFID_TCAM_SW 14 #define ICE_SID_PROFID_REDIR_SW 15 #define ICE_SID_FLD_VEC_SW 16 +#define ICE_SID_CDID_KEY_BUILDER_SW 17 +#define ICE_SID_CDID_REDIR_SW 18 +#define ICE_SID_XLT0_ACL 20 +#define ICE_SID_XLT_KEY_BUILDER_ACL 21 #define ICE_SID_XLT1_ACL 22 #define ICE_SID_XLT2_ACL 23 #define ICE_SID_PROFID_TCAM_ACL 24 #define ICE_SID_PROFID_REDIR_ACL 25 #define ICE_SID_FLD_VEC_ACL 26 +#define ICE_SID_CDID_KEY_BUILDER_ACL 27 +#define ICE_SID_CDID_REDIR_ACL 28 +#define ICE_SID_XLT0_FD 30 +#define ICE_SID_XLT_KEY_BUILDER_FD 31 #define 
ICE_SID_XLT1_FD 32 #define ICE_SID_XLT2_FD 33 #define ICE_SID_PROFID_TCAM_FD 34 #define ICE_SID_PROFID_REDIR_FD 35 #define ICE_SID_FLD_VEC_FD 36 +#define ICE_SID_CDID_KEY_BUILDER_FD 37 +#define ICE_SID_CDID_REDIR_FD 38 +#define ICE_SID_XLT0_RSS 40 +#define ICE_SID_XLT_KEY_BUILDER_RSS 41 #define ICE_SID_XLT1_RSS 42 #define ICE_SID_XLT2_RSS 43 #define ICE_SID_PROFID_TCAM_RSS 44 #define ICE_SID_PROFID_REDIR_RSS 45 #define ICE_SID_FLD_VEC_RSS 46 - +#define ICE_SID_CDID_KEY_BUILDER_RSS 47 +#define ICE_SID_CDID_REDIR_RSS 48 + +#define ICE_SID_RXPARSER_CAM 50 +#define ICE_SID_RXPARSER_NOMATCH_CAM 51 +#define ICE_SID_RXPARSER_IMEM 52 +#define ICE_SID_RXPARSER_XLT0_BUILDER 53 +#define ICE_SID_RXPARSER_NODE_PTYPE 54 +#define ICE_SID_RXPARSER_MARKER_PTYPE 55 #define ICE_SID_RXPARSER_BOOST_TCAM 56 - +#define ICE_SID_RXPARSER_PROTO_GRP 57 +#define ICE_SID_RXPARSER_METADATA_INIT 58 +#define ICE_SID_RXPARSER_XLT0 59 + +#define ICE_SID_TXPARSER_CAM 60 +#define ICE_SID_TXPARSER_NOMATCH_CAM 61 +#define ICE_SID_TXPARSER_IMEM 62 +#define ICE_SID_TXPARSER_XLT0_BUILDER 63 +#define ICE_SID_TXPARSER_NODE_PTYPE 64 +#define ICE_SID_TXPARSER_MARKER_PTYPE 65 +#define ICE_SID_TXPARSER_BOOST_TCAM 66 +#define ICE_SID_TXPARSER_PROTO_GRP 67 +#define ICE_SID_TXPARSER_METADATA_INIT 68 +#define ICE_SID_TXPARSER_XLT0 69 + +#define ICE_SID_RXPARSER_INIT_REDIR 70 +#define ICE_SID_TXPARSER_INIT_REDIR 71 +#define ICE_SID_RXPARSER_MARKER_GRP 72 +#define ICE_SID_TXPARSER_MARKER_GRP 73 +#define ICE_SID_RXPARSER_LAST_PROTO 74 +#define ICE_SID_TXPARSER_LAST_PROTO 75 +#define ICE_SID_RXPARSER_PG_SPILL 76 +#define ICE_SID_TXPARSER_PG_SPILL 77 +#define ICE_SID_RXPARSER_NOMATCH_SPILL 78 +#define ICE_SID_TXPARSER_NOMATCH_SPILL 79 + +#define ICE_SID_XLT0_PE 80 +#define ICE_SID_XLT_KEY_BUILDER_PE 81 #define ICE_SID_XLT1_PE 82 #define ICE_SID_XLT2_PE 83 #define ICE_SID_PROFID_TCAM_PE 84 #define ICE_SID_PROFID_REDIR_PE 85 #define ICE_SID_FLD_VEC_PE 86 +#define ICE_SID_CDID_KEY_BUILDER_PE 87 +#define ICE_SID_CDID_REDIR_PE 88 /* Label Metadata section IDs */ #define ICE_SID_LBL_FIRST 0x80000010 +#define ICE_SID_LBL_RXPARSER_IMEM 0x80000010 +#define ICE_SID_LBL_TXPARSER_IMEM 0x80000011 +#define ICE_SID_LBL_RESERVED_12 0x80000012 +#define ICE_SID_LBL_RESERVED_13 0x80000013 +#define ICE_SID_LBL_RXPARSER_MARKER 0x80000014 +#define ICE_SID_LBL_TXPARSER_MARKER 0x80000015 +#define ICE_SID_LBL_PTYPE 0x80000016 +#define ICE_SID_LBL_PROTOCOL_ID 0x80000017 #define ICE_SID_LBL_RXPARSER_TMEM 0x80000018 +#define ICE_SID_LBL_TXPARSER_TMEM 0x80000019 +#define ICE_SID_LBL_RXPARSER_PG 0x8000001A +#define ICE_SID_LBL_TXPARSER_PG 0x8000001B +#define ICE_SID_LBL_RXPARSER_M_TCAM 0x8000001C +#define ICE_SID_LBL_TXPARSER_M_TCAM 0x8000001D +#define ICE_SID_LBL_SW_PROFID_TCAM 0x8000001E +#define ICE_SID_LBL_ACL_PROFID_TCAM 0x8000001F +#define ICE_SID_LBL_PE_PROFID_TCAM 0x80000020 +#define ICE_SID_LBL_RSS_PROFID_TCAM 0x80000021 +#define ICE_SID_LBL_FD_PROFID_TCAM 0x80000022 +#define ICE_SID_LBL_FLAG 0x80000023 +#define ICE_SID_LBL_REG 0x80000024 +#define ICE_SID_LBL_SW_PTG 0x80000025 +#define ICE_SID_LBL_ACL_PTG 0x80000026 +#define ICE_SID_LBL_PE_PTG 0x80000027 +#define ICE_SID_LBL_RSS_PTG 0x80000028 +#define ICE_SID_LBL_FD_PTG 0x80000029 +#define ICE_SID_LBL_SW_VSIG 0x8000002A +#define ICE_SID_LBL_ACL_VSIG 0x8000002B +#define ICE_SID_LBL_PE_VSIG 0x8000002C +#define ICE_SID_LBL_RSS_VSIG 0x8000002D +#define ICE_SID_LBL_FD_VSIG 0x8000002E +#define ICE_SID_LBL_PTYPE_META 0x8000002F +#define ICE_SID_LBL_SW_PROFID 0x80000030 +#define ICE_SID_LBL_ACL_PROFID 0x80000031 +#define 
ICE_SID_LBL_PE_PROFID 0x80000032 +#define ICE_SID_LBL_RSS_PROFID 0x80000033 +#define ICE_SID_LBL_FD_PROFID 0x80000034 +#define ICE_SID_LBL_RXPARSER_MARKER_GRP 0x80000035 +#define ICE_SID_LBL_TXPARSER_MARKER_GRP 0x80000036 +#define ICE_SID_LBL_RXPARSER_PROTO 0x80000037 +#define ICE_SID_LBL_TXPARSER_PROTO 0x80000038 /* The following define MUST be updated to reflect the last label section ID */ #define ICE_SID_LBL_LAST 0x80000038 @@ -152,6 +250,293 @@ enum ice_block { ICE_BLK_COUNT }; +enum ice_sect { + ICE_XLT0 = 0, + ICE_XLT_KB, + ICE_XLT1, + ICE_XLT2, + ICE_PROF_TCAM, + ICE_PROF_REDIR, + ICE_VEC_TBL, + ICE_CDID_KB, + ICE_CDID_REDIR, + ICE_SECT_COUNT +}; + +/* Packet Type (PTYPE) values */ +#define ICE_PTYPE_MAC_PAY 1 +#define ICE_MAC_PTP 2 +#define ICE_MAC_LLDP 6 +#define ICE_MAC_ARP 11 +#define ICE_PTYPE_IPV4FRAG_PAY 22 +#define ICE_PTYPE_IPV4_PAY 23 +#define ICE_PTYPE_IPV4_UDP_PAY 24 +#define ICE_PTYPE_IPV4_TCP_PAY 26 +#define ICE_PTYPE_IPV4_SCTP_PAY 27 +#define ICE_PTYPE_IPV4_ICMP_PAY 28 +#define ICE_MAC_IPV4_IPV4_FRAG 29 +#define ICE_MAC_IPV4_IPV4_PAY 30 +#define ICE_MAC_IPV4_IPV4_UDP_PAY 31 +#define ICE_MAC_IPV4_IPV4_TCP 33 +#define ICE_MAC_IPV4_IPV4_SCTP 34 +#define ICE_MAC_IPV4_IPV4_ICMP 35 +#define ICE_MAC_IPV4_IPV6_FRAG 36 +#define ICE_MAC_IPV4_IPV6_PAY 37 +#define ICE_MAC_IPV4_IPV6_UDP_PAY 38 +#define ICE_MAC_IPV4_IPV6_TCP 40 +#define ICE_MAC_IPV4_IPV6_SCTP 41 +#define ICE_MAC_IPV4_IPV6_ICMPV6 42 +#define ICE_MAC_IPV4_TUN_PAY 43 +#define ICE_MAC_IPV4_TUN_IPV4_FRAG 44 +#define ICE_MAC_IPV4_TUN_IPV4_PAY 45 +#define ICE_MAC_IPV4_TUN_IPV4_UDP_PAY 46 +#define ICE_MAC_IPV4_TUN_IPV4_TCP 48 +#define ICE_MAC_IPV4_TUN_IPV4_SCTP 49 +#define ICE_MAC_IPV4_TUN_IPV4_ICMP 50 +#define ICE_MAC_IPV4_TUN_IPV6_FRAG 51 +#define ICE_MAC_IPV4_TUN_IPV6_PAY 52 +#define ICE_MAC_IPV4_TUN_IPV6_UDP_PAY 53 +#define ICE_MAC_IPV4_TUN_IPV6_TCP 55 +#define ICE_MAC_IPV4_TUN_IPV6_SCTP 56 +#define ICE_MAC_IPV4_TUN_IPV6_ICMPV6 57 +#define ICE_MAC_IPV4_TUN_ICE_MAC_PAY 58 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV4_FRAG 59 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV4_PAY 60 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV4_UDP_PAY 61 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV4_TCP 63 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV4_SCTP 64 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV4_ICMP 65 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV6_FRAG 66 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV6_PAY 67 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV6_UDP_PAY 68 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV6_TCP 70 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV6_SCTP 71 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV6_ICMPV6 72 +#define ICE_PTYPE_IPV6FRAG_PAY 88 +#define ICE_PTYPE_IPV6_PAY 89 +#define ICE_PTYPE_IPV6_UDP_PAY 90 +#define ICE_PTYPE_IPV6_TCP_PAY 92 +#define ICE_PTYPE_IPV6_SCTP_PAY 93 +#define ICE_PTYPE_IPV6_ICMP_PAY 94 +#define ICE_MAC_IPV6_IPV4_FRAG 95 +#define ICE_MAC_IPV6_IPV4_PAY 96 +#define ICE_MAC_IPV6_IPV4_UDP_PAY 97 +#define ICE_MAC_IPV6_IPV4_TCP 99 +#define ICE_MAC_IPV6_IPV4_SCTP 100 +#define ICE_MAC_IPV6_IPV4_ICMP 101 +#define ICE_MAC_IPV6_IPV6_FRAG 102 +#define ICE_MAC_IPV6_IPV6_PAY 103 +#define ICE_MAC_IPV6_IPV6_UDP_PAY 104 +#define ICE_MAC_IPV6_IPV6_TCP 106 +#define ICE_MAC_IPV6_IPV6_SCTP 107 +#define ICE_MAC_IPV6_IPV6_ICMPV6 108 +#define ICE_MAC_IPV6_TUN_PAY 109 +#define ICE_MAC_IPV6_TUN_IPV4_FRAG 110 +#define ICE_MAC_IPV6_TUN_IPV4_PAY 111 +#define ICE_MAC_IPV6_TUN_IPV4_UDP_PAY 112 +#define ICE_MAC_IPV6_TUN_IPV4_TCP 114 +#define ICE_MAC_IPV6_TUN_IPV4_SCTP 115 +#define ICE_MAC_IPV6_TUN_IPV4_ICMP 116 +#define ICE_MAC_IPV6_TUN_IPV6_FRAG 117 +#define ICE_MAC_IPV6_TUN_IPV6_PAY 118 +#define 
ICE_MAC_IPV6_TUN_IPV6_UDP_PAY 119 +#define ICE_MAC_IPV6_TUN_IPV6_TCP 121 +#define ICE_MAC_IPV6_TUN_IPV6_SCTP 122 +#define ICE_MAC_IPV6_TUN_IPV6_ICMPV6 123 +#define ICE_MAC_IPV6_TUN_MAC_PAY 124 +#define ICE_MAC_IPV6_TUN_MAC_IPV4_FRAG 125 +#define ICE_MAC_IPV6_TUN_MAC_IPV4_PAY 126 +#define ICE_MAC_IPV6_TUN_MAC_IPV4_UDP_PAY 127 +#define ICE_MAC_IPV6_TUN_MAC_IPV4_TCP 129 +#define ICE_MAC_IPV6_TUN_MAC_IPV4_SCTP 130 +#define ICE_MAC_IPV6_TUN_MAC_IPV4_ICMP 131 +#define ICE_MAC_IPV6_TUN_MAC_IPV6_FRAG 132 +#define ICE_MAC_IPV6_TUN_MAC_IPV6_PAY 133 +#define ICE_MAC_IPV6_TUN_MAC_IPV6_UDP_PAY 134 +#define ICE_MAC_IPV6_TUN_MAC_IPV6_TCP 136 +#define ICE_MAC_IPV6_TUN_MAC_IPV6_SCTP 137 +#define ICE_MAC_IPV6_TUN_MAC_IPV6_ICMPV6 138 +#define ICE_MAC_IPV4_ESP 160 +#define ICE_MAC_IPV6_ESP 161 +#define ICE_MAC_IPV4_AH 162 +#define ICE_MAC_IPV6_AH 163 +#define ICE_MAC_IPV4_NAT_T_ESP 164 +#define ICE_MAC_IPV6_NAT_T_ESP 165 +#define ICE_MAC_IPV4_NAT_T_IKE 166 +#define ICE_MAC_IPV6_NAT_T_IKE 167 +#define ICE_MAC_IPV4_NAT_T_KEEP 168 +#define ICE_MAC_IPV6_NAT_T_KEEP 169 +#define ICE_MAC_CONTROL 278 +#define ICE_MAC_PPPOD_PAY 300 +#define ICE_MAC_PPPOE_PAY 301 +#define ICE_MAC_PPPOE_IPV4_FRAG 302 +#define ICE_MAC_PPPOE_IPV4_PAY 303 +#define ICE_MAC_PPPOE_IPV4_UDP_PAY 304 +#define ICE_MAC_PPPOE_IPV4_TCP 305 +#define ICE_MAC_PPPOE_IPV4_SCTP 306 +#define ICE_MAC_PPPOE_IPV4_ICMP 307 +#define ICE_MAC_PPPOE_IPV6_FRAG 308 +#define ICE_MAC_PPPOE_IPV6_PAY 309 +#define ICE_MAC_PPPOE_IPV6_UDP_PAY 310 +#define ICE_MAC_PPPOE_IPV6_TCP 311 +#define ICE_MAC_PPPOE_IPV6_SCTP 312 +#define ICE_MAC_PPPOE_IPV6_ICMPV6 313 +#define ICE_MAC_IPV4_GTPC_TEID 325 +#define ICE_MAC_IPV6_GTPC_TEID 326 +#define ICE_MAC_IPV4_GTPC 327 +#define ICE_MAC_IPV6_GTPC 328 +#define ICE_MAC_IPV4_GTPU 329 +#define ICE_MAC_IPV6_GTPU 330 +#define ICE_MAC_IPV4_GTPU_IPV4_FRAG 331 +#define ICE_MAC_IPV4_GTPU_IPV4_PAY 332 +#define ICE_MAC_IPV4_GTPU_IPV4_UDP_PAY 333 +#define ICE_MAC_IPV4_GTPU_IPV4_TCP 334 +#define ICE_MAC_IPV4_GTPU_IPV4_ICMP 335 +#define ICE_MAC_IPV6_GTPU_IPV4_FRAG 336 +#define ICE_MAC_IPV6_GTPU_IPV4_PAY 337 +#define ICE_MAC_IPV6_GTPU_IPV4_UDP_PAY 338 +#define ICE_MAC_IPV6_GTPU_IPV4_TCP 339 +#define ICE_MAC_IPV6_GTPU_IPV4_ICMP 340 +#define ICE_MAC_IPV4_GTPU_IPV6_FRAG 341 +#define ICE_MAC_IPV4_GTPU_IPV6_PAY 342 +#define ICE_MAC_IPV4_GTPU_IPV6_UDP_PAY 343 +#define ICE_MAC_IPV4_GTPU_IPV6_TCP 344 +#define ICE_MAC_IPV4_GTPU_IPV6_ICMPV6 345 +#define ICE_MAC_IPV6_GTPU_IPV6_FRAG 346 +#define ICE_MAC_IPV6_GTPU_IPV6_PAY 347 +#define ICE_MAC_IPV6_GTPU_IPV6_UDP_PAY 348 +#define ICE_MAC_IPV6_GTPU_IPV6_TCP 349 +#define ICE_MAC_IPV6_GTPU_IPV6_ICMPV6 350 +#define ICE_MAC_IPV4_PFCP_NODE 351 +#define ICE_MAC_IPV4_PFCP_SESSION 352 +#define ICE_MAC_IPV6_PFCP_NODE 353 +#define ICE_MAC_IPV6_PFCP_SESSION 354 +#define ICE_MAC_IPV4_L2TPV3 360 +#define ICE_MAC_IPV6_L2TPV3 361 +#define ICE_MAC_IPV4_L2TPV2_CONTROL 392 +#define ICE_MAC_IPV6_L2TPV2_CONTROL 393 +#define ICE_MAC_IPV4_L2TPV2 394 +#define ICE_MAC_IPV6_L2TPV2 395 +#define ICE_MAC_IPV4_PPPOL2TPV2 396 +#define ICE_MAC_IPV6_PPPOL2TPV2 397 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV4_FRAG 398 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV4_PAY 399 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV4_UDP_PAY 400 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV4_TCP 401 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV4_SCTP 402 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV4_ICMP 403 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV6_FRAG 404 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV6_PAY 405 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV6_UDP_PAY 406 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV6_TCP 407 +#define 
ICE_MAC_IPV4_PPPOL2TPV2_IPV6_SCTP 408 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV6_ICMPV6 409 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV4_FRAG 410 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV4_PAY 411 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV4_UDP_PAY 412 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV4_TCP 413 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV4_SCTP 414 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV4_ICMP 415 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV6_FRAG 416 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV6_PAY 417 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV6_UDP_PAY 418 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV6_TCP 419 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV6_SCTP 420 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV6_ICMPV6 421 + +/* Attributes that can modify PTYPE definitions. + * + * These values will represent special attributes for PTYPES, which will + * resolve into metadata packet flags definitions that can be used in the TCAM + * for identifying a PTYPE with specific characteristics. + */ +enum ice_ptype_attrib_type { + /* GTP PTYPES */ + ICE_PTYPE_ATTR_GTP_PDU_EH, + ICE_PTYPE_ATTR_GTP_SESSION, + ICE_PTYPE_ATTR_GTP_DOWNLINK, + ICE_PTYPE_ATTR_GTP_UPLINK, +}; + +struct ice_ptype_attrib_info { + u16 flags; + u16 mask; +}; + +/* TCAM flag definitions */ +#define ICE_GTP_PDU BIT(14) +#define ICE_GTP_PDU_LINK BIT(13) + +/* GTP attributes */ +#define ICE_GTP_PDU_FLAG_MASK (ICE_GTP_PDU) +#define ICE_GTP_PDU_EH ICE_GTP_PDU + +#define ICE_GTP_FLAGS_MASK (ICE_GTP_PDU | ICE_GTP_PDU_LINK) +#define ICE_GTP_SESSION 0 +#define ICE_GTP_DOWNLINK ICE_GTP_PDU +#define ICE_GTP_UPLINK (ICE_GTP_PDU | ICE_GTP_PDU_LINK) + +struct ice_ptype_attributes { + u16 ptype; + enum ice_ptype_attrib_type attrib; +}; + +struct ice_meta_sect { + struct ice_pkg_ver ver; +#define ICE_META_SECT_NAME_SIZE 28 + char name[ICE_META_SECT_NAME_SIZE]; + __le32 track_id; +}; + +/* Packet Type Groups (PTG) - Inner Most fields (IM) */ +#define ICE_PTG_IM_IPV4_TCP 16 +#define ICE_PTG_IM_IPV4_UDP 17 +#define ICE_PTG_IM_IPV4_SCTP 18 +#define ICE_PTG_IM_IPV4_PAY 20 +#define ICE_PTG_IM_IPV4_OTHER 21 +#define ICE_PTG_IM_IPV6_TCP 32 +#define ICE_PTG_IM_IPV6_UDP 33 +#define ICE_PTG_IM_IPV6_SCTP 34 +#define ICE_PTG_IM_IPV6_OTHER 37 +#define ICE_PTG_IM_L2_OTHER 67 + +struct ice_flex_fields { + union { + struct { + u8 src_ip; + u8 dst_ip; + u8 flow_label; /* valid for IPv6 only */ + } ip_fields; + + struct { + u8 src_prt; + u8 dst_prt; + } tcp_udp_fields; + + struct { + u8 src_ip; + u8 dst_ip; + u8 src_prt; + u8 dst_prt; + } ip_tcp_udp_fields; + + struct { + u8 src_prt; + u8 dst_prt; + u8 flow_label; /* valid for IPv6 only */ + u8 spi; + } ip_esp_fields; + + struct { + u32 offset; + u32 length; + } off_len; + } fields; +}; + +#define ICE_XLT1_DFLT_GRP 0 +#define ICE_XLT1_TABLE_SIZE 1024 + /* package labels */ struct ice_label { __le16 value; @@ -161,17 +546,22 @@ struct ice_label { struct ice_label_section { __le16 count; - struct ice_label label[1]; + struct ice_label label[]; }; -#define ICE_MAX_LABELS_IN_BUF ICE_MAX_ENTRIES_IN_BUF( \ - sizeof(struct ice_label_section) - sizeof(struct ice_label), \ - sizeof(struct ice_label)) +#define ICE_MAX_LABELS_IN_BUF ICE_MAX_ENTRIES_IN_BUF(struct_size((struct ice_label_section *)0, label, 1) - \ + sizeof(struct ice_label), sizeof(struct ice_label)) struct ice_sw_fv_section { __le16 count; __le16 base_offset; - struct ice_fv fv[1]; + struct ice_fv fv[]; +}; + +struct ice_sw_fv_list_entry { + struct list_head list_entry; + u32 profile_id; + struct ice_fv *fv_ptr; }; /* The BOOST TCAM stores the match packet header in reverse order, meaning @@ -208,30 +598,54 @@ struct 
ice_boost_tcam_entry { struct ice_boost_tcam_section { __le16 count; __le16 reserved; - struct ice_boost_tcam_entry tcam[1]; + struct ice_boost_tcam_entry tcam[]; }; -#define ICE_MAX_BST_TCAMS_IN_BUF ICE_MAX_ENTRIES_IN_BUF( \ - sizeof(struct ice_boost_tcam_section) - \ +#define ICE_MAX_BST_TCAMS_IN_BUF ICE_MAX_ENTRIES_IN_BUF(struct_size((struct ice_boost_tcam_section *)0, tcam, 1) - \ sizeof(struct ice_boost_tcam_entry), \ sizeof(struct ice_boost_tcam_entry)) +/* package Marker PType TCAM entry */ +struct ice_marker_ptype_tcam_entry { +#define ICE_MARKER_PTYPE_TCAM_ADDR_MAX 1024 + __le16 addr; + __le16 ptype; + u8 keys[20]; +}; + +struct ice_marker_ptype_tcam_section { + __le16 count; + __le16 reserved; + struct ice_marker_ptype_tcam_entry tcam[]; +}; + +#define ICE_MAX_MARKER_PTYPE_TCAMS_IN_BUF ICE_MAX_ENTRIES_IN_BUF(struct_size((struct ice_marker_ptype_tcam_section *)0, tcam, 1) - \ + sizeof(struct ice_marker_ptype_tcam_entry), \ + sizeof(struct ice_marker_ptype_tcam_entry)) + struct ice_xlt1_section { __le16 count; __le16 offset; - u8 value[1]; -} __packed; + u8 value[]; +}; struct ice_xlt2_section { __le16 count; __le16 offset; - __le16 value[1]; + __le16 value[]; }; struct ice_prof_redir_section { __le16 count; __le16 offset; - u8 redir_value[1]; + u8 redir_value[]; +}; + +/* package buffer building */ + +struct ice_buf_build { + struct ice_buf buf; + u16 reserved_section_table_entries; }; struct ice_pkg_enum { @@ -248,11 +662,65 @@ struct ice_pkg_enum { void *(*handler)(u32 sect_type, void *section, u32 index, u32 *offset); }; +/* Tunnel enabling */ + +enum ice_tunnel_type { + TNL_VXLAN = 0, + TNL_GENEVE, + TNL_ECPRI, + TNL_GTP, + TNL_LAST = 0xFF, + TNL_ALL = 0xFF, +}; + +struct ice_tunnel_type_scan { + enum ice_tunnel_type type; + const char *label_prefix; +}; + +struct ice_tunnel_entry { + enum ice_tunnel_type type; + u16 boost_addr; + u16 port; + u16 ref; + struct ice_boost_tcam_entry *boost_entry; + u8 valid; + u8 in_use; + u8 marked; +}; + +#define ICE_TUNNEL_MAX_ENTRIES 16 + +struct ice_tunnel_table { + struct ice_tunnel_entry tbl[ICE_TUNNEL_MAX_ENTRIES]; + u16 count; +}; + +struct ice_dvm_entry { + u16 boost_addr; + u16 enable; + struct ice_boost_tcam_entry *boost_entry; +}; + +#define ICE_DVM_MAX_ENTRIES 48 + +struct ice_dvm_table { + struct ice_dvm_entry tbl[ICE_DVM_MAX_ENTRIES]; + u16 count; +}; + +struct ice_pkg_es { + __le16 count; + __le16 offset; + struct ice_fv_word es[]; +}; + struct ice_es { u32 sid; u16 count; u16 fvw; u16 *ref_count; + u32 *mask_ena; struct list_head prof_map; struct ice_fv_word *t; struct mutex prof_map_lock; /* protect access to profiles list */ @@ -280,6 +748,37 @@ struct ice_ptg_ptype { u8 ptg; }; +#define ICE_MAX_TCAM_PER_PROFILE 32 +#define ICE_MAX_PTG_PER_PROFILE 32 + +struct ice_prof_map { + struct list_head list; + u64 profile_cookie; + u64 context; + u8 prof_id; + u8 ptg_cnt; + u8 ptg[ICE_MAX_PTG_PER_PROFILE]; + struct ice_ptype_attrib_info attr[ICE_MAX_PTG_PER_PROFILE]; +}; + +#define ICE_INVALID_TCAM 0xFFFF + +struct ice_tcam_inf { + u16 tcam_idx; + struct ice_ptype_attrib_info attr; + u8 ptg; + u8 prof_id; + u8 in_use; +}; + +struct ice_vsig_prof { + struct list_head list; + u64 profile_cookie; + u8 prof_id; + u8 tcam_count; + struct ice_tcam_inf tcam[ICE_MAX_TCAM_PER_PROFILE]; +}; + struct ice_vsig_entry { struct list_head prop_lst; struct ice_vsig_vsi *first_vsi; @@ -316,8 +815,8 @@ struct ice_xlt1 { #define ICE_PF_NUM_S 13 #define ICE_PF_NUM_M (0x07 << ICE_PF_NUM_S) #define ICE_VSIG_VALUE(vsig, pf_id) \ - (u16)((((u16)(vsig)) & 
ICE_VSIG_IDX_M) | \ - (((u16)(pf_id) << ICE_PF_NUM_S) & ICE_PF_NUM_M)) + ((u16)((((u16)(vsig)) & ICE_VSIG_IDX_M) | \ + (((u16)(pf_id) << ICE_PF_NUM_S) & ICE_PF_NUM_M))) #define ICE_DEFAULT_VSIG 0 /* XLT2 Table */ @@ -329,6 +828,32 @@ struct ice_xlt2 { u16 count; }; +/* Extraction sequence - list of match fields: + * protocol ID, offset, profile length + */ +union ice_match_fld { + struct { + u8 prot_id; + u8 offset; + u8 length; + u8 reserved; /* must be zero */ + } fld; + u32 val; +}; + +#define ICE_MATCH_LIST_SZ 20 +struct ice_match { + u8 count; + union ice_match_fld list[ICE_MATCH_LIST_SZ]; +} __packed; + +/* Profile ID Management */ +struct ice_prof_id_key { + __le16 flags; + u8 xlt1; + __le16 xlt2_cdid; +} __packed; + /* Keys are made up of two values, each one-half the size of the key. * For TCAM, the entire key is 80 bits wide (or 2, 40-bit wide values) */ @@ -343,8 +868,8 @@ struct ice_prof_tcam_entry { struct ice_prof_id_section { __le16 count; - struct ice_prof_tcam_entry entry[1]; -} __packed; + struct ice_prof_tcam_entry entry[]; +}; struct ice_prof_tcam { u32 sid; @@ -360,6 +885,21 @@ struct ice_prof_redir { u16 count; }; +struct ice_mask { + u16 mask; /* 16-bit mask */ + u16 idx; /* index */ + u16 ref; /* reference count */ + u8 in_use; /* non-zero if used */ +}; + +struct ice_masks { + struct mutex lock; /* lock to protect this structure */ + u16 first; /* first mask owned by the PF */ + u16 count; /* number of masks owned by the PF */ +#define ICE_PROF_MASK_COUNT 32 + struct ice_mask masks[ICE_PROF_MASK_COUNT]; +}; + /* Tables per block */ struct ice_blk_info { struct ice_xlt1 xlt1; @@ -367,8 +907,72 @@ struct ice_blk_info { struct ice_prof_tcam prof; struct ice_prof_redir prof_redir; struct ice_es es; + struct ice_masks masks; u8 overwrite; /* set to true to allow overwrite of table entries */ u8 is_list_init; }; +enum ice_chg_type { + ICE_TCAM_NONE = 0, + ICE_PTG_ES_ADD, + ICE_TCAM_ADD, + ICE_VSIG_ADD, + ICE_VSIG_REM, + ICE_VSI_MOVE, +}; + +struct ice_chs_chg { + struct list_head list_entry; + enum ice_chg_type type; + + u8 add_ptg; + u8 add_vsig; + u8 add_tcam_idx; + u8 add_prof; + u16 ptype; + u8 ptg; + u8 prof_id; + u16 vsi; + u16 vsig; + u16 orig_vsig; + u16 tcam_idx; + struct ice_ptype_attrib_info attr; +}; + +#define ICE_FLOW_PTYPE_MAX ICE_XLT1_CNT + +enum ice_prof_type { + ICE_PROF_NON_TUN = 0x1, + ICE_PROF_TUN_UDP = 0x2, + ICE_PROF_TUN_GRE = 0x4, + ICE_PROF_TUN_PPPOE = 0x8, + ICE_PROF_TUN_ALL = 0xE, + ICE_PROF_ALL = 0xFF, +}; + +/* Number of bits/bytes contained in meta init entry. Note, this should be a + * multiple of 32 bits. 
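+ * With ICE_META_INIT_BITS = 192 as defined below, ICE_META_INIT_DW_CNT
+ * works out to 192 / (sizeof(__le32) * BITS_PER_BYTE) = 192 / 32 = 6
+ * 32-bit words per entry, and the Double VLAN Mode flag
+ * (ICE_META_FLAGS_ST 123 + ICE_META_FLAG_VLAN_MODE 60 = bit 183) still
+ * falls within the 192-bit entry.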
+ */ +#define ICE_META_INIT_BITS 192 +#define ICE_META_INIT_DW_CNT (ICE_META_INIT_BITS / (sizeof(__le32) * \ + BITS_PER_BYTE)) + +/* The meta init Flag field starts at this bit */ +#define ICE_META_FLAGS_ST 123 + +/* The entry and bit to check for Double VLAN Mode (DVM) support */ +#define ICE_META_VLAN_MODE_ENTRY 0 +#define ICE_META_FLAG_VLAN_MODE 60 +#define ICE_META_VLAN_MODE_BIT (ICE_META_FLAGS_ST + \ + ICE_META_FLAG_VLAN_MODE) + +struct ice_meta_init_entry { + __le32 bm[ICE_META_INIT_DW_CNT]; +}; + +struct ice_meta_init_section { + __le16 count; + __le16 offset; + struct ice_meta_init_entry entry[1]; +}; #endif /* _ICE_FLEX_TYPE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_flow.c b/drivers/net/ethernet/intel/ice/ice_flow.c new file mode 100644 index 000000000000..74802e3d0754 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_flow.c @@ -0,0 +1,4222 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice_common.h" +#include "ice_flow.h" + + +/* Size of known protocol header fields */ +#define ICE_FLOW_FLD_SZ_ETH_TYPE 2 +#define ICE_FLOW_FLD_SZ_VLAN 2 +#define ICE_FLOW_FLD_SZ_IPV4_ADDR 4 +#define ICE_FLOW_FLD_SZ_IPV6_ADDR 16 +#define ICE_FLOW_FLD_SZ_IPV6_PRE32_ADDR 4 +#define ICE_FLOW_FLD_SZ_IPV6_PRE48_ADDR 6 +#define ICE_FLOW_FLD_SZ_IPV6_PRE64_ADDR 8 +#define ICE_FLOW_FLD_SZ_IPV4_ID 2 +#define ICE_FLOW_FLD_SZ_IPV6_ID 4 +#define ICE_FLOW_FLD_SZ_IP_DSCP 1 +#define ICE_FLOW_FLD_SZ_IP_TTL 1 +#define ICE_FLOW_FLD_SZ_IP_PROT 1 +#define ICE_FLOW_FLD_SZ_PORT 2 +#define ICE_FLOW_FLD_SZ_TCP_FLAGS 1 +#define ICE_FLOW_FLD_SZ_ICMP_TYPE 1 +#define ICE_FLOW_FLD_SZ_ICMP_CODE 1 +#define ICE_FLOW_FLD_SZ_ARP_OPER 2 +#define ICE_FLOW_FLD_SZ_GRE_KEYID 4 +#define ICE_FLOW_FLD_SZ_GTP_TEID 4 +#define ICE_FLOW_FLD_SZ_GTP_QFI 2 +#define ICE_FLOW_FLD_SZ_PPPOE_SESS_ID 2 +#define ICE_FLOW_FLD_SZ_PFCP_SEID 8 +#define ICE_FLOW_FLD_SZ_L2TPV3_SESS_ID 4 +#define ICE_FLOW_FLD_SZ_ESP_SPI 4 +#define ICE_FLOW_FLD_SZ_AH_SPI 4 +#define ICE_FLOW_FLD_SZ_NAT_T_ESP_SPI 4 +#define ICE_FLOW_FLD_SZ_VXLAN_VNI 4 +#define ICE_FLOW_FLD_SZ_ECPRI_TP0_PC_ID 2 + +/* Describe properties of a protocol header field */ +struct ice_flow_field_info { + enum ice_flow_seg_hdr hdr; + s16 off; /* Offset from start of a protocol header, in bits */ + u16 size; /* Size of fields in bits */ + u16 mask; /* 16-bit mask for field */ +}; + +#define ICE_FLOW_FLD_INFO(_hdr, _offset_bytes, _size_bytes) { \ + .hdr = _hdr, \ + .off = (_offset_bytes) * BITS_PER_BYTE, \ + .size = (_size_bytes) * BITS_PER_BYTE, \ + .mask = 0, \ +} + +#define ICE_FLOW_FLD_INFO_MSK(_hdr, _offset_bytes, _size_bytes, _mask) { \ + .hdr = _hdr, \ + .off = (_offset_bytes) * BITS_PER_BYTE, \ + .size = (_size_bytes) * BITS_PER_BYTE, \ + .mask = _mask, \ +} + +/* Table containing properties of supported protocol header fields */ +static const +struct ice_flow_field_info ice_flds_info[ICE_FLOW_FIELD_IDX_MAX] = { + /* Ether */ + /* ICE_FLOW_FIELD_IDX_ETH_DA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ETH, 0, ETH_ALEN), + /* ICE_FLOW_FIELD_IDX_ETH_SA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ETH, ETH_ALEN, ETH_ALEN), + /* ICE_FLOW_FIELD_IDX_S_VLAN */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_VLAN, 12, ICE_FLOW_FLD_SZ_VLAN), + /* ICE_FLOW_FIELD_IDX_C_VLAN */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_VLAN, 14, ICE_FLOW_FLD_SZ_VLAN), + /* ICE_FLOW_FIELD_IDX_ETH_TYPE */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ETH, 0, ICE_FLOW_FLD_SZ_ETH_TYPE), + /* IPv4 / IPv6 */ + /* ICE_FLOW_FIELD_IDX_IPV4_DSCP */ + ICE_FLOW_FLD_INFO_MSK(ICE_FLOW_SEG_HDR_IPV4, 0, 
ICE_FLOW_FLD_SZ_IP_DSCP, + 0x00fc), + /* ICE_FLOW_FIELD_IDX_IPV6_DSCP */ + ICE_FLOW_FLD_INFO_MSK(ICE_FLOW_SEG_HDR_IPV6, 0, ICE_FLOW_FLD_SZ_IP_DSCP, + 0x0ff0), + /* ICE_FLOW_FIELD_IDX_IPV4_TTL */ + ICE_FLOW_FLD_INFO_MSK(ICE_FLOW_SEG_HDR_NONE, 8, + ICE_FLOW_FLD_SZ_IP_TTL, 0xff00), + /* ICE_FLOW_FIELD_IDX_IPV4_PROT */ + ICE_FLOW_FLD_INFO_MSK(ICE_FLOW_SEG_HDR_NONE, 8, + ICE_FLOW_FLD_SZ_IP_PROT, 0x00ff), + /* ICE_FLOW_FIELD_IDX_IPV6_TTL */ + ICE_FLOW_FLD_INFO_MSK(ICE_FLOW_SEG_HDR_NONE, 6, + ICE_FLOW_FLD_SZ_IP_TTL, 0x00ff), + /* ICE_FLOW_FIELD_IDX_IPV6_PROT */ + ICE_FLOW_FLD_INFO_MSK(ICE_FLOW_SEG_HDR_NONE, 6, + ICE_FLOW_FLD_SZ_IP_PROT, 0xff00), + /* ICE_FLOW_FIELD_IDX_IPV4_SA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV4, 12, ICE_FLOW_FLD_SZ_IPV4_ADDR), + /* ICE_FLOW_FIELD_IDX_IPV4_DA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV4, 16, ICE_FLOW_FLD_SZ_IPV4_ADDR), + /* ICE_FLOW_FIELD_IDX_IPV6_SA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV6, 8, ICE_FLOW_FLD_SZ_IPV6_ADDR), + /* ICE_FLOW_FIELD_IDX_IPV6_DA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV6, 24, ICE_FLOW_FLD_SZ_IPV6_ADDR), + /* ICE_FLOW_FIELD_IDX_IPV4_FRAG */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV_FRAG, 4, + ICE_FLOW_FLD_SZ_IPV4_ID), + /* ICE_FLOW_FIELD_IDX_IPV6_FRAG */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV_FRAG, 4, + ICE_FLOW_FLD_SZ_IPV6_ID), + /* ICE_FLOW_FIELD_IDX_IPV6_PRE32_SA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV6, 8, + ICE_FLOW_FLD_SZ_IPV6_PRE32_ADDR), + /* ICE_FLOW_FIELD_IDX_IPV6_PRE32_DA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV6, 24, + ICE_FLOW_FLD_SZ_IPV6_PRE32_ADDR), + /* ICE_FLOW_FIELD_IDX_IPV6_PRE48_SA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV6, 8, + ICE_FLOW_FLD_SZ_IPV6_PRE48_ADDR), + /* ICE_FLOW_FIELD_IDX_IPV6_PRE48_DA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV6, 24, + ICE_FLOW_FLD_SZ_IPV6_PRE48_ADDR), + /* ICE_FLOW_FIELD_IDX_IPV6_PRE64_SA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV6, 8, + ICE_FLOW_FLD_SZ_IPV6_PRE64_ADDR), + /* ICE_FLOW_FIELD_IDX_IPV6_PRE64_DA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV6, 24, + ICE_FLOW_FLD_SZ_IPV6_PRE64_ADDR), + /* Transport */ + /* ICE_FLOW_FIELD_IDX_TCP_SRC_PORT */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_TCP, 0, ICE_FLOW_FLD_SZ_PORT), + /* ICE_FLOW_FIELD_IDX_TCP_DST_PORT */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_TCP, 2, ICE_FLOW_FLD_SZ_PORT), + /* ICE_FLOW_FIELD_IDX_UDP_SRC_PORT */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_UDP, 0, ICE_FLOW_FLD_SZ_PORT), + /* ICE_FLOW_FIELD_IDX_UDP_DST_PORT */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_UDP, 2, ICE_FLOW_FLD_SZ_PORT), + /* ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_SCTP, 0, ICE_FLOW_FLD_SZ_PORT), + /* ICE_FLOW_FIELD_IDX_SCTP_DST_PORT */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_SCTP, 2, ICE_FLOW_FLD_SZ_PORT), + /* ICE_FLOW_FIELD_IDX_TCP_FLAGS */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_TCP, 13, ICE_FLOW_FLD_SZ_TCP_FLAGS), + /* ARP */ + /* ICE_FLOW_FIELD_IDX_ARP_SIP */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ARP, 14, ICE_FLOW_FLD_SZ_IPV4_ADDR), + /* ICE_FLOW_FIELD_IDX_ARP_DIP */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ARP, 24, ICE_FLOW_FLD_SZ_IPV4_ADDR), + /* ICE_FLOW_FIELD_IDX_ARP_SHA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ARP, 8, ETH_ALEN), + /* ICE_FLOW_FIELD_IDX_ARP_DHA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ARP, 18, ETH_ALEN), + /* ICE_FLOW_FIELD_IDX_ARP_OP */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ARP, 6, ICE_FLOW_FLD_SZ_ARP_OPER), + /* ICMP */ + /* ICE_FLOW_FIELD_IDX_ICMP_TYPE */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ICMP, 0, ICE_FLOW_FLD_SZ_ICMP_TYPE), + /* ICE_FLOW_FIELD_IDX_ICMP_CODE */ + 
ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ICMP, 1, ICE_FLOW_FLD_SZ_ICMP_CODE), + /* GRE */ + /* ICE_FLOW_FIELD_IDX_GRE_KEYID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_GRE, 12, ICE_FLOW_FLD_SZ_GRE_KEYID), + /* GTP */ + /* ICE_FLOW_FIELD_IDX_GTPC_TEID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_GTPC_TEID, 12, + ICE_FLOW_FLD_SZ_GTP_TEID), + /* ICE_FLOW_FIELD_IDX_GTPU_IP_TEID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_GTPU_IP, 12, + ICE_FLOW_FLD_SZ_GTP_TEID), + /* ICE_FLOW_FIELD_IDX_GTPU_EH_TEID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_GTPU_EH, 12, + ICE_FLOW_FLD_SZ_GTP_TEID), + /* ICE_FLOW_FIELD_IDX_GTPU_EH_QFI */ + ICE_FLOW_FLD_INFO_MSK(ICE_FLOW_SEG_HDR_GTPU_EH, 22, + ICE_FLOW_FLD_SZ_GTP_QFI, 0x3f00), + /* ICE_FLOW_FIELD_IDX_GTPU_UP_TEID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_GTPU_UP, 12, + ICE_FLOW_FLD_SZ_GTP_TEID), + /* ICE_FLOW_FIELD_IDX_GTPU_DWN_TEID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_GTPU_DWN, 12, + ICE_FLOW_FLD_SZ_GTP_TEID), + /* PPPOE */ + /* ICE_FLOW_FIELD_IDX_PPPOE_SESS_ID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_PPPOE, 2, + ICE_FLOW_FLD_SZ_PPPOE_SESS_ID), + /* PFCP */ + /* ICE_FLOW_FIELD_IDX_PFCP_SEID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_PFCP_SESSION, 12, + ICE_FLOW_FLD_SZ_PFCP_SEID), + /* L2TPV3 */ + /* ICE_FLOW_FIELD_IDX_L2TPV3_SESS_ID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_L2TPV3, 0, + ICE_FLOW_FLD_SZ_L2TPV3_SESS_ID), + /* ESP */ + /* ICE_FLOW_FIELD_IDX_ESP_SPI */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ESP, 0, + ICE_FLOW_FLD_SZ_ESP_SPI), + /* AH */ + /* ICE_FLOW_FIELD_IDX_AH_SPI */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_AH, 4, + ICE_FLOW_FLD_SZ_AH_SPI), + /* NAT_T_ESP */ + /* ICE_FLOW_FIELD_IDX_NAT_T_ESP_SPI */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_NAT_T_ESP, 8, + ICE_FLOW_FLD_SZ_NAT_T_ESP_SPI), + /* ICE_FLOW_FIELD_IDX_VXLAN_VNI */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_VXLAN, 12, + ICE_FLOW_FLD_SZ_VXLAN_VNI), + /* ECPRI_TP0 */ + /* ICE_FLOW_FIELD_IDX_ECPRI_TP0_PC_ID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ECPRI_TP0, 4, + ICE_FLOW_FLD_SZ_ECPRI_TP0_PC_ID), + /* UDP_ECPRI_TP0 */ + /* ICE_FLOW_FIELD_IDX_UDP_ECPRI_TP0_PC_ID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_UDP_ECPRI_TP0, 12, + ICE_FLOW_FLD_SZ_ECPRI_TP0_PC_ID), +}; + +/* Bitmaps indicating relevant packet types for a particular protocol header + * + * Packet types for packets with an Outer/First/Single MAC header + */ +static const u32 ice_ptypes_mac_ofos[] = { + 0xFDC00846, 0xBFBF7F7E, 0xF70001DF, 0xFEFDFDFB, + 0x0000077E, 0x000003FF, 0x00000000, 0x00000000, + 0x00400000, 0x03FFF000, 0xFFFFFFE0, 0x00100707, + 0xFFFFFF00, 0x0000003F, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last MAC VLAN header */ +static const u32 ice_ptypes_macvlan_il[] = { + 0x00000000, 0xBC000000, 0x000001DF, 0xF0000000, + 0x0000077E, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outer/First/Single IPv4 header, does NOT + * include IPV4 other PTYPEs + */ +static const u32 ice_ptypes_ipv4_ofos[] = { + 0x1DC00000, 0x24000800, 0x00000000, 0x00000000, + 0x00000000, 0x00000155, 0x00000000, 0x00000000, + 0x00000000, 
0x000FC000, 0x000002A0, 0x00100000, + 0x00001500, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outer/First/Single IPv4 header, includes + * IPV4 other PTYPEs + */ +static const u32 ice_ptypes_ipv4_ofos_all[] = { + 0x1DC00000, 0x24000800, 0x00000000, 0x00000000, + 0x00000000, 0x00000155, 0x00000000, 0x00000000, + 0x00000000, 0x000FC000, 0x83E0FAA0, 0x00000101, + 0x03FFD500, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last IPv4 header */ +static const u32 ice_ptypes_ipv4_il[] = { + 0xE0000000, 0xB807700E, 0x80000003, 0xE01DC03B, + 0x0000000E, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x001FF800, 0x00100000, + 0xFC0FC000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outer/First/Single IPv6 header, does NOT + * include IVP6 other PTYPEs + */ +static const u32 ice_ptypes_ipv6_ofos[] = { + 0x00000000, 0x00000000, 0x77000000, 0x10002000, + 0x00000000, 0x000002AA, 0x00000000, 0x00000000, + 0x00000000, 0x03F00000, 0x00000540, 0x00000000, + 0x00002A00, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outer/First/Single IPv6 header, includes + * IPV6 other PTYPEs + */ +static const u32 ice_ptypes_ipv6_ofos_all[] = { + 0x00000000, 0x00000000, 0x77000000, 0x10002000, + 0x00000000, 0x000002AA, 0x00000000, 0x00000000, + 0x00000000, 0x03F00000, 0x7C1F0540, 0x00000206, + 0xFC002A00, 0x0000003F, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last IPv6 header */ +static const u32 ice_ptypes_ipv6_il[] = { + 0x00000000, 0x03B80770, 0x000001DC, 0x0EE00000, + 0x00000770, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x7FE00000, 0x00000000, + 0x03F00000, 0x0000003F, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outer/First/Single IPv4 header - no L4 */ +static const u32 ice_ptypes_ipv4_ofos_no_l4[] = { + 0x10C00000, 0x04000800, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet 
types for packets with an Innermost/Last IPv4 header - no L4 */ +static const u32 ice_ptypes_ipv4_il_no_l4[] = { + 0x60000000, 0x18043008, 0x80000002, 0x6010c021, + 0x00000008, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outer/First/Single IPv6 header - no L4 */ +static const u32 ice_ptypes_ipv6_ofos_no_l4[] = { + 0x00000000, 0x00000000, 0x43000000, 0x10002000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last IPv6 header - no L4 */ +static const u32 ice_ptypes_ipv6_il_no_l4[] = { + 0x00000000, 0x02180430, 0x0000010c, 0x086010c0, + 0x00000430, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outermost/First ARP header */ +static const u32 ice_ptypes_arp_of[] = { + 0x00000800, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* UDP Packet types for non-tunneled packets or tunneled + * packets with inner UDP. 
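+ *
+ * As with the other ptype bitmaps above, the 32 words appear to form a
+ * 1024-bit map indexed by PTYPE number: PTYPE n is marked by bit (n % 32)
+ * of word (n / 32). For example, ice_ptypes_arp_of above carries only
+ * 0x00000800 (bit 11) in word 0, matching ICE_MAC_ARP (11), and word 0 of
+ * ice_ptypes_icmp_of below is 0x10000000 (bit 28), matching
+ * ICE_PTYPE_IPV4_ICMP_PAY (28).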
+ */ +static const u32 ice_ptypes_udp_il[] = { + 0x81000000, 0x20204040, 0x04000010, 0x80810102, + 0x00000040, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00410000, 0x908427E0, 0x00100007, + 0x10410000, 0x00000004, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last TCP header */ +static const u32 ice_ptypes_tcp_il[] = { + 0x04000000, 0x80810102, 0x10000040, 0x02040408, + 0x00000102, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00820000, 0x21084000, 0x00000000, + 0x20820000, 0x00000008, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last SCTP header */ +static const u32 ice_ptypes_sctp_il[] = { + 0x08000000, 0x01020204, 0x20000081, 0x04080810, + 0x00000204, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x01040000, 0x00000000, 0x00000000, + 0x41040000, 0x00000010, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outermost/First ICMP header */ +static const u32 ice_ptypes_icmp_of[] = { + 0x10000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last ICMP header */ +static const u32 ice_ptypes_icmp_il[] = { + 0x00000000, 0x02040408, 0x40000102, 0x08101020, + 0x00000408, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x42108000, 0x00000000, + 0x82080000, 0x00000020, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outermost/First GRE header */ +static const u32 ice_ptypes_gre_of[] = { + 0x00000000, 0xBFBF7800, 0x000001DF, 0xFEFDE000, + 0x0000017E, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last MAC header */ +static const u32 ice_ptypes_mac_il[] = { + 0x00000000, 0x20000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* 
Packet types for GTPC */ +static const u32 ice_ptypes_gtpc[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x000001E0, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for VXLAN with VNI */ +static const u32 ice_ptypes_vxlan_vni[] = { + 0x00000000, 0xBFBFF800, 0x00EFDFDF, 0xFEFDE000, + 0x03BF7F7E, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for GTPC with TEID */ +static const u32 ice_ptypes_gtpc_tid[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000060, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for GTPU */ +static const struct ice_ptype_attributes ice_attr_gtpu_session[] = { + { ICE_MAC_IPV4_GTPU_IPV4_FRAG, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV4_GTPU_IPV4_PAY, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV4_GTPU_IPV4_UDP_PAY, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV4_GTPU_IPV4_TCP, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV4_GTPU_IPV4_ICMP, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV4_FRAG, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV4_PAY, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV4_UDP_PAY, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV4_TCP, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV4_ICMP, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV4_GTPU_IPV6_FRAG, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV4_GTPU_IPV6_PAY, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV4_GTPU_IPV6_UDP_PAY, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV4_GTPU_IPV6_TCP, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV4_GTPU_IPV6_ICMPV6, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV6_FRAG, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV6_PAY, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV6_UDP_PAY, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV6_TCP, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV6_ICMPV6, ICE_PTYPE_ATTR_GTP_SESSION }, +}; + +static const struct ice_ptype_attributes ice_attr_gtpu_eh[] = { + { ICE_MAC_IPV4_GTPU_IPV4_FRAG, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV4_GTPU_IPV4_PAY, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV4_GTPU_IPV4_UDP_PAY, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV4_GTPU_IPV4_TCP, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV4_GTPU_IPV4_ICMP, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV4_FRAG, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV4_PAY, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV4_UDP_PAY, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV4_TCP, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV4_ICMP, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV4_GTPU_IPV6_FRAG, 
ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV4_GTPU_IPV6_PAY, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV4_GTPU_IPV6_UDP_PAY, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV4_GTPU_IPV6_TCP, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV4_GTPU_IPV6_ICMPV6, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV6_FRAG, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV6_PAY, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV6_UDP_PAY, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV6_TCP, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV6_ICMPV6, ICE_PTYPE_ATTR_GTP_PDU_EH }, +}; + +static const struct ice_ptype_attributes ice_attr_gtpu_down[] = { + { ICE_MAC_IPV4_GTPU_IPV4_FRAG, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV4_GTPU_IPV4_PAY, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV4_GTPU_IPV4_UDP_PAY, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV4_GTPU_IPV4_TCP, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV4_GTPU_IPV4_ICMP, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_FRAG, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_PAY, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_UDP_PAY, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_TCP, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_ICMP, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_FRAG, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_PAY, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_UDP_PAY, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_TCP, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_ICMPV6, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_FRAG, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_PAY, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_UDP_PAY, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_TCP, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_ICMPV6, ICE_PTYPE_ATTR_GTP_DOWNLINK }, +}; + +static const struct ice_ptype_attributes ice_attr_gtpu_up[] = { + { ICE_MAC_IPV4_GTPU_IPV4_FRAG, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV4_GTPU_IPV4_PAY, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV4_GTPU_IPV4_UDP_PAY, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV4_GTPU_IPV4_TCP, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV4_GTPU_IPV4_ICMP, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_FRAG, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_PAY, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_UDP_PAY, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_TCP, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_ICMP, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_FRAG, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_PAY, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_UDP_PAY, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_TCP, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_ICMPV6, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_FRAG, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_PAY, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_UDP_PAY, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_TCP, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_ICMPV6, ICE_PTYPE_ATTR_GTP_UPLINK }, +}; + +static const u32 ice_ptypes_gtpu[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x7FFFFE00, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 
0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for pppoe */ +static const u32 ice_ptypes_pppoe[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x03ffe000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with PFCP NODE header */ +static const u32 ice_ptypes_pfcp_node[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x80000000, 0x00000002, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with PFCP SESSION header */ +static const u32 ice_ptypes_pfcp_session[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000005, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for l2tpv3 */ +static const u32 ice_ptypes_l2tpv3[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000300, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for esp */ +static const u32 ice_ptypes_esp[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000003, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for ah */ +static const u32 ice_ptypes_ah[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x0000000C, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with NAT_T ESP header */ +static const u32 ice_ptypes_nat_t_esp[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000030, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 
0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +static const u32 ice_ptypes_mac_non_ip_ofos[] = { + 0x00000846, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00400000, 0x03FFF000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +static const u32 ice_ptypes_gtpu_no_ip[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000600, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +static const u32 ice_ptypes_ecpri_tp0[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000400, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +static const u32 ice_ptypes_udp_ecpri_tp0[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00100000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +static const u32 ice_ptypes_l2tpv2[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFF00, 0x0000003F, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +static const u32 ice_ptypes_ppp[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFF000, 0x0000003F, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +static const u32 ice_ptypes_ipv4_frag[] = { + 0x00400000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +static const u32 ice_ptypes_ipv6_frag[] = { + 0x00000000, 0x00000000, 0x01000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 
0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Manage parameters and info. used during the creation of a flow profile */ +struct ice_flow_prof_params { + enum ice_block blk; + u16 entry_length; /* # of bytes formatted entry will require */ + u8 es_cnt; + struct ice_flow_prof *prof; + + /* For ACL, the es[0] will have the data of ICE_RX_MDID_PKT_FLAGS_15_0 + * This will give us the direction flags. + */ + struct ice_fv_word es[ICE_MAX_FV_WORDS]; + /* attributes can be used to add attributes to a particular PTYPE */ + const struct ice_ptype_attributes *attr; + u16 attr_cnt; + + u16 mask[ICE_MAX_FV_WORDS]; + DECLARE_BITMAP(ptypes, ICE_FLOW_PTYPE_MAX); +}; + +#define ICE_FLOW_RSS_HDRS_INNER_MASK \ + (ICE_FLOW_SEG_HDR_PPPOE | ICE_FLOW_SEG_HDR_GTPC | \ + ICE_FLOW_SEG_HDR_GTPC_TEID | ICE_FLOW_SEG_HDR_GTPU | \ + ICE_FLOW_SEG_HDR_PFCP_SESSION | ICE_FLOW_SEG_HDR_L2TPV3 | \ + ICE_FLOW_SEG_HDR_ESP | ICE_FLOW_SEG_HDR_AH | \ + ICE_FLOW_SEG_HDR_NAT_T_ESP | ICE_FLOW_SEG_HDR_GTPU_NON_IP | \ + ICE_FLOW_SEG_HDR_ECPRI_TP0 | ICE_FLOW_SEG_HDR_UDP_ECPRI_TP0 | \ + ICE_FLOW_SEG_HDR_L2TPV2 | ICE_FLOW_SEG_HDR_PPP) + +#define ICE_FLOW_SEG_HDRS_L3_MASK \ + (ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV6 | \ + ICE_FLOW_SEG_HDR_ARP) +#define ICE_FLOW_SEG_HDRS_L4_MASK \ + (ICE_FLOW_SEG_HDR_ICMP | ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_UDP | \ + ICE_FLOW_SEG_HDR_SCTP) +/* mask for L4 protocols that are NOT part of IPV4/6 OTHER PTYPE groups */ +#define ICE_FLOW_SEG_HDRS_L4_MASK_NO_OTHER \ + (ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_SCTP) + +/** + * ice_flow_val_hdrs - validates packet segments for valid protocol headers + * @segs: array of one or more packet segments that describe the flow + * @segs_cnt: number of packet segments provided + */ +static enum ice_status +ice_flow_val_hdrs(struct ice_flow_seg_info *segs, u8 segs_cnt) +{ + u8 i; + + for (i = 0; i < segs_cnt; i++) { + /* Multiple L3 headers */ + if (segs[i].hdrs & ICE_FLOW_SEG_HDRS_L3_MASK && + !is_power_of_2(segs[i].hdrs & ICE_FLOW_SEG_HDRS_L3_MASK)) + return ICE_ERR_PARAM; + + /* Multiple L4 headers */ + if (segs[i].hdrs & ICE_FLOW_SEG_HDRS_L4_MASK && + !is_power_of_2(segs[i].hdrs & ICE_FLOW_SEG_HDRS_L4_MASK)) + return ICE_ERR_PARAM; + } + + return 0; +} + +/* Sizes of fixed known protocol headers without header options */ +#define ICE_FLOW_PROT_HDR_SZ_MAC 14 +#define ICE_FLOW_PROT_HDR_SZ_MAC_VLAN (ICE_FLOW_PROT_HDR_SZ_MAC + 2) +#define ICE_FLOW_PROT_HDR_SZ_IPV4 20 +#define ICE_FLOW_PROT_HDR_SZ_IPV6 40 +#define ICE_FLOW_PROT_HDR_SZ_ARP 28 +#define ICE_FLOW_PROT_HDR_SZ_ICMP 8 +#define ICE_FLOW_PROT_HDR_SZ_TCP 20 +#define ICE_FLOW_PROT_HDR_SZ_UDP 8 +#define ICE_FLOW_PROT_HDR_SZ_SCTP 12 + +/** + * ice_flow_calc_seg_sz - calculates size of a packet segment based on headers + * @params: information about the flow to be processed + * @seg: index of packet segment whose header size is to be determined + */ +static u16 ice_flow_calc_seg_sz(struct ice_flow_prof_params *params, u8 seg) +{ + u16 sz; + + /* L2 headers */ + sz = (params->prof->segs[seg].hdrs & ICE_FLOW_SEG_HDR_VLAN) ? 
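For illustration: each ice_ptypes_* table above is 32 consecutive 32-bit words, i.e. a 1024-bit bitmap indexed by hardware RX packet type, and ice_flow_proc_seg_hdrs() below casts the tables to unsigned long and ANDs them into the params->ptypes bitmap declared in ice_flow_prof_params to narrow the candidate packet-type set. A minimal host-side sketch of that interpretation (standalone C; the table word and ptype values are taken from ice_ptypes_udp_il above purely as an example):

#include <stdint.h>
#include <stdio.h>

/* Minimal sketch: treat a 32-word ptype table as a 1024-bit bitmap.
 * Word 0 of ice_ptypes_udp_il above is 0x81000000, so ptypes 24 and 31
 * belong to the "innermost/last UDP" set while ptype 25 does not.
 */
static int ptype_bit_is_set(const uint32_t *tbl, unsigned int ptype)
{
	return (tbl[ptype / 32] >> (ptype % 32)) & 1;
}

int main(void)
{
	const uint32_t udp_il_word0[32] = { 0x81000000 };

	printf("ptype 24: %d\n", ptype_bit_is_set(udp_il_word0, 24)); /* 1 */
	printf("ptype 25: %d\n", ptype_bit_is_set(udp_il_word0, 25)); /* 0 */
	printf("ptype 31: %d\n", ptype_bit_is_set(udp_il_word0, 31)); /* 1 */
	return 0;
}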
+ ICE_FLOW_PROT_HDR_SZ_MAC_VLAN : ICE_FLOW_PROT_HDR_SZ_MAC; + + /* L3 headers */ + if (params->prof->segs[seg].hdrs & ICE_FLOW_SEG_HDR_IPV4) + sz += ICE_FLOW_PROT_HDR_SZ_IPV4; + else if (params->prof->segs[seg].hdrs & ICE_FLOW_SEG_HDR_IPV6) + sz += ICE_FLOW_PROT_HDR_SZ_IPV6; + else if (params->prof->segs[seg].hdrs & ICE_FLOW_SEG_HDR_ARP) + sz += ICE_FLOW_PROT_HDR_SZ_ARP; + else if (params->prof->segs[seg].hdrs & ICE_FLOW_SEG_HDRS_L4_MASK) + /* A L3 header is required if L4 is specified */ + return 0; + + /* L4 headers */ + if (params->prof->segs[seg].hdrs & ICE_FLOW_SEG_HDR_ICMP) + sz += ICE_FLOW_PROT_HDR_SZ_ICMP; + else if (params->prof->segs[seg].hdrs & ICE_FLOW_SEG_HDR_TCP) + sz += ICE_FLOW_PROT_HDR_SZ_TCP; + else if (params->prof->segs[seg].hdrs & ICE_FLOW_SEG_HDR_UDP) + sz += ICE_FLOW_PROT_HDR_SZ_UDP; + else if (params->prof->segs[seg].hdrs & ICE_FLOW_SEG_HDR_SCTP) + sz += ICE_FLOW_PROT_HDR_SZ_SCTP; + + return sz; +} + +/** + * ice_flow_proc_seg_hdrs - process protocol headers present in pkt segments + * @params: information about the flow to be processed + * + * This function identifies the packet types associated with the protocol + * headers being present in packet segments of the specified flow profile. + */ +static enum ice_status +ice_flow_proc_seg_hdrs(struct ice_flow_prof_params *params) +{ + struct ice_flow_prof *prof; + u8 i; + + memset(params->ptypes, 0xff, sizeof(params->ptypes)); + + prof = params->prof; + + for (i = 0; i < params->prof->segs_cnt; i++) { + const unsigned long *src; + u32 hdrs; + + hdrs = prof->segs[i].hdrs; + + if (hdrs & ICE_FLOW_SEG_HDR_ETH) { + src = !i ? (const unsigned long *)ice_ptypes_mac_ofos : + (const unsigned long *)ice_ptypes_mac_il; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + + if (i && hdrs & ICE_FLOW_SEG_HDR_VLAN) { + src = (const unsigned long *)ice_ptypes_macvlan_il; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + + if (!i && hdrs & ICE_FLOW_SEG_HDR_ARP) { + bitmap_and(params->ptypes, params->ptypes, + (const unsigned long *)ice_ptypes_arp_of, + ICE_FLOW_PTYPE_MAX); + } + + if (hdrs & ICE_FLOW_SEG_HDR_ECPRI_TP0) { + src = (const unsigned long *)ice_ptypes_ecpri_tp0; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + + if ((hdrs & ICE_FLOW_SEG_HDR_IPV4) && + (hdrs & ICE_FLOW_SEG_HDR_IPV_OTHER)) { + src = i ? + (const unsigned long *)ice_ptypes_ipv4_il : + (const unsigned long *)ice_ptypes_ipv4_ofos_all; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if ((hdrs & ICE_FLOW_SEG_HDR_IPV6) && + (hdrs & ICE_FLOW_SEG_HDR_IPV_OTHER)) { + src = i ? + (const unsigned long *)ice_ptypes_ipv6_il : + (const unsigned long *)ice_ptypes_ipv6_ofos_all; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if ((hdrs & ICE_FLOW_SEG_HDR_IPV4) && + (hdrs & ICE_FLOW_SEG_HDR_IPV_FRAG)) { + src = (const unsigned long *)ice_ptypes_ipv4_frag; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if ((hdrs & ICE_FLOW_SEG_HDR_IPV6) && + (hdrs & ICE_FLOW_SEG_HDR_IPV_FRAG)) { + src = (const unsigned long *)ice_ptypes_ipv6_frag; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if ((hdrs & ICE_FLOW_SEG_HDR_IPV4) && + !(hdrs & ICE_FLOW_SEG_HDRS_L4_MASK_NO_OTHER)) { + src = !i ? 
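ice_flow_calc_seg_sz() above simply accumulates the fixed ICE_FLOW_PROT_HDR_SZ_* sizes selected by the segment's header flags, and returns 0 when an L4 flag is present without an L3 header. A minimal standalone sketch of the same arithmetic (the DEMO_* flag bits are made up for this example and are not the driver's ICE_FLOW_SEG_HDR_* values):

#include <stdio.h>

/* Sketch of the ice_flow_calc_seg_sz() arithmetic using the header sizes
 * defined above; demo flag bits only.
 */
#define DEMO_HDR_VLAN 0x1
#define DEMO_HDR_IPV4 0x2
#define DEMO_HDR_UDP  0x4

static unsigned int demo_seg_sz(unsigned int hdrs)
{
	unsigned int sz = (hdrs & DEMO_HDR_VLAN) ? 16 : 14; /* MAC[+VLAN] */

	if (hdrs & DEMO_HDR_IPV4)
		sz += 20;               /* IPv4 without options */
	else if (hdrs & DEMO_HDR_UDP)
		return 0;               /* L4 without an L3 header */

	if (hdrs & DEMO_HDR_UDP)
		sz += 8;                /* UDP */

	return sz;
}

int main(void)
{
	printf("MAC/IPv4/UDP: %u bytes\n", demo_seg_sz(DEMO_HDR_IPV4 | DEMO_HDR_UDP)); /* 42 */
	printf("UDP, no L3:   %u bytes\n", demo_seg_sz(DEMO_HDR_UDP));                 /* 0  */
	return 0;
}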
(const unsigned long *)ice_ptypes_ipv4_ofos_no_l4 : + (const unsigned long *)ice_ptypes_ipv4_il_no_l4; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_IPV4) { + src = !i ? (const unsigned long *)ice_ptypes_ipv4_ofos : + (const unsigned long *)ice_ptypes_ipv4_il; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if ((hdrs & ICE_FLOW_SEG_HDR_IPV6) && + !(hdrs & ICE_FLOW_SEG_HDRS_L4_MASK_NO_OTHER)) { + src = !i ? (const unsigned long *)ice_ptypes_ipv6_ofos_no_l4 : + (const unsigned long *)ice_ptypes_ipv6_il_no_l4; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_IPV6) { + src = !i ? (const unsigned long *)ice_ptypes_ipv6_ofos : + (const unsigned long *)ice_ptypes_ipv6_il; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + + if (hdrs & ICE_FLOW_SEG_HDR_ETH_NON_IP) { + src = (const unsigned long *)ice_ptypes_mac_non_ip_ofos; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_PPPOE) { + src = (const unsigned long *)ice_ptypes_pppoe; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else { + src = (const unsigned long *)ice_ptypes_pppoe; + bitmap_andnot(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + if (hdrs & ICE_FLOW_SEG_HDR_UDP) { + src = (const unsigned long *)ice_ptypes_udp_il; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_TCP) { + bitmap_and(params->ptypes, params->ptypes, + (const unsigned long *)ice_ptypes_tcp_il, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_SCTP) { + src = (const unsigned long *)ice_ptypes_sctp_il; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + + if (hdrs & ICE_FLOW_SEG_HDR_ICMP) { + src = !i ? 
(const unsigned long *)ice_ptypes_icmp_of : + (const unsigned long *)ice_ptypes_icmp_il; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_GRE) { + if (!i) { + src = (const unsigned long *)ice_ptypes_gre_of; + bitmap_and(params->ptypes, params->ptypes, + src, ICE_FLOW_PTYPE_MAX); + } + } else if (hdrs & ICE_FLOW_SEG_HDR_GTPC) { + src = (const unsigned long *)ice_ptypes_gtpc; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_GTPC_TEID) { + src = (const unsigned long *)ice_ptypes_gtpc_tid; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_GTPU_NON_IP) { + src = (const unsigned long *)ice_ptypes_gtpu_no_ip; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_GTPU_DWN) { + src = (const unsigned long *)ice_ptypes_gtpu; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + + /* Attributes for GTP packet with downlink */ + params->attr = ice_attr_gtpu_down; + params->attr_cnt = ARRAY_SIZE(ice_attr_gtpu_down); + } else if (hdrs & ICE_FLOW_SEG_HDR_GTPU_UP) { + src = (const unsigned long *)ice_ptypes_gtpu; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + + /* Attributes for GTP packet with uplink */ + params->attr = ice_attr_gtpu_up; + params->attr_cnt = ARRAY_SIZE(ice_attr_gtpu_up); + } else if (hdrs & ICE_FLOW_SEG_HDR_GTPU_EH) { + src = (const unsigned long *)ice_ptypes_gtpu; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + + /* Attributes for GTP packet with Extension Header */ + params->attr = ice_attr_gtpu_eh; + params->attr_cnt = ARRAY_SIZE(ice_attr_gtpu_eh); + } else if (hdrs & ICE_FLOW_SEG_HDR_GTPU_IP) { + src = (const unsigned long *)ice_ptypes_gtpu; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + + /* Attributes for GTP packet without Extension Header */ + params->attr = ice_attr_gtpu_session; + params->attr_cnt = ARRAY_SIZE(ice_attr_gtpu_session); + } else if (hdrs & ICE_FLOW_SEG_HDR_L2TPV2) { + src = (const unsigned long *)ice_ptypes_l2tpv2; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_L2TPV3) { + src = (const unsigned long *)ice_ptypes_l2tpv3; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_ESP) { + src = (const unsigned long *)ice_ptypes_esp; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_AH) { + src = (const unsigned long *)ice_ptypes_ah; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_NAT_T_ESP) { + src = (const unsigned long *)ice_ptypes_nat_t_esp; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_VXLAN) { + src = (const unsigned long *)ice_ptypes_vxlan_vni; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_UDP_ECPRI_TP0) { + src = (const unsigned long *)ice_ptypes_udp_ecpri_tp0; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + + if (hdrs & ICE_FLOW_SEG_HDR_PPP) { + src = (const unsigned long *)ice_ptypes_ppp; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + + if (hdrs & ICE_FLOW_SEG_HDR_PFCP) { + if (hdrs & 
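The if/else ladder above shows that a single ICE_FLOW_SEG_HDR_GTPU_* flag selects both the ice_ptypes_gtpu bitmap and one of the ice_attr_gtpu_* attribute arrays, which later tag the matched ptypes as session, extension-header, uplink or downlink GTP traffic. A hedged sketch of how a caller might describe such a flow before handing the segments to ice_flow_add_prof() (kernel context; the exact segment layout a given block accepts may differ, this only shows the flag combination the ladder reacts to):

/* Illustration only: outer Ethernet/IPv4/UDP, inner GTP-U with PDU
 * session extension header over IPv4/UDP.  With these flags the inner
 * segment is narrowed by ice_ptypes_gtpu and tagged via ice_attr_gtpu_eh.
 */
static void demo_fill_gtpu_eh_segs(struct ice_flow_seg_info *segs)
{
	segs[0].hdrs = ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_IPV4 |
		       ICE_FLOW_SEG_HDR_UDP;
	segs[1].hdrs = ICE_FLOW_SEG_HDR_GTPU_EH | ICE_FLOW_SEG_HDR_IPV4 |
		       ICE_FLOW_SEG_HDR_UDP;
}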
ICE_FLOW_SEG_HDR_PFCP_NODE) + src = + (const unsigned long *)ice_ptypes_pfcp_node; + else + src = + (const unsigned long *)ice_ptypes_pfcp_session; + + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else { + src = (const unsigned long *)ice_ptypes_pfcp_node; + bitmap_andnot(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + + src = (const unsigned long *)ice_ptypes_pfcp_session; + bitmap_andnot(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + } + + return 0; +} + +/** + * ice_flow_xtract_pkt_flags - Create an extr sequence entry for packet flags + * @hw: pointer to the HW struct + * @params: information about the flow to be processed + * @flags: The value of pkt_flags[x:x] in Rx/Tx MDID metadata. + * + * This function will allocate an extraction sequence entries for a DWORD size + * chunk of the packet flags. + */ +static enum ice_status +ice_flow_xtract_pkt_flags(struct ice_hw *hw, + struct ice_flow_prof_params *params, + enum ice_flex_mdid_pkt_flags flags) +{ + u8 fv_words = hw->blk[params->blk].es.fvw; + u8 idx; + + /* Make sure the number of extraction sequence entries required does not + * exceed the block's capacity. + */ + if (params->es_cnt >= fv_words) + return ICE_ERR_MAX_LIMIT; + + /* some blocks require a reversed field vector layout */ + if (hw->blk[params->blk].es.reverse) + idx = fv_words - params->es_cnt - 1; + else + idx = params->es_cnt; + + params->es[idx].prot_id = ICE_PROT_META_ID; + params->es[idx].off = flags; + params->es_cnt++; + + return 0; +} + +/** + * ice_flow_xtract_fld - Create an extraction sequence entry for the given field + * @hw: pointer to the HW struct + * @params: information about the flow to be processed + * @seg: packet segment index of the field to be extracted + * @fld: ID of field to be extracted + * @match: bitfield of all fields + * + * This function determines the protocol ID, offset, and size of the given + * field. It then allocates one or more extraction sequence entries for the + * given field, and fill the entries with protocol ID and offset information. + */ +static enum ice_status +ice_flow_xtract_fld(struct ice_hw *hw, struct ice_flow_prof_params *params, + u8 seg, enum ice_flow_field fld, u64 match) +{ + enum ice_flow_field sib = ICE_FLOW_FIELD_IDX_MAX; + enum ice_prot_id prot_id = ICE_PROT_ID_INVAL; + u8 fv_words = hw->blk[params->blk].es.fvw; + struct ice_flow_fld_info *flds; + u16 cnt, ese_bits, i; + u16 sib_mask = 0; + u16 mask; + u16 off; + + flds = params->prof->segs[seg].fields; + + switch (fld) { + case ICE_FLOW_FIELD_IDX_ETH_DA: + case ICE_FLOW_FIELD_IDX_ETH_SA: + case ICE_FLOW_FIELD_IDX_S_VLAN: + case ICE_FLOW_FIELD_IDX_C_VLAN: + prot_id = seg == 0 ? ICE_PROT_MAC_OF_OR_S : ICE_PROT_MAC_IL; + break; + case ICE_FLOW_FIELD_IDX_ETH_TYPE: + prot_id = seg == 0 ? ICE_PROT_ETYPE_OL : ICE_PROT_ETYPE_IL; + break; + case ICE_FLOW_FIELD_IDX_IPV4_DSCP: + prot_id = seg == 0 ? ICE_PROT_IPV4_OF_OR_S : ICE_PROT_IPV4_IL; + break; + case ICE_FLOW_FIELD_IDX_IPV6_DSCP: + prot_id = seg == 0 ? ICE_PROT_IPV6_OF_OR_S : ICE_PROT_IPV6_IL; + break; + case ICE_FLOW_FIELD_IDX_IPV4_TTL: + case ICE_FLOW_FIELD_IDX_IPV4_PROT: + prot_id = seg == 0 ? ICE_PROT_IPV4_OF_OR_S : ICE_PROT_IPV4_IL; + + /* TTL and PROT share the same extraction seq. entry. + * Each is considered a sibling to the other in terms of sharing + * the same extraction sequence entry. 
+ */ + if (fld == ICE_FLOW_FIELD_IDX_IPV4_TTL) + sib = ICE_FLOW_FIELD_IDX_IPV4_PROT; + else + sib = ICE_FLOW_FIELD_IDX_IPV4_TTL; + + /* If the sibling field is also included, that field's + * mask needs to be included. + */ + if (match & BIT(sib)) + sib_mask = ice_flds_info[sib].mask; + break; + case ICE_FLOW_FIELD_IDX_IPV6_TTL: + case ICE_FLOW_FIELD_IDX_IPV6_PROT: + prot_id = seg == 0 ? ICE_PROT_IPV6_OF_OR_S : ICE_PROT_IPV6_IL; + + /* TTL and PROT share the same extraction seq. entry. + * Each is considered a sibling to the other in terms of sharing + * the same extraction sequence entry. + */ + if (fld == ICE_FLOW_FIELD_IDX_IPV6_TTL) + sib = ICE_FLOW_FIELD_IDX_IPV6_PROT; + else + sib = ICE_FLOW_FIELD_IDX_IPV6_TTL; + + /* If the sibling field is also included, that field's + * mask needs to be included. + */ + if (match & BIT(sib)) + sib_mask = ice_flds_info[sib].mask; + break; + case ICE_FLOW_FIELD_IDX_IPV4_SA: + case ICE_FLOW_FIELD_IDX_IPV4_DA: + prot_id = seg == 0 ? ICE_PROT_IPV4_OF_OR_S : ICE_PROT_IPV4_IL; + break; + case ICE_FLOW_FIELD_IDX_IPV4_ID: + prot_id = ICE_PROT_IPV4_OF_OR_S; + break; + case ICE_FLOW_FIELD_IDX_IPV6_SA: + case ICE_FLOW_FIELD_IDX_IPV6_DA: + case ICE_FLOW_FIELD_IDX_IPV6_PRE32_SA: + case ICE_FLOW_FIELD_IDX_IPV6_PRE32_DA: + case ICE_FLOW_FIELD_IDX_IPV6_PRE48_SA: + case ICE_FLOW_FIELD_IDX_IPV6_PRE48_DA: + case ICE_FLOW_FIELD_IDX_IPV6_PRE64_SA: + case ICE_FLOW_FIELD_IDX_IPV6_PRE64_DA: + prot_id = seg == 0 ? ICE_PROT_IPV6_OF_OR_S : ICE_PROT_IPV6_IL; + break; + case ICE_FLOW_FIELD_IDX_IPV6_ID: + prot_id = ICE_PROT_IPV6_FRAG; + break; + case ICE_FLOW_FIELD_IDX_TCP_SRC_PORT: + case ICE_FLOW_FIELD_IDX_TCP_DST_PORT: + case ICE_FLOW_FIELD_IDX_TCP_FLAGS: + prot_id = ICE_PROT_TCP_IL; + break; + case ICE_FLOW_FIELD_IDX_UDP_SRC_PORT: + case ICE_FLOW_FIELD_IDX_UDP_DST_PORT: + prot_id = ICE_PROT_UDP_IL_OR_S; + break; + case ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT: + case ICE_FLOW_FIELD_IDX_SCTP_DST_PORT: + prot_id = ICE_PROT_SCTP_IL; + break; + case ICE_FLOW_FIELD_IDX_VXLAN_VNI: + case ICE_FLOW_FIELD_IDX_GTPC_TEID: + case ICE_FLOW_FIELD_IDX_GTPU_IP_TEID: + case ICE_FLOW_FIELD_IDX_GTPU_UP_TEID: + case ICE_FLOW_FIELD_IDX_GTPU_DWN_TEID: + case ICE_FLOW_FIELD_IDX_GTPU_EH_TEID: + case ICE_FLOW_FIELD_IDX_GTPU_EH_QFI: + /* GTP is accessed through UDP OF protocol */ + prot_id = ICE_PROT_UDP_OF; + break; + case ICE_FLOW_FIELD_IDX_PPPOE_SESS_ID: + prot_id = ICE_PROT_PPPOE; + break; + case ICE_FLOW_FIELD_IDX_PFCP_SEID: + prot_id = ICE_PROT_UDP_IL_OR_S; + break; + case ICE_FLOW_FIELD_IDX_L2TPV3_SESS_ID: + prot_id = ICE_PROT_L2TPV3; + break; + case ICE_FLOW_FIELD_IDX_ESP_SPI: + prot_id = ICE_PROT_ESP_F; + break; + case ICE_FLOW_FIELD_IDX_AH_SPI: + prot_id = ICE_PROT_ESP_2; + break; + case ICE_FLOW_FIELD_IDX_NAT_T_ESP_SPI: + prot_id = ICE_PROT_UDP_IL_OR_S; + break; + case ICE_FLOW_FIELD_IDX_ECPRI_TP0_PC_ID: + prot_id = ICE_PROT_ECPRI; + break; + case ICE_FLOW_FIELD_IDX_UDP_ECPRI_TP0_PC_ID: + prot_id = ICE_PROT_UDP_IL_OR_S; + break; + case ICE_FLOW_FIELD_IDX_ARP_SIP: + case ICE_FLOW_FIELD_IDX_ARP_DIP: + case ICE_FLOW_FIELD_IDX_ARP_SHA: + case ICE_FLOW_FIELD_IDX_ARP_DHA: + case ICE_FLOW_FIELD_IDX_ARP_OP: + prot_id = ICE_PROT_ARP_OF; + break; + case ICE_FLOW_FIELD_IDX_ICMP_TYPE: + case ICE_FLOW_FIELD_IDX_ICMP_CODE: + /* ICMP type and code share the same extraction seq. entry */ + prot_id = (params->prof->segs[seg].hdrs & + ICE_FLOW_SEG_HDR_IPV4) ? + ICE_PROT_ICMP_IL : ICE_PROT_ICMPV6_IL; + sib = fld == ICE_FLOW_FIELD_IDX_ICMP_TYPE ? 
+ ICE_FLOW_FIELD_IDX_ICMP_CODE : + ICE_FLOW_FIELD_IDX_ICMP_TYPE; + break; + case ICE_FLOW_FIELD_IDX_GRE_KEYID: + prot_id = ICE_PROT_GRE_OF; + break; + default: + return ICE_ERR_NOT_IMPL; + } + + /* Each extraction sequence entry is a word in size, and extracts a + * word-aligned offset from a protocol header. + */ + ese_bits = ICE_FLOW_FV_EXTRACT_SZ * BITS_PER_BYTE; + + flds[fld].xtrct.prot_id = prot_id; + flds[fld].xtrct.off = (ice_flds_info[fld].off / ese_bits) * + ICE_FLOW_FV_EXTRACT_SZ; + flds[fld].xtrct.disp = (u8)(ice_flds_info[fld].off % ese_bits); + flds[fld].xtrct.idx = params->es_cnt; + flds[fld].xtrct.mask = ice_flds_info[fld].mask; + + /* Adjust the next field-entry index after accommodating the number of + * entries this field consumes + */ + cnt = DIV_ROUND_UP(flds[fld].xtrct.disp + ice_flds_info[fld].size, + ese_bits); + + /* Fill in the extraction sequence entries needed for this field */ + off = flds[fld].xtrct.off; + mask = flds[fld].xtrct.mask; + for (i = 0; i < cnt; i++) { + /* Only consume an extraction sequence entry if there is no + * sibling field associated with this field or the sibling entry + * already extracts the word shared with this field. + */ + if (sib == ICE_FLOW_FIELD_IDX_MAX || + flds[sib].xtrct.prot_id == ICE_PROT_ID_INVAL || + flds[sib].xtrct.off != off) { + u8 idx; + + /* Make sure the number of extraction sequence required + * does not exceed the block's capability + */ + if (params->es_cnt >= fv_words) + return ICE_ERR_MAX_LIMIT; + + /* some blocks require a reversed field vector layout */ + if (hw->blk[params->blk].es.reverse) + idx = fv_words - params->es_cnt - 1; + else + idx = params->es_cnt; + + params->es[idx].prot_id = prot_id; + params->es[idx].off = off; + params->mask[idx] = mask | sib_mask; + params->es_cnt++; + } + + off += ICE_FLOW_FV_EXTRACT_SZ; + } + + return 0; +} + +/** + * ice_flow_xtract_raws - Create extract sequence entries for raw bytes + * @hw: pointer to the HW struct + * @params: information about the flow to be processed + * @seg: index of packet segment whose raw fields are to be extracted + */ +static enum ice_status +ice_flow_xtract_raws(struct ice_hw *hw, struct ice_flow_prof_params *params, + u8 seg) +{ + u16 fv_words; + u16 hdrs_sz; + u8 i; + + if (!params->prof->segs[seg].raws_cnt) + return 0; + + if (params->prof->segs[seg].raws_cnt > + ARRAY_SIZE(params->prof->segs[seg].raws)) + return ICE_ERR_MAX_LIMIT; + + /* Offsets within the segment headers are not supported */ + hdrs_sz = ice_flow_calc_seg_sz(params, seg); + if (!hdrs_sz) + return ICE_ERR_PARAM; + + fv_words = hw->blk[params->blk].es.fvw; + + for (i = 0; i < params->prof->segs[seg].raws_cnt; i++) { + struct ice_flow_seg_fld_raw *raw; + u16 off, cnt, j; + + raw = ¶ms->prof->segs[seg].raws[i]; + + /* Storing extraction information */ + raw->info.xtrct.prot_id = ICE_PROT_MAC_OF_OR_S; + raw->info.xtrct.off = (raw->off / ICE_FLOW_FV_EXTRACT_SZ) * + ICE_FLOW_FV_EXTRACT_SZ; + raw->info.xtrct.disp = (raw->off % ICE_FLOW_FV_EXTRACT_SZ) * + BITS_PER_BYTE; + raw->info.xtrct.idx = params->es_cnt; + + /* Determine the number of field vector entries this raw field + * consumes. 
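Each extraction sequence entry in ice_flow_xtract_fld() above pulls one word-aligned chunk out of a protocol header, so a field is reduced to a word-aligned byte offset, a bit displacement inside that word, and a count of words obtained with DIV_ROUND_UP. A standalone sketch of that arithmetic, assuming the 2-byte extract size implied by the "word in size" comment (ICE_FLOW_FV_EXTRACT_SZ in the driver):

#include <stdio.h>

/* Sketch of the off/disp/cnt computation in ice_flow_xtract_fld(),
 * assuming a 2-byte extract word.
 */
#define DEMO_EXTRACT_SZ 2
#define DEMO_BITS_PER_BYTE 8

int main(void)
{
	unsigned int fld_off = 64;  /* field offset in bits (example) */
	unsigned int fld_size = 24; /* field size in bits (example)   */
	unsigned int ese_bits = DEMO_EXTRACT_SZ * DEMO_BITS_PER_BYTE;

	unsigned int off = (fld_off / ese_bits) * DEMO_EXTRACT_SZ;
	unsigned int disp = fld_off % ese_bits;
	unsigned int cnt = (disp + fld_size + ese_bits - 1) / ese_bits;

	/* a 24-bit field starting at bit 64 needs two 16-bit extract words
	 * taken from byte offset 8 with no displacement
	 */
	printf("off=%u bytes, disp=%u bits, cnt=%u word(s)\n", off, disp, cnt);
	return 0;
}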
+ */ + cnt = DIV_ROUND_UP(raw->info.xtrct.disp + (raw->info.src.last * BITS_PER_BYTE), + (ICE_FLOW_FV_EXTRACT_SZ * BITS_PER_BYTE)); + off = raw->info.xtrct.off; + for (j = 0; j < cnt; j++) { + u16 idx; + + /* Make sure the number of extraction sequence required + * does not exceed the block's capability + */ + if (params->es_cnt >= hw->blk[params->blk].es.count || + params->es_cnt >= ICE_MAX_FV_WORDS) + return ICE_ERR_MAX_LIMIT; + + /* some blocks require a reversed field vector layout */ + if (hw->blk[params->blk].es.reverse) + idx = fv_words - params->es_cnt - 1; + else + idx = params->es_cnt; + + params->es[idx].prot_id = raw->info.xtrct.prot_id; + params->es[idx].off = off; + params->es_cnt++; + off += ICE_FLOW_FV_EXTRACT_SZ; + } + } + + return 0; +} + +/** + * ice_flow_create_xtrct_seq - Create an extraction sequence for given segments + * @hw: pointer to the HW struct + * @params: information about the flow to be processed + * + * This function iterates through all matched fields in the given segments, and + * creates an extraction sequence for the fields. + */ +static enum ice_status +ice_flow_create_xtrct_seq(struct ice_hw *hw, + struct ice_flow_prof_params *params) +{ + enum ice_status status = 0; + u8 i; + + /* For ACL, we also need to extract the direction bit (Rx,Tx) data from + * packet flags + */ + if (params->blk == ICE_BLK_ACL) { + status = ice_flow_xtract_pkt_flags(hw, params, + ICE_RX_MDID_PKT_FLAGS_15_0); + if (status) + return status; + } + + for (i = 0; i < params->prof->segs_cnt; i++) { + u64 match = params->prof->segs[i].match; + enum ice_flow_field j; + + for_each_set_bit(j, (unsigned long *)&match, + ICE_FLOW_FIELD_IDX_MAX) { + status = ice_flow_xtract_fld(hw, params, i, j, match); + if (status) + return status; + clear_bit(j, (unsigned long *)&match); + } + + /* Process raw matching bytes */ + status = ice_flow_xtract_raws(hw, params, i); + if (status) + return status; + } + + return status; +} + +/** + * ice_flow_sel_acl_scen - returns the specific scenario + * @hw: pointer to the hardware structure + * @params: information about the flow to be processed + * + * This function will return the specific scenario based on the + * params passed to it + */ +static enum ice_status +ice_flow_sel_acl_scen(struct ice_hw *hw, struct ice_flow_prof_params *params) +{ + /* Find the best-fit scenario for the provided match width */ + struct ice_acl_scen *cand_scen = NULL, *scen; + + if (!hw->acl_tbl) + return ICE_ERR_DOES_NOT_EXIST; + + /* Loop through each scenario and match against the scenario width + * to select the specific scenario + */ + list_for_each_entry(scen, &hw->acl_tbl->scens, list_entry) + if (scen->eff_width >= params->entry_length && + (!cand_scen || cand_scen->eff_width > scen->eff_width)) + cand_scen = scen; + if (!cand_scen) + return ICE_ERR_DOES_NOT_EXIST; + + params->prof->cfg.scen = cand_scen; + + return 0; +} + +/** + * ice_flow_acl_def_entry_frmt - Determine the layout of flow entries + * @params: information about the flow to be processed + */ +static enum ice_status +ice_flow_acl_def_entry_frmt(struct ice_flow_prof_params *params) +{ + u16 index, i, range_idx = 0; + + index = ICE_AQC_ACL_PROF_BYTE_SEL_START_IDX; + + for (i = 0; i < params->prof->segs_cnt; i++) { + struct ice_flow_seg_info *seg = ¶ms->prof->segs[i]; + u8 j; + + for_each_set_bit(j, (unsigned long *)&seg->match, + ICE_FLOW_FIELD_IDX_MAX) { + struct ice_flow_fld_info *fld = &seg->fields[j]; + + fld->entry.mask = ICE_FLOW_FLD_OFF_INVAL; + + if (fld->type == ICE_FLOW_FLD_TYPE_RANGE) { + 
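ice_flow_sel_acl_scen() above is a best-fit search: among the scenarios whose eff_width can hold the formatted entry, it keeps the narrowest one. The same selection over plain integers, as a standalone sketch:

#include <stddef.h>
#include <stdio.h>

/* Sketch of the best-fit scenario selection: pick the narrowest width
 * that still fits entry_length, or -1 if none does (the driver returns
 * ICE_ERR_DOES_NOT_EXIST in that case).
 */
static int demo_pick_scen(const unsigned int *eff_width, size_t n,
			  unsigned int entry_length)
{
	int best = -1;
	size_t i;

	for (i = 0; i < n; i++)
		if (eff_width[i] >= entry_length &&
		    (best < 0 || eff_width[best] > eff_width[i]))
			best = (int)i;

	return best;
}

int main(void)
{
	unsigned int widths[] = { 16, 32, 64 };

	/* a 20-byte entry selects the 32-byte scenario, not the 64-byte one */
	printf("chosen scenario index: %d\n", demo_pick_scen(widths, 3, 20));
	return 0;
}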
fld->entry.last = ICE_FLOW_FLD_OFF_INVAL; + + /* Range checking only supported for single + * words + */ + if (DIV_ROUND_UP(ice_flds_info[j].size + fld->xtrct.disp, BITS_PER_BYTE * 2) > 1) + return ICE_ERR_PARAM; + + /* Ranges must define low and high values */ + if (fld->src.val == ICE_FLOW_FLD_OFF_INVAL || + fld->src.last == ICE_FLOW_FLD_OFF_INVAL) + return ICE_ERR_PARAM; + + fld->entry.val = range_idx++; + } else { + /* Store adjusted byte-length of field for later + * use, taking into account potential + * non-byte-aligned displacement + */ + fld->entry.last = DIV_ROUND_UP(ice_flds_info[j].size + (fld->xtrct.disp % BITS_PER_BYTE), + BITS_PER_BYTE); + fld->entry.val = index; + index += fld->entry.last; + } + } + + for (j = 0; j < seg->raws_cnt; j++) { + struct ice_flow_seg_fld_raw *raw = &seg->raws[j]; + + raw->info.entry.mask = ICE_FLOW_FLD_OFF_INVAL; + raw->info.entry.val = index; + raw->info.entry.last = raw->info.src.last; + index += raw->info.entry.last; + } + } + + /* Currently only support using the byte selection base, which only + * allows for an effective entry size of 30 bytes. Reject anything + * larger. + */ + if (index > ICE_AQC_ACL_PROF_BYTE_SEL_ELEMS) + return ICE_ERR_PARAM; + + /* Only 8 range checkers per profile, reject anything trying to use + * more + */ + if (range_idx > ICE_AQC_ACL_PROF_RANGES_NUM_CFG) + return ICE_ERR_PARAM; + + /* Store # bytes required for entry for later use */ + params->entry_length = index - ICE_AQC_ACL_PROF_BYTE_SEL_START_IDX; + + return 0; +} + +/** + * ice_flow_proc_segs - process all packet segments associated with a profile + * @hw: pointer to the HW struct + * @params: information about the flow to be processed + */ +static enum ice_status +ice_flow_proc_segs(struct ice_hw *hw, struct ice_flow_prof_params *params) +{ + enum ice_status status; + + status = ice_flow_proc_seg_hdrs(params); + if (status) + return status; + + status = ice_flow_create_xtrct_seq(hw, params); + if (status) + return status; + + switch (params->blk) { + case ICE_BLK_FD: + case ICE_BLK_RSS: + status = 0; + break; + case ICE_BLK_ACL: + status = ice_flow_acl_def_entry_frmt(params); + if (status) + return status; + status = ice_flow_sel_acl_scen(hw, params); + if (status) + return status; + break; + default: + return ICE_ERR_NOT_IMPL; + } + + return status; +} + +#define ICE_FLOW_FIND_PROF_CHK_FLDS 0x00000001 +#define ICE_FLOW_FIND_PROF_CHK_VSI 0x00000002 +#define ICE_FLOW_FIND_PROF_NOT_CHK_DIR 0x00000004 + +/** + * ice_flow_find_prof_conds - Find a profile matching headers and conditions + * @hw: pointer to the HW struct + * @blk: classification stage + * @dir: flow direction + * @segs: array of one or more packet segments that describe the flow + * @segs_cnt: number of packet segments provided + * @vsi_handle: software VSI handle to check VSI (ICE_FLOW_FIND_PROF_CHK_VSI) + * @conds: additional conditions to be checked (ICE_FLOW_FIND_PROF_CHK_*) + */ +static struct ice_flow_prof * +ice_flow_find_prof_conds(struct ice_hw *hw, enum ice_block blk, + enum ice_flow_dir dir, struct ice_flow_seg_info *segs, + u8 segs_cnt, u16 vsi_handle, u32 conds) +{ + struct ice_flow_prof *p, *prof = NULL; + + mutex_lock(&hw->fl_profs_locks[blk]); + list_for_each_entry(p, &hw->fl_profs[blk], l_entry) + if ((p->dir == dir || conds & ICE_FLOW_FIND_PROF_NOT_CHK_DIR) && + segs_cnt && segs_cnt == p->segs_cnt) { + u8 i; + + /* Check for profile-VSI association if specified */ + if ((conds & ICE_FLOW_FIND_PROF_CHK_VSI) && + ice_is_vsi_valid(hw, vsi_handle) && + !test_bit(vsi_handle, p->vsis)) 
+ continue; + + /* Protocol headers must be checked. Matched fields are + * checked if specified. + */ + for (i = 0; i < segs_cnt; i++) + if (segs[i].hdrs != p->segs[i].hdrs || + ((conds & ICE_FLOW_FIND_PROF_CHK_FLDS) && + segs[i].match != p->segs[i].match)) + break; + + /* A match is found if all segments are matched */ + if (i == segs_cnt) { + prof = p; + break; + } + } + mutex_unlock(&hw->fl_profs_locks[blk]); + + return prof; +} + +/** + * ice_flow_find_prof - Look up a profile matching headers and matched fields + * @hw: pointer to the HW struct + * @blk: classification stage + * @dir: flow direction + * @segs: array of one or more packet segments that describe the flow + * @segs_cnt: number of packet segments provided + */ +u64 +ice_flow_find_prof(struct ice_hw *hw, enum ice_block blk, enum ice_flow_dir dir, + struct ice_flow_seg_info *segs, u8 segs_cnt) +{ + struct ice_flow_prof *p; + + p = ice_flow_find_prof_conds(hw, blk, dir, segs, segs_cnt, + ICE_MAX_VSI, ICE_FLOW_FIND_PROF_CHK_FLDS); + + return p ? p->id : ICE_FLOW_PROF_ID_INVAL; +} + +/** + * ice_flow_find_prof_id - Look up a profile with given profile ID + * @hw: pointer to the HW struct + * @blk: classification stage + * @prof_id: unique ID to identify this flow profile + */ +static struct ice_flow_prof * +ice_flow_find_prof_id(struct ice_hw *hw, enum ice_block blk, u64 prof_id) +{ + struct ice_flow_prof *p; + + list_for_each_entry(p, &hw->fl_profs[blk], l_entry) + if (p->id == prof_id) + return p; + + return NULL; +} + +/** + * ice_dealloc_flow_entry - Deallocate flow entry memory + * @hw: pointer to the HW struct + * @entry: flow entry to be removed + */ +static void +ice_dealloc_flow_entry(struct ice_hw *hw, struct ice_flow_entry *entry) +{ + if (!entry) + return; + + if (entry->entry) + devm_kfree(ice_hw_to_dev(hw), entry->entry); + + if (entry->range_buf) { + devm_kfree(ice_hw_to_dev(hw), entry->range_buf); + entry->range_buf = NULL; + } + + if (entry->acts) { + devm_kfree(ice_hw_to_dev(hw), entry->acts); + entry->acts = NULL; + entry->acts_cnt = 0; + } + + devm_kfree(ice_hw_to_dev(hw), entry); +} + +/** + * ice_flow_get_hw_prof - return the HW profile for a specific profile ID handle + * @hw: pointer to the HW struct + * @blk: classification stage + * @prof_id: the profile ID handle + * @hw_prof_id: pointer to variable to receive the HW profile ID + */ +enum ice_status +ice_flow_get_hw_prof(struct ice_hw *hw, enum ice_block blk, u64 prof_id, + u8 *hw_prof_id) +{ + enum ice_status status = ICE_ERR_DOES_NOT_EXIST; + struct ice_prof_map *map; + + mutex_lock(&hw->blk[blk].es.prof_map_lock); + map = ice_search_prof_id(hw, blk, prof_id); + if (map) { + *hw_prof_id = map->prof_id; + status = 0; + } + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + return status; +} + +#define ICE_ACL_INVALID_SCEN 0x3f + +/** + * ice_flow_acl_is_prof_in_use - Verify if the profile is associated to any PF + * @hw: pointer to the hardware structure + * @prof: pointer to flow profile + * @buf: destination buffer function writes partial extraction sequence to + * + * returns ICE_SUCCESS if no PF is associated to the given profile + * returns ICE_ERR_IN_USE if at least one PF is associated to the given profile + * returns other error code for real error + */ +static enum ice_status +ice_flow_acl_is_prof_in_use(struct ice_hw *hw, struct ice_flow_prof *prof, + struct ice_aqc_acl_prof_generic_frmt *buf) +{ + enum ice_status status; + u8 prof_id = 0; + + status = ice_flow_get_hw_prof(hw, ICE_BLK_ACL, prof->id, &prof_id); + if (status) + return status; 
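ice_flow_acl_is_prof_in_use(), continued just below, treats a profile as unused only when its eight pf_scenario_num slots are either all 0 or all ICE_ACL_INVALID_SCEN; any other combination means some PF still references it. A compact standalone equivalent of that check (the driver spells out the eight comparisons explicitly):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_ACL_INVALID_SCEN 0x3f
#define DEMO_PF_SCEN_SLOTS 8

/* "Not in use" iff every slot is 0, or every slot is the invalid-scenario
 * marker; mirrors the explicit comparisons in ice_flow_acl_is_prof_in_use().
 */
static bool demo_prof_unused(const uint8_t *pf_scenario_num)
{
	bool all_zero = true, all_invalid = true;
	int i;

	for (i = 0; i < DEMO_PF_SCEN_SLOTS; i++) {
		if (pf_scenario_num[i] != 0)
			all_zero = false;
		if (pf_scenario_num[i] != DEMO_ACL_INVALID_SCEN)
			all_invalid = false;
	}

	return all_zero || all_invalid;
}

int main(void)
{
	uint8_t unused[DEMO_PF_SCEN_SLOTS] = { 0 };
	uint8_t in_use[DEMO_PF_SCEN_SLOTS] = { 0, 0, 0, 5, 0, 0, 0, 0 };

	printf("all-zero profile unused: %d\n", demo_prof_unused(unused)); /* 1 */
	printf("mixed profile unused:    %d\n", demo_prof_unused(in_use)); /* 0 */
	return 0;
}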
+ + status = ice_query_acl_prof(hw, prof_id, buf, NULL); + if (status) + return status; + + /* If all PF's associated scenarios are all 0 or all + * ICE_ACL_INVALID_SCEN (63) for the given profile then the latter has + * not been configured yet. + */ + if (buf->pf_scenario_num[0] == 0 && buf->pf_scenario_num[1] == 0 && + buf->pf_scenario_num[2] == 0 && buf->pf_scenario_num[3] == 0 && + buf->pf_scenario_num[4] == 0 && buf->pf_scenario_num[5] == 0 && + buf->pf_scenario_num[6] == 0 && buf->pf_scenario_num[7] == 0) + return 0; + + if (buf->pf_scenario_num[0] == ICE_ACL_INVALID_SCEN && + buf->pf_scenario_num[1] == ICE_ACL_INVALID_SCEN && + buf->pf_scenario_num[2] == ICE_ACL_INVALID_SCEN && + buf->pf_scenario_num[3] == ICE_ACL_INVALID_SCEN && + buf->pf_scenario_num[4] == ICE_ACL_INVALID_SCEN && + buf->pf_scenario_num[5] == ICE_ACL_INVALID_SCEN && + buf->pf_scenario_num[6] == ICE_ACL_INVALID_SCEN && + buf->pf_scenario_num[7] == ICE_ACL_INVALID_SCEN) + return 0; + + return ICE_ERR_IN_USE; +} + +/** + * ice_flow_acl_free_act_cntr - Free the ACL rule's actions + * @hw: pointer to the hardware structure + * @acts: array of actions to be performed on a match + * @acts_cnt: number of actions + */ +static enum ice_status +ice_flow_acl_free_act_cntr(struct ice_hw *hw, struct ice_flow_action *acts, + u8 acts_cnt) +{ + int i; + + for (i = 0; i < acts_cnt; i++) { + if (acts[i].type == ICE_FLOW_ACT_CNTR_PKT || + acts[i].type == ICE_FLOW_ACT_CNTR_BYTES || + acts[i].type == ICE_FLOW_ACT_CNTR_PKT_BYTES) { + struct ice_acl_cntrs cntrs = { 0 }; + enum ice_status status; + + /* amount is unused in the dealloc path but the common + * parameter check routine wants a value set, as zero + * is invalid for the check. Just set it. + */ + cntrs.amount = 1; + cntrs.bank = 0; /* Only bank0 for the moment */ + cntrs.first_cntr = + le16_to_cpu(acts[i].data.acl_act.value); + cntrs.last_cntr = + le16_to_cpu(acts[i].data.acl_act.value); + + if (acts[i].type == ICE_FLOW_ACT_CNTR_PKT_BYTES) + cntrs.type = ICE_AQC_ACL_CNT_TYPE_DUAL; + else + cntrs.type = ICE_AQC_ACL_CNT_TYPE_SINGLE; + + status = ice_aq_dealloc_acl_cntrs(hw, &cntrs, NULL); + if (status) + return status; + } + } + return 0; +} + +/** + * ice_flow_acl_disassoc_scen - Disassociate the scenario from the profile + * @hw: pointer to the hardware structure + * @prof: pointer to flow profile + * + * Disassociate the scenario from the profile for the PF of the VSI. 
+ */ +static enum ice_status +ice_flow_acl_disassoc_scen(struct ice_hw *hw, struct ice_flow_prof *prof) +{ + struct ice_aqc_acl_prof_generic_frmt buf; + enum ice_status status = 0; + u8 prof_id = 0; + + memset(&buf, 0, sizeof(buf)); + + status = ice_flow_get_hw_prof(hw, ICE_BLK_ACL, prof->id, &prof_id); + if (status) + return status; + + status = ice_query_acl_prof(hw, prof_id, &buf, NULL); + if (status) + return status; + + /* Clear scenario for this PF */ + buf.pf_scenario_num[hw->pf_id] = ICE_ACL_INVALID_SCEN; + status = ice_prgm_acl_prof_xtrct(hw, prof_id, &buf, NULL); + + return status; +} + +/** + * ice_flow_rem_entry_sync - Remove a flow entry + * @hw: pointer to the HW struct + * @blk: classification stage + * @entry: flow entry to be removed + */ +static enum ice_status +ice_flow_rem_entry_sync(struct ice_hw *hw, enum ice_block blk, + struct ice_flow_entry *entry) +{ + if (!entry) + return ICE_ERR_BAD_PTR; + + if (blk == ICE_BLK_ACL) { + enum ice_status status; + + if (ice_dcf_is_acl_capable(hw)) + return ICE_ERR_IN_USE; + if (!entry->prof) + return ICE_ERR_BAD_PTR; + + status = ice_acl_rem_entry(hw, entry->prof->cfg.scen, + entry->scen_entry_idx); + if (status) + return status; + + /* Checks if we need to release an ACL counter. */ + if (entry->acts_cnt && entry->acts) + ice_flow_acl_free_act_cntr(hw, entry->acts, + entry->acts_cnt); + } + + list_del(&entry->l_entry); + + ice_dealloc_flow_entry(hw, entry); + + return 0; +} + +/** + * ice_flow_add_prof_sync - Add a flow profile for packet segments and fields + * @hw: pointer to the HW struct + * @blk: classification stage + * @dir: flow direction + * @prof_id: unique ID to identify this flow profile + * @segs: array of one or more packet segments that describe the flow + * @segs_cnt: number of packet segments provided + * @acts: array of default actions + * @acts_cnt: number of default actions + * @prof: stores the returned flow profile added + * + * Assumption: the caller has acquired the lock to the profile list + */ +static enum ice_status +ice_flow_add_prof_sync(struct ice_hw *hw, enum ice_block blk, + enum ice_flow_dir dir, u64 prof_id, + struct ice_flow_seg_info *segs, u8 segs_cnt, + struct ice_flow_action *acts, u8 acts_cnt, + struct ice_flow_prof **prof) +{ + struct ice_flow_prof_params *params; + enum ice_status status; + u8 i; + + if (!prof || (acts_cnt && !acts)) + return ICE_ERR_BAD_PTR; + + params = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*params), GFP_KERNEL); + if (!params) + return ICE_ERR_NO_MEMORY; + + params->prof = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*params->prof), + GFP_KERNEL); + if (!params->prof) { + status = ICE_ERR_NO_MEMORY; + goto free_params; + } + + /* initialize extraction sequence to all invalid (0xff) */ + for (i = 0; i < ICE_MAX_FV_WORDS; i++) { + params->es[i].prot_id = ICE_PROT_INVALID; + params->es[i].off = ICE_FV_OFFSET_INVAL; + } + + params->blk = blk; + params->prof->id = prof_id; + params->prof->dir = dir; + params->prof->segs_cnt = segs_cnt; + + /* Make a copy of the segments that need to be persistent in the flow + * profile instance + */ + for (i = 0; i < segs_cnt; i++) + memcpy(¶ms->prof->segs[i], &segs[i], sizeof(*segs)); + + /* Make a copy of the actions that need to be persistent in the flow + * profile instance. 
+ */ + if (acts_cnt) { + params->prof->acts = devm_kmemdup(ice_hw_to_dev(hw), acts, + acts_cnt * sizeof(*acts), + GFP_KERNEL); + + if (!params->prof->acts) { + status = ICE_ERR_NO_MEMORY; + goto out; + } + } + + + status = ice_flow_proc_segs(hw, params); + if (status) { + ice_debug(hw, ICE_DBG_FLOW, "Error processing a flow's packet segments\n"); + goto out; + } + + /* Add a HW profile for this flow profile */ + status = ice_add_prof(hw, blk, prof_id, (u8 *)params->ptypes, + params->attr, params->attr_cnt, params->es, + params->mask); + if (status) { + ice_debug(hw, ICE_DBG_FLOW, "Error adding a HW flow profile\n"); + goto out; + } + + INIT_LIST_HEAD(¶ms->prof->entries); + mutex_init(¶ms->prof->entries_lock); + *prof = params->prof; + +out: + if (status) { + if (params->prof->acts) + devm_kfree(ice_hw_to_dev(hw), params->prof->acts); + devm_kfree(ice_hw_to_dev(hw), params->prof); + } +free_params: + devm_kfree(ice_hw_to_dev(hw), params); + + return status; +} + +/** + * ice_flow_rem_prof_sync - remove a flow profile + * @hw: pointer to the hardware structure + * @blk: classification stage + * @prof: pointer to flow profile to remove + * + * Assumption: the caller has acquired the lock to the profile list + */ +static enum ice_status +ice_flow_rem_prof_sync(struct ice_hw *hw, enum ice_block blk, + struct ice_flow_prof *prof) +{ + enum ice_status status; + + /* Remove all remaining flow entries before removing the flow profile */ + if (!list_empty(&prof->entries)) { + struct ice_flow_entry *e, *t; + + mutex_lock(&prof->entries_lock); + + list_for_each_entry_safe(e, t, &prof->entries, l_entry) { + status = ice_flow_rem_entry_sync(hw, blk, e); + if (status) + break; + } + + mutex_unlock(&prof->entries_lock); + } + + if (blk == ICE_BLK_ACL) { + struct ice_aqc_acl_profile_ranges query_rng_buf; + struct ice_aqc_acl_prof_generic_frmt buf; + u8 prof_id = 0; + + /* Disassociate the scenario from the profile for the PF */ + status = ice_flow_acl_disassoc_scen(hw, prof); + if (status) + return status; + + /* Clear the range-checker if the profile ID is no longer + * used by any PF + */ + status = ice_flow_acl_is_prof_in_use(hw, prof, &buf); + if (status && status != ICE_ERR_IN_USE) { + return status; + } else if (!status) { + /* Clear the range-checker value for profile ID */ + memset(&query_rng_buf, 0, + sizeof(struct ice_aqc_acl_profile_ranges)); + + status = ice_flow_get_hw_prof(hw, blk, prof->id, + &prof_id); + if (status) + return status; + + status = ice_prog_acl_prof_ranges(hw, prof_id, + &query_rng_buf, NULL); + if (status) + return status; + } + } + + /* Remove all hardware profiles associated with this flow profile */ + status = ice_rem_prof(hw, blk, prof->id); + if (!status) { + list_del(&prof->l_entry); + mutex_destroy(&prof->entries_lock); + if (prof->acts) + devm_kfree(ice_hw_to_dev(hw), prof->acts); + devm_kfree(ice_hw_to_dev(hw), prof); + } + + return status; +} + +/** + * ice_flow_acl_set_xtrct_seq_fld - Populate xtrct seq for single field + * @buf: Destination buffer function writes partial xtrct sequence to + * @info: Info about field + */ +static void +ice_flow_acl_set_xtrct_seq_fld(struct ice_aqc_acl_prof_generic_frmt *buf, + struct ice_flow_fld_info *info) +{ + u16 dst, i; + u8 src; + + src = info->xtrct.idx * ICE_FLOW_FV_EXTRACT_SZ + + info->xtrct.disp / BITS_PER_BYTE; + dst = info->entry.val; + for (i = 0; i < info->entry.last; i++) + /* HW stores field vector words in LE, convert words back to BE + * so constructed entries will end up in network order + */ + 
buf->byte_selection[dst++] = src++ ^ 1; +} + +/** + * ice_flow_acl_set_xtrct_seq - Program ACL extraction sequence + * @hw: pointer to the hardware structure + * @prof: pointer to flow profile + */ +static enum ice_status +ice_flow_acl_set_xtrct_seq(struct ice_hw *hw, struct ice_flow_prof *prof) +{ + struct ice_aqc_acl_prof_generic_frmt buf; + struct ice_flow_fld_info *info; + enum ice_status status; + u8 prof_id = 0; + u16 i; + + memset(&buf, 0, sizeof(buf)); + + status = ice_flow_get_hw_prof(hw, ICE_BLK_ACL, prof->id, &prof_id); + if (status) + return status; + + status = ice_flow_acl_is_prof_in_use(hw, prof, &buf); + if (status && status != ICE_ERR_IN_USE) + return status; + + if (!status) { + /* Program the profile dependent configuration. This is done + * only once regardless of the number of PFs using that profile + */ + memset(&buf, 0, sizeof(buf)); + + for (i = 0; i < prof->segs_cnt; i++) { + struct ice_flow_seg_info *seg = &prof->segs[i]; + u16 j; + + for_each_set_bit(j, (unsigned long *)&seg->match, + ICE_FLOW_FIELD_IDX_MAX) { + info = &seg->fields[j]; + + if (info->type == ICE_FLOW_FLD_TYPE_RANGE) + buf.word_selection[info->entry.val] = + info->xtrct.idx; + else + ice_flow_acl_set_xtrct_seq_fld(&buf, + info); + } + + for (j = 0; j < seg->raws_cnt; j++) { + info = &seg->raws[j].info; + ice_flow_acl_set_xtrct_seq_fld(&buf, info); + } + } + + memset(&buf.pf_scenario_num[0], ICE_ACL_INVALID_SCEN, + ICE_AQC_ACL_PROF_PF_SCEN_NUM_ELEMS); + } + + /* Update the current PF */ + buf.pf_scenario_num[hw->pf_id] = (u8)prof->cfg.scen->id; + status = ice_prgm_acl_prof_xtrct(hw, prof_id, &buf, NULL); + + return status; +} + +/** + * ice_flow_assoc_vsig_vsi - associate a VSI with VSIG + * @hw: pointer to the hardware structure + * @blk: classification stage + * @vsi_handle: software VSI handle + * @vsig: target VSI group + * + * Assumption: the caller has already verified that the VSI to + * be added has the same characteristics as the VSIG and will + * thereby have access to all resources added to that VSIG. 
+ */ +enum ice_status +ice_flow_assoc_vsig_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi_handle, + u16 vsig) +{ + enum ice_status status; + + if (!ice_is_vsi_valid(hw, vsi_handle) || blk >= ICE_BLK_COUNT) + return ICE_ERR_PARAM; + + mutex_lock(&hw->fl_profs_locks[blk]); + status = ice_add_vsi_flow(hw, blk, ice_get_hw_vsi_num(hw, vsi_handle), + vsig); + mutex_unlock(&hw->fl_profs_locks[blk]); + + return status; +} + +/** + * ice_flow_assoc_prof - associate a VSI with a flow profile + * @hw: pointer to the hardware structure + * @blk: classification stage + * @prof: pointer to flow profile + * @vsi_handle: software VSI handle + * + * Assumption: the caller has acquired the lock to the profile list + * and the software VSI handle has been validated + */ +static enum ice_status +ice_flow_assoc_prof(struct ice_hw *hw, enum ice_block blk, + struct ice_flow_prof *prof, u16 vsi_handle) +{ + enum ice_status status = 0; + + if (!test_bit(vsi_handle, prof->vsis)) { + if (blk == ICE_BLK_ACL) { + status = ice_flow_acl_set_xtrct_seq(hw, prof); + if (status) + return status; + } + status = ice_add_prof_id_flow(hw, blk, + ice_get_hw_vsi_num(hw, + vsi_handle), + prof->id); + if (!status) + set_bit(vsi_handle, prof->vsis); + else + ice_debug(hw, ICE_DBG_FLOW, "HW profile add failed, %d\n", + status); + } + + return status; +} + +/** + * ice_flow_disassoc_prof - disassociate a VSI from a flow profile + * @hw: pointer to the hardware structure + * @blk: classification stage + * @prof: pointer to flow profile + * @vsi_handle: software VSI handle + * + * Assumption: the caller has acquired the lock to the profile list + * and the software VSI handle has been validated + */ +static enum ice_status +ice_flow_disassoc_prof(struct ice_hw *hw, enum ice_block blk, + struct ice_flow_prof *prof, u16 vsi_handle) +{ + enum ice_status status = 0; + + if (test_bit(vsi_handle, prof->vsis)) { + status = ice_rem_prof_id_flow(hw, blk, + ice_get_hw_vsi_num(hw, + vsi_handle), + prof->id); + if (!status) + clear_bit(vsi_handle, prof->vsis); + else + ice_debug(hw, ICE_DBG_FLOW, "HW profile remove failed, %d\n", + status); + } + + return status; +} + +/** + * ice_flow_add_prof - Add a flow profile for packet segments and matched fields + * @hw: pointer to the HW struct + * @blk: classification stage + * @dir: flow direction + * @prof_id: unique ID to identify this flow profile + * @segs: array of one or more packet segments that describe the flow + * @segs_cnt: number of packet segments provided + * @acts: array of default actions + * @acts_cnt: number of default actions + * @prof: stores the returned flow profile added + */ +enum ice_status +ice_flow_add_prof(struct ice_hw *hw, enum ice_block blk, enum ice_flow_dir dir, + u64 prof_id, struct ice_flow_seg_info *segs, u8 segs_cnt, + struct ice_flow_action *acts, u8 acts_cnt, + struct ice_flow_prof **prof) +{ + enum ice_status status; + + if (segs_cnt > ICE_FLOW_SEG_MAX) + return ICE_ERR_MAX_LIMIT; + + if (!segs_cnt) + return ICE_ERR_PARAM; + + if (!segs) + return ICE_ERR_BAD_PTR; + + status = ice_flow_val_hdrs(segs, segs_cnt); + if (status) + return status; + + mutex_lock(&hw->fl_profs_locks[blk]); + + status = ice_flow_add_prof_sync(hw, blk, dir, prof_id, segs, segs_cnt, + acts, acts_cnt, prof); + if (!status) + list_add(&(*prof)->l_entry, &hw->fl_profs[blk]); + + mutex_unlock(&hw->fl_profs_locks[blk]); + + return status; +} + +/** + * ice_flow_rem_prof - Remove a flow profile and all entries associated with it + * @hw: pointer to the HW struct + * @blk: the block for which 
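ice_flow_add_prof() above validates the segments, builds the profile and its extraction sequence under the per-block lock, and links it into hw->fl_profs[blk]; ice_flow_rem_prof() just below undoes that by profile ID. A hedged usage sketch of the call shape (kernel context assumed; the direction value ICE_FLOW_RX comes from ice_flow.h, which is not part of this hunk, and the profile ID and segment layout are arbitrary examples):

/* Illustration only: add a two-segment RSS-block profile for an Ethernet
 * outer segment and IPv4/UDP inner headers, then remove it again.
 * 0x42 is an arbitrary example profile ID.
 */
static enum ice_status demo_add_udp4_prof(struct ice_hw *hw)
{
	struct ice_flow_seg_info segs[ICE_FLOW_SEG_MAX] = {};
	struct ice_flow_prof *prof;
	enum ice_status status;

	segs[0].hdrs = ICE_FLOW_SEG_HDR_ETH;
	segs[1].hdrs = ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_UDP;

	status = ice_flow_add_prof(hw, ICE_BLK_RSS, ICE_FLOW_RX, 0x42,
				   segs, ICE_FLOW_SEG_MAX, NULL, 0, &prof);
	if (status)
		return status;

	/* ... associate VSIs / add flow entries against "prof" here ... */

	return ice_flow_rem_prof(hw, ICE_BLK_RSS, 0x42);
}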
the flow profile is to be removed + * @prof_id: unique ID of the flow profile to be removed + */ +enum ice_status +ice_flow_rem_prof(struct ice_hw *hw, enum ice_block blk, u64 prof_id) +{ + struct ice_flow_prof *prof; + enum ice_status status; + + mutex_lock(&hw->fl_profs_locks[blk]); + + prof = ice_flow_find_prof_id(hw, blk, prof_id); + if (!prof) { + status = ICE_ERR_DOES_NOT_EXIST; + goto out; + } + + /* prof becomes invalid after the call */ + status = ice_flow_rem_prof_sync(hw, blk, prof); + +out: + mutex_unlock(&hw->fl_profs_locks[blk]); + + return status; +} + +/** + * ice_flow_find_entry - look for a flow entry using its unique ID + * @hw: pointer to the HW struct + * @blk: classification stage + * @entry_id: unique ID to identify this flow entry + * + * This function looks for the flow entry with the specified unique ID in all + * flow profiles of the specified classification stage. If the entry is found, + * and it returns the handle to the flow entry. Otherwise, it returns + * ICE_FLOW_ENTRY_ID_INVAL. + */ +u64 ice_flow_find_entry(struct ice_hw *hw, enum ice_block blk, u64 entry_id) +{ + struct ice_flow_entry *found = NULL; + struct ice_flow_prof *p; + + mutex_lock(&hw->fl_profs_locks[blk]); + + list_for_each_entry(p, &hw->fl_profs[blk], l_entry) { + struct ice_flow_entry *e; + + mutex_lock(&p->entries_lock); + list_for_each_entry(e, &p->entries, l_entry) + if (e->id == entry_id) { + found = e; + break; + } + mutex_unlock(&p->entries_lock); + + if (found) + break; + } + + mutex_unlock(&hw->fl_profs_locks[blk]); + + return found ? ICE_FLOW_ENTRY_HNDL(found) : ICE_FLOW_ENTRY_HANDLE_INVAL; +} + +/** + * ice_flow_acl_check_actions - Checks the ACL rule's actions + * @hw: pointer to the hardware structure + * @acts: array of actions to be performed on a match + * @acts_cnt: number of actions + * @cnt_alloc: indicates if an ACL counter has been allocated. + */ +static enum ice_status +ice_flow_acl_check_actions(struct ice_hw *hw, struct ice_flow_action *acts, + u8 acts_cnt, bool *cnt_alloc) +{ + DECLARE_BITMAP(dup_check, ICE_AQC_TBL_MAX_ACTION_PAIRS * 2); + int i; + + bitmap_zero(dup_check, ICE_AQC_TBL_MAX_ACTION_PAIRS * 2); + *cnt_alloc = false; + + if (acts_cnt > ICE_FLOW_ACL_MAX_NUM_ACT) + return ICE_ERR_OUT_OF_RANGE; + + for (i = 0; i < acts_cnt; i++) { + if (acts[i].type != ICE_FLOW_ACT_NOP && + acts[i].type != ICE_FLOW_ACT_DROP && + acts[i].type != ICE_FLOW_ACT_CNTR_PKT && + acts[i].type != ICE_FLOW_ACT_FWD_QUEUE) + return ICE_ERR_CFG; + + /* If the caller want to add two actions of the same type, then + * it is considered invalid configuration. + */ + if (test_and_set_bit(acts[i].type, dup_check)) + return ICE_ERR_PARAM; + } + + /* Checks if ACL counters are needed. 
*/ + for (i = 0; i < acts_cnt; i++) { + if (acts[i].type == ICE_FLOW_ACT_CNTR_PKT || + acts[i].type == ICE_FLOW_ACT_CNTR_BYTES || + acts[i].type == ICE_FLOW_ACT_CNTR_PKT_BYTES) { + struct ice_acl_cntrs cntrs = { 0 }; + enum ice_status status; + + cntrs.amount = 1; + cntrs.bank = 0; /* Only bank0 for the moment */ + + if (acts[i].type == ICE_FLOW_ACT_CNTR_PKT_BYTES) + cntrs.type = ICE_AQC_ACL_CNT_TYPE_DUAL; + else + cntrs.type = ICE_AQC_ACL_CNT_TYPE_SINGLE; + + status = ice_aq_alloc_acl_cntrs(hw, &cntrs, NULL); + if (status) + return status; + /* Counter index within the bank */ + acts[i].data.acl_act.value = + cpu_to_le16(cntrs.first_cntr); + *cnt_alloc = true; + } + } + + return 0; +} + +/** + * ice_flow_acl_frmt_entry_range - Format an ACL range checker for a given field + * @fld: number of the given field + * @info: info about field + * @range_buf: range checker configuration buffer + * @data: pointer to a data buffer containing flow entry's match values/masks + * @range: Input/output param indicating which range checkers are being used + */ +static void +ice_flow_acl_frmt_entry_range(u16 fld, struct ice_flow_fld_info *info, + struct ice_aqc_acl_profile_ranges *range_buf, + u8 *data, u8 *range) +{ + u16 new_mask; + + /* If not specified, default mask is all bits in field */ + new_mask = (info->src.mask == ICE_FLOW_FLD_OFF_INVAL ? + BIT(ice_flds_info[fld].size) - 1 : + (*(u16 *)(data + info->src.mask))) << info->xtrct.disp; + + /* If the mask is 0, then we don't need to worry about this input + * range checker value. + */ + if (new_mask) { + u16 new_high = + (*(u16 *)(data + info->src.last)) << info->xtrct.disp; + u16 new_low = + (*(u16 *)(data + info->src.val)) << info->xtrct.disp; + u8 range_idx = info->entry.val; + + range_buf->checker_cfg[range_idx].low_boundary = + cpu_to_be16(new_low); + range_buf->checker_cfg[range_idx].high_boundary = + cpu_to_be16(new_high); + range_buf->checker_cfg[range_idx].mask = cpu_to_be16(new_mask); + + /* Indicate which range checker is being used */ + *range |= BIT(range_idx); + } +} + +/** + * ice_flow_acl_frmt_entry_fld - Partially format ACL entry for a given field + * @fld: number of the given field + * @info: info about the field + * @buf: buffer containing the entry + * @dontcare: buffer containing don't care mask for entry + * @data: pointer to a data buffer containing flow entry's match values/masks + */ +static void +ice_flow_acl_frmt_entry_fld(u16 fld, struct ice_flow_fld_info *info, u8 *buf, + u8 *dontcare, u8 *data) +{ + u16 dst, src, mask, k, end_disp, tmp_s = 0, tmp_m = 0; + bool use_mask = false; + u8 disp; + + src = info->src.val; + mask = info->src.mask; + dst = info->entry.val - ICE_AQC_ACL_PROF_BYTE_SEL_START_IDX; + disp = info->xtrct.disp % BITS_PER_BYTE; + + if (mask != ICE_FLOW_FLD_OFF_INVAL) + use_mask = true; + + for (k = 0; k < info->entry.last; k++, dst++) { + /* Add overflow bits from previous byte */ + buf[dst] = (tmp_s & 0xff00) >> 8; + + /* If mask is not valid, tmp_m is always zero, so just setting + * dontcare to 0 (no masked bits). 
If mask is valid, pulls in + * overflow bits of mask from prev byte + */ + dontcare[dst] = (tmp_m & 0xff00) >> 8; + + /* If there is displacement, last byte will only contain + * displaced data, but there is no more data to read from user + * buffer, so skip so as not to potentially read beyond end of + * user buffer + */ + if (!disp || k < info->entry.last - 1) { + /* Store shifted data to use in next byte */ + tmp_s = data[src++] << disp; + + /* Add current (shifted) byte */ + buf[dst] |= tmp_s & 0xff; + + /* Handle mask if valid */ + if (use_mask) { + tmp_m = (~data[mask++] & 0xff) << disp; + dontcare[dst] |= tmp_m & 0xff; + } + } + } + + /* Fill in don't care bits at beginning of field */ + if (disp) { + dst = info->entry.val - ICE_AQC_ACL_PROF_BYTE_SEL_START_IDX; + for (k = 0; k < disp; k++) + dontcare[dst] |= BIT(k); + } + + end_disp = (disp + ice_flds_info[fld].size) % BITS_PER_BYTE; + + /* Fill in don't care bits at end of field */ + if (end_disp) { + dst = info->entry.val - ICE_AQC_ACL_PROF_BYTE_SEL_START_IDX + + info->entry.last - 1; + for (k = end_disp; k < BITS_PER_BYTE; k++) + dontcare[dst] |= BIT(k); + } +} + +/** + * ice_flow_acl_frmt_entry - Format ACL entry + * @hw: pointer to the hardware structure + * @prof: pointer to flow profile + * @e: pointer to the flow entry + * @data: pointer to a data buffer containing flow entry's match values/masks + * @acts: array of actions to be performed on a match + * @acts_cnt: number of actions + * + * Formats the key (and key_inverse) to be matched from the data passed in, + * along with data from the flow profile. This key/key_inverse pair makes up + * the 'entry' for an ACL flow entry. + */ +static enum ice_status +ice_flow_acl_frmt_entry(struct ice_hw *hw, struct ice_flow_prof *prof, + struct ice_flow_entry *e, u8 *data, + struct ice_flow_action *acts, u8 acts_cnt) +{ + u8 *buf = NULL, *dontcare = NULL, *key = NULL, range = 0, dir_flag_msk; + struct ice_aqc_acl_profile_ranges *range_buf = NULL; + enum ice_status status; + bool cnt_alloc; + u8 prof_id = 0; + u16 i, buf_sz; + + status = ice_flow_get_hw_prof(hw, ICE_BLK_ACL, prof->id, &prof_id); + if (status) + return status; + + /* Format the result action */ + + status = ice_flow_acl_check_actions(hw, acts, acts_cnt, &cnt_alloc); + if (status) + return status; + + status = ICE_ERR_NO_MEMORY; + + e->acts = devm_kmemdup(ice_hw_to_dev(hw), acts, + acts_cnt * sizeof(*acts), GFP_KERNEL); + if (!e->acts) + goto out; + + e->acts_cnt = acts_cnt; + + /* Format the matching data */ + buf_sz = prof->cfg.scen->width; + buf = devm_kzalloc(ice_hw_to_dev(hw), buf_sz, GFP_KERNEL); + if (!buf) + goto out; + + dontcare = devm_kzalloc(ice_hw_to_dev(hw), buf_sz, GFP_KERNEL); + if (!dontcare) + goto out; + + /* 'key' buffer will store both key and key_inverse, so must be twice + * size of buf + */ + key = devm_kzalloc(ice_hw_to_dev(hw), buf_sz * 2, GFP_KERNEL); + if (!key) + goto out; + + range_buf = devm_kzalloc(ice_hw_to_dev(hw), + sizeof(struct ice_aqc_acl_profile_ranges), + GFP_KERNEL); + if (!range_buf) + goto out; + + /* Set don't care mask to all 1's to start, will zero out used bytes */ + memset(dontcare, 0xff, buf_sz); + + for (i = 0; i < prof->segs_cnt; i++) { + struct ice_flow_seg_info *seg = &prof->segs[i]; + u8 j; + + for_each_set_bit(j, (unsigned long *)&seg->match, + ICE_FLOW_FIELD_IDX_MAX) { + struct ice_flow_fld_info *info = &seg->fields[j]; + + if (info->type == ICE_FLOW_FLD_TYPE_RANGE) + ice_flow_acl_frmt_entry_range(j, info, + range_buf, data, + &range); + else + 
ice_flow_acl_frmt_entry_fld(j, info, buf, + dontcare, data); + } + + for (j = 0; j < seg->raws_cnt; j++) { + struct ice_flow_fld_info *info = &seg->raws[j].info; + u16 dst, src, mask, k; + bool use_mask = false; + + src = info->src.val; + dst = info->entry.val - + ICE_AQC_ACL_PROF_BYTE_SEL_START_IDX; + mask = info->src.mask; + + if (mask != ICE_FLOW_FLD_OFF_INVAL) + use_mask = true; + + for (k = 0; k < info->entry.last; k++, dst++) { + buf[dst] = data[src++]; + if (use_mask) + dontcare[dst] = ~data[mask++]; + else + dontcare[dst] = 0; + } + } + } + + buf[prof->cfg.scen->pid_idx] = (u8)prof_id; + dontcare[prof->cfg.scen->pid_idx] = 0; + + /* Format the buffer for direction flags */ + dir_flag_msk = BIT(ICE_FLG_PKT_DIR); + + + if (prof->dir == ICE_FLOW_RX) + buf[prof->cfg.scen->pkt_dir_idx] = dir_flag_msk; + + if (range) { + buf[prof->cfg.scen->rng_chk_idx] = range; + /* Mark any unused range checkers as don't care */ + dontcare[prof->cfg.scen->rng_chk_idx] = ~range; + e->range_buf = range_buf; + } else { + devm_kfree(ice_hw_to_dev(hw), range_buf); + } + + status = ice_set_key(key, buf_sz * 2, buf, NULL, dontcare, NULL, 0, + buf_sz); + if (status) + goto out; + + e->entry = key; + e->entry_sz = buf_sz * 2; + +out: + if (buf) + devm_kfree(ice_hw_to_dev(hw), buf); + + if (dontcare) + devm_kfree(ice_hw_to_dev(hw), dontcare); + + if (status && key) + devm_kfree(ice_hw_to_dev(hw), key); + + if (status && range_buf) { + devm_kfree(ice_hw_to_dev(hw), range_buf); + e->range_buf = NULL; + } + + if (status && e->acts) { + devm_kfree(ice_hw_to_dev(hw), e->acts); + e->acts = NULL; + e->acts_cnt = 0; + } + + if (status && cnt_alloc) + ice_flow_acl_free_act_cntr(hw, acts, acts_cnt); + + return status; +} + +/** + * ice_flow_acl_find_scen_entry_cond - Find an ACL scenario entry that matches + * the compared data. + * @prof: pointer to flow profile + * @e: pointer to the comparing flow entry + * @do_chg_action: decide if we want to change the ACL action + * @do_add_entry: decide if we want to add the new ACL entry + * @do_rem_entry: decide if we want to remove the current ACL entry + * + * Find an ACL scenario entry that matches the compared data. In the same time, + * this function also figure out: + * a/ If we want to change the ACL action + * b/ If we want to add the new ACL entry + * c/ If we want to remove the current ACL entry + */ +static struct ice_flow_entry * +ice_flow_acl_find_scen_entry_cond(struct ice_flow_prof *prof, + struct ice_flow_entry *e, bool *do_chg_action, + bool *do_add_entry, bool *do_rem_entry) +{ + struct ice_flow_entry *p, *return_entry = NULL; + u8 i, j; + + /* Check if: + * a/ There exists an entry with same matching data, but different + * priority, then we remove this existing ACL entry. Then, we + * will add the new entry to the ACL scenario. + * b/ There exists an entry with same matching data, priority, and + * result action, then we do nothing + * c/ There exists an entry with same matching data, priority, but + * different, action, then do only change the action's entry. + * d/ Else, we add this new entry to the ACL scenario. + */ + *do_chg_action = false; + *do_add_entry = true; + *do_rem_entry = false; + list_for_each_entry(p, &prof->entries, l_entry) { + if (memcmp(p->entry, e->entry, p->entry_sz)) + continue; + + /* From this point, we have the same matching_data. 
*/ + *do_add_entry = false; + return_entry = p; + + if (p->priority != e->priority) { + /* matching data && !priority */ + *do_add_entry = true; + *do_rem_entry = true; + break; + } + + /* From this point, we will have matching_data && priority */ + if (p->acts_cnt != e->acts_cnt) + *do_chg_action = true; + for (i = 0; i < p->acts_cnt; i++) { + bool found_not_match = false; + + for (j = 0; j < e->acts_cnt; j++) + if (memcmp(&p->acts[i], &e->acts[j], + sizeof(struct ice_flow_action))) { + found_not_match = true; + break; + } + + if (found_not_match) { + *do_chg_action = true; + break; + } + } + + /* (do_chg_action = true) means : + * matching_data && priority && !result_action + * (do_chg_action = false) means : + * matching_data && priority && result_action + */ + break; + } + + return return_entry; +} + +/** + * ice_flow_acl_convert_to_acl_prio - Convert to ACL priority + * @p: flow priority + */ +static enum ice_acl_entry_prio +ice_flow_acl_convert_to_acl_prio(enum ice_flow_priority p) +{ + enum ice_acl_entry_prio acl_prio; + + switch (p) { + case ICE_FLOW_PRIO_LOW: + acl_prio = ICE_ACL_PRIO_LOW; + break; + case ICE_FLOW_PRIO_NORMAL: + acl_prio = ICE_ACL_PRIO_NORMAL; + break; + case ICE_FLOW_PRIO_HIGH: + acl_prio = ICE_ACL_PRIO_HIGH; + break; + default: + acl_prio = ICE_ACL_PRIO_NORMAL; + break; + } + + return acl_prio; +} + +/** + * ice_flow_acl_union_rng_chk - Perform union operation between two + * range-range checker buffers + * @dst_buf: pointer to destination range checker buffer + * @src_buf: pointer to source range checker buffer + * + * For this function, we do the union between dst_buf and src_buf + * range checker buffer, and we will save the result back to dst_buf + */ +static enum ice_status +ice_flow_acl_union_rng_chk(struct ice_aqc_acl_profile_ranges *dst_buf, + struct ice_aqc_acl_profile_ranges *src_buf) +{ + u8 i, j; + + if (!dst_buf || !src_buf) + return ICE_ERR_BAD_PTR; + + for (i = 0; i < ICE_AQC_ACL_PROF_RANGES_NUM_CFG; i++) { + struct ice_acl_rng_data *cfg_data = NULL, *in_data; + bool will_populate = false; + + in_data = &src_buf->checker_cfg[i]; + + if (!in_data->mask) + break; + + for (j = 0; j < ICE_AQC_ACL_PROF_RANGES_NUM_CFG; j++) { + cfg_data = &dst_buf->checker_cfg[j]; + + if (!cfg_data->mask || + !memcmp(cfg_data, in_data, + sizeof(struct ice_acl_rng_data))) { + will_populate = true; + break; + } + } + + if (will_populate) { + memcpy(cfg_data, in_data, + sizeof(struct ice_acl_rng_data)); + } else { + /* No available slot left to program range checker */ + return ICE_ERR_MAX_LIMIT; + } + } + + return 0; +} + +/** + * ice_flow_acl_add_scen_entry_sync - Add entry to ACL scenario sync + * @hw: pointer to the hardware structure + * @prof: pointer to flow profile + * @entry: double pointer to the flow entry + * + * For this function, we will look at the current added entries in the + * corresponding ACL scenario. Then, we will perform matching logic to + * see if we want to add/modify/do nothing with this new entry. 
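+ *
+ * As a worked example of that matching logic (see
+ * ice_flow_acl_find_scen_entry_cond() above):
+ *
+ *	same key, same priority, same actions  -> reuse the existing entry
+ *	same key, same priority, new actions   -> reprogram the action memory
+ *	same key, different priority           -> remove old entry, add new one
+ *	no entry with a matching key           -> add the new entry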
+ */ +static enum ice_status +ice_flow_acl_add_scen_entry_sync(struct ice_hw *hw, struct ice_flow_prof *prof, + struct ice_flow_entry **entry) +{ + bool do_add_entry, do_rem_entry, do_chg_action, do_chg_rng_chk; + struct ice_aqc_acl_profile_ranges query_rng_buf, cfg_rng_buf; + struct ice_acl_act_entry *acts = NULL; + struct ice_flow_entry *exist; + enum ice_status status = 0; + struct ice_flow_entry *e; + u8 i; + + if (!entry || !(*entry) || !prof) + return ICE_ERR_BAD_PTR; + + e = *entry; + + do_chg_rng_chk = false; + if (e->range_buf) { + u8 prof_id = 0; + + status = ice_flow_get_hw_prof(hw, ICE_BLK_ACL, prof->id, + &prof_id); + if (status) + return status; + + /* Query the current range-checker value in FW */ + status = ice_query_acl_prof_ranges(hw, prof_id, &query_rng_buf, + NULL); + if (status) + return status; + memcpy(&cfg_rng_buf, &query_rng_buf, + sizeof(struct ice_aqc_acl_profile_ranges)); + + /* Generate the new range-checker value */ + status = ice_flow_acl_union_rng_chk(&cfg_rng_buf, e->range_buf); + if (status) + return status; + + /* Reconfigure the range check if the buffer is changed. */ + do_chg_rng_chk = false; + if (memcmp(&query_rng_buf, &cfg_rng_buf, + sizeof(struct ice_aqc_acl_profile_ranges))) { + status = ice_prog_acl_prof_ranges(hw, prof_id, + &cfg_rng_buf, NULL); + if (status) + return status; + + do_chg_rng_chk = true; + } + } + + /* Figure out if we want to (change the ACL action) and/or + * (Add the new ACL entry) and/or (Remove the current ACL entry) + */ + exist = ice_flow_acl_find_scen_entry_cond(prof, e, &do_chg_action, + &do_add_entry, &do_rem_entry); + if (do_rem_entry) { + status = ice_flow_rem_entry_sync(hw, ICE_BLK_ACL, exist); + if (status) + return status; + } + + /* Prepare the result action buffer */ + acts = devm_kcalloc(ice_hw_to_dev(hw), e->entry_sz, + sizeof(struct ice_acl_act_entry), GFP_KERNEL); + if (!acts) + return ICE_ERR_NO_MEMORY; + + for (i = 0; i < e->acts_cnt; i++) + memcpy(&acts[i], &e->acts[i].data.acl_act, + sizeof(struct ice_acl_act_entry)); + + if (do_add_entry) { + enum ice_acl_entry_prio prio; + u8 *keys, *inverts; + u16 entry_idx; + + keys = (u8 *)e->entry; + inverts = keys + (e->entry_sz / 2); + prio = ice_flow_acl_convert_to_acl_prio(e->priority); + + status = ice_acl_add_entry(hw, prof->cfg.scen, prio, keys, + inverts, acts, e->acts_cnt, + &entry_idx); + if (status) + goto out; + + e->scen_entry_idx = entry_idx; + list_add(&e->l_entry, &prof->entries); + } else { + if (do_chg_action) { + /* For the action memory info, update the SW's copy of + * exist entry with e's action memory info + */ + devm_kfree(ice_hw_to_dev(hw), exist->acts); + exist->acts_cnt = e->acts_cnt; + exist->acts = devm_kcalloc(ice_hw_to_dev(hw), + exist->acts_cnt, + sizeof(struct ice_flow_action), + GFP_KERNEL); + if (!exist->acts) { + status = ICE_ERR_NO_MEMORY; + goto out; + } + + memcpy(exist->acts, e->acts, + sizeof(struct ice_flow_action) * e->acts_cnt); + + status = ice_acl_prog_act(hw, prof->cfg.scen, acts, + e->acts_cnt, + exist->scen_entry_idx); + if (status) + goto out; + } + + if (do_chg_rng_chk) { + /* In this case, we want to update the range checker + * information of the exist entry + */ + status = ice_flow_acl_union_rng_chk(exist->range_buf, + e->range_buf); + if (status) + goto out; + } + + /* As we don't add the new entry to our SW DB, deallocate its + * memories, and return the exist entry to the caller + */ + ice_dealloc_flow_entry(hw, e); + *(entry) = exist; + } +out: + devm_kfree(ice_hw_to_dev(hw), acts); + + return status; +} + 
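+/* Illustrative ACL usage sketch. This is not lifted from an existing caller;
+ * the profile ID, entry ID, VSI handle and hw pointer are assumed to exist,
+ * and scenario allocation plus error handling are omitted. It only shows how
+ * the profile/entry helpers above and below fit together for a rule that
+ * drops packets from IPv4 source 192.168.1.1:
+ *
+ *	struct ice_flow_action act = { .type = ICE_FLOW_ACT_DROP };
+ *	struct ice_flow_seg_info seg = {};
+ *	struct ice_flow_prof *prof;
+ *	struct {
+ *		__be32 sip;
+ *		__be32 sip_mask;
+ *	} data = {
+ *		.sip = cpu_to_be32(0xc0a80101),
+ *		.sip_mask = cpu_to_be32(0xffffffff),
+ *	};
+ *	u64 entry_h;
+ *
+ *	ice_flow_set_fld(&seg, ICE_FLOW_FIELD_IDX_IPV4_SA,
+ *			 offsetof(typeof(data), sip),
+ *			 offsetof(typeof(data), sip_mask),
+ *			 ICE_FLOW_FLD_OFF_INVAL, false);
+ *	ice_flow_add_prof(hw, ICE_BLK_ACL, ICE_FLOW_RX, prof_id,
+ *			  &seg, 1, &act, 1, &prof);
+ *	ice_flow_add_entry(hw, ICE_BLK_ACL, prof_id, entry_id, vsi_handle,
+ *			   ICE_FLOW_PRIO_NORMAL, &data, &act, 1, &entry_h);
+ *
+ * For ICE_BLK_ACL, ice_flow_add_entry() builds the key/key-inverse pair via
+ * ice_flow_acl_frmt_entry() and then programs or updates the scenario entry
+ * via ice_flow_acl_add_scen_entry() below.
+ */
+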
+/** + * ice_flow_acl_add_scen_entry - Add entry to ACL scenario + * @hw: pointer to the hardware structure + * @prof: pointer to flow profile + * @e: double pointer to the flow entry + */ +static enum ice_status +ice_flow_acl_add_scen_entry(struct ice_hw *hw, struct ice_flow_prof *prof, + struct ice_flow_entry **e) +{ + enum ice_status status; + + mutex_lock(&prof->entries_lock); + status = ice_flow_acl_add_scen_entry_sync(hw, prof, e); + mutex_unlock(&prof->entries_lock); + + return status; +} + +/** + * ice_flow_add_entry - Add a flow entry + * @hw: pointer to the HW struct + * @blk: classification stage + * @prof_id: ID of the profile to add a new flow entry to + * @entry_id: unique ID to identify this flow entry + * @vsi_handle: software VSI handle for the flow entry + * @prio: priority of the flow entry + * @data: pointer to a data buffer containing flow entry's match values/masks + * @acts: arrays of actions to be performed on a match + * @acts_cnt: number of actions + * @entry_h: pointer to buffer that receives the new flow entry's handle + */ +enum ice_status +ice_flow_add_entry(struct ice_hw *hw, enum ice_block blk, u64 prof_id, + u64 entry_id, u16 vsi_handle, enum ice_flow_priority prio, + void *data, struct ice_flow_action *acts, u8 acts_cnt, + u64 *entry_h) +{ + struct ice_flow_entry *e = NULL; + struct ice_flow_prof *prof; + enum ice_status status = 0; + + /* ACL entries must indicate an action */ + if (blk == ICE_BLK_ACL && (!acts || !acts_cnt)) + return ICE_ERR_PARAM; + + + /* No flow entry data is expected for RSS */ + if (!entry_h || (!data && blk != ICE_BLK_RSS)) + return ICE_ERR_BAD_PTR; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + mutex_lock(&hw->fl_profs_locks[blk]); + + prof = ice_flow_find_prof_id(hw, blk, prof_id); + if (!prof) { + status = ICE_ERR_DOES_NOT_EXIST; + } else { + /* Allocate memory for the entry being added and associate + * the VSI to the found flow profile + */ + e = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*e), GFP_KERNEL); + if (!e) + status = ICE_ERR_NO_MEMORY; + else + status = ice_flow_assoc_prof(hw, blk, prof, vsi_handle); + } + + mutex_unlock(&hw->fl_profs_locks[blk]); + if (status) + goto out; + + e->id = entry_id; + e->vsi_handle = vsi_handle; + e->prof = prof; + e->priority = prio; + + switch (blk) { + case ICE_BLK_FD: + case ICE_BLK_RSS: + break; + case ICE_BLK_ACL: + /* ACL will handle the entry management */ + if (ice_dcf_is_acl_capable(hw)) { + status = ICE_ERR_IN_USE; + goto out; + } + status = ice_flow_acl_frmt_entry(hw, prof, e, (u8 *)data, acts, + acts_cnt); + if (status) + goto out; + + status = ice_flow_acl_add_scen_entry(hw, prof, &e); + if (status) + goto out; + + break; + default: + status = ICE_ERR_NOT_IMPL; + goto out; + } + + if (blk != ICE_BLK_ACL) { + /* ACL will handle the entry management */ + mutex_lock(&prof->entries_lock); + list_add(&e->l_entry, &prof->entries); + mutex_unlock(&prof->entries_lock); + } + + *entry_h = ICE_FLOW_ENTRY_HNDL(e); + +out: + if (status && e) { + if (e->entry) + devm_kfree(ice_hw_to_dev(hw), e->entry); + devm_kfree(ice_hw_to_dev(hw), e); + } + + return status; +} + +/** + * ice_flow_rem_entry - Remove a flow entry + * @hw: pointer to the HW struct + * @blk: classification stage + * @entry_h: handle to the flow entry to be removed + */ +enum ice_status ice_flow_rem_entry(struct ice_hw *hw, enum ice_block blk, + u64 entry_h) +{ + struct ice_flow_entry *entry; + struct ice_flow_prof *prof; + enum ice_status status = 0; + + if (entry_h == ICE_FLOW_ENTRY_HANDLE_INVAL) + 
return ICE_ERR_PARAM; + + entry = ICE_FLOW_ENTRY_PTR(entry_h); + + /* Retain the pointer to the flow profile as the entry will be freed */ + prof = entry->prof; + + if (prof) { + mutex_lock(&prof->entries_lock); + status = ice_flow_rem_entry_sync(hw, blk, entry); + mutex_unlock(&prof->entries_lock); + } + + return status; +} + +/** + * ice_flow_set_fld_ext - specifies locations of field from entry's input buffer + * @seg: packet segment the field being set belongs to + * @fld: field to be set + * @field_type: type of the field + * @val_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of the value to match from + * entry's input buffer + * @mask_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of mask value from entry's + * input buffer + * @last_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of last/upper value from + * entry's input buffer + * + * This helper function stores information of a field being matched, including + * the type of the field and the locations of the value to match, the mask, and + * the upper-bound value in the start of the input buffer for a flow entry. + * This function should only be used for fixed-size data structures. + * + * This function also opportunistically determines the protocol headers to be + * present based on the fields being set. Some fields cannot be used alone to + * determine the protocol headers present. Sometimes, fields for particular + * protocol headers are not matched. In those cases, the protocol headers + * must be explicitly set. + */ +static void +ice_flow_set_fld_ext(struct ice_flow_seg_info *seg, enum ice_flow_field fld, + enum ice_flow_fld_match_type field_type, u16 val_loc, + u16 mask_loc, u16 last_loc) +{ + u64 bit = BIT_ULL(fld); + + seg->match |= bit; + if (field_type == ICE_FLOW_FLD_TYPE_RANGE) + seg->range |= bit; + + seg->fields[fld].type = field_type; + seg->fields[fld].src.val = val_loc; + seg->fields[fld].src.mask = mask_loc; + seg->fields[fld].src.last = last_loc; + + ICE_FLOW_SET_HDRS(seg, ice_flds_info[fld].hdr); +} + +/** + * ice_flow_set_fld - specifies locations of field from entry's input buffer + * @seg: packet segment the field being set belongs to + * @fld: field to be set + * @val_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of the value to match from + * entry's input buffer + * @mask_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of mask value from entry's + * input buffer + * @last_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of last/upper value from + * entry's input buffer + * @range: indicate if field being matched is to be in a range + * + * This function specifies the locations, in the form of byte offsets from the + * start of the input buffer for a flow entry, from where the value to match, + * the mask value, and upper value can be extracted. These locations are then + * stored in the flow profile. When adding a flow entry associated with the + * flow profile, these locations will be used to quickly extract the values and + * create the content of a match entry. This function should only be used for + * fixed-size data structures. + */ +void +ice_flow_set_fld(struct ice_flow_seg_info *seg, enum ice_flow_field fld, + u16 val_loc, u16 mask_loc, u16 last_loc, bool range) +{ + enum ice_flow_fld_match_type t = range ? 
+ ICE_FLOW_FLD_TYPE_RANGE : ICE_FLOW_FLD_TYPE_REG; + + ice_flow_set_fld_ext(seg, fld, t, val_loc, mask_loc, last_loc); +} + +/** + * ice_flow_set_fld_prefix - sets locations of prefix field from entry's buf + * @seg: packet segment the field being set belongs to + * @fld: field to be set + * @val_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of the value to match from + * entry's input buffer + * @pref_loc: location of prefix value from entry's input buffer + * @pref_sz: size of the location holding the prefix value + * + * This function specifies the locations, in the form of byte offsets from the + * start of the input buffer for a flow entry, from where the value to match + * and the IPv4 prefix value can be extracted. These locations are then stored + * in the flow profile. When adding flow entries to the associated flow profile, + * these locations can be used to quickly extract the values to create the + * content of a match entry. This function should only be used for fixed-size + * data structures. + */ +void +ice_flow_set_fld_prefix(struct ice_flow_seg_info *seg, enum ice_flow_field fld, + u16 val_loc, u16 pref_loc, u8 pref_sz) +{ + /* For this type of field, the "mask" location is for the prefix value's + * location and the "last" location is for the size of the location of + * the prefix value. + */ + ice_flow_set_fld_ext(seg, fld, ICE_FLOW_FLD_TYPE_PREFIX, val_loc, + pref_loc, (u16)pref_sz); +} + +/** + * ice_flow_add_fld_raw - sets locations of a raw field from entry's input buf + * @seg: packet segment the field being set belongs to + * @off: offset of the raw field from the beginning of the segment in bytes + * @len: length of the raw pattern to be matched + * @val_loc: location of the value to match from entry's input buffer + * @mask_loc: location of mask value from entry's input buffer + * + * This function specifies the offset of the raw field to be match from the + * beginning of the specified packet segment, and the locations, in the form of + * byte offsets from the start of the input buffer for a flow entry, from where + * the value to match and the mask value to be extracted. These locations are + * then stored in the flow profile. When adding flow entries to the associated + * flow profile, these locations can be used to quickly extract the values to + * create the content of a match entry. This function should only be used for + * fixed-size data structures. + */ +void +ice_flow_add_fld_raw(struct ice_flow_seg_info *seg, u16 off, u8 len, + u16 val_loc, u16 mask_loc) +{ + if (seg->raws_cnt < ICE_FLOW_SEG_RAW_FLD_MAX) { + seg->raws[seg->raws_cnt].off = off; + seg->raws[seg->raws_cnt].info.type = ICE_FLOW_FLD_TYPE_SIZE; + seg->raws[seg->raws_cnt].info.src.val = val_loc; + seg->raws[seg->raws_cnt].info.src.mask = mask_loc; + /* The "last" field is used to store the length of the field */ + seg->raws[seg->raws_cnt].info.src.last = len; + } + + /* Overflows of "raws" will be handled as an error condition later in + * the flow when this information is processed. + */ + seg->raws_cnt++; +} + +/** + * ice_flow_rem_vsi_prof - remove vsi from flow profile + * @hw: pointer to the hardware structure + * @blk: classification stage + * @vsi_handle: software VSI handle + * @prof_id: unique ID to identify this flow profile + * + * This function removes the flow entries associated to the input + * vsi handle and disassociates the vsi from the flow profile. 
+ */ +enum ice_status ice_flow_rem_vsi_prof(struct ice_hw *hw, enum ice_block blk, u16 vsi_handle, + u64 prof_id) +{ + struct ice_flow_prof *prof = NULL; + enum ice_status status = 0; + + if (blk >= ICE_BLK_COUNT || !ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + /* find flow profile pointer with input package block and profile id */ + prof = ice_flow_find_prof_id(hw, ICE_BLK_FD, prof_id); + if (!prof) { + ice_debug(hw, ICE_DBG_PKG, + "Cannot find flow profile id=%llu\n", prof_id); + return ICE_ERR_DOES_NOT_EXIST; + } + + /* Remove all remaining flow entries before removing the flow profile */ + if (!list_empty(&prof->entries)) { + struct ice_flow_entry *e, *t; + + mutex_lock(&prof->entries_lock); + list_for_each_entry_safe(e, t, &prof->entries, l_entry) { + if (e->vsi_handle != vsi_handle) + continue; + + status = ice_flow_rem_entry_sync(hw, blk, e); + if (status) + break; + } + mutex_unlock(&prof->entries_lock); + } + if (status) + return status; + + /* disassociate the flow profile from sw vsi handle */ + status = ice_flow_disassoc_prof(hw, blk, prof, vsi_handle); + if (status) + ice_debug(hw, ICE_DBG_PKG, + "ice_flow_disassoc_prof() failed with status=%d\n", + status); + return status; +} + +#define ICE_FLOW_RSS_SEG_HDR_L2_MASKS \ +(ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_VLAN) + +#define ICE_FLOW_RSS_SEG_HDR_L3_MASKS \ + (ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV6) + +#define ICE_FLOW_RSS_SEG_HDR_L4_MASKS \ + (ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_SCTP) + +#define ICE_FLOW_RSS_SEG_HDR_VAL_MASKS \ + (ICE_FLOW_RSS_SEG_HDR_L2_MASKS | \ + ICE_FLOW_RSS_SEG_HDR_L3_MASKS | \ + ICE_FLOW_RSS_SEG_HDR_L4_MASKS) + +/** + * ice_flow_set_rss_seg_info - setup packet segments for RSS + * @segs: pointer to the flow field segment(s) + * @seg_cnt: segment count + * @cfg: configure parameters + * + * Helper function to extract fields from hash bitmap and use flow + * header value to set flow field segment for further use in flow + * profile entry or removal. 
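+ *
+ * For example, with cfg->hash_flds == ICE_HASH_TCP_IPV4 and
+ * cfg->addl_hdrs == (ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_TCP), the
+ * innermost segment ends up with ICE_FLOW_FIELD_IDX_IPV4_SA/DA and
+ * ICE_FLOW_FIELD_IDX_TCP_SRC/DST_PORT set via ice_flow_set_fld(), and with
+ * those protocol headers added to its hdrs bitmap.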
+ */ +static enum ice_status +ice_flow_set_rss_seg_info(struct ice_flow_seg_info *segs, u8 seg_cnt, + const struct ice_rss_hash_cfg *cfg) +{ + struct ice_flow_seg_info *seg; + u64 val; + u8 i; + + /* set inner most segment */ + seg = &segs[seg_cnt - 1]; + + for_each_set_bit(i, (const unsigned long *)&cfg->hash_flds, + ICE_FLOW_FIELD_IDX_MAX) + ice_flow_set_fld(seg, (enum ice_flow_field)i, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + + ICE_FLOW_SET_HDRS(seg, cfg->addl_hdrs); + + /* set outer most header */ + if (cfg->hdr_type == ICE_RSS_INNER_HEADERS_W_OUTER_IPV4) + segs[ICE_RSS_OUTER_HEADERS].hdrs |= ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_FRAG | + ICE_FLOW_SEG_HDR_IPV_OTHER; + else if (cfg->hdr_type == ICE_RSS_INNER_HEADERS_W_OUTER_IPV6) + segs[ICE_RSS_OUTER_HEADERS].hdrs |= ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_FRAG | + ICE_FLOW_SEG_HDR_IPV_OTHER; + + if (seg->hdrs & ~ICE_FLOW_RSS_SEG_HDR_VAL_MASKS & + ~ICE_FLOW_RSS_HDRS_INNER_MASK & ~ICE_FLOW_SEG_HDR_IPV_OTHER & + ~ICE_FLOW_SEG_HDR_IPV_FRAG) + return ICE_ERR_PARAM; + + val = (u64)(seg->hdrs & ICE_FLOW_RSS_SEG_HDR_L3_MASKS); + if (val && !is_power_of_2(val)) + return ICE_ERR_CFG; + + val = (u64)(seg->hdrs & ICE_FLOW_RSS_SEG_HDR_L4_MASKS); + if (val && !is_power_of_2(val)) + return ICE_ERR_CFG; + + return 0; +} + +/** + * ice_rem_vsi_rss_list - remove VSI from RSS list + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * + * Remove the VSI from all RSS configurations in the list. + */ +void ice_rem_vsi_rss_list(struct ice_hw *hw, u16 vsi_handle) +{ + struct ice_rss_cfg *r, *tmp; + + if (list_empty(&hw->rss_list_head)) + return; + + mutex_lock(&hw->rss_locks); + list_for_each_entry_safe(r, tmp, &hw->rss_list_head, l_entry) + if (test_and_clear_bit(vsi_handle, r->vsis)) + if (bitmap_empty(r->vsis, ICE_MAX_VSI)) { + list_del(&r->l_entry); + devm_kfree(ice_hw_to_dev(hw), r); + } + mutex_unlock(&hw->rss_locks); +} + +/** + * ice_rem_vsi_rss_cfg - remove RSS configurations associated with VSI + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * + * This function will iterate through all flow profiles and disassociate + * the VSI from that profile. If the flow profile has no VSIs it will + * be removed. 
+ */ +enum ice_status ice_rem_vsi_rss_cfg(struct ice_hw *hw, u16 vsi_handle) +{ + const enum ice_block blk = ICE_BLK_RSS; + struct ice_flow_prof *p, *t; + enum ice_status status = 0; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + if (list_empty(&hw->fl_profs[blk])) + return 0; + + mutex_lock(&hw->rss_locks); + list_for_each_entry_safe(p, t, &hw->fl_profs[blk], l_entry) + if (test_bit(vsi_handle, p->vsis)) { + status = ice_flow_disassoc_prof(hw, blk, p, vsi_handle); + if (status) + break; + + if (bitmap_empty(p->vsis, ICE_MAX_VSI)) { + status = ice_flow_rem_prof(hw, blk, p->id); + if (status) + break; + } + } + mutex_unlock(&hw->rss_locks); + + return status; +} + +/** + * ice_get_rss_hdr_type - get a RSS profile's header type + * @prof: RSS flow profile + */ +static enum ice_rss_cfg_hdr_type +ice_get_rss_hdr_type(struct ice_flow_prof *prof) +{ + enum ice_rss_cfg_hdr_type hdr_type = ICE_RSS_ANY_HEADERS; + + if (prof->segs_cnt == ICE_FLOW_SEG_SINGLE) { + hdr_type = ICE_RSS_OUTER_HEADERS; + } else if (prof->segs_cnt == ICE_FLOW_SEG_MAX) { + if (prof->segs[ICE_RSS_OUTER_HEADERS].hdrs == ICE_FLOW_SEG_HDR_NONE) + hdr_type = ICE_RSS_INNER_HEADERS; + if (prof->segs[ICE_RSS_OUTER_HEADERS].hdrs & ICE_FLOW_SEG_HDR_IPV4) + hdr_type = ICE_RSS_INNER_HEADERS_W_OUTER_IPV4; + if (prof->segs[ICE_RSS_OUTER_HEADERS].hdrs & ICE_FLOW_SEG_HDR_IPV6) + hdr_type = ICE_RSS_INNER_HEADERS_W_OUTER_IPV6; + } + + return hdr_type; +} + +/** + * ice_rem_rss_list - remove RSS configuration from list + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @prof: pointer to flow profile + * + * Assumption: lock has already been acquired for RSS list + */ +static void +ice_rem_rss_list(struct ice_hw *hw, u16 vsi_handle, struct ice_flow_prof *prof) +{ + enum ice_rss_cfg_hdr_type hdr_type; + struct ice_rss_cfg *r, *tmp; + + /* Search for RSS hash fields associated to the VSI that match the + * hash configurations associated to the flow profile. If found + * remove from the RSS entry list of the VSI context and delete entry. 
+ */ + hdr_type = ice_get_rss_hdr_type(prof); + list_for_each_entry_safe(r, tmp, &hw->rss_list_head, l_entry) + if (r->hash.hash_flds == prof->segs[prof->segs_cnt - 1].match && + r->hash.addl_hdrs == prof->segs[prof->segs_cnt - 1].hdrs && + r->hash.hdr_type == hdr_type) { + clear_bit(vsi_handle, r->vsis); + if (bitmap_empty(r->vsis, ICE_MAX_VSI)) { + list_del(&r->l_entry); + devm_kfree(ice_hw_to_dev(hw), r); + } + return; + } +} + +/** + * ice_add_rss_list - add RSS configuration to list + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @prof: pointer to flow profile + * + * Assumption: lock has already been acquired for RSS list + */ +static enum ice_status +ice_add_rss_list(struct ice_hw *hw, u16 vsi_handle, struct ice_flow_prof *prof) +{ + enum ice_rss_cfg_hdr_type hdr_type; + struct ice_rss_cfg *r, *rss_cfg; + + hdr_type = ice_get_rss_hdr_type(prof); + list_for_each_entry(r, &hw->rss_list_head, l_entry) + if (r->hash.hash_flds == prof->segs[prof->segs_cnt - 1].match && + r->hash.addl_hdrs == prof->segs[prof->segs_cnt - 1].hdrs && + r->hash.hdr_type == hdr_type) { + set_bit(vsi_handle, r->vsis); + return 0; + } + + rss_cfg = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*rss_cfg), + GFP_KERNEL); + if (!rss_cfg) + return ICE_ERR_NO_MEMORY; + + rss_cfg->hash.hash_flds = prof->segs[prof->segs_cnt - 1].match; + rss_cfg->hash.addl_hdrs = prof->segs[prof->segs_cnt - 1].hdrs; + rss_cfg->hash.hdr_type = hdr_type; + rss_cfg->hash.symm = prof->cfg.symm; + set_bit(vsi_handle, rss_cfg->vsis); + + list_add_tail(&rss_cfg->l_entry, &hw->rss_list_head); + + return 0; +} + +#define ICE_FLOW_PROF_HASH_S 0 +#define ICE_FLOW_PROF_HASH_M (0xFFFFFFFFULL << ICE_FLOW_PROF_HASH_S) +#define ICE_FLOW_PROF_HDR_S 32 +#define ICE_FLOW_PROF_HDR_M (0x3FFFFFFFULL << ICE_FLOW_PROF_HDR_S) +#define ICE_FLOW_PROF_ENCAP_S 62 +#define ICE_FLOW_PROF_ENCAP_M (0x3ULL << ICE_FLOW_PROF_ENCAP_S) + +/* Flow profile ID format: + * [0:31] - Packet match fields + * [32:61] - Protocol header + * [62:63] - Encapsulation flag: + * 0 if non-tunneled + * 1 if tunneled + * 2 for tunneled with outer ipv4 + * 3 for tunneled with outer ipv6 + */ +#define ICE_FLOW_GEN_PROFID(hash, hdr, encap) \ + ((u64)(((u64)(hash) & ICE_FLOW_PROF_HASH_M) | \ + (((u64)(hdr) << ICE_FLOW_PROF_HDR_S) & ICE_FLOW_PROF_HDR_M) | \ + (((u64)(encap) << ICE_FLOW_PROF_ENCAP_S) & ICE_FLOW_PROF_ENCAP_M))) + +static void +ice_rss_config_xor_word(struct ice_hw *hw, u8 prof_id, u8 src, u8 dst) +{ + u32 s = ((src % 4) << 3); /* byte shift */ + u32 v = dst | 0x80; /* value to program */ + u8 i = src / 4; /* register index */ + u32 reg; + + reg = rd32(hw, GLQF_HSYMM(prof_id, i)); + reg = (reg & ~(0xff << s)) | (v << s); + wr32(hw, GLQF_HSYMM(prof_id, i), reg); +} + +static void +ice_rss_config_xor(struct ice_hw *hw, u8 prof_id, u8 src, u8 dst, u8 len) +{ + int fv_last_word = + ICE_FLOW_SW_FIELD_VECTOR_MAX / ICE_FLOW_FV_EXTRACT_SZ - 1; + int i; + + for (i = 0; i < len; i++) { + ice_rss_config_xor_word(hw, prof_id, + /* Yes, field vector in GLQF_HSYMM and + * GLQF_HINSET is inversed! 
+ */ + fv_last_word - (src + i), + fv_last_word - (dst + i)); + ice_rss_config_xor_word(hw, prof_id, + fv_last_word - (dst + i), + fv_last_word - (src + i)); + } +} + +static void +ice_rss_update_symm(struct ice_hw *hw, + struct ice_flow_prof *prof) +{ + struct ice_prof_map *map; + u8 prof_id, m; + + mutex_lock(&hw->blk[ICE_BLK_RSS].es.prof_map_lock); + map = ice_search_prof_id(hw, ICE_BLK_RSS, prof->id); + if (map) + prof_id = map->prof_id; + mutex_unlock(&hw->blk[ICE_BLK_RSS].es.prof_map_lock); + if (!map) + return; + /* clear to default */ + for (m = 0; m < 6; m++) + wr32(hw, GLQF_HSYMM(prof_id, m), 0); + if (prof->cfg.symm) { + struct ice_flow_seg_info *seg = + &prof->segs[prof->segs_cnt - 1]; + + struct ice_flow_seg_xtrct *ipv4_src = + &seg->fields[ICE_FLOW_FIELD_IDX_IPV4_SA].xtrct; + struct ice_flow_seg_xtrct *ipv4_dst = + &seg->fields[ICE_FLOW_FIELD_IDX_IPV4_DA].xtrct; + struct ice_flow_seg_xtrct *ipv6_src = + &seg->fields[ICE_FLOW_FIELD_IDX_IPV6_SA].xtrct; + struct ice_flow_seg_xtrct *ipv6_dst = + &seg->fields[ICE_FLOW_FIELD_IDX_IPV6_DA].xtrct; + + struct ice_flow_seg_xtrct *tcp_src = + &seg->fields[ICE_FLOW_FIELD_IDX_TCP_SRC_PORT].xtrct; + struct ice_flow_seg_xtrct *tcp_dst = + &seg->fields[ICE_FLOW_FIELD_IDX_TCP_DST_PORT].xtrct; + + struct ice_flow_seg_xtrct *udp_src = + &seg->fields[ICE_FLOW_FIELD_IDX_UDP_SRC_PORT].xtrct; + struct ice_flow_seg_xtrct *udp_dst = + &seg->fields[ICE_FLOW_FIELD_IDX_UDP_DST_PORT].xtrct; + + struct ice_flow_seg_xtrct *sctp_src = + &seg->fields[ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT].xtrct; + struct ice_flow_seg_xtrct *sctp_dst = + &seg->fields[ICE_FLOW_FIELD_IDX_SCTP_DST_PORT].xtrct; + + /* xor IPv4 */ + if (ipv4_src->prot_id != 0 && ipv4_dst->prot_id != 0) + ice_rss_config_xor(hw, prof_id, + ipv4_src->idx, ipv4_dst->idx, 2); + + /* xor IPv6 */ + if (ipv6_src->prot_id != 0 && ipv6_dst->prot_id != 0) + ice_rss_config_xor(hw, prof_id, + ipv6_src->idx, ipv6_dst->idx, 8); + + /* xor TCP */ + if (tcp_src->prot_id != 0 && tcp_dst->prot_id != 0) + ice_rss_config_xor(hw, prof_id, + tcp_src->idx, tcp_dst->idx, 1); + + /* xor UDP */ + if (udp_src->prot_id != 0 && udp_dst->prot_id != 0) + ice_rss_config_xor(hw, prof_id, + udp_src->idx, udp_dst->idx, 1); + + /* xor SCTP */ + if (sctp_src->prot_id != 0 && sctp_dst->prot_id != 0) + ice_rss_config_xor(hw, prof_id, + sctp_src->idx, sctp_dst->idx, 1); + } +} + +/** + * ice_add_rss_cfg_sync - add an RSS configuration + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @cfg: configure parameters + * + * Assumption: lock has already been acquired for RSS list + */ +static enum ice_status +ice_add_rss_cfg_sync(struct ice_hw *hw, u16 vsi_handle, + const struct ice_rss_hash_cfg *cfg) +{ + const enum ice_block blk = ICE_BLK_RSS; + struct ice_flow_prof *prof = NULL; + struct ice_flow_seg_info *segs; + enum ice_status status; + u8 segs_cnt; + + + segs_cnt = (cfg->hdr_type == ICE_RSS_OUTER_HEADERS) ? + ICE_FLOW_SEG_SINGLE : ICE_FLOW_SEG_MAX; + + segs = devm_kcalloc(ice_hw_to_dev(hw), segs_cnt, sizeof(*segs), + GFP_KERNEL); + if (!segs) + return ICE_ERR_NO_MEMORY; + + /* Construct the packet segment info from the hashed fields */ + status = ice_flow_set_rss_seg_info(segs, segs_cnt, cfg); + if (status) + goto exit; + + /* Search for a flow profile that has matching headers, hash fields + * and has the input VSI associated to it. If found, no further + * operations required and exit. 
+ */ + prof = ice_flow_find_prof_conds(hw, blk, ICE_FLOW_RX, segs, segs_cnt, + vsi_handle, + ICE_FLOW_FIND_PROF_CHK_FLDS | + ICE_FLOW_FIND_PROF_CHK_VSI); + if (prof) { + if (prof->cfg.symm == cfg->symm) + goto exit; + prof->cfg.symm = cfg->symm; + goto update_symm; + } + + /* Check if a flow profile exists with the same protocol headers and + * associated with the input VSI. If so disassociate the VSI from + * this profile. The VSI will be added to a new profile created with + * the protocol header and new hash field configuration. + */ + prof = ice_flow_find_prof_conds(hw, blk, ICE_FLOW_RX, segs, segs_cnt, + vsi_handle, ICE_FLOW_FIND_PROF_CHK_VSI); + if (prof) { + status = ice_flow_disassoc_prof(hw, blk, prof, vsi_handle); + if (!status) + ice_rem_rss_list(hw, vsi_handle, prof); + else + goto exit; + + /* Remove profile if it has no VSIs associated */ + if (bitmap_empty(prof->vsis, ICE_MAX_VSI)) { + status = ice_flow_rem_prof(hw, blk, prof->id); + if (status) + goto exit; + } + } + + /* Search for a profile that has same match fields only. If this + * exists then associate the VSI to this profile. + */ + prof = ice_flow_find_prof_conds(hw, blk, ICE_FLOW_RX, segs, segs_cnt, + vsi_handle, + ICE_FLOW_FIND_PROF_CHK_FLDS); + if (prof) { + if (prof->cfg.symm == cfg->symm) { + status = ice_flow_assoc_prof(hw, blk, prof, + vsi_handle); + if (!status) + status = ice_add_rss_list(hw, vsi_handle, + prof); + } else { + /* if a profile exist but with different symmetric + * requirement, just return error. + */ + status = ICE_ERR_NOT_SUPPORTED; + } + goto exit; + } + + /* Create a new flow profile with generated profile and packet + * segment information. + */ + status = ice_flow_add_prof(hw, blk, ICE_FLOW_RX, + ICE_FLOW_GEN_PROFID(cfg->hash_flds, + segs[segs_cnt - 1].hdrs, + cfg->hdr_type), + segs, segs_cnt, NULL, 0, &prof); + if (status) + goto exit; + + status = ice_flow_assoc_prof(hw, blk, prof, vsi_handle); + /* If association to a new flow profile failed then this profile can + * be removed. + */ + if (status) { + ice_flow_rem_prof(hw, blk, prof->id); + goto exit; + } + + status = ice_add_rss_list(hw, vsi_handle, prof); + + prof->cfg.symm = cfg->symm; +update_symm: + ice_rss_update_symm(hw, prof); + +exit: + devm_kfree(ice_hw_to_dev(hw), segs); + return status; +} + +/** + * ice_add_rss_cfg - add an RSS configuration with specified hashed fields + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @cfg: configure parameters + * + * This function will generate a flow profile based on fields associated with + * the input fields to hash on, the flow type and use the VSI number to add + * a flow entry to the profile. 
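+ *
+ * A minimal usage sketch (values are illustrative only), enabling a TCP/IPv4
+ * hash on both outer and inner headers:
+ *
+ *	struct ice_rss_hash_cfg cfg = {
+ *		.addl_hdrs = ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_TCP,
+ *		.hash_flds = ICE_HASH_TCP_IPV4,
+ *		.hdr_type = ICE_RSS_ANY_HEADERS,
+ *		.symm = false,
+ *	};
+ *
+ *	status = ice_add_rss_cfg(hw, vsi_handle, &cfg);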
+ */ +enum ice_status +ice_add_rss_cfg(struct ice_hw *hw, u16 vsi_handle, + const struct ice_rss_hash_cfg *cfg) +{ + struct ice_rss_hash_cfg local_cfg; + enum ice_status status; + + if (!ice_is_vsi_valid(hw, vsi_handle) || + !cfg || cfg->hdr_type > ICE_RSS_ANY_HEADERS || + cfg->hash_flds == ICE_HASH_INVALID) + return ICE_ERR_PARAM; + + local_cfg = *cfg; + if (cfg->hdr_type < ICE_RSS_ANY_HEADERS) { + mutex_lock(&hw->rss_locks); + status = ice_add_rss_cfg_sync(hw, vsi_handle, &local_cfg); + mutex_unlock(&hw->rss_locks); + } else { + mutex_lock(&hw->rss_locks); + local_cfg.hdr_type = ICE_RSS_OUTER_HEADERS; + status = ice_add_rss_cfg_sync(hw, vsi_handle, &local_cfg); + if (!status) { + local_cfg.hdr_type = ICE_RSS_INNER_HEADERS; + status = ice_add_rss_cfg_sync(hw, vsi_handle, + &local_cfg); + } + mutex_unlock(&hw->rss_locks); + } + + return status; +} + +/** + * ice_rem_rss_cfg_sync - remove an existing RSS configuration + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @cfg: configure parameters + * + * Assumption: lock has already been acquired for RSS list + */ +static enum ice_status +ice_rem_rss_cfg_sync(struct ice_hw *hw, u16 vsi_handle, + const struct ice_rss_hash_cfg *cfg) +{ + const enum ice_block blk = ICE_BLK_RSS; + struct ice_flow_seg_info *segs; + struct ice_flow_prof *prof; + enum ice_status status; + u8 segs_cnt; + + segs_cnt = (cfg->hdr_type == ICE_RSS_OUTER_HEADERS) ? + ICE_FLOW_SEG_SINGLE : ICE_FLOW_SEG_MAX; + segs = devm_kcalloc(ice_hw_to_dev(hw), segs_cnt, sizeof(*segs), + GFP_KERNEL); + if (!segs) + return ICE_ERR_NO_MEMORY; + + /* Construct the packet segment info from the hashed fields */ + status = ice_flow_set_rss_seg_info(segs, segs_cnt, cfg); + if (status) + goto out; + + prof = ice_flow_find_prof_conds(hw, blk, ICE_FLOW_RX, segs, segs_cnt, + vsi_handle, + ICE_FLOW_FIND_PROF_CHK_FLDS); + if (!prof) { + status = ICE_ERR_DOES_NOT_EXIST; + goto out; + } + + status = ice_flow_disassoc_prof(hw, blk, prof, vsi_handle); + if (status) + goto out; + + /* Remove RSS configuration from VSI context before deleting + * the flow profile. + */ + ice_rem_rss_list(hw, vsi_handle, prof); + + if (bitmap_empty(prof->vsis, ICE_MAX_VSI)) + status = ice_flow_rem_prof(hw, blk, prof->id); + +out: + devm_kfree(ice_hw_to_dev(hw), segs); + return status; +} + +/** + * ice_rem_rss_cfg - remove an existing RSS config with matching hashed fields + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @cfg: configure parameters + * + * This function will lookup the flow profile based on the input + * hash field bitmap, iterate through the profile entry list of + * that profile and find entry associated with input VSI to be + * removed. Calls are made to underlying flow apis which will in + * turn build or update buffers for RSS XLT1 section. 
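+ *
+ * The same ice_rss_hash_cfg that was passed to ice_add_rss_cfg() can be
+ * passed here to undo that configuration, e.g.:
+ *
+ *	status = ice_rem_rss_cfg(hw, vsi_handle, &cfg);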
+ */ +enum ice_status +ice_rem_rss_cfg(struct ice_hw *hw, u16 vsi_handle, + const struct ice_rss_hash_cfg *cfg) +{ + struct ice_rss_hash_cfg local_cfg; + enum ice_status status; + + if (!ice_is_vsi_valid(hw, vsi_handle) || + !cfg || cfg->hdr_type > ICE_RSS_ANY_HEADERS || + cfg->hash_flds == ICE_HASH_INVALID) + return ICE_ERR_PARAM; + + mutex_lock(&hw->rss_locks); + local_cfg = *cfg; + if (cfg->hdr_type < ICE_RSS_ANY_HEADERS) { + status = ice_rem_rss_cfg_sync(hw, vsi_handle, &local_cfg); + } else { + local_cfg.hdr_type = ICE_RSS_OUTER_HEADERS; + status = ice_rem_rss_cfg_sync(hw, vsi_handle, &local_cfg); + + if (!status) { + local_cfg.hdr_type = ICE_RSS_INNER_HEADERS; + status = ice_rem_rss_cfg_sync(hw, vsi_handle, + &local_cfg); + } + } + mutex_unlock(&hw->rss_locks); + + return status; +} + +/* Mapping of AVF hash bit fields to an L3-L4 hash combination. + * As the ice_flow_avf_hdr_field represent individual bit shifts in a hash, + * convert its values to their appropriate flow L3, L4 values. + */ +#define ICE_FLOW_AVF_RSS_IPV4_MASKS \ + (BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_OTHER) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_FRAG_IPV4)) +#define ICE_FLOW_AVF_RSS_TCP_IPV4_MASKS \ + (BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP_SYN_NO_ACK) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP)) +#define ICE_FLOW_AVF_RSS_UDP_IPV4_MASKS \ + (BIT_ULL(ICE_AVF_FLOW_FIELD_UNICAST_IPV4_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_MULTICAST_IPV4_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_UDP)) +#define ICE_FLOW_AVF_RSS_ALL_IPV4_MASKS \ + (ICE_FLOW_AVF_RSS_TCP_IPV4_MASKS | ICE_FLOW_AVF_RSS_UDP_IPV4_MASKS | \ + ICE_FLOW_AVF_RSS_IPV4_MASKS | BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_SCTP)) + +#define ICE_FLOW_AVF_RSS_IPV6_MASKS \ + (BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_OTHER) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_FRAG_IPV6)) +#define ICE_FLOW_AVF_RSS_UDP_IPV6_MASKS \ + (BIT_ULL(ICE_AVF_FLOW_FIELD_UNICAST_IPV6_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_MULTICAST_IPV6_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_UDP)) +#define ICE_FLOW_AVF_RSS_TCP_IPV6_MASKS \ + (BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_TCP_SYN_NO_ACK) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_TCP)) +#define ICE_FLOW_AVF_RSS_ALL_IPV6_MASKS \ + (ICE_FLOW_AVF_RSS_TCP_IPV6_MASKS | ICE_FLOW_AVF_RSS_UDP_IPV6_MASKS | \ + ICE_FLOW_AVF_RSS_IPV6_MASKS | BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_SCTP)) + +/** + * ice_add_avf_rss_cfg - add an RSS configuration for AVF driver + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @avf_hash: hash bit fields (ICE_AVF_FLOW_FIELD_*) to configure + * + * This function will take the hash bitmap provided by the AVF driver via a + * message, convert it to ICE-compatible values, and configure RSS flow + * profiles. 
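+ *
+ * Worked example: avf_hash == BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP) first has
+ * the L3 bits from ICE_FLOW_AVF_RSS_IPV4_MASKS added, and is then expanded
+ * into two RSS configurations, one hashing on ICE_FLOW_HASH_IPV4 and one on
+ * ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_TCP_PORT, both added with hdr_type
+ * ICE_RSS_ANY_HEADERS via ice_add_rss_cfg().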
+ */ +enum ice_status +ice_add_avf_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u64 avf_hash) +{ + enum ice_status status = 0; + struct ice_rss_hash_cfg hcfg; + u64 hash_flds; + + if (avf_hash == ICE_AVF_FLOW_FIELD_INVALID || + !ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + /* Make sure no unsupported bits are specified */ + if (avf_hash & ~(ICE_FLOW_AVF_RSS_ALL_IPV4_MASKS | + ICE_FLOW_AVF_RSS_ALL_IPV6_MASKS)) + return ICE_ERR_CFG; + + hash_flds = avf_hash; + + /* Always create an L3 RSS configuration for any L4 RSS configuration */ + if (hash_flds & ICE_FLOW_AVF_RSS_ALL_IPV4_MASKS) + hash_flds |= ICE_FLOW_AVF_RSS_IPV4_MASKS; + + if (hash_flds & ICE_FLOW_AVF_RSS_ALL_IPV6_MASKS) + hash_flds |= ICE_FLOW_AVF_RSS_IPV6_MASKS; + + /* Create the corresponding RSS configuration for each valid hash bit */ + while (hash_flds) { + u64 rss_hash = ICE_HASH_INVALID; + + if (hash_flds & ICE_FLOW_AVF_RSS_ALL_IPV4_MASKS) { + if (hash_flds & ICE_FLOW_AVF_RSS_IPV4_MASKS) { + rss_hash = ICE_FLOW_HASH_IPV4; + hash_flds &= ~ICE_FLOW_AVF_RSS_IPV4_MASKS; + } else if (hash_flds & + ICE_FLOW_AVF_RSS_TCP_IPV4_MASKS) { + rss_hash = ICE_FLOW_HASH_IPV4 | + ICE_FLOW_HASH_TCP_PORT; + hash_flds &= ~ICE_FLOW_AVF_RSS_TCP_IPV4_MASKS; + } else if (hash_flds & + ICE_FLOW_AVF_RSS_UDP_IPV4_MASKS) { + rss_hash = ICE_FLOW_HASH_IPV4 | + ICE_FLOW_HASH_UDP_PORT; + hash_flds &= ~ICE_FLOW_AVF_RSS_UDP_IPV4_MASKS; + } else if (hash_flds & + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_SCTP)) { + rss_hash = ICE_FLOW_HASH_IPV4 | + ICE_FLOW_HASH_SCTP_PORT; + hash_flds &= + ~BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_SCTP); + } + } else if (hash_flds & ICE_FLOW_AVF_RSS_ALL_IPV6_MASKS) { + if (hash_flds & ICE_FLOW_AVF_RSS_IPV6_MASKS) { + rss_hash = ICE_FLOW_HASH_IPV6; + hash_flds &= ~ICE_FLOW_AVF_RSS_IPV6_MASKS; + } else if (hash_flds & + ICE_FLOW_AVF_RSS_TCP_IPV6_MASKS) { + rss_hash = ICE_FLOW_HASH_IPV6 | + ICE_FLOW_HASH_TCP_PORT; + hash_flds &= ~ICE_FLOW_AVF_RSS_TCP_IPV6_MASKS; + } else if (hash_flds & + ICE_FLOW_AVF_RSS_UDP_IPV6_MASKS) { + rss_hash = ICE_FLOW_HASH_IPV6 | + ICE_FLOW_HASH_UDP_PORT; + hash_flds &= ~ICE_FLOW_AVF_RSS_UDP_IPV6_MASKS; + } else if (hash_flds & + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_SCTP)) { + rss_hash = ICE_FLOW_HASH_IPV6 | + ICE_FLOW_HASH_SCTP_PORT; + hash_flds &= + ~BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_SCTP); + } + } + + if (rss_hash == ICE_HASH_INVALID) + return ICE_ERR_OUT_OF_RANGE; + + hcfg.addl_hdrs = ICE_FLOW_SEG_HDR_NONE; + hcfg.hash_flds = rss_hash; + hcfg.symm = false; + hcfg.hdr_type = ICE_RSS_ANY_HEADERS; + status = ice_add_rss_cfg(hw, vsi_handle, &hcfg); + if (status) + break; + } + + return status; +} + +/** + * ice_replay_rss_cfg - replay RSS configurations associated with VSI + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + */ +enum ice_status ice_replay_rss_cfg(struct ice_hw *hw, u16 vsi_handle) +{ + enum ice_status status = 0; + struct ice_rss_cfg *r; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + mutex_lock(&hw->rss_locks); + list_for_each_entry(r, &hw->rss_list_head, l_entry) { + if (test_bit(vsi_handle, r->vsis)) { + status = ice_add_rss_cfg_sync(hw, vsi_handle, &r->hash); + if (status) + break; + } + } + mutex_unlock(&hw->rss_locks); + + return status; +} + +/** + * ice_get_rss_cfg - returns hashed fields for the given header types + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @hdrs: protocol header type + * + * This function will return the match fields of the first instance of flow + * profile having the given header 
types and containing input VSI + */ +u64 ice_get_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u32 hdrs) +{ + u64 rss_hash = ICE_HASH_INVALID; + struct ice_rss_cfg *r; + + /* verify if the protocol header is non zero and VSI is valid */ + if (hdrs == ICE_FLOW_SEG_HDR_NONE || !ice_is_vsi_valid(hw, vsi_handle)) + return ICE_HASH_INVALID; + + mutex_lock(&hw->rss_locks); + list_for_each_entry(r, &hw->rss_list_head, l_entry) + if (test_bit(vsi_handle, r->vsis) && + r->hash.addl_hdrs == hdrs) { + rss_hash = r->hash.hash_flds; + break; + } + mutex_unlock(&hw->rss_locks); + + return rss_hash; +} diff --git a/drivers/net/ethernet/intel/ice/ice_flow.h b/drivers/net/ethernet/intel/ice/ice_flow.h new file mode 100644 index 000000000000..d41fcfff3bd3 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_flow.h @@ -0,0 +1,572 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_FLOW_H_ +#define _ICE_FLOW_H_ + +#include "ice_flex_type.h" +#include "ice_acl.h" + +#define ICE_IPV4_MAKE_PREFIX_MASK(prefix) ((u32)(~0) << (32 - (prefix))) +#define ICE_FLOW_PROF_ID_INVAL 0xfffffffffffffffful +#define ICE_FLOW_PROF_ID_BYPASS 0 +#define ICE_FLOW_PROF_ID_DEFAULT 1 +#define ICE_FLOW_ENTRY_HANDLE_INVAL 0 +#define ICE_FLOW_VSI_INVAL 0xffff +#define ICE_FLOW_FLD_OFF_INVAL 0xffff + +/* Generate flow hash field from flow field type(s) */ +#define ICE_FLOW_HASH_ETH \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_DA) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_SA)) +#define ICE_FLOW_HASH_IPV4 \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA)) +#define ICE_FLOW_HASH_IPV6 \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_DA)) +#define ICE_FLOW_HASH_IPV6_PRE32 \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE32_SA) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE32_DA)) +#define ICE_FLOW_HASH_IPV6_PRE48 \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE48_SA) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE48_DA)) +#define ICE_FLOW_HASH_IPV6_PRE64 \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_SA) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_DA)) +#define ICE_FLOW_HASH_TCP_PORT \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_SRC_PORT) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_DST_PORT)) +#define ICE_FLOW_HASH_UDP_PORT \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_SRC_PORT) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_DST_PORT)) +#define ICE_FLOW_HASH_SCTP_PORT \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_DST_PORT)) + +#define ICE_HASH_INVALID 0 +#define ICE_HASH_TCP_IPV4 (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_TCP_PORT) +#define ICE_HASH_TCP_IPV6 (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_TCP_PORT) +#define ICE_HASH_UDP_IPV4 (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_UDP_PORT) +#define ICE_HASH_UDP_IPV6 (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_UDP_PORT) +#define ICE_HASH_SCTP_IPV4 (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_SCTP_PORT) +#define ICE_HASH_SCTP_IPV6 (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_SCTP_PORT) + +#define ICE_HASH_TCP_IPV6_PRE32 \ + (ICE_FLOW_HASH_IPV6_PRE32 | ICE_FLOW_HASH_TCP_PORT) +#define ICE_HASH_UDP_IPV6_PRE32 \ + (ICE_FLOW_HASH_IPV6_PRE32 | ICE_FLOW_HASH_UDP_PORT) +#define ICE_HASH_SCTP_IPV6_PRE32 \ + (ICE_FLOW_HASH_IPV6_PRE32 | ICE_FLOW_HASH_SCTP_PORT) +#define ICE_HASH_TCP_IPV6_PRE48 \ + (ICE_FLOW_HASH_IPV6_PRE48 | ICE_FLOW_HASH_TCP_PORT) +#define ICE_HASH_UDP_IPV6_PRE48 \ + (ICE_FLOW_HASH_IPV6_PRE48 | ICE_FLOW_HASH_UDP_PORT) +#define ICE_HASH_SCTP_IPV6_PRE48 \ + (ICE_FLOW_HASH_IPV6_PRE48 | ICE_FLOW_HASH_SCTP_PORT) +#define ICE_HASH_TCP_IPV6_PRE64 \ + 
(ICE_FLOW_HASH_IPV6_PRE64 | ICE_FLOW_HASH_TCP_PORT) +#define ICE_HASH_UDP_IPV6_PRE64 \ + (ICE_FLOW_HASH_IPV6_PRE64 | ICE_FLOW_HASH_UDP_PORT) +#define ICE_HASH_SCTP_IPV6_PRE64 \ + (ICE_FLOW_HASH_IPV6_PRE64 | ICE_FLOW_HASH_SCTP_PORT) + +#define ICE_FLOW_HASH_VXLAN_VNI \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_VXLAN_VNI)) + +#define ICE_FLOW_HASH_GTP_TEID \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_GTPC_TEID)) + +#define ICE_FLOW_HASH_GTP_IPV4_TEID \ + (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_GTP_TEID) +#define ICE_FLOW_HASH_GTP_IPV6_TEID \ + (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_GTP_TEID) + +#define ICE_FLOW_HASH_GTP_U_TEID \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_GTPU_IP_TEID)) + +#define ICE_FLOW_HASH_GTP_U_IPV4_TEID \ + (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_GTP_U_TEID) +#define ICE_FLOW_HASH_GTP_U_IPV6_TEID \ + (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_GTP_U_TEID) + +#define ICE_FLOW_HASH_GTP_U_EH_TEID \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_GTPU_EH_TEID)) + +#define ICE_FLOW_HASH_GTP_U_EH_QFI \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_GTPU_EH_QFI)) + +#define ICE_FLOW_HASH_GTP_U_IPV4_EH \ + (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_GTP_U_EH_TEID | \ + ICE_FLOW_HASH_GTP_U_EH_QFI) +#define ICE_FLOW_HASH_GTP_U_IPV6_EH \ + (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_GTP_U_EH_TEID | \ + ICE_FLOW_HASH_GTP_U_EH_QFI) + +#define ICE_FLOW_HASH_PPPOE_SESS_ID \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_PPPOE_SESS_ID)) + +#define ICE_FLOW_HASH_PPPOE_SESS_ID_ETH \ + (ICE_FLOW_HASH_ETH | ICE_FLOW_HASH_PPPOE_SESS_ID) +#define ICE_FLOW_HASH_PPPOE_TCP_ID \ + (ICE_FLOW_HASH_TCP_PORT | ICE_FLOW_HASH_PPPOE_SESS_ID) +#define ICE_FLOW_HASH_PPPOE_UDP_ID \ + (ICE_FLOW_HASH_UDP_PORT | ICE_FLOW_HASH_PPPOE_SESS_ID) + +#define ICE_FLOW_HASH_PFCP_SEID \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_PFCP_SEID)) +#define ICE_FLOW_HASH_PFCP_IPV4_SEID \ + (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_PFCP_SEID) +#define ICE_FLOW_HASH_PFCP_IPV6_SEID \ + (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_PFCP_SEID) + +#define ICE_FLOW_HASH_L2TPV3_SESS_ID \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_L2TPV3_SESS_ID)) +#define ICE_FLOW_HASH_L2TPV3_IPV4_SESS_ID \ + (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_L2TPV3_SESS_ID) +#define ICE_FLOW_HASH_L2TPV3_IPV6_SESS_ID \ + (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_L2TPV3_SESS_ID) + +#define ICE_FLOW_HASH_ESP_SPI \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_ESP_SPI)) +#define ICE_FLOW_HASH_ESP_IPV4_SPI \ + (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_ESP_SPI) +#define ICE_FLOW_HASH_ESP_IPV6_SPI \ + (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_ESP_SPI) + +#define ICE_FLOW_HASH_AH_SPI \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_AH_SPI)) +#define ICE_FLOW_HASH_AH_IPV4_SPI \ + (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_AH_SPI) +#define ICE_FLOW_HASH_AH_IPV6_SPI \ + (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_AH_SPI) + +#define ICE_FLOW_HASH_NAT_T_ESP_SPI \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_NAT_T_ESP_SPI)) +#define ICE_FLOW_HASH_NAT_T_ESP_IPV4_SPI \ + (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_NAT_T_ESP_SPI) +#define ICE_FLOW_HASH_NAT_T_ESP_IPV6_SPI \ + (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_NAT_T_ESP_SPI) + +/* Protocol header fields within a packet segment. A segment consists of one or + * more protocol headers that make up a logical group of protocol headers. Each + * logical group of protocol headers encapsulates or is encapsulated using/by + * tunneling or encapsulation protocols for network virtualization such as GRE, + * VxLAN, etc. 
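Stepping back to the ICE_FLOW_HASH_* and ICE_HASH_* masks defined above: they are plain ORs of BIT_ULL() over the ice_flow_field indices declared further down in this header, so a caller can assemble a custom hash-field set the same way. A minimal sketch, using only identifiers from this header (illustrative, not part of the patch):

/* Hash only on the IPv4 destination address and the TCP destination port;
 * this is a strict subset of ICE_HASH_TCP_IPV4.
 */
u64 custom_hash = BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) |
                  BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_DST_PORT);

/* Composite masks can be tested per field in the same way */
bool hashes_tcp_dport = (ICE_HASH_TCP_IPV4 &
                         BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_DST_PORT)) != 0;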
+ */ +enum ice_flow_seg_hdr { + ICE_FLOW_SEG_HDR_NONE = 0x00000000, + ICE_FLOW_SEG_HDR_ETH = 0x00000001, + ICE_FLOW_SEG_HDR_VLAN = 0x00000002, + ICE_FLOW_SEG_HDR_IPV4 = 0x00000004, + ICE_FLOW_SEG_HDR_IPV6 = 0x00000008, + ICE_FLOW_SEG_HDR_ARP = 0x00000010, + ICE_FLOW_SEG_HDR_ICMP = 0x00000020, + ICE_FLOW_SEG_HDR_TCP = 0x00000040, + ICE_FLOW_SEG_HDR_UDP = 0x00000080, + ICE_FLOW_SEG_HDR_SCTP = 0x00000100, + ICE_FLOW_SEG_HDR_GRE = 0x00000200, + ICE_FLOW_SEG_HDR_GTPC = 0x00000400, + ICE_FLOW_SEG_HDR_GTPC_TEID = 0x00000800, + ICE_FLOW_SEG_HDR_GTPU_IP = 0x00001000, + ICE_FLOW_SEG_HDR_GTPU_EH = 0x00002000, + ICE_FLOW_SEG_HDR_GTPU_DWN = 0x00004000, + ICE_FLOW_SEG_HDR_GTPU_UP = 0x00008000, + ICE_FLOW_SEG_HDR_PPPOE = 0x00010000, + ICE_FLOW_SEG_HDR_PFCP_NODE = 0x00020000, + ICE_FLOW_SEG_HDR_PFCP_SESSION = 0x00040000, + ICE_FLOW_SEG_HDR_L2TPV3 = 0x00080000, + ICE_FLOW_SEG_HDR_ESP = 0x00100000, + ICE_FLOW_SEG_HDR_AH = 0x00200000, + ICE_FLOW_SEG_HDR_NAT_T_ESP = 0x00400000, + ICE_FLOW_SEG_HDR_ETH_NON_IP = 0x00800000, + ICE_FLOW_SEG_HDR_GTPU_NON_IP = 0x01000000, + ICE_FLOW_SEG_HDR_VXLAN = 0x02000000, + ICE_FLOW_SEG_HDR_ECPRI_TP0 = 0x04000000, + ICE_FLOW_SEG_HDR_UDP_ECPRI_TP0 = 0x08000000, + ICE_FLOW_SEG_HDR_L2TPV2 = 0x10000000, + ICE_FLOW_SEG_HDR_PPP = 0x20000000, + /* The following is an additive bit for ICE_FLOW_SEG_HDR_IPV4 and + * ICE_FLOW_SEG_HDR_IPV6. + */ + ICE_FLOW_SEG_HDR_IPV_FRAG = 0x40000000, + ICE_FLOW_SEG_HDR_IPV_OTHER = 0x80000000, +}; + +/* These segements all have the same PTYPES, but are otherwise distinguished by + * the value of the gtp_eh_pdu and gtp_eh_pdu_link flags: + * + * gtp_eh_pdu gtp_eh_pdu_link + * ICE_FLOW_SEG_HDR_GTPU_IP 0 0 + * ICE_FLOW_SEG_HDR_GTPU_EH 1 don't care + * ICE_FLOW_SEG_HDR_GTPU_DWN 1 0 + * ICE_FLOW_SEG_HDR_GTPU_UP 1 1 + */ +#define ICE_FLOW_SEG_HDR_GTPU (ICE_FLOW_SEG_HDR_GTPU_IP | \ + ICE_FLOW_SEG_HDR_GTPU_EH | \ + ICE_FLOW_SEG_HDR_GTPU_DWN | \ + ICE_FLOW_SEG_HDR_GTPU_UP) +#define ICE_FLOW_SEG_HDR_PFCP (ICE_FLOW_SEG_HDR_PFCP_NODE | \ + ICE_FLOW_SEG_HDR_PFCP_SESSION) + +enum ice_flow_field { + /* L2 */ + ICE_FLOW_FIELD_IDX_ETH_DA, + ICE_FLOW_FIELD_IDX_ETH_SA, + ICE_FLOW_FIELD_IDX_S_VLAN, + ICE_FLOW_FIELD_IDX_C_VLAN, + ICE_FLOW_FIELD_IDX_ETH_TYPE, + /* L3 */ + ICE_FLOW_FIELD_IDX_IPV4_DSCP, + ICE_FLOW_FIELD_IDX_IPV6_DSCP, + ICE_FLOW_FIELD_IDX_IPV4_TTL, + ICE_FLOW_FIELD_IDX_IPV4_PROT, + ICE_FLOW_FIELD_IDX_IPV6_TTL, + ICE_FLOW_FIELD_IDX_IPV6_PROT, + ICE_FLOW_FIELD_IDX_IPV4_SA, + ICE_FLOW_FIELD_IDX_IPV4_DA, + ICE_FLOW_FIELD_IDX_IPV6_SA, + ICE_FLOW_FIELD_IDX_IPV6_DA, + ICE_FLOW_FIELD_IDX_IPV4_ID, + ICE_FLOW_FIELD_IDX_IPV6_ID, + ICE_FLOW_FIELD_IDX_IPV6_PRE32_SA, + ICE_FLOW_FIELD_IDX_IPV6_PRE32_DA, + ICE_FLOW_FIELD_IDX_IPV6_PRE48_SA, + ICE_FLOW_FIELD_IDX_IPV6_PRE48_DA, + ICE_FLOW_FIELD_IDX_IPV6_PRE64_SA, + ICE_FLOW_FIELD_IDX_IPV6_PRE64_DA, + /* L4 */ + ICE_FLOW_FIELD_IDX_TCP_SRC_PORT, + ICE_FLOW_FIELD_IDX_TCP_DST_PORT, + ICE_FLOW_FIELD_IDX_UDP_SRC_PORT, + ICE_FLOW_FIELD_IDX_UDP_DST_PORT, + ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT, + ICE_FLOW_FIELD_IDX_SCTP_DST_PORT, + ICE_FLOW_FIELD_IDX_TCP_FLAGS, + /* ARP */ + ICE_FLOW_FIELD_IDX_ARP_SIP, + ICE_FLOW_FIELD_IDX_ARP_DIP, + ICE_FLOW_FIELD_IDX_ARP_SHA, + ICE_FLOW_FIELD_IDX_ARP_DHA, + ICE_FLOW_FIELD_IDX_ARP_OP, + /* ICMP */ + ICE_FLOW_FIELD_IDX_ICMP_TYPE, + ICE_FLOW_FIELD_IDX_ICMP_CODE, + /* GRE */ + ICE_FLOW_FIELD_IDX_GRE_KEYID, + /* GTPC_TEID */ + ICE_FLOW_FIELD_IDX_GTPC_TEID, + /* GTPU_IP */ + ICE_FLOW_FIELD_IDX_GTPU_IP_TEID, + /* GTPU_EH */ + ICE_FLOW_FIELD_IDX_GTPU_EH_TEID, + ICE_FLOW_FIELD_IDX_GTPU_EH_QFI, + /* GTPU_UP */ + 
ICE_FLOW_FIELD_IDX_GTPU_UP_TEID, + /* GTPU_DWN */ + ICE_FLOW_FIELD_IDX_GTPU_DWN_TEID, + /* PPPOE */ + ICE_FLOW_FIELD_IDX_PPPOE_SESS_ID, + /* PFCP */ + ICE_FLOW_FIELD_IDX_PFCP_SEID, + /* L2TPV3 */ + ICE_FLOW_FIELD_IDX_L2TPV3_SESS_ID, + /* ESP */ + ICE_FLOW_FIELD_IDX_ESP_SPI, + /* AH */ + ICE_FLOW_FIELD_IDX_AH_SPI, + /* NAT_T ESP */ + ICE_FLOW_FIELD_IDX_NAT_T_ESP_SPI, + /* VXLAN VNI */ + ICE_FLOW_FIELD_IDX_VXLAN_VNI, + /* ECPRI_TP0 */ + ICE_FLOW_FIELD_IDX_ECPRI_TP0_PC_ID, + /* UDP_ECPRI_TP0 */ + ICE_FLOW_FIELD_IDX_UDP_ECPRI_TP0_PC_ID, + /* The total number of enums must not exceed 64 */ + ICE_FLOW_FIELD_IDX_MAX +}; + + +/* Flow headers and fields for AVF support */ +enum ice_flow_avf_hdr_field { + /* Values 0 - 28 are reserved for future use */ + ICE_AVF_FLOW_FIELD_INVALID = 0, + ICE_AVF_FLOW_FIELD_UNICAST_IPV4_UDP = 29, + ICE_AVF_FLOW_FIELD_MULTICAST_IPV4_UDP, + ICE_AVF_FLOW_FIELD_IPV4_UDP, + ICE_AVF_FLOW_FIELD_IPV4_TCP_SYN_NO_ACK, + ICE_AVF_FLOW_FIELD_IPV4_TCP, + ICE_AVF_FLOW_FIELD_IPV4_SCTP, + ICE_AVF_FLOW_FIELD_IPV4_OTHER, + ICE_AVF_FLOW_FIELD_FRAG_IPV4, + /* Values 37-38 are reserved */ + ICE_AVF_FLOW_FIELD_UNICAST_IPV6_UDP = 39, + ICE_AVF_FLOW_FIELD_MULTICAST_IPV6_UDP, + ICE_AVF_FLOW_FIELD_IPV6_UDP, + ICE_AVF_FLOW_FIELD_IPV6_TCP_SYN_NO_ACK, + ICE_AVF_FLOW_FIELD_IPV6_TCP, + ICE_AVF_FLOW_FIELD_IPV6_SCTP, + ICE_AVF_FLOW_FIELD_IPV6_OTHER, + ICE_AVF_FLOW_FIELD_FRAG_IPV6, + ICE_AVF_FLOW_FIELD_RSVD47, + ICE_AVF_FLOW_FIELD_FCOE_OX, + ICE_AVF_FLOW_FIELD_FCOE_RX, + ICE_AVF_FLOW_FIELD_FCOE_OTHER, + /* Values 51-62 are reserved */ + ICE_AVF_FLOW_FIELD_L2_PAYLOAD = 63, + ICE_AVF_FLOW_FIELD_MAX +}; + +/* Supported RSS offloads This macro is defined to support + * VIRTCHNL_OP_GET_RSS_HENA_CAPS ops. PF driver sends the RSS hardware + * capabilities to the caller of this ops. + */ +#define ICE_DEFAULT_RSS_HENA ( \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_SCTP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_OTHER) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_FRAG_IPV4) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_TCP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_SCTP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_OTHER) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_FRAG_IPV6) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP_SYN_NO_ACK) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_UNICAST_IPV4_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_MULTICAST_IPV4_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_TCP_SYN_NO_ACK) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_UNICAST_IPV6_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_MULTICAST_IPV6_UDP)) + +enum ice_rss_cfg_hdr_type { + ICE_RSS_OUTER_HEADERS, /* take outer headers as inputset. */ + ICE_RSS_INNER_HEADERS, /* take inner headers as inputset. */ + /* take inner headers as inputset for packet with outer ipv4. */ + ICE_RSS_INNER_HEADERS_W_OUTER_IPV4, + /* take inner headers as inputset for packet with outer ipv6. 
*/ + ICE_RSS_INNER_HEADERS_W_OUTER_IPV6, + /* take outer headers first then inner headers as inputset */ + ICE_RSS_ANY_HEADERS +}; + +struct ice_rss_hash_cfg { + u32 addl_hdrs; /* protocol header fields */ + u64 hash_flds; /* hash bit field (ICE_FLOW_HASH_*) to configure */ + enum ice_rss_cfg_hdr_type hdr_type; /* to specify inner or outer */ + bool symm; /* symmetric or asymmetric hash */ +}; + +enum ice_flow_dir { + ICE_FLOW_DIR_UNDEFINED = 0, + ICE_FLOW_TX = 0x01, + ICE_FLOW_RX = 0x02, + ICE_FLOW_TX_RX = ICE_FLOW_RX | ICE_FLOW_TX +}; + +enum ice_flow_priority { + ICE_FLOW_PRIO_LOW, + ICE_FLOW_PRIO_NORMAL, + ICE_FLOW_PRIO_HIGH +}; + +#define ICE_FLOW_SEG_SINGLE 1 +#define ICE_FLOW_SEG_MAX 2 +#define ICE_FLOW_SEG_RAW_FLD_MAX 2 +#define ICE_FLOW_PROFILE_MAX 1024 +#define ICE_FLOW_SW_FIELD_VECTOR_MAX 48 +#define ICE_FLOW_ACL_FIELD_VECTOR_MAX 32 +#define ICE_FLOW_FV_EXTRACT_SZ 2 + +#define ICE_FLOW_SET_HDRS(seg, val) ((seg)->hdrs |= (u32)(val)) + +struct ice_flow_seg_xtrct { + u8 prot_id; /* Protocol ID of extracted header field */ + u16 off; /* Starting offset of the field in header in bytes */ + u8 idx; /* Index of FV entry used */ + u8 disp; /* Displacement of field in bits fr. FV entry's start */ + u16 mask; /* Mask for field */ +}; + + +enum ice_flow_fld_match_type { + ICE_FLOW_FLD_TYPE_REG, /* Value, mask */ + ICE_FLOW_FLD_TYPE_RANGE, /* Value, mask, last (upper bound) */ + ICE_FLOW_FLD_TYPE_PREFIX, /* IP address, prefix, size of prefix */ + ICE_FLOW_FLD_TYPE_SIZE, /* Value, mask, size of match */ +}; + +struct ice_flow_fld_loc { + /* Describe offsets of field information relative to the beginning of + * input buffer provided when adding flow entries. + */ + u16 val; /* Offset where the value is located */ + u16 mask; /* Offset where the mask/prefix value is located */ + u16 last; /* Length or offset where the upper value is located */ +}; + +struct ice_flow_fld_info { + enum ice_flow_fld_match_type type; + /* Location where to retrieve data from an input buffer */ + struct ice_flow_fld_loc src; + /* Location where to put the data into the final entry buffer */ + struct ice_flow_fld_loc entry; + struct ice_flow_seg_xtrct xtrct; +}; + +struct ice_flow_seg_fld_raw { + struct ice_flow_fld_info info; + u16 off; /* Offset from the start of the segment */ +}; + +struct ice_flow_seg_info { + u32 hdrs; /* Bitmask indicating protocol headers present */ + u64 match; /* Bitmask indicating header fields to be matched */ + u64 range; /* Bitmask indicating header fields matched as ranges */ + + struct ice_flow_fld_info fields[ICE_FLOW_FIELD_IDX_MAX]; + + u8 raws_cnt; /* Number of raw fields to be matched */ + struct ice_flow_seg_fld_raw raws[ICE_FLOW_SEG_RAW_FLD_MAX]; +}; + +/* This structure describes a flow entry, and is tracked only in this file */ +struct ice_flow_entry { + struct list_head l_entry; + + u64 id; + struct ice_flow_prof *prof; + /* Action list */ + struct ice_flow_action *acts; + /* Flow entry's content */ + void *entry; + /* Range buffer (For ACL only) */ + struct ice_aqc_acl_profile_ranges *range_buf; + enum ice_flow_priority priority; + u16 vsi_handle; + u16 entry_sz; + /* Entry index in the ACL's scenario */ + u16 scen_entry_idx; +#define ICE_FLOW_ACL_MAX_NUM_ACT 2 + u8 acts_cnt; +}; + +#define ICE_FLOW_ENTRY_HNDL(e) ((u64)e) +#define ICE_FLOW_ENTRY_PTR(h) ((struct ice_flow_entry *)(h)) + +struct ice_flow_prof { + struct list_head l_entry; + + u64 id; + enum ice_flow_dir dir; + u8 segs_cnt; + u8 acts_cnt; + + /* Keep track of flow entries associated with this flow profile */ + 
struct mutex entries_lock; + struct list_head entries; + + struct ice_flow_seg_info segs[ICE_FLOW_SEG_MAX]; + + /* software VSI handles referenced by this flow profile */ + DECLARE_BITMAP(vsis, ICE_MAX_VSI); + + union { + /* struct sw_recipe */ + struct ice_acl_scen *scen; + /* struct fd */ + u32 data; + bool symm; /* Symmetric Hash for RSS */ + } cfg; + + /* Default actions */ + struct ice_flow_action *acts; +}; + +struct ice_rss_cfg { + struct list_head l_entry; + /* bitmap of VSIs added to the RSS entry */ + DECLARE_BITMAP(vsis, ICE_MAX_VSI); + struct ice_rss_hash_cfg hash; +}; + +enum ice_flow_action_type { + ICE_FLOW_ACT_NOP, + ICE_FLOW_ACT_ALLOW, + ICE_FLOW_ACT_DROP, + ICE_FLOW_ACT_CNTR_PKT, + ICE_FLOW_ACT_FWD_VSI, + ICE_FLOW_ACT_FWD_VSI_LIST, /* Should be abstracted away */ + ICE_FLOW_ACT_FWD_QUEUE, /* Can Queues be abstracted away? */ + ICE_FLOW_ACT_FWD_QUEUE_GROUP, /* Can Queues be abstracted away? */ + ICE_FLOW_ACT_PUSH, + ICE_FLOW_ACT_POP, + ICE_FLOW_ACT_MODIFY, + ICE_FLOW_ACT_CNTR_BYTES, + ICE_FLOW_ACT_CNTR_PKT_BYTES, + ICE_FLOW_ACT_GENERIC_0, + ICE_FLOW_ACT_GENERIC_1, + ICE_FLOW_ACT_GENERIC_2, + ICE_FLOW_ACT_GENERIC_3, + ICE_FLOW_ACT_GENERIC_4, + ICE_FLOW_ACT_RPT_FLOW_ID, + ICE_FLOW_ACT_BUILD_PROF_IDX, +}; + +struct ice_flow_action { + enum ice_flow_action_type type; + union { + struct ice_acl_act_entry acl_act; + u32 dummy; + } data; +}; + +u64 +ice_flow_find_prof(struct ice_hw *hw, enum ice_block blk, enum ice_flow_dir dir, + struct ice_flow_seg_info *segs, u8 segs_cnt); +enum ice_status +ice_flow_add_prof(struct ice_hw *hw, enum ice_block blk, enum ice_flow_dir dir, + u64 prof_id, struct ice_flow_seg_info *segs, u8 segs_cnt, + struct ice_flow_action *acts, u8 acts_cnt, + struct ice_flow_prof **prof); +enum ice_status +ice_flow_rem_prof(struct ice_hw *hw, enum ice_block blk, u64 prof_id); +enum ice_status +ice_flow_assoc_vsig_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi_handle, + u16 vsig); +enum ice_status +ice_flow_get_hw_prof(struct ice_hw *hw, enum ice_block blk, u64 prof_id, + u8 *hw_prof); +u64 ice_flow_find_entry(struct ice_hw *hw, enum ice_block blk, u64 entry_id); +enum ice_status +ice_flow_add_entry(struct ice_hw *hw, enum ice_block blk, u64 prof_id, + u64 entry_id, u16 vsi, enum ice_flow_priority prio, + void *data, struct ice_flow_action *acts, u8 acts_cnt, + u64 *entry_h); +enum ice_status +ice_flow_rem_entry(struct ice_hw *hw, enum ice_block blk, u64 entry_h); +void +ice_flow_set_fld(struct ice_flow_seg_info *seg, enum ice_flow_field fld, + u16 val_loc, u16 mask_loc, u16 last_loc, bool range); +void +ice_flow_set_fld_prefix(struct ice_flow_seg_info *seg, enum ice_flow_field fld, + u16 val_loc, u16 prefix_loc, u8 prefix_sz); +void +ice_flow_add_fld_raw(struct ice_flow_seg_info *seg, u16 off, u8 len, + u16 val_loc, u16 mask_loc); +enum ice_status ice_flow_rem_vsi_prof(struct ice_hw *hw, enum ice_block blk, + u16 vsi_handle, u64 prof_id); +void ice_rem_vsi_rss_list(struct ice_hw *hw, u16 vsi_handle); +enum ice_status ice_replay_rss_cfg(struct ice_hw *hw, u16 vsi_handle); +enum ice_status +ice_add_avf_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u64 hashed_flds); +enum ice_status ice_rem_vsi_rss_cfg(struct ice_hw *hw, u16 vsi_handle); +enum ice_status +ice_add_rss_cfg(struct ice_hw *hw, u16 vsi_handle, + const struct ice_rss_hash_cfg *cfg); +enum ice_status +ice_rem_rss_cfg(struct ice_hw *hw, u16 vsi_handle, + const struct ice_rss_hash_cfg *cfg); +u64 ice_get_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u32 hdrs); +#endif /* _ICE_FLOW_H_ */ diff --git 
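Before moving on to the next file, a minimal sketch of how a caller might program an RSS rule through the API declared above (hypothetical caller, error handling trimmed, not part of the patch):

/* Hash TCP-over-IPv4 traffic on the outer IPv4 addresses and TCP ports */
static enum ice_status example_add_tcp4_rss(struct ice_hw *hw, u16 vsi_handle)
{
        struct ice_rss_hash_cfg cfg = {
                .addl_hdrs = ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_TCP,
                .hash_flds = ICE_HASH_TCP_IPV4,
                .hdr_type = ICE_RSS_OUTER_HEADERS,
                .symm = false,
        };

        return ice_add_rss_cfg(hw, vsi_handle, &cfg);
}

The AVF path shown earlier (ice_add_avf_rss_cfg) builds the same kind of configuration internally, but with ICE_FLOW_SEG_HDR_NONE and ICE_RSS_ANY_HEADERS, and it always folds the L3 address fields into any L4 selection.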
a/drivers/net/ethernet/intel/ice/ice_fltr.c b/drivers/net/ethernet/intel/ice/ice_fltr.c new file mode 100644 index 000000000000..3acf5656479f --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_fltr.c @@ -0,0 +1,767 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice.h" +#include "ice_fltr.h" + +/** + * ice_fltr_free_list - free filter lists helper + * @dev: pointer to the device struct + * @h: pointer to the list head to be freed + * + * Helper function to free filter lists previously created using + * ice_fltr_add_mac_to_list + */ +void ice_fltr_free_list(struct device *dev, struct list_head *h) +{ + struct ice_fltr_list_entry *e, *tmp; + + list_for_each_entry_safe(e, tmp, h, list_entry) { + list_del(&e->list_entry); + devm_kfree(dev, e); + } +} + +/** + * ice_fltr_add_entry_to_list - allocate and add filter entry to list + * @dev: pointer to device needed by alloc function + * @info: filter info struct that gets added to the passed in list + * @list: pointer to the list which contains MAC filters entry + */ +static int +ice_fltr_add_entry_to_list(struct device *dev, struct ice_fltr_info *info, + struct list_head *list) +{ + struct ice_fltr_list_entry *entry; + + entry = devm_kzalloc(dev, sizeof(*entry), GFP_ATOMIC); + if (!entry) + return -ENOMEM; + + entry->fltr_info = *info; + + INIT_LIST_HEAD(&entry->list_entry); + list_add(&entry->list_entry, list); + + return 0; +} + +/** + * ice_fltr_set_vlan_vsi_promisc + * @hw: pointer to the hardware structure + * @vsi: the VSI being configured + * @promisc_mask: mask of promiscuous config bits + * + * Set VSI with all associated VLANs to given promiscuous mode(s) + */ +enum ice_status +ice_fltr_set_vlan_vsi_promisc(struct ice_hw *hw, struct ice_vsi *vsi, u8 promisc_mask) +{ + return ice_set_vlan_vsi_promisc(hw, vsi->idx, promisc_mask, false); +} + +/** + * ice_fltr_clear_vlan_vsi_promisc + * @hw: pointer to the hardware structure + * @vsi: the VSI being configured + * @promisc_mask: mask of promiscuous config bits + * + * Clear VSI with all associated VLANs to given promiscuous mode(s) + */ +enum ice_status +ice_fltr_clear_vlan_vsi_promisc(struct ice_hw *hw, struct ice_vsi *vsi, u8 promisc_mask) +{ + return ice_set_vlan_vsi_promisc(hw, vsi->idx, promisc_mask, true); +} + +/** + * ice_fltr_clear_vsi_promisc - clear specified promiscuous mode(s) + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to clear mode + * @promisc_mask: mask of promiscuous config bits to clear + * @vid: VLAN ID to clear VLAN promiscuous + * @lport: logical port number to clear mode + */ +enum ice_status +ice_fltr_clear_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, + u16 vid, u8 lport) +{ + return ice_clear_vsi_promisc(hw, vsi_handle, promisc_mask, vid); +} + +/** + * ice_fltr_set_vsi_promisc - set given VSI to given promiscuous mode(s) + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to configure + * @promisc_mask: mask of promiscuous config bits + * @vid: VLAN ID to set VLAN promiscuous + * @lport: logical port number to set promiscuous mode + */ +enum ice_status +ice_fltr_set_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, + u16 vid, u8 lport) +{ + return ice_set_vsi_promisc(hw, vsi_handle, promisc_mask, vid); +} + +/** + * ice_fltr_add_mac_list - add list of MAC filters + * @vsi: pointer to VSI struct + * @list: list of filters + */ +enum ice_status +ice_fltr_add_mac_list(struct ice_vsi *vsi, struct list_head *list) +{ + return 
ice_add_mac(&vsi->back->hw, list); +} + +/** + * ice_fltr_remove_mac_list - remove list of MAC filters + * @vsi: pointer to VSI struct + * @list: list of filters + */ +enum ice_status +ice_fltr_remove_mac_list(struct ice_vsi *vsi, struct list_head *list) +{ + return ice_remove_mac(&vsi->back->hw, list); +} + +/** + * ice_fltr_add_vlan_list - add list of VLAN filters + * @vsi: pointer to VSI struct + * @list: list of filters + */ +static enum ice_status +ice_fltr_add_vlan_list(struct ice_vsi *vsi, struct list_head *list) +{ + return ice_add_vlan(&vsi->back->hw, list); +} + +/** + * ice_fltr_remove_vlan_list - remove list of VLAN filters + * @vsi: pointer to VSI struct + * @list: list of filters + */ +static enum ice_status +ice_fltr_remove_vlan_list(struct ice_vsi *vsi, struct list_head *list) +{ + return ice_remove_vlan(&vsi->back->hw, list); +} + +/** + * ice_fltr_add_mac_vlan_list - add list of MAC VLAN filters + * @vsi: pointer to VSI struct + * @list: list of filters + */ +static enum ice_status +ice_fltr_add_mac_vlan_list(struct ice_vsi *vsi, struct list_head *list) +{ + return ice_add_mac_vlan(&vsi->back->hw, list); +} + +/** + * ice_fltr_remove_mac_vlan_list - remove list of MAC VLAN filters + * @vsi: pointer to VSI struct + * @list: list of filters + */ +static enum ice_status +ice_fltr_remove_mac_vlan_list(struct ice_vsi *vsi, struct list_head *list) +{ + return ice_remove_mac_vlan(&vsi->back->hw, list); +} + +/** + * ice_fltr_add_eth_list - add list of ethertype filters + * @vsi: pointer to VSI struct + * @list: list of filters + */ +static enum ice_status +ice_fltr_add_eth_list(struct ice_vsi *vsi, struct list_head *list) +{ + return ice_add_eth_mac(&vsi->back->hw, list); +} + +/** + * ice_fltr_remove_eth_list - remove list of ethertype filters + * @vsi: pointer to VSI struct + * @list: list of filters + */ +static enum ice_status +ice_fltr_remove_eth_list(struct ice_vsi *vsi, struct list_head *list) +{ + return ice_remove_eth_mac(&vsi->back->hw, list); +} + +/** + * ice_fltr_remove_all - remove all filters associated with VSI + * @vsi: pointer to VSI struct + */ +void ice_fltr_remove_all(struct ice_vsi *vsi) +{ + ice_remove_vsi_fltr(&vsi->back->hw, vsi->idx); +} + +/** + * ice_fltr_add_mac_to_list - add MAC filter info to exsisting list + * @vsi: pointer to VSI struct + * @list: list to add filter info to + * @mac: MAC address to add + * @action: filter action + */ +int +ice_fltr_add_mac_to_list(struct ice_vsi *vsi, struct list_head *list, + const u8 *mac, enum ice_sw_fwd_act_type action) +{ + struct ice_fltr_info info = { 0 }; + + info.flag = ICE_FLTR_TX; + info.src_id = ICE_SRC_ID_VSI; + info.lkup_type = ICE_SW_LKUP_MAC; + info.fltr_act = action; + info.vsi_handle = vsi->idx; + + ether_addr_copy(info.l_data.mac.mac_addr, mac); + + return ice_fltr_add_entry_to_list(ice_pf_to_dev(vsi->back), &info, + list); +} + +/** + * ice_fltr_add_vlan_to_list - add VLAN filter info to exsisting list + * @vsi: pointer to VSI struct + * @list: list to add filter info to + * @vlan: VLAN filter details + */ +static int +ice_fltr_add_vlan_to_list(struct ice_vsi *vsi, struct list_head *list, + struct ice_vlan *vlan) +{ + struct ice_fltr_info info = { 0 }; + + info.flag = ICE_FLTR_TX; + info.src_id = ICE_SRC_ID_VSI; + info.lkup_type = ICE_SW_LKUP_VLAN; + info.fltr_act = vlan->fwd_act; + info.vsi_handle = vsi->idx; + info.l_data.vlan.vlan_id = vlan->vid; + info.l_data.vlan.tpid = vlan->tpid; + info.l_data.vlan.tpid_valid = true; + + return ice_fltr_add_entry_to_list(ice_pf_to_dev(vsi->back), &info, 
+ list); +} + +/** + * ice_fltr_add_mac_vlan_to_list - add MAC VLAN filter info to + * exsisting list + * @vsi: pointer to VSI struct + * @list: list to add filter info to + * @mac: MAC addr to add + * @vlan_id: VLAN ID to add + * @action: filter action + */ +static int +ice_fltr_add_mac_vlan_to_list(struct ice_vsi *vsi, struct list_head *list, + const u8 *mac, u16 vlan_id, + enum ice_sw_fwd_act_type action) +{ + struct ice_fltr_info info = { 0 }; + + if (!is_valid_ether_addr(mac) || + is_broadcast_ether_addr(mac) || !vlan_id) + return -EINVAL; + + info.flag = ICE_FLTR_TX_RX; + info.lkup_type = ICE_SW_LKUP_MAC_VLAN; + info.fltr_act = action; + info.vsi_handle = vsi->idx; + info.src = vsi->vsi_num; + + info.l_data.mac_vlan.vlan_id = vlan_id; + ether_addr_copy(info.l_data.mac_vlan.mac_addr, mac); + + return ice_fltr_add_entry_to_list(ice_pf_to_dev(vsi->back), &info, + list); +} + +/** + * ice_fltr_add_eth_to_list - add ethertype filter info to exsisting list + * @vsi: pointer to VSI struct + * @list: list to add filter info to + * @ethertype: ethertype of packet that matches filter + * @flag: filter direction, Tx or Rx + * @action: filter action + */ +static int +ice_fltr_add_eth_to_list(struct ice_vsi *vsi, struct list_head *list, + u16 ethertype, u16 flag, + enum ice_sw_fwd_act_type action) +{ + struct ice_fltr_info info = { 0 }; + + info.flag = flag; + info.lkup_type = ICE_SW_LKUP_ETHERTYPE; + info.fltr_act = action; + info.vsi_handle = vsi->idx; + info.l_data.ethertype_mac.ethertype = ethertype; + + if (flag == ICE_FLTR_TX) + info.src_id = ICE_SRC_ID_VSI; + else + info.src_id = ICE_SRC_ID_LPORT; + + return ice_fltr_add_entry_to_list(ice_pf_to_dev(vsi->back), &info, + list); +} + +/** + * ice_fltr_prepare_mac - add or remove MAC rule + * @vsi: pointer to VSI struct + * @mac: MAC address to add + * @action: action to be performed on filter match + * @mac_action: pointer to add or remove MAC function + */ +static enum ice_status +ice_fltr_prepare_mac(struct ice_vsi *vsi, const u8 *mac, + enum ice_sw_fwd_act_type action, + enum ice_status (*mac_action)(struct ice_vsi *, + struct list_head *)) +{ + enum ice_status result; + LIST_HEAD(tmp_list); + + if (ice_fltr_add_mac_to_list(vsi, &tmp_list, mac, action)) { + ice_fltr_free_list(ice_pf_to_dev(vsi->back), &tmp_list); + return ICE_ERR_NO_MEMORY; + } + + result = mac_action(vsi, &tmp_list); + ice_fltr_free_list(ice_pf_to_dev(vsi->back), &tmp_list); + return result; +} + +/** + * ice_fltr_prepare_mac_and_broadcast - add or remove MAC and broadcast filter + * @vsi: pointer to VSI struct + * @mac: MAC address to add + * @action: action to be performed on filter match + * @mac_action: pointer to add or remove MAC function + */ +static enum ice_status +ice_fltr_prepare_mac_and_broadcast(struct ice_vsi *vsi, const u8 *mac, + enum ice_sw_fwd_act_type action, + enum ice_status(*mac_action) + (struct ice_vsi *, struct list_head *)) +{ + u8 broadcast[ETH_ALEN]; + enum ice_status result; + LIST_HEAD(tmp_list); + + eth_broadcast_addr(broadcast); + if (ice_fltr_add_mac_to_list(vsi, &tmp_list, mac, action) || + ice_fltr_add_mac_to_list(vsi, &tmp_list, broadcast, action)) { + ice_fltr_free_list(ice_pf_to_dev(vsi->back), &tmp_list); + return ICE_ERR_NO_MEMORY; + } + + result = mac_action(vsi, &tmp_list); + ice_fltr_free_list(ice_pf_to_dev(vsi->back), &tmp_list); + return result; +} + +/** + * ice_fltr_prepare_vlan - add or remove VLAN filter + * @vsi: pointer to VSI struct + * @vlan: VLAN filter details + * @vlan_action: pointer to add or remove VLAN function + 
*/ +static enum ice_status +ice_fltr_prepare_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan, + enum ice_status (*vlan_action)(struct ice_vsi *, + struct list_head *)) +{ + enum ice_status result; + LIST_HEAD(tmp_list); + + if (ice_fltr_add_vlan_to_list(vsi, &tmp_list, vlan)) + return ICE_ERR_NO_MEMORY; + + result = vlan_action(vsi, &tmp_list); + ice_fltr_free_list(ice_pf_to_dev(vsi->back), &tmp_list); + return result; +} + +/** + * ice_fltr_prepare_mac_vlan - add or remove MAC VLAN filter + * @vsi: pointer to VSI struct + * @mac: MAC address to add + * @vlan_id: VLAN ID to add + * @action: action to be performed on filter match + * @mac_vlan_action: pointer to add or remove MAC VLAN function + */ +static enum ice_status +ice_fltr_prepare_mac_vlan(struct ice_vsi *vsi, const u8 *mac, u16 vlan_id, + enum ice_sw_fwd_act_type action, + enum ice_status (mac_vlan_action)(struct ice_vsi *, + struct list_head *)) +{ + enum ice_status result; + LIST_HEAD(tmp_list); + + if (ice_fltr_add_mac_vlan_to_list(vsi, &tmp_list, mac, vlan_id, + action)) + return ICE_ERR_NO_MEMORY; + + result = mac_vlan_action(vsi, &tmp_list); + ice_fltr_free_list(ice_pf_to_dev(vsi->back), &tmp_list); + return result; +} + +/** + * ice_fltr_prepare_eth - add or remove ethertype filter + * @vsi: pointer to VSI struct + * @ethertype: ethertype of packet to be filtered + * @flag: direction of packet, Tx or Rx + * @action: action to be performed on filter match + * @eth_action: pointer to add or remove ethertype function + */ +static enum ice_status +ice_fltr_prepare_eth(struct ice_vsi *vsi, u16 ethertype, u16 flag, + enum ice_sw_fwd_act_type action, + enum ice_status (*eth_action)(struct ice_vsi *, + struct list_head *)) +{ + enum ice_status result; + LIST_HEAD(tmp_list); + + if (ice_fltr_add_eth_to_list(vsi, &tmp_list, ethertype, flag, action)) + return ICE_ERR_NO_MEMORY; + + result = eth_action(vsi, &tmp_list); + ice_fltr_free_list(ice_pf_to_dev(vsi->back), &tmp_list); + return result; +} + +/** + * ice_fltr_add_mac - add single MAC filter + * @vsi: pointer to VSI struct + * @mac: MAC to add + * @action: action to be performed on filter match + */ +enum ice_status ice_fltr_add_mac(struct ice_vsi *vsi, const u8 *mac, + enum ice_sw_fwd_act_type action) +{ + return ice_fltr_prepare_mac(vsi, mac, action, ice_fltr_add_mac_list); +} + +/** + * ice_fltr_add_mac_and_broadcast - add single MAC and broadcast + * @vsi: pointer to VSI struct + * @mac: MAC to add + * @action: action to be performed on filter match + */ +enum ice_status +ice_fltr_add_mac_and_broadcast(struct ice_vsi *vsi, const u8 *mac, + enum ice_sw_fwd_act_type action) +{ + return ice_fltr_prepare_mac_and_broadcast(vsi, mac, action, + ice_fltr_add_mac_list); +} + +/** + * ice_fltr_remove_mac - remove MAC filter + * @vsi: pointer to VSI struct + * @mac: filter MAC to remove + * @action: action to remove + */ +enum ice_status ice_fltr_remove_mac(struct ice_vsi *vsi, const u8 *mac, + enum ice_sw_fwd_act_type action) +{ + return ice_fltr_prepare_mac(vsi, mac, action, ice_fltr_remove_mac_list); +} + +/** + * ice_fltr_add_vlan - add single VLAN filter + * @vsi: pointer to VSI struct + * @vlan: VLAN filter details + */ +enum ice_status ice_fltr_add_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + return ice_fltr_prepare_vlan(vsi, vlan, ice_fltr_add_vlan_list); +} + +/** + * ice_fltr_remove_vlan - remove VLAN filter + * @vsi: pointer to VSI struct + * @vlan: VLAN filter details + */ +enum ice_status ice_fltr_remove_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + return 
ice_fltr_prepare_vlan(vsi, vlan, ice_fltr_remove_vlan_list); +} + +/** + * ice_fltr_add_mac_vlan - add single MAC VLAN filter + * @vsi: pointer to VSI struct + * @mac: MAC address to add + * @vlan_id: VLAN ID to add + * @action: action to be performed on filter match + */ +enum ice_status +ice_fltr_add_mac_vlan(struct ice_vsi *vsi, const u8 *mac, u16 vlan_id, + enum ice_sw_fwd_act_type action) +{ + return ice_fltr_prepare_mac_vlan(vsi, mac, vlan_id, action, + ice_fltr_add_mac_vlan_list); +} + +/** + * ice_fltr_remove_mac_vlan - remove MAC VLAN filter + * @vsi: pointer to VSI struct + * @mac: MAC address to add + * @vlan_id: filter MAC VLAN to remove + * @action: action to remove + */ +enum ice_status +ice_fltr_remove_mac_vlan(struct ice_vsi *vsi, const u8 *mac, u16 vlan_id, + enum ice_sw_fwd_act_type action) +{ + return ice_fltr_prepare_mac_vlan(vsi, mac, vlan_id, action, + ice_fltr_remove_mac_vlan_list); +} + +/** + * ice_fltr_add_eth - add specyfic ethertype filter + * @vsi: pointer to VSI struct + * @ethertype: ethertype of filter + * @flag: direction of packet to be filtered, Tx or Rx + * @action: action to be performed on filter match + */ +enum ice_status ice_fltr_add_eth(struct ice_vsi *vsi, u16 ethertype, u16 flag, + enum ice_sw_fwd_act_type action) +{ + return ice_fltr_prepare_eth(vsi, ethertype, flag, action, + ice_fltr_add_eth_list); +} + +/** + * ice_fltr_remove_eth - remove ethertype filter + * @vsi: pointer to VSI struct + * @ethertype: ethertype of filter + * @flag: direction of filter + * @action: action to remove + */ +enum ice_status ice_fltr_remove_eth(struct ice_vsi *vsi, u16 ethertype, + u16 flag, enum ice_sw_fwd_act_type action) +{ + return ice_fltr_prepare_eth(vsi, ethertype, flag, action, + ice_fltr_remove_eth_list); +} + +/** + * ice_fltr_update_rule_flags - update lan_en/lb_en flags + * @hw: pointer to hw + * @rule_id: id of rule being updated + * @recipe_id: recipe id of rule + * @act: current action field + * @type: Rx or Tx + * @src: source VSI + * @new_flags: combinations of lb_en and lan_en + */ +static enum ice_status +ice_fltr_update_rule_flags(struct ice_hw *hw, u16 rule_id, u16 recipe_id, + u32 act, u16 type, u16 src, u32 new_flags) +{ + struct ice_aqc_sw_rules_elem *s_rule; + enum ice_status err; + u32 flags_mask; + + s_rule = kzalloc(ICE_SW_RULE_RX_TX_NO_HDR_SIZE, GFP_KERNEL); + if (!s_rule) + return ICE_ERR_NO_MEMORY; + + flags_mask = ICE_SINGLE_ACT_LB_ENABLE | ICE_SINGLE_ACT_LAN_ENABLE; + act &= ~flags_mask; + act |= (flags_mask & new_flags); + + s_rule->pdata.lkup_tx_rx.recipe_id = cpu_to_le16(recipe_id); + s_rule->pdata.lkup_tx_rx.index = cpu_to_le16(rule_id); + s_rule->pdata.lkup_tx_rx.act = cpu_to_le32(act); + + if (type & ICE_FLTR_RX) { + s_rule->pdata.lkup_tx_rx.src = + cpu_to_le16(hw->port_info->lport); + s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_RX); + + } else { + s_rule->pdata.lkup_tx_rx.src = cpu_to_le16(src); + s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_TX); + } + + err = ice_aq_sw_rules(hw, s_rule, ICE_SW_RULE_RX_TX_NO_HDR_SIZE, 1, + ice_aqc_opc_update_sw_rules, NULL); + + kfree(s_rule); + return err; +} + +/** + * ice_fltr_build_action - build action for rule + * @vsi_id: id of VSI which is use to build action + */ +static u32 +ice_fltr_build_action(u16 vsi_id) +{ + return ((vsi_id << ICE_SINGLE_ACT_VSI_ID_S) & ICE_SINGLE_ACT_VSI_ID_M) | + ICE_SINGLE_ACT_VSI_FORWARDING | ICE_SINGLE_ACT_VALID_BIT; +} + +/** + * ice_fltr_find_adv_entry - find advanced rule + * @rules: list of rules + * @rule_id: id of wanted rule + */ 
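As an aside on the wrappers earlier in this file: every single-filter helper funnels through the same temporary-list pattern (build one list entry, run the add or remove action, free the list). A minimal usage sketch from a hypothetical caller (not part of the patch):

/* Forward frames with this unicast MAC to the VSI, then remove the rule */
static int example_toggle_mac(struct ice_vsi *vsi, const u8 *mac)
{
        enum ice_status status;

        status = ice_fltr_add_mac(vsi, mac, ICE_FWD_TO_VSI);
        if (status)
                return -EIO;

        status = ice_fltr_remove_mac(vsi, mac, ICE_FWD_TO_VSI);
        return status ? -EIO : 0;
}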
+static struct ice_adv_fltr_mgmt_list_entry * +ice_fltr_find_adv_entry(struct list_head *rules, u16 rule_id) +{ + struct ice_adv_fltr_mgmt_list_entry *entry; + + list_for_each_entry(entry, rules, list_entry) { + if (entry->rule_info.fltr_rule_id == rule_id) + return entry; + } + + return NULL; +} + +/** + * ice_fltr_update_adv_rule_flags - update flags on advanced rule + * @vsi: pointer to VSI + * @recipe_id: id of recipe + * @entry: advanced rule entry + * @new_flags: flags to update + */ +static enum ice_status +ice_fltr_update_adv_rule_flags(struct ice_vsi *vsi, u16 recipe_id, + struct ice_adv_fltr_mgmt_list_entry *entry, + u32 new_flags) +{ + struct ice_adv_rule_info *info = &entry->rule_info; + struct ice_sw_act_ctrl *act = &info->sw_act; + u32 action; + + if (act->fltr_act != ICE_FWD_TO_VSI) + return ICE_ERR_NOT_SUPPORTED; + + action = ice_fltr_build_action(act->fwd_id.hw_vsi_id); + + return ice_fltr_update_rule_flags(&vsi->back->hw, info->fltr_rule_id, + recipe_id, action, info->sw_act.flag, + act->src, new_flags); +} + +/** + * ice_fltr_find_regular_entry - find regular rule + * @rules: list of rules + * @rule_id: id of wanted rule + */ +static struct ice_fltr_mgmt_list_entry * +ice_fltr_find_regular_entry(struct list_head *rules, u16 rule_id) +{ + struct ice_fltr_mgmt_list_entry *entry; + + list_for_each_entry(entry, rules, list_entry) { + if (entry->fltr_info.fltr_rule_id == rule_id) + return entry; + } + + return NULL; +} + +/** + * ice_fltr_update_regular_rule - update flags on regular rule + * @vsi: pointer to VSI + * @recipe_id: id of recipe + * @entry: regular rule entry + * @new_flags: flags to update + */ +static enum ice_status +ice_fltr_update_regular_rule(struct ice_vsi *vsi, u16 recipe_id, + struct ice_fltr_mgmt_list_entry *entry, + u32 new_flags) +{ + struct ice_fltr_info *info = &entry->fltr_info; + u32 action; + + if (info->fltr_act != ICE_FWD_TO_VSI) + return ICE_ERR_NOT_SUPPORTED; + + action = ice_fltr_build_action(info->fwd_id.hw_vsi_id); + + return ice_fltr_update_rule_flags(&vsi->back->hw, info->fltr_rule_id, + recipe_id, action, info->flag, + info->src, new_flags); +} + +/** + * ice_fltr_update_flags - update flags on rule + * @vsi: pointer to VSI + * @rule_id: id of rule + * @recipe_id: id of recipe + * @new_flags: flags to update + * + * Function updates flags on regular and advance rule. + * + * Flags should be a combination of ICE_SINGLE_ACT_LB_ENABLE and + * ICE_SINGLE_ACT_LAN_ENABLE. + */ +enum ice_status +ice_fltr_update_flags(struct ice_vsi *vsi, u16 rule_id, u16 recipe_id, + u32 new_flags) +{ + struct ice_adv_fltr_mgmt_list_entry *adv_entry; + struct ice_fltr_mgmt_list_entry *regular_entry; + struct ice_hw *hw = &vsi->back->hw; + struct ice_sw_recipe *recp_list; + struct list_head *fltr_rules; + + recp_list = &hw->switch_info->recp_list[recipe_id]; + if (!recp_list) + return ICE_ERR_DOES_NOT_EXIST; + + fltr_rules = &recp_list->filt_rules; + regular_entry = ice_fltr_find_regular_entry(fltr_rules, rule_id); + if (regular_entry) + return ice_fltr_update_regular_rule(vsi, recipe_id, + regular_entry, new_flags); + + adv_entry = ice_fltr_find_adv_entry(fltr_rules, rule_id); + if (adv_entry) + return ice_fltr_update_adv_rule_flags(vsi, recipe_id, + adv_entry, new_flags); + + return ICE_ERR_DOES_NOT_EXIST; +} + +/** + * ice_fltr_update_flags_dflt_rule - update flags on default rule + * @vsi: pointer to VSI + * @rule_id: id of rule + * @direction: Tx or Rx + * @new_flags: flags to update + * + * Function updates flags on default rule with ICE_SW_LKUP_DFLT. 
+ * + * Flags should be a combination of ICE_SINGLE_ACT_LB_ENABLE and + * ICE_SINGLE_ACT_LAN_ENABLE. + */ +enum ice_status +ice_fltr_update_flags_dflt_rule(struct ice_vsi *vsi, u16 rule_id, u8 direction, + u32 new_flags) +{ + u32 action = ice_fltr_build_action(vsi->vsi_num); + struct ice_hw *hw = &vsi->back->hw; + + return ice_fltr_update_rule_flags(hw, rule_id, ICE_SW_LKUP_DFLT, action, + direction, vsi->vsi_num, new_flags); +} diff --git a/drivers/net/ethernet/intel/ice/ice_fltr.h b/drivers/net/ethernet/intel/ice/ice_fltr.h new file mode 100644 index 000000000000..20723d0b88e3 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_fltr.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_FLTR_H_ +#define _ICE_FLTR_H_ + +#include "ice_vlan.h" + +void ice_fltr_free_list(struct device *dev, struct list_head *h); +enum ice_status +ice_fltr_set_vlan_vsi_promisc(struct ice_hw *hw, struct ice_vsi *vsi, u8 promisc_mask); +enum ice_status +ice_fltr_clear_vlan_vsi_promisc(struct ice_hw *hw, struct ice_vsi *vsi, u8 promisc_mask); +enum ice_status +ice_fltr_clear_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, + u16 vid, u8 lport); +enum ice_status +ice_fltr_set_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, + u16 vid, u8 lport); +int +ice_fltr_add_mac_to_list(struct ice_vsi *vsi, struct list_head *list, + const u8 *mac, enum ice_sw_fwd_act_type action); +enum ice_status +ice_fltr_add_mac(struct ice_vsi *vsi, const u8 *mac, + enum ice_sw_fwd_act_type action); +enum ice_status +ice_fltr_add_mac_and_broadcast(struct ice_vsi *vsi, const u8 *mac, + enum ice_sw_fwd_act_type action); +enum ice_status +ice_fltr_add_mac_list(struct ice_vsi *vsi, struct list_head *list); +enum ice_status +ice_fltr_remove_mac(struct ice_vsi *vsi, const u8 *mac, + enum ice_sw_fwd_act_type action); +enum ice_status +ice_fltr_remove_mac_list(struct ice_vsi *vsi, struct list_head *list); + +enum ice_status ice_fltr_add_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan); +enum ice_status ice_fltr_remove_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan); +enum ice_status +ice_fltr_add_mac_vlan(struct ice_vsi *vsi, const u8 *mac, u16 vlan_id, + enum ice_sw_fwd_act_type action); +enum ice_status +ice_fltr_remove_mac_vlan(struct ice_vsi *vsi, const u8 *mac, u16 vlan_id, + enum ice_sw_fwd_act_type action); + +enum ice_status +ice_fltr_add_eth(struct ice_vsi *vsi, u16 ethertype, u16 flag, + enum ice_sw_fwd_act_type action); +enum ice_status +ice_fltr_remove_eth(struct ice_vsi *vsi, u16 ethertype, u16 flag, + enum ice_sw_fwd_act_type action); +void ice_fltr_remove_all(struct ice_vsi *vsi); + +enum ice_status +ice_fltr_update_flags(struct ice_vsi *vsi, u16 rule_id, u16 recipe_id, + u32 new_flags); +enum ice_status +ice_fltr_update_flags_dflt_rule(struct ice_vsi *vsi, u16 rule_id, u8 direction, + u32 new_flags); +#endif diff --git a/drivers/net/ethernet/intel/ice/ice_fw_update.c b/drivers/net/ethernet/intel/ice/ice_fw_update.c new file mode 100644 index 000000000000..8cb0d88351fb --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_fw_update.c @@ -0,0 +1,845 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include +#include +#include +#if IS_ENABLED(CONFIG_PLDMFW) +#include +#else +#include "kcompat_pldmfw.h" +#endif + +#include "ice.h" +#include "ice_fw_update.h" + +struct ice_fwu_priv { + struct pldmfw context; + + struct ice_pf *pf; + struct netlink_ext_ack *extack; + + /* Track which NVM banks to activate at the end of the update */ + u8 activate_flags; +}; + +/** + * ice_send_package_data - Send record package data to firmware + * @context: PLDM fw update structure + * @data: pointer to the package data + * @length: length of the package data + * + * Send a copy of the package data associated with the PLDM record matching + * this device to the firmware. + * + * Note that this function sends an AdminQ command that will fail unless the + * NVM resource has been acquired. + * + * Returns: zero on success, or a negative error code on failure. + */ +static int +ice_send_package_data(struct pldmfw *context, const u8 *data, u16 length) +{ + struct ice_fwu_priv *priv = container_of(context, struct ice_fwu_priv, context); + struct netlink_ext_ack *extack = priv->extack; + struct device *dev = context->dev; + struct ice_pf *pf = priv->pf; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u8 *package_data; + + dev_dbg(dev, "Sending PLDM record package data to firmware\n"); + + package_data = kmemdup(data, length, GFP_KERNEL); + if (!package_data) + return -ENOMEM; + + status = ice_nvm_set_pkg_data(hw, false, package_data, length, NULL); + + kfree(package_data); + + if (status) { + dev_err(dev, "Failed to send record package data to firmware, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + NL_SET_ERR_MSG_MOD(extack, "Failed to record package data to firmware"); + return -EIO; + } + + return 0; +} + +/** + * ice_check_component_response - Report firmware response to a component + * @pf: device private data structure + * @id: component id being checked + * @response: indicates whether this component can be updated + * @code: code indicating reason for response + * @extack: netlink extended ACK structure + * + * Check whether firmware indicates if this component can be updated. Report + * a suitable error message over the netlink extended ACK if the component + * cannot be updated. + * + * Returns: zero if the component can be updated, or -ECANCELED of the + * firmware indicates the component cannot be updated. + */ +static int +ice_check_component_response(struct ice_pf *pf, u16 id, u8 response, u8 code, + struct netlink_ext_ack *extack) +{ + struct device *dev = ice_pf_to_dev(pf); + const char *component; + + switch (id) { + case NVM_COMP_ID_OROM: + component = "fw.undi"; + break; + case NVM_COMP_ID_NVM: + component = "fw.mgmt"; + break; + case NVM_COMP_ID_NETLIST: + component = "fw.netlist"; + break; + default: + WARN(1, "Unexpected unknown component identifier 0x%02x", id); + return -EINVAL; + } + + dev_dbg(dev, "%s: firmware response 0x%x, code 0x%x\n", + component, response, code); + + switch (response) { + case ICE_AQ_NVM_PASS_COMP_CAN_BE_UPDATED: + /* firmware indicated this update is good to proceed */ + return 0; + case ICE_AQ_NVM_PASS_COMP_CAN_MAY_BE_UPDATEABLE: + dev_warn(dev, "firmware recommends not updating %s, as it may result in a downgrade. 
Continuing anyways\n", + component); + return 0; + case ICE_AQ_NVM_PASS_COMP_CAN_NOT_BE_UPDATED: + dev_info(dev, "firmware has rejected updating %s\n", component); + break; + } + + switch (code) { + case ICE_AQ_NVM_PASS_COMP_STAMP_IDENTICAL_CODE: + dev_err(dev, "Component comparison stamp for %s is identical to the running image\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Component comparison stamp is identical to running image"); + break; + case ICE_AQ_NVM_PASS_COMP_STAMP_LOWER: + dev_err(dev, "Component comparison stamp for %s is lower than the running image\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Component comparison stamp is lower than running image"); + break; + case ICE_AQ_NVM_PASS_COMP_INVALID_STAMP_CODE: + dev_err(dev, "Component comparison stamp for %s is invalid\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Component comparison stamp is invalid"); + break; + case ICE_AQ_NVM_PASS_COMP_CONFLICT_CODE: + dev_err(dev, "%s conflicts with a previous component table\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Component table conflict occurred"); + break; + case ICE_AQ_NVM_PASS_COMP_PRE_REQ_NOT_MET_CODE: + dev_err(dev, "Pre-requisites for component %s have not been met\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Component pre-requisites not met"); + break; + case ICE_AQ_NVM_PASS_COMP_NOT_SUPPORTED_CODE: + dev_err(dev, "%s is not a supported component\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Component not supported"); + break; + case ICE_AQ_NVM_PASS_COMP_CANNOT_DOWNGRADE_CODE: + dev_err(dev, "Security restrictions prevent %s from being downgraded\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Component cannot be downgraded"); + break; + case ICE_AQ_NVM_PASS_COMP_INCOMPLETE_IMAGE_CODE: + dev_err(dev, "Received an incomplete component image for %s\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Incomplete component image"); + break; + case ICE_AQ_NVM_PASS_COMP_VER_STR_IDENTICAL_CODE: + dev_err(dev, "Component version for %s is identical to the running image\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Component version is identical to running image"); + break; + case ICE_AQ_NVM_PASS_COMP_VER_STR_LOWER_CODE: + dev_err(dev, "Component version for %s is lower than the running image\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Component version is lower than the running image"); + break; + default: + dev_err(dev, "Unexpected response code 0x02%x for %s\n", + code, component); + NL_SET_ERR_MSG_MOD(extack, "Received unexpected response code from firmware"); + break; + } + + return -ECANCELED; +} + +/** + * ice_send_component_table - Send PLDM component table to firmware + * @context: PLDM fw update structure + * @component: the component to process + * @transfer_flag: relative transfer order of this component + * + * Read relevant data from the component and forward it to the device + * firmware. Check the response to determine if the firmware indicates that + * the update can proceed. + * + * This function sends AdminQ commands related to the NVM, and assumes that + * the NVM resource has been acquired. + * + * Returns: zero on success, or a negative error code on failure. 
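ice_check_component_response() above and ice_flash_component() further down both key off the same three PLDM component IDs. A condensed sketch of that mapping (hypothetical helper, not part of the patch):

/* Map a PLDM component ID to the devlink component name used in the logs */
static const char *example_component_name(u16 id)
{
        switch (id) {
        case NVM_COMP_ID_OROM:
                return "fw.undi";
        case NVM_COMP_ID_NVM:
                return "fw.mgmt";
        case NVM_COMP_ID_NETLIST:
                return "fw.netlist";
        default:
                return NULL;
        }
}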
+ */ +static int +ice_send_component_table(struct pldmfw *context, struct pldmfw_component *component, + u8 transfer_flag) +{ + struct ice_fwu_priv *priv = container_of(context, struct ice_fwu_priv, context); + struct netlink_ext_ack *extack = priv->extack; + struct ice_aqc_nvm_comp_tbl *comp_tbl; + u8 comp_response, comp_response_code; + struct device *dev = context->dev; + struct ice_pf *pf = priv->pf; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + size_t length; + + switch (component->identifier) { + case NVM_COMP_ID_OROM: + case NVM_COMP_ID_NVM: + case NVM_COMP_ID_NETLIST: + break; + default: + dev_err(dev, "Unable to update due to a firmware component with unknown ID %u\n", + component->identifier); + NL_SET_ERR_MSG_MOD(extack, "Unable to update due to unknown firmware component"); + return -EOPNOTSUPP; + } + + length = struct_size(comp_tbl, cvs, component->version_len); + comp_tbl = kzalloc(length, GFP_KERNEL); + if (!comp_tbl) + return -ENOMEM; + + comp_tbl->comp_class = cpu_to_le16(component->classification); + comp_tbl->comp_id = cpu_to_le16(component->identifier); + comp_tbl->comp_class_idx = FWU_COMP_CLASS_IDX_NOT_USE; + comp_tbl->comp_cmp_stamp = cpu_to_le32(component->comparison_stamp); + comp_tbl->cvs_type = component->version_type; + comp_tbl->cvs_len = component->version_len; + memcpy(comp_tbl->cvs, component->version_string, component->version_len); + + dev_dbg(dev, "Sending component table to firmware\n"); + + status = ice_nvm_pass_component_tbl(hw, (u8 *)comp_tbl, length, + transfer_flag, &comp_response, + &comp_response_code, NULL); + + kfree(comp_tbl); + + if (status) { + dev_err(dev, "Failed to transfer component table to firmware, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + NL_SET_ERR_MSG_MOD(extack, "Failed to transfer component table to firmware"); + return -EIO; + } + + return ice_check_component_response(pf, component->identifier, comp_response, + comp_response_code, extack); +} + +/** + * ice_write_one_nvm_block - Write an NVM block and await completion response + * @pf: the PF data structure + * @module: the module to write to + * @offset: offset in bytes + * @block_size: size of the block to write, up to 4k + * @block: pointer to block of data to write + * @last_cmd: whether this is the last command + * @extack: netlink extended ACK structure + * + * Write a block of data to a flash module, and await for the completion + * response message from firmware. + * + * Note this function assumes the caller has acquired the NVM resource. + * + * Returns: zero on success, or a negative error code on failure. 
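The write path below, like the erase and bank-activation paths later in this file, follows the same request/confirm shape: issue the AdminQ command, wait for the matching completion event, then validate the returned descriptor. A condensed sketch of that shape, built only from calls already used in this file (hypothetical helper, not part of the patch):

/* Issue one NVM write and wait for its completion event */
static int example_nvm_write_and_wait(struct ice_pf *pf, u16 module, u32 offset,
                                      u16 len, u8 *data, bool last_cmd)
{
        struct ice_rq_event_info event;
        struct ice_hw *hw = &pf->hw;
        enum ice_status status;
        int err;

        memset(&event, 0, sizeof(event));

        status = ice_aq_update_nvm(hw, module, offset, len, data,
                                   last_cmd, 0, NULL);
        if (status)
                return -EIO;

        err = ice_aq_wait_for_event(pf, ice_aqc_opc_nvm_write, 15 * HZ, &event);
        if (err)
                return err;

        return le16_to_cpu(event.desc.retval) ? -EIO : 0;
}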
+ */ +static int +ice_write_one_nvm_block(struct ice_pf *pf, u16 module, u32 offset, + u16 block_size, u8 *block, bool last_cmd, + struct netlink_ext_ack *extack) +{ + u16 completion_module, completion_retval; + struct device *dev = ice_pf_to_dev(pf); + struct ice_rq_event_info event; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u32 completion_offset; + int err; + + memset(&event, 0, sizeof(event)); + + dev_dbg(dev, "Writing block of %u bytes for module 0x%02x at offset %u\n", + block_size, module, offset); + + status = ice_aq_update_nvm(hw, module, offset, block_size, block, + last_cmd, 0, NULL); + if (status) { + dev_err(dev, "Failed to flash module 0x%02x with block of size %u at offset %u, err %s aq_err %s\n", + module, block_size, offset, ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + NL_SET_ERR_MSG_MOD(extack, "Failed to program flash module"); + return -EIO; + } + + /* In most cases, firmware reports a write completion within a few + * milliseconds. However, it has been observed that a completion might + * take more than a second to complete in some cases. The timeout here + * is conservative and is intended to prevent failure to update when + * firmware is slow to respond. + */ + err = ice_aq_wait_for_event(pf, ice_aqc_opc_nvm_write, 15*HZ, &event); + if (err) { + dev_err(dev, "Timed out while trying to flash module 0x%02x with block of size %u at offset %u, err %d\n", + module, block_size, offset, err); + NL_SET_ERR_MSG_MOD(extack, "Timed out waiting for firmware"); + return -EIO; + } + + completion_module = le16_to_cpu(event.desc.params.nvm.module_typeid); + completion_retval = le16_to_cpu(event.desc.retval); + + completion_offset = le16_to_cpu(event.desc.params.nvm.offset_low); + completion_offset |= event.desc.params.nvm.offset_high << 16; + + if (completion_module != module) { + dev_err(dev, "Unexpected module_typeid in write completion: got 0x%x, expected 0x%x\n", + completion_module, module); + NL_SET_ERR_MSG_MOD(extack, "Unexpected firmware response"); + return -EIO; + } + + if (completion_offset != offset) { + dev_err(dev, "Unexpected offset in write completion: got %u, expected %u\n", + completion_offset, offset); + NL_SET_ERR_MSG_MOD(extack, "Unexpected firmware response"); + return -EIO; + } + + if (completion_retval) { + dev_err(dev, "Firmware ailed to flash module 0x%02x with block of size %u at offset %u, err %s\n", + module, block_size, offset, + ice_aq_str((enum ice_aq_err)completion_retval)); + NL_SET_ERR_MSG_MOD(extack, "Firmware failed to program flash module"); + return -EIO; + } + + return 0; +} + +/** + * ice_write_nvm_module - Write data to an NVM module + * @pf: the PF driver structure + * @module: the module id to program + * @component: the name of the component being updated + * @image: buffer of image data to write to the NVM + * @length: length of the buffer + * @extack: netlink extended ACK structure + * + * Loop over the data for a given NVM module and program it in 4 Kb + * blocks. Notify devlink core of progress after each block is programmed. + * Loops over a block of data and programs the NVM in 4k block chunks. + * + * Note this function assumes the caller has acquired the NVM resource. + * + * Returns: zero on success, or a negative error code on failure. 
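The function below slices the image into ICE_AQ_MAX_BUF_LEN-sized blocks and marks the final block so firmware knows the transfer is complete. A worked walk-through of that slicing, assuming ICE_AQ_MAX_BUF_LEN is the 4 KB block size mentioned above and a 10000-byte image (illustrative only):

/*
 * pass 1: offset = 0,    block_size = 4096, last_cmd = false
 * pass 2: offset = 4096, block_size = 4096, last_cmd = false
 * pass 3: offset = 8192, block_size = 1808, last_cmd = true
 */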
+ */ +static int +ice_write_nvm_module(struct ice_pf *pf, u16 module, const char *component, + const u8 *image, u32 length, + struct netlink_ext_ack *extack) +{ + struct device *dev = ice_pf_to_dev(pf); + struct devlink *devlink; + u32 offset = 0; + bool last_cmd; + u8 *block; + int err; + + dev_dbg(dev, "Beginning write of flash component '%s', module 0x%02x\n", component, module); + + devlink = priv_to_devlink(pf); + + devlink_flash_update_status_notify(devlink, "Flashing", + component, 0, length); + + block = kzalloc(ICE_AQ_MAX_BUF_LEN, GFP_KERNEL); + if (!block) + return -ENOMEM; + + do { + u32 block_size; + + block_size = min_t(u32, ICE_AQ_MAX_BUF_LEN, length - offset); + last_cmd = !(offset + block_size < length); + + /* ice_aq_update_nvm may copy the firmware response into the + * buffer, so we must make a copy since the source data is + * constant. + */ + memcpy(block, image + offset, block_size); + + err = ice_write_one_nvm_block(pf, module, offset, block_size, + block, last_cmd, extack); + if (err) + break; + + offset += block_size; + + devlink_flash_update_status_notify(devlink, "Flashing", + component, offset, length); + } while (!last_cmd); + + dev_dbg(dev, "Completed write of flash component '%s', module 0x%02x\n", component, module); + + if (err) + devlink_flash_update_status_notify(devlink, "Flashing failed", + component, length, length); + else + devlink_flash_update_status_notify(devlink, "Flashing done", + component, length, length); + + kfree(block); + return err; +} + +/** + * ice_erase_nvm_module - Erase an NVM module and await firmware completion + * @pf: the PF data structure + * @module: the module to erase + * @component: name of the component being updated + * @extack: netlink extended ACK structure + * + * Erase the inactive NVM bank associated with this module, and await for + * a completion response message from firmware. + * + * Note this function assumes the caller has acquired the NVM resource. + * + * Returns: zero on success, or a negative error code on failure. 
+ */ +static int +ice_erase_nvm_module(struct ice_pf *pf, u16 module, const char *component, + struct netlink_ext_ack *extack) +{ + u16 completion_module, completion_retval; + struct device *dev = ice_pf_to_dev(pf); + struct ice_rq_event_info event; + struct ice_hw *hw = &pf->hw; + struct devlink *devlink; + enum ice_status status; + int err; + + dev_dbg(dev, "Beginning erase of flash component '%s', module 0x%02x\n", component, module); + + memset(&event, 0, sizeof(event)); + + devlink = priv_to_devlink(pf); + + devlink_flash_update_status_notify(devlink, "Erasing", component, 0, 0); + + status = ice_aq_erase_nvm(hw, module, NULL); + if (status) { + dev_err(dev, "Failed to erase %s (module 0x%02x), err %s aq_err %s\n", + component, module, ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + NL_SET_ERR_MSG_MOD(extack, "Failed to erase flash module"); + err = -EIO; + goto out_notify_devlink; + } + + /* Yes, this really can take minutes to complete */ + err = ice_aq_wait_for_event(pf, ice_aqc_opc_nvm_erase, 300 * HZ, &event); + if (err) { + dev_err(dev, "Timed out waiting for firmware to respond with erase completion for %s (module 0x%02x), err %d\n", + component, module, err); + NL_SET_ERR_MSG_MOD(extack, "Timed out waiting for firmware"); + goto out_notify_devlink; + } + + completion_module = le16_to_cpu(event.desc.params.nvm.module_typeid); + completion_retval = le16_to_cpu(event.desc.retval); + + if (completion_module != module) { + dev_err(dev, "Unexpected module_typeid in erase completion for %s: got 0x%x, expected 0x%x\n", + component, completion_module, module); + NL_SET_ERR_MSG_MOD(extack, "Unexpected firmware response"); + err = -EIO; + goto out_notify_devlink; + } + + if (completion_retval) { + dev_err(dev, "Firmware failed to erase %s (module 0x02%x), aq_err %s\n", + component, module, + ice_aq_str((enum ice_aq_err)completion_retval)); + NL_SET_ERR_MSG_MOD(extack, "Firmware failed to erase flash"); + err = -EIO; + goto out_notify_devlink; + } + + dev_dbg(dev, "Completed erase of flash component '%s', module 0x%02x\n", component, module); + +out_notify_devlink: + if (err) + devlink_flash_update_status_notify(devlink, "Erasing failed", + component, 0, 0); + else + devlink_flash_update_status_notify(devlink, "Erasing done", + component, 0, 0); + + return err; +} + +/** + * ice_switch_flash_banks - Tell firmware to switch NVM banks + * @pf: Pointer to the PF data structure + * @activate_flags: flags used for the activation command + * @extack: netlink extended ACK structure + * + * Notify firmware to activate the newly written flash banks, and wait for the + * firmware response. + * + * Returns: zero on success or an error code on failure. + */ +static int ice_switch_flash_banks(struct ice_pf *pf, u8 activate_flags, + struct netlink_ext_ack *extack) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_rq_event_info event; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u16 completion_retval; + int err; + + memset(&event, 0, sizeof(event)); + + status = ice_nvm_write_activate(hw, activate_flags); + if (status) { + dev_err(dev, "Failed to switch active flash banks, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + NL_SET_ERR_MSG_MOD(extack, "Failed to switch active flash banks"); + return -EIO; + } + + /* In most cases, we expect firmware to respond with a completion + * within a few milliseconds. However, it has been observed in + * practice that firmware may sometimes take longer. 
The wait time + * here is conservative to reduce the risk of a failed update simply + * because we did not wait long enough for firmware to respond. + */ + err = ice_aq_wait_for_event(pf, ice_aqc_opc_nvm_write_activate, 30 * HZ, + &event); + if (err) { + dev_err(dev, "Timed out waiting for firmware to switch active flash banks, err %d\n", + err); + NL_SET_ERR_MSG_MOD(extack, "Timed out waiting for firmware"); + return err; + } + + completion_retval = le16_to_cpu(event.desc.retval); + if (completion_retval) { + dev_err(dev, "Firmware failed to switch active flash banks aq_err %s\n", + ice_aq_str((enum ice_aq_err)completion_retval)); + NL_SET_ERR_MSG_MOD(extack, "Firmware failed to switch active flash banks"); + return -EIO; + } + + return 0; +} + +/** + * ice_flash_component - Flash a component of the NVM + * @context: PLDM fw update structure + * @component: the component table to program + * + * Program the flash contents for a given component. First, determine the + * module id. Then, erase the secondary bank for this module. Finally, write + * the contents of the component to the NVM. + * + * Note this function assumes the caller has acquired the NVM resource. + * + * Returns: zero on success, or a negative error code on failure. + */ +static int +ice_flash_component(struct pldmfw *context, struct pldmfw_component *component) +{ + struct ice_fwu_priv *priv = container_of(context, struct ice_fwu_priv, context); + struct netlink_ext_ack *extack = priv->extack; + struct ice_pf *pf = priv->pf; + const char *name; + u16 module; + u8 flag; + int err; + + switch (component->identifier) { + case NVM_COMP_ID_OROM: + module = ICE_SR_1ST_OROM_BANK_PTR; + flag = ICE_AQC_NVM_ACTIV_SEL_OROM; + name = "fw.undi"; + break; + case NVM_COMP_ID_NVM: + module = ICE_SR_1ST_NVM_BANK_PTR; + flag = ICE_AQC_NVM_ACTIV_SEL_NVM; + name = "fw.mgmt"; + break; + case NVM_COMP_ID_NETLIST: + module = ICE_SR_NETLIST_BANK_PTR; + flag = ICE_AQC_NVM_ACTIV_SEL_NETLIST; + name = "fw.netlist"; + break; + default: + /* This should not trigger, since we check the id before + * sending the component table to firmware. + */ + WARN(1, "Unexpected unknown component identifier 0x%02x", + component->identifier); + return -EINVAL; + } + + /* Mark this component for activating at the end */ + priv->activate_flags |= flag; + + err = ice_erase_nvm_module(pf, module, name, extack); + if (err) + return err; + + return ice_write_nvm_module(pf, module, name, component->component_data, + component->component_size, extack); +} + +/** + * ice_finalize_update - Perform last steps to complete device update + * @context: PLDM fw update structure + * + * Called as the last step of the update process. Complete the update by + * telling the firmware to switch active banks, and perform a reset if + * configured. + * + * Returns: 0 on success, or an error code on failure.
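The activate_flags byte referred to throughout these functions is built up in three places; the sketch below only illustrates that composition, using the ICE_AQC_NVM_* values already used in this file (the function itself is not part of the driver):

/* Illustration only: how activate_flags is composed across the update. */
static u8 example_activate_flags(void)
{
	/* ice_flash_pldm_image() seeds the field with the preservation level */
	u8 flags = ICE_AQC_NVM_PRESERVE_SELECTED;

	/* ice_flash_component() ORs in one selection bit per component written */
	flags |= ICE_AQC_NVM_ACTIV_SEL_NVM;	/* fw.mgmt */
	flags |= ICE_AQC_NVM_ACTIV_SEL_OROM;	/* fw.undi */

	/* ice_finalize_update() hands the result to ice_nvm_write_activate() */
	return flags;
}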
+ */ +static int ice_finalize_update(struct pldmfw *context) +{ + struct ice_fwu_priv *priv = container_of(context, struct ice_fwu_priv, context); + struct netlink_ext_ack *extack = priv->extack; + struct ice_pf *pf = priv->pf; + int err; + + /* Finally, notify firmware to activate the written NVM banks */ + err = ice_switch_flash_banks(pf, priv->activate_flags, extack); + if (err) + return err; + + /* Perform an immediate reset only if PRESERVE_ALL is selected */ + if ((priv->activate_flags & ICE_AQC_NVM_PRESERVATION_M) == ICE_AQC_NVM_PRESERVE_ALL) { + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + + status = ice_aq_nvm_update_empr(hw); + if (status) { + dev_err(dev, "Failed to trigger immediate device reset, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + NL_SET_ERR_MSG_MOD(extack, "Failed to trigger immediate device reset"); + return -EIO; + } + } + + return 0; +} + +static const struct pldmfw_ops ice_fwu_ops = { + .match_record = &pldmfw_op_pci_match_record, + .send_package_data = &ice_send_package_data, + .send_component_table = &ice_send_component_table, + .flash_component = &ice_flash_component, + .finalize_update = &ice_finalize_update, +}; + +/** + * ice_flash_pldm_image - Write a PLDM-formatted firmware image to the device + * @pf: private device driver structure + * @fw: firmware object pointing to the relevant firmware file + * @preservation: preservation level to request from firmware + * @extack: netlink extended ACK structure + * + * Parse the data for a given firmware file, verifying that it is a valid PLDM + * formatted image that matches this device. + * + * Extract the device record Package Data and Component Tables and send them + * to the firmware. Extract and write the flash data for each of the three + * main flash components, "fw.mgmt", "fw.undi", and "fw.netlist". Notify + * firmware once the data is written to the inactive banks. + * + * Returns: zero on success or a negative error code on failure. + */ +int ice_flash_pldm_image(struct ice_pf *pf, const struct firmware *fw, + u8 preservation, struct netlink_ext_ack *extack) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + struct ice_fwu_priv priv; + enum ice_status status; + int err; + + switch (preservation) { + case ICE_AQC_NVM_PRESERVE_ALL: + case ICE_AQC_NVM_PRESERVE_SELECTED: + case ICE_AQC_NVM_NO_PRESERVATION: + case ICE_AQC_NVM_FACTORY_DEFAULT: + break; + default: + WARN(1, "Unexpected preservation level request %u", preservation); + return -EINVAL; + } + + memset(&priv, 0, sizeof(priv)); + + priv.context.ops = &ice_fwu_ops; + priv.context.dev = dev; + priv.extack = extack; + priv.pf = pf; + priv.activate_flags = preservation; + + status = ice_acquire_nvm(hw, ICE_RES_WRITE); + if (status) { + dev_err(dev, "Failed to acquire device flash lock, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + NL_SET_ERR_MSG_MOD(extack, "Failed to acquire device flash lock"); + return -EIO; + } + + err = pldmfw_flash_image(&priv.context, fw); + if (err == -ENOENT) { + dev_err(dev, "Firmware image has no record matching this device\n"); + NL_SET_ERR_MSG_MOD(extack, "Firmware image has no record matching this device"); + } else if (err) { + /* Do not set a generic extended ACK message here. A more + * specific message may already have been set by one of our + * ops. 
+ */ + dev_err(dev, "Failed to flash PLDM image, err %d", err); + } + + ice_release_nvm(hw); + + return err; +} + +/** + * ice_check_for_pending_update - Check for a pending flash update + * @pf: the PF driver structure + * @component: if not NULL, the name of the component being updated + * @extack: Netlink extended ACK structure + * + * Check whether the device already has a pending flash update. If such an + * update is found, cancel it so that the requested update may proceed. + * + * Returns: zero on success, or a negative error code on failure. + */ +int ice_check_for_pending_update(struct ice_pf *pf, const char *component, + struct netlink_ext_ack *extack) +{ + struct devlink *devlink = priv_to_devlink(pf); + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw_dev_caps *dev_caps; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u8 pending = 0; + int err; + + dev_caps = kzalloc(sizeof(*dev_caps), GFP_KERNEL); + if (!dev_caps) + return -ENOMEM; + + /* Read the most recent device capabilities from firmware. Do not use + * the cached values in hw->dev_caps, because the pending update flag + * may have changed, e.g. if an update was previously completed and + * the system has not yet rebooted. + */ + status = ice_discover_dev_caps(hw, dev_caps); + if (status) { + NL_SET_ERR_MSG_MOD(extack, "Unable to read device capabilities"); + kfree(dev_caps); + return -EIO; + } + + if (dev_caps->common_cap.nvm_update_pending_nvm) { + dev_info(dev, "The fw.mgmt flash component has a pending update\n"); + pending |= ICE_AQC_NVM_ACTIV_SEL_NVM; + } + + if (dev_caps->common_cap.nvm_update_pending_orom) { + dev_info(dev, "The fw.undi flash component has a pending update\n"); + pending |= ICE_AQC_NVM_ACTIV_SEL_OROM; + } + + if (dev_caps->common_cap.nvm_update_pending_netlist) { + dev_info(dev, "The fw.netlist flash component has a pending update\n"); + pending |= ICE_AQC_NVM_ACTIV_SEL_NETLIST; + } + + kfree(dev_caps); + + /* If the flash_update request is for a specific component, ignore all + * of the other components. + */ + if (component) { + if (strcmp(component, "fw.mgmt") == 0) + pending &= ICE_AQC_NVM_ACTIV_SEL_NVM; + else if (strcmp(component, "fw.undi") == 0) + pending &= ICE_AQC_NVM_ACTIV_SEL_OROM; + else if (strcmp(component, "fw.netlist") == 0) + pending &= ICE_AQC_NVM_ACTIV_SEL_NETLIST; + else + WARN(1, "Unexpected flash component %s", component); + } + + /* There is no previous pending update, so this request may continue */ + if (!pending) + return 0; + + /* In order to allow overwriting a previous pending update, notify + * firmware to cancel that update by issuing the appropriate command. + */ + devlink_flash_update_status_notify(devlink, + "Canceling previous pending update", + component, 0, 0); + + status = ice_acquire_nvm(hw, ICE_RES_WRITE); + if (status) { + dev_err(dev, "Failed to acquire device flash lock, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + NL_SET_ERR_MSG_MOD(extack, "Failed to acquire device flash lock"); + return -EIO; + } + + pending |= ICE_AQC_NVM_REVERT_LAST_ACTIV; + err = ice_switch_flash_banks(pf, pending, extack); + + ice_release_nvm(hw); + + return err; +} diff --git a/drivers/net/ethernet/intel/ice/ice_fw_update.h b/drivers/net/ethernet/intel/ice/ice_fw_update.h new file mode 100644 index 000000000000..0e083c0f9695 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_fw_update.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
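The two exported entry points, ice_check_for_pending_update() and ice_flash_pldm_image(), are intended to be called back to back from the devlink flash handler. The sketch below shows that pairing under simplifying assumptions: the real wiring lives in ice_devlink.c, and the wrapper name and preservation choice here are illustrative only.

/* Sketch of how a flash handler might combine the two entry points. */
static int example_flash_update(struct ice_pf *pf, const struct firmware *fw,
				struct netlink_ext_ack *extack)
{
	int err;

	/* Cancel any update already staged in the inactive flash banks */
	err = ice_check_for_pending_update(pf, NULL, extack);
	if (err)
		return err;

	/* Parse the PLDM image and flash every matching component */
	return ice_flash_pldm_image(pf, fw, ICE_AQC_NVM_PRESERVE_ALL, extack);
}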
*/ + +#ifndef _ICE_FW_UPDATE_H_ +#define _ICE_FW_UPDATE_H_ + +int ice_flash_pldm_image(struct ice_pf *pf, const struct firmware *fw, + u8 preservation, struct netlink_ext_ack *extack); +int ice_check_for_pending_update(struct ice_pf *pf, const char *component, + struct netlink_ext_ack *extack); + +#endif diff --git a/drivers/net/ethernet/intel/ice/ice_fwlog.c b/drivers/net/ethernet/intel/ice/ice_fwlog.c new file mode 100644 index 000000000000..cee8cf7ef36b --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_fwlog.c @@ -0,0 +1,477 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice_common.h" +#include "ice_fwlog.h" + +/** + * cache_cfg - Cache FW logging config + * @hw: pointer to the HW structure + * @cfg: config to cache + */ +static void cache_cfg(struct ice_hw *hw, struct ice_fwlog_cfg *cfg) +{ + hw->fwlog_cfg = *cfg; +} + +/** + * valid_module_entries - validate all the module entry IDs and log levels + * @hw: pointer to the HW structure + * @entries: entries to validate + * @num_entries: number of entries to validate + */ +static bool +valid_module_entries(struct ice_hw *hw, struct ice_fwlog_module_entry *entries, + u16 num_entries) +{ + u16 i; + + if (!entries) { + ice_debug(hw, ICE_DBG_FW_LOG, "Null ice_fwlog_module_entry array\n"); + return false; + } + + if (!num_entries) { + ice_debug(hw, ICE_DBG_FW_LOG, "num_entries must be non-zero\n"); + return false; + } + + for (i = 0; i < num_entries; i++) { + struct ice_fwlog_module_entry *entry = &entries[i]; + + if (entry->module_id >= ICE_AQC_FW_LOG_ID_MAX) { + ice_debug(hw, ICE_DBG_FW_LOG, "Invalid module_id %u, max valid module_id is %u\n", + entry->module_id, ICE_AQC_FW_LOG_ID_MAX - 1); + return false; + } + + if (entry->log_level >= ICE_FWLOG_LEVEL_INVALID) { + ice_debug(hw, ICE_DBG_FW_LOG, "Invalid log_level %u, max valid log_level is %u\n", + entry->log_level, + ICE_FWLOG_LEVEL_INVALID - 1); + return false; + } + } + + return true; +} + +/** + * valid_cfg - validate entire configuration + * @hw: pointer to the HW structure + * @cfg: config to validate + */ +static bool valid_cfg(struct ice_hw *hw, struct ice_fwlog_cfg *cfg) +{ + if (!cfg) { + ice_debug(hw, ICE_DBG_FW_LOG, "Null ice_fwlog_cfg\n"); + return false; + } + + if (cfg->log_resolution < ICE_AQC_FW_LOG_MIN_RESOLUTION || + cfg->log_resolution > ICE_AQC_FW_LOG_MAX_RESOLUTION) { + ice_debug(hw, ICE_DBG_FW_LOG, "Unsupported log_resolution %u, must be between %u and %u\n", + cfg->log_resolution, ICE_AQC_FW_LOG_MIN_RESOLUTION, + ICE_AQC_FW_LOG_MAX_RESOLUTION); + return false; + } + + if (!valid_module_entries(hw, cfg->module_entries, + ICE_AQC_FW_LOG_ID_MAX)) + return false; + + return true; +} + +/** + * ice_fwlog_init - Initialize cached structures for tracking FW logging + * @hw: pointer to the HW structure + * @cfg: config used to initialize the cached structures + * + * This function should be called on driver initialization and before calling + * ice_init_hw(). Firmware logging will be configured based on these settings + * and the PF will be registered for firmware logging on init if + * ICE_FWLOG_OPTION_REGISTER_ON_INIT is set.
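A minimal sketch of the early-init usage described above, caching a configuration before ice_init_hw(); the log level, options and resolution chosen here are purely illustrative.

/* Illustrative only: cache a firmware logging config before ice_init_hw(). */
static void example_fwlog_early_init(struct ice_hw *hw)
{
	struct ice_fwlog_cfg cfg = {};
	u16 i;

	for (i = 0; i < ICE_AQC_FW_LOG_ID_MAX; i++) {
		cfg.module_entries[i].module_id = i;
		cfg.module_entries[i].log_level = ICE_FWLOG_LEVEL_NORMAL;
	}

	/* deliver events over the Admin Receive Queue and register at init */
	cfg.options = ICE_FWLOG_OPTION_ARQ_ENA |
		      ICE_FWLOG_OPTION_REGISTER_ON_INIT;
	cfg.log_resolution = ICE_AQC_FW_LOG_MIN_RESOLUTION;

	/* only caches the configuration; nothing is sent to firmware yet */
	ice_fwlog_init(hw, &cfg);
}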
+ */ +enum ice_status +ice_fwlog_init(struct ice_hw *hw, struct ice_fwlog_cfg *cfg) +{ + if (!valid_cfg(hw, cfg)) + return ICE_ERR_PARAM; + + cache_cfg(hw, cfg); + + return 0; +} + +/** + * ice_aq_fwlog_set - Set FW logging configuration AQ command (0xFF30) + * @hw: pointer to the HW structure + * @entries: entries to configure + * @num_entries: number of @entries + * @options: options from ice_fwlog_cfg->options structure + * @log_resolution: logging resolution + */ +static enum ice_status +ice_aq_fwlog_set(struct ice_hw *hw, struct ice_fwlog_module_entry *entries, + u16 num_entries, u16 options, u16 log_resolution) +{ + struct ice_aqc_fw_log_cfg_resp *fw_modules; + struct ice_aqc_fw_log *cmd; + struct ice_aq_desc desc; + enum ice_status status; + u16 i; + + fw_modules = devm_kcalloc(ice_hw_to_dev(hw), num_entries, + sizeof(*fw_modules), GFP_KERNEL); + if (!fw_modules) + return ICE_ERR_NO_MEMORY; + + for (i = 0; i < num_entries; i++) { + fw_modules[i].module_identifier = + cpu_to_le16(entries[i].module_id); + fw_modules[i].log_level = entries[i].log_level; + } + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logs_config); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + cmd = &desc.params.fw_log; + + cmd->cmd_flags = ICE_AQC_FW_LOG_CONF_SET_VALID; + cmd->ops.cfg.log_resolution = cpu_to_le16(log_resolution); + cmd->ops.cfg.mdl_cnt = cpu_to_le16(num_entries); + + if (options & ICE_FWLOG_OPTION_ARQ_ENA) + cmd->cmd_flags |= ICE_AQC_FW_LOG_CONF_AQ_EN; + if (options & ICE_FWLOG_OPTION_UART_ENA) + cmd->cmd_flags |= ICE_AQC_FW_LOG_CONF_UART_EN; + + status = ice_aq_send_cmd(hw, &desc, fw_modules, + sizeof(*fw_modules) * num_entries, + NULL); + + devm_kfree(ice_hw_to_dev(hw), fw_modules); + + return status; +} + +/** + * ice_fwlog_supported - Report whether FW supports FW logging (cached value) + * @hw: pointer to the HW structure + * + * This will always return false if called before ice_init_hw(), so it must be + * called after ice_init_hw(). + */ +bool ice_fwlog_supported(struct ice_hw *hw) +{ + return hw->fwlog_support_ena; +} + +/** + * ice_fwlog_set - Set the firmware logging settings + * @hw: pointer to the HW structure + * @cfg: config used to set firmware logging + * + * This function should be called whenever the driver needs to set the firmware + * logging configuration. It can be called on initialization, reset, or during + * runtime. + * + * If the PF wishes to receive FW logging then it must register via + * ice_fwlog_register. Note that ice_fwlog_register does not need to be called + * for init.
+ */ +enum ice_status +ice_fwlog_set(struct ice_hw *hw, struct ice_fwlog_cfg *cfg) +{ + enum ice_status status; + + if (!ice_fwlog_supported(hw)) + return ICE_ERR_NOT_SUPPORTED; + + if (!valid_cfg(hw, cfg)) + return ICE_ERR_PARAM; + + status = ice_aq_fwlog_set(hw, cfg->module_entries, + ICE_AQC_FW_LOG_ID_MAX, cfg->options, + cfg->log_resolution); + if (!status) + cache_cfg(hw, cfg); + + return status; +} + +/** + * update_cached_entries - Update module entries in cached FW logging config + * @hw: pointer to the HW structure + * @entries: entries to cache + * @num_entries: number of @entries + */ +static void +update_cached_entries(struct ice_hw *hw, struct ice_fwlog_module_entry *entries, + u16 num_entries) +{ + u16 i; + + for (i = 0; i < num_entries; i++) { + struct ice_fwlog_module_entry *updated = &entries[i]; + u16 j; + + for (j = 0; j < ICE_AQC_FW_LOG_ID_MAX; j++) { + struct ice_fwlog_module_entry *cached = + &hw->fwlog_cfg.module_entries[j]; + + if (cached->module_id == updated->module_id) { + cached->log_level = updated->log_level; + break; + } + } + } +} + +/** + * ice_fwlog_update_modules - Update the log level of 1 or more FW logging modules + * @hw: pointer to the HW structure + * @entries: array of ice_fwlog_module_entry(s) + * @num_entries: number of entries + * + * This function should be called to update the log level of 1 or more FW + * logging modules via module ID. + * + * Only the entries passed in will be affected. All other firmware logging + * settings will be unaffected. + */ +enum ice_status +ice_fwlog_update_modules(struct ice_hw *hw, + struct ice_fwlog_module_entry *entries, + u16 num_entries) +{ + struct ice_fwlog_cfg *cfg; + enum ice_status status; + + if (!ice_fwlog_supported(hw)) + return ICE_ERR_NOT_SUPPORTED; + + if (!valid_module_entries(hw, entries, num_entries)) + return ICE_ERR_PARAM; + + cfg = devm_kcalloc(ice_hw_to_dev(hw), 1, sizeof(*cfg), GFP_KERNEL); + if (!cfg) + return ICE_ERR_NO_MEMORY; + + status = ice_fwlog_get(hw, cfg); + if (status) + goto status_out; + + status = ice_aq_fwlog_set(hw, entries, num_entries, cfg->options, + cfg->log_resolution); + if (!status) + update_cached_entries(hw, entries, num_entries); + +status_out: + devm_kfree(ice_hw_to_dev(hw), cfg); + return status; +} + +/** + * ice_aq_fwlog_register - Register PF for firmware logging events (0xFF31) + * @hw: pointer to the HW structure + * @reg: true to register and false to unregister + */ +static enum ice_status ice_aq_fwlog_register(struct ice_hw *hw, bool reg) +{ + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logs_register); + + if (reg) + desc.params.fw_log.cmd_flags = ICE_AQC_FW_LOG_AQ_REGISTER; + + return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); +} + +/** + * ice_fwlog_register - Register the PF for firmware logging + * @hw: pointer to the HW structure + * + * After this call the PF will start to receive firmware logging based on the + * configuration set in ice_fwlog_set.
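A short sketch of the runtime path described by ice_fwlog_update_modules() above, raising a single module's verbosity while leaving the rest of the cached configuration untouched; the module_id value is whatever the caller cares about.

/* Illustrative only: raise one module's log level at runtime. */
static void example_fwlog_bump_level(struct ice_hw *hw, u16 module_id)
{
	struct ice_fwlog_module_entry entry = {
		.module_id = module_id,
		.log_level = ICE_FWLOG_LEVEL_VERBOSE,
	};

	if (!ice_fwlog_supported(hw))
		return;

	/* reprograms firmware and, on success, updates the cached entry */
	ice_fwlog_update_modules(hw, &entry, 1);
}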
+ */ +enum ice_status ice_fwlog_register(struct ice_hw *hw) +{ + enum ice_status status; + + if (!ice_fwlog_supported(hw)) + return ICE_ERR_NOT_SUPPORTED; + + status = ice_aq_fwlog_register(hw, true); + if (status) + ice_debug(hw, ICE_DBG_FW_LOG, "Failed to register for firmware logging events over ARQ\n"); + else + hw->fwlog_cfg.options |= ICE_FWLOG_OPTION_IS_REGISTERED; + + return status; +} + +/** + * ice_fwlog_unregister - Unregister the PF from firmware logging + * @hw: pointer to the HW structure + */ +enum ice_status ice_fwlog_unregister(struct ice_hw *hw) +{ + enum ice_status status; + + if (!ice_fwlog_supported(hw)) + return ICE_ERR_NOT_SUPPORTED; + + status = ice_aq_fwlog_register(hw, false); + if (status) + ice_debug(hw, ICE_DBG_FW_LOG, "Failed to unregister from firmware logging events over ARQ\n"); + else + hw->fwlog_cfg.options &= ~ICE_FWLOG_OPTION_IS_REGISTERED; + + return status; +} + +/** + * ice_aq_fwlog_get - Get the current firmware logging configuration (0xFF32) + * @hw: pointer to the HW structure + * @cfg: firmware logging configuration to populate + */ +static enum ice_status +ice_aq_fwlog_get(struct ice_hw *hw, struct ice_fwlog_cfg *cfg) +{ + struct ice_aqc_fw_log_cfg_resp *fw_modules; + struct ice_aqc_fw_log *cmd; + struct ice_aq_desc desc; + enum ice_status status; + u16 i, module_id_cnt; + void *buf; + + memset(cfg, 0, sizeof(*cfg)); + + buf = devm_kcalloc(ice_hw_to_dev(hw), 1, ICE_AQ_MAX_BUF_LEN, + GFP_KERNEL); + if (!buf) + return ICE_ERR_NO_MEMORY; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logs_query); + cmd = &desc.params.fw_log; + + cmd->cmd_flags = ICE_AQC_FW_LOG_AQ_QUERY; + + status = ice_aq_send_cmd(hw, &desc, buf, ICE_AQ_MAX_BUF_LEN, NULL); + if (status) { + ice_debug(hw, ICE_DBG_FW_LOG, "Failed to get FW log configuration\n"); + goto status_out; + } + + module_id_cnt = le16_to_cpu(cmd->ops.cfg.mdl_cnt); + if (module_id_cnt < ICE_AQC_FW_LOG_ID_MAX) { + ice_debug(hw, ICE_DBG_FW_LOG, "FW returned less than the expected number of FW log module IDs\n"); + } else { + if (module_id_cnt > ICE_AQC_FW_LOG_ID_MAX) + ice_debug(hw, ICE_DBG_FW_LOG, "FW returned more than expected number of FW log module IDs, setting module_id_cnt to software expected max %u\n", + ICE_AQC_FW_LOG_ID_MAX); + module_id_cnt = ICE_AQC_FW_LOG_ID_MAX; + } + + cfg->log_resolution = le16_to_cpu(cmd->ops.cfg.log_resolution); + if (cmd->cmd_flags & ICE_AQC_FW_LOG_CONF_AQ_EN) + cfg->options |= ICE_FWLOG_OPTION_ARQ_ENA; + if (cmd->cmd_flags & ICE_AQC_FW_LOG_CONF_UART_EN) + cfg->options |= ICE_FWLOG_OPTION_UART_ENA; + if (cmd->cmd_flags & ICE_AQC_FW_LOG_QUERY_REGISTERED) + cfg->options |= ICE_FWLOG_OPTION_IS_REGISTERED; + + fw_modules = (struct ice_aqc_fw_log_cfg_resp *)buf; + + for (i = 0; i < module_id_cnt; i++) { + struct ice_aqc_fw_log_cfg_resp *fw_module = &fw_modules[i]; + + cfg->module_entries[i].module_id = + le16_to_cpu(fw_module->module_identifier); + cfg->module_entries[i].log_level = fw_module->log_level; + } + +status_out: + devm_kfree(ice_hw_to_dev(hw), buf); + return status; +} + +/** + * ice_fwlog_set_support_ena - Set if FW logging is supported by FW + * @hw: pointer to the HW struct + * + * If FW returns success to the ice_aq_fwlog_get call then it supports FW + * logging, else it doesn't. Set the fwlog_support_ena flag accordingly. + * + * This function is only meant to be called during driver init to determine if + * the FW supports FW logging.
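Taken together, the comments above imply a post-ice_init_hw() ordering: detect support, then register and apply the cached configuration. The following is only a sketch of that ordering; where exactly these calls sit in the real probe path is not shown in this file.

/* Sketch of the ordering constraints stated in the comments above. */
static void example_fwlog_late_init(struct ice_hw *hw)
{
	/* queries firmware and sets hw->fwlog_support_ena accordingly */
	ice_fwlog_set_support_ena(hw);

	if (!ice_fwlog_supported(hw))
		return;

	if (hw->fwlog_cfg.options & ICE_FWLOG_OPTION_REGISTER_ON_INIT)
		ice_fwlog_register(hw);

	/* push the configuration cached by ice_fwlog_init() to firmware */
	ice_fwlog_set(hw, &hw->fwlog_cfg);
}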
+ */ +void ice_fwlog_set_support_ena(struct ice_hw *hw) +{ + struct ice_fwlog_cfg *cfg; + enum ice_status status; + + hw->fwlog_support_ena = false; + + cfg = devm_kcalloc(ice_hw_to_dev(hw), 1, sizeof(*cfg), GFP_KERNEL); + if (!cfg) + return; + + /* don't call ice_fwlog_get() because that would overwrite the cached + * configuration from the call to ice_fwlog_init(), which is expected to + * be called prior to this function + */ + status = ice_aq_fwlog_get(hw, cfg); + if (status) + ice_debug(hw, ICE_DBG_FW_LOG, "ice_fwlog_get failed, FW logging is not supported on this version of FW, status %d\n", + status); + else + hw->fwlog_support_ena = true; + + devm_kfree(ice_hw_to_dev(hw), cfg); +} + +/** + * ice_fwlog_get - Get the firmware logging settings + * @hw: pointer to the HW structure + * @cfg: config to populate based on current firmware logging settings + */ +enum ice_status +ice_fwlog_get(struct ice_hw *hw, struct ice_fwlog_cfg *cfg) +{ + enum ice_status status; + + if (!ice_fwlog_supported(hw)) + return ICE_ERR_NOT_SUPPORTED; + + if (!cfg) + return ICE_ERR_PARAM; + + status = ice_aq_fwlog_get(hw, cfg); + if (status) + return status; + + cache_cfg(hw, cfg); + + return 0; +} + +/** + * ice_fwlog_event_dump - Dump the event received over the Admin Receive Queue + * @hw: pointer to the HW structure + * @desc: Admin Receive Queue descriptor + * @buf: buffer that contains the FW log event data + * + * If the driver receives the ice_aqc_opc_fw_logs_event on the Admin Receive + * Queue, then it should call this function to dump the FW log data. + */ +void +ice_fwlog_event_dump(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf) +{ + if (!ice_fwlog_supported(hw)) + return; + + ice_info_fwlog(hw, 32, 1, buf, le16_to_cpu(desc->datalen)); +} diff --git a/drivers/net/ethernet/intel/ice/ice_fwlog.h b/drivers/net/ethernet/intel/ice/ice_fwlog.h new file mode 100644 index 000000000000..0914cb7c627b --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_fwlog.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_FWLOG_H_ +#define _ICE_FWLOG_H_ +#include "ice_adminq_cmd.h" + +struct ice_hw; + +/* Only a single log level should be set and all log levels under the set value + * are enabled, e.g. 
if log level is set to ICE_FW_LOG_LEVEL_VERBOSE, then all + * other log levels are included (except ICE_FW_LOG_LEVEL_NONE) + */ +enum ice_fwlog_level { + ICE_FWLOG_LEVEL_NONE = 0, + ICE_FWLOG_LEVEL_ERROR = 1, + ICE_FWLOG_LEVEL_WARNING = 2, + ICE_FWLOG_LEVEL_NORMAL = 3, + ICE_FWLOG_LEVEL_VERBOSE = 4, + ICE_FWLOG_LEVEL_INVALID, /* all values >= this entry are invalid */ +}; + +struct ice_fwlog_module_entry { + /* module ID for the corresponding firmware logging event */ + u16 module_id; + /* verbosity level for the module_id */ + u8 log_level; +}; + +struct ice_fwlog_cfg { + /* list of modules for configuring log level */ + struct ice_fwlog_module_entry module_entries[ICE_AQC_FW_LOG_ID_MAX]; +#define ICE_FWLOG_OPTION_ARQ_ENA BIT(0) +#define ICE_FWLOG_OPTION_UART_ENA BIT(1) + /* set before calling ice_fwlog_init() so the PF registers for firmware + * logging on initialization + */ +#define ICE_FWLOG_OPTION_REGISTER_ON_INIT BIT(2) + /* set in the ice_fwlog_get() response if the PF is registered for FW + * logging events over ARQ + */ +#define ICE_FWLOG_OPTION_IS_REGISTERED BIT(3) + /* options used to configure firmware logging */ + u16 options; + /* minimum number of log events sent per Admin Receive Queue event */ + u8 log_resolution; +}; + +void ice_fwlog_set_support_ena(struct ice_hw *hw); +bool ice_fwlog_supported(struct ice_hw *hw); +enum ice_status ice_fwlog_init(struct ice_hw *hw, struct ice_fwlog_cfg *cfg); +enum ice_status ice_fwlog_set(struct ice_hw *hw, struct ice_fwlog_cfg *cfg); +enum ice_status ice_fwlog_get(struct ice_hw *hw, struct ice_fwlog_cfg *cfg); +enum ice_status +ice_fwlog_update_modules(struct ice_hw *hw, + struct ice_fwlog_module_entry *entries, + u16 num_entries); +enum ice_status ice_fwlog_register(struct ice_hw *hw); +enum ice_status ice_fwlog_unregister(struct ice_hw *hw); +void +ice_fwlog_event_dump(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf); +#endif /* _ICE_FWLOG_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h index 9138b19de87e..a42c1af118a8 100644 --- a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h +++ b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h @@ -1,63 +1,3006 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ - -/* Machine-generated file */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ +/* Machine-generated file; do not edit */ #ifndef _ICE_HW_AUTOGEN_H_ #define _ICE_HW_AUTOGEN_H_ -#define PF0INT_ITR_0(_i) (0x03000004 + ((_i) * 4096)) -#define PF0INT_ITR_1(_i) (0x03000008 + ((_i) * 4096)) -#define PF0INT_ITR_2(_i) (0x0300000C + ((_i) * 4096)) -#define QTX_COMM_DBELL(_DBQM) (0x002C0000 + ((_DBQM) * 4)) -#define QTX_COMM_HEAD(_DBQM) (0x000E0000 + ((_DBQM) * 4)) + + +#define GL_RDPU_CNTRL 0x00052054 /* Reset Source: CORER */ +#define GL_RDPU_CNTRL_RX_PAD_EN_S 0 +#define GL_RDPU_CNTRL_RX_PAD_EN_M BIT(0) +#define GL_RDPU_CNTRL_UDP_ZERO_EN_S 1 +#define GL_RDPU_CNTRL_UDP_ZERO_EN_M BIT(1) +#define GL_RDPU_CNTRL_BLNC_EN_S 2 +#define GL_RDPU_CNTRL_BLNC_EN_M BIT(2) +#define GL_RDPU_CNTRL_RECIPE_BYPASS_S 3 +#define GL_RDPU_CNTRL_RECIPE_BYPASS_M BIT(3) +#define GL_RDPU_CNTRL_RLAN_ACK_REQ_PM_TH_S 4 +#define GL_RDPU_CNTRL_RLAN_ACK_REQ_PM_TH_M ICE_M(0x3F, 4) +#define GL_RDPU_CNTRL_PE_ACK_REQ_PM_TH_S 10 +#define GL_RDPU_CNTRL_PE_ACK_REQ_PM_TH_M ICE_M(0x3F, 10) +#define GL_RDPU_CNTRL_REQ_WB_PM_TH_S 16 +#define GL_RDPU_CNTRL_REQ_WB_PM_TH_M ICE_M(0x1F, 16) +#define GL_RDPU_CNTRL_ECO_S 21 +#define GL_RDPU_CNTRL_ECO_M ICE_M(0x7FF, 21) +#define MSIX_PBA(_i) (0x00008000 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: FLR */ +#define MSIX_PBA_MAX_INDEX 2 +#define MSIX_PBA_PENBIT_S 0 +#define MSIX_PBA_PENBIT_M ICE_M(0xFFFFFFFF, 0) +#define MSIX_TADD(_i) (0x00000000 + ((_i) * 16)) /* _i=0...64 */ /* Reset Source: FLR */ +#define MSIX_TADD_MAX_INDEX 64 +#define MSIX_TADD_MSIXTADD10_S 0 +#define MSIX_TADD_MSIXTADD10_M ICE_M(0x3, 0) +#define MSIX_TADD_MSIXTADD_S 2 +#define MSIX_TADD_MSIXTADD_M ICE_M(0x3FFFFFFF, 2) +#define MSIX_TUADD(_i) (0x00000004 + ((_i) * 16)) /* _i=0...64 */ /* Reset Source: FLR */ +#define MSIX_TUADD_MAX_INDEX 64 +#define MSIX_TUADD_MSIXTUADD_S 0 +#define MSIX_TUADD_MSIXTUADD_M ICE_M(0xFFFFFFFF, 0) +#define MSIX_TVCTRL(_i) (0x0000000C + ((_i) * 16)) /* _i=0...64 */ /* Reset Source: FLR */ +#define MSIX_TVCTRL_MAX_INDEX 64 +#define MSIX_TVCTRL_MASK_S 0 +#define MSIX_TVCTRL_MASK_M BIT(0) +#define PF0_FW_HLP_ARQBAH_PAGE 0x02D00180 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ARQBAH_PAGE_ARQBAH_S 0 +#define PF0_FW_HLP_ARQBAH_PAGE_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_FW_HLP_ARQBAL_PAGE 0x02D00080 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ARQBAL_PAGE_ARQBAL_LSB_S 0 +#define PF0_FW_HLP_ARQBAL_PAGE_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_FW_HLP_ARQBAL_PAGE_ARQBAL_S 6 +#define PF0_FW_HLP_ARQBAL_PAGE_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_FW_HLP_ARQH_PAGE 0x02D00380 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ARQH_PAGE_ARQH_S 0 +#define PF0_FW_HLP_ARQH_PAGE_ARQH_M ICE_M(0x3FF, 0) +#define PF0_FW_HLP_ARQLEN_PAGE 0x02D00280 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ARQLEN_PAGE_ARQLEN_S 0 +#define PF0_FW_HLP_ARQLEN_PAGE_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_FW_HLP_ARQLEN_PAGE_ARQVFE_S 28 +#define PF0_FW_HLP_ARQLEN_PAGE_ARQVFE_M BIT(28) +#define PF0_FW_HLP_ARQLEN_PAGE_ARQOVFL_S 29 +#define PF0_FW_HLP_ARQLEN_PAGE_ARQOVFL_M BIT(29) +#define PF0_FW_HLP_ARQLEN_PAGE_ARQCRIT_S 30 +#define PF0_FW_HLP_ARQLEN_PAGE_ARQCRIT_M BIT(30) +#define PF0_FW_HLP_ARQLEN_PAGE_ARQENABLE_S 31 +#define PF0_FW_HLP_ARQLEN_PAGE_ARQENABLE_M BIT(31) +#define PF0_FW_HLP_ARQT_PAGE 0x02D00480 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ARQT_PAGE_ARQT_S 0 +#define PF0_FW_HLP_ARQT_PAGE_ARQT_M ICE_M(0x3FF, 0) +#define PF0_FW_HLP_ATQBAH_PAGE 0x02D00100 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ATQBAH_PAGE_ATQBAH_S 0 +#define PF0_FW_HLP_ATQBAH_PAGE_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define 
PF0_FW_HLP_ATQBAL_PAGE 0x02D00000 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ATQBAL_PAGE_ATQBAL_LSB_S 0 +#define PF0_FW_HLP_ATQBAL_PAGE_ATQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_FW_HLP_ATQBAL_PAGE_ATQBAL_S 6 +#define PF0_FW_HLP_ATQBAL_PAGE_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_FW_HLP_ATQH_PAGE 0x02D00300 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ATQH_PAGE_ATQH_S 0 +#define PF0_FW_HLP_ATQH_PAGE_ATQH_M ICE_M(0x3FF, 0) +#define PF0_FW_HLP_ATQLEN_PAGE 0x02D00200 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ATQLEN_PAGE_ATQLEN_S 0 +#define PF0_FW_HLP_ATQLEN_PAGE_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_FW_HLP_ATQLEN_PAGE_ATQVFE_S 28 +#define PF0_FW_HLP_ATQLEN_PAGE_ATQVFE_M BIT(28) +#define PF0_FW_HLP_ATQLEN_PAGE_ATQOVFL_S 29 +#define PF0_FW_HLP_ATQLEN_PAGE_ATQOVFL_M BIT(29) +#define PF0_FW_HLP_ATQLEN_PAGE_ATQCRIT_S 30 +#define PF0_FW_HLP_ATQLEN_PAGE_ATQCRIT_M BIT(30) +#define PF0_FW_HLP_ATQLEN_PAGE_ATQENABLE_S 31 +#define PF0_FW_HLP_ATQLEN_PAGE_ATQENABLE_M BIT(31) +#define PF0_FW_HLP_ATQT_PAGE 0x02D00400 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ATQT_PAGE_ATQT_S 0 +#define PF0_FW_HLP_ATQT_PAGE_ATQT_M ICE_M(0x3FF, 0) +#define PF0_FW_PSM_ARQBAH_PAGE 0x02D40180 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ARQBAH_PAGE_ARQBAH_S 0 +#define PF0_FW_PSM_ARQBAH_PAGE_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_FW_PSM_ARQBAL_PAGE 0x02D40080 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ARQBAL_PAGE_ARQBAL_LSB_S 0 +#define PF0_FW_PSM_ARQBAL_PAGE_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_FW_PSM_ARQBAL_PAGE_ARQBAL_S 6 +#define PF0_FW_PSM_ARQBAL_PAGE_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_FW_PSM_ARQH_PAGE 0x02D40380 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ARQH_PAGE_ARQH_S 0 +#define PF0_FW_PSM_ARQH_PAGE_ARQH_M ICE_M(0x3FF, 0) +#define PF0_FW_PSM_ARQLEN_PAGE 0x02D40280 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ARQLEN_PAGE_ARQLEN_S 0 +#define PF0_FW_PSM_ARQLEN_PAGE_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_FW_PSM_ARQLEN_PAGE_ARQVFE_S 28 +#define PF0_FW_PSM_ARQLEN_PAGE_ARQVFE_M BIT(28) +#define PF0_FW_PSM_ARQLEN_PAGE_ARQOVFL_S 29 +#define PF0_FW_PSM_ARQLEN_PAGE_ARQOVFL_M BIT(29) +#define PF0_FW_PSM_ARQLEN_PAGE_ARQCRIT_S 30 +#define PF0_FW_PSM_ARQLEN_PAGE_ARQCRIT_M BIT(30) +#define PF0_FW_PSM_ARQLEN_PAGE_ARQENABLE_S 31 +#define PF0_FW_PSM_ARQLEN_PAGE_ARQENABLE_M BIT(31) +#define PF0_FW_PSM_ARQT_PAGE 0x02D40480 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ARQT_PAGE_ARQT_S 0 +#define PF0_FW_PSM_ARQT_PAGE_ARQT_M ICE_M(0x3FF, 0) +#define PF0_FW_PSM_ATQBAH_PAGE 0x02D40100 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ATQBAH_PAGE_ATQBAH_S 0 +#define PF0_FW_PSM_ATQBAH_PAGE_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_FW_PSM_ATQBAL_PAGE 0x02D40000 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ATQBAL_PAGE_ATQBAL_LSB_S 0 +#define PF0_FW_PSM_ATQBAL_PAGE_ATQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_FW_PSM_ATQBAL_PAGE_ATQBAL_S 6 +#define PF0_FW_PSM_ATQBAL_PAGE_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_FW_PSM_ATQH_PAGE 0x02D40300 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ATQH_PAGE_ATQH_S 0 +#define PF0_FW_PSM_ATQH_PAGE_ATQH_M ICE_M(0x3FF, 0) +#define PF0_FW_PSM_ATQLEN_PAGE 0x02D40200 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ATQLEN_PAGE_ATQLEN_S 0 +#define PF0_FW_PSM_ATQLEN_PAGE_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_FW_PSM_ATQLEN_PAGE_ATQVFE_S 28 +#define PF0_FW_PSM_ATQLEN_PAGE_ATQVFE_M BIT(28) +#define PF0_FW_PSM_ATQLEN_PAGE_ATQOVFL_S 29 +#define PF0_FW_PSM_ATQLEN_PAGE_ATQOVFL_M BIT(29) +#define PF0_FW_PSM_ATQLEN_PAGE_ATQCRIT_S 30 +#define PF0_FW_PSM_ATQLEN_PAGE_ATQCRIT_M BIT(30) +#define 
PF0_FW_PSM_ATQLEN_PAGE_ATQENABLE_S 31 +#define PF0_FW_PSM_ATQLEN_PAGE_ATQENABLE_M BIT(31) +#define PF0_FW_PSM_ATQT_PAGE 0x02D40400 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ATQT_PAGE_ATQT_S 0 +#define PF0_FW_PSM_ATQT_PAGE_ATQT_M ICE_M(0x3FF, 0) +#define PF0_MBX_CPM_ARQBAH_PAGE 0x02D80190 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ARQBAH_PAGE_ARQBAH_S 0 +#define PF0_MBX_CPM_ARQBAH_PAGE_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_MBX_CPM_ARQBAL_PAGE 0x02D80090 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ARQBAL_PAGE_ARQBAL_LSB_S 0 +#define PF0_MBX_CPM_ARQBAL_PAGE_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_MBX_CPM_ARQBAL_PAGE_ARQBAL_S 6 +#define PF0_MBX_CPM_ARQBAL_PAGE_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_MBX_CPM_ARQH_PAGE 0x02D80390 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ARQH_PAGE_ARQH_S 0 +#define PF0_MBX_CPM_ARQH_PAGE_ARQH_M ICE_M(0x3FF, 0) +#define PF0_MBX_CPM_ARQLEN_PAGE 0x02D80290 /* Reset Source: PFR */ +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQLEN_S 0 +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQVFE_S 28 +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQVFE_M BIT(28) +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQOVFL_S 29 +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQOVFL_M BIT(29) +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQCRIT_S 30 +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQCRIT_M BIT(30) +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQENABLE_S 31 +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQENABLE_M BIT(31) +#define PF0_MBX_CPM_ARQT_PAGE 0x02D80490 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ARQT_PAGE_ARQT_S 0 +#define PF0_MBX_CPM_ARQT_PAGE_ARQT_M ICE_M(0x3FF, 0) +#define PF0_MBX_CPM_ATQBAH_PAGE 0x02D80110 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ATQBAH_PAGE_ATQBAH_S 0 +#define PF0_MBX_CPM_ATQBAH_PAGE_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_MBX_CPM_ATQBAL_PAGE 0x02D80010 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ATQBAL_PAGE_ATQBAL_S 6 +#define PF0_MBX_CPM_ATQBAL_PAGE_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_MBX_CPM_ATQH_PAGE 0x02D80310 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ATQH_PAGE_ATQH_S 0 +#define PF0_MBX_CPM_ATQH_PAGE_ATQH_M ICE_M(0x3FF, 0) +#define PF0_MBX_CPM_ATQLEN_PAGE 0x02D80210 /* Reset Source: PFR */ +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQLEN_S 0 +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQVFE_S 28 +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQVFE_M BIT(28) +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQOVFL_S 29 +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQOVFL_M BIT(29) +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQCRIT_S 30 +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQCRIT_M BIT(30) +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQENABLE_S 31 +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQENABLE_M BIT(31) +#define PF0_MBX_CPM_ATQT_PAGE 0x02D80410 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ATQT_PAGE_ATQT_S 0 +#define PF0_MBX_CPM_ATQT_PAGE_ATQT_M ICE_M(0x3FF, 0) +#define PF0_MBX_HLP_ARQBAH_PAGE 0x02D00190 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ARQBAH_PAGE_ARQBAH_S 0 +#define PF0_MBX_HLP_ARQBAH_PAGE_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_MBX_HLP_ARQBAL_PAGE 0x02D00090 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ARQBAL_PAGE_ARQBAL_LSB_S 0 +#define PF0_MBX_HLP_ARQBAL_PAGE_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_MBX_HLP_ARQBAL_PAGE_ARQBAL_S 6 +#define PF0_MBX_HLP_ARQBAL_PAGE_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_MBX_HLP_ARQH_PAGE 0x02D00390 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ARQH_PAGE_ARQH_S 0 +#define PF0_MBX_HLP_ARQH_PAGE_ARQH_M ICE_M(0x3FF, 0) +#define PF0_MBX_HLP_ARQLEN_PAGE 0x02D00290 /* Reset Source: PFR */ +#define 
PF0_MBX_HLP_ARQLEN_PAGE_ARQLEN_S 0 +#define PF0_MBX_HLP_ARQLEN_PAGE_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_MBX_HLP_ARQLEN_PAGE_ARQVFE_S 28 +#define PF0_MBX_HLP_ARQLEN_PAGE_ARQVFE_M BIT(28) +#define PF0_MBX_HLP_ARQLEN_PAGE_ARQOVFL_S 29 +#define PF0_MBX_HLP_ARQLEN_PAGE_ARQOVFL_M BIT(29) +#define PF0_MBX_HLP_ARQLEN_PAGE_ARQCRIT_S 30 +#define PF0_MBX_HLP_ARQLEN_PAGE_ARQCRIT_M BIT(30) +#define PF0_MBX_HLP_ARQLEN_PAGE_ARQENABLE_S 31 +#define PF0_MBX_HLP_ARQLEN_PAGE_ARQENABLE_M BIT(31) +#define PF0_MBX_HLP_ARQT_PAGE 0x02D00490 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ARQT_PAGE_ARQT_S 0 +#define PF0_MBX_HLP_ARQT_PAGE_ARQT_M ICE_M(0x3FF, 0) +#define PF0_MBX_HLP_ATQBAH_PAGE 0x02D00110 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ATQBAH_PAGE_ATQBAH_S 0 +#define PF0_MBX_HLP_ATQBAH_PAGE_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_MBX_HLP_ATQBAL_PAGE 0x02D00010 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ATQBAL_PAGE_ATQBAL_S 6 +#define PF0_MBX_HLP_ATQBAL_PAGE_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_MBX_HLP_ATQH_PAGE 0x02D00310 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ATQH_PAGE_ATQH_S 0 +#define PF0_MBX_HLP_ATQH_PAGE_ATQH_M ICE_M(0x3FF, 0) +#define PF0_MBX_HLP_ATQLEN_PAGE 0x02D00210 /* Reset Source: PFR */ +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQLEN_S 0 +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQVFE_S 28 +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQVFE_M BIT(28) +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQOVFL_S 29 +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQOVFL_M BIT(29) +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQCRIT_S 30 +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQCRIT_M BIT(30) +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQENABLE_S 31 +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQENABLE_M BIT(31) +#define PF0_MBX_HLP_ATQT_PAGE 0x02D00410 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ATQT_PAGE_ATQT_S 0 +#define PF0_MBX_HLP_ATQT_PAGE_ATQT_M ICE_M(0x3FF, 0) +#define PF0_MBX_PSM_ARQBAH_PAGE 0x02D40190 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ARQBAH_PAGE_ARQBAH_S 0 +#define PF0_MBX_PSM_ARQBAH_PAGE_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_MBX_PSM_ARQBAL_PAGE 0x02D40090 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ARQBAL_PAGE_ARQBAL_LSB_S 0 +#define PF0_MBX_PSM_ARQBAL_PAGE_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_MBX_PSM_ARQBAL_PAGE_ARQBAL_S 6 +#define PF0_MBX_PSM_ARQBAL_PAGE_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_MBX_PSM_ARQH_PAGE 0x02D40390 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ARQH_PAGE_ARQH_S 0 +#define PF0_MBX_PSM_ARQH_PAGE_ARQH_M ICE_M(0x3FF, 0) +#define PF0_MBX_PSM_ARQLEN_PAGE 0x02D40290 /* Reset Source: PFR */ +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQLEN_S 0 +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQVFE_S 28 +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQVFE_M BIT(28) +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQOVFL_S 29 +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQOVFL_M BIT(29) +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQCRIT_S 30 +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQCRIT_M BIT(30) +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQENABLE_S 31 +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQENABLE_M BIT(31) +#define PF0_MBX_PSM_ARQT_PAGE 0x02D40490 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ARQT_PAGE_ARQT_S 0 +#define PF0_MBX_PSM_ARQT_PAGE_ARQT_M ICE_M(0x3FF, 0) +#define PF0_MBX_PSM_ATQBAH_PAGE 0x02D40110 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ATQBAH_PAGE_ATQBAH_S 0 +#define PF0_MBX_PSM_ATQBAH_PAGE_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_MBX_PSM_ATQBAL_PAGE 0x02D40010 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ATQBAL_PAGE_ATQBAL_S 6 +#define 
PF0_MBX_PSM_ATQBAL_PAGE_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_MBX_PSM_ATQH_PAGE 0x02D40310 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ATQH_PAGE_ATQH_S 0 +#define PF0_MBX_PSM_ATQH_PAGE_ATQH_M ICE_M(0x3FF, 0) +#define PF0_MBX_PSM_ATQLEN_PAGE 0x02D40210 /* Reset Source: PFR */ +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQLEN_S 0 +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQVFE_S 28 +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQVFE_M BIT(28) +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQOVFL_S 29 +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQOVFL_M BIT(29) +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQCRIT_S 30 +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQCRIT_M BIT(30) +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQENABLE_S 31 +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQENABLE_M BIT(31) +#define PF0_MBX_PSM_ATQT_PAGE 0x02D40410 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ATQT_PAGE_ATQT_S 0 +#define PF0_MBX_PSM_ATQT_PAGE_ATQT_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_ARQBAH_PAGE 0x02D801A0 /* Reset Source: CORER */ +#define PF0_SB_CPM_ARQBAH_PAGE_ARQBAH_S 0 +#define PF0_SB_CPM_ARQBAH_PAGE_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_SB_CPM_ARQBAL_PAGE 0x02D800A0 /* Reset Source: CORER */ +#define PF0_SB_CPM_ARQBAL_PAGE_ARQBAL_LSB_S 0 +#define PF0_SB_CPM_ARQBAL_PAGE_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_SB_CPM_ARQBAL_PAGE_ARQBAL_S 6 +#define PF0_SB_CPM_ARQBAL_PAGE_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_SB_CPM_ARQH_PAGE 0x02D803A0 /* Reset Source: CORER */ +#define PF0_SB_CPM_ARQH_PAGE_ARQH_S 0 +#define PF0_SB_CPM_ARQH_PAGE_ARQH_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_ARQLEN_PAGE 0x02D802A0 /* Reset Source: PFR */ +#define PF0_SB_CPM_ARQLEN_PAGE_ARQLEN_S 0 +#define PF0_SB_CPM_ARQLEN_PAGE_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_ARQLEN_PAGE_ARQVFE_S 28 +#define PF0_SB_CPM_ARQLEN_PAGE_ARQVFE_M BIT(28) +#define PF0_SB_CPM_ARQLEN_PAGE_ARQOVFL_S 29 +#define PF0_SB_CPM_ARQLEN_PAGE_ARQOVFL_M BIT(29) +#define PF0_SB_CPM_ARQLEN_PAGE_ARQCRIT_S 30 +#define PF0_SB_CPM_ARQLEN_PAGE_ARQCRIT_M BIT(30) +#define PF0_SB_CPM_ARQLEN_PAGE_ARQENABLE_S 31 +#define PF0_SB_CPM_ARQLEN_PAGE_ARQENABLE_M BIT(31) +#define PF0_SB_CPM_ARQT_PAGE 0x02D804A0 /* Reset Source: CORER */ +#define PF0_SB_CPM_ARQT_PAGE_ARQT_S 0 +#define PF0_SB_CPM_ARQT_PAGE_ARQT_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_ATQBAH_PAGE 0x02D80120 /* Reset Source: CORER */ +#define PF0_SB_CPM_ATQBAH_PAGE_ATQBAH_S 0 +#define PF0_SB_CPM_ATQBAH_PAGE_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_SB_CPM_ATQBAL_PAGE 0x02D80020 /* Reset Source: CORER */ +#define PF0_SB_CPM_ATQBAL_PAGE_ATQBAL_S 6 +#define PF0_SB_CPM_ATQBAL_PAGE_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_SB_CPM_ATQH_PAGE 0x02D80320 /* Reset Source: CORER */ +#define PF0_SB_CPM_ATQH_PAGE_ATQH_S 0 +#define PF0_SB_CPM_ATQH_PAGE_ATQH_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_ATQLEN_PAGE 0x02D80220 /* Reset Source: PFR */ +#define PF0_SB_CPM_ATQLEN_PAGE_ATQLEN_S 0 +#define PF0_SB_CPM_ATQLEN_PAGE_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_ATQLEN_PAGE_ATQVFE_S 28 +#define PF0_SB_CPM_ATQLEN_PAGE_ATQVFE_M BIT(28) +#define PF0_SB_CPM_ATQLEN_PAGE_ATQOVFL_S 29 +#define PF0_SB_CPM_ATQLEN_PAGE_ATQOVFL_M BIT(29) +#define PF0_SB_CPM_ATQLEN_PAGE_ATQCRIT_S 30 +#define PF0_SB_CPM_ATQLEN_PAGE_ATQCRIT_M BIT(30) +#define PF0_SB_CPM_ATQLEN_PAGE_ATQENABLE_S 31 +#define PF0_SB_CPM_ATQLEN_PAGE_ATQENABLE_M BIT(31) +#define PF0_SB_CPM_ATQT_PAGE 0x02D80420 /* Reset Source: CORER */ +#define PF0_SB_CPM_ATQT_PAGE_ATQT_S 0 +#define PF0_SB_CPM_ATQT_PAGE_ATQT_M ICE_M(0x3FF, 0) +#define PF0_SB_HLP_ARQBAH_PAGE 0x02D001A0 /* Reset Source: CORER */ +#define 
PF0_SB_HLP_ARQBAH_PAGE_ARQBAH_S 0 +#define PF0_SB_HLP_ARQBAH_PAGE_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_SB_HLP_ARQBAL_PAGE 0x02D000A0 /* Reset Source: CORER */ +#define PF0_SB_HLP_ARQBAL_PAGE_ARQBAL_LSB_S 0 +#define PF0_SB_HLP_ARQBAL_PAGE_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_SB_HLP_ARQBAL_PAGE_ARQBAL_S 6 +#define PF0_SB_HLP_ARQBAL_PAGE_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_SB_HLP_ARQH_PAGE 0x02D003A0 /* Reset Source: CORER */ +#define PF0_SB_HLP_ARQH_PAGE_ARQH_S 0 +#define PF0_SB_HLP_ARQH_PAGE_ARQH_M ICE_M(0x3FF, 0) +#define PF0_SB_HLP_ARQLEN_PAGE 0x02D002A0 /* Reset Source: PFR */ +#define PF0_SB_HLP_ARQLEN_PAGE_ARQLEN_S 0 +#define PF0_SB_HLP_ARQLEN_PAGE_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_SB_HLP_ARQLEN_PAGE_ARQVFE_S 28 +#define PF0_SB_HLP_ARQLEN_PAGE_ARQVFE_M BIT(28) +#define PF0_SB_HLP_ARQLEN_PAGE_ARQOVFL_S 29 +#define PF0_SB_HLP_ARQLEN_PAGE_ARQOVFL_M BIT(29) +#define PF0_SB_HLP_ARQLEN_PAGE_ARQCRIT_S 30 +#define PF0_SB_HLP_ARQLEN_PAGE_ARQCRIT_M BIT(30) +#define PF0_SB_HLP_ARQLEN_PAGE_ARQENABLE_S 31 +#define PF0_SB_HLP_ARQLEN_PAGE_ARQENABLE_M BIT(31) +#define PF0_SB_HLP_ARQT_PAGE 0x02D004A0 /* Reset Source: CORER */ +#define PF0_SB_HLP_ARQT_PAGE_ARQT_S 0 +#define PF0_SB_HLP_ARQT_PAGE_ARQT_M ICE_M(0x3FF, 0) +#define PF0_SB_HLP_ATQBAH_PAGE 0x02D00120 /* Reset Source: CORER */ +#define PF0_SB_HLP_ATQBAH_PAGE_ATQBAH_S 0 +#define PF0_SB_HLP_ATQBAH_PAGE_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_SB_HLP_ATQBAL_PAGE 0x02D00020 /* Reset Source: CORER */ +#define PF0_SB_HLP_ATQBAL_PAGE_ATQBAL_S 6 +#define PF0_SB_HLP_ATQBAL_PAGE_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_SB_HLP_ATQH_PAGE 0x02D00320 /* Reset Source: CORER */ +#define PF0_SB_HLP_ATQH_PAGE_ATQH_S 0 +#define PF0_SB_HLP_ATQH_PAGE_ATQH_M ICE_M(0x3FF, 0) +#define PF0_SB_HLP_ATQLEN_PAGE 0x02D00220 /* Reset Source: PFR */ +#define PF0_SB_HLP_ATQLEN_PAGE_ATQLEN_S 0 +#define PF0_SB_HLP_ATQLEN_PAGE_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_SB_HLP_ATQLEN_PAGE_ATQVFE_S 28 +#define PF0_SB_HLP_ATQLEN_PAGE_ATQVFE_M BIT(28) +#define PF0_SB_HLP_ATQLEN_PAGE_ATQOVFL_S 29 +#define PF0_SB_HLP_ATQLEN_PAGE_ATQOVFL_M BIT(29) +#define PF0_SB_HLP_ATQLEN_PAGE_ATQCRIT_S 30 +#define PF0_SB_HLP_ATQLEN_PAGE_ATQCRIT_M BIT(30) +#define PF0_SB_HLP_ATQLEN_PAGE_ATQENABLE_S 31 +#define PF0_SB_HLP_ATQLEN_PAGE_ATQENABLE_M BIT(31) +#define PF0_SB_HLP_ATQT_PAGE 0x02D00420 /* Reset Source: CORER */ +#define PF0_SB_HLP_ATQT_PAGE_ATQT_S 0 +#define PF0_SB_HLP_ATQT_PAGE_ATQT_M ICE_M(0x3FF, 0) +#define PF0INT_DYN_CTL(_i) (0x03000000 + ((_i) * 4096)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define PF0INT_DYN_CTL_MAX_INDEX 2047 +#define PF0INT_DYN_CTL_INTENA_S 0 +#define PF0INT_DYN_CTL_INTENA_M BIT(0) +#define PF0INT_DYN_CTL_CLEARPBA_S 1 +#define PF0INT_DYN_CTL_CLEARPBA_M BIT(1) +#define PF0INT_DYN_CTL_SWINT_TRIG_S 2 +#define PF0INT_DYN_CTL_SWINT_TRIG_M BIT(2) +#define PF0INT_DYN_CTL_ITR_INDX_S 3 +#define PF0INT_DYN_CTL_ITR_INDX_M ICE_M(0x3, 3) +#define PF0INT_DYN_CTL_INTERVAL_S 5 +#define PF0INT_DYN_CTL_INTERVAL_M ICE_M(0xFFF, 5) +#define PF0INT_DYN_CTL_SW_ITR_INDX_ENA_S 24 +#define PF0INT_DYN_CTL_SW_ITR_INDX_ENA_M BIT(24) +#define PF0INT_DYN_CTL_SW_ITR_INDX_S 25 +#define PF0INT_DYN_CTL_SW_ITR_INDX_M ICE_M(0x3, 25) +#define PF0INT_DYN_CTL_WB_ON_ITR_S 30 +#define PF0INT_DYN_CTL_WB_ON_ITR_M BIT(30) +#define PF0INT_DYN_CTL_INTENA_MSK_S 31 +#define PF0INT_DYN_CTL_INTENA_MSK_M BIT(31) +#define PF0INT_ITR_0(_i) (0x03000004 + ((_i) * 4096)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define PF0INT_ITR_0_MAX_INDEX 2047 +#define PF0INT_ITR_0_INTERVAL_S 0 +#define 
PF0INT_ITR_0_INTERVAL_M ICE_M(0xFFF, 0) +#define PF0INT_ITR_1(_i) (0x03000008 + ((_i) * 4096)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define PF0INT_ITR_1_MAX_INDEX 2047 +#define PF0INT_ITR_1_INTERVAL_S 0 +#define PF0INT_ITR_1_INTERVAL_M ICE_M(0xFFF, 0) +#define PF0INT_ITR_2(_i) (0x0300000C + ((_i) * 4096)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define PF0INT_ITR_2_MAX_INDEX 2047 +#define PF0INT_ITR_2_INTERVAL_S 0 +#define PF0INT_ITR_2_INTERVAL_M ICE_M(0xFFF, 0) +#define PF0INT_OICR_CPM_PAGE 0x02D03000 /* Reset Source: CORER */ +#define PF0INT_OICR_CPM_PAGE_INTEVENT_S 0 +#define PF0INT_OICR_CPM_PAGE_INTEVENT_M BIT(0) +#define PF0INT_OICR_CPM_PAGE_QUEUE_S 1 +#define PF0INT_OICR_CPM_PAGE_QUEUE_M BIT(1) +#define PF0INT_OICR_CPM_PAGE_RSV1_S 2 +#define PF0INT_OICR_CPM_PAGE_RSV1_M ICE_M(0xFF, 2) +#define PF0INT_OICR_CPM_PAGE_HH_COMP_S 10 +#define PF0INT_OICR_CPM_PAGE_HH_COMP_M BIT(10) +#define PF0INT_OICR_CPM_PAGE_TSYN_TX_S 11 +#define PF0INT_OICR_CPM_PAGE_TSYN_TX_M BIT(11) +#define PF0INT_OICR_CPM_PAGE_TSYN_EVNT_S 12 +#define PF0INT_OICR_CPM_PAGE_TSYN_EVNT_M BIT(12) +#define PF0INT_OICR_CPM_PAGE_TSYN_TGT_S 13 +#define PF0INT_OICR_CPM_PAGE_TSYN_TGT_M BIT(13) +#define PF0INT_OICR_CPM_PAGE_HLP_RDY_S 14 +#define PF0INT_OICR_CPM_PAGE_HLP_RDY_M BIT(14) +#define PF0INT_OICR_CPM_PAGE_CPM_RDY_S 15 +#define PF0INT_OICR_CPM_PAGE_CPM_RDY_M BIT(15) +#define PF0INT_OICR_CPM_PAGE_ECC_ERR_S 16 +#define PF0INT_OICR_CPM_PAGE_ECC_ERR_M BIT(16) +#define PF0INT_OICR_CPM_PAGE_RSV2_S 17 +#define PF0INT_OICR_CPM_PAGE_RSV2_M ICE_M(0x3, 17) +#define PF0INT_OICR_CPM_PAGE_MAL_DETECT_S 19 +#define PF0INT_OICR_CPM_PAGE_MAL_DETECT_M BIT(19) +#define PF0INT_OICR_CPM_PAGE_GRST_S 20 +#define PF0INT_OICR_CPM_PAGE_GRST_M BIT(20) +#define PF0INT_OICR_CPM_PAGE_PCI_EXCEPTION_S 21 +#define PF0INT_OICR_CPM_PAGE_PCI_EXCEPTION_M BIT(21) +#define PF0INT_OICR_CPM_PAGE_GPIO_S 22 +#define PF0INT_OICR_CPM_PAGE_GPIO_M BIT(22) +#define PF0INT_OICR_CPM_PAGE_RSV3_S 23 +#define PF0INT_OICR_CPM_PAGE_RSV3_M BIT(23) +#define PF0INT_OICR_CPM_PAGE_STORM_DETECT_S 24 +#define PF0INT_OICR_CPM_PAGE_STORM_DETECT_M BIT(24) +#define PF0INT_OICR_CPM_PAGE_LINK_STAT_CHANGE_S 25 +#define PF0INT_OICR_CPM_PAGE_LINK_STAT_CHANGE_M BIT(25) +#define PF0INT_OICR_CPM_PAGE_HMC_ERR_S 26 +#define PF0INT_OICR_CPM_PAGE_HMC_ERR_M BIT(26) +#define PF0INT_OICR_CPM_PAGE_PE_PUSH_S 27 +#define PF0INT_OICR_CPM_PAGE_PE_PUSH_M BIT(27) +#define PF0INT_OICR_CPM_PAGE_PE_CRITERR_S 28 +#define PF0INT_OICR_CPM_PAGE_PE_CRITERR_M BIT(28) +#define PF0INT_OICR_CPM_PAGE_VFLR_S 29 +#define PF0INT_OICR_CPM_PAGE_VFLR_M BIT(29) +#define PF0INT_OICR_CPM_PAGE_XLR_HW_DONE_S 30 +#define PF0INT_OICR_CPM_PAGE_XLR_HW_DONE_M BIT(30) +#define PF0INT_OICR_CPM_PAGE_SWINT_S 31 +#define PF0INT_OICR_CPM_PAGE_SWINT_M BIT(31) +#define PF0INT_OICR_ENA_CPM_PAGE 0x02D03100 /* Reset Source: CORER */ +#define PF0INT_OICR_ENA_CPM_PAGE_RSV0_S 0 +#define PF0INT_OICR_ENA_CPM_PAGE_RSV0_M BIT(0) +#define PF0INT_OICR_ENA_CPM_PAGE_INT_ENA_S 1 +#define PF0INT_OICR_ENA_CPM_PAGE_INT_ENA_M ICE_M(0x7FFFFFFF, 1) +#define PF0INT_OICR_ENA_HLP_PAGE 0x02D01100 /* Reset Source: CORER */ +#define PF0INT_OICR_ENA_HLP_PAGE_RSV0_S 0 +#define PF0INT_OICR_ENA_HLP_PAGE_RSV0_M BIT(0) +#define PF0INT_OICR_ENA_HLP_PAGE_INT_ENA_S 1 +#define PF0INT_OICR_ENA_HLP_PAGE_INT_ENA_M ICE_M(0x7FFFFFFF, 1) +#define PF0INT_OICR_ENA_PSM_PAGE 0x02D02100 /* Reset Source: CORER */ +#define PF0INT_OICR_ENA_PSM_PAGE_RSV0_S 0 +#define PF0INT_OICR_ENA_PSM_PAGE_RSV0_M BIT(0) +#define PF0INT_OICR_ENA_PSM_PAGE_INT_ENA_S 1 +#define 
PF0INT_OICR_ENA_PSM_PAGE_INT_ENA_M ICE_M(0x7FFFFFFF, 1) +#define PF0INT_OICR_HLP_PAGE 0x02D01000 /* Reset Source: CORER */ +#define PF0INT_OICR_HLP_PAGE_INTEVENT_S 0 +#define PF0INT_OICR_HLP_PAGE_INTEVENT_M BIT(0) +#define PF0INT_OICR_HLP_PAGE_QUEUE_S 1 +#define PF0INT_OICR_HLP_PAGE_QUEUE_M BIT(1) +#define PF0INT_OICR_HLP_PAGE_RSV1_S 2 +#define PF0INT_OICR_HLP_PAGE_RSV1_M ICE_M(0xFF, 2) +#define PF0INT_OICR_HLP_PAGE_HH_COMP_S 10 +#define PF0INT_OICR_HLP_PAGE_HH_COMP_M BIT(10) +#define PF0INT_OICR_HLP_PAGE_TSYN_TX_S 11 +#define PF0INT_OICR_HLP_PAGE_TSYN_TX_M BIT(11) +#define PF0INT_OICR_HLP_PAGE_TSYN_EVNT_S 12 +#define PF0INT_OICR_HLP_PAGE_TSYN_EVNT_M BIT(12) +#define PF0INT_OICR_HLP_PAGE_TSYN_TGT_S 13 +#define PF0INT_OICR_HLP_PAGE_TSYN_TGT_M BIT(13) +#define PF0INT_OICR_HLP_PAGE_HLP_RDY_S 14 +#define PF0INT_OICR_HLP_PAGE_HLP_RDY_M BIT(14) +#define PF0INT_OICR_HLP_PAGE_CPM_RDY_S 15 +#define PF0INT_OICR_HLP_PAGE_CPM_RDY_M BIT(15) +#define PF0INT_OICR_HLP_PAGE_ECC_ERR_S 16 +#define PF0INT_OICR_HLP_PAGE_ECC_ERR_M BIT(16) +#define PF0INT_OICR_HLP_PAGE_RSV2_S 17 +#define PF0INT_OICR_HLP_PAGE_RSV2_M ICE_M(0x3, 17) +#define PF0INT_OICR_HLP_PAGE_MAL_DETECT_S 19 +#define PF0INT_OICR_HLP_PAGE_MAL_DETECT_M BIT(19) +#define PF0INT_OICR_HLP_PAGE_GRST_S 20 +#define PF0INT_OICR_HLP_PAGE_GRST_M BIT(20) +#define PF0INT_OICR_HLP_PAGE_PCI_EXCEPTION_S 21 +#define PF0INT_OICR_HLP_PAGE_PCI_EXCEPTION_M BIT(21) +#define PF0INT_OICR_HLP_PAGE_GPIO_S 22 +#define PF0INT_OICR_HLP_PAGE_GPIO_M BIT(22) +#define PF0INT_OICR_HLP_PAGE_RSV3_S 23 +#define PF0INT_OICR_HLP_PAGE_RSV3_M BIT(23) +#define PF0INT_OICR_HLP_PAGE_STORM_DETECT_S 24 +#define PF0INT_OICR_HLP_PAGE_STORM_DETECT_M BIT(24) +#define PF0INT_OICR_HLP_PAGE_LINK_STAT_CHANGE_S 25 +#define PF0INT_OICR_HLP_PAGE_LINK_STAT_CHANGE_M BIT(25) +#define PF0INT_OICR_HLP_PAGE_HMC_ERR_S 26 +#define PF0INT_OICR_HLP_PAGE_HMC_ERR_M BIT(26) +#define PF0INT_OICR_HLP_PAGE_PE_PUSH_S 27 +#define PF0INT_OICR_HLP_PAGE_PE_PUSH_M BIT(27) +#define PF0INT_OICR_HLP_PAGE_PE_CRITERR_S 28 +#define PF0INT_OICR_HLP_PAGE_PE_CRITERR_M BIT(28) +#define PF0INT_OICR_HLP_PAGE_VFLR_S 29 +#define PF0INT_OICR_HLP_PAGE_VFLR_M BIT(29) +#define PF0INT_OICR_HLP_PAGE_XLR_HW_DONE_S 30 +#define PF0INT_OICR_HLP_PAGE_XLR_HW_DONE_M BIT(30) +#define PF0INT_OICR_HLP_PAGE_SWINT_S 31 +#define PF0INT_OICR_HLP_PAGE_SWINT_M BIT(31) +#define PF0INT_OICR_PSM_PAGE 0x02D02000 /* Reset Source: CORER */ +#define PF0INT_OICR_PSM_PAGE_INTEVENT_S 0 +#define PF0INT_OICR_PSM_PAGE_INTEVENT_M BIT(0) +#define PF0INT_OICR_PSM_PAGE_QUEUE_S 1 +#define PF0INT_OICR_PSM_PAGE_QUEUE_M BIT(1) +#define PF0INT_OICR_PSM_PAGE_RSV1_S 2 +#define PF0INT_OICR_PSM_PAGE_RSV1_M ICE_M(0xFF, 2) +#define PF0INT_OICR_PSM_PAGE_HH_COMP_S 10 +#define PF0INT_OICR_PSM_PAGE_HH_COMP_M BIT(10) +#define PF0INT_OICR_PSM_PAGE_TSYN_TX_S 11 +#define PF0INT_OICR_PSM_PAGE_TSYN_TX_M BIT(11) +#define PF0INT_OICR_PSM_PAGE_TSYN_EVNT_S 12 +#define PF0INT_OICR_PSM_PAGE_TSYN_EVNT_M BIT(12) +#define PF0INT_OICR_PSM_PAGE_TSYN_TGT_S 13 +#define PF0INT_OICR_PSM_PAGE_TSYN_TGT_M BIT(13) +#define PF0INT_OICR_PSM_PAGE_HLP_RDY_S 14 +#define PF0INT_OICR_PSM_PAGE_HLP_RDY_M BIT(14) +#define PF0INT_OICR_PSM_PAGE_CPM_RDY_S 15 +#define PF0INT_OICR_PSM_PAGE_CPM_RDY_M BIT(15) +#define PF0INT_OICR_PSM_PAGE_ECC_ERR_S 16 +#define PF0INT_OICR_PSM_PAGE_ECC_ERR_M BIT(16) +#define PF0INT_OICR_PSM_PAGE_RSV2_S 17 +#define PF0INT_OICR_PSM_PAGE_RSV2_M ICE_M(0x3, 17) +#define PF0INT_OICR_PSM_PAGE_MAL_DETECT_S 19 +#define PF0INT_OICR_PSM_PAGE_MAL_DETECT_M BIT(19) +#define PF0INT_OICR_PSM_PAGE_GRST_S 20 +#define 
PF0INT_OICR_PSM_PAGE_GRST_M BIT(20) +#define PF0INT_OICR_PSM_PAGE_PCI_EXCEPTION_S 21 +#define PF0INT_OICR_PSM_PAGE_PCI_EXCEPTION_M BIT(21) +#define PF0INT_OICR_PSM_PAGE_GPIO_S 22 +#define PF0INT_OICR_PSM_PAGE_GPIO_M BIT(22) +#define PF0INT_OICR_PSM_PAGE_RSV3_S 23 +#define PF0INT_OICR_PSM_PAGE_RSV3_M BIT(23) +#define PF0INT_OICR_PSM_PAGE_STORM_DETECT_S 24 +#define PF0INT_OICR_PSM_PAGE_STORM_DETECT_M BIT(24) +#define PF0INT_OICR_PSM_PAGE_LINK_STAT_CHANGE_S 25 +#define PF0INT_OICR_PSM_PAGE_LINK_STAT_CHANGE_M BIT(25) +#define PF0INT_OICR_PSM_PAGE_HMC_ERR_S 26 +#define PF0INT_OICR_PSM_PAGE_HMC_ERR_M BIT(26) +#define PF0INT_OICR_PSM_PAGE_PE_PUSH_S 27 +#define PF0INT_OICR_PSM_PAGE_PE_PUSH_M BIT(27) +#define PF0INT_OICR_PSM_PAGE_PE_CRITERR_S 28 +#define PF0INT_OICR_PSM_PAGE_PE_CRITERR_M BIT(28) +#define PF0INT_OICR_PSM_PAGE_VFLR_S 29 +#define PF0INT_OICR_PSM_PAGE_VFLR_M BIT(29) +#define PF0INT_OICR_PSM_PAGE_XLR_HW_DONE_S 30 +#define PF0INT_OICR_PSM_PAGE_XLR_HW_DONE_M BIT(30) +#define PF0INT_OICR_PSM_PAGE_SWINT_S 31 +#define PF0INT_OICR_PSM_PAGE_SWINT_M BIT(31) +#define QRX_TAIL_PAGE(_QRX) (0x03800000 + ((_QRX) * 4096)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define QRX_TAIL_PAGE_MAX_INDEX 2047 +#define QRX_TAIL_PAGE_TAIL_S 0 +#define QRX_TAIL_PAGE_TAIL_M ICE_M(0x1FFF, 0) +#define QTX_COMM_DBELL_PAGE(_DBQM) (0x04000000 + ((_DBQM) * 4096)) /* _i=0...16383 */ /* Reset Source: CORER */ +#define QTX_COMM_DBELL_PAGE_MAX_INDEX 16383 +#define QTX_COMM_DBELL_PAGE_QTX_COMM_DBELL_S 0 +#define QTX_COMM_DBELL_PAGE_QTX_COMM_DBELL_M ICE_M(0xFFFFFFFF, 0) +#define QTX_COMM_DBLQ_DBELL_PAGE(_DBLQ) (0x02F00000 + ((_DBLQ) * 4096)) /* _i=0...255 */ /* Reset Source: CORER */ +#define QTX_COMM_DBLQ_DBELL_PAGE_MAX_INDEX 255 +#define QTX_COMM_DBLQ_DBELL_PAGE_TAIL_S 0 +#define QTX_COMM_DBLQ_DBELL_PAGE_TAIL_M ICE_M(0x1FFF, 0) +#define VSI_MBX_ARQBAH(_VSI) (0x02000018 + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_MBX_ARQBAH_MAX_INDEX 767 +#define VSI_MBX_ARQBAH_ARQBAH_S 0 +#define VSI_MBX_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VSI_MBX_ARQBAL(_VSI) (0x02000014 + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_MBX_ARQBAL_MAX_INDEX 767 +#define VSI_MBX_ARQBAL_ARQBAL_LSB_S 0 +#define VSI_MBX_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VSI_MBX_ARQBAL_ARQBAL_S 6 +#define VSI_MBX_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VSI_MBX_ARQH(_VSI) (0x02000020 + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_MBX_ARQH_MAX_INDEX 767 +#define VSI_MBX_ARQH_ARQH_S 0 +#define VSI_MBX_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define VSI_MBX_ARQLEN(_VSI) (0x0200001C + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: PFR */ +#define VSI_MBX_ARQLEN_MAX_INDEX 767 +#define VSI_MBX_ARQLEN_ARQLEN_S 0 +#define VSI_MBX_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define VSI_MBX_ARQLEN_ARQVFE_S 28 +#define VSI_MBX_ARQLEN_ARQVFE_M BIT(28) +#define VSI_MBX_ARQLEN_ARQOVFL_S 29 +#define VSI_MBX_ARQLEN_ARQOVFL_M BIT(29) +#define VSI_MBX_ARQLEN_ARQCRIT_S 30 +#define VSI_MBX_ARQLEN_ARQCRIT_M BIT(30) +#define VSI_MBX_ARQLEN_ARQENABLE_S 31 +#define VSI_MBX_ARQLEN_ARQENABLE_M BIT(31) +#define VSI_MBX_ARQT(_VSI) (0x02000024 + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_MBX_ARQT_MAX_INDEX 767 +#define VSI_MBX_ARQT_ARQT_S 0 +#define VSI_MBX_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define VSI_MBX_ATQBAH(_VSI) (0x02000004 + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_MBX_ATQBAH_MAX_INDEX 767 +#define VSI_MBX_ATQBAH_ATQBAH_S 0 +#define 
VSI_MBX_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VSI_MBX_ATQBAL(_VSI) (0x02000000 + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_MBX_ATQBAL_MAX_INDEX 767 +#define VSI_MBX_ATQBAL_ATQBAL_S 6 +#define VSI_MBX_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VSI_MBX_ATQH(_VSI) (0x0200000C + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_MBX_ATQH_MAX_INDEX 767 +#define VSI_MBX_ATQH_ATQH_S 0 +#define VSI_MBX_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define VSI_MBX_ATQLEN(_VSI) (0x02000008 + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: PFR */ +#define VSI_MBX_ATQLEN_MAX_INDEX 767 +#define VSI_MBX_ATQLEN_ATQLEN_S 0 +#define VSI_MBX_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define VSI_MBX_ATQLEN_ATQVFE_S 28 +#define VSI_MBX_ATQLEN_ATQVFE_M BIT(28) +#define VSI_MBX_ATQLEN_ATQOVFL_S 29 +#define VSI_MBX_ATQLEN_ATQOVFL_M BIT(29) +#define VSI_MBX_ATQLEN_ATQCRIT_S 30 +#define VSI_MBX_ATQLEN_ATQCRIT_M BIT(30) +#define VSI_MBX_ATQLEN_ATQENABLE_S 31 +#define VSI_MBX_ATQLEN_ATQENABLE_M BIT(31) +#define VSI_MBX_ATQT(_VSI) (0x02000010 + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_MBX_ATQT_MAX_INDEX 767 +#define VSI_MBX_ATQT_ATQT_S 0 +#define VSI_MBX_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define GL_ACL_ACCESS_CMD 0x00391000 /* Reset Source: CORER */ +#define GL_ACL_ACCESS_CMD_TABLE_ID_S 0 +#define GL_ACL_ACCESS_CMD_TABLE_ID_M ICE_M(0xFF, 0) +#define GL_ACL_ACCESS_CMD_ENTRY_INDEX_S 8 +#define GL_ACL_ACCESS_CMD_ENTRY_INDEX_M ICE_M(0xFFF, 8) +#define GL_ACL_ACCESS_CMD_OPERATION_S 20 +#define GL_ACL_ACCESS_CMD_OPERATION_M BIT(20) +#define GL_ACL_ACCESS_CMD_OBJ_TYPE_S 24 +#define GL_ACL_ACCESS_CMD_OBJ_TYPE_M ICE_M(0xF, 24) +#define GL_ACL_ACCESS_CMD_EXECUTE_S 31 +#define GL_ACL_ACCESS_CMD_EXECUTE_M BIT(31) +#define GL_ACL_ACCESS_STATUS 0x00391004 /* Reset Source: CORER */ +#define GL_ACL_ACCESS_STATUS_BUSY_S 0 +#define GL_ACL_ACCESS_STATUS_BUSY_M BIT(0) +#define GL_ACL_ACCESS_STATUS_DONE_S 1 +#define GL_ACL_ACCESS_STATUS_DONE_M BIT(1) +#define GL_ACL_ACCESS_STATUS_ERROR_S 2 +#define GL_ACL_ACCESS_STATUS_ERROR_M BIT(2) +#define GL_ACL_ACCESS_STATUS_OPERATION_S 3 +#define GL_ACL_ACCESS_STATUS_OPERATION_M BIT(3) +#define GL_ACL_ACCESS_STATUS_ERROR_CODE_S 4 +#define GL_ACL_ACCESS_STATUS_ERROR_CODE_M ICE_M(0xF, 4) +#define GL_ACL_ACCESS_STATUS_TABLE_ID_S 8 +#define GL_ACL_ACCESS_STATUS_TABLE_ID_M ICE_M(0xFF, 8) +#define GL_ACL_ACCESS_STATUS_ENTRY_INDEX_S 16 +#define GL_ACL_ACCESS_STATUS_ENTRY_INDEX_M ICE_M(0xFFF, 16) +#define GL_ACL_ACCESS_STATUS_OBJ_TYPE_S 28 +#define GL_ACL_ACCESS_STATUS_OBJ_TYPE_M ICE_M(0xF, 28) +#define GL_ACL_ACTMEM_ACT(_i) (0x00393824 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GL_ACL_ACTMEM_ACT_MAX_INDEX 1 +#define GL_ACL_ACTMEM_ACT_VALUE_S 0 +#define GL_ACL_ACTMEM_ACT_VALUE_M ICE_M(0xFFFF, 0) +#define GL_ACL_ACTMEM_ACT_MDID_S 20 +#define GL_ACL_ACTMEM_ACT_MDID_M ICE_M(0x3F, 20) +#define GL_ACL_ACTMEM_ACT_PRIORITY_S 28 +#define GL_ACL_ACTMEM_ACT_PRIORITY_M ICE_M(0x7, 28) +#define GL_ACL_CHICKEN_REGISTER 0x00393810 /* Reset Source: CORER */ +#define GL_ACL_CHICKEN_REGISTER_TCAM_DATA_POL_CH_S 0 +#define GL_ACL_CHICKEN_REGISTER_TCAM_DATA_POL_CH_M BIT(0) +#define GL_ACL_CHICKEN_REGISTER_TCAM_ADDR_POL_CH_S 1 +#define GL_ACL_CHICKEN_REGISTER_TCAM_ADDR_POL_CH_M BIT(1) +#define GL_ACL_DEFAULT_ACT(_i) (0x00391168 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GL_ACL_DEFAULT_ACT_MAX_INDEX 15 +#define GL_ACL_DEFAULT_ACT_VALUE_S 0 +#define GL_ACL_DEFAULT_ACT_VALUE_M ICE_M(0xFFFF, 0) +#define 
GL_ACL_DEFAULT_ACT_MDID_S 20 +#define GL_ACL_DEFAULT_ACT_MDID_M ICE_M(0x3F, 20) +#define GL_ACL_DEFAULT_ACT_PRIORITY_S 28 +#define GL_ACL_DEFAULT_ACT_PRIORITY_M ICE_M(0x7, 28) +#define GL_ACL_PROFILE_BWSB_SEL(_i) (0x00391008 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GL_ACL_PROFILE_BWSB_SEL_MAX_INDEX 31 +#define GL_ACL_PROFILE_BWSB_SEL_BSB_SRC_OFF_S 0 +#define GL_ACL_PROFILE_BWSB_SEL_BSB_SRC_OFF_M ICE_M(0x3F, 0) +#define GL_ACL_PROFILE_BWSB_SEL_WSB_SRC_OFF_S 8 +#define GL_ACL_PROFILE_BWSB_SEL_WSB_SRC_OFF_M ICE_M(0x1F, 8) +#define GL_ACL_PROFILE_DWSB_SEL(_i) (0x00391088 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GL_ACL_PROFILE_DWSB_SEL_MAX_INDEX 15 +#define GL_ACL_PROFILE_DWSB_SEL_DWORD_SEL_OFF_S 0 +#define GL_ACL_PROFILE_DWSB_SEL_DWORD_SEL_OFF_M ICE_M(0xF, 0) +#define GL_ACL_PROFILE_PF_CFG(_i) (0x003910C8 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GL_ACL_PROFILE_PF_CFG_MAX_INDEX 7 +#define GL_ACL_PROFILE_PF_CFG_SCEN_SEL_S 0 +#define GL_ACL_PROFILE_PF_CFG_SCEN_SEL_M ICE_M(0x3F, 0) +#define GL_ACL_PROFILE_RC_CFG(_i) (0x003910E8 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GL_ACL_PROFILE_RC_CFG_MAX_INDEX 7 +#define GL_ACL_PROFILE_RC_CFG_LOW_BOUND_S 0 +#define GL_ACL_PROFILE_RC_CFG_LOW_BOUND_M ICE_M(0xFFFF, 0) +#define GL_ACL_PROFILE_RC_CFG_HIGH_BOUND_S 16 +#define GL_ACL_PROFILE_RC_CFG_HIGH_BOUND_M ICE_M(0xFFFF, 16) +#define GL_ACL_PROFILE_RCF_MASK(_i) (0x00391108 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GL_ACL_PROFILE_RCF_MASK_MAX_INDEX 7 +#define GL_ACL_PROFILE_RCF_MASK_MASK_S 0 +#define GL_ACL_PROFILE_RCF_MASK_MASK_M ICE_M(0xFFFF, 0) +#define GL_ACL_SCENARIO_ACT_CFG(_i) (0x003938AC + ((_i) * 4)) /* _i=0...19 */ /* Reset Source: CORER */ +#define GL_ACL_SCENARIO_ACT_CFG_MAX_INDEX 19 +#define GL_ACL_SCENARIO_ACT_CFG_ACTMEM_SEL_S 0 +#define GL_ACL_SCENARIO_ACT_CFG_ACTMEM_SEL_M ICE_M(0xF, 0) +#define GL_ACL_SCENARIO_ACT_CFG_ACTMEM_EN_S 8 +#define GL_ACL_SCENARIO_ACT_CFG_ACTMEM_EN_M BIT(8) +#define GL_ACL_SCENARIO_CFG_H(_i) (0x0039386C + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GL_ACL_SCENARIO_CFG_H_MAX_INDEX 15 +#define GL_ACL_SCENARIO_CFG_H_SELECT4_S 0 +#define GL_ACL_SCENARIO_CFG_H_SELECT4_M ICE_M(0x1F, 0) +#define GL_ACL_SCENARIO_CFG_H_CHUNKMASK_S 8 +#define GL_ACL_SCENARIO_CFG_H_CHUNKMASK_M ICE_M(0xFF, 8) +#define GL_ACL_SCENARIO_CFG_H_START_COMPARE_S 24 +#define GL_ACL_SCENARIO_CFG_H_START_COMPARE_M BIT(24) +#define GL_ACL_SCENARIO_CFG_H_START_SET_S 28 +#define GL_ACL_SCENARIO_CFG_H_START_SET_M BIT(28) +#define GL_ACL_SCENARIO_CFG_L(_i) (0x0039382C + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GL_ACL_SCENARIO_CFG_L_MAX_INDEX 15 +#define GL_ACL_SCENARIO_CFG_L_SELECT0_S 0 +#define GL_ACL_SCENARIO_CFG_L_SELECT0_M ICE_M(0x7F, 0) +#define GL_ACL_SCENARIO_CFG_L_SELECT1_S 8 +#define GL_ACL_SCENARIO_CFG_L_SELECT1_M ICE_M(0x7F, 8) +#define GL_ACL_SCENARIO_CFG_L_SELECT2_S 16 +#define GL_ACL_SCENARIO_CFG_L_SELECT2_M ICE_M(0x7F, 16) +#define GL_ACL_SCENARIO_CFG_L_SELECT3_S 24 +#define GL_ACL_SCENARIO_CFG_L_SELECT3_M ICE_M(0x7F, 24) +#define GL_ACL_TCAM_KEY_H 0x00393818 /* Reset Source: CORER */ +#define GL_ACL_TCAM_KEY_H_GL_ACL_FFU_TCAM_KEY_H_S 0 +#define GL_ACL_TCAM_KEY_H_GL_ACL_FFU_TCAM_KEY_H_M ICE_M(0xFF, 0) +#define GL_ACL_TCAM_KEY_INV_H 0x00393820 /* Reset Source: CORER */ +#define GL_ACL_TCAM_KEY_INV_H_GL_ACL_FFU_TCAM_KEY_INV_H_S 0 +#define GL_ACL_TCAM_KEY_INV_H_GL_ACL_FFU_TCAM_KEY_INV_H_M ICE_M(0xFF, 0) +#define GL_ACL_TCAM_KEY_INV_L 0x0039381C /* 
Reset Source: CORER */ +#define GL_ACL_TCAM_KEY_INV_L_GL_ACL_FFU_TCAM_KEY_INV_L_S 0 +#define GL_ACL_TCAM_KEY_INV_L_GL_ACL_FFU_TCAM_KEY_INV_L_M ICE_M(0xFFFFFFFF, 0) +#define GL_ACL_TCAM_KEY_L 0x00393814 /* Reset Source: CORER */ +#define GL_ACL_TCAM_KEY_L_GL_ACL_FFU_TCAM_KEY_L_S 0 +#define GL_ACL_TCAM_KEY_L_GL_ACL_FFU_TCAM_KEY_L_M ICE_M(0xFFFFFFFF, 0) +#define VSI_ACL_DEF_SEL(_VSI) (0x00391800 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_ACL_DEF_SEL_MAX_INDEX 767 +#define VSI_ACL_DEF_SEL_RX_PROFILE_MISS_SEL_S 0 +#define VSI_ACL_DEF_SEL_RX_PROFILE_MISS_SEL_M ICE_M(0x3, 0) +#define VSI_ACL_DEF_SEL_RX_TABLES_MISS_SEL_S 4 +#define VSI_ACL_DEF_SEL_RX_TABLES_MISS_SEL_M ICE_M(0x3, 4) +#define VSI_ACL_DEF_SEL_TX_PROFILE_MISS_SEL_S 8 +#define VSI_ACL_DEF_SEL_TX_PROFILE_MISS_SEL_M ICE_M(0x3, 8) +#define VSI_ACL_DEF_SEL_TX_TABLES_MISS_SEL_S 12 +#define VSI_ACL_DEF_SEL_TX_TABLES_MISS_SEL_M ICE_M(0x3, 12) +#define GL_SWT_L2TAG0(_i) (0x000492A8 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GL_SWT_L2TAG0_MAX_INDEX 7 +#define GL_SWT_L2TAG0_DATA_S 0 +#define GL_SWT_L2TAG0_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_SWT_L2TAG1(_i) (0x000492C8 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GL_SWT_L2TAG1_MAX_INDEX 7 +#define GL_SWT_L2TAG1_DATA_S 0 +#define GL_SWT_L2TAG1_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_SWT_L2TAGCTRL(_i) (0x001D2660 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GL_SWT_L2TAGCTRL_MAX_INDEX 7 +#define GL_SWT_L2TAGCTRL_LENGTH_S 0 +#define GL_SWT_L2TAGCTRL_LENGTH_M ICE_M(0x7F, 0) +#define GL_SWT_L2TAGCTRL_HAS_UP_S 7 +#define GL_SWT_L2TAGCTRL_HAS_UP_M BIT(7) +#define GL_SWT_L2TAGCTRL_ISVLAN_S 9 +#define GL_SWT_L2TAGCTRL_ISVLAN_M BIT(9) +#define GL_SWT_L2TAGCTRL_INNERUP_S 10 +#define GL_SWT_L2TAGCTRL_INNERUP_M BIT(10) +#define GL_SWT_L2TAGCTRL_OUTERUP_S 11 +#define GL_SWT_L2TAGCTRL_OUTERUP_M BIT(11) +#define GL_SWT_L2TAGCTRL_LONG_S 12 +#define GL_SWT_L2TAGCTRL_LONG_M BIT(12) +#define GL_SWT_L2TAGCTRL_ISMPLS_S 13 +#define GL_SWT_L2TAGCTRL_ISMPLS_M BIT(13) +#define GL_SWT_L2TAGCTRL_ISNSH_S 14 +#define GL_SWT_L2TAGCTRL_ISNSH_M BIT(14) +#define GL_SWT_L2TAGCTRL_ETHERTYPE_S 16 +#define GL_SWT_L2TAGCTRL_ETHERTYPE_M ICE_M(0xFFFF, 16) +#define GL_SWT_L2TAGRXEB(_i) (0x00052000 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GL_SWT_L2TAGRXEB_MAX_INDEX 7 +#define GL_SWT_L2TAGRXEB_OFFSET_S 0 +#define GL_SWT_L2TAGRXEB_OFFSET_M ICE_M(0xFF, 0) +#define GL_SWT_L2TAGRXEB_LENGTH_S 8 +#define GL_SWT_L2TAGRXEB_LENGTH_M ICE_M(0x3, 8) +#define GL_SWT_L2TAGTXIB(_i) (0x000492E8 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GL_SWT_L2TAGTXIB_MAX_INDEX 7 +#define GL_SWT_L2TAGTXIB_OFFSET_S 0 +#define GL_SWT_L2TAGTXIB_OFFSET_M ICE_M(0xFF, 0) +#define GL_SWT_L2TAGTXIB_LENGTH_S 8 +#define GL_SWT_L2TAGTXIB_LENGTH_M ICE_M(0x3, 8) +#define GLCM_PE_CACHESIZE 0x005046B4 /* Reset Source: CORER */ +#define GLCM_PE_CACHESIZE_WORD_SIZE_S 0 +#define GLCM_PE_CACHESIZE_WORD_SIZE_M ICE_M(0xFFF, 0) +#define GLCM_PE_CACHESIZE_SETS_S 12 +#define GLCM_PE_CACHESIZE_SETS_M ICE_M(0xF, 12) +#define GLCM_PE_CACHESIZE_WAYS_S 16 +#define GLCM_PE_CACHESIZE_WAYS_M ICE_M(0x1FF, 16) +#define GLCOMM_CQ_CTL(_CQ) (0x000F0000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLCOMM_CQ_CTL_MAX_INDEX 511 +#define GLCOMM_CQ_CTL_COMP_TYPE_S 0 +#define GLCOMM_CQ_CTL_COMP_TYPE_M ICE_M(0x7, 0) +#define GLCOMM_CQ_CTL_CMD_S 4 +#define GLCOMM_CQ_CTL_CMD_M ICE_M(0x7, 4) +#define GLCOMM_CQ_CTL_ID_S 16 +#define GLCOMM_CQ_CTL_ID_M ICE_M(0x3FFF, 
16) +#define GLCOMM_MIN_MAX_PKT 0x000FC064 /* Reset Source: CORER */ +#define GLCOMM_MIN_MAX_PKT_MAHDL_S 0 +#define GLCOMM_MIN_MAX_PKT_MAHDL_M ICE_M(0x3FFF, 0) +#define GLCOMM_MIN_MAX_PKT_MIHDL_S 16 +#define GLCOMM_MIN_MAX_PKT_MIHDL_M ICE_M(0x3F, 16) +#define GLCOMM_MIN_MAX_PKT_LSO_COMS_MIHDL_S 22 +#define GLCOMM_MIN_MAX_PKT_LSO_COMS_MIHDL_M ICE_M(0x3FF, 22) +#define GLCOMM_PKT_SHAPER_PROF(_i) (0x002D2DA8 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLCOMM_PKT_SHAPER_PROF_MAX_INDEX 7 +#define GLCOMM_PKT_SHAPER_PROF_PKTCNT_S 0 +#define GLCOMM_PKT_SHAPER_PROF_PKTCNT_M ICE_M(0x3F, 0) +#define GLCOMM_QTX_CNTX_CTL 0x002D2DC8 /* Reset Source: CORER */ +#define GLCOMM_QTX_CNTX_CTL_QUEUE_ID_S 0 +#define GLCOMM_QTX_CNTX_CTL_QUEUE_ID_M ICE_M(0x3FFF, 0) +#define GLCOMM_QTX_CNTX_CTL_CMD_S 16 +#define GLCOMM_QTX_CNTX_CTL_CMD_M ICE_M(0x7, 16) +#define GLCOMM_QTX_CNTX_CTL_CMD_EXEC_S 19 +#define GLCOMM_QTX_CNTX_CTL_CMD_EXEC_M BIT(19) +#define GLCOMM_QTX_CNTX_DATA(_i) (0x002D2D40 + ((_i) * 4)) /* _i=0...9 */ /* Reset Source: CORER */ +#define GLCOMM_QTX_CNTX_DATA_MAX_INDEX 9 +#define GLCOMM_QTX_CNTX_DATA_DATA_S 0 +#define GLCOMM_QTX_CNTX_DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GLCOMM_QTX_CNTX_STAT 0x002D2DCC /* Reset Source: CORER */ +#define GLCOMM_QTX_CNTX_STAT_CMD_IN_PROG_S 0 +#define GLCOMM_QTX_CNTX_STAT_CMD_IN_PROG_M BIT(0) +#define GLCOMM_QUANTA_PROF(_i) (0x002D2D68 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLCOMM_QUANTA_PROF_MAX_INDEX 15 +#define GLCOMM_QUANTA_PROF_QUANTA_SIZE_S 0 +#define GLCOMM_QUANTA_PROF_QUANTA_SIZE_M ICE_M(0x3FFF, 0) +#define GLCOMM_QUANTA_PROF_MAX_CMD_S 16 +#define GLCOMM_QUANTA_PROF_MAX_CMD_M ICE_M(0xFF, 16) +#define GLCOMM_QUANTA_PROF_MAX_DESC_S 24 +#define GLCOMM_QUANTA_PROF_MAX_DESC_M ICE_M(0x3F, 24) +#define GLLAN_TCLAN_CACHE_CTL 0x000FC0B8 /* Reset Source: CORER */ +#define GLLAN_TCLAN_CACHE_CTL_MIN_FETCH_THRESH_S 0 +#define GLLAN_TCLAN_CACHE_CTL_MIN_FETCH_THRESH_M ICE_M(0x3F, 0) +#define GLLAN_TCLAN_CACHE_CTL_FETCH_CL_ALIGN_S 6 +#define GLLAN_TCLAN_CACHE_CTL_FETCH_CL_ALIGN_M BIT(6) +#define GLLAN_TCLAN_CACHE_CTL_MIN_ALLOC_THRESH_S 7 +#define GLLAN_TCLAN_CACHE_CTL_MIN_ALLOC_THRESH_M ICE_M(0x7F, 7) +#define GLLAN_TCLAN_CACHE_CTL_CACHE_ENTRY_CNT_S 14 +#define GLLAN_TCLAN_CACHE_CTL_CACHE_ENTRY_CNT_M ICE_M(0xFF, 14) +#define GLLAN_TCLAN_CACHE_CTL_CACHE_DESC_LIM_S 22 +#define GLLAN_TCLAN_CACHE_CTL_CACHE_DESC_LIM_M ICE_M(0x3FF, 22) +#define GLTCLAN_CQ_CNTX0(_CQ) (0x000F0800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX0_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX0_RING_ADDR_LSB_S 0 +#define GLTCLAN_CQ_CNTX0_RING_ADDR_LSB_M ICE_M(0xFFFFFFFF, 0) +#define GLTCLAN_CQ_CNTX1(_CQ) (0x000F1000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX1_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX1_RING_ADDR_MSB_S 0 +#define GLTCLAN_CQ_CNTX1_RING_ADDR_MSB_M ICE_M(0x1FFFFFF, 0) +#define GLTCLAN_CQ_CNTX10(_CQ) (0x000F5800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX10_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX10_CQ_CACHLINE_S 0 +#define GLTCLAN_CQ_CNTX10_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0) +#define GLTCLAN_CQ_CNTX11(_CQ) (0x000F6000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX11_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX11_CQ_CACHLINE_S 0 +#define GLTCLAN_CQ_CNTX11_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0) +#define GLTCLAN_CQ_CNTX12(_CQ) (0x000F6800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define 
GLTCLAN_CQ_CNTX12_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX12_CQ_CACHLINE_S 0 +#define GLTCLAN_CQ_CNTX12_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0) +#define GLTCLAN_CQ_CNTX13(_CQ) (0x000F7000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX13_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX13_CQ_CACHLINE_S 0 +#define GLTCLAN_CQ_CNTX13_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0) +#define GLTCLAN_CQ_CNTX14(_CQ) (0x000F7800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX14_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX14_CQ_CACHLINE_S 0 +#define GLTCLAN_CQ_CNTX14_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0) +#define GLTCLAN_CQ_CNTX15(_CQ) (0x000F8000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX15_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX15_CQ_CACHLINE_S 0 +#define GLTCLAN_CQ_CNTX15_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0) +#define GLTCLAN_CQ_CNTX16(_CQ) (0x000F8800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX16_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX16_CQ_CACHLINE_S 0 +#define GLTCLAN_CQ_CNTX16_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0) +#define GLTCLAN_CQ_CNTX17(_CQ) (0x000F9000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX17_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX17_CQ_CACHLINE_S 0 +#define GLTCLAN_CQ_CNTX17_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0) +#define GLTCLAN_CQ_CNTX18(_CQ) (0x000F9800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX18_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX18_CQ_CACHLINE_S 0 +#define GLTCLAN_CQ_CNTX18_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0) +#define GLTCLAN_CQ_CNTX19(_CQ) (0x000FA000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX19_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX19_CQ_CACHLINE_S 0 +#define GLTCLAN_CQ_CNTX19_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0) +#define GLTCLAN_CQ_CNTX2(_CQ) (0x000F1800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX2_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX2_RING_LEN_S 0 +#define GLTCLAN_CQ_CNTX2_RING_LEN_M ICE_M(0x3FFFF, 0) +#define GLTCLAN_CQ_CNTX20(_CQ) (0x000FA800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX20_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX20_CQ_CACHLINE_S 0 +#define GLTCLAN_CQ_CNTX20_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0) +#define GLTCLAN_CQ_CNTX21(_CQ) (0x000FB000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX21_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX21_CQ_CACHLINE_S 0 +#define GLTCLAN_CQ_CNTX21_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0) +#define GLTCLAN_CQ_CNTX3(_CQ) (0x000F2000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX3_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX3_GENERATION_S 0 +#define GLTCLAN_CQ_CNTX3_GENERATION_M BIT(0) +#define GLTCLAN_CQ_CNTX3_CQ_WR_PTR_S 1 +#define GLTCLAN_CQ_CNTX3_CQ_WR_PTR_M ICE_M(0x3FFFFF, 1) +#define GLTCLAN_CQ_CNTX4(_CQ) (0x000F2800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX4_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX4_PF_NUM_S 0 +#define GLTCLAN_CQ_CNTX4_PF_NUM_M ICE_M(0x7, 0) +#define GLTCLAN_CQ_CNTX4_VMVF_NUM_S 3 +#define GLTCLAN_CQ_CNTX4_VMVF_NUM_M ICE_M(0x3FF, 3) +#define GLTCLAN_CQ_CNTX4_VMVF_TYPE_S 13 +#define GLTCLAN_CQ_CNTX4_VMVF_TYPE_M ICE_M(0x3, 13) +#define GLTCLAN_CQ_CNTX5(_CQ) (0x000F3000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX5_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX5_TPH_EN_S 0 +#define GLTCLAN_CQ_CNTX5_TPH_EN_M 
BIT(0) +#define GLTCLAN_CQ_CNTX5_CPU_ID_S 1 +#define GLTCLAN_CQ_CNTX5_CPU_ID_M ICE_M(0xFF, 1) +#define GLTCLAN_CQ_CNTX5_FLUSH_ON_ITR_DIS_S 9 +#define GLTCLAN_CQ_CNTX5_FLUSH_ON_ITR_DIS_M BIT(9) +#define GLTCLAN_CQ_CNTX6(_CQ) (0x000F3800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX6_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX6_CQ_CACHLINE_S 0 +#define GLTCLAN_CQ_CNTX6_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0) +#define GLTCLAN_CQ_CNTX7(_CQ) (0x000F4000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX7_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX7_CQ_CACHLINE_S 0 +#define GLTCLAN_CQ_CNTX7_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0) +#define GLTCLAN_CQ_CNTX8(_CQ) (0x000F4800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX8_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX8_CQ_CACHLINE_S 0 +#define GLTCLAN_CQ_CNTX8_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0) +#define GLTCLAN_CQ_CNTX9(_CQ) (0x000F5000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLTCLAN_CQ_CNTX9_MAX_INDEX 511 +#define GLTCLAN_CQ_CNTX9_CQ_CACHLINE_S 0 +#define GLTCLAN_CQ_CNTX9_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0) +#define QTX_COMM_DBELL(_DBQM) (0x002C0000 + ((_DBQM) * 4)) /* _i=0...16383 */ /* Reset Source: CORER */ +#define QTX_COMM_DBELL_MAX_INDEX 16383 +#define QTX_COMM_DBELL_QTX_COMM_DBELL_S 0 +#define QTX_COMM_DBELL_QTX_COMM_DBELL_M ICE_M(0xFFFFFFFF, 0) +#define QTX_COMM_DBLQ_CNTX(_i, _DBLQ) (0x002D0000 + ((_i) * 1024 + (_DBLQ) * 4)) /* _i=0...4, _DBLQ=0...255 */ /* Reset Source: CORER */ +#define QTX_COMM_DBLQ_CNTX_MAX_INDEX 4 +#define QTX_COMM_DBLQ_CNTX_DATA_S 0 +#define QTX_COMM_DBLQ_CNTX_DATA_M ICE_M(0xFFFFFFFF, 0) +#define QTX_COMM_DBLQ_DBELL(_DBLQ) (0x002D1400 + ((_DBLQ) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define QTX_COMM_DBLQ_DBELL_MAX_INDEX 255 +#define QTX_COMM_DBLQ_DBELL_TAIL_S 0 +#define QTX_COMM_DBLQ_DBELL_TAIL_M ICE_M(0x1FFF, 0) +#define QTX_COMM_HEAD(_DBQM) (0x000E0000 + ((_DBQM) * 4)) /* _i=0...16383 */ /* Reset Source: CORER */ +#define QTX_COMM_HEAD_MAX_INDEX 16383 #define QTX_COMM_HEAD_HEAD_S 0 #define QTX_COMM_HEAD_HEAD_M ICE_M(0x1FFF, 0) -#define PF_FW_ARQBAH 0x00080180 -#define PF_FW_ARQBAL 0x00080080 -#define PF_FW_ARQH 0x00080380 +#define QTX_COMM_HEAD_RS_PENDING_S 16 +#define QTX_COMM_HEAD_RS_PENDING_M BIT(16) +#define GL_FW_TOOL_ARQBAH 0x000801C0 /* Reset Source: EMPR */ +#define GL_FW_TOOL_ARQBAH_ARQBAH_S 0 +#define GL_FW_TOOL_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define GL_FW_TOOL_ARQBAL 0x000800C0 /* Reset Source: EMPR */ +#define GL_FW_TOOL_ARQBAL_ARQBAL_LSB_S 0 +#define GL_FW_TOOL_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define GL_FW_TOOL_ARQBAL_ARQBAL_S 6 +#define GL_FW_TOOL_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define GL_FW_TOOL_ARQH 0x000803C0 /* Reset Source: EMPR */ +#define GL_FW_TOOL_ARQH_ARQH_S 0 +#define GL_FW_TOOL_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define GL_FW_TOOL_ARQLEN 0x000802C0 /* Reset Source: EMPR */ +#define GL_FW_TOOL_ARQLEN_ARQLEN_S 0 +#define GL_FW_TOOL_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define GL_FW_TOOL_ARQLEN_ARQVFE_S 28 +#define GL_FW_TOOL_ARQLEN_ARQVFE_M BIT(28) +#define GL_FW_TOOL_ARQLEN_ARQOVFL_S 29 +#define GL_FW_TOOL_ARQLEN_ARQOVFL_M BIT(29) +#define GL_FW_TOOL_ARQLEN_ARQCRIT_S 30 +#define GL_FW_TOOL_ARQLEN_ARQCRIT_M BIT(30) +#define GL_FW_TOOL_ARQLEN_ARQENABLE_S 31 +#define GL_FW_TOOL_ARQLEN_ARQENABLE_M BIT(31) +#define GL_FW_TOOL_ARQT 0x000804C0 /* Reset Source: EMPR */ +#define GL_FW_TOOL_ARQT_ARQT_S 0 +#define GL_FW_TOOL_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define GL_FW_TOOL_ATQBAH 
0x00080140 /* Reset Source: EMPR */ +#define GL_FW_TOOL_ATQBAH_ATQBAH_S 0 +#define GL_FW_TOOL_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define GL_FW_TOOL_ATQBAL 0x00080040 /* Reset Source: EMPR */ +#define GL_FW_TOOL_ATQBAL_ATQBAL_LSB_S 0 +#define GL_FW_TOOL_ATQBAL_ATQBAL_LSB_M ICE_M(0x3F, 0) +#define GL_FW_TOOL_ATQBAL_ATQBAL_S 6 +#define GL_FW_TOOL_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define GL_FW_TOOL_ATQH 0x00080340 /* Reset Source: EMPR */ +#define GL_FW_TOOL_ATQH_ATQH_S 0 +#define GL_FW_TOOL_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define GL_FW_TOOL_ATQLEN 0x00080240 /* Reset Source: EMPR */ +#define GL_FW_TOOL_ATQLEN_ATQLEN_S 0 +#define GL_FW_TOOL_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define GL_FW_TOOL_ATQLEN_ATQVFE_S 28 +#define GL_FW_TOOL_ATQLEN_ATQVFE_M BIT(28) +#define GL_FW_TOOL_ATQLEN_ATQOVFL_S 29 +#define GL_FW_TOOL_ATQLEN_ATQOVFL_M BIT(29) +#define GL_FW_TOOL_ATQLEN_ATQCRIT_S 30 +#define GL_FW_TOOL_ATQLEN_ATQCRIT_M BIT(30) +#define GL_FW_TOOL_ATQLEN_ATQENABLE_S 31 +#define GL_FW_TOOL_ATQLEN_ATQENABLE_M BIT(31) +#define GL_FW_TOOL_ATQT 0x00080440 /* Reset Source: EMPR */ +#define GL_FW_TOOL_ATQT_ATQT_S 0 +#define GL_FW_TOOL_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define GL_MBX_PASID 0x00231EC0 /* Reset Source: CORER */ +#define GL_MBX_PASID_PASID_MODE_S 0 +#define GL_MBX_PASID_PASID_MODE_M BIT(0) +#define GL_MBX_PASID_PASID_MODE_VALID_S 1 +#define GL_MBX_PASID_PASID_MODE_VALID_M BIT(1) +#define PF_FW_ARQBAH 0x00080180 /* Reset Source: EMPR */ +#define PF_FW_ARQBAH_ARQBAH_S 0 +#define PF_FW_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF_FW_ARQBAL 0x00080080 /* Reset Source: EMPR */ +#define PF_FW_ARQBAL_ARQBAL_LSB_S 0 +#define PF_FW_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF_FW_ARQBAL_ARQBAL_S 6 +#define PF_FW_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF_FW_ARQH 0x00080380 /* Reset Source: EMPR */ +#define PF_FW_ARQH_ARQH_S 0 #define PF_FW_ARQH_ARQH_M ICE_M(0x3FF, 0) -#define PF_FW_ARQLEN 0x00080280 +#define PF_FW_ARQLEN 0x00080280 /* Reset Source: EMPR */ +#define PF_FW_ARQLEN_ARQLEN_S 0 #define PF_FW_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define PF_FW_ARQLEN_ARQVFE_S 28 #define PF_FW_ARQLEN_ARQVFE_M BIT(28) +#define PF_FW_ARQLEN_ARQOVFL_S 29 #define PF_FW_ARQLEN_ARQOVFL_M BIT(29) +#define PF_FW_ARQLEN_ARQCRIT_S 30 #define PF_FW_ARQLEN_ARQCRIT_M BIT(30) +#define PF_FW_ARQLEN_ARQENABLE_S 31 #define PF_FW_ARQLEN_ARQENABLE_M BIT(31) -#define PF_FW_ARQT 0x00080480 -#define PF_FW_ATQBAH 0x00080100 -#define PF_FW_ATQBAL 0x00080000 -#define PF_FW_ATQH 0x00080300 +#define PF_FW_ARQT 0x00080480 /* Reset Source: EMPR */ +#define PF_FW_ARQT_ARQT_S 0 +#define PF_FW_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define PF_FW_ATQBAH 0x00080100 /* Reset Source: EMPR */ +#define PF_FW_ATQBAH_ATQBAH_S 0 +#define PF_FW_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF_FW_ATQBAL 0x00080000 /* Reset Source: EMPR */ +#define PF_FW_ATQBAL_ATQBAL_LSB_S 0 +#define PF_FW_ATQBAL_ATQBAL_LSB_M ICE_M(0x3F, 0) +#define PF_FW_ATQBAL_ATQBAL_S 6 +#define PF_FW_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF_FW_ATQH 0x00080300 /* Reset Source: EMPR */ +#define PF_FW_ATQH_ATQH_S 0 #define PF_FW_ATQH_ATQH_M ICE_M(0x3FF, 0) -#define PF_FW_ATQLEN 0x00080200 +#define PF_FW_ATQLEN 0x00080200 /* Reset Source: EMPR */ +#define PF_FW_ATQLEN_ATQLEN_S 0 #define PF_FW_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define PF_FW_ATQLEN_ATQVFE_S 28 #define PF_FW_ATQLEN_ATQVFE_M BIT(28) +#define PF_FW_ATQLEN_ATQOVFL_S 29 #define PF_FW_ATQLEN_ATQOVFL_M BIT(29) +#define PF_FW_ATQLEN_ATQCRIT_S 30 #define PF_FW_ATQLEN_ATQCRIT_M BIT(30) -#define VF_MBX_ARQLEN(_VF) (0x0022BC00 + 
((_VF) * 4)) +#define PF_FW_ATQLEN_ATQENABLE_S 31 #define PF_FW_ATQLEN_ATQENABLE_M BIT(31) -#define PF_FW_ATQT 0x00080400 -#define PF_MBX_ARQBAH 0x0022E400 -#define PF_MBX_ARQBAL 0x0022E380 -#define PF_MBX_ARQH 0x0022E500 +#define PF_FW_ATQT 0x00080400 /* Reset Source: EMPR */ +#define PF_FW_ATQT_ATQT_S 0 +#define PF_FW_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define PF_MBX_ARQBAH 0x0022E400 /* Reset Source: CORER */ +#define PF_MBX_ARQBAH_ARQBAH_S 0 +#define PF_MBX_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF_MBX_ARQBAL 0x0022E380 /* Reset Source: CORER */ +#define PF_MBX_ARQBAL_ARQBAL_LSB_S 0 +#define PF_MBX_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF_MBX_ARQBAL_ARQBAL_S 6 +#define PF_MBX_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF_MBX_ARQH 0x0022E500 /* Reset Source: CORER */ +#define PF_MBX_ARQH_ARQH_S 0 #define PF_MBX_ARQH_ARQH_M ICE_M(0x3FF, 0) -#define PF_MBX_ARQLEN 0x0022E480 +#define PF_MBX_ARQLEN 0x0022E480 /* Reset Source: PFR */ +#define PF_MBX_ARQLEN_ARQLEN_S 0 #define PF_MBX_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define PF_MBX_ARQLEN_ARQVFE_S 28 +#define PF_MBX_ARQLEN_ARQVFE_M BIT(28) +#define PF_MBX_ARQLEN_ARQOVFL_S 29 +#define PF_MBX_ARQLEN_ARQOVFL_M BIT(29) +#define PF_MBX_ARQLEN_ARQCRIT_S 30 +#define PF_MBX_ARQLEN_ARQCRIT_M BIT(30) +#define PF_MBX_ARQLEN_ARQENABLE_S 31 #define PF_MBX_ARQLEN_ARQENABLE_M BIT(31) -#define PF_MBX_ARQT 0x0022E580 -#define PF_MBX_ATQBAH 0x0022E180 -#define PF_MBX_ATQBAL 0x0022E100 -#define PF_MBX_ATQH 0x0022E280 +#define PF_MBX_ARQT 0x0022E580 /* Reset Source: CORER */ +#define PF_MBX_ARQT_ARQT_S 0 +#define PF_MBX_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define PF_MBX_ATQBAH 0x0022E180 /* Reset Source: CORER */ +#define PF_MBX_ATQBAH_ATQBAH_S 0 +#define PF_MBX_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF_MBX_ATQBAL 0x0022E100 /* Reset Source: CORER */ +#define PF_MBX_ATQBAL_ATQBAL_S 6 +#define PF_MBX_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF_MBX_ATQH 0x0022E280 /* Reset Source: CORER */ +#define PF_MBX_ATQH_ATQH_S 0 #define PF_MBX_ATQH_ATQH_M ICE_M(0x3FF, 0) -#define PF_MBX_ATQLEN 0x0022E200 +#define PF_MBX_ATQLEN 0x0022E200 /* Reset Source: PFR */ +#define PF_MBX_ATQLEN_ATQLEN_S 0 #define PF_MBX_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define PF_MBX_ATQLEN_ATQVFE_S 28 +#define PF_MBX_ATQLEN_ATQVFE_M BIT(28) +#define PF_MBX_ATQLEN_ATQOVFL_S 29 +#define PF_MBX_ATQLEN_ATQOVFL_M BIT(29) +#define PF_MBX_ATQLEN_ATQCRIT_S 30 +#define PF_MBX_ATQLEN_ATQCRIT_M BIT(30) +#define PF_MBX_ATQLEN_ATQENABLE_S 31 #define PF_MBX_ATQLEN_ATQENABLE_M BIT(31) -#define PF_MBX_ATQT 0x0022E300 -#define PRTDCB_GENS 0x00083020 +#define PF_MBX_ATQT 0x0022E300 /* Reset Source: CORER */ +#define PF_MBX_ATQT_ATQT_S 0 +#define PF_MBX_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define PF_SB_ARQBAH 0x0022FF00 /* Reset Source: CORER */ +#define PF_SB_ARQBAH_ARQBAH_S 0 +#define PF_SB_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF_SB_ARQBAL 0x0022FE80 /* Reset Source: CORER */ +#define PF_SB_ARQBAL_ARQBAL_LSB_S 0 +#define PF_SB_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF_SB_ARQBAL_ARQBAL_S 6 +#define PF_SB_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF_SB_ARQH 0x00230000 /* Reset Source: CORER */ +#define PF_SB_ARQH_ARQH_S 0 +#define PF_SB_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define PF_SB_ARQLEN 0x0022FF80 /* Reset Source: PFR */ +#define PF_SB_ARQLEN_ARQLEN_S 0 +#define PF_SB_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define PF_SB_ARQLEN_ARQVFE_S 28 +#define PF_SB_ARQLEN_ARQVFE_M BIT(28) +#define PF_SB_ARQLEN_ARQOVFL_S 29 +#define PF_SB_ARQLEN_ARQOVFL_M BIT(29) +#define PF_SB_ARQLEN_ARQCRIT_S 30 +#define 
PF_SB_ARQLEN_ARQCRIT_M BIT(30) +#define PF_SB_ARQLEN_ARQENABLE_S 31 +#define PF_SB_ARQLEN_ARQENABLE_M BIT(31) +#define PF_SB_ARQT 0x00230080 /* Reset Source: CORER */ +#define PF_SB_ARQT_ARQT_S 0 +#define PF_SB_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define PF_SB_ATQBAH 0x0022FC80 /* Reset Source: CORER */ +#define PF_SB_ATQBAH_ATQBAH_S 0 +#define PF_SB_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF_SB_ATQBAL 0x0022FC00 /* Reset Source: CORER */ +#define PF_SB_ATQBAL_ATQBAL_S 6 +#define PF_SB_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF_SB_ATQH 0x0022FD80 /* Reset Source: CORER */ +#define PF_SB_ATQH_ATQH_S 0 +#define PF_SB_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define PF_SB_ATQLEN 0x0022FD00 /* Reset Source: PFR */ +#define PF_SB_ATQLEN_ATQLEN_S 0 +#define PF_SB_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define PF_SB_ATQLEN_ATQVFE_S 28 +#define PF_SB_ATQLEN_ATQVFE_M BIT(28) +#define PF_SB_ATQLEN_ATQOVFL_S 29 +#define PF_SB_ATQLEN_ATQOVFL_M BIT(29) +#define PF_SB_ATQLEN_ATQCRIT_S 30 +#define PF_SB_ATQLEN_ATQCRIT_M BIT(30) +#define PF_SB_ATQLEN_ATQENABLE_S 31 +#define PF_SB_ATQLEN_ATQENABLE_M BIT(31) +#define PF_SB_ATQT 0x0022FE00 /* Reset Source: CORER */ +#define PF_SB_ATQT_ATQT_S 0 +#define PF_SB_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define PF_SB_REM_DEV_CTL 0x002300F0 /* Reset Source: CORER */ +#define PF_SB_REM_DEV_CTL_DEST_EN_S 0 +#define PF_SB_REM_DEV_CTL_DEST_EN_M ICE_M(0xFFFF, 0) +#define PF0_FW_HLP_ARQBAH 0x000801C8 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ARQBAH_ARQBAH_S 0 +#define PF0_FW_HLP_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_FW_HLP_ARQBAL 0x000800C8 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ARQBAL_ARQBAL_LSB_S 0 +#define PF0_FW_HLP_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_FW_HLP_ARQBAL_ARQBAL_S 6 +#define PF0_FW_HLP_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_FW_HLP_ARQH 0x000803C8 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ARQH_ARQH_S 0 +#define PF0_FW_HLP_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define PF0_FW_HLP_ARQLEN 0x000802C8 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ARQLEN_ARQLEN_S 0 +#define PF0_FW_HLP_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_FW_HLP_ARQLEN_ARQVFE_S 28 +#define PF0_FW_HLP_ARQLEN_ARQVFE_M BIT(28) +#define PF0_FW_HLP_ARQLEN_ARQOVFL_S 29 +#define PF0_FW_HLP_ARQLEN_ARQOVFL_M BIT(29) +#define PF0_FW_HLP_ARQLEN_ARQCRIT_S 30 +#define PF0_FW_HLP_ARQLEN_ARQCRIT_M BIT(30) +#define PF0_FW_HLP_ARQLEN_ARQENABLE_S 31 +#define PF0_FW_HLP_ARQLEN_ARQENABLE_M BIT(31) +#define PF0_FW_HLP_ARQT 0x000804C8 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ARQT_ARQT_S 0 +#define PF0_FW_HLP_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define PF0_FW_HLP_ATQBAH 0x00080148 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ATQBAH_ATQBAH_S 0 +#define PF0_FW_HLP_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_FW_HLP_ATQBAL 0x00080048 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ATQBAL_ATQBAL_LSB_S 0 +#define PF0_FW_HLP_ATQBAL_ATQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_FW_HLP_ATQBAL_ATQBAL_S 6 +#define PF0_FW_HLP_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_FW_HLP_ATQH 0x00080348 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ATQH_ATQH_S 0 +#define PF0_FW_HLP_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define PF0_FW_HLP_ATQLEN 0x00080248 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ATQLEN_ATQLEN_S 0 +#define PF0_FW_HLP_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_FW_HLP_ATQLEN_ATQVFE_S 28 +#define PF0_FW_HLP_ATQLEN_ATQVFE_M BIT(28) +#define PF0_FW_HLP_ATQLEN_ATQOVFL_S 29 +#define PF0_FW_HLP_ATQLEN_ATQOVFL_M BIT(29) +#define PF0_FW_HLP_ATQLEN_ATQCRIT_S 30 +#define PF0_FW_HLP_ATQLEN_ATQCRIT_M BIT(30) +#define 
PF0_FW_HLP_ATQLEN_ATQENABLE_S 31 +#define PF0_FW_HLP_ATQLEN_ATQENABLE_M BIT(31) +#define PF0_FW_HLP_ATQT 0x00080448 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ATQT_ATQT_S 0 +#define PF0_FW_HLP_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define PF0_FW_PSM_ARQBAH 0x000801C4 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ARQBAH_ARQBAH_S 0 +#define PF0_FW_PSM_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_FW_PSM_ARQBAL 0x000800C4 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ARQBAL_ARQBAL_LSB_S 0 +#define PF0_FW_PSM_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_FW_PSM_ARQBAL_ARQBAL_S 6 +#define PF0_FW_PSM_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_FW_PSM_ARQH 0x000803C4 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ARQH_ARQH_S 0 +#define PF0_FW_PSM_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define PF0_FW_PSM_ARQLEN 0x000802C4 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ARQLEN_ARQLEN_S 0 +#define PF0_FW_PSM_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_FW_PSM_ARQLEN_ARQVFE_S 28 +#define PF0_FW_PSM_ARQLEN_ARQVFE_M BIT(28) +#define PF0_FW_PSM_ARQLEN_ARQOVFL_S 29 +#define PF0_FW_PSM_ARQLEN_ARQOVFL_M BIT(29) +#define PF0_FW_PSM_ARQLEN_ARQCRIT_S 30 +#define PF0_FW_PSM_ARQLEN_ARQCRIT_M BIT(30) +#define PF0_FW_PSM_ARQLEN_ARQENABLE_S 31 +#define PF0_FW_PSM_ARQLEN_ARQENABLE_M BIT(31) +#define PF0_FW_PSM_ARQT 0x000804C4 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ARQT_ARQT_S 0 +#define PF0_FW_PSM_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define PF0_FW_PSM_ATQBAH 0x00080144 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ATQBAH_ATQBAH_S 0 +#define PF0_FW_PSM_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_FW_PSM_ATQBAL 0x00080044 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ATQBAL_ATQBAL_LSB_S 0 +#define PF0_FW_PSM_ATQBAL_ATQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_FW_PSM_ATQBAL_ATQBAL_S 6 +#define PF0_FW_PSM_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_FW_PSM_ATQH 0x00080344 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ATQH_ATQH_S 0 +#define PF0_FW_PSM_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define PF0_FW_PSM_ATQLEN 0x00080244 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ATQLEN_ATQLEN_S 0 +#define PF0_FW_PSM_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_FW_PSM_ATQLEN_ATQVFE_S 28 +#define PF0_FW_PSM_ATQLEN_ATQVFE_M BIT(28) +#define PF0_FW_PSM_ATQLEN_ATQOVFL_S 29 +#define PF0_FW_PSM_ATQLEN_ATQOVFL_M BIT(29) +#define PF0_FW_PSM_ATQLEN_ATQCRIT_S 30 +#define PF0_FW_PSM_ATQLEN_ATQCRIT_M BIT(30) +#define PF0_FW_PSM_ATQLEN_ATQENABLE_S 31 +#define PF0_FW_PSM_ATQLEN_ATQENABLE_M BIT(31) +#define PF0_FW_PSM_ATQT 0x00080444 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ATQT_ATQT_S 0 +#define PF0_FW_PSM_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define PF0_MBX_CPM_ARQBAH 0x0022E5D8 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ARQBAH_ARQBAH_S 0 +#define PF0_MBX_CPM_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_MBX_CPM_ARQBAL 0x0022E5D4 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ARQBAL_ARQBAL_LSB_S 0 +#define PF0_MBX_CPM_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_MBX_CPM_ARQBAL_ARQBAL_S 6 +#define PF0_MBX_CPM_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_MBX_CPM_ARQH 0x0022E5E0 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ARQH_ARQH_S 0 +#define PF0_MBX_CPM_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define PF0_MBX_CPM_ARQLEN 0x0022E5DC /* Reset Source: PFR */ +#define PF0_MBX_CPM_ARQLEN_ARQLEN_S 0 +#define PF0_MBX_CPM_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_MBX_CPM_ARQLEN_ARQVFE_S 28 +#define PF0_MBX_CPM_ARQLEN_ARQVFE_M BIT(28) +#define PF0_MBX_CPM_ARQLEN_ARQOVFL_S 29 +#define PF0_MBX_CPM_ARQLEN_ARQOVFL_M BIT(29) +#define PF0_MBX_CPM_ARQLEN_ARQCRIT_S 30 +#define 
PF0_MBX_CPM_ARQLEN_ARQCRIT_M BIT(30) +#define PF0_MBX_CPM_ARQLEN_ARQENABLE_S 31 +#define PF0_MBX_CPM_ARQLEN_ARQENABLE_M BIT(31) +#define PF0_MBX_CPM_ARQT 0x0022E5E4 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ARQT_ARQT_S 0 +#define PF0_MBX_CPM_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define PF0_MBX_CPM_ATQBAH 0x0022E5C4 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ATQBAH_ATQBAH_S 0 +#define PF0_MBX_CPM_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_MBX_CPM_ATQBAL 0x0022E5C0 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ATQBAL_ATQBAL_S 6 +#define PF0_MBX_CPM_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_MBX_CPM_ATQH 0x0022E5CC /* Reset Source: CORER */ +#define PF0_MBX_CPM_ATQH_ATQH_S 0 +#define PF0_MBX_CPM_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define PF0_MBX_CPM_ATQLEN 0x0022E5C8 /* Reset Source: PFR */ +#define PF0_MBX_CPM_ATQLEN_ATQLEN_S 0 +#define PF0_MBX_CPM_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_MBX_CPM_ATQLEN_ATQVFE_S 28 +#define PF0_MBX_CPM_ATQLEN_ATQVFE_M BIT(28) +#define PF0_MBX_CPM_ATQLEN_ATQOVFL_S 29 +#define PF0_MBX_CPM_ATQLEN_ATQOVFL_M BIT(29) +#define PF0_MBX_CPM_ATQLEN_ATQCRIT_S 30 +#define PF0_MBX_CPM_ATQLEN_ATQCRIT_M BIT(30) +#define PF0_MBX_CPM_ATQLEN_ATQENABLE_S 31 +#define PF0_MBX_CPM_ATQLEN_ATQENABLE_M BIT(31) +#define PF0_MBX_CPM_ATQT 0x0022E5D0 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ATQT_ATQT_S 0 +#define PF0_MBX_CPM_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define PF0_MBX_HLP_ARQBAH 0x0022E600 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ARQBAH_ARQBAH_S 0 +#define PF0_MBX_HLP_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_MBX_HLP_ARQBAL 0x0022E5FC /* Reset Source: CORER */ +#define PF0_MBX_HLP_ARQBAL_ARQBAL_LSB_S 0 +#define PF0_MBX_HLP_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_MBX_HLP_ARQBAL_ARQBAL_S 6 +#define PF0_MBX_HLP_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_MBX_HLP_ARQH 0x0022E608 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ARQH_ARQH_S 0 +#define PF0_MBX_HLP_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define PF0_MBX_HLP_ARQLEN 0x0022E604 /* Reset Source: PFR */ +#define PF0_MBX_HLP_ARQLEN_ARQLEN_S 0 +#define PF0_MBX_HLP_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_MBX_HLP_ARQLEN_ARQVFE_S 28 +#define PF0_MBX_HLP_ARQLEN_ARQVFE_M BIT(28) +#define PF0_MBX_HLP_ARQLEN_ARQOVFL_S 29 +#define PF0_MBX_HLP_ARQLEN_ARQOVFL_M BIT(29) +#define PF0_MBX_HLP_ARQLEN_ARQCRIT_S 30 +#define PF0_MBX_HLP_ARQLEN_ARQCRIT_M BIT(30) +#define PF0_MBX_HLP_ARQLEN_ARQENABLE_S 31 +#define PF0_MBX_HLP_ARQLEN_ARQENABLE_M BIT(31) +#define PF0_MBX_HLP_ARQT 0x0022E60C /* Reset Source: CORER */ +#define PF0_MBX_HLP_ARQT_ARQT_S 0 +#define PF0_MBX_HLP_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define PF0_MBX_HLP_ATQBAH 0x0022E5EC /* Reset Source: CORER */ +#define PF0_MBX_HLP_ATQBAH_ATQBAH_S 0 +#define PF0_MBX_HLP_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_MBX_HLP_ATQBAL 0x0022E5E8 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ATQBAL_ATQBAL_S 6 +#define PF0_MBX_HLP_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_MBX_HLP_ATQH 0x0022E5F4 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ATQH_ATQH_S 0 +#define PF0_MBX_HLP_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define PF0_MBX_HLP_ATQLEN 0x0022E5F0 /* Reset Source: PFR */ +#define PF0_MBX_HLP_ATQLEN_ATQLEN_S 0 +#define PF0_MBX_HLP_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_MBX_HLP_ATQLEN_ATQVFE_S 28 +#define PF0_MBX_HLP_ATQLEN_ATQVFE_M BIT(28) +#define PF0_MBX_HLP_ATQLEN_ATQOVFL_S 29 +#define PF0_MBX_HLP_ATQLEN_ATQOVFL_M BIT(29) +#define PF0_MBX_HLP_ATQLEN_ATQCRIT_S 30 +#define PF0_MBX_HLP_ATQLEN_ATQCRIT_M BIT(30) +#define PF0_MBX_HLP_ATQLEN_ATQENABLE_S 31 +#define 
PF0_MBX_HLP_ATQLEN_ATQENABLE_M BIT(31) +#define PF0_MBX_HLP_ATQT 0x0022E5F8 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ATQT_ATQT_S 0 +#define PF0_MBX_HLP_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define PF0_MBX_PSM_ARQBAH 0x0022E628 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ARQBAH_ARQBAH_S 0 +#define PF0_MBX_PSM_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_MBX_PSM_ARQBAL 0x0022E624 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ARQBAL_ARQBAL_LSB_S 0 +#define PF0_MBX_PSM_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_MBX_PSM_ARQBAL_ARQBAL_S 6 +#define PF0_MBX_PSM_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_MBX_PSM_ARQH 0x0022E630 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ARQH_ARQH_S 0 +#define PF0_MBX_PSM_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define PF0_MBX_PSM_ARQLEN 0x0022E62C /* Reset Source: PFR */ +#define PF0_MBX_PSM_ARQLEN_ARQLEN_S 0 +#define PF0_MBX_PSM_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_MBX_PSM_ARQLEN_ARQVFE_S 28 +#define PF0_MBX_PSM_ARQLEN_ARQVFE_M BIT(28) +#define PF0_MBX_PSM_ARQLEN_ARQOVFL_S 29 +#define PF0_MBX_PSM_ARQLEN_ARQOVFL_M BIT(29) +#define PF0_MBX_PSM_ARQLEN_ARQCRIT_S 30 +#define PF0_MBX_PSM_ARQLEN_ARQCRIT_M BIT(30) +#define PF0_MBX_PSM_ARQLEN_ARQENABLE_S 31 +#define PF0_MBX_PSM_ARQLEN_ARQENABLE_M BIT(31) +#define PF0_MBX_PSM_ARQT 0x0022E634 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ARQT_ARQT_S 0 +#define PF0_MBX_PSM_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define PF0_MBX_PSM_ATQBAH 0x0022E614 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ATQBAH_ATQBAH_S 0 +#define PF0_MBX_PSM_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_MBX_PSM_ATQBAL 0x0022E610 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ATQBAL_ATQBAL_S 6 +#define PF0_MBX_PSM_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_MBX_PSM_ATQH 0x0022E61C /* Reset Source: CORER */ +#define PF0_MBX_PSM_ATQH_ATQH_S 0 +#define PF0_MBX_PSM_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define PF0_MBX_PSM_ATQLEN 0x0022E618 /* Reset Source: PFR */ +#define PF0_MBX_PSM_ATQLEN_ATQLEN_S 0 +#define PF0_MBX_PSM_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_MBX_PSM_ATQLEN_ATQVFE_S 28 +#define PF0_MBX_PSM_ATQLEN_ATQVFE_M BIT(28) +#define PF0_MBX_PSM_ATQLEN_ATQOVFL_S 29 +#define PF0_MBX_PSM_ATQLEN_ATQOVFL_M BIT(29) +#define PF0_MBX_PSM_ATQLEN_ATQCRIT_S 30 +#define PF0_MBX_PSM_ATQLEN_ATQCRIT_M BIT(30) +#define PF0_MBX_PSM_ATQLEN_ATQENABLE_S 31 +#define PF0_MBX_PSM_ATQLEN_ATQENABLE_M BIT(31) +#define PF0_MBX_PSM_ATQT 0x0022E620 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ATQT_ATQT_S 0 +#define PF0_MBX_PSM_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_ARQBAH 0x0022E650 /* Reset Source: CORER */ +#define PF0_SB_CPM_ARQBAH_ARQBAH_S 0 +#define PF0_SB_CPM_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_SB_CPM_ARQBAL 0x0022E64C /* Reset Source: CORER */ +#define PF0_SB_CPM_ARQBAL_ARQBAL_LSB_S 0 +#define PF0_SB_CPM_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_SB_CPM_ARQBAL_ARQBAL_S 6 +#define PF0_SB_CPM_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_SB_CPM_ARQH 0x0022E658 /* Reset Source: CORER */ +#define PF0_SB_CPM_ARQH_ARQH_S 0 +#define PF0_SB_CPM_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_ARQLEN 0x0022E654 /* Reset Source: PFR */ +#define PF0_SB_CPM_ARQLEN_ARQLEN_S 0 +#define PF0_SB_CPM_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_ARQLEN_ARQVFE_S 28 +#define PF0_SB_CPM_ARQLEN_ARQVFE_M BIT(28) +#define PF0_SB_CPM_ARQLEN_ARQOVFL_S 29 +#define PF0_SB_CPM_ARQLEN_ARQOVFL_M BIT(29) +#define PF0_SB_CPM_ARQLEN_ARQCRIT_S 30 +#define PF0_SB_CPM_ARQLEN_ARQCRIT_M BIT(30) +#define PF0_SB_CPM_ARQLEN_ARQENABLE_S 31 +#define 
PF0_SB_CPM_ARQLEN_ARQENABLE_M BIT(31) +#define PF0_SB_CPM_ARQT 0x0022E65C /* Reset Source: CORER */ +#define PF0_SB_CPM_ARQT_ARQT_S 0 +#define PF0_SB_CPM_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_ATQBAH 0x0022E63C /* Reset Source: CORER */ +#define PF0_SB_CPM_ATQBAH_ATQBAH_S 0 +#define PF0_SB_CPM_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_SB_CPM_ATQBAL 0x0022E638 /* Reset Source: CORER */ +#define PF0_SB_CPM_ATQBAL_ATQBAL_S 6 +#define PF0_SB_CPM_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_SB_CPM_ATQH 0x0022E644 /* Reset Source: CORER */ +#define PF0_SB_CPM_ATQH_ATQH_S 0 +#define PF0_SB_CPM_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_ATQLEN 0x0022E640 /* Reset Source: PFR */ +#define PF0_SB_CPM_ATQLEN_ATQLEN_S 0 +#define PF0_SB_CPM_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_ATQLEN_ATQVFE_S 28 +#define PF0_SB_CPM_ATQLEN_ATQVFE_M BIT(28) +#define PF0_SB_CPM_ATQLEN_ATQOVFL_S 29 +#define PF0_SB_CPM_ATQLEN_ATQOVFL_M BIT(29) +#define PF0_SB_CPM_ATQLEN_ATQCRIT_S 30 +#define PF0_SB_CPM_ATQLEN_ATQCRIT_M BIT(30) +#define PF0_SB_CPM_ATQLEN_ATQENABLE_S 31 +#define PF0_SB_CPM_ATQLEN_ATQENABLE_M BIT(31) +#define PF0_SB_CPM_ATQT 0x0022E648 /* Reset Source: CORER */ +#define PF0_SB_CPM_ATQT_ATQT_S 0 +#define PF0_SB_CPM_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_REM_DEV_CTL 0x002300F4 /* Reset Source: CORER */ +#define PF0_SB_CPM_REM_DEV_CTL_DEST_EN_S 0 +#define PF0_SB_CPM_REM_DEV_CTL_DEST_EN_M ICE_M(0xFFFF, 0) +#define PF0_SB_HLP_ARQBAH 0x002300D8 /* Reset Source: CORER */ +#define PF0_SB_HLP_ARQBAH_ARQBAH_S 0 +#define PF0_SB_HLP_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_SB_HLP_ARQBAL 0x002300D4 /* Reset Source: CORER */ +#define PF0_SB_HLP_ARQBAL_ARQBAL_LSB_S 0 +#define PF0_SB_HLP_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_SB_HLP_ARQBAL_ARQBAL_S 6 +#define PF0_SB_HLP_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_SB_HLP_ARQH 0x002300E0 /* Reset Source: CORER */ +#define PF0_SB_HLP_ARQH_ARQH_S 0 +#define PF0_SB_HLP_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define PF0_SB_HLP_ARQLEN 0x002300DC /* Reset Source: PFR */ +#define PF0_SB_HLP_ARQLEN_ARQLEN_S 0 +#define PF0_SB_HLP_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_SB_HLP_ARQLEN_ARQVFE_S 28 +#define PF0_SB_HLP_ARQLEN_ARQVFE_M BIT(28) +#define PF0_SB_HLP_ARQLEN_ARQOVFL_S 29 +#define PF0_SB_HLP_ARQLEN_ARQOVFL_M BIT(29) +#define PF0_SB_HLP_ARQLEN_ARQCRIT_S 30 +#define PF0_SB_HLP_ARQLEN_ARQCRIT_M BIT(30) +#define PF0_SB_HLP_ARQLEN_ARQENABLE_S 31 +#define PF0_SB_HLP_ARQLEN_ARQENABLE_M BIT(31) +#define PF0_SB_HLP_ARQT 0x002300E4 /* Reset Source: CORER */ +#define PF0_SB_HLP_ARQT_ARQT_S 0 +#define PF0_SB_HLP_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define PF0_SB_HLP_ATQBAH 0x002300C4 /* Reset Source: CORER */ +#define PF0_SB_HLP_ATQBAH_ATQBAH_S 0 +#define PF0_SB_HLP_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_SB_HLP_ATQBAL 0x002300C0 /* Reset Source: CORER */ +#define PF0_SB_HLP_ATQBAL_ATQBAL_S 6 +#define PF0_SB_HLP_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_SB_HLP_ATQH 0x002300CC /* Reset Source: CORER */ +#define PF0_SB_HLP_ATQH_ATQH_S 0 +#define PF0_SB_HLP_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define PF0_SB_HLP_ATQLEN 0x002300C8 /* Reset Source: PFR */ +#define PF0_SB_HLP_ATQLEN_ATQLEN_S 0 +#define PF0_SB_HLP_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_SB_HLP_ATQLEN_ATQVFE_S 28 +#define PF0_SB_HLP_ATQLEN_ATQVFE_M BIT(28) +#define PF0_SB_HLP_ATQLEN_ATQOVFL_S 29 +#define PF0_SB_HLP_ATQLEN_ATQOVFL_M BIT(29) +#define PF0_SB_HLP_ATQLEN_ATQCRIT_S 30 +#define PF0_SB_HLP_ATQLEN_ATQCRIT_M BIT(30) +#define PF0_SB_HLP_ATQLEN_ATQENABLE_S 
31 +#define PF0_SB_HLP_ATQLEN_ATQENABLE_M BIT(31) +#define PF0_SB_HLP_ATQT 0x002300D0 /* Reset Source: CORER */ +#define PF0_SB_HLP_ATQT_ATQT_S 0 +#define PF0_SB_HLP_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define PF0_SB_HLP_REM_DEV_CTL 0x002300E8 /* Reset Source: CORER */ +#define PF0_SB_HLP_REM_DEV_CTL_DEST_EN_S 0 +#define PF0_SB_HLP_REM_DEV_CTL_DEST_EN_M ICE_M(0xFFFF, 0) +#define SB_REM_DEV_DEST(_i) (0x002300F8 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define SB_REM_DEV_DEST_MAX_INDEX 7 +#define SB_REM_DEV_DEST_DEST_S 0 +#define SB_REM_DEV_DEST_DEST_M ICE_M(0xF, 0) +#define SB_REM_DEV_DEST_DEST_VALID_S 31 +#define SB_REM_DEV_DEST_DEST_VALID_M BIT(31) +#define VF_MBX_ARQBAH(_VF) (0x0022B800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VF_MBX_ARQBAH_MAX_INDEX 255 +#define VF_MBX_ARQBAH_ARQBAH_S 0 +#define VF_MBX_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_ARQBAL(_VF) (0x0022B400 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VF_MBX_ARQBAL_MAX_INDEX 255 +#define VF_MBX_ARQBAL_ARQBAL_LSB_S 0 +#define VF_MBX_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_MBX_ARQBAL_ARQBAL_S 6 +#define VF_MBX_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_ARQH(_VF) (0x0022C000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VF_MBX_ARQH_MAX_INDEX 255 +#define VF_MBX_ARQH_ARQH_S 0 +#define VF_MBX_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define VF_MBX_ARQLEN(_VF) (0x0022BC00 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define VF_MBX_ARQLEN_MAX_INDEX 255 +#define VF_MBX_ARQLEN_ARQLEN_S 0 +#define VF_MBX_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_ARQLEN_ARQVFE_S 28 +#define VF_MBX_ARQLEN_ARQVFE_M BIT(28) +#define VF_MBX_ARQLEN_ARQOVFL_S 29 +#define VF_MBX_ARQLEN_ARQOVFL_M BIT(29) +#define VF_MBX_ARQLEN_ARQCRIT_S 30 +#define VF_MBX_ARQLEN_ARQCRIT_M BIT(30) +#define VF_MBX_ARQLEN_ARQENABLE_S 31 +#define VF_MBX_ARQLEN_ARQENABLE_M BIT(31) +#define VF_MBX_ARQT(_VF) (0x0022C400 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VF_MBX_ARQT_MAX_INDEX 255 +#define VF_MBX_ARQT_ARQT_S 0 +#define VF_MBX_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define VF_MBX_ATQBAH(_VF) (0x0022A400 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VF_MBX_ATQBAH_MAX_INDEX 255 +#define VF_MBX_ATQBAH_ATQBAH_S 0 +#define VF_MBX_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_ATQBAL(_VF) (0x0022A000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VF_MBX_ATQBAL_MAX_INDEX 255 +#define VF_MBX_ATQBAL_ATQBAL_S 6 +#define VF_MBX_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_ATQH(_VF) (0x0022AC00 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VF_MBX_ATQH_MAX_INDEX 255 +#define VF_MBX_ATQH_ATQH_S 0 +#define VF_MBX_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define VF_MBX_ATQLEN(_VF) (0x0022A800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define VF_MBX_ATQLEN_MAX_INDEX 255 +#define VF_MBX_ATQLEN_ATQLEN_S 0 +#define VF_MBX_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_ATQLEN_ATQVFE_S 28 +#define VF_MBX_ATQLEN_ATQVFE_M BIT(28) +#define VF_MBX_ATQLEN_ATQOVFL_S 29 +#define VF_MBX_ATQLEN_ATQOVFL_M BIT(29) +#define VF_MBX_ATQLEN_ATQCRIT_S 30 +#define VF_MBX_ATQLEN_ATQCRIT_M BIT(30) +#define VF_MBX_ATQLEN_ATQENABLE_S 31 +#define VF_MBX_ATQLEN_ATQENABLE_M BIT(31) +#define VF_MBX_ATQT(_VF) (0x0022B000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VF_MBX_ATQT_MAX_INDEX 255 +#define VF_MBX_ATQT_ATQT_S 0 +#define VF_MBX_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ARQBAH(_VF128) 
(0x0022D400 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_MBX_CPM_ARQBAH_MAX_INDEX 127 +#define VF_MBX_CPM_ARQBAH_ARQBAH_S 0 +#define VF_MBX_CPM_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_CPM_ARQBAL(_VF128) (0x0022D200 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_MBX_CPM_ARQBAL_MAX_INDEX 127 +#define VF_MBX_CPM_ARQBAL_ARQBAL_LSB_S 0 +#define VF_MBX_CPM_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_MBX_CPM_ARQBAL_ARQBAL_S 6 +#define VF_MBX_CPM_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_CPM_ARQH(_VF128) (0x0022D800 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_MBX_CPM_ARQH_MAX_INDEX 127 +#define VF_MBX_CPM_ARQH_ARQH_S 0 +#define VF_MBX_CPM_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ARQLEN(_VF128) (0x0022D600 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: PFR */ +#define VF_MBX_CPM_ARQLEN_MAX_INDEX 127 +#define VF_MBX_CPM_ARQLEN_ARQLEN_S 0 +#define VF_MBX_CPM_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ARQLEN_ARQVFE_S 28 +#define VF_MBX_CPM_ARQLEN_ARQVFE_M BIT(28) +#define VF_MBX_CPM_ARQLEN_ARQOVFL_S 29 +#define VF_MBX_CPM_ARQLEN_ARQOVFL_M BIT(29) +#define VF_MBX_CPM_ARQLEN_ARQCRIT_S 30 +#define VF_MBX_CPM_ARQLEN_ARQCRIT_M BIT(30) +#define VF_MBX_CPM_ARQLEN_ARQENABLE_S 31 +#define VF_MBX_CPM_ARQLEN_ARQENABLE_M BIT(31) +#define VF_MBX_CPM_ARQT(_VF128) (0x0022DA00 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_MBX_CPM_ARQT_MAX_INDEX 127 +#define VF_MBX_CPM_ARQT_ARQT_S 0 +#define VF_MBX_CPM_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ATQBAH(_VF128) (0x0022CA00 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_MBX_CPM_ATQBAH_MAX_INDEX 127 +#define VF_MBX_CPM_ATQBAH_ATQBAH_S 0 +#define VF_MBX_CPM_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_CPM_ATQBAL(_VF128) (0x0022C800 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_MBX_CPM_ATQBAL_MAX_INDEX 127 +#define VF_MBX_CPM_ATQBAL_ATQBAL_S 6 +#define VF_MBX_CPM_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_CPM_ATQH(_VF128) (0x0022CE00 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_MBX_CPM_ATQH_MAX_INDEX 127 +#define VF_MBX_CPM_ATQH_ATQH_S 0 +#define VF_MBX_CPM_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ATQLEN(_VF128) (0x0022CC00 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: PFR */ +#define VF_MBX_CPM_ATQLEN_MAX_INDEX 127 +#define VF_MBX_CPM_ATQLEN_ATQLEN_S 0 +#define VF_MBX_CPM_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ATQLEN_ATQVFE_S 28 +#define VF_MBX_CPM_ATQLEN_ATQVFE_M BIT(28) +#define VF_MBX_CPM_ATQLEN_ATQOVFL_S 29 +#define VF_MBX_CPM_ATQLEN_ATQOVFL_M BIT(29) +#define VF_MBX_CPM_ATQLEN_ATQCRIT_S 30 +#define VF_MBX_CPM_ATQLEN_ATQCRIT_M BIT(30) +#define VF_MBX_CPM_ATQLEN_ATQENABLE_S 31 +#define VF_MBX_CPM_ATQLEN_ATQENABLE_M BIT(31) +#define VF_MBX_CPM_ATQT(_VF128) (0x0022D000 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_MBX_CPM_ATQT_MAX_INDEX 127 +#define VF_MBX_CPM_ATQT_ATQT_S 0 +#define VF_MBX_CPM_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ARQBAH(_VF16) (0x0022DD80 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_HLP_ARQBAH_MAX_INDEX 15 +#define VF_MBX_HLP_ARQBAH_ARQBAH_S 0 +#define VF_MBX_HLP_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_HLP_ARQBAL(_VF16) (0x0022DD40 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_HLP_ARQBAL_MAX_INDEX 15 +#define VF_MBX_HLP_ARQBAL_ARQBAL_LSB_S 0 +#define 
VF_MBX_HLP_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_MBX_HLP_ARQBAL_ARQBAL_S 6 +#define VF_MBX_HLP_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_HLP_ARQH(_VF16) (0x0022DE00 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_HLP_ARQH_MAX_INDEX 15 +#define VF_MBX_HLP_ARQH_ARQH_S 0 +#define VF_MBX_HLP_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ARQLEN(_VF16) (0x0022DDC0 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: PFR */ +#define VF_MBX_HLP_ARQLEN_MAX_INDEX 15 +#define VF_MBX_HLP_ARQLEN_ARQLEN_S 0 +#define VF_MBX_HLP_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ARQLEN_ARQVFE_S 28 +#define VF_MBX_HLP_ARQLEN_ARQVFE_M BIT(28) +#define VF_MBX_HLP_ARQLEN_ARQOVFL_S 29 +#define VF_MBX_HLP_ARQLEN_ARQOVFL_M BIT(29) +#define VF_MBX_HLP_ARQLEN_ARQCRIT_S 30 +#define VF_MBX_HLP_ARQLEN_ARQCRIT_M BIT(30) +#define VF_MBX_HLP_ARQLEN_ARQENABLE_S 31 +#define VF_MBX_HLP_ARQLEN_ARQENABLE_M BIT(31) +#define VF_MBX_HLP_ARQT(_VF16) (0x0022DE40 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_HLP_ARQT_MAX_INDEX 15 +#define VF_MBX_HLP_ARQT_ARQT_S 0 +#define VF_MBX_HLP_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ATQBAH(_VF16) (0x0022DC40 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_HLP_ATQBAH_MAX_INDEX 15 +#define VF_MBX_HLP_ATQBAH_ATQBAH_S 0 +#define VF_MBX_HLP_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_HLP_ATQBAL(_VF16) (0x0022DC00 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_HLP_ATQBAL_MAX_INDEX 15 +#define VF_MBX_HLP_ATQBAL_ATQBAL_S 6 +#define VF_MBX_HLP_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_HLP_ATQH(_VF16) (0x0022DCC0 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_HLP_ATQH_MAX_INDEX 15 +#define VF_MBX_HLP_ATQH_ATQH_S 0 +#define VF_MBX_HLP_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ATQLEN(_VF16) (0x0022DC80 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: PFR */ +#define VF_MBX_HLP_ATQLEN_MAX_INDEX 15 +#define VF_MBX_HLP_ATQLEN_ATQLEN_S 0 +#define VF_MBX_HLP_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ATQLEN_ATQVFE_S 28 +#define VF_MBX_HLP_ATQLEN_ATQVFE_M BIT(28) +#define VF_MBX_HLP_ATQLEN_ATQOVFL_S 29 +#define VF_MBX_HLP_ATQLEN_ATQOVFL_M BIT(29) +#define VF_MBX_HLP_ATQLEN_ATQCRIT_S 30 +#define VF_MBX_HLP_ATQLEN_ATQCRIT_M BIT(30) +#define VF_MBX_HLP_ATQLEN_ATQENABLE_S 31 +#define VF_MBX_HLP_ATQLEN_ATQENABLE_M BIT(31) +#define VF_MBX_HLP_ATQT(_VF16) (0x0022DD00 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_HLP_ATQT_MAX_INDEX 15 +#define VF_MBX_HLP_ATQT_ATQT_S 0 +#define VF_MBX_HLP_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ARQBAH(_VF16) (0x0022E000 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_PSM_ARQBAH_MAX_INDEX 15 +#define VF_MBX_PSM_ARQBAH_ARQBAH_S 0 +#define VF_MBX_PSM_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_PSM_ARQBAL(_VF16) (0x0022DFC0 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_PSM_ARQBAL_MAX_INDEX 15 +#define VF_MBX_PSM_ARQBAL_ARQBAL_LSB_S 0 +#define VF_MBX_PSM_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_MBX_PSM_ARQBAL_ARQBAL_S 6 +#define VF_MBX_PSM_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_PSM_ARQH(_VF16) (0x0022E080 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_PSM_ARQH_MAX_INDEX 15 +#define VF_MBX_PSM_ARQH_ARQH_S 0 +#define VF_MBX_PSM_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ARQLEN(_VF16) (0x0022E040 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset 
Source: PFR */ +#define VF_MBX_PSM_ARQLEN_MAX_INDEX 15 +#define VF_MBX_PSM_ARQLEN_ARQLEN_S 0 +#define VF_MBX_PSM_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ARQLEN_ARQVFE_S 28 +#define VF_MBX_PSM_ARQLEN_ARQVFE_M BIT(28) +#define VF_MBX_PSM_ARQLEN_ARQOVFL_S 29 +#define VF_MBX_PSM_ARQLEN_ARQOVFL_M BIT(29) +#define VF_MBX_PSM_ARQLEN_ARQCRIT_S 30 +#define VF_MBX_PSM_ARQLEN_ARQCRIT_M BIT(30) +#define VF_MBX_PSM_ARQLEN_ARQENABLE_S 31 +#define VF_MBX_PSM_ARQLEN_ARQENABLE_M BIT(31) +#define VF_MBX_PSM_ARQT(_VF16) (0x0022E0C0 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_PSM_ARQT_MAX_INDEX 15 +#define VF_MBX_PSM_ARQT_ARQT_S 0 +#define VF_MBX_PSM_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ATQBAH(_VF16) (0x0022DEC0 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_PSM_ATQBAH_MAX_INDEX 15 +#define VF_MBX_PSM_ATQBAH_ATQBAH_S 0 +#define VF_MBX_PSM_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_PSM_ATQBAL(_VF16) (0x0022DE80 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_PSM_ATQBAL_MAX_INDEX 15 +#define VF_MBX_PSM_ATQBAL_ATQBAL_S 6 +#define VF_MBX_PSM_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_PSM_ATQH(_VF16) (0x0022DF40 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_PSM_ATQH_MAX_INDEX 15 +#define VF_MBX_PSM_ATQH_ATQH_S 0 +#define VF_MBX_PSM_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ATQLEN(_VF16) (0x0022DF00 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: PFR */ +#define VF_MBX_PSM_ATQLEN_MAX_INDEX 15 +#define VF_MBX_PSM_ATQLEN_ATQLEN_S 0 +#define VF_MBX_PSM_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ATQLEN_ATQVFE_S 28 +#define VF_MBX_PSM_ATQLEN_ATQVFE_M BIT(28) +#define VF_MBX_PSM_ATQLEN_ATQOVFL_S 29 +#define VF_MBX_PSM_ATQLEN_ATQOVFL_M BIT(29) +#define VF_MBX_PSM_ATQLEN_ATQCRIT_S 30 +#define VF_MBX_PSM_ATQLEN_ATQCRIT_M BIT(30) +#define VF_MBX_PSM_ATQLEN_ATQENABLE_S 31 +#define VF_MBX_PSM_ATQLEN_ATQENABLE_M BIT(31) +#define VF_MBX_PSM_ATQT(_VF16) (0x0022DF80 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_PSM_ATQT_MAX_INDEX 15 +#define VF_MBX_PSM_ATQT_ATQT_S 0 +#define VF_MBX_PSM_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ARQBAH(_VF128) (0x0022F400 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_SB_CPM_ARQBAH_MAX_INDEX 127 +#define VF_SB_CPM_ARQBAH_ARQBAH_S 0 +#define VF_SB_CPM_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_SB_CPM_ARQBAL(_VF128) (0x0022F200 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_SB_CPM_ARQBAL_MAX_INDEX 127 +#define VF_SB_CPM_ARQBAL_ARQBAL_LSB_S 0 +#define VF_SB_CPM_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_SB_CPM_ARQBAL_ARQBAL_S 6 +#define VF_SB_CPM_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_SB_CPM_ARQH(_VF128) (0x0022F800 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_SB_CPM_ARQH_MAX_INDEX 127 +#define VF_SB_CPM_ARQH_ARQH_S 0 +#define VF_SB_CPM_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ARQLEN(_VF128) (0x0022F600 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: PFR */ +#define VF_SB_CPM_ARQLEN_MAX_INDEX 127 +#define VF_SB_CPM_ARQLEN_ARQLEN_S 0 +#define VF_SB_CPM_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ARQLEN_ARQVFE_S 28 +#define VF_SB_CPM_ARQLEN_ARQVFE_M BIT(28) +#define VF_SB_CPM_ARQLEN_ARQOVFL_S 29 +#define VF_SB_CPM_ARQLEN_ARQOVFL_M BIT(29) +#define VF_SB_CPM_ARQLEN_ARQCRIT_S 30 +#define VF_SB_CPM_ARQLEN_ARQCRIT_M BIT(30) +#define VF_SB_CPM_ARQLEN_ARQENABLE_S 31 +#define 
VF_SB_CPM_ARQLEN_ARQENABLE_M BIT(31) +#define VF_SB_CPM_ARQT(_VF128) (0x0022FA00 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_SB_CPM_ARQT_MAX_INDEX 127 +#define VF_SB_CPM_ARQT_ARQT_S 0 +#define VF_SB_CPM_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ATQBAH(_VF128) (0x0022EA00 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_SB_CPM_ATQBAH_MAX_INDEX 127 +#define VF_SB_CPM_ATQBAH_ATQBAH_S 0 +#define VF_SB_CPM_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_SB_CPM_ATQBAL(_VF128) (0x0022E800 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_SB_CPM_ATQBAL_MAX_INDEX 127 +#define VF_SB_CPM_ATQBAL_ATQBAL_S 6 +#define VF_SB_CPM_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_SB_CPM_ATQH(_VF128) (0x0022EE00 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_SB_CPM_ATQH_MAX_INDEX 127 +#define VF_SB_CPM_ATQH_ATQH_S 0 +#define VF_SB_CPM_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ATQLEN(_VF128) (0x0022EC00 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: PFR */ +#define VF_SB_CPM_ATQLEN_MAX_INDEX 127 +#define VF_SB_CPM_ATQLEN_ATQLEN_S 0 +#define VF_SB_CPM_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ATQLEN_ATQVFE_S 28 +#define VF_SB_CPM_ATQLEN_ATQVFE_M BIT(28) +#define VF_SB_CPM_ATQLEN_ATQOVFL_S 29 +#define VF_SB_CPM_ATQLEN_ATQOVFL_M BIT(29) +#define VF_SB_CPM_ATQLEN_ATQCRIT_S 30 +#define VF_SB_CPM_ATQLEN_ATQCRIT_M BIT(30) +#define VF_SB_CPM_ATQLEN_ATQENABLE_S 31 +#define VF_SB_CPM_ATQLEN_ATQENABLE_M BIT(31) +#define VF_SB_CPM_ATQT(_VF128) (0x0022F000 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_SB_CPM_ATQT_MAX_INDEX 127 +#define VF_SB_CPM_ATQT_ATQT_S 0 +#define VF_SB_CPM_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_REM_DEV_CTL 0x002300EC /* Reset Source: CORER */ +#define VF_SB_CPM_REM_DEV_CTL_DEST_EN_S 0 +#define VF_SB_CPM_REM_DEV_CTL_DEST_EN_M ICE_M(0xFFFF, 0) +#define VP_MBX_CPM_PF_VF_CTRL(_VP128) (0x00231800 + ((_VP128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VP_MBX_CPM_PF_VF_CTRL_MAX_INDEX 127 +#define VP_MBX_CPM_PF_VF_CTRL_QUEUE_EN_S 0 +#define VP_MBX_CPM_PF_VF_CTRL_QUEUE_EN_M BIT(0) +#define VP_MBX_HLP_PF_VF_CTRL(_VP16) (0x00231A00 + ((_VP16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VP_MBX_HLP_PF_VF_CTRL_MAX_INDEX 15 +#define VP_MBX_HLP_PF_VF_CTRL_QUEUE_EN_S 0 +#define VP_MBX_HLP_PF_VF_CTRL_QUEUE_EN_M BIT(0) +#define VP_MBX_PF_VF_CTRL(_VSI) (0x00230800 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VP_MBX_PF_VF_CTRL_MAX_INDEX 767 +#define VP_MBX_PF_VF_CTRL_QUEUE_EN_S 0 +#define VP_MBX_PF_VF_CTRL_QUEUE_EN_M BIT(0) +#define VP_MBX_PSM_PF_VF_CTRL(_VP16) (0x00231A40 + ((_VP16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VP_MBX_PSM_PF_VF_CTRL_MAX_INDEX 15 +#define VP_MBX_PSM_PF_VF_CTRL_QUEUE_EN_S 0 +#define VP_MBX_PSM_PF_VF_CTRL_QUEUE_EN_M BIT(0) +#define VP_SB_CPM_PF_VF_CTRL(_VP128) (0x00231C00 + ((_VP128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VP_SB_CPM_PF_VF_CTRL_MAX_INDEX 127 +#define VP_SB_CPM_PF_VF_CTRL_QUEUE_EN_S 0 +#define VP_SB_CPM_PF_VF_CTRL_QUEUE_EN_M BIT(0) +#define GL_DCB_TDSCP2TC_BLOCK_DIS 0x00049218 /* Reset Source: CORER */ +#define GL_DCB_TDSCP2TC_BLOCK_DIS_DSCP2TC_BLOCK_DIS_S 0 +#define GL_DCB_TDSCP2TC_BLOCK_DIS_DSCP2TC_BLOCK_DIS_M BIT(0) +#define GL_DCB_TDSCP2TC_BLOCK_IPV4(_i) (0x00049018 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GL_DCB_TDSCP2TC_BLOCK_IPV4_MAX_INDEX 63 +#define GL_DCB_TDSCP2TC_BLOCK_IPV4_TC_BLOCK_LUT_S 0 +#define 
GL_DCB_TDSCP2TC_BLOCK_IPV4_TC_BLOCK_LUT_M ICE_M(0xFFFFFFFF, 0) +#define GL_DCB_TDSCP2TC_BLOCK_IPV6(_i) (0x00049118 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GL_DCB_TDSCP2TC_BLOCK_IPV6_MAX_INDEX 63 +#define GL_DCB_TDSCP2TC_BLOCK_IPV6_TC_BLOCK_LUT_S 0 +#define GL_DCB_TDSCP2TC_BLOCK_IPV6_TC_BLOCK_LUT_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_GENC 0x00083044 /* Reset Source: CORER */ +#define GLDCB_GENC_PCIRTT_S 0 +#define GLDCB_GENC_PCIRTT_M ICE_M(0xFFFF, 0) +#define GLDCB_PRS_RETSTCC(_i) (0x002000B0 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLDCB_PRS_RETSTCC_MAX_INDEX 31 +#define GLDCB_PRS_RETSTCC_BWSHARE_S 0 +#define GLDCB_PRS_RETSTCC_BWSHARE_M ICE_M(0x7F, 0) +#define GLDCB_PRS_RETSTCC_ETSTC_S 31 +#define GLDCB_PRS_RETSTCC_ETSTC_M BIT(31) +#define GLDCB_PRS_RSPMC 0x00200160 /* Reset Source: CORER */ +#define GLDCB_PRS_RSPMC_RSPM_S 0 +#define GLDCB_PRS_RSPMC_RSPM_M ICE_M(0xFF, 0) +#define GLDCB_PRS_RSPMC_RPM_MODE_S 8 +#define GLDCB_PRS_RSPMC_RPM_MODE_M ICE_M(0x3, 8) +#define GLDCB_PRS_RSPMC_PRR_MAX_EXP_S 10 +#define GLDCB_PRS_RSPMC_PRR_MAX_EXP_M ICE_M(0xF, 10) +#define GLDCB_PRS_RSPMC_PFCTIMER_S 14 +#define GLDCB_PRS_RSPMC_PFCTIMER_M ICE_M(0x3FFF, 14) +#define GLDCB_PRS_RSPMC_RPM_DIS_S 31 +#define GLDCB_PRS_RSPMC_RPM_DIS_M BIT(31) +#define GLDCB_RETSTCC(_i) (0x00122140 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLDCB_RETSTCC_MAX_INDEX 31 +#define GLDCB_RETSTCC_BWSHARE_S 0 +#define GLDCB_RETSTCC_BWSHARE_M ICE_M(0x7F, 0) +#define GLDCB_RETSTCC_ETSTC_S 31 +#define GLDCB_RETSTCC_ETSTC_M BIT(31) +#define GLDCB_RETSTCS(_i) (0x001221C0 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLDCB_RETSTCS_MAX_INDEX 31 +#define GLDCB_RETSTCS_CREDITS_S 0 +#define GLDCB_RETSTCS_CREDITS_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_RTC2PFC_RCB 0x00122100 /* Reset Source: CORER */ +#define GLDCB_RTC2PFC_RCB_TC2PFC_S 0 +#define GLDCB_RTC2PFC_RCB_TC2PFC_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_SWT_RETSTCC(_i) (0x0020A040 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLDCB_SWT_RETSTCC_MAX_INDEX 31 +#define GLDCB_SWT_RETSTCC_BWSHARE_S 0 +#define GLDCB_SWT_RETSTCC_BWSHARE_M ICE_M(0x7F, 0) +#define GLDCB_SWT_RETSTCC_ETSTC_S 31 +#define GLDCB_SWT_RETSTCC_ETSTC_M BIT(31) +#define GLDCB_TC2PFC 0x001D2694 /* Reset Source: CORER */ +#define GLDCB_TC2PFC_TC2PFC_S 0 +#define GLDCB_TC2PFC_TC2PFC_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_TCB_MNG_SP 0x000AE12C /* Reset Source: CORER */ +#define GLDCB_TCB_MNG_SP_MNG_SP_S 0 +#define GLDCB_TCB_MNG_SP_MNG_SP_M BIT(0) +#define GLDCB_TCB_TCLL_CFG 0x000AE134 /* Reset Source: CORER */ +#define GLDCB_TCB_TCLL_CFG_LLTC_S 0 +#define GLDCB_TCB_TCLL_CFG_LLTC_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_TCB_WB_SP 0x000AE310 /* Reset Source: CORER */ +#define GLDCB_TCB_WB_SP_WB_SP_S 0 +#define GLDCB_TCB_WB_SP_WB_SP_M BIT(0) +#define GLDCB_TCUPM_IMM_EN 0x000BC824 /* Reset Source: CORER */ +#define GLDCB_TCUPM_IMM_EN_IMM_EN_S 0 +#define GLDCB_TCUPM_IMM_EN_IMM_EN_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_TCUPM_LEGACY_TC 0x000BC828 /* Reset Source: CORER */ +#define GLDCB_TCUPM_LEGACY_TC_LEGTC_S 0 +#define GLDCB_TCUPM_LEGACY_TC_LEGTC_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_TCUPM_NO_EXCEED_DIS 0x000BC830 /* Reset Source: CORER */ +#define GLDCB_TCUPM_NO_EXCEED_DIS_NON_EXCEED_DIS_S 0 +#define GLDCB_TCUPM_NO_EXCEED_DIS_NON_EXCEED_DIS_M BIT(0) +#define GLDCB_TCUPM_WB_DIS 0x000BC834 /* Reset Source: CORER */ +#define GLDCB_TCUPM_WB_DIS_PORT_DISABLE_S 0 +#define GLDCB_TCUPM_WB_DIS_PORT_DISABLE_M BIT(0) +#define 
GLDCB_TCUPM_WB_DIS_TC_DISABLE_S 1 +#define GLDCB_TCUPM_WB_DIS_TC_DISABLE_M BIT(1) +#define GLDCB_TFPFCI 0x0009949C /* Reset Source: CORER */ +#define GLDCB_TFPFCI_GLDCB_TFPFCI_S 0 +#define GLDCB_TFPFCI_GLDCB_TFPFCI_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_TLPM_IMM_TCB 0x000A0190 /* Reset Source: CORER */ +#define GLDCB_TLPM_IMM_TCB_IMM_EN_S 0 +#define GLDCB_TLPM_IMM_TCB_IMM_EN_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_TLPM_IMM_TCUPM 0x000A018C /* Reset Source: CORER */ +#define GLDCB_TLPM_IMM_TCUPM_IMM_EN_S 0 +#define GLDCB_TLPM_IMM_TCUPM_IMM_EN_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_TLPM_PCI_DM 0x000A0180 /* Reset Source: CORER */ +#define GLDCB_TLPM_PCI_DM_MONITOR_S 0 +#define GLDCB_TLPM_PCI_DM_MONITOR_M ICE_M(0x7FFFF, 0) +#define GLDCB_TLPM_PCI_DTHR 0x000A0184 /* Reset Source: CORER */ +#define GLDCB_TLPM_PCI_DTHR_PCI_TDATA_S 0 +#define GLDCB_TLPM_PCI_DTHR_PCI_TDATA_M ICE_M(0xFFF, 0) +#define GLDCB_TPB_IMM_TLPM 0x00099468 /* Reset Source: CORER */ +#define GLDCB_TPB_IMM_TLPM_IMM_EN_S 0 +#define GLDCB_TPB_IMM_TLPM_IMM_EN_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_TPB_IMM_TPB 0x0009946C /* Reset Source: CORER */ +#define GLDCB_TPB_IMM_TPB_IMM_EN_S 0 +#define GLDCB_TPB_IMM_TPB_IMM_EN_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_TPB_TCLL_CFG 0x00099464 /* Reset Source: CORER */ +#define GLDCB_TPB_TCLL_CFG_LLTC_S 0 +#define GLDCB_TPB_TCLL_CFG_LLTC_M ICE_M(0xFFFFFFFF, 0) +#define GLTCB_BULK_DWRR_REG_QUANTA 0x000AE0E0 /* Reset Source: CORER */ +#define GLTCB_BULK_DWRR_REG_QUANTA_QUANTA_S 0 +#define GLTCB_BULK_DWRR_REG_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define GLTCB_BULK_DWRR_REG_SAT 0x000AE0F0 /* Reset Source: CORER */ +#define GLTCB_BULK_DWRR_REG_SAT_SATURATION_S 0 +#define GLTCB_BULK_DWRR_REG_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define GLTCB_BULK_DWRR_WB_QUANTA 0x000AE0E4 /* Reset Source: CORER */ +#define GLTCB_BULK_DWRR_WB_QUANTA_QUANTA_S 0 +#define GLTCB_BULK_DWRR_WB_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define GLTCB_BULK_DWRR_WB_SAT 0x000AE0F4 /* Reset Source: CORER */ +#define GLTCB_BULK_DWRR_WB_SAT_SATURATION_S 0 +#define GLTCB_BULK_DWRR_WB_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define GLTCB_CREDIT_EXP_CTL 0x000AE120 /* Reset Source: CORER */ +#define GLTCB_CREDIT_EXP_CTL_EN_S 0 +#define GLTCB_CREDIT_EXP_CTL_EN_M BIT(0) +#define GLTCB_CREDIT_EXP_CTL_MIN_PKT_S 1 +#define GLTCB_CREDIT_EXP_CTL_MIN_PKT_M ICE_M(0x1FF, 1) +#define GLTCB_LL_DWRR_REG_QUANTA 0x000AE0E8 /* Reset Source: CORER */ +#define GLTCB_LL_DWRR_REG_QUANTA_QUANTA_S 0 +#define GLTCB_LL_DWRR_REG_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define GLTCB_LL_DWRR_REG_SAT 0x000AE0F8 /* Reset Source: CORER */ +#define GLTCB_LL_DWRR_REG_SAT_SATURATION_S 0 +#define GLTCB_LL_DWRR_REG_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define GLTCB_LL_DWRR_WB_QUANTA 0x000AE0EC /* Reset Source: CORER */ +#define GLTCB_LL_DWRR_WB_QUANTA_QUANTA_S 0 +#define GLTCB_LL_DWRR_WB_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define GLTCB_LL_DWRR_WB_SAT 0x000AE0FC /* Reset Source: CORER */ +#define GLTCB_LL_DWRR_WB_SAT_SATURATION_S 0 +#define GLTCB_LL_DWRR_WB_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define GLTCB_WB_RL 0x000AE238 /* Reset Source: CORER */ +#define GLTCB_WB_RL_PERIOD_S 0 +#define GLTCB_WB_RL_PERIOD_M ICE_M(0xFFFF, 0) +#define GLTCB_WB_RL_EN_S 16 +#define GLTCB_WB_RL_EN_M BIT(16) +#define GLTPB_WB_RL 0x00099460 /* Reset Source: CORER */ +#define GLTPB_WB_RL_PERIOD_S 0 +#define GLTPB_WB_RL_PERIOD_M ICE_M(0xFFFF, 0) +#define GLTPB_WB_RL_EN_S 16 +#define GLTPB_WB_RL_EN_M BIT(16) +#define PRTDCB_FCCFG 0x001E4640 /* Reset Source: GLOBR */ +#define PRTDCB_FCCFG_TFCE_S 3 +#define PRTDCB_FCCFG_TFCE_M 
ICE_M(0x3, 3)
+#define PRTDCB_FCRTV 0x001E4600 /* Reset Source: GLOBR */
+#define PRTDCB_FCRTV_FC_REFRESH_TH_S 0
+#define PRTDCB_FCRTV_FC_REFRESH_TH_M ICE_M(0xFFFF, 0)
+#define PRTDCB_FCTTVN(_i) (0x001E4580 + ((_i) * 32)) /* _i=0...3 */ /* Reset Source: GLOBR */
+#define PRTDCB_FCTTVN_MAX_INDEX 3
+#define PRTDCB_FCTTVN_TTV_2N_S 0
+#define PRTDCB_FCTTVN_TTV_2N_M ICE_M(0xFFFF, 0)
+#define PRTDCB_FCTTVN_TTV_2N_P1_S 16
+#define PRTDCB_FCTTVN_TTV_2N_P1_M ICE_M(0xFFFF, 16)
+#define PRTDCB_GENC 0x00083000 /* Reset Source: CORER */
+#define PRTDCB_GENC_NUMTC_S 2
+#define PRTDCB_GENC_NUMTC_M ICE_M(0xF, 2)
+#define PRTDCB_GENC_FCOEUP_S 6
+#define PRTDCB_GENC_FCOEUP_M ICE_M(0x7, 6)
+#define PRTDCB_GENC_FCOEUP_VALID_S 9
+#define PRTDCB_GENC_FCOEUP_VALID_M BIT(9)
+#define PRTDCB_GENC_PFCLDA_S 16
+#define PRTDCB_GENC_PFCLDA_M ICE_M(0xFFFF, 16)
+#define PRTDCB_GENS 0x00083020 /* Reset Source: CORER */
 #define PRTDCB_GENS_DCBX_STATUS_S 0
 #define PRTDCB_GENS_DCBX_STATUS_M ICE_M(0x7, 0)
-#define GL_PREEXT_L2_PMASK0(_i) (0x0020F0FC + ((_i) * 4))
-#define GL_PREEXT_L2_PMASK1(_i) (0x0020F108 + ((_i) * 4))
-#define GLFLXP_RXDID_FLAGS(_i, _j) (0x0045D000 + ((_i) * 4 + (_j) * 256))
+#define PRTDCB_PRS_RETSC 0x002001A0 /* Reset Source: CORER */
+#define PRTDCB_PRS_RETSC_ETS_MODE_S 0
+#define PRTDCB_PRS_RETSC_ETS_MODE_M BIT(0)
+#define PRTDCB_PRS_RETSC_NON_ETS_MODE_S 1
+#define PRTDCB_PRS_RETSC_NON_ETS_MODE_M BIT(1)
+#define PRTDCB_PRS_RETSC_ETS_MAX_EXP_S 2
+#define PRTDCB_PRS_RETSC_ETS_MAX_EXP_M ICE_M(0xF, 2)
+#define PRTDCB_PRS_RPRRC 0x00200180 /* Reset Source: CORER */
+#define PRTDCB_PRS_RPRRC_BWSHARE_S 0
+#define PRTDCB_PRS_RPRRC_BWSHARE_M ICE_M(0x3FF, 0)
+#define PRTDCB_PRS_RPRRC_BWSHARE_DIS_S 31
+#define PRTDCB_PRS_RPRRC_BWSHARE_DIS_M BIT(31)
+#define PRTDCB_RETSC 0x001222A0 /* Reset Source: CORER */
+#define PRTDCB_RETSC_ETS_MODE_S 0
+#define PRTDCB_RETSC_ETS_MODE_M BIT(0)
+#define PRTDCB_RETSC_NON_ETS_MODE_S 1
+#define PRTDCB_RETSC_NON_ETS_MODE_M BIT(1)
+#define PRTDCB_RETSC_ETS_MAX_EXP_S 2
+#define PRTDCB_RETSC_ETS_MAX_EXP_M ICE_M(0xF, 2)
+#define PRTDCB_RPRRC 0x001220C0 /* Reset Source: CORER */
+#define PRTDCB_RPRRC_BWSHARE_S 0
+#define PRTDCB_RPRRC_BWSHARE_M ICE_M(0x3FF, 0)
+#define PRTDCB_RPRRC_BWSHARE_DIS_S 31
+#define PRTDCB_RPRRC_BWSHARE_DIS_M BIT(31)
+#define PRTDCB_RPRRS 0x001220E0 /* Reset Source: CORER */
+#define PRTDCB_RPRRS_CREDITS_S 0
+#define PRTDCB_RPRRS_CREDITS_M ICE_M(0xFFFFFFFF, 0)
+#define PRTDCB_RUP_TDPU 0x00040960 /* Reset Source: CORER */
+#define PRTDCB_RUP_TDPU_NOVLANUP_S 0
+#define PRTDCB_RUP_TDPU_NOVLANUP_M ICE_M(0x7, 0)
+#define PRTDCB_RUP2TC 0x001D2640 /* Reset Source: CORER */
+#define PRTDCB_RUP2TC_UP0TC_S 0
+#define PRTDCB_RUP2TC_UP0TC_M ICE_M(0x7, 0)
+#define PRTDCB_RUP2TC_UP1TC_S 3
+#define PRTDCB_RUP2TC_UP1TC_M ICE_M(0x7, 3)
+#define PRTDCB_RUP2TC_UP2TC_S 6
+#define PRTDCB_RUP2TC_UP2TC_M ICE_M(0x7, 6)
+#define PRTDCB_RUP2TC_UP3TC_S 9
+#define PRTDCB_RUP2TC_UP3TC_M ICE_M(0x7, 9)
+#define PRTDCB_RUP2TC_UP4TC_S 12
+#define PRTDCB_RUP2TC_UP4TC_M ICE_M(0x7, 12)
+#define PRTDCB_RUP2TC_UP5TC_S 15
+#define PRTDCB_RUP2TC_UP5TC_M ICE_M(0x7, 15)
+#define PRTDCB_RUP2TC_UP6TC_S 18
+#define PRTDCB_RUP2TC_UP6TC_M ICE_M(0x7, 18)
+#define PRTDCB_RUP2TC_UP7TC_S 21
+#define PRTDCB_RUP2TC_UP7TC_M ICE_M(0x7, 21)
+#define PRTDCB_SWT_RETSC 0x0020A140 /* Reset Source: CORER */
+#define PRTDCB_SWT_RETSC_ETS_MODE_S 0
+#define PRTDCB_SWT_RETSC_ETS_MODE_M BIT(0)
+#define PRTDCB_SWT_RETSC_NON_ETS_MODE_S 1
+#define PRTDCB_SWT_RETSC_NON_ETS_MODE_M BIT(1)
+#define PRTDCB_SWT_RETSC_ETS_MAX_EXP_S 2
+#define PRTDCB_SWT_RETSC_ETS_MAX_EXP_M ICE_M(0xF, 2) +#define PRTDCB_TCB_DWRR_CREDITS 0x000AE000 /* Reset Source: CORER */ +#define PRTDCB_TCB_DWRR_CREDITS_CREDITS_S 0 +#define PRTDCB_TCB_DWRR_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define PRTDCB_TCB_DWRR_QUANTA 0x000AE020 /* Reset Source: CORER */ +#define PRTDCB_TCB_DWRR_QUANTA_QUANTA_S 0 +#define PRTDCB_TCB_DWRR_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define PRTDCB_TCB_DWRR_SAT 0x000AE040 /* Reset Source: CORER */ +#define PRTDCB_TCB_DWRR_SAT_SATURATION_S 0 +#define PRTDCB_TCB_DWRR_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define PRTDCB_TCUPM_NO_EXCEED_DM 0x000BC3C0 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_NO_EXCEED_DM_MONITOR_S 0 +#define PRTDCB_TCUPM_NO_EXCEED_DM_MONITOR_M ICE_M(0x7FFFF, 0) +#define PRTDCB_TCUPM_REG_CM 0x000BC360 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_REG_CM_MONITOR_S 0 +#define PRTDCB_TCUPM_REG_CM_MONITOR_M ICE_M(0x7FFF, 0) +#define PRTDCB_TCUPM_REG_CTHR 0x000BC380 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_REG_CTHR_PORTOFFTH_H_S 0 +#define PRTDCB_TCUPM_REG_CTHR_PORTOFFTH_H_M ICE_M(0x7FFF, 0) +#define PRTDCB_TCUPM_REG_CTHR_PORTOFFTH_L_S 15 +#define PRTDCB_TCUPM_REG_CTHR_PORTOFFTH_L_M ICE_M(0x7FFF, 15) +#define PRTDCB_TCUPM_REG_DM 0x000BC3A0 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_REG_DM_MONITOR_S 0 +#define PRTDCB_TCUPM_REG_DM_MONITOR_M ICE_M(0x7FFFF, 0) +#define PRTDCB_TCUPM_REG_DTHR 0x000BC3E0 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_REG_DTHR_PORTOFFTH_H_S 0 +#define PRTDCB_TCUPM_REG_DTHR_PORTOFFTH_H_M ICE_M(0xFFF, 0) +#define PRTDCB_TCUPM_REG_DTHR_PORTOFFTH_L_S 12 +#define PRTDCB_TCUPM_REG_DTHR_PORTOFFTH_L_M ICE_M(0xFFF, 12) +#define PRTDCB_TCUPM_REG_PE_HB_DM 0x000BC400 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_REG_PE_HB_DM_MONITOR_S 0 +#define PRTDCB_TCUPM_REG_PE_HB_DM_MONITOR_M ICE_M(0xFFF, 0) +#define PRTDCB_TCUPM_REG_PE_HB_DTHR 0x000BC420 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_REG_PE_HB_DTHR_PORTOFFTH_H_S 0 +#define PRTDCB_TCUPM_REG_PE_HB_DTHR_PORTOFFTH_H_M ICE_M(0xFFF, 0) +#define PRTDCB_TCUPM_REG_PE_HB_DTHR_PORTOFFTH_L_S 12 +#define PRTDCB_TCUPM_REG_PE_HB_DTHR_PORTOFFTH_L_M ICE_M(0xFFF, 12) +#define PRTDCB_TCUPM_WAIT_PFC_CM 0x000BC440 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_WAIT_PFC_CM_MONITOR_S 0 +#define PRTDCB_TCUPM_WAIT_PFC_CM_MONITOR_M ICE_M(0x7FFF, 0) +#define PRTDCB_TCUPM_WAIT_PFC_CTHR 0x000BC460 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_WAIT_PFC_CTHR_PORTOFFTH_S 0 +#define PRTDCB_TCUPM_WAIT_PFC_CTHR_PORTOFFTH_M ICE_M(0x7FFF, 0) +#define PRTDCB_TCUPM_WAIT_PFC_DM 0x000BC480 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_WAIT_PFC_DM_MONITOR_S 0 +#define PRTDCB_TCUPM_WAIT_PFC_DM_MONITOR_M ICE_M(0x7FFFF, 0) +#define PRTDCB_TCUPM_WAIT_PFC_DTHR 0x000BC4A0 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_WAIT_PFC_DTHR_PORTOFFTH_S 0 +#define PRTDCB_TCUPM_WAIT_PFC_DTHR_PORTOFFTH_M ICE_M(0xFFF, 0) +#define PRTDCB_TCUPM_WAIT_PFC_PE_HB_DM 0x000BC4C0 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_WAIT_PFC_PE_HB_DM_MONITOR_S 0 +#define PRTDCB_TCUPM_WAIT_PFC_PE_HB_DM_MONITOR_M ICE_M(0xFFF, 0) +#define PRTDCB_TCUPM_WAIT_PFC_PE_HB_DTHR 0x000BC4E0 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_WAIT_PFC_PE_HB_DTHR_PORTOFFTH_S 0 +#define PRTDCB_TCUPM_WAIT_PFC_PE_HB_DTHR_PORTOFFTH_M ICE_M(0xFFF, 0) +#define PRTDCB_TDPUC 0x00040940 /* Reset Source: CORER */ +#define PRTDCB_TDPUC_MAX_TXFRAME_S 0 +#define PRTDCB_TDPUC_MAX_TXFRAME_M ICE_M(0xFFFF, 0) +#define PRTDCB_TDPUC_MAL_LENGTH_S 16 +#define PRTDCB_TDPUC_MAL_LENGTH_M BIT(16) +#define PRTDCB_TDPUC_MAL_CMD_S 17 +#define 
PRTDCB_TDPUC_MAL_CMD_M BIT(17) +#define PRTDCB_TDPUC_TTL_DROP_S 18 +#define PRTDCB_TDPUC_TTL_DROP_M BIT(18) +#define PRTDCB_TDPUC_UR_DROP_S 19 +#define PRTDCB_TDPUC_UR_DROP_M BIT(19) +#define PRTDCB_TDPUC_DUMMY_S 20 +#define PRTDCB_TDPUC_DUMMY_M BIT(20) +#define PRTDCB_TDPUC_BIG_PKT_SIZE_S 21 +#define PRTDCB_TDPUC_BIG_PKT_SIZE_M BIT(21) +#define PRTDCB_TDPUC_L2_ACCEPT_FAIL_S 22 +#define PRTDCB_TDPUC_L2_ACCEPT_FAIL_M BIT(22) +#define PRTDCB_TDPUC_DSCP_CHECK_FAIL_S 23 +#define PRTDCB_TDPUC_DSCP_CHECK_FAIL_M BIT(23) +#define PRTDCB_TDPUC_RCU_ANTISPOOF_S 24 +#define PRTDCB_TDPUC_RCU_ANTISPOOF_M BIT(24) +#define PRTDCB_TDPUC_NIC_DSI_S 25 +#define PRTDCB_TDPUC_NIC_DSI_M BIT(25) +#define PRTDCB_TDPUC_NIC_IPSEC_S 26 +#define PRTDCB_TDPUC_NIC_IPSEC_M BIT(26) +#define PRTDCB_TDPUC_CLEAR_DROP_S 31 +#define PRTDCB_TDPUC_CLEAR_DROP_M BIT(31) +#define PRTDCB_TFCS 0x001E4560 /* Reset Source: GLOBR */ +#define PRTDCB_TFCS_TXOFF_S 0 +#define PRTDCB_TFCS_TXOFF_M BIT(0) +#define PRTDCB_TFCS_TXOFF0_S 8 +#define PRTDCB_TFCS_TXOFF0_M BIT(8) +#define PRTDCB_TFCS_TXOFF1_S 9 +#define PRTDCB_TFCS_TXOFF1_M BIT(9) +#define PRTDCB_TFCS_TXOFF2_S 10 +#define PRTDCB_TFCS_TXOFF2_M BIT(10) +#define PRTDCB_TFCS_TXOFF3_S 11 +#define PRTDCB_TFCS_TXOFF3_M BIT(11) +#define PRTDCB_TFCS_TXOFF4_S 12 +#define PRTDCB_TFCS_TXOFF4_M BIT(12) +#define PRTDCB_TFCS_TXOFF5_S 13 +#define PRTDCB_TFCS_TXOFF5_M BIT(13) +#define PRTDCB_TFCS_TXOFF6_S 14 +#define PRTDCB_TFCS_TXOFF6_M BIT(14) +#define PRTDCB_TFCS_TXOFF7_S 15 +#define PRTDCB_TFCS_TXOFF7_M BIT(15) +#define PRTDCB_TLPM_REG_DM 0x000A0000 /* Reset Source: CORER */ +#define PRTDCB_TLPM_REG_DM_MONITOR_S 0 +#define PRTDCB_TLPM_REG_DM_MONITOR_M ICE_M(0x7FFFF, 0) +#define PRTDCB_TLPM_REG_DTHR 0x000A0020 /* Reset Source: CORER */ +#define PRTDCB_TLPM_REG_DTHR_PORTOFFTH_H_S 0 +#define PRTDCB_TLPM_REG_DTHR_PORTOFFTH_H_M ICE_M(0xFFF, 0) +#define PRTDCB_TLPM_REG_DTHR_PORTOFFTH_L_S 12 +#define PRTDCB_TLPM_REG_DTHR_PORTOFFTH_L_M ICE_M(0xFFF, 12) +#define PRTDCB_TLPM_WAIT_PFC_DM 0x000A0040 /* Reset Source: CORER */ +#define PRTDCB_TLPM_WAIT_PFC_DM_MONITOR_S 0 +#define PRTDCB_TLPM_WAIT_PFC_DM_MONITOR_M ICE_M(0x7FFFF, 0) +#define PRTDCB_TLPM_WAIT_PFC_DTHR 0x000A0060 /* Reset Source: CORER */ +#define PRTDCB_TLPM_WAIT_PFC_DTHR_PORTOFFTH_S 0 +#define PRTDCB_TLPM_WAIT_PFC_DTHR_PORTOFFTH_M ICE_M(0xFFF, 0) +#define PRTDCB_TPFCTS(_i) (0x001E4660 + ((_i) * 32)) /* _i=0...7 */ /* Reset Source: GLOBR */ +#define PRTDCB_TPFCTS_MAX_INDEX 7 +#define PRTDCB_TPFCTS_PFCTIMER_S 0 +#define PRTDCB_TPFCTS_PFCTIMER_M ICE_M(0x3FFF, 0) +#define PRTDCB_TUP2TC 0x001D26C0 /* Reset Source: CORER */ +#define PRTDCB_TUP2TC_UP0TC_S 0 +#define PRTDCB_TUP2TC_UP0TC_M ICE_M(0x7, 0) +#define PRTDCB_TUP2TC_UP1TC_S 3 +#define PRTDCB_TUP2TC_UP1TC_M ICE_M(0x7, 3) +#define PRTDCB_TUP2TC_UP2TC_S 6 +#define PRTDCB_TUP2TC_UP2TC_M ICE_M(0x7, 6) +#define PRTDCB_TUP2TC_UP3TC_S 9 +#define PRTDCB_TUP2TC_UP3TC_M ICE_M(0x7, 9) +#define PRTDCB_TUP2TC_UP4TC_S 12 +#define PRTDCB_TUP2TC_UP4TC_M ICE_M(0x7, 12) +#define PRTDCB_TUP2TC_UP5TC_S 15 +#define PRTDCB_TUP2TC_UP5TC_M ICE_M(0x7, 15) +#define PRTDCB_TUP2TC_UP6TC_S 18 +#define PRTDCB_TUP2TC_UP6TC_M ICE_M(0x7, 18) +#define PRTDCB_TUP2TC_UP7TC_S 21 +#define PRTDCB_TUP2TC_UP7TC_M ICE_M(0x7, 21) +#define PRTDCB_TX_DSCP2UP_CTL 0x00040980 /* Reset Source: CORER */ +#define PRTDCB_TX_DSCP2UP_CTL_DSCP2UP_ENA_S 0 +#define PRTDCB_TX_DSCP2UP_CTL_DSCP2UP_ENA_M BIT(0) +#define PRTDCB_TX_DSCP2UP_CTL_DSCP_DEFAULT_UP_S 1 +#define PRTDCB_TX_DSCP2UP_CTL_DSCP_DEFAULT_UP_M ICE_M(0x7, 1) +#define 
PRTDCB_TX_DSCP2UP_IPV4_LUT(_i) (0x000409A0 + ((_i) * 32)) /* _i=0...7 */ /* Reset Source: CORER */ +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_MAX_INDEX 7 +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_0_S 0 +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_0_M ICE_M(0x7, 0) +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_1_S 4 +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_1_M ICE_M(0x7, 4) +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_2_S 8 +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_2_M ICE_M(0x7, 8) +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_3_S 12 +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_3_M ICE_M(0x7, 12) +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_4_S 16 +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_4_M ICE_M(0x7, 16) +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_5_S 20 +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_5_M ICE_M(0x7, 20) +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_6_S 24 +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_6_M ICE_M(0x7, 24) +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_7_S 28 +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_7_M ICE_M(0x7, 28) +#define PRTDCB_TX_DSCP2UP_IPV6_LUT(_i) (0x00040AA0 + ((_i) * 32)) /* _i=0...7 */ /* Reset Source: CORER */ +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_MAX_INDEX 7 +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_0_S 0 +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_0_M ICE_M(0x7, 0) +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_1_S 4 +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_1_M ICE_M(0x7, 4) +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_2_S 8 +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_2_M ICE_M(0x7, 8) +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_3_S 12 +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_3_M ICE_M(0x7, 12) +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_4_S 16 +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_4_M ICE_M(0x7, 16) +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_5_S 20 +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_5_M ICE_M(0x7, 20) +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_6_S 24 +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_6_M ICE_M(0x7, 24) +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_7_S 28 +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_7_M ICE_M(0x7, 28) +#define PRTTCB_BULK_DWRR_REG_CREDITS 0x000AE060 /* Reset Source: CORER */ +#define PRTTCB_BULK_DWRR_REG_CREDITS_CREDITS_S 0 +#define PRTTCB_BULK_DWRR_REG_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define PRTTCB_BULK_DWRR_WB_CREDITS 0x000AE080 /* Reset Source: CORER */ +#define PRTTCB_BULK_DWRR_WB_CREDITS_CREDITS_S 0 +#define PRTTCB_BULK_DWRR_WB_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define PRTTCB_CREDIT_EXP 0x000AE100 /* Reset Source: CORER */ +#define PRTTCB_CREDIT_EXP_EXPANSION_S 0 +#define PRTTCB_CREDIT_EXP_EXPANSION_M ICE_M(0xFF, 0) +#define PRTTCB_LL_DWRR_REG_CREDITS 0x000AE0A0 /* Reset Source: CORER */ +#define PRTTCB_LL_DWRR_REG_CREDITS_CREDITS_S 0 +#define PRTTCB_LL_DWRR_REG_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define PRTTCB_LL_DWRR_WB_CREDITS 0x000AE0C0 /* Reset Source: CORER */ +#define PRTTCB_LL_DWRR_WB_CREDITS_CREDITS_S 0 +#define PRTTCB_LL_DWRR_WB_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define TCDCB_TCUPM_WAIT_CM(_i) (0x000BC520 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCDCB_TCUPM_WAIT_CM_MAX_INDEX 31 +#define TCDCB_TCUPM_WAIT_CM_MONITOR_S 0 +#define TCDCB_TCUPM_WAIT_CM_MONITOR_M ICE_M(0x7FFF, 0) +#define TCDCB_TCUPM_WAIT_CTHR(_i) (0x000BC5A0 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCDCB_TCUPM_WAIT_CTHR_MAX_INDEX 31 +#define 
TCDCB_TCUPM_WAIT_CTHR_TCOFFTH_S 0 +#define TCDCB_TCUPM_WAIT_CTHR_TCOFFTH_M ICE_M(0x7FFF, 0) +#define TCDCB_TCUPM_WAIT_DM(_i) (0x000BC620 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCDCB_TCUPM_WAIT_DM_MAX_INDEX 31 +#define TCDCB_TCUPM_WAIT_DM_MONITOR_S 0 +#define TCDCB_TCUPM_WAIT_DM_MONITOR_M ICE_M(0x7FFFF, 0) +#define TCDCB_TCUPM_WAIT_DTHR(_i) (0x000BC6A0 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCDCB_TCUPM_WAIT_DTHR_MAX_INDEX 31 +#define TCDCB_TCUPM_WAIT_DTHR_TCOFFTH_S 0 +#define TCDCB_TCUPM_WAIT_DTHR_TCOFFTH_M ICE_M(0xFFF, 0) +#define TCDCB_TCUPM_WAIT_PE_HB_DM(_i) (0x000BC720 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCDCB_TCUPM_WAIT_PE_HB_DM_MAX_INDEX 31 +#define TCDCB_TCUPM_WAIT_PE_HB_DM_MONITOR_S 0 +#define TCDCB_TCUPM_WAIT_PE_HB_DM_MONITOR_M ICE_M(0xFFF, 0) +#define TCDCB_TCUPM_WAIT_PE_HB_DTHR(_i) (0x000BC7A0 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCDCB_TCUPM_WAIT_PE_HB_DTHR_MAX_INDEX 31 +#define TCDCB_TCUPM_WAIT_PE_HB_DTHR_TCOFFTH_S 0 +#define TCDCB_TCUPM_WAIT_PE_HB_DTHR_TCOFFTH_M ICE_M(0xFFF, 0) +#define TCDCB_TLPM_WAIT_DM(_i) (0x000A0080 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCDCB_TLPM_WAIT_DM_MAX_INDEX 31 +#define TCDCB_TLPM_WAIT_DM_MONITOR_S 0 +#define TCDCB_TLPM_WAIT_DM_MONITOR_M ICE_M(0x7FFFF, 0) +#define TCDCB_TLPM_WAIT_DTHR(_i) (0x000A0100 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCDCB_TLPM_WAIT_DTHR_MAX_INDEX 31 +#define TCDCB_TLPM_WAIT_DTHR_TCOFFTH_S 0 +#define TCDCB_TLPM_WAIT_DTHR_TCOFFTH_M ICE_M(0xFFF, 0) +#define TCTCB_WB_RL_TC_CFG(_i) (0x000AE138 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCTCB_WB_RL_TC_CFG_MAX_INDEX 31 +#define TCTCB_WB_RL_TC_CFG_TOKENS_S 0 +#define TCTCB_WB_RL_TC_CFG_TOKENS_M ICE_M(0xFFF, 0) +#define TCTCB_WB_RL_TC_CFG_BURST_SIZE_S 12 +#define TCTCB_WB_RL_TC_CFG_BURST_SIZE_M ICE_M(0x3FF, 12) +#define TCTCB_WB_RL_TC_STAT(_i) (0x000AE1B8 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCTCB_WB_RL_TC_STAT_MAX_INDEX 31 +#define TCTCB_WB_RL_TC_STAT_BUCKET_S 0 +#define TCTCB_WB_RL_TC_STAT_BUCKET_M ICE_M(0x1FFFF, 0) +#define TPB_BULK_DWRR_REG_QUANTA 0x00099340 /* Reset Source: CORER */ +#define TPB_BULK_DWRR_REG_QUANTA_QUANTA_S 0 +#define TPB_BULK_DWRR_REG_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define TPB_BULK_DWRR_REG_SAT 0x00099350 /* Reset Source: CORER */ +#define TPB_BULK_DWRR_REG_SAT_SATURATION_S 0 +#define TPB_BULK_DWRR_REG_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define TPB_BULK_DWRR_WB_QUANTA 0x00099344 /* Reset Source: CORER */ +#define TPB_BULK_DWRR_WB_QUANTA_QUANTA_S 0 +#define TPB_BULK_DWRR_WB_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define TPB_BULK_DWRR_WB_SAT 0x00099354 /* Reset Source: CORER */ +#define TPB_BULK_DWRR_WB_SAT_SATURATION_S 0 +#define TPB_BULK_DWRR_WB_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define TPB_GLDCB_TCB_WB_SP 0x0009966C /* Reset Source: CORER */ +#define TPB_GLDCB_TCB_WB_SP_WB_SP_S 0 +#define TPB_GLDCB_TCB_WB_SP_WB_SP_M BIT(0) +#define TPB_GLTCB_CREDIT_EXP_CTL 0x00099664 /* Reset Source: CORER */ +#define TPB_GLTCB_CREDIT_EXP_CTL_EN_S 0 +#define TPB_GLTCB_CREDIT_EXP_CTL_EN_M BIT(0) +#define TPB_GLTCB_CREDIT_EXP_CTL_MIN_PKT_S 1 +#define TPB_GLTCB_CREDIT_EXP_CTL_MIN_PKT_M ICE_M(0x1FF, 1) +#define TPB_LL_DWRR_REG_QUANTA 0x00099348 /* Reset Source: CORER */ +#define TPB_LL_DWRR_REG_QUANTA_QUANTA_S 0 +#define TPB_LL_DWRR_REG_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define TPB_LL_DWRR_REG_SAT 0x00099358 /* Reset Source: CORER */ +#define TPB_LL_DWRR_REG_SAT_SATURATION_S 
0 +#define TPB_LL_DWRR_REG_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define TPB_LL_DWRR_WB_QUANTA 0x0009934C /* Reset Source: CORER */ +#define TPB_LL_DWRR_WB_QUANTA_QUANTA_S 0 +#define TPB_LL_DWRR_WB_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define TPB_LL_DWRR_WB_SAT 0x0009935C /* Reset Source: CORER */ +#define TPB_LL_DWRR_WB_SAT_SATURATION_S 0 +#define TPB_LL_DWRR_WB_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define TPB_PRTDCB_TCB_DWRR_CREDITS 0x000991C0 /* Reset Source: CORER */ +#define TPB_PRTDCB_TCB_DWRR_CREDITS_CREDITS_S 0 +#define TPB_PRTDCB_TCB_DWRR_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define TPB_PRTDCB_TCB_DWRR_QUANTA 0x00099220 /* Reset Source: CORER */ +#define TPB_PRTDCB_TCB_DWRR_QUANTA_QUANTA_S 0 +#define TPB_PRTDCB_TCB_DWRR_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define TPB_PRTDCB_TCB_DWRR_SAT 0x00099260 /* Reset Source: CORER */ +#define TPB_PRTDCB_TCB_DWRR_SAT_SATURATION_S 0 +#define TPB_PRTDCB_TCB_DWRR_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define TPB_PRTTCB_BULK_DWRR_REG_CREDITS 0x000992A0 /* Reset Source: CORER */ +#define TPB_PRTTCB_BULK_DWRR_REG_CREDITS_CREDITS_S 0 +#define TPB_PRTTCB_BULK_DWRR_REG_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define TPB_PRTTCB_BULK_DWRR_WB_CREDITS 0x000992C0 /* Reset Source: CORER */ +#define TPB_PRTTCB_BULK_DWRR_WB_CREDITS_CREDITS_S 0 +#define TPB_PRTTCB_BULK_DWRR_WB_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define TPB_PRTTCB_CREDIT_EXP 0x00099644 /* Reset Source: CORER */ +#define TPB_PRTTCB_CREDIT_EXP_EXPANSION_S 0 +#define TPB_PRTTCB_CREDIT_EXP_EXPANSION_M ICE_M(0xFF, 0) +#define TPB_PRTTCB_LL_DWRR_REG_CREDITS 0x00099300 /* Reset Source: CORER */ +#define TPB_PRTTCB_LL_DWRR_REG_CREDITS_CREDITS_S 0 +#define TPB_PRTTCB_LL_DWRR_REG_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define TPB_PRTTCB_LL_DWRR_WB_CREDITS 0x00099320 /* Reset Source: CORER */ +#define TPB_PRTTCB_LL_DWRR_WB_CREDITS_CREDITS_S 0 +#define TPB_PRTTCB_LL_DWRR_WB_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define TPB_WB_RL_TC_CFG(_i) (0x00099360 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TPB_WB_RL_TC_CFG_MAX_INDEX 31 +#define TPB_WB_RL_TC_CFG_TOKENS_S 0 +#define TPB_WB_RL_TC_CFG_TOKENS_M ICE_M(0xFFF, 0) +#define TPB_WB_RL_TC_CFG_BURST_SIZE_S 12 +#define TPB_WB_RL_TC_CFG_BURST_SIZE_M ICE_M(0x3FF, 12) +#define TPB_WB_RL_TC_STAT(_i) (0x000993E0 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TPB_WB_RL_TC_STAT_MAX_INDEX 31 +#define TPB_WB_RL_TC_STAT_BUCKET_S 0 +#define TPB_WB_RL_TC_STAT_BUCKET_M ICE_M(0x1FFFF, 0) +#define GL_ACLEXT_CDMD_L1SEL(_i) (0x00210054 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_CDMD_L1SEL_MAX_INDEX 2 +#define GL_ACLEXT_CDMD_L1SEL_RX_SEL_S 0 +#define GL_ACLEXT_CDMD_L1SEL_RX_SEL_M ICE_M(0x1F, 0) +#define GL_ACLEXT_CDMD_L1SEL_TX_SEL_S 8 +#define GL_ACLEXT_CDMD_L1SEL_TX_SEL_M ICE_M(0x1F, 8) +#define GL_ACLEXT_CDMD_L1SEL_AUX0_SEL_S 16 +#define GL_ACLEXT_CDMD_L1SEL_AUX0_SEL_M ICE_M(0x1F, 16) +#define GL_ACLEXT_CDMD_L1SEL_AUX1_SEL_S 24 +#define GL_ACLEXT_CDMD_L1SEL_AUX1_SEL_M ICE_M(0x1F, 24) +#define GL_ACLEXT_CDMD_L1SEL_BIDIR_ENA_S 30 +#define GL_ACLEXT_CDMD_L1SEL_BIDIR_ENA_M ICE_M(0x3, 30) +#define GL_ACLEXT_CTLTBL_L2ADDR(_i) (0x00210084 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_CTLTBL_L2ADDR_MAX_INDEX 2 +#define GL_ACLEXT_CTLTBL_L2ADDR_LINE_OFF_S 0 +#define GL_ACLEXT_CTLTBL_L2ADDR_LINE_OFF_M ICE_M(0x7, 0) +#define GL_ACLEXT_CTLTBL_L2ADDR_LINE_IDX_S 8 +#define GL_ACLEXT_CTLTBL_L2ADDR_LINE_IDX_M ICE_M(0x7, 8) +#define GL_ACLEXT_CTLTBL_L2ADDR_AUTO_INC_S 31 +#define GL_ACLEXT_CTLTBL_L2ADDR_AUTO_INC_M 
BIT(31) +#define GL_ACLEXT_CTLTBL_L2DATA(_i) (0x00210090 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_CTLTBL_L2DATA_MAX_INDEX 2 +#define GL_ACLEXT_CTLTBL_L2DATA_DATA_S 0 +#define GL_ACLEXT_CTLTBL_L2DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_ACLEXT_DFLT_L2PRFL(_i) (0x00210138 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_DFLT_L2PRFL_MAX_INDEX 2 +#define GL_ACLEXT_DFLT_L2PRFL_DFLT_PRFL_S 0 +#define GL_ACLEXT_DFLT_L2PRFL_DFLT_PRFL_M ICE_M(0xFFFF, 0) +#define GL_ACLEXT_DFLT_L2PRFL_ACL(_i) (0x00393800 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_DFLT_L2PRFL_ACL_MAX_INDEX 2 +#define GL_ACLEXT_DFLT_L2PRFL_ACL_DFLT_PRFL_S 0 +#define GL_ACLEXT_DFLT_L2PRFL_ACL_DFLT_PRFL_M ICE_M(0xFFFF, 0) +#define GL_ACLEXT_FLGS_L1SEL0_1(_i) (0x0021006C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_FLGS_L1SEL0_1_MAX_INDEX 2 +#define GL_ACLEXT_FLGS_L1SEL0_1_FLS0_S 0 +#define GL_ACLEXT_FLGS_L1SEL0_1_FLS0_M ICE_M(0x1FF, 0) +#define GL_ACLEXT_FLGS_L1SEL0_1_FLS1_S 16 +#define GL_ACLEXT_FLGS_L1SEL0_1_FLS1_M ICE_M(0x1FF, 16) +#define GL_ACLEXT_FLGS_L1SEL2_3(_i) (0x00210078 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_FLGS_L1SEL2_3_MAX_INDEX 2 +#define GL_ACLEXT_FLGS_L1SEL2_3_FLS2_S 0 +#define GL_ACLEXT_FLGS_L1SEL2_3_FLS2_M ICE_M(0x1FF, 0) +#define GL_ACLEXT_FLGS_L1SEL2_3_FLS3_S 16 +#define GL_ACLEXT_FLGS_L1SEL2_3_FLS3_M ICE_M(0x1FF, 16) +#define GL_ACLEXT_FLGS_L1TBL(_i) (0x00210060 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_FLGS_L1TBL_MAX_INDEX 2 +#define GL_ACLEXT_FLGS_L1TBL_LSB_S 0 +#define GL_ACLEXT_FLGS_L1TBL_LSB_M ICE_M(0xFFFF, 0) +#define GL_ACLEXT_FLGS_L1TBL_MSB_S 16 +#define GL_ACLEXT_FLGS_L1TBL_MSB_M ICE_M(0xFFFF, 16) +#define GL_ACLEXT_FORCE_L1CDID(_i) (0x00210018 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_FORCE_L1CDID_MAX_INDEX 2 +#define GL_ACLEXT_FORCE_L1CDID_STATIC_CDID_S 0 +#define GL_ACLEXT_FORCE_L1CDID_STATIC_CDID_M ICE_M(0xF, 0) +#define GL_ACLEXT_FORCE_L1CDID_STATIC_CDID_EN_S 31 +#define GL_ACLEXT_FORCE_L1CDID_STATIC_CDID_EN_M BIT(31) +#define GL_ACLEXT_FORCE_PID(_i) (0x00210000 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_FORCE_PID_MAX_INDEX 2 +#define GL_ACLEXT_FORCE_PID_STATIC_PID_S 0 +#define GL_ACLEXT_FORCE_PID_STATIC_PID_M ICE_M(0xFFFF, 0) +#define GL_ACLEXT_FORCE_PID_STATIC_PID_EN_S 31 +#define GL_ACLEXT_FORCE_PID_STATIC_PID_EN_M BIT(31) +#define GL_ACLEXT_K2N_L2ADDR(_i) (0x00210144 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_K2N_L2ADDR_MAX_INDEX 2 +#define GL_ACLEXT_K2N_L2ADDR_LINE_IDX_S 0 +#define GL_ACLEXT_K2N_L2ADDR_LINE_IDX_M ICE_M(0x7F, 0) +#define GL_ACLEXT_K2N_L2ADDR_AUTO_INC_S 31 +#define GL_ACLEXT_K2N_L2ADDR_AUTO_INC_M BIT(31) +#define GL_ACLEXT_K2N_L2DATA(_i) (0x00210150 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_K2N_L2DATA_MAX_INDEX 2 +#define GL_ACLEXT_K2N_L2DATA_DATA0_S 0 +#define GL_ACLEXT_K2N_L2DATA_DATA0_M ICE_M(0xFF, 0) +#define GL_ACLEXT_K2N_L2DATA_DATA1_S 8 +#define GL_ACLEXT_K2N_L2DATA_DATA1_M ICE_M(0xFF, 8) +#define GL_ACLEXT_K2N_L2DATA_DATA2_S 16 +#define GL_ACLEXT_K2N_L2DATA_DATA2_M ICE_M(0xFF, 16) +#define GL_ACLEXT_K2N_L2DATA_DATA3_S 24 +#define GL_ACLEXT_K2N_L2DATA_DATA3_M ICE_M(0xFF, 24) +#define GL_ACLEXT_L2_PMASK0(_i) (0x002100FC + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_L2_PMASK0_MAX_INDEX 2 +#define GL_ACLEXT_L2_PMASK0_BITMASK_S 0 +#define 
GL_ACLEXT_L2_PMASK0_BITMASK_M ICE_M(0xFFFFFFFF, 0) +#define GL_ACLEXT_L2_PMASK1(_i) (0x00210108 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_L2_PMASK1_MAX_INDEX 2 +#define GL_ACLEXT_L2_PMASK1_BITMASK_S 0 +#define GL_ACLEXT_L2_PMASK1_BITMASK_M ICE_M(0xFFFF, 0) +#define GL_ACLEXT_L2_TMASK0(_i) (0x00210498 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_L2_TMASK0_MAX_INDEX 2 +#define GL_ACLEXT_L2_TMASK0_BITMASK_S 0 +#define GL_ACLEXT_L2_TMASK0_BITMASK_M ICE_M(0xFFFFFFFF, 0) +#define GL_ACLEXT_L2_TMASK1(_i) (0x002104A4 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_L2_TMASK1_MAX_INDEX 2 +#define GL_ACLEXT_L2_TMASK1_BITMASK_S 0 +#define GL_ACLEXT_L2_TMASK1_BITMASK_M ICE_M(0xFF, 0) +#define GL_ACLEXT_L2BMP0_3(_i) (0x002100A8 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_L2BMP0_3_MAX_INDEX 2 +#define GL_ACLEXT_L2BMP0_3_BMP0_S 0 +#define GL_ACLEXT_L2BMP0_3_BMP0_M ICE_M(0xFF, 0) +#define GL_ACLEXT_L2BMP0_3_BMP1_S 8 +#define GL_ACLEXT_L2BMP0_3_BMP1_M ICE_M(0xFF, 8) +#define GL_ACLEXT_L2BMP0_3_BMP2_S 16 +#define GL_ACLEXT_L2BMP0_3_BMP2_M ICE_M(0xFF, 16) +#define GL_ACLEXT_L2BMP0_3_BMP3_S 24 +#define GL_ACLEXT_L2BMP0_3_BMP3_M ICE_M(0xFF, 24) +#define GL_ACLEXT_L2BMP4_7(_i) (0x002100B4 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_L2BMP4_7_MAX_INDEX 2 +#define GL_ACLEXT_L2BMP4_7_BMP4_S 0 +#define GL_ACLEXT_L2BMP4_7_BMP4_M ICE_M(0xFF, 0) +#define GL_ACLEXT_L2BMP4_7_BMP5_S 8 +#define GL_ACLEXT_L2BMP4_7_BMP5_M ICE_M(0xFF, 8) +#define GL_ACLEXT_L2BMP4_7_BMP6_S 16 +#define GL_ACLEXT_L2BMP4_7_BMP6_M ICE_M(0xFF, 16) +#define GL_ACLEXT_L2BMP4_7_BMP7_S 24 +#define GL_ACLEXT_L2BMP4_7_BMP7_M ICE_M(0xFF, 24) +#define GL_ACLEXT_L2PRTMOD(_i) (0x0021009C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_L2PRTMOD_MAX_INDEX 2 +#define GL_ACLEXT_L2PRTMOD_XLT1_S 0 +#define GL_ACLEXT_L2PRTMOD_XLT1_M ICE_M(0x3, 0) +#define GL_ACLEXT_L2PRTMOD_XLT2_S 8 +#define GL_ACLEXT_L2PRTMOD_XLT2_M ICE_M(0x3, 8) +#define GL_ACLEXT_N2N_L2ADDR(_i) (0x0021015C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_N2N_L2ADDR_MAX_INDEX 2 +#define GL_ACLEXT_N2N_L2ADDR_LINE_IDX_S 0 +#define GL_ACLEXT_N2N_L2ADDR_LINE_IDX_M ICE_M(0x3F, 0) +#define GL_ACLEXT_N2N_L2ADDR_AUTO_INC_S 31 +#define GL_ACLEXT_N2N_L2ADDR_AUTO_INC_M BIT(31) +#define GL_ACLEXT_N2N_L2DATA(_i) (0x00210168 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_N2N_L2DATA_MAX_INDEX 2 +#define GL_ACLEXT_N2N_L2DATA_DATA0_S 0 +#define GL_ACLEXT_N2N_L2DATA_DATA0_M ICE_M(0xFF, 0) +#define GL_ACLEXT_N2N_L2DATA_DATA1_S 8 +#define GL_ACLEXT_N2N_L2DATA_DATA1_M ICE_M(0xFF, 8) +#define GL_ACLEXT_N2N_L2DATA_DATA2_S 16 +#define GL_ACLEXT_N2N_L2DATA_DATA2_M ICE_M(0xFF, 16) +#define GL_ACLEXT_N2N_L2DATA_DATA3_S 24 +#define GL_ACLEXT_N2N_L2DATA_DATA3_M ICE_M(0xFF, 24) +#define GL_ACLEXT_P2P_L1ADDR(_i) (0x00210024 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_P2P_L1ADDR_MAX_INDEX 2 +#define GL_ACLEXT_P2P_L1ADDR_LINE_IDX_S 0 +#define GL_ACLEXT_P2P_L1ADDR_LINE_IDX_M BIT(0) +#define GL_ACLEXT_P2P_L1ADDR_AUTO_INC_S 31 +#define GL_ACLEXT_P2P_L1ADDR_AUTO_INC_M BIT(31) +#define GL_ACLEXT_P2P_L1DATA(_i) (0x00210030 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_P2P_L1DATA_MAX_INDEX 2 +#define GL_ACLEXT_P2P_L1DATA_DATA_S 0 +#define GL_ACLEXT_P2P_L1DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_ACLEXT_PID_L2GKTYPE(_i) (0x002100F0 + ((_i) * 4)) /* _i=0...2 
*/ /* Reset Source: CORER */ +#define GL_ACLEXT_PID_L2GKTYPE_MAX_INDEX 2 +#define GL_ACLEXT_PID_L2GKTYPE_PID_GKTYPE_S 0 +#define GL_ACLEXT_PID_L2GKTYPE_PID_GKTYPE_M ICE_M(0x3, 0) +#define GL_ACLEXT_PLVL_SEL(_i) (0x0021000C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_PLVL_SEL_MAX_INDEX 2 +#define GL_ACLEXT_PLVL_SEL_PLVL_SEL_S 0 +#define GL_ACLEXT_PLVL_SEL_PLVL_SEL_M BIT(0) +#define GL_ACLEXT_TCAM_L2ADDR(_i) (0x00210114 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_TCAM_L2ADDR_MAX_INDEX 2 +#define GL_ACLEXT_TCAM_L2ADDR_LINE_IDX_S 0 +#define GL_ACLEXT_TCAM_L2ADDR_LINE_IDX_M ICE_M(0x3FF, 0) +#define GL_ACLEXT_TCAM_L2ADDR_AUTO_INC_S 31 +#define GL_ACLEXT_TCAM_L2ADDR_AUTO_INC_M BIT(31) +#define GL_ACLEXT_TCAM_L2DATALSB(_i) (0x00210120 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_TCAM_L2DATALSB_MAX_INDEX 2 +#define GL_ACLEXT_TCAM_L2DATALSB_DATALSB_S 0 +#define GL_ACLEXT_TCAM_L2DATALSB_DATALSB_M ICE_M(0xFFFFFFFF, 0) +#define GL_ACLEXT_TCAM_L2DATAMSB(_i) (0x0021012C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_TCAM_L2DATAMSB_MAX_INDEX 2 +#define GL_ACLEXT_TCAM_L2DATAMSB_DATAMSB_S 0 +#define GL_ACLEXT_TCAM_L2DATAMSB_DATAMSB_M ICE_M(0xFF, 0) +#define GL_ACLEXT_XLT0_L1ADDR(_i) (0x0021003C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_XLT0_L1ADDR_MAX_INDEX 2 +#define GL_ACLEXT_XLT0_L1ADDR_LINE_IDX_S 0 +#define GL_ACLEXT_XLT0_L1ADDR_LINE_IDX_M ICE_M(0xFF, 0) +#define GL_ACLEXT_XLT0_L1ADDR_AUTO_INC_S 31 +#define GL_ACLEXT_XLT0_L1ADDR_AUTO_INC_M BIT(31) +#define GL_ACLEXT_XLT0_L1DATA(_i) (0x00210048 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_XLT0_L1DATA_MAX_INDEX 2 +#define GL_ACLEXT_XLT0_L1DATA_DATA_S 0 +#define GL_ACLEXT_XLT0_L1DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_ACLEXT_XLT1_L2ADDR(_i) (0x002100C0 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_XLT1_L2ADDR_MAX_INDEX 2 +#define GL_ACLEXT_XLT1_L2ADDR_LINE_IDX_S 0 +#define GL_ACLEXT_XLT1_L2ADDR_LINE_IDX_M ICE_M(0x7FF, 0) +#define GL_ACLEXT_XLT1_L2ADDR_AUTO_INC_S 31 +#define GL_ACLEXT_XLT1_L2ADDR_AUTO_INC_M BIT(31) +#define GL_ACLEXT_XLT1_L2DATA(_i) (0x002100CC + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_XLT1_L2DATA_MAX_INDEX 2 +#define GL_ACLEXT_XLT1_L2DATA_DATA_S 0 +#define GL_ACLEXT_XLT1_L2DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_ACLEXT_XLT2_L2ADDR(_i) (0x002100D8 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_XLT2_L2ADDR_MAX_INDEX 2 +#define GL_ACLEXT_XLT2_L2ADDR_LINE_IDX_S 0 +#define GL_ACLEXT_XLT2_L2ADDR_LINE_IDX_M ICE_M(0x1FF, 0) +#define GL_ACLEXT_XLT2_L2ADDR_AUTO_INC_S 31 +#define GL_ACLEXT_XLT2_L2ADDR_AUTO_INC_M BIT(31) +#define GL_ACLEXT_XLT2_L2DATA(_i) (0x002100E4 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_XLT2_L2DATA_MAX_INDEX 2 +#define GL_ACLEXT_XLT2_L2DATA_DATA_S 0 +#define GL_ACLEXT_XLT2_L2DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_PREEXT_CDMD_L1SEL(_i) (0x0020F054 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_CDMD_L1SEL_MAX_INDEX 2 +#define GL_PREEXT_CDMD_L1SEL_RX_SEL_S 0 +#define GL_PREEXT_CDMD_L1SEL_RX_SEL_M ICE_M(0x1F, 0) +#define GL_PREEXT_CDMD_L1SEL_TX_SEL_S 8 +#define GL_PREEXT_CDMD_L1SEL_TX_SEL_M ICE_M(0x1F, 8) +#define GL_PREEXT_CDMD_L1SEL_AUX0_SEL_S 16 +#define GL_PREEXT_CDMD_L1SEL_AUX0_SEL_M ICE_M(0x1F, 16) +#define GL_PREEXT_CDMD_L1SEL_AUX1_SEL_S 24 +#define GL_PREEXT_CDMD_L1SEL_AUX1_SEL_M ICE_M(0x1F, 
24) +#define GL_PREEXT_CDMD_L1SEL_BIDIR_ENA_S 30 +#define GL_PREEXT_CDMD_L1SEL_BIDIR_ENA_M ICE_M(0x3, 30) +#define GL_PREEXT_CTLTBL_L2ADDR(_i) (0x0020F084 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_CTLTBL_L2ADDR_MAX_INDEX 2 +#define GL_PREEXT_CTLTBL_L2ADDR_LINE_OFF_S 0 +#define GL_PREEXT_CTLTBL_L2ADDR_LINE_OFF_M ICE_M(0x7, 0) +#define GL_PREEXT_CTLTBL_L2ADDR_LINE_IDX_S 8 +#define GL_PREEXT_CTLTBL_L2ADDR_LINE_IDX_M ICE_M(0x7, 8) +#define GL_PREEXT_CTLTBL_L2ADDR_AUTO_INC_S 31 +#define GL_PREEXT_CTLTBL_L2ADDR_AUTO_INC_M BIT(31) +#define GL_PREEXT_CTLTBL_L2DATA(_i) (0x0020F090 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_CTLTBL_L2DATA_MAX_INDEX 2 +#define GL_PREEXT_CTLTBL_L2DATA_DATA_S 0 +#define GL_PREEXT_CTLTBL_L2DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_PREEXT_DFLT_L2PRFL(_i) (0x0020F138 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_DFLT_L2PRFL_MAX_INDEX 2 +#define GL_PREEXT_DFLT_L2PRFL_DFLT_PRFL_S 0 +#define GL_PREEXT_DFLT_L2PRFL_DFLT_PRFL_M ICE_M(0xFFFF, 0) +#define GL_PREEXT_FLGS_L1SEL0_1(_i) (0x0020F06C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_FLGS_L1SEL0_1_MAX_INDEX 2 +#define GL_PREEXT_FLGS_L1SEL0_1_FLS0_S 0 +#define GL_PREEXT_FLGS_L1SEL0_1_FLS0_M ICE_M(0x1FF, 0) +#define GL_PREEXT_FLGS_L1SEL0_1_FLS1_S 16 +#define GL_PREEXT_FLGS_L1SEL0_1_FLS1_M ICE_M(0x1FF, 16) +#define GL_PREEXT_FLGS_L1SEL2_3(_i) (0x0020F078 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_FLGS_L1SEL2_3_MAX_INDEX 2 +#define GL_PREEXT_FLGS_L1SEL2_3_FLS2_S 0 +#define GL_PREEXT_FLGS_L1SEL2_3_FLS2_M ICE_M(0x1FF, 0) +#define GL_PREEXT_FLGS_L1SEL2_3_FLS3_S 16 +#define GL_PREEXT_FLGS_L1SEL2_3_FLS3_M ICE_M(0x1FF, 16) +#define GL_PREEXT_FLGS_L1TBL(_i) (0x0020F060 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_FLGS_L1TBL_MAX_INDEX 2 +#define GL_PREEXT_FLGS_L1TBL_LSB_S 0 +#define GL_PREEXT_FLGS_L1TBL_LSB_M ICE_M(0xFFFF, 0) +#define GL_PREEXT_FLGS_L1TBL_MSB_S 16 +#define GL_PREEXT_FLGS_L1TBL_MSB_M ICE_M(0xFFFF, 16) +#define GL_PREEXT_FORCE_L1CDID(_i) (0x0020F018 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_FORCE_L1CDID_MAX_INDEX 2 +#define GL_PREEXT_FORCE_L1CDID_STATIC_CDID_S 0 +#define GL_PREEXT_FORCE_L1CDID_STATIC_CDID_M ICE_M(0xF, 0) +#define GL_PREEXT_FORCE_L1CDID_STATIC_CDID_EN_S 31 +#define GL_PREEXT_FORCE_L1CDID_STATIC_CDID_EN_M BIT(31) +#define GL_PREEXT_FORCE_PID(_i) (0x0020F000 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_FORCE_PID_MAX_INDEX 2 +#define GL_PREEXT_FORCE_PID_STATIC_PID_S 0 +#define GL_PREEXT_FORCE_PID_STATIC_PID_M ICE_M(0xFFFF, 0) +#define GL_PREEXT_FORCE_PID_STATIC_PID_EN_S 31 +#define GL_PREEXT_FORCE_PID_STATIC_PID_EN_M BIT(31) +#define GL_PREEXT_K2N_L2ADDR(_i) (0x0020F144 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_K2N_L2ADDR_MAX_INDEX 2 +#define GL_PREEXT_K2N_L2ADDR_LINE_IDX_S 0 +#define GL_PREEXT_K2N_L2ADDR_LINE_IDX_M ICE_M(0x7F, 0) +#define GL_PREEXT_K2N_L2ADDR_AUTO_INC_S 31 +#define GL_PREEXT_K2N_L2ADDR_AUTO_INC_M BIT(31) +#define GL_PREEXT_K2N_L2DATA(_i) (0x0020F150 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_K2N_L2DATA_MAX_INDEX 2 +#define GL_PREEXT_K2N_L2DATA_DATA0_S 0 +#define GL_PREEXT_K2N_L2DATA_DATA0_M ICE_M(0xFF, 0) +#define GL_PREEXT_K2N_L2DATA_DATA1_S 8 +#define GL_PREEXT_K2N_L2DATA_DATA1_M ICE_M(0xFF, 8) +#define GL_PREEXT_K2N_L2DATA_DATA2_S 16 +#define GL_PREEXT_K2N_L2DATA_DATA2_M ICE_M(0xFF, 16) 
+#define GL_PREEXT_K2N_L2DATA_DATA3_S 24 +#define GL_PREEXT_K2N_L2DATA_DATA3_M ICE_M(0xFF, 24) +#define GL_PREEXT_L2_PMASK0(_i) (0x0020F0FC + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_L2_PMASK0_MAX_INDEX 2 +#define GL_PREEXT_L2_PMASK0_BITMASK_S 0 +#define GL_PREEXT_L2_PMASK0_BITMASK_M ICE_M(0xFFFFFFFF, 0) +#define GL_PREEXT_L2_PMASK1(_i) (0x0020F108 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_L2_PMASK1_MAX_INDEX 2 +#define GL_PREEXT_L2_PMASK1_BITMASK_S 0 +#define GL_PREEXT_L2_PMASK1_BITMASK_M ICE_M(0xFFFF, 0) +#define GL_PREEXT_L2_TMASK0(_i) (0x0020F498 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_L2_TMASK0_MAX_INDEX 2 +#define GL_PREEXT_L2_TMASK0_BITMASK_S 0 +#define GL_PREEXT_L2_TMASK0_BITMASK_M ICE_M(0xFFFFFFFF, 0) +#define GL_PREEXT_L2_TMASK1(_i) (0x0020F4A4 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_L2_TMASK1_MAX_INDEX 2 +#define GL_PREEXT_L2_TMASK1_BITMASK_S 0 +#define GL_PREEXT_L2_TMASK1_BITMASK_M ICE_M(0xFF, 0) +#define GL_PREEXT_L2BMP0_3(_i) (0x0020F0A8 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_L2BMP0_3_MAX_INDEX 2 +#define GL_PREEXT_L2BMP0_3_BMP0_S 0 +#define GL_PREEXT_L2BMP0_3_BMP0_M ICE_M(0xFF, 0) +#define GL_PREEXT_L2BMP0_3_BMP1_S 8 +#define GL_PREEXT_L2BMP0_3_BMP1_M ICE_M(0xFF, 8) +#define GL_PREEXT_L2BMP0_3_BMP2_S 16 +#define GL_PREEXT_L2BMP0_3_BMP2_M ICE_M(0xFF, 16) +#define GL_PREEXT_L2BMP0_3_BMP3_S 24 +#define GL_PREEXT_L2BMP0_3_BMP3_M ICE_M(0xFF, 24) +#define GL_PREEXT_L2BMP4_7(_i) (0x0020F0B4 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_L2BMP4_7_MAX_INDEX 2 +#define GL_PREEXT_L2BMP4_7_BMP4_S 0 +#define GL_PREEXT_L2BMP4_7_BMP4_M ICE_M(0xFF, 0) +#define GL_PREEXT_L2BMP4_7_BMP5_S 8 +#define GL_PREEXT_L2BMP4_7_BMP5_M ICE_M(0xFF, 8) +#define GL_PREEXT_L2BMP4_7_BMP6_S 16 +#define GL_PREEXT_L2BMP4_7_BMP6_M ICE_M(0xFF, 16) +#define GL_PREEXT_L2BMP4_7_BMP7_S 24 +#define GL_PREEXT_L2BMP4_7_BMP7_M ICE_M(0xFF, 24) +#define GL_PREEXT_L2PRTMOD(_i) (0x0020F09C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_L2PRTMOD_MAX_INDEX 2 +#define GL_PREEXT_L2PRTMOD_XLT1_S 0 +#define GL_PREEXT_L2PRTMOD_XLT1_M ICE_M(0x3, 0) +#define GL_PREEXT_L2PRTMOD_XLT2_S 8 +#define GL_PREEXT_L2PRTMOD_XLT2_M ICE_M(0x3, 8) +#define GL_PREEXT_N2N_L2ADDR(_i) (0x0020F15C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_N2N_L2ADDR_MAX_INDEX 2 +#define GL_PREEXT_N2N_L2ADDR_LINE_IDX_S 0 +#define GL_PREEXT_N2N_L2ADDR_LINE_IDX_M ICE_M(0x3F, 0) +#define GL_PREEXT_N2N_L2ADDR_AUTO_INC_S 31 +#define GL_PREEXT_N2N_L2ADDR_AUTO_INC_M BIT(31) +#define GL_PREEXT_N2N_L2DATA(_i) (0x0020F168 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_N2N_L2DATA_MAX_INDEX 2 +#define GL_PREEXT_N2N_L2DATA_DATA0_S 0 +#define GL_PREEXT_N2N_L2DATA_DATA0_M ICE_M(0xFF, 0) +#define GL_PREEXT_N2N_L2DATA_DATA1_S 8 +#define GL_PREEXT_N2N_L2DATA_DATA1_M ICE_M(0xFF, 8) +#define GL_PREEXT_N2N_L2DATA_DATA2_S 16 +#define GL_PREEXT_N2N_L2DATA_DATA2_M ICE_M(0xFF, 16) +#define GL_PREEXT_N2N_L2DATA_DATA3_S 24 +#define GL_PREEXT_N2N_L2DATA_DATA3_M ICE_M(0xFF, 24) +#define GL_PREEXT_P2P_L1ADDR(_i) (0x0020F024 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_P2P_L1ADDR_MAX_INDEX 2 +#define GL_PREEXT_P2P_L1ADDR_LINE_IDX_S 0 +#define GL_PREEXT_P2P_L1ADDR_LINE_IDX_M BIT(0) +#define GL_PREEXT_P2P_L1ADDR_AUTO_INC_S 31 +#define GL_PREEXT_P2P_L1ADDR_AUTO_INC_M BIT(31) +#define 
GL_PREEXT_P2P_L1DATA(_i) (0x0020F030 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_P2P_L1DATA_MAX_INDEX 2 +#define GL_PREEXT_P2P_L1DATA_DATA_S 0 +#define GL_PREEXT_P2P_L1DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_PREEXT_PID_L2GKTYPE(_i) (0x0020F0F0 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_PID_L2GKTYPE_MAX_INDEX 2 +#define GL_PREEXT_PID_L2GKTYPE_PID_GKTYPE_S 0 +#define GL_PREEXT_PID_L2GKTYPE_PID_GKTYPE_M ICE_M(0x3, 0) +#define GL_PREEXT_PLVL_SEL(_i) (0x0020F00C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_PLVL_SEL_MAX_INDEX 2 +#define GL_PREEXT_PLVL_SEL_PLVL_SEL_S 0 +#define GL_PREEXT_PLVL_SEL_PLVL_SEL_M BIT(0) +#define GL_PREEXT_TCAM_L2ADDR(_i) (0x0020F114 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_TCAM_L2ADDR_MAX_INDEX 2 +#define GL_PREEXT_TCAM_L2ADDR_LINE_IDX_S 0 +#define GL_PREEXT_TCAM_L2ADDR_LINE_IDX_M ICE_M(0x3FF, 0) +#define GL_PREEXT_TCAM_L2ADDR_AUTO_INC_S 31 +#define GL_PREEXT_TCAM_L2ADDR_AUTO_INC_M BIT(31) +#define GL_PREEXT_TCAM_L2DATALSB(_i) (0x0020F120 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_TCAM_L2DATALSB_MAX_INDEX 2 +#define GL_PREEXT_TCAM_L2DATALSB_DATALSB_S 0 +#define GL_PREEXT_TCAM_L2DATALSB_DATALSB_M ICE_M(0xFFFFFFFF, 0) +#define GL_PREEXT_TCAM_L2DATAMSB(_i) (0x0020F12C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_TCAM_L2DATAMSB_MAX_INDEX 2 +#define GL_PREEXT_TCAM_L2DATAMSB_DATAMSB_S 0 +#define GL_PREEXT_TCAM_L2DATAMSB_DATAMSB_M ICE_M(0xFF, 0) +#define GL_PREEXT_XLT0_L1ADDR(_i) (0x0020F03C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_XLT0_L1ADDR_MAX_INDEX 2 +#define GL_PREEXT_XLT0_L1ADDR_LINE_IDX_S 0 +#define GL_PREEXT_XLT0_L1ADDR_LINE_IDX_M ICE_M(0xFF, 0) +#define GL_PREEXT_XLT0_L1ADDR_AUTO_INC_S 31 +#define GL_PREEXT_XLT0_L1ADDR_AUTO_INC_M BIT(31) +#define GL_PREEXT_XLT0_L1DATA(_i) (0x0020F048 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_XLT0_L1DATA_MAX_INDEX 2 +#define GL_PREEXT_XLT0_L1DATA_DATA_S 0 +#define GL_PREEXT_XLT0_L1DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_PREEXT_XLT1_L2ADDR(_i) (0x0020F0C0 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_XLT1_L2ADDR_MAX_INDEX 2 +#define GL_PREEXT_XLT1_L2ADDR_LINE_IDX_S 0 +#define GL_PREEXT_XLT1_L2ADDR_LINE_IDX_M ICE_M(0x7FF, 0) +#define GL_PREEXT_XLT1_L2ADDR_AUTO_INC_S 31 +#define GL_PREEXT_XLT1_L2ADDR_AUTO_INC_M BIT(31) +#define GL_PREEXT_XLT1_L2DATA(_i) (0x0020F0CC + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_XLT1_L2DATA_MAX_INDEX 2 +#define GL_PREEXT_XLT1_L2DATA_DATA_S 0 +#define GL_PREEXT_XLT1_L2DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_PREEXT_XLT2_L2ADDR(_i) (0x0020F0D8 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_XLT2_L2ADDR_MAX_INDEX 2 +#define GL_PREEXT_XLT2_L2ADDR_LINE_IDX_S 0 +#define GL_PREEXT_XLT2_L2ADDR_LINE_IDX_M ICE_M(0x1FF, 0) +#define GL_PREEXT_XLT2_L2ADDR_AUTO_INC_S 31 +#define GL_PREEXT_XLT2_L2ADDR_AUTO_INC_M BIT(31) +#define GL_PREEXT_XLT2_L2DATA(_i) (0x0020F0E4 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_XLT2_L2DATA_MAX_INDEX 2 +#define GL_PREEXT_XLT2_L2DATA_DATA_S 0 +#define GL_PREEXT_XLT2_L2DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_PSTEXT_CDMD_L1SEL(_i) (0x0020E054 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_CDMD_L1SEL_MAX_INDEX 2 +#define GL_PSTEXT_CDMD_L1SEL_RX_SEL_S 0 +#define GL_PSTEXT_CDMD_L1SEL_RX_SEL_M 
ICE_M(0x1F, 0) +#define GL_PSTEXT_CDMD_L1SEL_TX_SEL_S 8 +#define GL_PSTEXT_CDMD_L1SEL_TX_SEL_M ICE_M(0x1F, 8) +#define GL_PSTEXT_CDMD_L1SEL_AUX0_SEL_S 16 +#define GL_PSTEXT_CDMD_L1SEL_AUX0_SEL_M ICE_M(0x1F, 16) +#define GL_PSTEXT_CDMD_L1SEL_AUX1_SEL_S 24 +#define GL_PSTEXT_CDMD_L1SEL_AUX1_SEL_M ICE_M(0x1F, 24) +#define GL_PSTEXT_CDMD_L1SEL_BIDIR_ENA_S 30 +#define GL_PSTEXT_CDMD_L1SEL_BIDIR_ENA_M ICE_M(0x3, 30) +#define GL_PSTEXT_CTLTBL_L2ADDR(_i) (0x0020E084 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_CTLTBL_L2ADDR_MAX_INDEX 2 +#define GL_PSTEXT_CTLTBL_L2ADDR_LINE_OFF_S 0 +#define GL_PSTEXT_CTLTBL_L2ADDR_LINE_OFF_M ICE_M(0x7, 0) +#define GL_PSTEXT_CTLTBL_L2ADDR_LINE_IDX_S 8 +#define GL_PSTEXT_CTLTBL_L2ADDR_LINE_IDX_M ICE_M(0x7, 8) +#define GL_PSTEXT_CTLTBL_L2ADDR_AUTO_INC_S 31 +#define GL_PSTEXT_CTLTBL_L2ADDR_AUTO_INC_M BIT(31) +#define GL_PSTEXT_CTLTBL_L2DATA(_i) (0x0020E090 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_CTLTBL_L2DATA_MAX_INDEX 2 +#define GL_PSTEXT_CTLTBL_L2DATA_DATA_S 0 +#define GL_PSTEXT_CTLTBL_L2DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_PSTEXT_DFLT_L2PRFL(_i) (0x0020E138 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_DFLT_L2PRFL_MAX_INDEX 2 +#define GL_PSTEXT_DFLT_L2PRFL_DFLT_PRFL_S 0 +#define GL_PSTEXT_DFLT_L2PRFL_DFLT_PRFL_M ICE_M(0xFFFF, 0) +#define GL_PSTEXT_FL15_BMPLSB(_i) (0x0020E480 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_FL15_BMPLSB_MAX_INDEX 2 +#define GL_PSTEXT_FL15_BMPLSB_BMPLSB_S 0 +#define GL_PSTEXT_FL15_BMPLSB_BMPLSB_M ICE_M(0xFFFFFFFF, 0) +#define GL_PSTEXT_FL15_BMPMSB(_i) (0x0020E48C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_FL15_BMPMSB_MAX_INDEX 2 +#define GL_PSTEXT_FL15_BMPMSB_BMPMSB_S 0 +#define GL_PSTEXT_FL15_BMPMSB_BMPMSB_M ICE_M(0xFFFFFFFF, 0) +#define GL_PSTEXT_FLGS_L1SEL0_1(_i) (0x0020E06C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_FLGS_L1SEL0_1_MAX_INDEX 2 +#define GL_PSTEXT_FLGS_L1SEL0_1_FLS0_S 0 +#define GL_PSTEXT_FLGS_L1SEL0_1_FLS0_M ICE_M(0x1FF, 0) +#define GL_PSTEXT_FLGS_L1SEL0_1_FLS1_S 16 +#define GL_PSTEXT_FLGS_L1SEL0_1_FLS1_M ICE_M(0x1FF, 16) +#define GL_PSTEXT_FLGS_L1SEL2_3(_i) (0x0020E078 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_FLGS_L1SEL2_3_MAX_INDEX 2 +#define GL_PSTEXT_FLGS_L1SEL2_3_FLS2_S 0 +#define GL_PSTEXT_FLGS_L1SEL2_3_FLS2_M ICE_M(0x1FF, 0) +#define GL_PSTEXT_FLGS_L1SEL2_3_FLS3_S 16 +#define GL_PSTEXT_FLGS_L1SEL2_3_FLS3_M ICE_M(0x1FF, 16) +#define GL_PSTEXT_FLGS_L1TBL(_i) (0x0020E060 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_FLGS_L1TBL_MAX_INDEX 2 +#define GL_PSTEXT_FLGS_L1TBL_LSB_S 0 +#define GL_PSTEXT_FLGS_L1TBL_LSB_M ICE_M(0xFFFF, 0) +#define GL_PSTEXT_FLGS_L1TBL_MSB_S 16 +#define GL_PSTEXT_FLGS_L1TBL_MSB_M ICE_M(0xFFFF, 16) +#define GL_PSTEXT_FORCE_L1CDID(_i) (0x0020E018 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_FORCE_L1CDID_MAX_INDEX 2 +#define GL_PSTEXT_FORCE_L1CDID_STATIC_CDID_S 0 +#define GL_PSTEXT_FORCE_L1CDID_STATIC_CDID_M ICE_M(0xF, 0) +#define GL_PSTEXT_FORCE_L1CDID_STATIC_CDID_EN_S 31 +#define GL_PSTEXT_FORCE_L1CDID_STATIC_CDID_EN_M BIT(31) +#define GL_PSTEXT_FORCE_PID(_i) (0x0020E000 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_FORCE_PID_MAX_INDEX 2 +#define GL_PSTEXT_FORCE_PID_STATIC_PID_S 0 +#define GL_PSTEXT_FORCE_PID_STATIC_PID_M ICE_M(0xFFFF, 0) +#define GL_PSTEXT_FORCE_PID_STATIC_PID_EN_S 31 +#define 
GL_PSTEXT_FORCE_PID_STATIC_PID_EN_M BIT(31) +#define GL_PSTEXT_K2N_L2ADDR(_i) (0x0020E144 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_K2N_L2ADDR_MAX_INDEX 2 +#define GL_PSTEXT_K2N_L2ADDR_LINE_IDX_S 0 +#define GL_PSTEXT_K2N_L2ADDR_LINE_IDX_M ICE_M(0x7F, 0) +#define GL_PSTEXT_K2N_L2ADDR_AUTO_INC_S 31 +#define GL_PSTEXT_K2N_L2ADDR_AUTO_INC_M BIT(31) +#define GL_PSTEXT_K2N_L2DATA(_i) (0x0020E150 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_K2N_L2DATA_MAX_INDEX 2 +#define GL_PSTEXT_K2N_L2DATA_DATA0_S 0 +#define GL_PSTEXT_K2N_L2DATA_DATA0_M ICE_M(0xFF, 0) +#define GL_PSTEXT_K2N_L2DATA_DATA1_S 8 +#define GL_PSTEXT_K2N_L2DATA_DATA1_M ICE_M(0xFF, 8) +#define GL_PSTEXT_K2N_L2DATA_DATA2_S 16 +#define GL_PSTEXT_K2N_L2DATA_DATA2_M ICE_M(0xFF, 16) +#define GL_PSTEXT_K2N_L2DATA_DATA3_S 24 +#define GL_PSTEXT_K2N_L2DATA_DATA3_M ICE_M(0xFF, 24) +#define GL_PSTEXT_L2_PMASK0(_i) (0x0020E0FC + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_L2_PMASK0_MAX_INDEX 2 +#define GL_PSTEXT_L2_PMASK0_BITMASK_S 0 +#define GL_PSTEXT_L2_PMASK0_BITMASK_M ICE_M(0xFFFFFFFF, 0) +#define GL_PSTEXT_L2_PMASK1(_i) (0x0020E108 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_L2_PMASK1_MAX_INDEX 2 +#define GL_PSTEXT_L2_PMASK1_BITMASK_S 0 +#define GL_PSTEXT_L2_PMASK1_BITMASK_M ICE_M(0xFFFF, 0) +#define GL_PSTEXT_L2_TMASK0(_i) (0x0020E498 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_L2_TMASK0_MAX_INDEX 2 +#define GL_PSTEXT_L2_TMASK0_BITMASK_S 0 +#define GL_PSTEXT_L2_TMASK0_BITMASK_M ICE_M(0xFFFFFFFF, 0) +#define GL_PSTEXT_L2_TMASK1(_i) (0x0020E4A4 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_L2_TMASK1_MAX_INDEX 2 +#define GL_PSTEXT_L2_TMASK1_BITMASK_S 0 +#define GL_PSTEXT_L2_TMASK1_BITMASK_M ICE_M(0xFF, 0) +#define GL_PSTEXT_L2PRTMOD(_i) (0x0020E09C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_L2PRTMOD_MAX_INDEX 2 +#define GL_PSTEXT_L2PRTMOD_XLT1_S 0 +#define GL_PSTEXT_L2PRTMOD_XLT1_M ICE_M(0x3, 0) +#define GL_PSTEXT_L2PRTMOD_XLT2_S 8 +#define GL_PSTEXT_L2PRTMOD_XLT2_M ICE_M(0x3, 8) +#define GL_PSTEXT_N2N_L2ADDR(_i) (0x0020E15C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_N2N_L2ADDR_MAX_INDEX 2 +#define GL_PSTEXT_N2N_L2ADDR_LINE_IDX_S 0 +#define GL_PSTEXT_N2N_L2ADDR_LINE_IDX_M ICE_M(0x3F, 0) +#define GL_PSTEXT_N2N_L2ADDR_AUTO_INC_S 31 +#define GL_PSTEXT_N2N_L2ADDR_AUTO_INC_M BIT(31) +#define GL_PSTEXT_N2N_L2DATA(_i) (0x0020E168 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_N2N_L2DATA_MAX_INDEX 2 +#define GL_PSTEXT_N2N_L2DATA_DATA0_S 0 +#define GL_PSTEXT_N2N_L2DATA_DATA0_M ICE_M(0xFF, 0) +#define GL_PSTEXT_N2N_L2DATA_DATA1_S 8 +#define GL_PSTEXT_N2N_L2DATA_DATA1_M ICE_M(0xFF, 8) +#define GL_PSTEXT_N2N_L2DATA_DATA2_S 16 +#define GL_PSTEXT_N2N_L2DATA_DATA2_M ICE_M(0xFF, 16) +#define GL_PSTEXT_N2N_L2DATA_DATA3_S 24 +#define GL_PSTEXT_N2N_L2DATA_DATA3_M ICE_M(0xFF, 24) +#define GL_PSTEXT_P2P_L1ADDR(_i) (0x0020E024 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_P2P_L1ADDR_MAX_INDEX 2 +#define GL_PSTEXT_P2P_L1ADDR_LINE_IDX_S 0 +#define GL_PSTEXT_P2P_L1ADDR_LINE_IDX_M BIT(0) +#define GL_PSTEXT_P2P_L1ADDR_AUTO_INC_S 31 +#define GL_PSTEXT_P2P_L1ADDR_AUTO_INC_M BIT(31) +#define GL_PSTEXT_P2P_L1DATA(_i) (0x0020E030 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_P2P_L1DATA_MAX_INDEX 2 +#define GL_PSTEXT_P2P_L1DATA_DATA_S 0 +#define 
GL_PSTEXT_P2P_L1DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_PSTEXT_PID_L2GKTYPE(_i) (0x0020E0F0 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_PID_L2GKTYPE_MAX_INDEX 2 +#define GL_PSTEXT_PID_L2GKTYPE_PID_GKTYPE_S 0 +#define GL_PSTEXT_PID_L2GKTYPE_PID_GKTYPE_M ICE_M(0x3, 0) +#define GL_PSTEXT_PLVL_SEL(_i) (0x0020E00C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_PLVL_SEL_MAX_INDEX 2 +#define GL_PSTEXT_PLVL_SEL_PLVL_SEL_S 0 +#define GL_PSTEXT_PLVL_SEL_PLVL_SEL_M BIT(0) +#define GL_PSTEXT_PRFLM_CTRL(_i) (0x0020E474 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_PRFLM_CTRL_MAX_INDEX 2 +#define GL_PSTEXT_PRFLM_CTRL_PRFL_IDX_S 0 +#define GL_PSTEXT_PRFLM_CTRL_PRFL_IDX_M ICE_M(0xFF, 0) +#define GL_PSTEXT_PRFLM_CTRL_RD_REQ_S 30 +#define GL_PSTEXT_PRFLM_CTRL_RD_REQ_M BIT(30) +#define GL_PSTEXT_PRFLM_CTRL_WR_REQ_S 31 +#define GL_PSTEXT_PRFLM_CTRL_WR_REQ_M BIT(31) +#define GL_PSTEXT_PRFLM_DATA_0(_i) (0x0020E174 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GL_PSTEXT_PRFLM_DATA_0_MAX_INDEX 63 +#define GL_PSTEXT_PRFLM_DATA_0_PROT_S 0 +#define GL_PSTEXT_PRFLM_DATA_0_PROT_M ICE_M(0xFF, 0) +#define GL_PSTEXT_PRFLM_DATA_0_OFF_S 16 +#define GL_PSTEXT_PRFLM_DATA_0_OFF_M ICE_M(0x1FF, 16) +#define GL_PSTEXT_PRFLM_DATA_1(_i) (0x0020E274 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GL_PSTEXT_PRFLM_DATA_1_MAX_INDEX 63 +#define GL_PSTEXT_PRFLM_DATA_1_PROT_S 0 +#define GL_PSTEXT_PRFLM_DATA_1_PROT_M ICE_M(0xFF, 0) +#define GL_PSTEXT_PRFLM_DATA_1_OFF_S 16 +#define GL_PSTEXT_PRFLM_DATA_1_OFF_M ICE_M(0x1FF, 16) +#define GL_PSTEXT_PRFLM_DATA_2(_i) (0x0020E374 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GL_PSTEXT_PRFLM_DATA_2_MAX_INDEX 63 +#define GL_PSTEXT_PRFLM_DATA_2_PROT_S 0 +#define GL_PSTEXT_PRFLM_DATA_2_PROT_M ICE_M(0xFF, 0) +#define GL_PSTEXT_PRFLM_DATA_2_OFF_S 16 +#define GL_PSTEXT_PRFLM_DATA_2_OFF_M ICE_M(0x1FF, 16) +#define GL_PSTEXT_TCAM_L2ADDR(_i) (0x0020E114 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_TCAM_L2ADDR_MAX_INDEX 2 +#define GL_PSTEXT_TCAM_L2ADDR_LINE_IDX_S 0 +#define GL_PSTEXT_TCAM_L2ADDR_LINE_IDX_M ICE_M(0x3FF, 0) +#define GL_PSTEXT_TCAM_L2ADDR_AUTO_INC_S 31 +#define GL_PSTEXT_TCAM_L2ADDR_AUTO_INC_M BIT(31) +#define GL_PSTEXT_TCAM_L2DATALSB(_i) (0x0020E120 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_TCAM_L2DATALSB_MAX_INDEX 2 +#define GL_PSTEXT_TCAM_L2DATALSB_DATALSB_S 0 +#define GL_PSTEXT_TCAM_L2DATALSB_DATALSB_M ICE_M(0xFFFFFFFF, 0) +#define GL_PSTEXT_TCAM_L2DATAMSB(_i) (0x0020E12C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_TCAM_L2DATAMSB_MAX_INDEX 2 +#define GL_PSTEXT_TCAM_L2DATAMSB_DATAMSB_S 0 +#define GL_PSTEXT_TCAM_L2DATAMSB_DATAMSB_M ICE_M(0xFF, 0) +#define GL_PSTEXT_XLT0_L1ADDR(_i) (0x0020E03C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_XLT0_L1ADDR_MAX_INDEX 2 +#define GL_PSTEXT_XLT0_L1ADDR_LINE_IDX_S 0 +#define GL_PSTEXT_XLT0_L1ADDR_LINE_IDX_M ICE_M(0xFF, 0) +#define GL_PSTEXT_XLT0_L1ADDR_AUTO_INC_S 31 +#define GL_PSTEXT_XLT0_L1ADDR_AUTO_INC_M BIT(31) +#define GL_PSTEXT_XLT0_L1DATA(_i) (0x0020E048 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_XLT0_L1DATA_MAX_INDEX 2 +#define GL_PSTEXT_XLT0_L1DATA_DATA_S 0 +#define GL_PSTEXT_XLT0_L1DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_PSTEXT_XLT1_L2ADDR(_i) (0x0020E0C0 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define 
GL_PSTEXT_XLT1_L2ADDR_MAX_INDEX 2 +#define GL_PSTEXT_XLT1_L2ADDR_LINE_IDX_S 0 +#define GL_PSTEXT_XLT1_L2ADDR_LINE_IDX_M ICE_M(0x7FF, 0) +#define GL_PSTEXT_XLT1_L2ADDR_AUTO_INC_S 31 +#define GL_PSTEXT_XLT1_L2ADDR_AUTO_INC_M BIT(31) +#define GL_PSTEXT_XLT1_L2DATA(_i) (0x0020E0CC + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_XLT1_L2DATA_MAX_INDEX 2 +#define GL_PSTEXT_XLT1_L2DATA_DATA_S 0 +#define GL_PSTEXT_XLT1_L2DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_PSTEXT_XLT2_L2ADDR(_i) (0x0020E0D8 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_XLT2_L2ADDR_MAX_INDEX 2 +#define GL_PSTEXT_XLT2_L2ADDR_LINE_IDX_S 0 +#define GL_PSTEXT_XLT2_L2ADDR_LINE_IDX_M ICE_M(0x1FF, 0) +#define GL_PSTEXT_XLT2_L2ADDR_AUTO_INC_S 31 +#define GL_PSTEXT_XLT2_L2ADDR_AUTO_INC_M BIT(31) +#define GL_PSTEXT_XLT2_L2DATA(_i) (0x0020E0E4 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PSTEXT_XLT2_L2DATA_MAX_INDEX 2 +#define GL_PSTEXT_XLT2_L2DATA_DATA_S 0 +#define GL_PSTEXT_XLT2_L2DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GLFLXP_PTYPE_TRANSLATION(_i) (0x0045C000 + ((_i) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define GLFLXP_PTYPE_TRANSLATION_MAX_INDEX 255 +#define GLFLXP_PTYPE_TRANSLATION_PTYPE_4N_S 0 +#define GLFLXP_PTYPE_TRANSLATION_PTYPE_4N_M ICE_M(0xFF, 0) +#define GLFLXP_PTYPE_TRANSLATION_PTYPE_4N_1_S 8 +#define GLFLXP_PTYPE_TRANSLATION_PTYPE_4N_1_M ICE_M(0xFF, 8) +#define GLFLXP_PTYPE_TRANSLATION_PTYPE_4N_2_S 16 +#define GLFLXP_PTYPE_TRANSLATION_PTYPE_4N_2_M ICE_M(0xFF, 16) +#define GLFLXP_PTYPE_TRANSLATION_PTYPE_4N_3_S 24 +#define GLFLXP_PTYPE_TRANSLATION_PTYPE_4N_3_M ICE_M(0xFF, 24) +#define GLFLXP_RX_CMD_LX_PROT_IDX(_i) (0x0045C400 + ((_i) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define GLFLXP_RX_CMD_LX_PROT_IDX_MAX_INDEX 255 +#define GLFLXP_RX_CMD_LX_PROT_IDX_INNER_CLOUD_OFFSET_INDEX_S 0 +#define GLFLXP_RX_CMD_LX_PROT_IDX_INNER_CLOUD_OFFSET_INDEX_M ICE_M(0x7, 0) +#define GLFLXP_RX_CMD_LX_PROT_IDX_L4_OFFSET_INDEX_S 4 +#define GLFLXP_RX_CMD_LX_PROT_IDX_L4_OFFSET_INDEX_M ICE_M(0x7, 4) +#define GLFLXP_RX_CMD_LX_PROT_IDX_PAYLOAD_OFFSET_INDEX_S 8 +#define GLFLXP_RX_CMD_LX_PROT_IDX_PAYLOAD_OFFSET_INDEX_M ICE_M(0x7, 8) +#define GLFLXP_RX_CMD_LX_PROT_IDX_L3_PROTOCOL_S 12 +#define GLFLXP_RX_CMD_LX_PROT_IDX_L3_PROTOCOL_M ICE_M(0x3, 12) +#define GLFLXP_RX_CMD_LX_PROT_IDX_L4_PROTOCOL_S 14 +#define GLFLXP_RX_CMD_LX_PROT_IDX_L4_PROTOCOL_M ICE_M(0x3, 14) +#define GLFLXP_RX_CMD_PROTIDS(_i, _j) (0x0045A000 + ((_i) * 4 + (_j) * 1024)) /* _i=0...255, _j=0...5 */ /* Reset Source: CORER */ +#define GLFLXP_RX_CMD_PROTIDS_MAX_INDEX 255 +#define GLFLXP_RX_CMD_PROTIDS_PROTID_4N_S 0 +#define GLFLXP_RX_CMD_PROTIDS_PROTID_4N_M ICE_M(0xFF, 0) +#define GLFLXP_RX_CMD_PROTIDS_PROTID_4N_1_S 8 +#define GLFLXP_RX_CMD_PROTIDS_PROTID_4N_1_M ICE_M(0xFF, 8) +#define GLFLXP_RX_CMD_PROTIDS_PROTID_4N_2_S 16 +#define GLFLXP_RX_CMD_PROTIDS_PROTID_4N_2_M ICE_M(0xFF, 16) +#define GLFLXP_RX_CMD_PROTIDS_PROTID_4N_3_S 24 +#define GLFLXP_RX_CMD_PROTIDS_PROTID_4N_3_M ICE_M(0xFF, 24) +#define GLFLXP_RXDID_FLAGS(_i, _j) (0x0045D000 + ((_i) * 4 + (_j) * 256)) /* _i=0...63, _j=0...4 */ /* Reset Source: CORER */ +#define GLFLXP_RXDID_FLAGS_MAX_INDEX 63 #define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_S 0 #define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_M ICE_M(0x3F, 0) #define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_1_S 8 @@ -66,56 +3009,1480 @@ #define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_2_M ICE_M(0x3F, 16) #define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_3_S 24 #define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_3_M ICE_M(0x3F, 24) 
-#define GLFLXP_RXDID_FLX_WRD_0(_i) (0x0045c800 + ((_i) * 4)) +#define GLFLXP_RXDID_FLAGS1_OVERRIDE(_i) (0x0045D600 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GLFLXP_RXDID_FLAGS1_OVERRIDE_MAX_INDEX 63 +#define GLFLXP_RXDID_FLAGS1_OVERRIDE_FLEXIFLAGS1_OVERRIDE_S 0 +#define GLFLXP_RXDID_FLAGS1_OVERRIDE_FLEXIFLAGS1_OVERRIDE_M ICE_M(0xF, 0) +#define GLFLXP_RXDID_FLX_WRD_0(_i) (0x0045C800 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GLFLXP_RXDID_FLX_WRD_0_MAX_INDEX 63 #define GLFLXP_RXDID_FLX_WRD_0_PROT_MDID_S 0 #define GLFLXP_RXDID_FLX_WRD_0_PROT_MDID_M ICE_M(0xFF, 0) +#define GLFLXP_RXDID_FLX_WRD_0_EXTRACTION_OFFSET_S 8 +#define GLFLXP_RXDID_FLX_WRD_0_EXTRACTION_OFFSET_M ICE_M(0x3FF, 8) #define GLFLXP_RXDID_FLX_WRD_0_RXDID_OPCODE_S 30 #define GLFLXP_RXDID_FLX_WRD_0_RXDID_OPCODE_M ICE_M(0x3, 30) -#define GLFLXP_RXDID_FLX_WRD_1(_i) (0x0045c900 + ((_i) * 4)) +#define GLFLXP_RXDID_FLX_WRD_1(_i) (0x0045C900 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GLFLXP_RXDID_FLX_WRD_1_MAX_INDEX 63 #define GLFLXP_RXDID_FLX_WRD_1_PROT_MDID_S 0 #define GLFLXP_RXDID_FLX_WRD_1_PROT_MDID_M ICE_M(0xFF, 0) +#define GLFLXP_RXDID_FLX_WRD_1_EXTRACTION_OFFSET_S 8 +#define GLFLXP_RXDID_FLX_WRD_1_EXTRACTION_OFFSET_M ICE_M(0x3FF, 8) #define GLFLXP_RXDID_FLX_WRD_1_RXDID_OPCODE_S 30 #define GLFLXP_RXDID_FLX_WRD_1_RXDID_OPCODE_M ICE_M(0x3, 30) -#define GLFLXP_RXDID_FLX_WRD_2(_i) (0x0045ca00 + ((_i) * 4)) +#define GLFLXP_RXDID_FLX_WRD_2(_i) (0x0045CA00 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GLFLXP_RXDID_FLX_WRD_2_MAX_INDEX 63 #define GLFLXP_RXDID_FLX_WRD_2_PROT_MDID_S 0 #define GLFLXP_RXDID_FLX_WRD_2_PROT_MDID_M ICE_M(0xFF, 0) +#define GLFLXP_RXDID_FLX_WRD_2_EXTRACTION_OFFSET_S 8 +#define GLFLXP_RXDID_FLX_WRD_2_EXTRACTION_OFFSET_M ICE_M(0x3FF, 8) #define GLFLXP_RXDID_FLX_WRD_2_RXDID_OPCODE_S 30 #define GLFLXP_RXDID_FLX_WRD_2_RXDID_OPCODE_M ICE_M(0x3, 30) -#define GLFLXP_RXDID_FLX_WRD_3(_i) (0x0045cb00 + ((_i) * 4)) +#define GLFLXP_RXDID_FLX_WRD_3(_i) (0x0045CB00 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GLFLXP_RXDID_FLX_WRD_3_MAX_INDEX 63 #define GLFLXP_RXDID_FLX_WRD_3_PROT_MDID_S 0 #define GLFLXP_RXDID_FLX_WRD_3_PROT_MDID_M ICE_M(0xFF, 0) +#define GLFLXP_RXDID_FLX_WRD_3_EXTRACTION_OFFSET_S 8 +#define GLFLXP_RXDID_FLX_WRD_3_EXTRACTION_OFFSET_M ICE_M(0x3FF, 8) #define GLFLXP_RXDID_FLX_WRD_3_RXDID_OPCODE_S 30 #define GLFLXP_RXDID_FLX_WRD_3_RXDID_OPCODE_M ICE_M(0x3, 30) -#define QRXFLXP_CNTXT(_QRX) (0x00480000 + ((_QRX) * 4)) +#define GLFLXP_RXDID_FLX_WRD_4(_i) (0x0045CC00 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GLFLXP_RXDID_FLX_WRD_4_MAX_INDEX 63 +#define GLFLXP_RXDID_FLX_WRD_4_PROT_MDID_S 0 +#define GLFLXP_RXDID_FLX_WRD_4_PROT_MDID_M ICE_M(0xFF, 0) +#define GLFLXP_RXDID_FLX_WRD_4_EXTRACTION_OFFSET_S 8 +#define GLFLXP_RXDID_FLX_WRD_4_EXTRACTION_OFFSET_M ICE_M(0x3FF, 8) +#define GLFLXP_RXDID_FLX_WRD_4_RXDID_OPCODE_S 30 +#define GLFLXP_RXDID_FLX_WRD_4_RXDID_OPCODE_M ICE_M(0x3, 30) +#define GLFLXP_RXDID_FLX_WRD_5(_i) (0x0045CD00 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GLFLXP_RXDID_FLX_WRD_5_MAX_INDEX 63 +#define GLFLXP_RXDID_FLX_WRD_5_PROT_MDID_S 0 +#define GLFLXP_RXDID_FLX_WRD_5_PROT_MDID_M ICE_M(0xFF, 0) +#define GLFLXP_RXDID_FLX_WRD_5_EXTRACTION_OFFSET_S 8 +#define GLFLXP_RXDID_FLX_WRD_5_EXTRACTION_OFFSET_M ICE_M(0x3FF, 8) +#define GLFLXP_RXDID_FLX_WRD_5_RXDID_OPCODE_S 30 +#define GLFLXP_RXDID_FLX_WRD_5_RXDID_OPCODE_M ICE_M(0x3, 30) +#define GLFLXP_TX_SCHED_CORRECT(_i, 
_j) (0x00458000 + ((_i) * 4 + (_j) * 256)) /* _i=0...63, _j=0...31 */ /* Reset Source: CORER */ +#define GLFLXP_TX_SCHED_CORRECT_MAX_INDEX 63 +#define GLFLXP_TX_SCHED_CORRECT_PROTD_ID_2N_S 0 +#define GLFLXP_TX_SCHED_CORRECT_PROTD_ID_2N_M ICE_M(0xFF, 0) +#define GLFLXP_TX_SCHED_CORRECT_RECIPE_2N_S 8 +#define GLFLXP_TX_SCHED_CORRECT_RECIPE_2N_M ICE_M(0x1F, 8) +#define GLFLXP_TX_SCHED_CORRECT_PROTD_ID_2N_1_S 16 +#define GLFLXP_TX_SCHED_CORRECT_PROTD_ID_2N_1_M ICE_M(0xFF, 16) +#define GLFLXP_TX_SCHED_CORRECT_RECIPE_2N_1_S 24 +#define GLFLXP_TX_SCHED_CORRECT_RECIPE_2N_1_M ICE_M(0x1F, 24) +#define QRXFLXP_CNTXT(_QRX) (0x00480000 + ((_QRX) * 4)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define QRXFLXP_CNTXT_MAX_INDEX 2047 #define QRXFLXP_CNTXT_RXDID_IDX_S 0 #define QRXFLXP_CNTXT_RXDID_IDX_M ICE_M(0x3F, 0) #define QRXFLXP_CNTXT_RXDID_PRIO_S 8 #define QRXFLXP_CNTXT_RXDID_PRIO_M ICE_M(0x7, 8) -#define GLGEN_RSTAT 0x000B8188 +#define QRXFLXP_CNTXT_TS_S 11 +#define QRXFLXP_CNTXT_TS_M BIT(11) +#define GL_FWSTS 0x00083048 /* Reset Source: POR */ +#define GL_FWSTS_FWS0B_S 0 +#define GL_FWSTS_FWS0B_M ICE_M(0xFF, 0) +#define GL_FWSTS_FWROWD_S 8 +#define GL_FWSTS_FWROWD_M BIT(8) +#define GL_FWSTS_FWRI_S 9 +#define GL_FWSTS_FWRI_M BIT(9) +#define GL_FWSTS_FWS1B_S 16 +#define GL_FWSTS_FWS1B_M ICE_M(0xFF, 16) +#define GL_TCVMLR_DRAIN_CNTR_CTL 0x000A21E0 /* Reset Source: CORER */ +#define GL_TCVMLR_DRAIN_CNTR_CTL_OP_S 0 +#define GL_TCVMLR_DRAIN_CNTR_CTL_OP_M BIT(0) +#define GL_TCVMLR_DRAIN_CNTR_CTL_PORT_S 1 +#define GL_TCVMLR_DRAIN_CNTR_CTL_PORT_M ICE_M(0x7, 1) +#define GL_TCVMLR_DRAIN_CNTR_CTL_VALUE_S 4 +#define GL_TCVMLR_DRAIN_CNTR_CTL_VALUE_M ICE_M(0x3FFF, 4) +#define GL_TCVMLR_DRAIN_DONE_DEC 0x000A21A8 /* Reset Source: CORER */ +#define GL_TCVMLR_DRAIN_DONE_DEC_TARGET_S 0 +#define GL_TCVMLR_DRAIN_DONE_DEC_TARGET_M BIT(0) +#define GL_TCVMLR_DRAIN_DONE_DEC_INDEX_S 1 +#define GL_TCVMLR_DRAIN_DONE_DEC_INDEX_M ICE_M(0x1F, 1) +#define GL_TCVMLR_DRAIN_DONE_DEC_VALUE_S 6 +#define GL_TCVMLR_DRAIN_DONE_DEC_VALUE_M ICE_M(0xFF, 6) +#define GL_TCVMLR_DRAIN_DONE_TCLAN(_i) (0x000A20A8 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GL_TCVMLR_DRAIN_DONE_TCLAN_MAX_INDEX 31 +#define GL_TCVMLR_DRAIN_DONE_TCLAN_COUNT_S 0 +#define GL_TCVMLR_DRAIN_DONE_TCLAN_COUNT_M ICE_M(0xFF, 0) +#define GL_TCVMLR_DRAIN_DONE_TPB(_i) (0x000A2128 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GL_TCVMLR_DRAIN_DONE_TPB_MAX_INDEX 31 +#define GL_TCVMLR_DRAIN_DONE_TPB_COUNT_S 0 +#define GL_TCVMLR_DRAIN_DONE_TPB_COUNT_M ICE_M(0xFF, 0) +#define GL_TCVMLR_DRAIN_MARKER 0x000A2008 /* Reset Source: CORER */ +#define GL_TCVMLR_DRAIN_MARKER_PORT_S 0 +#define GL_TCVMLR_DRAIN_MARKER_PORT_M ICE_M(0x7, 0) +#define GL_TCVMLR_DRAIN_MARKER_TC_S 3 +#define GL_TCVMLR_DRAIN_MARKER_TC_M ICE_M(0x1F, 3) +#define GL_TCVMLR_ERR_STAT 0x000A2024 /* Reset Source: CORER */ +#define GL_TCVMLR_ERR_STAT_ERROR_S 0 +#define GL_TCVMLR_ERR_STAT_ERROR_M BIT(0) +#define GL_TCVMLR_ERR_STAT_FW_REQ_S 1 +#define GL_TCVMLR_ERR_STAT_FW_REQ_M BIT(1) +#define GL_TCVMLR_ERR_STAT_STAT_S 2 +#define GL_TCVMLR_ERR_STAT_STAT_M ICE_M(0x7, 2) +#define GL_TCVMLR_ERR_STAT_ENT_TYPE_S 5 +#define GL_TCVMLR_ERR_STAT_ENT_TYPE_M ICE_M(0x7, 5) +#define GL_TCVMLR_ERR_STAT_ENT_ID_S 8 +#define GL_TCVMLR_ERR_STAT_ENT_ID_M ICE_M(0x3FFF, 8) +#define GL_TCVMLR_QCFG 0x000A2010 /* Reset Source: CORER */ +#define GL_TCVMLR_QCFG_QID_S 0 +#define GL_TCVMLR_QCFG_QID_M ICE_M(0x3FFF, 0) +#define GL_TCVMLR_QCFG_OP_S 14 +#define GL_TCVMLR_QCFG_OP_M BIT(14) +#define GL_TCVMLR_QCFG_PORT_S 
15 +#define GL_TCVMLR_QCFG_PORT_M ICE_M(0x7, 15) +#define GL_TCVMLR_QCFG_TC_S 18 +#define GL_TCVMLR_QCFG_TC_M ICE_M(0x1F, 18) +#define GL_TCVMLR_QCFG_RD 0x000A2014 /* Reset Source: CORER */ +#define GL_TCVMLR_QCFG_RD_QID_S 0 +#define GL_TCVMLR_QCFG_RD_QID_M ICE_M(0x3FFF, 0) +#define GL_TCVMLR_QCFG_RD_PORT_S 14 +#define GL_TCVMLR_QCFG_RD_PORT_M ICE_M(0x7, 14) +#define GL_TCVMLR_QCFG_RD_TC_S 17 +#define GL_TCVMLR_QCFG_RD_TC_M ICE_M(0x1F, 17) +#define GL_TCVMLR_QCNTR 0x000A200C /* Reset Source: CORER */ +#define GL_TCVMLR_QCNTR_CNTR_S 0 +#define GL_TCVMLR_QCNTR_CNTR_M ICE_M(0x7FFF, 0) +#define GL_TCVMLR_QCTL 0x000A2004 /* Reset Source: CORER */ +#define GL_TCVMLR_QCTL_QID_S 0 +#define GL_TCVMLR_QCTL_QID_M ICE_M(0x3FFF, 0) +#define GL_TCVMLR_QCTL_OP_S 14 +#define GL_TCVMLR_QCTL_OP_M BIT(14) +#define GL_TCVMLR_REQ_STAT 0x000A2018 /* Reset Source: CORER */ +#define GL_TCVMLR_REQ_STAT_ENT_TYPE_S 0 +#define GL_TCVMLR_REQ_STAT_ENT_TYPE_M ICE_M(0x7, 0) +#define GL_TCVMLR_REQ_STAT_ENT_ID_S 3 +#define GL_TCVMLR_REQ_STAT_ENT_ID_M ICE_M(0x3FFF, 3) +#define GL_TCVMLR_REQ_STAT_OP_S 17 +#define GL_TCVMLR_REQ_STAT_OP_M BIT(17) +#define GL_TCVMLR_REQ_STAT_WRITE_STATUS_S 18 +#define GL_TCVMLR_REQ_STAT_WRITE_STATUS_M ICE_M(0x7, 18) +#define GL_TCVMLR_STAT 0x000A201C /* Reset Source: CORER */ +#define GL_TCVMLR_STAT_ENT_TYPE_S 0 +#define GL_TCVMLR_STAT_ENT_TYPE_M ICE_M(0x7, 0) +#define GL_TCVMLR_STAT_ENT_ID_S 3 +#define GL_TCVMLR_STAT_ENT_ID_M ICE_M(0x3FFF, 3) +#define GL_TCVMLR_STAT_STATUS_S 17 +#define GL_TCVMLR_STAT_STATUS_M ICE_M(0x7, 17) +#define GL_XLR_MARKER_TRIG_TCVMLR 0x000A2000 /* Reset Source: CORER */ +#define GL_XLR_MARKER_TRIG_TCVMLR_VM_VF_NUM_S 0 +#define GL_XLR_MARKER_TRIG_TCVMLR_VM_VF_NUM_M ICE_M(0x3FF, 0) +#define GL_XLR_MARKER_TRIG_TCVMLR_VM_VF_TYPE_S 10 +#define GL_XLR_MARKER_TRIG_TCVMLR_VM_VF_TYPE_M ICE_M(0x3, 10) +#define GL_XLR_MARKER_TRIG_TCVMLR_PF_NUM_S 12 +#define GL_XLR_MARKER_TRIG_TCVMLR_PF_NUM_M ICE_M(0x7, 12) +#define GL_XLR_MARKER_TRIG_TCVMLR_PORT_NUM_S 16 +#define GL_XLR_MARKER_TRIG_TCVMLR_PORT_NUM_M ICE_M(0x7, 16) +#define GL_XLR_MARKER_TRIG_VMLR 0x00093804 /* Reset Source: CORER */ +#define GL_XLR_MARKER_TRIG_VMLR_VM_VF_NUM_S 0 +#define GL_XLR_MARKER_TRIG_VMLR_VM_VF_NUM_M ICE_M(0x3FF, 0) +#define GL_XLR_MARKER_TRIG_VMLR_VM_VF_TYPE_S 10 +#define GL_XLR_MARKER_TRIG_VMLR_VM_VF_TYPE_M ICE_M(0x3, 10) +#define GL_XLR_MARKER_TRIG_VMLR_PF_NUM_S 12 +#define GL_XLR_MARKER_TRIG_VMLR_PF_NUM_M ICE_M(0x7, 12) +#define GL_XLR_MARKER_TRIG_VMLR_PORT_NUM_S 16 +#define GL_XLR_MARKER_TRIG_VMLR_PORT_NUM_M ICE_M(0x7, 16) +#define GLGEN_ANA_ABORT_PTYPE 0x0020C21C /* Reset Source: CORER */ +#define GLGEN_ANA_ABORT_PTYPE_ABORT_S 0 +#define GLGEN_ANA_ABORT_PTYPE_ABORT_M ICE_M(0x3FF, 0) +#define GLGEN_ANA_ALU_ACCSS_OUT_OF_PKT 0x0020C208 /* Reset Source: CORER */ +#define GLGEN_ANA_ALU_ACCSS_OUT_OF_PKT_NPC_S 0 +#define GLGEN_ANA_ALU_ACCSS_OUT_OF_PKT_NPC_M ICE_M(0xFF, 0) +#define GLGEN_ANA_CFG_CTRL 0x0020C104 /* Reset Source: CORER */ +#define GLGEN_ANA_CFG_CTRL_LINE_IDX_S 0 +#define GLGEN_ANA_CFG_CTRL_LINE_IDX_M ICE_M(0x3FFFF, 0) +#define GLGEN_ANA_CFG_CTRL_TABLE_ID_S 18 +#define GLGEN_ANA_CFG_CTRL_TABLE_ID_M ICE_M(0xFF, 18) +#define GLGEN_ANA_CFG_CTRL_RESRVED_S 26 +#define GLGEN_ANA_CFG_CTRL_RESRVED_M ICE_M(0x7, 26) +#define GLGEN_ANA_CFG_CTRL_OPERATION_ID_S 29 +#define GLGEN_ANA_CFG_CTRL_OPERATION_ID_M ICE_M(0x7, 29) +#define GLGEN_ANA_CFG_HTBL_LU_RESULT 0x0020C158 /* Reset Source: CORER */ +#define GLGEN_ANA_CFG_HTBL_LU_RESULT_HIT_S 0 +#define GLGEN_ANA_CFG_HTBL_LU_RESULT_HIT_M BIT(0) +#define 
GLGEN_ANA_CFG_HTBL_LU_RESULT_PG_MEM_IDX_S 1 +#define GLGEN_ANA_CFG_HTBL_LU_RESULT_PG_MEM_IDX_M ICE_M(0x7, 1) +#define GLGEN_ANA_CFG_HTBL_LU_RESULT_ADDR_S 4 +#define GLGEN_ANA_CFG_HTBL_LU_RESULT_ADDR_M ICE_M(0x1FF, 4) +#define GLGEN_ANA_CFG_LU_KEY(_i) (0x0020C14C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GLGEN_ANA_CFG_LU_KEY_MAX_INDEX 2 +#define GLGEN_ANA_CFG_LU_KEY_LU_KEY_S 0 +#define GLGEN_ANA_CFG_LU_KEY_LU_KEY_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_ANA_CFG_RDDATA(_i) (0x0020C10C + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLGEN_ANA_CFG_RDDATA_MAX_INDEX 15 +#define GLGEN_ANA_CFG_RDDATA_RD_DATA_S 0 +#define GLGEN_ANA_CFG_RDDATA_RD_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_ANA_CFG_SPLBUF_LU_RESULT 0x0020C15C /* Reset Source: CORER */ +#define GLGEN_ANA_CFG_SPLBUF_LU_RESULT_HIT_S 0 +#define GLGEN_ANA_CFG_SPLBUF_LU_RESULT_HIT_M BIT(0) +#define GLGEN_ANA_CFG_SPLBUF_LU_RESULT_RSV_S 1 +#define GLGEN_ANA_CFG_SPLBUF_LU_RESULT_RSV_M ICE_M(0x7, 1) +#define GLGEN_ANA_CFG_SPLBUF_LU_RESULT_ADDR_S 4 +#define GLGEN_ANA_CFG_SPLBUF_LU_RESULT_ADDR_M ICE_M(0x1FF, 4) +#define GLGEN_ANA_CFG_WRDATA 0x0020C108 /* Reset Source: CORER */ +#define GLGEN_ANA_CFG_WRDATA_WR_DATA_S 0 +#define GLGEN_ANA_CFG_WRDATA_WR_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_ANA_DEF_PTYPE 0x0020C100 /* Reset Source: CORER */ +#define GLGEN_ANA_DEF_PTYPE_DEF_PTYPE_S 0 +#define GLGEN_ANA_DEF_PTYPE_DEF_PTYPE_M ICE_M(0x3FF, 0) +#define GLGEN_ANA_ERR_CTRL 0x0020C220 /* Reset Source: CORER */ +#define GLGEN_ANA_ERR_CTRL_ERR_MASK_EN_S 0 +#define GLGEN_ANA_ERR_CTRL_ERR_MASK_EN_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_ANA_FLAG_MAP(_i) (0x0020C000 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GLGEN_ANA_FLAG_MAP_MAX_INDEX 63 +#define GLGEN_ANA_FLAG_MAP_FLAG_EN_S 0 +#define GLGEN_ANA_FLAG_MAP_FLAG_EN_M BIT(0) +#define GLGEN_ANA_FLAG_MAP_EXT_FLAG_ID_S 1 +#define GLGEN_ANA_FLAG_MAP_EXT_FLAG_ID_M ICE_M(0x3F, 1) +#define GLGEN_ANA_INV_NODE_PTYPE 0x0020C210 /* Reset Source: CORER */ +#define GLGEN_ANA_INV_NODE_PTYPE_INV_NODE_PTYPE_S 0 +#define GLGEN_ANA_INV_NODE_PTYPE_INV_NODE_PTYPE_M ICE_M(0x7FF, 0) +#define GLGEN_ANA_INV_PTYPE_MARKER 0x0020C218 /* Reset Source: CORER */ +#define GLGEN_ANA_INV_PTYPE_MARKER_INV_PTYPE_MARKER_S 0 +#define GLGEN_ANA_INV_PTYPE_MARKER_INV_PTYPE_MARKER_M ICE_M(0x7F, 0) +#define GLGEN_ANA_LAST_PROT_ID(_i) (0x0020C1E4 + ((_i) * 4)) /* _i=0...5 */ /* Reset Source: CORER */ +#define GLGEN_ANA_LAST_PROT_ID_MAX_INDEX 5 +#define GLGEN_ANA_LAST_PROT_ID_EN_S 0 +#define GLGEN_ANA_LAST_PROT_ID_EN_M BIT(0) +#define GLGEN_ANA_LAST_PROT_ID_PROT_ID_S 1 +#define GLGEN_ANA_LAST_PROT_ID_PROT_ID_M ICE_M(0xFF, 1) +#define GLGEN_ANA_NMPG_KEYMASK(_i) (0x0020C1D0 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */ +#define GLGEN_ANA_NMPG_KEYMASK_MAX_INDEX 3 +#define GLGEN_ANA_NMPG_KEYMASK_HASH_KEY_S 0 +#define GLGEN_ANA_NMPG_KEYMASK_HASH_KEY_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_ANA_NMPG0_HASHKEY(_i) (0x0020C1B0 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */ +#define GLGEN_ANA_NMPG0_HASHKEY_MAX_INDEX 3 +#define GLGEN_ANA_NMPG0_HASHKEY_HASH_KEY_S 0 +#define GLGEN_ANA_NMPG0_HASHKEY_HASH_KEY_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_ANA_NO_HIT_PG_NM_PG 0x0020C204 /* Reset Source: CORER */ +#define GLGEN_ANA_NO_HIT_PG_NM_PG_NPC_S 0 +#define GLGEN_ANA_NO_HIT_PG_NM_PG_NPC_M ICE_M(0xFF, 0) +#define GLGEN_ANA_OUT_OF_PKT 0x0020C200 /* Reset Source: CORER */ +#define GLGEN_ANA_OUT_OF_PKT_NPC_S 0 +#define GLGEN_ANA_OUT_OF_PKT_NPC_M ICE_M(0xFF, 0) +#define GLGEN_ANA_P2P(_i) (0x0020C160 + ((_i) 
* 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLGEN_ANA_P2P_MAX_INDEX 15 +#define GLGEN_ANA_P2P_TARGET_PROF_S 0 +#define GLGEN_ANA_P2P_TARGET_PROF_M ICE_M(0xF, 0) +#define GLGEN_ANA_PG_KEYMASK(_i) (0x0020C1C0 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */ +#define GLGEN_ANA_PG_KEYMASK_MAX_INDEX 3 +#define GLGEN_ANA_PG_KEYMASK_HASH_KEY_S 0 +#define GLGEN_ANA_PG_KEYMASK_HASH_KEY_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_ANA_PG0_HASHKEY(_i) (0x0020C1A0 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */ +#define GLGEN_ANA_PG0_HASHKEY_MAX_INDEX 3 +#define GLGEN_ANA_PG0_HASHKEY_HASH_KEY_S 0 +#define GLGEN_ANA_PG0_HASHKEY_HASH_KEY_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_ANA_PROFIL_CTRL 0x0020C1FC /* Reset Source: CORER */ +#define GLGEN_ANA_PROFIL_CTRL_PROFILE_SELECT_MDID_S 0 +#define GLGEN_ANA_PROFIL_CTRL_PROFILE_SELECT_MDID_M ICE_M(0x1F, 0) +#define GLGEN_ANA_PROFIL_CTRL_PROFILE_SELECT_MDSTART_S 5 +#define GLGEN_ANA_PROFIL_CTRL_PROFILE_SELECT_MDSTART_M ICE_M(0xF, 5) +#define GLGEN_ANA_PROFIL_CTRL_PROFILE_SELECT_MD_LEN_S 9 +#define GLGEN_ANA_PROFIL_CTRL_PROFILE_SELECT_MD_LEN_M ICE_M(0x1F, 9) +#define GLGEN_ANA_PROFIL_CTRL_NUM_CTRL_DOMAIN_S 14 +#define GLGEN_ANA_PROFIL_CTRL_NUM_CTRL_DOMAIN_M ICE_M(0x3, 14) +#define GLGEN_ANA_PROFIL_CTRL_DEF_PROF_ID_S 16 +#define GLGEN_ANA_PROFIL_CTRL_DEF_PROF_ID_M ICE_M(0xF, 16) +#define GLGEN_ANA_PROFIL_CTRL_SEL_DEF_PROF_ID_S 20 +#define GLGEN_ANA_PROFIL_CTRL_SEL_DEF_PROF_ID_M BIT(20) +#define GLGEN_ANA_TX_ABORT_PTYPE 0x0020D21C /* Reset Source: CORER */ +#define GLGEN_ANA_TX_ABORT_PTYPE_ABORT_S 0 +#define GLGEN_ANA_TX_ABORT_PTYPE_ABORT_M ICE_M(0x3FF, 0) +#define GLGEN_ANA_TX_ALU_ACCSS_OUT_OF_PKT 0x0020D208 /* Reset Source: CORER */ +#define GLGEN_ANA_TX_ALU_ACCSS_OUT_OF_PKT_NPC_S 0 +#define GLGEN_ANA_TX_ALU_ACCSS_OUT_OF_PKT_NPC_M ICE_M(0xFF, 0) +#define GLGEN_ANA_TX_CFG_CTRL 0x0020D104 /* Reset Source: CORER */ +#define GLGEN_ANA_TX_CFG_CTRL_LINE_IDX_S 0 +#define GLGEN_ANA_TX_CFG_CTRL_LINE_IDX_M ICE_M(0x3FFFF, 0) +#define GLGEN_ANA_TX_CFG_CTRL_TABLE_ID_S 18 +#define GLGEN_ANA_TX_CFG_CTRL_TABLE_ID_M ICE_M(0xFF, 18) +#define GLGEN_ANA_TX_CFG_CTRL_RESRVED_S 26 +#define GLGEN_ANA_TX_CFG_CTRL_RESRVED_M ICE_M(0x7, 26) +#define GLGEN_ANA_TX_CFG_CTRL_OPERATION_ID_S 29 +#define GLGEN_ANA_TX_CFG_CTRL_OPERATION_ID_M ICE_M(0x7, 29) +#define GLGEN_ANA_TX_CFG_HTBL_LU_RESULT 0x0020D158 /* Reset Source: CORER */ +#define GLGEN_ANA_TX_CFG_HTBL_LU_RESULT_HIT_S 0 +#define GLGEN_ANA_TX_CFG_HTBL_LU_RESULT_HIT_M BIT(0) +#define GLGEN_ANA_TX_CFG_HTBL_LU_RESULT_PG_MEM_IDX_S 1 +#define GLGEN_ANA_TX_CFG_HTBL_LU_RESULT_PG_MEM_IDX_M ICE_M(0x7, 1) +#define GLGEN_ANA_TX_CFG_HTBL_LU_RESULT_ADDR_S 4 +#define GLGEN_ANA_TX_CFG_HTBL_LU_RESULT_ADDR_M ICE_M(0x1FF, 4) +#define GLGEN_ANA_TX_CFG_LU_KEY(_i) (0x0020D14C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GLGEN_ANA_TX_CFG_LU_KEY_MAX_INDEX 2 +#define GLGEN_ANA_TX_CFG_LU_KEY_LU_KEY_S 0 +#define GLGEN_ANA_TX_CFG_LU_KEY_LU_KEY_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_ANA_TX_CFG_RDDATA(_i) (0x0020D10C + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLGEN_ANA_TX_CFG_RDDATA_MAX_INDEX 15 +#define GLGEN_ANA_TX_CFG_RDDATA_RD_DATA_S 0 +#define GLGEN_ANA_TX_CFG_RDDATA_RD_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_ANA_TX_CFG_SPLBUF_LU_RESULT 0x0020D15C /* Reset Source: CORER */ +#define GLGEN_ANA_TX_CFG_SPLBUF_LU_RESULT_HIT_S 0 +#define GLGEN_ANA_TX_CFG_SPLBUF_LU_RESULT_HIT_M BIT(0) +#define GLGEN_ANA_TX_CFG_SPLBUF_LU_RESULT_RSV_S 1 +#define GLGEN_ANA_TX_CFG_SPLBUF_LU_RESULT_RSV_M ICE_M(0x7, 1) +#define 
GLGEN_ANA_TX_CFG_SPLBUF_LU_RESULT_ADDR_S 4 +#define GLGEN_ANA_TX_CFG_SPLBUF_LU_RESULT_ADDR_M ICE_M(0x1FF, 4) +#define GLGEN_ANA_TX_CFG_WRDATA 0x0020D108 /* Reset Source: CORER */ +#define GLGEN_ANA_TX_CFG_WRDATA_WR_DATA_S 0 +#define GLGEN_ANA_TX_CFG_WRDATA_WR_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_ANA_TX_DEF_PTYPE 0x0020D100 /* Reset Source: CORER */ +#define GLGEN_ANA_TX_DEF_PTYPE_DEF_PTYPE_S 0 +#define GLGEN_ANA_TX_DEF_PTYPE_DEF_PTYPE_M ICE_M(0x3FF, 0) +#define GLGEN_ANA_TX_DFD_PACE_OUT 0x0020D4CC /* Reset Source: CORER */ +#define GLGEN_ANA_TX_DFD_PACE_OUT_PUSH_S 0 +#define GLGEN_ANA_TX_DFD_PACE_OUT_PUSH_M BIT(0) +#define GLGEN_ANA_TX_ERR_CTRL 0x0020D220 /* Reset Source: CORER */ +#define GLGEN_ANA_TX_ERR_CTRL_ERR_MASK_EN_S 0 +#define GLGEN_ANA_TX_ERR_CTRL_ERR_MASK_EN_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_ANA_TX_FLAG_MAP(_i) (0x0020D000 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GLGEN_ANA_TX_FLAG_MAP_MAX_INDEX 63 +#define GLGEN_ANA_TX_FLAG_MAP_FLAG_EN_S 0 +#define GLGEN_ANA_TX_FLAG_MAP_FLAG_EN_M BIT(0) +#define GLGEN_ANA_TX_FLAG_MAP_EXT_FLAG_ID_S 1 +#define GLGEN_ANA_TX_FLAG_MAP_EXT_FLAG_ID_M ICE_M(0x3F, 1) +#define GLGEN_ANA_TX_INV_NODE_PTYPE 0x0020D210 /* Reset Source: CORER */ +#define GLGEN_ANA_TX_INV_NODE_PTYPE_INV_NODE_PTYPE_S 0 +#define GLGEN_ANA_TX_INV_NODE_PTYPE_INV_NODE_PTYPE_M ICE_M(0x7FF, 0) +#define GLGEN_ANA_TX_INV_PROT_ID 0x0020D214 /* Reset Source: CORER */ +#define GLGEN_ANA_TX_INV_PROT_ID_INV_PROT_ID_S 0 +#define GLGEN_ANA_TX_INV_PROT_ID_INV_PROT_ID_M ICE_M(0xFF, 0) +#define GLGEN_ANA_TX_INV_PTYPE_MARKER 0x0020D218 /* Reset Source: CORER */ +#define GLGEN_ANA_TX_INV_PTYPE_MARKER_INV_PTYPE_MARKER_S 0 +#define GLGEN_ANA_TX_INV_PTYPE_MARKER_INV_PTYPE_MARKER_M ICE_M(0x7F, 0) +#define GLGEN_ANA_TX_NMPG_KEYMASK(_i) (0x0020D1D0 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */ +#define GLGEN_ANA_TX_NMPG_KEYMASK_MAX_INDEX 3 +#define GLGEN_ANA_TX_NMPG_KEYMASK_HASH_KEY_S 0 +#define GLGEN_ANA_TX_NMPG_KEYMASK_HASH_KEY_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_ANA_TX_NMPG0_HASHKEY(_i) (0x0020D1B0 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */ +#define GLGEN_ANA_TX_NMPG0_HASHKEY_MAX_INDEX 3 +#define GLGEN_ANA_TX_NMPG0_HASHKEY_HASH_KEY_S 0 +#define GLGEN_ANA_TX_NMPG0_HASHKEY_HASH_KEY_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_ANA_TX_NO_HIT_PG_NM_PG 0x0020D204 /* Reset Source: CORER */ +#define GLGEN_ANA_TX_NO_HIT_PG_NM_PG_NPC_S 0 +#define GLGEN_ANA_TX_NO_HIT_PG_NM_PG_NPC_M ICE_M(0xFF, 0) +#define GLGEN_ANA_TX_P2P(_i) (0x0020D160 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLGEN_ANA_TX_P2P_MAX_INDEX 15 +#define GLGEN_ANA_TX_P2P_TARGET_PROF_S 0 +#define GLGEN_ANA_TX_P2P_TARGET_PROF_M ICE_M(0xF, 0) +#define GLGEN_ANA_TX_PG_KEYMASK(_i) (0x0020D1C0 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */ +#define GLGEN_ANA_TX_PG_KEYMASK_MAX_INDEX 3 +#define GLGEN_ANA_TX_PG_KEYMASK_HASH_KEY_S 0 +#define GLGEN_ANA_TX_PG_KEYMASK_HASH_KEY_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_ANA_TX_PG0_HASHKEY(_i) (0x0020D1A0 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */ +#define GLGEN_ANA_TX_PG0_HASHKEY_MAX_INDEX 3 +#define GLGEN_ANA_TX_PG0_HASHKEY_HASH_KEY_S 0 +#define GLGEN_ANA_TX_PG0_HASHKEY_HASH_KEY_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_ANA_TX_PROFIL_CTRL 0x0020D1FC /* Reset Source: CORER */ +#define GLGEN_ANA_TX_PROFIL_CTRL_PROFILE_SELECT_MDID_S 0 +#define GLGEN_ANA_TX_PROFIL_CTRL_PROFILE_SELECT_MDID_M ICE_M(0x1F, 0) +#define GLGEN_ANA_TX_PROFIL_CTRL_PROFILE_SELECT_MDSTART_S 5 +#define GLGEN_ANA_TX_PROFIL_CTRL_PROFILE_SELECT_MDSTART_M ICE_M(0xF, 
5) +#define GLGEN_ANA_TX_PROFIL_CTRL_PROFILE_SELECT_MD_LEN_S 9 +#define GLGEN_ANA_TX_PROFIL_CTRL_PROFILE_SELECT_MD_LEN_M ICE_M(0x1F, 9) +#define GLGEN_ANA_TX_PROFIL_CTRL_NUM_CTRL_DOMAIN_S 14 +#define GLGEN_ANA_TX_PROFIL_CTRL_NUM_CTRL_DOMAIN_M ICE_M(0x3, 14) +#define GLGEN_ANA_TX_PROFIL_CTRL_DEF_PROF_ID_S 16 +#define GLGEN_ANA_TX_PROFIL_CTRL_DEF_PROF_ID_M ICE_M(0xF, 16) +#define GLGEN_ANA_TX_PROFIL_CTRL_SEL_DEF_PROF_ID_S 20 +#define GLGEN_ANA_TX_PROFIL_CTRL_SEL_DEF_PROF_ID_M BIT(20) +#define GLGEN_ASSERT_HLP 0x000B81E4 /* Reset Source: POR */ +#define GLGEN_ASSERT_HLP_CORE_ON_RST_S 0 +#define GLGEN_ASSERT_HLP_CORE_ON_RST_M BIT(0) +#define GLGEN_ASSERT_HLP_FULL_ON_RST_S 1 +#define GLGEN_ASSERT_HLP_FULL_ON_RST_M BIT(1) +#define GLGEN_CLKSTAT 0x000B8184 /* Reset Source: POR */ +#define GLGEN_CLKSTAT_U_CLK_SPEED_S 0 +#define GLGEN_CLKSTAT_U_CLK_SPEED_M ICE_M(0x7, 0) +#define GLGEN_CLKSTAT_L_CLK_SPEED_S 3 +#define GLGEN_CLKSTAT_L_CLK_SPEED_M ICE_M(0x7, 3) +#define GLGEN_CLKSTAT_PSM_CLK_SPEED_S 6 +#define GLGEN_CLKSTAT_PSM_CLK_SPEED_M ICE_M(0x7, 6) +#define GLGEN_CLKSTAT_RXCTL_CLK_SPEED_S 9 +#define GLGEN_CLKSTAT_RXCTL_CLK_SPEED_M ICE_M(0x7, 9) +#define GLGEN_CLKSTAT_UANA_CLK_SPEED_S 12 +#define GLGEN_CLKSTAT_UANA_CLK_SPEED_M ICE_M(0x7, 12) +#define GLGEN_CLKSTAT_PE_CLK_SPEED_S 18 +#define GLGEN_CLKSTAT_PE_CLK_SPEED_M ICE_M(0x7, 18) +#define GLGEN_CLKSTAT_SRC 0x000B826C /* Reset Source: POR */ +#define GLGEN_CLKSTAT_SRC_U_CLK_SRC_S 0 +#define GLGEN_CLKSTAT_SRC_U_CLK_SRC_M ICE_M(0x3, 0) +#define GLGEN_CLKSTAT_SRC_L_CLK_SRC_S 2 +#define GLGEN_CLKSTAT_SRC_L_CLK_SRC_M ICE_M(0x3, 2) +#define GLGEN_CLKSTAT_SRC_PSM_CLK_SRC_S 4 +#define GLGEN_CLKSTAT_SRC_PSM_CLK_SRC_M ICE_M(0x3, 4) +#define GLGEN_CLKSTAT_SRC_RXCTL_CLK_SRC_S 6 +#define GLGEN_CLKSTAT_SRC_RXCTL_CLK_SRC_M ICE_M(0x3, 6) +#define GLGEN_CLKSTAT_SRC_UANA_CLK_SRC_S 8 +#define GLGEN_CLKSTAT_SRC_UANA_CLK_SRC_M ICE_M(0xF, 8) +#define GLGEN_ECC_ERR_INT_TOG_MASK_H 0x00093A00 /* Reset Source: CORER */ +#define GLGEN_ECC_ERR_INT_TOG_MASK_H_CLIENT_NUM_S 0 +#define GLGEN_ECC_ERR_INT_TOG_MASK_H_CLIENT_NUM_M ICE_M(0x7F, 0) +#define GLGEN_ECC_ERR_INT_TOG_MASK_L 0x000939FC /* Reset Source: CORER */ +#define GLGEN_ECC_ERR_INT_TOG_MASK_L_CLIENT_NUM_S 0 +#define GLGEN_ECC_ERR_INT_TOG_MASK_L_CLIENT_NUM_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_ECC_ERR_RST_MASK_H 0x000939F8 /* Reset Source: CORER */ +#define GLGEN_ECC_ERR_RST_MASK_H_CLIENT_NUM_S 0 +#define GLGEN_ECC_ERR_RST_MASK_H_CLIENT_NUM_M ICE_M(0x7F, 0) +#define GLGEN_ECC_ERR_RST_MASK_L 0x000939F4 /* Reset Source: CORER */ +#define GLGEN_ECC_ERR_RST_MASK_L_CLIENT_NUM_S 0 +#define GLGEN_ECC_ERR_RST_MASK_L_CLIENT_NUM_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_GPIO_CTL(_i) (0x000880C8 + ((_i) * 4)) /* _i=0...6 */ /* Reset Source: POR */ +#define GLGEN_GPIO_CTL_MAX_INDEX 6 +#define GLGEN_GPIO_CTL_IN_VALUE_S 0 +#define GLGEN_GPIO_CTL_IN_VALUE_M BIT(0) +#define GLGEN_GPIO_CTL_IN_TRANSIT_S 1 +#define GLGEN_GPIO_CTL_IN_TRANSIT_M BIT(1) +#define GLGEN_GPIO_CTL_OUT_VALUE_S 2 +#define GLGEN_GPIO_CTL_OUT_VALUE_M BIT(2) +#define GLGEN_GPIO_CTL_NO_P_UP_S 3 +#define GLGEN_GPIO_CTL_NO_P_UP_M BIT(3) +#define GLGEN_GPIO_CTL_PIN_DIR_S 4 +#define GLGEN_GPIO_CTL_PIN_DIR_M BIT(4) +#define GLGEN_GPIO_CTL_TRI_CTL_S 5 +#define GLGEN_GPIO_CTL_TRI_CTL_M BIT(5) +#define GLGEN_GPIO_CTL_PIN_FUNC_S 8 +#define GLGEN_GPIO_CTL_PIN_FUNC_M ICE_M(0xF, 8) +#define GLGEN_GPIO_CTL_INT_MODE_S 12 +#define GLGEN_GPIO_CTL_INT_MODE_M ICE_M(0x3, 12) +#define GLGEN_MARKER_COUNT 0x000939E8 /* Reset Source: CORER */ +#define GLGEN_MARKER_COUNT_MARKER_COUNT_S 0 
+#define GLGEN_MARKER_COUNT_MARKER_COUNT_M ICE_M(0xFF, 0) +#define GLGEN_MARKER_COUNT_MARKER_COUNT_EN_S 31 +#define GLGEN_MARKER_COUNT_MARKER_COUNT_EN_M BIT(31) +#define GLGEN_RSTAT 0x000B8188 /* Reset Source: POR */ +#define GLGEN_RSTAT_DEVSTATE_S 0 #define GLGEN_RSTAT_DEVSTATE_M ICE_M(0x3, 0) -#define GLGEN_RSTCTL 0x000B8180 -#define GLGEN_RSTCTL_GRSTDEL_S 0 -#define GLGEN_RSTCTL_GRSTDEL_M ICE_M(0x3F, GLGEN_RSTCTL_GRSTDEL_S) #define GLGEN_RSTAT_RESET_TYPE_S 2 #define GLGEN_RSTAT_RESET_TYPE_M ICE_M(0x3, 2) -#define GLGEN_RTRIG 0x000B8190 +#define GLGEN_RSTAT_CORERCNT_S 4 +#define GLGEN_RSTAT_CORERCNT_M ICE_M(0x3, 4) +#define GLGEN_RSTAT_GLOBRCNT_S 6 +#define GLGEN_RSTAT_GLOBRCNT_M ICE_M(0x3, 6) +#define GLGEN_RSTAT_EMPRCNT_S 8 +#define GLGEN_RSTAT_EMPRCNT_M ICE_M(0x3, 8) +#define GLGEN_RSTAT_TIME_TO_RST_S 10 +#define GLGEN_RSTAT_TIME_TO_RST_M ICE_M(0x3F, 10) +#define GLGEN_RSTAT_RTRIG_FLR_S 16 +#define GLGEN_RSTAT_RTRIG_FLR_M BIT(16) +#define GLGEN_RSTAT_RTRIG_ECC_S 17 +#define GLGEN_RSTAT_RTRIG_ECC_M BIT(17) +#define GLGEN_RSTAT_RTRIG_FW_AUX_S 18 +#define GLGEN_RSTAT_RTRIG_FW_AUX_M BIT(18) +#define GLGEN_RSTCTL 0x000B8180 /* Reset Source: POR */ +#define GLGEN_RSTCTL_GRSTDEL_S 0 +#define GLGEN_RSTCTL_GRSTDEL_M ICE_M(0x3F, 0) +#define GLGEN_RSTCTL_ECC_RST_ENA_S 8 +#define GLGEN_RSTCTL_ECC_RST_ENA_M BIT(8) +#define GLGEN_RSTCTL_ECC_RT_EN_S 30 +#define GLGEN_RSTCTL_ECC_RT_EN_M BIT(30) +#define GLGEN_RSTCTL_FLR_RT_EN_S 31 +#define GLGEN_RSTCTL_FLR_RT_EN_M BIT(31) +#define GLGEN_RTRIG 0x000B8190 /* Reset Source: CORER */ +#define GLGEN_RTRIG_CORER_S 0 #define GLGEN_RTRIG_CORER_M BIT(0) +#define GLGEN_RTRIG_GLOBR_S 1 #define GLGEN_RTRIG_GLOBR_M BIT(1) -#define GLGEN_STAT 0x000B612C -#define GLGEN_VFLRSTAT(_i) (0x00093A04 + ((_i) * 4)) -#define PFGEN_CTRL 0x00091000 +#define GLGEN_RTRIG_EMPFWR_S 2 +#define GLGEN_RTRIG_EMPFWR_M BIT(2) +#define GLGEN_STAT 0x000B612C /* Reset Source: POR */ +#define GLGEN_STAT_RSVD4FW_S 0 +#define GLGEN_STAT_RSVD4FW_M ICE_M(0xFF, 0) +#define GLGEN_VFLRSTAT(_i) (0x00093A04 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLGEN_VFLRSTAT_MAX_INDEX 7 +#define GLGEN_VFLRSTAT_VFLRS_S 0 +#define GLGEN_VFLRSTAT_VFLRS_M ICE_M(0xFFFFFFFF, 0) +#define GLGEN_XLR_MSK2HLP_RDY 0x000939F0 /* Reset Source: CORER */ +#define GLGEN_XLR_MSK2HLP_RDY_GLGEN_XLR_MSK2HLP_RDY_S 0 +#define GLGEN_XLR_MSK2HLP_RDY_GLGEN_XLR_MSK2HLP_RDY_M BIT(0) +#define GLGEN_XLR_TRNS_WAIT_COUNT 0x000939EC /* Reset Source: CORER */ +#define GLGEN_XLR_TRNS_WAIT_COUNT_W_BTWN_TRNS_COUNT_S 0 +#define GLGEN_XLR_TRNS_WAIT_COUNT_W_BTWN_TRNS_COUNT_M ICE_M(0x1F, 0) +#define GLGEN_XLR_TRNS_WAIT_COUNT_W_PEND_TRNS_COUNT_S 8 +#define GLGEN_XLR_TRNS_WAIT_COUNT_W_PEND_TRNS_COUNT_M ICE_M(0xFF, 8) +#define GLVFGEN_TIMER 0x000B8214 /* Reset Source: POR */ +#define GLVFGEN_TIMER_GTIME_S 0 +#define GLVFGEN_TIMER_GTIME_M ICE_M(0xFFFFFFFF, 0) +#define PFGEN_CTRL 0x00091000 /* Reset Source: CORER */ +#define PFGEN_CTRL_PFSWR_S 0 #define PFGEN_CTRL_PFSWR_M BIT(0) -#define PFGEN_STATE 0x00088000 -#define PRTGEN_STATUS 0x000B8100 -#define VFGEN_RSTAT(_VF) (0x00074000 + ((_VF) * 4)) -#define VPGEN_VFRSTAT(_VF) (0x00090800 + ((_VF) * 4)) +#define PFGEN_DRUN 0x00091180 /* Reset Source: CORER */ +#define PFGEN_DRUN_DRVUNLD_S 0 +#define PFGEN_DRUN_DRVUNLD_M BIT(0) +#define PFGEN_PFRSTAT 0x00091080 /* Reset Source: CORER */ +#define PFGEN_PFRSTAT_PFRD_S 0 +#define PFGEN_PFRSTAT_PFRD_M BIT(0) +#define PFGEN_PORTNUM 0x001D2400 /* Reset Source: CORER */ +#define PFGEN_PORTNUM_PORT_NUM_S 0 +#define PFGEN_PORTNUM_PORT_NUM_M 
ICE_M(0x7, 0) +#define PFGEN_STATE 0x00088000 /* Reset Source: CORER */ +#define PFGEN_STATE_PFPEEN_S 0 +#define PFGEN_STATE_PFPEEN_M BIT(0) +#define PFGEN_STATE_RSVD_S 1 +#define PFGEN_STATE_RSVD_M BIT(1) +#define PFGEN_STATE_PFLINKEN_S 2 +#define PFGEN_STATE_PFLINKEN_M BIT(2) +#define PFGEN_STATE_PFSCEN_S 3 +#define PFGEN_STATE_PFSCEN_M BIT(3) +#define PRT_TCVMLR_DRAIN_CNTR 0x000A21C0 /* Reset Source: CORER */ +#define PRT_TCVMLR_DRAIN_CNTR_CNTR_S 0 +#define PRT_TCVMLR_DRAIN_CNTR_CNTR_M ICE_M(0x3FFF, 0) +#define PRTGEN_CNF 0x000B8120 /* Reset Source: POR */ +#define PRTGEN_CNF_PORT_DIS_S 0 +#define PRTGEN_CNF_PORT_DIS_M BIT(0) +#define PRTGEN_CNF_ALLOW_PORT_DIS_S 1 +#define PRTGEN_CNF_ALLOW_PORT_DIS_M BIT(1) +#define PRTGEN_CNF_EMP_PORT_DIS_S 2 +#define PRTGEN_CNF_EMP_PORT_DIS_M BIT(2) +#define PRTGEN_CNF2 0x000B8160 /* Reset Source: POR */ +#define PRTGEN_CNF2_ACTIVATE_PORT_LINK_S 0 +#define PRTGEN_CNF2_ACTIVATE_PORT_LINK_M BIT(0) +#define PRTGEN_CNF3 0x000B8280 /* Reset Source: POR */ +#define PRTGEN_CNF3_PORT_STAGERING_EN_S 0 +#define PRTGEN_CNF3_PORT_STAGERING_EN_M BIT(0) +#define PRTGEN_STATUS 0x000B8100 /* Reset Source: POR */ +#define PRTGEN_STATUS_PORT_VALID_S 0 +#define PRTGEN_STATUS_PORT_VALID_M BIT(0) +#define PRTGEN_STATUS_PORT_ACTIVE_S 1 +#define PRTGEN_STATUS_PORT_ACTIVE_M BIT(1) +#define VFGEN_RSTAT(_VF) (0x00074000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: VFR */ +#define VFGEN_RSTAT_MAX_INDEX 255 +#define VFGEN_RSTAT_VFR_STATE_S 0 +#define VFGEN_RSTAT_VFR_STATE_M ICE_M(0x3, 0) +#define VPGEN_VFRSTAT(_VF) (0x00090800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPGEN_VFRSTAT_MAX_INDEX 255 +#define VPGEN_VFRSTAT_VFRD_S 0 #define VPGEN_VFRSTAT_VFRD_M BIT(0) -#define VPGEN_VFRTRIG(_VF) (0x00090000 + ((_VF) * 4)) +#define VPGEN_VFRTRIG(_VF) (0x00090000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPGEN_VFRTRIG_MAX_INDEX 255 +#define VPGEN_VFRTRIG_VFSWR_S 0 #define VPGEN_VFRTRIG_VFSWR_M BIT(0) -#define PFHMC_ERRORDATA 0x00520500 -#define PFHMC_ERRORINFO 0x00520400 -#define GLINT_CTL 0x0016CC54 +#define VSIGEN_RSTAT(_VSI) (0x00092800 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSIGEN_RSTAT_MAX_INDEX 767 +#define VSIGEN_RSTAT_VMRD_S 0 +#define VSIGEN_RSTAT_VMRD_M BIT(0) +#define VSIGEN_RTRIG(_VSI) (0x00091800 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSIGEN_RTRIG_MAX_INDEX 767 +#define VSIGEN_RTRIG_VMSWR_S 0 +#define VSIGEN_RTRIG_VMSWR_M BIT(0) +#define GLHMC_APBVTINUSEBASE(_i) (0x00524A00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_APBVTINUSEBASE_MAX_INDEX 7 +#define GLHMC_APBVTINUSEBASE_FPMAPBINUSEBASE_S 0 +#define GLHMC_APBVTINUSEBASE_FPMAPBINUSEBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_CEQPART(_i) (0x005031C0 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_CEQPART_MAX_INDEX 7 +#define GLHMC_CEQPART_PMCEQBASE_S 0 +#define GLHMC_CEQPART_PMCEQBASE_M ICE_M(0x3FF, 0) +#define GLHMC_CEQPART_PMCEQSIZE_S 16 +#define GLHMC_CEQPART_PMCEQSIZE_M ICE_M(0x3FF, 16) +#define GLHMC_DBCQMAX 0x005220F0 /* Reset Source: CORER */ +#define GLHMC_DBCQMAX_GLHMC_DBCQMAX_S 0 +#define GLHMC_DBCQMAX_GLHMC_DBCQMAX_M ICE_M(0xFFFFF, 0) +#define GLHMC_DBCQPART(_i) (0x00503180 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_DBCQPART_MAX_INDEX 7 +#define GLHMC_DBCQPART_PMDBCQBASE_S 0 +#define GLHMC_DBCQPART_PMDBCQBASE_M ICE_M(0x3FFF, 0) +#define GLHMC_DBCQPART_PMDBCQSIZE_S 16 +#define GLHMC_DBCQPART_PMDBCQSIZE_M ICE_M(0x7FFF, 16) +#define 
GLHMC_DBQPMAX 0x005220EC /* Reset Source: CORER */ +#define GLHMC_DBQPMAX_GLHMC_DBQPMAX_S 0 +#define GLHMC_DBQPMAX_GLHMC_DBQPMAX_M ICE_M(0x7FFFF, 0) +#define GLHMC_DBQPPART(_i) (0x005044C0 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_DBQPPART_MAX_INDEX 7 +#define GLHMC_DBQPPART_PMDBQPBASE_S 0 +#define GLHMC_DBQPPART_PMDBQPBASE_M ICE_M(0x3FFF, 0) +#define GLHMC_DBQPPART_PMDBQPSIZE_S 16 +#define GLHMC_DBQPPART_PMDBQPSIZE_M ICE_M(0x7FFF, 16) +#define GLHMC_FSIAVBASE(_i) (0x00525600 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_FSIAVBASE_MAX_INDEX 7 +#define GLHMC_FSIAVBASE_FPMFSIAVBASE_S 0 +#define GLHMC_FSIAVBASE_FPMFSIAVBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_FSIAVCNT(_i) (0x00525700 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_FSIAVCNT_MAX_INDEX 7 +#define GLHMC_FSIAVCNT_FPMFSIAVCNT_S 0 +#define GLHMC_FSIAVCNT_FPMFSIAVCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_FSIAVMAX 0x00522068 /* Reset Source: CORER */ +#define GLHMC_FSIAVMAX_PMFSIAVMAX_S 0 +#define GLHMC_FSIAVMAX_PMFSIAVMAX_M ICE_M(0x3FFFF, 0) +#define GLHMC_FSIAVOBJSZ 0x00522064 /* Reset Source: CORER */ +#define GLHMC_FSIAVOBJSZ_PMFSIAVOBJSZ_S 0 +#define GLHMC_FSIAVOBJSZ_PMFSIAVOBJSZ_M ICE_M(0xF, 0) +#define GLHMC_FSIMCBASE(_i) (0x00526000 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_FSIMCBASE_MAX_INDEX 7 +#define GLHMC_FSIMCBASE_FPMFSIMCBASE_S 0 +#define GLHMC_FSIMCBASE_FPMFSIMCBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_FSIMCCNT(_i) (0x00526100 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_FSIMCCNT_MAX_INDEX 7 +#define GLHMC_FSIMCCNT_FPMFSIMCSZ_S 0 +#define GLHMC_FSIMCCNT_FPMFSIMCSZ_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_FSIMCMAX 0x00522060 /* Reset Source: CORER */ +#define GLHMC_FSIMCMAX_PMFSIMCMAX_S 0 +#define GLHMC_FSIMCMAX_PMFSIMCMAX_M ICE_M(0x3FFF, 0) +#define GLHMC_FSIMCOBJSZ 0x0052205C /* Reset Source: CORER */ +#define GLHMC_FSIMCOBJSZ_PMFSIMCOBJSZ_S 0 +#define GLHMC_FSIMCOBJSZ_PMFSIMCOBJSZ_M ICE_M(0xF, 0) +#define GLHMC_FWPDINV 0x0052207C /* Reset Source: CORER */ +#define GLHMC_FWPDINV_PMSDIDX_S 0 +#define GLHMC_FWPDINV_PMSDIDX_M ICE_M(0xFFF, 0) +#define GLHMC_FWPDINV_PMSDPARTSEL_S 15 +#define GLHMC_FWPDINV_PMSDPARTSEL_M BIT(15) +#define GLHMC_FWPDINV_PMPDIDX_S 16 +#define GLHMC_FWPDINV_PMPDIDX_M ICE_M(0x1FF, 16) +#define GLHMC_FWPDINV_FPMAT 0x0010207C /* Reset Source: CORER */ +#define GLHMC_FWPDINV_FPMAT_PMSDIDX_S 0 +#define GLHMC_FWPDINV_FPMAT_PMSDIDX_M ICE_M(0xFFF, 0) +#define GLHMC_FWPDINV_FPMAT_PMSDPARTSEL_S 15 +#define GLHMC_FWPDINV_FPMAT_PMSDPARTSEL_M BIT(15) +#define GLHMC_FWPDINV_FPMAT_PMPDIDX_S 16 +#define GLHMC_FWPDINV_FPMAT_PMPDIDX_M ICE_M(0x1FF, 16) +#define GLHMC_FWSDDATAHIGH 0x00522078 /* Reset Source: CORER */ +#define GLHMC_FWSDDATAHIGH_PMSDDATAHIGH_S 0 +#define GLHMC_FWSDDATAHIGH_PMSDDATAHIGH_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_FWSDDATAHIGH_FPMAT 0x00102078 /* Reset Source: CORER */ +#define GLHMC_FWSDDATAHIGH_FPMAT_PMSDDATAHIGH_S 0 +#define GLHMC_FWSDDATAHIGH_FPMAT_PMSDDATAHIGH_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_FWSDDATALOW 0x00522074 /* Reset Source: CORER */ +#define GLHMC_FWSDDATALOW_PMSDVALID_S 0 +#define GLHMC_FWSDDATALOW_PMSDVALID_M BIT(0) +#define GLHMC_FWSDDATALOW_PMSDTYPE_S 1 +#define GLHMC_FWSDDATALOW_PMSDTYPE_M BIT(1) +#define GLHMC_FWSDDATALOW_PMSDBPCOUNT_S 2 +#define GLHMC_FWSDDATALOW_PMSDBPCOUNT_M ICE_M(0x3FF, 2) +#define GLHMC_FWSDDATALOW_PMSDDATALOW_S 12 +#define GLHMC_FWSDDATALOW_PMSDDATALOW_M ICE_M(0xFFFFF, 12) +#define GLHMC_FWSDDATALOW_FPMAT 0x00102074 /* 
Reset Source: CORER */ +#define GLHMC_FWSDDATALOW_FPMAT_PMSDVALID_S 0 +#define GLHMC_FWSDDATALOW_FPMAT_PMSDVALID_M BIT(0) +#define GLHMC_FWSDDATALOW_FPMAT_PMSDTYPE_S 1 +#define GLHMC_FWSDDATALOW_FPMAT_PMSDTYPE_M BIT(1) +#define GLHMC_FWSDDATALOW_FPMAT_PMSDBPCOUNT_S 2 +#define GLHMC_FWSDDATALOW_FPMAT_PMSDBPCOUNT_M ICE_M(0x3FF, 2) +#define GLHMC_FWSDDATALOW_FPMAT_PMSDDATALOW_S 12 +#define GLHMC_FWSDDATALOW_FPMAT_PMSDDATALOW_M ICE_M(0xFFFFF, 12) +#define GLHMC_PEARPBASE(_i) (0x00524800 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEARPBASE_MAX_INDEX 7 +#define GLHMC_PEARPBASE_FPMPEARPBASE_S 0 +#define GLHMC_PEARPBASE_FPMPEARPBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_PEARPCNT(_i) (0x00524900 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEARPCNT_MAX_INDEX 7 +#define GLHMC_PEARPCNT_FPMPEARPCNT_S 0 +#define GLHMC_PEARPCNT_FPMPEARPCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_PEARPMAX 0x00522038 /* Reset Source: CORER */ +#define GLHMC_PEARPMAX_PMPEARPMAX_S 0 +#define GLHMC_PEARPMAX_PMPEARPMAX_M ICE_M(0x1FFFF, 0) +#define GLHMC_PEARPOBJSZ 0x00522034 /* Reset Source: CORER */ +#define GLHMC_PEARPOBJSZ_PMPEARPOBJSZ_S 0 +#define GLHMC_PEARPOBJSZ_PMPEARPOBJSZ_M ICE_M(0x7, 0) +#define GLHMC_PECQBASE(_i) (0x00524200 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PECQBASE_MAX_INDEX 7 +#define GLHMC_PECQBASE_FPMPECQBASE_S 0 +#define GLHMC_PECQBASE_FPMPECQBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_PECQCNT(_i) (0x00524300 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PECQCNT_MAX_INDEX 7 +#define GLHMC_PECQCNT_FPMPECQCNT_S 0 +#define GLHMC_PECQCNT_FPMPECQCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_PECQOBJSZ 0x00522020 /* Reset Source: CORER */ +#define GLHMC_PECQOBJSZ_PMPECQOBJSZ_S 0 +#define GLHMC_PECQOBJSZ_PMPECQOBJSZ_M ICE_M(0xF, 0) +#define GLHMC_PEHDRBASE(_i) (0x00526200 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEHDRBASE_MAX_INDEX 7 +#define GLHMC_PEHDRBASE_GLHMC_PEHDRBASE_S 0 +#define GLHMC_PEHDRBASE_GLHMC_PEHDRBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_PEHDRCNT(_i) (0x00526300 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEHDRCNT_MAX_INDEX 7 +#define GLHMC_PEHDRCNT_GLHMC_PEHDRCNT_S 0 +#define GLHMC_PEHDRCNT_GLHMC_PEHDRCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_PEHDRMAX 0x00522008 /* Reset Source: CORER */ +#define GLHMC_PEHDRMAX_PMPEHDRMAX_S 0 +#define GLHMC_PEHDRMAX_PMPEHDRMAX_M ICE_M(0x7FFFF, 0) +#define GLHMC_PEHDRMAX_RSVD_S 19 +#define GLHMC_PEHDRMAX_RSVD_M ICE_M(0x1FFF, 19) +#define GLHMC_PEHDROBJSZ 0x00522004 /* Reset Source: CORER */ +#define GLHMC_PEHDROBJSZ_PMPEHDROBJSZ_S 0 +#define GLHMC_PEHDROBJSZ_PMPEHDROBJSZ_M ICE_M(0xF, 0) +#define GLHMC_PEHDROBJSZ_RSVD_S 4 +#define GLHMC_PEHDROBJSZ_RSVD_M ICE_M(0xFFFFFFF, 4) +#define GLHMC_PEHTCNT(_i) (0x00524700 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEHTCNT_MAX_INDEX 7 +#define GLHMC_PEHTCNT_FPMPEHTCNT_S 0 +#define GLHMC_PEHTCNT_FPMPEHTCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_PEHTCNT_FPMAT(_i) (0x00104700 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEHTCNT_FPMAT_MAX_INDEX 7 +#define GLHMC_PEHTCNT_FPMAT_FPMPEHTCNT_S 0 +#define GLHMC_PEHTCNT_FPMAT_FPMPEHTCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_PEHTEBASE(_i) (0x00524600 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEHTEBASE_MAX_INDEX 7 +#define GLHMC_PEHTEBASE_FPMPEHTEBASE_S 0 +#define GLHMC_PEHTEBASE_FPMPEHTEBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_PEHTEBASE_FPMAT(_i) (0x00104600 + 
((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEHTEBASE_FPMAT_MAX_INDEX 7 +#define GLHMC_PEHTEBASE_FPMAT_FPMPEHTEBASE_S 0 +#define GLHMC_PEHTEBASE_FPMAT_FPMPEHTEBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_PEHTEOBJSZ 0x0052202C /* Reset Source: CORER */ +#define GLHMC_PEHTEOBJSZ_PMPEHTEOBJSZ_S 0 +#define GLHMC_PEHTEOBJSZ_PMPEHTEOBJSZ_M ICE_M(0xF, 0) +#define GLHMC_PEHTEOBJSZ_FPMAT 0x0010202C /* Reset Source: CORER */ +#define GLHMC_PEHTEOBJSZ_FPMAT_PMPEHTEOBJSZ_S 0 +#define GLHMC_PEHTEOBJSZ_FPMAT_PMPEHTEOBJSZ_M ICE_M(0xF, 0) +#define GLHMC_PEHTMAX 0x00522030 /* Reset Source: CORER */ +#define GLHMC_PEHTMAX_PMPEHTMAX_S 0 +#define GLHMC_PEHTMAX_PMPEHTMAX_M ICE_M(0x1FFFFF, 0) +#define GLHMC_PEHTMAX_FPMAT 0x00102030 /* Reset Source: CORER */ +#define GLHMC_PEHTMAX_FPMAT_PMPEHTMAX_S 0 +#define GLHMC_PEHTMAX_FPMAT_PMPEHTMAX_M ICE_M(0x1FFFFF, 0) +#define GLHMC_PEMDBASE(_i) (0x00526400 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEMDBASE_MAX_INDEX 7 +#define GLHMC_PEMDBASE_GLHMC_PEMDBASE_S 0 +#define GLHMC_PEMDBASE_GLHMC_PEMDBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_PEMDCNT(_i) (0x00526500 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEMDCNT_MAX_INDEX 7 +#define GLHMC_PEMDCNT_GLHMC_PEMDCNT_S 0 +#define GLHMC_PEMDCNT_GLHMC_PEMDCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_PEMDMAX 0x00522010 /* Reset Source: CORER */ +#define GLHMC_PEMDMAX_PMPEMDMAX_S 0 +#define GLHMC_PEMDMAX_PMPEMDMAX_M ICE_M(0xFFFFFF, 0) +#define GLHMC_PEMDMAX_RSVD_S 24 +#define GLHMC_PEMDMAX_RSVD_M ICE_M(0xFF, 24) +#define GLHMC_PEMDOBJSZ 0x0052200C /* Reset Source: CORER */ +#define GLHMC_PEMDOBJSZ_PMPEMDOBJSZ_S 0 +#define GLHMC_PEMDOBJSZ_PMPEMDOBJSZ_M ICE_M(0xF, 0) +#define GLHMC_PEMDOBJSZ_RSVD_S 4 +#define GLHMC_PEMDOBJSZ_RSVD_M ICE_M(0xFFFFFFF, 4) +#define GLHMC_PEMRBASE(_i) (0x00524C00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEMRBASE_MAX_INDEX 7 +#define GLHMC_PEMRBASE_FPMPEMRBASE_S 0 +#define GLHMC_PEMRBASE_FPMPEMRBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_PEMRCNT(_i) (0x00524D00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEMRCNT_MAX_INDEX 7 +#define GLHMC_PEMRCNT_FPMPEMRSZ_S 0 +#define GLHMC_PEMRCNT_FPMPEMRSZ_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_PEMRMAX 0x00522040 /* Reset Source: CORER */ +#define GLHMC_PEMRMAX_PMPEMRMAX_S 0 +#define GLHMC_PEMRMAX_PMPEMRMAX_M ICE_M(0x7FFFFF, 0) +#define GLHMC_PEMROBJSZ 0x0052203C /* Reset Source: CORER */ +#define GLHMC_PEMROBJSZ_PMPEMROBJSZ_S 0 +#define GLHMC_PEMROBJSZ_PMPEMROBJSZ_M ICE_M(0xF, 0) +#define GLHMC_PEOOISCBASE(_i) (0x00526600 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEOOISCBASE_MAX_INDEX 7 +#define GLHMC_PEOOISCBASE_GLHMC_PEOOISCBASE_S 0 +#define GLHMC_PEOOISCBASE_GLHMC_PEOOISCBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_PEOOISCCNT(_i) (0x00526700 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEOOISCCNT_MAX_INDEX 7 +#define GLHMC_PEOOISCCNT_GLHMC_PEOOISCCNT_S 0 +#define GLHMC_PEOOISCCNT_GLHMC_PEOOISCCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_PEOOISCFFLBASE(_i) (0x00526C00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEOOISCFFLBASE_MAX_INDEX 7 +#define GLHMC_PEOOISCFFLBASE_GLHMC_PEOOISCFFLBASE_S 0 +#define GLHMC_PEOOISCFFLBASE_GLHMC_PEOOISCFFLBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_PEOOISCFFLCNT_PMAT(_i) (0x00526D00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEOOISCFFLCNT_PMAT_MAX_INDEX 7 +#define GLHMC_PEOOISCFFLCNT_PMAT_FPMPEOOISCFLCNT_S 0 +#define 
GLHMC_PEOOISCFFLCNT_PMAT_FPMPEOOISCFLCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_PEOOISCFFLMAX 0x005220A4 /* Reset Source: CORER */ +#define GLHMC_PEOOISCFFLMAX_PMPEOOISCFFLMAX_S 0 +#define GLHMC_PEOOISCFFLMAX_PMPEOOISCFFLMAX_M ICE_M(0x7FFFF, 0) +#define GLHMC_PEOOISCFFLMAX_RSVD_S 19 +#define GLHMC_PEOOISCFFLMAX_RSVD_M ICE_M(0x1FFF, 19) +#define GLHMC_PEOOISCMAX 0x00522018 /* Reset Source: CORER */ +#define GLHMC_PEOOISCMAX_PMPEOOISCMAX_S 0 +#define GLHMC_PEOOISCMAX_PMPEOOISCMAX_M ICE_M(0x7FFFF, 0) +#define GLHMC_PEOOISCMAX_RSVD_S 19 +#define GLHMC_PEOOISCMAX_RSVD_M ICE_M(0x1FFF, 19) +#define GLHMC_PEOOISCOBJSZ 0x00522014 /* Reset Source: CORER */ +#define GLHMC_PEOOISCOBJSZ_PMPEOOISCOBJSZ_S 0 +#define GLHMC_PEOOISCOBJSZ_PMPEOOISCOBJSZ_M ICE_M(0xF, 0) +#define GLHMC_PEOOISCOBJSZ_RSVD_S 4 +#define GLHMC_PEOOISCOBJSZ_RSVD_M ICE_M(0xFFFFFFF, 4) +#define GLHMC_PEPBLBASE(_i) (0x00525800 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEPBLBASE_MAX_INDEX 7 +#define GLHMC_PEPBLBASE_FPMPEPBLBASE_S 0 +#define GLHMC_PEPBLBASE_FPMPEPBLBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_PEPBLCNT(_i) (0x00525900 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEPBLCNT_MAX_INDEX 7 +#define GLHMC_PEPBLCNT_FPMPEPBLCNT_S 0 +#define GLHMC_PEPBLCNT_FPMPEPBLCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_PEPBLMAX 0x0052206C /* Reset Source: CORER */ +#define GLHMC_PEPBLMAX_PMPEPBLMAX_S 0 +#define GLHMC_PEPBLMAX_PMPEPBLMAX_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_PEQ1BASE(_i) (0x00525200 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEQ1BASE_MAX_INDEX 7 +#define GLHMC_PEQ1BASE_FPMPEQ1BASE_S 0 +#define GLHMC_PEQ1BASE_FPMPEQ1BASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_PEQ1CNT(_i) (0x00525300 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEQ1CNT_MAX_INDEX 7 +#define GLHMC_PEQ1CNT_FPMPEQ1CNT_S 0 +#define GLHMC_PEQ1CNT_FPMPEQ1CNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_PEQ1FLBASE(_i) (0x00525400 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEQ1FLBASE_MAX_INDEX 7 +#define GLHMC_PEQ1FLBASE_FPMPEQ1FLBASE_S 0 +#define GLHMC_PEQ1FLBASE_FPMPEQ1FLBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_PEQ1FLMAX 0x00522058 /* Reset Source: CORER */ +#define GLHMC_PEQ1FLMAX_PMPEQ1FLMAX_S 0 +#define GLHMC_PEQ1FLMAX_PMPEQ1FLMAX_M ICE_M(0x3FFFFFF, 0) +#define GLHMC_PEQ1MAX 0x00522054 /* Reset Source: CORER */ +#define GLHMC_PEQ1MAX_PMPEQ1MAX_S 0 +#define GLHMC_PEQ1MAX_PMPEQ1MAX_M ICE_M(0xFFFFFFF, 0) +#define GLHMC_PEQ1OBJSZ 0x00522050 /* Reset Source: CORER */ +#define GLHMC_PEQ1OBJSZ_PMPEQ1OBJSZ_S 0 +#define GLHMC_PEQ1OBJSZ_PMPEQ1OBJSZ_M ICE_M(0xF, 0) +#define GLHMC_PEQPBASE(_i) (0x00524000 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEQPBASE_MAX_INDEX 7 +#define GLHMC_PEQPBASE_FPMPEQPBASE_S 0 +#define GLHMC_PEQPBASE_FPMPEQPBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_PEQPCNT(_i) (0x00524100 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEQPCNT_MAX_INDEX 7 +#define GLHMC_PEQPCNT_FPMPEQPCNT_S 0 +#define GLHMC_PEQPCNT_FPMPEQPCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_PEQPOBJSZ 0x0052201C /* Reset Source: CORER */ +#define GLHMC_PEQPOBJSZ_PMPEQPOBJSZ_S 0 +#define GLHMC_PEQPOBJSZ_PMPEQPOBJSZ_M ICE_M(0xF, 0) +#define GLHMC_PERRFBASE(_i) (0x00526800 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PERRFBASE_MAX_INDEX 7 +#define GLHMC_PERRFBASE_GLHMC_PERRFBASE_S 0 +#define GLHMC_PERRFBASE_GLHMC_PERRFBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_PERRFCNT(_i) (0x00526900 + ((_i) * 4)) /* _i=0...7 */ 
/* Reset Source: CORER */ +#define GLHMC_PERRFCNT_MAX_INDEX 7 +#define GLHMC_PERRFCNT_GLHMC_PERRFCNT_S 0 +#define GLHMC_PERRFCNT_GLHMC_PERRFCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_PERRFFLBASE(_i) (0x00526A00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PERRFFLBASE_MAX_INDEX 7 +#define GLHMC_PERRFFLBASE_GLHMC_PERRFFLBASE_S 0 +#define GLHMC_PERRFFLBASE_GLHMC_PERRFFLBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_PERRFFLCNT_PMAT(_i) (0x00526B00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PERRFFLCNT_PMAT_MAX_INDEX 7 +#define GLHMC_PERRFFLCNT_PMAT_FPMPERRFFLCNT_S 0 +#define GLHMC_PERRFFLCNT_PMAT_FPMPERRFFLCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_PERRFFLMAX 0x005220A0 /* Reset Source: CORER */ +#define GLHMC_PERRFFLMAX_PMPERRFFLMAX_S 0 +#define GLHMC_PERRFFLMAX_PMPERRFFLMAX_M ICE_M(0x3FFFFFF, 0) +#define GLHMC_PERRFFLMAX_RSVD_S 26 +#define GLHMC_PERRFFLMAX_RSVD_M ICE_M(0x3F, 26) +#define GLHMC_PERRFMAX 0x0052209C /* Reset Source: CORER */ +#define GLHMC_PERRFMAX_PMPERRFMAX_S 0 +#define GLHMC_PERRFMAX_PMPERRFMAX_M ICE_M(0xFFFFFFF, 0) +#define GLHMC_PERRFMAX_RSVD_S 28 +#define GLHMC_PERRFMAX_RSVD_M ICE_M(0xF, 28) +#define GLHMC_PERRFOBJSZ 0x00522098 /* Reset Source: CORER */ +#define GLHMC_PERRFOBJSZ_PMPERRFOBJSZ_S 0 +#define GLHMC_PERRFOBJSZ_PMPERRFOBJSZ_M ICE_M(0xF, 0) +#define GLHMC_PERRFOBJSZ_RSVD_S 4 +#define GLHMC_PERRFOBJSZ_RSVD_M ICE_M(0xFFFFFFF, 4) +#define GLHMC_PETIMERBASE(_i) (0x00525A00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PETIMERBASE_MAX_INDEX 7 +#define GLHMC_PETIMERBASE_FPMPETIMERBASE_S 0 +#define GLHMC_PETIMERBASE_FPMPETIMERBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_PETIMERCNT(_i) (0x00525B00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PETIMERCNT_MAX_INDEX 7 +#define GLHMC_PETIMERCNT_FPMPETIMERCNT_S 0 +#define GLHMC_PETIMERCNT_FPMPETIMERCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_PETIMERMAX 0x00522084 /* Reset Source: CORER */ +#define GLHMC_PETIMERMAX_PMPETIMERMAX_S 0 +#define GLHMC_PETIMERMAX_PMPETIMERMAX_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_PETIMEROBJSZ 0x00522080 /* Reset Source: CORER */ +#define GLHMC_PETIMEROBJSZ_PMPETIMEROBJSZ_S 0 +#define GLHMC_PETIMEROBJSZ_PMPETIMEROBJSZ_M ICE_M(0xF, 0) +#define GLHMC_PEXFBASE(_i) (0x00524E00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEXFBASE_MAX_INDEX 7 +#define GLHMC_PEXFBASE_FPMPEXFBASE_S 0 +#define GLHMC_PEXFBASE_FPMPEXFBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_PEXFCNT(_i) (0x00524F00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEXFCNT_MAX_INDEX 7 +#define GLHMC_PEXFCNT_FPMPEXFCNT_S 0 +#define GLHMC_PEXFCNT_FPMPEXFCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_PEXFFLBASE(_i) (0x00525000 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEXFFLBASE_MAX_INDEX 7 +#define GLHMC_PEXFFLBASE_FPMPEXFFLBASE_S 0 +#define GLHMC_PEXFFLBASE_FPMPEXFFLBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_PEXFFLMAX 0x0052204C /* Reset Source: CORER */ +#define GLHMC_PEXFFLMAX_PMPEXFFLMAX_S 0 +#define GLHMC_PEXFFLMAX_PMPEXFFLMAX_M ICE_M(0xFFFFFFF, 0) +#define GLHMC_PEXFMAX 0x00522048 /* Reset Source: CORER */ +#define GLHMC_PEXFMAX_PMPEXFMAX_S 0 +#define GLHMC_PEXFMAX_PMPEXFMAX_M ICE_M(0xFFFFFFF, 0) +#define GLHMC_PEXFOBJSZ 0x00522044 /* Reset Source: CORER */ +#define GLHMC_PEXFOBJSZ_PMPEXFOBJSZ_S 0 +#define GLHMC_PEXFOBJSZ_PMPEXFOBJSZ_M ICE_M(0xF, 0) +#define GLHMC_PFPESDPART(_i) (0x00520880 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PFPESDPART_MAX_INDEX 7 +#define 
GLHMC_PFPESDPART_PMSDBASE_S 0 +#define GLHMC_PFPESDPART_PMSDBASE_M ICE_M(0xFFF, 0) +#define GLHMC_PFPESDPART_PMSDSIZE_S 16 +#define GLHMC_PFPESDPART_PMSDSIZE_M ICE_M(0x1FFF, 16) +#define GLHMC_PFPESDPART_FPMAT(_i) (0x00100880 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PFPESDPART_FPMAT_MAX_INDEX 7 +#define GLHMC_PFPESDPART_FPMAT_PMSDBASE_S 0 +#define GLHMC_PFPESDPART_FPMAT_PMSDBASE_M ICE_M(0xFFF, 0) +#define GLHMC_PFPESDPART_FPMAT_PMSDSIZE_S 16 +#define GLHMC_PFPESDPART_FPMAT_PMSDSIZE_M ICE_M(0x1FFF, 16) +#define GLHMC_SDPART(_i) (0x00520800 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_SDPART_MAX_INDEX 7 +#define GLHMC_SDPART_PMSDBASE_S 0 +#define GLHMC_SDPART_PMSDBASE_M ICE_M(0xFFF, 0) +#define GLHMC_SDPART_PMSDSIZE_S 16 +#define GLHMC_SDPART_PMSDSIZE_M ICE_M(0x1FFF, 16) +#define GLHMC_SDPART_FPMAT(_i) (0x00100800 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_SDPART_FPMAT_MAX_INDEX 7 +#define GLHMC_SDPART_FPMAT_PMSDBASE_S 0 +#define GLHMC_SDPART_FPMAT_PMSDBASE_M ICE_M(0xFFF, 0) +#define GLHMC_SDPART_FPMAT_PMSDSIZE_S 16 +#define GLHMC_SDPART_FPMAT_PMSDSIZE_M ICE_M(0x1FFF, 16) +#define GLHMC_VFAPBVTINUSEBASE(_i) (0x0052CA00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFAPBVTINUSEBASE_MAX_INDEX 31 +#define GLHMC_VFAPBVTINUSEBASE_FPMAPBINUSEBASE_S 0 +#define GLHMC_VFAPBVTINUSEBASE_FPMAPBINUSEBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFCEQPART(_i) (0x00502F00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFCEQPART_MAX_INDEX 31 +#define GLHMC_VFCEQPART_PMCEQBASE_S 0 +#define GLHMC_VFCEQPART_PMCEQBASE_M ICE_M(0x3FF, 0) +#define GLHMC_VFCEQPART_PMCEQSIZE_S 16 +#define GLHMC_VFCEQPART_PMCEQSIZE_M ICE_M(0x3FF, 16) +#define GLHMC_VFDBCQPART(_i) (0x00502E00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFDBCQPART_MAX_INDEX 31 +#define GLHMC_VFDBCQPART_PMDBCQBASE_S 0 +#define GLHMC_VFDBCQPART_PMDBCQBASE_M ICE_M(0x3FFF, 0) +#define GLHMC_VFDBCQPART_PMDBCQSIZE_S 16 +#define GLHMC_VFDBCQPART_PMDBCQSIZE_M ICE_M(0x7FFF, 16) +#define GLHMC_VFDBQPPART(_i) (0x00504520 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFDBQPPART_MAX_INDEX 31 +#define GLHMC_VFDBQPPART_PMDBQPBASE_S 0 +#define GLHMC_VFDBQPPART_PMDBQPBASE_M ICE_M(0x3FFF, 0) +#define GLHMC_VFDBQPPART_PMDBQPSIZE_S 16 +#define GLHMC_VFDBQPPART_PMDBQPSIZE_M ICE_M(0x7FFF, 16) +#define GLHMC_VFFSIAVBASE(_i) (0x0052D600 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFFSIAVBASE_MAX_INDEX 31 +#define GLHMC_VFFSIAVBASE_FPMFSIAVBASE_S 0 +#define GLHMC_VFFSIAVBASE_FPMFSIAVBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFFSIAVCNT(_i) (0x0052D700 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFFSIAVCNT_MAX_INDEX 31 +#define GLHMC_VFFSIAVCNT_FPMFSIAVCNT_S 0 +#define GLHMC_VFFSIAVCNT_FPMFSIAVCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFFSIMCBASE(_i) (0x0052E000 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFFSIMCBASE_MAX_INDEX 31 +#define GLHMC_VFFSIMCBASE_FPMFSIMCBASE_S 0 +#define GLHMC_VFFSIMCBASE_FPMFSIMCBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFFSIMCCNT(_i) (0x0052E100 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFFSIMCCNT_MAX_INDEX 31 +#define GLHMC_VFFSIMCCNT_FPMFSIMCSZ_S 0 +#define GLHMC_VFFSIMCCNT_FPMFSIMCSZ_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPDINV(_i) (0x00528300 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPDINV_MAX_INDEX 31 +#define GLHMC_VFPDINV_PMSDIDX_S 
0 +#define GLHMC_VFPDINV_PMSDIDX_M ICE_M(0xFFF, 0) +#define GLHMC_VFPDINV_PMSDPARTSEL_S 15 +#define GLHMC_VFPDINV_PMSDPARTSEL_M BIT(15) +#define GLHMC_VFPDINV_PMPDIDX_S 16 +#define GLHMC_VFPDINV_PMPDIDX_M ICE_M(0x1FF, 16) +#define GLHMC_VFPDINV_FPMAT(_i) (0x00108300 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPDINV_FPMAT_MAX_INDEX 31 +#define GLHMC_VFPDINV_FPMAT_PMSDIDX_S 0 +#define GLHMC_VFPDINV_FPMAT_PMSDIDX_M ICE_M(0xFFF, 0) +#define GLHMC_VFPDINV_FPMAT_PMSDPARTSEL_S 15 +#define GLHMC_VFPDINV_FPMAT_PMSDPARTSEL_M BIT(15) +#define GLHMC_VFPDINV_FPMAT_PMPDIDX_S 16 +#define GLHMC_VFPDINV_FPMAT_PMPDIDX_M ICE_M(0x1FF, 16) +#define GLHMC_VFPEARPBASE(_i) (0x0052C800 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEARPBASE_MAX_INDEX 31 +#define GLHMC_VFPEARPBASE_FPMPEARPBASE_S 0 +#define GLHMC_VFPEARPBASE_FPMPEARPBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPEARPCNT(_i) (0x0052C900 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEARPCNT_MAX_INDEX 31 +#define GLHMC_VFPEARPCNT_FPMPEARPCNT_S 0 +#define GLHMC_VFPEARPCNT_FPMPEARPCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPECQBASE(_i) (0x0052C200 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPECQBASE_MAX_INDEX 31 +#define GLHMC_VFPECQBASE_FPMPECQBASE_S 0 +#define GLHMC_VFPECQBASE_FPMPECQBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPECQCNT(_i) (0x0052C300 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPECQCNT_MAX_INDEX 31 +#define GLHMC_VFPECQCNT_FPMPECQCNT_S 0 +#define GLHMC_VFPECQCNT_FPMPECQCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPEHDRBASE(_i) (0x0052E200 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEHDRBASE_MAX_INDEX 31 +#define GLHMC_VFPEHDRBASE_GLHMC_PEHDRBASE_S 0 +#define GLHMC_VFPEHDRBASE_GLHMC_PEHDRBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPEHDRCNT(_i) (0x0052E300 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEHDRCNT_MAX_INDEX 31 +#define GLHMC_VFPEHDRCNT_GLHMC_PEHDRCNT_S 0 +#define GLHMC_VFPEHDRCNT_GLHMC_PEHDRCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPEHTCNT(_i) (0x0052C700 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEHTCNT_MAX_INDEX 31 +#define GLHMC_VFPEHTCNT_FPMPEHTCNT_S 0 +#define GLHMC_VFPEHTCNT_FPMPEHTCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPEHTCNT_FPMAT(_i) (0x0010C700 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEHTCNT_FPMAT_MAX_INDEX 31 +#define GLHMC_VFPEHTCNT_FPMAT_FPMPEHTCNT_S 0 +#define GLHMC_VFPEHTCNT_FPMAT_FPMPEHTCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPEHTEBASE(_i) (0x0052C600 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEHTEBASE_MAX_INDEX 31 +#define GLHMC_VFPEHTEBASE_FPMPEHTEBASE_S 0 +#define GLHMC_VFPEHTEBASE_FPMPEHTEBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPEHTEBASE_FPMAT(_i) (0x0010C600 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEHTEBASE_FPMAT_MAX_INDEX 31 +#define GLHMC_VFPEHTEBASE_FPMAT_FPMPEHTEBASE_S 0 +#define GLHMC_VFPEHTEBASE_FPMAT_FPMPEHTEBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPEMDBASE(_i) (0x0052E400 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEMDBASE_MAX_INDEX 31 +#define GLHMC_VFPEMDBASE_GLHMC_PEMDBASE_S 0 +#define GLHMC_VFPEMDBASE_GLHMC_PEMDBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPEMDCNT(_i) (0x0052E500 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEMDCNT_MAX_INDEX 31 +#define GLHMC_VFPEMDCNT_GLHMC_PEMDCNT_S 0 +#define 
GLHMC_VFPEMDCNT_GLHMC_PEMDCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPEMRBASE(_i) (0x0052CC00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEMRBASE_MAX_INDEX 31 +#define GLHMC_VFPEMRBASE_FPMPEMRBASE_S 0 +#define GLHMC_VFPEMRBASE_FPMPEMRBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPEMRCNT(_i) (0x0052CD00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEMRCNT_MAX_INDEX 31 +#define GLHMC_VFPEMRCNT_FPMPEMRSZ_S 0 +#define GLHMC_VFPEMRCNT_FPMPEMRSZ_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPEOOISCBASE(_i) (0x0052E600 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEOOISCBASE_MAX_INDEX 31 +#define GLHMC_VFPEOOISCBASE_GLHMC_PEOOISCBASE_S 0 +#define GLHMC_VFPEOOISCBASE_GLHMC_PEOOISCBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPEOOISCCNT(_i) (0x0052E700 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEOOISCCNT_MAX_INDEX 31 +#define GLHMC_VFPEOOISCCNT_GLHMC_PEOOISCCNT_S 0 +#define GLHMC_VFPEOOISCCNT_GLHMC_PEOOISCCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPEOOISCFFLBASE(_i) (0x0052EC00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEOOISCFFLBASE_MAX_INDEX 31 +#define GLHMC_VFPEOOISCFFLBASE_GLHMC_PEOOISCFFLBASE_S 0 +#define GLHMC_VFPEOOISCFFLBASE_GLHMC_PEOOISCFFLBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPEPBLBASE(_i) (0x0052D800 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEPBLBASE_MAX_INDEX 31 +#define GLHMC_VFPEPBLBASE_FPMPEPBLBASE_S 0 +#define GLHMC_VFPEPBLBASE_FPMPEPBLBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPEPBLCNT(_i) (0x0052D900 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEPBLCNT_MAX_INDEX 31 +#define GLHMC_VFPEPBLCNT_FPMPEPBLCNT_S 0 +#define GLHMC_VFPEPBLCNT_FPMPEPBLCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPEQ1BASE(_i) (0x0052D200 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEQ1BASE_MAX_INDEX 31 +#define GLHMC_VFPEQ1BASE_FPMPEQ1BASE_S 0 +#define GLHMC_VFPEQ1BASE_FPMPEQ1BASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPEQ1CNT(_i) (0x0052D300 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEQ1CNT_MAX_INDEX 31 +#define GLHMC_VFPEQ1CNT_FPMPEQ1CNT_S 0 +#define GLHMC_VFPEQ1CNT_FPMPEQ1CNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPEQ1FLBASE(_i) (0x0052D400 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEQ1FLBASE_MAX_INDEX 31 +#define GLHMC_VFPEQ1FLBASE_FPMPEQ1FLBASE_S 0 +#define GLHMC_VFPEQ1FLBASE_FPMPEQ1FLBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPEQPBASE(_i) (0x0052C000 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEQPBASE_MAX_INDEX 31 +#define GLHMC_VFPEQPBASE_FPMPEQPBASE_S 0 +#define GLHMC_VFPEQPBASE_FPMPEQPBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPEQPCNT(_i) (0x0052C100 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEQPCNT_MAX_INDEX 31 +#define GLHMC_VFPEQPCNT_FPMPEQPCNT_S 0 +#define GLHMC_VFPEQPCNT_FPMPEQPCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPERRFBASE(_i) (0x0052E800 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPERRFBASE_MAX_INDEX 31 +#define GLHMC_VFPERRFBASE_GLHMC_PERRFBASE_S 0 +#define GLHMC_VFPERRFBASE_GLHMC_PERRFBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPERRFCNT(_i) (0x0052E900 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPERRFCNT_MAX_INDEX 31 +#define GLHMC_VFPERRFCNT_GLHMC_PERRFCNT_S 0 +#define GLHMC_VFPERRFCNT_GLHMC_PERRFCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPERRFFLBASE(_i) (0x0052EA00 + 
((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPERRFFLBASE_MAX_INDEX 31 +#define GLHMC_VFPERRFFLBASE_GLHMC_PERRFFLBASE_S 0 +#define GLHMC_VFPERRFFLBASE_GLHMC_PERRFFLBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPETIMERBASE(_i) (0x0052DA00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPETIMERBASE_MAX_INDEX 31 +#define GLHMC_VFPETIMERBASE_FPMPETIMERBASE_S 0 +#define GLHMC_VFPETIMERBASE_FPMPETIMERBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPETIMERCNT(_i) (0x0052DB00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPETIMERCNT_MAX_INDEX 31 +#define GLHMC_VFPETIMERCNT_FPMPETIMERCNT_S 0 +#define GLHMC_VFPETIMERCNT_FPMPETIMERCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPEXFBASE(_i) (0x0052CE00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEXFBASE_MAX_INDEX 31 +#define GLHMC_VFPEXFBASE_FPMPEXFBASE_S 0 +#define GLHMC_VFPEXFBASE_FPMPEXFBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPEXFCNT(_i) (0x0052CF00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEXFCNT_MAX_INDEX 31 +#define GLHMC_VFPEXFCNT_FPMPEXFCNT_S 0 +#define GLHMC_VFPEXFCNT_FPMPEXFCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPEXFFLBASE(_i) (0x0052D000 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEXFFLBASE_MAX_INDEX 31 +#define GLHMC_VFPEXFFLBASE_FPMPEXFFLBASE_S 0 +#define GLHMC_VFPEXFFLBASE_FPMPEXFFLBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFSDDATAHIGH(_i) (0x00528200 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFSDDATAHIGH_MAX_INDEX 31 +#define GLHMC_VFSDDATAHIGH_PMSDDATAHIGH_S 0 +#define GLHMC_VFSDDATAHIGH_PMSDDATAHIGH_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFSDDATAHIGH_FPMAT(_i) (0x00108200 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFSDDATAHIGH_FPMAT_MAX_INDEX 31 +#define GLHMC_VFSDDATAHIGH_FPMAT_PMSDDATAHIGH_S 0 +#define GLHMC_VFSDDATAHIGH_FPMAT_PMSDDATAHIGH_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFSDDATALOW(_i) (0x00528100 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFSDDATALOW_MAX_INDEX 31 +#define GLHMC_VFSDDATALOW_PMSDVALID_S 0 +#define GLHMC_VFSDDATALOW_PMSDVALID_M BIT(0) +#define GLHMC_VFSDDATALOW_PMSDTYPE_S 1 +#define GLHMC_VFSDDATALOW_PMSDTYPE_M BIT(1) +#define GLHMC_VFSDDATALOW_PMSDBPCOUNT_S 2 +#define GLHMC_VFSDDATALOW_PMSDBPCOUNT_M ICE_M(0x3FF, 2) +#define GLHMC_VFSDDATALOW_PMSDDATALOW_S 12 +#define GLHMC_VFSDDATALOW_PMSDDATALOW_M ICE_M(0xFFFFF, 12) +#define GLHMC_VFSDDATALOW_FPMAT(_i) (0x00108100 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFSDDATALOW_FPMAT_MAX_INDEX 31 +#define GLHMC_VFSDDATALOW_FPMAT_PMSDVALID_S 0 +#define GLHMC_VFSDDATALOW_FPMAT_PMSDVALID_M BIT(0) +#define GLHMC_VFSDDATALOW_FPMAT_PMSDTYPE_S 1 +#define GLHMC_VFSDDATALOW_FPMAT_PMSDTYPE_M BIT(1) +#define GLHMC_VFSDDATALOW_FPMAT_PMSDBPCOUNT_S 2 +#define GLHMC_VFSDDATALOW_FPMAT_PMSDBPCOUNT_M ICE_M(0x3FF, 2) +#define GLHMC_VFSDDATALOW_FPMAT_PMSDDATALOW_S 12 +#define GLHMC_VFSDDATALOW_FPMAT_PMSDDATALOW_M ICE_M(0xFFFFF, 12) +#define GLHMC_VFSDPART(_i) (0x00528800 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFSDPART_MAX_INDEX 31 +#define GLHMC_VFSDPART_PMSDBASE_S 0 +#define GLHMC_VFSDPART_PMSDBASE_M ICE_M(0xFFF, 0) +#define GLHMC_VFSDPART_PMSDSIZE_S 16 +#define GLHMC_VFSDPART_PMSDSIZE_M ICE_M(0x1FFF, 16) +#define GLHMC_VFSDPART_FPMAT(_i) (0x00108800 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFSDPART_FPMAT_MAX_INDEX 31 +#define 
GLHMC_VFSDPART_FPMAT_PMSDBASE_S 0 +#define GLHMC_VFSDPART_FPMAT_PMSDBASE_M ICE_M(0xFFF, 0) +#define GLHMC_VFSDPART_FPMAT_PMSDSIZE_S 16 +#define GLHMC_VFSDPART_FPMAT_PMSDSIZE_M ICE_M(0x1FFF, 16) +#define GLMDOC_CACHESIZE 0x0051C06C /* Reset Source: CORER */ +#define GLMDOC_CACHESIZE_WORD_SIZE_S 0 +#define GLMDOC_CACHESIZE_WORD_SIZE_M ICE_M(0xFF, 0) +#define GLMDOC_CACHESIZE_SETS_S 8 +#define GLMDOC_CACHESIZE_SETS_M ICE_M(0xFFF, 8) +#define GLMDOC_CACHESIZE_WAYS_S 20 +#define GLMDOC_CACHESIZE_WAYS_M ICE_M(0xF, 20) +#define GLPBLOC0_CACHESIZE 0x00518074 /* Reset Source: CORER */ +#define GLPBLOC0_CACHESIZE_WORD_SIZE_S 0 +#define GLPBLOC0_CACHESIZE_WORD_SIZE_M ICE_M(0xFF, 0) +#define GLPBLOC0_CACHESIZE_SETS_S 8 +#define GLPBLOC0_CACHESIZE_SETS_M ICE_M(0xFFF, 8) +#define GLPBLOC0_CACHESIZE_WAYS_S 20 +#define GLPBLOC0_CACHESIZE_WAYS_M ICE_M(0xF, 20) +#define GLPBLOC1_CACHESIZE 0x0051A074 /* Reset Source: CORER */ +#define GLPBLOC1_CACHESIZE_WORD_SIZE_S 0 +#define GLPBLOC1_CACHESIZE_WORD_SIZE_M ICE_M(0xFF, 0) +#define GLPBLOC1_CACHESIZE_SETS_S 8 +#define GLPBLOC1_CACHESIZE_SETS_M ICE_M(0xFFF, 8) +#define GLPBLOC1_CACHESIZE_WAYS_S 20 +#define GLPBLOC1_CACHESIZE_WAYS_M ICE_M(0xF, 20) +#define GLPDOC_CACHESIZE 0x00530048 /* Reset Source: CORER */ +#define GLPDOC_CACHESIZE_WORD_SIZE_S 0 +#define GLPDOC_CACHESIZE_WORD_SIZE_M ICE_M(0xFF, 0) +#define GLPDOC_CACHESIZE_SETS_S 8 +#define GLPDOC_CACHESIZE_SETS_M ICE_M(0xFFF, 8) +#define GLPDOC_CACHESIZE_WAYS_S 20 +#define GLPDOC_CACHESIZE_WAYS_M ICE_M(0xF, 20) +#define GLPDOC_CACHESIZE_FPMAT 0x00110088 /* Reset Source: CORER */ +#define GLPDOC_CACHESIZE_FPMAT_WORD_SIZE_S 0 +#define GLPDOC_CACHESIZE_FPMAT_WORD_SIZE_M ICE_M(0xFF, 0) +#define GLPDOC_CACHESIZE_FPMAT_SETS_S 8 +#define GLPDOC_CACHESIZE_FPMAT_SETS_M ICE_M(0xFFF, 8) +#define GLPDOC_CACHESIZE_FPMAT_WAYS_S 20 +#define GLPDOC_CACHESIZE_FPMAT_WAYS_M ICE_M(0xF, 20) +#define GLPEOC0_CACHESIZE 0x005140A8 /* Reset Source: CORER */ +#define GLPEOC0_CACHESIZE_WORD_SIZE_S 0 +#define GLPEOC0_CACHESIZE_WORD_SIZE_M ICE_M(0xFF, 0) +#define GLPEOC0_CACHESIZE_SETS_S 8 +#define GLPEOC0_CACHESIZE_SETS_M ICE_M(0xFFF, 8) +#define GLPEOC0_CACHESIZE_WAYS_S 20 +#define GLPEOC0_CACHESIZE_WAYS_M ICE_M(0xF, 20) +#define GLPEOC1_CACHESIZE 0x005160A8 /* Reset Source: CORER */ +#define GLPEOC1_CACHESIZE_WORD_SIZE_S 0 +#define GLPEOC1_CACHESIZE_WORD_SIZE_M ICE_M(0xFF, 0) +#define GLPEOC1_CACHESIZE_SETS_S 8 +#define GLPEOC1_CACHESIZE_SETS_M ICE_M(0xFFF, 8) +#define GLPEOC1_CACHESIZE_WAYS_S 20 +#define GLPEOC1_CACHESIZE_WAYS_M ICE_M(0xF, 20) +#define PFHMC_ERRORDATA 0x00520500 /* Reset Source: PFR */ +#define PFHMC_ERRORDATA_HMC_ERROR_DATA_S 0 +#define PFHMC_ERRORDATA_HMC_ERROR_DATA_M ICE_M(0x3FFFFFFF, 0) +#define PFHMC_ERRORDATA_FPMAT 0x00100500 /* Reset Source: PFR */ +#define PFHMC_ERRORDATA_FPMAT_HMC_ERROR_DATA_S 0 +#define PFHMC_ERRORDATA_FPMAT_HMC_ERROR_DATA_M ICE_M(0x3FFFFFFF, 0) +#define PFHMC_ERRORINFO 0x00520400 /* Reset Source: PFR */ +#define PFHMC_ERRORINFO_PMF_INDEX_S 0 +#define PFHMC_ERRORINFO_PMF_INDEX_M ICE_M(0x1F, 0) +#define PFHMC_ERRORINFO_PMF_ISVF_S 7 +#define PFHMC_ERRORINFO_PMF_ISVF_M BIT(7) +#define PFHMC_ERRORINFO_HMC_ERROR_TYPE_S 8 +#define PFHMC_ERRORINFO_HMC_ERROR_TYPE_M ICE_M(0xF, 8) +#define PFHMC_ERRORINFO_HMC_OBJECT_TYPE_S 16 +#define PFHMC_ERRORINFO_HMC_OBJECT_TYPE_M ICE_M(0x1F, 16) +#define PFHMC_ERRORINFO_ERROR_DETECTED_S 31 +#define PFHMC_ERRORINFO_ERROR_DETECTED_M BIT(31) +#define PFHMC_ERRORINFO_FPMAT 0x00100400 /* Reset Source: PFR */ +#define PFHMC_ERRORINFO_FPMAT_PMF_INDEX_S 0 +#define 
PFHMC_ERRORINFO_FPMAT_PMF_INDEX_M ICE_M(0x1F, 0) +#define PFHMC_ERRORINFO_FPMAT_PMF_ISVF_S 7 +#define PFHMC_ERRORINFO_FPMAT_PMF_ISVF_M BIT(7) +#define PFHMC_ERRORINFO_FPMAT_HMC_ERROR_TYPE_S 8 +#define PFHMC_ERRORINFO_FPMAT_HMC_ERROR_TYPE_M ICE_M(0xF, 8) +#define PFHMC_ERRORINFO_FPMAT_HMC_OBJECT_TYPE_S 16 +#define PFHMC_ERRORINFO_FPMAT_HMC_OBJECT_TYPE_M ICE_M(0x1F, 16) +#define PFHMC_ERRORINFO_FPMAT_ERROR_DETECTED_S 31 +#define PFHMC_ERRORINFO_FPMAT_ERROR_DETECTED_M BIT(31) +#define PFHMC_PDINV 0x00520300 /* Reset Source: PFR */ +#define PFHMC_PDINV_PMSDIDX_S 0 +#define PFHMC_PDINV_PMSDIDX_M ICE_M(0xFFF, 0) +#define PFHMC_PDINV_PMSDPARTSEL_S 15 +#define PFHMC_PDINV_PMSDPARTSEL_M BIT(15) +#define PFHMC_PDINV_PMPDIDX_S 16 +#define PFHMC_PDINV_PMPDIDX_M ICE_M(0x1FF, 16) +#define PFHMC_PDINV_FPMAT 0x00100300 /* Reset Source: PFR */ +#define PFHMC_PDINV_FPMAT_PMSDIDX_S 0 +#define PFHMC_PDINV_FPMAT_PMSDIDX_M ICE_M(0xFFF, 0) +#define PFHMC_PDINV_FPMAT_PMSDPARTSEL_S 15 +#define PFHMC_PDINV_FPMAT_PMSDPARTSEL_M BIT(15) +#define PFHMC_PDINV_FPMAT_PMPDIDX_S 16 +#define PFHMC_PDINV_FPMAT_PMPDIDX_M ICE_M(0x1FF, 16) +#define PFHMC_SDCMD 0x00520000 /* Reset Source: PFR */ +#define PFHMC_SDCMD_PMSDIDX_S 0 +#define PFHMC_SDCMD_PMSDIDX_M ICE_M(0xFFF, 0) +#define PFHMC_SDCMD_PMSDPARTSEL_S 15 +#define PFHMC_SDCMD_PMSDPARTSEL_M BIT(15) +#define PFHMC_SDCMD_PMSDWR_S 31 +#define PFHMC_SDCMD_PMSDWR_M BIT(31) +#define PFHMC_SDCMD_FPMAT 0x00100000 /* Reset Source: PFR */ +#define PFHMC_SDCMD_FPMAT_PMSDIDX_S 0 +#define PFHMC_SDCMD_FPMAT_PMSDIDX_M ICE_M(0xFFF, 0) +#define PFHMC_SDCMD_FPMAT_PMSDPARTSEL_S 15 +#define PFHMC_SDCMD_FPMAT_PMSDPARTSEL_M BIT(15) +#define PFHMC_SDCMD_FPMAT_PMSDWR_S 31 +#define PFHMC_SDCMD_FPMAT_PMSDWR_M BIT(31) +#define PFHMC_SDDATAHIGH 0x00520200 /* Reset Source: PFR */ +#define PFHMC_SDDATAHIGH_PMSDDATAHIGH_S 0 +#define PFHMC_SDDATAHIGH_PMSDDATAHIGH_M ICE_M(0xFFFFFFFF, 0) +#define PFHMC_SDDATAHIGH_FPMAT 0x00100200 /* Reset Source: PFR */ +#define PFHMC_SDDATAHIGH_FPMAT_PMSDDATAHIGH_S 0 +#define PFHMC_SDDATAHIGH_FPMAT_PMSDDATAHIGH_M ICE_M(0xFFFFFFFF, 0) +#define PFHMC_SDDATALOW 0x00520100 /* Reset Source: PFR */ +#define PFHMC_SDDATALOW_PMSDVALID_S 0 +#define PFHMC_SDDATALOW_PMSDVALID_M BIT(0) +#define PFHMC_SDDATALOW_PMSDTYPE_S 1 +#define PFHMC_SDDATALOW_PMSDTYPE_M BIT(1) +#define PFHMC_SDDATALOW_PMSDBPCOUNT_S 2 +#define PFHMC_SDDATALOW_PMSDBPCOUNT_M ICE_M(0x3FF, 2) +#define PFHMC_SDDATALOW_PMSDDATALOW_S 12 +#define PFHMC_SDDATALOW_PMSDDATALOW_M ICE_M(0xFFFFF, 12) +#define PFHMC_SDDATALOW_FPMAT 0x00100100 /* Reset Source: PFR */ +#define PFHMC_SDDATALOW_FPMAT_PMSDVALID_S 0 +#define PFHMC_SDDATALOW_FPMAT_PMSDVALID_M BIT(0) +#define PFHMC_SDDATALOW_FPMAT_PMSDTYPE_S 1 +#define PFHMC_SDDATALOW_FPMAT_PMSDTYPE_M BIT(1) +#define PFHMC_SDDATALOW_FPMAT_PMSDBPCOUNT_S 2 +#define PFHMC_SDDATALOW_FPMAT_PMSDBPCOUNT_M ICE_M(0x3FF, 2) +#define PFHMC_SDDATALOW_FPMAT_PMSDDATALOW_S 12 +#define PFHMC_SDDATALOW_FPMAT_PMSDDATALOW_M ICE_M(0xFFFFF, 12) +#define GL_DSI_REPC 0x00294208 /* Reset Source: CORER */ +#define GL_DSI_REPC_NO_DESC_CNT_S 0 +#define GL_DSI_REPC_NO_DESC_CNT_M ICE_M(0xFFFF, 0) +#define GL_DSI_REPC_ERROR_CNT_S 16 +#define GL_DSI_REPC_ERROR_CNT_M ICE_M(0xFFFF, 16) +#define GL_MDCK_TDAT_TCLAN 0x000FC0DC /* Reset Source: CORER */ +#define GL_MDCK_TDAT_TCLAN_WRONG_ORDER_FORMAT_DESC_S 0 +#define GL_MDCK_TDAT_TCLAN_WRONG_ORDER_FORMAT_DESC_M BIT(0) +#define GL_MDCK_TDAT_TCLAN_UR_S 1 +#define GL_MDCK_TDAT_TCLAN_UR_M BIT(1) +#define GL_MDCK_TDAT_TCLAN_TAIL_DESC_NOT_DDESC_EOP_NOP_S 2 +#define 
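/*
 * Editor's sketch, not part of this patch: every field in this file is
 * described by an _S (bit shift) macro plus an _M (pre-shifted mask built
 * with ICE_M(mask, shift)) macro.  Decoding the PFHMC_ERRORINFO register
 * defined above would look roughly like this, assuming the rd32() MMIO
 * helper and struct ice_hw from ice_osdep.h / ice_type.h; the function
 * name is a placeholder.
 */
static void ice_sketch_dump_hmc_error(struct ice_hw *hw)
{
	u32 info = rd32(hw, PFHMC_ERRORINFO);

	if (info & PFHMC_ERRORINFO_ERROR_DETECTED_M) {
		/* mask first, then shift the field down to bit 0 */
		u8 err_type = (info & PFHMC_ERRORINFO_HMC_ERROR_TYPE_M) >>
			      PFHMC_ERRORINFO_HMC_ERROR_TYPE_S;
		u8 obj_type = (info & PFHMC_ERRORINFO_HMC_OBJECT_TYPE_M) >>
			      PFHMC_ERRORINFO_HMC_OBJECT_TYPE_S;

		pr_info("HMC error: type %u, object %u, data 0x%08x\n",
			err_type, obj_type, rd32(hw, PFHMC_ERRORDATA));
	}
}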
GL_MDCK_TDAT_TCLAN_TAIL_DESC_NOT_DDESC_EOP_NOP_M BIT(2) +#define GL_MDCK_TDAT_TCLAN_FALSE_SCHEDULING_S 3 +#define GL_MDCK_TDAT_TCLAN_FALSE_SCHEDULING_M BIT(3) +#define GL_MDCK_TDAT_TCLAN_TAIL_VALUE_BIGGER_THAN_RING_LEN_S 4 +#define GL_MDCK_TDAT_TCLAN_TAIL_VALUE_BIGGER_THAN_RING_LEN_M BIT(4) +#define GL_MDCK_TDAT_TCLAN_MORE_THAN_8_DCMDS_IN_PKT_S 5 +#define GL_MDCK_TDAT_TCLAN_MORE_THAN_8_DCMDS_IN_PKT_M BIT(5) +#define GL_MDCK_TDAT_TCLAN_NO_HEAD_UPDATE_IN_QUANTA_S 6 +#define GL_MDCK_TDAT_TCLAN_NO_HEAD_UPDATE_IN_QUANTA_M BIT(6) +#define GL_MDCK_TDAT_TCLAN_PKT_LEN_NOT_LEGAL_S 7 +#define GL_MDCK_TDAT_TCLAN_PKT_LEN_NOT_LEGAL_M BIT(7) +#define GL_MDCK_TDAT_TCLAN_TSO_TLEN_NOT_COHERENT_WITH_SUM_BUFS_S 8 +#define GL_MDCK_TDAT_TCLAN_TSO_TLEN_NOT_COHERENT_WITH_SUM_BUFS_M BIT(8) +#define GL_MDCK_TDAT_TCLAN_TSO_TAIL_REACHED_BEFORE_TLEN_END_S 9 +#define GL_MDCK_TDAT_TCLAN_TSO_TAIL_REACHED_BEFORE_TLEN_END_M BIT(9) +#define GL_MDCK_TDAT_TCLAN_TSO_MORE_THAN_3_HDRS_S 10 +#define GL_MDCK_TDAT_TCLAN_TSO_MORE_THAN_3_HDRS_M BIT(10) +#define GL_MDCK_TDAT_TCLAN_TSO_SUM_BUFFS_LT_SUM_HDRS_S 11 +#define GL_MDCK_TDAT_TCLAN_TSO_SUM_BUFFS_LT_SUM_HDRS_M BIT(11) +#define GL_MDCK_TDAT_TCLAN_TSO_ZERO_MSS_TLEN_HDRS_S 12 +#define GL_MDCK_TDAT_TCLAN_TSO_ZERO_MSS_TLEN_HDRS_M BIT(12) +#define GL_MDCK_TDAT_TCLAN_TSO_CTX_DESC_IPSEC_S 13 +#define GL_MDCK_TDAT_TCLAN_TSO_CTX_DESC_IPSEC_M BIT(13) +#define GL_MDCK_TDAT_TCLAN_SSO_COMS_NOT_WHOLE_PKT_NUM_IN_QUANTA_S 14 +#define GL_MDCK_TDAT_TCLAN_SSO_COMS_NOT_WHOLE_PKT_NUM_IN_QUANTA_M BIT(14) +#define GL_MDCK_TDAT_TCLAN_COMS_QUANTA_BYTES_EXCEED_PKTLEN_X_64_S 15 +#define GL_MDCK_TDAT_TCLAN_COMS_QUANTA_BYTES_EXCEED_PKTLEN_X_64_M BIT(15) +#define GL_MDCK_TDAT_TCLAN_COMS_QUANTA_CMDS_EXCEED_S 16 +#define GL_MDCK_TDAT_TCLAN_COMS_QUANTA_CMDS_EXCEED_M BIT(16) +#define GL_MDCK_TDAT_TCLAN_TSO_COMS_TSO_DESCS_LAST_LSO_QUANTA_S 17 +#define GL_MDCK_TDAT_TCLAN_TSO_COMS_TSO_DESCS_LAST_LSO_QUANTA_M BIT(17) +#define GL_MDCK_TDAT_TCLAN_TSO_COMS_TSO_DESCS_TLEN_S 18 +#define GL_MDCK_TDAT_TCLAN_TSO_COMS_TSO_DESCS_TLEN_M BIT(18) +#define GL_MDCK_TDAT_TCLAN_TSO_COMS_QUANTA_FINISHED_TOO_EARLY_S 19 +#define GL_MDCK_TDAT_TCLAN_TSO_COMS_QUANTA_FINISHED_TOO_EARLY_M BIT(19) +#define GL_MDCK_TDAT_TCLAN_COMS_NUM_PKTS_IN_QUANTA_S 20 +#define GL_MDCK_TDAT_TCLAN_COMS_NUM_PKTS_IN_QUANTA_M BIT(20) +#define GLCORE_CLKCTL_H 0x000B81E8 /* Reset Source: POR */ +#define GLCORE_CLKCTL_H_UPPER_CLK_SRC_H_S 0 +#define GLCORE_CLKCTL_H_UPPER_CLK_SRC_H_M ICE_M(0x3, 0) +#define GLCORE_CLKCTL_H_LOWER_CLK_SRC_H_S 2 +#define GLCORE_CLKCTL_H_LOWER_CLK_SRC_H_M ICE_M(0x3, 2) +#define GLCORE_CLKCTL_H_PSM_CLK_SRC_H_S 4 +#define GLCORE_CLKCTL_H_PSM_CLK_SRC_H_M ICE_M(0x3, 4) +#define GLCORE_CLKCTL_H_RXCTL_CLK_SRC_H_S 6 +#define GLCORE_CLKCTL_H_RXCTL_CLK_SRC_H_M ICE_M(0x3, 6) +#define GLCORE_CLKCTL_H_UANA_CLK_SRC_H_S 8 +#define GLCORE_CLKCTL_H_UANA_CLK_SRC_H_M ICE_M(0x7, 8) +#define GLCORE_CLKCTL_L 0x000B8254 /* Reset Source: POR */ +#define GLCORE_CLKCTL_L_UPPER_CLK_SRC_L_S 0 +#define GLCORE_CLKCTL_L_UPPER_CLK_SRC_L_M ICE_M(0x3, 0) +#define GLCORE_CLKCTL_L_LOWER_CLK_SRC_L_S 2 +#define GLCORE_CLKCTL_L_LOWER_CLK_SRC_L_M ICE_M(0x3, 2) +#define GLCORE_CLKCTL_L_PSM_CLK_SRC_L_S 4 +#define GLCORE_CLKCTL_L_PSM_CLK_SRC_L_M ICE_M(0x3, 4) +#define GLCORE_CLKCTL_L_RXCTL_CLK_SRC_L_S 6 +#define GLCORE_CLKCTL_L_RXCTL_CLK_SRC_L_M ICE_M(0x3, 6) +#define GLCORE_CLKCTL_L_UANA_CLK_SRC_L_S 8 +#define GLCORE_CLKCTL_L_UANA_CLK_SRC_L_M ICE_M(0x7, 8) +#define GLCORE_CLKCTL_M 0x000B8258 /* Reset Source: POR */ +#define GLCORE_CLKCTL_M_UPPER_CLK_SRC_M_S 0 +#define 
GLCORE_CLKCTL_M_UPPER_CLK_SRC_M_M ICE_M(0x3, 0) +#define GLCORE_CLKCTL_M_LOWER_CLK_SRC_M_S 2 +#define GLCORE_CLKCTL_M_LOWER_CLK_SRC_M_M ICE_M(0x3, 2) +#define GLCORE_CLKCTL_M_PSM_CLK_SRC_M_S 4 +#define GLCORE_CLKCTL_M_PSM_CLK_SRC_M_M ICE_M(0x3, 4) +#define GLCORE_CLKCTL_M_RXCTL_CLK_SRC_M_S 6 +#define GLCORE_CLKCTL_M_RXCTL_CLK_SRC_M_M ICE_M(0x3, 6) +#define GLCORE_CLKCTL_M_UANA_CLK_SRC_M_S 8 +#define GLCORE_CLKCTL_M_UANA_CLK_SRC_M_M ICE_M(0x7, 8) +#define GLFOC_CACHESIZE 0x000AA074 /* Reset Source: CORER */ +#define GLFOC_CACHESIZE_WORD_SIZE_S 0 +#define GLFOC_CACHESIZE_WORD_SIZE_M ICE_M(0xFF, 0) +#define GLFOC_CACHESIZE_SETS_S 8 +#define GLFOC_CACHESIZE_SETS_M ICE_M(0xFFF, 8) +#define GLFOC_CACHESIZE_WAYS_S 20 +#define GLFOC_CACHESIZE_WAYS_M ICE_M(0xF, 20) +#define GLMAC_CLKSTAT 0x000B8210 /* Reset Source: POR */ +#define GLMAC_CLKSTAT_P0_CLK_SPEED_S 0 +#define GLMAC_CLKSTAT_P0_CLK_SPEED_M ICE_M(0xF, 0) +#define GLMAC_CLKSTAT_P1_CLK_SPEED_S 4 +#define GLMAC_CLKSTAT_P1_CLK_SPEED_M ICE_M(0xF, 4) +#define GLMAC_CLKSTAT_P2_CLK_SPEED_S 8 +#define GLMAC_CLKSTAT_P2_CLK_SPEED_M ICE_M(0xF, 8) +#define GLMAC_CLKSTAT_P3_CLK_SPEED_S 12 +#define GLMAC_CLKSTAT_P3_CLK_SPEED_M ICE_M(0xF, 12) +#define GLMAC_CLKSTAT_P4_CLK_SPEED_S 16 +#define GLMAC_CLKSTAT_P4_CLK_SPEED_M ICE_M(0xF, 16) +#define GLMAC_CLKSTAT_P5_CLK_SPEED_S 20 +#define GLMAC_CLKSTAT_P5_CLK_SPEED_M ICE_M(0xF, 20) +#define GLMAC_CLKSTAT_P6_CLK_SPEED_S 24 +#define GLMAC_CLKSTAT_P6_CLK_SPEED_M ICE_M(0xF, 24) +#define GLMAC_CLKSTAT_P7_CLK_SPEED_S 28 +#define GLMAC_CLKSTAT_P7_CLK_SPEED_M ICE_M(0xF, 28) +#define GLTPB_100G_MAC_FC_THRESH 0x00099510 /* Reset Source: CORER */ +#define GLTPB_100G_MAC_FC_THRESH_PORT0_FC_THRESH_S 0 +#define GLTPB_100G_MAC_FC_THRESH_PORT0_FC_THRESH_M ICE_M(0xFFFF, 0) +#define GLTPB_100G_MAC_FC_THRESH_PORT1_FC_THRESH_S 16 +#define GLTPB_100G_MAC_FC_THRESH_PORT1_FC_THRESH_M ICE_M(0xFFFF, 16) +#define GLTPB_100G_RPB_FC_THRESH 0x0009963C /* Reset Source: CORER */ +#define GLTPB_100G_RPB_FC_THRESH_PORT0_FC_THRESH_S 0 +#define GLTPB_100G_RPB_FC_THRESH_PORT0_FC_THRESH_M ICE_M(0xFFFF, 0) +#define GLTPB_100G_RPB_FC_THRESH_PORT1_FC_THRESH_S 16 +#define GLTPB_100G_RPB_FC_THRESH_PORT1_FC_THRESH_M ICE_M(0xFFFF, 16) +#define GLTPB_PACING_10G 0x000994E4 /* Reset Source: CORER */ +#define GLTPB_PACING_10G_N_S 0 +#define GLTPB_PACING_10G_N_M ICE_M(0xFF, 0) +#define GLTPB_PACING_10G_K_S 8 +#define GLTPB_PACING_10G_K_M ICE_M(0xFF, 8) +#define GLTPB_PACING_10G_S_S 16 +#define GLTPB_PACING_10G_S_M ICE_M(0x1FF, 16) +#define GLTPB_PACING_25G 0x000994E0 /* Reset Source: CORER */ +#define GLTPB_PACING_25G_N_S 0 +#define GLTPB_PACING_25G_N_M ICE_M(0xFF, 0) +#define GLTPB_PACING_25G_K_S 8 +#define GLTPB_PACING_25G_K_M ICE_M(0xFF, 8) +#define GLTPB_PACING_25G_S_S 16 +#define GLTPB_PACING_25G_S_M ICE_M(0x1FF, 16) +#define GLTPB_PORT_PACING_SPEED 0x000994E8 /* Reset Source: CORER */ +#define GLTPB_PORT_PACING_SPEED_PORT0_SPEED_S 0 +#define GLTPB_PORT_PACING_SPEED_PORT0_SPEED_M BIT(0) +#define GLTPB_PORT_PACING_SPEED_PORT1_SPEED_S 1 +#define GLTPB_PORT_PACING_SPEED_PORT1_SPEED_M BIT(1) +#define GLTPB_PORT_PACING_SPEED_PORT2_SPEED_S 2 +#define GLTPB_PORT_PACING_SPEED_PORT2_SPEED_M BIT(2) +#define GLTPB_PORT_PACING_SPEED_PORT3_SPEED_S 3 +#define GLTPB_PORT_PACING_SPEED_PORT3_SPEED_M BIT(3) +#define GLTPB_PORT_PACING_SPEED_PORT4_SPEED_S 4 +#define GLTPB_PORT_PACING_SPEED_PORT4_SPEED_M BIT(4) +#define GLTPB_PORT_PACING_SPEED_PORT5_SPEED_S 5 +#define GLTPB_PORT_PACING_SPEED_PORT5_SPEED_M BIT(5) +#define GLTPB_PORT_PACING_SPEED_PORT6_SPEED_S 6 +#define 
GLTPB_PORT_PACING_SPEED_PORT6_SPEED_M BIT(6) +#define GLTPB_PORT_PACING_SPEED_PORT7_SPEED_S 7 +#define GLTPB_PORT_PACING_SPEED_PORT7_SPEED_M BIT(7) +#define TPB_CFG_SCHEDULED_BC_THRESHOLD 0x00099494 /* Reset Source: CORER */ +#define TPB_CFG_SCHEDULED_BC_THRESHOLD_THRESHOLD_S 0 +#define TPB_CFG_SCHEDULED_BC_THRESHOLD_THRESHOLD_M ICE_M(0x7FFF, 0) +#define GL_UFUSE_SOC 0x000A400C /* Reset Source: POR */ +#define GL_UFUSE_SOC_PORT_MODE_S 0 +#define GL_UFUSE_SOC_PORT_MODE_M ICE_M(0x3, 0) +#define GL_UFUSE_SOC_BANDWIDTH_S 2 +#define GL_UFUSE_SOC_BANDWIDTH_M ICE_M(0x3, 2) +#define GL_UFUSE_SOC_PE_DISABLE_S 4 +#define GL_UFUSE_SOC_PE_DISABLE_M BIT(4) +#define GL_UFUSE_SOC_SWITCH_MODE_S 5 +#define GL_UFUSE_SOC_SWITCH_MODE_M BIT(5) +#define GL_UFUSE_SOC_CSR_PROTECTION_ENABLE_S 6 +#define GL_UFUSE_SOC_CSR_PROTECTION_ENABLE_M BIT(6) +#define GL_UFUSE_SOC_SERIAL_50G_S 7 +#define GL_UFUSE_SOC_SERIAL_50G_M BIT(7) +#define GL_UFUSE_SOC_NIC_ID_S 8 +#define GL_UFUSE_SOC_NIC_ID_M BIT(8) +#define GL_UFUSE_SOC_BLOCK_BME_TO_FW_S 9 +#define GL_UFUSE_SOC_BLOCK_BME_TO_FW_M BIT(9) +#define GL_UFUSE_SOC_SOC_TYPE_S 10 +#define GL_UFUSE_SOC_SOC_TYPE_M BIT(10) +#define GL_UFUSE_SOC_BTS_MODE_S 11 +#define GL_UFUSE_SOC_BTS_MODE_M BIT(11) +#define GL_UFUSE_SOC_SPARE_FUSES_S 12 +#define GL_UFUSE_SOC_SPARE_FUSES_M ICE_M(0xF, 12) +#define EMPINT_GPIO_ENA 0x000880C0 /* Reset Source: POR */ +#define EMPINT_GPIO_ENA_GPIO0_ENA_S 0 +#define EMPINT_GPIO_ENA_GPIO0_ENA_M BIT(0) +#define EMPINT_GPIO_ENA_GPIO1_ENA_S 1 +#define EMPINT_GPIO_ENA_GPIO1_ENA_M BIT(1) +#define EMPINT_GPIO_ENA_GPIO2_ENA_S 2 +#define EMPINT_GPIO_ENA_GPIO2_ENA_M BIT(2) +#define EMPINT_GPIO_ENA_GPIO3_ENA_S 3 +#define EMPINT_GPIO_ENA_GPIO3_ENA_M BIT(3) +#define EMPINT_GPIO_ENA_GPIO4_ENA_S 4 +#define EMPINT_GPIO_ENA_GPIO4_ENA_M BIT(4) +#define EMPINT_GPIO_ENA_GPIO5_ENA_S 5 +#define EMPINT_GPIO_ENA_GPIO5_ENA_M BIT(5) +#define EMPINT_GPIO_ENA_GPIO6_ENA_S 6 +#define EMPINT_GPIO_ENA_GPIO6_ENA_M BIT(6) +#define GLGEN_MAC_LINK_TOPO 0x000B81DC /* Reset Source: GLOBR */ +#define GLGEN_MAC_LINK_TOPO_LINK_TOPO_S 0 +#define GLGEN_MAC_LINK_TOPO_LINK_TOPO_M ICE_M(0x3, 0) +#define GLINT_CEQCTL(_INT) (0x0015C000 + ((_INT) * 4)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define GLINT_CEQCTL_MAX_INDEX 2047 +#define GLINT_CEQCTL_MSIX_INDX_S 0 +#define GLINT_CEQCTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define GLINT_CEQCTL_ITR_INDX_S 11 +#define GLINT_CEQCTL_ITR_INDX_M ICE_M(0x3, 11) +#define GLINT_CEQCTL_CAUSE_ENA_S 30 +#define GLINT_CEQCTL_CAUSE_ENA_M BIT(30) +#define GLINT_CEQCTL_INTEVENT_S 31 +#define GLINT_CEQCTL_INTEVENT_M BIT(31) +#define GLINT_CTL 0x0016CC54 /* Reset Source: CORER */ +#define GLINT_CTL_DIS_AUTOMASK_S 0 #define GLINT_CTL_DIS_AUTOMASK_M BIT(0) +#define GLINT_CTL_RSVD_S 1 +#define GLINT_CTL_RSVD_M ICE_M(0x7FFF, 1) #define GLINT_CTL_ITR_GRAN_200_S 16 #define GLINT_CTL_ITR_GRAN_200_M ICE_M(0xF, 16) #define GLINT_CTL_ITR_GRAN_100_S 20 @@ -124,106 +4491,868 @@ #define GLINT_CTL_ITR_GRAN_50_M ICE_M(0xF, 24) #define GLINT_CTL_ITR_GRAN_25_S 28 #define GLINT_CTL_ITR_GRAN_25_M ICE_M(0xF, 28) -#define GLINT_DYN_CTL(_INT) (0x00160000 + ((_INT) * 4)) +#define GLINT_DYN_CTL(_INT) (0x00160000 + ((_INT) * 4)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define GLINT_DYN_CTL_MAX_INDEX 2047 +#define GLINT_DYN_CTL_INTENA_S 0 #define GLINT_DYN_CTL_INTENA_M BIT(0) +#define GLINT_DYN_CTL_CLEARPBA_S 1 #define GLINT_DYN_CTL_CLEARPBA_M BIT(1) +#define GLINT_DYN_CTL_SWINT_TRIG_S 2 #define GLINT_DYN_CTL_SWINT_TRIG_M BIT(2) #define GLINT_DYN_CTL_ITR_INDX_S 3 #define GLINT_DYN_CTL_ITR_INDX_M 
ICE_M(0x3, 3) #define GLINT_DYN_CTL_INTERVAL_S 5 #define GLINT_DYN_CTL_INTERVAL_M ICE_M(0xFFF, 5) +#define GLINT_DYN_CTL_SW_ITR_INDX_ENA_S 24 +#define GLINT_DYN_CTL_SW_ITR_INDX_ENA_M BIT(24) +#define GLINT_DYN_CTL_SW_ITR_INDX_S 25 #define GLINT_DYN_CTL_SW_ITR_INDX_M ICE_M(0x3, 25) +#define GLINT_DYN_CTL_WB_ON_ITR_S 30 #define GLINT_DYN_CTL_WB_ON_ITR_M BIT(30) +#define GLINT_DYN_CTL_INTENA_MSK_S 31 #define GLINT_DYN_CTL_INTENA_MSK_M BIT(31) -#define GLINT_ITR(_i, _INT) (0x00154000 + ((_i) * 8192 + (_INT) * 4)) -#define GLINT_RATE(_INT) (0x0015A000 + ((_INT) * 4)) +#define GLINT_FW_TOOL_CTL 0x0016C840 /* Reset Source: CORER */ +#define GLINT_FW_TOOL_CTL_MSIX_INDX_S 0 +#define GLINT_FW_TOOL_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define GLINT_FW_TOOL_CTL_ITR_INDX_S 11 +#define GLINT_FW_TOOL_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define GLINT_FW_TOOL_CTL_CAUSE_ENA_S 30 +#define GLINT_FW_TOOL_CTL_CAUSE_ENA_M BIT(30) +#define GLINT_FW_TOOL_CTL_INTEVENT_S 31 +#define GLINT_FW_TOOL_CTL_INTEVENT_M BIT(31) +#define GLINT_ITR(_i, _INT) (0x00154000 + ((_i) * 8192 + (_INT) * 4)) /* _i=0...2, _INT=0...2047 */ /* Reset Source: CORER */ +#define GLINT_ITR_MAX_INDEX 2 +#define GLINT_ITR_INTERVAL_S 0 +#define GLINT_ITR_INTERVAL_M ICE_M(0xFFF, 0) +#define GLINT_RATE(_INT) (0x0015A000 + ((_INT) * 4)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define GLINT_RATE_MAX_INDEX 2047 +#define GLINT_RATE_INTERVAL_S 0 +#define GLINT_RATE_INTERVAL_M ICE_M(0x3F, 0) +#define GLINT_RATE_INTRL_ENA_S 6 #define GLINT_RATE_INTRL_ENA_M BIT(6) -#define GLINT_VECT2FUNC(_INT) (0x00162000 + ((_INT) * 4)) +#define GLINT_TSYN_PFMSTR(_i) (0x0016CCC0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLINT_TSYN_PFMSTR_MAX_INDEX 1 +#define GLINT_TSYN_PFMSTR_PF_MASTER_S 0 +#define GLINT_TSYN_PFMSTR_PF_MASTER_M ICE_M(0x7, 0) +#define GLINT_TSYN_PHY 0x0016CC50 /* Reset Source: CORER */ +#define GLINT_TSYN_PHY_PHY_INDX_S 0 +#define GLINT_TSYN_PHY_PHY_INDX_M ICE_M(0x1F, 0) +#define GLINT_VECT2FUNC(_INT) (0x00162000 + ((_INT) * 4)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define GLINT_VECT2FUNC_MAX_INDEX 2047 #define GLINT_VECT2FUNC_VF_NUM_S 0 #define GLINT_VECT2FUNC_VF_NUM_M ICE_M(0xFF, 0) #define GLINT_VECT2FUNC_PF_NUM_S 12 #define GLINT_VECT2FUNC_PF_NUM_M ICE_M(0x7, 12) #define GLINT_VECT2FUNC_IS_PF_S 16 #define GLINT_VECT2FUNC_IS_PF_M BIT(16) -#define PFINT_FW_CTL 0x0016C800 +#define PF0INT_FW_HLP_CTL 0x0016C844 /* Reset Source: CORER */ +#define PF0INT_FW_HLP_CTL_MSIX_INDX_S 0 +#define PF0INT_FW_HLP_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_FW_HLP_CTL_ITR_INDX_S 11 +#define PF0INT_FW_HLP_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_FW_HLP_CTL_CAUSE_ENA_S 30 +#define PF0INT_FW_HLP_CTL_CAUSE_ENA_M BIT(30) +#define PF0INT_FW_HLP_CTL_INTEVENT_S 31 +#define PF0INT_FW_HLP_CTL_INTEVENT_M BIT(31) +#define PF0INT_FW_PSM_CTL 0x0016C848 /* Reset Source: CORER */ +#define PF0INT_FW_PSM_CTL_MSIX_INDX_S 0 +#define PF0INT_FW_PSM_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_FW_PSM_CTL_ITR_INDX_S 11 +#define PF0INT_FW_PSM_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_FW_PSM_CTL_CAUSE_ENA_S 30 +#define PF0INT_FW_PSM_CTL_CAUSE_ENA_M BIT(30) +#define PF0INT_FW_PSM_CTL_INTEVENT_S 31 +#define PF0INT_FW_PSM_CTL_INTEVENT_M BIT(31) +#define PF0INT_MBX_CPM_CTL 0x0016B2C0 /* Reset Source: CORER */ +#define PF0INT_MBX_CPM_CTL_MSIX_INDX_S 0 +#define PF0INT_MBX_CPM_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_MBX_CPM_CTL_ITR_INDX_S 11 +#define PF0INT_MBX_CPM_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_MBX_CPM_CTL_CAUSE_ENA_S 30 +#define 
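/*
 * Editor's sketch, not part of this patch: GLINT_DYN_CTL(_INT) is the
 * per-vector dynamic interrupt control register defined above.  Enabling
 * a vector amounts to OR-ing the pre-shifted _M masks and shifting index
 * values by their _S counterparts; wr32() is assumed to be the MMIO write
 * helper from ice_osdep.h, and the function name is a placeholder.
 */
static void ice_sketch_irq_dynamic_ena(struct ice_hw *hw, u16 vector,
				       u8 itr_idx)
{
	u32 val = GLINT_DYN_CTL_INTENA_M | GLINT_DYN_CTL_CLEARPBA_M |
		  (itr_idx << GLINT_DYN_CTL_ITR_INDX_S);

	wr32(hw, GLINT_DYN_CTL(vector), val);
}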
PF0INT_MBX_CPM_CTL_CAUSE_ENA_M BIT(30) +#define PF0INT_MBX_CPM_CTL_INTEVENT_S 31 +#define PF0INT_MBX_CPM_CTL_INTEVENT_M BIT(31) +#define PF0INT_MBX_HLP_CTL 0x0016B2C4 /* Reset Source: CORER */ +#define PF0INT_MBX_HLP_CTL_MSIX_INDX_S 0 +#define PF0INT_MBX_HLP_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_MBX_HLP_CTL_ITR_INDX_S 11 +#define PF0INT_MBX_HLP_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_MBX_HLP_CTL_CAUSE_ENA_S 30 +#define PF0INT_MBX_HLP_CTL_CAUSE_ENA_M BIT(30) +#define PF0INT_MBX_HLP_CTL_INTEVENT_S 31 +#define PF0INT_MBX_HLP_CTL_INTEVENT_M BIT(31) +#define PF0INT_MBX_PSM_CTL 0x0016B2C8 /* Reset Source: CORER */ +#define PF0INT_MBX_PSM_CTL_MSIX_INDX_S 0 +#define PF0INT_MBX_PSM_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_MBX_PSM_CTL_ITR_INDX_S 11 +#define PF0INT_MBX_PSM_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_MBX_PSM_CTL_CAUSE_ENA_S 30 +#define PF0INT_MBX_PSM_CTL_CAUSE_ENA_M BIT(30) +#define PF0INT_MBX_PSM_CTL_INTEVENT_S 31 +#define PF0INT_MBX_PSM_CTL_INTEVENT_M BIT(31) +#define PF0INT_OICR_CPM 0x0016CC40 /* Reset Source: CORER */ +#define PF0INT_OICR_CPM_INTEVENT_S 0 +#define PF0INT_OICR_CPM_INTEVENT_M BIT(0) +#define PF0INT_OICR_CPM_QUEUE_S 1 +#define PF0INT_OICR_CPM_QUEUE_M BIT(1) +#define PF0INT_OICR_CPM_RSV1_S 2 +#define PF0INT_OICR_CPM_RSV1_M ICE_M(0xFF, 2) +#define PF0INT_OICR_CPM_HH_COMP_S 10 +#define PF0INT_OICR_CPM_HH_COMP_M BIT(10) +#define PF0INT_OICR_CPM_TSYN_TX_S 11 +#define PF0INT_OICR_CPM_TSYN_TX_M BIT(11) +#define PF0INT_OICR_CPM_TSYN_EVNT_S 12 +#define PF0INT_OICR_CPM_TSYN_EVNT_M BIT(12) +#define PF0INT_OICR_CPM_TSYN_TGT_S 13 +#define PF0INT_OICR_CPM_TSYN_TGT_M BIT(13) +#define PF0INT_OICR_CPM_HLP_RDY_S 14 +#define PF0INT_OICR_CPM_HLP_RDY_M BIT(14) +#define PF0INT_OICR_CPM_CPM_RDY_S 15 +#define PF0INT_OICR_CPM_CPM_RDY_M BIT(15) +#define PF0INT_OICR_CPM_ECC_ERR_S 16 +#define PF0INT_OICR_CPM_ECC_ERR_M BIT(16) +#define PF0INT_OICR_CPM_RSV2_S 17 +#define PF0INT_OICR_CPM_RSV2_M ICE_M(0x3, 17) +#define PF0INT_OICR_CPM_MAL_DETECT_S 19 +#define PF0INT_OICR_CPM_MAL_DETECT_M BIT(19) +#define PF0INT_OICR_CPM_GRST_S 20 +#define PF0INT_OICR_CPM_GRST_M BIT(20) +#define PF0INT_OICR_CPM_PCI_EXCEPTION_S 21 +#define PF0INT_OICR_CPM_PCI_EXCEPTION_M BIT(21) +#define PF0INT_OICR_CPM_GPIO_S 22 +#define PF0INT_OICR_CPM_GPIO_M BIT(22) +#define PF0INT_OICR_CPM_RSV3_S 23 +#define PF0INT_OICR_CPM_RSV3_M BIT(23) +#define PF0INT_OICR_CPM_STORM_DETECT_S 24 +#define PF0INT_OICR_CPM_STORM_DETECT_M BIT(24) +#define PF0INT_OICR_CPM_LINK_STAT_CHANGE_S 25 +#define PF0INT_OICR_CPM_LINK_STAT_CHANGE_M BIT(25) +#define PF0INT_OICR_CPM_HMC_ERR_S 26 +#define PF0INT_OICR_CPM_HMC_ERR_M BIT(26) +#define PF0INT_OICR_CPM_PE_PUSH_S 27 +#define PF0INT_OICR_CPM_PE_PUSH_M BIT(27) +#define PF0INT_OICR_CPM_PE_CRITERR_S 28 +#define PF0INT_OICR_CPM_PE_CRITERR_M BIT(28) +#define PF0INT_OICR_CPM_VFLR_S 29 +#define PF0INT_OICR_CPM_VFLR_M BIT(29) +#define PF0INT_OICR_CPM_XLR_HW_DONE_S 30 +#define PF0INT_OICR_CPM_XLR_HW_DONE_M BIT(30) +#define PF0INT_OICR_CPM_SWINT_S 31 +#define PF0INT_OICR_CPM_SWINT_M BIT(31) +#define PF0INT_OICR_CTL_CPM 0x0016CC48 /* Reset Source: CORER */ +#define PF0INT_OICR_CTL_CPM_MSIX_INDX_S 0 +#define PF0INT_OICR_CTL_CPM_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_OICR_CTL_CPM_ITR_INDX_S 11 +#define PF0INT_OICR_CTL_CPM_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_OICR_CTL_CPM_CAUSE_ENA_S 30 +#define PF0INT_OICR_CTL_CPM_CAUSE_ENA_M BIT(30) +#define PF0INT_OICR_CTL_CPM_INTEVENT_S 31 +#define PF0INT_OICR_CTL_CPM_INTEVENT_M BIT(31) +#define PF0INT_OICR_CTL_HLP 0x0016CC5C /* Reset Source: 
CORER */ +#define PF0INT_OICR_CTL_HLP_MSIX_INDX_S 0 +#define PF0INT_OICR_CTL_HLP_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_OICR_CTL_HLP_ITR_INDX_S 11 +#define PF0INT_OICR_CTL_HLP_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_OICR_CTL_HLP_CAUSE_ENA_S 30 +#define PF0INT_OICR_CTL_HLP_CAUSE_ENA_M BIT(30) +#define PF0INT_OICR_CTL_HLP_INTEVENT_S 31 +#define PF0INT_OICR_CTL_HLP_INTEVENT_M BIT(31) +#define PF0INT_OICR_CTL_PSM 0x0016CC64 /* Reset Source: CORER */ +#define PF0INT_OICR_CTL_PSM_MSIX_INDX_S 0 +#define PF0INT_OICR_CTL_PSM_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_OICR_CTL_PSM_ITR_INDX_S 11 +#define PF0INT_OICR_CTL_PSM_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_OICR_CTL_PSM_CAUSE_ENA_S 30 +#define PF0INT_OICR_CTL_PSM_CAUSE_ENA_M BIT(30) +#define PF0INT_OICR_CTL_PSM_INTEVENT_S 31 +#define PF0INT_OICR_CTL_PSM_INTEVENT_M BIT(31) +#define PF0INT_OICR_ENA_CPM 0x0016CC60 /* Reset Source: CORER */ +#define PF0INT_OICR_ENA_CPM_RSV0_S 0 +#define PF0INT_OICR_ENA_CPM_RSV0_M BIT(0) +#define PF0INT_OICR_ENA_CPM_INT_ENA_S 1 +#define PF0INT_OICR_ENA_CPM_INT_ENA_M ICE_M(0x7FFFFFFF, 1) +#define PF0INT_OICR_ENA_HLP 0x0016CC4C /* Reset Source: CORER */ +#define PF0INT_OICR_ENA_HLP_RSV0_S 0 +#define PF0INT_OICR_ENA_HLP_RSV0_M BIT(0) +#define PF0INT_OICR_ENA_HLP_INT_ENA_S 1 +#define PF0INT_OICR_ENA_HLP_INT_ENA_M ICE_M(0x7FFFFFFF, 1) +#define PF0INT_OICR_ENA_PSM 0x0016CC58 /* Reset Source: CORER */ +#define PF0INT_OICR_ENA_PSM_RSV0_S 0 +#define PF0INT_OICR_ENA_PSM_RSV0_M BIT(0) +#define PF0INT_OICR_ENA_PSM_INT_ENA_S 1 +#define PF0INT_OICR_ENA_PSM_INT_ENA_M ICE_M(0x7FFFFFFF, 1) +#define PF0INT_OICR_HLP 0x0016CC68 /* Reset Source: CORER */ +#define PF0INT_OICR_HLP_INTEVENT_S 0 +#define PF0INT_OICR_HLP_INTEVENT_M BIT(0) +#define PF0INT_OICR_HLP_QUEUE_S 1 +#define PF0INT_OICR_HLP_QUEUE_M BIT(1) +#define PF0INT_OICR_HLP_RSV1_S 2 +#define PF0INT_OICR_HLP_RSV1_M ICE_M(0xFF, 2) +#define PF0INT_OICR_HLP_HH_COMP_S 10 +#define PF0INT_OICR_HLP_HH_COMP_M BIT(10) +#define PF0INT_OICR_HLP_TSYN_TX_S 11 +#define PF0INT_OICR_HLP_TSYN_TX_M BIT(11) +#define PF0INT_OICR_HLP_TSYN_EVNT_S 12 +#define PF0INT_OICR_HLP_TSYN_EVNT_M BIT(12) +#define PF0INT_OICR_HLP_TSYN_TGT_S 13 +#define PF0INT_OICR_HLP_TSYN_TGT_M BIT(13) +#define PF0INT_OICR_HLP_HLP_RDY_S 14 +#define PF0INT_OICR_HLP_HLP_RDY_M BIT(14) +#define PF0INT_OICR_HLP_CPM_RDY_S 15 +#define PF0INT_OICR_HLP_CPM_RDY_M BIT(15) +#define PF0INT_OICR_HLP_ECC_ERR_S 16 +#define PF0INT_OICR_HLP_ECC_ERR_M BIT(16) +#define PF0INT_OICR_HLP_RSV2_S 17 +#define PF0INT_OICR_HLP_RSV2_M ICE_M(0x3, 17) +#define PF0INT_OICR_HLP_MAL_DETECT_S 19 +#define PF0INT_OICR_HLP_MAL_DETECT_M BIT(19) +#define PF0INT_OICR_HLP_GRST_S 20 +#define PF0INT_OICR_HLP_GRST_M BIT(20) +#define PF0INT_OICR_HLP_PCI_EXCEPTION_S 21 +#define PF0INT_OICR_HLP_PCI_EXCEPTION_M BIT(21) +#define PF0INT_OICR_HLP_GPIO_S 22 +#define PF0INT_OICR_HLP_GPIO_M BIT(22) +#define PF0INT_OICR_HLP_RSV3_S 23 +#define PF0INT_OICR_HLP_RSV3_M BIT(23) +#define PF0INT_OICR_HLP_STORM_DETECT_S 24 +#define PF0INT_OICR_HLP_STORM_DETECT_M BIT(24) +#define PF0INT_OICR_HLP_LINK_STAT_CHANGE_S 25 +#define PF0INT_OICR_HLP_LINK_STAT_CHANGE_M BIT(25) +#define PF0INT_OICR_HLP_HMC_ERR_S 26 +#define PF0INT_OICR_HLP_HMC_ERR_M BIT(26) +#define PF0INT_OICR_HLP_PE_PUSH_S 27 +#define PF0INT_OICR_HLP_PE_PUSH_M BIT(27) +#define PF0INT_OICR_HLP_PE_CRITERR_S 28 +#define PF0INT_OICR_HLP_PE_CRITERR_M BIT(28) +#define PF0INT_OICR_HLP_VFLR_S 29 +#define PF0INT_OICR_HLP_VFLR_M BIT(29) +#define PF0INT_OICR_HLP_XLR_HW_DONE_S 30 +#define PF0INT_OICR_HLP_XLR_HW_DONE_M BIT(30) +#define 
PF0INT_OICR_HLP_SWINT_S 31 +#define PF0INT_OICR_HLP_SWINT_M BIT(31) +#define PF0INT_OICR_PSM 0x0016CC44 /* Reset Source: CORER */ +#define PF0INT_OICR_PSM_INTEVENT_S 0 +#define PF0INT_OICR_PSM_INTEVENT_M BIT(0) +#define PF0INT_OICR_PSM_QUEUE_S 1 +#define PF0INT_OICR_PSM_QUEUE_M BIT(1) +#define PF0INT_OICR_PSM_RSV1_S 2 +#define PF0INT_OICR_PSM_RSV1_M ICE_M(0xFF, 2) +#define PF0INT_OICR_PSM_HH_COMP_S 10 +#define PF0INT_OICR_PSM_HH_COMP_M BIT(10) +#define PF0INT_OICR_PSM_TSYN_TX_S 11 +#define PF0INT_OICR_PSM_TSYN_TX_M BIT(11) +#define PF0INT_OICR_PSM_TSYN_EVNT_S 12 +#define PF0INT_OICR_PSM_TSYN_EVNT_M BIT(12) +#define PF0INT_OICR_PSM_TSYN_TGT_S 13 +#define PF0INT_OICR_PSM_TSYN_TGT_M BIT(13) +#define PF0INT_OICR_PSM_HLP_RDY_S 14 +#define PF0INT_OICR_PSM_HLP_RDY_M BIT(14) +#define PF0INT_OICR_PSM_CPM_RDY_S 15 +#define PF0INT_OICR_PSM_CPM_RDY_M BIT(15) +#define PF0INT_OICR_PSM_ECC_ERR_S 16 +#define PF0INT_OICR_PSM_ECC_ERR_M BIT(16) +#define PF0INT_OICR_PSM_RSV2_S 17 +#define PF0INT_OICR_PSM_RSV2_M ICE_M(0x3, 17) +#define PF0INT_OICR_PSM_MAL_DETECT_S 19 +#define PF0INT_OICR_PSM_MAL_DETECT_M BIT(19) +#define PF0INT_OICR_PSM_GRST_S 20 +#define PF0INT_OICR_PSM_GRST_M BIT(20) +#define PF0INT_OICR_PSM_PCI_EXCEPTION_S 21 +#define PF0INT_OICR_PSM_PCI_EXCEPTION_M BIT(21) +#define PF0INT_OICR_PSM_GPIO_S 22 +#define PF0INT_OICR_PSM_GPIO_M BIT(22) +#define PF0INT_OICR_PSM_RSV3_S 23 +#define PF0INT_OICR_PSM_RSV3_M BIT(23) +#define PF0INT_OICR_PSM_STORM_DETECT_S 24 +#define PF0INT_OICR_PSM_STORM_DETECT_M BIT(24) +#define PF0INT_OICR_PSM_LINK_STAT_CHANGE_S 25 +#define PF0INT_OICR_PSM_LINK_STAT_CHANGE_M BIT(25) +#define PF0INT_OICR_PSM_HMC_ERR_S 26 +#define PF0INT_OICR_PSM_HMC_ERR_M BIT(26) +#define PF0INT_OICR_PSM_PE_PUSH_S 27 +#define PF0INT_OICR_PSM_PE_PUSH_M BIT(27) +#define PF0INT_OICR_PSM_PE_CRITERR_S 28 +#define PF0INT_OICR_PSM_PE_CRITERR_M BIT(28) +#define PF0INT_OICR_PSM_VFLR_S 29 +#define PF0INT_OICR_PSM_VFLR_M BIT(29) +#define PF0INT_OICR_PSM_XLR_HW_DONE_S 30 +#define PF0INT_OICR_PSM_XLR_HW_DONE_M BIT(30) +#define PF0INT_OICR_PSM_SWINT_S 31 +#define PF0INT_OICR_PSM_SWINT_M BIT(31) +#define PF0INT_SB_CPM_CTL 0x0016B2CC /* Reset Source: CORER */ +#define PF0INT_SB_CPM_CTL_MSIX_INDX_S 0 +#define PF0INT_SB_CPM_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_SB_CPM_CTL_ITR_INDX_S 11 +#define PF0INT_SB_CPM_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_SB_CPM_CTL_CAUSE_ENA_S 30 +#define PF0INT_SB_CPM_CTL_CAUSE_ENA_M BIT(30) +#define PF0INT_SB_CPM_CTL_INTEVENT_S 31 +#define PF0INT_SB_CPM_CTL_INTEVENT_M BIT(31) +#define PF0INT_SB_HLP_CTL 0x0016B640 /* Reset Source: CORER */ +#define PF0INT_SB_HLP_CTL_MSIX_INDX_S 0 +#define PF0INT_SB_HLP_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_SB_HLP_CTL_ITR_INDX_S 11 +#define PF0INT_SB_HLP_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_SB_HLP_CTL_CAUSE_ENA_S 30 +#define PF0INT_SB_HLP_CTL_CAUSE_ENA_M BIT(30) +#define PF0INT_SB_HLP_CTL_INTEVENT_S 31 +#define PF0INT_SB_HLP_CTL_INTEVENT_M BIT(31) +#define PFINT_AEQCTL 0x0016CB00 /* Reset Source: CORER */ +#define PFINT_AEQCTL_MSIX_INDX_S 0 +#define PFINT_AEQCTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PFINT_AEQCTL_ITR_INDX_S 11 +#define PFINT_AEQCTL_ITR_INDX_M ICE_M(0x3, 11) +#define PFINT_AEQCTL_CAUSE_ENA_S 30 +#define PFINT_AEQCTL_CAUSE_ENA_M BIT(30) +#define PFINT_AEQCTL_INTEVENT_S 31 +#define PFINT_AEQCTL_INTEVENT_M BIT(31) +#define PFINT_ALLOC 0x001D2600 /* Reset Source: CORER */ +#define PFINT_ALLOC_FIRST_S 0 +#define PFINT_ALLOC_FIRST_M ICE_M(0x7FF, 0) +#define PFINT_ALLOC_LAST_S 12 +#define PFINT_ALLOC_LAST_M 
ICE_M(0x7FF, 12) +#define PFINT_ALLOC_VALID_S 31 +#define PFINT_ALLOC_VALID_M BIT(31) +#define PFINT_ALLOC_PCI 0x0009D800 /* Reset Source: PCIR */ +#define PFINT_ALLOC_PCI_FIRST_S 0 +#define PFINT_ALLOC_PCI_FIRST_M ICE_M(0x7FF, 0) +#define PFINT_ALLOC_PCI_LAST_S 12 +#define PFINT_ALLOC_PCI_LAST_M ICE_M(0x7FF, 12) +#define PFINT_ALLOC_PCI_VALID_S 31 +#define PFINT_ALLOC_PCI_VALID_M BIT(31) +#define PFINT_FW_CTL 0x0016C800 /* Reset Source: CORER */ +#define PFINT_FW_CTL_MSIX_INDX_S 0 #define PFINT_FW_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) #define PFINT_FW_CTL_ITR_INDX_S 11 #define PFINT_FW_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define PFINT_FW_CTL_CAUSE_ENA_S 30 #define PFINT_FW_CTL_CAUSE_ENA_M BIT(30) -#define PFINT_MBX_CTL 0x0016B280 +#define PFINT_FW_CTL_INTEVENT_S 31 +#define PFINT_FW_CTL_INTEVENT_M BIT(31) +#define PFINT_GPIO_ENA 0x00088080 /* Reset Source: CORER */ +#define PFINT_GPIO_ENA_GPIO0_ENA_S 0 +#define PFINT_GPIO_ENA_GPIO0_ENA_M BIT(0) +#define PFINT_GPIO_ENA_GPIO1_ENA_S 1 +#define PFINT_GPIO_ENA_GPIO1_ENA_M BIT(1) +#define PFINT_GPIO_ENA_GPIO2_ENA_S 2 +#define PFINT_GPIO_ENA_GPIO2_ENA_M BIT(2) +#define PFINT_GPIO_ENA_GPIO3_ENA_S 3 +#define PFINT_GPIO_ENA_GPIO3_ENA_M BIT(3) +#define PFINT_GPIO_ENA_GPIO4_ENA_S 4 +#define PFINT_GPIO_ENA_GPIO4_ENA_M BIT(4) +#define PFINT_GPIO_ENA_GPIO5_ENA_S 5 +#define PFINT_GPIO_ENA_GPIO5_ENA_M BIT(5) +#define PFINT_GPIO_ENA_GPIO6_ENA_S 6 +#define PFINT_GPIO_ENA_GPIO6_ENA_M BIT(6) +#define PFINT_MBX_CTL 0x0016B280 /* Reset Source: CORER */ +#define PFINT_MBX_CTL_MSIX_INDX_S 0 #define PFINT_MBX_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) #define PFINT_MBX_CTL_ITR_INDX_S 11 #define PFINT_MBX_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define PFINT_MBX_CTL_CAUSE_ENA_S 30 #define PFINT_MBX_CTL_CAUSE_ENA_M BIT(30) -#define PFINT_OICR 0x0016CA00 +#define PFINT_MBX_CTL_INTEVENT_S 31 +#define PFINT_MBX_CTL_INTEVENT_M BIT(31) +#define PFINT_OICR 0x0016CA00 /* Reset Source: CORER */ +#define PFINT_OICR_INTEVENT_S 0 +#define PFINT_OICR_INTEVENT_M BIT(0) +#define PFINT_OICR_QUEUE_S 1 +#define PFINT_OICR_QUEUE_M BIT(1) +#define PFINT_OICR_RSV1_S 2 +#define PFINT_OICR_RSV1_M ICE_M(0xFF, 2) +#define PFINT_OICR_HH_COMP_S 10 +#define PFINT_OICR_HH_COMP_M BIT(10) +#define PFINT_OICR_TSYN_TX_S 11 +#define PFINT_OICR_TSYN_TX_M BIT(11) +#define PFINT_OICR_TSYN_EVNT_S 12 +#define PFINT_OICR_TSYN_EVNT_M BIT(12) +#define PFINT_OICR_TSYN_TGT_S 13 +#define PFINT_OICR_TSYN_TGT_M BIT(13) +#define PFINT_OICR_HLP_RDY_S 14 +#define PFINT_OICR_HLP_RDY_M BIT(14) +#define PFINT_OICR_CPM_RDY_S 15 +#define PFINT_OICR_CPM_RDY_M BIT(15) +#define PFINT_OICR_ECC_ERR_S 16 #define PFINT_OICR_ECC_ERR_M BIT(16) +#define PFINT_OICR_RSV2_S 17 +#define PFINT_OICR_RSV2_M ICE_M(0x3, 17) +#define PFINT_OICR_MAL_DETECT_S 19 #define PFINT_OICR_MAL_DETECT_M BIT(19) +#define PFINT_OICR_GRST_S 20 #define PFINT_OICR_GRST_M BIT(20) +#define PFINT_OICR_PCI_EXCEPTION_S 21 #define PFINT_OICR_PCI_EXCEPTION_M BIT(21) +#define PFINT_OICR_GPIO_S 22 +#define PFINT_OICR_GPIO_M BIT(22) +#define PFINT_OICR_RSV3_S 23 +#define PFINT_OICR_RSV3_M BIT(23) +#define PFINT_OICR_STORM_DETECT_S 24 +#define PFINT_OICR_STORM_DETECT_M BIT(24) +#define PFINT_OICR_LINK_STAT_CHANGE_S 25 +#define PFINT_OICR_LINK_STAT_CHANGE_M BIT(25) +#define PFINT_OICR_HMC_ERR_S 26 #define PFINT_OICR_HMC_ERR_M BIT(26) +#define PFINT_OICR_PE_PUSH_S 27 +#define PFINT_OICR_PE_PUSH_M BIT(27) +#define PFINT_OICR_PE_CRITERR_S 28 #define PFINT_OICR_PE_CRITERR_M BIT(28) +#define PFINT_OICR_VFLR_S 29 #define PFINT_OICR_VFLR_M BIT(29) +#define PFINT_OICR_XLR_HW_DONE_S 30 +#define 
PFINT_OICR_XLR_HW_DONE_M BIT(30) +#define PFINT_OICR_SWINT_S 31 #define PFINT_OICR_SWINT_M BIT(31) -#define PFINT_OICR_CTL 0x0016CA80 +#define PFINT_OICR_CTL 0x0016CA80 /* Reset Source: CORER */ +#define PFINT_OICR_CTL_MSIX_INDX_S 0 #define PFINT_OICR_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) #define PFINT_OICR_CTL_ITR_INDX_S 11 #define PFINT_OICR_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define PFINT_OICR_CTL_CAUSE_ENA_S 30 #define PFINT_OICR_CTL_CAUSE_ENA_M BIT(30) -#define PFINT_OICR_ENA 0x0016C900 -#define QINT_RQCTL(_QRX) (0x00150000 + ((_QRX) * 4)) +#define PFINT_OICR_CTL_INTEVENT_S 31 +#define PFINT_OICR_CTL_INTEVENT_M BIT(31) +#define PFINT_OICR_ENA 0x0016C900 /* Reset Source: CORER */ +#define PFINT_OICR_ENA_RSV0_S 0 +#define PFINT_OICR_ENA_RSV0_M BIT(0) +#define PFINT_OICR_ENA_INT_ENA_S 1 +#define PFINT_OICR_ENA_INT_ENA_M ICE_M(0x7FFFFFFF, 1) +#define PFINT_SB_CTL 0x0016B600 /* Reset Source: CORER */ +#define PFINT_SB_CTL_MSIX_INDX_S 0 +#define PFINT_SB_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PFINT_SB_CTL_ITR_INDX_S 11 +#define PFINT_SB_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define PFINT_SB_CTL_CAUSE_ENA_S 30 +#define PFINT_SB_CTL_CAUSE_ENA_M BIT(30) +#define PFINT_SB_CTL_INTEVENT_S 31 +#define PFINT_SB_CTL_INTEVENT_M BIT(31) +#define PFINT_TSYN_MSK 0x0016C980 /* Reset Source: CORER */ +#define PFINT_TSYN_MSK_PHY_INDX_S 0 +#define PFINT_TSYN_MSK_PHY_INDX_M ICE_M(0x1F, 0) +#define QINT_RQCTL(_QRX) (0x00150000 + ((_QRX) * 4)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define QINT_RQCTL_MAX_INDEX 2047 #define QINT_RQCTL_MSIX_INDX_S 0 #define QINT_RQCTL_MSIX_INDX_M ICE_M(0x7FF, 0) #define QINT_RQCTL_ITR_INDX_S 11 #define QINT_RQCTL_ITR_INDX_M ICE_M(0x3, 11) +#define QINT_RQCTL_CAUSE_ENA_S 30 #define QINT_RQCTL_CAUSE_ENA_M BIT(30) -#define QINT_TQCTL(_DBQM) (0x00140000 + ((_DBQM) * 4)) +#define QINT_RQCTL_INTEVENT_S 31 +#define QINT_RQCTL_INTEVENT_M BIT(31) +#define QINT_TQCTL(_DBQM) (0x00140000 + ((_DBQM) * 4)) /* _i=0...16383 */ /* Reset Source: CORER */ +#define QINT_TQCTL_MAX_INDEX 16383 #define QINT_TQCTL_MSIX_INDX_S 0 #define QINT_TQCTL_MSIX_INDX_M ICE_M(0x7FF, 0) #define QINT_TQCTL_ITR_INDX_S 11 #define QINT_TQCTL_ITR_INDX_M ICE_M(0x3, 11) +#define QINT_TQCTL_CAUSE_ENA_S 30 #define QINT_TQCTL_CAUSE_ENA_M BIT(30) -#define VPINT_ALLOC(_VF) (0x001D1000 + ((_VF) * 4)) +#define QINT_TQCTL_INTEVENT_S 31 +#define QINT_TQCTL_INTEVENT_M BIT(31) +#define VPINT_AEQCTL(_VF) (0x0016B800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPINT_AEQCTL_MAX_INDEX 255 +#define VPINT_AEQCTL_MSIX_INDX_S 0 +#define VPINT_AEQCTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define VPINT_AEQCTL_ITR_INDX_S 11 +#define VPINT_AEQCTL_ITR_INDX_M ICE_M(0x3, 11) +#define VPINT_AEQCTL_CAUSE_ENA_S 30 +#define VPINT_AEQCTL_CAUSE_ENA_M BIT(30) +#define VPINT_AEQCTL_INTEVENT_S 31 +#define VPINT_AEQCTL_INTEVENT_M BIT(31) +#define VPINT_ALLOC(_VF) (0x001D1000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPINT_ALLOC_MAX_INDEX 255 #define VPINT_ALLOC_FIRST_S 0 #define VPINT_ALLOC_FIRST_M ICE_M(0x7FF, 0) #define VPINT_ALLOC_LAST_S 12 #define VPINT_ALLOC_LAST_M ICE_M(0x7FF, 12) +#define VPINT_ALLOC_VALID_S 31 #define VPINT_ALLOC_VALID_M BIT(31) -#define VPINT_ALLOC_PCI(_VF) (0x0009D000 + ((_VF) * 4)) +#define VPINT_ALLOC_PCI(_VF) (0x0009D000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PCIR */ +#define VPINT_ALLOC_PCI_MAX_INDEX 255 #define VPINT_ALLOC_PCI_FIRST_S 0 #define VPINT_ALLOC_PCI_FIRST_M ICE_M(0x7FF, 0) #define VPINT_ALLOC_PCI_LAST_S 12 #define VPINT_ALLOC_PCI_LAST_M ICE_M(0x7FF, 12) +#define 
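/*
 * Editor's sketch, not part of this patch: QINT_RQCTL/QINT_TQCTL above
 * bind an Rx/Tx queue's interrupt cause to an MSI-X vector and ITR index.
 * A Tx queue would be pointed at vector msix_idx roughly as below; the
 * same shape applies to QINT_RQCTL for Rx.  Function and parameter names
 * are placeholders.
 */
static void ice_sketch_map_txq_vector(struct ice_hw *hw, u16 txq,
				      u16 msix_idx, u8 itr_idx)
{
	u32 val = QINT_TQCTL_CAUSE_ENA_M |
		  (itr_idx << QINT_TQCTL_ITR_INDX_S) |
		  (msix_idx & QINT_TQCTL_MSIX_INDX_M);

	wr32(hw, QINT_TQCTL(txq), val);
}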
VPINT_ALLOC_PCI_VALID_S 31 #define VPINT_ALLOC_PCI_VALID_M BIT(31) -#define VPINT_MBX_CTL(_VSI) (0x0016A000 + ((_VSI) * 4)) +#define VPINT_MBX_CPM_CTL(_VP128) (0x0016B000 + ((_VP128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VPINT_MBX_CPM_CTL_MAX_INDEX 127 +#define VPINT_MBX_CPM_CTL_MSIX_INDX_S 0 +#define VPINT_MBX_CPM_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define VPINT_MBX_CPM_CTL_ITR_INDX_S 11 +#define VPINT_MBX_CPM_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define VPINT_MBX_CPM_CTL_CAUSE_ENA_S 30 +#define VPINT_MBX_CPM_CTL_CAUSE_ENA_M BIT(30) +#define VPINT_MBX_CPM_CTL_INTEVENT_S 31 +#define VPINT_MBX_CPM_CTL_INTEVENT_M BIT(31) +#define VPINT_MBX_CTL(_VSI) (0x0016A000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VPINT_MBX_CTL_MAX_INDEX 767 +#define VPINT_MBX_CTL_MSIX_INDX_S 0 +#define VPINT_MBX_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define VPINT_MBX_CTL_ITR_INDX_S 11 +#define VPINT_MBX_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define VPINT_MBX_CTL_CAUSE_ENA_S 30 #define VPINT_MBX_CTL_CAUSE_ENA_M BIT(30) -#define GLLAN_RCTL_0 0x002941F8 -#define QRX_CONTEXT(_i, _QRX) (0x00280000 + ((_i) * 8192 + (_QRX) * 4)) -#define QRX_CTRL(_QRX) (0x00120000 + ((_QRX) * 4)) +#define VPINT_MBX_CTL_INTEVENT_S 31 +#define VPINT_MBX_CTL_INTEVENT_M BIT(31) +#define VPINT_MBX_HLP_CTL(_VP16) (0x0016B200 + ((_VP16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VPINT_MBX_HLP_CTL_MAX_INDEX 15 +#define VPINT_MBX_HLP_CTL_MSIX_INDX_S 0 +#define VPINT_MBX_HLP_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define VPINT_MBX_HLP_CTL_ITR_INDX_S 11 +#define VPINT_MBX_HLP_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define VPINT_MBX_HLP_CTL_CAUSE_ENA_S 30 +#define VPINT_MBX_HLP_CTL_CAUSE_ENA_M BIT(30) +#define VPINT_MBX_HLP_CTL_INTEVENT_S 31 +#define VPINT_MBX_HLP_CTL_INTEVENT_M BIT(31) +#define VPINT_MBX_PSM_CTL(_VP16) (0x0016B240 + ((_VP16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VPINT_MBX_PSM_CTL_MAX_INDEX 15 +#define VPINT_MBX_PSM_CTL_MSIX_INDX_S 0 +#define VPINT_MBX_PSM_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define VPINT_MBX_PSM_CTL_ITR_INDX_S 11 +#define VPINT_MBX_PSM_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define VPINT_MBX_PSM_CTL_CAUSE_ENA_S 30 +#define VPINT_MBX_PSM_CTL_CAUSE_ENA_M BIT(30) +#define VPINT_MBX_PSM_CTL_INTEVENT_S 31 +#define VPINT_MBX_PSM_CTL_INTEVENT_M BIT(31) +#define VPINT_SB_CPM_CTL(_VP128) (0x0016B400 + ((_VP128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VPINT_SB_CPM_CTL_MAX_INDEX 127 +#define VPINT_SB_CPM_CTL_MSIX_INDX_S 0 +#define VPINT_SB_CPM_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define VPINT_SB_CPM_CTL_ITR_INDX_S 11 +#define VPINT_SB_CPM_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define VPINT_SB_CPM_CTL_CAUSE_ENA_S 30 +#define VPINT_SB_CPM_CTL_CAUSE_ENA_M BIT(30) +#define VPINT_SB_CPM_CTL_INTEVENT_S 31 +#define VPINT_SB_CPM_CTL_INTEVENT_M BIT(31) +#define GL_HLP_PRT_IPG_PREAMBLE_SIZE(_i) (0x00049240 + ((_i) * 4)) /* _i=0...20 */ /* Reset Source: CORER */ +#define GL_HLP_PRT_IPG_PREAMBLE_SIZE_MAX_INDEX 20 +#define GL_HLP_PRT_IPG_PREAMBLE_SIZE_IPG_PREAMBLE_SIZE_S 0 +#define GL_HLP_PRT_IPG_PREAMBLE_SIZE_IPG_PREAMBLE_SIZE_M ICE_M(0xFF, 0) +#define GL_TDPU_PSM_DEFAULT_RECIPE(_i) (0x00049294 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */ +#define GL_TDPU_PSM_DEFAULT_RECIPE_MAX_INDEX 3 +#define GL_TDPU_PSM_DEFAULT_RECIPE_ADD_IPG_S 0 +#define GL_TDPU_PSM_DEFAULT_RECIPE_ADD_IPG_M BIT(0) +#define GL_TDPU_PSM_DEFAULT_RECIPE_SUB_CRC_S 1 +#define GL_TDPU_PSM_DEFAULT_RECIPE_SUB_CRC_M BIT(1) +#define GL_TDPU_PSM_DEFAULT_RECIPE_SUB_ESP_TRAILER_S 2 +#define GL_TDPU_PSM_DEFAULT_RECIPE_SUB_ESP_TRAILER_M 
BIT(2) +#define GL_TDPU_PSM_DEFAULT_RECIPE_INCLUDE_L2_PAD_S 3 +#define GL_TDPU_PSM_DEFAULT_RECIPE_INCLUDE_L2_PAD_M BIT(3) +#define GL_TDPU_PSM_DEFAULT_RECIPE_DEFAULT_UPDATE_MODE_S 4 +#define GL_TDPU_PSM_DEFAULT_RECIPE_DEFAULT_UPDATE_MODE_M BIT(4) +#define GLLAN_PF_RECIPE(_i) (0x0029420C + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLLAN_PF_RECIPE_MAX_INDEX 7 +#define GLLAN_PF_RECIPE_RECIPE_S 0 +#define GLLAN_PF_RECIPE_RECIPE_M ICE_M(0x3, 0) +#define GLLAN_RCTL_0 0x002941F8 /* Reset Source: CORER */ +#define GLLAN_RCTL_0_PXE_MODE_S 0 +#define GLLAN_RCTL_0_PXE_MODE_M BIT(0) +#define GLLAN_RCTL_1 0x002941FC /* Reset Source: CORER */ +#define GLLAN_RCTL_1_RXMAX_EXPANSION_S 12 +#define GLLAN_RCTL_1_RXMAX_EXPANSION_M ICE_M(0xF, 12) +#define GLLAN_RCTL_1_RXDRDCTL_S 17 +#define GLLAN_RCTL_1_RXDRDCTL_M BIT(17) +#define GLLAN_RCTL_1_RXDESCRDROEN_S 18 +#define GLLAN_RCTL_1_RXDESCRDROEN_M BIT(18) +#define GLLAN_RCTL_1_RXDATAWRROEN_S 19 +#define GLLAN_RCTL_1_RXDATAWRROEN_M BIT(19) +#define GLLAN_TSOMSK_F 0x00049308 /* Reset Source: CORER */ +#define GLLAN_TSOMSK_F_TCPMSKF_S 0 +#define GLLAN_TSOMSK_F_TCPMSKF_M ICE_M(0xFFF, 0) +#define GLLAN_TSOMSK_L 0x00049310 /* Reset Source: CORER */ +#define GLLAN_TSOMSK_L_TCPMSKL_S 0 +#define GLLAN_TSOMSK_L_TCPMSKL_M ICE_M(0xFFF, 0) +#define GLLAN_TSOMSK_M 0x0004930C /* Reset Source: CORER */ +#define GLLAN_TSOMSK_M_TCPMSKM_S 0 +#define GLLAN_TSOMSK_M_TCPMSKM_M ICE_M(0xFFF, 0) +#define PFLAN_CP_QALLOC 0x00075700 /* Reset Source: CORER */ +#define PFLAN_CP_QALLOC_FIRSTQ_S 0 +#define PFLAN_CP_QALLOC_FIRSTQ_M ICE_M(0x1FF, 0) +#define PFLAN_CP_QALLOC_LASTQ_S 16 +#define PFLAN_CP_QALLOC_LASTQ_M ICE_M(0x1FF, 16) +#define PFLAN_CP_QALLOC_VALID_S 31 +#define PFLAN_CP_QALLOC_VALID_M BIT(31) +#define PFLAN_DB_QALLOC 0x00075680 /* Reset Source: CORER */ +#define PFLAN_DB_QALLOC_FIRSTQ_S 0 +#define PFLAN_DB_QALLOC_FIRSTQ_M ICE_M(0xFF, 0) +#define PFLAN_DB_QALLOC_LASTQ_S 16 +#define PFLAN_DB_QALLOC_LASTQ_M ICE_M(0xFF, 16) +#define PFLAN_DB_QALLOC_VALID_S 31 +#define PFLAN_DB_QALLOC_VALID_M BIT(31) +#define PFLAN_RX_QALLOC 0x001D2500 /* Reset Source: CORER */ +#define PFLAN_RX_QALLOC_FIRSTQ_S 0 +#define PFLAN_RX_QALLOC_FIRSTQ_M ICE_M(0x7FF, 0) +#define PFLAN_RX_QALLOC_LASTQ_S 16 +#define PFLAN_RX_QALLOC_LASTQ_M ICE_M(0x7FF, 16) +#define PFLAN_RX_QALLOC_VALID_S 31 +#define PFLAN_RX_QALLOC_VALID_M BIT(31) +#define PFLAN_TX_QALLOC 0x001D2580 /* Reset Source: CORER */ +#define PFLAN_TX_QALLOC_FIRSTQ_S 0 +#define PFLAN_TX_QALLOC_FIRSTQ_M ICE_M(0x3FFF, 0) +#define PFLAN_TX_QALLOC_LASTQ_S 16 +#define PFLAN_TX_QALLOC_LASTQ_M ICE_M(0x3FFF, 16) +#define PFLAN_TX_QALLOC_VALID_S 31 +#define PFLAN_TX_QALLOC_VALID_M BIT(31) +#define PRT_TDPUL2TAGSEN 0x00040BA0 /* Reset Source: CORER */ +#define PRT_TDPUL2TAGSEN_ENABLE_S 0 +#define PRT_TDPUL2TAGSEN_ENABLE_M ICE_M(0xFF, 0) +#define PRT_TDPUL2TAGSEN_NONLAST_TAG_S 8 +#define PRT_TDPUL2TAGSEN_NONLAST_TAG_M ICE_M(0xFF, 8) +#define QRX_CONTEXT(_i, _QRX) (0x00280000 + ((_i) * 8192 + (_QRX) * 4)) /* _i=0...7, _QRX=0...2047 */ /* Reset Source: CORER */ +#define QRX_CONTEXT_MAX_INDEX 7 +#define QRX_CONTEXT_RXQ_CONTEXT_S 0 +#define QRX_CONTEXT_RXQ_CONTEXT_M ICE_M(0xFFFFFFFF, 0) +#define QRX_CTRL(_QRX) (0x00120000 + ((_QRX) * 4)) /* _i=0...2047 */ /* Reset Source: PFR */ #define QRX_CTRL_MAX_INDEX 2047 #define QRX_CTRL_QENA_REQ_S 0 #define QRX_CTRL_QENA_REQ_M BIT(0) +#define QRX_CTRL_FAST_QDIS_S 1 +#define QRX_CTRL_FAST_QDIS_M BIT(1) #define QRX_CTRL_QENA_STAT_S 2 #define QRX_CTRL_QENA_STAT_M BIT(2) -#define QRX_ITR(_QRX) (0x00292000 + 
((_QRX) * 4)) -#define QRX_TAIL(_QRX) (0x00290000 + ((_QRX) * 4)) +#define QRX_CTRL_CDE_S 3 +#define QRX_CTRL_CDE_M BIT(3) +#define QRX_CTRL_CDS_S 4 +#define QRX_CTRL_CDS_M BIT(4) +#define QRX_ITR(_QRX) (0x00292000 + ((_QRX) * 4)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define QRX_ITR_MAX_INDEX 2047 +#define QRX_ITR_NO_EXPR_S 0 +#define QRX_ITR_NO_EXPR_M BIT(0) +#define QRX_TAIL(_QRX) (0x00290000 + ((_QRX) * 4)) /* _i=0...2047 */ /* Reset Source: CORER */ #define QRX_TAIL_MAX_INDEX 2047 #define QRX_TAIL_TAIL_S 0 #define QRX_TAIL_TAIL_M ICE_M(0x1FFF, 0) -#define VPLAN_RX_QBASE(_VF) (0x00072000 + ((_VF) * 4)) +#define VPDSI_RX_QTABLE(_i, _VP16) (0x00074C00 + ((_i) * 64 + (_VP16) * 4)) /* _i=0...15, _VP16=0...15 */ /* Reset Source: CORER */ +#define VPDSI_RX_QTABLE_MAX_INDEX 15 +#define VPDSI_RX_QTABLE_PAGE_INDEX0_S 0 +#define VPDSI_RX_QTABLE_PAGE_INDEX0_M ICE_M(0x7F, 0) +#define VPDSI_RX_QTABLE_PAGE_INDEX1_S 8 +#define VPDSI_RX_QTABLE_PAGE_INDEX1_M ICE_M(0x7F, 8) +#define VPDSI_RX_QTABLE_PAGE_INDEX2_S 16 +#define VPDSI_RX_QTABLE_PAGE_INDEX2_M ICE_M(0x7F, 16) +#define VPDSI_RX_QTABLE_PAGE_INDEX3_S 24 +#define VPDSI_RX_QTABLE_PAGE_INDEX3_M ICE_M(0x7F, 24) +#define VPDSI_TX_QTABLE(_i, _VP16) (0x001D2000 + ((_i) * 64 + (_VP16) * 4)) /* _i=0...15, _VP16=0...15 */ /* Reset Source: CORER */ +#define VPDSI_TX_QTABLE_MAX_INDEX 15 +#define VPDSI_TX_QTABLE_PAGE_INDEX0_S 0 +#define VPDSI_TX_QTABLE_PAGE_INDEX0_M ICE_M(0x7F, 0) +#define VPDSI_TX_QTABLE_PAGE_INDEX1_S 8 +#define VPDSI_TX_QTABLE_PAGE_INDEX1_M ICE_M(0x7F, 8) +#define VPDSI_TX_QTABLE_PAGE_INDEX2_S 16 +#define VPDSI_TX_QTABLE_PAGE_INDEX2_M ICE_M(0x7F, 16) +#define VPDSI_TX_QTABLE_PAGE_INDEX3_S 24 +#define VPDSI_TX_QTABLE_PAGE_INDEX3_M ICE_M(0x7F, 24) +#define VPLAN_DB_QTABLE(_i, _VF) (0x00070000 + ((_i) * 2048 + (_VF) * 4)) /* _i=0...3, _VF=0...255 */ /* Reset Source: CORER */ +#define VPLAN_DB_QTABLE_MAX_INDEX 3 +#define VPLAN_DB_QTABLE_QINDEX_S 0 +#define VPLAN_DB_QTABLE_QINDEX_M ICE_M(0x1FF, 0) +#define VPLAN_DSI_VF_MODE(_VP16) (0x002D2C00 + ((_VP16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VPLAN_DSI_VF_MODE_MAX_INDEX 15 +#define VPLAN_DSI_VF_MODE_LAN_DSI_VF_MODE_S 0 +#define VPLAN_DSI_VF_MODE_LAN_DSI_VF_MODE_M BIT(0) +#define VPLAN_RX_QBASE(_VF) (0x00072000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPLAN_RX_QBASE_MAX_INDEX 255 #define VPLAN_RX_QBASE_VFFIRSTQ_S 0 #define VPLAN_RX_QBASE_VFFIRSTQ_M ICE_M(0x7FF, 0) #define VPLAN_RX_QBASE_VFNUMQ_S 16 #define VPLAN_RX_QBASE_VFNUMQ_M ICE_M(0xFF, 16) -#define VPLAN_RXQ_MAPENA(_VF) (0x00073000 + ((_VF) * 4)) +#define VPLAN_RX_QBASE_VFQTABLE_ENA_S 31 +#define VPLAN_RX_QBASE_VFQTABLE_ENA_M BIT(31) +#define VPLAN_RX_QTABLE(_i, _VF) (0x00060000 + ((_i) * 2048 + (_VF) * 4)) /* _i=0...15, _VF=0...255 */ /* Reset Source: CORER */ +#define VPLAN_RX_QTABLE_MAX_INDEX 15 +#define VPLAN_RX_QTABLE_QINDEX_S 0 +#define VPLAN_RX_QTABLE_QINDEX_M ICE_M(0xFFF, 0) +#define VPLAN_RXQ_MAPENA(_VF) (0x00073000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPLAN_RXQ_MAPENA_MAX_INDEX 255 +#define VPLAN_RXQ_MAPENA_RX_ENA_S 0 #define VPLAN_RXQ_MAPENA_RX_ENA_M BIT(0) -#define VPLAN_TX_QBASE(_VF) (0x001D1800 + ((_VF) * 4)) +#define VPLAN_TX_QBASE(_VF) (0x001D1800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPLAN_TX_QBASE_MAX_INDEX 255 #define VPLAN_TX_QBASE_VFFIRSTQ_S 0 #define VPLAN_TX_QBASE_VFFIRSTQ_M ICE_M(0x3FFF, 0) #define VPLAN_TX_QBASE_VFNUMQ_S 16 #define VPLAN_TX_QBASE_VFNUMQ_M ICE_M(0xFF, 16) -#define VPLAN_TXQ_MAPENA(_VF) 
(0x00073800 + ((_VF) * 4)) +#define VPLAN_TX_QBASE_VFQTABLE_ENA_S 31 +#define VPLAN_TX_QBASE_VFQTABLE_ENA_M BIT(31) +#define VPLAN_TX_QTABLE(_i, _VF) (0x001C0000 + ((_i) * 2048 + (_VF) * 4)) /* _i=0...15, _VF=0...255 */ /* Reset Source: CORER */ +#define VPLAN_TX_QTABLE_MAX_INDEX 15 +#define VPLAN_TX_QTABLE_QINDEX_S 0 +#define VPLAN_TX_QTABLE_QINDEX_M ICE_M(0x7FFF, 0) +#define VPLAN_TXQ_MAPENA(_VF) (0x00073800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPLAN_TXQ_MAPENA_MAX_INDEX 255 +#define VPLAN_TXQ_MAPENA_TX_ENA_S 0 #define VPLAN_TXQ_MAPENA_TX_ENA_M BIT(0) -#define GL_MDET_RX 0x00294C00 +#define VSILAN_QBASE(_VSI) (0x0044C000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: PFR */ +#define VSILAN_QBASE_MAX_INDEX 767 +#define VSILAN_QBASE_VSIBASE_S 0 +#define VSILAN_QBASE_VSIBASE_M ICE_M(0x7FF, 0) +#define VSILAN_QBASE_VSIQTABLE_ENA_S 11 +#define VSILAN_QBASE_VSIQTABLE_ENA_M BIT(11) +#define VSILAN_QTABLE(_i, _VSI) (0x00440000 + ((_i) * 4096 + (_VSI) * 4)) /* _i=0...7, _VSI=0...767 */ /* Reset Source: PFR */ +#define VSILAN_QTABLE_MAX_INDEX 7 +#define VSILAN_QTABLE_QINDEX_0_S 0 +#define VSILAN_QTABLE_QINDEX_0_M ICE_M(0x7FF, 0) +#define VSILAN_QTABLE_QINDEX_1_S 16 +#define VSILAN_QTABLE_QINDEX_1_M ICE_M(0x7FF, 16) +#define PRTMAC_HSEC_CTL_RX_ENABLE_GCP 0x001E31C0 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_RX_ENABLE_GCP_HSEC_CTL_RX_ENABLE_GCP_S 0 +#define PRTMAC_HSEC_CTL_RX_ENABLE_GCP_HSEC_CTL_RX_ENABLE_GCP_M BIT(0) +#define PRTMAC_HSEC_CTL_RX_ENABLE_GPP 0x001E34C0 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_RX_ENABLE_GPP_HSEC_CTL_RX_ENABLE_GPP_S 0 +#define PRTMAC_HSEC_CTL_RX_ENABLE_GPP_HSEC_CTL_RX_ENABLE_GPP_M BIT(0) +#define PRTMAC_HSEC_CTL_RX_ENABLE_PPP 0x001E35C0 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_RX_ENABLE_PPP_HSEC_CTL_RX_ENABLE_PPP_S 0 +#define PRTMAC_HSEC_CTL_RX_ENABLE_PPP_HSEC_CTL_RX_ENABLE_PPP_M BIT(0) +#define PRTMAC_HSEC_CTL_RX_FORWARD_CONTROL 0x001E36C0 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_RX_FORWARD_CONTROL_HSEC_CTL_RX_FORWARD_CONTROL_S 0 +#define PRTMAC_HSEC_CTL_RX_FORWARD_CONTROL_HSEC_CTL_RX_FORWARD_CONTROL_M BIT(0) +#define PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1 0x001E3220 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_S 0 +#define PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_M ICE_M(0xFFFFFFFF, 0) +#define PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2 0x001E3240 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_S 0 +#define PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_M ICE_M(0xFFFF, 0) +#define PRTMAC_HSEC_CTL_RX_PAUSE_ENABLE 0x001E3180 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_RX_PAUSE_ENABLE_HSEC_CTL_RX_PAUSE_ENABLE_S 0 +#define PRTMAC_HSEC_CTL_RX_PAUSE_ENABLE_HSEC_CTL_RX_PAUSE_ENABLE_M ICE_M(0x1FF, 0) +#define PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART1 0x001E3280 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART1_HSEC_CTL_RX_PAUSE_SA_PART1_S 0 +#define PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART1_HSEC_CTL_RX_PAUSE_SA_PART1_M ICE_M(0xFFFFFFFF, 0) +#define PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART2 0x001E32A0 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART2_HSEC_CTL_RX_PAUSE_SA_PART2_S 0 +#define PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART2_HSEC_CTL_RX_PAUSE_SA_PART2_M ICE_M(0xFFFF, 0) +#define PRTMAC_HSEC_CTL_RX_QUANTA_S 0x001E3C40 /* Reset Source: GLOBR */ +#define 
PRTMAC_HSEC_CTL_RX_QUANTA_SHIFT_PRTMAC_HSEC_CTL_RX_QUANTA_SHIFT_S 0 +#define PRTMAC_HSEC_CTL_RX_QUANTA_SHIFT_PRTMAC_HSEC_CTL_RX_QUANTA_SHIFT_M ICE_M(0xFFFF, 0) +#define PRTMAC_HSEC_CTL_TX_PAUSE_ENABLE 0x001E31A0 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_TX_PAUSE_ENABLE_HSEC_CTL_TX_PAUSE_ENABLE_S 0 +#define PRTMAC_HSEC_CTL_TX_PAUSE_ENABLE_HSEC_CTL_TX_PAUSE_ENABLE_M ICE_M(0x1FF, 0) +#define PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA(_i) (0x001E36E0 + ((_i) * 32)) /* _i=0...8 */ /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_MAX_INDEX 8 +#define PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_HSEC_CTL_TX_PAUSE_QUANTA_S 0 +#define PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_HSEC_CTL_TX_PAUSE_QUANTA_M ICE_M(0xFFFF, 0) +#define PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER(_i) (0x001E3800 + ((_i) * 32)) /* _i=0...8 */ /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_MAX_INDEX 8 +#define PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_S 0 +#define PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_M ICE_M(0xFFFF, 0) +#define PRTMAC_HSEC_CTL_TX_SA_PART1 0x001E3960 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_TX_SA_PART1_HSEC_CTL_TX_SA_PART1_S 0 +#define PRTMAC_HSEC_CTL_TX_SA_PART1_HSEC_CTL_TX_SA_PART1_M ICE_M(0xFFFFFFFF, 0) +#define PRTMAC_HSEC_CTL_TX_SA_PART2 0x001E3980 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_TX_SA_PART2_HSEC_CTL_TX_SA_PART2_S 0 +#define PRTMAC_HSEC_CTL_TX_SA_PART2_HSEC_CTL_TX_SA_PART2_M ICE_M(0xFFFF, 0) +#define PRTMAC_LINK_DOWN_COUNTER 0x001E47C0 /* Reset Source: GLOBR */ +#define PRTMAC_LINK_DOWN_COUNTER_LINK_DOWN_COUNTER_S 0 +#define PRTMAC_LINK_DOWN_COUNTER_LINK_DOWN_COUNTER_M ICE_M(0xFFFF, 0) +#define PRTMAC_MD_OVRRIDE_ENABLE(_i) (0x001E3C60 + ((_i) * 32)) /* _i=0...7 */ /* Reset Source: GLOBR */ +#define PRTMAC_MD_OVRRIDE_ENABLE_MAX_INDEX 7 +#define PRTMAC_MD_OVRRIDE_ENABLE_PRTMAC_MD_OVRRIDE_ENABLE_S 0 +#define PRTMAC_MD_OVRRIDE_ENABLE_PRTMAC_MD_OVRRIDE_ENABLE_M ICE_M(0xFFFFFFFF, 0) +#define PRTMAC_MD_OVRRIDE_VAL(_i) (0x001E3D60 + ((_i) * 32)) /* _i=0...7 */ /* Reset Source: GLOBR */ +#define PRTMAC_MD_OVRRIDE_VAL_MAX_INDEX 7 +#define PRTMAC_MD_OVRRIDE_VAL_PRTMAC_MD_OVRRIDE_ENABLE_S 0 +#define PRTMAC_MD_OVRRIDE_VAL_PRTMAC_MD_OVRRIDE_ENABLE_M ICE_M(0xFFFFFFFF, 0) +#define PRTMAC_RX_CNT_MRKR 0x001E48E0 /* Reset Source: GLOBR */ +#define PRTMAC_RX_CNT_MRKR_RX_CNT_MRKR_S 0 +#define PRTMAC_RX_CNT_MRKR_RX_CNT_MRKR_M ICE_M(0xFFFF, 0) +#define PRTMAC_RX_PKT_DRP_CNT 0x001E3C20 /* Reset Source: GLOBR */ +#define PRTMAC_RX_PKT_DRP_CNT_RX_PKT_DRP_CNT_S 0 +#define PRTMAC_RX_PKT_DRP_CNT_RX_PKT_DRP_CNT_M ICE_M(0xFFFF, 0) +#define PRTMAC_RX_PKT_DRP_CNT_RX_MKR_PKT_DRP_CNT_S 16 +#define PRTMAC_RX_PKT_DRP_CNT_RX_MKR_PKT_DRP_CNT_M ICE_M(0xFFFF, 16) +#define PRTMAC_TX_CNT_MRKR 0x001E48C0 /* Reset Source: GLOBR */ +#define PRTMAC_TX_CNT_MRKR_TX_CNT_MRKR_S 0 +#define PRTMAC_TX_CNT_MRKR_TX_CNT_MRKR_M ICE_M(0xFFFF, 0) +#define PRTMAC_TX_LNK_UP_CNT 0x001E4840 /* Reset Source: GLOBR */ +#define PRTMAC_TX_LNK_UP_CNT_TX_LINK_UP_CNT_S 0 +#define PRTMAC_TX_LNK_UP_CNT_TX_LINK_UP_CNT_M ICE_M(0xFFFF, 0) +#define GL_MDCK_CFG1_TX_PQM 0x002D2DF4 /* Reset Source: CORER */ +#define GL_MDCK_CFG1_TX_PQM_SSO_MAX_DATA_LEN_S 0 +#define GL_MDCK_CFG1_TX_PQM_SSO_MAX_DATA_LEN_M ICE_M(0xFF, 0) +#define GL_MDCK_CFG1_TX_PQM_SSO_MAX_PKT_CNT_S 8 +#define GL_MDCK_CFG1_TX_PQM_SSO_MAX_PKT_CNT_M ICE_M(0x3F, 8) +#define GL_MDCK_CFG1_TX_PQM_SSO_MAX_DESC_CNT_S 16 +#define GL_MDCK_CFG1_TX_PQM_SSO_MAX_DESC_CNT_M ICE_M(0x3F, 16) +#define GL_MDCK_EN_TX_PQM 0x002D2DFC /* Reset Source: CORER */ +#define 
GL_MDCK_EN_TX_PQM_PCI_DUMMY_COMP_S 0 +#define GL_MDCK_EN_TX_PQM_PCI_DUMMY_COMP_M BIT(0) +#define GL_MDCK_EN_TX_PQM_PCI_UR_COMP_S 1 +#define GL_MDCK_EN_TX_PQM_PCI_UR_COMP_M BIT(1) +#define GL_MDCK_EN_TX_PQM_RCV_SH_BE_LSO_S 3 +#define GL_MDCK_EN_TX_PQM_RCV_SH_BE_LSO_M BIT(3) +#define GL_MDCK_EN_TX_PQM_Q_FL_MNG_EPY_CH_S 4 +#define GL_MDCK_EN_TX_PQM_Q_FL_MNG_EPY_CH_M BIT(4) +#define GL_MDCK_EN_TX_PQM_Q_EPY_MNG_FL_CH_S 5 +#define GL_MDCK_EN_TX_PQM_Q_EPY_MNG_FL_CH_M BIT(5) +#define GL_MDCK_EN_TX_PQM_LSO_NUMDESCS_ZERO_S 6 +#define GL_MDCK_EN_TX_PQM_LSO_NUMDESCS_ZERO_M BIT(6) +#define GL_MDCK_EN_TX_PQM_LSO_LENGTH_ZERO_S 7 +#define GL_MDCK_EN_TX_PQM_LSO_LENGTH_ZERO_M BIT(7) +#define GL_MDCK_EN_TX_PQM_LSO_MSS_BELOW_MIN_S 8 +#define GL_MDCK_EN_TX_PQM_LSO_MSS_BELOW_MIN_M BIT(8) +#define GL_MDCK_EN_TX_PQM_LSO_MSS_ABOVE_MAX_S 9 +#define GL_MDCK_EN_TX_PQM_LSO_MSS_ABOVE_MAX_M BIT(9) +#define GL_MDCK_EN_TX_PQM_LSO_HDR_SIZE_ZERO_S 10 +#define GL_MDCK_EN_TX_PQM_LSO_HDR_SIZE_ZERO_M BIT(10) +#define GL_MDCK_EN_TX_PQM_RCV_CNT_BE_LSO_S 11 +#define GL_MDCK_EN_TX_PQM_RCV_CNT_BE_LSO_M BIT(11) +#define GL_MDCK_EN_TX_PQM_SKIP_ONE_QT_ONLY_S 12 +#define GL_MDCK_EN_TX_PQM_SKIP_ONE_QT_ONLY_M BIT(12) +#define GL_MDCK_EN_TX_PQM_LSO_PKTCNT_ZERO_S 13 +#define GL_MDCK_EN_TX_PQM_LSO_PKTCNT_ZERO_M BIT(13) +#define GL_MDCK_EN_TX_PQM_SSO_LENGTH_ZERO_S 14 +#define GL_MDCK_EN_TX_PQM_SSO_LENGTH_ZERO_M BIT(14) +#define GL_MDCK_EN_TX_PQM_SSO_LENGTH_EXCEED_S 15 +#define GL_MDCK_EN_TX_PQM_SSO_LENGTH_EXCEED_M BIT(15) +#define GL_MDCK_EN_TX_PQM_SSO_PKTCNT_ZERO_S 16 +#define GL_MDCK_EN_TX_PQM_SSO_PKTCNT_ZERO_M BIT(16) +#define GL_MDCK_EN_TX_PQM_SSO_PKTCNT_EXCEED_S 17 +#define GL_MDCK_EN_TX_PQM_SSO_PKTCNT_EXCEED_M BIT(17) +#define GL_MDCK_EN_TX_PQM_SSO_NUMDESCS_ZERO_S 18 +#define GL_MDCK_EN_TX_PQM_SSO_NUMDESCS_ZERO_M BIT(18) +#define GL_MDCK_EN_TX_PQM_SSO_NUMDESCS_EXCEED_S 19 +#define GL_MDCK_EN_TX_PQM_SSO_NUMDESCS_EXCEED_M BIT(19) +#define GL_MDCK_EN_TX_PQM_TAIL_GT_RING_LENGTH_S 20 +#define GL_MDCK_EN_TX_PQM_TAIL_GT_RING_LENGTH_M BIT(20) +#define GL_MDCK_EN_TX_PQM_RESERVED_DBL_TYPE_S 21 +#define GL_MDCK_EN_TX_PQM_RESERVED_DBL_TYPE_M BIT(21) +#define GL_MDCK_EN_TX_PQM_ILLEGAL_HEAD_DROP_DBL_S 22 +#define GL_MDCK_EN_TX_PQM_ILLEGAL_HEAD_DROP_DBL_M BIT(22) +#define GL_MDCK_EN_TX_PQM_LSO_OVER_COMMS_Q_S 23 +#define GL_MDCK_EN_TX_PQM_LSO_OVER_COMMS_Q_M BIT(23) +#define GL_MDCK_EN_TX_PQM_ILLEGAL_VF_QNUM_S 24 +#define GL_MDCK_EN_TX_PQM_ILLEGAL_VF_QNUM_M BIT(24) +#define GL_MDCK_EN_TX_PQM_QTAIL_GT_RING_LENGTH_S 25 +#define GL_MDCK_EN_TX_PQM_QTAIL_GT_RING_LENGTH_M BIT(25) +#define GL_MDCK_EN_TX_PQM_RSVD_S 26 +#define GL_MDCK_EN_TX_PQM_RSVD_M ICE_M(0x3F, 26) +#define GL_MDCK_RX 0x0029422C /* Reset Source: CORER */ +#define GL_MDCK_RX_DESC_ADDR_S 0 +#define GL_MDCK_RX_DESC_ADDR_M BIT(0) +#define GL_MDCK_TX_TDPU 0x00049348 /* Reset Source: CORER */ +#define GL_MDCK_TX_TDPU_TTL_ERR_ITR_DIS_S 0 +#define GL_MDCK_TX_TDPU_TTL_ERR_ITR_DIS_M BIT(0) +#define GL_MDCK_TX_TDPU_RCU_ANTISPOOF_ITR_DIS_S 1 +#define GL_MDCK_TX_TDPU_RCU_ANTISPOOF_ITR_DIS_M BIT(1) +#define GL_MDCK_TX_TDPU_PCIE_UR_ITR_DIS_S 2 +#define GL_MDCK_TX_TDPU_PCIE_UR_ITR_DIS_M BIT(2) +#define GL_MDCK_TX_TDPU_MAL_OFFSET_ITR_DIS_S 3 +#define GL_MDCK_TX_TDPU_MAL_OFFSET_ITR_DIS_M BIT(3) +#define GL_MDCK_TX_TDPU_MAL_CMD_ITR_DIS_S 4 +#define GL_MDCK_TX_TDPU_MAL_CMD_ITR_DIS_M BIT(4) +#define GL_MDCK_TX_TDPU_BIG_PKT_SIZE_ITR_DIS_S 5 +#define GL_MDCK_TX_TDPU_BIG_PKT_SIZE_ITR_DIS_M BIT(5) +#define GL_MDCK_TX_TDPU_L2_ACCEPT_FAIL_ITR_DIS_S 6 +#define GL_MDCK_TX_TDPU_L2_ACCEPT_FAIL_ITR_DIS_M BIT(6) +#define 
GL_MDCK_TX_TDPU_NIC_DSI_ITR_DIS_S 7
+#define GL_MDCK_TX_TDPU_NIC_DSI_ITR_DIS_M BIT(7)
+#define GL_MDCK_TX_TDPU_MAL_IPSEC_CMD_ITR_DIS_S 8
+#define GL_MDCK_TX_TDPU_MAL_IPSEC_CMD_ITR_DIS_M BIT(8)
+#define GL_MDCK_TX_TDPU_DSCP_CHECK_FAIL_ITR_DIS_S 9
+#define GL_MDCK_TX_TDPU_DSCP_CHECK_FAIL_ITR_DIS_M BIT(9)
+#define GL_MDCK_TX_TDPU_NIC_IPSEC_ITR_DIS_S 10
+#define GL_MDCK_TX_TDPU_NIC_IPSEC_ITR_DIS_M BIT(10)
+#define GL_MDET_RX 0x00294C00 /* Reset Source: CORER */
 #define GL_MDET_RX_QNUM_S 0
 #define GL_MDET_RX_QNUM_M ICE_M(0x7FFF, 0)
 #define GL_MDET_RX_VF_NUM_S 15
@@ -232,8 +5361,9 @@
 #define GL_MDET_RX_PF_NUM_M ICE_M(0x7, 23)
 #define GL_MDET_RX_MAL_TYPE_S 26
 #define GL_MDET_RX_MAL_TYPE_M ICE_M(0x1F, 26)
+#define GL_MDET_RX_VALID_S 31
 #define GL_MDET_RX_VALID_M BIT(31)
-#define GL_MDET_TX_PQM 0x002D2E00
+#define GL_MDET_TX_PQM 0x002D2E00 /* Reset Source: CORER */
 #define GL_MDET_TX_PQM_PF_NUM_S 0
 #define GL_MDET_TX_PQM_PF_NUM_M ICE_M(0x7, 0)
 #define GL_MDET_TX_PQM_VF_NUM_S 4
@@ -242,8 +5372,9 @@
 #define GL_MDET_TX_PQM_QNUM_M ICE_M(0x3FFF, 12)
 #define GL_MDET_TX_PQM_MAL_TYPE_S 26
 #define GL_MDET_TX_PQM_MAL_TYPE_M ICE_M(0x1F, 26)
+#define GL_MDET_TX_PQM_VALID_S 31
 #define GL_MDET_TX_PQM_VALID_M BIT(31)
-#define GL_MDET_TX_TCLAN 0x000FC068
+#define GL_MDET_TX_TCLAN 0x000FC068 /* Reset Source: CORER */
 #define GL_MDET_TX_TCLAN_QNUM_S 0
 #define GL_MDET_TX_TCLAN_QNUM_M ICE_M(0x7FFF, 0)
 #define GL_MDET_TX_TCLAN_VF_NUM_S 15
@@ -252,102 +5383,4071 @@
 #define GL_MDET_TX_TCLAN_PF_NUM_M ICE_M(0x7, 23)
 #define GL_MDET_TX_TCLAN_MAL_TYPE_S 26
 #define GL_MDET_TX_TCLAN_MAL_TYPE_M ICE_M(0x1F, 26)
+#define GL_MDET_TX_TCLAN_VALID_S 31
 #define GL_MDET_TX_TCLAN_VALID_M BIT(31)
-#define PF_MDET_RX 0x00294280
+#define GL_MDET_TX_TDPU 0x00049350 /* Reset Source: CORER */
+#define GL_MDET_TX_TDPU_QNUM_S 0
+#define GL_MDET_TX_TDPU_QNUM_M ICE_M(0x7FFF, 0)
+#define GL_MDET_TX_TDPU_VF_NUM_S 15
+#define GL_MDET_TX_TDPU_VF_NUM_M ICE_M(0xFF, 15)
+#define GL_MDET_TX_TDPU_PF_NUM_S 23
+#define GL_MDET_TX_TDPU_PF_NUM_M ICE_M(0x7, 23)
+#define GL_MDET_TX_TDPU_MAL_TYPE_S 26
+#define GL_MDET_TX_TDPU_MAL_TYPE_M ICE_M(0x1F, 26)
+#define GL_MDET_TX_TDPU_VALID_S 31
+#define GL_MDET_TX_TDPU_VALID_M BIT(31)
+#define GLRLAN_MDET 0x00294200 /* Reset Source: CORER */
+#define GLRLAN_MDET_PCKT_EXTRCT_ERR_S 0
+#define GLRLAN_MDET_PCKT_EXTRCT_ERR_M BIT(0)
+#define PF_MDET_RX 0x00294280 /* Reset Source: CORER */
+#define PF_MDET_RX_VALID_S 0
 #define PF_MDET_RX_VALID_M BIT(0)
-#define PF_MDET_TX_PQM 0x002D2C80
+#define PF_MDET_TX_PQM 0x002D2C80 /* Reset Source: CORER */
+#define PF_MDET_TX_PQM_VALID_S 0
 #define PF_MDET_TX_PQM_VALID_M BIT(0)
-#define PF_MDET_TX_TCLAN 0x000FC000
+#define PF_MDET_TX_TCLAN 0x000FC000 /* Reset Source: CORER */
+#define PF_MDET_TX_TCLAN_VALID_S 0
 #define PF_MDET_TX_TCLAN_VALID_M BIT(0)
-#define VP_MDET_RX(_VF) (0x00294400 + ((_VF) * 4))
+#define PF_MDET_TX_TDPU 0x00040800 /* Reset Source: CORER */
+#define PF_MDET_TX_TDPU_VALID_S 0
+#define PF_MDET_TX_TDPU_VALID_M BIT(0)
+#define VP_MDET_RX(_VF) (0x00294400 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */
+#define VP_MDET_RX_MAX_INDEX 255
+#define VP_MDET_RX_VALID_S 0
 #define VP_MDET_RX_VALID_M BIT(0)
-#define VP_MDET_TX_PQM(_VF) (0x002D2000 + ((_VF) * 4))
+#define VP_MDET_TX_PQM(_VF) (0x002D2000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */
+#define VP_MDET_TX_PQM_MAX_INDEX 255
+#define VP_MDET_TX_PQM_VALID_S 0
 #define VP_MDET_TX_PQM_VALID_M BIT(0)
-#define VP_MDET_TX_TCLAN(_VF) (0x000FB800 + ((_VF) * 4))
+#define VP_MDET_TX_TCLAN(_VF) (0x000FB800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */
+#define VP_MDET_TX_TCLAN_MAX_INDEX 255
+#define VP_MDET_TX_TCLAN_VALID_S 0
 #define VP_MDET_TX_TCLAN_VALID_M BIT(0)
-#define VP_MDET_TX_TDPU(_VF) (0x00040000 + ((_VF) * 4))
+#define VP_MDET_TX_TDPU(_VF) (0x00040000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */
+#define VP_MDET_TX_TDPU_MAX_INDEX 255
+#define VP_MDET_TX_TDPU_VALID_S 0
 #define VP_MDET_TX_TDPU_VALID_M BIT(0)
-#define GLNVM_FLA 0x000B6108
+#define GENERAL_MNG_FW_DBG_CSR(_i) (0x000B6180 + ((_i) * 4)) /* _i=0...9 */ /* Reset Source: POR */
+#define GENERAL_MNG_FW_DBG_CSR_MAX_INDEX 9
+#define GENERAL_MNG_FW_DBG_CSR_GENERAL_FW_DBG_S 0
+#define GENERAL_MNG_FW_DBG_CSR_GENERAL_FW_DBG_M ICE_M(0xFFFFFFFF, 0)
+#define GL_FWRESETCNT 0x00083100 /* Reset Source: POR */
+#define GL_FWRESETCNT_FWRESETCNT_S 0
+#define GL_FWRESETCNT_FWRESETCNT_M ICE_M(0xFFFFFFFF, 0)
+#define GL_MNG_FW_RAM_STAT 0x0008309C /* Reset Source: POR */
+#define GL_MNG_FW_RAM_STAT_FW_RAM_RST_STAT_S 0
+#define GL_MNG_FW_RAM_STAT_FW_RAM_RST_STAT_M BIT(0)
+#define GL_MNG_FW_RAM_STAT_MNG_MEM_ECC_ERR_S 1
+#define GL_MNG_FW_RAM_STAT_MNG_MEM_ECC_ERR_M BIT(1)
+#define GL_MNG_FWSM 0x000B6134 /* Reset Source: POR */
+#define GL_MNG_FWSM_FW_MODES_S 0
+#define GL_MNG_FWSM_FW_MODES_M ICE_M(0x7, 0)
+#define GL_MNG_FWSM_RSV0_S 3
+#define GL_MNG_FWSM_RSV0_M ICE_M(0x7F, 3)
+#define GL_MNG_FWSM_EEP_RELOAD_IND_S 10
+#define GL_MNG_FWSM_EEP_RELOAD_IND_M BIT(10)
+#define GL_MNG_FWSM_RSV1_S 11
+#define GL_MNG_FWSM_RSV1_M ICE_M(0xF, 11)
+#define GL_MNG_FWSM_RSV2_S 15
+#define GL_MNG_FWSM_RSV2_M BIT(15)
+#define GL_MNG_FWSM_PCIR_AL_FAILURE_S 16
+#define GL_MNG_FWSM_PCIR_AL_FAILURE_M BIT(16)
+#define GL_MNG_FWSM_POR_AL_FAILURE_S 17
+#define GL_MNG_FWSM_POR_AL_FAILURE_M BIT(17)
+#define GL_MNG_FWSM_RSV3_S 18
+#define GL_MNG_FWSM_RSV3_M BIT(18)
+#define GL_MNG_FWSM_EXT_ERR_IND_S 19
+#define GL_MNG_FWSM_EXT_ERR_IND_M ICE_M(0x3F, 19)
+#define GL_MNG_FWSM_RSV4_S 25
+#define GL_MNG_FWSM_RSV4_M BIT(25)
+#define GL_MNG_FWSM_RESERVED_11_S 26
+#define GL_MNG_FWSM_RESERVED_11_M ICE_M(0xF, 26)
+#define GL_MNG_FWSM_RSV5_S 30
+#define GL_MNG_FWSM_RSV5_M ICE_M(0x3, 30)
+#define GL_MNG_HWARB_CTRL 0x000B6130 /* Reset Source: POR */
+#define GL_MNG_HWARB_CTRL_NCSI_ARB_EN_S 0
+#define GL_MNG_HWARB_CTRL_NCSI_ARB_EN_M BIT(0)
+#define GL_MNG_SHA_EXTEND(_i) (0x00083120 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: EMPR */
+#define GL_MNG_SHA_EXTEND_MAX_INDEX 7
+#define GL_MNG_SHA_EXTEND_GL_MNG_SHA_EXTEND_S 0
+#define GL_MNG_SHA_EXTEND_GL_MNG_SHA_EXTEND_M ICE_M(0xFFFFFFFF, 0)
+#define GL_MNG_SHA_EXTEND_ROM(_i) (0x00083160 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: EMPR */
+#define GL_MNG_SHA_EXTEND_ROM_MAX_INDEX 7
+#define GL_MNG_SHA_EXTEND_ROM_GL_MNG_SHA_EXTEND_ROM_S 0
+#define GL_MNG_SHA_EXTEND_ROM_GL_MNG_SHA_EXTEND_ROM_M ICE_M(0xFFFFFFFF, 0)
+#define GL_MNG_SHA_EXTEND_STATUS 0x00083148 /* Reset Source: EMPR */
+#define GL_MNG_SHA_EXTEND_STATUS_STAGE_S 0
+#define GL_MNG_SHA_EXTEND_STATUS_STAGE_M ICE_M(0x7, 0)
+#define GL_MNG_SHA_EXTEND_STATUS_FW_HALTED_S 30
+#define GL_MNG_SHA_EXTEND_STATUS_FW_HALTED_M BIT(30)
+#define GL_MNG_SHA_EXTEND_STATUS_DONE_S 31
+#define GL_MNG_SHA_EXTEND_STATUS_DONE_M BIT(31)
+#define GL_SWT_PRT2MDEF(_i) (0x00216018 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: POR */
+#define GL_SWT_PRT2MDEF_MAX_INDEX 31
+#define GL_SWT_PRT2MDEF_MDEFIDX_S 0
+#define GL_SWT_PRT2MDEF_MDEFIDX_M ICE_M(0x7, 0)
+#define GL_SWT_PRT2MDEF_MDEFENA_S 31
+#define GL_SWT_PRT2MDEF_MDEFENA_M BIT(31)
+#define PRT_MNG_MANC 0x00214720 /* Reset Source: 
POR */ +#define PRT_MNG_MANC_FLOW_CONTROL_DISCARD_S 0 +#define PRT_MNG_MANC_FLOW_CONTROL_DISCARD_M BIT(0) +#define PRT_MNG_MANC_NCSI_DISCARD_S 1 +#define PRT_MNG_MANC_NCSI_DISCARD_M BIT(1) +#define PRT_MNG_MANC_RCV_TCO_EN_S 17 +#define PRT_MNG_MANC_RCV_TCO_EN_M BIT(17) +#define PRT_MNG_MANC_RCV_ALL_S 19 +#define PRT_MNG_MANC_RCV_ALL_M BIT(19) +#define PRT_MNG_MANC_FIXED_NET_TYPE_S 25 +#define PRT_MNG_MANC_FIXED_NET_TYPE_M BIT(25) +#define PRT_MNG_MANC_NET_TYPE_S 26 +#define PRT_MNG_MANC_NET_TYPE_M BIT(26) +#define PRT_MNG_MANC_EN_BMC2OS_S 28 +#define PRT_MNG_MANC_EN_BMC2OS_M BIT(28) +#define PRT_MNG_MANC_EN_BMC2NET_S 29 +#define PRT_MNG_MANC_EN_BMC2NET_M BIT(29) +#define PRT_MNG_MAVTV(_i) (0x00214780 + ((_i) * 32)) /* _i=0...7 */ /* Reset Source: POR */ +#define PRT_MNG_MAVTV_MAX_INDEX 7 +#define PRT_MNG_MAVTV_VID_S 0 +#define PRT_MNG_MAVTV_VID_M ICE_M(0xFFF, 0) +#define PRT_MNG_MDEF(_i) (0x00214880 + ((_i) * 32)) /* _i=0...7 */ /* Reset Source: POR */ +#define PRT_MNG_MDEF_MAX_INDEX 7 +#define PRT_MNG_MDEF_MAC_EXACT_AND_S 0 +#define PRT_MNG_MDEF_MAC_EXACT_AND_M ICE_M(0xF, 0) +#define PRT_MNG_MDEF_BROADCAST_AND_S 4 +#define PRT_MNG_MDEF_BROADCAST_AND_M BIT(4) +#define PRT_MNG_MDEF_VLAN_AND_S 5 +#define PRT_MNG_MDEF_VLAN_AND_M ICE_M(0xFF, 5) +#define PRT_MNG_MDEF_IPV4_ADDRESS_AND_S 13 +#define PRT_MNG_MDEF_IPV4_ADDRESS_AND_M ICE_M(0xF, 13) +#define PRT_MNG_MDEF_IPV6_ADDRESS_AND_S 17 +#define PRT_MNG_MDEF_IPV6_ADDRESS_AND_M ICE_M(0xF, 17) +#define PRT_MNG_MDEF_MAC_EXACT_OR_S 21 +#define PRT_MNG_MDEF_MAC_EXACT_OR_M ICE_M(0xF, 21) +#define PRT_MNG_MDEF_BROADCAST_OR_S 25 +#define PRT_MNG_MDEF_BROADCAST_OR_M BIT(25) +#define PRT_MNG_MDEF_MULTICAST_AND_S 26 +#define PRT_MNG_MDEF_MULTICAST_AND_M BIT(26) +#define PRT_MNG_MDEF_ARP_REQUEST_OR_S 27 +#define PRT_MNG_MDEF_ARP_REQUEST_OR_M BIT(27) +#define PRT_MNG_MDEF_ARP_RESPONSE_OR_S 28 +#define PRT_MNG_MDEF_ARP_RESPONSE_OR_M BIT(28) +#define PRT_MNG_MDEF_NEIGHBOR_DISCOVERY_134_OR_S 29 +#define PRT_MNG_MDEF_NEIGHBOR_DISCOVERY_134_OR_M BIT(29) +#define PRT_MNG_MDEF_PORT_0X298_OR_S 30 +#define PRT_MNG_MDEF_PORT_0X298_OR_M BIT(30) +#define PRT_MNG_MDEF_PORT_0X26F_OR_S 31 +#define PRT_MNG_MDEF_PORT_0X26F_OR_M BIT(31) +#define PRT_MNG_MDEF_EXT(_i) (0x00214A00 + ((_i) * 32)) /* _i=0...7 */ /* Reset Source: POR */ +#define PRT_MNG_MDEF_EXT_MAX_INDEX 7 +#define PRT_MNG_MDEF_EXT_L2_ETHERTYPE_AND_S 0 +#define PRT_MNG_MDEF_EXT_L2_ETHERTYPE_AND_M ICE_M(0xF, 0) +#define PRT_MNG_MDEF_EXT_L2_ETHERTYPE_OR_S 4 +#define PRT_MNG_MDEF_EXT_L2_ETHERTYPE_OR_M ICE_M(0xF, 4) +#define PRT_MNG_MDEF_EXT_FLEX_PORT_OR_S 8 +#define PRT_MNG_MDEF_EXT_FLEX_PORT_OR_M ICE_M(0xFFFF, 8) +#define PRT_MNG_MDEF_EXT_FLEX_TCO_S 24 +#define PRT_MNG_MDEF_EXT_FLEX_TCO_M BIT(24) +#define PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_135_OR_S 25 +#define PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_135_OR_M BIT(25) +#define PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_136_OR_S 26 +#define PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_136_OR_M BIT(26) +#define PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_137_OR_S 27 +#define PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_137_OR_M BIT(27) +#define PRT_MNG_MDEF_EXT_ICMP_OR_S 28 +#define PRT_MNG_MDEF_EXT_ICMP_OR_M BIT(28) +#define PRT_MNG_MDEF_EXT_MLD_S 29 +#define PRT_MNG_MDEF_EXT_MLD_M BIT(29) +#define PRT_MNG_MDEF_EXT_APPLY_TO_NETWORK_TRAFFIC_S 30 +#define PRT_MNG_MDEF_EXT_APPLY_TO_NETWORK_TRAFFIC_M BIT(30) +#define PRT_MNG_MDEF_EXT_APPLY_TO_HOST_TRAFFIC_S 31 +#define PRT_MNG_MDEF_EXT_APPLY_TO_HOST_TRAFFIC_M BIT(31) +#define PRT_MNG_MDEFVSI(_i) (0x00214980 + ((_i) * 32)) /* _i=0...3 */ /* Reset Source: POR 
*/ +#define PRT_MNG_MDEFVSI_MAX_INDEX 3 +#define PRT_MNG_MDEFVSI_MDEFVSI_2N_S 0 +#define PRT_MNG_MDEFVSI_MDEFVSI_2N_M ICE_M(0xFFFF, 0) +#define PRT_MNG_MDEFVSI_MDEFVSI_2NP1_S 16 +#define PRT_MNG_MDEFVSI_MDEFVSI_2NP1_M ICE_M(0xFFFF, 16) +#define PRT_MNG_METF(_i) (0x00214120 + ((_i) * 32)) /* _i=0...3 */ /* Reset Source: POR */ +#define PRT_MNG_METF_MAX_INDEX 3 +#define PRT_MNG_METF_ETYPE_S 0 +#define PRT_MNG_METF_ETYPE_M ICE_M(0xFFFF, 0) +#define PRT_MNG_METF_POLARITY_S 30 +#define PRT_MNG_METF_POLARITY_M BIT(30) +#define PRT_MNG_MFUTP(_i) (0x00214320 + ((_i) * 32)) /* _i=0...15 */ /* Reset Source: POR */ +#define PRT_MNG_MFUTP_MAX_INDEX 15 +#define PRT_MNG_MFUTP_MFUTP_N_S 0 +#define PRT_MNG_MFUTP_MFUTP_N_M ICE_M(0xFFFF, 0) +#define PRT_MNG_MFUTP_UDP_S 16 +#define PRT_MNG_MFUTP_UDP_M BIT(16) +#define PRT_MNG_MFUTP_TCP_S 17 +#define PRT_MNG_MFUTP_TCP_M BIT(17) +#define PRT_MNG_MFUTP_SOURCE_DESTINATION_S 18 +#define PRT_MNG_MFUTP_SOURCE_DESTINATION_M BIT(18) +#define PRT_MNG_MIPAF4(_i) (0x002141A0 + ((_i) * 32)) /* _i=0...3 */ /* Reset Source: POR */ +#define PRT_MNG_MIPAF4_MAX_INDEX 3 +#define PRT_MNG_MIPAF4_MIPAF_S 0 +#define PRT_MNG_MIPAF4_MIPAF_M ICE_M(0xFFFFFFFF, 0) +#define PRT_MNG_MIPAF6(_i) (0x00214520 + ((_i) * 32)) /* _i=0...15 */ /* Reset Source: POR */ +#define PRT_MNG_MIPAF6_MAX_INDEX 15 +#define PRT_MNG_MIPAF6_MIPAF_S 0 +#define PRT_MNG_MIPAF6_MIPAF_M ICE_M(0xFFFFFFFF, 0) +#define PRT_MNG_MMAH(_i) (0x00214220 + ((_i) * 32)) /* _i=0...3 */ /* Reset Source: POR */ +#define PRT_MNG_MMAH_MAX_INDEX 3 +#define PRT_MNG_MMAH_MMAH_S 0 +#define PRT_MNG_MMAH_MMAH_M ICE_M(0xFFFF, 0) +#define PRT_MNG_MMAL(_i) (0x002142A0 + ((_i) * 32)) /* _i=0...3 */ /* Reset Source: POR */ +#define PRT_MNG_MMAL_MAX_INDEX 3 +#define PRT_MNG_MMAL_MMAL_S 0 +#define PRT_MNG_MMAL_MMAL_M ICE_M(0xFFFFFFFF, 0) +#define PRT_MNG_MNGONLY 0x00214740 /* Reset Source: POR */ +#define PRT_MNG_MNGONLY_EXCLUSIVE_TO_MANAGEABILITY_S 0 +#define PRT_MNG_MNGONLY_EXCLUSIVE_TO_MANAGEABILITY_M ICE_M(0xFF, 0) +#define PRT_MNG_MSFM 0x00214760 /* Reset Source: POR */ +#define PRT_MNG_MSFM_PORT_26F_UDP_S 0 +#define PRT_MNG_MSFM_PORT_26F_UDP_M BIT(0) +#define PRT_MNG_MSFM_PORT_26F_TCP_S 1 +#define PRT_MNG_MSFM_PORT_26F_TCP_M BIT(1) +#define PRT_MNG_MSFM_PORT_298_UDP_S 2 +#define PRT_MNG_MSFM_PORT_298_UDP_M BIT(2) +#define PRT_MNG_MSFM_PORT_298_TCP_S 3 +#define PRT_MNG_MSFM_PORT_298_TCP_M BIT(3) +#define PRT_MNG_MSFM_IPV6_0_MASK_S 4 +#define PRT_MNG_MSFM_IPV6_0_MASK_M BIT(4) +#define PRT_MNG_MSFM_IPV6_1_MASK_S 5 +#define PRT_MNG_MSFM_IPV6_1_MASK_M BIT(5) +#define PRT_MNG_MSFM_IPV6_2_MASK_S 6 +#define PRT_MNG_MSFM_IPV6_2_MASK_M BIT(6) +#define PRT_MNG_MSFM_IPV6_3_MASK_S 7 +#define PRT_MNG_MSFM_IPV6_3_MASK_M BIT(7) +#define MSIX_PBA_PAGE(_i) (0x02E08000 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: FLR */ +#define MSIX_PBA_PAGE_MAX_INDEX 63 +#define MSIX_PBA_PAGE_PENBIT_S 0 +#define MSIX_PBA_PAGE_PENBIT_M ICE_M(0xFFFFFFFF, 0) +#define MSIX_PBA1(_i) (0x00008000 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: FLR */ +#define MSIX_PBA1_MAX_INDEX 63 +#define MSIX_PBA1_PENBIT_S 0 +#define MSIX_PBA1_PENBIT_M ICE_M(0xFFFFFFFF, 0) +#define MSIX_TADD_PAGE(_i) (0x02E00000 + ((_i) * 16)) /* _i=0...2047 */ /* Reset Source: FLR */ +#define MSIX_TADD_PAGE_MAX_INDEX 2047 +#define MSIX_TADD_PAGE_MSIXTADD10_S 0 +#define MSIX_TADD_PAGE_MSIXTADD10_M ICE_M(0x3, 0) +#define MSIX_TADD_PAGE_MSIXTADD_S 2 +#define MSIX_TADD_PAGE_MSIXTADD_M ICE_M(0x3FFFFFFF, 2) +#define MSIX_TADD1(_i) (0x00000000 + ((_i) * 16)) /* _i=0...2047 */ /* Reset Source: FLR */ +#define 
MSIX_TADD1_MAX_INDEX 2047 +#define MSIX_TADD1_MSIXTADD10_S 0 +#define MSIX_TADD1_MSIXTADD10_M ICE_M(0x3, 0) +#define MSIX_TADD1_MSIXTADD_S 2 +#define MSIX_TADD1_MSIXTADD_M ICE_M(0x3FFFFFFF, 2) +#define MSIX_TMSG(_i) (0x00000008 + ((_i) * 16)) /* _i=0...2047 */ /* Reset Source: FLR */ +#define MSIX_TMSG_MAX_INDEX 2047 +#define MSIX_TMSG_MSIXTMSG_S 0 +#define MSIX_TMSG_MSIXTMSG_M ICE_M(0xFFFFFFFF, 0) +#define MSIX_TMSG_PAGE(_i) (0x02E00008 + ((_i) * 16)) /* _i=0...2047 */ /* Reset Source: FLR */ +#define MSIX_TMSG_PAGE_MAX_INDEX 2047 +#define MSIX_TMSG_PAGE_MSIXTMSG_S 0 +#define MSIX_TMSG_PAGE_MSIXTMSG_M ICE_M(0xFFFFFFFF, 0) +#define MSIX_TUADD_PAGE(_i) (0x02E00004 + ((_i) * 16)) /* _i=0...2047 */ /* Reset Source: FLR */ +#define MSIX_TUADD_PAGE_MAX_INDEX 2047 +#define MSIX_TUADD_PAGE_MSIXTUADD_S 0 +#define MSIX_TUADD_PAGE_MSIXTUADD_M ICE_M(0xFFFFFFFF, 0) +#define MSIX_TUADD1(_i) (0x00000004 + ((_i) * 16)) /* _i=0...2047 */ /* Reset Source: FLR */ +#define MSIX_TUADD1_MAX_INDEX 2047 +#define MSIX_TUADD1_MSIXTUADD_S 0 +#define MSIX_TUADD1_MSIXTUADD_M ICE_M(0xFFFFFFFF, 0) +#define MSIX_TVCTRL_PAGE(_i) (0x02E0000C + ((_i) * 16)) /* _i=0...2047 */ /* Reset Source: FLR */ +#define MSIX_TVCTRL_PAGE_MAX_INDEX 2047 +#define MSIX_TVCTRL_PAGE_MASK_S 0 +#define MSIX_TVCTRL_PAGE_MASK_M BIT(0) +#define MSIX_TVCTRL1(_i) (0x0000000C + ((_i) * 16)) /* _i=0...2047 */ /* Reset Source: FLR */ +#define MSIX_TVCTRL1_MAX_INDEX 2047 +#define MSIX_TVCTRL1_MASK_S 0 +#define MSIX_TVCTRL1_MASK_M BIT(0) +#define GLNVM_AL_DONE_HLP 0x000824C4 /* Reset Source: POR */ +#define GLNVM_AL_DONE_HLP_HLP_CORER_S 0 +#define GLNVM_AL_DONE_HLP_HLP_CORER_M BIT(0) +#define GLNVM_AL_DONE_HLP_HLP_FULLR_S 1 +#define GLNVM_AL_DONE_HLP_HLP_FULLR_M BIT(1) +#define GLNVM_ALTIMERS 0x000B6140 /* Reset Source: POR */ +#define GLNVM_ALTIMERS_PCI_ALTIMER_S 0 +#define GLNVM_ALTIMERS_PCI_ALTIMER_M ICE_M(0xFFF, 0) +#define GLNVM_ALTIMERS_GEN_ALTIMER_S 12 +#define GLNVM_ALTIMERS_GEN_ALTIMER_M ICE_M(0xFFFFF, 12) +#define GLNVM_FLA 0x000B6108 /* Reset Source: POR */ +#define GLNVM_FLA_LOCKED_S 6 #define GLNVM_FLA_LOCKED_M BIT(6) -#define GLNVM_GENS 0x000B6100 +#define GLNVM_GENS 0x000B6100 /* Reset Source: POR */ +#define GLNVM_GENS_NVM_PRES_S 0 +#define GLNVM_GENS_NVM_PRES_M BIT(0) #define GLNVM_GENS_SR_SIZE_S 5 #define GLNVM_GENS_SR_SIZE_M ICE_M(0x7, 5) -#define GLNVM_ULD 0x000B6008 +#define GLNVM_GENS_BANK1VAL_S 8 +#define GLNVM_GENS_BANK1VAL_M BIT(8) +#define GLNVM_GENS_ALT_PRST_S 23 +#define GLNVM_GENS_ALT_PRST_M BIT(23) +#define GLNVM_GENS_FL_AUTO_RD_S 25 +#define GLNVM_GENS_FL_AUTO_RD_M BIT(25) +#define GLNVM_PROTCSR(_i) (0x000B6010 + ((_i) * 4)) /* _i=0...59 */ /* Reset Source: POR */ +#define GLNVM_PROTCSR_MAX_INDEX 59 +#define GLNVM_PROTCSR_ADDR_BLOCK_S 0 +#define GLNVM_PROTCSR_ADDR_BLOCK_M ICE_M(0xFFFFFF, 0) +#define GLNVM_ULD 0x000B6008 /* Reset Source: POR */ +#define GLNVM_ULD_PCIER_DONE_S 0 #define GLNVM_ULD_PCIER_DONE_M BIT(0) +#define GLNVM_ULD_PCIER_DONE_1_S 1 #define GLNVM_ULD_PCIER_DONE_1_M BIT(1) +#define GLNVM_ULD_CORER_DONE_S 3 #define GLNVM_ULD_CORER_DONE_M BIT(3) +#define GLNVM_ULD_GLOBR_DONE_S 4 #define GLNVM_ULD_GLOBR_DONE_M BIT(4) +#define GLNVM_ULD_POR_DONE_S 5 #define GLNVM_ULD_POR_DONE_M BIT(5) +#define GLNVM_ULD_POR_DONE_1_S 8 #define GLNVM_ULD_POR_DONE_1_M BIT(8) +#define GLNVM_ULD_PCIER_DONE_2_S 9 #define GLNVM_ULD_PCIER_DONE_2_M BIT(9) +#define GLNVM_ULD_PE_DONE_S 10 #define GLNVM_ULD_PE_DONE_M BIT(10) -#define GLPCI_CNF2 0x000BE004 +#define GLNVM_ULD_HLP_CORE_DONE_S 11 +#define GLNVM_ULD_HLP_CORE_DONE_M BIT(11) 
+#define GLNVM_ULD_HLP_FULL_DONE_S 12 +#define GLNVM_ULD_HLP_FULL_DONE_M BIT(12) +#define GLNVM_ULT 0x000B6154 /* Reset Source: POR */ +#define GLNVM_ULT_CONF_PCIR_AE_S 0 +#define GLNVM_ULT_CONF_PCIR_AE_M BIT(0) +#define GLNVM_ULT_CONF_PCIRTL_AE_S 1 +#define GLNVM_ULT_CONF_PCIRTL_AE_M BIT(1) +#define GLNVM_ULT_RESERVED_1_S 2 +#define GLNVM_ULT_RESERVED_1_M BIT(2) +#define GLNVM_ULT_CONF_CORE_AE_S 3 +#define GLNVM_ULT_CONF_CORE_AE_M BIT(3) +#define GLNVM_ULT_CONF_GLOBAL_AE_S 4 +#define GLNVM_ULT_CONF_GLOBAL_AE_M BIT(4) +#define GLNVM_ULT_CONF_POR_AE_S 5 +#define GLNVM_ULT_CONF_POR_AE_M BIT(5) +#define GLNVM_ULT_RESERVED_2_S 6 +#define GLNVM_ULT_RESERVED_2_M BIT(6) +#define GLNVM_ULT_RESERVED_3_S 7 +#define GLNVM_ULT_RESERVED_3_M BIT(7) +#define GLNVM_ULT_RESERVED_5_S 8 +#define GLNVM_ULT_RESERVED_5_M BIT(8) +#define GLNVM_ULT_CONF_PCIALT_AE_S 9 +#define GLNVM_ULT_CONF_PCIALT_AE_M BIT(9) +#define GLNVM_ULT_CONF_PE_AE_S 10 +#define GLNVM_ULT_CONF_PE_AE_M BIT(10) +#define GLNVM_ULT_RESERVED_4_S 11 +#define GLNVM_ULT_RESERVED_4_M ICE_M(0x1FFFFF, 11) +#define GL_COTF_MARKER_STATUS 0x00200200 /* Reset Source: CORER */ +#define GL_COTF_MARKER_STATUS_MRKR_BUSY_S 0 +#define GL_COTF_MARKER_STATUS_MRKR_BUSY_M ICE_M(0xFF, 0) +#define GL_COTF_MARKER_TRIG_RCU_PRS(_i) (0x002001D4 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GL_COTF_MARKER_TRIG_RCU_PRS_MAX_INDEX 7 +#define GL_COTF_MARKER_TRIG_RCU_PRS_SET_RST_S 0 +#define GL_COTF_MARKER_TRIG_RCU_PRS_SET_RST_M BIT(0) +#define GL_PRS_MARKER_ERROR 0x00200204 /* Reset Source: CORER */ +#define GL_PRS_MARKER_ERROR_XLR_CFG_ERR_S 0 +#define GL_PRS_MARKER_ERROR_XLR_CFG_ERR_M BIT(0) +#define GL_PRS_MARKER_ERROR_QH_CFG_ERR_S 1 +#define GL_PRS_MARKER_ERROR_QH_CFG_ERR_M BIT(1) +#define GL_PRS_MARKER_ERROR_COTF_CFG_ERR_S 2 +#define GL_PRS_MARKER_ERROR_COTF_CFG_ERR_M BIT(2) +#define GL_PRS_RX_PIPE_INIT0(_i) (0x0020000C + ((_i) * 4)) /* _i=0...6 */ /* Reset Source: CORER */ +#define GL_PRS_RX_PIPE_INIT0_MAX_INDEX 6 +#define GL_PRS_RX_PIPE_INIT0_GPCSR_INIT_S 0 +#define GL_PRS_RX_PIPE_INIT0_GPCSR_INIT_M ICE_M(0xFFFF, 0) +#define GL_PRS_RX_PIPE_INIT1 0x00200028 /* Reset Source: CORER */ +#define GL_PRS_RX_PIPE_INIT1_GPCSR_INIT_S 0 +#define GL_PRS_RX_PIPE_INIT1_GPCSR_INIT_M ICE_M(0xFFFF, 0) +#define GL_PRS_RX_PIPE_INIT2 0x0020002C /* Reset Source: CORER */ +#define GL_PRS_RX_PIPE_INIT2_GPCSR_INIT_S 0 +#define GL_PRS_RX_PIPE_INIT2_GPCSR_INIT_M ICE_M(0xFFFF, 0) +#define GL_PRS_RX_SIZE_CTRL 0x00200004 /* Reset Source: CORER */ +#define GL_PRS_RX_SIZE_CTRL_MIN_SIZE_S 0 +#define GL_PRS_RX_SIZE_CTRL_MIN_SIZE_M ICE_M(0x3FF, 0) +#define GL_PRS_RX_SIZE_CTRL_MIN_SIZE_EN_S 15 +#define GL_PRS_RX_SIZE_CTRL_MIN_SIZE_EN_M BIT(15) +#define GL_PRS_RX_SIZE_CTRL_MAX_SIZE_S 16 +#define GL_PRS_RX_SIZE_CTRL_MAX_SIZE_M ICE_M(0x3FF, 16) +#define GL_PRS_RX_SIZE_CTRL_MAX_SIZE_EN_S 31 +#define GL_PRS_RX_SIZE_CTRL_MAX_SIZE_EN_M BIT(31) +#define GL_PRS_TX_PIPE_INIT0(_i) (0x00202018 + ((_i) * 4)) /* _i=0...6 */ /* Reset Source: CORER */ +#define GL_PRS_TX_PIPE_INIT0_MAX_INDEX 6 +#define GL_PRS_TX_PIPE_INIT0_GPCSR_INIT_S 0 +#define GL_PRS_TX_PIPE_INIT0_GPCSR_INIT_M ICE_M(0xFFFF, 0) +#define GL_PRS_TX_PIPE_INIT1 0x00202034 /* Reset Source: CORER */ +#define GL_PRS_TX_PIPE_INIT1_GPCSR_INIT_S 0 +#define GL_PRS_TX_PIPE_INIT1_GPCSR_INIT_M ICE_M(0xFFFF, 0) +#define GL_PRS_TX_PIPE_INIT2 0x00202038 /* Reset Source: CORER */ +#define GL_PRS_TX_PIPE_INIT2_GPCSR_INIT_S 0 +#define GL_PRS_TX_PIPE_INIT2_GPCSR_INIT_M ICE_M(0xFFFF, 0) +#define GL_PRS_TX_SIZE_CTRL 0x00202014 /* Reset Source: CORER */ 
+#define GL_PRS_TX_SIZE_CTRL_MIN_SIZE_S 0 +#define GL_PRS_TX_SIZE_CTRL_MIN_SIZE_M ICE_M(0x3FF, 0) +#define GL_PRS_TX_SIZE_CTRL_MIN_SIZE_EN_S 15 +#define GL_PRS_TX_SIZE_CTRL_MIN_SIZE_EN_M BIT(15) +#define GL_PRS_TX_SIZE_CTRL_MAX_SIZE_S 16 +#define GL_PRS_TX_SIZE_CTRL_MAX_SIZE_M ICE_M(0x3FF, 16) +#define GL_PRS_TX_SIZE_CTRL_MAX_SIZE_EN_S 31 +#define GL_PRS_TX_SIZE_CTRL_MAX_SIZE_EN_M BIT(31) +#define GL_QH_MARKER_STATUS 0x002001FC /* Reset Source: CORER */ +#define GL_QH_MARKER_STATUS_MRKR_BUSY_S 0 +#define GL_QH_MARKER_STATUS_MRKR_BUSY_M ICE_M(0xF, 0) +#define GL_QH_MARKER_TRIG_RCU_PRS(_i) (0x002001C4 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */ +#define GL_QH_MARKER_TRIG_RCU_PRS_MAX_INDEX 3 +#define GL_QH_MARKER_TRIG_RCU_PRS_QPID_S 0 +#define GL_QH_MARKER_TRIG_RCU_PRS_QPID_M ICE_M(0x3FFFF, 0) +#define GL_QH_MARKER_TRIG_RCU_PRS_PE_TAG_S 18 +#define GL_QH_MARKER_TRIG_RCU_PRS_PE_TAG_M ICE_M(0xFF, 18) +#define GL_QH_MARKER_TRIG_RCU_PRS_PORT_NUM_S 26 +#define GL_QH_MARKER_TRIG_RCU_PRS_PORT_NUM_M ICE_M(0x7, 26) +#define GL_QH_MARKER_TRIG_RCU_PRS_SET_RST_S 31 +#define GL_QH_MARKER_TRIG_RCU_PRS_SET_RST_M BIT(31) +#define GL_RPRS_ANA_CSR_CTRL 0x00200708 /* Reset Source: CORER */ +#define GL_RPRS_ANA_CSR_CTRL_SELECT_EN_S 0 +#define GL_RPRS_ANA_CSR_CTRL_SELECT_EN_M BIT(0) +#define GL_RPRS_ANA_CSR_CTRL_SELECTED_ANA_S 1 +#define GL_RPRS_ANA_CSR_CTRL_SELECTED_ANA_M BIT(1) +#define GL_TPRS_ANA_CSR_CTRL 0x00202100 /* Reset Source: CORER */ +#define GL_TPRS_ANA_CSR_CTRL_SELECT_EN_S 0 +#define GL_TPRS_ANA_CSR_CTRL_SELECT_EN_M BIT(0) +#define GL_TPRS_ANA_CSR_CTRL_SELECTED_ANA_S 1 +#define GL_TPRS_ANA_CSR_CTRL_SELECTED_ANA_M BIT(1) +#define GL_TPRS_MNG_PM_THR 0x00202004 /* Reset Source: CORER */ +#define GL_TPRS_MNG_PM_THR_MNG_PM_THR_S 0 +#define GL_TPRS_MNG_PM_THR_MNG_PM_THR_M ICE_M(0x3FFF, 0) +#define GL_TPRS_PM_CNT(_i) (0x00202008 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GL_TPRS_PM_CNT_MAX_INDEX 1 +#define GL_TPRS_PM_CNT_GL_PRS_PM_CNT_S 0 +#define GL_TPRS_PM_CNT_GL_PRS_PM_CNT_M ICE_M(0x3FFF, 0) +#define GL_TPRS_PM_THR 0x00202000 /* Reset Source: CORER */ +#define GL_TPRS_PM_THR_PM_THR_S 0 +#define GL_TPRS_PM_THR_PM_THR_M ICE_M(0x3FFF, 0) +#define GL_XLR_MARKER_LOG_RCU_PRS(_i) (0x00200208 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GL_XLR_MARKER_LOG_RCU_PRS_MAX_INDEX 63 +#define GL_XLR_MARKER_LOG_RCU_PRS_XLR_TRIG_S 0 +#define GL_XLR_MARKER_LOG_RCU_PRS_XLR_TRIG_M ICE_M(0xFFFFFFFF, 0) +#define GL_XLR_MARKER_STATUS(_i) (0x002001F4 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GL_XLR_MARKER_STATUS_MAX_INDEX 1 +#define GL_XLR_MARKER_STATUS_MRKR_BUSY_S 0 +#define GL_XLR_MARKER_STATUS_MRKR_BUSY_M ICE_M(0xFFFFFFFF, 0) +#define GL_XLR_MARKER_TRIG_PE 0x005008C0 /* Reset Source: CORER */ +#define GL_XLR_MARKER_TRIG_PE_VM_VF_NUM_S 0 +#define GL_XLR_MARKER_TRIG_PE_VM_VF_NUM_M ICE_M(0x3FF, 0) +#define GL_XLR_MARKER_TRIG_PE_VM_VF_TYPE_S 10 +#define GL_XLR_MARKER_TRIG_PE_VM_VF_TYPE_M ICE_M(0x3, 10) +#define GL_XLR_MARKER_TRIG_PE_PF_NUM_S 12 +#define GL_XLR_MARKER_TRIG_PE_PF_NUM_M ICE_M(0x7, 12) +#define GL_XLR_MARKER_TRIG_PE_PORT_NUM_S 16 +#define GL_XLR_MARKER_TRIG_PE_PORT_NUM_M ICE_M(0x7, 16) +#define GL_XLR_MARKER_TRIG_RCU_PRS 0x002001C0 /* Reset Source: CORER */ +#define GL_XLR_MARKER_TRIG_RCU_PRS_VM_VF_NUM_S 0 +#define GL_XLR_MARKER_TRIG_RCU_PRS_VM_VF_NUM_M ICE_M(0x3FF, 0) +#define GL_XLR_MARKER_TRIG_RCU_PRS_VM_VF_TYPE_S 10 +#define GL_XLR_MARKER_TRIG_RCU_PRS_VM_VF_TYPE_M ICE_M(0x3, 10) +#define GL_XLR_MARKER_TRIG_RCU_PRS_PF_NUM_S 12 +#define 
GL_XLR_MARKER_TRIG_RCU_PRS_PF_NUM_M ICE_M(0x7, 12) +#define GL_XLR_MARKER_TRIG_RCU_PRS_PORT_NUM_S 16 +#define GL_XLR_MARKER_TRIG_RCU_PRS_PORT_NUM_M ICE_M(0x7, 16) +#define GL_CLKGATE_EVENTS 0x0009DE70 /* Reset Source: PERST */ +#define GL_CLKGATE_EVENTS_PRIMARY_CLKGATE_EVENTS_S 0 +#define GL_CLKGATE_EVENTS_PRIMARY_CLKGATE_EVENTS_M ICE_M(0xFFFF, 0) +#define GL_CLKGATE_EVENTS_SIDEBAND_CLKGATE_EVENTS_S 16 +#define GL_CLKGATE_EVENTS_SIDEBAND_CLKGATE_EVENTS_M ICE_M(0xFFFF, 16) +#define GLPCI_BYTCTH_NP_C 0x000BFDA8 /* Reset Source: PCIR */ +#define GLPCI_BYTCTH_NP_C_PCI_COUNT_BW_BCT_S 0 +#define GLPCI_BYTCTH_NP_C_PCI_COUNT_BW_BCT_M ICE_M(0xFFFFFFFF, 0) +#define GLPCI_BYTCTH_P 0x0009E970 /* Reset Source: PCIR */ +#define GLPCI_BYTCTH_P_PCI_COUNT_BW_BCT_S 0 +#define GLPCI_BYTCTH_P_PCI_COUNT_BW_BCT_M ICE_M(0xFFFFFFFF, 0) +#define GLPCI_BYTCTL_NP_C 0x000BFDAC /* Reset Source: PCIR */ +#define GLPCI_BYTCTL_NP_C_PCI_COUNT_BW_BCT_S 0 +#define GLPCI_BYTCTL_NP_C_PCI_COUNT_BW_BCT_M ICE_M(0xFFFFFFFF, 0) +#define GLPCI_BYTCTL_P 0x0009E994 /* Reset Source: PCIR */ +#define GLPCI_BYTCTL_P_PCI_COUNT_BW_BCT_S 0 +#define GLPCI_BYTCTL_P_PCI_COUNT_BW_BCT_M ICE_M(0xFFFFFFFF, 0) +#define GLPCI_CAPCTRL 0x0009DE88 /* Reset Source: PCIR */ +#define GLPCI_CAPCTRL_VPD_EN_S 0 +#define GLPCI_CAPCTRL_VPD_EN_M BIT(0) +#define GLPCI_CAPSUP 0x0009DE8C /* Reset Source: PCIR */ +#define GLPCI_CAPSUP_PCIE_VER_S 0 +#define GLPCI_CAPSUP_PCIE_VER_M BIT(0) +#define GLPCI_CAPSUP_RESERVED_2_S 1 +#define GLPCI_CAPSUP_RESERVED_2_M BIT(1) +#define GLPCI_CAPSUP_LTR_EN_S 2 +#define GLPCI_CAPSUP_LTR_EN_M BIT(2) +#define GLPCI_CAPSUP_TPH_EN_S 3 +#define GLPCI_CAPSUP_TPH_EN_M BIT(3) +#define GLPCI_CAPSUP_ARI_EN_S 4 +#define GLPCI_CAPSUP_ARI_EN_M BIT(4) +#define GLPCI_CAPSUP_IOV_EN_S 5 +#define GLPCI_CAPSUP_IOV_EN_M BIT(5) +#define GLPCI_CAPSUP_ACS_EN_S 6 +#define GLPCI_CAPSUP_ACS_EN_M BIT(6) +#define GLPCI_CAPSUP_SEC_EN_S 7 +#define GLPCI_CAPSUP_SEC_EN_M BIT(7) +#define GLPCI_CAPSUP_PASID_EN_S 8 +#define GLPCI_CAPSUP_PASID_EN_M BIT(8) +#define GLPCI_CAPSUP_DLFE_EN_S 9 +#define GLPCI_CAPSUP_DLFE_EN_M BIT(9) +#define GLPCI_CAPSUP_GEN4_EXT_EN_S 10 +#define GLPCI_CAPSUP_GEN4_EXT_EN_M BIT(10) +#define GLPCI_CAPSUP_GEN4_MARG_EN_S 11 +#define GLPCI_CAPSUP_GEN4_MARG_EN_M BIT(11) +#define GLPCI_CAPSUP_ECRC_GEN_EN_S 16 +#define GLPCI_CAPSUP_ECRC_GEN_EN_M BIT(16) +#define GLPCI_CAPSUP_ECRC_CHK_EN_S 17 +#define GLPCI_CAPSUP_ECRC_CHK_EN_M BIT(17) +#define GLPCI_CAPSUP_IDO_EN_S 18 +#define GLPCI_CAPSUP_IDO_EN_M BIT(18) +#define GLPCI_CAPSUP_MSI_MASK_S 19 +#define GLPCI_CAPSUP_MSI_MASK_M BIT(19) +#define GLPCI_CAPSUP_CSR_CONF_EN_S 20 +#define GLPCI_CAPSUP_CSR_CONF_EN_M BIT(20) +#define GLPCI_CAPSUP_WAKUP_EN_S 21 +#define GLPCI_CAPSUP_WAKUP_EN_M BIT(21) +#define GLPCI_CAPSUP_LOAD_SUBSYS_ID_S 30 +#define GLPCI_CAPSUP_LOAD_SUBSYS_ID_M BIT(30) +#define GLPCI_CAPSUP_LOAD_DEV_ID_S 31 +#define GLPCI_CAPSUP_LOAD_DEV_ID_M BIT(31) +#define GLPCI_CNF 0x0009DEA0 /* Reset Source: POR */ +#define GLPCI_CNF_FLEX10_S 1 +#define GLPCI_CNF_FLEX10_M BIT(1) +#define GLPCI_CNF_WAKE_PIN_EN_S 2 +#define GLPCI_CNF_WAKE_PIN_EN_M BIT(2) +#define GLPCI_CNF_MSIX_ECC_BLOCK_DISABLE_S 3 +#define GLPCI_CNF_MSIX_ECC_BLOCK_DISABLE_M BIT(3) +#define GLPCI_CNF2 0x000BE004 /* Reset Source: PCIR */ +#define GLPCI_CNF2_RO_DIS_S 0 +#define GLPCI_CNF2_RO_DIS_M BIT(0) +#define GLPCI_CNF2_CACHELINE_SIZE_S 1 #define GLPCI_CNF2_CACHELINE_SIZE_M BIT(1) -#define PF_FUNC_RID 0x0009E880 -#define PF_FUNC_RID_FUNC_NUM_S 0 -#define PF_FUNC_RID_FUNC_NUM_M ICE_M(0x7, 0) -#define PF_PCI_CIAA 0x0009E580 +#define 
GLPCI_DREVID 0x0009E9AC /* Reset Source: PCIR */ +#define GLPCI_DREVID_DEFAULT_REVID_S 0 +#define GLPCI_DREVID_DEFAULT_REVID_M ICE_M(0xFF, 0) +#define GLPCI_GSCL_1_NP_C 0x000BFDA4 /* Reset Source: PCIR */ +#define GLPCI_GSCL_1_NP_C_RT_MODE_S 8 +#define GLPCI_GSCL_1_NP_C_RT_MODE_M BIT(8) +#define GLPCI_GSCL_1_NP_C_RT_EVENT_S 9 +#define GLPCI_GSCL_1_NP_C_RT_EVENT_M ICE_M(0x1F, 9) +#define GLPCI_GSCL_1_NP_C_PCI_COUNT_BW_EN_S 14 +#define GLPCI_GSCL_1_NP_C_PCI_COUNT_BW_EN_M BIT(14) +#define GLPCI_GSCL_1_NP_C_PCI_COUNT_BW_EV_S 15 +#define GLPCI_GSCL_1_NP_C_PCI_COUNT_BW_EV_M ICE_M(0x1F, 15) +#define GLPCI_GSCL_1_NP_C_GIO_COUNT_RESET_S 29 +#define GLPCI_GSCL_1_NP_C_GIO_COUNT_RESET_M BIT(29) +#define GLPCI_GSCL_1_NP_C_GIO_COUNT_STOP_S 30 +#define GLPCI_GSCL_1_NP_C_GIO_COUNT_STOP_M BIT(30) +#define GLPCI_GSCL_1_NP_C_GIO_COUNT_START_S 31 +#define GLPCI_GSCL_1_NP_C_GIO_COUNT_START_M BIT(31) +#define GLPCI_GSCL_1_P 0x0009E9B4 /* Reset Source: PCIR */ +#define GLPCI_GSCL_1_P_GIO_COUNT_EN_0_S 0 +#define GLPCI_GSCL_1_P_GIO_COUNT_EN_0_M BIT(0) +#define GLPCI_GSCL_1_P_GIO_COUNT_EN_1_S 1 +#define GLPCI_GSCL_1_P_GIO_COUNT_EN_1_M BIT(1) +#define GLPCI_GSCL_1_P_GIO_COUNT_EN_2_S 2 +#define GLPCI_GSCL_1_P_GIO_COUNT_EN_2_M BIT(2) +#define GLPCI_GSCL_1_P_GIO_COUNT_EN_3_S 3 +#define GLPCI_GSCL_1_P_GIO_COUNT_EN_3_M BIT(3) +#define GLPCI_GSCL_1_P_LBC_ENABLE_0_S 4 +#define GLPCI_GSCL_1_P_LBC_ENABLE_0_M BIT(4) +#define GLPCI_GSCL_1_P_LBC_ENABLE_1_S 5 +#define GLPCI_GSCL_1_P_LBC_ENABLE_1_M BIT(5) +#define GLPCI_GSCL_1_P_LBC_ENABLE_2_S 6 +#define GLPCI_GSCL_1_P_LBC_ENABLE_2_M BIT(6) +#define GLPCI_GSCL_1_P_LBC_ENABLE_3_S 7 +#define GLPCI_GSCL_1_P_LBC_ENABLE_3_M BIT(7) +#define GLPCI_GSCL_1_P_PCI_COUNT_BW_EN_S 14 +#define GLPCI_GSCL_1_P_PCI_COUNT_BW_EN_M BIT(14) +#define GLPCI_GSCL_1_P_GIO_64_BIT_EN_S 28 +#define GLPCI_GSCL_1_P_GIO_64_BIT_EN_M BIT(28) +#define GLPCI_GSCL_1_P_GIO_COUNT_RESET_S 29 +#define GLPCI_GSCL_1_P_GIO_COUNT_RESET_M BIT(29) +#define GLPCI_GSCL_1_P_GIO_COUNT_STOP_S 30 +#define GLPCI_GSCL_1_P_GIO_COUNT_STOP_M BIT(30) +#define GLPCI_GSCL_1_P_GIO_COUNT_START_S 31 +#define GLPCI_GSCL_1_P_GIO_COUNT_START_M BIT(31) +#define GLPCI_GSCL_2 0x0009E998 /* Reset Source: PCIR */ +#define GLPCI_GSCL_2_GIO_EVENT_NUM_0_S 0 +#define GLPCI_GSCL_2_GIO_EVENT_NUM_0_M ICE_M(0xFF, 0) +#define GLPCI_GSCL_2_GIO_EVENT_NUM_1_S 8 +#define GLPCI_GSCL_2_GIO_EVENT_NUM_1_M ICE_M(0xFF, 8) +#define GLPCI_GSCL_2_GIO_EVENT_NUM_2_S 16 +#define GLPCI_GSCL_2_GIO_EVENT_NUM_2_M ICE_M(0xFF, 16) +#define GLPCI_GSCL_2_GIO_EVENT_NUM_3_S 24 +#define GLPCI_GSCL_2_GIO_EVENT_NUM_3_M ICE_M(0xFF, 24) +#define GLPCI_GSCL_5_8(_i) (0x0009E954 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: PCIR */ +#define GLPCI_GSCL_5_8_MAX_INDEX 3 +#define GLPCI_GSCL_5_8_LBC_THRESHOLD_N_S 0 +#define GLPCI_GSCL_5_8_LBC_THRESHOLD_N_M ICE_M(0xFFFF, 0) +#define GLPCI_GSCL_5_8_LBC_TIMER_N_S 16 +#define GLPCI_GSCL_5_8_LBC_TIMER_N_M ICE_M(0xFFFF, 16) +#define GLPCI_GSCN_0_3(_i) (0x0009E99C + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: PCIR */ +#define GLPCI_GSCN_0_3_MAX_INDEX 3 +#define GLPCI_GSCN_0_3_EVENT_COUNTER_S 0 +#define GLPCI_GSCN_0_3_EVENT_COUNTER_M ICE_M(0xFFFFFFFF, 0) +#define GLPCI_LATCT_NP_C 0x000BFDA0 /* Reset Source: PCIR */ +#define GLPCI_LATCT_NP_C_PCI_LATENCY_COUNT_S 0 +#define GLPCI_LATCT_NP_C_PCI_LATENCY_COUNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPCI_LBARCTRL 0x0009DE74 /* Reset Source: POR */ +#define GLPCI_LBARCTRL_PREFBAR_S 0 +#define GLPCI_LBARCTRL_PREFBAR_M BIT(0) +#define GLPCI_LBARCTRL_BAR32_S 1 +#define GLPCI_LBARCTRL_BAR32_M BIT(1) +#define 
GLPCI_LBARCTRL_PAGES_SPACE_EN_PF_S 2 +#define GLPCI_LBARCTRL_PAGES_SPACE_EN_PF_M BIT(2) +#define GLPCI_LBARCTRL_FLASH_EXPOSE_S 3 +#define GLPCI_LBARCTRL_FLASH_EXPOSE_M BIT(3) +#define GLPCI_LBARCTRL_PE_DB_SIZE_S 4 +#define GLPCI_LBARCTRL_PE_DB_SIZE_M ICE_M(0x3, 4) +#define GLPCI_LBARCTRL_PAGES_SPACE_EN_VF_S 9 +#define GLPCI_LBARCTRL_PAGES_SPACE_EN_VF_M BIT(9) +#define GLPCI_LBARCTRL_EXROM_SIZE_S 11 +#define GLPCI_LBARCTRL_EXROM_SIZE_M ICE_M(0x7, 11) +#define GLPCI_LBARCTRL_VF_PE_DB_SIZE_S 14 +#define GLPCI_LBARCTRL_VF_PE_DB_SIZE_M ICE_M(0x3, 14) +#define GLPCI_LINKCAP 0x0009DE90 /* Reset Source: PCIR */ +#define GLPCI_LINKCAP_LINK_SPEEDS_VECTOR_S 0 +#define GLPCI_LINKCAP_LINK_SPEEDS_VECTOR_M ICE_M(0x3F, 0) +#define GLPCI_LINKCAP_MAX_LINK_WIDTH_S 9 +#define GLPCI_LINKCAP_MAX_LINK_WIDTH_M ICE_M(0xF, 9) +#define GLPCI_NPQ_CFG 0x000BFD80 /* Reset Source: PCIR */ +#define GLPCI_NPQ_CFG_EXTEND_TO_S 0 +#define GLPCI_NPQ_CFG_EXTEND_TO_M BIT(0) +#define GLPCI_NPQ_CFG_SMALL_TO_S 1 +#define GLPCI_NPQ_CFG_SMALL_TO_M BIT(1) +#define GLPCI_NPQ_CFG_WEIGHT_AVG_S 2 +#define GLPCI_NPQ_CFG_WEIGHT_AVG_M ICE_M(0xF, 2) +#define GLPCI_NPQ_CFG_NPQ_SPARE_S 6 +#define GLPCI_NPQ_CFG_NPQ_SPARE_M ICE_M(0x3FF, 6) +#define GLPCI_NPQ_CFG_NPQ_ERR_STAT_S 16 +#define GLPCI_NPQ_CFG_NPQ_ERR_STAT_M ICE_M(0xF, 16) +#define GLPCI_PKTCT_NP_C 0x000BFD9C /* Reset Source: PCIR */ +#define GLPCI_PKTCT_NP_C_PCI_COUNT_BW_PCT_S 0 +#define GLPCI_PKTCT_NP_C_PCI_COUNT_BW_PCT_M ICE_M(0xFFFFFFFF, 0) +#define GLPCI_PKTCT_P 0x0009E9B0 /* Reset Source: PCIR */ +#define GLPCI_PKTCT_P_PCI_COUNT_BW_PCT_S 0 +#define GLPCI_PKTCT_P_PCI_COUNT_BW_PCT_M ICE_M(0xFFFFFFFF, 0) +#define GLPCI_PMSUP 0x0009DE94 /* Reset Source: PCIR */ +#define GLPCI_PMSUP_RESERVED_0_S 0 +#define GLPCI_PMSUP_RESERVED_0_M ICE_M(0x3, 0) +#define GLPCI_PMSUP_RESERVED_1_S 2 +#define GLPCI_PMSUP_RESERVED_1_M ICE_M(0x7, 2) +#define GLPCI_PMSUP_RESERVED_2_S 5 +#define GLPCI_PMSUP_RESERVED_2_M ICE_M(0x7, 5) +#define GLPCI_PMSUP_L0S_ACC_LAT_S 8 +#define GLPCI_PMSUP_L0S_ACC_LAT_M ICE_M(0x7, 8) +#define GLPCI_PMSUP_L1_ACC_LAT_S 11 +#define GLPCI_PMSUP_L1_ACC_LAT_M ICE_M(0x7, 11) +#define GLPCI_PMSUP_RESERVED_3_S 14 +#define GLPCI_PMSUP_RESERVED_3_M BIT(14) +#define GLPCI_PMSUP_OBFF_SUP_S 15 +#define GLPCI_PMSUP_OBFF_SUP_M ICE_M(0x3, 15) +#define GLPCI_PUSH_PE_IF_TO_STATUS 0x0009DF44 /* Reset Source: PCIR */ +#define GLPCI_PUSH_PE_IF_TO_STATUS_GLPCI_PUSH_PE_IF_TO_STATUS_S 0 +#define GLPCI_PUSH_PE_IF_TO_STATUS_GLPCI_PUSH_PE_IF_TO_STATUS_M BIT(0) +#define GLPCI_PWRDATA 0x0009DE7C /* Reset Source: PCIR */ +#define GLPCI_PWRDATA_D0_POWER_S 0 +#define GLPCI_PWRDATA_D0_POWER_M ICE_M(0xFF, 0) +#define GLPCI_PWRDATA_COMM_POWER_S 8 +#define GLPCI_PWRDATA_COMM_POWER_M ICE_M(0xFF, 8) +#define GLPCI_PWRDATA_D3_POWER_S 16 +#define GLPCI_PWRDATA_D3_POWER_M ICE_M(0xFF, 16) +#define GLPCI_PWRDATA_DATA_SCALE_S 24 +#define GLPCI_PWRDATA_DATA_SCALE_M ICE_M(0x3, 24) +#define GLPCI_REVID 0x0009DE98 /* Reset Source: PCIR */ +#define GLPCI_REVID_NVM_REVID_S 0 +#define GLPCI_REVID_NVM_REVID_M ICE_M(0xFF, 0) +#define GLPCI_SERH 0x0009DE84 /* Reset Source: PCIR */ +#define GLPCI_SERH_SER_NUM_H_S 0 +#define GLPCI_SERH_SER_NUM_H_M ICE_M(0xFFFF, 0) +#define GLPCI_SERL 0x0009DE80 /* Reset Source: PCIR */ +#define GLPCI_SERL_SER_NUM_L_S 0 +#define GLPCI_SERL_SER_NUM_L_M ICE_M(0xFFFFFFFF, 0) +#define GLPCI_SUBVENID 0x0009DEE8 /* Reset Source: PCIR */ +#define GLPCI_SUBVENID_SUB_VEN_ID_S 0 +#define GLPCI_SUBVENID_SUB_VEN_ID_M ICE_M(0xFFFF, 0) +#define GLPCI_UPADD 0x000BE0D4 /* Reset Source: PCIR */ +#define 
GLPCI_UPADD_ADDRESS_S 1 +#define GLPCI_UPADD_ADDRESS_M ICE_M(0x7FFFFFFF, 1) +#define GLPCI_VENDORID 0x0009DEC8 /* Reset Source: PCIR */ +#define GLPCI_VENDORID_VENDORID_S 0 +#define GLPCI_VENDORID_VENDORID_M ICE_M(0xFFFF, 0) +#define GLPCI_VFSUP 0x0009DE9C /* Reset Source: PCIR */ +#define GLPCI_VFSUP_VF_PREFETCH_S 0 +#define GLPCI_VFSUP_VF_PREFETCH_M BIT(0) +#define GLPCI_VFSUP_VR_BAR_TYPE_S 1 +#define GLPCI_VFSUP_VR_BAR_TYPE_M BIT(1) +#define GLPCI_WATMK_CLNT_PIPEMON 0x000BFD90 /* Reset Source: PCIR */ +#define GLPCI_WATMK_CLNT_PIPEMON_DATA_LINES_S 0 +#define GLPCI_WATMK_CLNT_PIPEMON_DATA_LINES_M ICE_M(0xFFFF, 0) +#define PF_FUNC_RID 0x0009E880 /* Reset Source: PCIR */ +#define PF_FUNC_RID_FUNCTION_NUMBER_S 0 +#define PF_FUNC_RID_FUNCTION_NUMBER_M ICE_M(0x7, 0) +#define PF_FUNC_RID_DEVICE_NUMBER_S 3 +#define PF_FUNC_RID_DEVICE_NUMBER_M ICE_M(0x1F, 3) +#define PF_FUNC_RID_BUS_NUMBER_S 8 +#define PF_FUNC_RID_BUS_NUMBER_M ICE_M(0xFF, 8) +#define PF_PCI_CIAA 0x0009E580 /* Reset Source: FLR */ +#define PF_PCI_CIAA_ADDRESS_S 0 +#define PF_PCI_CIAA_ADDRESS_M ICE_M(0xFFF, 0) #define PF_PCI_CIAA_VF_NUM_S 12 -#define PF_PCI_CIAD 0x0009E500 -#define GL_PWR_MODE_CTL 0x000B820C +#define PF_PCI_CIAA_VF_NUM_M ICE_M(0xFF, 12) +#define PF_PCI_CIAD 0x0009E500 /* Reset Source: FLR */ +#define PF_PCI_CIAD_DATA_S 0 +#define PF_PCI_CIAD_DATA_M ICE_M(0xFFFFFFFF, 0) +#define PFPCI_CLASS 0x0009DB00 /* Reset Source: PCIR */ +#define PFPCI_CLASS_STORAGE_CLASS_S 0 +#define PFPCI_CLASS_STORAGE_CLASS_M BIT(0) +#define PFPCI_CLASS_PF_IS_LAN_S 2 +#define PFPCI_CLASS_PF_IS_LAN_M BIT(2) +#define PFPCI_CNF 0x0009DF00 /* Reset Source: PCIR */ +#define PFPCI_CNF_MSI_EN_S 2 +#define PFPCI_CNF_MSI_EN_M BIT(2) +#define PFPCI_CNF_EXROM_DIS_S 3 +#define PFPCI_CNF_EXROM_DIS_M BIT(3) +#define PFPCI_CNF_IO_BAR_S 4 +#define PFPCI_CNF_IO_BAR_M BIT(4) +#define PFPCI_CNF_INT_PIN_S 5 +#define PFPCI_CNF_INT_PIN_M ICE_M(0x3, 5) +#define PFPCI_DEVID 0x0009DE00 /* Reset Source: PCIR */ +#define PFPCI_DEVID_PF_DEV_ID_S 0 +#define PFPCI_DEVID_PF_DEV_ID_M ICE_M(0xFFFF, 0) +#define PFPCI_DEVID_VF_DEV_ID_S 16 +#define PFPCI_DEVID_VF_DEV_ID_M ICE_M(0xFFFF, 16) +#define PFPCI_FACTPS 0x0009E900 /* Reset Source: FLR */ +#define PFPCI_FACTPS_FUNC_POWER_STATE_S 0 +#define PFPCI_FACTPS_FUNC_POWER_STATE_M ICE_M(0x3, 0) +#define PFPCI_FACTPS_FUNC_AUX_EN_S 3 +#define PFPCI_FACTPS_FUNC_AUX_EN_M BIT(3) +#define PFPCI_FUNC 0x0009D980 /* Reset Source: POR */ +#define PFPCI_FUNC_FUNC_DIS_S 0 +#define PFPCI_FUNC_FUNC_DIS_M BIT(0) +#define PFPCI_FUNC_ALLOW_FUNC_DIS_S 1 +#define PFPCI_FUNC_ALLOW_FUNC_DIS_M BIT(1) +#define PFPCI_FUNC_DIS_FUNC_ON_PORT_DIS_S 2 +#define PFPCI_FUNC_DIS_FUNC_ON_PORT_DIS_M BIT(2) +#define PFPCI_PF_FLUSH_DONE 0x0009E400 /* Reset Source: PCIR */ +#define PFPCI_PF_FLUSH_DONE_FLUSH_DONE_S 0 +#define PFPCI_PF_FLUSH_DONE_FLUSH_DONE_M BIT(0) +#define PFPCI_PM 0x0009DA80 /* Reset Source: POR */ +#define PFPCI_PM_PME_EN_S 0 +#define PFPCI_PM_PME_EN_M BIT(0) +#define PFPCI_STATUS1 0x0009DA00 /* Reset Source: POR */ +#define PFPCI_STATUS1_FUNC_VALID_S 0 +#define PFPCI_STATUS1_FUNC_VALID_M BIT(0) +#define PFPCI_SUBSYSID 0x0009D880 /* Reset Source: PCIR */ +#define PFPCI_SUBSYSID_PF_SUBSYS_ID_S 0 +#define PFPCI_SUBSYSID_PF_SUBSYS_ID_M ICE_M(0xFFFF, 0) +#define PFPCI_SUBSYSID_VF_SUBSYS_ID_S 16 +#define PFPCI_SUBSYSID_VF_SUBSYS_ID_M ICE_M(0xFFFF, 16) +#define PFPCI_VF_FLUSH_DONE(_VF) (0x0009E000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PCIR */ +#define PFPCI_VF_FLUSH_DONE_MAX_INDEX 255 +#define PFPCI_VF_FLUSH_DONE_FLUSH_DONE_S 0 +#define 
PFPCI_VF_FLUSH_DONE_FLUSH_DONE_M BIT(0) +#define PFPCI_VM_FLUSH_DONE 0x0009E480 /* Reset Source: PCIR */ +#define PFPCI_VM_FLUSH_DONE_FLUSH_DONE_S 0 +#define PFPCI_VM_FLUSH_DONE_FLUSH_DONE_M BIT(0) +#define PFPCI_VMINDEX 0x0009E600 /* Reset Source: PCIR */ +#define PFPCI_VMINDEX_VMINDEX_S 0 +#define PFPCI_VMINDEX_VMINDEX_M ICE_M(0x3FF, 0) +#define PFPCI_VMPEND 0x0009E800 /* Reset Source: PCIR */ +#define PFPCI_VMPEND_PENDING_S 0 +#define PFPCI_VMPEND_PENDING_M BIT(0) +#define PQ_FIFO_STATUS 0x0009DF40 /* Reset Source: PCIR */ +#define PQ_FIFO_STATUS_PQ_FIFO_COUNT_S 0 +#define PQ_FIFO_STATUS_PQ_FIFO_COUNT_M ICE_M(0x7FFFFFFF, 0) +#define PQ_FIFO_STATUS_PQ_FIFO_EMPTY_S 31 +#define PQ_FIFO_STATUS_PQ_FIFO_EMPTY_M BIT(31) +#define GLPE_CPUSTATUS0 0x0050BA5C /* Reset Source: CORER */ +#define GLPE_CPUSTATUS0_PECPUSTATUS0_S 0 +#define GLPE_CPUSTATUS0_PECPUSTATUS0_M ICE_M(0xFFFFFFFF, 0) +#define GLPE_CPUSTATUS1 0x0050BA60 /* Reset Source: CORER */ +#define GLPE_CPUSTATUS1_PECPUSTATUS1_S 0 +#define GLPE_CPUSTATUS1_PECPUSTATUS1_M ICE_M(0xFFFFFFFF, 0) +#define GLPE_CPUSTATUS2 0x0050BA64 /* Reset Source: CORER */ +#define GLPE_CPUSTATUS2_PECPUSTATUS2_S 0 +#define GLPE_CPUSTATUS2_PECPUSTATUS2_M ICE_M(0xFFFFFFFF, 0) +#define GLPE_MDQ_BASE(_i) (0x00536000 + ((_i) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLPE_MDQ_BASE_MAX_INDEX 511 +#define GLPE_MDQ_BASE_MDOC_INDEX_S 0 +#define GLPE_MDQ_BASE_MDOC_INDEX_M ICE_M(0xFFFFFFF, 0) +#define GLPE_MDQ_PTR(_i) (0x00537000 + ((_i) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLPE_MDQ_PTR_MAX_INDEX 511 +#define GLPE_MDQ_PTR_MDQ_HEAD_S 0 +#define GLPE_MDQ_PTR_MDQ_HEAD_M ICE_M(0x3FFF, 0) +#define GLPE_MDQ_PTR_MDQ_TAIL_S 16 +#define GLPE_MDQ_PTR_MDQ_TAIL_M ICE_M(0x3FFF, 16) +#define GLPE_MDQ_SIZE(_i) (0x00536800 + ((_i) * 4)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLPE_MDQ_SIZE_MAX_INDEX 511 +#define GLPE_MDQ_SIZE_MDQ_SIZE_S 0 +#define GLPE_MDQ_SIZE_MDQ_SIZE_M ICE_M(0x3FFF, 0) +#define GLPE_PEPM_CTRL 0x0050C000 /* Reset Source: PERST */ +#define GLPE_PEPM_CTRL_PEPM_ENABLE_S 0 +#define GLPE_PEPM_CTRL_PEPM_ENABLE_M BIT(0) +#define GLPE_PEPM_CTRL_PEPM_HALT_S 8 +#define GLPE_PEPM_CTRL_PEPM_HALT_M BIT(8) +#define GLPE_PEPM_CTRL_PEPM_PUSH_MARGIN_S 16 +#define GLPE_PEPM_CTRL_PEPM_PUSH_MARGIN_M ICE_M(0xFF, 16) +#define GLPE_PEPM_DEALLOC 0x0050C004 /* Reset Source: PERST */ +#define GLPE_PEPM_DEALLOC_MDQ_CREDITS_S 0 +#define GLPE_PEPM_DEALLOC_MDQ_CREDITS_M ICE_M(0x3FFF, 0) +#define GLPE_PEPM_DEALLOC_PSQ_CREDITS_S 14 +#define GLPE_PEPM_DEALLOC_PSQ_CREDITS_M ICE_M(0x1F, 14) +#define GLPE_PEPM_DEALLOC_PQID_S 19 +#define GLPE_PEPM_DEALLOC_PQID_M ICE_M(0x1FF, 19) +#define GLPE_PEPM_DEALLOC_PORT_S 28 +#define GLPE_PEPM_DEALLOC_PORT_M ICE_M(0x7, 28) +#define GLPE_PEPM_DEALLOC_DEALLOC_RDY_S 31 +#define GLPE_PEPM_DEALLOC_DEALLOC_RDY_M BIT(31) +#define GLPE_PEPM_PSQ_COUNT 0x0050C020 /* Reset Source: PERST */ +#define GLPE_PEPM_PSQ_COUNT_PEPM_PSQ_COUNT_S 0 +#define GLPE_PEPM_PSQ_COUNT_PEPM_PSQ_COUNT_M ICE_M(0xFFFF, 0) +#define GLPE_PEPM_THRESH(_i) (0x0050C840 + ((_i) * 4)) /* _i=0...511 */ /* Reset Source: PERST */ +#define GLPE_PEPM_THRESH_MAX_INDEX 511 +#define GLPE_PEPM_THRESH_PEPM_PSQ_THRESH_S 0 +#define GLPE_PEPM_THRESH_PEPM_PSQ_THRESH_M ICE_M(0x1F, 0) +#define GLPE_PEPM_THRESH_PEPM_MDQ_THRESH_S 16 +#define GLPE_PEPM_THRESH_PEPM_MDQ_THRESH_M ICE_M(0x3FFF, 16) +#define GLPE_PFAEQEDROPCNT(_i) (0x00503240 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPE_PFAEQEDROPCNT_MAX_INDEX 7 +#define GLPE_PFAEQEDROPCNT_AEQEDROPCNT_S 0 
+#define GLPE_PFAEQEDROPCNT_AEQEDROPCNT_M ICE_M(0xFFFF, 0) +#define GLPE_PFCEQEDROPCNT(_i) (0x00503220 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPE_PFCEQEDROPCNT_MAX_INDEX 7 +#define GLPE_PFCEQEDROPCNT_CEQEDROPCNT_S 0 +#define GLPE_PFCEQEDROPCNT_CEQEDROPCNT_M ICE_M(0xFFFF, 0) +#define GLPE_PFCQEDROPCNT(_i) (0x00503200 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPE_PFCQEDROPCNT_MAX_INDEX 7 +#define GLPE_PFCQEDROPCNT_CQEDROPCNT_S 0 +#define GLPE_PFCQEDROPCNT_CQEDROPCNT_M ICE_M(0xFFFF, 0) +#define GLPE_PFFLMOOISCALLOCERR(_i) (0x0050B960 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPE_PFFLMOOISCALLOCERR_MAX_INDEX 7 +#define GLPE_PFFLMOOISCALLOCERR_ERROR_COUNT_S 0 +#define GLPE_PFFLMOOISCALLOCERR_ERROR_COUNT_M ICE_M(0xFFFF, 0) +#define GLPE_PFFLMQ1ALLOCERR(_i) (0x0050B920 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPE_PFFLMQ1ALLOCERR_MAX_INDEX 7 +#define GLPE_PFFLMQ1ALLOCERR_ERROR_COUNT_S 0 +#define GLPE_PFFLMQ1ALLOCERR_ERROR_COUNT_M ICE_M(0xFFFF, 0) +#define GLPE_PFFLMRRFALLOCERR(_i) (0x0050B940 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPE_PFFLMRRFALLOCERR_MAX_INDEX 7 +#define GLPE_PFFLMRRFALLOCERR_ERROR_COUNT_S 0 +#define GLPE_PFFLMRRFALLOCERR_ERROR_COUNT_M ICE_M(0xFFFF, 0) +#define GLPE_PFFLMXMITALLOCERR(_i) (0x0050B900 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPE_PFFLMXMITALLOCERR_MAX_INDEX 7 +#define GLPE_PFFLMXMITALLOCERR_ERROR_COUNT_S 0 +#define GLPE_PFFLMXMITALLOCERR_ERROR_COUNT_M ICE_M(0xFFFF, 0) +#define GLPE_PFTCPNOW50USCNT(_i) (0x0050B8C0 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPE_PFTCPNOW50USCNT_MAX_INDEX 7 +#define GLPE_PFTCPNOW50USCNT_CNT_S 0 +#define GLPE_PFTCPNOW50USCNT_CNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPE_PUSH_PEPM 0x0053241C /* Reset Source: CORER */ +#define GLPE_PUSH_PEPM_MDQ_CREDITS_S 0 +#define GLPE_PUSH_PEPM_MDQ_CREDITS_M ICE_M(0xFF, 0) +#define GLPE_VFAEQEDROPCNT(_i) (0x00503100 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLPE_VFAEQEDROPCNT_MAX_INDEX 31 +#define GLPE_VFAEQEDROPCNT_AEQEDROPCNT_S 0 +#define GLPE_VFAEQEDROPCNT_AEQEDROPCNT_M ICE_M(0xFFFF, 0) +#define GLPE_VFCEQEDROPCNT(_i) (0x00503080 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLPE_VFCEQEDROPCNT_MAX_INDEX 31 +#define GLPE_VFCEQEDROPCNT_CEQEDROPCNT_S 0 +#define GLPE_VFCEQEDROPCNT_CEQEDROPCNT_M ICE_M(0xFFFF, 0) +#define GLPE_VFCQEDROPCNT(_i) (0x00503000 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLPE_VFCQEDROPCNT_MAX_INDEX 31 +#define GLPE_VFCQEDROPCNT_CQEDROPCNT_S 0 +#define GLPE_VFCQEDROPCNT_CQEDROPCNT_M ICE_M(0xFFFF, 0) +#define GLPE_VFFLMOOISCALLOCERR(_i) (0x0050B580 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLPE_VFFLMOOISCALLOCERR_MAX_INDEX 31 +#define GLPE_VFFLMOOISCALLOCERR_ERROR_COUNT_S 0 +#define GLPE_VFFLMOOISCALLOCERR_ERROR_COUNT_M ICE_M(0xFFFF, 0) +#define GLPE_VFFLMQ1ALLOCERR(_i) (0x0050B480 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLPE_VFFLMQ1ALLOCERR_MAX_INDEX 31 +#define GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_S 0 +#define GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_M ICE_M(0xFFFF, 0) +#define GLPE_VFFLMRRFALLOCERR(_i) (0x0050B500 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLPE_VFFLMRRFALLOCERR_MAX_INDEX 31 +#define GLPE_VFFLMRRFALLOCERR_ERROR_COUNT_S 0 +#define GLPE_VFFLMRRFALLOCERR_ERROR_COUNT_M ICE_M(0xFFFF, 0) +#define GLPE_VFFLMXMITALLOCERR(_i) (0x0050B400 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ 
+#define GLPE_VFFLMXMITALLOCERR_MAX_INDEX 31 +#define GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_S 0 +#define GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_M ICE_M(0xFFFF, 0) +#define GLPE_VFTCPNOW50USCNT(_i) (0x0050B300 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: PE_CORER */ +#define GLPE_VFTCPNOW50USCNT_MAX_INDEX 31 +#define GLPE_VFTCPNOW50USCNT_CNT_S 0 +#define GLPE_VFTCPNOW50USCNT_CNT_M ICE_M(0xFFFFFFFF, 0) +#define PFPE_AEQALLOC 0x00502D00 /* Reset Source: PFR */ +#define PFPE_AEQALLOC_AECOUNT_S 0 +#define PFPE_AEQALLOC_AECOUNT_M ICE_M(0xFFFFFFFF, 0) +#define PFPE_CCQPHIGH 0x0050A100 /* Reset Source: PFR */ +#define PFPE_CCQPHIGH_PECCQPHIGH_S 0 +#define PFPE_CCQPHIGH_PECCQPHIGH_M ICE_M(0xFFFFFFFF, 0) +#define PFPE_CCQPLOW 0x0050A080 /* Reset Source: PFR */ +#define PFPE_CCQPLOW_PECCQPLOW_S 0 +#define PFPE_CCQPLOW_PECCQPLOW_M ICE_M(0xFFFFFFFF, 0) +#define PFPE_CCQPSTATUS 0x0050A000 /* Reset Source: PFR */ +#define PFPE_CCQPSTATUS_CCQP_DONE_S 0 +#define PFPE_CCQPSTATUS_CCQP_DONE_M BIT(0) +#define PFPE_CCQPSTATUS_HMC_PROFILE_S 4 +#define PFPE_CCQPSTATUS_HMC_PROFILE_M ICE_M(0x7, 4) +#define PFPE_CCQPSTATUS_RDMA_EN_VFS_S 16 +#define PFPE_CCQPSTATUS_RDMA_EN_VFS_M ICE_M(0x3F, 16) +#define PFPE_CCQPSTATUS_CCQP_ERR_S 31 +#define PFPE_CCQPSTATUS_CCQP_ERR_M BIT(31) +#define PFPE_CQACK 0x00502C80 /* Reset Source: PFR */ +#define PFPE_CQACK_PECQID_S 0 +#define PFPE_CQACK_PECQID_M ICE_M(0x7FFFF, 0) +#define PFPE_CQARM 0x00502C00 /* Reset Source: PFR */ +#define PFPE_CQARM_PECQID_S 0 +#define PFPE_CQARM_PECQID_M ICE_M(0x7FFFF, 0) +#define PFPE_CQPDB 0x00500800 /* Reset Source: PFR */ +#define PFPE_CQPDB_WQHEAD_S 0 +#define PFPE_CQPDB_WQHEAD_M ICE_M(0x7FF, 0) +#define PFPE_CQPERRCODES 0x0050A200 /* Reset Source: PFR */ +#define PFPE_CQPERRCODES_CQP_MINOR_CODE_S 0 +#define PFPE_CQPERRCODES_CQP_MINOR_CODE_M ICE_M(0xFFFF, 0) +#define PFPE_CQPERRCODES_CQP_MAJOR_CODE_S 16 +#define PFPE_CQPERRCODES_CQP_MAJOR_CODE_M ICE_M(0xFFFF, 16) +#define PFPE_CQPTAIL 0x00500880 /* Reset Source: PFR */ +#define PFPE_CQPTAIL_WQTAIL_S 0 +#define PFPE_CQPTAIL_WQTAIL_M ICE_M(0x7FF, 0) +#define PFPE_CQPTAIL_CQP_OP_ERR_S 31 +#define PFPE_CQPTAIL_CQP_OP_ERR_M BIT(31) +#define PFPE_IPCONFIG0 0x0050A180 /* Reset Source: PFR */ +#define PFPE_IPCONFIG0_PEIPID_S 0 +#define PFPE_IPCONFIG0_PEIPID_M ICE_M(0xFFFF, 0) +#define PFPE_IPCONFIG0_USEENTIREIDRANGE_S 16 +#define PFPE_IPCONFIG0_USEENTIREIDRANGE_M BIT(16) +#define PFPE_IPCONFIG0_UDP_SRC_PORT_MASK_EN_S 17 +#define PFPE_IPCONFIG0_UDP_SRC_PORT_MASK_EN_M BIT(17) +#define PFPE_MRTEIDXMASK 0x0050A300 /* Reset Source: PFR */ +#define PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_S 0 +#define PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_M ICE_M(0x1F, 0) +#define PFPE_RCVUNEXPECTEDERROR 0x0050A380 /* Reset Source: PFR */ +#define PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_S 0 +#define PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_M ICE_M(0xFFFFFF, 0) +#define PFPE_TCPNOWTIMER 0x0050A280 /* Reset Source: PFR */ +#define PFPE_TCPNOWTIMER_TCP_NOW_S 0 +#define PFPE_TCPNOWTIMER_TCP_NOW_M ICE_M(0xFFFFFFFF, 0) +#define PFPE_WQEALLOC 0x00504400 /* Reset Source: PFR */ +#define PFPE_WQEALLOC_PEQPID_S 0 +#define PFPE_WQEALLOC_PEQPID_M ICE_M(0x3FFFF, 0) +#define PFPE_WQEALLOC_WQE_DESC_INDEX_S 20 +#define PFPE_WQEALLOC_WQE_DESC_INDEX_M ICE_M(0xFFF, 20) +#define PRT_PEPM_COUNT(_i) (0x0050C040 + ((_i) * 4)) /* _i=0...511 */ /* Reset Source: PERST */ +#define PRT_PEPM_COUNT_MAX_INDEX 511 +#define PRT_PEPM_COUNT_PEPM_PSQ_COUNT_S 0 +#define PRT_PEPM_COUNT_PEPM_PSQ_COUNT_M ICE_M(0x1F, 0) +#define PRT_PEPM_COUNT_PEPM_MDQ_COUNT_S 16 +#define 
PRT_PEPM_COUNT_PEPM_MDQ_COUNT_M ICE_M(0x3FFF, 16) +#define VFPE_AEQALLOC(_VF) (0x00502800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define VFPE_AEQALLOC_MAX_INDEX 255 +#define VFPE_AEQALLOC_AECOUNT_S 0 +#define VFPE_AEQALLOC_AECOUNT_M ICE_M(0xFFFFFFFF, 0) +#define VFPE_CCQPHIGH(_VF) (0x00508800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define VFPE_CCQPHIGH_MAX_INDEX 255 +#define VFPE_CCQPHIGH_PECCQPHIGH_S 0 +#define VFPE_CCQPHIGH_PECCQPHIGH_M ICE_M(0xFFFFFFFF, 0) +#define VFPE_CCQPLOW(_VF) (0x00508400 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define VFPE_CCQPLOW_MAX_INDEX 255 +#define VFPE_CCQPLOW_PECCQPLOW_S 0 +#define VFPE_CCQPLOW_PECCQPLOW_M ICE_M(0xFFFFFFFF, 0) +#define VFPE_CCQPSTATUS(_VF) (0x00508000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define VFPE_CCQPSTATUS_MAX_INDEX 255 +#define VFPE_CCQPSTATUS_CCQP_DONE_S 0 +#define VFPE_CCQPSTATUS_CCQP_DONE_M BIT(0) +#define VFPE_CCQPSTATUS_HMC_PROFILE_S 4 +#define VFPE_CCQPSTATUS_HMC_PROFILE_M ICE_M(0x7, 4) +#define VFPE_CCQPSTATUS_RDMA_EN_VFS_S 16 +#define VFPE_CCQPSTATUS_RDMA_EN_VFS_M ICE_M(0x3F, 16) +#define VFPE_CCQPSTATUS_CCQP_ERR_S 31 +#define VFPE_CCQPSTATUS_CCQP_ERR_M BIT(31) +#define VFPE_CQACK(_VF) (0x00502400 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define VFPE_CQACK_MAX_INDEX 255 +#define VFPE_CQACK_PECQID_S 0 +#define VFPE_CQACK_PECQID_M ICE_M(0x7FFFF, 0) +#define VFPE_CQARM(_VF) (0x00502000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define VFPE_CQARM_MAX_INDEX 255 +#define VFPE_CQARM_PECQID_S 0 +#define VFPE_CQARM_PECQID_M ICE_M(0x7FFFF, 0) +#define VFPE_CQPDB(_VF) (0x00500000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define VFPE_CQPDB_MAX_INDEX 255 +#define VFPE_CQPDB_WQHEAD_S 0 +#define VFPE_CQPDB_WQHEAD_M ICE_M(0x7FF, 0) +#define VFPE_CQPERRCODES(_VF) (0x00509000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define VFPE_CQPERRCODES_MAX_INDEX 255 +#define VFPE_CQPERRCODES_CQP_MINOR_CODE_S 0 +#define VFPE_CQPERRCODES_CQP_MINOR_CODE_M ICE_M(0xFFFF, 0) +#define VFPE_CQPERRCODES_CQP_MAJOR_CODE_S 16 +#define VFPE_CQPERRCODES_CQP_MAJOR_CODE_M ICE_M(0xFFFF, 16) +#define VFPE_CQPTAIL(_VF) (0x00500400 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define VFPE_CQPTAIL_MAX_INDEX 255 +#define VFPE_CQPTAIL_WQTAIL_S 0 +#define VFPE_CQPTAIL_WQTAIL_M ICE_M(0x7FF, 0) +#define VFPE_CQPTAIL_CQP_OP_ERR_S 31 +#define VFPE_CQPTAIL_CQP_OP_ERR_M BIT(31) +#define VFPE_IPCONFIG0(_VF) (0x00508C00 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define VFPE_IPCONFIG0_MAX_INDEX 255 +#define VFPE_IPCONFIG0_PEIPID_S 0 +#define VFPE_IPCONFIG0_PEIPID_M ICE_M(0xFFFF, 0) +#define VFPE_IPCONFIG0_USEENTIREIDRANGE_S 16 +#define VFPE_IPCONFIG0_USEENTIREIDRANGE_M BIT(16) +#define VFPE_IPCONFIG0_UDP_SRC_PORT_MASK_EN_S 17 +#define VFPE_IPCONFIG0_UDP_SRC_PORT_MASK_EN_M BIT(17) +#define VFPE_RCVUNEXPECTEDERROR(_VF) (0x00509C00 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define VFPE_RCVUNEXPECTEDERROR_MAX_INDEX 255 +#define VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_S 0 +#define VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_M ICE_M(0xFFFFFF, 0) +#define VFPE_TCPNOWTIMER(_VF) (0x00509400 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define VFPE_TCPNOWTIMER_MAX_INDEX 255 +#define VFPE_TCPNOWTIMER_TCP_NOW_S 0 +#define VFPE_TCPNOWTIMER_TCP_NOW_M ICE_M(0xFFFFFFFF, 0) +#define VFPE_WQEALLOC(_VF) (0x00504000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define VFPE_WQEALLOC_MAX_INDEX 255 +#define 
VFPE_WQEALLOC_PEQPID_S 0 +#define VFPE_WQEALLOC_PEQPID_M ICE_M(0x3FFFF, 0) +#define VFPE_WQEALLOC_WQE_DESC_INDEX_S 20 +#define VFPE_WQEALLOC_WQE_DESC_INDEX_M ICE_M(0xFFF, 20) +#define GLPES_PFIP4RXDISCARD(_i) (0x00541400 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4RXDISCARD_MAX_INDEX 127 +#define GLPES_PFIP4RXDISCARD_IP4RXDISCARD_S 0 +#define GLPES_PFIP4RXDISCARD_IP4RXDISCARD_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP4RXFRAGSHI(_i) (0x00541C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4RXFRAGSHI_MAX_INDEX 127 +#define GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_S 0 +#define GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP4RXFRAGSLO(_i) (0x00541C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4RXFRAGSLO_MAX_INDEX 127 +#define GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_S 0 +#define GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP4RXMCOCTSHI(_i) (0x00542404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4RXMCOCTSHI_MAX_INDEX 127 +#define GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_S 0 +#define GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP4RXMCOCTSLO(_i) (0x00542400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4RXMCOCTSLO_MAX_INDEX 127 +#define GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_S 0 +#define GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP4RXMCPKTSHI(_i) (0x00542C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4RXMCPKTSHI_MAX_INDEX 127 +#define GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_S 0 +#define GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP4RXMCPKTSLO(_i) (0x00542C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4RXMCPKTSLO_MAX_INDEX 127 +#define GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_S 0 +#define GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP4RXOCTSHI(_i) (0x00540404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4RXOCTSHI_MAX_INDEX 127 +#define GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_S 0 +#define GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP4RXOCTSLO(_i) (0x00540400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4RXOCTSLO_MAX_INDEX 127 +#define GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_S 0 +#define GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP4RXPKTSHI(_i) (0x00540C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4RXPKTSHI_MAX_INDEX 127 +#define GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_S 0 +#define GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP4RXPKTSLO(_i) (0x00540C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4RXPKTSLO_MAX_INDEX 127 +#define GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_S 0 +#define GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP4RXTRUNC(_i) (0x00541800 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4RXTRUNC_MAX_INDEX 127 +#define GLPES_PFIP4RXTRUNC_IP4RXTRUNC_S 0 +#define GLPES_PFIP4RXTRUNC_IP4RXTRUNC_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP4TXFRAGSHI(_i) (0x00547404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4TXFRAGSHI_MAX_INDEX 127 +#define GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_S 0 +#define GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP4TXFRAGSLO(_i) 
(0x00547400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4TXFRAGSLO_MAX_INDEX 127 +#define GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_S 0 +#define GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP4TXMCOCTSHI(_i) (0x00547C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4TXMCOCTSHI_MAX_INDEX 127 +#define GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_S 0 +#define GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP4TXMCOCTSLO(_i) (0x00547C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4TXMCOCTSLO_MAX_INDEX 127 +#define GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_S 0 +#define GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP4TXMCPKTSHI(_i) (0x00548404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4TXMCPKTSHI_MAX_INDEX 127 +#define GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_S 0 +#define GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP4TXMCPKTSLO(_i) (0x00548400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4TXMCPKTSLO_MAX_INDEX 127 +#define GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_S 0 +#define GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP4TXNOROUTE(_i) (0x0054B400 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4TXNOROUTE_MAX_INDEX 127 +#define GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_S 0 +#define GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_M ICE_M(0xFFFFFF, 0) +#define GLPES_PFIP4TXOCTSHI(_i) (0x00546404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4TXOCTSHI_MAX_INDEX 127 +#define GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_S 0 +#define GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP4TXOCTSLO(_i) (0x00546400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4TXOCTSLO_MAX_INDEX 127 +#define GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_S 0 +#define GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP4TXPKTSHI(_i) (0x00546C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4TXPKTSHI_MAX_INDEX 127 +#define GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_S 0 +#define GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP4TXPKTSLO(_i) (0x00546C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP4TXPKTSLO_MAX_INDEX 127 +#define GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_S 0 +#define GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP6RXDISCARD(_i) (0x00544400 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6RXDISCARD_MAX_INDEX 127 +#define GLPES_PFIP6RXDISCARD_IP6RXDISCARD_S 0 +#define GLPES_PFIP6RXDISCARD_IP6RXDISCARD_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP6RXFRAGSHI(_i) (0x00544C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6RXFRAGSHI_MAX_INDEX 127 +#define GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_S 0 +#define GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP6RXFRAGSLO(_i) (0x00544C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6RXFRAGSLO_MAX_INDEX 127 +#define GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_S 0 +#define GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP6RXMCOCTSHI(_i) (0x00545404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6RXMCOCTSHI_MAX_INDEX 127 +#define GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_S 0 +#define 
GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP6RXMCOCTSLO(_i) (0x00545400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6RXMCOCTSLO_MAX_INDEX 127 +#define GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_S 0 +#define GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP6RXMCPKTSHI(_i) (0x00545C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6RXMCPKTSHI_MAX_INDEX 127 +#define GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_S 0 +#define GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP6RXMCPKTSLO(_i) (0x00545C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6RXMCPKTSLO_MAX_INDEX 127 +#define GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_S 0 +#define GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP6RXOCTSHI(_i) (0x00543404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6RXOCTSHI_MAX_INDEX 127 +#define GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_S 0 +#define GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP6RXOCTSLO(_i) (0x00543400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6RXOCTSLO_MAX_INDEX 127 +#define GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_S 0 +#define GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP6RXPKTSHI(_i) (0x00543C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6RXPKTSHI_MAX_INDEX 127 +#define GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_S 0 +#define GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP6RXPKTSLO(_i) (0x00543C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6RXPKTSLO_MAX_INDEX 127 +#define GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_S 0 +#define GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP6RXTRUNC(_i) (0x00544800 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6RXTRUNC_MAX_INDEX 127 +#define GLPES_PFIP6RXTRUNC_IP6RXTRUNC_S 0 +#define GLPES_PFIP6RXTRUNC_IP6RXTRUNC_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP6TXFRAGSHI(_i) (0x00549C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6TXFRAGSHI_MAX_INDEX 127 +#define GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_S 0 +#define GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP6TXFRAGSLO(_i) (0x00549C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6TXFRAGSLO_MAX_INDEX 127 +#define GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_S 0 +#define GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP6TXMCOCTSHI(_i) (0x0054A404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6TXMCOCTSHI_MAX_INDEX 127 +#define GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_S 0 +#define GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP6TXMCOCTSLO(_i) (0x0054A400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6TXMCOCTSLO_MAX_INDEX 127 +#define GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_S 0 +#define GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP6TXMCPKTSHI(_i) (0x0054AC04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6TXMCPKTSHI_MAX_INDEX 127 +#define GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_S 0 +#define GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP6TXMCPKTSLO(_i) (0x0054AC00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define 
GLPES_PFIP6TXMCPKTSLO_MAX_INDEX 127 +#define GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_S 0 +#define GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP6TXNOROUTE(_i) (0x0054B800 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6TXNOROUTE_MAX_INDEX 127 +#define GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_S 0 +#define GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_M ICE_M(0xFFFFFF, 0) +#define GLPES_PFIP6TXOCTSHI(_i) (0x00548C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6TXOCTSHI_MAX_INDEX 127 +#define GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_S 0 +#define GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP6TXOCTSLO(_i) (0x00548C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6TXOCTSLO_MAX_INDEX 127 +#define GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_S 0 +#define GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFIP6TXPKTSHI(_i) (0x00549404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6TXPKTSHI_MAX_INDEX 127 +#define GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_S 0 +#define GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFIP6TXPKTSLO(_i) (0x00549400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFIP6TXPKTSLO_MAX_INDEX 127 +#define GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_S 0 +#define GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFRDMARXRDSHI(_i) (0x0054EC04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFRDMARXRDSHI_MAX_INDEX 127 +#define GLPES_PFRDMARXRDSHI_RDMARXRDSHI_S 0 +#define GLPES_PFRDMARXRDSHI_RDMARXRDSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFRDMARXRDSLO(_i) (0x0054EC00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFRDMARXRDSLO_MAX_INDEX 127 +#define GLPES_PFRDMARXRDSLO_RDMARXRDSLO_S 0 +#define GLPES_PFRDMARXRDSLO_RDMARXRDSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFRDMARXSNDSHI(_i) (0x0054F404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFRDMARXSNDSHI_MAX_INDEX 127 +#define GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_S 0 +#define GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFRDMARXSNDSLO(_i) (0x0054F400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFRDMARXSNDSLO_MAX_INDEX 127 +#define GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_S 0 +#define GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFRDMARXWRSHI(_i) (0x0054E404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFRDMARXWRSHI_MAX_INDEX 127 +#define GLPES_PFRDMARXWRSHI_RDMARXWRSHI_S 0 +#define GLPES_PFRDMARXWRSHI_RDMARXWRSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFRDMARXWRSLO(_i) (0x0054E400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFRDMARXWRSLO_MAX_INDEX 127 +#define GLPES_PFRDMARXWRSLO_RDMARXWRSLO_S 0 +#define GLPES_PFRDMARXWRSLO_RDMARXWRSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFRDMATXRDSHI(_i) (0x00550404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFRDMATXRDSHI_MAX_INDEX 127 +#define GLPES_PFRDMATXRDSHI_RDMARXRDSHI_S 0 +#define GLPES_PFRDMATXRDSHI_RDMARXRDSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFRDMATXRDSLO(_i) (0x00550400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFRDMATXRDSLO_MAX_INDEX 127 +#define GLPES_PFRDMATXRDSLO_RDMARXRDSLO_S 0 +#define GLPES_PFRDMATXRDSLO_RDMARXRDSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFRDMATXSNDSHI(_i) (0x00550C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ 
+#define GLPES_PFRDMATXSNDSHI_MAX_INDEX 127 +#define GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_S 0 +#define GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFRDMATXSNDSLO(_i) (0x00550C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFRDMATXSNDSLO_MAX_INDEX 127 +#define GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_S 0 +#define GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFRDMATXWRSHI(_i) (0x0054FC04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFRDMATXWRSHI_MAX_INDEX 127 +#define GLPES_PFRDMATXWRSHI_RDMARXWRSHI_S 0 +#define GLPES_PFRDMATXWRSHI_RDMARXWRSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFRDMATXWRSLO(_i) (0x0054FC00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFRDMATXWRSLO_MAX_INDEX 127 +#define GLPES_PFRDMATXWRSLO_RDMARXWRSLO_S 0 +#define GLPES_PFRDMATXWRSLO_RDMARXWRSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFRDMAVBNDHI(_i) (0x00551404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFRDMAVBNDHI_MAX_INDEX 127 +#define GLPES_PFRDMAVBNDHI_RDMAVBNDHI_S 0 +#define GLPES_PFRDMAVBNDHI_RDMAVBNDHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFRDMAVBNDLO(_i) (0x00551400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFRDMAVBNDLO_MAX_INDEX 127 +#define GLPES_PFRDMAVBNDLO_RDMAVBNDLO_S 0 +#define GLPES_PFRDMAVBNDLO_RDMAVBNDLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFRDMAVINVHI(_i) (0x00551C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFRDMAVINVHI_MAX_INDEX 127 +#define GLPES_PFRDMAVINVHI_RDMAVINVHI_S 0 +#define GLPES_PFRDMAVINVHI_RDMAVINVHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFRDMAVINVLO(_i) (0x00551C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFRDMAVINVLO_MAX_INDEX 127 +#define GLPES_PFRDMAVINVLO_RDMAVINVLO_S 0 +#define GLPES_PFRDMAVINVLO_RDMAVINVLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFRXVLANERR(_i) (0x00540000 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFRXVLANERR_MAX_INDEX 127 +#define GLPES_PFRXVLANERR_RXVLANERR_S 0 +#define GLPES_PFRXVLANERR_RXVLANERR_M ICE_M(0xFFFFFF, 0) +#define GLPES_PFTCPRTXSEG(_i) (0x00552400 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFTCPRTXSEG_MAX_INDEX 127 +#define GLPES_PFTCPRTXSEG_TCPRTXSEG_S 0 +#define GLPES_PFTCPRTXSEG_TCPRTXSEG_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFTCPRXOPTERR(_i) (0x0054C400 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFTCPRXOPTERR_MAX_INDEX 127 +#define GLPES_PFTCPRXOPTERR_TCPRXOPTERR_S 0 +#define GLPES_PFTCPRXOPTERR_TCPRXOPTERR_M ICE_M(0xFFFFFF, 0) +#define GLPES_PFTCPRXPROTOERR(_i) (0x0054C800 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFTCPRXPROTOERR_MAX_INDEX 127 +#define GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_S 0 +#define GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_M ICE_M(0xFFFFFF, 0) +#define GLPES_PFTCPRXSEGSHI(_i) (0x0054BC04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFTCPRXSEGSHI_MAX_INDEX 127 +#define GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_S 0 +#define GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFTCPRXSEGSLO(_i) (0x0054BC00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFTCPRXSEGSLO_MAX_INDEX 127 +#define GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_S 0 +#define GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFTCPTXSEGHI(_i) (0x0054CC04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFTCPTXSEGHI_MAX_INDEX 127 +#define 
GLPES_PFTCPTXSEGHI_TCPTXSEGHI_S 0 +#define GLPES_PFTCPTXSEGHI_TCPTXSEGHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFTCPTXSEGLO(_i) (0x0054CC00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFTCPTXSEGLO_MAX_INDEX 127 +#define GLPES_PFTCPTXSEGLO_TCPTXSEGLO_S 0 +#define GLPES_PFTCPTXSEGLO_TCPTXSEGLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFUDPRXPKTSHI(_i) (0x0054D404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFUDPRXPKTSHI_MAX_INDEX 127 +#define GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_S 0 +#define GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFUDPRXPKTSLO(_i) (0x0054D400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFUDPRXPKTSLO_MAX_INDEX 127 +#define GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_S 0 +#define GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_PFUDPTXPKTSHI(_i) (0x0054DC04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFUDPTXPKTSHI_MAX_INDEX 127 +#define GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_S 0 +#define GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_M ICE_M(0xFFFF, 0) +#define GLPES_PFUDPTXPKTSLO(_i) (0x0054DC00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLPES_PFUDPTXPKTSLO_MAX_INDEX 127 +#define GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_S 0 +#define GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_RDMARXMULTFPDUSHI 0x0055E00C /* Reset Source: CORER */ +#define GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_S 0 +#define GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_M ICE_M(0xFFFFFF, 0) +#define GLPES_RDMARXMULTFPDUSLO 0x0055E008 /* Reset Source: CORER */ +#define GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_S 0 +#define GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_RDMARXOOODDPHI 0x0055E014 /* Reset Source: CORER */ +#define GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_S 0 +#define GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_M ICE_M(0xFFFFFF, 0) +#define GLPES_RDMARXOOODDPLO 0x0055E010 /* Reset Source: CORER */ +#define GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_S 0 +#define GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_RDMARXOOONOMARK 0x0055E004 /* Reset Source: CORER */ +#define GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_S 0 +#define GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_RDMARXUNALIGN 0x0055E000 /* Reset Source: CORER */ +#define GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_S 0 +#define GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_TCPRXFOURHOLEHI 0x0055E03C /* Reset Source: CORER */ +#define GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_S 0 +#define GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_M ICE_M(0xFFFFFF, 0) +#define GLPES_TCPRXFOURHOLELO 0x0055E038 /* Reset Source: CORER */ +#define GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_S 0 +#define GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_TCPRXONEHOLEHI 0x0055E024 /* Reset Source: CORER */ +#define GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_S 0 +#define GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_M ICE_M(0xFFFFFF, 0) +#define GLPES_TCPRXONEHOLELO 0x0055E020 /* Reset Source: CORER */ +#define GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_S 0 +#define GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_TCPRXPUREACKHI 0x0055E01C /* Reset Source: CORER */ +#define GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_S 0 +#define GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_M ICE_M(0xFFFFFF, 0) +#define GLPES_TCPRXPUREACKSLO 0x0055E018 /* Reset Source: CORER */ +#define GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_S 0 +#define 
GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_TCPRXTHREEHOLEHI 0x0055E034 /* Reset Source: CORER */ +#define GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_S 0 +#define GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_M ICE_M(0xFFFFFF, 0) +#define GLPES_TCPRXTHREEHOLELO 0x0055E030 /* Reset Source: CORER */ +#define GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_S 0 +#define GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_TCPRXTWOHOLEHI 0x0055E02C /* Reset Source: CORER */ +#define GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_S 0 +#define GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_M ICE_M(0xFFFFFF, 0) +#define GLPES_TCPRXTWOHOLELO 0x0055E028 /* Reset Source: CORER */ +#define GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_S 0 +#define GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_TCPTXRETRANSFASTHI 0x0055E044 /* Reset Source: CORER */ +#define GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_S 0 +#define GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_M ICE_M(0xFFFFFF, 0) +#define GLPES_TCPTXRETRANSFASTLO 0x0055E040 /* Reset Source: CORER */ +#define GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_S 0 +#define GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_TCPTXTOUTSFASTHI 0x0055E04C /* Reset Source: CORER */ +#define GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_S 0 +#define GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_M ICE_M(0xFFFFFF, 0) +#define GLPES_TCPTXTOUTSFASTLO 0x0055E048 /* Reset Source: CORER */ +#define GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_S 0 +#define GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_M ICE_M(0xFFFFFFFF, 0) +#define GLPES_TCPTXTOUTSHI 0x0055E054 /* Reset Source: CORER */ +#define GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_S 0 +#define GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_M ICE_M(0xFFFFFF, 0) +#define GLPES_TCPTXTOUTSLO 0x0055E050 /* Reset Source: CORER */ +#define GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_S 0 +#define GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_M ICE_M(0xFFFFFFFF, 0) +#define GL_PWR_MODE_CTL 0x000B820C /* Reset Source: POR */ +#define GL_PWR_MODE_CTL_SWITCH_PWR_MODE_EN_S 0 +#define GL_PWR_MODE_CTL_SWITCH_PWR_MODE_EN_M BIT(0) +#define GL_PWR_MODE_CTL_NIC_PWR_MODE_EN_S 1 +#define GL_PWR_MODE_CTL_NIC_PWR_MODE_EN_M BIT(1) +#define GL_PWR_MODE_CTL_S5_PWR_MODE_EN_S 2 +#define GL_PWR_MODE_CTL_S5_PWR_MODE_EN_M BIT(2) +#define GL_PWR_MODE_CTL_CAR_MAX_SW_CONFIG_S 3 +#define GL_PWR_MODE_CTL_CAR_MAX_SW_CONFIG_M ICE_M(0x3, 3) #define GL_PWR_MODE_CTL_CAR_MAX_BW_S 30 #define GL_PWR_MODE_CTL_CAR_MAX_BW_M ICE_M(0x3, 30) -#define GLPRT_BPRCL(_i) (0x00381380 + ((_i) * 8)) -#define GLPRT_BPTCL(_i) (0x00381240 + ((_i) * 8)) -#define GLPRT_CRCERRS(_i) (0x00380100 + ((_i) * 8)) -#define GLPRT_GORCL(_i) (0x00380000 + ((_i) * 8)) -#define GLPRT_GOTCL(_i) (0x00380B40 + ((_i) * 8)) -#define GLPRT_ILLERRC(_i) (0x003801C0 + ((_i) * 8)) -#define GLPRT_LXOFFRXC(_i) (0x003802C0 + ((_i) * 8)) -#define GLPRT_LXOFFTXC(_i) (0x00381180 + ((_i) * 8)) -#define GLPRT_LXONRXC(_i) (0x00380280 + ((_i) * 8)) -#define GLPRT_LXONTXC(_i) (0x00381140 + ((_i) * 8)) -#define GLPRT_MLFC(_i) (0x00380040 + ((_i) * 8)) -#define GLPRT_MPRCL(_i) (0x00381340 + ((_i) * 8)) -#define GLPRT_MPTCL(_i) (0x00381200 + ((_i) * 8)) -#define GLPRT_MRFC(_i) (0x00380080 + ((_i) * 8)) -#define GLPRT_PRC1023L(_i) (0x00380A00 + ((_i) * 8)) -#define GLPRT_PRC127L(_i) (0x00380940 + ((_i) * 8)) -#define GLPRT_PRC1522L(_i) (0x00380A40 + ((_i) * 8)) -#define GLPRT_PRC255L(_i) (0x00380980 + ((_i) * 8)) -#define GLPRT_PRC511L(_i) (0x003809C0 + ((_i) * 8)) -#define GLPRT_PRC64L(_i) (0x00380900 + ((_i) * 8)) -#define 
GLPRT_PRC9522L(_i) (0x00380A80 + ((_i) * 8)) -#define GLPRT_PTC1023L(_i) (0x00380C80 + ((_i) * 8)) -#define GLPRT_PTC127L(_i) (0x00380BC0 + ((_i) * 8)) -#define GLPRT_PTC1522L(_i) (0x00380CC0 + ((_i) * 8)) -#define GLPRT_PTC255L(_i) (0x00380C00 + ((_i) * 8)) -#define GLPRT_PTC511L(_i) (0x00380C40 + ((_i) * 8)) -#define GLPRT_PTC64L(_i) (0x00380B80 + ((_i) * 8)) -#define GLPRT_PTC9522L(_i) (0x00380D00 + ((_i) * 8)) -#define GLPRT_PXOFFRXC(_i, _j) (0x00380500 + ((_i) * 8 + (_j) * 64)) -#define GLPRT_PXOFFTXC(_i, _j) (0x00380F40 + ((_i) * 8 + (_j) * 64)) -#define GLPRT_PXONRXC(_i, _j) (0x00380300 + ((_i) * 8 + (_j) * 64)) -#define GLPRT_PXONTXC(_i, _j) (0x00380D40 + ((_i) * 8 + (_j) * 64)) -#define GLPRT_RFC(_i) (0x00380AC0 + ((_i) * 8)) -#define GLPRT_RJC(_i) (0x00380B00 + ((_i) * 8)) -#define GLPRT_RLEC(_i) (0x00380140 + ((_i) * 8)) -#define GLPRT_ROC(_i) (0x00380240 + ((_i) * 8)) -#define GLPRT_RUC(_i) (0x00380200 + ((_i) * 8)) -#define GLPRT_RXON2OFFCNT(_i, _j) (0x00380700 + ((_i) * 8 + (_j) * 64)) -#define GLPRT_TDOLD(_i) (0x00381280 + ((_i) * 8)) -#define GLPRT_UPRCL(_i) (0x00381300 + ((_i) * 8)) -#define GLPRT_UPTCL(_i) (0x003811C0 + ((_i) * 8)) -#define GLV_BPRCL(_i) (0x003B6000 + ((_i) * 8)) -#define GLV_BPTCL(_i) (0x0030E000 + ((_i) * 8)) -#define GLV_GORCL(_i) (0x003B0000 + ((_i) * 8)) -#define GLV_GOTCL(_i) (0x00300000 + ((_i) * 8)) -#define GLV_MPRCL(_i) (0x003B4000 + ((_i) * 8)) -#define GLV_MPTCL(_i) (0x0030C000 + ((_i) * 8)) -#define GLV_RDPC(_i) (0x00294C04 + ((_i) * 4)) -#define GLV_TEPC(_VSI) (0x00312000 + ((_VSI) * 4)) -#define GLV_UPRCL(_i) (0x003B2000 + ((_i) * 8)) -#define GLV_UPTCL(_i) (0x0030A000 + ((_i) * 8)) -#define PF_VT_PFALLOC_HIF 0x0009DD80 +#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT 0x000B825C /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_PECLK_S 0 +#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_PECLK_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_UCLK_S 3 +#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_UCLK_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_LCLK_S 6 +#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_LCLK_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_PSM_S 9 +#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_PSM_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_RXCTL_S 12 +#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_RXCTL_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_UANA_S 15 +#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_UANA_M ICE_M(0x7, 15) +#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_S5_S 18 +#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_S5_M ICE_M(0x7, 18) +#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT 0x000B8218 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_PECLK_S 0 +#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_PECLK_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_UCLK_S 3 +#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_UCLK_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_LCLK_S 6 +#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_LCLK_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_PSM_S 9 +#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_PSM_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_RXCTL_S 
12 +#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_RXCTL_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_UANA_S 15 +#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_UANA_M ICE_M(0x7, 15) +#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_S5_S 18 +#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_S5_M ICE_M(0x7, 18) +#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT 0x000B8260 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_PECLK_S 0 +#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_PECLK_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_UCLK_S 3 +#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_UCLK_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_LCLK_S 6 +#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_LCLK_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_PSM_S 9 +#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_PSM_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_RXCTL_S 12 +#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_RXCTL_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_UANA_S 15 +#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_UANA_M ICE_M(0x7, 15) +#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_S5_S 18 +#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_S5_M ICE_M(0x7, 18) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK 0x000B8200 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_50G_H_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_50G_H_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_25G_H_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_25G_H_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_10G_H_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_10G_H_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_4G_H_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_4G_H_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_A50G_H_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_A50G_H_M ICE_M(0xF, 12) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK 0x000B81F0 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_50G_H_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_50G_H_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_25G_H_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_25G_H_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_10G_H_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_10G_H_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_4G_H_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_4G_H_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_A50G_H_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_A50G_H_M ICE_M(0xF, 12) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM 0x000B81FC /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_50G_H_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_50G_H_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_25G_H_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_25G_H_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_10G_H_S 6 +#define 
GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_10G_H_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_4G_H_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_4G_H_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_A50G_H_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_A50G_H_M ICE_M(0xF, 12) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL 0x000B81F8 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_50G_H_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_50G_H_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_25G_H_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_25G_H_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_10G_H_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_10G_H_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_4G_H_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_4G_H_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_A50G_H_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_A50G_H_M ICE_M(0xF, 12) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA 0x000B8208 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_50G_H_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_50G_H_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_25G_H_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_25G_H_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_10G_H_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_10G_H_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_4G_H_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_4G_H_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_A50G_H_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_A50G_H_M ICE_M(0xF, 12) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK 0x000B81F4 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_50G_H_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_50G_H_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_25G_H_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_25G_H_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_10G_H_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_10G_H_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_4G_H_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_4G_H_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_A50G_H_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_A50G_H_M ICE_M(0xF, 12) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK 0x000B8244 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_50G_L_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_50G_L_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_25G_L_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_25G_L_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_10G_L_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_10G_L_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_4G_L_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_4G_L_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_A50G_L_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_A50G_L_M ICE_M(0x7, 12) +#define 
GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK 0x000B8220 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_50G_L_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_50G_L_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_25G_L_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_25G_L_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_10G_L_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_10G_L_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_4G_L_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_4G_L_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_A50G_L_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_A50G_L_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM 0x000B8240 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_50G_L_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_50G_L_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_25G_L_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_25G_L_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_10G_L_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_10G_L_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_4G_L_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_4G_L_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_A50G_L_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_A50G_L_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL 0x000B823C /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_50G_L_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_50G_L_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_25G_L_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_25G_L_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_10G_L_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_10G_L_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_4G_L_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_4G_L_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_A50G_L_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_A50G_L_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA 0x000B8248 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_50G_L_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_50G_L_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_25G_L_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_25G_L_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_10G_L_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_10G_L_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_4G_L_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_4G_L_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_A50G_L_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_A50G_L_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK 0x000B8238 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_50G_L_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_50G_L_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_25G_L_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_25G_L_M ICE_M(0x7, 3) +#define 
GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_10G_L_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_10G_L_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_4G_L_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_4G_L_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_A50G_L_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_A50G_L_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK 0x000B8230 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_50G_M_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_50G_M_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_25G_M_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_25G_M_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_10G_M_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_10G_M_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_4G_M_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_4G_M_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_A50G_M_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_A50G_M_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK 0x000B821C /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_50G_M_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_50G_M_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_25G_M_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_25G_M_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_10G_M_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_10G_M_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_4G_M_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_4G_M_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_A50G_M_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_A50G_M_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM 0x000B822C /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_50G_M_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_50G_M_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_25G_M_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_25G_M_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_10G_M_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_10G_M_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_4G_M_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_4G_M_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_A50G_M_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_A50G_M_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL 0x000B8228 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_50G_M_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_50G_M_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_25G_M_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_25G_M_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_10G_M_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_10G_M_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_4G_M_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_4G_M_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_A50G_M_S 12 +#define 
GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_A50G_M_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA 0x000B8234 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_50G_M_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_50G_M_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_25G_M_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_25G_M_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_10G_M_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_10G_M_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_4G_M_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_4G_M_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_A50G_M_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_A50G_M_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK 0x000B8224 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_50G_M_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_50G_M_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_25G_M_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_25G_M_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_10G_M_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_10G_M_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_4G_M_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_4G_M_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_A50G_M_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_A50G_M_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL 0x000B81EC /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_50G_H_S 0 +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_50G_H_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_25G_H_S 3 +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_25G_H_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_10G_H_S 6 +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_10G_H_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_4G_H_S 9 +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_4G_H_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_A50G_H_S 12 +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_A50G_H_M ICE_M(0xF, 12) +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL 0x000B824C /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_50G_L_S 0 +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_50G_L_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_25G_L_S 3 +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_25G_L_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_10G_L_S 6 +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_10G_L_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_4G_L_S 9 +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_4G_L_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_A50G_L_S 12 +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_A50G_L_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL 0x000B8250 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_50G_M_S 0 +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_50G_M_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_25G_M_S 3 +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_25G_M_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_10G_M_S 6 +#define 
GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_10G_M_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_4G_M_S 9 +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_4G_M_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_A50G_M_S 12 +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_A50G_M_M ICE_M(0x7, 12) +#define GL_S5_PWR_MODE_EXIT_CTL 0x000B8270 /* Reset Source: POR */ +#define GL_S5_PWR_MODE_EXIT_CTL_S5_PWR_MODE_AUTO_EXIT_S 0 +#define GL_S5_PWR_MODE_EXIT_CTL_S5_PWR_MODE_AUTO_EXIT_M BIT(0) +#define GL_S5_PWR_MODE_EXIT_CTL_S5_PWR_MODE_FW_EXIT_S 1 +#define GL_S5_PWR_MODE_EXIT_CTL_S5_PWR_MODE_FW_EXIT_M BIT(1) +#define GL_S5_PWR_MODE_EXIT_CTL_S5_PWR_MODE_PRST_FLOWS_ON_CORER_S 3 +#define GL_S5_PWR_MODE_EXIT_CTL_S5_PWR_MODE_PRST_FLOWS_ON_CORER_M BIT(3) +#define GLGEN_PME_TO 0x000B81BC /* Reset Source: POR */ +#define GLGEN_PME_TO_PME_TO_FOR_PE_S 0 +#define GLGEN_PME_TO_PME_TO_FOR_PE_M BIT(0) +#define PRTPM_EEE_STAT 0x001E4320 /* Reset Source: GLOBR */ +#define PRTPM_EEE_STAT_EEE_NEG_S 29 +#define PRTPM_EEE_STAT_EEE_NEG_M BIT(29) +#define PRTPM_EEE_STAT_RX_LPI_STATUS_S 30 +#define PRTPM_EEE_STAT_RX_LPI_STATUS_M BIT(30) +#define PRTPM_EEE_STAT_TX_LPI_STATUS_S 31 +#define PRTPM_EEE_STAT_TX_LPI_STATUS_M BIT(31) +#define PRTPM_EEEC 0x001E4380 /* Reset Source: GLOBR */ +#define PRTPM_EEEC_TW_WAKE_MIN_S 16 +#define PRTPM_EEEC_TW_WAKE_MIN_M ICE_M(0x3F, 16) +#define PRTPM_EEEC_TX_LU_LPI_DLY_S 24 +#define PRTPM_EEEC_TX_LU_LPI_DLY_M ICE_M(0x3, 24) +#define PRTPM_EEEC_TEEE_DLY_S 26 +#define PRTPM_EEEC_TEEE_DLY_M ICE_M(0x3F, 26) +#define PRTPM_EEEFWD 0x001E4400 /* Reset Source: GLOBR */ +#define PRTPM_EEEFWD_EEE_FW_CONFIG_DONE_S 31 +#define PRTPM_EEEFWD_EEE_FW_CONFIG_DONE_M BIT(31) +#define PRTPM_EEER 0x001E4360 /* Reset Source: GLOBR */ +#define PRTPM_EEER_TW_SYSTEM_S 0 +#define PRTPM_EEER_TW_SYSTEM_M ICE_M(0xFFFF, 0) +#define PRTPM_EEER_TX_LPI_EN_S 16 +#define PRTPM_EEER_TX_LPI_EN_M BIT(16) +#define PRTPM_EEETXC 0x001E43E0 /* Reset Source: GLOBR */ +#define PRTPM_EEETXC_TW_PHY_S 0 +#define PRTPM_EEETXC_TW_PHY_M ICE_M(0xFFFF, 0) +#define PRTPM_RLPIC 0x001E43A0 /* Reset Source: GLOBR */ +#define PRTPM_RLPIC_ERLPIC_S 0 +#define PRTPM_RLPIC_ERLPIC_M ICE_M(0xFFFFFFFF, 0) +#define PRTPM_TLPIC 0x001E43C0 /* Reset Source: GLOBR */ +#define PRTPM_TLPIC_ETLPIC_S 0 +#define PRTPM_TLPIC_ETLPIC_M ICE_M(0xFFFFFFFF, 0) +#define GLRPB_DHW(_i) (0x000AC000 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLRPB_DHW_MAX_INDEX 15 +#define GLRPB_DHW_DHW_TCN_S 0 +#define GLRPB_DHW_DHW_TCN_M ICE_M(0xFFFFF, 0) +#define GLRPB_DLW(_i) (0x000AC044 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLRPB_DLW_MAX_INDEX 15 +#define GLRPB_DLW_DLW_TCN_S 0 +#define GLRPB_DLW_DLW_TCN_M ICE_M(0xFFFFF, 0) +#define GLRPB_DPS(_i) (0x000AC084 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLRPB_DPS_MAX_INDEX 15 +#define GLRPB_DPS_DPS_TCN_S 0 +#define GLRPB_DPS_DPS_TCN_M ICE_M(0xFFFFF, 0) +#define GLRPB_DSI_EN 0x000AC324 /* Reset Source: CORER */ +#define GLRPB_DSI_EN_DSI_EN_S 0 +#define GLRPB_DSI_EN_DSI_EN_M BIT(0) +#define GLRPB_DSI_EN_DSI_L2_MAC_ERR_DROP_EN_S 1 +#define GLRPB_DSI_EN_DSI_L2_MAC_ERR_DROP_EN_M BIT(1) +#define GLRPB_SHW(_i) (0x000AC120 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLRPB_SHW_MAX_INDEX 7 +#define GLRPB_SHW_SHW_S 0 +#define GLRPB_SHW_SHW_M ICE_M(0xFFFFF, 0) +#define GLRPB_SLW(_i) (0x000AC140 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLRPB_SLW_MAX_INDEX 7 +#define GLRPB_SLW_SLW_S 0 +#define GLRPB_SLW_SLW_M 
ICE_M(0xFFFFF, 0) +#define GLRPB_SPS(_i) (0x000AC0C4 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLRPB_SPS_MAX_INDEX 7 +#define GLRPB_SPS_SPS_TCN_S 0 +#define GLRPB_SPS_SPS_TCN_M ICE_M(0xFFFFF, 0) +#define GLRPB_TC_CFG(_i) (0x000AC2A4 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLRPB_TC_CFG_MAX_INDEX 31 +#define GLRPB_TC_CFG_D_POOL_S 0 +#define GLRPB_TC_CFG_D_POOL_M ICE_M(0xFFFF, 0) +#define GLRPB_TC_CFG_S_POOL_S 16 +#define GLRPB_TC_CFG_S_POOL_M ICE_M(0xFFFF, 16) +#define GLRPB_TCHW(_i) (0x000AC330 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLRPB_TCHW_MAX_INDEX 31 +#define GLRPB_TCHW_TCHW_S 0 +#define GLRPB_TCHW_TCHW_M ICE_M(0xFFFFF, 0) +#define GLRPB_TCLW(_i) (0x000AC3B0 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLRPB_TCLW_MAX_INDEX 31 +#define GLRPB_TCLW_TCLW_S 0 +#define GLRPB_TCLW_TCLW_M ICE_M(0xFFFFF, 0) +#define GLQF_APBVT(_i) (0x00450000 + ((_i) * 4)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define GLQF_APBVT_MAX_INDEX 2047 +#define GLQF_APBVT_APBVT_S 0 +#define GLQF_APBVT_APBVT_M ICE_M(0xFFFFFFFF, 0) +#define GLQF_FD_CLSN_0 0x00460028 /* Reset Source: CORER */ +#define GLQF_FD_CLSN_0_HITSBCNT_S 0 +#define GLQF_FD_CLSN_0_HITSBCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLQF_FD_CLSN1 0x00460030 /* Reset Source: CORER */ +#define GLQF_FD_CLSN1_HITLBCNT_S 0 +#define GLQF_FD_CLSN1_HITLBCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLQF_FD_CNT 0x00460018 /* Reset Source: CORER */ +#define GLQF_FD_CNT_FD_GCNT_S 0 +#define GLQF_FD_CNT_FD_GCNT_M ICE_M(0x7FFF, 0) +#define GLQF_FD_CNT_FD_BCNT_S 16 +#define GLQF_FD_CNT_FD_BCNT_M ICE_M(0x7FFF, 16) +#define GLQF_FD_CTL 0x00460000 /* Reset Source: CORER */ +#define GLQF_FD_CTL_FDLONG_S 0 +#define GLQF_FD_CTL_FDLONG_M ICE_M(0xF, 0) +#define GLQF_FD_CTL_HASH_REPORT_S 4 +#define GLQF_FD_CTL_HASH_REPORT_M BIT(4) +#define GLQF_FD_CTL_FLT_ADDR_REPORT_S 5 +#define GLQF_FD_CTL_FLT_ADDR_REPORT_M BIT(5) +#define GLQF_FD_SIZE 0x00460010 /* Reset Source: CORER */ +#define GLQF_FD_SIZE_FD_GSIZE_S 0 +#define GLQF_FD_SIZE_FD_GSIZE_M ICE_M(0x7FFF, 0) +#define GLQF_FD_SIZE_FD_BSIZE_S 16 +#define GLQF_FD_SIZE_FD_BSIZE_M ICE_M(0x7FFF, 16) +#define GLQF_FDCNT_0 0x00460020 /* Reset Source: CORER */ +#define GLQF_FDCNT_0_BUCKETCNT_S 0 +#define GLQF_FDCNT_0_BUCKETCNT_M ICE_M(0x7FFF, 0) +#define GLQF_FDCNT_0_CNT_NOT_VLD_S 31 +#define GLQF_FDCNT_0_CNT_NOT_VLD_M BIT(31) +#define GLQF_FDEVICTENA(_i) (0x00452000 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */ +#define GLQF_FDEVICTENA_MAX_INDEX 3 +#define GLQF_FDEVICTENA_FDEVICTENA_S 0 +#define GLQF_FDEVICTENA_FDEVICTENA_M ICE_M(0xFFFFFFFF, 0) +#define GLQF_FDINSET(_i, _j) (0x00412000 + ((_i) * 4 + (_j) * 512)) /* _i=0...127, _j=0...5 */ /* Reset Source: CORER */ +#define GLQF_FDINSET_MAX_INDEX 127 +#define GLQF_FDINSET_FV_WORD_INDX0_S 0 +#define GLQF_FDINSET_FV_WORD_INDX0_M ICE_M(0x1F, 0) +#define GLQF_FDINSET_FV_WORD_VAL0_S 7 +#define GLQF_FDINSET_FV_WORD_VAL0_M BIT(7) +#define GLQF_FDINSET_FV_WORD_INDX1_S 8 +#define GLQF_FDINSET_FV_WORD_INDX1_M ICE_M(0x1F, 8) +#define GLQF_FDINSET_FV_WORD_VAL1_S 15 +#define GLQF_FDINSET_FV_WORD_VAL1_M BIT(15) +#define GLQF_FDINSET_FV_WORD_INDX2_S 16 +#define GLQF_FDINSET_FV_WORD_INDX2_M ICE_M(0x1F, 16) +#define GLQF_FDINSET_FV_WORD_VAL2_S 23 +#define GLQF_FDINSET_FV_WORD_VAL2_M BIT(23) +#define GLQF_FDINSET_FV_WORD_INDX3_S 24 +#define GLQF_FDINSET_FV_WORD_INDX3_M ICE_M(0x1F, 24) +#define GLQF_FDINSET_FV_WORD_VAL3_S 31 +#define GLQF_FDINSET_FV_WORD_VAL3_M BIT(31) +#define GLQF_FDMASK(_i) (0x00410800 + ((_i) * 
4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLQF_FDMASK_MAX_INDEX 31 +#define GLQF_FDMASK_MSK_INDEX_S 0 +#define GLQF_FDMASK_MSK_INDEX_M ICE_M(0x1F, 0) +#define GLQF_FDMASK_MASK_S 16 +#define GLQF_FDMASK_MASK_M ICE_M(0xFFFF, 16) +#define GLQF_FDMASK_SEL(_i) (0x00410400 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLQF_FDMASK_SEL_MAX_INDEX 127 +#define GLQF_FDMASK_SEL_MASK_SEL_S 0 +#define GLQF_FDMASK_SEL_MASK_SEL_M ICE_M(0xFFFFFFFF, 0) +#define GLQF_FDSWAP(_i, _j) (0x00413000 + ((_i) * 4 + (_j) * 512)) /* _i=0...127, _j=0...5 */ /* Reset Source: CORER */ +#define GLQF_FDSWAP_MAX_INDEX 127 +#define GLQF_FDSWAP_FV_WORD_INDX0_S 0 +#define GLQF_FDSWAP_FV_WORD_INDX0_M ICE_M(0x1F, 0) +#define GLQF_FDSWAP_FV_WORD_VAL0_S 7 +#define GLQF_FDSWAP_FV_WORD_VAL0_M BIT(7) +#define GLQF_FDSWAP_FV_WORD_INDX1_S 8 +#define GLQF_FDSWAP_FV_WORD_INDX1_M ICE_M(0x1F, 8) +#define GLQF_FDSWAP_FV_WORD_VAL1_S 15 +#define GLQF_FDSWAP_FV_WORD_VAL1_M BIT(15) +#define GLQF_FDSWAP_FV_WORD_INDX2_S 16 +#define GLQF_FDSWAP_FV_WORD_INDX2_M ICE_M(0x1F, 16) +#define GLQF_FDSWAP_FV_WORD_VAL2_S 23 +#define GLQF_FDSWAP_FV_WORD_VAL2_M BIT(23) +#define GLQF_FDSWAP_FV_WORD_INDX3_S 24 +#define GLQF_FDSWAP_FV_WORD_INDX3_M ICE_M(0x1F, 24) +#define GLQF_FDSWAP_FV_WORD_VAL3_S 31 +#define GLQF_FDSWAP_FV_WORD_VAL3_M BIT(31) +#define GLQF_HINSET(_i, _j) (0x0040E000 + ((_i) * 4 + (_j) * 512)) /* _i=0...127, _j=0...5 */ /* Reset Source: CORER */ +#define GLQF_HINSET_MAX_INDEX 127 +#define GLQF_HINSET_FV_WORD_INDX0_S 0 +#define GLQF_HINSET_FV_WORD_INDX0_M ICE_M(0x1F, 0) +#define GLQF_HINSET_FV_WORD_VAL0_S 7 +#define GLQF_HINSET_FV_WORD_VAL0_M BIT(7) +#define GLQF_HINSET_FV_WORD_INDX1_S 8 +#define GLQF_HINSET_FV_WORD_INDX1_M ICE_M(0x1F, 8) +#define GLQF_HINSET_FV_WORD_VAL1_S 15 +#define GLQF_HINSET_FV_WORD_VAL1_M BIT(15) +#define GLQF_HINSET_FV_WORD_INDX2_S 16 +#define GLQF_HINSET_FV_WORD_INDX2_M ICE_M(0x1F, 16) +#define GLQF_HINSET_FV_WORD_VAL2_S 23 +#define GLQF_HINSET_FV_WORD_VAL2_M BIT(23) +#define GLQF_HINSET_FV_WORD_INDX3_S 24 +#define GLQF_HINSET_FV_WORD_INDX3_M ICE_M(0x1F, 24) +#define GLQF_HINSET_FV_WORD_VAL3_S 31 +#define GLQF_HINSET_FV_WORD_VAL3_M BIT(31) +#define GLQF_HKEY(_i) (0x00456000 + ((_i) * 4)) /* _i=0...12 */ /* Reset Source: CORER */ +#define GLQF_HKEY_MAX_INDEX 12 +#define GLQF_HKEY_KEY_0_S 0 +#define GLQF_HKEY_KEY_0_M ICE_M(0xFF, 0) +#define GLQF_HKEY_KEY_1_S 8 +#define GLQF_HKEY_KEY_1_M ICE_M(0xFF, 8) +#define GLQF_HKEY_KEY_2_S 16 +#define GLQF_HKEY_KEY_2_M ICE_M(0xFF, 16) +#define GLQF_HKEY_KEY_3_S 24 +#define GLQF_HKEY_KEY_3_M ICE_M(0xFF, 24) +#define GLQF_HLUT(_i, _j) (0x00438000 + ((_i) * 4 + (_j) * 512)) /* _i=0...127, _j=0...15 */ /* Reset Source: CORER */ +#define GLQF_HLUT_MAX_INDEX 127 +#define GLQF_HLUT_LUT0_S 0 +#define GLQF_HLUT_LUT0_M ICE_M(0x3F, 0) +#define GLQF_HLUT_LUT1_S 8 +#define GLQF_HLUT_LUT1_M ICE_M(0x3F, 8) +#define GLQF_HLUT_LUT2_S 16 +#define GLQF_HLUT_LUT2_M ICE_M(0x3F, 16) +#define GLQF_HLUT_LUT3_S 24 +#define GLQF_HLUT_LUT3_M ICE_M(0x3F, 24) +#define GLQF_HLUT_SIZE(_i) (0x00455400 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLQF_HLUT_SIZE_MAX_INDEX 15 +#define GLQF_HLUT_SIZE_HSIZE_S 0 +#define GLQF_HLUT_SIZE_HSIZE_M BIT(0) +#define GLQF_HMASK(_i) (0x0040FC00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLQF_HMASK_MAX_INDEX 31 +#define GLQF_HMASK_MSK_INDEX_S 0 +#define GLQF_HMASK_MSK_INDEX_M ICE_M(0x1F, 0) +#define GLQF_HMASK_MASK_S 16 +#define GLQF_HMASK_MASK_M ICE_M(0xFFFF, 16) +#define GLQF_HMASK_SEL(_i) (0x00410000 + 
((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLQF_HMASK_SEL_MAX_INDEX 127 +#define GLQF_HMASK_SEL_MASK_SEL_S 0 +#define GLQF_HMASK_SEL_MASK_SEL_M ICE_M(0xFFFFFFFF, 0) +#define GLQF_HSYMM(_i, _j) (0x0040F000 + ((_i) * 4 + (_j) * 512)) /* _i=0...127, _j=0...5 */ /* Reset Source: CORER */ +#define GLQF_HSYMM_MAX_INDEX 127 +#define GLQF_HSYMM_FV_SYMM_INDX0_S 0 +#define GLQF_HSYMM_FV_SYMM_INDX0_M ICE_M(0x1F, 0) +#define GLQF_HSYMM_SYMM0_ENA_S 7 +#define GLQF_HSYMM_SYMM0_ENA_M BIT(7) +#define GLQF_HSYMM_FV_SYMM_INDX1_S 8 +#define GLQF_HSYMM_FV_SYMM_INDX1_M ICE_M(0x1F, 8) +#define GLQF_HSYMM_SYMM1_ENA_S 15 +#define GLQF_HSYMM_SYMM1_ENA_M BIT(15) +#define GLQF_HSYMM_FV_SYMM_INDX2_S 16 +#define GLQF_HSYMM_FV_SYMM_INDX2_M ICE_M(0x1F, 16) +#define GLQF_HSYMM_SYMM2_ENA_S 23 +#define GLQF_HSYMM_SYMM2_ENA_M BIT(23) +#define GLQF_HSYMM_FV_SYMM_INDX3_S 24 +#define GLQF_HSYMM_FV_SYMM_INDX3_M ICE_M(0x1F, 24) +#define GLQF_HSYMM_SYMM3_ENA_S 31 +#define GLQF_HSYMM_SYMM3_ENA_M BIT(31) +#define GLQF_PE_APBVT_CNT 0x00455500 /* Reset Source: CORER */ +#define GLQF_PE_APBVT_CNT_APBVT_LAN_S 0 +#define GLQF_PE_APBVT_CNT_APBVT_LAN_M ICE_M(0xFFFFFFFF, 0) +#define GLQF_PE_CMD 0x00471080 /* Reset Source: CORER */ +#define GLQF_PE_CMD_ADDREM_STS_S 0 +#define GLQF_PE_CMD_ADDREM_STS_M ICE_M(0xFFFFFF, 0) +#define GLQF_PE_CMD_ADDREM_ID_S 28 +#define GLQF_PE_CMD_ADDREM_ID_M ICE_M(0xF, 28) +#define GLQF_PE_CTL 0x004710C0 /* Reset Source: CORER */ +#define GLQF_PE_CTL_PELONG_S 0 +#define GLQF_PE_CTL_PELONG_M ICE_M(0xF, 0) +#define GLQF_PE_CTL2(_i) (0x00455200 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLQF_PE_CTL2_MAX_INDEX 31 +#define GLQF_PE_CTL2_TO_QH_S 0 +#define GLQF_PE_CTL2_TO_QH_M ICE_M(0x3, 0) +#define GLQF_PE_CTL2_APBVT_ENA_S 2 +#define GLQF_PE_CTL2_APBVT_ENA_M BIT(2) +#define GLQF_PE_FVE 0x0020E514 /* Reset Source: CORER */ +#define GLQF_PE_FVE_W_ENA_S 0 +#define GLQF_PE_FVE_W_ENA_M ICE_M(0xFFFFFF, 0) +#define GLQF_PE_OSR_STS 0x00471040 /* Reset Source: CORER */ +#define GLQF_PE_OSR_STS_QH_SRCH_MAXOSR_S 0 +#define GLQF_PE_OSR_STS_QH_SRCH_MAXOSR_M ICE_M(0x3FF, 0) +#define GLQF_PE_OSR_STS_QH_CMD_MAXOSR_S 16 +#define GLQF_PE_OSR_STS_QH_CMD_MAXOSR_M ICE_M(0x3FF, 16) +#define GLQF_PEINSET(_i, _j) (0x00415000 + ((_i) * 4 + (_j) * 128)) /* _i=0...31, _j=0...5 */ /* Reset Source: CORER */ +#define GLQF_PEINSET_MAX_INDEX 31 +#define GLQF_PEINSET_FV_WORD_INDX0_S 0 +#define GLQF_PEINSET_FV_WORD_INDX0_M ICE_M(0x1F, 0) +#define GLQF_PEINSET_FV_WORD_VAL0_S 7 +#define GLQF_PEINSET_FV_WORD_VAL0_M BIT(7) +#define GLQF_PEINSET_FV_WORD_INDX1_S 8 +#define GLQF_PEINSET_FV_WORD_INDX1_M ICE_M(0x1F, 8) +#define GLQF_PEINSET_FV_WORD_VAL1_S 15 +#define GLQF_PEINSET_FV_WORD_VAL1_M BIT(15) +#define GLQF_PEINSET_FV_WORD_INDX2_S 16 +#define GLQF_PEINSET_FV_WORD_INDX2_M ICE_M(0x1F, 16) +#define GLQF_PEINSET_FV_WORD_VAL2_S 23 +#define GLQF_PEINSET_FV_WORD_VAL2_M BIT(23) +#define GLQF_PEINSET_FV_WORD_INDX3_S 24 +#define GLQF_PEINSET_FV_WORD_INDX3_M ICE_M(0x1F, 24) +#define GLQF_PEINSET_FV_WORD_VAL3_S 31 +#define GLQF_PEINSET_FV_WORD_VAL3_M BIT(31) +#define GLQF_PEMASK(_i) (0x00415400 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLQF_PEMASK_MAX_INDEX 15 +#define GLQF_PEMASK_MSK_INDEX_S 0 +#define GLQF_PEMASK_MSK_INDEX_M ICE_M(0x1F, 0) +#define GLQF_PEMASK_MASK_S 16 +#define GLQF_PEMASK_MASK_M ICE_M(0xFFFF, 16) +#define GLQF_PEMASK_SEL(_i) (0x00415500 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLQF_PEMASK_SEL_MAX_INDEX 31 +#define GLQF_PEMASK_SEL_MASK_SEL_S 0 +#define 
GLQF_PEMASK_SEL_MASK_SEL_M ICE_M(0xFFFF, 0) +#define GLQF_PETABLE_CLR(_i) (0x000AA078 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLQF_PETABLE_CLR_MAX_INDEX 1 +#define GLQF_PETABLE_CLR_VM_VF_NUM_S 0 +#define GLQF_PETABLE_CLR_VM_VF_NUM_M ICE_M(0x3FF, 0) +#define GLQF_PETABLE_CLR_VM_VF_TYPE_S 10 +#define GLQF_PETABLE_CLR_VM_VF_TYPE_M ICE_M(0x3, 10) +#define GLQF_PETABLE_CLR_PF_NUM_S 12 +#define GLQF_PETABLE_CLR_PF_NUM_M ICE_M(0x7, 12) +#define GLQF_PETABLE_CLR_PE_BUSY_S 16 +#define GLQF_PETABLE_CLR_PE_BUSY_M BIT(16) +#define GLQF_PETABLE_CLR_PE_CLEAR_S 17 +#define GLQF_PETABLE_CLR_PE_CLEAR_M BIT(17) +#define GLQF_PROF2TC(_i, _j) (0x0044D000 + ((_i) * 4 + (_j) * 512)) /* _i=0...127, _j=0...3 */ /* Reset Source: CORER */ +#define GLQF_PROF2TC_MAX_INDEX 127 +#define GLQF_PROF2TC_OVERRIDE_ENA_0_S 0 +#define GLQF_PROF2TC_OVERRIDE_ENA_0_M BIT(0) +#define GLQF_PROF2TC_REGION_0_S 1 +#define GLQF_PROF2TC_REGION_0_M ICE_M(0x7, 1) +#define GLQF_PROF2TC_OVERRIDE_ENA_1_S 4 +#define GLQF_PROF2TC_OVERRIDE_ENA_1_M BIT(4) +#define GLQF_PROF2TC_REGION_1_S 5 +#define GLQF_PROF2TC_REGION_1_M ICE_M(0x7, 5) +#define GLQF_PROF2TC_OVERRIDE_ENA_2_S 8 +#define GLQF_PROF2TC_OVERRIDE_ENA_2_M BIT(8) +#define GLQF_PROF2TC_REGION_2_S 9 +#define GLQF_PROF2TC_REGION_2_M ICE_M(0x7, 9) +#define GLQF_PROF2TC_OVERRIDE_ENA_3_S 12 +#define GLQF_PROF2TC_OVERRIDE_ENA_3_M BIT(12) +#define GLQF_PROF2TC_REGION_3_S 13 +#define GLQF_PROF2TC_REGION_3_M ICE_M(0x7, 13) +#define GLQF_PROF2TC_OVERRIDE_ENA_4_S 16 +#define GLQF_PROF2TC_OVERRIDE_ENA_4_M BIT(16) +#define GLQF_PROF2TC_REGION_4_S 17 +#define GLQF_PROF2TC_REGION_4_M ICE_M(0x7, 17) +#define GLQF_PROF2TC_OVERRIDE_ENA_5_S 20 +#define GLQF_PROF2TC_OVERRIDE_ENA_5_M BIT(20) +#define GLQF_PROF2TC_REGION_5_S 21 +#define GLQF_PROF2TC_REGION_5_M ICE_M(0x7, 21) +#define GLQF_PROF2TC_OVERRIDE_ENA_6_S 24 +#define GLQF_PROF2TC_OVERRIDE_ENA_6_M BIT(24) +#define GLQF_PROF2TC_REGION_6_S 25 +#define GLQF_PROF2TC_REGION_6_M ICE_M(0x7, 25) +#define GLQF_PROF2TC_OVERRIDE_ENA_7_S 28 +#define GLQF_PROF2TC_OVERRIDE_ENA_7_M BIT(28) +#define GLQF_PROF2TC_REGION_7_S 29 +#define GLQF_PROF2TC_REGION_7_M ICE_M(0x7, 29) +#define PFQF_FD_CNT 0x00460180 /* Reset Source: CORER */ +#define PFQF_FD_CNT_FD_GCNT_S 0 +#define PFQF_FD_CNT_FD_GCNT_M ICE_M(0x7FFF, 0) +#define PFQF_FD_CNT_FD_BCNT_S 16 +#define PFQF_FD_CNT_FD_BCNT_M ICE_M(0x7FFF, 16) +#define PFQF_FD_ENA 0x0043A000 /* Reset Source: CORER */ +#define PFQF_FD_ENA_FD_ENA_S 0 +#define PFQF_FD_ENA_FD_ENA_M BIT(0) +#define PFQF_FD_SIZE 0x00460100 /* Reset Source: CORER */ +#define PFQF_FD_SIZE_FD_GSIZE_S 0 +#define PFQF_FD_SIZE_FD_GSIZE_M ICE_M(0x7FFF, 0) +#define PFQF_FD_SIZE_FD_BSIZE_S 16 +#define PFQF_FD_SIZE_FD_BSIZE_M ICE_M(0x7FFF, 16) +#define PFQF_FD_SUBTRACT 0x00460200 /* Reset Source: CORER */ +#define PFQF_FD_SUBTRACT_FD_GCNT_S 0 +#define PFQF_FD_SUBTRACT_FD_GCNT_M ICE_M(0x7FFF, 0) +#define PFQF_FD_SUBTRACT_FD_BCNT_S 16 +#define PFQF_FD_SUBTRACT_FD_BCNT_M ICE_M(0x7FFF, 16) +#define PFQF_HLUT(_i) (0x00430000 + ((_i) * 64)) /* _i=0...511 */ /* Reset Source: CORER */ +#define PFQF_HLUT_MAX_INDEX 511 +#define PFQF_HLUT_LUT0_S 0 +#define PFQF_HLUT_LUT0_M ICE_M(0xFF, 0) +#define PFQF_HLUT_LUT1_S 8 +#define PFQF_HLUT_LUT1_M ICE_M(0xFF, 8) +#define PFQF_HLUT_LUT2_S 16 +#define PFQF_HLUT_LUT2_M ICE_M(0xFF, 16) +#define PFQF_HLUT_LUT3_S 24 +#define PFQF_HLUT_LUT3_M ICE_M(0xFF, 24) +#define PFQF_HLUT_SIZE 0x00455480 /* Reset Source: CORER */ +#define PFQF_HLUT_SIZE_HSIZE_S 0 +#define PFQF_HLUT_SIZE_HSIZE_M ICE_M(0x3, 0) +#define PFQF_PE_CLSN0 
0x00470480 /* Reset Source: CORER */ +#define PFQF_PE_CLSN0_HITSBCNT_S 0 +#define PFQF_PE_CLSN0_HITSBCNT_M ICE_M(0xFFFFFFFF, 0) +#define PFQF_PE_CLSN1 0x00470500 /* Reset Source: CORER */ +#define PFQF_PE_CLSN1_HITLBCNT_S 0 +#define PFQF_PE_CLSN1_HITLBCNT_M ICE_M(0xFFFFFFFF, 0) +#define PFQF_PE_CTL1 0x00470000 /* Reset Source: CORER */ +#define PFQF_PE_CTL1_PEHSIZE_S 0 +#define PFQF_PE_CTL1_PEHSIZE_M ICE_M(0xF, 0) +#define PFQF_PE_CTL2 0x00470040 /* Reset Source: CORER */ +#define PFQF_PE_CTL2_PEDSIZE_S 0 +#define PFQF_PE_CTL2_PEDSIZE_M ICE_M(0xF, 0) +#define PFQF_PE_FILTERING_ENA 0x0043A080 /* Reset Source: CORER */ +#define PFQF_PE_FILTERING_ENA_PE_ENA_S 0 +#define PFQF_PE_FILTERING_ENA_PE_ENA_M BIT(0) +#define PFQF_PE_FLHD 0x00470100 /* Reset Source: CORER */ +#define PFQF_PE_FLHD_FLHD_S 0 +#define PFQF_PE_FLHD_FLHD_M ICE_M(0xFFFFFF, 0) +#define PFQF_PE_ST_CTL 0x00470400 /* Reset Source: CORER */ +#define PFQF_PE_ST_CTL_PF_CNT_EN_S 0 +#define PFQF_PE_ST_CTL_PF_CNT_EN_M BIT(0) +#define PFQF_PE_ST_CTL_VFS_CNT_EN_S 1 +#define PFQF_PE_ST_CTL_VFS_CNT_EN_M BIT(1) +#define PFQF_PE_ST_CTL_VF_CNT_EN_S 2 +#define PFQF_PE_ST_CTL_VF_CNT_EN_M BIT(2) +#define PFQF_PE_ST_CTL_VF_NUM_S 16 +#define PFQF_PE_ST_CTL_VF_NUM_M ICE_M(0xFF, 16) +#define PFQF_PE_TC_CTL 0x00452080 /* Reset Source: CORER */ +#define PFQF_PE_TC_CTL_TC_EN_PF_S 0 +#define PFQF_PE_TC_CTL_TC_EN_PF_M ICE_M(0xFF, 0) +#define PFQF_PE_TC_CTL_TC_EN_VF_S 16 +#define PFQF_PE_TC_CTL_TC_EN_VF_M ICE_M(0xFF, 16) +#define PFQF_PECNT_0 0x00470200 /* Reset Source: CORER */ +#define PFQF_PECNT_0_BUCKETCNT_S 0 +#define PFQF_PECNT_0_BUCKETCNT_M ICE_M(0x3FFFF, 0) +#define PFQF_PECNT_1 0x00470300 /* Reset Source: CORER */ +#define PFQF_PECNT_1_FLTCNT_S 0 +#define PFQF_PECNT_1_FLTCNT_M ICE_M(0x3FFFF, 0) +#define VPQF_PE_CTL1(_VF) (0x00474000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPQF_PE_CTL1_MAX_INDEX 255 +#define VPQF_PE_CTL1_PEHSIZE_S 0 +#define VPQF_PE_CTL1_PEHSIZE_M ICE_M(0xF, 0) +#define VPQF_PE_CTL2(_VF) (0x00474800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPQF_PE_CTL2_MAX_INDEX 255 +#define VPQF_PE_CTL2_PEDSIZE_S 0 +#define VPQF_PE_CTL2_PEDSIZE_M ICE_M(0xF, 0) +#define VPQF_PE_FILTERING_ENA(_VF) (0x00455800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPQF_PE_FILTERING_ENA_MAX_INDEX 255 +#define VPQF_PE_FILTERING_ENA_PE_ENA_S 0 +#define VPQF_PE_FILTERING_ENA_PE_ENA_M BIT(0) +#define VPQF_PE_FLHD(_VF) (0x00472000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPQF_PE_FLHD_MAX_INDEX 255 +#define VPQF_PE_FLHD_FLHD_S 0 +#define VPQF_PE_FLHD_FLHD_M ICE_M(0xFFFFFF, 0) +#define VPQF_PECNT_0(_VF) (0x00472800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPQF_PECNT_0_MAX_INDEX 255 +#define VPQF_PECNT_0_BUCKETCNT_S 0 +#define VPQF_PECNT_0_BUCKETCNT_M ICE_M(0x3FFFF, 0) +#define VPQF_PECNT_1(_VF) (0x00473000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPQF_PECNT_1_MAX_INDEX 255 +#define VPQF_PECNT_1_FLTCNT_S 0 +#define VPQF_PECNT_1_FLTCNT_M ICE_M(0x3FFFF, 0) +#define GLDCB_RMPMC 0x001223C8 /* Reset Source: CORER */ +#define GLDCB_RMPMC_RSPM_S 0 +#define GLDCB_RMPMC_RSPM_M ICE_M(0x3F, 0) +#define GLDCB_RMPMC_MIQ_NODROP_MODE_S 6 +#define GLDCB_RMPMC_MIQ_NODROP_MODE_M ICE_M(0x1F, 6) +#define GLDCB_RMPMC_RPM_DIS_S 31 +#define GLDCB_RMPMC_RPM_DIS_M BIT(31) +#define GLDCB_RMPMS 0x001223CC /* Reset Source: CORER */ +#define GLDCB_RMPMS_RMPM_S 0 +#define GLDCB_RMPMS_RMPM_M ICE_M(0xFFFF, 0) +#define GLDCB_RPCC 0x00122260 /* Reset 
Source: CORER */ +#define GLDCB_RPCC_EN_S 0 +#define GLDCB_RPCC_EN_M BIT(0) +#define GLDCB_RPCC_SCL_FACT_S 4 +#define GLDCB_RPCC_SCL_FACT_M ICE_M(0x1F, 4) +#define GLDCB_RPCC_THRSH_S 16 +#define GLDCB_RPCC_THRSH_M ICE_M(0xFFF, 16) +#define GLDCB_RSPMC 0x001223C4 /* Reset Source: CORER */ +#define GLDCB_RSPMC_RSPM_S 0 +#define GLDCB_RSPMC_RSPM_M ICE_M(0xFF, 0) +#define GLDCB_RSPMC_RPM_MODE_S 8 +#define GLDCB_RSPMC_RPM_MODE_M ICE_M(0x3, 8) +#define GLDCB_RSPMC_PRR_MAX_EXP_S 10 +#define GLDCB_RSPMC_PRR_MAX_EXP_M ICE_M(0xF, 10) +#define GLDCB_RSPMC_PFCTIMER_S 14 +#define GLDCB_RSPMC_PFCTIMER_M ICE_M(0x3FFF, 14) +#define GLDCB_RSPMC_RPM_DIS_S 31 +#define GLDCB_RSPMC_RPM_DIS_M BIT(31) +#define GLDCB_RSPMS 0x001223C0 /* Reset Source: CORER */ +#define GLDCB_RSPMS_RSPM_S 0 +#define GLDCB_RSPMS_RSPM_M ICE_M(0x3FFFF, 0) +#define GLDCB_RTCTI 0x001223D0 /* Reset Source: CORER */ +#define GLDCB_RTCTI_PFCTIMEOUT_TC_S 0 +#define GLDCB_RTCTI_PFCTIMEOUT_TC_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_RTCTQ(_i) (0x001222C0 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLDCB_RTCTQ_MAX_INDEX 31 +#define GLDCB_RTCTQ_RXQNUM_S 0 +#define GLDCB_RTCTQ_RXQNUM_M ICE_M(0x7FF, 0) +#define GLDCB_RTCTQ_IS_PF_Q_S 16 +#define GLDCB_RTCTQ_IS_PF_Q_M BIT(16) +#define GLDCB_RTCTS(_i) (0x00122340 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLDCB_RTCTS_MAX_INDEX 31 +#define GLDCB_RTCTS_PFCTIMER_S 0 +#define GLDCB_RTCTS_PFCTIMER_M ICE_M(0x3FFF, 0) +#define GLRCB_CFG_COTF_CNT(_i) (0x001223D4 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLRCB_CFG_COTF_CNT_MAX_INDEX 7 +#define GLRCB_CFG_COTF_CNT_MRKR_COTF_CNT_S 0 +#define GLRCB_CFG_COTF_CNT_MRKR_COTF_CNT_M ICE_M(0x3F, 0) +#define GLRCB_CFG_COTF_ST 0x001223F4 /* Reset Source: CORER */ +#define GLRCB_CFG_COTF_ST_MRKR_COTF_ST_S 0 +#define GLRCB_CFG_COTF_ST_MRKR_COTF_ST_M ICE_M(0xFF, 0) +#define GLRPRS_PMCFG_DHW(_i) (0x00200388 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLRPRS_PMCFG_DHW_MAX_INDEX 15 +#define GLRPRS_PMCFG_DHW_DHW_S 0 +#define GLRPRS_PMCFG_DHW_DHW_M ICE_M(0xFFFFF, 0) +#define GLRPRS_PMCFG_DLW(_i) (0x002003C8 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLRPRS_PMCFG_DLW_MAX_INDEX 15 +#define GLRPRS_PMCFG_DLW_DLW_S 0 +#define GLRPRS_PMCFG_DLW_DLW_M ICE_M(0xFFFFF, 0) +#define GLRPRS_PMCFG_DPS(_i) (0x00200308 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLRPRS_PMCFG_DPS_MAX_INDEX 15 +#define GLRPRS_PMCFG_DPS_DPS_S 0 +#define GLRPRS_PMCFG_DPS_DPS_M ICE_M(0xFFFFF, 0) +#define GLRPRS_PMCFG_SHW(_i) (0x00200448 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLRPRS_PMCFG_SHW_MAX_INDEX 7 +#define GLRPRS_PMCFG_SHW_SHW_S 0 +#define GLRPRS_PMCFG_SHW_SHW_M ICE_M(0xFFFFF, 0) +#define GLRPRS_PMCFG_SLW(_i) (0x00200468 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLRPRS_PMCFG_SLW_MAX_INDEX 7 +#define GLRPRS_PMCFG_SLW_SLW_S 0 +#define GLRPRS_PMCFG_SLW_SLW_M ICE_M(0xFFFFF, 0) +#define GLRPRS_PMCFG_SPS(_i) (0x00200408 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLRPRS_PMCFG_SPS_MAX_INDEX 7 +#define GLRPRS_PMCFG_SPS_SPS_S 0 +#define GLRPRS_PMCFG_SPS_SPS_M ICE_M(0xFFFFF, 0) +#define GLRPRS_PMCFG_TC_CFG(_i) (0x00200488 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLRPRS_PMCFG_TC_CFG_MAX_INDEX 31 +#define GLRPRS_PMCFG_TC_CFG_D_POOL_S 0 +#define GLRPRS_PMCFG_TC_CFG_D_POOL_M ICE_M(0xF, 0) +#define GLRPRS_PMCFG_TC_CFG_S_POOL_S 16 +#define GLRPRS_PMCFG_TC_CFG_S_POOL_M ICE_M(0x7, 16) +#define GLRPRS_PMCFG_TCHW(_i) 
(0x00200588 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLRPRS_PMCFG_TCHW_MAX_INDEX 31 +#define GLRPRS_PMCFG_TCHW_TCHW_S 0 +#define GLRPRS_PMCFG_TCHW_TCHW_M ICE_M(0xFFFFF, 0) +#define GLRPRS_PMCFG_TCLW(_i) (0x00200608 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLRPRS_PMCFG_TCLW_MAX_INDEX 31 +#define GLRPRS_PMCFG_TCLW_TCLW_S 0 +#define GLRPRS_PMCFG_TCLW_TCLW_M ICE_M(0xFFFFF, 0) +#define GLSWT_PMCFG_TC_CFG(_i) (0x00204900 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLSWT_PMCFG_TC_CFG_MAX_INDEX 31 +#define GLSWT_PMCFG_TC_CFG_D_POOL_S 0 +#define GLSWT_PMCFG_TC_CFG_D_POOL_M ICE_M(0xF, 0) +#define GLSWT_PMCFG_TC_CFG_S_POOL_S 16 +#define GLSWT_PMCFG_TC_CFG_S_POOL_M ICE_M(0x7, 16) +#define PRTDCB_RLANPMS 0x00122280 /* Reset Source: CORER */ +#define PRTDCB_RLANPMS_LANRPPM_S 0 +#define PRTDCB_RLANPMS_LANRPPM_M ICE_M(0x3FFFF, 0) +#define PRTDCB_RPPMC 0x00122240 /* Reset Source: CORER */ +#define PRTDCB_RPPMC_LANRPPM_S 0 +#define PRTDCB_RPPMC_LANRPPM_M ICE_M(0xFF, 0) +#define PRTDCB_RPPMC_RDMARPPM_S 8 +#define PRTDCB_RPPMC_RDMARPPM_M ICE_M(0xFF, 8) +#define PRTDCB_RRDMAPMS 0x00122120 /* Reset Source: CORER */ +#define PRTDCB_RRDMAPMS_RDMARPPM_S 0 +#define PRTDCB_RRDMAPMS_RDMARPPM_M ICE_M(0x3FFFF, 0) +#define GL_STAT_SWR_BPCH(_i) (0x00347804 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_BPCH_MAX_INDEX 127 +#define GL_STAT_SWR_BPCH_VLBPCH_S 0 +#define GL_STAT_SWR_BPCH_VLBPCH_M ICE_M(0xFF, 0) +#define GL_STAT_SWR_BPCL(_i) (0x00347800 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_BPCL_MAX_INDEX 127 +#define GL_STAT_SWR_BPCL_VLBPCL_S 0 +#define GL_STAT_SWR_BPCL_VLBPCL_M ICE_M(0xFFFFFFFF, 0) +#define GL_STAT_SWR_GORCH(_i) (0x00342004 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_GORCH_MAX_INDEX 127 +#define GL_STAT_SWR_GORCH_VLBCH_S 0 +#define GL_STAT_SWR_GORCH_VLBCH_M ICE_M(0xFF, 0) +#define GL_STAT_SWR_GORCL(_i) (0x00342000 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_GORCL_MAX_INDEX 127 +#define GL_STAT_SWR_GORCL_VLBCL_S 0 +#define GL_STAT_SWR_GORCL_VLBCL_M ICE_M(0xFFFFFFFF, 0) +#define GL_STAT_SWR_GOTCH(_i) (0x00304004 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_GOTCH_MAX_INDEX 127 +#define GL_STAT_SWR_GOTCH_VLBCH_S 0 +#define GL_STAT_SWR_GOTCH_VLBCH_M ICE_M(0xFF, 0) +#define GL_STAT_SWR_GOTCL(_i) (0x00304000 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_GOTCL_MAX_INDEX 127 +#define GL_STAT_SWR_GOTCL_VLBCL_S 0 +#define GL_STAT_SWR_GOTCL_VLBCL_M ICE_M(0xFFFFFFFF, 0) +#define GL_STAT_SWR_MPCH(_i) (0x00347404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_MPCH_MAX_INDEX 127 +#define GL_STAT_SWR_MPCH_VLMPCH_S 0 +#define GL_STAT_SWR_MPCH_VLMPCH_M ICE_M(0xFF, 0) +#define GL_STAT_SWR_MPCL(_i) (0x00347400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_MPCL_MAX_INDEX 127 +#define GL_STAT_SWR_MPCL_VLMPCL_S 0 +#define GL_STAT_SWR_MPCL_VLMPCL_M ICE_M(0xFFFFFFFF, 0) +#define GL_STAT_SWR_UPCH(_i) (0x00347004 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_UPCH_MAX_INDEX 127 +#define GL_STAT_SWR_UPCH_VLUPCH_S 0 +#define GL_STAT_SWR_UPCH_VLUPCH_M ICE_M(0xFF, 0) +#define GL_STAT_SWR_UPCL(_i) (0x00347000 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_UPCL_MAX_INDEX 127 +#define GL_STAT_SWR_UPCL_VLUPCL_S 0 +#define GL_STAT_SWR_UPCL_VLUPCL_M 
ICE_M(0xFFFFFFFF, 0) +#define GLPRT_AORCL(_i) (0x003812C0 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_AORCL_MAX_INDEX 7 +#define GLPRT_AORCL_AORCL_S 0 +#define GLPRT_AORCL_AORCL_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_BPRCH(_i) (0x00381384 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_BPRCH_MAX_INDEX 7 +#define GLPRT_BPRCH_UPRCH_S 0 +#define GLPRT_BPRCH_UPRCH_M ICE_M(0xFF, 0) +#define GLPRT_BPRCL(_i) (0x00381380 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_BPRCL_MAX_INDEX 7 +#define GLPRT_BPRCL_UPRCH_S 0 +#define GLPRT_BPRCL_UPRCH_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_BPTCH(_i) (0x00381244 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_BPTCH_MAX_INDEX 7 +#define GLPRT_BPTCH_UPRCH_S 0 +#define GLPRT_BPTCH_UPRCH_M ICE_M(0xFF, 0) +#define GLPRT_BPTCL(_i) (0x00381240 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_BPTCL_MAX_INDEX 7 +#define GLPRT_BPTCL_UPRCH_S 0 +#define GLPRT_BPTCL_UPRCH_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_CRCERRS(_i) (0x00380100 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_CRCERRS_MAX_INDEX 7 +#define GLPRT_CRCERRS_CRCERRS_S 0 +#define GLPRT_CRCERRS_CRCERRS_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_CRCERRS_H(_i) (0x00380104 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_CRCERRS_H_MAX_INDEX 7 +#define GLPRT_CRCERRS_H_CRCERRS_S 0 +#define GLPRT_CRCERRS_H_CRCERRS_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_GORCH(_i) (0x00380004 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_GORCH_MAX_INDEX 7 +#define GLPRT_GORCH_GORCH_S 0 +#define GLPRT_GORCH_GORCH_M ICE_M(0xFF, 0) +#define GLPRT_GORCL(_i) (0x00380000 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_GORCL_MAX_INDEX 7 +#define GLPRT_GORCL_GORCL_S 0 +#define GLPRT_GORCL_GORCL_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_GOTCH(_i) (0x00380B44 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_GOTCH_MAX_INDEX 7 +#define GLPRT_GOTCH_GOTCH_S 0 +#define GLPRT_GOTCH_GOTCH_M ICE_M(0xFF, 0) +#define GLPRT_GOTCL(_i) (0x00380B40 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_GOTCL_MAX_INDEX 7 +#define GLPRT_GOTCL_GOTCL_S 0 +#define GLPRT_GOTCL_GOTCL_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_ILLERRC(_i) (0x003801C0 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_ILLERRC_MAX_INDEX 7 +#define GLPRT_ILLERRC_ILLERRC_S 0 +#define GLPRT_ILLERRC_ILLERRC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_ILLERRC_H(_i) (0x003801C4 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_ILLERRC_H_MAX_INDEX 7 +#define GLPRT_ILLERRC_H_ILLERRC_S 0 +#define GLPRT_ILLERRC_H_ILLERRC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_LXOFFRXC(_i) (0x003802C0 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_LXOFFRXC_MAX_INDEX 7 +#define GLPRT_LXOFFRXC_LXOFFRXCNT_S 0 +#define GLPRT_LXOFFRXC_LXOFFRXCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_LXOFFRXC_H(_i) (0x003802C4 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_LXOFFRXC_H_MAX_INDEX 7 +#define GLPRT_LXOFFRXC_H_LXOFFRXCNT_S 0 +#define GLPRT_LXOFFRXC_H_LXOFFRXCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_LXOFFTXC(_i) (0x00381180 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_LXOFFTXC_MAX_INDEX 7 +#define GLPRT_LXOFFTXC_LXOFFTXC_S 0 +#define GLPRT_LXOFFTXC_LXOFFTXC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_LXOFFTXC_H(_i) (0x00381184 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define 
GLPRT_LXOFFTXC_H_MAX_INDEX 7 +#define GLPRT_LXOFFTXC_H_LXOFFTXC_S 0 +#define GLPRT_LXOFFTXC_H_LXOFFTXC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_LXONRXC(_i) (0x00380280 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_LXONRXC_MAX_INDEX 7 +#define GLPRT_LXONRXC_LXONRXCNT_S 0 +#define GLPRT_LXONRXC_LXONRXCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_LXONRXC_H(_i) (0x00380284 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_LXONRXC_H_MAX_INDEX 7 +#define GLPRT_LXONRXC_H_LXONRXCNT_S 0 +#define GLPRT_LXONRXC_H_LXONRXCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_LXONTXC(_i) (0x00381140 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_LXONTXC_MAX_INDEX 7 +#define GLPRT_LXONTXC_LXONTXC_S 0 +#define GLPRT_LXONTXC_LXONTXC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_LXONTXC_H(_i) (0x00381144 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_LXONTXC_H_MAX_INDEX 7 +#define GLPRT_LXONTXC_H_LXONTXC_S 0 +#define GLPRT_LXONTXC_H_LXONTXC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_MLFC(_i) (0x00380040 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_MLFC_MAX_INDEX 7 +#define GLPRT_MLFC_MLFC_S 0 +#define GLPRT_MLFC_MLFC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_MLFC_H(_i) (0x00380044 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_MLFC_H_MAX_INDEX 7 +#define GLPRT_MLFC_H_MLFC_S 0 +#define GLPRT_MLFC_H_MLFC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_MPRCH(_i) (0x00381344 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_MPRCH_MAX_INDEX 7 +#define GLPRT_MPRCH_MPRCH_S 0 +#define GLPRT_MPRCH_MPRCH_M ICE_M(0xFF, 0) +#define GLPRT_MPRCL(_i) (0x00381340 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_MPRCL_MAX_INDEX 7 +#define GLPRT_MPRCL_MPRCL_S 0 +#define GLPRT_MPRCL_MPRCL_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_MPTCH(_i) (0x00381204 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_MPTCH_MAX_INDEX 7 +#define GLPRT_MPTCH_MPTCH_S 0 +#define GLPRT_MPTCH_MPTCH_M ICE_M(0xFF, 0) +#define GLPRT_MPTCL(_i) (0x00381200 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_MPTCL_MAX_INDEX 7 +#define GLPRT_MPTCL_MPTCL_S 0 +#define GLPRT_MPTCL_MPTCL_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_MRFC(_i) (0x00380080 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_MRFC_MAX_INDEX 7 +#define GLPRT_MRFC_MRFC_S 0 +#define GLPRT_MRFC_MRFC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_MRFC_H(_i) (0x00380084 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_MRFC_H_MAX_INDEX 7 +#define GLPRT_MRFC_H_MRFC_S 0 +#define GLPRT_MRFC_H_MRFC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PRC1023H(_i) (0x00380A04 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC1023H_MAX_INDEX 7 +#define GLPRT_PRC1023H_PRC1023H_S 0 +#define GLPRT_PRC1023H_PRC1023H_M ICE_M(0xFF, 0) +#define GLPRT_PRC1023L(_i) (0x00380A00 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC1023L_MAX_INDEX 7 +#define GLPRT_PRC1023L_PRC1023L_S 0 +#define GLPRT_PRC1023L_PRC1023L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PRC127H(_i) (0x00380944 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC127H_MAX_INDEX 7 +#define GLPRT_PRC127H_PRC127H_S 0 +#define GLPRT_PRC127H_PRC127H_M ICE_M(0xFF, 0) +#define GLPRT_PRC127L(_i) (0x00380940 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC127L_MAX_INDEX 7 +#define GLPRT_PRC127L_PRC127L_S 0 +#define GLPRT_PRC127L_PRC127L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PRC1522H(_i) 
(0x00380A44 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC1522H_MAX_INDEX 7 +#define GLPRT_PRC1522H_PRC1522H_S 0 +#define GLPRT_PRC1522H_PRC1522H_M ICE_M(0xFF, 0) +#define GLPRT_PRC1522L(_i) (0x00380A40 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC1522L_MAX_INDEX 7 +#define GLPRT_PRC1522L_PRC1522L_S 0 +#define GLPRT_PRC1522L_PRC1522L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PRC255H(_i) (0x00380984 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC255H_MAX_INDEX 7 +#define GLPRT_PRC255H_PRTPRC255H_S 0 +#define GLPRT_PRC255H_PRTPRC255H_M ICE_M(0xFF, 0) +#define GLPRT_PRC255L(_i) (0x00380980 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC255L_MAX_INDEX 7 +#define GLPRT_PRC255L_PRC255L_S 0 +#define GLPRT_PRC255L_PRC255L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PRC511H(_i) (0x003809C4 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC511H_MAX_INDEX 7 +#define GLPRT_PRC511H_PRC511H_S 0 +#define GLPRT_PRC511H_PRC511H_M ICE_M(0xFF, 0) +#define GLPRT_PRC511L(_i) (0x003809C0 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC511L_MAX_INDEX 7 +#define GLPRT_PRC511L_PRC511L_S 0 +#define GLPRT_PRC511L_PRC511L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PRC64H(_i) (0x00380904 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC64H_MAX_INDEX 7 +#define GLPRT_PRC64H_PRC64H_S 0 +#define GLPRT_PRC64H_PRC64H_M ICE_M(0xFF, 0) +#define GLPRT_PRC64L(_i) (0x00380900 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC64L_MAX_INDEX 7 +#define GLPRT_PRC64L_PRC64L_S 0 +#define GLPRT_PRC64L_PRC64L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PRC9522H(_i) (0x00380A84 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC9522H_MAX_INDEX 7 +#define GLPRT_PRC9522H_PRC1522H_S 0 +#define GLPRT_PRC9522H_PRC1522H_M ICE_M(0xFF, 0) +#define GLPRT_PRC9522L(_i) (0x00380A80 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC9522L_MAX_INDEX 7 +#define GLPRT_PRC9522L_PRC1522L_S 0 +#define GLPRT_PRC9522L_PRC1522L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PTC1023H(_i) (0x00380C84 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC1023H_MAX_INDEX 7 +#define GLPRT_PTC1023H_PTC1023H_S 0 +#define GLPRT_PTC1023H_PTC1023H_M ICE_M(0xFF, 0) +#define GLPRT_PTC1023L(_i) (0x00380C80 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC1023L_MAX_INDEX 7 +#define GLPRT_PTC1023L_PTC1023L_S 0 +#define GLPRT_PTC1023L_PTC1023L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PTC127H(_i) (0x00380BC4 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC127H_MAX_INDEX 7 +#define GLPRT_PTC127H_PTC127H_S 0 +#define GLPRT_PTC127H_PTC127H_M ICE_M(0xFF, 0) +#define GLPRT_PTC127L(_i) (0x00380BC0 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC127L_MAX_INDEX 7 +#define GLPRT_PTC127L_PTC127L_S 0 +#define GLPRT_PTC127L_PTC127L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PTC1522H(_i) (0x00380CC4 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC1522H_MAX_INDEX 7 +#define GLPRT_PTC1522H_PTC1522H_S 0 +#define GLPRT_PTC1522H_PTC1522H_M ICE_M(0xFF, 0) +#define GLPRT_PTC1522L(_i) (0x00380CC0 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC1522L_MAX_INDEX 7 +#define GLPRT_PTC1522L_PTC1522L_S 0 +#define GLPRT_PTC1522L_PTC1522L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PTC255H(_i) (0x00380C04 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ 
+#define GLPRT_PTC255H_MAX_INDEX 7 +#define GLPRT_PTC255H_PTC255H_S 0 +#define GLPRT_PTC255H_PTC255H_M ICE_M(0xFF, 0) +#define GLPRT_PTC255L(_i) (0x00380C00 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC255L_MAX_INDEX 7 +#define GLPRT_PTC255L_PTC255L_S 0 +#define GLPRT_PTC255L_PTC255L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PTC511H(_i) (0x00380C44 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC511H_MAX_INDEX 7 +#define GLPRT_PTC511H_PTC511H_S 0 +#define GLPRT_PTC511H_PTC511H_M ICE_M(0xFF, 0) +#define GLPRT_PTC511L(_i) (0x00380C40 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC511L_MAX_INDEX 7 +#define GLPRT_PTC511L_PTC511L_S 0 +#define GLPRT_PTC511L_PTC511L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PTC64H(_i) (0x00380B84 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC64H_MAX_INDEX 7 +#define GLPRT_PTC64H_PTC64H_S 0 +#define GLPRT_PTC64H_PTC64H_M ICE_M(0xFF, 0) +#define GLPRT_PTC64L(_i) (0x00380B80 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC64L_MAX_INDEX 7 +#define GLPRT_PTC64L_PTC64L_S 0 +#define GLPRT_PTC64L_PTC64L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PTC9522H(_i) (0x00380D04 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC9522H_MAX_INDEX 7 +#define GLPRT_PTC9522H_PTC9522H_S 0 +#define GLPRT_PTC9522H_PTC9522H_M ICE_M(0xFF, 0) +#define GLPRT_PTC9522L(_i) (0x00380D00 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC9522L_MAX_INDEX 7 +#define GLPRT_PTC9522L_PTC9522L_S 0 +#define GLPRT_PTC9522L_PTC9522L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PXOFFRXC(_i, _j) (0x00380500 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PXOFFRXC_MAX_INDEX 7 +#define GLPRT_PXOFFRXC_PRPXOFFRXCNT_S 0 +#define GLPRT_PXOFFRXC_PRPXOFFRXCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PXOFFRXC_H(_i, _j) (0x00380504 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PXOFFRXC_H_MAX_INDEX 7 +#define GLPRT_PXOFFRXC_H_PRPXOFFRXCNT_S 0 +#define GLPRT_PXOFFRXC_H_PRPXOFFRXCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PXOFFTXC(_i, _j) (0x00380F40 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PXOFFTXC_MAX_INDEX 7 +#define GLPRT_PXOFFTXC_PRPXOFFTXCNT_S 0 +#define GLPRT_PXOFFTXC_PRPXOFFTXCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PXOFFTXC_H(_i, _j) (0x00380F44 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PXOFFTXC_H_MAX_INDEX 7 +#define GLPRT_PXOFFTXC_H_PRPXOFFTXCNT_S 0 +#define GLPRT_PXOFFTXC_H_PRPXOFFTXCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PXONRXC(_i, _j) (0x00380300 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PXONRXC_MAX_INDEX 7 +#define GLPRT_PXONRXC_PRPXONRXCNT_S 0 +#define GLPRT_PXONRXC_PRPXONRXCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PXONRXC_H(_i, _j) (0x00380304 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PXONRXC_H_MAX_INDEX 7 +#define GLPRT_PXONRXC_H_PRPXONRXCNT_S 0 +#define GLPRT_PXONRXC_H_PRPXONRXCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PXONTXC(_i, _j) (0x00380D40 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PXONTXC_MAX_INDEX 7 +#define GLPRT_PXONTXC_PRPXONTXC_S 0 +#define GLPRT_PXONTXC_PRPXONTXC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PXONTXC_H(_i, _j) (0x00380D44 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER 
*/ +#define GLPRT_PXONTXC_H_MAX_INDEX 7 +#define GLPRT_PXONTXC_H_PRPXONTXC_S 0 +#define GLPRT_PXONTXC_H_PRPXONTXC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_RFC(_i) (0x00380AC0 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_RFC_MAX_INDEX 7 +#define GLPRT_RFC_RFC_S 0 +#define GLPRT_RFC_RFC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_RFC_H(_i) (0x00380AC4 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_RFC_H_MAX_INDEX 7 +#define GLPRT_RFC_H_RFC_S 0 +#define GLPRT_RFC_H_RFC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_RJC(_i) (0x00380B00 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_RJC_MAX_INDEX 7 +#define GLPRT_RJC_RJC_S 0 +#define GLPRT_RJC_RJC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_RJC_H(_i) (0x00380B04 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_RJC_H_MAX_INDEX 7 +#define GLPRT_RJC_H_RJC_S 0 +#define GLPRT_RJC_H_RJC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_RLEC(_i) (0x00380140 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_RLEC_MAX_INDEX 7 +#define GLPRT_RLEC_RLEC_S 0 +#define GLPRT_RLEC_RLEC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_RLEC_H(_i) (0x00380144 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_RLEC_H_MAX_INDEX 7 +#define GLPRT_RLEC_H_RLEC_S 0 +#define GLPRT_RLEC_H_RLEC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_ROC(_i) (0x00380240 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_ROC_MAX_INDEX 7 +#define GLPRT_ROC_ROC_S 0 +#define GLPRT_ROC_ROC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_ROC_H(_i) (0x00380244 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_ROC_H_MAX_INDEX 7 +#define GLPRT_ROC_H_ROC_S 0 +#define GLPRT_ROC_H_ROC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_RUC(_i) (0x00380200 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_RUC_MAX_INDEX 7 +#define GLPRT_RUC_RUC_S 0 +#define GLPRT_RUC_RUC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_RUC_H(_i) (0x00380204 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_RUC_H_MAX_INDEX 7 +#define GLPRT_RUC_H_RUC_S 0 +#define GLPRT_RUC_H_RUC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_RXON2OFFCNT(_i, _j) (0x00380700 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */ +#define GLPRT_RXON2OFFCNT_MAX_INDEX 7 +#define GLPRT_RXON2OFFCNT_PRRXON2OFFCNT_S 0 +#define GLPRT_RXON2OFFCNT_PRRXON2OFFCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_RXON2OFFCNT_H(_i, _j) (0x00380704 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */ +#define GLPRT_RXON2OFFCNT_H_MAX_INDEX 7 +#define GLPRT_RXON2OFFCNT_H_PRRXON2OFFCNT_S 0 +#define GLPRT_RXON2OFFCNT_H_PRRXON2OFFCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_STDC(_i) (0x00340000 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_STDC_MAX_INDEX 7 +#define GLPRT_STDC_STDC_S 0 +#define GLPRT_STDC_STDC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_TDOLD(_i) (0x00381280 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_TDOLD_MAX_INDEX 7 +#define GLPRT_TDOLD_GLPRT_TDOLD_S 0 +#define GLPRT_TDOLD_GLPRT_TDOLD_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_TDOLD_H(_i) (0x00381284 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_TDOLD_H_MAX_INDEX 7 +#define GLPRT_TDOLD_H_GLPRT_TDOLD_S 0 +#define GLPRT_TDOLD_H_GLPRT_TDOLD_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_UPRCH(_i) (0x00381304 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_UPRCH_MAX_INDEX 7 +#define GLPRT_UPRCH_UPRCH_S 0 +#define GLPRT_UPRCH_UPRCH_M ICE_M(0xFF, 0) +#define GLPRT_UPRCL(_i) 
(0x00381300 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_UPRCL_MAX_INDEX 7 +#define GLPRT_UPRCL_UPRCL_S 0 +#define GLPRT_UPRCL_UPRCL_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_UPTCH(_i) (0x003811C4 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_UPTCH_MAX_INDEX 7 +#define GLPRT_UPTCH_UPTCH_S 0 +#define GLPRT_UPTCH_UPTCH_M ICE_M(0xFF, 0) +#define GLPRT_UPTCL(_i) (0x003811C0 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_UPTCL_MAX_INDEX 7 +#define GLPRT_UPTCL_VUPTCH_S 0 +#define GLPRT_UPTCL_VUPTCH_M ICE_M(0xFFFFFFFF, 0) +#define GLSTAT_ACL_CNT_0_H(_i) (0x00388004 + ((_i) * 8)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLSTAT_ACL_CNT_0_H_MAX_INDEX 511 +#define GLSTAT_ACL_CNT_0_H_CNT_MSB_S 0 +#define GLSTAT_ACL_CNT_0_H_CNT_MSB_M ICE_M(0xFF, 0) +#define GLSTAT_ACL_CNT_0_L(_i) (0x00388000 + ((_i) * 8)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLSTAT_ACL_CNT_0_L_MAX_INDEX 511 +#define GLSTAT_ACL_CNT_0_L_CNT_LSB_S 0 +#define GLSTAT_ACL_CNT_0_L_CNT_LSB_M ICE_M(0xFFFFFFFF, 0) +#define GLSTAT_ACL_CNT_1_H(_i) (0x00389004 + ((_i) * 8)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLSTAT_ACL_CNT_1_H_MAX_INDEX 511 +#define GLSTAT_ACL_CNT_1_H_CNT_MSB_S 0 +#define GLSTAT_ACL_CNT_1_H_CNT_MSB_M ICE_M(0xFF, 0) +#define GLSTAT_ACL_CNT_1_L(_i) (0x00389000 + ((_i) * 8)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLSTAT_ACL_CNT_1_L_MAX_INDEX 511 +#define GLSTAT_ACL_CNT_1_L_CNT_LSB_S 0 +#define GLSTAT_ACL_CNT_1_L_CNT_LSB_M ICE_M(0xFFFFFFFF, 0) +#define GLSTAT_ACL_CNT_2_H(_i) (0x0038A004 + ((_i) * 8)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLSTAT_ACL_CNT_2_H_MAX_INDEX 511 +#define GLSTAT_ACL_CNT_2_H_CNT_MSB_S 0 +#define GLSTAT_ACL_CNT_2_H_CNT_MSB_M ICE_M(0xFF, 0) +#define GLSTAT_ACL_CNT_2_L(_i) (0x0038A000 + ((_i) * 8)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLSTAT_ACL_CNT_2_L_MAX_INDEX 511 +#define GLSTAT_ACL_CNT_2_L_CNT_LSB_S 0 +#define GLSTAT_ACL_CNT_2_L_CNT_LSB_M ICE_M(0xFFFFFFFF, 0) +#define GLSTAT_ACL_CNT_3_H(_i) (0x0038B004 + ((_i) * 8)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLSTAT_ACL_CNT_3_H_MAX_INDEX 511 +#define GLSTAT_ACL_CNT_3_H_CNT_MSB_S 0 +#define GLSTAT_ACL_CNT_3_H_CNT_MSB_M ICE_M(0xFF, 0) +#define GLSTAT_ACL_CNT_3_L(_i) (0x0038B000 + ((_i) * 8)) /* _i=0...511 */ /* Reset Source: CORER */ +#define GLSTAT_ACL_CNT_3_L_MAX_INDEX 511 +#define GLSTAT_ACL_CNT_3_L_CNT_LSB_S 0 +#define GLSTAT_ACL_CNT_3_L_CNT_LSB_M ICE_M(0xFFFFFFFF, 0) +#define GLSTAT_FD_CNT0H(_i) (0x003A0004 + ((_i) * 8)) /* _i=0...4095 */ /* Reset Source: CORER */ +#define GLSTAT_FD_CNT0H_MAX_INDEX 4095 +#define GLSTAT_FD_CNT0H_FD0_CNT_H_S 0 +#define GLSTAT_FD_CNT0H_FD0_CNT_H_M ICE_M(0xFF, 0) +#define GLSTAT_FD_CNT0L(_i) (0x003A0000 + ((_i) * 8)) /* _i=0...4095 */ /* Reset Source: CORER */ +#define GLSTAT_FD_CNT0L_MAX_INDEX 4095 +#define GLSTAT_FD_CNT0L_FD0_CNT_L_S 0 +#define GLSTAT_FD_CNT0L_FD0_CNT_L_M ICE_M(0xFFFFFFFF, 0) +#define GLSTAT_FD_CNT1H(_i) (0x003A8004 + ((_i) * 8)) /* _i=0...4095 */ /* Reset Source: CORER */ +#define GLSTAT_FD_CNT1H_MAX_INDEX 4095 +#define GLSTAT_FD_CNT1H_FD0_CNT_H_S 0 +#define GLSTAT_FD_CNT1H_FD0_CNT_H_M ICE_M(0xFF, 0) +#define GLSTAT_FD_CNT1L(_i) (0x003A8000 + ((_i) * 8)) /* _i=0...4095 */ /* Reset Source: CORER */ +#define GLSTAT_FD_CNT1L_MAX_INDEX 4095 +#define GLSTAT_FD_CNT1L_FD0_CNT_L_S 0 +#define GLSTAT_FD_CNT1L_FD0_CNT_L_M ICE_M(0xFFFFFFFF, 0) +#define GLSW_BPRCH(_i) (0x00346204 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLSW_BPRCH_MAX_INDEX 
31 +#define GLSW_BPRCH_BPRCH_S 0 +#define GLSW_BPRCH_BPRCH_M ICE_M(0xFF, 0) +#define GLSW_BPRCL(_i) (0x00346200 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLSW_BPRCL_MAX_INDEX 31 +#define GLSW_BPRCL_BPRCL_S 0 +#define GLSW_BPRCL_BPRCL_M ICE_M(0xFFFFFFFF, 0) +#define GLSW_BPTCH(_i) (0x00310204 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLSW_BPTCH_MAX_INDEX 31 +#define GLSW_BPTCH_BPTCH_S 0 +#define GLSW_BPTCH_BPTCH_M ICE_M(0xFF, 0) +#define GLSW_BPTCL(_i) (0x00310200 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLSW_BPTCL_MAX_INDEX 31 +#define GLSW_BPTCL_BPTCL_S 0 +#define GLSW_BPTCL_BPTCL_M ICE_M(0xFFFFFFFF, 0) +#define GLSW_GORCH(_i) (0x00341004 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLSW_GORCH_MAX_INDEX 31 +#define GLSW_GORCH_GORCH_S 0 +#define GLSW_GORCH_GORCH_M ICE_M(0xFF, 0) +#define GLSW_GORCL(_i) (0x00341000 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLSW_GORCL_MAX_INDEX 31 +#define GLSW_GORCL_GORCL_S 0 +#define GLSW_GORCL_GORCL_M ICE_M(0xFFFFFFFF, 0) +#define GLSW_GOTCH(_i) (0x00302004 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLSW_GOTCH_MAX_INDEX 31 +#define GLSW_GOTCH_GOTCH_S 0 +#define GLSW_GOTCH_GOTCH_M ICE_M(0xFF, 0) +#define GLSW_GOTCL(_i) (0x00302000 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLSW_GOTCL_MAX_INDEX 31 +#define GLSW_GOTCL_GOTCL_S 0 +#define GLSW_GOTCL_GOTCL_M ICE_M(0xFFFFFFFF, 0) +#define GLSW_MPRCH(_i) (0x00346104 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLSW_MPRCH_MAX_INDEX 31 +#define GLSW_MPRCH_MPRCH_S 0 +#define GLSW_MPRCH_MPRCH_M ICE_M(0xFF, 0) +#define GLSW_MPRCL(_i) (0x00346100 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLSW_MPRCL_MAX_INDEX 31 +#define GLSW_MPRCL_MPRCL_S 0 +#define GLSW_MPRCL_MPRCL_M ICE_M(0xFFFFFFFF, 0) +#define GLSW_MPTCH(_i) (0x00310104 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLSW_MPTCH_MAX_INDEX 31 +#define GLSW_MPTCH_MPTCH_S 0 +#define GLSW_MPTCH_MPTCH_M ICE_M(0xFF, 0) +#define GLSW_MPTCL(_i) (0x00310100 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLSW_MPTCL_MAX_INDEX 31 +#define GLSW_MPTCL_MPTCL_S 0 +#define GLSW_MPTCL_MPTCL_M ICE_M(0xFFFFFFFF, 0) +#define GLSW_UPRCH(_i) (0x00346004 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLSW_UPRCH_MAX_INDEX 31 +#define GLSW_UPRCH_UPRCH_S 0 +#define GLSW_UPRCH_UPRCH_M ICE_M(0xFF, 0) +#define GLSW_UPRCL(_i) (0x00346000 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLSW_UPRCL_MAX_INDEX 31 +#define GLSW_UPRCL_UPRCL_S 0 +#define GLSW_UPRCL_UPRCL_M ICE_M(0xFFFFFFFF, 0) +#define GLSW_UPTCH(_i) (0x00310004 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLSW_UPTCH_MAX_INDEX 31 +#define GLSW_UPTCH_UPTCH_S 0 +#define GLSW_UPTCH_UPTCH_M ICE_M(0xFF, 0) +#define GLSW_UPTCL(_i) (0x00310000 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLSW_UPTCL_MAX_INDEX 31 +#define GLSW_UPTCL_UPTCL_S 0 +#define GLSW_UPTCL_UPTCL_M ICE_M(0xFFFFFFFF, 0) +#define GLSWID_RUPP(_i) (0x00345000 + ((_i) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define GLSWID_RUPP_MAX_INDEX 255 +#define GLSWID_RUPP_RUPP_S 0 +#define GLSWID_RUPP_RUPP_M ICE_M(0xFFFFFFFF, 0) +#define GLV_BPRCH(_i) (0x003B6004 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */ +#define GLV_BPRCH_MAX_INDEX 767 +#define GLV_BPRCH_BPRCH_S 0 +#define GLV_BPRCH_BPRCH_M ICE_M(0xFF, 0) +#define GLV_BPRCL(_i) (0x003B6000 + ((_i) * 8)) /* 
_i=0...767 */ /* Reset Source: CORER */ +#define GLV_BPRCL_MAX_INDEX 767 +#define GLV_BPRCL_BPRCL_S 0 +#define GLV_BPRCL_BPRCL_M ICE_M(0xFFFFFFFF, 0) +#define GLV_BPTCH(_i) (0x0030E004 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */ +#define GLV_BPTCH_MAX_INDEX 767 +#define GLV_BPTCH_BPTCH_S 0 +#define GLV_BPTCH_BPTCH_M ICE_M(0xFF, 0) +#define GLV_BPTCL(_i) (0x0030E000 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */ +#define GLV_BPTCL_MAX_INDEX 767 +#define GLV_BPTCL_BPTCL_S 0 +#define GLV_BPTCL_BPTCL_M ICE_M(0xFFFFFFFF, 0) +#define GLV_GORCH(_i) (0x003B0004 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */ +#define GLV_GORCH_MAX_INDEX 767 +#define GLV_GORCH_GORCH_S 0 +#define GLV_GORCH_GORCH_M ICE_M(0xFF, 0) +#define GLV_GORCL(_i) (0x003B0000 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */ +#define GLV_GORCL_MAX_INDEX 767 +#define GLV_GORCL_GORCL_S 0 +#define GLV_GORCL_GORCL_M ICE_M(0xFFFFFFFF, 0) +#define GLV_GOTCH(_i) (0x00300004 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */ +#define GLV_GOTCH_MAX_INDEX 767 +#define GLV_GOTCH_GOTCH_S 0 +#define GLV_GOTCH_GOTCH_M ICE_M(0xFF, 0) +#define GLV_GOTCL(_i) (0x00300000 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */ +#define GLV_GOTCL_MAX_INDEX 767 +#define GLV_GOTCL_GOTCL_S 0 +#define GLV_GOTCL_GOTCL_M ICE_M(0xFFFFFFFF, 0) +#define GLV_MPRCH(_i) (0x003B4004 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */ +#define GLV_MPRCH_MAX_INDEX 767 +#define GLV_MPRCH_MPRCH_S 0 +#define GLV_MPRCH_MPRCH_M ICE_M(0xFF, 0) +#define GLV_MPRCL(_i) (0x003B4000 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */ +#define GLV_MPRCL_MAX_INDEX 767 +#define GLV_MPRCL_MPRCL_S 0 +#define GLV_MPRCL_MPRCL_M ICE_M(0xFFFFFFFF, 0) +#define GLV_MPTCH(_i) (0x0030C004 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */ +#define GLV_MPTCH_MAX_INDEX 767 +#define GLV_MPTCH_MPTCH_S 0 +#define GLV_MPTCH_MPTCH_M ICE_M(0xFF, 0) +#define GLV_MPTCL(_i) (0x0030C000 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */ +#define GLV_MPTCL_MAX_INDEX 767 +#define GLV_MPTCL_MPTCL_S 0 +#define GLV_MPTCL_MPTCL_M ICE_M(0xFFFFFFFF, 0) +#define GLV_RDPC(_i) (0x00294C04 + ((_i) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define GLV_RDPC_MAX_INDEX 767 +#define GLV_RDPC_RDPC_S 0 +#define GLV_RDPC_RDPC_M ICE_M(0xFFFFFFFF, 0) +#define GLV_REPC(_i) (0x00295804 + ((_i) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define GLV_REPC_MAX_INDEX 767 +#define GLV_REPC_NO_DESC_CNT_S 0 +#define GLV_REPC_NO_DESC_CNT_M ICE_M(0xFFFF, 0) +#define GLV_REPC_ERROR_CNT_S 16 +#define GLV_REPC_ERROR_CNT_M ICE_M(0xFFFF, 16) +#define GLV_TEPC(_VSI) (0x00312000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define GLV_TEPC_MAX_INDEX 767 +#define GLV_TEPC_TEPC_S 0 +#define GLV_TEPC_TEPC_M ICE_M(0xFFFFFFFF, 0) +#define GLV_UPRCH(_i) (0x003B2004 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */ +#define GLV_UPRCH_MAX_INDEX 767 +#define GLV_UPRCH_UPRCH_S 0 +#define GLV_UPRCH_UPRCH_M ICE_M(0xFF, 0) +#define GLV_UPRCL(_i) (0x003B2000 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */ +#define GLV_UPRCL_MAX_INDEX 767 +#define GLV_UPRCL_UPRCL_S 0 +#define GLV_UPRCL_UPRCL_M ICE_M(0xFFFFFFFF, 0) +#define GLV_UPTCH(_i) (0x0030A004 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */ +#define GLV_UPTCH_MAX_INDEX 767 +#define GLV_UPTCH_GLVUPTCH_S 0 +#define GLV_UPTCH_GLVUPTCH_M ICE_M(0xFF, 0) +#define GLV_UPTCL(_i) (0x0030A000 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */ +#define GLV_UPTCL_MAX_INDEX 767 
+#define GLV_UPTCL_UPTCL_S 0 +#define GLV_UPTCL_UPTCL_M ICE_M(0xFFFFFFFF, 0) +#define GLVEBUP_RBCH(_i, _j) (0x00343004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...31 */ /* Reset Source: CORER */ +#define GLVEBUP_RBCH_MAX_INDEX 7 +#define GLVEBUP_RBCH_UPBCH_S 0 +#define GLVEBUP_RBCH_UPBCH_M ICE_M(0xFF, 0) +#define GLVEBUP_RBCL(_i, _j) (0x00343000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...31 */ /* Reset Source: CORER */ +#define GLVEBUP_RBCL_MAX_INDEX 7 +#define GLVEBUP_RBCL_UPBCL_S 0 +#define GLVEBUP_RBCL_UPBCL_M ICE_M(0xFFFFFFFF, 0) +#define GLVEBUP_RPCH(_i, _j) (0x00344004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...31 */ /* Reset Source: CORER */ +#define GLVEBUP_RPCH_MAX_INDEX 7 +#define GLVEBUP_RPCH_UPPCH_S 0 +#define GLVEBUP_RPCH_UPPCH_M ICE_M(0xFF, 0) +#define GLVEBUP_RPCL(_i, _j) (0x00344000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...31 */ /* Reset Source: CORER */ +#define GLVEBUP_RPCL_MAX_INDEX 7 +#define GLVEBUP_RPCL_UPPCL_S 0 +#define GLVEBUP_RPCL_UPPCL_M ICE_M(0xFFFFFFFF, 0) +#define GLVEBUP_TBCH(_i, _j) (0x00306004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...31 */ /* Reset Source: CORER */ +#define GLVEBUP_TBCH_MAX_INDEX 7 +#define GLVEBUP_TBCH_UPBCH_S 0 +#define GLVEBUP_TBCH_UPBCH_M ICE_M(0xFF, 0) +#define GLVEBUP_TBCL(_i, _j) (0x00306000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...31 */ /* Reset Source: CORER */ +#define GLVEBUP_TBCL_MAX_INDEX 7 +#define GLVEBUP_TBCL_UPBCL_S 0 +#define GLVEBUP_TBCL_UPBCL_M ICE_M(0xFFFFFFFF, 0) +#define GLVEBUP_TPCH(_i, _j) (0x00308004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...31 */ /* Reset Source: CORER */ +#define GLVEBUP_TPCH_MAX_INDEX 7 +#define GLVEBUP_TPCH_UPPCH_S 0 +#define GLVEBUP_TPCH_UPPCH_M ICE_M(0xFF, 0) +#define GLVEBUP_TPCL(_i, _j) (0x00308000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...31 */ /* Reset Source: CORER */ +#define GLVEBUP_TPCL_MAX_INDEX 7 +#define GLVEBUP_TPCL_UPPCL_S 0 +#define GLVEBUP_TPCL_UPPCL_M ICE_M(0xFFFFFFFF, 0) +#define PRTRPB_LDPC 0x000AC280 /* Reset Source: CORER */ +#define PRTRPB_LDPC_CRCERRS_S 0 +#define PRTRPB_LDPC_CRCERRS_M ICE_M(0xFFFFFFFF, 0) +#define PRTRPB_RDPC 0x000AC260 /* Reset Source: CORER */ +#define PRTRPB_RDPC_CRCERRS_S 0 +#define PRTRPB_RDPC_CRCERRS_M ICE_M(0xFFFFFFFF, 0) +#define PRTTPB_STAT_TC_BYTES_SENTL(_i) (0x00098200 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define PRTTPB_STAT_TC_BYTES_SENTL_MAX_INDEX 63 +#define PRTTPB_STAT_TC_BYTES_SENTL_TCCNT_S 0 +#define PRTTPB_STAT_TC_BYTES_SENTL_TCCNT_M ICE_M(0xFFFFFFFF, 0) +#define TPB_PRTTPB_STAT_PKT_SENT(_i) (0x00099470 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define TPB_PRTTPB_STAT_PKT_SENT_MAX_INDEX 7 +#define TPB_PRTTPB_STAT_PKT_SENT_PKTCNT_S 0 +#define TPB_PRTTPB_STAT_PKT_SENT_PKTCNT_M ICE_M(0xFFFFFFFF, 0) +#define TPB_PRTTPB_STAT_TC_BYTES_SENT(_i) (0x00099094 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define TPB_PRTTPB_STAT_TC_BYTES_SENT_MAX_INDEX 63 +#define TPB_PRTTPB_STAT_TC_BYTES_SENT_TCCNT_S 0 +#define TPB_PRTTPB_STAT_TC_BYTES_SENT_TCCNT_M ICE_M(0xFFFFFFFF, 0) +#define EMP_SWT_PRUNIND 0x00204020 /* Reset Source: CORER */ +#define EMP_SWT_PRUNIND_OPCODE_S 0 +#define EMP_SWT_PRUNIND_OPCODE_M ICE_M(0xF, 0) +#define EMP_SWT_PRUNIND_LIST_INDEX_NUM_S 4 +#define EMP_SWT_PRUNIND_LIST_INDEX_NUM_M ICE_M(0x3FF, 4) +#define EMP_SWT_PRUNIND_VSI_NUM_S 16 +#define EMP_SWT_PRUNIND_VSI_NUM_M ICE_M(0x3FF, 16) +#define EMP_SWT_PRUNIND_BIT_VALUE_S 31 +#define EMP_SWT_PRUNIND_BIT_VALUE_M BIT(31) +#define EMP_SWT_REPIND 0x0020401C /* Reset Source: CORER */ +#define 
EMP_SWT_REPIND_OPCODE_S 0 +#define EMP_SWT_REPIND_OPCODE_M ICE_M(0xF, 0) +#define EMP_SWT_REPIND_LIST_INDEX_NUMBER_S 4 +#define EMP_SWT_REPIND_LIST_INDEX_NUMBER_M ICE_M(0x3FF, 4) +#define EMP_SWT_REPIND_VSI_NUM_S 16 +#define EMP_SWT_REPIND_VSI_NUM_M ICE_M(0x3FF, 16) +#define EMP_SWT_REPIND_BIT_VALUE_S 31 +#define EMP_SWT_REPIND_BIT_VALUE_M BIT(31) +#define GL_OVERRIDEC 0x002040A4 /* Reset Source: CORER */ +#define GL_OVERRIDEC_OVERRIDE_ATTEMPTC_S 0 +#define GL_OVERRIDEC_OVERRIDE_ATTEMPTC_M ICE_M(0xFFFF, 0) +#define GL_OVERRIDEC_LAST_VSI_S 16 +#define GL_OVERRIDEC_LAST_VSI_M ICE_M(0x3FF, 16) +#define GL_PLG_AVG_CALC_CFG 0x0020A5AC /* Reset Source: CORER */ +#define GL_PLG_AVG_CALC_CFG_CYCLE_LEN_S 0 +#define GL_PLG_AVG_CALC_CFG_CYCLE_LEN_M ICE_M(0x7FFFFFFF, 0) +#define GL_PLG_AVG_CALC_CFG_MODE_S 31 +#define GL_PLG_AVG_CALC_CFG_MODE_M BIT(31) +#define GL_PLG_AVG_CALC_ST 0x0020A5B0 /* Reset Source: CORER */ +#define GL_PLG_AVG_CALC_ST_IN_DATA_S 0 +#define GL_PLG_AVG_CALC_ST_IN_DATA_M ICE_M(0x7FFF, 0) +#define GL_PLG_AVG_CALC_ST_OUT_DATA_S 16 +#define GL_PLG_AVG_CALC_ST_OUT_DATA_M ICE_M(0x7FFF, 16) +#define GL_PLG_AVG_CALC_ST_VALID_S 31 +#define GL_PLG_AVG_CALC_ST_VALID_M BIT(31) +#define GL_PRE_CFG_CMD 0x00214090 /* Reset Source: CORER */ +#define GL_PRE_CFG_CMD_ADDR_S 0 +#define GL_PRE_CFG_CMD_ADDR_M ICE_M(0x1FFF, 0) +#define GL_PRE_CFG_CMD_TBLIDX_S 16 +#define GL_PRE_CFG_CMD_TBLIDX_M ICE_M(0x7, 16) +#define GL_PRE_CFG_CMD_CMD_S 29 +#define GL_PRE_CFG_CMD_CMD_M BIT(29) +#define GL_PRE_CFG_CMD_DONE_S 31 +#define GL_PRE_CFG_CMD_DONE_M BIT(31) +#define GL_PRE_CFG_DATA(_i) (0x00214074 + ((_i) * 4)) /* _i=0...6 */ /* Reset Source: CORER */ +#define GL_PRE_CFG_DATA_MAX_INDEX 6 +#define GL_PRE_CFG_DATA_GL_PRE_RCP_DATA_S 0 +#define GL_PRE_CFG_DATA_GL_PRE_RCP_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_SWT_FUNCFILT 0x001D2698 /* Reset Source: CORER */ +#define GL_SWT_FUNCFILT_FUNCFILT_S 0 +#define GL_SWT_FUNCFILT_FUNCFILT_M BIT(0) +#define GL_SWT_FW_STS(_i) (0x00216000 + ((_i) * 4)) /* _i=0...5 */ /* Reset Source: CORER */ +#define GL_SWT_FW_STS_MAX_INDEX 5 +#define GL_SWT_FW_STS_GL_SWT_FW_STS_S 0 +#define GL_SWT_FW_STS_GL_SWT_FW_STS_M ICE_M(0xFFFFFFFF, 0) +#define GL_SWT_LAT_DOUBLE 0x00204004 /* Reset Source: CORER */ +#define GL_SWT_LAT_DOUBLE_BASE_S 0 +#define GL_SWT_LAT_DOUBLE_BASE_M ICE_M(0x7FF, 0) +#define GL_SWT_LAT_DOUBLE_SIZE_S 16 +#define GL_SWT_LAT_DOUBLE_SIZE_M ICE_M(0x7FF, 16) +#define GL_SWT_LAT_QUAD 0x00204008 /* Reset Source: CORER */ +#define GL_SWT_LAT_QUAD_BASE_S 0 +#define GL_SWT_LAT_QUAD_BASE_M ICE_M(0x7FF, 0) +#define GL_SWT_LAT_QUAD_SIZE_S 16 +#define GL_SWT_LAT_QUAD_SIZE_M ICE_M(0x7FF, 16) +#define GL_SWT_LAT_SINGLE 0x00204000 /* Reset Source: CORER */ +#define GL_SWT_LAT_SINGLE_BASE_S 0 +#define GL_SWT_LAT_SINGLE_BASE_M ICE_M(0x7FF, 0) +#define GL_SWT_LAT_SINGLE_SIZE_S 16 +#define GL_SWT_LAT_SINGLE_SIZE_M ICE_M(0x7FF, 16) +#define GL_SWT_MD_PRI 0x002040AC /* Reset Source: CORER */ +#define GL_SWT_MD_PRI_VSI_PRI_S 0 +#define GL_SWT_MD_PRI_VSI_PRI_M ICE_M(0x7, 0) +#define GL_SWT_MD_PRI_LB_PRI_S 4 +#define GL_SWT_MD_PRI_LB_PRI_M ICE_M(0x7, 4) +#define GL_SWT_MD_PRI_LAN_EN_PRI_S 8 +#define GL_SWT_MD_PRI_LAN_EN_PRI_M ICE_M(0x7, 8) +#define GL_SWT_MD_PRI_QH_PRI_S 12 +#define GL_SWT_MD_PRI_QH_PRI_M ICE_M(0x7, 12) +#define GL_SWT_MD_PRI_QL_PRI_S 16 +#define GL_SWT_MD_PRI_QL_PRI_M ICE_M(0x7, 16) +#define GL_SWT_MIRTARVSI(_i) (0x00204500 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GL_SWT_MIRTARVSI_MAX_INDEX 63 +#define GL_SWT_MIRTARVSI_VFVMNUMBER_S 0 +#define 
GL_SWT_MIRTARVSI_VFVMNUMBER_M ICE_M(0x3FF, 0) +#define GL_SWT_MIRTARVSI_FUNCTIONTYPE_S 10 +#define GL_SWT_MIRTARVSI_FUNCTIONTYPE_M ICE_M(0x3, 10) +#define GL_SWT_MIRTARVSI_PFNUMBER_S 12 +#define GL_SWT_MIRTARVSI_PFNUMBER_M ICE_M(0x7, 12) +#define GL_SWT_MIRTARVSI_TARGETVSI_S 20 +#define GL_SWT_MIRTARVSI_TARGETVSI_M ICE_M(0x3FF, 20) +#define GL_SWT_MIRTARVSI_RULEENABLE_S 31 +#define GL_SWT_MIRTARVSI_RULEENABLE_M BIT(31) +#define GL_SWT_SWIDFVIDX 0x00214114 /* Reset Source: CORER */ +#define GL_SWT_SWIDFVIDX_SWIDFVIDX_S 0 +#define GL_SWT_SWIDFVIDX_SWIDFVIDX_M ICE_M(0x3F, 0) +#define GL_SWT_SWIDFVIDX_PORT_TYPE_S 31 +#define GL_SWT_SWIDFVIDX_PORT_TYPE_M BIT(31) +#define GL_VP_SWITCHID(_i) (0x00214094 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GL_VP_SWITCHID_MAX_INDEX 31 +#define GL_VP_SWITCHID_SWITCHID_S 0 +#define GL_VP_SWITCHID_SWITCHID_M ICE_M(0xFF, 0) +#define GLSWID_STAT_BLOCK(_i) (0x0020A1A4 + ((_i) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define GLSWID_STAT_BLOCK_MAX_INDEX 255 +#define GLSWID_STAT_BLOCK_VEBID_S 0 +#define GLSWID_STAT_BLOCK_VEBID_M ICE_M(0x1F, 0) +#define GLSWID_STAT_BLOCK_VEBID_VALID_S 31 +#define GLSWID_STAT_BLOCK_VEBID_VALID_M BIT(31) +#define GLSWT_ACT_RESP_0 0x0020A5A4 /* Reset Source: CORER */ +#define GLSWT_ACT_RESP_0_GLSWT_ACT_RESP_S 0 +#define GLSWT_ACT_RESP_0_GLSWT_ACT_RESP_M ICE_M(0xFFFFFFFF, 0) +#define GLSWT_ACT_RESP_1 0x0020A5A8 /* Reset Source: CORER */ +#define GLSWT_ACT_RESP_1_GLSWT_ACT_RESP_S 0 +#define GLSWT_ACT_RESP_1_GLSWT_ACT_RESP_M ICE_M(0xFFFFFFFF, 0) +#define GLSWT_ARB_MODE 0x0020A674 /* Reset Source: CORER */ +#define GLSWT_ARB_MODE_FLU_PRI_SHM_S 0 +#define GLSWT_ARB_MODE_FLU_PRI_SHM_M BIT(0) +#define GLSWT_ARB_MODE_TX_RX_FWD_PRI_S 1 +#define GLSWT_ARB_MODE_TX_RX_FWD_PRI_M BIT(1) +#define PRT_SBPVSI 0x00204120 /* Reset Source: CORER */ +#define PRT_SBPVSI_BAD_FRAMES_VSI_S 0 +#define PRT_SBPVSI_BAD_FRAMES_VSI_M ICE_M(0x3FF, 0) +#define PRT_SBPVSI_SBP_S 31 +#define PRT_SBPVSI_SBP_M BIT(31) +#define PRT_SCSTS 0x00204140 /* Reset Source: CORER */ +#define PRT_SCSTS_BSCA_S 0 +#define PRT_SCSTS_BSCA_M BIT(0) +#define PRT_SCSTS_BSCAP_S 1 +#define PRT_SCSTS_BSCAP_M BIT(1) +#define PRT_SCSTS_MSCA_S 2 +#define PRT_SCSTS_MSCA_M BIT(2) +#define PRT_SCSTS_MSCAP_S 3 +#define PRT_SCSTS_MSCAP_M BIT(3) +#define PRT_SWT_BSCCNT 0x00204160 /* Reset Source: CORER */ +#define PRT_SWT_BSCCNT_CCOUNT_S 0 +#define PRT_SWT_BSCCNT_CCOUNT_M ICE_M(0x1FFFFFF, 0) +#define PRT_SWT_BSCTRH 0x00204180 /* Reset Source: CORER */ +#define PRT_SWT_BSCTRH_UTRESH_S 0 +#define PRT_SWT_BSCTRH_UTRESH_M ICE_M(0x7FFFF, 0) +#define PRT_SWT_MIREG 0x002042A0 /* Reset Source: CORER */ +#define PRT_SWT_MIREG_MIRRULE_S 0 +#define PRT_SWT_MIREG_MIRRULE_M ICE_M(0x3F, 0) +#define PRT_SWT_MIREG_MIRENA_S 7 +#define PRT_SWT_MIREG_MIRENA_M BIT(7) +#define PRT_SWT_MIRIG 0x00204280 /* Reset Source: CORER */ +#define PRT_SWT_MIRIG_MIRRULE_S 0 +#define PRT_SWT_MIRIG_MIRRULE_M ICE_M(0x3F, 0) +#define PRT_SWT_MIRIG_MIRENA_S 7 +#define PRT_SWT_MIRIG_MIRENA_M BIT(7) +#define PRT_SWT_MSCCNT 0x00204100 /* Reset Source: CORER */ +#define PRT_SWT_MSCCNT_CCOUNT_S 0 +#define PRT_SWT_MSCCNT_CCOUNT_M ICE_M(0x1FFFFFF, 0) +#define PRT_SWT_MSCTRH 0x002041C0 /* Reset Source: CORER */ +#define PRT_SWT_MSCTRH_UTRESH_S 0 +#define PRT_SWT_MSCTRH_UTRESH_M ICE_M(0x7FFFF, 0) +#define PRT_SWT_SCBI 0x002041E0 /* Reset Source: CORER */ +#define PRT_SWT_SCBI_BI_S 0 +#define PRT_SWT_SCBI_BI_M ICE_M(0x1FFFFFF, 0) +#define PRT_SWT_SCCRL 0x00204200 /* Reset Source: CORER */ +#define PRT_SWT_SCCRL_MDIPW_S 0 
+#define PRT_SWT_SCCRL_MDIPW_M BIT(0) +#define PRT_SWT_SCCRL_MDICW_S 1 +#define PRT_SWT_SCCRL_MDICW_M BIT(1) +#define PRT_SWT_SCCRL_BDIPW_S 2 +#define PRT_SWT_SCCRL_BDIPW_M BIT(2) +#define PRT_SWT_SCCRL_BDICW_S 3 +#define PRT_SWT_SCCRL_BDICW_M BIT(3) +#define PRT_SWT_SCCRL_INTERVAL_S 8 +#define PRT_SWT_SCCRL_INTERVAL_M ICE_M(0xFFFFF, 8) +#define PRT_TCTUPR(_i) (0x00040840 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define PRT_TCTUPR_MAX_INDEX 31 +#define PRT_TCTUPR_UP0_S 0 +#define PRT_TCTUPR_UP0_M ICE_M(0x7, 0) +#define PRT_TCTUPR_UP1_S 4 +#define PRT_TCTUPR_UP1_M ICE_M(0x7, 4) +#define PRT_TCTUPR_UP2_S 8 +#define PRT_TCTUPR_UP2_M ICE_M(0x7, 8) +#define PRT_TCTUPR_UP3_S 12 +#define PRT_TCTUPR_UP3_M ICE_M(0x7, 12) +#define PRT_TCTUPR_UP4_S 16 +#define PRT_TCTUPR_UP4_M ICE_M(0x7, 16) +#define PRT_TCTUPR_UP5_S 20 +#define PRT_TCTUPR_UP5_M ICE_M(0x7, 20) +#define PRT_TCTUPR_UP6_S 24 +#define PRT_TCTUPR_UP6_M ICE_M(0x7, 24) +#define PRT_TCTUPR_UP7_S 28 +#define PRT_TCTUPR_UP7_M ICE_M(0x7, 28) +#define GLHH_ART_CTL 0x000A41D4 /* Reset Source: POR */ +#define GLHH_ART_CTL_ACTIVE_S 0 +#define GLHH_ART_CTL_ACTIVE_M BIT(0) +#define GLHH_ART_CTL_TIME_OUT1_S 1 +#define GLHH_ART_CTL_TIME_OUT1_M BIT(1) +#define GLHH_ART_CTL_TIME_OUT2_S 2 +#define GLHH_ART_CTL_TIME_OUT2_M BIT(2) +#define GLHH_ART_CTL_RESET_HH_S 31 +#define GLHH_ART_CTL_RESET_HH_M BIT(31) +#define GLHH_ART_DATA 0x000A41E0 /* Reset Source: POR */ +#define GLHH_ART_DATA_AGENT_TYPE_S 0 +#define GLHH_ART_DATA_AGENT_TYPE_M ICE_M(0x7, 0) +#define GLHH_ART_DATA_SYNC_TYPE_S 3 +#define GLHH_ART_DATA_SYNC_TYPE_M BIT(3) +#define GLHH_ART_DATA_MAX_DELAY_S 4 +#define GLHH_ART_DATA_MAX_DELAY_M ICE_M(0xF, 4) +#define GLHH_ART_DATA_TIME_BASE_S 8 +#define GLHH_ART_DATA_TIME_BASE_M ICE_M(0xF, 8) +#define GLHH_ART_DATA_RSV_DATA_S 12 +#define GLHH_ART_DATA_RSV_DATA_M ICE_M(0xFFFFF, 12) +#define GLHH_ART_TIME_H 0x000A41D8 /* Reset Source: POR */ +#define GLHH_ART_TIME_H_ART_TIME_H_S 0 +#define GLHH_ART_TIME_H_ART_TIME_H_M ICE_M(0xFFFFFFFF, 0) +#define GLHH_ART_TIME_L 0x000A41DC /* Reset Source: POR */ +#define GLHH_ART_TIME_L_ART_TIME_L_S 0 +#define GLHH_ART_TIME_L_ART_TIME_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_AUX_IN_0(_i) (0x000889D8 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_AUX_IN_0_MAX_INDEX 1 +#define GLTSYN_AUX_IN_0_EVNTLVL_S 0 +#define GLTSYN_AUX_IN_0_EVNTLVL_M ICE_M(0x3, 0) +#define GLTSYN_AUX_IN_0_INT_ENA_S 4 +#define GLTSYN_AUX_IN_0_INT_ENA_M BIT(4) +#define GLTSYN_AUX_IN_1(_i) (0x000889E0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_AUX_IN_1_MAX_INDEX 1 +#define GLTSYN_AUX_IN_1_EVNTLVL_S 0 +#define GLTSYN_AUX_IN_1_EVNTLVL_M ICE_M(0x3, 0) +#define GLTSYN_AUX_IN_1_INT_ENA_S 4 +#define GLTSYN_AUX_IN_1_INT_ENA_M BIT(4) +#define GLTSYN_AUX_IN_2(_i) (0x000889E8 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_AUX_IN_2_MAX_INDEX 1 +#define GLTSYN_AUX_IN_2_EVNTLVL_S 0 +#define GLTSYN_AUX_IN_2_EVNTLVL_M ICE_M(0x3, 0) +#define GLTSYN_AUX_IN_2_INT_ENA_S 4 +#define GLTSYN_AUX_IN_2_INT_ENA_M BIT(4) +#define GLTSYN_AUX_OUT_0(_i) (0x00088998 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_AUX_OUT_0_MAX_INDEX 1 +#define GLTSYN_AUX_OUT_0_OUT_ENA_S 0 +#define GLTSYN_AUX_OUT_0_OUT_ENA_M BIT(0) +#define GLTSYN_AUX_OUT_0_OUTMOD_S 1 +#define GLTSYN_AUX_OUT_0_OUTMOD_M ICE_M(0x3, 1) +#define GLTSYN_AUX_OUT_0_OUTLVL_S 3 +#define GLTSYN_AUX_OUT_0_OUTLVL_M BIT(3) +#define GLTSYN_AUX_OUT_0_INT_ENA_S 4 +#define GLTSYN_AUX_OUT_0_INT_ENA_M BIT(4) +#define 
GLTSYN_AUX_OUT_0_PULSEW_S 8 +#define GLTSYN_AUX_OUT_0_PULSEW_M ICE_M(0xF, 8) +#define GLTSYN_AUX_OUT_1(_i) (0x000889A0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_AUX_OUT_1_MAX_INDEX 1 +#define GLTSYN_AUX_OUT_1_OUT_ENA_S 0 +#define GLTSYN_AUX_OUT_1_OUT_ENA_M BIT(0) +#define GLTSYN_AUX_OUT_1_OUTMOD_S 1 +#define GLTSYN_AUX_OUT_1_OUTMOD_M ICE_M(0x3, 1) +#define GLTSYN_AUX_OUT_1_OUTLVL_S 3 +#define GLTSYN_AUX_OUT_1_OUTLVL_M BIT(3) +#define GLTSYN_AUX_OUT_1_INT_ENA_S 4 +#define GLTSYN_AUX_OUT_1_INT_ENA_M BIT(4) +#define GLTSYN_AUX_OUT_1_PULSEW_S 8 +#define GLTSYN_AUX_OUT_1_PULSEW_M ICE_M(0xF, 8) +#define GLTSYN_AUX_OUT_2(_i) (0x000889A8 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_AUX_OUT_2_MAX_INDEX 1 +#define GLTSYN_AUX_OUT_2_OUT_ENA_S 0 +#define GLTSYN_AUX_OUT_2_OUT_ENA_M BIT(0) +#define GLTSYN_AUX_OUT_2_OUTMOD_S 1 +#define GLTSYN_AUX_OUT_2_OUTMOD_M ICE_M(0x3, 1) +#define GLTSYN_AUX_OUT_2_OUTLVL_S 3 +#define GLTSYN_AUX_OUT_2_OUTLVL_M BIT(3) +#define GLTSYN_AUX_OUT_2_INT_ENA_S 4 +#define GLTSYN_AUX_OUT_2_INT_ENA_M BIT(4) +#define GLTSYN_AUX_OUT_2_PULSEW_S 8 +#define GLTSYN_AUX_OUT_2_PULSEW_M ICE_M(0xF, 8) +#define GLTSYN_AUX_OUT_3(_i) (0x000889B0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_AUX_OUT_3_MAX_INDEX 1 +#define GLTSYN_AUX_OUT_3_OUT_ENA_S 0 +#define GLTSYN_AUX_OUT_3_OUT_ENA_M BIT(0) +#define GLTSYN_AUX_OUT_3_OUTMOD_S 1 +#define GLTSYN_AUX_OUT_3_OUTMOD_M ICE_M(0x3, 1) +#define GLTSYN_AUX_OUT_3_OUTLVL_S 3 +#define GLTSYN_AUX_OUT_3_OUTLVL_M BIT(3) +#define GLTSYN_AUX_OUT_3_INT_ENA_S 4 +#define GLTSYN_AUX_OUT_3_INT_ENA_M BIT(4) +#define GLTSYN_AUX_OUT_3_PULSEW_S 8 +#define GLTSYN_AUX_OUT_3_PULSEW_M ICE_M(0xF, 8) +#define GLTSYN_CLKO_0(_i) (0x000889B8 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_CLKO_0_MAX_INDEX 1 +#define GLTSYN_CLKO_0_TSYNCLKO_S 0 +#define GLTSYN_CLKO_0_TSYNCLKO_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_CLKO_1(_i) (0x000889C0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_CLKO_1_MAX_INDEX 1 +#define GLTSYN_CLKO_1_TSYNCLKO_S 0 +#define GLTSYN_CLKO_1_TSYNCLKO_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_CLKO_2(_i) (0x000889C8 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_CLKO_2_MAX_INDEX 1 +#define GLTSYN_CLKO_2_TSYNCLKO_S 0 +#define GLTSYN_CLKO_2_TSYNCLKO_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_CLKO_3(_i) (0x000889D0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_CLKO_3_MAX_INDEX 1 +#define GLTSYN_CLKO_3_TSYNCLKO_S 0 +#define GLTSYN_CLKO_3_TSYNCLKO_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_CMD 0x00088810 /* Reset Source: CORER */ +#define GLTSYN_CMD_CMD_S 0 +#define GLTSYN_CMD_CMD_M ICE_M(0xFF, 0) +#define GLTSYN_CMD_SEL_MASTER_S 8 +#define GLTSYN_CMD_SEL_MASTER_M BIT(8) +#define GLTSYN_CMD_SYNC 0x00088814 /* Reset Source: CORER */ +#define GLTSYN_CMD_SYNC_SYNC_S 0 +#define GLTSYN_CMD_SYNC_SYNC_M ICE_M(0x3, 0) +#define GLTSYN_ENA(_i) (0x00088808 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_ENA_MAX_INDEX 1 +#define GLTSYN_ENA_TSYN_ENA_S 0 +#define GLTSYN_ENA_TSYN_ENA_M BIT(0) +#define GLTSYN_EVNT_H_0(_i) (0x00088970 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_EVNT_H_0_MAX_INDEX 1 +#define GLTSYN_EVNT_H_0_TSYNEVNT_H_S 0 +#define GLTSYN_EVNT_H_0_TSYNEVNT_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_EVNT_H_1(_i) (0x00088980 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_EVNT_H_1_MAX_INDEX 1 +#define GLTSYN_EVNT_H_1_TSYNEVNT_H_S 0 +#define 
GLTSYN_EVNT_H_1_TSYNEVNT_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_EVNT_H_2(_i) (0x00088990 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_EVNT_H_2_MAX_INDEX 1 +#define GLTSYN_EVNT_H_2_TSYNEVNT_H_S 0 +#define GLTSYN_EVNT_H_2_TSYNEVNT_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_EVNT_L_0(_i) (0x00088968 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_EVNT_L_0_MAX_INDEX 1 +#define GLTSYN_EVNT_L_0_TSYNEVNT_L_S 0 +#define GLTSYN_EVNT_L_0_TSYNEVNT_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_EVNT_L_1(_i) (0x00088978 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_EVNT_L_1_MAX_INDEX 1 +#define GLTSYN_EVNT_L_1_TSYNEVNT_L_S 0 +#define GLTSYN_EVNT_L_1_TSYNEVNT_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_EVNT_L_2(_i) (0x00088988 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_EVNT_L_2_MAX_INDEX 1 +#define GLTSYN_EVNT_L_2_TSYNEVNT_L_S 0 +#define GLTSYN_EVNT_L_2_TSYNEVNT_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_HHTIME_H(_i) (0x00088900 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_HHTIME_H_MAX_INDEX 1 +#define GLTSYN_HHTIME_H_TSYNEVNT_H_S 0 +#define GLTSYN_HHTIME_H_TSYNEVNT_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_HHTIME_L(_i) (0x000888F8 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_HHTIME_L_MAX_INDEX 1 +#define GLTSYN_HHTIME_L_TSYNEVNT_L_S 0 +#define GLTSYN_HHTIME_L_TSYNEVNT_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_INCVAL_H(_i) (0x00088920 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_INCVAL_H_MAX_INDEX 1 +#define GLTSYN_INCVAL_H_INCVAL_H_S 0 +#define GLTSYN_INCVAL_H_INCVAL_H_M ICE_M(0xFF, 0) +#define GLTSYN_INCVAL_L(_i) (0x00088918 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_INCVAL_L_MAX_INDEX 1 +#define GLTSYN_INCVAL_L_INCVAL_L_S 0 +#define GLTSYN_INCVAL_L_INCVAL_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_SHADJ_H(_i) (0x00088910 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_SHADJ_H_MAX_INDEX 1 +#define GLTSYN_SHADJ_H_ADJUST_H_S 0 +#define GLTSYN_SHADJ_H_ADJUST_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_SHADJ_L(_i) (0x00088908 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_SHADJ_L_MAX_INDEX 1 +#define GLTSYN_SHADJ_L_ADJUST_L_S 0 +#define GLTSYN_SHADJ_L_ADJUST_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_SHTIME_0(_i) (0x000888E0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_SHTIME_0_MAX_INDEX 1 +#define GLTSYN_SHTIME_0_TSYNTIME_0_S 0 +#define GLTSYN_SHTIME_0_TSYNTIME_0_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_SHTIME_H(_i) (0x000888F0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_SHTIME_H_MAX_INDEX 1 +#define GLTSYN_SHTIME_H_TSYNTIME_H_S 0 +#define GLTSYN_SHTIME_H_TSYNTIME_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_SHTIME_L(_i) (0x000888E8 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_SHTIME_L_MAX_INDEX 1 +#define GLTSYN_SHTIME_L_TSYNTIME_L_S 0 +#define GLTSYN_SHTIME_L_TSYNTIME_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_STAT(_i) (0x000888C0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_STAT_MAX_INDEX 1 +#define GLTSYN_STAT_EVENT0_S 0 +#define GLTSYN_STAT_EVENT0_M BIT(0) +#define GLTSYN_STAT_EVENT1_S 1 +#define GLTSYN_STAT_EVENT1_M BIT(1) +#define GLTSYN_STAT_EVENT2_S 2 +#define GLTSYN_STAT_EVENT2_M BIT(2) +#define GLTSYN_STAT_TGT0_S 4 +#define GLTSYN_STAT_TGT0_M BIT(4) +#define GLTSYN_STAT_TGT1_S 5 +#define GLTSYN_STAT_TGT1_M BIT(5) +#define GLTSYN_STAT_TGT2_S 6 +#define 
GLTSYN_STAT_TGT2_M BIT(6) +#define GLTSYN_STAT_TGT3_S 7 +#define GLTSYN_STAT_TGT3_M BIT(7) +#define GLTSYN_SYNC_DLAY 0x00088818 /* Reset Source: CORER */ +#define GLTSYN_SYNC_DLAY_SYNC_DELAY_S 0 +#define GLTSYN_SYNC_DLAY_SYNC_DELAY_M ICE_M(0x1F, 0) +#define GLTSYN_TGT_H_0(_i) (0x00088930 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TGT_H_0_MAX_INDEX 1 +#define GLTSYN_TGT_H_0_TSYNTGTT_H_S 0 +#define GLTSYN_TGT_H_0_TSYNTGTT_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TGT_H_1(_i) (0x00088940 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TGT_H_1_MAX_INDEX 1 +#define GLTSYN_TGT_H_1_TSYNTGTT_H_S 0 +#define GLTSYN_TGT_H_1_TSYNTGTT_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TGT_H_2(_i) (0x00088950 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TGT_H_2_MAX_INDEX 1 +#define GLTSYN_TGT_H_2_TSYNTGTT_H_S 0 +#define GLTSYN_TGT_H_2_TSYNTGTT_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TGT_H_3(_i) (0x00088960 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TGT_H_3_MAX_INDEX 1 +#define GLTSYN_TGT_H_3_TSYNTGTT_H_S 0 +#define GLTSYN_TGT_H_3_TSYNTGTT_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TGT_L_0(_i) (0x00088928 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TGT_L_0_MAX_INDEX 1 +#define GLTSYN_TGT_L_0_TSYNTGTT_L_S 0 +#define GLTSYN_TGT_L_0_TSYNTGTT_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TGT_L_1(_i) (0x00088938 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TGT_L_1_MAX_INDEX 1 +#define GLTSYN_TGT_L_1_TSYNTGTT_L_S 0 +#define GLTSYN_TGT_L_1_TSYNTGTT_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TGT_L_2(_i) (0x00088948 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TGT_L_2_MAX_INDEX 1 +#define GLTSYN_TGT_L_2_TSYNTGTT_L_S 0 +#define GLTSYN_TGT_L_2_TSYNTGTT_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TGT_L_3(_i) (0x00088958 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TGT_L_3_MAX_INDEX 1 +#define GLTSYN_TGT_L_3_TSYNTGTT_L_S 0 +#define GLTSYN_TGT_L_3_TSYNTGTT_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TIME_0(_i) (0x000888C8 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TIME_0_MAX_INDEX 1 +#define GLTSYN_TIME_0_TSYNTIME_0_S 0 +#define GLTSYN_TIME_0_TSYNTIME_0_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TIME_H(_i) (0x000888D8 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TIME_H_MAX_INDEX 1 +#define GLTSYN_TIME_H_TSYNTIME_H_S 0 +#define GLTSYN_TIME_H_TSYNTIME_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TIME_L(_i) (0x000888D0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TIME_L_MAX_INDEX 1 +#define GLTSYN_TIME_L_TSYNTIME_L_S 0 +#define GLTSYN_TIME_L_TSYNTIME_L_M ICE_M(0xFFFFFFFF, 0) +#define PFHH_SEM 0x000A4200 /* Reset Source: PFR */ +#define PFHH_SEM_BUSY_S 0 +#define PFHH_SEM_BUSY_M BIT(0) +#define PFHH_SEM_PF_OWNER_S 4 +#define PFHH_SEM_PF_OWNER_M ICE_M(0x7, 4) +#define PFTSYN_SEM 0x00088880 /* Reset Source: PFR */ +#define PFTSYN_SEM_BUSY_S 0 +#define PFTSYN_SEM_BUSY_M BIT(0) +#define PFTSYN_SEM_PF_OWNER_S 4 +#define PFTSYN_SEM_PF_OWNER_M ICE_M(0x7, 4) +#define GLPE_TSCD_FLR(_i) (0x0051E24C + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */ +#define GLPE_TSCD_FLR_MAX_INDEX 3 +#define GLPE_TSCD_FLR_DRAIN_VCTR_ID_S 0 +#define GLPE_TSCD_FLR_DRAIN_VCTR_ID_M ICE_M(0x3, 0) +#define GLPE_TSCD_FLR_PORT_S 2 +#define GLPE_TSCD_FLR_PORT_M ICE_M(0x7, 2) +#define GLPE_TSCD_FLR_PF_NUM_S 5 +#define GLPE_TSCD_FLR_PF_NUM_M ICE_M(0x7, 5) +#define GLPE_TSCD_FLR_VM_VF_TYPE_S 8 +#define 
GLPE_TSCD_FLR_VM_VF_TYPE_M ICE_M(0x3, 8) +#define GLPE_TSCD_FLR_VM_VF_NUM_S 16 +#define GLPE_TSCD_FLR_VM_VF_NUM_M ICE_M(0x3FF, 16) +#define GLPE_TSCD_FLR_VLD_S 31 +#define GLPE_TSCD_FLR_VLD_M BIT(31) +#define GLPE_TSCD_PEPM 0x0051E228 /* Reset Source: CORER */ +#define GLPE_TSCD_PEPM_MDQ_CREDITS_S 0 +#define GLPE_TSCD_PEPM_MDQ_CREDITS_M ICE_M(0xFF, 0) +#define PF_VIRT_VSTATUS 0x0009E680 /* Reset Source: PFR */ +#define PF_VIRT_VSTATUS_NUM_VFS_S 0 +#define PF_VIRT_VSTATUS_NUM_VFS_M ICE_M(0xFF, 0) +#define PF_VIRT_VSTATUS_TOTAL_VFS_S 8 +#define PF_VIRT_VSTATUS_TOTAL_VFS_M ICE_M(0xFF, 8) +#define PF_VIRT_VSTATUS_IOV_ACTIVE_S 16 +#define PF_VIRT_VSTATUS_IOV_ACTIVE_M BIT(16) +#define PF_VT_PFALLOC 0x001D2480 /* Reset Source: CORER */ +#define PF_VT_PFALLOC_FIRSTVF_S 0 +#define PF_VT_PFALLOC_FIRSTVF_M ICE_M(0xFF, 0) +#define PF_VT_PFALLOC_LASTVF_S 8 +#define PF_VT_PFALLOC_LASTVF_M ICE_M(0xFF, 8) +#define PF_VT_PFALLOC_VALID_S 31 +#define PF_VT_PFALLOC_VALID_M BIT(31) +#define PF_VT_PFALLOC_HIF 0x0009DD80 /* Reset Source: PCIR */ +#define PF_VT_PFALLOC_HIF_FIRSTVF_S 0 +#define PF_VT_PFALLOC_HIF_FIRSTVF_M ICE_M(0xFF, 0) +#define PF_VT_PFALLOC_HIF_LASTVF_S 8 +#define PF_VT_PFALLOC_HIF_LASTVF_M ICE_M(0xFF, 8) +#define PF_VT_PFALLOC_HIF_VALID_S 31 +#define PF_VT_PFALLOC_HIF_VALID_M BIT(31) +#define PF_VT_PFALLOC_PCIE 0x000BE080 /* Reset Source: PCIR */ +#define PF_VT_PFALLOC_PCIE_FIRSTVF_S 0 +#define PF_VT_PFALLOC_PCIE_FIRSTVF_M ICE_M(0xFF, 0) +#define PF_VT_PFALLOC_PCIE_LASTVF_S 8 +#define PF_VT_PFALLOC_PCIE_LASTVF_M ICE_M(0xFF, 8) +#define PF_VT_PFALLOC_PCIE_VALID_S 31 +#define PF_VT_PFALLOC_PCIE_VALID_M BIT(31) +#define VSI_L2TAGSTXVALID(_VSI) (0x00046000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_L2TAGSTXVALID_MAX_INDEX 767 +#define VSI_L2TAGSTXVALID_L2TAG1INSERTID_S 0 +#define VSI_L2TAGSTXVALID_L2TAG1INSERTID_M ICE_M(0x7, 0) +#define VSI_L2TAGSTXVALID_L2TAG1INSERTID_VALID_S 3 +#define VSI_L2TAGSTXVALID_L2TAG1INSERTID_VALID_M BIT(3) +#define VSI_L2TAGSTXVALID_L2TAG2INSERTID_S 4 +#define VSI_L2TAGSTXVALID_L2TAG2INSERTID_M ICE_M(0x7, 4) +#define VSI_L2TAGSTXVALID_L2TAG2INSERTID_VALID_S 7 +#define VSI_L2TAGSTXVALID_L2TAG2INSERTID_VALID_M BIT(7) +#define VSI_L2TAGSTXVALID_TIR0INSERTID_S 16 +#define VSI_L2TAGSTXVALID_TIR0INSERTID_M ICE_M(0x7, 16) +#define VSI_L2TAGSTXVALID_TIR0_INSERT_S 19 +#define VSI_L2TAGSTXVALID_TIR0_INSERT_M BIT(19) +#define VSI_L2TAGSTXVALID_TIR1INSERTID_S 20 +#define VSI_L2TAGSTXVALID_TIR1INSERTID_M ICE_M(0x7, 20) +#define VSI_L2TAGSTXVALID_TIR1_INSERT_S 23 +#define VSI_L2TAGSTXVALID_TIR1_INSERT_M BIT(23) +#define VSI_L2TAGSTXVALID_TIR2INSERTID_S 24 +#define VSI_L2TAGSTXVALID_TIR2INSERTID_M ICE_M(0x7, 24) +#define VSI_L2TAGSTXVALID_TIR2_INSERT_S 27 +#define VSI_L2TAGSTXVALID_TIR2_INSERT_M BIT(27) +#define VSI_PASID(_VSI) (0x0009C000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: PFR */ +#define VSI_PASID_MAX_INDEX 767 +#define VSI_PASID_PASID_S 0 +#define VSI_PASID_PASID_M ICE_M(0xFFFFF, 0) +#define VSI_PASID_EN_S 31 +#define VSI_PASID_EN_M BIT(31) +#define VSI_RUPR(_VSI) (0x00050000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_RUPR_MAX_INDEX 767 +#define VSI_RUPR_UP0_S 0 +#define VSI_RUPR_UP0_M ICE_M(0x7, 0) +#define VSI_RUPR_UP1_S 3 +#define VSI_RUPR_UP1_M ICE_M(0x7, 3) +#define VSI_RUPR_UP2_S 6 +#define VSI_RUPR_UP2_M ICE_M(0x7, 6) +#define VSI_RUPR_UP3_S 9 +#define VSI_RUPR_UP3_M ICE_M(0x7, 9) +#define VSI_RUPR_UP4_S 12 +#define VSI_RUPR_UP4_M ICE_M(0x7, 12) +#define VSI_RUPR_UP5_S 15 +#define VSI_RUPR_UP5_M 
ICE_M(0x7, 15) +#define VSI_RUPR_UP6_S 18 +#define VSI_RUPR_UP6_M ICE_M(0x7, 18) +#define VSI_RUPR_UP7_S 21 +#define VSI_RUPR_UP7_M ICE_M(0x7, 21) +#define VSI_RXSWCTRL(_VSI) (0x00205000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_RXSWCTRL_MAX_INDEX 767 +#define VSI_RXSWCTRL_MACVSIPRUNEENABLE_S 8 +#define VSI_RXSWCTRL_MACVSIPRUNEENABLE_M BIT(8) +#define VSI_RXSWCTRL_PRUNEENABLE_S 9 +#define VSI_RXSWCTRL_PRUNEENABLE_M ICE_M(0xF, 9) +#define VSI_RXSWCTRL_SRCPRUNEENABLE_S 13 +#define VSI_RXSWCTRL_SRCPRUNEENABLE_M BIT(13) +#define VSI_SRCSWCTRL(_VSI) (0x00209000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_SRCSWCTRL_MAX_INDEX 767 +#define VSI_SRCSWCTRL_ALLOWDESTOVERRIDE_S 0 +#define VSI_SRCSWCTRL_ALLOWDESTOVERRIDE_M BIT(0) +#define VSI_SRCSWCTRL_ALLOWLOOPBACK_S 1 +#define VSI_SRCSWCTRL_ALLOWLOOPBACK_M BIT(1) +#define VSI_SRCSWCTRL_LANENABLE_S 2 +#define VSI_SRCSWCTRL_LANENABLE_M BIT(2) +#define VSI_SRCSWCTRL_MACAS_S 3 +#define VSI_SRCSWCTRL_MACAS_M BIT(3) +#define VSI_SRCSWCTRL_PRUNEENABLE_S 4 +#define VSI_SRCSWCTRL_PRUNEENABLE_M ICE_M(0xF, 4) +#define VSI_SWITCHID(_VSI) (0x00215000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_SWITCHID_MAX_INDEX 767 +#define VSI_SWITCHID_SWITCHID_S 0 +#define VSI_SWITCHID_SWITCHID_M ICE_M(0xFF, 0) +#define VSI_SWT_MIREG(_VSI) (0x00207000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_SWT_MIREG_MAX_INDEX 767 +#define VSI_SWT_MIREG_MIRRULE_S 0 +#define VSI_SWT_MIREG_MIRRULE_M ICE_M(0x3F, 0) +#define VSI_SWT_MIREG_MIRENA_S 7 +#define VSI_SWT_MIREG_MIRENA_M BIT(7) +#define VSI_SWT_MIRIG(_VSI) (0x00208000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_SWT_MIRIG_MAX_INDEX 767 +#define VSI_SWT_MIRIG_MIRRULE_S 0 +#define VSI_SWT_MIRIG_MIRRULE_M ICE_M(0x3F, 0) +#define VSI_SWT_MIRIG_MIRENA_S 7 +#define VSI_SWT_MIRIG_MIRENA_M BIT(7) +#define VSI_TAIR(_VSI) (0x00044000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: PFR */ +#define VSI_TAIR_MAX_INDEX 767 +#define VSI_TAIR_PORT_TAG_ID_S 0 +#define VSI_TAIR_PORT_TAG_ID_M ICE_M(0xFFFF, 0) +#define VSI_TAR(_VSI) (0x00045000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_TAR_MAX_INDEX 767 +#define VSI_TAR_ACCEPTTAGGED_S 0 +#define VSI_TAR_ACCEPTTAGGED_M ICE_M(0x3FF, 0) +#define VSI_TAR_ACCEPTUNTAGGED_S 16 +#define VSI_TAR_ACCEPTUNTAGGED_M ICE_M(0x3FF, 16) +#define VSI_TIR_0(_VSI) (0x00041000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_TIR_0_MAX_INDEX 767 +#define VSI_TIR_0_PORT_TAG_ID_S 0 +#define VSI_TIR_0_PORT_TAG_ID_M ICE_M(0xFFFF, 0) +#define VSI_TIR_1(_VSI) (0x00042000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_TIR_1_MAX_INDEX 767 +#define VSI_TIR_1_PORT_TAG_ID_S 0 +#define VSI_TIR_1_PORT_TAG_ID_M ICE_M(0xFFFFFFFF, 0) +#define VSI_TIR_2(_VSI) (0x00043000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_TIR_2_MAX_INDEX 767 +#define VSI_TIR_2_PORT_TAG_ID_S 0 +#define VSI_TIR_2_PORT_TAG_ID_M ICE_M(0xFFFF, 0) +#define VSI_TSR(_VSI) (0x00051000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_TSR_MAX_INDEX 767 +#define VSI_TSR_STRIPTAG_S 0 +#define VSI_TSR_STRIPTAG_M ICE_M(0x3FF, 0) +#define VSI_TSR_SHOWTAG_S 10 +#define VSI_TSR_SHOWTAG_M ICE_M(0x3FF, 10) +#define VSI_TSR_SHOWPRIONLY_S 20 +#define VSI_TSR_SHOWPRIONLY_M ICE_M(0x3FF, 20) +#define VSI_TUPIOM(_VSI) (0x00048000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_TUPIOM_MAX_INDEX 767 
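The parameterized macros above, such as VSI_TUPIOM(_VSI), compute a per-VSI MMIO offset as a base address plus a 4-byte stride, and the companion _MAX_INDEX value names the last index that may legally be passed in. The following minimal, standalone C sketch illustrates that arithmetic; the two macros are copied from the definitions above, while the helper name vsi_tupiom_offset() and the small main() harness are illustrative additions, and the computed offset is only printed rather than used as a real MMIO address.

/* Minimal sketch of the per-VSI array-register convention: FOO(_VSI)
 * expands to base + 4 * index, and FOO_MAX_INDEX is the last valid index.
 * The macros below are copied from the definitions above; the helper and
 * main() are illustrative only.
 */
#include <stdio.h>

#define VSI_TUPIOM(_VSI)	(0x00048000 + ((_VSI) * 4))
#define VSI_TUPIOM_MAX_INDEX	767

/* Return the register offset for a VSI index, or -1 if the index is out of
 * range (greater than VSI_TUPIOM_MAX_INDEX).
 */
static long vsi_tupiom_offset(unsigned int vsi_idx)
{
	if (vsi_idx > VSI_TUPIOM_MAX_INDEX)
		return -1;
	return VSI_TUPIOM(vsi_idx);
}

int main(void)
{
	printf("VSI 0:   0x%lx\n", (unsigned long)vsi_tupiom_offset(0));	/* 0x48000 */
	printf("VSI 767: 0x%lx\n", (unsigned long)vsi_tupiom_offset(767));	/* 0x48bfc */
	printf("VSI 768: %ld (out of range)\n", vsi_tupiom_offset(768));
	return 0;
}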
+#define VSI_TUPIOM_UP0_S 0 +#define VSI_TUPIOM_UP0_M ICE_M(0x7, 0) +#define VSI_TUPIOM_UP1_S 3 +#define VSI_TUPIOM_UP1_M ICE_M(0x7, 3) +#define VSI_TUPIOM_UP2_S 6 +#define VSI_TUPIOM_UP2_M ICE_M(0x7, 6) +#define VSI_TUPIOM_UP3_S 9 +#define VSI_TUPIOM_UP3_M ICE_M(0x7, 9) +#define VSI_TUPIOM_UP4_S 12 +#define VSI_TUPIOM_UP4_M ICE_M(0x7, 12) +#define VSI_TUPIOM_UP5_S 15 +#define VSI_TUPIOM_UP5_M ICE_M(0x7, 15) +#define VSI_TUPIOM_UP6_S 18 +#define VSI_TUPIOM_UP6_M ICE_M(0x7, 18) +#define VSI_TUPIOM_UP7_S 21 +#define VSI_TUPIOM_UP7_M ICE_M(0x7, 21) +#define VSI_TUPR(_VSI) (0x00047000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_TUPR_MAX_INDEX 767 +#define VSI_TUPR_UP0_S 0 +#define VSI_TUPR_UP0_M ICE_M(0x7, 0) +#define VSI_TUPR_UP1_S 3 +#define VSI_TUPR_UP1_M ICE_M(0x7, 3) +#define VSI_TUPR_UP2_S 6 +#define VSI_TUPR_UP2_M ICE_M(0x7, 6) +#define VSI_TUPR_UP3_S 9 +#define VSI_TUPR_UP3_M ICE_M(0x7, 9) +#define VSI_TUPR_UP4_S 12 +#define VSI_TUPR_UP4_M ICE_M(0x7, 12) +#define VSI_TUPR_UP5_S 15 +#define VSI_TUPR_UP5_M ICE_M(0x7, 15) +#define VSI_TUPR_UP6_S 18 +#define VSI_TUPR_UP6_M ICE_M(0x7, 18) +#define VSI_TUPR_UP7_S 21 +#define VSI_TUPR_UP7_M ICE_M(0x7, 21) +#define VSI_VSI2F(_VSI) (0x001D0000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: PFR */ +#define VSI_VSI2F_MAX_INDEX 767 +#define VSI_VSI2F_VFVMNUMBER_S 0 +#define VSI_VSI2F_VFVMNUMBER_M ICE_M(0x3FF, 0) +#define VSI_VSI2F_FUNCTIONTYPE_S 10 +#define VSI_VSI2F_FUNCTIONTYPE_M ICE_M(0x3, 10) +#define VSI_VSI2F_PFNUMBER_S 12 +#define VSI_VSI2F_PFNUMBER_M ICE_M(0x7, 12) +#define VSI_VSI2F_BUFFERNUMBER_S 16 +#define VSI_VSI2F_BUFFERNUMBER_M ICE_M(0x7, 16) +#define VSI_VSI2F_VSI_NUMBER_S 20 +#define VSI_VSI2F_VSI_NUMBER_M ICE_M(0x3FF, 20) +#define VSI_VSI2F_VSI_ENABLE_S 31 +#define VSI_VSI2F_VSI_ENABLE_M BIT(31) +#define VSIQF_FD_CNT(_VSI) (0x00464000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: PFR */ +#define VSIQF_FD_CNT_MAX_INDEX 767 +#define VSIQF_FD_CNT_FD_GCNT_S 0 +#define VSIQF_FD_CNT_FD_GCNT_M ICE_M(0x3FFF, 0) +#define VSIQF_FD_CNT_FD_BCNT_S 16 +#define VSIQF_FD_CNT_FD_BCNT_M ICE_M(0x3FFF, 16) +#define VSIQF_FD_CTL1(_VSI) (0x00411000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSIQF_FD_CTL1_MAX_INDEX 767 +#define VSIQF_FD_CTL1_FLT_ENA_S 0 +#define VSIQF_FD_CTL1_FLT_ENA_M BIT(0) +#define VSIQF_FD_CTL1_CFG_ENA_S 1 +#define VSIQF_FD_CTL1_CFG_ENA_M BIT(1) +#define VSIQF_FD_CTL1_EVICT_ENA_S 2 +#define VSIQF_FD_CTL1_EVICT_ENA_M BIT(2) +#define VSIQF_FD_DFLT(_VSI) (0x00457000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSIQF_FD_DFLT_MAX_INDEX 767 +#define VSIQF_FD_DFLT_DEFLT_QINDX_S 0 +#define VSIQF_FD_DFLT_DEFLT_QINDX_M ICE_M(0x7FF, 0) +#define VSIQF_FD_DFLT_DEFLT_TOQUEUE_S 12 +#define VSIQF_FD_DFLT_DEFLT_TOQUEUE_M ICE_M(0x7, 12) +#define VSIQF_FD_DFLT_COMP_QINDX_S 16 +#define VSIQF_FD_DFLT_COMP_QINDX_M ICE_M(0x7FF, 16) +#define VSIQF_FD_DFLT_DEFLT_QINDX_PRIO_S 28 +#define VSIQF_FD_DFLT_DEFLT_QINDX_PRIO_M ICE_M(0x7, 28) +#define VSIQF_FD_DFLT_DEFLT_DROP_S 31 +#define VSIQF_FD_DFLT_DEFLT_DROP_M BIT(31) +#define VSIQF_FD_SIZE(_VSI) (0x00462000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSIQF_FD_SIZE_MAX_INDEX 767 +#define VSIQF_FD_SIZE_FD_GSIZE_S 0 +#define VSIQF_FD_SIZE_FD_GSIZE_M ICE_M(0x3FFF, 0) +#define VSIQF_FD_SIZE_FD_BSIZE_S 16 +#define VSIQF_FD_SIZE_FD_BSIZE_M ICE_M(0x3FFF, 16) +#define VSIQF_HASH_CTL(_VSI) (0x0040D000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSIQF_HASH_CTL_MAX_INDEX 767 +#define 
VSIQF_HASH_CTL_HASH_LUT_SEL_S 0 +#define VSIQF_HASH_CTL_HASH_LUT_SEL_M ICE_M(0x3, 0) +#define VSIQF_HASH_CTL_GLOB_LUT_S 2 +#define VSIQF_HASH_CTL_GLOB_LUT_M ICE_M(0xF, 2) +#define VSIQF_HASH_CTL_HASH_SCHEME_S 6 +#define VSIQF_HASH_CTL_HASH_SCHEME_M ICE_M(0x3, 6) +#define VSIQF_HASH_CTL_TC_OVER_SEL_S 8 +#define VSIQF_HASH_CTL_TC_OVER_SEL_M ICE_M(0x1F, 8) +#define VSIQF_HASH_CTL_TC_OVER_ENA_S 15 +#define VSIQF_HASH_CTL_TC_OVER_ENA_M BIT(15) +#define VSIQF_HKEY(_i, _VSI) (0x00400000 + ((_i) * 4096 + (_VSI) * 4)) /* _i=0...12, _VSI=0...767 */ /* Reset Source: PFR */ #define VSIQF_HKEY_MAX_INDEX 12 +#define VSIQF_HKEY_KEY_0_S 0 +#define VSIQF_HKEY_KEY_0_M ICE_M(0xFF, 0) +#define VSIQF_HKEY_KEY_1_S 8 +#define VSIQF_HKEY_KEY_1_M ICE_M(0xFF, 8) +#define VSIQF_HKEY_KEY_2_S 16 +#define VSIQF_HKEY_KEY_2_M ICE_M(0xFF, 16) +#define VSIQF_HKEY_KEY_3_S 24 +#define VSIQF_HKEY_KEY_3_M ICE_M(0xFF, 24) +#define VSIQF_HLUT(_i, _VSI) (0x00420000 + ((_i) * 4096 + (_VSI) * 4)) /* _i=0...15, _VSI=0...767 */ /* Reset Source: PFR */ #define VSIQF_HLUT_MAX_INDEX 15 -#define VFINT_DYN_CTLN(_i) (0x00003800 + ((_i) * 4)) +#define VSIQF_HLUT_LUT0_S 0 +#define VSIQF_HLUT_LUT0_M ICE_M(0xF, 0) +#define VSIQF_HLUT_LUT1_S 8 +#define VSIQF_HLUT_LUT1_M ICE_M(0xF, 8) +#define VSIQF_HLUT_LUT2_S 16 +#define VSIQF_HLUT_LUT2_M ICE_M(0xF, 16) +#define VSIQF_HLUT_LUT3_S 24 +#define VSIQF_HLUT_LUT3_M ICE_M(0xF, 24) +#define VSIQF_PE_CTL1(_VSI) (0x00414000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSIQF_PE_CTL1_MAX_INDEX 767 +#define VSIQF_PE_CTL1_PE_FLTENA_S 0 +#define VSIQF_PE_CTL1_PE_FLTENA_M BIT(0) +#define VSIQF_TC_REGION(_i, _VSI) (0x00448000 + ((_i) * 4096 + (_VSI) * 4)) /* _i=0...3, _VSI=0...767 */ /* Reset Source: CORER */ +#define VSIQF_TC_REGION_MAX_INDEX 3 +#define VSIQF_TC_REGION_TC_BASE0_S 0 +#define VSIQF_TC_REGION_TC_BASE0_M ICE_M(0x7FF, 0) +#define VSIQF_TC_REGION_TC_SIZE0_S 11 +#define VSIQF_TC_REGION_TC_SIZE0_M ICE_M(0xF, 11) +#define VSIQF_TC_REGION_TC_BASE1_S 16 +#define VSIQF_TC_REGION_TC_BASE1_M ICE_M(0x7FF, 16) +#define VSIQF_TC_REGION_TC_SIZE1_S 27 +#define VSIQF_TC_REGION_TC_SIZE1_M ICE_M(0xF, 27) +#define GLPM_WUMC 0x0009DEE4 /* Reset Source: POR */ +#define GLPM_WUMC_MNG_WU_PF_S 16 +#define GLPM_WUMC_MNG_WU_PF_M ICE_M(0xFF, 16) +#define PFPM_APM 0x000B8080 /* Reset Source: POR */ +#define PFPM_APM_APME_S 0 +#define PFPM_APM_APME_M BIT(0) +#define PFPM_WUC 0x0009DC80 /* Reset Source: POR */ +#define PFPM_WUC_EN_APM_D0_S 5 +#define PFPM_WUC_EN_APM_D0_M BIT(5) +#define PFPM_WUFC 0x0009DC00 /* Reset Source: POR */ +#define PFPM_WUFC_LNKC_S 0 +#define PFPM_WUFC_LNKC_M BIT(0) +#define PFPM_WUFC_MAG_S 1 +#define PFPM_WUFC_MAG_M BIT(1) +#define PFPM_WUFC_MNG_S 3 +#define PFPM_WUFC_MNG_M BIT(3) +#define PFPM_WUFC_FLX0_ACT_S 4 +#define PFPM_WUFC_FLX0_ACT_M BIT(4) +#define PFPM_WUFC_FLX1_ACT_S 5 +#define PFPM_WUFC_FLX1_ACT_M BIT(5) +#define PFPM_WUFC_FLX2_ACT_S 6 +#define PFPM_WUFC_FLX2_ACT_M BIT(6) +#define PFPM_WUFC_FLX3_ACT_S 7 +#define PFPM_WUFC_FLX3_ACT_M BIT(7) +#define PFPM_WUFC_FLX4_ACT_S 8 +#define PFPM_WUFC_FLX4_ACT_M BIT(8) +#define PFPM_WUFC_FLX5_ACT_S 9 +#define PFPM_WUFC_FLX5_ACT_M BIT(9) +#define PFPM_WUFC_FLX6_ACT_S 10 +#define PFPM_WUFC_FLX6_ACT_M BIT(10) +#define PFPM_WUFC_FLX7_ACT_S 11 +#define PFPM_WUFC_FLX7_ACT_M BIT(11) +#define PFPM_WUFC_FLX0_S 16 +#define PFPM_WUFC_FLX0_M BIT(16) +#define PFPM_WUFC_FLX1_S 17 +#define PFPM_WUFC_FLX1_M BIT(17) +#define PFPM_WUFC_FLX2_S 18 +#define PFPM_WUFC_FLX2_M BIT(18) +#define PFPM_WUFC_FLX3_S 19 +#define PFPM_WUFC_FLX3_M BIT(19) 
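Every field in this header is described by an _S (shift) / _M (mask) pair, with multi-bit masks built via ICE_M() and single bits via BIT(). The standalone C sketch below shows the usual read-modify-write pattern on such a field, using VSIQF_HASH_CTL_HASH_SCHEME from the definitions above; ICE_M() is reproduced locally on the assumption that it expands to ((m) << (s)) as in ice_osdep.h, and the register value is a plain variable standing in for a real rd32()/wr32() access.

/* Standalone sketch of the _S/_M field convention used throughout this
 * header. ICE_M() is assumed to expand to ((m) << (s)); the field macros are
 * copied from the VSIQF_HASH_CTL definitions above, and regval is a plain
 * variable standing in for a real register read/write.
 */
#include <stdint.h>
#include <stdio.h>

#define ICE_M(m, s)			((m) << (s))	/* assumed, per ice_osdep.h */

#define VSIQF_HASH_CTL_HASH_SCHEME_S	6
#define VSIQF_HASH_CTL_HASH_SCHEME_M	ICE_M(0x3, 6)

int main(void)
{
	uint32_t regval = 0x000000C5;	/* pretend this came from rd32() */
	uint32_t scheme;

	/* Read the 2-bit HASH_SCHEME field: mask first, then shift down. */
	scheme = (regval & VSIQF_HASH_CTL_HASH_SCHEME_M) >>
		 VSIQF_HASH_CTL_HASH_SCHEME_S;

	/* Write a new value into the same field without disturbing the rest
	 * of the register: clear the field, then OR in the shifted value.
	 */
	regval &= ~VSIQF_HASH_CTL_HASH_SCHEME_M;
	regval |= (2U << VSIQF_HASH_CTL_HASH_SCHEME_S) &
		  VSIQF_HASH_CTL_HASH_SCHEME_M;

	printf("old scheme %u, new register value 0x%08x\n",
	       (unsigned int)scheme, (unsigned int)regval);
	return 0;
}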
+#define PFPM_WUFC_FLX4_S 20 +#define PFPM_WUFC_FLX4_M BIT(20) +#define PFPM_WUFC_FLX5_S 21 +#define PFPM_WUFC_FLX5_M BIT(21) +#define PFPM_WUFC_FLX6_S 22 +#define PFPM_WUFC_FLX6_M BIT(22) +#define PFPM_WUFC_FLX7_S 23 +#define PFPM_WUFC_FLX7_M BIT(23) +#define PFPM_WUFC_FW_RST_WK_S 31 +#define PFPM_WUFC_FW_RST_WK_M BIT(31) +#define PFPM_WUS 0x0009DB80 /* Reset Source: POR */ +#define PFPM_WUS_LNKC_S 0 +#define PFPM_WUS_LNKC_M BIT(0) +#define PFPM_WUS_MAG_S 1 +#define PFPM_WUS_MAG_M BIT(1) +#define PFPM_WUS_PME_STATUS_S 2 +#define PFPM_WUS_PME_STATUS_M BIT(2) +#define PFPM_WUS_MNG_S 3 +#define PFPM_WUS_MNG_M BIT(3) +#define PFPM_WUS_FLX0_S 16 +#define PFPM_WUS_FLX0_M BIT(16) +#define PFPM_WUS_FLX1_S 17 +#define PFPM_WUS_FLX1_M BIT(17) +#define PFPM_WUS_FLX2_S 18 +#define PFPM_WUS_FLX2_M BIT(18) +#define PFPM_WUS_FLX3_S 19 +#define PFPM_WUS_FLX3_M BIT(19) +#define PFPM_WUS_FLX4_S 20 +#define PFPM_WUS_FLX4_M BIT(20) +#define PFPM_WUS_FLX5_S 21 +#define PFPM_WUS_FLX5_M BIT(21) +#define PFPM_WUS_FLX6_S 22 +#define PFPM_WUS_FLX6_M BIT(22) +#define PFPM_WUS_FLX7_S 23 +#define PFPM_WUS_FLX7_M BIT(23) +#define PFPM_WUS_FW_RST_WK_S 31 +#define PFPM_WUS_FW_RST_WK_M BIT(31) +#define PRTPM_SAH(_i) (0x001E3BA0 + ((_i) * 32)) /* _i=0...3 */ /* Reset Source: PFR */ +#define PRTPM_SAH_MAX_INDEX 3 +#define PRTPM_SAH_PFPM_SAH_S 0 +#define PRTPM_SAH_PFPM_SAH_M ICE_M(0xFFFF, 0) +#define PRTPM_SAH_PF_NUM_S 26 +#define PRTPM_SAH_PF_NUM_M ICE_M(0xF, 26) +#define PRTPM_SAH_MC_MAG_EN_S 30 +#define PRTPM_SAH_MC_MAG_EN_M BIT(30) +#define PRTPM_SAH_AV_S 31 +#define PRTPM_SAH_AV_M BIT(31) +#define PRTPM_SAL(_i) (0x001E3B20 + ((_i) * 32)) /* _i=0...3 */ /* Reset Source: PFR */ +#define PRTPM_SAL_MAX_INDEX 3 +#define PRTPM_SAL_PFPM_SAL_S 0 +#define PRTPM_SAL_PFPM_SAL_M ICE_M(0xFFFFFFFF, 0) +#define GLPE_CQM_FUNC_INVALIDATE 0x00503300 /* Reset Source: CORER */ +#define GLPE_CQM_FUNC_INVALIDATE_PF_NUM_S 0 +#define GLPE_CQM_FUNC_INVALIDATE_PF_NUM_M ICE_M(0x7, 0) +#define GLPE_CQM_FUNC_INVALIDATE_VM_VF_NUM_S 3 +#define GLPE_CQM_FUNC_INVALIDATE_VM_VF_NUM_M ICE_M(0x3FF, 3) +#define GLPE_CQM_FUNC_INVALIDATE_VM_VF_TYPE_S 13 +#define GLPE_CQM_FUNC_INVALIDATE_VM_VF_TYPE_M ICE_M(0x3, 13) +#define GLPE_CQM_FUNC_INVALIDATE_ENABLE_S 31 +#define GLPE_CQM_FUNC_INVALIDATE_ENABLE_M BIT(31) +#define VFPE_MRTEIDXMASK 0x00009000 /* Reset Source: PFR */ +#define VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_S 0 +#define VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_M ICE_M(0x1F, 0) +#define GLTSYN_HH_DLAY 0x0008881C /* Reset Source: CORER */ +#define GLTSYN_HH_DLAY_SYNC_DELAY_S 0 +#define GLTSYN_HH_DLAY_SYNC_DELAY_M ICE_M(0xF, 0) +#define VF_MBX_ARQBAH1 0x00006000 /* Reset Source: CORER */ +#define VF_MBX_ARQBAH1_ARQBAH_S 0 +#define VF_MBX_ARQBAH1_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_ARQBAL1 0x00006C00 /* Reset Source: CORER */ +#define VF_MBX_ARQBAL1_ARQBAL_LSB_S 0 +#define VF_MBX_ARQBAL1_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_MBX_ARQBAL1_ARQBAL_S 6 +#define VF_MBX_ARQBAL1_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_ARQH1 0x00007400 /* Reset Source: CORER */ +#define VF_MBX_ARQH1_ARQH_S 0 +#define VF_MBX_ARQH1_ARQH_M ICE_M(0x3FF, 0) +#define VF_MBX_ARQLEN1 0x00008000 /* Reset Source: PFR */ +#define VF_MBX_ARQLEN1_ARQLEN_S 0 +#define VF_MBX_ARQLEN1_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_ARQLEN1_ARQVFE_S 28 +#define VF_MBX_ARQLEN1_ARQVFE_M BIT(28) +#define VF_MBX_ARQLEN1_ARQOVFL_S 29 +#define VF_MBX_ARQLEN1_ARQOVFL_M BIT(29) +#define VF_MBX_ARQLEN1_ARQCRIT_S 30 +#define VF_MBX_ARQLEN1_ARQCRIT_M BIT(30) +#define VF_MBX_ARQLEN1_ARQENABLE_S 31 +#define 
VF_MBX_ARQLEN1_ARQENABLE_M BIT(31) +#define VF_MBX_ARQT1 0x00007000 /* Reset Source: CORER */ +#define VF_MBX_ARQT1_ARQT_S 0 +#define VF_MBX_ARQT1_ARQT_M ICE_M(0x3FF, 0) +#define VF_MBX_ATQBAH1 0x00007800 /* Reset Source: CORER */ +#define VF_MBX_ATQBAH1_ATQBAH_S 0 +#define VF_MBX_ATQBAH1_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_ATQBAL1 0x00007C00 /* Reset Source: CORER */ +#define VF_MBX_ATQBAL1_ATQBAL_S 6 +#define VF_MBX_ATQBAL1_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_ATQH1 0x00006400 /* Reset Source: CORER */ +#define VF_MBX_ATQH1_ATQH_S 0 +#define VF_MBX_ATQH1_ATQH_M ICE_M(0x3FF, 0) +#define VF_MBX_ATQLEN1 0x00006800 /* Reset Source: PFR */ +#define VF_MBX_ATQLEN1_ATQLEN_S 0 +#define VF_MBX_ATQLEN1_ATQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_ATQLEN1_ATQVFE_S 28 +#define VF_MBX_ATQLEN1_ATQVFE_M BIT(28) +#define VF_MBX_ATQLEN1_ATQOVFL_S 29 +#define VF_MBX_ATQLEN1_ATQOVFL_M BIT(29) +#define VF_MBX_ATQLEN1_ATQCRIT_S 30 +#define VF_MBX_ATQLEN1_ATQCRIT_M BIT(30) +#define VF_MBX_ATQLEN1_ATQENABLE_S 31 +#define VF_MBX_ATQLEN1_ATQENABLE_M BIT(31) +#define VF_MBX_ATQT1 0x00008400 /* Reset Source: CORER */ +#define VF_MBX_ATQT1_ATQT_S 0 +#define VF_MBX_ATQT1_ATQT_M ICE_M(0x3FF, 0) +#define PFPCI_VF_FLUSH_DONE1 0x0000E400 /* Reset Source: PCIR */ +#define PFPCI_VF_FLUSH_DONE1_FLUSH_DONE_S 0 +#define PFPCI_VF_FLUSH_DONE1_FLUSH_DONE_M BIT(0) +#define VFGEN_RSTAT1 0x00008800 /* Reset Source: VFR */ +#define VFGEN_RSTAT1_VFR_STATE_S 0 +#define VFGEN_RSTAT1_VFR_STATE_M ICE_M(0x3, 0) +#define VFINT_DYN_CTL0 0x00005C00 /* Reset Source: CORER */ +#define VFINT_DYN_CTL0_INTENA_S 0 +#define VFINT_DYN_CTL0_INTENA_M BIT(0) +#define VFINT_DYN_CTL0_CLEARPBA_S 1 +#define VFINT_DYN_CTL0_CLEARPBA_M BIT(1) +#define VFINT_DYN_CTL0_SWINT_TRIG_S 2 +#define VFINT_DYN_CTL0_SWINT_TRIG_M BIT(2) +#define VFINT_DYN_CTL0_ITR_INDX_S 3 +#define VFINT_DYN_CTL0_ITR_INDX_M ICE_M(0x3, 3) +#define VFINT_DYN_CTL0_INTERVAL_S 5 +#define VFINT_DYN_CTL0_INTERVAL_M ICE_M(0xFFF, 5) +#define VFINT_DYN_CTL0_SW_ITR_INDX_ENA_S 24 +#define VFINT_DYN_CTL0_SW_ITR_INDX_ENA_M BIT(24) +#define VFINT_DYN_CTL0_SW_ITR_INDX_S 25 +#define VFINT_DYN_CTL0_SW_ITR_INDX_M ICE_M(0x3, 25) +#define VFINT_DYN_CTL0_WB_ON_ITR_S 30 +#define VFINT_DYN_CTL0_WB_ON_ITR_M BIT(30) +#define VFINT_DYN_CTL0_INTENA_MSK_S 31 +#define VFINT_DYN_CTL0_INTENA_MSK_M BIT(31) +#define VFINT_DYN_CTLN(_i) (0x00003800 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define VFINT_DYN_CTLN_MAX_INDEX 63 +#define VFINT_DYN_CTLN_INTENA_S 0 +#define VFINT_DYN_CTLN_INTENA_M BIT(0) +#define VFINT_DYN_CTLN_CLEARPBA_S 1 #define VFINT_DYN_CTLN_CLEARPBA_M BIT(1) -#define PRTRPB_RDPC 0x000AC260 +#define VFINT_DYN_CTLN_SWINT_TRIG_S 2 +#define VFINT_DYN_CTLN_SWINT_TRIG_M BIT(2) +#define VFINT_DYN_CTLN_ITR_INDX_S 3 +#define VFINT_DYN_CTLN_ITR_INDX_M ICE_M(0x3, 3) +#define VFINT_DYN_CTLN_INTERVAL_S 5 +#define VFINT_DYN_CTLN_INTERVAL_M ICE_M(0xFFF, 5) +#define VFINT_DYN_CTLN_SW_ITR_INDX_ENA_S 24 +#define VFINT_DYN_CTLN_SW_ITR_INDX_ENA_M BIT(24) +#define VFINT_DYN_CTLN_SW_ITR_INDX_S 25 +#define VFINT_DYN_CTLN_SW_ITR_INDX_M ICE_M(0x3, 25) +#define VFINT_DYN_CTLN_WB_ON_ITR_S 30 +#define VFINT_DYN_CTLN_WB_ON_ITR_M BIT(30) +#define VFINT_DYN_CTLN_INTENA_MSK_S 31 +#define VFINT_DYN_CTLN_INTENA_MSK_M BIT(31) +#define VFINT_ITR0(_i) (0x00004C00 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define VFINT_ITR0_MAX_INDEX 2 +#define VFINT_ITR0_INTERVAL_S 0 +#define VFINT_ITR0_INTERVAL_M ICE_M(0xFFF, 0) +#define VFINT_ITRN(_i, _j) (0x00002800 + ((_i) * 4 + (_j) * 12)) /* _i=0...2, _j=0...63 
*/ /* Reset Source: CORER */ +#define VFINT_ITRN_MAX_INDEX 2 +#define VFINT_ITRN_INTERVAL_S 0 +#define VFINT_ITRN_INTERVAL_M ICE_M(0xFFF, 0) +#define QRX_TAIL1(_QRX) (0x00002000 + ((_QRX) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define QRX_TAIL1_MAX_INDEX 255 +#define QRX_TAIL1_TAIL_S 0 +#define QRX_TAIL1_TAIL_M ICE_M(0x1FFF, 0) +#define QTX_TAIL(_DBQM) (0x00000000 + ((_DBQM) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define QTX_TAIL_MAX_INDEX 255 +#define QTX_TAIL_QTX_COMM_DBELL_S 0 +#define QTX_TAIL_QTX_COMM_DBELL_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_CPM_ARQBAH1 0x0000F060 /* Reset Source: CORER */ +#define VF_MBX_CPM_ARQBAH1_ARQBAH_S 0 +#define VF_MBX_CPM_ARQBAH1_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_CPM_ARQBAL1 0x0000F050 /* Reset Source: CORER */ +#define VF_MBX_CPM_ARQBAL1_ARQBAL_LSB_S 0 +#define VF_MBX_CPM_ARQBAL1_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_MBX_CPM_ARQBAL1_ARQBAL_S 6 +#define VF_MBX_CPM_ARQBAL1_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_CPM_ARQH1 0x0000F080 /* Reset Source: CORER */ +#define VF_MBX_CPM_ARQH1_ARQH_S 0 +#define VF_MBX_CPM_ARQH1_ARQH_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ARQLEN1 0x0000F070 /* Reset Source: PFR */ +#define VF_MBX_CPM_ARQLEN1_ARQLEN_S 0 +#define VF_MBX_CPM_ARQLEN1_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ARQLEN1_ARQVFE_S 28 +#define VF_MBX_CPM_ARQLEN1_ARQVFE_M BIT(28) +#define VF_MBX_CPM_ARQLEN1_ARQOVFL_S 29 +#define VF_MBX_CPM_ARQLEN1_ARQOVFL_M BIT(29) +#define VF_MBX_CPM_ARQLEN1_ARQCRIT_S 30 +#define VF_MBX_CPM_ARQLEN1_ARQCRIT_M BIT(30) +#define VF_MBX_CPM_ARQLEN1_ARQENABLE_S 31 +#define VF_MBX_CPM_ARQLEN1_ARQENABLE_M BIT(31) +#define VF_MBX_CPM_ARQT1 0x0000F090 /* Reset Source: CORER */ +#define VF_MBX_CPM_ARQT1_ARQT_S 0 +#define VF_MBX_CPM_ARQT1_ARQT_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ATQBAH1 0x0000F010 /* Reset Source: CORER */ +#define VF_MBX_CPM_ATQBAH1_ATQBAH_S 0 +#define VF_MBX_CPM_ATQBAH1_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_CPM_ATQBAL1 0x0000F000 /* Reset Source: CORER */ +#define VF_MBX_CPM_ATQBAL1_ATQBAL_S 6 +#define VF_MBX_CPM_ATQBAL1_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_CPM_ATQH1 0x0000F030 /* Reset Source: CORER */ +#define VF_MBX_CPM_ATQH1_ATQH_S 0 +#define VF_MBX_CPM_ATQH1_ATQH_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ATQLEN1 0x0000F020 /* Reset Source: PFR */ +#define VF_MBX_CPM_ATQLEN1_ATQLEN_S 0 +#define VF_MBX_CPM_ATQLEN1_ATQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ATQLEN1_ATQVFE_S 28 +#define VF_MBX_CPM_ATQLEN1_ATQVFE_M BIT(28) +#define VF_MBX_CPM_ATQLEN1_ATQOVFL_S 29 +#define VF_MBX_CPM_ATQLEN1_ATQOVFL_M BIT(29) +#define VF_MBX_CPM_ATQLEN1_ATQCRIT_S 30 +#define VF_MBX_CPM_ATQLEN1_ATQCRIT_M BIT(30) +#define VF_MBX_CPM_ATQLEN1_ATQENABLE_S 31 +#define VF_MBX_CPM_ATQLEN1_ATQENABLE_M BIT(31) +#define VF_MBX_CPM_ATQT1 0x0000F040 /* Reset Source: CORER */ +#define VF_MBX_CPM_ATQT1_ATQT_S 0 +#define VF_MBX_CPM_ATQT1_ATQT_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ARQBAH1 0x00020060 /* Reset Source: CORER */ +#define VF_MBX_HLP_ARQBAH1_ARQBAH_S 0 +#define VF_MBX_HLP_ARQBAH1_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_HLP_ARQBAL1 0x00020050 /* Reset Source: CORER */ +#define VF_MBX_HLP_ARQBAL1_ARQBAL_LSB_S 0 +#define VF_MBX_HLP_ARQBAL1_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_MBX_HLP_ARQBAL1_ARQBAL_S 6 +#define VF_MBX_HLP_ARQBAL1_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_HLP_ARQH1 0x00020080 /* Reset Source: CORER */ +#define VF_MBX_HLP_ARQH1_ARQH_S 0 +#define VF_MBX_HLP_ARQH1_ARQH_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ARQLEN1 0x00020070 /* Reset Source: PFR */ +#define 
VF_MBX_HLP_ARQLEN1_ARQLEN_S 0 +#define VF_MBX_HLP_ARQLEN1_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ARQLEN1_ARQVFE_S 28 +#define VF_MBX_HLP_ARQLEN1_ARQVFE_M BIT(28) +#define VF_MBX_HLP_ARQLEN1_ARQOVFL_S 29 +#define VF_MBX_HLP_ARQLEN1_ARQOVFL_M BIT(29) +#define VF_MBX_HLP_ARQLEN1_ARQCRIT_S 30 +#define VF_MBX_HLP_ARQLEN1_ARQCRIT_M BIT(30) +#define VF_MBX_HLP_ARQLEN1_ARQENABLE_S 31 +#define VF_MBX_HLP_ARQLEN1_ARQENABLE_M BIT(31) +#define VF_MBX_HLP_ARQT1 0x00020090 /* Reset Source: CORER */ +#define VF_MBX_HLP_ARQT1_ARQT_S 0 +#define VF_MBX_HLP_ARQT1_ARQT_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ATQBAH1 0x00020010 /* Reset Source: CORER */ +#define VF_MBX_HLP_ATQBAH1_ATQBAH_S 0 +#define VF_MBX_HLP_ATQBAH1_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_HLP_ATQBAL1 0x00020000 /* Reset Source: CORER */ +#define VF_MBX_HLP_ATQBAL1_ATQBAL_S 6 +#define VF_MBX_HLP_ATQBAL1_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_HLP_ATQH1 0x00020030 /* Reset Source: CORER */ +#define VF_MBX_HLP_ATQH1_ATQH_S 0 +#define VF_MBX_HLP_ATQH1_ATQH_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ATQLEN1 0x00020020 /* Reset Source: PFR */ +#define VF_MBX_HLP_ATQLEN1_ATQLEN_S 0 +#define VF_MBX_HLP_ATQLEN1_ATQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ATQLEN1_ATQVFE_S 28 +#define VF_MBX_HLP_ATQLEN1_ATQVFE_M BIT(28) +#define VF_MBX_HLP_ATQLEN1_ATQOVFL_S 29 +#define VF_MBX_HLP_ATQLEN1_ATQOVFL_M BIT(29) +#define VF_MBX_HLP_ATQLEN1_ATQCRIT_S 30 +#define VF_MBX_HLP_ATQLEN1_ATQCRIT_M BIT(30) +#define VF_MBX_HLP_ATQLEN1_ATQENABLE_S 31 +#define VF_MBX_HLP_ATQLEN1_ATQENABLE_M BIT(31) +#define VF_MBX_HLP_ATQT1 0x00020040 /* Reset Source: CORER */ +#define VF_MBX_HLP_ATQT1_ATQT_S 0 +#define VF_MBX_HLP_ATQT1_ATQT_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ARQBAH1 0x00021060 /* Reset Source: CORER */ +#define VF_MBX_PSM_ARQBAH1_ARQBAH_S 0 +#define VF_MBX_PSM_ARQBAH1_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_PSM_ARQBAL1 0x00021050 /* Reset Source: CORER */ +#define VF_MBX_PSM_ARQBAL1_ARQBAL_LSB_S 0 +#define VF_MBX_PSM_ARQBAL1_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_MBX_PSM_ARQBAL1_ARQBAL_S 6 +#define VF_MBX_PSM_ARQBAL1_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_PSM_ARQH1 0x00021080 /* Reset Source: CORER */ +#define VF_MBX_PSM_ARQH1_ARQH_S 0 +#define VF_MBX_PSM_ARQH1_ARQH_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ARQLEN1 0x00021070 /* Reset Source: PFR */ +#define VF_MBX_PSM_ARQLEN1_ARQLEN_S 0 +#define VF_MBX_PSM_ARQLEN1_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ARQLEN1_ARQVFE_S 28 +#define VF_MBX_PSM_ARQLEN1_ARQVFE_M BIT(28) +#define VF_MBX_PSM_ARQLEN1_ARQOVFL_S 29 +#define VF_MBX_PSM_ARQLEN1_ARQOVFL_M BIT(29) +#define VF_MBX_PSM_ARQLEN1_ARQCRIT_S 30 +#define VF_MBX_PSM_ARQLEN1_ARQCRIT_M BIT(30) +#define VF_MBX_PSM_ARQLEN1_ARQENABLE_S 31 +#define VF_MBX_PSM_ARQLEN1_ARQENABLE_M BIT(31) +#define VF_MBX_PSM_ARQT1 0x00021090 /* Reset Source: CORER */ +#define VF_MBX_PSM_ARQT1_ARQT_S 0 +#define VF_MBX_PSM_ARQT1_ARQT_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ATQBAH1 0x00021010 /* Reset Source: CORER */ +#define VF_MBX_PSM_ATQBAH1_ATQBAH_S 0 +#define VF_MBX_PSM_ATQBAH1_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_PSM_ATQBAL1 0x00021000 /* Reset Source: CORER */ +#define VF_MBX_PSM_ATQBAL1_ATQBAL_S 6 +#define VF_MBX_PSM_ATQBAL1_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_PSM_ATQH1 0x00021030 /* Reset Source: CORER */ +#define VF_MBX_PSM_ATQH1_ATQH_S 0 +#define VF_MBX_PSM_ATQH1_ATQH_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ATQLEN1 0x00021020 /* Reset Source: PFR */ +#define VF_MBX_PSM_ATQLEN1_ATQLEN_S 0 +#define VF_MBX_PSM_ATQLEN1_ATQLEN_M ICE_M(0x3FF, 0) 
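The mailbox *_ATQLEN*/*_ARQLEN* registers above all share one layout: the ring length sits in bits 0..9 and the VFE/OVFL/CRIT/ENABLE status and enable flags occupy bits 28..31. Below is a hedged, standalone sketch of how such a value might be composed; the field macros are copied from the VF_MBX_ATQLEN1 definitions earlier in this hunk, ICE_M()/BIT() are local stand-ins assumed to match the driver's helpers, and the ring length of 64 plus the printf harness are arbitrary illustrative choices rather than driver behavior.

/* Standalone sketch: composing a mailbox ATQLEN value from the ring length
 * and the enable flag. Field macros are copied from the VF_MBX_ATQLEN1
 * definitions above; ICE_M()/BIT() are assumed stand-ins for the driver's
 * helpers, and the value is only printed, not written to hardware.
 */
#include <stdint.h>
#include <stdio.h>

#define BIT(n)				(1UL << (n))	/* assumed */
#define ICE_M(m, s)			((m) << (s))	/* assumed, per ice_osdep.h */

#define VF_MBX_ATQLEN1_ATQLEN_S		0
#define VF_MBX_ATQLEN1_ATQLEN_M		ICE_M(0x3FF, 0)
#define VF_MBX_ATQLEN1_ATQENABLE_M	BIT(31)

int main(void)
{
	uint32_t ring_len = 64;		/* illustrative admin queue depth */
	uint32_t val;

	/* Ring length in bits 0..9, enable bit on top; the VFE/OVFL/CRIT
	 * status bits (28..30) are left clear.
	 */
	val  = (ring_len << VF_MBX_ATQLEN1_ATQLEN_S) & VF_MBX_ATQLEN1_ATQLEN_M;
	val |= VF_MBX_ATQLEN1_ATQENABLE_M;

	printf("ATQLEN1 = 0x%08x\n", (unsigned int)val);	/* 0x80000040 */
	return 0;
}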
+#define VF_MBX_PSM_ATQLEN1_ATQVFE_S 28 +#define VF_MBX_PSM_ATQLEN1_ATQVFE_M BIT(28) +#define VF_MBX_PSM_ATQLEN1_ATQOVFL_S 29 +#define VF_MBX_PSM_ATQLEN1_ATQOVFL_M BIT(29) +#define VF_MBX_PSM_ATQLEN1_ATQCRIT_S 30 +#define VF_MBX_PSM_ATQLEN1_ATQCRIT_M BIT(30) +#define VF_MBX_PSM_ATQLEN1_ATQENABLE_S 31 +#define VF_MBX_PSM_ATQLEN1_ATQENABLE_M BIT(31) +#define VF_MBX_PSM_ATQT1 0x00021040 /* Reset Source: CORER */ +#define VF_MBX_PSM_ATQT1_ATQT_S 0 +#define VF_MBX_PSM_ATQT1_ATQT_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ARQBAH1 0x0000F160 /* Reset Source: CORER */ +#define VF_SB_CPM_ARQBAH1_ARQBAH_S 0 +#define VF_SB_CPM_ARQBAH1_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_SB_CPM_ARQBAL1 0x0000F150 /* Reset Source: CORER */ +#define VF_SB_CPM_ARQBAL1_ARQBAL_LSB_S 0 +#define VF_SB_CPM_ARQBAL1_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_SB_CPM_ARQBAL1_ARQBAL_S 6 +#define VF_SB_CPM_ARQBAL1_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_SB_CPM_ARQH1 0x0000F180 /* Reset Source: CORER */ +#define VF_SB_CPM_ARQH1_ARQH_S 0 +#define VF_SB_CPM_ARQH1_ARQH_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ARQLEN1 0x0000F170 /* Reset Source: PFR */ +#define VF_SB_CPM_ARQLEN1_ARQLEN_S 0 +#define VF_SB_CPM_ARQLEN1_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ARQLEN1_ARQVFE_S 28 +#define VF_SB_CPM_ARQLEN1_ARQVFE_M BIT(28) +#define VF_SB_CPM_ARQLEN1_ARQOVFL_S 29 +#define VF_SB_CPM_ARQLEN1_ARQOVFL_M BIT(29) +#define VF_SB_CPM_ARQLEN1_ARQCRIT_S 30 +#define VF_SB_CPM_ARQLEN1_ARQCRIT_M BIT(30) +#define VF_SB_CPM_ARQLEN1_ARQENABLE_S 31 +#define VF_SB_CPM_ARQLEN1_ARQENABLE_M BIT(31) +#define VF_SB_CPM_ARQT1 0x0000F190 /* Reset Source: CORER */ +#define VF_SB_CPM_ARQT1_ARQT_S 0 +#define VF_SB_CPM_ARQT1_ARQT_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ATQBAH1 0x0000F110 /* Reset Source: CORER */ +#define VF_SB_CPM_ATQBAH1_ATQBAH_S 0 +#define VF_SB_CPM_ATQBAH1_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_SB_CPM_ATQBAL1 0x0000F100 /* Reset Source: CORER */ +#define VF_SB_CPM_ATQBAL1_ATQBAL_S 6 +#define VF_SB_CPM_ATQBAL1_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_SB_CPM_ATQH1 0x0000F130 /* Reset Source: CORER */ +#define VF_SB_CPM_ATQH1_ATQH_S 0 +#define VF_SB_CPM_ATQH1_ATQH_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ATQLEN1 0x0000F120 /* Reset Source: PFR */ +#define VF_SB_CPM_ATQLEN1_ATQLEN_S 0 +#define VF_SB_CPM_ATQLEN1_ATQLEN_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ATQLEN1_ATQVFE_S 28 +#define VF_SB_CPM_ATQLEN1_ATQVFE_M BIT(28) +#define VF_SB_CPM_ATQLEN1_ATQOVFL_S 29 +#define VF_SB_CPM_ATQLEN1_ATQOVFL_M BIT(29) +#define VF_SB_CPM_ATQLEN1_ATQCRIT_S 30 +#define VF_SB_CPM_ATQLEN1_ATQCRIT_M BIT(30) +#define VF_SB_CPM_ATQLEN1_ATQENABLE_S 31 +#define VF_SB_CPM_ATQLEN1_ATQENABLE_M BIT(31) +#define VF_SB_CPM_ATQT1 0x0000F140 /* Reset Source: CORER */ +#define VF_SB_CPM_ATQT1_ATQT_S 0 +#define VF_SB_CPM_ATQT1_ATQT_M ICE_M(0x3FF, 0) +#define VFINT_DYN_CTL(_i) (0x00023000 + ((_i) * 4096)) /* _i=0...7 */ /* Reset Source: CORER */ +#define VFINT_DYN_CTL_MAX_INDEX 7 +#define VFINT_DYN_CTL_INTENA_S 0 +#define VFINT_DYN_CTL_INTENA_M BIT(0) +#define VFINT_DYN_CTL_CLEARPBA_S 1 +#define VFINT_DYN_CTL_CLEARPBA_M BIT(1) +#define VFINT_DYN_CTL_SWINT_TRIG_S 2 +#define VFINT_DYN_CTL_SWINT_TRIG_M BIT(2) +#define VFINT_DYN_CTL_ITR_INDX_S 3 +#define VFINT_DYN_CTL_ITR_INDX_M ICE_M(0x3, 3) +#define VFINT_DYN_CTL_INTERVAL_S 5 +#define VFINT_DYN_CTL_INTERVAL_M ICE_M(0xFFF, 5) +#define VFINT_DYN_CTL_SW_ITR_INDX_ENA_S 24 +#define VFINT_DYN_CTL_SW_ITR_INDX_ENA_M BIT(24) +#define VFINT_DYN_CTL_SW_ITR_INDX_S 25 +#define VFINT_DYN_CTL_SW_ITR_INDX_M ICE_M(0x3, 25) +#define VFINT_DYN_CTL_WB_ON_ITR_S 30 
+#define VFINT_DYN_CTL_WB_ON_ITR_M BIT(30) +#define VFINT_DYN_CTL_INTENA_MSK_S 31 +#define VFINT_DYN_CTL_INTENA_MSK_M BIT(31) +#define VFINT_ITR_0(_i) (0x00023004 + ((_i) * 4096)) /* _i=0...7 */ /* Reset Source: CORER */ +#define VFINT_ITR_0_MAX_INDEX 7 +#define VFINT_ITR_0_INTERVAL_S 0 +#define VFINT_ITR_0_INTERVAL_M ICE_M(0xFFF, 0) +#define VFINT_ITR_1(_i) (0x00023008 + ((_i) * 4096)) /* _i=0...7 */ /* Reset Source: CORER */ +#define VFINT_ITR_1_MAX_INDEX 7 +#define VFINT_ITR_1_INTERVAL_S 0 +#define VFINT_ITR_1_INTERVAL_M ICE_M(0xFFF, 0) +#define VFINT_ITR_2(_i) (0x0002300C + ((_i) * 4096)) /* _i=0...7 */ /* Reset Source: CORER */ +#define VFINT_ITR_2_MAX_INDEX 7 +#define VFINT_ITR_2_INTERVAL_S 0 +#define VFINT_ITR_2_INTERVAL_M ICE_M(0xFFF, 0) +#define VFQRX_TAIL(_QRX) (0x0002E000 + ((_QRX) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VFQRX_TAIL_MAX_INDEX 255 +#define VFQRX_TAIL_TAIL_S 0 +#define VFQRX_TAIL_TAIL_M ICE_M(0x1FFF, 0) +#define VFQTX_COMM_DBELL(_DBQM) (0x00030000 + ((_DBQM) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VFQTX_COMM_DBELL_MAX_INDEX 255 +#define VFQTX_COMM_DBELL_QTX_COMM_DBELL_S 0 +#define VFQTX_COMM_DBELL_QTX_COMM_DBELL_M ICE_M(0xFFFFFFFF, 0) +#define VFQTX_COMM_DBLQ_DBELL(_DBLQ) (0x00022000 + ((_DBLQ) * 4)) /* _i=0...3 */ /* Reset Source: CORER */ +#define VFQTX_COMM_DBLQ_DBELL_MAX_INDEX 3 +#define VFQTX_COMM_DBLQ_DBELL_TAIL_S 0 +#define VFQTX_COMM_DBLQ_DBELL_TAIL_M ICE_M(0x1FFF, 0) +#define MSIX_TMSG1(_i) (0x00000008 + ((_i) * 16)) /* _i=0...64 */ /* Reset Source: FLR */ +#define MSIX_TMSG1_MAX_INDEX 64 +#define MSIX_TMSG1_MSIXTMSG_S 0 +#define MSIX_TMSG1_MSIXTMSG_M ICE_M(0xFFFFFFFF, 0) +#define VFPE_AEQALLOC1 0x0000A400 /* Reset Source: VFR */ +#define VFPE_AEQALLOC1_AECOUNT_S 0 +#define VFPE_AEQALLOC1_AECOUNT_M ICE_M(0xFFFFFFFF, 0) +#define VFPE_CCQPHIGH1 0x00009800 /* Reset Source: VFR */ +#define VFPE_CCQPHIGH1_PECCQPHIGH_S 0 +#define VFPE_CCQPHIGH1_PECCQPHIGH_M ICE_M(0xFFFFFFFF, 0) +#define VFPE_CCQPLOW1 0x0000AC00 /* Reset Source: VFR */ +#define VFPE_CCQPLOW1_PECCQPLOW_S 0 +#define VFPE_CCQPLOW1_PECCQPLOW_M ICE_M(0xFFFFFFFF, 0) +#define VFPE_CCQPSTATUS1 0x0000B800 /* Reset Source: VFR */ +#define VFPE_CCQPSTATUS1_CCQP_DONE_S 0 +#define VFPE_CCQPSTATUS1_CCQP_DONE_M BIT(0) +#define VFPE_CCQPSTATUS1_HMC_PROFILE_S 4 +#define VFPE_CCQPSTATUS1_HMC_PROFILE_M ICE_M(0x7, 4) +#define VFPE_CCQPSTATUS1_RDMA_EN_VFS_S 16 +#define VFPE_CCQPSTATUS1_RDMA_EN_VFS_M ICE_M(0x3F, 16) +#define VFPE_CCQPSTATUS1_CCQP_ERR_S 31 +#define VFPE_CCQPSTATUS1_CCQP_ERR_M BIT(31) +#define VFPE_CQACK1 0x0000B000 /* Reset Source: VFR */ +#define VFPE_CQACK1_PECQID_S 0 +#define VFPE_CQACK1_PECQID_M ICE_M(0x7FFFF, 0) +#define VFPE_CQARM1 0x0000B400 /* Reset Source: VFR */ +#define VFPE_CQARM1_PECQID_S 0 +#define VFPE_CQARM1_PECQID_M ICE_M(0x7FFFF, 0) +#define VFPE_CQPDB1 0x0000BC00 /* Reset Source: VFR */ +#define VFPE_CQPDB1_WQHEAD_S 0 +#define VFPE_CQPDB1_WQHEAD_M ICE_M(0x7FF, 0) +#define VFPE_CQPERRCODES1 0x00009C00 /* Reset Source: VFR */ +#define VFPE_CQPERRCODES1_CQP_MINOR_CODE_S 0 +#define VFPE_CQPERRCODES1_CQP_MINOR_CODE_M ICE_M(0xFFFF, 0) +#define VFPE_CQPERRCODES1_CQP_MAJOR_CODE_S 16 +#define VFPE_CQPERRCODES1_CQP_MAJOR_CODE_M ICE_M(0xFFFF, 16) +#define VFPE_CQPTAIL1 0x0000A000 /* Reset Source: VFR */ +#define VFPE_CQPTAIL1_WQTAIL_S 0 +#define VFPE_CQPTAIL1_WQTAIL_M ICE_M(0x7FF, 0) +#define VFPE_CQPTAIL1_CQP_OP_ERR_S 31 +#define VFPE_CQPTAIL1_CQP_OP_ERR_M BIT(31) +#define VFPE_IPCONFIG01 0x00008C00 /* Reset Source: VFR */ +#define VFPE_IPCONFIG01_PEIPID_S 0 
+#define VFPE_IPCONFIG01_PEIPID_M ICE_M(0xFFFF, 0) +#define VFPE_IPCONFIG01_USEENTIREIDRANGE_S 16 +#define VFPE_IPCONFIG01_USEENTIREIDRANGE_M BIT(16) +#define VFPE_IPCONFIG01_UDP_SRC_PORT_MASK_EN_S 17 +#define VFPE_IPCONFIG01_UDP_SRC_PORT_MASK_EN_M BIT(17) +#define VFPE_MRTEIDXMASK1(_VF) (0x00509800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define VFPE_MRTEIDXMASK1_MAX_INDEX 255 +#define VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_S 0 +#define VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_M ICE_M(0x1F, 0) +#define VFPE_RCVUNEXPECTEDERROR1 0x00009400 /* Reset Source: VFR */ +#define VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_S 0 +#define VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_M ICE_M(0xFFFFFF, 0) +#define VFPE_TCPNOWTIMER1 0x0000A800 /* Reset Source: VFR */ +#define VFPE_TCPNOWTIMER1_TCP_NOW_S 0 +#define VFPE_TCPNOWTIMER1_TCP_NOW_M ICE_M(0xFFFFFFFF, 0) +#define VFPE_WQEALLOC1 0x0000C000 /* Reset Source: VFR */ +#define VFPE_WQEALLOC1_PEQPID_S 0 +#define VFPE_WQEALLOC1_PEQPID_M ICE_M(0x3FFFF, 0) +#define VFPE_WQEALLOC1_WQE_DESC_INDEX_S 20 +#define VFPE_WQEALLOC1_WQE_DESC_INDEX_M ICE_M(0xFFF, 20) -#endif /* _ICE_HW_AUTOGEN_H_ */ +#endif diff --git a/drivers/net/ethernet/intel/ice/ice_idc.c b/drivers/net/ethernet/intel/ice/ice_idc.c new file mode 100644 index 000000000000..512bca2f98d0 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_idc.c @@ -0,0 +1,1488 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +/* Inter-Driver Communication */ +#include "ice.h" +#include "ice_lib.h" +#include "ice_fltr.h" +#include "ice_dcb_lib.h" +#include "ice_ptp.h" + +DEFINE_IDA(ice_peer_index_ida); + + + +static struct mfd_cell ice_mfd_cells[] = ASSIGN_PEER_INFO; + +/** + * ice_is_vsi_state_nominal + * @vsi: pointer to the VSI struct + * + * returns true if VSI state is nominal, false otherwise + */ +static bool ice_is_vsi_state_nominal(struct ice_vsi *vsi) +{ + if (!vsi) + return false; + + if (test_bit(ICE_VSI_DOWN, vsi->state) || + test_bit(ICE_VSI_NEEDS_RESTART, vsi->state)) + return false; + + return true; +} + + +/** + * ice_peer_state_change - manage state machine for peer + * @peer_obj: pointer to peer's configuration + * @new_state: the state requested to transition into + * @locked: boolean to determine if call made with mutex held + * + * This function handles all state transitions for peer objects. + * + * The state machine is as follows: + * + * +<-----------------------+<-----------------------------+ + * |<-------+<----------+ + + * \/ + + + + * INIT --------------> PROBED --> OPENING CLOSED --> REMOVED + * + + + * OPENED --> CLOSING + * + + + * PREP_RST + + * + + + * PREPPED + + * +---------->+ + * + * NOTE: there is an error condition that can take a peer from OPENING + * to REMOVED. 
+ */ +static void +ice_peer_state_change(struct ice_peer_obj_int *peer_obj, long new_state, + bool locked) +{ + struct device *dev; + + dev = bus_find_device_by_name(&platform_bus_type, NULL, + peer_obj->plat_name); + + if (!locked) + mutex_lock(&peer_obj->peer_obj_state_mutex); + + switch (new_state) { + case ICE_PEER_OBJ_STATE_INIT: + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_REMOVED, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_INIT, peer_obj->state); + dev_dbg(dev, "state transition from _REMOVED to _INIT\n"); + } else { + set_bit(ICE_PEER_OBJ_STATE_INIT, peer_obj->state); + if (dev) + dev_dbg(dev, "state set to _INIT\n"); + } + break; + case ICE_PEER_OBJ_STATE_PROBED: + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_INIT, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_PROBED, peer_obj->state); + dev_dbg(dev, "state transition from _INIT to _PROBED\n"); + } else if (test_and_clear_bit(ICE_PEER_OBJ_STATE_REMOVED, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_PROBED, peer_obj->state); + dev_dbg(dev, "state transition from _REMOVED to _PROBED\n"); + } else if (test_and_clear_bit(ICE_PEER_OBJ_STATE_OPENING, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_PROBED, peer_obj->state); + dev_dbg(dev, "state transition from _OPENING to _PROBED\n"); + } + break; + case ICE_PEER_OBJ_STATE_OPENING: + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_PROBED, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_OPENING, peer_obj->state); + dev_dbg(dev, "state transition from _PROBED to _OPENING\n"); + } else if (test_and_clear_bit(ICE_PEER_OBJ_STATE_CLOSED, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_OPENING, peer_obj->state); + dev_dbg(dev, "state transition from _CLOSED to _OPENING\n"); + } + break; + case ICE_PEER_OBJ_STATE_OPENED: + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_OPENING, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_OPENED, peer_obj->state); + dev_dbg(dev, "state transition from _OPENING to _OPENED\n"); + } + break; + case ICE_PEER_OBJ_STATE_PREP_RST: + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_OPENED, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_PREP_RST, peer_obj->state); + dev_dbg(dev, "state transition from _OPENED to _PREP_RST\n"); + } + break; + case ICE_PEER_OBJ_STATE_PREPPED: + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_PREP_RST, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_PREPPED, peer_obj->state); + dev_dbg(dev, "state transition _PREP_RST to _PREPPED\n"); + } + break; + case ICE_PEER_OBJ_STATE_CLOSING: + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_OPENED, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_CLOSING, peer_obj->state); + dev_dbg(dev, "state transition from _OPENED to _CLOSING\n"); + } + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_PREPPED, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_CLOSING, peer_obj->state); + dev_dbg(dev, "state transition _PREPPED to _CLOSING\n"); + } + /* NOTE - up to peer to handle this situation correctly */ + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_PREP_RST, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_CLOSING, peer_obj->state); + dev_warn(dev, + "WARN: Peer state _PREP_RST to _CLOSING\n"); + } + break; + case ICE_PEER_OBJ_STATE_CLOSED: + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_CLOSING, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_CLOSED, peer_obj->state); + dev_dbg(dev, "state transition from _CLOSING to _CLOSED\n"); + } + break; + case ICE_PEER_OBJ_STATE_REMOVED: + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_OPENED, + peer_obj->state) || + test_and_clear_bit(ICE_PEER_OBJ_STATE_CLOSED, + 
peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_REMOVED, peer_obj->state); + dev_dbg(dev, "state from _OPENED/_CLOSED to _REMOVED\n"); + /* Clear registration for events when peer removed */ + bitmap_zero(peer_obj->events, ICE_PEER_OBJ_STATE_NBITS); + } + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_OPENING, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_REMOVED, peer_obj->state); + dev_warn(dev, "Peer failed to open, set to _REMOVED"); + } + break; + default: + break; + } + + if (!locked) + mutex_unlock(&peer_obj->peer_obj_state_mutex); + + put_device(dev); +} + +/** + * ice_peer_close - close a peer object + * @peer_obj_int: peer object to close + * @data: pointer to opaque data + * + * This function will also set the state bit for the peer to CLOSED. This + * function is meant to be called from a ice_for_each_peer(). + */ +int ice_peer_close(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + enum ice_close_reason reason = *(enum ice_close_reason *)(data); + struct ice_peer_obj *peer_obj; + struct ice_pf *pf; + int i; + + peer_obj = ice_get_peer_obj(peer_obj_int); + /* return 0 so ice_for_each_peer will continue closing other peers */ + if (!ice_validate_peer_obj(peer_obj)) + return 0; + pf = pci_get_drvdata(peer_obj->pdev); + + if (test_bit(ICE_DOWN, pf->state) || + test_bit(ICE_SUSPENDED, pf->state) || + test_bit(ICE_NEEDS_RESTART, pf->state)) + return 0; + + mutex_lock(&peer_obj_int->peer_obj_state_mutex); + + /* no peer driver, already closed, closing or opening nothing to do */ + if (test_bit(ICE_PEER_OBJ_STATE_CLOSED, peer_obj_int->state) || + test_bit(ICE_PEER_OBJ_STATE_CLOSING, peer_obj_int->state) || + test_bit(ICE_PEER_OBJ_STATE_OPENING, peer_obj_int->state) || + test_bit(ICE_PEER_OBJ_STATE_PROBED, peer_obj_int->state) || + test_bit(ICE_PEER_OBJ_STATE_REMOVED, peer_obj_int->state)) + goto peer_close_out; + + /* Set the peer state to CLOSING */ + ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_CLOSING, true); + + for (i = 0; i < ICE_EVENT_NBITS; i++) + bitmap_zero(peer_obj_int->current_events[i].type, + ICE_EVENT_NBITS); + + if (peer_obj->peer_ops && peer_obj->peer_ops->close) + peer_obj->peer_ops->close(peer_obj, reason); + + /* Set the peer state to CLOSED */ + ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_CLOSED, true); + +peer_close_out: + mutex_unlock(&peer_obj_int->peer_obj_state_mutex); + + return 0; +} + +/** + * ice_close_peer_for_reset - queue work to close peer for reset + * @peer_obj_int: pointer peer object internal struct + * @data: pointer to opaque data used for reset type + */ +int ice_close_peer_for_reset(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + struct ice_peer_obj *peer_obj; + enum ice_reset_req reset; + + peer_obj = ice_get_peer_obj(peer_obj_int); + if (!ice_validate_peer_obj(peer_obj) || + (!test_bit(ICE_PEER_OBJ_STATE_OPENED, peer_obj_int->state) && + !test_bit(ICE_PEER_OBJ_STATE_PREPPED, peer_obj_int->state))) + return 0; + + reset = *(enum ice_reset_req *)data; + + switch (reset) { + case ICE_RESET_EMPR: + peer_obj_int->rst_type = ICE_REASON_EMPR_REQ; + break; + case ICE_RESET_GLOBR: + peer_obj_int->rst_type = ICE_REASON_GLOBR_REQ; + break; + case ICE_RESET_CORER: + peer_obj_int->rst_type = ICE_REASON_CORER_REQ; + break; + case ICE_RESET_PFR: + peer_obj_int->rst_type = ICE_REASON_PFR_REQ; + break; + default: + /* reset type is invalid */ + return 1; + } + queue_work(peer_obj_int->ice_peer_wq, &peer_obj_int->peer_close_task); + return 0; +} + +/** + * ice_check_peer_drv_for_events - check peer_drv for events to report 
+ * @peer_obj: peer object to report to + */ +static void ice_check_peer_drv_for_events(struct ice_peer_obj *peer_obj) +{ + const struct ice_peer_ops *p_ops = peer_obj->peer_ops; + struct ice_peer_obj_int *peer_obj_int; + struct ice_peer_drv_int *peer_drv_int; + int i; + + peer_obj_int = peer_to_ice_obj_int(peer_obj); + if (!peer_obj_int) + return; + peer_drv_int = peer_obj_int->peer_drv_int; + + for_each_set_bit(i, peer_obj_int->events, ICE_EVENT_NBITS) { + struct ice_event *curr = &peer_drv_int->current_events[i]; + + if (!bitmap_empty(curr->type, ICE_EVENT_NBITS) && + p_ops->event_handler) + p_ops->event_handler(peer_obj, curr); + } +} + +/** + * ice_check_peer_for_events - check peer_objs for events new peer reg'd for + * @src_peer_int: peer to check for events + * @data: ptr to opaque data, to be used for the peer struct that opened + * + * This function is to be called when a peer object is opened. + * + * Since a new peer opening would have missed any events that would + * have happened before its opening, we need to walk the peers and see + * if any of them have events that the new peer cares about + * + * This function is meant to be called by a ice_for_each_peer. + */ +static int +ice_check_peer_for_events(struct ice_peer_obj_int *src_peer_int, void *data) +{ + struct ice_peer_obj *new_peer = (struct ice_peer_obj *)data; + const struct ice_peer_ops *p_ops = new_peer->peer_ops; + struct ice_peer_obj_int *new_peer_int; + struct ice_peer_obj *src_peer; + unsigned long i; + + src_peer = ice_get_peer_obj(src_peer_int); + if (!ice_validate_peer_obj(new_peer) || + !ice_validate_peer_obj(src_peer)) + return 0; + + new_peer_int = peer_to_ice_obj_int(new_peer); + + for_each_set_bit(i, new_peer_int->events, ICE_EVENT_NBITS) { + struct ice_event *curr = &src_peer_int->current_events[i]; + + if (!bitmap_empty(curr->type, ICE_EVENT_NBITS) && + new_peer->peer_obj_id != src_peer->peer_obj_id && + p_ops->event_handler) + p_ops->event_handler(new_peer, curr); + } + + return 0; +} + +/** + * ice_for_each_peer - iterate across and call function for each peer obj + * @pf: pointer to private board struct + * @data: data to pass to function on each call + * @fn: pointer to function to call for each peer + */ +int +ice_for_each_peer(struct ice_pf *pf, void *data, + int (*fn)(struct ice_peer_obj_int *, void *)) +{ + unsigned int i; + + if (!pf->peers) + return 0; + + for (i = 0; i < ARRAY_SIZE(ice_mfd_cells); i++) { + struct ice_peer_obj_int *peer_obj_int; + + peer_obj_int = pf->peers[i]; + if (peer_obj_int) { + int ret = fn(peer_obj_int, data); + + if (ret) + return ret; + } + } + + return 0; +} + +/** + * ice_finish_init_peer_obj - complete peer object initialization + * @peer_obj_int: ptr to peer object internal struct + * @data: ptr to opaque data + * + * This function completes remaining initialization of peer objects + */ +int +ice_finish_init_peer_obj(struct ice_peer_obj_int *peer_obj_int, + void __always_unused *data) +{ + struct ice_peer_obj *peer_obj; + struct ice_peer_drv *peer_drv; + struct device *dev; + struct ice_pf *pf; + int ret = 0; + + peer_obj = ice_get_peer_obj(peer_obj_int); + /* peer_obj will not always be populated at the time of this check */ + if (!ice_validate_peer_obj(peer_obj)) + return ret; + + peer_drv = peer_obj->peer_drv; + pf = pci_get_drvdata(peer_obj->pdev); + dev = ice_pf_to_dev(pf); + /* There will be several assessments of the peer_obj's state in this + * chunk of logic. 
We need to hold the peer_obj_int's state mutex + * for the entire part so that the flow progresses without another + * context changing things mid-flow + */ + mutex_lock(&peer_obj_int->peer_obj_state_mutex); + + if (!peer_obj->peer_ops) { + dev_err(dev, "peer_ops not defined in peer obj\n"); + goto init_unlock; + } + + if (!peer_obj->peer_ops->open) { + dev_err(dev, "peer_ops:open not defined in peer obj\n"); + goto init_unlock; + } + + if (!peer_obj->peer_ops->close) { + dev_err(dev, "peer_ops:close not defined in peer obj\n"); + goto init_unlock; + } + + /* Peer driver expected to set driver_id during registration */ + if (!peer_drv->driver_id) { + dev_err(dev, "Peer driver did not set driver_id\n"); + goto init_unlock; + } + + if ((test_bit(ICE_PEER_OBJ_STATE_CLOSED, peer_obj_int->state) || + test_bit(ICE_PEER_OBJ_STATE_PROBED, peer_obj_int->state)) && + ice_pf_state_is_nominal(pf)) { + /* If the RTNL is locked, we defer opening the peer + * until the next time this function is called by the + * service task. + */ + if (rtnl_is_locked()) + goto init_unlock; + ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_OPENING, + true); + ret = peer_obj->peer_ops->open(peer_obj); + if (ret == -EAGAIN) { + dev_err(dev, "Peer %d failed to open\n", + peer_obj->peer_obj_id); + ice_peer_state_change(peer_obj_int, + ICE_PEER_OBJ_STATE_PROBED, true); + goto init_unlock; + } else if (ret) { + ice_peer_state_change(peer_obj_int, + ICE_PEER_OBJ_STATE_REMOVED, true); + peer_obj->peer_ops = NULL; + goto init_unlock; + } + + ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_OPENED, + true); + ret = ice_for_each_peer(pf, peer_obj, + ice_check_peer_for_events); + ice_check_peer_drv_for_events(peer_obj); + } + + if (test_bit(ICE_PEER_OBJ_STATE_PREPPED, peer_obj_int->state)) { + enum ice_close_reason reason = ICE_REASON_CORER_REQ; + int i; + + ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_CLOSING, + true); + for (i = 0; i < ICE_EVENT_NBITS; i++) + bitmap_zero(peer_obj_int->current_events[i].type, + ICE_EVENT_NBITS); + + peer_obj->peer_ops->close(peer_obj, reason); + + ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_CLOSED, + true); + } + +init_unlock: + mutex_unlock(&peer_obj_int->peer_obj_state_mutex); + + return ret; +} + +/** + * ice_unreg_peer_obj - unregister specified peer object + * @peer_obj_int: ptr to peer object internal + * @data: ptr to opaque data + * + * This function invokes object unregistration, removes ID associated with + * the specified object. 
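+ *
+ * A minimal teardown sketch (illustrative only; the exact call site and
+ * close reason used in the driver's remove path may differ):
+ *
+ *	enum ice_close_reason reason = ICE_REASON_PEER_OBJ_UNINIT;
+ *
+ *	ice_for_each_peer(pf, &reason, ice_peer_close);
+ *	ice_for_each_peer(pf, NULL, ice_unreg_peer_obj);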
+ */ +int ice_unreg_peer_obj(struct ice_peer_obj_int *peer_obj_int, void __always_unused *data) +{ + struct ice_peer_drv_int *peer_drv_int; + struct ice_peer_obj *peer_obj; + struct pci_dev *pdev; + struct device *dev; + struct ice_pf *pf; + + if (!peer_obj_int) + return 0; + + peer_obj = ice_get_peer_obj(peer_obj_int); + pdev = peer_obj->pdev; + if (!pdev) + return 0; + + pf = pci_get_drvdata(pdev); + if (!pf) + return 0; + dev = ice_pf_to_dev(pf); + + mfd_remove_devices(&pdev->dev); + + peer_drv_int = peer_obj_int->peer_drv_int; + + if (peer_obj_int->ice_peer_wq) { + if (peer_obj_int->peer_prep_task.func) + cancel_work_sync(&peer_obj_int->peer_prep_task); + + if (peer_obj_int->peer_close_task.func) + cancel_work_sync(&peer_obj_int->peer_close_task); + destroy_workqueue(peer_obj_int->ice_peer_wq); + } + + devm_kfree(dev, peer_drv_int); + + devm_kfree(dev, peer_obj_int); + + return 0; +} + +/** + * ice_unroll_peer - destroy peers and peer_wq in case of error + * @peer_obj_int: ptr to peer object internal struct + * @data: ptr to opaque data + * + * This function releases resources in the event of a failure in creating + * peer objects or their individual work_queues. Meant to be called from + * a ice_for_each_peer invocation + */ +int ice_unroll_peer(struct ice_peer_obj_int *peer_obj_int, void __always_unused *data) +{ + struct ice_peer_obj *peer_obj; + struct ice_pf *pf; + + peer_obj = ice_get_peer_obj(peer_obj_int); + if (!peer_obj || !peer_obj->pdev) + return 0; + + pf = pci_get_drvdata(peer_obj->pdev); + if (!pf) + return 0; + + if (peer_obj_int->ice_peer_wq) + destroy_workqueue(peer_obj_int->ice_peer_wq); + + if (peer_obj_int->peer_drv_int) + devm_kfree(ice_pf_to_dev(pf), peer_obj_int->peer_drv_int); + + devm_kfree(ice_pf_to_dev(pf), peer_obj_int); + + return 0; +} + +#ifdef CONFIG_PM +/** + * ice_peer_refresh_msix - load new values into ice_peer_obj structs + * @pf: pointer to private board struct + */ +void ice_peer_refresh_msix(struct ice_pf *pf) +{ + struct ice_peer_obj *peer; + unsigned int i; + + if (!pf->peers) + return; + + for (i = 0; i < ARRAY_SIZE(ice_mfd_cells); i++) { + if (!pf->peers[i]) + continue; + + peer = ice_get_peer_obj(pf->peers[i]); + if (!peer) + continue; + + switch (peer->peer_obj_id) { + case ICE_PEER_RDMA_ID: + peer->msix_count = pf->num_rdma_msix; + peer->msix_entries = &pf->msix_entries[pf->rdma_base_vector]; + break; + default: + break; + } + } +} + +#endif /* CONFIG_PM */ +/** + * ice_find_vsi - Find the VSI from VSI ID + * @pf: The PF pointer to search in + * @vsi_num: The VSI ID to search for + */ +static struct ice_vsi *ice_find_vsi(struct ice_pf *pf, u16 vsi_num) +{ + int i; + + ice_for_each_vsi(pf, i) + if (pf->vsi[i] && pf->vsi[i]->vsi_num == vsi_num) + return pf->vsi[i]; + return NULL; +} + +/** + * ice_peer_alloc_rdma_qsets - Allocate Leaf Nodes for RDMA Qset + * @peer_obj: peer that is requesting the Leaf Nodes + * @res: Resources to be allocated + * @partial_acceptable: If partial allocation is acceptable to the peer + * + * This function allocates Leaf Nodes for given RDMA Qset resources + * for the peer object. 
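+ *
+ * A minimal sketch of how an RDMA peer might request a single Qset through
+ * ops->alloc_res() (my_qs_handle and err are hypothetical peer-side names;
+ * for more than one Qset the peer must allocate a large enough ice_res):
+ *
+ *	struct ice_res res = { 0 };
+ *	int err;
+ *
+ *	res.res_type = ICE_RDMA_QSETS_TXSCHED;
+ *	res.cnt_req = 1;
+ *	res.res[0].res.qsets.qs_handle = my_qs_handle;
+ *	res.res[0].res.qsets.vsi_id = peer_obj->pf_vsi_num;
+ *	res.res[0].res.qsets.tc = 0;
+ *	err = peer_obj->ops->alloc_res(peer_obj, &res, 0);
+ *	// on success, res.res[0].res.qsets.teid holds the new node's TEID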
+ */ +static int +ice_peer_alloc_rdma_qsets(struct ice_peer_obj *peer_obj, struct ice_res *res, + int __always_unused partial_acceptable) +{ + u16 max_rdmaqs[ICE_MAX_TRAFFIC_CLASS]; + enum ice_status status; + struct ice_vsi *vsi; + struct device *dev; + struct ice_pf *pf; + int i, ret = 0; + u32 *qset_teid; + u16 *qs_handle; + + if (!ice_validate_peer_obj(peer_obj) || !res) + return -EINVAL; + + pf = pci_get_drvdata(peer_obj->pdev); + dev = ice_pf_to_dev(pf); + + if (!test_bit(ICE_FLAG_IWARP_ENA, pf->flags)) + return -EINVAL; + + if (res->cnt_req > ICE_MAX_TXQ_PER_TXQG) + return -EINVAL; + + qset_teid = kcalloc(res->cnt_req, sizeof(*qset_teid), GFP_KERNEL); + if (!qset_teid) + return -ENOMEM; + + qs_handle = kcalloc(res->cnt_req, sizeof(*qs_handle), GFP_KERNEL); + if (!qs_handle) { + kfree(qset_teid); + return -ENOMEM; + } + + ice_for_each_traffic_class(i) + max_rdmaqs[i] = 0; + + for (i = 0; i < res->cnt_req; i++) { + struct ice_rdma_qset_params *qset; + + qset = &res->res[i].res.qsets; + if (qset->vsi_id != peer_obj->pf_vsi_num) { + dev_err(dev, "RDMA QSet invalid VSI requested\n"); + ret = -EINVAL; + goto out; + } + max_rdmaqs[qset->tc]++; + qs_handle[i] = qset->qs_handle; + } + + vsi = ice_find_vsi(pf, peer_obj->pf_vsi_num); + if (!vsi) { + dev_err(dev, "RDMA QSet invalid VSI\n"); + ret = -EINVAL; + goto out; + } + + status = ice_cfg_vsi_rdma(vsi->port_info, vsi->idx, vsi->tc_cfg.ena_tc, + max_rdmaqs); + if (status) { + dev_err(dev, "Failed VSI RDMA qset config\n"); + ret = -EINVAL; + goto out; + } + + for (i = 0; i < res->cnt_req; i++) { + struct ice_rdma_qset_params *qset; + + qset = &res->res[i].res.qsets; + status = ice_ena_vsi_rdma_qset(vsi->port_info, vsi->idx, + qset->tc, &qs_handle[i], 1, + &qset_teid[i]); + if (status) { + dev_err(dev, "Failed VSI RDMA qset enable\n"); + ret = -EINVAL; + goto out; + } + vsi->qset_handle[qset->tc] = qset->qs_handle; + qset->teid = qset_teid[i]; + } + +out: + kfree(qset_teid); + kfree(qs_handle); + return ret; +} + +/** + * ice_peer_free_rdma_qsets - Free leaf nodes for RDMA Qset + * @peer_obj: peer that requested qsets to be freed + * @res: Resource to be freed + */ +static int +ice_peer_free_rdma_qsets(struct ice_peer_obj *peer_obj, struct ice_res *res) +{ + enum ice_status status; + int count, i, ret = 0; + struct ice_vsi *vsi; + struct device *dev; + struct ice_pf *pf; + u16 vsi_id; + u32 *teid; + u16 *q_id; + + if (!ice_validate_peer_obj(peer_obj) || !res) + return -EINVAL; + + pf = pci_get_drvdata(peer_obj->pdev); + dev = ice_pf_to_dev(pf); + + count = res->res_allocated; + if (count > ICE_MAX_TXQ_PER_TXQG) + return -EINVAL; + + teid = kcalloc(count, sizeof(*teid), GFP_KERNEL); + if (!teid) + return -ENOMEM; + + q_id = kcalloc(count, sizeof(*q_id), GFP_KERNEL); + if (!q_id) { + kfree(teid); + return -ENOMEM; + } + + vsi_id = res->res[0].res.qsets.vsi_id; + vsi = ice_find_vsi(pf, vsi_id); + if (!vsi) { + dev_err(dev, "RDMA Invalid VSI\n"); + ret = -EINVAL; + goto rdma_free_out; + } + + for (i = 0; i < count; i++) { + struct ice_rdma_qset_params *qset; + + qset = &res->res[i].res.qsets; + if (qset->vsi_id != vsi_id) { + dev_err(dev, "RDMA Invalid VSI ID\n"); + ret = -EINVAL; + goto rdma_free_out; + } + q_id[i] = qset->qs_handle; + teid[i] = qset->teid; + + vsi->qset_handle[qset->tc] = 0; + } + + status = ice_dis_vsi_rdma_qset(vsi->port_info, count, teid, q_id); + if (status) + ret = -EINVAL; + +rdma_free_out: + kfree(teid); + kfree(q_id); + + return ret; +} + +/** + * ice_peer_alloc_res - Allocate requested resources for peer objects + * 
@peer_obj: peer that is requesting resources + * @res: Resources to be allocated + * @partial_acceptable: If partial allocation is acceptable to the peer + * + * This function allocates requested resources for the peer object. + */ +static int +ice_peer_alloc_res(struct ice_peer_obj *peer_obj, struct ice_res *res, + int partial_acceptable) +{ + struct ice_pf *pf; + int ret; + + if (!ice_validate_peer_obj(peer_obj) || !res) + return -EINVAL; + + pf = pci_get_drvdata(peer_obj->pdev); + if (!ice_pf_state_is_nominal(pf)) + return -EBUSY; + + switch (res->res_type) { + case ICE_RDMA_QSETS_TXSCHED: + ret = ice_peer_alloc_rdma_qsets(peer_obj, res, + partial_acceptable); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +/** + * ice_peer_free_res - Free given resources + * @peer_obj: peer that is requesting freeing of resources + * @res: Resources to be freed + * + * Free/Release resources allocated to given peer onjects. + */ +static int +ice_peer_free_res(struct ice_peer_obj *peer_obj, struct ice_res *res) +{ + int ret; + + if (!ice_validate_peer_obj(peer_obj) || !res) + return -EINVAL; + + switch (res->res_type) { + case ICE_RDMA_QSETS_TXSCHED: + ret = ice_peer_free_rdma_qsets(peer_obj, res); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +/** + * ice_peer_reg_for_notif - register a peer to receive specific notifications + * @peer_obj: peer that is registering for event notifications + * @events: mask of event types peer is registering for + */ +static void +ice_peer_reg_for_notif(struct ice_peer_obj *peer_obj, struct ice_event *events) +{ + struct ice_peer_obj_int *peer_obj_int; + struct ice_pf *pf; + + if (!ice_validate_peer_obj(peer_obj) || !events) + return; + + peer_obj_int = peer_to_ice_obj_int(peer_obj); + pf = pci_get_drvdata(peer_obj->pdev); + + bitmap_or(peer_obj_int->events, peer_obj_int->events, events->type, + ICE_EVENT_NBITS); + + /* Check to see if any events happened previous to peer registering */ + ice_for_each_peer(pf, peer_obj, ice_check_peer_for_events); + ice_check_peer_drv_for_events(peer_obj); +} + +/** + * ice_peer_unreg_for_notif - unreg a peer from receiving certain notifications + * @peer_obj: peer that is unregistering from event notifications + * @events: mask of event types peer is unregistering for + */ +static void +ice_peer_unreg_for_notif(struct ice_peer_obj *peer_obj, + struct ice_event *events) +{ + struct ice_peer_obj_int *peer_obj_int; + + if (!ice_validate_peer_obj(peer_obj) || !events) + return; + + peer_obj_int = peer_to_ice_obj_int(peer_obj); + + bitmap_andnot(peer_obj_int->events, peer_obj_int->events, events->type, + ICE_EVENT_NBITS); +} + +/** + * ice_peer_check_for_reg - check to see if any peers are reg'd for event + * @peer_obj_int: ptr to peer object internal struct + * @data: ptr to opaque data, to be used for ice_event to report + * + * This function is to be called by ice_for_each_peer to handle an + * event reported by a peer or the ice driver. 
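+ *
+ * A minimal sketch of how the driver side might report an event (new_mtu is
+ * a hypothetical local; leaving event.reporter NULL means the event comes
+ * from the ice driver itself rather than from a peer):
+ *
+ *	struct ice_event event = { 0 };
+ *
+ *	set_bit(ICE_EVENT_MTU_CHANGE, event.type);
+ *	event.info.mtu = new_mtu;
+ *	ice_for_each_peer(pf, &event, ice_peer_check_for_reg);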
+ */ +int ice_peer_check_for_reg(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + struct ice_event *event = (struct ice_event *)data; + DECLARE_BITMAP(comp_events, ICE_EVENT_NBITS); + struct ice_peer_obj *peer_obj; + bool check = true; + + peer_obj = ice_get_peer_obj(peer_obj_int); + + if (!ice_validate_peer_obj(peer_obj) || !data) + /* If invalid obj, in this case return 0 instead of error + * because caller ignores this return value + */ + return 0; + + if (event->reporter) + check = event->reporter->peer_obj_id != peer_obj->peer_obj_id; + + if (bitmap_and(comp_events, event->type, peer_obj_int->events, + ICE_EVENT_NBITS) && + (test_bit(ICE_PEER_OBJ_STATE_OPENED, peer_obj_int->state) || + test_bit(ICE_PEER_OBJ_STATE_PREP_RST, peer_obj_int->state) || + test_bit(ICE_PEER_OBJ_STATE_PREPPED, peer_obj_int->state)) && + check && + peer_obj->peer_ops->event_handler) + peer_obj->peer_ops->event_handler(peer_obj, event); + + return 0; +} + +/** + * ice_peer_report_state_change - accept report of a peer state change + * @peer_obj: peer that is sending notification about state change + * @event: ice_event holding info on what the state change is + * + * We also need to parse the list of peers to see if anyone is registered + * for notifications about this state change event, and if so, notify them. + */ +static void +ice_peer_report_state_change(struct ice_peer_obj *peer_obj, + struct ice_event *event) +{ + struct ice_peer_obj_int *peer_obj_int; + struct ice_peer_drv_int *peer_drv_int; + unsigned int e_type; + int drv_event = 0; + struct ice_pf *pf; + + if (!ice_validate_peer_obj(peer_obj) || !event) + return; + + pf = pci_get_drvdata(peer_obj->pdev); + peer_obj_int = peer_to_ice_obj_int(peer_obj); + peer_drv_int = peer_obj_int->peer_drv_int; + + e_type = find_first_bit(event->type, ICE_EVENT_NBITS); + if (!e_type) + return; + + switch (e_type) { + /* Check for peer_drv events */ + case ICE_EVENT_MBX_CHANGE: + drv_event = 1; + if (event->info.mbx_rdy) + set_bit(ICE_PEER_DRV_STATE_MBX_RDY, + peer_drv_int->state); + else + clear_bit(ICE_PEER_DRV_STATE_MBX_RDY, + peer_drv_int->state); + break; + + /* Check for peer_obj events */ + case ICE_EVENT_API_CHANGE: + if (event->info.api_rdy) { + set_bit(ICE_PEER_OBJ_STATE_API_RDY, + peer_obj_int->state); + } else { + clear_bit(ICE_PEER_OBJ_STATE_API_RDY, + peer_obj_int->state); + } + break; + + default: + return; + } + + /* store the event and state to notify any new peers opening */ + if (drv_event) + memcpy(&peer_drv_int->current_events[e_type], event, + sizeof(*event)); + else + memcpy(&peer_obj_int->current_events[e_type], event, + sizeof(*event)); + + ice_for_each_peer(pf, event, ice_peer_check_for_reg); +} + +/** + * ice_peer_unregister - request to unregister peer + * @peer_obj: peer object + * + * This function triggers close/remove on peer_obj allowing peer + * to unregister. 
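+ *
+ * A minimal sketch of the expected call from a (hypothetical) peer driver's
+ * own teardown path:
+ *
+ *	int err = peer_obj->ops->peer_unregister(peer_obj);
+ *
+ *	if (err == -EBUSY)
+ *		return err;	// a PF reset is in progress, caller may retry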
+ */ +static int ice_peer_unregister(struct ice_peer_obj *peer_obj) +{ + enum ice_close_reason reason = ICE_REASON_PEER_DRV_UNREG; + struct ice_peer_obj_int *peer_obj_int; + struct ice_pf *pf; + int ret; + + if (!ice_validate_peer_obj(peer_obj)) + return -EINVAL; + + pf = pci_get_drvdata(peer_obj->pdev); + if (ice_is_reset_in_progress(pf->state)) + return -EBUSY; + + peer_obj_int = peer_to_ice_obj_int(peer_obj); + + ret = ice_peer_close(peer_obj_int, &reason); + if (ret) + return ret; + + switch (peer_obj->peer_obj_id) { + case ICE_PEER_RDMA_ID: + pf->rdma_peer = NULL; + break; + default: + break; + } + + peer_obj->peer_ops = NULL; + + ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_REMOVED, false); + return 0; +} + +/** + * ice_peer_register - Called by peer to open communication with LAN + * @peer_obj: ptr to peer object + * + * registering peer is expected to populate the ice_peerdrv->name field + * before calling this function. + */ +static int ice_peer_register(struct ice_peer_obj *peer_obj) +{ + struct ice_peer_drv_int *peer_drv_int; + struct ice_peer_obj_int *peer_obj_int; + struct ice_peer_drv *peer_drv; + + + if (!peer_obj) { + pr_err("Failed to reg peer_obj: peer_obj ptr NULL\n"); + return -EINVAL; + } + + if (!peer_obj->pdev) { + pr_err("Failed to reg peer_obj: peer_obj pdev NULL\n"); + return -EINVAL; + } + + if (!peer_obj->peer_ops || !peer_obj->ops) { + pr_err("Failed to reg peer_obj: peer_obj peer_ops/ops NULL\n"); + return -EINVAL; + } + + peer_drv = peer_obj->peer_drv; + if (!peer_drv) { + pr_err("Failed to reg peer_obj: peer drv NULL\n"); + return -EINVAL; + } + + + if (peer_drv->ver.major != ICE_PEER_MAJOR_VER || + peer_drv->ver.minor != ICE_PEER_MINOR_VER) { + pr_err("failed to register due to version mismatch:\n"); + pr_err("expected major ver %d, caller specified major ver %d\n", + ICE_PEER_MAJOR_VER, peer_drv->ver.major); + pr_err("expected minor ver %d, caller specified minor ver %d\n", + ICE_PEER_MINOR_VER, peer_drv->ver.minor); + return -EINVAL; + } + + peer_obj_int = peer_to_ice_obj_int(peer_obj); + peer_drv_int = peer_obj_int->peer_drv_int; + if (!peer_drv_int) { + pr_err("Failed to match peer_drv_int to peer_obj\n"); + return -EINVAL; + } + + peer_drv_int->peer_drv = peer_drv; + + ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_PROBED, false); + + return 0; +} + + +/** + * ice_peer_request_reset - accept request from peer to perform a reset + * @peer_obj: peer object that is requesting a reset + * @reset_type: type of reset the peer is requesting + */ +static int +ice_peer_request_reset(struct ice_peer_obj *peer_obj, enum ice_peer_reset_type reset_type) +{ + enum ice_reset_req reset; + struct ice_pf *pf; + + if (!ice_validate_peer_obj(peer_obj)) + return -EINVAL; + + pf = pci_get_drvdata(peer_obj->pdev); + + switch (reset_type) { + case ICE_PEER_PFR: + reset = ICE_RESET_PFR; + break; + case ICE_PEER_CORER: + reset = ICE_RESET_CORER; + break; + case ICE_PEER_GLOBR: + reset = ICE_RESET_GLOBR; + break; + default: + dev_err(ice_pf_to_dev(pf), "incorrect reset request from peer\n"); + return -EINVAL; + } + + return ice_schedule_reset(pf, reset); +} + +/** + * ice_peer_is_vsi_ready - query if VSI in nominal state + * @peer_obj: pointer to ice_peer_obj struct + */ +static int ice_peer_is_vsi_ready(struct ice_peer_obj *peer_obj) +{ + struct ice_netdev_priv *np; + struct ice_vsi *vsi; + + /* If the peer_obj or associated values are not valid, then return + * 0 as there is no ready port associated with the values passed in + * as parameters. 
+ */ + + if (!peer_obj || !peer_obj->pdev || !pci_get_drvdata(peer_obj->pdev) || + !peer_to_ice_obj_int(peer_obj)) + return 0; + + if (!peer_obj->netdev) + return 0; + + np = netdev_priv(peer_obj->netdev); + vsi = np->vsi; + + return ice_is_vsi_state_nominal(vsi); +} + +/** + * ice_peer_update_vsi_filter - update main VSI filters for RDMA + * @peer_obj: pointer to RDMA peer object + * @filter: selection of filters to enable or disable + * @enable: bool whether to enable or disable filters + */ +static int +ice_peer_update_vsi_filter(struct ice_peer_obj *peer_obj, + enum ice_rdma_filter __maybe_unused filter, + bool enable) +{ + struct ice_vsi *vsi; + struct ice_pf *pf; + int ret; + + if (!ice_validate_peer_obj(peer_obj)) + return -EINVAL; + + pf = pci_get_drvdata(peer_obj->pdev); + + vsi = ice_get_main_vsi(pf); + if (!vsi) + return -EINVAL; + + ret = ice_cfg_iwarp_fltr(&pf->hw, vsi->idx, enable); + + if (ret) { + dev_err(ice_pf_to_dev(pf), "Failed to %sable iWARP filtering\n", + enable ? "en" : "dis"); + } else { + if (enable) + vsi->info.q_opt_flags |= ICE_AQ_VSI_Q_OPT_PE_FLTR_EN; + else + vsi->info.q_opt_flags &= ~ICE_AQ_VSI_Q_OPT_PE_FLTR_EN; + } + + return ret; +} + +/** + * ice_peer_vc_send - send a virt channel message from a peer + * @peer_obj: pointer to a peer object + * @vf_id: the absolute VF ID of recipient of message + * @msg: pointer to message contents + * @len: len of message + */ +static int +ice_peer_vc_send(struct ice_peer_obj *peer_obj, u32 vf_id, u8 *msg, u16 len) +{ + enum ice_status status; + struct ice_pf *pf; + + if (!ice_validate_peer_obj(peer_obj)) + return -EINVAL; + if (!msg || !len) + return -ENOMEM; + + pf = pci_get_drvdata(peer_obj->pdev); + if (len > ICE_AQ_MAX_BUF_LEN) + return -EINVAL; + + if (ice_is_reset_in_progress(pf->state)) + return -EBUSY; + + switch (peer_obj->peer_drv->driver_id) { + case ICE_PEER_RDMA_DRIVER: + if (vf_id >= pf->num_alloc_vfs) + return -ENODEV; + + /* VIRTCHNL_OP_RDMA is being used for RoCEv2 msg also */ + status = ice_aq_send_msg_to_vf(&pf->hw, vf_id, VIRTCHNL_OP_RDMA, + 0, msg, len, NULL); + break; + default: + dev_err(ice_pf_to_dev(pf), + "Peer driver (%u) not supported!", (u32)peer_obj->peer_drv->driver_id); + return -ENODEV; + } + + if (status) + dev_err(ice_pf_to_dev(pf), "Unable to send msg to VF, error %s\n", + ice_stat_str(status)); + return ice_status_to_errno(status); +} + +/** + * ice_reserve_peer_qvector - Reserve vector resources for peer drivers + * @pf: board private structure to initialize + */ +static int ice_reserve_peer_qvector(struct ice_pf *pf) +{ + if (test_bit(ICE_FLAG_IWARP_ENA, pf->flags)) { + int index; + + index = ice_get_res(pf, pf->irq_tracker, pf->num_rdma_msix, ICE_RES_RDMA_VEC_ID); + if (index < 0) + return index; + pf->num_avail_sw_msix -= pf->num_rdma_msix; + pf->rdma_base_vector = (u16)index; + } + return 0; +} + +/** + * ice_peer_close_task - call peer's close asynchronously + * @work: pointer to work_struct contained by the peer_obj_int struct + * + * This method (asynchronous) of calling a peer's close function is + * meant to be used in the reset path. 
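+ *
+ * A minimal sketch of how the reset flow is expected to trigger this work
+ * item, via ice_close_peer_for_reset() (the exact call site in the reset
+ * path may differ):
+ *
+ *	enum ice_reset_req reset = ICE_RESET_CORER;
+ *
+ *	ice_for_each_peer(pf, &reset, ice_close_peer_for_reset);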
+ */ +static void ice_peer_close_task(struct work_struct *work) +{ + struct ice_peer_obj_int *peer_obj_int; + struct ice_peer_obj *peer_obj; + + peer_obj_int = container_of(work, struct ice_peer_obj_int, peer_close_task); + + peer_obj = ice_get_peer_obj(peer_obj_int); + if (!peer_obj || !peer_obj->peer_ops) + return; + + /* If this peer_obj is going to close, we do not want any state changes + * to happen until after we successfully finish or abort the close. + * Grab the peer_obj_state_mutex to protect this flow + */ + mutex_lock(&peer_obj_int->peer_obj_state_mutex); + + /* Only allow a close to go to the peer if they are in a state + * to accept it. The last state of PREP_RST is a special case + * that will not normally happen, but it is up to the peer + * to handle it correctly. + */ + if (test_bit(ICE_PEER_OBJ_STATE_OPENED, peer_obj_int->state) || + test_bit(ICE_PEER_OBJ_STATE_PREPPED, peer_obj_int->state) || + test_bit(ICE_PEER_OBJ_STATE_PREP_RST, peer_obj_int->state)) { + + ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_CLOSING, true); + + if (peer_obj->peer_ops->close) + peer_obj->peer_ops->close(peer_obj, peer_obj_int->rst_type); + + ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_CLOSED, true); + } + + mutex_unlock(&peer_obj_int->peer_obj_state_mutex); +} + +/** + * ice_peer_update_vsi - update the pf_vsi info in peer_obj struct + * @peer_obj_int: pointer to peer_obj internal struct + * @data: opaque pointer - VSI to be updated + */ +int ice_peer_update_vsi(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + struct ice_vsi *vsi = (struct ice_vsi *)data; + struct ice_peer_obj *peer_obj; + + peer_obj = ice_get_peer_obj(peer_obj_int); + if (!peer_obj) + return 0; + + peer_obj->pf_vsi_num = vsi->vsi_num; + return 0; +} + +/* Initialize the ice_ops struct, which is used in 'ice_init_peer_devices' */ +static const struct ice_ops ops = { + .alloc_res = ice_peer_alloc_res, + .free_res = ice_peer_free_res, + .is_vsi_ready = ice_peer_is_vsi_ready, + .reg_for_notification = ice_peer_reg_for_notif, + .unreg_for_notification = ice_peer_unreg_for_notif, + .notify_state_change = ice_peer_report_state_change, + .request_reset = ice_peer_request_reset, + .peer_register = ice_peer_register, + .peer_unregister = ice_peer_unregister, + .update_vsi_filter = ice_peer_update_vsi_filter, + .vc_send = ice_peer_vc_send, + +}; + +/** + * ice_init_peer_devices - initializes peer objects and aux devices + * @pf: ptr to ice_pf + * + * This function initializes peer objects and auxiliary device, then + * associates them with specified pci_dev as their parent. 
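+ *
+ * A minimal probe-time sketch, assuming err and dev are the probe
+ * function's locals and that the caller unwinds with ice_unroll_peer() on
+ * failure (the exact error handling in the probe path may differ):
+ *
+ *	err = ice_init_peer_devices(pf);
+ *	if (err) {
+ *		dev_err(dev, "unable to initialize peer objects: %d\n", err);
+ *		ice_for_each_peer(pf, NULL, ice_unroll_peer);
+ *	}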
+ */ +int ice_init_peer_devices(struct ice_pf *pf) +{ + struct ice_port_info *port_info = pf->hw.port_info; + struct ice_vsi *vsi = pf->vsi[0]; + struct pci_dev *pdev = pf->pdev; + struct device *dev = &pdev->dev; + int status = 0; + unsigned int i; + + /* Reserve vector resources */ + status = ice_reserve_peer_qvector(pf); + if (status < 0) { + dev_err(dev, "failed to reserve vectors for peer drivers\n"); + return status; + } + for (i = 0; i < ARRAY_SIZE(ice_mfd_cells); i++) { + struct ice_peer_obj_platform_data *platform_data; + struct ice_peer_obj_int *peer_obj_int; + struct ice_peer_drv_int *peer_drv_int; + struct msix_entry *entry = NULL; + struct ice_qos_params *qos_info; + struct ice_peer_obj *peer_obj; + int j; + + peer_obj_int = devm_kzalloc(dev, sizeof(*peer_obj_int), + GFP_KERNEL); + if (!peer_obj_int) + return -ENOMEM; + pf->peers[i] = peer_obj_int; + + peer_drv_int = devm_kzalloc(dev, sizeof(*peer_drv_int), + GFP_KERNEL); + if (!peer_drv_int) + return -ENOMEM; + + peer_obj_int->peer_drv_int = peer_drv_int; + + /* Initialize driver values */ + for (j = 0; j < ICE_EVENT_NBITS; j++) + bitmap_zero(peer_drv_int->current_events[j].type, + ICE_EVENT_NBITS); + + mutex_init(&peer_obj_int->peer_obj_state_mutex); + + peer_obj = ice_get_peer_obj(peer_obj_int); + peer_obj_int->plat_data.peer_obj = peer_obj; + platform_data = &peer_obj_int->plat_data; + peer_obj->peer_ops = NULL; + peer_obj->hw_addr = (u8 __iomem *)pf->hw.hw_addr; + peer_obj->ver.major = ICE_PEER_MAJOR_VER; + peer_obj->ver.minor = ICE_PEER_MINOR_VER; + peer_obj->ver.support = ICE_IDC_FEATURES; + peer_obj->peer_obj_id = ice_mfd_cells[i].id; + peer_obj->pf_vsi_num = vsi->vsi_num; + peer_obj->netdev = vsi->netdev; + peer_obj->initial_mtu = vsi->netdev->mtu; + ether_addr_copy(peer_obj->lan_addr, port_info->mac.lan_addr); + + ice_mfd_cells[i].platform_data = platform_data; + ice_mfd_cells[i].pdata_size = sizeof(*platform_data); + + peer_obj_int->ice_peer_wq = + alloc_ordered_workqueue("ice_peer_wq_%d", WQ_UNBOUND, + i); + if (!peer_obj_int->ice_peer_wq) + return -ENOMEM; + INIT_WORK(&peer_obj_int->peer_close_task, ice_peer_close_task); + + peer_obj->pdev = pdev; + peer_obj->ari_ena = pci_ari_enabled(pdev->bus); + peer_obj->bus_num = PCI_BUS_NUM(pdev->devfn); + if (!peer_obj->ari_ena) { + peer_obj->dev_num = PCI_SLOT(pdev->devfn); + peer_obj->fn_num = PCI_FUNC(pdev->devfn); + } else { + peer_obj->dev_num = 0; + peer_obj->fn_num = pdev->devfn & 0xff; + } + + qos_info = &peer_obj->initial_qos_info; + + /* setup qos_info fields with defaults */ + qos_info->num_apps = 0; + qos_info->num_tc = 1; + + for (j = 0; j < ICE_IDC_MAX_USER_PRIORITY; j++) + qos_info->up2tc[j] = 0; + + qos_info->tc_info[0].rel_bw = 100; + for (j = 1; j < IEEE_8021QAZ_MAX_TCS; j++) + qos_info->tc_info[j].rel_bw = 0; + + /* for DCB, override the qos_info defaults. 
*/ + ice_setup_dcb_qos_info(pf, qos_info); + /* Initialize ice_ops */ + peer_obj->ops = &ops; + + /* make sure peer specific resources such as msix_count and + * msix_entries are initialized + */ + switch (ice_mfd_cells[i].id) { + case ICE_PEER_RDMA_ID: + if (test_bit(ICE_FLAG_IWARP_ENA, pf->flags)) { + peer_obj->msix_count = pf->num_rdma_msix; + entry = &pf->msix_entries[pf->rdma_base_vector]; + } + pf->rdma_peer = peer_obj; + break; + default: + break; + } + + peer_obj->msix_entries = entry; + ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_INIT, + false); + } + + status = ida_simple_get(&ice_peer_index_ida, 0, 0, GFP_KERNEL); + if (status < 0) { + dev_err(&pdev->dev, "failed to get unique index for device\n"); + return status; + } + + pf->peer_idx = status; + + status = mfd_add_devices(dev, pf->peer_idx, ice_mfd_cells, + ARRAY_SIZE(ice_mfd_cells), NULL, 0, NULL); + if (status) { + dev_err(dev, "Failure adding MFD devs for peers: %d\n", status); + return status; + } + + for (i = 0; i < ARRAY_SIZE(ice_mfd_cells); i++) { + snprintf(pf->peers[i]->plat_name, ICE_MAX_PEER_NAME, "%s.%d", + ice_mfd_cells[i].name, + pf->peer_idx + ice_mfd_cells[i].id); + dev = bus_find_device_by_name(&platform_bus_type, NULL, + pf->peers[i]->plat_name); + if (dev) { + dev_dbg(dev, "Peer Created: %s %d\n", + pf->peers[i]->plat_name, pf->peer_idx); + put_device(dev); + } + } + + return status; +} diff --git a/drivers/net/ethernet/intel/ice/ice_idc.h b/drivers/net/ethernet/intel/ice/ice_idc.h new file mode 100644 index 000000000000..d2b74e0c539c --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_idc.h @@ -0,0 +1,422 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_IDC_H_ +#define _ICE_IDC_H_ + +#include +#include +#include +#include +#include + +#include + +/* This major and minor version represent IDC API version information. + * During peer driver registration, peer driver specifies major and minor + * version information (via. peer_driver:ver_info). It gets checked against + * following defines and if mismatch, then peer driver registration + * fails and appropriate message gets logged. 
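+ *
+ * A minimal registration sketch for a hypothetical peer driver (my_peer_drv,
+ * my_peer_ops and err are peer-side names, not part of this API); the
+ * version fields below must match or peer_register() fails with -EINVAL:
+ *
+ *	my_peer_drv.ver.major = ICE_PEER_MAJOR_VER;
+ *	my_peer_drv.ver.minor = ICE_PEER_MINOR_VER;
+ *	my_peer_drv.driver_id = ICE_PEER_RDMA_DRIVER;
+ *	peer_obj->peer_drv = &my_peer_drv;
+ *	peer_obj->peer_ops = &my_peer_ops;	// open/close/event_handler
+ *	err = peer_obj->ops->peer_register(peer_obj);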
+ */ +#define ICE_PEER_MAJOR_VER 7 +#define ICE_PEER_MINOR_VER 1 + +enum ice_peer_features { + ICE_PEER_FEATURE_ADK_SUPPORT, + ICE_PEER_FEATURE_PTP_SUPPORT, + ICE_PEER_FEATURE_SRIOV_SUPPORT, + ICE_PEER_FEATURE_PCIIOV_SUPPORT, + ICE_PEER_FEATURE_NBITS +}; + +#define ICE_ADK_SUP 0 + +#define ICE_PTP_SUP BIT(ICE_PEER_FEATURE_PTP_SUPPORT) + +#define ICE_SRIOV_SUP BIT(ICE_PEER_FEATURE_SRIOV_SUPPORT) + +#ifdef CONFIG_PCI_IOV +#define ICE_PCIIOV_SUP BIT(ICE_PEER_FEATURE_PCIIOV_SUPPORT) +#else +#define ICE_PCIIOV_SUP 0 +#endif /* CONFIG_PCI_IOV */ + +#define ICE_IDC_FEATURES (ICE_ADK_SUP | ICE_PTP_SUP | ICE_SRIOV_SUP |\ + ICE_PCIIOV_SUP) + +enum ice_event_type { + ICE_EVENT_LINK_CHANGE = 0x0, + ICE_EVENT_MTU_CHANGE, + ICE_EVENT_TC_CHANGE, + ICE_EVENT_API_CHANGE, + ICE_EVENT_MBX_CHANGE, + ICE_EVENT_CRIT_ERR, + ICE_EVENT_NBITS /* must be last */ +}; + +enum ice_res_type { + ICE_INVAL_RES = 0x0, + ICE_VSI, + ICE_VEB, + ICE_EVENT_Q, + ICE_EGRESS_CMPL_Q, + ICE_CMPL_EVENT_Q, + ICE_ASYNC_EVENT_Q, + ICE_DOORBELL_Q, + ICE_RDMA_QSETS_TXSCHED, +}; + +enum ice_peer_reset_type { + ICE_PEER_PFR = 0, + ICE_PEER_CORER, + ICE_PEER_CORER_SW_CORE, + ICE_PEER_CORER_SW_FULL, + ICE_PEER_GLOBR, +}; + +/* reason notified to peer driver as part of event handling */ +enum ice_close_reason { + ICE_REASON_INVAL = 0x0, + ICE_REASON_HW_UNRESPONSIVE, + ICE_REASON_INTERFACE_DOWN, /* Administrative down */ + ICE_REASON_PEER_DRV_UNREG, /* peer driver getting unregistered */ + ICE_REASON_PEER_OBJ_UNINIT, + ICE_REASON_GLOBR_REQ, + ICE_REASON_CORER_REQ, + ICE_REASON_EMPR_REQ, + ICE_REASON_PFR_REQ, + ICE_REASON_HW_RESET_PENDING, + ICE_REASON_RECOVERY_MODE, + ICE_REASON_PARAM_CHANGE, +}; + +enum ice_rdma_filter { + ICE_RDMA_FILTER_INVAL = 0x0, + ICE_RDMA_FILTER_IWARP, + ICE_RDMA_FILTER_ROCEV2, + ICE_RDMA_FILTER_BOTH, +}; + +/* This information is needed to handle peer driver registration, + * instead of adding more params to peer_drv_registration function, + * let's get it thru' peer_drv object. + */ +struct ice_ver_info { + u16 major; + u16 minor; + u64 support; +}; + +/* Struct to hold per DCB APP info */ +struct ice_dcb_app_info { + u8 priority; + u8 selector; + u16 prot_id; +}; + +struct ice_peer_obj; +struct ice_peer_obj_int; + +#define ICE_IDC_MAX_USER_PRIORITY 8 +#define ICE_IDC_MAX_APPS 64 +#define ICE_IDC_DSCP_NUM_VAL 64 + +/* Source timer mode */ +enum ice_src_tmr_mode { + ICE_SRC_TMR_MODE_NANOSECONDS, + ICE_SRC_TMR_MODE_LOCKED, + + NUM_ICE_SRC_TMR_MODE +}; + + + +/* Struct to hold per RDMA Qset info */ +struct ice_rdma_qset_params { + u32 teid; /* qset TEID */ + u16 qs_handle; /* RDMA driver provides this */ + u16 vsi_id; /* VSI index */ + u8 tc; /* TC branch the QSet should belong to */ + u8 reserved[3]; +}; + +struct ice_res_base { + /* Union for future provision e.g. other res_type */ + union { + struct ice_rdma_qset_params qsets; + } res; +}; + +struct ice_res { + /* Type of resource. Filled by peer driver */ + enum ice_res_type res_type; + /* Count requested by peer driver */ + u16 cnt_req; + + + /* Number of resources allocated. Filled in by callee. + * Based on this value, caller to fill up "resources" + */ + u16 res_allocated; + + /* Unique handle to resources allocated. Zero if call fails. + * Allocated by callee and for now used by caller for internal + * tracking purpose. + */ + u32 res_handle; + + /* Peer driver has to allocate sufficient memory, to accommodate + * cnt_requested before calling this function. + * Memory has to be zero initialized. It is input/output param. 
+ * As a result of alloc_res API, this structures will be populated. + */ + struct ice_res_base res[1]; +}; + +struct ice_qos_info { + u64 tc_ctx; + u8 rel_bw; + u8 prio_type; + u8 egress_virt_up; + u8 ingress_virt_up; +}; + +#define IDC_QOS_MODE_VLAN 0x0 +#define IDC_QOS_MODE_DSCP 0x1 + +/* Struct to hold QoS info */ +struct ice_qos_params { + struct ice_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; + u8 up2tc[ICE_IDC_MAX_USER_PRIORITY]; + u8 vsi_relative_bw; + u8 vsi_priority_type; + u32 num_apps; + u8 pfc_mode; + u8 dscp_map[ICE_IDC_DSCP_NUM_VAL]; + struct ice_dcb_app_info apps[ICE_IDC_MAX_APPS]; + u8 num_tc; +}; + +union ice_event_info { + /* ICE_EVENT_LINK_CHANGE */ + struct { + struct net_device *lwr_nd; + u16 vsi_num; /* HW index of VSI corresponding to lwr ndev */ + u8 new_link_state; + u8 lport; + } link_info; + /* ICE_EVENT_MTU_CHANGE */ + u16 mtu; + /* ICE_EVENT_TC_CHANGE */ + struct ice_qos_params port_qos; + /* ICE_EVENT_API_CHANGE */ + u8 api_rdy; + /* ICE_EVENT_MBX_CHANGE */ + u8 mbx_rdy; + /* ICE_EVENT_CRIT_ERR */ + u32 reg; +}; + +/* ice_event elements are to be passed back and forth between the ice driver + * and the peer drivers. They are to be used to both register/unregister + * for event reporting and to report an event (events can be either ice + * generated or peer generated). + * + * For (un)registering for events, the structure needs to be populated with: + * reporter - pointer to the ice_peer_obj struct of the peer (un)registering + * type - bitmap with bits set for event types to (un)register for + * + * For reporting events, the structure needs to be populated with: + * reporter - pointer to peer that generated the event (NULL for ice) + * type - bitmap with single bit set for this event type + * info - union containing data relevant to this event type + */ +struct ice_event { + struct ice_peer_obj *reporter; + DECLARE_BITMAP(type, ICE_EVENT_NBITS); + union ice_event_info info; +}; + +/* Following APIs are implemented by ICE driver and invoked by peer drivers */ +struct ice_ops { + /* APIs to allocate resources such as VEB, VSI, Doorbell queues, + * completion queues, Tx/Rx queues, etc... + */ + int (*alloc_res)(struct ice_peer_obj *peer_obj, + struct ice_res *res, + int partial_acceptable); + int (*free_res)(struct ice_peer_obj *peer_obj, + struct ice_res *res); + + int (*is_vsi_ready)(struct ice_peer_obj *peer_obj); + int (*peer_register)(struct ice_peer_obj *peer_obj); + int (*peer_unregister)(struct ice_peer_obj *peer_obj); + int (*request_reset)(struct ice_peer_obj *obj, + enum ice_peer_reset_type reset_type); + + void (*notify_state_change)(struct ice_peer_obj *obj, + struct ice_event *event); + + /* Notification APIs */ + void (*reg_for_notification)(struct ice_peer_obj *obj, + struct ice_event *event); + void (*unreg_for_notification)(struct ice_peer_obj *obj, + struct ice_event *event); + int (*update_vsi_filter)(struct ice_peer_obj *peer_obj, + enum ice_rdma_filter filter, bool enable); + int (*vc_send)(struct ice_peer_obj *peer_obj, u32 vf_id, u8 *msg, + u16 len); +}; + +/* Following APIs are implemented by peer drivers and invoked by ICE driver */ +struct ice_peer_ops { + void (*event_handler)(struct ice_peer_obj *peer_obj, + struct ice_event *event); + + /* Why we have 'open' and when it is expected to be called: + * 1. symmetric set of API w.r.t close + * 2. To be invoked form driver initialization path + * - call peer_driver:open once ice driver is fully initialized + * 3. 
To be invoked upon RESET complete + * + * Calls to open are performed from ice_finish_init_peer_obj + * which is invoked from the service task. This helps keep objects + * from having their open called until the ice driver is ready and + * has scheduled its service task. + */ + int (*open)(struct ice_peer_obj *peer_obj); + + /* Peer's close function is to be called when the peer needs to be + * quiesced. This can be for a variety of reasons (enumerated in the + * ice_close_reason enum struct). A call to close will only be + * followed by a call to either remove or open. No IDC calls from the + * peer should be accepted until it is re-opened. + * + * The *reason* parameter is the reason for the call to close. This + * can be for any reason enumerated in the ice_close_reason struct. + * It's primary reason is for the peer's bookkeeping and in case the + * peer want to perform any different tasks dictated by the reason. + */ + void (*close)(struct ice_peer_obj *peer_obj, + enum ice_close_reason reason); + + int (*vc_receive)(struct ice_peer_obj *peer_obj, u32 vf_id, u8 *msg, + u16 len); + /* tell RDMA peer to prepare for TC change in a blocking call + * that will directly precede the change event + */ + void (*prep_tc_change)(struct ice_peer_obj *peer_obj); +}; + +#define ICE_PEER_RDMA_NAME "ice_rdma" +#define ICE_PEER_RDMA_ID 0x00000010 +#define ICE_MAX_NUM_PEERS 4 + +/* The const struct that instantiates peer_obj_id needs to be initialized + * in the .c with the macro ASSIGN_PEER_INFO. + * For example: + * static const struct peer_obj_id peer_obj_ids[] = ASSIGN_PEER_INFO; + */ +struct peer_obj_id { + char *name; + int id; +}; + +#define IDC_RDMA_INFO { .name = ICE_PEER_RDMA_NAME, .id = ICE_PEER_RDMA_ID }, +#define IDC_AE_INFO +#define IDC_IPSEC_INFO +#define IDC_SWITCH_INFO +#define IDC_ADK_INFO +/* this is a list of all possible peers, some are unused but left for clarity */ +#define ASSIGN_PEER_INFO \ +{ \ + IDC_RDMA_INFO \ + IDC_AE_INFO \ + IDC_IPSEC_INFO \ + IDC_SWITCH_INFO \ + IDC_ADK_INFO \ +} + +#define ice_peer_priv(x) ((x)->peer_priv) + +/* structure representing peer_object */ +struct ice_peer_obj { + struct ice_ver_info ver; + struct pci_dev *pdev; /* PCI device of corresponding to main function */ + /* KVA / Linear address corresponding to BAR0 of underlying + * pci_device. + */ + u8 __iomem *hw_addr; + int peer_obj_id; + + int index; + + /* Opaque pointer for peer specific data tracking. This memory will + * be alloc'd and freed by the peer driver and used for private data + * accessible only to the specific peer. It is stored here so that + * when this struct is passed to the peer via an IDC call, the data + * can be accessed by the peer at that time. + * The peers should only retrieve the pointer by the macro: + * ice_peer_priv(struct ice_peer_obj *) + */ + void *peer_priv; + + + u8 ftype; /* PF(false) or VF (true) */ + + /* Data VSI created by driver */ + u16 pf_vsi_num; + + u8 lan_addr[ETH_ALEN]; /* default MAC address of main netdev */ + u16 initial_mtu; /* Initial MTU of main netdev */ + struct ice_qos_params initial_qos_info; + struct net_device *netdev; + /* PCI info */ + u8 ari_ena; + u16 bus_num; + u16 dev_num; + u16 fn_num; + + /* Based on peer driver type, this shall point to corresponding MSIx + * entries in pf->msix_entries (which were allocated as part of driver + * initialization) e.g. for RDMA driver, msix_entries reserved will be + * num_online_cpus + 1. 
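+	 *
+	 * A peer could, for example, request its i-th vector (with a
+	 * hypothetical handler my_isr and cookie my_data) via:
+	 *	request_irq(peer_obj->msix_entries[i].vector, my_isr, 0,
+	 *		    "my_peer", my_data);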
+ */ + u16 msix_count; /* How many vectors are reserved for this device */ + struct msix_entry *msix_entries; + + /* Following struct contains function pointers to be initialized + * by ICE driver and called by peer driver + */ + const struct ice_ops *ops; + + /* Following struct contains function pointers to be initialized + * by peer driver and called by ICE driver + */ + const struct ice_peer_ops *peer_ops; + + /* Pointer to peer_drv struct to be populated by peer driver */ + struct ice_peer_drv *peer_drv; +}; + +struct ice_peer_obj_platform_data { + struct ice_peer_obj *peer_obj; +}; + +/* structure representing peer driver + * Peer driver to initialize those function ptrs and + * it will be invoked by ICE as part of driver_registration + * via bus infrastructure + */ +struct ice_peer_drv { + u16 driver_id; +#define ICE_PEER_LAN_DRIVER 0 +#define ICE_PEER_RDMA_DRIVER 4 +#define ICE_PEER_ADK_DRIVER 5 + + struct ice_ver_info ver; + const char *name; + +}; + +#endif /* _ICE_IDC_H_*/ diff --git a/drivers/net/ethernet/intel/ice/ice_idc_int.h b/drivers/net/ethernet/intel/ice/ice_idc_int.h new file mode 100644 index 000000000000..6939681f1498 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_idc_int.h @@ -0,0 +1,170 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_IDC_INT_H_ +#define _ICE_IDC_INT_H_ + +#include "ice.h" +#include "ice_idc.h" + + +enum ice_peer_obj_state { + ICE_PEER_OBJ_STATE_INIT, + ICE_PEER_OBJ_STATE_PROBED, + ICE_PEER_OBJ_STATE_OPENING, + ICE_PEER_OBJ_STATE_OPENED, + ICE_PEER_OBJ_STATE_PREP_RST, + ICE_PEER_OBJ_STATE_PREPPED, + ICE_PEER_OBJ_STATE_CLOSING, + ICE_PEER_OBJ_STATE_CLOSED, + ICE_PEER_OBJ_STATE_REMOVED, + ICE_PEER_OBJ_STATE_API_RDY, + ICE_PEER_OBJ_STATE_NBITS, /* must be last */ +}; + +enum ice_peer_drv_state { + ICE_PEER_DRV_STATE_MBX_RDY, + ICE_PEER_DRV_STATE_NBITS, /* must be last */ +}; + +struct ice_peer_drv_int { + struct ice_peer_drv *peer_drv; + + /* States associated with peer driver */ + DECLARE_BITMAP(state, ICE_PEER_DRV_STATE_NBITS); + + /* if this peer_obj is the originator of an event, these are the + * most recent events of each type + */ + struct ice_event current_events[ICE_EVENT_NBITS]; +}; + +#define ICE_MAX_PEER_NAME 64 + +struct ice_peer_obj_int { + struct ice_peer_obj peer_obj; + struct ice_peer_drv_int *peer_drv_int; /* driver private structure */ + char plat_name[ICE_MAX_PEER_NAME]; + struct ice_peer_obj_platform_data plat_data; + + /* if this peer_obj is the originator of an event, these are the + * most recent events of each type + */ + struct ice_event current_events[ICE_EVENT_NBITS]; + /* Events a peer has registered to be notified about */ + DECLARE_BITMAP(events, ICE_EVENT_NBITS); + + /* States associated with peer_obj */ + DECLARE_BITMAP(state, ICE_PEER_OBJ_STATE_NBITS); + struct mutex peer_obj_state_mutex; /* peer_obj state mutex */ + + /* per peer workqueue */ + struct workqueue_struct *ice_peer_wq; + + struct work_struct peer_prep_task; + struct work_struct peer_close_task; + + enum ice_close_reason rst_type; +}; + +static inline struct +ice_peer_obj_int *peer_to_ice_obj_int(struct ice_peer_obj *peer_obj) +{ + return peer_obj ? 
container_of(peer_obj, struct ice_peer_obj_int, + peer_obj) : NULL; +} + +static inline struct +ice_peer_obj *ice_get_peer_obj(struct ice_peer_obj_int *peer_obj_int) +{ + if (peer_obj_int) + return &peer_obj_int->peer_obj; + else + return NULL; +} + +#if IS_ENABLED(CONFIG_MFD_CORE) +int ice_peer_update_vsi(struct ice_peer_obj_int *peer_obj_int, void *data); +int ice_close_peer_for_reset(struct ice_peer_obj_int *peer_obj_int, void *data); +int ice_unroll_peer(struct ice_peer_obj_int *peer_obj_int, void *data); +int ice_unreg_peer_obj(struct ice_peer_obj_int *peer_obj_int, void *data); +int ice_peer_close(struct ice_peer_obj_int *peer_obj_int, void *data); +int ice_peer_check_for_reg(struct ice_peer_obj_int *peer_obj_int, void *data); +int +ice_finish_init_peer_obj(struct ice_peer_obj_int *peer_obj_int, void *data); +static inline bool ice_validate_peer_obj(struct ice_peer_obj *peer_obj) +{ + struct ice_peer_obj_int *peer_obj_int; + struct ice_pf *pf; + + if (!peer_obj || !peer_obj->pdev) + return false; + + if (!peer_obj->peer_ops) + return false; + + pf = pci_get_drvdata(peer_obj->pdev); + if (!pf) + return false; + + peer_obj_int = peer_to_ice_obj_int(peer_obj); + if (!peer_obj_int) + return false; + + if (test_bit(ICE_PEER_OBJ_STATE_REMOVED, peer_obj_int->state) || + test_bit(ICE_PEER_OBJ_STATE_INIT, peer_obj_int->state)) + return false; + + return true; +} +#else /* !CONFIG_MFD_CORE */ +static inline int +ice_peer_update_vsi(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + return 0; +} + +static inline int +ice_close_peer_for_reset(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + return 0; +} + +static inline int +ice_unroll_peer(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + return 0; +} + +static inline int +ice_unreg_peer_obj(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + return 0; +} + +static inline int +ice_peer_close(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + return 0; +} + +static inline int +ice_peer_check_for_reg(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + return 0; +} + +static inline int +ice_finish_init_peer_obj(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + return 0; +} + +static inline bool ice_validate_peer_obj(struct ice_peer_obj *peer) +{ + return true; +} + +#endif /* !CONFIG_MFD_CORE */ + +#endif /* !_ICE_IDC_INT_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_lag.c b/drivers/net/ethernet/intel/ice/ice_lag.c new file mode 100644 index 000000000000..ef36be8fb867 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_lag.c @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +/* Link Aggregation code */ + +#include "ice.h" +#ifdef HAVE_NETDEV_UPPER_INFO +#include "ice_lag.h" + +/** + * ice_lag_set_primary - set PF LAG state as Primary + * @lag: LAG info struct + */ +static void ice_lag_set_primary(struct ice_lag *lag) +{ + struct ice_pf *pf = lag->pf; + + if (!pf) + return; + + if (lag->role != ICE_LAG_UNSET && lag->role != ICE_LAG_BACKUP) { + dev_warn(ice_pf_to_dev(pf), "%s: Attempt to be Primary, but incompatible state.\n", + netdev_name(lag->netdev)); + return; + } + + lag->role = ICE_LAG_PRIMARY; +} + +/** + * ice_lag_set_backup - set PF LAG state to Backup + * @lag: LAG info struct + */ +static void ice_lag_set_backup(struct ice_lag *lag) +{ + struct ice_pf *pf = lag->pf; + + if (!pf) + return; + + if (lag->role != ICE_LAG_UNSET && lag->role != ICE_LAG_PRIMARY) { + dev_dbg(ice_pf_to_dev(pf), "%s: Attempt to be Backup, but incompatible state\n", + netdev_name(lag->netdev)); + return; + } + + lag->role = ICE_LAG_BACKUP; +} + +/** + * ice_display_lag_info - print LAG info + * @lag: LAG info struct + */ +static void ice_display_lag_info(struct ice_lag *lag) +{ + const char *name, *peer, *upper, *role, *bonded, *master; + struct device *dev = &lag->pf->pdev->dev; + + name = lag->netdev ? netdev_name(lag->netdev) : "unset"; + peer = lag->peer_netdev ? netdev_name(lag->peer_netdev) : "unset"; + upper = lag->upper_netdev ? netdev_name(lag->upper_netdev) : "unset"; + master = lag->master ? "TRUE" : "FALSE"; + bonded = lag->bonded ? "BONDED" : "UNBONDED"; + + switch (lag->role) { + case ICE_LAG_NONE: + role = "NONE"; + break; + case ICE_LAG_PRIMARY: + role = "PRIMARY"; + break; + case ICE_LAG_BACKUP: + role = "BACKUP"; + break; + case ICE_LAG_UNSET: + role = "UNSET"; + break; + default: + role = "ERROR"; + } + + dev_dbg(dev, "%s %s, peer:%s, upper:%s, role:%s, master:%s\n", name, + bonded, peer, upper, role, master); +} + +/** + * ice_lag_info_event - handle NETDEV_BONDING_INFO event + * @lag: LAG info struct + * @ptr: opaque data pointer + * + * ptr is to be cast to (netdev_notifier_bonding_info *) + */ +static void ice_lag_info_event(struct ice_lag *lag, void *ptr) +{ + struct net_device *event_netdev, *netdev_tmp; + struct netdev_notifier_bonding_info *info; + struct netdev_bonding_info *bonding_info; + const char *lag_netdev_name; + + event_netdev = netdev_notifier_info_to_dev(ptr); + info = ptr; + lag_netdev_name = netdev_name(lag->netdev); + bonding_info = &info->bonding_info; + + if (event_netdev != lag->netdev || !lag->bonded || !lag->upper_netdev) + return; + + if (bonding_info->master.bond_mode != BOND_MODE_ACTIVEBACKUP) { + netdev_dbg(lag->netdev, "Bonding event recv, but mode not active/backup\n"); + goto lag_out; + } + + if (strcmp(bonding_info->slave.slave_name, lag_netdev_name)) { + netdev_dbg(lag->netdev, "Bonding event recv, but slave info not for us\n"); + goto lag_out; + } + + rcu_read_lock(); + for_each_netdev_in_bond_rcu(lag->upper_netdev, netdev_tmp) { + if (!netif_is_ice(netdev_tmp)) + continue; + + if (netdev_tmp && netdev_tmp != lag->netdev && + lag->peer_netdev != netdev_tmp) { + dev_hold(netdev_tmp); + lag->peer_netdev = netdev_tmp; + } + } + rcu_read_unlock(); + + if (bonding_info->slave.state) + ice_lag_set_backup(lag); + else + ice_lag_set_primary(lag); + +lag_out: + ice_display_lag_info(lag); +} + +/** + * ice_lag_link - handle LAG link event + * @lag: LAG info struct + * @info: info from the netdev notifier + */ +static void +ice_lag_link(struct ice_lag *lag, struct netdev_notifier_changeupper_info *info) +{ + struct net_device 
*netdev_tmp, *upper = info->upper_dev; + struct ice_pf *pf = lag->pf; + int peers = 0; + + + if (lag->bonded) + dev_warn(ice_pf_to_dev(pf), "%s Already part of a bond\n", + netdev_name(lag->netdev)); + + rcu_read_lock(); + for_each_netdev_in_bond_rcu(upper, netdev_tmp) + peers++; + rcu_read_unlock(); + + if (lag->upper_netdev != upper) { + dev_hold(upper); + lag->upper_netdev = upper; + } + + lag->bonded = true; + lag->role = ICE_LAG_UNSET; + + /* if this is the first element in an LAG mark as master */ + lag->master = !!(peers == 1); +} + +/** + * ice_lag_unlink - handle unlink event + * @lag: LAG info struct + * @info: info from netdev notification + */ +static void +ice_lag_unlink(struct ice_lag *lag, + struct netdev_notifier_changeupper_info *info) +{ + struct net_device *netdev_tmp, *upper = info->upper_dev; + bool found = false; + + if (!lag->bonded) { + netdev_dbg(lag->netdev, "bonding unlink event on non-LAG netdev\n"); + return; + } + + /* determine if we are in the new LAG config or not */ + rcu_read_lock(); + for_each_netdev_in_bond_rcu(upper, netdev_tmp) { + if (netdev_tmp == lag->netdev) { + found = true; + break; + } + } + rcu_read_unlock(); + + if (found) + return; + + if (lag->upper_netdev) { + dev_put(lag->upper_netdev); + lag->upper_netdev = NULL; + } + + if (lag->peer_netdev) { + dev_put(lag->peer_netdev); + lag->peer_netdev = NULL; + } + + lag->bonded = false; + lag->role = ICE_LAG_NONE; +} + +/** + * ice_lag_changeupper_event - handle LAG changeupper event + * @lag: LAG info struct + * @ptr: opaque pointer data + * + * ptr is to be cast into netdev_notifier_changeupper_info + */ +static void ice_lag_changeupper_event(struct ice_lag *lag, void *ptr) +{ + struct netdev_notifier_changeupper_info *info; + struct net_device *netdev; + + info = ptr; + netdev = netdev_notifier_info_to_dev(ptr); + + /* not for this netdev */ + if (netdev != lag->netdev) + return; + + if (!info->upper_dev) { + netdev_dbg(netdev, "changeupper rcvd, but no upper defined\n"); + return; + } + + netdev_dbg(netdev, "bonding %s\n", info->linking ? "LINK" : "UNLINK"); + + if (!netif_is_lag_master(info->upper_dev)) { + netdev_dbg(netdev, "changeupper rcvd, but not master. bail\n"); + return; + } + + if (info->linking) + ice_lag_link(lag, info); + else + ice_lag_unlink(lag, info); + + ice_display_lag_info(lag); +} + +/** + * ice_lag_changelower_event - handle LAG changelower event + * @lag: LAG info struct + * @ptr: opaque data pointer + * + * ptr to be cast to netdev_notifier_changelowerstate_info + */ +static void ice_lag_changelower_event(struct ice_lag *lag, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + if (netdev != lag->netdev) + return; + + netdev_dbg(netdev, "bonding info\n"); + + if (!netif_is_lag_port(netdev)) { + netdev_dbg(netdev, "CHANGELOWER rcvd, but netdev not in LAG. 
Bail\n"); + return; + } + +} + +/** + * ice_lag_event_handler - handle LAG events from netdev + * @notif_blk: notifier block registered by this netdev + * @event: event type + * @ptr: opaque data containing notifier event + */ +static int +ice_lag_event_handler(struct notifier_block *notif_blk, unsigned long event, + void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct ice_lag *lag; + + lag = container_of(notif_blk, struct ice_lag, notif_block); + + if (!lag->netdev) + return NOTIFY_DONE; + + /* Check that the netdev is in the working namespace */ + if (!net_eq(dev_net(netdev), &init_net)) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_CHANGEUPPER: + ice_lag_changeupper_event(lag, ptr); + break; + case NETDEV_CHANGELOWERSTATE: + ice_lag_changelower_event(lag, ptr); + break; + case NETDEV_BONDING_INFO: + ice_lag_info_event(lag, ptr); + break; + default: + break; + } + + return NOTIFY_DONE; +} + +/** + * ice_register_lag_handler - register LAG handler on netdev + * @lag: LAG struct + */ +static int ice_register_lag_handler(struct ice_lag *lag) +{ + struct device *dev = ice_pf_to_dev(lag->pf); + struct notifier_block *notif_blk; + + notif_blk = &lag->notif_block; + + if (!notif_blk->notifier_call) { + notif_blk->notifier_call = ice_lag_event_handler; + if (register_netdevice_notifier(notif_blk)) { + notif_blk->notifier_call = NULL; + dev_err(dev, "FAIL register LAG event handler!\n"); + return -EINVAL; + } + dev_dbg(dev, "LAG event handler registered\n"); + } + return 0; +} + +/** + * ice_unregister_lag_handler - unregister LAG handler on netdev + * @lag: LAG struct + */ +static void ice_unregister_lag_handler(struct ice_lag *lag) +{ + struct device *dev = ice_pf_to_dev(lag->pf); + struct notifier_block *notif_blk; + + notif_blk = &lag->notif_block; + if (notif_blk->notifier_call) { + unregister_netdevice_notifier(notif_blk); + dev_dbg(dev, "LAG event handler unregistered\n"); + } +} + +/** + * ice_init_lag - initialize support for LAG + * @pf: PF struct + * + * Alloc memory for LAG structs and initialize the elements. 
+ * Memory will be freed in ice_deinit_lag + */ +int ice_init_lag(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_lag *lag; + struct ice_vsi *vsi; + int err; + + pf->lag = kzalloc(sizeof(*lag), GFP_KERNEL); + if (!pf->lag) + return -ENOMEM; + lag = pf->lag; + + vsi = ice_get_main_vsi(pf); + if (!vsi) { + dev_err(dev, "couldn't get main vsi, link aggregation init fail\n"); + err = -EIO; + goto lag_error; + } + + lag->pf = pf; + lag->netdev = vsi->netdev; + lag->role = ICE_LAG_NONE; + lag->bonded = false; + lag->peer_netdev = NULL; + lag->upper_netdev = NULL; + lag->notif_block.notifier_call = NULL; + + err = ice_register_lag_handler(lag); + if (err) { + dev_warn(dev, "INIT LAG: Failed to register event handler\n"); + goto lag_error; + } + + ice_display_lag_info(lag); + + dev_dbg(dev, "INIT LAG complete\n"); + return 0; + +lag_error: + kfree(lag); + pf->lag = NULL; + return err; +} + +/** + * ice_deinit_lag - Clean up LAG + * @pf: PF struct + * + * Clean up kernel LAG info and free memory + * This function is meant to only be called on driver remove/shutdown + */ +void ice_deinit_lag(struct ice_pf *pf) +{ + struct ice_lag *lag; + + lag = pf->lag; + + if (!lag) + return; + + if (lag->pf) + ice_unregister_lag_handler(lag); + + if (lag->upper_netdev) + dev_put(lag->upper_netdev); + + if (lag->peer_netdev) + dev_put(lag->peer_netdev); + + kfree(lag); + + pf->lag = NULL; +} +#endif /* HAVE_NETDEV_UPPER_INFO */ diff --git a/drivers/net/ethernet/intel/ice/ice_lag.h b/drivers/net/ethernet/intel/ice/ice_lag.h new file mode 100644 index 000000000000..1603c0f973c9 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_lag.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_LAG_H_ +#define _ICE_LAG_H_ +#ifdef HAVE_NETDEV_UPPER_INFO + +#include + +/* LAG roles for netdev */ +enum ice_lag_role { + ICE_LAG_NONE, + ICE_LAG_PRIMARY, + ICE_LAG_BACKUP, + ICE_LAG_UNSET +}; + +struct ice_pf; + +/* LAG info struct */ +struct ice_lag { + struct ice_pf *pf; /* backlink to PF struct */ + struct net_device *netdev; /* this PF's netdev */ + struct net_device *peer_netdev; + struct net_device *upper_netdev; /* upper bonding netdev */ + struct notifier_block notif_block; + u8 bonded:1; /* currently bonded */ + u8 master:1; /* this is a master */ + u8 handler:1; /* did we register a rx_netdev_handler */ + /* each thing blocking bonding will increment this value by one. + * If this value is zero, then bonding is allowed. + */ + u16 dis_lag; + u8 role; +}; + +int ice_init_lag(struct ice_pf *pf); +void ice_deinit_lag(struct ice_pf *pf); +#endif /* HAVE_NETDEV_UPPER_INFO */ +#endif /* _ICE_LAG_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h index 2aac8f13daeb..72adcf278824 100644 --- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h +++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h @@ -1,8 +1,33 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ #ifndef _ICE_LAN_TX_RX_H_ #define _ICE_LAN_TX_RX_H_ +#include "ice_osdep.h" + +/* Rx Descriptors */ +union ice_16byte_rx_desc { + struct { + __le64 pkt_addr; /* Packet buffer address */ + __le64 hdr_addr; /* Header buffer address */ + } read; + struct { + struct { + struct { + __le16 mirroring_status; + __le16 l2tag1; + } lo_dword; + union { + __le32 rss; /* RSS Hash */ + __le32 fd_id; /* Flow Director filter ID */ + } hi_dword; + } qword0; + struct { + /* ext status/error/PTYPE/length */ + __le64 status_error_len; + } qword1; + } wb; /* writeback */ +}; union ice_32byte_rx_desc { struct { @@ -40,8 +65,240 @@ union ice_32byte_rx_desc { } wb; /* writeback */ }; +struct ice_fltr_desc { + __le64 qidx_compq_space_stat; + __le64 dtype_cmd_vsi_fdid; +}; + +#define ICE_FXD_FLTR_QW0_QINDEX_S 0 +#define ICE_FXD_FLTR_QW0_QINDEX_M (0x7FFULL << ICE_FXD_FLTR_QW0_QINDEX_S) +#define ICE_FXD_FLTR_QW0_COMP_Q_S 11 +#define ICE_FXD_FLTR_QW0_COMP_Q_M BIT_ULL(ICE_FXD_FLTR_QW0_COMP_Q_S) +#define ICE_FXD_FLTR_QW0_COMP_Q_ZERO 0x0ULL +#define ICE_FXD_FLTR_QW0_COMP_Q_QINDX 0x1ULL + +#define ICE_FXD_FLTR_QW0_COMP_REPORT_S 12 +#define ICE_FXD_FLTR_QW0_COMP_REPORT_M \ + (0x3ULL << ICE_FXD_FLTR_QW0_COMP_REPORT_S) +#define ICE_FXD_FLTR_QW0_COMP_REPORT_NONE 0x0ULL +#define ICE_FXD_FLTR_QW0_COMP_REPORT_SW_FAIL 0x1ULL +#define ICE_FXD_FLTR_QW0_COMP_REPORT_SW 0x2ULL + +#define ICE_FXD_FLTR_QW0_FD_SPACE_S 14 +#define ICE_FXD_FLTR_QW0_FD_SPACE_M (0x3ULL << ICE_FXD_FLTR_QW0_FD_SPACE_S) +#define ICE_FXD_FLTR_QW0_FD_SPACE_GUAR 0x0ULL +#define ICE_FXD_FLTR_QW0_FD_SPACE_BEST_EFFORT 0x1ULL +#define ICE_FXD_FLTR_QW0_FD_SPACE_GUAR_BEST 0x2ULL +#define ICE_FXD_FLTR_QW0_FD_SPACE_BEST_GUAR 0x3ULL + +#define ICE_FXD_FLTR_QW0_STAT_CNT_S 16 +#define ICE_FXD_FLTR_QW0_STAT_CNT_M \ + (0x1FFFULL << ICE_FXD_FLTR_QW0_STAT_CNT_S) +#define ICE_FXD_FLTR_QW0_STAT_ENA_S 29 +#define ICE_FXD_FLTR_QW0_STAT_ENA_M (0x3ULL << ICE_FXD_FLTR_QW0_STAT_ENA_S) +#define ICE_FXD_FLTR_QW0_STAT_ENA_NONE 0x0ULL +#define ICE_FXD_FLTR_QW0_STAT_ENA_PKTS 0x1ULL +#define ICE_FXD_FLTR_QW0_STAT_ENA_BYTES 0x2ULL +#define ICE_FXD_FLTR_QW0_STAT_ENA_PKTS_BYTES 0x3ULL + +#define ICE_FXD_FLTR_QW0_EVICT_ENA_S 31 +#define ICE_FXD_FLTR_QW0_EVICT_ENA_M BIT_ULL(ICE_FXD_FLTR_QW0_EVICT_ENA_S) +#define ICE_FXD_FLTR_QW0_EVICT_ENA_FALSE 0x0ULL +#define ICE_FXD_FLTR_QW0_EVICT_ENA_TRUE 0x1ULL + +#define ICE_FXD_FLTR_QW0_TO_Q_S 32 +#define ICE_FXD_FLTR_QW0_TO_Q_M (0x7ULL << ICE_FXD_FLTR_QW0_TO_Q_S) +#define ICE_FXD_FLTR_QW0_TO_Q_EQUALS_QINDEX 0x0ULL + +#define ICE_FXD_FLTR_QW0_TO_Q_PRI_S 35 +#define ICE_FXD_FLTR_QW0_TO_Q_PRI_M (0x7ULL << ICE_FXD_FLTR_QW0_TO_Q_PRI_S) +#define ICE_FXD_FLTR_QW0_TO_Q_PRIO1 0x1ULL + +#define ICE_FXD_FLTR_QW0_DPU_RECIPE_S 38 +#define ICE_FXD_FLTR_QW0_DPU_RECIPE_M \ + (0x3ULL << ICE_FXD_FLTR_QW0_DPU_RECIPE_S) +#define ICE_FXD_FLTR_QW0_DPU_RECIPE_DFLT 0x0ULL + +#define ICE_FXD_FLTR_QW0_DROP_S 40 +#define ICE_FXD_FLTR_QW0_DROP_M BIT_ULL(ICE_FXD_FLTR_QW0_DROP_S) +#define ICE_FXD_FLTR_QW0_DROP_NO 0x0ULL +#define ICE_FXD_FLTR_QW0_DROP_YES 0x1ULL + +#define ICE_FXD_FLTR_QW0_FLEX_PRI_S 41 +#define ICE_FXD_FLTR_QW0_FLEX_PRI_M (0x7ULL << ICE_FXD_FLTR_QW0_FLEX_PRI_S) +#define ICE_FXD_FLTR_QW0_FLEX_PRI_NONE 0x0ULL + +#define ICE_FXD_FLTR_QW0_FLEX_MDID_S 44 +#define ICE_FXD_FLTR_QW0_FLEX_MDID_M (0xFULL << ICE_FXD_FLTR_QW0_FLEX_MDID_S) +#define ICE_FXD_FLTR_QW0_FLEX_MDID0 0x0ULL + +#define ICE_FXD_FLTR_QW0_FLEX_VAL_S 48 +#define ICE_FXD_FLTR_QW0_FLEX_VAL_M \ + (0xFFFFULL << ICE_FXD_FLTR_QW0_FLEX_VAL_S) +#define ICE_FXD_FLTR_QW0_FLEX_VAL0 0x0ULL + +#define 
ICE_FXD_FLTR_QW1_DTYPE_S 0 +#define ICE_FXD_FLTR_QW1_DTYPE_M (0xFULL << ICE_FXD_FLTR_QW1_DTYPE_S) +#define ICE_FXD_FLTR_QW1_PCMD_S 4 +#define ICE_FXD_FLTR_QW1_PCMD_M BIT_ULL(ICE_FXD_FLTR_QW1_PCMD_S) +#define ICE_FXD_FLTR_QW1_PCMD_ADD 0x0ULL +#define ICE_FXD_FLTR_QW1_PCMD_REMOVE 0x1ULL + +#define ICE_FXD_FLTR_QW1_PROF_PRI_S 5 +#define ICE_FXD_FLTR_QW1_PROF_PRI_M (0x7ULL << ICE_FXD_FLTR_QW1_PROF_PRI_S) +#define ICE_FXD_FLTR_QW1_PROF_PRIO_ZERO 0x0ULL + +#define ICE_FXD_FLTR_QW1_PROF_S 8 +#define ICE_FXD_FLTR_QW1_PROF_M (0x3FULL << ICE_FXD_FLTR_QW1_PROF_S) +#define ICE_FXD_FLTR_QW1_PROF_ZERO 0x0ULL + +#define ICE_FXD_FLTR_QW1_FD_VSI_S 14 +#define ICE_FXD_FLTR_QW1_FD_VSI_M (0x3FFULL << ICE_FXD_FLTR_QW1_FD_VSI_S) +#define ICE_FXD_FLTR_QW1_SWAP_S 24 +#define ICE_FXD_FLTR_QW1_SWAP_M BIT_ULL(ICE_FXD_FLTR_QW1_SWAP_S) +#define ICE_FXD_FLTR_QW1_SWAP_NOT_SET 0x0ULL +#define ICE_FXD_FLTR_QW1_SWAP_SET 0x1ULL + +#define ICE_FXD_FLTR_QW1_FDID_PRI_S 25 +#define ICE_FXD_FLTR_QW1_FDID_PRI_M (0x7ULL << ICE_FXD_FLTR_QW1_FDID_PRI_S) +#define ICE_FXD_FLTR_QW1_FDID_PRI_ZERO 0x0ULL +#define ICE_FXD_FLTR_QW1_FDID_PRI_ONE 0x1ULL +#define ICE_FXD_FLTR_QW1_FDID_PRI_THREE 0x3ULL + +#define ICE_FXD_FLTR_QW1_FDID_MDID_S 28 +#define ICE_FXD_FLTR_QW1_FDID_MDID_M (0xFULL << ICE_FXD_FLTR_QW1_FDID_MDID_S) +#define ICE_FXD_FLTR_QW1_FDID_MDID_FD 0x05ULL + +#define ICE_FXD_FLTR_QW1_FDID_S 32 +#define ICE_FXD_FLTR_QW1_FDID_M \ + (0xFFFFFFFFULL << ICE_FXD_FLTR_QW1_FDID_S) +#define ICE_FXD_FLTR_QW1_FDID_ZERO 0x0ULL + +/* definition for FD filter programming status descriptor WB format */ +#define ICE_FXD_FLTR_WB_QW0_BUKT_LEN_S 28 +#define ICE_FXD_FLTR_WB_QW0_BUKT_LEN_M \ + (0xFULL << ICE_FXD_FLTR_WB_QW0_BUKT_LEN_S) + +#define ICE_FXD_FLTR_WB_QW0_FLTR_STAT_S 32 +#define ICE_FXD_FLTR_WB_QW0_FLTR_STAT_M \ + (0xFFFFFFFFULL << ICE_FXD_FLTR_WB_QW0_FLTR_STAT_S) + +#define ICE_FXD_FLTR_WB_QW1_DD_S 0 +#define ICE_FXD_FLTR_WB_QW1_DD_M (0x1ULL << ICE_FXD_FLTR_WB_QW1_DD_S) +#define ICE_FXD_FLTR_WB_QW1_DD_YES 0x1ULL + +#define ICE_FXD_FLTR_WB_QW1_PROG_ID_S 1 +#define ICE_FXD_FLTR_WB_QW1_PROG_ID_M \ + (0x3ULL << ICE_FXD_FLTR_WB_QW1_PROG_ID_S) +#define ICE_FXD_FLTR_WB_QW1_PROG_ADD 0x0ULL +#define ICE_FXD_FLTR_WB_QW1_PROG_DEL 0x1ULL + +#define ICE_FXD_FLTR_WB_QW1_FAIL_S 4 +#define ICE_FXD_FLTR_WB_QW1_FAIL_M (0x1ULL << ICE_FXD_FLTR_WB_QW1_FAIL_S) +#define ICE_FXD_FLTR_WB_QW1_FAIL_YES 0x1ULL + +#define ICE_FXD_FLTR_WB_QW1_FAIL_PROF_S 5 +#define ICE_FXD_FLTR_WB_QW1_FAIL_PROF_M \ + (0x1ULL << ICE_FXD_FLTR_WB_QW1_FAIL_PROF_S) +#define ICE_FXD_FLTR_WB_QW1_FAIL_PROF_YES 0x1ULL + +#define ICE_FXD_FLTR_WB_QW1_FLT_ADDR_S 8 +#define ICE_FXD_FLTR_WB_QW1_FLT_ADDR_M \ + (0x3FFFULL << ICE_FXD_FLTR_WB_QW1_FLT_ADDR_S) + +#define ICE_FXD_FLTR_WB_QW1_PKT_PROF_S 28 +#define ICE_FXD_FLTR_WB_QW1_PKT_PROF_M \ + (0x7FULL << ICE_FXD_FLTR_WB_QW1_PKT_PROF_S) + +#define ICE_FXD_FLTR_WB_QW1_BUKT_HASH_S 38 +#define ICE_FXD_FLTR_WB_QW1_BUKT_HASH_M \ + (0x3FFFFFF << ICE_FXD_FLTR_WB_QW1_BUKT_HASH_S) + +#define ICE_FXD_FLTR_WB_QW1_FAIL_PROF_M \ + (0x1ULL << ICE_FXD_FLTR_WB_QW1_FAIL_PROF_S) +#define ICE_FXD_FLTR_WB_QW1_FAIL_PROF_YES 0x1ULL + + +enum ice_rx_desc_status_bits { + /* Note: These are predefined bit offsets */ + ICE_RX_DESC_STATUS_DD_S = 0, + ICE_RX_DESC_STATUS_EOF_S = 1, + ICE_RX_DESC_STATUS_L2TAG1P_S = 2, + ICE_RX_DESC_STATUS_L3L4P_S = 3, + ICE_RX_DESC_STATUS_CRCP_S = 4, + ICE_RX_DESC_STATUS_TSYNINDX_S = 5, + ICE_RX_DESC_STATUS_TSYNVALID_S = 7, + ICE_RX_DESC_STATUS_EXT_UDP_0_S = 8, + ICE_RX_DESC_STATUS_UMBCAST_S = 9, + ICE_RX_DESC_STATUS_FLM_S = 11, + 
ICE_RX_DESC_STATUS_FLTSTAT_S = 12, + ICE_RX_DESC_STATUS_LPBK_S = 14, + ICE_RX_DESC_STATUS_IPV6EXADD_S = 15, + ICE_RX_DESC_STATUS_RESERVED2_S = 16, + ICE_RX_DESC_STATUS_INT_UDP_0_S = 18, + ICE_RX_DESC_STATUS_LAST /* this entry must be last!!! */ +}; + +#define ICE_RXD_QW1_STATUS_S 0 +#define ICE_RXD_QW1_STATUS_M ((BIT(ICE_RX_DESC_STATUS_LAST) - 1) << \ + ICE_RXD_QW1_STATUS_S) + +#define ICE_RXD_QW1_STATUS_TSYNINDX_S ICE_RX_DESC_STATUS_TSYNINDX_S +#define ICE_RXD_QW1_STATUS_TSYNINDX_M (0x3UL << ICE_RXD_QW1_STATUS_TSYNINDX_S) + +#define ICE_RXD_QW1_STATUS_TSYNVALID_S ICE_RX_DESC_STATUS_TSYNVALID_S +#define ICE_RXD_QW1_STATUS_TSYNVALID_M BIT_ULL(ICE_RXD_QW1_STATUS_TSYNVALID_S) + + +enum ice_rx_desc_fltstat_values { + ICE_RX_DESC_FLTSTAT_NO_DATA = 0, + ICE_RX_DESC_FLTSTAT_RSV_FD_ID = 1, /* 16byte desc? FD_ID : RSV */ + ICE_RX_DESC_FLTSTAT_RSV = 2, + ICE_RX_DESC_FLTSTAT_RSS_HASH = 3, +}; + + +#define ICE_RXD_QW1_ERROR_S 19 +#define ICE_RXD_QW1_ERROR_M (0xFFUL << ICE_RXD_QW1_ERROR_S) + +enum ice_rx_desc_error_bits { + /* Note: These are predefined bit offsets */ + ICE_RX_DESC_ERROR_RXE_S = 0, + ICE_RX_DESC_ERROR_RECIPE_S = 1, + ICE_RX_DESC_ERROR_HBO_S = 2, + ICE_RX_DESC_ERROR_L3L4E_S = 3, /* 3 BITS */ + ICE_RX_DESC_ERROR_IPE_S = 3, + ICE_RX_DESC_ERROR_L4E_S = 4, + ICE_RX_DESC_ERROR_EIPE_S = 5, + ICE_RX_DESC_ERROR_OVERSIZE_S = 6, + ICE_RX_DESC_ERROR_PPRS_S = 7 +}; + +enum ice_rx_desc_error_l3l4e_masks { + ICE_RX_DESC_ERROR_L3L4E_NONE = 0, + ICE_RX_DESC_ERROR_L3L4E_PROT = 1, +}; + +#define ICE_RXD_QW1_PTYPE_S 30 +#define ICE_RXD_QW1_PTYPE_M (0xFFULL << ICE_RXD_QW1_PTYPE_S) + +/* Packet type non-ip values */ +enum ice_rx_l2_ptype { + ICE_RX_PTYPE_L2_RESERVED = 0, + ICE_RX_PTYPE_L2_MAC_PAY2 = 1, + ICE_RX_PTYPE_L2_TIMESYNC_PAY2 = 2, + ICE_RX_PTYPE_L2_FIP_PAY2 = 3, + ICE_RX_PTYPE_L2_OUI_PAY2 = 4, + ICE_RX_PTYPE_L2_MACCNTRL_PAY2 = 5, + ICE_RX_PTYPE_L2_LLDP_PAY2 = 6, + ICE_RX_PTYPE_L2_ECP_PAY2 = 7, + ICE_RX_PTYPE_L2_EVB_PAY2 = 8, + ICE_RX_PTYPE_L2_QCN_PAY2 = 9, + ICE_RX_PTYPE_L2_EAPOL_PAY2 = 10, + ICE_RX_PTYPE_L2_ARP = 11, +}; + struct ice_rx_ptype_decoded { - u32 ptype:10; u32 known:1; u32 outer_ip:1; u32 outer_ip_ver:2; @@ -99,10 +356,74 @@ enum ice_rx_ptype_payload_layer { ICE_RX_PTYPE_PAYLOAD_LAYER_PAY4 = 3, }; -/* Rx Flex Descriptor - * This descriptor is used instead of the legacy version descriptor when + +#define ICE_RXD_QW1_LEN_PBUF_S 38 +#define ICE_RXD_QW1_LEN_PBUF_M (0x3FFFULL << ICE_RXD_QW1_LEN_PBUF_S) + +#define ICE_RXD_QW1_LEN_HBUF_S 52 +#define ICE_RXD_QW1_LEN_HBUF_M (0x7FFULL << ICE_RXD_QW1_LEN_HBUF_S) + +#define ICE_RXD_QW1_LEN_SPH_S 63 +#define ICE_RXD_QW1_LEN_SPH_M BIT_ULL(ICE_RXD_QW1_LEN_SPH_S) + + +enum ice_rx_desc_ext_status_bits { + /* Note: These are predefined bit offsets */ + ICE_RX_DESC_EXT_STATUS_L2TAG2P_S = 0, + ICE_RX_DESC_EXT_STATUS_L2TAG3P_S = 1, + ICE_RX_DESC_EXT_STATUS_FLEXBL_S = 2, + ICE_RX_DESC_EXT_STATUS_FLEXBH_S = 4, + ICE_RX_DESC_EXT_STATUS_FDLONGB_S = 9, + ICE_RX_DESC_EXT_STATUS_PELONGB_S = 11, +}; + + +enum ice_rx_desc_pe_status_bits { + /* Note: These are predefined bit offsets */ + ICE_RX_DESC_PE_STATUS_QPID_S = 0, /* 18 BITS */ + ICE_RX_DESC_PE_STATUS_L4PORT_S = 0, /* 16 BITS */ + ICE_RX_DESC_PE_STATUS_IPINDEX_S = 16, /* 8 BITS */ + ICE_RX_DESC_PE_STATUS_QPIDHIT_S = 24, + ICE_RX_DESC_PE_STATUS_APBVTHIT_S = 25, + ICE_RX_DESC_PE_STATUS_PORTV_S = 26, + ICE_RX_DESC_PE_STATUS_URG_S = 27, + ICE_RX_DESC_PE_STATUS_IPFRAG_S = 28, + ICE_RX_DESC_PE_STATUS_IPOPT_S = 29 +}; + +#define ICE_RX_PROG_STATUS_DESC_LEN_S 38 +#define ICE_RX_PROG_STATUS_DESC_LEN 0x2000000 + 
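/* Illustrative sketch (editorial example, not from the driver sources): how a
 * consumer of this header might pull the status, error and ptype fields out
 * of a legacy Rx descriptor qword1 using the shift/mask macros defined above.
 * The helper name and parameter names here are hypothetical.
 */
static inline void
ice_example_parse_legacy_qw1(u64 qword1, u32 *status, u8 *error, u16 *ptype)
{
	/* 19 valid status bits starting at bit 0 */
	*status = (u32)((qword1 & ICE_RXD_QW1_STATUS_M) >> ICE_RXD_QW1_STATUS_S);
	/* 8 error bits starting at bit 19 */
	*error = (u8)((qword1 & ICE_RXD_QW1_ERROR_M) >> ICE_RXD_QW1_ERROR_S);
	/* packet type field starting at bit 30 */
	*ptype = (u16)((qword1 & ICE_RXD_QW1_PTYPE_M) >> ICE_RXD_QW1_PTYPE_S);
}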
+#define ICE_RX_PROG_STATUS_DESC_QW1_PROGID_S 2 +#define ICE_RX_PROG_STATUS_DESC_QW1_PROGID_M \ + (0x7UL << ICE_RX_PROG_STATUS_DESC_QW1_PROGID_S) + + +#define ICE_RX_PROG_STATUS_DESC_QW1_ERROR_S 19 +#define ICE_RX_PROG_STATUS_DESC_QW1_ERROR_M \ + (0x3FUL << ICE_RX_PROG_STATUS_DESC_QW1_ERROR_S) + +enum ice_rx_prog_status_desc_status_bits { + /* Note: These are predefined bit offsets */ + ICE_RX_PROG_STATUS_DESC_DD_S = 0, + ICE_RX_PROG_STATUS_DESC_PROG_ID_S = 2 /* 3 BITS */ +}; + +enum ice_rx_prog_status_desc_prog_id_masks { + ICE_RX_PROG_STATUS_DESC_FD_FLTR_STATUS = 1, +}; + +enum ice_rx_prog_status_desc_error_bits { + /* Note: These are predefined bit offsets */ + ICE_RX_PROG_STATUS_DESC_FD_TBL_FULL_S = 0, + ICE_RX_PROG_STATUS_DESC_NO_FD_ENTRY_S = 1, +}; + +/* Rx Flex Descriptors + * These descriptors are used instead of the legacy version descriptors when * ice_rlan_ctx.adv_desc is set */ + union ice_32b_rx_flex_desc { struct { __le64 pkt_addr; /* Packet buffer address */ @@ -148,8 +469,12 @@ union ice_32b_rx_flex_desc { }; /* Rx Flex Descriptor NIC Profile - * This descriptor corresponds to RxDID 2 which contains - * metadata fields for RSS, flow ID and timestamp info + * RxDID Profile ID 2 + * Flex-field 0: RSS hash lower 16-bits + * Flex-field 1: RSS hash upper 16-bits + * Flex-field 2: Flow ID lower 16-bits + * Flex-field 3: Flow ID higher 16-bits + * Flex-field 4: reserved, VLAN ID taken from L2Tag */ struct ice_32b_rx_flex_desc_nic { /* Qword 0 */ @@ -175,13 +500,149 @@ struct ice_32b_rx_flex_desc_nic { __le32 flow_id; union { struct { - __le16 vlan_id; + __le16 rsvd; __le16 flow_id_ipv6; } flex; __le32 ts_high; } flex_ts; }; +/* Rx Flex Descriptor Switch Profile + * RxDID Profile ID 3 + * Flex-field 0: Source VSI + */ +struct ice_32b_rx_flex_desc_sw { + /* Qword 0 */ + u8 rxdid; + u8 mir_id_umb_cast; + __le16 ptype_flexi_flags0; + __le16 pkt_len; + __le16 hdr_len_sph_flex_flags1; + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le16 src_vsi; /* [10:15] are reserved */ + __le16 flex_md1_rsvd; + + /* Qword 2 */ + __le16 status_error1; + u8 flex_flags2; + u8 ts_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le32 rsvd; /* flex words 2-3 are reserved */ + __le32 ts_high; +}; + +/* Rx Flex Descriptor NIC VEB Profile + * RxDID Profile ID 4 + * Flex-field 0: Destination VSI + */ +struct ice_32b_rx_flex_desc_nic_veb_dbg { + /* Qword 0 */ + u8 rxdid; + u8 mir_id_umb_cast; + __le16 ptype_flexi_flags0; + __le16 pkt_len; + __le16 hdr_len_sph_flex_flags1; + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le16 dst_vsi; /* [0:12]: destination VSI */ + /* 13: VSI valid bit */ + /* [14:15] are reserved */ + __le16 flex_field_1; + + /* Qword 2 */ + __le16 status_error1; + u8 flex_flags2; + u8 ts_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le32 rsvd; /* flex words 2-3 are reserved */ + __le32 ts_high; +}; + +/* Rx Flex Descriptor NIC ACL Profile + * RxDID Profile ID 5 + * Flex-field 0: ACL Counter 0 + * Flex-field 1: ACL Counter 1 + * Flex-field 2: ACL Counter 2 + */ +struct ice_32b_rx_flex_desc_nic_acl_dbg { + /* Qword 0 */ + u8 rxdid; + u8 mir_id_umb_cast; + __le16 ptype_flexi_flags0; + __le16 pkt_len; + __le16 hdr_len_sph_flex_flags1; + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le16 acl_ctr0; + __le16 acl_ctr1; + + /* Qword 2 */ + __le16 status_error1; + u8 flex_flags2; + u8 ts_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le16 acl_ctr2; + __le16 rsvd; /* flex words 2-3 are reserved 
*/ + __le32 ts_high; +}; + +/* Rx Flex Descriptor NIC Profile + * RxDID Profile ID 6 + * Flex-field 0: RSS hash lower 16-bits + * Flex-field 1: RSS hash upper 16-bits + * Flex-field 2: Flow ID lower 16-bits + * Flex-field 3: Source VSI + * Flex-field 4: reserved, VLAN ID taken from L2Tag + */ +struct ice_32b_rx_flex_desc_nic_2 { + /* Qword 0 */ + u8 rxdid; + u8 mir_id_umb_cast; + __le16 ptype_flexi_flags0; + __le16 pkt_len; + __le16 hdr_len_sph_flex_flags1; + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le32 rss_hash; + + /* Qword 2 */ + __le16 status_error1; + u8 flexi_flags2; + u8 ts_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le16 flow_id; + __le16 src_vsi; + union { + struct { + __le16 rsvd; + __le16 flow_id_ipv6; + } flex; + __le32 ts_high; + } flex_ts; +}; + + /* Receive Flex Descriptor profile IDs: There are a total * of 64 profiles where profile IDs 0/1 are for legacy; and * profiles 2-63 are flex profiles that can be programmed @@ -191,27 +652,72 @@ enum ice_rxdid { ICE_RXDID_LEGACY_0 = 0, ICE_RXDID_LEGACY_1 = 1, ICE_RXDID_FLEX_NIC = 2, + ICE_RXDID_FLEX_NIC_VEB_DBG = 4, + ICE_RXDID_FLEX_NIC_ACL_DBG = 5, ICE_RXDID_FLEX_NIC_2 = 6, ICE_RXDID_HW = 7, ICE_RXDID_LAST = 63, }; +/* Recceive Flex descriptor Dword Index */ +enum ice_flex_word { + ICE_RX_FLEX_DWORD_0 = 0, + ICE_RX_FLEX_DWORD_1, + ICE_RX_FLEX_DWORD_2, + ICE_RX_FLEX_DWORD_3, + ICE_RX_FLEX_DWORD_4, + ICE_RX_FLEX_DWORD_5 +}; + /* Receive Flex Descriptor Rx opcode values */ -#define ICE_RX_OPC_MDID 0x01 +enum ice_flex_opcode { + ICE_RX_OPC_DEBUG = 0, + ICE_RX_OPC_MDID, + ICE_RX_OPC_EXTRACT, + ICE_RX_OPC_PROTID +}; + +/* Receive Descriptor MDID values that access packet flags */ +enum ice_flex_mdid_pkt_flags { + ICE_RX_MDID_PKT_FLAGS_15_0 = 20, + ICE_RX_MDID_PKT_FLAGS_31_16, + ICE_RX_MDID_PKT_FLAGS_47_32, + ICE_RX_MDID_PKT_FLAGS_63_48, +}; -/* Receive Descriptor MDID values */ -enum ice_flex_rx_mdid { - ICE_RX_MDID_FLOW_ID_LOWER = 5, - ICE_RX_MDID_FLOW_ID_HIGH, - ICE_RX_MDID_SRC_VSI = 19, - ICE_RX_MDID_HASH_LOW = 56, - ICE_RX_MDID_HASH_HIGH, +/* Generic descriptor MDID values */ +enum ice_flex_mdid { + ICE_MDID_GENERIC_WORD_0, + ICE_MDID_GENERIC_WORD_1, + ICE_MDID_GENERIC_WORD_2, + ICE_MDID_GENERIC_WORD_3, + ICE_MDID_GENERIC_WORD_4, + ICE_MDID_FLOW_ID_LOWER, + ICE_MDID_FLOW_ID_HIGH, + ICE_MDID_RX_DESCR_PROF_IDX, + ICE_MDID_RX_PKT_DROP, + ICE_MDID_RX_DST_Q = 12, + ICE_MDID_RX_DST_VSI, + ICE_MDID_SRC_VSI = 19, + ICE_MDID_ACL_NOP = 55, + /* Entry 56 */ + ICE_MDID_RX_HASH_LOW, + ICE_MDID_ACL_CNTR_PKT = ICE_MDID_RX_HASH_LOW, + /* Entry 57 */ + ICE_MDID_RX_HASH_HIGH, + ICE_MDID_ACL_CNTR_BYTES = ICE_MDID_RX_HASH_HIGH, + ICE_MDID_ACL_CNTR_PKT_BYTES }; +/* for ice_32byte_rx_flex_desc.mir_id_umb_cast member */ +#define ICE_RX_FLEX_DESC_MIRROR_M (0x3F) /* 6-bits */ + /* Rx/Tx Flag64 packet flag bits */ enum ice_flg64_bits { ICE_FLG_PKT_DSI = 0, - ICE_FLG_EVLAN_x8100 = 15, + /* If there is a 1 in this bit position then that means Rx packet */ + ICE_FLG_PKT_DIR = 4, + ICE_FLG_EVLAN_x8100 = 14, ICE_FLG_EVLAN_x9100, ICE_FLG_VLAN_x8100, ICE_FLG_TNL_MAC = 22, @@ -227,12 +733,50 @@ enum ice_flg64_bits { ICE_FLG_RSVD = 63 }; +enum ice_rx_flex_desc_umb_cast_bits { /* field is 2 bits long */ + ICE_RX_FLEX_DESC_UMB_CAST_S = 6, + ICE_RX_FLEX_DESC_UMB_CAST_LAST /* this entry must be last!!! 
*/ +}; + +enum ice_umbcast_dest_addr_types { + ICE_DEST_UNICAST = 0, + ICE_DEST_MULTICAST, + ICE_DEST_BROADCAST, + ICE_DEST_MIRRORED, +}; + /* for ice_32byte_rx_flex_desc.ptype_flexi_flags0 member */ #define ICE_RX_FLEX_DESC_PTYPE_M (0x3FF) /* 10-bits */ +enum ice_rx_flex_desc_flexi_flags0_bits { /* field is 6 bits long */ + ICE_RX_FLEX_DESC_FLEXI_FLAGS0_S = 10, + ICE_RX_FLEX_DESC_FLEXI_FLAGS0_LAST /* this entry must be last!!! */ +}; + /* for ice_32byte_rx_flex_desc.pkt_length member */ #define ICE_RX_FLX_DESC_PKT_LEN_M (0x3FFF) /* 14-bits */ +/* for ice_32byte_rx_flex_desc.header_length_sph_flexi_flags1 member */ +#define ICE_RX_FLEX_DESC_HEADER_LEN_M (0x7FF) /* 11-bits */ + +enum ice_rx_flex_desc_sph_bits { /* field is 1 bit long */ + ICE_RX_FLEX_DESC_SPH_S = 11, + ICE_RX_FLEX_DESC_SPH_LAST /* this entry must be last!!! */ +}; + +enum ice_rx_flex_desc_flexi_flags1_bits { /* field is 4 bits long */ + ICE_RX_FLEX_DESC_FLEXI_FLAGS1_S = 12, + ICE_RX_FLEX_DESC_FLEXI_FLAGS1_LAST /* this entry must be last!!! */ +}; + +enum ice_rx_flex_desc_ext_status_bits { /* field is 4 bits long */ + ICE_RX_FLEX_DESC_EXT_STATUS_EXT_UDP_S = 12, + ICE_RX_FLEX_DESC_EXT_STATUS_INT_UDP_S = 13, + ICE_RX_FLEX_DESC_EXT_STATUS_RECIPE_S = 14, + ICE_RX_FLEX_DESC_EXT_STATUS_OVERSIZE_S = 15, + ICE_RX_FLEX_DESC_EXT_STATUS_LAST /* entry must be last!!! */ +}; + enum ice_rx_flex_desc_status_error_0_bits { /* Note: These are predefined bit offsets */ ICE_RX_FLEX_DESC_STATUS0_DD_S = 0, @@ -254,6 +798,29 @@ enum ice_rx_flex_desc_status_error_0_bits { ICE_RX_FLEX_DESC_STATUS0_LAST /* this entry must be last!!! */ }; +enum ice_rx_flex_desc_status_error_1_bits { + /* Note: These are predefined bit offsets */ + ICE_RX_FLEX_DESC_STATUS1_CPM_S = 0, /* 4 bits */ + ICE_RX_FLEX_DESC_STATUS1_NAT_S = 4, + ICE_RX_FLEX_DESC_STATUS1_CRYPTO_S = 5, + /* [10:6] reserved */ + ICE_RX_FLEX_DESC_STATUS1_L2TAG2P_S = 11, + ICE_RX_FLEX_DESC_STATUS1_XTRMD2_VALID_S = 12, + ICE_RX_FLEX_DESC_STATUS1_XTRMD3_VALID_S = 13, + ICE_RX_FLEX_DESC_STATUS1_XTRMD4_VALID_S = 14, + ICE_RX_FLEX_DESC_STATUS1_XTRMD5_VALID_S = 15, + ICE_RX_FLEX_DESC_STATUS1_LAST /* this entry must be last!!! 
*/ +}; + +enum ice_rx_flex_desc_exstat_bits { + /* Note: These are predefined bit offsets */ + ICE_RX_FLEX_DESC_EXSTAT_EXTUDP_S = 0, + ICE_RX_FLEX_DESC_EXSTAT_INTUDP_S = 1, + ICE_RX_FLEX_DESC_EXSTAT_RECIPE_S = 2, + ICE_RX_FLEX_DESC_EXSTAT_OVERSIZE_S = 3, +}; + + #define ICE_RXQ_CTX_SIZE_DWORDS 8 #define ICE_RXQ_CTX_SZ (ICE_RXQ_CTX_SIZE_DWORDS * sizeof(u32)) #define ICE_TX_CMPLTNQ_CTX_SIZE_DWORDS 22 @@ -302,7 +869,7 @@ struct ice_ctx_ele { #define ICE_CTX_STORE(_struct, _ele, _width, _lsb) { \ .offset = offsetof(struct _struct, _ele), \ - .size_of = FIELD_SIZEOF(struct _struct, _ele), \ + .size_of = sizeof_field(struct _struct, _ele), \ .width = _width, \ .lsb = _lsb, \ } @@ -329,9 +896,15 @@ struct ice_tx_desc { __le64 cmd_type_offset_bsz; }; +#define ICE_TXD_QW1_DTYPE_S 0 +#define ICE_TXD_QW1_DTYPE_M (0xFUL << ICE_TXD_QW1_DTYPE_S) + enum ice_tx_desc_dtype_value { ICE_TX_DESC_DTYPE_DATA = 0x0, ICE_TX_DESC_DTYPE_CTX = 0x1, + ICE_TX_DESC_DTYPE_IPSEC = 0x3, + ICE_TX_DESC_DTYPE_FLTR_PROG = 0x8, + ICE_TX_DESC_DTYPE_HLP_META = 0x9, /* DESC_DONE - HW has completed write-back of descriptor */ ICE_TX_DESC_DTYPE_DESC_DONE = 0xF, }; @@ -342,13 +915,20 @@ enum ice_tx_desc_dtype_value { enum ice_tx_desc_cmd_bits { ICE_TX_DESC_CMD_EOP = 0x0001, ICE_TX_DESC_CMD_RS = 0x0002, + ICE_TX_DESC_CMD_RSVD = 0x0004, ICE_TX_DESC_CMD_IL2TAG1 = 0x0008, + ICE_TX_DESC_CMD_DUMMY = 0x0010, + ICE_TX_DESC_CMD_IIPT_NONIP = 0x0000, ICE_TX_DESC_CMD_IIPT_IPV6 = 0x0020, ICE_TX_DESC_CMD_IIPT_IPV4 = 0x0040, ICE_TX_DESC_CMD_IIPT_IPV4_CSUM = 0x0060, + ICE_TX_DESC_CMD_RSVD2 = 0x0080, + ICE_TX_DESC_CMD_L4T_EOFT_UNK = 0x0000, ICE_TX_DESC_CMD_L4T_EOFT_TCP = 0x0100, ICE_TX_DESC_CMD_L4T_EOFT_SCTP = 0x0200, ICE_TX_DESC_CMD_L4T_EOFT_UDP = 0x0300, + ICE_TX_DESC_CMD_RE = 0x0400, + ICE_TX_DESC_CMD_RSVD3 = 0x0800, }; #define ICE_TXD_QW1_OFFSET_S 16 @@ -374,7 +954,10 @@ enum ice_tx_desc_len_fields { ICE_TX_DESC_LEN_L4_LEN_S) * ICE_BYTES_PER_DWORD) #define ICE_TXD_QW1_TX_BUF_SZ_S 34 +#define ICE_TXD_QW1_TX_BUF_SZ_M (0x3FFFULL << ICE_TXD_QW1_TX_BUF_SZ_S) + #define ICE_TXD_QW1_L2TAG1_S 48 +#define ICE_TXD_QW1_L2TAG1_M (0xFFFFULL << ICE_TXD_QW1_L2TAG1_S) /* Context descriptors */ struct ice_tx_ctx_desc { @@ -384,14 +967,29 @@ struct ice_tx_ctx_desc { __le64 qw1; }; +#define ICE_TXD_CTX_QW1_DTYPE_S 0 +#define ICE_TXD_CTX_QW1_DTYPE_M (0xFUL << ICE_TXD_CTX_QW1_DTYPE_S) + #define ICE_TXD_CTX_QW1_CMD_S 4 #define ICE_TXD_CTX_QW1_CMD_M (0x7FUL << ICE_TXD_CTX_QW1_CMD_S) +#define ICE_TXD_CTX_QW1_IPSEC_S 11 +#define ICE_TXD_CTX_QW1_IPSEC_M (0x7FUL << ICE_TXD_CTX_QW1_IPSEC_S) + #define ICE_TXD_CTX_QW1_TSO_LEN_S 30 #define ICE_TXD_CTX_QW1_TSO_LEN_M \ (0x3FFFFULL << ICE_TXD_CTX_QW1_TSO_LEN_S) +#define ICE_TXD_CTX_QW1_TSYN_S ICE_TXD_CTX_QW1_TSO_LEN_S +#define ICE_TXD_CTX_QW1_TSYN_M ICE_TXD_CTX_QW1_TSO_LEN_M + #define ICE_TXD_CTX_QW1_MSS_S 50 +#define ICE_TXD_CTX_QW1_MSS_M (0x3FFFULL << ICE_TXD_CTX_QW1_MSS_S) +#define ICE_TXD_CTX_MIN_MSS 64 +#define ICE_TXD_CTX_MAX_MSS 9668 + +#define ICE_TXD_CTX_QW1_VSI_S 50 +#define ICE_TXD_CTX_QW1_VSI_M (0x3FFULL << ICE_TXD_CTX_QW1_VSI_S) enum ice_tx_ctx_desc_cmd_bits { ICE_TX_CTX_DESC_TSO = 0x01, @@ -405,6 +1003,40 @@ enum ice_tx_ctx_desc_cmd_bits { ICE_TX_CTX_DESC_RESERVED = 0x40 }; +enum ice_tx_ctx_desc_eipt_offload { + ICE_TX_CTX_EIPT_NONE = 0x0, + ICE_TX_CTX_EIPT_IPV6 = 0x1, + ICE_TX_CTX_EIPT_IPV4_NO_CSUM = 0x2, + ICE_TX_CTX_EIPT_IPV4 = 0x3 +}; + +#define ICE_TXD_CTX_QW0_EIPT_S 0 +#define ICE_TXD_CTX_QW0_EIPT_M (0x3ULL << ICE_TXD_CTX_QW0_EIPT_S) + +#define ICE_TXD_CTX_QW0_EIPLEN_S 2 +#define ICE_TXD_CTX_QW0_EIPLEN_M 
(0x7FUL << ICE_TXD_CTX_QW0_EIPLEN_S) + +#define ICE_TXD_CTX_QW0_L4TUNT_S 9 +#define ICE_TXD_CTX_QW0_L4TUNT_M (0x3ULL << ICE_TXD_CTX_QW0_L4TUNT_S) + +#define ICE_TXD_CTX_UDP_TUNNELING BIT_ULL(ICE_TXD_CTX_QW0_L4TUNT_S) +#define ICE_TXD_CTX_GRE_TUNNELING (0x2ULL << ICE_TXD_CTX_QW0_L4TUNT_S) + +#define ICE_TXD_CTX_QW0_EIP_NOINC_S 11 +#define ICE_TXD_CTX_QW0_EIP_NOINC_M BIT_ULL(ICE_TXD_CTX_QW0_EIP_NOINC_S) + +#define ICE_TXD_CTX_EIP_NOINC_IPID_CONST ICE_TXD_CTX_QW0_EIP_NOINC_M + +#define ICE_TXD_CTX_QW0_NATLEN_S 12 +#define ICE_TXD_CTX_QW0_NATLEN_M (0X7FULL << ICE_TXD_CTX_QW0_NATLEN_S) + +#define ICE_TXD_CTX_QW0_DECTTL_S 19 +#define ICE_TXD_CTX_QW0_DECTTL_M (0xFULL << ICE_TXD_CTX_QW0_DECTTL_S) + +#define ICE_TXD_CTX_QW0_L4T_CS_S 23 +#define ICE_TXD_CTX_QW0_L4T_CS_M BIT_ULL(ICE_TXD_CTX_QW0_L4T_CS_S) + + #define ICE_LAN_TXQ_MAX_QGRPS 127 #define ICE_LAN_TXQ_MAX_QDIS 1023 @@ -447,12 +1079,92 @@ struct ice_tlan_ctx { u8 drop_ena; u8 cache_prof_idx; u8 pkt_shaper_prof_idx; - u8 int_q_state; /* width not needed - internal do not write */ + u8 int_q_state; /* width not needed - internal - DO NOT WRITE!!! */ }; -/* macro to make the table lines short */ +/* LAN Tx Completion Queue data */ +struct ice_tx_cmpltnq { + u16 txq_id; + u8 generation; + u16 tx_head; + u8 cmpl_type; +} __packed; + + +/* LAN Tx Completion Queue Context */ +struct ice_tx_cmpltnq_ctx { + u64 base; +#define ICE_TX_CMPLTNQ_CTX_BASE_S 7 + u32 q_len; +#define ICE_TX_CMPLTNQ_CTX_Q_LEN_S 4 + u8 generation; + u32 wrt_ptr; + u8 pf_num; + u16 vmvf_num; + u8 vmvf_type; +#define ICE_TX_CMPLTNQ_CTX_VMVF_TYPE_VF 0 +#define ICE_TX_CMPLTNQ_CTX_VMVF_TYPE_VMQ 1 +#define ICE_TX_CMPLTNQ_CTX_VMVF_TYPE_PF 2 + u8 tph_desc_wr; + u8 cpuid; + u32 cmpltn_cache[16]; +} __packed; + +/* LAN Tx Doorbell Descriptor Format */ +struct ice_tx_drbell_fmt { + u16 txq_id; + u8 dd; + u8 rs; + u32 db; +}; + + +/* LAN Tx Doorbell Queue Context */ +struct ice_tx_drbell_q_ctx { + u64 base; +#define ICE_TX_DRBELL_Q_CTX_BASE_S 7 + u16 ring_len; +#define ICE_TX_DRBELL_Q_CTX_RING_LEN_S 4 + u8 pf_num; + u16 vf_num; + u8 vmvf_type; +#define ICE_TX_DRBELL_Q_CTX_VMVF_TYPE_VF 0 +#define ICE_TX_DRBELL_Q_CTX_VMVF_TYPE_VMQ 1 +#define ICE_TX_DRBELL_Q_CTX_VMVF_TYPE_PF 2 + u8 cpuid; + u8 tph_desc_rd; + u8 tph_desc_wr; + u8 db_q_en; + u16 rd_head; + u16 rd_tail; +} __packed; + +/* The ice_ptype_lkup table is used to convert from the 10-bit ptype in the + * hardware to a bit-field that can be used by SW to more easily determine the + * packet type. + * + * Macros are used to shorten the table lines and make this table human + * readable. + * + * We store the PTYPE in the top byte of the bit field - this is just so that + * we can check that the table doesn't have a row missing, as the index into + * the table should be the PTYPE. 
+ * + * Typical work flow: + * + * IF NOT ice_ptype_lkup[ptype].known + * THEN + * Packet is unknown + * ELSE IF ice_ptype_lkup[ptype].outer_ip == ICE_RX_PTYPE_OUTER_IP + * Use the rest of the fields to look at the tunnels, inner protocols, etc + * ELSE + * Use the enum ice_rx_l2_ptype to decode the packet type + * ENDIF + */ + +/* macro to make the table lines short, use explicit indexing with [PTYPE] */ #define ICE_PTT(PTYPE, OUTER_IP, OUTER_IP_VER, OUTER_FRAG, T, TE, TEF, I, PL)\ - { PTYPE, \ + [PTYPE] = { \ 1, \ ICE_RX_PTYPE_OUTER_##OUTER_IP, \ ICE_RX_PTYPE_OUTER_##OUTER_IP_VER, \ @@ -463,17 +1175,220 @@ struct ice_tlan_ctx { ICE_RX_PTYPE_INNER_PROT_##I, \ ICE_RX_PTYPE_PAYLOAD_LAYER_##PL } -#define ICE_PTT_UNUSED_ENTRY(PTYPE) { PTYPE, 0, 0, 0, 0, 0, 0, 0, 0, 0 } +#define ICE_PTT_UNUSED_ENTRY(PTYPE) [PTYPE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* shorter macros makes the table fit but are terse */ #define ICE_RX_PTYPE_NOF ICE_RX_PTYPE_NOT_FRAG +#define ICE_RX_PTYPE_FRG ICE_RX_PTYPE_FRAG +#define ICE_RX_PTYPE_INNER_PROT_TS ICE_RX_PTYPE_INNER_PROT_TIMESYNC -/* Lookup table mapping the HW PTYPE to the bit field for decoding */ -static const struct ice_rx_ptype_decoded ice_ptype_lkup[] = { +/* Lookup table mapping the 10-bit HW PTYPE to the bit field for decoding */ +static const struct ice_rx_ptype_decoded ice_ptype_lkup[1024] = { /* L2 Packet types */ ICE_PTT_UNUSED_ENTRY(0), ICE_PTT(1, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - ICE_PTT(2, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), + ICE_PTT_UNUSED_ENTRY(2), + ICE_PTT_UNUSED_ENTRY(3), + ICE_PTT_UNUSED_ENTRY(4), + ICE_PTT_UNUSED_ENTRY(5), + ICE_PTT(6, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), + ICE_PTT(7, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), + ICE_PTT_UNUSED_ENTRY(8), + ICE_PTT_UNUSED_ENTRY(9), + ICE_PTT(10, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), + ICE_PTT(11, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), + ICE_PTT_UNUSED_ENTRY(12), + ICE_PTT_UNUSED_ENTRY(13), + ICE_PTT_UNUSED_ENTRY(14), + ICE_PTT_UNUSED_ENTRY(15), + ICE_PTT_UNUSED_ENTRY(16), + ICE_PTT_UNUSED_ENTRY(17), + ICE_PTT_UNUSED_ENTRY(18), + ICE_PTT_UNUSED_ENTRY(19), + ICE_PTT_UNUSED_ENTRY(20), + ICE_PTT_UNUSED_ENTRY(21), + + /* Non Tunneled IPv4 */ + ICE_PTT(22, IP, IPV4, FRG, NONE, NONE, NOF, NONE, PAY3), + ICE_PTT(23, IP, IPV4, NOF, NONE, NONE, NOF, NONE, PAY3), + ICE_PTT(24, IP, IPV4, NOF, NONE, NONE, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(25), + ICE_PTT(26, IP, IPV4, NOF, NONE, NONE, NOF, TCP, PAY4), + ICE_PTT(27, IP, IPV4, NOF, NONE, NONE, NOF, SCTP, PAY4), + ICE_PTT(28, IP, IPV4, NOF, NONE, NONE, NOF, ICMP, PAY4), + + /* IPv4 --> IPv4 */ + ICE_PTT(29, IP, IPV4, NOF, IP_IP, IPV4, FRG, NONE, PAY3), + ICE_PTT(30, IP, IPV4, NOF, IP_IP, IPV4, NOF, NONE, PAY3), + ICE_PTT(31, IP, IPV4, NOF, IP_IP, IPV4, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(32), + ICE_PTT(33, IP, IPV4, NOF, IP_IP, IPV4, NOF, TCP, PAY4), + ICE_PTT(34, IP, IPV4, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), + ICE_PTT(35, IP, IPV4, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), + + /* IPv4 --> IPv6 */ + ICE_PTT(36, IP, IPV4, NOF, IP_IP, IPV6, FRG, NONE, PAY3), + ICE_PTT(37, IP, IPV4, NOF, IP_IP, IPV6, NOF, NONE, PAY3), + ICE_PTT(38, IP, IPV4, NOF, IP_IP, IPV6, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(39), + ICE_PTT(40, IP, IPV4, NOF, IP_IP, IPV6, NOF, TCP, PAY4), + ICE_PTT(41, IP, IPV4, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), + ICE_PTT(42, IP, IPV4, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT */ + ICE_PTT(43, IP, IPV4, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), + + /* IPv4 --> GRE/NAT --> IPv4 */ + ICE_PTT(44, 
IP, IPV4, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), + ICE_PTT(45, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), + ICE_PTT(46, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(47), + ICE_PTT(48, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), + ICE_PTT(49, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), + ICE_PTT(50, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT --> IPv6 */ + ICE_PTT(51, IP, IPV4, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), + ICE_PTT(52, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), + ICE_PTT(53, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(54), + ICE_PTT(55, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), + ICE_PTT(56, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), + ICE_PTT(57, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT --> MAC */ + ICE_PTT(58, IP, IPV4, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), + + /* IPv4 --> GRE/NAT --> MAC --> IPv4 */ + ICE_PTT(59, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), + ICE_PTT(60, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), + ICE_PTT(61, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(62), + ICE_PTT(63, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), + ICE_PTT(64, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), + ICE_PTT(65, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT -> MAC --> IPv6 */ + ICE_PTT(66, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), + ICE_PTT(67, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), + ICE_PTT(68, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(69), + ICE_PTT(70, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), + ICE_PTT(71, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), + ICE_PTT(72, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT --> MAC/VLAN */ + ICE_PTT(73, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), + + /* IPv4 ---> GRE/NAT -> MAC/VLAN --> IPv4 */ + ICE_PTT(74, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), + ICE_PTT(75, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), + ICE_PTT(76, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(77), + ICE_PTT(78, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), + ICE_PTT(79, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), + ICE_PTT(80, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), + + /* IPv4 -> GRE/NAT -> MAC/VLAN --> IPv6 */ + ICE_PTT(81, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), + ICE_PTT(82, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), + ICE_PTT(83, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(84), + ICE_PTT(85, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), + ICE_PTT(86, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), + ICE_PTT(87, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), + + /* Non Tunneled IPv6 */ + ICE_PTT(88, IP, IPV6, FRG, NONE, NONE, NOF, NONE, PAY3), + ICE_PTT(89, IP, IPV6, NOF, NONE, NONE, NOF, NONE, PAY3), + ICE_PTT(90, IP, IPV6, NOF, NONE, NONE, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(91), + ICE_PTT(92, IP, IPV6, NOF, NONE, NONE, NOF, TCP, PAY4), + ICE_PTT(93, IP, IPV6, NOF, NONE, NONE, NOF, SCTP, PAY4), + ICE_PTT(94, IP, IPV6, NOF, NONE, NONE, NOF, ICMP, PAY4), + + /* IPv6 --> IPv4 */ + ICE_PTT(95, IP, IPV6, NOF, IP_IP, IPV4, FRG, NONE, PAY3), + ICE_PTT(96, IP, IPV6, 
NOF, IP_IP, IPV4, NOF, NONE, PAY3), + ICE_PTT(97, IP, IPV6, NOF, IP_IP, IPV4, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(98), + ICE_PTT(99, IP, IPV6, NOF, IP_IP, IPV4, NOF, TCP, PAY4), + ICE_PTT(100, IP, IPV6, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), + ICE_PTT(101, IP, IPV6, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> IPv6 */ + ICE_PTT(102, IP, IPV6, NOF, IP_IP, IPV6, FRG, NONE, PAY3), + ICE_PTT(103, IP, IPV6, NOF, IP_IP, IPV6, NOF, NONE, PAY3), + ICE_PTT(104, IP, IPV6, NOF, IP_IP, IPV6, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(105), + ICE_PTT(106, IP, IPV6, NOF, IP_IP, IPV6, NOF, TCP, PAY4), + ICE_PTT(107, IP, IPV6, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), + ICE_PTT(108, IP, IPV6, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT */ + ICE_PTT(109, IP, IPV6, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), + + /* IPv6 --> GRE/NAT -> IPv4 */ + ICE_PTT(110, IP, IPV6, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), + ICE_PTT(111, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), + ICE_PTT(112, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(113), + ICE_PTT(114, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), + ICE_PTT(115, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), + ICE_PTT(116, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> IPv6 */ + ICE_PTT(117, IP, IPV6, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), + ICE_PTT(118, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), + ICE_PTT(119, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(120), + ICE_PTT(121, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), + ICE_PTT(122, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), + ICE_PTT(123, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC */ + ICE_PTT(124, IP, IPV6, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), + + /* IPv6 --> GRE/NAT -> MAC -> IPv4 */ + ICE_PTT(125, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), + ICE_PTT(126, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), + ICE_PTT(127, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(128), + ICE_PTT(129, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), + ICE_PTT(130, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), + ICE_PTT(131, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC -> IPv6 */ + ICE_PTT(132, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), + ICE_PTT(133, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), + ICE_PTT(134, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(135), + ICE_PTT(136, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), + ICE_PTT(137, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), + ICE_PTT(138, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC/VLAN */ + ICE_PTT(139, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), + + /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv4 */ + ICE_PTT(140, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), + ICE_PTT(141, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), + ICE_PTT(142, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(143), + ICE_PTT(144, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), + ICE_PTT(145, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), + ICE_PTT(146, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv6 */ + ICE_PTT(147, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), + ICE_PTT(148, 
IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), + ICE_PTT(149, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(150), + ICE_PTT(151, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), + ICE_PTT(152, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), + ICE_PTT(153, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), + + [154 ... 1023] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 } }; static inline struct ice_rx_ptype_decoded ice_decode_rx_desc_ptype(u16 ptype) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index cc755382df25..820dcab6d774 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -1,246 +1,64 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #include "ice.h" +#include "ice_base.h" #include "ice_lib.h" +#include "ice_fltr.h" #include "ice_dcb_lib.h" +#include "ice_devlink.h" +#include "ice_vsi_vlan_ops.h" /** - * ice_setup_rx_ctx - Configure a receive ring context - * @ring: The Rx ring to configure - * - * Configure the Rx descriptor ring in RLAN context. - */ -static int ice_setup_rx_ctx(struct ice_ring *ring) -{ - struct ice_vsi *vsi = ring->vsi; - struct ice_hw *hw = &vsi->back->hw; - u32 rxdid = ICE_RXDID_FLEX_NIC; - struct ice_rlan_ctx rlan_ctx; - u32 regval; - u16 pf_q; - int err; - - /* what is Rx queue number in global space of 2K Rx queues */ - pf_q = vsi->rxq_map[ring->q_index]; - - /* clear the context structure first */ - memset(&rlan_ctx, 0, sizeof(rlan_ctx)); - - rlan_ctx.base = ring->dma >> 7; - - rlan_ctx.qlen = ring->count; - - /* Receive Packet Data Buffer Size. - * The Packet Data Buffer Size is defined in 128 byte units. - */ - rlan_ctx.dbuf = vsi->rx_buf_len >> ICE_RLAN_CTX_DBUF_S; - - /* use 32 byte descriptors */ - rlan_ctx.dsize = 1; - - /* Strip the Ethernet CRC bytes before the packet is posted to host - * memory. - */ - rlan_ctx.crcstrip = 1; - - /* L2TSEL flag defines the reported L2 Tags in the receive descriptor */ - rlan_ctx.l2tsel = 1; - - rlan_ctx.dtype = ICE_RX_DTYPE_NO_SPLIT; - rlan_ctx.hsplit_0 = ICE_RLAN_RX_HSPLIT_0_NO_SPLIT; - rlan_ctx.hsplit_1 = ICE_RLAN_RX_HSPLIT_1_NO_SPLIT; - - /* This controls whether VLAN is stripped from inner headers - * The VLAN in the inner L2 header is stripped to the receive - * descriptor if enabled by this flag. 
- */ - rlan_ctx.showiv = 0; - - /* Max packet size for this queue - must not be set to a larger value - * than 5 x DBUF - */ - rlan_ctx.rxmax = min_t(u16, vsi->max_frame, - ICE_MAX_CHAINED_RX_BUFS * vsi->rx_buf_len); - - /* Rx queue threshold in units of 64 */ - rlan_ctx.lrxqthresh = 1; - - /* Enable Flexible Descriptors in the queue context which - * allows this driver to select a specific receive descriptor format - */ - if (vsi->type != ICE_VSI_VF) { - regval = rd32(hw, QRXFLXP_CNTXT(pf_q)); - regval |= (rxdid << QRXFLXP_CNTXT_RXDID_IDX_S) & - QRXFLXP_CNTXT_RXDID_IDX_M; - - /* increasing context priority to pick up profile ID; - * default is 0x01; setting to 0x03 to ensure profile - * is programming if prev context is of same priority - */ - regval |= (0x03 << QRXFLXP_CNTXT_RXDID_PRIO_S) & - QRXFLXP_CNTXT_RXDID_PRIO_M; - - wr32(hw, QRXFLXP_CNTXT(pf_q), regval); - } - - /* Absolute queue number out of 2K needs to be passed */ - err = ice_write_rxq_ctx(hw, &rlan_ctx, pf_q); - if (err) { - dev_err(&vsi->back->pdev->dev, - "Failed to set LAN Rx queue context for absolute Rx queue %d error: %d\n", - pf_q, err); - return -EIO; - } - - if (vsi->type == ICE_VSI_VF) - return 0; - - /* init queue specific tail register */ - ring->tail = hw->hw_addr + QRX_TAIL(pf_q); - writel(0, ring->tail); - ice_alloc_rx_bufs(ring, ICE_DESC_UNUSED(ring)); - - return 0; -} - -/** - * ice_setup_tx_ctx - setup a struct ice_tlan_ctx instance - * @ring: The Tx ring to configure - * @tlan_ctx: Pointer to the Tx LAN queue context structure to be initialized - * @pf_q: queue index in the PF space - * - * Configure the Tx descriptor ring in TLAN context. + * ice_vsi_type_str - maps VSI type enum to string equivalents + * @vsi_type: VSI type enum */ -static void -ice_setup_tx_ctx(struct ice_ring *ring, struct ice_tlan_ctx *tlan_ctx, u16 pf_q) +const char *ice_vsi_type_str(enum ice_vsi_type vsi_type) { - struct ice_vsi *vsi = ring->vsi; - struct ice_hw *hw = &vsi->back->hw; - - tlan_ctx->base = ring->dma >> ICE_TLAN_CTX_BASE_S; - - tlan_ctx->port_num = vsi->port_info->lport; - - /* Transmit Queue Length */ - tlan_ctx->qlen = ring->count; - - ice_set_cgd_num(tlan_ctx, ring); - - /* PF number */ - tlan_ctx->pf_num = hw->pf_id; - - /* queue belongs to a specific VSI type - * VF / VM index should be programmed per vmvf_type setting: - * for vmvf_type = VF, it is VF number between 0-256 - * for vmvf_type = VM, it is VM number between 0-767 - * for PF or EMP this field should be set to zero - */ - switch (vsi->type) { - case ICE_VSI_LB: - /* fall through */ + switch (vsi_type) { case ICE_VSI_PF: - tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_PF; - break; + return "ICE_VSI_PF"; case ICE_VSI_VF: - /* Firmware expects vmvf_num to be absolute VF ID */ - tlan_ctx->vmvf_num = hw->func_caps.vf_base_id + vsi->vf_id; - tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VF; - break; + return "ICE_VSI_VF"; + case ICE_VSI_VMDQ2: + return "ICE_VSI_VMDQ2"; + case ICE_VSI_CTRL: + return "ICE_VSI_CTRL"; + case ICE_VSI_CHNL: + return "ICE_VSI_CHNL"; + case ICE_VSI_OFFLOAD_MACVLAN: + return "ICE_VSI_OFFLOAD_MACVLAN"; + case ICE_VSI_LB: + return "ICE_VSI_LB"; + case ICE_VSI_SWITCHDEV_CTRL: + return "ICE_VSI_SWITCHDEV_CTRL"; default: - return; - } - - /* make sure the context is associated with the right VSI */ - tlan_ctx->src_vsi = ice_get_hw_vsi_num(hw, vsi->idx); - - tlan_ctx->tso_ena = ICE_TX_LEGACY; - tlan_ctx->tso_qnum = pf_q; - - /* Legacy or Advanced Host Interface: - * 0: Advanced Host Interface - * 1: Legacy Host Interface - */ - 
tlan_ctx->legacy_int = ICE_TX_LEGACY; -} - -/** - * ice_pf_rxq_wait - Wait for a PF's Rx queue to be enabled or disabled - * @pf: the PF being configured - * @pf_q: the PF queue - * @ena: enable or disable state of the queue - * - * This routine will wait for the given Rx queue of the PF to reach the - * enabled or disabled state. - * Returns -ETIMEDOUT in case of failing to reach the requested state after - * multiple retries; else will return 0 in case of success. - */ -static int ice_pf_rxq_wait(struct ice_pf *pf, int pf_q, bool ena) -{ - int i; - - for (i = 0; i < ICE_Q_WAIT_MAX_RETRY; i++) { - if (ena == !!(rd32(&pf->hw, QRX_CTRL(pf_q)) & - QRX_CTRL_QENA_STAT_M)) - return 0; - - usleep_range(20, 40); + return "unknown"; } - - return -ETIMEDOUT; } /** - * ice_vsi_ctrl_rx_ring - Start or stop a VSI's Rx ring + * ice_vsi_ctrl_all_rx_rings - Start or stop a VSI's Rx rings * @vsi: the VSI being configured * @ena: start or stop the Rx rings - * @rxq_idx: Rx queue index + * + * First enable/disable all of the Rx rings, flush any remaining writes, and + * then verify that they have all been enabled/disabled successfully. This will + * let all of the register writes complete when enabling/disabling the Rx rings + * before waiting for the change in hardware to complete. */ -#ifndef CONFIG_PCI_IOV -static -#endif /* !CONFIG_PCI_IOV */ -int ice_vsi_ctrl_rx_ring(struct ice_vsi *vsi, bool ena, u16 rxq_idx) +static int ice_vsi_ctrl_all_rx_rings(struct ice_vsi *vsi, bool ena) { - int pf_q = vsi->rxq_map[rxq_idx]; - struct ice_pf *pf = vsi->back; - struct ice_hw *hw = &pf->hw; int ret = 0; - u32 rx_reg; - - rx_reg = rd32(hw, QRX_CTRL(pf_q)); - - /* Skip if the queue is already in the requested state */ - if (ena == !!(rx_reg & QRX_CTRL_QENA_STAT_M)) - return 0; - - /* turn on/off the queue */ - if (ena) - rx_reg |= QRX_CTRL_QENA_REQ_M; - else - rx_reg &= ~QRX_CTRL_QENA_REQ_M; - wr32(hw, QRX_CTRL(pf_q), rx_reg); - - /* wait for the change to finish */ - ret = ice_pf_rxq_wait(pf, pf_q, ena); - if (ret) - dev_err(&pf->pdev->dev, - "VSI idx %d Rx ring %d %sable timeout\n", - vsi->idx, pf_q, (ena ? 
"en" : "dis")); + u16 i; - return ret; -} + for (i = 0; i < vsi->num_rxq; i++) + ice_vsi_ctrl_one_rx_ring(vsi, ena, i, false); -/** - * ice_vsi_ctrl_rx_rings - Start or stop a VSI's Rx rings - * @vsi: the VSI being configured - * @ena: start or stop the Rx rings - */ -static int ice_vsi_ctrl_rx_rings(struct ice_vsi *vsi, bool ena) -{ - int i, ret = 0; + ice_flush(&vsi->back->hw); for (i = 0; i < vsi->num_rxq; i++) { - ret = ice_vsi_ctrl_rx_ring(vsi, ena, i); + ret = ice_vsi_wait_one_rx_ring(vsi, ena, i); if (ret) break; } @@ -258,36 +76,46 @@ static int ice_vsi_ctrl_rx_rings(struct ice_vsi *vsi, bool ena) static int ice_vsi_alloc_arrays(struct ice_vsi *vsi) { struct ice_pf *pf = vsi->back; + struct device *dev; + + dev = ice_pf_to_dev(pf); + if (vsi->type == ICE_VSI_CHNL) + return 0; /* allocate memory for both Tx and Rx ring pointers */ - vsi->tx_rings = devm_kcalloc(&pf->pdev->dev, vsi->alloc_txq, + vsi->tx_rings = devm_kcalloc(dev, vsi->alloc_txq, sizeof(*vsi->tx_rings), GFP_KERNEL); if (!vsi->tx_rings) return -ENOMEM; - vsi->rx_rings = devm_kcalloc(&pf->pdev->dev, vsi->alloc_rxq, + vsi->rx_rings = devm_kcalloc(dev, vsi->alloc_rxq, sizeof(*vsi->rx_rings), GFP_KERNEL); if (!vsi->rx_rings) goto err_rings; - vsi->txq_map = devm_kcalloc(&pf->pdev->dev, vsi->alloc_txq, +#ifdef HAVE_XDP_SUPPORT + /* XDP will have vsi->alloc_txq Tx queues as well, so double the size */ + vsi->txq_map = devm_kcalloc(dev, (2 * vsi->alloc_txq), + sizeof(*vsi->txq_map), GFP_KERNEL); +#else + vsi->txq_map = devm_kcalloc(dev, vsi->alloc_txq, sizeof(*vsi->txq_map), GFP_KERNEL); +#endif /* HAVE_XDP_SUPPORT */ if (!vsi->txq_map) goto err_txq_map; - vsi->rxq_map = devm_kcalloc(&pf->pdev->dev, vsi->alloc_rxq, + vsi->rxq_map = devm_kcalloc(dev, vsi->alloc_rxq, sizeof(*vsi->rxq_map), GFP_KERNEL); if (!vsi->rxq_map) goto err_rxq_map; - /* There is no need to allocate q_vectors for a loopback VSI. */ if (vsi->type == ICE_VSI_LB) return 0; /* allocate memory for q_vector pointers */ - vsi->q_vectors = devm_kcalloc(&pf->pdev->dev, vsi->num_q_vectors, + vsi->q_vectors = devm_kcalloc(dev, vsi->num_q_vectors, sizeof(*vsi->q_vectors), GFP_KERNEL); if (!vsi->q_vectors) goto err_vectors; @@ -295,13 +123,13 @@ static int ice_vsi_alloc_arrays(struct ice_vsi *vsi) return 0; err_vectors: - devm_kfree(&pf->pdev->dev, vsi->rxq_map); + devm_kfree(dev, vsi->rxq_map); err_rxq_map: - devm_kfree(&pf->pdev->dev, vsi->txq_map); + devm_kfree(dev, vsi->txq_map); err_txq_map: - devm_kfree(&pf->pdev->dev, vsi->rx_rings); + devm_kfree(dev, vsi->rx_rings); err_rings: - devm_kfree(&pf->pdev->dev, vsi->tx_rings); + devm_kfree(dev, vsi->tx_rings); return -ENOMEM; } @@ -313,14 +141,22 @@ static void ice_vsi_set_num_desc(struct ice_vsi *vsi) { switch (vsi->type) { case ICE_VSI_PF: - /* fall through */ + case ICE_VSI_OFFLOAD_MACVLAN: + case ICE_VSI_VMDQ2: + case ICE_VSI_SWITCHDEV_CTRL: + case ICE_VSI_CTRL: case ICE_VSI_LB: - vsi->num_rx_desc = ICE_DFLT_NUM_RX_DESC; - vsi->num_tx_desc = ICE_DFLT_NUM_TX_DESC; + /* a user could change the values of num_[tr]x_desc using + * ethtool -G so we should keep those values instead of + * overwriting them with the defaults. 
+ */ + if (!vsi->num_rx_desc) + vsi->num_rx_desc = ICE_DFLT_NUM_RX_DESC; + if (!vsi->num_tx_desc) + vsi->num_tx_desc = ICE_DFLT_NUM_TX_DESC; break; default: - dev_dbg(&vsi->back->pdev->dev, - "Not setting number of Tx/Rx descriptors for VSI type %d\n", + dev_dbg(ice_pf_to_dev(vsi->back), "Not setting number of Tx/Rx descriptors for VSI type %d\n", vsi->type); break; } @@ -340,42 +176,92 @@ static void ice_vsi_set_num_qs(struct ice_vsi *vsi, u16 vf_id) if (vsi->type == ICE_VSI_VF) vsi->vf_id = vf_id; - + else + vsi->vf_id = ICE_INVAL_VFID; switch (vsi->type) { case ICE_VSI_PF: - vsi->alloc_txq = min_t(int, ice_get_avail_txq_count(pf), - num_online_cpus()); + /* default to 1 Tx queue per MSI-X to not hurt our performance */ + vsi->alloc_txq = min3(pf->num_lan_msix, + ice_get_avail_txq_count(pf), + (u16)num_online_cpus()); + if (vsi->req_txq) { + vsi->alloc_txq = vsi->req_txq; + vsi->num_txq = vsi->req_txq; + } pf->num_lan_tx = vsi->alloc_txq; /* only 1 Rx queue unless RSS is enabled */ - if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) + if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { vsi->alloc_rxq = 1; - else - vsi->alloc_rxq = min_t(int, ice_get_avail_rxq_count(pf), - num_online_cpus()); + } else { + /* default to 1 Rx queue per MSI-X to not hurt our performance */ + vsi->alloc_rxq = min3(pf->num_lan_msix, + ice_get_avail_rxq_count(pf), + (u16)num_online_cpus()); + if (vsi->req_rxq) { + vsi->alloc_rxq = vsi->req_rxq; + vsi->num_rxq = vsi->req_rxq; + } + } pf->num_lan_rx = vsi->alloc_rxq; - vsi->num_q_vectors = max_t(int, vsi->alloc_rxq, vsi->alloc_txq); + vsi->num_q_vectors = min_t(int, pf->num_lan_msix, + max_t(int, vsi->alloc_rxq, vsi->alloc_txq)); + break; + case ICE_VSI_OFFLOAD_MACVLAN: + case ICE_VSI_VMDQ2: + vsi->alloc_txq = ICE_DFLT_TXQ_VMDQ_VSI; + vsi->alloc_rxq = ICE_DFLT_RXQ_VMDQ_VSI; + vsi->num_q_vectors = ICE_DFLT_VEC_VMDQ_VSI; + break; + case ICE_VSI_SWITCHDEV_CTRL: + /* The number of queues for ctrl vsi is equal to number of VFs. + * Each ring is associated to the corresponding VF_PR netdev. + */ + vsi->alloc_txq = pf->num_alloc_vfs; + vsi->alloc_rxq = pf->num_alloc_vfs; + vsi->num_q_vectors = 1; break; case ICE_VSI_VF: vf = &pf->vf[vsi->vf_id]; - vsi->alloc_txq = vf->num_vf_qs; - vsi->alloc_rxq = vf->num_vf_qs; - /* pf->num_vf_msix includes (VF miscellaneous vector + + /* pf->num_msix_per_vf includes (VF miscellaneous vector + * data queue interrupts). 
Since vsi->num_q_vectors is number * of queues vectors, subtract 1 (ICE_NONQ_VECS_VF) from the * original vector count */ - vsi->num_q_vectors = pf->num_vf_msix - ICE_NONQ_VECS_VF; + if (vf->adq_enabled && vf->num_tc) { + u8 tc = vsi->vf_adq_tc; + + vsi->alloc_txq = vf->ch[tc].num_qps; + vsi->alloc_rxq = vf->ch[tc].num_qps; + vsi->num_q_vectors = vf->ch[tc].num_qps; + } else { + if (vf->num_req_qs) + vf->num_vf_qs = vf->num_req_qs; + + vsi->alloc_txq = vf->num_vf_qs; + vsi->alloc_rxq = vf->num_vf_qs; + vsi->num_q_vectors = pf->num_msix_per_vf - + ICE_NONQ_VECS_VF; + } + break; + case ICE_VSI_CTRL: + vsi->alloc_txq = 1; + vsi->alloc_rxq = 1; + vsi->num_q_vectors = 1; + break; + case ICE_VSI_CHNL: + vsi->alloc_txq = 0; + vsi->alloc_rxq = 0; break; case ICE_VSI_LB: vsi->alloc_txq = 1; vsi->alloc_rxq = 1; break; default: - dev_warn(&pf->pdev->dev, "Unknown VSI type %d\n", vsi->type); + dev_warn(ice_pf_to_dev(pf), "Unknown VSI type %d\n", vsi->type); break; } @@ -383,7 +269,7 @@ static void ice_vsi_set_num_qs(struct ice_vsi *vsi, u16 vf_id) } /** - * ice_get_free_slot - get the next non-NULL location index in array + * ice_get_free_slot - get the next available free slot in array * @array: array to search * @size: size of the array * @curr: last known occupied index to be used as a search hint @@ -421,7 +307,7 @@ void ice_vsi_delete(struct ice_vsi *vsi) struct ice_vsi_ctx *ctxt; enum ice_status status; - ctxt = devm_kzalloc(&pf->pdev->dev, sizeof(*ctxt), GFP_KERNEL); + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); if (!ctxt) return; @@ -433,10 +319,10 @@ void ice_vsi_delete(struct ice_vsi *vsi) status = ice_free_vsi(&pf->hw, vsi->idx, ctxt, false, NULL); if (status) - dev_err(&pf->pdev->dev, "Failed to delete VSI %i in FW\n", - vsi->vsi_num); + dev_err(ice_pf_to_dev(pf), "Failed to delete VSI %i in FW - error: %s\n", + vsi->vsi_num, ice_stat_str(status)); - devm_kfree(&pf->pdev->dev, ctxt); + kfree(ctxt); } /** @@ -446,30 +332,99 @@ void ice_vsi_delete(struct ice_vsi *vsi) static void ice_vsi_free_arrays(struct ice_vsi *vsi) { struct ice_pf *pf = vsi->back; + struct device *dev; + + dev = ice_pf_to_dev(pf); /* free the ring and vector containers */ if (vsi->q_vectors) { - devm_kfree(&pf->pdev->dev, vsi->q_vectors); + devm_kfree(dev, vsi->q_vectors); vsi->q_vectors = NULL; } if (vsi->tx_rings) { - devm_kfree(&pf->pdev->dev, vsi->tx_rings); + devm_kfree(dev, vsi->tx_rings); vsi->tx_rings = NULL; } if (vsi->rx_rings) { - devm_kfree(&pf->pdev->dev, vsi->rx_rings); + devm_kfree(dev, vsi->rx_rings); vsi->rx_rings = NULL; } if (vsi->txq_map) { - devm_kfree(&pf->pdev->dev, vsi->txq_map); + devm_kfree(dev, vsi->txq_map); vsi->txq_map = NULL; } if (vsi->rxq_map) { - devm_kfree(&pf->pdev->dev, vsi->rxq_map); + devm_kfree(dev, vsi->rxq_map); vsi->rxq_map = NULL; } } +static void ice_vsi_free_rss_global_lut_memory(struct ice_vsi *vsi) +{ + if (vsi->global_lut_id) { + devm_kfree(ice_pf_to_dev(vsi->back), vsi->global_lut_id); + vsi->global_lut_id = NULL; + } +} + +/** + * ice_vsi_free_rss_global_lut - free the VSI's global RSS LUT + * @vsi: VSI to free global RSS LUT for + * + * If the VSI didn't allocate a global RSS LUT, then there is nothing to do. The vsi->global_lut_id + * will always be cleared and freed regardless of the result of freeing the global RSS LUT. 
+ */ +static void ice_vsi_free_rss_global_lut(struct ice_vsi *vsi) +{ + enum ice_status status; + + if (!vsi->global_lut_id) + return; + + status = ice_free_rss_global_lut(&vsi->back->hw, *vsi->global_lut_id); + if (status) + dev_dbg(ice_pf_to_dev(vsi->back), + "Failed to free RSS global LUT ID %d for %s %d, status %d\n", + *vsi->global_lut_id, ice_vsi_type_str(vsi->type), vsi->idx, status); + + ice_vsi_free_rss_global_lut_memory(vsi); +} + +/** + * ice_vsi_alloc_rss_global_lut - allocate a global RSS LUT for this VSI + * @vsi: VSI to allocate global RSS LUT for + * + * Allocate a global RSS LUT if the VSI supports it. The caller must take care if allocating the + * global RSS LUT fails since vsi->global_lut_id will be NULL, which means there are no global RSS + * LUT resources available. There might be other causes of failure, but they can all be treated as + * the device having no more global RSS LUT resources available. + */ +static void ice_vsi_alloc_rss_global_lut(struct ice_vsi *vsi) +{ + enum ice_status status; + struct device *dev; + + if (vsi->type != ICE_VSI_VF) + return; + + /* VSI LUT is wide enough for queue groups up to ICE_MAX_SMALL_RS_QS */ + if (vsi->alloc_rxq <= ICE_MAX_SMALL_RSS_QS) + return; + + dev = ice_pf_to_dev(vsi->back); + vsi->global_lut_id = devm_kzalloc(dev, sizeof(vsi->global_lut_id), + GFP_KERNEL); + if (!vsi->global_lut_id) + return; + + status = ice_alloc_rss_global_lut(&vsi->back->hw, false, vsi->global_lut_id); + if (status) { + dev_dbg(dev, "failed to allocate RSS global LUT for %s %d, status %d\n", + ice_vsi_type_str(vsi->type), vsi->idx, status); + ice_vsi_free_rss_global_lut_memory(vsi); + } +} + /** * ice_vsi_clear - clean up and deallocate the provided VSI * @vsi: pointer to VSI being cleared @@ -482,6 +437,7 @@ static void ice_vsi_free_arrays(struct ice_vsi *vsi) int ice_vsi_clear(struct ice_vsi *vsi) { struct ice_pf *pf = NULL; + struct device *dev; if (!vsi) return 0; @@ -490,10 +446,10 @@ int ice_vsi_clear(struct ice_vsi *vsi) return -EINVAL; pf = vsi->back; + dev = ice_pf_to_dev(pf); if (!pf->vsi[vsi->idx] || pf->vsi[vsi->idx] != vsi) { - dev_dbg(&pf->pdev->dev, "vsi does not exist at pf->vsi[%d]\n", - vsi->idx); + dev_dbg(dev, "vsi does not exist at pf->vsi[%d]\n", vsi->idx); return -EINVAL; } @@ -501,44 +457,92 @@ int ice_vsi_clear(struct ice_vsi *vsi) /* updates the PF for this cleared VSI */ pf->vsi[vsi->idx] = NULL; - if (vsi->idx < pf->next_vsi) + if (vsi->idx < pf->next_vsi && vsi->type != ICE_VSI_CTRL) + pf->next_vsi = vsi->idx; + if (vsi->idx < pf->next_vsi && vsi->type == ICE_VSI_CTRL && + vsi->vf_id != ICE_INVAL_VFID) pf->next_vsi = vsi->idx; ice_vsi_free_arrays(vsi); + ice_vsi_free_rss_global_lut(vsi); mutex_unlock(&pf->sw_mutex); - devm_kfree(&pf->pdev->dev, vsi); + devm_kfree(dev, vsi); return 0; } +/** + * ice_msix_clean_ctrl_vsi - MSIX mode interrupt handler for ctrl VSI + * @irq: interrupt number + * @data: pointer to a q_vector + */ +static irqreturn_t ice_msix_clean_ctrl_vsi(int __always_unused irq, void *data) +{ + struct ice_q_vector *q_vector = (struct ice_q_vector *)data; + + if (!q_vector->tx.ring) + return IRQ_HANDLED; + +#define FDIR_RX_DESC_CLEAN_BUDGET 64 + ice_clean_rx_irq(q_vector->rx.ring, FDIR_RX_DESC_CLEAN_BUDGET); + ice_clean_ctrl_tx_irq(q_vector->tx.ring); + + return IRQ_HANDLED; +} + /** * ice_msix_clean_rings - MSIX mode Interrupt Handler * @irq: interrupt number * @data: pointer to a q_vector */ +#ifdef HAVE_NETPOLL_CONTROLLER +irqreturn_t ice_msix_clean_rings(int __always_unused irq, void *data) +#else 
static irqreturn_t ice_msix_clean_rings(int __always_unused irq, void *data) +#endif /* HAVE_NETPOLL_CONTROLLER */ { struct ice_q_vector *q_vector = (struct ice_q_vector *)data; if (!q_vector->tx.ring && !q_vector->rx.ring) return IRQ_HANDLED; + q_vector->total_events++; + napi_schedule(&q_vector->napi); return IRQ_HANDLED; } +static irqreturn_t ice_eswitch_msix_clean_rings(int __always_unused irq, void *data) +{ + struct ice_q_vector *q_vector = (struct ice_q_vector *)data; + struct ice_pf *pf = q_vector->vsi->back; + int i; + + if (!q_vector->tx.ring && !q_vector->rx.ring) + return IRQ_HANDLED; + + ice_for_each_vf(pf, i) + napi_schedule(&pf->vf[i].repr->q_vector->napi); + + return IRQ_HANDLED; +} + /** * ice_vsi_alloc - Allocates the next available struct VSI in the PF * @pf: board private structure - * @type: type of VSI + * @vsi_type: type of VSI + * @ch: ptr to channel + * @tc: traffic class number for VF ADQ * @vf_id: ID of the VF being configured * * returns a pointer to a VSI on success, NULL on failure. */ static struct ice_vsi * -ice_vsi_alloc(struct ice_pf *pf, enum ice_vsi_type type, u16 vf_id) +ice_vsi_alloc(struct ice_pf *pf, enum ice_vsi_type vsi_type, + struct ice_channel *ch, u16 vf_id, u8 tc) { + struct device *dev = ice_pf_to_dev(pf); struct ice_vsi *vsi = NULL; /* Need to protect the allocation of the VSIs at the PF level */ @@ -549,26 +553,28 @@ ice_vsi_alloc(struct ice_pf *pf, enum ice_vsi_type type, u16 vf_id) * is available to be populated */ if (pf->next_vsi == ICE_NO_VSI) { - dev_dbg(&pf->pdev->dev, "out of VSI slots!\n"); + dev_dbg(dev, "out of VSI slots!\n"); goto unlock_pf; } - vsi = devm_kzalloc(&pf->pdev->dev, sizeof(*vsi), GFP_KERNEL); + vsi = devm_kzalloc(dev, sizeof(*vsi), GFP_KERNEL); if (!vsi) goto unlock_pf; - vsi->type = type; + vsi->type = vsi_type; vsi->back = pf; - set_bit(__ICE_DOWN, vsi->state); - - vsi->idx = pf->next_vsi; + if (vsi_type == ICE_VSI_VF) + vsi->vf_adq_tc = tc; + set_bit(ICE_VSI_DOWN, vsi->state); - if (type == ICE_VSI_VF) + if (vsi_type == ICE_VSI_VF) ice_vsi_set_num_qs(vsi, vf_id); - else + else if (vsi_type != ICE_VSI_CHNL) ice_vsi_set_num_qs(vsi, ICE_INVAL_VFID); switch (vsi->type) { + case ICE_VSI_OFFLOAD_MACVLAN: + case ICE_VSI_VMDQ2: case ICE_VSI_PF: if (ice_vsi_alloc_arrays(vsi)) goto err_rings; @@ -576,29 +582,61 @@ ice_vsi_alloc(struct ice_pf *pf, enum ice_vsi_type type, u16 vf_id) /* Setup default MSIX irq handler for VSI */ vsi->irq_handler = ice_msix_clean_rings; break; + case ICE_VSI_SWITCHDEV_CTRL: + if (ice_vsi_alloc_arrays(vsi)) + goto err_rings; + + /* Setup eswitch MSIX irq handler for VSI */ + vsi->irq_handler = ice_eswitch_msix_clean_rings; + break; + case ICE_VSI_CTRL: + if (ice_vsi_alloc_arrays(vsi)) + goto err_rings; + + /* Setup ctrl VSI MSIX irq handler */ + vsi->irq_handler = ice_msix_clean_ctrl_vsi; + break; case ICE_VSI_VF: if (ice_vsi_alloc_arrays(vsi)) goto err_rings; break; + case ICE_VSI_CHNL: + if (!ch) + goto err_rings; + vsi->num_rxq = ch->num_rxq; + vsi->num_txq = ch->num_txq; + vsi->next_base_q = ch->base_q; + break; case ICE_VSI_LB: if (ice_vsi_alloc_arrays(vsi)) goto err_rings; break; default: - dev_warn(&pf->pdev->dev, "Unknown VSI type %d\n", vsi->type); + dev_warn(dev, "Unknown VSI type %d\n", vsi->type); goto unlock_pf; } - /* fill VSI slot in the PF struct */ - pf->vsi[pf->next_vsi] = vsi; + if (vsi->type == ICE_VSI_CTRL && vf_id == ICE_INVAL_VFID) { + /* Use the last VSI slot as the index for PF control VSI */ + vsi->idx = pf->num_alloc_vsi - 1; + pf->ctrl_vsi_idx = vsi->idx; + 
pf->vsi[vsi->idx] = vsi; + } else { + /* fill slot and make note of the index */ + vsi->idx = pf->next_vsi; + pf->vsi[pf->next_vsi] = vsi; + + /* prepare pf->next_vsi for next use */ + pf->next_vsi = ice_get_free_slot(pf->vsi, pf->num_alloc_vsi, + pf->next_vsi); + } - /* prepare pf->next_vsi for next use */ - pf->next_vsi = ice_get_free_slot(pf->vsi, pf->num_alloc_vsi, - pf->next_vsi); + if (vsi->type == ICE_VSI_CTRL && vf_id != ICE_INVAL_VFID) + pf->vf[vf_id].ctrl_vsi_idx = vsi->idx; goto unlock_pf; err_rings: - devm_kfree(&pf->pdev->dev, vsi); + devm_kfree(dev, vsi); vsi = NULL; unlock_pf: mutex_unlock(&pf->sw_mutex); @@ -606,85 +644,99 @@ ice_vsi_alloc(struct ice_pf *pf, enum ice_vsi_type type, u16 vf_id) } /** - * __ice_vsi_get_qs_contig - Assign a contiguous chunk of queues to VSI - * @qs_cfg: gathered variables needed for PF->VSI queues assignment + * ice_alloc_fd_res - Allocate FD resource for a VSI + * @vsi: pointer to the ice_vsi * - * Return 0 on success and -ENOMEM in case of no left space in PF queue bitmap + * This allocates the FD resources + * + * Returns 0 on success, -EPERM on no-op or -EIO on failure */ -static int __ice_vsi_get_qs_contig(struct ice_qs_cfg *qs_cfg) +static int ice_alloc_fd_res(struct ice_vsi *vsi) { - int offset, i; + struct ice_pf *pf = vsi->back; + u32 g_val, b_val; - mutex_lock(qs_cfg->qs_mutex); - offset = bitmap_find_next_zero_area(qs_cfg->pf_map, qs_cfg->pf_map_size, - 0, qs_cfg->q_count, 0); - if (offset >= qs_cfg->pf_map_size) { - mutex_unlock(qs_cfg->qs_mutex); - return -ENOMEM; - } + /* Flow Director filters are only allocated/assigned to the PF VSI or + * CHNL VSI which passes the traffic. The CTRL VSI is only used to + * add/delete filters so we don't allocate resources to it + */ - bitmap_set(qs_cfg->pf_map, offset, qs_cfg->q_count); - for (i = 0; i < qs_cfg->q_count; i++) - qs_cfg->vsi_map[i + qs_cfg->vsi_map_offset] = i + offset; - mutex_unlock(qs_cfg->qs_mutex); + /* FD filters from guaranteed pool per VSI */ + g_val = pf->hw.func_caps.fd_fltr_guar; + if (!g_val) + return -EPERM; - return 0; -} + /* FD filters from best effort pool */ + b_val = pf->hw.func_caps.fd_fltr_best_effort; + if (!b_val) + return -EPERM; -/** - * __ice_vsi_get_qs_sc - Assign a scattered queues from PF to VSI - * @qs_cfg: gathered variables needed for pf->vsi queues assignment - * - * Return 0 on success and -ENOMEM in case of no left space in PF queue bitmap - */ -static int __ice_vsi_get_qs_sc(struct ice_qs_cfg *qs_cfg) -{ - int i, index = 0; + if (!(vsi->type == ICE_VSI_PF || vsi->type == ICE_VSI_CHNL || + vsi->type == ICE_VSI_VF)) + return -EPERM; - mutex_lock(qs_cfg->qs_mutex); - for (i = 0; i < qs_cfg->q_count; i++) { - index = find_next_zero_bit(qs_cfg->pf_map, - qs_cfg->pf_map_size, index); - if (index >= qs_cfg->pf_map_size) - goto err_scatter; - set_bit(index, qs_cfg->pf_map); - qs_cfg->vsi_map[i + qs_cfg->vsi_map_offset] = index; - } - mutex_unlock(qs_cfg->qs_mutex); + if (!test_bit(ICE_FLAG_FD_ENA, pf->flags)) + return -EPERM; - return 0; -err_scatter: - for (index = 0; index < i; index++) { - clear_bit(qs_cfg->vsi_map[index], qs_cfg->pf_map); - qs_cfg->vsi_map[index + qs_cfg->vsi_map_offset] = 0; - } - mutex_unlock(qs_cfg->qs_mutex); + /* PF main VSI gets only 64 FD resources from guaranteed pool + * when ADQ is configured. 
This is current policy, change as needed + */ +#define ICE_PF_VSI_GFLTR 64 - return -ENOMEM; -} + /* determines FD filter resources per VSI from shared(best effort) and + * dedicated pool + */ + if (vsi->type == ICE_VSI_PF) { + vsi->num_gfltr = g_val; +#ifdef NETIF_F_HW_TC + /* if MQPRIO ic configured, main VSI doesn't get all + * FD resources from guaranteed pool. Current policy is, + * PF VSI gets 64 FD resources + */ + if (test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) { + if (g_val < ICE_PF_VSI_GFLTR) + return -EPERM; + /* allow bare minimum entries for PF VSI */ + vsi->num_gfltr = ICE_PF_VSI_GFLTR; + } +#endif /* NETIF_F_HW_TC */ -/** - * __ice_vsi_get_qs - helper function for assigning queues from PF to VSI - * @qs_cfg: gathered variables needed for pf->vsi queues assignment - * - * This function first tries to find contiguous space. If it is not successful, - * it tries with the scatter approach. - * - * Return 0 on success and -ENOMEM in case of no left space in PF queue bitmap - */ -static int __ice_vsi_get_qs(struct ice_qs_cfg *qs_cfg) -{ - int ret = 0; + /* each VSI gets same "best_effort" quota */ + vsi->num_bfltr = b_val; + } else if (vsi->type == ICE_VSI_VF) { + vsi->num_gfltr = 0; + + /* each VSI gets same "best_effort" quota */ + vsi->num_bfltr = b_val; + } else { + struct ice_vsi *main_vsi; + int numtc; + + main_vsi = ice_get_main_vsi(pf); + if (!main_vsi) + return -EPERM; + + if (!main_vsi->all_numtc) + return -EINVAL; + + /* figure out ADQ numtc */ + numtc = main_vsi->all_numtc - ICE_CHNL_START_TC; + + /* only one TC but still asking resources for channels, + * invalid config + */ + if (numtc < ICE_CHNL_START_TC) + return -EPERM; - ret = __ice_vsi_get_qs_contig(qs_cfg); - if (ret) { - /* contig failed, so try with scatter approach */ - qs_cfg->mapping_mode = ICE_VSI_MAP_SCATTER; - qs_cfg->q_count = min_t(u16, qs_cfg->q_count, - qs_cfg->scatter_count); - ret = __ice_vsi_get_qs_sc(qs_cfg); + g_val -= ICE_PF_VSI_GFLTR; + /* channel VSIs gets equal share from guaranteed pool */ + vsi->num_gfltr = g_val / numtc; + + /* each VSI gets same "best_effort" quota */ + vsi->num_bfltr = b_val; } - return ret; + + return 0; } /** @@ -704,7 +756,7 @@ static int ice_vsi_get_qs(struct ice_vsi *vsi) .scatter_count = ICE_MAX_SCATTER_TXQS, .vsi_map = vsi->txq_map, .vsi_map_offset = 0, - .mapping_mode = vsi->tx_mapping_mode + .mapping_mode = ICE_VSI_MAP_CONTIG }; struct ice_qs_cfg rx_qs_cfg = { .qs_mutex = &pf->avail_q_mutex, @@ -714,18 +766,24 @@ static int ice_vsi_get_qs(struct ice_vsi *vsi) .scatter_count = ICE_MAX_SCATTER_RXQS, .vsi_map = vsi->rxq_map, .vsi_map_offset = 0, - .mapping_mode = vsi->rx_mapping_mode + .mapping_mode = ICE_VSI_MAP_CONTIG }; - int ret = 0; + int ret; - vsi->tx_mapping_mode = ICE_VSI_MAP_CONTIG; - vsi->rx_mapping_mode = ICE_VSI_MAP_CONTIG; + if (vsi->type == ICE_VSI_CHNL) + return 0; ret = __ice_vsi_get_qs(&tx_qs_cfg); - if (!ret) - ret = __ice_vsi_get_qs(&rx_qs_cfg); + if (ret) + return ret; + vsi->tx_mapping_mode = tx_qs_cfg.mapping_mode; - return ret; + ret = __ice_vsi_get_qs(&rx_qs_cfg); + if (ret) + return ret; + vsi->rx_mapping_mode = rx_qs_cfg.mapping_mode; + + return 0; } /** @@ -764,69 +822,128 @@ bool ice_is_safe_mode(struct ice_pf *pf) } /** - * ice_rss_clean - Delete RSS related VSI structures that hold user inputs - * @vsi: the VSI being removed + * ice_is_peer_ena + * @pf: pointer to the PF struct + * + * returns true if peer devices/drivers are supported, false otherwise */ -static void ice_rss_clean(struct ice_vsi *vsi) +bool ice_is_peer_ena(struct 
ice_pf *pf) { - struct ice_pf *pf; - - pf = vsi->back; - - if (vsi->rss_hkey_user) - devm_kfree(&pf->pdev->dev, vsi->rss_hkey_user); - if (vsi->rss_lut_user) - devm_kfree(&pf->pdev->dev, vsi->rss_lut_user); + return test_bit(ICE_FLAG_PEER_ENA, pf->flags); } /** - * ice_vsi_set_rss_params - Setup RSS capabilities per VSI type - * @vsi: the VSI being configured + * ice_vsi_clean_rss_flow_fld - Delete RSS configuration + * @vsi: the VSI being cleaned up + * + * This function deletes RSS input set for all flows that were configured + * for this VSI */ -static void ice_vsi_set_rss_params(struct ice_vsi *vsi) +static void ice_vsi_clean_rss_flow_fld(struct ice_vsi *vsi) { - struct ice_hw_common_caps *cap; struct ice_pf *pf = vsi->back; + enum ice_status status; - if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { - vsi->rss_size = 1; + if (ice_is_safe_mode(pf)) return; - } - cap = &pf->hw.func_caps.common_cap; + status = ice_rem_vsi_rss_cfg(&pf->hw, vsi->idx); + if (status) + dev_dbg(ice_pf_to_dev(pf), "ice_rem_vsi_rss_cfg failed for vsi = %d, error = %s\n", + vsi->vsi_num, ice_stat_str(status)); +} + +/** + * ice_rss_clean - Delete RSS related VSI structures and configuration + * @vsi: the VSI being removed + * + * This function deletes RSS related VSI structures that hold user inputs + * and removes RSS configuration + */ +static void ice_rss_clean(struct ice_vsi *vsi) +{ + struct ice_pf *pf = vsi->back; + struct device *dev; + + dev = ice_pf_to_dev(pf); + + if (vsi->rss_hkey_user) + devm_kfree(dev, vsi->rss_hkey_user); + if (vsi->rss_lut_user) + devm_kfree(dev, vsi->rss_lut_user); + + ice_vsi_clean_rss_flow_fld(vsi); + /* remove RSS replay list */ + if (!ice_is_safe_mode(pf)) + ice_rem_vsi_rss_list(&pf->hw, vsi->idx); +} + +/** + * ice_vsi_set_rss_params - Setup RSS capabilities per VSI type + * @vsi: the VSI being configured + */ +static void ice_vsi_set_rss_params(struct ice_vsi *vsi) +{ + struct ice_hw_common_caps *cap; + struct ice_pf *pf = vsi->back; + + if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { + vsi->rss_size = 1; + return; + } + + cap = &pf->hw.func_caps.common_cap; switch (vsi->type) { + case ICE_VSI_CHNL: case ICE_VSI_PF: /* PF VSI will inherit RSS instance of PF */ - vsi->rss_table_size = cap->rss_table_size; - vsi->rss_size = min_t(int, num_online_cpus(), - BIT(cap->rss_table_entry_width)); + vsi->rss_table_size = (u16)cap->rss_table_size; + if (vsi->type == ICE_VSI_CHNL) + vsi->rss_size = min_t(u16, vsi->num_rxq, + BIT(cap->rss_table_entry_width)); + else + vsi->rss_size = min_t(u16, num_online_cpus(), + BIT(cap->rss_table_entry_width)); vsi->rss_lut_type = ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF; break; + case ICE_VSI_OFFLOAD_MACVLAN: + case ICE_VSI_VMDQ2: + case ICE_VSI_SWITCHDEV_CTRL: + vsi->rss_table_size = ICE_VSIQF_HLUT_ARRAY_SIZE; + vsi->rss_size = min_t(u16, num_online_cpus(), + BIT(cap->rss_table_entry_width)); + vsi->rss_lut_type = ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_VSI; + break; case ICE_VSI_VF: - /* VF VSI will gets a small RSS table - * For VSI_LUT, LUT size should be set to 64 bytes + /* VF VSI will get a small RSS table. + * For VSI_LUT, LUT size should be set to 64 bytes. 
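The ice_alloc_fd_res() logic earlier in this change splits the flow director filter budget between the PF VSI, VF VSIs and ADQ channel VSIs. A small worked sketch of that arithmetic follows; the pool sizes (512 guaranteed, 2048 best-effort) and the three channel TCs are assumed example values, not numbers taken from this patch.

/* Sketch only: flow director filter budgeting when ADQ channels exist. */
#include <stdio.h>

#define PF_VSI_GFLTR 64	/* stand-in for ICE_PF_VSI_GFLTR */

int main(void)
{
	unsigned int g_val = 512;	/* guaranteed pool (assumed) */
	unsigned int b_val = 2048;	/* best-effort pool (assumed) */
	unsigned int chnl_numtc = 3;	/* ADQ channel TCs (assumed) */
	unsigned int pf_gfltr, chnl_gfltr;

	/* with MQPRIO/ADQ active the PF VSI keeps only a bare minimum of
	 * guaranteed entries; the remainder is split evenly per channel TC
	 */
	pf_gfltr = PF_VSI_GFLTR;
	chnl_gfltr = (g_val - PF_VSI_GFLTR) / chnl_numtc;

	printf("PF VSI:   %u guaranteed, %u best-effort\n", pf_gfltr, b_val);
	printf("chnl VSI: %u guaranteed each, %u best-effort\n", chnl_gfltr, b_val);
	printf("VF VSI:   0 guaranteed, %u best-effort\n", b_val);
	return 0;
}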
*/ vsi->rss_table_size = ICE_VSIQF_HLUT_ARRAY_SIZE; - vsi->rss_size = min_t(int, num_online_cpus(), - BIT(cap->rss_table_entry_width)); + vsi->rss_size = ICE_MAX_RSS_QS_PER_VF; vsi->rss_lut_type = ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_VSI; + if (vsi->global_lut_id) { + vsi->rss_table_size = ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512; + vsi->rss_size = ICE_MAX_MEDIUM_RSS_QS; + vsi->rss_lut_type = ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_GLOBAL; + } break; case ICE_VSI_LB: break; default: - dev_warn(&pf->pdev->dev, "Unknown VSI type %d\n", - vsi->type); + dev_dbg(ice_pf_to_dev(pf), "Unsupported VSI type %s\n", + ice_vsi_type_str(vsi->type)); break; } } /** * ice_set_dflt_vsi_ctx - Set default VSI context before adding a VSI + * @hw: HW structure used to determine the VLAN mode of the device * @ctxt: the VSI context being set * * This initializes a default VSI context for all sections except the Queues. */ -static void ice_set_dflt_vsi_ctx(struct ice_vsi_ctx *ctxt) +static void ice_set_dflt_vsi_ctx(struct ice_hw *hw, struct ice_vsi_ctx *ctxt) { u32 table = 0; @@ -837,13 +954,27 @@ static void ice_set_dflt_vsi_ctx(struct ice_vsi_ctx *ctxt) ctxt->info.sw_flags = ICE_AQ_VSI_SW_FLAG_SRC_PRUNE; /* Traffic from VSI can be sent to LAN */ ctxt->info.sw_flags2 = ICE_AQ_VSI_SW_FLAG_LAN_ENA; - /* By default bits 3 and 4 in vlan_flags are 0's which results in legacy - * behavior (show VLAN, DEI, and UP) in descriptor. Also, allow all - * packets untagged/tagged. + /* allow all untagged/tagged packets by default on Tx */ + ctxt->info.inner_vlan_flags = ((ICE_AQ_VSI_INNER_VLAN_TX_MODE_ALL & + ICE_AQ_VSI_INNER_VLAN_TX_MODE_M) >> + ICE_AQ_VSI_INNER_VLAN_TX_MODE_S); + /* SVM - by default bits 3 and 4 in inner_vlan_flags are 0's which + * results in legacy behavior (show VLAN, DEI, and UP) in descriptor. 
+ * + * DVM - leave inner VLAN in packet by default */ - ctxt->info.vlan_flags = ((ICE_AQ_VSI_VLAN_MODE_ALL & - ICE_AQ_VSI_VLAN_MODE_M) >> - ICE_AQ_VSI_VLAN_MODE_S); + if (ice_is_dvm_ena(hw)) { + ctxt->info.inner_vlan_flags |= + ICE_AQ_VSI_INNER_VLAN_EMODE_NOTHING; + ctxt->info.outer_vlan_flags = + (ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ALL << + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) & + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M; + ctxt->info.outer_vlan_flags |= + (ICE_AQ_VSI_OUTER_TAG_VLAN_8100 << + ICE_AQ_VSI_OUTER_TAG_TYPE_S) & + ICE_AQ_VSI_OUTER_TAG_TYPE_M; + } /* Have 1:1 UP mapping for both ingress/egress tables */ table |= ICE_UP_TABLE_TRANSLATE(0, 0); table |= ICE_UP_TABLE_TRANSLATE(1, 1); @@ -867,34 +998,28 @@ static void ice_set_dflt_vsi_ctx(struct ice_vsi_ctx *ctxt) */ static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) { - u16 offset = 0, qmap = 0, tx_count = 0; + u16 offset = 0, qmap = 0, tx_count = 0, pow = 0; + u16 num_txq_per_tc, num_rxq_per_tc; u16 qcount_tx = vsi->alloc_txq; u16 qcount_rx = vsi->alloc_rxq; - u16 tx_numq_tc, rx_numq_tc; - u16 pow = 0, max_rss = 0; - bool ena_tc0 = false; u8 netdev_tc = 0; int i; - /* at least TC0 should be enabled by default */ - if (vsi->tc_cfg.numtc) { - if (!(vsi->tc_cfg.ena_tc & BIT(0))) - ena_tc0 = true; - } else { - ena_tc0 = true; + if (!vsi->tc_cfg.numtc) { + /* at least TC0 should be enabled by default */ + vsi->tc_cfg.numtc = 1; + vsi->tc_cfg.ena_tc = 1; } - if (ena_tc0) { - vsi->tc_cfg.numtc++; - vsi->tc_cfg.ena_tc |= 1; - } + num_rxq_per_tc = min_t(u16, qcount_rx / vsi->tc_cfg.numtc, ICE_MAX_RXQS_PER_TC); + if (!num_rxq_per_tc) + num_rxq_per_tc = 1; + num_txq_per_tc = qcount_tx / vsi->tc_cfg.numtc; + if (!num_txq_per_tc) + num_txq_per_tc = 1; - rx_numq_tc = qcount_rx / vsi->tc_cfg.numtc; - if (!rx_numq_tc) - rx_numq_tc = 1; - tx_numq_tc = qcount_tx / vsi->tc_cfg.numtc; - if (!tx_numq_tc) - tx_numq_tc = 1; + /* find the (rounded up) power-of-2 of qcount */ + pow = (u16)order_base_2(num_rxq_per_tc); /* TC mapping is a function of the number of Rx queues assigned to the * VSI for each traffic class and the offset of these queues. 
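A compact standalone sketch of the per-TC queue map computed in ice_vsi_setup_q_map() above: each enabled TC advertises a power-of-2 queue count (order_base_2 of the per-TC Rx queue share) at a running offset. order_base_2() is reimplemented here so the example builds outside the kernel, and the shift/mask packing into tc_mapping[] is omitted.

/* Sketch only: per-TC Rx queue offsets and the power-of-2 qcount field. */
#include <stdio.h>

/* ceil(log2(n)) for n >= 1, i.e. the kernel's order_base_2() */
static unsigned int order_base_2(unsigned int n)
{
	unsigned int order = 0;

	while ((1u << order) < n)
		order++;
	return order;
}

int main(void)
{
	unsigned int qcount_rx = 16, numtc = 3;		/* assumed example values */
	unsigned int per_tc = qcount_rx / numtc;	/* 5 Rx queues per TC */
	unsigned int pow = order_base_2(per_tc);	/* advertised as 2^pow = 8 */
	unsigned int offset = 0, tc;

	for (tc = 0; tc < numtc; tc++) {
		printf("TC%u: qoffset=%u, qcount=2^%u\n", tc, offset, pow);
		offset += per_tc;	/* next TC starts after this TC's share */
	}
	printf("vsi->num_rxq = %u\n", offset);
	return 0;
}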
@@ -907,24 +1032,6 @@ static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) * * Setup number and offset of Rx queues for all TCs for the VSI */ - - qcount_rx = rx_numq_tc; - - /* qcount will change if RSS is enabled */ - if (test_bit(ICE_FLAG_RSS_ENA, vsi->back->flags)) { - if (vsi->type == ICE_VSI_PF || vsi->type == ICE_VSI_VF) { - if (vsi->type == ICE_VSI_PF) - max_rss = ICE_MAX_LG_RSS_QS; - else - max_rss = ICE_MAX_SMALL_RSS_QS; - qcount_rx = min_t(int, rx_numq_tc, max_rss); - qcount_rx = min_t(int, qcount_rx, vsi->rss_size); - } - } - - /* find the (rounded up) power-of-2 of qcount */ - pow = order_base_2(qcount_rx); - ice_for_each_traffic_class(i) { if (!(vsi->tc_cfg.ena_tc & BIT(i))) { /* TC is not enabled */ @@ -938,16 +1045,16 @@ static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) /* TC is enabled */ vsi->tc_cfg.tc_info[i].qoffset = offset; - vsi->tc_cfg.tc_info[i].qcount_rx = qcount_rx; - vsi->tc_cfg.tc_info[i].qcount_tx = tx_numq_tc; + vsi->tc_cfg.tc_info[i].qcount_rx = num_rxq_per_tc; + vsi->tc_cfg.tc_info[i].qcount_tx = num_txq_per_tc; vsi->tc_cfg.tc_info[i].netdev_tc = netdev_tc++; qmap = ((offset << ICE_AQ_VSI_TC_Q_OFFSET_S) & ICE_AQ_VSI_TC_Q_OFFSET_M) | ((pow << ICE_AQ_VSI_TC_Q_NUM_S) & ICE_AQ_VSI_TC_Q_NUM_M); - offset += qcount_rx; - tx_count += tx_numq_tc; + offset += num_rxq_per_tc; + tx_count += num_txq_per_tc; ctxt->info.tc_mapping[i] = cpu_to_le16(qmap); } @@ -960,12 +1067,12 @@ static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) if (offset) vsi->num_rxq = offset; else - vsi->num_rxq = qcount_rx; + vsi->num_rxq = num_rxq_per_tc; vsi->num_txq = tx_count; if (vsi->type == ICE_VSI_VF && vsi->num_txq != vsi->num_rxq) { - dev_dbg(&vsi->back->pdev->dev, "VF VSI should have same number of Tx and Rx queues. Hence making them equal\n"); + dev_dbg(ice_pf_to_dev(vsi->back), "VF VSI should have same number of Tx and Rx queues. Hence making them equal\n"); /* since there is a chance that num_rxq could have been changed * in the above for loop, make num_txq equal to num_rxq. 
*/ @@ -982,6 +1089,60 @@ static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) ctxt->info.q_mapping[1] = cpu_to_le16(vsi->num_rxq); } +/** + * ice_set_fd_vsi_ctx - Set FD VSI context before adding a VSI + * @ctxt: the VSI context being set + * @vsi: the VSI being configured + */ +static void ice_set_fd_vsi_ctx(struct ice_vsi_ctx *ctxt, struct ice_vsi *vsi) +{ + u8 dflt_q_group, dflt_q_prio; + u16 dflt_q, report_q, val; + + if (vsi->type != ICE_VSI_PF && vsi->type != ICE_VSI_CTRL && + vsi->type != ICE_VSI_VF && vsi->type != ICE_VSI_CHNL) + return; + + val = ICE_AQ_VSI_PROP_FLOW_DIR_VALID | ICE_AQ_VSI_PROP_ACL_VALID; + ctxt->info.valid_sections |= cpu_to_le16(val); + dflt_q = 0; + dflt_q_group = 0; + report_q = 0; + dflt_q_prio = 0; + + /* enable flow director filtering/programming */ + val = ICE_AQ_VSI_FD_ENABLE | ICE_AQ_VSI_FD_PROG_ENABLE; + ctxt->info.fd_options = cpu_to_le16(val); + /* max of allocated flow director filters */ + ctxt->info.max_fd_fltr_dedicated = + cpu_to_le16(vsi->num_gfltr); + /* max of shared flow director filters any VSI may program */ + ctxt->info.max_fd_fltr_shared = + cpu_to_le16(vsi->num_bfltr); + /* default queue index within the VSI of the default FD */ + val = ((dflt_q << ICE_AQ_VSI_FD_DEF_Q_S) & + ICE_AQ_VSI_FD_DEF_Q_M); + /* target queue or queue group to the FD filter */ + val |= ((dflt_q_group << ICE_AQ_VSI_FD_DEF_GRP_S) & + ICE_AQ_VSI_FD_DEF_GRP_M); + ctxt->info.fd_def_q = cpu_to_le16(val); + /* queue index on which FD filter completion is reported */ + val = ((report_q << ICE_AQ_VSI_FD_REPORT_Q_S) & + ICE_AQ_VSI_FD_REPORT_Q_M); + /* priority of the default qindex action */ + val |= ((dflt_q_prio << ICE_AQ_VSI_FD_DEF_PRIORITY_S) & + ICE_AQ_VSI_FD_DEF_PRIORITY_M); + ctxt->info.fd_report_opt = cpu_to_le16(val); + +#define ICE_ACL_RX_PROF_MISS_CNTR ((2 << ICE_AQ_VSI_ACL_DEF_RX_PROF_S) & \ + ICE_AQ_VSI_ACL_DEF_RX_PROF_M) +#define ICE_ACL_RX_TBL_MISS_CNTR ((3 << ICE_AQ_VSI_ACL_DEF_RX_TABLE_S) & \ + ICE_AQ_VSI_ACL_DEF_RX_TABLE_M) + + val = ICE_ACL_RX_PROF_MISS_CNTR | ICE_ACL_RX_TBL_MISS_CNTR; + ctxt->info.acl_def_act = cpu_to_le16(val); +} + /** * ice_set_rss_vsi_ctx - Set RSS VSI context before adding a VSI * @ctxt: the VSI context being set @@ -989,88 +1150,153 @@ static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) */ static void ice_set_rss_vsi_ctx(struct ice_vsi_ctx *ctxt, struct ice_vsi *vsi) { - u8 lut_type, hash_type; + u8 lut_type, hash_type, global_lut_id = 0; + struct device *dev; struct ice_pf *pf; pf = vsi->back; + dev = ice_pf_to_dev(pf); switch (vsi->type) { + case ICE_VSI_CHNL: case ICE_VSI_PF: /* PF VSI will inherit RSS instance of PF */ lut_type = ICE_AQ_VSI_Q_OPT_RSS_LUT_PF; hash_type = ICE_AQ_VSI_Q_OPT_RSS_TPLZ; break; case ICE_VSI_VF: - /* VF VSI will gets a small RSS table which is a VSI LUT type */ lut_type = ICE_AQ_VSI_Q_OPT_RSS_LUT_VSI; + if (vsi->global_lut_id) { + lut_type = ICE_AQ_VSI_Q_OPT_RSS_LUT_GBL; + global_lut_id = *vsi->global_lut_id; + } hash_type = ICE_AQ_VSI_Q_OPT_RSS_TPLZ; break; - case ICE_VSI_LB: - dev_dbg(&pf->pdev->dev, "Unsupported VSI type %d\n", vsi->type); - return; default: - dev_warn(&pf->pdev->dev, "Unknown VSI type %d\n", vsi->type); + dev_dbg(dev, "Unsupported VSI type %s\n", + ice_vsi_type_str(vsi->type)); return; } ctxt->info.q_opt_rss = ((lut_type << ICE_AQ_VSI_Q_OPT_RSS_LUT_S) & ICE_AQ_VSI_Q_OPT_RSS_LUT_M) | - ((hash_type << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) & - ICE_AQ_VSI_Q_OPT_RSS_HASH_M); + (hash_type & ICE_AQ_VSI_Q_OPT_RSS_HASH_M) | + ((global_lut_id << 
ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_S) & + ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_M); +} + +static void +ice_chnl_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) +{ + struct ice_pf *pf = vsi->back; + u16 qcount, qmap; + u8 offset = 0; + int pow; + + qcount = min_t(int, vsi->num_rxq, pf->num_lan_msix); + + pow = order_base_2(qcount); + qmap = ((offset << ICE_AQ_VSI_TC_Q_OFFSET_S) & + ICE_AQ_VSI_TC_Q_OFFSET_M) | + ((pow << ICE_AQ_VSI_TC_Q_NUM_S) & + ICE_AQ_VSI_TC_Q_NUM_M); + + ctxt->info.tc_mapping[0] = cpu_to_le16(qmap); + ctxt->info.mapping_flags |= cpu_to_le16(ICE_AQ_VSI_Q_MAP_CONTIG); + ctxt->info.q_mapping[0] = cpu_to_le16(vsi->next_base_q); + ctxt->info.q_mapping[1] = cpu_to_le16(qcount); } /** * ice_vsi_init - Create and initialize a VSI * @vsi: the VSI being configured + * @init_vsi: is this call creating a VSI * * This initializes a VSI context depending on the VSI type to be added and * passes it down to the add_vsi aq command to create a new VSI. */ -static int ice_vsi_init(struct ice_vsi *vsi) +static int ice_vsi_init(struct ice_vsi *vsi, bool init_vsi) { struct ice_pf *pf = vsi->back; struct ice_hw *hw = &pf->hw; struct ice_vsi_ctx *ctxt; + struct device *dev; int ret = 0; - ctxt = devm_kzalloc(&pf->pdev->dev, sizeof(*ctxt), GFP_KERNEL); + dev = ice_pf_to_dev(pf); + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); if (!ctxt) return -ENOMEM; - ctxt->info = vsi->info; switch (vsi->type) { + case ICE_VSI_CTRL: case ICE_VSI_LB: - /* fall through */ case ICE_VSI_PF: ctxt->flags = ICE_AQ_VSI_TYPE_PF; break; + case ICE_VSI_CHNL: + case ICE_VSI_OFFLOAD_MACVLAN: + case ICE_VSI_VMDQ2: + case ICE_VSI_SWITCHDEV_CTRL: + ctxt->flags = ICE_AQ_VSI_TYPE_VMDQ2; + break; case ICE_VSI_VF: ctxt->flags = ICE_AQ_VSI_TYPE_VF; /* VF number here is the absolute VF number (0-255) */ ctxt->vf_num = vsi->vf_id + hw->func_caps.vf_base_id; break; default: - return -ENODEV; + ret = -ENODEV; + goto out; + } + + /* Handle VLAN pruning for channel VSI if main VSI has VLAN + * prune enabled + */ + if (vsi->type == ICE_VSI_CHNL) { + struct ice_vsi *main_vsi; + + main_vsi = ice_get_main_vsi(pf); + if (main_vsi && ice_vsi_is_vlan_pruning_ena(main_vsi)) + ctxt->info.sw_flags2 |= + ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; + else + ctxt->info.sw_flags2 &= + ~ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; } - ice_set_dflt_vsi_ctx(ctxt); + ice_set_dflt_vsi_ctx(hw, ctxt); + if (test_bit(ICE_FLAG_FD_ENA, pf->flags)) + ice_set_fd_vsi_ctx(ctxt, vsi); /* if the switch is in VEB mode, allow VSI loopback */ if (vsi->vsw->bridge_mode == BRIDGE_MODE_VEB) - ctxt->info.sw_flags |= ICE_AQ_VSI_SW_FLAG_ALLOW_LB; + ctxt->info.sw_flags |= (ICE_AQ_VSI_SW_FLAG_ALLOW_LB | + ICE_AQ_VSI_SW_FLAG_LOCAL_LB); /* Set LUT type and HASH type if RSS is enabled */ - if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) + if (test_bit(ICE_FLAG_RSS_ENA, pf->flags) && + vsi->type != ICE_VSI_CTRL) { ice_set_rss_vsi_ctx(ctxt, vsi); + /* if updating VSI context, make sure to set valid_section: + * to indicate which section of VSI context being updated + */ + if (!init_vsi) + ctxt->info.valid_sections |= + cpu_to_le16(ICE_AQ_VSI_PROP_Q_OPT_VALID); + } ctxt->info.sw_id = vsi->port_info->sw_id; - ice_vsi_setup_q_map(vsi, ctxt); - /* Enable MAC Antispoof with new VSI being initialized or updated */ - if (vsi->type == ICE_VSI_VF && pf->vf[vsi->vf_id].spoofchk) { - ctxt->info.valid_sections |= - cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID); - ctxt->info.sec_flags |= - ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF; + if (vsi->type == ICE_VSI_CHNL) { + ice_chnl_vsi_setup_q_map(vsi, ctxt); + } else { + 
ice_vsi_setup_q_map(vsi, ctxt); + if (!init_vsi) /* means VSI being updated */ + /* must to indicate which section of VSI context are + * being modified + */ + ctxt->info.valid_sections |= + cpu_to_le16(ICE_AQ_VSI_PROP_RXQ_MAP_VALID); } /* Allow control frames out of main VSI */ @@ -1080,11 +1306,20 @@ static int ice_vsi_init(struct ice_vsi *vsi) cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID); } - ret = ice_add_vsi(hw, vsi->idx, ctxt, NULL); - if (ret) { - dev_err(&pf->pdev->dev, - "Add VSI failed, err %d\n", ret); - return -EIO; + if (init_vsi) { + ret = ice_add_vsi(hw, vsi->idx, ctxt, NULL); + if (ret) { + dev_err(dev, "Add VSI failed, err %d\n", ret); + ret = -EIO; + goto out; + } + } else { + ret = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (ret) { + dev_err(dev, "Update VSI failed, err %d\n", ret); + ret = -EIO; + goto out; + } } /* keep context for update VSI operations */ @@ -1093,131 +1328,121 @@ static int ice_vsi_init(struct ice_vsi *vsi) /* record VSI number returned */ vsi->vsi_num = ctxt->vsi_num; - devm_kfree(&pf->pdev->dev, ctxt); +out: + kfree(ctxt); return ret; } /** - * ice_free_q_vector - Free memory allocated for a specific interrupt vector - * @vsi: VSI having the memory freed - * @v_idx: index of the vector to be freed + * ice_free_res - free a block of resources + * @res: pointer to the resource + * @index: starting index previously returned by ice_get_res + * @id: identifier to track owner + * + * Returns number of resources freed */ -static void ice_free_q_vector(struct ice_vsi *vsi, int v_idx) +int ice_free_res(struct ice_res_tracker *res, u16 index, u16 id) { - struct ice_q_vector *q_vector; - struct ice_pf *pf = vsi->back; - struct ice_ring *ring; - - if (!vsi->q_vectors[v_idx]) { - dev_dbg(&pf->pdev->dev, "Queue vector at index %d not found\n", - v_idx); - return; - } - q_vector = vsi->q_vectors[v_idx]; - - ice_for_each_ring(ring, q_vector->tx) - ring->q_vector = NULL; - ice_for_each_ring(ring, q_vector->rx) - ring->q_vector = NULL; - - /* only VSI with an associated netdev is set up with NAPI */ - if (vsi->netdev) - netif_napi_del(&q_vector->napi); + int count = 0; + int i; - devm_kfree(&pf->pdev->dev, q_vector); - vsi->q_vectors[v_idx] = NULL; -} + if (!res || index >= res->end) + return -EINVAL; -/** - * ice_vsi_free_q_vectors - Free memory allocated for interrupt vectors - * @vsi: the VSI having memory freed - */ -void ice_vsi_free_q_vectors(struct ice_vsi *vsi) -{ - int v_idx; + id |= ICE_RES_VALID_BIT; + for (i = index; i < res->end && res->list[i] == id; i++) { + res->list[i] = 0; + count++; + } - ice_for_each_q_vector(vsi, v_idx) - ice_free_q_vector(vsi, v_idx); + return count; } /** - * ice_vsi_alloc_q_vector - Allocate memory for a single interrupt vector - * @vsi: the VSI being configured - * @v_idx: index of the vector in the VSI struct + * ice_search_res - Search the tracker for a block of resources + * @res: pointer to the resource + * @needed: size of the block needed + * @id: identifier to track owner * - * We allocate one q_vector. If allocation fails we return -ENOMEM. 
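The irq_tracker helpers in this area (ice_free_res() above, ice_search_res()/ice_get_res() just below) implement a simple first-fit allocator over a u16 array whose entries are tagged with the owner ID plus a valid bit. The following standalone sketch of that scheme uses an assumed tracker size and a stand-in for ICE_RES_VALID_BIT.

/* Sketch only: first-fit claim/release over an owner-tagged u16 tracker. */
#include <stdio.h>

#define RES_VALID_BIT 0x8000	/* stand-in for ICE_RES_VALID_BIT */
#define TRACKER_END   16	/* assumed tracker size */

static unsigned short tracker[TRACKER_END];

/* Find and claim 'needed' contiguous free entries; return base index or -1. */
static int search_res(unsigned short needed, unsigned short id)
{
	unsigned short start = 0, end = 0;

	if (needed > TRACKER_END)
		return -1;

	id |= RES_VALID_BIT;
	do {
		/* restart the window after any entry already owned */
		if (tracker[end++] & RES_VALID_BIT) {
			start = end;
			if (start + needed > TRACKER_END)
				break;
		}
		if (end == start + needed) {
			while (start != end)
				tracker[start++] = id;	/* claim the block */
			return end - needed;
		}
	} while (end < TRACKER_END);

	return -1;
}

/* Release a block previously claimed under 'id' starting at 'index'. */
static int free_res(unsigned short index, unsigned short id)
{
	int count = 0;

	id |= RES_VALID_BIT;
	while (index < TRACKER_END && tracker[index] == id) {
		tracker[index++] = 0;
		count++;
	}
	return count;
}

int main(void)
{
	int a = search_res(4, 1);	/* first owner asks for 4 vectors */
	int b = search_res(2, 2);	/* second owner asks for 2 more */

	printf("owner 1 base=%d, owner 2 base=%d\n", a, b);
	printf("freed %d entries for owner 1\n", free_res(a, 1));
	return 0;
}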
+ * Returns the base item index of the block, or -ENOMEM for error */ -static int ice_vsi_alloc_q_vector(struct ice_vsi *vsi, int v_idx) +static int ice_search_res(struct ice_res_tracker *res, u16 needed, u16 id) { - struct ice_pf *pf = vsi->back; - struct ice_q_vector *q_vector; + u16 start = 0, end = 0; - /* allocate q_vector */ - q_vector = devm_kzalloc(&pf->pdev->dev, sizeof(*q_vector), GFP_KERNEL); - if (!q_vector) + if (needed > res->end) return -ENOMEM; - q_vector->vsi = vsi; - q_vector->v_idx = v_idx; - if (vsi->type == ICE_VSI_VF) - goto out; - /* only set affinity_mask if the CPU is online */ - if (cpu_online(v_idx)) - cpumask_set_cpu(v_idx, &q_vector->affinity_mask); + id |= ICE_RES_VALID_BIT; - /* This will not be called in the driver load path because the netdev - * will not be created yet. All other cases with register the NAPI - * handler here (i.e. resume, reset/rebuild, etc.) - */ - if (vsi->netdev) - netif_napi_add(vsi->netdev, &q_vector->napi, ice_napi_poll, - NAPI_POLL_WEIGHT); + do { + /* skip already allocated entries */ + if (res->list[end++] & ICE_RES_VALID_BIT) { + start = end; + if ((start + needed) > res->end) + break; + } -out: - /* tie q_vector and VSI together */ - vsi->q_vectors[v_idx] = q_vector; + if (end == (start + needed)) { + int i = start; - return 0; + /* there was enough, so assign it to the requestor */ + while (i != end) + res->list[i++] = id; + + return start; + } + } while (end < res->end); + + return -ENOMEM; } /** - * ice_vsi_alloc_q_vectors - Allocate memory for interrupt vectors - * @vsi: the VSI being configured - * - * We allocate one q_vector per queue interrupt. If allocation fails we - * return -ENOMEM. + * ice_get_free_res_count - Get free count from a resource tracker + * @res: Resource tracker instance */ -static int ice_vsi_alloc_q_vectors(struct ice_vsi *vsi) +static u16 ice_get_free_res_count(struct ice_res_tracker *res) { - struct ice_pf *pf = vsi->back; - int v_idx = 0, num_q_vectors; - int err; + u16 i, count = 0; - if (vsi->q_vectors[0]) { - dev_dbg(&pf->pdev->dev, "VSI %d has existing q_vectors\n", - vsi->vsi_num); - return -EEXIST; - } + for (i = 0; i < res->end; i++) + if (!(res->list[i] & ICE_RES_VALID_BIT)) + count++; - num_q_vectors = vsi->num_q_vectors; + return count; +} - for (v_idx = 0; v_idx < num_q_vectors; v_idx++) { - err = ice_vsi_alloc_q_vector(vsi, v_idx); - if (err) - goto err_out; - } +/** + * ice_get_valid_res_count - Get in-use count from a resource tracker + * @res: Resource tracker instance + */ +u16 ice_get_valid_res_count(struct ice_res_tracker *res) +{ + return res->end - ice_get_free_res_count(res); +} - return 0; +/** + * ice_get_res - get a block of resources + * @pf: board private structure + * @res: pointer to the resource + * @needed: size of the block needed + * @id: identifier to track owner + * + * Returns the base item index of the block, or negative for error + */ +int +ice_get_res(struct ice_pf *pf, struct ice_res_tracker *res, u16 needed, u16 id) +{ + if (!res || !pf) + return -EINVAL; -err_out: - while (v_idx--) - ice_free_q_vector(vsi, v_idx); + if (!needed || needed > res->num_entries || id >= ICE_RES_VALID_BIT) { + dev_err(ice_pf_to_dev(pf), "param err: needed=%d, num_entries = %d id=0x%04x\n", + needed, res->num_entries, id); + return -EINVAL; + } - dev_err(&pf->pdev->dev, - "Failed to allocate %d q_vector for VSI %d, ret=%d\n", - vsi->num_q_vectors, vsi->vsi_num, err); - vsi->num_q_vectors = 0; - return err; + return ice_search_res(res, needed, id); } /** @@ -1233,28 +1458,51 @@ 
static int ice_vsi_alloc_q_vectors(struct ice_vsi *vsi) static int ice_vsi_setup_vector_base(struct ice_vsi *vsi) { struct ice_pf *pf = vsi->back; + struct device *dev; u16 num_q_vectors; + int base = 0; + dev = ice_pf_to_dev(pf); /* SRIOV doesn't grab irq_tracker entries for each VSI */ if (vsi->type == ICE_VSI_VF) return 0; + if (vsi->type == ICE_VSI_CHNL) + return 0; if (vsi->base_vector) { - dev_dbg(&pf->pdev->dev, "VSI %d has non-zero base vector %d\n", + dev_dbg(dev, "VSI %d has non-zero base vector %d\n", vsi->vsi_num, vsi->base_vector); return -EEXIST; } num_q_vectors = vsi->num_q_vectors; /* reserve slots from OS requested IRQs */ - vsi->base_vector = ice_get_res(pf, pf->irq_tracker, num_q_vectors, - vsi->idx); - if (vsi->base_vector < 0) { - dev_err(&pf->pdev->dev, - "Failed to get tracking for %d vectors for VSI %d, err=%d\n", - num_q_vectors, vsi->vsi_num, vsi->base_vector); + if (vsi->type == ICE_VSI_CTRL && vsi->vf_id != ICE_INVAL_VFID) { + int i; + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; + + if (i != vsi->vf_id && vf->ctrl_vsi_idx != ICE_NO_VSI) { + base = pf->vsi[vf->ctrl_vsi_idx]->base_vector; + break; + } + } + if (i == pf->num_alloc_vfs) + base = ice_get_res(pf, pf->irq_tracker, num_q_vectors, + ICE_RES_VF_CTRL_VEC_ID); + } else { + base = ice_get_res(pf, pf->irq_tracker, num_q_vectors, + vsi->idx); + } + + if (base < 0) { + dev_err(dev, "%d MSI-X interrupts available. %s %d failed to get %d MSI-X vectors\n", + ice_get_free_res_count(pf->irq_tracker), + ice_vsi_type_str(vsi->type), vsi->idx, num_q_vectors); return -ENOENT; } + vsi->base_vector = (u16)base; pf->num_avail_sw_msix -= num_q_vectors; return 0; @@ -1268,11 +1516,23 @@ static void ice_vsi_clear_rings(struct ice_vsi *vsi) { int i; + /* Avoid stale references by clearing map from vector to ring */ + if (vsi->q_vectors) { + ice_for_each_q_vector(vsi, i) { + struct ice_q_vector *q_vector = vsi->q_vectors[i]; + + if (q_vector) { + q_vector->tx.ring = NULL; + q_vector->rx.ring = NULL; + } + } + } + if (vsi->tx_rings) { for (i = 0; i < vsi->alloc_txq; i++) { if (vsi->tx_rings[i]) { kfree_rcu(vsi->tx_rings[i], rcu); - vsi->tx_rings[i] = NULL; + WRITE_ONCE(vsi->tx_rings[i], NULL); } } } @@ -1280,7 +1540,7 @@ static void ice_vsi_clear_rings(struct ice_vsi *vsi) for (i = 0; i < vsi->alloc_rxq; i++) { if (vsi->rx_rings[i]) { kfree_rcu(vsi->rx_rings[i], rcu); - vsi->rx_rings[i] = NULL; + WRITE_ONCE(vsi->rx_rings[i], NULL); } } } @@ -1292,9 +1552,12 @@ static void ice_vsi_clear_rings(struct ice_vsi *vsi) */ static int ice_vsi_alloc_rings(struct ice_vsi *vsi) { + bool dvm_ena = ice_is_dvm_ena(&vsi->back->hw); struct ice_pf *pf = vsi->back; - int i; + struct device *dev; + u16 i; + dev = ice_pf_to_dev(pf); /* Allocate Tx rings */ for (i = 0; i < vsi->alloc_txq; i++) { struct ice_ring *ring; @@ -1307,11 +1570,14 @@ static int ice_vsi_alloc_rings(struct ice_vsi *vsi) ring->q_index = i; ring->reg_idx = vsi->txq_map[i]; - ring->ring_active = false; ring->vsi = vsi; - ring->dev = &pf->pdev->dev; + ring->dev = dev; ring->count = vsi->num_tx_desc; - vsi->tx_rings[i] = ring; + if (dvm_ena) + ring->flags |= ICE_TX_FLAGS_VLAN_TAG_LOC_L2TAG2; + else + ring->flags |= ICE_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; + WRITE_ONCE(vsi->tx_rings[i], ring); } /* Allocate Rx rings */ @@ -1325,12 +1591,11 @@ static int ice_vsi_alloc_rings(struct ice_vsi *vsi) ring->q_index = i; ring->reg_idx = vsi->rxq_map[i]; - ring->ring_active = false; ring->vsi = vsi; ring->netdev = vsi->netdev; - ring->dev = &pf->pdev->dev; + ring->dev = dev; 
ring->count = vsi->num_rx_desc; - vsi->rx_rings[i] = ring; + WRITE_ONCE(vsi->rx_rings[i], ring); } return 0; @@ -1341,63 +1606,41 @@ static int ice_vsi_alloc_rings(struct ice_vsi *vsi) } /** - * ice_vsi_map_rings_to_vectors - Map VSI rings to interrupt vectors - * @vsi: the VSI being configured - * - * This function maps descriptor rings to the queue-specific vectors allotted - * through the MSI-X enabling code. On a constrained vector budget, we map Tx - * and Rx rings to the vector as "efficiently" as possible. + * ice_vsi_reset_stats - Reset all stats of a given VSI + * @vsi: the VSI whose stats needs to be cleared */ -#ifdef CONFIG_DCB -void ice_vsi_map_rings_to_vectors(struct ice_vsi *vsi) -#else -static void ice_vsi_map_rings_to_vectors(struct ice_vsi *vsi) -#endif /* CONFIG_DCB */ -{ - int q_vectors = vsi->num_q_vectors; - int tx_rings_rem, rx_rings_rem; - int v_id; - - /* initially assigning remaining rings count to VSIs num queue value */ - tx_rings_rem = vsi->num_txq; - rx_rings_rem = vsi->num_rxq; - - for (v_id = 0; v_id < q_vectors; v_id++) { - struct ice_q_vector *q_vector = vsi->q_vectors[v_id]; - int tx_rings_per_v, rx_rings_per_v, q_id, q_base; - - /* Tx rings mapping to vector */ - tx_rings_per_v = DIV_ROUND_UP(tx_rings_rem, q_vectors - v_id); - q_vector->num_ring_tx = tx_rings_per_v; - q_vector->tx.ring = NULL; - q_vector->tx.itr_idx = ICE_TX_ITR; - q_base = vsi->num_txq - tx_rings_rem; - - for (q_id = q_base; q_id < (q_base + tx_rings_per_v); q_id++) { - struct ice_ring *tx_ring = vsi->tx_rings[q_id]; - - tx_ring->q_vector = q_vector; - tx_ring->next = q_vector->tx.ring; - q_vector->tx.ring = tx_ring; - } - tx_rings_rem -= tx_rings_per_v; +static void ice_vsi_reset_stats(struct ice_vsi *vsi) +{ + int i; - /* Rx rings mapping to vector */ - rx_rings_per_v = DIV_ROUND_UP(rx_rings_rem, q_vectors - v_id); - q_vector->num_ring_rx = rx_rings_per_v; - q_vector->rx.ring = NULL; - q_vector->rx.itr_idx = ICE_RX_ITR; - q_base = vsi->num_rxq - rx_rings_rem; + if (!vsi) + return; - for (q_id = q_base; q_id < (q_base + rx_rings_per_v); q_id++) { - struct ice_ring *rx_ring = vsi->rx_rings[q_id]; + memset(&vsi->net_stats, 0, sizeof(vsi->net_stats)); + memset(&vsi->eth_stats, 0, sizeof(vsi->eth_stats)); + memset(&vsi->eth_stats_prev, 0, sizeof(vsi->eth_stats_prev)); - rx_ring->q_vector = q_vector; - rx_ring->next = q_vector->rx.ring; - q_vector->rx.ring = rx_ring; + if (vsi->tx_rings) { + ice_for_each_txq(vsi, i) { + if (vsi->tx_rings[i]) { + memset(&vsi->tx_rings[i]->stats, 0, + sizeof(vsi->tx_rings[i]->stats)); + memset(&vsi->tx_rings[i]->tx_stats, 0, + sizeof(vsi->tx_rings[i])->tx_stats); + } + } + } + if (vsi->rx_rings) { + ice_for_each_rxq(vsi, i) { + if (vsi->rx_rings[i]) { + memset(&vsi->rx_rings[i]->stats, 0, + sizeof(vsi->rx_rings[i]->stats)); + memset(&vsi->rx_rings[i]->rx_stats, 0, + sizeof(vsi->rx_rings[i])->rx_stats); + } } - rx_rings_rem -= rx_rings_per_v; } + vsi->stat_offsets_loaded = false; } /** @@ -1409,15 +1652,13 @@ static void ice_vsi_map_rings_to_vectors(struct ice_vsi *vsi) * LUT, while in the event of enable request for RSS, it will reconfigure RSS * LUT. 
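ice_vsi_manage_rss_lut() above either restores the user or derived LUT on enable, or writes an all-zero LUT on disable so all traffic lands on queue 0. A standalone sketch of the round-robin fill that ice_fill_rss_lut() is expected to perform follows; the 64-entry size is taken from the VSI LUT comment earlier in this change.

/* Sketch only: round-robin RSS LUT fill, lut[i] maps a hash bucket to a queue. */
#include <stdio.h>

static void fill_rss_lut(unsigned char *lut, unsigned int lut_size,
			 unsigned int rss_size)
{
	unsigned int i;

	for (i = 0; i < lut_size; i++)
		lut[i] = i % rss_size;	/* spread buckets across rss_size queues */
}

int main(void)
{
	unsigned char lut[64];	/* VSI-type LUT: 64 bytes per the comment above */
	unsigned int i;

	fill_rss_lut(lut, sizeof(lut), 4);	/* 4 Rx queues in use (assumed) */
	for (i = 0; i < 8; i++)
		printf("lut[%u] -> queue %u\n", i, lut[i]);
	/* an all-zero LUT, as written on the disable path, steers every
	 * bucket to queue 0 and effectively parks the remaining Rx queues
	 */
	return 0;
}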
*/ -int ice_vsi_manage_rss_lut(struct ice_vsi *vsi, bool ena) +void ice_vsi_manage_rss_lut(struct ice_vsi *vsi, bool ena) { - int err = 0; u8 *lut; - lut = devm_kzalloc(&vsi->back->pdev->dev, vsi->rss_table_size, - GFP_KERNEL); + lut = kzalloc(vsi->rss_table_size, GFP_KERNEL); if (!lut) - return -ENOMEM; + return; if (ena) { if (vsi->rss_lut_user) @@ -1427,26 +1668,45 @@ int ice_vsi_manage_rss_lut(struct ice_vsi *vsi, bool ena) vsi->rss_size); } - err = ice_set_rss(vsi, NULL, lut, vsi->rss_table_size); - devm_kfree(&vsi->back->pdev->dev, lut); - return err; + ice_set_rss_lut(vsi, lut, vsi->rss_table_size); + kfree(lut); } /** * ice_vsi_cfg_rss_lut_key - Configure RSS params for a VSI * @vsi: VSI to be configured */ -static int ice_vsi_cfg_rss_lut_key(struct ice_vsi *vsi) +int ice_vsi_cfg_rss_lut_key(struct ice_vsi *vsi) { - struct ice_aqc_get_set_rss_keys *key; struct ice_pf *pf = vsi->back; - enum ice_status status; - int err = 0; - u8 *lut; + struct device *dev; + u8 *lut, *key; + int err; + + dev = ice_pf_to_dev(pf); +#ifdef NETIF_F_HW_TC + if (vsi->type == ICE_VSI_PF && vsi->ch_rss_size && + (test_bit(ICE_FLAG_TC_MQPRIO, pf->flags))) { + vsi->rss_size = min_t(u16, vsi->rss_size, vsi->ch_rss_size); + } else { + vsi->rss_size = min_t(u16, vsi->rss_size, vsi->num_rxq); - vsi->rss_size = min_t(int, vsi->rss_size, vsi->num_rxq); + /* If orig_rss_size is valid and it is less than determined + * main VSI's rss_size, update main VSI's rss_size to be + * orig_rss_size so that when tc-qdisc is deleted, main VSI + * RSS table gets programmed to be correct (whatever it was + * to begin with (prior to setup-tc for ADQ config) + */ + if (vsi->orig_rss_size && vsi->rss_size < vsi->orig_rss_size && + vsi->orig_rss_size <= vsi->num_rxq) { + vsi->rss_size = vsi->orig_rss_size; + /* now orig_rss_size is used, reset it to zero */ + vsi->orig_rss_size = 0; + } + } +#endif /* NETIF_F_HW_TC */ - lut = devm_kzalloc(&pf->pdev->dev, vsi->rss_table_size, GFP_KERNEL); + lut = kzalloc(vsi->rss_table_size, GFP_KERNEL); if (!lut) return -ENOMEM; @@ -1455,75 +1715,146 @@ static int ice_vsi_cfg_rss_lut_key(struct ice_vsi *vsi) else ice_fill_rss_lut(lut, vsi->rss_table_size, vsi->rss_size); - status = ice_aq_set_rss_lut(&pf->hw, vsi->idx, vsi->rss_lut_type, lut, - vsi->rss_table_size); - - if (status) { - dev_err(&pf->pdev->dev, - "set_rss_lut failed, error %d\n", status); - err = -EIO; + err = ice_set_rss_lut(vsi, lut, vsi->rss_table_size); + if (err) { + dev_err(dev, "set_rss_lut failed, error %d\n", err); goto ice_vsi_cfg_rss_exit; } - key = devm_kzalloc(&pf->pdev->dev, sizeof(*key), GFP_KERNEL); + key = kzalloc(ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE, GFP_KERNEL); if (!key) { err = -ENOMEM; goto ice_vsi_cfg_rss_exit; } if (vsi->rss_hkey_user) - memcpy(key, - (struct ice_aqc_get_set_rss_keys *)vsi->rss_hkey_user, - ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE); + memcpy(key, vsi->rss_hkey_user, ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE); else - netdev_rss_key_fill((void *)key, - ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE); - - status = ice_aq_set_rss_key(&pf->hw, vsi->idx, key); + netdev_rss_key_fill((void *)key, ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE); - if (status) { - dev_err(&pf->pdev->dev, "set_rss_key failed, error %d\n", - status); - err = -EIO; - } + err = ice_set_rss_key(vsi, key); + if (err) + dev_err(dev, "set_rss_key failed, error %d\n", err); - devm_kfree(&pf->pdev->dev, key); + kfree(key); ice_vsi_cfg_rss_exit: - devm_kfree(&pf->pdev->dev, lut); + kfree(lut); return err; } /** - * ice_add_mac_to_list - Add a MAC address filter 
entry to the list - * @vsi: the VSI to be forwarded to - * @add_list: pointer to the list which contains MAC filter entries - * @macaddr: the MAC address to be added. + * ice_vsi_set_vf_rss_flow_fld - Sets VF VSI RSS input set for different flows + * @vsi: VSI to be configured * - * Adds MAC address filter entry to the temp list + * This function will only be called during the VF VSI setup. Upon successful + * completion of package download, this function will configure default RSS + * input sets for VF VSI. + */ +static void ice_vsi_set_vf_rss_flow_fld(struct ice_vsi *vsi) +{ + struct ice_pf *pf = vsi->back; + enum ice_status status; + struct device *dev; + + dev = ice_pf_to_dev(pf); + if (ice_is_safe_mode(pf)) { + dev_dbg(dev, "Advanced RSS disabled. Package download failed, vsi num = %d\n", + vsi->vsi_num); + return; + } + + status = ice_add_avf_rss_cfg(&pf->hw, vsi->idx, ICE_DEFAULT_RSS_HENA); + if (status) + dev_dbg(dev, "ice_add_avf_rss_cfg failed for vsi = %d, error = %s\n", + vsi->vsi_num, ice_stat_str(status)); +} + + +static const struct ice_rss_hash_cfg default_rss_cfgs[] = { + /* configure RSS for IPv4 with input set IP src/dst */ + {ICE_FLOW_SEG_HDR_IPV4, ICE_FLOW_HASH_IPV4, ICE_RSS_ANY_HEADERS, false}, + /* configure RSS for IPv6 with input set IPv6 src/dst */ + {ICE_FLOW_SEG_HDR_IPV6, ICE_FLOW_HASH_IPV6, ICE_RSS_ANY_HEADERS, false}, + /* configure RSS for tcp4 with input set IP src/dst, TCP src/dst */ + {ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_IPV4, + ICE_HASH_TCP_IPV4, ICE_RSS_ANY_HEADERS, false}, + /* configure RSS for udp4 with input set IP src/dst, UDP src/dst */ + {ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_IPV4, + ICE_HASH_UDP_IPV4, ICE_RSS_ANY_HEADERS, false}, + /* configure RSS for sctp4 with input set IP src/dst */ + {ICE_FLOW_SEG_HDR_SCTP | ICE_FLOW_SEG_HDR_IPV4, + ICE_HASH_SCTP_IPV4, ICE_RSS_ANY_HEADERS, false}, + /* configure RSS for tcp6 with input set IPv6 src/dst, TCP src/dst */ + {ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_IPV6, + ICE_HASH_TCP_IPV6, ICE_RSS_ANY_HEADERS, false}, + /* configure RSS for udp6 with input set IPv6 src/dst, UDP src/dst */ + {ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_IPV6, + ICE_HASH_UDP_IPV6, ICE_RSS_ANY_HEADERS, false}, + /* configure RSS for sctp6 with input set IPv6 src/dst */ + {ICE_FLOW_SEG_HDR_SCTP | ICE_FLOW_SEG_HDR_IPV6, + ICE_HASH_SCTP_IPV6, ICE_RSS_ANY_HEADERS, false}, +}; + +/** + * ice_vsi_set_rss_flow_fld - Sets RSS input set for different flows + * @vsi: VSI to be configured * - * Returns 0 on success or ENOMEM on failure. + * This function will only be called after successful download package call + * during initialization of PF. Since the downloaded package will erase the + * RSS section, this function will configure RSS input sets for different + * flow types. The last profile added has the highest priority, therefore 2 + * tuple profiles (i.e. IPv4 src/dst) are added before 4 tuple profiles + * (i.e. IPv4 src/dst TCP src/dst port). */ -int ice_add_mac_to_list(struct ice_vsi *vsi, struct list_head *add_list, - const u8 *macaddr) +static void ice_vsi_set_rss_flow_fld(struct ice_vsi *vsi) { - struct ice_fltr_list_entry *tmp; + u16 vsi_handle = vsi->idx, vsi_num = vsi->vsi_num; struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + struct device *dev; + u32 i; - tmp = devm_kzalloc(&pf->pdev->dev, sizeof(*tmp), GFP_ATOMIC); - if (!tmp) - return -ENOMEM; + dev = ice_pf_to_dev(pf); + if (ice_is_safe_mode(pf)) { + dev_dbg(dev, "Advanced RSS disabled. 
Package download failed, vsi num = %d\n", + vsi_num); + return; + } - tmp->fltr_info.flag = ICE_FLTR_TX; - tmp->fltr_info.src_id = ICE_SRC_ID_VSI; - tmp->fltr_info.lkup_type = ICE_SW_LKUP_MAC; - tmp->fltr_info.fltr_act = ICE_FWD_TO_VSI; - tmp->fltr_info.vsi_handle = vsi->idx; - ether_addr_copy(tmp->fltr_info.l_data.mac.mac_addr, macaddr); + for (i = 0; i < ARRAY_SIZE(default_rss_cfgs); i++) { + const struct ice_rss_hash_cfg *cfg = &default_rss_cfgs[i]; - INIT_LIST_HEAD(&tmp->list_entry); - list_add(&tmp->list_entry, add_list); + status = ice_add_rss_cfg(hw, vsi_handle, cfg); + if (status) + dev_dbg(dev, "ice_add_rss_cfg failed, addl_hdrs = %x, hash_flds = %llx, hdr_type = %d, symm = %d\n", + cfg->addl_hdrs, cfg->hash_flds, cfg->hdr_type, + cfg->symm); + } +} - return 0; +/** + * ice_pf_state_is_nominal - checks the PF for nominal state + * @pf: pointer to PF to check + * + * Check the PF's state for a collection of bits that would indicate + * the PF is in a state that would inhibit normal operation for + * driver functionality. + * + * Returns true if PF is in a nominal state, false otherwise + */ +bool ice_pf_state_is_nominal(struct ice_pf *pf) +{ + DECLARE_BITMAP(check_bits, ICE_STATE_NBITS) = { 0 }; + + if (!pf) + return false; + + bitmap_set(check_bits, 0, ICE_STATE_NOMINAL_CHECK_BITS); + if (bitmap_intersects(pf->state, check_bits, ICE_STATE_NBITS)) + return false; + + return true; } /** @@ -1573,103 +1904,86 @@ void ice_update_eth_stats(struct ice_vsi *vsi) } /** - * ice_free_fltr_list - free filter lists helper - * @dev: pointer to the device struct - * @h: pointer to the list head to be freed - * - * Helper function to free filter lists previously created using - * ice_add_mac_to_list + * ice_vsi_cfg_frame_size - setup max frame size and Rx buffer length + * @vsi: VSI */ -void ice_free_fltr_list(struct device *dev, struct list_head *h) +void ice_vsi_cfg_frame_size(struct ice_vsi *vsi) { - struct ice_fltr_list_entry *e, *tmp; - - list_for_each_entry_safe(e, tmp, h, list_entry) { - list_del(&e->list_entry); - devm_kfree(dev, e); + if (!vsi->netdev || test_bit(ICE_FLAG_LEGACY_RX, vsi->back->flags)) { + vsi->max_frame = ICE_AQ_SET_MAC_FRAME_SIZE_MAX; + vsi->rx_buf_len = ICE_RXBUF_2048; +#if (PAGE_SIZE < 8192) + } else if (!ICE_2K_TOO_SMALL_WITH_PADDING && + (vsi->netdev->mtu <= ETH_DATA_LEN)) { + vsi->max_frame = ICE_RXBUF_1536 - NET_IP_ALIGN; + vsi->rx_buf_len = ICE_RXBUF_1536 - NET_IP_ALIGN; +#endif + } else { + vsi->max_frame = ICE_AQ_SET_MAC_FRAME_SIZE_MAX; +#if (PAGE_SIZE < 8192) + vsi->rx_buf_len = ICE_RXBUF_3072; +#else + vsi->rx_buf_len = ICE_RXBUF_2048; +#endif } } /** - * ice_vsi_add_vlan - Add VSI membership for given VLAN - * @vsi: the VSI being configured - * @vid: VLAN ID to be added + * ice_write_qrxflxp_cntxt - write/configure QRXFLXP_CNTXT register + * @hw: HW pointer + * @pf_q: index of the Rx queue in the PF's queue space + * @rxdid: flexible descriptor RXDID + * @prio: priority for the RXDID for this queue + * @ena_ts: true to enable timestamp and false to disable timestamp */ -int ice_vsi_add_vlan(struct ice_vsi *vsi, u16 vid) +void +ice_write_qrxflxp_cntxt(struct ice_hw *hw, u16 pf_q, u32 rxdid, u32 prio, + bool ena_ts) { - struct ice_fltr_list_entry *tmp; - struct ice_pf *pf = vsi->back; - LIST_HEAD(tmp_add_list); - enum ice_status status; - int err = 0; + int regval = rd32(hw, QRXFLXP_CNTXT(pf_q)); - tmp = devm_kzalloc(&pf->pdev->dev, sizeof(*tmp), GFP_KERNEL); - if (!tmp) - return -ENOMEM; + /* clear any previous values */ + regval &= 
~(QRXFLXP_CNTXT_RXDID_IDX_M | + QRXFLXP_CNTXT_RXDID_PRIO_M | + QRXFLXP_CNTXT_TS_M); - tmp->fltr_info.lkup_type = ICE_SW_LKUP_VLAN; - tmp->fltr_info.fltr_act = ICE_FWD_TO_VSI; - tmp->fltr_info.flag = ICE_FLTR_TX; - tmp->fltr_info.src_id = ICE_SRC_ID_VSI; - tmp->fltr_info.vsi_handle = vsi->idx; - tmp->fltr_info.l_data.vlan.vlan_id = vid; + regval |= (rxdid << QRXFLXP_CNTXT_RXDID_IDX_S) & + QRXFLXP_CNTXT_RXDID_IDX_M; - INIT_LIST_HEAD(&tmp->list_entry); - list_add(&tmp->list_entry, &tmp_add_list); + regval |= (prio << QRXFLXP_CNTXT_RXDID_PRIO_S) & + QRXFLXP_CNTXT_RXDID_PRIO_M; - status = ice_add_vlan(&pf->hw, &tmp_add_list); - if (status) { - err = -ENODEV; - dev_err(&pf->pdev->dev, "Failure Adding VLAN %d on VSI %i\n", - vid, vsi->vsi_num); - } + if (ena_ts) + /* Enable TimeSync on this queue */ + regval |= QRXFLXP_CNTXT_TS_M; - ice_free_fltr_list(&pf->pdev->dev, &tmp_add_list); - return err; + wr32(hw, QRXFLXP_CNTXT(pf_q), regval); } -/** - * ice_vsi_kill_vlan - Remove VSI membership for a given VLAN - * @vsi: the VSI being configured - * @vid: VLAN ID to be removed - * - * Returns 0 on success and negative on failure - */ -int ice_vsi_kill_vlan(struct ice_vsi *vsi, u16 vid) +int ice_vsi_cfg_single_rxq(struct ice_vsi *vsi, u16 q_idx) { - struct ice_fltr_list_entry *list; - struct ice_pf *pf = vsi->back; - LIST_HEAD(tmp_add_list); - enum ice_status status; - int err = 0; + if (q_idx >= vsi->num_rxq) + return -EINVAL; + + return ice_vsi_cfg_rxq(vsi->rx_rings[q_idx]); +} + +int ice_vsi_cfg_single_txq(struct ice_vsi *vsi, struct ice_ring **tx_rings, u16 q_idx) +{ + struct ice_aqc_add_tx_qgrp *qg_buf; + int err; + + if (q_idx >= vsi->alloc_txq || !tx_rings || !tx_rings[q_idx]) + return -EINVAL; - list = devm_kzalloc(&pf->pdev->dev, sizeof(*list), GFP_KERNEL); - if (!list) + qg_buf = kzalloc(struct_size(qg_buf, txqs, 1), GFP_KERNEL); + if (!qg_buf) return -ENOMEM; - list->fltr_info.lkup_type = ICE_SW_LKUP_VLAN; - list->fltr_info.vsi_handle = vsi->idx; - list->fltr_info.fltr_act = ICE_FWD_TO_VSI; - list->fltr_info.l_data.vlan.vlan_id = vid; - list->fltr_info.flag = ICE_FLTR_TX; - list->fltr_info.src_id = ICE_SRC_ID_VSI; - - INIT_LIST_HEAD(&list->list_entry); - list_add(&list->list_entry, &tmp_add_list); - - status = ice_remove_vlan(&pf->hw, &tmp_add_list); - if (status == ICE_ERR_DOES_NOT_EXIST) { - dev_dbg(&pf->pdev->dev, - "Failed to remove VLAN %d on VSI %i, it does not exist, status: %d\n", - vid, vsi->vsi_num, status); - } else if (status) { - dev_err(&pf->pdev->dev, - "Error removing VLAN %d on vsi %i error: %d\n", - vid, vsi->vsi_num, status); - err = -EIO; - } + qg_buf->num_txqs = 1; - ice_free_fltr_list(&pf->pdev->dev, &tmp_add_list); + err = ice_vsi_cfg_txq(vsi, tx_rings[q_idx], qg_buf); + kfree(qg_buf); return err; } @@ -1687,83 +2001,16 @@ int ice_vsi_cfg_rxqs(struct ice_vsi *vsi) if (vsi->type == ICE_VSI_VF) goto setup_rings; - if (vsi->netdev && vsi->netdev->mtu > ETH_DATA_LEN) - vsi->max_frame = vsi->netdev->mtu + - ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN; - else - vsi->max_frame = ICE_RXBUF_2048; - - vsi->rx_buf_len = ICE_RXBUF_2048; + ice_vsi_cfg_frame_size(vsi); setup_rings: /* set up individual rings */ - for (i = 0; i < vsi->num_rxq; i++) { - int err; + ice_for_each_rxq(vsi, i) { + int err = ice_vsi_cfg_rxq(vsi->rx_rings[i]); - err = ice_setup_rx_ctx(vsi->rx_rings[i]); - if (err) { - dev_err(&vsi->back->pdev->dev, - "ice_setup_rx_ctx failed for RxQ %d, err %d\n", - i, err); + if (err) return err; - } - } - - return 0; -} - -/** - * ice_vsi_cfg_txq - Configure single Tx queue - * 
@vsi: the VSI that queue belongs to - * @ring: Tx ring to be configured - * @tc_q_idx: queue index within given TC - * @qg_buf: queue group buffer - * @tc: TC that Tx ring belongs to - */ -static int -ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_ring *ring, u16 tc_q_idx, - struct ice_aqc_add_tx_qgrp *qg_buf, u8 tc) -{ - struct ice_tlan_ctx tlan_ctx = { 0 }; - struct ice_aqc_add_txqs_perq *txq; - struct ice_pf *pf = vsi->back; - u8 buf_len = sizeof(*qg_buf); - enum ice_status status; - u16 pf_q; - - pf_q = ring->reg_idx; - ice_setup_tx_ctx(ring, &tlan_ctx, pf_q); - /* copy context contents into the qg_buf */ - qg_buf->txqs[0].txq_id = cpu_to_le16(pf_q); - ice_set_ctx((u8 *)&tlan_ctx, qg_buf->txqs[0].txq_ctx, - ice_tlan_ctx_info); - - /* init queue specific tail reg. It is referred as - * transmit comm scheduler queue doorbell. - */ - ring->tail = pf->hw.hw_addr + QTX_COMM_DBELL(pf_q); - - /* Add unique software queue handle of the Tx queue per - * TC into the VSI Tx ring - */ - ring->q_handle = tc_q_idx; - - status = ice_ena_vsi_txq(vsi->port_info, vsi->idx, tc, ring->q_handle, - 1, qg_buf, buf_len, NULL); - if (status) { - dev_err(&pf->pdev->dev, - "Failed to set LAN Tx queue context, error: %d\n", - status); - return -ENODEV; } - /* Add Tx Queue TEID into the VSI Tx ring from the - * response. This will complete configuring and - * enabling the queue. - */ - txq = &qg_buf->txqs[0]; - if (pf_q == le16_to_cpu(txq->txq_id)) - ring->txq_teid = le32_to_cpu(txq->q_teid); - return 0; } @@ -1771,42 +2018,31 @@ ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_ring *ring, u16 tc_q_idx, * ice_vsi_cfg_txqs - Configure the VSI for Tx * @vsi: the VSI being configured * @rings: Tx ring array to be configured - * @offset: offset within vsi->txq_map * * Return 0 on success and a negative value on error * Configure the Tx VSI for operation. */ static int -ice_vsi_cfg_txqs(struct ice_vsi *vsi, struct ice_ring **rings, int offset) +ice_vsi_cfg_txqs(struct ice_vsi *vsi, struct ice_ring **rings) { struct ice_aqc_add_tx_qgrp *qg_buf; - struct ice_pf *pf = vsi->back; - u16 q_idx = 0, i; + u16 q_idx = 0; int err = 0; - u8 tc; - qg_buf = devm_kzalloc(&pf->pdev->dev, sizeof(*qg_buf), GFP_KERNEL); + qg_buf = kzalloc(struct_size(qg_buf, txqs, 1), GFP_KERNEL); if (!qg_buf) return -ENOMEM; qg_buf->num_txqs = 1; - /* set up and configure the Tx queues for each enabled TC */ - ice_for_each_traffic_class(tc) { - if (!(vsi->tc_cfg.ena_tc & BIT(tc))) - break; - - for (i = 0; i < vsi->tc_cfg.tc_info[tc].qcount_tx; i++) { - err = ice_vsi_cfg_txq(vsi, rings[q_idx], i + offset, - qg_buf, tc); - if (err) - goto err_cfg_txqs; - - q_idx++; - } + for (q_idx = 0; q_idx < vsi->num_txq; q_idx++) { + err = ice_vsi_cfg_txq(vsi, rings[q_idx], qg_buf); + if (err) + goto err_cfg_txqs; } + err_cfg_txqs: - devm_kfree(&pf->pdev->dev, qg_buf); + kfree(qg_buf); return err; } @@ -1819,8 +2055,36 @@ ice_vsi_cfg_txqs(struct ice_vsi *vsi, struct ice_ring **rings, int offset) */ int ice_vsi_cfg_lan_txqs(struct ice_vsi *vsi) { - return ice_vsi_cfg_txqs(vsi, vsi->tx_rings, 0); + return ice_vsi_cfg_txqs(vsi, vsi->tx_rings); +} + +#ifdef HAVE_XDP_SUPPORT +/** + * ice_vsi_cfg_xdp_txqs - Configure Tx queues dedicated for XDP in given VSI + * @vsi: the VSI being configured + * + * Return 0 on success and a negative value on error + * Configure the Tx queues dedicated for XDP in given VSI for operation. 
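ice_vsi_cfg_txqs() and ice_vsi_cfg_single_txq() above size the add-Tx-queue buffer with struct_size(qg_buf, txqs, 1), that is, the struct header plus one flexible-array element per admin queue call. Below is a userspace sketch of that allocation pattern; the struct layout is simplified and struct_size() additionally performs overflow checking in the kernel.

/* Sketch only: allocating a header plus one flexible-array element. */
#include <stdio.h>
#include <stdlib.h>

struct txq_entry {
	unsigned short txq_id;	/* stand-in for the per-queue AQ entry */
};

struct tx_qgrp {
	unsigned char num_txqs;
	struct txq_entry txqs[];	/* flexible array member */
};

int main(void)
{
	/* equivalent of struct_size(qg, txqs, 1): header + 1 array element */
	size_t len = sizeof(struct tx_qgrp) + 1 * sizeof(struct txq_entry);
	struct tx_qgrp *qg = calloc(1, len);

	if (!qg)
		return 1;
	qg->num_txqs = 1;	/* one queue configured per AQ call on this path */
	qg->txqs[0].txq_id = 42;
	printf("allocated %zu bytes for a 1-entry queue group\n", len);
	free(qg);
	return 0;
}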
+ */ +int ice_vsi_cfg_xdp_txqs(struct ice_vsi *vsi) +{ + int ret; +#ifdef HAVE_AF_XDP_ZC_SUPPORT + int i; +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + + ret = ice_vsi_cfg_txqs(vsi, vsi->xdp_rings); +#ifdef HAVE_AF_XDP_ZC_SUPPORT + if (ret) + return ret; + + for (i = 0; i < vsi->num_xdp_txq; i++) + vsi->xdp_rings[i]->xsk_pool = ice_xsk_umem(vsi->xdp_rings[i]); +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + + return ret; } +#endif /* HAVE_XDP_SUPPORT */ /** * ice_intrl_usec_to_reg - convert interrupt rate limit to register value @@ -1830,7 +2094,7 @@ int ice_vsi_cfg_lan_txqs(struct ice_vsi *vsi) * This function converts a decimal interrupt rate limit in usecs to the format * expected by firmware. */ -u32 ice_intrl_usec_to_reg(u8 intrl, u8 gran) +static u32 ice_intrl_usec_to_reg(u8 intrl, u8 gran) { u32 val = intrl / gran; @@ -1840,138 +2104,73 @@ u32 ice_intrl_usec_to_reg(u8 intrl, u8 gran) } /** - * ice_cfg_itr_gran - set the ITR granularity to 2 usecs if not already set - * @hw: board specific structure + * ice_write_intrl - write throttle rate limit to interrupt specific register + * @q_vector: pointer to interrupt specific structure + * @intrl: throttle rate limit in microseconds to write */ -static void ice_cfg_itr_gran(struct ice_hw *hw) +void ice_write_intrl(struct ice_q_vector *q_vector, u8 intrl) { - u32 regval = rd32(hw, GLINT_CTL); - - /* no need to update global register if ITR gran is already set */ - if (!(regval & GLINT_CTL_DIS_AUTOMASK_M) && - (((regval & GLINT_CTL_ITR_GRAN_200_M) >> - GLINT_CTL_ITR_GRAN_200_S) == ICE_ITR_GRAN_US) && - (((regval & GLINT_CTL_ITR_GRAN_100_M) >> - GLINT_CTL_ITR_GRAN_100_S) == ICE_ITR_GRAN_US) && - (((regval & GLINT_CTL_ITR_GRAN_50_M) >> - GLINT_CTL_ITR_GRAN_50_S) == ICE_ITR_GRAN_US) && - (((regval & GLINT_CTL_ITR_GRAN_25_M) >> - GLINT_CTL_ITR_GRAN_25_S) == ICE_ITR_GRAN_US)) - return; + struct ice_hw *hw = &q_vector->vsi->back->hw; - regval = ((ICE_ITR_GRAN_US << GLINT_CTL_ITR_GRAN_200_S) & - GLINT_CTL_ITR_GRAN_200_M) | - ((ICE_ITR_GRAN_US << GLINT_CTL_ITR_GRAN_100_S) & - GLINT_CTL_ITR_GRAN_100_M) | - ((ICE_ITR_GRAN_US << GLINT_CTL_ITR_GRAN_50_S) & - GLINT_CTL_ITR_GRAN_50_M) | - ((ICE_ITR_GRAN_US << GLINT_CTL_ITR_GRAN_25_S) & - GLINT_CTL_ITR_GRAN_25_M); - wr32(hw, GLINT_CTL, regval); + wr32(hw, GLINT_RATE(q_vector->reg_idx), + ice_intrl_usec_to_reg(intrl, ICE_INTRL_GRAN_ABOVE_25)); } /** - * ice_cfg_itr - configure the initial interrupt throttle values - * @hw: pointer to the HW structure - * @q_vector: interrupt vector that's being configured - * - * Configure interrupt throttling values for the ring containers that are - * associated with the interrupt vector passed in. 
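As a usage illustration (not part of the change), the new ice_write_intrl() helper makes it straightforward to apply one interrupt rate limit to every vector of a VSI; the 20 microsecond value suggested below is an arbitrary example:

        /* Hedged sketch: program the same rate limit, in microseconds, on
         * every interrupt vector of a VSI via the new helper.
         */
        static void ice_set_vsi_intrl(struct ice_vsi *vsi, u8 intrl_usecs)
        {
                int i;

                ice_for_each_q_vector(vsi, i)
                        ice_write_intrl(vsi->q_vectors[i], intrl_usecs);
        }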
+ * __ice_write_itr - write throttle rate to register + * @q_vector: pointer to interrupt data structure + * @rc: pointer to ring container + * @itr: throttle rate in microseconds to write */ -static void -ice_cfg_itr(struct ice_hw *hw, struct ice_q_vector *q_vector) +static void __ice_write_itr(struct ice_q_vector *q_vector, + struct ice_ring_container *rc, u16 itr) { - ice_cfg_itr_gran(hw); - - if (q_vector->num_ring_rx) { - struct ice_ring_container *rc = &q_vector->rx; - - /* if this value is set then don't overwrite with default */ - if (!rc->itr_setting) - rc->itr_setting = ICE_DFLT_RX_ITR; - - rc->target_itr = ITR_TO_REG(rc->itr_setting); - rc->next_update = jiffies + 1; - rc->current_itr = rc->target_itr; - wr32(hw, GLINT_ITR(rc->itr_idx, q_vector->reg_idx), - ITR_REG_ALIGN(rc->current_itr) >> ICE_ITR_GRAN_S); - } - - if (q_vector->num_ring_tx) { - struct ice_ring_container *rc = &q_vector->tx; + struct ice_hw *hw = &q_vector->vsi->back->hw; - /* if this value is set then don't overwrite with default */ - if (!rc->itr_setting) - rc->itr_setting = ICE_DFLT_TX_ITR; - - rc->target_itr = ITR_TO_REG(rc->itr_setting); - rc->next_update = jiffies + 1; - rc->current_itr = rc->target_itr; - wr32(hw, GLINT_ITR(rc->itr_idx, q_vector->reg_idx), - ITR_REG_ALIGN(rc->current_itr) >> ICE_ITR_GRAN_S); - } + wr32(hw, GLINT_ITR(rc->itr_idx, q_vector->reg_idx), + ITR_REG_ALIGN(itr) >> ICE_ITR_GRAN_S); } /** - * ice_cfg_txq_interrupt - configure interrupt on Tx queue - * @vsi: the VSI being configured - * @txq: Tx queue being mapped to MSI-X vector - * @msix_idx: MSI-X vector index within the function - * @itr_idx: ITR index of the interrupt cause - * - * Configure interrupt on Tx queue by associating Tx queue to MSI-X vector - * within the function space. + * ice_write_itr - write throttle rate to queue specific register + * @rc: pointer to ring container + * @itr: throttle rate in microseconds to write */ -#ifdef CONFIG_PCI_IOV -void -ice_cfg_txq_interrupt(struct ice_vsi *vsi, u16 txq, u16 msix_idx, u16 itr_idx) -#else -static void -ice_cfg_txq_interrupt(struct ice_vsi *vsi, u16 txq, u16 msix_idx, u16 itr_idx) -#endif /* CONFIG_PCI_IOV */ +void ice_write_itr(struct ice_ring_container *rc, u16 itr) { - struct ice_pf *pf = vsi->back; - struct ice_hw *hw = &pf->hw; - u32 val; + struct ice_q_vector *q_vector; - itr_idx = (itr_idx << QINT_TQCTL_ITR_INDX_S) & QINT_TQCTL_ITR_INDX_M; + if (!rc->ring) + return; - val = QINT_TQCTL_CAUSE_ENA_M | itr_idx | - ((msix_idx << QINT_TQCTL_MSIX_INDX_S) & QINT_TQCTL_MSIX_INDX_M); + q_vector = rc->ring->q_vector; - wr32(hw, QINT_TQCTL(vsi->txq_map[txq]), val); + __ice_write_itr(q_vector, rc, itr); } /** - * ice_cfg_rxq_interrupt - configure interrupt on Rx queue - * @vsi: the VSI being configured - * @rxq: Rx queue being mapped to MSI-X vector - * @msix_idx: MSI-X vector index within the function - * @itr_idx: ITR index of the interrupt cause + * ice_set_q_vector_intrl - set up interrupt rate limiting + * @q_vector: the vector to be configured * - * Configure interrupt on Rx queue by associating Rx queue to MSI-X vector - * within the function space. + * Interrupt rate limiting is local to the vector, not per-queue so we must + * detect if either ring container has dynamic enabled to decide what to set + * the interrupt rate limit to via INTRL settings. In the case of dynamic + * disabled on both, write the value with the cached setting to make sure + * INTRL matches user visible value. 
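Note the split introduced here: INTRL (rate limit) is written per vector, while ITR (throttle interval) is written per ring container. A hedged sketch (illustration only) of pinning the Rx ITR of every vector with the new ice_write_itr() helper; 50 microseconds would be an arbitrary example, and this assumes adaptive ITR is not re-enabled elsewhere:

        /* Illustrative only: force a fixed Rx interrupt throttle on each
         * vector's Rx ring container using the helper added above.
         */
        static void ice_set_vsi_rx_itr(struct ice_vsi *vsi, u16 itr_usecs)
        {
                int i;

                ice_for_each_q_vector(vsi, i) {
                        struct ice_ring_container *rc = &vsi->q_vectors[i]->rx;

                        rc->itr_setting = itr_usecs;
                        ice_write_itr(rc, itr_usecs);
                }
        }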
*/ -#ifdef CONFIG_PCI_IOV -void -ice_cfg_rxq_interrupt(struct ice_vsi *vsi, u16 rxq, u16 msix_idx, u16 itr_idx) -#else -static void -ice_cfg_rxq_interrupt(struct ice_vsi *vsi, u16 rxq, u16 msix_idx, u16 itr_idx) -#endif /* CONFIG_PCI_IOV */ +void ice_set_q_vector_intrl(struct ice_q_vector *q_vector) { - struct ice_pf *pf = vsi->back; - struct ice_hw *hw = &pf->hw; - u32 val; - - itr_idx = (itr_idx << QINT_RQCTL_ITR_INDX_S) & QINT_RQCTL_ITR_INDX_M; - - val = QINT_RQCTL_CAUSE_ENA_M | itr_idx | - ((msix_idx << QINT_RQCTL_MSIX_INDX_S) & QINT_RQCTL_MSIX_INDX_M); - - wr32(hw, QINT_RQCTL(vsi->rxq_map[rxq]), val); - - ice_flush(hw); + if (ITR_IS_DYNAMIC(&q_vector->tx) || ITR_IS_DYNAMIC(&q_vector->rx)) { + /* in the case of dynamic enabled, cap each vector to no more + * than (4 us) 250,000 ints/sec, which allows low latency + * but still less than 500,000 interrupts per second, which + * reduces CPU a bit in the case of the lowest latency + * setting. The 4 here is a value in microseconds. + */ + ice_write_intrl(q_vector, 4); + } else { + ice_write_intrl(q_vector, q_vector->intrl); + } } /** @@ -1985,7 +2184,7 @@ void ice_vsi_cfg_msix(struct ice_vsi *vsi) { struct ice_pf *pf = vsi->back; struct ice_hw *hw = &pf->hw; - u32 txq = 0, rxq = 0; + u16 txq = 0, rxq = 0; int i, q; for (i = 0; i < vsi->num_q_vectors; i++) { @@ -1994,9 +2193,6 @@ void ice_vsi_cfg_msix(struct ice_vsi *vsi) ice_cfg_itr(hw, q_vector); - wr32(hw, GLINT_RATE(reg_idx), - ice_intrl_usec_to_reg(q_vector->intrl, hw->intrl_gran)); - /* Both Transmit Queue Interrupt Cause Control register * and Receive Queue Interrupt Cause control register * expects MSIX_INDX field to be the vector index @@ -2023,217 +2219,25 @@ void ice_vsi_cfg_msix(struct ice_vsi *vsi) } /** - * ice_vsi_manage_vlan_insertion - Manage VLAN insertion for the VSI for Tx - * @vsi: the VSI being changed - */ -int ice_vsi_manage_vlan_insertion(struct ice_vsi *vsi) -{ - struct device *dev = &vsi->back->pdev->dev; - struct ice_hw *hw = &vsi->back->hw; - struct ice_vsi_ctx *ctxt; - enum ice_status status; - int ret = 0; - - ctxt = devm_kzalloc(dev, sizeof(*ctxt), GFP_KERNEL); - if (!ctxt) - return -ENOMEM; - - /* Here we are configuring the VSI to let the driver add VLAN tags by - * setting vlan_flags to ICE_AQ_VSI_VLAN_MODE_ALL. The actual VLAN tag - * insertion happens in the Tx hot path, in ice_tx_map. - */ - ctxt->info.vlan_flags = ICE_AQ_VSI_VLAN_MODE_ALL; - - /* Preserve existing VLAN strip setting */ - ctxt->info.vlan_flags |= (vsi->info.vlan_flags & - ICE_AQ_VSI_VLAN_EMOD_M); - - ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID); - - status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); - if (status) { - dev_err(dev, "update VSI for VLAN insert failed, err %d aq_err %d\n", - status, hw->adminq.sq_last_status); - ret = -EIO; - goto out; - } - - vsi->info.vlan_flags = ctxt->info.vlan_flags; -out: - devm_kfree(dev, ctxt); - return ret; -} - -/** - * ice_vsi_manage_vlan_stripping - Manage VLAN stripping for the VSI for Rx - * @vsi: the VSI being changed - * @ena: boolean value indicating if this is a enable or disable request - */ -int ice_vsi_manage_vlan_stripping(struct ice_vsi *vsi, bool ena) -{ - struct device *dev = &vsi->back->pdev->dev; - struct ice_hw *hw = &vsi->back->hw; - struct ice_vsi_ctx *ctxt; - enum ice_status status; - int ret = 0; - - ctxt = devm_kzalloc(dev, sizeof(*ctxt), GFP_KERNEL); - if (!ctxt) - return -ENOMEM; - - /* Here we are configuring what the VSI should do with the VLAN tag in - * the Rx packet. 
We can either leave the tag in the packet or put it in - * the Rx descriptor. - */ - if (ena) - /* Strip VLAN tag from Rx packet and put it in the desc */ - ctxt->info.vlan_flags = ICE_AQ_VSI_VLAN_EMOD_STR_BOTH; - else - /* Disable stripping. Leave tag in packet */ - ctxt->info.vlan_flags = ICE_AQ_VSI_VLAN_EMOD_NOTHING; - - /* Allow all packets untagged/tagged */ - ctxt->info.vlan_flags |= ICE_AQ_VSI_VLAN_MODE_ALL; - - ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID); - - status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); - if (status) { - dev_err(dev, "update VSI for VLAN strip failed, ena = %d err %d aq_err %d\n", - ena, status, hw->adminq.sq_last_status); - ret = -EIO; - goto out; - } - - vsi->info.vlan_flags = ctxt->info.vlan_flags; -out: - devm_kfree(dev, ctxt); - return ret; -} - -/** - * ice_vsi_start_rx_rings - start VSI's Rx rings - * @vsi: the VSI whose rings are to be started + * ice_vsi_start_all_rx_rings - start/enable all of a VSI's Rx rings + * @vsi: the VSI whose rings are to be enabled * * Returns 0 on success and a negative value on error */ -int ice_vsi_start_rx_rings(struct ice_vsi *vsi) +int ice_vsi_start_all_rx_rings(struct ice_vsi *vsi) { - return ice_vsi_ctrl_rx_rings(vsi, true); + return ice_vsi_ctrl_all_rx_rings(vsi, true); } /** - * ice_vsi_stop_rx_rings - stop VSI's Rx rings - * @vsi: the VSI + * ice_vsi_stop_all_rx_rings - stop/disable all of a VSI's Rx rings + * @vsi: the VSI whose rings are to be disabled * * Returns 0 on success and a negative value on error */ -int ice_vsi_stop_rx_rings(struct ice_vsi *vsi) -{ - return ice_vsi_ctrl_rx_rings(vsi, false); -} - -/** - * ice_trigger_sw_intr - trigger a software interrupt - * @hw: pointer to the HW structure - * @q_vector: interrupt vector to trigger the software interrupt for - */ -void ice_trigger_sw_intr(struct ice_hw *hw, struct ice_q_vector *q_vector) -{ - wr32(hw, GLINT_DYN_CTL(q_vector->reg_idx), - (ICE_ITR_NONE << GLINT_DYN_CTL_ITR_INDX_S) | - GLINT_DYN_CTL_SWINT_TRIG_M | - GLINT_DYN_CTL_INTENA_M); -} - -/** - * ice_vsi_stop_tx_ring - Disable single Tx ring - * @vsi: the VSI being configured - * @rst_src: reset source - * @rel_vmvf_num: Relative ID of VF/VM - * @ring: Tx ring to be stopped - * @txq_meta: Meta data of Tx ring to be stopped - */ -#ifndef CONFIG_PCI_IOV -static -#endif /* !CONFIG_PCI_IOV */ -int -ice_vsi_stop_tx_ring(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, - u16 rel_vmvf_num, struct ice_ring *ring, - struct ice_txq_meta *txq_meta) -{ - struct ice_pf *pf = vsi->back; - struct ice_q_vector *q_vector; - struct ice_hw *hw = &pf->hw; - enum ice_status status; - u32 val; - - /* clear cause_ena bit for disabled queues */ - val = rd32(hw, QINT_TQCTL(ring->reg_idx)); - val &= ~QINT_TQCTL_CAUSE_ENA_M; - wr32(hw, QINT_TQCTL(ring->reg_idx), val); - - /* software is expected to wait for 100 ns */ - ndelay(100); - - /* trigger a software interrupt for the vector - * associated to the queue to schedule NAPI handler - */ - q_vector = ring->q_vector; - if (q_vector) - ice_trigger_sw_intr(hw, q_vector); - - status = ice_dis_vsi_txq(vsi->port_info, txq_meta->vsi_idx, - txq_meta->tc, 1, &txq_meta->q_handle, - &txq_meta->q_id, &txq_meta->q_teid, rst_src, - rel_vmvf_num, NULL); - - /* if the disable queue command was exercised during an - * active reset flow, ICE_ERR_RESET_ONGOING is returned. - * This is not an error as the reset operation disables - * queues at the hardware level anyway. 
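The Tx-queue stop logic being dropped from this file treats two admin queue outcomes as benign, as the removed checks just below show. A compact restatement of that decision (illustration only; the return-value mapping follows the removed code):

        /* Sketch: a reset already in progress, or a queue that no longer
         * exists, is not an error when disabling Tx queues.
         */
        static int ice_map_disq_status(enum ice_status status)
        {
                if (status == ICE_ERR_RESET_ONGOING ||
                    status == ICE_ERR_DOES_NOT_EXIST)
                        return 0;

                return status ? -ENODEV : 0;
        }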
- */ - if (status == ICE_ERR_RESET_ONGOING) { - dev_dbg(&vsi->back->pdev->dev, - "Reset in progress. LAN Tx queues already disabled\n"); - } else if (status == ICE_ERR_DOES_NOT_EXIST) { - dev_dbg(&vsi->back->pdev->dev, - "LAN Tx queues do not exist, nothing to disable\n"); - } else if (status) { - dev_err(&vsi->back->pdev->dev, - "Failed to disable LAN Tx queues, error: %d\n", status); - return -ENODEV; - } - - return 0; -} - -/** - * ice_fill_txq_meta - Prepare the Tx queue's meta data - * @vsi: VSI that ring belongs to - * @ring: ring that txq_meta will be based on - * @txq_meta: a helper struct that wraps Tx queue's information - * - * Set up a helper struct that will contain all the necessary fields that - * are needed for stopping Tx queue - */ -#ifndef CONFIG_PCI_IOV -static -#endif /* !CONFIG_PCI_IOV */ -void -ice_fill_txq_meta(struct ice_vsi *vsi, struct ice_ring *ring, - struct ice_txq_meta *txq_meta) +int ice_vsi_stop_all_rx_rings(struct ice_vsi *vsi) { - u8 tc = 0; - -#ifdef CONFIG_DCB - tc = ring->dcb_tc; -#endif /* CONFIG_DCB */ - txq_meta->q_id = ring->reg_idx; - txq_meta->q_teid = ring->txq_teid; - txq_meta->q_handle = ring->q_handle; - txq_meta->vsi_idx = vsi->idx; - txq_meta->tc = tc; + return ice_vsi_ctrl_all_rx_rings(vsi, false); } /** @@ -2247,34 +2251,24 @@ static int ice_vsi_stop_tx_rings(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, u16 rel_vmvf_num, struct ice_ring **rings) { - u16 i, q_idx = 0; - int status; - u8 tc; + u16 q_idx; if (vsi->num_txq > ICE_LAN_TXQ_MAX_QDIS) return -EINVAL; - /* set up the Tx queue list to be disabled for each enabled TC */ - ice_for_each_traffic_class(tc) { - if (!(vsi->tc_cfg.ena_tc & BIT(tc))) - break; - - for (i = 0; i < vsi->tc_cfg.tc_info[tc].qcount_tx; i++) { - struct ice_txq_meta txq_meta = { }; + for (q_idx = 0; q_idx < vsi->num_txq; q_idx++) { + struct ice_txq_meta txq_meta = { }; + int status; - if (!rings || !rings[q_idx]) - return -EINVAL; + if (!rings || !rings[q_idx]) + return -EINVAL; - ice_fill_txq_meta(vsi, rings[q_idx], &txq_meta); - status = ice_vsi_stop_tx_ring(vsi, rst_src, - rel_vmvf_num, - rings[q_idx], &txq_meta); + ice_fill_txq_meta(vsi, rings[q_idx], &txq_meta); + status = ice_vsi_stop_tx_ring(vsi, rst_src, rel_vmvf_num, + rings[q_idx], &txq_meta); - if (status) - return status; - - q_idx++; - } + if (status) + return status; } return 0; @@ -2293,82 +2287,53 @@ ice_vsi_stop_lan_tx_rings(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, return ice_vsi_stop_tx_rings(vsi, rst_src, rel_vmvf_num, vsi->tx_rings); } +#ifdef HAVE_XDP_SUPPORT /** - * ice_cfg_vlan_pruning - enable or disable VLAN pruning on the VSI - * @vsi: VSI to enable or disable VLAN pruning on - * @ena: set to true to enable VLAN pruning and false to disable it - * @vlan_promisc: enable valid security flags if not in VLAN promiscuous mode - * - * returns 0 if VSI is updated, negative otherwise + * ice_vsi_stop_xdp_tx_rings - Disable XDP Tx rings + * @vsi: the VSI being configured */ -int ice_cfg_vlan_pruning(struct ice_vsi *vsi, bool ena, bool vlan_promisc) +int ice_vsi_stop_xdp_tx_rings(struct ice_vsi *vsi) { - struct ice_vsi_ctx *ctxt; - struct device *dev; - struct ice_pf *pf; - int status; + return ice_vsi_stop_tx_rings(vsi, ICE_NO_RESET, 0, vsi->xdp_rings); +} - if (!vsi) - return -EINVAL; +#endif /* HAVE_XDP_SUPPORT */ - pf = vsi->back; - dev = &pf->pdev->dev; - ctxt = devm_kzalloc(dev, sizeof(*ctxt), GFP_KERNEL); - if (!ctxt) - return -ENOMEM; - - ctxt->info = vsi->info; - - if (ena) { - ctxt->info.sec_flags |= - 
ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << - ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S; - ctxt->info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; - } else { - ctxt->info.sec_flags &= - ~(ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << - ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S); - ctxt->info.sw_flags2 &= ~ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; - } - - if (!vlan_promisc) - ctxt->info.valid_sections = - cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID | - ICE_AQ_VSI_PROP_SW_VALID); - - status = ice_update_vsi(&pf->hw, vsi->idx, ctxt, NULL); - if (status) { - netdev_err(vsi->netdev, "%sabling VLAN pruning on VSI handle: %d, VSI HW ID: %d failed, err = %d, aq_err = %d\n", - ena ? "En" : "Dis", vsi->idx, vsi->vsi_num, status, - pf->hw.adminq.sq_last_status); - goto err_out; - } +/** + * ice_vsi_is_vlan_pruning_ena - check if VLAN pruning is enabled or not + * @vsi: VSI to check whether or not VLAN pruning is enabled. + * + * returns true if Rx VLAN pruning is enabled and false otherwise. + */ +bool ice_vsi_is_vlan_pruning_ena(struct ice_vsi *vsi) +{ + if (!vsi) + return false; - vsi->info.sec_flags = ctxt->info.sec_flags; - vsi->info.sw_flags2 = ctxt->info.sw_flags2; + return (vsi->info.sw_flags2 & ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA); +} - devm_kfree(dev, ctxt); - return 0; -err_out: - devm_kfree(dev, ctxt); - return -EIO; -} static void ice_vsi_set_tc_cfg(struct ice_vsi *vsi) { - struct ice_dcbx_cfg *cfg = &vsi->port_info->local_dcbx_cfg; + if (!test_bit(ICE_FLAG_DCB_ENA, vsi->back->flags)) { + vsi->tc_cfg.ena_tc = ICE_DFLT_TRAFFIC_CLASS; + vsi->tc_cfg.numtc = 1; + return; + } - vsi->tc_cfg.ena_tc = ice_dcb_get_ena_tc(cfg); - vsi->tc_cfg.numtc = ice_dcb_get_num_tc(cfg); + /* set VSI TC information based on DCB config */ + ice_vsi_set_dcb_tc_cfg(vsi); } /** * ice_vsi_set_q_vectors_reg_idx - set the HW register index for all q_vectors * @vsi: VSI to set the q_vectors register index on + * @tc: traffic class for VF ADQ */ -static int -ice_vsi_set_q_vectors_reg_idx(struct ice_vsi *vsi) +static int ice_vsi_set_q_vectors_reg_idx(struct ice_vsi *vsi, + u8 __maybe_unused tc) { u16 i; @@ -2379,8 +2344,7 @@ ice_vsi_set_q_vectors_reg_idx(struct ice_vsi *vsi) struct ice_q_vector *q_vector = vsi->q_vectors[i]; if (!q_vector) { - dev_err(&vsi->back->pdev->dev, - "Failed to set reg_idx on q_vector %d VSI %d\n", + dev_err(ice_pf_to_dev(vsi->back), "Failed to set reg_idx on q_vector %d VSI %d\n", i, vsi->vsi_num); goto clear_reg_idx; } @@ -2388,7 +2352,8 @@ ice_vsi_set_q_vectors_reg_idx(struct ice_vsi *vsi) if (vsi->type == ICE_VSI_VF) { struct ice_vf *vf = &vsi->back->vf[vsi->vf_id]; - q_vector->reg_idx = ice_calc_vf_reg_idx(vf, q_vector); + q_vector->reg_idx = + ice_calc_vf_reg_idx(vf, q_vector, tc); } else { q_vector->reg_idx = q_vector->v_idx + vsi->base_vector; @@ -2409,101 +2374,185 @@ ice_vsi_set_q_vectors_reg_idx(struct ice_vsi *vsi) } /** - * ice_vsi_add_rem_eth_mac - Program VSI ethertype based filter with rule + * ice_cfg_sw_lldp - Config switch rules for LLDP packet handling * @vsi: the VSI being configured - * @add_rule: boolean value to add or remove ethertype filter rule + * @tx: bool to determine Tx or Rx rule + * @create: bool to determine create or remove Rule */ -static void -ice_vsi_add_rem_eth_mac(struct ice_vsi *vsi, bool add_rule) +void ice_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create) { - struct ice_fltr_list_entry *list; + enum ice_status (*eth_fltr)(struct ice_vsi *v, u16 type, u16 flag, + enum ice_sw_fwd_act_type act); struct ice_pf *pf = vsi->back; - LIST_HEAD(tmp_add_list); enum ice_status status; + struct device 
*dev; - list = devm_kzalloc(&pf->pdev->dev, sizeof(*list), GFP_KERNEL); - if (!list) - return; - - list->fltr_info.lkup_type = ICE_SW_LKUP_ETHERTYPE; - list->fltr_info.fltr_act = ICE_DROP_PACKET; - list->fltr_info.flag = ICE_FLTR_TX; - list->fltr_info.src_id = ICE_SRC_ID_VSI; - list->fltr_info.vsi_handle = vsi->idx; - list->fltr_info.l_data.ethertype_mac.ethertype = vsi->ethtype; - - INIT_LIST_HEAD(&list->list_entry); - list_add(&list->list_entry, &tmp_add_list); + dev = ice_pf_to_dev(pf); + eth_fltr = create ? ice_fltr_add_eth : ice_fltr_remove_eth; - if (add_rule) - status = ice_add_eth_mac(&pf->hw, &tmp_add_list); - else - status = ice_remove_eth_mac(&pf->hw, &tmp_add_list); + if (tx) { + status = eth_fltr(vsi, ETH_P_LLDP, ICE_FLTR_TX, + ICE_DROP_PACKET); + } else { + if (ice_fw_supports_lldp_fltr_ctrl(&pf->hw)) { + status = ice_lldp_fltr_add_remove(&pf->hw, vsi->vsi_num, + create); + } else { + status = eth_fltr(vsi, ETH_P_LLDP, ICE_FLTR_RX, + ICE_FWD_TO_VSI); + } + } if (status) - dev_err(&pf->pdev->dev, - "Failure Adding or Removing Ethertype on VSI %i error: %d\n", - vsi->vsi_num, status); - - ice_free_fltr_list(&pf->pdev->dev, &tmp_add_list); + dev_dbg(dev, "Fail %s %s LLDP rule on VSI %i error: %s\n", + create ? "adding" : "removing", tx ? "TX" : "RX", + vsi->vsi_num, ice_stat_str(status)); } /** - * ice_cfg_sw_lldp - Config switch rules for LLDP packet handling - * @vsi: the VSI being configured - * @tx: bool to determine Tx or Rx rule - * @create: bool to determine create or remove Rule + * ice_set_agg_vsi - sets up scheduler aggregator node and move VSI into it + * @vsi: pointer to the VSI + * + * This function will allocate new scheduler aggregator now if needed and will + * move specified VSI into it. */ -void ice_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create) +static void ice_set_agg_vsi(struct ice_vsi *vsi) { - struct ice_fltr_list_entry *list; + struct device *dev = ice_pf_to_dev(vsi->back); + struct ice_agg_node *agg_node_iter = NULL; + u32 agg_id = ICE_INVALID_AGG_NODE_ID; + struct ice_agg_node *agg_node = NULL; + int node_offset, max_agg_nodes = 0; + struct ice_port_info *port_info; struct ice_pf *pf = vsi->back; - LIST_HEAD(tmp_add_list); + u32 agg_node_id_start = 0; enum ice_status status; - list = devm_kzalloc(&pf->pdev->dev, sizeof(*list), GFP_KERNEL); - if (!list) + /* create (as needed) scheduler aggregator node and move VSI into + * corresponding aggregator node + * - PF aggregator node to contains VSIs of type _PF, _CTRL, _CHNL + * - MACVLAN aggregator node to contain MACVLAN VSIs only + * - VF aggregator nodes will contain VF VSI including VF ADQ VSIs + */ + port_info = pf->hw.port_info; + if (!port_info) return; - list->fltr_info.lkup_type = ICE_SW_LKUP_ETHERTYPE; - list->fltr_info.vsi_handle = vsi->idx; - list->fltr_info.l_data.ethertype_mac.ethertype = ETH_P_LLDP; + switch (vsi->type) { + case ICE_VSI_CTRL: + case ICE_VSI_CHNL: + case ICE_VSI_LB: + case ICE_VSI_PF: + case ICE_VSI_VMDQ2: + case ICE_VSI_SWITCHDEV_CTRL: + max_agg_nodes = ICE_MAX_PF_AGG_NODES; + agg_node_id_start = ICE_PF_AGG_NODE_ID_START; + agg_node_iter = &pf->pf_agg_node[0]; + break; + case ICE_VSI_VF: + /* user can create 'n' VFs on a given PF, but since max children + * per aggregator node can be only 64. 
Following code handles + * aggregator(s) for VF VSIs, either selects a agg_node which + * was already created provided num_vsis < 64, otherwise + * select next available node, which woll be created + */ + max_agg_nodes = ICE_MAX_VF_AGG_NODES; + agg_node_id_start = ICE_VF_AGG_NODE_ID_START; + agg_node_iter = &pf->vf_agg_node[0]; + break; +#ifdef HAVE_NETDEV_SB_DEV + case ICE_VSI_OFFLOAD_MACVLAN: + /* there can be 'n' offloaded NACVLAN, hence select the desired + * aggregator node for offloaded MACVLAN VSI + */ + max_agg_nodes = ICE_MAX_MACVLAN_AGG_NODES; + agg_node_id_start = ICE_MACVLAN_AGG_NODE_ID_START; + agg_node_iter = &pf->macvlan_agg_node[0]; + break; +#endif + default: + /* other VSI type, handle later if needed */ + dev_dbg(dev, "unexpected VSI type %s\n", + ice_vsi_type_str(vsi->type)); + return; + } - if (tx) { - list->fltr_info.fltr_act = ICE_DROP_PACKET; - list->fltr_info.flag = ICE_FLTR_TX; - list->fltr_info.src_id = ICE_SRC_ID_VSI; - } else { - list->fltr_info.fltr_act = ICE_FWD_TO_VSI; - list->fltr_info.flag = ICE_FLTR_RX; - list->fltr_info.src_id = ICE_SRC_ID_LPORT; + /* find the appropriate aggregator node */ + for (node_offset = 0; node_offset < max_agg_nodes; node_offset++) { + /* see if we can find space in previously created + * node if num_vsis < 64, otherwise skip + */ + if (agg_node_iter->num_vsis && + agg_node_iter->num_vsis == ICE_MAX_VSIS_IN_AGG_NODE) { + agg_node_iter++; + continue; + } + + if (agg_node_iter->valid && + agg_node_iter->agg_id != ICE_INVALID_AGG_NODE_ID) { + agg_id = agg_node_iter->agg_id; + agg_node = agg_node_iter; + break; + } + + /* find unclaimed agg_id */ + if (agg_node_iter->agg_id == ICE_INVALID_AGG_NODE_ID) { + agg_id = node_offset + agg_node_id_start; + agg_node = agg_node_iter; + break; + } + /* move to next agg_node */ + agg_node_iter++; } - INIT_LIST_HEAD(&list->list_entry); - list_add(&list->list_entry, &tmp_add_list); + if (!agg_node) + return; - if (create) - status = ice_add_eth_mac(&pf->hw, &tmp_add_list); - else - status = ice_remove_eth_mac(&pf->hw, &tmp_add_list); + /* if selected aggregator node was not created, create it */ + if (!agg_node->valid) { + status = ice_cfg_agg(port_info, agg_id, ICE_AGG_TYPE_AGG, + (u8)vsi->tc_cfg.ena_tc); + if (status) { + dev_err(dev, "unable to create aggregator node with agg_id %u\n", + agg_id); + return; + } + /* aggregator node is created, store the neeeded info */ + agg_node->valid = true; + agg_node->agg_id = agg_id; + } - if (status) - dev_err(&pf->pdev->dev, - "Fail %s %s LLDP rule on VSI %i error: %d\n", - create ? "adding" : "removing", tx ? 
"TX" : "RX", - vsi->vsi_num, status); + /* move VSI to corresponding aggregator node */ + status = ice_move_vsi_to_agg(port_info, agg_id, vsi->idx, + (u8)vsi->tc_cfg.ena_tc); + if (status) { + dev_err(dev, "unable to move VSI idx %u into aggregator %u node", + vsi->idx, agg_id); + return; + } + + /* keep active children count for aggregator node */ + agg_node->num_vsis++; - ice_free_fltr_list(&pf->pdev->dev, &tmp_add_list); + /* cache the 'agg_id' in VSI, so that after reset - VSI will be moved + * to aggregator node + */ + vsi->agg_node = agg_node; + dev_dbg(dev, "successfully moved VSI idx %u tc_bitmap 0x%x) into aggregator node %d which has num_vsis %u\n", + vsi->idx, vsi->tc_cfg.ena_tc, vsi->agg_node->agg_id, + vsi->agg_node->num_vsis); } /** * ice_vsi_setup - Set up a VSI by a given type * @pf: board private structure * @pi: pointer to the port_info instance - * @type: VSI type + * @vsi_type: VSI type * @vf_id: defines VF ID to which this VSI connects. This field is meant to be * used only for ICE_VSI_VF VSI type. For other VSI types, should * fill-in ICE_INVAL_VFID as input. + * @ch: ptr to channel + * @tc: traffic class for VF ADQ * * This allocates the sw VSI structure and its queue resources. * @@ -2512,18 +2561,22 @@ void ice_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create) */ struct ice_vsi * ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, - enum ice_vsi_type type, u16 vf_id) + enum ice_vsi_type vsi_type, u16 vf_id, struct ice_channel *ch, + u8 tc) { u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 }; - struct device *dev = &pf->pdev->dev; + struct device *dev = ice_pf_to_dev(pf); enum ice_status status; struct ice_vsi *vsi; int ret, i; - if (type == ICE_VSI_VF) - vsi = ice_vsi_alloc(pf, type, vf_id); - else - vsi = ice_vsi_alloc(pf, type, ICE_INVAL_VFID); + if (vsi_type == ICE_VSI_CHNL) + vsi = ice_vsi_alloc(pf, vsi_type, ch, ICE_INVAL_VFID, 0); + else if (vsi_type == ICE_VSI_VF || vsi_type == ICE_VSI_CTRL) { + vsi = ice_vsi_alloc(pf, vsi_type, NULL, vf_id, tc); + } else { + vsi = ice_vsi_alloc(pf, vsi_type, NULL, ICE_INVAL_VFID, tc); + } if (!vsi) { dev_err(dev, "could not allocate VSI\n"); @@ -2535,15 +2588,21 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, if (vsi->type == ICE_VSI_PF) vsi->ethtype = ETH_P_PAUSE; - if (vsi->type == ICE_VSI_VF) + if (vsi->type == ICE_VSI_VF || vsi->type == ICE_VSI_CTRL) vsi->vf_id = vf_id; - if (ice_vsi_get_qs(vsi)) { - dev_err(dev, "Failed to allocate queues. vsi->idx = %d\n", - vsi->idx); - goto unroll_get_qs; + ice_alloc_fd_res(vsi); + + if (vsi_type != ICE_VSI_CHNL) { + if (ice_vsi_get_qs(vsi)) { + dev_err(dev, "Failed to allocate queues. 
vsi->idx = %d\n", + vsi->idx); + goto unroll_vsi_alloc; + } } + ice_vsi_alloc_rss_global_lut(vsi); + /* set RSS capabilities */ ice_vsi_set_rss_params(vsi); @@ -2551,11 +2610,17 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, ice_vsi_set_tc_cfg(vsi); /* create the VSI */ - ret = ice_vsi_init(vsi); + ret = ice_vsi_init(vsi, true); if (ret) goto unroll_get_qs; + ice_vsi_init_vlan_ops(vsi); + switch (vsi->type) { + case ICE_VSI_CTRL: + case ICE_VSI_OFFLOAD_MACVLAN: + case ICE_VSI_VMDQ2: + case ICE_VSI_SWITCHDEV_CTRL: case ICE_VSI_PF: ret = ice_vsi_alloc_q_vectors(vsi); if (ret) @@ -2565,7 +2630,8 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, if (ret) goto unroll_alloc_q_vector; - ret = ice_vsi_set_q_vectors_reg_idx(vsi); + ret = ice_vsi_set_q_vectors_reg_idx(vsi, 0); + if (ret) goto unroll_vector_base; @@ -2574,13 +2640,29 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, goto unroll_vector_base; ice_vsi_map_rings_to_vectors(vsi); - - /* Do not exit if configuring RSS had an issue, at least - * receive traffic on first queue. Hence no need to capture - * return value + ice_vsi_reset_stats(vsi); + /* Perform an initial read of the statistics registers now to + * set the baseline before the VSI becomes operational. */ - if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) + ice_update_eth_stats(vsi); + + /* ICE_VSI_CTRL does not need RSS so skip RSS processing */ + if (vsi->type != ICE_VSI_CTRL) + /* Do not exit if configuring RSS had an issue, at + * least receive traffic on first queue. Hence no + * need to capture return value + */ + if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { + ice_vsi_cfg_rss_lut_key(vsi); + ice_vsi_set_rss_flow_fld(vsi); + } + ice_init_arfs(vsi); + break; + case ICE_VSI_CHNL: + if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { ice_vsi_cfg_rss_lut_key(vsi); + ice_vsi_set_rss_flow_fld(vsi); + } break; case ICE_VSI_VF: /* VF driver will take care of creating netdev for this type and @@ -2596,7 +2678,7 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, if (ret) goto unroll_alloc_q_vector; - ret = ice_vsi_set_q_vectors_reg_idx(vsi); + ret = ice_vsi_set_q_vectors_reg_idx(vsi, tc); if (ret) goto unroll_vector_base; @@ -2604,8 +2686,10 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, * receive traffic on first queue. 
Hence no need to capture * return value */ - if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) + if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { ice_vsi_cfg_rss_lut_key(vsi); + ice_vsi_set_vf_rss_flow_fld(vsi); + } break; case ICE_VSI_LB: ret = ice_vsi_alloc_rings(vsi); @@ -2618,16 +2702,27 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, } /* configure VSI nodes based on number of queues and TC's */ - for (i = 0; i < vsi->tc_cfg.numtc; i++) - max_txqs[i] = vsi->alloc_txq; + ice_for_each_traffic_class(i) { + if (!(vsi->tc_cfg.ena_tc & BIT(i))) + continue; + + if (vsi->type == ICE_VSI_CHNL) { + if (!vsi->alloc_txq && vsi->num_txq) + max_txqs[i] = vsi->num_txq; + else + max_txqs[i] = pf->num_lan_tx; + } else { + max_txqs[i] = vsi->alloc_txq; + } + } + dev_dbg(dev, "vsi->tc_cfg.ena_tc = %d\n", vsi->tc_cfg.ena_tc); status = ice_cfg_vsi_lan(vsi->port_info, vsi->idx, vsi->tc_cfg.ena_tc, max_txqs); if (status) { - dev_err(&pf->pdev->dev, - "VSI %d failed lan queue config, error %d\n", - vsi->vsi_num, status); - goto unroll_vector_base; + dev_err(dev, "VSI %d failed lan queue config, error %s\n", + vsi->vsi_num, ice_stat_str(status)); + goto unroll_clear_rings; } /* Add switch rule to drop all Tx Flow Control Frames, of look up @@ -2635,26 +2730,25 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, * out PAUSE or PFC frames. If enabled, FW can still send FC frames. * The rule is added once for PF VSI in order to create appropriate * recipe, since VSI/VSI list is ignored with drop action... - * Also add rules to handle LLDP Tx and Rx packets. Tx LLDP packets - * need to be dropped so that VFs cannot send LLDP packets to reconfig - * DCB settings in the HW. Also, if the FW DCBX engine is not running - * then Rx LLDP packets need to be redirected up the stack. + * Also add rules to handle LLDP Tx packets. Tx LLDP packets need to + * be dropped so that VFs cannot send LLDP packets to reconfig DCB + * settings in the HW. 
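The per-TC queue-limit array handed to ice_cfg_vsi_lan() earlier in this hunk is filled only for TCs set in the enabled-TC bitmap; a hedged sketch of that pattern with the channel-VSI special case omitted (helper name hypothetical):

        /* Illustration only: populate max_txqs[] for each enabled TC, as
         * the code above does inline.
         */
        static void ice_fill_max_txqs(struct ice_vsi *vsi,
                                      u16 max_txqs[ICE_MAX_TRAFFIC_CLASS])
        {
                int i;

                ice_for_each_traffic_class(i) {
                        if (!(vsi->tc_cfg.ena_tc & BIT(i)))
                                continue;
                        max_txqs[i] = vsi->alloc_txq;
                }
        }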
*/ - if (!ice_is_safe_mode(pf)) { + if (!ice_is_safe_mode(pf)) if (vsi->type == ICE_VSI_PF) { - ice_vsi_add_rem_eth_mac(vsi, true); - - /* Tx LLDP packets */ + ice_fltr_add_eth(vsi, ETH_P_PAUSE, ICE_FLTR_TX, + ICE_DROP_PACKET); ice_cfg_sw_lldp(vsi, true, true); - - /* Rx LLDP packets */ - if (!test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags)) - ice_cfg_sw_lldp(vsi, false, true); } - } + + + if (!vsi->agg_node) + ice_set_agg_vsi(vsi); return vsi; +unroll_clear_rings: + ice_vsi_clear_rings(vsi); unroll_vector_base: /* reclaim SW interrupts back to the common pool */ ice_free_res(pf->irq_tracker, vsi->base_vector, vsi->idx); @@ -2665,6 +2759,7 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, ice_vsi_delete(vsi); unroll_get_qs: ice_vsi_put_qs(vsi); +unroll_vsi_alloc: ice_vsi_clear(vsi); return NULL; @@ -2684,16 +2779,23 @@ static void ice_vsi_release_msix(struct ice_vsi *vsi) for (i = 0; i < vsi->num_q_vectors; i++) { struct ice_q_vector *q_vector = vsi->q_vectors[i]; - u16 reg_idx = q_vector->reg_idx; - wr32(hw, GLINT_ITR(ICE_IDX_ITR0, reg_idx), 0); - wr32(hw, GLINT_ITR(ICE_IDX_ITR1, reg_idx), 0); + ice_write_intrl(q_vector, 0); for (q = 0; q < q_vector->num_ring_tx; q++) { + ice_write_itr(&q_vector->tx, 0); wr32(hw, QINT_TQCTL(vsi->txq_map[txq]), 0); +#ifdef HAVE_XDP_SUPPORT + if (ice_is_xdp_ena_vsi(vsi)) { + u32 xdp_txq = txq + vsi->num_xdp_txq; + + wr32(hw, QINT_TQCTL(vsi->txq_map[xdp_txq]), 0); + } +#endif /* HAVE_XDP_SUPPORT */ txq++; } for (q = 0; q < q_vector->num_ring_rx; q++) { + ice_write_itr(&q_vector->rx, 0); wr32(hw, QINT_RQCTL(vsi->rxq_map[rxq]), 0); rxq++; } @@ -2738,8 +2840,7 @@ void ice_vsi_free_irq(struct ice_vsi *vsi) /* clear the affinity_mask in the IRQ descriptor */ irq_set_affinity_hint(irq_num, NULL); synchronize_irq(irq_num); - devm_free_irq(&pf->pdev->dev, irq_num, - vsi->q_vectors[i]); + devm_free_irq(ice_pf_to_dev(pf), irq_num, vsi->q_vectors[i]); } } @@ -2781,101 +2882,104 @@ void ice_vsi_free_rx_rings(struct ice_vsi *vsi) */ void ice_vsi_close(struct ice_vsi *vsi) { - if (!test_and_set_bit(__ICE_DOWN, vsi->state)) - ice_down(vsi); + enum ice_close_reason reason = ICE_REASON_INTERFACE_DOWN; + + if (test_bit(ICE_CORER_REQ, vsi->back->state)) + reason = ICE_REASON_CORER_REQ; + if (test_bit(ICE_GLOBR_REQ, vsi->back->state)) + reason = ICE_REASON_GLOBR_REQ; + if (test_bit(ICE_PFR_REQ, vsi->back->state)) + reason = ICE_REASON_PFR_REQ; + if (!ice_is_safe_mode(vsi->back) && vsi->type == ICE_VSI_PF) { + int ret = ice_for_each_peer(vsi->back, &reason, ice_peer_close); + if (ret) + dev_dbg(ice_pf_to_dev(vsi->back), "Peer device did not implement close function\n"); + } + + if (!test_and_set_bit(ICE_VSI_DOWN, vsi->state)) + ice_down(vsi); ice_vsi_free_irq(vsi); ice_vsi_free_tx_rings(vsi); ice_vsi_free_rx_rings(vsi); } /** - * ice_free_res - free a block of resources - * @res: pointer to the resource - * @index: starting index previously returned by ice_get_res - * @id: identifier to track owner - * - * Returns number of resources freed + * ice_ena_vsi - resume a VSI + * @vsi: the VSI being resume + * @locked: is the rtnl_lock already held */ -int ice_free_res(struct ice_res_tracker *res, u16 index, u16 id) +int ice_ena_vsi(struct ice_vsi *vsi, bool locked) { - int count = 0; - int i; - - if (!res || index >= res->end) - return -EINVAL; - - id |= ICE_RES_VALID_BIT; - for (i = index; i < res->end && res->list[i] == id; i++) { - res->list[i] = 0; - count++; - } + int err = 0; - return count; -} + if (!test_bit(ICE_VSI_NEEDS_RESTART, vsi->state)) + return 0; -/** - * 
ice_search_res - Search the tracker for a block of resources - * @res: pointer to the resource - * @needed: size of the block needed - * @id: identifier to track owner - * - * Returns the base item index of the block, or -ENOMEM for error - */ -static int ice_search_res(struct ice_res_tracker *res, u16 needed, u16 id) -{ - int start = 0, end = 0; + clear_bit(ICE_VSI_NEEDS_RESTART, vsi->state); - if (needed > res->end) - return -ENOMEM; + if (vsi->netdev && vsi->type == ICE_VSI_PF) { + if (netif_running(vsi->netdev)) { + if (!locked) + rtnl_lock(); - id |= ICE_RES_VALID_BIT; + err = ice_open_internal(vsi->netdev); - do { - /* skip already allocated entries */ - if (res->list[end++] & ICE_RES_VALID_BIT) { - start = end; - if ((start + needed) > res->end) - break; + if (!locked) + rtnl_unlock(); } +#ifdef HAVE_NETDEV_SB_DEV - if (end == (start + needed)) { - int i = start; - - /* there was enough, so assign it to the requestor */ - while (i != end) - res->list[i++] = id; - - return start; - } - } while (end < res->end); + if (err) + return err; + if (test_bit(ICE_FLAG_MACVLAN_ENA, vsi->back->flags) && + !ice_is_adq_active(vsi->back)) + err = ice_vsi_cfg_netdev_tc0(vsi); +#endif /* HAVE_NETDEV_SB_DEV */ + } else if (vsi->type == ICE_VSI_CTRL) { + err = ice_vsi_open_ctrl(vsi); +#ifdef HAVE_NETDEV_SB_DEV + } else if (vsi->type == ICE_VSI_OFFLOAD_MACVLAN) { + err = ice_vsi_open(vsi); +#endif /* HAVE_NETDEV_SB_DEV */ + } - return -ENOMEM; + return err; } /** - * ice_get_res - get a block of resources - * @pf: board private structure - * @res: pointer to the resource - * @needed: size of the block needed - * @id: identifier to track owner - * - * Returns the base item index of the block, or negative for error + * ice_dis_vsi - pause a VSI + * @vsi: the VSI being paused + * @locked: is the rtnl_lock already held */ -int -ice_get_res(struct ice_pf *pf, struct ice_res_tracker *res, u16 needed, u16 id) +void ice_dis_vsi(struct ice_vsi *vsi, bool locked) { - if (!res || !pf) - return -EINVAL; + if (test_bit(ICE_VSI_DOWN, vsi->state)) + return; - if (!needed || needed > res->num_entries || id >= ICE_RES_VALID_BIT) { - dev_err(&pf->pdev->dev, - "param err: needed=%d, num_entries = %d id=0x%04x\n", - needed, res->num_entries, id); - return -EINVAL; - } + set_bit(ICE_VSI_NEEDS_RESTART, vsi->state); - return ice_search_res(res, needed, id); + if (vsi->type == ICE_VSI_PF && vsi->netdev) { + if (netif_running(vsi->netdev)) { + if (!locked) + rtnl_lock(); + + ice_vsi_close(vsi); + + if (!locked) + rtnl_unlock(); + } else { + ice_vsi_close(vsi); + } + } else if (vsi->type == ICE_VSI_CTRL) { + ice_vsi_close(vsi); +#ifdef HAVE_NETDEV_SB_DEV + } else if (vsi->type == ICE_VSI_OFFLOAD_MACVLAN) { + ice_vsi_close(vsi); +#endif /* HAVE_NETDEV_SB_DEV */ + } else if (vsi->type == ICE_VSI_SWITCHDEV_CTRL) { + ice_vsi_close(vsi); + } } /** @@ -2921,6 +3025,7 @@ void ice_vsi_dis_irq(struct ice_vsi *vsi) ice_for_each_q_vector(vsi, i) { if (!vsi->q_vectors[i]) continue; + wr32(hw, GLINT_DYN_CTL(vsi->q_vectors[i]->reg_idx), 0); } @@ -2969,8 +3074,14 @@ int ice_vsi_release(struct ice_vsi *vsi) * PF that is running the work queue items currently. 
This is done to * avoid check_flush_dependency() warning on this wq */ - if (vsi->netdev && !ice_is_reset_in_progress(pf->state)) + if (vsi->netdev && !ice_is_reset_in_progress(pf->state) && + (test_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state))) { unregister_netdev(vsi->netdev); + clear_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state); + + if (vsi->type == ICE_VSI_PF) + ice_devlink_destroy_pf_port(pf); + } if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) ice_rss_clean(vsi); @@ -2985,7 +3096,24 @@ int ice_vsi_release(struct ice_vsi *vsi) * many interrupts each VF needs. SR-IOV MSIX resources are also * cleared in the same manner. */ - if (vsi->type != ICE_VSI_VF) { + if (vsi->type == ICE_VSI_CTRL && vsi->vf_id != ICE_INVAL_VFID) { + int i; + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; + + if (i != vsi->vf_id && vf->ctrl_vsi_idx != ICE_NO_VSI) + break; + } + if (i == pf->num_alloc_vfs) { + /* No other VFs left that have control VSI, reclaim SW + * interrupts back to the common pool + */ + ice_free_res(pf->irq_tracker, vsi->base_vector, + ICE_RES_VF_CTRL_VEC_ID); + pf->num_avail_sw_msix += vsi->num_q_vectors; + } + } else if (vsi->type != ICE_VSI_VF) { /* reclaim SW interrupts back to the common pool */ ice_free_res(pf->irq_tracker, vsi->base_vector, vsi->idx); pf->num_avail_sw_msix += vsi->num_q_vectors; @@ -2993,7 +3121,8 @@ int ice_vsi_release(struct ice_vsi *vsi) if (!ice_is_safe_mode(pf)) { if (vsi->type == ICE_VSI_PF) { - ice_vsi_add_rem_eth_mac(vsi, false); + ice_fltr_remove_eth(vsi, ETH_P_PAUSE, ICE_FLTR_TX, + ICE_DROP_PACKET); ice_cfg_sw_lldp(vsi, true, false); /* The Rx rule will only exist to remove if the LLDP FW * engine is currently stopped @@ -3003,17 +3132,26 @@ int ice_vsi_release(struct ice_vsi *vsi) } } - ice_remove_vsi_fltr(&pf->hw, vsi->idx); + ice_fltr_remove_all(vsi); ice_rm_vsi_lan_cfg(vsi->port_info, vsi->idx); ice_vsi_delete(vsi); ice_vsi_free_q_vectors(vsi); - /* make sure unregister_netdev() was called by checking __ICE_DOWN */ - if (vsi->netdev && test_bit(__ICE_DOWN, vsi->state)) { - free_netdev(vsi->netdev); - vsi->netdev = NULL; + if (vsi->netdev) { + if (test_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state)) { + unregister_netdev(vsi->netdev); + clear_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state); + } + if (test_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state)) { + free_netdev(vsi->netdev); + vsi->netdev = NULL; + clear_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state); + } } + if (vsi->type == ICE_VSI_VF && + vsi->agg_node && vsi->agg_node->valid) + vsi->agg_node->num_vsis--; ice_vsi_clear_rings(vsi); ice_vsi_put_qs(vsi); @@ -3028,16 +3166,134 @@ int ice_vsi_release(struct ice_vsi *vsi) return 0; } +/** + * ice_vsi_rebuild_get_coalesce - get coalesce from all q_vectors + * @vsi: VSI connected with q_vectors + * @coalesce: array of struct with stored coalesce + * + * Returns array size. 
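The two coalesce helpers added below are meant to bracket a VSI rebuild; a condensed view of the intended call order (this mirrors the wiring done later in ice_vsi_rebuild(), it is not an additional code path):

        coalesce = kcalloc(vsi->num_q_vectors, sizeof(*coalesce), GFP_KERNEL);
        if (!coalesce)
                return -ENOMEM;
        prev_num_q_vectors = ice_vsi_rebuild_get_coalesce(vsi, coalesce);
        /* ... tear down and re-create vectors, rings and the VSI ... */
        ice_vsi_rebuild_set_coalesce(vsi, coalesce, prev_num_q_vectors);
        kfree(coalesce);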
+ */ +static int +ice_vsi_rebuild_get_coalesce(struct ice_vsi *vsi, + struct ice_coalesce_stored *coalesce) +{ + int i; + + ice_for_each_q_vector(vsi, i) { + struct ice_q_vector *q_vector = vsi->q_vectors[i]; + + coalesce[i].itr_tx = q_vector->tx.itr_setting; + coalesce[i].itr_rx = q_vector->rx.itr_setting; + coalesce[i].intrl = q_vector->intrl; + + if (i < vsi->num_txq) + coalesce[i].tx_valid = true; + if (i < vsi->num_rxq) + coalesce[i].rx_valid = true; + } + + return vsi->num_q_vectors; +} + +/** + * ice_vsi_rebuild_set_coalesce - set coalesce from earlier saved arrays + * @vsi: VSI connected with q_vectors + * @coalesce: pointer to array of struct with stored coalesce + * @size: size of coalesce array + * + * Before this function, ice_vsi_rebuild_get_coalesce should be called to save + * ITR params in arrays. If size is 0 or coalesce wasn't stored set coalesce + * to default value. + */ +static void +ice_vsi_rebuild_set_coalesce(struct ice_vsi *vsi, + struct ice_coalesce_stored *coalesce, int size) +{ + struct ice_ring_container *rc; + int i; + + if ((size && !coalesce) || !vsi) + return; + + /* there are a couple of cases that have to be handled here: + * 1. The case where the number of queue vectors stays the same, but + * the number of tx or rx rings changes (the first for loop) + * 2. The case where the number of queue vectors increased (the + * second for loop) + */ + for (i = 0; i < size && i < vsi->num_q_vectors; i++) { + /* there are 2 cases to handle here and they are the same for + * both TX and RX: + * if the entry was valid previously (coalesce[i].[tr]x_valid + * and the loop variable is less than the number of rings + * allocated, then write the previous values + * + * if the entry was not valid previously, but the number of + * rings is less than are allocated (this means the number of + * rings increased from previously), then write out the + * values in the first element + * + * Also, always write the ITR, even if in ITR_IS_DYNAMIC + * as there is no harm because the dynamic algorithm + * will just overwrite. 
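In short, the fallback rule implemented below can be summarized by a small selector (names illustrative, Rx side shown; the Tx side is symmetric):

        /* Reuse vector i's saved Rx ITR when that vector had an Rx ring
         * before the rebuild, otherwise inherit vector 0's saved value.
         */
        static u16 ice_pick_saved_rx_itr(const struct ice_coalesce_stored *c,
                                         int i)
        {
                return c[i].rx_valid ? c[i].itr_rx : c[0].itr_rx;
        }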
+ */ + if (i < vsi->alloc_rxq && coalesce[i].rx_valid) { + rc = &vsi->q_vectors[i]->rx; + rc->itr_setting = coalesce[i].itr_rx; + ice_write_itr(rc, rc->itr_setting); + } else if (i < vsi->alloc_rxq) { + rc = &vsi->q_vectors[i]->rx; + rc->itr_setting = coalesce[0].itr_rx; + ice_write_itr(rc, rc->itr_setting); + } + + if (i < vsi->alloc_txq && coalesce[i].tx_valid) { + rc = &vsi->q_vectors[i]->tx; + rc->itr_setting = coalesce[i].itr_tx; + ice_write_itr(rc, rc->itr_setting); + } else if (i < vsi->alloc_txq) { + rc = &vsi->q_vectors[i]->tx; + rc->itr_setting = coalesce[0].itr_tx; + ice_write_itr(rc, rc->itr_setting); + } + + vsi->q_vectors[i]->intrl = coalesce[i].intrl; + ice_write_intrl(vsi->q_vectors[i], coalesce[i].intrl); + } + + /* the number of queue vectors increased so write whatever is in + * the first element + */ + for (; i < vsi->num_q_vectors; i++) { + /* transmit */ + rc = &vsi->q_vectors[i]->tx; + rc->itr_setting = coalesce[0].itr_tx; + ice_write_itr(rc, rc->itr_setting); + + /* receive */ + rc = &vsi->q_vectors[i]->rx; + rc->itr_setting = coalesce[0].itr_rx; + ice_write_itr(rc, rc->itr_setting); + + vsi->q_vectors[i]->intrl = coalesce[0].intrl; + ice_write_intrl(vsi->q_vectors[i], coalesce[0].intrl); + } +} + /** * ice_vsi_rebuild - Rebuild VSI after reset * @vsi: VSI to be rebuild + * @init_vsi: is this an initialization or a reconfigure of the VSI * * Returns 0 on success and negative value on failure */ -int ice_vsi_rebuild(struct ice_vsi *vsi) +int ice_vsi_rebuild(struct ice_vsi *vsi, bool init_vsi) { u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 }; + struct ice_coalesce_stored *coalesce; + int prev_num_q_vectors = 0; struct ice_vf *vf = NULL; + enum ice_vsi_type vtype; enum ice_status status; struct ice_pf *pf; int ret, i; @@ -3046,10 +3302,20 @@ int ice_vsi_rebuild(struct ice_vsi *vsi) return -EINVAL; pf = vsi->back; - if (vsi->type == ICE_VSI_VF) + vtype = vsi->type; + if (vtype == ICE_VSI_VF) vf = &pf->vf[vsi->vf_id]; - ice_rm_vsi_lan_cfg(vsi->port_info, vsi->idx); + ice_vsi_init_vlan_ops(vsi); + + coalesce = kcalloc(vsi->num_q_vectors, + sizeof(struct ice_coalesce_stored), GFP_KERNEL); + if (!coalesce) + return -ENOMEM; + + prev_num_q_vectors = ice_vsi_rebuild_get_coalesce(vsi, coalesce); + + ice_rm_vsi_lan_cfg(vsi->port_info, vsi->idx); ice_vsi_free_q_vectors(vsi); /* SR-IOV determines needed MSIX resources all at once instead of per @@ -3057,18 +3323,25 @@ int ice_vsi_rebuild(struct ice_vsi *vsi) * many interrupts each VF needs. SR-IOV MSIX resources are also * cleared in the same manner. 
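For orientation, the XDP handling added to the rebuild path in this hunk brackets the teardown and re-creation of the VSI; condensed (not an additional code path):

        if (ice_is_xdp_ena_vsi(vsi))
                ice_destroy_xdp_rings(vsi);
        /* ... ice_vsi_free_arrays(), ice_vsi_init(), vectors and rings ... */
        if (ice_is_xdp_ena_vsi(vsi)) {
                vsi->num_xdp_txq = vsi->alloc_rxq;
                ret = ice_prepare_xdp_rings(vsi, vsi->xdp_prog);
        }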
*/ - if (vsi->type != ICE_VSI_VF) { + if (vtype != ICE_VSI_VF) { /* reclaim SW interrupts back to the common pool */ ice_free_res(pf->irq_tracker, vsi->base_vector, vsi->idx); pf->num_avail_sw_msix += vsi->num_q_vectors; vsi->base_vector = 0; } +#ifdef HAVE_XDP_SUPPORT + if (ice_is_xdp_ena_vsi(vsi)) + /* return value check can be skipped here, it always returns + * 0 if reset is in progress + */ + ice_destroy_xdp_rings(vsi); +#endif /* HAVE_XDP_SUPPORT */ ice_vsi_put_qs(vsi); ice_vsi_clear_rings(vsi); ice_vsi_free_arrays(vsi); - ice_dev_onetime_setup(&pf->hw); - if (vsi->type == ICE_VSI_VF) + ice_vsi_free_rss_global_lut(vsi); + if (vtype == ICE_VSI_VF) ice_vsi_set_num_qs(vsi, vf->vf_id); else ice_vsi_set_num_qs(vsi, ICE_INVAL_VFID); @@ -3078,15 +3351,21 @@ int ice_vsi_rebuild(struct ice_vsi *vsi) goto err_vsi; ice_vsi_get_qs(vsi); + ice_vsi_alloc_rss_global_lut(vsi); + + ice_alloc_fd_res(vsi); ice_vsi_set_tc_cfg(vsi); /* Initialize VSI struct elements and create VSI in FW */ - ret = ice_vsi_init(vsi); + ret = ice_vsi_init(vsi, init_vsi); if (ret < 0) goto err_vsi; - - switch (vsi->type) { + switch (vtype) { + case ICE_VSI_CTRL: + case ICE_VSI_OFFLOAD_MACVLAN: + case ICE_VSI_VMDQ2: + case ICE_VSI_SWITCHDEV_CTRL: case ICE_VSI_PF: ret = ice_vsi_alloc_q_vectors(vsi); if (ret) @@ -3096,7 +3375,7 @@ int ice_vsi_rebuild(struct ice_vsi *vsi) if (ret) goto err_vectors; - ret = ice_vsi_set_q_vectors_reg_idx(vsi); + ret = ice_vsi_set_q_vectors_reg_idx(vsi, 0); if (ret) goto err_vectors; @@ -3105,19 +3384,34 @@ int ice_vsi_rebuild(struct ice_vsi *vsi) goto err_vectors; ice_vsi_map_rings_to_vectors(vsi); - /* Do not exit if configuring RSS had an issue, at least - * receive traffic on first queue. Hence no need to capture - * return value + ice_vsi_reset_stats(vsi); + /* Perform an initial read of the statistics registers now to + * set the baseline before the VSI becomes operational. */ - if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) - ice_vsi_cfg_rss_lut_key(vsi); + ice_update_eth_stats(vsi); +#ifdef HAVE_XDP_SUPPORT + if (ice_is_xdp_ena_vsi(vsi)) { + vsi->num_xdp_txq = vsi->alloc_rxq; + ret = ice_prepare_xdp_rings(vsi, vsi->xdp_prog); + if (ret) + goto err_vectors; + } +#endif /* HAVE_XDP_SUPPORT */ + /* ICE_VSI_CTRL does not need RSS so skip RSS processing */ + if (vtype != ICE_VSI_CTRL) + /* Do not exit if configuring RSS had an issue, at + * least receive traffic on first queue. Hence no + * need to capture return value + */ + if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) + ice_vsi_cfg_rss_lut_key(vsi); break; case ICE_VSI_VF: ret = ice_vsi_alloc_q_vectors(vsi); if (ret) goto err_rings; - ret = ice_vsi_set_q_vectors_reg_idx(vsi); + ret = ice_vsi_set_q_vectors_reg_idx(vsi, 0); if (ret) goto err_vectors; @@ -3125,37 +3419,73 @@ int ice_vsi_rebuild(struct ice_vsi *vsi) if (ret) goto err_vectors; + ice_vsi_reset_stats(vsi); + break; + case ICE_VSI_CHNL: + if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { + ice_vsi_cfg_rss_lut_key(vsi); + ice_vsi_set_rss_flow_fld(vsi); + } break; default: break; } - /* configure VSI nodes based on number of queues and TC's */ - for (i = 0; i < vsi->tc_cfg.numtc; i++) - max_txqs[i] = vsi->alloc_txq; + for (i = 0; i < vsi->tc_cfg.numtc; i++) { + /* configure VSI nodes based on number of queues and TC's. + * ADQ creates VSIs for each TC/Channel but doesn't + * allocate queues instead it reconfigures the PF queues + * as per the TC command. So max_txqs should point to the + * PF Tx queues. 
+ */ + if (vtype == ICE_VSI_CHNL) + max_txqs[i] = pf->num_lan_tx; + else + max_txqs[i] = vsi->alloc_txq; + +#ifdef HAVE_XDP_SUPPORT + if (ice_is_xdp_ena_vsi(vsi)) + max_txqs[i] += vsi->num_xdp_txq; +#endif /* HAVE_XDP_SUPPORT */ + } + +#ifdef NETIF_F_HW_TC + if (test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) + /* If MQPRIO ise set, means channel code path, hence for main + * VSI's, use TC as 1 + */ + status = ice_cfg_vsi_lan(vsi->port_info, vsi->idx, 1, max_txqs); + else +#endif /* NETIF_F_HW_TC */ + status = ice_cfg_vsi_lan(vsi->port_info, vsi->idx, + vsi->tc_cfg.ena_tc, max_txqs); - status = ice_cfg_vsi_lan(vsi->port_info, vsi->idx, vsi->tc_cfg.ena_tc, - max_txqs); if (status) { - dev_err(&pf->pdev->dev, - "VSI %d failed lan queue config, error %d\n", - vsi->vsi_num, status); - goto err_vectors; + dev_err(ice_pf_to_dev(pf), "VSI %d failed lan queue config, error %s\n", + vsi->vsi_num, ice_stat_str(status)); + if (init_vsi) { + ret = -EIO; + goto err_vectors; + } else { + return ice_schedule_reset(pf, ICE_RESET_PFR); + } } + + ice_vsi_rebuild_set_coalesce(vsi, coalesce, prev_num_q_vectors); + kfree(coalesce); + return 0; err_vectors: ice_vsi_free_q_vectors(vsi); err_rings: - if (vsi->netdev) { - vsi->current_netdev_flags = 0; - unregister_netdev(vsi->netdev); - free_netdev(vsi->netdev); - vsi->netdev = NULL; - } + ice_vsi_clear_rings(vsi); err_vsi: - ice_vsi_clear(vsi); - set_bit(__ICE_RESET_FAILED, pf->state); + if (init_vsi) + ice_vsi_delete(vsi); + ice_vsi_put_qs(vsi); + set_bit(ICE_RESET_FAILED, pf->state); + kfree(coalesce); return ret; } @@ -3165,13 +3495,47 @@ int ice_vsi_rebuild(struct ice_vsi *vsi) */ bool ice_is_reset_in_progress(unsigned long *state) { - return test_bit(__ICE_RESET_OICR_RECV, state) || - test_bit(__ICE_PFR_REQ, state) || - test_bit(__ICE_CORER_REQ, state) || - test_bit(__ICE_GLOBR_REQ, state); + return test_bit(ICE_RESET_OICR_RECV, state) || + test_bit(ICE_PFR_REQ, state) || + test_bit(ICE_CORER_REQ, state) || + test_bit(ICE_GLOBR_REQ, state); +} + +/** + * ice_wait_for_reset - Wait for driver to finish reset and rebuild + * @pf: pointer to the PF structure + * @timeout: length of time to wait, in jiffies + * + * Wait (sleep) for a short time until the driver finishes cleaning up from + * a device reset. The caller must be able to sleep. Use this to delay + * operations that could fail while the driver is cleaning up after a device + * reset. + * + * Returns 0 on success, -EBUSY if the reset is not finished within the + * timeout, and -ERESTARTSYS if the thread was interrupted. 
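A hypothetical caller of the new ice_wait_for_reset(), built only on the return values documented above; the 10 second timeout is an arbitrary example:

        int err = ice_wait_for_reset(pf, 10 * HZ);

        if (err == -EBUSY)
                dev_warn(ice_pf_to_dev(pf),
                         "reset did not finish within the timeout\n");
        else if (err)
                return err;     /* interrupted while waiting */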
+ */ +int ice_wait_for_reset(struct ice_pf *pf, unsigned long timeout) +{ + long ret; + +#ifdef __CHECKER__ + /* Suppress sparse warning from kernel macro: + * warning: symbol '__ret' shadows an earlier one + */ + ret = timeout; +#else + ret = wait_event_interruptible_timeout(pf->reset_wait_queue, + !ice_is_reset_in_progress(pf->state), + timeout); +#endif + if (ret < 0) + return ret; + else if (!ret) + return -EBUSY; + else + return 0; } -#ifdef CONFIG_DCB /** * ice_vsi_update_q_map - update our copy of the VSI info with new queue map * @vsi: VSI being configured @@ -3186,6 +3550,156 @@ static void ice_vsi_update_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctx) sizeof(vsi->info.tc_mapping)); } +/** + * ice_vsi_cfg_netdev_tc - Setup the netdev TC configuration + * @vsi: the VSI being configured + * @ena_tc: TC map to be enabled + */ +void ice_vsi_cfg_netdev_tc(struct ice_vsi *vsi, u8 ena_tc) +{ + struct net_device *netdev = vsi->netdev; + struct ice_pf *pf = vsi->back; + int numtc = vsi->tc_cfg.numtc; + struct ice_dcbx_cfg *dcbcfg; + u8 netdev_tc; + int i; + + if (!netdev) + return; + +#ifdef NETIF_F_HW_TC + /* CHNL VSI doesn't have it's own netdev, hence, no netdev_tc */ + if (vsi->type == ICE_VSI_CHNL) + return; +#endif /* NETIF_F_HW_TC */ + + if (!ena_tc) { + netdev_reset_tc(netdev); + return; + } + +#ifdef NETIF_F_HW_TC + if (vsi->type == ICE_VSI_PF && ice_is_adq_active(pf)) + numtc = vsi->all_numtc; +#endif /* NETIF_F_HW_TC */ + + if (netdev_set_num_tc(netdev, numtc)) + return; + + dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; + + ice_for_each_traffic_class(i) + if (vsi->tc_cfg.ena_tc & BIT(i)) + netdev_set_tc_queue(netdev, + vsi->tc_cfg.tc_info[i].netdev_tc, + vsi->tc_cfg.tc_info[i].qcount_tx, + vsi->tc_cfg.tc_info[i].qoffset); +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + /* setup TC queue map for CHNL TCs */ + ice_for_each_chnl_tc(i) { + if (!(vsi->all_enatc & BIT(i))) + break; + if (!vsi->mqprio_qopt.qopt.count[i]) + break; + netdev_set_tc_queue(netdev, i, + vsi->mqprio_qopt.qopt.count[i], + vsi->mqprio_qopt.qopt.offset[i]); + } + +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ +#ifdef NETIF_F_HW_TC + if (test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) + return; +#endif /* NETIF_F_HW_TC */ + + for (i = 0; i < ICE_MAX_USER_PRIORITY; i++) { + u8 ets_tc = dcbcfg->etscfg.prio_table[i]; + + /* Get the mapped netdev TC# for the UP */ + netdev_tc = vsi->tc_cfg.tc_info[ets_tc].netdev_tc; + netdev_set_prio_tc_map(netdev, i, netdev_tc); + } +} + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +/** + * ice_vsi_setup_q_map_mqprio - Prepares mqprio based tc_config + * @vsi: the VSI being configured, + * @ctxt: VSI context structure + * @ena_tc: number of traffic classes to enable + * + * Prepares VSI tc_config to have queue configurations based on MQPRIO options. + */ +static void +ice_vsi_setup_q_map_mqprio(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt, + u8 ena_tc) +{ + u16 pow, offset = 0, qcount_tx = 0, qcount_rx = 0, qmap; + u16 tc0_offset = vsi->mqprio_qopt.qopt.offset[0]; + int tc0_qcount = vsi->mqprio_qopt.qopt.count[0]; + u8 netdev_tc = 0; + int i; + + vsi->tc_cfg.ena_tc = ena_tc ? 
ena_tc : 1; + + pow = order_base_2(tc0_qcount); + qmap = ((tc0_offset << ICE_AQ_VSI_TC_Q_OFFSET_S) & + ICE_AQ_VSI_TC_Q_OFFSET_M) | + ((pow << ICE_AQ_VSI_TC_Q_NUM_S) & ICE_AQ_VSI_TC_Q_NUM_M); + + ice_for_each_traffic_class(i) { + if (!(vsi->tc_cfg.ena_tc & BIT(i))) { + /* TC is not enabled */ + vsi->tc_cfg.tc_info[i].qoffset = 0; + vsi->tc_cfg.tc_info[i].qcount_rx = 1; + vsi->tc_cfg.tc_info[i].qcount_tx = 1; + vsi->tc_cfg.tc_info[i].netdev_tc = 0; + ctxt->info.tc_mapping[i] = 0; + continue; + } + + offset = vsi->mqprio_qopt.qopt.offset[i]; + qcount_rx = vsi->mqprio_qopt.qopt.count[i]; + qcount_tx = vsi->mqprio_qopt.qopt.count[i]; + vsi->tc_cfg.tc_info[i].qoffset = offset; + vsi->tc_cfg.tc_info[i].qcount_rx = qcount_rx; + vsi->tc_cfg.tc_info[i].qcount_tx = qcount_tx; + vsi->tc_cfg.tc_info[i].netdev_tc = netdev_tc++; + } + + if (vsi->all_numtc && vsi->all_numtc != vsi->tc_cfg.numtc) { + ice_for_each_chnl_tc(i) { + if (!(vsi->all_enatc & BIT(i))) + continue; + offset = vsi->mqprio_qopt.qopt.offset[i]; + qcount_rx = vsi->mqprio_qopt.qopt.count[i]; + qcount_tx = vsi->mqprio_qopt.qopt.count[i]; + } + } + + /* Set actual Tx/Rx queue pairs */ + vsi->num_txq = offset + qcount_tx; + vsi->num_rxq = offset + qcount_rx; + + /* Setup queue TC[0].qmap for given VSI context */ + ctxt->info.tc_mapping[0] = cpu_to_le16(qmap); + ctxt->info.q_mapping[0] = cpu_to_le16(vsi->rxq_map[0]); + ctxt->info.q_mapping[1] = cpu_to_le16(tc0_qcount); + + /* Find queue count available for channel VSIs and starting offset + * for channel VSIs + */ + if (tc0_qcount && tc0_qcount < vsi->num_rxq) { + vsi->cnt_q_avail = vsi->num_rxq - tc0_qcount; + vsi->next_base_q = tc0_qcount; + } + dev_dbg(ice_pf_to_dev(vsi->back), "vsi->num_txq = %d\n", vsi->num_txq); + dev_dbg(ice_pf_to_dev(vsi->back), "vsi->num_rxq = %d\n", vsi->num_rxq); + dev_dbg(ice_pf_to_dev(vsi->back), "%s: all_numtc %u, all_enatc: 0x%04x, tc_cfg.numtc %u\n", + __func__, vsi->all_numtc, vsi->all_enatc, vsi->tc_cfg.numtc); +} +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + /** * ice_vsi_cfg_tc - Configure VSI Tx Sched for given TC map * @vsi: VSI to be configured @@ -3196,48 +3710,81 @@ static void ice_vsi_update_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctx) int ice_vsi_cfg_tc(struct ice_vsi *vsi, u8 ena_tc) { u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 }; - struct ice_vsi_ctx *ctx; struct ice_pf *pf = vsi->back; + struct ice_vsi_ctx *ctx; enum ice_status status; + struct device *dev; int i, ret = 0; u8 num_tc = 0; + dev = ice_pf_to_dev(pf); +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + if (vsi->tc_cfg.ena_tc == ena_tc && + vsi->mqprio_qopt.mode != TC_MQPRIO_MODE_CHANNEL) + return ret; +#else + if (vsi->tc_cfg.ena_tc == ena_tc) + return 0; +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + ice_for_each_traffic_class(i) { /* build bitmap of enabled TCs */ if (ena_tc & BIT(i)) num_tc++; /* populate max_txqs per TC */ max_txqs[i] = vsi->alloc_txq; +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + /* Update max_txqs if it is CHNL VSI, because alloc_t[r]xq are + * zero for CHNL VSI, hence use num_txq instead as max_txqs + */ + if (vsi->type == ICE_VSI_CHNL && + test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) + max_txqs[i] = vsi->num_txq; +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ } vsi->tc_cfg.ena_tc = ena_tc; vsi->tc_cfg.numtc = num_tc; - ctx = devm_kzalloc(&pf->pdev->dev, sizeof(*ctx), GFP_KERNEL); + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->vf_num = 0; ctx->info = vsi->info; +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + if (vsi->type == ICE_VSI_PF 
&& + test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) + ice_vsi_setup_q_map_mqprio(vsi, ctx, ena_tc); + else + ice_vsi_setup_q_map(vsi, ctx); +#else ice_vsi_setup_q_map(vsi, ctx); +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ /* must to indicate which section of VSI context are being modified */ ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_RXQ_MAP_VALID); status = ice_update_vsi(&pf->hw, vsi->idx, ctx, NULL); if (status) { - dev_info(&pf->pdev->dev, "Failed VSI Update\n"); + dev_info(dev, "Failed VSI Update\n"); ret = -EIO; goto out; } - status = ice_cfg_vsi_lan(vsi->port_info, vsi->idx, vsi->tc_cfg.ena_tc, - max_txqs); +#ifdef NETIF_F_HW_TC + if (vsi->type == ICE_VSI_PF && + test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) + status = ice_cfg_vsi_lan(vsi->port_info, vsi->idx, 1, + max_txqs); + else +#endif /* NETIF_F_HW_TC */ + status = ice_cfg_vsi_lan(vsi->port_info, vsi->idx, + vsi->tc_cfg.ena_tc, max_txqs); if (status) { - dev_err(&pf->pdev->dev, - "VSI %d failed TC config, error %d\n", - vsi->vsi_num, status); + dev_err(dev, "VSI %d failed TC config, error %s\n", + vsi->vsi_num, ice_stat_str(status)); ret = -EIO; goto out; } @@ -3246,56 +3793,697 @@ int ice_vsi_cfg_tc(struct ice_vsi *vsi, u8 ena_tc) ice_vsi_cfg_netdev_tc(vsi, ena_tc); out: - devm_kfree(&pf->pdev->dev, ctx); + kfree(ctx); return ret; } -#endif /* CONFIG_DCB */ /** - * ice_nvm_version_str - format the NVM version strings - * @hw: ptr to the hardware info + * ice_update_ring_stats - Update ring statistics + * @ring: ring to update + * @pkts: number of processed packets + * @bytes: number of processed bytes + * + * This function assumes that caller has acquired a u64_stats_sync lock. + */ +static void ice_update_ring_stats(struct ice_ring *ring, u64 pkts, u64 bytes) +{ + ring->stats.bytes += bytes; + ring->stats.pkts += pkts; +} + +#ifdef ADQ_PERF_COUNTERS +/** + * ice_update_adq_stats - Update channel's counters for busy poll and NAPI + * @ring: ring to update + * @pkts: number of processed packets + * + * This function assumes that caller has acquired a u64_stats_sync lock. 
+ */ +static void ice_update_adq_stats(struct ice_ring *ring, u64 pkts) +{ + /* separate accounting of packets (either from busy_poll or + * napi_poll depending upon state of vector specific + * flags 'in_bp', 'prev_in_bp') + */ + if (ring->q_vector->state_flags & ICE_CHNL_IN_BP) { + ring->ch_q_stats.poll.bp_packets += pkts; + } else { + if (ring->q_vector->state_flags & ICE_CHNL_PREV_IN_BP) + ring->ch_q_stats.poll.bp_packets += pkts; + else + ring->ch_q_stats.poll.np_packets += pkts; + } +} +#endif /* ADQ_PERF_COUNTERS */ + +/** + * ice_update_tx_ring_stats - Update Tx ring specific counters + * @tx_ring: ring to update + * @pkts: number of processed packets + * @bytes: number of processed bytes + */ +void ice_update_tx_ring_stats(struct ice_ring *tx_ring, u64 pkts, u64 bytes) +{ + u64_stats_update_begin(&tx_ring->syncp); + ice_update_ring_stats(tx_ring, pkts, bytes); +#ifdef ADQ_PERF_COUNTERS + ice_update_adq_stats(tx_ring, pkts); +#endif /* ADQ_PERF_COUNTERS */ + u64_stats_update_end(&tx_ring->syncp); +} + +/** + * ice_update_rx_ring_stats - Update Rx ring specific counters + * @rx_ring: ring to update + * @pkts: number of processed packets + * @bytes: number of processed bytes + */ +void ice_update_rx_ring_stats(struct ice_ring *rx_ring, u64 pkts, u64 bytes) +{ + u64_stats_update_begin(&rx_ring->syncp); + ice_update_ring_stats(rx_ring, pkts, bytes); +#ifdef ADQ_PERF_COUNTERS + ice_update_adq_stats(rx_ring, pkts); +#endif /* ADQ_PERF_COUNTERS */ + + /* if vector is transitioning from BP->INT (due to busy_poll_stop()) and + * we find no packets, in that case: to avoid entering into INTR mode + * (which happens from napi_poll - enabling interrupt if + * unlikely_comeback_to_bp getting set), make "prev_data_pkt_recv" to be + * non-zero, so that interrupts won't be enabled.
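The Tx/Rx updaters above only show the writer half of the u64_stats_sync protocol: a single writer per ring brackets the 64-bit counter bumps with u64_stats_update_begin()/u64_stats_update_end() so readers on 32-bit kernels never see a torn value. The matching reader side is not part of this hunk; the sketch below shows the usual begin/retry loop, with the struct and function names invented for illustration (the driver's real readers live in its stats and ethtool paths).

#include <linux/types.h>
#include <linux/u64_stats_sync.h>

struct demo_ring {
	struct u64_stats_sync syncp;
	struct {
		u64 pkts;
		u64 bytes;
	} stats;
};

/* Writer side, the shape of ice_update_tx/rx_ring_stats(): one writer per
 * ring, so the seqcount alone makes the 64-bit pair appear atomic to
 * readers on 32-bit kernels.
 */
static void demo_ring_update(struct demo_ring *ring, u64 pkts, u64 bytes)
{
	u64_stats_update_begin(&ring->syncp);
	ring->stats.pkts += pkts;
	ring->stats.bytes += bytes;
	u64_stats_update_end(&ring->syncp);
}

/* Reader side (illustrative name): retry until a consistent snapshot of
 * both counters has been observed.
 */
static void demo_ring_fetch(struct demo_ring *ring, u64 *pkts, u64 *bytes)
{
	unsigned int start;

	do {
		start = u64_stats_fetch_begin(&ring->syncp);
		*pkts = ring->stats.pkts;
		*bytes = ring->stats.bytes;
	} while (u64_stats_fetch_retry(&ring->syncp, start));
}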
This is to address the + * issue where num_force_wb on some queues is 2 to 3 times higher than + * other queues and those queues also sees lot of interrupts + */ + if (ice_ring_ch_enabled(rx_ring) && + ice_vector_busypoll_intr(rx_ring->q_vector)) { + if (!pkts) + rx_ring->q_vector->state_flags |= + ICE_CHNL_PREV_DATA_PKT_RECV; +#ifdef ADQ_PERF_COUNTERS + } else if (ice_ring_ch_enabled(rx_ring)) { + struct ice_q_vector *q_vector = rx_ring->q_vector; + + if (pkts && + !(q_vector->state_flags & ICE_CHNL_PREV_DATA_PKT_RECV)) + rx_ring->ch_q_stats.rx.num_only_ctrl_pkts++; + if (q_vector->state_flags & ICE_CHNL_IN_BP && + !(q_vector->state_flags & ICE_CHNL_PREV_DATA_PKT_RECV)) + rx_ring->ch_q_stats.rx.num_no_data_pkt_bp++; + } +#else + } +#endif /* ADQ_PERF_COUNTERS */ + + u64_stats_update_end(&rx_ring->syncp); +} + +/** + * ice_status_to_errno - convert from enum ice_status to Linux errno + * @err: ice_status value to convert */ -char *ice_nvm_version_str(struct ice_hw *hw) +int ice_status_to_errno(enum ice_status err) { - u8 oem_ver, oem_patch, ver_hi, ver_lo; - static char buf[ICE_NVM_VER_LEN]; - u16 oem_build; + switch (err) { + case ICE_SUCCESS: + return 0; + case ICE_ERR_DOES_NOT_EXIST: + return -ENOENT; + case ICE_ERR_OUT_OF_RANGE: + return -ENOTTY; + case ICE_ERR_PARAM: + return -EINVAL; + case ICE_ERR_NO_MEMORY: + return -ENOMEM; + case ICE_ERR_MAX_LIMIT: + return -EAGAIN; + default: + return -EINVAL; + } +} - ice_get_nvm_version(hw, &oem_ver, &oem_build, &oem_patch, &ver_hi, - &ver_lo); - snprintf(buf, sizeof(buf), "%x.%02x 0x%x %d.%d.%d", ver_hi, ver_lo, - hw->nvm.eetrack, oem_ver, oem_build, oem_patch); +/** + * ice_is_dflt_vsi_in_use - check if the default forwarding VSI is being used + * @sw: switch to check if its default forwarding VSI is free + * + * Return true if the default forwarding VSI is already being used, else returns + * false signalling that it's available to use. + */ +bool ice_is_dflt_vsi_in_use(struct ice_sw *sw) +{ + return (sw->dflt_vsi && sw->dflt_vsi_ena); +} - return buf; +/** + * ice_is_vsi_dflt_vsi - check if the VSI passed in is the default VSI + * @sw: switch for the default forwarding VSI to compare against + * @vsi: VSI to compare against default forwarding VSI + * + * If this VSI passed in is the default forwarding VSI then return true, else + * return false + */ +bool ice_is_vsi_dflt_vsi(struct ice_sw *sw, struct ice_vsi *vsi) +{ + return (sw->dflt_vsi == vsi && sw->dflt_vsi_ena); } /** - * ice_vsi_cfg_mac_fltr - Add or remove a MAC address filter for a VSI - * @vsi: the VSI being configured MAC filter - * @macaddr: the MAC address to be added. - * @set: Add or delete a MAC filter + * ice_set_dflt_vsi - set the default forwarding VSI + * @sw: switch used to assign the default forwarding VSI + * @vsi: VSI getting set as the default forwarding VSI on the switch * - * Adds or removes MAC address filter entry for VF VSI + * If the VSI passed in is already the default VSI and it's enabled just return + * success. + * + * If there is already a default VSI on the switch and it's enabled then return + * -EEXIST since there can only be one default VSI per switch. + * + * Otherwise try to set the VSI passed in as the switch's default VSI and + * return the result. 
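The default-forwarding-VSI helpers in this hunk (ice_is_dflt_vsi_in_use(), ice_is_vsi_dflt_vsi(), and ice_set_dflt_vsi()/ice_clear_dflt_vsi() just below) amount to a small state machine over the pair (dflt_vsi, dflt_vsi_ena). A standalone model of the documented return-value behaviour follows; the types and the hardware call sites are deliberately simplified stand-ins, not the driver's own.

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stddef.h>

struct vsi { int num; };

struct sw_state {                      /* stand-in for struct ice_sw */
	struct vsi *dflt_vsi;
	bool dflt_vsi_ena;
};

static bool dflt_in_use(const struct sw_state *sw)
{
	return sw->dflt_vsi && sw->dflt_vsi_ena;
}

static int set_dflt(struct sw_state *sw, struct vsi *vsi)
{
	if (!sw || !vsi)
		return -EINVAL;
	if (sw->dflt_vsi == vsi && sw->dflt_vsi_ena)
		return 0;              /* already the default, nothing to do */
	if (dflt_in_use(sw))
		return -EEXIST;        /* only one default VSI per switch */
	/* the driver programs the Rx rule here via ice_cfg_dflt_vsi() */
	sw->dflt_vsi = vsi;
	sw->dflt_vsi_ena = true;
	return 0;
}

static int clear_dflt(struct sw_state *sw)
{
	if (!sw)
		return -EINVAL;
	if (!dflt_in_use(sw))
		return -ENODEV;        /* nothing to clear */
	/* the driver removes the Rx rule here */
	sw->dflt_vsi = NULL;
	sw->dflt_vsi_ena = false;
	return 0;
}

int main(void)
{
	struct sw_state sw = { NULL, false };
	struct vsi a = { 1 }, b = { 2 };

	assert(set_dflt(&sw, &a) == 0);
	assert(set_dflt(&sw, &a) == 0);        /* idempotent for the same VSI */
	assert(set_dflt(&sw, &b) == -EEXIST);  /* one default at a time */
	assert(clear_dflt(&sw) == 0);
	assert(clear_dflt(&sw) == -ENODEV);
	return 0;
}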
*/ -enum ice_status -ice_vsi_cfg_mac_fltr(struct ice_vsi *vsi, const u8 *macaddr, bool set) +int ice_set_dflt_vsi(struct ice_sw *sw, struct ice_vsi *vsi) { - LIST_HEAD(tmp_add_list); enum ice_status status; + struct device *dev; - /* Update MAC filter list to be added or removed for a VSI */ - if (ice_add_mac_to_list(vsi, &tmp_add_list, macaddr)) { - status = ICE_ERR_NO_MEMORY; - goto cfg_mac_fltr_exit; + if (!sw || !vsi) + return -EINVAL; + + dev = ice_pf_to_dev(vsi->back); + + /* the VSI passed in is already the default VSI */ + if (ice_is_vsi_dflt_vsi(sw, vsi)) { + dev_dbg(dev, "VSI %d passed in is already the default forwarding VSI, nothing to do\n", + vsi->vsi_num); + return 0; + } + + /* another VSI is already the default VSI for this switch */ + if (ice_is_dflt_vsi_in_use(sw)) { + dev_err(dev, "Default forwarding VSI %d already in use, disable it and try again\n", + sw->dflt_vsi->vsi_num); + return -EEXIST; } - if (set) - status = ice_add_mac(&vsi->back->hw, &tmp_add_list); + status = ice_cfg_dflt_vsi(vsi->port_info, vsi->idx, true, ICE_FLTR_RX); + if (status) { + dev_err(dev, "Failed to set VSI %d as the default forwarding VSI, error %s\n", + vsi->vsi_num, ice_stat_str(status)); + return -EIO; + } + + sw->dflt_vsi = vsi; + sw->dflt_vsi_ena = true; + + return 0; +} + +/** + * ice_clear_dflt_vsi - clear the default forwarding VSI + * @sw: switch used to clear the default VSI + * + * If the switch has no default VSI or it's not enabled then return error. + * + * Otherwise try to clear the default VSI and return the result. + */ +int ice_clear_dflt_vsi(struct ice_sw *sw) +{ + struct ice_vsi *dflt_vsi; + enum ice_status status; + struct device *dev; + + if (!sw) + return -EINVAL; + + dev = ice_pf_to_dev(sw->pf); + + dflt_vsi = sw->dflt_vsi; + + /* there is no default VSI configured */ + if (!ice_is_dflt_vsi_in_use(sw)) + return -ENODEV; + + status = ice_cfg_dflt_vsi(dflt_vsi->port_info, dflt_vsi->idx, false, + ICE_FLTR_RX); + if (status) { + dev_err(dev, "Failed to clear the default forwarding VSI %d, error %s\n", + dflt_vsi->vsi_num, ice_stat_str(status)); + return -EIO; + } + + sw->dflt_vsi = NULL; + sw->dflt_vsi_ena = false; + + return 0; +} + + +/** + * ice_get_link_speed_mbps - get link speed in Mbps + * @vsi: the VSI whose link speed is being queried + * + * Return current VSI link speed, else ICE_LINK_SPEED_UNKNOWN (0) is + * returned. + */ +int ice_get_link_speed_mbps(struct ice_vsi *vsi) +{ + switch (vsi->port_info->phy.link_info.link_speed) { + case ICE_AQ_LINK_SPEED_100GB: + return ICE_LINK_SPEED_100000MBPS; + case ICE_AQ_LINK_SPEED_50GB: + return ICE_LINK_SPEED_50000MBPS; + case ICE_AQ_LINK_SPEED_40GB: + return ICE_LINK_SPEED_40000MBPS; + case ICE_AQ_LINK_SPEED_25GB: + return ICE_LINK_SPEED_25000MBPS; + case ICE_AQ_LINK_SPEED_20GB: + return ICE_LINK_SPEED_20000MBPS; + case ICE_AQ_LINK_SPEED_10GB: + return ICE_LINK_SPEED_10000MBPS; + case ICE_AQ_LINK_SPEED_5GB: + return ICE_LINK_SPEED_5000MBPS; + case ICE_AQ_LINK_SPEED_2500MB: + return ICE_LINK_SPEED_2500MBPS; + case ICE_AQ_LINK_SPEED_1000MB: + return ICE_LINK_SPEED_1000MBPS; + case ICE_AQ_LINK_SPEED_100MB: + return ICE_LINK_SPEED_100MBPS; + case ICE_AQ_LINK_SPEED_10MB: + return ICE_LINK_SPEED_10MBPS; + case ICE_AQ_LINK_SPEED_UNKNOWN: + default: + return ICE_LINK_SPEED_UNKNOWN; + } +} + + +/** + * ice_get_link_speed_kbps - get link speed in Kbps + * @vsi: the VSI whose link speed is being queried + * + * Return current VSI link speed, else ICE_LINK_SPEED_UNKNOWN (0) is + * returned. 
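ice_get_link_speed_mbps() above maps the admin-queue link speed codes to Mbps with a switch statement. An equivalent table-driven layout is sketched below purely as a design alternative; the enum values are sequential placeholders (the real ICE_AQ_LINK_SPEED_* symbols are bit flags defined in ice_adminq_cmd.h), so this is not a drop-in replacement.

#include <stdint.h>

/* Sequential placeholder codes; the real ICE_AQ_LINK_SPEED_* values are
 * bit flags, so an array indexed this way would not work there as-is.
 */
enum aq_speed {
	AQ_10MB, AQ_100MB, AQ_1000MB, AQ_2500MB, AQ_5GB, AQ_10GB,
	AQ_20GB, AQ_25GB, AQ_40GB, AQ_50GB, AQ_100GB, AQ_SPEED_NUM
};

static const uint32_t speed_mbps[AQ_SPEED_NUM] = {
	[AQ_10MB] = 10,      [AQ_100MB] = 100,    [AQ_1000MB] = 1000,
	[AQ_2500MB] = 2500,  [AQ_5GB] = 5000,     [AQ_10GB] = 10000,
	[AQ_20GB] = 20000,   [AQ_25GB] = 25000,   [AQ_40GB] = 40000,
	[AQ_50GB] = 50000,   [AQ_100GB] = 100000,
};

/* 0 plays the role of ICE_LINK_SPEED_UNKNOWN, as in the driver. */
static uint32_t link_speed_mbps(enum aq_speed code)
{
	if ((unsigned int)code >= AQ_SPEED_NUM)
		return 0;
	return speed_mbps[code];
}

/* Kbps, matching ice_get_link_speed_kbps(): 100 Gb/s is 100,000,000 Kbps,
 * which still fits comfortably in a 32-bit int.
 */
static uint32_t link_speed_kbps(enum aq_speed code)
{
	return link_speed_mbps(code) * 1000u;
}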
+ */ +int ice_get_link_speed_kbps(struct ice_vsi *vsi) +{ + int speed_mbps; + + speed_mbps = ice_get_link_speed_mbps(vsi); + + return speed_mbps * 1000; +} + +/** + * ice_set_min_bw_limit - setup minimum BW limit for Tx based on min_tx_rate + * @vsi: VSI to be configured + * @min_tx_rate: min Tx rate in Kbps to be configured as BW limit + * + * If the min_tx_rate is specified as 0 that means to clear the minimum BW limit + * profile, otherwise a non-zero value will force a minimum BW limit for the VSI + * on TC 0. + */ +int ice_set_min_bw_limit(struct ice_vsi *vsi, u64 min_tx_rate) +{ + struct ice_pf *pf = vsi->back; + enum ice_status status; + struct device *dev; + int speed; + + dev = ice_pf_to_dev(pf); + if (!vsi->port_info) { + dev_dbg(dev, "VSI %d, type %u specified doesn't have valid port_info\n", + vsi->idx, vsi->type); + return -EINVAL; + } + + speed = ice_get_link_speed_kbps(vsi); + if (min_tx_rate > (u64)speed) { + dev_err(dev, "invalid min Tx rate %llu Kbps specified for %s %d is greater than current link speed %u Kbps\n", + min_tx_rate, ice_vsi_type_str(vsi->type), vsi->idx, + speed); + return -EINVAL; + } + + /* Configure min BW for VSI limit */ + if (min_tx_rate) { + status = ice_cfg_vsi_bw_lmt_per_tc(vsi->port_info, vsi->idx, 0, + ICE_MIN_BW, min_tx_rate); + if (status) { + dev_err(dev, "failed to set min Tx rate(%llu Kbps) for %s %d\n", + min_tx_rate, ice_vsi_type_str(vsi->type), + vsi->idx); + return -EIO; + } + + dev_dbg(dev, "set min Tx rate(%llu Kbps) for %s\n", + min_tx_rate, ice_vsi_type_str(vsi->type)); + } else { + status = ice_cfg_vsi_bw_dflt_lmt_per_tc(vsi->port_info, + vsi->idx, 0, + ICE_MIN_BW); + if (status) { + dev_err(dev, "failed to clear min Tx rate configuration for %s %d\n", + ice_vsi_type_str(vsi->type), vsi->idx); + return -EIO; + } + + dev_dbg(dev, "cleared min Tx rate configuration for %s %d\n", + ice_vsi_type_str(vsi->type), vsi->idx); + } + + return 0; +} + +/** + * ice_set_max_bw_limit - setup maximum BW limit for Tx based on max_tx_rate + * @vsi: VSI to be configured + * @max_tx_rate: max Tx rate in Kbps to be configured as BW limit + * + * If the max_tx_rate is specified as 0 that means to clear the maximum BW limit + * profile, otherwise a non-zero value will force a maximum BW limit for the VSI + * on TC 0. 
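ice_set_min_bw_limit() above and ice_set_max_bw_limit() that follows share one control flow: bail out without valid port info, reject a rate above the current link speed, treat zero as a request to clear the profile, and otherwise program the scheduler. The compressed model below keeps only that flow; the two scheduler calls are stubs, and the error handling is simplified relative to the driver.

#include <errno.h>
#include <stdint.h>

/* Stubs standing in for the scheduler wrappers used by the driver
 * (ice_cfg_vsi_bw_lmt_per_tc / ice_cfg_vsi_bw_dflt_lmt_per_tc).
 */
static int program_bw_limit(uint64_t rate_kbps) { (void)rate_kbps; return 0; }
static int clear_bw_limit(void) { return 0; }

/* rate_kbps == 0 means "remove the limit"; anything else must not exceed
 * the current link speed, also expressed in Kbps.
 */
static int set_bw_limit(uint64_t rate_kbps, uint64_t link_kbps)
{
	if (rate_kbps > link_kbps)
		return -EINVAL;           /* can't exceed current link speed */
	if (!rate_kbps)
		return clear_bw_limit();  /* zero clears the BW profile */
	return program_bw_limit(rate_kbps);
}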
+ */ +int ice_set_max_bw_limit(struct ice_vsi *vsi, u64 max_tx_rate) +{ + struct ice_pf *pf = vsi->back; + enum ice_status status; + struct device *dev; + int speed; + + dev = ice_pf_to_dev(pf); + if (!vsi->port_info) { + dev_dbg(dev, "VSI %d, type %u specified doesn't have valid port_info\n", + vsi->idx, vsi->type); + return -EINVAL; + } + + speed = ice_get_link_speed_kbps(vsi); + if (max_tx_rate > (u64)speed) { + dev_err(dev, "invalid max Tx rate %llu Kbps specified for %s %d is greater than current link speed %u Kbps\n", + max_tx_rate, ice_vsi_type_str(vsi->type), vsi->idx, + speed); + return -EINVAL; + } + + /* Configure max BW for VSI limit */ + if (max_tx_rate) { + status = ice_cfg_vsi_bw_lmt_per_tc(vsi->port_info, vsi->idx, 0, + ICE_MAX_BW, max_tx_rate); + if (status) { + dev_err(dev, "failed setting max Tx rate(%llu Kbps) for %s %d\n", + max_tx_rate, ice_vsi_type_str(vsi->type), + vsi->idx); + return -EIO; + } + + dev_dbg(dev, "set max Tx rate(%llu Kbps) for %s %d\n", + max_tx_rate, ice_vsi_type_str(vsi->type), vsi->idx); + } else { + status = ice_cfg_vsi_bw_dflt_lmt_per_tc(vsi->port_info, + vsi->idx, 0, + ICE_MAX_BW); + if (status) { + dev_err(dev, "failed clearing max Tx rate configuration for %s %d\n", + ice_vsi_type_str(vsi->type), vsi->idx); + return -EIO; + } + + dev_dbg(dev, "cleared max Tx rate configuration for %s %d\n", + ice_vsi_type_str(vsi->type), vsi->idx); + } + + return 0; +} + +/** + * ice_set_link - turn on/off physical link + * @vsi: VSI to modify physical link on + * @ena: turn on/off physical link + */ +int ice_set_link(struct ice_vsi *vsi, bool ena) +{ + struct device *dev = ice_pf_to_dev(vsi->back); + struct ice_port_info *pi = vsi->port_info; + struct ice_hw *hw = pi->hw; + enum ice_status status; + + if (vsi->type != ICE_VSI_PF) + return -EINVAL; + + status = ice_aq_set_link_restart_an(pi, ena, NULL); + + /* if link is owned by manageability, FW will return ICE_AQ_RC_EMODE. + * this is not a fatal error, so print a warning message and return + * a success code. Return an error if FW returns an error code other + * than ICE_AQ_RC_EMODE + */ + if (status == ICE_ERR_AQ_ERROR) { + if (hw->adminq.sq_last_status == ICE_AQ_RC_EMODE) + dev_warn(dev, "can't set link to %s, err %s aq_err %s. not fatal, continuing\n", + (ena ? "ON" : "OFF"), ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + } else if (status) { + dev_err(dev, "can't set link to %s, err %s aq_err %s\n", + (ena ? 
"ON" : "OFF"), ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + return -EIO; + } + + return 0; +} + +/** + * ice_vsi_update_security - update security block in VSI + * @vsi: pointer to VSI structure + * @fill: function pointer to fill ctx + */ +int ice_vsi_update_security(struct ice_vsi *vsi, + void (*fill)(struct ice_vsi_ctx *)) +{ + struct ice_vsi_ctx ctx = { 0 }; + + ctx.info = vsi->info; + ctx.info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID); + fill(&ctx); + + if (ice_update_vsi(&vsi->back->hw, vsi->idx, &ctx, NULL)) + return -ENODEV; + + vsi->info = ctx.info; + return 0; +} + +#ifdef HAVE_METADATA_PORT_INFO +/** + * ice_vsi_ctx_set_antispoof - set antispoof function in VSI ctx + * @ctx: pointer to VSI ctx structure + */ +void ice_vsi_ctx_set_antispoof(struct ice_vsi_ctx *ctx) +{ + ctx->info.sec_flags |= ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF | + (ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << + ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S); +} + +/** + * ice_vsi_ctx_clear_antispoof - clear antispoof function in VSI ctx + * @ctx: pointer to VSI ctx structure + */ +void ice_vsi_ctx_clear_antispoof(struct ice_vsi_ctx *ctx) +{ + ctx->info.sec_flags &= ~ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF & + ~(ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << + ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S); +} +#endif /* HAVE_METADATA_PORT_INFO */ + +/** + * ice_vsi_ctx_set_allow_override - allow destination override on VSI + * @ctx: pointer to VSI ctx structure + */ +void ice_vsi_ctx_set_allow_override(struct ice_vsi_ctx *ctx) +{ + ctx->info.sec_flags |= ICE_AQ_VSI_SEC_FLAG_ALLOW_DEST_OVRD; +} + +/** + * ice_vsi_ctx_clear_allow_override - turn off destination override on VSI + * @ctx: pointer to VSI ctx structure + */ +void ice_vsi_ctx_clear_allow_override(struct ice_vsi_ctx *ctx) +{ + ctx->info.sec_flags &= ~ICE_AQ_VSI_SEC_FLAG_ALLOW_DEST_OVRD; +} + +/** + * ice_vsi_add_vlan_zero - add VLAN 0 filter(s) for this VSI + * @vsi: VSI used to add VLAN filters + * + * In Single VLAN Mode (SVM), single VLAN filters via ICE_SW_LKUP_VLAN are based + * on the inner VLAN ID, so the VLAN TPID (i.e. 0x8100 or 0x888a8) doesn't + * matter. In Double VLAN Mode (DVM), outer/single VLAN filters via + * ICE_SW_LKUP_VLAN are based on the outer/single VLAN ID + VLAN TPID. + * + * For both modes add a VLAN 0 + no VLAN TPID filter to handle untagged traffic + * when VLAN pruning is enabled. Also, this handles VLAN 0 priority tagged + * traffic in SVM, since the VLAN TPID isn't part of filtering. + * + * If DVM is enabled then an explicit VLAN 0 + VLAN TPID filter needs to be + * added to allow VLAN 0 priority tagged traffic in DVM, since the VLAN TPID is + * part of filtering. + */ +int ice_vsi_add_vlan_zero(struct ice_vsi *vsi) +{ + struct ice_vsi_vlan_ops *vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); + struct ice_vlan vlan; + int err; + + vlan = ICE_VLAN(0, 0, 0, ICE_FWD_TO_VSI); + err = vlan_ops->add_vlan(vsi, &vlan); + if (err && err != -EEXIST) + return err; + + /* in SVM both VLAN 0 filters are identical */ + if (!ice_is_dvm_ena(&vsi->back->hw)) + return 0; + + vlan = ICE_VLAN(ETH_P_8021Q, 0, 0, ICE_FWD_TO_VSI); + err = vlan_ops->add_vlan(vsi, &vlan); + if (err && err != -EEXIST) + return err; + + return 0; +} + +/** + * ice_vsi_del_vlan_zero - delete VLAN 0 filter(s) for this VSI + * @vsi: VSI used to add VLAN filters + * + * Delete the VLAN 0 filters in the same manner that they were added in + * ice_vsi_add_vlan_zero. 
+ */ +int ice_vsi_del_vlan_zero(struct ice_vsi *vsi) +{ + struct ice_vsi_vlan_ops *vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); + struct ice_vlan vlan; + int err; + + vlan = ICE_VLAN(0, 0, 0, ICE_FWD_TO_VSI); + err = vlan_ops->del_vlan(vsi, &vlan); + if (err && err != -EEXIST) + return err; + + /* in SVM both VLAN 0 filters are identical */ + if (!ice_is_dvm_ena(&vsi->back->hw)) + return 0; + + vlan = ICE_VLAN(ETH_P_8021Q, 0, 0, ICE_FWD_TO_VSI); + err = vlan_ops->del_vlan(vsi, &vlan); + if (err && err != -EEXIST) + return err; + + return 0; +} + +/** + * ice_vsi_num_zero_vlans - get number of VLAN 0 filters based on VLAN mode + * @vsi: VSI used to get the VLAN mode + * + * If DVM is enabled then 2 VLAN 0 filters are added, else if SVM is enabled + * then 1 VLAN 0 filter is added. See ice_vsi_add_vlan_zero for more details. + */ +static u16 ice_vsi_num_zero_vlans(struct ice_vsi *vsi) +{ +#define ICE_DVM_NUM_ZERO_VLAN_FLTRS 2 +#define ICE_SVM_NUM_ZERO_VLAN_FLTRS 1 + /* no VLAN 0 filter is created when a port VLAN is active */ + if (vsi->type == ICE_VSI_VF && + ice_vf_is_port_vlan_ena(&vsi->back->vf[vsi->vf_id])) + return 0; + if (ice_is_dvm_ena(&vsi->back->hw)) + return ICE_DVM_NUM_ZERO_VLAN_FLTRS; else - status = ice_remove_mac(&vsi->back->hw, &tmp_add_list); + return ICE_SVM_NUM_ZERO_VLAN_FLTRS; +} + +/** + * ice_vsi_has_non_zero_vlans - check is VSI has any non-zero VLANs + * @vsi: VSI used to determine if any non-zero VLANs have been added + */ +bool ice_vsi_has_non_zero_vlans(struct ice_vsi *vsi) +{ + return (vsi->num_vlan > ice_vsi_num_zero_vlans(vsi)); +} + +/** + * ice_vsi_num_non_zero_vlans - get the number of non-zero VLANs for this VSI + * @vsi: VSI used to get the number of non-zero VLANs added + */ +u16 ice_vsi_num_non_zero_vlans(struct ice_vsi *vsi) +{ + return (vsi->num_vlan - ice_vsi_num_zero_vlans(vsi)); +} + +/** + * ice_is_feature_supported + * @pf: pointer to the struct ice_pf instance + * @f: feature enum to be checked + * + * returns true if feature is supported, false otherwise + */ +bool ice_is_feature_supported(struct ice_pf *pf, enum ice_feature f) +{ + if (f < 0 || f >= ICE_F_MAX) + return false; + + return test_bit(f, pf->features); +} -cfg_mac_fltr_exit: - ice_free_fltr_list(&vsi->back->pdev->dev, &tmp_add_list); - return status; +/** + * ice_set_feature_support + * @pf: pointer to the struct ice_pf instance + * @f: feature enum to set + */ +void ice_set_feature_support(struct ice_pf *pf, enum ice_feature f) +{ + if (f < 0 || f >= ICE_F_MAX) + return; + + set_bit(f, pf->features); +} + +/** + * ice_clear_feature_support + * @pf: pointer to the struct ice_pf instance + * @f: feature enum to clear + */ +void ice_clear_feature_support(struct ice_pf *pf, enum ice_feature f) +{ + if (f < 0 || f >= ICE_F_MAX) + return; + + clear_bit(f, pf->features); +} + +/** + * ice_init_feature_support + * @pf: pointer to the struct ice_pf instance + * + * called during init to setup supported feature + */ +void ice_init_feature_support(struct ice_pf *pf) +{ + switch (pf->hw.device_id) { + case ICE_DEV_ID_E810C_BACKPLANE: + case ICE_DEV_ID_E810C_QSFP: + case ICE_DEV_ID_E810C_SFP: + case ICE_DEV_ID_E810_XXV_BACKPLANE: + case ICE_DEV_ID_E810_XXV_QSFP: + case ICE_DEV_ID_E810_XXV_SFP: + ice_set_feature_support(pf, ICE_F_DSCP); + ice_set_feature_support(pf, ICE_F_PTP_EXTTS); + break; + default: + break; + } } diff --git a/drivers/net/ethernet/intel/ice/ice_lib.h b/drivers/net/ethernet/intel/ice/ice_lib.h index 47bc033fff20..4331d2786d75 100644 --- 
a/drivers/net/ethernet/intel/ice/ice_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_lib.h @@ -1,87 +1,62 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_LIB_H_ #define _ICE_LIB_H_ #include "ice.h" -struct ice_txq_meta { - /* Tx-scheduler element identifier */ - u32 q_teid; - /* Entry in VSI's txq_map bitmap */ - u16 q_id; - /* Relative index of Tx queue within TC */ - u16 q_handle; - /* VSI index that Tx queue belongs to */ - u16 vsi_idx; - /* TC number that Tx queue belongs to */ - u8 tc; -}; +const char *ice_vsi_type_str(enum ice_vsi_type vsi_type); -int -ice_add_mac_to_list(struct ice_vsi *vsi, struct list_head *add_list, - const u8 *macaddr); - -void ice_free_fltr_list(struct device *dev, struct list_head *h); +bool ice_pf_state_is_nominal(struct ice_pf *pf); void ice_update_eth_stats(struct ice_vsi *vsi); +int ice_vsi_cfg_single_rxq(struct ice_vsi *vsi, u16 q_idx); + +int ice_vsi_cfg_single_txq(struct ice_vsi *vsi, struct ice_ring **tx_rings, u16 q_idx); + int ice_vsi_cfg_rxqs(struct ice_vsi *vsi); int ice_vsi_cfg_lan_txqs(struct ice_vsi *vsi); void ice_vsi_cfg_msix(struct ice_vsi *vsi); -#ifdef CONFIG_PCI_IOV -void -ice_cfg_txq_interrupt(struct ice_vsi *vsi, u16 txq, u16 msix_idx, u16 itr_idx); +int ice_vsi_start_all_rx_rings(struct ice_vsi *vsi); -void -ice_cfg_rxq_interrupt(struct ice_vsi *vsi, u16 rxq, u16 msix_idx, u16 itr_idx); +int ice_vsi_stop_all_rx_rings(struct ice_vsi *vsi); int -ice_vsi_stop_tx_ring(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, - u16 rel_vmvf_num, struct ice_ring *ring, - struct ice_txq_meta *txq_meta); - -void ice_fill_txq_meta(struct ice_vsi *vsi, struct ice_ring *ring, - struct ice_txq_meta *txq_meta); - -int ice_vsi_ctrl_rx_ring(struct ice_vsi *vsi, bool ena, u16 rxq_idx); -#endif /* CONFIG_PCI_IOV */ - -int ice_vsi_add_vlan(struct ice_vsi *vsi, u16 vid); - -int ice_vsi_kill_vlan(struct ice_vsi *vsi, u16 vid); +ice_vsi_stop_lan_tx_rings(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, + u16 rel_vmvf_num); +#ifdef HAVE_XDP_SUPPORT -int ice_vsi_manage_vlan_insertion(struct ice_vsi *vsi); +int ice_vsi_cfg_xdp_txqs(struct ice_vsi *vsi); -int ice_vsi_manage_vlan_stripping(struct ice_vsi *vsi, bool ena); +int ice_vsi_stop_xdp_tx_rings(struct ice_vsi *vsi); -int ice_vsi_start_rx_rings(struct ice_vsi *vsi); +#endif /* HAVE_XDP_SUPPORT */ -int ice_vsi_stop_rx_rings(struct ice_vsi *vsi); - -int -ice_vsi_stop_lan_tx_rings(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, - u16 rel_vmvf_num); - -int ice_cfg_vlan_pruning(struct ice_vsi *vsi, bool ena, bool vlan_promisc); +bool ice_vsi_is_vlan_pruning_ena(struct ice_vsi *vsi); void ice_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create); -void ice_vsi_delete(struct ice_vsi *vsi); +int ice_set_link(struct ice_vsi *vsi, bool ena); +void ice_vsi_delete(struct ice_vsi *vsi); int ice_vsi_clear(struct ice_vsi *vsi); +void ice_vsi_put_qs(struct ice_vsi *vsi); + +void ice_vsi_cfg_netdev_tc(struct ice_vsi *vsi, u8 ena_tc); -#ifdef CONFIG_DCB int ice_vsi_cfg_tc(struct ice_vsi *vsi, u8 ena_tc); -#endif /* CONFIG_DCB */ + +int ice_vsi_cfg_rss_lut_key(struct ice_vsi *vsi); struct ice_vsi * ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, - enum ice_vsi_type type, u16 vf_id); + enum ice_vsi_type vsi_type, u16 vf_id, struct ice_channel *ch, + u8 tc); void ice_napi_del(struct ice_vsi *vsi); @@ -89,24 +64,21 @@ int ice_vsi_release(struct ice_vsi *vsi); void ice_vsi_close(struct ice_vsi *vsi); +int 
ice_ena_vsi(struct ice_vsi *vsi, bool locked); + +void ice_dis_vsi(struct ice_vsi *vsi, bool locked); + int ice_free_res(struct ice_res_tracker *res, u16 index, u16 id); +u16 ice_get_valid_res_count(struct ice_res_tracker *res); + int ice_get_res(struct ice_pf *pf, struct ice_res_tracker *res, u16 needed, u16 id); -int ice_vsi_rebuild(struct ice_vsi *vsi); +int ice_vsi_rebuild(struct ice_vsi *vsi, bool init_vsi); bool ice_is_reset_in_progress(unsigned long *state); - -void ice_vsi_free_q_vectors(struct ice_vsi *vsi); - -void ice_trigger_sw_intr(struct ice_hw *hw, struct ice_q_vector *q_vector); - -void ice_vsi_put_qs(struct ice_vsi *vsi); - -#ifdef CONFIG_DCB -void ice_vsi_map_rings_to_vectors(struct ice_vsi *vsi); -#endif /* CONFIG_DCB */ +int ice_wait_for_reset(struct ice_pf *pf, unsigned long timeout); void ice_vsi_dis_irq(struct ice_vsi *vsi); @@ -116,14 +88,54 @@ void ice_vsi_free_rx_rings(struct ice_vsi *vsi); void ice_vsi_free_tx_rings(struct ice_vsi *vsi); -int ice_vsi_manage_rss_lut(struct ice_vsi *vsi, bool ena); +void ice_vsi_manage_rss_lut(struct ice_vsi *vsi, bool ena); + +void ice_update_tx_ring_stats(struct ice_ring *ring, u64 pkts, u64 bytes); -u32 ice_intrl_usec_to_reg(u8 intrl, u8 gran); +void ice_update_rx_ring_stats(struct ice_ring *ring, u64 pkts, u64 bytes); -char *ice_nvm_version_str(struct ice_hw *hw); +void ice_vsi_cfg_frame_size(struct ice_vsi *vsi); + +int ice_status_to_errno(enum ice_status err); + +void +ice_write_qrxflxp_cntxt(struct ice_hw *hw, u16 pf_q, u32 rxdid, u32 prio, + bool __maybe_unused ena_ts); + +#ifdef HAVE_NETPOLL_CONTROLLER +irqreturn_t ice_msix_clean_rings(int __always_unused irq, void *data); +#endif /* HAVE_NETPOLL_CONTROLLER */ + +void ice_write_intrl(struct ice_q_vector *q_vector, u8 intrl); +void ice_write_itr(struct ice_ring_container *rc, u16 itr); +void ice_set_q_vector_intrl(struct ice_q_vector *q_vector); enum ice_status ice_vsi_cfg_mac_fltr(struct ice_vsi *vsi, const u8 *macaddr, bool set); - bool ice_is_safe_mode(struct ice_pf *pf); +bool ice_is_peer_ena(struct ice_pf *pf); +bool ice_is_dflt_vsi_in_use(struct ice_sw *sw); +bool ice_is_vsi_dflt_vsi(struct ice_sw *sw, struct ice_vsi *vsi); +int ice_set_dflt_vsi(struct ice_sw *sw, struct ice_vsi *vsi); +int ice_clear_dflt_vsi(struct ice_sw *sw); +int ice_set_min_bw_limit(struct ice_vsi *vsi, u64 min_tx_rate); +int ice_set_max_bw_limit(struct ice_vsi *vsi, u64 max_tx_rate); +int ice_get_link_speed_kbps(struct ice_vsi *vsi); +int ice_get_link_speed_mbps(struct ice_vsi *vsi); +int ice_vsi_update_security(struct ice_vsi *vsi, + void (*fill)(struct ice_vsi_ctx *)); +#ifdef HAVE_METADATA_PORT_INFO +void ice_vsi_ctx_set_antispoof(struct ice_vsi_ctx *ctx); +void ice_vsi_ctx_clear_antispoof(struct ice_vsi_ctx *ctx); +#endif /* HAVE_METADATA_PORT_INFO */ +void ice_vsi_ctx_set_allow_override(struct ice_vsi_ctx *ctx); +void ice_vsi_ctx_clear_allow_override(struct ice_vsi_ctx *ctx); +int ice_vsi_add_vlan_zero(struct ice_vsi *vsi); +int ice_vsi_del_vlan_zero(struct ice_vsi *vsi); +bool ice_vsi_has_non_zero_vlans(struct ice_vsi *vsi); +u16 ice_vsi_num_non_zero_vlans(struct ice_vsi *vsi); +bool ice_is_feature_supported(struct ice_pf *pf, enum ice_feature f); +void ice_set_feature_support(struct ice_pf *pf, enum ice_feature f); +void ice_clear_feature_support(struct ice_pf *pf, enum ice_feature f); +void ice_init_feature_support(struct ice_pf *pf); #endif /* !_ICE_LIB_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 
d0ccb7ad447b..7151053ca28c 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -1,34 +1,57 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ /* Intel(R) Ethernet Connection E800 Series Linux Driver */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "ice.h" +#include "ice_base.h" #include "ice_lib.h" +#include "ice_fltr.h" #include "ice_dcb_lib.h" - -#define DRV_VERSION_MAJOR 0 -#define DRV_VERSION_MINOR 8 -#define DRV_VERSION_BUILD 1 - -#define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \ - __stringify(DRV_VERSION_MINOR) "." \ - __stringify(DRV_VERSION_BUILD) "-k" +#include "ice_dcb_nl.h" +#include "ice_devlink.h" +#include "ice_eswitch.h" +/* Including ice_trace.h with CREATE_TRACE_POINTS defined will generate the + * ice tracepoint functions. This must be done exactly once across the + * ice driver. + */ +#define CREATE_TRACE_POINTS +#include "ice_trace.h" +#undef CREATE_TRACE_POINTS +#include "ice_tc_lib.h" +#include "ice_vsi_vlan_ops.h" +#include "ice_fwlog.h" + +#define DRV_VERSION_MAJOR 1 +#define DRV_VERSION_MINOR 6 +#define DRV_VERSION_BUILD 7 + +#define DRV_VERSION "1.6.7.1.1" #define DRV_SUMMARY "Intel(R) Ethernet Connection E800 Series Linux Driver" -const char ice_drv_ver[] = DRV_VERSION; +#ifdef ICE_ADD_PROBES +#define DRV_VERSION_EXTRA "_probes" +#else +#define DRV_VERSION_EXTRA "" +#endif /* ICE_ADD_PROBES */ + +const char ice_drv_ver[] = DRV_VERSION DRV_VERSION_EXTRA; static const char ice_driver_string[] = DRV_SUMMARY; -static const char ice_copyright[] = "Copyright (c) 2018, Intel Corporation."; +static const char ice_copyright[] = "Copyright (C) 2018-2021, Intel Corporation."; /* DDP Package file located in firmware search paths (e.g. /lib/firmware/) */ +#if UTS_UBUNTU_RELEASE_ABI +#define ICE_DDP_PKG_PATH "updates/intel/ice/ddp/" +#else /* UTS_UBUNTU_RELEASE_ABI */ #define ICE_DDP_PKG_PATH "intel/ice/ddp/" +#endif /* UTS_UBUNTU_RELEASE_ABI */ #define ICE_DDP_PKG_FILE ICE_DDP_PKG_PATH "ice.pkg" MODULE_AUTHOR("Intel Corporation, "); MODULE_DESCRIPTION(DRV_SUMMARY); MODULE_LICENSE("GPL v2"); -MODULE_VERSION(DRV_VERSION); +MODULE_VERSION(DRV_VERSION DRV_VERSION_EXTRA); MODULE_FIRMWARE(ICE_DDP_PKG_FILE); static int debug = -1; @@ -39,14 +62,323 @@ MODULE_PARM_DESC(debug, "netif level (0=none,...,16=all), hw debug_mask (0x8XXXX MODULE_PARM_DESC(debug, "netif level (0=none,...,16=all)"); #endif /* !CONFIG_DYNAMIC_DEBUG */ + + +static ushort fwlog_level = ICE_FWLOG_LEVEL_NONE; +module_param(fwlog_level, ushort, 0644); +MODULE_PARM_DESC(fwlog_level, "FW event level to log. All levels <= to the specified value are enabled. Values: 0=none, 1=error, 2=warning, 3=normal, 4=verbose. 
Invalid values: >=5\n"); +/* Apply a bitmask to control which categories of FW logging is recorded + * 00000001 - General/Minor + * 00000002 - Control (Resets/Autoload) + * 00000004 - Link Management + * 00000008 - Link Topology Detection + * 00000010 - Dreadnought Lake + * 00000020 - I2C + * 00000040 - SDP + * 00000080 - MDIO + * 00000100 - Admin Queue + * 00000200 - HDMA + * 00000400 - LLDP + * 00000800 - DCBx + * 00001000 - DCB + * 00002000 - XLR + * 00004000 - NVM + * 00008000 - Authentication + * 00010000 - VPD + * 00020000 - IOSF + * 00040000 - Parser + * 00080000 - Switch + * 00100000 - Scheduler + * 00200000 - TX Queue Management + * 00400000 - ACL + * 00800000 - Post + * 01000000 - Watchdog + * 02000000 - Task Dispatcher + * 04000000 - Manageability + * 08000000 - Synce + * 10000000 - Health + * 20000000 - TimeSync + * 40000000 - PF Registration + * 80000000 - Module Version + */ +static unsigned long fwlog_events; /* no enabled events by default */ +module_param(fwlog_events, ulong, 0644); +MODULE_PARM_DESC(fwlog_events, "FW events to log (32-bit mask)\n"); + static struct workqueue_struct *ice_wq; + +static const struct net_device_ops ice_netdev_recovery_ops; static const struct net_device_ops ice_netdev_safe_mode_ops; static const struct net_device_ops ice_netdev_ops; - static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type); +static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type); static void ice_vsi_release_all(struct ice_pf *pf); +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +static int ice_rebuild_channels(struct ice_pf *pf); +static void ice_remove_q_channels(struct ice_vsi *vsi, bool rem_adv_fltr); +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + + +bool netif_is_ice(struct net_device *dev) +{ + return dev && (dev->netdev_ops == &ice_netdev_ops); +} + +#ifdef HAVE_NETDEV_SB_DEV +static void ice_deinit_macvlan(struct ice_vsi *vsi); + +#endif /* HAVE_NETDEV_SB_DEV */ +#ifdef HAVE_TC_INDIR_BLOCK +#if defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_FLOW_INDIR_BLOCK_QDISC) +static int +ice_indr_setup_tc_cb(struct net_device *netdev, struct Qdisc *sch, + void *cb_priv, enum tc_setup_type type, void *type_data, + void *data, + void (*cleanup)(struct flow_block_cb *block_cb)); +#elif defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP) +static int +ice_indr_setup_tc_cb(struct net_device *netdev, void *cb_priv, + enum tc_setup_type type, void *type_data, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)); +#elif defined(HAVE_TC_FLOW_INDIR_DEV) +static int ice_indr_setup_tc_cb(struct net_device *netdev, void *cb_priv, + enum tc_setup_type type, void *type_data); +static int ice_indr_setup_block_cb(enum tc_setup_type type, void *type_data, + void *indr_priv); +#else /* !HAVE_TC_FLOW_INDIR_DEV */ +static int +ice_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr); +static void ice_indr_clean_block_privs(struct ice_netdev_priv *np); +#endif /* HAVE_TC_INDIR_BLOCK */ +#endif +/** + * ice_chnl_subtask_handle_interrupt - if needed, trigger SW interrupt on + * channel enabled vector + * @pf: pointer to PF struct + * + * This function process all channel enabled vectors and based on jiffy + * and delta between jiffies, decides and triggers software initiated + * interrupt on each of such vectors. Logic used: + * if on given vector jiffies delta is greated than 1 second and old + * snapshot of jiffies is valid, then trigger software interrupt. 
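The channel subtask described here fires a software interrupt on a vector once more than a second's worth of jiffies has passed since the vector was last serviced from busy-poll, using time_is_before_jiffies() so the check stays correct across jiffies wrap-around. The same signed-difference idiom in a standalone form, with a plain counter standing in for jiffies and 1000 ticks standing in for 1 * HZ:

#include <stdbool.h>

/* Wrap-safe "has this point in time passed?" check: the signed-difference
 * idiom behind the kernel's time_after()/time_is_before_jiffies() macros.
 */
static bool time_passed(unsigned long now, unsigned long stamp)
{
	return (long)(now - stamp) > 0;
}

/* Decision made by the subtask, with 1000 ticks standing in for 1 * HZ. */
static bool should_fire_sw_intr(unsigned long now, unsigned long last_bp_jiffy)
{
	const unsigned long one_second = 1000;

	if (!last_bp_jiffy)           /* vector never serviced via busy-poll */
		return false;
	return time_passed(now, last_bp_jiffy + one_second);
}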
+ * Jiffies snapshot is stored/updated in vector whenever vector + * is serviced through busy-poll. + */ +static void ice_chnl_subtask_handle_interrupt(struct ice_pf *pf) +{ + struct ice_vsi *vsi; + unsigned long end; + unsigned int i; + + vsi = ice_get_main_vsi(pf); + if (!vsi || test_bit(ICE_VSI_DOWN, vsi->state)) + return; + + if (!(vsi->netdev && netif_carrier_ok(vsi->netdev))) + return; + + for (i = 0; i < vsi->num_txq; i++) { + struct ice_ring *tx_ring = vsi->tx_rings[i]; + struct ice_ring *rx_ring = vsi->rx_rings[i]; + struct ice_q_vector *q_vector; + + if (!(tx_ring && tx_ring->desc && rx_ring)) + continue; + q_vector = tx_ring->q_vector; + if (!q_vector || !ice_vector_ch_enabled(q_vector)) + continue; + + end = tx_ring->q_vector->jiffy; + if (!end) + continue; + + /* trigger software interrupt (to revive queue processing) if + * vector is channel enabled and only if current jiffies is at + * least 1 sec (worth of jiffies, hence multiplying by HZ) more + * than old_jiffies + */ +#define ICE_JIFFY_DELTA_IN_SEC (1 * HZ) + end += ICE_JIFFY_DELTA_IN_SEC; + if (time_is_before_jiffies(end) && + (q_vector->state_flags & ICE_CHNL_ONCE_IN_BP)) { +#ifdef ADQ_PERF_COUNTERS + ice_sw_intr_cntr(q_vector, false); +#endif /* ADQ_PERF_COUNTERS */ + ice_adq_trigger_sw_intr(&pf->hw, q_vector); + } + } +} + +/** + * ice_flush_vsi_fd_fltrs - flush VSI specific FD entries + * @vsi: ptr to VSI + * + * This function flushes all FD entries specific to VSI from + * HW FD table + */ +static inline void ice_flush_vsi_fd_fltrs(struct ice_vsi *vsi) +{ + struct device *dev = ice_pf_to_dev(vsi->back); + enum ice_status status; + + status = ice_clear_vsi_fd_table(&vsi->back->hw, vsi->vsi_num); + if (status) + dev_err(dev, "Failed to clear FD table for %s, vsi_num: %u, status: %s\n", + ice_vsi_type_str(vsi->type), vsi->vsi_num, + ice_stat_str(status)); +} + +/** + * ice_chnl_handle_fd_transition - handle VSI specific FD transition + * @main_vsi: ptr to main VSI (ICE_VSI_PF) + * @ch: ptr to channel + * @hw_fd_cnt: HW FD count specific to VSI + * @fd_pkt_cnt: packets services thru' inline-FD filter + * @sw_fd_cnt: SW tracking, number of inline-FD filtere were programmed per VSI + * + * This function determines whether given VSI should continue to use inline-FD + * resources or not and sets the bit accordingly. It also flushes the FD entries + * occupied per VSI if detected table full condition 'n' times and no more + * packets serviced thru' inline-FD filter + */ +static void +ice_chnl_handle_fd_transition(struct ice_vsi *main_vsi, struct ice_channel *ch, + u32 hw_fd_cnt, u64 fd_pkt_cnt, int sw_fd_cnt) +{ + struct ice_vsi *vsi; + + if (!ch || !main_vsi) + return; + + vsi = ch->ch_vsi; + if (!vsi) + return; + + /* did we reach table full condition and no activity w.r.t + * inline-FD filter being hit in HW table during last 'n' runs of + * service task, then it is safe to "drop HW table entries" + */ + /* check to see if given VSI reached max limit of FD entries */ + if (ice_is_vsi_fd_table_full(vsi, hw_fd_cnt)) { + /* check to see if there are any hits using inline-FD filters, + * if not start "table_full" counter + */ + if (!ch->fd_pkt_cnt && !fd_pkt_cnt && + ch->fd_pkt_cnt == fd_pkt_cnt) { + /* HW table is FULL: and no more packets being serviced + * thru' inline-FD filters (by looking at the current + * and prev packets serviced). 
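Stripped of the driver plumbing, the table-full handling in ice_chnl_handle_fd_transition() below is a hysteresis counter: it only advances while the hardware flow-director table stays full and the inline filters are not being hit, and only after several consecutive service-task runs in that state does the driver flush the table. The model here captures that shape; the threshold value is illustrative (the driver uses ICE_TBL_FULL_TIMES, defined elsewhere), and the "no new hits" test is simplified to a packet-count comparison.

#include <stdbool.h>
#include <stdint.h>

#define DEMO_TBL_FULL_TIMES 4   /* illustrative; see ICE_TBL_FULL_TIMES */

struct demo_ch {
	uint32_t cnt_tbl_full;     /* consecutive "full and idle" observations */
	uint64_t prev_pkt_cnt;     /* packets matched by inline-FD last run */
};

/* Returns true when the stale hardware entries should be flushed. */
static bool fd_should_flush(struct demo_ch *ch, bool table_full,
			    uint64_t cur_pkt_cnt)
{
	if (!table_full) {
		ch->cnt_tbl_full = 0;
		ch->prev_pkt_cnt = cur_pkt_cnt;
		return false;
	}

	if (cur_pkt_cnt == ch->prev_pkt_cnt)
		ch->cnt_tbl_full++;        /* full, but no filter is being hit */
	else
		ch->cnt_tbl_full = 0;      /* filters still active, keep them */

	ch->prev_pkt_cnt = cur_pkt_cnt;

	if (ch->cnt_tbl_full < DEMO_TBL_FULL_TIMES)
		return false;

	ch->cnt_tbl_full = 0;
	return true;                       /* safe to purge stale entries */
}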
+ * Logic to see if that current and prev packet count + * is changing or not, if not changing means, + * it is safe to assume that even though there are + * inline-FD filters exist in HW table, but flows + * associated with those filters are ended via. + * RST code path + */ + vsi->cnt_tbl_full++; + main_vsi->cnt_tbl_full++; + } else { + vsi->cnt_tbl_full = 0; + } + + /* detected that HW table remained full during + * last 'n' times, now it is the time to purge + * HW table entries. + * detected that HW FD table full condition + * based on SW counter based hueristics, + * give around 4 second to be in same condition + * otherwise proceed with purging HW table + * entries + */ + if (vsi->cnt_tbl_full < ICE_TBL_FULL_TIMES) + return; + + /* if we are here, then safe to flush HW inline-FD filters */ + ice_flush_vsi_fd_fltrs(vsi); + /* stats to keep track, how many times HW table is flushed */ + vsi->cnt_table_flushed++; + main_vsi->cnt_table_flushed++; + + /* reset VSI specific counters */ + atomic_set(&vsi->inline_fd_active_cnt, 0); + vsi->cnt_tbl_full = 0; + /* clear the feature flag for inline-FD/RSS */ + clear_bit(ICE_SWITCH_TO_RSS, vsi->adv_state); + } else if ((u32)sw_fd_cnt > hw_fd_cnt) { + /* HW table (inline-FD filters) is not full and SW count is + * higher than actual entries in HW table, time to sync SW + * counter with HW counter (tracking inline-FD filter count) + * and transition back to using inline-FD filters + */ + atomic_set(&vsi->inline_fd_active_cnt, hw_fd_cnt); + vsi->cnt_tbl_full = 0; + /* stats to keep track, how many times transitioned into + * inline-FD from RSS + */ + vsi->cnt_inline_fd_transition++; + main_vsi->cnt_inline_fd_transition++; + /* clear the feature flag for inline-FD/RSS */ + clear_bit(ICE_SWITCH_TO_RSS, vsi->adv_state); + } +} + +/** + * ice_channel_sync_global_cntrs - sync SW and HW FD specific counters + * @pf: ptr to PF + * + * This function iterates thru' all channel VSIs and handles transition of + * FD (Flow-director) -> RSS and vice versa, if needed also flushes VSI + * specific FD entries from HW table + */ +static void ice_channel_sync_global_cntrs(struct ice_pf *pf) +{ + struct ice_vsi *main_vsi; + struct ice_channel *ch; + + main_vsi = ice_get_main_vsi(pf); + if (!main_vsi) + return; + + list_for_each_entry(ch, &main_vsi->ch_list, list) { + struct ice_vsi *ch_vsi; + u64 fd_pkt_cnt; + int sw_fd_cnt; + u32 hw_fd_cnt; + + ch_vsi = ch->ch_vsi; + if (!ch_vsi) + continue; + if (!ice_vsi_fd_ena(ch_vsi)) + continue; + /* bailout if SWITCH_TO_RSS is not set */ + if (!test_bit(ICE_SWITCH_TO_RSS, ch_vsi->adv_state)) + continue; + /* first counter index is always taken by sideband flow + * director, hence channel specific counter index has + * to be non-zero, otherwise skip... 
+ */ + if (!ch->fd_cnt_index) + continue; + + /* read SW count */ + sw_fd_cnt = atomic_read(&ch_vsi->inline_fd_active_cnt); + /* Read HW count */ + hw_fd_cnt = ice_get_current_fd_cnt(ch_vsi); + /* Read the HW counter which was associated with inline-FD */ + fd_pkt_cnt = ice_read_cntr(pf, ch->fd_cnt_index); + + /* handle VSI specific transition: inline-FD/RSS + * if needed flush FD entries specific to VSI + */ + ice_chnl_handle_fd_transition(main_vsi, ch, hw_fd_cnt, + fd_pkt_cnt, sw_fd_cnt); + /* store the value of fd_pkt_cnt per channel */ + ch->fd_pkt_cnt = fd_pkt_cnt; + } +} + /** * ice_get_tx_pending - returns number of Tx descriptors not processed * @ring: the ring of descriptors @@ -82,7 +414,7 @@ static void ice_check_for_hang_subtask(struct ice_pf *pf) break; } - if (!vsi || test_bit(__ICE_DOWN, vsi->state)) + if (!vsi || test_bit(ICE_VSI_DOWN, vsi->state)) return; if (!(vsi->netdev && netif_carrier_ok(vsi->netdev))) @@ -93,7 +425,12 @@ static void ice_check_for_hang_subtask(struct ice_pf *pf) for (i = 0; i < vsi->num_txq; i++) { struct ice_ring *tx_ring = vsi->tx_rings[i]; - if (tx_ring && tx_ring->desc) { + if (!tx_ring) + continue; + if (ice_ring_ch_enabled(tx_ring)) + continue; + + if (tx_ring->desc) { /* If packet counter has not changed the queue is * likely stalled, so force an interrupt for this * queue. @@ -129,45 +466,19 @@ static void ice_check_for_hang_subtask(struct ice_pf *pf) static int ice_init_mac_fltr(struct ice_pf *pf) { enum ice_status status; - u8 broadcast[ETH_ALEN]; struct ice_vsi *vsi; + u8 *perm_addr; vsi = ice_get_main_vsi(pf); if (!vsi) return -EINVAL; - /* To add a MAC filter, first add the MAC to a list and then - * pass the list to ice_add_mac. - */ - - /* Add a unicast MAC filter so the VSI can get its packets */ - status = ice_vsi_cfg_mac_fltr(vsi, vsi->port_info->mac.perm_addr, true); + perm_addr = vsi->port_info->mac.perm_addr; + status = ice_fltr_add_mac_and_broadcast(vsi, perm_addr, ICE_FWD_TO_VSI); if (status) - goto unregister; - - /* VSI needs to receive broadcast traffic, so add the broadcast - * MAC address to the list as well. - */ - eth_broadcast_addr(broadcast); - status = ice_vsi_cfg_mac_fltr(vsi, broadcast, true); - if (status) - goto unregister; + return -EIO; return 0; -unregister: - /* We aren't useful with no MAC filters, so unregister if we - * had an error - */ - if (status && vsi->netdev->reg_state == NETREG_REGISTERED) { - dev_err(&pf->pdev->dev, - "Could not add MAC filters error %d. 
Unregistering device\n", - status); - unregister_netdev(vsi->netdev); - free_netdev(vsi->netdev); - vsi->netdev = NULL; - } - - return -EIO; } /** @@ -185,7 +496,8 @@ static int ice_add_mac_to_sync_list(struct net_device *netdev, const u8 *addr) struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; - if (ice_add_mac_to_list(vsi, &vsi->tmp_sync_list, addr)) + if (ice_fltr_add_mac_to_list(vsi, &vsi->tmp_sync_list, addr, + ICE_FWD_TO_VSI)) return -EINVAL; return 0; @@ -206,7 +518,8 @@ static int ice_add_mac_to_unsync_list(struct net_device *netdev, const u8 *addr) struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; - if (ice_add_mac_to_list(vsi, &vsi->tmp_unsync_list, addr)) + if (ice_fltr_add_mac_to_list(vsi, &vsi->tmp_unsync_list, addr, + ICE_FWD_TO_VSI)) return -EINVAL; return 0; @@ -220,37 +533,54 @@ static int ice_add_mac_to_unsync_list(struct net_device *netdev, const u8 *addr) */ static bool ice_vsi_fltr_changed(struct ice_vsi *vsi) { - return test_bit(ICE_VSI_FLAG_UMAC_FLTR_CHANGED, vsi->flags) || - test_bit(ICE_VSI_FLAG_MMAC_FLTR_CHANGED, vsi->flags) || - test_bit(ICE_VSI_FLAG_VLAN_FLTR_CHANGED, vsi->flags); + return test_bit(ICE_VSI_UMAC_FLTR_CHANGED, vsi->state) || + test_bit(ICE_VSI_MMAC_FLTR_CHANGED, vsi->state) || + test_bit(ICE_VSI_VLAN_FLTR_CHANGED, vsi->state); } /** - * ice_cfg_promisc - Enable or disable promiscuous mode for a given PF + * ice_set_promisc - Enable promiscuous mode for a given PF * @vsi: the VSI being configured * @promisc_m: mask of promiscuous config bits - * @set_promisc: enable or disable promisc flag request * */ -static int ice_cfg_promisc(struct ice_vsi *vsi, u8 promisc_m, bool set_promisc) +static int ice_set_promisc(struct ice_vsi *vsi, u8 promisc_m) { - struct ice_hw *hw = &vsi->back->hw; - enum ice_status status = 0; + enum ice_status status; if (vsi->type != ICE_VSI_PF) return 0; - if (vsi->vlan_ena) { - status = ice_set_vlan_vsi_promisc(hw, vsi->idx, promisc_m, - set_promisc); - } else { - if (set_promisc) - status = ice_set_vsi_promisc(hw, vsi->idx, promisc_m, - 0); - else - status = ice_clear_vsi_promisc(hw, vsi->idx, promisc_m, - 0); - } + if (ice_vsi_has_non_zero_vlans(vsi)) + status = ice_fltr_set_vlan_vsi_promisc(&vsi->back->hw, vsi, promisc_m); + else + status = ice_fltr_set_vsi_promisc(&vsi->back->hw, vsi->idx, promisc_m, 0, + vsi->port_info->lport); + + if (status) + return -EIO; + + return 0; +} + +/** + * ice_clear_promisc - Disable promiscuous mode for a given PF + * @vsi: the VSI being configured + * @promisc_m: mask of promiscuous config bits + * + */ +static int ice_clear_promisc(struct ice_vsi *vsi, u8 promisc_m) +{ + enum ice_status status; + + if (vsi->type != ICE_VSI_PF) + return 0; + + if (ice_vsi_has_non_zero_vlans(vsi)) + status = ice_fltr_clear_vlan_vsi_promisc(&vsi->back->hw, vsi, promisc_m); + else + status = ice_fltr_clear_vsi_promisc(&vsi->back->hw, vsi->idx, promisc_m, 0, + vsi->port_info->lport); if (status) return -EIO; @@ -258,6 +588,7 @@ static int ice_cfg_promisc(struct ice_vsi *vsi, u8 promisc_m, bool set_promisc) return 0; } + /** * ice_vsi_sync_fltr - Update the VSI filter list to the HW * @vsi: ptr to the VSI @@ -266,7 +597,8 @@ static int ice_cfg_promisc(struct ice_vsi *vsi, u8 promisc_m, bool set_promisc) */ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) { - struct device *dev = &vsi->back->pdev->dev; + struct ice_vsi_vlan_ops *vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); + struct device *dev = ice_pf_to_dev(vsi->back); struct net_device *netdev = 
vsi->netdev; bool promisc_forced_on = false; struct ice_pf *pf = vsi->back; @@ -279,7 +611,7 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) if (!vsi->netdev) return -EINVAL; - while (test_and_set_bit(__ICE_CFG_BUSY, vsi->state)) + while (test_and_set_bit(ICE_CFG_BUSY, vsi->state)) usleep_range(1000, 2000); changed_flags = vsi->current_netdev_flags ^ vsi->netdev->flags; @@ -289,9 +621,9 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) INIT_LIST_HEAD(&vsi->tmp_unsync_list); if (ice_vsi_fltr_changed(vsi)) { - clear_bit(ICE_VSI_FLAG_UMAC_FLTR_CHANGED, vsi->flags); - clear_bit(ICE_VSI_FLAG_MMAC_FLTR_CHANGED, vsi->flags); - clear_bit(ICE_VSI_FLAG_VLAN_FLTR_CHANGED, vsi->flags); + clear_bit(ICE_VSI_UMAC_FLTR_CHANGED, vsi->state); + clear_bit(ICE_VSI_MMAC_FLTR_CHANGED, vsi->state); + clear_bit(ICE_VSI_VLAN_FLTR_CHANGED, vsi->state); /* grab the netdev's addr_list_lock */ netif_addr_lock_bh(netdev); @@ -304,8 +636,8 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) } /* Remove MAC addresses in the unsync list */ - status = ice_remove_mac(hw, &vsi->tmp_unsync_list); - ice_free_fltr_list(dev, &vsi->tmp_unsync_list); + status = ice_fltr_remove_mac_list(vsi, &vsi->tmp_unsync_list); + ice_fltr_free_list(dev, &vsi->tmp_unsync_list); if (status) { netdev_err(netdev, "Failed to delete MAC filters\n"); /* if we failed because of alloc failures, just bail */ @@ -316,8 +648,8 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) } /* Add MAC addresses in the sync list */ - status = ice_add_mac(hw, &vsi->tmp_sync_list); - ice_free_fltr_list(dev, &vsi->tmp_sync_list); + status = ice_fltr_add_mac_list(vsi, &vsi->tmp_sync_list); + ice_fltr_free_list(dev, &vsi->tmp_sync_list); /* If filter is added successfully or already exists, do not go into * 'if' condition and report it as error. Instead continue processing * rest of the function. @@ -329,11 +661,10 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) * space reserved for promiscuous filters. 
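The sync path in this hunk tolerates exactly one filter-add failure: when the admin queue reports ICE_AQ_RC_ENOSPC there is still room reserved for promiscuous filters, so the driver forces promiscuous mode on the VSI and warns instead of failing, as the check just below shows. A minimal model of that fallback, with the filter programming reduced to a stub that always reports the out-of-space case:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

enum aq_rc { AQ_RC_OK, AQ_RC_ENOSPC, AQ_RC_OTHER };

/* Stub standing in for ice_fltr_add_mac_list() plus the adminq status. */
static enum aq_rc add_mac_filters(void) { return AQ_RC_ENOSPC; }

static int sync_uc_filters(bool *promisc_forced_on)
{
	switch (add_mac_filters()) {
	case AQ_RC_OK:
		return 0;
	case AQ_RC_ENOSPC:
		/* out of exact-match slots: fall back to promiscuous mode
		 * rather than dropping traffic for the new address
		 */
		if (!*promisc_forced_on) {
			*promisc_forced_on = true;
			fprintf(stderr, "MAC filter limit reached, forcing promisc\n");
		}
		return 0;
	default:
		return -EIO;
	}
}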
*/ if (hw->adminq.sq_last_status == ICE_AQ_RC_ENOSPC && - !test_and_set_bit(__ICE_FLTR_OVERFLOW_PROMISC, + !test_and_set_bit(ICE_FLTR_OVERFLOW_PROMISC, vsi->state)) { promisc_forced_on = true; - netdev_warn(netdev, - "Reached MAC filter limit, forcing promisc mode on VSI %d\n", + netdev_warn(netdev, "Reached MAC filter limit, forcing promisc mode on VSI %d\n", vsi->vsi_num); } else { err = -EIO; @@ -343,25 +674,26 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) /* check for changes in promiscuous modes */ if (changed_flags & IFF_ALLMULTI) { if (vsi->current_netdev_flags & IFF_ALLMULTI) { - if (vsi->vlan_ena) + if (ice_vsi_has_non_zero_vlans(vsi)) promisc_m = ICE_MCAST_VLAN_PROMISC_BITS; else promisc_m = ICE_MCAST_PROMISC_BITS; - err = ice_cfg_promisc(vsi, promisc_m, true); + err = ice_set_promisc(vsi, promisc_m); if (err) { netdev_err(netdev, "Error setting Multicast promiscuous mode on VSI %i\n", vsi->vsi_num); vsi->current_netdev_flags &= ~IFF_ALLMULTI; goto out_promisc; } - } else if (!(vsi->current_netdev_flags & IFF_ALLMULTI)) { - if (vsi->vlan_ena) + } else { + /* !(vsi->current_netdev_flags & IFF_ALLMULTI) */ + if (ice_vsi_has_non_zero_vlans(vsi)) promisc_m = ICE_MCAST_VLAN_PROMISC_BITS; else promisc_m = ICE_MCAST_PROMISC_BITS; - err = ice_cfg_promisc(vsi, promisc_m, false); + err = ice_clear_promisc(vsi, promisc_m); if (err) { netdev_err(netdev, "Error clearing Multicast promiscuous mode on VSI %i\n", vsi->vsi_num); @@ -372,43 +704,49 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) } if (((changed_flags & IFF_PROMISC) || promisc_forced_on) || - test_bit(ICE_VSI_FLAG_PROMISC_CHANGED, vsi->flags)) { - clear_bit(ICE_VSI_FLAG_PROMISC_CHANGED, vsi->flags); + test_bit(ICE_VSI_PROMISC_CHANGED, vsi->state)) { + clear_bit(ICE_VSI_PROMISC_CHANGED, vsi->state); if (vsi->current_netdev_flags & IFF_PROMISC) { /* Apply Rx filter rule to get traffic from wire */ - status = ice_cfg_dflt_vsi(hw, vsi->idx, true, - ICE_FLTR_RX); - if (status) { - netdev_err(netdev, "Error setting default VSI %i Rx rule\n", - vsi->vsi_num); - vsi->current_netdev_flags &= ~IFF_PROMISC; - err = -EIO; - goto out_promisc; + if (!ice_is_dflt_vsi_in_use(pf->first_sw)) { + err = ice_set_dflt_vsi(pf->first_sw, vsi); + if (err && err != -EEXIST) { + netdev_err(netdev, "Error %d setting default VSI %i Rx rule\n", + err, vsi->vsi_num); + vsi->current_netdev_flags &= + ~IFF_PROMISC; + goto out_promisc; + } + vlan_ops->dis_rx_filtering(vsi); } } else { /* Clear Rx filter to remove traffic from wire */ - status = ice_cfg_dflt_vsi(hw, vsi->idx, false, - ICE_FLTR_RX); - if (status) { - netdev_err(netdev, "Error clearing default VSI %i Rx rule\n", - vsi->vsi_num); - vsi->current_netdev_flags |= IFF_PROMISC; - err = -EIO; - goto out_promisc; + if (ice_is_vsi_dflt_vsi(pf->first_sw, vsi)) { + err = ice_clear_dflt_vsi(pf->first_sw); + if (err) { + netdev_err(netdev, "Error %d clearing default VSI %i Rx rule\n", + err, vsi->vsi_num); + vsi->current_netdev_flags |= + IFF_PROMISC; + goto out_promisc; + } + if (vsi->current_netdev_flags & + NETIF_F_HW_VLAN_CTAG_FILTER) + vlan_ops->ena_rx_filtering(vsi); } } } goto exit; out_promisc: - set_bit(ICE_VSI_FLAG_PROMISC_CHANGED, vsi->flags); + set_bit(ICE_VSI_PROMISC_CHANGED, vsi->state); goto exit; out: /* if something went wrong then set the changed flag so we try again */ - set_bit(ICE_VSI_FLAG_UMAC_FLTR_CHANGED, vsi->flags); - set_bit(ICE_VSI_FLAG_MMAC_FLTR_CHANGED, vsi->flags); + set_bit(ICE_VSI_UMAC_FLTR_CHANGED, vsi->state); + set_bit(ICE_VSI_MMAC_FLTR_CHANGED, vsi->state); exit: - 
clear_bit(__ICE_CFG_BUSY, vsi->state); + clear_bit(ICE_CFG_BUSY, vsi->state); return err; } @@ -435,64 +773,119 @@ static void ice_sync_fltr_subtask(struct ice_pf *pf) } /** - * ice_dis_vsi - pause a VSI - * @vsi: the VSI being paused + * ice_pf_dis_all_vsi - Pause all VSIs on a PF + * @pf: the PF * @locked: is the rtnl_lock already held */ -static void ice_dis_vsi(struct ice_vsi *vsi, bool locked) +static void ice_pf_dis_all_vsi(struct ice_pf *pf, bool locked) { - if (test_bit(__ICE_DOWN, vsi->state)) - return; + int node; + int v; - set_bit(__ICE_NEEDS_RESTART, vsi->state); + ice_for_each_vsi(pf, v) + if (pf->vsi[v]) + ice_dis_vsi(pf->vsi[v], locked); - if (vsi->type == ICE_VSI_PF && vsi->netdev) { - if (netif_running(vsi->netdev)) { - if (!locked) - rtnl_lock(); + for (node = 0; node < ICE_MAX_PF_AGG_NODES; node++) + pf->pf_agg_node[node].num_vsis = 0; - ice_stop(vsi->netdev); + for (node = 0; node < ICE_MAX_VF_AGG_NODES; node++) + pf->vf_agg_node[node].num_vsis = 0; - if (!locked) - rtnl_unlock(); - } else { - ice_vsi_close(vsi); +#ifdef HAVE_NETDEV_SB_DEV + for (node = 0; node < ICE_MAX_MACVLAN_AGG_NODES; node++) + pf->macvlan_agg_node[node].num_vsis = 0; +#endif +} + +#ifdef HAVE_TC_SETUP_CLSFLOWER +/** + * ice_remove_tc_fltrs - clear TC filters configuration + * @pf: ptr to PF, TC-flower based filter are tracked at PF level + * @remove_from_list: true if filter to be removed from tc_fltr list + * + * Remove all advanced TC flower switch filters from software bookkeeping. + * If 'remove_from_list' parameter is set to true, all filters are also + * removed from PF's tc_flower_fltr_list list and it's not possible to + * restore them (after the reset for example). + */ +static void ice_remove_tc_fltrs(struct ice_pf *pf, bool remove_from_list) +{ + struct ice_tc_flower_fltr *fltr; + struct hlist_node *node2; + + hlist_for_each_entry_safe(fltr, node2, + &pf->tc_flower_fltr_list, + tc_flower_node) { + struct ice_adv_fltr_mgmt_list_entry *entry, *tmp; + struct list_head *list_head; + struct mutex *rule_lock; + struct ice_switch_info *sw; + + sw = pf->hw.switch_info; + if (!sw->recp_list[fltr->rid].recp_created) + continue; + rule_lock = &sw->recp_list[fltr->rid].filt_rule_lock; + list_head = &sw->recp_list[fltr->rid].filt_rules; + + list_for_each_entry_safe(entry, tmp, list_head, list_entry) { + if (entry->rule_info.fltr_rule_id == fltr->rule_id) { + mutex_lock(rule_lock); + list_del(&entry->list_entry); + devm_kfree(ice_pf_to_dev(pf), entry->lkups); + devm_kfree(ice_pf_to_dev(pf), entry); + mutex_unlock(rule_lock); + break; + } + } + + if (remove_from_list) { + hlist_del(&fltr->tc_flower_node); + kfree(fltr); } } } +#endif /* HAVE_TC_SETUP_CLSFLOWER */ /** - * ice_pf_dis_all_vsi - Pause all VSIs on a PF - * @pf: the PF - * @locked: is the rtnl_lock already held + * ice_clear_sw_switch_recipes - clear switch recipes + * @pf: board private structure + * + * Mark switch recipes as not created in sw structures. There are cases where + * rules (especially advanced rules) need to be restored, either re-read from + * hardware or added again. For example after the reset. 'recp_created' flag + * prevents from doing that and need to be cleared upfront. 
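ice_remove_tc_fltrs() above deletes entries while it is still walking the lists, which is why it uses the _safe iterator variants (hlist_for_each_entry_safe() and list_for_each_entry_safe()): the next pointer is captured before the current node can be freed. The same idiom on a plain singly linked list, outside the kernel list API:

#include <stdlib.h>

struct node {
	int rule_id;
	struct node *next;
};

/* Remove and free every node matching rule_id; "safe" because the next
 * pointer is saved before the current node may be freed.
 */
static void remove_matching(struct node **head, int rule_id)
{
	struct node **pp = head;

	while (*pp) {
		struct node *cur = *pp;
		struct node *next = cur->next;   /* capture before freeing */

		if (cur->rule_id == rule_id) {
			*pp = next;
			free(cur);
		} else {
			pp = &cur->next;
		}
	}
}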
*/ -#ifdef CONFIG_DCB -void ice_pf_dis_all_vsi(struct ice_pf *pf, bool locked) -#else -static void ice_pf_dis_all_vsi(struct ice_pf *pf, bool locked) -#endif /* CONFIG_DCB */ +static void ice_clear_sw_switch_recipes(struct ice_pf *pf) { - int v; + struct ice_sw_recipe *recp; + u8 i; - ice_for_each_vsi(pf, v) - if (pf->vsi[v]) - ice_dis_vsi(pf->vsi[v], locked); + recp = pf->hw.switch_info->recp_list; + for (i = 0; i < ICE_MAX_NUM_RECIPES; i++) + recp[i].recp_created = false; } /** - * ice_prepare_for_reset - prep for the core to reset + * ice_prepare_for_reset - prep for reset * @pf: board private structure + * @reset_type: reset type requested * * Inform or close all dependent features in prep for reset. */ static void -ice_prepare_for_reset(struct ice_pf *pf) +ice_prepare_for_reset(struct ice_pf *pf, enum ice_reset_req reset_type) { struct ice_hw *hw = &pf->hw; - int i; +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + struct ice_vsi *vsi; +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + unsigned int i; + + dev_dbg(ice_pf_to_dev(pf), "reset_type=%d\n", reset_type); /* already prepared for reset */ - if (test_bit(__ICE_PREPARED_FOR_RESET, pf->state)) + if (test_bit(ICE_PREPARED_FOR_RESET, pf->state)) return; /* Notify VFs of impending reset */ @@ -500,133 +893,335 @@ ice_prepare_for_reset(struct ice_pf *pf) ice_vc_notify_reset(pf); /* Disable VFs until reset is completed */ - for (i = 0; i < pf->num_alloc_vfs; i++) + ice_for_each_vf(pf, i) ice_set_vf_state_qs_dis(&pf->vf[i]); +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + /* release ADQ specific HW and SW resources */ + vsi = ice_get_main_vsi(pf); + if (!vsi) + goto skip; + + /* to be on safer size, reset orig_rss_size so that normal flow + * of deciding rss_size can take precedence + */ + vsi->orig_rss_size = 0; + + if (test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) { + if (reset_type == ICE_RESET_PFR) { + vsi->old_ena_tc = vsi->all_enatc; + vsi->old_numtc = vsi->all_numtc; + } else { + ice_remove_q_channels(vsi, true); + + /* for other reset type, do not support "rebuild + * of channel, hence reset needed info + */ + vsi->old_ena_tc = 0; + vsi->all_enatc = 0; + vsi->old_numtc = 0; + vsi->all_numtc = 0; + vsi->req_txq = 0; + vsi->req_rxq = 0; + clear_bit(ICE_FLAG_TC_MQPRIO, pf->flags); + memset(&vsi->mqprio_qopt, 0, sizeof(vsi->mqprio_qopt)); + } + } +skip: +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + + if (ice_is_eswitch_mode_switchdev(pf)) { +#ifdef HAVE_TC_SETUP_CLSFLOWER + ice_remove_tc_fltrs(pf, false); +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + if (reset_type != ICE_RESET_PFR) + ice_clear_sw_switch_recipes(pf); + } + /* clear SW filtering DB */ ice_clear_hw_tbls(hw); /* disable the VSIs and their queues that are not already DOWN */ ice_pf_dis_all_vsi(pf, false); + if (test_bit(ICE_FLAG_PTP_ENA, pf->flags)) + ice_ptp_release(pf); + if (hw->port_info) ice_sched_clear_port(hw->port_info); ice_shutdown_all_ctrlq(hw); - set_bit(__ICE_PREPARED_FOR_RESET, pf->state); + set_bit(ICE_PREPARED_FOR_RESET, pf->state); } + /** - * ice_do_reset - Initiate one of many types of resets - * @pf: board private structure - * @reset_type: reset type requested - * before this function was called. + * ice_print_recovery_msg - print recovery mode message + * @dev: pointer to the device instance */ -static void ice_do_reset(struct ice_pf *pf, enum ice_reset_req reset_type) +static void ice_print_recovery_msg(struct device *dev) { - struct device *dev = &pf->pdev->dev; - struct ice_hw *hw = &pf->hw; + dev_err(dev, "Firmware recovery mode detected. Limiting functionality. 
Refer to the Intel(R) Ethernet Adapters and Devices User Guide for details on firmware recovery mode\n"); +} - dev_dbg(dev, "reset_type 0x%x requested\n", reset_type); - WARN_ON(in_interrupt()); +/** + * ice_prepare_for_recovery_mode - prepare the driver for FW recovery mode + * @pf: pointer to the PF instance + */ +static void ice_prepare_for_recovery_mode(struct ice_pf *pf) +{ + struct ice_vsi *vsi; - ice_prepare_for_reset(pf); + ice_print_recovery_msg(ice_pf_to_dev(pf)); + set_bit(ICE_RECOVERY_MODE, pf->state); - /* trigger the reset */ - if (ice_reset(hw, reset_type)) { - dev_err(dev, "reset %d failed\n", reset_type); - set_bit(__ICE_RESET_FAILED, pf->state); - clear_bit(__ICE_RESET_OICR_RECV, pf->state); - clear_bit(__ICE_PREPARED_FOR_RESET, pf->state); - clear_bit(__ICE_PFR_REQ, pf->state); - clear_bit(__ICE_CORER_REQ, pf->state); - clear_bit(__ICE_GLOBR_REQ, pf->state); - return; + vsi = ice_get_main_vsi(pf); + if (vsi && vsi->netdev) { + ice_set_ethtool_recovery_ops(vsi->netdev); + netif_carrier_off(vsi->netdev); + netif_tx_stop_all_queues(vsi->netdev); } - /* PFR is a bit of a special case because it doesn't result in an OICR - * interrupt. So for PFR, rebuild after the reset and clear the reset- - * associated state bits. - */ - if (reset_type == ICE_RESET_PFR) { - pf->pfr_count++; - ice_rebuild(pf, reset_type); - clear_bit(__ICE_PREPARED_FOR_RESET, pf->state); - clear_bit(__ICE_PFR_REQ, pf->state); - ice_reset_all_vfs(pf, true); + if (test_bit(ICE_FLAG_SRIOV_ENA, pf->flags)) + if (!pci_vfs_assigned(pf->pdev)) + pci_disable_sriov(pf->pdev); + + if (pf->ptp.clock) + ptp_clock_unregister(pf->ptp.clock); + + set_bit(ICE_PREPPED_RECOVERY_MODE, pf->state); +} + +/** + * ice_remove_recovery_mode - Unload helper when in FW recovery mode + * @pf: pointer to the PF instance + */ +static void ice_remove_recovery_mode(struct ice_pf *pf) +{ + struct ice_vsi *vsi = ice_get_main_vsi(pf); + struct device *dev = ice_pf_to_dev(pf); + + if (vsi && vsi->netdev) { + unregister_netdev(vsi->netdev); + clear_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state); + free_netdev(vsi->netdev); + clear_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state); + devm_kfree(dev, vsi); } + + ice_reset(&pf->hw, ICE_RESET_PFR); + pci_disable_pcie_error_reporting(pf->pdev); + ice_devlink_unregister(pf); } /** - * ice_reset_subtask - Set up for resetting the device and driver - * @pf: board private structure + * ice_probe_recovery_mode - Load helper when in FW recovery mode + * @pf: pointer to the PF instance */ -static void ice_reset_subtask(struct ice_pf *pf) +static int ice_probe_recovery_mode(struct ice_pf *pf) { - enum ice_reset_req reset_type = ICE_RESET_INVAL; + struct device *dev = ice_pf_to_dev(pf); + struct ice_netdev_priv *np; + struct net_device *netdev; + struct ice_vsi *vsi; + int err; - /* When a CORER/GLOBR/EMPR is about to happen, the hardware triggers an + ice_print_recovery_msg(dev); + set_bit(ICE_RECOVERY_MODE, pf->state); + + /* create one single VSI instance and netdev to allow for ethtool + * recovery ops. This VSI cannot be backed by a VSI in the HW as + * the FW is in recovery mode. 
Thus, no traffic is possible on this + * VSI/netdev + */ + pf->vsi = devm_kcalloc(dev, 1, sizeof(*pf->vsi), GFP_KERNEL); + if (!pf->vsi) + return -ENOMEM; + + vsi = devm_kzalloc(dev, sizeof(*vsi), GFP_KERNEL); + if (!vsi) { + err = -ENOMEM; + goto err_vsi; + } + + pf->vsi[0] = vsi; + vsi->back = pf; + + /* allocate an etherdev with 1 queue pair */ + netdev = alloc_etherdev(sizeof(*np)); + if (!netdev) { + err = -ENOMEM; + goto err_netdev; + } + + set_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state); + vsi->netdev = netdev; + np = netdev_priv(netdev); + np->vsi = vsi; + SET_NETDEV_DEV(netdev, dev); + eth_hw_addr_random(netdev); + + netdev->netdev_ops = &ice_netdev_recovery_ops; + ice_set_ethtool_recovery_ops(netdev); + + err = register_netdev(netdev); + if (err) + goto err_register; + + set_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state); + netif_carrier_off(netdev); + netif_tx_stop_all_queues(netdev); + + return 0; + +err_register: + free_netdev(netdev); + clear_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state); +err_netdev: + devm_kfree(dev, vsi); +err_vsi: + devm_kfree(dev, pf->vsi); + return err; +} + +/** + * ice_do_reset - Initiate one of many types of resets + * @pf: board private structure + * @reset_type: reset type requested before this function was called. + */ +static void ice_do_reset(struct ice_pf *pf, enum ice_reset_req reset_type) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + + dev_dbg(dev, "reset_type 0x%x requested\n", reset_type); + + ice_prepare_for_reset(pf, reset_type); + + /* trigger the reset */ + if (ice_reset(hw, reset_type)) { + dev_err(dev, "reset %d failed\n", reset_type); + set_bit(ICE_RESET_FAILED, pf->state); + clear_bit(ICE_RESET_OICR_RECV, pf->state); + clear_bit(ICE_PREPARED_FOR_RESET, pf->state); + clear_bit(ICE_PFR_REQ, pf->state); + clear_bit(ICE_CORER_REQ, pf->state); + clear_bit(ICE_GLOBR_REQ, pf->state); + wake_up(&pf->reset_wait_queue); + return; + } + + /* PFR is a bit of a special case because it doesn't result in an OICR + * interrupt. So for PFR, rebuild after the reset and clear the reset- + * associated state bits. + */ + if (reset_type == ICE_RESET_PFR) { + pf->pfr_count++; + ice_rebuild(pf, reset_type); + clear_bit(ICE_PREPARED_FOR_RESET, pf->state); + clear_bit(ICE_PFR_REQ, pf->state); + wake_up(&pf->reset_wait_queue); + ice_reset_all_vfs(pf, true); + } +} + +/** + * ice_reset_subtask - Set up for resetting the device and driver + * @pf: board private structure + */ +static void ice_reset_subtask(struct ice_pf *pf) +{ + enum ice_reset_req reset_type = ICE_RESET_INVAL; + + /* When a CORER/GLOBR/EMPR is about to happen, the hardware triggers an * OICR interrupt. The OICR handler (ice_misc_intr) determines what type * of reset is pending and sets bits in pf->state indicating the reset - * type and __ICE_RESET_OICR_RECV. So, if the latter bit is set + * type and ICE_RESET_OICR_RECV. So, if the latter bit is set * prepare for pending reset if not already (for PF software-initiated * global resets the software should already be prepared for it as - * indicated by __ICE_PREPARED_FOR_RESET; for global resets initiated + * indicated by ICE_PREPARED_FOR_RESET; for global resets initiated * by firmware or software on other PFs, that bit is not set so prepare * for the reset now), poll for reset done, rebuild and return. 
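 * The largest pending reset type wins: an EMPR request overrides GLOBR,
 * which in turn overrides CORER.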
*/ - if (test_bit(__ICE_RESET_OICR_RECV, pf->state)) { + if (test_bit(ICE_RESET_OICR_RECV, pf->state)) { /* Perform the largest reset requested */ - if (test_and_clear_bit(__ICE_CORER_RECV, pf->state)) + if (test_and_clear_bit(ICE_CORER_RECV, pf->state)) reset_type = ICE_RESET_CORER; - if (test_and_clear_bit(__ICE_GLOBR_RECV, pf->state)) + if (test_and_clear_bit(ICE_GLOBR_RECV, pf->state)) reset_type = ICE_RESET_GLOBR; - if (test_and_clear_bit(__ICE_EMPR_RECV, pf->state)) + if (test_and_clear_bit(ICE_EMPR_RECV, pf->state)) reset_type = ICE_RESET_EMPR; /* return if no valid reset type requested */ if (reset_type == ICE_RESET_INVAL) return; - ice_prepare_for_reset(pf); + if (ice_is_peer_ena(pf)) + ice_for_each_peer(pf, &reset_type, + ice_close_peer_for_reset); + ice_prepare_for_reset(pf, reset_type); /* make sure we are ready to rebuild */ if (ice_check_reset(&pf->hw)) { - set_bit(__ICE_RESET_FAILED, pf->state); - } else { - /* done with reset. start rebuild */ - pf->hw.reset_ongoing = false; - ice_rebuild(pf, reset_type); - /* clear bit to resume normal operations, but - * ICE_NEEDS_RESTART bit is set in case rebuild failed - */ - clear_bit(__ICE_RESET_OICR_RECV, pf->state); - clear_bit(__ICE_PREPARED_FOR_RESET, pf->state); - clear_bit(__ICE_PFR_REQ, pf->state); - clear_bit(__ICE_CORER_REQ, pf->state); - clear_bit(__ICE_GLOBR_REQ, pf->state); - ice_reset_all_vfs(pf, true); + set_bit(ICE_RESET_FAILED, pf->state); + clear_bit(ICE_RESET_OICR_RECV, pf->state); + clear_bit(ICE_PREPARED_FOR_RESET, pf->state); + clear_bit(ICE_PFR_REQ, pf->state); + clear_bit(ICE_CORER_REQ, pf->state); + clear_bit(ICE_GLOBR_REQ, pf->state); + wake_up(&pf->reset_wait_queue); + if (ice_get_fw_mode(&pf->hw) == ICE_FW_MODE_REC) + ice_prepare_for_recovery_mode(pf); + return; } + + + /* came out of reset. check if an NVM rollback happened */ + if (ice_get_fw_mode(&pf->hw) == ICE_FW_MODE_ROLLBACK) + ice_print_rollback_msg(&pf->hw); + + /* done with reset. start rebuild */ + pf->hw.reset_ongoing = false; + ice_rebuild(pf, reset_type); + /* clear bit to resume normal operations, but + * ICE_NEEDS_RESTART bit is set in case rebuild failed + */ + clear_bit(ICE_RESET_OICR_RECV, pf->state); + clear_bit(ICE_PREPARED_FOR_RESET, pf->state); + clear_bit(ICE_PFR_REQ, pf->state); + clear_bit(ICE_CORER_REQ, pf->state); + clear_bit(ICE_GLOBR_REQ, pf->state); + wake_up(&pf->reset_wait_queue); + ice_reset_all_vfs(pf, true); return; } /* No pending resets to finish processing. 
Check for new resets */ - if (test_bit(__ICE_PFR_REQ, pf->state)) + if (test_bit(ICE_PFR_REQ, pf->state)) reset_type = ICE_RESET_PFR; - if (test_bit(__ICE_CORER_REQ, pf->state)) + if (test_bit(ICE_CORER_REQ, pf->state)) reset_type = ICE_RESET_CORER; - if (test_bit(__ICE_GLOBR_REQ, pf->state)) + if (test_bit(ICE_GLOBR_REQ, pf->state)) reset_type = ICE_RESET_GLOBR; /* If no valid reset type requested just return */ if (reset_type == ICE_RESET_INVAL) return; /* reset if not already down or busy */ - if (!test_bit(__ICE_DOWN, pf->state) && - !test_bit(__ICE_CFG_BUSY, pf->state)) { + if (!test_bit(ICE_DOWN, pf->state) && + !test_bit(ICE_CFG_BUSY, pf->state)) { ice_do_reset(pf, reset_type); } } + +/** + * ice_sync_udp_fltr_subtask - sync the VSI filter list with HW + * @pf: board private structure + */ +static void ice_sync_udp_fltr_subtask(struct ice_pf __always_unused *pf) +{ +} + /** * ice_print_topo_conflict - print topology conflict message * @vsi: the VSI whose topology status is being checked @@ -636,7 +1231,16 @@ static void ice_print_topo_conflict(struct ice_vsi *vsi) switch (vsi->port_info->phy.link_info.topo_media_conflict) { case ICE_AQ_LINK_TOPO_CONFLICT: case ICE_AQ_LINK_MEDIA_CONFLICT: - netdev_info(vsi->netdev, "Possible mis-configuration of the Ethernet port detected, please use the Intel(R) Ethernet Port Configuration Tool application to address the issue.\n"); + case ICE_AQ_LINK_TOPO_UNREACH_PRT: + case ICE_AQ_LINK_TOPO_UNDRUTIL_PRT: + case ICE_AQ_LINK_TOPO_UNDRUTIL_MEDIA: + netdev_info(vsi->netdev, "Potential misconfiguration of the Ethernet port detected. If it was not intended, please use the Intel (R) Ethernet Port Configuration Tool to address the issue.\n"); + break; + case ICE_AQ_LINK_TOPO_UNSUPP_MEDIA: + if (test_bit(ICE_FLAG_LINK_LENIENT_MODE_ENA, vsi->back->flags)) + netdev_warn(vsi->netdev, "An unsupported module type was detected. Refer to the Intel(R) Ethernet Adapters and Devices User Guide for a list of supported modules\n"); + else + netdev_err(vsi->netdev, "Rx/Tx is disabled on this device because an unsupported module type was detected. 
Refer to the Intel(R) Ethernet Adapters and Devices User Guide for a list of supported modules.\n"); break; default: break; @@ -651,6 +1255,7 @@ static void ice_print_topo_conflict(struct ice_vsi *vsi) void ice_print_link_msg(struct ice_vsi *vsi, bool isup) { struct ice_aqc_get_phy_caps_data *caps; + const char *an_advertised; enum ice_status status; const char *fec_req; const char *speed; @@ -703,7 +1308,7 @@ void ice_print_link_msg(struct ice_vsi *vsi, bool isup) speed = "100 M"; break; default: - speed = "Unknown"; + speed = "Unknown "; break; } @@ -728,7 +1333,6 @@ void ice_print_link_msg(struct ice_vsi *vsi, bool isup) /* Get FEC mode based on negotiated link info */ switch (vsi->port_info->phy.link_info.fec_info) { case ICE_AQ_LINK_25G_RS_528_FEC_EN: - /* fall through */ case ICE_AQ_LINK_25G_RS_544_FEC_EN: fec = "RS-FEC"; break; @@ -747,17 +1351,20 @@ void ice_print_link_msg(struct ice_vsi *vsi, bool isup) an = "False"; /* Get FEC mode requested based on PHY caps last SW configuration */ - caps = devm_kzalloc(&vsi->back->pdev->dev, sizeof(*caps), GFP_KERNEL); + caps = kzalloc(sizeof(*caps), GFP_KERNEL); if (!caps) { fec_req = "Unknown"; + an_advertised = "Unknown"; goto done; } status = ice_aq_get_phy_caps(vsi->port_info, false, - ICE_AQC_REPORT_SW_CFG, caps, NULL); + ICE_AQC_REPORT_ACTIVE_CFG, caps, NULL); if (status) netdev_info(vsi->netdev, "Get phy capability failed.\n"); + an_advertised = ice_is_phy_caps_an_enabled(caps) ? "On" : "Off"; + if (caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_528_REQ || caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_544_REQ) fec_req = "RS-FEC"; @@ -767,11 +1374,10 @@ void ice_print_link_msg(struct ice_vsi *vsi, bool isup) else fec_req = "NONE"; - devm_kfree(&vsi->back->pdev->dev, caps); - + kfree(caps); done: - netdev_info(vsi->netdev, "NIC Link is up %sbps, Requested FEC: %s, FEC: %s, Autoneg: %s, Flow Control: %s\n", - speed, fec_req, fec, an, fc); + netdev_info(vsi->netdev, "NIC Link is up %sbps Full Duplex, Requested FEC: %s, Negotiated FEC: %s, Autoneg Advertised: %s, Autoneg Negotiated: %s, Flow Control: %s\n", + speed, fec_req, fec, an_advertised, an, fc); ice_print_topo_conflict(vsi); } @@ -785,7 +1391,7 @@ static void ice_vsi_link_event(struct ice_vsi *vsi, bool link_up) if (!vsi) return; - if (test_bit(__ICE_DOWN, vsi->state) || !vsi->netdev) + if (test_bit(ICE_VSI_DOWN, vsi->state) || !vsi->netdev) return; if (vsi->type == ICE_VSI_PF) { @@ -802,6 +1408,127 @@ static void ice_vsi_link_event(struct ice_vsi *vsi, bool link_up) } } + +/** + * ice_set_dflt_mib - send a default config MIB to the FW + * @pf: private PF struct + * + * This function sends a default configuration MIB to the FW. + * + * If this function errors out at any point, the driver is still able to + * function. The main impact is that LFC may not operate as expected. + * Therefore an error state in this function should be treated with a DBG + * message and continue on with driver rebuild/reenable. 
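+ *
+ * The MIB assembled below carries three TLVs: an ETS configuration TLV
+ * (all UPs mapped to TC 0 with 100% bandwidth assigned to TC 0), an ETS
+ * recommendation TLV with the same mapping, and a PFC configuration TLV
+ * with PFC disabled on every priority.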
+ */ +static void ice_set_dflt_mib(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + u8 mib_type, *buf, *lldpmib = NULL; + u16 len, typelen, offset = 0; + struct ice_lldp_org_tlv *tlv; + struct ice_hw *hw = &pf->hw; + u32 ouisubtype; + + mib_type = SET_LOCAL_MIB_TYPE_LOCAL_MIB; + lldpmib = kzalloc(ICE_LLDPDU_SIZE, GFP_KERNEL); + if (!lldpmib) { + dev_dbg(dev, "%s Failed to allocate MIB memory\n", + __func__); + return; + } + + /* Add ETS CFG TLV */ + tlv = (struct ice_lldp_org_tlv *)lldpmib; + typelen = ((ICE_TLV_TYPE_ORG << ICE_LLDP_TLV_TYPE_S) | + ICE_IEEE_ETS_TLV_LEN); + tlv->typelen = htons(typelen); + ouisubtype = ((ICE_IEEE_8021QAZ_OUI << ICE_LLDP_TLV_OUI_S) | + ICE_IEEE_SUBTYPE_ETS_CFG); + tlv->ouisubtype = htonl(ouisubtype); + + buf = tlv->tlvinfo; + buf[0] = 0; + + /* ETS CFG all UPs map to TC 0. Next 4 (1 - 4) Octets = 0. + * Octets 5 - 12 are BW values, set octet 5 to 100% BW. + * Octets 13 - 20 are TSA values - leave as zeros + */ + buf[5] = 0x64; + len = (typelen & ICE_LLDP_TLV_LEN_M) >> ICE_LLDP_TLV_LEN_S; + offset += len + 2; + tlv = (struct ice_lldp_org_tlv *) + ((char *)tlv + sizeof(tlv->typelen) + len); + + /* Add ETS REC TLV */ + buf = tlv->tlvinfo; + tlv->typelen = htons(typelen); + + ouisubtype = ((ICE_IEEE_8021QAZ_OUI << ICE_LLDP_TLV_OUI_S) | + ICE_IEEE_SUBTYPE_ETS_REC); + tlv->ouisubtype = htonl(ouisubtype); + + /* First octet of buf is reserved + * Octets 1 - 4 map UP to TC - all UPs map to zero + * Octets 5 - 12 are BW values - set TC 0 to 100%. + * Octets 13 - 20 are TSA value - leave as zeros + */ + buf[5] = 0x64; + offset += len + 2; + tlv = (struct ice_lldp_org_tlv *) + ((char *)tlv + sizeof(tlv->typelen) + len); + + /* Add PFC CFG TLV */ + typelen = ((ICE_TLV_TYPE_ORG << ICE_LLDP_TLV_TYPE_S) | + ICE_IEEE_PFC_TLV_LEN); + tlv->typelen = htons(typelen); + + ouisubtype = ((ICE_IEEE_8021QAZ_OUI << ICE_LLDP_TLV_OUI_S) | + ICE_IEEE_SUBTYPE_PFC_CFG); + tlv->ouisubtype = htonl(ouisubtype); + + /* Octet 1 left as all zeros - PFC disabled */ + buf[0] = 0x08; + len = (typelen & ICE_LLDP_TLV_LEN_M) >> ICE_LLDP_TLV_LEN_S; + offset += len + 2; + + if (ice_aq_set_lldp_mib(hw, mib_type, (void *)lldpmib, offset, NULL)) + dev_dbg(dev, "%s Failed to set default LLDP MIB\n", __func__); + + kfree(lldpmib); +} + +/** + * ice_check_module_power + * @pf: pointer to PF struct + * @link_cfg_err: bitmap from the link info structure + * + * check module power level returned by a previous call to aq_get_link_info + * and print error messages if module power level is not supported + */ +static void ice_check_module_power(struct ice_pf *pf, u8 link_cfg_err) +{ + /* if module power level is supported, clear the flag */ + if (!(link_cfg_err & (ICE_AQ_LINK_INVAL_MAX_POWER_LIMIT | + ICE_AQ_LINK_MODULE_POWER_UNSUPPORTED))) { + clear_bit(ICE_FLAG_MOD_POWER_UNSUPPORTED, pf->flags); + return; + } + + /* if ICE_FLAG_MOD_POWER_UNSUPPORTED was previously set and the + * above block didn't clear this bit, there's nothing to do + */ + if (test_bit(ICE_FLAG_MOD_POWER_UNSUPPORTED, pf->flags)) + return; + + if (link_cfg_err & ICE_AQ_LINK_INVAL_MAX_POWER_LIMIT) { + dev_err(ice_pf_to_dev(pf), "The installed module is incompatible with the device's NVM image. Cannot start link\n"); + set_bit(ICE_FLAG_MOD_POWER_UNSUPPORTED, pf->flags); + } else if (link_cfg_err & ICE_AQ_LINK_MODULE_POWER_UNSUPPORTED) { + dev_err(ice_pf_to_dev(pf), "The module's power requirements exceed the device's power supply. 
Cannot start link\n"); + set_bit(ICE_FLAG_MOD_POWER_UNSUPPORTED, pf->flags); + } +} + /** * ice_link_event - process the link event * @pf: PF that the link event is associated with @@ -815,11 +1542,12 @@ static int ice_link_event(struct ice_pf *pf, struct ice_port_info *pi, bool link_up, u16 link_speed) { + struct device *dev = ice_pf_to_dev(pf); struct ice_phy_info *phy_info; + enum ice_status status; struct ice_vsi *vsi; u16 old_link_speed; bool old_link; - int result; phy_info = &pi->phy; phy_info->link_info_old = phy_info->link_info; @@ -830,15 +1558,19 @@ ice_link_event(struct ice_pf *pf, struct ice_port_info *pi, bool link_up, /* update the link info structures and re-enable link events, * don't bail on failure due to other book keeping needed */ - result = ice_update_link_info(pi); - if (result) - dev_dbg(&pf->pdev->dev, - "Failed to update link status and re-enable link events for port %d\n", - pi->lport); + status = ice_update_link_info(pi); + if (status) + dev_dbg(dev, "Failed to update link status on port %d, err %s aq_err %s\n", + pi->lport, ice_stat_str(status), + ice_aq_str(pi->hw->adminq.sq_last_status)); - /* if the old link up/down and speed is the same as the new */ - if (link_up == old_link && link_speed == old_link_speed) - return result; + ice_check_module_power(pf, pi->phy.link_info.link_cfg_err); + + /* Check if the link state is up after updating link info, and treat + * this event as an UP event since the link is actually UP now. + */ + if (phy_info->link_info.link_info & ICE_AQ_LINK_UP) + link_up = true; vsi = ice_get_main_vsi(pf); if (!vsi || !vsi->port_info) @@ -848,23 +1580,31 @@ ice_link_event(struct ice_pf *pf, struct ice_port_info *pi, bool link_up, if (!test_bit(ICE_FLAG_NO_MEDIA, pf->flags) && !(pi->phy.link_info.link_info & ICE_AQ_MEDIA_AVAILABLE)) { set_bit(ICE_FLAG_NO_MEDIA, pf->flags); - - result = ice_aq_set_link_restart_an(pi, false, NULL); - if (result) { - dev_dbg(&pf->pdev->dev, - "Failed to set link down, VSI %d error %d\n", - vsi->vsi_num, result); - return result; - } + ice_set_link(vsi, false); } + + /* if the old link up/down and speed is the same as the new */ + if (link_up == old_link && link_speed == old_link_speed) + return 0; + + if (!ice_is_e810(&pf->hw)) + ice_ptp_link_change(pf, pf->hw.pf_id, link_up); + + if (ice_is_dcb_active(pf)) { + if (test_bit(ICE_FLAG_DCB_ENA, pf->flags)) + ice_dcb_rebuild(pf); + } else { + if (link_up) + ice_set_dflt_mib(pf); + } ice_vsi_link_event(vsi, link_up); ice_print_link_msg(vsi, link_up); - if (pf->num_alloc_vfs) - ice_vc_notify_link_state(pf); + ice_vc_notify_link_state(pf); - return result; + + return 0; } /** @@ -876,8 +1616,8 @@ static void ice_watchdog_subtask(struct ice_pf *pf) int i; /* if interface is down do nothing */ - if (test_bit(__ICE_DOWN, pf->state) || - test_bit(__ICE_CFG_BUSY, pf->state)) + if (test_bit(ICE_DOWN, pf->state) || + test_bit(ICE_CFG_BUSY, pf->state)) return; /* make sure we don't do these things too often */ @@ -886,14 +1626,16 @@ static void ice_watchdog_subtask(struct ice_pf *pf) return; pf->serv_tmr_prev = jiffies; - + if (!ice_is_e810(&pf->hw)) + ice_ptp_set_timestamp_offsets(pf); /* Update the stats for active netdevs so the network stack * can look at updated numbers whenever it cares to */ ice_update_pf_stats(pf); - ice_for_each_vsi(pf, i) + ice_for_each_vsi(pf, i) { if (pf->vsi[i] && pf->vsi[i]->netdev) ice_update_vsi_stats(pf->vsi[i]); + } } /** @@ -906,19 +1648,23 @@ static int ice_init_link_events(struct ice_port_info *pi) { u16 mask; + if 
(test_bit(ICE_BAD_EEPROM, + ((struct ice_pf *)pi->hw->back)->state)) { + dev_err(ice_hw_to_dev(pi->hw), "Link events disabled due to corrupted eeprom\n"); + return 0; + } + mask = ~((u16)(ICE_AQ_LINK_EVENT_UPDOWN | ICE_AQ_LINK_EVENT_MEDIA_NA | ICE_AQ_LINK_EVENT_MODULE_QUAL_FAIL)); if (ice_aq_set_event_mask(pi->hw, pi->lport, mask, NULL)) { - dev_dbg(ice_hw_to_dev(pi->hw), - "Failed to set link event mask for port %d\n", + dev_dbg(ice_hw_to_dev(pi->hw), "Failed to set link event mask for port %d\n", pi->lport); return -EIO; } if (ice_aq_get_link_info(pi, true, NULL, NULL)) { - dev_dbg(ice_hw_to_dev(pi->hw), - "Failed to enable link events for port %d\n", + dev_dbg(ice_hw_to_dev(pi->hw), "Failed to enable link events for port %d\n", pi->lport); return -EIO; } @@ -947,138 +1693,522 @@ ice_handle_link_event(struct ice_pf *pf, struct ice_rq_event_info *event) !!(link_data->link_info & ICE_AQ_LINK_UP), le16_to_cpu(link_data->link_speed)); if (status) - dev_dbg(&pf->pdev->dev, - "Could not process link event, error %d\n", status); + dev_dbg(ice_pf_to_dev(pf), "Could not process link event, error %d\n", + status); return status; } + /** - * __ice_clean_ctrlq - helper function to clean controlq rings - * @pf: ptr to struct ice_pf - * @q_type: specific Control queue type + * ice_print_health_status_string - Print message for given FW health event + * @pf: pointer to the PF structure + * @hse: pointer to the health status element containing the health status code + * + * Print the error/diagnostic string in response to the given Health Status + * Event code. */ -static int __ice_clean_ctrlq(struct ice_pf *pf, enum ice_ctl_q q_type) +static void +ice_print_health_status_string(struct ice_pf *pf, + struct ice_aqc_health_status_elem *hse) { - struct ice_rq_event_info event; - struct ice_hw *hw = &pf->hw; - struct ice_ctl_q_info *cq; - u16 pending, i = 0; - const char *qtype; - u32 oldval, val; + struct ice_vsi *vsi = ice_get_main_vsi(pf); + u32 internal_data1, internal_data2; + struct net_device *netdev; + u16 status_code; - /* Do not clean control queue if/when PF reset fails */ - if (test_bit(__ICE_RESET_FAILED, pf->state)) - return 0; + if (!hse || !vsi) + return; - switch (q_type) { - case ICE_CTL_Q_ADMIN: - cq = &hw->adminq; - qtype = "Admin"; + netdev = vsi->netdev; + status_code = le16_to_cpu(hse->health_status_code); + internal_data1 = le32_to_cpu(hse->internal_data1); + internal_data2 = le32_to_cpu(hse->internal_data2); + + switch (status_code) { + case ICE_AQC_HEALTH_STATUS_INFO_RECOVERY: + netdev_info(netdev, "The device is in firmware recovery mode.\n"); + netdev_info(netdev, "Possible Solution: Update to the latest NVM image.\n"); + netdev_info(netdev, "Extended Error: 0x%08x.\n", + internal_data1); break; - case ICE_CTL_Q_MAILBOX: - cq = &hw->mailboxq; - qtype = "Mailbox"; + case ICE_AQC_HEALTH_STATUS_ERR_FLASH_ACCESS: + netdev_err(netdev, "The flash chip cannot be accessed.\n"); + netdev_err(netdev, "Possible Solution: If issue persists, call customer support.\n"); + netdev_err(netdev, "Access Type: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_NVM_AUTH: + netdev_err(netdev, "NVM authentication failed.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + break; + case ICE_AQC_HEALTH_STATUS_ERR_OROM_AUTH: + netdev_err(netdev, "Option ROM authentication failed.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + break; + case ICE_AQC_HEALTH_STATUS_ERR_DDP_AUTH: + netdev_err(netdev, "DDP package 
failed.\n"); + netdev_err(netdev, "Possible Solution: Update to latest base driver and DDP package.\n"); + break; + case ICE_AQC_HEALTH_STATUS_ERR_NVM_COMPAT: + netdev_err(netdev, "NVM image is incompatible.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + break; + case ICE_AQC_HEALTH_STATUS_ERR_OROM_COMPAT: + netdev_err(netdev, "Option ROM is incompatible.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + netdev_err(netdev, "Expected PCI Device Id: 0x%08x and Expected Module Id: 0x%08x.\n", + internal_data1, internal_data2); + break; + case ICE_AQC_HEALTH_STATUS_ERR_DCB_MIB: + netdev_err(netdev, "Supplied MIB file is invalid. DCB reverted to default configuration.\n"); + netdev_err(netdev, "Possible Solution: Disable FW-LLDP and check DCBx system configuration.\n"); + netdev_err(netdev, "Port Number: %d and MIB Id: %d.\n", + internal_data1, internal_data2); + break; + case ICE_AQC_HEALTH_STATUS_ERR_UNKNOWN_MOD_STRICT: + case ICE_AQC_HEALTH_STATUS_ERR_UNKNOWN_MOD_LENIENT: + netdev_err(netdev, "An unsupported module was detected.\n"); + netdev_err(netdev, "Possible Solution 1: Check your cable connection.\n"); + netdev_err(netdev, "Possible Solution 2: Change or replace the module or cable.\n"); + netdev_err(netdev, "Possible Solution 3: Manually set speed and duplex.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_MOD_TYPE: + netdev_err(netdev, "Module type is not supported.\n"); + netdev_err(netdev, "Possible Solution: Change or replace the module or cable.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_MOD_QUAL: + netdev_err(netdev, "Module is not qualified.\n"); + netdev_err(netdev, "Possible Solution 1: Check your cable connection.\n"); + netdev_err(netdev, "Possible Solution 2: Change or replace the module or cable.\n"); + netdev_err(netdev, "Possible Solution 3: Manually set speed and duplex.\n"); + netdev_err(netdev, "Port Number: %d\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_MOD_COMM: + netdev_err(netdev, "Device cannot communicate with the module.\n"); + netdev_err(netdev, "Possible Solution 1: Check your cable connection.\n"); + netdev_err(netdev, "Possible Solution 2: Change or replace the module or cable.\n"); + netdev_err(netdev, "Possible Solution 3: Manually set speed and duplex.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_MOD_CONFLICT: + netdev_err(netdev, "Unresolved module conflict.\n"); + netdev_err(netdev, "Possible Solution 1: Manually set speed/duplex or use Intel(R) Ethernet Port Configuration Tool to change the port option.\n"); + netdev_err(netdev, "Possible Solution 2: If the problem persists, use a cable/module that is found in the supported modules and cables list for this device.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_MOD_NOT_PRESENT: + netdev_err(netdev, "Module is not present.\n"); + netdev_err(netdev, "Possible Solution 1: Check that the module is inserted correctly.\n"); + netdev_err(netdev, "Possible Solution 2: If the problem persists, use a cable/module that is found in the supported modules and cables list for this device.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_INFO_MOD_UNDERUTILIZED: + netdev_info(netdev, "Underutilized module.\n"); + netdev_info(netdev, 
"Possible Solution 1: Change or replace the module or cable.\n"); + netdev_info(netdev, "Possible Solution 2: Use Intel(R) Ethernet Port Configuration Tool to change the port option.\n"); + netdev_info(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_INVALID_LINK_CFG: + netdev_err(netdev, "Invalid link configuration.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_PORT_ACCESS: + netdev_err(netdev, "Port hardware access error.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_PORT_UNREACHABLE: + netdev_err(netdev, "A port is unreachable.\n"); + netdev_err(netdev, "Possible Solution 1: Use Intel(R) Ethernet Port Configuration Tool to change the port option.\n"); + netdev_err(netdev, "Possible Solution 2: Update to the latest NVM image.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_INFO_PORT_SPEED_MOD_LIMITED: + netdev_info(netdev, "Port speed is limited due to module.\n"); + netdev_info(netdev, "Possible Solution: Change the module or use Intel(R) Ethernet Port Configuration Tool to configure the port option to match the current module speed.\n"); + netdev_info(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_PARALLEL_FAULT: + netdev_err(netdev, "A parallel fault was detected.\n"); + netdev_err(netdev, "Possible Solution: Check link partner connection and configuration.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_INFO_PORT_SPEED_PHY_LIMITED: + netdev_info(netdev, "Port speed is limited by PHY capabilities.\n"); + netdev_info(netdev, "Possible Solution 1: Change the module to align to port option.\n"); + netdev_info(netdev, "Possible Solution 2: Use Intel(R) Ethernet Port Configuration Tool to change the port option.\n"); + netdev_info(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_NETLIST_TOPO: + netdev_err(netdev, "LOM topology netlist is corrupted.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_NETLIST: + netdev_err(netdev, "Unrecoverable netlist error.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_TOPO_CONFLICT: + netdev_err(netdev, "Port topology conflict.\n"); + netdev_err(netdev, "Possible Solution 1: Use Intel(R) Ethernet Port Configuration Tool to change the port option.\n"); + netdev_err(netdev, "Possible Solution 2: Update to the latest NVM image.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_LINK_HW_ACCESS: + netdev_err(netdev, "Unrecoverable hardware access error.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_LINK_RUNTIME: + netdev_err(netdev, "Unrecoverable runtime error.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_DNL_INIT: + netdev_err(netdev, 
"Link management engine failed to initialize.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); break; default: - dev_warn(&pf->pdev->dev, "Unknown control queue type 0x%x\n", - q_type); - return 0; + break; } +} - /* check for error indications - PF_xx_AxQLEN register layout for - * FW/MBX/SB are identical so just use defines for PF_FW_AxQLEN. +/** + * ice_process_health_status_event - Process the health status event from FW + * @pf: pointer to the PF structure + * @event: event structure containing the Health Status Event opcode + * + * Decode the Health Status Events and print the associated messages + */ +static void ice_process_health_status_event(struct ice_pf *pf, + struct ice_rq_event_info *event) +{ + struct ice_aqc_health_status_elem *health_info; + u16 count; + int i; + + health_info = (struct ice_aqc_health_status_elem *)event->msg_buf; + count = le16_to_cpu(event->desc.params.get_health_status.health_status_count); + + /* In practice it's rare to encounter this event, and even more uncommon + * to observe multiple health status elements in a single message, but + * there's no boundary defined that separates an unlikely scenario from + * an erroneous one. If the count reported by the firmware is clearly + * incorrect then don't process the message and return. */ - val = rd32(hw, cq->rq.len); - if (val & (PF_FW_ARQLEN_ARQVFE_M | PF_FW_ARQLEN_ARQOVFL_M | - PF_FW_ARQLEN_ARQCRIT_M)) { - oldval = val; - if (val & PF_FW_ARQLEN_ARQVFE_M) - dev_dbg(&pf->pdev->dev, - "%s Receive Queue VF Error detected\n", qtype); - if (val & PF_FW_ARQLEN_ARQOVFL_M) { - dev_dbg(&pf->pdev->dev, - "%s Receive Queue Overflow Error detected\n", - qtype); - } - if (val & PF_FW_ARQLEN_ARQCRIT_M) - dev_dbg(&pf->pdev->dev, - "%s Receive Queue Critical Error detected\n", - qtype); - val &= ~(PF_FW_ARQLEN_ARQVFE_M | PF_FW_ARQLEN_ARQOVFL_M | - PF_FW_ARQLEN_ARQCRIT_M); - if (oldval != val) - wr32(hw, cq->rq.len, val); + if (count > (event->buf_len / sizeof(*health_info))) { + dev_err(ice_pf_to_dev(pf), "Received a health status event with invalid element count\n"); + return; } - val = rd32(hw, cq->sq.len); - if (val & (PF_FW_ATQLEN_ATQVFE_M | PF_FW_ATQLEN_ATQOVFL_M | - PF_FW_ATQLEN_ATQCRIT_M)) { - oldval = val; - if (val & PF_FW_ATQLEN_ATQVFE_M) - dev_dbg(&pf->pdev->dev, - "%s Send Queue VF Error detected\n", qtype); - if (val & PF_FW_ATQLEN_ATQOVFL_M) { - dev_dbg(&pf->pdev->dev, - "%s Send Queue Overflow Error detected\n", - qtype); - } - if (val & PF_FW_ATQLEN_ATQCRIT_M) - dev_dbg(&pf->pdev->dev, - "%s Send Queue Critical Error detected\n", - qtype); - val &= ~(PF_FW_ATQLEN_ATQVFE_M | PF_FW_ATQLEN_ATQOVFL_M | - PF_FW_ATQLEN_ATQCRIT_M); - if (oldval != val) - wr32(hw, cq->sq.len, val); + for (i = 0; i < count; i++) { + ice_print_health_status_string(pf, health_info); + health_info++; } +} - event.buf_len = cq->rq_buf_size; - event.msg_buf = devm_kzalloc(&pf->pdev->dev, event.buf_len, - GFP_KERNEL); - if (!event.msg_buf) - return 0; - do { - enum ice_status ret; - u16 opcode; +enum ice_aq_task_state { + ICE_AQ_TASK_WAITING = 0, + ICE_AQ_TASK_COMPLETE, + ICE_AQ_TASK_CANCELED, +}; - ret = ice_clean_rq_elem(hw, cq, &event, &pending); - if (ret == ICE_ERR_AQ_NO_WORK) - break; - if (ret) { - dev_err(&pf->pdev->dev, - "%s Receive Queue event error %d\n", qtype, - ret); - break; +struct ice_aq_task { + struct hlist_node entry; + + u16 opcode; + struct ice_rq_event_info *event; + enum ice_aq_task_state state; +}; + +/** + * 
ice_aq_wait_for_event - Wait for an AdminQ event from firmware + * @pf: pointer to the PF private structure + * @opcode: the opcode to wait for + * @timeout: how long to wait, in jiffies + * @event: storage for the event info + * + * Waits for a specific AdminQ completion event on the ARQ for a given PF. The + * current thread will be put to sleep until the specified event occurs or + * until the given timeout is reached. + * + * To obtain only the descriptor contents, pass an event without an allocated + * msg_buf. If the complete data buffer is desired, allocate the + * event->msg_buf with enough space ahead of time. + * + * Returns: zero on success, or a negative error code on failure. + */ +int ice_aq_wait_for_event(struct ice_pf *pf, u16 opcode, unsigned long timeout, + struct ice_rq_event_info *event) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_aq_task *task; + unsigned long start; + long ret; + int err; + + task = kzalloc(sizeof(*task), GFP_KERNEL); + if (!task) + return -ENOMEM; + + INIT_HLIST_NODE(&task->entry); + task->opcode = opcode; + task->event = event; + task->state = ICE_AQ_TASK_WAITING; + + spin_lock_bh(&pf->aq_wait_lock); + hlist_add_head(&task->entry, &pf->aq_wait_list); + spin_unlock_bh(&pf->aq_wait_lock); + + start = jiffies; + +#ifdef __CHECKER__ + /* Suppress sparse warning from kernel macro: + * warning: symbol '__ret' shadows an earlier one + */ + ret = timeout; +#else + ret = wait_event_interruptible_timeout(pf->aq_wait_queue, task->state, + timeout); +#endif + switch (task->state) { + case ICE_AQ_TASK_WAITING: + err = ret < 0 ? ret : -ETIMEDOUT; + break; + case ICE_AQ_TASK_CANCELED: + err = ret < 0 ? ret : -ECANCELED; + break; + case ICE_AQ_TASK_COMPLETE: + err = ret < 0 ? ret : 0; + break; + default: + WARN(1, "Unexpected AdminQ wait task state %u", task->state); + err = -EINVAL; + break; + } + + dev_dbg(dev, "Waited %u msecs (max %u msecs) for firmware response to op 0x%04x\n", + jiffies_to_msecs(jiffies - start), + jiffies_to_msecs(timeout), + opcode); + + spin_lock_bh(&pf->aq_wait_lock); + hlist_del(&task->entry); + spin_unlock_bh(&pf->aq_wait_lock); + kfree(task); + + return err; +} + +/** + * ice_aq_check_events - Check if any thread is waiting for an AdminQ event + * @pf: pointer to the PF private structure + * @opcode: the opcode of the event + * @event: the event to check + * + * Loops over the current list of pending threads waiting for an AdminQ event. + * For each matching task, copy the contents of the event into the task + * structure and wake up the thread. + * + * If multiple threads wait for the same opcode, they will all be woken up. + * + * Note that event->msg_buf will only be duplicated if the event has a buffer + * with enough space already allocated. Otherwise, only the descriptor and + * message length will be copied. 
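+ * The waiting thread owns event->msg_buf; this function only copies into
+ * it and never allocates or frees it.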
+ * + * Returns: true if an event was found, false otherwise + */ +static void ice_aq_check_events(struct ice_pf *pf, u16 opcode, + struct ice_rq_event_info *event) +{ + struct ice_aq_task *task; + bool found = false; + + spin_lock_bh(&pf->aq_wait_lock); + hlist_for_each_entry(task, &pf->aq_wait_list, entry) { + if (task->state || task->opcode != opcode) + continue; + + memcpy(&task->event->desc, &event->desc, sizeof(event->desc)); + task->event->msg_len = event->msg_len; + + /* Only copy the data buffer if a destination was set */ + if (task->event->msg_buf && + task->event->buf_len > event->buf_len) { + memcpy(task->event->msg_buf, event->msg_buf, + event->buf_len); + task->event->buf_len = event->buf_len; + } + + task->state = ICE_AQ_TASK_COMPLETE; + found = true; + } + spin_unlock_bh(&pf->aq_wait_lock); + + if (found) + wake_up(&pf->aq_wait_queue); +} + +/** + * ice_aq_cancel_waiting_tasks - Immediately cancel all waiting tasks + * @pf: the PF private structure + * + * Set all waiting tasks to ICE_AQ_TASK_CANCELED, and wake up their threads. + * This will then cause ice_aq_wait_for_event to exit with -ECANCELED. + */ +static void ice_aq_cancel_waiting_tasks(struct ice_pf *pf) +{ + struct ice_aq_task *task; + + spin_lock_bh(&pf->aq_wait_lock); + hlist_for_each_entry(task, &pf->aq_wait_list, entry) + task->state = ICE_AQ_TASK_CANCELED; + spin_unlock_bh(&pf->aq_wait_lock); + + wake_up(&pf->aq_wait_queue); +} + +/** + * __ice_clean_ctrlq - helper function to clean controlq rings + * @pf: ptr to struct ice_pf + * @q_type: specific Control queue type + */ +static int __ice_clean_ctrlq(struct ice_pf *pf, enum ice_ctl_q q_type) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_rq_event_info event; + struct ice_hw *hw = &pf->hw; + struct ice_ctl_q_info *cq; + u16 pending, i = 0; + const char *qtype; + u32 oldval, val; + + /* Do not clean control queue if/when PF reset fails */ + if (test_bit(ICE_RESET_FAILED, pf->state)) + return 0; + + switch (q_type) { + case ICE_CTL_Q_ADMIN: + cq = &hw->adminq; + qtype = "Admin"; + break; + case ICE_CTL_Q_SB: + cq = &hw->sbq; + qtype = "Sideband"; + break; + case ICE_CTL_Q_MAILBOX: + cq = &hw->mailboxq; + qtype = "Mailbox"; + /* we are going to try to detect a malicious VF, so set the + * state to begin detection + */ + hw->mbx_snapshot.mbx_buf.state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT; + break; + default: + dev_warn(dev, "Unknown control queue type 0x%x\n", q_type); + return 0; + } + + /* check for error indications - PF_xx_AxQLEN register layout for + * FW/MBX/SB are identical so just use defines for PF_FW_AxQLEN. 
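+ * Any VF, overflow, or critical error bits found set are logged at
+ * debug level and then cleared by writing the masked value back to the
+ * register.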
+ */ + val = rd32(hw, cq->rq.len); + if (val & (PF_FW_ARQLEN_ARQVFE_M | PF_FW_ARQLEN_ARQOVFL_M | + PF_FW_ARQLEN_ARQCRIT_M)) { + oldval = val; + if (val & PF_FW_ARQLEN_ARQVFE_M) + dev_dbg(dev, "%s Receive Queue VF Error detected\n", + qtype); + if (val & PF_FW_ARQLEN_ARQOVFL_M) { + dev_dbg(dev, "%s Receive Queue Overflow Error detected\n", + qtype); + } + if (val & PF_FW_ARQLEN_ARQCRIT_M) + dev_dbg(dev, "%s Receive Queue Critical Error detected\n", + qtype); + val &= ~(PF_FW_ARQLEN_ARQVFE_M | PF_FW_ARQLEN_ARQOVFL_M | + PF_FW_ARQLEN_ARQCRIT_M); + if (oldval != val) + wr32(hw, cq->rq.len, val); + } + + val = rd32(hw, cq->sq.len); + if (val & (PF_FW_ATQLEN_ATQVFE_M | PF_FW_ATQLEN_ATQOVFL_M | + PF_FW_ATQLEN_ATQCRIT_M)) { + oldval = val; + if (val & PF_FW_ATQLEN_ATQVFE_M) + dev_dbg(dev, "%s Send Queue VF Error detected\n", + qtype); + if (val & PF_FW_ATQLEN_ATQOVFL_M) { + dev_dbg(dev, "%s Send Queue Overflow Error detected\n", + qtype); + } + if (val & PF_FW_ATQLEN_ATQCRIT_M) + dev_dbg(dev, "%s Send Queue Critical Error detected\n", + qtype); + val &= ~(PF_FW_ATQLEN_ATQVFE_M | PF_FW_ATQLEN_ATQOVFL_M | + PF_FW_ATQLEN_ATQCRIT_M); + if (oldval != val) + wr32(hw, cq->sq.len, val); + } + + event.buf_len = cq->rq_buf_size; + event.msg_buf = kzalloc(event.buf_len, GFP_KERNEL); + if (!event.msg_buf) + return 0; + + do { + enum ice_status ret; + u16 opcode; + + ret = ice_clean_rq_elem(hw, cq, &event, &pending); + if (ret == ICE_ERR_AQ_NO_WORK) + break; + if (ret) { + dev_err(dev, "%s Receive Queue event error %s\n", qtype, + ice_stat_str(ret)); + break; } opcode = le16_to_cpu(event.desc.opcode); + /* Notify any thread that might be waiting for this event */ + ice_aq_check_events(pf, opcode, &event); + switch (opcode) { case ice_aqc_opc_get_link_status: if (ice_handle_link_event(pf, &event)) - dev_err(&pf->pdev->dev, - "Could not handle link event\n"); + dev_err(dev, "Could not handle link event\n"); + break; + case ice_aqc_opc_event_lan_overflow: + ice_vf_lan_overflow_event(pf, &event); break; case ice_mbx_opc_send_msg_to_pf: - ice_vc_process_vf_msg(pf, &event); + if (!ice_is_malicious_vf(pf, &event, i, pending)) + ice_vc_process_vf_msg(pf, &event); break; - case ice_aqc_opc_fw_logging: - ice_output_fw_log(hw, &event.desc, event.msg_buf); + case ice_aqc_opc_fw_logs_event: + ice_fwlog_event_dump(hw, &event.desc, event.msg_buf); break; case ice_aqc_opc_lldp_set_mib_change: ice_dcb_process_lldp_set_mib_change(pf, &event); break; + case ice_aqc_opc_get_health_status: + ice_process_health_status_event(pf, &event); + break; default: - dev_dbg(&pf->pdev->dev, - "%s Receive Queue unknown event 0x%04x ignored\n", + dev_dbg(dev, "%s Receive Queue unknown event 0x%04x ignored\n", qtype, opcode); break; } } while (pending && (i++ < ICE_DFLT_IRQ_WORK)); - devm_kfree(&pf->pdev->dev, event.msg_buf); + kfree(event.msg_buf); return pending && (i == ICE_DFLT_IRQ_WORK); } @@ -1106,13 +2236,13 @@ static void ice_clean_adminq_subtask(struct ice_pf *pf) { struct ice_hw *hw = &pf->hw; - if (!test_bit(__ICE_ADMINQ_EVENT_PENDING, pf->state)) + if (!test_bit(ICE_ADMINQ_EVENT_PENDING, pf->state)) return; if (__ice_clean_ctrlq(pf, ICE_CTL_Q_ADMIN)) return; - clear_bit(__ICE_ADMINQ_EVENT_PENDING, pf->state); + clear_bit(ICE_ADMINQ_EVENT_PENDING, pf->state); /* There might be a situation where new messages arrive to a control * queue between processing the last message and clearing the @@ -1133,13 +2263,13 @@ static void ice_clean_mailboxq_subtask(struct ice_pf *pf) { struct ice_hw *hw = &pf->hw; - if 
(!test_bit(__ICE_MAILBOXQ_EVENT_PENDING, pf->state)) + if (!test_bit(ICE_MAILBOXQ_EVENT_PENDING, pf->state)) return; if (__ice_clean_ctrlq(pf, ICE_CTL_Q_MAILBOX)) return; - clear_bit(__ICE_MAILBOXQ_EVENT_PENDING, pf->state); + clear_bit(ICE_MAILBOXQ_EVENT_PENDING, pf->state); if (ice_ctrlq_pending(hw, &hw->mailboxq)) __ice_clean_ctrlq(pf, ICE_CTL_Q_MAILBOX); @@ -1147,17 +2277,48 @@ static void ice_clean_mailboxq_subtask(struct ice_pf *pf) ice_flush(hw); } +/** + * ice_clean_sbq_subtask - clean the Sideband Queue rings + * @pf: board private structure + */ +static void ice_clean_sbq_subtask(struct ice_pf *pf) +{ + struct ice_hw *hw = &pf->hw; + + /* if mac_type is not generic, sideband is not supported + * and there's nothing to do here + */ + if (!ice_is_generic_mac(hw)) { + clear_bit(ICE_SIDEBANDQ_EVENT_PENDING, pf->state); + return; + } + + if (!test_bit(ICE_SIDEBANDQ_EVENT_PENDING, pf->state)) + return; + + if (__ice_clean_ctrlq(pf, ICE_CTL_Q_SB)) + return; + + clear_bit(ICE_SIDEBANDQ_EVENT_PENDING, pf->state); + + if (ice_ctrlq_pending(hw, &hw->sbq)) + __ice_clean_ctrlq(pf, ICE_CTL_Q_SB); + + ice_flush(hw); +} + /** * ice_service_task_schedule - schedule the service task to wake up * @pf: board private structure * * If not already scheduled, this puts the task into the work queue. */ -static void ice_service_task_schedule(struct ice_pf *pf) +void ice_service_task_schedule(struct ice_pf *pf) { - if (!test_bit(__ICE_SERVICE_DIS, pf->state) && - !test_and_set_bit(__ICE_SERVICE_SCHED, pf->state) && - !test_bit(__ICE_NEEDS_RESTART, pf->state)) + if (!test_bit(ICE_SERVICE_DIS, pf->state) && + !test_and_set_bit(ICE_SERVICE_SCHED, pf->state) && + !test_bit(ICE_RECOVERY_MODE, pf->state) && + !test_bit(ICE_NEEDS_RESTART, pf->state)) queue_work(ice_wq, &pf->serv_task); } @@ -1167,27 +2328,33 @@ static void ice_service_task_schedule(struct ice_pf *pf) */ static void ice_service_task_complete(struct ice_pf *pf) { - WARN_ON(!test_bit(__ICE_SERVICE_SCHED, pf->state)); + WARN_ON(!test_bit(ICE_SERVICE_SCHED, pf->state)); /* force memory (pf->state) to sync before next service task */ smp_mb__before_atomic(); - clear_bit(__ICE_SERVICE_SCHED, pf->state); + clear_bit(ICE_SERVICE_SCHED, pf->state); } /** * ice_service_task_stop - stop service task and cancel works * @pf: board private structure + * + * Return 0 if the ICE_SERVICE_DIS bit was not already set, + * 1 otherwise. */ -static void ice_service_task_stop(struct ice_pf *pf) +static int ice_service_task_stop(struct ice_pf *pf) { - set_bit(__ICE_SERVICE_DIS, pf->state); + int ret; + + ret = test_and_set_bit(ICE_SERVICE_DIS, pf->state); if (pf->serv_tmr.function) del_timer_sync(&pf->serv_tmr); if (pf->serv_task.func) cancel_work_sync(&pf->serv_task); - clear_bit(__ICE_SERVICE_SCHED, pf->state); + clear_bit(ICE_SERVICE_SCHED, pf->state); + return ret; } /** @@ -1198,7 +2365,7 @@ static void ice_service_task_stop(struct ice_pf *pf) */ static void ice_service_task_restart(struct ice_pf *pf) { - clear_bit(__ICE_SERVICE_DIS, pf->state); + clear_bit(ICE_SERVICE_DIS, pf->state); ice_service_task_schedule(pf); } @@ -1218,19 +2385,28 @@ static void ice_service_timer(struct timer_list *t) * ice_handle_mdd_event - handle malicious driver detect event * @pf: pointer to the PF structure * - * Called from service task. OICR interrupt handler indicates MDD event + * Called from service task. OICR interrupt handler indicates MDD event. + * VF MDD logging is guarded by net_ratelimit. Additional PF and VF log + * messages are wrapped by netif_msg_[rx|tx]_err. 
Since VF Rx MDD events + * disable the queue, the PF can be configured to reset the VF using ethtool + * private flag mdd-auto-reset-vf. */ static void ice_handle_mdd_event(struct ice_pf *pf) { + struct device *dev = ice_pf_to_dev(pf); struct ice_hw *hw = &pf->hw; - bool mdd_detected = false; + unsigned int i; u32 reg; - int i; - if (!test_and_clear_bit(__ICE_MDD_EVENT_PENDING, pf->state)) + if (!test_and_clear_bit(ICE_MDD_EVENT_PENDING, pf->state)) { + /* Since the VF MDD event logging is rate limited, check if + * there are pending MDD events. + */ + ice_print_vfs_mdd_events(pf); return; + } - /* find what triggered the MDD event */ + /* find what triggered an MDD event */ reg = rd32(hw, GL_MDET_TX_PQM); if (reg & GL_MDET_TX_PQM_VALID_M) { u8 pf_num = (reg & GL_MDET_TX_PQM_PF_NUM_M) >> @@ -1243,10 +2419,9 @@ static void ice_handle_mdd_event(struct ice_pf *pf) GL_MDET_TX_PQM_QNUM_S); if (netif_msg_tx_err(pf)) - dev_info(&pf->pdev->dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n", + dev_info(dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n", event, queue, pf_num, vf_num); wr32(hw, GL_MDET_TX_PQM, 0xffffffff); - mdd_detected = true; } reg = rd32(hw, GL_MDET_TX_TCLAN); @@ -1260,11 +2435,10 @@ static void ice_handle_mdd_event(struct ice_pf *pf) u16 queue = ((reg & GL_MDET_TX_TCLAN_QNUM_M) >> GL_MDET_TX_TCLAN_QNUM_S); - if (netif_msg_rx_err(pf)) - dev_info(&pf->pdev->dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n", + if (netif_msg_tx_err(pf)) + dev_info(dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n", event, queue, pf_num, vf_num); wr32(hw, GL_MDET_TX_TCLAN, 0xffffffff); - mdd_detected = true; } reg = rd32(hw, GL_MDET_RX); @@ -1279,89 +2453,93 @@ static void ice_handle_mdd_event(struct ice_pf *pf) GL_MDET_RX_QNUM_S); if (netif_msg_rx_err(pf)) - dev_info(&pf->pdev->dev, "Malicious Driver Detection event %d on RX queue %d PF# %d VF# %d\n", + dev_info(dev, "Malicious Driver Detection event %d on RX queue %d PF# %d VF# %d\n", event, queue, pf_num, vf_num); wr32(hw, GL_MDET_RX, 0xffffffff); - mdd_detected = true; } - if (mdd_detected) { - bool pf_mdd_detected = false; - - reg = rd32(hw, PF_MDET_TX_PQM); - if (reg & PF_MDET_TX_PQM_VALID_M) { - wr32(hw, PF_MDET_TX_PQM, 0xFFFF); - dev_info(&pf->pdev->dev, "TX driver issue detected, PF reset issued\n"); - pf_mdd_detected = true; - } + /* check to see if this PF caused an MDD event */ + reg = rd32(hw, PF_MDET_TX_PQM); + if (reg & PF_MDET_TX_PQM_VALID_M) { + wr32(hw, PF_MDET_TX_PQM, 0xFFFF); + if (netif_msg_tx_err(pf)) + dev_info(dev, "Malicious Driver Detection event TX_PQM detected on PF\n"); + } - reg = rd32(hw, PF_MDET_TX_TCLAN); - if (reg & PF_MDET_TX_TCLAN_VALID_M) { - wr32(hw, PF_MDET_TX_TCLAN, 0xFFFF); - dev_info(&pf->pdev->dev, "TX driver issue detected, PF reset issued\n"); - pf_mdd_detected = true; - } + reg = rd32(hw, PF_MDET_TX_TCLAN); + if (reg & PF_MDET_TX_TCLAN_VALID_M) { + wr32(hw, PF_MDET_TX_TCLAN, 0xFFFF); + if (netif_msg_tx_err(pf)) + dev_info(dev, "Malicious Driver Detection event TX_TCLAN detected on PF\n"); + } - reg = rd32(hw, PF_MDET_RX); - if (reg & PF_MDET_RX_VALID_M) { - wr32(hw, PF_MDET_RX, 0xFFFF); - dev_info(&pf->pdev->dev, "RX driver issue detected, PF reset issued\n"); - pf_mdd_detected = true; - } - /* Queue belongs to the PF initiate a reset */ - if (pf_mdd_detected) { - set_bit(__ICE_NEEDS_RESTART, pf->state); - ice_service_task_schedule(pf); - } + reg = rd32(hw, PF_MDET_RX); + if (reg & PF_MDET_RX_VALID_M) { + wr32(hw, 
PF_MDET_RX, 0xFFFF); + if (netif_msg_rx_err(pf)) + dev_info(dev, "Malicious Driver Detection event RX detected on PF\n"); } - /* check to see if one of the VFs caused the MDD */ - for (i = 0; i < pf->num_alloc_vfs; i++) { + /* Check to see if one of the VFs caused an MDD event, and then + * increment counters and set print pending + */ + ice_for_each_vf(pf, i) { struct ice_vf *vf = &pf->vf[i]; - bool vf_mdd_detected = false; - reg = rd32(hw, VP_MDET_TX_PQM(i)); if (reg & VP_MDET_TX_PQM_VALID_M) { wr32(hw, VP_MDET_TX_PQM(i), 0xFFFF); - vf_mdd_detected = true; - dev_info(&pf->pdev->dev, "TX driver issue detected on VF %d\n", - i); + vf->mdd_tx_events.count++; + set_bit(ICE_MDD_VF_PRINT_PENDING, pf->state); + if (netif_msg_tx_err(pf)) + dev_info(dev, "Malicious Driver Detection event TX_PQM detected on VF %d\n", + i); } reg = rd32(hw, VP_MDET_TX_TCLAN(i)); if (reg & VP_MDET_TX_TCLAN_VALID_M) { wr32(hw, VP_MDET_TX_TCLAN(i), 0xFFFF); - vf_mdd_detected = true; - dev_info(&pf->pdev->dev, "TX driver issue detected on VF %d\n", - i); + vf->mdd_tx_events.count++; + set_bit(ICE_MDD_VF_PRINT_PENDING, pf->state); + if (netif_msg_tx_err(pf)) + dev_info(dev, "Malicious Driver Detection event TX_TCLAN detected on VF %d\n", + i); } reg = rd32(hw, VP_MDET_TX_TDPU(i)); if (reg & VP_MDET_TX_TDPU_VALID_M) { wr32(hw, VP_MDET_TX_TDPU(i), 0xFFFF); - vf_mdd_detected = true; - dev_info(&pf->pdev->dev, "TX driver issue detected on VF %d\n", - i); + vf->mdd_tx_events.count++; + set_bit(ICE_MDD_VF_PRINT_PENDING, pf->state); + if (netif_msg_tx_err(pf)) + dev_info(dev, "Malicious Driver Detection event TX_TDPU detected on VF %d\n", + i); } reg = rd32(hw, VP_MDET_RX(i)); if (reg & VP_MDET_RX_VALID_M) { wr32(hw, VP_MDET_RX(i), 0xFFFF); - vf_mdd_detected = true; - dev_info(&pf->pdev->dev, "RX driver issue detected on VF %d\n", - i); - } - - if (vf_mdd_detected) { - vf->num_mdd_events++; - if (vf->num_mdd_events && - vf->num_mdd_events <= ICE_MDD_EVENTS_THRESHOLD) - dev_info(&pf->pdev->dev, - "VF %d has had %llu MDD events since last boot, Admin might need to reload AVF driver with this number of events\n", - i, vf->num_mdd_events); + vf->mdd_rx_events.count++; + set_bit(ICE_MDD_VF_PRINT_PENDING, pf->state); + if (netif_msg_rx_err(pf)) + dev_info(dev, "Malicious Driver Detection event RX detected on VF %d\n", + i); + + /* Since the queue is disabled on VF Rx MDD events, the + * PF can be configured to reset the VF through ethtool + * private flag mdd-auto-reset-vf. + */ + if (test_bit(ICE_FLAG_MDD_AUTO_RESET_VF, pf->flags)) { + /* VF MDD event counters will be cleared by + * reset, so print the event prior to reset. 
+ */ + ice_print_vf_rx_mdd_event(vf); + ice_reset_vf(&pf->vf[i], false); + } } } + + ice_print_vfs_mdd_events(pf); } /** @@ -1389,19 +2567,18 @@ static int ice_force_phys_link_state(struct ice_vsi *vsi, bool link_up) if (vsi->type != ICE_VSI_PF) return 0; - dev = &vsi->back->pdev->dev; + dev = ice_pf_to_dev(vsi->back); pi = vsi->port_info; - pcaps = devm_kzalloc(dev, sizeof(*pcaps), GFP_KERNEL); + pcaps = kzalloc(sizeof(*pcaps), GFP_KERNEL); if (!pcaps) return -ENOMEM; - retcode = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_SW_CFG, pcaps, + retcode = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_ACTIVE_CFG, pcaps, NULL); if (retcode) { - dev_err(dev, - "Failed to get phy capabilities, VSI %d error %d\n", + dev_err(dev, "Failed to get phy capabilities, VSI %d error %d\n", vsi->vsi_num, retcode); retcode = -EIO; goto out; @@ -1412,125 +2589,576 @@ static int ice_force_phys_link_state(struct ice_vsi *vsi, bool link_up) link_up == !!(pi->phy.link_info.link_info & ICE_AQ_LINK_UP)) goto out; - cfg = devm_kzalloc(dev, sizeof(*cfg), GFP_KERNEL); + /* Use the current user PHY configuration. The current user PHY + * configuration is initialized during probe from PHY capabilities + * software mode, and updated on set PHY configuration. + */ + cfg = kmemdup(&pi->phy.curr_user_phy_cfg, sizeof(*cfg), GFP_KERNEL); if (!cfg) { retcode = -ENOMEM; goto out; } - cfg->phy_type_low = pcaps->phy_type_low; - cfg->phy_type_high = pcaps->phy_type_high; - cfg->caps = pcaps->caps | ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; - cfg->low_power_ctrl = pcaps->low_power_ctrl; - cfg->eee_cap = pcaps->eee_cap; - cfg->eeer_value = pcaps->eeer_value; - cfg->link_fec_opt = pcaps->link_fec_options; + cfg->caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; if (link_up) cfg->caps |= ICE_AQ_PHY_ENA_LINK; else cfg->caps &= ~ICE_AQ_PHY_ENA_LINK; - retcode = ice_aq_set_phy_cfg(&vsi->back->hw, pi->lport, cfg, NULL); + retcode = ice_aq_set_phy_cfg(&vsi->back->hw, pi, cfg, NULL); if (retcode) { dev_err(dev, "Failed to set phy config, VSI %d error %d\n", vsi->vsi_num, retcode); retcode = -EIO; } - devm_kfree(dev, cfg); + kfree(cfg); out: - devm_kfree(dev, pcaps); + kfree(pcaps); return retcode; } /** - * ice_check_media_subtask - Check for media; bring link up if detected. 
- * @pf: pointer to PF struct + * ice_init_nvm_phy_type - Initialize the NVM PHY type + * @pi: port info structure + * + * Initialize nvm_phy_type_[low|high] for link lenient mode support */ -static void ice_check_media_subtask(struct ice_pf *pf) +static int ice_init_nvm_phy_type(struct ice_port_info *pi) { - struct ice_port_info *pi; - struct ice_vsi *vsi; - int err; + struct ice_aqc_get_phy_caps_data *pcaps; + struct ice_pf *pf = pi->hw->back; + enum ice_status status; + int err = 0; - vsi = ice_get_main_vsi(pf); - if (!vsi) - return; + pcaps = kzalloc(sizeof(*pcaps), GFP_KERNEL); + if (!pcaps) + return -ENOMEM; - /* No need to check for media if it's already present or the interface - * is down - */ - if (!test_bit(ICE_FLAG_NO_MEDIA, pf->flags) || - test_bit(__ICE_DOWN, vsi->state)) - return; + status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP_NO_MEDIA, pcaps, + NULL); - /* Refresh link info and check if media is present */ - pi = vsi->port_info; - err = ice_update_link_info(pi); - if (err) - return; + if (status) { + dev_err(ice_pf_to_dev(pf), "Get PHY capability failed.\n"); + err = -EIO; + goto out; + } - if (pi->phy.link_info.link_info & ICE_AQ_MEDIA_AVAILABLE) { - err = ice_force_phys_link_state(vsi, true); - if (err) - return; - clear_bit(ICE_FLAG_NO_MEDIA, pf->flags); + pf->nvm_phy_type_hi = pcaps->phy_type_high; + pf->nvm_phy_type_lo = pcaps->phy_type_low; - /* A Link Status Event will be generated; the event handler - * will complete bringing the interface up - */ - } +out: + kfree(pcaps); + return err; } /** - * ice_service_task - manage and run subtasks - * @work: pointer to work_struct contained by the PF struct + * ice_init_link_dflt_override - Initialize link default override + * @pi: port info structure + * + * Initialize link default override and PHY total port shutdown during probe */ -static void ice_service_task(struct work_struct *work) +static void ice_init_link_dflt_override(struct ice_port_info *pi) { - struct ice_pf *pf = container_of(work, struct ice_pf, serv_task); - unsigned long start_time = jiffies; - - /* subtasks */ - - /* process reset requests first */ - ice_reset_subtask(pf); + struct ice_link_default_override_tlv *ldo; + struct ice_pf *pf = pi->hw->back; - /* bail if a reset/recovery cycle is pending or rebuild failed */ - if (ice_is_reset_in_progress(pf->state) || - test_bit(__ICE_SUSPENDED, pf->state) || - test_bit(__ICE_NEEDS_RESTART, pf->state)) { - ice_service_task_complete(pf); + ldo = &pf->link_dflt_override; + if (ice_get_link_default_override(ldo, pi)) return; - } - ice_clean_adminq_subtask(pf); - ice_check_media_subtask(pf); - ice_check_for_hang_subtask(pf); - ice_sync_fltr_subtask(pf); - ice_handle_mdd_event(pf); - ice_watchdog_subtask(pf); - - if (ice_is_safe_mode(pf)) { - ice_service_task_complete(pf); + if (!(ldo->options & ICE_LINK_OVERRIDE_PORT_DIS)) return; - } - - ice_process_vflr_event(pf); - ice_clean_mailboxq_subtask(pf); - /* Clear __ICE_SERVICE_SCHED flag to allow scheduling next event */ - ice_service_task_complete(pf); + /* Enable Total Port Shutdown (override/replace link-down-on-close + * ethtool private flag) for ports with Port Disable bit set. 
+ */ + set_bit(ICE_FLAG_TOTAL_PORT_SHUTDOWN_ENA, pf->flags); + set_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, pf->flags); +} + +/** + * ice_init_phy_cfg_dflt_override - Initialize PHY cfg default override settings + * @pi: port info structure + * + * If default override is enabled, initialize the user PHY cfg speed and FEC + * settings using the default override mask from the NVM. + * + * The PHY should only be configured with the default override settings the + * first time media is available. The ICE_LINK_DEFAULT_OVERRIDE_PENDING state + * is used to indicate that the user PHY cfg default override is initialized + * and the PHY has not been configured with the default override settings. The + * state is set here, and cleared in ice_configure_phy the first time the PHY is + * configured. + * + * This function should be called only if the FW doesn't support default + * configuration mode, as reported by ice_fw_supports_report_dflt_cfg. + */ +static void ice_init_phy_cfg_dflt_override(struct ice_port_info *pi) +{ + struct ice_link_default_override_tlv *ldo; + struct ice_aqc_set_phy_cfg_data *cfg; + struct ice_phy_info *phy = &pi->phy; + struct ice_pf *pf = pi->hw->back; + + ldo = &pf->link_dflt_override; + + /* If link default override is enabled, use to mask NVM PHY capabilities + * for speed and FEC default configuration. + */ + cfg = &phy->curr_user_phy_cfg; + + if (ldo->phy_type_low || ldo->phy_type_high) { + cfg->phy_type_low = pf->nvm_phy_type_lo & + cpu_to_le64(ldo->phy_type_low); + cfg->phy_type_high = pf->nvm_phy_type_hi & + cpu_to_le64(ldo->phy_type_high); + } + cfg->link_fec_opt = ldo->fec_options; + phy->curr_user_fec_req = ICE_FEC_AUTO; + + set_bit(ICE_LINK_DEFAULT_OVERRIDE_PENDING, pf->state); +} + +/** + * ice_init_phy_user_cfg - Initialize the PHY user configuration + * @pi: port info structure + * + * Initialize the current user PHY configuration, speed, FEC, and FC requested + * mode to default. The PHY defaults are from get PHY capabilities topology + * with media so call when media is first available. An error is returned if + * called when media is not available. The PHY initialization completed state is + * set here. + * + * These configurations are used when setting PHY + * configuration. The user PHY configuration is updated on set PHY + * configuration. Returns 0 on success, negative on failure + */ +static int ice_init_phy_user_cfg(struct ice_port_info *pi) +{ + struct ice_aqc_get_phy_caps_data *pcaps; + struct ice_phy_info *phy = &pi->phy; + struct ice_pf *pf = pi->hw->back; + enum ice_status status; + int err = 0; + + if (!(phy->link_info.link_info & ICE_AQ_MEDIA_AVAILABLE)) + return -EIO; + + pcaps = kzalloc(sizeof(*pcaps), GFP_KERNEL); + if (!pcaps) + return -ENOMEM; + + if (ice_fw_supports_report_dflt_cfg(pi->hw)) + status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_DFLT_CFG, + pcaps, NULL); + else + status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP_MEDIA, + pcaps, NULL); + if (status) { + dev_err(ice_pf_to_dev(pf), "Get PHY capability failed.\n"); + err = -EIO; + goto err_out; + } + + ice_copy_phy_caps_to_cfg(pi, pcaps, &pi->phy.curr_user_phy_cfg); + + /* check if lenient mode is supported and enabled */ + if (ice_fw_supports_link_override(pi->hw) && + !(pcaps->module_compliance_enforcement & + ICE_AQC_MOD_ENFORCE_STRICT_MODE)) { + set_bit(ICE_FLAG_LINK_LENIENT_MODE_ENA, pf->flags); + + /* if the FW supports default PHY configuration mode, then the driver + * does not have to apply link override settings. 
If not, + * initialize user PHY configuration with link override values + */ + if (!ice_fw_supports_report_dflt_cfg(pi->hw) && + (pf->link_dflt_override.options & ICE_LINK_OVERRIDE_EN)) { + ice_init_phy_cfg_dflt_override(pi); + goto out; + } + } + + /* if link default override is not enabled, set user flow control and + * FEC settings based on what get_phy_caps returned + */ + phy->curr_user_fec_req = ice_caps_to_fec_mode(pcaps->caps, + pcaps->link_fec_options); + phy->curr_user_fc_req = ice_caps_to_fc_mode(pcaps->caps); + +out: + phy->curr_user_speed_req = ICE_AQ_LINK_SPEED_M; + set_bit(ICE_PHY_INIT_COMPLETE, pf->state); +err_out: + kfree(pcaps); + return err; +} + +/** + * ice_configure_phy - configure PHY + * @vsi: VSI of PHY + * + * Set the PHY configuration. If the current PHY configuration is the same as + * the curr_user_phy_cfg, then do nothing to avoid link flap. Otherwise + * configure the based get PHY capabilities for topology with media. + */ +static int ice_configure_phy(struct ice_vsi *vsi) +{ + struct device *dev = ice_pf_to_dev(vsi->back); + struct ice_port_info *pi = vsi->port_info; + struct ice_aqc_get_phy_caps_data *pcaps; + struct ice_aqc_set_phy_cfg_data *cfg; + struct ice_phy_info *phy = &pi->phy; + struct ice_pf *pf = vsi->back; + enum ice_status status; + int err = 0; + + /* Ensure we have media as we cannot configure a medialess port */ + if (!(phy->link_info.link_info & ICE_AQ_MEDIA_AVAILABLE)) + return -EPERM; + + ice_print_topo_conflict(vsi); + + if (!test_bit(ICE_FLAG_LINK_LENIENT_MODE_ENA, pf->flags) && + phy->link_info.topo_media_conflict == ICE_AQ_LINK_TOPO_UNSUPP_MEDIA) + return -EPERM; + + if (test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, pf->flags)) + return ice_force_phys_link_state(vsi, true); + + pcaps = kzalloc(sizeof(*pcaps), GFP_KERNEL); + if (!pcaps) + return -ENOMEM; + + /* Get current PHY config */ + status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_ACTIVE_CFG, pcaps, + NULL); + if (status) { + dev_err(dev, "Failed to get PHY configuration, VSI %d error %s\n", + vsi->vsi_num, ice_stat_str(status)); + err = -EIO; + goto done; + } + + /* If PHY enable link is configured and configuration has not changed, + * there's nothing to do + */ + if (pcaps->caps & ICE_AQC_PHY_EN_LINK && + ice_phy_caps_equals_cfg(pcaps, &phy->curr_user_phy_cfg)) + goto done; + + /* Use PHY topology as baseline for configuration */ + memset(pcaps, 0, sizeof(*pcaps)); + if (ice_fw_supports_report_dflt_cfg(pi->hw)) + status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_DFLT_CFG, + pcaps, NULL); + else + status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP_MEDIA, + pcaps, NULL); + if (status) { + dev_err(dev, "Failed to get PHY caps, VSI %d error %s\n", + vsi->vsi_num, ice_stat_str(status)); + err = -EIO; + goto done; + } + + cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); + if (!cfg) { + err = -ENOMEM; + goto done; + } + + ice_copy_phy_caps_to_cfg(pi, pcaps, cfg); + + /* Speed - If default override pending, use curr_user_phy_cfg set in + * ice_init_phy_user_cfg_ldo. 
+ */ + if (test_and_clear_bit(ICE_LINK_DEFAULT_OVERRIDE_PENDING, pf->state)) { + cfg->phy_type_low = phy->curr_user_phy_cfg.phy_type_low; + cfg->phy_type_high = phy->curr_user_phy_cfg.phy_type_high; + } else { + u64 phy_low = 0, phy_high = 0; + + ice_update_phy_type(&phy_low, &phy_high, + phy->curr_user_speed_req); + cfg->phy_type_low = pcaps->phy_type_low & cpu_to_le64(phy_low); + cfg->phy_type_high = pcaps->phy_type_high & + cpu_to_le64(phy_high); + } + + /* Can't provide what was requested; use PHY capabilities */ + if (!cfg->phy_type_low && !cfg->phy_type_high) { + cfg->phy_type_low = pcaps->phy_type_low; + cfg->phy_type_high = pcaps->phy_type_high; + } + + /* FEC */ + ice_cfg_phy_fec(pi, cfg, phy->curr_user_fec_req); + + /* Can't provide what was requested; use PHY capabilities */ + if (cfg->link_fec_opt != + (cfg->link_fec_opt & pcaps->link_fec_options)) { + cfg->caps |= pcaps->caps & ICE_AQC_PHY_EN_AUTO_FEC; + cfg->link_fec_opt = pcaps->link_fec_options; + } + + /* Flow Control - always supported; no need to check against + * capabilities + */ + ice_cfg_phy_fc(pi, cfg, phy->curr_user_fc_req); + + /* Enable link and link update */ + cfg->caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT | ICE_AQ_PHY_ENA_LINK; + + status = ice_aq_set_phy_cfg(&pf->hw, pi, cfg, NULL); + if (status) { + dev_err(dev, "Failed to set phy config, VSI %d error %s\n", + vsi->vsi_num, ice_stat_str(status)); + err = -EIO; + } + + kfree(cfg); +done: + kfree(pcaps); + return err; +} + +/** + * ice_check_media_subtask - Check for media + * @pf: pointer to PF struct + * + * If media is available, then initialize PHY user configuration if it is not + * been, and configure the PHY if the interface is up. + */ +static void ice_check_media_subtask(struct ice_pf *pf) +{ + struct ice_port_info *pi; + struct ice_vsi *vsi; + int err; + + /* No need to check for media if it's already present */ + if (!test_bit(ICE_FLAG_NO_MEDIA, pf->flags)) + return; + + vsi = ice_get_main_vsi(pf); + if (!vsi) + return; + + /* Refresh link info and check if media is present */ + pi = vsi->port_info; + err = ice_update_link_info(pi); + if (err) + return; + + ice_check_module_power(pf, pi->phy.link_info.link_cfg_err); + + if (pi->phy.link_info.link_info & ICE_AQ_MEDIA_AVAILABLE) { + if (!test_bit(ICE_PHY_INIT_COMPLETE, pf->state)) + ice_init_phy_user_cfg(pi); + + /* PHY settings are reset on media insertion, reconfigure + * PHY to preserve settings. 
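For reference, ice_configure_phy() above applies the same selection rule to both the speed bits and the FEC options: intersect what was requested (the user configuration or the default-override mask) with what the PHY topology reports, and if nothing survives the intersection, fall back to the full capability set so the link can still be brought up. A minimal standalone restatement of that rule; the helper name is illustrative, not part of the driver:

#include <stdint.h>

/*
 * Return the requested bits that the hardware also supports; if the request
 * cannot be met at all, fall back to everything supported. This mirrors the
 * "Can't provide what was requested; use PHY capabilities" branches in
 * ice_configure_phy() for both the phy_type masks and link_fec_opt.
 */
static uint64_t clamp_to_caps(uint64_t requested, uint64_t supported)
{
	uint64_t granted = requested & supported;

	return granted ? granted : supported;
}

For example, a user speed request that the inserted module cannot satisfy resolves to the module's own advertised types instead of leaving the PHY with an empty type mask.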
+ */ + if (test_bit(ICE_VSI_DOWN, vsi->state) && + test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, vsi->back->flags)) + return; + + err = ice_configure_phy(vsi); + if (!err) + clear_bit(ICE_FLAG_NO_MEDIA, pf->flags); + + /* A Link Status Event will be generated; the event handler + * will complete bringing the interface up + */ + } +} + + +/** + * ice_find_tnl - return the matching tunnel entry if it exists + * @pf: pointer to PF struct + * @tnl_type: tunnel type + * @port: tunnel port + */ +static struct ice_tnl_entry * +ice_find_tnl(struct ice_pf *pf, enum ice_tunnel_type tnl_type, u16 port) +{ + struct ice_tnl_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, &pf->tnl_list, node) + if (entry->port == port && entry->type == tnl_type) + return entry; + return NULL; +} + +/** + * ice_handle_tmp_tnl_list - duplicate the tunnel entry and add it to tmp_list + * @pf: pointer to PF struct + * @entry: tunnel entry to duplicate + * @tmp_list: list to add the entry to + */ +static void +ice_handle_tmp_tnl_list(struct ice_pf *pf, struct ice_tnl_entry *entry, + struct list_head *tmp_list) +{ + struct ice_tnl_entry *tnl; + + tnl = devm_kzalloc(ice_pf_to_dev(pf), sizeof(*tnl), GFP_ATOMIC); + if (!tnl) + return; + + tnl->type = entry->type; + tnl->port = entry->port; + INIT_LIST_HEAD(&tnl->node); + list_add_tail(&tnl->node, tmp_list); +} + +/** + * ice_handle_tunnel - update tunnel entries in hardware + * @pf: pointer to PF struct + * + * Check the list of tunnel entries and add or remove any that that have + * changed. + */ +static void ice_handle_tunnel(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_tnl_entry *entry, *tmp; + LIST_HEAD(tmp_del_list); + LIST_HEAD(tmp_add_list); + enum ice_status status; + + if (list_empty(&pf->tnl_list)) + return; + + spin_lock(&pf->tnl_lock); + list_for_each_entry_safe(entry, tmp, &pf->tnl_list, node) { + if (entry->state & ICE_TNL_STATE_TO_DEL && + entry->state & ICE_TNL_STATE_ACTIVE) + list_move(&entry->node, &tmp_del_list); + else if (entry->state & ICE_TNL_STATE_TO_ADD) + ice_handle_tmp_tnl_list(pf, entry, &tmp_add_list); + } + spin_unlock(&pf->tnl_lock); + + /* Process tmp_del_list and call ice_destroy_tunnel for each entry */ + list_for_each_entry_safe(entry, tmp, &tmp_del_list, node) { + status = ice_destroy_tunnel(&pf->hw, entry->port, false); + if (status) + dev_err(dev, "error deleting port %d from UDP tunnels list\n", + entry->port); + + list_del(&entry->node); + devm_kfree(dev, entry); + } + + /* Process tmp_add_list and create_tunnel, if failed, delete element + * from original tunnel list otherwise set state to ACTIVE. + * purge all entries from tmp_add_list. + */ + list_for_each_entry_safe(entry, tmp, &tmp_add_list, node) { + struct ice_tnl_entry *tnl; + + status = ice_create_tunnel(&pf->hw, entry->type, entry->port); + if (status == ICE_ERR_OUT_OF_RANGE) + dev_dbg(dev, "Max tunneled UDP ports reached, port %d not added\n", + entry->port); + else if (status) + dev_err(dev, "Error adding UDP tunnel - %s for tnl port %u\n", + ice_stat_str(status), entry->port); + + /* delete entry from original tunnel list if failed to add, + * otherwise set state to ACTIVE + */ + spin_lock(&pf->tnl_lock); + tnl = ice_find_tnl(pf, entry->type, entry->port); + if (tnl) { + if (status) { + list_del(&tnl->node); + devm_kfree(dev, tnl); + } else { + /* The tunnel was created successfully, mark + * state of tunnel as ACTIVE, indicating it + * was offloaded in HW. 
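ice_handle_tunnel() above deliberately splits the work into phases: the tunnel list is only walked and copied under pf->tnl_lock, the firmware calls (which may sleep) are issued with the lock dropped, and the lock is retaken just to publish the outcome. A standalone sketch of that pattern, using a pthread mutex and a fixed array as stand-ins for the spinlock and list_head machinery; all names here are illustrative:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

enum tnl_state_sketch { TNL_TO_ADD, TNL_TO_DEL, TNL_ACTIVE };

struct tnl_sketch {
	unsigned short port;
	enum tnl_state_sketch state;
};

#define MAX_TNL 8

static struct tnl_sketch tunnels[MAX_TNL];
static size_t num_tunnels;
static pthread_mutex_t tnl_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for the create/destroy firmware calls; may sleep, so it must be
 * invoked without tnl_lock held.
 */
static int program_tunnel_in_hw(unsigned short port, bool create)
{
	(void)port;
	(void)create;
	return 0;
}

static struct tnl_sketch *find_tunnel(unsigned short port)
{
	size_t i;

	for (i = 0; i < num_tunnels; i++)
		if (tunnels[i].port == port)
			return &tunnels[i];
	return NULL;
}

static void handle_tunnels(void)
{
	struct { unsigned short port; bool create; } pending[MAX_TNL];
	size_t npending = 0, i;

	/* Phase 1: snapshot the pending work under the lock */
	pthread_mutex_lock(&tnl_lock);
	for (i = 0; i < num_tunnels; i++) {
		if (tunnels[i].state == TNL_ACTIVE)
			continue;
		pending[npending].port = tunnels[i].port;
		pending[npending].create = (tunnels[i].state == TNL_TO_ADD);
		npending++;
	}
	pthread_mutex_unlock(&tnl_lock);

	/* Phase 2: touch the hardware with the lock dropped */
	for (i = 0; i < npending; i++) {
		int err = program_tunnel_in_hw(pending[i].port,
					       pending[i].create);
		struct tnl_sketch *tnl;

		/* Phase 3: retake the lock only to publish the result */
		pthread_mutex_lock(&tnl_lock);
		tnl = find_tunnel(pending[i].port);
		if (tnl && !err && pending[i].create)
			tnl->state = TNL_ACTIVE;
		pthread_mutex_unlock(&tnl_lock);
	}
}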
+ */ + tnl->state = ICE_TNL_STATE_ACTIVE; + } + } + spin_unlock(&pf->tnl_lock); + + list_del(&entry->node); + devm_kfree(dev, entry); + } +} + +/** + * ice_service_task - manage and run subtasks + * @work: pointer to work_struct contained by the PF struct + */ +static void ice_service_task(struct work_struct *work) +{ + struct ice_pf *pf = container_of(work, struct ice_pf, serv_task); + unsigned long start_time = jiffies; + + /* subtasks */ + + /* process reset requests first */ + ice_reset_subtask(pf); + + /* bail if a reset/recovery cycle is pending or rebuild failed */ + if (ice_is_reset_in_progress(pf->state) || + test_bit(ICE_SUSPENDED, pf->state) || + test_bit(ICE_NEEDS_RESTART, pf->state)) { + ice_service_task_complete(pf); + return; + } + + /* If we are in FW recovery mode, need to exit service tasks here */ + if (test_bit(ICE_RECOVERY_MODE, pf->state)) + return; + + ice_clean_adminq_subtask(pf); + ice_check_media_subtask(pf); + ice_check_for_hang_subtask(pf); + ice_sync_fltr_subtask(pf); + ice_handle_mdd_event(pf); + ice_watchdog_subtask(pf); + + if (ice_is_safe_mode(pf)) { + ice_service_task_complete(pf); + return; + } + + /* Invoke remaining initialization of peer_objs */ + ice_for_each_peer(pf, NULL, ice_finish_init_peer_obj); + + ice_chnl_subtask_handle_interrupt(pf); + ice_channel_sync_global_cntrs(pf); + ice_process_vflr_event(pf); + ice_sync_udp_fltr_subtask(pf); + ice_clean_mailboxq_subtask(pf); + ice_clean_sbq_subtask(pf); + ice_clean_ptp_subtask(pf); + ice_sync_arfs_fltrs(pf); + ice_flush_fdir_ctx(pf); + + ice_handle_tunnel(pf); + + /* Clear ICE_SERVICE_SCHED flag to allow scheduling next event */ + ice_service_task_complete(pf); /* If the tasks have taken longer than one service timer period * or there is more work to be done, reset the service timer to * schedule the service task now. 
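ice_schedule_reset(), added a little earlier in this hunk, only records the request in pf->state and kicks the service task; the reset itself runs later from ice_reset_subtask(). A hypothetical caller inside the driver might use it as shown below; the surrounding function is invented purely for illustration:

/* Hypothetical example: ask for a PF reset after detecting a fault. The
 * call can fail if a reset is already in progress or an earlier reset
 * failed, so the error is only logged here.
 */
static void example_request_pfr(struct ice_pf *pf)
{
	int err = ice_schedule_reset(pf, ICE_RESET_PFR);

	if (err)
		dev_dbg(ice_pf_to_dev(pf), "PFR not scheduled: %d\n", err);
}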
*/ if (time_after(jiffies, (start_time + pf->serv_tmr_period)) || - test_bit(__ICE_MDD_EVENT_PENDING, pf->state) || - test_bit(__ICE_VFLR_EVENT_PENDING, pf->state) || - test_bit(__ICE_MAILBOXQ_EVENT_PENDING, pf->state) || - test_bit(__ICE_ADMINQ_EVENT_PENDING, pf->state)) + test_bit(ICE_MDD_EVENT_PENDING, pf->state) || + test_bit(ICE_VFLR_EVENT_PENDING, pf->state) || + test_bit(ICE_MAILBOXQ_EVENT_PENDING, pf->state) || + test_bit(ICE_SIDEBANDQ_EVENT_PENDING, pf->state) || + test_bit(ICE_FD_VF_FLUSH_CTX, pf->state) || + test_bit(ICE_ADMINQ_EVENT_PENDING, pf->state)) mod_timer(&pf->serv_tmr, jiffies); } @@ -1544,10 +3172,52 @@ static void ice_set_ctrlq_len(struct ice_hw *hw) hw->adminq.num_sq_entries = ICE_AQ_LEN; hw->adminq.rq_buf_size = ICE_AQ_MAX_BUF_LEN; hw->adminq.sq_buf_size = ICE_AQ_MAX_BUF_LEN; - hw->mailboxq.num_rq_entries = ICE_MBXRQ_LEN; + hw->mailboxq.num_rq_entries = PF_MBX_ARQLEN_ARQLEN_M; hw->mailboxq.num_sq_entries = ICE_MBXSQ_LEN; hw->mailboxq.rq_buf_size = ICE_MBXQ_MAX_BUF_LEN; hw->mailboxq.sq_buf_size = ICE_MBXQ_MAX_BUF_LEN; + hw->sbq.num_rq_entries = ICE_SBQ_LEN; + hw->sbq.num_sq_entries = ICE_SBQ_LEN; + hw->sbq.rq_buf_size = ICE_SBQ_MAX_BUF_LEN; + hw->sbq.sq_buf_size = ICE_SBQ_MAX_BUF_LEN; +} + +/** + * ice_schedule_reset - schedule a reset + * @pf: board private structure + * @reset: reset being requested + */ +int ice_schedule_reset(struct ice_pf *pf, enum ice_reset_req reset) +{ + struct device *dev = ice_pf_to_dev(pf); + + /* bail out if earlier reset has failed */ + if (test_bit(ICE_RESET_FAILED, pf->state)) { + dev_dbg(dev, "earlier reset has failed\n"); + return -EIO; + } + /* bail if reset/recovery already in progress */ + if (ice_is_reset_in_progress(pf->state)) { + dev_dbg(dev, "Reset already in progress\n"); + return -EBUSY; + } + + switch (reset) { + case ICE_RESET_PFR: + set_bit(ICE_PFR_REQ, pf->state); + break; + case ICE_RESET_CORER: + set_bit(ICE_CORER_REQ, pf->state); + break; + case ICE_RESET_GLOBR: + set_bit(ICE_GLOBR_REQ, pf->state); + break; + default: + return -EINVAL; + } + + ice_service_task_schedule(pf); + return 0; } /** @@ -1604,11 +3274,13 @@ static int ice_vsi_req_irq_msix(struct ice_vsi *vsi, char *basename) int q_vectors = vsi->num_q_vectors; struct ice_pf *pf = vsi->back; int base = vsi->base_vector; + struct device *dev; int rx_int_idx = 0; int tx_int_idx = 0; int vector, err; int irq_num; + dev = ice_pf_to_dev(pf); for (vector = 0; vector < q_vectors; vector++) { struct ice_q_vector *q_vector = vsi->q_vectors[vector]; @@ -1628,19 +3300,28 @@ static int ice_vsi_req_irq_msix(struct ice_vsi *vsi, char *basename) /* skip this unused q_vector */ continue; } - err = devm_request_irq(&pf->pdev->dev, irq_num, - vsi->irq_handler, 0, - q_vector->name, q_vector); + if (vsi->type == ICE_VSI_CTRL && vsi->vf_id != ICE_INVAL_VFID) + err = devm_request_irq(dev, irq_num, vsi->irq_handler, + IRQF_SHARED, q_vector->name, + q_vector); + else + err = devm_request_irq(dev, irq_num, vsi->irq_handler, + 0, q_vector->name, q_vector); if (err) { - netdev_err(vsi->netdev, - "MSIX request_irq failed, error: %d\n", err); + netdev_err(vsi->netdev, "MSIX request_irq failed, error: %d\n", + err); goto free_q_irqs; } /* register for affinity change notifications */ - q_vector->affinity_notify.notify = ice_irq_affinity_notify; - q_vector->affinity_notify.release = ice_irq_affinity_release; - irq_set_affinity_notifier(irq_num, &q_vector->affinity_notify); + if (!IS_ENABLED(CONFIG_RFS_ACCEL)) { + struct irq_affinity_notify *affinity_notify; + + affinity_notify = 
&q_vector->affinity_notify; + affinity_notify->notify = ice_irq_affinity_notify; + affinity_notify->release = ice_irq_affinity_release; + irq_set_affinity_notifier(irq_num, affinity_notify); + } /* assign the mask for this irq */ irq_set_affinity_hint(irq_num, &q_vector->affinity_mask); @@ -1652,23 +3333,415 @@ static int ice_vsi_req_irq_msix(struct ice_vsi *vsi, char *basename) free_q_irqs: while (vector) { vector--; - irq_num = pf->msix_entries[base + vector].vector, - irq_set_affinity_notifier(irq_num, NULL); + irq_num = pf->msix_entries[base + vector].vector; + if (!IS_ENABLED(CONFIG_RFS_ACCEL)) + irq_set_affinity_notifier(irq_num, NULL); irq_set_affinity_hint(irq_num, NULL); - devm_free_irq(&pf->pdev->dev, irq_num, &vsi->q_vectors[vector]); + devm_free_irq(dev, irq_num, &vsi->q_vectors[vector]); } return err; } +#ifdef HAVE_XDP_SUPPORT /** - * ice_ena_misc_vector - enable the non-queue interrupts - * @pf: board private structure + * ice_xdp_alloc_setup_rings - Allocate and setup Tx rings for XDP + * @vsi: VSI to setup Tx rings used by XDP + * + * Return 0 on success and negative value on error */ -static void ice_ena_misc_vector(struct ice_pf *pf) +static int ice_xdp_alloc_setup_rings(struct ice_vsi *vsi) { - struct ice_hw *hw = &pf->hw; + struct device *dev = ice_pf_to_dev(vsi->back); + int i; + + for (i = 0; i < vsi->num_xdp_txq; i++) { + u16 xdp_q_idx = vsi->alloc_txq + i; + struct ice_ring *xdp_ring; + + xdp_ring = kzalloc(sizeof(*xdp_ring), GFP_KERNEL); + + if (!xdp_ring) + goto free_xdp_rings; + + xdp_ring->q_index = xdp_q_idx; + xdp_ring->reg_idx = vsi->txq_map[xdp_q_idx]; + xdp_ring->vsi = vsi; + xdp_ring->netdev = NULL; + xdp_ring->dev = dev; + xdp_ring->count = vsi->num_tx_desc; + WRITE_ONCE(vsi->xdp_rings[i], xdp_ring); + if (ice_setup_tx_ring(xdp_ring)) + goto free_xdp_rings; + ice_set_ring_xdp(xdp_ring); +#ifdef HAVE_AF_XDP_ZC_SUPPORT + xdp_ring->xsk_pool = ice_xsk_umem(xdp_ring); +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + } + + return 0; + +free_xdp_rings: + for (; i >= 0; i--) + if (vsi->xdp_rings[i] && vsi->xdp_rings[i]->desc) + ice_free_tx_ring(vsi->xdp_rings[i]); + return -ENOMEM; +} + +/** + * ice_vsi_assign_bpf_prog - set or clear bpf prog pointer on VSI + * @vsi: VSI to set the bpf prog on + * @prog: the bpf prog pointer + */ +static void ice_vsi_assign_bpf_prog(struct ice_vsi *vsi, struct bpf_prog *prog) +{ + struct bpf_prog *old_prog; + int i; + + old_prog = xchg(&vsi->xdp_prog, prog); + if (old_prog) + bpf_prog_put(old_prog); + + ice_for_each_rxq(vsi, i) + WRITE_ONCE(vsi->rx_rings[i]->xdp_prog, vsi->xdp_prog); +} + +#ifdef HAVE_XDP_SUPPORT +#ifdef ICE_ADD_PROBES +/** + * ice_clear_xdp_stats - clear all Rx XDP statistics on VSI + * @vsi: VSI to clear Rx XDP statistics on + */ +static void ice_clear_xdp_stats(struct ice_vsi *vsi) +{ + int i; + struct ice_ring *ring; + + ice_for_each_alloc_rxq(vsi, i) { + ring = READ_ONCE(vsi->rx_rings[i]); + if (ring) + memset(&ring->xdp_stats, 0, sizeof(ring->xdp_stats)); + } +} +#endif /* ICE_ADD_PROBES */ +#endif /* HAVE_XDP_SUPPORT */ + +/** + * ice_prepare_xdp_rings - Allocate, configure and setup Tx rings for XDP + * @vsi: VSI to bring up Tx rings used by XDP + * @prog: bpf program that will be assigned to VSI + * + * Return 0 on success and negative value on error + */ +int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog) +{ + u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 }; + int xdp_rings_rem = vsi->num_xdp_txq; + struct ice_pf *pf = vsi->back; + struct ice_qs_cfg xdp_qs_cfg = { + .qs_mutex = &pf->avail_q_mutex, 
+ .pf_map = pf->avail_txqs, + .pf_map_size = pf->max_pf_txqs, + .q_count = vsi->num_xdp_txq, + .scatter_count = ICE_MAX_SCATTER_TXQS, + .vsi_map = vsi->txq_map, + .vsi_map_offset = vsi->alloc_txq, + .mapping_mode = ICE_VSI_MAP_CONTIG + }; + enum ice_status status; + struct device *dev; + int i, v_idx; + + dev = ice_pf_to_dev(pf); + vsi->xdp_rings = devm_kcalloc(dev, vsi->num_xdp_txq, + sizeof(*vsi->xdp_rings), GFP_KERNEL); + if (!vsi->xdp_rings) + return -ENOMEM; + + if (__ice_vsi_get_qs(&xdp_qs_cfg)) + goto err_map_xdp; + + vsi->xdp_mapping_mode = xdp_qs_cfg.mapping_mode; + if (ice_xdp_alloc_setup_rings(vsi)) + goto clear_xdp_rings; + + /* follow the logic from ice_vsi_map_rings_to_vectors */ + ice_for_each_q_vector(vsi, v_idx) { + struct ice_q_vector *q_vector = vsi->q_vectors[v_idx]; + int xdp_rings_per_v, q_id, q_base; + + xdp_rings_per_v = DIV_ROUND_UP(xdp_rings_rem, + vsi->num_q_vectors - v_idx); + q_base = vsi->num_xdp_txq - xdp_rings_rem; + + for (q_id = q_base; q_id < (q_base + xdp_rings_per_v); q_id++) { + struct ice_ring *xdp_ring = vsi->xdp_rings[q_id]; + + xdp_ring->q_vector = q_vector; + xdp_ring->next = q_vector->tx.ring; + q_vector->tx.ring = xdp_ring; + } + xdp_rings_rem -= xdp_rings_per_v; + } + + /* omit the scheduler update if in reset path; XDP queues will be + * taken into account at the end of ice_vsi_rebuild, where + * ice_cfg_vsi_lan is being called + */ + if (ice_is_reset_in_progress(pf->state)) + return 0; + + /* tell the Tx scheduler that right now we have + * additional queues + */ + for (i = 0; i < vsi->tc_cfg.numtc; i++) + max_txqs[i] = vsi->num_txq + vsi->num_xdp_txq; + + status = ice_cfg_vsi_lan(vsi->port_info, vsi->idx, vsi->tc_cfg.ena_tc, + max_txqs); + if (status) { + dev_err(dev, "Failed VSI LAN queue config for XDP, error: %s\n", + ice_stat_str(status)); + goto clear_xdp_rings; + } + ice_vsi_assign_bpf_prog(vsi, prog); + + return 0; +clear_xdp_rings: + for (i = 0; i < vsi->num_xdp_txq; i++) + if (vsi->xdp_rings[i]) { + kfree_rcu(vsi->xdp_rings[i], rcu); + vsi->xdp_rings[i] = NULL; + } + +err_map_xdp: + mutex_lock(&pf->avail_q_mutex); + for (i = 0; i < vsi->num_xdp_txq; i++) { + clear_bit(vsi->txq_map[i + vsi->alloc_txq], pf->avail_txqs); + vsi->txq_map[i + vsi->alloc_txq] = ICE_INVAL_Q_INDEX; + } + mutex_unlock(&pf->avail_q_mutex); + + devm_kfree(dev, vsi->xdp_rings); + return -ENOMEM; +} + +/** + * ice_destroy_xdp_rings - undo the configuration made by ice_prepare_xdp_rings + * @vsi: VSI to remove XDP rings + * + * Detach XDP rings from irq vectors, clean up the PF bitmap and free + * resources + */ +int ice_destroy_xdp_rings(struct ice_vsi *vsi) +{ + u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 }; + struct ice_pf *pf = vsi->back; + int i, v_idx; + + /* q_vectors are freed in reset path so there's no point in detaching + * rings; in case of rebuild being triggered not from reset reset bits + * in pf->state won't be set, so additionally check first q_vector + * against NULL + */ + if (ice_is_reset_in_progress(pf->state) || !vsi->q_vectors[0]) + goto free_qmap; + + ice_for_each_q_vector(vsi, v_idx) { + struct ice_q_vector *q_vector = vsi->q_vectors[v_idx]; + struct ice_ring *ring; + + ice_for_each_ring(ring, q_vector->tx) + if (!ring->tx_buf || !ice_ring_is_xdp(ring)) + break; + + /* restore the value of last node prior to XDP setup */ + q_vector->tx.ring = ring; + } + +free_qmap: + mutex_lock(&pf->avail_q_mutex); + for (i = 0; i < vsi->num_xdp_txq; i++) { + clear_bit(vsi->txq_map[i + vsi->alloc_txq], pf->avail_txqs); + vsi->txq_map[i + vsi->alloc_txq] = 
ICE_INVAL_Q_INDEX; + } + mutex_unlock(&pf->avail_q_mutex); + + for (i = 0; i < vsi->num_xdp_txq; i++) + if (vsi->xdp_rings[i]) { + if (vsi->xdp_rings[i]->desc) + ice_free_tx_ring(vsi->xdp_rings[i]); + kfree_rcu(vsi->xdp_rings[i], rcu); + vsi->xdp_rings[i] = NULL; + } + + devm_kfree(ice_pf_to_dev(pf), vsi->xdp_rings); + vsi->xdp_rings = NULL; + + if (ice_is_reset_in_progress(pf->state) || !vsi->q_vectors[0]) + return 0; + + ice_vsi_assign_bpf_prog(vsi, NULL); + + /* notify Tx scheduler that we destroyed XDP queues and bring + * back the old number of child nodes + */ + for (i = 0; i < vsi->tc_cfg.numtc; i++) + max_txqs[i] = vsi->num_txq; + + /* change number of XDP Tx queues to 0 */ + vsi->num_xdp_txq = 0; + + return ice_cfg_vsi_lan(vsi->port_info, vsi->idx, vsi->tc_cfg.ena_tc, + max_txqs); +} + +#ifdef HAVE_AF_XDP_ZC_SUPPORT +/** + * ice_vsi_rx_napi_schedule - Schedule napi on RX queues from VSI + * @vsi: VSI to schedule napi on + */ +static void ice_vsi_rx_napi_schedule(struct ice_vsi *vsi) +{ + int i; + + ice_for_each_rxq(vsi, i) { + struct ice_ring *rx_ring = vsi->rx_rings[i]; + + if (rx_ring->xsk_pool) + napi_schedule(&rx_ring->q_vector->napi); + } +} +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + +/** + * ice_xdp_setup_prog - Add or remove XDP eBPF program + * @vsi: VSI to setup XDP for + * @prog: XDP program + * @extack: netlink extended ack + */ +static int +ice_xdp_setup_prog(struct ice_vsi *vsi, struct bpf_prog *prog, + struct netlink_ext_ack *extack) +{ + int frame_size = vsi->netdev->mtu + ICE_ETH_PKT_HDR_PAD; + bool if_running = netif_running(vsi->netdev); + int ret = 0, xdp_ring_err = 0; + + if (frame_size > vsi->rx_buf_len) { + NL_SET_ERR_MSG_MOD(extack, "MTU too large for loading XDP"); + return -EOPNOTSUPP; + } + + /* need to stop netdev while setting up the program for Rx rings */ + if (if_running && !test_and_set_bit(ICE_VSI_DOWN, vsi->state)) { + ret = ice_down(vsi); + if (ret) { + NL_SET_ERR_MSG_MOD(extack, "Preparing device for XDP attach failed"); + return ret; + } + } + + if (!ice_is_xdp_ena_vsi(vsi) && prog) { + vsi->num_xdp_txq = vsi->alloc_rxq; + xdp_ring_err = ice_prepare_xdp_rings(vsi, prog); + if (xdp_ring_err) + NL_SET_ERR_MSG_MOD(extack, "Setting up XDP Tx resources failed"); +#ifdef HAVE_XDP_SUPPORT +#ifdef ICE_ADD_PROBES + ice_clear_xdp_stats(vsi); +#endif /* ICE_ADD_PROBES */ +#endif /* HAVE_XDP_SUPPORT */ + } else if (ice_is_xdp_ena_vsi(vsi) && !prog) { + xdp_ring_err = ice_destroy_xdp_rings(vsi); + if (xdp_ring_err) + NL_SET_ERR_MSG_MOD(extack, "Freeing XDP Tx resources failed"); + } else { + ice_vsi_assign_bpf_prog(vsi, prog); + } + + if (if_running) + ret = ice_up(vsi); + +#ifdef HAVE_AF_XDP_ZC_SUPPORT +#ifndef HAVE_AF_XDP_NETDEV_UMEM + if (!ret && prog && vsi->xsk_umems) + ice_vsi_rx_napi_schedule(vsi); +#else + if (!ret && prog) + ice_vsi_rx_napi_schedule(vsi); +#endif /* !HAVE_AF_XDP_NETDEV_UMEM */ +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + + return (ret || xdp_ring_err) ? 
-ENOMEM : 0; +} + +/** + * ice_xdp - implements XDP handler + * @dev: netdevice + * @xdp: XDP command + */ +#ifdef HAVE_NDO_BPF +static int ice_xdp(struct net_device *dev, struct netdev_bpf *xdp) +#else +static int ice_xdp(struct net_device *dev, struct netdev_xdp *xdp) +#endif +{ + struct ice_netdev_priv *np = netdev_priv(dev); + struct ice_vsi *vsi = np->vsi; + + if (vsi->type != ICE_VSI_PF) { + NL_SET_ERR_MSG_MOD(xdp->extack, "XDP can be loaded only on PF VSI"); + return -EINVAL; + } + + switch (xdp->command) { + case XDP_SETUP_PROG: + return ice_xdp_setup_prog(vsi, xdp->prog, xdp->extack); +#ifdef HAVE_XDP_QUERY_PROG + case XDP_QUERY_PROG: +#ifndef NO_NETDEV_BPF_PROG_ATTACHED + xdp->prog_attached = ice_is_xdp_ena_vsi(vsi); +#endif /* !NO_NETDEV_BPF_PROG_ATTACHED */ + xdp->prog_id = vsi->xdp_prog ? vsi->xdp_prog->aux->id : 0; + return 0; +#endif /* HAVE_XDP_QUERY_PROG */ +#ifdef HAVE_AF_XDP_ZC_SUPPORT + case XDP_SETUP_XSK_POOL: +#ifdef HAVE_NETDEV_BPF_XSK_POOL + return ice_xsk_umem_setup(vsi, xdp->xsk.pool, +#else + return ice_xsk_umem_setup(vsi, xdp->xsk.umem, +#endif /* HAVE_NETDEV_BPF_XSK_POOL */ + xdp->xsk.queue_id); +#ifndef NO_XDP_QUERY_XSK_UMEM + case XDP_QUERY_XSK_UMEM: + return ice_xsk_umem_query(vsi, &xdp->xsk.umem, + xdp->xsk.queue_id); +#endif /* !NO_XDP_QUERY_XSK_UMEM */ +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + default: + return -EINVAL; + } +} +#endif /* HAVE_XDP_SUPPORT */ + +/** + * ice_ena_misc_vector - enable the non-queue interrupts + * @pf: board private structure + */ +static void ice_ena_misc_vector(struct ice_pf *pf) +{ + struct ice_hw *hw = &pf->hw; u32 val; + /* Disable anti-spoof detection interrupt to prevent spurious event + * interrupts during a function reset. Anti-spoof functionally is + * still supported. + */ + val = rd32(hw, GL_MDCK_TX_TDPU); + val |= GL_MDCK_TX_TDPU_RCU_ANTISPOOF_ITR_DIS_M; + wr32(hw, GL_MDCK_TX_TDPU, val); + /* clear things first */ wr32(hw, PFINT_OICR_ENA, 0); /* disable all */ rd32(hw, PFINT_OICR); /* read to clear */ @@ -1679,7 +3752,9 @@ static void ice_ena_misc_vector(struct ice_pf *pf) PFINT_OICR_PCI_EXCEPTION_M | PFINT_OICR_VFLR_M | PFINT_OICR_HMC_ERR_M | - PFINT_OICR_PE_CRITERR_M); + PFINT_OICR_PE_PUSH_M | + PFINT_OICR_PE_CRITERR_M | + PFINT_OICR_SWINT_M); wr32(hw, PFINT_OICR_ENA, val); @@ -1698,10 +3773,13 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data) struct ice_pf *pf = (struct ice_pf *)data; struct ice_hw *hw = &pf->hw; irqreturn_t ret = IRQ_NONE; + struct device *dev; u32 oicr, ena_mask; - set_bit(__ICE_ADMINQ_EVENT_PENDING, pf->state); - set_bit(__ICE_MAILBOXQ_EVENT_PENDING, pf->state); + dev = ice_pf_to_dev(pf); + set_bit(ICE_ADMINQ_EVENT_PENDING, pf->state); + set_bit(ICE_MAILBOXQ_EVENT_PENDING, pf->state); + set_bit(ICE_SIDEBANDQ_EVENT_PENDING, pf->state); oicr = rd32(hw, PFINT_OICR); ena_mask = rd32(hw, PFINT_OICR_ENA); @@ -1713,11 +3791,19 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data) if (oicr & PFINT_OICR_MAL_DETECT_M) { ena_mask &= ~PFINT_OICR_MAL_DETECT_M; - set_bit(__ICE_MDD_EVENT_PENDING, pf->state); + set_bit(ICE_MDD_EVENT_PENDING, pf->state); } if (oicr & PFINT_OICR_VFLR_M) { - ena_mask &= ~PFINT_OICR_VFLR_M; - set_bit(__ICE_VFLR_EVENT_PENDING, pf->state); + /* disable any further VFLR event notifications */ + if (test_bit(ICE_VF_RESETS_DISABLED, pf->state)) { + u32 reg = rd32(hw, PFINT_OICR_ENA); + + reg &= ~PFINT_OICR_VFLR_M; + wr32(hw, PFINT_OICR_ENA, reg); + } else { + ena_mask &= ~PFINT_OICR_VFLR_M; + set_bit(ICE_VFLR_EVENT_PENDING, pf->state); + } } if 
(oicr & PFINT_OICR_GRST_M) { @@ -1735,21 +3821,20 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data) else if (reset == ICE_RESET_EMPR) pf->empr_count++; else - dev_dbg(&pf->pdev->dev, "Invalid reset type %d\n", - reset); + dev_dbg(dev, "Invalid reset type %d\n", reset); /* If a reset cycle isn't already in progress, we set a bit in * pf->state so that the service task can start a reset/rebuild. * We also make note of which reset happened so that peer * devices/drivers can be informed. */ - if (!test_and_set_bit(__ICE_RESET_OICR_RECV, pf->state)) { + if (!test_and_set_bit(ICE_RESET_OICR_RECV, pf->state)) { if (reset == ICE_RESET_CORER) - set_bit(__ICE_CORER_RECV, pf->state); + set_bit(ICE_CORER_RECV, pf->state); else if (reset == ICE_RESET_GLOBR) - set_bit(__ICE_GLOBR_RECV, pf->state); + set_bit(ICE_GLOBR_RECV, pf->state); else - set_bit(__ICE_EMPR_RECV, pf->state); + set_bit(ICE_EMPR_RECV, pf->state); /* There are couple of different bits at play here. * hw->reset_ongoing indicates whether the hardware is @@ -1757,7 +3842,7 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data) * is received and set back to false after the driver * has determined that the hardware is out of reset. * - * __ICE_RESET_OICR_RECV in pf->state indicates + * ICE_RESET_OICR_RECV in pf->state indicates * that a post reset rebuild is required before the * driver is operational again. This is set above. * @@ -1768,35 +3853,56 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data) } } - if (oicr & PFINT_OICR_HMC_ERR_M) { + if (oicr & PFINT_OICR_TSYN_TX_M) { + ena_mask &= ~PFINT_OICR_TSYN_TX_M; + set_bit(ICE_PTP_TX_TS_READY, pf->state); + } + + if (oicr & PFINT_OICR_TSYN_EVNT_M) { + u8 tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + u32 gltsyn_stat = rd32(hw, GLTSYN_STAT(tmr_idx)); + + /* Save EVENTs from GTSYN register */ + pf->ptp.ext_ts_irq |= gltsyn_stat & (GLTSYN_STAT_EVENT0_M | + GLTSYN_STAT_EVENT1_M | + GLTSYN_STAT_EVENT2_M); + ena_mask &= ~PFINT_OICR_TSYN_EVNT_M; + set_bit(ICE_PTP_EXT_TS_READY, pf->state); + } + if (oicr & (PFINT_OICR_PE_CRITERR_M | PFINT_OICR_HMC_ERR_M | PFINT_OICR_PE_PUSH_M)) { + struct ice_event *event; + ena_mask &= ~PFINT_OICR_HMC_ERR_M; - dev_dbg(&pf->pdev->dev, - "HMC Error interrupt - info 0x%x, data 0x%x\n", - rd32(hw, PFHMC_ERRORINFO), - rd32(hw, PFHMC_ERRORDATA)); + ena_mask &= ~PFINT_OICR_PE_CRITERR_M; + ena_mask &= ~PFINT_OICR_PE_PUSH_M; + event = kzalloc(sizeof(*event), GFP_ATOMIC); + if (event) { + set_bit(ICE_EVENT_CRIT_ERR, event->type); + event->reporter = NULL; + /* report the entire OICR value to peer */ + event->info.reg = oicr; + ice_for_each_peer(pf, event, ice_peer_check_for_reg); + kfree(event); + } } /* Report any remaining unexpected interrupts */ oicr &= ena_mask; if (oicr) { - dev_dbg(&pf->pdev->dev, "unhandled interrupt oicr=0x%08x\n", - oicr); + dev_dbg(dev, "unhandled interrupt oicr=0x%08x\n", oicr); /* If a critical error is pending there is no choice but to * reset the device. 
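Going back to ice_prepare_xdp_rings() earlier in this hunk: the XDP Tx rings are spread over the VSI's interrupt vectors by recomputing DIV_ROUND_UP() on what is still unassigned, which keeps the split as even as possible when the counts do not divide cleanly. A standalone sketch of just that arithmetic; the macro and function names are illustrative:

#include <stdio.h>

#define DIV_ROUND_UP_SKETCH(n, d) (((n) + (d) - 1) / (d))

/* Spread num_rings XDP Tx rings across num_vectors interrupt vectors the
 * way ice_prepare_xdp_rings() does: recompute the per-vector share from
 * what is still unassigned, so the earlier vectors absorb the remainder.
 */
static void distribute_xdp_rings(int num_rings, int num_vectors)
{
	int rings_rem = num_rings;
	int v;

	for (v = 0; v < num_vectors; v++) {
		int per_v = DIV_ROUND_UP_SKETCH(rings_rem, num_vectors - v);
		int base = num_rings - rings_rem;

		printf("vector %d gets rings [%d..%d)\n",
		       v, base, base + per_v);
		rings_rem -= per_v;
	}
}

int main(void)
{
	distribute_xdp_rings(10, 4); /* prints shares of 3, 3, 2, 2 */
	return 0;
}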
*/ - if (oicr & (PFINT_OICR_PE_CRITERR_M | - PFINT_OICR_PCI_EXCEPTION_M | + if (oicr & (PFINT_OICR_PCI_EXCEPTION_M | PFINT_OICR_ECC_ERR_M)) { - set_bit(__ICE_PFR_REQ, pf->state); + set_bit(ICE_PFR_REQ, pf->state); ice_service_task_schedule(pf); } } ret = IRQ_HANDLED; - if (!test_bit(__ICE_DOWN, pf->state)) { - ice_service_task_schedule(pf); - ice_irq_dynamic_ena(hw, NULL, NULL); - } + ice_service_task_schedule(pf); + ice_irq_dynamic_ena(hw, NULL, NULL); return ret; } @@ -1815,6 +3921,9 @@ static void ice_dis_ctrlq_interrupts(struct ice_hw *hw) wr32(hw, PFINT_MBX_CTL, rd32(hw, PFINT_MBX_CTL) & ~PFINT_MBX_CTL_CAUSE_ENA_M); + wr32(hw, PFINT_SB_CTL, + rd32(hw, PFINT_SB_CTL) & ~PFINT_SB_CTL_CAUSE_ENA_M); + /* disable Control queue Interrupt causes */ wr32(hw, PFINT_OICR_CTL, rd32(hw, PFINT_OICR_CTL) & ~PFINT_OICR_CTL_CAUSE_ENA_M); @@ -1838,7 +3947,7 @@ static void ice_free_irq_msix_misc(struct ice_pf *pf) if (pf->msix_entries) { synchronize_irq(pf->msix_entries[pf->oicr_idx].vector); - devm_free_irq(&pf->pdev->dev, + devm_free_irq(ice_pf_to_dev(pf), pf->msix_entries[pf->oicr_idx].vector, pf); } @@ -1869,6 +3978,11 @@ static void ice_ena_ctrlq_interrupts(struct ice_hw *hw, u16 reg_idx) PFINT_MBX_CTL_CAUSE_ENA_M); wr32(hw, PFINT_MBX_CTL, val); + /* This enables Sideband queue Interrupt causes */ + val = ((reg_idx & PFINT_SB_CTL_MSIX_INDX_M) | + PFINT_SB_CTL_CAUSE_ENA_M); + wr32(hw, PFINT_SB_CTL, val); + ice_flush(hw); } @@ -1882,13 +3996,13 @@ static void ice_ena_ctrlq_interrupts(struct ice_hw *hw, u16 reg_idx) */ static int ice_req_irq_msix_misc(struct ice_pf *pf) { + struct device *dev = ice_pf_to_dev(pf); struct ice_hw *hw = &pf->hw; int oicr_idx, err = 0; if (!pf->int_name[0]) snprintf(pf->int_name, sizeof(pf->int_name) - 1, "%s-%s:misc", - dev_driver_string(&pf->pdev->dev), - dev_name(&pf->pdev->dev)); + dev_driver_string(dev), dev_name(dev)); /* Do not request IRQ but do enable OICR interrupt since settings are * lost during reset. 
Note that this function is called only during @@ -1903,14 +4017,12 @@ static int ice_req_irq_msix_misc(struct ice_pf *pf) return oicr_idx; pf->num_avail_sw_msix -= 1; - pf->oicr_idx = oicr_idx; + pf->oicr_idx = (u16)oicr_idx; - err = devm_request_irq(&pf->pdev->dev, - pf->msix_entries[pf->oicr_idx].vector, + err = devm_request_irq(dev, pf->msix_entries[pf->oicr_idx].vector, ice_misc_intr, 0, pf->int_name, pf); if (err) { - dev_err(&pf->pdev->dev, - "devm_request_irq for %s failed: %d\n", + dev_err(dev, "devm_request_irq for %s failed: %d\n", pf->int_name, err); ice_free_res(pf->irq_tracker, 1, ICE_RES_MISC_VEC_ID); pf->num_avail_sw_msix += 1; @@ -1963,7 +4075,6 @@ static void ice_set_ops(struct net_device *netdev) ice_set_ethtool_safe_mode_ops(netdev); return; } - netdev->netdev_ops = &ice_netdev_ops; ice_set_ethtool_ops(netdev); } @@ -1975,6 +4086,7 @@ static void ice_set_ops(struct net_device *netdev) static void ice_set_netdev_features(struct net_device *netdev) { struct ice_pf *pf = ice_netdev_to_pf(netdev); + bool is_dvm_ena = ice_is_dvm_ena(&pf->hw); netdev_features_t csumo_features; netdev_features_t vlano_features; netdev_features_t dflt_features; @@ -1989,6 +4101,7 @@ static void ice_set_netdev_features(struct net_device *netdev) dflt_features = NETIF_F_SG | NETIF_F_HIGHDMA | + NETIF_F_NTUPLE | NETIF_F_RXHASH; csumo_features = NETIF_F_RXCSUM | @@ -2000,12 +4113,51 @@ static void ice_set_netdev_features(struct net_device *netdev) NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX; - tso_features = NETIF_F_TSO; - + /* Enable CTAG/STAG filtering by default in Double VLAN Mode (DVM) */ + if (is_dvm_ena) + vlano_features |= NETIF_F_HW_VLAN_STAG_FILTER; + + tso_features = NETIF_F_TSO | + NETIF_F_TSO_ECN | + NETIF_F_TSO6 | + NETIF_F_GSO_GRE | + NETIF_F_GSO_UDP_TUNNEL | +#ifdef NETIF_F_GSO_GRE_CSUM + NETIF_F_GSO_GRE_CSUM | + NETIF_F_GSO_UDP_TUNNEL_CSUM | +#endif +#ifdef NETIF_F_GSO_PARTIAL + NETIF_F_GSO_PARTIAL | +#endif +#ifdef NETIF_F_GSO_IPXIP4 + NETIF_F_GSO_IPXIP4 | + NETIF_F_GSO_IPXIP6 | +#else +#ifdef NETIF_F_GSO_IPIP + NETIF_F_GSO_IPIP | + NETIF_F_GSO_SIT | +#endif +#endif /* NETIF_F_GSO_IPXIP4 */ +#ifdef NETIF_F_GSO_UDP_L4 + NETIF_F_GSO_UDP_L4 | +#endif /* NETIF_F_GSO_UDP_L4 */ + 0; + +#ifndef NETIF_F_GSO_PARTIAL + tso_features ^= NETIF_F_GSO_UDP_TUNNEL_CSUM; +#else + netdev->gso_partial_features |= NETIF_F_GSO_UDP_TUNNEL_CSUM | + NETIF_F_GSO_GRE_CSUM; +#endif /* set features that user can change */ netdev->hw_features = dflt_features | csumo_features | vlano_features | tso_features; +#ifdef HAVE_MPLS_FEATURES + /* add support for HW_CSUM on packets with MPLS header */ + netdev->mpls_features = NETIF_F_HW_CSUM; +#endif /* HAVE_MPLS_FEATURES */ + /* enable features */ netdev->features |= netdev->hw_features; /* encap and VLAN devices inherit default, csumo and tso features */ @@ -2013,6 +4165,26 @@ static void ice_set_netdev_features(struct net_device *netdev) tso_features; netdev->vlan_features |= dflt_features | csumo_features | tso_features; + +#ifdef NETIF_F_HW_TC + netdev->hw_features |= NETIF_F_HW_TC; +#endif /* NETIF_F_HW_TC */ + + /* advertise support but don't enable by default since only one type of + * VLAN offload can be enabled at a time (i.e. CTAG or STAG). When one + * type turns on the other has to be turned off. This is enforced by the + * ice_fix_features() ndo callback. 
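The comment above notes that CTAG and STAG offloads are advertised together but must not be enabled together, and that ice_fix_features() enforces this. That ndo is not part of this hunk, so the following is only a simplified stand-in for the rule it has to implement, using made-up feature bit values rather than the real NETIF_F_HW_VLAN_* flags:

#include <stdint.h>
#include <stdbool.h>

/* Illustrative feature bits standing in for NETIF_F_HW_VLAN_CTAG_RX/TX and
 * NETIF_F_HW_VLAN_STAG_RX/TX.
 */
#define F_CTAG_RX (1u << 0)
#define F_CTAG_TX (1u << 1)
#define F_STAG_RX (1u << 2)
#define F_STAG_TX (1u << 3)

/* Simplified sketch of the mutual-exclusion rule: when the user is turning
 * on one VLAN offload family, strip the other from the requested set.
 */
static uint32_t fix_vlan_features(uint32_t cur, uint32_t requested)
{
	bool enabling_stag = (requested & (F_STAG_RX | F_STAG_TX)) &&
			     !(cur & (F_STAG_RX | F_STAG_TX));
	bool enabling_ctag = (requested & (F_CTAG_RX | F_CTAG_TX)) &&
			     !(cur & (F_CTAG_RX | F_CTAG_TX));

	if (enabling_stag)
		requested &= ~(F_CTAG_RX | F_CTAG_TX);
	else if (enabling_ctag)
		requested &= ~(F_STAG_RX | F_STAG_TX);

	return requested;
}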
+ */ + if (is_dvm_ena) { + netdev->hw_features |= NETIF_F_HW_VLAN_STAG_RX | + NETIF_F_HW_VLAN_STAG_TX; + } + +#ifdef HAVE_NETDEV_SB_DEV + /* Enable macvlan offloads */ + if (test_bit(ICE_FLAG_VMDQ_ENA, pf->flags)) + netdev->hw_features |= NETIF_F_HW_L2FW_DOFFLOAD; +#endif /* HAVE_NETDEV_SB_DEV */ } /** @@ -2023,17 +4195,25 @@ static void ice_set_netdev_features(struct net_device *netdev) */ static int ice_cfg_netdev(struct ice_vsi *vsi) { - struct ice_pf *pf = vsi->back; struct ice_netdev_priv *np; struct net_device *netdev; u8 mac_addr[ETH_ALEN]; - int err; +#ifdef HAVE_NETDEV_SB_DEV + /* Inform Kernel beforehand about max number of MACVLAN queues + * supported. + */ + netdev = alloc_etherdev_mqs(sizeof(*np), + ICE_MAX_MACVLANS + vsi->alloc_txq, + ICE_MAX_MACVLANS + vsi->alloc_rxq); +#else /* !HAVE_NETDEV_SB_DEV */ netdev = alloc_etherdev_mqs(sizeof(*np), vsi->alloc_txq, vsi->alloc_rxq); +#endif /* !HAVE_NETDEV_SB_DEV */ if (!netdev) return -ENOMEM; + set_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state); vsi->netdev = netdev; np = netdev_priv(netdev); np->vsi = vsi; @@ -2043,7 +4223,7 @@ static int ice_cfg_netdev(struct ice_vsi *vsi) ice_set_ops(netdev); if (vsi->type == ICE_VSI_PF) { - SET_NETDEV_DEV(netdev, &pf->pdev->dev); + SET_NETDEV_DEV(netdev, ice_pf_to_dev(vsi->back)); ether_addr_copy(mac_addr, vsi->port_info->mac.perm_addr); ether_addr_copy(netdev->dev_addr, mac_addr); ether_addr_copy(netdev->perm_addr, mac_addr); @@ -2057,17 +4237,15 @@ static int ice_cfg_netdev(struct ice_vsi *vsi) /* setup watchdog timeout value to be 5 second */ netdev->watchdog_timeo = 5 * HZ; +#ifdef HAVE_NETDEVICE_MIN_MAX_MTU +#ifdef HAVE_RHEL7_EXTENDED_MIN_MAX_MTU + netdev->extended->min_mtu = ETH_MIN_MTU; + netdev->extended->max_mtu = ICE_MAX_MTU; +#else netdev->min_mtu = ETH_MIN_MTU; netdev->max_mtu = ICE_MAX_MTU; - - err = register_netdev(vsi->netdev); - if (err) - return err; - - netif_carrier_off(vsi->netdev); - - /* make sure transmit queues start off as stopped */ - netif_tx_stop_all_queues(vsi->netdev); +#endif /* HAVE_RHEL7_EXTENDED_MIN_MAX_MTU */ +#endif /* HAVE_NETDEVICE_MIN_MAX_MTU */ return 0; } @@ -2097,7 +4275,47 @@ void ice_fill_rss_lut(u8 *lut, u16 rss_table_size, u16 rss_size) static struct ice_vsi * ice_pf_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi) { - return ice_vsi_setup(pf, pi, ICE_VSI_PF, ICE_INVAL_VFID); + return ice_vsi_setup(pf, pi, ICE_VSI_PF, ICE_INVAL_VFID, NULL, 0); +} + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +static struct ice_vsi * +ice_chnl_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, + struct ice_channel *ch) +{ + return ice_vsi_setup(pf, pi, ICE_VSI_CHNL, ICE_INVAL_VFID, ch, 0); +} +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + +#ifdef HAVE_NETDEV_SB_DEV +/** + * ice_macvlan_vsi_setup - Set up a MACVLAN VSI + * @pf: board private structure + * @pi: pointer to the port_info instance + * + * Returns pointer to the successfully allocated VSI software struct + * on success, otherwise returns NULL on failure. + */ +static struct ice_vsi * +ice_macvlan_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi) +{ + return ice_vsi_setup(pf, pi, ICE_VSI_OFFLOAD_MACVLAN, ICE_INVAL_VFID, + NULL, 0); +} +#endif /* HAVE_NETDEV_SB_DEV */ + +/** + * ice_ctrl_vsi_setup - Set up a control VSI + * @pf: board private structure + * @pi: pointer to the port_info instance + * + * Returns pointer to the successfully allocated VSI software struct + * on success, otherwise returns NULL on failure. 
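The thin wrappers above (PF, channel, MACVLAN, control) all funnel into ice_vsi_setup() with a different VSI type and return NULL on failure. A hypothetical caller, written only to show the calling convention rather than the actual probe sequencing:

/* Hypothetical: create the default PF VSI and the control VSI that the flow
 * director code attaches to; the real probe/rebuild paths add ordering and
 * error unwinding that is omitted here.
 */
static int example_create_default_vsis(struct ice_pf *pf)
{
	struct ice_port_info *pi = pf->hw.port_info;
	struct ice_vsi *vsi;

	vsi = ice_pf_vsi_setup(pf, pi);
	if (!vsi)
		return -ENOMEM;

	vsi = ice_ctrl_vsi_setup(pf, pi);
	if (!vsi)
		return -ENOMEM;

	return 0;
}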
+ */ +static struct ice_vsi * +ice_ctrl_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi) +{ + return ice_vsi_setup(pf, pi, ICE_VSI_CTRL, ICE_INVAL_VFID, NULL, 0); } /** @@ -2111,50 +4329,38 @@ ice_pf_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi) struct ice_vsi * ice_lb_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi) { - return ice_vsi_setup(pf, pi, ICE_VSI_LB, ICE_INVAL_VFID); + return ice_vsi_setup(pf, pi, ICE_VSI_LB, ICE_INVAL_VFID, NULL, 0); } /** * ice_vlan_rx_add_vid - Add a VLAN ID filter to HW offload * @netdev: network interface to be adjusted - * @proto: unused protocol + * @proto: VLAN TPID * @vid: VLAN ID to be added * * net_device_ops implementation for adding VLAN IDs */ static int -ice_vlan_rx_add_vid(struct net_device *netdev, __always_unused __be16 proto, - u16 vid) +ice_vlan_rx_add_vid(struct net_device *netdev, __be16 proto, u16 vid) { struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi_vlan_ops *vlan_ops; struct ice_vsi *vsi = np->vsi; + struct ice_vlan vlan; int ret; - if (vid >= VLAN_N_VID) { - netdev_err(netdev, "VLAN id requested %d is out of range %d\n", - vid, VLAN_N_VID); - return -EINVAL; - } - - if (vsi->info.pvid) - return -EINVAL; + if (!vid) + return 0; - /* Enable VLAN pruning when VLAN 0 is added */ - if (unlikely(!vid)) { - ret = ice_cfg_vlan_pruning(vsi, true, false); - if (ret) - return ret; - } + vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); - /* Add all VLAN IDs including 0 to the switch filter. VLAN ID 0 is - * needed to continue allowing all untagged packets since VLAN prune - * list is applied to all packets by the switch + /* Add a switch rule for this VLAN ID so its corresponding VLAN tagged + * packets aren't pruned by the device's internal switch on Rx */ - ret = ice_vsi_add_vlan(vsi, vid); - if (!ret) { - vsi->vlan_ena = true; - set_bit(ICE_VSI_FLAG_VLAN_FLTR_CHANGED, vsi->flags); - } + vlan = ICE_VLAN(be16_to_cpu(proto), vid, 0, ICE_FWD_TO_VSI); + ret = vlan_ops->add_vlan(vsi, &vlan); + if (!ret) + set_bit(ICE_VSI_VLAN_FLTR_CHANGED, vsi->state); return ret; } @@ -2162,66 +4368,200 @@ ice_vlan_rx_add_vid(struct net_device *netdev, __always_unused __be16 proto, /** * ice_vlan_rx_kill_vid - Remove a VLAN ID filter from HW offload * @netdev: network interface to be adjusted - * @proto: unused protocol + * @proto: VLAN TPID * @vid: VLAN ID to be removed * * net_device_ops implementation for removing VLAN IDs */ static int -ice_vlan_rx_kill_vid(struct net_device *netdev, __always_unused __be16 proto, - u16 vid) +ice_vlan_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid) { struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi_vlan_ops *vlan_ops; struct ice_vsi *vsi = np->vsi; + struct ice_vlan vlan; int ret; - if (vsi->info.pvid) - return -EINVAL; + if (!vid) + return 0; + + vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); - /* Make sure ice_vsi_kill_vlan is successful before updating VLAN + /* Make sure VLAN delete is successful before updating VLAN * information */ - ret = ice_vsi_kill_vlan(vsi, vid); + vlan = ICE_VLAN(be16_to_cpu(proto), vid, 0, ICE_FWD_TO_VSI); + ret = vlan_ops->del_vlan(vsi, &vlan); if (ret) return ret; - /* Disable VLAN pruning when VLAN 0 is removed */ - if (unlikely(!vid)) - ret = ice_cfg_vlan_pruning(vsi, false, false); - - vsi->vlan_ena = false; - set_bit(ICE_VSI_FLAG_VLAN_FLTR_CHANGED, vsi->flags); - return ret; + set_bit(ICE_VSI_VLAN_FLTR_CHANGED, vsi->state); + return 0; } /** - * ice_setup_pf_sw - Setup the HW switch on startup or after reset + * 
ice_pf_reset_stats - Reset all of the stats for the given PF * @pf: board private structure - * - * Returns 0 on success, negative value on failure */ -static int ice_setup_pf_sw(struct ice_pf *pf) +static void ice_pf_reset_stats(struct ice_pf *pf) { - struct ice_vsi *vsi; - int status = 0; + memset(&pf->stats, 0, sizeof(pf->stats)); + memset(&pf->stats_prev, 0, sizeof(pf->stats_prev)); + pf->stat_prev_loaded = false; + + pf->hw_csum_rx_error = 0; +#ifdef ICE_ADD_PROBES + pf->tcp_segs = 0; + pf->udp_segs = 0; + pf->tx_tcp_cso = 0; + pf->tx_udp_cso = 0; + pf->tx_sctp_cso = 0; + pf->tx_ip4_cso = 0; + pf->tx_l3_cso_err = 0; + pf->tx_l4_cso_err = 0; + pf->rx_tcp_cso = 0; + pf->rx_udp_cso = 0; + pf->rx_sctp_cso = 0; + pf->rx_ip4_cso = 0; + pf->rx_ip4_cso_err = 0; + pf->rx_tcp_cso_err = 0; + pf->rx_udp_cso_err = 0; + pf->rx_sctp_cso_err = 0; + pf->tx_q_vlano = 0; + pf->rx_q_vlano = 0; + pf->tx_ad_vlano = 0; + pf->rx_ad_vlano = 0; +#endif +} - if (ice_is_reset_in_progress(pf->state)) - return -EBUSY; - vsi = ice_pf_vsi_setup(pf, pf->hw.port_info); - if (!vsi) { - status = -ENOMEM; - goto unroll_vsi_setup; - } +#ifdef HAVE_TC_INDIR_BLOCK +#ifdef HAVE_FLOW_BLOCK_API +/** + * ice_rep_indr_tc_block_unbind + * @cb_priv: indirection block private data + */ +static void ice_rep_indr_tc_block_unbind(void *cb_priv) +{ + struct ice_indr_block_priv *indr_priv = cb_priv; - status = ice_cfg_netdev(vsi); - if (status) { - status = -ENODEV; - goto unroll_vsi_setup; - } + list_del(&indr_priv->list); + devm_kfree(&indr_priv->netdev->dev, indr_priv); +} +#endif /* HAVE_FLOW_BLOCK_API */ - /* registering the NAPI handler requires both the queues and - * netdev to be created, which are done in ice_pf_vsi_setup() +/** + * ice_tc_indir_block_unregister - Unregister TC indirect block notifications + * @vsi: VSI struct which has the netdev + */ +static void ice_tc_indir_block_unregister(struct ice_vsi *vsi) +{ + struct ice_netdev_priv *np = netdev_priv(vsi->netdev); +#ifndef HAVE_TC_FLOW_INDIR_DEV + /* clean indirect TC block notifications */ + unregister_netdevice_notifier(&np->netdevice_nb); + ice_indr_clean_block_privs(np); +#elif defined(HAVE_TC_FLOW_INDIR_DEV) && ((defined(HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP) && defined(HAVE_FLOW_BLOCK_API)) || defined(HAVE_FLOW_INDIR_BLOCK_QDISC)) + flow_indr_dev_unregister(ice_indr_setup_tc_cb, np, + ice_rep_indr_tc_block_unbind); +#else + flow_indr_dev_unregister(ice_indr_setup_tc_cb, np, + ice_indr_setup_block_cb); +#endif /* HAVE_TC_FLOW_INDIR_DEV */ +} + +/** + * ice_tc_indir_block_remove - clean indirect TC block notifications + * @pf: PF structure + */ +static void ice_tc_indir_block_remove(struct ice_pf *pf) +{ + struct ice_vsi *pf_vsi = ice_get_main_vsi(pf); + + if (!pf_vsi) + return; + + ice_tc_indir_block_unregister(pf_vsi); +} + +/** + * ice_tc_indir_block_register - Register TC indirect block notifications + * @vsi: VSI struct which has the netdev + * + * Returns 0 on success, negative value on failure + */ +static int ice_tc_indir_block_register(struct ice_vsi *vsi) +{ + struct ice_netdev_priv *np; + + if (!vsi || !vsi->netdev) + return -EINVAL; + + np = netdev_priv(vsi->netdev); + + INIT_LIST_HEAD(&np->tc_indr_block_priv_list); +#ifndef HAVE_TC_FLOW_INDIR_DEV + np->netdevice_nb.notifier_call = ice_netdevice_event; + return register_netdevice_notifier(&np->netdevice_nb); +#else + return flow_indr_dev_register(ice_indr_setup_tc_cb, np); +#endif /* HAVE_TC_FLOW_INDIR_DEV */ +} + +#endif /* HAVE_TC_INDIR_BLOCK */ + +/** + * ice_setup_pf_sw - Setup the HW switch on startup 
or after reset + * @pf: board private structure + * + * Returns 0 on success, negative value on failure + */ +static int ice_setup_pf_sw(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + bool dvm = ice_is_dvm_ena(&pf->hw); + struct ice_vsi *vsi; + int status = 0; + + if (ice_is_reset_in_progress(pf->state)) + return -EBUSY; + + + status = ice_aq_set_port_params(pf->hw.port_info, 0, false, false, dvm, + NULL); + if (status) + return -EIO; + + vsi = ice_pf_vsi_setup(pf, pf->hw.port_info); + if (!vsi) + return -ENOMEM; + + /* init channel list */ + INIT_LIST_HEAD(&vsi->ch_list); + + status = ice_cfg_netdev(vsi); + if (status) { + status = -ENODEV; + goto unroll_vsi_setup; + } + + /* netdev has to be configured before setting frame size */ + ice_vsi_cfg_frame_size(vsi); + +#ifdef HAVE_TC_INDIR_BLOCK + /* init indirect block notifications */ + status = ice_tc_indir_block_register(vsi); + if (status) { + dev_err(dev, "Failed to register netdev notifier\n"); + goto unroll_cfg_netdev; + } +#endif /* HAVE_TC_INDIR_BLOCK */ + + /* Setup DCB netlink interface */ + ice_dcbnl_setup(vsi); + + /* registering the NAPI handler requires both the queues and + * netdev to be created, which are done in ice_pf_vsi_setup() * and ice_cfg_netdev() respectively */ ice_napi_add(vsi); @@ -2230,26 +4570,30 @@ static int ice_setup_pf_sw(struct ice_pf *pf) if (status) goto unroll_napi_add; + status = ice_set_cpu_rx_rmap(vsi); + if (status) { + dev_err(dev, "Failed to set CPU Rx map VSI %d error %d\n", + vsi->vsi_num, status); + status = -EINVAL; + goto unroll_napi_add; + } + return status; unroll_napi_add: - if (vsi) { - ice_napi_del(vsi); - if (vsi->netdev) { - if (vsi->netdev->reg_state == NETREG_REGISTERED) - unregister_netdev(vsi->netdev); - free_netdev(vsi->netdev); - vsi->netdev = NULL; - } + ice_napi_del(vsi); +#ifdef HAVE_TC_INDIR_BLOCK + ice_tc_indir_block_unregister(vsi); +unroll_cfg_netdev: +#endif /* HAVE_TC_INDIR_BLOCK */ + if (vsi->netdev) { + clear_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state); + free_netdev(vsi->netdev); + vsi->netdev = NULL; } - unroll_vsi_setup: - if (vsi) { - ice_vsi_free_q_vectors(vsi); - ice_vsi_delete(vsi); - ice_vsi_put_qs(vsi); - ice_vsi_clear(vsi); - } + ice_vsi_release(vsi); + return status; } @@ -2262,7 +4606,8 @@ static int ice_setup_pf_sw(struct ice_pf *pf) static u16 ice_get_avail_q_count(unsigned long *pf_qmap, struct mutex *lock, u16 size) { - u16 count = 0, bit; + unsigned long bit; + u16 count = 0; mutex_lock(lock); for_each_clear_bit(bit, pf_qmap, size) @@ -2300,6 +4645,7 @@ static void ice_deinit_pf(struct ice_pf *pf) { ice_service_task_stop(pf); mutex_destroy(&pf->sw_mutex); + mutex_destroy(&pf->tc_mutex); mutex_destroy(&pf->avail_q_mutex); if (pf->avail_txqs) { @@ -2321,21 +4667,50 @@ static void ice_set_pf_caps(struct ice_pf *pf) { struct ice_hw_func_caps *func_caps = &pf->hw.func_caps; + clear_bit(ICE_FLAG_VMDQ_ENA, pf->flags); + if (func_caps->common_cap.vmdq) + set_bit(ICE_FLAG_VMDQ_ENA, pf->flags); + clear_bit(ICE_FLAG_IWARP_ENA, pf->flags); + clear_bit(ICE_FLAG_PEER_ENA, pf->flags); + if (func_caps->common_cap.iwarp && IS_ENABLED(CONFIG_MFD_CORE)) { + set_bit(ICE_FLAG_IWARP_ENA, pf->flags); + set_bit(ICE_FLAG_PEER_ENA, pf->flags); + } clear_bit(ICE_FLAG_DCB_CAPABLE, pf->flags); if (func_caps->common_cap.dcb) set_bit(ICE_FLAG_DCB_CAPABLE, pf->flags); -#ifdef CONFIG_PCI_IOV clear_bit(ICE_FLAG_SRIOV_CAPABLE, pf->flags); + clear_bit(ICE_FLAG_ESWITCH_CAPABLE, pf->flags); if (func_caps->common_cap.sr_iov_1_1) { set_bit(ICE_FLAG_SRIOV_CAPABLE, pf->flags); + 
set_bit(ICE_FLAG_ESWITCH_CAPABLE, pf->flags); pf->num_vfs_supported = min_t(int, func_caps->num_allocd_vfs, ICE_MAX_VF_COUNT); } -#endif /* CONFIG_PCI_IOV */ clear_bit(ICE_FLAG_RSS_ENA, pf->flags); if (func_caps->common_cap.rss_table_size) set_bit(ICE_FLAG_RSS_ENA, pf->flags); + clear_bit(ICE_FLAG_FD_ENA, pf->flags); + if (func_caps->fd_fltr_guar > 0 || func_caps->fd_fltr_best_effort > 0) { + u16 unused; + + /* ctrl_vsi_idx will be set to a valid value when flow director + * is setup by ice_init_fdir + */ + pf->ctrl_vsi_idx = ICE_NO_VSI; + set_bit(ICE_FLAG_FD_ENA, pf->flags); + /* force guaranteed filter pool for PF */ + ice_alloc_fd_guar_item(&pf->hw, &unused, + func_caps->fd_fltr_guar); + /* force shared filter pool for PF */ + ice_alloc_fd_shrd_item(&pf->hw, &unused, + func_caps->fd_fltr_best_effort); + } + clear_bit(ICE_FLAG_PTP_ENA, pf->flags); + if (func_caps->common_cap.ieee_1588) + set_bit(ICE_FLAG_PTP_ENA, pf->flags); + pf->max_pf_txqs = func_caps->common_cap.num_txq; pf->max_pf_rxqs = func_caps->common_cap.num_rxq; } @@ -2349,12 +4724,19 @@ static int ice_init_pf(struct ice_pf *pf) ice_set_pf_caps(pf); mutex_init(&pf->sw_mutex); + mutex_init(&pf->tc_mutex); + + INIT_HLIST_HEAD(&pf->aq_wait_list); + spin_lock_init(&pf->aq_wait_lock); + init_waitqueue_head(&pf->aq_wait_queue); + + init_waitqueue_head(&pf->reset_wait_queue); /* setup service timer and periodic service task */ timer_setup(&pf->serv_tmr, ice_service_timer, 0); pf->serv_tmr_period = HZ; INIT_WORK(&pf->serv_task, ice_service_task); - clear_bit(__ICE_SERVICE_SCHED, pf->state); + clear_bit(ICE_SERVICE_SCHED, pf->state); mutex_init(&pf->avail_q_mutex); pf->avail_txqs = bitmap_zalloc(pf->max_pf_txqs, GFP_KERNEL); @@ -2363,94 +4745,173 @@ static int ice_init_pf(struct ice_pf *pf) pf->avail_rxqs = bitmap_zalloc(pf->max_pf_rxqs, GFP_KERNEL); if (!pf->avail_rxqs) { - devm_kfree(&pf->pdev->dev, pf->avail_txqs); + devm_kfree(ice_pf_to_dev(pf), pf->avail_txqs); pf->avail_txqs = NULL; return -ENOMEM; } + /* init tunnel list and lock */ + spin_lock_init(&pf->tnl_lock); + INIT_LIST_HEAD(&pf->tnl_list); + + return 0; +} + +static int ice_alloc_msix_entries(struct ice_pf *pf, u16 num_entries) +{ + u16 i; + + pf->msix_entries = devm_kcalloc(ice_pf_to_dev(pf), num_entries, + sizeof(*pf->msix_entries), GFP_KERNEL); + if (!pf->msix_entries) + return -ENOMEM; + + for (i = 0; i < num_entries; i++) + pf->msix_entries[i].entry = i; + return 0; } +static void ice_free_msix_entries(struct ice_pf *pf) +{ + devm_kfree(ice_pf_to_dev(pf), pf->msix_entries); + pf->msix_entries = NULL; +} + /** - * ice_ena_msix_range - Request a range of MSIX vectors from the OS + * ice_ena_msix_range - request a range of MSI-X vectors from the OS * @pf: board private structure * - * compute the number of MSIX vectors required (v_budget) and request from - * the OS. Return the number of vectors reserved or negative on failure + * The driver first tries to enable best-case scenario MSI-X vectors. If that + * doesn't succeeed then a fall-back method is employed. + * + * The fall-back logic is described below with each [#] being an attempt at + * enabling a certain number of MSI-X. If any of the steps succeed, then return + * the number of MSI-X enabled from pci_ena_msix_exact(). If any of the attempts + * fail, then goto the next step. + * + * Attempt [0]: Enable the best-case scenario MSI-X vectors. 
+ * + * Attempt [1]: Enable MSI-X vectors with eswitch support disabled + * + * Attempt [2]: Enable MSI-X vectors with MACVLAN support disabled, which + * reduces the request by the MSI-X vectors needed for MACVLAN. + * + * Attempt [3]: Enable MSI-X vectors with the number of pf->num_lan_msix reduced + * by a factor of 2 from the previous attempts (i.e. num_online_cpus() / 2). + * Also, with the number of pf->num_rdma_msix reduced by a factor of ~2 from the + * previous attempts (i.e. num_online_cpus() / 2 + ICE_RDMA_NUM_AEQ_MSIX). + * + * Attempt [4]: Same as attempt [3], except reduce both by a factor of 4. + * + * Attempt [5]: Enable the bare-minimum MSI-X vectors. + * + * Also, if the adjusted_base_msix ever hits the mimimum required for LAN or + * RDMA, then just set the needed MSI-X for that feature to the minimum (similar + * to attempt [5]). */ static int ice_ena_msix_range(struct ice_pf *pf) { - int v_left, v_actual, v_budget = 0; - int needed, err, i; + int err = -ENOSPC, num_cpus, attempt, adjusted_msix_divisor = 1, needed; + struct device *dev = ice_pf_to_dev(pf); - v_left = pf->hw.func_caps.common_cap.num_msix_vectors; + num_cpus = num_online_cpus(); - /* reserve one vector for miscellaneous handler */ - needed = 1; - if (v_left < needed) - goto no_hw_vecs_left_err; - v_budget += needed; - v_left -= needed; +#define ICE_MAX_ENABLE_MSIX_ATTEMPTS 5 + /* make multiple passes at enabling MSI-X vectors in case there aren't + * enough available for the best-case scenario + */ + for (attempt = 0; attempt <= ICE_MAX_ENABLE_MSIX_ATTEMPTS; attempt++) { + int adjusted_base_msix = num_cpus / adjusted_msix_divisor; - /* reserve vectors for LAN traffic */ - needed = min_t(int, num_online_cpus(), v_left); - if (v_left < needed) - goto no_hw_vecs_left_err; - pf->num_lan_msix = needed; - v_budget += needed; - v_left -= needed; + /* attempt to enable minimum MSI-X range */ + if (attempt == ICE_MAX_ENABLE_MSIX_ATTEMPTS) { + needed = ICE_MIN_MSIX; + pf->num_lan_msix = ICE_MIN_LAN_MSIX; - pf->msix_entries = devm_kcalloc(&pf->pdev->dev, v_budget, - sizeof(*pf->msix_entries), GFP_KERNEL); + if (test_bit(ICE_FLAG_IWARP_ENA, pf->flags)) { + needed += ICE_MIN_RDMA_MSIX; + pf->num_rdma_msix = ICE_MIN_RDMA_MSIX; + } + } else { + if (adjusted_base_msix > ICE_MIN_LAN_MSIX) + pf->num_lan_msix = adjusted_base_msix; + else + pf->num_lan_msix = ICE_MIN_LAN_MSIX; - if (!pf->msix_entries) { - err = -ENOMEM; - goto exit_err; - } + needed = pf->num_lan_msix + ICE_OICR_MSIX; - for (i = 0; i < v_budget; i++) - pf->msix_entries[i].entry = i; + if (attempt == 0 && + test_bit(ICE_FLAG_ESWITCH_CAPABLE, pf->flags)) { + needed += ICE_ESWITCH_MSIX; + } else if (attempt == 1) { + dev_warn(dev, "Not enough MSI-X for eswitch support, disabling feature\n"); + clear_bit(ICE_FLAG_ESWITCH_CAPABLE, pf->flags); + } +#ifdef HAVE_NETDEV_SB_DEV + + /* only reserve MACVLAN MSI-X on the first and second + * attempt + */ + if ((attempt == 0 || attempt == 1) && + test_bit(ICE_FLAG_VMDQ_ENA, pf->flags)) { + needed += ICE_MAX_MACVLANS * ICE_DFLT_VEC_VMDQ_VSI; + } else if (attempt == 2) { + dev_warn(dev, "Not enough MSI-X for hardware MACVLAN support, disabling feature.\n"); + clear_bit(ICE_FLAG_VMDQ_ENA, pf->flags); + } +#endif /* HAVE_NETDEV_SB_DEV */ - /* actually reserve the vectors */ - v_actual = pci_enable_msix_range(pf->pdev, pf->msix_entries, - ICE_MIN_MSIX, v_budget); + /* reserve vectors for RDMA peer driver */ + if (test_bit(ICE_FLAG_IWARP_ENA, pf->flags)) { + if (adjusted_base_msix > ICE_MIN_RDMA_MSIX) + pf->num_rdma_msix = 
adjusted_base_msix + + ICE_RDMA_NUM_AEQ_MSIX; + else + pf->num_rdma_msix = ICE_MIN_RDMA_MSIX; - if (v_actual < 0) { - dev_err(&pf->pdev->dev, "unable to reserve MSI-X vectors\n"); - err = v_actual; - goto msix_err; - } + needed += pf->num_rdma_msix; + } + } - if (v_actual < v_budget) { - dev_warn(&pf->pdev->dev, - "not enough OS MSI-X vectors. requested = %d, obtained = %d\n", - v_budget, v_actual); -/* 2 vectors for LAN (traffic + OICR) */ -#define ICE_MIN_LAN_VECS 2 + if (test_bit(ICE_FLAG_FD_ENA, pf->flags)) + needed += ICE_FDIR_MSIX; - if (v_actual < ICE_MIN_LAN_VECS) { - /* error if we can't get minimum vectors */ - pci_disable_msix(pf->pdev); - err = -ERANGE; - goto msix_err; + err = ice_alloc_msix_entries(pf, needed); + if (err) + goto err_out; + + dev_dbg(dev, "attempting to enable %d MSI-X vectors\n", needed); + err = pci_enable_msix_exact(pf->pdev, pf->msix_entries, needed); + if (err < 0) { + ice_free_msix_entries(pf); + dev_notice(dev, "Couldn't get %d MSI-X vectors due to OS, Platform, and/or PCI-function limitations. Reducing request and retrying.", + needed); + + /* MACVLAN support already disabled and we still failed + * to enable MSI-X, so make another attempt at enabling + * MSI-X by reducing the needed amount + */ + if (attempt > 1) + adjusted_msix_divisor *= 2; } else { - pf->num_lan_msix = ICE_MIN_LAN_VECS; - } - } + if (pf->num_lan_msix != num_cpus) + dev_notice(dev, "Enabled %d MSI-X vectors for LAN traffic.\n", + pf->num_lan_msix); - return v_actual; + if (test_bit(ICE_FLAG_IWARP_ENA, pf->flags) && + pf->num_rdma_msix != (num_cpus + ICE_RDMA_NUM_AEQ_MSIX)) + dev_notice(dev, "Enabled %d MSI-X vectors for RDMA.\n", + pf->num_rdma_msix); -msix_err: - devm_kfree(&pf->pdev->dev, pf->msix_entries); - goto exit_err; + return needed; + } + } -no_hw_vecs_left_err: - dev_err(&pf->pdev->dev, - "not enough device MSI-X vectors. requested = %d, available = %d\n", - needed, v_left); - err = -ERANGE; -exit_err: +err_out: + dev_err(dev, "failed to enable MSI-X vectors\n"); pf->num_lan_msix = 0; + pf->num_rdma_msix = 0; return err; } @@ -2461,8 +4922,7 @@ static int ice_ena_msix_range(struct ice_pf *pf) static void ice_dis_msix(struct ice_pf *pf) { pci_disable_msix(pf->pdev); - devm_kfree(&pf->pdev->dev, pf->msix_entries); - pf->msix_entries = NULL; + ice_free_msix_entries(pf); } /** @@ -2474,7 +4934,7 @@ static void ice_clear_interrupt_scheme(struct ice_pf *pf) ice_dis_msix(pf); if (pf->irq_tracker) { - devm_kfree(&pf->pdev->dev, pf->irq_tracker); + devm_kfree(ice_pf_to_dev(pf), pf->irq_tracker); pf->irq_tracker = NULL; } } @@ -2494,21 +4954,142 @@ static int ice_init_interrupt_scheme(struct ice_pf *pf) /* set up vector assignment tracking */ pf->irq_tracker = - devm_kzalloc(&pf->pdev->dev, sizeof(*pf->irq_tracker) + - (sizeof(u16) * vectors), GFP_KERNEL); + devm_kzalloc(ice_pf_to_dev(pf), + struct_size(pf->irq_tracker, list, vectors), + GFP_KERNEL); if (!pf->irq_tracker) { ice_dis_msix(pf); return -ENOMEM; } /* populate SW interrupts pool with number of OS granted IRQs. */ - pf->num_avail_sw_msix = vectors; - pf->irq_tracker->num_entries = vectors; + pf->num_avail_sw_msix = (u16)vectors; + pf->irq_tracker->num_entries = (u16)vectors; pf->irq_tracker->end = pf->irq_tracker->num_entries; return 0; } +/** + * ice_is_wol_supported - check if WoL is supported + * @hw: pointer to hardware info + * + * Check if WoL is supported based on the HW configuration. 
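/*
 * Stand-alone, user-space sketch (not part of the patch) of the divisor
 * halving that ice_ena_msix_range() above walks through when every enable
 * attempt fails. The three constants are placeholders for
 * ICE_MIN_LAN_MSIX, ICE_MIN_RDMA_MSIX and ICE_RDMA_NUM_AEQ_MSIX; only the
 * shrinking request pattern is the point, not the exact values.
 */
#include <stdio.h>

#define MIN_LAN_MSIX	1	/* assumed stand-in */
#define MIN_RDMA_MSIX	2	/* assumed stand-in */
#define RDMA_AEQ_MSIX	4	/* assumed stand-in */

int main(void)
{
	int num_cpus = 32;	/* example: 32 online CPUs */
	int divisor = 1;
	int attempt;

	for (attempt = 0; attempt <= 5; attempt++) {
		int base = num_cpus / divisor;
		int lan, rdma;

		if (attempt == 5) {		/* final, bare-minimum attempt */
			lan = MIN_LAN_MSIX;
			rdma = MIN_RDMA_MSIX;
		} else {
			lan = base > MIN_LAN_MSIX ? base : MIN_LAN_MSIX;
			rdma = base > MIN_RDMA_MSIX ? base + RDMA_AEQ_MSIX
						    : MIN_RDMA_MSIX;
		}
		printf("attempt %d: lan=%d rdma=%d\n", attempt, lan, rdma);
		if (attempt > 1)	/* eswitch/MACVLAN extras already dropped */
			divisor *= 2;
	}
	return 0;
}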
+ * Returns true if NVM supports and enables WoL for this port, false otherwise + */ +bool ice_is_wol_supported(struct ice_hw *hw) +{ + u16 wol_ctrl; + + /* A bit set to 1 in the NVM Software Reserved Word 2 (WoL control + * word) indicates WoL is not supported on the corresponding PF ID. + */ + if (ice_read_sr_word(hw, ICE_SR_NVM_WOL_CFG, &wol_ctrl)) + return false; + + return !(BIT(hw->port_info->lport) & wol_ctrl); +} + +/** + * ice_vsi_recfg_qs - Change the number of queues on a VSI + * @vsi: VSI being changed + * @new_rx: new number of Rx queues + * @new_tx: new number of Tx queues + * + * Only change the number of queues if new_tx, or new_rx is non-0. + * + * Returns 0 on success. + */ +int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx) +{ + struct ice_pf *pf = vsi->back; + int err = 0, timeout = 50; + + if (!new_rx && !new_tx) + return -EINVAL; + + while (test_and_set_bit(ICE_CFG_BUSY, pf->state)) { + timeout--; + if (!timeout) + return -EBUSY; + usleep_range(1000, 2000); + } + + if (new_tx) + vsi->req_txq = (u16)new_tx; + if (new_rx) + vsi->req_rxq = (u16)new_rx; + + /* set for the next time the netdev is started */ + if (!netif_running(vsi->netdev)) { + ice_vsi_rebuild(vsi, false); + dev_dbg(ice_pf_to_dev(pf), "Link is down, queue count change happens when link is brought up\n"); + goto done; + } + + ice_vsi_close(vsi); + ice_vsi_rebuild(vsi, false); + ice_pf_dcb_recfg(pf); + ice_vsi_open(vsi); +done: + clear_bit(ICE_CFG_BUSY, pf->state); + return err; +} + +/** + * ice_set_safe_mode_vlan_cfg - configure PF VSI to allow all VLANs in safe mode + * @pf: PF to configure + * + * No VLAN offloads/filtering are advertised in safe mode so make sure the PF + * VSI can still Tx/Rx VLAN tagged packets. + */ +static void ice_set_safe_mode_vlan_cfg(struct ice_pf *pf) +{ + struct ice_vsi *vsi = ice_get_main_vsi(pf); + struct ice_vsi_ctx *ctxt; + enum ice_status status; + struct ice_hw *hw; + + if (!vsi) + return; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return; + + hw = &pf->hw; + ctxt->info = vsi->info; + + ctxt->info.valid_sections = + cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID | + ICE_AQ_VSI_PROP_SECURITY_VALID | + ICE_AQ_VSI_PROP_SW_VALID); + + /* disable VLAN anti-spoof */ + ctxt->info.sec_flags &= ~(ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << + ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S); + + /* disable VLAN pruning and keep all other settings */ + ctxt->info.sw_flags2 &= ~ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; + + /* allow all VLANs on Tx and don't strip on Rx */ + ctxt->info.inner_vlan_flags = ICE_AQ_VSI_INNER_VLAN_TX_MODE_ALL | + ICE_AQ_VSI_INNER_VLAN_EMODE_NOTHING; + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "Failed to update VSI for safe mode VLANs, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + } else { + vsi->info.sec_flags = ctxt->info.sec_flags; + vsi->info.sw_flags2 = ctxt->info.sw_flags2; + vsi->info.inner_vlan_flags = ctxt->info.inner_vlan_flags; + } + + kfree(ctxt); +} + /** * ice_log_pkg_init - log result of DDP package load * @hw: pointer to hardware info @@ -2517,9 +5098,10 @@ static int ice_init_interrupt_scheme(struct ice_pf *pf) static void ice_log_pkg_init(struct ice_hw *hw, enum ice_status *status) { - struct ice_pf *pf = (struct ice_pf *)hw->back; - struct device *dev = &pf->pdev->dev; + struct ice_pf *pf = hw->back; + struct device *dev; + dev = ice_pf_to_dev(pf); switch (*status) { case ICE_SUCCESS: /* The package download AdminQ command returned 
success because @@ -2533,16 +5115,14 @@ ice_log_pkg_init(struct ice_hw *hw, enum ice_status *status) !memcmp(hw->pkg_name, hw->active_pkg_name, sizeof(hw->pkg_name))) { if (hw->pkg_dwnld_status == ICE_AQ_RC_EEXIST) - dev_info(dev, - "DDP package already present on device: %s version %d.%d.%d.%d\n", + dev_info(dev, "DDP package already present on device: %s version %d.%d.%d.%d\n", hw->active_pkg_name, hw->active_pkg_ver.major, hw->active_pkg_ver.minor, hw->active_pkg_ver.update, hw->active_pkg_ver.draft); else - dev_info(dev, - "The DDP package was successfully loaded: %s version %d.%d.%d.%d\n", + dev_info(dev, "The DDP package was successfully loaded: %s version %d.%d.%d.%d\n", hw->active_pkg_name, hw->active_pkg_ver.major, hw->active_pkg_ver.minor, @@ -2550,8 +5130,7 @@ ice_log_pkg_init(struct ice_hw *hw, enum ice_status *status) hw->active_pkg_ver.draft); } else if (hw->active_pkg_ver.major != ICE_PKG_SUPP_VER_MAJ || hw->active_pkg_ver.minor != ICE_PKG_SUPP_VER_MNR) { - dev_err(dev, - "The device has a DDP package that is not supported by the driver. The device has package '%s' version %d.%d.x.x. The driver requires version %d.%d.x.x. Entering Safe Mode.\n", + dev_err(dev, "The device has a DDP package that is not supported by the driver. The device has package '%s' version %d.%d.x.x. The driver requires version %d.%d.x.x. Entering Safe Mode.\n", hw->active_pkg_name, hw->active_pkg_ver.major, hw->active_pkg_ver.minor, @@ -2559,8 +5138,7 @@ ice_log_pkg_init(struct ice_hw *hw, enum ice_status *status) *status = ICE_ERR_NOT_SUPPORTED; } else if (hw->active_pkg_ver.major == ICE_PKG_SUPP_VER_MAJ && hw->active_pkg_ver.minor == ICE_PKG_SUPP_VER_MNR) { - dev_info(dev, - "The driver could not load the DDP package file because a compatible DDP package is already present on the device. The device has package '%s' version %d.%d.%d.%d. The package file found by the driver: '%s' version %d.%d.%d.%d.\n", + dev_info(dev, "The driver could not load the DDP package file because a compatible DDP package is already present on the device. The device has package '%s' version %d.%d.%d.%d. The package file found by the driver: '%s' version %d.%d.%d.%d.\n", hw->active_pkg_name, hw->active_pkg_ver.major, hw->active_pkg_ver.minor, @@ -2572,54 +5150,51 @@ ice_log_pkg_init(struct ice_hw *hw, enum ice_status *status) hw->pkg_ver.update, hw->pkg_ver.draft); } else { - dev_err(dev, - "An unknown error occurred when loading the DDP package, please reboot the system. If the problem persists, update the NVM. Entering Safe Mode.\n"); + dev_err(dev, "An unknown error occurred when loading the DDP package, please reboot the system. If the problem persists, update the NVM. Entering Safe Mode.\n"); *status = ICE_ERR_NOT_SUPPORTED; } break; + case ICE_ERR_FW_DDP_MISMATCH: + dev_err(dev, "The firmware loaded on the device is not compatible with the DDP package. Please update the device's NVM. Entering safe mode.\n"); + break; case ICE_ERR_BUF_TOO_SHORT: - /* fall-through */ case ICE_ERR_CFG: - dev_err(dev, - "The DDP package file is invalid. Entering Safe Mode.\n"); + dev_err(dev, "The DDP package file is invalid. Entering Safe Mode.\n"); break; case ICE_ERR_NOT_SUPPORTED: /* Package File version not supported */ if (hw->pkg_ver.major > ICE_PKG_SUPP_VER_MAJ || (hw->pkg_ver.major == ICE_PKG_SUPP_VER_MAJ && hw->pkg_ver.minor > ICE_PKG_SUPP_VER_MNR)) - dev_err(dev, - "The DDP package file version is higher than the driver supports. Please use an updated driver. 
Entering Safe Mode.\n"); + dev_err(dev, "The DDP package file version is higher than the driver supports. Please use an updated driver. Entering Safe Mode.\n"); else if (hw->pkg_ver.major < ICE_PKG_SUPP_VER_MAJ || (hw->pkg_ver.major == ICE_PKG_SUPP_VER_MAJ && hw->pkg_ver.minor < ICE_PKG_SUPP_VER_MNR)) - dev_err(dev, - "The DDP package file version is lower than the driver supports. The driver requires version %d.%d.x.x. Please use an updated DDP Package file. Entering Safe Mode.\n", + dev_err(dev, "The DDP package file version is lower than the driver supports. The driver requires version %d.%d.x.x. Please use an updated DDP Package file. Entering Safe Mode.\n", ICE_PKG_SUPP_VER_MAJ, ICE_PKG_SUPP_VER_MNR); break; case ICE_ERR_AQ_ERROR: - switch (hw->adminq.sq_last_status) { + switch (hw->pkg_dwnld_status) { case ICE_AQ_RC_ENOSEC: case ICE_AQ_RC_EBADSIG: - dev_err(dev, - "The DDP package could not be loaded because its signature is not valid. Please use a valid DDP Package. Entering Safe Mode.\n"); + dev_err(dev, "The DDP package could not be loaded because its signature is not valid. Please use a valid DDP Package. Entering Safe Mode.\n"); return; case ICE_AQ_RC_ESVN: - dev_err(dev, - "The DDP Package could not be loaded because its security revision is too low. Please use an updated DDP Package. Entering Safe Mode.\n"); + dev_err(dev, "The DDP Package could not be loaded because its security revision is too low. Please use an updated DDP Package. Entering Safe Mode.\n"); return; case ICE_AQ_RC_EBADMAN: case ICE_AQ_RC_EBADBUF: - dev_err(dev, - "An error occurred on the device while loading the DDP package. The device will be reset.\n"); + dev_err(dev, "An error occurred on the device while loading the DDP package. The device will be reset.\n"); + /* poll for reset to complete */ + if (ice_check_reset(hw)) + dev_err(dev, "Error resetting device. Please reload the driver\n"); return; default: break; } /* fall-through */ default: - dev_err(dev, - "An unknown error (%d) occurred when loading the DDP package. Entering Safe Mode.\n", + dev_err(dev, "An unknown error (%d) occurred when loading the DDP package. Entering Safe Mode.\n", *status); break; } @@ -2637,7 +5212,7 @@ static void ice_load_pkg(const struct firmware *firmware, struct ice_pf *pf) { enum ice_status status = ICE_ERR_PARAM; - struct device *dev = &pf->pdev->dev; + struct device *dev = ice_pf_to_dev(pf); struct ice_hw *hw = &pf->hw; /* Load DDP Package */ @@ -2650,8 +5225,7 @@ ice_load_pkg(const struct firmware *firmware, struct ice_pf *pf) status = ice_init_pkg(hw, hw->pkg_copy, hw->pkg_size); ice_log_pkg_init(hw, &status); } else { - dev_err(dev, - "The DDP package file failed to load. Entering Safe Mode.\n"); + dev_err(dev, "The DDP package file failed to load. Entering Safe Mode.\n"); } if (status) { @@ -2666,6 +5240,83 @@ ice_load_pkg(const struct firmware *firmware, struct ice_pf *pf) set_bit(ICE_FLAG_ADV_FEATURES, pf->flags); } +/** + * ice_prepare_for_safe_mode - Disable advanced features + * @pf: board private structure + * + * If package download failed during reset, then driver clears + * ICE_FLAG_ADV_FEATURES PF flag bit, and device is official in safe mode. + * So, all advance features have to be disabled. 
+ */ +static int ice_prepare_for_safe_mode(struct ice_pf *pf) +{ + struct ice_vsi *pf_vsi; + u16 val; + int err; + + /* Device not in Safe Mode, so bail out here */ + if (!ice_is_safe_mode(pf)) + return 0; + + pf_vsi = ice_get_main_vsi(pf); + if (!pf_vsi) + return -EINVAL; + + /* only one queue pair in safe mode */ + pf_vsi->req_txq = 1; + pf_vsi->req_rxq = 1; + + /* remove RSS configuration */ + ice_rem_vsi_rss_list(&pf_vsi->back->hw, pf_vsi->idx); + + /* if the PF VSI was flow director enabled, disable it + * in the VSI context as we won't be doing flow director + * in safe mode. Not doing this causes the add VSI in + * ice_rebuild to fail. + */ + val = le16_to_cpu(pf_vsi->info.fd_options); + val &= ~(ICE_AQ_VSI_FD_ENABLE | ICE_AQ_VSI_FD_PROG_ENABLE); + pf_vsi->info.fd_options = cpu_to_le16(val); + if (test_bit(ICE_FLAG_SRIOV_ENA, pf->flags)) + ice_free_vfs(pf); + +#ifdef HAVE_NETDEV_SB_DEV + if (test_bit(ICE_FLAG_MACVLAN_ENA, pf->flags)) { + int v; + + ice_for_each_vsi(pf, v) { + struct ice_vsi *vsi = pf->vsi[v]; + + if (vsi && vsi->type == ICE_VSI_OFFLOAD_MACVLAN) + ice_deinit_macvlan(vsi); + } + } +#endif /* HAVE_NETDEV_SB_DEV */ + + ice_set_safe_mode_vlan_cfg(pf); + + /* Need to update netdev features, netdev ops and ethtool ops + * for safe mode, so free the PF netdev and setup a new one + */ + unregister_netdev(pf_vsi->netdev); + clear_bit(ICE_VSI_NETDEV_REGISTERED, pf_vsi->state); + free_netdev(pf_vsi->netdev); + clear_bit(ICE_VSI_NETDEV_ALLOCD, pf_vsi->state); + pf_vsi->netdev = NULL; + + ice_set_safe_mode_caps(&pf->hw); + ice_set_pf_caps(pf); + err = ice_cfg_netdev(pf_vsi); + if (err) { + dev_err(ice_pf_to_dev(pf), "could not allocate netdev, err %d\n", + err); + return err; + } + + return 0; +} + + /** * ice_verify_cacheline_size - verify driver's assumption of 64 Byte cache lines * @pf: pointer to the PF structure @@ -2677,8 +5328,7 @@ ice_load_pkg(const struct firmware *firmware, struct ice_pf *pf) static void ice_verify_cacheline_size(struct ice_pf *pf) { if (rd32(&pf->hw, GLPCI_CNF2) & GLPCI_CNF2_CACHELINE_SIZE_M) - dev_warn(&pf->pdev->dev, - "%d Byte cache line assumption is invalid, driver may have Tx timeouts!\n", + dev_warn(ice_pf_to_dev(pf), "%d Byte cache line assumption is invalid, driver may have Tx timeouts!\n", ICE_CACHE_LINE_BYTES); } @@ -2696,61 +5346,152 @@ static enum ice_status ice_send_version(struct ice_pf *pf) dv.minor_ver = DRV_VERSION_MINOR; dv.build_ver = DRV_VERSION_BUILD; dv.subbuild_ver = 0; - strscpy((char *)dv.driver_string, DRV_VERSION, + strscpy((char *)dv.driver_string, DRV_VERSION DRV_VERSION_EXTRA, sizeof(dv.driver_string)); return ice_aq_send_driver_ver(&pf->hw, &dv, NULL); } /** - * ice_get_opt_fw_name - return optional firmware file name or NULL - * @pf: pointer to the PF instance + * ice_init_acl - Initializes the ACL block + * @pf: ptr to PF device + * + * returns 0 on success, negative on error */ -static char *ice_get_opt_fw_name(struct ice_pf *pf) +int ice_init_acl(struct ice_pf *pf) { - /* Optional firmware name same as default with additional dash - * followed by a EUI-64 identifier (PCIe Device Serial Number) + struct ice_acl_tbl_params params; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + int divider; + u16 scen_id; + + /* Creates a single ACL table that consist of src_ip(4 byte), + * dest_ip(4 byte), src_port(2 byte) and dst_port(2 byte) for a total + * of 12 bytes (96 bits), hence 120 bit wide keys, i.e. 3 TCAM slices. 
+ * If the given hardware card contains less than 8 PFs (ports) then + * each PF will have its own TCAM slices. For 8 PFs, a given slice will + * be shared by 2 different PFs. */ - struct pci_dev *pdev = pf->pdev; - char *opt_fw_filename = NULL; - u32 dword; - u8 dsn[8]; - int pos; + if (hw->dev_caps.num_funcs < 8) + divider = ICE_ACL_ENTIRE_SLICE; + else + divider = ICE_ACL_HALF_SLICE; - /* Determine the name of the optional file using the DSN (two - * dwords following the start of the DSN Capability). - */ - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DSN); - if (pos) { - opt_fw_filename = kzalloc(NAME_MAX, GFP_KERNEL); - if (!opt_fw_filename) - return NULL; + memset(¶ms, 0, sizeof(params)); + params.width = ICE_AQC_ACL_KEY_WIDTH_BYTES * 3; + params.depth = ICE_AQC_ACL_TCAM_DEPTH / divider; + params.entry_act_pairs = 1; + params.concurr = false; - pci_read_config_dword(pdev, pos + 4, &dword); - put_unaligned_le32(dword, &dsn[0]); - pci_read_config_dword(pdev, pos + 8, &dword); - put_unaligned_le32(dword, &dsn[4]); - snprintf(opt_fw_filename, NAME_MAX, - "%sice-%02x%02x%02x%02x%02x%02x%02x%02x.pkg", - ICE_DDP_PKG_PATH, - dsn[7], dsn[6], dsn[5], dsn[4], - dsn[3], dsn[2], dsn[1], dsn[0]); - } - return opt_fw_filename; + status = ice_acl_create_tbl(hw, ¶ms); + if (status) + return ice_status_to_errno(status); + + return ice_status_to_errno(ice_acl_create_scen(hw, params.width, + params.depth, &scen_id)); } /** - * ice_request_fw - Device initialization routine + * ice_deinit_acl - Unroll the initialization of the ACL block + * @pf: ptr to PF device + */ +static void ice_deinit_acl(struct ice_pf *pf) +{ + ice_acl_destroy_tbl(&pf->hw); +} + +/** + * ice_init_fdir - Initialize flow director VSI and configuration * @pf: pointer to the PF instance + * + * returns 0 on success, negative on error */ -static void ice_request_fw(struct ice_pf *pf) +static int ice_init_fdir(struct ice_pf *pf) { - char *opt_fw_filename = ice_get_opt_fw_name(pf); - const struct firmware *firmware = NULL; - struct device *dev = &pf->pdev->dev; - int err = 0; + struct device *dev = ice_pf_to_dev(pf); + struct ice_vsi *ctrl_vsi; + int err; - /* optional device-specific DDP (if present) overrides the default DDP + /* Side Band Flow Director needs to have a control VSI. + * Allocate it and store it in the PF. + */ + ctrl_vsi = ice_ctrl_vsi_setup(pf, pf->hw.port_info); + if (!ctrl_vsi) { + dev_dbg(dev, "could not create control VSI\n"); + return -ENOMEM; + } + + err = ice_vsi_open_ctrl(ctrl_vsi); + if (err) { + dev_dbg(dev, "could not open control VSI\n"); + goto err_vsi_open; + } + + mutex_init(&pf->hw.fdir_fltr_lock); + + err = ice_fdir_create_dflt_rules(pf); + if (err) + goto err_fdir_rule; + + return 0; + +err_fdir_rule: + ice_fdir_release_flows(&pf->hw); + ice_vsi_close(ctrl_vsi); +err_vsi_open: + ice_vsi_release(ctrl_vsi); + if (pf->ctrl_vsi_idx != ICE_NO_VSI) { + pf->vsi[pf->ctrl_vsi_idx] = NULL; + pf->ctrl_vsi_idx = ICE_NO_VSI; + } + return err; +} + +/** + * ice_get_opt_fw_name - return optional firmware file name or NULL + * @pf: pointer to the PF instance + */ +static char *ice_get_opt_fw_name(struct ice_pf *pf) +{ + /* Optional firmware name same as default with additional dash + * followed by a EUI-64 identifier (PCIe Device Serial Number) + */ + struct pci_dev *pdev = pf->pdev; + char *opt_fw_filename; + u64 dsn; + + /* Determine the name of the optional file using the DSN (two + * dwords following the start of the DSN Capability). 
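/*
 * Worked example of the DSN-derived package name built just below: a PCIe
 * Device Serial Number of 0x0011223344556677, combined with the package
 * search path, yields "intel/ice/ddp/ice-0011223344556677.pkg". The path
 * string used here is only an assumed stand-in for ICE_DDP_PKG_PATH.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long dsn = 0x0011223344556677ULL;
	char name[128];

	snprintf(name, sizeof(name), "%sice-%016llx.pkg",
		 "intel/ice/ddp/", dsn);
	printf("%s\n", name);	/* intel/ice/ddp/ice-0011223344556677.pkg */
	return 0;
}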
+ */ + dsn = pci_get_dsn(pdev); + if (!dsn) + return NULL; + + opt_fw_filename = kzalloc(NAME_MAX, GFP_KERNEL); + if (!opt_fw_filename) + return NULL; + + snprintf(opt_fw_filename, NAME_MAX, "%sice-%016llx.pkg", + ICE_DDP_PKG_PATH, dsn); + + memcpy(pf->dcf.dsn, &dsn, sizeof(pf->dcf.dsn)); + + return opt_fw_filename; +} + +/** + * ice_request_fw - Device initialization routine + * @pf: pointer to the PF instance + */ +static void ice_request_fw(struct ice_pf *pf) +{ + char *opt_fw_filename = ice_get_opt_fw_name(pf); + const struct firmware *firmware = NULL; + struct device *dev = ice_pf_to_dev(pf); + int err = 0; + + /* optional device-specific DDP (if present) overrides the default DDP * package file. kernel logs a debug message if the file doesn't exist, * and warning messages for other errors. */ @@ -2771,8 +5512,7 @@ static void ice_request_fw(struct ice_pf *pf) dflt_pkg_load: err = request_firmware(&firmware, ICE_DDP_PKG_FILE, dev); if (err) { - dev_err(dev, - "The DDP package file was not found or could not be read. Entering Safe Mode\n"); + dev_err(dev, "The DDP package file was not found or could not be read. Entering Safe Mode\n"); return; } @@ -2781,6 +5521,240 @@ static void ice_request_fw(struct ice_pf *pf) release_firmware(firmware); } +/** + * ice_verify_eeprom - make sure eeprom is good to use + * @pf: board private structure + */ +static void ice_verify_eeprom(struct ice_pf *pf) +{ + int err; + + err = ice_nvm_validate_checksum(&pf->hw); + if (err) { + set_bit(ICE_BAD_EEPROM, pf->state); + dev_err(ice_pf_to_dev(pf), "Bad EEPROM checksum detected, err %d, please update your NVM.\n", + err); + } else { + clear_bit(ICE_BAD_EEPROM, pf->state); + } +} + +/* + * ice_print_wake_reason - show the wake up cause in the log + * @pf: pointer to the PF struct + */ +static void ice_print_wake_reason(struct ice_pf *pf) +{ + u32 wus = pf->wakeup_reason; + const char *wake_str; + + /* if no wake event, nothing to print */ + if (!wus) + return; + + if (wus & PFPM_WUS_LNKC_M) + wake_str = "Link\n"; + else if (wus & PFPM_WUS_MAG_M) + wake_str = "Magic Packet\n"; + else if (wus & PFPM_WUS_MNG_M) + wake_str = "Management\n"; + else if (wus & PFPM_WUS_FW_RST_WK_M) + wake_str = "Firmware Reset\n"; + else + wake_str = "Unknown\n"; + + dev_info(ice_pf_to_dev(pf), "Wake reason: %s", wake_str); +} + +/* + * ice_config_health_events - Enable or disable FW health event reporting + * @pf: pointer to the PF struct + * @enable: whether to enable or disable the events + */ +static void +ice_config_health_events(struct ice_pf *pf, bool enable) +{ + enum ice_status ret; + u8 enable_bits = 0; + + if (!ice_is_fw_health_report_supported(&pf->hw)) + return; + + if (enable) + enable_bits = ICE_AQC_HEALTH_STATUS_SET_PF_SPECIFIC_MASK | + ICE_AQC_HEALTH_STATUS_SET_GLOBAL_MASK; + + ret = ice_aq_set_health_status_config(&pf->hw, enable_bits, NULL); + + if (ret) + dev_err(ice_pf_to_dev(pf), "Failed to %sable firmware health events, err %s aq_err %s\n", + enable ? 
"en" : "dis", + ice_stat_str(ret), + ice_aq_str(pf->hw.adminq.sq_last_status)); +} + +/* + * ice_register_netdev - register netdev and devlink port + * @pf: pointer to the PF struct + */ +static int ice_register_netdev(struct ice_pf *pf) +{ + struct ice_vsi *vsi; + int err = 0; + vsi = ice_get_main_vsi(pf); + if (!vsi || !vsi->netdev) + return -EIO; + + err = register_netdev(vsi->netdev); + if (err) + goto err_register_netdev; + + set_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state); + netif_carrier_off(vsi->netdev); + netif_tx_stop_all_queues(vsi->netdev); +#if IS_ENABLED(CONFIG_NET_DEVLINK) + err = ice_devlink_create_pf_port(pf); + if (err) + goto err_devlink_create; + + devlink_port_type_eth_set(&pf->devlink_port, vsi->netdev); +#endif /* CONFIG_NET_DEVLINK */ + + return 0; +#if IS_ENABLED(CONFIG_NET_DEVLINK) +err_devlink_create: + unregister_netdev(vsi->netdev); + clear_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state); +#endif /* CONFIG_NET_DEVLINK */ +err_register_netdev: + free_netdev(vsi->netdev); + vsi->netdev = NULL; + clear_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state); + return err; +} + + +/** + * ice_pf_fwlog_is_input_valid - validate user input level/events + * @pf: pointer to the PF struct + * @user_input: input parameters to validate + */ +static bool +ice_pf_fwlog_is_input_valid(struct ice_pf *pf, + struct ice_fwlog_user_input *user_input) +{ + unsigned long events = user_input->events; + u8 log_level = user_input->log_level; + + if (log_level >= ICE_FWLOG_LEVEL_INVALID) { + dev_err(ice_pf_to_dev(pf), "Invalid FW log level %u, all level(s) >= %u are invalid\n", + log_level, ICE_FWLOG_LEVEL_INVALID); + return false; + } + + if (events >= BIT(ICE_AQC_FW_LOG_ID_MAX)) { + dev_err(ice_pf_to_dev(pf), "Invalid FW log events 0x%lx, all FW log event bits >= 0x%lx are invalid\n", + events, BIT(ICE_AQC_FW_LOG_ID_MAX)); + return false; + } + + + return true; +} + + +/** + * ice_pf_fwlog_populate_cfg - populate FW log configuration + * @cfg: configuration to populate + * @user_input: input parameters to validate + * + * For each set event, set the @cfg's log_level to the @log_level. For all + * cleared events set ICE_FWLOG_LEVEL_NONE. + */ +static void +ice_pf_fwlog_populate_cfg(struct ice_fwlog_cfg *cfg, + struct ice_fwlog_user_input *user_input) +{ + u16 module_id; + +#define ICE_FWLOG_DFLT_LOG_RESOLUTION 10 + cfg->log_resolution = ICE_FWLOG_DFLT_LOG_RESOLUTION; + cfg->options = ICE_FWLOG_OPTION_ARQ_ENA; + + for (module_id = 0; module_id < ICE_AQC_FW_LOG_ID_MAX; module_id++) { + struct ice_fwlog_module_entry *entry = + &cfg->module_entries[module_id]; + + entry->module_id = module_id; + if (test_bit(module_id, &user_input->events)) + entry->log_level = user_input->log_level; + else + entry->log_level = ICE_FWLOG_LEVEL_NONE; + } +} + +/** + * ice_pf_fwlog_set - set FW logging configuration + * @pf: pointer to the PF struct + * @user_input: input parameters to validate + * + * After calling this function the @events at the specified @log_level will be + * enabled. However, the PF must still register for FW logging events if it has + * not yet done so. Otherwise no events will be received. + */ +static int +ice_pf_fwlog_set(struct ice_pf *pf, struct ice_fwlog_user_input *user_input) +{ + struct ice_fwlog_cfg cfg = {}; + enum ice_status status; + + if (!ice_pf_fwlog_is_input_valid(pf, user_input)) + return -EINVAL; + + ice_pf_fwlog_populate_cfg(&cfg, user_input); + + status = ice_fwlog_set(&pf->hw, &cfg); + if (status) { + dev_err(ice_pf_to_dev(pf), "Failed to set FW log configuration. 
fwlog_events: 0x%lx fwlog_level: %u\n", + user_input->events, user_input->log_level); + return ice_status_to_errno(status); + } + + return 0; +} + +/** + * ice_pf_fwlog_init - initialize FW logging configuration on device init + * @pf: pointer to the PF struct + * @user_input: input parameters to validate + * + * This should always be called before ice_hw_init() as this will enable FW + * logging @events at the specified @log_level to be enabled/registered as soon + * as the driver can communicate with FW. + */ +static int +ice_pf_fwlog_init(struct ice_pf *pf, struct ice_fwlog_user_input *user_input) +{ + struct ice_fwlog_cfg cfg = {}; + enum ice_status status; + + if (!ice_pf_fwlog_is_input_valid(pf, user_input)) + return -EINVAL; + + ice_pf_fwlog_populate_cfg(&cfg, user_input); + if (hweight_long(user_input->events)) + cfg.options |= ICE_FWLOG_OPTION_REGISTER_ON_INIT; + + status = ice_fwlog_init(&pf->hw, &cfg); + if (status) { + dev_err(ice_pf_to_dev(pf), "Failed to init FW log configuration. fwlog_events: 0x%lx fwlog_level: %u\n", + user_input->events, user_input->log_level); + return ice_status_to_errno(status); + } + + return 0; +} + /** * ice_probe - Device initialization routine * @pdev: PCI device information struct @@ -2791,23 +5765,27 @@ static void ice_request_fw(struct ice_pf *pf) static int ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) { + struct ice_fwlog_user_input user_input = { 0 }; struct device *dev = &pdev->dev; struct ice_pf *pf; struct ice_hw *hw; int err; - /* this driver uses devres, see Documentation/driver-api/driver-model/devres.rst */ + /* this driver uses devres, see + * Documentation/driver-api/driver-model/devres.rst + */ err = pcim_enable_device(pdev); if (err) return err; - err = pcim_iomap_regions(pdev, BIT(ICE_BAR0), pci_name(pdev)); + err = pcim_iomap_regions(pdev, BIT(ICE_BAR0), dev_driver_string(dev)); if (err) { dev_err(dev, "BAR0 I/O map error %d\n", err); return err; } - pf = devm_kzalloc(dev, sizeof(*pf), GFP_KERNEL); + + pf = ice_allocate_pf(dev); if (!pf) return -ENOMEM; @@ -2825,12 +5803,14 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) pf->pdev = pdev; pci_set_drvdata(pdev, pf); - set_bit(__ICE_DOWN, pf->state); + set_bit(ICE_DOWN, pf->state); /* Disable service task until DOWN bit is cleared */ - set_bit(__ICE_SERVICE_DIS, pf->state); + set_bit(ICE_SERVICE_DIS, pf->state); hw = &pf->hw; hw->hw_addr = pcim_iomap_table(pdev)[ICE_BAR0]; + pci_save_state(pdev); + hw->back = pf; hw->vendor_id = pdev->vendor; hw->device_id = pdev->device; @@ -2843,11 +5823,36 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) pf->msg_enable = netif_msg_init(debug, ICE_DFLT_NETIF_M); + err = ice_devlink_register(pf); + if (err) { + dev_err(dev, "ice_devlink_register failed: %d\n", err); + goto err_exit_unroll; + } + #ifndef CONFIG_DYNAMIC_DEBUG if (debug < -1) hw->debug_mask = debug; #endif + /* check if device FW is in recovery mode */ + if (ice_get_fw_mode(hw) == ICE_FW_MODE_REC) { + err = ice_probe_recovery_mode(pf); + if (err) + goto err_rec_mode; + + return 0; + } + + ice_debugfs_pf_init(pf); + + user_input.log_level = fwlog_level; + user_input.events = fwlog_events; + if (ice_pf_fwlog_init(pf, &user_input)) { + dev_err(dev, "failed to initialize FW logging: %d\n", err); + err = -EIO; + goto err_exit_unroll; + } + err = ice_init_hw(hw); if (err) { dev_err(dev, "ice_init_hw failed: %d\n", err); @@ -2855,10 +5860,7 @@ ice_probe(struct pci_dev *pdev, const 
struct pci_device_id __always_unused *ent) goto err_exit_unroll; } - dev_info(dev, "firmware %d.%d.%d api %d.%d.%d nvm %s build 0x%08x\n", - hw->fw_maj_ver, hw->fw_min_ver, hw->fw_patch, - hw->api_maj_ver, hw->api_min_ver, hw->api_patch, - ice_nvm_version_str(hw), hw->fw_build); + ice_init_feature_support(pf); ice_request_fw(pf); @@ -2867,8 +5869,6 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) * true */ if (ice_is_safe_mode(pf)) { - dev_err(dev, - "Package download failed. Advanced features disabled - Device now in Safe Mode\n"); /* we already got function/device capabilities but these don't * reflect what the driver needs to do in safe mode. Instead of * adding conditional logic everywhere to ignore these @@ -2877,11 +5877,36 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) ice_set_safe_mode_caps(hw); } + ice_set_umac_shared(hw); + + err = ice_init_pf(pf); if (err) { dev_err(dev, "ice_init_pf failed: %d\n", err); goto err_init_pf_unroll; } + ice_verify_eeprom(pf); +#ifndef ETHTOOL_GFECPARAM + switch (pf->hw.port_info->phy.link_info.fec_info) { + case (ICE_AQ_LINK_25G_RS_528_FEC_EN | ICE_AQ_LINK_25G_KR_FEC_EN): + case (ICE_AQ_LINK_25G_RS_544_FEC_EN | ICE_AQ_LINK_25G_KR_FEC_EN): + set_bit(ICE_FLAG_RS_FEC, pf->flags); + set_bit(ICE_FLAG_BASE_R_FEC, pf->flags); + break; + case ICE_AQ_LINK_25G_RS_528_FEC_EN: + case ICE_AQ_LINK_25G_RS_544_FEC_EN: + set_bit(ICE_FLAG_RS_FEC, pf->flags); + break; + case ICE_AQ_LINK_25G_KR_FEC_EN: + set_bit(ICE_FLAG_BASE_R_FEC, pf->flags); + break; + default: + break; + } +#endif /* ETHTOOL_GFECPARAM */ + + ice_devlink_init_regions(pf); + ice_devlink_params_publish(pf); pf->num_alloc_vsi = hw->func_caps.guar_num_vsi; if (!pf->num_alloc_vsi) { @@ -2903,9 +5928,6 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) goto err_init_vsi_unroll; } - /* Driver is mostly up */ - clear_bit(__ICE_DOWN, pf->state); - /* In case of MSIX we are going to setup the misc vector right here * to handle admin queue events etc. In case of legacy and MSI * the misc functionality and queue processing is combined in @@ -2917,6 +5939,7 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) goto err_init_interrupt_unroll; } + /* create switch struct for the switch element created by FW on boot */ pf->first_sw = devm_kzalloc(dev, sizeof(*pf->first_sw), GFP_KERNEL); if (!pf->first_sw) { @@ -2936,19 +5959,21 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) err = ice_setup_pf_sw(pf); if (err) { - dev_err(dev, "probe failed due to setup PF switch:%d\n", err); + dev_err(dev, "probe failed due to setup PF switch: %d\n", err); goto err_alloc_sw_unroll; } - clear_bit(__ICE_SERVICE_DIS, pf->state); + clear_bit(ICE_SERVICE_DIS, pf->state); + + /* by default, set the PF level feature flags to be ON */ + set_bit(ICE_FLAG_CHNL_PKT_INSPECT_OPT_ENA, pf->flags); /* tell the firmware we are up */ err = ice_send_version(pf); if (err) { - dev_err(dev, - "probe failed sending driver version %s. error: %d\n", + dev_err(dev, "probe failed sending driver version %s. 
error: %d\n", ice_drv_ver, err); - goto err_alloc_sw_unroll; + goto err_send_version_unroll; } /* since everything is good, start the service timer */ @@ -2957,17 +5982,100 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) err = ice_init_link_events(pf->hw.port_info); if (err) { dev_err(dev, "ice_init_link_events failed: %d\n", err); - goto err_alloc_sw_unroll; + goto err_send_version_unroll; + } + + /* not a fatal error if this fails */ + err = ice_init_nvm_phy_type(pf->hw.port_info); + if (err) + dev_err(dev, "ice_init_nvm_phy_type failed: %d\n", err); + + ice_init_link_dflt_override(pf->hw.port_info); + + /* not a fatal error if this fails */ + err = ice_update_link_info(pf->hw.port_info); + if (err) + dev_err(dev, "ice_update_link_info failed: %d\n", err); + + ice_check_module_power(pf, pf->hw.port_info->phy.link_info.link_cfg_err); + + /* if media available, initialize PHY settings */ + if (pf->hw.port_info->phy.link_info.link_info & + ICE_AQ_MEDIA_AVAILABLE) { + /* not a fatal error if this fails */ + err = ice_init_phy_user_cfg(pf->hw.port_info); + if (err) + dev_err(dev, "ice_init_phy_user_cfg failed: %d\n", err); + + if (!test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, pf->flags)) { + struct ice_vsi *vsi = ice_get_main_vsi(pf); + + if (vsi) + ice_configure_phy(vsi); + } + } else { + set_bit(ICE_FLAG_NO_MEDIA, pf->flags); } ice_verify_cacheline_size(pf); - /* If no DDP driven features have to be setup, return here */ - if (ice_is_safe_mode(pf)) - return 0; + /* Save wakeup reason register for later use */ + pf->wakeup_reason = rd32(hw, PFPM_WUS); + + /* check for a power management event */ + ice_print_wake_reason(pf); + + /* clear wake status, all bits */ + wr32(hw, PFPM_WUS, U32_MAX); + + /* Disable WoL at init, wait for user to enable */ + device_set_wakeup_enable(dev, false); + + /* init peers only if supported */ + if (ice_is_peer_ena(pf)) { + pf->peers = devm_kcalloc(dev, ICE_MAX_NUM_PEERS, + sizeof(*pf->peers), GFP_KERNEL); + if (!pf->peers) { + err = -ENOMEM; + goto err_init_peer_unroll; + } + + err = ice_init_peer_devices(pf); + if (err) { + dev_err(dev, "Failed to initialize peer_objs: 0x%x\n", + err); + err = -EIO; + goto err_init_peer_unroll; + } + } else { + dev_warn(dev, "RDMA is not supported on this device\n"); + } + + if (ice_is_safe_mode(pf)) { + ice_set_safe_mode_vlan_cfg(pf); + goto probe_done; + } /* initialize DDP driven features */ + if (test_bit(ICE_FLAG_PTP_ENA, pf->flags)) + ice_ptp_init(pf); + + /* Note: Flow director init failure is non-fatal to load */ + if (ice_init_fdir(pf)) + dev_err(dev, "could not initialize flow director\n"); + + /* set DCF ACL enable flag as false by default */ + hw->dcf_caps &= ~DCF_ACL_CAP; + if (test_bit(ICE_FLAG_FD_ENA, pf->flags)) { + /* Note: ACL init failure is non-fatal to load */ + err = ice_init_acl(pf); + if (err) + dev_err(&pf->pdev->dev, + "Failed to initialize ACL: %d\n", err); + } + /* set DCF UDP tunnel enable flag as false by default */ + hw->dcf_caps &= ~DCF_UDP_TUNNEL_CAP; /* Note: DCB init failure is non-fatal to load */ if (ice_init_pf_dcb(pf, false)) { clear_bit(ICE_FLAG_DCB_CAPABLE, pf->flags); @@ -2976,12 +6084,42 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) ice_cfg_lldp_mib_change(&pf->hw, true); } +#ifdef HAVE_NETDEV_UPPER_INFO + if (ice_init_lag(pf)) + dev_warn(dev, "Failed to init link aggregation support\n"); + +#endif /* HAVE_NETDEV_UPPER_INFO */ + /* print PCI link speed and width */ + pcie_print_link_status(pf->pdev); + +probe_done: 
+ err = ice_register_netdev(pf); + if (err) + goto err_netdev_reg; + + ice_config_health_events(pf, true); + + /* ready to go, so clear down state bit */ + clear_bit(ICE_DOWN, pf->state); + return 0; + /* Unwind non-managed device resources, etc. if something failed */ +err_netdev_reg: +err_init_peer_unroll: + if (ice_is_peer_ena(pf)) { + ice_for_each_peer(pf, NULL, ice_unroll_peer); + if (pf->peers) { + devm_kfree(dev, pf->peers); + pf->peers = NULL; + } + } +err_send_version_unroll: + ice_vsi_release_all(pf); err_alloc_sw_unroll: - set_bit(__ICE_SERVICE_DIS, pf->state); - set_bit(__ICE_DOWN, pf->state); - devm_kfree(&pf->pdev->dev, pf->first_sw); + set_bit(ICE_SERVICE_DIS, pf->state); + set_bit(ICE_DOWN, pf->state); + devm_kfree(dev, pf->first_sw); err_msix_misc_unroll: ice_free_irq_msix_misc(pf); err_init_interrupt_unroll: @@ -2990,52 +6128,412 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) devm_kfree(dev, pf->vsi); err_init_pf_unroll: ice_deinit_pf(pf); + ice_devlink_params_unpublish(pf); + ice_devlink_destroy_regions(pf); ice_deinit_hw(hw); err_exit_unroll: + ice_debugfs_pf_exit(pf); +err_rec_mode: + ice_devlink_unregister(pf); pci_disable_pcie_error_reporting(pdev); + pci_disable_device(pdev); return err; } + /** - * ice_remove - Device removal routine - * @pdev: PCI device information struct + * ice_set_wake - enable or disable Wake on LAN + * @pf: pointer to the PF struct + * + * Simple helper for WoL control */ -static void ice_remove(struct pci_dev *pdev) +static void ice_set_wake(struct ice_pf *pf) { - struct ice_pf *pf = pci_get_drvdata(pdev); - int i; + struct ice_hw *hw = &pf->hw; + bool wol = pf->wol_ena; - if (!pf) - return; + /* clear wake state, otherwise new wake events won't fire */ + wr32(hw, PFPM_WUS, U32_MAX); - for (i = 0; i < ICE_MAX_RESET_WAIT; i++) { - if (!ice_is_reset_in_progress(pf->state)) - break; - msleep(100); - } + /* enable / disable APM wake up, no RMW needed */ + wr32(hw, PFPM_APM, wol ? PFPM_APM_APME_M : 0); - set_bit(__ICE_DOWN, pf->state); - ice_service_task_stop(pf); + /* set magic packet filter enabled */ + wr32(hw, PFPM_WUFC, wol ? PFPM_WUFC_MAG_M : 0); +} - if (test_bit(ICE_FLAG_SRIOV_ENA, pf->flags)) - ice_free_vfs(pf); - ice_vsi_release_all(pf); - ice_free_irq_msix_misc(pf); - ice_for_each_vsi(pf, i) { +/** + * ice_setup_mc_magic_wake - setup device to wake on multicast magic packet + * @pf: pointer to the PF struct + * + * Issue firmware command to enable multicast magic wake, making + * sure that any locally administered address (LAA) is used for + * wake, and that PF reset doesn't undo the LAA. 
+ */ +static void ice_setup_mc_magic_wake(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u8 mac_addr[ETH_ALEN]; + struct ice_vsi *vsi; + u8 flags; + + if (!pf->wol_ena) + return; + + vsi = ice_get_main_vsi(pf); + if (!vsi) + return; + + /* Get current MAC address in case it's an LAA */ + if (vsi->netdev) + ether_addr_copy(mac_addr, vsi->netdev->dev_addr); + else + ether_addr_copy(mac_addr, vsi->port_info->mac.perm_addr); + + flags = ICE_AQC_MAN_MAC_WR_MC_MAG_EN | + ICE_AQC_MAN_MAC_UPDATE_LAA_WOL | + ICE_AQC_MAN_MAC_WR_WOL_LAA_PFR_KEEP; + + status = ice_aq_manage_mac_write(hw, mac_addr, flags, NULL); + if (status) + dev_err(dev, "Failed to enable Multicast Magic Packet wake, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); +} + +/** + * ice_remove - Device removal routine + * @pdev: PCI device information struct + */ +static void ice_remove(struct pci_dev *pdev) +{ + struct ice_pf *pf = pci_get_drvdata(pdev); + int i; + + if (!pf) + return; + + /* ICE_PREPPED_RECOVERY_MODE is set when the up and running + * driver transitions to recovery mode. If this is not set + * it means that the driver went into recovery mode on load. + * For the former case, go through the usual flow for module + * unload. For the latter, call ice_remove_recovery_mode + * and return. + */ + if (!test_bit(ICE_PREPPED_RECOVERY_MODE, pf->state) && + test_bit(ICE_RECOVERY_MODE, pf->state)) { + ice_remove_recovery_mode(pf); + return; + } + + for (i = 0; i < ICE_MAX_RESET_WAIT; i++) { + if (!ice_is_reset_in_progress(pf->state)) + break; + msleep(100); + } +#ifdef HAVE_TC_INDIR_BLOCK + /* clear indirect block notification before cleaning up of ADQ + * resources + */ + ice_tc_indir_block_remove(pf); +#endif /* HAVE_TC_INDIR_BLOCK */ + + + if (test_bit(ICE_FLAG_SRIOV_ENA, pf->flags)) { + set_bit(ICE_VF_RESETS_DISABLED, pf->state); + ice_free_vfs(pf); + } + + ice_service_task_stop(pf); + + ice_aq_cancel_waiting_tasks(pf); + + if (ice_is_peer_ena(pf)) { + enum ice_close_reason reason; + + reason = ICE_REASON_INTERFACE_DOWN; + ice_for_each_peer(pf, &reason, ice_peer_close); + } + set_bit(ICE_DOWN, pf->state); + + if (test_bit(ICE_FLAG_FD_ENA, pf->flags)) + ice_deinit_acl(pf); + mutex_destroy(&(&pf->hw)->fdir_fltr_lock); +#ifdef HAVE_NETDEV_UPPER_INFO + ice_deinit_lag(pf); +#endif /* HAVE_NETDEV_UPPER_INFO */ + if (test_bit(ICE_FLAG_PTP_ENA, pf->flags)) + ice_ptp_release(pf); + if (!ice_is_safe_mode(pf)) + ice_remove_arfs(pf); + ice_setup_mc_magic_wake(pf); + ice_vsi_release_all(pf); + if (ice_is_peer_ena(pf)) { +#if IS_ENABLED(CONFIG_MFD_CORE) + ida_simple_remove(&ice_peer_index_ida, pf->peer_idx); +#endif + ice_for_each_peer(pf, NULL, ice_unreg_peer_obj); + devm_kfree(&pdev->dev, pf->peers); + } + ice_set_wake(pf); + ice_free_irq_msix_misc(pf); + ice_for_each_vsi(pf, i) { if (!pf->vsi[i]) continue; ice_vsi_free_q_vectors(pf->vsi[i]); } + ice_deinit_pf(pf); + ice_devlink_params_unpublish(pf); + ice_devlink_destroy_regions(pf); + if (ice_fwlog_unregister(&pf->hw)) + dev_dbg(&pdev->dev, "failed to unregister from FW logging\n"); ice_deinit_hw(&pf->hw); - ice_clear_interrupt_scheme(pf); + ice_devlink_unregister(pf); + ice_debugfs_pf_exit(pf); /* Issue a PFR as part of the prescribed driver unload flow. Do not * do it via ice_schedule_reset() since there is no need to rebuild * and the service task is already stopped. 
*/ ice_reset(&pf->hw, ICE_RESET_PFR); + pci_wait_for_pending_transaction(pdev); + ice_clear_interrupt_scheme(pf); pci_disable_pcie_error_reporting(pdev); + pci_disable_device(pdev); +} + +/** + * ice_shutdown - PCI callback for shutting down device + * @pdev: PCI device information struct + */ +static void ice_shutdown(struct pci_dev *pdev) +{ + struct ice_pf *pf = pci_get_drvdata(pdev); + + ice_remove(pdev); + + if (system_state == SYSTEM_POWER_OFF) { + pci_wake_from_d3(pdev, pf->wol_ena); + pci_set_power_state(pdev, PCI_D3hot); + } +} + +#ifdef CONFIG_PM +/** + * ice_prepare_for_shutdown - prep for PCI shutdown + * @pf: board private structure + * + * Inform or close all dependent features in prep for PCI device shutdown + */ +static void ice_prepare_for_shutdown(struct ice_pf *pf) +{ + struct ice_hw *hw = &pf->hw; + u32 v; + + /* Notify VFs of impending reset */ + if (ice_check_sq_alive(hw, &hw->mailboxq)) + ice_vc_notify_reset(pf); + + dev_dbg(ice_pf_to_dev(pf), "Tearing down internal switch for shutdown\n"); + + /* disable the VSIs and their queues that are not already DOWN */ + ice_pf_dis_all_vsi(pf, false); + + ice_for_each_vsi(pf, v) + if (pf->vsi[v]) + pf->vsi[v]->vsi_num = 0; + + ice_shutdown_all_ctrlq(hw); +} + +/** + * ice_reinit_interrupt_scheme - Reinitialize interrupt scheme + * @pf: board private structure to reinitialize + * + * This routine reinitialize interrupt scheme that was cleared during + * power management suspend callback. + * + * This should be called during resume routine to re-allocate the q_vectors + * and reacquire interrupts. + */ +static int ice_reinit_interrupt_scheme(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + int ret, v; + + /* Since we clear MSIX flag during suspend, we need to + * set it back during resume... + */ + + ret = ice_init_interrupt_scheme(pf); + if (ret) { + dev_err(dev, "Failed to re-initialize interrupt %d\n", ret); + return ret; + } + + /* Remap vectors and rings, after successful re-init interrupts */ + ice_for_each_vsi(pf, v) { + if (!pf->vsi[v]) + continue; + + ret = ice_vsi_alloc_q_vectors(pf->vsi[v]); + if (ret) + goto err_reinit; + ice_vsi_map_rings_to_vectors(pf->vsi[v]); + } + + ret = ice_req_irq_msix_misc(pf); + if (ret) { + dev_err(dev, "Setting up misc vector failed after device suspend %d\n", + ret); + goto err_reinit; + } + + return 0; + +err_reinit: + while (v--) + if (pf->vsi[v]) + ice_vsi_free_q_vectors(pf->vsi[v]); + + return ret; +} + +/** + * ice_suspend + * @dev: generic device information structure + * + * Power Management callback to quiesce the device and prepare + * for D3 transition. + */ +static int __maybe_unused ice_suspend(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct ice_pf *pf; + int disabled, v; + + pf = pci_get_drvdata(pdev); + + if (!ice_pf_state_is_nominal(pf)) { + dev_err(dev, "Device is not ready, no need to suspend it\n"); + return -EBUSY; + } + + /* Stop watchdog tasks until resume completion. + * Even though it is most likely that the service task is + * disabled if the device is suspended or down, the service task's + * state is controlled by a different state bit, and we should + * store and honor whatever state that bit is in at this point. 
+ */ + disabled = ice_service_task_stop(pf); + + if (ice_is_peer_ena(pf)) { + enum ice_close_reason reason; + + reason = ICE_REASON_INTERFACE_DOWN; + ice_for_each_peer(pf, &reason, ice_peer_close); + } + + /* Already suspended?, then there is nothing to do */ + if (test_and_set_bit(ICE_SUSPENDED, pf->state)) { + if (!disabled) + ice_service_task_restart(pf); + return 0; + } + + if (test_bit(ICE_DOWN, pf->state) || + ice_is_reset_in_progress(pf->state)) { + dev_err(dev, "can't suspend device in reset or already down\n"); + if (!disabled) + ice_service_task_restart(pf); + return 0; + } + + ice_setup_mc_magic_wake(pf); + + ice_prepare_for_shutdown(pf); + + ice_set_wake(pf); + + /* Free vectors, clear the interrupt scheme and release IRQs + * for proper hibernation, especially with large number of CPUs. + * Otherwise hibernation might fail when mapping all the vectors back + * to CPU0. + */ + ice_free_irq_msix_misc(pf); + ice_for_each_vsi(pf, v) { + if (!pf->vsi[v]) + continue; + ice_vsi_free_q_vectors(pf->vsi[v]); + } + ice_clear_interrupt_scheme(pf); + + pci_save_state(pdev); + pci_wake_from_d3(pdev, pf->wol_ena); + pci_set_power_state(pdev, PCI_D3hot); + return 0; +} + +/** + * ice_resume - PM callback for waking up from D3 + * @dev: generic device information structure + */ +static int __maybe_unused ice_resume(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + enum ice_reset_req reset_type; + struct ice_pf *pf; + struct ice_hw *hw; + int ret; + + pci_set_power_state(pdev, PCI_D0); + pci_restore_state(pdev); + pci_save_state(pdev); + + if (!pci_device_is_present(pdev)) + return -ENODEV; + + ret = pci_enable_device_mem(pdev); + if (ret) { + dev_err(dev, "Cannot enable device after suspend\n"); + return ret; + } + + pf = pci_get_drvdata(pdev); + hw = &pf->hw; + + pf->wakeup_reason = rd32(hw, PFPM_WUS); + ice_print_wake_reason(pf); + + /* We cleared the interrupt scheme when we suspended, so we need to + * restore it now to resume device functionality. + */ + ret = ice_reinit_interrupt_scheme(pf); + if (ret) + dev_err(dev, "Cannot restore interrupt scheme: %d\n", ret); + + ice_peer_refresh_msix(pf); + + clear_bit(ICE_DOWN, pf->state); + /* Now perform PF reset and rebuild */ + reset_type = ICE_RESET_PFR; + /* re-enable service task for reset, but allow reset to schedule it */ + clear_bit(ICE_SERVICE_DIS, pf->state); + + if (ice_schedule_reset(pf, reset_type)) + dev_err(dev, "Reset during resume failed.\n"); + + clear_bit(ICE_SUSPENDED, pf->state); + ice_service_task_restart(pf); + + /* Restart the service task */ + mod_timer(&pf->serv_tmr, round_jiffies(jiffies + pf->serv_tmr_period)); + + return 0; } +#endif /* CONFIG_PM */ /** * ice_pci_err_detected - warning that PCI error has been detected @@ -3046,7 +6544,7 @@ static void ice_remove(struct pci_dev *pdev) * is in progress. Allows the driver to gracefully prepare/handle PCI errors. 
*/ static pci_ers_result_t -ice_pci_err_detected(struct pci_dev *pdev, enum pci_channel_state err) +ice_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t err) { struct ice_pf *pf = pci_get_drvdata(pdev); @@ -3056,12 +6554,12 @@ ice_pci_err_detected(struct pci_dev *pdev, enum pci_channel_state err) return PCI_ERS_RESULT_DISCONNECT; } - if (!test_bit(__ICE_SUSPENDED, pf->state)) { + if (!test_bit(ICE_SUSPENDED, pf->state)) { ice_service_task_stop(pf); - if (!test_bit(__ICE_PREPARED_FOR_RESET, pf->state)) { - set_bit(__ICE_PFR_REQ, pf->state); - ice_prepare_for_reset(pf); + if (!test_bit(ICE_PREPARED_FOR_RESET, pf->state)) { + set_bit(ICE_PFR_REQ, pf->state); + ice_prepare_for_reset(pf, ICE_RESET_PFR); } } @@ -3084,8 +6582,7 @@ static pci_ers_result_t ice_pci_err_slot_reset(struct pci_dev *pdev) err = pci_enable_device_mem(pdev); if (err) { - dev_err(&pdev->dev, - "Cannot re-enable PCI device after reset, error %d\n", + dev_err(&pdev->dev, "Cannot re-enable PCI device after reset, error %d\n", err); result = PCI_ERS_RESULT_DISCONNECT; } else { @@ -3102,10 +6599,9 @@ static pci_ers_result_t ice_pci_err_slot_reset(struct pci_dev *pdev) result = PCI_ERS_RESULT_DISCONNECT; } - err = pci_cleanup_aer_uncorrect_error_status(pdev); + err = pci_aer_clear_nonfatal_status(pdev); if (err) - dev_dbg(&pdev->dev, - "pci_cleanup_aer_uncorrect_error_status failed, error %d\n", + dev_dbg(&pdev->dev, "pci_aer_clear_nonfatal_status failed, error %d\n", err); /* non-fatal, continue */ @@ -3124,22 +6620,25 @@ static void ice_pci_err_resume(struct pci_dev *pdev) struct ice_pf *pf = pci_get_drvdata(pdev); if (!pf) { - dev_err(&pdev->dev, - "%s failed, device is unrecoverable\n", __func__); + dev_err(&pdev->dev, "%s failed, device is unrecoverable\n", + __func__); return; } - if (test_bit(__ICE_SUSPENDED, pf->state)) { + if (test_bit(ICE_SUSPENDED, pf->state)) { dev_dbg(&pdev->dev, "%s failed to resume normal operations!\n", __func__); return; } + ice_restore_all_vfs_msi_state(pdev); + ice_do_reset(pf, ICE_RESET_PFR); ice_service_task_restart(pf); mod_timer(&pf->serv_tmr, round_jiffies(jiffies + pf->serv_tmr_period)); } +#if defined(HAVE_PCI_ERROR_HANDLER_RESET_PREPARE) || defined(HAVE_PCI_ERROR_HANDLER_RESET_NOTIFY) || defined(HAVE_RHEL7_PCI_RESET_NOTIFY) /** * ice_pci_err_reset_prepare - prepare device driver for PCI reset * @pdev: PCI device information struct @@ -3148,12 +6647,12 @@ static void ice_pci_err_reset_prepare(struct pci_dev *pdev) { struct ice_pf *pf = pci_get_drvdata(pdev); - if (!test_bit(__ICE_SUSPENDED, pf->state)) { + if (!test_bit(ICE_SUSPENDED, pf->state)) { ice_service_task_stop(pf); - if (!test_bit(__ICE_PREPARED_FOR_RESET, pf->state)) { - set_bit(__ICE_PFR_REQ, pf->state); - ice_prepare_for_reset(pf); + if (!test_bit(ICE_PREPARED_FOR_RESET, pf->state)) { + set_bit(ICE_PFR_REQ, pf->state); + ice_prepare_for_reset(pf, ICE_RESET_PFR); } } } @@ -3166,6 +6665,24 @@ static void ice_pci_err_reset_done(struct pci_dev *pdev) { ice_pci_err_resume(pdev); } +#endif /* HAVE_PCI_ERROR_HANDLER_RESET_PREPARE || HAVE_PCI_ERROR_HANDLER_RESET_NOTIFY || HAVE_RHEL7_PCI_RESET_NOTIFY */ + +#if defined(HAVE_PCI_ERROR_HANDLER_RESET_NOTIFY) || (defined(HAVE_RHEL7_PCI_RESET_NOTIFY) && defined(HAVE_RHEL7_PCI_DRIVER_RH)) +/** + * ice_pci_err_reset_notify - notify device driver of pci reset + * @pdev: PCI device information struct + * @prepare: whether or not to prepare for reset or reset is complete + * + * Called to perform PF reset when a PCI function level reset is triggered + */ +static void 
ice_pci_err_reset_notify(struct pci_dev *pdev, bool prepare) +{ + if (prepare) + ice_pci_err_reset_prepare(pdev); + else + ice_pci_err_reset_done(pdev); +} +#endif /* HAVE_PCI_ERROR_HANDLER_RESET_NOTIFY || (HAVE_RHEL7_PCI_RESET_NOTIFY && HAVE_RHEL7_PCI_DRIVER_RH) */ /* ice_pci_tbl - PCI Device ID Table * @@ -3179,25 +6696,73 @@ static const struct pci_device_id ice_pci_tbl[] = { { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810C_BACKPLANE), 0 }, { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810C_QSFP), 0 }, { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810C_SFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810_XXV_BACKPLANE), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810_XXV_QSFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810_XXV_SFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823C_BACKPLANE), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823C_QSFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823C_SFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823C_10G_BASE_T), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823C_SGMII), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822C_BACKPLANE), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822C_QSFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822C_SFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822C_10G_BASE_T), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822C_SGMII), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822L_BACKPLANE), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822L_SFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822L_10G_BASE_T), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822L_SGMII), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823L_BACKPLANE), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823L_SFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823L_10G_BASE_T), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823L_1GBE), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823L_QSFP), 0 }, /* required last entry */ { 0, } }; MODULE_DEVICE_TABLE(pci, ice_pci_tbl); +static __maybe_unused SIMPLE_DEV_PM_OPS(ice_pm_ops, ice_suspend, ice_resume); + +#ifdef HAVE_CONST_STRUCT_PCI_ERROR_HANDLERS static const struct pci_error_handlers ice_pci_err_handler = { +#else +static struct pci_error_handlers ice_pci_err_handler = { +#endif /* HAVE_CONST_STRUCT_PCI_ERROR_HANDLERS */ .error_detected = ice_pci_err_detected, .slot_reset = ice_pci_err_slot_reset, +#ifdef HAVE_PCI_ERROR_HANDLER_RESET_NOTIFY + .reset_notify = ice_pci_err_reset_notify, +#endif /* HAVE_PCI_ERROR_HANDLER_RESET_NOTIFY */ +#ifdef HAVE_PCI_ERROR_HANDLER_RESET_PREPARE .reset_prepare = ice_pci_err_reset_prepare, .reset_done = ice_pci_err_reset_done, +#endif /* HAVE_PCI_ERROR_HANDLER_RESET_PREPARE */ .resume = ice_pci_err_resume }; +#ifdef HAVE_RHEL7_PCI_DRIVER_RH +static struct pci_driver_rh ice_driver_rh = { +#ifdef HAVE_RHEL7_PCI_RESET_NOTIFY + .reset_notify = ice_pci_err_reset_notify, +#endif /* HAVE_RHEL7_PCI_RESET_NOTIFY */ +}; +#endif /* HAVE_RHEL7_PCI_DRIVER_RH */ + static struct pci_driver ice_driver = { .name = KBUILD_MODNAME, .id_table = ice_pci_tbl, .probe = ice_probe, .remove = ice_remove, +#ifdef CONFIG_PM + .driver.pm = &ice_pm_ops, +#endif /* CONFIG_PM */ + .shutdown = ice_shutdown, .sriov_configure = ice_sriov_configure, +#ifdef HAVE_RHEL7_PCI_DRIVER_RH + .pci_driver_rh = &ice_driver_rh, +#endif /* HAVE_RHEL7_PCI_DRIVER_RH */ .err_handler = &ice_pci_err_handler }; @@ -3214,16 +6779,31 @@ static int __init ice_module_init(void) pr_info("%s - version %s\n", ice_driver_string, ice_drv_ver); pr_info("%s\n", ice_copyright); - ice_wq = alloc_workqueue("%s", WQ_MEM_RECLAIM, 0, KBUILD_MODNAME); + + ice_wq = alloc_workqueue("%s", 0, 0, KBUILD_MODNAME); if (!ice_wq) { pr_err("Failed to create workqueue\n"); return 
-ENOMEM; } +#ifdef HAVE_RHEL7_PCI_DRIVER_RH + /* The size member must be initialized in the driver via a call to + * set_pci_driver_rh_size before pci_register_driver is called + */ + set_pci_driver_rh_size(ice_driver_rh); + +#endif /* HAVE_RHEL7_PCI_DRIVER_RH */ + + ice_debugfs_init(); + status = pci_register_driver(&ice_driver); if (status) { pr_err("failed to register PCI driver, err %d\n", status); destroy_workqueue(ice_wq); + ice_debugfs_exit(); +#if IS_ENABLED(CONFIG_MFD_CORE) + ida_destroy(&ice_peer_index_ida); +#endif } return status; @@ -3240,6 +6820,13 @@ static void __exit ice_module_exit(void) { pci_unregister_driver(&ice_driver); destroy_workqueue(ice_wq); + ice_debugfs_exit(); + /* release all cached layer within ida tree, associated with + * ice_peer_index_ida object + */ +#if IS_ENABLED(CONFIG_MFD_CORE) + ida_destroy(&ice_peer_index_ida); +#endif pr_info("module unloaded\n"); } module_exit(ice_module_exit); @@ -3273,32 +6860,46 @@ static int ice_set_mac_address(struct net_device *netdev, void *pi) return 0; } - if (test_bit(__ICE_DOWN, pf->state) || + if (test_bit(ICE_DOWN, pf->state) || ice_is_reset_in_progress(pf->state)) { netdev_err(netdev, "can't set mac %pM. device not ready\n", mac); return -EBUSY; } - /* When we change the MAC address we also have to change the MAC address - * based filter rules that were created previously for the old MAC - * address. So first, we remove the old filter rule using ice_remove_mac - * and then create a new filter rule using ice_add_mac via - * ice_vsi_cfg_mac_fltr function call for both add and/or remove - * filters. - */ - status = ice_vsi_cfg_mac_fltr(vsi, netdev->dev_addr, false); - if (status) { - err = -EADDRNOTAVAIL; - goto err_update_filters; +#ifdef HAVE_TC_SETUP_CLSFLOWER + if (ice_chnl_dmac_fltr_cnt(pf)) { + netdev_err(netdev, + "can't set mac %pM. Device has tc-flower filters, delete all of them and try again\n", + mac); + return -EAGAIN; } +#endif /* HAVE_TC_SETUP_CLSFLOWER */ - status = ice_vsi_cfg_mac_fltr(vsi, mac, true); - if (status) { + /* Clean up old MAC filter. Not an error if old filter doesn't exist */ + status = ice_fltr_remove_mac(vsi, netdev->dev_addr, ICE_FWD_TO_VSI); + if (status && status != ICE_ERR_DOES_NOT_EXIST) { err = -EADDRNOTAVAIL; goto err_update_filters; } + /* Add filter for new MAC. If filter exists, return success */ + status = ice_fltr_add_mac(vsi, mac, ICE_FWD_TO_VSI); + if (status == ICE_ERR_ALREADY_EXISTS) { + /* Although this MAC filter is already present in hardware it's + * possible in some cases (e.g. bonding) that dev_addr was + * modified outside of the driver and needs to be restored back + * to this value. + */ + memcpy(netdev->dev_addr, mac, netdev->addr_len); + netdev_dbg(netdev, "filter for MAC %pM already exists\n", mac); + return 0; + } + + /* error if the new filter addition failed */ + if (status) + err = -EADDRNOTAVAIL; + err_update_filters: if (err) { netdev_err(netdev, "can't set MAC %pM. filter update failed\n", @@ -3315,8 +6916,8 @@ static int ice_set_mac_address(struct net_device *netdev, void *pi) flags = ICE_AQC_MAN_MAC_UPDATE_LAA_WOL; status = ice_aq_manage_mac_write(hw, mac, flags, NULL); if (status) { - netdev_err(netdev, "can't set MAC %pM. write to firmware failed error %d\n", - mac, status); + netdev_err(netdev, "can't set MAC %pM. 
write to firmware failed error %s\n", + mac, ice_stat_str(status)); } return 0; } @@ -3337,8 +6938,8 @@ static void ice_set_rx_mode(struct net_device *netdev) * ndo_set_rx_mode may be triggered even without a change in netdev * flags */ - set_bit(ICE_VSI_FLAG_UMAC_FLTR_CHANGED, vsi->flags); - set_bit(ICE_VSI_FLAG_MMAC_FLTR_CHANGED, vsi->flags); + set_bit(ICE_VSI_UMAC_FLTR_CHANGED, vsi->state); + set_bit(ICE_VSI_MMAC_FLTR_CHANGED, vsi->state); set_bit(ICE_FLAG_FLTR_SYNC, vsi->back->flags); /* schedule our worker thread which will take care of @@ -3347,6 +6948,50 @@ static void ice_set_rx_mode(struct net_device *netdev) ice_service_task_schedule(vsi->back); } +#ifdef HAVE_NDO_SET_TX_MAXRATE +/** + * ice_set_tx_maxrate - NDO callback to set the maximum per-queue bitrate + * @netdev: network interface device structure + * @queue_index: Queue ID + * @maxrate: maximum bandwidth in Mbps + */ +static int +ice_set_tx_maxrate(struct net_device *netdev, int queue_index, u32 maxrate) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + enum ice_status status; + u16 q_handle; + u8 tc; + + /* Validate maxrate requested is within permitted range */ + if (maxrate && (maxrate > (ICE_SCHED_MAX_BW / 1000))) { + netdev_err(netdev, "Invalid max rate %d specified for the queue %d\n", + maxrate, queue_index); + return -EINVAL; + } + + q_handle = vsi->tx_rings[queue_index]->q_handle; + tc = ice_dcb_get_tc(vsi, queue_index); + + /* Set BW back to default, when user set maxrate to 0 */ + if (!maxrate) + status = ice_cfg_q_bw_dflt_lmt(vsi->port_info, vsi->idx, tc, + q_handle, ICE_MAX_BW); + else + status = ice_cfg_q_bw_lmt(vsi->port_info, vsi->idx, tc, + q_handle, ICE_MAX_BW, maxrate * 1000); + if (status) { + netdev_err(netdev, "Unable to set Tx max rate, error %s\n", + ice_stat_str(status)); + return -EIO; + } + + return 0; +} +#endif /* HAVE_NDO_SET_TX_MAXRATE */ + +#ifdef HAVE_NDO_FDB_ADD_EXTACK /** * ice_fdb_add - add an entry to the hardware database * @ndm: the input from the stack @@ -3361,13 +7006,25 @@ static int ice_fdb_add(struct ndmsg *ndm, struct nlattr __always_unused *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags, struct netlink_ext_ack __always_unused *extack) +#elif defined(HAVE_NDO_FDB_ADD_VID) +static int +ice_fdb_add(struct ndmsg *ndm, struct nlattr __always_unused *tb[], + struct net_device *dev, const unsigned char *addr, u16 vid, + u16 flags) +#else +static int +ice_fdb_add(struct ndmsg *ndm, struct nlattr __always_unused *tb[], + struct net_device *dev, const unsigned char *addr, u16 flags) +#endif /* HAVE_NDO_FDB_ADD_VID */ { int err; +#ifdef HAVE_NDO_FDB_ADD_VID if (vid) { netdev_err(dev, "VLANs aren't supported yet for dev_uc|mc_add()\n"); return -EINVAL; } +#endif if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { netdev_err(dev, "FDB only supports static addresses\n"); return -EINVAL; @@ -3387,6 +7044,7 @@ ice_fdb_add(struct ndmsg *ndm, struct nlattr __always_unused *tb[], return err; } +#ifdef HAVE_NDO_FDB_ADD_VID /** * ice_fdb_del - delete an entry from the hardware database * @ndm: the input from the stack @@ -3399,6 +7057,11 @@ static int ice_fdb_del(struct ndmsg *ndm, __always_unused struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, __always_unused u16 vid) +#else +static int +ice_fdb_del(struct ndmsg *ndm, __always_unused struct nlattr *tb[], + struct net_device *dev, const unsigned char *addr) +#endif { int err; @@ -3417,473 +7080,820 @@ ice_fdb_del(struct ndmsg *ndm, __always_unused 
struct nlattr *tb[], return err; } +#ifdef HAVE_NETDEV_SB_DEV /** - * ice_set_features - set the netdev feature flags - * @netdev: ptr to the netdev being adjusted - * @features: the feature set that the stack is suggesting + * ice_vsi_cfg_netdev_tc0 - Setup the netdev TC 0 configuration + * @vsi: the VSI being configured + * + * This function configures netdev parameters for traffic class 0 */ -static int -ice_set_features(struct net_device *netdev, netdev_features_t features) +int ice_vsi_cfg_netdev_tc0(struct ice_vsi *vsi) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - int ret = 0; + struct net_device *netdev = vsi->netdev; + int ret; - /* Don't set any netdev advanced features with device in Safe Mode */ - if (ice_is_safe_mode(vsi->back)) { - dev_err(&vsi->back->pdev->dev, - "Device is in Safe Mode - not enabling advanced netdev features\n"); + if (!netdev) + return -EINVAL; + + ret = netdev_set_num_tc(netdev, 1); + if (ret) { + netdev_err(netdev, "Error setting num TC\n"); return ret; } - /* Multiple features can be changed in one call so keep features in - * separate if/else statements to guarantee each feature is checked - */ - if (features & NETIF_F_RXHASH && !(netdev->features & NETIF_F_RXHASH)) - ret = ice_vsi_manage_rss_lut(vsi, true); - else if (!(features & NETIF_F_RXHASH) && - netdev->features & NETIF_F_RXHASH) - ret = ice_vsi_manage_rss_lut(vsi, false); - - if ((features & NETIF_F_HW_VLAN_CTAG_RX) && - !(netdev->features & NETIF_F_HW_VLAN_CTAG_RX)) - ret = ice_vsi_manage_vlan_stripping(vsi, true); - else if (!(features & NETIF_F_HW_VLAN_CTAG_RX) && - (netdev->features & NETIF_F_HW_VLAN_CTAG_RX)) - ret = ice_vsi_manage_vlan_stripping(vsi, false); - - if ((features & NETIF_F_HW_VLAN_CTAG_TX) && - !(netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) - ret = ice_vsi_manage_vlan_insertion(vsi); - else if (!(features & NETIF_F_HW_VLAN_CTAG_TX) && - (netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) - ret = ice_vsi_manage_vlan_insertion(vsi); - - if ((features & NETIF_F_HW_VLAN_CTAG_FILTER) && - !(netdev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) - ret = ice_cfg_vlan_pruning(vsi, true, false); - else if (!(features & NETIF_F_HW_VLAN_CTAG_FILTER) && - (netdev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) - ret = ice_cfg_vlan_pruning(vsi, false, false); + /* Set queue information for lowerdev */ + ret = netdev_set_tc_queue(netdev, 0, vsi->num_txq, 0); + if (ret) { + netdev_err(netdev, "Error setting TC queue\n"); + goto set_tc_queue_err; + } + return 0; +set_tc_queue_err: + netdev_set_num_tc(netdev, 0); return ret; } /** - * ice_vsi_vlan_setup - Setup VLAN offload properties on a VSI - * @vsi: VSI to setup VLAN properties for + * ice_fwd_add_macvlan - Configure MACVLAN interface + * @netdev: Main net device to configure + * @vdev: MACVLAN subordinate device */ -static int ice_vsi_vlan_setup(struct ice_vsi *vsi) +static void * +ice_fwd_add_macvlan(struct net_device *netdev, struct net_device *vdev) { - int ret = 0; + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *parent_vsi = np->vsi, *vsi; + struct ice_pf *pf = parent_vsi->back; + struct ice_macvlan *mv = NULL; + int avail_id, ret, offset, i; + enum ice_status status; + struct device *dev; + u8 mac[ETH_ALEN]; - if (vsi->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) - ret = ice_vsi_manage_vlan_stripping(vsi, true); - if (vsi->netdev->features & NETIF_F_HW_VLAN_CTAG_TX) - ret = ice_vsi_manage_vlan_insertion(vsi); + dev = ice_pf_to_dev(pf); + if (ice_is_safe_mode(pf)) { + netdev_err(netdev, 
"Can't do MACVLAN offload. Device is in Safe Mode\n"); + return ERR_PTR(-EOPNOTSUPP); + } - return ret; -} + if (pf->num_macvlan == pf->max_num_macvlan) { + netdev_err(netdev, "MACVLAN offload limit reached\n"); + return ERR_PTR(-ENOSPC); + } -/** - * ice_vsi_cfg - Setup the VSI - * @vsi: the VSI being configured - * - * Return 0 on success and negative value on error - */ -int ice_vsi_cfg(struct ice_vsi *vsi) -{ - int err; + if (vdev->num_rx_queues != 1 || vdev->num_tx_queues != 1) { + netdev_err(netdev, "Can't do MACVLAN offload. %s has multiple queues\n", + vdev->name); + return ERR_PTR(-EOPNOTSUPP); + } - if (vsi->netdev) { - ice_set_rx_mode(vsi->netdev); + if (ice_get_avail_txq_count(pf) < ICE_DFLT_TXQ_VMDQ_VSI || + ice_get_avail_rxq_count(pf) < ICE_DFLT_RXQ_VMDQ_VSI) { + netdev_err(netdev, "Can't do MACVLAN offload. Not enough queues\n"); + return ERR_PTR(-ENOSPC); + } - err = ice_vsi_vlan_setup(vsi); + avail_id = find_first_zero_bit(pf->avail_macvlan, pf->max_num_macvlan); - if (err) - return err; + vsi = ice_macvlan_vsi_setup(pf, pf->hw.port_info); + if (!vsi) { + netdev_err(netdev, "Failed to create MACVLAN offload (VMDQ) VSI\n"); + return ERR_PTR(-EIO); } - ice_vsi_cfg_dcb_rings(vsi); - err = ice_vsi_cfg_lan_txqs(vsi); - if (!err) - err = ice_vsi_cfg_rxqs(vsi); - return err; + pf->num_macvlan++; + offset = parent_vsi->alloc_txq + avail_id; + + ret = netdev_set_sb_channel(vdev, avail_id + 1); + if (ret) { + netdev_err(netdev, "Error setting netdev_set_sb_channel %d\n", + ret); + goto set_sb_channel_err; + } + + /* configure sbdev with the number of queues and offset within PF + * queues range + */ + ret = netdev_bind_sb_channel_queue(netdev, vdev, 0, vsi->num_txq, + offset); + if (ret) { + netdev_err(netdev, "Error setting netdev_bind_sb_channel_queue %d\n", + ret); + goto bind_sb_channel_err; + } + + vsi->netdev = vdev; + /* Set MACVLAN ring in root device Tx rings */ + ice_for_each_txq(vsi, i) + parent_vsi->tx_rings[offset + i] = vsi->tx_rings[i]; + + ice_napi_add(vsi); + + ret = ice_vsi_open(vsi); + if (ret) + goto vsi_open_err; + + ether_addr_copy(mac, vdev->dev_addr); + status = ice_fltr_add_mac(vsi, mac, ICE_FWD_TO_VSI); + if (status == ICE_ERR_ALREADY_EXISTS) { + dev_info(dev, "can't add MAC filters %pM for VSI %d, error %s\n", + mac, vsi->idx, ice_stat_str(status)); + } else if (status) { + dev_err(dev, "can't add MAC filters %pM for VSI %d, error %s\n", + mac, vsi->idx, ice_stat_str(status)); + ret = -ENOMEM; + goto add_mac_err; + } + + mv = devm_kzalloc(dev, sizeof(*mv), GFP_KERNEL); + if (!mv) { + ret = -ENOMEM; + goto mv_init_err; + } + INIT_LIST_HEAD(&mv->list); + mv->parent_vsi = parent_vsi; + mv->vsi = vsi; + mv->id = avail_id; + mv->vdev = vdev; + ether_addr_copy(mv->mac, mac); + list_add(&mv->list, &pf->macvlan_list); + + set_bit(avail_id, pf->avail_macvlan); + netdev_info(netdev, "MACVLAN offloads for %s are on\n", vdev->name); + return mv; + +mv_init_err: + ice_fltr_remove_all(vsi); +add_mac_err: + ice_vsi_close(vsi); +vsi_open_err: + ice_napi_del(vsi); + vsi->netdev = NULL; + netdev_unbind_sb_channel(netdev, vdev); +bind_sb_channel_err: + netdev_set_sb_channel(vdev, 0); +set_sb_channel_err: + pf->num_macvlan--; + ice_vsi_release(vsi); + return ERR_PTR(ret); } /** - * ice_napi_enable_all - Enable NAPI for all q_vectors in the VSI - * @vsi: the VSI being configured + * ice_fwd_del_macvlan - Delete MACVLAN interface resources + * @netdev: Main net device + * @accel_priv: MACVLAN sub ordinate device */ -static void ice_napi_enable_all(struct ice_vsi *vsi) +static 
void ice_fwd_del_macvlan(struct net_device *netdev, void *accel_priv) { - int q_idx; + struct ice_macvlan *mv = (struct ice_macvlan *)accel_priv; + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *parent_vsi = np->vsi; + struct ice_pf *pf = parent_vsi->back; + struct net_device *vdev = mv->vdev; - if (!vsi->netdev) - return; + netdev_unbind_sb_channel(netdev, vdev); + netdev_set_sb_channel(vdev, 0); - ice_for_each_q_vector(vsi, q_idx) { - struct ice_q_vector *q_vector = vsi->q_vectors[q_idx]; + ice_vsi_release(mv->vsi); + parent_vsi->tx_rings[parent_vsi->num_txq + mv->id] = NULL; - if (q_vector->rx.ring || q_vector->tx.ring) - napi_enable(&q_vector->napi); - } + pf->num_macvlan--; + + clear_bit(mv->id, pf->avail_macvlan); + list_del(&mv->list); + devm_kfree(ice_pf_to_dev(pf), mv); + + netdev_info(netdev, "MACVLAN offloads for %s are off\n", vdev->name); } /** - * ice_up_complete - Finish the last steps of bringing up a connection - * @vsi: The VSI being configured - * - * Return 0 on success and negative value on error + * ice_init_macvlan - Configure PF VSI to be able to offload MACVLAN + * @vsi: Main VSI pointer where sb_dev is attached to + * @init: Set to false when called in replay path otherwise true */ -static int ice_up_complete(struct ice_vsi *vsi) +static int ice_init_macvlan(struct ice_vsi *vsi, bool init) { + struct net_device *netdev = vsi->netdev; struct ice_pf *pf = vsi->back; - int err; + struct ice_ring **tmp_rings; + unsigned int total_rings; + struct device *dev; + int i, ret; - ice_vsi_cfg_msix(vsi); + dev = ice_pf_to_dev(pf); + if (!test_bit(ICE_FLAG_VMDQ_ENA, pf->flags)) { + dev_err(dev, "MACVLAN offload cannot be supported - VMDQ is disabled\n"); + return -EPERM; + } - /* Enable only Rx rings, Tx rings were enabled by the FW when the - * Tx queue group list was configured and the context bits were - * programmed using ice_vsi_cfg_txqs - */ - err = ice_vsi_start_rx_rings(vsi); - if (err) - return err; + if (ice_is_safe_mode(pf)) { + dev_err(dev, "MACVLAN offload cannot be configured - Device is in Safe Mode\n"); + return -EOPNOTSUPP; + } - clear_bit(__ICE_DOWN, vsi->state); - ice_napi_enable_all(vsi); - ice_vsi_ena_irq(vsi); +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + dev_err(dev, "MACVLAN offload cannot be configured - ADQ is active. Delete ADQ configs using TC and try again\n"); + return -EOPNOTSUPP; + } +#endif /* NETIF_F_HW_TC */ - if (vsi->port_info && - (vsi->port_info->phy.link_info.link_info & ICE_AQ_LINK_UP) && - vsi->netdev) { - ice_print_link_msg(vsi, true); - netif_tx_start_all_queues(vsi->netdev); - netif_carrier_on(vsi->netdev); + if (ice_is_dcf_enabled(pf)) { + dev_err(dev, "Device Control Functionality is currently enabled. 
L2 Forwarding Offload cannot be enabled.\n"); + return -EOPNOTSUPP; } - ice_service_task_schedule(pf); + if (ice_is_eswitch_mode_switchdev(pf)) { + dev_err(dev, "MACVLAN offload cannot be configured - switchdev is enabled\n"); + return -EOPNOTSUPP; + } + + pf->max_num_macvlan = min3(ice_get_avail_txq_count(pf), + ice_get_avail_rxq_count(pf), + (u16)ICE_MAX_MACVLANS); + + total_rings = vsi->alloc_txq + pf->max_num_macvlan; + + /* Allocate memory for Tx and MACVLAN ring pointers */ + tmp_rings = devm_kcalloc(dev, total_rings, sizeof(*tmp_rings), + GFP_KERNEL); + if (!tmp_rings) { + ret = -ENOMEM; + goto alloc_ring_err; + } + + /* Copy existing ring pointers to new temporary ones */ + for (i = 0; i < vsi->alloc_txq; i++) + tmp_rings[i] = vsi->tx_rings[i]; + vsi->base_tx_rings = vsi->tx_rings; + vsi->tx_rings = tmp_rings; + + if (!init) + return 0; + + ret = netif_set_real_num_tx_queues(netdev, total_rings); + if (ret) { + netdev_err(netdev, "Error setting real num queue\n"); + goto set_num_real_txq_err; + } + +#ifdef NETIF_F_HW_TC + if (!ice_is_adq_active(pf)) { + ret = ice_vsi_cfg_netdev_tc0(vsi); + if (ret) + goto set_num_tc_err; + } +#else + ret = ice_vsi_cfg_netdev_tc0(vsi); + if (ret) + goto set_num_tc_err; +#endif /* NETIF_F_HW_TC */ + + INIT_LIST_HEAD(&pf->macvlan_list); + set_bit(ICE_FLAG_MACVLAN_ENA, pf->flags); return 0; + +set_num_tc_err: + netif_set_real_num_tx_queues(netdev, vsi->num_txq); +set_num_real_txq_err: + vsi->tx_rings = vsi->base_tx_rings; + vsi->base_tx_rings = NULL; + devm_kfree(dev, tmp_rings); +alloc_ring_err: + pf->max_num_macvlan = 0; + return ret; } /** - * ice_up - Bring the connection back up after being down - * @vsi: VSI being configured + * ice_deinit_macvlan - Release and cleanup MACVLAN resources + * @vsi: Main VSI pointer where sb_dev is attached to */ -int ice_up(struct ice_vsi *vsi) +static void ice_deinit_macvlan(struct ice_vsi *vsi) { - int err; + struct ice_macvlan *mv, *mv_tmp; + struct ice_pf *pf = vsi->back; + struct ice_ring **tmp_rings; - err = ice_vsi_cfg(vsi); - if (!err) - err = ice_up_complete(vsi); + clear_bit(ICE_FLAG_MACVLAN_ENA, pf->flags); - return err; + /* Remove offload from existing MACVLANs; clear software book-keeping + * structures and reclaim hardware resources + */ + list_for_each_entry_safe(mv, mv_tmp, &pf->macvlan_list, list) { + ice_fltr_remove_mac(mv->vsi, mv->mac, ICE_FWD_TO_VSI); + macvlan_release_l2fw_offload(mv->vdev); + ice_fwd_del_macvlan(mv->parent_vsi->netdev, mv); + } + +#ifdef NETIF_F_HW_TC + if (!ice_is_adq_active(pf)) + netdev_set_num_tc(vsi->netdev, 0); +#else + netdev_set_num_tc(vsi->netdev, 0); +#endif /* NETIF_F_HW_TC */ + netif_set_real_num_tx_queues(vsi->netdev, vsi->num_txq); + pf->max_num_macvlan = 0; + + /* Restore original Tx ring pointers */ + tmp_rings = vsi->tx_rings; + vsi->tx_rings = vsi->base_tx_rings; + devm_kfree(ice_pf_to_dev(pf), tmp_rings); } /** - * ice_fetch_u64_stats_per_ring - get packets and bytes stats per ring - * @ring: Tx or Rx ring to read stats from - * @pkts: packets stats counter - * @bytes: bytes stats counter - * - * This function fetches stats from the ring considering the atomic operations - * that needs to be performed to read u64 values in 32 bit machine. 
+ * ice_vsi_replay_macvlan - Configure MACVLAN netdev settings after reset + * @pf: board private structure */ -static void -ice_fetch_u64_stats_per_ring(struct ice_ring *ring, u64 *pkts, u64 *bytes) +static void ice_vsi_replay_macvlan(struct ice_pf *pf) { - unsigned int start; - *pkts = 0; - *bytes = 0; + struct device *dev = ice_pf_to_dev(pf); + struct ice_macvlan *mv, *mv_temp; - if (!ring) - return; - do { - start = u64_stats_fetch_begin_irq(&ring->syncp); - *pkts = ring->stats.pkts; - *bytes = ring->stats.bytes; - } while (u64_stats_fetch_retry_irq(&ring->syncp, start)); + list_for_each_entry_safe(mv, mv_temp, &pf->macvlan_list, list) { + struct ice_vsi *vsi = mv->parent_vsi; + int offset = vsi->alloc_txq + mv->id; + int ret = 0, i; + + ice_for_each_txq(mv->vsi, i) + vsi->tx_rings[offset + i] = mv->vsi->tx_rings[i]; + + ret = netdev_set_sb_channel(mv->vdev, mv->id + 1); + if (ret) { + dev_dbg(dev, "Error setting netdev_set_sb_channel %d\n", + ret); + /* Do not return error, try to configure as many as + * possible + */ + ice_fltr_remove_mac(mv->vsi, mv->mac, ICE_FWD_TO_VSI); + macvlan_release_l2fw_offload(mv->vdev); + ice_fwd_del_macvlan(mv->parent_vsi->netdev, mv); + continue; + } + + ret = netdev_bind_sb_channel_queue(vsi->netdev, mv->vdev, 0, + mv->vsi->num_txq, offset); + if (ret) { + dev_dbg(dev, "Error setting netdev_bind_sb_channel_queue %d\n", + ret); + /* Do not return error, try to configure as many as + * possible + */ + ice_fltr_remove_mac(mv->vsi, mv->mac, ICE_FWD_TO_VSI); + macvlan_release_l2fw_offload(mv->vdev); + ice_fwd_del_macvlan(mv->parent_vsi->netdev, mv); + continue; + } + } } +#endif /* HAVE_NETDEV_SB_DEV */ + +#define NETIF_VLAN_OFFLOAD_FEATURES (NETIF_F_HW_VLAN_CTAG_RX | \ + NETIF_F_HW_VLAN_CTAG_TX | \ + NETIF_F_HW_VLAN_STAG_RX | \ + NETIF_F_HW_VLAN_STAG_TX) + +#define NETIF_VLAN_FILTERING_FEATURES (NETIF_F_HW_VLAN_CTAG_FILTER | \ + NETIF_F_HW_VLAN_STAG_FILTER) /** - * ice_update_vsi_ring_stats - Update VSI stats counters - * @vsi: the VSI to be updated + * ice_fix_features - fix the netdev features flags based on device limitations + * @netdev: ptr to the netdev that flags are being fixed on + * @features: features that need to be checked and possibly fixed + * + * Make sure any fixups are made to features in this callback. This enables the + * driver to not have to check unsupported configurations throughout the driver + * because that's the responsiblity of this callback. + * + * Single VLAN Mode (SVM) Supported Features: + * NETIF_F_HW_VLAN_CTAG_FILTER + * NETIF_F_HW_VLAN_CTAG_RX + * NETIF_F_HW_VLAN_CTAG_TX + * + * Double VLAN Mode (DVM) Supported Features: + * NETIF_F_HW_VLAN_CTAG_FILTER + * NETIF_F_HW_VLAN_CTAG_RX + * NETIF_F_HW_VLAN_CTAG_TX + * + * NETIF_F_HW_VLAN_STAG_FILTER + * NETIF_HW_VLAN_STAG_RX + * NETIF_HW_VLAN_STAG_TX + * + * Features that need fixing: + * Cannot simultaneously enable CTAG and STAG stripping and/or insertion. + * These are mutually exlusive as the VSI context cannot support multiple + * VLAN ethertypes simultaneously for stripping and/or insertion. If this + * is not done, then default to clearing the requested STAG offload + * settings. + * + * All supported filtering has to be enabled or disabled together. For + * example, in DVM, CTAG and STAG filtering have to be enabled and disabled + * together. If this is not done, then default to VLAN filtering disabled. + * These are mutually exclusive as there is currently no way to + * enable/disable VLAN filtering based on VLAN ethertype when using VLAN + * prune rules. 
*/ -static void ice_update_vsi_ring_stats(struct ice_vsi *vsi) +static netdev_features_t +ice_fix_features(struct net_device *netdev, netdev_features_t features) { - struct rtnl_link_stats64 *vsi_stats = &vsi->net_stats; - struct ice_ring *ring; - u64 pkts, bytes; - int i; - - /* reset netdev stats */ - vsi_stats->tx_packets = 0; - vsi_stats->tx_bytes = 0; - vsi_stats->rx_packets = 0; - vsi_stats->rx_bytes = 0; + struct ice_netdev_priv *np = netdev_priv(netdev); + netdev_features_t supported_vlan_filtering; + netdev_features_t requested_vlan_filtering; + struct ice_vsi *vsi = np->vsi; - /* reset non-netdev (extended) stats */ - vsi->tx_restart = 0; - vsi->tx_busy = 0; - vsi->tx_linearize = 0; - vsi->rx_buf_failed = 0; - vsi->rx_page_failed = 0; + requested_vlan_filtering = features & NETIF_VLAN_FILTERING_FEATURES; - rcu_read_lock(); + /* make sure supported_vlan_filtering works for both SVM and DVM */ + supported_vlan_filtering = NETIF_F_HW_VLAN_CTAG_FILTER; + if (ice_is_dvm_ena(&vsi->back->hw)) + supported_vlan_filtering |= NETIF_F_HW_VLAN_STAG_FILTER; - /* update Tx rings counters */ - ice_for_each_txq(vsi, i) { - ring = READ_ONCE(vsi->tx_rings[i]); - ice_fetch_u64_stats_per_ring(ring, &pkts, &bytes); - vsi_stats->tx_packets += pkts; - vsi_stats->tx_bytes += bytes; - vsi->tx_restart += ring->tx_stats.restart_q; - vsi->tx_busy += ring->tx_stats.tx_busy; - vsi->tx_linearize += ring->tx_stats.tx_linearize; + if (requested_vlan_filtering && + requested_vlan_filtering != supported_vlan_filtering) { + if (requested_vlan_filtering & NETIF_F_HW_VLAN_CTAG_FILTER) { + netdev_warn(netdev, "cannot support requested VLAN filtering settings, enabling all supported VLAN filtering settings\n"); + features |= supported_vlan_filtering; + } else { + netdev_warn(netdev, "cannot support requested VLAN filtering settings, clearing all supported VLAN filtering settings\n"); + features &= ~supported_vlan_filtering; + } } - /* update Rx rings counters */ - ice_for_each_rxq(vsi, i) { - ring = READ_ONCE(vsi->rx_rings[i]); - ice_fetch_u64_stats_per_ring(ring, &pkts, &bytes); - vsi_stats->rx_packets += pkts; - vsi_stats->rx_bytes += bytes; - vsi->rx_buf_failed += ring->rx_stats.alloc_buf_failed; - vsi->rx_page_failed += ring->rx_stats.alloc_page_failed; + if ((features & (NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_TX)) && + (features & (NETIF_F_HW_VLAN_STAG_RX | NETIF_F_HW_VLAN_STAG_TX))) { + netdev_warn(netdev, "cannot support CTAG and STAG VLAN stripping and/or insertion simultaneously since CTAG and STAG offloads are mutually exclusive, clearing STAG offload settings\n"); + features &= ~(NETIF_F_HW_VLAN_STAG_RX | + NETIF_F_HW_VLAN_STAG_TX); } - rcu_read_unlock(); + return features; } /** - * ice_update_vsi_stats - Update VSI stats counters - * @vsi: the VSI to be updated + * ice_set_vlan_offload_features - set VLAN offload features for the PF VSI + * @vsi: PF's VSI + * @features: features used to determine VLAN offload settings + * + * First, determine the vlan_ethertype based on the VLAN offload bits in + * features. Then determine if stripping and insertion should be enabled or + * disabled. Finally enable or disable VLAN stripping and insertion. 
*/ -void ice_update_vsi_stats(struct ice_vsi *vsi) +static int +ice_set_vlan_offload_features(struct ice_vsi *vsi, netdev_features_t features) { - struct rtnl_link_stats64 *cur_ns = &vsi->net_stats; - struct ice_eth_stats *cur_es = &vsi->eth_stats; - struct ice_pf *pf = vsi->back; + bool enable_stripping = true, enable_insertion = true; + struct ice_vsi_vlan_ops *vlan_ops; + int strip_err = 0, insert_err = 0; + u16 vlan_ethertype = 0; - if (test_bit(__ICE_DOWN, vsi->state) || - test_bit(__ICE_CFG_BUSY, pf->state)) - return; + vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); - /* get stats as recorded by Tx/Rx rings */ - ice_update_vsi_ring_stats(vsi); + if (features & (NETIF_F_HW_VLAN_STAG_RX | NETIF_F_HW_VLAN_STAG_TX)) + vlan_ethertype = ETH_P_8021AD; + else if (features & (NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_TX)) + vlan_ethertype = ETH_P_8021Q; - /* get VSI stats as recorded by the hardware */ - ice_update_eth_stats(vsi); + if (!(features & (NETIF_F_HW_VLAN_STAG_RX | NETIF_F_HW_VLAN_CTAG_RX))) + enable_stripping = false; + if (!(features & (NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_CTAG_TX))) + enable_insertion = false; - cur_ns->tx_errors = cur_es->tx_errors; - cur_ns->rx_dropped = cur_es->rx_discards; - cur_ns->tx_dropped = cur_es->tx_discards; - cur_ns->multicast = cur_es->rx_multicast; + if (enable_stripping) + strip_err = vlan_ops->ena_stripping(vsi, vlan_ethertype); + else + strip_err = vlan_ops->dis_stripping(vsi); - /* update some more netdev stats if this is main VSI */ - if (vsi->type == ICE_VSI_PF) { - cur_ns->rx_crc_errors = pf->stats.crc_errors; - cur_ns->rx_errors = pf->stats.crc_errors + - pf->stats.illegal_bytes; - cur_ns->rx_length_errors = pf->stats.rx_len_errors; - /* record drops from the port level */ - cur_ns->rx_missed_errors = pf->stats.eth.rx_discards; - } + if (enable_insertion) + insert_err = vlan_ops->ena_insertion(vsi, vlan_ethertype); + else + insert_err = vlan_ops->dis_insertion(vsi); + + if (strip_err || insert_err) + return -EIO; + + return 0; } /** - * ice_update_pf_stats - Update PF port stats counters - * @pf: PF whose stats needs to be updated + * ice_set_vlan_filtering_features - set VLAN filtering features for the PF VSI + * @vsi: PF's VSI + * @features: features used to determine VLAN filtering settings + * + * Enable or disable Rx VLAN filtering based on the VLAN filtering bits in the + * features. 
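A second small stand-alone sketch (again not from the patch; ETH_P_8021Q/ETH_P_8021AD are the standard TPID values and the helper name is invented) captures the decision ice_set_vlan_offload_features() makes above: any STAG offload bit selects the 802.1ad ethertype, otherwise a CTAG bit selects 802.1Q, and the RX/TX bits decide whether stripping and insertion get enabled.

#include <stdbool.h>

#define ETH_P_8021Q  0x8100	/* C-tag TPID */
#define ETH_P_8021AD 0x88A8	/* S-tag TPID */

struct vlan_offload_cfg {
	unsigned short ethertype;	/* 0: no stripping/insertion offload */
	bool strip;			/* enable Rx VLAN stripping */
	bool insert;			/* enable Tx VLAN insertion */
};

/* the four flags mirror the NETIF_F_HW_VLAN_{CTAG,STAG}_{RX,TX} bits */
static struct vlan_offload_cfg
pick_vlan_offload(bool ctag_rx, bool ctag_tx, bool stag_rx, bool stag_tx)
{
	struct vlan_offload_cfg cfg = { .ethertype = 0 };

	if (stag_rx || stag_tx)
		cfg.ethertype = ETH_P_8021AD;	/* DVM: offload the outer tag */
	else if (ctag_rx || ctag_tx)
		cfg.ethertype = ETH_P_8021Q;

	cfg.strip = stag_rx || ctag_rx;
	cfg.insert = stag_tx || ctag_tx;
	return cfg;
}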
*/ -void ice_update_pf_stats(struct ice_pf *pf) +static int +ice_set_vlan_filtering_features(struct ice_vsi *vsi, netdev_features_t features) { - struct ice_hw_port_stats *prev_ps, *cur_ps; - struct ice_hw *hw = &pf->hw; - u8 port; + struct ice_vsi_vlan_ops *vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); + int err = 0; - port = hw->port_info->lport; - prev_ps = &pf->stats_prev; - cur_ps = &pf->stats; + /* support Single VLAN Mode (SVM) and Double VLAN Mode (DVM) by checking + * if either bit is set + */ + if (features & + (NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER)) + err = vlan_ops->ena_rx_filtering(vsi); + else + err = vlan_ops->dis_rx_filtering(vsi); - ice_stat_update40(hw, GLPRT_GORCL(port), pf->stat_prev_loaded, - &prev_ps->eth.rx_bytes, - &cur_ps->eth.rx_bytes); + return err; +} - ice_stat_update40(hw, GLPRT_UPRCL(port), pf->stat_prev_loaded, - &prev_ps->eth.rx_unicast, - &cur_ps->eth.rx_unicast); +/** + * ice_set_vlan_features - set VLAN settings based on suggested feature set + * @netdev: ptr to the netdev being adjusted + * @features: the feature set that the stack is suggesting + * + * Only update VLAN settings if the requested_vlan_features are different than + * the current_vlan_features. + */ +static int +ice_set_vlan_features(struct net_device *netdev, netdev_features_t features) +{ + netdev_features_t current_vlan_features, requested_vlan_features; + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + int err; - ice_stat_update40(hw, GLPRT_MPRCL(port), pf->stat_prev_loaded, - &prev_ps->eth.rx_multicast, - &cur_ps->eth.rx_multicast); + current_vlan_features = netdev->features & NETIF_VLAN_OFFLOAD_FEATURES; + requested_vlan_features = features & NETIF_VLAN_OFFLOAD_FEATURES; + if (current_vlan_features ^ requested_vlan_features) { + err = ice_set_vlan_offload_features(vsi, features); + if (err) + return err; + } - ice_stat_update40(hw, GLPRT_BPRCL(port), pf->stat_prev_loaded, - &prev_ps->eth.rx_broadcast, - &cur_ps->eth.rx_broadcast); + current_vlan_features = netdev->features & + NETIF_VLAN_FILTERING_FEATURES; + requested_vlan_features = features & NETIF_VLAN_FILTERING_FEATURES; + if (current_vlan_features ^ requested_vlan_features) { + err = ice_set_vlan_filtering_features(vsi, features); + if (err) + return err; + } - ice_stat_update32(hw, PRTRPB_RDPC, pf->stat_prev_loaded, - &prev_ps->eth.rx_discards, - &cur_ps->eth.rx_discards); + return 0; +} - ice_stat_update40(hw, GLPRT_GOTCL(port), pf->stat_prev_loaded, - &prev_ps->eth.tx_bytes, - &cur_ps->eth.tx_bytes); +/** + * ice_set_features - set the netdev feature flags + * @netdev: ptr to the netdev being adjusted + * @features: the feature set that the stack is suggesting + */ +static int +ice_set_features(struct net_device *netdev, netdev_features_t features) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + int ret = 0; - ice_stat_update40(hw, GLPRT_UPTCL(port), pf->stat_prev_loaded, - &prev_ps->eth.tx_unicast, - &cur_ps->eth.tx_unicast); + /* Don't set any netdev advanced features with device in Safe Mode */ + if (ice_is_safe_mode(vsi->back)) { + dev_err(ice_pf_to_dev(vsi->back), "Device is in Safe Mode - not enabling advanced netdev features\n"); + return ret; + } - ice_stat_update40(hw, GLPRT_MPTCL(port), pf->stat_prev_loaded, - &prev_ps->eth.tx_multicast, - &cur_ps->eth.tx_multicast); + /* Do not change setting during reset */ + if (ice_is_reset_in_progress(pf->state)) { + 
dev_err(ice_pf_to_dev(vsi->back), "Device is resetting, changing advanced netdev features temporarily unavailable.\n"); + return -EBUSY; + } - ice_stat_update40(hw, GLPRT_BPTCL(port), pf->stat_prev_loaded, - &prev_ps->eth.tx_broadcast, - &cur_ps->eth.tx_broadcast); + /* Multiple features can be changed in one call so keep features in + * separate if/else statements to guarantee each feature is checked + */ + if (features & NETIF_F_RXHASH && !(netdev->features & NETIF_F_RXHASH)) + ice_vsi_manage_rss_lut(vsi, true); + else if (!(features & NETIF_F_RXHASH) && + netdev->features & NETIF_F_RXHASH) + ice_vsi_manage_rss_lut(vsi, false); - ice_stat_update32(hw, GLPRT_TDOLD(port), pf->stat_prev_loaded, - &prev_ps->tx_dropped_link_down, - &cur_ps->tx_dropped_link_down); + ret = ice_set_vlan_features(netdev, features); + if (ret) + return ret; - ice_stat_update40(hw, GLPRT_PRC64L(port), pf->stat_prev_loaded, - &prev_ps->rx_size_64, &cur_ps->rx_size_64); +#ifdef HAVE_NETDEV_SB_DEV - ice_stat_update40(hw, GLPRT_PRC127L(port), pf->stat_prev_loaded, - &prev_ps->rx_size_127, &cur_ps->rx_size_127); + if ((features & NETIF_F_HW_L2FW_DOFFLOAD) && + !(netdev->features & NETIF_F_HW_L2FW_DOFFLOAD)) { + ret = ice_init_macvlan(vsi, true); + if (ret) + return ret; + } else if (!(features & NETIF_F_HW_L2FW_DOFFLOAD) && + (netdev->features & NETIF_F_HW_L2FW_DOFFLOAD)) { + ice_deinit_macvlan(vsi); + } +#endif /* HAVE_NETDEV_SB_DEV */ - ice_stat_update40(hw, GLPRT_PRC255L(port), pf->stat_prev_loaded, - &prev_ps->rx_size_255, &cur_ps->rx_size_255); + if ((features & NETIF_F_NTUPLE) && + !(netdev->features & NETIF_F_NTUPLE)) { + ice_vsi_manage_fdir(vsi, true); + ice_init_arfs(vsi); + } else if (!(features & NETIF_F_NTUPLE) && + (netdev->features & NETIF_F_NTUPLE)) { + ice_vsi_manage_fdir(vsi, false); + ice_clear_arfs(vsi); + } - ice_stat_update40(hw, GLPRT_PRC511L(port), pf->stat_prev_loaded, - &prev_ps->rx_size_511, &cur_ps->rx_size_511); +#ifdef NETIF_F_HW_TC + /* don't turn off hw_tc_offload when ADQ is already enabled */ + if (!(features & NETIF_F_HW_TC) && ice_is_adq_active(pf)) { + dev_err(ice_pf_to_dev(pf), "ADQ is active, can't turn hw_tc_offload off\n"); + return -EACCES; + } - ice_stat_update40(hw, GLPRT_PRC1023L(port), pf->stat_prev_loaded, - &prev_ps->rx_size_1023, &cur_ps->rx_size_1023); + if ((features & NETIF_F_HW_TC) && + !(netdev->features & NETIF_F_HW_TC)) + set_bit(ICE_FLAG_CLS_FLOWER, pf->flags); + else + clear_bit(ICE_FLAG_CLS_FLOWER, pf->flags); +#endif /* NETIF_F_HW_TC */ - ice_stat_update40(hw, GLPRT_PRC1522L(port), pf->stat_prev_loaded, - &prev_ps->rx_size_1522, &cur_ps->rx_size_1522); + return 0; +} - ice_stat_update40(hw, GLPRT_PRC9522L(port), pf->stat_prev_loaded, - &prev_ps->rx_size_big, &cur_ps->rx_size_big); +/** + * ice_vsi_vlan_setup - Setup VLAN offload properties on a PF VSI + * @vsi: VSI to setup VLAN properties for + */ +static int ice_vsi_vlan_setup(struct ice_vsi *vsi) +{ + int err; - ice_stat_update40(hw, GLPRT_PTC64L(port), pf->stat_prev_loaded, - &prev_ps->tx_size_64, &cur_ps->tx_size_64); + err = ice_set_vlan_offload_features(vsi, vsi->netdev->features); + if (err) + return err; - ice_stat_update40(hw, GLPRT_PTC127L(port), pf->stat_prev_loaded, - &prev_ps->tx_size_127, &cur_ps->tx_size_127); + err = ice_set_vlan_filtering_features(vsi, vsi->netdev->features); + if (err) + return err; - ice_stat_update40(hw, GLPRT_PTC255L(port), pf->stat_prev_loaded, - &prev_ps->tx_size_255, &cur_ps->tx_size_255); + err = ice_vsi_add_vlan_zero(vsi); + if (err) + return err; - 
ice_stat_update40(hw, GLPRT_PTC511L(port), pf->stat_prev_loaded, - &prev_ps->tx_size_511, &cur_ps->tx_size_511); + return 0; +} - ice_stat_update40(hw, GLPRT_PTC1023L(port), pf->stat_prev_loaded, - &prev_ps->tx_size_1023, &cur_ps->tx_size_1023); +/** + * ice_vsi_cfg - Setup the VSI + * @vsi: the VSI being configured + * + * Return 0 on success and negative value on error + */ +int ice_vsi_cfg(struct ice_vsi *vsi) +{ + int err; - ice_stat_update40(hw, GLPRT_PTC1522L(port), pf->stat_prev_loaded, - &prev_ps->tx_size_1522, &cur_ps->tx_size_1522); + if (vsi->netdev && vsi->type == ICE_VSI_PF) { + ice_set_rx_mode(vsi->netdev); - ice_stat_update40(hw, GLPRT_PTC9522L(port), pf->stat_prev_loaded, - &prev_ps->tx_size_big, &cur_ps->tx_size_big); + err = ice_vsi_vlan_setup(vsi); - ice_stat_update32(hw, GLPRT_LXONRXC(port), pf->stat_prev_loaded, - &prev_ps->link_xon_rx, &cur_ps->link_xon_rx); + if (err) + return err; + } + ice_vsi_cfg_dcb_rings(vsi); - ice_stat_update32(hw, GLPRT_LXOFFRXC(port), pf->stat_prev_loaded, - &prev_ps->link_xoff_rx, &cur_ps->link_xoff_rx); + err = ice_vsi_cfg_lan_txqs(vsi); +#ifdef HAVE_XDP_SUPPORT + if (!err && ice_is_xdp_ena_vsi(vsi)) + err = ice_vsi_cfg_xdp_txqs(vsi); +#endif /* HAVE_XDP_SUPPORT */ + if (!err) + err = ice_vsi_cfg_rxqs(vsi); - ice_stat_update32(hw, GLPRT_LXONTXC(port), pf->stat_prev_loaded, - &prev_ps->link_xon_tx, &cur_ps->link_xon_tx); + return err; +} - ice_stat_update32(hw, GLPRT_LXOFFTXC(port), pf->stat_prev_loaded, - &prev_ps->link_xoff_tx, &cur_ps->link_xoff_tx); +/* THEORY OF MODERATION: + * The ice driver hardware works differently than the hardware that DIMLIB was + * originally made for. ice hardware doesn't have packet count limits that + * can trigger an interrupt, but it *does* have interrupt rate limit support, + * which is hard-coded to limit to 250,000 ints/second. + * If not using dynamic moderation, the INTRL value can be modified + * by ethtool rx-usecs-high. + */ +struct ice_dim { + /* the throttle rate for interrupts, basically worst case delay before + * an initial interrupt fires, value is stored in microseconds. + */ + u16 itr; +}; - ice_update_dcb_stats(pf); +/* Make a different profile for RX that doesn't allow quite so aggressive + * moderation at the high end (it maxxes out at 126us or about 8k interrupts a + * second. 
+ */ +static const struct ice_dim rx_profile[] = { + {2}, /* 500,000 ints/s, capped at 250K by INTRL */ + {8}, /* 125,000 ints/s */ + {16}, /* 62,500 ints/s */ + {62}, /* 16,129 ints/s */ + {126} /* 7,936 ints/s */ +}; - ice_stat_update32(hw, GLPRT_CRCERRS(port), pf->stat_prev_loaded, - &prev_ps->crc_errors, &cur_ps->crc_errors); +/* The transmit profile, which has the same sorts of values + * as the previous struct + */ +static const struct ice_dim tx_profile[] = { + {2}, /* 500,000 ints/s, capped at 250K by INTRL */ + {8}, /* 125,000 ints/s */ + {40}, /* 16,125 ints/s */ + {128}, /* 7,812 ints/s */ + {256} /* 3,906 ints/s */ +}; - ice_stat_update32(hw, GLPRT_ILLERRC(port), pf->stat_prev_loaded, - &prev_ps->illegal_bytes, &cur_ps->illegal_bytes); +static void ice_tx_dim_work(struct work_struct *work) +{ + struct ice_ring_container *rc; + struct dim *dim; + u16 itr; - ice_stat_update32(hw, GLPRT_MLFC(port), pf->stat_prev_loaded, - &prev_ps->mac_local_faults, - &cur_ps->mac_local_faults); + dim = container_of(work, struct dim, work); + rc = (struct ice_ring_container *)dim->priv; - ice_stat_update32(hw, GLPRT_MRFC(port), pf->stat_prev_loaded, - &prev_ps->mac_remote_faults, - &cur_ps->mac_remote_faults); + WARN_ON(dim->profile_ix >= ARRAY_SIZE(tx_profile)); - ice_stat_update32(hw, GLPRT_RLEC(port), pf->stat_prev_loaded, - &prev_ps->rx_len_errors, &cur_ps->rx_len_errors); + /* look up the values in our local table */ + itr = tx_profile[dim->profile_ix].itr; - ice_stat_update32(hw, GLPRT_RUC(port), pf->stat_prev_loaded, - &prev_ps->rx_undersize, &cur_ps->rx_undersize); + ice_trace(tx_dim_work, container_of(rc, struct ice_q_vector, tx), dim); + ice_write_itr(rc, itr); - ice_stat_update32(hw, GLPRT_RFC(port), pf->stat_prev_loaded, - &prev_ps->rx_fragments, &cur_ps->rx_fragments); + dim->state = DIM_START_MEASURE; +} - ice_stat_update32(hw, GLPRT_ROC(port), pf->stat_prev_loaded, - &prev_ps->rx_oversize, &cur_ps->rx_oversize); +static void ice_rx_dim_work(struct work_struct *work) +{ + struct ice_ring_container *rc; + struct dim *dim; + u16 itr; - ice_stat_update32(hw, GLPRT_RJC(port), pf->stat_prev_loaded, - &prev_ps->rx_jabber, &cur_ps->rx_jabber); + dim = container_of(work, struct dim, work); + rc = (struct ice_ring_container *)dim->priv; - pf->stat_prev_loaded = true; -} + WARN_ON(dim->profile_ix >= ARRAY_SIZE(rx_profile)); -/** - * ice_get_stats64 - get statistics for network device structure - * @netdev: network interface device structure - * @stats: main device statistics structure - */ -static -void ice_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats) -{ - struct ice_netdev_priv *np = netdev_priv(netdev); - struct rtnl_link_stats64 *vsi_stats; - struct ice_vsi *vsi = np->vsi; + /* look up the values in our local table */ + itr = rx_profile[dim->profile_ix].itr; - vsi_stats = &vsi->net_stats; + ice_trace(rx_dim_work, container_of(rc, struct ice_q_vector, rx), dim); + ice_write_itr(rc, itr); - if (!vsi->num_txq || !vsi->num_rxq) - return; + dim->state = DIM_START_MEASURE; +} - /* netdev packet/byte stats come from ring counter. These are obtained - * by summing up ring counters (done by ice_update_vsi_ring_stats). - * But, only call the update routine and read the registers if VSI is - * not down. 
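The ints/s figures in the profile comments above follow from simple arithmetic, sketched below as a stand-alone helper (not part of the patch): an ITR value is the minimum spacing between interrupts in microseconds, so it allows at most 1,000,000/itr interrupts per second, and the hard-coded 250,000 ints/s interrupt rate limit (INTRL) mentioned in the theory-of-moderation comment caps the result.

#include <stdio.h>

#define USEC_PER_SEC  1000000u
#define ICE_INTRL_CAP  250000u	/* per the "THEORY OF MODERATION" comment */

static unsigned int itr_to_max_ints_per_sec(unsigned int itr_usecs)
{
	unsigned int rate;

	if (!itr_usecs)
		return ICE_INTRL_CAP;	/* no ITR throttling, INTRL still caps */

	rate = USEC_PER_SEC / itr_usecs;
	return rate < ICE_INTRL_CAP ? rate : ICE_INTRL_CAP;
}

int main(void)
{
	/* the Rx profile values from the table above */
	static const unsigned int rx_itr[] = { 2, 8, 16, 62, 126 };
	unsigned int i;

	for (i = 0; i < sizeof(rx_itr) / sizeof(rx_itr[0]); i++)
		printf("itr %3u us -> up to %6u ints/s\n",
		       rx_itr[i], itr_to_max_ints_per_sec(rx_itr[i]));
	return 0;
}

For itr = 2 us the raw ceiling is 500,000 ints/s, which INTRL clamps to 250,000, and itr = 126 us works out to 7,936 ints/s, matching the profile comments.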
- */ - if (!test_bit(__ICE_DOWN, vsi->state)) - ice_update_vsi_ring_stats(vsi); - stats->tx_packets = vsi_stats->tx_packets; - stats->tx_bytes = vsi_stats->tx_bytes; - stats->rx_packets = vsi_stats->rx_packets; - stats->rx_bytes = vsi_stats->rx_bytes; +#define ICE_DIM_DEFAULT_PROFILE_IX 1 - /* The rest of the stats can be read from the hardware but instead we - * just return values that the watchdog task has already obtained from - * the hardware. - */ - stats->multicast = vsi_stats->multicast; - stats->tx_errors = vsi_stats->tx_errors; - stats->tx_dropped = vsi_stats->tx_dropped; - stats->rx_errors = vsi_stats->rx_errors; - stats->rx_dropped = vsi_stats->rx_dropped; - stats->rx_crc_errors = vsi_stats->rx_crc_errors; - stats->rx_length_errors = vsi_stats->rx_length_errors; +/** + * ice_init_moderation - set up interrupt moderation + * @q_vector: the vector containing rings to be configured + * + * Set up interrupt moderation registers, with the intent to do the right thing + * when called from reset or from probe, and whether or not dynamic moderation + * is enabled or not. Take special care to write all the registers in both + * dynamic mode or not in order to make sure hardware is in a known state. + */ +static void ice_init_moderation(struct ice_q_vector *q_vector) +{ + struct ice_ring_container *rc; + bool tx_dynamic, rx_dynamic; + + rc = &q_vector->tx; + INIT_WORK(&rc->dim.work, ice_tx_dim_work); + rc->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; + rc->dim.profile_ix = ICE_DIM_DEFAULT_PROFILE_IX; + rc->dim.priv = rc; + tx_dynamic = ITR_IS_DYNAMIC(rc); + + /* set the initial TX ITR to match the above */ + ice_write_itr(rc, tx_dynamic ? + tx_profile[rc->dim.profile_ix].itr : rc->itr_setting); + + rc = &q_vector->rx; + INIT_WORK(&rc->dim.work, ice_rx_dim_work); + rc->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; + rc->dim.profile_ix = ICE_DIM_DEFAULT_PROFILE_IX; + rc->dim.priv = rc; + rx_dynamic = ITR_IS_DYNAMIC(rc); + + /* set the initial RX ITR to match the above */ + ice_write_itr(rc, rx_dynamic ? 
rx_profile[rc->dim.profile_ix].itr : + rc->itr_setting); + + ice_set_q_vector_intrl(q_vector); } /** - * ice_napi_disable_all - Disable NAPI for all q_vectors in the VSI - * @vsi: VSI having NAPI disabled + * ice_napi_enable_all - Enable NAPI for all q_vectors in the VSI + * @vsi: the VSI being configured */ -static void ice_napi_disable_all(struct ice_vsi *vsi) +static void ice_napi_enable_all(struct ice_vsi *vsi) { int q_idx; @@ -3893,8 +7903,495 @@ static void ice_napi_disable_all(struct ice_vsi *vsi) ice_for_each_q_vector(vsi, q_idx) { struct ice_q_vector *q_vector = vsi->q_vectors[q_idx]; + ice_init_moderation(q_vector); + if (q_vector->rx.ring || q_vector->tx.ring) - napi_disable(&q_vector->napi); + napi_enable(&q_vector->napi); + } +} + +/** + * ice_up_complete - Finish the last steps of bringing up a connection + * @vsi: The VSI being configured + * + * Return 0 on success and negative value on error + */ +static int ice_up_complete(struct ice_vsi *vsi) +{ + struct ice_pf *pf = vsi->back; + int err; + + ice_vsi_cfg_msix(vsi); + + /* Enable only Rx rings, Tx rings were enabled by the FW when the + * Tx queue group list was configured and the context bits were + * programmed using ice_vsi_cfg_txqs + */ + err = ice_vsi_start_all_rx_rings(vsi); + if (err) + return err; + + + clear_bit(ICE_VSI_DOWN, vsi->state); + ice_napi_enable_all(vsi); + ice_vsi_ena_irq(vsi); + + if (vsi->port_info && + (vsi->port_info->phy.link_info.link_info & ICE_AQ_LINK_UP) && + vsi->netdev && vsi->type == ICE_VSI_PF) { + ice_print_link_msg(vsi, true); + netif_tx_start_all_queues(vsi->netdev); + netif_carrier_on(vsi->netdev); + if (!ice_is_e810(&pf->hw)) + ice_ptp_link_change(pf, pf->hw.pf_id, true); + } + + + + if (vsi->type == ICE_VSI_PF) + ice_service_task_schedule(pf); + + return 0; +} + +/** + * ice_up - Bring the connection back up after being down + * @vsi: VSI being configured + */ +int ice_up(struct ice_vsi *vsi) +{ + int err; + + err = ice_vsi_cfg(vsi); + if (!err) + err = ice_up_complete(vsi); + + return err; +} + +/** + * ice_fetch_u64_stats_per_ring - get packets and bytes stats per ring + * @ring: Tx or Rx ring to read stats from + * @pkts: packets stats counter + * @bytes: bytes stats counter + * + * This function fetches stats from the ring considering the atomic operations + * that needs to be performed to read u64 values in 32 bit machine. 
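For context on the kernel-doc just above: ice_fetch_u64_stats_per_ring() is only the reader half of the u64_stats_sync scheme. The producer half is not in this hunk; a minimal sketch of what it has to look like is below, assuming the driver's existing ring->syncp and ring->stats fields and <linux/u64_stats_sync.h> (the helper name is invented).

/* sketch only: bump the ring counters inside a u64_stats write section so
 * that 32-bit readers using u64_stats_fetch_begin_irq()/retry_irq(), as in
 * ice_fetch_u64_stats_per_ring() below, retry instead of seeing torn u64s
 */
static inline void
ice_ring_stats_add(struct ice_ring *ring, u64 pkts, u64 bytes)
{
	u64_stats_update_begin(&ring->syncp);
	ring->stats.pkts += pkts;
	ring->stats.bytes += bytes;
	u64_stats_update_end(&ring->syncp);
}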
+ */ +static void +ice_fetch_u64_stats_per_ring(struct ice_ring *ring, u64 *pkts, u64 *bytes) +{ + unsigned int start; + *pkts = 0; + *bytes = 0; + + if (!ring) + return; + do { + start = u64_stats_fetch_begin_irq(&ring->syncp); + *pkts = ring->stats.pkts; + *bytes = ring->stats.bytes; + } while (u64_stats_fetch_retry_irq(&ring->syncp, start)); +} + +/** + * ice_update_vsi_tx_ring_stats - Update VSI Tx ring stats counters + * @vsi: the VSI to be updated + * @rings: rings to work on + * @count: number of rings + */ +static void +ice_update_vsi_tx_ring_stats(struct ice_vsi *vsi, struct ice_ring **rings, + u16 count) +{ + struct rtnl_link_stats64 *vsi_stats = &vsi->net_stats; + u16 i; + + for (i = 0; i < count; i++) { + struct ice_ring *ring; + u64 pkts, bytes; + + ring = READ_ONCE(rings[i]); + ice_fetch_u64_stats_per_ring(ring, &pkts, &bytes); + vsi_stats->tx_packets += pkts; + vsi_stats->tx_bytes += bytes; + vsi->tx_restart += ring->tx_stats.restart_q; + vsi->tx_busy += ring->tx_stats.tx_busy; + vsi->tx_linearize += ring->tx_stats.tx_linearize; + } +} + +/** + * ice_update_vsi_ring_stats - Update VSI stats counters + * @vsi: the VSI to be updated + */ +static void ice_update_vsi_ring_stats(struct ice_vsi *vsi) +{ + struct rtnl_link_stats64 *vsi_stats = &vsi->net_stats; + u64 pkts, bytes; + int i; + + /* reset netdev stats */ + vsi_stats->tx_packets = 0; + vsi_stats->tx_bytes = 0; + vsi_stats->rx_packets = 0; + vsi_stats->rx_bytes = 0; + + /* reset non-netdev (extended) stats */ + vsi->tx_restart = 0; + vsi->tx_busy = 0; + vsi->tx_linearize = 0; + vsi->rx_buf_failed = 0; + vsi->rx_page_failed = 0; +#ifdef ICE_ADD_PROBES + vsi->rx_page_reuse = 0; +#endif /* ICE_ADD_PROBES */ + + rcu_read_lock(); + + /* update Tx rings counters */ + ice_update_vsi_tx_ring_stats(vsi, vsi->tx_rings, vsi->num_txq); + + /* update Rx rings counters */ + ice_for_each_rxq(vsi, i) { + struct ice_ring *ring = READ_ONCE(vsi->rx_rings[i]); + + ice_fetch_u64_stats_per_ring(ring, &pkts, &bytes); + vsi_stats->rx_packets += pkts; + vsi_stats->rx_bytes += bytes; + vsi->rx_buf_failed += ring->rx_stats.alloc_buf_failed; + vsi->rx_page_failed += ring->rx_stats.alloc_page_failed; +#ifdef ICE_ADD_PROBES + vsi->rx_page_reuse += ring->rx_stats.page_reuse; +#endif /* ICE_ADD_PROBES */ + } + +#ifdef HAVE_XDP_SUPPORT + /* update XDP Tx rings counters */ + if (ice_is_xdp_ena_vsi(vsi)) + ice_update_vsi_tx_ring_stats(vsi, vsi->xdp_rings, + vsi->num_xdp_txq); + +#endif /* HAVE_XDP_SUPPORT */ + rcu_read_unlock(); +} + +/** + * ice_update_vsi_stats - Update VSI stats counters + * @vsi: the VSI to be updated + */ +void ice_update_vsi_stats(struct ice_vsi *vsi) +{ + struct rtnl_link_stats64 *cur_ns = &vsi->net_stats; + struct ice_eth_stats *cur_es = &vsi->eth_stats; + struct ice_pf *pf = vsi->back; + + if (test_bit(ICE_VSI_DOWN, vsi->state) || + test_bit(ICE_CFG_BUSY, pf->state)) + return; + + /* get stats as recorded by Tx/Rx rings */ + ice_update_vsi_ring_stats(vsi); + + /* get VSI stats as recorded by the hardware */ + ice_update_eth_stats(vsi); + + cur_ns->tx_errors = cur_es->tx_errors; + cur_ns->rx_dropped = cur_es->rx_discards; + cur_ns->tx_dropped = cur_es->tx_discards; + cur_ns->multicast = cur_es->rx_multicast; + + /* update some more netdev stats if this is main VSI */ + if (vsi->type == ICE_VSI_PF) { + cur_ns->rx_crc_errors = pf->stats.crc_errors; + cur_ns->rx_errors = pf->stats.crc_errors + + pf->stats.illegal_bytes + + pf->stats.rx_len_errors + + pf->stats.rx_undersize + + pf->hw_csum_rx_error + + pf->stats.rx_jabber + + 
pf->stats.rx_fragments + + pf->stats.rx_oversize; + cur_ns->rx_length_errors = pf->stats.rx_len_errors; + /* record drops from the port level */ + cur_ns->rx_missed_errors = pf->stats.eth.rx_discards; + } +} + +/** + * ice_update_pf_stats - Update PF port stats counters + * @pf: PF whose stats needs to be updated + */ +void ice_update_pf_stats(struct ice_pf *pf) +{ + struct ice_hw_port_stats *prev_ps, *cur_ps; + struct ice_hw *hw = &pf->hw; + u16 fd_ctr_base; + u8 port; + + port = hw->port_info->lport; + prev_ps = &pf->stats_prev; + cur_ps = &pf->stats; + + ice_stat_update40(hw, GLPRT_GORCL(port), pf->stat_prev_loaded, + &prev_ps->eth.rx_bytes, + &cur_ps->eth.rx_bytes); + + ice_stat_update40(hw, GLPRT_UPRCL(port), pf->stat_prev_loaded, + &prev_ps->eth.rx_unicast, + &cur_ps->eth.rx_unicast); + + ice_stat_update40(hw, GLPRT_MPRCL(port), pf->stat_prev_loaded, + &prev_ps->eth.rx_multicast, + &cur_ps->eth.rx_multicast); + + ice_stat_update40(hw, GLPRT_BPRCL(port), pf->stat_prev_loaded, + &prev_ps->eth.rx_broadcast, + &cur_ps->eth.rx_broadcast); + + ice_stat_update32(hw, PRTRPB_RDPC, pf->stat_prev_loaded, + &prev_ps->eth.rx_discards, + &cur_ps->eth.rx_discards); + + ice_stat_update40(hw, GLPRT_GOTCL(port), pf->stat_prev_loaded, + &prev_ps->eth.tx_bytes, + &cur_ps->eth.tx_bytes); + + ice_stat_update40(hw, GLPRT_UPTCL(port), pf->stat_prev_loaded, + &prev_ps->eth.tx_unicast, + &cur_ps->eth.tx_unicast); + + ice_stat_update40(hw, GLPRT_MPTCL(port), pf->stat_prev_loaded, + &prev_ps->eth.tx_multicast, + &cur_ps->eth.tx_multicast); + + ice_stat_update40(hw, GLPRT_BPTCL(port), pf->stat_prev_loaded, + &prev_ps->eth.tx_broadcast, + &cur_ps->eth.tx_broadcast); + + ice_stat_update32(hw, GLPRT_TDOLD(port), pf->stat_prev_loaded, + &prev_ps->tx_dropped_link_down, + &cur_ps->tx_dropped_link_down); + + ice_stat_update40(hw, GLPRT_PRC64L(port), pf->stat_prev_loaded, + &prev_ps->rx_size_64, &cur_ps->rx_size_64); + + ice_stat_update40(hw, GLPRT_PRC127L(port), pf->stat_prev_loaded, + &prev_ps->rx_size_127, &cur_ps->rx_size_127); + + ice_stat_update40(hw, GLPRT_PRC255L(port), pf->stat_prev_loaded, + &prev_ps->rx_size_255, &cur_ps->rx_size_255); + + ice_stat_update40(hw, GLPRT_PRC511L(port), pf->stat_prev_loaded, + &prev_ps->rx_size_511, &cur_ps->rx_size_511); + + ice_stat_update40(hw, GLPRT_PRC1023L(port), pf->stat_prev_loaded, + &prev_ps->rx_size_1023, &cur_ps->rx_size_1023); + + ice_stat_update40(hw, GLPRT_PRC1522L(port), pf->stat_prev_loaded, + &prev_ps->rx_size_1522, &cur_ps->rx_size_1522); + + ice_stat_update40(hw, GLPRT_PRC9522L(port), pf->stat_prev_loaded, + &prev_ps->rx_size_big, &cur_ps->rx_size_big); + + ice_stat_update40(hw, GLPRT_PTC64L(port), pf->stat_prev_loaded, + &prev_ps->tx_size_64, &cur_ps->tx_size_64); + + ice_stat_update40(hw, GLPRT_PTC127L(port), pf->stat_prev_loaded, + &prev_ps->tx_size_127, &cur_ps->tx_size_127); + + ice_stat_update40(hw, GLPRT_PTC255L(port), pf->stat_prev_loaded, + &prev_ps->tx_size_255, &cur_ps->tx_size_255); + + ice_stat_update40(hw, GLPRT_PTC511L(port), pf->stat_prev_loaded, + &prev_ps->tx_size_511, &cur_ps->tx_size_511); + + ice_stat_update40(hw, GLPRT_PTC1023L(port), pf->stat_prev_loaded, + &prev_ps->tx_size_1023, &cur_ps->tx_size_1023); + + ice_stat_update40(hw, GLPRT_PTC1522L(port), pf->stat_prev_loaded, + &prev_ps->tx_size_1522, &cur_ps->tx_size_1522); + + ice_stat_update40(hw, GLPRT_PTC9522L(port), pf->stat_prev_loaded, + &prev_ps->tx_size_big, &cur_ps->tx_size_big); + + fd_ctr_base = hw->fd_ctr_base; + + ice_stat_update40(hw, + 
GLSTAT_FD_CNT0L(ICE_FD_SB_STAT_IDX(fd_ctr_base)), + pf->stat_prev_loaded, &prev_ps->fd_sb_match, + &cur_ps->fd_sb_match); +#ifdef ADQ_PERF_COUNTERS + ice_stat_update40(hw, + GLSTAT_FD_CNT0L(ICE_FD_CH_STAT_IDX(fd_ctr_base)), + pf->stat_prev_loaded, &prev_ps->ch_atr_match, + &cur_ps->ch_atr_match); +#endif /* ADQ_PERF_COUNTERS */ +#ifdef ICE_ADD_PROBES + ice_stat_update40(hw, + GLSTAT_FD_CNT0L(ICE_ARFS_STAT_TCPV4_IDX(fd_ctr_base)), + pf->stat_prev_loaded, &prev_ps->arfs_tcpv4_match, + &cur_ps->arfs_tcpv4_match); + ice_stat_update40(hw, + GLSTAT_FD_CNT0L(ICE_ARFS_STAT_TCPV6_IDX(fd_ctr_base)), + pf->stat_prev_loaded, &prev_ps->arfs_tcpv6_match, + &cur_ps->arfs_tcpv6_match); + ice_stat_update40(hw, + GLSTAT_FD_CNT0L(ICE_ARFS_STAT_UDPV4_IDX(fd_ctr_base)), + pf->stat_prev_loaded, &prev_ps->arfs_udpv4_match, + &cur_ps->arfs_udpv4_match); + ice_stat_update40(hw, + GLSTAT_FD_CNT0L(ICE_ARFS_STAT_UDPV6_IDX(fd_ctr_base)), + pf->stat_prev_loaded, &prev_ps->arfs_udpv6_match, + &cur_ps->arfs_udpv6_match); +#endif /* ICE_ADD_PROBES */ + ice_stat_update32(hw, GLPRT_LXONRXC(port), pf->stat_prev_loaded, + &prev_ps->link_xon_rx, &cur_ps->link_xon_rx); + + ice_stat_update32(hw, GLPRT_LXOFFRXC(port), pf->stat_prev_loaded, + &prev_ps->link_xoff_rx, &cur_ps->link_xoff_rx); + + ice_stat_update32(hw, GLPRT_LXONTXC(port), pf->stat_prev_loaded, + &prev_ps->link_xon_tx, &cur_ps->link_xon_tx); + + ice_stat_update32(hw, GLPRT_LXOFFTXC(port), pf->stat_prev_loaded, + &prev_ps->link_xoff_tx, &cur_ps->link_xoff_tx); + + ice_update_dcb_stats(pf); + + ice_stat_update32(hw, GLPRT_CRCERRS(port), pf->stat_prev_loaded, + &prev_ps->crc_errors, &cur_ps->crc_errors); + + ice_stat_update32(hw, GLPRT_ILLERRC(port), pf->stat_prev_loaded, + &prev_ps->illegal_bytes, &cur_ps->illegal_bytes); + + ice_stat_update32(hw, GLPRT_MLFC(port), pf->stat_prev_loaded, + &prev_ps->mac_local_faults, + &cur_ps->mac_local_faults); + + ice_stat_update32(hw, GLPRT_MRFC(port), pf->stat_prev_loaded, + &prev_ps->mac_remote_faults, + &cur_ps->mac_remote_faults); + + ice_stat_update32(hw, GLPRT_RLEC(port), pf->stat_prev_loaded, + &prev_ps->rx_len_errors, &cur_ps->rx_len_errors); + + ice_stat_update32(hw, GLPRT_RUC(port), pf->stat_prev_loaded, + &prev_ps->rx_undersize, &cur_ps->rx_undersize); + + ice_stat_update32(hw, GLPRT_RFC(port), pf->stat_prev_loaded, + &prev_ps->rx_fragments, &cur_ps->rx_fragments); + + ice_stat_update32(hw, GLPRT_ROC(port), pf->stat_prev_loaded, + &prev_ps->rx_oversize, &cur_ps->rx_oversize); + + ice_stat_update32(hw, GLPRT_RJC(port), pf->stat_prev_loaded, + &prev_ps->rx_jabber, &cur_ps->rx_jabber); + + + cur_ps->fd_sb_status = test_bit(ICE_FLAG_FD_ENA, pf->flags) ? 1 : 0; + + pf->stat_prev_loaded = true; +} + +/** + * ice_get_stats64 - get statistics for network device structure + * @netdev: network interface device structure + * @stats: main device statistics structure + */ +static +#ifdef HAVE_VOID_NDO_GET_STATS64 +void ice_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats) +#else /* HAVE_VOID_NDO_GET_STATS64 */ +struct rtnl_link_stats64 * +ice_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats) +#endif /* !HAVE_VOID_NDO_GET_STATS64 */ +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct rtnl_link_stats64 *vsi_stats; + struct ice_vsi *vsi = np->vsi; + + vsi_stats = &vsi->net_stats; + + if (!vsi->num_txq || !vsi->num_rxq) +#ifdef HAVE_VOID_NDO_GET_STATS64 + return; +#else + return stats; +#endif + + /* netdev packet/byte stats come from ring counter. 
These are obtained + * by summing up ring counters (done by ice_update_vsi_ring_stats). + * But, only call the update routine and read the registers if VSI is + * not down. + */ + if (!test_bit(ICE_VSI_DOWN, vsi->state)) + ice_update_vsi_ring_stats(vsi); + stats->tx_packets = vsi_stats->tx_packets; + stats->tx_bytes = vsi_stats->tx_bytes; + stats->rx_packets = vsi_stats->rx_packets; + stats->rx_bytes = vsi_stats->rx_bytes; + + /* The rest of the stats can be read from the hardware but instead we + * just return values that the watchdog task has already obtained from + * the hardware. + */ + stats->multicast = vsi_stats->multicast; + stats->tx_errors = vsi_stats->tx_errors; + stats->tx_dropped = vsi_stats->tx_dropped; + stats->rx_errors = vsi_stats->rx_errors; + stats->rx_dropped = vsi_stats->rx_dropped; + stats->rx_crc_errors = vsi_stats->rx_crc_errors; + stats->rx_length_errors = vsi_stats->rx_length_errors; +#ifndef HAVE_VOID_NDO_GET_STATS64 + + return stats; +#endif +} + +#ifdef HAVE_NETPOLL_CONTROLLER +#ifdef CONFIG_NET_POLL_CONTROLLER +/** + * ice_netpoll - polling "interrupt" handler + * @netdev: network interface device structure + * + * Used by netconsole to send skbs without having to re-enable interrupts. + * This is not called in the normal interrupt path. + */ +static void ice_netpoll(struct net_device *netdev) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + int i; + + if (test_bit(ICE_VSI_DOWN, vsi->state)) + return; + + ice_for_each_q_vector(vsi, i) + ice_msix_clean_rings(0, vsi->q_vectors[i]); +} +#endif /* CONFIG_NET_POLL_CONTROLLER */ +#endif /* HAVE_NETPOLL_CONTROLLER */ + +/** + * ice_napi_disable_all - Disable NAPI for all q_vectors in the VSI + * @vsi: VSI having NAPI disabled + */ +static void ice_napi_disable_all(struct ice_vsi *vsi) +{ + int q_idx; + + if (!vsi->netdev) + return; + + ice_for_each_q_vector(vsi, q_idx) { + struct ice_q_vector *q_vector = vsi->q_vectors[q_idx]; + + if (q_vector->rx.ring || q_vector->tx.ring) + napi_disable(&q_vector->napi); + + cancel_work_sync(&q_vector->tx.dim.work); + cancel_work_sync(&q_vector->rx.dim.work); } } @@ -3902,827 +8399,2967 @@ static void ice_napi_disable_all(struct ice_vsi *vsi) * ice_down - Shutdown the connection * @vsi: The VSI being stopped */ -int ice_down(struct ice_vsi *vsi) +int ice_down(struct ice_vsi *vsi) +{ + int link_err = 0, vlan_err = 0; + int i, tx_err, rx_err; + + /* Caller of this function is expected to set the + * vsi->state ICE_DOWN bit + */ + if (vsi->netdev && vsi->type == ICE_VSI_PF) { + vlan_err = ice_vsi_del_vlan_zero(vsi); + if (!ice_is_e810(&vsi->back->hw)) + ice_ptp_link_change(vsi->back, vsi->back->hw.pf_id, false); + netif_carrier_off(vsi->netdev); + netif_tx_disable(vsi->netdev); + } else if (vsi->type == ICE_VSI_SWITCHDEV_CTRL) { + ice_eswitch_stop_all_tx_queues(vsi->back); + } + + + ice_vsi_dis_irq(vsi); + + tx_err = ice_vsi_stop_lan_tx_rings(vsi, ICE_NO_RESET, 0); + if (tx_err) + netdev_err(vsi->netdev, "Failed stop Tx rings, VSI %d error %d\n", + vsi->vsi_num, tx_err); +#ifdef HAVE_XDP_SUPPORT + if (!tx_err && ice_is_xdp_ena_vsi(vsi)) { + tx_err = ice_vsi_stop_xdp_tx_rings(vsi); + if (tx_err) + netdev_err(vsi->netdev, "Failed stop XDP rings, VSI %d error %d\n", + vsi->vsi_num, tx_err); + } +#endif /* HAVE_XDP_SUPPORT */ + + rx_err = ice_vsi_stop_all_rx_rings(vsi); + if (rx_err) + netdev_err(vsi->netdev, "Failed stop Rx rings, VSI %d error %d\n", + vsi->vsi_num, rx_err); + + ice_napi_disable_all(vsi); + + if 
(test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, vsi->back->flags)) { + link_err = ice_force_phys_link_state(vsi, false); + if (link_err) + netdev_err(vsi->netdev, "Failed to set physical link down, VSI %d error %d\n", + vsi->vsi_num, link_err); + } + + ice_for_each_txq(vsi, i) + ice_clean_tx_ring(vsi->tx_rings[i]); + + ice_for_each_rxq(vsi, i) + ice_clean_rx_ring(vsi->rx_rings[i]); + + if (tx_err || rx_err || link_err || vlan_err) { + netdev_err(vsi->netdev, "Failed to close VSI 0x%04X on switch 0x%04X\n", + vsi->vsi_num, vsi->vsw->sw_id); + return -EIO; + } + + return 0; +} + +/** + * ice_vsi_setup_tx_rings - Allocate VSI Tx queue resources + * @vsi: VSI having resources allocated + * + * Return 0 on success, negative on failure + */ +int ice_vsi_setup_tx_rings(struct ice_vsi *vsi) +{ + int i, err = 0; + + if (!vsi->num_txq) { + dev_err(ice_pf_to_dev(vsi->back), "VSI %d has 0 Tx queues\n", + vsi->vsi_num); + return -EINVAL; + } + + ice_for_each_txq(vsi, i) { + struct ice_ring *ring = vsi->tx_rings[i]; + + if (!ring) + return -EINVAL; + if (vsi->netdev) + ring->netdev = vsi->netdev; + err = ice_setup_tx_ring(ring); + if (err) + break; + } + + return err; +} + +/** + * ice_vsi_setup_rx_rings - Allocate VSI Rx queue resources + * @vsi: VSI having resources allocated + * + * Return 0 on success, negative on failure + */ +int ice_vsi_setup_rx_rings(struct ice_vsi *vsi) +{ + int i, err = 0; + + if (!vsi->num_rxq) { + dev_err(ice_pf_to_dev(vsi->back), "VSI %d has 0 Rx queues\n", + vsi->vsi_num); + return -EINVAL; + } + + ice_for_each_rxq(vsi, i) { + struct ice_ring *ring = vsi->rx_rings[i]; + + if (!ring) + return -EINVAL; + if (vsi->netdev) + ring->netdev = vsi->netdev; + err = ice_setup_rx_ring(ring); + if (err) + break; + } + + return err; +} + +/** + * ice_vsi_open_ctrl - open control VSI for use + * @vsi: the VSI to open + * + * Initialization of the Control VSI + * + * Returns 0 on success, negative value on error + */ +int ice_vsi_open_ctrl(struct ice_vsi *vsi) +{ + char int_name[ICE_INT_NAME_STR_LEN]; + struct ice_pf *pf = vsi->back; + struct device *dev; + int err; + + dev = ice_pf_to_dev(pf); + /* allocate descriptors */ + err = ice_vsi_setup_tx_rings(vsi); + if (err) + goto err_setup_tx; + + err = ice_vsi_setup_rx_rings(vsi); + if (err) + goto err_setup_rx; + + err = ice_vsi_cfg(vsi); + if (err) + goto err_setup_rx; + + snprintf(int_name, sizeof(int_name) - 1, "%s-%s:ctrl", + dev_driver_string(dev), dev_name(dev)); + err = ice_vsi_req_irq_msix(vsi, int_name); + if (err) + goto err_setup_rx; + + ice_vsi_cfg_msix(vsi); + + err = ice_vsi_start_all_rx_rings(vsi); + if (err) + goto err_up_complete; + + clear_bit(ICE_VSI_DOWN, vsi->state); + ice_vsi_ena_irq(vsi); + + return 0; + +err_up_complete: + ice_down(vsi); +err_setup_rx: + ice_vsi_free_rx_rings(vsi); +err_setup_tx: + ice_vsi_free_tx_rings(vsi); + + return err; +} + +/** + * ice_vsi_open - Called when a network interface is made active + * @vsi: the VSI to open + * + * Initialization of the VSI + * + * Returns 0 on success, negative value on error + */ +int ice_vsi_open(struct ice_vsi *vsi) +{ + char int_name[ICE_INT_NAME_STR_LEN]; + struct ice_pf *pf = vsi->back; + int err; + + /* allocate descriptors */ + err = ice_vsi_setup_tx_rings(vsi); + if (err) + goto err_setup_tx; + + err = ice_vsi_setup_rx_rings(vsi); + if (err) + goto err_setup_rx; + + err = ice_vsi_cfg(vsi); + if (err) + goto err_setup_rx; + + snprintf(int_name, sizeof(int_name) - 1, "%s-%s", + dev_driver_string(ice_pf_to_dev(pf)), vsi->netdev->name); + err = 
ice_vsi_req_irq_msix(vsi, int_name); + if (err) + goto err_setup_rx; + + if (vsi->type == ICE_VSI_PF) { +#ifdef HAVE_NETDEV_SB_DEV + unsigned int total_qs = vsi->num_txq; + + if (test_bit(ICE_FLAG_MACVLAN_ENA, pf->flags)) + total_qs = vsi->alloc_txq + pf->max_num_macvlan; + + /* Notify the stack of the actual queue counts. */ + err = netif_set_real_num_tx_queues(vsi->netdev, total_qs); +#else + /* Notify the stack of the actual queue counts. */ + err = netif_set_real_num_tx_queues(vsi->netdev, vsi->num_txq); +#endif /* HAVE_NETDEV_SB_DEV */ + if (err) + goto err_set_qs; + + err = netif_set_real_num_rx_queues(vsi->netdev, vsi->num_rxq); + if (err) + goto err_set_qs; + } + + err = ice_up_complete(vsi); + if (err) + goto err_up_complete; + return 0; + +err_up_complete: + ice_down(vsi); +err_set_qs: + ice_vsi_free_irq(vsi); +err_setup_rx: + ice_vsi_free_rx_rings(vsi); +err_setup_tx: + ice_vsi_free_tx_rings(vsi); + + return err; +} + +/** + * ice_vsi_release_all - Delete all VSIs + * @pf: PF from which all VSIs are being removed + */ +static void ice_vsi_release_all(struct ice_pf *pf) +{ + int err, i; + + if (!pf->vsi) + return; + + ice_for_each_vsi(pf, i) { + if (!pf->vsi[i]) + continue; + + if (pf->vsi[i]->type == ICE_VSI_CHNL) + continue; + + err = ice_vsi_release(pf->vsi[i]); + if (err) + dev_dbg(ice_pf_to_dev(pf), "Failed to release pf->vsi[%d], err %d, vsi_num = %d\n", + i, err, pf->vsi[i]->vsi_num); + } +} + +/** + * ice_vsi_rebuild_by_type - Rebuild VSI of a given type + * @pf: pointer to the PF instance + * @type: VSI type to rebuild + * + * Iterates through the pf->vsi array and rebuilds VSIs of the requested type + */ +static int ice_vsi_rebuild_by_type(struct ice_pf *pf, enum ice_vsi_type type) +{ + struct device *dev = ice_pf_to_dev(pf); + enum ice_status status; + int i, err; + + ice_for_each_vsi(pf, i) { + struct ice_vsi *vsi = pf->vsi[i]; + + if (!vsi || vsi->type != type) + continue; + + /* rebuild the VSI */ + err = ice_vsi_rebuild(vsi, true); + if (err) { + dev_err(dev, "rebuild VSI failed, err %d, VSI index %d, type %s\n", + err, vsi->idx, ice_vsi_type_str(type)); + return err; + } + + /* replay filters for the VSI */ + status = ice_replay_vsi(&pf->hw, vsi->idx); + if (status) { + dev_err(dev, "replay VSI failed, status %s, VSI index %d, type %s\n", + ice_stat_str(status), vsi->idx, + ice_vsi_type_str(type)); + return -EIO; + } + + /* Re-map HW VSI number, using VSI handle that has been + * previously validated in ice_replay_vsi() call above + */ + vsi->vsi_num = ice_get_hw_vsi_num(&pf->hw, vsi->idx); + + /* enable the VSI */ + err = ice_ena_vsi(vsi, false); + if (err) { + dev_err(dev, "enable VSI failed, err %d, VSI index %d, type %s\n", + err, vsi->idx, ice_vsi_type_str(type)); + return err; + } + + dev_info(dev, "VSI rebuilt. 
VSI index %d, type %s\n", vsi->idx, + ice_vsi_type_str(type)); + } + + return 0; +} + +/** + * ice_update_pf_netdev_link - Update PF netdev link status + * @pf: pointer to the PF instance + */ +static void ice_update_pf_netdev_link(struct ice_pf *pf) +{ + bool link_up; + int i; + + ice_for_each_vsi(pf, i) { + struct ice_vsi *vsi = pf->vsi[i]; + + if (!vsi || vsi->type != ICE_VSI_PF) + return; + + ice_get_link_status(pf->vsi[i]->port_info, &link_up); + if (link_up) { + netif_carrier_on(pf->vsi[i]->netdev); + netif_tx_wake_all_queues(pf->vsi[i]->netdev); + } else { + netif_carrier_off(pf->vsi[i]->netdev); + netif_tx_stop_all_queues(pf->vsi[i]->netdev); + } + } +} + +/** + * ice_rebuild - rebuild after reset + * @pf: PF to rebuild + * @reset_type: type of reset + * + * Do not rebuild VF VSI in this flow because that is already handled via + * ice_reset_all_vfs(). This is because requirements for resetting a VF after a + * PFR/CORER/GLOBER/etc. are different than the normal flow. Also, we don't want + * to reset/rebuild all the VF VSI twice. + */ +static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) +{ + struct ice_fwlog_user_input user_input = { 0 }; + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status ret; + bool dvm; + int err; + + if (test_bit(ICE_DOWN, pf->state)) + goto clear_recovery; + + dev_dbg(dev, "rebuilding PF after reset_type=%d\n", reset_type); + +#define ICE_EMP_RESET_SLEEP 5000 + if (reset_type == ICE_RESET_EMPR) + msleep(ICE_EMP_RESET_SLEEP); + + + ret = ice_init_all_ctrlq(hw); + if (ret) { + dev_err(dev, "control queues init failed %s\n", + ice_stat_str(ret)); + goto err_init_ctrlq; + } + + user_input.log_level = fwlog_level; + user_input.events = fwlog_events; + if (!ice_pf_fwlog_set(pf, &user_input)) { + if (ice_fwlog_register(hw)) + dev_dbg(dev, "Failed to register for FW logging events\n"); + } else { + dev_dbg(dev, "Failed to re-enable FW logging\n"); + } + /* if DDP was previously loaded successfully */ + if (!ice_is_safe_mode(pf)) { + /* reload the SW DB of filter tables */ + if (reset_type == ICE_RESET_PFR) { + ice_fill_blk_tbls(hw); + } else { + /* Reload DDP Package after CORER/GLOBR reset */ + ice_load_pkg(NULL, pf); + + /* check if package reloaded */ + if (ice_is_safe_mode(pf)) { + dev_err(dev, "failed to reload DDP Package\n"); + if (ice_prepare_for_safe_mode(pf)) { + dev_err(dev, "could not transition to safe mode\n"); + goto err_init_ctrlq; + } + } + } + } + + ret = ice_clear_pf_cfg(hw); + if (ret) { + dev_err(dev, "clear PF configuration failed %s\n", + ice_stat_str(ret)); + goto err_init_ctrlq; + } + + if (pf->first_sw->dflt_vsi_ena) + dev_info(dev, "Clearing default VSI, re-enable after reset completes\n"); + /* clear the default VSI configuration if it exists */ + pf->first_sw->dflt_vsi = NULL; + pf->first_sw->dflt_vsi_ena = false; + + ice_clear_pxe_mode(hw); + + ret = ice_init_nvm(hw); + if (ret) { + dev_err(dev, "ice_init_nvm failed %s\n", ice_stat_str(ret)); + goto err_init_ctrlq; + } + + ret = ice_get_caps(hw); + if (ret) { + dev_err(dev, "ice_get_caps failed %s\n", ice_stat_str(ret)); + goto err_init_ctrlq; + } + + ret = ice_aq_set_mac_cfg(hw, ICE_AQ_SET_MAC_FRAME_SIZE_MAX, NULL); + if (ret) { + dev_err(dev, "set_mac_cfg failed %s\n", ice_stat_str(ret)); + goto err_init_ctrlq; + } + + dvm = ice_is_dvm_ena(hw); + + err = ice_aq_set_port_params(pf->hw.port_info, 0, false, false, dvm, + NULL); + if (err) + goto err_init_ctrlq; + + err = ice_sched_init_port(hw->port_info); + if (err) + goto 
err_sched_init_port; + + + ice_pf_reset_stats(pf); + + /* start misc vector */ + err = ice_req_irq_msix_misc(pf); + if (err) { + dev_err(dev, "misc vector setup failed: %d\n", err); + goto err_sched_init_port; + } + + if (test_bit(ICE_FLAG_FD_ENA, pf->flags)) { + wr32(hw, PFQF_FD_ENA, PFQF_FD_ENA_FD_ENA_M); + if (!rd32(hw, PFQF_FD_SIZE)) { + u16 unused, guar, b_effort; + + guar = hw->func_caps.fd_fltr_guar; + b_effort = hw->func_caps.fd_fltr_best_effort; + + /* force guaranteed filter pool for PF */ + ice_alloc_fd_guar_item(hw, &unused, guar); + /* force shared filter pool for PF */ + ice_alloc_fd_shrd_item(hw, &unused, b_effort); + } + } + + if (test_bit(ICE_FLAG_DCB_ENA, pf->flags)) + ice_dcb_rebuild(pf); + + /* If the PF previously had enabled PTP, PTP init needs to happen before + * the VSI rebuild. If not, this causes the PTP link status events to + * fail. + */ + if (test_bit(ICE_FLAG_PTP_ENA, pf->flags)) + ice_ptp_init(pf); + + + /* rebuild PF VSI */ + err = ice_vsi_rebuild_by_type(pf, ICE_VSI_PF); + if (err) { + dev_err(dev, "PF VSI rebuild failed: %d\n", err); + goto err_vsi_rebuild; + } + if (ice_is_peer_ena(pf)) { + struct ice_vsi *vsi = ice_get_main_vsi(pf); + + if (!vsi) { + dev_err(dev, "No PF_VSI to update peer\n"); + goto err_vsi_rebuild; + } + ice_for_each_peer(pf, vsi, ice_peer_update_vsi); + } +#ifdef HAVE_NETDEV_SB_DEV + if (test_bit(ICE_FLAG_MACVLAN_ENA, pf->flags)) { + struct ice_vsi *vsi; + + err = ice_vsi_rebuild_by_type(pf, ICE_VSI_OFFLOAD_MACVLAN); + if (err) { + dev_err(dev, "MACVLAN VSI rebuild failed: %d\n", err); + goto err_vsi_rebuild; + } + + vsi = ice_get_main_vsi(pf); + if (!vsi) { + dev_err(dev, "main VSI doesn't exist\n"); + goto err_vsi_rebuild; + } + + err = ice_init_macvlan(vsi, false); + if (err) { + dev_err(dev, "Failed to init macvlan\n"); + goto err_vsi_rebuild; + } + + ice_vsi_replay_macvlan(pf); + } +#endif /* HAVE_NETDEV_SB_DEV */ + + err = ice_vsi_rebuild_by_type(pf, ICE_VSI_SWITCHDEV_CTRL); + if (err) { + dev_err(dev, "Switchdev CTRL VSI rebuild failed: %d\n", err); + goto err_vsi_rebuild; + } + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + if (reset_type == ICE_RESET_PFR) { + err = ice_rebuild_channels(pf); + if (err) { + dev_err(dev, "failed to rebuild and replay ADQ VSIs, err %d\n", + err); + goto err_vsi_rebuild; + } + } +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + + /* If Flow Director is active */ + if (test_bit(ICE_FLAG_FD_ENA, pf->flags)) { + err = ice_vsi_rebuild_by_type(pf, ICE_VSI_CTRL); + if (err) { + dev_err(dev, "control VSI rebuild failed: %d\n", err); + goto err_vsi_rebuild; + } + + /* replay HW Flow Director recipes */ + if (hw->fdir_prof) + ice_fdir_replay_flows(hw); + + /* replay Flow Director filters */ + ice_fdir_replay_fltrs(pf); + + ice_rebuild_arfs(pf); + } + + + ice_update_pf_netdev_link(pf); + + ice_config_health_events(pf, true); + + /* tell the firmware we are up */ + ret = ice_send_version(pf); + if (ret) { + dev_err(dev, "Rebuild failed due to error sending driver version: %s\n", + ice_stat_str(ret)); + goto err_vsi_rebuild; + } + + ice_replay_post(hw); + /* if we get here, reset flow is successful */ + clear_bit(ICE_RESET_FAILED, pf->state); + return; + +err_vsi_rebuild: + +err_sched_init_port: + ice_sched_cleanup_all(hw); +err_init_ctrlq: + ice_shutdown_all_ctrlq(hw); + set_bit(ICE_RESET_FAILED, pf->state); +clear_recovery: + /* set this bit in PF state to control service task scheduling */ + set_bit(ICE_NEEDS_RESTART, pf->state); + dev_err(dev, "Rebuild failed, unload and reload driver\n"); +} + +#ifdef 
HAVE_XDP_SUPPORT +/** + * ice_max_xdp_frame_size - returns the maximum allowed frame size for XDP + * @vsi: Pointer to VSI structure + */ +static int ice_max_xdp_frame_size(struct ice_vsi *vsi) +{ + if (PAGE_SIZE >= 8192 || test_bit(ICE_FLAG_LEGACY_RX, vsi->back->flags)) + return ICE_RXBUF_2048 - XDP_PACKET_HEADROOM; + else + return ICE_RXBUF_3072; +} +#endif /* HAVE_XDP_SUPPORT */ + +/** + * ice_change_mtu - NDO callback to change the MTU + * @netdev: network interface device structure + * @new_mtu: new value for maximum frame size + * + * Returns 0 on success, negative on failure + */ +static int ice_change_mtu(struct net_device *netdev, int new_mtu) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + struct ice_event *event; + u8 count = 0; + int err = 0; + + if (new_mtu == (int)netdev->mtu) { + netdev_warn(netdev, "MTU is already %u\n", netdev->mtu); + return 0; + } +#ifdef HAVE_XDP_SUPPORT + + if (ice_is_xdp_ena_vsi(vsi)) { + int frame_size = ice_max_xdp_frame_size(vsi); + + if (new_mtu + ICE_ETH_PKT_HDR_PAD > frame_size) { + netdev_err(netdev, "max MTU for XDP usage is %d\n", + frame_size - ICE_ETH_PKT_HDR_PAD); + return -EINVAL; + } + } + +#endif /* HAVE_XDP_SUPPORT */ +#ifdef HAVE_NETDEVICE_MIN_MAX_MTU +#ifdef HAVE_RHEL7_EXTENDED_MIN_MAX_MTU + if (new_mtu < netdev->extended->min_mtu) { + netdev_err(netdev, "new MTU invalid. min_mtu is %d\n", + netdev->extended->min_mtu); + return -EINVAL; + } else if (new_mtu > netdev->extended->max_mtu) { + netdev_err(netdev, "new MTU invalid. max_mtu is %d\n", + netdev->extended->max_mtu); + return -EINVAL; + } +#else /* HAVE_RHEL7_EXTENDED_MIN_MAX_MTU */ + if (new_mtu < (int)netdev->min_mtu) { + netdev_err(netdev, "new MTU invalid. min_mtu is %d\n", + netdev->min_mtu); + return -EINVAL; + } else if (new_mtu > (int)netdev->max_mtu) { + netdev_err(netdev, "new MTU invalid. max_mtu is %d\n", + netdev->max_mtu); + return -EINVAL; + } +#endif /* HAVE_RHEL7_EXTENDED_MIN_MAX_MTU */ +#else /* HAVE_NETDEVICE_MIN_MAX_MTU */ + if (new_mtu < ETH_MIN_MTU) { + netdev_err(netdev, "new MTU invalid. min_mtu is %d\n", + ETH_MIN_MTU); + return -EINVAL; + } else if (new_mtu > ICE_MAX_MTU) { + netdev_err(netdev, "new MTU invalid. max_mtu is %d\n", + ICE_MAX_MTU); + return -EINVAL; + } +#endif /* HAVE_NETDEVICE_MIN_MAX_MTU */ + /* if a reset is in progress, wait for some time for it to complete */ + do { + if (ice_is_reset_in_progress(pf->state)) { + count++; + usleep_range(1000, 2000); + } else { + break; + } + + } while (count < 100); + + if (count == 100) { + netdev_err(netdev, "can't change MTU.
Device is busy\n"); + return -EBUSY; + } + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + netdev->mtu = (unsigned int)new_mtu; + + /* if VSI is up, bring it down and then back up */ + if (!test_and_set_bit(ICE_VSI_DOWN, vsi->state)) { + err = ice_down(vsi); + if (err) { + netdev_err(netdev, "change MTU if_down err %d\n", err); + goto free_event; + } + + err = ice_up(vsi); + if (err) { + netdev_err(netdev, "change MTU if_up err %d\n", err); + goto free_event; + } + } + + if (ice_is_safe_mode(pf)) + goto out; + + set_bit(ICE_EVENT_MTU_CHANGE, event->type); + event->reporter = NULL; + event->info.mtu = (u16)new_mtu; + ice_for_each_peer(pf, event, ice_peer_check_for_reg); + +out: + netdev_dbg(netdev, "changed MTU to %d\n", new_mtu); +free_event: + kfree(event); + return err; +} + +/** + * ice_do_ioctl - Access the hwtstamp interface + * @netdev: network interface device structure + * @ifr: interface request data + * @cmd: ioctl command + */ +static int ice_do_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_pf *pf = np->vsi->back; + + switch (cmd) { +#ifdef SIOCGHWTSTAMP + case SIOCGHWTSTAMP: + return ice_ptp_get_ts_config(pf, ifr); +#endif + case SIOCSHWTSTAMP: + return ice_ptp_set_ts_config(pf, ifr); + default: + return -EOPNOTSUPP; + } +} + +/** + * ice_aq_str - convert AQ err code to a string + * @aq_err: the AQ error code to convert + */ +const char *ice_aq_str(enum ice_aq_err aq_err) +{ + switch (aq_err) { + case ICE_AQ_RC_OK: + return "OK"; + case ICE_AQ_RC_EPERM: + return "ICE_AQ_RC_EPERM"; + case ICE_AQ_RC_ENOENT: + return "ICE_AQ_RC_ENOENT"; + case ICE_AQ_RC_ESRCH: + return "ICE_AQ_RC_ESRCH"; + case ICE_AQ_RC_EINTR: + return "ICE_AQ_RC_EINTR"; + case ICE_AQ_RC_EIO: + return "ICE_AQ_RC_EIO"; + case ICE_AQ_RC_ENXIO: + return "ICE_AQ_RC_ENXIO"; + case ICE_AQ_RC_E2BIG: + return "ICE_AQ_RC_E2BIG"; + case ICE_AQ_RC_EAGAIN: + return "ICE_AQ_RC_EAGAIN"; + case ICE_AQ_RC_ENOMEM: + return "ICE_AQ_RC_ENOMEM"; + case ICE_AQ_RC_EACCES: + return "ICE_AQ_RC_EACCES"; + case ICE_AQ_RC_EFAULT: + return "ICE_AQ_RC_EFAULT"; + case ICE_AQ_RC_EBUSY: + return "ICE_AQ_RC_EBUSY"; + case ICE_AQ_RC_EEXIST: + return "ICE_AQ_RC_EEXIST"; + case ICE_AQ_RC_EINVAL: + return "ICE_AQ_RC_EINVAL"; + case ICE_AQ_RC_ENOTTY: + return "ICE_AQ_RC_ENOTTY"; + case ICE_AQ_RC_ENOSPC: + return "ICE_AQ_RC_ENOSPC"; + case ICE_AQ_RC_ENOSYS: + return "ICE_AQ_RC_ENOSYS"; + case ICE_AQ_RC_ERANGE: + return "ICE_AQ_RC_ERANGE"; + case ICE_AQ_RC_EFLUSHED: + return "ICE_AQ_RC_EFLUSHED"; + case ICE_AQ_RC_BAD_ADDR: + return "ICE_AQ_RC_BAD_ADDR"; + case ICE_AQ_RC_EMODE: + return "ICE_AQ_RC_EMODE"; + case ICE_AQ_RC_EFBIG: + return "ICE_AQ_RC_EFBIG"; + case ICE_AQ_RC_ESBCOMP: + return "ICE_AQ_RC_ESBCOMP"; + case ICE_AQ_RC_ENOSEC: + return "ICE_AQ_RC_ENOSEC"; + case ICE_AQ_RC_EBADSIG: + return "ICE_AQ_RC_EBADSIG"; + case ICE_AQ_RC_ESVN: + return "ICE_AQ_RC_ESVN"; + case ICE_AQ_RC_EBADMAN: + return "ICE_AQ_RC_EBADMAN"; + case ICE_AQ_RC_EBADBUF: + return "ICE_AQ_RC_EBADBUF"; + case ICE_AQ_RC_EACCES_BMCU: + return "ICE_AQ_RC_EACCES_BMCU"; + } + + return "ICE_AQ_RC_UNKNOWN"; +} + +/** + * ice_stat_str - convert status err code to a string + * @stat_err: the status error code to convert + */ +const char *ice_stat_str(enum ice_status stat_err) +{ + switch (stat_err) { + case ICE_SUCCESS: + return "OK"; + case ICE_ERR_PARAM: + return "ICE_ERR_PARAM"; + case ICE_ERR_NOT_IMPL: + return "ICE_ERR_NOT_IMPL"; + case ICE_ERR_NOT_READY: + return 
"ICE_ERR_NOT_READY"; + case ICE_ERR_NOT_SUPPORTED: + return "ICE_ERR_NOT_SUPPORTED"; + case ICE_ERR_BAD_PTR: + return "ICE_ERR_BAD_PTR"; + case ICE_ERR_INVAL_SIZE: + return "ICE_ERR_INVAL_SIZE"; + case ICE_ERR_DEVICE_NOT_SUPPORTED: + return "ICE_ERR_DEVICE_NOT_SUPPORTED"; + case ICE_ERR_RESET_FAILED: + return "ICE_ERR_RESET_FAILED"; + case ICE_ERR_FW_API_VER: + return "ICE_ERR_FW_API_VER"; + case ICE_ERR_NO_MEMORY: + return "ICE_ERR_NO_MEMORY"; + case ICE_ERR_CFG: + return "ICE_ERR_CFG"; + case ICE_ERR_OUT_OF_RANGE: + return "ICE_ERR_OUT_OF_RANGE"; + case ICE_ERR_ALREADY_EXISTS: + return "ICE_ERR_ALREADY_EXISTS"; + case ICE_ERR_NVM: + return "ICE_ERR_NVM"; + case ICE_ERR_NVM_CHECKSUM: + return "ICE_ERR_NVM_CHECKSUM"; + case ICE_ERR_BUF_TOO_SHORT: + return "ICE_ERR_BUF_TOO_SHORT"; + case ICE_ERR_NVM_BLANK_MODE: + return "ICE_ERR_NVM_BLANK_MODE"; + case ICE_ERR_IN_USE: + return "ICE_ERR_IN_USE"; + case ICE_ERR_MAX_LIMIT: + return "ICE_ERR_MAX_LIMIT"; + case ICE_ERR_RESET_ONGOING: + return "ICE_ERR_RESET_ONGOING"; + case ICE_ERR_HW_TABLE: + return "ICE_ERR_HW_TABLE"; + case ICE_ERR_DOES_NOT_EXIST: + return "ICE_ERR_DOES_NOT_EXIST"; + case ICE_ERR_FW_DDP_MISMATCH: + return "ICE_ERR_FW_DDP_MISMATCH"; + case ICE_ERR_AQ_ERROR: + return "ICE_ERR_AQ_ERROR"; + case ICE_ERR_AQ_TIMEOUT: + return "ICE_ERR_AQ_TIMEOUT"; + case ICE_ERR_AQ_FULL: + return "ICE_ERR_AQ_FULL"; + case ICE_ERR_AQ_NO_WORK: + return "ICE_ERR_AQ_NO_WORK"; + case ICE_ERR_AQ_EMPTY: + return "ICE_ERR_AQ_EMPTY"; + case ICE_ERR_AQ_FW_CRITICAL: + return "ICE_ERR_AQ_FW_CRITICAL"; + } + + return "ICE_ERR_UNKNOWN"; +} + +/** + * ice_set_rss_lut - Set RSS LUT + * @vsi: Pointer to VSI structure + * @lut: Lookup table + * @lut_size: Lookup table size + * + * Returns 0 on success, negative on failure + */ +int ice_set_rss_lut(struct ice_vsi *vsi, u8 *lut, u16 lut_size) +{ + struct ice_aq_get_set_rss_lut_params params = {}; + struct ice_hw *hw = &vsi->back->hw; + enum ice_status status; + + if (!lut) + return -EINVAL; + + params.vsi_handle = vsi->idx; + params.lut_size = lut_size; + params.lut_type = vsi->rss_lut_type; + params.lut = lut; + if (vsi->global_lut_id) + params.global_lut_id = *vsi->global_lut_id; + + status = ice_aq_set_rss_lut(hw, ¶ms); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "Cannot set RSS lut, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + return -EIO; + } + + return 0; +} + +/** + * ice_set_rss_key - Set RSS key + * @vsi: Pointer to the VSI structure + * @seed: RSS hash seed + * + * Returns 0 on success, negative on failure + */ +int ice_set_rss_key(struct ice_vsi *vsi, u8 *seed) +{ + struct ice_hw *hw = &vsi->back->hw; + enum ice_status status; + + if (!seed) + return -EINVAL; + + status = ice_aq_set_rss_key(hw, vsi->idx, (struct ice_aqc_get_set_rss_keys *)seed); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "Cannot set RSS key, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + return -EIO; + } + + return 0; +} + +/** + * ice_get_rss_lut - Get RSS LUT + * @vsi: Pointer to VSI structure + * @lut: Buffer to store the lookup table entries + * @lut_size: Size of buffer to store the lookup table entries + * + * Returns 0 on success, negative on failure + */ +int ice_get_rss_lut(struct ice_vsi *vsi, u8 *lut, u16 lut_size) +{ + struct ice_aq_get_set_rss_lut_params params = {}; + struct ice_hw *hw = &vsi->back->hw; + enum ice_status status; + + if (!lut) + return -EINVAL; + + params.vsi_handle = vsi->idx; + params.lut_size = lut_size; + 
params.lut_type = vsi->rss_lut_type; + params.lut = lut; + if (vsi->global_lut_id) + params.global_lut_id = *vsi->global_lut_id; + + status = ice_aq_get_rss_lut(hw, &params); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "Cannot get RSS lut, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + return -EIO; + } + + return 0; +} + +/** + * ice_get_rss_key - Get RSS key + * @vsi: Pointer to VSI structure + * @seed: Buffer to store the key in + * + * Returns 0 on success, negative on failure + */ +int ice_get_rss_key(struct ice_vsi *vsi, u8 *seed) +{ + struct ice_hw *hw = &vsi->back->hw; + enum ice_status status; + + if (!seed) + return -EINVAL; + + status = ice_aq_get_rss_key(hw, vsi->idx, (struct ice_aqc_get_set_rss_keys *)seed); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "Cannot get RSS key, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + return -EIO; + } + + return 0; +} + +/** + * ice_bridge_getlink - Get the hardware bridge mode + * @skb: skb buff + * @pid: process ID + * @seq: RTNL message seq + * @dev: the netdev being configured + * @filter_mask: filter mask passed in + * @nlflags: netlink flags passed in + * + * Return the bridge mode (VEB/VEPA) + */ +static int +#ifdef HAVE_NDO_DFLT_BRIDGE_GETLINK_VLAN_SUPPORT +ice_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, + struct net_device *dev, u32 filter_mask, int nlflags) +#elif defined(HAVE_NDO_BRIDGE_GETLINK_NLFLAGS) +ice_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, + struct net_device *dev, u32 __always_unused filter_mask, + int nlflags) +#else +ice_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, + struct net_device *dev, u32 __always_unused filter_mask) +#endif +{ + struct ice_netdev_priv *np = netdev_priv(dev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + u16 bmode; + + bmode = pf->first_sw->bridge_mode; + +#ifdef HAVE_NDO_DFLT_BRIDGE_GETLINK_VLAN_SUPPORT + return ndo_dflt_bridge_getlink(skb, pid, seq, dev, bmode, 0, 0, nlflags, + filter_mask, NULL); +#elif defined(HAVE_NDO_BRIDGE_GETLINK_NLFLAGS) + return ndo_dflt_bridge_getlink(skb, pid, seq, dev, bmode, 0, 0, + nlflags); +#elif defined(HAVE_NDO_FDB_ADD_VID) || defined(NDO_DFLT_BRIDGE_GETLINK_HAS_BRFLAGS) + return ndo_dflt_bridge_getlink(skb, pid, seq, dev, bmode, 0, 0); +#else + return ndo_dflt_bridge_getlink(skb, pid, seq, dev, bmode); +#endif +} + +/** + * ice_vsi_update_bridge_mode - Update VSI for switching bridge mode (VEB/VEPA) + * @vsi: Pointer to VSI structure + * @bmode: Hardware bridge mode (VEB/VEPA) + * + * Returns 0 on success, negative on failure + */ +static int ice_vsi_update_bridge_mode(struct ice_vsi *vsi, u16 bmode) +{ + struct ice_aqc_vsi_props *vsi_props; + struct ice_hw *hw = &vsi->back->hw; + struct ice_vsi_ctx *ctxt; + enum ice_status status; + int ret = 0; + + vsi_props = &vsi->info; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + ctxt->info = vsi->info; + + if (bmode == BRIDGE_MODE_VEB) + /* change from VEPA to VEB mode */ + ctxt->info.sw_flags |= (ICE_AQ_VSI_SW_FLAG_ALLOW_LB | + ICE_AQ_VSI_SW_FLAG_LOCAL_LB); + else + /* change from VEB to VEPA mode */ + ctxt->info.sw_flags &= ~(ICE_AQ_VSI_SW_FLAG_ALLOW_LB | + ICE_AQ_VSI_SW_FLAG_LOCAL_LB); + + ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SW_VALID); + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "update VSI for bridge mode failed, bmode = %d err %s aq_err %s\n", + bmode,
ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + ret = -EIO; + goto out; + } + /* Update sw flags for book keeping */ + vsi_props->sw_flags = ctxt->info.sw_flags; + +out: + kfree(ctxt); + return ret; +} + +#ifdef HAVE_NDO_BRIDGE_SETLINK_EXTACK +/** + * ice_bridge_setlink - Set the hardware bridge mode + * @dev: the netdev being configured + * @nlh: RTNL message + * @flags: bridge setlink flags + * @extack: netlink extended ack + * + * Sets the bridge mode (VEB/VEPA) of the switch to which the netdev (VSI) is + * hooked up to. Iterates through the PF VSI list and sets the loopback mode (if + * not already set for all VSIs connected to this switch. And also update the + * unicast switch filter rules for the corresponding switch of the netdev. + */ +static int +ice_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, + u16 __always_unused flags, + struct netlink_ext_ack __always_unused *extack) +#elif defined(HAVE_NDO_BRIDGE_SET_DEL_LINK_FLAGS) +static int +ice_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, + u16 __always_unused flags) +#else +static int ice_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh) +#endif { - int i, tx_err, rx_err, link_err = 0; + struct ice_netdev_priv *np = netdev_priv(dev); + struct ice_pf *pf = np->vsi->back; + struct nlattr *attr, *br_spec; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + struct ice_sw *pf_sw; + int rem, v, err = 0; - /* Caller of this function is expected to set the - * vsi->state __ICE_DOWN bit - */ - if (vsi->netdev) { - netif_carrier_off(vsi->netdev); - netif_tx_disable(vsi->netdev); + pf_sw = pf->first_sw; + /* find the attribute in the netlink message */ + br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); + + nla_for_each_nested(attr, br_spec, rem) { + __u16 mode; + + if (nla_type(attr) != IFLA_BRIDGE_MODE) + continue; + mode = nla_get_u16(attr); + if (mode != BRIDGE_MODE_VEPA && mode != BRIDGE_MODE_VEB) + return -EINVAL; + /* Continue if bridge mode is not being flipped */ + if (mode == pf_sw->bridge_mode) + continue; + /* Iterates through the PF VSI list and update the loopback + * mode of the VSI + */ + ice_for_each_vsi(pf, v) { + if (!pf->vsi[v]) + continue; + err = ice_vsi_update_bridge_mode(pf->vsi[v], mode); + if (err) + return err; + } + + hw->evb_veb = (mode == BRIDGE_MODE_VEB); + /* Update the unicast switch filter rules for the corresponding + * switch of the netdev + */ + status = ice_update_sw_rule_bridge_mode(hw); + if (status) { + netdev_err(dev, "switch rule update failed, mode = %d err %s aq_err %s\n", + mode, ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + /* revert hw->evb_veb */ + hw->evb_veb = (pf_sw->bridge_mode == BRIDGE_MODE_VEB); + return -EIO; + } + + pf_sw->bridge_mode = mode; } - ice_vsi_dis_irq(vsi); + return 0; +} - tx_err = ice_vsi_stop_lan_tx_rings(vsi, ICE_NO_RESET, 0); - if (tx_err) - netdev_err(vsi->netdev, - "Failed stop Tx rings, VSI %d error %d\n", - vsi->vsi_num, tx_err); +#ifdef HAVE_TX_TIMEOUT_TXQUEUE +/** + * ice_tx_timeout - Respond to a Tx Hang + * @netdev: network interface device structure + * @txqueue: Tx queue + */ +static void ice_tx_timeout(struct net_device *netdev, unsigned int txqueue) +#else +static void ice_tx_timeout(struct net_device *netdev) +#endif +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_ring *tx_ring = NULL; + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; +#ifndef HAVE_TX_TIMEOUT_TXQUEUE + unsigned int txqueue = 0; +#endif /* 
!HAVE_TX_TIMEOUT_TXQUEUE */ + unsigned int i; - rx_err = ice_vsi_stop_rx_rings(vsi); - if (rx_err) - netdev_err(vsi->netdev, - "Failed stop Rx rings, VSI %d error %d\n", - vsi->vsi_num, rx_err); + pf->tx_timeout_count++; - ice_napi_disable_all(vsi); +#ifndef HAVE_TX_TIMEOUT_TXQUEUE + /* find the stopped queue the same way dev_watchdog() does */ + for (i = 0; i < netdev->num_tx_queues; i++) { + unsigned long trans_start; + struct netdev_queue *q; - if (test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, vsi->back->flags)) { - link_err = ice_force_phys_link_state(vsi, false); - if (link_err) - netdev_err(vsi->netdev, - "Failed to set physical link down, VSI %d error %d\n", - vsi->vsi_num, link_err); + q = netdev_get_tx_queue(netdev, i); + trans_start = q->trans_start; + if (netif_xmit_stopped(q) && + time_after(jiffies, + trans_start + netdev->watchdog_timeo)) { + txqueue = i; + break; + } } - ice_for_each_txq(vsi, i) - ice_clean_tx_ring(vsi->tx_rings[i]); + if (i == netdev->num_tx_queues) { + netdev_info(netdev, "tx_timeout: no netdev hung queue found\n"); + return; + } +#endif /* !HAVE_TX_TIMEOUT_TXQUEUE */ - ice_for_each_rxq(vsi, i) - ice_clean_rx_ring(vsi->rx_rings[i]); + /* Check if PFC is enabled for the TC to which the queue belongs + * to. If yes then Tx timeout is not caused by a hung queue, no + * need to reset and rebuild + */ + if (ice_is_pfc_causing_hung_q(pf, txqueue)) { + dev_info(ice_pf_to_dev(pf), "Fake Tx hang detected on queue %u, timeout caused by PFC storm\n", + txqueue); + return; + } - if (tx_err || rx_err || link_err) { - netdev_err(vsi->netdev, - "Failed to close VSI 0x%04X on switch 0x%04X\n", - vsi->vsi_num, vsi->vsw->sw_id); - return -EIO; + /* now that we have an index, find the tx_ring struct */ + for (i = 0; i < vsi->num_txq; i++) + if (vsi->tx_rings[i] && vsi->tx_rings[i]->desc) + if (txqueue == vsi->tx_rings[i]->q_index) { + tx_ring = vsi->tx_rings[i]; + break; + } + + /* Reset recovery level if enough time has elapsed after last timeout. + * Also ensure no new reset action happens before next timeout period. 
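+ * Recovery escalates one reset level per timeout: PFR first, then CORER, + * then GLOBR; beyond that the device is marked unrecoverable and the + * service task is disabled.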
+ */ + if (time_after(jiffies, (pf->tx_timeout_last_recovery + HZ * 20))) + pf->tx_timeout_recovery_level = 1; + else if (time_before(jiffies, (pf->tx_timeout_last_recovery + + netdev->watchdog_timeo))) + return; + + if (tx_ring) { + struct ice_hw *hw = &pf->hw; + u32 head, val = 0; + + head = (rd32(hw, QTX_COMM_HEAD(vsi->txq_map[txqueue])) & + QTX_COMM_HEAD_HEAD_M) >> QTX_COMM_HEAD_HEAD_S; + /* Read interrupt register */ + val = rd32(hw, GLINT_DYN_CTL(tx_ring->q_vector->reg_idx)); + + netdev_info(netdev, "tx_timeout: VSI_num: %d, Q %u, NTC: 0x%x, HW_HEAD: 0x%x, NTU: 0x%x, INT: 0x%x\n", + vsi->vsi_num, tx_ring->q_index, + tx_ring->next_to_clean, head, tx_ring->next_to_use, + val); } - return 0; + pf->tx_timeout_last_recovery = jiffies; + netdev_info(netdev, "tx_timeout recovery level %d, txqueue %u\n", + pf->tx_timeout_recovery_level, txqueue); + + switch (pf->tx_timeout_recovery_level) { + case 1: + set_bit(ICE_PFR_REQ, pf->state); + break; + case 2: + set_bit(ICE_CORER_REQ, pf->state); + break; + case 3: + set_bit(ICE_GLOBR_REQ, pf->state); + break; + default: + netdev_err(netdev, "tx_timeout recovery unsuccessful, device is in unrecoverable state.\n"); + set_bit(ICE_DOWN, pf->state); + set_bit(ICE_VSI_NEEDS_RESTART, vsi->state); + set_bit(ICE_SERVICE_DIS, pf->state); + break; + } + + ice_service_task_schedule(pf); + pf->tx_timeout_recovery_level++; } /** - * ice_vsi_setup_tx_rings - Allocate VSI Tx queue resources - * @vsi: VSI having resources allocated - * - * Return 0 on success, negative on failure + * ice_udp_tunnel_add - Get notifications about UDP tunnel ports that come up + * @netdev: This physical port's netdev + * @ti: Tunnel endpoint information */ -int ice_vsi_setup_tx_rings(struct ice_vsi *vsi) +static void __maybe_unused +ice_udp_tunnel_add(struct net_device *netdev, struct udp_tunnel_info *ti) { - int i, err = 0; + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_tnl_entry *tnl_entry; + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + enum ice_tunnel_type tnl_type; + u16 port = ntohs(ti->port); + enum ice_status status; - if (!vsi->num_txq) { - dev_err(&vsi->back->pdev->dev, "VSI %d has 0 Tx queues\n", - vsi->vsi_num); - return -EINVAL; + if (ice_dcf_is_udp_tunnel_capable(&pf->hw)) { + netdev_info(netdev, "Cannot config tunnel, the capability is used by DCF\n"); + return; } - ice_for_each_txq(vsi, i) { - struct ice_ring *ring = vsi->tx_rings[i]; + switch (ti->type) { + case UDP_TUNNEL_TYPE_VXLAN: + tnl_type = TNL_VXLAN; + break; + case UDP_TUNNEL_TYPE_GENEVE: + tnl_type = TNL_GENEVE; + break; + default: + netdev_err(netdev, "Unknown tunnel type\n"); + return; + } - if (!ring) - return -EINVAL; + status = ice_is_create_tunnel_possible(&pf->hw, tnl_type, port); + if (status == ICE_ERR_OUT_OF_RANGE) { + netdev_info(netdev, "Max tunneled UDP ports reached, port %d not added\n", + port); + return; + } - ring->netdev = vsi->netdev; - err = ice_setup_tx_ring(ring); - if (err) - break; + spin_lock(&pf->tnl_lock); + tnl_entry = ice_find_tnl(pf, tnl_type, port); + + if (tnl_entry) { + tnl_entry->ref_cnt++; + /* if the entry is scheduled for deletion, cancel this */ + tnl_entry->state &= ~ICE_TNL_STATE_TO_DEL; + } else { + tnl_entry = devm_kzalloc(ice_pf_to_dev(pf), + sizeof(*tnl_entry), GFP_ATOMIC); + if (!tnl_entry) { + spin_unlock(&pf->tnl_lock); + return; + } + tnl_entry->type = tnl_type; + tnl_entry->port = port; + tnl_entry->state = ICE_TNL_STATE_TO_ADD; + tnl_entry->ref_cnt = 1; + INIT_LIST_HEAD(&tnl_entry->node); + 
list_add_tail(&tnl_entry->node, &pf->tnl_list); } + spin_unlock(&pf->tnl_lock); - return err; + /* kick the service_task so that it can create the tunnel */ + ice_service_task_schedule(vsi->back); } /** - * ice_vsi_setup_rx_rings - Allocate VSI Rx queue resources - * @vsi: VSI having resources allocated - * - * Return 0 on success, negative on failure + * ice_udp_tunnel_del - Get notifications about UDP tunnel ports that go away + * @netdev: This physical port's netdev + * @ti: Tunnel endpoint information */ -int ice_vsi_setup_rx_rings(struct ice_vsi *vsi) +static void __maybe_unused +ice_udp_tunnel_del(struct net_device *netdev, struct udp_tunnel_info *ti) { - int i, err = 0; + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + enum ice_tunnel_type tnl_type; + struct ice_tnl_entry *entry; + u16 port = ntohs(ti->port); - if (!vsi->num_rxq) { - dev_err(&vsi->back->pdev->dev, "VSI %d has 0 Rx queues\n", - vsi->vsi_num); - return -EINVAL; + if (ice_dcf_is_udp_tunnel_capable(&pf->hw)) { + netdev_info(netdev, "Cannot config tunnel, the capability is used by DCF\n"); + return; } - ice_for_each_rxq(vsi, i) { - struct ice_ring *ring = vsi->rx_rings[i]; - - if (!ring) - return -EINVAL; + switch (ti->type) { + case UDP_TUNNEL_TYPE_VXLAN: + tnl_type = TNL_VXLAN; + break; + case UDP_TUNNEL_TYPE_GENEVE: + tnl_type = TNL_GENEVE; + break; + default: + netdev_err(netdev, "Unknown tunnel type\n"); + return; + } - ring->netdev = vsi->netdev; - err = ice_setup_rx_ring(ring); - if (err) - break; + spin_lock(&pf->tnl_lock); + entry = ice_find_tnl(pf, tnl_type, port); + if (entry) { + if (entry->ref_cnt > 1) { + entry->ref_cnt--; + } else if (entry->state & ICE_TNL_STATE_ACTIVE) { + entry->ref_cnt = 0; + entry->state |= ICE_TNL_STATE_TO_DEL; + } else { + list_del(&entry->node); + devm_kfree(ice_pf_to_dev(pf), entry); + } + } else { + netdev_err(netdev, "Unable to find Tunnel, port %u, tnl_type %u\n", + port, tnl_type); } + spin_unlock(&pf->tnl_lock); - return err; + /* kick the service_task so that it can destroy the tunnel */ + ice_service_task_schedule(vsi->back); } +#if defined(HAVE_VXLAN_RX_OFFLOAD) && !defined(HAVE_UDP_ENC_RX_OFFLOAD) +#if IS_ENABLED(CONFIG_VXLAN) /** - * ice_vsi_open - Called when a network interface is made active - * @vsi: the VSI to open - * - * Initialization of the VSI - * - * Returns 0 on success, negative value on error + * ice_add_vxlan_port - Get notifications about VxLAN ports that come up + * @netdev: This physical port's netdev + * @sa_family: Socket Family that VxLAN is notifying us about + * @port: New UDP port number that VxLAN started listening to */ -static int ice_vsi_open(struct ice_vsi *vsi) +static void +ice_add_vxlan_port(struct net_device *netdev, sa_family_t sa_family, + __be16 port) { - char int_name[ICE_INT_NAME_STR_LEN]; - struct ice_pf *pf = vsi->back; - int err; + struct udp_tunnel_info ti = { + .type = UDP_TUNNEL_TYPE_VXLAN, + .sa_family = sa_family, + .port = port, + }; - /* allocate descriptors */ - err = ice_vsi_setup_tx_rings(vsi); - if (err) - goto err_setup_tx; + ice_udp_tunnel_add(netdev, &ti); +} - err = ice_vsi_setup_rx_rings(vsi); - if (err) - goto err_setup_rx; +/** + * ice_del_vxlan_port - Get notifications about VxLAN ports that go away + * @netdev: This physical port's netdev + * @sa_family: Socket Family that VxLAN is notifying us about + * @port: UDP port number that VxLAN stopped listening to + */ +static void +ice_del_vxlan_port(struct net_device *netdev, sa_family_t 
sa_family, + __be16 port) +{ + struct udp_tunnel_info ti = { + .type = UDP_TUNNEL_TYPE_VXLAN, + .sa_family = sa_family, + .port = port, + }; - err = ice_vsi_cfg(vsi); - if (err) - goto err_setup_rx; + ice_udp_tunnel_del(netdev, &ti); +} +#endif /* CONFIG_VXLAN */ +#endif /* HAVE_VXLAN_RX_OFFLOAD && !HAVE_UDP_ENC_RX_OFFLOAD */ - snprintf(int_name, sizeof(int_name) - 1, "%s-%s", - dev_driver_string(&pf->pdev->dev), vsi->netdev->name); - err = ice_vsi_req_irq_msix(vsi, int_name); - if (err) - goto err_setup_rx; +#if defined(HAVE_GENEVE_RX_OFFLOAD) && !defined(HAVE_UDP_ENC_RX_OFFLOAD) +#if IS_ENABLED(CONFIG_GENEVE) +/** + * ice_add_geneve_port - Get notifications about GENEVE ports that come up + * @netdev: This physical port's netdev + * @sa_family: Socket Family that GENEVE is notifying us about + * @port: New UDP port number that GENEVE started listening to + */ +static void +ice_add_geneve_port(struct net_device *netdev, sa_family_t sa_family, + __be16 port) +{ + struct udp_tunnel_info ti = { + .type = UDP_TUNNEL_TYPE_GENEVE, + .sa_family = sa_family, + .port = port, + }; + + ice_udp_tunnel_add(netdev, &ti); +} + +/** + * ice_del_geneve_port - Get notifications about GENEVE ports that go away + * @netdev: This physical port's netdev + * @sa_family: Socket Family that GENEVE is notifying us about + * @port: UDP port number that GENEVE stopped listening to + */ +static void +ice_del_geneve_port(struct net_device *netdev, sa_family_t sa_family, + __be16 port) +{ + struct udp_tunnel_info ti = { + .type = UDP_TUNNEL_TYPE_GENEVE, + .sa_family = sa_family, + .port = port, + }; - /* Notify the stack of the actual queue counts. */ - err = netif_set_real_num_tx_queues(vsi->netdev, vsi->num_txq); - if (err) - goto err_set_qs; + ice_udp_tunnel_del(netdev, &ti); +} - err = netif_set_real_num_rx_queues(vsi->netdev, vsi->num_rxq); - if (err) - goto err_set_qs; +#endif /* CONFIG_GENEVE */ +#endif /* HAVE_GENEVE_RX_OFFLOAD && !HAVE_UDP_ENC_RX_OFFLOAD */ - err = ice_up_complete(vsi); - if (err) - goto err_up_complete; +#ifdef HAVE_TC_SETUP_CLSFLOWER +/** + * ice_setup_tc_cls_flower - flower classifier offloads + * @np: net device to configure + * @filter_dev: device on which filter is added + * @cls_flower: offload data + */ +#ifdef HAVE_TC_INDIR_BLOCK +static int +ice_setup_tc_cls_flower(struct ice_netdev_priv *np, + struct net_device *filter_dev, + struct flow_cls_offload *cls_flower) +#else +static int +ice_setup_tc_cls_flower(struct ice_netdev_priv *np, + struct net_device __always_unused *filter_dev, + struct tc_cls_flower_offload *cls_flower) +#endif /* HAVE_TC_INDIR_BLOCK */ +{ + struct ice_vsi *vsi = np->vsi; - return 0; +#ifdef HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV + if (cls_flower->common.chain_index) + return -EOPNOTSUPP; +#endif /* HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV */ -err_up_complete: - ice_down(vsi); -err_set_qs: - ice_vsi_free_irq(vsi); -err_setup_rx: - ice_vsi_free_rx_rings(vsi); -err_setup_tx: - ice_vsi_free_tx_rings(vsi); + if (ice_is_dcf_enabled(vsi->back)) + return -EOPNOTSUPP; - return err; + switch (cls_flower->command) { + case FLOW_CLS_REPLACE: + return ice_add_cls_flower(filter_dev, vsi, cls_flower); + case FLOW_CLS_DESTROY: + return ice_del_cls_flower(vsi, cls_flower); + default: + return -EINVAL; + } } +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO /** - * ice_vsi_release_all - Delete all VSIs - * @pf: PF from which all VSIs are being removed + * ice_setup_tc_block_cb - callback handler registered for TC block + * @type: TC SETUP type + * @type_data: TC flower offload data that 
contains user input + * @cb_priv: netdev private data */ -static void ice_vsi_release_all(struct ice_pf *pf) +static int +ice_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) { - int err, i; - - if (!pf->vsi) - return; - - ice_for_each_vsi(pf, i) { - if (!pf->vsi[i]) - continue; + struct ice_netdev_priv *np = (struct ice_netdev_priv *)cb_priv; - err = ice_vsi_release(pf->vsi[i]); - if (err) - dev_dbg(&pf->pdev->dev, - "Failed to release pf->vsi[%d], err %d, vsi_num = %d\n", - i, err, pf->vsi[i]->vsi_num); + switch (type) { + case TC_SETUP_CLSFLOWER: + return ice_setup_tc_cls_flower(np, np->vsi->netdev, + (struct flow_cls_offload *) + type_data); + default: + return -EOPNOTSUPP; } } /** - * ice_ena_vsi - resume a VSI - * @vsi: the VSI being resume - * @locked: is the rtnl_lock already held + * ice_validate_mqprio_qopt - Validate TCF input parameters + * @vsi: Pointer to VSI + * @mqprio_qopt: input parameters for mqprio queue configuration + * + * This function validates MQPRIO params, such as qcount (power of 2 wherever + * needed), and make sure user doesn't specify qcount and BW rate limit + * for TCs, which are more than "num_tc" */ -static int ice_ena_vsi(struct ice_vsi *vsi, bool locked) +static int +ice_validate_mqprio_qopt(struct ice_vsi *vsi, + struct tc_mqprio_qopt_offload *mqprio_qopt) { - int err = 0; + u64 sum_max_rate = 0, sum_min_rate = 0; + int non_power_of_2_qcount = 0; + struct ice_pf *pf = vsi->back; + int max_rss_q_cnt = 0; + struct device *dev; + int i, speed; + u8 num_tc; - if (!test_bit(__ICE_NEEDS_RESTART, vsi->state)) - return 0; + if (vsi->type != ICE_VSI_PF) + return -EINVAL; + + if (mqprio_qopt->qopt.offset[0] != 0 || + mqprio_qopt->qopt.num_tc < 1 || + mqprio_qopt->qopt.num_tc > ICE_CHNL_MAX_TC) + return -EINVAL; - clear_bit(__ICE_NEEDS_RESTART, vsi->state); + dev = ice_pf_to_dev(pf); + vsi->ch_rss_size = 0; + num_tc = mqprio_qopt->qopt.num_tc; - if (vsi->netdev && vsi->type == ICE_VSI_PF) { - if (netif_running(vsi->netdev)) { - if (!locked) - rtnl_lock(); + for (i = 0; num_tc; i++) { + int qcount = mqprio_qopt->qopt.count[i]; + u64 max_rate, min_rate; + + max_rate = mqprio_qopt->max_rate[i]; + + if (!qcount) + return -EINVAL; + + if (!i && !is_power_of_2(qcount)) { + dev_err(dev, "TC0:qcount[%d] must be a power of 2\n", + qcount); + return -EINVAL; + } else if (non_power_of_2_qcount) { + if (qcount > non_power_of_2_qcount) { + dev_err(dev, "TC%d:qcount[%d] > non_power_of_2_qcount [%d]\n", + i, qcount, non_power_of_2_qcount); + return -EINVAL; + } else if (qcount < non_power_of_2_qcount) { + /* it must be power of 2, otherwise fail */ + if (!is_power_of_2(qcount)) { + dev_err(dev, "qcount must be a power of 2, TC%d: qcnt[%d] < non_power_of_2_qcount [%d]\n", + i, qcount, + non_power_of_2_qcount); + return -EINVAL; + } + } + } else if (!is_power_of_2(qcount)) { + /* after tc0, next TCs qcount can be non-power of 2, + * if so, set channel RSS size to be the count of that + * TC + */ + non_power_of_2_qcount = qcount; + max_rss_q_cnt = qcount; + dev_dbg(dev, "TC%d:count[%d] non power of 2\n", i, + qcount); + } + + /* figure out max_rss_q_cnt based on TC's qcount */ + if (max_rss_q_cnt) { + if (qcount > max_rss_q_cnt) + max_rss_q_cnt = qcount; + } else { + max_rss_q_cnt = qcount; + } - err = ice_open(vsi->netdev); + /* Convert input bandwidth from Bytes/s to Kbps */ + /* TC tool converts the bandwidth rate limit into Bytes/s when + * passing it down to the driver whereas the TC command can + * take bandwidth inputs in Kbps, Mbps or Gbps + */ + 
do_div(max_rate, ICE_BW_KBPS_DIVISOR); + sum_max_rate += max_rate; + + /* min_rate is minimum guaranteed rate and it can't be zero */ + min_rate = mqprio_qopt->min_rate[i]; + do_div(min_rate, ICE_BW_KBPS_DIVISOR); + if (min_rate && min_rate < ICE_MIN_BW_LIMIT) { + dev_err(dev, "TC%d: min_rate(%llu Kbps) < %u Kbps\n", i, + min_rate, ICE_MIN_BW_LIMIT); + return -EINVAL; + } + if (min_rate % ICE_MIN_BW_LIMIT != 0) { + dev_err(dev, "TC%d: Min Rate not in increment of %u Kbps", + i, ICE_MIN_BW_LIMIT); + return -EINVAL; + } + if (max_rate % ICE_MIN_BW_LIMIT != 0) { + dev_err(dev, "TC%d: Max Rate not in increment of %u Kbps", + i, ICE_MIN_BW_LIMIT); + return -EINVAL; + } + sum_min_rate += min_rate; - if (!locked) - rtnl_unlock(); + /* min_rate can't be more than max_rate, except when max_rate + * is zero (which is valid and it means bandwidth is sought for + * max line rate). In such a case min_rate can be more than max. + */ + if (max_rate && min_rate > max_rate) { + dev_err(dev, "min_rate %llu Kbps can't be more than max_rate %llu Kbps\n", + min_rate, max_rate); + return -EINVAL; } + + if (i >= mqprio_qopt->qopt.num_tc - 1) + break; + if (mqprio_qopt->qopt.offset[i + 1] != + (mqprio_qopt->qopt.offset[i] + qcount)) + return -EINVAL; + } + if (vsi->num_rxq < + (mqprio_qopt->qopt.offset[i] + mqprio_qopt->qopt.count[i])) + return -EINVAL; + if (vsi->num_txq < + (mqprio_qopt->qopt.offset[i] + mqprio_qopt->qopt.count[i])) + return -EINVAL; + + speed = ice_get_link_speed_kbps(vsi); + if (sum_min_rate && sum_min_rate > (u64)speed) { + dev_err(dev, "Invalid min Tx rate(%llu) Kbps > speed (%u) Kbps specified\n", + sum_min_rate, speed); + return -EINVAL; + } + if (sum_max_rate && sum_max_rate > (u64)speed) { + dev_err(dev, "Invalid max Tx rate(%llu) Kbps > speed(%u) Kbps specified\n", + sum_max_rate, speed); + return -EINVAL; } - return err; + /* make sure vsi->ch_rss_size is set correctly based on TC's qcount */ + vsi->ch_rss_size = max_rss_q_cnt; + + return 0; } /** - * ice_pf_ena_all_vsi - Resume all VSIs on a PF - * @pf: the PF - * @locked: is the rtnl_lock already held + * ice_add_vsi_to_fdir - add a VSI to the flow director group for PF + * @pf: ptr to PF device + * @vsi: ptr to VSI */ -#ifdef CONFIG_DCB -int ice_pf_ena_all_vsi(struct ice_pf *pf, bool locked) +static int ice_add_vsi_to_fdir(struct ice_pf *pf, struct ice_vsi *vsi) { - int v; + struct device *dev = ice_pf_to_dev(pf); + bool added = false; + struct ice_hw *hw; + int flow; - ice_for_each_vsi(pf, v) - if (pf->vsi[v]) - if (ice_ena_vsi(pf->vsi[v], locked)) - return -EIO; + if (!(vsi->num_gfltr || vsi->num_bfltr)) + return -EINVAL; + + hw = &pf->hw; + for (flow = 0; flow < ICE_FLTR_PTYPE_MAX; flow++) { + enum ice_block blk = ICE_BLK_FD; + struct ice_fd_hw_prof *prof; + enum ice_status status; + u64 entry_h; + int tun; + + if (!(hw->fdir_prof && hw->fdir_prof[flow] && + hw->fdir_prof[flow]->cnt)) + continue; + + for (tun = 0; tun < ICE_FD_HW_SEG_MAX; tun++) { + enum ice_flow_priority prio; + u64 prof_id; + + /* add this VSI to FDir profile for this flow */ + prio = ICE_FLOW_PRIO_NORMAL; + prof = hw->fdir_prof[flow]; + prof_id = flow + tun * ICE_FLTR_PTYPE_MAX; + status = ice_flow_add_entry(hw, blk, prof_id, + prof->vsi_h[0], vsi->idx, + prio, prof->fdir_seg[tun], + NULL, 0, &entry_h); + if (status) { + dev_err(dev, "channel VSI idx %d, not able to add to group %d\n", + vsi->idx, flow); + continue; + } + + prof->entry_h[prof->cnt][tun] = entry_h; + } + /* store VSI for filter replay and delete */ + prof->vsi_h[prof->cnt] = vsi->idx; + 
prof->cnt++; + + /* loop bookkeeping */ + added = true; + dev_dbg(dev, "VSI idx %d added to fdir group %d\n", vsi->idx, + flow); + } + + if (!added) + dev_dbg(dev, "VSI idx %d not added to fdir groups\n", vsi->idx); + else + set_bit(ICE_CHNL_FEATURE_FD_ENA, vsi->features); return 0; } -#endif /* CONFIG_DCB */ /** - * ice_vsi_rebuild_by_type - Rebuild VSI of a given type - * @pf: pointer to the PF instance - * @type: VSI type to rebuild + * ice_add_channel - add a channel by adding VSI + * @pf: ptr to PF device + * @sw_id: underlying HW switching element ID + * @ch: ptr to channel structure * - * Iterates through the pf->vsi array and rebuilds VSIs of the requested type + * Add a channel (VSI) using add_vsi and queue_map */ -static int ice_vsi_rebuild_by_type(struct ice_pf *pf, enum ice_vsi_type type) +static int ice_add_channel(struct ice_pf *pf, u16 sw_id, struct ice_channel *ch) { - enum ice_status status; - int i, err; + struct device *dev = ice_pf_to_dev(pf); + struct ice_vsi *vsi; - ice_for_each_vsi(pf, i) { - struct ice_vsi *vsi = pf->vsi[i]; + if (ch->type != ICE_VSI_CHNL) { + dev_err(dev, "add new VSI failed, ch->type %d\n", ch->type); + return -EINVAL; + } - if (!vsi || vsi->type != type) - continue; + vsi = ice_chnl_vsi_setup(pf, pf->hw.port_info, ch); + if (!vsi || vsi->type != ICE_VSI_CHNL) { + dev_err(dev, "create chnl VSI failure\n"); + return -EINVAL; + } - /* rebuild the VSI */ - err = ice_vsi_rebuild(vsi); - if (err) { - dev_err(&pf->pdev->dev, - "rebuild VSI failed, err %d, VSI index %d, type %d\n", - err, vsi->idx, type); - return err; - } + /* set/clear VSI level feature flag for packet based optimization + * (this is related to SW triggred interrupt from napi_poll - which is + * generally based off data packets or not) + */ + if (test_bit(ICE_FLAG_CHNL_PKT_INSPECT_OPT_ENA, pf->flags)) + set_bit(ICE_CHNL_FEATURE_PKT_INSPECT_OPT_ENA, vsi->features); + else + clear_bit(ICE_CHNL_FEATURE_PKT_INSPECT_OPT_ENA, vsi->features); - /* replay filters for the VSI */ - status = ice_replay_vsi(&pf->hw, vsi->idx); - if (status) { - dev_err(&pf->pdev->dev, - "replay VSI failed, status %d, VSI index %d, type %d\n", - status, vsi->idx, type); - return -EIO; - } + /* set/clear VSI level feature flag for ADQ (aka channel) VSIs + * based on PF level private flags: this flag meant to harvest + * clean of Rx queue upon busy_poll stop and after that clean + * once only. 
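+ * The per-VSI feature bit simply mirrors the PF-level private flag + * ICE_FLAG_CHNL_PKT_CLEAN_BP_STOP_ENA at channel creation time.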
+ */ + if (test_bit(ICE_FLAG_CHNL_PKT_CLEAN_BP_STOP_ENA, pf->flags)) + set_bit(ICE_CHNL_FEATURE_PKT_CLEAN_BP_STOP_ENA, + vsi->features); + else + clear_bit(ICE_CHNL_FEATURE_PKT_CLEAN_BP_STOP_ENA, + vsi->features); - /* Re-map HW VSI number, using VSI handle that has been - * previously validated in ice_replay_vsi() call above - */ - vsi->vsi_num = ice_get_hw_vsi_num(&pf->hw, vsi->idx); + /* set/clear inline flow-director bits for newly created VSI based + * on PF level private flags + */ + if (test_bit(ICE_FLAG_CHNL_INLINE_FD_ENA, pf->flags)) + set_bit(ICE_CHNL_FEATURE_INLINE_FD_ENA, vsi->features); + else + clear_bit(ICE_CHNL_FEATURE_INLINE_FD_ENA, vsi->features); - /* enable the VSI */ - err = ice_ena_vsi(vsi, false); - if (err) { - dev_err(&pf->pdev->dev, - "enable VSI failed, err %d, VSI index %d, type %d\n", - err, vsi->idx, type); - return err; - } + if (test_bit(ICE_FLAG_CHNL_INLINE_FD_MARK_ENA, pf->flags)) + set_bit(ICE_CHNL_FEATURE_INLINE_FD_MARK_ENA, vsi->features); + else + clear_bit(ICE_CHNL_FEATURE_INLINE_FD_MARK_ENA, vsi->features); + + /* if VSI has some FD resources reserved (either from guaranteed or + * best-effort quota), add VSI into VSI group which has FD + * input set defined so that, newly created VSI can use FD + * resources (side-band flow director type filter and/or + * inline flow-director type of filters which are typically + * setup during normal transmit path if packet being transmitted + * has SYN, SYN+ACK, RST, FIN flags set) + */ + clear_bit(ICE_CHNL_FEATURE_FD_ENA, vsi->features); - dev_info(&pf->pdev->dev, "VSI rebuilt. VSI index %d, type %d\n", - vsi->idx, type); - } + ice_add_vsi_to_fdir(pf, vsi); + + ch->sw_id = sw_id; + ch->vsi_num = vsi->vsi_num; + ch->info.mapping_flags = vsi->info.mapping_flags; + ch->ch_vsi = vsi; + /* initialize filter type to be INVALID */ + ch->fltr_type = ICE_CHNL_FLTR_TYPE_INVALID; + /* set the back pointer of channel for newly created VSI */ + vsi->ch = ch; + + memcpy(&ch->info.q_mapping, &vsi->info.q_mapping, + sizeof(vsi->info.q_mapping)); + memcpy(&ch->info.tc_mapping, vsi->info.tc_mapping, + sizeof(vsi->info.tc_mapping)); return 0; } /** - * ice_update_pf_netdev_link - Update PF netdev link status - * @pf: pointer to the PF instance + * ice_chnl_cfg_res + * @vsi: the VSI being setup + * @ch: ptr to channel structure + * + * Configure channel specific resources such as rings, vector. 
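+ * Each Tx/Rx ring in the channel's queue range gets its channel + * back-pointer set, and a static ITR value is written when DIM is + * not in use for the corresponding vector.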
*/ -static void ice_update_pf_netdev_link(struct ice_pf *pf) +static void ice_chnl_cfg_res(struct ice_vsi *vsi, struct ice_channel *ch) { - bool link_up; int i; - ice_for_each_vsi(pf, i) { - struct ice_vsi *vsi = pf->vsi[i]; - if (!vsi || vsi->type != ICE_VSI_PF) - return; + for (i = 0; i < ch->num_txq; i++) { + struct ice_q_vector *tx_q_vector, *rx_q_vector; + struct ice_ring *tx_ring, *rx_ring; + struct ice_ring_container *rc; - ice_get_link_status(pf->vsi[i]->port_info, &link_up); - if (link_up) { - netif_carrier_on(pf->vsi[i]->netdev); - netif_tx_wake_all_queues(pf->vsi[i]->netdev); - } else { - netif_carrier_off(pf->vsi[i]->netdev); - netif_tx_stop_all_queues(pf->vsi[i]->netdev); + tx_ring = vsi->tx_rings[ch->base_q + i]; + rx_ring = vsi->rx_rings[ch->base_q + i]; + if (!tx_ring || !rx_ring) + continue; + + /* setup ring being channel enabled */ + tx_ring->ch = ch; + rx_ring->ch = ch; + + tx_ring->ch_inline_fd_cnt_index = ch->fd_cnt_index; + + /* following code block sets up vector specific attributes */ + tx_q_vector = tx_ring->q_vector; + rx_q_vector = rx_ring->q_vector; + if (!tx_q_vector && !rx_q_vector) + continue; + + if (tx_q_vector) { + tx_q_vector->ch = ch; + tx_q_vector->state_flags = 0; + tx_q_vector->max_limit_process_rx_queues = + ICE_MAX_LIMIT_PROCESS_RX_PKTS_DFLT; + /* setup Tx and Rx ITR setting if DIM is off */ + rc = &tx_q_vector->tx; + if (!ITR_IS_DYNAMIC(rc)) + ice_write_itr(rc, rc->itr_setting); + } + if (rx_q_vector) { + rx_q_vector->ch = ch; + rx_q_vector->state_flags = 0; + rx_q_vector->max_limit_process_rx_queues = + ICE_MAX_LIMIT_PROCESS_RX_PKTS_DFLT; + /* setup Tx and Rx ITR setting if DIM is off */ + rc = &rx_q_vector->rx; + if (!ITR_IS_DYNAMIC(rc)) + ice_write_itr(rc, rc->itr_setting); } } + + /* it is safe to assume that, if channel has non-zero num_t[r]xq, then + * GLINT_ITR register would have written to perform in-context + * update, hence perform flush + */ + if (ch->num_txq || ch->num_rxq) + ice_flush(&vsi->back->hw); } /** - * ice_rebuild - rebuild after reset - * @pf: PF to rebuild - * @reset_type: type of reset + * ice_cfg_chnl_all_res - configure channel resources + * @vsi: pte to main_vsi + * @ch: ptr to channel structure + * + * This function configures channel specific resources such as flow-director + * counter index, and other resources such as queues, vectors, ITR settings */ -static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) +static void +ice_cfg_chnl_all_res(struct ice_vsi *vsi, struct ice_channel *ch) { - struct device *dev = &pf->pdev->dev; - struct ice_hw *hw = &pf->hw; - enum ice_status ret; - int err; - - if (test_bit(__ICE_DOWN, pf->state)) - goto clear_recovery; - - dev_dbg(dev, "rebuilding PF after reset_type=%d\n", reset_type); + struct ice_pf *pf = vsi->back; - ret = ice_init_all_ctrlq(hw); - if (ret) { - dev_err(dev, "control queues init failed %d\n", ret); - goto err_init_ctrlq; - } + /* setup inline-FD counter index per channel, eventually + * used separate counter index per channel, to offer + * better granularity and QoS per channel for RSS and FD + */ + ch->fd_cnt_index = ICE_FD_CH_STAT_IDX(pf->hw.fd_ctr_base); + /* reset source for all counters is CORER, typically upon + * driver load, those counters may have stale value, hence + * initialize counter to zero, access type for counters is RWC + */ + ice_clear_cntr(pf, ch->fd_cnt_index); - /* if DDP was previously loaded successfully */ - if (!ice_is_safe_mode(pf)) { - /* reload the SW DB of filter tables */ - if (reset_type == ICE_RESET_PFR) - 
ice_fill_blk_tbls(hw); - else - /* Reload DDP Package after CORER/GLOBR reset */ - ice_load_pkg(NULL, pf); - } + /* configure channel (aka ADQ) resources such as queues, vectors, + * ITR settings for channel specific vectors and anything else + */ + ice_chnl_cfg_res(vsi, ch); +} - ret = ice_clear_pf_cfg(hw); - if (ret) { - dev_err(dev, "clear PF configuration failed %d\n", ret); - goto err_init_ctrlq; - } +/** + * ice_setup_hw_channel - setup new channel + * @pf: ptr to PF device + * @vsi: the VSI being setup + * @ch: ptr to channel structure + * @sw_id: underlying HW switching element ID + * @type: type of channel to be created (VMDq2/VF) + * + * Setup new channel (VSI) based on specified type (VMDq2/VF) + * and configures Tx rings accordingly + */ +static int +ice_setup_hw_channel(struct ice_pf *pf, struct ice_vsi *vsi, + struct ice_channel *ch, u16 sw_id, u8 type) +{ + struct device *dev = ice_pf_to_dev(pf); + int ret; - ice_clear_pxe_mode(hw); + ch->base_q = vsi->next_base_q; + ch->type = type; - ret = ice_get_caps(hw); + ret = ice_add_channel(pf, sw_id, ch); if (ret) { - dev_err(dev, "ice_get_caps failed %d\n", ret); - goto err_init_ctrlq; + dev_err(dev, "failed to add_channel using sw_id %u\n", sw_id); + return ret; } - err = ice_sched_init_port(hw->port_info); - if (err) - goto err_sched_init_port; - - err = ice_update_link_info(hw->port_info); - if (err) - dev_err(&pf->pdev->dev, "Get link status error %d\n", err); + /* configure/setup ADQ specific resources */ + ice_cfg_chnl_all_res(vsi, ch); - /* start misc vector */ - err = ice_req_irq_msix_misc(pf); - if (err) { - dev_err(dev, "misc vector setup failed: %d\n", err); - goto err_sched_init_port; - } + /* make sure to update the next_base_q so that subsequent channel's + * (aka ADQ) VSI queue map is correct + */ + vsi->next_base_q = vsi->next_base_q + ch->num_rxq; + dev_dbg(dev, "added channel: vsi_num %u, num_rxq %u\n", ch->vsi_num, + ch->num_rxq); - if (test_bit(ICE_FLAG_DCB_ENA, pf->flags)) - ice_dcb_rebuild(pf); + return 0; +} - /* rebuild PF VSI */ - err = ice_vsi_rebuild_by_type(pf, ICE_VSI_PF); - if (err) { - dev_err(dev, "PF VSI rebuild failed: %d\n", err); - goto err_vsi_rebuild; - } +/** + * ice_setup_channel - setup new channel using uplink element + * @pf: ptr to PF device + * @vsi: the VSI being setup + * @ch: ptr to channel structure + * + * Setup new channel (VSI) based on specified type (VMDq2/VF) + * and uplink switching element + */ +static bool +ice_setup_channel(struct ice_pf *pf, struct ice_vsi *vsi, + struct ice_channel *ch) +{ + struct device *dev = ice_pf_to_dev(pf); + u16 sw_id; + int ret; - if (test_bit(ICE_FLAG_SRIOV_ENA, pf->flags)) { - err = ice_vsi_rebuild_by_type(pf, ICE_VSI_VF); - if (err) { - dev_err(dev, "VF VSI rebuild failed: %d\n", err); - goto err_vsi_rebuild; - } + if (vsi->type != ICE_VSI_PF) { + dev_err(dev, "unsupported parent VSI type(%d)\n", vsi->type); + return false; } - ice_update_pf_netdev_link(pf); + sw_id = pf->first_sw->sw_id; - /* tell the firmware we are up */ - ret = ice_send_version(pf); + /* create channel (VSI) */ + ret = ice_setup_hw_channel(pf, vsi, ch, sw_id, ICE_VSI_CHNL); if (ret) { - dev_err(dev, - "Rebuild failed due to error sending driver version: %d\n", - ret); - goto err_vsi_rebuild; + dev_err(dev, "failed to setup hw_channel\n"); + return false; } + dev_dbg(dev, "successfully created channel()\n"); - ice_replay_post(hw); + return ch->ch_vsi ? 
true : false; +} - /* if we get here, reset flow is successful */ - clear_bit(__ICE_RESET_FAILED, pf->state); - return; +/** + * ice_set_bw_limit - setup BW limit for Tx traffic based on max_tx_rate + * ice_set_min_bw_limit - setup minimum BW limit for Tx based on min_tx_rate + * @vsi: VSI to be configured + * @max_tx_rate: max Tx rate in Kbps to be configured as maximum BW limit + * @min_tx_rate: min Tx rate in Kbps to be configured as mimimum BW limit + */ +static int +ice_set_bw_limit(struct ice_vsi *vsi, u64 max_tx_rate, u64 min_tx_rate) +{ + int err; -err_vsi_rebuild: -err_sched_init_port: - ice_sched_cleanup_all(hw); -err_init_ctrlq: - ice_shutdown_all_ctrlq(hw); - set_bit(__ICE_RESET_FAILED, pf->state); -clear_recovery: - /* set this bit in PF state to control service task scheduling */ - set_bit(__ICE_NEEDS_RESTART, pf->state); - dev_err(dev, "Rebuild failed, unload and reload driver\n"); + err = ice_set_min_bw_limit(vsi, min_tx_rate); + if (err) + return err; + + err = ice_set_max_bw_limit(vsi, max_tx_rate); + if (err) + return err; + + return 0; } /** - * ice_change_mtu - NDO callback to change the MTU - * @netdev: network interface device structure - * @new_mtu: new value for maximum frame size + * ice_create_q_channel - function to create channel + * @vsi: VSI to be configured + * @ch: ptr to channel (it contains channel specific params) * - * Returns 0 on success, negative on failure + * This function creates channel (VSI) using num_queues specified by user, + * reconfigs RSS if needed. */ -static int ice_change_mtu(struct net_device *netdev, int new_mtu) +static int ice_create_q_channel(struct ice_vsi *vsi, struct ice_channel *ch) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; - u8 count = 0; + struct device *dev; - if (new_mtu == netdev->mtu) { - netdev_warn(netdev, "MTU is already %u\n", netdev->mtu); - return 0; - } + if (!ch) + return -EINVAL; - if (new_mtu < netdev->min_mtu) { - netdev_err(netdev, "new MTU invalid. min_mtu is %d\n", - netdev->min_mtu); + dev = ice_pf_to_dev(pf); + if (!ch->num_txq || !ch->num_rxq) { + dev_err(dev, "Invalid num_queues requested: %d\n", ch->num_rxq); return -EINVAL; - } else if (new_mtu > netdev->max_mtu) { - netdev_err(netdev, "new MTU invalid. max_mtu is %d\n", - netdev->min_mtu); + } + + if (!vsi->cnt_q_avail || vsi->cnt_q_avail < ch->num_txq) { + dev_err(dev, "Error: cnt_q_avail (%u) less than num_queues %d\n", + vsi->cnt_q_avail, ch->num_txq); return -EINVAL; } - /* if a reset is in progress, wait for some time for it to complete */ - do { - if (ice_is_reset_in_progress(pf->state)) { - count++; - usleep_range(1000, 2000); - } else { - break; - } - } while (count < 100); + if (!ice_setup_channel(pf, vsi, ch)) { + dev_info(dev, "Failed to setup channel\n"); + return -EINVAL; + } + /* configure BW rate limit */ + if (ch->ch_vsi && (ch->max_tx_rate || ch->min_tx_rate)) { + int ret; - if (count == 100) { - netdev_err(netdev, "can't change MTU. 
Device is busy\n"); - return -EBUSY; + ret = ice_set_bw_limit(ch->ch_vsi, ch->max_tx_rate, + ch->min_tx_rate); + if (ret) + dev_err(dev, "failed to set Tx rate of %llu Kbps for VSI(%u)\n", + ch->max_tx_rate, ch->ch_vsi->vsi_num); + else + dev_dbg(dev, "set Tx rate of %llu Kbps for VSI(%u)\n", + ch->max_tx_rate, ch->ch_vsi->vsi_num); } - netdev->mtu = new_mtu; + vsi->cnt_q_avail -= ch->num_txq; - /* if VSI is up, bring it down and then back up */ - if (!test_and_set_bit(__ICE_DOWN, vsi->state)) { - int err; + return 0; +} - err = ice_down(vsi); - if (err) { - netdev_err(netdev, "change MTU if_up err %d\n", err); - return err; - } +#ifdef HAVE_TC_SETUP_CLSFLOWER +/** + * ice_rem_all_chnl_fltrs - removes all channel filters + * @pf: ptr to PF, TC-flower based filter are tracked at PF level + * + * Remove all advanced switch filters only if they are channel specific + * tc-flower based filter + */ +static void ice_rem_all_chnl_fltrs(struct ice_pf *pf) +{ + struct ice_tc_flower_fltr *fltr; + struct hlist_node *node2; + + /* to remove all channel filters, iterate an ordered list of filters */ + hlist_for_each_entry_safe(fltr, node2, + &pf->tc_flower_fltr_list, + tc_flower_node) { + struct ice_rule_query_data rule; + enum ice_status status; + + /* for now process only channel specific filters */ + if (!ice_is_chnl_fltr(fltr)) + continue; - err = ice_up(vsi); - if (err) { - netdev_err(netdev, "change MTU if_up err %d\n", err); - return err; + rule.rid = fltr->rid; + rule.rule_id = fltr->rule_id; + rule.vsi_handle = fltr->dest_id; + status = ice_rem_adv_rule_by_id(&pf->hw, &rule); + if (status) { + if (status == ICE_ERR_DOES_NOT_EXIST) + dev_dbg(ice_pf_to_dev(pf), + "TC flower filter (rule_id %u) does not exist\n", + rule.rule_id); + else + dev_err(ice_pf_to_dev(pf), + "failed to delete TC flower filter, status %d\n", + status); + } else if (fltr->dest_vsi) { + /* update advanced switch filter count */ + if (fltr->dest_vsi->type == ICE_VSI_CHNL) { + u32 flags = fltr->flags; + + fltr->dest_vsi->num_chnl_fltr--; + if (flags & (ICE_TC_FLWR_FIELD_DST_MAC | + ICE_TC_FLWR_FIELD_ENC_DST_MAC)) + pf->num_dmac_chnl_fltrs--; + } } - } - netdev_info(netdev, "changed MTU to %d\n", new_mtu); - return 0; + hlist_del(&fltr->tc_flower_node); + kfree(fltr); + } } +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + /** - * ice_set_rss - Set RSS keys and lut - * @vsi: Pointer to VSI structure - * @seed: RSS hash seed - * @lut: Lookup table - * @lut_size: Lookup table size + * ice_remove_q_channels - Remove queue channels for the TCs + * @vsi: VSI to be configured + * @rem_fltr: delete advanced switch filter or not * - * Returns 0 on success, negative on failure + * Remove queue channels for the TCs */ -int ice_set_rss(struct ice_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size) +#ifdef HAVE_TC_SETUP_CLSFLOWER +static void ice_remove_q_channels(struct ice_vsi *vsi, bool rem_fltr) +#else +static void ice_remove_q_channels(struct ice_vsi *vsi, + bool __always_unused rem_fltr) +#endif { + struct ice_channel *ch, *ch_tmp; struct ice_pf *pf = vsi->back; - struct ice_hw *hw = &pf->hw; - enum ice_status status; + int i; - if (seed) { - struct ice_aqc_get_set_rss_keys *buf = - (struct ice_aqc_get_set_rss_keys *)seed; +#ifdef HAVE_TC_SETUP_CLSFLOWER + /* remove all tc-flower based filter if they are channel filters only */ + if (rem_fltr) + ice_rem_all_chnl_fltrs(pf); +#endif /* HAVE_TC_SETUP_CLSFLOWER */ - status = ice_aq_set_rss_key(hw, vsi->idx, buf); + /* perform cleanup for channels if they exist */ + list_for_each_entry_safe(ch, ch_tmp, 
&vsi->ch_list, list) { + struct ice_vsi *ch_vsi; - if (status) { - dev_err(&pf->pdev->dev, - "Cannot set RSS key, err %d aq_err %d\n", - status, hw->adminq.rq_last_status); - return -EIO; + list_del(&ch->list); + ch_vsi = ch->ch_vsi; + if (!ch_vsi) { + kfree(ch); + continue; } - } - if (lut) { - status = ice_aq_set_rss_lut(hw, vsi->idx, vsi->rss_lut_type, - lut, lut_size); - if (status) { - dev_err(&pf->pdev->dev, - "Cannot set RSS lut, err %d aq_err %d\n", - status, hw->adminq.rq_last_status); - return -EIO; + /* Reset queue contexts */ + for (i = 0; i < ch->num_rxq; i++) { + struct ice_ring *tx_ring, *rx_ring; + + tx_ring = vsi->tx_rings[ch->base_q + i]; + rx_ring = vsi->rx_rings[ch->base_q + i]; + if (tx_ring) { + tx_ring->ch = NULL; + if (tx_ring->q_vector) + tx_ring->q_vector->ch = NULL; + } + if (rx_ring) { + rx_ring->ch = NULL; + if (rx_ring->q_vector) + rx_ring->q_vector->ch = NULL; + } } + + /* Release FD resources for the channel VSI */ + ice_fdir_rem_adq_chnl(&pf->hw, ch->ch_vsi->idx); + + /* clear the VSI from schedular tree */ + ice_rm_vsi_lan_cfg(ch->ch_vsi->port_info, ch->ch_vsi->idx); + + /* Delete VSI from FW */ + ice_vsi_delete(ch->ch_vsi); + + /* Delete VSI from PF and HW VSI arrays */ + ice_vsi_clear(ch->ch_vsi); + + /* free the channel */ + kfree(ch); } - return 0; + /* clear the channel VSI map which is stored in main VSI */ + ice_for_each_chnl_tc(i) + vsi->tc_map_vsi[i] = NULL; + + /* reset main VSI's all TC information */ + vsi->all_enatc = 0; + vsi->all_numtc = 0; } /** - * ice_get_rss - Get RSS keys and lut - * @vsi: Pointer to VSI structure - * @seed: Buffer to store the keys - * @lut: Buffer to store the lookup table entries - * @lut_size: Size of buffer to store the lookup table entries + * ice_rebuild_channels - rebuild channel + * @pf: ptr to PF * - * Returns 0 on success, negative on failure + * Recreate channel VSIs and replay filters */ -int ice_get_rss(struct ice_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size) +static int ice_rebuild_channels(struct ice_pf *pf) { - struct ice_pf *pf = vsi->back; - struct ice_hw *hw = &pf->hw; - enum ice_status status; + struct device *dev = ice_pf_to_dev(pf); + struct ice_vsi *main_vsi; + bool rem_adv_fltr = true; + struct ice_channel *ch; + struct ice_vsi *vsi; + int tc_idx = 1; + int i, err; - if (seed) { - struct ice_aqc_get_set_rss_keys *buf = - (struct ice_aqc_get_set_rss_keys *)seed; + main_vsi = ice_get_main_vsi(pf); + if (!main_vsi) + return 0; - status = ice_aq_get_rss_key(hw, vsi->idx, buf); - if (status) { - dev_err(&pf->pdev->dev, - "Cannot get RSS key, err %d aq_err %d\n", - status, hw->adminq.rq_last_status); - return -EIO; - } + if (!test_bit(ICE_FLAG_TC_MQPRIO, pf->flags) || + main_vsi->old_numtc == 1) + return 0; /* nothing to be done */ + + + /* reconfigure main VSI based on old value of TC and cached values + * for MQPRIO opts + */ + err = ice_vsi_cfg_tc(main_vsi, main_vsi->old_ena_tc); + if (err) { + dev_err(dev, "failed configuring TC(ena_tc:0x%02x) for HW VSI=%u\n", + main_vsi->old_ena_tc, main_vsi->vsi_num); + return err; } - if (lut) { - status = ice_aq_get_rss_lut(hw, vsi->idx, vsi->rss_lut_type, - lut, lut_size); - if (status) { - dev_err(&pf->pdev->dev, - "Cannot get RSS lut, err %d aq_err %d\n", - status, hw->adminq.rq_last_status); - return -EIO; + /* rebuild ADQ VSIs */ + ice_for_each_vsi(pf, i) { + enum ice_vsi_type type; + + vsi = pf->vsi[i]; + if (!vsi || vsi->type != ICE_VSI_CHNL) + continue; + + type = vsi->type; + + /* rebuild ADQ VSI */ + err = ice_vsi_rebuild(vsi, true); + if (err) { + 
dev_err(dev, "VSI (type:%s) at index %d rebuild failed, err %d\n", + ice_vsi_type_str(type), vsi->idx, err); + goto cleanup; } + + /* Re-map HW VSI number, using VSI handle that has been + * previously validated in ice_replay_vsi() call above + */ + vsi->vsi_num = ice_get_hw_vsi_num(&pf->hw, vsi->idx); + + /* replay filters for the VSI */ + err = ice_replay_vsi(&pf->hw, vsi->idx); + if (err) { + dev_err(dev, "VSI (type:%s) replay failed, err %d, VSI index %d\n", + ice_vsi_type_str(type), err, vsi->idx); + rem_adv_fltr = false; + goto cleanup; + } + dev_info(dev, "VSI (type:%s) at index %d rebuilt successfully\n", + ice_vsi_type_str(type), vsi->idx); + + /* store ADQ VSI at correct TC index in main VSI's + * map of TC to VSI + */ + main_vsi->tc_map_vsi[tc_idx++] = vsi; + } + + /* ADQ VSI(s) have been rebuilt successfully, so setup + * channel for main VSI's Tx and Rx rings + */ + list_for_each_entry(ch, &main_vsi->ch_list, list) { + struct ice_vsi *ch_vsi; + + ch_vsi = ch->ch_vsi; + if (!ch_vsi) + continue; + + /* reconfig channel resources */ + ice_cfg_chnl_all_res(main_vsi, ch); + + /* replay BW rate limit if it is non-zero */ + if (!ch->max_tx_rate && !ch->min_tx_rate) + continue; + + err = ice_set_bw_limit(ch_vsi, ch->max_tx_rate, + ch->min_tx_rate); + if (err) + dev_err(dev, "failed (err:%d) to rebuild BW rate limit, max_tx_rate: %llu Kbps, min_tx_rate: %llu Kbps for VSI(%u)\n", + err, ch->max_tx_rate, ch->min_tx_rate, + ch_vsi->vsi_num); + else + dev_dbg(dev, "successfully rebuilt BW rate limit, max_tx_rate: %llu Kbps, min_tx_rate: %llu Kbps for VSI(%u)\n", + ch->max_tx_rate, ch->min_tx_rate, + ch_vsi->vsi_num); } + /* reconfig RSS for main VSI */ + if (main_vsi->ch_rss_size) + ice_vsi_cfg_rss_lut_key(main_vsi); + return 0; + +cleanup: + ice_remove_q_channels(main_vsi, rem_adv_fltr); + return err; } /** - * ice_bridge_getlink - Get the hardware bridge mode - * @skb: skb buff - * @pid: process ID - * @seq: RTNL message seq - * @dev: the netdev being configured - * @filter_mask: filter mask passed in - * @nlflags: netlink flags passed in + * ice_cfg_q_channels - Add queue channel for the given TCs + * @vsi: VSI to be configured * - * Return the bridge mode (VEB/VEPA) + * Configures queue channel mapping to the given TCs */ -static int -ice_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, - struct net_device *dev, u32 filter_mask, int nlflags) +static int ice_cfg_q_channels(struct ice_vsi *vsi) { - struct ice_netdev_priv *np = netdev_priv(dev); - struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; - u16 bmode; + struct ice_channel *ch; + int ret = 0, i; - bmode = pf->first_sw->bridge_mode; + ice_for_each_chnl_tc(i) { + if (!(vsi->all_enatc & BIT(i))) + continue; - return ndo_dflt_bridge_getlink(skb, pid, seq, dev, bmode, 0, 0, nlflags, - filter_mask, NULL); + ch = kzalloc(sizeof(*ch), GFP_KERNEL); + if (!ch) { + ret = -ENOMEM; + goto err_free; + } + INIT_LIST_HEAD(&ch->list); + ch->num_rxq = vsi->mqprio_qopt.qopt.count[i]; + ch->num_txq = vsi->mqprio_qopt.qopt.count[i]; + ch->base_q = vsi->mqprio_qopt.qopt.offset[i]; + ch->max_tx_rate = vsi->mqprio_qopt.max_rate[i]; + ch->min_tx_rate = vsi->mqprio_qopt.min_rate[i]; + + /* convert to Kbits/s */ + if (ch->max_tx_rate) + do_div(ch->max_tx_rate, ICE_BW_KBPS_DIVISOR); + if (ch->min_tx_rate) + do_div(ch->min_tx_rate, ICE_BW_KBPS_DIVISOR); + + ret = ice_create_q_channel(vsi, ch); + if (ret) { + dev_err(ice_pf_to_dev(pf), + "failed creating channel TC:%d\n", i); + kfree(ch); + goto err_free; + } + list_add_tail(&ch->list,
&vsi->ch_list); + vsi->tc_map_vsi[i] = ch->ch_vsi; + dev_dbg(ice_pf_to_dev(pf), + "successfully created channel: VSI %pK\n", ch->ch_vsi); + } + return ret; + +err_free: + ice_remove_q_channels(vsi, false); + + return ret; } /** - * ice_vsi_update_bridge_mode - Update VSI for switching bridge mode (VEB/VEPA) - * @vsi: Pointer to VSI structure - * @bmode: Hardware bridge mode (VEB/VEPA) - * - * Returns 0 on success, negative on failure + * ice_setup_tc_qdisc - configure multiple traffic classes + * @netdev: net device to configure + * @type_data: TC offload data */ -static int ice_vsi_update_bridge_mode(struct ice_vsi *vsi, u16 bmode) +static int ice_setup_tc_qdisc(struct net_device *netdev, void *type_data) { - struct device *dev = &vsi->back->pdev->dev; - struct ice_aqc_vsi_props *vsi_props; - struct ice_hw *hw = &vsi->back->hw; - struct ice_vsi_ctx *ctxt; - enum ice_status status; - int ret = 0; + struct tc_mqprio_qopt_offload *mqprio_qopt = type_data; + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + u16 mode, ena_tc_qdisc = 0; + int cur_txq, cur_rxq; + u8 hw = 0, num_tcf; + struct device *dev; + int ret, i; + + dev = ice_pf_to_dev(pf); +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + num_tcf = mqprio_qopt->qopt.num_tc; + hw = mqprio_qopt->qopt.hw; + mode = mqprio_qopt->mode; + if (!hw) { + clear_bit(ICE_FLAG_TC_MQPRIO, pf->flags); + vsi->ch_rss_size = 0; + memcpy(&vsi->mqprio_qopt, mqprio_qopt, sizeof(*mqprio_qopt)); + goto config_tcf; + } + + /* Generate queue region map for number of TCF requested */ + for (i = 0; i < num_tcf; i++) + ena_tc_qdisc |= BIT(i); + + switch (mode) { + case TC_MQPRIO_MODE_DCB: + netdev_err(netdev, "TC_MQPRIO_MODE_DCB not supported yet\n"); + return -EINVAL; + case TC_MQPRIO_MODE_CHANNEL: + if (test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags)) { + netdev_err(netdev, "TC MQPRIO offload not supported,FW LLDP is enabled\n"); + return -EINVAL; + } - vsi_props = &vsi->info; + ret = ice_validate_mqprio_qopt(vsi, mqprio_qopt); + if (ret) { + netdev_err(netdev, "failed to validate_mqprio_qopt(), ret %d\n", + ret); + return ret; + } + memcpy(&vsi->mqprio_qopt, mqprio_qopt, sizeof(*mqprio_qopt)); + set_bit(ICE_FLAG_TC_MQPRIO, pf->flags); + /* don't assume state of hw_tc_offload during driver load + * and set the flag for TC flower filter if hw_tc_offload + * already ON + */ + if (vsi->netdev->features & NETIF_F_HW_TC) + set_bit(ICE_FLAG_CLS_FLOWER, pf->flags); + break; + default: + return -EINVAL; + } - ctxt = devm_kzalloc(dev, sizeof(*ctxt), GFP_KERNEL); - if (!ctxt) - return -ENOMEM; +config_tcf: +#else + num_tcf = tc; +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ - ctxt->info = vsi->info; +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + /* Requesting same TCF configuration as already enabled */ + if (ena_tc_qdisc == vsi->tc_cfg.ena_tc && + mode != TC_MQPRIO_MODE_CHANNEL) + return 0; - if (bmode == BRIDGE_MODE_VEB) - /* change from VEPA to VEB mode */ - ctxt->info.sw_flags |= ICE_AQ_VSI_SW_FLAG_ALLOW_LB; - else - /* change from VEB to VEPA mode */ - ctxt->info.sw_flags &= ~ICE_AQ_VSI_SW_FLAG_ALLOW_LB; - ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SW_VALID); + /* Pause VSI queues */ + ice_dis_vsi(vsi, true); + + if (!hw && !test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) + ice_remove_q_channels(vsi, true); +#else + if (ena_tc_qdisc == vsi->tc_cfg.ena_tc) + return 0; +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + + if (!hw && !test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) { + vsi->req_txq = min_t(int, 
ice_get_avail_txq_count(pf), + num_online_cpus()); + vsi->req_rxq = min_t(int, ice_get_avail_rxq_count(pf), + num_online_cpus()); + } else { + /* logic to rebuild VSI, same like ethtool -L */ + u16 offset = 0, qcount_tx = 0, qcount_rx = 0; + + for (i = 0; i < num_tcf; i++) { + if (!(ena_tc_qdisc & BIT(i))) + continue; + + offset = vsi->mqprio_qopt.qopt.offset[i]; + qcount_rx = vsi->mqprio_qopt.qopt.count[i]; + qcount_tx = vsi->mqprio_qopt.qopt.count[i]; + } + vsi->req_txq = offset + qcount_tx; + vsi->req_rxq = offset + qcount_rx; + + /* store away original rss_size info, so that it gets reused + * form ice_vsi_rebuild during tc-qdisc delete stage - to + * determine, what should be the rss_sizefor main VSI + */ + vsi->orig_rss_size = vsi->rss_size; + } + + /* save current values of Tx and Rx queues before calling VSI rebuild + * for fallback option + */ + cur_txq = vsi->num_txq; + cur_rxq = vsi->num_rxq; + + /* proceed with rebuild main VSI using correct number of queues */ + ret = ice_vsi_rebuild(vsi, false); + if (ret) { + /* fallback to current number of queues */ + dev_info(dev, "Rebuild failed with new queues, try with current number of queues\n"); + vsi->req_txq = cur_txq; + vsi->req_rxq = cur_rxq; + clear_bit(ICE_RESET_FAILED, pf->state); + if (ice_vsi_rebuild(vsi, false)) { + dev_err(dev, "Rebuild of main VSI failed again\n"); + return ret; + } + } + + vsi->all_numtc = num_tcf; + vsi->all_enatc = ena_tc_qdisc; + ret = ice_vsi_cfg_tc(vsi, ena_tc_qdisc); + if (ret) { + netdev_err(netdev, "failed configuring TC for VSI id=%d\n", + vsi->vsi_num); + goto exit; + } + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + if (test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) { + u64 max_tx_rate = vsi->mqprio_qopt.max_rate[0]; + u64 min_tx_rate = vsi->mqprio_qopt.min_rate[0]; + + /* set TC0 rate limit if specified */ + if (max_tx_rate || min_tx_rate) { + /* convert to Kbits/s */ + if (max_tx_rate) + do_div(max_tx_rate, ICE_BW_KBPS_DIVISOR); + if (min_tx_rate) + do_div(min_tx_rate, ICE_BW_KBPS_DIVISOR); + + ret = ice_set_bw_limit(vsi, max_tx_rate, min_tx_rate); + if (!ret) { + dev_dbg(dev, "set Tx rate max %llu min %llu for VSI(%u)\n", + max_tx_rate, min_tx_rate, vsi->vsi_num); + } else { + dev_err(dev, "failed to set Tx rate max %llu min %llu for VSI(%u)\n", + max_tx_rate, min_tx_rate, vsi->vsi_num); + goto exit; + } + } + ret = ice_cfg_q_channels(vsi); + if (ret) { + netdev_err(netdev, "failed configuring queue channels\n"); + goto exit; + } else { + netdev_dbg(netdev, "successfully configured channels\n"); + } + } +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ - status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); - if (status) { - dev_err(dev, "update VSI for bridge mode failed, bmode = %d err %d aq_err %d\n", - bmode, status, hw->adminq.sq_last_status); - ret = -EIO; - goto out; + if (vsi->ch_rss_size) + ice_vsi_cfg_rss_lut_key(vsi); + +exit: + /* if error, reset the all_numtc and all_enatc */ + if (ret) { + vsi->all_numtc = 0; + vsi->all_enatc = 0; } - /* Update sw flags for book keeping */ - vsi_props->sw_flags = ctxt->info.sw_flags; + /* resume VSI */ + ice_ena_vsi(vsi, true); -out: - devm_kfree(dev, ctxt); return ret; } -/** - * ice_bridge_setlink - Set the hardware bridge mode - * @dev: the netdev being configured - * @nlh: RTNL message - * @flags: bridge setlink flags - * @extack: netlink extended ack - * - * Sets the bridge mode (VEB/VEPA) of the switch to which the netdev (VSI) is - * hooked up to. 
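To make the queue accounting in ice_setup_tc_qdisc() above concrete, the following stand-alone sketch (not driver code; the TC layout and the 125 bytes/s-per-Kbit divisor standing in for ICE_BW_KBPS_DIVISOR are illustrative assumptions) mirrors how req_txq/req_rxq are derived from the mqprio count/offset arrays and how the byte-per-second mqprio rates are reduced to Kbps before being programmed:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* hypothetical "mqprio mode channel" request with 3 TCs */
        unsigned int count[3]  = { 8, 4, 4 };   /* queues per TC */
        unsigned int offset[3] = { 0, 8, 12 };  /* first queue of each TC */
        uint64_t tc1_max_rate  = 500000000ULL;  /* bytes/s from mqprio */
        unsigned int i, req_q = 0;

        /* the last enabled TC decides how many queues the PF VSI requests */
        for (i = 0; i < 3; i++)
            req_q = offset[i] + count[i];
        printf("req_txq = req_rxq = %u\n", req_q);  /* prints 16 */

        /* mqprio rates arrive in bytes/s; the driver divides them down to
         * Kbit/s (do_div in-kernel) before handing them to the scheduler
         */
        printf("TC1 max_tx_rate = %llu Kbps\n",
               (unsigned long long)(tc1_max_rate / 125));  /* 4000000 */
        return 0;
    }

When the qdisc is removed (hw == 0), the hunk instead falls back to min(available queues, num_online_cpus()), and the saved orig_rss_size is reused by the later rebuild, as the comment in the hunk notes.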
Iterates through the PF VSI list and sets the loopback mode (if - * not already set for all VSIs connected to this switch. And also update the - * unicast switch filter rules for the corresponding switch of the netdev. - */ +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +static LIST_HEAD(ice_block_cb_list); +#endif + static int -ice_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, - u16 __always_unused flags, - struct netlink_ext_ack __always_unused *extack) +#ifdef HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV +ice_setup_tc(struct net_device *netdev, enum tc_setup_type type, + void *type_data) +#elif defined(HAVE_NDO_SETUP_TC_CHAIN_INDEX) +ice_setup_tc(struct net_device *netdev, u32 __always_unused handle, + u32 __always_unused chain_index, __be16 proto, + struct tc_to_netdev *tc) +#else +ice_setup_tc(struct net_device *netdev, u32 __always_unused handle, + __be16 __always_unused proto, struct tc_to_netdev *tc) +#endif { - struct ice_netdev_priv *np = netdev_priv(dev); +#ifndef HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV + struct tc_cls_flower_offload *cls_flower = tc->cls_flower; + unsigned int type = tc->type; +#elif !defined(HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO) + struct tc_cls_flower_offload *cls_flower = (struct + tc_cls_flower_offload *) + type_data; +#endif /* HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV */ + struct ice_netdev_priv *np = netdev_priv(netdev); +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO struct ice_pf *pf = np->vsi->back; - struct nlattr *attr, *br_spec; - struct ice_hw *hw = &pf->hw; - enum ice_status status; - struct ice_sw *pf_sw; - int rem, v, err = 0; - - pf_sw = pf->first_sw; - /* find the attribute in the netlink message */ - br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); - - nla_for_each_nested(attr, br_spec, rem) { - __u16 mode; + int err; - if (nla_type(attr) != IFLA_BRIDGE_MODE) - continue; - mode = nla_get_u16(attr); - if (mode != BRIDGE_MODE_VEPA && mode != BRIDGE_MODE_VEB) - return -EINVAL; - /* Continue if bridge mode is not being flipped */ - if (mode == pf_sw->bridge_mode) - continue; - /* Iterates through the PF VSI list and update the loopback - * mode of the VSI - */ - ice_for_each_vsi(pf, v) { - if (!pf->vsi[v]) - continue; - err = ice_vsi_update_bridge_mode(pf->vsi[v], mode); - if (err) - return err; + switch (type) { + case TC_SETUP_QDISC_MQPRIO: + /* setup traffic classifier for receive side */ + mutex_lock(&pf->tc_mutex); + if (ice_is_dcb_active(pf)) { + if (pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) { + netdev_err(netdev, + "TC_SETUP_QDISC_MQPRIO not supported when DCB is active, managed by FW LLDP agent\n"); + mutex_unlock(&pf->tc_mutex); + return -EOPNOTSUPP; + } else if (pf->dcbx_cap & DCB_CAP_DCBX_HOST) { + /* if SW LLDP is running and if numtc is more + * than 1, then SW LLDP must have enabled + * multi TC mode - only in that scenario + * stop ADQ config. This change will allow + * co-existence of lldpad (SW LLDP) running + * and ADQ. 
Once ADQ is configured, subsequent + * SW LLDP - via netlink are handled (not + * supported if ADQ is active) + */ + if (np->vsi->tc_cfg.numtc > 1) { + netdev_err(netdev, + "TC_SETUP_QDISC_MQPRIO not supported when DCB is active, managed by SW LLDP agent, num_tc %u\n", + np->vsi->tc_cfg.numtc); + mutex_unlock(&pf->tc_mutex); + return -EOPNOTSUPP; + } + } } - - hw->evb_veb = (mode == BRIDGE_MODE_VEB); - /* Update the unicast switch filter rules for the corresponding - * switch of the netdev - */ - status = ice_update_sw_rule_bridge_mode(hw); - if (status) { - netdev_err(dev, "switch rule update failed, mode = %d err %d aq_err %d\n", - mode, status, hw->adminq.sq_last_status); - /* revert hw->evb_veb */ - hw->evb_veb = (pf_sw->bridge_mode == BRIDGE_MODE_VEB); - return -EIO; +#ifdef HAVE_NETDEV_SB_DEV + if (ice_is_offloaded_macvlan_ena(pf)) { + netdev_err(netdev, "TC_SETUP_QDISC_MQPRIO not supported when MACVLAN offloade support is ON. Turn off MACVLAN offload support thru ethtool and try again\n"); + mutex_unlock(&pf->tc_mutex); + return -EOPNOTSUPP; + } +#endif /* HAVE_NETDEV_SB_DEV */ + if (ice_is_dcf_enabled(pf)) { + netdev_err(netdev, "TC_SETUP_QDISC_MQPRIO not supported when Device Control Functionality is enabled.\n"); + mutex_unlock(&pf->tc_mutex); + return -EOPNOTSUPP; } + if (ice_is_eswitch_mode_switchdev(pf)) { + netdev_err(netdev, "TC MQPRIO offload not supported, switchdev is enabled\n"); + mutex_unlock(&pf->tc_mutex); + return -EOPNOTSUPP; + } + err = ice_setup_tc_qdisc(netdev, type_data); + mutex_unlock(&pf->tc_mutex); + return err; + case TC_SETUP_BLOCK: + return flow_block_cb_setup_simple(type_data, + &ice_block_cb_list, + ice_setup_tc_block_cb, + np, np, true); + default: + return -EOPNOTSUPP; + } +#elif !defined(HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV) || !defined(HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO) - pf_sw->bridge_mode = mode; + switch (type) { + case TC_SETUP_CLSFLOWER: + return ice_setup_tc_cls_flower(np, np->vsi->netdev, cls_flower); + default: + return -EOPNOTSUPP; } +#endif + return -EOPNOTSUPP; +} - return 0; +#ifdef HAVE_TC_INDIR_BLOCK +static struct ice_indr_block_priv * +ice_indr_block_priv_lookup(struct ice_netdev_priv *np, + struct net_device *netdev) +{ + struct ice_indr_block_priv *cb_priv; + + /* All callback list access should be protected by RTNL. 
*/ + ASSERT_RTNL(); + + list_for_each_entry(cb_priv, &np->tc_indr_block_priv_list, list) { + if (!cb_priv->netdev) + return NULL; + if (cb_priv->netdev == netdev) + return cb_priv; + } + return NULL; } -/** - * ice_tx_timeout - Respond to a Tx Hang - * @netdev: network interface device structure - */ -static void ice_tx_timeout(struct net_device *netdev) +static int +ice_indr_setup_block_cb(enum tc_setup_type type, void *type_data, + void *indr_priv) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_ring *tx_ring = NULL; - struct ice_vsi *vsi = np->vsi; - struct ice_pf *pf = vsi->back; - int hung_queue = -1; - u32 i; + struct ice_indr_block_priv *priv = indr_priv; + struct ice_netdev_priv *np = priv->np; + + switch (type) { + case TC_SETUP_CLSFLOWER: + return ice_setup_tc_cls_flower(np, priv->netdev, + (struct flow_cls_offload *) + type_data); + default: + return -EOPNOTSUPP; + } +} - pf->tx_timeout_count++; +static int +#if defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_FLOW_INDIR_BLOCK_QDISC) +ice_indr_setup_tc_block(struct net_device *netdev, struct Qdisc *sch, + struct ice_netdev_priv *np, + struct flow_block_offload *f, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +#elif defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP) +ice_indr_setup_tc_block(struct net_device *netdev, struct ice_netdev_priv *np, + struct flow_block_offload *f, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +#else +ice_indr_setup_tc_block(struct net_device *netdev, struct ice_netdev_priv *np, + struct flow_block_offload *f) +#endif +{ + struct ice_indr_block_priv *indr_priv; +#ifdef HAVE_FLOW_BLOCK_API + struct flow_block_cb *block_cb; +#else + int err = 0; +#endif +#ifdef HAVE_TC_FLOW_INDIR_DEV + int tunnel_type = ice_tc_tun_get_type(netdev, NULL); - /* find the stopped queue the same way dev_watchdog() does */ - for (i = 0; i < netdev->num_tx_queues; i++) { - unsigned long trans_start; - struct netdev_queue *q; + if (tunnel_type != TNL_VXLAN && tunnel_type != TNL_GENEVE && + !(is_vlan_dev(netdev) && + vlan_dev_real_dev(netdev) == np->vsi->netdev)) + return -EOPNOTSUPP; +#endif - q = netdev_get_tx_queue(netdev, i); - trans_start = q->trans_start; - if (netif_xmit_stopped(q) && - time_after(jiffies, - trans_start + netdev->watchdog_timeo)) { - hung_queue = i; - break; + if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) + return -EOPNOTSUPP; + + switch (f->command) { + case FLOW_BLOCK_BIND: + indr_priv = ice_indr_block_priv_lookup(np, netdev); + if (indr_priv) + return -EEXIST; + + indr_priv = devm_kzalloc(&netdev->dev, sizeof(*indr_priv), + GFP_KERNEL); + if (!indr_priv) + return -ENOMEM; + + indr_priv->netdev = netdev; + indr_priv->np = np; + list_add(&indr_priv->list, &np->tc_indr_block_priv_list); + +#ifdef HAVE_FLOW_BLOCK_API +#if defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_FLOW_INDIR_BLOCK_QDISC) + block_cb = + flow_indr_block_cb_alloc(ice_indr_setup_block_cb, + indr_priv, indr_priv, + ice_rep_indr_tc_block_unbind, + f, netdev, sch, data, np, + cleanup); +#elif defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP) + block_cb = + flow_indr_block_cb_alloc(ice_indr_setup_block_cb, + indr_priv, indr_priv, + ice_rep_indr_tc_block_unbind, + f, netdev, data, np, cleanup); +#else + block_cb = flow_block_cb_alloc(ice_indr_setup_block_cb, + indr_priv, indr_priv, + ice_rep_indr_tc_block_unbind); +#endif + if (IS_ERR(block_cb)) { + list_del(&indr_priv->list); + devm_kfree(&netdev->dev, indr_priv); + return 
PTR_ERR(block_cb); + } + flow_block_cb_add(block_cb, f); + list_add_tail(&block_cb->driver_list, &ice_block_cb_list); + return 0; +#else /* !HAVE_FLOW_BLOCK_API */ + err = tcf_block_cb_register(f->block, ice_indr_setup_block_cb, + indr_priv, indr_priv, f->extack); + if (err) { + list_del(&indr_priv->list); + devm_kfree(&netdev->dev, indr_priv); } + return err; +#endif /* !HAVE_FLOW_BLOCK_API */ + case FLOW_BLOCK_UNBIND: + indr_priv = ice_indr_block_priv_lookup(np, netdev); + if (!indr_priv) + return -ENOENT; + +#ifdef HAVE_FLOW_BLOCK_API + block_cb = flow_block_cb_lookup(f->block, + ice_indr_setup_block_cb, + indr_priv); + if (!block_cb) + return -ENOENT; + +#if defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP) + flow_indr_block_cb_remove(block_cb, f); +#else + flow_block_cb_remove(block_cb, f); +#endif /* HAVE_TC_FLOW_INDIR_DEV && HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP */ + list_del(&block_cb->driver_list); + return 0; +#else + tcf_block_cb_unregister(f->block, ice_indr_setup_block_cb, + indr_priv); + list_del(&indr_priv->list); + devm_kfree(&netdev->dev, indr_priv); + return 0; +#endif + default: + return -EOPNOTSUPP; } + return 0; +} - if (i == netdev->num_tx_queues) - netdev_info(netdev, "tx_timeout: no netdev hung queue found\n"); - else - /* now that we have an index, find the tx_ring struct */ - for (i = 0; i < vsi->num_txq; i++) - if (vsi->tx_rings[i] && vsi->tx_rings[i]->desc) - if (hung_queue == vsi->tx_rings[i]->q_index) { - tx_ring = vsi->tx_rings[i]; - break; - } - - /* Reset recovery level if enough time has elapsed after last timeout. - * Also ensure no new reset action happens before next timeout period. - */ - if (time_after(jiffies, (pf->tx_timeout_last_recovery + HZ * 20))) - pf->tx_timeout_recovery_level = 1; - else if (time_before(jiffies, (pf->tx_timeout_last_recovery + - netdev->watchdog_timeo))) - return; +static int +#if defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_FLOW_INDIR_BLOCK_QDISC) +ice_indr_setup_tc_cb(struct net_device *netdev, struct Qdisc *sch, + void *cb_priv, enum tc_setup_type type, void *type_data, + void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +#elif defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP) +ice_indr_setup_tc_cb(struct net_device *netdev, void *cb_priv, + enum tc_setup_type type, void *type_data, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +#else +ice_indr_setup_tc_cb(struct net_device *netdev, void *cb_priv, + enum tc_setup_type type, void *type_data) +#endif +{ + switch (type) { + case TC_SETUP_BLOCK: +#if defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_FLOW_INDIR_BLOCK_QDISC) + return ice_indr_setup_tc_block(netdev, sch, cb_priv, type_data, + data, cleanup); +#elif defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP) + return ice_indr_setup_tc_block(netdev, cb_priv, type_data, data, + cleanup); +#else + return ice_indr_setup_tc_block(netdev, cb_priv, type_data); +#endif - if (tx_ring) { - struct ice_hw *hw = &pf->hw; - u32 head, val = 0; + default: + return -EOPNOTSUPP; + } +} - head = (rd32(hw, QTX_COMM_HEAD(vsi->txq_map[hung_queue])) & - QTX_COMM_HEAD_HEAD_M) >> QTX_COMM_HEAD_HEAD_S; - /* Read interrupt register */ - val = rd32(hw, GLINT_DYN_CTL(tx_ring->q_vector->reg_idx)); +#ifndef HAVE_TC_FLOW_INDIR_DEV +static int +ice_indr_register_block(struct ice_netdev_priv *np, struct net_device *netdev) +{ + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + int err; - netdev_info(netdev, "tx_timeout: VSI_num: %d, Q 
%d, NTC: 0x%x, HW_HEAD: 0x%x, NTU: 0x%x, INT: 0x%x\n", - vsi->vsi_num, hung_queue, tx_ring->next_to_clean, - head, tx_ring->next_to_use, val); + err = __flow_indr_block_cb_register(netdev, np, ice_indr_setup_tc_cb, + np); + if (err) { + dev_err(ice_pf_to_dev(pf), "Failed to register remote block notifier for %s err=%d\n", + netdev_name(netdev), err); } + return err; +} - pf->tx_timeout_last_recovery = jiffies; - netdev_info(netdev, "tx_timeout recovery level %d, hung_queue %d\n", - pf->tx_timeout_recovery_level, hung_queue); +static void +ice_indr_unregister_block(struct ice_netdev_priv *np, struct net_device *netdev) +{ + __flow_indr_block_cb_unregister(netdev, ice_indr_setup_tc_cb, np); +} - switch (pf->tx_timeout_recovery_level) { - case 1: - set_bit(__ICE_PFR_REQ, pf->state); - break; - case 2: - set_bit(__ICE_CORER_REQ, pf->state); - break; - case 3: - set_bit(__ICE_GLOBR_REQ, pf->state); +static void ice_indr_clean_block_privs(struct ice_netdev_priv *np) +{ + struct ice_indr_block_priv *cb_priv, *temp; + struct list_head *head = &np->tc_indr_block_priv_list; + + list_for_each_entry_safe(cb_priv, temp, head, list) { + ice_indr_unregister_block(np, cb_priv->netdev); + devm_kfree(&cb_priv->netdev->dev, cb_priv); + } +} + +static int +ice_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) +{ + struct ice_netdev_priv *np = container_of(nb, struct ice_netdev_priv, + netdevice_nb); + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + int tunnel_type = ice_tc_tun_get_type(netdev, NULL); + + if (tunnel_type != TNL_VXLAN && tunnel_type != TNL_GENEVE && + !(is_vlan_dev(netdev) && + vlan_dev_real_dev(netdev) == np->vsi->netdev)) + return NOTIFY_OK; + + switch (event) { + case NETDEV_REGISTER: + ice_indr_register_block(np, netdev); break; - default: - netdev_err(netdev, "tx_timeout recovery unsuccessful, device is in unrecoverable state.\n"); - set_bit(__ICE_DOWN, pf->state); - set_bit(__ICE_NEEDS_RESTART, vsi->state); - set_bit(__ICE_SERVICE_DIS, pf->state); + case NETDEV_UNREGISTER: + ice_indr_unregister_block(np, netdev); break; } - - ice_service_task_schedule(pf); - pf->tx_timeout_recovery_level++; + return NOTIFY_OK; } +#endif /* HAVE_TC_FLOW_INDIR_DEV */ +#endif /* HAVE_TC_INDIR_BLOCK */ +#endif /* HAVE_TC_SETUP_CLSFLOWER */ /** * ice_open - Called when a network interface becomes active @@ -4737,13 +11374,41 @@ static void ice_tx_timeout(struct net_device *netdev) * Returns 0 on success, negative value on failure */ int ice_open(struct net_device *netdev) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_pf *pf = np->vsi->back; + + if (ice_is_reset_in_progress(pf->state)) { + netdev_err(netdev, "can't open net device while reset is in progress"); + return -EBUSY; + } + + return ice_open_internal(netdev); +} + +/** + * ice_open_internal - Called when a network interface becomes active + * @netdev: network interface device structure + * + * Internal ice_open implementation. 
Should not be used directly except for ice_open and reset + * handling routine + * + * Returns 0 on success, negative value on failure + */ +int ice_open_internal(struct net_device *netdev) { struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; struct ice_port_info *pi; + enum ice_status status; int err; - if (test_bit(__ICE_NEEDS_RESTART, vsi->back->state)) { + /* disallow open if eeprom is corrupted */ + if (test_bit(ICE_BAD_EEPROM, pf->state)) + return -EOPNOTSUPP; + + if (test_bit(ICE_NEEDS_RESTART, pf->state)) { netdev_err(netdev, "driver needs to be unloaded and reloaded\n"); return -EIO; } @@ -4751,36 +11416,59 @@ int ice_open(struct net_device *netdev) netif_carrier_off(netdev); pi = vsi->port_info; - err = ice_update_link_info(pi); - if (err) { - netdev_err(netdev, "Failed to get link info, error %d\n", - err); - return err; + status = ice_update_link_info(pi); + if (status) { + netdev_err(netdev, "Failed to get link info, error %s\n", + ice_stat_str(status)); + return -EIO; } + ice_check_module_power(pf, pi->phy.link_info.link_cfg_err); + /* Set PHY if there is media, otherwise, turn off PHY */ if (pi->phy.link_info.link_info & ICE_AQ_MEDIA_AVAILABLE) { - err = ice_force_phys_link_state(vsi, true); + clear_bit(ICE_FLAG_NO_MEDIA, pf->flags); + if (!test_bit(ICE_PHY_INIT_COMPLETE, pf->state)) { + err = ice_init_phy_user_cfg(pi); + if (err) { + netdev_err(netdev, "Failed to initialize PHY settings, error %d\n", + err); + return err; + } + } + + err = ice_configure_phy(vsi); if (err) { - netdev_err(netdev, - "Failed to set physical link up, error %d\n", + netdev_err(netdev, "Failed to set physical link up, error %d\n", err); return err; } } else { - err = ice_aq_set_link_restart_an(pi, false, NULL); - if (err) { - netdev_err(netdev, "Failed to set PHY state, VSI %d error %d\n", - vsi->vsi_num, err); - return err; - } - set_bit(ICE_FLAG_NO_MEDIA, vsi->back->flags); + set_bit(ICE_FLAG_NO_MEDIA, pf->flags); + ice_set_link(vsi, false); } err = ice_vsi_open(vsi); if (err) netdev_err(netdev, "Failed to open VSI 0x%04X on switch 0x%04X\n", vsi->vsi_num, vsi->vsw->sw_id); + + /* Update existing tunnels information */ +#ifdef HAVE_UDP_ENC_RX_OFFLOAD + udp_tunnel_get_rx_info(netdev); +#else /* HAVE_UDP_ENC_RX_OFFLOAD */ +#ifdef HAVE_VXLAN_RX_OFFLOAD +#if IS_ENABLED(CONFIG_VXLAN) + vxlan_get_rx_port(netdev); +#endif +#endif /* HAVE_VXLAN_RX_OFFLOAD */ +#ifdef HAVE_GENEVE_RX_OFFLOAD +#if IS_ENABLED(CONFIG_GENEVE) + geneve_get_rx_port(netdev); +#endif +#endif /* HAVE_GENEVE_RX_OFFLOAD */ +#endif /* HAVE_UDP_ENC_RX_OFFLOAD */ + return err; } @@ -4798,12 +11486,19 @@ int ice_stop(struct net_device *netdev) { struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + + if (ice_is_reset_in_progress(pf->state)) { + netdev_err(netdev, "can't stop net device while reset is in progress"); + return -EBUSY; + } ice_vsi_close(vsi); return 0; } +#ifdef HAVE_NDO_FEATURES_CHECK /** * ice_features_check - Validate encapsulated packet conforms to limits * @skb: skb buffer @@ -4831,21 +11526,21 @@ ice_features_check(struct sk_buff *skb, features &= ~NETIF_F_GSO_MASK; len = skb_network_header(skb) - skb->data; - if (len & ~(ICE_TXD_MACLEN_MAX)) + if (len > ICE_TXD_MACLEN_MAX || len & 0x1) goto out_rm_features; len = skb_transport_header(skb) - skb_network_header(skb); - if (len & ~(ICE_TXD_IPLEN_MAX)) + if (len > ICE_TXD_IPLEN_MAX || len & 0x1) goto out_rm_features; if (skb->encapsulation) { len = 
skb_inner_network_header(skb) - skb_transport_header(skb); - if (len & ~(ICE_TXD_L4LEN_MAX)) + if (len > ICE_TXD_L4LEN_MAX || len & 0x1) goto out_rm_features; len = skb_inner_transport_header(skb) - skb_inner_network_header(skb); - if (len & ~(ICE_TXD_IPLEN_MAX)) + if (len > ICE_TXD_IPLEN_MAX || len & 0x1) goto out_rm_features; } @@ -4853,6 +11548,7 @@ ice_features_check(struct sk_buff *skb, out_rm_features: return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); } +#endif /* HAVE_NDO_FEATURES_CHECK */ static const struct net_device_ops ice_netdev_safe_mode_ops = { .ndo_open = ice_open, @@ -4860,7 +11556,11 @@ static const struct net_device_ops ice_netdev_safe_mode_ops = { .ndo_start_xmit = ice_start_xmit, .ndo_set_mac_address = ice_set_mac_address, .ndo_validate_addr = eth_validate_addr, +#ifdef HAVE_RHEL7_EXTENDED_MIN_MAX_MTU + .extended.ndo_change_mtu = ice_change_mtu, +#else .ndo_change_mtu = ice_change_mtu, +#endif .ndo_get_stats64 = ice_get_stats64, .ndo_tx_timeout = ice_tx_timeout, }; @@ -4869,24 +11569,127 @@ static const struct net_device_ops ice_netdev_ops = { .ndo_open = ice_open, .ndo_stop = ice_stop, .ndo_start_xmit = ice_start_xmit, +#ifdef HAVE_NDO_FEATURES_CHECK .ndo_features_check = ice_features_check, +#endif /* HAVE_NDO_FEATURES_CHECK */ + .ndo_fix_features = ice_fix_features, .ndo_set_rx_mode = ice_set_rx_mode, .ndo_set_mac_address = ice_set_mac_address, .ndo_validate_addr = eth_validate_addr, +#ifdef HAVE_RHEL7_EXTENDED_MIN_MAX_MTU + .extended.ndo_change_mtu = ice_change_mtu, +#else .ndo_change_mtu = ice_change_mtu, +#endif .ndo_get_stats64 = ice_get_stats64, +#ifdef HAVE_NETPOLL_CONTROLLER +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = ice_netpoll, +#endif /* CONFIG_NET_POLL_CONTROLLER */ +#endif /* HAVE_NETPOLL_CONTROLLER */ +#ifdef HAVE_NDO_SET_TX_MAXRATE +#ifdef HAVE_RHEL7_EXTENDED_NDO_SET_TX_MAXRATE + .extended.ndo_set_tx_maxrate = ice_set_tx_maxrate, +#else + .ndo_set_tx_maxrate = ice_set_tx_maxrate, +#endif /* HAVE_RHEL7_EXTENDED_NDO_SET_TX_MAXRATE */ +#endif /* HAVE_NDO_SET_TX_MAXRATE */ + .ndo_do_ioctl = ice_do_ioctl, .ndo_set_vf_spoofchk = ice_set_vf_spoofchk, +#ifdef HAVE_NDO_SET_VF_TRUST .ndo_set_vf_mac = ice_set_vf_mac, .ndo_get_vf_config = ice_get_vf_cfg, +#ifdef HAVE_RHEL7_NET_DEVICE_OPS_EXT + /* RHEL7 requires ndo_size to be defined to enable extended ops */ + .ndo_size = sizeof(const struct net_device_ops), + .extended.ndo_set_vf_trust = ice_set_vf_trust, +#else .ndo_set_vf_trust = ice_set_vf_trust, +#endif /* HAVE_RHEL7_NET_DEVICE_OPS_EXT */ +#endif /* HAVE_NDO_SET_VF_TRUST */ +#ifdef HAVE_RHEL7_NETDEV_OPS_EXT_NDO_SET_VF_VLAN + .extended.ndo_set_vf_vlan = ice_set_vf_port_vlan, +#else .ndo_set_vf_vlan = ice_set_vf_port_vlan, +#endif /* HAVE_RHEL7_NETDEV_OPS_EXT_NDO_SET_VF_VLAN */ +#ifdef HAVE_NDO_SET_VF_LINK_STATE .ndo_set_vf_link_state = ice_set_vf_link_state, +#endif +#ifdef HAVE_VF_STATS + .ndo_get_vf_stats = ice_get_vf_stats, +#endif /* HAVE_VF_STATS */ +#ifdef HAVE_NDO_SET_VF_MIN_MAX_TX_RATE + .ndo_set_vf_rate = ice_set_vf_bw, +#else + .ndo_set_vf_tx_rate = ice_set_vf_bw, +#endif .ndo_vlan_rx_add_vid = ice_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = ice_vlan_rx_kill_vid, +#ifdef HAVE_TC_SETUP_CLSFLOWER +#ifdef HAVE_RHEL7_NETDEV_OPS_EXT_NDO_SETUP_TC + .extended.ndo_setup_tc_rh = ice_setup_tc, +#else + .ndo_setup_tc = ice_setup_tc, +#endif /* HAVE_RHEL7_NETDEV_OPS_EXT_NDO_SETUP_TC */ +#endif /* HAVE_TC_SETUP_CLSFLOWER */ .ndo_set_features = ice_set_features, .ndo_bridge_getlink = ice_bridge_getlink, .ndo_bridge_setlink = 
ice_bridge_setlink, .ndo_fdb_add = ice_fdb_add, .ndo_fdb_del = ice_fdb_del, +#ifdef CONFIG_RFS_ACCEL + .ndo_rx_flow_steer = ice_rx_flow_steer, +#endif .ndo_tx_timeout = ice_tx_timeout, +#ifdef HAVE_XDP_SUPPORT +#ifdef HAVE_NDO_BPF + .ndo_bpf = ice_xdp, +#else + .ndo_xdp = ice_xdp, +#endif /* HAVE_NDO_BPF */ + .ndo_xdp_xmit = ice_xdp_xmit, +#ifndef NO_NDO_XDP_FLUSH + .ndo_xdp_flush = ice_xdp_flush, +#endif /* !NO_NDO_XDP_FLUSH */ +#ifdef HAVE_AF_XDP_ZC_SUPPORT +#ifdef HAVE_NDO_XSK_WAKEUP + .ndo_xsk_wakeup = ice_xsk_wakeup, +#else + .ndo_xsk_async_xmit = ice_xsk_async_xmit, +#endif /* HAVE_NDO_XSK_WAKEUP */ +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ +#endif /* HAVE_XDP_SUPPORT */ +#ifdef HAVE_UDP_ENC_RX_OFFLOAD +#ifdef HAVE_RHEL7_NETDEV_OPS_EXT_NDO_UDP_TUNNEL + .extended.ndo_udp_tunnel_add = ice_udp_tunnel_add, + .extended.ndo_udp_tunnel_del = ice_udp_tunnel_del, +#else +#ifndef HAVE_UDP_TUNNEL_NIC_INFO + .ndo_udp_tunnel_add = ice_udp_tunnel_add, + .ndo_udp_tunnel_del = ice_udp_tunnel_del, +#endif /* !HAVE_UDP_TUNNEL_NIC_INFO */ +#endif +#else /* !HAVE_UDP_ENC_RX_OFFLOAD */ +#ifdef HAVE_VXLAN_RX_OFFLOAD +#if IS_ENABLED(CONFIG_VXLAN) + .ndo_add_vxlan_port = ice_add_vxlan_port, + .ndo_del_vxlan_port = ice_del_vxlan_port, +#endif +#endif /* HAVE_VXLAN_RX_OFFLOAD */ +#ifdef HAVE_GENEVE_RX_OFFLOAD +#if IS_ENABLED(CONFIG_GENEVE) + .ndo_add_geneve_port = ice_add_geneve_port, + .ndo_del_geneve_port = ice_del_geneve_port, +#endif +#endif /* HAVE_GENEVE_RX_OFFLOAD */ +#endif /* HAVE_UDP_ENC_RX_OFFLOAD */ +#ifdef HAVE_NETDEV_SB_DEV +#ifdef HAVE_RHEL7_NET_DEVICE_OPS_EXT + .extended.ndo_dfwd_add_station = ice_fwd_add_macvlan, + .extended.ndo_dfwd_del_station = ice_fwd_del_macvlan, +#else + .ndo_dfwd_add_station = ice_fwd_add_macvlan, + .ndo_dfwd_del_station = ice_fwd_del_macvlan, +#endif /* HAVE_RHEL7_NET_DEVICE_OPS_EXT */ +#endif /* HAVE_NETDEV_SB_DEV */ }; diff --git a/drivers/net/ethernet/intel/ice/ice_nvm.c b/drivers/net/ethernet/intel/ice/ice_nvm.c index bcb431f1bd92..51ebb17dac85 100644 --- a/drivers/net/ethernet/intel/ice/ice_nvm.c +++ b/drivers/net/ethernet/intel/ice/ice_nvm.c @@ -1,8 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #include "ice_common.h" + /** * ice_aq_read_nvm * @hw: pointer to the HW struct @@ -11,25 +12,29 @@ * @length: length of the section to be read (in bytes from the offset) * @data: command buffer (size [bytes] = length) * @last_command: tells if this is the last command in a series + * @read_shadow_ram: tell if this is a shadow RAM read * @cd: pointer to command details structure or NULL * * Read the NVM using the admin queue commands (0x0701) */ static enum ice_status ice_aq_read_nvm(struct ice_hw *hw, u16 module_typeid, u32 offset, u16 length, - void *data, bool last_command, struct ice_sq_cd *cd) + void *data, bool last_command, bool read_shadow_ram, + struct ice_sq_cd *cd) { struct ice_aq_desc desc; struct ice_aqc_nvm *cmd; cmd = &desc.params.nvm; - /* In offset the highest byte must be zeroed. */ - if (offset & 0xFF000000) + if (offset > ICE_AQC_NVM_MAX_OFFSET) return ICE_ERR_PARAM; ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_read); + if (!read_shadow_ram && module_typeid == ICE_AQC_NVM_START_POINT) + cmd->cmd_flags |= ICE_AQC_NVM_FLASH_ONLY; + /* If this is the last command in a series, set the proper flag. 
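The new offset guard in ice_aq_read_nvm() above replaces the old "offset & 0xFF000000" test with a comparison against ICE_AQC_NVM_MAX_OFFSET; both express that the NVM admin queue commands carry only a 24-bit byte offset, split into a 16-bit offset_low and an 8-bit offset_high field (the split is visible in ice_aq_update_nvm() further down). A small sketch of that encoding, assuming the limit is 0xFFFFFF:

    #include <stdio.h>

    int main(void)
    {
        unsigned int offset = 0x012345;   /* 24-bit flash byte offset */
        unsigned int offset_low, offset_high;

        if (offset > 0xFFFFFF)            /* assumed ICE_AQC_NVM_MAX_OFFSET */
            return 1;                     /* driver returns ICE_ERR_PARAM */

        offset_low  = offset & 0xFFFF;          /* -> 0x2345, 16-bit field */
        offset_high = (offset >> 16) & 0xFF;    /* -> 0x01, 8-bit field */

        printf("offset_low=0x%04x offset_high=0x%02x\n",
               offset_low, offset_high);
        return 0;
    }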
*/ if (last_command) cmd->cmd_flags |= ICE_AQC_NVM_LAST_CMD; @@ -42,173 +47,406 @@ ice_aq_read_nvm(struct ice_hw *hw, u16 module_typeid, u32 offset, u16 length, } /** - * ice_check_sr_access_params - verify params for Shadow RAM R/W operations. - * @hw: pointer to the HW structure - * @offset: offset in words from module start - * @words: number of words to access + * ice_read_flat_nvm - Read portion of NVM by flat offset + * @hw: pointer to the HW struct + * @offset: offset from beginning of NVM + * @length: (in) number of bytes to read; (out) number of bytes actually read + * @data: buffer to return data in (sized to fit the specified length) + * @read_shadow_ram: if true, read from shadow RAM instead of NVM + * + * Reads a portion of the NVM, as a flat memory space. This function correctly + * breaks read requests across Shadow RAM sectors and ensures that no single + * read request exceeds the maximum 4KB read for a single AdminQ command. + * + * Returns a status code on failure. Note that the data pointer may be + * partially updated if some reads succeed before a failure. */ -static enum ice_status -ice_check_sr_access_params(struct ice_hw *hw, u32 offset, u16 words) +enum ice_status +ice_read_flat_nvm(struct ice_hw *hw, u32 offset, u32 *length, u8 *data, + bool read_shadow_ram) { - if ((offset + words) > hw->nvm.sr_words) { - ice_debug(hw, ICE_DBG_NVM, - "NVM error: offset beyond SR lmt.\n"); - return ICE_ERR_PARAM; - } + enum ice_status status; + u32 inlen = *length; + u32 bytes_read = 0; + bool last_cmd; + + *length = 0; - if (words > ICE_SR_SECTOR_SIZE_IN_WORDS) { - /* We can access only up to 4KB (one sector), in one AQ write */ - ice_debug(hw, ICE_DBG_NVM, - "NVM error: tried to access %d words, limit is %d.\n", - words, ICE_SR_SECTOR_SIZE_IN_WORDS); + /* Verify the length of the read if this is for the Shadow RAM */ + if (read_shadow_ram && ((offset + inlen) > (hw->flash.sr_words * 2u))) { + ice_debug(hw, ICE_DBG_NVM, "NVM error: requested data is beyond Shadow RAM limit\n"); return ICE_ERR_PARAM; } - if (((offset + (words - 1)) / ICE_SR_SECTOR_SIZE_IN_WORDS) != - (offset / ICE_SR_SECTOR_SIZE_IN_WORDS)) { - /* A single access cannot spread over two sectors */ - ice_debug(hw, ICE_DBG_NVM, - "NVM error: cannot spread over two sectors.\n"); + do { + u32 read_size, sector_offset; + + /* ice_aq_read_nvm cannot read more than 4KB at a time. + * Additionally, a read from the Shadow RAM may not cross over + * a sector boundary. Conveniently, the sector size is also + * 4KB. + */ + sector_offset = offset % ICE_AQ_MAX_BUF_LEN; + read_size = min_t(u32, ICE_AQ_MAX_BUF_LEN - sector_offset, + inlen - bytes_read); + + last_cmd = !(bytes_read + read_size < inlen); + + /* ice_aq_read_nvm takes the length as a u16. Our read_size is + * calculated using a u32, but the ICE_AQ_MAX_BUF_LEN maximum + * size guarantees that it will fit within the 2 bytes. 
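Because ice_read_flat_nvm() above never crosses a 4 KB sector boundary in a single AdminQ read, a long read decomposes into predictable chunks. A user-space sketch of that chunking loop (illustrative only; 4096 stands in for ICE_AQ_MAX_BUF_LEN, and the start offset and length are made-up values):

    #include <stdio.h>

    #define SECTOR 4096u    /* stands in for ICE_AQ_MAX_BUF_LEN */

    int main(void)
    {
        unsigned int offset = 0x0F80, inlen = 10000, bytes_read = 0;

        while (bytes_read < inlen) {
            unsigned int sector_offset = offset % SECTOR;
            unsigned int room = SECTOR - sector_offset;
            unsigned int left = inlen - bytes_read;
            unsigned int read_size = room < left ? room : left;
            int last_cmd = !(bytes_read + read_size < inlen);

            printf("AQ read at 0x%05x, %u bytes%s\n",
                   offset, read_size, last_cmd ? " (last)" : "");
            bytes_read += read_size;
            offset += read_size;
        }
        /* prints chunks of 128, 4096, 4096 and 1680 bytes */
        return 0;
    }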
+ */ + status = ice_aq_read_nvm(hw, ICE_AQC_NVM_START_POINT, + offset, (u16)read_size, + data + bytes_read, last_cmd, + read_shadow_ram, NULL); + if (status) + break; + + bytes_read += read_size; + offset += read_size; + } while (!last_cmd); + + *length = bytes_read; + return status; +} + +/** + * ice_aq_update_nvm + * @hw: pointer to the HW struct + * @module_typeid: module pointer location in words from the NVM beginning + * @offset: byte offset from the module beginning + * @length: length of the section to be written (in bytes from the offset) + * @data: command buffer (size [bytes] = length) + * @last_command: tells if this is the last command in a series + * @command_flags: command parameters + * @cd: pointer to command details structure or NULL + * + * Update the NVM using the admin queue commands (0x0703) + */ +enum ice_status +ice_aq_update_nvm(struct ice_hw *hw, u16 module_typeid, u32 offset, + u16 length, void *data, bool last_command, u8 command_flags, + struct ice_sq_cd *cd) +{ + struct ice_aq_desc desc; + struct ice_aqc_nvm *cmd; + + cmd = &desc.params.nvm; + + /* In offset the highest byte must be zeroed. */ + if (offset & 0xFF000000) return ICE_ERR_PARAM; - } - return 0; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_write); + + cmd->cmd_flags |= command_flags; + + /* If this is the last command in a series, set the proper flag. */ + if (last_command) + cmd->cmd_flags |= ICE_AQC_NVM_LAST_CMD; + cmd->module_typeid = cpu_to_le16(module_typeid); + cmd->offset_low = cpu_to_le16(offset & 0xFFFF); + cmd->offset_high = (offset >> 16) & 0xFF; + cmd->length = cpu_to_le16(length); + + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + return ice_aq_send_cmd(hw, &desc, data, length, cd); } /** - * ice_read_sr_aq - Read Shadow RAM. - * @hw: pointer to the HW structure - * @offset: offset in words from module start - * @words: number of words to read - * @data: buffer for words reads from Shadow RAM - * @last_command: tells the AdminQ that this is the last command + * ice_aq_erase_nvm + * @hw: pointer to the HW struct + * @module_typeid: module pointer location in words from the NVM beginning + * @cd: pointer to command details structure or NULL * - * Reads 16-bit word buffers from the Shadow RAM using the admin command. + * Erase the NVM sector using the admin queue commands (0x0702) */ -static enum ice_status -ice_read_sr_aq(struct ice_hw *hw, u32 offset, u16 words, u16 *data, - bool last_command) +enum ice_status +ice_aq_erase_nvm(struct ice_hw *hw, u16 module_typeid, struct ice_sq_cd *cd) { + struct ice_aq_desc desc; + struct ice_aqc_nvm *cmd; enum ice_status status; + __le16 len; - status = ice_check_sr_access_params(hw, offset, words); + /* read a length value from SR, so module_typeid is equal to 0 */ + /* calculate offset where module size is placed from bytes to words */ + /* set last command and read from SR values to true */ + status = ice_aq_read_nvm(hw, 0, 2 * module_typeid + 2, 2, &len, true, + true, NULL); + if (status) + return status; - /* values in "offset" and "words" parameters are sized as words - * (16 bits) but ice_aq_read_nvm expects these values in bytes. - * So do this conversion while calling ice_aq_read_nvm. 
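ice_read_sr_word_aq() above now goes through the flat, byte-addressed reader, so the Shadow RAM word offset is scaled to bytes and the returned word is converted from little-endian. The same byte-addressed convention is why ice_aq_erase_nvm() above fetches the module size word at byte offset 2 * module_typeid + 2. A minimal illustration of the two conversion steps (the word offset and raw bytes are made-up values):

    #include <stdio.h>

    int main(void)
    {
        unsigned int word_offset = 0x48;             /* hypothetical SR word offset */
        unsigned int byte_offset = word_offset * 2;  /* offset * sizeof(u16) */
        unsigned char raw[2] = { 0x34, 0x12 };       /* bytes as read from flash */
        unsigned int value = raw[0] | (raw[1] << 8); /* le16_to_cpu() in-kernel */

        printf("SR[0x%04x] @ byte 0x%04x = 0x%04x\n",
               word_offset, byte_offset, value);     /* ... = 0x1234 */
        return 0;
    }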
- */ - if (!status) - status = ice_aq_read_nvm(hw, 0, 2 * offset, 2 * words, data, - last_command, NULL); + cmd = &desc.params.nvm; - return status; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_erase); + + cmd->module_typeid = cpu_to_le16(module_typeid); + cmd->length = len; + cmd->offset_low = 0; + cmd->offset_high = 0; + + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); } + /** * ice_read_sr_word_aq - Reads Shadow RAM via AQ * @hw: pointer to the HW structure * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF) * @data: word read from the Shadow RAM * - * Reads one 16 bit word from the Shadow RAM using the ice_read_sr_aq method. + * Reads one 16 bit word from the Shadow RAM using ice_read_flat_nvm. */ static enum ice_status ice_read_sr_word_aq(struct ice_hw *hw, u16 offset, u16 *data) { + u32 bytes = sizeof(u16); enum ice_status status; + __le16 data_local; - status = ice_read_sr_aq(hw, offset, 1, data, true); - if (!status) - *data = le16_to_cpu(*(__force __le16 *)data); + /* Note that ice_read_flat_nvm checks if the read is past the Shadow + * RAM size, and ensures we don't read across a Shadow RAM sector + * boundary + */ + status = ice_read_flat_nvm(hw, offset * sizeof(u16), &bytes, + (__force u8 *)&data_local, true); + if (status) + return status; - return status; + *data = le16_to_cpu(data_local); + return 0; } + /** - * ice_read_sr_buf_aq - Reads Shadow RAM buf via AQ + * ice_acquire_nvm - Generic request for acquiring the NVM ownership * @hw: pointer to the HW structure - * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF) - * @words: (in) number of words to read; (out) number of words actually read - * @data: words read from the Shadow RAM + * @access: NVM access type (read or write) + * + * This function will request NVM ownership. + */ +enum ice_status +ice_acquire_nvm(struct ice_hw *hw, enum ice_aq_res_access_type access) +{ + if (hw->flash.blank_nvm_mode) + return 0; + + return ice_acquire_res(hw, ICE_NVM_RES_ID, access, ICE_NVM_TIMEOUT); +} + +/** + * ice_release_nvm - Generic request for releasing the NVM ownership + * @hw: pointer to the HW structure + * + * This function will release NVM ownership. + */ +void ice_release_nvm(struct ice_hw *hw) +{ + if (hw->flash.blank_nvm_mode) + return; + + ice_release_res(hw, ICE_NVM_RES_ID); +} + +/** + * ice_get_flash_bank_offset - Get offset into requested flash bank + * @hw: pointer to the HW structure + * @bank: whether to read from the active or inactive flash bank + * @module: the module to read from + * + * Based on the module, lookup the module offset from the beginning of the + * flash. + * + * Returns the flash offset. Note that a value of zero is invalid and must be + * treated as an error. 
+ */ +static u32 ice_get_flash_bank_offset(struct ice_hw *hw, enum ice_bank_select bank, u16 module) +{ + struct ice_bank_info *banks = &hw->flash.banks; + enum ice_flash_bank active_bank; + bool second_bank_active; + u32 offset, size; + + switch (module) { + case ICE_SR_1ST_NVM_BANK_PTR: + offset = banks->nvm_ptr; + size = banks->nvm_size; + active_bank = banks->nvm_bank; + break; + case ICE_SR_1ST_OROM_BANK_PTR: + offset = banks->orom_ptr; + size = banks->orom_size; + active_bank = banks->orom_bank; + break; + case ICE_SR_NETLIST_BANK_PTR: + offset = banks->netlist_ptr; + size = banks->netlist_size; + active_bank = banks->netlist_bank; + break; + default: + ice_debug(hw, ICE_DBG_NVM, "Unexpected value for flash module: 0x%04x\n", module); + return 0; + } + + switch (active_bank) { + case ICE_1ST_FLASH_BANK: + second_bank_active = false; + break; + case ICE_2ND_FLASH_BANK: + second_bank_active = true; + break; + default: + ice_debug(hw, ICE_DBG_NVM, "Unexpected value for active flash bank: %u\n", + active_bank); + return 0; + } + + /* The second flash bank is stored immediately following the first + * bank. Based on whether the 1st or 2nd bank is active, and whether + * we want the active or inactive bank, calculate the desired offset. + */ + switch (bank) { + case ICE_ACTIVE_FLASH_BANK: + return offset + (second_bank_active ? size : 0); + case ICE_INACTIVE_FLASH_BANK: + return offset + (second_bank_active ? 0 : size); + } + + ice_debug(hw, ICE_DBG_NVM, "Unexpected value for flash bank selection: %u\n", bank); + return 0; +} + +/** + * ice_read_flash_module - Read a word from one of the main NVM modules + * @hw: pointer to the HW structure + * @bank: which bank of the module to read + * @module: the module to read + * @offset: the offset into the module in bytes + * @data: storage for the word read from the flash + * @length: bytes of data to read + * + * Read data from the specified flash module. The bank parameter indicates + * whether or not to read from the active bank or the inactive bank of that + * module. * - * Reads 16 bit words (data buf) from the SR using the ice_read_sr_aq - * method. Ownership of the NVM is taken before reading the buffer and later - * released. + * The word will be read using flat NVM access, and relies on the + * hw->flash.banks data being setup by ice_determine_active_flash_banks() + * during initialization. */ static enum ice_status -ice_read_sr_buf_aq(struct ice_hw *hw, u16 offset, u16 *words, u16 *data) +ice_read_flash_module(struct ice_hw *hw, enum ice_bank_select bank, u16 module, + u32 offset, u8 *data, u32 length) { enum ice_status status; - bool last_cmd = false; - u16 words_read = 0; - u16 i = 0; + u32 start; - do { - u16 read_size, off_w; + start = ice_get_flash_bank_offset(hw, bank, module); + if (!start) { + ice_debug(hw, ICE_DBG_NVM, "Unable to calculate flash bank offset for module 0x%04x\n", + module); + return ICE_ERR_PARAM; + } - /* Calculate number of bytes we should read in this step. - * It's not allowed to read more than one page at a time or - * to cross page boundaries. - */ - off_w = offset % ICE_SR_SECTOR_SIZE_IN_WORDS; - read_size = off_w ? 
- min_t(u16, *words, - (ICE_SR_SECTOR_SIZE_IN_WORDS - off_w)) : - min_t(u16, (*words - words_read), - ICE_SR_SECTOR_SIZE_IN_WORDS); - - /* Check if this is last command, if so set proper flag */ - if ((words_read + read_size) >= *words) - last_cmd = true; - - status = ice_read_sr_aq(hw, offset, read_size, - data + words_read, last_cmd); - if (status) - goto read_nvm_buf_aq_exit; + status = ice_acquire_nvm(hw, ICE_RES_READ); + if (status) + return status; - /* Increment counter for words already read and move offset to - * new read location - */ - words_read += read_size; - offset += read_size; - } while (words_read < *words); + status = ice_read_flat_nvm(hw, start + offset, &length, data, false); - for (i = 0; i < *words; i++) - data[i] = le16_to_cpu(((__force __le16 *)data)[i]); + ice_release_nvm(hw); -read_nvm_buf_aq_exit: - *words = words_read; return status; } /** - * ice_acquire_nvm - Generic request for acquiring the NVM ownership + * ice_read_nvm_module - Read from the active main NVM module * @hw: pointer to the HW structure - * @access: NVM access type (read or write) + * @bank: whether to read from active or inactive NVM module + * @offset: offset into the NVM module to read, in words + * @data: storage for returned word value * - * This function will request NVM ownership. + * Read the specified word from the active NVM module. This includes the CSS + * header at the start of the NVM module. */ static enum ice_status -ice_acquire_nvm(struct ice_hw *hw, enum ice_aq_res_access_type access) +ice_read_nvm_module(struct ice_hw *hw, enum ice_bank_select bank, u32 offset, u16 *data) { - if (hw->nvm.blank_nvm_mode) - return 0; + enum ice_status status; + __le16 data_local; - return ice_acquire_res(hw, ICE_NVM_RES_ID, access, ICE_NVM_TIMEOUT); + status = ice_read_flash_module(hw, bank, ICE_SR_1ST_NVM_BANK_PTR, offset * sizeof(u16), + (__force u8 *)&data_local, sizeof(u16)); + if (!status) + *data = le16_to_cpu(data_local); + + return status; } /** - * ice_release_nvm - Generic request for releasing the NVM ownership + * ice_read_nvm_sr_copy - Read a word from the Shadow RAM copy in the NVM bank * @hw: pointer to the HW structure + * @bank: whether to read from the active or inactive NVM module + * @offset: offset into the Shadow RAM copy to read, in words + * @data: storage for returned word value * - * This function will release NVM ownership. + * Read the specified word from the copy of the Shadow RAM found in the + * specified NVM module. */ -static void ice_release_nvm(struct ice_hw *hw) +static enum ice_status +ice_read_nvm_sr_copy(struct ice_hw *hw, enum ice_bank_select bank, u32 offset, u16 *data) { - if (hw->nvm.blank_nvm_mode) - return; + return ice_read_nvm_module(hw, bank, ICE_NVM_SR_COPY_WORD_OFFSET + offset, data); +} - ice_release_res(hw, ICE_NVM_RES_ID); +/** + * ice_read_orom_module - Read from the active Option ROM module + * @hw: pointer to the HW structure + * @bank: whether to read from active or inactive OROM module + * @offset: offset into the OROM module to read, in words + * @data: storage for returned word value + * + * Read the specified word from the active Option ROM module of the flash. + * Note that unlike the NVM module, the CSS data is stored at the end of the + * module instead of at the beginning. 
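+ *
+ * As with ice_read_nvm_module(), the offset parameter is a word offset and
+ * is converted to bytes (offset * sizeof(u16)) before the flat flash read,
+ * so callers that start from a byte count, such as the CSS trailer lookup
+ * in ice_get_orom_srev(), must divide by two first.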
+ */ +static enum ice_status +ice_read_orom_module(struct ice_hw *hw, enum ice_bank_select bank, u32 offset, u16 *data) +{ + enum ice_status status; + __le16 data_local; + + status = ice_read_flash_module(hw, bank, ICE_SR_1ST_OROM_BANK_PTR, offset * sizeof(u16), + (__force u8 *)&data_local, sizeof(u16)); + if (!status) + *data = le16_to_cpu(data_local); + + return status; +} + +/** + * ice_read_netlist_module - Read data from the netlist module area + * @hw: pointer to the HW structure + * @bank: whether to read from the active or inactive module + * @offset: offset into the netlist to read from + * @data: storage for returned word value + * + * Read a word from the specified netlist bank. + */ +static enum ice_status +ice_read_netlist_module(struct ice_hw *hw, enum ice_bank_select bank, u32 offset, u16 *data) +{ + enum ice_status status; + __le16 data_local; + + status = ice_read_flash_module(hw, bank, ICE_SR_NETLIST_BANK_PTR, offset * sizeof(u16), + (__force u8 *)&data_local, sizeof(u16)); + if (!status) + *data = le16_to_cpu(data_local); + + return status; } /** @@ -219,8 +457,7 @@ static void ice_release_nvm(struct ice_hw *hw) * * Reads one 16 bit word from the Shadow RAM using the ice_read_sr_word_aq. */ -static enum ice_status -ice_read_sr_word(struct ice_hw *hw, u16 offset, u16 *data) +enum ice_status ice_read_sr_word(struct ice_hw *hw, u16 offset, u16 *data) { enum ice_status status; @@ -234,111 +471,734 @@ ice_read_sr_word(struct ice_hw *hw, u16 offset, u16 *data) } /** - * ice_init_nvm - initializes NVM setting - * @hw: pointer to the HW struct + * ice_get_pfa_module_tlv - Reads sub module TLV from NVM PFA + * @hw: pointer to hardware structure + * @module_tlv: pointer to module TLV to return + * @module_tlv_len: pointer to module TLV length to return + * @module_type: module type requested * - * This function reads and populates NVM settings such as Shadow RAM size, - * max_timeout, and blank_nvm_mode + * Finds the requested sub module TLV type from the Preserved Field + * Area (PFA) and returns the TLV pointer and length. The caller can + * use these to read the variable length TLV value. */ -enum ice_status ice_init_nvm(struct ice_hw *hw) +enum ice_status +ice_get_pfa_module_tlv(struct ice_hw *hw, u16 *module_tlv, u16 *module_tlv_len, + u16 module_type) { - struct ice_nvm_info *nvm = &hw->nvm; - u16 eetrack_lo, eetrack_hi; - enum ice_status status = 0; - u32 fla, gens_stat; - u8 sr_size; - - /* The SR size is stored regardless of the NVM programming mode - * as the blank mode may be used in the factory line. 
- */ - gens_stat = rd32(hw, GLNVM_GENS); - sr_size = (gens_stat & GLNVM_GENS_SR_SIZE_M) >> GLNVM_GENS_SR_SIZE_S; - - /* Switching to words (sr_size contains power of 2) */ - nvm->sr_words = BIT(sr_size) * ICE_SR_WORDS_IN_1KB; + enum ice_status status; + u16 pfa_len, pfa_ptr; + u16 next_tlv; - /* Check if we are in the normal or blank NVM programming mode */ - fla = rd32(hw, GLNVM_FLA); - if (fla & GLNVM_FLA_LOCKED_M) { /* Normal programming mode */ - nvm->blank_nvm_mode = false; - } else { /* Blank programming mode */ - nvm->blank_nvm_mode = true; - status = ICE_ERR_NVM_BLANK_MODE; - ice_debug(hw, ICE_DBG_NVM, - "NVM init error: unsupported blank mode.\n"); + status = ice_read_sr_word(hw, ICE_SR_PFA_PTR, &pfa_ptr); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Preserved Field Array pointer.\n"); return status; } - - status = ice_read_sr_word(hw, ICE_SR_NVM_DEV_STARTER_VER, &hw->nvm.ver); + status = ice_read_sr_word(hw, pfa_ptr, &pfa_len); if (status) { - ice_debug(hw, ICE_DBG_INIT, - "Failed to read DEV starter version.\n"); + ice_debug(hw, ICE_DBG_INIT, "Failed to read PFA length.\n"); return status; } + /* Starting with first TLV after PFA length, iterate through the list + * of TLVs to find the requested one. + */ + next_tlv = pfa_ptr + 1; + while (next_tlv < pfa_ptr + pfa_len) { + u16 tlv_sub_module_type; + u16 tlv_len; + + /* Read TLV type */ + status = ice_read_sr_word(hw, next_tlv, &tlv_sub_module_type); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read TLV type.\n"); + break; + } + /* Read TLV length */ + status = ice_read_sr_word(hw, next_tlv + 1, &tlv_len); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read TLV length.\n"); + break; + } + if (tlv_sub_module_type == module_type) { + if (tlv_len) { + *module_tlv = next_tlv; + *module_tlv_len = tlv_len; + return 0; + } + return ICE_ERR_INVAL_SIZE; + } + /* Check next TLV, i.e. current TLV pointer + length + 2 words + * (for current TLV's type and length) + */ + next_tlv = next_tlv + tlv_len + 2; + } + /* Module does not exist */ + return ICE_ERR_DOES_NOT_EXIST; +} + +/** + * ice_read_pba_string - Reads part number string from NVM + * @hw: pointer to hardware structure + * @pba_num: stores the part number string from the NVM + * @pba_num_size: part number string buffer length + * + * Reads the part number string from the NVM. 
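+ *
+ * Each word of the PBA block packs two ASCII characters, high byte first,
+ * so a word of 0x4B32 is emitted as "K2" (worked example only, not a real
+ * part number). The caller must therefore supply a buffer of at least
+ * (PBA word count * 2) + 1 bytes to leave room for the terminating NUL.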
+ */ +enum ice_status +ice_read_pba_string(struct ice_hw *hw, u8 *pba_num, u32 pba_num_size) +{ + u16 pba_tlv, pba_tlv_len; + enum ice_status status; + u16 pba_word, pba_size; + u16 i; - status = ice_read_sr_word(hw, ICE_SR_NVM_EETRACK_LO, &eetrack_lo); + status = ice_get_pfa_module_tlv(hw, &pba_tlv, &pba_tlv_len, + ICE_SR_PBA_BLOCK_PTR); if (status) { - ice_debug(hw, ICE_DBG_INIT, "Failed to read EETRACK lo.\n"); + ice_debug(hw, ICE_DBG_INIT, "Failed to read PBA Block TLV.\n"); return status; } - status = ice_read_sr_word(hw, ICE_SR_NVM_EETRACK_HI, &eetrack_hi); + + /* pba_size is the next word */ + status = ice_read_sr_word(hw, (pba_tlv + 2), &pba_size); if (status) { - ice_debug(hw, ICE_DBG_INIT, "Failed to read EETRACK hi.\n"); + ice_debug(hw, ICE_DBG_INIT, "Failed to read PBA Section size.\n"); return status; } - hw->nvm.eetrack = (eetrack_hi << 16) | eetrack_lo; + if (pba_tlv_len < pba_size) { + ice_debug(hw, ICE_DBG_INIT, "Invalid PBA Block TLV size.\n"); + return ICE_ERR_INVAL_SIZE; + } + + /* Subtract one to get PBA word count (PBA Size word is included in + * total size) + */ + pba_size--; + if (pba_num_size < (((u32)pba_size * 2) + 1)) { + ice_debug(hw, ICE_DBG_INIT, "Buffer too small for PBA data.\n"); + return ICE_ERR_PARAM; + } + + for (i = 0; i < pba_size; i++) { + status = ice_read_sr_word(hw, (pba_tlv + 2 + 1) + i, &pba_word); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read PBA Block word %d.\n", i); + return status; + } + + pba_num[(i * 2)] = (pba_word >> 8) & 0xFF; + pba_num[(i * 2) + 1] = pba_word & 0xFF; + } + pba_num[(pba_size * 2)] = '\0'; return status; } /** - * ice_read_sr_buf - Reads Shadow RAM buf and acquire lock if necessary - * @hw: pointer to the HW structure - * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF) - * @words: (in) number of words to read; (out) number of words actually read - * @data: words read from the Shadow RAM + * ice_get_nvm_srev - Read the security revision from the NVM CSS header + * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash bank + * @srev: storage for security revision * - * Reads 16 bit words (data buf) from the SR using the ice_read_nvm_buf_aq - * method. The buf read is preceded by the NVM ownership take - * and followed by the release. + * Read the security revision out of the CSS header of the active NVM module + * bank. */ -enum ice_status -ice_read_sr_buf(struct ice_hw *hw, u16 offset, u16 *words, u16 *data) +static enum ice_status ice_get_nvm_srev(struct ice_hw *hw, enum ice_bank_select bank, u32 *srev) { enum ice_status status; + u16 srev_l, srev_h; - status = ice_acquire_nvm(hw, ICE_RES_READ); - if (!status) { - status = ice_read_sr_buf_aq(hw, offset, words, data); - ice_release_nvm(hw); - } + status = ice_read_nvm_module(hw, bank, ICE_NVM_CSS_SREV_L, &srev_l); + if (status) + return status; - return status; + status = ice_read_nvm_module(hw, bank, ICE_NVM_CSS_SREV_H, &srev_h); + if (status) + return status; + + *srev = srev_h << 16 | srev_l; + + return 0; } /** - * ice_nvm_validate_checksum + * ice_get_nvm_ver_info - Read NVM version information * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash bank + * @nvm: pointer to NVM info structure * - * Verify NVM PFA checksum validity (0x0706) + * Read the NVM EETRACK ID and map version of the main NVM image bank, filling + * in the nvm info structure. 
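+ *
+ * The map version is split out of the DEV starter version word with the
+ * ICE_NVM_VER_HI/LO masks and shifts, and the 32-bit EETRACK ID is
+ * assembled from two Shadow RAM words as (eetrack_hi << 16) | eetrack_lo.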
*/ -enum ice_status ice_nvm_validate_checksum(struct ice_hw *hw) +static enum ice_status +ice_get_nvm_ver_info(struct ice_hw *hw, enum ice_bank_select bank, struct ice_nvm_info *nvm) { - struct ice_aqc_nvm_checksum *cmd; - struct ice_aq_desc desc; + u16 eetrack_lo, eetrack_hi, ver; enum ice_status status; - status = ice_acquire_nvm(hw, ICE_RES_READ); - if (status) + status = ice_read_nvm_sr_copy(hw, bank, ICE_SR_NVM_DEV_STARTER_VER, &ver); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read DEV starter version.\n"); return status; + } - cmd = &desc.params.nvm_checksum; + nvm->major = (ver & ICE_NVM_VER_HI_MASK) >> ICE_NVM_VER_HI_SHIFT; + nvm->minor = (ver & ICE_NVM_VER_LO_MASK) >> ICE_NVM_VER_LO_SHIFT; - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_checksum); - cmd->flags = ICE_AQC_NVM_CHECKSUM_VERIFY; + status = ice_read_nvm_sr_copy(hw, bank, ICE_SR_NVM_EETRACK_LO, &eetrack_lo); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read EETRACK lo.\n"); + return status; + } + status = ice_read_nvm_sr_copy(hw, bank, ICE_SR_NVM_EETRACK_HI, &eetrack_hi); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read EETRACK hi.\n"); + return status; + } + + nvm->eetrack = (eetrack_hi << 16) | eetrack_lo; + + status = ice_get_nvm_srev(hw, bank, &nvm->srev); + if (status) + ice_debug(hw, ICE_DBG_NVM, "Failed to read NVM security revision.\n"); + + return 0; +} + +/** + * ice_get_inactive_nvm_ver - Read Option ROM version from the inactive bank + * @hw: pointer to the HW structure + * @nvm: storage for Option ROM version information + * + * Reads the NVM EETRACK ID, Map version, and security revision of the + * inactive NVM bank. Used to access version data for a pending update that + * has not yet been activated. + */ +enum ice_status ice_get_inactive_nvm_ver(struct ice_hw *hw, struct ice_nvm_info *nvm) +{ + return ice_get_nvm_ver_info(hw, ICE_INACTIVE_FLASH_BANK, nvm); +} + +/** + * ice_get_orom_srev - Read the security revision from the OROM CSS header + * @hw: pointer to the HW struct + * @bank: whether to read from active or inactive flash module + * @srev: storage for security revision + * + * Read the security revision out of the CSS header of the active OROM module + * bank. + */ +static enum ice_status ice_get_orom_srev(struct ice_hw *hw, enum ice_bank_select bank, u32 *srev) +{ + enum ice_status status; + u16 srev_l, srev_h; + u32 css_start; + + if (hw->flash.banks.orom_size < ICE_NVM_OROM_TRAILER_LENGTH) { + ice_debug(hw, ICE_DBG_NVM, "Unexpected Option ROM Size of %u\n", + hw->flash.banks.orom_size); + return ICE_ERR_CFG; + } + + /* calculate how far into the Option ROM the CSS header starts. Note + * that ice_read_orom_module takes a word offset so we need to + * divide by 2 here. + */ + css_start = (hw->flash.banks.orom_size - ICE_NVM_OROM_TRAILER_LENGTH) / 2; + + status = ice_read_orom_module(hw, bank, css_start + ICE_NVM_CSS_SREV_L, &srev_l); + if (status) + return status; + + status = ice_read_orom_module(hw, bank, css_start + ICE_NVM_CSS_SREV_H, &srev_h); + if (status) + return status; + + *srev = srev_h << 16 | srev_l; + + return 0; +} + +/** + * ice_get_orom_civd_data - Get the combo version information from Option ROM + * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash module + * @civd: storage for the Option ROM CIVD data. + * + * Searches through the Option ROM flash contents to locate the CIVD data for + * the image. 
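+ *
+ * The scan walks the Option ROM bank in 512 byte steps, reading one
+ * candidate structure per step; a candidate is accepted only when its
+ * first four bytes are the ASCII signature "$CIV" and the modulo 256 sum
+ * of all of its bytes is zero.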
+ */ +static enum ice_status +ice_get_orom_civd_data(struct ice_hw *hw, enum ice_bank_select bank, + struct ice_orom_civd_info *civd) +{ + struct ice_orom_civd_info tmp; + enum ice_status status; + u32 offset; + + /* The CIVD section is located in the Option ROM aligned to 512 bytes. + * The first 4 bytes must contain the ASCII characters "$CIV". + * A simple modulo 256 sum of all of the bytes of the structure must + * equal 0. + */ + for (offset = 0; (offset + 512) <= hw->flash.banks.orom_size; offset += 512) { + u8 sum = 0, i; + + status = ice_read_flash_module(hw, bank, ICE_SR_1ST_OROM_BANK_PTR, + offset, (u8 *)&tmp, sizeof(tmp)); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Unable to read Option ROM CIVD data\n"); + return status; + } + + /* Skip forward until we find a matching signature */ + if (memcmp("$CIV", tmp.signature, sizeof(tmp.signature)) != 0) + continue; + + /* Verify that the simple checksum is zero */ + for (i = 0; i < sizeof(tmp); i++) + /* cppcheck-suppress objectIndex */ + sum += ((u8 *)&tmp)[i]; + + if (sum) { + ice_debug(hw, ICE_DBG_NVM, "Found CIVD data with invalid checksum of %u\n", + sum); + return ICE_ERR_NVM; + } + + *civd = tmp; + return 0; + } + + return ICE_ERR_NVM; +} + +/** + * ice_get_orom_ver_info - Read Option ROM version information + * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash module + * @orom: pointer to Option ROM info structure + * + * Read Option ROM version and security revision from the Option ROM flash + * section. + */ +static enum ice_status +ice_get_orom_ver_info(struct ice_hw *hw, enum ice_bank_select bank, struct ice_orom_info *orom) +{ + struct ice_orom_civd_info civd; + enum ice_status status; + u32 combo_ver; + + status = ice_get_orom_civd_data(hw, bank, &civd); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to locate valid Option ROM CIVD data\n"); + return status; + } + + combo_ver = le32_to_cpu(civd.combo_ver); + + orom->major = (u8)((combo_ver & ICE_OROM_VER_MASK) >> ICE_OROM_VER_SHIFT); + orom->patch = (u8)(combo_ver & ICE_OROM_VER_PATCH_MASK); + orom->build = (u16)((combo_ver & ICE_OROM_VER_BUILD_MASK) >> ICE_OROM_VER_BUILD_SHIFT); + + status = ice_get_orom_srev(hw, bank, &orom->srev); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read Option ROM security revision.\n"); + return status; + } + + return 0; +} + +/** + * ice_get_inactive_orom_ver - Read Option ROM version from the inactive bank + * @hw: pointer to the HW structure + * @orom: storage for Option ROM version information + * + * Reads the Option ROM version and security revision data for the inactive + * section of flash. Used to access version data for a pending update that has + * not yet been activated. + */ +enum ice_status ice_get_inactive_orom_ver(struct ice_hw *hw, struct ice_orom_info *orom) +{ + return ice_get_orom_ver_info(hw, ICE_INACTIVE_FLASH_BANK, orom); +} + +/** + * ice_get_netlist_info + * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash bank + * @netlist: pointer to netlist version info structure + * + * Get the netlist version information from the requested bank. Reads the Link + * Topology section to find the Netlist ID block and extract the relevant + * information into the netlist version structure. 
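+ *
+ * The module type word must match ICE_NETLIST_LINK_TOPO_MOD_ID and the
+ * module must be at least ICE_NETLIST_ID_BLK_SIZE words long; the ID block
+ * is then read in a single flat flash access and byte-swapped word by word
+ * before the version fields are extracted.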
+ */ +static enum ice_status +ice_get_netlist_info(struct ice_hw *hw, enum ice_bank_select bank, + struct ice_netlist_info *netlist) +{ + u16 module_id, length, node_count, i; + enum ice_status status; + u16 *id_blk; + + status = ice_read_netlist_module(hw, bank, ICE_NETLIST_TYPE_OFFSET, &module_id); + if (status) + return status; + + if (module_id != ICE_NETLIST_LINK_TOPO_MOD_ID) { + ice_debug(hw, ICE_DBG_NVM, "Expected netlist module_id ID of 0x%04x, but got 0x%04x\n", + ICE_NETLIST_LINK_TOPO_MOD_ID, module_id); + return ICE_ERR_NVM; + } + + status = ice_read_netlist_module(hw, bank, ICE_LINK_TOPO_MODULE_LEN, &length); + if (status) + return status; + + /* sanity check that we have at least enough words to store the netlist ID block */ + if (length < ICE_NETLIST_ID_BLK_SIZE) { + ice_debug(hw, ICE_DBG_NVM, "Netlist Link Topology module too small. Expected at least %u words, but got %u words.\n", + ICE_NETLIST_ID_BLK_SIZE, length); + return ICE_ERR_NVM; + } + + status = ice_read_netlist_module(hw, bank, ICE_LINK_TOPO_NODE_COUNT, &node_count); + if (status) + return status; + node_count &= ICE_LINK_TOPO_NODE_COUNT_M; + + id_blk = devm_kcalloc(ice_hw_to_dev(hw), ICE_NETLIST_ID_BLK_SIZE, + sizeof(*id_blk), GFP_KERNEL); + if (!id_blk) + return ICE_ERR_NO_MEMORY; + + /* Read out the entire Netlist ID Block at once. */ + status = ice_read_flash_module(hw, bank, ICE_SR_NETLIST_BANK_PTR, + ICE_NETLIST_ID_BLK_OFFSET(node_count) * sizeof(u16), + (u8 *)id_blk, ICE_NETLIST_ID_BLK_SIZE * sizeof(u16)); + if (status) + goto exit_error; + + for (i = 0; i < ICE_NETLIST_ID_BLK_SIZE; i++) + id_blk[i] = le16_to_cpu(((__force __le16 *)id_blk)[i]); + + netlist->major = id_blk[ICE_NETLIST_ID_BLK_MAJOR_VER_HIGH] << 16 | + id_blk[ICE_NETLIST_ID_BLK_MAJOR_VER_LOW]; + netlist->minor = id_blk[ICE_NETLIST_ID_BLK_MINOR_VER_HIGH] << 16 | + id_blk[ICE_NETLIST_ID_BLK_MINOR_VER_LOW]; + netlist->type = id_blk[ICE_NETLIST_ID_BLK_TYPE_HIGH] << 16 | + id_blk[ICE_NETLIST_ID_BLK_TYPE_LOW]; + netlist->rev = id_blk[ICE_NETLIST_ID_BLK_REV_HIGH] << 16 | + id_blk[ICE_NETLIST_ID_BLK_REV_LOW]; + netlist->cust_ver = id_blk[ICE_NETLIST_ID_BLK_CUST_VER]; + /* Read the left most 4 bytes of SHA */ + netlist->hash = id_blk[ICE_NETLIST_ID_BLK_SHA_HASH_WORD(15)] << 16 | + id_blk[ICE_NETLIST_ID_BLK_SHA_HASH_WORD(14)]; + +exit_error: + devm_kfree(ice_hw_to_dev(hw), id_blk); + + return status; +} + + +/** + * ice_get_inactive_netlist_ver + * @hw: pointer to the HW struct + * @netlist: pointer to netlist version info structure + * + * Read the netlist version data from the inactive netlist bank. Used to + * extract version data of a pending flash update in order to display the + * version data. + */ +enum ice_status ice_get_inactive_netlist_ver(struct ice_hw *hw, struct ice_netlist_info *netlist) +{ + return ice_get_netlist_info(hw, ICE_INACTIVE_FLASH_BANK, netlist); +} + +/** + * ice_discover_flash_size - Discover the available flash size. + * @hw: pointer to the HW struct + * + * The device flash could be up to 16MB in size. However, it is possible that + * the actual size is smaller. Use bisection to determine the accessible size + * of flash memory. 
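+ *
+ * Each probe reads a single byte at the midpoint of the current range: a
+ * successful read raises the lower bound, while an EINVAL response from
+ * firmware lowers the upper bound. Starting from the 16MB upper limit the
+ * search converges in roughly 24 probes regardless of the actual part
+ * size.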
+ */ +static enum ice_status ice_discover_flash_size(struct ice_hw *hw) +{ + u32 min_size = 0, max_size = ICE_AQC_NVM_MAX_OFFSET + 1; + enum ice_status status; + + status = ice_acquire_nvm(hw, ICE_RES_READ); + if (status) + return status; + + while ((max_size - min_size) > 1) { + u32 offset = (max_size + min_size) / 2; + u32 len = 1; + u8 data; + + status = ice_read_flat_nvm(hw, offset, &len, &data, false); + if (status == ICE_ERR_AQ_ERROR && + hw->adminq.sq_last_status == ICE_AQ_RC_EINVAL) { + ice_debug(hw, ICE_DBG_NVM, "%s: New upper bound of %u bytes\n", + __func__, offset); + status = 0; + max_size = offset; + } else if (!status) { + ice_debug(hw, ICE_DBG_NVM, "%s: New lower bound of %u bytes\n", + __func__, offset); + min_size = offset; + } else { + /* an unexpected error occurred */ + goto err_read_flat_nvm; + } + } + + ice_debug(hw, ICE_DBG_NVM, "Predicted flash size is %u bytes\n", max_size); + + hw->flash.flash_size = max_size; + +err_read_flat_nvm: + ice_release_nvm(hw); + + return status; +} + +/** + * ice_read_sr_pointer - Read the value of a Shadow RAM pointer word + * @hw: pointer to the HW structure + * @offset: the word offset of the Shadow RAM word to read + * @pointer: pointer value read from Shadow RAM + * + * Read the given Shadow RAM word, and convert it to a pointer value specified + * in bytes. This function assumes the specified offset is a valid pointer + * word. + * + * Each pointer word specifies whether it is stored in word size or 4KB + * sector size by using the highest bit. The reported pointer value will be in + * bytes, intended for flat NVM reads. + */ +static enum ice_status +ice_read_sr_pointer(struct ice_hw *hw, u16 offset, u32 *pointer) +{ + enum ice_status status; + u16 value; + + status = ice_read_sr_word(hw, offset, &value); + if (status) + return status; + + /* Determine if the pointer is in 4KB or word units */ + if (value & ICE_SR_NVM_PTR_4KB_UNITS) + *pointer = (value & ~ICE_SR_NVM_PTR_4KB_UNITS) * 4 * 1024; + else + *pointer = value * 2; + + return 0; +} + +/** + * ice_read_sr_area_size - Read an area size from a Shadow RAM word + * @hw: pointer to the HW structure + * @offset: the word offset of the Shadow RAM to read + * @size: size value read from the Shadow RAM + * + * Read the given Shadow RAM word, and convert it to an area size value + * specified in bytes. This function assumes the specified offset is a valid + * area size word. + * + * Each area size word is specified in 4KB sector units. This function reports + * the size in bytes, intended for flat NVM reads. + */ +static enum ice_status +ice_read_sr_area_size(struct ice_hw *hw, u16 offset, u32 *size) +{ + enum ice_status status; + u16 value; + + status = ice_read_sr_word(hw, offset, &value); + if (status) + return status; + + /* Area sizes are always specified in 4KB units */ + *size = value * 4 * 1024; + + return 0; +} + +/** + * ice_determine_active_flash_banks - Discover active bank for each module + * @hw: pointer to the HW struct + * + * Read the Shadow RAM control word and determine which banks are active for + * the NVM, OROM, and Netlist modules. Also read and calculate the associated + * pointer and size. These values are then cached into the ice_flash_info + * structure for later use in order to calculate the correct offset to read + * from the active module. 
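+ *
+ * The Shadow RAM control word must carry the expected validity pattern in
+ * its ICE_SR_CTRL_WORD_1 field; one bit per module (NVM, OROM, Netlist)
+ * then selects whether the first or second bank is active, and the bank
+ * pointer and 4KB-unit size words are converted to byte values for later
+ * flat reads.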
+ */ +static enum ice_status +ice_determine_active_flash_banks(struct ice_hw *hw) +{ + struct ice_bank_info *banks = &hw->flash.banks; + enum ice_status status; + u16 ctrl_word; + + status = ice_read_sr_word(hw, ICE_SR_NVM_CTRL_WORD, &ctrl_word); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read the Shadow RAM control word\n"); + return status; + } + + /* Check that the control word indicates validity */ + if ((ctrl_word & ICE_SR_CTRL_WORD_1_M) >> ICE_SR_CTRL_WORD_1_S != ICE_SR_CTRL_WORD_VALID) { + ice_debug(hw, ICE_DBG_NVM, "Shadow RAM control word is invalid\n"); + return ICE_ERR_CFG; + } + + if (!(ctrl_word & ICE_SR_CTRL_WORD_NVM_BANK)) + banks->nvm_bank = ICE_1ST_FLASH_BANK; + else + banks->nvm_bank = ICE_2ND_FLASH_BANK; + + if (!(ctrl_word & ICE_SR_CTRL_WORD_OROM_BANK)) + banks->orom_bank = ICE_1ST_FLASH_BANK; + else + banks->orom_bank = ICE_2ND_FLASH_BANK; + + if (!(ctrl_word & ICE_SR_CTRL_WORD_NETLIST_BANK)) + banks->netlist_bank = ICE_1ST_FLASH_BANK; + else + banks->netlist_bank = ICE_2ND_FLASH_BANK; + + status = ice_read_sr_pointer(hw, ICE_SR_1ST_NVM_BANK_PTR, &banks->nvm_ptr); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read NVM bank pointer\n"); + return status; + } + + status = ice_read_sr_area_size(hw, ICE_SR_NVM_BANK_SIZE, &banks->nvm_size); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read NVM bank area size\n"); + return status; + } + + status = ice_read_sr_pointer(hw, ICE_SR_1ST_OROM_BANK_PTR, &banks->orom_ptr); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read OROM bank pointer\n"); + return status; + } + + status = ice_read_sr_area_size(hw, ICE_SR_OROM_BANK_SIZE, &banks->orom_size); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read OROM bank area size\n"); + return status; + } + + status = ice_read_sr_pointer(hw, ICE_SR_NETLIST_BANK_PTR, &banks->netlist_ptr); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read Netlist bank pointer\n"); + return status; + } + + status = ice_read_sr_area_size(hw, ICE_SR_NETLIST_BANK_SIZE, &banks->netlist_size); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read Netlist bank area size\n"); + return status; + } + + return 0; +} + +/** + * ice_init_nvm - initializes NVM setting + * @hw: pointer to the HW struct + * + * This function reads and populates NVM settings such as Shadow RAM size, + * max_timeout, and blank_nvm_mode + */ +enum ice_status ice_init_nvm(struct ice_hw *hw) +{ + struct ice_flash_info *flash = &hw->flash; + enum ice_status status; + u32 fla, gens_stat; + u8 sr_size; + + /* The SR size is stored regardless of the NVM programming mode + * as the blank mode may be used in the factory line. 
+ */ + gens_stat = rd32(hw, GLNVM_GENS); + sr_size = (gens_stat & GLNVM_GENS_SR_SIZE_M) >> GLNVM_GENS_SR_SIZE_S; + + /* Switching to words (sr_size contains power of 2) */ + flash->sr_words = BIT(sr_size) * ICE_SR_WORDS_IN_1KB; + + /* Check if we are in the normal or blank NVM programming mode */ + fla = rd32(hw, GLNVM_FLA); + if (fla & GLNVM_FLA_LOCKED_M) { /* Normal programming mode */ + flash->blank_nvm_mode = false; + } else { + /* Blank programming mode */ + flash->blank_nvm_mode = true; + ice_debug(hw, ICE_DBG_NVM, "NVM init error: unsupported blank mode.\n"); + return ICE_ERR_NVM_BLANK_MODE; + } + + status = ice_discover_flash_size(hw); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "NVM init error: failed to discover flash size.\n"); + return status; + } + + status = ice_determine_active_flash_banks(hw); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to determine active flash banks.\n"); + return status; + } + + status = ice_get_nvm_ver_info(hw, ICE_ACTIVE_FLASH_BANK, &flash->nvm); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read NVM info.\n"); + return status; + } + + status = ice_get_orom_ver_info(hw, ICE_ACTIVE_FLASH_BANK, &flash->orom); + if (status) + ice_debug(hw, ICE_DBG_INIT, "Failed to read Option ROM info.\n"); + + /* read the netlist version information */ + status = ice_get_netlist_info(hw, ICE_ACTIVE_FLASH_BANK, &flash->netlist); + if (status) + ice_debug(hw, ICE_DBG_INIT, "Failed to read netlist info.\n"); + return 0; +} + + +/** + * ice_nvm_validate_checksum + * @hw: pointer to the HW struct + * + * Verify NVM PFA checksum validity (0x0706) + */ +enum ice_status ice_nvm_validate_checksum(struct ice_hw *hw) +{ + struct ice_aqc_nvm_checksum *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + status = ice_acquire_nvm(hw, ICE_RES_READ); + if (status) + return status; + + cmd = &desc.params.nvm_checksum; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_checksum); + cmd->flags = ICE_AQC_NVM_CHECKSUM_VERIFY; status = ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); + ice_release_nvm(hw); if (!status) @@ -347,3 +1207,496 @@ enum ice_status ice_nvm_validate_checksum(struct ice_hw *hw) return status; } + +/** + * ice_nvm_recalculate_checksum + * @hw: pointer to the HW struct + * + * Recalculate NVM PFA checksum (0x0706) + */ +enum ice_status ice_nvm_recalculate_checksum(struct ice_hw *hw) +{ + struct ice_aqc_nvm_checksum *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + status = ice_acquire_nvm(hw, ICE_RES_READ); + if (status) + return status; + + cmd = &desc.params.nvm_checksum; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_checksum); + cmd->flags = ICE_AQC_NVM_CHECKSUM_RECALC; + + status = ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); + + ice_release_nvm(hw); + + return status; +} + +/** + * ice_nvm_write_activate + * @hw: pointer to the HW struct + * @cmd_flags: NVM activate admin command bits (banks to be validated) + * + * Update the control word with the required banks' validity bits + * and dumps the Shadow RAM to flash (0x0707) + */ +enum ice_status ice_nvm_write_activate(struct ice_hw *hw, u8 cmd_flags) +{ + struct ice_aqc_nvm *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.nvm; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_write_activate); + + cmd->cmd_flags = cmd_flags; + + return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); +} + +/** + * ice_get_nvm_minsrevs - Get the Minimum Security Revision values from flash + * @hw: pointer to the HW struct + * @minsrevs: structure to store NVM and OROM minsrev 
values + * + * Read the Minimum Security Revision TLV and extract the revision values from + * the flash image into a readable structure for processing. + */ +enum ice_status +ice_get_nvm_minsrevs(struct ice_hw *hw, struct ice_minsrev_info *minsrevs) +{ + struct ice_aqc_nvm_minsrev data; + enum ice_status status; + u16 valid; + + status = ice_acquire_nvm(hw, ICE_RES_READ); + if (status) + return status; + + status = ice_aq_read_nvm(hw, ICE_AQC_NVM_MINSREV_MOD_ID, 0, sizeof(data), + &data, true, false, NULL); + + ice_release_nvm(hw); + + if (status) + return status; + + valid = le16_to_cpu(data.validity); + + /* Extract NVM minimum security revision */ + if (valid & ICE_AQC_NVM_MINSREV_NVM_VALID) { + u16 minsrev_l, minsrev_h; + + minsrev_l = le16_to_cpu(data.nvm_minsrev_l); + minsrev_h = le16_to_cpu(data.nvm_minsrev_h); + + minsrevs->nvm = minsrev_h << 16 | minsrev_l; + minsrevs->nvm_valid = true; + } + + /* Extract the OROM minimum security revision */ + if (valid & ICE_AQC_NVM_MINSREV_OROM_VALID) { + u16 minsrev_l, minsrev_h; + + minsrev_l = le16_to_cpu(data.orom_minsrev_l); + minsrev_h = le16_to_cpu(data.orom_minsrev_h); + + minsrevs->orom = minsrev_h << 16 | minsrev_l; + minsrevs->orom_valid = true; + } + + return 0; +} + +/** + * ice_update_nvm_minsrevs - Update minimum security revision TLV data in flash + * @hw: pointer to the HW struct + * @minsrevs: minimum security revision information + * + * Update the NVM or Option ROM minimum security revision fields in the PFA + * area of the flash. Reads the minsrevs->nvm_valid and minsrevs->orom_valid + * fields to determine what update is being requested. If the valid bit is not + * set for that module, then the associated minsrev will be left as is. + */ +enum ice_status +ice_update_nvm_minsrevs(struct ice_hw *hw, struct ice_minsrev_info *minsrevs) +{ + struct ice_aqc_nvm_minsrev data; + enum ice_status status; + + if (!minsrevs->nvm_valid && !minsrevs->orom_valid) { + ice_debug(hw, ICE_DBG_NVM, "At least one of NVM and OROM MinSrev must be valid"); + return ICE_ERR_PARAM; + } + + status = ice_acquire_nvm(hw, ICE_RES_WRITE); + if (status) + return status; + + /* Get current data */ + status = ice_aq_read_nvm(hw, ICE_AQC_NVM_MINSREV_MOD_ID, 0, sizeof(data), + &data, true, false, NULL); + if (status) + goto exit_release_res; + + if (minsrevs->nvm_valid) { + data.nvm_minsrev_l = cpu_to_le16(minsrevs->nvm & 0xFFFF); + data.nvm_minsrev_h = cpu_to_le16(minsrevs->nvm >> 16); + data.validity |= cpu_to_le16(ICE_AQC_NVM_MINSREV_NVM_VALID); + } + + if (minsrevs->orom_valid) { + data.orom_minsrev_l = cpu_to_le16(minsrevs->orom & 0xFFFF); + data.orom_minsrev_h = cpu_to_le16(minsrevs->orom >> 16); + data.validity |= cpu_to_le16(ICE_AQC_NVM_MINSREV_OROM_VALID); + } + + /* Update flash data */ + status = ice_aq_update_nvm(hw, ICE_AQC_NVM_MINSREV_MOD_ID, 0, sizeof(data), &data, + true, ICE_AQC_NVM_SPECIAL_UPDATE, NULL); + if (status) + goto exit_release_res; + + /* Dump the Shadow RAM to the flash */ + status = ice_nvm_write_activate(hw, 0); + +exit_release_res: + ice_release_nvm(hw); + + return status; +} + +/** + * ice_nvm_access_get_features - Return the NVM access features structure + * @cmd: NVM access command to process + * @data: storage for the driver NVM features + * + * Fill in the data section of the NVM access request with a copy of the NVM + * features structure. 
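+ *
+ * A data_size larger than sizeof(struct ice_nvm_features) is not an
+ * error, so newer user space tools can hand an extended buffer to an
+ * older driver; only the leading feature bytes are filled in and the rest
+ * of the buffer is left zeroed.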
+ */ +static enum ice_status +ice_nvm_access_get_features(struct ice_nvm_access_cmd *cmd, + union ice_nvm_access_data *data) +{ + /* The provided data_size must be at least as large as our NVM + * features structure. A larger size should not be treated as an + * error, to allow future extensions to the features structure to + * work on older drivers. + */ + if (cmd->data_size < sizeof(struct ice_nvm_features)) + return ICE_ERR_NO_MEMORY; + + /* Initialize the data buffer to zeros */ + memset(data, 0, cmd->data_size); + + /* Fill in the features data */ + data->drv_features.major = ICE_NVM_ACCESS_MAJOR_VER; + data->drv_features.minor = ICE_NVM_ACCESS_MINOR_VER; + data->drv_features.size = sizeof(struct ice_nvm_features); + data->drv_features.features[0] = ICE_NVM_FEATURES_0_REG_ACCESS; + + return 0; +} + +/** + * ice_nvm_access_get_module - Helper function to read module value + * @cmd: NVM access command structure + * + * Reads the module value out of the NVM access config field. + */ +static u32 ice_nvm_access_get_module(struct ice_nvm_access_cmd *cmd) +{ + return ((cmd->config & ICE_NVM_CFG_MODULE_M) >> ICE_NVM_CFG_MODULE_S); +} + +/** + * ice_nvm_access_get_flags - Helper function to read flags value + * @cmd: NVM access command structure + * + * Reads the flags value out of the NVM access config field. + */ +static u32 ice_nvm_access_get_flags(struct ice_nvm_access_cmd *cmd) +{ + return ((cmd->config & ICE_NVM_CFG_FLAGS_M) >> ICE_NVM_CFG_FLAGS_S); +} + +/** + * ice_nvm_access_get_adapter - Helper function to read adapter info + * @cmd: NVM access command structure + * + * Read the adapter info value out of the NVM access config field. + */ +static u32 ice_nvm_access_get_adapter(struct ice_nvm_access_cmd *cmd) +{ + return ((cmd->config & ICE_NVM_CFG_ADAPTER_INFO_M) >> + ICE_NVM_CFG_ADAPTER_INFO_S); +} + +/** + * ice_validate_nvm_rw_reg - Check than an NVM access request is valid + * @cmd: NVM access command structure + * + * Validates that an NVM access structure is request to read or write a valid + * register offset. First validates that the module and flags are correct, and + * then ensures that the register offset is one of the accepted registers. + */ +static enum ice_status +ice_validate_nvm_rw_reg(struct ice_nvm_access_cmd *cmd) +{ + u32 module, flags, offset; + u16 i; + + module = ice_nvm_access_get_module(cmd); + flags = ice_nvm_access_get_flags(cmd); + offset = cmd->offset; + + /* Make sure the module and flags indicate a read/write request */ + if (module != ICE_NVM_REG_RW_MODULE || + flags != ICE_NVM_REG_RW_FLAGS || + cmd->data_size != sizeof_field(union ice_nvm_access_data, regval)) + return ICE_ERR_PARAM; + + switch (offset) { + case GL_HICR: + case GL_HICR_EN: /* Note, this register is read only */ + case GL_FWSTS: + case GL_MNG_FWSM: + case GLGEN_CSR_DEBUG_C: + case GLGEN_RSTAT: + case GLPCI_LBARCTRL: + case GLNVM_GENS: + case GLNVM_FLA: + case PF_FUNC_RID: + return 0; + default: + break; + } + + for (i = 0; i <= ICE_NVM_ACCESS_GL_HIDA_MAX; i++) + if (offset == (u32)GL_HIDA(i)) + return 0; + + for (i = 0; i <= ICE_NVM_ACCESS_GL_HIBA_MAX; i++) + if (offset == (u32)GL_HIBA(i)) + return 0; + + /* All other register offsets are not valid */ + return ICE_ERR_OUT_OF_RANGE; +} + +/** + * ice_nvm_access_read - Handle an NVM read request + * @hw: pointer to the HW struct + * @cmd: NVM access command to process + * @data: storage for the register value read + * + * Process an NVM access request to read a register. 
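+ *
+ * The output buffer is zeroed before validation so that a rejected
+ * request never hands back stale data; on success the 32-bit register
+ * contents are returned in data->regval.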
+ */ +static enum ice_status +ice_nvm_access_read(struct ice_hw *hw, struct ice_nvm_access_cmd *cmd, + union ice_nvm_access_data *data) +{ + enum ice_status status; + + /* Always initialize the output data, even on failure */ + memset(data, 0, cmd->data_size); + + /* Make sure this is a valid read/write access request */ + status = ice_validate_nvm_rw_reg(cmd); + if (status) + return status; + + ice_debug(hw, ICE_DBG_NVM, "NVM access: reading register %08x\n", + cmd->offset); + + /* Read the register and store the contents in the data field */ + data->regval = rd32(hw, cmd->offset); + + return 0; +} + +/** + * ice_nvm_access_write - Handle an NVM write request + * @hw: pointer to the HW struct + * @cmd: NVM access command to process + * @data: NVM access data to write + * + * Process an NVM access request to write a register. + */ +static enum ice_status +ice_nvm_access_write(struct ice_hw *hw, struct ice_nvm_access_cmd *cmd, + union ice_nvm_access_data *data) +{ + enum ice_status status; + + /* Make sure this is a valid read/write access request */ + status = ice_validate_nvm_rw_reg(cmd); + if (status) + return status; + + /* Reject requests to write to read-only registers */ + switch (cmd->offset) { + case GL_HICR_EN: + case GLGEN_RSTAT: + return ICE_ERR_OUT_OF_RANGE; + default: + break; + } + + ice_debug(hw, ICE_DBG_NVM, "NVM access: writing register %08x with value %08x\n", + cmd->offset, data->regval); + + /* Write the data field to the specified register */ + wr32(hw, cmd->offset, data->regval); + + return 0; +} + +/** + * ice_handle_nvm_access - Handle an NVM access request + * @hw: pointer to the HW struct + * @cmd: NVM access command info + * @data: pointer to read or return data + * + * Process an NVM access request. Read the command structure information and + * determine if it is valid. If not, report an error indicating the command + * was invalid. + * + * For valid commands, perform the necessary function, copying the data into + * the provided data buffer. + */ +enum ice_status +ice_handle_nvm_access(struct ice_hw *hw, struct ice_nvm_access_cmd *cmd, + union ice_nvm_access_data *data) +{ + u32 module, flags, adapter_info; + + /* Extended flags are currently reserved and must be zero */ + if ((cmd->config & ICE_NVM_CFG_EXT_FLAGS_M) != 0) + return ICE_ERR_PARAM; + + /* Adapter info must match the HW device ID */ + adapter_info = ice_nvm_access_get_adapter(cmd); + if (adapter_info != hw->device_id) + return ICE_ERR_PARAM; + + switch (cmd->command) { + case ICE_NVM_CMD_READ: + module = ice_nvm_access_get_module(cmd); + flags = ice_nvm_access_get_flags(cmd); + + /* Getting the driver's NVM features structure shares the same + * command type as reading a register. Read the config field + * to determine if this is a request to get features. + */ + if (module == ICE_NVM_GET_FEATURES_MODULE && + flags == ICE_NVM_GET_FEATURES_FLAGS && + cmd->offset == 0) + return ice_nvm_access_get_features(cmd, data); + else + return ice_nvm_access_read(hw, cmd, data); + case ICE_NVM_CMD_WRITE: + return ice_nvm_access_write(hw, cmd, data); + default: + return ICE_ERR_PARAM; + } +} + +/** + * ice_aq_nvm_update_empr + * @hw: pointer to the HW struct + * + * Update empr (0x0709). This command allows SW to + * request an EMPR to activate new FW. 
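+ *
+ * The request is a direct descriptor with no data buffer; the driver only
+ * fills in the opcode before posting it to the admin send queue.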
+ */ +enum ice_status ice_aq_nvm_update_empr(struct ice_hw *hw) +{ + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_update_empr); + + return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); +} + +/* ice_nvm_set_pkg_data + * @hw: pointer to the HW struct + * @del_pkg_data_flag: If is set then the current pkg_data store by FW + * is deleted. + * If bit is set to 1, then buffer should be size 0. + * @data: pointer to buffer + * @length: length of the buffer + * @cd: pointer to command details structure or NULL + * + * Set package data (0x070A). This command is equivalent to the reception + * of a PLDM FW Update GetPackageData cmd. This command should be sent + * as part of the NVM update as the first cmd in the flow. + */ + +enum ice_status +ice_nvm_set_pkg_data(struct ice_hw *hw, bool del_pkg_data_flag, u8 *data, + u16 length, struct ice_sq_cd *cd) +{ + struct ice_aqc_nvm_pkg_data *cmd; + struct ice_aq_desc desc; + + if (length != 0 && !data) + return ICE_ERR_PARAM; + + cmd = &desc.params.pkg_data; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_pkg_data); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + if (del_pkg_data_flag) + cmd->cmd_flags |= ICE_AQC_NVM_PKG_DELETE; + + return ice_aq_send_cmd(hw, &desc, data, length, cd); +} + +/* ice_nvm_pass_component_tbl + * @hw: pointer to the HW struct + * @data: pointer to buffer + * @length: length of the buffer + * @transfer_flag: parameter for determining stage of the update + * @comp_response: a pointer to the response from the 0x070B AQC. + * @comp_response_code: a pointer to the response code from the 0x070B AQC. + * @cd: pointer to command details structure or NULL + * + * Pass component table (0x070B). This command is equivalent to the reception + * of a PLDM FW Update PassComponentTable cmd. This command should be sent once + * per component. It can be only sent after Set Package Data cmd and before + * actual update. FW will assume these commands are going to be sent until + * the TransferFlag is set to End or StartAndEnd. + */ + +enum ice_status +ice_nvm_pass_component_tbl(struct ice_hw *hw, u8 *data, u16 length, + u8 transfer_flag, u8 *comp_response, + u8 *comp_response_code, struct ice_sq_cd *cd) +{ + struct ice_aqc_nvm_pass_comp_tbl *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + if (!data || !comp_response || !comp_response_code) + return ICE_ERR_PARAM; + + cmd = &desc.params.pass_comp_tbl; + + ice_fill_dflt_direct_cmd_desc(&desc, + ice_aqc_opc_nvm_pass_component_tbl); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + cmd->transfer_flag = transfer_flag; + status = ice_aq_send_cmd(hw, &desc, data, length, cd); + + if (!status) { + *comp_response = cmd->component_response; + *comp_response_code = cmd->component_response_code; + } + return status; +} + diff --git a/drivers/net/ethernet/intel/ice/ice_nvm.h b/drivers/net/ethernet/intel/ice/ice_nvm.h new file mode 100644 index 000000000000..c37481f62786 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_nvm.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#ifndef _ICE_NVM_H_ +#define _ICE_NVM_H_ + +#define ICE_NVM_CMD_READ 0x0000000B +#define ICE_NVM_CMD_WRITE 0x0000000C + +/* NVM Access config bits */ +#define ICE_NVM_CFG_MODULE_M ICE_M(0xFF, 0) +#define ICE_NVM_CFG_MODULE_S 0 +#define ICE_NVM_CFG_FLAGS_M ICE_M(0xF, 8) +#define ICE_NVM_CFG_FLAGS_S 8 +#define ICE_NVM_CFG_EXT_FLAGS_M ICE_M(0xF, 12) +#define ICE_NVM_CFG_EXT_FLAGS_S 12 +#define ICE_NVM_CFG_ADAPTER_INFO_M ICE_M(0xFFFF, 16) +#define ICE_NVM_CFG_ADAPTER_INFO_S 16 + +/* NVM Read Get Driver Features */ +#define ICE_NVM_GET_FEATURES_MODULE 0xE +#define ICE_NVM_GET_FEATURES_FLAGS 0xF + +/* NVM Read/Write Mapped Space */ +#define ICE_NVM_REG_RW_MODULE 0x0 +#define ICE_NVM_REG_RW_FLAGS 0x1 + +struct ice_orom_civd_info { + u8 signature[4]; /* Must match ASCII '$CIV' characters */ + u8 checksum; /* Simple modulo 256 sum of all structure bytes must equal 0 */ + __le32 combo_ver; /* Combo Image Version number */ + u8 combo_name_len; /* Length of the unicode combo image version string, max of 32 */ + __le16 combo_name[32]; /* Unicode string representing the Combo Image version */ +} __packed; + +#define ICE_NVM_ACCESS_MAJOR_VER 0 +#define ICE_NVM_ACCESS_MINOR_VER 5 + +/* NVM Access feature flags. Other bits in the features field are reserved and + * should be set to zero when reporting the ice_nvm_features structure. + */ +#define ICE_NVM_FEATURES_0_REG_ACCESS BIT(1) + +/* NVM Access Features */ +struct ice_nvm_features { + u8 major; /* Major version (informational only) */ + u8 minor; /* Minor version (informational only) */ + u16 size; /* size of ice_nvm_features structure */ + u8 features[12]; /* Array of feature bits */ +}; + +/* NVM Access command */ +struct ice_nvm_access_cmd { + u32 command; /* NVM command: READ or WRITE */ + u32 config; /* NVM command configuration */ + u32 offset; /* offset to read/write, in bytes */ + u32 data_size; /* size of data field, in bytes */ +}; + +/* NVM Access data */ +union ice_nvm_access_data { + u32 regval; /* Storage for register value */ + struct ice_nvm_features drv_features; /* NVM features */ +}; + +/* NVM Access registers */ +#define GL_HIDA(_i) (0x00082000 + ((_i) * 4)) +#define GL_HIBA(_i) (0x00081000 + ((_i) * 4)) +#define GL_HICR 0x00082040 +#define GL_HICR_EN 0x00082044 +#define GLGEN_CSR_DEBUG_C 0x00075750 +#define GLPCI_LBARCTRL 0x0009DE74 +#define GLNVM_GENS 0x000B6100 +#define GLNVM_FLA 0x000B6108 + +#define ICE_NVM_ACCESS_GL_HIDA_MAX 15 +#define ICE_NVM_ACCESS_GL_HIBA_MAX 1023 + +enum ice_status +ice_handle_nvm_access(struct ice_hw *hw, struct ice_nvm_access_cmd *cmd, + union ice_nvm_access_data *data); +enum ice_status +ice_acquire_nvm(struct ice_hw *hw, enum ice_aq_res_access_type access); +void ice_release_nvm(struct ice_hw *hw); +enum ice_status +ice_read_flat_nvm(struct ice_hw *hw, u32 offset, u32 *length, u8 *data, + bool read_shadow_ram); +enum ice_status +ice_get_pfa_module_tlv(struct ice_hw *hw, u16 *module_tlv, u16 *module_tlv_len, + u16 module_type); +enum ice_status +ice_get_nvm_minsrevs(struct ice_hw *hw, struct ice_minsrev_info *minsrevs); +enum ice_status +ice_update_nvm_minsrevs(struct ice_hw *hw, struct ice_minsrev_info *minsrevs); +enum ice_status +ice_get_inactive_orom_ver(struct ice_hw *hw, struct ice_orom_info *orom); +enum ice_status +ice_get_inactive_nvm_ver(struct ice_hw *hw, struct ice_nvm_info *nvm); +enum ice_status +ice_get_inactive_netlist_ver(struct ice_hw *hw, struct ice_netlist_info *netlist); +enum ice_status +ice_read_pba_string(struct ice_hw *hw, u8 *pba_num, u32 pba_num_size); +enum ice_status 
ice_init_nvm(struct ice_hw *hw); +enum ice_status ice_read_sr_word(struct ice_hw *hw, u16 offset, u16 *data); +enum ice_status +ice_aq_erase_nvm(struct ice_hw *hw, u16 module_typeid, struct ice_sq_cd *cd); +enum ice_status +ice_aq_update_nvm(struct ice_hw *hw, u16 module_typeid, u32 offset, + u16 length, void *data, bool last_command, u8 command_flags, + struct ice_sq_cd *cd); +enum ice_status ice_nvm_validate_checksum(struct ice_hw *hw); +enum ice_status ice_nvm_recalculate_checksum(struct ice_hw *hw); +enum ice_status ice_nvm_write_activate(struct ice_hw *hw, u8 cmd_flags); +enum ice_status ice_aq_nvm_update_empr(struct ice_hw *hw); +enum ice_status +ice_nvm_set_pkg_data(struct ice_hw *hw, bool del_pkg_data_flag, u8 *data, + u16 length, struct ice_sq_cd *cd); +enum ice_status +ice_nvm_pass_component_tbl(struct ice_hw *hw, u8 *data, u16 length, + u8 transfer_flag, u8 *comp_response, + u8 *comp_response_code, struct ice_sq_cd *cd); +#endif /* _ICE_NVM_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_osdep.h b/drivers/net/ethernet/intel/ice/ice_osdep.h index f57c414bc0a9..3dacbda6597b 100644 --- a/drivers/net/ethernet/intel/ice/ice_osdep.h +++ b/drivers/net/ethernet/intel/ice/ice_osdep.h @@ -1,14 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_OSDEP_H_ #define _ICE_OSDEP_H_ #include #include -#ifndef CONFIG_64BIT -#include -#endif +#include +#include +#include "kcompat.h" #define wr32(a, reg, value) writel((value), ((a)->hw_addr + (reg))) #define rd32(a, reg) readl((a)->hw_addr + (reg)) @@ -16,6 +16,7 @@ #define rd64(a, reg) readq((a)->hw_addr + (reg)) #define ice_flush(a) rd32((a), GLGEN_STAT) + #define ICE_M(m, s) ((m) << (s)) struct ice_dma_mem { @@ -27,10 +28,18 @@ struct ice_dma_mem { #define ice_hw_to_dev(ptr) \ (&(container_of((ptr), struct ice_pf, hw))->pdev->dev) +#define ice_info_fwlog(hw, rowsize, groupsize, buf, len) \ + print_hex_dump(KERN_INFO, " FWLOG: ", \ + DUMP_PREFIX_NONE, \ + rowsize, groupsize, buf, \ + len, false) + + #ifdef CONFIG_DYNAMIC_DEBUG #define ice_debug(hw, type, fmt, args...) \ dev_dbg(ice_hw_to_dev(hw), fmt, ##args) + #define ice_debug_array(hw, type, rowsize, groupsize, buf, len) \ print_hex_dump_debug(KBUILD_MODNAME " ", \ DUMP_PREFIX_OFFSET, rowsize, \ @@ -51,6 +60,7 @@ do { \ rowsize, groupsize, buf, \ len, false); \ } while (0) + #else #define ice_debug_array(hw, type, rowsize, groupsize, buf, len) \ do { \ @@ -67,6 +77,7 @@ do { \ i, ((len_l) - i), ((buf_l) + i));\ } \ } while (0) + #endif /* DEBUG */ #endif /* CONFIG_DYNAMIC_DEBUG */ diff --git a/drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.c b/drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.c new file mode 100644 index 000000000000..4c9581da9ccc --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice_vsi_vlan_ops.h" +#include "ice_vsi_vlan_lib.h" +#include "ice_vlan_mode.h" +#include "ice.h" +#include "ice_pf_vsi_vlan_ops.h" + +void ice_pf_vsi_init_vlan_ops(struct ice_vsi *vsi) +{ + struct ice_vsi_vlan_ops *vlan_ops; + + if (ice_is_dvm_ena(&vsi->back->hw)) { + vlan_ops = &vsi->outer_vlan_ops; + + vlan_ops->add_vlan = ice_vsi_add_vlan; + vlan_ops->del_vlan = ice_vsi_del_vlan; + vlan_ops->ena_stripping = ice_vsi_ena_outer_stripping; + vlan_ops->dis_stripping = ice_vsi_dis_outer_stripping; + vlan_ops->ena_insertion = ice_vsi_ena_outer_insertion; + vlan_ops->dis_insertion = ice_vsi_dis_outer_insertion; + vlan_ops->ena_rx_filtering = ice_vsi_ena_rx_vlan_filtering; + vlan_ops->dis_rx_filtering = ice_vsi_dis_rx_vlan_filtering; + } else { + vlan_ops = &vsi->inner_vlan_ops; + + vlan_ops->add_vlan = ice_vsi_add_vlan; + vlan_ops->del_vlan = ice_vsi_del_vlan; + vlan_ops->ena_stripping = ice_vsi_ena_inner_stripping; + vlan_ops->dis_stripping = ice_vsi_dis_inner_stripping; + vlan_ops->ena_insertion = ice_vsi_ena_inner_insertion; + vlan_ops->dis_insertion = ice_vsi_dis_inner_insertion; + vlan_ops->ena_rx_filtering = ice_vsi_ena_rx_vlan_filtering; + vlan_ops->dis_rx_filtering = ice_vsi_dis_rx_vlan_filtering; + } +} + diff --git a/drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.h b/drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.h new file mode 100644 index 000000000000..14f8b7ecac7e --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_PF_VSI_VLAN_OPS_H_ +#define _ICE_PF_VSI_VLAN_OPS_H_ + +#include "ice_vsi_vlan_ops.h" + +struct ice_vsi; + +void ice_pf_vsi_init_vlan_ops(struct ice_vsi *vsi); + +#endif /* _ICE_PF_VSI_VLAN_OPS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_protocol_type.h b/drivers/net/ethernet/intel/ice/ice_protocol_type.h new file mode 100644 index 000000000000..8b5861138096 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_protocol_type.h @@ -0,0 +1,386 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_PROTOCOL_TYPE_H_ +#define _ICE_PROTOCOL_TYPE_H_ +#include "ice_flex_type.h" +#define ICE_IPV6_ADDR_LENGTH 16 + +/* Each recipe can match up to 5 different fields. Fields to match can be meta- + * data, values extracted from packet headers, or results from other recipes. + * One of the 5 fields is reserved for matching the switch ID. So, up to 4 + * recipes can provide intermediate results to another one through chaining, + * e.g. recipes 0, 1, 2, and 3 can provide intermediate results to recipe 4. + */ +#define ICE_NUM_WORDS_RECIPE 4 + +/* Max recipes that can be chained */ +#define ICE_MAX_CHAIN_RECIPE 5 + +/* 1 word reserved for switch ID from allowed 5 words. + * So a recipe can have max 4 words. And you can chain 5 such recipes + * together. So maximum words that can be programmed for look up is 5 * 4. 
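+ * That is what ICE_MAX_CHAIN_WORDS below evaluates to:
+ * ICE_NUM_WORDS_RECIPE (4) * ICE_MAX_CHAIN_RECIPE (5) = 20 lookup words.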
+ */ +#define ICE_MAX_CHAIN_WORDS (ICE_NUM_WORDS_RECIPE * ICE_MAX_CHAIN_RECIPE) + +/* Field vector index corresponding to chaining */ +#define ICE_CHAIN_FV_INDEX_START 47 + +enum ice_protocol_type { + ICE_MAC_OFOS = 0, + ICE_MAC_IL, + ICE_ETYPE_OL, + ICE_VLAN_OFOS, + ICE_IPV4_OFOS, + ICE_IPV4_IL, + ICE_IPV6_OFOS, + ICE_IPV6_IL, + ICE_TCP_IL, + ICE_UDP_OF, + ICE_UDP_ILOS, + ICE_SCTP_IL, + ICE_VXLAN, + ICE_GENEVE, + ICE_VXLAN_GPE, + ICE_NVGRE, + ICE_GTP, + ICE_PPPOE, + ICE_PFCP, + ICE_L2TPV3, + ICE_ESP, + ICE_AH, + ICE_NAT_T, + ICE_GTP_NO_PAY, + ICE_VLAN_EX, + ICE_VLAN_IN, + ICE_PROTOCOL_LAST +}; + +enum ice_sw_tunnel_type { + ICE_NON_TUN = 0, + ICE_SW_TUN_AND_NON_TUN, + ICE_SW_TUN_VXLAN_GPE, + ICE_SW_TUN_GENEVE, /* GENEVE matches only non-VLAN pkts */ + ICE_SW_TUN_GENEVE_VLAN, /* GENEVE matches both VLAN and non-VLAN pkts */ + ICE_SW_TUN_VXLAN, /* VXLAN matches only non-VLAN pkts */ + ICE_SW_TUN_VXLAN_VLAN, /* VXLAN matches both VLAN and non-VLAN pkts */ + ICE_SW_TUN_NVGRE, + ICE_SW_TUN_UDP, /* This means all "UDP" tunnel types: VXLAN-GPE, VXLAN + * and GENEVE + */ + ICE_SW_TUN_IPV4_GTP_IPV4_TCP, + ICE_SW_TUN_IPV4_GTP_IPV4_UDP, + ICE_SW_TUN_IPV4_GTP_IPV6_TCP, + ICE_SW_TUN_IPV4_GTP_IPV6_UDP, + ICE_SW_TUN_IPV6_GTP_IPV4_TCP, + ICE_SW_TUN_IPV6_GTP_IPV4_UDP, + ICE_SW_TUN_IPV6_GTP_IPV6_TCP, + ICE_SW_TUN_IPV6_GTP_IPV6_UDP, + + /* following adds support for GTP, just using inner protocols, + * outer L3 and L4 protocols can be anything + */ + ICE_SW_TUN_GTP_IPV4_TCP, + ICE_SW_TUN_GTP_IPV4_UDP, + ICE_SW_TUN_GTP_IPV6_TCP, + ICE_SW_TUN_GTP_IPV6_UDP, + ICE_SW_TUN_IPV4_GTPU_IPV4, + ICE_SW_TUN_IPV4_GTPU_IPV6, + ICE_SW_TUN_IPV6_GTPU_IPV4, + ICE_SW_TUN_IPV6_GTPU_IPV6, + ICE_SW_TUN_GTP_IPV4, + ICE_SW_TUN_GTP_IPV6, + ICE_ALL_TUNNELS /* All tunnel types including NVGRE */ +}; + +/* Decoders for ice_prot_id: + * - F: First + * - I: Inner + * - L: Last + * - O: Outer + * - S: Single + */ +enum ice_prot_id { + ICE_PROT_ID_INVAL = 0, + ICE_PROT_MAC_OF_OR_S = 1, + ICE_PROT_MAC_O2 = 2, + ICE_PROT_MAC_IL = 4, + ICE_PROT_MAC_IN_MAC = 7, + ICE_PROT_ETYPE_OL = 9, + ICE_PROT_ETYPE_IL = 10, + ICE_PROT_PAY = 15, + ICE_PROT_EVLAN_O = 16, + ICE_PROT_VLAN_O = 17, + ICE_PROT_VLAN_IF = 18, + ICE_PROT_MPLS_OL_MINUS_1 = 27, + ICE_PROT_MPLS_OL_OR_OS = 28, + ICE_PROT_MPLS_IL = 29, + ICE_PROT_IPV4_OF_OR_S = 32, + ICE_PROT_IPV4_IL = 33, + ICE_PROT_IPV6_OF_OR_S = 40, + ICE_PROT_IPV6_IL = 41, + ICE_PROT_IPV6_FRAG = 47, + ICE_PROT_TCP_IL = 49, + ICE_PROT_UDP_OF = 52, + ICE_PROT_UDP_IL_OR_S = 53, + ICE_PROT_GRE_OF = 64, + ICE_PROT_NSH_F = 84, + ICE_PROT_ESP_F = 88, + ICE_PROT_ESP_2 = 89, + ICE_PROT_SCTP_IL = 96, + ICE_PROT_ICMP_IL = 98, + ICE_PROT_ICMPV6_IL = 100, + ICE_PROT_VRRP_F = 101, + ICE_PROT_OSPF = 102, + ICE_PROT_PPPOE = 103, + ICE_PROT_L2TPV3 = 104, + ICE_PROT_ECPRI = 105, + ICE_PROT_PPP = 106, + ICE_PROT_ATAOE_OF = 114, + ICE_PROT_CTRL_OF = 116, + ICE_PROT_LLDP_OF = 117, + ICE_PROT_ARP_OF = 118, + ICE_PROT_EAPOL_OF = 120, + ICE_PROT_META_ID = 255, /* when offset == metaddata */ + ICE_PROT_INVALID = 255 /* when offset == ICE_FV_OFFSET_INVAL */ +}; + +#define ICE_VNI_OFFSET 12 /* offset of VNI from ICE_PROT_UDP_OF */ + + +#define ICE_MAC_OFOS_HW 1 +#define ICE_MAC_IL_HW 4 +#define ICE_ETYPE_OL_HW 9 +#define ICE_VLAN_OF_HW 16 +#define ICE_VLAN_OL_HW 17 +#define ICE_IPV4_OFOS_HW 32 +#define ICE_IPV4_IL_HW 33 +#define ICE_IPV6_OFOS_HW 40 +#define ICE_IPV6_IL_HW 41 +#define ICE_TCP_IL_HW 49 +#define ICE_UDP_ILOS_HW 53 +#define ICE_ESP_HW 88 +#define ICE_AH_HW 89 +#define ICE_SCTP_IL_HW 96 +#define ICE_PPPOE_HW 103 +#define 
ICE_L2TPV3_HW 104 + +/* ICE_UDP_OF is used to identify all 3 tunnel types + * VXLAN, GENEVE and VXLAN_GPE. To differentiate further + * need to use flags from the field vector + */ +#define ICE_UDP_OF_HW 52 /* UDP Tunnels */ +#define ICE_GRE_OF_HW 64 /* NVGRE */ +#define ICE_META_DATA_ID_HW 255 /* this is used for tunnel type */ + +#define ICE_MDID_SIZE 2 +#define ICE_TUN_FLAG_MDID 21 +#define ICE_TUN_FLAG_MDID_OFF (ICE_MDID_SIZE * ICE_TUN_FLAG_MDID) +#define ICE_TUN_FLAG_MASK 0xFF +#define ICE_TUN_FLAG_VLAN_MASK 0x01 +#define ICE_TUN_FLAG_FV_IND 2 + +#define ICE_PROTOCOL_MAX_ENTRIES 16 + +/* Mapping of software defined protocol ID to hardware defined protocol ID */ +struct ice_protocol_entry { + enum ice_protocol_type type; + u8 protocol_id; +}; + + +struct ice_ether_hdr { + u8 dst_addr[ETH_ALEN]; + u8 src_addr[ETH_ALEN]; +}; + +struct ice_ethtype_hdr { + __be16 ethtype_id; +}; + +struct ice_ether_vlan_hdr { + u8 dst_addr[ETH_ALEN]; + u8 src_addr[ETH_ALEN]; + __be32 vlan_id; +}; + +struct ice_vlan_hdr { + __be16 vlan; + __be16 type; +}; + +struct ice_ipv4_hdr { + u8 version; + u8 tos; + __be16 total_length; + __be16 id; + __be16 frag_off; + u8 time_to_live; + u8 protocol; + __be16 check; + __be32 src_addr; + __be32 dst_addr; +}; + +struct ice_le_ver_tc_flow { + union { + struct { + u32 flow_label : 20; + u32 tc : 8; + u32 version : 4; + } fld; + u32 val; + } u; +}; + +struct ice_ipv6_hdr { + __be32 be_ver_tc_flow; + __be16 payload_len; + u8 next_hdr; + u8 hop_limit; + u8 src_addr[ICE_IPV6_ADDR_LENGTH]; + u8 dst_addr[ICE_IPV6_ADDR_LENGTH]; +}; + +struct ice_sctp_hdr { + __be16 src_port; + __be16 dst_port; + __be32 verification_tag; + __be32 check; +}; + +struct ice_l4_hdr { + __be16 src_port; + __be16 dst_port; + __be16 len; + __be16 check; +}; + +struct ice_udp_tnl_hdr { + __be16 field; + __be16 proto_type; + __be32 vni; /* only use lower 24-bits */ +}; + +struct ice_udp_gtp_hdr { + u8 flags; + u8 msg_type; + __be16 rsrvd_len; + __be32 teid; + __be16 rsrvd_seq_nbr; + u8 rsrvd_n_pdu_nbr; + u8 rsrvd_next_ext; + u8 rsvrd_ext_len; + u8 pdu_type; + u8 qfi; + u8 rsvrd; +}; + +struct ice_pppoe_hdr { + u8 rsrvd_ver_type; + u8 rsrvd_code; + __be16 session_id; + __be16 length; + __be16 ppp_prot_id; /* control and data only */ +}; + +struct ice_pfcp_hdr { + u8 flags; + u8 msg_type; + __be16 length; + __be64 seid; + __be32 seq; + u8 spare; +}; + +struct ice_l2tpv3_sess_hdr { + __be32 session_id; + __be64 cookie; +}; + +struct ice_esp_hdr { + __be32 spi; + __be32 seq; +}; + +struct ice_ah_hdr { + u8 next_hdr; + u8 paylen; + __be16 rsrvd; + __be32 spi; + __be32 seq; +}; + +struct ice_nat_t_hdr { + struct ice_esp_hdr esp; +}; + + +struct ice_nvgre { + __be16 flags; + __be16 protocol; + __be32 tni_flow; +}; + +union ice_prot_hdr { + struct ice_ether_hdr eth_hdr; + struct ice_ethtype_hdr ethertype; + struct ice_vlan_hdr vlan_hdr; + struct ice_ipv4_hdr ipv4_hdr; + struct ice_ipv6_hdr ipv6_hdr; + struct ice_l4_hdr l4_hdr; + struct ice_sctp_hdr sctp_hdr; + struct ice_udp_tnl_hdr tnl_hdr; + struct ice_nvgre nvgre_hdr; + struct ice_udp_gtp_hdr gtp_hdr; + struct ice_pppoe_hdr pppoe_hdr; + struct ice_pfcp_hdr pfcp_hdr; + struct ice_l2tpv3_sess_hdr l2tpv3_sess_hdr; + struct ice_esp_hdr esp_hdr; + struct ice_ah_hdr ah_hdr; + struct ice_nat_t_hdr nat_t_hdr; +}; + +/* This is mapping table entry that maps every word within a given protocol + * structure to the real byte offset as per the specification of that + * protocol header. + * for e.g. 
dst address is 3 words in ethertype header and corresponding bytes + * are 0, 2, 3 in the actual packet header and src address is at 4, 6, 8 + */ +struct ice_prot_ext_tbl_entry { + enum ice_protocol_type prot_type; + /* Byte offset into header of given protocol type */ + u8 offs[sizeof(union ice_prot_hdr)]; +}; + +/* Extractions to be looked up for a given recipe */ +struct ice_prot_lkup_ext { + u16 prot_type; + u8 n_val_words; + /* create a buffer to hold max words per recipe */ + u16 field_off[ICE_MAX_CHAIN_WORDS]; + u16 field_mask[ICE_MAX_CHAIN_WORDS]; + + struct ice_fv_word fv_words[ICE_MAX_CHAIN_WORDS]; + + /* Indicate field offsets that have field vector indices assigned */ + DECLARE_BITMAP(done, ICE_MAX_CHAIN_WORDS); +}; + +struct ice_pref_recipe_group { + u8 n_val_pairs; /* Number of valid pairs */ + struct ice_fv_word pairs[ICE_NUM_WORDS_RECIPE]; + u16 mask[ICE_NUM_WORDS_RECIPE]; +}; + +struct ice_recp_grp_entry { + struct list_head l_entry; + +#define ICE_INVAL_CHAIN_IND 0xFF + u16 rid; + u8 chain_idx; + u16 fv_idx[ICE_NUM_WORDS_RECIPE]; + u16 fv_mask[ICE_NUM_WORDS_RECIPE]; + struct ice_pref_recipe_group r_group; +}; +#endif /* _ICE_PROTOCOL_TYPE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c new file mode 100644 index 000000000000..7198f55cdb0d --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -0,0 +1,3648 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice.h" +#include "ice_lib.h" + +#define E810_OUT_PROP_DELAY_NS 1 + + +#define LOCKED_INCVAL_E822 0x100000000ULL + +static const struct ptp_pin_desc ice_e810t_pin_desc[] = { + /* name idx func chan */ + { "GNSS", GNSS, PTP_PF_EXTTS, 0, { 0, } }, + { "SMA1", SMA1, PTP_PF_NONE, 1, { 0, } }, + { "U.FL1", UFL1, PTP_PF_NONE, 1, { 0, } }, + { "SMA2", SMA2, PTP_PF_NONE, 2, { 0, } }, + { "U.FL2", UFL2, PTP_PF_NONE, 2, { 0, } }, +}; + +/** + * ice_enable_e810t_sma_ctrl + * @hw: pointer to the hw struct + * @ena: set true to enable and false to disable + * + * Enables or disable the SMA control logic + */ +static int ice_enable_e810t_sma_ctrl(struct ice_hw *hw, bool ena) +{ + int err; + u8 data; + + /* Set expander bits as outputs */ + err = ice_read_e810t_pca9575_reg(hw, ICE_PCA9575_P1_CFG, &data); + if (err) + return err; + + if (ena) + data &= (~ICE_E810T_SMA_CTRL_MASK); + else + data |= ICE_E810T_SMA_CTRL_MASK; + + return ice_write_e810t_pca9575_reg(hw, ICE_PCA9575_P1_CFG, data); +} + +/** + * ice_get_e810t_sma_config + * @hw: pointer to the hw struct + * @ptp_pins:pointer to the ptp_pin_desc struture + * + * Read the configuration of the SMA control logic and put it into the + * ptp_pin_desc structure + */ +static int +ice_get_e810t_sma_config(struct ice_hw *hw, struct ptp_pin_desc *ptp_pins) +{ + enum ice_status status; + u8 data, i; + + /* Read initial pin state */ + status = ice_read_e810t_pca9575_reg(hw, ICE_PCA9575_P1_OUT, &data); + if (status) + return ice_status_to_errno(status); + + /* initialize with defaults */ + for (i = 0; i < NUM_E810T_PTP_PINS; i++) { + snprintf(ptp_pins[i].name, sizeof(ptp_pins[i].name), + "%s", ice_e810t_pin_desc[i].name); + ptp_pins[i].index = ice_e810t_pin_desc[i].index; + ptp_pins[i].func = ice_e810t_pin_desc[i].func; + ptp_pins[i].chan = ice_e810t_pin_desc[i].chan; + } + + /* Parse SMA1/UFL1 */ + switch (data & ICE_E810T_SMA1_CTRL_MASK) { + case ICE_E810T_SMA1_CTRL_MASK: + default: + ptp_pins[SMA1].func = PTP_PF_NONE; + ptp_pins[UFL1].func = PTP_PF_NONE; + break; + case 
ICE_E810T_P1_SMA1_DIR_EN: + ptp_pins[SMA1].func = PTP_PF_PEROUT; + ptp_pins[UFL1].func = PTP_PF_NONE; + break; + case ICE_E810T_P1_SMA1_TX_EN: + ptp_pins[SMA1].func = PTP_PF_EXTTS; + ptp_pins[UFL1].func = PTP_PF_NONE; + break; + case 0: + ptp_pins[SMA1].func = PTP_PF_EXTTS; + ptp_pins[UFL1].func = PTP_PF_PEROUT; + break; + } + + /* Parse SMA2/UFL2 */ + switch (data & ICE_E810T_SMA2_CTRL_MASK) { + case ICE_E810T_SMA2_CTRL_MASK: + default: + ptp_pins[SMA2].func = PTP_PF_NONE; + ptp_pins[UFL2].func = PTP_PF_NONE; + break; + case (ICE_E810T_P1_SMA2_TX_EN | ICE_E810T_P1_SMA2_UFL2_RX_DIS): + ptp_pins[SMA2].func = PTP_PF_EXTTS; + ptp_pins[UFL2].func = PTP_PF_NONE; + break; + case (ICE_E810T_P1_SMA2_DIR_EN | ICE_E810T_P1_SMA2_UFL2_RX_DIS): + ptp_pins[SMA2].func = PTP_PF_PEROUT; + ptp_pins[UFL2].func = PTP_PF_NONE; + break; + case (ICE_E810T_P1_SMA2_DIR_EN | ICE_E810T_P1_SMA2_TX_EN): + ptp_pins[SMA2].func = PTP_PF_NONE; + ptp_pins[UFL2].func = PTP_PF_EXTTS; + break; + case ICE_E810T_P1_SMA2_DIR_EN: + ptp_pins[SMA2].func = PTP_PF_PEROUT; + ptp_pins[UFL2].func = PTP_PF_EXTTS; + break; + } + + return 0; +} + +/** + * ice_ptp_set_e810t_sma_state + * @hw: pointer to the hw struct + * @ptp_pins: pointer to the ptp_pin_desc struture + * + * Set the configuration of the SMA control logic based on the configuration in + * num_pins parameter + */ +static int +ice_ptp_set_e810t_sma_state(struct ice_hw *hw, + const struct ptp_pin_desc *ptp_pins) +{ + enum ice_status status; + u8 data; + + /* SMA1 and UFL1 cannot be set to TX at the same time */ + if (ptp_pins[SMA1].func == PTP_PF_PEROUT && + ptp_pins[UFL1].func == PTP_PF_PEROUT) + return ICE_ERR_PARAM; + + /* SMA2 and UFL2 cannot be set to RX at the same time */ + if (ptp_pins[SMA2].func == PTP_PF_EXTTS && + ptp_pins[UFL2].func == PTP_PF_EXTTS) + return ICE_ERR_PARAM; + + /* Read initial pin state value */ + status = ice_read_e810t_pca9575_reg(hw, ICE_PCA9575_P1_OUT, &data); + if (status) + return ice_status_to_errno(status); + + /* Set the right sate based on the desired configuration */ + data &= ~ICE_E810T_SMA1_CTRL_MASK; + if (ptp_pins[SMA1].func == PTP_PF_NONE && + ptp_pins[UFL1].func == PTP_PF_NONE) { + dev_info(ice_hw_to_dev(hw), "SMA1 + U.FL1 disabled"); + data |= ICE_E810T_SMA1_CTRL_MASK; + } else if (ptp_pins[SMA1].func == PTP_PF_EXTTS && + ptp_pins[UFL1].func == PTP_PF_NONE) { + dev_info(ice_hw_to_dev(hw), "SMA1 RX"); + data |= ICE_E810T_P1_SMA1_TX_EN; + } else if (ptp_pins[SMA1].func == PTP_PF_NONE && + ptp_pins[UFL1].func == PTP_PF_PEROUT) { + /* U.FL 1 TX will always enable SMA 1 RX */ + dev_info(ice_hw_to_dev(hw), "SMA1 RX + U.FL1 TX"); + } else if (ptp_pins[SMA1].func == PTP_PF_EXTTS && + ptp_pins[UFL1].func == PTP_PF_PEROUT) { + dev_info(ice_hw_to_dev(hw), "SMA1 RX + U.FL1 TX"); + } else if (ptp_pins[SMA1].func == PTP_PF_PEROUT && + ptp_pins[UFL1].func == PTP_PF_NONE) { + dev_info(ice_hw_to_dev(hw), "SMA1 TX"); + data |= ICE_E810T_P1_SMA1_DIR_EN; + } + + data &= (~ICE_E810T_SMA2_CTRL_MASK); + if (ptp_pins[SMA2].func == PTP_PF_NONE && + ptp_pins[UFL2].func == PTP_PF_NONE) { + dev_info(ice_hw_to_dev(hw), "SMA2 + U.FL2 disabled"); + data |= ICE_E810T_SMA2_CTRL_MASK; + } else if (ptp_pins[SMA2].func == PTP_PF_EXTTS && + ptp_pins[UFL2].func == PTP_PF_NONE) { + dev_info(ice_hw_to_dev(hw), "SMA2 RX"); + data |= (ICE_E810T_P1_SMA2_TX_EN | + ICE_E810T_P1_SMA2_UFL2_RX_DIS); + } else if (ptp_pins[SMA2].func == PTP_PF_NONE && + ptp_pins[UFL2].func == PTP_PF_EXTTS) { + dev_info(ice_hw_to_dev(hw), "UFL2 RX"); + data |= (ICE_E810T_P1_SMA2_DIR_EN | 
ICE_E810T_P1_SMA2_TX_EN); + } else if (ptp_pins[SMA2].func == PTP_PF_PEROUT && + ptp_pins[UFL2].func == PTP_PF_NONE) { + dev_info(ice_hw_to_dev(hw), "SMA2 TX"); + data |= (ICE_E810T_P1_SMA2_DIR_EN | + ICE_E810T_P1_SMA2_UFL2_RX_DIS); + } else if (ptp_pins[SMA2].func == PTP_PF_PEROUT && + ptp_pins[UFL2].func == PTP_PF_EXTTS) { + dev_info(ice_hw_to_dev(hw), "SMA2 TX + U.FL2 RX"); + data |= ICE_E810T_P1_SMA2_DIR_EN; + } + + status = ice_write_e810t_pca9575_reg(hw, ICE_PCA9575_P1_OUT, data); + if (status) + return ice_status_to_errno(status); + + return 0; +} + +/** + * ice_ptp_set_e810t_sma + * @info: the driver's PTP info structure + * @pin: pin index in kernel structure + * @func: Pin function to be set (PTP_PF_NONE, PTP_PF_EXTTS or PTP_PF_PEROUT) + * + * Set the configuration of a single SMA pin + */ +static int +ice_ptp_set_e810t_sma(struct ptp_clock_info *info, unsigned int pin, + enum ptp_pin_function func) +{ + struct ptp_pin_desc ptp_pins[NUM_E810T_PTP_PINS]; + struct ice_pf *pf = ptp_info_to_pf(info); + struct ice_hw *hw = &pf->hw; + int err; + + if (pin < SMA1 || func > PTP_PF_PEROUT) + return -EOPNOTSUPP; + + err = ice_get_e810t_sma_config(hw, ptp_pins); + if (err) + return err; + + /* Disable the same function on the other pin sharing the channel */ + if (pin == SMA1 && ptp_pins[UFL1].func == func) + ptp_pins[UFL1].func = PTP_PF_NONE; + if (pin == UFL1 && ptp_pins[SMA1].func == func) + ptp_pins[SMA1].func = PTP_PF_NONE; + + if (pin == SMA2 && ptp_pins[UFL2].func == func) + ptp_pins[UFL2].func = PTP_PF_NONE; + if (pin == UFL2 && ptp_pins[SMA2].func == func) + ptp_pins[SMA2].func = PTP_PF_NONE; + + /* Set up new pin function in the temp table */ + ptp_pins[pin].func = func; + + return ice_ptp_set_e810t_sma_state(hw, ptp_pins); +} + +/** + * ice_e810t_verify_pin + * @info: the driver's PTP info structure + * @pin: Pin index + * @func: Assigned function + * @chan: Assigned channel + * + * Verify if pin supports requested pin function. If the Check pins consistency. + * Reconfigure the SMA logic attached to the given pin to enable its + * desired functionality + */ +static int +ice_e810t_verify_pin(struct ptp_clock_info *info, unsigned int pin, + enum ptp_pin_function func, unsigned int chan) +{ + /* Don't allow channel reassignment */ + if (chan != ice_e810t_pin_desc[pin].chan) + return -EOPNOTSUPP; + + /* Check if functions are properly assigned */ + switch (func) { + case PTP_PF_NONE: + break; + case PTP_PF_EXTTS: + if (pin == UFL1) + return -EOPNOTSUPP; + break; + case PTP_PF_PEROUT: + if (pin == UFL2 || pin == GNSS) + return -EOPNOTSUPP; + break; + case PTP_PF_PHYSYNC: + return -EOPNOTSUPP; + } + + return ice_ptp_set_e810t_sma(info, pin, func); +} + + + +/** + * mul_u128_u64_fac - Multiplies two 64bit factors to the 128b result + * @a: First factor to multiply + * @b: Second factor to multiply + * @hi: Pointer for higher part of 128b result + * @lo: Pointer for lower part of 128b result + * + * This function performs multiplication of two 64 bit factors with 128b + * output. 
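+ *
+ * As a sketch of the identity being computed (with a = a_hi * 2^32 + a_lo
+ * and b = b_hi * 2^32 + b_lo):
+ *
+ *   a * b = ((a_hi * b_hi) << 64)
+ *         + (((a_hi * b_lo) + (a_lo * b_hi)) << 32)
+ *         + (a_lo * b_lo)
+ *
+ * Splitting the factors into 32-bit halves keeps every partial product
+ * within 64 bits; the carries of the middle terms are folded into *hi.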
+ */ +static inline void mul_u128_u64_fac(u64 a, u64 b, u64 *hi, u64 *lo) +{ + u64 mask = GENMASK_ULL(31, 0); + u64 a_lo = a & mask; + u64 b_lo = b & mask; + + a >>= 32; + b >>= 32; + + *hi = (a * b) + (((a * b_lo) + ((a_lo * b_lo) >> 32)) >> 32) + + (((a_lo * b) + (((a * b_lo) + ((a_lo * b_lo) >> 32)) & mask)) >> 32); + *lo = (((a_lo * b) + (((a * b_lo) + ((a_lo * b_lo) >> 32)) & mask)) << 32) + + ((a_lo * b_lo) & mask); +} + + +/** + * ice_set_tx_tstamp - Enable or disable Tx timestamping + * @pf: The PF pointer to search in + * @on: bool value for whether timestamps are enabled or disabled + */ +static void ice_set_tx_tstamp(struct ice_pf *pf, bool on) +{ + struct ice_vsi *vsi; + u32 val; + + vsi = ice_get_main_vsi(pf); + if (!vsi) + return; + + vsi->ptp_tx = on; + + /* Enable/disable the TX timestamp interrupt */ + val = rd32(&pf->hw, PFINT_OICR_ENA); + if (on) + val |= PFINT_OICR_TSYN_TX_M; + else + val &= ~PFINT_OICR_TSYN_TX_M; + wr32(&pf->hw, PFINT_OICR_ENA, val); + + if (on) + pf->ptp.tstamp_config.tx_type = HWTSTAMP_TX_ON; + else + pf->ptp.tstamp_config.tx_type = HWTSTAMP_TX_OFF; +} + +/** + * ice_set_rx_tstamp - Enable or disable Rx timestamping + * @pf: The PF pointer to search in + * @on: bool value for whether timestamps are enabled or disabled + */ +static void ice_set_rx_tstamp(struct ice_pf *pf, bool on) +{ + struct ice_vsi *vsi; + u16 i; + + vsi = ice_get_main_vsi(pf); + if (!vsi) + return; + + ice_for_each_rxq(vsi, i) { + if (!vsi->rx_rings[i]) + continue; + vsi->rx_rings[i]->ptp_rx = on; + } + + if (on) + pf->ptp.tstamp_config.rx_filter = HWTSTAMP_FILTER_ALL; + else + pf->ptp.tstamp_config.rx_filter = HWTSTAMP_FILTER_NONE; +} + + +/** + * ice_ptp_cfg_timestamp - Configure timestamp for init/deinit + * @pf: Board private structure + * @ena: bool value to enable or disable time stamp + * + * This function will configure timestamping during PTP initialization + * and deinitialization + */ +static void ice_ptp_cfg_timestamp(struct ice_pf *pf, bool ena) +{ + ice_set_tx_tstamp(pf, ena); + ice_set_rx_tstamp(pf, ena); + +} + +/** + * ice_get_ptp_clock_index - Get the PTP clock index + * @pf: the PF pointer + * + * Determine the clock index of the PTP clock associated with this device. If + * this is the PF controlling the clock, just use the local access to the + * clock device pointer. + * + * Otherwise, read from the driver shared parameters to determine the clock + * index value. + * + * Returns: the index of the PTP clock associated with this device, or -1 if + * there is no associated clock. + */ +int ice_get_ptp_clock_index(struct ice_pf *pf) +{ + enum ice_aqc_driver_params param_idx; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u8 tmr_idx; + u32 value; + + /* Use the ptp_clock structure if we're the main PF */ + if (pf->ptp.clock) + return ptp_clock_index(pf->ptp.clock); + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_assoc; + if (!tmr_idx) + param_idx = ICE_AQC_DRIVER_PARAM_CLK_IDX_TMR0; + else + param_idx = ICE_AQC_DRIVER_PARAM_CLK_IDX_TMR1; + + status = ice_aq_get_driver_param(hw, param_idx, &value, NULL); + if (status) { + dev_err(ice_pf_to_dev(pf), + "Failed to read PTP clock index parameter, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + return -1; + } + + /* The PTP clock index is an integer, and will be between 0 and + * INT_MAX. The highest bit of the driver shared parameter is used to + * indicate whether or not the currently stored clock index is valid. 
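+	 *
+	 * For example (illustrative values, assuming the valid flag is the
+	 * top bit of the 32-bit parameter as described above): a stored
+	 * value of 0x80000005 decodes to clock index 5, while 0x00000005 is
+	 * treated as "no valid index published yet" and -1 is returned.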
+ */ + if (!(value & PTP_SHARED_CLK_IDX_VALID)) + return -1; + + return value & ~PTP_SHARED_CLK_IDX_VALID; +} + +/** + * ice_set_ptp_clock_index - Set the PTP clock index + * @pf: the PF pointer + * + * Set the PTP clock index for this device into the shared driver parameters, + * so that other PFs associated with this device can read it. + * + * If the PF is unable to store the clock index, it will log an error, but + * will continue operating PTP. + */ +static void ice_set_ptp_clock_index(struct ice_pf *pf) +{ + enum ice_aqc_driver_params param_idx; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u8 tmr_idx; + u32 value; + + if (!pf->ptp.clock) + return; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_assoc; + if (!tmr_idx) + param_idx = ICE_AQC_DRIVER_PARAM_CLK_IDX_TMR0; + else + param_idx = ICE_AQC_DRIVER_PARAM_CLK_IDX_TMR1; + + value = (u32)ptp_clock_index(pf->ptp.clock); + if (value > INT_MAX) { + dev_err(ice_pf_to_dev(pf), "PTP Clock index is too large to store\n"); + return; + } + value |= PTP_SHARED_CLK_IDX_VALID; + + status = ice_aq_set_driver_param(hw, param_idx, value, NULL); + if (status) { + dev_err(ice_pf_to_dev(pf), + "Failed to set PTP clock index parameter, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + } +} + +/** + * ice_clear_ptp_clock_index - Clear the PTP clock index + * @pf: the PF pointer + * + * Clear the PTP clock index for this device. Must be called when + * unregistering the PTP clock, in order to ensure other PFs stop reporting + * a clock object that no longer exists. + */ +static void ice_clear_ptp_clock_index(struct ice_pf *pf) +{ + enum ice_aqc_driver_params param_idx; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u8 tmr_idx; + + /* Do not clear the index if we don't own the timer */ + if (!hw->func_caps.ts_func_info.src_tmr_owned) + return; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_assoc; + if (!tmr_idx) + param_idx = ICE_AQC_DRIVER_PARAM_CLK_IDX_TMR0; + else + param_idx = ICE_AQC_DRIVER_PARAM_CLK_IDX_TMR1; + + status = ice_aq_set_driver_param(hw, param_idx, 0, NULL); + if (status) { + dev_dbg(ice_pf_to_dev(pf), + "Failed to clear PTP clock index parameter, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + } +} + +/** + * ice_ptp_read_src_clk_reg - Read the source clock register + * @pf: Board private structure + * @sts: Optional parameter for holding a pair of system timestamps from + * the system clock. Will be ignored if NULL is given. 
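+ *
+ * Note on the read sequence below: the 64-bit PHC value is exposed as two
+ * 32-bit registers, so TIME_L is sampled, then TIME_H, then TIME_L again;
+ * if the second TIME_L sample is smaller than the first, the low word
+ * rolled over between reads and the pair is re-read so that a consistent
+ * 64-bit value is returned.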
+ */ +u64 +ice_ptp_read_src_clk_reg(struct ice_pf *pf, struct ptp_system_timestamp *sts) +{ + struct ice_hw *hw = &pf->hw; + u32 hi, lo, lo2; + u8 tmr_idx; + + tmr_idx = ice_get_ptp_src_clock_index(hw); + /* Read the system timestamp pre PHC read */ + if (sts) + ptp_read_system_prets(sts); + + lo = rd32(hw, GLTSYN_TIME_L(tmr_idx)); + + /* Read the system timestamp post PHC read */ + if (sts) + ptp_read_system_postts(sts); + + hi = rd32(hw, GLTSYN_TIME_H(tmr_idx)); + lo2 = rd32(hw, GLTSYN_TIME_L(tmr_idx)); + + if (lo2 < lo) { + /* if TIME_L rolled over read TIME_L again and update + *system timestamps + */ + if (sts) + ptp_read_system_prets(sts); + lo = rd32(hw, GLTSYN_TIME_L(tmr_idx)); + if (sts) + ptp_read_system_postts(sts); + hi = rd32(hw, GLTSYN_TIME_H(tmr_idx)); + } + + return ((u64)hi << 32) | lo; +} + + +/** + * ice_ptp_update_cached_systime - Update the cached system time values + * @pf: Board specific private structure + * + * This function updates the system time values which are cached in the PF + * structure and the Rx rings. + * + * This should be called periodically at least once a second, and whenever the + * system time has been adjusted. + */ +static void ice_ptp_update_cached_systime(struct ice_pf *pf) +{ + u64 systime; + int i; + + /* Read the current system time */ + systime = ice_ptp_read_src_clk_reg(pf, NULL); + + /* Update the cached system time stored in the PF structure */ + WRITE_ONCE(pf->ptp.cached_phc_time, systime); + + ice_for_each_vsi(pf, i) { + struct ice_vsi *vsi = pf->vsi[i]; + int j; + + if (!vsi) + continue; + +#ifdef HAVE_NETDEV_SB_DEV + if (vsi->type != ICE_VSI_PF && + vsi->type != ICE_VSI_OFFLOAD_MACVLAN) + continue; +#else + if (vsi->type != ICE_VSI_PF) + continue; +#endif /* HAVE_NETDEV_SB_DEV */ + + ice_for_each_rxq(vsi, j) { + if (!vsi->rx_rings[j]) + continue; + WRITE_ONCE(vsi->rx_rings[j]->cached_systime, systime); + } + } +} + +/** + * ice_ptp_extend_32b_ts - Convert a 32b nanoseconds timestamp to 64b + * @cached_phc_time: recently cached copy of PHC time + * @in_tstamp: Ingress/egress 32b nanoseconds timestamp value + * + * Hardware captures timestamps which contain only 32 bits of nominal + * nanoseconds, as opposed to the 64bit timestamps that the stack expects. + * Note that the captured timestamp values may be 40 bits, but the lower + * 8 bits are sub-nanoseconds and generally discarded. + * + * Extend the 32bit nanosecond timestamp using the following algorithm and + * assumptions: + * + * 1) have a recently cached copy of the PHC time + * 2) assume that the in_tstamp was captured 2^31 nanoseconds (~2.1 + * seconds) before or after the PHC time was captured. + * 3) calculate the delta between the cached time and the timestamp + * 4) if the delta is smaller than 2^31 nanoseconds, then the timestamp was + * captured after the PHC time. In this case, the full timestamp is just + * the cached PHC time plus the delta. + * 5) otherwise, if the delta is larger than 2^31 nanoseconds, then the + * timestamp was captured *before* the PHC time, i.e. because the PHC + * cache was updated after the timestamp was captured by hardware. In this + * case, the full timestamp is the cached time minus the inverse delta. + * + * This algorithm works even if the PHC time was updated after a Tx timestamp + * was requested, but before the Tx timestamp event was reported from + * hardware. + * + * This calculation primarily relies on keeping the cached PHC time up to + * date. 
If the timestamp was captured more than 2^31 nanoseconds after the + * PHC time, it is possible that the lower 32bits of PHC time have + * overflowed more than once, and we might generate an incorrect timestamp. + * + * This is prevented by (a) periodically updating the cached PHC time once + * a second, and (b) discarding any Tx timestamp packet if it has waited for + * a timestamp for more than one second. + */ +static u64 ice_ptp_extend_32b_ts(u64 cached_phc_time, u32 in_tstamp) +{ + u32 delta, phc_time_lo; + u64 ns; + + /* Extract the lower 32 bits of the PHC time */ + phc_time_lo = (u32)cached_phc_time; + + /* Calculate the delta between the lower 32bits of the cached PHC + * time and the in_tstamp value + */ + delta = (in_tstamp - phc_time_lo); + + /* Do not assume that the in_tstamp is always more recent than the + * cached PHC time. If the delta is large, it indicates that the + * in_tstamp was taken in the past, and should be converted + * forward. + */ + if (delta > (U32_MAX / 2)) { + /* reverse the delta calculation here */ + delta = (phc_time_lo - in_tstamp); + ns = cached_phc_time - delta; + } else { + ns = cached_phc_time + delta; + } + + return ns; +} + +/** + * ice_ptp_extend_40b_ts - Convert a 40b timestamp to 64b nanoseconds + * @pf: Board private structure + * @in_tstamp: Ingress/egress 40b timestamp value + * + * The Tx and Rx timestamps are 40 bits wide, including 32 bits of nominal + * nanoseconds, 7 bits of sub-nanoseconds, and a valid bit. + * + * *--------------------------------------------------------------* + * | 32 bits of nanoseconds | 7 high bits of sub ns underflow | v | + * *--------------------------------------------------------------* + * + * The low bit is an indicator of whether the timestamp is valid. The next + * 7 bits are a capture of the upper 7 bits of the sub-nanosecond underflow, + * and the remaining 32 bits are the lower 32 bits of the PHC timer. + * + * It is assumed that the caller verifies the timestamp is valid prior to + * calling this function. + * + * Extract the 32bit nominal nanoseconds and extend them. Use the cached PHC + * time stored in the device private PTP structure as the basis for timestamp + * extension. + * + * See ice_ptp_extend_32b_ts for a detailed explanation of the extension + * algorithm. 
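+ *
+ * Worked example (illustrative numbers): for a 40-bit capture of
+ * 0x123456789B, the low byte 0x9B holds the valid bit and the 7-bit
+ * sub-nanosecond field, and (in_tstamp >> 8) & GENMASK_ULL(31, 0) yields
+ * the nominal nanosecond value 0x12345678, which is then extended against
+ * the cached PHC time; if that 32-bit value appears to be more than 2^31
+ * nanoseconds ahead of the cached time, it is interpreted as having been
+ * captured before the cache was updated and is extended backwards instead.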
+ */ +static u64 ice_ptp_extend_40b_ts(struct ice_pf *pf, u64 in_tstamp) +{ + const u64 mask = GENMASK_ULL(31, 0); + return ice_ptp_extend_32b_ts(pf->ptp.cached_phc_time, + (in_tstamp >> 8) & mask); +} + +/** + * ice_ptp_get_ts_idx - Find the free Tx index based on current logical port + * @vsi: lport corresponding VSI + */ +int ice_ptp_get_ts_idx(struct ice_vsi *vsi) +{ + u8 own_idx_start, own_idx_end, lport, qport; + int i; + + lport = vsi->port_info->lport; + qport = lport % ICE_PORTS_PER_QUAD; + /* Check on own idx window */ + own_idx_start = qport * INDEX_PER_PORT; + own_idx_end = own_idx_start + INDEX_PER_PORT; + + for (i = own_idx_start; i < own_idx_end; i++) { + if (!test_and_set_bit(i, vsi->ptp_tx_idx)) + return i; + } + + return -1; +} + +/** + * ice_ptp_rel_all_skb - Free all pending skb waiting for timestamp + * @pf: The PF private structure + */ +static void ice_ptp_rel_all_skb(struct ice_pf *pf) +{ + struct ice_vsi *vsi; + int idx; + + vsi = ice_get_main_vsi(pf); + if (!vsi) + return; + for (idx = 0; idx < INDEX_PER_QUAD; idx++) { + if (vsi->ptp_tx_skb[idx]) { + dev_kfree_skb_any(vsi->ptp_tx_skb[idx]); + vsi->ptp_tx_skb[idx] = NULL; + } + } +} + +static const u64 txrx_lane_par_clk[NUM_ICE_PTP_LNK_SPD] = { + 31250000, /* 1G */ + 257812500, /* 10G */ + 644531250, /* 25G */ + 161132812, /* 25G RS */ + 257812500, /* 40G */ + 644531250, /* 50G */ + 644531250, /* 50G RS */ + 644531250, /* 100G RS */ +}; + +static const u64 txrx_lane_pcs_clk[NUM_ICE_PTP_LNK_SPD] = { + 125000000, /* 1G */ + 156250000, /* 10G */ + 390625000, /* 25G */ + 97656250, /* 25G RS */ + 156250000, /* 40G */ + 390625000, /* 50G */ + 644531250, /* 50G RS */ + 644531250, /* 100G RS */ +}; + +static const u64 txrx_rsgb_par_clk[NUM_ICE_PTP_LNK_SPD] = { + 0, /* 1G */ + 0, /* 10G */ + 0, /* 25G */ + 322265625, /* 25G RS */ + 0, /* 40G */ + 0, /* 50G */ + 644531250, /* 50G RS */ + 1289062500, /* 100G RS */ +}; + +static const u64 txrx_rsgb_pcs_clk[NUM_ICE_PTP_LNK_SPD] = { + 0, 0, 0, 97656250, 0, 0, 195312500, 390625000 +}; + +static const u64 rx_desk_par_pcs_clk[NUM_ICE_PTP_LNK_SPD] = { + 0, /* 1G */ + 0, /* 10G */ + 0, /* 25G */ + 0, /* 25G RS */ + 156250000, /* 40G */ + 19531250, /* 50G */ + 644531250, /* 50G RS */ + 644531250, /* 100G RS */ +}; + +/** + * ice_ptp_port_phy_set_parpcs_incval - Set PAR/PCS PHY cycle count + * @pf: Board private struct + * @port: Port we are configuring PHY for + * + * Note that this function is only expected to be called during port up and + * during a link event. + */ +static void ice_ptp_port_phy_set_parpcs_incval(struct ice_pf *pf, int port) +{ + u64 cur_freq, clk_incval, uix, phy_tus; + enum ice_ptp_link_spd link_spd; + enum ice_ptp_fec_mode fec_mode; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u32 val; + + cur_freq = ice_e822_pll_freq(pf->ptp.time_ref_freq); + clk_incval = ice_ptp_read_src_incval(hw); + + status = ice_phy_get_speed_and_fec_e822(hw, port, &link_spd, &fec_mode); + if (status) + goto exit; + + /* UIX programming */ + /* We split a 'divide by 1e11' operation into a 'divide by 256' and a + * 'divide by 390625000' operation to be able to do the calculation + * using fixed-point math. 
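+	 *
+	 * (256 * 390625000 = 100000000000, so the two steps together are the
+	 * original divide by 1e11.)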
+ */ + if (link_spd == ICE_PTP_LNK_SPD_10G || + link_spd == ICE_PTP_LNK_SPD_40G) { +#define LINE_UI_10G_40G 640 /* 6600 UI at 10Gb line rate */ + uix = (cur_freq * LINE_UI_10G_40G) >> 8; + uix *= clk_incval; + uix /= 390625000; + + val = TS_LOW_M & uix; + status = ice_write_phy_reg_e822(hw, port, P_REG_UIX66_10G_40G_L, + val); + if (status) + goto exit; + val = (uix >> 32) & TS_LOW_M; + status = ice_write_phy_reg_e822(hw, port, P_REG_UIX66_10G_40G_U, + val); + if (status) + goto exit; + } else if (link_spd == ICE_PTP_LNK_SPD_25G || + link_spd == ICE_PTP_LNK_SPD_100G_RS) { +#define LINE_UI_25G_100G 256 /* 6600 UI at 25Gb line rate */ + uix = (cur_freq * LINE_UI_25G_100G) >> 8; + uix *= clk_incval; + uix /= 390625000; + + val = TS_LOW_M & uix; + status = ice_write_phy_reg_e822(hw, port, + P_REG_UIX66_25G_100G_L, val); + if (status) + goto exit; + val = (uix >> 32) & TS_LOW_M; + status = ice_write_phy_reg_e822(hw, port, + P_REG_UIX66_25G_100G_U, val); + if (status) + goto exit; + } + + if (link_spd == ICE_PTP_LNK_SPD_25G_RS) { + phy_tus = (cur_freq * clk_incval * 2) / + txrx_rsgb_par_clk[link_spd]; + val = phy_tus & TS_PHY_LOW_M; + ice_write_phy_reg_e822(hw, port, P_REG_DESK_PAR_RX_TUS_L, val); + ice_write_phy_reg_e822(hw, port, P_REG_DESK_PAR_TX_TUS_L, val); + val = (phy_tus >> 8) & TS_PHY_HIGH_M; + ice_write_phy_reg_e822(hw, port, P_REG_DESK_PAR_RX_TUS_U, val); + ice_write_phy_reg_e822(hw, port, P_REG_DESK_PAR_TX_TUS_U, val); + + phy_tus = (cur_freq * clk_incval) / + txrx_rsgb_pcs_clk[link_spd]; + val = phy_tus & TS_PHY_LOW_M; + ice_write_phy_reg_e822(hw, port, P_REG_DESK_PCS_RX_TUS_L, val); + ice_write_phy_reg_e822(hw, port, P_REG_DESK_PCS_TX_TUS_L, val); + val = (phy_tus >> 8) & TS_PHY_HIGH_M; + ice_write_phy_reg_e822(hw, port, P_REG_DESK_PCS_RX_TUS_U, val); + ice_write_phy_reg_e822(hw, port, P_REG_DESK_PCS_TX_TUS_U, val); + } else { + phy_tus = (cur_freq * clk_incval) / + txrx_lane_par_clk[link_spd]; + val = phy_tus & TS_PHY_LOW_M; + ice_write_phy_reg_e822(hw, port, P_REG_PAR_RX_TUS_L, val); + val = (phy_tus >> 8) & TS_PHY_HIGH_M; + ice_write_phy_reg_e822(hw, port, P_REG_PAR_RX_TUS_U, val); + + if (link_spd != ICE_PTP_LNK_SPD_50G_RS && + link_spd != ICE_PTP_LNK_SPD_100G_RS) { + val = phy_tus & TS_PHY_LOW_M; + ice_write_phy_reg_e822(hw, port, + P_REG_PAR_TX_TUS_L, val); + val = (phy_tus >> 8) & TS_PHY_HIGH_M; + ice_write_phy_reg_e822(hw, port, + P_REG_PAR_TX_TUS_U, val); + } else { + phy_tus = (cur_freq * clk_incval * 2) / + txrx_rsgb_par_clk[link_spd]; + val = phy_tus & TS_PHY_LOW_M; + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PAR_RX_TUS_L, val); + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PAR_TX_TUS_L, val); + val = (phy_tus >> 8) & TS_PHY_HIGH_M; + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PAR_RX_TUS_U, val); + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PAR_TX_TUS_U, val); + } + + phy_tus = (cur_freq * clk_incval) / + txrx_lane_pcs_clk[link_spd]; + val = phy_tus & TS_PHY_LOW_M; + ice_write_phy_reg_e822(hw, port, P_REG_PCS_RX_TUS_L, val); + val = (phy_tus >> 8) & TS_PHY_HIGH_M; + ice_write_phy_reg_e822(hw, port, P_REG_PCS_RX_TUS_U, val); + + if (link_spd != ICE_PTP_LNK_SPD_50G_RS && + link_spd != ICE_PTP_LNK_SPD_100G_RS) { + val = phy_tus & TS_PHY_LOW_M; + ice_write_phy_reg_e822(hw, port, P_REG_PCS_TX_TUS_L, + val); + val = (phy_tus >> 8) & TS_PHY_HIGH_M; + ice_write_phy_reg_e822(hw, port, P_REG_PCS_TX_TUS_U, + val); + } else { + phy_tus = (cur_freq * clk_incval) / + txrx_rsgb_pcs_clk[link_spd]; + val = phy_tus & TS_PHY_LOW_M; + ice_write_phy_reg_e822(hw, port, + 
P_REG_DESK_PCS_RX_TUS_L, val); + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PCS_TX_TUS_L, val); + val = (phy_tus >> 8) & TS_PHY_HIGH_M; + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PCS_RX_TUS_U, val); + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PCS_TX_TUS_U, val); + } + + if (link_spd == ICE_PTP_LNK_SPD_40G || + link_spd == ICE_PTP_LNK_SPD_50G) { + phy_tus = (cur_freq * clk_incval) / + rx_desk_par_pcs_clk[link_spd]; + val = phy_tus & TS_PHY_LOW_M; + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PAR_RX_TUS_L, val); + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PCS_RX_TUS_L, val); + val = (phy_tus >> 8) & TS_PHY_HIGH_M; + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PAR_RX_TUS_U, val); + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PCS_RX_TUS_U, val); + } + } + +exit: + if (status) + dev_err(ice_pf_to_dev(pf), "PTP Vernier configuration failed on port %d, status %s\n", + port, ice_stat_str(status)); +} + +/* Values of tx_offset_delay in units of 1/100th of a nanosecond */ +static const u64 tx_offset_delay[NUM_ICE_PTP_LNK_SPD] = { + 25140, /* 1G */ + 6938, /* 10G */ + 2778, /* 25G */ + 3928, /* 25G RS */ + 5666, /* 40G */ + 2778, /* 50G */ + 2095, /* 50G RS */ + 1620, /* 100G RS */ +}; + +/** + * ice_ptp_port_phy_set_tx_offset - Set PHY clock Tx timestamp offset + * @ptp_port: the PTP port we are configuring the PHY for + */ +static int ice_ptp_port_phy_set_tx_offset(struct ice_ptp_port *ptp_port) +{ + u64 cur_freq, clk_incval, offset; + enum ice_ptp_link_spd link_spd; + enum ice_status status; + struct ice_pf *pf; + struct ice_hw *hw; + int port; + u32 val; + + pf = ptp_port_to_pf(ptp_port); + port = ptp_port->port_num; + hw = &pf->hw; + + /* Get the PTP HW lock */ + if (!ice_ptp_lock(hw)) + return -EBUSY; + + clk_incval = ice_ptp_read_src_incval(hw); + ice_ptp_unlock(hw); + + cur_freq = ice_e822_pll_freq(pf->ptp.time_ref_freq); + + status = ice_phy_get_speed_and_fec_e822(hw, port, &link_spd, NULL); + if (status) + goto exit; + + offset = cur_freq * clk_incval; + offset /= 10000; + offset *= tx_offset_delay[link_spd]; + offset /= 10000000; + + if (link_spd == ICE_PTP_LNK_SPD_1G || + link_spd == ICE_PTP_LNK_SPD_10G || + link_spd == ICE_PTP_LNK_SPD_25G || + link_spd == ICE_PTP_LNK_SPD_25G_RS || + link_spd == ICE_PTP_LNK_SPD_40G || + link_spd == ICE_PTP_LNK_SPD_50G) { + status = ice_read_phy_reg_e822(hw, port, + P_REG_PAR_PCS_TX_OFFSET_L, + &val); + if (status) + goto exit; + offset += val; + status = ice_read_phy_reg_e822(hw, port, + P_REG_PAR_PCS_TX_OFFSET_U, + &val); + if (status) + goto exit; + offset += (u64)val << 32; + } + + if (link_spd == ICE_PTP_LNK_SPD_50G_RS || + link_spd == ICE_PTP_LNK_SPD_100G_RS) { + status = ice_read_phy_reg_e822(hw, port, P_REG_PAR_TX_TIME_L, + &val); + if (status) + goto exit; + offset += val; + status = ice_read_phy_reg_e822(hw, port, P_REG_PAR_TX_TIME_U, + &val); + if (status) + goto exit; + offset += (u64)val << 32; + } + + val = (u32)offset; + status = ice_write_phy_reg_e822(hw, port, P_REG_TOTAL_TX_OFFSET_L, val); + if (status) + goto exit; + val = (u32)(offset >> 32); + status = ice_write_phy_reg_e822(hw, port, P_REG_TOTAL_TX_OFFSET_U, val); + if (status) + goto exit; + + status = ice_write_phy_reg_e822(hw, port, P_REG_TX_OR, 1); + if (status) + goto exit; + + atomic_set(&ptp_port->tx_offset_ready, 1); +exit: + if (status) + dev_err(ice_pf_to_dev(pf), + "PTP tx offset configuration failed on port %d status=%s\n", + port, ice_stat_str(status)); + return ice_status_to_errno(status); +} + +/** + * ice_ptp_calc_pmd_adj - Calculate PMD 
adjustment using integers + * @cur_freq: PHY clock frequency + * @clk_incval: Source clock incval + * @calc_numerator: Value to divide + * @calc_denominator: Remainder of the division + * + * This is the integer math calculation which attempts to avoid overflowing + * a u64. The division (in this case 1/25.78125e9) is split into two parts 125 + * and the remainder, which is the stored in calc_denominator. + */ +static u64 +ice_ptp_calc_pmd_adj(u64 cur_freq, u64 clk_incval, u64 calc_numerator, + u64 calc_denominator) +{ + u64 pmd_adj = calc_numerator; + + pmd_adj *= cur_freq; + pmd_adj /= 125; + pmd_adj *= clk_incval; + pmd_adj /= calc_denominator; + return pmd_adj; +} + +/** + * ice_ptp_get_pmd_adj - Calculate total PMD adjustment + * @pf: Board private struct + * @port: Port we are configuring PHY for + * @cur_freq: PHY clock frequency + * @link_spd: PHY link speed + * @clk_incval: source clock incval + * @mode: FEC mode + * @pmd_adj: PMD adjustment to be calculated + */ +static int ice_ptp_get_pmd_adj(struct ice_pf *pf, int port, u64 cur_freq, + enum ice_ptp_link_spd link_spd, u64 clk_incval, + enum ice_ptp_fec_mode mode, u64 *pmd_adj) +{ + u64 calc_numerator, calc_denominator; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u32 val; + u8 pmd; + + status = ice_read_phy_reg_e822(hw, port, P_REG_PMD_ALIGNMENT, &val); + if (status) + return -EIO; + + pmd = (u8)val; + + /* RS mode overrides all the other pmd_alignment calculations. */ + if (link_spd == ICE_PTP_LNK_SPD_25G_RS || + link_spd == ICE_PTP_LNK_SPD_50G_RS || + link_spd == ICE_PTP_LNK_SPD_100G_RS) { + u64 pmd_cycle_adj = 0; + u8 rx_cycle; + + if (link_spd == ICE_PTP_LNK_SPD_50G || + link_spd == ICE_PTP_LNK_SPD_50G_RS) { + ice_read_phy_reg_e822(hw, port, P_REG_RX_80_TO_160_CNT, + &val); + rx_cycle = val & P_REG_RX_80_TO_160_CNT_RXCYC_M; + } else { + ice_read_phy_reg_e822(hw, port, P_REG_RX_40_TO_160_CNT, + &val); + rx_cycle = val & P_REG_RX_40_TO_160_CNT_RXCYC_M; + } + calc_numerator = pmd; + if (pmd < 17) + calc_numerator += 40; + calc_denominator = 206250000; + + *pmd_adj = ice_ptp_calc_pmd_adj(cur_freq, clk_incval, + calc_numerator, + calc_denominator); + + if (rx_cycle != 0) { + if (link_spd == ICE_PTP_LNK_SPD_25G_RS) + calc_numerator = 4 - rx_cycle; + else if (link_spd == ICE_PTP_LNK_SPD_50G_RS) + calc_numerator = rx_cycle; + else + calc_numerator = 0; + calc_numerator *= 40; + pmd_cycle_adj = ice_ptp_calc_pmd_adj(cur_freq, + clk_incval, + calc_numerator, + calc_denominator); + } + *pmd_adj += pmd_cycle_adj; + } else { + calc_numerator = 0; + calc_denominator = 1; + if (link_spd == ICE_PTP_LNK_SPD_1G) { + if (pmd == 4) + calc_numerator = 10; + else + calc_numerator = (pmd + 6) % 10; + calc_denominator = 10000000; + } else if (link_spd == ICE_PTP_LNK_SPD_10G || + link_spd == ICE_PTP_LNK_SPD_40G) { + if (pmd != 65 || mode == ICE_PTP_FEC_MODE_CLAUSE74) { + calc_numerator = pmd; + calc_denominator = 82500000; + } + } else if (link_spd == ICE_PTP_LNK_SPD_25G) { + if (pmd != 65 || mode == ICE_PTP_FEC_MODE_CLAUSE74) { + calc_numerator = pmd; + calc_denominator = 206250000; + } + } else if (link_spd == ICE_PTP_LNK_SPD_50G) { + if (pmd != 65 || mode == ICE_PTP_FEC_MODE_CLAUSE74) { + calc_numerator = pmd * 2; + calc_denominator = 206250000; + } + } + *pmd_adj = ice_ptp_calc_pmd_adj(cur_freq, clk_incval, + calc_numerator, + calc_denominator); + } + + return 0; +} + +/* Values of rx_offset_delay in units of 1/100th of a nanosecond */ +static const u64 rx_offset_delay[NUM_ICE_PTP_LNK_SPD] = { + 17372, /* 1G */ + 6212, /* 10G */ + 
2491, /* 25G */ + 29535, /* 25G RS */ + 4244, /* 40G */ + 2868, /* 50G */ + 14524, /* 50G RS */ + 7775, /* 100G RS */ +}; + +/** + * ice_ptp_port_phy_set_rx_offset - Set PHY clock Tx timestamp offset + * @ptp_port: PTP port we are configuring PHY for + */ +static int ice_ptp_port_phy_set_rx_offset(struct ice_ptp_port *ptp_port) +{ + u64 cur_freq, clk_incval, offset, pmd_adj; + enum ice_ptp_link_spd link_spd; + enum ice_ptp_fec_mode fec_mode; + enum ice_status status; + struct ice_pf *pf; + struct ice_hw *hw; + int err, port; + u32 val; + + pf = ptp_port_to_pf(ptp_port); + port = ptp_port->port_num; + hw = &pf->hw; + + /* Get the PTP HW lock */ + if (!ice_ptp_lock(hw)) { + err = -EBUSY; + goto exit; + } + + clk_incval = ice_ptp_read_src_incval(hw); + ice_ptp_unlock(hw); + + cur_freq = ice_e822_pll_freq(pf->ptp.time_ref_freq); + + status = ice_phy_get_speed_and_fec_e822(hw, port, &link_spd, &fec_mode); + if (status) { + err = ice_status_to_errno(status); + goto exit; + } + + offset = cur_freq * clk_incval; + offset /= 10000; + offset *= rx_offset_delay[link_spd]; + offset /= 10000000; + + status = ice_read_phy_reg_e822(hw, port, P_REG_PAR_PCS_RX_OFFSET_L, + &val); + if (status) { + err = ice_status_to_errno(status); + goto exit; + } + offset += val; + status = ice_read_phy_reg_e822(hw, port, P_REG_PAR_PCS_RX_OFFSET_U, + &val); + if (status) { + err = ice_status_to_errno(status); + goto exit; + } + offset += (u64)val << 32; + + if (link_spd == ICE_PTP_LNK_SPD_40G || + link_spd == ICE_PTP_LNK_SPD_50G || + link_spd == ICE_PTP_LNK_SPD_50G_RS || + link_spd == ICE_PTP_LNK_SPD_100G_RS) { + status = ice_read_phy_reg_e822(hw, port, P_REG_PAR_RX_TIME_L, + &val); + if (status) { + err = ice_status_to_errno(status); + goto exit; + } + offset += val; + status = ice_read_phy_reg_e822(hw, port, P_REG_PAR_RX_TIME_U, + &val); + if (status) { + err = ice_status_to_errno(status); + goto exit; + } + offset += (u64)val << 32; + } + + err = ice_ptp_get_pmd_adj(pf, port, cur_freq, link_spd, clk_incval, + fec_mode, &pmd_adj); + if (err) + goto exit; + + if (fec_mode == ICE_PTP_FEC_MODE_RS_FEC) + offset += pmd_adj; + else + offset -= pmd_adj; + + val = (u32)offset; + status = ice_write_phy_reg_e822(hw, port, P_REG_TOTAL_RX_OFFSET_L, val); + if (status) { + err = ice_status_to_errno(status); + goto exit; + } + val = (u32)(offset >> 32); + status = ice_write_phy_reg_e822(hw, port, P_REG_TOTAL_RX_OFFSET_U, val); + if (status) { + err = ice_status_to_errno(status); + goto exit; + } + + status = ice_write_phy_reg_e822(hw, port, P_REG_RX_OR, 1); + if (status) { + err = ice_status_to_errno(status); + goto exit; + } + + atomic_set(&ptp_port->rx_offset_ready, 1); +exit: + if (err) + dev_err(ice_pf_to_dev(pf), + "PTP rx offset configuration failed on port %d, err=%d\n", + port, err); + return err; +} + +/** + * ice_ptp_port_sync_src_timer - Sync PHY timer with source timer + * @pf: Board private structure + * @port: Port for which the PHY start is set + * + * Sync PHY timer with source timer after calculating and setting Tx/Rx + * Vernier offset. 
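+ *
+ * Summary of the sequence below: a single READ_TIME sync captures the
+ * source timer and the port timer together, the difference is then applied
+ * to the port timer with ADJ_TIME, and when the difference is too large to
+ * fit in a signed 64-bit adjustment it is applied in two halves.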
+ */ +static enum ice_status ice_ptp_port_sync_src_timer(struct ice_pf *pf, int port) +{ + u64 src_time = 0x0, tx_time, rx_time, temp_adj; + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + s64 time_adj; + u32 zo, lo; + u8 tmr_idx; + + /* Get the PTP HW lock */ + if (!ice_ptp_lock(hw)) { + dev_err(dev, "PTP failed to acquire semaphore\n"); + return ICE_ERR_NOT_READY; + } + + /* Program cmd to source timer */ + ice_ptp_src_cmd(hw, READ_TIME); + + /* Program cmd to PHY port */ + status = ice_ptp_one_port_cmd(hw, port, READ_TIME, true); + if (status) + goto unlock; + + /* Issue sync to activate commands */ + wr32(hw, GLTSYN_CMD_SYNC, SYNC_EXEC_CMD); + + tmr_idx = ice_get_ptp_src_clock_index(hw); + + /* Read source timer SHTIME_0 and SHTIME_L */ + zo = rd32(hw, GLTSYN_SHTIME_0(tmr_idx)); + lo = rd32(hw, GLTSYN_SHTIME_L(tmr_idx)); + src_time |= (u64)lo; + src_time = (src_time << 32) | (u64)zo; + + /* Read Tx and Rx capture from PHY */ + status = ice_ptp_read_port_capture(hw, port, &tx_time, &rx_time); + if (status) + goto unlock; + + if (tx_time != rx_time) + dev_info(dev, "Port %d Rx and Tx times do not match\n", port); + + /* Calculate amount to adjust port timer and account for case where + * delta is larger/smaller than S64_MAX/S64_MIN + */ + if (src_time > tx_time) { + temp_adj = src_time - tx_time; + if (temp_adj & BIT_ULL(63)) { + time_adj = temp_adj >> 1; + } else { + time_adj = temp_adj; + /* Set to zero to indicate adjustment done */ + temp_adj = 0x0; + } + } else { + temp_adj = tx_time - src_time; + if (temp_adj & BIT_ULL(63)) { + time_adj = -(temp_adj >> 1); + } else { + time_adj = -temp_adj; + /* Set to zero to indicate adjustment done */ + temp_adj = 0x0; + } + } + + status = ice_ptp_prep_port_adj_e822(hw, port, time_adj, true); + if (status) + goto unlock; + + status = ice_ptp_one_port_cmd(hw, port, ADJ_TIME, true); + if (status) + goto unlock; + + /* Issue sync to activate commands */ + wr32(hw, GLTSYN_CMD_SYNC, SYNC_EXEC_CMD); + + /* Do a second adjustment if original was too large/small to fit into + * a S64 + */ + if (temp_adj) { + status = ice_ptp_prep_port_adj_e822(hw, port, time_adj, true); + if (status) + goto unlock; + + status = ice_ptp_one_port_cmd(hw, port, ADJ_TIME, true); + if (!status) + /* Issue sync to activate commands */ + wr32(hw, GLTSYN_CMD_SYNC, SYNC_EXEC_CMD); + } + + /* This second register read is to flush out the port and source + * command registers. 
Multiple successive calls to this function + * require this + */ + + /* Program cmd to source timer */ + ice_ptp_src_cmd(hw, READ_TIME); + + /* Program cmd to PHY port */ + status = ice_ptp_one_port_cmd(hw, port, READ_TIME, true); + if (status) + goto unlock; + + /* Issue sync to activate commands */ + wr32(hw, GLTSYN_CMD_SYNC, SYNC_EXEC_CMD); + + /* Read source timer SHTIME_0 and SHTIME_L */ + zo = rd32(hw, GLTSYN_SHTIME_0(tmr_idx)); + lo = rd32(hw, GLTSYN_SHTIME_L(tmr_idx)); + src_time = (u64)lo; + src_time = (src_time << 32) | (u64)zo; + + /* Read Tx and Rx capture from PHY */ + status = ice_ptp_read_port_capture(hw, port, &tx_time, &rx_time); + + if (status) + goto unlock; + dev_info(dev, "Port %d PTP synced to source 0x%016llX, 0x%016llX\n", + port, src_time, tx_time); +unlock: + ice_ptp_unlock(hw); + + if (status) + dev_err(dev, "PTP failed to sync port %d PHY time, status %s\n", + port, ice_stat_str(status)); + + return status; +} + +/** + * ice_ptp_read_time - Read the time from the device + * @pf: Board private structure + * @ts: timespec structure to hold the current time value + * @sts: Optional parameter for holding a pair of system timestamps from + * the system clock. Will be ignored if NULL is given. + * + * This function reads the source clock registers and stores them in a timespec. + * However, since the registers are 64 bits of nanoseconds, we must convert the + * result to a timespec before we can return. + */ +static void ice_ptp_read_time(struct ice_pf *pf, struct timespec64 *ts, + struct ptp_system_timestamp *sts) +{ + u64 time_ns; + + if (pf->ptp.src_tmr_mode != ICE_SRC_TMR_MODE_NANOSECONDS) { + dev_err(ice_pf_to_dev(pf), + "PTP Locked mode is not supported!\n"); + return; + } + time_ns = ice_ptp_read_src_clk_reg(pf, sts); + + *ts = ns_to_timespec64(time_ns); +} + +/** + * ice_ptp_write_init - Set PHC time to provided value + * @pf: Board private structure + * @ts: timespec structure that holds the new time value + * + * Set the PHC time to the specified time provided in the timespec. + */ +static int ice_ptp_write_init(struct ice_pf *pf, struct timespec64 *ts) +{ + u64 ns = timespec64_to_ns(ts); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u64 val; + + if (pf->ptp.src_tmr_mode != ICE_SRC_TMR_MODE_NANOSECONDS) { + dev_err(ice_pf_to_dev(pf), + "PTP Locked mode is not supported!\n"); + return ICE_ERR_NOT_SUPPORTED; + } + val = ns; + + status = ice_ptp_init_time(hw, val); + if (status) + return ice_status_to_errno(status); + + return 0; +} + +/** + * ice_ptp_write_adj - Adjust PHC clock time atomically + * @pf: Board private structure + * @adj: Adjustment in nanoseconds + * @lock_sbq: true to lock the sbq sq_lock (the usual case); false if the + * sq_lock has already been locked at a higher level + * + * Perform an atomic adjustment of the PHC time by the specified number of + * nanoseconds. 
+ */ +static int +ice_ptp_write_adj(struct ice_pf *pf, s32 adj, bool lock_sbq) +{ + struct ice_hw *hw = &pf->hw; + enum ice_status status; + + + status = ice_ptp_adj_clock(hw, adj, lock_sbq); + if (status) + return ice_status_to_errno(status); + + return 0; +} + + +/** + * ice_ptp_get_incval - Get clock increment params + * @pf: Board private structure + * @time_ref_freq: TIME_REF frequency + * @src_tmr_mode: Source timer mode (nanoseconds or locked) + */ +int ice_ptp_get_incval(struct ice_pf *pf, enum ice_time_ref_freq *time_ref_freq, + enum ice_src_tmr_mode *src_tmr_mode) +{ + *time_ref_freq = pf->ptp.time_ref_freq; + *src_tmr_mode = pf->ptp.src_tmr_mode; + + return 0; +} + +/** + * ice_base_incval - Get base timer increment value + * @pf: Board private structure + * + * Look up the base timer increment value for this device. The base increment + * value is used to define the nominal clock tick rate. This increment value + * is programmed during device initialization. It is also used as the basis + * for calculating adjustments using scaled_ppm. + */ +static u64 ice_base_incval(struct ice_pf *pf) +{ + u64 incval; + + if (ice_is_e810(&pf->hw)) + incval = ICE_PTP_NOMINAL_INCVAL_E810; + else if (pf->ptp.time_ref_freq < NUM_ICE_TIME_REF_FREQ) + incval = ice_e822_nominal_incval(pf->ptp.time_ref_freq); + else + incval = LOCKED_INCVAL_E822; + + dev_dbg(ice_pf_to_dev(pf), "PTP: using base increment value of 0x%016llx\n", + incval); + + return incval; +} + +/** + * ice_ptp_reset_ts_memory_quad - Reset timestamp memory for one quad + * @pf: The PF private data structure + * @quad: The quad (0-4) + */ +static void ice_ptp_reset_ts_memory_quad(struct ice_pf *pf, int quad) +{ + struct ice_hw *hw = &pf->hw; + + ice_write_quad_reg_e822(hw, quad, Q_REG_TS_CTRL, Q_REG_TS_CTRL_M); + ice_write_quad_reg_e822(hw, quad, Q_REG_TS_CTRL, ~(u32)Q_REG_TS_CTRL_M); +} + +/** + * ice_ptp_check_tx_fifo - Check whether Tx FIFO is in an OK state + * @port: PTP port for which Tx FIFO is checked + */ +static int ice_ptp_check_tx_fifo(struct ice_ptp_port *port) +{ + int quad = port->port_num / ICE_PORTS_PER_QUAD; + int offs = port->port_num % ICE_PORTS_PER_QUAD; + enum ice_status status; + struct ice_pf *pf; + struct ice_hw *hw; + u32 val, phy_sts; + + pf = ptp_port_to_pf(port); + hw = &pf->hw; + + + if (port->tx_fifo_busy_cnt == FIFO_OK) + return 0; + + /* need to read FIFO state */ + if (offs == 0 || offs == 1) + status = ice_read_quad_reg_e822(hw, quad, Q_REG_FIFO01_STATUS, + &val); + else + status = ice_read_quad_reg_e822(hw, quad, Q_REG_FIFO23_STATUS, + &val); + + if (status) { + dev_err(ice_pf_to_dev(pf), "PTP failed to check port %d Tx FIFO, status %s\n", + port->port_num, ice_stat_str(status)); + return ice_status_to_errno(status); + } + + if (offs & 0x1) + phy_sts = (val & Q_REG_FIFO13_M) >> Q_REG_FIFO13_S; + else + phy_sts = (val & Q_REG_FIFO02_M) >> Q_REG_FIFO02_S; + + if (phy_sts & FIFO_EMPTY) { + port->tx_fifo_busy_cnt = FIFO_OK; + return 0; + } + + port->tx_fifo_busy_cnt++; + + dev_dbg(ice_pf_to_dev(pf), "Try %d, port %d FIFO not empty\n", + port->tx_fifo_busy_cnt, port->port_num); + + if (port->tx_fifo_busy_cnt == ICE_PTP_FIFO_NUM_CHECKS) { + dev_dbg(ice_pf_to_dev(pf), + "Port %d Tx FIFO still not empty; resetting quad %d\n", + port->port_num, quad); + ice_ptp_reset_ts_memory_quad(pf, quad); + port->tx_fifo_busy_cnt = FIFO_OK; + return 0; + } + + return -EAGAIN; +} + +/** + * ice_ptp_check_tx_offset_valid - Check if the Tx PHY offset is valid + * @port: the PTP port to check + * + * Checks whether the Tx 
offset for the PHY associated with this port is + * valid. Returns 0 if the offset is valid, and a non-zero error code if it is + * not. + */ +static int ice_ptp_check_tx_offset_valid(struct ice_ptp_port *port) +{ + struct ice_pf *pf = ptp_port_to_pf(port); + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u32 val; + int err; + + /* Check if the offset is already valid */ + if (atomic_read(&port->tx_offset_ready)) + return 0; + + /* Take the bit lock to prevent cross thread interaction */ + if (atomic_cmpxchg(&port->tx_offset_lock, false, true)) + return -EBUSY; + + err = ice_ptp_check_tx_fifo(port); + if (err) + goto out_unlock; + + status = ice_read_phy_reg_e822(hw, port->port_num, P_REG_TX_OV_STATUS, + &val); + if (status) { + dev_err(dev, "Failed to read TX_OV_STATUS for port %d, status %s\n", + port->port_num, ice_stat_str(status)); + err = -EAGAIN; + goto out_unlock; + } + + if (!(val & P_REG_TX_OV_STATUS_OV_M)) { + err = -EAGAIN; + goto out_unlock; + } + + err = ice_ptp_port_phy_set_tx_offset(port); + if (err) { + dev_err(dev, "Failed to set PHY Rx offset for port %d, err %d\n", + port->port_num, err); + goto out_unlock; + } + + dev_info(dev, "Port %d Tx calibration complete\n", port->port_num); + + +out_unlock: + atomic_set(&port->tx_offset_lock, false); + + return err; +} + +/** + * ice_ptp_check_rx_offset_valid - Check if the Rx PHY offset is valid + * @port: the PTP port to check + * + * Checks whether the Rx offset for the PHY associated with this port is + * valid. Returns 0 if the offset is valid, and a non-zero error code if it is + * not. + */ +static int ice_ptp_check_rx_offset_valid(struct ice_ptp_port *port) +{ + struct ice_pf *pf = ptp_port_to_pf(port); + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u32 val; + int err; + + /* Check if the offset is already valid */ + if (atomic_read(&port->rx_offset_ready)) + return 0; + + /* Take the bit lock to prevent cross thread interaction */ + if (atomic_cmpxchg(&port->rx_offset_lock, false, true)) + return -EBUSY; + + status = ice_read_phy_reg_e822(hw, port->port_num, P_REG_RX_OV_STATUS, + &val); + if (status) { + dev_err(dev, "Failed to read RX_OV_STATUS for port %d, status %s\n", + port->port_num, ice_stat_str(status)); + err = ice_status_to_errno(status); + goto out_unlock; + } + + if (!(val & P_REG_RX_OV_STATUS_OV_M)) { + err = -EAGAIN; + goto out_unlock; + } + + err = ice_ptp_port_phy_set_rx_offset(port); + if (err) { + dev_err(dev, "Failed to set PHY Rx offset for port %d, err %d\n", + port->port_num, err); + goto out_unlock; + } + + dev_info(dev, "Port %d Rx calibration complete\n", port->port_num); + +out_unlock: + atomic_set(&port->rx_offset_lock, false); + + return err; +} + +/** + * ice_ptp_check_offset_valid - Check port offset valid bit + * @port: Port for which offset valid bit is checked + * + * Returns 0 if both Tx and Rx offset are valid, and -EAGAIN if one of the + * offset is not ready. 
+ */ +static int ice_ptp_check_offset_valid(struct ice_ptp_port *port) +{ + int tx_err, rx_err; + + /* always check both Tx and Rx offset validity */ + tx_err = ice_ptp_check_tx_offset_valid(port); + rx_err = ice_ptp_check_rx_offset_valid(port); + + if (tx_err || rx_err) + return -EAGAIN; + + return 0; +} + +/** + * ice_ptp_wait_for_offset_valid - Poll offset valid reg until set or timeout + * @work: Pointer to struct work_struct + */ +static void ice_ptp_wait_for_offset_valid(struct work_struct *work) +{ + struct ice_ptp_port *port; + struct ice_pf *pf; + int i; + + port = container_of(work, struct ice_ptp_port, ov_task); + pf = ptp_port_to_pf(port); + +#define OV_POLL_PERIOD_MS 10 +#define OV_POLL_ATTEMPTS 20 + for (i = 0; i < OV_POLL_ATTEMPTS; i++) { + if (atomic_read(&pf->ptp.phy_reset_lock)) + return; + + if (!ice_ptp_check_offset_valid(port)) + return; + + msleep(OV_POLL_PERIOD_MS); + } +} + +/** + * ice_ptp_port_phy_start - Set or clear PHY start for port timestamping + * @ptp_port: PTP port for which the PHY start is set + * @phy_start: Value to be set + */ +static int +ice_ptp_port_phy_start(struct ice_ptp_port *ptp_port, bool phy_start) +{ + struct ice_pf *pf = ptp_port_to_pf(ptp_port); + u8 port = ptp_port->port_num; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u32 val; + + mutex_lock(&ptp_port->ps_lock); + + atomic_set(&ptp_port->tx_offset_ready, 0); + atomic_set(&ptp_port->rx_offset_ready, 0); + ptp_port->tx_fifo_busy_cnt = 0; + + status = ice_write_phy_reg_e822(hw, port, P_REG_TX_OR, 0); + if (status) + goto out_unlock; + + status = ice_write_phy_reg_e822(hw, port, P_REG_RX_OR, 0); + if (status) + goto out_unlock; + + status = ice_read_phy_reg_e822(hw, port, P_REG_PS, &val); + if (status) + goto out_unlock; + + val &= ~P_REG_PS_START_M; + status = ice_write_phy_reg_e822(hw, port, P_REG_PS, val); + if (status) + goto out_unlock; + + val &= ~P_REG_PS_ENA_CLK_M; + status = ice_write_phy_reg_e822(hw, port, P_REG_PS, val); + if (status) + goto out_unlock; + + + if (phy_start && ptp_port->link_up) { + ice_phy_cfg_lane_e822(hw, port); + ice_ptp_port_phy_set_parpcs_incval(pf, port); + + status = ice_ptp_write_incval_locked(hw, ice_base_incval(pf)); + if (status) + goto out_unlock; + + + status = ice_read_phy_reg_e822(hw, port, P_REG_PS, &val); + if (status) + goto out_unlock; + + val |= P_REG_PS_SFT_RESET_M; + status = ice_write_phy_reg_e822(hw, port, P_REG_PS, val); + if (status) + goto out_unlock; + + val |= P_REG_PS_START_M; + status = ice_write_phy_reg_e822(hw, port, P_REG_PS, val); + if (status) + goto out_unlock; + + val &= ~P_REG_PS_SFT_RESET_M; + status = ice_write_phy_reg_e822(hw, port, P_REG_PS, val); + if (status) + goto out_unlock; + + status = ice_ptp_write_incval_locked(hw, ice_base_incval(pf)); + if (status) + goto out_unlock; + + val |= P_REG_PS_ENA_CLK_M; + status = ice_write_phy_reg_e822(hw, port, P_REG_PS, val); + if (status) + goto out_unlock; + + val |= P_REG_PS_LOAD_OFFSET_M; + status = ice_write_phy_reg_e822(hw, port, P_REG_PS, val); + if (status) + goto out_unlock; + + wr32(&pf->hw, GLTSYN_CMD_SYNC, SYNC_EXEC_CMD); + status = ice_ptp_port_sync_src_timer(pf, port); + if (status) + goto out_unlock; + + queue_work(pf->ptp.ov_wq, &ptp_port->ov_task); + } + +out_unlock: + if (status) + dev_err(ice_pf_to_dev(pf), "PTP failed to set PHY port %d %s, status=%s\n", + port, phy_start ? 
"up" : "down", ice_stat_str(status)); + + mutex_unlock(&ptp_port->ps_lock); + + return ice_status_to_errno(status); +} + +/** + * ice_ptp_link_change - Set or clear port registers for timestamping + * @pf: Board private structure + * @port: Port for which the PHY start is set + * @linkup: Link is up or down + */ +int ice_ptp_link_change(struct ice_pf *pf, u8 port, bool linkup) +{ + /* If PTP is not supported on this function, nothing to do */ + if (!test_bit(ICE_FLAG_PTP_ENA, pf->flags)) + return 0; + + if (linkup && !test_bit(ICE_FLAG_PTP, pf->flags)) { + dev_err(ice_pf_to_dev(pf), "PTP not ready, failed to prepare port %d\n", + port); + return -EAGAIN; + } + + if (port >= ICE_NUM_EXTERNAL_PORTS) + return -EINVAL; + + pf->ptp.port.link_up = linkup; + + return ice_ptp_port_phy_start(&pf->ptp.port, linkup); +} + + +/** + * ice_ptp_reset_ts_memory - Reset timestamp memory for all quads + * @pf: The PF private data structure + */ +static void ice_ptp_reset_ts_memory(struct ice_pf *pf) +{ + int quad; + + quad = pf->hw.port_info->lport / ICE_PORTS_PER_QUAD; + ice_ptp_reset_ts_memory_quad(pf, quad); +} + +/** + * ice_ptp_tx_ena_intr - Enable or disable the Tx timestamp interrupt + * @pf: PF private structure + * @ena: bool value to enable or disable interrupt + * @threshold: Minimum number of packets at which intr is triggered + * + * Utility function to enable or disable Tx timestamp interrupt and threshold + */ +static int ice_ptp_tx_ena_intr(struct ice_pf *pf, bool ena, u32 threshold) +{ + enum ice_status status = 0; + struct ice_hw *hw = &pf->hw; + int quad; + u32 val; + + ice_ptp_reset_ts_memory(pf); + + for (quad = 0; quad < ICE_MAX_QUAD; quad++) { + status = ice_read_quad_reg_e822(hw, quad, Q_REG_TX_MEM_GBL_CFG, + &val); + if (status) + break; + + if (ena) { + val |= Q_REG_TX_MEM_GBL_CFG_INTR_ENA_M; + val &= ~Q_REG_TX_MEM_GBL_CFG_INTR_THR_M; + val |= ((threshold << Q_REG_TX_MEM_GBL_CFG_INTR_THR_S) & + Q_REG_TX_MEM_GBL_CFG_INTR_THR_M); + } else { + val &= ~Q_REG_TX_MEM_GBL_CFG_INTR_ENA_M; + } + + status = ice_write_quad_reg_e822(hw, quad, Q_REG_TX_MEM_GBL_CFG, + val); + if (status) + break; + } + + if (status) + dev_err(ice_pf_to_dev(pf), "PTP failed in intr ena, status %s\n", + ice_stat_str(status)); + return ice_status_to_errno(status); +} + +/** + * ice_ptp_reset_phy_timestamping - Reset PHY timestamp registers values + * @pf: Board private structure + */ +static void ice_ptp_reset_phy_timestamping(struct ice_pf *pf) +{ + int i; + +#define PHY_RESET_TRIES 5 +#define PHY_RESET_SLEEP_MS 5 + + for (i = 0; i < PHY_RESET_TRIES; i++) { + if (atomic_cmpxchg(&pf->ptp.phy_reset_lock, false, true)) + goto reset; + + msleep(PHY_RESET_SLEEP_MS); + } + return; + +reset: + flush_workqueue(pf->ptp.ov_wq); + ice_ptp_port_phy_start(&pf->ptp.port, false); + if (pf->ptp.port.link_up) + ice_ptp_port_phy_start(&pf->ptp.port, true); + + ice_ptp_reset_ts_memory(pf); + atomic_set(&pf->ptp.phy_reset_lock, false); +} + +/** + * ice_ptp_update_incval - Update clock increment rate + * @pf: Board private structure + * @time_ref_freq: TIME_REF frequency to use + * @src_tmr_mode: Src timer mode (nanoseconds or locked) + */ +int +ice_ptp_update_incval(struct ice_pf *pf, enum ice_time_ref_freq time_ref_freq, + enum ice_src_tmr_mode src_tmr_mode) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + struct timespec64 ts; + s64 incval; + int err; + + if (!test_bit(ICE_FLAG_PTP, pf->flags)) { + dev_err(dev, "PTP not ready, failed to update incval\n"); + return -EINVAL; + } 
+ + if ((time_ref_freq >= NUM_ICE_TIME_REF_FREQ || + src_tmr_mode >= NUM_ICE_SRC_TMR_MODE)) + return -EINVAL; + + if (src_tmr_mode == ICE_SRC_TMR_MODE_NANOSECONDS) + incval = ice_e822_nominal_incval(time_ref_freq); + else + incval = LOCKED_INCVAL_E822; + + if (!ice_ptp_lock(hw)) + return -EBUSY; + + status = ice_ptp_write_incval(hw, incval); + if (status) { + dev_err(dev, "PTP failed to update incval, status %s\n", + ice_stat_str(status)); + err = ice_status_to_errno(status); + goto err_unlock; + } + + pf->ptp.time_ref_freq = time_ref_freq; + pf->ptp.src_tmr_mode = src_tmr_mode; + + ts = ktime_to_timespec64(ktime_get_real()); + err = ice_ptp_write_init(pf, &ts); + if (err) { + dev_err(dev, "PTP failed to program time registers, err %d\n", + err); + goto err_unlock; + } + + /* unlock PTP semaphore first before resetting PHY timestamping */ + ice_ptp_unlock(hw); + ice_ptp_reset_phy_timestamping(pf); + + return 0; + +err_unlock: + ice_ptp_unlock(hw); + + return err; +} + +#ifdef HAVE_PTP_CLOCK_INFO_ADJFINE +/** + * ice_ptp_adjfine - Adjust clock increment rate + * @info: the driver's PTP info structure + * @scaled_ppm: Parts per million with 16-bit fractional field + * + * Adjust the frequency of the clock by the indicated scaled ppm from the + * base frequency. + */ +static int ice_ptp_adjfine(struct ptp_clock_info *info, long scaled_ppm) +{ + struct ice_pf *pf = ptp_info_to_pf(info); + u64 freq, divisor = 1000000ULL; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + s64 incval, diff; + int neg_adj = 0; + + if (pf->ptp.src_tmr_mode == ICE_SRC_TMR_MODE_LOCKED) { + dev_err(ice_pf_to_dev(pf), + "adjfreq not supported in locked mode\n"); + return -EPERM; + } + + incval = ice_base_incval(pf); + + if (scaled_ppm < 0) { + neg_adj = 1; + scaled_ppm = -scaled_ppm; + } + + while ((u64)scaled_ppm > div_u64(U64_MAX, incval)) { + /* handle overflow by scaling down the scaled_ppm and + * the divisor, losing some precision + */ + scaled_ppm >>= 2; + divisor >>= 2; + } + + freq = (incval * (u64)scaled_ppm) >> 16; + diff = div_u64(freq, divisor); + + if (neg_adj) + incval -= diff; + else + incval += diff; + + status = ice_ptp_write_incval_locked(hw, incval); + if (status) { + dev_err(ice_pf_to_dev(pf), "PTP failed to set incval, status %s\n", + ice_stat_str(status)); + return -EIO; + } + + return 0; +} + +#else +/** + * ice_ptp_adjfreq - Adjust the frequency of the clock + * @info: the driver's PTP info structure + * @ppb: Parts per billion adjustment from the base + * + * Adjust the frequency of the clock by the indicated parts per billion from the + * base frequency. 
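+ *
+ * The new increment value is computed as
+ *   incval = base_incval + (base_incval * ppb) / 1000000000
+ * so a positive ppb speeds the clock up and a negative ppb slows it down.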
+ */ +static int ice_ptp_adjfreq(struct ptp_clock_info *info, s32 ppb) +{ + struct ice_pf *pf = ptp_info_to_pf(info); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + s64 incval, freq, diff; + + if (pf->ptp.src_tmr_mode == ICE_SRC_TMR_MODE_LOCKED) { + dev_err(ice_pf_to_dev(pf), + "adjfreq not supported in locked mode\n"); + return -EPERM; + } + + incval = ice_base_incval(pf); + + freq = incval * ppb; + diff = div_s64(freq, 1000000000ULL); + incval += diff; + + status = ice_ptp_write_incval_locked(hw, incval); + if (status) { + dev_err(ice_pf_to_dev(pf), "PTP failed to set incval, status %s\n", + ice_stat_str(status)); + return -EIO; + } + + return 0; +} + +#endif +/** + * ice_ptp_extts_work - Workqueue task function + * @pf: Board private structure + * + * Service for PTP external clock event + */ +static void ice_ptp_extts_work(struct ice_pf *pf) +{ + struct ptp_clock_event event; + struct ice_hw *hw = &pf->hw; + u8 chan, tmr_idx; + u32 hi, lo; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + /* Event time is captured by one of the two matched registers + * GLTSYN_EVNT_L: 32 LSB of sampled time event + * GLTSYN_EVNT_H: 32 MSB of sampled time event + * Event is defined in GLTSYN_EVNT_0 register + */ + for (chan = 0; chan < GLTSYN_EVNT_H_IDX_MAX; chan++) { + /* Check if channel is enabled */ + if (pf->ptp.ext_ts_irq & (1 << chan)) { + lo = rd32(hw, GLTSYN_EVNT_L(chan, tmr_idx)); + hi = rd32(hw, GLTSYN_EVNT_H(chan, tmr_idx)); + event.timestamp = (((u64)hi) << 32) | lo; + event.type = PTP_CLOCK_EXTTS; + event.index = chan; + + /* Fire event */ + ptp_clock_event(pf->ptp.clock, &event); + pf->ptp.ext_ts_irq &= ~(1 << chan); + } + } +} + +/** + * ice_ptp_cfg_extts - Configure EXTTS pin and channel + * @pf: Board private structure + * @ena: true to enable; false to disable + * @chan: GPIO channel (0-3) + * @gpio_pin: GPIO pin + * @extts_flags: request flags from the ptp_extts_request.flags + */ +static int +ice_ptp_cfg_extts(struct ice_pf *pf, bool ena, unsigned int chan, u32 gpio_pin, + unsigned int extts_flags) +{ + u32 func, aux_reg, gpio_reg, irq_reg; + struct ice_hw *hw = &pf->hw; + u8 tmr_idx; + + if (pf->ptp.src_tmr_mode == ICE_SRC_TMR_MODE_LOCKED) { + dev_err(ice_pf_to_dev(pf), "Locked mode EXTTS not supported\n"); + return -EOPNOTSUPP; + } + + if (chan > (unsigned int)pf->ptp.info.n_ext_ts) + return -EINVAL; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + + irq_reg = rd32(hw, PFINT_OICR_ENA); + + if (ena) { + /* Enable the interrupt */ + irq_reg |= PFINT_OICR_TSYN_EVNT_M; + aux_reg = GLTSYN_AUX_IN_0_INT_ENA_M; + +#define GLTSYN_AUX_IN_0_EVNTLVL_RISING_EDGE BIT(0) +#define GLTSYN_AUX_IN_0_EVNTLVL_FALLING_EDGE BIT(1) + + /* set event level to requested edge */ + if (extts_flags & PTP_FALLING_EDGE) + aux_reg |= GLTSYN_AUX_IN_0_EVNTLVL_FALLING_EDGE; + if (extts_flags & PTP_RISING_EDGE) + aux_reg |= GLTSYN_AUX_IN_0_EVNTLVL_RISING_EDGE; + + /* Write GPIO CTL reg. 
+ * 0x1 is input sampled by EVENT register(channel) + * + num_in_channels * tmr_idx + */ + func = 1 + chan + (tmr_idx * 3); + gpio_reg = ((func << GLGEN_GPIO_CTL_PIN_FUNC_S) & + GLGEN_GPIO_CTL_PIN_FUNC_M); + pf->ptp.ext_ts_chan |= (1 << chan); + } else { + /* clear the values we set to reset defaults */ + aux_reg = 0; + gpio_reg = 0; + pf->ptp.ext_ts_chan &= ~(1 << chan); + if (!pf->ptp.ext_ts_chan) + irq_reg &= ~PFINT_OICR_TSYN_EVNT_M; + } + + wr32(hw, PFINT_OICR_ENA, irq_reg); + wr32(hw, GLTSYN_AUX_IN(chan, tmr_idx), aux_reg); + wr32(hw, GLGEN_GPIO_CTL(gpio_pin), gpio_reg); + + return 0; +} + +/** + * ice_ptp_cfg_clkout - Configure clock to generate periodic wave + * @pf: Board private structure + * @chan: GPIO channel (0-3) + * @config: desired periodic clk configuration. NULL will disable channel + * @store: If set to true the values will be stored + * + * Configure the internal clock generator modules to generate the clock wave of + * specified period. + */ +int ice_ptp_cfg_clkout(struct ice_pf *pf, unsigned int chan, + struct ice_perout_channel *config, bool store) +{ + struct ice_hw *hw = &pf->hw; + u64 current_time, period, start_time; + u32 func, val, gpio_pin; + u8 tmr_idx; + + if (pf->ptp.src_tmr_mode == ICE_SRC_TMR_MODE_LOCKED) { + dev_err(ice_pf_to_dev(pf), + "locked mode PPS/PEROUT not supported\n"); + return -EIO; + } + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + + /* 0. Reset mode & out_en in AUX_OUT */ + wr32(hw, GLTSYN_AUX_OUT(chan, tmr_idx), 0); + + /* If we're disabling the output, clear out CLKO and TGT and keep + * output level low + */ + if (!config || !config->ena) { + wr32(hw, GLTSYN_CLKO(chan, tmr_idx), 0); + wr32(hw, GLTSYN_TGT_L(chan, tmr_idx), 0); + wr32(hw, GLTSYN_TGT_H(chan, tmr_idx), 0); + + val = GLGEN_GPIO_CTL_PIN_DIR_M; + gpio_pin = pf->ptp.perout_channels[chan].gpio_pin; + wr32(hw, GLGEN_GPIO_CTL(gpio_pin), val); + + /* Store the value if requested */ + if (store) + memset(&pf->ptp.perout_channels[chan], 0, + sizeof(struct ice_perout_channel)); + + return 0; + } + period = config->period; + start_time = config->start_time; + gpio_pin = config->gpio_pin; + + /* 1. Write clkout with half of required period value */ + if (period & 0x1) { + dev_err(ice_pf_to_dev(pf), "CLK Period must be an even value\n"); + goto err; + } + + period >>= 1; + + /* For proper operation, the GLTSYN_CLKO must be larger than clock tick + */ +#define MIN_PULSE 3 + if (period <= MIN_PULSE || period > U32_MAX) { + dev_err(ice_pf_to_dev(pf), "CLK Period must be > %d && < 2^33", + MIN_PULSE * 2); + goto err; + } + + wr32(hw, GLTSYN_CLKO(chan, tmr_idx), lower_32_bits(period)); + + /* Allow time for programming before start_time is hit */ + current_time = ice_ptp_read_src_clk_reg(pf, NULL); + + /* if start time is in the past start the timer at the nearest second + * maintaining phase + */ + if (start_time < current_time) + start_time = roundup(current_time + NSEC_PER_MSEC, + NSEC_PER_SEC) + start_time % NSEC_PER_SEC; + + if (ice_is_e810(hw)) + start_time -= E810_OUT_PROP_DELAY_NS; + else + start_time -= ice_e822_pps_delay(pf->ptp.time_ref_freq); + + /* 2. Write TARGET time */ + wr32(hw, GLTSYN_TGT_L(chan, tmr_idx), lower_32_bits(start_time)); + wr32(hw, GLTSYN_TGT_H(chan, tmr_idx), upper_32_bits(start_time)); + + /* 3. Write AUX_OUT register */ + val = GLTSYN_AUX_OUT_0_OUT_ENA_M | GLTSYN_AUX_OUT_0_OUTMOD_M; + wr32(hw, GLTSYN_AUX_OUT(chan, tmr_idx), val); + + /* 4. 
write GPIO CTL reg */ + func = 8 + chan + (tmr_idx * 4); + val = GLGEN_GPIO_CTL_PIN_DIR_M | + ((func << GLGEN_GPIO_CTL_PIN_FUNC_S) & GLGEN_GPIO_CTL_PIN_FUNC_M); + wr32(hw, GLGEN_GPIO_CTL(gpio_pin), val); + + /* Store the value if requested */ + if (store) { + memcpy(&pf->ptp.perout_channels[chan], config, + sizeof(struct ice_perout_channel)); + pf->ptp.perout_channels[chan].start_time %= NSEC_PER_SEC; + } + + return 0; +err: + dev_err(ice_pf_to_dev(pf), "PTP failed to cfg per_clk\n"); + return -EFAULT; +} + +/** + * ice_ptp_gettimex64 - Get the time of the clock + * @info: the driver's PTP info structure + * @ts: timespec64 structure to hold the current time value + * @sts: Optional parameter for holding a pair of system timestamps from + * the system clock. Will be ignored if NULL is given. + * + * Read the device clock and return the correct value on ns, after converting it + * into a timespec struct. + */ +static int +ice_ptp_gettimex64(struct ptp_clock_info *info, struct timespec64 *ts, + struct ptp_system_timestamp *sts) +{ + struct ice_pf *pf = ptp_info_to_pf(info); + struct ice_hw *hw = &pf->hw; + + if (!ice_ptp_lock(hw)) { + dev_err(ice_pf_to_dev(pf), "PTP failed to get time\n"); + return -EBUSY; + } + + ice_ptp_read_time(pf, ts, sts); + ice_ptp_unlock(hw); + + return 0; +} + +#ifndef HAVE_PTP_CLOCK_INFO_GETTIMEX64 +/** + * ice_ptp_gettime64 - Get the time of the clock + * @info: the driver's PTP info structure + * @ts: timespec64 structure to hold the current time value + * + * Read the device clock and return the correct value on ns, after converting it + * into a timespec struct. + */ +static int ice_ptp_gettime64(struct ptp_clock_info *info, struct timespec64 *ts) +{ + return ice_ptp_gettimex64(info, ts, NULL); +} + +#ifndef HAVE_PTP_CLOCK_INFO_GETTIME64 +/** + * ice_ptp_gettime32 - Get the time of the clock + * @info: the driver's PTP info structure + * @ts: timespec structure to hold the current time value + * + * Read the device clock and return the correct value on ns, after converting it + * into a timespec struct. + */ +static int ice_ptp_gettime32(struct ptp_clock_info *info, struct timespec *ts) +{ + struct timespec64 ts64; + + if (ice_ptp_gettime64(info, &ts64)) + return -EFAULT; + + *ts = timespec64_to_timespec(ts64); + return 0; +} + +#endif /* !HAVE_PTP_CLOCK_INFO_GETTIME64 */ +#endif /* !HAVE_PTP_CLOCK_INFO_GETTIMEX64 */ +/** + * ice_ptp_settime64 - Set the time of the clock + * @info: the driver's PTP info structure + * @ts: timespec64 structure that holds the new time value + * + * Set the device clock to the user input value. The conversion from timespec + * to ns happens in the write function. 
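+ *
+ * If the link is up, the PHY timestamp block is stopped before the write
+ * and restarted afterwards so that Vernier calibration is redone against
+ * the new time; any enabled periodic outputs are likewise disabled around
+ * the update and then re-enabled.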
+ */ +static int +ice_ptp_settime64(struct ptp_clock_info *info, const struct timespec64 *ts) +{ + struct ice_pf *pf = ptp_info_to_pf(info); + struct timespec64 ts64 = *ts; + struct ice_hw *hw = &pf->hw; + u8 i; + int err; + + /* For Vernier mode, we need to recalibrate after new settime + * Start with disabling timestamp block + */ + if (pf->ptp.port.link_up) + ice_ptp_port_phy_start(&pf->ptp.port, false); + + if (!ice_ptp_lock(hw)) { + err = -EBUSY; + goto exit; + } + + /* Disable periodic outputs */ + for (i = 0; i < info->n_per_out; i++) + if (pf->ptp.perout_channels[i].ena) + ice_ptp_cfg_clkout(pf, i, NULL, false); + + err = ice_ptp_write_init(pf, &ts64); + ice_ptp_unlock(hw); + + if (!err) + ice_ptp_update_cached_systime(pf); + + /* Reenable periodic outputs */ + for (i = 0; i < info->n_per_out; i++) + if (pf->ptp.perout_channels[i].ena) + ice_ptp_cfg_clkout(pf, i, &pf->ptp.perout_channels[i], + false); + + /* Recalibrate and re-enable timestamp block */ + if (pf->ptp.port.link_up) + ice_ptp_port_phy_start(&pf->ptp.port, true); +exit: + if (err) { + dev_err(ice_pf_to_dev(pf), "PTP failed to set time %d\n", err); + return err; + } + + return 0; +} + +#ifndef HAVE_PTP_CLOCK_INFO_GETTIME64 +/** + * ice_ptp_settime32 - Set the time of the clock + * @info: the driver's PTP info structure + * @ts: timespec structure that holds the new time value + * + * Set the device clock to the user input value. The conversion from timespec + * to ns happens in the write function. + */ +static int +ice_ptp_settime32(struct ptp_clock_info *info, const struct timespec *ts) +{ + struct timespec64 ts64 = timespec_to_timespec64(*ts); + + return ice_ptp_settime64(info, &ts64); +} +#endif /* !HAVE_PTP_CLOCK_INFO_GETTIME64 */ + +/** + * ice_ptp_adjtime_nonatomic - Do a non-atomic clock adjustment + * @info: the driver's PTP info structure + * @delta: Offset in nanoseconds to adjust the time by + */ +static int ice_ptp_adjtime_nonatomic(struct ptp_clock_info *info, s64 delta) +{ + struct timespec64 now, then; + + then = ns_to_timespec64(delta); + ice_ptp_gettimex64(info, &now, NULL); + now = timespec64_add(now, then); + + return ice_ptp_settime64(info, (const struct timespec64 *)&now); +} + + +/** + * ice_ptp_adjtime - Adjust the time of the clock by the indicated delta + * @info: the driver's PTP info structure + * @delta: Offset in nanoseconds to adjust the time by + */ +static int ice_ptp_adjtime(struct ptp_clock_info *info, s64 delta) +{ + struct ice_pf *pf = ptp_info_to_pf(info); + struct ice_hw *hw = &pf->hw; + struct device *dev; + int err; + u8 i; + + dev = ice_pf_to_dev(pf); + + if (pf->ptp.src_tmr_mode == ICE_SRC_TMR_MODE_LOCKED) { + dev_err(dev, "Locked Mode adjtime not supported\n"); + return -EIO; + } + + /* Hardware only supports atomic adjustments using signed 32-bit + * integers. For any adjustment outside this range, perform + * a non-atomic get->adjust->set flow. 
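+ * That limits the atomic path to offsets of roughly +/- 2.1 seconds;
+ * larger deltas fall back to a gettimex64/settime64 sequence.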
+ */ + if (delta > S32_MAX || delta < S32_MIN) { + dev_dbg(dev, "delta = %lld, adjtime non-atomic\n", delta); + return ice_ptp_adjtime_nonatomic(info, delta); + } + + if (!ice_ptp_lock(hw)) { + dev_err(dev, "PTP failed to acquire semaphore in adjtime\n"); + return -EBUSY; + } + + /* Disable periodic outputs */ + for (i = 0; i < info->n_per_out; i++) + if (pf->ptp.perout_channels[i].ena) + ice_ptp_cfg_clkout(pf, i, NULL, false); + + err = ice_ptp_write_adj(pf, delta, true); + + /* Reenable periodic outputs */ + for (i = 0; i < info->n_per_out; i++) + if (pf->ptp.perout_channels[i].ena) + ice_ptp_cfg_clkout(pf, i, &pf->ptp.perout_channels[i], + false); + + ice_ptp_unlock(hw); + + /* Check error after restarting periodic outputs and releasing the PTP + * hardware lock. + */ + if (err) { + dev_err(dev, "PTP failed to adjust time, err %d\n", err); + return err; + } + + ice_ptp_update_cached_systime(pf); + + return 0; +} + +/** + * ice_ptp_gpio_enable_e822 - Enable/disable ancillary features of PHC + * @info: the driver's PTP info structure + * @rq: The requested feature to change + * @on: Enable/disable flag + */ +static int +ice_ptp_gpio_enable_e822(struct ptp_clock_info *info, + struct ptp_clock_request *rq, int on) +{ + struct ice_pf *pf = ptp_info_to_pf(info); + struct ice_perout_channel clk_cfg = {0}; + int err; + + switch (rq->type) { + case PTP_CLK_REQ_PEROUT: + clk_cfg.gpio_pin = PPS_PIN_INDEX; + clk_cfg.period = ((rq->perout.period.sec * NSEC_PER_SEC) + + rq->perout.period.nsec); + clk_cfg.start_time = ((rq->perout.start.sec * NSEC_PER_SEC) + + rq->perout.start.nsec); + clk_cfg.ena = !!on; + + err = ice_ptp_cfg_clkout(pf, rq->perout.index, &clk_cfg, true); + break; + case PTP_CLK_REQ_EXTTS: + err = ice_ptp_cfg_extts(pf, !!on, rq->extts.index, + TIME_SYNC_PIN_INDEX, rq->extts.flags); + break; + default: + return -EOPNOTSUPP; + } + + return err; +} + +/** + * ice_ptp_gpio_enable_e810 - Enable/disable ancillary features of PHC + * @info: the driver's PTP info structure + * @rq: The requested feature to change + * @on: Enable/disable flag + */ +static int +ice_ptp_gpio_enable_e810(struct ptp_clock_info *info, + struct ptp_clock_request *rq, int on) +{ + struct ice_pf *pf = ptp_info_to_pf(info); + struct ice_perout_channel clk_cfg = {0}; + unsigned int chan; + u32 gpio_pin; + int err; + + switch (rq->type) { + case PTP_CLK_REQ_PEROUT: + chan = rq->perout.index; + if (ice_is_e810t(&pf->hw)) { + if (chan == ice_e810t_pin_desc[SMA1].chan) + clk_cfg.gpio_pin = GPIO_20; + else if (chan == ice_e810t_pin_desc[SMA2].chan) + clk_cfg.gpio_pin = GPIO_22; + else + return -1; + } else if (chan == PPS_CLK_GEN_CHAN) { + clk_cfg.gpio_pin = PPS_PIN_INDEX; + } else { + clk_cfg.gpio_pin = chan; + } + + clk_cfg.period = ((rq->perout.period.sec * NSEC_PER_SEC) + + rq->perout.period.nsec); + clk_cfg.start_time = ((rq->perout.start.sec * NSEC_PER_SEC) + + rq->perout.start.nsec); + clk_cfg.ena = !!on; + + err = ice_ptp_cfg_clkout(pf, chan, &clk_cfg, true); + break; + case PTP_CLK_REQ_EXTTS: + chan = rq->extts.index; + if (ice_is_e810t(&pf->hw)) { + if (chan < 2) + gpio_pin = GPIO_21; + else + gpio_pin = GPIO_23; + } else { + gpio_pin = chan; + } + + err = ice_ptp_cfg_extts(pf, !!on, chan, gpio_pin, + rq->extts.flags); + break; + default: + return -EOPNOTSUPP; + } + + return err; +} + +#ifdef HAVE_PTP_CROSSTIMESTAMP +/** + * ice_ptp_get_syncdevicetime - Get the cross time stamp info + * @device: Current device time + * @system: System counter value read synchronously with device time + * @ctx: Context provided by 
timekeeping code + * + * Read device and system (ART) clock simultaneously and return the corrected + * clock values in ns. + */ +static int +ice_ptp_get_syncdevicetime(ktime_t *device, + struct system_counterval_t *system, + void *ctx) +{ + struct ice_pf *pf = (struct ice_pf *)ctx; + struct ice_hw *hw = &pf->hw; + u32 hh_lock, hh_art_ctl; + int i; + + /* Get the HW lock */ + hh_lock = rd32(hw, PFHH_SEM + (PFTSYN_SEM_BYTES * hw->pf_id)); + if (hh_lock & PFHH_SEM_BUSY_M) { + dev_err(ice_pf_to_dev(pf), "PTP failed to get hh lock\n"); + return -EFAULT; + } + + /* Start the ART and device clock sync sequence */ + hh_art_ctl = rd32(hw, GLHH_ART_CTL); + hh_art_ctl = hh_art_ctl | GLHH_ART_CTL_ACTIVE_M; + wr32(hw, GLHH_ART_CTL, hh_art_ctl); + +#define MAX_HH_LOCK_TRIES 100 + + for (i = 0; i < MAX_HH_LOCK_TRIES; i++) { + /* Wait for sync to complete */ + hh_art_ctl = rd32(hw, GLHH_ART_CTL); + if (hh_art_ctl & GLHH_ART_CTL_ACTIVE_M) { + udelay(1); + continue; + } else { + u32 hh_ts_lo, hh_ts_hi, tmr_idx; + u64 hh_ts; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_assoc; + /* Read ART time */ + hh_ts_lo = rd32(hw, GLHH_ART_TIME_L); + hh_ts_hi = rd32(hw, GLHH_ART_TIME_H); + hh_ts = ((u64)hh_ts_hi << 32) | hh_ts_lo; + *system = convert_art_ns_to_tsc(hh_ts); + /* Read Device source clock time */ + hh_ts_lo = rd32(hw, GLTSYN_HHTIME_L(tmr_idx)); + hh_ts_hi = rd32(hw, GLTSYN_HHTIME_H(tmr_idx)); + hh_ts = ((u64)hh_ts_hi << 32) | hh_ts_lo; + *device = ns_to_ktime(hh_ts); + break; + } + } + /* Release HW lock */ + hh_lock = rd32(hw, PFHH_SEM + (PFTSYN_SEM_BYTES * hw->pf_id)); + hh_lock = hh_lock & ~PFHH_SEM_BUSY_M; + wr32(hw, PFHH_SEM + (PFTSYN_SEM_BYTES * hw->pf_id), hh_lock); + + if (i == MAX_HH_LOCK_TRIES) + return -ETIMEDOUT; + + return 0; +} + +/** + * ice_ptp_getcrosststamp_e822 - Capture a device cross timestamp + * @info: the driver's PTP info structure + * @cts: The memory to fill the cross timestamp info + * + * Capture a cross timestamp between the ART and the device PTP hardware + * clock. Fill the cross timestamp information and report it back to the + * caller. + * + * This is only valid for E822 devices which have support for generating the + * cross timestamp via PCIe PTM. + * + * In order to correctly correlate the ART timestamp back to the TSC time, the + * CPU must have X86_FEATURE_TSC_KNOWN_FREQ. + */ +static int +ice_ptp_getcrosststamp_e822(struct ptp_clock_info *info, + struct system_device_crosststamp *cts) +{ + struct ice_pf *pf = ptp_info_to_pf(info); + return get_device_system_crosststamp(ice_ptp_get_syncdevicetime, + pf, NULL, cts); +} +#endif /* HAVE_PTP_CROSSTIMESTAMP */ + +/** + * ice_ptp_set_timestamp_mode - Setup driver for requested timestamp mode + * @pf: Board private structure + * @config: hwtstamp settings requested or saved + */ +static int +ice_ptp_set_timestamp_mode(struct ice_pf *pf, struct hwtstamp_config *config) +{ + /* Reserved for future extensions. 
*/ + if (config->flags) + return -EINVAL; + + switch (config->tx_type) { + case HWTSTAMP_TX_OFF: + ice_set_tx_tstamp(pf, false); + break; + case HWTSTAMP_TX_ON: + ice_set_tx_tstamp(pf, true); + break; + default: + return -ERANGE; + } + + switch (config->rx_filter) { + case HWTSTAMP_FILTER_NONE: + ice_set_rx_tstamp(pf, false); + break; + case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: +#ifdef HAVE_HWTSTAMP_FILTER_NTP_ALL + case HWTSTAMP_FILTER_NTP_ALL: +#endif /* HAVE_HWTSTAMP_FILTER_NTP_ALL */ + case HWTSTAMP_FILTER_ALL: + ice_set_rx_tstamp(pf, true); + break; + default: + return -ERANGE; + } + + return 0; +} + +/** + * ice_ptp_get_ts_config - ioctl interface to read the timestamping config + * @pf: Board private structure + * @ifr: ioctl data + * + * Copy the timestamping config to user buffer + */ +int ice_ptp_get_ts_config(struct ice_pf *pf, struct ifreq *ifr) +{ + struct hwtstamp_config *config; + + if (!test_bit(ICE_FLAG_PTP, pf->flags)) + return -EIO; + + config = &pf->ptp.tstamp_config; + + return copy_to_user(ifr->ifr_data, config, sizeof(*config)) ? + -EFAULT : 0; +} + +/** + * ice_ptp_set_ts_config - ioctl interface to control the timestamping + * @pf: Board private structure + * @ifr: ioctl data + * + * Get the user config and store it + */ +int ice_ptp_set_ts_config(struct ice_pf *pf, struct ifreq *ifr) +{ + struct hwtstamp_config config; + int err; + + if (!test_bit(ICE_FLAG_PTP, pf->flags)) + return -EAGAIN; + + if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) + return -EFAULT; + + err = ice_ptp_set_timestamp_mode(pf, &config); + if (err) + return err; + + /* Save these settings for future reference */ + pf->ptp.tstamp_config = config; + + return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ? + -EFAULT : 0; +} + +/** + * ice_ptp_get_tx_hwtstamp_ver - Returns the Tx timestamp and valid bits + * @pf: Board specific private structure + * @tx_idx_req: Bitmap of timestamp indices to read + * @quad: Quad to read + * @ts: Timestamps read from PHY + * @ts_read: On return, if non-NULL: bitmap of read timestamp indices + * + * Read the value of the Tx timestamp from the registers and build a + * bitmap of successfully read indices and count of the number successfully + * read. + * + * There are 3 possible return values, + * 0 = success + * + * -EIO = unable to read a register, this could be to a variety of issues but + * should be very rare. Up to caller how to respond to this (retry, abandon, + * etc). But once this situation occurs, stop reading as we cannot + * guarantee what state the PHY or Timestamp Unit is in. + * + * -EINVAL = (at least) one of the timestamps that was read did not have the + * TS_VALID bit set, and is probably zero. Be aware that not all of the + * timestamps that were read (so the TS_READY bit for this timestamp was + * cleared but no valid TS was retrieved) are present. Expect at least one + * ts_read index that should be 1 is zero. 
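+ *
+ * Entries whose TS_VALID bit is not set are left as zero in @ts while
+ * their index is still marked in @ts_read.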
+ */ +static int ice_ptp_get_tx_hwtstamp_ver(struct ice_pf *pf, u64 tx_idx_req, + u8 quad, u64 *ts, u64 *ts_read) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + unsigned long i; + u64 ts_ns; + + + for_each_set_bit(i, (unsigned long *)&tx_idx_req, INDEX_PER_QUAD) { + ts[i] = 0x0; + + status = ice_read_phy_tstamp(hw, quad, i, &ts_ns); + if (status) { + dev_dbg(dev, "PTP Tx read failed, status %s\n", + ice_stat_str(status)); + return ice_status_to_errno(status); + } + + if (ts_read) + *ts_read |= BIT(i); + + if (!(ts_ns & ICE_PTP_TS_VALID)) { + dev_dbg(dev, "PTP tx invalid\n"); + continue; + } + + ts_ns = ice_ptp_extend_40b_ts(pf, ts_ns); + /* Each timestamp will be offset in the array of + * timestamps by the index's value. So the timestamp + * from index n will be in ts[n] position. + */ + ts[i] = ts_ns; + } + + return 0; +} + + +/** + * ice_ptp_get_tx_hwtstamp_ready - Get the Tx timestamp ready bitmap + * @pf: The PF private data structure + * @quad: Quad to read (0-4) + * @ts_ready: Bitmap where each bit set indicates that the corresponding + * timestamp register is ready to read + * + * Read the PHY timestamp ready registers for a particular bank. + */ +static void +ice_ptp_get_tx_hwtstamp_ready(struct ice_pf *pf, u8 quad, u64 *ts_ready) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u64 bitmap; + u32 val; + + status = ice_read_quad_reg_e822(hw, quad, Q_REG_TX_MEMORY_STATUS_U, + &val); + if (status) { + dev_dbg(dev, "TX_MEMORY_STATUS_U read failed for quad %u\n", + quad); + return; + } + + bitmap = val; + + status = ice_read_quad_reg_e822(hw, quad, Q_REG_TX_MEMORY_STATUS_L, + &val); + if (status) { + dev_dbg(dev, "TX_MEMORY_STATUS_L read failed for quad %u\n", + quad); + return; + } + + bitmap = (bitmap << 32) | val; + + *ts_ready = bitmap; + +} + +/** + * ice_ptp_tx_hwtstamp_vsi - Return the Tx timestamp for a specified VSI + * @vsi: lport corresponding VSI + * @idx: Index of timestamp read from QUAD memory + * @hwtstamp: Timestamps read from PHY + * + * Helper function for ice_ptp_tx_hwtstamp. + */ +static void +ice_ptp_tx_hwtstamp_vsi(struct ice_vsi *vsi, int idx, u64 hwtstamp) +{ + struct skb_shared_hwtstamps shhwtstamps = {}; + struct sk_buff *skb; + + skb = vsi->ptp_tx_skb[idx]; + if (!skb) + return; + + shhwtstamps.hwtstamp = ns_to_ktime(hwtstamp); + + vsi->ptp_tx_skb[idx] = NULL; + + /* Notify the stack and free the skb after we've unlocked */ + skb_tstamp_tx(skb, &shhwtstamps); + dev_kfree_skb_any(skb); + clear_bit(idx, vsi->ptp_tx_idx); +} + +/** + * ice_ptp_tx_hwtstamp - Return the Tx timestamps + * @pf: Board private structure + * + * Read the tx_memory_status registers for the PHY timestamp block. Determine + * which entries contain a valid ready timestamp. Read out the timestamp from + * the table. Convert the 40b timestamp value into the 64b nanosecond value + * consumed by the stack, and then report it as part of the related skb's + * shhwtstamps structure. + * + * Note that new timestamps might come in while we're reading the timestamp + * block. However, no interrupts will be triggered until the intr_threshold is + * crossed again. Thus we read the status registers in a loop until no more + * timestamps are ready. 
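+ *
+ * Only the INDEX_PER_PORT entries that belong to the PF's own port within
+ * the quad are considered on each pass.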
+ */ +static void ice_ptp_tx_hwtstamp(struct ice_pf *pf) +{ + u8 quad, lport, qport; + struct ice_vsi *vsi; + int msk_shft; + u64 rdy_msk; + + vsi = ice_get_main_vsi(pf); + if (!vsi) + return; + + lport = vsi->port_info->lport; + qport = lport % ICE_PORTS_PER_QUAD; + quad = lport / ICE_PORTS_PER_QUAD; + msk_shft = qport * INDEX_PER_PORT; + rdy_msk = GENMASK_ULL(msk_shft + INDEX_PER_PORT - 1, msk_shft); + + while (true) { + u64 ready_map = 0, valid_map = 0; + u64 hwtstamps[INDEX_PER_QUAD]; + int i, ret; + + ice_ptp_get_tx_hwtstamp_ready(pf, quad, &ready_map); + ready_map &= rdy_msk; + if (!ready_map) + break; + + ret = ice_ptp_get_tx_hwtstamp_ver(pf, ready_map, quad, + hwtstamps, &valid_map); + if (ret == -EIO) + break; + + for_each_set_bit(i, (unsigned long *)&valid_map, INDEX_PER_QUAD) + if (test_bit(i, vsi->ptp_tx_idx)) + ice_ptp_tx_hwtstamp_vsi(vsi, i, hwtstamps[i]); + } +} + +/** + * ice_ptp_tx_hwtstamp_ext - Return the Tx timestamp + * @pf: Board private structure + * + * Read the value of the Tx timestamp from the registers, convert it into + * a value consumable by the stack, and store that result into the shhwtstamps + * struct before returning it up the stack. + */ +static void ice_ptp_tx_hwtstamp_ext(struct ice_pf *pf) +{ + struct ice_hw *hw = &pf->hw; + struct ice_vsi *vsi; + u8 lport; + int idx; + + vsi = ice_get_main_vsi(pf); + if (!vsi || !vsi->ptp_tx) + return; + lport = hw->port_info->lport; + + /* Don't attempt to timestamp if we don't have an skb */ + for (idx = 0; idx < INDEX_PER_QUAD; idx++) { + struct skb_shared_hwtstamps shhwtstamps = {}; + enum ice_status status; + struct sk_buff *skb; + u64 ts_ns; + + skb = vsi->ptp_tx_skb[idx]; + if (!skb) + continue; + + status = ice_read_phy_tstamp(hw, lport, idx, &ts_ns); + if (status) { + dev_err(ice_pf_to_dev(pf), "PTP tx rd failed, status %s\n", + ice_stat_str(status)); + vsi->ptp_tx_skb[idx] = NULL; + dev_kfree_skb_any(skb); + clear_bit(idx, vsi->ptp_tx_idx); + } + + ts_ns = ice_ptp_extend_40b_ts(pf, ts_ns); + + shhwtstamps.hwtstamp = ns_to_ktime(ts_ns); + + vsi->ptp_tx_skb[idx] = NULL; + + /* Notify the stack and free the skb after + * we've unlocked + */ + skb_tstamp_tx(skb, &shhwtstamps); + dev_kfree_skb_any(skb); + clear_bit(idx, vsi->ptp_tx_idx); + } +} + +/** + * ice_ptp_rx_hwtstamp - Check for an Rx timestamp + * @rx_ring: Ring to get the VSI info + * @rx_desc: Receive descriptor + * @skb: Particular skb to send timestamp with + * + * The driver receives a notification in the receive descriptor with timestamp. + * The timestamp is in ns, so we must convert the result first. + */ +void ice_ptp_rx_hwtstamp(struct ice_ring *rx_ring, + union ice_32b_rx_flex_desc *rx_desc, + struct sk_buff *skb) +{ + u32 ts_high; + u64 ts_ns; + + /* Populate timesync data into skb */ + if (rx_desc->wb.time_stamp_low & ICE_PTP_TS_VALID) { + struct skb_shared_hwtstamps *hwtstamps; + + /* Use ice_ptp_extend_32b_ts directly, using the ring-specific + * cached PHC value, rather than accessing the PF. This also + * allows us to simply pass the upper 32bits of nanoseconds + * directly. Calling ice_ptp_extend_40b_ts is unnecessary as + * it would just discard these bits itself. 
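+ * The cached value is refreshed periodically from the service task, so
+ * it stays close enough to the packet time for the 32 bit extension to
+ * resolve the rollover correctly.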
+ */ + ts_high = le32_to_cpu(rx_desc->wb.flex_ts.ts_high); + ts_ns = ice_ptp_extend_32b_ts(rx_ring->cached_systime, ts_high); + + hwtstamps = skb_hwtstamps(skb); + memset(hwtstamps, 0, sizeof(*hwtstamps)); + hwtstamps->hwtstamp = ns_to_ktime(ts_ns); + } +} + +/** + * ice_ptp_setup_pins_e810t - Setup PTP pins in sysfs + * @pf: pointer to the PF instance + * @info: PTP clock capabilities + */ +static void +ice_ptp_setup_pins_e810t(struct ice_pf *pf, struct ptp_clock_info *info) +{ + info->n_per_out = E810T_N_PER_OUT; + + if (!ice_is_feature_supported(pf, ICE_F_PTP_EXTTS)) + return; + + info->n_ext_ts = E810_N_EXT_TS; + info->n_pins = NUM_E810T_PTP_PINS; + info->verify = ice_e810t_verify_pin; +} + +/** + * ice_ptp_setup_pins_e810 - Setup PTP pins in sysfs + * @pf: pointer to the PF instance + * @info: PTP clock capabilities + */ +static void +ice_ptp_setup_pins_e810(struct ice_pf *pf, struct ptp_clock_info *info) +{ + info->n_per_out = E810_N_PER_OUT; + + if (!ice_is_feature_supported(pf, ICE_F_PTP_EXTTS)) + return; + + info->n_ext_ts = E810_N_EXT_TS; +} + +/** + * ice_ptp_setup_pins_e822 - Setup PTP pins in sysfs + * @pf: pointer to the PF instance + * @info: PTP clock capabilities + */ +static void +ice_ptp_setup_pins_e822(struct ice_pf *pf, struct ptp_clock_info *info) +{ + info->pps = 1; + info->n_per_out = 1; + if (!ice_is_feature_supported(pf, ICE_F_PTP_EXTTS)) + return; + info->n_ext_ts = 1; +} + +/** + * ice_ptp_set_funcs_e822 - Set specialized functions for E822 support + * @pf: Board private structure + * @info: PTP info to fill + * + * Assign functions to the PTP capabiltiies structure for E822 devices. + * Functions which operate across all device families should be set directly + * in ice_ptp_set_caps. Only add functions here which are distinct for E822 + * devices. + */ +static void +ice_ptp_set_funcs_e822(struct ice_pf *pf, struct ptp_clock_info *info) +{ +#ifdef HAVE_PTP_CROSSTIMESTAMP + if (boot_cpu_has(X86_FEATURE_ART) && + boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) + info->getcrosststamp = ice_ptp_getcrosststamp_e822; +#endif /* HAVE_PTP_CROSSTIMESTAMP */ + info->enable = ice_ptp_gpio_enable_e822; + + ice_ptp_setup_pins_e822(pf, info); +} + +/** + * ice_ptp_set_funcs_e810 - Set specialized functions for E810 support + * @pf: Board private structure + * @info: PTP info to fill + * + * Assign functions to the PTP capabiltiies structure for E810 devices. + * Functions which operate across all device families should be set directly + * in ice_ptp_set_caps. Only add functions here which are distinct for e810 + * devices. 
+ */ +static void +ice_ptp_set_funcs_e810(struct ice_pf *pf, struct ptp_clock_info *info) +{ + info->enable = ice_ptp_gpio_enable_e810; + + if (ice_is_e810t(&pf->hw)) + ice_ptp_setup_pins_e810t(pf, info); + else + ice_ptp_setup_pins_e810(pf, info); +} + +/** + * ice_ptp_set_caps - Set PTP capabilities + * @pf: Board private structure + */ +static void ice_ptp_set_caps(struct ice_pf *pf) +{ + struct ptp_clock_info *info = &pf->ptp.info; + struct device *dev = ice_pf_to_dev(pf); + + snprintf(info->name, sizeof(info->name) - 1, "%s-%s-clk", + dev_driver_string(dev), dev_name(dev)); + info->owner = THIS_MODULE; + info->max_adj = 999999999; + info->adjtime = ice_ptp_adjtime; +#ifdef HAVE_PTP_CLOCK_INFO_ADJFINE + info->adjfine = ice_ptp_adjfine; +#else + info->adjfreq = ice_ptp_adjfreq; +#endif +#if defined(HAVE_PTP_CLOCK_INFO_GETTIMEX64) + info->gettimex64 = ice_ptp_gettimex64; +#elif defined(HAVE_PTP_CLOCK_INFO_GETTIME64) + info->gettime64 = ice_ptp_gettime64; +#else + info->gettime = ice_ptp_gettime32; +#endif +#ifdef HAVE_PTP_CLOCK_INFO_GETTIME64 + info->settime64 = ice_ptp_settime64; +#else + info->settime = ice_ptp_settime32; +#endif /* HAVE_PTP_CLOCK_INFO_GETTIME64 */ + + if (ice_is_e810(&pf->hw)) + ice_ptp_set_funcs_e810(pf, info); + else + ice_ptp_set_funcs_e822(pf, info); +} + +/** + * ice_ptp_create_clock - Create PTP clock device for userspace + * @pf: Board private structure + * + * This function creates a new PTP clock device. It only creates one if we + * don't already have one. Will return error if it can't create one, but success + * if we already have a device. Should be used by ice_ptp_init to create clock + * initially, and prevent global resets from creating new clock devices. + */ +static long ice_ptp_create_clock(struct ice_pf *pf) +{ + struct ptp_clock_info *info; + struct ptp_clock *clock; + struct device *dev; + + /* No need to create a clock device if we already have one */ + if (pf->ptp.clock) + return 0; + + ice_ptp_set_caps(pf); + + info = &pf->ptp.info; + dev = ice_pf_to_dev(pf); + + /* Allocate memory for kernel pins interface */ + if (info->n_pins) { + info->pin_config = devm_kcalloc(dev, info->n_pins, + sizeof(*info->pin_config), + GFP_KERNEL); + if (!info->pin_config) { + info->n_pins = 0; + return ICE_ERR_NO_MEMORY; + } + } + + if (ice_is_e810t(&pf->hw)) { + /* Enable SMA controller */ + int err = ice_enable_e810t_sma_ctrl(&pf->hw, true); + + if (err) + return err; + + /* Read current SMA status */ + err = ice_get_e810t_sma_config(&pf->hw, info->pin_config); + if (err) + return err; + } + + /* Attempt to register the clock before enabling the hardware. */ + clock = ptp_clock_register(info, dev); + if (IS_ERR(clock)) + return PTR_ERR(clock); + + pf->ptp.clock = clock; + + return 0; +} + +/** + * ice_ptp_init_owner - Initialize PTP_1588_CLOCK device + * @pf: Board private structure + * + * Setup and initialize a PTP clock device that represents the device hardware + * clock. Save the clock index for other functions connected to the same + * hardware resource. 
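+ *
+ * This enables the source timer, programs the increment value and the
+ * initial time under the global PTP hardware semaphore, sets up quad
+ * Tx timestamp interrupts on non-E810 parts, and finally registers the
+ * PTP clock device and publishes its clock index.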
+ */ +static int ice_ptp_init_owner(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + struct timespec64 ts; + int err, itr = 1; + u8 src_idx; + u32 regval; + + if (ice_is_e810(hw)) + wr32(hw, GLTSYN_SYNC_DLAY, 0); + + /* Clear some HW residue and enable source clock */ + src_idx = hw->func_caps.ts_func_info.tmr_index_owned; + + /* Enable source clocks */ + wr32(hw, GLTSYN_ENA(src_idx), GLTSYN_ENA_TSYN_ENA_M); + + if (ice_is_e810(hw)) { + /* Enable PHY time sync */ + status = ice_ptp_init_phy_e810(hw); + if (status) { + err = ice_status_to_errno(status); + goto err_exit; + } + } + + /* Clear event status indications for auxiliary pins */ + (void)rd32(hw, GLTSYN_STAT(src_idx)); + +#define PF_SB_REM_DEV_CTL_PHY0 BIT(2) + if (!ice_is_e810(hw)) { + regval = rd32(hw, PF_SB_REM_DEV_CTL); + regval |= PF_SB_REM_DEV_CTL_PHY0; + wr32(hw, PF_SB_REM_DEV_CTL, regval); + } + + /* Acquire the global hardware lock */ + if (!ice_ptp_lock(hw)) { + err = -EBUSY; + goto err_exit; + } + + /* Write the increment time value to PHY and LAN */ + status = ice_ptp_write_incval(hw, ice_base_incval(pf)); + if (status) { + err = ice_status_to_errno(status); + ice_ptp_unlock(hw); + goto err_exit; + } + + ts = ktime_to_timespec64(ktime_get_real()); + /* Write the initial Time value to PHY and LAN */ + err = ice_ptp_write_init(pf, &ts); + if (err) { + ice_ptp_unlock(hw); + goto err_exit; + } + + /* Release the global hardware lock */ + ice_ptp_unlock(hw); + + if (!ice_is_e810(hw)) { + /* Set window length for all the ports */ + status = ice_ptp_set_vernier_wl(hw); + if (status) { + err = ice_status_to_errno(status); + goto err_exit; + } + + /* Enable quad interrupts */ + err = ice_ptp_tx_ena_intr(pf, true, itr); + if (err) + goto err_exit; + + /* Reset timestamping memory in QUADs */ + ice_ptp_reset_ts_memory(pf); + } + + /* Ensure we have a clock device */ + err = ice_ptp_create_clock(pf); + if (err) + goto err_clk; + + /* Store the PTP clock index for other PFs */ + ice_set_ptp_clock_index(pf); + + return 0; + +err_clk: + pf->ptp.clock = NULL; +err_exit: + dev_err(dev, "PTP failed to register clock, err %d\n", err); + + return err; +} + +/** + * ice_ptp_init - Initialize PTP hardware clock support + * @pf: Board private structure + * + * Setup the device for interacting with the PTP hardware clock for all + * functions, both the function that owns the clock hardware, and the + * functions connected to the clock hardware. + * + * The clock owner will allocate and register a ptp_clock with the + * PTP_1588_CLOCK infrastructure. All functions allocate a kthread and work + * items used for asynchronous work such as Tx timestamps and periodic work. + */ +void ice_ptp_init(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + int err; + + + /* If this function owns the clock hardware, it must allocate and + * configure the PTP clock device to represent it. 
+ */ + if (hw->func_caps.ts_func_info.src_tmr_owned) { + err = ice_ptp_init_owner(pf); + if (err) + return; + } + + /* Disable timestamping for both Tx and Rx */ + ice_ptp_cfg_timestamp(pf, false); + + /* Initialize work structures */ + mutex_init(&pf->ptp.port.ps_lock); + pf->ptp.port.link_up = false; + pf->ptp.port.port_num = pf->hw.pf_id; + INIT_WORK(&pf->ptp.port.ov_task, ice_ptp_wait_for_offset_valid); + + /* Allocate workqueue for 2nd part of Vernier calibration */ + pf->ptp.ov_wq = alloc_workqueue("%s_ov", WQ_MEM_RECLAIM, 0, + KBUILD_MODNAME); + if (!pf->ptp.ov_wq) { + err = -ENOMEM; + goto err_wq; + } + + set_bit(ICE_FLAG_PTP, pf->flags); + dev_info(dev, "PTP init successful\n"); + + if (hw->func_caps.ts_func_info.src_tmr_owned && !ice_is_e810(hw)) + ice_cgu_init_state(pf); + return; + +err_wq: + /* If we registered a PTP clock, release it */ + if (pf->ptp.clock) { + ptp_clock_unregister(pf->ptp.clock); + pf->ptp.clock = NULL; + } + dev_err(dev, "PTP failed %d\n", err); +} + +/** + * ice_ptp_release - Disable the driver/HW support and unregister the clock + * @pf: Board private structure + * + * This function handles the cleanup work required from the initialization by + * clearing out the important information and unregistering the clock + */ +void ice_ptp_release(struct ice_pf *pf) +{ + struct ice_vsi *vsi; + char *dev_name; + u8 quad, i; + + if (!pf) + return; + + vsi = ice_get_main_vsi(pf); + if (!vsi || !test_bit(ICE_FLAG_PTP, pf->flags)) + return; + + dev_name = vsi->netdev->name; + + /* Disable timestamping for both Tx and Rx */ + ice_ptp_cfg_timestamp(pf, false); + /* Clear PHY bank residues if any */ + quad = vsi->port_info->lport / ICE_PORTS_PER_QUAD; + + if (!ice_is_e810(&pf->hw) && !pf->hw.reset_ongoing) { + u64 tx_idx = ~((u64)0); + u64 ts[INDEX_PER_QUAD]; + + ice_ptp_get_tx_hwtstamp_ver(pf, tx_idx, quad, ts, NULL); + } else { + ice_ptp_tx_hwtstamp_ext(pf); + } + + /* Release any pending skb */ + ice_ptp_rel_all_skb(pf); + + clear_bit(ICE_FLAG_PTP, pf->flags); + + pf->ptp.port.link_up = false; + if (pf->ptp.ov_wq) { + destroy_workqueue(pf->ptp.ov_wq); + pf->ptp.ov_wq = NULL; + } + + if (!pf->ptp.clock) + return; + + /* Disable periodic outputs */ + for (i = 0; i < pf->ptp.info.n_per_out; i++) + if (pf->ptp.perout_channels[i].ena) + ice_ptp_cfg_clkout(pf, i, NULL, false); + + ice_clear_ptp_clock_index(pf); + ptp_clock_unregister(pf->ptp.clock); + pf->ptp.clock = NULL; + + /* Free pin config */ + if (pf->ptp.info.pin_config) { + devm_kfree(ice_pf_to_dev(pf), pf->ptp.info.pin_config); + pf->ptp.info.pin_config = NULL; + } + + dev_info(ice_pf_to_dev(pf), "removed Clock from %s\n", dev_name); +} + +/** + * ice_ptp_set_timestamp_offsets - Calculate timestamp offsets on each port + * @pf: Board private structure + * + * This function calculates timestamp Tx/Rx offset on each port after at least + * one packet was sent/received by the PHY. 
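+ *
+ * The check is skipped while a PHY reset is in progress.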
+ */ +void ice_ptp_set_timestamp_offsets(struct ice_pf *pf) +{ + if (!test_bit(ICE_FLAG_PTP, pf->flags)) + return; + + if (atomic_read(&pf->ptp.phy_reset_lock)) + return; + + ice_ptp_check_offset_valid(&pf->ptp.port); +} + +/** + * ice_clean_ptp_subtask - Handle the service task events + * @pf: Board private structure + */ +void ice_clean_ptp_subtask(struct ice_pf *pf) +{ + if (!test_bit(ICE_FLAG_PTP, pf->flags)) + return; + + ice_ptp_update_cached_systime(pf); + if (test_and_clear_bit(ICE_PTP_EXT_TS_READY, pf->state)) + ice_ptp_extts_work(pf); + if (test_and_clear_bit(ICE_PTP_TX_TS_READY, pf->state)) { + struct ice_hw *hw = &pf->hw; + + if (ice_is_e810(hw)) + ice_ptp_tx_hwtstamp_ext(pf); + else + ice_ptp_tx_hwtstamp(pf); + } +} diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.h b/drivers/net/ethernet/intel/ice/ice_ptp.h new file mode 100644 index 000000000000..c4ac0fa76523 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_ptp.h @@ -0,0 +1,235 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_PTP_H_ +#define _ICE_PTP_H_ + +#include +#include +#include +#include +#include + +#include "ice_ptp_hw.h" + +enum ice_ptp_pin { + GPIO_20 = 0, + GPIO_21, + GPIO_22, + GPIO_23, + NUM_ICE_PTP_PIN +}; + + +#define ICE_E810T_SMA1_CTRL_MASK (ICE_E810T_P1_SMA1_DIR_EN | \ + ICE_E810T_P1_SMA1_TX_EN) +#define ICE_E810T_SMA2_CTRL_MASK (ICE_E810T_P1_SMA2_UFL2_RX_DIS | \ + ICE_E810T_P1_SMA2_DIR_EN | \ + ICE_E810T_P1_SMA2_TX_EN) +#define ICE_E810T_SMA_CTRL_MASK (ICE_E810T_SMA1_CTRL_MASK | \ + ICE_E810T_SMA2_CTRL_MASK) + +enum ice_e810t_ptp_pins { + GNSS = 0, + SMA1, + UFL1, + SMA2, + UFL2, + NUM_E810T_PTP_PINS +}; + +#define ICE_SUBDEV_ID_E810_T 0x000E + +static inline bool ice_is_e810t(struct ice_hw *hw) +{ + return (hw->device_id == ICE_DEV_ID_E810C_SFP && + hw->subsystem_device_id == ICE_SUBDEV_ID_E810_T); +} + +struct ice_perout_channel { + bool ena; + u32 gpio_pin; + u64 period; + u64 start_time; +}; + + +/** + * struct ice_ptp_port - data used to initialize an external port for PTP + * + * This structure contains data indicating whether a single external port is + * ready for PTP functionality. It is used to track the port initialization + * and determine when the port's PHY offset is valid. 
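+ *
+ * The offset-valid state is polled from the ov_task work item, which is
+ * queued after the port is started with the link up.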
+ * + * @ov_task: work task for tracking when PHY offset is valid + * @tx_offset_ready: indicates the Tx offset for the port is ready + * @rx_offset_ready: indicates the Rx offset for the port is ready + * @tx_offset_lock: lock used to protect the tx_offset_ready field + * @rx_offset_lock: lock used to protect the rx_offset_ready field + * @ps_lock: mutex used to protect the overall PTP PHY start procedure + * @link_up: indicates whether the link is up + * @tx_fifo_busy_cnt: number of times the Tx FIFO was busy + * @port_num: the port number this structure represents + */ +struct ice_ptp_port { + struct work_struct ov_task; + atomic_t tx_offset_ready; + atomic_t rx_offset_ready; + atomic_t tx_offset_lock; + atomic_t rx_offset_lock; + struct mutex ps_lock; /* protects overall PTP PHY start procedure */ + bool link_up; + u8 tx_fifo_busy_cnt; + u8 port_num; +}; + +#define GLTSYN_TGT_H_IDX_MAX 4 + +/** + * struct ice_ptp - data used for integrating with CONFIG_PTP_1588_CLOCK + * @port: data for the PHY port initialization procedure + * @cached_phc_time: a cached copy of the PHC time for timestamp extension + * @ext_ts_chan: the external timestamp channel in use + * @ext_ts_irq: the external timestamp IRQ in use + * @phy_reset_lock: bit lock for preventing PHY start while resetting + * @ov_wq: work queue for the offset validity task + * @perout_channels: periodic output data + * @info: structure defining PTP hardware capabilities + * @clock: pointer to registered PTP clock device + * @tstamp_config: hardware timestamping configuration + * @time_ref_freq: current device timer frequency (for E822 devices) + * @src_tmr_mode: current device timer mode (locked or nanoseconds) + */ +struct ice_ptp { + struct ice_ptp_port port; + u64 cached_phc_time; + u8 ext_ts_chan; + u8 ext_ts_irq; + atomic_t phy_reset_lock; + struct workqueue_struct *ov_wq; + struct ice_perout_channel perout_channels[GLTSYN_TGT_H_IDX_MAX]; + struct ptp_clock_info info; + struct ptp_clock *clock; + struct hwtstamp_config tstamp_config; + enum ice_time_ref_freq time_ref_freq; + enum ice_src_tmr_mode src_tmr_mode; +}; + +#define __ptp_port_to_ptp(p) \ + container_of((p), struct ice_ptp, port) +#define ptp_port_to_pf(p) \ + container_of(__ptp_port_to_ptp((p)), struct ice_pf, ptp) + +#define __ptp_info_to_ptp(i) \ + container_of((i), struct ice_ptp, info) +#define ptp_info_to_pf(i) \ + container_of(__ptp_info_to_ptp((i)), struct ice_pf, ptp) + +#define MAC_RX_LINK_COUNTER(_port) (0x600090 + 0x1000 * (_port)) +#define PFTSYN_SEM_BYTES 4 +#define PTP_SHARED_CLK_IDX_VALID BIT(31) +#define PHY_TIMER_SELECT_VALID_BIT 0 +#define PHY_TIMER_SELECT_BIT 1 +#define PHY_TIMER_SELECT_MASK 0xFFFFFFFC +#define TS_CMD_MASK_EXT 0xFF +#define TS_CMD_MASK 0xF +#define SYNC_EXEC_CMD 0x3 +#define ICE_PTP_TS_VALID BIT(0) +#define FIFO_EMPTY BIT(2) +#define FIFO_OK 0xFF +#define ICE_PTP_FIFO_NUM_CHECKS 5 +/* PHY, quad and port definitions */ +#define INDEX_PER_QUAD 64 +#define INDEX_PER_PORT (INDEX_PER_QUAD / ICE_PORTS_PER_QUAD) +#define TX_INTR_QUAD_MASK 0x03 +/* Per-channel register definitions */ +#define GLTSYN_AUX_OUT(_chan, _idx) (GLTSYN_AUX_OUT_0(_idx) + ((_chan) * 8)) +#define GLTSYN_AUX_IN(_chan, _idx) (GLTSYN_AUX_IN_0(_idx) + ((_chan) * 8)) +#define GLTSYN_CLKO(_chan, _idx) (GLTSYN_CLKO_0(_idx) + ((_chan) * 8)) +#define GLTSYN_TGT_L(_chan, _idx) (GLTSYN_TGT_L_0(_idx) + ((_chan) * 16)) +#define GLTSYN_TGT_H(_chan, _idx) (GLTSYN_TGT_H_0(_idx) + ((_chan) * 16)) +#define GLTSYN_EVNT_L(_chan, _idx) (GLTSYN_EVNT_L_0(_idx) + ((_chan) * 16)) +#define 
GLTSYN_EVNT_H(_chan, _idx) (GLTSYN_EVNT_H_0(_idx) + ((_chan) * 16)) +#define GLTSYN_EVNT_H_IDX_MAX 3 + +/* Pin definitions for PTP PPS out */ +#define PPS_CLK_GEN_CHAN 3 +#define PPS_CLK_SRC_CHAN 2 +#define PPS_PIN_INDEX 5 +#define TIME_SYNC_PIN_INDEX 4 +#define E810_N_EXT_TS 3 +#define E810_N_PER_OUT 4 +#define E810T_N_PER_OUT 3 +/* Macros to derive the low and high addresses for PHY */ +#define LOWER_ADDR_SIZE 16 +/* Macros to derive offsets for TimeStampLow and TimeStampHigh */ +#define PORT_TIMER_ASSOC(_i) (0x0300102C + ((_i) * 256)) +#define ETH_GLTSYN_ENA(_i) (0x03000348 + ((_i) * 4)) + +/* Time allowed for programming periodic clock output */ +#define START_OFFS_NS 100000000 + +#if IS_ENABLED(CONFIG_PTP_1588_CLOCK) +struct ice_pf; +int ice_ptp_set_ts_config(struct ice_pf *pf, struct ifreq *ifr); +int ice_ptp_get_ts_config(struct ice_pf *pf, struct ifreq *ifr); +int ice_ptp_get_ts_idx(struct ice_vsi *vsi); +int ice_get_ptp_clock_index(struct ice_pf *pf); + +void ice_clean_ptp_subtask(struct ice_pf *pf); +void ice_ptp_set_timestamp_offsets(struct ice_pf *pf); +u64 +ice_ptp_read_src_clk_reg(struct ice_pf *pf, struct ptp_system_timestamp *sts); +void ice_ptp_rx_hwtstamp(struct ice_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc, + struct sk_buff *skb); +void ice_ptp_init(struct ice_pf *pf); +void ice_ptp_release(struct ice_pf *pf); +int ice_ptp_link_change(struct ice_pf *pf, u8 port, bool linkup); +int ice_ptp_check_rx_fifo(struct ice_pf *pf, u8 port); +int ptp_ts_enable(struct ice_pf *pf, u8 port, bool enable); +int ice_ptp_cfg_clkout(struct ice_pf *pf, unsigned int chan, + struct ice_perout_channel *config, bool store); +int ice_ptp_update_incval(struct ice_pf *pf, enum ice_time_ref_freq time_ref_freq, + enum ice_src_tmr_mode src_tmr_mode); +int ice_ptp_get_incval(struct ice_pf *pf, enum ice_time_ref_freq *time_ref_freq, + enum ice_src_tmr_mode *src_tmr_mode); +#else /* IS_ENABLED(CONFIG_PTP_1588_CLOCK) */ +static inline int ice_ptp_set_ts_config(struct ice_pf __always_unused *pf, + struct ifreq __always_unused *ifr) +{ + return 0; +} + +static inline int ice_ptp_get_ts_config(struct ice_pf __always_unused *pf, + struct ifreq __always_unused *ifr) +{ + return 0; +} + +static inline int +ice_ptp_check_rx_fifo(struct ice_pf __always_unused *pf, + u8 __always_unused port) +{ + return 0; +} + +static inline int ice_ptp_get_ts_idx(struct ice_vsi __always_unused *vsi) +{ + return 0; +} + +static inline int ice_get_ptp_clock_index(struct ice_pf __always_unused *pf) +{ + return 0; +} +static inline void ice_clean_ptp_subtask(struct ice_pf *pf) { } +static inline void ice_ptp_set_timestamp_offsets(struct ice_pf *pf) { } +static inline void ice_ptp_rx_hwtstamp(struct ice_ring *rx_ring, + union ice_32b_rx_flex_desc *rx_desc, + struct sk_buff *skb) { } +static inline void ice_ptp_init(struct ice_pf *pf) { } +static inline void ice_ptp_release(struct ice_pf *pf) { } +static inline int ice_ptp_link_change(struct ice_pf *pf, u8 port, bool linkup) +{ return 0; } +#endif /* IS_ENABLED(CONFIG_PTP_1588_CLOCK) */ +#endif /* _ICE_PTP_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_consts.h b/drivers/net/ethernet/intel/ice/ice_ptp_consts.h new file mode 100644 index 000000000000..4c0d390e6276 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_ptp_consts.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#ifndef _ICE_PTP_CONSTS_H_ +#define _ICE_PTP_CONSTS_H_ + +/* Constant definitions related to the hardware clock used for PTP 1588 + * features and functionality. + */ +/* Constants defined for the PTP 1588 clock hardware. */ + +/* + * struct ice_time_ref_info_e822 + * + * E822 hardware can use different sources as the reference for the PTP + * hardware clock. Each clock has different characteristics such as a slightly + * different frequency, etc. + * + * This lookup table defines several constants that depend on the current time + * reference. See the struct ice_time_ref_info_e822 for information about the + * meaning of each constant. + */ +const struct ice_time_ref_info_e822 e822_time_ref[NUM_ICE_TIME_REF_FREQ] = { + /* ICE_TIME_REF_FREQ_25_000 -> 25 MHz */ + { + /* pll_freq */ + 823437500, /* 823.4375 MHz PLL */ + /* nominal_incval */ + 0x136e44fabULL, + /* pps_delay */ + 11, + }, + + /* ICE_TIME_REF_FREQ_122_880 -> 122.88 MHz */ + { + /* pll_freq */ + 783360000, /* 783.36 MHz */ + /* nominal_incval */ + 0x146cc2177ULL, + /* pps_delay */ + 12, + }, + + /* ICE_TIME_REF_FREQ_125_000 -> 125 MHz */ + { + /* pll_freq */ + 796875000, /* 796.875 MHz */ + /* nominal_incval */ + 0x141414141ULL, + /* pps_delay */ + 12, + }, + + /* ICE_TIME_REF_FREQ_153_600 -> 153.6 MHz */ + { + /* pll_freq */ + 816000000, /* 816 MHz */ + /* nominal_incval */ + 0x139b9b9baULL, + /* pps_delay */ + 12, + }, + + /* ICE_TIME_REF_FREQ_156_250 -> 156.25 MHz */ + { + /* pll_freq */ + 830078125, /* 830.78125 MHz */ + /* nominal_incval */ + 0x134679aceULL, + /* pps_delay */ + 11, + }, + + /* ICE_TIME_REF_FREQ_245_760 -> 245.76 MHz */ + { + /* pll_freq */ + 783360000, /* 783.36 MHz */ + /* nominal_incval */ + 0x146cc2177ULL, + /* pps_delay */ + 12, + }, +}; + +#endif /* _ICE_PTP_CONSTS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c new file mode 100644 index 000000000000..7e1c33bed88d --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c @@ -0,0 +1,1774 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice_type.h" +#include "ice_common.h" +#include "ice_ptp_hw.h" +#include "ice_ptp_consts.h" + +/* Low level functions for interacting with and managing the device clock used + * for the Precision Time Protocol. + * + * The ice hardware represents the current time using three registers: + * + * GLTSYN_TIME_H GLTSYN_TIME_L GLTSYN_TIME_R + * +---------------+ +---------------+ +---------------+ + * | 32 bits | | 32 bits | | 32 bits | + * +---------------+ +---------------+ +---------------+ + * + * The registers are incremented every clock tick using a 40bit increment + * value defined over two registers: + * + * GLTSYN_INCVAL_H GLTSYN_INCVAL_L + * +---------------+ +---------------+ + * | 8 bit s | | 32 bits | + * +---------------+ +---------------+ + * + * The increment value is added to the GLSTYN_TIME_R and GLSTYN_TIME_L + * registers every clock source tick. Depending on the specific device + * configuration, the clock source frequency could be one of a number of + * values. 
+ * + * For E810 devices, the increment frequency is 812.5 MHz + * + * For E822 devices the clock can be derived from different sources, and the + * increment has an effective frequency of one of the following: + * - 823.4375 MHz + * - 783.36 MHz + * - 796.875 MHz + * - 816 MHz + * - 830.078125 MHz + * - 783.36 MHz + * + * The hardware captures timestamps in the PHY for incoming packets, and for + * outgoing packets on request. To support this, the PHY maintains a timer + * that matches the lower 64 bits of the global source timer. + * + * In order to ensure that the PHY timers and the source timer are equivalent, + * shadow registers are used to prepare the desired initial values. A special + * sync command is issued to trigger copying from the shadow registers into + * the appropriate source and PHY registers simultaneously. + * + * The driver supports devices which have different PHYs with subtly different + * mechanisms to program and control the timers. We divide the devices into + * families named after the first major device, E810 and similar devices, and + * E822 and similar devices. + * + * - E822 based devices have additional support for fine grained Vernier + * calibration which requires significant setup + * - The layout of timestamp data in the PHY register blocks is different + * - The way timer synchronization commands are issued is different. + * + * To support this, very low level functions have an e810 or e822 suffix + * indicating what type of device they work on. Higher level abstractions for + * tasks that can be done on both devices do not have the suffix and will + * correctly look up the appropriate low level function when running. + * + * Functions which only make sense on a single device family may not have + * a suitable generic implementation + */ + +/** + * ice_get_ptp_src_clock_index - determine source clock index + * @hw: pointer to HW struct + * + * Determine the source clock index currently in use, based on device + * capabilities reported during initialization. + */ +u8 ice_get_ptp_src_clock_index(struct ice_hw *hw) +{ + return hw->func_caps.ts_func_info.tmr_index_assoc; +} + +/** + * ice_ptp_read_src_incval - Read source timer increment value + * @hw: pointer to HW struct + * + * Read the increment value of the source timer and return it. + */ +u64 ice_ptp_read_src_incval(struct ice_hw *hw) +{ + u32 lo, hi; + u8 tmr_idx; + + tmr_idx = ice_get_ptp_src_clock_index(hw); + + lo = rd32(hw, GLTSYN_INCVAL_L(tmr_idx)); + hi = rd32(hw, GLTSYN_INCVAL_H(tmr_idx)); + + return ((u64)(hi & INCVAL_HIGH_M) << 32) | lo; +} + +/* E822 family functions + * + * The following functions operate on the E822 family of devices. 
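As a cross-check against the values returned by ice_ptp_read_src_incval() and the e822_time_ref[] table in ice_ptp_consts.h, the nominal increment is simply the clock period expressed in 32.32 fixed-point nanoseconds. A minimal sketch of that relationship, not part of this patch (the helper name is hypothetical; it assumes the kernel's div64_u64() and NSEC_PER_SEC):

#include <linux/math64.h>
#include <linux/time64.h>

/* Nanoseconds per source-clock tick in 32.32 fixed point.
 * For 812500000 Hz this yields 0x13b13b13b (ICE_PTP_NOMINAL_INCVAL_E810);
 * for the 823437500 Hz TIME_REF entry it yields 0x136e44fab, matching
 * e822_time_ref[].nominal_incval.
 */
static u64 ice_example_nominal_incval(u64 pll_freq_hz)
{
	return div64_u64((u64)NSEC_PER_SEC << 32, pll_freq_hz);
}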
+ */ + +/** + * ice_fill_phy_msg_e822 - Fill message data for a PHY register access + * @msg: the PHY message buffer to fill in + * @port: the port to access + * @offset: the register offset + */ +static void +ice_fill_phy_msg_e822(struct ice_sbq_msg_input *msg, u8 port, u16 offset) +{ + int phy_port, phy, quadtype; + + phy_port = port % ICE_PORTS_PER_PHY; + phy = port / ICE_PORTS_PER_PHY; + quadtype = (port / ICE_PORTS_PER_QUAD) % ICE_NUM_QUAD_TYPE; + + if (quadtype == 0) { + msg->msg_addr_low = P_Q0_L(P_0_BASE + offset, phy_port); + msg->msg_addr_high = P_Q0_H(P_0_BASE + offset, phy_port); + } else { + msg->msg_addr_low = P_Q1_L(P_4_BASE + offset, phy_port); + msg->msg_addr_high = P_Q1_H(P_4_BASE + offset, phy_port); + } + + if (phy == 0) + msg->dest_dev = rmn_0; + else if (phy == 1) + msg->dest_dev = rmn_1; + else + msg->dest_dev = rmn_2; +} + +/** + * ice_read_phy_reg_e822_lp - Read a PHY register + * @hw: pointer to the HW struct + * @port: PHY port to read from + * @offset: PHY register offset to read + * @val: on return, the contents read from the PHY + * @lock_sbq: true if the sideband queue lock must be acquired + * + * Read a PHY register for the given port over the device sideband queue. + */ +static enum ice_status +ice_read_phy_reg_e822_lp(struct ice_hw *hw, u8 port, u16 offset, u32 *val, + bool lock_sbq) +{ + struct ice_sbq_msg_input msg = {0}; + enum ice_status status; + + + ice_fill_phy_msg_e822(&msg, port, offset); + msg.opcode = ice_sbq_msg_rd; + + status = ice_sbq_rw_reg_lp(hw, &msg, lock_sbq); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to send message to phy, status %d\n", + status); + return status; + } + + *val = msg.data; + + return 0; +} + +enum ice_status +ice_read_phy_reg_e822(struct ice_hw *hw, u8 port, u16 offset, u32 *val) +{ + return ice_read_phy_reg_e822_lp(hw, port, offset, val, true); +} + +/** + * ice_write_phy_reg_e822_lp - Write a PHY register + * @hw: pointer to the HW struct + * @port: PHY port to write to + * @offset: PHY register offset to write + * @val: The value to write to the register + * @lock_sbq: true if the sideband queue lock must be acquired + * + * Write a PHY register for the given port over the device sideband queue. + */ +static enum ice_status +ice_write_phy_reg_e822_lp(struct ice_hw *hw, u8 port, u16 offset, u32 val, + bool lock_sbq) +{ + struct ice_sbq_msg_input msg = {0}; + enum ice_status status; + + + ice_fill_phy_msg_e822(&msg, port, offset); + msg.opcode = ice_sbq_msg_wr; + msg.data = val; + + status = ice_sbq_rw_reg_lp(hw, &msg, lock_sbq); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to send message to phy, status %d\n", + status); + return status; + } + + return 0; +} + +enum ice_status +ice_write_phy_reg_e822(struct ice_hw *hw, u8 port, u16 offset, u32 val) +{ + return ice_write_phy_reg_e822_lp(hw, port, offset, val, true); +} + +/** + * ice_fill_quad_msg_e822 - Fill message data for quad register access + * @msg: the PHY message buffer to fill in + * @quad: the quad to access + * @offset: the register offset + * + * Fill a message buffer for accessing a register in a quad shared between + * multiple PHYs. 
+ */ +static void +ice_fill_quad_msg_e822(struct ice_sbq_msg_input *msg, u8 quad, u16 offset) +{ + u32 addr; + + msg->dest_dev = rmn_0; + + if ((quad % ICE_NUM_QUAD_TYPE) == 0) + addr = Q_0_BASE + offset; + else + addr = Q_1_BASE + offset; + + msg->msg_addr_low = ICE_LO_WORD(addr); + msg->msg_addr_high = ICE_HI_WORD(addr); +} + +/** + * ice_read_quad_reg_e822_lp - Read a PHY quad register + * @hw: pointer to the HW struct + * @quad: quad to read from + * @offset: quad register offset to read + * @val: on return, the contents read from the quad + * @lock_sbq: true if the sideband queue lock must be acquired + * + * Read a quad register over the device sideband queue. Quad registers are + * shared between multiple PHYs. + */ +static enum ice_status +ice_read_quad_reg_e822_lp(struct ice_hw *hw, u8 quad, u16 offset, u32 *val, + bool lock_sbq) +{ + struct ice_sbq_msg_input msg = {0}; + enum ice_status status; + + if (quad >= ICE_MAX_QUAD) + return ICE_ERR_PARAM; + + ice_fill_quad_msg_e822(&msg, quad, offset); + msg.opcode = ice_sbq_msg_rd; + + status = ice_sbq_rw_reg_lp(hw, &msg, lock_sbq); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to send message to phy, status %d\n", + status); + return status; + } + + *val = msg.data; + + return 0; +} + +enum ice_status +ice_read_quad_reg_e822(struct ice_hw *hw, u8 quad, u16 offset, u32 *val) +{ + return ice_read_quad_reg_e822_lp(hw, quad, offset, val, true); +} + +/** + * ice_write_quad_reg_e822_lp - Write a PHY quad register + * @hw: pointer to the HW struct + * @quad: quad to write to + * @offset: quad register offset to write + * @val: The value to write to the register + * @lock_sbq: true if the sideband queue lock must be acquired + * + * Write a quad register over the device sideband queue. Quad registers are + * shared between multiple PHYs. + */ +static enum ice_status +ice_write_quad_reg_e822_lp(struct ice_hw *hw, u8 quad, u16 offset, u32 val, + bool lock_sbq) +{ + struct ice_sbq_msg_input msg = {0}; + enum ice_status status; + + if (quad >= ICE_MAX_QUAD) + return ICE_ERR_PARAM; + + ice_fill_quad_msg_e822(&msg, quad, offset); + msg.opcode = ice_sbq_msg_wr; + msg.data = val; + + status = ice_sbq_rw_reg_lp(hw, &msg, lock_sbq); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to send message to phy, status %d\n", + status); + return status; + } + + return 0; +} + +enum ice_status +ice_write_quad_reg_e822(struct ice_hw *hw, u8 quad, u16 offset, u32 val) +{ + return ice_write_quad_reg_e822_lp(hw, quad, offset, val, true); +} + +/** + * ice_read_phy_tstamp_e822 - Read a PHY timestamp out of the quad block + * @hw: pointer to the HW struct + * @quad: the quad to read from + * @idx: the timestamp index to read + * @tstamp: on return, the 40bit timestamp value + * + * Read a 40bit timestamp value out of the two associated registers in the + * quad memory block that is shared between the internal PHYs of the E822 + * family of devices. 
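The @quad and @idx arguments consumed here are normally derived from a port number and a port-relative timestamp index. A sketch of that mapping, assuming the INDEX_PER_QUAD/INDEX_PER_PORT macros from ice_ptp.h and the ICE_PORTS_PER_QUAD constant used elsewhere in the driver (the helper name is hypothetical):

/* Map a port and its per-port timestamp index onto the quad and the
 * quad-relative slot used by the quad timestamp read/clear helpers.
 * Each quad serves ICE_PORTS_PER_QUAD ports, and each port owns
 * INDEX_PER_PORT consecutive slots of the quad's INDEX_PER_QUAD entries.
 */
static void
ice_example_ts_idx_to_quad(u8 port, u8 port_idx, u8 *quad, u8 *quad_idx)
{
	*quad = port / ICE_PORTS_PER_QUAD;
	*quad_idx = (port % ICE_PORTS_PER_QUAD) * INDEX_PER_PORT + port_idx;
}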
+ */ +static enum ice_status +ice_read_phy_tstamp_e822(struct ice_hw *hw, u8 quad, u8 idx, u64 *tstamp) +{ + enum ice_status status; + u16 lo_addr, hi_addr; + u32 lo, hi; + + lo_addr = (u16)TS_L(Q_REG_TX_MEMORY_BANK_START, idx); + hi_addr = (u16)TS_H(Q_REG_TX_MEMORY_BANK_START, idx); + + status = ice_read_quad_reg_e822(hw, quad, lo_addr, &lo); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to read low PTP timestamp register, status %d\n", + status); + return status; + } + + status = ice_read_quad_reg_e822(hw, quad, hi_addr, &hi); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to read high PTP timestamp register, status %d\n", + status); + return status; + } + + /* For E822 based internal PHYs, the timestamp is reported with the + * lower 8 bits in the low register, and the upper 32 bits in the high + * register. + */ + *tstamp = ((u64)hi) << TS_PHY_HIGH_S | ((u64)lo & TS_PHY_LOW_M); + + return 0; +} + +/** + * ice_clear_phy_tstamp_e822 - Clear a timestamp from the quad block + * @hw: pointer to the HW struct + * @quad: the quad to read from + * @idx: the timestamp index to reset + * + * Clear a timestamp, resetting its valid bit, from the PHY quad block that is + * shared between the internal PHYs on the E822 devices. + */ +static enum ice_status +ice_clear_phy_tstamp_e822(struct ice_hw *hw, u8 quad, u8 idx) +{ + enum ice_status status; + u16 lo_addr, hi_addr; + + lo_addr = (u16)TS_L(Q_REG_TX_MEMORY_BANK_START, idx); + hi_addr = (u16)TS_H(Q_REG_TX_MEMORY_BANK_START, idx); + + status = ice_write_quad_reg_e822(hw, quad, lo_addr, 0); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to clear low PTP timestamp register, status %d\n", + status); + return status; + } + + status = ice_write_quad_reg_e822(hw, quad, hi_addr, 0); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to clear high PTP timestamp register, status %d\n", + status); + return status; + } + + return 0; +} + +/** + * ice_ptp_prep_phy_time_e822 - Prepare PHY port with initial time + * @hw: pointer to the HW struct + * @time: Time to initialize the PHY port clocks to + * + * Program the PHY port registers with a new initial time value. The port + * clock will be initialized once the driver issues an INIT_TIME sync + * command. The time value is the upper 32 bits of the PHY timer, usually in + * units of nominal nanoseconds. 
+ */ +static enum ice_status +ice_ptp_prep_phy_time_e822(struct ice_hw *hw, u32 time) +{ + enum ice_status status; + u8 port; + + for (port = 0; port < ICE_NUM_EXTERNAL_PORTS; port++) { + + /* Tx case */ + status = ice_write_phy_reg_e822_lp(hw, port, + P_REG_TX_TIMER_INC_PRE_L, + 0, true); + if (status) + goto exit_err; + + status = ice_write_phy_reg_e822_lp(hw, port, + P_REG_TX_TIMER_INC_PRE_U, + time, true); + if (status) + goto exit_err; + + /* Rx case */ + status = ice_write_phy_reg_e822_lp(hw, port, + P_REG_RX_TIMER_INC_PRE_L, + 0, true); + if (status) + goto exit_err; + + status = ice_write_phy_reg_e822_lp(hw, port, + P_REG_RX_TIMER_INC_PRE_U, + time, true); + if (status) + goto exit_err; + } + + return 0; + +exit_err: + ice_debug(hw, ICE_DBG_PTP, "Failed to write init time for port %u, status %d\n", + port, status); + + return status; +} + +/** + * ice_ptp_prep_port_adj_e822 - Prepare a single port for time adjust + * @hw: pointer to HW struct + * @port: Port number to be programmed + * @time: time in cycles to adjust the port Tx and Rx clocks + * @lock_sbq: true to lock the sbq sq_lock (the usual case); false if the + * sq_lock has already been locked at a higher level + * + * Program the port for an atomic adjustment by writing the Tx and Rx timer + * registers. The atomic adjustment won't be completed until the driver issues + * an ADJ_TIME command. + * + * Note that time is not in units of nanoseconds. It is in clock time + * including the lower sub-nanosecond portion of the port timer. + * + * Negative adjustments are supported using 2s complement arithmetic. + */ +enum ice_status +ice_ptp_prep_port_adj_e822(struct ice_hw *hw, u8 port, s64 time, + bool lock_sbq) +{ + enum ice_status status; + u32 l_time, u_time; + + l_time = lower_32_bits(time); + u_time = upper_32_bits(time); + + /* Tx case */ + status = ice_write_phy_reg_e822_lp(hw, port, P_REG_TX_TIMER_INC_PRE_L, + l_time, lock_sbq); + if (status) + goto exit_err; + + status = ice_write_phy_reg_e822_lp(hw, port, P_REG_TX_TIMER_INC_PRE_U, + u_time, lock_sbq); + if (status) + goto exit_err; + + /* Rx case */ + status = ice_write_phy_reg_e822_lp(hw, port, P_REG_RX_TIMER_INC_PRE_L, + l_time, lock_sbq); + if (status) + goto exit_err; + + status = ice_write_phy_reg_e822_lp(hw, port, P_REG_RX_TIMER_INC_PRE_U, + u_time, lock_sbq); + if (status) + goto exit_err; + + return 0; + +exit_err: + ice_debug(hw, ICE_DBG_PTP, "Failed to write time adjust for port %u, status %d\n", + port, status); + return status; +} + +/** + * ice_ptp_prep_phy_adj_e822 - Prep PHY ports for a time adjustment + * @hw: pointer to HW struct + * @adj: adjustment in nanoseconds + * @lock_sbq: true to lock the sbq sq_lock (the usual case); false if the + * sq_lock has already been locked at a higher level + * + * Prepare the PHY ports for an atomic time adjustment by programming the PHY + * Tx and Rx port registers. The actual adjustment is completed by issuing an + * ADJ_TIME or ADJ_TIME_AT_TIME sync command. + */ +static enum ice_status +ice_ptp_prep_phy_adj_e822(struct ice_hw *hw, s32 adj, bool lock_sbq) +{ + s64 cycles; + u8 port; + + /* The port clock supports adjustment of the sub-nanosecond portion of + * the clock. We shift the provided adjustment in nanoseconds to + * calculate the appropriate adjustment to program into the PHY ports. 
+ */ + if (adj > 0) + cycles = (s64)adj << 32; + else + cycles = -(((s64)-adj) << 32); + + for (port = 0; port < ICE_NUM_EXTERNAL_PORTS; port++) { + enum ice_status status; + + + status = ice_ptp_prep_port_adj_e822(hw, port, cycles, + lock_sbq); + if (status) + return status; + } + + return 0; +} + +/** + * ice_ptp_prep_phy_incval_e822 - Prepare PHY ports for time adjustment + * @hw: pointer to HW struct + * @incval: new increment value to prepare + * + * Prepare each of the PHY ports for a new increment value by programming each + * port's TIMETUS registers. The new increment value will be updated after + * issuing an INIT_INCVAL command. + */ +static enum ice_status +ice_ptp_prep_phy_incval_e822(struct ice_hw *hw, u64 incval) +{ + enum ice_status status; + u32 high, low; + u8 port; + + /* The PHY registers for the increment value divide the lower 8 bits + * into the first low register, and the next 32 bits into the second + * high register. + */ + low = (u32)(incval & P_REG_TIMETUS_LOW_M); + high = (u32)(incval >> P_REG_TIMETUS_HIGH_S); + + for (port = 0; port < ICE_NUM_EXTERNAL_PORTS; port++) { + status = ice_write_phy_reg_e822_lp(hw, port, P_REG_TIMETUS_L, + low, true); + if (status) + goto exit_err; + + status = ice_write_phy_reg_e822_lp(hw, port, P_REG_TIMETUS_U, + high, true); + if (status) + goto exit_err; + } + + return 0; + +exit_err: + ice_debug(hw, ICE_DBG_PTP, "Failed to write incval for port %u, status %d\n", + port, status); + + return status; +} + +/** + * ice_ptp_read_phy_incval_e822 - Read a PHY port's current incval + * @hw: pointer to the HW struct + * @port: the port to read + * @incval: on return, the time_clk_cyc incval for this port + * + * Read the time_clk_cyc increment value for a given PHY port. + */ +enum ice_status +ice_ptp_read_phy_incval_e822(struct ice_hw *hw, u8 port, u64 *incval) +{ + enum ice_status status; + u32 high, low; + + status = ice_read_phy_reg_e822_lp(hw, port, P_REG_TIMETUS_L, + &low, true); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to read TIMETUS_L, status %d\n", + status); + return status; + } + + status = ice_read_phy_reg_e822_lp(hw, port, P_REG_TIMETUS_U, + &high, true); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to read TIMETUS_U, status %d\n", + status); + return status; + } + + *incval = high << P_REG_TIMETUS_HIGH_S | (low & P_REG_TIMETUS_LOW_M); + ice_debug(hw, ICE_DBG_PTP, "read INCVAL = 0x%08x%08x\n", high, low); + + return 0; +} + +/** + * ice_ptp_prep_phy_adj_target_e822 - Prepare PHY for adjust at target time + * @hw: pointer to HW struct + * @target_time: target time to program + * + * Program the PHY port Tx and Rx TIMER_CNT_ADJ registers used for the + * ADJ_TIME_AT_TIME command. This should be used in conjunction with + * ice_ptp_prep_phy_adj_e822 to program an atomic adjustment that is + * delayed until a specified target time. + * + * Note that a target time adjustment is not currently supported on E810 + * devices. 
+ */ +static enum ice_status +ice_ptp_prep_phy_adj_target_e822(struct ice_hw *hw, u32 target_time) +{ + enum ice_status status; + u8 port; + + for (port = 0; port < ICE_NUM_EXTERNAL_PORTS; port++) { + + /* Tx case */ + /* No sub-nanoseconds data */ + status = ice_write_phy_reg_e822_lp(hw, port, + P_REG_TX_TIMER_CNT_ADJ_L, + 0, true); + if (status) + goto exit_err; + + status = ice_write_phy_reg_e822_lp(hw, port, + P_REG_TX_TIMER_CNT_ADJ_U, + target_time, true); + if (status) + goto exit_err; + + /* Rx case */ + /* No sub-nanoseconds data */ + status = ice_write_phy_reg_e822_lp(hw, port, + P_REG_RX_TIMER_CNT_ADJ_L, + 0, true); + if (status) + goto exit_err; + + status = ice_write_phy_reg_e822_lp(hw, port, + P_REG_RX_TIMER_CNT_ADJ_U, + target_time, true); + if (status) + goto exit_err; + } + + return 0; + +exit_err: + ice_debug(hw, ICE_DBG_PTP, "Failed to write target time for port %u, status %d\n", + port, status); + + return status; +} + +/** + * ice_ptp_read_port_capture - Read a port's local time capture + * @hw: pointer to HW struct + * @port: Port number to read + * @tx_ts: on return, the Tx port time capture + * @rx_ts: on return, the Rx port time capture + * + * Read the port's Tx and Rx local time capture values. + * + * Note this has no equivalent for the E810 devices. + */ +enum ice_status +ice_ptp_read_port_capture(struct ice_hw *hw, u8 port, u64 *tx_ts, u64 *rx_ts) +{ + enum ice_status status; + u32 high, low; + + /* Tx case */ + status = ice_read_phy_reg_e822_lp(hw, port, P_REG_TX_CAPTURE_L, + &low, true); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to read REG_TX_CAPTURE_L, status %d\n", + status); + return status; + } + + status = ice_read_phy_reg_e822_lp(hw, port, P_REG_TX_CAPTURE_U, + &high, true); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to read TX_CAPTURE_U, status %d\n", + status); + return status; + } + + *tx_ts = (u64)high << 32 | low; + ice_debug(hw, ICE_DBG_PTP, "tx_init = 0x%016llx\n", *tx_ts); + + /* Rx case */ + status = ice_read_phy_reg_e822_lp(hw, port, P_REG_RX_CAPTURE_L, + &low, true); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to read RX_CAPTURE_L, status %d\n", + status); + return status; + } + + status = ice_read_phy_reg_e822_lp(hw, port, P_REG_RX_CAPTURE_U, + &high, true); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to read RX_CAPTURE_U, status %d\n", + status); + return status; + } + + *rx_ts = (u64)high << 32 | low; + ice_debug(hw, ICE_DBG_PTP, "rx_init = 0x%016llx\n", *rx_ts); + + return 0; +} + +/** + * ice_ptp_one_port_cmd - Prepare a single PHY port for a timer command + * @hw: pointer to HW struct + * @port: Port to which cmd has to be sent + * @cmd: Command to be sent to the port + * @lock_sbq: true if the sideband queue lock must be acquired + * + * Prepare the requested port for an upcoming timer sync command. + * + * Note there is no equivalent of this operation on E810, as that device + * always handles all external PHYs internally. 
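To show how the per-port preparation step fits with the shadow-register staging above and the sync-execute write used later by ice_ptp_tmr_cmd(), here is a sketch of adjusting a single port in isolation. It is illustrative only (the wrapper name is hypothetical) and assumes no matching source-timer change is required:

/* Stage and apply an atomic adjustment on one E822 port only */
static enum ice_status
ice_example_adj_one_port(struct ice_hw *hw, u8 port, s64 cycles)
{
	enum ice_status status;

	/* Write the adjustment into the port's Tx/Rx shadow registers */
	status = ice_ptp_prep_port_adj_e822(hw, port, cycles, true);
	if (status)
		return status;

	/* Arm only this port for the ADJ_TIME command */
	status = ice_ptp_one_port_cmd(hw, port, ADJ_TIME, true);
	if (status)
		return status;

	/* Drive the prepared command; the shadow values take effect now */
	wr32(hw, GLTSYN_CMD_SYNC, SYNC_EXEC_CMD);

	return 0;
}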
+ */ +enum ice_status +ice_ptp_one_port_cmd(struct ice_hw *hw, u8 port, enum ice_ptp_tmr_cmd cmd, + bool lock_sbq) +{ + enum ice_status status; + u32 cmd_val, val; + u8 tmr_idx; + + tmr_idx = ice_get_ptp_src_clock_index(hw); + cmd_val = tmr_idx << SEL_PHY_SRC; + switch (cmd) { + case INIT_TIME: + cmd_val |= PHY_CMD_INIT_TIME; + break; + case INIT_INCVAL: + cmd_val |= PHY_CMD_INIT_INCVAL; + break; + case ADJ_TIME: + cmd_val |= PHY_CMD_ADJ_TIME; + break; + case ADJ_TIME_AT_TIME: + cmd_val |= PHY_CMD_ADJ_TIME_AT_TIME; + break; + case READ_TIME: + cmd_val |= PHY_CMD_READ_TIME; + break; + default: + dev_warn(ice_hw_to_dev(hw), "Unknown timer command %u\n", cmd); + return ICE_ERR_PARAM; + } + + /* Tx case */ + /* Read, modify, write */ + status = ice_read_phy_reg_e822_lp(hw, port, P_REG_TX_TMR_CMD, &val, + lock_sbq); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to read TX_TMR_CMD, status %d\n", + status); + return status; + } + + /* Modify necessary bits only and perform write */ + val &= ~TS_CMD_MASK; + val |= cmd_val; + + status = ice_write_phy_reg_e822_lp(hw, port, P_REG_TX_TMR_CMD, val, + lock_sbq); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to write back TX_TMR_CMD, status %d\n", + status); + return status; + } + + /* Rx case */ + /* Read, modify, write */ + status = ice_read_phy_reg_e822_lp(hw, port, P_REG_RX_TMR_CMD, &val, + lock_sbq); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to read RX_TMR_CMD, status %d\n", + status); + return status; + } + + /* Modify necessary bits only and perform write */ + val &= ~TS_CMD_MASK; + val |= cmd_val; + + status = ice_write_phy_reg_e822_lp(hw, port, P_REG_RX_TMR_CMD, val, + lock_sbq); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to write back RX_TMR_CMD, status %d\n", + status); + return status; + } + + return 0; +} + +/** + * ice_ptp_port_cmd_e822 - Prepare all ports for a timer command + * @hw: pointer to the HW struct + * @cmd: timer command to prepare + * @lock_sbq: true if the sideband queue lock must be acquired + * + * Prepare all ports connected to this device for an upcoming timer sync + * command. + */ +static enum ice_status +ice_ptp_port_cmd_e822(struct ice_hw *hw, enum ice_ptp_tmr_cmd cmd, + bool lock_sbq) +{ + u8 port; + + for (port = 0; port < ICE_NUM_EXTERNAL_PORTS; port++) { + enum ice_status status; + + + status = ice_ptp_one_port_cmd(hw, port, cmd, lock_sbq); + if (status) + return status; + } + + return 0; +} + +/* E822 Vernier calibration functions + * + * The following functions are used as part of the vernier calibration of + * a port. This calibration increases the precision of the timestamps on the + * port. + */ + +/** + * ice_ptp_set_vernier_wl - Set the window length for vernier calibration + * @hw: pointer to the HW struct + * + * Set the window length used for the vernier port calibration process. 
+ */ +enum ice_status ice_ptp_set_vernier_wl(struct ice_hw *hw) +{ + u8 port; + + for (port = 0; port < ICE_NUM_EXTERNAL_PORTS; port++) { + enum ice_status status; + + + status = ice_write_phy_reg_e822_lp(hw, port, P_REG_WL, + PTP_VERNIER_WL, true); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to set vernier window length for port %u, status %d\n", + port, status); + return status; + } + } + + return 0; +} + +/** + * ice_phy_get_speed_and_fec_e822 - Get link speed and FEC based on serdes mode + * @hw: pointer to HW struct + * @port: the port to read from + * @link_out: if non-NULL, holds link speed on success + * @fec_out: if non-NULL, holds FEC algorithm on success + * + * Read the serdes data for the PHY port and extract the link speed and FEC + * algorithm. + */ +enum ice_status +ice_phy_get_speed_and_fec_e822(struct ice_hw *hw, u8 port, + enum ice_ptp_link_spd *link_out, + enum ice_ptp_fec_mode *fec_out) +{ + enum ice_ptp_link_spd link; + enum ice_ptp_fec_mode fec; + enum ice_status status; + u32 serdes; + + status = ice_read_phy_reg_e822(hw, port, P_REG_LINK_SPEED, &serdes); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to read serdes info\n"); + return status; + } + + /* Determine the FEC algorithm */ + fec = (enum ice_ptp_fec_mode)P_REG_LINK_SPEED_FEC_MODE(serdes); + + serdes &= P_REG_LINK_SPEED_SERDES_M; + + /* Determine the link speed */ + if (fec == ICE_PTP_FEC_MODE_RS_FEC) { + switch (serdes) { + case ICE_PTP_SERDES_25G: + link = ICE_PTP_LNK_SPD_25G_RS; + break; + case ICE_PTP_SERDES_50G: + link = ICE_PTP_LNK_SPD_50G_RS; + break; + case ICE_PTP_SERDES_100G: + link = ICE_PTP_LNK_SPD_100G_RS; + break; + default: + return ICE_ERR_OUT_OF_RANGE; + } + } else { + switch (serdes) { + case ICE_PTP_SERDES_1G: + link = ICE_PTP_LNK_SPD_1G; + break; + case ICE_PTP_SERDES_10G: + link = ICE_PTP_LNK_SPD_10G; + break; + case ICE_PTP_SERDES_25G: + link = ICE_PTP_LNK_SPD_25G; + break; + case ICE_PTP_SERDES_40G: + link = ICE_PTP_LNK_SPD_40G; + break; + case ICE_PTP_SERDES_50G: + link = ICE_PTP_LNK_SPD_50G; + break; + default: + return ICE_ERR_OUT_OF_RANGE; + } + } + + if (link_out) + *link_out = link; + if (fec_out) + *fec_out = fec; + + return 0; +} + +/** + * ice_phy_cfg_lane_e822 - Configure PHY quad for single/multi-lane timestamp + * @hw: pointer to HW struct + * @port: to configure the quad for + */ +void ice_phy_cfg_lane_e822(struct ice_hw *hw, u8 port) +{ + enum ice_ptp_link_spd link_spd; + enum ice_status status; + int quad; + u32 val; + + quad = port / ICE_PORTS_PER_QUAD; + + status = ice_phy_get_speed_and_fec_e822(hw, port, &link_spd, NULL); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to get PHY link speed, status %d\n", + status); + return; + } + + status = ice_read_quad_reg_e822(hw, quad, Q_REG_TX_MEM_GBL_CFG, &val); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to read TX_MEM_GLB_CFG, status %d\n", + status); + return; + } + + if (link_spd >= ICE_PTP_LNK_SPD_40G) + val &= ~Q_REG_TX_MEM_GBL_CFG_LANE_TYPE_M; + else + val |= Q_REG_TX_MEM_GBL_CFG_LANE_TYPE_M; + + status = ice_write_quad_reg_e822(hw, quad, Q_REG_TX_MEM_GBL_CFG, val); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to write back TX_MEM_GBL_CFG, status %d\n", + status); + return; + } +} + +/* E810 functions + * + * The following functions operate on the E810 series devices which use + * a separate external PHY. 
+ */ + +/** + * ice_read_phy_reg_e810_lp - Read register from external PHY on E810 + * @hw: pointer to the HW struct + * @addr: the address to read from + * @val: On return, the value read from the PHY + * @lock_sbq: true if the sideband queue lock must be acquired + * + * Read a register from the external PHY on the E810 device. + */ +static enum ice_status +ice_read_phy_reg_e810_lp(struct ice_hw *hw, u32 addr, u32 *val, bool lock_sbq) +{ + struct ice_sbq_msg_input msg = {0}; + enum ice_status status; + + msg.msg_addr_low = ICE_LO_WORD(addr); + msg.msg_addr_high = ICE_HI_WORD(addr); + msg.opcode = ice_sbq_msg_rd; + msg.dest_dev = rmn_0; + + status = ice_sbq_rw_reg_lp(hw, &msg, lock_sbq); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to send message to phy, status %d\n", + status); + return status; + } + + *val = msg.data; + + return 0; +} + +static enum ice_status +ice_read_phy_reg_e810(struct ice_hw *hw, u32 addr, u32 *val) +{ + return ice_read_phy_reg_e810_lp(hw, addr, val, true); +} + +/** + * ice_write_phy_reg_e810_lp - Write register on external PHY on E810 + * @hw: pointer to the HW struct + * @addr: the address to writem to + * @val: the value to write to the PHY + * @lock_sbq: true if the sideband queue lock must be acquired + * + * Write a value to a register of the external PHY on the E810 device. + */ +static enum ice_status +ice_write_phy_reg_e810_lp(struct ice_hw *hw, u32 addr, u32 val, bool lock_sbq) +{ + struct ice_sbq_msg_input msg = {0}; + enum ice_status status; + + msg.msg_addr_low = ICE_LO_WORD(addr); + msg.msg_addr_high = ICE_HI_WORD(addr); + msg.opcode = ice_sbq_msg_wr; + msg.dest_dev = rmn_0; + msg.data = val; + + status = ice_sbq_rw_reg_lp(hw, &msg, lock_sbq); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to send message to phy, status %d\n", + status); + return status; + } + + return 0; +} + +static enum ice_status +ice_write_phy_reg_e810(struct ice_hw *hw, u32 addr, u32 val) +{ + return ice_write_phy_reg_e810_lp(hw, addr, val, true); +} + +/** + * ice_read_phy_tstamp_e810 - Read a PHY timestamp out of the external PHY + * @hw: pointer to the HW struct + * @lport: the lport to read from + * @idx: the timestamp index to read + * @tstamp: on return, the 40bit timestamp value + * + * Read a 40bit timestamp value out of the timestamp block of the external PHY + * on the E810 device. + */ +static enum ice_status +ice_read_phy_tstamp_e810(struct ice_hw *hw, u8 lport, u8 idx, u64 *tstamp) +{ + enum ice_status status; + u32 lo_addr, hi_addr, lo, hi; + + lo_addr = TS_EXT(LOW_TX_MEMORY_BANK_START, lport, idx); + hi_addr = TS_EXT(HIGH_TX_MEMORY_BANK_START, lport, idx); + + status = ice_read_phy_reg_e810(hw, lo_addr, &lo); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to read low PTP timestamp register, status %d\n", + status); + return status; + } + + status = ice_read_phy_reg_e810(hw, hi_addr, &hi); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to read high PTP timestamp register, status %d\n", + status); + return status; + } + + /* For E810 devices, the timestamp is reported with the lower 32 bits + * in the low register, and the upper 8 bits in the high register. + */ + *tstamp = ((u64)hi) << TS_HIGH_S | ((u64)lo & TS_LOW_M); + + return 0; +} + +/** + * ice_clear_phy_tstamp_e810 - Clear a timestamp from the external PHY + * @hw: pointer to the HW struct + * @lport: the lport to read from + * @idx: the timestamp index to reset + * + * Clear a timestamp, resetting its valid bit, from the timestamp block of the + * external PHY on the E810 device. 
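The 40-bit values produced by ice_read_phy_tstamp_e810() above (and by ice_read_phy_tstamp_e822() earlier) carry only 32 bits of nanoseconds plus sub-nanosecond bits and a valid flag (ICE_PTP_TS_VALID), which is why struct ice_ptp keeps @cached_phc_time. A simplified sketch of the extension step, assuming the timestamp was captured no earlier than the cached PHC time (the helper name is hypothetical; the driver's real logic also handles timestamps slightly in the past):

/* Extend a 40-bit PHY timestamp to 64-bit nanoseconds using a recently
 * cached PHC time. Bits 39:8 hold the nanosecond value; bits 7:0 hold
 * sub-nanosecond data and the valid flag and are dropped here.
 */
static u64 ice_example_extend_40b_ts(u64 cached_phc_time, u64 in_tstamp)
{
	u32 ts_ns = (u32)(in_tstamp >> 8);
	/* unsigned wrap-around gives the forward distance from the cache */
	u32 delta = ts_ns - lower_32_bits(cached_phc_time);

	return cached_phc_time + delta;
}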
+ */ +static enum ice_status +ice_clear_phy_tstamp_e810(struct ice_hw *hw, u8 lport, u8 idx) +{ + enum ice_status status; + u32 lo_addr, hi_addr; + + lo_addr = TS_EXT(LOW_TX_MEMORY_BANK_START, lport, idx); + hi_addr = TS_EXT(HIGH_TX_MEMORY_BANK_START, lport, idx); + + status = ice_write_phy_reg_e810(hw, lo_addr, 0); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to clear low PTP timestamp register, status %d\n", + status); + return status; + } + + status = ice_write_phy_reg_e810(hw, hi_addr, 0); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to clear high PTP timestamp register, status %d\n", + status); + return status; + } + + return 0; +} + +/** + * ice_ptp_init_phy_e810 - Enable PTP function on the external PHY + * @hw: pointer to HW struct + * + * Enable the timesync PTP functionality for the external PHY connected to + * this function. + * + * Note there is no equivalent function needed on E822 based devices. + */ +enum ice_status ice_ptp_init_phy_e810(struct ice_hw *hw) +{ + enum ice_status status; + u8 tmr_idx; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_ENA(tmr_idx), + GLTSYN_ENA_TSYN_ENA_M); + if (status) + ice_debug(hw, ICE_DBG_PTP, "PTP failed in ena_phy_time_syn %d\n", + status); + + return status; +} + +/** + * ice_ptp_prep_phy_time_e810 - Prepare PHY port with initial time + * @hw: Board private structure + * @time: Time to initialize the PHY port clock to + * + * Program the PHY port ETH_GLTSYN_SHTIME registers in preparation setting the + * initial clock time. The time will not actually be programmed until the + * driver issues an INIT_TIME command. + * + * The time value is the upper 32 bits of the PHY timer, usually in units of + * nominal nanoseconds. + */ +static enum ice_status ice_ptp_prep_phy_time_e810(struct ice_hw *hw, u32 time) +{ + enum ice_status status; + u8 tmr_idx; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_SHTIME_0(tmr_idx), 0); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to write SHTIME_0, status %d\n", + status); + return status; + } + + status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_SHTIME_L(tmr_idx), time); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to write SHTIME_L, status %d\n", + status); + return status; + } + + return 0; +} + +/** + * ice_ptp_prep_phy_adj_e810 - Prep PHY port for a time adjustment + * @hw: pointer to HW struct + * @adj: adjustment value to program + * @lock_sbq: true if the sideband queue luck must be acquired + * + * Prepare the PHY port for an atomic adjustment by programming the PHY + * ETH_GLTSYN_SHADJ_L and ETH_GLTSYN_SHADJ_H registers. The actual adjustment + * is completed by issuing an ADJ_TIME sync command. + * + * The adjustment value only contains the portion used for the upper 32bits of + * the PHY timer, usually in units of nominal nanoseconds. Negative + * adjustments are supported using 2s complement arithmetic. + */ +static enum ice_status +ice_ptp_prep_phy_adj_e810(struct ice_hw *hw, s32 adj, bool lock_sbq) +{ + enum ice_status status; + u8 tmr_idx; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + + /* Adjustments are represented as signed 2's complement values in + * nanoseconds. Sub-nanosecond adjustment is not supported. 
+ */ + status = ice_write_phy_reg_e810_lp(hw, ETH_GLTSYN_SHADJ_L(tmr_idx), + 0, lock_sbq); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to write adj to PHY SHADJ_L, status %d\n", + status); + return status; + } + + status = ice_write_phy_reg_e810_lp(hw, ETH_GLTSYN_SHADJ_H(tmr_idx), + adj, lock_sbq); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to write adj to PHY SHADJ_H, status %d\n", + status); + return status; + } + + return 0; +} + +/** + * ice_ptp_prep_phy_incval_e810 - Prep PHY port increment value change + * @hw: pointer to HW struct + * @incval: The new 40bit increment value to prepare + * + * Prepare the PHY port for a new increment value by programming the PHY + * ETH_GLTSYN_SHADJ_L and ETH_GLTSYN_SHADJ_H registers. The actual change is + * completed by issuing an INIT_INCVAL command. + */ +static enum ice_status +ice_ptp_prep_phy_incval_e810(struct ice_hw *hw, u64 incval) +{ + enum ice_status status; + u32 high, low; + u8 tmr_idx; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + low = lower_32_bits(incval); + high = upper_32_bits(incval); + + status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_SHADJ_L(tmr_idx), low); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to write incval to PHY SHADJ_L, status %d\n", + status); + return status; + } + + status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_SHADJ_H(tmr_idx), high); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to write incval PHY SHADJ_H, status %d\n", + status); + return status; + } + + return 0; +} + +/** + * ice_ptp_prep_phy_adj_target_e810 - Prepare PHY port with adjust target + * @hw: Board private structure + * @target_time: Time to trigger the clock adjustment at + * + * Program the PHY port ETH_GLTSYN_SHTIME registers in preparation for + * a target time adjust, which will trigger an adjustment of the clock in the + * future. The actual adjustment will occur the next time the PHY port timer + * crosses over the provided value after the driver issues an ADJ_TIME_AT_TIME + * command. + * + * The time value is the upper 32 bits of the PHY timer, usually in units of + * nominal nanoseconds. + */ +static enum ice_status +ice_ptp_prep_phy_adj_target_e810(struct ice_hw *hw, u32 target_time) +{ + enum ice_status status; + u8 tmr_idx; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_SHTIME_0(tmr_idx), 0); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to write target time to SHTIME_0, status %d\n", + status); + return status; + } + + status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_SHTIME_L(tmr_idx), + target_time); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to write target time to SHTIME_L, status %d\n", + status); + return status; + } + + return 0; +} + +/** + * ice_ptp_port_cmd_e810 - Prepare all external PHYs for a timer command + * @hw: pointer to HW struct + * @cmd: Command to be sent to the port + * @lock_sbq: true if the sideband queue lock must be acquired + * + * Prepare the external PHYs connected to this device for a timer sync + * command. 
+ */ +static enum ice_status +ice_ptp_port_cmd_e810(struct ice_hw *hw, enum ice_ptp_tmr_cmd cmd, + bool lock_sbq) +{ + enum ice_status status; + u32 cmd_val, val; + + switch (cmd) { + case INIT_TIME: + cmd_val = GLTSYN_CMD_INIT_TIME; + break; + case INIT_INCVAL: + cmd_val = GLTSYN_CMD_INIT_INCVAL; + break; + case ADJ_TIME: + cmd_val = GLTSYN_CMD_ADJ_TIME; + break; + case ADJ_TIME_AT_TIME: + cmd_val = GLTSYN_CMD_ADJ_INIT_TIME; + break; + case READ_TIME: + cmd_val = GLTSYN_CMD_READ_TIME; + break; + default: + dev_warn(ice_hw_to_dev(hw), "Unknown timer command %u\n", cmd); + return ICE_ERR_PARAM; + } + + /* Read, modify, write */ + status = ice_read_phy_reg_e810_lp(hw, ETH_GLTSYN_CMD, &val, lock_sbq); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to read GLTSYN_CMD, status %d\n", + status); + return status; + } + + /* Modify necessary bits only and perform write */ + val &= ~TS_CMD_MASK_E810; + val |= cmd_val; + + status = ice_write_phy_reg_e810_lp(hw, ETH_GLTSYN_CMD, val, lock_sbq); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to write back GLTSYN_CMD, status %d\n", + status); + return status; + } + + return 0; +} + +/* Device agnostic functions + * + * The following functions implement shared behavior common to both E822 and + * E810 devices, possibly calling a device specific implementation where + * necessary. + */ + +/** + * ice_ptp_lock - Acquire PTP global semaphore register lock + * @hw: pointer to the HW struct + * + * Acquire the global PTP hardware semaphore lock. Returns true if the lock + * was acquired, false otherwise. + * + * The PFTSYN_SEM register sets the busy bit on read, returning the previous + * value. If software sees the busy bit cleared, this means that this function + * acquired the lock (and the busy bit is now set). If software sees the busy + * bit set, it means that another function acquired the lock. + * + * Software must clear the busy bit with a write to release the lock for other + * functions when done. + */ +bool ice_ptp_lock(struct ice_hw *hw) +{ + u32 hw_lock; + int i; + +#define MAX_TRIES 5 + + for (i = 0; i < MAX_TRIES; i++) { + hw_lock = rd32(hw, PFTSYN_SEM + (PFTSYN_SEM_BYTES * hw->pf_id)); + hw_lock = hw_lock & PFTSYN_SEM_BUSY_M; + if (hw_lock) { + /* Somebody is holding the lock */ + msleep(10); + continue; + } else { + break; + } + } + + return !hw_lock; +} + +/** + * ice_ptp_unlock - Release PTP global semaphore register lock + * @hw: pointer to the HW struct + * + * Release the global PTP hardware semaphore lock. This is done by writing to + * the PFTSYN_SEM register. + */ +void ice_ptp_unlock(struct ice_hw *hw) +{ + wr32(hw, PFTSYN_SEM + (PFTSYN_SEM_BYTES * hw->pf_id), 0); +} + +/** + * ice_ptp_src_cmd - Prepare source timer for a timer command + * @hw: pointer to HW structure + * @cmd: Timer command + * + * Prepare the source timer for an upcoming timer sync command. 
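The semaphore helpers above are meant to bracket any multi-register shadow update; ice_ptp_write_incval_locked() further below follows exactly this pattern. A comparable wrapper for the time-initialization path would look like the following sketch (the wrapper name is hypothetical):

/* Take the global PTP semaphore around a full time initialization and
 * always release it, mirroring ice_ptp_write_incval_locked().
 */
static enum ice_status ice_example_init_time_locked(struct ice_hw *hw, u64 time)
{
	enum ice_status status;

	if (!ice_ptp_lock(hw))
		return ICE_ERR_NOT_READY;

	status = ice_ptp_init_time(hw, time);

	ice_ptp_unlock(hw);

	return status;
}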
+ */ +void ice_ptp_src_cmd(struct ice_hw *hw, enum ice_ptp_tmr_cmd cmd) +{ + u32 cmd_val; + u8 tmr_idx; + + tmr_idx = ice_get_ptp_src_clock_index(hw); + cmd_val = tmr_idx << SEL_CPK_SRC; + + switch (cmd) { + case INIT_TIME: + cmd_val |= GLTSYN_CMD_INIT_TIME; + break; + case INIT_INCVAL: + cmd_val |= GLTSYN_CMD_INIT_INCVAL; + break; + case ADJ_TIME: + cmd_val |= GLTSYN_CMD_ADJ_TIME; + break; + case ADJ_TIME_AT_TIME: + cmd_val |= GLTSYN_CMD_ADJ_INIT_TIME; + break; + case READ_TIME: + cmd_val |= GLTSYN_CMD_READ_TIME; + break; + default: + dev_warn(ice_hw_to_dev(hw), "Unknown timer command %u\n", cmd); + return; + } + + wr32(hw, GLTSYN_CMD, cmd_val); +} + +/** + * ice_ptp_tmr_cmd - Prepare and trigger a timer sync command + * @hw: pointer to HW struct + * @cmd: the command to issue + * @lock_sbq: true if the sideband queue lock must be acquired + * + * Prepare the source timer and PHY timers and then trigger the requested + * command. This causes the shadow registers previously written in preparation + * for the command to be synchronously applied to both the source and PHY + * timers. + */ +static enum ice_status +ice_ptp_tmr_cmd(struct ice_hw *hw, enum ice_ptp_tmr_cmd cmd, bool lock_sbq) +{ + enum ice_status status; + + /* First, prepare the source timer */ + ice_ptp_src_cmd(hw, cmd); + + /* Next, prepare the ports */ + if (ice_is_e810(hw)) + status = ice_ptp_port_cmd_e810(hw, cmd, lock_sbq); + else + status = ice_ptp_port_cmd_e822(hw, cmd, lock_sbq); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to prepare PHY ports for timer command %u, status %d\n", + cmd, status); + return status; + } + + /* Write the sync command register to drive both source and PHY timer + * commands synchronously + */ + wr32(hw, GLTSYN_CMD_SYNC, SYNC_EXEC_CMD); + + return 0; +} + +/** + * ice_ptp_init_time - Initialize device time to provided value + * @hw: pointer to HW struct + * @time: 64bits of time (GLTSYN_TIME_L and GLTSYN_TIME_H) + * + * Initialize the device to the specified time provided. This requires a three + * step process: + * + * 1) write the new init time to the source timer shadow registers + * 2) write the new init time to the phy timer shadow registers + * 3) issue an init_time timer command to synchronously switch both the source + * and port timers to the new init time value at the next clock cycle. + */ +enum ice_status ice_ptp_init_time(struct ice_hw *hw, u64 time) +{ + enum ice_status status; + u8 tmr_idx; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + + /* Source timers */ + wr32(hw, GLTSYN_SHTIME_L(tmr_idx), lower_32_bits(time)); + wr32(hw, GLTSYN_SHTIME_H(tmr_idx), upper_32_bits(time)); + wr32(hw, GLTSYN_SHTIME_0(tmr_idx), 0); + + /* PHY Clks */ + /* Fill Rx and Tx ports and send msg to PHY */ + if (ice_is_e810(hw)) + status = ice_ptp_prep_phy_time_e810(hw, time & 0xFFFFFFFF); + else + status = ice_ptp_prep_phy_time_e822(hw, time & 0xFFFFFFFF); + if (status) + return status; + + return ice_ptp_tmr_cmd(hw, INIT_TIME, true); +} + +/** + * ice_ptp_write_incval - Program PHC with new increment value + * @hw: pointer to HW struct + * @incval: Source timer increment value per clock cycle + * + * Program the PHC with a new increment value. This requires a three-step + * process: + * + * 1) Write the increment value to the source timer shadow registers + * 2) Write the increment value to the PHY timer shadow registers + * 3) Issue an INIT_INCVAL timer command to synchronously switch both the + * source and port timers to the new increment value at the next clock + * cycle. 
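One common consumer of this incval flow is frequency adjustment: a PTP .adjfine-style caller scales a base increment value by scaled_ppm and programs the result through ice_ptp_write_incval_locked(), defined just below. This is only a sketch under stated assumptions: the function name is hypothetical and it assumes the kernel's mul_u64_u64_div_u64() helper is available (or provided via kcompat):

/* Scale a nominal incval by scaled_ppm (ppm * 2^16) and program it */
static enum ice_status
ice_example_scale_incval(struct ice_hw *hw, u64 base_incval, long scaled_ppm)
{
	bool neg_adj = scaled_ppm < 0;
	u64 diff, incval = base_incval;

	if (neg_adj)
		scaled_ppm = -scaled_ppm;

	/* 128-bit intermediate avoids overflow of incval * scaled_ppm */
	diff = mul_u64_u64_div_u64(incval, (u64)scaled_ppm,
				   1000000ULL << 16);
	incval = neg_adj ? incval - diff : incval + diff;

	return ice_ptp_write_incval_locked(hw, incval);
}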
+ */ +enum ice_status ice_ptp_write_incval(struct ice_hw *hw, u64 incval) +{ + enum ice_status status; + u8 tmr_idx; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + + /* Shadow Adjust */ + wr32(hw, GLTSYN_SHADJ_L(tmr_idx), lower_32_bits(incval)); + wr32(hw, GLTSYN_SHADJ_H(tmr_idx), upper_32_bits(incval)); + + if (ice_is_e810(hw)) + status = ice_ptp_prep_phy_incval_e810(hw, incval); + else + status = ice_ptp_prep_phy_incval_e822(hw, incval); + if (status) + return status; + + return ice_ptp_tmr_cmd(hw, INIT_INCVAL, true); +} + +/** + * ice_ptp_write_incval_locked - Program new incval while holding semaphore + * @hw: pointer to HW struct + * @incval: Source timer increment value per clock cycle + * + * Program a new PHC incval while holding the PTP semaphore. + */ +enum ice_status ice_ptp_write_incval_locked(struct ice_hw *hw, u64 incval) +{ + enum ice_status status; + + if (!ice_ptp_lock(hw)) + return ICE_ERR_NOT_READY; + + status = ice_ptp_write_incval(hw, incval); + + ice_ptp_unlock(hw); + + return status; +} + +/** + * ice_ptp_adj_clock - Adjust PHC clock time atomically + * @hw: pointer to HW struct + * @adj: Adjustment in nanoseconds + * @lock_sbq: true to lock the sbq sq_lock (the usual case); false if the + * sq_lock has already been locked at a higher level + * + * Perform an atomic adjustment of the PHC time by the specified number of + * nanoseconds. This requires a three-step process: + * + * 1) Write the adjustment to the source timer shadow registers + * 2) Write the adjustment to the PHY timer shadow registers + * 3) Issue an ADJ_TIME timer command to synchronously apply the adjustment to + * both the source and port timers at the next clock cycle. + */ +enum ice_status ice_ptp_adj_clock(struct ice_hw *hw, s32 adj, bool lock_sbq) +{ + enum ice_status status; + u8 tmr_idx; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + + /* Write the desired clock adjustment into the GLTSYN_SHADJ register. + * For an ADJ_TIME command, this set of registers represents the value + * to add to the clock time. It supports subtraction by interpreting + * the value as a 2's complement integer. + */ + wr32(hw, GLTSYN_SHADJ_L(tmr_idx), 0); + wr32(hw, GLTSYN_SHADJ_H(tmr_idx), adj); + + if (ice_is_e810(hw)) + status = ice_ptp_prep_phy_adj_e810(hw, adj, lock_sbq); + else + status = ice_ptp_prep_phy_adj_e822(hw, adj, lock_sbq); + if (status) + return status; + + return ice_ptp_tmr_cmd(hw, ADJ_TIME, lock_sbq); +} + +/** + * ice_ptp_adj_clock_at_time - Adjust PHC atomically at specified time + * @hw: pointer to HW struct + * @at_time: Time in nanoseconds at which to perform the adjustment + * @adj: Adjustment in nanoseconds + * + * Perform an atomic adjustment to the PHC clock at the specified time. This + * requires a five-step process: + * + * 1) Write the adjustment to the source timer shadow adjust registers + * 2) Write the target time to the source timer shadow time registers + * 3) Write the adjustment to the PHY timers shadow adjust registers + * 4) Write the target time to the PHY timers shadow adjust registers + * 5) Issue an ADJ_TIME_AT_TIME command to initiate the atomic adjustment. + */ +enum ice_status +ice_ptp_adj_clock_at_time(struct ice_hw *hw, u64 at_time, s32 adj) +{ + enum ice_status status; + u32 time_lo, time_hi; + u8 tmr_idx; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + time_lo = lower_32_bits(at_time); + time_hi = upper_32_bits(at_time); + + /* Write the desired clock adjustment into the GLTSYN_SHADJ register. 
+ * For an ADJ_TIME_AT_TIME command, this set of registers represents + * the value to add to the clock time. It supports subtraction by + * interpreting the value as a 2's complement integer. + */ + wr32(hw, GLTSYN_SHADJ_L(tmr_idx), 0); + wr32(hw, GLTSYN_SHADJ_H(tmr_idx), adj); + + /* Write the target time to trigger the adjustment for source clock */ + wr32(hw, GLTSYN_SHTIME_0(tmr_idx), 0); + wr32(hw, GLTSYN_SHTIME_L(tmr_idx), time_lo); + wr32(hw, GLTSYN_SHTIME_H(tmr_idx), time_hi); + + /* Prepare PHY port adjustments */ + if (ice_is_e810(hw)) + status = ice_ptp_prep_phy_adj_e810(hw, adj, true); + else + status = ice_ptp_prep_phy_adj_e822(hw, adj, true); + if (status) + return status; + + /* Set target time for each PHY port */ + if (ice_is_e810(hw)) + status = ice_ptp_prep_phy_adj_target_e810(hw, time_lo); + else + status = ice_ptp_prep_phy_adj_target_e822(hw, time_lo); + if (status) + return status; + + return ice_ptp_tmr_cmd(hw, ADJ_TIME_AT_TIME, true); +} + +/** + * ice_read_phy_tstamp - Read a PHY timestamp from the timestamo block + * @hw: pointer to the HW struct + * @block: the block to read from + * @idx: the timestamp index to read + * @tstamp: on return, the 40bit timestamp value + * + * Read a 40bit timestamp value out of the timestamp block. For E822 devices, + * the block is the quad to read from. For E810 devices, the block is the + * logical port to read from. + */ +enum ice_status +ice_read_phy_tstamp(struct ice_hw *hw, u8 block, u8 idx, u64 *tstamp) +{ + if (ice_is_e810(hw)) + return ice_read_phy_tstamp_e810(hw, block, idx, tstamp); + else + return ice_read_phy_tstamp_e822(hw, block, idx, tstamp); +} + +/** + * ice_clear_phy_tstamp - Clear a timestamp from the timestamp block + * @hw: pointer to the HW struct + * @block: the block to read from + * @idx: the timestamp index to reset + * + * Clear a timestamp, resetting its valid bit, from the timestamp block. For + * E822 devices, the block is the quad to clear from. For E810 devices, the + * block is the logical port to clear from. + */ +enum ice_status +ice_clear_phy_tstamp(struct ice_hw *hw, u8 block, u8 idx) +{ + if (ice_is_e810(hw)) + return ice_clear_phy_tstamp_e810(hw, block, idx); + else + return ice_clear_phy_tstamp_e822(hw, block, idx); +} diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.h b/drivers/net/ethernet/intel/ice/ice_ptp_hw.h new file mode 100644 index 000000000000..e63b9ca75260 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.h @@ -0,0 +1,359 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#ifndef _ICE_PTP_HW_H_ +#define _ICE_PTP_HW_H_ + +enum ice_ptp_tmr_cmd { + INIT_TIME, + INIT_INCVAL, + ADJ_TIME, + ADJ_TIME_AT_TIME, + READ_TIME +}; + +enum ice_ptp_serdes { + ICE_PTP_SERDES_1G, + ICE_PTP_SERDES_10G, + ICE_PTP_SERDES_25G, + ICE_PTP_SERDES_40G, + ICE_PTP_SERDES_50G, + ICE_PTP_SERDES_100G +}; + +enum ice_ptp_link_spd { + ICE_PTP_LNK_SPD_1G, + ICE_PTP_LNK_SPD_10G, + ICE_PTP_LNK_SPD_25G, + ICE_PTP_LNK_SPD_25G_RS, + ICE_PTP_LNK_SPD_40G, + ICE_PTP_LNK_SPD_50G, + ICE_PTP_LNK_SPD_50G_RS, + ICE_PTP_LNK_SPD_100G_RS, + NUM_ICE_PTP_LNK_SPD /* Must be last */ +}; + +enum ice_ptp_fec_mode { + ICE_PTP_FEC_MODE_NONE, + ICE_PTP_FEC_MODE_CLAUSE74, + ICE_PTP_FEC_MODE_RS_FEC +}; + +/** + * struct ice_time_ref_info_e822 + * @pll_freq: Frequency of PLL that drives timer ticks in Hz + * @nominal_incval: increment to generate nanoseconds in GLTSYN_TIME_L + * @pps_delay: propagation delay of the PPS output signal + * + * Characteristic information for the various TIME_REF sources possible in the + * E822 devices + */ +struct ice_time_ref_info_e822 { + u64 pll_freq; + u64 nominal_incval; + u8 pps_delay; +}; + +/* Table of constants related to possible TIME_REF sources */ +extern const struct ice_time_ref_info_e822 e822_time_ref[NUM_ICE_TIME_REF_FREQ]; + +/* Increment value to generate nanoseconds in the GLTSYN_TIME_L register for + * the E810 devices. Based off of a PLL with an 812.5 MHz frequency. + */ +#define ICE_PTP_NOMINAL_INCVAL_E810 0x13b13b13bULL + +/* Device agnostic functions */ +u8 ice_get_ptp_src_clock_index(struct ice_hw *hw); +u64 ice_ptp_read_src_incval(struct ice_hw *hw); +bool ice_ptp_lock(struct ice_hw *hw); +void ice_ptp_unlock(struct ice_hw *hw); +void ice_ptp_src_cmd(struct ice_hw *hw, enum ice_ptp_tmr_cmd cmd); +enum ice_status ice_ptp_init_time(struct ice_hw *hw, u64 time); +enum ice_status ice_ptp_write_incval(struct ice_hw *hw, u64 incval); +enum ice_status ice_ptp_write_incval_locked(struct ice_hw *hw, u64 incval); +enum ice_status ice_ptp_adj_clock(struct ice_hw *hw, s32 adj, bool lock_sbq); +enum ice_status +ice_ptp_adj_clock_at_time(struct ice_hw *hw, u64 at_time, s32 adj); +enum ice_status +ice_read_phy_tstamp(struct ice_hw *hw, u8 block, u8 idx, u64 *tstamp); +enum ice_status +ice_clear_phy_tstamp(struct ice_hw *hw, u8 block, u8 idx); + +/* E822 family functions */ +enum ice_status +ice_read_phy_reg_e822(struct ice_hw *hw, u8 port, u16 offset, u32 *val); +enum ice_status +ice_write_phy_reg_e822(struct ice_hw *hw, u8 port, u16 offset, u32 val); +enum ice_status +ice_read_quad_reg_e822(struct ice_hw *hw, u8 quad, u16 offset, u32 *val); +enum ice_status +ice_write_quad_reg_e822(struct ice_hw *hw, u8 quad, u16 offset, u32 val); +enum ice_status +ice_ptp_prep_port_adj_e822(struct ice_hw *hw, u8 port, s64 time, + bool lock_sbq); +enum ice_status +ice_ptp_read_phy_incval_e822(struct ice_hw *hw, u8 port, u64 *incval); +enum ice_status +ice_ptp_read_port_capture(struct ice_hw *hw, u8 port, u64 *tx_ts, u64 *rx_ts); +enum ice_status +ice_ptp_one_port_cmd(struct ice_hw *hw, u8 port, enum ice_ptp_tmr_cmd cmd, + bool lock_sbq); + +static inline u64 ice_e822_pll_freq(enum ice_time_ref_freq time_ref) +{ + return e822_time_ref[time_ref].pll_freq; +} + +static inline u64 ice_e822_nominal_incval(enum ice_time_ref_freq time_ref) +{ + return e822_time_ref[time_ref].nominal_incval; +} + +static inline u64 ice_e822_pps_delay(enum ice_time_ref_freq time_ref) +{ + return e822_time_ref[time_ref].pps_delay; +} + +/* E822 Vernier calibration functions */ +enum ice_status 
ice_ptp_set_vernier_wl(struct ice_hw *hw); +enum ice_status +ice_phy_get_speed_and_fec_e822(struct ice_hw *hw, u8 port, + enum ice_ptp_link_spd *link_out, + enum ice_ptp_fec_mode *fec_out); +void ice_phy_cfg_lane_e822(struct ice_hw *hw, u8 port); + +/* E810 family functions */ +enum ice_status ice_ptp_init_phy_e810(struct ice_hw *hw); + +#define PFTSYN_SEM_BYTES 4 + +#define ICE_PTP_CLOCK_INDEX_0 0x00 +#define ICE_PTP_CLOCK_INDEX_1 0x01 + +/* PHY timer commands */ +#define SEL_CPK_SRC 8 +#define SEL_PHY_SRC 3 + +/* Time Sync command Definitions */ +#define GLTSYN_CMD_INIT_TIME BIT(0) +#define GLTSYN_CMD_INIT_INCVAL BIT(1) +#define GLTSYN_CMD_INIT_TIME_INCVAL (BIT(0) | BIT(1)) +#define GLTSYN_CMD_ADJ_TIME BIT(2) +#define GLTSYN_CMD_ADJ_INIT_TIME (BIT(2) | BIT(3)) +#define GLTSYN_CMD_READ_TIME BIT(7) + +/* PHY port Time Sync command definitions */ +#define PHY_CMD_INIT_TIME BIT(0) +#define PHY_CMD_INIT_INCVAL BIT(1) +#define PHY_CMD_ADJ_TIME (BIT(0) | BIT(1)) +#define PHY_CMD_ADJ_TIME_AT_TIME (BIT(0) | BIT(2)) +#define PHY_CMD_READ_TIME (BIT(0) | BIT(1) | BIT(2)) + +#define TS_CMD_MASK_E810 0xFF +#define TS_CMD_MASK 0xF +#define SYNC_EXEC_CMD 0x3 + +/* Macros to derive port low and high addresses on both quads */ +#define P_Q0_L(a, p) ((((a) + (0x2000 * (p)))) & 0xFFFF) +#define P_Q0_H(a, p) ((((a) + (0x2000 * (p)))) >> 16) +#define P_Q1_L(a, p) ((((a) - (0x2000 * ((p) - ICE_PORTS_PER_QUAD)))) & 0xFFFF) +#define P_Q1_H(a, p) ((((a) - (0x2000 * ((p) - ICE_PORTS_PER_QUAD)))) >> 16) + +/* PHY QUAD register base addresses */ +#define Q_0_BASE 0x94000 +#define Q_1_BASE 0x114000 + +/* Timestamp memory reset registers */ +#define Q_REG_TS_CTRL 0x618 +#define Q_REG_TS_CTRL_S 0 +#define Q_REG_TS_CTRL_M BIT(0) + +/* Timestamp availability status registers */ +#define Q_REG_TX_MEMORY_STATUS_L 0xCF0 +#define Q_REG_TX_MEMORY_STATUS_U 0xCF4 + +/* Tx FIFO status registers */ +#define Q_REG_FIFO23_STATUS 0xCF8 +#define Q_REG_FIFO01_STATUS 0xCFC +#define Q_REG_FIFO02_S 0 +#define Q_REG_FIFO02_M ICE_M(0x3FF, 0) +#define Q_REG_FIFO13_S 10 +#define Q_REG_FIFO13_M ICE_M(0x3FF, 10) + +/* Interrupt control Config registers */ +#define Q_REG_TX_MEM_GBL_CFG 0xC08 +#define Q_REG_TX_MEM_GBL_CFG_LANE_TYPE_S 0 +#define Q_REG_TX_MEM_GBL_CFG_LANE_TYPE_M BIT(0) +#define Q_REG_TX_MEM_GBL_CFG_TX_TYPE_S 1 +#define Q_REG_TX_MEM_GBL_CFG_TX_TYPE_M ICE_M(0xFF, 1) +#define Q_REG_TX_MEM_GBL_CFG_INTR_THR_S 9 +#define Q_REG_TX_MEM_GBL_CFG_INTR_THR_M ICE_M(0x3F, 9) +#define Q_REG_TX_MEM_GBL_CFG_INTR_ENA_S 15 +#define Q_REG_TX_MEM_GBL_CFG_INTR_ENA_M BIT(15) + +/* Tx Timestamp data registers */ +#define Q_REG_TX_MEMORY_BANK_START 0xA00 + +/* PHY port register base addresses */ +#define P_0_BASE 0x80000 +#define P_4_BASE 0x106000 + +/* Timestamp init registers */ +#define P_REG_RX_TIMER_INC_PRE_L 0x46C +#define P_REG_RX_TIMER_INC_PRE_U 0x470 +#define P_REG_TX_TIMER_INC_PRE_L 0x44C +#define P_REG_TX_TIMER_INC_PRE_U 0x450 + +/* Timestamp match and adjust target registers */ +#define P_REG_RX_TIMER_CNT_ADJ_L 0x474 +#define P_REG_RX_TIMER_CNT_ADJ_U 0x478 +#define P_REG_TX_TIMER_CNT_ADJ_L 0x454 +#define P_REG_TX_TIMER_CNT_ADJ_U 0x458 + +/* Timestamp capture registers */ +#define P_REG_RX_CAPTURE_L 0x4D8 +#define P_REG_RX_CAPTURE_U 0x4DC +#define P_REG_TX_CAPTURE_L 0x4B4 +#define P_REG_TX_CAPTURE_U 0x4B8 + +/* Timestamp PHY incval registers */ +#define P_REG_TIMETUS_L 0x410 +#define P_REG_TIMETUS_U 0x414 + +#define P_REG_TIMETUS_LOW_M 0xFF +#define P_REG_TIMETUS_HIGH_S 8 + +/* PHY window length registers */ +#define P_REG_WL 0x40C + +#define 
PTP_VERNIER_WL 0x111ed + +/* PHY start registers */ +#define P_REG_PS 0x408 +#define P_REG_PS_START_S 0 +#define P_REG_PS_START_M BIT(0) +#define P_REG_PS_BYPASS_MODE_S 1 +#define P_REG_PS_BYPASS_MODE_M BIT(1) +#define P_REG_PS_ENA_CLK_S 2 +#define P_REG_PS_ENA_CLK_M BIT(2) +#define P_REG_PS_LOAD_OFFSET_S 3 +#define P_REG_PS_LOAD_OFFSET_M BIT(3) +#define P_REG_PS_SFT_RESET_S 11 +#define P_REG_PS_SFT_RESET_M BIT(11) + +/* PHY offset valid registers */ +#define P_REG_TX_OV_STATUS 0x4D4 +#define P_REG_TX_OV_STATUS_OV_S 0 +#define P_REG_TX_OV_STATUS_OV_M BIT(0) +#define P_REG_RX_OV_STATUS 0x4F8 +#define P_REG_RX_OV_STATUS_OV_S 0 +#define P_REG_RX_OV_STATUS_OV_M BIT(0) + +/* PHY offset ready registers */ +#define P_REG_TX_OR 0x45C +#define P_REG_RX_OR 0x47C + +/* PHY total offset registers */ +#define P_REG_TOTAL_RX_OFFSET_L 0x460 +#define P_REG_TOTAL_RX_OFFSET_U 0x464 +#define P_REG_TOTAL_TX_OFFSET_L 0x440 +#define P_REG_TOTAL_TX_OFFSET_U 0x444 + +/* Timestamp PAR/PCS registers */ +#define P_REG_UIX66_10G_40G_L 0x480 +#define P_REG_UIX66_10G_40G_U 0x484 +#define P_REG_UIX66_25G_100G_L 0x488 +#define P_REG_UIX66_25G_100G_U 0x48C +#define P_REG_DESK_PAR_RX_TUS_L 0x490 +#define P_REG_DESK_PAR_RX_TUS_U 0x494 +#define P_REG_DESK_PAR_TX_TUS_L 0x498 +#define P_REG_DESK_PAR_TX_TUS_U 0x49C +#define P_REG_DESK_PCS_RX_TUS_L 0x4A0 +#define P_REG_DESK_PCS_RX_TUS_U 0x4A4 +#define P_REG_DESK_PCS_TX_TUS_L 0x4A8 +#define P_REG_DESK_PCS_TX_TUS_U 0x4AC +#define P_REG_PAR_RX_TUS_L 0x420 +#define P_REG_PAR_RX_TUS_U 0x424 +#define P_REG_PAR_TX_TUS_L 0x428 +#define P_REG_PAR_TX_TUS_U 0x42C +#define P_REG_PCS_RX_TUS_L 0x430 +#define P_REG_PCS_RX_TUS_U 0x434 +#define P_REG_PCS_TX_TUS_L 0x438 +#define P_REG_PCS_TX_TUS_U 0x43C +#define P_REG_PAR_RX_TIME_L 0x4F0 +#define P_REG_PAR_RX_TIME_U 0x4F4 +#define P_REG_PAR_TX_TIME_L 0x4CC +#define P_REG_PAR_TX_TIME_U 0x4D0 +#define P_REG_PAR_PCS_RX_OFFSET_L 0x4E8 +#define P_REG_PAR_PCS_RX_OFFSET_U 0x4EC +#define P_REG_PAR_PCS_TX_OFFSET_L 0x4C4 +#define P_REG_PAR_PCS_TX_OFFSET_U 0x4C8 +#define P_REG_LINK_SPEED 0x4FC +#define P_REG_LINK_SPEED_SERDES_S 0 +#define P_REG_LINK_SPEED_SERDES_M ICE_M(0x7, 0) +#define P_REG_LINK_SPEED_FEC_MODE_S 3 +#define P_REG_LINK_SPEED_FEC_MODE_M ICE_M(0x3, 3) +#define P_REG_LINK_SPEED_FEC_MODE(reg) \ + (((reg) & P_REG_LINK_SPEED_FEC_MODE_M) >> \ + P_REG_LINK_SPEED_FEC_MODE_S) + +/* PHY timestamp related registers */ +#define P_REG_PMD_ALIGNMENT 0x0FC +#define P_REG_RX_80_TO_160_CNT 0x6FC +#define P_REG_RX_80_TO_160_CNT_RXCYC_S 0 +#define P_REG_RX_80_TO_160_CNT_RXCYC_M BIT(0) +#define P_REG_RX_40_TO_160_CNT 0x8FC +#define P_REG_RX_40_TO_160_CNT_RXCYC_S 0 +#define P_REG_RX_40_TO_160_CNT_RXCYC_M ICE_M(0x3, 0) + +/* Rx FIFO status registers */ +#define P_REG_RX_OV_FS 0x4F8 +#define P_REG_RX_OV_FS_FIFO_STATUS_S 2 +#define P_REG_RX_OV_FS_FIFO_STATUS_M ICE_M(0x3FF, 2) + +/* Timestamp command registers */ +#define P_REG_TX_TMR_CMD 0x448 +#define P_REG_RX_TMR_CMD 0x468 + +/* E810 timesync enable register */ +#define ETH_GLTSYN_ENA(_i) (0x03000348 + ((_i) * 4)) + +/* E810 shadow init time registers */ +#define ETH_GLTSYN_SHTIME_0(i) (0x03000368 + ((i) * 32)) +#define ETH_GLTSYN_SHTIME_L(i) (0x0300036C + ((i) * 32)) + +/* E810 shadow time adjust registers */ +#define ETH_GLTSYN_SHADJ_L(_i) (0x03000378 + ((_i) * 32)) +#define ETH_GLTSYN_SHADJ_H(_i) (0x0300037C + ((_i) * 32)) + +/* E810 timer command register */ +#define ETH_GLTSYN_CMD 0x03000344 + +/* Source timer incval macros */ +#define INCVAL_HIGH_M 0xFF + +/* Timestamp block macros */ +#define TS_LOW_M 
0xFFFFFFFF +#define TS_HIGH_M 0xFF +#define TS_HIGH_S 32 + +#define TS_PHY_LOW_M 0xFF +#define TS_PHY_HIGH_M 0xFFFFFFFF +#define TS_PHY_HIGH_S 8 + +#define BYTES_PER_IDX_ADDR_L_U 8 +#define BYTES_PER_IDX_ADDR_L 4 + +/* Internal PHY timestamp address */ +#define TS_L(a, idx) ((a) + ((idx) * BYTES_PER_IDX_ADDR_L_U)) +#define TS_H(a, idx) ((a) + ((idx) * BYTES_PER_IDX_ADDR_L_U + \ + BYTES_PER_IDX_ADDR_L)) + +/* External PHY timestamp address */ +#define TS_EXT(a, port, idx) ((a) + (0x1000 * (port)) + \ + ((idx) * BYTES_PER_IDX_ADDR_L_U)) + +#define LOW_TX_MEMORY_BANK_START 0x03090000 +#define HIGH_TX_MEMORY_BANK_START 0x03090004 + +#endif /* _ICE_PTP_HW_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_repr.c b/drivers/net/ethernet/intel/ice/ice_repr.c new file mode 100644 index 000000000000..0f4d9b6e8a33 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_repr.c @@ -0,0 +1,467 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice.h" +#include "ice_eswitch.h" +#if IS_ENABLED(CONFIG_NET_DEVLINK) +#include "ice_devlink.h" +#endif /* CONFIG_NET_DEVLINK */ +#include "ice_virtchnl_pf.h" +#include "ice_tc_lib.h" + +#ifdef HAVE_NDO_GET_PHYS_PORT_NAME +/** + * ice_repr_get_sw_port_id - get port ID associated with representor + * @repr: pointer to port representor + */ +static int ice_repr_get_sw_port_id(struct ice_repr *repr) +{ + return repr->vf->pf->hw.port_info->lport; +} + +/** + * ice_repr_get_phys_port_name - get phys port name + * @netdev: pointer to port representor netdev + * @buf: write here port name + * @len: max length of buf + */ +static int +ice_repr_get_phys_port_name(struct net_device *netdev, char *buf, size_t len) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_repr *repr = np->repr; + int res; + +#if IS_ENABLED(CONFIG_NET_DEVLINK) + /* Devlink port is registered and devlink core is taking care of name formatting. 
*/ + if (repr->vf->devlink_port.registered) + return -EOPNOTSUPP; +#endif /* CONFIG_NET_DEVLINK */ + + res = snprintf(buf, len, "pf%dvfr%d", ice_repr_get_sw_port_id(repr), + repr->vf->vf_id); + if (res <= 0) + return -EOPNOTSUPP; + return 0; +} +#endif /* HAVE_NDO_GET_PHYS_PORT_NAME */ + +/** + * ice_repr_get_stats64 - get VF stats for VFPR use + * @netdev: pointer to port representor netdev + * @stats: pointer to struct where stats can be stored + */ +#ifdef HAVE_VOID_NDO_GET_STATS64 +static void +#else +static struct rtnl_link_stats64 * +#endif +ice_repr_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_eth_stats *eth_stats; + struct ice_vsi *vsi; + + if (ice_check_vf_ready_for_cfg(np->repr->vf)) +#ifdef HAVE_VOID_NDO_GET_STATS64 + return; +#else + return stats; +#endif + vsi = np->repr->src_vsi; + + ice_update_vsi_stats(vsi); + eth_stats = &vsi->eth_stats; + + stats->tx_packets = eth_stats->tx_unicast + eth_stats->tx_broadcast + + eth_stats->tx_multicast; + stats->rx_packets = eth_stats->rx_unicast + eth_stats->rx_broadcast + + eth_stats->rx_multicast; + stats->tx_bytes = eth_stats->tx_bytes; + stats->rx_bytes = eth_stats->rx_bytes; + stats->multicast = eth_stats->rx_multicast; + stats->tx_errors = eth_stats->tx_errors; + stats->tx_dropped = eth_stats->tx_discards; + stats->rx_dropped = eth_stats->rx_discards; +#ifndef HAVE_VOID_NDO_GET_STATS64 + + return stats; +#endif +} + +/** + * ice_netdev_to_repr - Get port representor for given netdevice + * @netdev: pointer to port representor netdev + */ +struct ice_repr *ice_netdev_to_repr(struct net_device *netdev) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + + return np->repr; +} + +/** + * ice_repr_open - Enable port representor's network interface + * @netdev: network interface device structure + * + * The open entry point is called when a port representor's network + * interface is made active by the system (IFF_UP). Corresponding + * VF is notified about link status change. + * + * Returns 0 on success, negative value on failure + */ +static int ice_repr_open(struct net_device *netdev) +{ + struct ice_repr *repr = ice_netdev_to_repr(netdev); + struct ice_vf *vf; + + vf = repr->vf; + vf->link_forced = true; + vf->link_up = true; + ice_vc_notify_vf_link_state(vf); + + netif_carrier_on(netdev); + netif_tx_start_all_queues(netdev); + + return 0; +} + +/** + * ice_repr_stop - Disable port representor's network interface + * @netdev: network interface device structure + * + * The stop entry point is called when a port representor's network + * interface is de-activated by the system. Corresponding + * VF is notified about link status change. 
+ * + * Returns 0 on success, negative value on failure + */ +static int ice_repr_stop(struct net_device *netdev) +{ + struct ice_repr *repr = ice_netdev_to_repr(netdev); + struct ice_vf *vf; + + vf = repr->vf; + vf->link_forced = true; + vf->link_up = false; + ice_vc_notify_vf_link_state(vf); + + netif_carrier_off(netdev); + netif_tx_stop_all_queues(netdev); + + return 0; +} + +#if IS_ENABLED(CONFIG_NET_DEVLINK) && defined(HAVE_DEVLINK_PORT_ATTR_PCI_VF) +static struct devlink_port * +ice_repr_get_devlink_port(struct net_device *netdev) +{ + struct ice_repr *repr = ice_netdev_to_repr(netdev); + + return &repr->vf->devlink_port; +} +#endif /* CONFIG_NET_DEVLINK && HAVE_DEVLINK_PORT_ATTR_PCI_VF*/ + +#ifdef HAVE_TC_SETUP_CLSFLOWER +static int +ice_repr_setup_tc_cls_flower(struct ice_repr *repr, + struct flow_cls_offload *flower) +{ + switch (flower->command) { + case FLOW_CLS_REPLACE: + return ice_add_cls_flower(repr->netdev, repr->src_vsi, flower); + case FLOW_CLS_DESTROY: + return ice_del_cls_flower(repr->src_vsi, flower); + default: + return -EINVAL; + } +} +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +static int +ice_repr_setup_tc_block_cb(enum tc_setup_type type, void *type_data, + void *cb_priv) +{ + struct flow_cls_offload *flower = (struct flow_cls_offload *)type_data; + struct ice_netdev_priv *np = (struct ice_netdev_priv *)cb_priv; + + switch (type) { + case TC_SETUP_CLSFLOWER: + return ice_repr_setup_tc_cls_flower(np->repr, flower); + default: + return -EOPNOTSUPP; + } +} + +static LIST_HEAD(ice_repr_block_cb_list); +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + +static int +#ifdef HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV +ice_repr_setup_tc(struct net_device *netdev, enum tc_setup_type type, + void *type_data) +#elif defined(HAVE_NDO_SETUP_TC_CHAIN_INDEX) +ice_repr_setup_tc(struct net_device *netdev, u32 __always_unused handle, + __always_unused chain_index, __be16 proto, + struct tc_to_netdev *tc) +#else +ice_repr_setup_tc(struct net_device *netdev, u32 __always_unused handle, + __be16 __always_unused proto, struct tc_to_netdev *tc) +#endif +{ +#ifndef HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV + struct tc_cls_flower_offload *cls_flower = tc->cls_flower; + unsigned int type = tc->type; +#elif !defined(HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO) + struct tc_cls_flower_offload *cls_flower = (struct + tc_cls_flower_offload *) + type_data; +#endif /* HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV */ + struct ice_netdev_priv *np = netdev_priv(netdev); + + switch (type) { +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + case TC_SETUP_BLOCK: + return flow_block_cb_setup_simple((struct flow_block_offload *) + type_data, + &ice_repr_block_cb_list, + ice_repr_setup_tc_block_cb, + np, np, true); +#elif !defined(HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV) || !defined(HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO) + case TC_SETUP_CLSFLOWER: + return ice_repr_setup_tc_cls_flower(np->repr, cls_flower); +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + default: + return -EOPNOTSUPP; + } +} +#endif /* ESWITCH_SUPPORT */ + +static const struct net_device_ops ice_repr_netdev_ops = { +#ifdef HAVE_NDO_GET_PHYS_PORT_NAME + .ndo_get_phys_port_name = ice_repr_get_phys_port_name, +#endif /* HAVE_NDO_GET_PHYS_PORT_NAME */ + .ndo_get_stats64 = ice_repr_get_stats64, + .ndo_open = ice_repr_open, + .ndo_stop = ice_repr_stop, +#if IS_ENABLED(CONFIG_NET_DEVLINK) + .ndo_start_xmit = ice_eswitch_port_start_xmit, +#ifdef HAVE_DEVLINK_PORT_ATTR_PCI_VF + .ndo_get_devlink_port = ice_repr_get_devlink_port, +#endif /* HAVE_DEVLINK_PORT_ATTR_PCI_VF */ +#ifdef 
HAVE_TC_SETUP_CLSFLOWER +#ifdef HAVE_RHEL7_NETDEV_OPS_EXT_NDO_SETUP_TC + .extended.ndo_setup_tc_rh = ice_repr_setup_tc, +#else + .ndo_setup_tc = ice_repr_setup_tc, +#endif /* HAVE_RHEL7_NETDEV_OPS_EXT_NDO_SETUP_TC */ +#endif /* HAVE_TC_SETUP_CLSFLOWER */ +#endif /* CONFIG_NET_DEVLINK */ +}; + +/** + * ice_is_port_repr_netdev - Check if a given netdevice is a port representor + * netdev + * @netdev: pointer to netdev + */ +bool ice_is_port_repr_netdev(struct net_device *netdev) +{ + return netdev && (netdev->netdev_ops == &ice_repr_netdev_ops); +} + +/** + * ice_repr_reg_netdev - register port representor netdev + * @netdev: pointer to port representor netdev + */ +static int +ice_repr_reg_netdev(struct net_device *netdev) +{ + eth_hw_addr_random(netdev); + netdev->netdev_ops = &ice_repr_netdev_ops; + ice_set_ethtool_repr_ops(netdev); + +#ifdef NETIF_F_HW_TC + netdev->hw_features |= NETIF_F_HW_TC; +#endif /* NETIF_F_HW_TC */ + + netif_carrier_off(netdev); + netif_tx_stop_all_queues(netdev); + + return register_netdev(netdev); +} + +/** + * ice_repr_add - add representor for VF + * @vf: pointer to VF structure + */ +static int ice_repr_add(struct ice_vf *vf) +{ + struct ice_q_vector *q_vector; + struct ice_netdev_priv *np; + struct ice_repr *repr; + int err; + + repr = kzalloc(sizeof(*repr), GFP_KERNEL); + if (!repr) + return -ENOMEM; + + repr->netdev = alloc_etherdev(sizeof(struct ice_netdev_priv)); + if (!repr->netdev) { + err = -ENOMEM; + goto err_alloc; + } + + repr->src_vsi = ice_get_vf_vsi(vf); + repr->vf = vf; + vf->repr = repr; + np = netdev_priv(repr->netdev); + np->repr = repr; + + q_vector = kzalloc(sizeof(*q_vector), GFP_KERNEL); + if (!q_vector) { + err = -ENOMEM; + goto err_alloc_q_vector; + } + repr->q_vector = q_vector; + +#if IS_ENABLED(CONFIG_NET_DEVLINK) +#ifdef HAVE_DEVLINK_PORT_ATTR_PCI_VF + err = ice_devlink_create_vf_port(vf); + if (err) + goto err_devlink; +#endif /* HAVE_DEVLINK_PORT_ATTR_PCI_VF */ +#endif /* CONFIG_NET_DEVLINK */ + + err = ice_repr_reg_netdev(repr->netdev); + if (err) + goto err_netdev; + +#if IS_ENABLED(CONFIG_NET_DEVLINK) +#ifdef HAVE_DEVLINK_PORT_ATTR_PCI_VF + devlink_port_type_eth_set(&vf->devlink_port, repr->netdev); +#endif /* HAVE_DEVLINK_PORT_ATTR_PCI_VF */ +#endif /* CONFIG_NET_DEVLINK */ + + return 0; + +err_netdev: +#if IS_ENABLED(CONFIG_NET_DEVLINK) +#ifdef HAVE_DEVLINK_PORT_ATTR_PCI_VF + ice_devlink_destroy_vf_port(vf); +err_devlink: +#endif /* HAVE_DEVLINK_PORT_ATTR_PCI_VF */ +#endif /* CONFIG_NET_DEVLINK */ + kfree(repr->q_vector); + vf->repr->q_vector = NULL; +err_alloc_q_vector: + free_netdev(repr->netdev); + repr->netdev = NULL; +err_alloc: + kfree(repr); + vf->repr = NULL; + return err; +} + +/** + * ice_repr_rem - remove representor from VF + * @vf: pointer to VF structure + */ +static void ice_repr_rem(struct ice_vf *vf) +{ +#if IS_ENABLED(CONFIG_NET_DEVLINK) +#ifdef HAVE_DEVLINK_PORT_ATTR_PCI_VF + ice_devlink_destroy_vf_port(vf); +#endif /* HAVE_DEVLINK_PORT_ATTR_PCI_VF */ +#endif /* CONFIG_NET_DEVLINK */ + kfree(vf->repr->q_vector); + vf->repr->q_vector = NULL; + unregister_netdev(vf->repr->netdev); + free_netdev(vf->repr->netdev); + vf->repr->netdev = NULL; + kfree(vf->repr); + vf->repr = NULL; +} + + +/** + * ice_repr_add_for_all_vfs - add port representor for all VFs + * @pf: pointer to PF structure + */ +int ice_repr_add_for_all_vfs(struct ice_pf *pf) +{ + int err; + int i; + + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; + + err = ice_repr_add(vf); + if (err) + goto err; + + 
ice_vc_change_ops_to_repr(&vf->vc_ops); + } + + return 0; + +err: + for (i = i - 1; i >= 0; i--) { + struct ice_vf *vf = &pf->vf[i]; + + ice_repr_rem(vf); + ice_vc_set_dflt_vf_ops(&vf->vc_ops); + } + + return err; +} + +/** + * ice_repr_rem_from_all_vfs - remove port representor for all VFs + * @pf: pointer to PF structure + */ +void ice_repr_rem_from_all_vfs(struct ice_pf *pf) +{ + int i; + + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; + + ice_repr_rem(vf); + ice_vc_set_dflt_vf_ops(&vf->vc_ops); + } +} + +/** + * ice_repr_start_tx_queues - start Tx queues of port representor + * @repr: pointer to repr structure + */ +void ice_repr_start_tx_queues(struct ice_repr *repr) +{ + netif_carrier_on(repr->netdev); + netif_tx_start_all_queues(repr->netdev); +} + +/** + * ice_repr_stop_tx_queues - stop Tx queues of port representor + * @repr: pointer to repr structure + */ +void ice_repr_stop_tx_queues(struct ice_repr *repr) +{ + netif_carrier_off(repr->netdev); + netif_tx_stop_all_queues(repr->netdev); +} + +#ifdef HAVE_METADATA_PORT_INFO +/** + * ice_repr_set_traffic_vsi - set traffic VSI for port representor + * @repr: repr on with VSI will be set + * @vsi: pointer to VSI that will be used by port representor to pass traffic + */ +void ice_repr_set_traffic_vsi(struct ice_repr *repr, struct ice_vsi *vsi) +{ + struct ice_netdev_priv *np = netdev_priv(repr->netdev); + + np->vsi = vsi; +} +#endif /* HAVE_METADATA_PORT_INFO */ diff --git a/drivers/net/ethernet/intel/ice/ice_repr.h b/drivers/net/ethernet/intel/ice/ice_repr.h new file mode 100644 index 000000000000..a20c9d5ebe4f --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_repr.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_REPR_H_ +#define _ICE_REPR_H_ +#include "ice.h" + +struct ice_repr { + struct ice_vsi *src_vsi; + struct ice_vf *vf; + struct ice_q_vector *q_vector; + struct net_device *netdev; + struct metadata_dst *dst; +}; + +int ice_repr_add_for_all_vfs(struct ice_pf *pf); +void ice_repr_rem_from_all_vfs(struct ice_pf *pf); +void ice_repr_start_tx_queues(struct ice_repr *repr); +void ice_repr_stop_tx_queues(struct ice_repr *repr); +#ifdef HAVE_METADATA_PORT_INFO +void ice_repr_set_traffic_vsi(struct ice_repr *repr, struct ice_vsi *vsi); +#endif /* HAVE_METADATA_PORT_INFO */ +#if IS_ENABLED(CONFIG_NET_DEVLINK) +bool ice_is_port_repr_netdev(struct net_device *netdev); +struct ice_repr *ice_netdev_to_repr(struct net_device *netdev); +#else +static inline +bool ice_is_port_repr_netdev(struct net_device *netdev) { return false; } +static inline +struct ice_repr *ice_netdev_to_repr(struct net_device *netdev) { return NULL; } +#endif /* CONFIG_NET_DEVLINK */ +#endif diff --git a/drivers/net/ethernet/intel/ice/ice_sbq_cmd.h b/drivers/net/ethernet/intel/ice/ice_sbq_cmd.h new file mode 100644 index 000000000000..0610fd43fd53 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_sbq_cmd.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_SBQ_CMD_H_ +#define _ICE_SBQ_CMD_H_ + +/* This header file defines the Sideband Queue commands, error codes and + * descriptor format. It is shared between Firmware and Software. + */ + +/* Sideband Queue command structure and opcodes */ +enum ice_sbq_opc { + /* Sideband Queue commands */ + ice_sbq_opc_neigh_dev_req = 0x0C00, + ice_sbq_opc_neigh_dev_ev = 0x0C01 +}; + +/* Sideband Queue descriptor. 
Indirect command + * and non posted + */ +struct ice_sbq_cmd_desc { + __le16 flags; + __le16 opcode; + __le16 datalen; + __le16 cmd_retval; + + /* Opaque message data */ + __le32 cookie_high; + __le32 cookie_low; + + union { + __le16 cmd_len; + __le16 cmpl_len; + } param0; + + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + +struct ice_sbq_evt_desc { + __le16 flags; + __le16 opcode; + __le16 datalen; + __le16 cmd_retval; + u8 data[24]; +}; + +enum ice_sbq_msg_dev { + rmn_0 = 0x02, + rmn_1 = 0x03, + rmn_2 = 0x04, + cgu = 0x06 +}; + +enum ice_sbq_msg_opcode { + ice_sbq_msg_rd = 0x00, + ice_sbq_msg_wr = 0x01 +}; + +#define ICE_SBQ_MSG_FLAGS 0x40 +#define ICE_SBQ_MSG_SBE_FBE 0x0F + +struct ice_sbq_msg_req { + u8 dest_dev; + u8 src_dev; + u8 opcode; + u8 flags; + u8 sbe_fbe; + u8 func_id; + __le16 msg_addr_low; + __le32 msg_addr_high; + __le32 data; +}; + +struct ice_sbq_msg_cmpl { + u8 dest_dev; + u8 src_dev; + u8 opcode; + u8 flags; + __le32 data; +}; + +/* Internal struct */ +struct ice_sbq_msg_input { + u8 dest_dev; + u8 opcode; + u16 msg_addr_low; + u32 msg_addr_high; + u32 data; +}; +#endif /* _ICE_SBQ_CMD_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_sched.c b/drivers/net/ethernet/intel/ice/ice_sched.c index 2fde9653a608..bcfed501e999 100644 --- a/drivers/net/ethernet/intel/ice/ice_sched.c +++ b/drivers/net/ethernet/intel/ice/ice_sched.c @@ -1,8 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #include "ice_sched.h" + + /** * ice_sched_add_root_node - Insert the Tx scheduler root node in SW DB * @pi: port information structure @@ -129,7 +131,7 @@ ice_aqc_send_sched_elem_cmd(struct ice_hw *hw, enum ice_adminq_opc cmd_opc, */ enum ice_status ice_aq_query_sched_elems(struct ice_hw *hw, u16 elems_req, - struct ice_aqc_get_elem *buf, u16 buf_size, + struct ice_aqc_txsched_elem_data *buf, u16 buf_size, u16 *elems_ret, struct ice_sq_cd *cd) { return ice_aqc_send_sched_elem_cmd(hw, ice_aqc_opc_get_sched_elems, @@ -149,8 +151,8 @@ enum ice_status ice_sched_add_node(struct ice_port_info *pi, u8 layer, struct ice_aqc_txsched_elem_data *info) { + struct ice_aqc_txsched_elem_data elem; struct ice_sched_node *parent; - struct ice_aqc_get_elem elem; struct ice_sched_node *node; enum ice_status status; struct ice_hw *hw; @@ -164,19 +166,17 @@ ice_sched_add_node(struct ice_port_info *pi, u8 layer, parent = ice_sched_find_node_by_teid(pi->root, le32_to_cpu(info->parent_teid)); if (!parent) { - ice_debug(hw, ICE_DBG_SCHED, - "Parent Node not found for parent_teid=0x%x\n", + ice_debug(hw, ICE_DBG_SCHED, "Parent Node not found for parent_teid=0x%x\n", le32_to_cpu(info->parent_teid)); return ICE_ERR_PARAM; } - /* query the current node information from FW before additing it + /* query the current node information from FW before adding it * to the SW DB */ status = ice_sched_query_elem(hw, le32_to_cpu(info->node_teid), &elem); if (status) return status; - node = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*node), GFP_KERNEL); if (!node) return ICE_ERR_NO_MEMORY; @@ -195,7 +195,7 @@ ice_sched_add_node(struct ice_port_info *pi, u8 layer, node->parent = parent; node->tx_sched_layer = layer; parent->children[parent->num_children++] = node; - memcpy(&node->info, &elem.generic[0], sizeof(node->info)); + node->info = elem; return 0; } @@ -238,7 +238,7 @@ ice_sched_remove_elems(struct ice_hw *hw, struct ice_sched_node *parent, enum ice_status status; u16 buf_size; - buf_size = sizeof(*buf) + sizeof(u32) * 
(num_nodes - 1); + buf_size = struct_size(buf, teid, num_nodes); buf = devm_kzalloc(ice_hw_to_dev(hw), buf_size, GFP_KERNEL); if (!buf) return ICE_ERR_NO_MEMORY; @@ -370,7 +370,7 @@ void ice_free_sched_node(struct ice_port_info *pi, struct ice_sched_node *node) * * Get default scheduler topology (0x400) */ -static enum ice_status +enum ice_status ice_aq_get_dflt_topo(struct ice_hw *hw, u8 lport, struct ice_aqc_get_topo_elem *buf, u16 buf_size, u8 *num_branches, struct ice_sq_cd *cd) @@ -410,6 +410,48 @@ ice_aq_add_sched_elems(struct ice_hw *hw, u16 grps_req, grps_added, cd); } +/** + * ice_aq_cfg_sched_elems - configures scheduler elements + * @hw: pointer to the HW struct + * @elems_req: number of elements to configure + * @buf: pointer to buffer + * @buf_size: buffer size in bytes + * @elems_cfgd: returns total number of elements configured + * @cd: pointer to command details structure or NULL + * + * Configure scheduling elements (0x0403) + */ +static enum ice_status +ice_aq_cfg_sched_elems(struct ice_hw *hw, u16 elems_req, + struct ice_aqc_txsched_elem_data *buf, u16 buf_size, + u16 *elems_cfgd, struct ice_sq_cd *cd) +{ + return ice_aqc_send_sched_elem_cmd(hw, ice_aqc_opc_cfg_sched_elems, + elems_req, (void *)buf, buf_size, + elems_cfgd, cd); +} + +/** + * ice_aq_move_sched_elems - move scheduler elements + * @hw: pointer to the HW struct + * @grps_req: number of groups to move + * @buf: pointer to buffer + * @buf_size: buffer size in bytes + * @grps_movd: returns total number of groups moved + * @cd: pointer to command details structure or NULL + * + * Move scheduling elements (0x0408) + */ +static enum ice_status +ice_aq_move_sched_elems(struct ice_hw *hw, u16 grps_req, + struct ice_aqc_move_elem *buf, u16 buf_size, + u16 *grps_movd, struct ice_sq_cd *cd) +{ + return ice_aqc_send_sched_elem_cmd(hw, ice_aqc_opc_move_sched_elems, + grps_req, (void *)buf, buf_size, + grps_movd, cd); +} + /** * ice_aq_suspend_sched_elems - suspend scheduler elements * @hw: pointer to the HW struct @@ -422,8 +464,7 @@ ice_aq_add_sched_elems(struct ice_hw *hw, u16 grps_req, * Suspend scheduling elements (0x0409) */ static enum ice_status -ice_aq_suspend_sched_elems(struct ice_hw *hw, u16 elems_req, - struct ice_aqc_suspend_resume_elem *buf, +ice_aq_suspend_sched_elems(struct ice_hw *hw, u16 elems_req, __le32 *buf, u16 buf_size, u16 *elems_ret, struct ice_sq_cd *cd) { return ice_aqc_send_sched_elem_cmd(hw, ice_aqc_opc_suspend_sched_elems, @@ -443,8 +484,7 @@ ice_aq_suspend_sched_elems(struct ice_hw *hw, u16 elems_req, * resume scheduling elements (0x040A) */ static enum ice_status -ice_aq_resume_sched_elems(struct ice_hw *hw, u16 elems_req, - struct ice_aqc_suspend_resume_elem *buf, +ice_aq_resume_sched_elems(struct ice_hw *hw, u16 elems_req, __le32 *buf, u16 buf_size, u16 *elems_ret, struct ice_sq_cd *cd) { return ice_aqc_send_sched_elem_cmd(hw, ice_aqc_opc_resume_sched_elems, @@ -485,9 +525,9 @@ static enum ice_status ice_sched_suspend_resume_elems(struct ice_hw *hw, u8 num_nodes, u32 *node_teids, bool suspend) { - struct ice_aqc_suspend_resume_elem *buf; u16 i, buf_size, num_elem_ret = 0; enum ice_status status; + __le32 *buf; buf_size = sizeof(*buf) * num_nodes; buf = devm_kzalloc(ice_hw_to_dev(hw), buf_size, GFP_KERNEL); @@ -495,7 +535,7 @@ ice_sched_suspend_resume_elems(struct ice_hw *hw, u8 num_nodes, u32 *node_teids, return ICE_ERR_NO_MEMORY; for (i = 0; i < num_nodes; i++) - buf->teid[i] = cpu_to_le32(node_teids[i]); + buf[i] = cpu_to_le32(node_teids[i]); if (suspend) status = 
ice_aq_suspend_sched_elems(hw, num_nodes, buf, @@ -556,6 +596,208 @@ ice_alloc_lan_q_ctx(struct ice_hw *hw, u16 vsi_handle, u8 tc, u16 new_numqs) return 0; } +/** + * ice_alloc_rdma_q_ctx - allocate RDMA queue contexts for the given VSI and TC + * @hw: pointer to the HW struct + * @vsi_handle: VSI handle + * @tc: TC number + * @new_numqs: number of queues + */ +static enum ice_status +ice_alloc_rdma_q_ctx(struct ice_hw *hw, u16 vsi_handle, u8 tc, u16 new_numqs) +{ + struct ice_vsi_ctx *vsi_ctx; + struct ice_q_ctx *q_ctx; + + vsi_ctx = ice_get_vsi_ctx(hw, vsi_handle); + if (!vsi_ctx) + return ICE_ERR_PARAM; + /* allocate RDMA queue contexts */ + if (!vsi_ctx->rdma_q_ctx[tc]) { + vsi_ctx->rdma_q_ctx[tc] = devm_kcalloc(ice_hw_to_dev(hw), + new_numqs, + sizeof(*q_ctx), + GFP_KERNEL); + if (!vsi_ctx->rdma_q_ctx[tc]) + return ICE_ERR_NO_MEMORY; + vsi_ctx->num_rdma_q_entries[tc] = new_numqs; + return 0; + } + /* num queues are increased, update the queue contexts */ + if (new_numqs > vsi_ctx->num_rdma_q_entries[tc]) { + u16 prev_num = vsi_ctx->num_rdma_q_entries[tc]; + + q_ctx = devm_kcalloc(ice_hw_to_dev(hw), new_numqs, + sizeof(*q_ctx), GFP_KERNEL); + if (!q_ctx) + return ICE_ERR_NO_MEMORY; + memcpy(q_ctx, vsi_ctx->rdma_q_ctx[tc], + prev_num * sizeof(*q_ctx)); + devm_kfree(ice_hw_to_dev(hw), vsi_ctx->rdma_q_ctx[tc]); + vsi_ctx->rdma_q_ctx[tc] = q_ctx; + vsi_ctx->num_rdma_q_entries[tc] = new_numqs; + } + return 0; +} + +/** + * ice_aq_rl_profile - performs a rate limiting task + * @hw: pointer to the HW struct + * @opcode: opcode for add, query, or remove profile(s) + * @num_profiles: the number of profiles + * @buf: pointer to buffer + * @buf_size: buffer size in bytes + * @num_processed: number of processed add or remove profile(s) to return + * @cd: pointer to command details structure + * + * RL profile function to add, query, or remove profile(s) + */ +static enum ice_status +ice_aq_rl_profile(struct ice_hw *hw, enum ice_adminq_opc opcode, + u16 num_profiles, struct ice_aqc_rl_profile_elem *buf, + u16 buf_size, u16 *num_processed, struct ice_sq_cd *cd) +{ + struct ice_aqc_rl_profile *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + cmd = &desc.params.rl_profile; + + ice_fill_dflt_direct_cmd_desc(&desc, opcode); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + cmd->num_profiles = cpu_to_le16(num_profiles); + status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); + if (!status && num_processed) + *num_processed = le16_to_cpu(cmd->num_processed); + return status; +} + +/** + * ice_aq_add_rl_profile - adds rate limiting profile(s) + * @hw: pointer to the HW struct + * @num_profiles: the number of profile(s) to be add + * @buf: pointer to buffer + * @buf_size: buffer size in bytes + * @num_profiles_added: total number of profiles added to return + * @cd: pointer to command details structure + * + * Add RL profile (0x0410) + */ +static enum ice_status +ice_aq_add_rl_profile(struct ice_hw *hw, u16 num_profiles, + struct ice_aqc_rl_profile_elem *buf, u16 buf_size, + u16 *num_profiles_added, struct ice_sq_cd *cd) +{ + return ice_aq_rl_profile(hw, ice_aqc_opc_add_rl_profiles, num_profiles, + buf, buf_size, num_profiles_added, cd); +} + +/** + * ice_aq_query_rl_profile - query rate limiting profile(s) + * @hw: pointer to the HW struct + * @num_profiles: the number of profile(s) to query + * @buf: pointer to buffer + * @buf_size: buffer size in bytes + * @cd: pointer to command details structure + * + * Query RL profile (0x0411) + */ +enum ice_status +ice_aq_query_rl_profile(struct ice_hw 
*hw, u16 num_profiles, + struct ice_aqc_rl_profile_elem *buf, u16 buf_size, + struct ice_sq_cd *cd) +{ + return ice_aq_rl_profile(hw, ice_aqc_opc_query_rl_profiles, + num_profiles, buf, buf_size, NULL, cd); +} + +/** + * ice_aq_remove_rl_profile - removes RL profile(s) + * @hw: pointer to the HW struct + * @num_profiles: the number of profile(s) to remove + * @buf: pointer to buffer + * @buf_size: buffer size in bytes + * @num_profiles_removed: total number of profiles removed to return + * @cd: pointer to command details structure or NULL + * + * Remove RL profile (0x0415) + */ +static enum ice_status +ice_aq_remove_rl_profile(struct ice_hw *hw, u16 num_profiles, + struct ice_aqc_rl_profile_elem *buf, u16 buf_size, + u16 *num_profiles_removed, struct ice_sq_cd *cd) +{ + return ice_aq_rl_profile(hw, ice_aqc_opc_remove_rl_profiles, + num_profiles, buf, buf_size, + num_profiles_removed, cd); +} + +/** + * ice_sched_del_rl_profile - remove RL profile + * @hw: pointer to the HW struct + * @rl_info: rate limit profile information + * + * If the profile ID is not referenced anymore, it removes profile ID with + * its associated parameters from HW DB, and locally. The caller needs to + * hold scheduler lock. + */ +static enum ice_status +ice_sched_del_rl_profile(struct ice_hw *hw, + struct ice_aqc_rl_profile_info *rl_info) +{ + struct ice_aqc_rl_profile_elem *buf; + u16 num_profiles_removed; + enum ice_status status; + u16 num_profiles = 1; + + if (rl_info->prof_id_ref != 0) + return ICE_ERR_IN_USE; + + /* Safe to remove profile ID */ + buf = &rl_info->profile; + status = ice_aq_remove_rl_profile(hw, num_profiles, buf, sizeof(*buf), + &num_profiles_removed, NULL); + if (status || num_profiles_removed != num_profiles) + return ICE_ERR_CFG; + + /* Delete stale entry now */ + list_del(&rl_info->list_entry); + devm_kfree(ice_hw_to_dev(hw), rl_info); + return status; +} + +/** + * ice_sched_clear_rl_prof - clears RL prof entries + * @pi: port information structure + * + * This function removes all RL profiles from HW as well as from SW DB.
+ */ +static void ice_sched_clear_rl_prof(struct ice_port_info *pi) +{ + u16 ln; + struct ice_hw *hw = pi->hw; + + + for (ln = 0; ln < hw->num_tx_sched_layers; ln++) { + struct ice_aqc_rl_profile_info *rl_prof_elem; + struct ice_aqc_rl_profile_info *rl_prof_tmp; + + list_for_each_entry_safe(rl_prof_elem, rl_prof_tmp, + &hw->rl_prof_list[ln], list_entry) { + enum ice_status status; + + rl_prof_elem->prof_id_ref = 0; + status = ice_sched_del_rl_profile(hw, rl_prof_elem); + if (status) { + ice_debug(hw, ICE_DBG_SCHED, "Remove rl profile failed\n"); + /* On error, free mem required */ + list_del(&rl_prof_elem->list_entry); + devm_kfree(ice_hw_to_dev(hw), rl_prof_elem); + } + } + } +} + /** * ice_sched_clear_agg - clears the aggregator related information * @hw: pointer to the hardware structure @@ -592,6 +834,8 @@ static void ice_sched_clear_tx_topo(struct ice_port_info *pi) { if (!pi) return; + /* remove RL profiles related lists */ + ice_sched_clear_rl_prof(pi); if (pi->root) { ice_free_sched_node(pi, pi->root); pi->root = NULL; @@ -632,8 +876,7 @@ void ice_sched_cleanup_all(struct ice_hw *hw) hw->layer_info = NULL; } - if (hw->port_info) - ice_sched_clear_port(hw->port_info); + ice_sched_clear_port(hw->port_info); hw->num_tx_sched_layers = 0; hw->num_tx_sched_phys_layers = 0; @@ -641,6 +884,33 @@ void ice_sched_cleanup_all(struct ice_hw *hw) hw->max_cgds = 0; } +/** + * ice_aq_cfg_l2_node_cgd - configures L2 node to CGD mapping + * @hw: pointer to the HW struct + * @num_l2_nodes: the number of L2 nodes whose CGDs to configure + * @buf: pointer to buffer + * @buf_size: buffer size in bytes + * @cd: pointer to command details structure or NULL + * + * Configure L2 Node CGD (0x0414) + */ +enum ice_status +ice_aq_cfg_l2_node_cgd(struct ice_hw *hw, u16 num_l2_nodes, + struct ice_aqc_cfg_l2_node_cgd_elem *buf, + u16 buf_size, struct ice_sq_cd *cd) +{ + struct ice_aqc_cfg_l2_node_cgd *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.cfg_l2_node_cgd; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_cfg_l2_node_cgd); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + cmd->num_l2_nodes = cpu_to_le16(num_l2_nodes); + return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); +} + + /** * ice_sched_add_elems - add nodes to HW and SW DB * @pi: port information structure @@ -663,10 +933,10 @@ ice_sched_add_elems(struct ice_port_info *pi, struct ice_sched_node *tc_node, u16 i, num_groups_added = 0; enum ice_status status = 0; struct ice_hw *hw = pi->hw; - size_t buf_size; + u16 buf_size; u32 teid; - buf_size = struct_size(buf, generic, num_nodes - 1); + buf_size = struct_size(buf, generic, num_nodes); buf = devm_kzalloc(ice_hw_to_dev(hw), buf_size, GFP_KERNEL); if (!buf) return ICE_ERR_NO_MEMORY; @@ -704,8 +974,7 @@ ice_sched_add_elems(struct ice_port_info *pi, struct ice_sched_node *tc_node, for (i = 0; i < num_nodes; i++) { status = ice_sched_add_node(pi, layer, &buf->generic[i]); if (status) { - ice_debug(hw, ICE_DBG_SCHED, - "add nodes in SW DB failed status =%d\n", + ice_debug(hw, ICE_DBG_SCHED, "add nodes in SW DB failed status =%d\n", status); break; } @@ -713,8 +982,7 @@ ice_sched_add_elems(struct ice_port_info *pi, struct ice_sched_node *tc_node, teid = le32_to_cpu(buf->generic[i].node_teid); new_node = ice_sched_find_node_by_teid(parent, teid); if (!new_node) { - ice_debug(hw, ICE_DBG_SCHED, - "Node is missing for teid =%d\n", teid); + ice_debug(hw, ICE_DBG_SCHED, "Node is missing for teid =%d\n", teid); break; } @@ -743,7 +1011,7 @@ ice_sched_add_elems(struct ice_port_info *pi, struct 
ice_sched_node *tc_node, } /** - * ice_sched_add_nodes_to_layer - Add nodes to a given layer + * ice_sched_add_nodes_to_hw_layer - Add nodes to hw layer * @pi: port information structure * @tc_node: pointer to TC node * @parent: pointer to parent node @@ -752,82 +1020,107 @@ ice_sched_add_elems(struct ice_port_info *pi, struct ice_sched_node *tc_node, * @first_node_teid: pointer to the first node TEID * @num_nodes_added: pointer to number of nodes added * - * This function add nodes to a given layer. + * Add nodes into specific hw layer. */ static enum ice_status -ice_sched_add_nodes_to_layer(struct ice_port_info *pi, - struct ice_sched_node *tc_node, - struct ice_sched_node *parent, u8 layer, - u16 num_nodes, u32 *first_node_teid, - u16 *num_nodes_added) +ice_sched_add_nodes_to_hw_layer(struct ice_port_info *pi, + struct ice_sched_node *tc_node, + struct ice_sched_node *parent, u8 layer, + u16 num_nodes, u32 *first_node_teid, + u16 *num_nodes_added) { - u32 *first_teid_ptr = first_node_teid; - u16 new_num_nodes, max_child_nodes; - enum ice_status status = 0; - struct ice_hw *hw = pi->hw; - u16 num_added = 0; - u32 temp; + u16 max_child_nodes; *num_nodes_added = 0; if (!num_nodes) - return status; + return 0; - if (!parent || layer < hw->sw_entry_point_layer) + if (!parent || layer < pi->hw->sw_entry_point_layer) return ICE_ERR_PARAM; /* max children per node per layer */ - max_child_nodes = hw->max_children[parent->tx_sched_layer]; + max_child_nodes = pi->hw->max_children[parent->tx_sched_layer]; - /* current number of children + required nodes exceed max children ? */ + /* current number of children + required nodes exceed max children */ if ((parent->num_children + num_nodes) > max_child_nodes) { /* Fail if the parent is a TC node */ if (parent == tc_node) return ICE_ERR_CFG; + return ICE_ERR_MAX_LIMIT; + } + + return ice_sched_add_elems(pi, tc_node, parent, layer, num_nodes, + num_nodes_added, first_node_teid); +} + +/** + * ice_sched_add_nodes_to_layer - Add nodes to a given layer + * @pi: port information structure + * @tc_node: pointer to TC node + * @parent: pointer to parent node + * @layer: layer number to add nodes + * @num_nodes: number of nodes to be added + * @first_node_teid: pointer to the first node TEID + * @num_nodes_added: pointer to number of nodes added + * + * This function add nodes to a given layer. + */ +static enum ice_status +ice_sched_add_nodes_to_layer(struct ice_port_info *pi, + struct ice_sched_node *tc_node, + struct ice_sched_node *parent, u8 layer, + u16 num_nodes, u32 *first_node_teid, + u16 *num_nodes_added) +{ + u32 *first_teid_ptr = first_node_teid; + u16 new_num_nodes = num_nodes; + enum ice_status status = 0; + *num_nodes_added = 0; + while (*num_nodes_added < num_nodes) { + u16 max_child_nodes, num_added = 0; + /* cppcheck-suppress unusedVariable */ + u32 temp; + + status = ice_sched_add_nodes_to_hw_layer(pi, tc_node, parent, + layer, new_num_nodes, + first_teid_ptr, + &num_added); + if (!status) + *num_nodes_added += num_added; + /* added more nodes than requested ? 
*/ + if (*num_nodes_added > num_nodes) { + ice_debug(pi->hw, ICE_DBG_SCHED, "added extra nodes %d %d\n", num_nodes, + *num_nodes_added); + status = ICE_ERR_CFG; + break; + } + /* break if all the nodes are added successfully */ + if (!status && (*num_nodes_added == num_nodes)) + break; + /* break if the error is not max limit */ + if (status && status != ICE_ERR_MAX_LIMIT) + break; + /* Exceeded the max children */ + max_child_nodes = pi->hw->max_children[parent->tx_sched_layer]; /* utilize all the spaces if the parent is not full */ if (parent->num_children < max_child_nodes) { new_num_nodes = max_child_nodes - parent->num_children; - /* this recursion is intentional, and wouldn't - * go more than 2 calls + } else { + /* This parent is full, try the next sibling */ + parent = parent->sibling; + /* Don't modify the first node TEID memory if the + * first node was added already in the above call. + * Instead send some temp memory for all other + * recursive calls. */ - status = ice_sched_add_nodes_to_layer(pi, tc_node, - parent, layer, - new_num_nodes, - first_node_teid, - &num_added); - if (status) - return status; + if (num_added) + first_teid_ptr = &temp; - *num_nodes_added += num_added; + new_num_nodes = num_nodes - *num_nodes_added; } - /* Don't modify the first node TEID memory if the first node was - * added already in the above call. Instead send some temp - * memory for all other recursive calls. - */ - if (num_added) - first_teid_ptr = &temp; - - new_num_nodes = num_nodes - num_added; - - /* This parent is full, try the next sibling */ - parent = parent->sibling; - - /* this recursion is intentional, for 1024 queues - * per VSI, it goes max of 16 iterations. - * 1024 / 8 = 128 layer 8 nodes - * 128 /8 = 16 (add 8 nodes per iteration) - */ - status = ice_sched_add_nodes_to_layer(pi, tc_node, parent, - layer, new_num_nodes, - first_teid_ptr, - &num_added); - *num_nodes_added += num_added; - return status; } - - status = ice_sched_add_elems(pi, tc_node, parent, layer, num_nodes, - num_nodes_added, first_node_teid); return status; } @@ -866,6 +1159,29 @@ static u8 ice_sched_get_vsi_layer(struct ice_hw *hw) return hw->sw_entry_point_layer; } +/** + * ice_sched_get_agg_layer - get the current aggregator layer number + * @hw: pointer to the HW struct + * + * This function returns the current aggregator layer number + */ +static u8 ice_sched_get_agg_layer(struct ice_hw *hw) +{ + /* Num Layers aggregator layer + * 9 4 + * 7 or less sw_entry_point_layer + */ + /* calculate the aggregator layer based on number of layers. 
*/ + if (hw->num_tx_sched_layers > ICE_AGG_LAYER_OFFSET + 1) { + u8 layer = hw->num_tx_sched_layers - ICE_AGG_LAYER_OFFSET; + + if (layer > hw->sw_entry_point_layer) + return layer; + } + return hw->sw_entry_point_layer; +} + + /** * ice_rm_dflt_leaf_node - remove the default leaf node in the tree * @pi: port information structure @@ -944,6 +1260,7 @@ enum ice_status ice_sched_init_port(struct ice_port_info *pi) return ICE_ERR_PARAM; hw = pi->hw; + /* Query the Default Topology from FW */ buf = devm_kzalloc(ice_hw_to_dev(hw), ICE_AQ_MAX_BUF_LEN, GFP_KERNEL); if (!buf) @@ -1014,6 +1331,8 @@ enum ice_status ice_sched_init_port(struct ice_port_info *pi) /* initialize the port for handling the scheduler tree */ pi->port_state = ICE_SCHED_PORT_STATE_READY; mutex_init(&pi->sched_lock); + for (i = 0; i < ICE_AQC_TOPO_MAX_LEVEL_NUM; i++) + INIT_LIST_HEAD(&hw->rl_prof_list[i]); err_init_port: if (status && pi->root) { @@ -1025,6 +1344,32 @@ enum ice_status ice_sched_init_port(struct ice_port_info *pi) return status; } +/** + * ice_sched_get_node - Get the struct ice_sched_node for given TEID + * @pi: port information structure + * @teid: Scheduler node TEID + * + * This function retrieves the ice_sched_node struct for given TEID from + * the SW DB and returns it to the caller. + */ +struct ice_sched_node *ice_sched_get_node(struct ice_port_info *pi, u32 teid) +{ + struct ice_sched_node *node; + + if (!pi) + return NULL; + + /* Find the node starting from root */ + mutex_lock(&pi->sched_lock); + node = ice_sched_find_node_by_teid(pi->root, teid); + mutex_unlock(&pi->sched_lock); + + if (!node) + ice_debug(pi->hw, ICE_DBG_SCHED, "Node not found for teid=0x%x\n", teid); + + return node; +} + /** * ice_sched_query_res_alloc - query the FW for num of logical sched layers * @hw: pointer to the HW struct @@ -1036,7 +1381,7 @@ enum ice_status ice_sched_query_res_alloc(struct ice_hw *hw) struct ice_aqc_query_txsched_res_resp *buf; enum ice_status status = 0; __le16 max_sibl; - u16 i; + u8 i; if (hw->layer_info) return status; @@ -1062,25 +1407,65 @@ enum ice_status ice_sched_query_res_alloc(struct ice_hw *hw) * and so on. This array will be populated from root (index 0) to * qgroup layer 7. Leaf node has no children. 
*/ - for (i = 0; i < hw->num_tx_sched_layers; i++) { - max_sibl = buf->layer_props[i].max_sibl_grp_sz; + for (i = 0; i < hw->num_tx_sched_layers - 1; i++) { + max_sibl = buf->layer_props[i + 1].max_sibl_grp_sz; hw->max_children[i] = le16_to_cpu(max_sibl); } hw->layer_info = devm_kmemdup(ice_hw_to_dev(hw), buf->layer_props, - (hw->num_tx_sched_layers * - sizeof(*hw->layer_info)), + (hw->num_tx_sched_layers * sizeof(*hw->layer_info)), GFP_KERNEL); if (!hw->layer_info) { status = ICE_ERR_NO_MEMORY; goto sched_query_out; } + sched_query_out: devm_kfree(ice_hw_to_dev(hw), buf); return status; } +/** + * ice_sched_get_psm_clk_freq - determine the PSM clock frequency + * @hw: pointer to the HW struct + * + * Determine the PSM clock frequency and store in HW struct + */ +void ice_sched_get_psm_clk_freq(struct ice_hw *hw) +{ + u32 val, clk_src; + + val = rd32(hw, GLGEN_CLKSTAT_SRC); + clk_src = (val & GLGEN_CLKSTAT_SRC_PSM_CLK_SRC_M) >> + GLGEN_CLKSTAT_SRC_PSM_CLK_SRC_S; + +#define PSM_CLK_SRC_367_MHZ 0x0 +#define PSM_CLK_SRC_416_MHZ 0x1 +#define PSM_CLK_SRC_446_MHZ 0x2 +#define PSM_CLK_SRC_390_MHZ 0x3 + + switch (clk_src) { + case PSM_CLK_SRC_367_MHZ: + hw->psm_clk_freq = ICE_PSM_CLK_367MHZ_IN_HZ; + break; + case PSM_CLK_SRC_416_MHZ: + hw->psm_clk_freq = ICE_PSM_CLK_416MHZ_IN_HZ; + break; + case PSM_CLK_SRC_446_MHZ: + hw->psm_clk_freq = ICE_PSM_CLK_446MHZ_IN_HZ; + break; + case PSM_CLK_SRC_390_MHZ: + hw->psm_clk_freq = ICE_PSM_CLK_390MHZ_IN_HZ; + break; + default: + ice_debug(hw, ICE_DBG_SCHED, "PSM clk_src unexpected %u\n", + clk_src); + /* fall back to a safe default */ + hw->psm_clk_freq = ICE_PSM_CLK_446MHZ_IN_HZ; + } +} + /** * ice_sched_find_node_in_subtree - Find node in part of base node subtree * @hw: pointer to the HW struct @@ -1090,7 +1475,7 @@ enum ice_status ice_sched_query_res_alloc(struct ice_hw *hw) * This function checks whether a given node is part of the base node * subtree or not */ -static bool +bool ice_sched_find_node_in_subtree(struct ice_hw *hw, struct ice_sched_node *base, struct ice_sched_node *node) { @@ -1114,6 +1499,53 @@ ice_sched_find_node_in_subtree(struct ice_hw *hw, struct ice_sched_node *base, return false; } +/** + * ice_sched_get_free_qgrp - Scan all queue group siblings and find a free node + * @pi: port information structure + * @vsi_node: software VSI handle + * @qgrp_node: first queue group node identified for scanning + * @owner: LAN or RDMA + * + * This function retrieves a free LAN or RDMA queue group node by scanning + * qgrp_node and its siblings for the queue group with the fewest number + * of queues currently assigned. + */ +static struct ice_sched_node * +ice_sched_get_free_qgrp(struct ice_port_info *pi, + struct ice_sched_node *vsi_node, + struct ice_sched_node *qgrp_node, u8 owner) +{ + struct ice_sched_node *min_qgrp; + u8 min_children; + + if (!qgrp_node) + return qgrp_node; + min_children = qgrp_node->num_children; + if (!min_children) + return qgrp_node; + min_qgrp = qgrp_node; + /* scan all queue groups until find a node which has less than the + * minimum number of children. This way all queue group nodes get + * equal number of shares and active. The bandwidth will be equally + * distributed across all queues. 
+ */ + while (qgrp_node) { + /* make sure the qgroup node is part of the VSI subtree */ + if (ice_sched_find_node_in_subtree(pi->hw, vsi_node, qgrp_node)) + if (qgrp_node->num_children < min_children && + qgrp_node->owner == owner) { + /* replace the new min queue group node */ + min_qgrp = qgrp_node; + min_children = min_qgrp->num_children; + /* break if it has no children, */ + if (!min_children) + break; + } + qgrp_node = qgrp_node->sibling; + } + return min_qgrp; +} + /** * ice_sched_get_free_qparent - Get a free LAN or RDMA queue group node * @pi: port information structure @@ -1127,7 +1559,7 @@ struct ice_sched_node * ice_sched_get_free_qparent(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u8 owner) { - struct ice_sched_node *vsi_node, *qgrp_node = NULL; + struct ice_sched_node *vsi_node, *qgrp_node; struct ice_vsi_ctx *vsi_ctx; u16 max_children; u8 qgrp_layer; @@ -1141,7 +1573,7 @@ ice_sched_get_free_qparent(struct ice_port_info *pi, u16 vsi_handle, u8 tc, vsi_node = vsi_ctx->sched.vsi_node[tc]; /* validate invalid VSI ID */ if (!vsi_node) - goto lan_q_exit; + return NULL; /* get the first queue group node from VSI sub-tree */ qgrp_node = ice_sched_get_first_node(pi, vsi_node, qgrp_layer); @@ -1154,28 +1586,28 @@ ice_sched_get_free_qparent(struct ice_port_info *pi, u16 vsi_handle, u8 tc, qgrp_node = qgrp_node->sibling; } -lan_q_exit: - return qgrp_node; + /* Select the best queue group */ + return ice_sched_get_free_qgrp(pi, vsi_node, qgrp_node, owner); } /** * ice_sched_get_vsi_node - Get a VSI node based on VSI ID - * @hw: pointer to the HW struct + * @pi: pointer to the port information structure * @tc_node: pointer to the TC node * @vsi_handle: software VSI handle * * This function retrieves a VSI node for a given VSI ID from a given * TC branch */ -static struct ice_sched_node * -ice_sched_get_vsi_node(struct ice_hw *hw, struct ice_sched_node *tc_node, +struct ice_sched_node * +ice_sched_get_vsi_node(struct ice_port_info *pi, struct ice_sched_node *tc_node, u16 vsi_handle) { struct ice_sched_node *node; u8 vsi_layer; - vsi_layer = ice_sched_get_vsi_layer(hw); - node = ice_sched_get_first_node(hw->port_info, tc_node, vsi_layer); + vsi_layer = ice_sched_get_vsi_layer(pi->hw); + node = ice_sched_get_first_node(pi, tc_node, vsi_layer); /* Check whether it already exists */ while (node) { @@ -1187,6 +1619,65 @@ ice_sched_get_vsi_node(struct ice_hw *hw, struct ice_sched_node *tc_node, return node; } +/** + * ice_sched_get_agg_node - Get an aggregator node based on aggregator ID + * @pi: pointer to the port information structure + * @tc_node: pointer to the TC node + * @agg_id: aggregator ID + * + * This function retrieves an aggregator node for a given aggregator ID from + * a given TC branch + */ +static struct ice_sched_node * +ice_sched_get_agg_node(struct ice_port_info *pi, struct ice_sched_node *tc_node, + u32 agg_id) +{ + struct ice_sched_node *node; + struct ice_hw *hw = pi->hw; + u8 agg_layer; + + if (!hw) + return NULL; + agg_layer = ice_sched_get_agg_layer(hw); + node = ice_sched_get_first_node(pi, tc_node, agg_layer); + + /* Check whether it already exists */ + while (node) { + if (node->agg_id == agg_id) + return node; + node = node->sibling; + } + + return node; +} + +/** + * ice_sched_check_node - Compare node parameters between SW DB and HW DB + * @hw: pointer to the HW struct + * @node: pointer to the ice_sched_node struct + * + * This function queries and compares the HW element with SW DB node parameters + */ +static bool ice_sched_check_node(struct ice_hw *hw, 
struct ice_sched_node *node) +{ + struct ice_aqc_txsched_elem_data buf; + enum ice_status status; + u32 node_teid; + + node_teid = le32_to_cpu(node->info.node_teid); + status = ice_sched_query_elem(hw, node_teid, &buf); + if (status) + return false; + + if (memcmp(&buf, &node->info, sizeof(buf))) { + ice_debug(hw, ICE_DBG_SCHED, "Node mismatch for teid=0x%x\n", + node_teid); + return false; + } + + return true; +} + /** * ice_sched_calc_vsi_child_nodes - calculate number of VSI child nodes * @hw: pointer to the HW struct @@ -1240,7 +1731,7 @@ ice_sched_add_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle, qgl = ice_sched_get_qgrp_layer(hw); vsil = ice_sched_get_vsi_layer(hw); - parent = ice_sched_get_vsi_node(hw, tc_node, vsi_handle); + parent = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); for (i = vsil + 1; i <= qgl; i++) { if (!parent) return ICE_ERR_CFG; @@ -1273,7 +1764,7 @@ ice_sched_add_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle, /** * ice_sched_calc_vsi_support_nodes - calculate number of VSI support nodes - * @hw: pointer to the HW struct + * @pi: pointer to the port info structure * @tc_node: pointer to TC node * @num_nodes: pointer to num nodes array * @@ -1282,15 +1773,15 @@ ice_sched_add_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle, * layers */ static void -ice_sched_calc_vsi_support_nodes(struct ice_hw *hw, +ice_sched_calc_vsi_support_nodes(struct ice_port_info *pi, struct ice_sched_node *tc_node, u16 *num_nodes) { struct ice_sched_node *node; u8 vsil; int i; - vsil = ice_sched_get_vsi_layer(hw); - for (i = vsil; i >= hw->sw_entry_point_layer; i--) + vsil = ice_sched_get_vsi_layer(pi->hw); + for (i = vsil; i >= pi->hw->sw_entry_point_layer; i--) /* Add intermediate nodes if TC has no children and * need at least one node for VSI */ @@ -1300,11 +1791,11 @@ ice_sched_calc_vsi_support_nodes(struct ice_hw *hw, /* If intermediate nodes are reached max children * then add a new one. 
*/ - node = ice_sched_get_first_node(hw->port_info, tc_node, - (u8)i); + node = ice_sched_get_first_node(pi, tc_node, (u8)i); /* scan all the siblings */ while (node) { - if (node->num_children < hw->max_children[i]) + if (node->num_children < + pi->hw->max_children[i]) break; node = node->sibling; } @@ -1384,14 +1875,13 @@ ice_sched_add_vsi_to_topo(struct ice_port_info *pi, u16 vsi_handle, u8 tc) { u16 num_nodes[ICE_AQC_TOPO_MAX_LEVEL_NUM] = { 0 }; struct ice_sched_node *tc_node; - struct ice_hw *hw = pi->hw; tc_node = ice_sched_get_tc_node(pi, tc); if (!tc_node) return ICE_ERR_PARAM; /* calculate number of supported nodes needed for this VSI */ - ice_sched_calc_vsi_support_nodes(hw, tc_node, num_nodes); + ice_sched_calc_vsi_support_nodes(pi, tc_node, num_nodes); /* add VSI supported nodes to TC subtree */ return ice_sched_add_vsi_support_nodes(pi, vsi_handle, tc_node, @@ -1424,7 +1914,7 @@ ice_sched_update_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle, if (!tc_node) return ICE_ERR_CFG; - vsi_node = ice_sched_get_vsi_node(hw, tc_node, vsi_handle); + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); if (!vsi_node) return ICE_ERR_CFG; @@ -1432,13 +1922,22 @@ ice_sched_update_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle, if (!vsi_ctx) return ICE_ERR_PARAM; - prev_numqs = vsi_ctx->sched.max_lanq[tc]; + if (owner == ICE_SCHED_NODE_OWNER_LAN) + prev_numqs = vsi_ctx->sched.max_lanq[tc]; + else + prev_numqs = vsi_ctx->sched.max_rdmaq[tc]; /* num queues are not changed or less than the previous number */ if (new_numqs <= prev_numqs) return status; - status = ice_alloc_lan_q_ctx(hw, vsi_handle, tc, new_numqs); - if (status) - return status; + if (owner == ICE_SCHED_NODE_OWNER_LAN) { + status = ice_alloc_lan_q_ctx(hw, vsi_handle, tc, new_numqs); + if (status) + return status; + } else { + status = ice_alloc_rdma_q_ctx(hw, vsi_handle, tc, new_numqs); + if (status) + return status; + } if (new_numqs) ice_sched_calc_vsi_child_nodes(hw, new_numqs, new_num_nodes); @@ -1453,7 +1952,10 @@ ice_sched_update_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle, new_num_nodes, owner); if (status) return status; - vsi_ctx->sched.max_lanq[tc] = new_numqs; + if (owner == ICE_SCHED_NODE_OWNER_LAN) + vsi_ctx->sched.max_lanq[tc] = new_numqs; + else + vsi_ctx->sched.max_rdmaq[tc] = new_numqs; return 0; } @@ -1487,7 +1989,7 @@ ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 maxqs, vsi_ctx = ice_get_vsi_ctx(hw, vsi_handle); if (!vsi_ctx) return ICE_ERR_PARAM; - vsi_node = ice_sched_get_vsi_node(hw, tc_node, vsi_handle); + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); /* suspend the VSI if TC is not enabled */ if (!enable) { @@ -1508,7 +2010,7 @@ ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 maxqs, if (status) return status; - vsi_node = ice_sched_get_vsi_node(hw, tc_node, vsi_handle); + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); if (!vsi_node) return ICE_ERR_CFG; @@ -1519,6 +2021,7 @@ ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 maxqs, * recreate the child nodes all the time in these cases. 
*/ vsi_ctx->sched.max_lanq[tc] = 0; + vsi_ctx->sched.max_rdmaq[tc] = 0; } /* update the VSI child nodes */ @@ -1540,15 +2043,14 @@ ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 maxqs, } /** - * ice_sched_rm_agg_vsi_entry - remove aggregator related VSI info entry + * ice_sched_rm_agg_vsi_info - remove aggregator related VSI info entry * @pi: port information structure * @vsi_handle: software VSI handle * * This function removes single aggregator VSI info entry from * aggregator list. */ -static void -ice_sched_rm_agg_vsi_info(struct ice_port_info *pi, u16 vsi_handle) +static void ice_sched_rm_agg_vsi_info(struct ice_port_info *pi, u16 vsi_handle) { struct ice_sched_agg_info *agg_info; struct ice_sched_agg_info *atmp; @@ -1618,13 +2120,12 @@ ice_sched_rm_vsi_cfg(struct ice_port_info *pi, u16 vsi_handle, u8 owner) if (!tc_node) continue; - vsi_node = ice_sched_get_vsi_node(pi->hw, tc_node, vsi_handle); + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); if (!vsi_node) continue; if (ice_sched_is_leaf_node_present(vsi_node)) { - ice_debug(pi->hw, ICE_DBG_SCHED, - "VSI has leaf nodes in TC %d\n", i); + ice_debug(pi->hw, ICE_DBG_SCHED, "VSI has leaf nodes in TC %d\n", i); status = ICE_ERR_IN_USE; goto exit_sched_rm_vsi_cfg; } @@ -1650,6 +2151,8 @@ ice_sched_rm_vsi_cfg(struct ice_port_info *pi, u16 vsi_handle, u8 owner) } if (owner == ICE_SCHED_NODE_OWNER_LAN) vsi_ctx->sched.max_lanq[i] = 0; + else + vsi_ctx->sched.max_rdmaq[i] = 0; } status = 0; @@ -1670,3 +2173,3654 @@ enum ice_status ice_rm_vsi_lan_cfg(struct ice_port_info *pi, u16 vsi_handle) { return ice_sched_rm_vsi_cfg(pi, vsi_handle, ICE_SCHED_NODE_OWNER_LAN); } + +/** + * ice_rm_vsi_rdma_cfg - remove VSI and its RDMA children nodes + * @pi: port information structure + * @vsi_handle: software VSI handle + * + * This function clears the VSI and its RDMA children nodes from scheduler tree + * for all TCs. + */ +enum ice_status ice_rm_vsi_rdma_cfg(struct ice_port_info *pi, u16 vsi_handle) +{ + return ice_sched_rm_vsi_cfg(pi, vsi_handle, ICE_SCHED_NODE_OWNER_RDMA); +} + +/** + * ice_sched_is_tree_balanced - Check tree nodes are identical or not + * @hw: pointer to the HW struct + * @node: pointer to the ice_sched_node struct + * + * This function compares all the nodes for a given tree against HW DB nodes + * This function needs to be called with the port_info->sched_lock held + */ +bool ice_sched_is_tree_balanced(struct ice_hw *hw, struct ice_sched_node *node) +{ + u8 i; + + /* start from the leaf node */ + for (i = 0; i < node->num_children; i++) + /* Fail if node doesn't match with the SW DB + * this recursion is intentional, and wouldn't + * go more than 9 calls + */ + if (!ice_sched_is_tree_balanced(hw, node->children[i])) + return false; + + return ice_sched_check_node(hw, node); +} + +/** + * ice_aq_query_node_to_root - retrieve the tree topology for a given node TEID + * @hw: pointer to the HW struct + * @node_teid: node TEID + * @buf: pointer to buffer + * @buf_size: buffer size in bytes + * @cd: pointer to command details structure or NULL + * + * This function retrieves the tree topology from the firmware for a given + * node TEID to the root node. 
+ */ +enum ice_status +ice_aq_query_node_to_root(struct ice_hw *hw, u32 node_teid, + struct ice_aqc_txsched_elem_data *buf, u16 buf_size, + struct ice_sq_cd *cd) +{ + struct ice_aqc_query_node_to_root *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.query_node_to_root; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_query_node_to_root); + cmd->teid = cpu_to_le32(node_teid); + return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); +} + +/** + * ice_get_agg_info - get the aggregator ID + * @hw: pointer to the hardware structure + * @agg_id: aggregator ID + * + * This function validates aggregator ID. The function returns info if + * aggregator ID is present in list otherwise it returns null. + */ +static struct ice_sched_agg_info * +ice_get_agg_info(struct ice_hw *hw, u32 agg_id) +{ + struct ice_sched_agg_info *agg_info; + + list_for_each_entry(agg_info, &hw->agg_list, list_entry) + if (agg_info->agg_id == agg_id) + return agg_info; + + return NULL; +} + +/** + * ice_sched_get_free_vsi_parent - Find a free parent node in aggregator subtree + * @hw: pointer to the HW struct + * @node: pointer to a child node + * @num_nodes: num nodes count array + * + * This function walks through the aggregator subtree to find a free parent + * node + */ +static struct ice_sched_node * +ice_sched_get_free_vsi_parent(struct ice_hw *hw, struct ice_sched_node *node, + u16 *num_nodes) +{ + u8 l = node->tx_sched_layer; + u8 vsil, i; + + vsil = ice_sched_get_vsi_layer(hw); + + /* Is it VSI parent layer ? */ + if (l == vsil - 1) + return (node->num_children < hw->max_children[l]) ? node : NULL; + + /* We have intermediate nodes. Let's walk through the subtree. If the + * intermediate node has space to add a new node then clear the count + */ + if (node->num_children < hw->max_children[l]) + num_nodes[l] = 0; + /* The below recursive call is intentional and wouldn't go more than + * 2 or 3 iterations. + */ + + for (i = 0; i < node->num_children; i++) { + struct ice_sched_node *parent; + + parent = ice_sched_get_free_vsi_parent(hw, node->children[i], + num_nodes); + if (parent) + return parent; + } + + return NULL; +} + +/** + * ice_sched_update_parent - update the new parent in SW DB + * @new_parent: pointer to a new parent node + * @node: pointer to a child node + * + * This function removes the child from the old parent and adds it to a new + * parent + */ +static void +ice_sched_update_parent(struct ice_sched_node *new_parent, + struct ice_sched_node *node) +{ + struct ice_sched_node *old_parent; + u8 i, j; + + old_parent = node->parent; + + /* update the old parent children */ + for (i = 0; i < old_parent->num_children; i++) + if (old_parent->children[i] == node) { + for (j = i + 1; j < old_parent->num_children; j++) + old_parent->children[j - 1] = + old_parent->children[j]; + old_parent->num_children--; + break; + } + + /* now move the node to a new parent */ + new_parent->children[new_parent->num_children++] = node; + node->parent = new_parent; + node->info.parent_teid = new_parent->info.node_teid; +} + +/** + * ice_sched_move_nodes - move child nodes to a given parent + * @pi: port information structure + * @parent: pointer to parent node + * @num_items: number of child nodes to be moved + * @list: pointer to child node teids + * + * This function move the child nodes to a given parent. 
+ */ +static enum ice_status +ice_sched_move_nodes(struct ice_port_info *pi, struct ice_sched_node *parent, + u16 num_items, u32 *list) +{ + struct ice_aqc_move_elem *buf; + struct ice_sched_node *node; + enum ice_status status = 0; + u16 i, grps_movd = 0; + struct ice_hw *hw; + u16 buf_len; + + hw = pi->hw; + + if (!parent || !num_items) + return ICE_ERR_PARAM; + + /* Does parent have enough space */ + if (parent->num_children + num_items > + hw->max_children[parent->tx_sched_layer]) + return ICE_ERR_AQ_FULL; + + buf_len = struct_size(buf, teid, 1); + buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!buf) + return ICE_ERR_NO_MEMORY; + + for (i = 0; i < num_items; i++) { + node = ice_sched_find_node_by_teid(pi->root, list[i]); + if (!node) { + status = ICE_ERR_PARAM; + goto move_err_exit; + } + + buf->hdr.src_parent_teid = node->info.parent_teid; + buf->hdr.dest_parent_teid = parent->info.node_teid; + buf->teid[0] = node->info.node_teid; + buf->hdr.num_elems = cpu_to_le16(1); + status = ice_aq_move_sched_elems(hw, 1, buf, buf_len, + &grps_movd, NULL); + if (status && grps_movd != 1) { + status = ICE_ERR_CFG; + goto move_err_exit; + } + + /* update the SW DB */ + ice_sched_update_parent(parent, node); + } + +move_err_exit: + devm_kfree(ice_hw_to_dev(hw), buf); + return status; +} + +/** + * ice_sched_move_vsi_to_agg - move VSI to aggregator node + * @pi: port information structure + * @vsi_handle: software VSI handle + * @agg_id: aggregator ID + * @tc: TC number + * + * This function moves a VSI to an aggregator node or its subtree. + * Intermediate nodes may be created if required. + */ +static enum ice_status +ice_sched_move_vsi_to_agg(struct ice_port_info *pi, u16 vsi_handle, u32 agg_id, + u8 tc) +{ + struct ice_sched_node *vsi_node, *agg_node, *tc_node, *parent; + u16 num_nodes[ICE_AQC_TOPO_MAX_LEVEL_NUM] = { 0 }; + u32 first_node_teid, vsi_teid; + enum ice_status status; + u16 num_nodes_added; + u8 aggl, vsil, i; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + return ICE_ERR_CFG; + + agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id); + if (!agg_node) + return ICE_ERR_DOES_NOT_EXIST; + + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); + if (!vsi_node) + return ICE_ERR_DOES_NOT_EXIST; + + /* Is this VSI already part of given aggregator? 
*/ + if (ice_sched_find_node_in_subtree(pi->hw, agg_node, vsi_node)) + return 0; + + aggl = ice_sched_get_agg_layer(pi->hw); + vsil = ice_sched_get_vsi_layer(pi->hw); + + /* set intermediate node count to 1 between aggregator and VSI layers */ + for (i = aggl + 1; i < vsil; i++) + num_nodes[i] = 1; + + /* Check if the aggregator subtree has any free node to add the VSI */ + for (i = 0; i < agg_node->num_children; i++) { + parent = ice_sched_get_free_vsi_parent(pi->hw, + agg_node->children[i], + num_nodes); + if (parent) + goto move_nodes; + } + + /* add new nodes */ + parent = agg_node; + for (i = aggl + 1; i < vsil; i++) { + status = ice_sched_add_nodes_to_layer(pi, tc_node, parent, i, + num_nodes[i], + &first_node_teid, + &num_nodes_added); + if (status || num_nodes[i] != num_nodes_added) + return ICE_ERR_CFG; + + /* The newly added node can be a new parent for the next + * layer nodes + */ + if (num_nodes_added) + parent = ice_sched_find_node_by_teid(tc_node, + first_node_teid); + else + parent = parent->children[0]; + + if (!parent) + return ICE_ERR_CFG; + } + +move_nodes: + vsi_teid = le32_to_cpu(vsi_node->info.node_teid); + return ice_sched_move_nodes(pi, parent, 1, &vsi_teid); +} + +/** + * ice_move_all_vsi_to_dflt_agg - move all VSI(s) to default aggregator + * @pi: port information structure + * @agg_info: aggregator info + * @tc: traffic class number + * @rm_vsi_info: true or false + * + * This function move all the VSI(s) to the default aggregator and delete + * aggregator VSI info based on passed in boolean parameter rm_vsi_info. The + * caller holds the scheduler lock. + */ +static enum ice_status +ice_move_all_vsi_to_dflt_agg(struct ice_port_info *pi, + struct ice_sched_agg_info *agg_info, u8 tc, + bool rm_vsi_info) +{ + struct ice_sched_agg_vsi_info *agg_vsi_info; + struct ice_sched_agg_vsi_info *tmp; + enum ice_status status = 0; + + list_for_each_entry_safe(agg_vsi_info, tmp, &agg_info->agg_vsi_list, + list_entry) { + u16 vsi_handle = agg_vsi_info->vsi_handle; + + /* Move VSI to default aggregator */ + if (!ice_is_tc_ena(agg_vsi_info->tc_bitmap[0], tc)) + continue; + + status = ice_sched_move_vsi_to_agg(pi, vsi_handle, + ICE_DFLT_AGG_ID, tc); + if (status) + break; + + clear_bit(tc, agg_vsi_info->tc_bitmap); + if (rm_vsi_info && !agg_vsi_info->tc_bitmap[0]) { + list_del(&agg_vsi_info->list_entry); + devm_kfree(ice_hw_to_dev(pi->hw), agg_vsi_info); + } + } + + return status; +} + +/** + * ice_sched_is_agg_inuse - check whether the aggregator is in use or not + * @pi: port information structure + * @node: node pointer + * + * This function checks whether the aggregator is attached with any VSI or not. + */ +static bool +ice_sched_is_agg_inuse(struct ice_port_info *pi, struct ice_sched_node *node) +{ + u8 vsil, i; + + vsil = ice_sched_get_vsi_layer(pi->hw); + if (node->tx_sched_layer < vsil - 1) { + for (i = 0; i < node->num_children; i++) + if (ice_sched_is_agg_inuse(pi, node->children[i])) + return true; + return false; + } else { + return node->num_children ? 
true : false; + } +} + +/** + * ice_sched_rm_agg_cfg - remove the aggregator node + * @pi: port information structure + * @agg_id: aggregator ID + * @tc: TC number + * + * This function removes the aggregator node and intermediate nodes if any + * from the given TC + */ +static enum ice_status +ice_sched_rm_agg_cfg(struct ice_port_info *pi, u32 agg_id, u8 tc) +{ + struct ice_sched_node *tc_node, *agg_node; + struct ice_hw *hw = pi->hw; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + return ICE_ERR_CFG; + + agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id); + if (!agg_node) + return ICE_ERR_DOES_NOT_EXIST; + + /* Can't remove the aggregator node if it has children */ + if (ice_sched_is_agg_inuse(pi, agg_node)) + return ICE_ERR_IN_USE; + + /* need to remove the whole subtree if aggregator node is the + * only child. + */ + while (agg_node->tx_sched_layer > hw->sw_entry_point_layer) { + struct ice_sched_node *parent = agg_node->parent; + + if (!parent) + return ICE_ERR_CFG; + + if (parent->num_children > 1) + break; + + agg_node = parent; + } + + ice_free_sched_node(pi, agg_node); + return 0; +} + +/** + * ice_rm_agg_cfg_tc - remove aggregator configuration for TC + * @pi: port information structure + * @agg_info: aggregator ID + * @tc: TC number + * @rm_vsi_info: bool value true or false + * + * This function removes aggregator reference to VSI of given TC. It removes + * the aggregator configuration completely for requested TC. The caller needs + * to hold the scheduler lock. + */ +static enum ice_status +ice_rm_agg_cfg_tc(struct ice_port_info *pi, struct ice_sched_agg_info *agg_info, + u8 tc, bool rm_vsi_info) +{ + enum ice_status status = 0; + + /* If nothing to remove - return success */ + if (!ice_is_tc_ena(agg_info->tc_bitmap[0], tc)) + goto exit_rm_agg_cfg_tc; + + status = ice_move_all_vsi_to_dflt_agg(pi, agg_info, tc, rm_vsi_info); + if (status) + goto exit_rm_agg_cfg_tc; + + /* Delete aggregator node(s) */ + status = ice_sched_rm_agg_cfg(pi, agg_info->agg_id, tc); + if (status) + goto exit_rm_agg_cfg_tc; + + clear_bit(tc, agg_info->tc_bitmap); +exit_rm_agg_cfg_tc: + return status; +} + +/** + * ice_save_agg_tc_bitmap - save aggregator TC bitmap + * @pi: port information structure + * @agg_id: aggregator ID + * @tc_bitmap: 8 bits TC bitmap + * + * Save aggregator TC bitmap. This function needs to be called with scheduler + * lock held. + */ +static enum ice_status +ice_save_agg_tc_bitmap(struct ice_port_info *pi, u32 agg_id, + unsigned long *tc_bitmap) +{ + struct ice_sched_agg_info *agg_info; + + agg_info = ice_get_agg_info(pi->hw, agg_id); + if (!agg_info) + return ICE_ERR_PARAM; + bitmap_copy(agg_info->replay_tc_bitmap, tc_bitmap, + ICE_MAX_TRAFFIC_CLASS); + return 0; +} + +/** + * ice_sched_add_agg_cfg - create an aggregator node + * @pi: port information structure + * @agg_id: aggregator ID + * @tc: TC number + * + * This function creates an aggregator node and intermediate nodes if required + * for the given TC + */ +static enum ice_status +ice_sched_add_agg_cfg(struct ice_port_info *pi, u32 agg_id, u8 tc) +{ + struct ice_sched_node *parent, *agg_node, *tc_node; + u16 num_nodes[ICE_AQC_TOPO_MAX_LEVEL_NUM] = { 0 }; + enum ice_status status = 0; + struct ice_hw *hw = pi->hw; + u32 first_node_teid; + u16 num_nodes_added; + u8 i, aggl; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + return ICE_ERR_CFG; + + agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id); + /* Does Agg node already exist ? 
*/ + if (agg_node) + return status; + + aggl = ice_sched_get_agg_layer(hw); + + /* need one node in Agg layer */ + num_nodes[aggl] = 1; + + /* Check whether the intermediate nodes have space to add the + * new aggregator. If they are full, then SW needs to allocate a new + * intermediate node on those layers + */ + for (i = hw->sw_entry_point_layer; i < aggl; i++) { + parent = ice_sched_get_first_node(pi, tc_node, i); + + /* scan all the siblings */ + while (parent) { + if (parent->num_children < hw->max_children[i]) + break; + parent = parent->sibling; + } + + /* all the nodes are full, reserve one for this layer */ + if (!parent) + num_nodes[i]++; + } + + /* add the aggregator node */ + parent = tc_node; + for (i = hw->sw_entry_point_layer; i <= aggl; i++) { + if (!parent) + return ICE_ERR_CFG; + + status = ice_sched_add_nodes_to_layer(pi, tc_node, parent, i, + num_nodes[i], + &first_node_teid, + &num_nodes_added); + if (status || num_nodes[i] != num_nodes_added) + return ICE_ERR_CFG; + + /* The newly added node can be a new parent for the next + * layer nodes + */ + if (num_nodes_added) { + parent = ice_sched_find_node_by_teid(tc_node, + first_node_teid); + /* register aggregator ID with the aggregator node */ + if (parent && i == aggl) + parent->agg_id = agg_id; + } else { + parent = parent->children[0]; + } + } + + return 0; +} + +/** + * ice_sched_cfg_agg - configure aggregator node + * @pi: port information structure + * @agg_id: aggregator ID + * @agg_type: aggregator type queue, VSI, or aggregator group + * @tc_bitmap: bits TC bitmap + * + * It registers a unique aggregator node into scheduler services. It + * allows a user to register with a unique ID to track it's resources. + * The aggregator type determines if this is a queue group, VSI group + * or aggregator group. It then creates the aggregator node(s) for requested + * TC(s) or removes an existing aggregator node including its configuration + * if indicated via tc_bitmap. Call ice_rm_agg_cfg to release aggregator + * resources and remove aggregator ID. + * This function needs to be called with scheduler lock held. 
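+ *
+ * For example, a tc_bitmap with only bits 0 and 1 set creates (or keeps)
+ * aggregator nodes on TC 0 and TC 1, while any aggregator configuration
+ * previously created on the remaining TCs is removed via
+ * ice_rm_agg_cfg_tc().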
+ */ +static enum ice_status +ice_sched_cfg_agg(struct ice_port_info *pi, u32 agg_id, + enum ice_agg_type agg_type, unsigned long *tc_bitmap) +{ + struct ice_sched_agg_info *agg_info; + enum ice_status status = 0; + struct ice_hw *hw = pi->hw; + u8 tc; + + agg_info = ice_get_agg_info(hw, agg_id); + if (!agg_info) { + /* Create new entry for new aggregator ID */ + agg_info = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*agg_info), + GFP_KERNEL); + if (!agg_info) + return ICE_ERR_NO_MEMORY; + + agg_info->agg_id = agg_id; + agg_info->agg_type = agg_type; + agg_info->tc_bitmap[0] = 0; + + /* Initialize the aggregator VSI list head */ + INIT_LIST_HEAD(&agg_info->agg_vsi_list); + + /* Add new entry in aggregator list */ + list_add(&agg_info->list_entry, &hw->agg_list); + } + /* Create aggregator node(s) for requested TC(s) */ + ice_for_each_traffic_class(tc) { + if (!ice_is_tc_ena(*tc_bitmap, tc)) { + /* Delete aggregator cfg TC if it exists previously */ + status = ice_rm_agg_cfg_tc(pi, agg_info, tc, false); + if (status) + break; + continue; + } + + /* Check if aggregator node for TC already exists */ + if (ice_is_tc_ena(agg_info->tc_bitmap[0], tc)) + continue; + + /* Create new aggregator node for TC */ + status = ice_sched_add_agg_cfg(pi, agg_id, tc); + if (status) + break; + + /* Save aggregator node's TC information */ + set_bit(tc, agg_info->tc_bitmap); + } + + return status; +} + +/** + * ice_cfg_agg - config aggregator node + * @pi: port information structure + * @agg_id: aggregator ID + * @agg_type: aggregator type queue, VSI, or aggregator group + * @tc_bitmap: bits TC bitmap + * + * This function configures aggregator node(s). + */ +enum ice_status +ice_cfg_agg(struct ice_port_info *pi, u32 agg_id, enum ice_agg_type agg_type, + u8 tc_bitmap) +{ + unsigned long bitmap = tc_bitmap; + enum ice_status status; + + mutex_lock(&pi->sched_lock); + status = ice_sched_cfg_agg(pi, agg_id, agg_type, + (unsigned long *)&bitmap); + if (!status) + status = ice_save_agg_tc_bitmap(pi, agg_id, + (unsigned long *)&bitmap); + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_get_agg_vsi_info - get the aggregator ID + * @agg_info: aggregator info + * @vsi_handle: software VSI handle + * + * The function returns aggregator VSI info based on VSI handle. This function + * needs to be called with scheduler lock held. + */ +static struct ice_sched_agg_vsi_info * +ice_get_agg_vsi_info(struct ice_sched_agg_info *agg_info, u16 vsi_handle) +{ + struct ice_sched_agg_vsi_info *agg_vsi_info; + + list_for_each_entry(agg_vsi_info, &agg_info->agg_vsi_list, list_entry) + if (agg_vsi_info->vsi_handle == vsi_handle) + return agg_vsi_info; + + return NULL; +} + +/** + * ice_get_vsi_agg_info - get the aggregator info of VSI + * @hw: pointer to the hardware structure + * @vsi_handle: Sw VSI handle + * + * The function returns aggregator info of VSI represented via vsi_handle. The + * VSI has in this case a different aggregator than the default one. This + * function needs to be called with scheduler lock held. 
+ */ +static struct ice_sched_agg_info * +ice_get_vsi_agg_info(struct ice_hw *hw, u16 vsi_handle) +{ + struct ice_sched_agg_info *agg_info; + + list_for_each_entry(agg_info, &hw->agg_list, list_entry) { + struct ice_sched_agg_vsi_info *agg_vsi_info; + + agg_vsi_info = ice_get_agg_vsi_info(agg_info, vsi_handle); + if (agg_vsi_info) + return agg_info; + } + return NULL; +} + +/** + * ice_save_agg_vsi_tc_bitmap - save aggregator VSI TC bitmap + * @pi: port information structure + * @agg_id: aggregator ID + * @vsi_handle: software VSI handle + * @tc_bitmap: TC bitmap of enabled TC(s) + * + * Save VSI to aggregator TC bitmap. This function needs to call with scheduler + * lock held. + */ +static enum ice_status +ice_save_agg_vsi_tc_bitmap(struct ice_port_info *pi, u32 agg_id, u16 vsi_handle, + unsigned long *tc_bitmap) +{ + struct ice_sched_agg_vsi_info *agg_vsi_info; + struct ice_sched_agg_info *agg_info; + + agg_info = ice_get_agg_info(pi->hw, agg_id); + if (!agg_info) + return ICE_ERR_PARAM; + /* check if entry already exist */ + agg_vsi_info = ice_get_agg_vsi_info(agg_info, vsi_handle); + if (!agg_vsi_info) + return ICE_ERR_PARAM; + bitmap_copy(agg_vsi_info->replay_tc_bitmap, tc_bitmap, + ICE_MAX_TRAFFIC_CLASS); + return 0; +} + +/** + * ice_sched_assoc_vsi_to_agg - associate/move VSI to new/default aggregator + * @pi: port information structure + * @agg_id: aggregator ID + * @vsi_handle: software VSI handle + * @tc_bitmap: TC bitmap of enabled TC(s) + * + * This function moves VSI to a new or default aggregator node. If VSI is + * already associated to the aggregator node then no operation is performed on + * the tree. This function needs to be called with scheduler lock held. + */ +static enum ice_status +ice_sched_assoc_vsi_to_agg(struct ice_port_info *pi, u32 agg_id, + u16 vsi_handle, unsigned long *tc_bitmap) +{ + struct ice_sched_agg_vsi_info *agg_vsi_info, *old_agg_vsi_info = NULL; + struct ice_sched_agg_info *agg_info, *old_agg_info; + enum ice_status status = 0; + struct ice_hw *hw = pi->hw; + u8 tc; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + agg_info = ice_get_agg_info(hw, agg_id); + if (!agg_info) + return ICE_ERR_PARAM; + /* If the vsi is already part of another aggregator then update + * its vsi info list + */ + old_agg_info = ice_get_vsi_agg_info(hw, vsi_handle); + if (old_agg_info && old_agg_info != agg_info) { + struct ice_sched_agg_vsi_info *vtmp; + + list_for_each_entry_safe(old_agg_vsi_info, vtmp, + &old_agg_info->agg_vsi_list, + list_entry) + if (old_agg_vsi_info->vsi_handle == vsi_handle) + break; + } + + /* check if entry already exist */ + agg_vsi_info = ice_get_agg_vsi_info(agg_info, vsi_handle); + if (!agg_vsi_info) { + /* Create new entry for VSI under aggregator list */ + agg_vsi_info = devm_kzalloc(ice_hw_to_dev(hw), + sizeof(*agg_vsi_info), GFP_KERNEL); + if (!agg_vsi_info) + return ICE_ERR_PARAM; + + /* add VSI ID into the aggregator list */ + agg_vsi_info->vsi_handle = vsi_handle; + list_add(&agg_vsi_info->list_entry, &agg_info->agg_vsi_list); + } + /* Move VSI node to new aggregator node for requested TC(s) */ + ice_for_each_traffic_class(tc) { + if (!ice_is_tc_ena(*tc_bitmap, tc)) + continue; + + /* Move VSI to new aggregator */ + status = ice_sched_move_vsi_to_agg(pi, vsi_handle, agg_id, tc); + if (status) + break; + + set_bit(tc, agg_vsi_info->tc_bitmap); + if (old_agg_vsi_info) + clear_bit(tc, old_agg_vsi_info->tc_bitmap); + } + if (old_agg_vsi_info && !old_agg_vsi_info->tc_bitmap[0]) { + 
list_del(&old_agg_vsi_info->list_entry); + devm_kfree(ice_hw_to_dev(pi->hw), old_agg_vsi_info); + } + return status; +} + +/** + * ice_sched_rm_unused_rl_prof - remove unused RL profile + * @hw: pointer to the hardware structure + * + * This function removes unused rate limit profiles from the HW and + * SW DB. The caller needs to hold scheduler lock. + */ +static void ice_sched_rm_unused_rl_prof(struct ice_hw *hw) +{ + u16 ln; + + for (ln = 0; ln < hw->num_tx_sched_layers; ln++) { + struct ice_aqc_rl_profile_info *rl_prof_elem; + struct ice_aqc_rl_profile_info *rl_prof_tmp; + + list_for_each_entry_safe(rl_prof_elem, rl_prof_tmp, + &hw->rl_prof_list[ln], list_entry) { + if (!ice_sched_del_rl_profile(hw, rl_prof_elem)) + ice_debug(hw, ICE_DBG_SCHED, "Removed rl profile\n"); + } + } +} + +/** + * ice_sched_update_elem - update element + * @hw: pointer to the HW struct + * @node: pointer to node + * @info: node info to update + * + * Update the HW DB, and local SW DB of node. Update the scheduling + * parameters of node from argument info data buffer (Info->data buf) and + * returns success or error on config sched element failure. The caller + * needs to hold scheduler lock. + */ +static enum ice_status +ice_sched_update_elem(struct ice_hw *hw, struct ice_sched_node *node, + struct ice_aqc_txsched_elem_data *info) +{ + struct ice_aqc_txsched_elem_data buf; + enum ice_status status; + u16 elem_cfgd = 0; + u16 num_elems = 1; + + buf = *info; + /* Parent TEID is reserved field in this aq call */ + buf.parent_teid = 0; + /* Element type is reserved field in this aq call */ + buf.data.elem_type = 0; + /* Flags is reserved field in this aq call */ + buf.data.flags = 0; + + /* Update HW DB */ + /* Configure element node */ + status = ice_aq_cfg_sched_elems(hw, num_elems, &buf, sizeof(buf), + &elem_cfgd, NULL); + if (status || elem_cfgd != num_elems) { + ice_debug(hw, ICE_DBG_SCHED, "Config sched elem error\n"); + return ICE_ERR_CFG; + } + + /* Config success case */ + /* Now update local SW DB */ + /* Only copy the data portion of info buffer */ + node->info.data = info->data; + return status; +} + +/** + * ice_sched_cfg_node_bw_alloc - configure node BW weight/alloc params + * @hw: pointer to the HW struct + * @node: sched node to configure + * @rl_type: rate limit type CIR, EIR, or shared + * @bw_alloc: BW weight/allocation + * + * This function configures node element's BW allocation. + */ +static enum ice_status +ice_sched_cfg_node_bw_alloc(struct ice_hw *hw, struct ice_sched_node *node, + enum ice_rl_type rl_type, u16 bw_alloc) +{ + struct ice_aqc_txsched_elem_data buf; + struct ice_aqc_txsched_elem *data; + enum ice_status status; + + buf = node->info; + data = &buf.data; + if (rl_type == ICE_MIN_BW) { + data->valid_sections |= ICE_AQC_ELEM_VALID_CIR; + data->cir_bw.bw_alloc = cpu_to_le16(bw_alloc); + } else if (rl_type == ICE_MAX_BW) { + data->valid_sections |= ICE_AQC_ELEM_VALID_EIR; + data->eir_bw.bw_alloc = cpu_to_le16(bw_alloc); + } else { + return ICE_ERR_PARAM; + } + + /* Configure element */ + status = ice_sched_update_elem(hw, node, &buf); + return status; +} + +/** + * ice_move_vsi_to_agg - moves VSI to new or default aggregator + * @pi: port information structure + * @agg_id: aggregator ID + * @vsi_handle: software VSI handle + * @tc_bitmap: TC bitmap of enabled TC(s) + * + * Move or associate VSI to a new or default aggregator node. 
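+ *
+ * A hypothetical caller that only wants the TC 0 node of a VSI placed under
+ * an aggregator previously registered with ice_cfg_agg() would pass a
+ * single-bit TC bitmap:
+ *
+ *	status = ice_move_vsi_to_agg(pi, agg_id, vsi_handle, BIT(0));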
+ */ +enum ice_status +ice_move_vsi_to_agg(struct ice_port_info *pi, u32 agg_id, u16 vsi_handle, + u8 tc_bitmap) +{ + unsigned long bitmap = tc_bitmap; + enum ice_status status; + + mutex_lock(&pi->sched_lock); + status = ice_sched_assoc_vsi_to_agg(pi, agg_id, vsi_handle, + (unsigned long *)&bitmap); + if (!status) + status = ice_save_agg_vsi_tc_bitmap(pi, agg_id, vsi_handle, + (unsigned long *)&bitmap); + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_rm_agg_cfg - remove aggregator configuration + * @pi: port information structure + * @agg_id: aggregator ID + * + * This function removes aggregator reference to VSI and delete aggregator ID + * info. It removes the aggregator configuration completely. + */ +enum ice_status ice_rm_agg_cfg(struct ice_port_info *pi, u32 agg_id) +{ + struct ice_sched_agg_info *agg_info; + enum ice_status status = 0; + u8 tc; + + mutex_lock(&pi->sched_lock); + agg_info = ice_get_agg_info(pi->hw, agg_id); + if (!agg_info) { + status = ICE_ERR_DOES_NOT_EXIST; + goto exit_ice_rm_agg_cfg; + } + + ice_for_each_traffic_class(tc) { + status = ice_rm_agg_cfg_tc(pi, agg_info, tc, true); + if (status) + goto exit_ice_rm_agg_cfg; + } + + if (!bitmap_empty(agg_info->tc_bitmap, ICE_MAX_TRAFFIC_CLASS)) { + status = ICE_ERR_IN_USE; + goto exit_ice_rm_agg_cfg; + } + + /* Safe to delete entry now */ + list_del(&agg_info->list_entry); + devm_kfree(ice_hw_to_dev(pi->hw), agg_info); + + /* Remove unused RL profile IDs from HW and SW DB */ + ice_sched_rm_unused_rl_prof(pi->hw); + +exit_ice_rm_agg_cfg: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_set_clear_cir_bw_alloc - set or clear CIR BW alloc information + * @bw_t_info: bandwidth type information structure + * @bw_alloc: Bandwidth allocation information + * + * Save or clear CIR BW alloc information (bw_alloc) in the passed param + * bw_t_info. + */ +static void +ice_set_clear_cir_bw_alloc(struct ice_bw_type_info *bw_t_info, u16 bw_alloc) +{ + bw_t_info->cir_bw.bw_alloc = bw_alloc; + if (bw_t_info->cir_bw.bw_alloc) + set_bit(ICE_BW_TYPE_CIR_WT, bw_t_info->bw_t_bitmap); + else + clear_bit(ICE_BW_TYPE_CIR_WT, bw_t_info->bw_t_bitmap); +} + +/** + * ice_set_clear_eir_bw_alloc - set or clear EIR BW alloc information + * @bw_t_info: bandwidth type information structure + * @bw_alloc: Bandwidth allocation information + * + * Save or clear EIR BW alloc information (bw_alloc) in the passed param + * bw_t_info. + */ +static void +ice_set_clear_eir_bw_alloc(struct ice_bw_type_info *bw_t_info, u16 bw_alloc) +{ + bw_t_info->eir_bw.bw_alloc = bw_alloc; + if (bw_t_info->eir_bw.bw_alloc) + set_bit(ICE_BW_TYPE_EIR_WT, bw_t_info->bw_t_bitmap); + else + clear_bit(ICE_BW_TYPE_EIR_WT, bw_t_info->bw_t_bitmap); +} + +/** + * ice_sched_save_vsi_bw_alloc - save VSI node's BW alloc information + * @pi: port information structure + * @vsi_handle: sw VSI handle + * @tc: traffic class + * @rl_type: rate limit type min or max + * @bw_alloc: Bandwidth allocation information + * + * Save BW alloc information of VSI type node for post replay use. 
+ */ +static enum ice_status +ice_sched_save_vsi_bw_alloc(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + enum ice_rl_type rl_type, u16 bw_alloc) +{ + struct ice_vsi_ctx *vsi_ctx; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + vsi_ctx = ice_get_vsi_ctx(pi->hw, vsi_handle); + if (!vsi_ctx) + return ICE_ERR_PARAM; + switch (rl_type) { + case ICE_MIN_BW: + ice_set_clear_cir_bw_alloc(&vsi_ctx->sched.bw_t_info[tc], + bw_alloc); + break; + case ICE_MAX_BW: + ice_set_clear_eir_bw_alloc(&vsi_ctx->sched.bw_t_info[tc], + bw_alloc); + break; + default: + return ICE_ERR_PARAM; + } + return 0; +} + +/** + * ice_set_clear_cir_bw - set or clear CIR BW + * @bw_t_info: bandwidth type information structure + * @bw: bandwidth in Kbps - Kilo bits per sec + * + * Save or clear CIR bandwidth (BW) in the passed param bw_t_info. + */ +static void ice_set_clear_cir_bw(struct ice_bw_type_info *bw_t_info, u32 bw) +{ + if (bw == ICE_SCHED_DFLT_BW) { + clear_bit(ICE_BW_TYPE_CIR, bw_t_info->bw_t_bitmap); + bw_t_info->cir_bw.bw = 0; + } else { + /* Save type of BW information */ + set_bit(ICE_BW_TYPE_CIR, bw_t_info->bw_t_bitmap); + bw_t_info->cir_bw.bw = bw; + } +} + +/** + * ice_set_clear_eir_bw - set or clear EIR BW + * @bw_t_info: bandwidth type information structure + * @bw: bandwidth in Kbps - Kilo bits per sec + * + * Save or clear EIR bandwidth (BW) in the passed param bw_t_info. + */ +static void ice_set_clear_eir_bw(struct ice_bw_type_info *bw_t_info, u32 bw) +{ + if (bw == ICE_SCHED_DFLT_BW) { + clear_bit(ICE_BW_TYPE_EIR, bw_t_info->bw_t_bitmap); + bw_t_info->eir_bw.bw = 0; + } else { + /* save EIR BW information */ + set_bit(ICE_BW_TYPE_EIR, bw_t_info->bw_t_bitmap); + bw_t_info->eir_bw.bw = bw; + } +} + +/** + * ice_set_clear_shared_bw - set or clear shared BW + * @bw_t_info: bandwidth type information structure + * @bw: bandwidth in Kbps - Kilo bits per sec + * + * Save or clear shared bandwidth (BW) in the passed param bw_t_info. + */ +static void ice_set_clear_shared_bw(struct ice_bw_type_info *bw_t_info, u32 bw) +{ + if (bw == ICE_SCHED_DFLT_BW) { + clear_bit(ICE_BW_TYPE_SHARED, bw_t_info->bw_t_bitmap); + bw_t_info->shared_bw = 0; + } else { + /* save shared BW information */ + set_bit(ICE_BW_TYPE_SHARED, bw_t_info->bw_t_bitmap); + bw_t_info->shared_bw = bw; + } +} + +/** + * ice_sched_save_vsi_bw - save VSI node's BW information + * @pi: port information structure + * @vsi_handle: sw VSI handle + * @tc: traffic class + * @rl_type: rate limit type min, max, or shared + * @bw: bandwidth in Kbps - Kilo bits per sec + * + * Save BW information of VSI type node for post replay use. + */ +static enum ice_status +ice_sched_save_vsi_bw(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + enum ice_rl_type rl_type, u32 bw) +{ + struct ice_vsi_ctx *vsi_ctx; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + vsi_ctx = ice_get_vsi_ctx(pi->hw, vsi_handle); + if (!vsi_ctx) + return ICE_ERR_PARAM; + switch (rl_type) { + case ICE_MIN_BW: + ice_set_clear_cir_bw(&vsi_ctx->sched.bw_t_info[tc], bw); + break; + case ICE_MAX_BW: + ice_set_clear_eir_bw(&vsi_ctx->sched.bw_t_info[tc], bw); + break; + case ICE_SHARED_BW: + ice_set_clear_shared_bw(&vsi_ctx->sched.bw_t_info[tc], bw); + break; + default: + return ICE_ERR_PARAM; + } + return 0; +} + +/** + * ice_set_clear_prio - set or clear priority information + * @bw_t_info: bandwidth type information structure + * @prio: priority to save + * + * Save or clear priority (prio) in the passed param bw_t_info. 
+ */ +static void ice_set_clear_prio(struct ice_bw_type_info *bw_t_info, u8 prio) +{ + bw_t_info->generic = prio; + if (bw_t_info->generic) + set_bit(ICE_BW_TYPE_PRIO, bw_t_info->bw_t_bitmap); + else + clear_bit(ICE_BW_TYPE_PRIO, bw_t_info->bw_t_bitmap); +} + +/** + * ice_sched_save_vsi_prio - save VSI node's priority information + * @pi: port information structure + * @vsi_handle: Software VSI handle + * @tc: traffic class + * @prio: priority to save + * + * Save priority information of VSI type node for post replay use. + */ +static enum ice_status +ice_sched_save_vsi_prio(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + u8 prio) +{ + struct ice_vsi_ctx *vsi_ctx; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + vsi_ctx = ice_get_vsi_ctx(pi->hw, vsi_handle); + if (!vsi_ctx) + return ICE_ERR_PARAM; + if (tc >= ICE_MAX_TRAFFIC_CLASS) + return ICE_ERR_PARAM; + ice_set_clear_prio(&vsi_ctx->sched.bw_t_info[tc], prio); + return 0; +} + +/** + * ice_sched_save_agg_bw_alloc - save aggregator node's BW alloc information + * @pi: port information structure + * @agg_id: node aggregator ID + * @tc: traffic class + * @rl_type: rate limit type min or max + * @bw_alloc: bandwidth alloc information + * + * Save BW alloc information of AGG type node for post replay use. + */ +static enum ice_status +ice_sched_save_agg_bw_alloc(struct ice_port_info *pi, u32 agg_id, u8 tc, + enum ice_rl_type rl_type, u16 bw_alloc) +{ + struct ice_sched_agg_info *agg_info; + + agg_info = ice_get_agg_info(pi->hw, agg_id); + if (!agg_info) + return ICE_ERR_PARAM; + if (!ice_is_tc_ena(agg_info->tc_bitmap[0], tc)) + return ICE_ERR_PARAM; + switch (rl_type) { + case ICE_MIN_BW: + ice_set_clear_cir_bw_alloc(&agg_info->bw_t_info[tc], bw_alloc); + break; + case ICE_MAX_BW: + ice_set_clear_eir_bw_alloc(&agg_info->bw_t_info[tc], bw_alloc); + break; + default: + return ICE_ERR_PARAM; + } + return 0; +} + +/** + * ice_sched_save_agg_bw - save aggregator node's BW information + * @pi: port information structure + * @agg_id: node aggregator ID + * @tc: traffic class + * @rl_type: rate limit type min, max, or shared + * @bw: bandwidth in Kbps - Kilo bits per sec + * + * Save BW information of AGG type node for post replay use. + */ +static enum ice_status +ice_sched_save_agg_bw(struct ice_port_info *pi, u32 agg_id, u8 tc, + enum ice_rl_type rl_type, u32 bw) +{ + struct ice_sched_agg_info *agg_info; + + agg_info = ice_get_agg_info(pi->hw, agg_id); + if (!agg_info) + return ICE_ERR_PARAM; + if (!ice_is_tc_ena(agg_info->tc_bitmap[0], tc)) + return ICE_ERR_PARAM; + switch (rl_type) { + case ICE_MIN_BW: + ice_set_clear_cir_bw(&agg_info->bw_t_info[tc], bw); + break; + case ICE_MAX_BW: + ice_set_clear_eir_bw(&agg_info->bw_t_info[tc], bw); + break; + case ICE_SHARED_BW: + ice_set_clear_shared_bw(&agg_info->bw_t_info[tc], bw); + break; + default: + return ICE_ERR_PARAM; + } + return 0; +} + +/** + * ice_cfg_vsi_bw_lmt_per_tc - configure VSI BW limit per TC + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc: traffic class + * @rl_type: min or max + * @bw: bandwidth in Kbps + * + * This function configures BW limit of VSI scheduling node based on TC + * information. 
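+ *
+ * For example, capping TC 0 of a VSI to 1 Gbps (bw is given in Kbps):
+ *
+ *	status = ice_cfg_vsi_bw_lmt_per_tc(pi, vsi_handle, 0, ICE_MAX_BW,
+ *					   1000000);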
+ */ +enum ice_status +ice_cfg_vsi_bw_lmt_per_tc(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + enum ice_rl_type rl_type, u32 bw) +{ + enum ice_status status; + + status = ice_sched_set_node_bw_lmt_per_tc(pi, vsi_handle, + ICE_AGG_TYPE_VSI, + tc, rl_type, bw); + if (!status) { + mutex_lock(&pi->sched_lock); + status = ice_sched_save_vsi_bw(pi, vsi_handle, tc, rl_type, bw); + mutex_unlock(&pi->sched_lock); + } + return status; +} + +/** + * ice_cfg_vsi_bw_dflt_lmt_per_tc - configure default VSI BW limit per TC + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc: traffic class + * @rl_type: min or max + * + * This function configures default BW limit of VSI scheduling node based on TC + * information. + */ +enum ice_status +ice_cfg_vsi_bw_dflt_lmt_per_tc(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + enum ice_rl_type rl_type) +{ + enum ice_status status; + + status = ice_sched_set_node_bw_lmt_per_tc(pi, vsi_handle, + ICE_AGG_TYPE_VSI, + tc, rl_type, + ICE_SCHED_DFLT_BW); + if (!status) { + mutex_lock(&pi->sched_lock); + status = ice_sched_save_vsi_bw(pi, vsi_handle, tc, rl_type, + ICE_SCHED_DFLT_BW); + mutex_unlock(&pi->sched_lock); + } + return status; +} + +/** + * ice_cfg_agg_bw_lmt_per_tc - configure aggregator BW limit per TC + * @pi: port information structure + * @agg_id: aggregator ID + * @tc: traffic class + * @rl_type: min or max + * @bw: bandwidth in Kbps + * + * This function applies BW limit to aggregator scheduling node based on TC + * information. + */ +enum ice_status +ice_cfg_agg_bw_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, u8 tc, + enum ice_rl_type rl_type, u32 bw) +{ + enum ice_status status; + + status = ice_sched_set_node_bw_lmt_per_tc(pi, agg_id, ICE_AGG_TYPE_AGG, + tc, rl_type, bw); + if (!status) { + mutex_lock(&pi->sched_lock); + status = ice_sched_save_agg_bw(pi, agg_id, tc, rl_type, bw); + mutex_unlock(&pi->sched_lock); + } + return status; +} + +/** + * ice_cfg_agg_bw_dflt_lmt_per_tc - configure aggregator BW default limit per TC + * @pi: port information structure + * @agg_id: aggregator ID + * @tc: traffic class + * @rl_type: min or max + * + * This function applies default BW limit to aggregator scheduling node based + * on TC information. + */ +enum ice_status +ice_cfg_agg_bw_dflt_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, u8 tc, + enum ice_rl_type rl_type) +{ + enum ice_status status; + + status = ice_sched_set_node_bw_lmt_per_tc(pi, agg_id, ICE_AGG_TYPE_AGG, + tc, rl_type, + ICE_SCHED_DFLT_BW); + if (!status) { + mutex_lock(&pi->sched_lock); + status = ice_sched_save_agg_bw(pi, agg_id, tc, rl_type, + ICE_SCHED_DFLT_BW); + mutex_unlock(&pi->sched_lock); + } + return status; +} + +/** + * ice_cfg_vsi_bw_shared_lmt - configure VSI BW shared limit + * @pi: port information structure + * @vsi_handle: software VSI handle + * @min_bw: minimum bandwidth in Kbps + * @max_bw: maximum bandwidth in Kbps + * @shared_bw: shared bandwidth in Kbps + * + * Configure shared rate limiter(SRL) of all VSI type nodes across all traffic + * classes for VSI matching handle. 
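+ *
+ * Passing ICE_SCHED_DFLT_BW for min_bw, max_bw and shared_bw removes the
+ * limiter again; that is what ice_cfg_vsi_bw_no_shared_lmt() below does.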
+ */ +enum ice_status +ice_cfg_vsi_bw_shared_lmt(struct ice_port_info *pi, u16 vsi_handle, u32 min_bw, + u32 max_bw, u32 shared_bw) +{ + return ice_sched_set_vsi_bw_shared_lmt(pi, vsi_handle, min_bw, max_bw, + shared_bw); +} + +/** + * ice_cfg_vsi_bw_no_shared_lmt - configure VSI BW for no shared limiter + * @pi: port information structure + * @vsi_handle: software VSI handle + * + * This function removes the shared rate limiter(SRL) of all VSI type nodes + * across all traffic classes for VSI matching handle. + */ +enum ice_status +ice_cfg_vsi_bw_no_shared_lmt(struct ice_port_info *pi, u16 vsi_handle) +{ + return ice_sched_set_vsi_bw_shared_lmt(pi, vsi_handle, + ICE_SCHED_DFLT_BW, + ICE_SCHED_DFLT_BW, + ICE_SCHED_DFLT_BW); +} + +/** + * ice_cfg_agg_bw_shared_lmt - configure aggregator BW shared limit + * @pi: port information structure + * @agg_id: aggregator ID + * @min_bw: minimum bandwidth in Kbps + * @max_bw: maximum bandwidth in Kbps + * @shared_bw: shared bandwidth in Kbps + * + * This function configures the shared rate limiter(SRL) of all aggregator type + * nodes across all traffic classes for aggregator matching agg_id. + */ +enum ice_status +ice_cfg_agg_bw_shared_lmt(struct ice_port_info *pi, u32 agg_id, u32 min_bw, + u32 max_bw, u32 shared_bw) +{ + return ice_sched_set_agg_bw_shared_lmt(pi, agg_id, min_bw, max_bw, + shared_bw); +} + +/** + * ice_cfg_agg_bw_no_shared_lmt - configure aggregator BW for no shared limiter + * @pi: port information structure + * @agg_id: aggregator ID + * + * This function removes the shared rate limiter(SRL) of all aggregator type + * nodes across all traffic classes for aggregator matching agg_id. + */ +enum ice_status +ice_cfg_agg_bw_no_shared_lmt(struct ice_port_info *pi, u32 agg_id) +{ + return ice_sched_set_agg_bw_shared_lmt(pi, agg_id, ICE_SCHED_DFLT_BW, + ICE_SCHED_DFLT_BW, + ICE_SCHED_DFLT_BW); +} + +/** + * ice_cfg_agg_bw_shared_lmt_per_tc - config aggregator BW shared limit per tc + * @pi: port information structure + * @agg_id: aggregator ID + * @tc: traffic class + * @min_bw: minimum bandwidth in Kbps + * @max_bw: maximum bandwidth in Kbps + * @shared_bw: shared bandwidth in Kbps + * + * This function configures the shared rate limiter(SRL) of all aggregator type + * nodes across all traffic classes for aggregator matching agg_id. + */ +enum ice_status +ice_cfg_agg_bw_shared_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, u8 tc, + u32 min_bw, u32 max_bw, u32 shared_bw) +{ + return ice_sched_set_agg_bw_shared_lmt_per_tc(pi, agg_id, tc, min_bw, + max_bw, shared_bw); +} + +/** + * ice_cfg_agg_bw_no_shared_lmt_per_tc - cfg aggregator BW shared limit per tc + * @pi: port information structure + * @agg_id: aggregator ID + * @tc: traffic class + * + * This function configures the shared rate limiter(SRL) of all aggregator type + * nodes across all traffic classes for aggregator matching agg_id. + */ +enum ice_status +ice_cfg_agg_bw_no_shared_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, u8 tc) +{ + return ice_sched_set_agg_bw_shared_lmt_per_tc(pi, agg_id, tc, + ICE_SCHED_DFLT_BW, + ICE_SCHED_DFLT_BW, + ICE_SCHED_DFLT_BW); +} + +/** + * ice_cfg_vsi_q_priority - config VSI queue priority of node + * @pi: port information structure + * @num_qs: number of VSI queues + * @q_ids: queue IDs array + * @q_prio: queue priority array + * + * This function configures the queue node priority (Sibling Priority) of the + * passed in VSI's queue(s) for a given traffic class (TC). 
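+ *
+ * A rough usage sketch, assuming the caller has already looked up the queue
+ * node TEIDs q0_teid and q1_teid (q_ids holds TEIDs, not queue handles) and
+ * wants the two queues at sibling priorities 0 and 1:
+ *
+ *	u32 q_ids[2] = { q0_teid, q1_teid };
+ *	u8 q_prio[2] = { 0, 1 };
+ *
+ *	status = ice_cfg_vsi_q_priority(pi, 2, q_ids, q_prio);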
+ */ +enum ice_status +ice_cfg_vsi_q_priority(struct ice_port_info *pi, u16 num_qs, u32 *q_ids, + u8 *q_prio) +{ + enum ice_status status = ICE_ERR_PARAM; + u16 i; + + mutex_lock(&pi->sched_lock); + + for (i = 0; i < num_qs; i++) { + struct ice_sched_node *node; + + node = ice_sched_find_node_by_teid(pi->root, q_ids[i]); + if (!node || node->info.data.elem_type != + ICE_AQC_ELEM_TYPE_LEAF) { + status = ICE_ERR_PARAM; + break; + } + /* Configure Priority */ + status = ice_sched_cfg_sibl_node_prio(pi, node, q_prio[i]); + if (status) + break; + } + + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_cfg_agg_vsi_priority_per_tc - config aggregator's VSI priority per TC + * @pi: port information structure + * @agg_id: Aggregator ID + * @num_vsis: number of VSI(s) + * @vsi_handle_arr: array of software VSI handles + * @node_prio: pointer to node priority + * @tc: traffic class + * + * This function configures the node priority (Sibling Priority) of the + * passed in VSI's for a given traffic class (TC) of an Aggregator ID. + */ +enum ice_status +ice_cfg_agg_vsi_priority_per_tc(struct ice_port_info *pi, u32 agg_id, + u16 num_vsis, u16 *vsi_handle_arr, + u8 *node_prio, u8 tc) +{ + struct ice_sched_agg_vsi_info *agg_vsi_info; + struct ice_sched_node *tc_node, *agg_node; + enum ice_status status = ICE_ERR_PARAM; + struct ice_sched_agg_info *agg_info; + bool agg_id_present = false; + struct ice_hw *hw = pi->hw; + u16 i; + + mutex_lock(&pi->sched_lock); + list_for_each_entry(agg_info, &hw->agg_list, list_entry) + if (agg_info->agg_id == agg_id) { + agg_id_present = true; + break; + } + if (!agg_id_present) + goto exit_agg_priority_per_tc; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + goto exit_agg_priority_per_tc; + + agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id); + if (!agg_node) + goto exit_agg_priority_per_tc; + + if (num_vsis > hw->max_children[agg_node->tx_sched_layer]) + goto exit_agg_priority_per_tc; + + for (i = 0; i < num_vsis; i++) { + struct ice_sched_node *vsi_node; + bool vsi_handle_valid = false; + u16 vsi_handle; + + status = ICE_ERR_PARAM; + vsi_handle = vsi_handle_arr[i]; + if (!ice_is_vsi_valid(hw, vsi_handle)) + goto exit_agg_priority_per_tc; + /* Verify child nodes before applying settings */ + list_for_each_entry(agg_vsi_info, &agg_info->agg_vsi_list, + list_entry) + if (agg_vsi_info->vsi_handle == vsi_handle) { + /* cppcheck-suppress unreadVariable */ + vsi_handle_valid = true; + break; + } + + if (!vsi_handle_valid) + goto exit_agg_priority_per_tc; + + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); + if (!vsi_node) + goto exit_agg_priority_per_tc; + + if (ice_sched_find_node_in_subtree(hw, agg_node, vsi_node)) { + /* Configure Priority */ + status = ice_sched_cfg_sibl_node_prio(pi, vsi_node, + node_prio[i]); + if (status) + break; + status = ice_sched_save_vsi_prio(pi, vsi_handle, tc, + node_prio[i]); + if (status) + break; + } + } + +exit_agg_priority_per_tc: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_cfg_vsi_bw_alloc - config VSI BW alloc per TC + * @pi: port information structure + * @vsi_handle: software VSI handle + * @ena_tcmap: enabled TC map + * @rl_type: Rate limit type CIR/EIR + * @bw_alloc: Array of BW alloc + * + * This function configures the BW allocation of the passed in VSI's + * node(s) for enabled traffic class. 
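+ *
+ * A minimal sketch, assuming TC 0 and TC 1 are enabled (ena_tcmap = 0x3) and
+ * should get relative minimum-bandwidth shares of 75 and 25 (entries for
+ * disabled TCs are ignored):
+ *
+ *	u8 bw_alloc[ICE_MAX_TRAFFIC_CLASS] = { 75, 25 };
+ *
+ *	status = ice_cfg_vsi_bw_alloc(pi, vsi_handle, 0x3, ICE_MIN_BW,
+ *				      bw_alloc);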
+ */ +enum ice_status +ice_cfg_vsi_bw_alloc(struct ice_port_info *pi, u16 vsi_handle, u8 ena_tcmap, + enum ice_rl_type rl_type, u8 *bw_alloc) +{ + enum ice_status status = 0; + u8 tc; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + + mutex_lock(&pi->sched_lock); + + /* Return success if no nodes are present across TC */ + ice_for_each_traffic_class(tc) { + struct ice_sched_node *tc_node, *vsi_node; + + if (!ice_is_tc_ena(ena_tcmap, tc)) + continue; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + continue; + + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); + if (!vsi_node) + continue; + + status = ice_sched_cfg_node_bw_alloc(pi->hw, vsi_node, rl_type, + bw_alloc[tc]); + if (status) + break; + status = ice_sched_save_vsi_bw_alloc(pi, vsi_handle, tc, + rl_type, bw_alloc[tc]); + if (status) + break; + } + + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_cfg_agg_bw_alloc - config aggregator BW alloc + * @pi: port information structure + * @agg_id: aggregator ID + * @ena_tcmap: enabled TC map + * @rl_type: rate limit type CIR/EIR + * @bw_alloc: array of BW alloc + * + * This function configures the BW allocation of passed in aggregator for + * enabled traffic class(s). + */ +enum ice_status +ice_cfg_agg_bw_alloc(struct ice_port_info *pi, u32 agg_id, u8 ena_tcmap, + enum ice_rl_type rl_type, u8 *bw_alloc) +{ + struct ice_sched_agg_info *agg_info; + bool agg_id_present = false; + enum ice_status status = 0; + struct ice_hw *hw = pi->hw; + u8 tc; + + mutex_lock(&pi->sched_lock); + list_for_each_entry(agg_info, &hw->agg_list, list_entry) + if (agg_info->agg_id == agg_id) { + agg_id_present = true; + break; + } + if (!agg_id_present) { + status = ICE_ERR_PARAM; + goto exit_cfg_agg_bw_alloc; + } + + /* Return success if no nodes are present across TC */ + ice_for_each_traffic_class(tc) { + struct ice_sched_node *tc_node, *agg_node; + + if (!ice_is_tc_ena(ena_tcmap, tc)) + continue; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + continue; + + agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id); + if (!agg_node) + continue; + + status = ice_sched_cfg_node_bw_alloc(hw, agg_node, rl_type, + bw_alloc[tc]); + if (status) + break; + status = ice_sched_save_agg_bw_alloc(pi, agg_id, tc, rl_type, + bw_alloc[tc]); + if (status) + break; + } + +exit_cfg_agg_bw_alloc: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_sched_calc_wakeup - calculate RL profile wakeup parameter + * @hw: pointer to the HW struct + * @bw: bandwidth in Kbps + * + * This function calculates the wakeup parameter of RL profile. 
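+ *
+ * Shape of the encoding implemented below: if psm_clk_freq / bytes_per_sec
+ * exceeds 63, the ratio is stored directly with bit 15 set; otherwise the
+ * 6-bit integer part is stored shifted left by 9 together with a 9-bit
+ * fractional part. As a rough illustration, assuming a PSM clock near
+ * 446 MHz, bw = 10000 Kbps gives bytes_per_sec = 1250000 and an integer
+ * ratio of about 357, so the first form is used.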
+ */ +static u16 ice_sched_calc_wakeup(struct ice_hw *hw, s32 bw) +{ + s64 bytes_per_sec, wakeup_int, wakeup_a, wakeup_b, wakeup_f; + s32 wakeup_f_int; + u16 wakeup = 0; + + /* Get the wakeup integer value */ + bytes_per_sec = div64_long(((s64)bw * 1000), BITS_PER_BYTE); + wakeup_int = div64_long(hw->psm_clk_freq, bytes_per_sec); + if (wakeup_int > 63) { + wakeup = (u16)((1 << 15) | wakeup_int); + } else { + /* Calculate fraction value up to 4 decimals + * Convert Integer value to a constant multiplier + */ + wakeup_b = (s64)ICE_RL_PROF_MULTIPLIER * wakeup_int; + wakeup_a = div64_long((s64)ICE_RL_PROF_MULTIPLIER * hw->psm_clk_freq, + bytes_per_sec); + + /* Get Fraction value */ + wakeup_f = wakeup_a - wakeup_b; + + /* Round up the Fractional value via Ceil(Fractional value) */ + if (wakeup_f > div64_long(ICE_RL_PROF_MULTIPLIER, 2)) + wakeup_f += 1; + + wakeup_f_int = (s32) div64_long(wakeup_f * ICE_RL_PROF_FRACTION, + ICE_RL_PROF_MULTIPLIER); + wakeup |= (u16)(wakeup_int << 9); + wakeup |= (u16)(0x1ff & wakeup_f_int); + } + + return wakeup; +} + +/** + * ice_sched_bw_to_rl_profile - convert BW to profile parameters + * @hw: pointer to the HW struct + * @bw: bandwidth in Kbps + * @profile: profile parameters to return + * + * This function converts the BW to profile structure format. + */ +static enum ice_status +ice_sched_bw_to_rl_profile(struct ice_hw *hw, u32 bw, + struct ice_aqc_rl_profile_elem *profile) +{ + enum ice_status status = ICE_ERR_PARAM; + s64 bytes_per_sec, ts_rate, mv_tmp; + bool found = false; + s32 encode = 0; + s64 mv = 0; + s32 i; + + /* Bw settings range is from 0.5Mb/sec to 100Gb/sec */ + if (bw < ICE_SCHED_MIN_BW || bw > ICE_SCHED_MAX_BW) + return status; + + /* Bytes per second from Kbps */ + bytes_per_sec = div64_long(((s64)bw * 1000), BITS_PER_BYTE); + + /* encode is 6 bits but really useful are 5 bits */ + for (i = 0; i < 64; i++) { + u64 pow_result = BIT_ULL(i); + + ts_rate = div64_long((s64)hw->psm_clk_freq, + pow_result * ICE_RL_PROF_TS_MULTIPLIER); + if (ts_rate <= 0) + continue; + + /* Multiplier value */ + mv_tmp = div64_long(bytes_per_sec * ICE_RL_PROF_MULTIPLIER, + ts_rate); + + /* Round to the nearest ICE_RL_PROF_MULTIPLIER */ + mv = round_up_64bit(mv_tmp, ICE_RL_PROF_MULTIPLIER); + + /* First multiplier value greater than the given + * accuracy bytes + */ + if (mv > ICE_RL_PROF_ACCURACY_BYTES) { + encode = i; + found = true; + break; + } + } + if (found) { + u16 wm; + + wm = ice_sched_calc_wakeup(hw, bw); + profile->rl_multiply = cpu_to_le16(mv); + profile->wake_up_calc = cpu_to_le16(wm); + profile->rl_encode = cpu_to_le16(encode); + status = 0; + } else { + status = ICE_ERR_DOES_NOT_EXIST; + } + + return status; +} + +/** + * ice_sched_add_rl_profile - add RL profile + * @hw: pointer to the hardware structure + * @rl_type: type of rate limit BW - min, max, or shared + * @bw: bandwidth in Kbps - Kilo bits per sec + * @layer_num: specifies in which layer to create profile + * + * This function first checks the existing list for corresponding BW + * parameter. If it exists, it returns the associated profile otherwise + * it creates a new rate limit profile for requested BW, and adds it to + * the HW DB and local list. It returns the new profile or null on error. + * The caller needs to hold the scheduler lock. 
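+ *
+ * Profiles are keyed per layer by profile type and bandwidth, so a later
+ * request for the same bandwidth and type on the same layer returns the
+ * existing entry instead of programming a second profile in HW.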
+ */ +static struct ice_aqc_rl_profile_info * +ice_sched_add_rl_profile(struct ice_hw *hw, enum ice_rl_type rl_type, + u32 bw, u8 layer_num) +{ + struct ice_aqc_rl_profile_info *rl_prof_elem; + u16 profiles_added = 0, num_profiles = 1; + struct ice_aqc_rl_profile_elem *buf; + enum ice_status status; + u8 profile_type; + + if (layer_num >= ICE_AQC_TOPO_MAX_LEVEL_NUM) + return NULL; + switch (rl_type) { + case ICE_MIN_BW: + profile_type = ICE_AQC_RL_PROFILE_TYPE_CIR; + break; + case ICE_MAX_BW: + profile_type = ICE_AQC_RL_PROFILE_TYPE_EIR; + break; + case ICE_SHARED_BW: + profile_type = ICE_AQC_RL_PROFILE_TYPE_SRL; + break; + default: + return NULL; + } + + if (!hw) + return NULL; + list_for_each_entry(rl_prof_elem, &hw->rl_prof_list[layer_num], + list_entry) + if ((rl_prof_elem->profile.flags & ICE_AQC_RL_PROFILE_TYPE_M) == + profile_type && rl_prof_elem->bw == bw) + /* Return existing profile ID info */ + return rl_prof_elem; + + /* Create new profile ID */ + rl_prof_elem = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*rl_prof_elem), + GFP_KERNEL); + + if (!rl_prof_elem) + return NULL; + + status = ice_sched_bw_to_rl_profile(hw, bw, &rl_prof_elem->profile); + if (status) + goto exit_add_rl_prof; + + rl_prof_elem->bw = bw; + /* layer_num is zero relative, and fw expects level from 1 to 9 */ + rl_prof_elem->profile.level = layer_num + 1; + rl_prof_elem->profile.flags = profile_type; + rl_prof_elem->profile.max_burst_size = cpu_to_le16(hw->max_burst_size); + + /* Create new entry in HW DB */ + buf = &rl_prof_elem->profile; + status = ice_aq_add_rl_profile(hw, num_profiles, buf, sizeof(*buf), + &profiles_added, NULL); + if (status || profiles_added != num_profiles) + goto exit_add_rl_prof; + + /* Good entry - add in the list */ + rl_prof_elem->prof_id_ref = 0; + list_add(&rl_prof_elem->list_entry, &hw->rl_prof_list[layer_num]); + return rl_prof_elem; + +exit_add_rl_prof: + devm_kfree(ice_hw_to_dev(hw), rl_prof_elem); + return NULL; +} + +/** + * ice_sched_cfg_node_bw_lmt - configure node sched params + * @hw: pointer to the HW struct + * @node: sched node to configure + * @rl_type: rate limit type CIR, EIR, or shared + * @rl_prof_id: rate limit profile ID + * + * This function configures node element's BW limit. + */ +static enum ice_status +ice_sched_cfg_node_bw_lmt(struct ice_hw *hw, struct ice_sched_node *node, + enum ice_rl_type rl_type, u16 rl_prof_id) +{ + struct ice_aqc_txsched_elem_data buf; + struct ice_aqc_txsched_elem *data; + + buf = node->info; + data = &buf.data; + switch (rl_type) { + case ICE_MIN_BW: + data->valid_sections |= ICE_AQC_ELEM_VALID_CIR; + data->cir_bw.bw_profile_idx = cpu_to_le16(rl_prof_id); + break; + case ICE_MAX_BW: + data->valid_sections |= ICE_AQC_ELEM_VALID_EIR; + data->eir_bw.bw_profile_idx = cpu_to_le16(rl_prof_id); + break; + case ICE_SHARED_BW: + data->valid_sections |= ICE_AQC_ELEM_VALID_SHARED; + data->srl_id = cpu_to_le16(rl_prof_id); + break; + default: + /* Unknown rate limit type */ + return ICE_ERR_PARAM; + } + + /* Configure element */ + return ice_sched_update_elem(hw, node, &buf); +} + +/** + * ice_sched_get_node_rl_prof_id - get node's rate limit profile ID + * @node: sched node + * @rl_type: rate limit type + * + * If existing profile matches, it returns the corresponding rate + * limit profile ID, otherwise it returns an invalid ID as error. 
+ */ +static u16 +ice_sched_get_node_rl_prof_id(struct ice_sched_node *node, + enum ice_rl_type rl_type) +{ + u16 rl_prof_id = ICE_SCHED_INVAL_PROF_ID; + struct ice_aqc_txsched_elem *data; + + data = &node->info.data; + switch (rl_type) { + case ICE_MIN_BW: + if (data->valid_sections & ICE_AQC_ELEM_VALID_CIR) + rl_prof_id = le16_to_cpu(data->cir_bw.bw_profile_idx); + break; + case ICE_MAX_BW: + if (data->valid_sections & ICE_AQC_ELEM_VALID_EIR) + rl_prof_id = le16_to_cpu(data->eir_bw.bw_profile_idx); + break; + case ICE_SHARED_BW: + if (data->valid_sections & ICE_AQC_ELEM_VALID_SHARED) + rl_prof_id = le16_to_cpu(data->srl_id); + break; + default: + break; + } + + return rl_prof_id; +} + +/** + * ice_sched_get_rl_prof_layer - selects rate limit profile creation layer + * @pi: port information structure + * @rl_type: type of rate limit BW - min, max, or shared + * @layer_index: layer index + * + * This function returns requested profile creation layer. + */ +static u8 +ice_sched_get_rl_prof_layer(struct ice_port_info *pi, enum ice_rl_type rl_type, + u8 layer_index) +{ + struct ice_hw *hw = pi->hw; + + if (layer_index >= hw->num_tx_sched_layers) + return ICE_SCHED_INVAL_LAYER_NUM; + switch (rl_type) { + case ICE_MIN_BW: + if (hw->layer_info[layer_index].max_cir_rl_profiles) + return layer_index; + break; + case ICE_MAX_BW: + if (hw->layer_info[layer_index].max_eir_rl_profiles) + return layer_index; + break; + case ICE_SHARED_BW: + /* if current layer doesn't support SRL profile creation + * then try a layer up or down. + */ + if (hw->layer_info[layer_index].max_srl_profiles) + return layer_index; + else if (layer_index < hw->num_tx_sched_layers - 1 && + hw->layer_info[layer_index + 1].max_srl_profiles) + return layer_index + 1; + else if (layer_index > 0 && + hw->layer_info[layer_index - 1].max_srl_profiles) + return layer_index - 1; + break; + default: + break; + } + return ICE_SCHED_INVAL_LAYER_NUM; +} + +/** + * ice_sched_get_srl_node - get shared rate limit node + * @node: tree node + * @srl_layer: shared rate limit layer + * + * This function returns SRL node to be used for shared rate limit purpose. + * The caller needs to hold scheduler lock. + */ +static struct ice_sched_node * +ice_sched_get_srl_node(struct ice_sched_node *node, u8 srl_layer) +{ + if (srl_layer > node->tx_sched_layer) + return node->children[0]; + else if (srl_layer < node->tx_sched_layer) + /* Node can't be created without a parent. It will always + * have a valid parent except root node. + */ + return node->parent; + else + return node; +} + +/** + * ice_sched_rm_rl_profile - remove RL profile ID + * @hw: pointer to the hardware structure + * @layer_num: layer number where profiles are saved + * @profile_type: profile type like EIR, CIR, or SRL + * @profile_id: profile ID to remove + * + * This function removes rate limit profile from layer 'layer_num' of type + * 'profile_type' and profile ID as 'profile_id'. The caller needs to hold + * scheduler lock. 
+ */ +static enum ice_status +ice_sched_rm_rl_profile(struct ice_hw *hw, u8 layer_num, u8 profile_type, + u16 profile_id) +{ + struct ice_aqc_rl_profile_info *rl_prof_elem; + enum ice_status status = 0; + + if (layer_num >= ICE_AQC_TOPO_MAX_LEVEL_NUM) + return ICE_ERR_PARAM; + /* Check the existing list for RL profile */ + list_for_each_entry(rl_prof_elem, &hw->rl_prof_list[layer_num], + list_entry) + if ((rl_prof_elem->profile.flags & ICE_AQC_RL_PROFILE_TYPE_M) == + profile_type && + le16_to_cpu(rl_prof_elem->profile.profile_id) == + profile_id) { + if (rl_prof_elem->prof_id_ref) + rl_prof_elem->prof_id_ref--; + + /* Remove old profile ID from database */ + status = ice_sched_del_rl_profile(hw, rl_prof_elem); + if (status && status != ICE_ERR_IN_USE) + ice_debug(hw, ICE_DBG_SCHED, "Remove rl profile failed\n"); + break; + } + if (status == ICE_ERR_IN_USE) + status = 0; + return status; +} + +/** + * ice_sched_set_node_bw_dflt - set node's bandwidth limit to default + * @pi: port information structure + * @node: pointer to node structure + * @rl_type: rate limit type min, max, or shared + * @layer_num: layer number where RL profiles are saved + * + * This function configures node element's BW rate limit profile ID of + * type CIR, EIR, or SRL to default. This function needs to be called + * with the scheduler lock held. + */ +static enum ice_status +ice_sched_set_node_bw_dflt(struct ice_port_info *pi, + struct ice_sched_node *node, + enum ice_rl_type rl_type, u8 layer_num) +{ + enum ice_status status; + struct ice_hw *hw; + u8 profile_type; + u16 rl_prof_id; + u16 old_id; + + hw = pi->hw; + switch (rl_type) { + case ICE_MIN_BW: + profile_type = ICE_AQC_RL_PROFILE_TYPE_CIR; + rl_prof_id = ICE_SCHED_DFLT_RL_PROF_ID; + break; + case ICE_MAX_BW: + profile_type = ICE_AQC_RL_PROFILE_TYPE_EIR; + rl_prof_id = ICE_SCHED_DFLT_RL_PROF_ID; + break; + case ICE_SHARED_BW: + profile_type = ICE_AQC_RL_PROFILE_TYPE_SRL; + /* No SRL is configured for default case */ + rl_prof_id = ICE_SCHED_NO_SHARED_RL_PROF_ID; + break; + default: + return ICE_ERR_PARAM; + } + /* Save existing RL prof ID for later clean up */ + old_id = ice_sched_get_node_rl_prof_id(node, rl_type); + /* Configure BW scheduling parameters */ + status = ice_sched_cfg_node_bw_lmt(hw, node, rl_type, rl_prof_id); + if (status) + return status; + + /* Remove stale RL profile ID */ + if (old_id == ICE_SCHED_DFLT_RL_PROF_ID || + old_id == ICE_SCHED_INVAL_PROF_ID) + return 0; + + return ice_sched_rm_rl_profile(hw, layer_num, profile_type, old_id); +} + +/** + * ice_sched_set_node_bw - set node's bandwidth + * @pi: port information structure + * @node: tree node + * @rl_type: rate limit type min, max, or shared + * @bw: bandwidth in Kbps - Kilo bits per sec + * @layer_num: layer number + * + * This function adds new profile corresponding to requested BW, configures + * node's RL profile ID of type CIR, EIR, or SRL, and removes old profile + * ID from local database. The caller needs to hold scheduler lock. 
+ */ +static enum ice_status +ice_sched_set_node_bw(struct ice_port_info *pi, struct ice_sched_node *node, + enum ice_rl_type rl_type, u32 bw, u8 layer_num) +{ + struct ice_aqc_rl_profile_info *rl_prof_info; + enum ice_status status = ICE_ERR_PARAM; + struct ice_hw *hw = pi->hw; + u16 old_id, rl_prof_id; + + rl_prof_info = ice_sched_add_rl_profile(hw, rl_type, bw, layer_num); + if (!rl_prof_info) + return status; + + rl_prof_id = le16_to_cpu(rl_prof_info->profile.profile_id); + + /* Save existing RL prof ID for later clean up */ + old_id = ice_sched_get_node_rl_prof_id(node, rl_type); + /* Configure BW scheduling parameters */ + status = ice_sched_cfg_node_bw_lmt(hw, node, rl_type, rl_prof_id); + if (status) + return status; + + /* New changes have been applied */ + /* Increment the profile ID reference count */ + rl_prof_info->prof_id_ref++; + + /* Check for old ID removal */ + if ((old_id == ICE_SCHED_DFLT_RL_PROF_ID && rl_type != ICE_SHARED_BW) || + old_id == ICE_SCHED_INVAL_PROF_ID || old_id == rl_prof_id) + return 0; + + return ice_sched_rm_rl_profile(hw, layer_num, + rl_prof_info->profile.flags & + ICE_AQC_RL_PROFILE_TYPE_M, old_id); +} + +/** + * ice_sched_set_node_bw_lmt - set node's BW limit + * @pi: port information structure + * @node: tree node + * @rl_type: rate limit type min, max, or shared + * @bw: bandwidth in Kbps - Kilo bits per sec + * + * It updates node's BW limit parameters like BW RL profile ID of type CIR, + * EIR, or SRL. The caller needs to hold scheduler lock. + * + * NOTE: Caller provides the correct SRL node in case of shared profile + * settings. + */ +static enum ice_status +ice_sched_set_node_bw_lmt(struct ice_port_info *pi, struct ice_sched_node *node, + enum ice_rl_type rl_type, u32 bw) +{ + struct ice_hw *hw; + u8 layer_num; + + if (!pi) + return ICE_ERR_PARAM; + hw = pi->hw; + /* Remove unused RL profile IDs from HW and SW DB */ + ice_sched_rm_unused_rl_prof(hw); + + layer_num = ice_sched_get_rl_prof_layer(pi, rl_type, + node->tx_sched_layer); + if (layer_num >= hw->num_tx_sched_layers) + return ICE_ERR_PARAM; + + if (bw == ICE_SCHED_DFLT_BW) + return ice_sched_set_node_bw_dflt(pi, node, rl_type, layer_num); + return ice_sched_set_node_bw(pi, node, rl_type, bw, layer_num); +} + + +/** + * ice_sched_set_node_bw_dflt_lmt - set node's BW limit to default + * @pi: port information structure + * @node: pointer to node structure + * @rl_type: rate limit type min, max, or shared + * + * This function configures node element's BW rate limit profile ID of + * type CIR, EIR, or SRL to default. This function needs to be called + * with the scheduler lock held. + */ +static enum ice_status +ice_sched_set_node_bw_dflt_lmt(struct ice_port_info *pi, + struct ice_sched_node *node, + enum ice_rl_type rl_type) +{ + return ice_sched_set_node_bw_lmt(pi, node, rl_type, + ICE_SCHED_DFLT_BW); +} + +/** + * ice_sched_validate_srl_node - Check node for SRL applicability + * @node: sched node to configure + * @sel_layer: selected SRL layer + * + * This function checks if the SRL can be applied to a selected layer node on + * behalf of the requested node (first argument). This function needs to be + * called with scheduler lock held. + */ +static enum ice_status +ice_sched_validate_srl_node(struct ice_sched_node *node, u8 sel_layer) +{ + /* SRL profiles are not available on all layers. Check if the + * SRL profile can be applied to a node above or below the + * requested node. SRL configuration is possible only if the + * selected layer's node has a single child. 
+ */ + if (sel_layer == node->tx_sched_layer || + ((sel_layer == node->tx_sched_layer + 1) && + node->num_children == 1) || + ((sel_layer == node->tx_sched_layer - 1) && + (node->parent && node->parent->num_children == 1))) + return 0; + + return ICE_ERR_CFG; +} + +/** + * ice_sched_save_q_bw - save queue node's BW information + * @q_ctx: queue context structure + * @rl_type: rate limit type min, max, or shared + * @bw: bandwidth in Kbps - Kilo bits per sec + * + * Save BW information of queue type node for post replay use. + */ +static enum ice_status +ice_sched_save_q_bw(struct ice_q_ctx *q_ctx, enum ice_rl_type rl_type, u32 bw) +{ + switch (rl_type) { + case ICE_MIN_BW: + ice_set_clear_cir_bw(&q_ctx->bw_t_info, bw); + break; + case ICE_MAX_BW: + ice_set_clear_eir_bw(&q_ctx->bw_t_info, bw); + break; + case ICE_SHARED_BW: + ice_set_clear_shared_bw(&q_ctx->bw_t_info, bw); + break; + default: + return ICE_ERR_PARAM; + } + return 0; +} + +/** + * ice_sched_set_q_bw_lmt - sets queue BW limit + * @pi: port information structure + * @vsi_handle: sw VSI handle + * @tc: traffic class + * @q_handle: software queue handle + * @rl_type: min, max, or shared + * @bw: bandwidth in Kbps + * + * This function sets BW limit of queue scheduling node. + */ +static enum ice_status +ice_sched_set_q_bw_lmt(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + u16 q_handle, enum ice_rl_type rl_type, u32 bw) +{ + enum ice_status status = ICE_ERR_PARAM; + struct ice_sched_node *node; + struct ice_q_ctx *q_ctx; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + mutex_lock(&pi->sched_lock); + q_ctx = ice_get_lan_q_ctx(pi->hw, vsi_handle, tc, q_handle); + if (!q_ctx) + goto exit_q_bw_lmt; + node = ice_sched_find_node_by_teid(pi->root, q_ctx->q_teid); + if (!node) { + ice_debug(pi->hw, ICE_DBG_SCHED, "Wrong q_teid\n"); + goto exit_q_bw_lmt; + } + + /* Return error if it is not a leaf node */ + if (node->info.data.elem_type != ICE_AQC_ELEM_TYPE_LEAF) + goto exit_q_bw_lmt; + + /* SRL bandwidth layer selection */ + if (rl_type == ICE_SHARED_BW) { + u8 sel_layer; /* selected layer */ + + sel_layer = ice_sched_get_rl_prof_layer(pi, rl_type, + node->tx_sched_layer); + if (sel_layer >= pi->hw->num_tx_sched_layers) { + status = ICE_ERR_PARAM; + goto exit_q_bw_lmt; + } + status = ice_sched_validate_srl_node(node, sel_layer); + if (status) + goto exit_q_bw_lmt; + } + + if (bw == ICE_SCHED_DFLT_BW) + status = ice_sched_set_node_bw_dflt_lmt(pi, node, rl_type); + else + status = ice_sched_set_node_bw_lmt(pi, node, rl_type, bw); + + if (!status) + status = ice_sched_save_q_bw(q_ctx, rl_type, bw); + +exit_q_bw_lmt: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_cfg_q_bw_lmt - configure queue BW limit + * @pi: port information structure + * @vsi_handle: sw VSI handle + * @tc: traffic class + * @q_handle: software queue handle + * @rl_type: min, max, or shared + * @bw: bandwidth in Kbps + * + * This function configures BW limit of queue scheduling node. + */ +enum ice_status +ice_cfg_q_bw_lmt(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + u16 q_handle, enum ice_rl_type rl_type, u32 bw) +{ + return ice_sched_set_q_bw_lmt(pi, vsi_handle, tc, q_handle, rl_type, + bw); +} + +/** + * ice_cfg_q_bw_dflt_lmt - configure queue BW default limit + * @pi: port information structure + * @vsi_handle: sw VSI handle + * @tc: traffic class + * @q_handle: software queue handle + * @rl_type: min, max, or shared + * + * This function configures BW default limit of queue scheduling node. 
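The two exported wrappers here, ice_cfg_q_bw_lmt() and ice_cfg_q_bw_dflt_lmt() just below, are the intended entry points for per-queue rate limiting. A minimal usage sketch, assuming a valid port_info, VSI handle, TC and software queue handle; the numeric values are placeholders only:

static void example_cap_tx_queue(struct ice_port_info *pi, u16 vsi_handle)
{
	/* Cap queue handle 0 of TC 0 to 500000 Kbps (~500 Mbps) */
	if (ice_cfg_q_bw_lmt(pi, vsi_handle, 0, 0, ICE_MAX_BW, 500000))
		return;

	/* ... and later fall back to the scheduler default for that queue */
	ice_cfg_q_bw_dflt_lmt(pi, vsi_handle, 0, 0, ICE_MAX_BW);
}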
+ */ +enum ice_status +ice_cfg_q_bw_dflt_lmt(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + u16 q_handle, enum ice_rl_type rl_type) +{ + return ice_sched_set_q_bw_lmt(pi, vsi_handle, tc, q_handle, rl_type, + ICE_SCHED_DFLT_BW); +} + +/** + * ice_sched_save_tc_node_bw - save TC node BW limit + * @pi: port information structure + * @tc: TC number + * @rl_type: min or max + * @bw: bandwidth in Kbps + * + * This function saves the modified values of bandwidth settings for later + * replay purpose (restore) after reset. + */ +static enum ice_status +ice_sched_save_tc_node_bw(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type, u32 bw) +{ + if (tc >= ICE_MAX_TRAFFIC_CLASS) + return ICE_ERR_PARAM; + switch (rl_type) { + case ICE_MIN_BW: + ice_set_clear_cir_bw(&pi->tc_node_bw_t_info[tc], bw); + break; + case ICE_MAX_BW: + ice_set_clear_eir_bw(&pi->tc_node_bw_t_info[tc], bw); + break; + case ICE_SHARED_BW: + ice_set_clear_shared_bw(&pi->tc_node_bw_t_info[tc], bw); + break; + default: + return ICE_ERR_PARAM; + } + return 0; +} + +/** + * ice_sched_set_tc_node_bw_lmt - sets TC node BW limit + * @pi: port information structure + * @tc: TC number + * @rl_type: min or max + * @bw: bandwidth in Kbps + * + * This function configures bandwidth limit of TC node. + */ +static enum ice_status +ice_sched_set_tc_node_bw_lmt(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type, u32 bw) +{ + enum ice_status status = ICE_ERR_PARAM; + struct ice_sched_node *tc_node; + + if (tc >= ICE_MAX_TRAFFIC_CLASS) + return status; + mutex_lock(&pi->sched_lock); + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + goto exit_set_tc_node_bw; + if (bw == ICE_SCHED_DFLT_BW) + status = ice_sched_set_node_bw_dflt_lmt(pi, tc_node, rl_type); + else + status = ice_sched_set_node_bw_lmt(pi, tc_node, rl_type, bw); + if (!status) + status = ice_sched_save_tc_node_bw(pi, tc, rl_type, bw); + +exit_set_tc_node_bw: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_cfg_tc_node_bw_lmt - configure TC node BW limit + * @pi: port information structure + * @tc: TC number + * @rl_type: min or max + * @bw: bandwidth in Kbps + * + * This function configures BW limit of TC node. + * Note: The minimum guaranteed reservation is done via DCBX. + */ +enum ice_status +ice_cfg_tc_node_bw_lmt(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type, u32 bw) +{ + return ice_sched_set_tc_node_bw_lmt(pi, tc, rl_type, bw); +} + +/** + * ice_cfg_tc_node_bw_dflt_lmt - configure TC node BW default limit + * @pi: port information structure + * @tc: TC number + * @rl_type: min or max + * + * This function configures BW default limit of TC node. + */ +enum ice_status +ice_cfg_tc_node_bw_dflt_lmt(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type) +{ + return ice_sched_set_tc_node_bw_lmt(pi, tc, rl_type, ICE_SCHED_DFLT_BW); +} + +/** + * ice_sched_save_tc_node_bw_alloc - save TC node's BW alloc information + * @pi: port information structure + * @tc: traffic class + * @rl_type: rate limit type min or max + * @bw_alloc: Bandwidth allocation information + * + * Save BW alloc information of VSI type node for post replay use. 
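As a usage sketch for the TC-level wrappers above, capping one traffic class and then removing the cap again; the TC number and rate are placeholders, and as noted above the minimum guarantee itself comes from DCBX:

static void example_cap_tc(struct ice_port_info *pi)
{
	/* Limit TC 2 to 1000000 Kbps (1 Gbps) via the max (EIR) profile */
	if (ice_cfg_tc_node_bw_lmt(pi, 2, ICE_MAX_BW, 1000000))
		return;

	/* Drop back to the default profile to remove the cap */
	ice_cfg_tc_node_bw_dflt_lmt(pi, 2, ICE_MAX_BW);
}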
+ */ +static enum ice_status +ice_sched_save_tc_node_bw_alloc(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type, u16 bw_alloc) +{ + if (tc >= ICE_MAX_TRAFFIC_CLASS) + return ICE_ERR_PARAM; + switch (rl_type) { + case ICE_MIN_BW: + ice_set_clear_cir_bw_alloc(&pi->tc_node_bw_t_info[tc], + bw_alloc); + break; + case ICE_MAX_BW: + ice_set_clear_eir_bw_alloc(&pi->tc_node_bw_t_info[tc], + bw_alloc); + break; + default: + return ICE_ERR_PARAM; + } + return 0; +} + +/** + * ice_sched_set_tc_node_bw_alloc - set TC node BW alloc + * @pi: port information structure + * @tc: TC number + * @rl_type: min or max + * @bw_alloc: bandwidth alloc + * + * This function configures bandwidth alloc of TC node, also saves the + * changed settings for replay purpose, and return success if it succeeds + * in modifying bandwidth alloc setting. + */ +static enum ice_status +ice_sched_set_tc_node_bw_alloc(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type, u8 bw_alloc) +{ + enum ice_status status = ICE_ERR_PARAM; + struct ice_sched_node *tc_node; + + if (tc >= ICE_MAX_TRAFFIC_CLASS) + return status; + mutex_lock(&pi->sched_lock); + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + goto exit_set_tc_node_bw_alloc; + status = ice_sched_cfg_node_bw_alloc(pi->hw, tc_node, rl_type, + bw_alloc); + if (status) + goto exit_set_tc_node_bw_alloc; + status = ice_sched_save_tc_node_bw_alloc(pi, tc, rl_type, bw_alloc); + +exit_set_tc_node_bw_alloc: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_cfg_tc_node_bw_alloc - configure TC node BW alloc + * @pi: port information structure + * @tc: TC number + * @rl_type: min or max + * @bw_alloc: bandwidth alloc + * + * This function configures BW limit of TC node. + * Note: The minimum guaranteed reservation is done via DCBX. + */ +enum ice_status +ice_cfg_tc_node_bw_alloc(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type, u8 bw_alloc) +{ + return ice_sched_set_tc_node_bw_alloc(pi, tc, rl_type, bw_alloc); +} + +/** + * ice_sched_set_agg_bw_dflt_lmt - set aggregator node's BW limit to default + * @pi: port information structure + * @vsi_handle: software VSI handle + * + * This function retrieves the aggregator ID based on VSI ID and TC, + * and sets node's BW limit to default. This function needs to be + * called with the scheduler lock held. + */ +enum ice_status +ice_sched_set_agg_bw_dflt_lmt(struct ice_port_info *pi, u16 vsi_handle) +{ + struct ice_vsi_ctx *vsi_ctx; + enum ice_status status = 0; + u8 tc; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + vsi_ctx = ice_get_vsi_ctx(pi->hw, vsi_handle); + if (!vsi_ctx) + return ICE_ERR_PARAM; + + ice_for_each_traffic_class(tc) { + struct ice_sched_node *node; + + node = vsi_ctx->sched.ag_node[tc]; + if (!node) + continue; + + /* Set min profile to default */ + status = ice_sched_set_node_bw_dflt_lmt(pi, node, ICE_MIN_BW); + if (status) + break; + + /* Set max profile to default */ + status = ice_sched_set_node_bw_dflt_lmt(pi, node, ICE_MAX_BW); + if (status) + break; + + /* Remove shared profile, if there is one */ + status = ice_sched_set_node_bw_dflt_lmt(pi, node, + ICE_SHARED_BW); + if (status) + break; + } + + return status; +} + +/** + * ice_sched_get_node_by_id_type - get node from ID type + * @pi: port information structure + * @id: identifier + * @agg_type: type of aggregator + * @tc: traffic class + * + * This function returns node identified by ID of type aggregator, and + * based on traffic class (TC). 
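A sketch of driving ice_cfg_tc_node_bw_alloc() from above. The exact scale of @bw_alloc is not spelled out in this hunk, so the values below are meant only as relative weights between two TCs:

static void example_tc_bw_split(struct ice_port_info *pi)
{
	/* Weight the minimum (CIR) allocation 75/25 between TC 0 and TC 1 */
	ice_cfg_tc_node_bw_alloc(pi, 0, ICE_MIN_BW, 75);
	ice_cfg_tc_node_bw_alloc(pi, 1, ICE_MIN_BW, 25);
}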
This function needs to be called with + * the scheduler lock held. + */ +static struct ice_sched_node * +ice_sched_get_node_by_id_type(struct ice_port_info *pi, u32 id, + enum ice_agg_type agg_type, u8 tc) +{ + struct ice_sched_node *node = NULL; + struct ice_sched_node *child_node; + + switch (agg_type) { + case ICE_AGG_TYPE_VSI: { + struct ice_vsi_ctx *vsi_ctx; + u16 vsi_handle = (u16)id; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + break; + /* Get sched_vsi_info */ + vsi_ctx = ice_get_vsi_ctx(pi->hw, vsi_handle); + if (!vsi_ctx) + break; + node = vsi_ctx->sched.vsi_node[tc]; + break; + } + + case ICE_AGG_TYPE_AGG: { + struct ice_sched_node *tc_node; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (tc_node) + node = ice_sched_get_agg_node(pi, tc_node, id); + break; + } + + case ICE_AGG_TYPE_Q: + /* The current implementation allows single queue to modify */ + node = ice_sched_get_node(pi, id); + break; + + case ICE_AGG_TYPE_QG: + /* The current implementation allows single qg to modify */ + child_node = ice_sched_get_node(pi, id); + if (!child_node) + break; + node = child_node->parent; + break; + + default: + break; + } + + return node; +} + +/** + * ice_sched_set_node_bw_lmt_per_tc - set node BW limit per TC + * @pi: port information structure + * @id: ID (software VSI handle or AGG ID) + * @agg_type: aggregator type (VSI or AGG type node) + * @tc: traffic class + * @rl_type: min or max + * @bw: bandwidth in Kbps + * + * This function sets BW limit of VSI or Aggregator scheduling node + * based on TC information from passed in argument BW. + */ +enum ice_status +ice_sched_set_node_bw_lmt_per_tc(struct ice_port_info *pi, u32 id, + enum ice_agg_type agg_type, u8 tc, + enum ice_rl_type rl_type, u32 bw) +{ + enum ice_status status = ICE_ERR_PARAM; + struct ice_sched_node *node; + + if (!pi) + return status; + + if (rl_type == ICE_UNKNOWN_BW) + return status; + + mutex_lock(&pi->sched_lock); + node = ice_sched_get_node_by_id_type(pi, id, agg_type, tc); + if (!node) { + ice_debug(pi->hw, ICE_DBG_SCHED, "Wrong id, agg type, or tc\n"); + goto exit_set_node_bw_lmt_per_tc; + } + if (bw == ICE_SCHED_DFLT_BW) + status = ice_sched_set_node_bw_dflt_lmt(pi, node, rl_type); + else + status = ice_sched_set_node_bw_lmt(pi, node, rl_type, bw); + +exit_set_node_bw_lmt_per_tc: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_sched_validate_vsi_srl_node - validate VSI SRL node + * @pi: port information structure + * @vsi_handle: software VSI handle + * + * This function validates SRL node of the VSI node if available SRL layer is + * different than the VSI node layer on all TC(s).This function needs to be + * called with scheduler lock held. 
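ice_sched_set_node_bw_lmt_per_tc() above accepts either a VSI handle or an aggregator ID, selected by @agg_type. A short sketch with placeholder IDs and rates:

static void example_per_tc_limits(struct ice_port_info *pi, u16 vsi_handle,
				  u32 agg_id)
{
	/* Cap the VSI node of TC 0 at 2000000 Kbps (2 Gbps) */
	ice_sched_set_node_bw_lmt_per_tc(pi, vsi_handle, ICE_AGG_TYPE_VSI,
					 0, ICE_MAX_BW, 2000000);

	/* Restore the aggregator node of TC 3 to the default max profile */
	ice_sched_set_node_bw_lmt_per_tc(pi, agg_id, ICE_AGG_TYPE_AGG,
					 3, ICE_MAX_BW, ICE_SCHED_DFLT_BW);
}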
+ */ +static enum ice_status +ice_sched_validate_vsi_srl_node(struct ice_port_info *pi, u16 vsi_handle) +{ + u8 sel_layer = ICE_SCHED_INVAL_LAYER_NUM; + u8 tc; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + + /* Return success if no nodes are present across TC */ + ice_for_each_traffic_class(tc) { + struct ice_sched_node *tc_node, *vsi_node; + enum ice_rl_type rl_type = ICE_SHARED_BW; + enum ice_status status; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + continue; + + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); + if (!vsi_node) + continue; + + /* SRL bandwidth layer selection */ + if (sel_layer == ICE_SCHED_INVAL_LAYER_NUM) { + u8 node_layer = vsi_node->tx_sched_layer; + u8 layer_num; + + layer_num = ice_sched_get_rl_prof_layer(pi, rl_type, + node_layer); + if (layer_num >= pi->hw->num_tx_sched_layers) + return ICE_ERR_PARAM; + sel_layer = layer_num; + } + + status = ice_sched_validate_srl_node(vsi_node, sel_layer); + if (status) + return status; + } + return 0; +} + +/** + * ice_sched_set_save_vsi_srl_node_bw - set VSI shared limit values + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc: traffic class + * @srl_node: sched node to configure + * @rl_type: rate limit type minimum, maximum, or shared + * @bw: minimum, maximum, or shared bandwidth in Kbps + * + * Configure shared rate limiter(SRL) of VSI type nodes across given traffic + * class, and saves those value for later use for replaying purposes. The + * caller holds the scheduler lock. + */ +static enum ice_status +ice_sched_set_save_vsi_srl_node_bw(struct ice_port_info *pi, u16 vsi_handle, + u8 tc, struct ice_sched_node *srl_node, + enum ice_rl_type rl_type, u32 bw) +{ + enum ice_status status; + + if (bw == ICE_SCHED_DFLT_BW) { + status = ice_sched_set_node_bw_dflt_lmt(pi, srl_node, rl_type); + } else { + status = ice_sched_set_node_bw_lmt(pi, srl_node, rl_type, bw); + if (status) + return status; + status = ice_sched_save_vsi_bw(pi, vsi_handle, tc, rl_type, bw); + } + return status; +} + +/** + * ice_sched_set_vsi_node_srl_per_tc - set VSI node BW shared limit for tc + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc: traffic class + * @min_bw: minimum bandwidth in Kbps + * @max_bw: maximum bandwidth in Kbps + * @shared_bw: shared bandwidth in Kbps + * + * Configure shared rate limiter(SRL) of VSI type nodes across requested + * traffic class for VSI matching handle. When BW value of ICE_SCHED_DFLT_BW + * is passed, it removes the corresponding bw from the node. The caller + * holds scheduler lock. 
+ */ +static enum ice_status +ice_sched_set_vsi_node_srl_per_tc(struct ice_port_info *pi, u16 vsi_handle, + u8 tc, u32 min_bw, u32 max_bw, u32 shared_bw) +{ + struct ice_sched_node *tc_node, *vsi_node, *cfg_node; + enum ice_status status; + u8 layer_num; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + return ICE_ERR_CFG; + + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); + if (!vsi_node) + return ICE_ERR_CFG; + + layer_num = ice_sched_get_rl_prof_layer(pi, ICE_SHARED_BW, + vsi_node->tx_sched_layer); + if (layer_num >= pi->hw->num_tx_sched_layers) + return ICE_ERR_PARAM; + + /* SRL node may be different */ + cfg_node = ice_sched_get_srl_node(vsi_node, layer_num); + if (!cfg_node) + return ICE_ERR_CFG; + + status = ice_sched_set_save_vsi_srl_node_bw(pi, vsi_handle, tc, + cfg_node, ICE_MIN_BW, + min_bw); + if (status) + return status; + + status = ice_sched_set_save_vsi_srl_node_bw(pi, vsi_handle, tc, + cfg_node, ICE_MAX_BW, + max_bw); + if (status) + return status; + + return ice_sched_set_save_vsi_srl_node_bw(pi, vsi_handle, tc, cfg_node, + ICE_SHARED_BW, shared_bw); +} + +/** + * ice_sched_set_vsi_bw_shared_lmt - set VSI BW shared limit + * @pi: port information structure + * @vsi_handle: software VSI handle + * @min_bw: minimum bandwidth in Kbps + * @max_bw: maximum bandwidth in Kbps + * @shared_bw: shared bandwidth in Kbps + * + * Configure shared rate limiter(SRL) of all VSI type nodes across all traffic + * classes for VSI matching handle. When BW value of ICE_SCHED_DFLT_BW is + * passed, it removes those value(s) from the node. + */ +enum ice_status +ice_sched_set_vsi_bw_shared_lmt(struct ice_port_info *pi, u16 vsi_handle, + u32 min_bw, u32 max_bw, u32 shared_bw) +{ + enum ice_status status = 0; + u8 tc; + + if (!pi) + return ICE_ERR_PARAM; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + + mutex_lock(&pi->sched_lock); + status = ice_sched_validate_vsi_srl_node(pi, vsi_handle); + if (status) + goto exit_set_vsi_bw_shared_lmt; + /* Return success if no nodes are present across TC */ + ice_for_each_traffic_class(tc) { + struct ice_sched_node *tc_node, *vsi_node; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + continue; + + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); + if (!vsi_node) + continue; + + status = ice_sched_set_vsi_node_srl_per_tc(pi, vsi_handle, tc, + min_bw, max_bw, + shared_bw); + if (status) + break; + } + +exit_set_vsi_bw_shared_lmt: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_sched_validate_agg_srl_node - validate AGG SRL node + * @pi: port information structure + * @agg_id: aggregator ID + * + * This function validates SRL node of the AGG node if available SRL layer is + * different than the AGG node layer on all TC(s).This function needs to be + * called with scheduler lock held. 
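A usage sketch for ice_sched_set_vsi_bw_shared_lmt() above: apply a min/max/shared triple across every TC of one VSI, then remove it again by passing ICE_SCHED_DFLT_BW. All rates are in Kbps and the values are placeholders:

static void example_vsi_shared_limit(struct ice_port_info *pi, u16 vsi_handle)
{
	/* 100 Mbps min, 1 Gbps max, 500 Mbps shared (SRL) */
	ice_sched_set_vsi_bw_shared_lmt(pi, vsi_handle, 100000, 1000000,
					500000);

	/* Passing the default BW for all three removes the limits again */
	ice_sched_set_vsi_bw_shared_lmt(pi, vsi_handle, ICE_SCHED_DFLT_BW,
					ICE_SCHED_DFLT_BW, ICE_SCHED_DFLT_BW);
}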
+ */ +static enum ice_status +ice_sched_validate_agg_srl_node(struct ice_port_info *pi, u32 agg_id) +{ + u8 sel_layer = ICE_SCHED_INVAL_LAYER_NUM; + struct ice_sched_agg_info *agg_info; + bool agg_id_present = false; + enum ice_status status = 0; + u8 tc; + + list_for_each_entry(agg_info, &pi->hw->agg_list, list_entry) + if (agg_info->agg_id == agg_id) { + agg_id_present = true; + break; + } + if (!agg_id_present) + return ICE_ERR_PARAM; + /* Return success if no nodes are present across TC */ + ice_for_each_traffic_class(tc) { + struct ice_sched_node *tc_node, *agg_node; + enum ice_rl_type rl_type = ICE_SHARED_BW; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + continue; + + agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id); + if (!agg_node) + continue; + /* SRL bandwidth layer selection */ + if (sel_layer == ICE_SCHED_INVAL_LAYER_NUM) { + u8 node_layer = agg_node->tx_sched_layer; + u8 layer_num; + + layer_num = ice_sched_get_rl_prof_layer(pi, rl_type, + node_layer); + if (layer_num >= pi->hw->num_tx_sched_layers) + return ICE_ERR_PARAM; + sel_layer = layer_num; + } + + status = ice_sched_validate_srl_node(agg_node, sel_layer); + if (status) + break; + } + return status; +} + +/** + * ice_sched_validate_agg_id - Validate aggregator id + * @pi: port information structure + * @agg_id: aggregator ID + * + * This function validates aggregator id. Caller holds the scheduler lock. + */ +static enum ice_status +ice_sched_validate_agg_id(struct ice_port_info *pi, u32 agg_id) +{ + struct ice_sched_agg_info *agg_info; + struct ice_sched_agg_info *tmp; + bool agg_id_present = false; + enum ice_status status; + + status = ice_sched_validate_agg_srl_node(pi, agg_id); + if (status) + return status; + + list_for_each_entry_safe(agg_info, tmp, &pi->hw->agg_list, list_entry) + if (agg_info->agg_id == agg_id) { + agg_id_present = true; + break; + } + + if (!agg_id_present) + return ICE_ERR_PARAM; + + return 0; +} + +/** + * ice_sched_set_save_agg_srl_node_bw - set aggregator shared limit values + * @pi: port information structure + * @agg_id: aggregator ID + * @tc: traffic class + * @srl_node: sched node to configure + * @rl_type: rate limit type minimum, maximum, or shared + * @bw: minimum, maximum, or shared bandwidth in Kbps + * + * Configure shared rate limiter(SRL) of aggregator type nodes across + * requested traffic class, and saves those value for later use for + * replaying purposes. The caller holds the scheduler lock. + */ +static enum ice_status +ice_sched_set_save_agg_srl_node_bw(struct ice_port_info *pi, u32 agg_id, u8 tc, + struct ice_sched_node *srl_node, + enum ice_rl_type rl_type, u32 bw) +{ + enum ice_status status; + + if (bw == ICE_SCHED_DFLT_BW) { + status = ice_sched_set_node_bw_dflt_lmt(pi, srl_node, rl_type); + } else { + status = ice_sched_set_node_bw_lmt(pi, srl_node, rl_type, bw); + if (status) + return status; + status = ice_sched_save_agg_bw(pi, agg_id, tc, rl_type, bw); + } + return status; +} + +/** + * ice_sched_set_agg_node_srl_per_tc - set aggregator SRL per tc + * @pi: port information structure + * @agg_id: aggregator ID + * @tc: traffic class + * @min_bw: minimum bandwidth in Kbps + * @max_bw: maximum bandwidth in Kbps + * @shared_bw: shared bandwidth in Kbps + * + * This function configures the shared rate limiter(SRL) of aggregator type + * node for a given traffic class for aggregator matching agg_id. When BW + * value of ICE_SCHED_DFLT_BW is passed, it removes SRL from the node. Caller + * holds the scheduler lock. 
+ */ +static enum ice_status +ice_sched_set_agg_node_srl_per_tc(struct ice_port_info *pi, u32 agg_id, + u8 tc, u32 min_bw, u32 max_bw, u32 shared_bw) +{ + struct ice_sched_node *tc_node, *agg_node, *cfg_node; + enum ice_rl_type rl_type = ICE_SHARED_BW; + enum ice_status status = ICE_ERR_CFG; + u8 layer_num; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + return ICE_ERR_CFG; + + agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id); + if (!agg_node) + return ICE_ERR_CFG; + + layer_num = ice_sched_get_rl_prof_layer(pi, rl_type, + agg_node->tx_sched_layer); + if (layer_num >= pi->hw->num_tx_sched_layers) + return ICE_ERR_PARAM; + + /* SRL node may be different */ + cfg_node = ice_sched_get_srl_node(agg_node, layer_num); + if (!cfg_node) + return ICE_ERR_CFG; + + status = ice_sched_set_save_agg_srl_node_bw(pi, agg_id, tc, cfg_node, + ICE_MIN_BW, min_bw); + if (status) + return status; + + status = ice_sched_set_save_agg_srl_node_bw(pi, agg_id, tc, cfg_node, + ICE_MAX_BW, max_bw); + if (status) + return status; + + status = ice_sched_set_save_agg_srl_node_bw(pi, agg_id, tc, cfg_node, + ICE_SHARED_BW, shared_bw); + return status; +} + +/** + * ice_sched_set_agg_bw_shared_lmt - set aggregator BW shared limit + * @pi: port information structure + * @agg_id: aggregator ID + * @min_bw: minimum bandwidth in Kbps + * @max_bw: maximum bandwidth in Kbps + * @shared_bw: shared bandwidth in Kbps + * + * This function configures the shared rate limiter(SRL) of all aggregator type + * nodes across all traffic classes for aggregator matching agg_id. When + * BW value of ICE_SCHED_DFLT_BW is passed, it removes SRL from the + * node(s). + */ +enum ice_status +ice_sched_set_agg_bw_shared_lmt(struct ice_port_info *pi, u32 agg_id, + u32 min_bw, u32 max_bw, u32 shared_bw) +{ + enum ice_status status; + u8 tc; + + if (!pi) + return ICE_ERR_PARAM; + + mutex_lock(&pi->sched_lock); + status = ice_sched_validate_agg_id(pi, agg_id); + if (status) + goto exit_agg_bw_shared_lmt; + + /* Return success if no nodes are present across TC */ + ice_for_each_traffic_class(tc) { + struct ice_sched_node *tc_node, *agg_node; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + continue; + + agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id); + if (!agg_node) + continue; + + status = ice_sched_set_agg_node_srl_per_tc(pi, agg_id, tc, + min_bw, max_bw, + shared_bw); + if (status) + break; + } + +exit_agg_bw_shared_lmt: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_sched_set_agg_bw_shared_lmt_per_tc - set aggregator BW shared lmt per tc + * @pi: port information structure + * @agg_id: aggregator ID + * @tc: traffic class + * @min_bw: minimum bandwidth in Kbps + * @max_bw: maximum bandwidth in Kbps + * @shared_bw: shared bandwidth in Kbps + * + * This function configures the shared rate limiter(SRL) of aggregator type + * node for a given traffic class for aggregator matching agg_id. When BW + * value of ICE_SCHED_DFLT_BW is passed, it removes SRL from the node. 
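The aggregator variants mirror the VSI ones. A sketch with a placeholder aggregator ID, using the all-TC call first and then the per-TC form to clear TC 1 only:

static void example_agg_shared_limit(struct ice_port_info *pi, u32 agg_id)
{
	/* 50 Mbps min, 2 Gbps max, 1 Gbps shared across all TCs */
	ice_sched_set_agg_bw_shared_lmt(pi, agg_id, 50000, 2000000, 1000000);

	/* Remove the shared limit from TC 1 only */
	ice_sched_set_agg_bw_shared_lmt_per_tc(pi, agg_id, 1,
					       ICE_SCHED_DFLT_BW,
					       ICE_SCHED_DFLT_BW,
					       ICE_SCHED_DFLT_BW);
}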
+ */ +enum ice_status +ice_sched_set_agg_bw_shared_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, + u8 tc, u32 min_bw, u32 max_bw, + u32 shared_bw) +{ + enum ice_status status; + + if (!pi) + return ICE_ERR_PARAM; + mutex_lock(&pi->sched_lock); + status = ice_sched_validate_agg_id(pi, agg_id); + if (status) + goto exit_agg_bw_shared_lmt_per_tc; + + status = ice_sched_set_agg_node_srl_per_tc(pi, agg_id, tc, min_bw, + max_bw, shared_bw); + +exit_agg_bw_shared_lmt_per_tc: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_sched_cfg_sibl_node_prio - configure node sibling priority + * @pi: port information structure + * @node: sched node to configure + * @priority: sibling priority + * + * This function configures node element's sibling priority only. This + * function needs to be called with scheduler lock held. + */ +enum ice_status +ice_sched_cfg_sibl_node_prio(struct ice_port_info *pi, + struct ice_sched_node *node, u8 priority) +{ + struct ice_aqc_txsched_elem_data buf; + struct ice_aqc_txsched_elem *data; + struct ice_hw *hw = pi->hw; + enum ice_status status; + + if (!hw) + return ICE_ERR_PARAM; + buf = node->info; + data = &buf.data; + data->valid_sections |= ICE_AQC_ELEM_VALID_GENERIC; + priority = (priority << ICE_AQC_ELEM_GENERIC_PRIO_S) & + ICE_AQC_ELEM_GENERIC_PRIO_M; + data->generic &= ~ICE_AQC_ELEM_GENERIC_PRIO_M; + data->generic |= priority; + + /* Configure element */ + status = ice_sched_update_elem(hw, node, &buf); + return status; +} + +/** + * ice_cfg_rl_burst_size - Set burst size value + * @hw: pointer to the HW struct + * @bytes: burst size in bytes + * + * This function configures/set the burst size to requested new value. The new + * burst size value is used for future rate limit calls. It doesn't change the + * existing or previously created RL profiles. + */ +enum ice_status ice_cfg_rl_burst_size(struct ice_hw *hw, u32 bytes) +{ + u16 burst_size_to_prog; + + if (bytes < ICE_MIN_BURST_SIZE_ALLOWED || + bytes > ICE_MAX_BURST_SIZE_ALLOWED) + return ICE_ERR_PARAM; + if (ice_round_to_num(bytes, 64) <= + ICE_MAX_BURST_SIZE_64_BYTE_GRANULARITY) { + /* 64 byte granularity case */ + /* Disable MSB granularity bit */ + burst_size_to_prog = ICE_64_BYTE_GRANULARITY; + /* round number to nearest 64 byte granularity */ + bytes = ice_round_to_num(bytes, 64); + /* The value is in 64 byte chunks */ + burst_size_to_prog |= (u16)(bytes / 64); + } else { + /* k bytes granularity case */ + /* Enable MSB granularity bit */ + burst_size_to_prog = ICE_KBYTE_GRANULARITY; + /* round number to nearest 1024 granularity */ + bytes = ice_round_to_num(bytes, 1024); + /* check rounding doesn't go beyond allowed */ + if (bytes > ICE_MAX_BURST_SIZE_KBYTE_GRANULARITY) + bytes = ICE_MAX_BURST_SIZE_KBYTE_GRANULARITY; + /* The value is in k bytes */ + burst_size_to_prog |= (u16)(bytes / 1024); + } + hw->max_burst_size = burst_size_to_prog; + return 0; +} + +/** + * ice_sched_replay_node_prio - re-configure node priority + * @hw: pointer to the HW struct + * @node: sched node to configure + * @priority: priority value + * + * This function configures node element's priority value. It + * needs to be called with scheduler lock held. 
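Two worked values may make the ice_cfg_rl_burst_size() encoding above easier to follow; the register holds a granularity flag in its MSB and a chunk count in the low bits, per the ICE_*_GRANULARITY definitions added to ice_sched.h later in this patch:

static void example_burst_size(struct ice_hw *hw)
{
	/* 4096 rounds to 4096 and fits 64-byte granularity:
	 * max_burst_size = 4096 / 64 = 64 (0x040, MSB clear).
	 */
	ice_cfg_rl_burst_size(hw, 4096);

	/* 1048576 exceeds (BIT(11) - 1) * 64 = 131008, so the 1K
	 * granularity bit is set: 1048576 / 1024 = 1024, i.e.
	 * max_burst_size = BIT(11) | 1024 = 0xC00.
	 */
	ice_cfg_rl_burst_size(hw, 1048576);
}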
+ */ +static enum ice_status +ice_sched_replay_node_prio(struct ice_hw *hw, struct ice_sched_node *node, + u8 priority) +{ + struct ice_aqc_txsched_elem_data buf; + struct ice_aqc_txsched_elem *data; + enum ice_status status; + + buf = node->info; + data = &buf.data; + data->valid_sections |= ICE_AQC_ELEM_VALID_GENERIC; + data->generic = priority; + + /* Configure element */ + status = ice_sched_update_elem(hw, node, &buf); + return status; +} + +/** + * ice_sched_replay_node_bw - replay node(s) BW + * @hw: pointer to the HW struct + * @node: sched node to configure + * @bw_t_info: BW type information + * + * This function restores node's BW from bw_t_info. The caller needs + * to hold the scheduler lock. + */ +static enum ice_status +ice_sched_replay_node_bw(struct ice_hw *hw, struct ice_sched_node *node, + struct ice_bw_type_info *bw_t_info) +{ + struct ice_port_info *pi = hw->port_info; + enum ice_status status = ICE_ERR_PARAM; + u16 bw_alloc; + + if (!node) + return status; + if (bitmap_empty(bw_t_info->bw_t_bitmap, ICE_BW_TYPE_CNT)) + return 0; + if (test_bit(ICE_BW_TYPE_PRIO, bw_t_info->bw_t_bitmap)) { + status = ice_sched_replay_node_prio(hw, node, + bw_t_info->generic); + if (status) + return status; + } + if (test_bit(ICE_BW_TYPE_CIR, bw_t_info->bw_t_bitmap)) { + status = ice_sched_set_node_bw_lmt(pi, node, ICE_MIN_BW, + bw_t_info->cir_bw.bw); + if (status) + return status; + } + if (test_bit(ICE_BW_TYPE_CIR_WT, bw_t_info->bw_t_bitmap)) { + bw_alloc = bw_t_info->cir_bw.bw_alloc; + status = ice_sched_cfg_node_bw_alloc(hw, node, ICE_MIN_BW, + bw_alloc); + if (status) + return status; + } + if (test_bit(ICE_BW_TYPE_EIR, bw_t_info->bw_t_bitmap)) { + status = ice_sched_set_node_bw_lmt(pi, node, ICE_MAX_BW, + bw_t_info->eir_bw.bw); + if (status) + return status; + } + if (test_bit(ICE_BW_TYPE_EIR_WT, bw_t_info->bw_t_bitmap)) { + bw_alloc = bw_t_info->eir_bw.bw_alloc; + status = ice_sched_cfg_node_bw_alloc(hw, node, ICE_MAX_BW, + bw_alloc); + if (status) + return status; + } + if (test_bit(ICE_BW_TYPE_SHARED, bw_t_info->bw_t_bitmap)) + status = ice_sched_set_node_bw_lmt(pi, node, ICE_SHARED_BW, + bw_t_info->shared_bw); + return status; +} + +/** + * ice_sched_replay_agg_bw - replay aggregator node(s) BW + * @hw: pointer to the HW struct + * @agg_info: aggregator data structure + * + * This function re-creates aggregator type nodes. The caller needs to hold + * the scheduler lock. + */ +static enum ice_status +ice_sched_replay_agg_bw(struct ice_hw *hw, struct ice_sched_agg_info *agg_info) +{ + struct ice_sched_node *tc_node, *agg_node; + enum ice_status status = 0; + u8 tc; + + if (!agg_info) + return ICE_ERR_PARAM; + ice_for_each_traffic_class(tc) { + if (bitmap_empty(agg_info->bw_t_info[tc].bw_t_bitmap, ICE_BW_TYPE_CNT)) + continue; + tc_node = ice_sched_get_tc_node(hw->port_info, tc); + if (!tc_node) { + status = ICE_ERR_PARAM; + break; + } + agg_node = ice_sched_get_agg_node(hw->port_info, tc_node, + agg_info->agg_id); + if (!agg_node) { + status = ICE_ERR_PARAM; + break; + } + status = ice_sched_replay_node_bw(hw, agg_node, + &agg_info->bw_t_info[tc]); + if (status) + break; + } + return status; +} + +/** + * ice_sched_get_ena_tc_bitmap - get enabled TC bitmap + * @pi: port info struct + * @tc_bitmap: 8 bits TC bitmap to check + * @ena_tc_bitmap: 8 bits enabled TC bitmap to return + * + * This function returns enabled TC bitmap in variable ena_tc_bitmap. Some TCs + * may be missing, it returns enabled TCs. This function needs to be called with + * scheduler lock held. 
+ */ +static void +ice_sched_get_ena_tc_bitmap(struct ice_port_info *pi, + unsigned long *tc_bitmap, + unsigned long *ena_tc_bitmap) +{ + u8 tc; + + /* Some TC(s) may be missing after reset, adjust for replay */ + ice_for_each_traffic_class(tc) + if (ice_is_tc_ena(*tc_bitmap, tc) && + (ice_sched_get_tc_node(pi, tc))) + set_bit(tc, ena_tc_bitmap); +} + +/** + * ice_sched_replay_agg - recreate aggregator node(s) + * @hw: pointer to the HW struct + * + * This function recreate aggregator type nodes which are not replayed earlier. + * It also replay aggregator BW information. These aggregator nodes are not + * associated with VSI type node yet. + */ +void ice_sched_replay_agg(struct ice_hw *hw) +{ + struct ice_port_info *pi = hw->port_info; + struct ice_sched_agg_info *agg_info; + + mutex_lock(&pi->sched_lock); + list_for_each_entry(agg_info, &hw->agg_list, list_entry) + /* replay aggregator (re-create aggregator node) */ + if (!bitmap_equal(agg_info->tc_bitmap, agg_info->replay_tc_bitmap, ICE_MAX_TRAFFIC_CLASS)) { + DECLARE_BITMAP(replay_bitmap, + ICE_MAX_TRAFFIC_CLASS); + enum ice_status status; + + bitmap_zero(replay_bitmap, ICE_MAX_TRAFFIC_CLASS); + ice_sched_get_ena_tc_bitmap(pi, + agg_info->replay_tc_bitmap, + replay_bitmap); + status = ice_sched_cfg_agg(hw->port_info, + agg_info->agg_id, + ICE_AGG_TYPE_AGG, + replay_bitmap); + if (status) { + dev_info(ice_hw_to_dev(hw), + "Replay agg id[%d] failed\n", + agg_info->agg_id); + /* Move on to next one */ + continue; + } + /* Replay aggregator node BW (restore aggregator BW) */ + status = ice_sched_replay_agg_bw(hw, agg_info); + if (status) + dev_info(ice_hw_to_dev(hw), + "Replay agg bw [id=%d] failed\n", + agg_info->agg_id); + } + mutex_unlock(&pi->sched_lock); +} + +/** + * ice_sched_replay_agg_vsi_preinit - Agg/VSI replay pre initialization + * @hw: pointer to the HW struct + * + * This function initialize aggregator(s) TC bitmap to zero. A required + * preinit step for replaying aggregators. + */ +void ice_sched_replay_agg_vsi_preinit(struct ice_hw *hw) +{ + struct ice_port_info *pi = hw->port_info; + struct ice_sched_agg_info *agg_info; + + mutex_lock(&pi->sched_lock); + list_for_each_entry(agg_info, &hw->agg_list, list_entry) { + struct ice_sched_agg_vsi_info *agg_vsi_info; + + agg_info->tc_bitmap[0] = 0; + list_for_each_entry(agg_vsi_info, &agg_info->agg_vsi_list, + list_entry) + agg_vsi_info->tc_bitmap[0] = 0; + } + mutex_unlock(&pi->sched_lock); +} + +/** + * ice_sched_replay_root_node_bw - replay root node BW + * @pi: port information structure + * + * Replay root node BW settings. + */ +enum ice_status ice_sched_replay_root_node_bw(struct ice_port_info *pi) +{ + enum ice_status status = 0; + + if (!pi->hw) + return ICE_ERR_PARAM; + mutex_lock(&pi->sched_lock); + + status = ice_sched_replay_node_bw(pi->hw, pi->root, + &pi->root_node_bw_t_info); + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_sched_replay_tc_node_bw - replay TC node(s) BW + * @pi: port information structure + * + * This function replay TC nodes. 
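Taken together with ice_replay_vsi_agg() further below, the comments above suggest a post-reset ordering along the following lines. This is only an orientation sketch under that reading; the real call sites are elsewhere in the driver, and iterating over all VSI handles is elided:

static void example_sched_replay(struct ice_hw *hw, u16 vsi_handle)
{
	/* Zero the aggregator TC bitmaps before any replay */
	ice_sched_replay_agg_vsi_preinit(hw);

	/* Re-associate the VSI with its aggregator and restore its BW */
	ice_replay_vsi_agg(hw, vsi_handle);

	/* Recreate aggregators that no VSI replay touched */
	ice_sched_replay_agg(hw);

	/* Finally restore root and TC node BW settings */
	ice_sched_replay_root_node_bw(hw->port_info);
	ice_sched_replay_tc_node_bw(hw->port_info);
}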
+ */ +enum ice_status ice_sched_replay_tc_node_bw(struct ice_port_info *pi) +{ + enum ice_status status = 0; + u8 tc; + + if (!pi->hw) + return ICE_ERR_PARAM; + mutex_lock(&pi->sched_lock); + ice_for_each_traffic_class(tc) { + struct ice_sched_node *tc_node; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + continue; /* TC not present */ + status = ice_sched_replay_node_bw(pi->hw, tc_node, + &pi->tc_node_bw_t_info[tc]); + if (status) + break; + } + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_sched_replay_vsi_bw - replay VSI type node(s) BW + * @hw: pointer to the HW struct + * @vsi_handle: software VSI handle + * @tc_bitmap: 8 bits TC bitmap + * + * This function replays VSI type nodes bandwidth. This function needs to be + * called with scheduler lock held. + */ +static enum ice_status +ice_sched_replay_vsi_bw(struct ice_hw *hw, u16 vsi_handle, + unsigned long *tc_bitmap) +{ + struct ice_sched_node *vsi_node, *tc_node; + struct ice_port_info *pi = hw->port_info; + struct ice_bw_type_info *bw_t_info; + struct ice_vsi_ctx *vsi_ctx; + enum ice_status status = 0; + u8 tc; + + vsi_ctx = ice_get_vsi_ctx(pi->hw, vsi_handle); + if (!vsi_ctx) + return ICE_ERR_PARAM; + ice_for_each_traffic_class(tc) { + if (!ice_is_tc_ena(*tc_bitmap, tc)) + continue; + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + continue; + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); + if (!vsi_node) + continue; + bw_t_info = &vsi_ctx->sched.bw_t_info[tc]; + status = ice_sched_replay_node_bw(hw, vsi_node, bw_t_info); + if (status) + break; + } + return status; +} + +/** + * ice_sched_replay_vsi_agg - replay aggregator & VSI to aggregator node(s) + * @hw: pointer to the HW struct + * @vsi_handle: software VSI handle + * + * This function replays aggregator node, VSI to aggregator type nodes, and + * their node bandwidth information. This function needs to be called with + * scheduler lock held. 
+ */ +static enum ice_status +ice_sched_replay_vsi_agg(struct ice_hw *hw, u16 vsi_handle) +{ + DECLARE_BITMAP(replay_bitmap, ICE_MAX_TRAFFIC_CLASS); + struct ice_sched_agg_vsi_info *agg_vsi_info; + struct ice_port_info *pi = hw->port_info; + struct ice_sched_agg_info *agg_info; + enum ice_status status; + + bitmap_zero(replay_bitmap, ICE_MAX_TRAFFIC_CLASS); + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + agg_info = ice_get_vsi_agg_info(hw, vsi_handle); + if (!agg_info) + return 0; /* Not present in list - default Agg case */ + agg_vsi_info = ice_get_agg_vsi_info(agg_info, vsi_handle); + if (!agg_vsi_info) + return 0; /* Not present in list - default Agg case */ + ice_sched_get_ena_tc_bitmap(pi, agg_info->replay_tc_bitmap, + replay_bitmap); + /* Replay aggregator node associated to vsi_handle */ + status = ice_sched_cfg_agg(hw->port_info, agg_info->agg_id, + ICE_AGG_TYPE_AGG, replay_bitmap); + if (status) + return status; + /* Replay aggregator node BW (restore aggregator BW) */ + status = ice_sched_replay_agg_bw(hw, agg_info); + if (status) + return status; + + bitmap_zero(replay_bitmap, ICE_MAX_TRAFFIC_CLASS); + ice_sched_get_ena_tc_bitmap(pi, agg_vsi_info->replay_tc_bitmap, + replay_bitmap); + /* Move this VSI (vsi_handle) to above aggregator */ + status = ice_sched_assoc_vsi_to_agg(pi, agg_info->agg_id, vsi_handle, + replay_bitmap); + if (status) + return status; + /* Replay VSI BW (restore VSI BW) */ + return ice_sched_replay_vsi_bw(hw, vsi_handle, + agg_vsi_info->tc_bitmap); +} + +/** + * ice_replay_vsi_agg - replay VSI to aggregator node + * @hw: pointer to the HW struct + * @vsi_handle: software VSI handle + * + * This function replays association of VSI to aggregator type nodes, and + * node bandwidth information. + */ +enum ice_status ice_replay_vsi_agg(struct ice_hw *hw, u16 vsi_handle) +{ + struct ice_port_info *pi = hw->port_info; + enum ice_status status; + + mutex_lock(&pi->sched_lock); + status = ice_sched_replay_vsi_agg(hw, vsi_handle); + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_sched_replay_q_bw - replay queue type node BW + * @pi: port information structure + * @q_ctx: queue context structure + * + * This function replays queue type node bandwidth. This function needs to be + * called with scheduler lock held. + */ +enum ice_status +ice_sched_replay_q_bw(struct ice_port_info *pi, struct ice_q_ctx *q_ctx) +{ + struct ice_sched_node *q_node; + + /* Following also checks the presence of node in tree */ + q_node = ice_sched_find_node_by_teid(pi->root, q_ctx->q_teid); + if (!q_node) + return ICE_ERR_PARAM; + return ice_sched_replay_node_bw(pi->hw, q_node, &q_ctx->bw_t_info); +} diff --git a/drivers/net/ethernet/intel/ice/ice_sched.h b/drivers/net/ethernet/intel/ice/ice_sched.h index 3902a8ad3025..48642a54f8a9 100644 --- a/drivers/net/ethernet/intel/ice/ice_sched.h +++ b/drivers/net/ethernet/intel/ice/ice_sched.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_SCHED_H_ #define _ICE_SCHED_H_ @@ -8,11 +8,56 @@ #define ICE_QGRP_LAYER_OFFSET 2 #define ICE_VSI_LAYER_OFFSET 4 +#define ICE_AGG_LAYER_OFFSET 6 +#define ICE_SCHED_INVAL_LAYER_NUM 0xFF +/* Burst size is a 12 bits register that is configured while creating the RL + * profile(s). 
MSB is a granularity bit and tells the granularity type + * 0 - LSB bits are in 64 bytes granularity + * 1 - LSB bits are in 1K bytes granularity + */ +#define ICE_64_BYTE_GRANULARITY 0 +#define ICE_KBYTE_GRANULARITY BIT(11) +#define ICE_MIN_BURST_SIZE_ALLOWED 64 /* In Bytes */ +#define ICE_MAX_BURST_SIZE_ALLOWED \ + ((BIT(11) - 1) * 1024) /* In Bytes */ +#define ICE_MAX_BURST_SIZE_64_BYTE_GRANULARITY \ + ((BIT(11) - 1) * 64) /* In Bytes */ +#define ICE_MAX_BURST_SIZE_KBYTE_GRANULARITY ICE_MAX_BURST_SIZE_ALLOWED + +#define ICE_RL_PROF_ACCURACY_BYTES 128 +#define ICE_RL_PROF_MULTIPLIER 10000 +#define ICE_RL_PROF_TS_MULTIPLIER 32 +#define ICE_RL_PROF_FRACTION 512 + +#define ICE_PSM_CLK_367MHZ_IN_HZ 367647059 +#define ICE_PSM_CLK_416MHZ_IN_HZ 416666667 +#define ICE_PSM_CLK_446MHZ_IN_HZ 446428571 +#define ICE_PSM_CLK_390MHZ_IN_HZ 390625000 + + +struct rl_profile_params { + u32 bw; /* in Kbps */ + u16 rl_multiplier; + u16 wake_up_calc; + u16 rl_encode; +}; + +/* BW rate limit profile parameters list entry along + * with bandwidth maintained per layer in port info + */ +struct ice_aqc_rl_profile_info { + struct ice_aqc_rl_profile_elem profile; + struct list_head list_entry; + u32 bw; /* requested */ + u16 prof_id_ref; /* profile ID to node association ref count */ +}; struct ice_sched_agg_vsi_info { struct list_head list_entry; DECLARE_BITMAP(tc_bitmap, ICE_MAX_TRAFFIC_CLASS); u16 vsi_handle; + /* save aggregator VSI TC bitmap */ + DECLARE_BITMAP(replay_tc_bitmap, ICE_MAX_TRAFFIC_CLASS); }; struct ice_sched_agg_info { @@ -21,21 +66,43 @@ struct ice_sched_agg_info { DECLARE_BITMAP(tc_bitmap, ICE_MAX_TRAFFIC_CLASS); u32 agg_id; enum ice_agg_type agg_type; + /* bw_t_info saves aggregator BW information */ + struct ice_bw_type_info bw_t_info[ICE_MAX_TRAFFIC_CLASS]; + /* save aggregator TC bitmap */ + DECLARE_BITMAP(replay_tc_bitmap, ICE_MAX_TRAFFIC_CLASS); }; /* FW AQ command calls */ enum ice_status +ice_aq_get_dflt_topo(struct ice_hw *hw, u8 lport, + struct ice_aqc_get_topo_elem *buf, u16 buf_size, + u8 *num_branches, struct ice_sq_cd *cd); +enum ice_status +ice_aq_query_rl_profile(struct ice_hw *hw, u16 num_profiles, + struct ice_aqc_rl_profile_elem *buf, u16 buf_size, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_cfg_l2_node_cgd(struct ice_hw *hw, u16 num_nodes, + struct ice_aqc_cfg_l2_node_cgd_elem *buf, u16 buf_size, + struct ice_sq_cd *cd); +enum ice_status ice_aq_query_sched_elems(struct ice_hw *hw, u16 elems_req, - struct ice_aqc_get_elem *buf, u16 buf_size, + struct ice_aqc_txsched_elem_data *buf, u16 buf_size, u16 *elems_ret, struct ice_sq_cd *cd); enum ice_status ice_sched_init_port(struct ice_port_info *pi); enum ice_status ice_sched_query_res_alloc(struct ice_hw *hw); +void ice_sched_get_psm_clk_freq(struct ice_hw *hw); + +/* Functions to cleanup scheduler SW DB */ void ice_sched_clear_port(struct ice_port_info *pi); void ice_sched_cleanup_all(struct ice_hw *hw); void ice_sched_clear_agg(struct ice_hw *hw); +/* Get a scheduling node from SW DB for given TEID */ +struct ice_sched_node *ice_sched_get_node(struct ice_port_info *pi, u32 teid); struct ice_sched_node * ice_sched_find_node_by_teid(struct ice_sched_node *start_node, u32 teid); +/* Add a scheduling node into SW DB for given info */ enum ice_status ice_sched_add_node(struct ice_port_info *pi, u8 layer, struct ice_aqc_txsched_elem_data *info); @@ -48,4 +115,110 @@ enum ice_status ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 maxqs, u8 owner, bool enable); enum ice_status ice_rm_vsi_lan_cfg(struct 
ice_port_info *pi, u16 vsi_handle); +enum ice_status +ice_rm_vsi_rdma_cfg(struct ice_port_info *pi, u16 vsi_handle); +struct ice_sched_node * +ice_sched_get_vsi_node(struct ice_port_info *pi, struct ice_sched_node *tc_node, + u16 vsi_handle); +bool ice_sched_is_tree_balanced(struct ice_hw *hw, struct ice_sched_node *node); +enum ice_status +ice_aq_query_node_to_root(struct ice_hw *hw, u32 node_teid, + struct ice_aqc_txsched_elem_data *buf, u16 buf_size, + struct ice_sq_cd *cd); + +/* Tx scheduler rate limiter functions */ +enum ice_status +ice_cfg_agg(struct ice_port_info *pi, u32 agg_id, + enum ice_agg_type agg_type, u8 tc_bitmap); +enum ice_status +ice_move_vsi_to_agg(struct ice_port_info *pi, u32 agg_id, u16 vsi_handle, + u8 tc_bitmap); +enum ice_status ice_rm_agg_cfg(struct ice_port_info *pi, u32 agg_id); +enum ice_status +ice_cfg_q_bw_lmt(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + u16 q_handle, enum ice_rl_type rl_type, u32 bw); +enum ice_status +ice_cfg_q_bw_dflt_lmt(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + u16 q_handle, enum ice_rl_type rl_type); +enum ice_status +ice_cfg_tc_node_bw_lmt(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type, u32 bw); +enum ice_status +ice_cfg_tc_node_bw_dflt_lmt(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type); +enum ice_status +ice_cfg_vsi_bw_lmt_per_tc(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + enum ice_rl_type rl_type, u32 bw); +enum ice_status +ice_cfg_vsi_bw_dflt_lmt_per_tc(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + enum ice_rl_type rl_type); +enum ice_status +ice_cfg_agg_bw_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, u8 tc, + enum ice_rl_type rl_type, u32 bw); +enum ice_status +ice_cfg_agg_bw_dflt_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, u8 tc, + enum ice_rl_type rl_type); +enum ice_status +ice_cfg_vsi_bw_shared_lmt(struct ice_port_info *pi, u16 vsi_handle, u32 min_bw, + u32 max_bw, u32 shared_bw); +enum ice_status +ice_cfg_vsi_bw_no_shared_lmt(struct ice_port_info *pi, u16 vsi_handle); +enum ice_status +ice_cfg_agg_bw_shared_lmt(struct ice_port_info *pi, u32 agg_id, u32 min_bw, + u32 max_bw, u32 shared_bw); +enum ice_status +ice_cfg_agg_bw_no_shared_lmt(struct ice_port_info *pi, u32 agg_id); +enum ice_status +ice_cfg_agg_bw_shared_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, u8 tc, + u32 min_bw, u32 max_bw, u32 shared_bw); +enum ice_status +ice_cfg_agg_bw_no_shared_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, + u8 tc); +enum ice_status +ice_cfg_vsi_q_priority(struct ice_port_info *pi, u16 num_qs, u32 *q_ids, + u8 *q_prio); +enum ice_status +ice_cfg_vsi_bw_alloc(struct ice_port_info *pi, u16 vsi_handle, u8 ena_tcmap, + enum ice_rl_type rl_type, u8 *bw_alloc); +enum ice_status +ice_cfg_agg_vsi_priority_per_tc(struct ice_port_info *pi, u32 agg_id, + u16 num_vsis, u16 *vsi_handle_arr, + u8 *node_prio, u8 tc); +enum ice_status +ice_cfg_agg_bw_alloc(struct ice_port_info *pi, u32 agg_id, u8 ena_tcmap, + enum ice_rl_type rl_type, u8 *bw_alloc); +bool +ice_sched_find_node_in_subtree(struct ice_hw *hw, struct ice_sched_node *base, + struct ice_sched_node *node); +enum ice_status +ice_sched_set_agg_bw_dflt_lmt(struct ice_port_info *pi, u16 vsi_handle); +enum ice_status +ice_sched_set_node_bw_lmt_per_tc(struct ice_port_info *pi, u32 id, + enum ice_agg_type agg_type, u8 tc, + enum ice_rl_type rl_type, u32 bw); +enum ice_status +ice_sched_set_vsi_bw_shared_lmt(struct ice_port_info *pi, u16 vsi_handle, + u32 min_bw, u32 max_bw, u32 shared_bw); +enum ice_status 
+ice_sched_set_agg_bw_shared_lmt(struct ice_port_info *pi, u32 agg_id, u32 min_bw, + u32 max_bw, u32 shared_bw); +enum ice_status +ice_sched_set_agg_bw_shared_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, + u8 tc, u32 min_bw, u32 max_bw, + u32 shared_bw); +enum ice_status +ice_sched_cfg_sibl_node_prio(struct ice_port_info *pi, + struct ice_sched_node *node, u8 priority); +enum ice_status +ice_cfg_tc_node_bw_alloc(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type, u8 bw_alloc); +enum ice_status ice_cfg_rl_burst_size(struct ice_hw *hw, u32 bytes); +void ice_sched_replay_agg_vsi_preinit(struct ice_hw *hw); +void ice_sched_replay_agg(struct ice_hw *hw); +enum ice_status ice_sched_replay_tc_node_bw(struct ice_port_info *pi); +enum ice_status ice_replay_vsi_agg(struct ice_hw *hw, u16 vsi_handle); +enum ice_status ice_sched_replay_root_node_bw(struct ice_port_info *pi); +enum ice_status +ice_sched_replay_q_bw(struct ice_port_info *pi, struct ice_q_ctx *q_ctx); + #endif /* _ICE_SCHED_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c b/drivers/net/ethernet/intel/ice/ice_sriov.c index d2db0d04e117..a66f896b5a28 100644 --- a/drivers/net/ethernet/intel/ice/ice_sriov.c +++ b/drivers/net/ethernet/intel/ice/ice_sriov.c @@ -1,8 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #include "ice_common.h" -#include "ice_adminq_cmd.h" #include "ice_sriov.h" /** @@ -40,6 +39,7 @@ ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval, return ice_sq_send_cmd(hw, &hw->mailboxq, &desc, msg, msglen, cd); } + /** * ice_conv_link_speed_to_virtchnl * @adv_link_support: determines the format of the returned link speed @@ -121,9 +121,7 @@ u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed) speed = (u32)VIRTCHNL_LINK_SPEED_25GB; break; case ICE_AQ_LINK_SPEED_40GB: - /* fall through */ case ICE_AQ_LINK_SPEED_50GB: - /* fall through */ case ICE_AQ_LINK_SPEED_100GB: speed = (u32)VIRTCHNL_LINK_SPEED_40GB; break; @@ -134,3 +132,404 @@ u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed) return speed; } + +/* The mailbox overflow detection algorithm helps to check if there + * is a possibility of a malicious VF transmitting too many MBX messages to the + * PF. + * 1. The mailbox snapshot structure, ice_mbx_snapshot, is initialized during + * driver initialization in ice_init_hw() using ice_mbx_init_snapshot(). + * The struct ice_mbx_snapshot helps to track and traverse a static window of + * messages within the mailbox queue while looking for a malicious VF. + * + * 2. When the caller starts processing its mailbox queue in response to an + * interrupt, the structure ice_mbx_snapshot is expected to be cleared before + * the algorithm can be run for the first time for that interrupt. This can be + * done via ice_mbx_reset_snapshot(). + * + * 3. For every message read by the caller from the MBX Queue, the caller must + * call the detection algorithm's entry function ice_mbx_vf_state_handler(). + * Before every call to ice_mbx_vf_state_handler() the struct ice_mbx_data is + * filled as it is required to be passed to the algorithm. + * + * 4. Every time a message is read from the MBX queue, a VFId is received which + * is passed to the state handler. The boolean output is_malvf of the state + * handler ice_mbx_vf_state_handler() serves as an indicator to the caller + * whether this VF is malicious or not. + * + * 5. 
When a VF is identified to be malicious, the caller can send a message + * to the system administrator. The caller can invoke ice_mbx_report_malvf() + * to help determine if a malicious VF is to be reported or not. This function + * requires the caller to maintain a global bitmap to track all malicious VFs + * and pass that to ice_mbx_report_malvf() along with the VFID which was identified + * to be malicious by ice_mbx_vf_state_handler(). + * + * 6. The global bitmap maintained by PF can be cleared completely if PF is in + * reset or the bit corresponding to a VF can be cleared if that VF is in reset. + * When a VF is shut down and brought back up, we assume that the new VF + * brought up is not malicious and hence report it if found malicious. + * + * 7. The function ice_mbx_reset_snapshot() is called to reset the information + * in ice_mbx_snapshot for every new mailbox interrupt handled. + * + * 8. The memory allocated for variables in ice_mbx_snapshot is de-allocated + * when driver is unloaded. + */ +#define ICE_RQ_DATA_MASK(rq_data) ((rq_data) & PF_MBX_ARQH_ARQH_M) +/* Using the highest value for an unsigned 16-bit value 0xFFFF to indicate that + * the max messages check must be ignored in the algorithm + */ +#define ICE_IGNORE_MAX_MSG_CNT 0xFFFF + +/** + * ice_mbx_traverse - Pass through mailbox snapshot + * @hw: pointer to the HW struct + * @new_state: new algorithm state + * + * Traversing the mailbox static snapshot without checking + * for malicious VFs. + */ +static void +ice_mbx_traverse(struct ice_hw *hw, + enum ice_mbx_snapshot_state *new_state) +{ + struct ice_mbx_snap_buffer_data *snap_buf; + u32 num_iterations; + + snap_buf = &hw->mbx_snapshot.mbx_buf; + + /* As mailbox buffer is circular, applying a mask + * on the incremented iteration count. + */ + num_iterations = ICE_RQ_DATA_MASK(++snap_buf->num_iterations); + + /* Checking either of the below conditions to exit snapshot traversal: + * Condition-1: If the number of iterations in the mailbox is equal to + * the mailbox head which would indicate that we have reached the end + * of the static snapshot. + * Condition-2: If the maximum messages serviced in the mailbox for a + * given interrupt is the highest possible value then there is no need + * to check if the number of messages processed is equal to it. If not + * check if the number of messages processed is greater than or equal + * to the maximum number of mailbox entries serviced in current work item. + */ + if (num_iterations == snap_buf->head || + (snap_buf->max_num_msgs_mbx < ICE_IGNORE_MAX_MSG_CNT && + ++snap_buf->num_msg_proc >= snap_buf->max_num_msgs_mbx)) + *new_state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT; +} + +/** + * ice_mbx_detect_malvf - Detect malicious VF in snapshot + * @hw: pointer to the HW struct + * @vf_id: relative virtual function ID + * @new_state: new algorithm state + * @is_malvf: boolean output to indicate if VF is malicious + * + * This function tracks the number of asynchronous messages + * sent per VF and marks the VF as malicious if it exceeds + * the permissible number of messages to send. 
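Tying the numbered description above together, the per-message part of a caller's mailbox servicing path might look roughly like the sketch below. The struct ice_mbx_data fields match those consumed by ice_mbx_vf_state_handler() further down; the budget and watermark values are placeholders that must satisfy that function's range checks, the tracking bitmap is the caller-owned one mentioned in step 5, and the ice_mbx_report_malvf() signature is assumed from its parameter list documented just below:

static void example_service_one_msg(struct ice_hw *hw, unsigned long *malvfs,
				    u16 bitmap_len, u16 vf_id,
				    u16 pending_arq)
{
	struct ice_mbx_data mbx_data = {
		.num_msg_proc = 0,
		.num_pending_arq = pending_arq,
		.max_num_msgs_mbx = 64,		/* per-interrupt budget */
		.async_watermark_val = 63,	/* overflow watermark */
	};
	bool is_malvf = false;
	bool report = false;

	/* One call per message read from the mailbox queue */
	if (ice_mbx_vf_state_handler(hw, &mbx_data, vf_id, &is_malvf))
		return;

	if (is_malvf &&
	    !ice_mbx_report_malvf(hw, malvfs, bitmap_len, vf_id, &report) &&
	    report)
		pr_warn("VF %d sent too many mailbox messages\n", vf_id);
}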
+ */ +static enum ice_status +ice_mbx_detect_malvf(struct ice_hw *hw, u16 vf_id, + enum ice_mbx_snapshot_state *new_state, + bool *is_malvf) +{ + struct ice_mbx_snapshot *snap = &hw->mbx_snapshot; + + if (vf_id >= snap->mbx_vf.vfcntr_len) + return ICE_ERR_OUT_OF_RANGE; + + /* increment the message count in the VF array */ + snap->mbx_vf.vf_cntr[vf_id]++; + + if (snap->mbx_vf.vf_cntr[vf_id] >= ICE_ASYNC_VF_MSG_THRESHOLD) + *is_malvf = true; + + /* continue to iterate through the mailbox snapshot */ + ice_mbx_traverse(hw, new_state); + + return 0; +} + +/** + * ice_mbx_reset_snapshot - Reset mailbox snapshot structure + * @snap: pointer to mailbox snapshot structure in the ice_hw struct + * + * Reset the mailbox snapshot structure and clear VF counter array. + */ +static void ice_mbx_reset_snapshot(struct ice_mbx_snapshot *snap) +{ + u32 vfcntr_len; + + if (!snap || !snap->mbx_vf.vf_cntr) + return; + + /* Clear VF counters. */ + vfcntr_len = snap->mbx_vf.vfcntr_len; + if (vfcntr_len) + memset(snap->mbx_vf.vf_cntr, 0, + (vfcntr_len * sizeof(*snap->mbx_vf.vf_cntr))); + + /* Reset mailbox snapshot for a new capture. */ + memset(&snap->mbx_buf, 0, sizeof(snap->mbx_buf)); + snap->mbx_buf.state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT; +} + +/** + * ice_mbx_vf_state_handler - Handle states of the overflow algorithm + * @hw: pointer to the HW struct + * @mbx_data: pointer to structure containing mailbox data + * @vf_id: relative virtual function (VF) ID + * @is_malvf: boolean output to indicate if VF is malicious + * + * The function serves as an entry point for the malicious VF + * detection algorithm by handling the different states and state + * transitions of the algorithm: + * New snapshot: This state is entered when creating a new static + * snapshot. The data from any previous mailbox snapshot is + * cleared and a new capture of the mailbox head and tail is + * logged. This will be the new static snapshot to detect + * asynchronous messages sent by VFs. On capturing the snapshot + * and depending on whether the number of pending messages in that + * snapshot exceed the watermark value, the state machine enters + * traverse or detect states. + * Traverse: If pending message count is below watermark then iterate + * through the snapshot without any action on VF. + * Detect: If pending message count exceeds watermark traverse + * the static snapshot and look for a malicious VF. + */ +enum ice_status +ice_mbx_vf_state_handler(struct ice_hw *hw, + struct ice_mbx_data *mbx_data, u16 vf_id, + bool *is_malvf) +{ + struct ice_mbx_snapshot *snap = &hw->mbx_snapshot; + struct ice_mbx_snap_buffer_data *snap_buf; + struct ice_ctl_q_info *cq = &hw->mailboxq; + enum ice_mbx_snapshot_state new_state; + enum ice_status status = 0; + + if (!is_malvf || !mbx_data) + return ICE_ERR_BAD_PTR; + + /* When entering the mailbox state machine assume that the VF + * is not malicious until detected. + */ + *is_malvf = false; + + /* Checking if max messages allowed to be processed while servicing current + * interrupt is not less than the defined AVF message threshold. + */ + if (mbx_data->max_num_msgs_mbx <= ICE_ASYNC_VF_MSG_THRESHOLD) + return ICE_ERR_INVAL_SIZE; + + /* The watermark value should not be lesser than the threshold limit + * set for the number of asynchronous messages a VF can send to mailbox + * nor should it be greater than the maximum number of messages in the + * mailbox serviced in current interrupt. 
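+ * In other words, the caller must supply a watermark in the range
+ * ICE_ASYNC_VF_MSG_THRESHOLD <= async_watermark_val <= max_num_msgs_mbx.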
+ */ + if (mbx_data->async_watermark_val < ICE_ASYNC_VF_MSG_THRESHOLD || + mbx_data->async_watermark_val > mbx_data->max_num_msgs_mbx) + return ICE_ERR_PARAM; + + new_state = ICE_MAL_VF_DETECT_STATE_INVALID; + snap_buf = &snap->mbx_buf; + + switch (snap_buf->state) { + case ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT: + /* Clear any previously held data in mailbox snapshot structure. */ + ice_mbx_reset_snapshot(snap); + + /* Collect the pending ARQ count, number of messages processed and + * the maximum number of messages allowed to be processed from the + * Mailbox for current interrupt. + */ + snap_buf->num_pending_arq = mbx_data->num_pending_arq; + snap_buf->num_msg_proc = mbx_data->num_msg_proc; + snap_buf->max_num_msgs_mbx = mbx_data->max_num_msgs_mbx; + + /* Capture a new static snapshot of the mailbox by logging the + * head and tail of snapshot and set num_iterations to the tail + * value to mark the start of the iteration through the snapshot. + */ + snap_buf->head = ICE_RQ_DATA_MASK(cq->rq.next_to_clean + + mbx_data->num_pending_arq); + snap_buf->tail = ICE_RQ_DATA_MASK(cq->rq.next_to_clean - 1); + snap_buf->num_iterations = snap_buf->tail; + + /* Pending ARQ messages returned by ice_clean_rq_elem + * is the difference between the head and tail of the + * mailbox queue. Comparing this value against the watermark + * helps to check if we potentially have malicious VFs. + */ + if (snap_buf->num_pending_arq >= + mbx_data->async_watermark_val) { + new_state = ICE_MAL_VF_DETECT_STATE_DETECT; + status = ice_mbx_detect_malvf(hw, vf_id, &new_state, is_malvf); + } else { + new_state = ICE_MAL_VF_DETECT_STATE_TRAVERSE; + ice_mbx_traverse(hw, &new_state); + } + break; + + case ICE_MAL_VF_DETECT_STATE_TRAVERSE: + new_state = ICE_MAL_VF_DETECT_STATE_TRAVERSE; + ice_mbx_traverse(hw, &new_state); + break; + + case ICE_MAL_VF_DETECT_STATE_DETECT: + new_state = ICE_MAL_VF_DETECT_STATE_DETECT; + status = ice_mbx_detect_malvf(hw, vf_id, &new_state, is_malvf); + break; + + default: + new_state = ICE_MAL_VF_DETECT_STATE_INVALID; + status = ICE_ERR_CFG; + } + + snap_buf->state = new_state; + + return status; +} + +/** + * ice_mbx_report_malvf - Track and note malicious VF + * @hw: pointer to the HW struct + * @all_malvfs: all malicious VFs tracked by PF + * @bitmap_len: length of bitmap in bits + * @vf_id: relative virtual function ID of the malicious VF + * @report_malvf: boolean to indicate if malicious VF must be reported + * + * This function will update a bitmap that keeps track of the malicious + * VFs attached to the PF. A malicious VF must be reported only once if + * discovered between VF resets or loading so the function checks + * the input vf_id against the bitmap to verify if the VF has been + * detected in any previous mailbox iterations. 
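+ *
+ * A minimal caller-side sketch, assuming the caller owns the tracking
+ * bitmap (the names malvf_bitmap and bitmap_len below stand in for the
+ * caller's own storage and are not part of this API):
+ *
+ *	bool is_malvf = false, report_malvf = false;
+ *
+ *	if (!ice_mbx_vf_state_handler(hw, &mbx_data, vf_id, &is_malvf) &&
+ *	    is_malvf &&
+ *	    !ice_mbx_report_malvf(hw, malvf_bitmap, bitmap_len, vf_id,
+ *				  &report_malvf) &&
+ *	    report_malvf)
+ *		dev_warn(ice_hw_to_dev(hw),
+ *			 "Potentially malicious VF %u detected\n", vf_id);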
+ */ +enum ice_status +ice_mbx_report_malvf(struct ice_hw *hw, unsigned long *all_malvfs, + u16 bitmap_len, u16 vf_id, bool *report_malvf) +{ + if (!all_malvfs || !report_malvf) + return ICE_ERR_PARAM; + + *report_malvf = false; + + if (bitmap_len < hw->mbx_snapshot.mbx_vf.vfcntr_len) + return ICE_ERR_INVAL_SIZE; + + if (vf_id >= bitmap_len) + return ICE_ERR_OUT_OF_RANGE; + + /* If the vf_id is found in the bitmap set bit and boolean to true */ + if (!test_bit(vf_id, all_malvfs)) { + set_bit(vf_id, all_malvfs); + *report_malvf = true; + } + + return 0; +} + +/** + * ice_mbx_clear_malvf - Clear VF bitmap and counter for VF ID + * @snap: pointer to the mailbox snapshot structure + * @all_malvfs: all malicious VFs tracked by PF + * @bitmap_len: length of bitmap in bits + * @vf_id: relative virtual function ID of the malicious VF + * + * In case of a VF reset, this function can be called to clear + * the bit corresponding to the VF ID in the bitmap tracking all + * malicious VFs attached to the PF. The function also clears the + * VF counter array at the index of the VF ID. This is to ensure + * that the new VF loaded is not considered malicious before going + * through the overflow detection algorithm. + */ +enum ice_status +ice_mbx_clear_malvf(struct ice_mbx_snapshot *snap, unsigned long *all_malvfs, + u16 bitmap_len, u16 vf_id) +{ + if (!snap || !all_malvfs) + return ICE_ERR_PARAM; + + if (bitmap_len < snap->mbx_vf.vfcntr_len) + return ICE_ERR_INVAL_SIZE; + + /* Ensure VF ID value is not larger than bitmap or VF counter length */ + if (vf_id >= bitmap_len || vf_id >= snap->mbx_vf.vfcntr_len) + return ICE_ERR_OUT_OF_RANGE; + + /* Clear VF ID bit in the bitmap tracking malicious VFs attached to PF */ + clear_bit(vf_id, all_malvfs); + + /* Clear the VF counter in the mailbox snapshot structure for that VF ID. + * This is to ensure that if a VF is unloaded and a new one brought back + * up with the same VF ID for a snapshot currently in traversal or detect + * state the counter for that VF ID does not increment on top of existing + * values in the mailbox overflow detection algorithm. + */ + snap->mbx_vf.vf_cntr[vf_id] = 0; + + return 0; +} + +/** + * ice_mbx_init_snapshot - Initialize mailbox snapshot structure + * @hw: pointer to the hardware structure + * @vf_count: number of VFs allocated on a PF + * + * Clear the mailbox snapshot structure and allocate memory + * for the VF counter array based on the number of VFs allocated + * on that PF. + * + * Assumption: This function will assume ice_get_caps() has already been + * called to ensure that the vf_count can be compared against the number + * of VFs supported as defined in the functional capabilities of the device. + */ +enum ice_status ice_mbx_init_snapshot(struct ice_hw *hw, u16 vf_count) +{ + struct ice_mbx_snapshot *snap = &hw->mbx_snapshot; + + /* Ensure that the number of VFs allocated is non-zero and + * is not greater than the number of supported VFs defined in + * the functional capabilities of the PF. + */ + if (!vf_count || vf_count > hw->func_caps.num_allocd_vfs) + return ICE_ERR_INVAL_SIZE; + + snap->mbx_vf.vf_cntr = devm_kcalloc(ice_hw_to_dev(hw), vf_count, + sizeof(*snap->mbx_vf.vf_cntr), + GFP_KERNEL); + if (!snap->mbx_vf.vf_cntr) + return ICE_ERR_NO_MEMORY; + + /* Setting the VF counter length to the number of allocated + * VFs for given PF's functional capabilities. + */ + snap->mbx_vf.vfcntr_len = vf_count; + + /* Clear mbx_buf in the mailbox snaphot structure and setting the + * mailbox snapshot state to a new capture. 
+ */ + memset(&snap->mbx_buf, 0, sizeof(snap->mbx_buf)); + snap->mbx_buf.state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT; + + return 0; +} + +/** + * ice_mbx_deinit_snapshot - Free mailbox snapshot structure + * @hw: pointer to the hardware structure + * + * Clear the mailbox snapshot structure and free the VF counter array. + */ +void ice_mbx_deinit_snapshot(struct ice_hw *hw) +{ + struct ice_mbx_snapshot *snap = &hw->mbx_snapshot; + + /* Free VF counter array and reset vf counter length */ + devm_kfree(ice_hw_to_dev(hw), snap->mbx_vf.vf_cntr); + snap->mbx_vf.vfcntr_len = 0; + + /* Clear mbx_buf in the mailbox snaphot structure */ + memset(&snap->mbx_buf, 0, sizeof(snap->mbx_buf)); +} diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.h b/drivers/net/ethernet/intel/ice/ice_sriov.h index 3d78a0795138..15adb2e5aaf2 100644 --- a/drivers/net/ethernet/intel/ice/ice_sriov.h +++ b/drivers/net/ethernet/intel/ice/ice_sriov.h @@ -1,10 +1,17 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_SRIOV_H_ #define _ICE_SRIOV_H_ -#include "ice_common.h" +#include "ice_type.h" +#include "ice_controlq.h" + +/* Defining the mailbox message threshold as 63 asynchronous + * pending messages. Normal VF functionality does not require + * sending more than 63 asynchronous pending message. + */ +#define ICE_ASYNC_VF_MSG_THRESHOLD 63 #ifdef CONFIG_PCI_IOV enum ice_status @@ -12,6 +19,17 @@ ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval, u8 *msg, u16 msglen, struct ice_sq_cd *cd); u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed); +enum ice_status +ice_mbx_vf_state_handler(struct ice_hw *hw, struct ice_mbx_data *mbx_data, + u16 vf_id, bool *is_mal_vf); +enum ice_status +ice_mbx_clear_malvf(struct ice_mbx_snapshot *snap, unsigned long *all_malvfs, + u16 bitmap_len, u16 vf_id); +enum ice_status ice_mbx_init_snapshot(struct ice_hw *hw, u16 vf_count); +void ice_mbx_deinit_snapshot(struct ice_hw *hw); +enum ice_status +ice_mbx_report_malvf(struct ice_hw *hw, unsigned long *all_malvfs, + u16 bitmap_len, u16 vf_id, bool *report_malvf); #else /* CONFIG_PCI_IOV */ static inline enum ice_status ice_aq_send_msg_to_vf(struct ice_hw __always_unused *hw, diff --git a/drivers/net/ethernet/intel/ice/ice_status.h b/drivers/net/ethernet/intel/ice/ice_status.h index c01597885629..aa279abfe32a 100644 --- a/drivers/net/ethernet/intel/ice/ice_status.h +++ b/drivers/net/ethernet/intel/ice/ice_status.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_STATUS_H_ #define _ICE_STATUS_H_ @@ -26,14 +26,22 @@ enum ice_status { ICE_ERR_IN_USE = -16, ICE_ERR_MAX_LIMIT = -17, ICE_ERR_RESET_ONGOING = -18, + ICE_ERR_HW_TABLE = -19, + ICE_ERR_FW_DDP_MISMATCH = -20, + + /* NVM specific error codes: Range -50..-59 */ + ICE_ERR_NVM = -50, ICE_ERR_NVM_CHECKSUM = -51, ICE_ERR_BUF_TOO_SHORT = -52, ICE_ERR_NVM_BLANK_MODE = -53, + + /* ARQ/ASQ specific error codes. 
Range -100..-109 */ ICE_ERR_AQ_ERROR = -100, ICE_ERR_AQ_TIMEOUT = -101, ICE_ERR_AQ_FULL = -102, ICE_ERR_AQ_NO_WORK = -103, ICE_ERR_AQ_EMPTY = -104, + ICE_ERR_AQ_FW_CRITICAL = -105, }; #endif /* _ICE_STATUS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index 7ff2e07f6d38..c0e622d13893 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -1,7 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #include "ice_switch.h" +#include "ice_flex_type.h" +#include "ice_flow.h" + #define ICE_ETH_DA_OFFSET 0 #define ICE_ETH_ETHTYPE_OFFSET 12 @@ -23,76 +26,1286 @@ * In case of Ether type filter it is treated as header without VLAN tag * and byte 12 and 13 is used to program a given Ether type instead */ -#define DUMMY_ETH_HDR_LEN 16 static const u8 dummy_eth_header[DUMMY_ETH_HDR_LEN] = { 0x2, 0, 0, 0, 0, 0, 0x2, 0, 0, 0, 0, 0, 0x81, 0, 0, 0}; -#define ICE_SW_RULE_RX_TX_ETH_HDR_SIZE \ - (sizeof(struct ice_aqc_sw_rules_elem) - \ - sizeof(((struct ice_aqc_sw_rules_elem *)0)->pdata) + \ - sizeof(struct ice_sw_rule_lkup_rx_tx) + DUMMY_ETH_HDR_LEN - 1) -#define ICE_SW_RULE_RX_TX_NO_HDR_SIZE \ - (sizeof(struct ice_aqc_sw_rules_elem) - \ - sizeof(((struct ice_aqc_sw_rules_elem *)0)->pdata) + \ - sizeof(struct ice_sw_rule_lkup_rx_tx) - 1) -#define ICE_SW_RULE_LG_ACT_SIZE(n) \ - (sizeof(struct ice_aqc_sw_rules_elem) - \ - sizeof(((struct ice_aqc_sw_rules_elem *)0)->pdata) + \ - sizeof(struct ice_sw_rule_lg_act) - \ - sizeof(((struct ice_sw_rule_lg_act *)0)->act) + \ - ((n) * sizeof(((struct ice_sw_rule_lg_act *)0)->act))) -#define ICE_SW_RULE_VSI_LIST_SIZE(n) \ - (sizeof(struct ice_aqc_sw_rules_elem) - \ - sizeof(((struct ice_aqc_sw_rules_elem *)0)->pdata) + \ - sizeof(struct ice_sw_rule_vsi_list) - \ - sizeof(((struct ice_sw_rule_vsi_list *)0)->vsi) + \ - ((n) * sizeof(((struct ice_sw_rule_vsi_list *)0)->vsi))) - -/** - * ice_aq_alloc_free_res - command to allocate/free resources - * @hw: pointer to the HW struct - * @num_entries: number of resource entries in buffer - * @buf: Indirect buffer to hold data parameters and response - * @buf_size: size of buffer for indirect commands - * @opc: pass in the command opcode - * @cd: pointer to command details structure or NULL + +struct ice_dummy_pkt_offsets { + enum ice_protocol_type type; + u16 offset; /* ICE_PROTOCOL_LAST indicates end of list */ +}; + +static const struct ice_dummy_pkt_offsets dummy_gre_tcp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_NVGRE, 34 }, + { ICE_MAC_IL, 42 }, + { ICE_IPV4_IL, 56 }, + { ICE_TCP_IL, 76 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_gre_tcp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x45, 0x00, 0x00, 0x3E, /* ICE_IPV4_OFOS 14 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x2F, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x80, 0x00, 0x65, 0x58, /* ICE_NVGRE 34 */ + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_IL 42 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x14, /* ICE_IPV4_IL 56 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_TCP_IL 76 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 
0x00, 0x00, 0x00, + 0x50, 0x02, 0x20, 0x00, + 0x00, 0x00, 0x00, 0x00 +}; + +static const struct ice_dummy_pkt_offsets dummy_gre_udp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_NVGRE, 34 }, + { ICE_MAC_IL, 42 }, + { ICE_IPV4_IL, 56 }, + { ICE_UDP_ILOS, 76 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_gre_udp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x45, 0x00, 0x00, 0x3E, /* ICE_IPV4_OFOS 14 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x2F, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x80, 0x00, 0x65, 0x58, /* ICE_NVGRE 34 */ + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_IL 42 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x14, /* ICE_IPV4_IL 56 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_UDP_ILOS 76 */ + 0x00, 0x08, 0x00, 0x00, +}; + +static const struct ice_dummy_pkt_offsets dummy_udp_tun_tcp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_UDP_OF, 34 }, + { ICE_VXLAN, 42 }, + { ICE_GENEVE, 42 }, + { ICE_VXLAN_GPE, 42 }, + { ICE_MAC_IL, 50 }, + { ICE_IPV4_IL, 64 }, + { ICE_TCP_IL, 84 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_udp_tun_tcp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x45, 0x00, 0x00, 0x5a, /* ICE_IPV4_OFOS 14 */ + 0x00, 0x01, 0x00, 0x00, + 0x40, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x12, 0xb5, /* ICE_UDP_OF 34 */ + 0x00, 0x46, 0x00, 0x00, + + 0x00, 0x00, 0x65, 0x58, /* ICE_VXLAN 42 */ + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_IL 50 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x28, /* ICE_IPV4_IL 64 */ + 0x00, 0x01, 0x00, 0x00, + 0x40, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_TCP_IL 84 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x02, 0x20, 0x00, + 0x00, 0x00, 0x00, 0x00 +}; + +static const struct ice_dummy_pkt_offsets dummy_udp_tun_udp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_UDP_OF, 34 }, + { ICE_VXLAN, 42 }, + { ICE_GENEVE, 42 }, + { ICE_VXLAN_GPE, 42 }, + { ICE_MAC_IL, 50 }, + { ICE_IPV4_IL, 64 }, + { ICE_UDP_ILOS, 84 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_udp_tun_udp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x45, 0x00, 0x00, 0x4e, /* ICE_IPV4_OFOS 14 */ + 0x00, 0x01, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x12, 0xb5, /* ICE_UDP_OF 34 */ + 0x00, 0x3a, 0x00, 0x00, + + 0x00, 0x00, 0x65, 0x58, /* ICE_VXLAN 42 */ + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_IL 50 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x1c, /* ICE_IPV4_IL 64 */ + 0x00, 0x01, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_UDP_ILOS 84 */ + 0x00, 0x08, 0x00, 0x00, +}; + +/* 
offset info for MAC + IPv4 + UDP dummy packet */ +static const struct ice_dummy_pkt_offsets dummy_udp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_UDP_ILOS, 34 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +/* Dummy packet for MAC + IPv4 + UDP */ +static const u8 dummy_udp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x45, 0x00, 0x00, 0x1c, /* ICE_IPV4_OFOS 14 */ + 0x00, 0x01, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_UDP_ILOS 34 */ + 0x00, 0x08, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +/* offset info for MAC + VLAN + IPv4 + UDP dummy packet */ +static const struct ice_dummy_pkt_offsets dummy_vlan_udp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_VLAN_OFOS, 14 }, + { ICE_IPV4_OFOS, 18 }, + { ICE_UDP_ILOS, 38 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +/* C-tag (801.1Q), IPv4:UDP dummy packet */ +static const u8 dummy_vlan_udp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x81, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x00, 0x00, 0x08, 0x00, /* ICE_VLAN_OFOS 14 */ + + 0x45, 0x00, 0x00, 0x1c, /* ICE_IPV4_OFOS 18 */ + 0x00, 0x01, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_UDP_ILOS 38 */ + 0x00, 0x08, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +/* offset info for MAC + IPv4 + TCP dummy packet */ +static const struct ice_dummy_pkt_offsets dummy_tcp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_TCP_IL, 34 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +/* Dummy packet for MAC + IPv4 + TCP */ +static const u8 dummy_tcp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x45, 0x00, 0x00, 0x28, /* ICE_IPV4_OFOS 14 */ + 0x00, 0x01, 0x00, 0x00, + 0x00, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_TCP_IL 34 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +/* offset info for MAC + VLAN (C-tag, 802.1Q) + IPv4 + TCP dummy packet */ +static const struct ice_dummy_pkt_offsets dummy_vlan_tcp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_VLAN_OFOS, 14 }, + { ICE_IPV4_OFOS, 18 }, + { ICE_TCP_IL, 38 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +/* C-tag (801.1Q), IPv4:TCP dummy packet */ +static const u8 dummy_vlan_tcp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x81, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x00, 0x00, 0x08, 0x00, /* ICE_VLAN_OFOS 14 */ + + 0x45, 0x00, 0x00, 0x28, /* ICE_IPV4_OFOS 18 */ + 0x00, 0x01, 0x00, 0x00, + 0x00, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_TCP_IL 38 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +static const struct ice_dummy_pkt_offsets dummy_tcp_ipv6_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_IPV6_OFOS, 14 
}, + { ICE_TCP_IL, 54 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_tcp_ipv6_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x86, 0xDD, /* ICE_ETYPE_OL 12 */ + + 0x60, 0x00, 0x00, 0x00, /* ICE_IPV6_OFOS 40 */ + 0x00, 0x14, 0x06, 0x00, /* Next header is TCP */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_TCP_IL 54 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +/* C-tag (802.1Q): IPv6 + TCP */ +static const struct ice_dummy_pkt_offsets +dummy_vlan_tcp_ipv6_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_VLAN_OFOS, 14 }, + { ICE_IPV6_OFOS, 18 }, + { ICE_TCP_IL, 58 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +/* C-tag (802.1Q), IPv6 + TCP dummy packet */ +static const u8 dummy_vlan_tcp_ipv6_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x81, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x00, 0x00, 0x86, 0xDD, /* ICE_VLAN_OFOS 14 */ + + 0x60, 0x00, 0x00, 0x00, /* ICE_IPV6_OFOS 18 */ + 0x00, 0x14, 0x06, 0x00, /* Next header is TCP */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_TCP_IL 58 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +/* IPv6 + UDP */ +static const struct ice_dummy_pkt_offsets dummy_udp_ipv6_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_IPV6_OFOS, 14 }, + { ICE_UDP_ILOS, 54 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +/* IPv6 + UDP dummy packet */ +static const u8 dummy_udp_ipv6_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x86, 0xDD, /* ICE_ETYPE_OL 12 */ + + 0x60, 0x00, 0x00, 0x00, /* ICE_IPV6_OFOS 40 */ + 0x00, 0x10, 0x11, 0x00, /* Next header UDP */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_UDP_ILOS 54 */ + 0x00, 0x10, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* needed for ESP packets */ + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +/* C-tag (802.1Q): IPv6 + UDP */ +static const struct ice_dummy_pkt_offsets +dummy_vlan_udp_ipv6_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_VLAN_OFOS, 14 }, + { ICE_IPV6_OFOS, 18 }, + { ICE_UDP_ILOS, 58 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +/* C-tag (802.1Q), IPv6 + UDP dummy packet */ +static const u8 dummy_vlan_udp_ipv6_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x81, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x00, 0x00, 0x86, 0xDD, /* ICE_VLAN_OFOS 14 */ + + 0x60, 0x00, 0x00, 0x00, /* ICE_IPV6_OFOS 18 */ + 0x00, 0x08, 0x11, 0x00, /* Next header UDP */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 
0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_UDP_ILOS 58 */ + 0x00, 0x08, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +/* Outer IPv4 + Outer UDP + GTP + Inner IPv4 + Inner TCP */ +static const struct ice_dummy_pkt_offsets dummy_ipv4_gtpu_ipv4_tcp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_UDP_OF, 34 }, + { ICE_GTP, 42 }, + { ICE_IPV4_IL, 62 }, + { ICE_TCP_IL, 82 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv4_gtpu_ipv4_tcp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* Ethernet 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x58, /* IP 14 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x08, 0x68, /* UDP 34 */ + 0x00, 0x44, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x34, /* GTP-U Header 42 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* GTP_PDUSession_ExtensionHeader 54 */ + 0x00, 0x00, 0x00, 0x00, + + 0x45, 0x00, 0x00, 0x28, /* IP 62 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* TCP 82 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +/* Outer IPv4 + Outer UDP + GTP + Inner IPv4 + Inner UDP */ +static const struct ice_dummy_pkt_offsets dummy_ipv4_gtpu_ipv4_udp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_UDP_OF, 34 }, + { ICE_GTP, 42 }, + { ICE_IPV4_IL, 62 }, + { ICE_UDP_ILOS, 82 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv4_gtpu_ipv4_udp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* Ethernet 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x4c, /* IP 14 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x08, 0x68, /* UDP 34 */ + 0x00, 0x38, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x28, /* GTP-U Header 42 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* GTP_PDUSession_ExtensionHeader 54 */ + 0x00, 0x00, 0x00, 0x00, + + 0x45, 0x00, 0x00, 0x1c, /* IP 62 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* UDP 82 */ + 0x00, 0x08, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +/* Outer IPv6 + Outer UDP + GTP + Inner IPv4 + Inner TCP */ +static const struct ice_dummy_pkt_offsets dummy_ipv4_gtpu_ipv6_tcp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_UDP_OF, 34 }, + { ICE_GTP, 42 }, + { ICE_IPV6_IL, 62 }, + { ICE_TCP_IL, 102 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv4_gtpu_ipv6_tcp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* Ethernet 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x6c, /* IP 14 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x08, 0x68, /* UDP 34 */ + 0x00, 0x58, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x48, /* GTP-U Header 42 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* GTP_PDUSession_ExtensionHeader 54 */ + 0x00, 0x00, 0x00, 0x00, + + 0x60, 0x00, 0x00, 0x00, 
/* IPv6 62 */ + 0x00, 0x14, 0x06, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* TCP 102 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +static const struct ice_dummy_pkt_offsets dummy_ipv4_gtpu_ipv6_udp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_UDP_OF, 34 }, + { ICE_GTP, 42 }, + { ICE_IPV6_IL, 62 }, + { ICE_UDP_ILOS, 102 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv4_gtpu_ipv6_udp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* Ethernet 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x60, /* IP 14 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x08, 0x68, /* UDP 34 */ + 0x00, 0x4c, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x3c, /* GTP-U Header 42 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* GTP_PDUSession_ExtensionHeader 54 */ + 0x00, 0x00, 0x00, 0x00, + + 0x60, 0x00, 0x00, 0x00, /* IPv6 62 */ + 0x00, 0x08, 0x11, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* UDP 102 */ + 0x00, 0x08, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +static const struct ice_dummy_pkt_offsets dummy_ipv6_gtpu_ipv4_tcp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV6_OFOS, 14 }, + { ICE_UDP_OF, 54 }, + { ICE_GTP, 62 }, + { ICE_IPV4_IL, 82 }, + { ICE_TCP_IL, 102 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv6_gtpu_ipv4_tcp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* Ethernet 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x86, 0xdd, + + 0x60, 0x00, 0x00, 0x00, /* IPv6 14 */ + 0x00, 0x44, 0x11, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x08, 0x68, /* UDP 54 */ + 0x00, 0x44, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x34, /* GTP-U Header 62 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* GTP_PDUSession_ExtensionHeader 74 */ + 0x00, 0x00, 0x00, 0x00, + + 0x45, 0x00, 0x00, 0x28, /* IP 82 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* TCP 102 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +static const struct ice_dummy_pkt_offsets dummy_ipv6_gtpu_ipv4_udp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV6_OFOS, 14 }, + { ICE_UDP_OF, 54 }, + { ICE_GTP, 62 }, + { ICE_IPV4_IL, 82 }, + { ICE_UDP_ILOS, 102 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv6_gtpu_ipv4_udp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* Ethernet 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x86, 0xdd, + + 0x60, 0x00, 0x00, 0x00, /* IPv6 14 */ + 0x00, 0x38, 0x11, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 
0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x08, 0x68, /* UDP 54 */ + 0x00, 0x38, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x28, /* GTP-U Header 62 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* GTP_PDUSession_ExtensionHeader 74 */ + 0x00, 0x00, 0x00, 0x00, + + 0x45, 0x00, 0x00, 0x1c, /* IP 82 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* UDP 102 */ + 0x00, 0x08, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +static const struct ice_dummy_pkt_offsets dummy_ipv6_gtpu_ipv6_tcp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV6_OFOS, 14 }, + { ICE_UDP_OF, 54 }, + { ICE_GTP, 62 }, + { ICE_IPV6_IL, 82 }, + { ICE_TCP_IL, 122 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv6_gtpu_ipv6_tcp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* Ethernet 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x86, 0xdd, + + 0x60, 0x00, 0x00, 0x00, /* IPv6 14 */ + 0x00, 0x58, 0x11, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x08, 0x68, /* UDP 54 */ + 0x00, 0x58, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x48, /* GTP-U Header 62 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* GTP_PDUSession_ExtensionHeader 74 */ + 0x00, 0x00, 0x00, 0x00, + + 0x60, 0x00, 0x00, 0x00, /* IPv6 82 */ + 0x00, 0x14, 0x06, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* TCP 122 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +static const struct ice_dummy_pkt_offsets dummy_ipv6_gtpu_ipv6_udp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV6_OFOS, 14 }, + { ICE_UDP_OF, 54 }, + { ICE_GTP, 62 }, + { ICE_IPV6_IL, 82 }, + { ICE_UDP_ILOS, 122 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv6_gtpu_ipv6_udp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* Ethernet 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x86, 0xdd, + + 0x60, 0x00, 0x00, 0x00, /* IPv6 14 */ + 0x00, 0x4c, 0x11, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x08, 0x68, /* UDP 54 */ + 0x00, 0x4c, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x3c, /* GTP-U Header 62 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* GTP_PDUSession_ExtensionHeader 74 */ + 0x00, 0x00, 0x00, 0x00, + + 0x60, 0x00, 0x00, 0x00, /* IPv6 82 */ + 0x00, 0x08, 0x11, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* UDP 122 */ + 0x00, 0x08, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +static const struct ice_dummy_pkt_offsets dummy_ipv4_gtpu_ipv4_packet_offsets[] = 
{ + { ICE_MAC_OFOS, 0 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_UDP_OF, 34 }, + { ICE_GTP, 42 }, + { ICE_IPV4_IL, 62 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv4_gtpu_ipv4_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x44, /* ICE_IPV4_OFOS 14 */ + 0x00, 0x00, 0x40, 0x00, + 0x40, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x68, 0x08, 0x68, /* ICE_UDP_OF 34 */ + 0x00, 0x00, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x28, /* ICE_GTP 42 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* PDU Session extension header */ + 0x00, 0x00, 0x00, 0x00, + + 0x45, 0x00, 0x00, 0x14, /* ICE_IPV4_IL 62 */ + 0x00, 0x00, 0x40, 0x00, + 0x40, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, +}; + +static const struct ice_dummy_pkt_offsets dummy_ipv4_gtpu_ipv6_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_UDP_OF, 34 }, + { ICE_GTP, 42 }, + { ICE_IPV6_IL, 62 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv4_gtpu_ipv6_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x58, /* ICE_IPV4_OFOS 14 */ + 0x00, 0x00, 0x40, 0x00, + 0x40, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x68, 0x08, 0x68, /* ICE_UDP_OF 34 */ + 0x00, 0x00, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x28, /* ICE_GTP 42 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* PDU Session extension header */ + 0x00, 0x00, 0x00, 0x00, + + 0x60, 0x00, 0x00, 0x00, /* ICE_IPV6_IL 62 */ + 0x00, 0x00, 0x3b, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, +}; + +static const struct ice_dummy_pkt_offsets dummy_ipv6_gtpu_ipv4_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV6_OFOS, 14 }, + { ICE_UDP_OF, 54 }, + { ICE_GTP, 62 }, + { ICE_IPV4_IL, 82 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv6_gtpu_ipv4_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x86, 0xdd, + + 0x60, 0x00, 0x00, 0x00, /* ICE_IPV6_OFOS 14 */ + 0x00, 0x58, 0x11, 0x00, /* Next header UDP*/ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x68, 0x08, 0x68, /* ICE_UDP_OF 54 */ + 0x00, 0x00, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x28, /* ICE_GTP 62 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* PDU Session extension header */ + 0x00, 0x00, 0x00, 0x00, + + 0x45, 0x00, 0x00, 0x14, /* ICE_IPV4_IL 82 */ + 0x00, 0x00, 0x40, 0x00, + 0x40, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, +}; + +static const struct ice_dummy_pkt_offsets dummy_ipv6_gtpu_ipv6_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV6_OFOS, 14 }, + { ICE_UDP_OF, 54 }, + { ICE_GTP, 62 }, + { ICE_IPV6_IL, 82 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv6_gtpu_ipv6_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x86, 0xdd, + + 
0x60, 0x00, 0x00, 0x00, /* ICE_IPV6_OFOS 14 */ + 0x00, 0x6c, 0x11, 0x00, /* Next header UDP*/ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x68, 0x08, 0x68, /* ICE_UDP_OF 54 */ + 0x00, 0x00, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x28, /* ICE_GTP 62 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* PDU Session extension header */ + 0x00, 0x00, 0x00, 0x00, + + 0x60, 0x00, 0x00, 0x00, /* ICE_IPV6_OFIL 82 */ + 0x00, 0x00, 0x3b, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, +}; + + +/* this is a recipe to profile association bitmap */ +static DECLARE_BITMAP(recipe_to_profile[ICE_MAX_NUM_RECIPES], + ICE_MAX_NUM_PROFILES); + +/* this is a profile to recipe association bitmap */ +static DECLARE_BITMAP(profile_to_recipe[ICE_MAX_NUM_PROFILES], + ICE_MAX_NUM_RECIPES); + +static void ice_get_recp_to_prof_map(struct ice_hw *hw); + +/** + * ice_collect_result_idx - copy result index values + * @buf: buffer that contains the result index + * @recp: the recipe struct to copy data into + */ +static void ice_collect_result_idx(struct ice_aqc_recipe_data_elem *buf, + struct ice_sw_recipe *recp) +{ + if (buf->content.result_indx & ICE_AQ_RECIPE_RESULT_EN) + set_bit(buf->content.result_indx & ~ICE_AQ_RECIPE_RESULT_EN, + recp->res_idxs); +} + + +/** + * ice_get_recp_frm_fw - update SW bookkeeping from FW recipe entries + * @hw: pointer to hardware structure + * @recps: struct that we need to populate + * @rid: recipe ID that we are populating + * @refresh_required: true if we should get recipe to profile mapping from FW * - * Helper function to allocate/free resources using the admin queue commands + * This function is used to populate all the necessary entries into our + * bookkeeping so that we have a current list of all the recipes that are + * programmed in the firmware. */ static enum ice_status -ice_aq_alloc_free_res(struct ice_hw *hw, u16 num_entries, - struct ice_aqc_alloc_free_res_elem *buf, u16 buf_size, - enum ice_adminq_opc opc, struct ice_sq_cd *cd) +ice_get_recp_frm_fw(struct ice_hw *hw, struct ice_sw_recipe *recps, u8 rid, + bool *refresh_required) { - struct ice_aqc_alloc_free_res_cmd *cmd; - struct ice_aq_desc desc; + DECLARE_BITMAP(result_bm, ICE_MAX_FV_WORDS); + struct ice_aqc_recipe_data_elem *tmp; + u16 num_recps = ICE_MAX_NUM_RECIPES; + struct ice_prot_lkup_ext *lkup_exts; + enum ice_status status; + u8 fv_word_idx = 0; + u16 sub_recps; - cmd = &desc.params.sw_res_ctrl; + bitmap_zero(result_bm, ICE_MAX_FV_WORDS); - if (!buf) - return ICE_ERR_PARAM; + /* we need a buffer big enough to accommodate all the recipes */ + tmp = devm_kcalloc(ice_hw_to_dev(hw), ICE_MAX_NUM_RECIPES, + sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return ICE_ERR_NO_MEMORY; - if (buf_size < (num_entries * sizeof(buf->elem[0]))) - return ICE_ERR_PARAM; + tmp[0].recipe_indx = rid; + status = ice_aq_get_recipe(hw, tmp, &num_recps, rid, NULL); + /* non-zero status meaning recipe doesn't exist */ + if (status) + goto err_unroll; + + /* Get recipe to profile map so that we can get the fv from lkups that + * we read for a recipe from FW. 
Since we want to minimize the number of + * times we make this FW call, just make one call and cache the copy + * until a new recipe is added. This operation is only required the + * first time to get the changes from FW. Then to search existing + * entries we don't need to update the cache again until another recipe + * gets added. + */ + if (*refresh_required) { + ice_get_recp_to_prof_map(hw); + *refresh_required = false; + } - ice_fill_dflt_direct_cmd_desc(&desc, opc); + /* Start populating all the entries for recps[rid] based on lkups from + * firmware. Note that we are only creating the root recipe in our + * database. + */ + lkup_exts = &recps[rid].lkup_exts; + + for (sub_recps = 0; sub_recps < num_recps; sub_recps++) { + struct ice_aqc_recipe_data_elem root_bufs = tmp[sub_recps]; + struct ice_recp_grp_entry *rg_entry; + u8 i, prof, idx, prot = 0; + bool is_root; + u16 off = 0; + + rg_entry = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*rg_entry), + GFP_KERNEL); + if (!rg_entry) { + status = ICE_ERR_NO_MEMORY; + goto err_unroll; + } - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + idx = root_bufs.recipe_indx; + is_root = root_bufs.content.rid & ICE_AQ_RECIPE_ID_IS_ROOT; + + /* Mark all result indices in this chain */ + if (root_bufs.content.result_indx & ICE_AQ_RECIPE_RESULT_EN) + set_bit(root_bufs.content.result_indx & ~ICE_AQ_RECIPE_RESULT_EN, + result_bm); + + /* get the first profile that is associated with rid */ + prof = find_first_bit(recipe_to_profile[idx], + ICE_MAX_NUM_PROFILES); + for (i = 0; i < ICE_NUM_WORDS_RECIPE; i++) { + u8 lkup_indx = root_bufs.content.lkup_indx[i + 1]; + + rg_entry->fv_idx[i] = lkup_indx; + rg_entry->fv_mask[i] = + le16_to_cpu(root_bufs.content.mask[i + 1]); + + /* If the recipe is a chained recipe then all its + * child recipe's result will have a result index. + * To fill fv_words we should not use those result + * index, we only need the protocol ids and offsets. + * We will skip all the fv_idx which stores result + * index in them. We also need to skip any fv_idx which + * has ICE_AQ_RECIPE_LKUP_IGNORE or 0 since it isn't a + * valid offset value. 
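+ * In short, an fv_idx contributes a protocol ID/offset pair to
+ * fv_words only if it is non-zero, is not flagged with
+ * ICE_AQ_RECIPE_LKUP_IGNORE and is not marked as a result slot in
+ * prof_res_bm[prof].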
+ */ + if (test_bit(rg_entry->fv_idx[i], hw->switch_info->prof_res_bm[prof]) || + rg_entry->fv_idx[i] & ICE_AQ_RECIPE_LKUP_IGNORE || + rg_entry->fv_idx[i] == 0) + continue; + + ice_find_prot_off(hw, ICE_BLK_SW, prof, + rg_entry->fv_idx[i], &prot, &off); + lkup_exts->fv_words[fv_word_idx].prot_id = prot; + lkup_exts->fv_words[fv_word_idx].off = off; + lkup_exts->field_mask[fv_word_idx] = + rg_entry->fv_mask[i]; + fv_word_idx++; + } + /* populate rg_list with the data from the child entry of this + * recipe + */ + list_add(&rg_entry->l_entry, &recps[rid].rg_list); + + /* Propagate some data to the recipe database */ + recps[idx].is_root = !!is_root; + recps[idx].priority = root_bufs.content.act_ctrl_fwd_priority; + bitmap_zero(recps[idx].res_idxs, ICE_MAX_FV_WORDS); + if (root_bufs.content.result_indx & ICE_AQ_RECIPE_RESULT_EN) { + recps[idx].chain_idx = root_bufs.content.result_indx & + ~ICE_AQ_RECIPE_RESULT_EN; + set_bit(recps[idx].chain_idx, recps[idx].res_idxs); + } else { + recps[idx].chain_idx = ICE_INVAL_CHAIN_IND; + } + + if (!is_root) + continue; + + /* Only do the following for root recipes entries */ + memcpy(recps[idx].r_bitmap, root_bufs.recipe_bitmap, + sizeof(recps[idx].r_bitmap)); + recps[idx].root_rid = root_bufs.content.rid & + ~ICE_AQ_RECIPE_ID_IS_ROOT; + recps[idx].priority = root_bufs.content.act_ctrl_fwd_priority; + } + + /* Complete initialization of the root recipe entry */ + lkup_exts->n_val_words = fv_word_idx; + recps[rid].big_recp = (num_recps > 1); + recps[rid].n_grp_count = (u8)num_recps; + recps[rid].root_buf = devm_kmemdup(ice_hw_to_dev(hw), tmp, + recps[rid].n_grp_count * sizeof(*recps[rid].root_buf), + GFP_KERNEL); + if (!recps[rid].root_buf) + goto err_unroll; + + /* Copy result indexes */ + bitmap_copy(recps[rid].res_idxs, result_bm, ICE_MAX_FV_WORDS); + recps[rid].recp_created = true; + +err_unroll: + devm_kfree(ice_hw_to_dev(hw), tmp); + return status; +} + +/** + * ice_get_recp_to_prof_map - updates recipe to profile mapping + * @hw: pointer to hardware structure + * + * This function is used to populate recipe_to_profile matrix where index to + * this array is the recipe ID and the element is the mapping of which profiles + * is this recipe mapped to. + */ +static void ice_get_recp_to_prof_map(struct ice_hw *hw) +{ + DECLARE_BITMAP(r_bitmap, ICE_MAX_NUM_RECIPES); + u16 i; - cmd->num_entries = cpu_to_le16(num_entries); + for (i = 0; i < hw->switch_info->max_used_prof_index + 1; i++) { + u16 j; - return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); + bitmap_zero(profile_to_recipe[i], ICE_MAX_NUM_RECIPES); + bitmap_zero(r_bitmap, ICE_MAX_NUM_RECIPES); + if (ice_aq_get_recipe_to_profile(hw, i, (u8 *)r_bitmap, NULL)) + continue; + bitmap_copy(profile_to_recipe[i], r_bitmap, + ICE_MAX_NUM_RECIPES); + for_each_set_bit(j, r_bitmap, ICE_MAX_NUM_RECIPES) + set_bit(i, recipe_to_profile[j]); + } } /** * ice_init_def_sw_recp - initialize the recipe book keeping tables * @hw: pointer to the HW struct + * @recp_list: pointer to sw recipe list * * Allocate memory for the entire recipe table and initialize the structures/ * entries corresponding to basic recipes. 
*/ -enum ice_status ice_init_def_sw_recp(struct ice_hw *hw) +enum ice_status +ice_init_def_sw_recp(struct ice_hw *hw, struct ice_sw_recipe **recp_list) { struct ice_sw_recipe *recps; u8 i; @@ -102,14 +1315,15 @@ enum ice_status ice_init_def_sw_recp(struct ice_hw *hw) if (!recps) return ICE_ERR_NO_MEMORY; - for (i = 0; i < ICE_SW_LKUP_LAST; i++) { + for (i = 0; i < ICE_MAX_NUM_RECIPES; i++) { recps[i].root_rid = i; INIT_LIST_HEAD(&recps[i].filt_rules); INIT_LIST_HEAD(&recps[i].filt_replay_rules); + INIT_LIST_HEAD(&recps[i].rg_list); mutex_init(&recps[i].filt_rule_lock); } - hw->switch_info->recp_list = recps; + *recp_list = recps; return 0; } @@ -123,7 +1337,7 @@ enum ice_status ice_init_def_sw_recp(struct ice_hw *hw) * @num_elems: pointer to number of elements * @cd: pointer to command details structure or NULL * - * Get switch configuration (0x0200) to be placed in 'buff'. + * Get switch configuration (0x0200) to be placed in buf. * This admin command returns information such as initial VSI/port number * and switch ID it belongs to. * @@ -139,14 +1353,14 @@ enum ice_status ice_init_def_sw_recp(struct ice_hw *hw) * in response buffer. The caller of this function to use *num_elems while * parsing the response buffer. */ -static enum ice_status -ice_aq_get_sw_cfg(struct ice_hw *hw, struct ice_aqc_get_sw_cfg_resp *buf, +enum ice_status +ice_aq_get_sw_cfg(struct ice_hw *hw, struct ice_aqc_get_sw_cfg_resp_elem *buf, u16 buf_size, u16 *req_desc, u16 *num_elems, struct ice_sq_cd *cd) { struct ice_aqc_get_sw_cfg *cmd; - enum ice_status status; struct ice_aq_desc desc; + enum ice_status status; ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_sw_cfg); cmd = &desc.params.get_sw_conf; @@ -161,6 +1375,299 @@ ice_aq_get_sw_cfg(struct ice_hw *hw, struct ice_aqc_get_sw_cfg_resp *buf, return status; } +/** + * ice_dump_sw_cfg - get and print switch config as seen by firmware + * @hw: ice hardware struct + */ +enum ice_status ice_dump_sw_cfg(struct ice_hw *hw) +{ + struct ice_aqc_get_sw_cfg_resp_elem *rbuf; + enum ice_status ret; + u16 req_desc = 0; + u16 num_elems; + u16 i; + + rbuf = devm_kzalloc(ice_hw_to_dev(hw), ICE_SW_CFG_MAX_BUF_LEN, + GFP_KERNEL); + if (!rbuf) + return ICE_ERR_NO_MEMORY; + + /* Multiple calls to ice_aq_get_sw_cfg may be required + * to get all the switch configuration information. The need + * for additional calls is indicated by ice_aq_get_sw_cfg + * writing a non-zero value in req_desc. 
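+ * A zero req_desc written back by firmware (or an error return)
+ * therefore terminates the do/while loop below.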
+ */ + do { + struct ice_aqc_get_sw_cfg_resp_elem *ele; + + ret = ice_aq_get_sw_cfg(hw, rbuf, ICE_SW_CFG_MAX_BUF_LEN, + &req_desc, &num_elems, NULL); + if (ret) + break; + + for (i = 0, ele = rbuf; i < num_elems; i++, ele++) { + u16 vsi_port_num, pf_vf_num, swid; + + vsi_port_num = le16_to_cpu(ele->vsi_port_num) & + ICE_AQC_GET_SW_CONF_RESP_VSI_PORT_NUM_M; + + pf_vf_num = le16_to_cpu(ele->pf_vf_num) & + ICE_AQC_GET_SW_CONF_RESP_FUNC_NUM_M; + + swid = le16_to_cpu(ele->swid); + + dev_info(ice_hw_to_dev(hw), "element[%d]\n", i); + + switch (le16_to_cpu(ele->vsi_port_num) >> + ICE_AQC_GET_SW_CONF_RESP_TYPE_S) { + case ICE_AQC_GET_SW_CONF_RESP_PHYS_PORT: + dev_info(ice_hw_to_dev(hw), + "\tphy_port = %d\n", vsi_port_num); + break; + case ICE_AQC_GET_SW_CONF_RESP_VIRT_PORT: + dev_info(ice_hw_to_dev(hw), + "\tvir_port = %d\n", vsi_port_num); + break; + case ICE_AQC_GET_SW_CONF_RESP_VSI: + dev_info(ice_hw_to_dev(hw), + "\tvsi_num = %d\n", vsi_port_num); + break; + + default: + dev_info(ice_hw_to_dev(hw), + "\tincorrect vsi/port type\n"); + ret = ICE_ERR_CFG; + break; + } + + dev_info(ice_hw_to_dev(hw), "\tswid = %d\n", swid); + + if (le16_to_cpu(ele->pf_vf_num) & + ICE_AQC_GET_SW_CONF_RESP_IS_VF) + dev_info(ice_hw_to_dev(hw), "\tvf_id = %d\n", + pf_vf_num); + else + dev_info(ice_hw_to_dev(hw), "\tpf_id = %d\n", + pf_vf_num); + } + } while (req_desc && !ret); + + devm_kfree(ice_hw_to_dev(hw), rbuf); + return ret; +} + +/** + * ice_alloc_rss_global_lut - allocate a RSS global LUT + * @hw: pointer to the HW struct + * @shared_res: true to allocate as a shared resource and false to allocate as a dedicated resource + * @global_lut_id: output parameter for the RSS global LUT's ID + */ +enum ice_status ice_alloc_rss_global_lut(struct ice_hw *hw, bool shared_res, u16 *global_lut_id) +{ + struct ice_aqc_alloc_free_res_elem *sw_buf; + enum ice_status status; + u16 buf_len; + + buf_len = struct_size(sw_buf, elem, 1); + sw_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!sw_buf) + return ICE_ERR_NO_MEMORY; + + sw_buf->num_elems = cpu_to_le16(1); + sw_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_GLOBAL_RSS_HASH | + (shared_res ? ICE_AQC_RES_TYPE_FLAG_SHARED : + ICE_AQC_RES_TYPE_FLAG_DEDICATED)); + + status = ice_aq_alloc_free_res(hw, 1, sw_buf, buf_len, ice_aqc_opc_alloc_res, NULL); + if (status) { + ice_debug(hw, ICE_DBG_RES, "Failed to allocate %s RSS global LUT, status %d\n", + shared_res ? 
"shared" : "dedicated", status); + goto ice_alloc_global_lut_exit; + } + + *global_lut_id = le16_to_cpu(sw_buf->elem[0].e.sw_resp); + +ice_alloc_global_lut_exit: + devm_kfree(ice_hw_to_dev(hw), sw_buf); + return status; +} + +/** + * ice_free_rss_global_lut - free a RSS global LUT + * @hw: pointer to the HW struct + * @global_lut_id: ID of the RSS global LUT to free + */ +enum ice_status ice_free_rss_global_lut(struct ice_hw *hw, u16 global_lut_id) +{ + struct ice_aqc_alloc_free_res_elem *sw_buf; + u16 buf_len, num_elems = 1; + enum ice_status status; + + buf_len = struct_size(sw_buf, elem, num_elems); + sw_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!sw_buf) + return ICE_ERR_NO_MEMORY; + + sw_buf->num_elems = cpu_to_le16(num_elems); + sw_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_GLOBAL_RSS_HASH); + sw_buf->elem[0].e.sw_resp = cpu_to_le16(global_lut_id); + + status = ice_aq_alloc_free_res(hw, num_elems, sw_buf, buf_len, ice_aqc_opc_free_res, NULL); + if (status) + ice_debug(hw, ICE_DBG_RES, "Failed to free RSS global LUT %d, status %d\n", + global_lut_id, status); + + devm_kfree(ice_hw_to_dev(hw), sw_buf); + return status; +} + +/** + * ice_alloc_sw - allocate resources specific to switch + * @hw: pointer to the HW struct + * @ena_stats: true to turn on VEB stats + * @shared_res: true for shared resource, false for dedicated resource + * @sw_id: switch ID returned + * @counter_id: VEB counter ID returned + * + * allocates switch resources (SWID and VEB counter) (0x0208) + */ +enum ice_status +ice_alloc_sw(struct ice_hw *hw, bool ena_stats, bool shared_res, u16 *sw_id, + u16 *counter_id) +{ + struct ice_aqc_alloc_free_res_elem *sw_buf; + struct ice_aqc_res_elem *sw_ele; + enum ice_status status; + u16 buf_len; + + buf_len = struct_size(sw_buf, elem, 1); + sw_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!sw_buf) + return ICE_ERR_NO_MEMORY; + + /* Prepare buffer for switch ID. + * The number of resource entries in buffer is passed as 1 since only a + * single switch/VEB instance is allocated, and hence a single sw_id + * is requested. + */ + sw_buf->num_elems = cpu_to_le16(1); + sw_buf->res_type = + cpu_to_le16(ICE_AQC_RES_TYPE_SWID | + (shared_res ? ICE_AQC_RES_TYPE_FLAG_SHARED : + ICE_AQC_RES_TYPE_FLAG_DEDICATED)); + + status = ice_aq_alloc_free_res(hw, 1, sw_buf, buf_len, + ice_aqc_opc_alloc_res, NULL); + + if (status) + goto ice_alloc_sw_exit; + + sw_ele = &sw_buf->elem[0]; + *sw_id = le16_to_cpu(sw_ele->e.sw_resp); + + if (ena_stats) { + /* Prepare buffer for VEB Counter */ + enum ice_adminq_opc opc = ice_aqc_opc_alloc_res; + struct ice_aqc_alloc_free_res_elem *counter_buf; + struct ice_aqc_res_elem *counter_ele; + + counter_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, + GFP_KERNEL); + if (!counter_buf) { + status = ICE_ERR_NO_MEMORY; + goto ice_alloc_sw_exit; + } + + /* The number of resource entries in buffer is passed as 1 since + * only a single switch/VEB instance is allocated, and hence a + * single VEB counter is requested. 
+ */ + counter_buf->num_elems = cpu_to_le16(1); + counter_buf->res_type = + cpu_to_le16(ICE_AQC_RES_TYPE_VEB_COUNTER | + ICE_AQC_RES_TYPE_FLAG_DEDICATED); + status = ice_aq_alloc_free_res(hw, 1, counter_buf, buf_len, + opc, NULL); + + if (status) { + devm_kfree(ice_hw_to_dev(hw), counter_buf); + goto ice_alloc_sw_exit; + } + counter_ele = &counter_buf->elem[0]; + *counter_id = le16_to_cpu(counter_ele->e.sw_resp); + devm_kfree(ice_hw_to_dev(hw), counter_buf); + } + +ice_alloc_sw_exit: + devm_kfree(ice_hw_to_dev(hw), sw_buf); + return status; +} + +/** + * ice_free_sw - free resources specific to switch + * @hw: pointer to the HW struct + * @sw_id: switch ID returned + * @counter_id: VEB counter ID returned + * + * free switch resources (SWID and VEB counter) (0x0209) + * + * NOTE: This function frees multiple resources. It continues + * releasing other resources even after it encounters error. + * The error code returned is the last error it encountered. + */ +enum ice_status ice_free_sw(struct ice_hw *hw, u16 sw_id, u16 counter_id) +{ + struct ice_aqc_alloc_free_res_elem *sw_buf, *counter_buf; + enum ice_status status, ret_status; + u16 buf_len; + + buf_len = struct_size(sw_buf, elem, 1); + sw_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!sw_buf) + return ICE_ERR_NO_MEMORY; + + /* Prepare buffer to free for switch ID res. + * The number of resource entries in buffer is passed as 1 since only a + * single switch/VEB instance is freed, and hence a single sw_id + * is released. + */ + sw_buf->num_elems = cpu_to_le16(1); + sw_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_SWID); + sw_buf->elem[0].e.sw_resp = cpu_to_le16(sw_id); + + ret_status = ice_aq_alloc_free_res(hw, 1, sw_buf, buf_len, + ice_aqc_opc_free_res, NULL); + + if (ret_status) + ice_debug(hw, ICE_DBG_SW, "CQ CMD Buffer:\n"); + + /* Prepare buffer to free for VEB Counter resource */ + counter_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!counter_buf) { + devm_kfree(ice_hw_to_dev(hw), sw_buf); + return ICE_ERR_NO_MEMORY; + } + + /* The number of resource entries in buffer is passed as 1 since only a + * single switch/VEB instance is freed, and hence a single VEB counter + * is released + */ + counter_buf->num_elems = cpu_to_le16(1); + counter_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_VEB_COUNTER); + counter_buf->elem[0].e.sw_resp = cpu_to_le16(counter_id); + + status = ice_aq_alloc_free_res(hw, 1, counter_buf, buf_len, + ice_aqc_opc_free_res, NULL); + if (status) { + ice_debug(hw, ICE_DBG_SW, "VEB counter resource could not be freed\n"); + ret_status = status; + } + + devm_kfree(ice_hw_to_dev(hw), counter_buf); + devm_kfree(ice_hw_to_dev(hw), sw_buf); + return ret_status; +} + /** * ice_aq_add_vsi * @hw: pointer to the HW struct @@ -346,6 +1853,10 @@ static void ice_clear_vsi_q_ctx(struct ice_hw *hw, u16 vsi_handle) devm_kfree(ice_hw_to_dev(hw), vsi->lan_q_ctx[i]); vsi->lan_q_ctx[i] = NULL; } + if (vsi->rdma_q_ctx[i]) { + devm_kfree(ice_hw_to_dev(hw), vsi->rdma_q_ctx[i]); + vsi->rdma_q_ctx[i] = NULL; + } } } @@ -413,11 +1924,11 @@ ice_add_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx *vsi_ctx, return ICE_ERR_NO_MEMORY; } *tmp_vsi_ctx = *vsi_ctx; + ice_save_vsi_ctx(hw, vsi_handle, tmp_vsi_ctx); } else { /* update with new HW VSI num */ - if (tmp_vsi_ctx->vsi_num != vsi_ctx->vsi_num) - tmp_vsi_ctx->vsi_num = vsi_ctx->vsi_num; + tmp_vsi_ctx->vsi_num = vsi_ctx->vsi_num; } return 0; @@ -468,2253 +1979,6114 @@ ice_update_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx 
*vsi_ctx, } /** - * ice_aq_alloc_free_vsi_list - * @hw: pointer to the HW struct - * @vsi_list_id: VSI list ID returned or used for lookup - * @lkup_type: switch rule filter lookup type - * @opc: switch rules population command type - pass in the command opcode - * - * allocates or free a VSI list resource + * ice_cfg_iwarp_fltr - enable/disable iWARP filtering on VSI + * @hw: pointer to HW struct + * @vsi_handle: VSI SW index + * @enable: boolean for enable/disable */ -static enum ice_status -ice_aq_alloc_free_vsi_list(struct ice_hw *hw, u16 *vsi_list_id, - enum ice_sw_lkup_type lkup_type, - enum ice_adminq_opc opc) +enum ice_status +ice_cfg_iwarp_fltr(struct ice_hw *hw, u16 vsi_handle, bool enable) { - struct ice_aqc_alloc_free_res_elem *sw_buf; - struct ice_aqc_res_elem *vsi_ele; - enum ice_status status; - u16 buf_len; - - buf_len = sizeof(*sw_buf); - sw_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); - if (!sw_buf) - return ICE_ERR_NO_MEMORY; - sw_buf->num_elems = cpu_to_le16(1); - - if (lkup_type == ICE_SW_LKUP_MAC || - lkup_type == ICE_SW_LKUP_MAC_VLAN || - lkup_type == ICE_SW_LKUP_ETHERTYPE || - lkup_type == ICE_SW_LKUP_ETHERTYPE_MAC || - lkup_type == ICE_SW_LKUP_PROMISC || - lkup_type == ICE_SW_LKUP_PROMISC_VLAN) { - sw_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_VSI_LIST_REP); - } else if (lkup_type == ICE_SW_LKUP_VLAN) { - sw_buf->res_type = - cpu_to_le16(ICE_AQC_RES_TYPE_VSI_LIST_PRUNE); - } else { - status = ICE_ERR_PARAM; - goto ice_aq_alloc_free_vsi_list_exit; - } + struct ice_vsi_ctx *ctx; - if (opc == ice_aqc_opc_free_res) - sw_buf->elem[0].e.sw_resp = cpu_to_le16(*vsi_list_id); - - status = ice_aq_alloc_free_res(hw, 1, sw_buf, buf_len, opc, NULL); - if (status) - goto ice_aq_alloc_free_vsi_list_exit; + ctx = ice_get_vsi_ctx(hw, vsi_handle); + if (!ctx) + return ICE_ERR_DOES_NOT_EXIST; - if (opc == ice_aqc_opc_alloc_res) { - vsi_ele = &sw_buf->elem[0]; - *vsi_list_id = le16_to_cpu(vsi_ele->e.sw_resp); - } + if (enable) + ctx->info.q_opt_flags |= ICE_AQ_VSI_Q_OPT_PE_FLTR_EN; + else + ctx->info.q_opt_flags &= ~ICE_AQ_VSI_Q_OPT_PE_FLTR_EN; -ice_aq_alloc_free_vsi_list_exit: - devm_kfree(ice_hw_to_dev(hw), sw_buf); - return status; + return ice_update_vsi(hw, vsi_handle, ctx, NULL); } /** - * ice_aq_sw_rules - add/update/remove switch rules + * ice_aq_get_vsi_params * @hw: pointer to the HW struct - * @rule_list: pointer to switch rule population list - * @rule_list_sz: total size of the rule list in bytes - * @num_rules: number of switch rules in the rule_list - * @opc: switch rules population command type - pass in the command opcode + * @vsi_ctx: pointer to a VSI context struct * @cd: pointer to command details structure or NULL * - * Add(0x02a0)/Update(0x02a1)/Remove(0x02a2) switch rules commands to firmware + * Get VSI context info from hardware (0x0212) */ -static enum ice_status -ice_aq_sw_rules(struct ice_hw *hw, void *rule_list, u16 rule_list_sz, - u8 num_rules, enum ice_adminq_opc opc, struct ice_sq_cd *cd) +enum ice_status +ice_aq_get_vsi_params(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx, + struct ice_sq_cd *cd) { + struct ice_aqc_add_get_update_free_vsi *cmd; + struct ice_aqc_get_vsi_resp *resp; struct ice_aq_desc desc; + enum ice_status status; - if (opc != ice_aqc_opc_add_sw_rules && - opc != ice_aqc_opc_update_sw_rules && - opc != ice_aqc_opc_remove_sw_rules) - return ICE_ERR_PARAM; + cmd = &desc.params.vsi_cmd; + resp = &desc.params.get_vsi_resp; - ice_fill_dflt_direct_cmd_desc(&desc, opc); + ice_fill_dflt_direct_cmd_desc(&desc, 
ice_aqc_opc_get_vsi_params); - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - desc.params.sw_rules.num_rules_fltr_entry_index = - cpu_to_le16(num_rules); - return ice_aq_send_cmd(hw, &desc, rule_list, rule_list_sz, cd); + cmd->vsi_num = cpu_to_le16(vsi_ctx->vsi_num | ICE_AQ_VSI_IS_VALID); + + status = ice_aq_send_cmd(hw, &desc, &vsi_ctx->info, + sizeof(vsi_ctx->info), cd); + if (!status) { + vsi_ctx->vsi_num = le16_to_cpu(resp->vsi_num) & + ICE_AQ_VSI_NUM_M; + vsi_ctx->vf_num = resp->vf_id; + vsi_ctx->vsis_allocd = le16_to_cpu(resp->vsi_used); + vsi_ctx->vsis_unallocated = le16_to_cpu(resp->vsi_free); + } + + return status; } -/* ice_init_port_info - Initialize port_info with switch configuration data - * @pi: pointer to port_info - * @vsi_port_num: VSI number or port number - * @type: Type of switch element (port or VSI) - * @swid: switch ID of the switch the element is attached to - * @pf_vf_num: PF or VF number - * @is_vf: true if the element is a VF, false otherwise +/** + * ice_aq_add_update_mir_rule - add/update a mirror rule + * @hw: pointer to the HW struct + * @rule_type: Rule Type + * @dest_vsi: VSI number to which packets will be mirrored + * @count: length of the list + * @mr_buf: buffer for list of mirrored VSI numbers + * @cd: pointer to command details structure or NULL + * @rule_id: Rule ID + * + * Add/Update Mirror Rule (0x260). */ -static void -ice_init_port_info(struct ice_port_info *pi, u16 vsi_port_num, u8 type, - u16 swid, u16 pf_vf_num, bool is_vf) +enum ice_status +ice_aq_add_update_mir_rule(struct ice_hw *hw, u16 rule_type, u16 dest_vsi, + u16 count, struct ice_mir_rule_buf *mr_buf, + struct ice_sq_cd *cd, u16 *rule_id) { - switch (type) { - case ICE_AQC_GET_SW_CONF_RESP_PHYS_PORT: - pi->lport = (u8)(vsi_port_num & ICE_LPORT_MASK); - pi->sw_id = swid; - pi->pf_vf_num = pf_vf_num; - pi->is_vf = is_vf; - pi->dflt_tx_vsi_num = ICE_DFLT_VSI_INVAL; - pi->dflt_rx_vsi_num = ICE_DFLT_VSI_INVAL; + struct ice_aqc_add_update_mir_rule *cmd; + struct ice_aq_desc desc; + enum ice_status status; + __le16 *mr_list = NULL; + u16 buf_size = 0; + + switch (rule_type) { + case ICE_AQC_RULE_TYPE_VPORT_INGRESS: + case ICE_AQC_RULE_TYPE_VPORT_EGRESS: + /* Make sure count and mr_buf are set for these rule_types */ + if (!(count && mr_buf)) + return ICE_ERR_PARAM; + + buf_size = count * sizeof(__le16); + mr_list = devm_kzalloc(ice_hw_to_dev(hw), buf_size, + GFP_KERNEL); + if (!mr_list) + return ICE_ERR_NO_MEMORY; break; - default: - ice_debug(pi->hw, ICE_DBG_SW, - "incorrect VSI/port type received\n"); + case ICE_AQC_RULE_TYPE_PPORT_INGRESS: + case ICE_AQC_RULE_TYPE_PPORT_EGRESS: + /* Make sure count and mr_buf are not set for these + * rule_types + */ + if (count || mr_buf) + return ICE_ERR_PARAM; break; + default: + ice_debug(hw, ICE_DBG_SW, "Error due to unsupported rule_type %u\n", rule_type); + return ICE_ERR_OUT_OF_RANGE; + } + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_update_mir_rule); + + /* Pre-process 'mr_buf' items for add/update of virtual port + * ingress/egress mirroring (but not physical port ingress/egress + * mirroring) + */ + if (mr_buf) { + int i; + + for (i = 0; i < count; i++) { + u16 id; + + id = mr_buf[i].vsi_idx & ICE_AQC_RULE_MIRRORED_VSI_M; + + /* Validate specified VSI number, make sure it is less + * than ICE_MAX_VSI, if not return with error. 
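			 *
			 * (Editor's note, not part of the original patch: the
			 * index comes from mr_buf[i].vsi_idx masked with
			 * ICE_AQC_RULE_MIRRORED_VSI_M; the add/remove intent
			 * is encoded separately below by setting or clearing
			 * ICE_AQC_RULE_ACT_M in the list entry.)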
+ */ + if (id >= ICE_MAX_VSI) { + ice_debug(hw, ICE_DBG_SW, "Error VSI index (%u) out-of-range\n", + id); + devm_kfree(ice_hw_to_dev(hw), mr_list); + return ICE_ERR_OUT_OF_RANGE; + } + + /* add VSI to mirror rule */ + if (mr_buf[i].add) + mr_list[i] = + cpu_to_le16(id | ICE_AQC_RULE_ACT_M); + else /* remove VSI from mirror rule */ + mr_list[i] = cpu_to_le16(id); + } } + + cmd = &desc.params.add_update_rule; + if ((*rule_id) != ICE_INVAL_MIRROR_RULE_ID) + cmd->rule_id = cpu_to_le16(((*rule_id) & ICE_AQC_RULE_ID_M) | + ICE_AQC_RULE_ID_VALID_M); + cmd->rule_type = cpu_to_le16(rule_type & ICE_AQC_RULE_TYPE_M); + cmd->num_entries = cpu_to_le16(count); + cmd->dest = cpu_to_le16(dest_vsi); + + status = ice_aq_send_cmd(hw, &desc, mr_list, buf_size, cd); + if (!status) + *rule_id = le16_to_cpu(cmd->rule_id) & ICE_AQC_RULE_ID_M; + + devm_kfree(ice_hw_to_dev(hw), mr_list); + + return status; } -/* ice_get_initial_sw_cfg - Get initial port and default VSI data - * @hw: pointer to the hardware structure +/** + * ice_aq_delete_mir_rule - delete a mirror rule + * @hw: pointer to the HW struct + * @rule_id: Mirror rule ID (to be deleted) + * @keep_allocd: if set, the VSI stays part of the PF allocated res, + * otherwise it is returned to the shared pool + * @cd: pointer to command details structure or NULL + * + * Delete Mirror Rule (0x261). */ -enum ice_status ice_get_initial_sw_cfg(struct ice_hw *hw) +enum ice_status +ice_aq_delete_mir_rule(struct ice_hw *hw, u16 rule_id, bool keep_allocd, + struct ice_sq_cd *cd) { - struct ice_aqc_get_sw_cfg_resp *rbuf; - enum ice_status status; - u16 req_desc = 0; - u16 num_elems; - u16 i; - - rbuf = devm_kzalloc(ice_hw_to_dev(hw), ICE_SW_CFG_MAX_BUF_LEN, - GFP_KERNEL); + struct ice_aqc_delete_mir_rule *cmd; + struct ice_aq_desc desc; - if (!rbuf) - return ICE_ERR_NO_MEMORY; + /* rule_id should be in the range 0...63 */ + if (rule_id >= ICE_MAX_NUM_MIRROR_RULES) + return ICE_ERR_OUT_OF_RANGE; - /* Multiple calls to ice_aq_get_sw_cfg may be required - * to get all the switch configuration information. 
The need - * for additional calls is indicated by ice_aq_get_sw_cfg - * writing a non-zero value in req_desc - */ - do { - status = ice_aq_get_sw_cfg(hw, rbuf, ICE_SW_CFG_MAX_BUF_LEN, - &req_desc, &num_elems, NULL); + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_del_mir_rule); - if (status) - break; + cmd = &desc.params.del_rule; + rule_id |= ICE_AQC_RULE_ID_VALID_M; + cmd->rule_id = cpu_to_le16(rule_id); - for (i = 0; i < num_elems; i++) { - struct ice_aqc_get_sw_cfg_resp_elem *ele; - u16 pf_vf_num, swid, vsi_port_num; - bool is_vf = false; - u8 type; + if (keep_allocd) + cmd->flags = cpu_to_le16(ICE_AQC_FLAG_KEEP_ALLOCD_M); - ele = rbuf[i].elements; - vsi_port_num = le16_to_cpu(ele->vsi_port_num) & - ICE_AQC_GET_SW_CONF_RESP_VSI_PORT_NUM_M; + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} - pf_vf_num = le16_to_cpu(ele->pf_vf_num) & - ICE_AQC_GET_SW_CONF_RESP_FUNC_NUM_M; +/** + * ice_aq_alloc_free_vsi_list + * @hw: pointer to the HW struct + * @vsi_list_id: VSI list ID returned or used for lookup + * @lkup_type: switch rule filter lookup type + * @opc: switch rules population command type - pass in the command opcode + * + * allocates or free a VSI list resource + */ +static enum ice_status +ice_aq_alloc_free_vsi_list(struct ice_hw *hw, u16 *vsi_list_id, + enum ice_sw_lkup_type lkup_type, + enum ice_adminq_opc opc) +{ + struct ice_aqc_alloc_free_res_elem *sw_buf; + struct ice_aqc_res_elem *vsi_ele; + enum ice_status status; + u16 buf_len; - swid = le16_to_cpu(ele->swid); + buf_len = struct_size(sw_buf, elem, 1); + sw_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!sw_buf) + return ICE_ERR_NO_MEMORY; + sw_buf->num_elems = cpu_to_le16(1); - if (le16_to_cpu(ele->pf_vf_num) & - ICE_AQC_GET_SW_CONF_RESP_IS_VF) - is_vf = true; + if (lkup_type == ICE_SW_LKUP_MAC || + lkup_type == ICE_SW_LKUP_MAC_VLAN || + lkup_type == ICE_SW_LKUP_ETHERTYPE || + lkup_type == ICE_SW_LKUP_ETHERTYPE_MAC || + lkup_type == ICE_SW_LKUP_PROMISC || + lkup_type == ICE_SW_LKUP_PROMISC_VLAN || + lkup_type == ICE_SW_LKUP_LAST) { + sw_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_VSI_LIST_REP); + } else if (lkup_type == ICE_SW_LKUP_VLAN) { + sw_buf->res_type = + cpu_to_le16(ICE_AQC_RES_TYPE_VSI_LIST_PRUNE); + } else { + status = ICE_ERR_PARAM; + goto ice_aq_alloc_free_vsi_list_exit; + } - type = le16_to_cpu(ele->vsi_port_num) >> - ICE_AQC_GET_SW_CONF_RESP_TYPE_S; + if (opc == ice_aqc_opc_free_res) + sw_buf->elem[0].e.sw_resp = cpu_to_le16(*vsi_list_id); - if (type == ICE_AQC_GET_SW_CONF_RESP_VSI) { - /* FW VSI is not needed. Just continue. 
*/ - continue; - } + status = ice_aq_alloc_free_res(hw, 1, sw_buf, buf_len, opc, NULL); + if (status) + goto ice_aq_alloc_free_vsi_list_exit; - ice_init_port_info(hw->port_info, vsi_port_num, - type, swid, pf_vf_num, is_vf); - } - } while (req_desc && !status); + if (opc == ice_aqc_opc_alloc_res) { + vsi_ele = &sw_buf->elem[0]; + *vsi_list_id = le16_to_cpu(vsi_ele->e.sw_resp); + } - devm_kfree(ice_hw_to_dev(hw), (void *)rbuf); +ice_aq_alloc_free_vsi_list_exit: + devm_kfree(ice_hw_to_dev(hw), sw_buf); return status; } /** - * ice_fill_sw_info - Helper function to populate lb_en and lan_en - * @hw: pointer to the hardware structure - * @fi: filter info structure to fill/update + * ice_aq_set_storm_ctrl - Sets storm control configuration + * @hw: pointer to the HW struct + * @bcast_thresh: represents the upper threshold for broadcast storm control + * @mcast_thresh: represents the upper threshold for multicast storm control + * @ctl_bitmask: storm control knobs * - * This helper function populates the lb_en and lan_en elements of the provided - * ice_fltr_info struct using the switch's type and characteristics of the - * switch rule being configured. + * Sets the storm control configuration (0x0280) */ -static void ice_fill_sw_info(struct ice_hw *hw, struct ice_fltr_info *fi) +enum ice_status +ice_aq_set_storm_ctrl(struct ice_hw *hw, u32 bcast_thresh, u32 mcast_thresh, + u32 ctl_bitmask) { - fi->lb_en = false; - fi->lan_en = false; - if ((fi->flag & ICE_FLTR_TX) && - (fi->fltr_act == ICE_FWD_TO_VSI || - fi->fltr_act == ICE_FWD_TO_VSI_LIST || - fi->fltr_act == ICE_FWD_TO_Q || - fi->fltr_act == ICE_FWD_TO_QGRP)) { - /* Setting LB for prune actions will result in replicated - * packets to the internal switch that will be dropped. - */ - if (fi->lkup_type != ICE_SW_LKUP_VLAN) - fi->lb_en = true; + struct ice_aqc_storm_cfg *cmd; + struct ice_aq_desc desc; - /* Set lan_en to TRUE if - * 1. The switch is a VEB AND - * 2 - * 2.1 The lookup is a directional lookup like ethertype, - * promiscuous, ethertype-MAC, promiscuous-VLAN - * and default-port OR - * 2.2 The lookup is VLAN, OR - * 2.3 The lookup is MAC with mcast or bcast addr for MAC, OR - * 2.4 The lookup is MAC_VLAN with mcast or bcast addr for MAC. - * - * OR - * - * The switch is a VEPA. - * - * In all other cases, the LAN enable has to be set to false. 
- */ - if (hw->evb_veb) { - if (fi->lkup_type == ICE_SW_LKUP_ETHERTYPE || - fi->lkup_type == ICE_SW_LKUP_PROMISC || - fi->lkup_type == ICE_SW_LKUP_ETHERTYPE_MAC || - fi->lkup_type == ICE_SW_LKUP_PROMISC_VLAN || - fi->lkup_type == ICE_SW_LKUP_DFLT || - fi->lkup_type == ICE_SW_LKUP_VLAN || - (fi->lkup_type == ICE_SW_LKUP_MAC && - !is_unicast_ether_addr(fi->l_data.mac.mac_addr)) || - (fi->lkup_type == ICE_SW_LKUP_MAC_VLAN && - !is_unicast_ether_addr(fi->l_data.mac.mac_addr))) - fi->lan_en = true; - } else { - fi->lan_en = true; - } - } + cmd = &desc.params.storm_conf; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_storm_cfg); + + cmd->bcast_thresh_size = cpu_to_le32(bcast_thresh & ICE_AQ_THRESHOLD_M); + cmd->mcast_thresh_size = cpu_to_le32(mcast_thresh & ICE_AQ_THRESHOLD_M); + cmd->storm_ctrl_ctrl = cpu_to_le32(ctl_bitmask); + + return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); } /** - * ice_fill_sw_rule - Helper function to fill switch rule structure - * @hw: pointer to the hardware structure - * @f_info: entry containing packet forwarding information - * @s_rule: switch rule structure to be filled in based on mac_entry - * @opc: switch rules population command type - pass in the command opcode + * ice_aq_get_storm_ctrl - gets storm control configuration + * @hw: pointer to the HW struct + * @bcast_thresh: represents the upper threshold for broadcast storm control + * @mcast_thresh: represents the upper threshold for multicast storm control + * @ctl_bitmask: storm control knobs + * + * Gets the storm control configuration (0x0281) */ -static void -ice_fill_sw_rule(struct ice_hw *hw, struct ice_fltr_info *f_info, - struct ice_aqc_sw_rules_elem *s_rule, enum ice_adminq_opc opc) +enum ice_status +ice_aq_get_storm_ctrl(struct ice_hw *hw, u32 *bcast_thresh, u32 *mcast_thresh, + u32 *ctl_bitmask) { - u16 vlan_id = ICE_MAX_VLAN_ID + 1; - void *daddr = NULL; - u16 eth_hdr_sz; - u8 *eth_hdr; - u32 act = 0; - __be16 *off; - u8 q_rgn; + enum ice_status status; + struct ice_aq_desc desc; - if (opc == ice_aqc_opc_remove_sw_rules) { - s_rule->pdata.lkup_tx_rx.act = 0; - s_rule->pdata.lkup_tx_rx.index = - cpu_to_le16(f_info->fltr_rule_id); - s_rule->pdata.lkup_tx_rx.hdr_len = 0; - return; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_storm_cfg); + + status = ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); + if (!status) { + struct ice_aqc_storm_cfg *resp = &desc.params.storm_conf; + + if (bcast_thresh) + *bcast_thresh = le32_to_cpu(resp->bcast_thresh_size) & + ICE_AQ_THRESHOLD_M; + if (mcast_thresh) + *mcast_thresh = le32_to_cpu(resp->mcast_thresh_size) & + ICE_AQ_THRESHOLD_M; + if (ctl_bitmask) + *ctl_bitmask = le32_to_cpu(resp->storm_ctrl_ctrl); } - eth_hdr_sz = sizeof(dummy_eth_header); - eth_hdr = s_rule->pdata.lkup_tx_rx.hdr; + return status; +} - /* initialize the ether header with a dummy header */ - memcpy(eth_hdr, dummy_eth_header, eth_hdr_sz); - ice_fill_sw_info(hw, f_info); +/** + * ice_aq_sw_rules - add/update/remove switch rules + * @hw: pointer to the HW struct + * @rule_list: pointer to switch rule population list + * @rule_list_sz: total size of the rule list in bytes + * @num_rules: number of switch rules in the rule_list + * @opc: switch rules population command type - pass in the command opcode + * @cd: pointer to command details structure or NULL + * + * Add(0x02a0)/Update(0x02a1)/Remove(0x02a2) switch rules commands to firmware + */ +enum ice_status __maybe_unused +ice_aq_sw_rules(struct ice_hw *hw, void *rule_list, u16 rule_list_sz, + u8 num_rules, enum ice_adminq_opc 
opc, struct ice_sq_cd *cd) +{ + struct ice_aq_desc desc; + enum ice_status status; - switch (f_info->fltr_act) { - case ICE_FWD_TO_VSI: - act |= (f_info->fwd_id.hw_vsi_id << ICE_SINGLE_ACT_VSI_ID_S) & - ICE_SINGLE_ACT_VSI_ID_M; - if (f_info->lkup_type != ICE_SW_LKUP_VLAN) - act |= ICE_SINGLE_ACT_VSI_FORWARDING | - ICE_SINGLE_ACT_VALID_BIT; - break; - case ICE_FWD_TO_VSI_LIST: - act |= ICE_SINGLE_ACT_VSI_LIST; - act |= (f_info->fwd_id.vsi_list_id << - ICE_SINGLE_ACT_VSI_LIST_ID_S) & - ICE_SINGLE_ACT_VSI_LIST_ID_M; - if (f_info->lkup_type != ICE_SW_LKUP_VLAN) - act |= ICE_SINGLE_ACT_VSI_FORWARDING | - ICE_SINGLE_ACT_VALID_BIT; - break; - case ICE_FWD_TO_Q: - act |= ICE_SINGLE_ACT_TO_Q; - act |= (f_info->fwd_id.q_id << ICE_SINGLE_ACT_Q_INDEX_S) & - ICE_SINGLE_ACT_Q_INDEX_M; - break; - case ICE_DROP_PACKET: - act |= ICE_SINGLE_ACT_VSI_FORWARDING | ICE_SINGLE_ACT_DROP | - ICE_SINGLE_ACT_VALID_BIT; - break; - case ICE_FWD_TO_QGRP: - q_rgn = f_info->qgrp_size > 0 ? - (u8)ilog2(f_info->qgrp_size) : 0; - act |= ICE_SINGLE_ACT_TO_Q; - act |= (f_info->fwd_id.q_id << ICE_SINGLE_ACT_Q_INDEX_S) & - ICE_SINGLE_ACT_Q_INDEX_M; - act |= (q_rgn << ICE_SINGLE_ACT_Q_REGION_S) & - ICE_SINGLE_ACT_Q_REGION_M; - break; - default: - return; + if (opc != ice_aqc_opc_add_sw_rules && + opc != ice_aqc_opc_update_sw_rules && + opc != ice_aqc_opc_remove_sw_rules) + return ICE_ERR_PARAM; + + ice_fill_dflt_direct_cmd_desc(&desc, opc); + + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + desc.params.sw_rules.num_rules_fltr_entry_index = + cpu_to_le16(num_rules); + status = ice_aq_send_cmd(hw, &desc, rule_list, rule_list_sz, cd); + if (opc != ice_aqc_opc_add_sw_rules && + hw->adminq.sq_last_status == ICE_AQ_RC_ENOENT) + status = ICE_ERR_DOES_NOT_EXIST; + + return status; +} + +/** + * ice_aq_add_recipe - add switch recipe + * @hw: pointer to the HW struct + * @s_recipe_list: pointer to switch rule population list + * @num_recipes: number of switch recipes in the list + * @cd: pointer to command details structure or NULL + * + * Add(0x0290) + */ +enum ice_status +ice_aq_add_recipe(struct ice_hw *hw, + struct ice_aqc_recipe_data_elem *s_recipe_list, + u16 num_recipes, struct ice_sq_cd *cd) +{ + struct ice_aqc_add_get_recipe *cmd; + struct ice_aq_desc desc; + u16 buf_size; + + cmd = &desc.params.add_get_recipe; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_recipe); + + cmd->num_sub_recipes = cpu_to_le16(num_recipes); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + buf_size = num_recipes * sizeof(*s_recipe_list); + + return ice_aq_send_cmd(hw, &desc, s_recipe_list, buf_size, cd); +} + +/** + * ice_aq_get_recipe - get switch recipe + * @hw: pointer to the HW struct + * @s_recipe_list: pointer to switch rule population list + * @num_recipes: pointer to the number of recipes (input and output) + * @recipe_root: root recipe number of recipe(s) to retrieve + * @cd: pointer to command details structure or NULL + * + * Get(0x0292) + * + * On input, *num_recipes should equal the number of entries in s_recipe_list. + * On output, *num_recipes will equal the number of entries returned in + * s_recipe_list. + * + * The caller must supply enough space in s_recipe_list to hold all possible + * recipes and *num_recipes must equal ICE_MAX_NUM_RECIPES. 
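 *
 * (Editor's sketch, not part of the original patch; it mirrors the call made
 *  in ice_update_recipe_lkup_idx() below, assuming rcp_list has room for
 *  ICE_MAX_NUM_RECIPES elements and rid is the root recipe ID:
 *
 *	u16 num_recps = ICE_MAX_NUM_RECIPES;
 *	enum ice_status status;
 *
 *	status = ice_aq_get_recipe(hw, rcp_list, &num_recps, rid, NULL);
 *
 *  on success, num_recps holds the number of entries firmware returned.)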
+ */ +enum ice_status +ice_aq_get_recipe(struct ice_hw *hw, + struct ice_aqc_recipe_data_elem *s_recipe_list, + u16 *num_recipes, u16 recipe_root, struct ice_sq_cd *cd) +{ + struct ice_aqc_add_get_recipe *cmd; + struct ice_aq_desc desc; + enum ice_status status; + u16 buf_size; + + if (*num_recipes != ICE_MAX_NUM_RECIPES) + return ICE_ERR_PARAM; + + cmd = &desc.params.add_get_recipe; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_recipe); + + cmd->return_index = cpu_to_le16(recipe_root); + cmd->num_sub_recipes = 0; + + buf_size = *num_recipes * sizeof(*s_recipe_list); + + status = ice_aq_send_cmd(hw, &desc, s_recipe_list, buf_size, cd); + *num_recipes = le16_to_cpu(cmd->num_sub_recipes); + + return status; +} + +/** + * ice_update_recipe_lkup_idx - update a default recipe based on the lkup_idx + * @hw: pointer to the HW struct + * @params: parameters used to update the default recipe + * + * This function only supports updating default recipes and it only supports + * updating a single recipe based on the lkup_idx at a time. + * + * This is done as a read-modify-write operation. First, get the current recipe + * contents based on the recipe's ID. Then modify the field vector index and + * mask if it's valid at the lkup_idx. Finally, use the add recipe AQ to update + * the pre-existing recipe with the modifications. + */ +enum ice_status +ice_update_recipe_lkup_idx(struct ice_hw *hw, + struct ice_update_recipe_lkup_idx_params *params) +{ + struct ice_aqc_recipe_data_elem *rcp_list; + u16 num_recps = ICE_MAX_NUM_RECIPES; + enum ice_status status; + + rcp_list = devm_kzalloc(ice_hw_to_dev(hw), + num_recps * sizeof(*rcp_list), GFP_KERNEL); + if (!rcp_list) + return ICE_ERR_NO_MEMORY; + + /* read current recipe list from firmware */ + rcp_list->recipe_indx = params->rid; + status = ice_aq_get_recipe(hw, rcp_list, &num_recps, params->rid, NULL); + if (status) { + ice_debug(hw, ICE_DBG_SW, "Failed to get recipe %d, status %d\n", + params->rid, status); + goto error_out; } - if (f_info->lb_en) - act |= ICE_SINGLE_ACT_LB_ENABLE; - if (f_info->lan_en) - act |= ICE_SINGLE_ACT_LAN_ENABLE; + /* only modify existing recipe's lkup_idx and mask if valid, while + * leaving all other fields the same, then update the recipe firmware + */ + rcp_list->content.lkup_indx[params->lkup_idx] = params->fv_idx; + if (params->mask_valid) + rcp_list->content.mask[params->lkup_idx] = + cpu_to_le16(params->mask); - switch (f_info->lkup_type) { - case ICE_SW_LKUP_MAC: - daddr = f_info->l_data.mac.mac_addr; - break; - case ICE_SW_LKUP_VLAN: - vlan_id = f_info->l_data.vlan.vlan_id; - if (f_info->fltr_act == ICE_FWD_TO_VSI || - f_info->fltr_act == ICE_FWD_TO_VSI_LIST) { - act |= ICE_SINGLE_ACT_PRUNE; - act |= ICE_SINGLE_ACT_EGRESS | ICE_SINGLE_ACT_INGRESS; - } - break; - case ICE_SW_LKUP_ETHERTYPE_MAC: - daddr = f_info->l_data.ethertype_mac.mac_addr; - /* fall-through */ - case ICE_SW_LKUP_ETHERTYPE: - off = (__force __be16 *)(eth_hdr + ICE_ETH_ETHTYPE_OFFSET); - *off = cpu_to_be16(f_info->l_data.ethertype_mac.ethertype); - break; - case ICE_SW_LKUP_MAC_VLAN: - daddr = f_info->l_data.mac_vlan.mac_addr; - vlan_id = f_info->l_data.mac_vlan.vlan_id; - break; - case ICE_SW_LKUP_PROMISC_VLAN: - vlan_id = f_info->l_data.mac_vlan.vlan_id; - /* fall-through */ - case ICE_SW_LKUP_PROMISC: - daddr = f_info->l_data.mac_vlan.mac_addr; + if (params->ignore_valid) + rcp_list->content.lkup_indx[params->lkup_idx] |= + ICE_AQ_RECIPE_LKUP_IGNORE; + + status = ice_aq_add_recipe(hw, &rcp_list[0], 1, NULL); + if (status) + ice_debug(hw, 
ICE_DBG_SW, "Failed to update recipe %d lkup_idx %d fv_idx %d mask %d mask_valid %s, status %d\n", + params->rid, params->lkup_idx, params->fv_idx, + params->mask, params->mask_valid ? "true" : "false", + status); + +error_out: + devm_kfree(ice_hw_to_dev(hw), rcp_list); + return status; +} + +/** + * ice_aq_map_recipe_to_profile - Map recipe to packet profile + * @hw: pointer to the HW struct + * @profile_id: package profile ID to associate the recipe with + * @r_bitmap: Recipe bitmap filled in and need to be returned as response + * @cd: pointer to command details structure or NULL + * Recipe to profile association (0x0291) + */ +enum ice_status +ice_aq_map_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, + struct ice_sq_cd *cd) +{ + struct ice_aqc_recipe_to_profile *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.recipe_to_profile; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_recipe_to_profile); + cmd->profile_id = cpu_to_le16(profile_id); + /* Set the recipe ID bit in the bitmask to let the device know which + * profile we are associating the recipe to + */ + memcpy(cmd->recipe_assoc, r_bitmap, sizeof(cmd->recipe_assoc)); + + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} + +/** + * ice_aq_get_recipe_to_profile - Map recipe to packet profile + * @hw: pointer to the HW struct + * @profile_id: package profile ID to associate the recipe with + * @r_bitmap: Recipe bitmap filled in and need to be returned as response + * @cd: pointer to command details structure or NULL + * Associate profile ID with given recipe (0x0293) + */ +enum ice_status +ice_aq_get_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, + struct ice_sq_cd *cd) +{ + struct ice_aqc_recipe_to_profile *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + cmd = &desc.params.recipe_to_profile; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_recipe_to_profile); + cmd->profile_id = cpu_to_le16(profile_id); + + status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + if (!status) + memcpy(r_bitmap, cmd->recipe_assoc, sizeof(cmd->recipe_assoc)); + + return status; +} + +/** + * ice_alloc_recipe - add recipe resource + * @hw: pointer to the hardware structure + * @rid: recipe ID returned as response to AQ call + */ +enum ice_status ice_alloc_recipe(struct ice_hw *hw, u16 *rid) +{ + struct ice_aqc_alloc_free_res_elem *sw_buf; + enum ice_status status; + u16 buf_len; + + buf_len = struct_size(sw_buf, elem, 1); + sw_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!sw_buf) + return ICE_ERR_NO_MEMORY; + + sw_buf->num_elems = cpu_to_le16(1); + sw_buf->res_type = cpu_to_le16((ICE_AQC_RES_TYPE_RECIPE << + ICE_AQC_RES_TYPE_S) | + ICE_AQC_RES_TYPE_FLAG_SHARED); + status = ice_aq_alloc_free_res(hw, 1, sw_buf, buf_len, + ice_aqc_opc_alloc_res, NULL); + if (!status) + *rid = le16_to_cpu(sw_buf->elem[0].e.sw_resp); + devm_kfree(ice_hw_to_dev(hw), sw_buf); + + return status; +} + +/* ice_init_port_info - Initialize port_info with switch configuration data + * @pi: pointer to port_info + * @vsi_port_num: VSI number or port number + * @type: Type of switch element (port or VSI) + * @swid: switch ID of the switch the element is attached to + * @pf_vf_num: PF or VF number + * @is_vf: true if the element is a VF, false otherwise + */ +static void +ice_init_port_info(struct ice_port_info *pi, u16 vsi_port_num, u8 type, + u16 swid, u16 pf_vf_num, bool is_vf) +{ + switch (type) { + case ICE_AQC_GET_SW_CONF_RESP_PHYS_PORT: + pi->lport = (u8)(vsi_port_num & ICE_LPORT_MASK); + 
pi->sw_id = swid; + pi->pf_vf_num = pf_vf_num; + pi->is_vf = is_vf; + pi->dflt_tx_vsi_num = ICE_DFLT_VSI_INVAL; + pi->dflt_rx_vsi_num = ICE_DFLT_VSI_INVAL; break; default: + ice_debug(pi->hw, ICE_DBG_SW, "incorrect VSI/port type received\n"); break; } +} - s_rule->type = (f_info->flag & ICE_FLTR_RX) ? - cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_RX) : - cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_TX); +/* ice_get_initial_sw_cfg - Get initial port and default VSI data + * @hw: pointer to the hardware structure + */ +enum ice_status ice_get_initial_sw_cfg(struct ice_hw *hw) +{ + struct ice_aqc_get_sw_cfg_resp_elem *rbuf; + enum ice_status status; + u8 num_total_ports; + u16 req_desc = 0; + u16 num_elems; + u8 j = 0; + u16 i; - /* Recipe set depending on lookup type */ - s_rule->pdata.lkup_tx_rx.recipe_id = cpu_to_le16(f_info->lkup_type); - s_rule->pdata.lkup_tx_rx.src = cpu_to_le16(f_info->src); - s_rule->pdata.lkup_tx_rx.act = cpu_to_le32(act); + num_total_ports = 1; - if (daddr) - ether_addr_copy(eth_hdr + ICE_ETH_DA_OFFSET, daddr); + rbuf = devm_kzalloc(ice_hw_to_dev(hw), ICE_SW_CFG_MAX_BUF_LEN, + GFP_KERNEL); - if (!(vlan_id > ICE_MAX_VLAN_ID)) { - off = (__force __be16 *)(eth_hdr + ICE_ETH_VLAN_TCI_OFFSET); - *off = cpu_to_be16(vlan_id); - } + if (!rbuf) + return ICE_ERR_NO_MEMORY; + + /* Multiple calls to ice_aq_get_sw_cfg may be required + * to get all the switch configuration information. The need + * for additional calls is indicated by ice_aq_get_sw_cfg + * writing a non-zero value in req_desc + */ + do { + struct ice_aqc_get_sw_cfg_resp_elem *ele; + + status = ice_aq_get_sw_cfg(hw, rbuf, ICE_SW_CFG_MAX_BUF_LEN, + &req_desc, &num_elems, NULL); + + if (status) + break; + + for (i = 0, ele = rbuf; i < num_elems; i++, ele++) { + u16 pf_vf_num, swid, vsi_port_num; + bool is_vf = false; + u8 res_type; + + vsi_port_num = le16_to_cpu(ele->vsi_port_num) & + ICE_AQC_GET_SW_CONF_RESP_VSI_PORT_NUM_M; + + pf_vf_num = le16_to_cpu(ele->pf_vf_num) & + ICE_AQC_GET_SW_CONF_RESP_FUNC_NUM_M; + + swid = le16_to_cpu(ele->swid); + + if (le16_to_cpu(ele->pf_vf_num) & + ICE_AQC_GET_SW_CONF_RESP_IS_VF) + is_vf = true; + + res_type = (u8)(le16_to_cpu(ele->vsi_port_num) >> + ICE_AQC_GET_SW_CONF_RESP_TYPE_S); + + switch (res_type) { + case ICE_AQC_GET_SW_CONF_RESP_PHYS_PORT: + case ICE_AQC_GET_SW_CONF_RESP_VIRT_PORT: + if (j == num_total_ports) { + ice_debug(hw, ICE_DBG_SW, "more ports than expected\n"); + status = ICE_ERR_CFG; + goto out; + } + ice_init_port_info(hw->port_info, + vsi_port_num, res_type, swid, + pf_vf_num, is_vf); + j++; + break; + default: + break; + } + } + } while (req_desc && !status); + + +out: + devm_kfree(ice_hw_to_dev(hw), rbuf); + return status; +} + +/** + * ice_dump_all_sw_rules + * @hw: pointer to the hardware structure + * @lkup: switch rule filter lookup type + * @recp_list: pointer to recipes + * + * Helper function to print filter information of all entries in the list for a + * given lookup type + */ +static void +ice_dump_all_sw_rules(struct ice_hw *hw, enum ice_sw_lkup_type lkup, + struct ice_sw_recipe *recp_list) +{ + struct ice_fltr_mgmt_list_entry *fm_entry; + struct list_head *rule_head; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + struct ice_fltr_info *fi; + + rule_lock = &recp_list->filt_rule_lock; + rule_head = &recp_list->filt_rules; + + switch (lkup) { + case ICE_SW_LKUP_MAC: + /* dump MAC hash list */ + dev_info(ice_hw_to_dev(hw), + "\tDump MAC hash list of lookup type %d\n", lkup); + mutex_lock(rule_lock); + list_for_each_entry(fm_entry, rule_head, 
list_entry) { + fi = &fm_entry->fltr_info; + dev_info(ice_hw_to_dev(hw), + "\tmac: %pM, vsi_count = %d, fw_act_flag = %d, lb_en = %d, lan_en = %d, filt_act = %d, filt_rule_id = %d\n", + fi->l_data.mac.mac_addr, fm_entry->vsi_count, + fi->flag, fi->lb_en, fi->lan_en, + fi->fltr_act, fi->fltr_rule_id); + } + mutex_unlock(rule_lock); + break; + case ICE_SW_LKUP_VLAN: + /* dump VLAN hash list */ + dev_info(ice_hw_to_dev(hw), + "\tDump VLAN hash list of lookup type %d\n", lkup); + mutex_lock(rule_lock); + list_for_each_entry(fm_entry, rule_head, list_entry) { + fi = &fm_entry->fltr_info; + dev_info(ice_hw_to_dev(hw), + "\tvlan_id = %d, vlan_tpid = 0x%04x, vsi_count = %d, vsi_list_id = %d, fw_act_flag = %d, filt_act = %d, lb_en = %d, lan_en = %d, filt_rule_id = %d\n", + fi->l_data.vlan.vlan_id, + fi->l_data.vlan.tpid_valid ? fi->l_data.vlan.tpid : ETH_P_8021Q, + fm_entry->vsi_count, fi->fwd_id.vsi_list_id, + fi->flag, fi->lb_en, fi->lan_en, + fi->fltr_act, fi->fltr_rule_id); + } + mutex_unlock(rule_lock); + break; + case ICE_SW_LKUP_MAC_VLAN: + /* dump MAC VLAN hash list */ + dev_info(ice_hw_to_dev(hw), + "\tDump MAC VLAN hash list of lookup type %d\n", + lkup); + mutex_lock(rule_lock); + list_for_each_entry(fm_entry, rule_head, list_entry) { + fi = &fm_entry->fltr_info; + dev_info(ice_hw_to_dev(hw), + "\tmac: %pM, vlan_id = %d, vsi_count = %d, fw_act_flag = %d, lb_en = %d, lan_en = %d, filt_act = %d, filt_rule_id = %d\n", + fi->l_data.mac_vlan.mac_addr, + fi->l_data.mac_vlan.vlan_id, + fm_entry->vsi_count, fi->flag, fi->lb_en, + fi->lan_en, fi->fltr_act, fi->fltr_rule_id); + } + mutex_unlock(rule_lock); + break; + case ICE_SW_LKUP_ETHERTYPE: + /* dump Ethertype/Ethertype MAC hash list */ + dev_info(ice_hw_to_dev(hw), + "\tDump Ethertype hash list of lookup type %d\n", + lkup); + mutex_lock(rule_lock); + list_for_each_entry(fm_entry, rule_head, list_entry) { + fi = &fm_entry->fltr_info; + dev_info(ice_hw_to_dev(hw), + "\tethertype = %d, vsi_count = %d, fw_act_flag = %d, filt_act = %d, lb_en = %d, lan_en = %d, filt_rule_id = %d\n", + fi->l_data.ethertype_mac.ethertype, + fm_entry->vsi_count, fi->flag, fi->fltr_act, + fi->lb_en, fi->lan_en, fi->fltr_rule_id); + } + mutex_unlock(rule_lock); + break; + case ICE_SW_LKUP_ETHERTYPE_MAC: + /* dump Ethertype/Ethertype MAC hash list */ + dev_info(ice_hw_to_dev(hw), + "\tDump Ethertype MAC hash list of lookup type %d\n", + lkup); + mutex_lock(rule_lock); + list_for_each_entry(fm_entry, rule_head, list_entry) { + fi = &fm_entry->fltr_info; + dev_info(ice_hw_to_dev(hw), + "\tmac: %pM, ethertype = %d, vsi_count = %d, fw_act_flag = %d, filt_act = %d, lb_en = %d, lan_en = %d, filt_rule_id = %d\n", + fi->l_data.ethertype_mac.mac_addr, + fi->l_data.ethertype_mac.ethertype, + fm_entry->vsi_count, fi->flag, fi->lb_en, + fi->lan_en, fi->fltr_act, fi->fltr_rule_id); + } + mutex_unlock(rule_lock); + break; + case ICE_SW_LKUP_PROMISC: + /* dump Promisc mode hash list */ + dev_info(ice_hw_to_dev(hw), + "\tDump Promisc/Promisc VLAN mode hash list of lookup type %d\n", + lkup); + dev_info(ice_hw_to_dev(hw), + "\tNote: Ignore VLAN in case of Promisc only lookup type & ignore MAC in case of Promisc VLAN lookup type\n"); + mutex_lock(rule_lock); + list_for_each_entry(fm_entry, rule_head, list_entry) { + fi = &fm_entry->fltr_info; + dev_info(ice_hw_to_dev(hw), + "\tmac: %pM, vlan_id = %d, vsi_count = %d, fw_act_flag = %d, lb_en = %d, lan_en = %d, filt_act = %d, filt_rule_id = %d\n", + fi->l_data.mac_vlan.mac_addr, + fi->l_data.mac_vlan.vlan_id, + fm_entry->vsi_count, 
fi->flag, fi->lb_en, + fi->lan_en, fi->fltr_act, fi->fltr_rule_id); + } + mutex_unlock(rule_lock); + break; + case ICE_SW_LKUP_DFLT: + /* dump default VSI filter rule */ + if (hw->port_info->dflt_tx_vsi_num != ICE_DFLT_VSI_INVAL) + dev_info(ice_hw_to_dev(hw), + "\tDefault VSI filter (lookup type %d): tx_vsi_id = %d, filt_act = %d, tx_filt_rule_id = %d\n", + lkup, hw->port_info->dflt_tx_vsi_num, + ICE_FWD_TO_VSI, + hw->port_info->dflt_tx_vsi_rule_id); + + if (hw->port_info->dflt_rx_vsi_num != ICE_DFLT_VSI_INVAL) + dev_info(ice_hw_to_dev(hw), + "\tDefault VSI filter (lookup type %d): rx_vsi_id = %d, filt_act = %d, rx_filt_rule_id = %d\n", + lkup, hw->port_info->dflt_rx_vsi_num, + ICE_FWD_TO_VSI, + hw->port_info->dflt_rx_vsi_rule_id); + break; + case ICE_SW_LKUP_PROMISC_VLAN: + case ICE_SW_LKUP_LAST: + dev_info(ice_hw_to_dev(hw), + "\tDump for this lookup type hasn't been implemented yet\n"); + break; + } +} + + +/** + * ice_fill_sw_info - Helper function to populate lb_en and lan_en + * @hw: pointer to the hardware structure + * @fi: filter info structure to fill/update + * + * This helper function populates the lb_en and lan_en elements of the provided + * ice_fltr_info struct using the switch's type and characteristics of the + * switch rule being configured. + */ +static void ice_fill_sw_info(struct ice_hw *hw, struct ice_fltr_info *fi) +{ + fi->lb_en = false; + fi->lan_en = false; + if ((fi->flag & ICE_FLTR_TX) && + (fi->fltr_act == ICE_FWD_TO_VSI || + fi->fltr_act == ICE_FWD_TO_VSI_LIST || + fi->fltr_act == ICE_FWD_TO_Q || + fi->fltr_act == ICE_FWD_TO_QGRP)) { + /* Setting LB for prune actions will result in replicated + * packets to the internal switch that will be dropped. + */ + if (fi->lkup_type != ICE_SW_LKUP_VLAN) + fi->lb_en = true; + + /* Set lan_en to TRUE if + * 1. The switch is a VEB AND + * 2 + * 2.1 The lookup is a directional lookup like ethertype, + * promiscuous, ethertype-MAC, promiscuous-VLAN + * and default-port OR + * 2.2 The lookup is VLAN, OR + * 2.3 The lookup is MAC with mcast or bcast addr for MAC, OR + * 2.4 The lookup is MAC_VLAN with mcast or bcast addr for MAC. + * + * OR + * + * The switch is a VEPA. + * + * In all other cases, the LAN enable has to be set to false. 
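		 *
		 * (Editor's note, not part of the original patch: e.g. a
		 * unicast ICE_SW_LKUP_MAC Tx filter leaves lan_en false on a
		 * VEB, keeping the traffic on the internal switch, whereas in
		 * VEPA mode the else branch below always sets lan_en.)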
+ */ + if (hw->evb_veb) { + if (fi->lkup_type == ICE_SW_LKUP_ETHERTYPE || + fi->lkup_type == ICE_SW_LKUP_PROMISC || + fi->lkup_type == ICE_SW_LKUP_ETHERTYPE_MAC || + fi->lkup_type == ICE_SW_LKUP_PROMISC_VLAN || + fi->lkup_type == ICE_SW_LKUP_DFLT || + fi->lkup_type == ICE_SW_LKUP_VLAN || + (fi->lkup_type == ICE_SW_LKUP_MAC && + !is_unicast_ether_addr(fi->l_data.mac.mac_addr)) || + (fi->lkup_type == ICE_SW_LKUP_MAC_VLAN && + !is_unicast_ether_addr(fi->l_data.mac.mac_addr))) + fi->lan_en = true; + } else { + fi->lan_en = true; + } + } +} + +/** + * ice_fill_sw_rule - Helper function to fill switch rule structure + * @hw: pointer to the hardware structure + * @f_info: entry containing packet forwarding information + * @s_rule: switch rule structure to be filled in based on mac_entry + * @opc: switch rules population command type - pass in the command opcode + */ +static void +ice_fill_sw_rule(struct ice_hw *hw, struct ice_fltr_info *f_info, + struct ice_aqc_sw_rules_elem *s_rule, enum ice_adminq_opc opc) +{ + u16 vlan_id = ICE_MAX_VLAN_ID + 1; + u16 vlan_tpid = ETH_P_8021Q; + void *daddr = NULL; + u16 eth_hdr_sz; + u8 *eth_hdr; + u32 act = 0; + __be16 *off; + u8 q_rgn; + + if (opc == ice_aqc_opc_remove_sw_rules) { + s_rule->pdata.lkup_tx_rx.act = 0; + s_rule->pdata.lkup_tx_rx.index = + cpu_to_le16(f_info->fltr_rule_id); + s_rule->pdata.lkup_tx_rx.hdr_len = 0; + return; + } + + eth_hdr_sz = sizeof(dummy_eth_header); + eth_hdr = s_rule->pdata.lkup_tx_rx.hdr; + + /* initialize the ether header with a dummy header */ + memcpy(eth_hdr, dummy_eth_header, eth_hdr_sz); + ice_fill_sw_info(hw, f_info); + + switch (f_info->fltr_act) { + case ICE_FWD_TO_VSI: + act |= (f_info->fwd_id.hw_vsi_id << ICE_SINGLE_ACT_VSI_ID_S) & + ICE_SINGLE_ACT_VSI_ID_M; + if (f_info->lkup_type != ICE_SW_LKUP_VLAN) + act |= ICE_SINGLE_ACT_VSI_FORWARDING | + ICE_SINGLE_ACT_VALID_BIT; + break; + case ICE_FWD_TO_VSI_LIST: + act |= ICE_SINGLE_ACT_VSI_LIST; + act |= (f_info->fwd_id.vsi_list_id << + ICE_SINGLE_ACT_VSI_LIST_ID_S) & + ICE_SINGLE_ACT_VSI_LIST_ID_M; + if (f_info->lkup_type != ICE_SW_LKUP_VLAN) + act |= ICE_SINGLE_ACT_VSI_FORWARDING | + ICE_SINGLE_ACT_VALID_BIT; + break; + case ICE_FWD_TO_Q: + act |= ICE_SINGLE_ACT_TO_Q; + act |= (f_info->fwd_id.q_id << ICE_SINGLE_ACT_Q_INDEX_S) & + ICE_SINGLE_ACT_Q_INDEX_M; + break; + case ICE_DROP_PACKET: + act |= ICE_SINGLE_ACT_VSI_FORWARDING | ICE_SINGLE_ACT_DROP | + ICE_SINGLE_ACT_VALID_BIT; + break; + case ICE_FWD_TO_QGRP: + q_rgn = f_info->qgrp_size > 0 ? 
+ (u8)ilog2(f_info->qgrp_size) : 0; + act |= ICE_SINGLE_ACT_TO_Q; + act |= (f_info->fwd_id.q_id << ICE_SINGLE_ACT_Q_INDEX_S) & + ICE_SINGLE_ACT_Q_INDEX_M; + act |= (q_rgn << ICE_SINGLE_ACT_Q_REGION_S) & + ICE_SINGLE_ACT_Q_REGION_M; + break; + default: + return; + } + + if (f_info->lb_en) + act |= ICE_SINGLE_ACT_LB_ENABLE; + if (f_info->lan_en) + act |= ICE_SINGLE_ACT_LAN_ENABLE; + + switch (f_info->lkup_type) { + case ICE_SW_LKUP_MAC: + daddr = f_info->l_data.mac.mac_addr; + break; + case ICE_SW_LKUP_VLAN: + vlan_id = f_info->l_data.vlan.vlan_id; + if (f_info->l_data.vlan.tpid_valid) + vlan_tpid = f_info->l_data.vlan.tpid; + if (f_info->fltr_act == ICE_FWD_TO_VSI || + f_info->fltr_act == ICE_FWD_TO_VSI_LIST) { + act |= ICE_SINGLE_ACT_PRUNE; + act |= ICE_SINGLE_ACT_EGRESS | ICE_SINGLE_ACT_INGRESS; + } + break; + case ICE_SW_LKUP_ETHERTYPE_MAC: + daddr = f_info->l_data.ethertype_mac.mac_addr; + /* fall-through */ + case ICE_SW_LKUP_ETHERTYPE: + off = (__force __be16 *)(eth_hdr + ICE_ETH_ETHTYPE_OFFSET); + *off = cpu_to_be16(f_info->l_data.ethertype_mac.ethertype); + break; + case ICE_SW_LKUP_MAC_VLAN: + daddr = f_info->l_data.mac_vlan.mac_addr; + vlan_id = f_info->l_data.mac_vlan.vlan_id; + break; + case ICE_SW_LKUP_PROMISC_VLAN: + vlan_id = f_info->l_data.mac_vlan.vlan_id; + /* fall-through */ + case ICE_SW_LKUP_PROMISC: + daddr = f_info->l_data.mac_vlan.mac_addr; + break; + default: + break; + } + + s_rule->type = (f_info->flag & ICE_FLTR_RX) ? + cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_RX) : + cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_TX); + + /* Recipe set depending on lookup type */ + s_rule->pdata.lkup_tx_rx.recipe_id = cpu_to_le16(f_info->lkup_type); + s_rule->pdata.lkup_tx_rx.src = cpu_to_le16(f_info->src); + s_rule->pdata.lkup_tx_rx.act = cpu_to_le32(act); + + if (daddr) + ether_addr_copy(eth_hdr + ICE_ETH_DA_OFFSET, daddr); + + if (!(vlan_id > ICE_MAX_VLAN_ID)) { + off = (__force __be16 *)(eth_hdr + ICE_ETH_VLAN_TCI_OFFSET); + *off = cpu_to_be16(vlan_id); + off = (__force __be16 *)(eth_hdr + ICE_ETH_ETHTYPE_OFFSET); + *off = cpu_to_be16(vlan_tpid); + } + + /* Create the switch rule with the final dummy Ethernet header */ + if (opc != ice_aqc_opc_update_sw_rules) + s_rule->pdata.lkup_tx_rx.hdr_len = cpu_to_le16(eth_hdr_sz); +} + + +/** + * ice_dump_sw_rules - Function to dump sw rules + * @hw: pointer to the hardware structure + * @lkup: rules type to be dumped + */ +void ice_dump_sw_rules(struct ice_hw *hw, enum ice_sw_lkup_type lkup) +{ + ice_dump_all_sw_rules(hw, lkup, &hw->switch_info->recp_list[lkup]); +} + +/** + * ice_add_marker_act + * @hw: pointer to the hardware structure + * @m_ent: the management entry for which sw marker needs to be added + * @sw_marker: sw marker to tag the Rx descriptor with + * @l_id: large action resource ID + * + * Create a large action to hold software marker and update the switch rule + * entry pointed by m_ent with newly created large action + */ +static enum ice_status +ice_add_marker_act(struct ice_hw *hw, struct ice_fltr_mgmt_list_entry *m_ent, + u16 sw_marker, u16 l_id) +{ + struct ice_aqc_sw_rules_elem *lg_act, *rx_tx; + /* For software marker we need 3 large actions + * 1. FWD action: FWD TO VSI or VSI LIST + * 2. GENERIC VALUE action to hold the profile ID + * 3. 
GENERIC VALUE action to hold the software marker ID + */ + const u16 num_lg_acts = 3; + enum ice_status status; + u16 lg_act_size; + u16 rules_size; + u32 act; + u16 id; + + if (m_ent->fltr_info.lkup_type != ICE_SW_LKUP_MAC) + return ICE_ERR_PARAM; + + /* Create two back-to-back switch rules and submit them to the HW using + * one memory buffer: + * 1. Large Action + * 2. Look up Tx Rx + */ + lg_act_size = (u16)ICE_SW_RULE_LG_ACT_SIZE(num_lg_acts); + rules_size = lg_act_size + ICE_SW_RULE_RX_TX_ETH_HDR_SIZE; + lg_act = devm_kzalloc(ice_hw_to_dev(hw), rules_size, GFP_KERNEL); + if (!lg_act) + return ICE_ERR_NO_MEMORY; + + rx_tx = (struct ice_aqc_sw_rules_elem *)((u8 *)lg_act + lg_act_size); + + /* Fill in the first switch rule i.e. large action */ + lg_act->type = cpu_to_le16(ICE_AQC_SW_RULES_T_LG_ACT); + lg_act->pdata.lg_act.index = cpu_to_le16(l_id); + lg_act->pdata.lg_act.size = cpu_to_le16(num_lg_acts); + + /* First action VSI forwarding or VSI list forwarding depending on how + * many VSIs + */ + id = (m_ent->vsi_count > 1) ? m_ent->fltr_info.fwd_id.vsi_list_id : + m_ent->fltr_info.fwd_id.hw_vsi_id; + + act = ICE_LG_ACT_VSI_FORWARDING | ICE_LG_ACT_VALID_BIT; + act |= (id << ICE_LG_ACT_VSI_LIST_ID_S) & ICE_LG_ACT_VSI_LIST_ID_M; + if (m_ent->vsi_count > 1) + act |= ICE_LG_ACT_VSI_LIST; + lg_act->pdata.lg_act.act[0] = cpu_to_le32(act); + + /* Second action descriptor type */ + act = ICE_LG_ACT_GENERIC; + + act |= (1 << ICE_LG_ACT_GENERIC_VALUE_S) & ICE_LG_ACT_GENERIC_VALUE_M; + lg_act->pdata.lg_act.act[1] = cpu_to_le32(act); + + act = (ICE_LG_ACT_GENERIC_OFF_RX_DESC_PROF_IDX << + ICE_LG_ACT_GENERIC_OFFSET_S) & ICE_LG_ACT_GENERIC_OFFSET_M; + + /* Third action Marker value */ + act |= ICE_LG_ACT_GENERIC; + act |= (sw_marker << ICE_LG_ACT_GENERIC_VALUE_S) & + ICE_LG_ACT_GENERIC_VALUE_M; + + lg_act->pdata.lg_act.act[2] = cpu_to_le32(act); + + /* call the fill switch rule to fill the lookup Tx Rx structure */ + ice_fill_sw_rule(hw, &m_ent->fltr_info, rx_tx, + ice_aqc_opc_update_sw_rules); + + /* Update the action to point to the large action ID */ + rx_tx->pdata.lkup_tx_rx.act = + cpu_to_le32(ICE_SINGLE_ACT_PTR | + ((l_id << ICE_SINGLE_ACT_PTR_VAL_S) & + ICE_SINGLE_ACT_PTR_VAL_M)); + + /* Use the filter rule ID of the previously created rule with single + * act. 
Once the update happens, hardware will treat this as large + * action + */ + rx_tx->pdata.lkup_tx_rx.index = + cpu_to_le16(m_ent->fltr_info.fltr_rule_id); + + status = ice_aq_sw_rules(hw, lg_act, rules_size, 2, + ice_aqc_opc_update_sw_rules, NULL); + if (!status) { + m_ent->lg_act_idx = l_id; + m_ent->sw_marker_id = sw_marker; + } + + devm_kfree(ice_hw_to_dev(hw), lg_act); + return status; +} + +/** + * ice_add_counter_act - add/update filter rule with counter action + * @hw: pointer to the hardware structure + * @m_ent: the management entry for which counter needs to be added + * @counter_id: VLAN counter ID returned as part of allocate resource + * @l_id: large action resource ID + */ +static enum ice_status +ice_add_counter_act(struct ice_hw *hw, struct ice_fltr_mgmt_list_entry *m_ent, + u16 counter_id, u16 l_id) +{ + struct ice_aqc_sw_rules_elem *lg_act; + struct ice_aqc_sw_rules_elem *rx_tx; + enum ice_status status; + /* 2 actions will be added while adding a large action counter */ + const int num_acts = 2; + u16 lg_act_size; + u16 rules_size; + u16 f_rule_id; + u32 act; + u16 id; + + if (m_ent->fltr_info.lkup_type != ICE_SW_LKUP_MAC) + return ICE_ERR_PARAM; + + /* Create two back-to-back switch rules and submit them to the HW using + * one memory buffer: + * 1. Large Action + * 2. Look up Tx Rx + */ + lg_act_size = (u16)ICE_SW_RULE_LG_ACT_SIZE(num_acts); + rules_size = lg_act_size + ICE_SW_RULE_RX_TX_ETH_HDR_SIZE; + lg_act = devm_kzalloc(ice_hw_to_dev(hw), rules_size, GFP_KERNEL); + if (!lg_act) + return ICE_ERR_NO_MEMORY; + + rx_tx = (struct ice_aqc_sw_rules_elem *)((u8 *)lg_act + lg_act_size); + + /* Fill in the first switch rule i.e. large action */ + lg_act->type = cpu_to_le16(ICE_AQC_SW_RULES_T_LG_ACT); + lg_act->pdata.lg_act.index = cpu_to_le16(l_id); + lg_act->pdata.lg_act.size = cpu_to_le16(num_acts); + + /* First action VSI forwarding or VSI list forwarding depending on how + * many VSIs + */ + id = (m_ent->vsi_count > 1) ? m_ent->fltr_info.fwd_id.vsi_list_id : + m_ent->fltr_info.fwd_id.hw_vsi_id; + + act = ICE_LG_ACT_VSI_FORWARDING | ICE_LG_ACT_VALID_BIT; + act |= (id << ICE_LG_ACT_VSI_LIST_ID_S) & + ICE_LG_ACT_VSI_LIST_ID_M; + if (m_ent->vsi_count > 1) + act |= ICE_LG_ACT_VSI_LIST; + lg_act->pdata.lg_act.act[0] = cpu_to_le32(act); + + /* Second action counter ID */ + act = ICE_LG_ACT_STAT_COUNT; + act |= (counter_id << ICE_LG_ACT_STAT_COUNT_S) & + ICE_LG_ACT_STAT_COUNT_M; + lg_act->pdata.lg_act.act[1] = cpu_to_le32(act); + + /* call the fill switch rule to fill the lookup Tx Rx structure */ + ice_fill_sw_rule(hw, &m_ent->fltr_info, rx_tx, + ice_aqc_opc_update_sw_rules); + + act = ICE_SINGLE_ACT_PTR; + act |= (l_id << ICE_SINGLE_ACT_PTR_VAL_S) & ICE_SINGLE_ACT_PTR_VAL_M; + rx_tx->pdata.lkup_tx_rx.act = cpu_to_le32(act); + + /* Use the filter rule ID of the previously created rule with single + * act. 
Once the update happens, hardware will treat this as large + * action + */ + f_rule_id = m_ent->fltr_info.fltr_rule_id; + rx_tx->pdata.lkup_tx_rx.index = cpu_to_le16(f_rule_id); + + status = ice_aq_sw_rules(hw, lg_act, rules_size, 2, + ice_aqc_opc_update_sw_rules, NULL); + if (!status) { + m_ent->lg_act_idx = l_id; + m_ent->counter_index = counter_id; + } + + devm_kfree(ice_hw_to_dev(hw), lg_act); + return status; +} + +/** + * ice_create_vsi_list_map + * @hw: pointer to the hardware structure + * @vsi_handle_arr: array of VSI handles to set in the VSI mapping + * @num_vsi: number of VSI handles in the array + * @vsi_list_id: VSI list ID generated as part of allocate resource + * + * Helper function to create a new entry of VSI list ID to VSI mapping + * using the given VSI list ID + */ +static struct ice_vsi_list_map_info * +ice_create_vsi_list_map(struct ice_hw *hw, u16 *vsi_handle_arr, u16 num_vsi, + u16 vsi_list_id) +{ + struct ice_switch_info *sw = hw->switch_info; + struct ice_vsi_list_map_info *v_map; + int i; + + v_map = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*v_map), GFP_KERNEL); + if (!v_map) + return NULL; + + v_map->vsi_list_id = vsi_list_id; + v_map->ref_cnt = 1; + for (i = 0; i < num_vsi; i++) + set_bit(vsi_handle_arr[i], v_map->vsi_map); + + list_add(&v_map->list_entry, &sw->vsi_list_map_head); + return v_map; +} + +/** + * ice_update_vsi_list_rule + * @hw: pointer to the hardware structure + * @vsi_handle_arr: array of VSI handles to form a VSI list + * @num_vsi: number of VSI handles in the array + * @vsi_list_id: VSI list ID generated as part of allocate resource + * @remove: Boolean value to indicate if this is a remove action + * @opc: switch rules population command type - pass in the command opcode + * @lkup_type: lookup type of the filter + * + * Call AQ command to add a new switch rule or update existing switch rule + * using the given VSI list ID + */ +static enum ice_status +ice_update_vsi_list_rule(struct ice_hw *hw, u16 *vsi_handle_arr, u16 num_vsi, + u16 vsi_list_id, bool remove, enum ice_adminq_opc opc, + enum ice_sw_lkup_type lkup_type) +{ + struct ice_aqc_sw_rules_elem *s_rule; + enum ice_status status; + u16 s_rule_size; + u16 rule_type; + int i; + + if (!num_vsi) + return ICE_ERR_PARAM; + + if (lkup_type == ICE_SW_LKUP_MAC || + lkup_type == ICE_SW_LKUP_MAC_VLAN || + lkup_type == ICE_SW_LKUP_ETHERTYPE || + lkup_type == ICE_SW_LKUP_ETHERTYPE_MAC || + lkup_type == ICE_SW_LKUP_PROMISC || + lkup_type == ICE_SW_LKUP_PROMISC_VLAN || + lkup_type == ICE_SW_LKUP_LAST) + rule_type = remove ? ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR : + ICE_AQC_SW_RULES_T_VSI_LIST_SET; + else if (lkup_type == ICE_SW_LKUP_VLAN) + rule_type = remove ? 
ICE_AQC_SW_RULES_T_PRUNE_LIST_CLEAR : + ICE_AQC_SW_RULES_T_PRUNE_LIST_SET; + else + return ICE_ERR_PARAM; + + s_rule_size = (u16)ICE_SW_RULE_VSI_LIST_SIZE(num_vsi); + s_rule = devm_kzalloc(ice_hw_to_dev(hw), s_rule_size, GFP_KERNEL); + if (!s_rule) + return ICE_ERR_NO_MEMORY; + for (i = 0; i < num_vsi; i++) { + if (!ice_is_vsi_valid(hw, vsi_handle_arr[i])) { + status = ICE_ERR_PARAM; + goto exit; + } + /* AQ call requires hw_vsi_id(s) */ + s_rule->pdata.vsi_list.vsi[i] = + cpu_to_le16(ice_get_hw_vsi_num(hw, vsi_handle_arr[i])); + } + + s_rule->type = cpu_to_le16(rule_type); + s_rule->pdata.vsi_list.number_vsi = cpu_to_le16(num_vsi); + s_rule->pdata.vsi_list.index = cpu_to_le16(vsi_list_id); + + status = ice_aq_sw_rules(hw, s_rule, s_rule_size, 1, opc, NULL); + +exit: + devm_kfree(ice_hw_to_dev(hw), s_rule); + return status; +} + +/** + * ice_create_vsi_list_rule - Creates and populates a VSI list rule + * @hw: pointer to the HW struct + * @vsi_handle_arr: array of VSI handles to form a VSI list + * @num_vsi: number of VSI handles in the array + * @vsi_list_id: stores the ID of the VSI list to be created + * @lkup_type: switch rule filter's lookup type + */ +static enum ice_status +ice_create_vsi_list_rule(struct ice_hw *hw, u16 *vsi_handle_arr, u16 num_vsi, + u16 *vsi_list_id, enum ice_sw_lkup_type lkup_type) +{ + enum ice_status status; + + status = ice_aq_alloc_free_vsi_list(hw, vsi_list_id, lkup_type, + ice_aqc_opc_alloc_res); + if (status) + return status; + + /* Update the newly created VSI list to include the specified VSIs */ + return ice_update_vsi_list_rule(hw, vsi_handle_arr, num_vsi, + *vsi_list_id, false, + ice_aqc_opc_add_sw_rules, lkup_type); +} + +/** + * ice_create_pkt_fwd_rule + * @hw: pointer to the hardware structure + * @recp_list: corresponding filter management list + * @f_entry: entry containing packet forwarding information + * + * Create switch rule with given filter information and add an entry + * to the corresponding filter management list to track this switch rule + * and VSI mapping + */ +static enum ice_status +ice_create_pkt_fwd_rule(struct ice_hw *hw, struct ice_sw_recipe *recp_list, + struct ice_fltr_list_entry *f_entry) +{ + struct ice_fltr_mgmt_list_entry *fm_entry; + struct ice_aqc_sw_rules_elem *s_rule; + enum ice_status status; + + s_rule = devm_kzalloc(ice_hw_to_dev(hw), + ICE_SW_RULE_RX_TX_ETH_HDR_SIZE, GFP_KERNEL); + if (!s_rule) + return ICE_ERR_NO_MEMORY; + fm_entry = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*fm_entry), + GFP_KERNEL); + if (!fm_entry) { + status = ICE_ERR_NO_MEMORY; + goto ice_create_pkt_fwd_rule_exit; + } + + fm_entry->fltr_info = f_entry->fltr_info; + + /* Initialize all the fields for the management entry */ + fm_entry->vsi_count = 1; + fm_entry->lg_act_idx = ICE_INVAL_LG_ACT_INDEX; + fm_entry->sw_marker_id = ICE_INVAL_SW_MARKER_ID; + fm_entry->counter_index = ICE_INVAL_COUNTER_ID; + + ice_fill_sw_rule(hw, &fm_entry->fltr_info, s_rule, + ice_aqc_opc_add_sw_rules); + + status = ice_aq_sw_rules(hw, s_rule, ICE_SW_RULE_RX_TX_ETH_HDR_SIZE, 1, + ice_aqc_opc_add_sw_rules, NULL); + if (status) { + devm_kfree(ice_hw_to_dev(hw), fm_entry); + goto ice_create_pkt_fwd_rule_exit; + } + + f_entry->fltr_info.fltr_rule_id = + le16_to_cpu(s_rule->pdata.lkup_tx_rx.index); + fm_entry->fltr_info.fltr_rule_id = + le16_to_cpu(s_rule->pdata.lkup_tx_rx.index); + + /* The book keeping entries will get removed when base driver + * calls remove filter AQ command + */ + list_add(&fm_entry->list_entry, &recp_list->filt_rules); + 
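	/* (Editor's note, not part of the original patch: at this point both
	 * f_entry and the new tracking entry carry the firmware-assigned
	 * rule index from lkup_tx_rx.index, so later update/remove requests
	 * can reference this rule.)
	 */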
+ice_create_pkt_fwd_rule_exit: + devm_kfree(ice_hw_to_dev(hw), s_rule); + return status; +} + +/** + * ice_update_pkt_fwd_rule + * @hw: pointer to the hardware structure + * @f_info: filter information for switch rule + * + * Call AQ command to update a previously created switch rule with a + * VSI list ID + */ +static enum ice_status +ice_update_pkt_fwd_rule(struct ice_hw *hw, struct ice_fltr_info *f_info) +{ + struct ice_aqc_sw_rules_elem *s_rule; + enum ice_status status; + + s_rule = devm_kzalloc(ice_hw_to_dev(hw), + ICE_SW_RULE_RX_TX_ETH_HDR_SIZE, GFP_KERNEL); + if (!s_rule) + return ICE_ERR_NO_MEMORY; + + ice_fill_sw_rule(hw, f_info, s_rule, ice_aqc_opc_update_sw_rules); + + s_rule->pdata.lkup_tx_rx.index = cpu_to_le16(f_info->fltr_rule_id); + + /* Update switch rule with new rule set to forward VSI list */ + status = ice_aq_sw_rules(hw, s_rule, ICE_SW_RULE_RX_TX_ETH_HDR_SIZE, 1, + ice_aqc_opc_update_sw_rules, NULL); + + devm_kfree(ice_hw_to_dev(hw), s_rule); + return status; +} + +/** + * ice_update_sw_rule_bridge_mode + * @hw: pointer to the HW struct + * + * Updates unicast switch filter rules based on VEB/VEPA mode + */ +enum ice_status ice_update_sw_rule_bridge_mode(struct ice_hw *hw) +{ + struct ice_switch_info *sw = hw->switch_info; + struct ice_fltr_mgmt_list_entry *fm_entry; + enum ice_status status = 0; + struct list_head *rule_head; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + + rule_lock = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rule_lock; + rule_head = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rules; + + mutex_lock(rule_lock); + list_for_each_entry(fm_entry, rule_head, list_entry) { + struct ice_fltr_info *fi = &fm_entry->fltr_info; + u8 *addr = fi->l_data.mac.mac_addr; + + /* Update unicast Tx rules to reflect the selected + * VEB/VEPA mode + */ + if ((fi->flag & ICE_FLTR_TX) && is_unicast_ether_addr(addr) && + (fi->fltr_act == ICE_FWD_TO_VSI || + fi->fltr_act == ICE_FWD_TO_VSI_LIST || + fi->fltr_act == ICE_FWD_TO_Q || + fi->fltr_act == ICE_FWD_TO_QGRP)) { + status = ice_update_pkt_fwd_rule(hw, fi); + if (status) + break; + } + } + + mutex_unlock(rule_lock); + + return status; +} + +/** + * ice_add_update_vsi_list + * @hw: pointer to the hardware structure + * @m_entry: pointer to current filter management list entry + * @cur_fltr: filter information from the book keeping entry + * @new_fltr: filter information with the new VSI to be added + * + * Call AQ command to add or update previously created VSI list with new VSI. + * + * Helper function to do book keeping associated with adding filter information + * The algorithm to do the book keeping is described below : + * When a VSI needs to subscribe to a given filter (MAC/VLAN/Ethtype etc.) 
+ * if only one VSI has been added till now + * Allocate a new VSI list and add two VSIs + * to this list using switch rule command + * Update the previously created switch rule with the + * newly created VSI list ID + * if a VSI list was previously created + * Add the new VSI to the previously created VSI list set + * using the update switch rule command + */ +static enum ice_status +ice_add_update_vsi_list(struct ice_hw *hw, + struct ice_fltr_mgmt_list_entry *m_entry, + struct ice_fltr_info *cur_fltr, + struct ice_fltr_info *new_fltr) +{ + enum ice_status status = 0; + u16 vsi_list_id = 0; + + if ((cur_fltr->fltr_act == ICE_FWD_TO_Q || + cur_fltr->fltr_act == ICE_FWD_TO_QGRP)) + return ICE_ERR_NOT_IMPL; + + if ((new_fltr->fltr_act == ICE_FWD_TO_Q || + new_fltr->fltr_act == ICE_FWD_TO_QGRP) && + (cur_fltr->fltr_act == ICE_FWD_TO_VSI || + cur_fltr->fltr_act == ICE_FWD_TO_VSI_LIST)) + return ICE_ERR_NOT_IMPL; + + if (m_entry->vsi_count < 2 && !m_entry->vsi_list_info) { + /* Only one entry existed in the mapping and it was not already + * a part of a VSI list. So, create a VSI list with the old and + * new VSIs. + */ + struct ice_fltr_info tmp_fltr; + u16 vsi_handle_arr[2]; + + /* A rule already exists with the new VSI being added */ + if (cur_fltr->fwd_id.hw_vsi_id == new_fltr->fwd_id.hw_vsi_id) + return ICE_ERR_ALREADY_EXISTS; + + vsi_handle_arr[0] = cur_fltr->vsi_handle; + vsi_handle_arr[1] = new_fltr->vsi_handle; + status = ice_create_vsi_list_rule(hw, &vsi_handle_arr[0], 2, + &vsi_list_id, + new_fltr->lkup_type); + if (status) + return status; + + tmp_fltr = *new_fltr; + tmp_fltr.fltr_rule_id = cur_fltr->fltr_rule_id; + tmp_fltr.fltr_act = ICE_FWD_TO_VSI_LIST; + tmp_fltr.fwd_id.vsi_list_id = vsi_list_id; + /* Update the previous switch rule of "MAC forward to VSI" to + * "MAC fwd to VSI list" + */ + status = ice_update_pkt_fwd_rule(hw, &tmp_fltr); + if (status) + return status; + + cur_fltr->fwd_id.vsi_list_id = vsi_list_id; + cur_fltr->fltr_act = ICE_FWD_TO_VSI_LIST; + m_entry->vsi_list_info = + ice_create_vsi_list_map(hw, &vsi_handle_arr[0], 2, + vsi_list_id); + + if (!m_entry->vsi_list_info) + return ICE_ERR_NO_MEMORY; + + /* If this entry was large action then the large action needs + * to be updated to point to FWD to VSI list + */ + if (m_entry->sw_marker_id != ICE_INVAL_SW_MARKER_ID) + status = + ice_add_marker_act(hw, m_entry, + m_entry->sw_marker_id, + m_entry->lg_act_idx); + } else { + u16 vsi_handle = new_fltr->vsi_handle; + enum ice_adminq_opc opcode; + + if (!m_entry->vsi_list_info) + return ICE_ERR_CFG; + + /* A rule already exists with the new VSI being added */ + if (test_bit(vsi_handle, m_entry->vsi_list_info->vsi_map)) + return 0; + + /* Update the previously created VSI list set with + * the new VSI ID passed in + */ + vsi_list_id = cur_fltr->fwd_id.vsi_list_id; + opcode = ice_aqc_opc_update_sw_rules; + + status = ice_update_vsi_list_rule(hw, &vsi_handle, 1, + vsi_list_id, false, opcode, + new_fltr->lkup_type); + /* update VSI list mapping info with new VSI ID */ + if (!status) + set_bit(vsi_handle, m_entry->vsi_list_info->vsi_map); + } + if (!status) + m_entry->vsi_count++; + return status; +} + +/** + * ice_find_rule_entry - Search a rule entry + * @list_head: head of rule list + * @f_info: rule information + * + * Helper function to search for a given rule entry + * Returns pointer to entry storing the rule if found + */ +static struct ice_fltr_mgmt_list_entry * +ice_find_rule_entry(struct list_head *list_head, + struct ice_fltr_info *f_info) +{ + struct 
ice_fltr_mgmt_list_entry *list_itr, *ret = NULL; + + list_for_each_entry(list_itr, list_head, list_entry) { + if (!memcmp(&f_info->l_data, &list_itr->fltr_info.l_data, + sizeof(f_info->l_data)) && + f_info->flag == list_itr->fltr_info.flag) { + ret = list_itr; + break; + } + } + return ret; +} + +/** + * ice_find_vsi_list_entry - Search VSI list map with VSI count 1 + * @recp_list: VSI lists needs to be searched + * @vsi_handle: VSI handle to be found in VSI list + * @vsi_list_id: VSI list ID found containing vsi_handle + * + * Helper function to search a VSI list with single entry containing given VSI + * handle element. This can be extended further to search VSI list with more + * than 1 vsi_count. Returns pointer to VSI list entry if found. + */ +static struct ice_vsi_list_map_info * +ice_find_vsi_list_entry(struct ice_sw_recipe *recp_list, u16 vsi_handle, + u16 *vsi_list_id) +{ + struct ice_vsi_list_map_info *map_info = NULL; + struct list_head *list_head; + + list_head = &recp_list->filt_rules; + if (recp_list->adv_rule) { + struct ice_adv_fltr_mgmt_list_entry *list_itr; + + list_for_each_entry(list_itr, list_head, list_entry) { + if (list_itr->vsi_list_info) { + map_info = list_itr->vsi_list_info; + if (test_bit(vsi_handle, map_info->vsi_map)) { + *vsi_list_id = map_info->vsi_list_id; + return map_info; + } + } + } + } else { + struct ice_fltr_mgmt_list_entry *list_itr; + + list_for_each_entry(list_itr, list_head, list_entry) { + if (list_itr->vsi_count == 1 && + list_itr->vsi_list_info) { + map_info = list_itr->vsi_list_info; + if (test_bit(vsi_handle, map_info->vsi_map)) { + *vsi_list_id = map_info->vsi_list_id; + return map_info; + } + } + } + } + return NULL; +} + +/** + * ice_add_rule_internal - add rule for a given lookup type + * @hw: pointer to the hardware structure + * @recp_list: recipe list for which rule has to be added + * @lport: logic port number on which function add rule + * @f_entry: structure containing MAC forwarding information + * + * Adds or updates the rule lists for a given recipe + */ +static enum ice_status +ice_add_rule_internal(struct ice_hw *hw, struct ice_sw_recipe *recp_list, + u8 lport, struct ice_fltr_list_entry *f_entry) +{ + struct ice_fltr_info *new_fltr, *cur_fltr; + struct ice_fltr_mgmt_list_entry *m_entry; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + enum ice_status status = 0; + + if (!ice_is_vsi_valid(hw, f_entry->fltr_info.vsi_handle)) + return ICE_ERR_PARAM; + + /* Load the hw_vsi_id only if the fwd action is fwd to VSI */ + if (f_entry->fltr_info.fltr_act == ICE_FWD_TO_VSI) + f_entry->fltr_info.fwd_id.hw_vsi_id = + ice_get_hw_vsi_num(hw, f_entry->fltr_info.vsi_handle); + + rule_lock = &recp_list->filt_rule_lock; + + mutex_lock(rule_lock); + new_fltr = &f_entry->fltr_info; + if (new_fltr->flag & ICE_FLTR_RX) + new_fltr->src = lport; + else if (new_fltr->flag & ICE_FLTR_TX) + new_fltr->src = + ice_get_hw_vsi_num(hw, f_entry->fltr_info.vsi_handle); + + m_entry = ice_find_rule_entry(&recp_list->filt_rules, new_fltr); + if (!m_entry) { + status = ice_create_pkt_fwd_rule(hw, recp_list, f_entry); + goto exit_add_rule_internal; + } + + cur_fltr = &m_entry->fltr_info; + status = ice_add_update_vsi_list(hw, m_entry, cur_fltr, new_fltr); + +exit_add_rule_internal: + mutex_unlock(rule_lock); + return status; +} + +/** + * ice_remove_vsi_list_rule + * @hw: pointer to the hardware structure + * @vsi_list_id: VSI list ID generated as part of allocate resource + * @lkup_type: switch rule filter lookup type + * + * The VSI list 
should be emptied before this function is called to remove the + * VSI list. + */ +static enum ice_status +ice_remove_vsi_list_rule(struct ice_hw *hw, u16 vsi_list_id, + enum ice_sw_lkup_type lkup_type) +{ + /* Free the vsi_list resource that we allocated. It is assumed that the + * list is empty at this point. + */ + return ice_aq_alloc_free_vsi_list(hw, &vsi_list_id, lkup_type, + ice_aqc_opc_free_res); +} + +/** + * ice_rem_update_vsi_list + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle of the VSI to remove + * @fm_list: filter management entry for which the VSI list management needs to + * be done + */ +static enum ice_status +ice_rem_update_vsi_list(struct ice_hw *hw, u16 vsi_handle, + struct ice_fltr_mgmt_list_entry *fm_list) +{ + enum ice_sw_lkup_type lkup_type; + enum ice_status status = 0; + u16 vsi_list_id; + + if (fm_list->fltr_info.fltr_act != ICE_FWD_TO_VSI_LIST || + fm_list->vsi_count == 0) + return ICE_ERR_PARAM; + + /* A rule with the VSI being removed does not exist */ + if (!test_bit(vsi_handle, fm_list->vsi_list_info->vsi_map)) + return ICE_ERR_DOES_NOT_EXIST; + + lkup_type = fm_list->fltr_info.lkup_type; + vsi_list_id = fm_list->fltr_info.fwd_id.vsi_list_id; + status = ice_update_vsi_list_rule(hw, &vsi_handle, 1, vsi_list_id, true, + ice_aqc_opc_update_sw_rules, + lkup_type); + if (status) + return status; + + fm_list->vsi_count--; + clear_bit(vsi_handle, fm_list->vsi_list_info->vsi_map); + + if (fm_list->vsi_count == 1 && lkup_type != ICE_SW_LKUP_VLAN) { + struct ice_fltr_info tmp_fltr_info = fm_list->fltr_info; + struct ice_vsi_list_map_info *vsi_list_info = + fm_list->vsi_list_info; + u16 rem_vsi_handle; + + rem_vsi_handle = find_first_bit(vsi_list_info->vsi_map, + ICE_MAX_VSI); + if (!ice_is_vsi_valid(hw, rem_vsi_handle)) + return ICE_ERR_OUT_OF_RANGE; + + /* Make sure VSI list is empty before removing it below */ + status = ice_update_vsi_list_rule(hw, &rem_vsi_handle, 1, + vsi_list_id, true, + ice_aqc_opc_update_sw_rules, + lkup_type); + if (status) + return status; + + tmp_fltr_info.fltr_act = ICE_FWD_TO_VSI; + tmp_fltr_info.fwd_id.hw_vsi_id = + ice_get_hw_vsi_num(hw, rem_vsi_handle); + tmp_fltr_info.vsi_handle = rem_vsi_handle; + status = ice_update_pkt_fwd_rule(hw, &tmp_fltr_info); + if (status) { + ice_debug(hw, ICE_DBG_SW, "Failed to update pkt fwd rule to FWD_TO_VSI on HW VSI %d, error %d\n", + tmp_fltr_info.fwd_id.hw_vsi_id, status); + return status; + } + + fm_list->fltr_info = tmp_fltr_info; + } + + if ((fm_list->vsi_count == 1 && lkup_type != ICE_SW_LKUP_VLAN) || + (fm_list->vsi_count == 0 && lkup_type == ICE_SW_LKUP_VLAN)) { + struct ice_vsi_list_map_info *vsi_list_info = + fm_list->vsi_list_info; + + /* Remove the VSI list since it is no longer used */ + status = ice_remove_vsi_list_rule(hw, vsi_list_id, lkup_type); + if (status) { + ice_debug(hw, ICE_DBG_SW, "Failed to remove VSI list %d, error %d\n", + vsi_list_id, status); + return status; + } + + list_del(&vsi_list_info->list_entry); + devm_kfree(ice_hw_to_dev(hw), vsi_list_info); + fm_list->vsi_list_info = NULL; + } + + return status; +} + +/** + * ice_remove_rule_internal - Remove a filter rule of a given type + * + * @hw: pointer to the hardware structure + * @recp_list: recipe list for which the rule needs to removed + * @f_entry: rule entry containing filter information + */ +static enum ice_status +ice_remove_rule_internal(struct ice_hw *hw, struct ice_sw_recipe *recp_list, + struct ice_fltr_list_entry *f_entry) +{ + struct ice_fltr_mgmt_list_entry *list_elem; + struct 
mutex *rule_lock; /* Lock to protect filter rule list */ + enum ice_status status = 0; + bool remove_rule = false; + u16 vsi_handle; + + if (!ice_is_vsi_valid(hw, f_entry->fltr_info.vsi_handle)) + return ICE_ERR_PARAM; + f_entry->fltr_info.fwd_id.hw_vsi_id = + ice_get_hw_vsi_num(hw, f_entry->fltr_info.vsi_handle); + + rule_lock = &recp_list->filt_rule_lock; + mutex_lock(rule_lock); + list_elem = ice_find_rule_entry(&recp_list->filt_rules, + &f_entry->fltr_info); + if (!list_elem) { + status = ICE_ERR_DOES_NOT_EXIST; + goto exit; + } + + if (list_elem->fltr_info.fltr_act != ICE_FWD_TO_VSI_LIST) { + remove_rule = true; + } else if (!list_elem->vsi_list_info) { + status = ICE_ERR_DOES_NOT_EXIST; + goto exit; + } else if (list_elem->vsi_list_info->ref_cnt > 1) { + /* a ref_cnt > 1 indicates that the vsi_list is being + * shared by multiple rules. Decrement the ref_cnt and + * remove this rule, but do not modify the list, as it + * is in-use by other rules. + */ + list_elem->vsi_list_info->ref_cnt--; + remove_rule = true; + } else { + /* a ref_cnt of 1 indicates the vsi_list is only used + * by one rule. However, the original removal request is only + * for a single VSI. Update the vsi_list first, and only + * remove the rule if there are no further VSIs in this list. + */ + vsi_handle = f_entry->fltr_info.vsi_handle; + status = ice_rem_update_vsi_list(hw, vsi_handle, list_elem); + if (status) + goto exit; + /* if VSI count goes to zero after updating the VSI list */ + if (list_elem->vsi_count == 0) + remove_rule = true; + } + + if (remove_rule) { + /* Remove the lookup rule */ + struct ice_aqc_sw_rules_elem *s_rule; + + s_rule = devm_kzalloc(ice_hw_to_dev(hw), + ICE_SW_RULE_RX_TX_NO_HDR_SIZE, + GFP_KERNEL); + if (!s_rule) { + status = ICE_ERR_NO_MEMORY; + goto exit; + } + + ice_fill_sw_rule(hw, &list_elem->fltr_info, s_rule, + ice_aqc_opc_remove_sw_rules); + + status = ice_aq_sw_rules(hw, s_rule, + ICE_SW_RULE_RX_TX_NO_HDR_SIZE, 1, + ice_aqc_opc_remove_sw_rules, NULL); + + /* Remove a book keeping from the list */ + devm_kfree(ice_hw_to_dev(hw), s_rule); + + if (status) + goto exit; + + list_del(&list_elem->list_entry); + devm_kfree(ice_hw_to_dev(hw), list_elem); + } +exit: + mutex_unlock(rule_lock); + return status; +} + +/** + * ice_aq_get_res_alloc - get allocated resources + * @hw: pointer to the HW struct + * @num_entries: pointer to u16 to store the number of resource entries returned + * @buf: pointer to buffer + * @buf_size: size of buf + * @cd: pointer to command details structure or NULL + * + * The caller-supplied buffer must be large enough to store the resource + * information for all resource types. Each resource type is an + * ice_aqc_get_res_resp_elem structure. 
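The hunk above (ice_remove_rule_internal()) decides whether the lookup rule itself has to be deleted based on how its VSI list is used. A compact, compilable model of that decision is sketched below; it is illustrative only, follows the three cases described in the code comments, and should_remove_rule() is an invented name, not a driver function.

#include <stdbool.h>
#include <stdio.h>

/*
 * Mirrors the three cases in ice_remove_rule_internal(): a plain rule is
 * always deleted; a rule whose VSI list is shared (ref_cnt > 1) is deleted
 * after the ref_cnt is dropped, leaving the list alone; a rule with a
 * private list is deleted only once the list has no VSIs left.
 */
static bool should_remove_rule(bool fwd_to_vsi_list, unsigned int ref_cnt,
			       unsigned int vsi_count_after_update)
{
	if (!fwd_to_vsi_list)
		return true;
	if (ref_cnt > 1)
		return true;
	return vsi_count_after_update == 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       should_remove_rule(false, 0, 0),	/* 1: plain rule */
	       should_remove_rule(true, 3, 2),	/* 1: shared list kept */
	       should_remove_rule(true, 1, 1));	/* 0: list still in use */
	return 0;
}
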
+ */ +enum ice_status +ice_aq_get_res_alloc(struct ice_hw *hw, u16 *num_entries, + struct ice_aqc_get_res_resp_elem *buf, u16 buf_size, + struct ice_sq_cd *cd) +{ + struct ice_aqc_get_res_alloc *resp; + enum ice_status status; + struct ice_aq_desc desc; + + if (!buf) + return ICE_ERR_BAD_PTR; + + if (buf_size < ICE_AQ_GET_RES_ALLOC_BUF_LEN) + return ICE_ERR_INVAL_SIZE; + + resp = &desc.params.get_res; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_res_alloc); + status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); + + if (!status && num_entries) + *num_entries = le16_to_cpu(resp->resp_elem_num); + + return status; +} + +/** + * ice_aq_get_res_descs - get allocated resource descriptors + * @hw: pointer to the hardware structure + * @num_entries: number of resource entries in buffer + * @buf: structure to hold response data buffer + * @buf_size: size of buffer + * @res_type: resource type + * @res_shared: is resource shared + * @desc_id: input - first desc ID to start; output - next desc ID + * @cd: pointer to command details structure or NULL + */ +enum ice_status +ice_aq_get_res_descs(struct ice_hw *hw, u16 num_entries, + struct ice_aqc_res_elem *buf, u16 buf_size, u16 res_type, + bool res_shared, u16 *desc_id, struct ice_sq_cd *cd) +{ + struct ice_aqc_get_allocd_res_desc *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + cmd = &desc.params.get_res_desc; + + if (!buf) + return ICE_ERR_PARAM; + + if (buf_size != (num_entries * sizeof(*buf))) + return ICE_ERR_PARAM; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_allocd_res_desc); + + cmd->ops.cmd.res = cpu_to_le16(((res_type << ICE_AQC_RES_TYPE_S) & + ICE_AQC_RES_TYPE_M) | (res_shared ? + ICE_AQC_RES_TYPE_FLAG_SHARED : 0)); + cmd->ops.cmd.first_desc = cpu_to_le16(*desc_id); + + status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); + if (!status) + *desc_id = le16_to_cpu(cmd->ops.resp.next_desc); + + return status; +} + +/** + * ice_mac_fltr_exist - does this MAC filter exist for given VSI + * @hw: pointer to the hardware structure + * @mac: MAC address to be checked (for MAC filter) + * @vsi_handle: check MAC filter for this VSI + */ +bool ice_mac_fltr_exist(struct ice_hw *hw, u8 *mac, u16 vsi_handle) +{ + struct ice_fltr_mgmt_list_entry *entry; + struct list_head *rule_head; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + struct ice_switch_info *sw; + u16 hw_vsi_id; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return false; + + hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); + sw = hw->switch_info; + rule_head = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rules; + if (!rule_head) + return false; + + rule_lock = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rule_lock; + mutex_lock(rule_lock); + list_for_each_entry(entry, rule_head, list_entry) { + struct ice_fltr_info *f_info = &entry->fltr_info; + u8 *mac_addr = &f_info->l_data.mac.mac_addr[0]; + + if (is_zero_ether_addr(mac_addr)) + continue; + + if (f_info->flag != ICE_FLTR_TX || + f_info->src_id != ICE_SRC_ID_VSI || + f_info->lkup_type != ICE_SW_LKUP_MAC || + f_info->fltr_act != ICE_FWD_TO_VSI || + hw_vsi_id != f_info->fwd_id.hw_vsi_id) + continue; + + if (ether_addr_equal(mac, mac_addr)) { + mutex_unlock(rule_lock); + return true; + } + } + mutex_unlock(rule_lock); + return false; +} + +/** + * ice_vlan_fltr_exist - does this VLAN filter exist for given VSI + * @hw: pointer to the hardware structure + * @vlan_id: VLAN ID + * @vsi_handle: check MAC filter for this VSI + */ +bool ice_vlan_fltr_exist(struct ice_hw *hw, u16 vlan_id, u16 vsi_handle) +{ 
+ struct ice_fltr_mgmt_list_entry *entry; + struct list_head *rule_head; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + struct ice_switch_info *sw; + u16 hw_vsi_id; + + if (vlan_id > ICE_MAX_VLAN_ID) + return false; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return false; + + hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); + sw = hw->switch_info; + rule_head = &sw->recp_list[ICE_SW_LKUP_VLAN].filt_rules; + if (!rule_head) + return false; + + rule_lock = &sw->recp_list[ICE_SW_LKUP_VLAN].filt_rule_lock; + mutex_lock(rule_lock); + list_for_each_entry(entry, rule_head, list_entry) { + struct ice_fltr_info *f_info = &entry->fltr_info; + u16 entry_vlan_id = f_info->l_data.vlan.vlan_id; + struct ice_vsi_list_map_info *map_info; + + if (entry_vlan_id > ICE_MAX_VLAN_ID) + continue; + + if (f_info->flag != ICE_FLTR_TX || + f_info->src_id != ICE_SRC_ID_VSI || + f_info->lkup_type != ICE_SW_LKUP_VLAN) + continue; + + /* Only allowed filter action are FWD_TO_VSI/_VSI_LIST */ + if (f_info->fltr_act != ICE_FWD_TO_VSI && + f_info->fltr_act != ICE_FWD_TO_VSI_LIST) + continue; + + if (f_info->fltr_act == ICE_FWD_TO_VSI) { + if (hw_vsi_id != f_info->fwd_id.hw_vsi_id) + continue; + } else if (f_info->fltr_act == ICE_FWD_TO_VSI_LIST) { + /* If filter_action is FWD_TO_VSI_LIST, make sure + * that VSI being checked is part of VSI list + */ + if (entry->vsi_count == 1 && + entry->vsi_list_info) { + map_info = entry->vsi_list_info; + if (!test_bit(vsi_handle, map_info->vsi_map)) + continue; + } + } + + if (vlan_id == entry_vlan_id) { + mutex_unlock(rule_lock); + return true; + } + } + mutex_unlock(rule_lock); + + return false; +} + +/** + * ice_add_mac_rule - Add a MAC address based filter rule + * @hw: pointer to the hardware structure + * @m_list: list of MAC addresses and forwarding information + * @sw: pointer to switch info struct for which function add rule + * @lport: logic port number on which function add rule + * + * IMPORTANT: When the umac_shared flag is set to false and m_list has + * multiple unicast addresses, the function assumes that all the + * addresses are unique in a given add_mac call. It doesn't + * check for duplicates in this case, removing duplicates from a given + * list should be taken care of in the caller of this function. 
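ice_mac_fltr_exist() and ice_vlan_fltr_exist() above both reduce to the same membership test: a rule applies to a VSI either because it forwards directly to that VSI or because the VSI's bit is set in the rule's VSI-list map. The sketch below models that test in plain user-space C; fake_vlan_rule and its 64-bit vsi_map are assumptions made for illustration, not driver structures.

#include <stdbool.h>
#include <stdio.h>

/* Invented stand-in for the fields ice_vlan_fltr_exist() inspects. */
struct fake_vlan_rule {
	unsigned short vlan_id;
	bool fwd_to_list;		/* false: FWD_TO_VSI, true: FWD_TO_VSI_LIST */
	unsigned short hw_vsi_id;	/* used when fwd_to_list is false */
	unsigned long long vsi_map;	/* 64-entry VSI bitmap, illustrative */
};

static bool vlan_fltr_matches(const struct fake_vlan_rule *r,
			      unsigned short vlan_id, unsigned int vsi)
{
	if (r->vlan_id != vlan_id)
		return false;
	if (!r->fwd_to_list)
		return r->hw_vsi_id == vsi;
	return (r->vsi_map >> vsi) & 1ULL;	/* VSI present in the list? */
}

int main(void)
{
	struct fake_vlan_rule r = { 100, true, 0, 1ULL << 5 };

	printf("%d %d\n",
	       vlan_fltr_matches(&r, 100, 5),	/* 1: VSI 5 is in the list */
	       vlan_fltr_matches(&r, 100, 6));	/* 0: VSI 6 is not */
	return 0;
}
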
+ */ +static enum ice_status +ice_add_mac_rule(struct ice_hw *hw, struct list_head *m_list, + struct ice_switch_info *sw, u8 lport) +{ + struct ice_sw_recipe *recp_list = &sw->recp_list[ICE_SW_LKUP_MAC]; + struct ice_aqc_sw_rules_elem *s_rule, *r_iter; + struct ice_fltr_list_entry *m_list_itr; + struct list_head *rule_head; + u16 total_elem_left, s_rule_size; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + enum ice_status status = 0; + u16 num_unicast = 0; + u8 elem_sent; + + s_rule = NULL; + rule_lock = &recp_list->filt_rule_lock; + rule_head = &recp_list->filt_rules; + + list_for_each_entry(m_list_itr, m_list, list_entry) { + u8 *add = &m_list_itr->fltr_info.l_data.mac.mac_addr[0]; + u16 vsi_handle; + u16 hw_vsi_id; + + m_list_itr->fltr_info.flag = ICE_FLTR_TX; + vsi_handle = m_list_itr->fltr_info.vsi_handle; + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); + m_list_itr->fltr_info.fwd_id.hw_vsi_id = hw_vsi_id; + /* update the src in case it is VSI num */ + if (m_list_itr->fltr_info.src_id != ICE_SRC_ID_VSI) + return ICE_ERR_PARAM; + m_list_itr->fltr_info.src = hw_vsi_id; + if (m_list_itr->fltr_info.lkup_type != ICE_SW_LKUP_MAC || + is_zero_ether_addr(add)) + return ICE_ERR_PARAM; + if (is_unicast_ether_addr(add) && !hw->umac_shared) { + /* Don't overwrite the unicast address */ + mutex_lock(rule_lock); + if (ice_find_rule_entry(rule_head, + &m_list_itr->fltr_info)) { + mutex_unlock(rule_lock); + return ICE_ERR_ALREADY_EXISTS; + } + mutex_unlock(rule_lock); + num_unicast++; + } else if (is_multicast_ether_addr(add) || + (is_unicast_ether_addr(add) && hw->umac_shared)) { + m_list_itr->status = + ice_add_rule_internal(hw, recp_list, lport, + m_list_itr); + if (m_list_itr->status) + return m_list_itr->status; + } + } + + mutex_lock(rule_lock); + /* Exit if no suitable entries were found for adding bulk switch rule */ + if (!num_unicast) { + status = 0; + goto ice_add_mac_exit; + } + + /* Allocate switch rule buffer for the bulk update for unicast */ + s_rule_size = ICE_SW_RULE_RX_TX_ETH_HDR_SIZE; + s_rule = devm_kcalloc(ice_hw_to_dev(hw), num_unicast, s_rule_size, + GFP_KERNEL); + if (!s_rule) { + status = ICE_ERR_NO_MEMORY; + goto ice_add_mac_exit; + } + + r_iter = s_rule; + list_for_each_entry(m_list_itr, m_list, list_entry) { + struct ice_fltr_info *f_info = &m_list_itr->fltr_info; + u8 *mac_addr = &f_info->l_data.mac.mac_addr[0]; + + if (is_unicast_ether_addr(mac_addr)) { + ice_fill_sw_rule(hw, &m_list_itr->fltr_info, r_iter, + ice_aqc_opc_add_sw_rules); + r_iter = (struct ice_aqc_sw_rules_elem *) + ((u8 *)r_iter + s_rule_size); + } + } + + /* Call AQ bulk switch rule update for all unicast addresses */ + r_iter = s_rule; + /* Call AQ switch rule in AQ_MAX chunk */ + for (total_elem_left = num_unicast; total_elem_left > 0; + total_elem_left -= elem_sent) { + struct ice_aqc_sw_rules_elem *entry = r_iter; + + elem_sent = min_t(u8, total_elem_left, + (ICE_AQ_MAX_BUF_LEN / s_rule_size)); + status = ice_aq_sw_rules(hw, entry, elem_sent * s_rule_size, + elem_sent, ice_aqc_opc_add_sw_rules, + NULL); + if (status) + goto ice_add_mac_exit; + r_iter = (struct ice_aqc_sw_rules_elem *) + ((u8 *)r_iter + (elem_sent * s_rule_size)); + } + + /* Fill up rule ID based on the value returned from FW */ + r_iter = s_rule; + list_for_each_entry(m_list_itr, m_list, list_entry) { + struct ice_fltr_info *f_info = &m_list_itr->fltr_info; + u8 *mac_addr = &f_info->l_data.mac.mac_addr[0]; + struct ice_fltr_mgmt_list_entry 
*fm_entry; + + if (is_unicast_ether_addr(mac_addr)) { + f_info->fltr_rule_id = + le16_to_cpu(r_iter->pdata.lkup_tx_rx.index); + f_info->fltr_act = ICE_FWD_TO_VSI; + /* Create an entry to track this MAC address */ + fm_entry = devm_kzalloc(ice_hw_to_dev(hw), + sizeof(*fm_entry), GFP_KERNEL); + if (!fm_entry) { + status = ICE_ERR_NO_MEMORY; + goto ice_add_mac_exit; + } + fm_entry->fltr_info = *f_info; + fm_entry->vsi_count = 1; + /* The book keeping entries will get removed when + * base driver calls remove filter AQ command + */ + + list_add(&fm_entry->list_entry, rule_head); + r_iter = (struct ice_aqc_sw_rules_elem *) + ((u8 *)r_iter + s_rule_size); + } + } + +ice_add_mac_exit: + mutex_unlock(rule_lock); + if (s_rule) + devm_kfree(ice_hw_to_dev(hw), s_rule); + return status; +} + + +/** + * ice_add_mac - Add a MAC address based filter rule + * @hw: pointer to the hardware structure + * @m_list: list of MAC addresses and forwarding information + * + * Function add MAC rule for logical port from HW struct + */ +enum ice_status ice_add_mac(struct ice_hw *hw, struct list_head *m_list) +{ + if (!m_list || !hw) + return ICE_ERR_PARAM; + + return ice_add_mac_rule(hw, m_list, hw->switch_info, + hw->port_info->lport); +} + +/** + * ice_add_vlan_internal - Add one VLAN based filter rule + * @hw: pointer to the hardware structure + * @recp_list: recipe list for which rule has to be added + * @f_entry: filter entry containing one VLAN information + */ +static enum ice_status +ice_add_vlan_internal(struct ice_hw *hw, struct ice_sw_recipe *recp_list, + struct ice_fltr_list_entry *f_entry) +{ + struct ice_fltr_mgmt_list_entry *v_list_itr; + struct ice_fltr_info *new_fltr, *cur_fltr; + enum ice_sw_lkup_type lkup_type; + u16 vsi_list_id = 0, vsi_handle; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + enum ice_status status = 0; + + if (!ice_is_vsi_valid(hw, f_entry->fltr_info.vsi_handle)) + return ICE_ERR_PARAM; + + f_entry->fltr_info.fwd_id.hw_vsi_id = + ice_get_hw_vsi_num(hw, f_entry->fltr_info.vsi_handle); + new_fltr = &f_entry->fltr_info; + + /* VLAN ID should only be 12 bits */ + if (new_fltr->l_data.vlan.vlan_id > ICE_MAX_VLAN_ID) + return ICE_ERR_PARAM; + + if (new_fltr->src_id != ICE_SRC_ID_VSI) + return ICE_ERR_PARAM; + + new_fltr->src = new_fltr->fwd_id.hw_vsi_id; + lkup_type = new_fltr->lkup_type; + vsi_handle = new_fltr->vsi_handle; + rule_lock = &recp_list->filt_rule_lock; + mutex_lock(rule_lock); + v_list_itr = ice_find_rule_entry(&recp_list->filt_rules, new_fltr); + if (!v_list_itr) { + struct ice_vsi_list_map_info *map_info = NULL; + + if (new_fltr->fltr_act == ICE_FWD_TO_VSI) { + /* All VLAN pruning rules use a VSI list. Check if + * there is already a VSI list containing VSI that we + * want to add. If found, use the same vsi_list_id for + * this new VLAN rule or else create a new list. + */ + map_info = ice_find_vsi_list_entry(recp_list, + vsi_handle, + &vsi_list_id); + if (!map_info) { + status = ice_create_vsi_list_rule(hw, + &vsi_handle, + 1, + &vsi_list_id, + lkup_type); + if (status) + goto exit; + } + /* Convert the action to forwarding to a VSI list. 
*/ + new_fltr->fltr_act = ICE_FWD_TO_VSI_LIST; + new_fltr->fwd_id.vsi_list_id = vsi_list_id; + } + + status = ice_create_pkt_fwd_rule(hw, recp_list, f_entry); + if (!status) { + v_list_itr = ice_find_rule_entry(&recp_list->filt_rules, + new_fltr); + if (!v_list_itr) { + status = ICE_ERR_DOES_NOT_EXIST; + goto exit; + } + /* reuse VSI list for new rule and increment ref_cnt */ + if (map_info) { + v_list_itr->vsi_list_info = map_info; + map_info->ref_cnt++; + } else { + v_list_itr->vsi_list_info = + ice_create_vsi_list_map(hw, &vsi_handle, + 1, vsi_list_id); + } + } + } else if (v_list_itr->vsi_list_info->ref_cnt == 1) { + /* Update existing VSI list to add new VSI ID only if it used + * by one VLAN rule. + */ + cur_fltr = &v_list_itr->fltr_info; + status = ice_add_update_vsi_list(hw, v_list_itr, cur_fltr, + new_fltr); + } else { + /* If VLAN rule exists and VSI list being used by this rule is + * referenced by more than 1 VLAN rule. Then create a new VSI + * list appending previous VSI with new VSI and update existing + * VLAN rule to point to new VSI list ID + */ + struct ice_fltr_info tmp_fltr; + u16 vsi_handle_arr[2]; + u16 cur_handle; + + /* Current implementation only supports reusing VSI list with + * one VSI count. We should never hit below condition + */ + if (v_list_itr->vsi_count > 1 && + v_list_itr->vsi_list_info->ref_cnt > 1) { + ice_debug(hw, ICE_DBG_SW, "Invalid configuration: Optimization to reuse VSI list with more than one VSI is not being done yet\n"); + status = ICE_ERR_CFG; + goto exit; + } + + cur_handle = + find_first_bit(v_list_itr->vsi_list_info->vsi_map, + ICE_MAX_VSI); + + /* A rule already exists with the new VSI being added */ + if (cur_handle == vsi_handle) { + status = ICE_ERR_ALREADY_EXISTS; + goto exit; + } + + vsi_handle_arr[0] = cur_handle; + vsi_handle_arr[1] = vsi_handle; + status = ice_create_vsi_list_rule(hw, &vsi_handle_arr[0], 2, + &vsi_list_id, lkup_type); + if (status) + goto exit; + + tmp_fltr = v_list_itr->fltr_info; + tmp_fltr.fltr_rule_id = v_list_itr->fltr_info.fltr_rule_id; + tmp_fltr.fwd_id.vsi_list_id = vsi_list_id; + tmp_fltr.fltr_act = ICE_FWD_TO_VSI_LIST; + /* Update the previous switch rule to a new VSI list which + * includes current VSI that is requested + */ + status = ice_update_pkt_fwd_rule(hw, &tmp_fltr); + if (status) + goto exit; + + /* before overriding VSI list map info. 
decrement ref_cnt of + * previous VSI list + */ + v_list_itr->vsi_list_info->ref_cnt--; + + /* now update to newly created list */ + v_list_itr->fltr_info.fwd_id.vsi_list_id = vsi_list_id; + v_list_itr->vsi_list_info = + ice_create_vsi_list_map(hw, &vsi_handle_arr[0], 2, + vsi_list_id); + v_list_itr->vsi_count++; + } + +exit: + mutex_unlock(rule_lock); + return status; +} + +/** + * ice_add_vlan_rule - Add VLAN based filter rule + * @hw: pointer to the hardware structure + * @v_list: list of VLAN entries and forwarding information + * @sw: pointer to switch info struct for which function add rule + */ +static enum ice_status +ice_add_vlan_rule(struct ice_hw *hw, struct list_head *v_list, + struct ice_switch_info *sw) +{ + struct ice_fltr_list_entry *v_list_itr; + struct ice_sw_recipe *recp_list; + + recp_list = &sw->recp_list[ICE_SW_LKUP_VLAN]; + list_for_each_entry(v_list_itr, v_list, list_entry) { + if (v_list_itr->fltr_info.lkup_type != ICE_SW_LKUP_VLAN) + return ICE_ERR_PARAM; + v_list_itr->fltr_info.flag = ICE_FLTR_TX; + v_list_itr->status = ice_add_vlan_internal(hw, recp_list, + v_list_itr); + if (v_list_itr->status) + return v_list_itr->status; + } + return 0; +} + + +/** + * ice_add_vlan - Add a VLAN based filter rule + * @hw: pointer to the hardware structure + * @v_list: list of VLAN and forwarding information + * + * Function add VLAN rule for logical port from HW struct + */ +enum ice_status ice_add_vlan(struct ice_hw *hw, struct list_head *v_list) +{ + if (!v_list || !hw) + return ICE_ERR_PARAM; + + return ice_add_vlan_rule(hw, v_list, hw->switch_info); +} + +/** + * ice_add_mac_vlan_rule - Add MAC and VLAN pair based filter rule + * @hw: pointer to the hardware structure + * @mv_list: list of MAC and VLAN filters + * @sw: pointer to switch info struct for which function add rule + * @lport: logic port number on which function add rule + * + * If the VSI on which the MAC-VLAN pair has to be added has Rx and Tx VLAN + * pruning bits enabled, then it is the responsibility of the caller to make + * sure to add a VLAN only filter on the same VSI. Packets belonging to that + * VLAN won't be received on that VSI otherwise. 
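ice_add_vlan_internal() above takes one of three paths when a VSI subscribes to a VLAN filter: create a new rule (optionally reusing an existing single-VSI list), extend a list that only this rule uses, or build a fresh two-VSI list when the current list is shared by other VLAN rules. The following sketch condenses that branch; pick_vlan_add_path() and its arguments are invented for illustration and are not driver symbols.

#include <stdio.h>

/*
 * The three outcomes of ice_add_vlan_internal() for a (VLAN, VSI) pair;
 * ref_cnt is the number of VLAN rules currently sharing the rule's VSI
 * list.
 */
enum vlan_add_path {
	CREATE_NEW_RULE,	/* no rule for this VLAN yet */
	EXTEND_PRIVATE_LIST,	/* rule exists, list used only by this rule */
	CLONE_SHARED_LIST,	/* list shared, build a new two-VSI list */
};

static enum vlan_add_path pick_vlan_add_path(int rule_exists,
					     unsigned int ref_cnt)
{
	if (!rule_exists)
		return CREATE_NEW_RULE;
	return ref_cnt == 1 ? EXTEND_PRIVATE_LIST : CLONE_SHARED_LIST;
}

int main(void)
{
	printf("%d %d %d\n",
	       pick_vlan_add_path(0, 0),	/* 0: create rule, maybe reuse a list */
	       pick_vlan_add_path(1, 1),	/* 1: add VSI to the existing list */
	       pick_vlan_add_path(1, 3));	/* 2: repoint the rule at a new list */
	return 0;
}
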
+ */ +static enum ice_status +ice_add_mac_vlan_rule(struct ice_hw *hw, struct list_head *mv_list, + struct ice_switch_info *sw, u8 lport) +{ + struct ice_fltr_list_entry *mv_list_itr; + struct ice_sw_recipe *recp_list; + + if (!mv_list || !hw) + return ICE_ERR_PARAM; + + recp_list = &sw->recp_list[ICE_SW_LKUP_MAC_VLAN]; + list_for_each_entry(mv_list_itr, mv_list, list_entry) { + enum ice_sw_lkup_type l_type = + mv_list_itr->fltr_info.lkup_type; + + if (l_type != ICE_SW_LKUP_MAC_VLAN) + return ICE_ERR_PARAM; + mv_list_itr->fltr_info.flag = ICE_FLTR_TX; + mv_list_itr->status = + ice_add_rule_internal(hw, recp_list, lport, + mv_list_itr); + if (mv_list_itr->status) + return mv_list_itr->status; + } + return 0; +} + + +/** + * ice_add_mac_vlan - Add a MAC VLAN address based filter rule + * @hw: pointer to the hardware structure + * @mv_list: list of MAC VLAN addresses and forwarding information + * + * Function add MAC VLAN rule for logical port from HW struct + */ +enum ice_status +ice_add_mac_vlan(struct ice_hw *hw, struct list_head *mv_list) +{ + if (!mv_list || !hw) + return ICE_ERR_PARAM; + + return ice_add_mac_vlan_rule(hw, mv_list, hw->switch_info, + hw->port_info->lport); +} + +/** + * ice_add_eth_mac_rule - Add ethertype and MAC based filter rule + * @hw: pointer to the hardware structure + * @em_list: list of ether type MAC filter, MAC is optional + * @sw: pointer to switch info struct for which function add rule + * @lport: logic port number on which function add rule + * + * This function requires the caller to populate the entries in + * the filter list with the necessary fields (including flags to + * indicate Tx or Rx rules). + */ +static enum ice_status +ice_add_eth_mac_rule(struct ice_hw *hw, struct list_head *em_list, + struct ice_switch_info *sw, u8 lport) +{ + struct ice_fltr_list_entry *em_list_itr; + + list_for_each_entry(em_list_itr, em_list, list_entry) { + struct ice_sw_recipe *recp_list; + enum ice_sw_lkup_type l_type; + + l_type = em_list_itr->fltr_info.lkup_type; + recp_list = &sw->recp_list[l_type]; + + if (l_type != ICE_SW_LKUP_ETHERTYPE_MAC && + l_type != ICE_SW_LKUP_ETHERTYPE) + return ICE_ERR_PARAM; + + em_list_itr->status = ice_add_rule_internal(hw, recp_list, + lport, + em_list_itr); + if (em_list_itr->status) + return em_list_itr->status; + } + return 0; +} + + +/** + * ice_add_eth_mac - Add a ethertype based filter rule + * @hw: pointer to the hardware structure + * @em_list: list of ethertype and forwarding information + * + * Function add ethertype rule for logical port from HW struct + */ +enum ice_status +ice_add_eth_mac(struct ice_hw *hw, struct list_head *em_list) +{ + if (!em_list || !hw) + return ICE_ERR_PARAM; + + return ice_add_eth_mac_rule(hw, em_list, hw->switch_info, + hw->port_info->lport); +} + +/** + * ice_remove_eth_mac_rule - Remove an ethertype (or MAC) based filter rule + * @hw: pointer to the hardware structure + * @em_list: list of ethertype or ethertype MAC entries + * @sw: pointer to switch info struct for which function add rule + */ +static enum ice_status +ice_remove_eth_mac_rule(struct ice_hw *hw, struct list_head *em_list, + struct ice_switch_info *sw) +{ + struct ice_fltr_list_entry *em_list_itr, *tmp; + + list_for_each_entry_safe(em_list_itr, tmp, em_list, list_entry) { + struct ice_sw_recipe *recp_list; + enum ice_sw_lkup_type l_type; + + l_type = em_list_itr->fltr_info.lkup_type; + + if (l_type != ICE_SW_LKUP_ETHERTYPE_MAC && + l_type != ICE_SW_LKUP_ETHERTYPE) + return ICE_ERR_PARAM; + + recp_list = &sw->recp_list[l_type]; + 
em_list_itr->status = ice_remove_rule_internal(hw, recp_list, + em_list_itr); + if (em_list_itr->status) + return em_list_itr->status; + } + return 0; +} + + +/** + * ice_remove_eth_mac - remove a ethertype based filter rule + * @hw: pointer to the hardware structure + * @em_list: list of ethertype and forwarding information + * + */ +enum ice_status +ice_remove_eth_mac(struct ice_hw *hw, struct list_head *em_list) +{ + if (!em_list || !hw) + return ICE_ERR_PARAM; + + return ice_remove_eth_mac_rule(hw, em_list, hw->switch_info); +} + + +/** + * ice_rem_sw_rule_info + * @hw: pointer to the hardware structure + * @rule_head: pointer to the switch list structure that we want to delete + */ +static void +ice_rem_sw_rule_info(struct ice_hw *hw, struct list_head *rule_head) +{ + if (!list_empty(rule_head)) { + struct ice_fltr_mgmt_list_entry *entry; + struct ice_fltr_mgmt_list_entry *tmp; + + list_for_each_entry_safe(entry, tmp, rule_head, list_entry) { + list_del(&entry->list_entry); + devm_kfree(ice_hw_to_dev(hw), entry); + } + } +} + +/** + * ice_rem_adv_rule_info + * @hw: pointer to the hardware structure + * @rule_head: pointer to the switch list structure that we want to delete + */ +static void +ice_rem_adv_rule_info(struct ice_hw *hw, struct list_head *rule_head) +{ + struct ice_adv_fltr_mgmt_list_entry *tmp_entry; + struct ice_adv_fltr_mgmt_list_entry *lst_itr; + + if (list_empty(rule_head)) + return; + + list_for_each_entry_safe(lst_itr, tmp_entry, rule_head, list_entry) { + list_del(&lst_itr->list_entry); + devm_kfree(ice_hw_to_dev(hw), lst_itr->lkups); + devm_kfree(ice_hw_to_dev(hw), lst_itr); + } +} + +/** + * ice_rem_all_sw_rules_info + * @hw: pointer to the hardware structure + */ +void ice_rem_all_sw_rules_info(struct ice_hw *hw) +{ + struct ice_switch_info *sw = hw->switch_info; + u8 i; + + for (i = 0; i < ICE_MAX_NUM_RECIPES; i++) { + struct list_head *rule_head; + + rule_head = &sw->recp_list[i].filt_rules; + if (!sw->recp_list[i].adv_rule) + ice_rem_sw_rule_info(hw, rule_head); + else + ice_rem_adv_rule_info(hw, rule_head); + if (sw->recp_list[i].adv_rule && + list_empty(&sw->recp_list[i].filt_rules)) + sw->recp_list[i].adv_rule = false; + } +} + +/** + * ice_cfg_dflt_vsi - change state of VSI to set/clear default + * @pi: pointer to the port_info structure + * @vsi_handle: VSI handle to set as default + * @set: true to add the above mentioned switch rule, false to remove it + * @direction: ICE_FLTR_RX or ICE_FLTR_TX + * + * add filter rule to set/unset given VSI as default VSI for the switch + * (represented by swid) + */ +enum ice_status +ice_cfg_dflt_vsi(struct ice_port_info *pi, u16 vsi_handle, bool set, + u8 direction) +{ + struct ice_aqc_sw_rules_elem *s_rule; + struct ice_fltr_info f_info; + struct ice_hw *hw = pi->hw; + enum ice_adminq_opc opcode; + enum ice_status status; + u16 s_rule_size; + u16 hw_vsi_id; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); + + s_rule_size = set ? 
ICE_SW_RULE_RX_TX_ETH_HDR_SIZE : + ICE_SW_RULE_RX_TX_NO_HDR_SIZE; + + s_rule = devm_kzalloc(ice_hw_to_dev(hw), s_rule_size, GFP_KERNEL); + if (!s_rule) + return ICE_ERR_NO_MEMORY; + + memset(&f_info, 0, sizeof(f_info)); + + f_info.lkup_type = ICE_SW_LKUP_DFLT; + f_info.flag = direction; + f_info.fltr_act = ICE_FWD_TO_VSI; + f_info.fwd_id.hw_vsi_id = hw_vsi_id; + + if (f_info.flag & ICE_FLTR_RX) { + f_info.src = pi->lport; + f_info.src_id = ICE_SRC_ID_LPORT; + if (!set) + f_info.fltr_rule_id = + pi->dflt_rx_vsi_rule_id; + } else if (f_info.flag & ICE_FLTR_TX) { + f_info.src_id = ICE_SRC_ID_VSI; + f_info.src = hw_vsi_id; + if (!set) + f_info.fltr_rule_id = + pi->dflt_tx_vsi_rule_id; + } + + if (set) + opcode = ice_aqc_opc_add_sw_rules; + else + opcode = ice_aqc_opc_remove_sw_rules; + + ice_fill_sw_rule(hw, &f_info, s_rule, opcode); + + status = ice_aq_sw_rules(hw, s_rule, s_rule_size, 1, opcode, NULL); + if (status || !(f_info.flag & ICE_FLTR_TX_RX)) + goto out; + if (set) { + u16 index = le16_to_cpu(s_rule->pdata.lkup_tx_rx.index); + + if (f_info.flag & ICE_FLTR_TX) { + pi->dflt_tx_vsi_num = hw_vsi_id; + pi->dflt_tx_vsi_rule_id = index; + } else if (f_info.flag & ICE_FLTR_RX) { + pi->dflt_rx_vsi_num = hw_vsi_id; + pi->dflt_rx_vsi_rule_id = index; + } + } else { + if (f_info.flag & ICE_FLTR_TX) { + pi->dflt_tx_vsi_num = ICE_DFLT_VSI_INVAL; + pi->dflt_tx_vsi_rule_id = ICE_INVAL_ACT; + } else if (f_info.flag & ICE_FLTR_RX) { + pi->dflt_rx_vsi_num = ICE_DFLT_VSI_INVAL; + pi->dflt_rx_vsi_rule_id = ICE_INVAL_ACT; + } + } + +out: + devm_kfree(ice_hw_to_dev(hw), s_rule); + return status; +} + +/** + * ice_find_ucast_rule_entry - Search for a unicast MAC filter rule entry + * @list_head: head of rule list + * @f_info: rule information + * + * Helper function to search for a unicast rule entry - this is to be used + * to remove unicast MAC filter that is not shared with other VSIs on the + * PF switch. + * + * Returns pointer to entry storing the rule if found + */ +static struct ice_fltr_mgmt_list_entry * +ice_find_ucast_rule_entry(struct list_head *list_head, + struct ice_fltr_info *f_info) +{ + struct ice_fltr_mgmt_list_entry *list_itr; + + list_for_each_entry(list_itr, list_head, list_entry) { + if (!memcmp(&f_info->l_data, &list_itr->fltr_info.l_data, + sizeof(f_info->l_data)) && + f_info->fwd_id.hw_vsi_id == + list_itr->fltr_info.fwd_id.hw_vsi_id && + f_info->flag == list_itr->fltr_info.flag) + return list_itr; + } + return NULL; +} + +/** + * ice_remove_mac_rule - remove a MAC based filter rule + * @hw: pointer to the hardware structure + * @m_list: list of MAC addresses and forwarding information + * @recp_list: list from which function remove MAC address + * + * This function removes either a MAC filter rule or a specific VSI from a + * VSI list for a multicast MAC address. + * + * Returns ICE_ERR_DOES_NOT_EXIST if a given entry was not added by + * ice_add_mac. Caller should be aware that this call will only work if all + * the entries passed into m_list were added previously. It will not attempt to + * do a partial remove of entries that were found. 
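Note the stricter match in ice_find_ucast_rule_entry() above: in addition to the lookup data and flag, the owning hw_vsi_id must also match, which is what keeps ice_remove_mac_rule() from touching a unicast address owned by another VSI when unicast MACs are not shared. A small illustrative model of that comparison follows; fake_rule is an invented structure, not driver code.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Invented stand-in for the fields ice_find_ucast_rule_entry() compares. */
struct fake_rule {
	unsigned char mac[6];
	unsigned short hw_vsi_id;
	unsigned char flag;
};

static bool ucast_rule_matches(const struct fake_rule *a,
			       const struct fake_rule *b)
{
	return !memcmp(a->mac, b->mac, sizeof(a->mac)) &&
	       a->hw_vsi_id == b->hw_vsi_id &&	/* owning VSI must match too */
	       a->flag == b->flag;
}

int main(void)
{
	struct fake_rule owned = { { 0, 1, 2, 3, 4, 5 }, 7, 1 };
	struct fake_rule other = { { 0, 1, 2, 3, 4, 5 }, 9, 1 };

	/* Same MAC, different owning VSI: not a match, so not removed. */
	printf("%d\n", ucast_rule_matches(&owned, &other));
	return 0;
}
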
+ */ +static enum ice_status +ice_remove_mac_rule(struct ice_hw *hw, struct list_head *m_list, + struct ice_sw_recipe *recp_list) +{ + struct ice_fltr_list_entry *list_itr, *tmp; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + + if (!m_list) + return ICE_ERR_PARAM; + + rule_lock = &recp_list->filt_rule_lock; + list_for_each_entry_safe(list_itr, tmp, m_list, list_entry) { + enum ice_sw_lkup_type l_type = list_itr->fltr_info.lkup_type; + u8 *add = &list_itr->fltr_info.l_data.mac.mac_addr[0]; + u16 vsi_handle; + + if (l_type != ICE_SW_LKUP_MAC) + return ICE_ERR_PARAM; + + vsi_handle = list_itr->fltr_info.vsi_handle; + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + list_itr->fltr_info.fwd_id.hw_vsi_id = + ice_get_hw_vsi_num(hw, vsi_handle); + if (is_unicast_ether_addr(add) && !hw->umac_shared) { + /* Don't remove the unicast address that belongs to + * another VSI on the switch, since it is not being + * shared... + */ + mutex_lock(rule_lock); + if (!ice_find_ucast_rule_entry(&recp_list->filt_rules, + &list_itr->fltr_info)) { + mutex_unlock(rule_lock); + return ICE_ERR_DOES_NOT_EXIST; + } + mutex_unlock(rule_lock); + } + list_itr->status = ice_remove_rule_internal(hw, recp_list, + list_itr); + if (list_itr->status) + return list_itr->status; + } + return 0; +} + +/** + * ice_remove_mac - remove a MAC address based filter rule + * @hw: pointer to the hardware structure + * @m_list: list of MAC addresses and forwarding information + * + */ +enum ice_status ice_remove_mac(struct ice_hw *hw, struct list_head *m_list) +{ + struct ice_sw_recipe *recp_list; + + recp_list = &hw->switch_info->recp_list[ICE_SW_LKUP_MAC]; + return ice_remove_mac_rule(hw, m_list, recp_list); +} + + +/** + * ice_remove_vlan_rule - Remove VLAN based filter rule + * @hw: pointer to the hardware structure + * @v_list: list of VLAN entries and forwarding information + * @recp_list: list from which function remove VLAN + */ +static enum ice_status +ice_remove_vlan_rule(struct ice_hw *hw, struct list_head *v_list, + struct ice_sw_recipe *recp_list) +{ + struct ice_fltr_list_entry *v_list_itr, *tmp; + + list_for_each_entry_safe(v_list_itr, tmp, v_list, list_entry) { + enum ice_sw_lkup_type l_type = v_list_itr->fltr_info.lkup_type; + + if (l_type != ICE_SW_LKUP_VLAN) + return ICE_ERR_PARAM; + v_list_itr->status = ice_remove_rule_internal(hw, recp_list, + v_list_itr); + if (v_list_itr->status) + return v_list_itr->status; + } + return 0; +} + + +/** + * ice_remove_vlan - remove a VLAN address based filter rule + * @hw: pointer to the hardware structure + * @v_list: list of VLAN and forwarding information + * + */ +enum ice_status +ice_remove_vlan(struct ice_hw *hw, struct list_head *v_list) +{ + struct ice_sw_recipe *recp_list; + + if (!v_list || !hw) + return ICE_ERR_PARAM; + + recp_list = &hw->switch_info->recp_list[ICE_SW_LKUP_VLAN]; + return ice_remove_vlan_rule(hw, v_list, recp_list); +} + +/** + * ice_remove_mac_vlan_rule - Remove MAC VLAN based filter rule + * @hw: pointer to the hardware structure + * @v_list: list of MAC VLAN entries and forwarding information + * @recp_list: list from which function remove MAC VLAN + */ +static enum ice_status +ice_remove_mac_vlan_rule(struct ice_hw *hw, struct list_head *v_list, + struct ice_sw_recipe *recp_list) +{ + struct ice_fltr_list_entry *v_list_itr, *tmp; + + recp_list = &hw->switch_info->recp_list[ICE_SW_LKUP_MAC_VLAN]; + list_for_each_entry_safe(v_list_itr, tmp, v_list, list_entry) { + enum ice_sw_lkup_type l_type = 
v_list_itr->fltr_info.lkup_type; + + if (l_type != ICE_SW_LKUP_MAC_VLAN) + return ICE_ERR_PARAM; + v_list_itr->status = + ice_remove_rule_internal(hw, recp_list, + v_list_itr); + if (v_list_itr->status) + return v_list_itr->status; + } + return 0; +} + + +/** + * ice_remove_mac_vlan - remove a MAC VLAN address based filter rule + * @hw: pointer to the hardware structure + * @mv_list: list of MAC VLAN and forwarding information + */ +enum ice_status +ice_remove_mac_vlan(struct ice_hw *hw, struct list_head *mv_list) +{ + struct ice_sw_recipe *recp_list; + + if (!mv_list || !hw) + return ICE_ERR_PARAM; + + recp_list = &hw->switch_info->recp_list[ICE_SW_LKUP_MAC_VLAN]; + return ice_remove_mac_vlan_rule(hw, mv_list, recp_list); +} + + +/** + * ice_vsi_uses_fltr - Determine if given VSI uses specified filter + * @fm_entry: filter entry to inspect + * @vsi_handle: VSI handle to compare with filter info + */ +static bool +ice_vsi_uses_fltr(struct ice_fltr_mgmt_list_entry *fm_entry, u16 vsi_handle) +{ + return ((fm_entry->fltr_info.fltr_act == ICE_FWD_TO_VSI && + fm_entry->fltr_info.vsi_handle == vsi_handle) || + (fm_entry->fltr_info.fltr_act == ICE_FWD_TO_VSI_LIST && + fm_entry->vsi_list_info && + (test_bit(vsi_handle, fm_entry->vsi_list_info->vsi_map)))); +} + +/** + * ice_add_entry_to_vsi_fltr_list - Add copy of fltr_list_entry to remove list + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to remove filters from + * @vsi_list_head: pointer to the list to add entry to + * @fi: pointer to fltr_info of filter entry to copy & add + * + * Helper function, used when creating a list of filters to remove from + * a specific VSI. The entry added to vsi_list_head is a COPY of the + * original filter entry, with the exception of fltr_info.fltr_act and + * fltr_info.fwd_id fields. These are set such that later logic can + * extract which VSI to remove the fltr from, and pass on that information. + */ +static enum ice_status +ice_add_entry_to_vsi_fltr_list(struct ice_hw *hw, u16 vsi_handle, + struct list_head *vsi_list_head, + struct ice_fltr_info *fi) +{ + struct ice_fltr_list_entry *tmp; + + /* this memory is freed up in the caller function + * once filters for this VSI are removed + */ + tmp = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return ICE_ERR_NO_MEMORY; + + tmp->fltr_info = *fi; + + /* Overwrite these fields to indicate which VSI to remove filter from, + * so find and remove logic can extract the information from the + * list entries. Note that original entries will still have proper + * values. + */ + tmp->fltr_info.fltr_act = ICE_FWD_TO_VSI; + tmp->fltr_info.vsi_handle = vsi_handle; + tmp->fltr_info.fwd_id.hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); + + list_add(&tmp->list_entry, vsi_list_head); + + return 0; +} + +/** + * ice_add_to_vsi_fltr_list - Add VSI filters to the list + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to remove filters from + * @lkup_list_head: pointer to the list that has certain lookup type filters + * @vsi_list_head: pointer to the list pertaining to VSI with vsi_handle + * + * Locates all filters in lkup_list_head that are used by the given VSI, + * and adds COPIES of those entries to vsi_list_head (intended to be used + * to remove the listed filters). + * Note that this means all entries in vsi_list_head must be explicitly + * deallocated by the caller when done with list. 
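The kernel-doc above spells out the contract of ice_add_to_vsi_fltr_list(): COPIES of the matching filters are collected onto a temporary list, and the caller must free every copy once the removals are done. The user-space sketch below shows that collect-then-free pattern with an invented copy_node type; it is not driver code and only models the ownership rule.

#include <stdio.h>
#include <stdlib.h>

/* Invented copy node standing in for the temporary ice_fltr_list_entry. */
struct copy_node {
	int fltr_id;
	struct copy_node *next;
};

static int add_copy(struct copy_node **head, int fltr_id)
{
	struct copy_node *n = malloc(sizeof(*n));

	if (!n)
		return -1;
	n->fltr_id = fltr_id;
	n->next = *head;
	*head = n;
	return 0;
}

int main(void)
{
	struct copy_node *remove_list = NULL, *n;
	int ids[] = { 3, 7, 9 };
	size_t i;

	/* Collect copies of the filters used by the VSI ... */
	for (i = 0; i < sizeof(ids) / sizeof(ids[0]); i++)
		if (add_copy(&remove_list, ids[i]))
			break;

	/* ... then the caller removes them and frees every copy. */
	while ((n = remove_list)) {
		printf("removing filter %d\n", n->fltr_id);
		remove_list = n->next;
		free(n);
	}
	return 0;
}
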
+ */ +static enum ice_status +ice_add_to_vsi_fltr_list(struct ice_hw *hw, u16 vsi_handle, + struct list_head *lkup_list_head, + struct list_head *vsi_list_head) +{ + struct ice_fltr_mgmt_list_entry *fm_entry; + enum ice_status status = 0; + + /* check to make sure VSI ID is valid and within boundary */ + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + list_for_each_entry(fm_entry, lkup_list_head, list_entry) { + if (!ice_vsi_uses_fltr(fm_entry, vsi_handle)) + continue; + + status = ice_add_entry_to_vsi_fltr_list(hw, vsi_handle, + vsi_list_head, + &fm_entry->fltr_info); + if (status) + return status; + } + return status; +} + + +/** + * ice_determine_promisc_mask + * @fi: filter info to parse + * + * Helper function to determine which ICE_PROMISC_ mask corresponds + * to given filter into. + */ +static u8 ice_determine_promisc_mask(struct ice_fltr_info *fi) +{ + u16 vid = fi->l_data.mac_vlan.vlan_id; + u8 *macaddr = fi->l_data.mac.mac_addr; + bool is_tx_fltr = false; + u8 promisc_mask = 0; + + if (fi->flag == ICE_FLTR_TX) + is_tx_fltr = true; + + if (is_broadcast_ether_addr(macaddr)) + promisc_mask |= is_tx_fltr ? + ICE_PROMISC_BCAST_TX : ICE_PROMISC_BCAST_RX; + else if (is_multicast_ether_addr(macaddr)) + promisc_mask |= is_tx_fltr ? + ICE_PROMISC_MCAST_TX : ICE_PROMISC_MCAST_RX; + else if (is_unicast_ether_addr(macaddr)) + promisc_mask |= is_tx_fltr ? + ICE_PROMISC_UCAST_TX : ICE_PROMISC_UCAST_RX; + if (vid) + promisc_mask |= is_tx_fltr ? + ICE_PROMISC_VLAN_TX : ICE_PROMISC_VLAN_RX; - /* Create the switch rule with the final dummy Ethernet header */ - if (opc != ice_aqc_opc_update_sw_rules) - s_rule->pdata.lkup_tx_rx.hdr_len = cpu_to_le16(eth_hdr_sz); + return promisc_mask; } + /** - * ice_add_marker_act + * _ice_get_vsi_promisc - get promiscuous mode of given VSI * @hw: pointer to the hardware structure - * @m_ent: the management entry for which sw marker needs to be added - * @sw_marker: sw marker to tag the Rx descriptor with - * @l_id: large action resource ID - * - * Create a large action to hold software marker and update the switch rule - * entry pointed by m_ent with newly created large action + * @vsi_handle: VSI handle to retrieve info from + * @promisc_mask: pointer to mask to be filled in + * @vid: VLAN ID of promisc VLAN VSI + * @sw: pointer to switch info struct for which function add rule */ static enum ice_status -ice_add_marker_act(struct ice_hw *hw, struct ice_fltr_mgmt_list_entry *m_ent, - u16 sw_marker, u16 l_id) +_ice_get_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 *promisc_mask, + u16 *vid, struct ice_switch_info *sw) { - struct ice_aqc_sw_rules_elem *lg_act, *rx_tx; - /* For software marker we need 3 large actions - * 1. FWD action: FWD TO VSI or VSI LIST - * 2. GENERIC VALUE action to hold the profile ID - * 3. GENERIC VALUE action to hold the software marker ID - */ - const u16 num_lg_acts = 3; - enum ice_status status; - u16 lg_act_size; - u16 rules_size; - u32 act; - u16 id; + struct ice_fltr_mgmt_list_entry *itr; + struct list_head *rule_head; + struct mutex *rule_lock; /* Lock to protect filter rule list */ - if (m_ent->fltr_info.lkup_type != ICE_SW_LKUP_MAC) + if (!ice_is_vsi_valid(hw, vsi_handle)) return ICE_ERR_PARAM; - /* Create two back-to-back switch rules and submit them to the HW using - * one memory buffer: - * 1. Large Action - * 2. 
Look up Tx Rx - */ - lg_act_size = (u16)ICE_SW_RULE_LG_ACT_SIZE(num_lg_acts); - rules_size = lg_act_size + ICE_SW_RULE_RX_TX_ETH_HDR_SIZE; - lg_act = devm_kzalloc(ice_hw_to_dev(hw), rules_size, GFP_KERNEL); - if (!lg_act) - return ICE_ERR_NO_MEMORY; - - rx_tx = (struct ice_aqc_sw_rules_elem *)((u8 *)lg_act + lg_act_size); - - /* Fill in the first switch rule i.e. large action */ - lg_act->type = cpu_to_le16(ICE_AQC_SW_RULES_T_LG_ACT); - lg_act->pdata.lg_act.index = cpu_to_le16(l_id); - lg_act->pdata.lg_act.size = cpu_to_le16(num_lg_acts); - - /* First action VSI forwarding or VSI list forwarding depending on how - * many VSIs - */ - id = (m_ent->vsi_count > 1) ? m_ent->fltr_info.fwd_id.vsi_list_id : - m_ent->fltr_info.fwd_id.hw_vsi_id; - - act = ICE_LG_ACT_VSI_FORWARDING | ICE_LG_ACT_VALID_BIT; - act |= (id << ICE_LG_ACT_VSI_LIST_ID_S) & - ICE_LG_ACT_VSI_LIST_ID_M; - if (m_ent->vsi_count > 1) - act |= ICE_LG_ACT_VSI_LIST; - lg_act->pdata.lg_act.act[0] = cpu_to_le32(act); + *vid = 0; + *promisc_mask = 0; + rule_head = &sw->recp_list[ICE_SW_LKUP_PROMISC].filt_rules; + rule_lock = &sw->recp_list[ICE_SW_LKUP_PROMISC].filt_rule_lock; - /* Second action descriptor type */ - act = ICE_LG_ACT_GENERIC; + mutex_lock(rule_lock); + list_for_each_entry(itr, rule_head, list_entry) { + /* Continue if this filter doesn't apply to this VSI or the + * VSI ID is not in the VSI map for this filter + */ + if (!ice_vsi_uses_fltr(itr, vsi_handle)) + continue; - act |= (1 << ICE_LG_ACT_GENERIC_VALUE_S) & ICE_LG_ACT_GENERIC_VALUE_M; - lg_act->pdata.lg_act.act[1] = cpu_to_le32(act); + *promisc_mask |= ice_determine_promisc_mask(&itr->fltr_info); + } + mutex_unlock(rule_lock); - act = (ICE_LG_ACT_GENERIC_OFF_RX_DESC_PROF_IDX << - ICE_LG_ACT_GENERIC_OFFSET_S) & ICE_LG_ACT_GENERIC_OFFSET_M; + return 0; +} - /* Third action Marker value */ - act |= ICE_LG_ACT_GENERIC; - act |= (sw_marker << ICE_LG_ACT_GENERIC_VALUE_S) & - ICE_LG_ACT_GENERIC_VALUE_M; +/** + * ice_get_vsi_promisc - get promiscuous mode of given VSI + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to retrieve info from + * @promisc_mask: pointer to mask to be filled in + * @vid: VLAN ID of promisc VLAN VSI + */ +enum ice_status +ice_get_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 *promisc_mask, + u16 *vid) +{ + return _ice_get_vsi_promisc(hw, vsi_handle, promisc_mask, + vid, hw->switch_info); +} - lg_act->pdata.lg_act.act[2] = cpu_to_le32(act); +/** + * _ice_get_vsi_vlan_promisc - get VLAN promiscuous mode of given VSI + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to retrieve info from + * @promisc_mask: pointer to mask to be filled in + * @vid: VLAN ID of promisc VLAN VSI + * @sw: pointer to switch info struct for which function add rule + */ +static enum ice_status +_ice_get_vsi_vlan_promisc(struct ice_hw *hw, u16 vsi_handle, u8 *promisc_mask, + u16 *vid, struct ice_switch_info *sw) +{ + struct ice_fltr_mgmt_list_entry *itr; + struct list_head *rule_head; + struct mutex *rule_lock; /* Lock to protect filter rule list */ - /* call the fill switch rule to fill the lookup Tx Rx structure */ - ice_fill_sw_rule(hw, &m_ent->fltr_info, rx_tx, - ice_aqc_opc_update_sw_rules); + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; - /* Update the action to point to the large action ID */ - rx_tx->pdata.lkup_tx_rx.act = - cpu_to_le32(ICE_SINGLE_ACT_PTR | - ((l_id << ICE_SINGLE_ACT_PTR_VAL_S) & - ICE_SINGLE_ACT_PTR_VAL_M)); + *vid = 0; + *promisc_mask = 0; + rule_head = 
&sw->recp_list[ICE_SW_LKUP_PROMISC_VLAN].filt_rules; + rule_lock = &sw->recp_list[ICE_SW_LKUP_PROMISC_VLAN].filt_rule_lock; - /* Use the filter rule ID of the previously created rule with single - * act. Once the update happens, hardware will treat this as large - * action - */ - rx_tx->pdata.lkup_tx_rx.index = - cpu_to_le16(m_ent->fltr_info.fltr_rule_id); + mutex_lock(rule_lock); + list_for_each_entry(itr, rule_head, list_entry) { + /* Continue if this filter doesn't apply to this VSI or the + * VSI ID is not in the VSI map for this filter + */ + if (!ice_vsi_uses_fltr(itr, vsi_handle)) + continue; - status = ice_aq_sw_rules(hw, lg_act, rules_size, 2, - ice_aqc_opc_update_sw_rules, NULL); - if (!status) { - m_ent->lg_act_idx = l_id; - m_ent->sw_marker_id = sw_marker; + *promisc_mask |= ice_determine_promisc_mask(&itr->fltr_info); } + mutex_unlock(rule_lock); - devm_kfree(ice_hw_to_dev(hw), lg_act); - return status; + return 0; } /** - * ice_create_vsi_list_map + * ice_get_vsi_vlan_promisc - get VLAN promiscuous mode of given VSI * @hw: pointer to the hardware structure - * @vsi_handle_arr: array of VSI handles to set in the VSI mapping - * @num_vsi: number of VSI handles in the array - * @vsi_list_id: VSI list ID generated as part of allocate resource - * - * Helper function to create a new entry of VSI list ID to VSI mapping - * using the given VSI list ID + * @vsi_handle: VSI handle to retrieve info from + * @promisc_mask: pointer to mask to be filled in + * @vid: VLAN ID of promisc VLAN VSI */ -static struct ice_vsi_list_map_info * -ice_create_vsi_list_map(struct ice_hw *hw, u16 *vsi_handle_arr, u16 num_vsi, - u16 vsi_list_id) +enum ice_status +ice_get_vsi_vlan_promisc(struct ice_hw *hw, u16 vsi_handle, u8 *promisc_mask, + u16 *vid) { - struct ice_switch_info *sw = hw->switch_info; - struct ice_vsi_list_map_info *v_map; - int i; - - v_map = devm_kcalloc(ice_hw_to_dev(hw), 1, sizeof(*v_map), GFP_KERNEL); - if (!v_map) - return NULL; + return _ice_get_vsi_vlan_promisc(hw, vsi_handle, promisc_mask, + vid, hw->switch_info); +} - v_map->vsi_list_id = vsi_list_id; - v_map->ref_cnt = 1; - for (i = 0; i < num_vsi; i++) - set_bit(vsi_handle_arr[i], v_map->vsi_map); +/** + * ice_remove_promisc - Remove promisc based filter rules + * @hw: pointer to the hardware structure + * @recp_id: recipe ID for which the rule needs to removed + * @v_list: list of promisc entries + */ +static enum ice_status +ice_remove_promisc(struct ice_hw *hw, u8 recp_id, + struct list_head *v_list) +{ + struct ice_fltr_list_entry *v_list_itr, *tmp; + struct ice_sw_recipe *recp_list; - list_add(&v_map->list_entry, &sw->vsi_list_map_head); - return v_map; + recp_list = &hw->switch_info->recp_list[recp_id]; + list_for_each_entry_safe(v_list_itr, tmp, v_list, list_entry) { + v_list_itr->status = + ice_remove_rule_internal(hw, recp_list, v_list_itr); + if (v_list_itr->status) + return v_list_itr->status; + } + return 0; } /** - * ice_update_vsi_list_rule + * _ice_clear_vsi_promisc - clear specified promiscuous mode(s) * @hw: pointer to the hardware structure - * @vsi_handle_arr: array of VSI handles to form a VSI list - * @num_vsi: number of VSI handles in the array - * @vsi_list_id: VSI list ID generated as part of allocate resource - * @remove: Boolean value to indicate if this is a remove action - * @opc: switch rules population command type - pass in the command opcode - * @lkup_type: lookup type of the filter - * - * Call AQ command to add a new switch rule or update existing switch rule - * using the given VSI list ID + 
* @vsi_handle: VSI handle to clear mode + * @promisc_mask: mask of promiscuous config bits to clear + * @vid: VLAN ID to clear VLAN promiscuous + * @sw: pointer to switch info struct for which function add rule */ static enum ice_status -ice_update_vsi_list_rule(struct ice_hw *hw, u16 *vsi_handle_arr, u16 num_vsi, - u16 vsi_list_id, bool remove, enum ice_adminq_opc opc, - enum ice_sw_lkup_type lkup_type) +_ice_clear_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, + u16 vid, struct ice_switch_info *sw) { - struct ice_aqc_sw_rules_elem *s_rule; - enum ice_status status; - u16 s_rule_size; - u16 type; - int i; + struct ice_fltr_list_entry *fm_entry, *tmp; + struct list_head remove_list_head; + struct ice_fltr_mgmt_list_entry *itr; + struct list_head *rule_head; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + enum ice_status status = 0; + u8 recipe_id; - if (!num_vsi) + if (!ice_is_vsi_valid(hw, vsi_handle)) return ICE_ERR_PARAM; - if (lkup_type == ICE_SW_LKUP_MAC || - lkup_type == ICE_SW_LKUP_MAC_VLAN || - lkup_type == ICE_SW_LKUP_ETHERTYPE || - lkup_type == ICE_SW_LKUP_ETHERTYPE_MAC || - lkup_type == ICE_SW_LKUP_PROMISC || - lkup_type == ICE_SW_LKUP_PROMISC_VLAN) - type = remove ? ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR : - ICE_AQC_SW_RULES_T_VSI_LIST_SET; - else if (lkup_type == ICE_SW_LKUP_VLAN) - type = remove ? ICE_AQC_SW_RULES_T_PRUNE_LIST_CLEAR : - ICE_AQC_SW_RULES_T_PRUNE_LIST_SET; + if (promisc_mask & (ICE_PROMISC_VLAN_RX | ICE_PROMISC_VLAN_TX)) + recipe_id = ICE_SW_LKUP_PROMISC_VLAN; else - return ICE_ERR_PARAM; + recipe_id = ICE_SW_LKUP_PROMISC; - s_rule_size = (u16)ICE_SW_RULE_VSI_LIST_SIZE(num_vsi); - s_rule = devm_kzalloc(ice_hw_to_dev(hw), s_rule_size, GFP_KERNEL); - if (!s_rule) - return ICE_ERR_NO_MEMORY; - for (i = 0; i < num_vsi; i++) { - if (!ice_is_vsi_valid(hw, vsi_handle_arr[i])) { - status = ICE_ERR_PARAM; - goto exit; + rule_head = &sw->recp_list[recipe_id].filt_rules; + rule_lock = &sw->recp_list[recipe_id].filt_rule_lock; + + INIT_LIST_HEAD(&remove_list_head); + + mutex_lock(rule_lock); + list_for_each_entry(itr, rule_head, list_entry) { + struct ice_fltr_info *fltr_info; + u8 fltr_promisc_mask = 0; + + if (!ice_vsi_uses_fltr(itr, vsi_handle)) + continue; + fltr_info = &itr->fltr_info; + + if (recipe_id == ICE_SW_LKUP_PROMISC_VLAN && + vid != fltr_info->l_data.mac_vlan.vlan_id) + continue; + + fltr_promisc_mask |= ice_determine_promisc_mask(fltr_info); + + /* Skip if filter is not completely specified by given mask */ + if (fltr_promisc_mask & ~promisc_mask) + continue; + + status = ice_add_entry_to_vsi_fltr_list(hw, vsi_handle, + &remove_list_head, + fltr_info); + if (status) { + mutex_unlock(rule_lock); + goto free_fltr_list; } - /* AQ call requires hw_vsi_id(s) */ - s_rule->pdata.vsi_list.vsi[i] = - cpu_to_le16(ice_get_hw_vsi_num(hw, vsi_handle_arr[i])); } + mutex_unlock(rule_lock); - s_rule->type = cpu_to_le16(type); - s_rule->pdata.vsi_list.number_vsi = cpu_to_le16(num_vsi); - s_rule->pdata.vsi_list.index = cpu_to_le16(vsi_list_id); + status = ice_remove_promisc(hw, recipe_id, &remove_list_head); - status = ice_aq_sw_rules(hw, s_rule, s_rule_size, 1, opc, NULL); +free_fltr_list: + list_for_each_entry_safe(fm_entry, tmp, &remove_list_head, list_entry) { + list_del(&fm_entry->list_entry); + devm_kfree(ice_hw_to_dev(hw), fm_entry); + } -exit: - devm_kfree(ice_hw_to_dev(hw), s_rule); return status; } /** - * ice_create_vsi_list_rule - Creates and populates a VSI list rule - * @hw: pointer to the HW struct - * @vsi_handle_arr: array of 
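Illustrative user-space sketch (not part of the patch): _ice_clear_vsi_promisc() above only queues a filter for removal when every promiscuous bit that filter represents is covered by the caller's mask, which is what the (fltr_promisc_mask & ~promisc_mask) test expresses. The DEMO_* flag values below are made up and do not reflect the real ICE_PROMISC_* encoding.

#include <stdio.h>
#include <stdint.h>

/* Stand-in flag values; the real ICE_PROMISC_* encoding differs. */
#define DEMO_UCAST_RX 0x01u
#define DEMO_UCAST_TX 0x02u
#define DEMO_MCAST_RX 0x04u

/* A filter may be cleared only if all of its bits were requested. */
static int fully_covered(uint8_t fltr_mask, uint8_t req_mask)
{
        return (fltr_mask & ~req_mask) == 0;
}

int main(void)
{
        uint8_t req = DEMO_UCAST_RX | DEMO_UCAST_TX;

        /* prints 1: the filter only carries bits that were requested */
        printf("%d\n", fully_covered(DEMO_UCAST_RX, req));
        /* prints 0: the filter also carries MCAST_RX, which was not requested */
        printf("%d\n", fully_covered(DEMO_UCAST_RX | DEMO_MCAST_RX, req));
        return 0;
}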
VSI handles to form a VSI list - * @num_vsi: number of VSI handles in the array - * @vsi_list_id: stores the ID of the VSI list to be created - * @lkup_type: switch rule filter's lookup type + * ice_clear_vsi_promisc - clear specified promiscuous mode(s) for given VSI + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to clear mode + * @promisc_mask: mask of promiscuous config bits to clear + * @vid: VLAN ID to clear VLAN promiscuous */ -static enum ice_status -ice_create_vsi_list_rule(struct ice_hw *hw, u16 *vsi_handle_arr, u16 num_vsi, - u16 *vsi_list_id, enum ice_sw_lkup_type lkup_type) +enum ice_status +ice_clear_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, + u8 promisc_mask, u16 vid) { - enum ice_status status; - - status = ice_aq_alloc_free_vsi_list(hw, vsi_list_id, lkup_type, - ice_aqc_opc_alloc_res); - if (status) - return status; - - /* Update the newly created VSI list to include the specified VSIs */ - return ice_update_vsi_list_rule(hw, vsi_handle_arr, num_vsi, - *vsi_list_id, false, - ice_aqc_opc_add_sw_rules, lkup_type); + return _ice_clear_vsi_promisc(hw, vsi_handle, promisc_mask, + vid, hw->switch_info); } /** - * ice_create_pkt_fwd_rule + * _ice_set_vsi_promisc - set given VSI to given promiscuous mode(s) * @hw: pointer to the hardware structure - * @f_entry: entry containing packet forwarding information - * - * Create switch rule with given filter information and add an entry - * to the corresponding filter management list to track this switch rule - * and VSI mapping + * @vsi_handle: VSI handle to configure + * @promisc_mask: mask of promiscuous config bits + * @vid: VLAN ID to set VLAN promiscuous + * @lport: logical port number to configure promisc mode + * @sw: pointer to switch info struct for which function add rule */ static enum ice_status -ice_create_pkt_fwd_rule(struct ice_hw *hw, - struct ice_fltr_list_entry *f_entry) +_ice_set_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, + u16 vid, u8 lport, struct ice_switch_info *sw) { - struct ice_fltr_mgmt_list_entry *fm_entry; - struct ice_aqc_sw_rules_elem *s_rule; - enum ice_sw_lkup_type l_type; - struct ice_sw_recipe *recp; - enum ice_status status; + enum { UCAST_FLTR = 1, MCAST_FLTR, BCAST_FLTR }; + struct ice_fltr_list_entry f_list_entry; + struct ice_fltr_info new_fltr; + enum ice_status status = 0; + bool is_tx_fltr; + u16 hw_vsi_id; + int pkt_type; + u8 recipe_id; - s_rule = devm_kzalloc(ice_hw_to_dev(hw), - ICE_SW_RULE_RX_TX_ETH_HDR_SIZE, GFP_KERNEL); - if (!s_rule) - return ICE_ERR_NO_MEMORY; - fm_entry = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*fm_entry), - GFP_KERNEL); - if (!fm_entry) { - status = ICE_ERR_NO_MEMORY; - goto ice_create_pkt_fwd_rule_exit; + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); + + memset(&new_fltr, 0, sizeof(new_fltr)); + + if (promisc_mask & (ICE_PROMISC_VLAN_RX | ICE_PROMISC_VLAN_TX)) { + new_fltr.lkup_type = ICE_SW_LKUP_PROMISC_VLAN; + new_fltr.l_data.mac_vlan.vlan_id = vid; + recipe_id = ICE_SW_LKUP_PROMISC_VLAN; + } else { + new_fltr.lkup_type = ICE_SW_LKUP_PROMISC; + recipe_id = ICE_SW_LKUP_PROMISC; } - fm_entry->fltr_info = f_entry->fltr_info; + /* Separate filters must be set for each direction/packet type + * combination, so we will loop over the mask value, store the + * individual type, and clear it out in the input mask as it + * is found. 
+ */ + while (promisc_mask) { + struct ice_sw_recipe *recp_list; + u8 *mac_addr; - /* Initialize all the fields for the management entry */ - fm_entry->vsi_count = 1; - fm_entry->lg_act_idx = ICE_INVAL_LG_ACT_INDEX; - fm_entry->sw_marker_id = ICE_INVAL_SW_MARKER_ID; - fm_entry->counter_index = ICE_INVAL_COUNTER_ID; + pkt_type = 0; + is_tx_fltr = false; - ice_fill_sw_rule(hw, &fm_entry->fltr_info, s_rule, - ice_aqc_opc_add_sw_rules); + if (promisc_mask & ICE_PROMISC_UCAST_RX) { + promisc_mask &= ~ICE_PROMISC_UCAST_RX; + pkt_type = UCAST_FLTR; + } else if (promisc_mask & ICE_PROMISC_UCAST_TX) { + promisc_mask &= ~ICE_PROMISC_UCAST_TX; + pkt_type = UCAST_FLTR; + is_tx_fltr = true; + } else if (promisc_mask & ICE_PROMISC_MCAST_RX) { + promisc_mask &= ~ICE_PROMISC_MCAST_RX; + pkt_type = MCAST_FLTR; + } else if (promisc_mask & ICE_PROMISC_MCAST_TX) { + promisc_mask &= ~ICE_PROMISC_MCAST_TX; + pkt_type = MCAST_FLTR; + is_tx_fltr = true; + } else if (promisc_mask & ICE_PROMISC_BCAST_RX) { + promisc_mask &= ~ICE_PROMISC_BCAST_RX; + pkt_type = BCAST_FLTR; + } else if (promisc_mask & ICE_PROMISC_BCAST_TX) { + promisc_mask &= ~ICE_PROMISC_BCAST_TX; + pkt_type = BCAST_FLTR; + is_tx_fltr = true; + } - status = ice_aq_sw_rules(hw, s_rule, ICE_SW_RULE_RX_TX_ETH_HDR_SIZE, 1, - ice_aqc_opc_add_sw_rules, NULL); - if (status) { - devm_kfree(ice_hw_to_dev(hw), fm_entry); - goto ice_create_pkt_fwd_rule_exit; - } + /* Check for VLAN promiscuous flag */ + if (promisc_mask & ICE_PROMISC_VLAN_RX) { + promisc_mask &= ~ICE_PROMISC_VLAN_RX; + } else if (promisc_mask & ICE_PROMISC_VLAN_TX) { + promisc_mask &= ~ICE_PROMISC_VLAN_TX; + is_tx_fltr = true; + } - f_entry->fltr_info.fltr_rule_id = - le16_to_cpu(s_rule->pdata.lkup_tx_rx.index); - fm_entry->fltr_info.fltr_rule_id = - le16_to_cpu(s_rule->pdata.lkup_tx_rx.index); + /* Set filter DA based on packet type */ + mac_addr = new_fltr.l_data.mac.mac_addr; + if (pkt_type == BCAST_FLTR) { + eth_broadcast_addr(mac_addr); + } else if (pkt_type == MCAST_FLTR || + pkt_type == UCAST_FLTR) { + /* Use the dummy ether header DA */ + ether_addr_copy(mac_addr, dummy_eth_header); + if (pkt_type == MCAST_FLTR) + mac_addr[0] |= 0x1; /* Set multicast bit */ + } - /* The book keeping entries will get removed when base driver - * calls remove filter AQ command - */ - l_type = fm_entry->fltr_info.lkup_type; - recp = &hw->switch_info->recp_list[l_type]; - list_add(&fm_entry->list_entry, &recp->filt_rules); + /* Need to reset this to zero for all iterations */ + new_fltr.flag = 0; + if (is_tx_fltr) { + new_fltr.flag |= ICE_FLTR_TX; + new_fltr.src = hw_vsi_id; + } else { + new_fltr.flag |= ICE_FLTR_RX; + new_fltr.src = lport; + } -ice_create_pkt_fwd_rule_exit: - devm_kfree(ice_hw_to_dev(hw), s_rule); + new_fltr.fltr_act = ICE_FWD_TO_VSI; + new_fltr.vsi_handle = vsi_handle; + new_fltr.fwd_id.hw_vsi_id = hw_vsi_id; + f_list_entry.fltr_info = new_fltr; + recp_list = &sw->recp_list[recipe_id]; + + status = ice_add_rule_internal(hw, recp_list, lport, + &f_list_entry); + if (status) + goto set_promisc_exit; + } + +set_promisc_exit: return status; } /** - * ice_update_pkt_fwd_rule + * ice_set_vsi_promisc - set given VSI to given promiscuous mode(s) * @hw: pointer to the hardware structure - * @f_info: filter information for switch rule - * - * Call AQ command to update a previously created switch rule with a - * VSI list ID + * @vsi_handle: VSI handle to configure + * @promisc_mask: mask of promiscuous config bits + * @vid: VLAN ID to set VLAN promiscuous */ -static enum ice_status 
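Illustrative user-space sketch (not part of the patch): the while (promisc_mask) loop above peels exactly one packet-type/direction bit per pass and clears it from the mask, so one filter is built per requested combination and the loop always terminates. The P_* flag values below are stand-ins, not the real ICE_PROMISC_* bits.

#include <stdio.h>
#include <stdint.h>

#define P_UCAST_RX 0x01u
#define P_UCAST_TX 0x02u
#define P_MCAST_RX 0x04u
#define P_MCAST_TX 0x08u
#define P_BCAST_RX 0x10u
#define P_BCAST_TX 0x20u

int main(void)
{
        uint8_t mask = P_UCAST_RX | P_MCAST_TX | P_BCAST_RX;

        /* Peel one packet-type/direction bit per pass so that exactly
         * one filter is produced per requested combination.
         */
        while (mask) {
                const char *type = "?", *dir = "rx";

                if (mask & P_UCAST_RX) {
                        mask &= ~P_UCAST_RX; type = "ucast";
                } else if (mask & P_UCAST_TX) {
                        mask &= ~P_UCAST_TX; type = "ucast"; dir = "tx";
                } else if (mask & P_MCAST_RX) {
                        mask &= ~P_MCAST_RX; type = "mcast";
                } else if (mask & P_MCAST_TX) {
                        mask &= ~P_MCAST_TX; type = "mcast"; dir = "tx";
                } else if (mask & P_BCAST_RX) {
                        mask &= ~P_BCAST_RX; type = "bcast";
                } else if (mask & P_BCAST_TX) {
                        mask &= ~P_BCAST_TX; type = "bcast"; dir = "tx";
                } else {
                        break; /* unknown bit, stop */
                }
                printf("add %s %s promiscuous filter\n", type, dir);
        }
        return 0;
}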
-ice_update_pkt_fwd_rule(struct ice_hw *hw, struct ice_fltr_info *f_info) +enum ice_status +ice_set_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, + u16 vid) { - struct ice_aqc_sw_rules_elem *s_rule; - enum ice_status status; - - s_rule = devm_kzalloc(ice_hw_to_dev(hw), - ICE_SW_RULE_RX_TX_ETH_HDR_SIZE, GFP_KERNEL); - if (!s_rule) - return ICE_ERR_NO_MEMORY; - - ice_fill_sw_rule(hw, f_info, s_rule, ice_aqc_opc_update_sw_rules); - - s_rule->pdata.lkup_tx_rx.index = cpu_to_le16(f_info->fltr_rule_id); - - /* Update switch rule with new rule set to forward VSI list */ - status = ice_aq_sw_rules(hw, s_rule, ICE_SW_RULE_RX_TX_ETH_HDR_SIZE, 1, - ice_aqc_opc_update_sw_rules, NULL); - - devm_kfree(ice_hw_to_dev(hw), s_rule); - return status; + return _ice_set_vsi_promisc(hw, vsi_handle, promisc_mask, vid, + hw->port_info->lport, + hw->switch_info); } /** - * ice_update_sw_rule_bridge_mode - * @hw: pointer to the HW struct + * _ice_set_vlan_vsi_promisc + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to configure + * @promisc_mask: mask of promiscuous config bits + * @rm_vlan_promisc: Clear VLANs VSI promisc mode + * @lport: logical port number to configure promisc mode + * @sw: pointer to switch info struct for which function add rule * - * Updates unicast switch filter rules based on VEB/VEPA mode + * Configure VSI with all associated VLANs to given promiscuous mode(s) */ -enum ice_status ice_update_sw_rule_bridge_mode(struct ice_hw *hw) +static enum ice_status +_ice_set_vlan_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, + bool rm_vlan_promisc, u8 lport, + struct ice_switch_info *sw) { - struct ice_switch_info *sw = hw->switch_info; - struct ice_fltr_mgmt_list_entry *fm_entry; - enum ice_status status = 0; - struct list_head *rule_head; - struct mutex *rule_lock; /* Lock to protect filter rule list */ - - rule_lock = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rule_lock; - rule_head = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rules; + struct ice_fltr_list_entry *list_itr, *tmp; + struct list_head vsi_list_head; + struct list_head *vlan_head; + struct mutex *vlan_lock; /* Lock to protect filter rule list */ + enum ice_status status; + u16 vlan_id; - mutex_lock(rule_lock); - list_for_each_entry(fm_entry, rule_head, list_entry) { - struct ice_fltr_info *fi = &fm_entry->fltr_info; - u8 *addr = fi->l_data.mac.mac_addr; + INIT_LIST_HEAD(&vsi_list_head); + vlan_lock = &sw->recp_list[ICE_SW_LKUP_VLAN].filt_rule_lock; + vlan_head = &sw->recp_list[ICE_SW_LKUP_VLAN].filt_rules; + mutex_lock(vlan_lock); + status = ice_add_to_vsi_fltr_list(hw, vsi_handle, vlan_head, + &vsi_list_head); + mutex_unlock(vlan_lock); + if (status) + goto free_fltr_list; - /* Update unicast Tx rules to reflect the selected - * VEB/VEPA mode - */ - if ((fi->flag & ICE_FLTR_TX) && is_unicast_ether_addr(addr) && - (fi->fltr_act == ICE_FWD_TO_VSI || - fi->fltr_act == ICE_FWD_TO_VSI_LIST || - fi->fltr_act == ICE_FWD_TO_Q || - fi->fltr_act == ICE_FWD_TO_QGRP)) { - status = ice_update_pkt_fwd_rule(hw, fi); - if (status) - break; - } + list_for_each_entry(list_itr, &vsi_list_head, list_entry) { + vlan_id = list_itr->fltr_info.l_data.vlan.vlan_id; + if (rm_vlan_promisc) + status = _ice_clear_vsi_promisc(hw, vsi_handle, + promisc_mask, + vlan_id, sw); + else + status = _ice_set_vsi_promisc(hw, vsi_handle, + promisc_mask, vlan_id, + lport, sw); + if (status) + break; } - mutex_unlock(rule_lock); - +free_fltr_list: + list_for_each_entry_safe(list_itr, tmp, &vsi_list_head, list_entry) { + 
list_del(&list_itr->list_entry); + devm_kfree(ice_hw_to_dev(hw), list_itr); + } return status; } /** - * ice_add_update_vsi_list + * ice_set_vlan_vsi_promisc * @hw: pointer to the hardware structure - * @m_entry: pointer to current filter management list entry - * @cur_fltr: filter information from the book keeping entry - * @new_fltr: filter information with the new VSI to be added - * - * Call AQ command to add or update previously created VSI list with new VSI. + * @vsi_handle: VSI handle to configure + * @promisc_mask: mask of promiscuous config bits + * @rm_vlan_promisc: Clear VLANs VSI promisc mode * - * Helper function to do book keeping associated with adding filter information - * The algorithm to do the book keeping is described below : - * When a VSI needs to subscribe to a given filter (MAC/VLAN/Ethtype etc.) - * if only one VSI has been added till now - * Allocate a new VSI list and add two VSIs - * to this list using switch rule command - * Update the previously created switch rule with the - * newly created VSI list ID - * if a VSI list was previously created - * Add the new VSI to the previously created VSI list set - * using the update switch rule command + * Configure VSI with all associated VLANs to given promiscuous mode(s) */ -static enum ice_status -ice_add_update_vsi_list(struct ice_hw *hw, - struct ice_fltr_mgmt_list_entry *m_entry, - struct ice_fltr_info *cur_fltr, - struct ice_fltr_info *new_fltr) +enum ice_status +ice_set_vlan_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, + bool rm_vlan_promisc) { - enum ice_status status = 0; - u16 vsi_list_id = 0; - - if ((cur_fltr->fltr_act == ICE_FWD_TO_Q || - cur_fltr->fltr_act == ICE_FWD_TO_QGRP)) - return ICE_ERR_NOT_IMPL; - - if ((new_fltr->fltr_act == ICE_FWD_TO_Q || - new_fltr->fltr_act == ICE_FWD_TO_QGRP) && - (cur_fltr->fltr_act == ICE_FWD_TO_VSI || - cur_fltr->fltr_act == ICE_FWD_TO_VSI_LIST)) - return ICE_ERR_NOT_IMPL; - - if (m_entry->vsi_count < 2 && !m_entry->vsi_list_info) { - /* Only one entry existed in the mapping and it was not already - * a part of a VSI list. So, create a VSI list with the old and - * new VSIs. 
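Illustrative user-space sketch (not part of the patch): _ice_set_vlan_vsi_promisc() above (and ice_remove_vsi_lkup_fltr() below) follow the same gather, apply, free pattern: snapshot the matching entries into a private list while holding the rule lock, act on them after dropping the lock, then free the snapshot with a deletion-safe walk, as list_for_each_entry_safe() does. The toy singly-linked list below stands in for the kernel list helpers.

#include <stdio.h>
#include <stdlib.h>

/* Minimal list node standing in for ice_fltr_list_entry. */
struct node {
        int vlan_id;
        struct node *next;
};

static struct node *push(struct node *head, int vlan_id)
{
        struct node *n = malloc(sizeof(*n));

        if (!n)
                return head;
        n->vlan_id = vlan_id;
        n->next = head;
        return n;
}

int main(void)
{
        struct node *head = NULL, *cur, *tmp;

        /* 1) gather: snapshot the VLANs of interest into a private list */
        head = push(head, 10);
        head = push(head, 20);
        head = push(head, 30);

        /* 2) apply: act on each entry (set or clear promisc per VLAN) */
        for (cur = head; cur; cur = cur->next)
                printf("configure promisc for VLAN %d\n", cur->vlan_id);

        /* 3) free: keep the next pointer before freeing the node, which
         * is the deletion-safe walk the kernel helper provides.
         */
        for (cur = head; cur; cur = tmp) {
                tmp = cur->next;
                free(cur);
        }
        return 0;
}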
- */ - struct ice_fltr_info tmp_fltr; - u16 vsi_handle_arr[2]; - - /* A rule already exists with the new VSI being added */ - if (cur_fltr->fwd_id.hw_vsi_id == new_fltr->fwd_id.hw_vsi_id) - return ICE_ERR_ALREADY_EXISTS; - - vsi_handle_arr[0] = cur_fltr->vsi_handle; - vsi_handle_arr[1] = new_fltr->vsi_handle; - status = ice_create_vsi_list_rule(hw, &vsi_handle_arr[0], 2, - &vsi_list_id, - new_fltr->lkup_type); - if (status) - return status; - - tmp_fltr = *new_fltr; - tmp_fltr.fltr_rule_id = cur_fltr->fltr_rule_id; - tmp_fltr.fltr_act = ICE_FWD_TO_VSI_LIST; - tmp_fltr.fwd_id.vsi_list_id = vsi_list_id; - /* Update the previous switch rule of "MAC forward to VSI" to - * "MAC fwd to VSI list" - */ - status = ice_update_pkt_fwd_rule(hw, &tmp_fltr); - if (status) - return status; - - cur_fltr->fwd_id.vsi_list_id = vsi_list_id; - cur_fltr->fltr_act = ICE_FWD_TO_VSI_LIST; - m_entry->vsi_list_info = - ice_create_vsi_list_map(hw, &vsi_handle_arr[0], 2, - vsi_list_id); - - if (!m_entry->vsi_list_info) - return ICE_ERR_NO_MEMORY; - - /* If this entry was large action then the large action needs - * to be updated to point to FWD to VSI list - */ - if (m_entry->sw_marker_id != ICE_INVAL_SW_MARKER_ID) - status = - ice_add_marker_act(hw, m_entry, - m_entry->sw_marker_id, - m_entry->lg_act_idx); - } else { - u16 vsi_handle = new_fltr->vsi_handle; - enum ice_adminq_opc opcode; + return _ice_set_vlan_vsi_promisc(hw, vsi_handle, promisc_mask, + rm_vlan_promisc, hw->port_info->lport, + hw->switch_info); +} - if (!m_entry->vsi_list_info) - return ICE_ERR_CFG; +/** + * ice_remove_vsi_lkup_fltr - Remove lookup type filters for a VSI + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to remove filters from + * @recp_list: recipe list from which function remove fltr + * @lkup: switch rule filter lookup type + */ +static void +ice_remove_vsi_lkup_fltr(struct ice_hw *hw, u16 vsi_handle, + struct ice_sw_recipe *recp_list, + enum ice_sw_lkup_type lkup) +{ + struct ice_fltr_list_entry *fm_entry; + struct list_head remove_list_head; + struct list_head *rule_head; + struct ice_fltr_list_entry *tmp; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + enum ice_status status; - /* A rule already exists with the new VSI being added */ - if (test_bit(vsi_handle, m_entry->vsi_list_info->vsi_map)) - return 0; + INIT_LIST_HEAD(&remove_list_head); + rule_lock = &recp_list[lkup].filt_rule_lock; + rule_head = &recp_list[lkup].filt_rules; + mutex_lock(rule_lock); + status = ice_add_to_vsi_fltr_list(hw, vsi_handle, rule_head, + &remove_list_head); + mutex_unlock(rule_lock); + if (status) + goto free_fltr_list; - /* Update the previously created VSI list set with - * the new VSI ID passed in - */ - vsi_list_id = cur_fltr->fwd_id.vsi_list_id; - opcode = ice_aqc_opc_update_sw_rules; + switch (lkup) { + case ICE_SW_LKUP_MAC: + ice_remove_mac_rule(hw, &remove_list_head, &recp_list[lkup]); + break; + case ICE_SW_LKUP_VLAN: + ice_remove_vlan_rule(hw, &remove_list_head, &recp_list[lkup]); + break; + case ICE_SW_LKUP_PROMISC: + case ICE_SW_LKUP_PROMISC_VLAN: + ice_remove_promisc(hw, lkup, &remove_list_head); + break; + case ICE_SW_LKUP_MAC_VLAN: + ice_remove_mac_vlan(hw, &remove_list_head); + break; + case ICE_SW_LKUP_ETHERTYPE: + case ICE_SW_LKUP_ETHERTYPE_MAC: + ice_remove_eth_mac(hw, &remove_list_head); + break; + case ICE_SW_LKUP_DFLT: + ice_debug(hw, ICE_DBG_SW, "Remove filters for this lookup type hasn't been implemented yet\n"); + break; + case ICE_SW_LKUP_LAST: + ice_debug(hw, ICE_DBG_SW, 
"Unsupported lookup type\n"); + break; + } - status = ice_update_vsi_list_rule(hw, &vsi_handle, 1, - vsi_list_id, false, opcode, - new_fltr->lkup_type); - /* update VSI list mapping info with new VSI ID */ - if (!status) - set_bit(vsi_handle, m_entry->vsi_list_info->vsi_map); +free_fltr_list: + list_for_each_entry_safe(fm_entry, tmp, &remove_list_head, list_entry) { + list_del(&fm_entry->list_entry); + devm_kfree(ice_hw_to_dev(hw), fm_entry); } - if (!status) - m_entry->vsi_count++; - return status; } /** - * ice_find_rule_entry - Search a rule entry + * ice_remove_vsi_fltr_rule - Remove all filters for a VSI * @hw: pointer to the hardware structure - * @recp_id: lookup type for which the specified rule needs to be searched - * @f_info: rule information - * - * Helper function to search for a given rule entry - * Returns pointer to entry storing the rule if found + * @vsi_handle: VSI handle to remove filters from + * @sw: pointer to switch info struct */ -static struct ice_fltr_mgmt_list_entry * -ice_find_rule_entry(struct ice_hw *hw, u8 recp_id, struct ice_fltr_info *f_info) +static void +ice_remove_vsi_fltr_rule(struct ice_hw *hw, u16 vsi_handle, + struct ice_switch_info *sw) { - struct ice_fltr_mgmt_list_entry *list_itr, *ret = NULL; - struct ice_switch_info *sw = hw->switch_info; - struct list_head *list_head; + ice_remove_vsi_lkup_fltr(hw, vsi_handle, + sw->recp_list, ICE_SW_LKUP_MAC); + ice_remove_vsi_lkup_fltr(hw, vsi_handle, + sw->recp_list, ICE_SW_LKUP_MAC_VLAN); + ice_remove_vsi_lkup_fltr(hw, vsi_handle, + sw->recp_list, ICE_SW_LKUP_PROMISC); + ice_remove_vsi_lkup_fltr(hw, vsi_handle, + sw->recp_list, ICE_SW_LKUP_VLAN); + ice_remove_vsi_lkup_fltr(hw, vsi_handle, + sw->recp_list, ICE_SW_LKUP_DFLT); + ice_remove_vsi_lkup_fltr(hw, vsi_handle, + sw->recp_list, ICE_SW_LKUP_ETHERTYPE); + ice_remove_vsi_lkup_fltr(hw, vsi_handle, + sw->recp_list, ICE_SW_LKUP_ETHERTYPE_MAC); + ice_remove_vsi_lkup_fltr(hw, vsi_handle, + sw->recp_list, ICE_SW_LKUP_PROMISC_VLAN); +} - list_head = &sw->recp_list[recp_id].filt_rules; - list_for_each_entry(list_itr, list_head, list_entry) { - if (!memcmp(&f_info->l_data, &list_itr->fltr_info.l_data, - sizeof(f_info->l_data)) && - f_info->flag == list_itr->fltr_info.flag) { - ret = list_itr; - break; - } - } - return ret; + +/** + * ice_remove_vsi_fltr - Remove all filters for a VSI + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to remove filters from + */ +void ice_remove_vsi_fltr(struct ice_hw *hw, u16 vsi_handle) +{ + ice_remove_vsi_fltr_rule(hw, vsi_handle, hw->switch_info); } /** - * ice_find_vsi_list_entry - Search VSI list map with VSI count 1 + * ice_alloc_res_cntr - allocating resource counter * @hw: pointer to the hardware structure - * @recp_id: lookup type for which VSI lists needs to be searched - * @vsi_handle: VSI handle to be found in VSI list - * @vsi_list_id: VSI list ID found containing vsi_handle - * - * Helper function to search a VSI list with single entry containing given VSI - * handle element. This can be extended further to search VSI list with more - * than 1 vsi_count. Returns pointer to VSI list entry if found. 
+ * @type: type of resource + * @alloc_shared: if set it is shared else dedicated + * @num_items: number of entries requested for FD resource type + * @counter_id: counter index returned by AQ call */ -static struct ice_vsi_list_map_info * -ice_find_vsi_list_entry(struct ice_hw *hw, u8 recp_id, u16 vsi_handle, - u16 *vsi_list_id) +enum ice_status +ice_alloc_res_cntr(struct ice_hw *hw, u8 type, u8 alloc_shared, u16 num_items, + u16 *counter_id) { - struct ice_vsi_list_map_info *map_info = NULL; - struct ice_switch_info *sw = hw->switch_info; - struct ice_fltr_mgmt_list_entry *list_itr; - struct list_head *list_head; + struct ice_aqc_alloc_free_res_elem *buf; + enum ice_status status; + u16 buf_len; - list_head = &sw->recp_list[recp_id].filt_rules; - list_for_each_entry(list_itr, list_head, list_entry) { - if (list_itr->vsi_count == 1 && list_itr->vsi_list_info) { - map_info = list_itr->vsi_list_info; - if (test_bit(vsi_handle, map_info->vsi_map)) { - *vsi_list_id = map_info->vsi_list_id; - return map_info; - } - } - } - return NULL; + /* Allocate resource */ + buf_len = struct_size(buf, elem, 1); + buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!buf) + return ICE_ERR_NO_MEMORY; + + buf->num_elems = cpu_to_le16(num_items); + buf->res_type = cpu_to_le16(((type << ICE_AQC_RES_TYPE_S) & + ICE_AQC_RES_TYPE_M) | alloc_shared); + + status = ice_aq_alloc_free_res(hw, 1, buf, buf_len, + ice_aqc_opc_alloc_res, NULL); + if (status) + goto exit; + + *counter_id = le16_to_cpu(buf->elem[0].e.sw_resp); + +exit: + devm_kfree(ice_hw_to_dev(hw), buf); + return status; } /** - * ice_add_rule_internal - add rule for a given lookup type + * ice_free_res_cntr - free resource counter * @hw: pointer to the hardware structure - * @recp_id: lookup type (recipe ID) for which rule has to be added - * @f_entry: structure containing MAC forwarding information - * - * Adds or updates the rule lists for a given recipe + * @type: type of resource + * @alloc_shared: if set it is shared else dedicated + * @num_items: number of entries to be freed for FD resource type + * @counter_id: counter ID resource which needs to be freed */ -static enum ice_status -ice_add_rule_internal(struct ice_hw *hw, u8 recp_id, - struct ice_fltr_list_entry *f_entry) +enum ice_status +ice_free_res_cntr(struct ice_hw *hw, u8 type, u8 alloc_shared, u16 num_items, + u16 counter_id) { - struct ice_switch_info *sw = hw->switch_info; - struct ice_fltr_info *new_fltr, *cur_fltr; - struct ice_fltr_mgmt_list_entry *m_entry; - struct mutex *rule_lock; /* Lock to protect filter rule list */ - enum ice_status status = 0; + struct ice_aqc_alloc_free_res_elem *buf; + enum ice_status status; + u16 buf_len; - if (!ice_is_vsi_valid(hw, f_entry->fltr_info.vsi_handle)) - return ICE_ERR_PARAM; - f_entry->fltr_info.fwd_id.hw_vsi_id = - ice_get_hw_vsi_num(hw, f_entry->fltr_info.vsi_handle); + /* Free resource */ + buf_len = struct_size(buf, elem, 1); + buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!buf) + return ICE_ERR_NO_MEMORY; - rule_lock = &sw->recp_list[recp_id].filt_rule_lock; + buf->num_elems = cpu_to_le16(num_items); + buf->res_type = cpu_to_le16(((type << ICE_AQC_RES_TYPE_S) & + ICE_AQC_RES_TYPE_M) | alloc_shared); + buf->elem[0].e.sw_resp = cpu_to_le16(counter_id); - mutex_lock(rule_lock); - new_fltr = &f_entry->fltr_info; - if (new_fltr->flag & ICE_FLTR_RX) - new_fltr->src = hw->port_info->lport; - else if (new_fltr->flag & ICE_FLTR_TX) - new_fltr->src = f_entry->fltr_info.fwd_id.hw_vsi_id; + status = 
ice_aq_alloc_free_res(hw, 1, buf, buf_len, + ice_aqc_opc_free_res, NULL); + if (status) + ice_debug(hw, ICE_DBG_SW, "counter resource could not be freed\n"); - m_entry = ice_find_rule_entry(hw, recp_id, new_fltr); - if (!m_entry) { - mutex_unlock(rule_lock); - return ice_create_pkt_fwd_rule(hw, f_entry); - } + devm_kfree(ice_hw_to_dev(hw), buf); + return status; +} - cur_fltr = &m_entry->fltr_info; - status = ice_add_update_vsi_list(hw, m_entry, cur_fltr, new_fltr); - mutex_unlock(rule_lock); +/** + * ice_alloc_vlan_res_counter - obtain counter resource for VLAN type + * @hw: pointer to the hardware structure + * @counter_id: returns counter index + */ +enum ice_status ice_alloc_vlan_res_counter(struct ice_hw *hw, u16 *counter_id) +{ + return ice_alloc_res_cntr(hw, ICE_AQC_RES_TYPE_VLAN_COUNTER, + ICE_AQC_RES_TYPE_FLAG_DEDICATED, 1, + counter_id); +} - return status; +/** + * ice_free_vlan_res_counter - Free counter resource for VLAN type + * @hw: pointer to the hardware structure + * @counter_id: counter index to be freed + */ +enum ice_status ice_free_vlan_res_counter(struct ice_hw *hw, u16 counter_id) +{ + return ice_free_res_cntr(hw, ICE_AQC_RES_TYPE_VLAN_COUNTER, + ICE_AQC_RES_TYPE_FLAG_DEDICATED, 1, + counter_id); } /** - * ice_remove_vsi_list_rule + * ice_alloc_res_lg_act - add large action resource * @hw: pointer to the hardware structure - * @vsi_list_id: VSI list ID generated as part of allocate resource - * @lkup_type: switch rule filter lookup type - * - * The VSI list should be emptied before this function is called to remove the - * VSI list. + * @l_id: large action ID to fill it in + * @num_acts: number of actions to hold with a large action entry */ static enum ice_status -ice_remove_vsi_list_rule(struct ice_hw *hw, u16 vsi_list_id, - enum ice_sw_lkup_type lkup_type) +ice_alloc_res_lg_act(struct ice_hw *hw, u16 *l_id, u16 num_acts) { - struct ice_aqc_sw_rules_elem *s_rule; + struct ice_aqc_alloc_free_res_elem *sw_buf; enum ice_status status; - u16 s_rule_size; + u16 buf_len; - s_rule_size = (u16)ICE_SW_RULE_VSI_LIST_SIZE(0); - s_rule = devm_kzalloc(ice_hw_to_dev(hw), s_rule_size, GFP_KERNEL); - if (!s_rule) + if (num_acts > ICE_MAX_LG_ACT || num_acts == 0) + return ICE_ERR_PARAM; + + /* Allocate resource for large action */ + buf_len = struct_size(sw_buf, elem, 1); + sw_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!sw_buf) return ICE_ERR_NO_MEMORY; - s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR); - s_rule->pdata.vsi_list.index = cpu_to_le16(vsi_list_id); + sw_buf->num_elems = cpu_to_le16(1); - /* Free the vsi_list resource that we allocated. It is assumed that the - * list is empty at this point. + /* If num_acts is 1, use ICE_AQC_RES_TYPE_WIDE_TABLE_1. + * If num_acts is 2, use ICE_AQC_RES_TYPE_WIDE_TABLE_3. + * If num_acts is greater than 2, then use + * ICE_AQC_RES_TYPE_WIDE_TABLE_4. + * The num_acts cannot exceed 4. This was ensured at the + * beginning of the function. 
*/ - status = ice_aq_alloc_free_vsi_list(hw, &vsi_list_id, lkup_type, - ice_aqc_opc_free_res); + if (num_acts == 1) + sw_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_WIDE_TABLE_1); + else if (num_acts == 2) + sw_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_WIDE_TABLE_2); + else + sw_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_WIDE_TABLE_4); - devm_kfree(ice_hw_to_dev(hw), s_rule); + status = ice_aq_alloc_free_res(hw, 1, sw_buf, buf_len, + ice_aqc_opc_alloc_res, NULL); + if (!status) + *l_id = le16_to_cpu(sw_buf->elem[0].e.sw_resp); + + devm_kfree(ice_hw_to_dev(hw), sw_buf); return status; } /** - * ice_rem_update_vsi_list + * ice_add_mac_with_sw_marker - add filter with sw marker * @hw: pointer to the hardware structure - * @vsi_handle: VSI handle of the VSI to remove - * @fm_list: filter management entry for which the VSI list management needs to - * be done + * @f_info: filter info structure containing the MAC filter information + * @sw_marker: sw marker to tag the Rx descriptor with */ -static enum ice_status -ice_rem_update_vsi_list(struct ice_hw *hw, u16 vsi_handle, - struct ice_fltr_mgmt_list_entry *fm_list) +enum ice_status +ice_add_mac_with_sw_marker(struct ice_hw *hw, struct ice_fltr_info *f_info, + u16 sw_marker) { - enum ice_sw_lkup_type lkup_type; - enum ice_status status = 0; - u16 vsi_list_id; + struct ice_fltr_mgmt_list_entry *m_entry; + struct ice_fltr_list_entry fl_info; + struct ice_sw_recipe *recp_list; + struct list_head l_head; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + enum ice_status ret; + bool entry_exists; + u16 lg_act_id; - if (fm_list->fltr_info.fltr_act != ICE_FWD_TO_VSI_LIST || - fm_list->vsi_count == 0) + if (f_info->fltr_act != ICE_FWD_TO_VSI) return ICE_ERR_PARAM; - /* A rule with the VSI being removed does not exist */ - if (!test_bit(vsi_handle, fm_list->vsi_list_info->vsi_map)) - return ICE_ERR_DOES_NOT_EXIST; + if (f_info->lkup_type != ICE_SW_LKUP_MAC) + return ICE_ERR_PARAM; - lkup_type = fm_list->fltr_info.lkup_type; - vsi_list_id = fm_list->fltr_info.fwd_id.vsi_list_id; - status = ice_update_vsi_list_rule(hw, &vsi_handle, 1, vsi_list_id, true, - ice_aqc_opc_update_sw_rules, - lkup_type); - if (status) - return status; + if (sw_marker == ICE_INVAL_SW_MARKER_ID) + return ICE_ERR_PARAM; - fm_list->vsi_count--; - clear_bit(vsi_handle, fm_list->vsi_list_info->vsi_map); + if (!ice_is_vsi_valid(hw, f_info->vsi_handle)) + return ICE_ERR_PARAM; + f_info->fwd_id.hw_vsi_id = ice_get_hw_vsi_num(hw, f_info->vsi_handle); - if (fm_list->vsi_count == 1 && lkup_type != ICE_SW_LKUP_VLAN) { - struct ice_fltr_info tmp_fltr_info = fm_list->fltr_info; - struct ice_vsi_list_map_info *vsi_list_info = - fm_list->vsi_list_info; - u16 rem_vsi_handle; + /* Add filter if it doesn't exist so then the adding of large + * action always results in update + */ - rem_vsi_handle = find_first_bit(vsi_list_info->vsi_map, - ICE_MAX_VSI); - if (!ice_is_vsi_valid(hw, rem_vsi_handle)) - return ICE_ERR_OUT_OF_RANGE; + INIT_LIST_HEAD(&l_head); + fl_info.fltr_info = *f_info; + list_add(&fl_info.list_entry, &l_head); - /* Make sure VSI list is empty before removing it below */ - status = ice_update_vsi_list_rule(hw, &rem_vsi_handle, 1, - vsi_list_id, true, - ice_aqc_opc_update_sw_rules, - lkup_type); - if (status) - return status; + entry_exists = false; + ret = ice_add_mac_rule(hw, &l_head, hw->switch_info, + hw->port_info->lport); + if (ret == ICE_ERR_ALREADY_EXISTS) + entry_exists = true; + else if (ret) + return ret; - tmp_fltr_info.fltr_act = ICE_FWD_TO_VSI; - 
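Illustrative user-space sketch (not part of the patch): ice_alloc_res_lg_act() above rejects zero or more than ICE_MAX_LG_ACT actions and then sizes the wide-table resource from the action count, the 1-wide table for one action, the 2-wide table for two, and the 4-wide table otherwise, exactly as the if/else chain does. The identifiers below are placeholders for the ICE_AQC_RES_TYPE_WIDE_TABLE_* constants.

#include <stdio.h>

enum demo_wide_table { WIDE_TABLE_1 = 1, WIDE_TABLE_2 = 2, WIDE_TABLE_4 = 4 };

#define DEMO_MAX_LG_ACT 4

static int pick_wide_table(unsigned int num_acts)
{
        if (num_acts == 0 || num_acts > DEMO_MAX_LG_ACT)
                return -1;           /* rejected, like ICE_ERR_PARAM above */
        if (num_acts == 1)
                return WIDE_TABLE_1; /* one action fits the 1-wide table  */
        if (num_acts == 2)
                return WIDE_TABLE_2; /* two actions fit the 2-wide table  */
        return WIDE_TABLE_4;         /* three or four need the 4-wide one */
}

int main(void)
{
        unsigned int n;

        for (n = 0; n <= 5; n++)
                printf("num_acts=%u -> table %d\n", n, pick_wide_table(n));
        return 0;
}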
tmp_fltr_info.fwd_id.hw_vsi_id = - ice_get_hw_vsi_num(hw, rem_vsi_handle); - tmp_fltr_info.vsi_handle = rem_vsi_handle; - status = ice_update_pkt_fwd_rule(hw, &tmp_fltr_info); - if (status) { - ice_debug(hw, ICE_DBG_SW, - "Failed to update pkt fwd rule to FWD_TO_VSI on HW VSI %d, error %d\n", - tmp_fltr_info.fwd_id.hw_vsi_id, status); - return status; - } + recp_list = &hw->switch_info->recp_list[ICE_SW_LKUP_MAC]; + rule_lock = &recp_list->filt_rule_lock; + mutex_lock(rule_lock); + /* Get the book keeping entry for the filter */ + m_entry = ice_find_rule_entry(&recp_list->filt_rules, f_info); + if (!m_entry) + goto exit_error; - fm_list->fltr_info = tmp_fltr_info; + /* If counter action was enabled for this rule then don't enable + * sw marker large action + */ + if (m_entry->counter_index != ICE_INVAL_COUNTER_ID) { + ret = ICE_ERR_PARAM; + goto exit_error; } - if ((fm_list->vsi_count == 1 && lkup_type != ICE_SW_LKUP_VLAN) || - (fm_list->vsi_count == 0 && lkup_type == ICE_SW_LKUP_VLAN)) { - struct ice_vsi_list_map_info *vsi_list_info = - fm_list->vsi_list_info; + /* if same marker was added before */ + if (m_entry->sw_marker_id == sw_marker) { + ret = ICE_ERR_ALREADY_EXISTS; + goto exit_error; + } - /* Remove the VSI list since it is no longer used */ - status = ice_remove_vsi_list_rule(hw, vsi_list_id, lkup_type); - if (status) { - ice_debug(hw, ICE_DBG_SW, - "Failed to remove VSI list %d, error %d\n", - vsi_list_id, status); - return status; - } + /* Allocate a hardware table entry to hold large act. Three actions + * for marker based large action + */ + ret = ice_alloc_res_lg_act(hw, &lg_act_id, 3); + if (ret) + goto exit_error; - list_del(&vsi_list_info->list_entry); - devm_kfree(ice_hw_to_dev(hw), vsi_list_info); - fm_list->vsi_list_info = NULL; + if (lg_act_id == ICE_INVAL_LG_ACT_INDEX) + goto exit_error; + + /* Update the switch rule to add the marker action */ + ret = ice_add_marker_act(hw, m_entry, sw_marker, lg_act_id); + if (!ret) { + mutex_unlock(rule_lock); + return ret; } - return status; +exit_error: + mutex_unlock(rule_lock); + /* only remove entry if it did not exist previously */ + if (!entry_exists) + ret = ice_remove_mac(hw, &l_head); + + return ret; } /** - * ice_remove_rule_internal - Remove a filter rule of a given type + * ice_add_mac_with_counter - add filter with counter enabled * @hw: pointer to the hardware structure - * @recp_id: recipe ID for which the rule needs to removed - * @f_entry: rule entry containing filter information + * @f_info: pointer to filter info structure containing the MAC filter + * information */ -static enum ice_status -ice_remove_rule_internal(struct ice_hw *hw, u8 recp_id, - struct ice_fltr_list_entry *f_entry) +enum ice_status +ice_add_mac_with_counter(struct ice_hw *hw, struct ice_fltr_info *f_info) { - struct ice_switch_info *sw = hw->switch_info; - struct ice_fltr_mgmt_list_entry *list_elem; - struct mutex *rule_lock; /* Lock to protect filter rule list */ - enum ice_status status = 0; - bool remove_rule = false; - u16 vsi_handle; + struct ice_fltr_mgmt_list_entry *m_entry; + struct ice_fltr_list_entry fl_info; + struct ice_sw_recipe *recp_list; + struct list_head l_head; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + enum ice_status ret; + bool entry_exist; + u16 counter_id; + u16 lg_act_id; - if (!ice_is_vsi_valid(hw, f_entry->fltr_info.vsi_handle)) + if (f_info->fltr_act != ICE_FWD_TO_VSI) return ICE_ERR_PARAM; - f_entry->fltr_info.fwd_id.hw_vsi_id = - ice_get_hw_vsi_num(hw, f_entry->fltr_info.vsi_handle); - 
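Illustrative user-space sketch (not part of the patch): ice_add_mac_with_sw_marker() above and ice_add_mac_with_counter() below both add the base MAC rule first so the later large-action update always has a rule to modify, remember whether that rule already existed, and on failure roll back only a rule this call created. The stub functions below are stand-ins for the driver's add/remove/attach steps.

#include <stdio.h>
#include <stdbool.h>

static bool rule_present;

static int add_rule(void)
{
        if (rule_present)
                return 1;        /* like ICE_ERR_ALREADY_EXISTS */
        rule_present = true;
        return 0;
}

static void remove_rule(void)
{
        rule_present = false;
}

static int attach_marker(bool fail)
{
        return fail ? -1 : 0;
}

static int add_with_marker(bool fail_marker)
{
        bool entry_existed = false;
        int ret = add_rule();

        if (ret == 1)
                entry_existed = true;      /* rule was already there */
        else if (ret)
                return ret;

        ret = attach_marker(fail_marker);
        if (ret && !entry_existed)
                remove_rule();             /* undo only what this call created */
        return ret;
}

int main(void)
{
        /* rule did not exist, marker attach fails: rule is rolled back */
        rule_present = false;
        printf("ret=%d rule_left=%d\n", add_with_marker(true), rule_present);

        /* rule already existed, marker attach fails: rule is kept */
        rule_present = true;
        printf("ret=%d rule_left=%d\n", add_with_marker(true), rule_present);
        return 0;
}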
rule_lock = &sw->recp_list[recp_id].filt_rule_lock; + if (f_info->lkup_type != ICE_SW_LKUP_MAC) + return ICE_ERR_PARAM; + + if (!ice_is_vsi_valid(hw, f_info->vsi_handle)) + return ICE_ERR_PARAM; + f_info->fwd_id.hw_vsi_id = ice_get_hw_vsi_num(hw, f_info->vsi_handle); + recp_list = &hw->switch_info->recp_list[ICE_SW_LKUP_MAC]; + + entry_exist = false; + + rule_lock = &recp_list->filt_rule_lock; + + /* Add filter if it doesn't exist so then the adding of large + * action always results in update + */ + INIT_LIST_HEAD(&l_head); + + fl_info.fltr_info = *f_info; + list_add(&fl_info.list_entry, &l_head); + + ret = ice_add_mac_rule(hw, &l_head, hw->switch_info, + hw->port_info->lport); + if (ret == ICE_ERR_ALREADY_EXISTS) + entry_exist = true; + else if (ret) + return ret; + mutex_lock(rule_lock); - list_elem = ice_find_rule_entry(hw, recp_id, &f_entry->fltr_info); - if (!list_elem) { - status = ICE_ERR_DOES_NOT_EXIST; - goto exit; + m_entry = ice_find_rule_entry(&recp_list->filt_rules, f_info); + if (!m_entry) { + ret = ICE_ERR_BAD_PTR; + goto exit_error; } - if (list_elem->fltr_info.fltr_act != ICE_FWD_TO_VSI_LIST) { - remove_rule = true; - } else if (!list_elem->vsi_list_info) { - status = ICE_ERR_DOES_NOT_EXIST; - goto exit; - } else if (list_elem->vsi_list_info->ref_cnt > 1) { - /* a ref_cnt > 1 indicates that the vsi_list is being - * shared by multiple rules. Decrement the ref_cnt and - * remove this rule, but do not modify the list, as it - * is in-use by other rules. - */ - list_elem->vsi_list_info->ref_cnt--; - remove_rule = true; - } else { - /* a ref_cnt of 1 indicates the vsi_list is only used - * by one rule. However, the original removal request is only - * for a single VSI. Update the vsi_list first, and only - * remove the rule if there are no further VSIs in this list. - */ - vsi_handle = f_entry->fltr_info.vsi_handle; - status = ice_rem_update_vsi_list(hw, vsi_handle, list_elem); - if (status) - goto exit; - /* if VSI count goes to zero after updating the VSI list */ - if (list_elem->vsi_count == 0) - remove_rule = true; + /* Don't enable counter for a filter for which sw marker was enabled */ + if (m_entry->sw_marker_id != ICE_INVAL_SW_MARKER_ID) { + ret = ICE_ERR_PARAM; + goto exit_error; } - if (remove_rule) { - /* Remove the lookup rule */ - struct ice_aqc_sw_rules_elem *s_rule; - - s_rule = devm_kzalloc(ice_hw_to_dev(hw), - ICE_SW_RULE_RX_TX_NO_HDR_SIZE, - GFP_KERNEL); - if (!s_rule) { - status = ICE_ERR_NO_MEMORY; - goto exit; - } - - ice_fill_sw_rule(hw, &list_elem->fltr_info, s_rule, - ice_aqc_opc_remove_sw_rules); + /* If a counter was already enabled then don't need to add again */ + if (m_entry->counter_index != ICE_INVAL_COUNTER_ID) { + ret = ICE_ERR_ALREADY_EXISTS; + goto exit_error; + } - status = ice_aq_sw_rules(hw, s_rule, - ICE_SW_RULE_RX_TX_NO_HDR_SIZE, 1, - ice_aqc_opc_remove_sw_rules, NULL); + /* Allocate a hardware table entry to VLAN counter */ + ret = ice_alloc_vlan_res_counter(hw, &counter_id); + if (ret) + goto exit_error; - /* Remove a book keeping from the list */ - devm_kfree(ice_hw_to_dev(hw), s_rule); + /* Allocate a hardware table entry to hold large act. 
Two actions for + * counter based large action + */ + ret = ice_alloc_res_lg_act(hw, &lg_act_id, 2); + if (ret) + goto exit_error; - if (status) - goto exit; + if (lg_act_id == ICE_INVAL_LG_ACT_INDEX) + goto exit_error; - list_del(&list_elem->list_entry); - devm_kfree(ice_hw_to_dev(hw), list_elem); + /* Update the switch rule to add the counter action */ + ret = ice_add_counter_act(hw, m_entry, counter_id, lg_act_id); + if (!ret) { + mutex_unlock(rule_lock); + return ret; } -exit: + +exit_error: mutex_unlock(rule_lock); - return status; + /* only remove entry if it did not exist previously */ + if (!entry_exist) + ret = ice_remove_mac(hw, &l_head); + + return ret; } +/* This is mapping table entry that maps every word within a given protocol + * structure to the real byte offset as per the specification of that + * protocol header. + * for example dst address is 3 words in ethertype header and corresponding + * bytes are 0, 2, 3 in the actual packet header and src address is at 4, 6, 8 + * IMPORTANT: Every structure part of "ice_prot_hdr" union should have a + * matching entry describing its field. This needs to be updated if new + * structure is added to that union. + */ +static const struct ice_prot_ext_tbl_entry ice_prot_ext[ICE_PROTOCOL_LAST] = { + { ICE_MAC_OFOS, { 0, 2, 4, 6, 8, 10, 12 } }, + { ICE_MAC_IL, { 0, 2, 4, 6, 8, 10, 12 } }, + { ICE_ETYPE_OL, { 0 } }, + { ICE_VLAN_OFOS, { 0, 2 } }, + { ICE_IPV4_OFOS, { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18 } }, + { ICE_IPV4_IL, { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18 } }, + { ICE_IPV6_OFOS, { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, + 26, 28, 30, 32, 34, 36, 38 } }, + { ICE_IPV6_IL, { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, + 26, 28, 30, 32, 34, 36, 38 } }, + { ICE_TCP_IL, { 0, 2 } }, + { ICE_UDP_OF, { 0, 2 } }, + { ICE_UDP_ILOS, { 0, 2 } }, + { ICE_SCTP_IL, { 0, 2 } }, + { ICE_VXLAN, { 8, 10, 12, 14 } }, + { ICE_GENEVE, { 8, 10, 12, 14 } }, + { ICE_VXLAN_GPE, { 8, 10, 12, 14 } }, + { ICE_NVGRE, { 0, 2, 4, 6 } }, + { ICE_GTP, { 8, 10, 12, 14, 16, 18, 20, 22 } }, +}; + +/* The following table describes preferred grouping of recipes. + * If a recipe that needs to be programmed is a superset or matches one of the + * following combinations, then the recipe needs to be chained as per the + * following policy. + */ + +static struct ice_protocol_entry ice_prot_id_tbl[ICE_PROTOCOL_LAST] = { + { ICE_MAC_OFOS, ICE_MAC_OFOS_HW }, + { ICE_MAC_IL, ICE_MAC_IL_HW }, + { ICE_ETYPE_OL, ICE_ETYPE_OL_HW }, + { ICE_VLAN_OFOS, ICE_VLAN_OL_HW }, + { ICE_IPV4_OFOS, ICE_IPV4_OFOS_HW }, + { ICE_IPV4_IL, ICE_IPV4_IL_HW }, + { ICE_IPV6_OFOS, ICE_IPV6_OFOS_HW }, + { ICE_IPV6_IL, ICE_IPV6_IL_HW }, + { ICE_TCP_IL, ICE_TCP_IL_HW }, + { ICE_UDP_OF, ICE_UDP_OF_HW }, + { ICE_UDP_ILOS, ICE_UDP_ILOS_HW }, + { ICE_SCTP_IL, ICE_SCTP_IL_HW }, + { ICE_VXLAN, ICE_UDP_OF_HW }, + { ICE_GENEVE, ICE_UDP_OF_HW }, + { ICE_VXLAN_GPE, ICE_UDP_OF_HW }, + { ICE_NVGRE, ICE_GRE_OF_HW }, + { ICE_GTP, ICE_UDP_OF_HW }, +}; + /** - * ice_add_mac - Add a MAC address based filter rule + * ice_find_recp - find a recipe * @hw: pointer to the hardware structure - * @m_list: list of MAC addresses and forwarding information + * @lkup_exts: extension sequence to match * - * IMPORTANT: When the ucast_shared flag is set to false and m_list has - * multiple unicast addresses, the function assumes that all the - * addresses are unique in a given add_mac call. 
It doesn't - * check for duplicates in this case, removing duplicates from a given - * list should be taken care of in the caller of this function. + * Returns index of matching recipe, or ICE_MAX_NUM_RECIPES if not found. */ -enum ice_status -ice_add_mac(struct ice_hw *hw, struct list_head *m_list) +static u16 ice_find_recp(struct ice_hw *hw, struct ice_prot_lkup_ext *lkup_exts) { - struct ice_aqc_sw_rules_elem *s_rule, *r_iter; - struct ice_fltr_list_entry *m_list_itr; - struct list_head *rule_head; - u16 elem_sent, total_elem_left; - struct ice_switch_info *sw; - struct mutex *rule_lock; /* Lock to protect filter rule list */ - enum ice_status status = 0; - u16 num_unicast = 0; - u16 s_rule_size; + bool refresh_required = true; + struct ice_sw_recipe *recp; + u8 i; - if (!m_list || !hw) - return ICE_ERR_PARAM; + /* Walk through existing recipes to find a match */ + recp = hw->switch_info->recp_list; + for (i = 0; i < ICE_MAX_NUM_RECIPES; i++) { + /* If recipe was not created for this ID, in SW bookkeeping, + * check if FW has an entry for this recipe. If the FW has an + * entry update it in our SW bookkeeping and continue with the + * matching. + */ + if (!recp[i].recp_created) + if (ice_get_recp_frm_fw(hw, + hw->switch_info->recp_list, i, + &refresh_required)) + continue; - s_rule = NULL; - sw = hw->switch_info; - rule_lock = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rule_lock; - list_for_each_entry(m_list_itr, m_list, list_entry) { - u8 *add = &m_list_itr->fltr_info.l_data.mac.mac_addr[0]; - u16 vsi_handle; - u16 hw_vsi_id; + /* Skip inverse action recipes */ + if (recp[i].root_buf && recp[i].root_buf->content.act_ctrl & + ICE_AQ_RECIPE_ACT_INV_ACT) + continue; - m_list_itr->fltr_info.flag = ICE_FLTR_TX; - vsi_handle = m_list_itr->fltr_info.vsi_handle; - if (!ice_is_vsi_valid(hw, vsi_handle)) - return ICE_ERR_PARAM; - hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); - m_list_itr->fltr_info.fwd_id.hw_vsi_id = hw_vsi_id; - /* update the src in case it is VSI num */ - if (m_list_itr->fltr_info.src_id != ICE_SRC_ID_VSI) - return ICE_ERR_PARAM; - m_list_itr->fltr_info.src = hw_vsi_id; - if (m_list_itr->fltr_info.lkup_type != ICE_SW_LKUP_MAC || - is_zero_ether_addr(add)) - return ICE_ERR_PARAM; - if (is_unicast_ether_addr(add) && !hw->ucast_shared) { - /* Don't overwrite the unicast address */ - mutex_lock(rule_lock); - if (ice_find_rule_entry(hw, ICE_SW_LKUP_MAC, - &m_list_itr->fltr_info)) { - mutex_unlock(rule_lock); - return ICE_ERR_ALREADY_EXISTS; + /* if number of words we are looking for match */ + if (lkup_exts->n_val_words == recp[i].lkup_exts.n_val_words) { + struct ice_fv_word *ar = recp[i].lkup_exts.fv_words; + struct ice_fv_word *be = lkup_exts->fv_words; + u16 *cr = recp[i].lkup_exts.field_mask; + u16 *de = lkup_exts->field_mask; + bool found = true; + u8 pe, qr; + + /* ar, cr, and qr are related to the recipe words, while + * be, de, and pe are related to the lookup words + */ + for (pe = 0; pe < lkup_exts->n_val_words; pe++) { + for (qr = 0; qr < recp[i].lkup_exts.n_val_words; + qr++) { + if (ar[qr].off == be[pe].off && + ar[qr].prot_id == be[pe].prot_id && + cr[qr] == de[pe]) + /* Found the "pe"th word in the + * given recipe + */ + break; + } + /* After walking through all the words in the + * "i"th recipe if "p"th word was not found then + * this recipe is not what we are looking for. 
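Illustrative user-space sketch (not part of the patch): ice_find_recp() treats a recipe as a match only when it holds the same number of valid words and every lookup (protocol ID, offset, mask) triple can be found among the recipe's triples, regardless of order. The sample values below are arbitrary.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* One extracted lookup word: protocol ID, byte offset, 16-bit mask. */
struct word {
        uint8_t prot_id;
        uint16_t off;
        uint16_t mask;
};

/* True when every word in "need" also appears somewhere in "have". */
static bool words_match(const struct word *need, unsigned int n_need,
                        const struct word *have, unsigned int n_have)
{
        unsigned int i, j;

        if (n_need != n_have)
                return false;

        for (i = 0; i < n_need; i++) {
                for (j = 0; j < n_have; j++)
                        if (need[i].prot_id == have[j].prot_id &&
                            need[i].off == have[j].off &&
                            need[i].mask == have[j].mask)
                                break;
                if (j == n_have)
                        return false;   /* word i not found in this recipe */
        }
        return true;
}

int main(void)
{
        struct word lookup[] = { { 1, 0, 0xffff }, { 4, 12, 0x00ff } };
        struct word recipe[] = { { 4, 12, 0x00ff }, { 1, 0, 0xffff } };

        printf("match: %d\n", words_match(lookup, 2, recipe, 2)); /* 1 */
        recipe[1].mask = 0xff00;
        printf("match: %d\n", words_match(lookup, 2, recipe, 2)); /* 0 */
        return 0;
}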
+ * So break out from this loop and try the next + * recipe + */ + if (qr >= recp[i].lkup_exts.n_val_words) { + found = false; + break; + } } - mutex_unlock(rule_lock); - num_unicast++; - } else if (is_multicast_ether_addr(add) || - (is_unicast_ether_addr(add) && hw->ucast_shared)) { - m_list_itr->status = - ice_add_rule_internal(hw, ICE_SW_LKUP_MAC, - m_list_itr); - if (m_list_itr->status) - return m_list_itr->status; + /* If for "i"th recipe the found was never set to false + * then it means we found our match + */ + if (found) + return i; /* Return the recipe ID */ } } + return ICE_MAX_NUM_RECIPES; +} - mutex_lock(rule_lock); - /* Exit if no suitable entries were found for adding bulk switch rule */ - if (!num_unicast) { - status = 0; - goto ice_add_mac_exit; - } +/** + * ice_change_proto_id_to_dvm - change proto id in prot_id_tbl + * + * As protocol id for outer vlan is different in dvm and svm, if dvm is + * supported protocol array record for outer vlan has to be modified to + * reflect the value proper for DVM. + */ +void ice_change_proto_id_to_dvm(void) +{ + u8 i; - rule_head = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rules; + for (i = 0; i < ARRAY_SIZE(ice_prot_id_tbl); i++) + if (ice_prot_id_tbl[i].type == ICE_VLAN_OFOS && + ice_prot_id_tbl[i].protocol_id != ICE_VLAN_OF_HW) + ice_prot_id_tbl[i].protocol_id = ICE_VLAN_OF_HW; +} - /* Allocate switch rule buffer for the bulk update for unicast */ - s_rule_size = ICE_SW_RULE_RX_TX_ETH_HDR_SIZE; - s_rule = devm_kcalloc(ice_hw_to_dev(hw), num_unicast, s_rule_size, - GFP_KERNEL); - if (!s_rule) { - status = ICE_ERR_NO_MEMORY; - goto ice_add_mac_exit; - } +/** + * ice_prot_type_to_id - get protocol ID from protocol type + * @type: protocol type + * @id: pointer to variable that will receive the ID + * + * Returns true if found, false otherwise + */ +static bool ice_prot_type_to_id(enum ice_protocol_type type, u8 *id) +{ + u8 i; - r_iter = s_rule; - list_for_each_entry(m_list_itr, m_list, list_entry) { - struct ice_fltr_info *f_info = &m_list_itr->fltr_info; - u8 *mac_addr = &f_info->l_data.mac.mac_addr[0]; + for (i = 0; i < ARRAY_SIZE(ice_prot_id_tbl); i++) + if (ice_prot_id_tbl[i].type == type) { + *id = ice_prot_id_tbl[i].protocol_id; + return true; + } + return false; +} - if (is_unicast_ether_addr(mac_addr)) { - ice_fill_sw_rule(hw, &m_list_itr->fltr_info, r_iter, - ice_aqc_opc_add_sw_rules); - r_iter = (struct ice_aqc_sw_rules_elem *) - ((u8 *)r_iter + s_rule_size); +/** + * ice_fill_valid_words - count valid words + * @rule: advanced rule with lookup information + * @lkup_exts: byte offset extractions of the words that are valid + * + * calculate valid words in a lookup rule using mask value + */ +static u8 +ice_fill_valid_words(struct ice_adv_lkup_elem *rule, + struct ice_prot_lkup_ext *lkup_exts) +{ + u8 j, word, prot_id, ret_val; + + if (!ice_prot_type_to_id(rule->type, &prot_id)) + return 0; + + word = lkup_exts->n_val_words; + + for (j = 0; j < sizeof(rule->m_u) / sizeof(u16); j++) + if (((u16 *)&rule->m_u)[j] && + rule->type < ARRAY_SIZE(ice_prot_ext)) { + /* No more space to accommodate */ + if (word >= ICE_MAX_CHAIN_WORDS) + return 0; + lkup_exts->fv_words[word].off = + ice_prot_ext[rule->type].offs[j]; + lkup_exts->fv_words[word].prot_id = + ice_prot_id_tbl[rule->type].protocol_id; + lkup_exts->field_mask[word] = + be16_to_cpu(((__force __be16 *)&rule->m_u)[j]); + word++; } - } - /* Call AQ bulk switch rule update for all unicast addresses */ - r_iter = s_rule; - /* Call AQ switch rule in AQ_MAX chunk */ - for (total_elem_left 
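Illustrative user-space sketch (not part of the patch): ice_fill_valid_words() above scans the rule mask as an array of 16-bit words and turns every nonzero word into one lookup word whose byte offset comes from the ice_prot_ext table for that protocol. The offsets and masks below are made up.

#include <stdio.h>
#include <stdint.h>

#define MAX_WORDS 8

/* Made-up per-word byte offsets, standing in for ice_prot_ext[].offs. */
static const unsigned int demo_offs[MAX_WORDS] = { 0, 2, 4, 6, 8, 10, 12, 14 };

int main(void)
{
        /* Header mask viewed as 16-bit words; only nonzero words are
         * "valid" and become lookup words for the recipe.
         */
        uint16_t mask[MAX_WORDS] = { 0xffff, 0, 0, 0x00ff, 0, 0, 0, 0 };
        unsigned int j, n_valid = 0;

        for (j = 0; j < MAX_WORDS; j++) {
                if (!mask[j])
                        continue;
                printf("word %u: offset %u, mask 0x%04x\n",
                       n_valid, demo_offs[j], (unsigned int)mask[j]);
                n_valid++;
        }
        printf("valid words: %u\n", n_valid);
        return 0;
}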
= num_unicast; total_elem_left > 0; - total_elem_left -= elem_sent) { - struct ice_aqc_sw_rules_elem *entry = r_iter; + ret_val = word - lkup_exts->n_val_words; + lkup_exts->n_val_words = word; - elem_sent = min(total_elem_left, - (u16)(ICE_AQ_MAX_BUF_LEN / s_rule_size)); - status = ice_aq_sw_rules(hw, entry, elem_sent * s_rule_size, - elem_sent, ice_aqc_opc_add_sw_rules, - NULL); - if (status) - goto ice_add_mac_exit; - r_iter = (struct ice_aqc_sw_rules_elem *) - ((u8 *)r_iter + (elem_sent * s_rule_size)); - } + return ret_val; +} - /* Fill up rule ID based on the value returned from FW */ - r_iter = s_rule; - list_for_each_entry(m_list_itr, m_list, list_entry) { - struct ice_fltr_info *f_info = &m_list_itr->fltr_info; - u8 *mac_addr = &f_info->l_data.mac.mac_addr[0]; - struct ice_fltr_mgmt_list_entry *fm_entry; - if (is_unicast_ether_addr(mac_addr)) { - f_info->fltr_rule_id = - le16_to_cpu(r_iter->pdata.lkup_tx_rx.index); - f_info->fltr_act = ICE_FWD_TO_VSI; - /* Create an entry to track this MAC address */ - fm_entry = devm_kzalloc(ice_hw_to_dev(hw), - sizeof(*fm_entry), GFP_KERNEL); - if (!fm_entry) { - status = ICE_ERR_NO_MEMORY; - goto ice_add_mac_exit; + +/** + * ice_create_first_fit_recp_def - Create a recipe grouping + * @hw: pointer to the hardware structure + * @lkup_exts: an array of protocol header extractions + * @rg_list: pointer to a list that stores new recipe groups + * @recp_cnt: pointer to a variable that stores returned number of recipe groups + * + * Using first fit algorithm, take all the words that are still not done + * and start grouping them in 4-word groups. Each group makes up one + * recipe. + */ +static enum ice_status +ice_create_first_fit_recp_def(struct ice_hw *hw, + struct ice_prot_lkup_ext *lkup_exts, + struct list_head *rg_list, + u8 *recp_cnt) +{ + struct ice_pref_recipe_group *grp = NULL; + u8 j; + + + *recp_cnt = 0; + + /* Walk through every word in the rule to check if it is not done. If so + * then this word needs to be part of a new recipe. + */ + for (j = 0; j < lkup_exts->n_val_words; j++) + if (!test_bit(j, lkup_exts->done)) { + if (!grp || + grp->n_val_pairs == ICE_NUM_WORDS_RECIPE) { + struct ice_recp_grp_entry *entry; + + entry = devm_kzalloc(ice_hw_to_dev(hw), + sizeof(*entry), + GFP_KERNEL); + if (!entry) + return ICE_ERR_NO_MEMORY; + list_add(&entry->l_entry, rg_list); + grp = &entry->r_group; + (*recp_cnt)++; } - fm_entry->fltr_info = *f_info; - fm_entry->vsi_count = 1; - /* The book keeping entries will get removed when - * base driver calls remove filter AQ command - */ - list_add(&fm_entry->list_entry, rule_head); - r_iter = (struct ice_aqc_sw_rules_elem *) - ((u8 *)r_iter + s_rule_size); + grp->pairs[grp->n_val_pairs].prot_id = + lkup_exts->fv_words[j].prot_id; + grp->pairs[grp->n_val_pairs].off = + lkup_exts->fv_words[j].off; + grp->mask[grp->n_val_pairs] = lkup_exts->field_mask[j]; + grp->n_val_pairs++; } - } -ice_add_mac_exit: - mutex_unlock(rule_lock); - if (s_rule) - devm_kfree(ice_hw_to_dev(hw), s_rule); - return status; + return 0; } /** - * ice_add_vlan_internal - Add one VLAN based filter rule + * ice_fill_fv_word_index - fill in the field vector indices for a recipe group * @hw: pointer to the hardware structure - * @f_entry: filter entry containing one VLAN information + * @fv_list: field vector with the extraction sequence information + * @rg_list: recipe groupings with protocol-offset pairs + * + * Helper function to fill in the field vector indices for protocol-offset + * pairs. 
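Illustrative user-space sketch (not part of the patch): ice_create_first_fit_recp_def() above walks the pending words in order and opens a new group whenever the current one already holds ICE_NUM_WORDS_RECIPE (four) pairs, so the number of recipe groups is the pending word count divided by four, rounded up.

#include <stdio.h>

#define WORDS_PER_GROUP 4       /* stands in for ICE_NUM_WORDS_RECIPE */

int main(void)
{
        int n_words = 10;               /* pretend 10 lookup words are pending */
        int in_group = WORDS_PER_GROUP; /* force a new group on the first word */
        int groups = 0;
        int j;

        for (j = 0; j < n_words; j++) {
                if (in_group == WORDS_PER_GROUP) {
                        groups++;       /* open a new recipe group */
                        in_group = 0;
                }
                printf("word %d -> group %d\n", j, groups);
                in_group++;
        }
        printf("10 words need %d groups\n", groups); /* prints 3 */
        return 0;
}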
These indexes are then ultimately programmed into a recipe. */ static enum ice_status -ice_add_vlan_internal(struct ice_hw *hw, struct ice_fltr_list_entry *f_entry) +ice_fill_fv_word_index(struct ice_hw *hw, struct list_head *fv_list, + struct list_head *rg_list) { - struct ice_switch_info *sw = hw->switch_info; - struct ice_fltr_mgmt_list_entry *v_list_itr; - struct ice_fltr_info *new_fltr, *cur_fltr; - enum ice_sw_lkup_type lkup_type; - u16 vsi_list_id = 0, vsi_handle; - struct mutex *rule_lock; /* Lock to protect filter rule list */ - enum ice_status status = 0; + struct ice_sw_fv_list_entry *fv; + struct ice_recp_grp_entry *rg; + struct ice_fv_word *fv_ext; + + if (list_empty(fv_list)) + return 0; + + fv = list_first_entry(fv_list, struct ice_sw_fv_list_entry, + list_entry); + fv_ext = fv->fv_ptr->ew; + + list_for_each_entry(rg, rg_list, l_entry) { + u8 i; + + for (i = 0; i < rg->r_group.n_val_pairs; i++) { + struct ice_fv_word *pr; + bool found = false; + u16 mask; + u8 j; + + pr = &rg->r_group.pairs[i]; + mask = rg->r_group.mask[i]; + + for (j = 0; j < hw->blk[ICE_BLK_SW].es.fvw; j++) + if (fv_ext[j].prot_id == pr->prot_id && + fv_ext[j].off == pr->off) { + found = true; + + /* Store index of field vector */ + rg->fv_idx[i] = j; + rg->fv_mask[i] = mask; + break; + } + + /* Protocol/offset could not be found, caller gave an + * invalid pair + */ + if (!found) + return ICE_ERR_PARAM; + } + } - if (!ice_is_vsi_valid(hw, f_entry->fltr_info.vsi_handle)) - return ICE_ERR_PARAM; + return 0; +} - f_entry->fltr_info.fwd_id.hw_vsi_id = - ice_get_hw_vsi_num(hw, f_entry->fltr_info.vsi_handle); - new_fltr = &f_entry->fltr_info; +/** + * ice_find_free_recp_res_idx - find free result indexes for recipe + * @hw: pointer to hardware structure + * @profiles: bitmap of profiles that will be associated with the new recipe + * @free_idx: pointer to variable to receive the free index bitmap + * + * The algorithm used here is: + * 1. When creating a new recipe, create a set P which contains all + * Profiles that will be associated with our new recipe + * + * 2. For each Profile p in set P: + * a. Add all recipes associated with Profile p into set R + * b. Optional : PossibleIndexes &= profile[p].possibleIndexes + * [initially PossibleIndexes should be 0xFFFFFFFFFFFFFFFF] + * i. Or just assume they all have the same possible indexes: + * 44, 45, 46, 47 + * i.e., PossibleIndexes = 0x0000F00000000000 + * + * 3. For each Recipe r in set R: + * a. UsedIndexes |= (bitwise or ) recipe[r].res_indexes + * b. FreeIndexes = UsedIndexes ^ PossibleIndexes + * + * FreeIndexes will contain the bits indicating the indexes free for use, + * then the code needs to update the recipe[r].used_result_idx_bits to + * indicate which indexes were selected for use by this recipe. + */ +static u16 +ice_find_free_recp_res_idx(struct ice_hw *hw, const unsigned long *profiles, + unsigned long *free_idx) +{ + DECLARE_BITMAP(possible_idx, ICE_MAX_FV_WORDS); + DECLARE_BITMAP(recipes, ICE_MAX_NUM_RECIPES); + DECLARE_BITMAP(used_idx, ICE_MAX_FV_WORDS); + u16 bit; + + bitmap_zero(possible_idx, ICE_MAX_FV_WORDS); + bitmap_zero(recipes, ICE_MAX_NUM_RECIPES); + bitmap_zero(used_idx, ICE_MAX_FV_WORDS); + bitmap_zero(free_idx, ICE_MAX_FV_WORDS); + + bitmap_set(possible_idx, 0, ICE_MAX_FV_WORDS); + + /* For each profile we are going to associate the recipe with, add the + * recipes that are associated with that profile. This will give us + * the set of recipes that our recipe may collide with. 
Also, determine + * what possible result indexes are usable given this set of profiles. + */ + for_each_set_bit(bit, profiles, ICE_MAX_NUM_PROFILES) { + bitmap_or(recipes, recipes, profile_to_recipe[bit], + ICE_MAX_NUM_RECIPES); + bitmap_and(possible_idx, possible_idx, + hw->switch_info->prof_res_bm[bit], + ICE_MAX_FV_WORDS); + } - /* VLAN ID should only be 12 bits */ - if (new_fltr->l_data.vlan.vlan_id > ICE_MAX_VLAN_ID) - return ICE_ERR_PARAM; + /* For each recipe that our new recipe may collide with, determine + * which indexes have been used. + */ + for_each_set_bit(bit, recipes, ICE_MAX_NUM_RECIPES) + bitmap_or(used_idx, used_idx, + hw->switch_info->recp_list[bit].res_idxs, + ICE_MAX_FV_WORDS); - if (new_fltr->src_id != ICE_SRC_ID_VSI) - return ICE_ERR_PARAM; + bitmap_xor(free_idx, used_idx, possible_idx, ICE_MAX_FV_WORDS); - new_fltr->src = new_fltr->fwd_id.hw_vsi_id; - lkup_type = new_fltr->lkup_type; - vsi_handle = new_fltr->vsi_handle; - rule_lock = &sw->recp_list[ICE_SW_LKUP_VLAN].filt_rule_lock; - mutex_lock(rule_lock); - v_list_itr = ice_find_rule_entry(hw, ICE_SW_LKUP_VLAN, new_fltr); - if (!v_list_itr) { - struct ice_vsi_list_map_info *map_info = NULL; + /* return number of free indexes */ + return (u16) bitmap_weight(free_idx, ICE_MAX_FV_WORDS); +} - if (new_fltr->fltr_act == ICE_FWD_TO_VSI) { - /* All VLAN pruning rules use a VSI list. Check if - * there is already a VSI list containing VSI that we - * want to add. If found, use the same vsi_list_id for - * this new VLAN rule or else create a new list. +/** + * ice_add_sw_recipe - function to call AQ calls to create switch recipe + * @hw: pointer to hardware structure + * @rm: recipe management list entry + * @match_tun_mask: tunnel mask that needs to be programmed + * @profiles: bitmap of profiles that will be associated. + */ +static enum ice_status +ice_add_sw_recipe(struct ice_hw *hw, struct ice_sw_recipe *rm, + u16 match_tun_mask, unsigned long *profiles) +{ + DECLARE_BITMAP(result_idx_bm, ICE_MAX_FV_WORDS); + struct ice_aqc_recipe_data_elem *tmp; + struct ice_aqc_recipe_data_elem *buf; + struct ice_recp_grp_entry *entry; + enum ice_status status; + u16 free_res_idx; + u16 recipe_count; + u8 chain_idx; + u8 recps = 0; + + /* When more than one recipe are required, another recipe is needed to + * chain them together. Matching a tunnel metadata ID takes up one of + * the match fields in the chaining recipe reducing the number of + * chained recipes by one. 
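+ *
+ * For instance (hypothetical numbers): a rule needing six match words
+ * is packed by first-fit into two 4-word groups (4 + 2), so
+ * n_grp_count starts at 2.  Because more than one group is needed,
+ * each group must be able to claim a free result index, and one extra
+ * recipe is added below to chain the groups together; a tunnel
+ * metadata match would then occupy one lookup word of that chaining
+ * recipe.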
+ */ + /* check number of free result indices */ + bitmap_zero(result_idx_bm, ICE_MAX_FV_WORDS); + free_res_idx = ice_find_free_recp_res_idx(hw, profiles, result_idx_bm); + + ice_debug(hw, ICE_DBG_SW, "Result idx slots: %d, need %d\n", + free_res_idx, rm->n_grp_count); + + if (rm->n_grp_count > 1) { + if (rm->n_grp_count > free_res_idx) + return ICE_ERR_MAX_LIMIT; + + rm->n_grp_count++; + } + + if (rm->n_grp_count > ICE_MAX_CHAIN_RECIPE) + return ICE_ERR_MAX_LIMIT; + + tmp = devm_kcalloc(ice_hw_to_dev(hw), ICE_MAX_NUM_RECIPES, + sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return ICE_ERR_NO_MEMORY; + + buf = devm_kcalloc(ice_hw_to_dev(hw), rm->n_grp_count, sizeof(*buf), + GFP_KERNEL); + if (!buf) { + status = ICE_ERR_NO_MEMORY; + goto err_mem; + } + + bitmap_zero(rm->r_bitmap, ICE_MAX_NUM_RECIPES); + recipe_count = ICE_MAX_NUM_RECIPES; + status = ice_aq_get_recipe(hw, tmp, &recipe_count, ICE_SW_LKUP_MAC, + NULL); + if (status || recipe_count == 0) + goto err_unroll; + + /* Allocate the recipe resources, and configure them according to the + * match fields from protocol headers and extracted field vectors. + */ + chain_idx = find_first_bit(result_idx_bm, ICE_MAX_FV_WORDS); + list_for_each_entry(entry, &rm->rg_list, l_entry) { + u8 i; + + status = ice_alloc_recipe(hw, &entry->rid); + if (status) + goto err_unroll; + + /* Clear the result index of the located recipe, as this will be + * updated, if needed, later in the recipe creation process. + */ + tmp[0].content.result_indx = 0; + + buf[recps] = tmp[0]; + buf[recps].recipe_indx = (u8)entry->rid; + /* if the recipe is a non-root recipe RID should be programmed + * as 0 for the rules to be applied correctly. + */ + buf[recps].content.rid = 0; + memset(&buf[recps].content.lkup_indx, 0, + sizeof(buf[recps].content.lkup_indx)); + + /* All recipes use look-up index 0 to match switch ID. */ + buf[recps].content.lkup_indx[0] = ICE_AQ_SW_ID_LKUP_IDX; + buf[recps].content.mask[0] = + cpu_to_le16(ICE_AQ_SW_ID_LKUP_MASK); + /* Setup lkup_indx 1..4 to INVALID/ignore and set the mask + * to be 0 + */ + for (i = 1; i <= ICE_NUM_WORDS_RECIPE; i++) { + buf[recps].content.lkup_indx[i] = 0x80; + buf[recps].content.mask[i] = 0; + } + + for (i = 0; i < entry->r_group.n_val_pairs; i++) { + buf[recps].content.lkup_indx[i + 1] = entry->fv_idx[i]; + buf[recps].content.mask[i + 1] = + cpu_to_le16(entry->fv_mask[i]); + } + + if (rm->n_grp_count > 1) { + /* Checks to see if there really is a valid result index + * that can be used. */ - map_info = ice_find_vsi_list_entry(hw, ICE_SW_LKUP_VLAN, - vsi_handle, - &vsi_list_id); - if (!map_info) { - status = ice_create_vsi_list_rule(hw, - &vsi_handle, - 1, - &vsi_list_id, - lkup_type); - if (status) - goto exit; + if (chain_idx >= ICE_MAX_FV_WORDS) { + ice_debug(hw, ICE_DBG_SW, "No chain index available\n"); + status = ICE_ERR_MAX_LIMIT; + goto err_unroll; } - /* Convert the action to forwarding to a VSI list. 
*/ - new_fltr->fltr_act = ICE_FWD_TO_VSI_LIST; - new_fltr->fwd_id.vsi_list_id = vsi_list_id; + + entry->chain_idx = chain_idx; + buf[recps].content.result_indx = + ICE_AQ_RECIPE_RESULT_EN | + ((chain_idx << ICE_AQ_RECIPE_RESULT_DATA_S) & + ICE_AQ_RECIPE_RESULT_DATA_M); + clear_bit(chain_idx, result_idx_bm); + chain_idx = find_first_bit(result_idx_bm, + ICE_MAX_FV_WORDS); } - status = ice_create_pkt_fwd_rule(hw, f_entry); - if (!status) { - v_list_itr = ice_find_rule_entry(hw, ICE_SW_LKUP_VLAN, - new_fltr); - if (!v_list_itr) { - status = ICE_ERR_DOES_NOT_EXIST; - goto exit; - } - /* reuse VSI list for new rule and increment ref_cnt */ - if (map_info) { - v_list_itr->vsi_list_info = map_info; - map_info->ref_cnt++; - } else { - v_list_itr->vsi_list_info = - ice_create_vsi_list_map(hw, &vsi_handle, - 1, vsi_list_id); - } + /* fill recipe dependencies */ + bitmap_zero((unsigned long *)buf[recps].recipe_bitmap, + ICE_MAX_NUM_RECIPES); + set_bit(buf[recps].recipe_indx, + (unsigned long *)buf[recps].recipe_bitmap); + buf[recps].content.act_ctrl_fwd_priority = rm->priority; + recps++; + } + + if (rm->n_grp_count == 1) { + rm->root_rid = buf[0].recipe_indx; + set_bit(buf[0].recipe_indx, rm->r_bitmap); + buf[0].content.rid = rm->root_rid | ICE_AQ_RECIPE_ID_IS_ROOT; + if (sizeof(buf[0].recipe_bitmap) >= sizeof(rm->r_bitmap)) { + memcpy(buf[0].recipe_bitmap, rm->r_bitmap, + sizeof(buf[0].recipe_bitmap)); + } else { + status = ICE_ERR_BAD_PTR; + goto err_unroll; } - } else if (v_list_itr->vsi_list_info->ref_cnt == 1) { - /* Update existing VSI list to add new VSI ID only if it used - * by one VLAN rule. + /* Applicable only for ROOT_RECIPE, set the fwd_priority for + * the recipe which is getting created if specified + * by user. Usually any advanced switch filter, which results + * into new extraction sequence, ended up creating a new recipe + * of type ROOT and usually recipes are associated with profiles + * Switch rule referreing newly created recipe, needs to have + * either/or 'fwd' or 'join' priority, otherwise switch rule + * evaluation will not happen correctly. In other words, if + * switch rule to be evaluated on priority basis, then recipe + * needs to have priority, otherwise it will be evaluated last. */ - cur_fltr = &v_list_itr->fltr_info; - status = ice_add_update_vsi_list(hw, v_list_itr, cur_fltr, - new_fltr); + buf[0].content.act_ctrl_fwd_priority = rm->priority; } else { - /* If VLAN rule exists and VSI list being used by this rule is - * referenced by more than 1 VLAN rule. Then create a new VSI - * list appending previous VSI with new VSI and update existing - * VLAN rule to point to new VSI list ID + struct ice_recp_grp_entry *last_chain_entry; + u16 rid, i; + + /* Allocate the last recipe that will chain the outcomes of the + * other recipes together */ - struct ice_fltr_info tmp_fltr; - u16 vsi_handle_arr[2]; - u16 cur_handle; + status = ice_alloc_recipe(hw, &rid); + if (status) + goto err_unroll; - /* Current implementation only supports reusing VSI list with - * one VSI count. 
We should never hit below condition + buf[recps].recipe_indx = (u8)rid; + buf[recps].content.rid = (u8)rid; + buf[recps].content.rid |= ICE_AQ_RECIPE_ID_IS_ROOT; + /* the new entry created should also be part of rg_list to + * make sure we have complete recipe */ - if (v_list_itr->vsi_count > 1 && - v_list_itr->vsi_list_info->ref_cnt > 1) { - ice_debug(hw, ICE_DBG_SW, - "Invalid configuration: Optimization to reuse VSI list with more than one VSI is not being done yet\n"); - status = ICE_ERR_CFG; - goto exit; + last_chain_entry = devm_kzalloc(ice_hw_to_dev(hw), + sizeof(*last_chain_entry), + GFP_KERNEL); + if (!last_chain_entry) { + status = ICE_ERR_NO_MEMORY; + goto err_unroll; + } + last_chain_entry->rid = rid; + memset(&buf[recps].content.lkup_indx, 0, + sizeof(buf[recps].content.lkup_indx)); + /* All recipes use look-up index 0 to match switch ID. */ + buf[recps].content.lkup_indx[0] = ICE_AQ_SW_ID_LKUP_IDX; + buf[recps].content.mask[0] = + cpu_to_le16(ICE_AQ_SW_ID_LKUP_MASK); + for (i = 1; i <= ICE_NUM_WORDS_RECIPE; i++) { + buf[recps].content.lkup_indx[i] = + ICE_AQ_RECIPE_LKUP_IGNORE; + buf[recps].content.mask[i] = 0; + } + + i = 1; + /* update r_bitmap with the recp that is used for chaining */ + set_bit(rid, rm->r_bitmap); + /* this is the recipe that chains all the other recipes so it + * should not have a chaining ID to indicate the same + */ + last_chain_entry->chain_idx = ICE_INVAL_CHAIN_IND; + list_for_each_entry(entry, &rm->rg_list, l_entry) { + last_chain_entry->fv_idx[i] = entry->chain_idx; + buf[recps].content.lkup_indx[i] = entry->chain_idx; + buf[recps].content.mask[i++] = cpu_to_le16(0xFFFF); + set_bit(entry->rid, rm->r_bitmap); + } + list_add(&last_chain_entry->l_entry, &rm->rg_list); + if (sizeof(buf[recps].recipe_bitmap) >= + sizeof(rm->r_bitmap)) { + memcpy(buf[recps].recipe_bitmap, rm->r_bitmap, + sizeof(buf[recps].recipe_bitmap)); + } else { + status = ICE_ERR_BAD_PTR; + goto err_unroll; } + buf[recps].content.act_ctrl_fwd_priority = rm->priority; + + /* To differentiate among different UDP tunnels, a meta data ID + * flag is used. 
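+ * The flag is matched by programming the next free lookup word of
+ * this chaining recipe with ICE_TUN_FLAG_FV_IND and masking it with
+ * match_tun_mask, which ice_tun_type_match_word() derived from the
+ * tunnel type (that helper drops the VLAN flag bit from the mask for
+ * the *_VLAN tunnel variants).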
+ */ + if (match_tun_mask) { + buf[recps].content.lkup_indx[i] = ICE_TUN_FLAG_FV_IND; + buf[recps].content.mask[i] = + cpu_to_le16(match_tun_mask); + } + + recps++; + rm->root_rid = (u8)rid; + } + status = ice_acquire_change_lock(hw, ICE_RES_WRITE); + if (status) + goto err_unroll; + + status = ice_aq_add_recipe(hw, buf, rm->n_grp_count, NULL); + ice_release_change_lock(hw); + if (status) + goto err_unroll; - cur_handle = - find_first_bit(v_list_itr->vsi_list_info->vsi_map, - ICE_MAX_VSI); + /* Every recipe that just got created add it to the recipe + * book keeping list + */ + list_for_each_entry(entry, &rm->rg_list, l_entry) { + struct ice_switch_info *sw = hw->switch_info; + bool is_root, idx_found = false; + struct ice_sw_recipe *recp; + u16 idx, buf_idx = 0; + + /* find buffer index for copying some data */ + for (idx = 0; idx < rm->n_grp_count; idx++) + if (buf[idx].recipe_indx == entry->rid) { + buf_idx = idx; + idx_found = true; + } - /* A rule already exists with the new VSI being added */ - if (cur_handle == vsi_handle) { - status = ICE_ERR_ALREADY_EXISTS; - goto exit; + if (!idx_found) { + status = ICE_ERR_OUT_OF_RANGE; + goto err_unroll; } - vsi_handle_arr[0] = cur_handle; - vsi_handle_arr[1] = vsi_handle; - status = ice_create_vsi_list_rule(hw, &vsi_handle_arr[0], 2, - &vsi_list_id, lkup_type); - if (status) - goto exit; + recp = &sw->recp_list[entry->rid]; + is_root = (rm->root_rid == entry->rid); + recp->is_root = is_root; - tmp_fltr = v_list_itr->fltr_info; - tmp_fltr.fltr_rule_id = v_list_itr->fltr_info.fltr_rule_id; - tmp_fltr.fwd_id.vsi_list_id = vsi_list_id; - tmp_fltr.fltr_act = ICE_FWD_TO_VSI_LIST; - /* Update the previous switch rule to a new VSI list which - * includes current VSI that is requested - */ - status = ice_update_pkt_fwd_rule(hw, &tmp_fltr); - if (status) - goto exit; + recp->root_rid = entry->rid; + recp->big_recp = (is_root && rm->n_grp_count > 1); - /* before overriding VSI list map info. decrement ref_cnt of - * previous VSI list + memcpy(&recp->ext_words, entry->r_group.pairs, + entry->r_group.n_val_pairs * sizeof(struct ice_fv_word)); + + memcpy(recp->r_bitmap, buf[buf_idx].recipe_bitmap, + sizeof(recp->r_bitmap)); + + /* Copy non-result fv index values and masks to recipe. This + * call will also update the result recipe bitmask. 
*/ - v_list_itr->vsi_list_info->ref_cnt--; + ice_collect_result_idx(&buf[buf_idx], recp); - /* now update to newly created list */ - v_list_itr->fltr_info.fwd_id.vsi_list_id = vsi_list_id; - v_list_itr->vsi_list_info = - ice_create_vsi_list_map(hw, &vsi_handle_arr[0], 2, - vsi_list_id); - v_list_itr->vsi_count++; + /* for non-root recipes, also copy to the root, this allows + * easier matching of a complete chained recipe + */ + if (!is_root) + ice_collect_result_idx(&buf[buf_idx], + &sw->recp_list[rm->root_rid]); + + recp->n_ext_words = entry->r_group.n_val_pairs; + recp->chain_idx = entry->chain_idx; + recp->priority = buf[buf_idx].content.act_ctrl_fwd_priority; + recp->n_grp_count = rm->n_grp_count; + recp->tun_type = rm->tun_type; + recp->recp_created = true; } + rm->root_buf = buf; + devm_kfree(ice_hw_to_dev(hw), tmp); + return status; -exit: - mutex_unlock(rule_lock); +err_unroll: +err_mem: + devm_kfree(ice_hw_to_dev(hw), tmp); + devm_kfree(ice_hw_to_dev(hw), buf); return status; } /** - * ice_add_vlan - Add VLAN based filter rule - * @hw: pointer to the hardware structure - * @v_list: list of VLAN entries and forwarding information + * ice_create_recipe_group - creates recipe group + * @hw: pointer to hardware structure + * @rm: recipe management list entry + * @lkup_exts: lookup elements */ -enum ice_status -ice_add_vlan(struct ice_hw *hw, struct list_head *v_list) +static enum ice_status +ice_create_recipe_group(struct ice_hw *hw, struct ice_sw_recipe *rm, + struct ice_prot_lkup_ext *lkup_exts) { - struct ice_fltr_list_entry *v_list_itr; + enum ice_status status; + u8 recp_count = 0; - if (!v_list || !hw) - return ICE_ERR_PARAM; + rm->n_grp_count = 0; - list_for_each_entry(v_list_itr, v_list, list_entry) { - if (v_list_itr->fltr_info.lkup_type != ICE_SW_LKUP_VLAN) - return ICE_ERR_PARAM; - v_list_itr->fltr_info.flag = ICE_FLTR_TX; - v_list_itr->status = ice_add_vlan_internal(hw, v_list_itr); - if (v_list_itr->status) - return v_list_itr->status; + /* Create recipes for words that are marked not done by packing them + * as best fit. + */ + status = ice_create_first_fit_recp_def(hw, lkup_exts, + &rm->rg_list, &recp_count); + if (!status) { + rm->n_grp_count += recp_count; + rm->n_ext_words = lkup_exts->n_val_words; + memcpy(&rm->ext_words, lkup_exts->fv_words, + sizeof(rm->ext_words)); + memcpy(rm->word_masks, lkup_exts->field_mask, + sizeof(rm->word_masks)); } - return 0; + + return status; } /** - * ice_add_eth_mac - Add ethertype and MAC based filter rule - * @hw: pointer to the hardware structure - * @em_list: list of ether type MAC filter, MAC is optional - * - * This function requires the caller to populate the entries in - * the filter list with the necessary fields (including flags to - * indicate Tx or Rx rules). + * ice_get_fv - get field vectors/extraction sequences for spec. 
lookup types + * @hw: pointer to hardware structure + * @lkups: lookup elements or match criteria for the advanced recipe, one + * structure per protocol header + * @lkups_cnt: number of protocols + * @bm: bitmap of field vectors to consider + * @fv_list: pointer to a list that holds the returned field vectors */ -enum ice_status -ice_add_eth_mac(struct ice_hw *hw, struct list_head *em_list) +static enum ice_status +ice_get_fv(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, u16 lkups_cnt, + unsigned long *bm, struct list_head *fv_list) { - struct ice_fltr_list_entry *em_list_itr; + enum ice_status status; + u8 *prot_ids; + u16 i; - if (!em_list || !hw) - return ICE_ERR_PARAM; + prot_ids = devm_kcalloc(ice_hw_to_dev(hw), lkups_cnt, + sizeof(*prot_ids), GFP_KERNEL); + if (!prot_ids) + return ICE_ERR_NO_MEMORY; - list_for_each_entry(em_list_itr, em_list, list_entry) { - enum ice_sw_lkup_type l_type = - em_list_itr->fltr_info.lkup_type; + for (i = 0; i < lkups_cnt; i++) + if (!ice_prot_type_to_id(lkups[i].type, &prot_ids[i])) { + status = ICE_ERR_CFG; + goto free_mem; + } - if (l_type != ICE_SW_LKUP_ETHERTYPE_MAC && - l_type != ICE_SW_LKUP_ETHERTYPE) - return ICE_ERR_PARAM; + /* Find field vectors that include all specified protocol types */ + status = ice_get_sw_fv_list(hw, prot_ids, lkups_cnt, bm, fv_list); - em_list_itr->status = ice_add_rule_internal(hw, l_type, - em_list_itr); - if (em_list_itr->status) - return em_list_itr->status; - } - return 0; +free_mem: + devm_kfree(ice_hw_to_dev(hw), prot_ids); + return status; } /** - * ice_remove_eth_mac - Remove an ethertype (or MAC) based filter rule - * @hw: pointer to the hardware structure - * @em_list: list of ethertype or ethertype MAC entries + * ice_tun_type_match_word - determine if tun type needs a match mask + * @tun_type: tunnel type + * @mask: mask to be used for the tunnel */ -enum ice_status -ice_remove_eth_mac(struct ice_hw *hw, struct list_head *em_list) +static bool ice_tun_type_match_word(enum ice_sw_tunnel_type tun_type, u16 *mask) { - struct ice_fltr_list_entry *em_list_itr, *tmp; + switch (tun_type) { + case ICE_SW_TUN_VXLAN_GPE: + case ICE_SW_TUN_GENEVE: + case ICE_SW_TUN_VXLAN: + case ICE_SW_TUN_NVGRE: + case ICE_SW_TUN_UDP: + case ICE_ALL_TUNNELS: + case ICE_SW_TUN_IPV4_GTPU_IPV4: + case ICE_SW_TUN_IPV4_GTPU_IPV6: + case ICE_SW_TUN_IPV6_GTPU_IPV4: + case ICE_SW_TUN_IPV6_GTPU_IPV6: + case ICE_SW_TUN_IPV4_GTP_IPV4_TCP: + case ICE_SW_TUN_IPV4_GTP_IPV4_UDP: + case ICE_SW_TUN_IPV4_GTP_IPV6_TCP: + case ICE_SW_TUN_IPV4_GTP_IPV6_UDP: + case ICE_SW_TUN_IPV6_GTP_IPV4_TCP: + case ICE_SW_TUN_IPV6_GTP_IPV4_UDP: + case ICE_SW_TUN_IPV6_GTP_IPV6_TCP: + case ICE_SW_TUN_IPV6_GTP_IPV6_UDP: + /* support for GTP, using only inner protocols, + * outer protocols can be anything + */ + case ICE_SW_TUN_GTP_IPV4: + case ICE_SW_TUN_GTP_IPV6: + case ICE_SW_TUN_GTP_IPV4_TCP: + case ICE_SW_TUN_GTP_IPV4_UDP: + case ICE_SW_TUN_GTP_IPV6_TCP: + case ICE_SW_TUN_GTP_IPV6_UDP: + *mask = ICE_TUN_FLAG_MASK; + return true; + + case ICE_SW_TUN_GENEVE_VLAN: + case ICE_SW_TUN_VXLAN_VLAN: + *mask = ICE_TUN_FLAG_MASK & ~ICE_TUN_FLAG_VLAN_MASK; + return true; - if (!em_list || !hw) - return ICE_ERR_PARAM; + default: + *mask = 0; + return false; + } +} - list_for_each_entry_safe(em_list_itr, tmp, em_list, list_entry) { - enum ice_sw_lkup_type l_type = - em_list_itr->fltr_info.lkup_type; +/** + * ice_add_special_words - Add words that are not protocols, such as metadata + * @rinfo: other information regarding the rule e.g. 
priority and action info + * @lkup_exts: lookup word structure + */ +static enum ice_status +ice_add_special_words(struct ice_adv_rule_info *rinfo, + struct ice_prot_lkup_ext *lkup_exts) +{ + u16 mask; - if (l_type != ICE_SW_LKUP_ETHERTYPE_MAC && - l_type != ICE_SW_LKUP_ETHERTYPE) - return ICE_ERR_PARAM; + /* If this is a tunneled packet, then add recipe index to match the + * tunnel bit in the packet metadata flags. + */ + if (ice_tun_type_match_word(rinfo->tun_type, &mask)) { + if (lkup_exts->n_val_words < ICE_MAX_CHAIN_WORDS) { + u8 word = lkup_exts->n_val_words++; - em_list_itr->status = ice_remove_rule_internal(hw, l_type, - em_list_itr); - if (em_list_itr->status) - return em_list_itr->status; + lkup_exts->fv_words[word].prot_id = ICE_META_DATA_ID_HW; + lkup_exts->fv_words[word].off = ICE_TUN_FLAG_MDID_OFF; + lkup_exts->field_mask[word] = mask; + } else { + return ICE_ERR_MAX_LIMIT; + } } + return 0; } -/** - * ice_rem_sw_rule_info - * @hw: pointer to the hardware structure - * @rule_head: pointer to the switch list structure that we want to delete +/* ice_get_compat_fv_bitmap - Get compatible field vector bitmap for rule + * @hw: pointer to hardware structure + * @rinfo: other information regarding the rule e.g. priority and action info + * @bm: pointer to memory for returning the bitmap of field vectors */ static void -ice_rem_sw_rule_info(struct ice_hw *hw, struct list_head *rule_head) +ice_get_compat_fv_bitmap(struct ice_hw *hw, struct ice_adv_rule_info *rinfo, + unsigned long *bm) { - if (!list_empty(rule_head)) { - struct ice_fltr_mgmt_list_entry *entry; - struct ice_fltr_mgmt_list_entry *tmp; + enum ice_prof_type prof_type; - list_for_each_entry_safe(entry, tmp, rule_head, list_entry) { - list_del(&entry->list_entry); - devm_kfree(ice_hw_to_dev(hw), entry); - } + bitmap_zero(bm, ICE_MAX_NUM_PROFILES); + + switch (rinfo->tun_type) { + case ICE_NON_TUN: + prof_type = ICE_PROF_NON_TUN; + break; + case ICE_ALL_TUNNELS: + prof_type = ICE_PROF_TUN_ALL; + break; + case ICE_SW_TUN_VXLAN_GPE: + case ICE_SW_TUN_GENEVE: + case ICE_SW_TUN_GENEVE_VLAN: + case ICE_SW_TUN_VXLAN: + case ICE_SW_TUN_VXLAN_VLAN: + case ICE_SW_TUN_UDP: + prof_type = ICE_PROF_TUN_UDP; + break; + + /* Support for GTP tunnel + L3 */ + case ICE_SW_TUN_IPV4_GTPU_IPV4: + case ICE_SW_TUN_IPV4_GTPU_IPV6: + case ICE_SW_TUN_IPV6_GTPU_IPV4: + case ICE_SW_TUN_IPV6_GTPU_IPV6: + /* Support for GTP tunnel + L3 + L4 */ + case ICE_SW_TUN_IPV4_GTP_IPV4_UDP: + case ICE_SW_TUN_IPV4_GTP_IPV6_TCP: + case ICE_SW_TUN_IPV4_GTP_IPV6_UDP: + case ICE_SW_TUN_IPV6_GTP_IPV4_TCP: + case ICE_SW_TUN_IPV6_GTP_IPV4_UDP: + case ICE_SW_TUN_IPV6_GTP_IPV6_TCP: + case ICE_SW_TUN_IPV6_GTP_IPV6_UDP: + case ICE_SW_TUN_GTP_IPV4: + case ICE_SW_TUN_GTP_IPV6: + case ICE_SW_TUN_GTP_IPV4_TCP: + case ICE_SW_TUN_GTP_IPV4_UDP: + case ICE_SW_TUN_GTP_IPV6_TCP: + case ICE_SW_TUN_GTP_IPV6_UDP: + prof_type = ICE_PROF_TUN_UDP; + break; + + case ICE_SW_TUN_NVGRE: + prof_type = ICE_PROF_TUN_GRE; + break; + case ICE_SW_TUN_AND_NON_TUN: + default: + prof_type = ICE_PROF_ALL; + break; } + + ice_get_sw_fv_bitmap(hw, prof_type, bm); } /** - * ice_cfg_dflt_vsi - change state of VSI to set/clear default - * @hw: pointer to the hardware structure - * @vsi_handle: VSI handle to set as default - * @set: true to add the above mentioned switch rule, false to remove it - * @direction: ICE_FLTR_RX or ICE_FLTR_TX - * - * add filter rule to set/unset given VSI as default VSI for the switch - * (represented by swid) + * ice_add_adv_recipe - Add an advanced recipe that is not part of the 
default + * @hw: pointer to hardware structure + * @lkups: lookup elements or match criteria for the advanced recipe, one + * structure per protocol header + * @lkups_cnt: number of protocols + * @rinfo: other information regarding the rule e.g. priority and action info + * @rid: return the recipe ID of the recipe created */ -enum ice_status -ice_cfg_dflt_vsi(struct ice_hw *hw, u16 vsi_handle, bool set, u8 direction) +static enum ice_status +ice_add_adv_recipe(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, + u16 lkups_cnt, struct ice_adv_rule_info *rinfo, u16 *rid) { - struct ice_aqc_sw_rules_elem *s_rule; - struct ice_fltr_info f_info; - enum ice_adminq_opc opcode; - enum ice_status status; - u16 s_rule_size; - u16 hw_vsi_id; + DECLARE_BITMAP(fv_bitmap, ICE_MAX_NUM_PROFILES); + DECLARE_BITMAP(profiles, ICE_MAX_NUM_PROFILES); + struct ice_prot_lkup_ext *lkup_exts; + struct ice_recp_grp_entry *r_entry; + struct ice_sw_fv_list_entry *fvit; + struct ice_recp_grp_entry *r_tmp; + struct ice_sw_fv_list_entry *tmp; + enum ice_status status = 0; + struct ice_sw_recipe *rm; + u16 match_tun_mask = 0; + u16 mask; + u8 i; - if (!ice_is_vsi_valid(hw, vsi_handle)) + if (!lkups_cnt) return ICE_ERR_PARAM; - hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); - s_rule_size = set ? ICE_SW_RULE_RX_TX_ETH_HDR_SIZE : - ICE_SW_RULE_RX_TX_NO_HDR_SIZE; - s_rule = devm_kzalloc(ice_hw_to_dev(hw), s_rule_size, GFP_KERNEL); - if (!s_rule) + lkup_exts = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*lkup_exts), + GFP_KERNEL); + if (!lkup_exts) return ICE_ERR_NO_MEMORY; - memset(&f_info, 0, sizeof(f_info)); + /* Determine the number of words to be matched and if it exceeds a + * recipe's restrictions + */ + for (i = 0; i < lkups_cnt; i++) { + u16 count; + + if (lkups[i].type >= ICE_PROTOCOL_LAST) { + status = ICE_ERR_CFG; + goto err_free_lkup_exts; + } + + count = ice_fill_valid_words(&lkups[i], lkup_exts); + if (!count) { + status = ICE_ERR_CFG; + goto err_free_lkup_exts; + } + } + + rm = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*rm), GFP_KERNEL); + if (!rm) { + status = ICE_ERR_NO_MEMORY; + goto err_free_lkup_exts; + } + + /* Get field vectors that contain fields extracted from all the protocol + * headers being programmed. + */ + INIT_LIST_HEAD(&rm->fv_list); + INIT_LIST_HEAD(&rm->rg_list); + + /* Get bitmap of field vectors (profiles) that are compatible with the + * rule request; only these will be searched in the subsequent call to + * ice_get_fv. + */ + ice_get_compat_fv_bitmap(hw, rinfo, fv_bitmap); + + status = ice_get_fv(hw, lkups, lkups_cnt, fv_bitmap, &rm->fv_list); + if (status) + goto err_unroll; + + + /* Group match words into recipes using preferred recipe grouping + * criteria. + */ + status = ice_create_recipe_group(hw, rm, lkup_exts); + if (status) + goto err_unroll; + + /* For certain tunnel types it is necessary to use a metadata ID flag to + * differentiate different tunnel types. A separate recipe needs to be + * used for the metadata. + */ + if (ice_tun_type_match_word(rinfo->tun_type, &mask) && + rm->n_grp_count > 1) + match_tun_mask = mask; + + /* set the recipe priority if specified */ + rm->priority = (u8)rinfo->priority; + + /* Find offsets from the field vector. Pick the first one for all the + * recipes. 
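+ * ("Offsets" here means positions inside the extraction sequence:
+ * ice_fill_fv_word_index() records, for every protocol/offset pair of
+ * every group, which word of that first field vector extracts it,
+ * together with its mask, and fails with ICE_ERR_PARAM for any pair
+ * the field vector cannot supply.)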
+ */ + status = ice_fill_fv_word_index(hw, &rm->fv_list, &rm->rg_list); + if (status) + goto err_unroll; + + + /* get bitmap of all profiles the recipe will be associated with */ + bitmap_zero(profiles, ICE_MAX_NUM_PROFILES); + list_for_each_entry(fvit, &rm->fv_list, list_entry) { + ice_debug(hw, ICE_DBG_SW, "profile: %d\n", fvit->profile_id); + set_bit((u16)fvit->profile_id, profiles); + } + + /* Create any special protocol/offset pairs, such as looking at tunnel + * bits by extracting metadata + */ + status = ice_add_special_words(rinfo, lkup_exts); + if (status) + goto err_free_lkup_exts; + + /* Look for a recipe which matches our requested fv / mask list */ + *rid = ice_find_recp(hw, lkup_exts); + if (*rid < ICE_MAX_NUM_RECIPES) + /* Success if found a recipe that match the existing criteria */ + goto err_unroll; + + /* Recipe we need does not exist, add a recipe */ + status = ice_add_sw_recipe(hw, rm, match_tun_mask, profiles); + if (status) + goto err_unroll; + + /* Associate all the recipes created with all the profiles in the + * common field vector. + */ + list_for_each_entry(fvit, &rm->fv_list, list_entry) { + DECLARE_BITMAP(r_bitmap, ICE_MAX_NUM_RECIPES); + u16 j; + + status = ice_aq_get_recipe_to_profile(hw, fvit->profile_id, + (u8 *)r_bitmap, NULL); + if (status) + goto err_unroll; + + bitmap_or(r_bitmap, r_bitmap, rm->r_bitmap, + ICE_MAX_NUM_RECIPES); + status = ice_acquire_change_lock(hw, ICE_RES_WRITE); + if (status) + goto err_unroll; + + status = ice_aq_map_recipe_to_profile(hw, fvit->profile_id, + (u8 *)r_bitmap, + NULL); + ice_release_change_lock(hw); + + if (status) + goto err_unroll; + + /* Update profile to recipe bitmap array */ + bitmap_copy(profile_to_recipe[fvit->profile_id], r_bitmap, + ICE_MAX_NUM_RECIPES); + + /* Update recipe to profile bitmap array */ + for_each_set_bit(j, rm->r_bitmap, ICE_MAX_NUM_RECIPES) + set_bit((u16)fvit->profile_id, recipe_to_profile[j]); + } + + *rid = rm->root_rid; + memcpy(&hw->switch_info->recp_list[*rid].lkup_exts, lkup_exts, + sizeof(*lkup_exts)); +err_unroll: + list_for_each_entry_safe(r_entry, r_tmp, &rm->rg_list, l_entry) { + list_del(&r_entry->l_entry); + devm_kfree(ice_hw_to_dev(hw), r_entry); + } + + list_for_each_entry_safe(fvit, tmp, &rm->fv_list, list_entry) { + list_del(&fvit->list_entry); + devm_kfree(ice_hw_to_dev(hw), fvit); + } + + if (rm->root_buf) + devm_kfree(ice_hw_to_dev(hw), rm->root_buf); + + devm_kfree(ice_hw_to_dev(hw), rm); + +err_free_lkup_exts: + devm_kfree(ice_hw_to_dev(hw), lkup_exts); + + return status; +} + +/** + * ice_find_dummy_packet - find dummy packet by tunnel type + * + * @lkups: lookup elements or match criteria for the advanced recipe, one + * structure per protocol header + * @lkups_cnt: number of protocols + * @tun_type: tunnel type from the match criteria + * @pkt: dummy packet to fill according to filter match criteria + * @pkt_len: packet length of dummy packet + * @offsets: pointer to receive the pointer to the offsets for the packet + */ +static void +ice_find_dummy_packet(struct ice_adv_lkup_elem *lkups, u16 lkups_cnt, + enum ice_sw_tunnel_type tun_type, const u8 **pkt, + u16 *pkt_len, + const struct ice_dummy_pkt_offsets **offsets) +{ + bool tcp = false, udp = false, ipv6 = false, vlan = false; + u16 i; + + for (i = 0; i < lkups_cnt; i++) { + if (lkups[i].type == ICE_UDP_ILOS) + udp = true; + else if (lkups[i].type == ICE_TCP_IL) + tcp = true; + else if (lkups[i].type == ICE_IPV6_OFOS) + ipv6 = true; + else if (lkups[i].type == ICE_VLAN_OFOS) + vlan = true; + } + + /* figure out 
which dummy packet and dummy offset to use if user + * wants to add filter for GTP (UDP based tunnel, where tunnel port for + * GTP is fixed, 2152) tunnel where inner/outer L3 could be IPv4[6] and + * likewise inner L4 could be TCP/UDP + */ + if (tun_type == ICE_SW_TUN_IPV4_GTP_IPV4_TCP || + tun_type == ICE_SW_TUN_GTP_IPV4_TCP) { + *pkt = dummy_ipv4_gtpu_ipv4_tcp_packet; + *pkt_len = sizeof(dummy_ipv4_gtpu_ipv4_tcp_packet); + *offsets = dummy_ipv4_gtpu_ipv4_tcp_packet_offsets; + return; + } + + if (tun_type == ICE_SW_TUN_IPV4_GTP_IPV4_UDP || + tun_type == ICE_SW_TUN_GTP_IPV4_UDP) { + *pkt = dummy_ipv4_gtpu_ipv4_udp_packet; + *pkt_len = sizeof(dummy_ipv4_gtpu_ipv4_udp_packet); + *offsets = dummy_ipv4_gtpu_ipv4_udp_packet_offsets; + return; + } + + if (tun_type == ICE_SW_TUN_IPV4_GTP_IPV6_TCP || + tun_type == ICE_SW_TUN_GTP_IPV6_TCP) { + *pkt = dummy_ipv4_gtpu_ipv6_tcp_packet; + *pkt_len = sizeof(dummy_ipv4_gtpu_ipv6_tcp_packet); + *offsets = dummy_ipv4_gtpu_ipv6_tcp_packet_offsets; + return; + } + + if (tun_type == ICE_SW_TUN_IPV4_GTP_IPV6_UDP || + tun_type == ICE_SW_TUN_GTP_IPV6_UDP) { + *pkt = dummy_ipv4_gtpu_ipv6_udp_packet; + *pkt_len = sizeof(dummy_ipv4_gtpu_ipv6_udp_packet); + *offsets = dummy_ipv4_gtpu_ipv6_udp_packet_offsets; + return; + } + + if (tun_type == ICE_SW_TUN_IPV6_GTP_IPV4_TCP) { + *pkt = dummy_ipv6_gtpu_ipv4_tcp_packet; + *pkt_len = sizeof(dummy_ipv6_gtpu_ipv4_tcp_packet); + *offsets = dummy_ipv6_gtpu_ipv4_tcp_packet_offsets; + return; + } + + if (tun_type == ICE_SW_TUN_IPV6_GTP_IPV4_UDP) { + *pkt = dummy_ipv6_gtpu_ipv4_udp_packet; + *pkt_len = sizeof(dummy_ipv6_gtpu_ipv4_udp_packet); + *offsets = dummy_ipv6_gtpu_ipv4_udp_packet_offsets; + return; + } + + if (tun_type == ICE_SW_TUN_IPV6_GTP_IPV6_TCP) { + *pkt = dummy_ipv6_gtpu_ipv6_tcp_packet; + *pkt_len = sizeof(dummy_ipv6_gtpu_ipv6_tcp_packet); + *offsets = dummy_ipv6_gtpu_ipv6_tcp_packet_offsets; + return; + } + + if (tun_type == ICE_SW_TUN_IPV6_GTP_IPV6_UDP) { + *pkt = dummy_ipv6_gtpu_ipv6_udp_packet; + *pkt_len = sizeof(dummy_ipv6_gtpu_ipv6_udp_packet); + *offsets = dummy_ipv6_gtpu_ipv6_udp_packet_offsets; + return; + } + + /* Support GTP tunnel + L3 */ + if (tun_type == ICE_SW_TUN_IPV4_GTPU_IPV4 || + tun_type == ICE_SW_TUN_GTP_IPV4) { + *pkt = dummy_ipv4_gtpu_ipv4_packet; + *pkt_len = sizeof(dummy_ipv4_gtpu_ipv4_packet); + *offsets = dummy_ipv4_gtpu_ipv4_packet_offsets; + return; + } + if (tun_type == ICE_SW_TUN_IPV4_GTPU_IPV6 || + tun_type == ICE_SW_TUN_GTP_IPV6) { + *pkt = dummy_ipv4_gtpu_ipv6_packet; + *pkt_len = sizeof(dummy_ipv4_gtpu_ipv6_packet); + *offsets = dummy_ipv4_gtpu_ipv6_packet_offsets; + return; + } + if (tun_type == ICE_SW_TUN_IPV6_GTPU_IPV4) { + *pkt = dummy_ipv6_gtpu_ipv4_packet; + *pkt_len = sizeof(dummy_ipv6_gtpu_ipv4_packet); + *offsets = dummy_ipv6_gtpu_ipv4_packet_offsets; + return; + } + if (tun_type == ICE_SW_TUN_IPV6_GTPU_IPV6) { + *pkt = dummy_ipv6_gtpu_ipv6_packet; + *pkt_len = sizeof(dummy_ipv6_gtpu_ipv6_packet); + *offsets = dummy_ipv6_gtpu_ipv6_packet_offsets; + return; + } + + if (tun_type == ICE_ALL_TUNNELS) { + *pkt = dummy_gre_udp_packet; + *pkt_len = sizeof(dummy_gre_udp_packet); + *offsets = dummy_gre_udp_packet_offsets; + return; + } - f_info.lkup_type = ICE_SW_LKUP_DFLT; - f_info.flag = direction; - f_info.fltr_act = ICE_FWD_TO_VSI; - f_info.fwd_id.hw_vsi_id = hw_vsi_id; + if (tun_type == ICE_SW_TUN_NVGRE) { + if (tcp) { + *pkt = dummy_gre_tcp_packet; + *pkt_len = sizeof(dummy_gre_tcp_packet); + *offsets = dummy_gre_tcp_packet_offsets; + return; + } - if (f_info.flag & 
ICE_FLTR_RX) { - f_info.src = hw->port_info->lport; - f_info.src_id = ICE_SRC_ID_LPORT; - if (!set) - f_info.fltr_rule_id = - hw->port_info->dflt_rx_vsi_rule_id; - } else if (f_info.flag & ICE_FLTR_TX) { - f_info.src_id = ICE_SRC_ID_VSI; - f_info.src = hw_vsi_id; - if (!set) - f_info.fltr_rule_id = - hw->port_info->dflt_tx_vsi_rule_id; + *pkt = dummy_gre_udp_packet; + *pkt_len = sizeof(dummy_gre_udp_packet); + *offsets = dummy_gre_udp_packet_offsets; + return; } - if (set) - opcode = ice_aqc_opc_add_sw_rules; - else - opcode = ice_aqc_opc_remove_sw_rules; - - ice_fill_sw_rule(hw, &f_info, s_rule, opcode); + if (tun_type == ICE_SW_TUN_VXLAN || tun_type == ICE_SW_TUN_GENEVE || + tun_type == ICE_SW_TUN_VXLAN_GPE || tun_type == ICE_SW_TUN_UDP || + tun_type == ICE_SW_TUN_GENEVE_VLAN || + tun_type == ICE_SW_TUN_VXLAN_VLAN) { + if (tcp) { + *pkt = dummy_udp_tun_tcp_packet; + *pkt_len = sizeof(dummy_udp_tun_tcp_packet); + *offsets = dummy_udp_tun_tcp_packet_offsets; + return; + } - status = ice_aq_sw_rules(hw, s_rule, s_rule_size, 1, opcode, NULL); - if (status || !(f_info.flag & ICE_FLTR_TX_RX)) - goto out; - if (set) { - u16 index = le16_to_cpu(s_rule->pdata.lkup_tx_rx.index); + *pkt = dummy_udp_tun_udp_packet; + *pkt_len = sizeof(dummy_udp_tun_udp_packet); + *offsets = dummy_udp_tun_udp_packet_offsets; + return; + } - if (f_info.flag & ICE_FLTR_TX) { - hw->port_info->dflt_tx_vsi_num = hw_vsi_id; - hw->port_info->dflt_tx_vsi_rule_id = index; - } else if (f_info.flag & ICE_FLTR_RX) { - hw->port_info->dflt_rx_vsi_num = hw_vsi_id; - hw->port_info->dflt_rx_vsi_rule_id = index; + if (udp && !ipv6) { + if (vlan) { + *pkt = dummy_vlan_udp_packet; + *pkt_len = sizeof(dummy_vlan_udp_packet); + *offsets = dummy_vlan_udp_packet_offsets; + return; } - } else { - if (f_info.flag & ICE_FLTR_TX) { - hw->port_info->dflt_tx_vsi_num = ICE_DFLT_VSI_INVAL; - hw->port_info->dflt_tx_vsi_rule_id = ICE_INVAL_ACT; - } else if (f_info.flag & ICE_FLTR_RX) { - hw->port_info->dflt_rx_vsi_num = ICE_DFLT_VSI_INVAL; - hw->port_info->dflt_rx_vsi_rule_id = ICE_INVAL_ACT; + *pkt = dummy_udp_packet; + *pkt_len = sizeof(dummy_udp_packet); + *offsets = dummy_udp_packet_offsets; + return; + } else if (udp && ipv6) { + if (vlan) { + *pkt = dummy_vlan_udp_ipv6_packet; + *pkt_len = sizeof(dummy_vlan_udp_ipv6_packet); + *offsets = dummy_vlan_udp_ipv6_packet_offsets; + return; } + *pkt = dummy_udp_ipv6_packet; + *pkt_len = sizeof(dummy_udp_ipv6_packet); + *offsets = dummy_udp_ipv6_packet_offsets; + return; + } else if ((tcp && ipv6) || ipv6) { + if (vlan) { + *pkt = dummy_vlan_tcp_ipv6_packet; + *pkt_len = sizeof(dummy_vlan_tcp_ipv6_packet); + *offsets = dummy_vlan_tcp_ipv6_packet_offsets; + return; + } + *pkt = dummy_tcp_ipv6_packet; + *pkt_len = sizeof(dummy_tcp_ipv6_packet); + *offsets = dummy_tcp_ipv6_packet_offsets; + return; } -out: - devm_kfree(ice_hw_to_dev(hw), s_rule); - return status; -} - -/** - * ice_find_ucast_rule_entry - Search for a unicast MAC filter rule entry - * @hw: pointer to the hardware structure - * @recp_id: lookup type for which the specified rule needs to be searched - * @f_info: rule information - * - * Helper function to search for a unicast rule entry - this is to be used - * to remove unicast MAC filter that is not shared with other VSIs on the - * PF switch. 
- * - * Returns pointer to entry storing the rule if found - */ -static struct ice_fltr_mgmt_list_entry * -ice_find_ucast_rule_entry(struct ice_hw *hw, u8 recp_id, - struct ice_fltr_info *f_info) -{ - struct ice_switch_info *sw = hw->switch_info; - struct ice_fltr_mgmt_list_entry *list_itr; - struct list_head *list_head; - - list_head = &sw->recp_list[recp_id].filt_rules; - list_for_each_entry(list_itr, list_head, list_entry) { - if (!memcmp(&f_info->l_data, &list_itr->fltr_info.l_data, - sizeof(f_info->l_data)) && - f_info->fwd_id.hw_vsi_id == - list_itr->fltr_info.fwd_id.hw_vsi_id && - f_info->flag == list_itr->fltr_info.flag) - return list_itr; + if (vlan) { + *pkt = dummy_vlan_tcp_packet; + *pkt_len = sizeof(dummy_vlan_tcp_packet); + *offsets = dummy_vlan_tcp_packet_offsets; + } else { + *pkt = dummy_tcp_packet; + *pkt_len = sizeof(dummy_tcp_packet); + *offsets = dummy_tcp_packet_offsets; } - return NULL; } /** - * ice_remove_mac - remove a MAC address based filter rule - * @hw: pointer to the hardware structure - * @m_list: list of MAC addresses and forwarding information - * - * This function removes either a MAC filter rule or a specific VSI from a - * VSI list for a multicast MAC address. + * ice_fill_adv_dummy_packet - fill a dummy packet with given match criteria * - * Returns ICE_ERR_DOES_NOT_EXIST if a given entry was not added by - * ice_add_mac. Caller should be aware that this call will only work if all - * the entries passed into m_list were added previously. It will not attempt to - * do a partial remove of entries that were found. + * @lkups: lookup elements or match criteria for the advanced recipe, one + * structure per protocol header + * @lkups_cnt: number of protocols + * @s_rule: stores rule information from the match criteria + * @dummy_pkt: dummy packet to fill according to filter match criteria + * @pkt_len: packet length of dummy packet + * @offsets: offset info for the dummy packet */ -enum ice_status -ice_remove_mac(struct ice_hw *hw, struct list_head *m_list) +static enum ice_status +ice_fill_adv_dummy_packet(struct ice_adv_lkup_elem *lkups, u16 lkups_cnt, + struct ice_aqc_sw_rules_elem *s_rule, + const u8 *dummy_pkt, u16 pkt_len, + const struct ice_dummy_pkt_offsets *offsets) { - struct ice_fltr_list_entry *list_itr, *tmp; - struct mutex *rule_lock; /* Lock to protect filter rule list */ + u8 *pkt; + u16 i; - if (!m_list) - return ICE_ERR_PARAM; + /* Start with a packet with a pre-defined/dummy content. Then, fill + * in the header values to be looked up or matched. 
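+ * The write position for each lookup comes from the offsets table
+ * that ice_find_dummy_packet() selected together with the dummy
+ * packet; a lookup whose protocol type is not listed in that table is
+ * rejected below with ICE_ERR_PARAM.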
+ */ + pkt = s_rule->pdata.lkup_tx_rx.hdr; - rule_lock = &hw->switch_info->recp_list[ICE_SW_LKUP_MAC].filt_rule_lock; - list_for_each_entry_safe(list_itr, tmp, m_list, list_entry) { - enum ice_sw_lkup_type l_type = list_itr->fltr_info.lkup_type; - u8 *add = &list_itr->fltr_info.l_data.mac.mac_addr[0]; - u16 vsi_handle; + memcpy(pkt, dummy_pkt, pkt_len); - if (l_type != ICE_SW_LKUP_MAC) - return ICE_ERR_PARAM; + for (i = 0; i < lkups_cnt; i++) { + enum ice_protocol_type type; + u16 offset = 0, len = 0, j; + bool found = false; - vsi_handle = list_itr->fltr_info.vsi_handle; - if (!ice_is_vsi_valid(hw, vsi_handle)) + /* find the start of this layer; it should be found since this + * was already checked when search for the dummy packet + */ + type = lkups[i].type; + for (j = 0; offsets[j].type != ICE_PROTOCOL_LAST; j++) { + if (type == offsets[j].type) { + offset = offsets[j].offset; + found = true; + break; + } + } + /* this should never happen in a correct calling sequence */ + if (!found) return ICE_ERR_PARAM; - list_itr->fltr_info.fwd_id.hw_vsi_id = - ice_get_hw_vsi_num(hw, vsi_handle); - if (is_unicast_ether_addr(add) && !hw->ucast_shared) { - /* Don't remove the unicast address that belongs to - * another VSI on the switch, since it is not being - * shared... - */ - mutex_lock(rule_lock); - if (!ice_find_ucast_rule_entry(hw, ICE_SW_LKUP_MAC, - &list_itr->fltr_info)) { - mutex_unlock(rule_lock); - return ICE_ERR_DOES_NOT_EXIST; - } - mutex_unlock(rule_lock); + switch (lkups[i].type) { + case ICE_MAC_OFOS: + case ICE_MAC_IL: + len = sizeof(struct ice_ether_hdr); + break; + case ICE_ETYPE_OL: + len = sizeof(struct ice_ethtype_hdr); + break; + case ICE_VLAN_OFOS: + len = sizeof(struct ice_vlan_hdr); + break; + case ICE_IPV4_OFOS: + case ICE_IPV4_IL: + len = sizeof(struct ice_ipv4_hdr); + break; + case ICE_IPV6_OFOS: + case ICE_IPV6_IL: + len = sizeof(struct ice_ipv6_hdr); + break; + case ICE_TCP_IL: + case ICE_UDP_OF: + case ICE_UDP_ILOS: + len = sizeof(struct ice_l4_hdr); + break; + case ICE_SCTP_IL: + len = sizeof(struct ice_sctp_hdr); + break; + case ICE_NVGRE: + len = sizeof(struct ice_nvgre); + break; + case ICE_VXLAN: + case ICE_GENEVE: + case ICE_VXLAN_GPE: + len = sizeof(struct ice_udp_tnl_hdr); + break; + + case ICE_GTP: + len = sizeof(struct ice_udp_gtp_hdr); + break; + default: + return ICE_ERR_PARAM; } - list_itr->status = ice_remove_rule_internal(hw, - ICE_SW_LKUP_MAC, - list_itr); - if (list_itr->status) - return list_itr->status; + + /* the length should be a word multiple */ + if (len % ICE_BYTES_PER_WORD) + return ICE_ERR_CFG; + + /* We have the offset to the header start, the length, the + * caller's header values and mask. Use this information to + * copy the data into the dummy packet appropriately based on + * the mask. Note that we need to only write the bits as + * indicated by the mask to make sure we don't improperly write + * over any significant packet data. 
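+ * Concretely, for every 16-bit word whose mask is non-zero the loop
+ * below computes
+ *     pkt_word = (pkt_word & ~mask_word) | (hdr_word & mask_word);
+ * so only the masked bits of the caller's header value land in the
+ * dummy packet.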
+ */ + for (j = 0; j < len / sizeof(u16); j++) + /* cppcheck-suppress objectIndex */ + if (((u16 *)&lkups[i].m_u)[j]) + ((u16 *)(pkt + offset))[j] = + (((u16 *)(pkt + offset))[j] & + /* cppcheck-suppress objectIndex */ + ~((u16 *)&lkups[i].m_u)[j]) | + /* cppcheck-suppress objectIndex */ + (((u16 *)&lkups[i].h_u)[j] & + /* cppcheck-suppress objectIndex */ + ((u16 *)&lkups[i].m_u)[j]); } + + s_rule->pdata.lkup_tx_rx.hdr_len = cpu_to_le16(pkt_len); + return 0; } /** - * ice_remove_vlan - Remove VLAN based filter rule + * ice_fill_adv_packet_tun - fill dummy packet with udp tunnel port * @hw: pointer to the hardware structure - * @v_list: list of VLAN entries and forwarding information + * @tun_type: tunnel type + * @pkt: dummy packet to fill in + * @offsets: offset info for the dummy packet */ -enum ice_status -ice_remove_vlan(struct ice_hw *hw, struct list_head *v_list) +static enum ice_status +ice_fill_adv_packet_tun(struct ice_hw *hw, enum ice_sw_tunnel_type tun_type, + u8 *pkt, const struct ice_dummy_pkt_offsets *offsets) { - struct ice_fltr_list_entry *v_list_itr, *tmp; + u16 open_port, i; + + switch (tun_type) { + case ICE_SW_TUN_AND_NON_TUN: + case ICE_SW_TUN_VXLAN_GPE: + case ICE_SW_TUN_VXLAN: + case ICE_SW_TUN_VXLAN_VLAN: + case ICE_SW_TUN_UDP: + if (!ice_get_open_tunnel_port(hw, TNL_VXLAN, &open_port)) + return ICE_ERR_CFG; + break; - if (!v_list || !hw) - return ICE_ERR_PARAM; + case ICE_SW_TUN_GENEVE: + case ICE_SW_TUN_GENEVE_VLAN: + if (!ice_get_open_tunnel_port(hw, TNL_GENEVE, &open_port)) + return ICE_ERR_CFG; + break; - list_for_each_entry_safe(v_list_itr, tmp, v_list, list_entry) { - enum ice_sw_lkup_type l_type = v_list_itr->fltr_info.lkup_type; + default: + /* Nothing needs to be done for this tunnel type */ + return 0; + } - if (l_type != ICE_SW_LKUP_VLAN) - return ICE_ERR_PARAM; - v_list_itr->status = ice_remove_rule_internal(hw, - ICE_SW_LKUP_VLAN, - v_list_itr); - if (v_list_itr->status) - return v_list_itr->status; + /* Find the outer UDP protocol header and insert the port number */ + for (i = 0; offsets[i].type != ICE_PROTOCOL_LAST; i++) { + if (offsets[i].type == ICE_UDP_OF) { + struct ice_l4_hdr *hdr; + u16 offset; + + offset = offsets[i].offset; + hdr = (struct ice_l4_hdr *)&pkt[offset]; + hdr->dst_port = cpu_to_be16(open_port); + + return 0; + } } - return 0; -} -/** - * ice_vsi_uses_fltr - Determine if given VSI uses specified filter - * @fm_entry: filter entry to inspect - * @vsi_handle: VSI handle to compare with filter info - */ -static bool -ice_vsi_uses_fltr(struct ice_fltr_mgmt_list_entry *fm_entry, u16 vsi_handle) -{ - return ((fm_entry->fltr_info.fltr_act == ICE_FWD_TO_VSI && - fm_entry->fltr_info.vsi_handle == vsi_handle) || - (fm_entry->fltr_info.fltr_act == ICE_FWD_TO_VSI_LIST && - fm_entry->vsi_list_info && - (test_bit(vsi_handle, fm_entry->vsi_list_info->vsi_map)))); + return ICE_ERR_CFG; } /** - * ice_add_entry_to_vsi_fltr_list - Add copy of fltr_list_entry to remove list + * ice_find_adv_rule_entry - Search a rule entry * @hw: pointer to the hardware structure - * @vsi_handle: VSI handle to remove filters from - * @vsi_list_head: pointer to the list to add entry to - * @fi: pointer to fltr_info of filter entry to copy & add + * @lkups: lookup elements or match criteria for the advanced recipe, one + * structure per protocol header + * @lkups_cnt: number of protocols + * @recp_id: recipe ID for which we are finding the rule + * @rinfo: other information regarding the rule e.g. 
priority and action info * - * Helper function, used when creating a list of filters to remove from - * a specific VSI. The entry added to vsi_list_head is a COPY of the - * original filter entry, with the exception of fltr_info.fltr_act and - * fltr_info.fwd_id fields. These are set such that later logic can - * extract which VSI to remove the fltr from, and pass on that information. + * Helper function to search for a given advance rule entry + * Returns pointer to entry storing the rule if found */ -static enum ice_status -ice_add_entry_to_vsi_fltr_list(struct ice_hw *hw, u16 vsi_handle, - struct list_head *vsi_list_head, - struct ice_fltr_info *fi) +static struct ice_adv_fltr_mgmt_list_entry * +ice_find_adv_rule_entry(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, + u16 lkups_cnt, u16 recp_id, + struct ice_adv_rule_info *rinfo) { - struct ice_fltr_list_entry *tmp; - - /* this memory is freed up in the caller function - * once filters for this VSI are removed - */ - tmp = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*tmp), GFP_KERNEL); - if (!tmp) - return ICE_ERR_NO_MEMORY; - - tmp->fltr_info = *fi; - - /* Overwrite these fields to indicate which VSI to remove filter from, - * so find and remove logic can extract the information from the - * list entries. Note that original entries will still have proper - * values. - */ - tmp->fltr_info.fltr_act = ICE_FWD_TO_VSI; - tmp->fltr_info.vsi_handle = vsi_handle; - tmp->fltr_info.fwd_id.hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); + struct ice_adv_fltr_mgmt_list_entry *list_itr; + struct ice_switch_info *sw = hw->switch_info; + int i; - list_add(&tmp->list_entry, vsi_list_head); + list_for_each_entry(list_itr, &sw->recp_list[recp_id].filt_rules, + list_entry) { + bool lkups_matched = true; - return 0; + if (lkups_cnt != list_itr->lkups_cnt) + continue; + for (i = 0; i < list_itr->lkups_cnt; i++) + if (memcmp(&list_itr->lkups[i], &lkups[i], + sizeof(*lkups))) { + lkups_matched = false; + break; + } + if (rinfo->sw_act.flag == list_itr->rule_info.sw_act.flag && + rinfo->tun_type == list_itr->rule_info.tun_type && + lkups_matched) + return list_itr; + } + return NULL; } /** - * ice_add_to_vsi_fltr_list - Add VSI filters to the list + * ice_adv_add_update_vsi_list * @hw: pointer to the hardware structure - * @vsi_handle: VSI handle to remove filters from - * @lkup_list_head: pointer to the list that has certain lookup type filters - * @vsi_list_head: pointer to the list pertaining to VSI with vsi_handle + * @m_entry: pointer to current adv filter management list entry + * @cur_fltr: filter information from the book keeping entry + * @new_fltr: filter information with the new VSI to be added * - * Locates all filters in lkup_list_head that are used by the given VSI, - * and adds COPIES of those entries to vsi_list_head (intended to be used - * to remove the listed filters). - * Note that this means all entries in vsi_list_head must be explicitly - * deallocated by the caller when done with list. + * Call AQ command to add or update previously created VSI list with new VSI. 
+ * + * Helper function to do book keeping associated with adding filter information + * The algorithm to do the booking keeping is described below : + * When a VSI needs to subscribe to a given advanced filter + * if only one VSI has been added till now + * Allocate a new VSI list and add two VSIs + * to this list using switch rule command + * Update the previously created switch rule with the + * newly created VSI list ID + * if a VSI list was previously created + * Add the new VSI to the previously created VSI list set + * using the update switch rule command */ static enum ice_status -ice_add_to_vsi_fltr_list(struct ice_hw *hw, u16 vsi_handle, - struct list_head *lkup_list_head, - struct list_head *vsi_list_head) +ice_adv_add_update_vsi_list(struct ice_hw *hw, + struct ice_adv_fltr_mgmt_list_entry *m_entry, + struct ice_adv_rule_info *cur_fltr, + struct ice_adv_rule_info *new_fltr) { - struct ice_fltr_mgmt_list_entry *fm_entry; - enum ice_status status = 0; + enum ice_status status; + u16 vsi_list_id = 0; + + if (cur_fltr->sw_act.fltr_act == ICE_FWD_TO_Q || + cur_fltr->sw_act.fltr_act == ICE_FWD_TO_QGRP || + cur_fltr->sw_act.fltr_act == ICE_DROP_PACKET) + return ICE_ERR_NOT_IMPL; + + if ((new_fltr->sw_act.fltr_act == ICE_FWD_TO_Q || + new_fltr->sw_act.fltr_act == ICE_FWD_TO_QGRP) && + (cur_fltr->sw_act.fltr_act == ICE_FWD_TO_VSI || + cur_fltr->sw_act.fltr_act == ICE_FWD_TO_VSI_LIST)) + return ICE_ERR_NOT_IMPL; - /* check to make sure VSI ID is valid and within boundary */ - if (!ice_is_vsi_valid(hw, vsi_handle)) - return ICE_ERR_PARAM; + if (m_entry->vsi_count < 2 && !m_entry->vsi_list_info) { + /* Only one entry existed in the mapping and it was not already + * a part of a VSI list. So, create a VSI list with the old and + * new VSIs. + */ + struct ice_fltr_info tmp_fltr; + u16 vsi_handle_arr[2]; - list_for_each_entry(fm_entry, lkup_list_head, list_entry) { - if (!ice_vsi_uses_fltr(fm_entry, vsi_handle)) - continue; + /* A rule already exists with the new VSI being added */ + if (cur_fltr->sw_act.fwd_id.hw_vsi_id == + new_fltr->sw_act.fwd_id.hw_vsi_id) + return ICE_ERR_ALREADY_EXISTS; - status = ice_add_entry_to_vsi_fltr_list(hw, vsi_handle, - vsi_list_head, - &fm_entry->fltr_info); + vsi_handle_arr[0] = cur_fltr->sw_act.vsi_handle; + vsi_handle_arr[1] = new_fltr->sw_act.vsi_handle; + status = ice_create_vsi_list_rule(hw, &vsi_handle_arr[0], 2, + &vsi_list_id, + ICE_SW_LKUP_LAST); if (status) return status; - } - return status; -} -/** - * ice_determine_promisc_mask - * @fi: filter info to parse - * - * Helper function to determine which ICE_PROMISC_ mask corresponds - * to given filter into. - */ -static u8 ice_determine_promisc_mask(struct ice_fltr_info *fi) -{ - u16 vid = fi->l_data.mac_vlan.vlan_id; - u8 *macaddr = fi->l_data.mac.mac_addr; - bool is_tx_fltr = false; - u8 promisc_mask = 0; + memset(&tmp_fltr, 0, sizeof(tmp_fltr)); + tmp_fltr.flag = m_entry->rule_info.sw_act.flag; + tmp_fltr.fltr_rule_id = cur_fltr->fltr_rule_id; + tmp_fltr.fltr_act = ICE_FWD_TO_VSI_LIST; + tmp_fltr.fwd_id.vsi_list_id = vsi_list_id; + tmp_fltr.lkup_type = ICE_SW_LKUP_LAST; - if (fi->flag == ICE_FLTR_TX) - is_tx_fltr = true; + /* Update the previous switch rule of "forward to VSI" to + * "fwd to VSI list" + */ + status = ice_update_pkt_fwd_rule(hw, &tmp_fltr); + if (status) + return status; - if (is_broadcast_ether_addr(macaddr)) - promisc_mask |= is_tx_fltr ? - ICE_PROMISC_BCAST_TX : ICE_PROMISC_BCAST_RX; - else if (is_multicast_ether_addr(macaddr)) - promisc_mask |= is_tx_fltr ? 
- ICE_PROMISC_MCAST_TX : ICE_PROMISC_MCAST_RX; - else if (is_unicast_ether_addr(macaddr)) - promisc_mask |= is_tx_fltr ? - ICE_PROMISC_UCAST_TX : ICE_PROMISC_UCAST_RX; - if (vid) - promisc_mask |= is_tx_fltr ? - ICE_PROMISC_VLAN_TX : ICE_PROMISC_VLAN_RX; + cur_fltr->sw_act.fwd_id.vsi_list_id = vsi_list_id; + cur_fltr->sw_act.fltr_act = ICE_FWD_TO_VSI_LIST; + m_entry->vsi_list_info = + ice_create_vsi_list_map(hw, &vsi_handle_arr[0], 2, + vsi_list_id); + } else { + u16 vsi_handle = new_fltr->sw_act.vsi_handle; - return promisc_mask; -} + if (!m_entry->vsi_list_info) + return ICE_ERR_CFG; -/** - * ice_remove_promisc - Remove promisc based filter rules - * @hw: pointer to the hardware structure - * @recp_id: recipe ID for which the rule needs to removed - * @v_list: list of promisc entries - */ -static enum ice_status -ice_remove_promisc(struct ice_hw *hw, u8 recp_id, - struct list_head *v_list) -{ - struct ice_fltr_list_entry *v_list_itr, *tmp; + /* A rule already exists with the new VSI being added */ + if (test_bit(vsi_handle, m_entry->vsi_list_info->vsi_map)) + return 0; - list_for_each_entry_safe(v_list_itr, tmp, v_list, list_entry) { - v_list_itr->status = - ice_remove_rule_internal(hw, recp_id, v_list_itr); - if (v_list_itr->status) - return v_list_itr->status; + /* Update the previously created VSI list set with + * the new VSI ID passed in + */ + vsi_list_id = cur_fltr->sw_act.fwd_id.vsi_list_id; + + status = ice_update_vsi_list_rule(hw, &vsi_handle, 1, + vsi_list_id, false, + ice_aqc_opc_update_sw_rules, + ICE_SW_LKUP_LAST); + /* update VSI list mapping info with new VSI ID */ + if (!status) + set_bit(vsi_handle, m_entry->vsi_list_info->vsi_map); } - return 0; + if (!status) + m_entry->vsi_count++; + return status; } /** - * ice_clear_vsi_promisc - clear specified promiscuous mode(s) for given VSI + * ice_add_adv_rule - helper function to create an advanced switch rule * @hw: pointer to the hardware structure - * @vsi_handle: VSI handle to clear mode - * @promisc_mask: mask of promiscuous config bits to clear - * @vid: VLAN ID to clear VLAN promiscuous + * @lkups: information on the words that needs to be looked up. All words + * together makes one recipe + * @lkups_cnt: num of entries in the lkups array + * @rinfo: other information related to the rule that needs to be programmed + * @added_entry: this will return recipe_id, rule_id and vsi_handle. should be + * ignored is case of error. + * + * This function can program only 1 rule at a time. The lkups is used to + * describe the all the words that forms the "lookup" portion of the recipe. + * These words can span multiple protocols. Callers to this function need to + * pass in a list of protocol headers with lookup information along and mask + * that determines which words are valid from the given protocol header. + * rinfo describes other information related to this rule such as forwarding + * IDs, priority of this rule, etc. 
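+ *
+ * A minimal, purely illustrative caller that forwards one destination
+ * MAC to a VSI could look roughly like the sketch below ("mac",
+ * "vsi_handle" and "hw" are assumed to exist in the caller; this is
+ * not lifted from a real consumer of the API):
+ *
+ *	struct ice_rule_query_data added = { 0 };
+ *	struct ice_adv_rule_info rinfo = { 0 };
+ *	struct ice_adv_lkup_elem lkup = { 0 };
+ *	enum ice_status err;
+ *
+ *	lkup.type = ICE_MAC_OFOS;
+ *	ether_addr_copy(lkup.h_u.eth_hdr.dst_addr, mac);
+ *	eth_broadcast_addr(lkup.m_u.eth_hdr.dst_addr);
+ *
+ *	rinfo.tun_type = ICE_NON_TUN;
+ *	rinfo.rx = true;
+ *	rinfo.sw_act.flag = ICE_FLTR_RX;
+ *	rinfo.sw_act.fltr_act = ICE_FWD_TO_VSI;
+ *	rinfo.sw_act.vsi_handle = vsi_handle;
+ *
+ *	err = ice_add_adv_rule(hw, &lkup, 1, &rinfo, &added);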
*/ enum ice_status -ice_clear_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, - u16 vid) +ice_add_adv_rule(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, + u16 lkups_cnt, struct ice_adv_rule_info *rinfo, + struct ice_rule_query_data *added_entry) { - struct ice_switch_info *sw = hw->switch_info; - struct ice_fltr_list_entry *fm_entry, *tmp; - struct list_head remove_list_head; - struct ice_fltr_mgmt_list_entry *itr; + struct ice_adv_fltr_mgmt_list_entry *m_entry, *adv_fltr = NULL; + u16 rid = 0, i, pkt_len, rule_buf_sz, vsi_handle; + const struct ice_dummy_pkt_offsets *pkt_offsets; + struct ice_aqc_sw_rules_elem *s_rule = NULL; struct list_head *rule_head; - struct mutex *rule_lock; /* Lock to protect filter rule list */ - enum ice_status status = 0; - u8 recipe_id; + struct ice_switch_info *sw; + enum ice_status status; + const u8 *pkt = NULL; + u16 word_cnt; + u32 act = 0; + u8 q_rgn; - if (!ice_is_vsi_valid(hw, vsi_handle)) + /* Initialize profile to result index bitmap */ + if (!hw->switch_info->prof_res_bm_init) { + hw->switch_info->prof_res_bm_init = 1; + ice_init_prof_result_bm(hw); + } + + if (!lkups_cnt) return ICE_ERR_PARAM; - if (vid) - recipe_id = ICE_SW_LKUP_PROMISC_VLAN; - else - recipe_id = ICE_SW_LKUP_PROMISC; + /* get # of words we need to match */ + word_cnt = 0; + for (i = 0; i < lkups_cnt; i++) { + u16 j, *ptr; - rule_head = &sw->recp_list[recipe_id].filt_rules; - rule_lock = &sw->recp_list[recipe_id].filt_rule_lock; + ptr = (u16 *)&lkups[i].m_u; + for (j = 0; j < sizeof(lkups->m_u) / sizeof(u16); j++) + /* cppcheck-suppress objectIndex */ + if (ptr[j] != 0) + word_cnt++; + } - INIT_LIST_HEAD(&remove_list_head); + if (!word_cnt || word_cnt > ICE_MAX_CHAIN_WORDS) + return ICE_ERR_PARAM; - mutex_lock(rule_lock); - list_for_each_entry(itr, rule_head, list_entry) { - u8 fltr_promisc_mask = 0; + /* make sure that we can locate a dummy packet */ + ice_find_dummy_packet(lkups, lkups_cnt, rinfo->tun_type, &pkt, &pkt_len, + &pkt_offsets); + if (!pkt) { + status = ICE_ERR_PARAM; + goto err_ice_add_adv_rule; + } - if (!ice_vsi_uses_fltr(itr, vsi_handle)) - continue; + if (!(rinfo->sw_act.fltr_act == ICE_FWD_TO_VSI || + rinfo->sw_act.fltr_act == ICE_FWD_TO_Q || + rinfo->sw_act.fltr_act == ICE_FWD_TO_QGRP || + rinfo->sw_act.fltr_act == ICE_DROP_PACKET)) + return ICE_ERR_CFG; - fltr_promisc_mask |= - ice_determine_promisc_mask(&itr->fltr_info); + vsi_handle = rinfo->sw_act.vsi_handle; + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; - /* Skip if filter is not completely specified by given mask */ - if (fltr_promisc_mask & ~promisc_mask) - continue; + if (rinfo->sw_act.fltr_act == ICE_FWD_TO_VSI) + rinfo->sw_act.fwd_id.hw_vsi_id = + ice_get_hw_vsi_num(hw, vsi_handle); + if (rinfo->sw_act.flag & ICE_FLTR_TX) + rinfo->sw_act.src = ice_get_hw_vsi_num(hw, vsi_handle); - status = ice_add_entry_to_vsi_fltr_list(hw, vsi_handle, - &remove_list_head, - &itr->fltr_info); - if (status) { - mutex_unlock(rule_lock); - goto free_fltr_list; + status = ice_add_adv_recipe(hw, lkups, lkups_cnt, rinfo, &rid); + if (status) + return status; + m_entry = ice_find_adv_rule_entry(hw, lkups, lkups_cnt, rid, rinfo); + if (m_entry) { + /* we have to add VSI to VSI_LIST and increment vsi_count. 
+ * Also Update VSI list so that we can change forwarding rule + * if the rule already exists, we will check if it exists with + * same vsi_id, if not then add it to the VSI list if it already + * exists if not then create a VSI list and add the existing VSI + * ID and the new VSI ID to the list + * We will add that VSI to the list + */ + status = ice_adv_add_update_vsi_list(hw, m_entry, + &m_entry->rule_info, + rinfo); + if (added_entry) { + added_entry->rid = rid; + added_entry->rule_id = m_entry->rule_info.fltr_rule_id; + added_entry->vsi_handle = rinfo->sw_act.vsi_handle; } + return status; + } + rule_buf_sz = ICE_SW_RULE_RX_TX_NO_HDR_SIZE + pkt_len; + s_rule = devm_kzalloc(ice_hw_to_dev(hw), rule_buf_sz, GFP_KERNEL); + if (!s_rule) + return ICE_ERR_NO_MEMORY; + act |= ICE_SINGLE_ACT_LB_ENABLE | ICE_SINGLE_ACT_LAN_ENABLE; + switch (rinfo->sw_act.fltr_act) { + case ICE_FWD_TO_VSI: + act |= (rinfo->sw_act.fwd_id.hw_vsi_id << + ICE_SINGLE_ACT_VSI_ID_S) & ICE_SINGLE_ACT_VSI_ID_M; + act |= ICE_SINGLE_ACT_VSI_FORWARDING | ICE_SINGLE_ACT_VALID_BIT; + break; + case ICE_FWD_TO_Q: + act |= ICE_SINGLE_ACT_TO_Q; + act |= (rinfo->sw_act.fwd_id.q_id << ICE_SINGLE_ACT_Q_INDEX_S) & + ICE_SINGLE_ACT_Q_INDEX_M; + break; + case ICE_FWD_TO_QGRP: + q_rgn = rinfo->sw_act.qgrp_size > 0 ? + (u8)ilog2(rinfo->sw_act.qgrp_size) : 0; + act |= ICE_SINGLE_ACT_TO_Q; + act |= (rinfo->sw_act.fwd_id.q_id << ICE_SINGLE_ACT_Q_INDEX_S) & + ICE_SINGLE_ACT_Q_INDEX_M; + act |= (q_rgn << ICE_SINGLE_ACT_Q_REGION_S) & + ICE_SINGLE_ACT_Q_REGION_M; + break; + case ICE_DROP_PACKET: + act |= ICE_SINGLE_ACT_VSI_FORWARDING | ICE_SINGLE_ACT_DROP | + ICE_SINGLE_ACT_VALID_BIT; + break; + default: + status = ICE_ERR_CFG; + goto err_ice_add_adv_rule; } - mutex_unlock(rule_lock); - status = ice_remove_promisc(hw, recipe_id, &remove_list_head); + /* set the rule LOOKUP type based on caller specified 'RX' + * instead of hardcoding it to be either LOOKUP_TX/RX + * + * for 'RX' set the source to be the port number + * for 'TX' set the source to be the source HW VSI number (determined + * by caller) + */ + if (rinfo->rx) { + s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_RX); + s_rule->pdata.lkup_tx_rx.src = + cpu_to_le16(hw->port_info->lport); + } else { + s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_TX); + s_rule->pdata.lkup_tx_rx.src = cpu_to_le16(rinfo->sw_act.src); + } -free_fltr_list: - list_for_each_entry_safe(fm_entry, tmp, &remove_list_head, list_entry) { - list_del(&fm_entry->list_entry); - devm_kfree(ice_hw_to_dev(hw), fm_entry); + s_rule->pdata.lkup_tx_rx.recipe_id = cpu_to_le16(rid); + s_rule->pdata.lkup_tx_rx.act = cpu_to_le32(act); + + status = ice_fill_adv_dummy_packet(lkups, lkups_cnt, s_rule, pkt, + pkt_len, pkt_offsets); + if (status) + goto err_ice_add_adv_rule; + + if (rinfo->tun_type != ICE_NON_TUN && + rinfo->tun_type != ICE_SW_TUN_AND_NON_TUN) { + status = ice_fill_adv_packet_tun(hw, rinfo->tun_type, + s_rule->pdata.lkup_tx_rx.hdr, + pkt_offsets); + if (status) + goto err_ice_add_adv_rule; + } + + status = ice_aq_sw_rules(hw, (struct ice_aqc_sw_rules *)s_rule, + rule_buf_sz, 1, ice_aqc_opc_add_sw_rules, + NULL); + if (status) + goto err_ice_add_adv_rule; + adv_fltr = devm_kzalloc(ice_hw_to_dev(hw), + sizeof(struct ice_adv_fltr_mgmt_list_entry), + GFP_KERNEL); + if (!adv_fltr) { + status = ICE_ERR_NO_MEMORY; + goto err_ice_add_adv_rule; + } + + adv_fltr->lkups = devm_kmemdup(ice_hw_to_dev(hw), lkups, + lkups_cnt * sizeof(*lkups), GFP_KERNEL); + if (!adv_fltr->lkups) { + status = ICE_ERR_NO_MEMORY; + goto 
err_ice_add_adv_rule; + } + + adv_fltr->lkups_cnt = lkups_cnt; + adv_fltr->rule_info = *rinfo; + adv_fltr->rule_info.fltr_rule_id = + le16_to_cpu(s_rule->pdata.lkup_tx_rx.index); + sw = hw->switch_info; + sw->recp_list[rid].adv_rule = true; + rule_head = &sw->recp_list[rid].filt_rules; + + if (rinfo->sw_act.fltr_act == ICE_FWD_TO_VSI) + adv_fltr->vsi_count = 1; + + /* Add rule entry to book keeping list */ + list_add(&adv_fltr->list_entry, rule_head); + if (added_entry) { + added_entry->rid = rid; + added_entry->rule_id = adv_fltr->rule_info.fltr_rule_id; + added_entry->vsi_handle = rinfo->sw_act.vsi_handle; } +err_ice_add_adv_rule: + if (status && adv_fltr) { + devm_kfree(ice_hw_to_dev(hw), adv_fltr->lkups); + devm_kfree(ice_hw_to_dev(hw), adv_fltr); + } + + devm_kfree(ice_hw_to_dev(hw), s_rule); return status; } /** - * ice_set_vsi_promisc - set given VSI to given promiscuous mode(s) + * ice_adv_rem_update_vsi_list * @hw: pointer to the hardware structure - * @vsi_handle: VSI handle to configure - * @promisc_mask: mask of promiscuous config bits - * @vid: VLAN ID to set VLAN promiscuous + * @vsi_handle: VSI handle of the VSI to remove + * @fm_list: filter management entry for which the VSI list management needs to + * be done */ -enum ice_status -ice_set_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, u16 vid) +static enum ice_status +ice_adv_rem_update_vsi_list(struct ice_hw *hw, u16 vsi_handle, + struct ice_adv_fltr_mgmt_list_entry *fm_list) { - enum { UCAST_FLTR = 1, MCAST_FLTR, BCAST_FLTR }; - struct ice_fltr_list_entry f_list_entry; - struct ice_fltr_info new_fltr; - enum ice_status status = 0; - bool is_tx_fltr; - u16 hw_vsi_id; - int pkt_type; - u8 recipe_id; + struct ice_vsi_list_map_info *vsi_list_info; + enum ice_sw_lkup_type lkup_type; + enum ice_status status; + u16 vsi_list_id; - if (!ice_is_vsi_valid(hw, vsi_handle)) + if (fm_list->rule_info.sw_act.fltr_act != ICE_FWD_TO_VSI_LIST || + fm_list->vsi_count == 0) return ICE_ERR_PARAM; - hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); - - memset(&new_fltr, 0, sizeof(new_fltr)); - - if (promisc_mask & (ICE_PROMISC_VLAN_RX | ICE_PROMISC_VLAN_TX)) { - new_fltr.lkup_type = ICE_SW_LKUP_PROMISC_VLAN; - new_fltr.l_data.mac_vlan.vlan_id = vid; - recipe_id = ICE_SW_LKUP_PROMISC_VLAN; - } else { - new_fltr.lkup_type = ICE_SW_LKUP_PROMISC; - recipe_id = ICE_SW_LKUP_PROMISC; - } - - /* Separate filters must be set for each direction/packet type - * combination, so we will loop over the mask value, store the - * individual type, and clear it out in the input mask as it - * is found. 
- */ - while (promisc_mask) { - u8 *mac_addr; - pkt_type = 0; - is_tx_fltr = false; - - if (promisc_mask & ICE_PROMISC_UCAST_RX) { - promisc_mask &= ~ICE_PROMISC_UCAST_RX; - pkt_type = UCAST_FLTR; - } else if (promisc_mask & ICE_PROMISC_UCAST_TX) { - promisc_mask &= ~ICE_PROMISC_UCAST_TX; - pkt_type = UCAST_FLTR; - is_tx_fltr = true; - } else if (promisc_mask & ICE_PROMISC_MCAST_RX) { - promisc_mask &= ~ICE_PROMISC_MCAST_RX; - pkt_type = MCAST_FLTR; - } else if (promisc_mask & ICE_PROMISC_MCAST_TX) { - promisc_mask &= ~ICE_PROMISC_MCAST_TX; - pkt_type = MCAST_FLTR; - is_tx_fltr = true; - } else if (promisc_mask & ICE_PROMISC_BCAST_RX) { - promisc_mask &= ~ICE_PROMISC_BCAST_RX; - pkt_type = BCAST_FLTR; - } else if (promisc_mask & ICE_PROMISC_BCAST_TX) { - promisc_mask &= ~ICE_PROMISC_BCAST_TX; - pkt_type = BCAST_FLTR; - is_tx_fltr = true; - } + /* A rule with the VSI being removed does not exist */ + if (!test_bit(vsi_handle, fm_list->vsi_list_info->vsi_map)) + return ICE_ERR_DOES_NOT_EXIST; - /* Check for VLAN promiscuous flag */ - if (promisc_mask & ICE_PROMISC_VLAN_RX) { - promisc_mask &= ~ICE_PROMISC_VLAN_RX; - } else if (promisc_mask & ICE_PROMISC_VLAN_TX) { - promisc_mask &= ~ICE_PROMISC_VLAN_TX; - is_tx_fltr = true; - } + lkup_type = ICE_SW_LKUP_LAST; + vsi_list_id = fm_list->rule_info.sw_act.fwd_id.vsi_list_id; + status = ice_update_vsi_list_rule(hw, &vsi_handle, 1, vsi_list_id, true, + ice_aqc_opc_update_sw_rules, + lkup_type); + if (status) + return status; - /* Set filter DA based on packet type */ - mac_addr = new_fltr.l_data.mac.mac_addr; - if (pkt_type == BCAST_FLTR) { - eth_broadcast_addr(mac_addr); - } else if (pkt_type == MCAST_FLTR || - pkt_type == UCAST_FLTR) { - /* Use the dummy ether header DA */ - ether_addr_copy(mac_addr, dummy_eth_header); - if (pkt_type == MCAST_FLTR) - mac_addr[0] |= 0x1; /* Set multicast bit */ - } + fm_list->vsi_count--; + clear_bit(vsi_handle, fm_list->vsi_list_info->vsi_map); + vsi_list_info = fm_list->vsi_list_info; + if (fm_list->vsi_count == 1) { + struct ice_fltr_info tmp_fltr; + u16 rem_vsi_handle; - /* Need to reset this to zero for all iterations */ - new_fltr.flag = 0; - if (is_tx_fltr) { - new_fltr.flag |= ICE_FLTR_TX; - new_fltr.src = hw_vsi_id; - } else { - new_fltr.flag |= ICE_FLTR_RX; - new_fltr.src = hw->port_info->lport; + rem_vsi_handle = find_first_bit(vsi_list_info->vsi_map, + ICE_MAX_VSI); + if (!ice_is_vsi_valid(hw, rem_vsi_handle)) + return ICE_ERR_OUT_OF_RANGE; + + /* Make sure VSI list is empty before removing it below */ + status = ice_update_vsi_list_rule(hw, &rem_vsi_handle, 1, + vsi_list_id, true, + ice_aqc_opc_update_sw_rules, + lkup_type); + if (status) + return status; + + memset(&tmp_fltr, 0, sizeof(tmp_fltr)); + tmp_fltr.flag = fm_list->rule_info.sw_act.flag; + tmp_fltr.fltr_rule_id = fm_list->rule_info.fltr_rule_id; + fm_list->rule_info.sw_act.fltr_act = ICE_FWD_TO_VSI; + tmp_fltr.fltr_act = ICE_FWD_TO_VSI; + tmp_fltr.fwd_id.hw_vsi_id = + ice_get_hw_vsi_num(hw, rem_vsi_handle); + fm_list->rule_info.sw_act.fwd_id.hw_vsi_id = + ice_get_hw_vsi_num(hw, rem_vsi_handle); + fm_list->rule_info.sw_act.vsi_handle = rem_vsi_handle; + + /* Update the previous switch rule of "MAC forward to VSI" to + * "MAC fwd to VSI list" + */ + status = ice_update_pkt_fwd_rule(hw, &tmp_fltr); + if (status) { + ice_debug(hw, ICE_DBG_SW, "Failed to update pkt fwd rule to FWD_TO_VSI on HW VSI %d, error %d\n", + tmp_fltr.fwd_id.hw_vsi_id, status); + return status; } + fm_list->vsi_list_info->ref_cnt--; - new_fltr.fltr_act = ICE_FWD_TO_VSI; - 
new_fltr.vsi_handle = vsi_handle; - new_fltr.fwd_id.hw_vsi_id = hw_vsi_id; - f_list_entry.fltr_info = new_fltr; + /* Remove the VSI list since it is no longer used */ + status = ice_remove_vsi_list_rule(hw, vsi_list_id, lkup_type); + if (status) { + ice_debug(hw, ICE_DBG_SW, "Failed to remove VSI list %d, error %d\n", + vsi_list_id, status); + return status; + } - status = ice_add_rule_internal(hw, recipe_id, &f_list_entry); - if (status) - goto set_promisc_exit; + list_del(&vsi_list_info->list_entry); + devm_kfree(ice_hw_to_dev(hw), vsi_list_info); + fm_list->vsi_list_info = NULL; } -set_promisc_exit: return status; } /** - * ice_set_vlan_vsi_promisc + * ice_rem_adv_rule - removes existing advanced switch rule * @hw: pointer to the hardware structure - * @vsi_handle: VSI handle to configure - * @promisc_mask: mask of promiscuous config bits - * @rm_vlan_promisc: Clear VLANs VSI promisc mode + * @lkups: information on the words that needs to be looked up. All words + * together makes one recipe + * @lkups_cnt: num of entries in the lkups array + * @rinfo: Its the pointer to the rule information for the rule * - * Configure VSI with all associated VLANs to given promiscuous mode(s) + * This function can be used to remove 1 rule at a time. The lkups is + * used to describe all the words that forms the "lookup" portion of the + * rule. These words can span multiple protocols. Callers to this function + * need to pass in a list of protocol headers with lookup information along + * and mask that determines which words are valid from the given protocol + * header. rinfo describes other information related to this rule such as + * forwarding IDs, priority of this rule, etc. */ enum ice_status -ice_set_vlan_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, - bool rm_vlan_promisc) +ice_rem_adv_rule(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, + u16 lkups_cnt, struct ice_adv_rule_info *rinfo) { - struct ice_switch_info *sw = hw->switch_info; - struct ice_fltr_list_entry *list_itr, *tmp; - struct list_head vsi_list_head; - struct list_head *vlan_head; - struct mutex *vlan_lock; /* Lock to protect filter rule list */ - enum ice_status status; - u16 vlan_id; + struct ice_adv_fltr_mgmt_list_entry *list_elem; + struct ice_prot_lkup_ext lkup_exts; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + enum ice_status status = 0; + bool remove_rule = false; + u16 i, rid, vsi_handle; - INIT_LIST_HEAD(&vsi_list_head); - vlan_lock = &sw->recp_list[ICE_SW_LKUP_VLAN].filt_rule_lock; - vlan_head = &sw->recp_list[ICE_SW_LKUP_VLAN].filt_rules; - mutex_lock(vlan_lock); - status = ice_add_to_vsi_fltr_list(hw, vsi_handle, vlan_head, - &vsi_list_head); - mutex_unlock(vlan_lock); + memset(&lkup_exts, 0, sizeof(lkup_exts)); + for (i = 0; i < lkups_cnt; i++) { + u16 count; + + if (lkups[i].type >= ICE_PROTOCOL_LAST) + return ICE_ERR_CFG; + + count = ice_fill_valid_words(&lkups[i], &lkup_exts); + if (!count) + return ICE_ERR_CFG; + } + + /* Create any special protocol/offset pairs, such as looking at tunnel + * bits by extracting metadata + */ + status = ice_add_special_words(rinfo, &lkup_exts); if (status) - goto free_fltr_list; + return status; - list_for_each_entry(list_itr, &vsi_list_head, list_entry) { - vlan_id = list_itr->fltr_info.l_data.vlan.vlan_id; - if (rm_vlan_promisc) - status = ice_clear_vsi_promisc(hw, vsi_handle, - promisc_mask, vlan_id); - else - status = ice_set_vsi_promisc(hw, vsi_handle, - promisc_mask, vlan_id); - if (status) - break; + rid = ice_find_recp(hw, 
&lkup_exts); + /* If did not find a recipe that match the existing criteria */ + if (rid == ICE_MAX_NUM_RECIPES) + return ICE_ERR_PARAM; + + rule_lock = &hw->switch_info->recp_list[rid].filt_rule_lock; + list_elem = ice_find_adv_rule_entry(hw, lkups, lkups_cnt, rid, rinfo); + /* the rule is already removed */ + if (!list_elem) + return 0; + mutex_lock(rule_lock); + if (list_elem->rule_info.sw_act.fltr_act != ICE_FWD_TO_VSI_LIST) { + remove_rule = true; + } else if (list_elem->vsi_count > 1) { + remove_rule = false; + vsi_handle = rinfo->sw_act.vsi_handle; + status = ice_adv_rem_update_vsi_list(hw, vsi_handle, list_elem); + } else { + vsi_handle = rinfo->sw_act.vsi_handle; + status = ice_adv_rem_update_vsi_list(hw, vsi_handle, list_elem); + if (status) { + mutex_unlock(rule_lock); + return status; + } + if (list_elem->vsi_count == 0) + remove_rule = true; } + mutex_unlock(rule_lock); + if (remove_rule) { + struct ice_aqc_sw_rules_elem *s_rule; + u16 rule_buf_sz; -free_fltr_list: - list_for_each_entry_safe(list_itr, tmp, &vsi_list_head, list_entry) { - list_del(&list_itr->list_entry); - devm_kfree(ice_hw_to_dev(hw), list_itr); + rule_buf_sz = ICE_SW_RULE_RX_TX_NO_HDR_SIZE; + s_rule = devm_kzalloc(ice_hw_to_dev(hw), rule_buf_sz, + GFP_KERNEL); + if (!s_rule) + return ICE_ERR_NO_MEMORY; + s_rule->pdata.lkup_tx_rx.act = 0; + s_rule->pdata.lkup_tx_rx.index = + cpu_to_le16(list_elem->rule_info.fltr_rule_id); + s_rule->pdata.lkup_tx_rx.hdr_len = 0; + status = ice_aq_sw_rules(hw, (struct ice_aqc_sw_rules *)s_rule, + rule_buf_sz, 1, + ice_aqc_opc_remove_sw_rules, NULL); + if (!status || status == ICE_ERR_DOES_NOT_EXIST) { + struct ice_switch_info *sw = hw->switch_info; + + mutex_lock(rule_lock); + list_del(&list_elem->list_entry); + devm_kfree(ice_hw_to_dev(hw), list_elem->lkups); + devm_kfree(ice_hw_to_dev(hw), list_elem); + mutex_unlock(rule_lock); + if (list_empty(&sw->recp_list[rid].filt_rules)) + sw->recp_list[rid].adv_rule = false; + } + devm_kfree(ice_hw_to_dev(hw), s_rule); } return status; } /** - * ice_remove_vsi_lkup_fltr - Remove lookup type filters for a VSI + * ice_rem_adv_rule_by_id - removes existing advanced switch rule by ID * @hw: pointer to the hardware structure - * @vsi_handle: VSI handle to remove filters from - * @lkup: switch rule filter lookup type + * @remove_entry: data struct which holds rule_id, VSI handle and recipe ID + * + * This function is used to remove 1 rule at a time. The removal is based on + * the remove_entry parameter. 
This function will remove rule for a given + * vsi_handle with a given rule_id which is passed as parameter in remove_entry */ -static void -ice_remove_vsi_lkup_fltr(struct ice_hw *hw, u16 vsi_handle, - enum ice_sw_lkup_type lkup) +enum ice_status +ice_rem_adv_rule_by_id(struct ice_hw *hw, + struct ice_rule_query_data *remove_entry) { - struct ice_switch_info *sw = hw->switch_info; - struct ice_fltr_list_entry *fm_entry; - struct list_head remove_list_head; - struct list_head *rule_head; - struct ice_fltr_list_entry *tmp; - struct mutex *rule_lock; /* Lock to protect filter rule list */ - enum ice_status status; - - INIT_LIST_HEAD(&remove_list_head); - rule_lock = &sw->recp_list[lkup].filt_rule_lock; - rule_head = &sw->recp_list[lkup].filt_rules; - mutex_lock(rule_lock); - status = ice_add_to_vsi_fltr_list(hw, vsi_handle, rule_head, - &remove_list_head); - mutex_unlock(rule_lock); - if (status) - goto free_fltr_list; - - switch (lkup) { - case ICE_SW_LKUP_MAC: - ice_remove_mac(hw, &remove_list_head); - break; - case ICE_SW_LKUP_VLAN: - ice_remove_vlan(hw, &remove_list_head); - break; - case ICE_SW_LKUP_PROMISC: - case ICE_SW_LKUP_PROMISC_VLAN: - ice_remove_promisc(hw, lkup, &remove_list_head); - break; - case ICE_SW_LKUP_MAC_VLAN: - case ICE_SW_LKUP_ETHERTYPE: - case ICE_SW_LKUP_ETHERTYPE_MAC: - case ICE_SW_LKUP_DFLT: - case ICE_SW_LKUP_LAST: - default: - ice_debug(hw, ICE_DBG_SW, "Unsupported lookup type %d\n", lkup); - break; - } + struct ice_adv_fltr_mgmt_list_entry *list_itr; + struct list_head *list_head; + struct ice_adv_rule_info rinfo; + struct ice_switch_info *sw; -free_fltr_list: - list_for_each_entry_safe(fm_entry, tmp, &remove_list_head, list_entry) { - list_del(&fm_entry->list_entry); - devm_kfree(ice_hw_to_dev(hw), fm_entry); + sw = hw->switch_info; + if (!sw->recp_list[remove_entry->rid].recp_created) + return ICE_ERR_PARAM; + list_head = &sw->recp_list[remove_entry->rid].filt_rules; + list_for_each_entry(list_itr, list_head, list_entry) { + if (list_itr->rule_info.fltr_rule_id == + remove_entry->rule_id) { + rinfo = list_itr->rule_info; + rinfo.sw_act.vsi_handle = remove_entry->vsi_handle; + return ice_rem_adv_rule(hw, list_itr->lkups, + list_itr->lkups_cnt, &rinfo); + } } + /* either list is empty or unable to find rule */ + return ICE_ERR_DOES_NOT_EXIST; } /** - * ice_remove_vsi_fltr - Remove all filters for a VSI + * ice_rem_adv_rule_for_vsi - removes existing advanced switch rules for a + * given VSI handle * @hw: pointer to the hardware structure - * @vsi_handle: VSI handle to remove filters from + * @vsi_handle: VSI handle for which we are supposed to remove all the rules. 
+ * + * This function is used to remove all the rules for a given VSI and as soon + * as removing a rule fails, it will return immediately with the error code, + * else it will return ICE_SUCCESS */ -void ice_remove_vsi_fltr(struct ice_hw *hw, u16 vsi_handle) +enum ice_status ice_rem_adv_rule_for_vsi(struct ice_hw *hw, u16 vsi_handle) { - ice_remove_vsi_lkup_fltr(hw, vsi_handle, ICE_SW_LKUP_MAC); - ice_remove_vsi_lkup_fltr(hw, vsi_handle, ICE_SW_LKUP_MAC_VLAN); - ice_remove_vsi_lkup_fltr(hw, vsi_handle, ICE_SW_LKUP_PROMISC); - ice_remove_vsi_lkup_fltr(hw, vsi_handle, ICE_SW_LKUP_VLAN); - ice_remove_vsi_lkup_fltr(hw, vsi_handle, ICE_SW_LKUP_DFLT); - ice_remove_vsi_lkup_fltr(hw, vsi_handle, ICE_SW_LKUP_ETHERTYPE); - ice_remove_vsi_lkup_fltr(hw, vsi_handle, ICE_SW_LKUP_ETHERTYPE_MAC); - ice_remove_vsi_lkup_fltr(hw, vsi_handle, ICE_SW_LKUP_PROMISC_VLAN); + struct ice_adv_fltr_mgmt_list_entry *list_itr, *tmp_entry; + struct ice_vsi_list_map_info *map_info; + struct list_head *list_head; + struct ice_adv_rule_info rinfo; + struct ice_switch_info *sw; + enum ice_status status; + u8 rid; + + sw = hw->switch_info; + for (rid = 0; rid < ICE_MAX_NUM_RECIPES; rid++) { + if (!sw->recp_list[rid].recp_created) + continue; + if (!sw->recp_list[rid].adv_rule) + continue; + + list_head = &sw->recp_list[rid].filt_rules; + list_for_each_entry_safe(list_itr, tmp_entry, list_head, + list_entry) { + rinfo = list_itr->rule_info; + + if (rinfo.sw_act.fltr_act == ICE_FWD_TO_VSI_LIST) { + map_info = list_itr->vsi_list_info; + if (!map_info) + continue; + + if (!test_bit(vsi_handle, map_info->vsi_map)) + continue; + } else if (rinfo.sw_act.vsi_handle != vsi_handle) { + continue; + } + + rinfo.sw_act.vsi_handle = vsi_handle; + status = ice_rem_adv_rule(hw, list_itr->lkups, + list_itr->lkups_cnt, &rinfo); + + if (status) + return status; + } + } + return 0; } + /** * ice_replay_vsi_fltr - Replay filters for requested VSI * @hw: pointer to the hardware structure + * @pi: pointer to port information structure + * @sw: pointer to switch info struct for which function replays filters * @vsi_handle: driver VSI handle * @recp_id: Recipe ID for which rules need to be replayed * @list_head: list for which filters need to be replayed @@ -2723,15 +8095,18 @@ void ice_remove_vsi_fltr(struct ice_hw *hw, u16 vsi_handle) * It is required to pass valid VSI handle. 
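For reference, the per-VSI membership test used by ice_rem_adv_rule_for_vsi() above can be sketched standalone as follows; a plain uint64_t bitmap and simplified types stand in for the driver's vsi_list_info and rule structures:

/* Sketch of the "does this rule belong to the VSI?" test applied when
 * removing all advanced rules for a VSI; simplified standalone types.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum fwd_act { FWD_TO_VSI, FWD_TO_VSI_LIST };

struct adv_rule {
	enum fwd_act act;
	uint16_t vsi_handle;	/* used when act == FWD_TO_VSI */
	uint64_t vsi_map;	/* used when act == FWD_TO_VSI_LIST */
};

static bool rule_uses_vsi(const struct adv_rule *r, uint16_t vsi_handle)
{
	if (r->act == FWD_TO_VSI_LIST)
		return (r->vsi_map >> vsi_handle) & 1ULL;
	return r->vsi_handle == vsi_handle;
}

int main(void)
{
	struct adv_rule single = { .act = FWD_TO_VSI, .vsi_handle = 3 };
	struct adv_rule list = { .act = FWD_TO_VSI_LIST,
				 .vsi_map = (1ULL << 2) | (1ULL << 5) };

	printf("%d %d %d\n", rule_uses_vsi(&single, 3),
	       rule_uses_vsi(&list, 5), rule_uses_vsi(&list, 3));
	return 0;
}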
*/ static enum ice_status -ice_replay_vsi_fltr(struct ice_hw *hw, u16 vsi_handle, u8 recp_id, +ice_replay_vsi_fltr(struct ice_hw *hw, struct ice_port_info *pi, + struct ice_switch_info *sw, u16 vsi_handle, u8 recp_id, struct list_head *list_head) { struct ice_fltr_mgmt_list_entry *itr; enum ice_status status = 0; + struct ice_sw_recipe *recp_list; u16 hw_vsi_id; if (list_empty(list_head)) return status; + recp_list = &sw->recp_list[recp_id]; hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); list_for_each_entry(itr, list_head, list_entry) { @@ -2743,7 +8118,9 @@ ice_replay_vsi_fltr(struct ice_hw *hw, u16 vsi_handle, u8 recp_id, /* update the src in case it is VSI num */ if (f_entry.fltr_info.src_id == ICE_SRC_ID_VSI) f_entry.fltr_info.src = hw_vsi_id; - status = ice_add_rule_internal(hw, recp_id, &f_entry); + status = ice_add_rule_internal(hw, recp_list, + pi->lport, + &f_entry); if (status) goto end; continue; @@ -2759,9 +8136,11 @@ ice_replay_vsi_fltr(struct ice_hw *hw, u16 vsi_handle, u8 recp_id, if (f_entry.fltr_info.src_id == ICE_SRC_ID_VSI) f_entry.fltr_info.src = hw_vsi_id; if (recp_id == ICE_SW_LKUP_VLAN) - status = ice_add_vlan_internal(hw, &f_entry); + status = ice_add_vlan_internal(hw, recp_list, &f_entry); else - status = ice_add_rule_internal(hw, recp_id, &f_entry); + status = ice_add_rule_internal(hw, recp_list, + pi->lport, + &f_entry); if (status) goto end; } @@ -2769,50 +8148,106 @@ ice_replay_vsi_fltr(struct ice_hw *hw, u16 vsi_handle, u8 recp_id, return status; } +/** + * ice_replay_vsi_adv_rule - Replay advanced rule for requested VSI + * @hw: pointer to the hardware structure + * @vsi_handle: driver VSI handle + * @list_head: list for which filters need to be replayed + * + * Replay the advanced rule for the given VSI. + */ +static enum ice_status +ice_replay_vsi_adv_rule(struct ice_hw *hw, u16 vsi_handle, + struct list_head *list_head) +{ + struct ice_rule_query_data added_entry = { 0 }; + struct ice_adv_fltr_mgmt_list_entry *adv_fltr; + enum ice_status status = 0; + + if (list_empty(list_head)) + return status; + list_for_each_entry(adv_fltr, list_head, list_entry) { + struct ice_adv_rule_info *rinfo = &adv_fltr->rule_info; + u16 lk_cnt = adv_fltr->lkups_cnt; + + if (vsi_handle != rinfo->sw_act.vsi_handle) + continue; + status = ice_add_adv_rule(hw, adv_fltr->lkups, lk_cnt, rinfo, + &added_entry); + if (status) + break; + } + return status; +} + /** * ice_replay_vsi_all_fltr - replay all filters stored in bookkeeping lists * @hw: pointer to the hardware structure + * @pi: pointer to port information structure * @vsi_handle: driver VSI handle * * Replays filters for requested VSI via vsi_handle. 
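The replay path above dispatches per recipe: advanced recipes go through ice_replay_vsi_adv_rule(), everything else through the legacy ice_replay_vsi_fltr(), and the walk stops at the first failure. A minimal standalone sketch of that dispatch, with simplified stand-in types and callbacks:

#include <stdio.h>

#define MAX_RECIPES 8

struct recipe {
	int adv_rule;		/* non-zero: replay list holds advanced rules */
	int num_replay_rules;	/* stand-in for the replay list */
};

/* Stand-ins for the legacy and advanced replay helpers. */
static int replay_legacy(int rid, const struct recipe *r)
{
	printf("recipe %d: replaying %d legacy rules\n", rid, r->num_replay_rules);
	return 0;
}

static int replay_adv(int rid, const struct recipe *r)
{
	printf("recipe %d: replaying %d advanced rules\n", rid, r->num_replay_rules);
	return 0;
}

/* Walk every recipe and stop at the first failure, mirroring the
 * early return on error in ice_replay_vsi_all_fltr().
 */
static int replay_all(const struct recipe *recps, int n)
{
	int i, err;

	for (i = 0; i < n; i++) {
		if (!recps[i].num_replay_rules)
			continue;
		err = recps[i].adv_rule ? replay_adv(i, &recps[i])
					: replay_legacy(i, &recps[i]);
		if (err)
			return err;
	}
	return 0;
}

int main(void)
{
	struct recipe recps[MAX_RECIPES] = {
		[1] = { .adv_rule = 0, .num_replay_rules = 2 },
		[4] = { .adv_rule = 1, .num_replay_rules = 1 },
	};

	return replay_all(recps, MAX_RECIPES);
}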
*/ -enum ice_status ice_replay_vsi_all_fltr(struct ice_hw *hw, u16 vsi_handle) +enum ice_status +ice_replay_vsi_all_fltr(struct ice_hw *hw, struct ice_port_info *pi, + u16 vsi_handle) { struct ice_switch_info *sw = hw->switch_info; - enum ice_status status = 0; + enum ice_status status; u8 i; - for (i = 0; i < ICE_SW_LKUP_LAST; i++) { + /* Update the recipes that were created */ + for (i = 0; i < ICE_MAX_NUM_RECIPES; i++) { struct list_head *head; head = &sw->recp_list[i].filt_replay_rules; - status = ice_replay_vsi_fltr(hw, vsi_handle, i, head); + if (!sw->recp_list[i].adv_rule) + status = ice_replay_vsi_fltr(hw, pi, sw, vsi_handle, i, + head); + else + status = ice_replay_vsi_adv_rule(hw, vsi_handle, head); if (status) return status; } - return status; + + return 0; } /** - * ice_rm_all_sw_replay_rule_info - deletes filter replay rules + * ice_rm_sw_replay_rule_info - helper function to delete filter replay rules * @hw: pointer to the HW struct + * @sw: pointer to switch info struct for which function removes filters * - * Deletes the filter replay rules. + * Deletes the filter replay rules for given switch */ -void ice_rm_all_sw_replay_rule_info(struct ice_hw *hw) +void ice_rm_sw_replay_rule_info(struct ice_hw *hw, struct ice_switch_info *sw) { - struct ice_switch_info *sw = hw->switch_info; u8 i; if (!sw) return; - for (i = 0; i < ICE_SW_LKUP_LAST; i++) { + for (i = 0; i < ICE_MAX_NUM_RECIPES; i++) { if (!list_empty(&sw->recp_list[i].filt_replay_rules)) { struct list_head *l_head; l_head = &sw->recp_list[i].filt_replay_rules; - ice_rem_sw_rule_info(hw, l_head); + if (!sw->recp_list[i].adv_rule) + ice_rem_sw_rule_info(hw, l_head); + else + ice_rem_adv_rule_info(hw, l_head); } } } + +/** + * ice_rm_all_sw_replay_rule_info - deletes filter replay rules + * @hw: pointer to the HW struct + * + * Deletes the filter replay rules. + */ +void ice_rm_all_sw_replay_rule_info(struct ice_hw *hw) +{ + ice_rm_sw_replay_rule_info(hw, hw->switch_info); +} + diff --git a/drivers/net/ethernet/intel/ice/ice_switch.h b/drivers/net/ethernet/intel/ice/ice_switch.h index cb123fbe30be..adc07b10c1dc 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.h +++ b/drivers/net/ethernet/intel/ice/ice_switch.h @@ -1,23 +1,42 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ #ifndef _ICE_SWITCH_H_ #define _ICE_SWITCH_H_ #include "ice_common.h" +#include "ice_protocol_type.h" #define ICE_SW_CFG_MAX_BUF_LEN 2048 +#define ICE_MAX_SW 256 #define ICE_DFLT_VSI_INVAL 0xff #define ICE_FLTR_RX BIT(0) #define ICE_FLTR_TX BIT(1) #define ICE_FLTR_TX_RX (ICE_FLTR_RX | ICE_FLTR_TX) -#define ICE_VSI_INVAL_ID 0xffff -#define ICE_INVAL_Q_HANDLE 0xFFFF -/* VSI queue context structure */ -struct ice_q_ctx { - u16 q_handle; -}; + +#define DUMMY_ETH_HDR_LEN 16 +#define ICE_SW_RULE_RX_TX_ETH_HDR_SIZE \ + (offsetof(struct ice_aqc_sw_rules_elem, pdata.lkup_tx_rx.hdr) + \ + (DUMMY_ETH_HDR_LEN * \ + sizeof(((struct ice_sw_rule_lkup_rx_tx *)0)->hdr[0]))) +#define ICE_SW_RULE_RX_TX_NO_HDR_SIZE \ + (offsetof(struct ice_aqc_sw_rules_elem, pdata.lkup_tx_rx.hdr)) +#define ICE_SW_RULE_LG_ACT_SIZE(n) \ + (offsetof(struct ice_aqc_sw_rules_elem, pdata.lg_act.act) + \ + ((n) * sizeof(((struct ice_sw_rule_lg_act *)0)->act[0]))) +#define ICE_SW_RULE_VSI_LIST_SIZE(n) \ + (offsetof(struct ice_aqc_sw_rules_elem, pdata.vsi_list.vsi) + \ + ((n) * sizeof(((struct ice_sw_rule_vsi_list *)0)->vsi[0]))) + + +/* Worst case buffer length for ice_aqc_opc_get_res_alloc */ +#define ICE_MAX_RES_TYPES 0x80 +#define ICE_AQ_GET_RES_ALLOC_BUF_LEN \ + (ICE_MAX_RES_TYPES * sizeof(struct ice_aqc_get_res_resp_elem)) + +#define ICE_VSI_INVAL_ID 0xFFFF +#define ICE_INVAL_Q_HANDLE 0xFFFF /* VSI context structure for add/get/update/free operations */ struct ice_vsi_ctx { @@ -31,15 +50,22 @@ struct ice_vsi_ctx { u8 vf_num; u16 num_lan_q_entries[ICE_MAX_TRAFFIC_CLASS]; struct ice_q_ctx *lan_q_ctx[ICE_MAX_TRAFFIC_CLASS]; + u16 num_rdma_q_entries[ICE_MAX_TRAFFIC_CLASS]; + struct ice_q_ctx *rdma_q_ctx[ICE_MAX_TRAFFIC_CLASS]; }; -enum ice_sw_fwd_act_type { - ICE_FWD_TO_VSI = 0, - ICE_FWD_TO_VSI_LIST, /* Do not use this when adding filter */ - ICE_FWD_TO_Q, - ICE_FWD_TO_QGRP, - ICE_DROP_PACKET, - ICE_INVAL_ACT +/* This is to be used by add/update mirror rule Admin Queue command */ +struct ice_mir_rule_buf { + u16 vsi_idx; /* VSI index */ + + /* For each VSI, user can specify whether corresponding VSI + * should be added/removed to/from mirror rule + * + * add mirror rule: this should always be TRUE. + * update mirror rule: add(true) or remove(false) VSI to/from + * mirror rule + */ + u8 add; }; /* Switch recipe ID enum values are specific to hardware */ @@ -86,6 +112,8 @@ struct ice_fltr_info { } mac_vlan; struct { u16 vlan_id; + u16 tpid; + u8 tpid_valid; } vlan; /* Set lkup_type as ICE_SW_LKUP_ETHERTYPE * if just using ethertype as filter. Set lkup_type as @@ -125,30 +153,123 @@ struct ice_fltr_info { u8 lan_en; /* Indicate if packet can be forwarded to the uplink */ }; +struct ice_update_recipe_lkup_idx_params { + u16 rid; + u16 fv_idx; + bool ignore_valid; + u16 mask; + bool mask_valid; + u8 lkup_idx; +}; + +struct ice_adv_lkup_elem { + enum ice_protocol_type type; + union ice_prot_hdr h_u; /* Header values */ + union ice_prot_hdr m_u; /* Mask of header values to match */ +}; + + +struct ice_sw_act_ctrl { + /* Source VSI for LOOKUP_TX or source port for LOOKUP_RX */ + u16 src; + u16 flag; + enum ice_sw_fwd_act_type fltr_act; + /* Depending on filter action */ + union { + /* This is a queue ID in case of ICE_FWD_TO_Q and starting + * queue ID in case of ICE_FWD_TO_QGRP. 
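The ICE_SW_RULE_*_SIZE macros above size a variable-length rule buffer as the offset of its trailing member plus the payload length. A small standalone sketch of the same pattern, using a simplified stand-in for the AQ rule layout rather than the real ice_aqc_sw_rules_elem:

/* Sketch of offsetof()-based sizing for a fixed header followed by a
 * variable payload; the struct is an illustrative stand-in only.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct sw_rule {
	uint16_t type;
	uint16_t recipe_id;
	uint32_t act;
	uint16_t hdr_len;
	uint8_t hdr[];		/* dummy packet appended here */
};

#define SW_RULE_SIZE(pkt_len)	(offsetof(struct sw_rule, hdr) + (pkt_len))

int main(void)
{
	const uint8_t dummy_pkt[42] = { 0 };	/* placeholder packet bytes */
	struct sw_rule *rule;

	rule = calloc(1, SW_RULE_SIZE(sizeof(dummy_pkt)));
	if (!rule)
		return 1;

	rule->hdr_len = sizeof(dummy_pkt);
	memcpy(rule->hdr, dummy_pkt, sizeof(dummy_pkt));
	printf("buffer size: %zu bytes\n", SW_RULE_SIZE(sizeof(dummy_pkt)));

	free(rule);
	return 0;
}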
+ */ + u16 q_id:11; + u16 vsi_id:10; + u16 hw_vsi_id:10; + u16 vsi_list_id:10; + } fwd_id; + /* software VSI handle */ + u16 vsi_handle; + u8 qgrp_size; +}; + +struct ice_rule_query_data { + /* Recipe ID for which the requested rule was added */ + u16 rid; + /* Rule ID that was added or is supposed to be removed */ + u16 rule_id; + /* vsi_handle for which Rule was added or is supposed to be removed */ + u16 vsi_handle; +}; + +struct ice_adv_rule_info { + enum ice_sw_tunnel_type tun_type; + struct ice_sw_act_ctrl sw_act; + u32 priority; + u8 rx; /* true means LOOKUP_RX otherwise LOOKUP_TX */ + u16 fltr_rule_id; +}; + +/* A collection of one or more four word recipe */ struct ice_sw_recipe { - struct list_head l_entry; + /* For a chained recipe the root recipe is what should be used for + * programming rules + */ + u8 is_root; + u8 root_rid; + u8 recp_created; + + /* Number of extraction words */ + u8 n_ext_words; + /* Protocol ID and Offset pair (extraction word) to describe the + * recipe + */ + struct ice_fv_word ext_words[ICE_MAX_CHAIN_WORDS]; + u16 word_masks[ICE_MAX_CHAIN_WORDS]; + + /* if this recipe is a collection of other recipe */ + u8 big_recp; + + /* if this recipe is part of another bigger recipe then chain index + * corresponding to this recipe + */ + u8 chain_idx; - /* To protect modification of filt_rule list - * defined below + /* if this recipe is a collection of other recipe then count of other + * recipes and recipe IDs of those recipes */ - struct mutex filt_rule_lock; + u8 n_grp_count; + + /* Bit map specifying the IDs associated with this group of recipe */ + DECLARE_BITMAP(r_bitmap, ICE_MAX_NUM_RECIPES); + + enum ice_sw_tunnel_type tun_type; - /* List of type ice_fltr_mgmt_list_entry */ + /* List of type ice_fltr_mgmt_list_entry or adv_rule */ + u8 adv_rule; struct list_head filt_rules; struct list_head filt_replay_rules; - /* linked list of type recipe_list_entry */ - struct list_head rg_list; - /* linked list of type ice_sw_fv_list_entry*/ + struct mutex filt_rule_lock; /* protect filter rule structure */ + + /* Profiles this recipe should be associated with */ struct list_head fv_list; - struct ice_aqc_recipe_data_elem *r_buf; - u8 recp_count; - u8 root_rid; - u8 num_profs; - u8 *prof_ids; - /* recipe bitmap: what all recipes makes this recipe */ - DECLARE_BITMAP(r_bitmap, ICE_MAX_NUM_RECIPES); + /* Profiles this recipe is associated with */ + u8 num_profs, *prof_ids; + + /* Bit map for possible result indexes */ + DECLARE_BITMAP(res_idxs, ICE_MAX_FV_WORDS); + + /* This allows user to specify the recipe priority. + * For now, this becomes 'fwd_priority' when recipe + * is created, usually recipes can have 'fwd' and 'join' + * priority. + */ + u8 priority; + + struct list_head rg_list; + + /* AQ buffer associated with this recipe */ + struct ice_aqc_recipe_data_elem *root_buf; + /* This struct saves the fv_words for a given lookup */ + struct ice_prot_lkup_ext lkup_exts; }; /* Bookkeeping structure to hold bitmap of VSIs corresponding to VSI list ID */ @@ -166,6 +287,7 @@ struct ice_fltr_list_entry { struct ice_fltr_info fltr_info; }; + /* This defines an entry in the list that maintains MAC or VLAN membership * to HW list mapping, since multiple VSIs can subscribe to the same MAC or * VLAN. 
As an optimization the VSI list should be created only when a @@ -186,6 +308,16 @@ struct ice_fltr_mgmt_list_entry { u8 counter_index; }; +struct ice_adv_fltr_mgmt_list_entry { + struct list_head list_entry; + + struct ice_adv_lkup_elem *lkups; + struct ice_adv_rule_info rule_info; + u16 lkups_cnt; + struct ice_vsi_list_map_info *vsi_list_info; + u16 vsi_count; +}; + enum ice_promisc_flags { ICE_PROMISC_UCAST_RX = 0x1, ICE_PROMISC_UCAST_TX = 0x2, @@ -207,28 +339,91 @@ ice_free_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx *vsi_ctx, enum ice_status ice_update_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx *vsi_ctx, struct ice_sq_cd *cd); -bool ice_is_vsi_valid(struct ice_hw *hw, u16 vsi_handle); struct ice_vsi_ctx *ice_get_vsi_ctx(struct ice_hw *hw, u16 vsi_handle); void ice_clear_all_vsi_ctx(struct ice_hw *hw); +enum ice_status +ice_aq_get_vsi_params(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_add_update_mir_rule(struct ice_hw *hw, u16 rule_type, u16 dest_vsi, + u16 count, struct ice_mir_rule_buf *mr_buf, + struct ice_sq_cd *cd, u16 *rule_id); +enum ice_status +ice_aq_delete_mir_rule(struct ice_hw *hw, u16 rule_id, bool keep_allocd, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_get_storm_ctrl(struct ice_hw *hw, u32 *bcast_thresh, u32 *mcast_thresh, + u32 *ctl_bitmask); +enum ice_status +ice_aq_set_storm_ctrl(struct ice_hw *hw, u32 bcast_thresh, u32 mcast_thresh, + u32 ctl_bitmask); /* Switch config */ +enum ice_status +ice_aq_get_sw_cfg(struct ice_hw *hw, struct ice_aqc_get_sw_cfg_resp_elem *buf, + u16 buf_size, u16 *req_desc, u16 *num_elems, + struct ice_sq_cd *cd); enum ice_status ice_get_initial_sw_cfg(struct ice_hw *hw); -/* Switch/bridge related commands */ +enum ice_status +ice_alloc_vlan_res_counter(struct ice_hw *hw, u16 *counter_id); +enum ice_status +ice_free_vlan_res_counter(struct ice_hw *hw, u16 counter_id); +enum ice_status +ice_alloc_res_cntr(struct ice_hw *hw, u8 type, u8 alloc_shared, u16 num_items, + u16 *counter_id); +enum ice_status +ice_free_res_cntr(struct ice_hw *hw, u8 type, u8 alloc_shared, u16 num_items, + u16 counter_id); + enum ice_status ice_update_sw_rule_bridge_mode(struct ice_hw *hw); +enum ice_status ice_alloc_rss_global_lut(struct ice_hw *hw, bool shared_res, u16 *global_lut_id); +enum ice_status ice_free_rss_global_lut(struct ice_hw *hw, u16 global_lut_id); +enum ice_status +ice_alloc_sw(struct ice_hw *hw, bool ena_stats, bool shared_res, u16 *sw_id, + u16 *counter_id); +enum ice_status +ice_free_sw(struct ice_hw *hw, u16 sw_id, u16 counter_id); +enum ice_status +ice_aq_get_res_alloc(struct ice_hw *hw, u16 *num_entries, + struct ice_aqc_get_res_resp_elem *buf, u16 buf_size, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_get_res_descs(struct ice_hw *hw, u16 num_entries, + struct ice_aqc_res_elem *buf, u16 buf_size, u16 res_type, + bool res_shared, u16 *desc_id, struct ice_sq_cd *cd); +enum ice_status +ice_add_vlan(struct ice_hw *hw, struct list_head *m_list); +enum ice_status +ice_remove_vlan(struct ice_hw *hw, struct list_head *v_list); +void ice_rem_all_sw_rules_info(struct ice_hw *hw); enum ice_status ice_add_mac(struct ice_hw *hw, struct list_head *m_lst); enum ice_status ice_remove_mac(struct ice_hw *hw, struct list_head *m_lst); +bool ice_mac_fltr_exist(struct ice_hw *hw, u8 *mac, u16 vsi_handle); +bool ice_vlan_fltr_exist(struct ice_hw *hw, u16 vlan_id, u16 vsi_handle); enum ice_status ice_add_eth_mac(struct ice_hw *hw, struct list_head *em_list); enum ice_status 
ice_remove_eth_mac(struct ice_hw *hw, struct list_head *em_list); -void ice_remove_vsi_fltr(struct ice_hw *hw, u16 vsi_handle); +void ice_dump_sw_rules(struct ice_hw *hw, enum ice_sw_lkup_type lookup); enum ice_status -ice_add_vlan(struct ice_hw *hw, struct list_head *m_list); -enum ice_status ice_remove_vlan(struct ice_hw *hw, struct list_head *v_list); +ice_cfg_iwarp_fltr(struct ice_hw *hw, u16 vsi_handle, bool enable); +enum ice_status +ice_add_mac_vlan(struct ice_hw *hw, struct list_head *m_list); +enum ice_status +ice_remove_mac_vlan(struct ice_hw *hw, struct list_head *v_list); + +enum ice_status +ice_add_mac_with_sw_marker(struct ice_hw *hw, struct ice_fltr_info *f_info, + u16 sw_marker); +enum ice_status +ice_add_mac_with_counter(struct ice_hw *hw, struct ice_fltr_info *f_info); +void ice_remove_vsi_fltr(struct ice_hw *hw, u16 vsi_handle); + /* Promisc/defport setup for VSIs */ enum ice_status -ice_cfg_dflt_vsi(struct ice_hw *hw, u16 vsi_handle, bool set, u8 direction); +ice_cfg_dflt_vsi(struct ice_port_info *pi, u16 vsi_handle, bool set, + u8 direction); enum ice_status ice_set_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, u16 vid); @@ -239,11 +434,62 @@ enum ice_status ice_set_vlan_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, bool rm_vlan_promisc); -enum ice_status ice_init_def_sw_recp(struct ice_hw *hw); +/* Get VSIs Promisc/defport settings */ +enum ice_status +ice_get_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 *promisc_mask, + u16 *vid); +enum ice_status +ice_get_vsi_vlan_promisc(struct ice_hw *hw, u16 vsi_handle, u8 *promisc_mask, + u16 *vid); + +enum ice_status +ice_aq_add_recipe(struct ice_hw *hw, + struct ice_aqc_recipe_data_elem *s_recipe_list, + u16 num_recipes, struct ice_sq_cd *cd); + +enum ice_status +ice_aq_get_recipe(struct ice_hw *hw, + struct ice_aqc_recipe_data_elem *s_recipe_list, + u16 *num_recipes, u16 recipe_root, struct ice_sq_cd *cd); +enum ice_status +ice_aq_map_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, + struct ice_sq_cd *cd); + +enum ice_status +ice_aq_get_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, + struct ice_sq_cd *cd); + +enum ice_status ice_alloc_recipe(struct ice_hw *hw, u16 *recipe_id); +enum ice_status +ice_add_adv_rule(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, + u16 lkups_cnt, struct ice_adv_rule_info *rinfo, + struct ice_rule_query_data *added_entry); +enum ice_status +ice_rem_adv_rule_for_vsi(struct ice_hw *hw, u16 vsi_handle); +enum ice_status +ice_rem_adv_rule_by_id(struct ice_hw *hw, + struct ice_rule_query_data *remove_entry); +enum ice_status +ice_rem_adv_rule(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, + u16 lkups_cnt, struct ice_adv_rule_info *rinfo); + +enum ice_status ice_dump_sw_cfg(struct ice_hw *hw); + +enum ice_status +ice_init_def_sw_recp(struct ice_hw *hw, struct ice_sw_recipe **recp_list); u16 ice_get_hw_vsi_num(struct ice_hw *hw, u16 vsi_handle); bool ice_is_vsi_valid(struct ice_hw *hw, u16 vsi_handle); -enum ice_status ice_replay_vsi_all_fltr(struct ice_hw *hw, u16 vsi_handle); +enum ice_status +ice_replay_vsi_all_fltr(struct ice_hw *hw, struct ice_port_info *pi, + u16 vsi_handle); +void ice_rm_sw_replay_rule_info(struct ice_hw *hw, struct ice_switch_info *sw); void ice_rm_all_sw_replay_rule_info(struct ice_hw *hw); - +enum ice_status +ice_aq_sw_rules(struct ice_hw *hw, void *rule_list, u16 rule_list_sz, + u8 num_rules, enum ice_adminq_opc opc, struct ice_sq_cd *cd); +enum ice_status 
+ice_update_recipe_lkup_idx(struct ice_hw *hw, + struct ice_update_recipe_lkup_idx_params *params); +void ice_change_proto_id_to_dvm(void); #endif /* _ICE_SWITCH_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c new file mode 100644 index 000000000000..62e6546ceeda --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -0,0 +1,2279 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice.h" +#include "ice_tc_lib.h" +#include "ice_lib.h" +#include "ice_fltr.h" + +#ifdef HAVE_TC_SETUP_CLSFLOWER +/** + * ice_detect_filter_conflict - detect filter conflict across TC + * @pf: Pointer to PF structure + * @tc_fltr: Pointer to TC flower filter structure + * + * This function detects filter mismatch type but using same port_number + * across TC and allow/deny desired filter combination. Example is, + * filter 1, dest_ip + dest_port (80) -> action is forward to TC 1 + * filter 2: dest_ip + src_port (80) -> action is forward to TC 2 + * + * We do not want to support such config, to avoid situation where + * packets are getting duplicated across both the TCs if incoming Rx + * packet has same dest_ip + src_port (80) + dst_port (80). + * Due to both filter being same high prio filter in HW, both rule + * can match (whereas that is not expectation) and cause unexpected + * packet mirroring. + */ +static int +ice_detect_filter_conflict(struct ice_pf *pf, + struct ice_tc_flower_fltr *tc_fltr) +{ + struct ice_tc_flower_lyr_2_4_hdrs *headers; + struct device *dev = ice_pf_to_dev(pf); + struct ice_tc_flower_fltr *fltr; + struct ice_tc_l4_hdr *l4_key; + u16 sport = 0, dport = 0; + + /* header = outer header for non-tunnel filter, + * otherwise inner_headers + */ + headers = &tc_fltr->outer_headers; + if (tc_fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID) + headers = &tc_fltr->inner_headers; + + l4_key = &headers->l4_key; + if (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT) + sport = be16_to_cpu(l4_key->src_port); + if (tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) + dport = be16_to_cpu(l4_key->dst_port); + + hlist_for_each_entry(fltr, &pf->tc_flower_fltr_list, tc_flower_node) { + struct ice_tc_flower_lyr_2_4_hdrs *fltr_headers; + struct ice_tc_l4_hdr *fltr_l4_key; + u16 dst_port = 0, src_port = 0; + + /* if tc_class is same, skip, no check needed */ + if (fltr->action.tc_class == tc_fltr->action.tc_class) + continue; + + /* if only either of them are set, skip it */ + if ((fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID) ^ + (tc_fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID)) + continue; + + /* if this is tunnel filter, make sure tunnel ID is not same */ + if ((fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID) && + (tc_fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID)) { + if (fltr->tenant_id && tc_fltr->tenant_id && + fltr->tenant_id == tc_fltr->tenant_id) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unsupported filter combination across TC, filter exist with same tunnel key for other TC(see dmesg log)"); + dev_err(dev, "Unsupported filter combination across TC, TC %d has filter using same tunnel key (%u)\n", + fltr->action.tc_class, + be32_to_cpu(fltr->tenant_id)); + return -EOPNOTSUPP; + } + } + + fltr_headers = &fltr->outer_headers; + if (fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID) + fltr_headers = &fltr->inner_headers; + + /* access L4 params */ + fltr_l4_key = &fltr_headers->l4_key; + if (fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) + dst_port = be16_to_cpu(fltr_l4_key->dst_port); + if (fltr->flags & 
ICE_TC_FLWR_FIELD_SRC_L4_PORT) + src_port = be16_to_cpu(fltr_l4_key->src_port); + + /* proceed only if tc_class is different and filter types + * are different but actual value(s) of say port number are + * same, flag warning to user. + * e.g if filter one is like dest port = 80 -> tc_class(1) + * and second filter is like, src_port = 80 -> tc_class(2) + * Invariably packet can match both the filter and user + * will get expected packet mirroring to both the destination + * (means tc_class(1) and tc_class(2)). To avoid such + * behavior, block user from adding such conficting filter + */ + if (tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) { + if (dport && dst_port && dport == dst_port) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unsupported filter combination across TC, filter exist with same destination port for other TC, as destination port based filter(see dmesg log)"); + dev_err(dev, "Unsupported filter combination across TC, TC %d has filter using same port number (%u) as destination port based filter. This is to avoid unexpected packet mirroring.\n", + fltr->action.tc_class, dst_port); + return -EOPNOTSUPP; + } + if (dport && src_port && dport == src_port) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unsupported filter combination across TC, filter exist with same destination port for other TC, as source port based filter(see dmesg log)"); + dev_err(dev, "Unsupported filter combination across TC, TC %d has filter using same port number (%u) as source port based filter. This is to avoid unexpected packet mirroring.\n", + fltr->action.tc_class, src_port); + return -EOPNOTSUPP; + } + } + + if (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT) { + if (sport && dst_port && sport == dst_port) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unsupported filter combination across TC, filter exist with same source port for other TC, as destination port based filter (see dmesg log)"); + dev_err(dev, "Unsupported filter combination across TC, TC %d has filter using same port number (%u) as destination port based filter. This is to avoid unexpected packet mirroring.\n", + fltr->action.tc_class, dst_port); + return -EOPNOTSUPP; + } + if (sport && src_port && sport == src_port) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unsupported filter combination across TC, filter exist with same source port for other TC, as source port based filter (see dmesg log)"); + dev_err(dev, "Unsupported filter combination across TC, TC %d has filter using same port number (%u) as source port based filter. This is to avoid unexpected packet mirroring.\n", + fltr->action.tc_class, src_port); + return -EOPNOTSUPP; + } + } + } + + return 0; +} + +/** + * ice_chnl_fltr_type_chk - filter type check + * @pf: Pointer to PF + * @tc_fltr: Pointer to TC flower filter structure + * @final_fltr_type: Ptr to filter type (dest/src/dest+src port) + * + * This function is used to determine if given filter (based on input params) + * should be allowed or not. For a given channel (aka ADQ VSI), supported + * filter types are src port, dest port , src+dest port. SO this function + * checks if any filter exist for specified channel (if so, channel specific + * filter_type will be set), and see if it matches with the filter being added. 
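The conflict check above boils down to comparing the new filter's source and destination ports against both ports of every filter already steering to a different TC. A standalone sketch of that comparison (simplified filter struct, host-order ports, tunnel-key handling omitted):

/* Sketch of the cross-TC port-conflict test: a new filter is rejected
 * when another TC already matches the same port number, whether that
 * other filter uses it as source or destination.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct flt {
	int tc_class;
	uint16_t src_port;	/* 0 = not part of the match */
	uint16_t dst_port;	/* 0 = not part of the match */
};

static bool conflicts(const struct flt *new_flt, const struct flt *old, int n_old)
{
	int i;

	for (i = 0; i < n_old; i++) {
		const struct flt *o = &old[i];

		if (o->tc_class == new_flt->tc_class)
			continue;	/* same TC: nothing to check */

		if (new_flt->dst_port &&
		    (new_flt->dst_port == o->dst_port ||
		     new_flt->dst_port == o->src_port))
			return true;

		if (new_flt->src_port &&
		    (new_flt->src_port == o->dst_port ||
		     new_flt->src_port == o->src_port))
			return true;
	}
	return false;
}

int main(void)
{
	struct flt existing[] = { { .tc_class = 1, .dst_port = 80 } };
	struct flt ok  = { .tc_class = 2, .dst_port = 443 };
	struct flt bad = { .tc_class = 2, .src_port = 80 };	/* clashes with TC 1 */

	printf("ok: %d, bad: %d\n", conflicts(&ok, existing, 1),
	       conflicts(&bad, existing, 1));
	return 0;
}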
+ * It returns 0 (upon success) or POSIX error code + */ +static int +ice_chnl_fltr_type_chk(struct ice_pf *pf, struct ice_tc_flower_fltr *tc_fltr, + enum ice_channel_fltr_type *final_fltr_type) +{ + enum ice_channel_fltr_type fltr_type = *final_fltr_type; + struct device *dev = ice_pf_to_dev(pf); + + if (fltr_type == ICE_CHNL_FLTR_TYPE_INVALID) { + /* L4 based filter, more granular, hence should be checked + * beore L3 + */ + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) && + (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT)) + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_DEST_PORT; + else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) + fltr_type = ICE_CHNL_FLTR_TYPE_DEST_PORT; + else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT) + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_PORT; + /* L3 (IPv4) based filter check */ + else if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV4) && + (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV4)) + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV4; + else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV4) + fltr_type = ICE_CHNL_FLTR_TYPE_DEST_IPV4; + else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV4) + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_IPV4; + /* L3 (IPv6) based filter check */ + else if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV6) && + (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV6)) + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV6; + else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV6) + fltr_type = ICE_CHNL_FLTR_TYPE_DEST_IPV6; + else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV6) + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_IPV6; + /* Tunnel filter check, inner criteria is open: + * any combination of inner L3 and/or L4 + */ + else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID) + fltr_type = ICE_CHNL_FLTR_TYPE_TENANT_ID; + else + return -EOPNOTSUPP; + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_SRC_PORT) { + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) && + (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT)) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC_PORT to SRC + DEST_PORT\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_DEST_PORT; + } else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC_PORT to DEST_PORT\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_DEST_PORT; + } + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_DEST_PORT) { + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) && + (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT)) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from DEST_PORT to SRC + DEST_PORT\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_DEST_PORT; + } else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from DEST_PORT to SRC_PORT\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_PORT; + } + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_SRC_DEST_PORT) { + /* must to have src/dest/src+dest port as part of filter + * criteria + */ + if ((!(tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT)) && + (!(tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT))) + return -EOPNOTSUPP; + + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) && + (!(tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT))) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC+DEST_PORT to DEST_PORT\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_DEST_PORT; + } 
else if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT) && + (!(tc_fltr->flags & + ICE_TC_FLWR_FIELD_DEST_L4_PORT))) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC+DEST_PORT to SRC_PORT\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_PORT; + } + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_TENANT_ID) { + /* Now only allow filters which has VNI */ + if (!(tc_fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID)) + return -EOPNOTSUPP; + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV4) { + /* must to have src/dest/src+dest IPv4 addr as part of filter + * criteria + */ + if ((!(tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV4)) && + (!(tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV4))) + return -EOPNOTSUPP; + + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV4) && + (!(tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV4))) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC+DEST IPv4 addr to DEST IPv4 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_DEST_IPV4; + } else if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV4) && + (!(tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV4))) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC+DEST IPv4 to SRC IPv4 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_IPV4; + } + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_DEST_IPV4) { + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV4) && + (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV4)) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from DEST IPv4 addr to SRC + DEST IPv4 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV4; + } else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV4) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from DEST IPv4 addr to SRC IPv4 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_IPV4; + } + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_SRC_IPV4) { + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV4) && + (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV4)) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC IPv4 addr to SRC + DEST IPv4 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV4; + } else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV4) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC IPv4 addr to DEST IPv4 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_DEST_IPV4; + } + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV6) { + /* must to have src/dest/src+dest IPv6 addr as part of filter + * criteria + */ + if ((!(tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV6)) && + (!(tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV6))) + return -EOPNOTSUPP; + + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV6) && + (!(tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV6))) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC+DEST IPv6 addr to DEST IPv6 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_DEST_IPV6; + } else if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV6) && + (!(tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV6))) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC+DEST IPv6 to SRC IPv6 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_IPV6; + } + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_DEST_IPV6) { + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV6) && + (tc_fltr->flags & 
ICE_TC_FLWR_FIELD_SRC_IPV6)) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from DEST IPv6 addr to SRC + DEST IPv6 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV6; + } else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV6) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from DEST IPv6 addr to SRC IPv6 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_IPV6; + } + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_SRC_IPV6) { + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV6) && + (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV6)) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC IPv6 addr to SRC + DEST IPv6 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV6; + } else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV6) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC IPv6 addr to DEST IPv6 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_DEST_IPV6; + } + } else { + return -EINVAL; /* unsupported filter type */ + } + + /* return the selected fltr_type */ + *final_fltr_type = fltr_type; + + return 0; +} + +/** + * ice_determine_gtp_tun_type - determine TUN type based on user params + * @pf: Pointer to PF + * @l4_proto : vale of L4 protocol type + * @flags: TC filter flags + * @rule_info: Pointer to rule_info structure + * + * Determine TUN type based on user input. For VxLAN and Geneve, it is + * straight forward. But to detect, correct TUN type for GTP is + * challenging because there is no native support for GTP in kernel + * and user may want to filter on + * Outer UDP + GTP (optional) + Inner L3 + Inner L4 + * Actual API to add advanced switch filter expects caller to detect + * and specify correct TUN type and based on TUN type, appropriate + * type of rule is added in HW. 
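The tunnel type is picked from three inputs: the outer IP version (taken from the enc_ keys, possibly absent), the inner IP version, and the inner L4 protocol. The sketch below shows the selection for the no-outer-IP case only, with placeholder enum names rather than the driver's ICE_SW_TUN_* values; the outer-IPv4 and outer-IPv6 branches repeat the same pattern:

#include <stdio.h>

enum ip_ver { IP_NONE, IP_V4, IP_V6 };
enum l4_proto { L4_OTHER, L4_TCP, L4_UDP };

/* Illustrative names only; the driver defines one tunnel-type value
 * per (outer, inner, L4) combination.
 */
enum gtp_tun {
	GTP_INVALID,
	GTP_IPV4, GTP_IPV4_TCP, GTP_IPV4_UDP,
	GTP_IPV6, GTP_IPV6_TCP, GTP_IPV6_UDP,
};

/* Inner L3 must be present, inner L4 is optional. */
static enum gtp_tun pick_gtp_tun_type(enum ip_ver inner, enum l4_proto proto)
{
	if (inner == IP_V4)
		return proto == L4_TCP ? GTP_IPV4_TCP :
		       proto == L4_UDP ? GTP_IPV4_UDP : GTP_IPV4;
	if (inner == IP_V6)
		return proto == L4_TCP ? GTP_IPV6_TCP :
		       proto == L4_UDP ? GTP_IPV6_UDP : GTP_IPV6;
	return GTP_INVALID;	/* inner L3 is mandatory for GTP */
}

int main(void)
{
	printf("%d %d %d\n",
	       pick_gtp_tun_type(IP_V4, L4_UDP),	/* GTP_IPV4_UDP */
	       pick_gtp_tun_type(IP_V6, L4_OTHER),	/* GTP_IPV6 */
	       pick_gtp_tun_type(IP_NONE, L4_TCP));	/* GTP_INVALID */
	return 0;
}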
+ */ +static bool +ice_determine_gtp_tun_type(struct ice_pf *pf, u16 l4_proto, u32 flags, + struct ice_adv_rule_info *rule_info) +{ + u8 outer_ipv6 = 0, inner_ipv6 = 0; + u8 outer_ipv4 = 0, inner_ipv4 = 0; + + /* if user specified enc IPv6 src/dest/src+dest IP */ + if (flags & (ICE_TC_FLWR_FIELD_ENC_DEST_IPV6 | + ICE_TC_FLWR_FIELD_ENC_SRC_IPV6)) + outer_ipv6 = 1; + else if (flags & (ICE_TC_FLWR_FIELD_ENC_DEST_IPV4 | + ICE_TC_FLWR_FIELD_ENC_SRC_IPV4)) + outer_ipv4 = 1; + + if (flags & (ICE_TC_FLWR_FIELD_DEST_IPV6 | + ICE_TC_FLWR_FIELD_SRC_IPV6)) + inner_ipv6 = 1; + else if (flags & (ICE_TC_FLWR_FIELD_DEST_IPV4 | + ICE_TC_FLWR_FIELD_SRC_IPV4)) + inner_ipv4 = 1; + else + /* for GTP encap, specifying inner L3 is must at this point, + * inner L4 is optional + */ + return false; + + /* following block support various protocol combinations for GTP + * (at this pint we know that detected tunnel type is GTP based + * on outer UDP port (2152: GTP_U): + * Outer IPv4 + Inner IPv4[6] + Inner TCP/UDP + * Outer IPv4 + Inner IPv4[6] + * Outer IPv6 + Inner IPv4[6] + Inner TCP/UDP + * Outer IPv6 + Inner IPv4[6] + */ + if (!outer_ipv6 && !outer_ipv4) { + if (inner_ipv4 && l4_proto == IPPROTO_TCP) + rule_info->tun_type = ICE_SW_TUN_GTP_IPV4_TCP; + else if (inner_ipv4 && l4_proto == IPPROTO_UDP) + rule_info->tun_type = ICE_SW_TUN_GTP_IPV4_UDP; + else if (inner_ipv6 && l4_proto == IPPROTO_TCP) + rule_info->tun_type = ICE_SW_TUN_GTP_IPV6_TCP; + else if (inner_ipv6 && l4_proto == IPPROTO_UDP) + rule_info->tun_type = ICE_SW_TUN_GTP_IPV6_UDP; + else if (inner_ipv4) + rule_info->tun_type = ICE_SW_TUN_GTP_IPV4; + else if (inner_ipv6) + rule_info->tun_type = ICE_SW_TUN_GTP_IPV6; + else + /* no reason to proceed, error condition (must to + * specify inner L3 and/or inner L3 + inner L4) + */ + return false; + } else if (outer_ipv4) { + if (inner_ipv4 && l4_proto == IPPROTO_TCP) + rule_info->tun_type = ICE_SW_TUN_IPV4_GTP_IPV4_TCP; + else if (inner_ipv4 && l4_proto == IPPROTO_UDP) + rule_info->tun_type = ICE_SW_TUN_IPV4_GTP_IPV4_UDP; + else if (inner_ipv6 && l4_proto == IPPROTO_TCP) + rule_info->tun_type = ICE_SW_TUN_IPV4_GTP_IPV6_TCP; + else if (inner_ipv6 && l4_proto == IPPROTO_UDP) + rule_info->tun_type = ICE_SW_TUN_IPV4_GTP_IPV6_UDP; + else if (inner_ipv4) + rule_info->tun_type = ICE_SW_TUN_IPV4_GTPU_IPV4; + else if (inner_ipv6) + rule_info->tun_type = ICE_SW_TUN_IPV4_GTPU_IPV6; + else + /* no reason to proceed, error condition (must to + * specify inner L3 and/or inner L3 + inner L4) + */ + return false; + } else if (outer_ipv6) { + if (inner_ipv4 && l4_proto == IPPROTO_TCP) + rule_info->tun_type = ICE_SW_TUN_IPV6_GTP_IPV4_TCP; + else if (inner_ipv4 && l4_proto == IPPROTO_UDP) + rule_info->tun_type = ICE_SW_TUN_IPV6_GTP_IPV4_UDP; + else if (inner_ipv6 && l4_proto == IPPROTO_TCP) + rule_info->tun_type = ICE_SW_TUN_IPV6_GTP_IPV6_TCP; + else if (inner_ipv6 && l4_proto == IPPROTO_UDP) + rule_info->tun_type = ICE_SW_TUN_IPV6_GTP_IPV6_UDP; + else if (inner_ipv4) + rule_info->tun_type = ICE_SW_TUN_IPV6_GTPU_IPV4; + else if (inner_ipv6) + rule_info->tun_type = ICE_SW_TUN_IPV6_GTPU_IPV6; + else + /* no reason to proceed, error condition (must to + * specify inner L3 and/or inner L3 + inner L4) + */ + return false; + } + + return true; +} + +/** + * ice_tc_count_lkups - determine lookup count for switch filter + * @flags: tc-flower flags + * @headers: Pointer to TC flower filter header structure + * @fltr: Pointer to outer TC filter structure + * + * Determine lookup count based on TC flower input for switch filter. 
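The count is a straight accumulation over the flag bits, one lookup element per matched layer, with the source and destination fields of a layer sharing an element. A minimal standalone sketch, using illustrative flag names in place of the ICE_TC_FLWR_FIELD_* bits:

#include <stdio.h>

#define FLT_ETH_TYPE	0x01
#define FLT_DST_MAC	0x02
#define FLT_SRC_MAC	0x04
#define FLT_VLAN	0x08
#define FLT_DEST_IPV4	0x10
#define FLT_SRC_IPV4	0x20
#define FLT_DEST_L4	0x40
#define FLT_SRC_L4	0x80

static int count_lkups(unsigned int flags)
{
	int cnt = 0;

	if (flags & FLT_ETH_TYPE)
		cnt++;
	if (flags & (FLT_DST_MAC | FLT_SRC_MAC))
		cnt++;			/* both MACs share one element */
	if (flags & FLT_VLAN)
		cnt++;
	if (flags & (FLT_DEST_IPV4 | FLT_SRC_IPV4))
		cnt++;			/* src/dst IP share one element */
	if (flags & (FLT_DEST_L4 | FLT_SRC_L4))
		cnt++;			/* src/dst port share one element */
	return cnt;
}

int main(void)
{
	unsigned int flags = FLT_DST_MAC | FLT_DEST_IPV4 | FLT_DEST_L4;

	/* matches the number of elements the fill routine will emit: 3 */
	printf("lookups needed: %d\n", count_lkups(flags));
	return 0;
}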
+ */ +static int +ice_tc_count_lkups(u32 flags, struct ice_tc_flower_lyr_2_4_hdrs *headers, + struct ice_tc_flower_fltr *fltr) +{ + int lkups_cnt = 0; + + if (flags & ICE_TC_FLWR_FIELD_ETH_TYPE_ID) + lkups_cnt++; + + /* is Tunnel ID specified */ + if (flags & ICE_TC_FLWR_FIELD_TENANT_ID) { + /* For ADQ filter, outer DMAC gets added implictly */ + if (flags & ICE_TC_FLWR_FIELD_ENC_DST_MAC) + lkups_cnt++; + /* Copy outer L4 port for non-GTP tunnel */ + if (fltr->tunnel_type != TNL_GTP) { + if (flags & ICE_TC_FLWR_FIELD_ENC_DEST_L4_PORT) + if (headers->l3_key.ip_proto == IPPROTO_UDP) + lkups_cnt++; + } + /* due to tunnel */ + lkups_cnt++; + } + + /* is MAC fields specified? */ + if (flags & (ICE_TC_FLWR_FIELD_DST_MAC | ICE_TC_FLWR_FIELD_SRC_MAC)) + lkups_cnt++; + + /* is VLAN specified? */ + if (flags & ICE_TC_FLWR_FIELD_VLAN) + lkups_cnt++; + + /* is IPv[4|6] fields specified? */ + if (flags & (ICE_TC_FLWR_FIELD_DEST_IPV4 | ICE_TC_FLWR_FIELD_SRC_IPV4)) + lkups_cnt++; + else if (flags & (ICE_TC_FLWR_FIELD_DEST_IPV6 | + ICE_TC_FLWR_FIELD_SRC_IPV6)) + lkups_cnt++; + + /* is L4 (TCP/UDP/any other L4 protocol fields specified? */ + if (flags & (ICE_TC_FLWR_FIELD_DEST_L4_PORT | + ICE_TC_FLWR_FIELD_SRC_L4_PORT)) + lkups_cnt++; + + return lkups_cnt; +} + +/** + * ice_tc_fill_rules - fill filter rules based on tc fltr + * @hw: pointer to hw structure + * @flags: tc flower field flags + * @tc_fltr: pointer to tc flower filter + * @list: list of advance rule elements + * @rule_info: pointer to information about rule + * @l4_proto: pointer to information such as L4 proto type + * + * Fill ice_adv_lkup_elem list based on tc flower flags and + * tc flower headers. This list should be used to add + * advance filter in hardware. + */ +static int +ice_tc_fill_rules(struct ice_hw *hw, u32 flags, + struct ice_tc_flower_fltr *tc_fltr, + struct ice_adv_lkup_elem *list, + struct ice_adv_rule_info *rule_info, + u16 *l4_proto) +{ + struct ice_tc_flower_lyr_2_4_hdrs *headers = &tc_fltr->outer_headers; + int i = 0; + + if (flags & ICE_TC_FLWR_FIELD_ETH_TYPE_ID) { + list[i].type = ICE_ETYPE_OL; + list[i].h_u.ethertype.ethtype_id = headers->l2_key.n_proto; + list[i].m_u.ethertype.ethtype_id = headers->l2_mask.n_proto; + i++; + } + + /* copy L2 (MAC) fields, Outer UDP (in case of tunnel) port info */ + if (flags & ICE_TC_FLWR_FIELD_TENANT_ID) { + u32 tenant_id; + + /* copy L2 (MAC) fields if specified, For tunnel outer DMAC + * is needed and supported and is part of outer_headers.dst_mac + * For VxLAN tunnel, supported ADQ filter config is: + * - Outer dest MAC + VNI + Inner IPv4 + Inner L4 ports + */ + if (flags & ICE_TC_FLWR_FIELD_ENC_DST_MAC) { + list[i].type = ICE_MAC_OFOS; + ether_addr_copy(list[i].h_u.eth_hdr.dst_addr, + headers->l2_key.dst_mac); + ether_addr_copy(list[i].m_u.eth_hdr.dst_addr, + headers->l2_mask.dst_mac); + i++; + } + /* copy outer UDP (enc_dst_port) only for non-GTP tunnel */ + if (tc_fltr->tunnel_type != TNL_GTP) { + if ((flags & ICE_TC_FLWR_FIELD_ENC_DEST_L4_PORT) && + headers->l3_key.ip_proto == IPPROTO_UDP) { + list[i].type = ICE_UDP_OF; + list[i].h_u.l4_hdr.dst_port = + headers->l4_key.dst_port; + list[i].m_u.l4_hdr.dst_port = + headers->l4_mask.dst_port; + i++; + } + } + + /* setup encap info in list elements such as VNI/encap key-id, + * mask, type of tunnel + */ + if (tc_fltr->tunnel_type == TNL_VXLAN) + list[i].type = ICE_VXLAN; + else if (tc_fltr->tunnel_type == TNL_GENEVE) + list[i].type = ICE_GENEVE; + else if (tc_fltr->tunnel_type == TNL_GTP) + list[i].type = ICE_GTP; + + if 
(tc_fltr->tunnel_type == TNL_VXLAN || + tc_fltr->tunnel_type == TNL_GENEVE) { + tenant_id = be32_to_cpu(tc_fltr->tenant_id) << 8; + list[i].h_u.tnl_hdr.vni = cpu_to_be32(tenant_id); + if (tenant_id) + /* 24 bit tunnel key: mask "\xff\xff\xff\x00" */ + memcpy(&list[i].m_u.tnl_hdr.vni, + "\xff\xff\xff\x00", 4); + else + memcpy(&list[i].m_u.tnl_hdr.vni, + "\x00\x00\x00\x00", 4); + } else if (tc_fltr->tunnel_type == TNL_GTP) { + tenant_id = be32_to_cpu(tc_fltr->tenant_id); + list[i].h_u.gtp_hdr.teid = cpu_to_be32(tenant_id); + if (tenant_id) + /* 32 bit tunnel key: mask "\xff\xff\xff\xff" */ + memcpy(&list[i].m_u.gtp_hdr.teid, + "\xff\xff\xff\xff", 4); + else + memcpy(&list[i].m_u.gtp_hdr.teid, + "\x00\x00\x00\x00", 4); + } + /* advance list index */ + i++; + + /* now access values from inner_headers such as inner MAC (if + * supported), inner IPv4[6], Inner L4 ports, hence update + * "headers" to point to inner_headers + */ + headers = &tc_fltr->inner_headers; + } else { + rule_info->tun_type = ICE_NON_TUN; + /* copy L2 (MAC) fields, for non-tunnel case */ + if (flags & (ICE_TC_FLWR_FIELD_DST_MAC | + ICE_TC_FLWR_FIELD_SRC_MAC)) { + struct ice_tc_l2_hdr *l2_key, *l2_mask; + + l2_key = &headers->l2_key; + l2_mask = &headers->l2_mask; + + list[i].type = ICE_MAC_OFOS; + if (flags & ICE_TC_FLWR_FIELD_DST_MAC) { + ether_addr_copy(list[i].h_u.eth_hdr.dst_addr, + l2_key->dst_mac); + ether_addr_copy(list[i].m_u.eth_hdr.dst_addr, + l2_mask->dst_mac); + } + if (flags & ICE_TC_FLWR_FIELD_SRC_MAC) { + ether_addr_copy(list[i].h_u.eth_hdr.src_addr, + l2_key->src_mac); + ether_addr_copy(list[i].m_u.eth_hdr.src_addr, + l2_mask->src_mac); + } + i++; + } + } + + /* copy VLAN info */ + if (flags & ICE_TC_FLWR_FIELD_VLAN) { + list[i].type = ICE_VLAN_OFOS; + list[i].h_u.vlan_hdr.vlan = headers->vlan_hdr.vlan_id; + list[i].m_u.vlan_hdr.vlan = cpu_to_be16(0xFFFF); + i++; + } + + + /* copy L3 (IPv[4|6]: src, dest) address */ + if (flags & (ICE_TC_FLWR_FIELD_DEST_IPV4 | + ICE_TC_FLWR_FIELD_SRC_IPV4)) { + struct ice_tc_l3_hdr *l3_key, *l3_mask; + + /* For encap, outer L3 and L4 matches are not supported, + * hence if the user specified L3/L4 fields, they are treated + * as inner L3 and L4 respectively + */ + if (flags & ICE_TC_FLWR_FIELD_TENANT_ID) + list[i].type = ICE_IPV4_IL; + else + list[i].type = ICE_IPV4_OFOS; + + l3_key = &headers->l3_key; + l3_mask = &headers->l3_mask; + if (flags & ICE_TC_FLWR_FIELD_DEST_IPV4) { + list[i].h_u.ipv4_hdr.dst_addr = l3_key->dst_ipv4; + list[i].m_u.ipv4_hdr.dst_addr = l3_mask->dst_ipv4; + } + if (flags & ICE_TC_FLWR_FIELD_SRC_IPV4) { + list[i].h_u.ipv4_hdr.src_addr = l3_key->src_ipv4; + list[i].m_u.ipv4_hdr.src_addr = l3_mask->src_ipv4; + } + i++; + } else if (flags & (ICE_TC_FLWR_FIELD_DEST_IPV6 | + ICE_TC_FLWR_FIELD_SRC_IPV6)) { + struct ice_ipv6_hdr *ipv6_hdr, *ipv6_mask; + struct ice_tc_l3_hdr *l3_key, *l3_mask; + + if (flags & ICE_TC_FLWR_FIELD_TENANT_ID) + list[i].type = ICE_IPV6_IL; + else + list[i].type = ICE_IPV6_OFOS; + ipv6_hdr = &list[i].h_u.ipv6_hdr; + ipv6_mask = &list[i].m_u.ipv6_hdr; + l3_key = &headers->l3_key; + l3_mask = &headers->l3_mask; + + if (flags & ICE_TC_FLWR_FIELD_DEST_IPV6) { + memcpy(&ipv6_hdr->dst_addr, &l3_key->dst_ipv6_addr, + sizeof(l3_key->dst_ipv6_addr)); + memcpy(&ipv6_mask->dst_addr, &l3_mask->dst_ipv6_addr, + sizeof(l3_mask->dst_ipv6_addr)); + } + if (flags & ICE_TC_FLWR_FIELD_SRC_IPV6) { + memcpy(&ipv6_hdr->src_addr, &l3_key->src_ipv6_addr, + sizeof(l3_key->src_ipv6_addr)); + memcpy(&ipv6_mask->src_addr, &l3_mask->src_ipv6_addr, +
sizeof(l3_mask->src_ipv6_addr)); + } + i++; + } + + /* copy L4 (src, dest) port */ + if (flags & (ICE_TC_FLWR_FIELD_DEST_L4_PORT | + ICE_TC_FLWR_FIELD_SRC_L4_PORT)) { + struct ice_tc_l4_hdr *l4_key, *l4_mask; + u16 dst_port; + + l4_key = &headers->l4_key; + l4_mask = &headers->l4_mask; + dst_port = be16_to_cpu(l4_key->dst_port); + if (headers->l3_key.ip_proto == IPPROTO_TCP) { + list[i].type = ICE_TCP_IL; + /* detected L4 proto is TCP */ + if (l4_proto) + *l4_proto = IPPROTO_TCP; + } else if (headers->l3_key.ip_proto == IPPROTO_UDP) { + /* Check if UDP dst port is known as a tunnel port */ + if (ice_tunnel_port_in_use(hw, dst_port, NULL)) { + list[i].type = ICE_UDP_OF; + rule_info->tun_type = ICE_SW_TUN_VXLAN; + } else { + list[i].type = ICE_UDP_ILOS; + } + /* detected L4 proto is UDP */ + if (l4_proto) + *l4_proto = IPPROTO_UDP; + } + if (flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) { + list[i].h_u.l4_hdr.dst_port = l4_key->dst_port; + list[i].m_u.l4_hdr.dst_port = l4_mask->dst_port; + } + if (flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT) { + list[i].h_u.l4_hdr.src_port = l4_key->src_port; + list[i].m_u.l4_hdr.src_port = l4_mask->src_port; + } + i++; + } + + return i; +} + +#ifdef HAVE_TC_FLOW_RULE_INFRASTRUCTURE +static int ice_eswitch_tc_parse_action(struct ice_tc_flower_fltr *fltr, + struct flow_action_entry *act) +{ + struct ice_repr *repr; + + switch (act->id) { + case FLOW_ACTION_DROP: + fltr->action.fltr_act = ICE_DROP_PACKET; + break; + + case FLOW_ACTION_REDIRECT: + fltr->action.fltr_act = ICE_FWD_TO_VSI; + + if (ice_is_port_repr_netdev(act->dev)) { + repr = ice_netdev_to_repr(act->dev); + + fltr->dest_vsi = repr->src_vsi; + fltr->direction = ICE_ESWITCH_FLTR_INGRESS; + } else if (netif_is_ice(act->dev)) { + struct ice_netdev_priv *np = netdev_priv(act->dev); + + fltr->dest_vsi = np->vsi; + fltr->direction = ICE_ESWITCH_FLTR_EGRESS; + } else { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unsupported netdevice in switchdev mode"); + return -EINVAL; + } + + break; + + default: + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unsupported action in switchdev mode"); + return -EINVAL; + } + + return 0; +} +#endif /* HAVE_TC_FLOW_RULE_INFRASTRUCTURE */ + +static int +ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) +{ + struct ice_tc_flower_lyr_2_4_hdrs *headers = &fltr->outer_headers; + struct ice_adv_rule_info rule_info = { 0 }; + struct ice_rule_query_data rule_added; + struct ice_adv_lkup_elem *list; + struct ice_hw *hw = &vsi->back->hw; + u32 flags = fltr->flags; + enum ice_status status; + int lkups_cnt; + int ret = 0; + int i; + + if (!flags || (flags & (ICE_TC_FLWR_FIELD_ENC_DEST_IPV4 | + ICE_TC_FLWR_FIELD_ENC_SRC_IPV4 | + ICE_TC_FLWR_FIELD_ENC_DEST_IPV6 | + ICE_TC_FLWR_FIELD_ENC_SRC_IPV6 | + ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT))) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unsupported encap field(s)"); + return -EOPNOTSUPP; + } + + lkups_cnt = ice_tc_count_lkups(flags, headers, fltr); + list = kcalloc(lkups_cnt, sizeof(*list), GFP_ATOMIC); + if (!list) + return -ENOMEM; + + i = ice_tc_fill_rules(hw, flags, fltr, list, &rule_info, NULL); + if (i != lkups_cnt) { + ret = -EINVAL; + goto exit; + } + + rule_info.sw_act.fltr_act = fltr->action.fltr_act; + rule_info.sw_act.vsi_handle = fltr->dest_vsi->idx; + rule_info.priority = 7; + + if (fltr->direction == ICE_ESWITCH_FLTR_INGRESS) { + rule_info.sw_act.flag |= ICE_FLTR_RX; + rule_info.sw_act.src = hw->pf_id; + rule_info.rx = true; + } else { + rule_info.sw_act.flag |= ICE_FLTR_TX; + rule_info.sw_act.src = vsi->idx; + rule_info.rx = false; 
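/* Note on the direction handling above: ingress (Rx) rules are matched with
 * the PF port as the source (sw_act.src = hw->pf_id, rx = true), while egress
 * (Tx) rules are matched against traffic sourced from the VSI itself
 * (sw_act.src = vsi->idx, rx = false).
 */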
+ } + + /* specify the cookie as filter_rule_id */ + rule_info.fltr_rule_id = fltr->cookie; + + status = ice_add_adv_rule(hw, list, lkups_cnt, &rule_info, &rule_added); + if (status == ICE_ERR_ALREADY_EXISTS) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unable to add filter because it already exist"); + ret = -EINVAL; + goto exit; + } else if (status) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unable to add filter due to error"); + ret = -EIO; + goto exit; + } + + /* store the output params, which are needed later for removing + * advanced switch filter + */ + fltr->rid = rule_added.rid; + fltr->rule_id = rule_added.rule_id; + + if (fltr->direction == ICE_ESWITCH_FLTR_EGRESS) { + if (ice_fltr_update_flags(vsi, fltr->rule_id, fltr->rid, + ICE_SINGLE_ACT_LAN_ENABLE)) + ice_rem_adv_rule_by_id(hw, &rule_added); + } + +exit: + kfree(list); + return ret; +} + +/** + * ice_add_tc_flower_adv_fltr - add appropriate filter rules + * @vsi: Pointer to VSI + * @tc_fltr: Pointer to TC flower filter structure + * + * based on filter parameters using Advance recipes supported + * by OS package. + */ +int +ice_add_tc_flower_adv_fltr(struct ice_vsi *vsi, + struct ice_tc_flower_fltr *tc_fltr) +{ + struct ice_tc_flower_lyr_2_4_hdrs *headers = &tc_fltr->outer_headers; + enum ice_channel_fltr_type fltr_type = ICE_CHNL_FLTR_TYPE_INVALID; + struct ice_adv_rule_info rule_info = {0}; + struct ice_rule_query_data rule_added; + struct ice_channel_vf *vf_ch = NULL; + struct ice_adv_lkup_elem *list; + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + u32 flags = tc_fltr->flags; + enum ice_status status; + struct ice_vsi *ch_vsi; + struct device *dev; + struct ice_vf *vf; + u16 lkups_cnt = 0; + u16 l4_proto = 0; + int ret = 0; + u16 i = 0; + + dev = ice_pf_to_dev(pf); + if (ice_is_safe_mode(pf)) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unable to add filter because driver is in safe mode"); + return -EOPNOTSUPP; + } + + if (!flags || (flags & (ICE_TC_FLWR_FIELD_ENC_DEST_IPV4 | + ICE_TC_FLWR_FIELD_ENC_SRC_IPV4 | + ICE_TC_FLWR_FIELD_ENC_DEST_IPV6 | + ICE_TC_FLWR_FIELD_ENC_SRC_IPV6 | + ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT))) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unsupported encap field(s)"); + return -EOPNOTSUPP; + } + + /* get the channel (aka ADQ VSI) */ + if (tc_fltr->dest_vsi) + ch_vsi = tc_fltr->dest_vsi; + else + ch_vsi = vsi->tc_map_vsi[tc_fltr->action.tc_class]; + + lkups_cnt = ice_tc_count_lkups(flags, headers, tc_fltr); + list = kcalloc(lkups_cnt, sizeof(*list), GFP_ATOMIC); + if (!list) + return -ENOMEM; + + i = ice_tc_fill_rules(hw, flags, tc_fltr, list, &rule_info, &l4_proto); + if (i != lkups_cnt) { + ret = -EINVAL; + goto exit; + } + + /* Now determine correct TUN type of based on encap params */ + if ((flags & ICE_TC_FLWR_FIELD_TENANT_ID) && + tc_fltr->tunnel_type == TNL_GTP) { + if (!ice_determine_gtp_tun_type(pf, l4_proto, tc_fltr->flags, + &rule_info)) { + if (vsi->type == ICE_VSI_VF) + dev_err(dev, "Unable to add filter because could not determine tun type, VSI %u, vf_id:%u\n", + vsi->vsi_num, vsi->vf_id); + else + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unable to add filter because could not determine TUN type. 
"); + ret = -EINVAL; + goto exit; + } + } + + rule_info.sw_act.fltr_act = tc_fltr->action.fltr_act; + if (tc_fltr->action.tc_class >= ICE_CHNL_START_TC) { + if (!ch_vsi) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unable to add filter because specified destination doesn't exist"); + ret = -EINVAL; + goto exit; + } + + /* dest_vsi is preset, means it is from virtchnl message */ + if (tc_fltr->dest_vsi) { + if (vsi->type != ICE_VSI_VF || + tc_fltr->dest_vsi->type != ICE_VSI_VF) { + dev_err(dev, "Unexpected VSI(vf_id:%u) type: %u\n", + vsi->vf_id, vsi->type); + ret = -EINVAL; + goto exit; + } + vf = &pf->vf[vsi->vf_id]; + if (!vf) { + dev_err(dev, "VF is NULL for VSI->type: ICE_VF_VSI and vf_id %d\n", + vsi->vf_id); + ret = -EINVAL; + goto exit; + } + vf_ch = &vf->ch[tc_fltr->action.tc_class]; + + fltr_type = (enum ice_channel_fltr_type) + vf_ch->fltr_type; + } else if (ch_vsi->ch) { + fltr_type = ch_vsi->ch->fltr_type; + } else { + dev_err(dev, "Can't add switch rule, neither dest_vsi is valid now VSI channel but tc_class sepcified is %u\n", + tc_fltr->action.tc_class); + ret = -EINVAL; + goto exit; + } + + /* perform fltr_type check for channel (aka ADQ) VSI */ + ret = ice_chnl_fltr_type_chk(pf, tc_fltr, &fltr_type); + if (ret) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unable to add filter because filter type check failed"); + dev_err(dev, "Unable to add filter because filter type check failed"); + ret = -EINVAL; + goto exit; + } + + /* Code is applicable only for PF ADQ, for VF ADQ - such + * checks to be handled by VF driver + */ + if (ch_vsi && (ch_vsi->type == ICE_VSI_PF || + ch_vsi->type == ICE_VSI_CHNL)) { + ret = ice_detect_filter_conflict(pf, tc_fltr); + if (ret) + goto exit; + } + + if (tc_fltr->dest_vsi) { + if (vf_ch && !fltr_type) + vf_ch->fltr_type = fltr_type; + } else if (ch_vsi->ch) { + ch_vsi->ch->fltr_type = fltr_type; + } + + rule_info.sw_act.fltr_act = ICE_FWD_TO_VSI; +#ifdef __CHECKER__ + /* cppcheck-suppress nullPointerRedundantCheck */ +#endif /* _CHECKER__ */ + rule_info.sw_act.vsi_handle = ch_vsi->idx; + rule_info.priority = 7; + + rule_info.sw_act.src = hw->pf_id; + rule_info.rx = true; + + dev_dbg(dev, "add switch rule for TC:%u vsi_idx:%u, lkups_cnt:%u\n", + tc_fltr->action.tc_class, + rule_info.sw_act.vsi_handle, lkups_cnt); + } else { + rule_info.sw_act.flag |= ICE_FLTR_TX; + rule_info.sw_act.src = vsi->idx; + rule_info.rx = false; + } + + /* specify the cookie as filter_rule_id */ + rule_info.fltr_rule_id = tc_fltr->cookie; + + status = ice_add_adv_rule(hw, list, lkups_cnt, &rule_info, &rule_added); + if (status == ICE_ERR_ALREADY_EXISTS) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unable to add filter because it already exist"); + ret = -EINVAL; + goto exit; + } else if (status) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unable to add filter due to error"); + ret = -EIO; + goto exit; + } + + /* store the output params, which are needed later for removing + * advanced switch filter + */ + tc_fltr->rid = rule_added.rid; + tc_fltr->rule_id = rule_added.rule_id; + if (tc_fltr->action.tc_class > 0 && ch_vsi) { + /* For PF ADQ, VSI type is set as ICE_VSI_CHNL, and + * for PF ADQ filter, it is not yet set in tc_fltr, + * hence store the dest_vsi ptr in tc_fltr + */ + if (ch_vsi->type == ICE_VSI_CHNL) + tc_fltr->dest_vsi = ch_vsi; + /* keep track of advanced switch filter for + * destination VSI (channel VSI) + */ + ch_vsi->num_chnl_fltr++; + /* in this case, dest_id is VSI handle (sw handle) */ + tc_fltr->dest_id = rule_added.vsi_handle; + + /* keeps track of channel 
filters for PF VSI */ + if (vsi->type == ICE_VSI_PF && + (flags & (ICE_TC_FLWR_FIELD_DST_MAC | + ICE_TC_FLWR_FIELD_ENC_DST_MAC))) + pf->num_dmac_chnl_fltrs++; + } + dev_dbg(dev, "added switch rule (lkups_cnt %u, flags 0x%x) for TC %u, rid %u, rule_id %u, vsi_idx %u\n", + lkups_cnt, flags, + tc_fltr->action.tc_class, rule_added.rid, + rule_added.rule_id, rule_added.vsi_handle); +exit: + kfree(list); + return ret; +} + +/** + * ice_tc_set_ipv4 - Parse IPv4 addresses from TC flower filter + * @match: Pointer to flow match structure + * @fltr: Pointer to filter structure + * @headers: inner or outer header fields + * @is_encap: set true for tunnel IPv4 address + */ +static int +ice_tc_set_ipv4(struct flow_match_ipv4_addrs *match, + struct ice_tc_flower_fltr *fltr, + struct ice_tc_flower_lyr_2_4_hdrs *headers, bool is_encap) +{ + if (match->key->dst) { + if (is_encap) + fltr->flags |= ICE_TC_FLWR_FIELD_ENC_DEST_IPV4; + else + fltr->flags |= ICE_TC_FLWR_FIELD_DEST_IPV4; + headers->l3_key.dst_ipv4 = match->key->dst; + headers->l3_mask.dst_ipv4 = match->mask->dst; + } + if (match->key->src) { + if (is_encap) + fltr->flags |= ICE_TC_FLWR_FIELD_ENC_SRC_IPV4; + else + fltr->flags |= ICE_TC_FLWR_FIELD_SRC_IPV4; + headers->l3_key.src_ipv4 = match->key->src; + headers->l3_mask.src_ipv4 = match->mask->src; + } + return 0; +} + +/** + * ice_tc_set_ipv6 - Parse IPv6 addresses from TC flower filter + * @match: Pointer to flow match structure + * @fltr: Pointer to filter structure + * @headers: inner or outer header fields + * @is_encap: set true for tunnel IPv6 address + */ +static int +ice_tc_set_ipv6(struct flow_match_ipv6_addrs *match, + struct ice_tc_flower_fltr *fltr, + struct ice_tc_flower_lyr_2_4_hdrs *headers, bool is_encap) +{ + struct ice_tc_l3_hdr *l3_key, *l3_mask; + + /* src and dest IPV6 address should not be LOOPBACK + * (0:0:0:0:0:0:0:1), which can be represented as ::1 + */ + if (ipv6_addr_loopback(&match->key->dst) || + ipv6_addr_loopback(&match->key->src)) { + NL_SET_ERR_MSG_MOD(fltr->extack, "Bad ipv6, addr is LOOPBACK"); + return -EINVAL; + } + /* if src/dest IPv6 address is *,* error */ + if (ipv6_addr_any(&match->mask->dst) && + ipv6_addr_any(&match->mask->src)) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Bad src/dest IPV6, addr is any"); + return -EINVAL; + } + if (!ipv6_addr_any(&match->mask->dst)) { + if (is_encap) + fltr->flags |= ICE_TC_FLWR_FIELD_ENC_DEST_IPV6; + else + fltr->flags |= ICE_TC_FLWR_FIELD_DEST_IPV6; + } + if (!ipv6_addr_any(&match->mask->src)) { + if (is_encap) + fltr->flags |= ICE_TC_FLWR_FIELD_ENC_SRC_IPV6; + else + fltr->flags |= ICE_TC_FLWR_FIELD_SRC_IPV6; + } + + l3_key = &headers->l3_key; + l3_mask = &headers->l3_mask; + + if (fltr->flags & (ICE_TC_FLWR_FIELD_ENC_SRC_IPV6 | + ICE_TC_FLWR_FIELD_SRC_IPV6)) { + memcpy(&l3_key->src_ipv6_addr, &match->key->src.s6_addr, + sizeof(match->key->src.s6_addr)); + memcpy(&l3_mask->src_ipv6_addr, &match->mask->src.s6_addr, + sizeof(match->mask->src.s6_addr)); + } + if (fltr->flags & (ICE_TC_FLWR_FIELD_ENC_DEST_IPV6 | + ICE_TC_FLWR_FIELD_DEST_IPV6)) { + memcpy(&l3_key->dst_ipv6_addr, &match->key->dst.s6_addr, + sizeof(match->key->dst.s6_addr)); + memcpy(&l3_mask->dst_ipv6_addr, &match->mask->dst.s6_addr, + sizeof(match->mask->dst.s6_addr)); + } + + return 0; +} + +/** + * ice_tc_set_port - Parse ports from TC flower filter + * @match: Flow match structure + * @fltr: Pointer to filter structure + * @headers: inner or outer header fields + * @is_encap: set true for tunnel port + */ +static int +ice_tc_set_port(struct 
flow_match_ports match, + struct ice_tc_flower_fltr *fltr, + struct ice_tc_flower_lyr_2_4_hdrs *headers, bool is_encap) +{ + if (match.key->dst) { + if (is_encap) + fltr->flags |= ICE_TC_FLWR_FIELD_ENC_DEST_L4_PORT; + else + fltr->flags |= ICE_TC_FLWR_FIELD_DEST_L4_PORT; + headers->l4_key.dst_port = match.key->dst; + headers->l4_mask.dst_port = match.mask->dst; + } + if (match.key->src) { + if (is_encap) + fltr->flags |= ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT; + else + fltr->flags |= ICE_TC_FLWR_FIELD_SRC_L4_PORT; + headers->l4_key.src_port = match.key->src; + headers->l4_mask.src_port = match.mask->src; + } + return 0; +} + +#if defined(HAVE_TC_FLOWER_ENC) && defined(HAVE_TC_INDIR_BLOCK) +/** + * ice_is_tnl_gtp - detect if tunnel type is GTP or not + * @tunnel_dev: ptr to tunnel device + * @rule: ptr to flow_rule + * + * If curr_tnl_type is TNL_LAST and "flow_rule" is non-NULL, then + * check if enc_dst_port is well known GTP port (2152) + * if so - return true (indicating that tunnel type is GTP), otherwise false. + */ +static bool +ice_is_tnl_gtp(struct net_device *tunnel_dev, + struct flow_rule *rule) +{ + /* if flow_rule is non-NULL, proceed with detecting possibility + * of GTP tunnel. Unlike VXLAN and GENEVE, there is no such API + * like netif_is_gtp since GTP is not natively supported in kernel + */ + if (rule && (!is_vlan_dev(tunnel_dev))) { + struct flow_match_ports match; + u16 enc_dst_port; + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_PORTS)) { + netdev_err(tunnel_dev, + "Tunnel HW offload is not supported, ENC_PORTs are not specified\n"); + return false; + } + + /* get ENC_PORTS info */ + flow_rule_match_enc_ports(rule, &match); + enc_dst_port = be16_to_cpu(match.key->dst); + + /* Outer UDP port is GTP well known port, + * if 'enc_dst_port' matched with GTP wellknown port, + * return true from this function. + */ + if (enc_dst_port != ICE_GTP_TNL_WELLKNOWN_PORT) { + netdev_err(tunnel_dev, + "Tunnel HW offload is not supported for non-GTP tunnel, ENC_DST_PORT is %u\n", + enc_dst_port); + return false; + } + + /* all checks passed including outer UDP port to be qualified + * for GTP tunnel + */ + return true; + } + return false; +} + +/** + * ice_tc_tun_get_type - get the tunnel type + * @tunnel_dev: ptr to tunnel device + * @rule: ptr to flow_rule + * + * This function detects appropriate tunnel_type if specified device is + * tunnel device such as vxlan/geneve othertwise it tries to detect + * tunnel type based on outer GTP port (2152) + */ +int +ice_tc_tun_get_type(struct net_device *tunnel_dev, + struct flow_rule *rule) +{ +#ifdef HAVE_VXLAN_TYPE +#if IS_ENABLED(CONFIG_VXLAN) + if (netif_is_vxlan(tunnel_dev)) + return TNL_VXLAN; +#endif /* HAVE_VXLAN_TYPE */ +#elif defined(HAVE_GENEVE_TYPE) +#if IS_ENABLED(CONFIG_GENEVE) + if (netif_is_geneve(tunnel_dev)) + return TNL_GENEVE; +#endif +#endif /* HAVE_GENEVE_TYPE */ + /* detect possibility of GTP tunnel type based on input */ + if (ice_is_tnl_gtp(tunnel_dev, rule)) + return TNL_GTP; + + return TNL_LAST; +} + +/** + * ice_tc_tun_info - Parse and store tunnel info + * @pf: ptr to PF device + * @f: Pointer to struct flow_cls_offload + * @fltr: Pointer to filter structure + * @tunnel: type of tunnel (e.g. VxLAN, Geneve, GTP) + * + * Parse tunnel attributes such as tunnel_id and store them. 
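The parser below accepts a tunnel key only when its mask is a full 32-bit exact match and, for VXLAN/GENEVE, the value fits in 24 bits; a GTP TEID may use all 32 bits. Here is a standalone sketch of just those two checks, not driver code, with the limits taken from ICE_TC_FLOWER_MASK_32 and ICE_TC_FLOWER_VNI_MAX in ice_tc_lib.h:

#include <stdint.h>
#include <stdio.h>

#define MASK_32	0xFFFFFFFFu	/* ICE_TC_FLOWER_MASK_32 */
#define VNI_MAX	0xFFFFFFu	/* ICE_TC_FLOWER_VNI_MAX: VNI is 24 bits */

/* Accept an enc_key_id only when it is an exact (all-ones) match and, for
 * VXLAN/GENEVE, the value fits in 24 bits; GTP TEIDs may use all 32 bits.
 */
static int check_enc_keyid(uint32_t key, uint32_t mask, int is_gtp)
{
	if (mask != MASK_32)
		return -1;		/* zero or partial masks are rejected */
	if (!is_gtp && key > VNI_MAX)
		return -1;		/* VNI out of range */
	return 0;
}

int main(void)
{
	printf("%d\n", check_enc_keyid(42, MASK_32, 0));	/*  0: valid VNI   */
	printf("%d\n", check_enc_keyid(0x1000000, MASK_32, 0));	/* -1: VNI > 24 bit */
	printf("%d\n", check_enc_keyid(42, 0xFFFFu, 1));	/* -1: bad mask    */
	return 0;
}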
+ */ +static int +ice_tc_tun_info(struct ice_pf *pf, struct flow_cls_offload *f, + struct ice_tc_flower_fltr *fltr, + enum ice_tunnel_type tunnel) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + + /* match on VNI */ + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_KEYID)) { + struct device *dev = ice_pf_to_dev(pf); + struct flow_match_enc_keyid enc_keyid; + u32 key_id; + + flow_rule_match_enc_keyid(rule, &enc_keyid); + if (!enc_keyid.mask->keyid) { + dev_err(dev, "Bad mask for encap key_id 0x%04x, it must be non-zero\n", + be32_to_cpu(enc_keyid.mask->keyid)); + return -EINVAL; + } + + if (enc_keyid.mask->keyid != + cpu_to_be32(ICE_TC_FLOWER_MASK_32)) { + dev_err(dev, "Bad mask value for encap key_id 0x%04x\n", + be32_to_cpu(enc_keyid.mask->keyid)); + return -EINVAL; + } + + key_id = be32_to_cpu(enc_keyid.key->keyid); + if (tunnel == TNL_VXLAN || tunnel == TNL_GENEVE) { + /* VNI is only 3 bytes, applicable for VXLAN/GENEVE */ + if (key_id > ICE_TC_FLOWER_VNI_MAX) { + dev_err(dev, "VNI out of range : 0x%x\n", + key_id); + return -EINVAL; + } + } + fltr->flags |= ICE_TC_FLWR_FIELD_TENANT_ID; + fltr->tenant_id = enc_keyid.key->keyid; + } else if (tunnel == TNL_GTP) { + /* User didn't specify tunnel_key but indicated + * intention about GTP tunnel. + * For GTP tunnel, support for wild-card tunnel-ID + */ + fltr->flags |= ICE_TC_FLWR_FIELD_TENANT_ID; + fltr->tenant_id = 0; + } + + return 0; +} + +/** + * ice_tc_tun_parse - Parse tunnel attributes from TC flower filter + * @filter_dev: Pointer to device on which filter is being added + * @vsi: Pointer to VSI structure + * @f: Pointer to struct flow_cls_offload + * @fltr: Pointer to filter structure + * @headers: inner or outer header fields + */ +static int +ice_tc_tun_parse(struct net_device *filter_dev, struct ice_vsi *vsi, + struct flow_cls_offload *f, + struct ice_tc_flower_fltr *fltr, + struct ice_tc_flower_lyr_2_4_hdrs *headers) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + enum ice_tunnel_type tunnel_type; + struct ice_pf *pf = vsi->back; + struct device *dev; + int err = 0; + + dev = ice_pf_to_dev(pf); + tunnel_type = ice_tc_tun_get_type(filter_dev, rule); + + /* VXLAN and GTP tunnel are supported now */ + if (tunnel_type == TNL_VXLAN || tunnel_type == TNL_GTP) { + err = ice_tc_tun_info(pf, f, fltr, tunnel_type); + if (err) { + dev_err(dev, "Failed to parse tunnel (tunnel_type %u) attributes\n", + tunnel_type); + return err; + } + } else { + dev_err(dev, "Tunnel HW offload is not supported for the tunnel type: %d\n", + tunnel_type); + return -EOPNOTSUPP; + } + fltr->tunnel_type = tunnel_type; + headers->l3_key.ip_proto = IPPROTO_UDP; + return err; +} + +/** + * ice_parse_tunnel_attr - Parse tunnel attributes from TC flower filter + * @filter_dev: Pointer to device on which filter is being added + * @vsi: Pointer to VSI structure + * @f: Pointer to struct flow_cls_offload + * @fltr: Pointer to filter structure + * @headers: inner or outer header fields + */ +static int +ice_parse_tunnel_attr(struct net_device *filter_dev, struct ice_vsi *vsi, + struct flow_cls_offload *f, + struct ice_tc_flower_fltr *fltr, + struct ice_tc_flower_lyr_2_4_hdrs *headers) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct flow_match_control enc_control; + int err; + + err = ice_tc_tun_parse(filter_dev, vsi, f, fltr, headers); + if (err) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "failed to parse tunnel attributes"); + return err; + } + + flow_rule_match_enc_control(rule, &enc_control); + + if 
(enc_control.key->addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + struct flow_match_ipv4_addrs match; + + flow_rule_match_enc_ipv4_addrs(rule, &match); + if (ice_tc_set_ipv4(&match, fltr, headers, true)) + return -EINVAL; + } else if (enc_control.key->addr_type == + FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + struct flow_match_ipv6_addrs match; + + flow_rule_match_enc_ipv6_addrs(rule, &match); + if (ice_tc_set_ipv6(&match, fltr, headers, true)) + return -EINVAL; + } + +#ifdef HAVE_TC_FLOWER_ENC_IP + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IP)) { + struct flow_match_ip match; + + flow_rule_match_enc_ip(rule, &match); + headers->l3_key.tos = match.key->tos; + headers->l3_key.ttl = match.key->ttl; + headers->l3_mask.tos = match.mask->tos; + headers->l3_mask.ttl = match.mask->ttl; + } +#endif /* HAVE_TC_FLOWER_ENC_IP */ + + if (fltr->tunnel_type == TNL_GTP && + flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_PORTS)) { + struct flow_match_ports match; + + flow_rule_match_enc_ports(rule, &match); + /* store away outer L4 port info and mark it for tunnel */ + if (ice_tc_set_port(match, fltr, headers, true)) + return -EINVAL; + } + return 0; +} +#endif /* HAVE_TC_FLOWER_ENC && HAVE_TC_INDIR_BLOCK */ + +/** + * ice_parse_cls_flower - Parse TC flower filters provided by kernel + * @vsi: Pointer to the VSI + * @filter_dev: Pointer to device on which filter is being added + * @f: Pointer to struct flow_cls_offload + * @fltr: Pointer to filter structure + */ +#ifdef HAVE_TC_INDIR_BLOCK +static int +ice_parse_cls_flower(struct net_device *filter_dev, struct ice_vsi *vsi, + struct flow_cls_offload *f, + struct ice_tc_flower_fltr *fltr) +#else +static int +ice_parse_cls_flower(struct net_device __always_unused *filter_dev, + struct ice_vsi __always_unused *vsi, + struct tc_cls_flower_offload *f, + struct ice_tc_flower_fltr *fltr) +#endif /* HAVE_TC_INDIR_BLOCK */ +{ + struct ice_tc_flower_lyr_2_4_hdrs *headers = &fltr->outer_headers; + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct flow_dissector *dissector = rule->match.dissector; + u16 n_proto_mask = 0, n_proto_key = 0, addr_type = 0; + + if (dissector->used_keys & + ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) | + BIT(FLOW_DISSECTOR_KEY_BASIC) | + BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | +#ifdef HAVE_TC_FLOWER_VLAN_IN_TAGS + BIT(FLOW_DISSECTOR_KEY_VLANID) | +#endif +#ifndef HAVE_TC_FLOWER_VLAN_IN_TAGS + BIT(FLOW_DISSECTOR_KEY_VLAN) | +#endif + BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | +#ifdef HAVE_TC_FLOWER_ENC + BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | + BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_ENC_PORTS) | + BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | +#ifdef HAVE_TC_FLOWER_ENC_IP + BIT(FLOW_DISSECTOR_KEY_ENC_IP) | +#endif /* HAVE_TC_FLOWER_ENC_IP */ +#endif /* HAVE_TC_FLOWER_ENC */ + BIT(FLOW_DISSECTOR_KEY_PORTS))) { + NL_SET_ERR_MSG_MOD(fltr->extack, "Unsupported key used"); + return -EOPNOTSUPP; + } + +#if defined(HAVE_TC_FLOWER_ENC) && defined(HAVE_TC_INDIR_BLOCK) + if ((flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) || + flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) || + flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_KEYID) || + flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_PORTS))) { + int err; + + err = ice_parse_tunnel_attr(filter_dev, vsi, f, fltr, headers); + if (err) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Failed to parse TC flower tunnel attributes"); + return err; + } + + /* header pointers should point to 
the inner headers, outer + * header were already set by ice_parse_tunnel_attr + */ + headers = &fltr->inner_headers; + } else { + fltr->tunnel_type = TNL_LAST; + } +#else /* HAVE_TC_FLOWER_ENC && HAVE_TC_INDIR_BLOCK */ + fltr->tunnel_type = TNL_LAST; +#endif /* HAVE_TC_FLOWER_ENC && HAVE_TC_INDIR_BLOCK */ + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC)) { + struct flow_match_basic match; + + flow_rule_match_basic(rule, &match); + + n_proto_key = ntohs(match.key->n_proto); + n_proto_mask = ntohs(match.mask->n_proto); + + if (n_proto_key == ETH_P_ALL || n_proto_key == 0) { + n_proto_key = 0; + n_proto_mask = 0; + } else { + if (!ice_is_adq_active(vsi->back)) + fltr->flags |= ICE_TC_FLWR_FIELD_ETH_TYPE_ID; + } + + headers->l2_key.n_proto = cpu_to_be16(n_proto_key); + headers->l2_mask.n_proto = cpu_to_be16(n_proto_mask); + headers->l3_key.ip_proto = match.key->ip_proto; + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + struct flow_match_eth_addrs match; + + flow_rule_match_eth_addrs(rule, &match); + + if (!is_zero_ether_addr(match.key->dst)) { + ether_addr_copy(headers->l2_key.dst_mac, + match.key->dst); + ether_addr_copy(headers->l2_mask.dst_mac, + match.mask->dst); + fltr->flags |= ICE_TC_FLWR_FIELD_DST_MAC; + } + + if (!is_zero_ether_addr(match.key->src)) { + ether_addr_copy(headers->l2_key.src_mac, + match.key->src); + ether_addr_copy(headers->l2_mask.src_mac, + match.mask->src); + fltr->flags |= ICE_TC_FLWR_FIELD_SRC_MAC; + } + } + +#ifdef HAVE_TC_FLOWER_VLAN_IN_TAGS + if (dissector_uses_key(dissector, FLOW_DISSECTOR_KEY_VLANID)) { + struct flow_dissector_key_tags *key = + (struct flow_dissector_key_tags *) + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_VLANID, + f->key); + struct flow_dissector_key_tags *mask = + (struct flow_dissector_key_tags *) + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_VLANID, + f->mask); + + if (mask->vlan_id) { + if (mask->vlan_id == VLAN_VID_MASK) { + fltr->flags |= ICE_TC_FLWR_FIELD_VLAN; + fltr->flags &= ~ICE_TC_FLWR_FIELD_ETH_TYPE_ID; + } else { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Bad VLAN mask"); + return -EINVAL; + } + } + headers->vlan_hdr.vlan_id = + cpu_to_be16(key->vlan_id & VLAN_VID_MASK); +#ifdef HAVE_FLOW_DISSECTOR_VLAN_PRIO + if (mask->vlan_priority) + headers->vlan_hdr.vlan_prio = key->vlan_priority; +#endif + } +#else /* !HAVE_TC_FLOWER_VLAN_IN_TAGS */ + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_VLAN) || + is_vlan_dev(filter_dev)) { + struct flow_dissector_key_vlan mask; + struct flow_dissector_key_vlan key; + struct flow_match_vlan match; + + if (is_vlan_dev(filter_dev)) { + match.key = &key; + match.key->vlan_id = vlan_dev_vlan_id(filter_dev); + match.key->vlan_priority = 0; + match.mask = &mask; + memset(match.mask, 0xff, sizeof(*match.mask)); + match.mask->vlan_priority = 0; + } else { + flow_rule_match_vlan(rule, &match); + } + + if (match.mask->vlan_id) { + if (match.mask->vlan_id == VLAN_VID_MASK) { + fltr->flags |= ICE_TC_FLWR_FIELD_VLAN; + fltr->flags &= ~ICE_TC_FLWR_FIELD_ETH_TYPE_ID; + } else { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Bad VLAN mask"); + return -EINVAL; + } + } + + headers->vlan_hdr.vlan_id = + cpu_to_be16(match.key->vlan_id & VLAN_VID_MASK); +#ifdef HAVE_FLOW_DISSECTOR_VLAN_PRIO + if (match.mask->vlan_priority) + headers->vlan_hdr.vlan_prio = match.key->vlan_priority; +#endif + } +#endif /* HAVE_TC_FLOWER_VLAN_IN_TAGS */ + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL)) { + struct flow_match_control match; + + 
flow_rule_match_control(rule, &match); + + addr_type = match.key->addr_type; + } + + if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + struct flow_match_ipv4_addrs match; + + flow_rule_match_ipv4_addrs(rule, &match); + if (ice_tc_set_ipv4(&match, fltr, headers, false)) + return -EINVAL; + } + + if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + struct flow_match_ipv6_addrs match; + + flow_rule_match_ipv6_addrs(rule, &match); + if (ice_tc_set_ipv6(&match, fltr, headers, false)) + return -EINVAL; + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_PORTS)) { + struct flow_match_ports match; + + flow_rule_match_ports(rule, &match); + if (ice_tc_set_port(match, fltr, headers, false)) + return -EINVAL; + switch (headers->l3_key.ip_proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + break; + default: + NL_SET_ERR_MSG_MOD(fltr->extack, + "Only UDP and TCP transport are supported"); + return -EINVAL; + } + } + return 0; +} + +/** + * ice_add_remove_tc_flower_dflt_fltr - add or remove default filter + * @vsi: Pointer to VSI + * @tc_fltr: Pointer to TC flower filter structure + * @add: true if filter is being added. + * + * Add or remove default filter using default recipes to add MAC + * or VLAN or MAC-VLAN filters. + */ +static int +ice_add_remove_tc_flower_dflt_fltr(struct ice_vsi *vsi, + struct ice_tc_flower_fltr *tc_fltr, bool add) +{ + struct ice_tc_flower_lyr_2_4_hdrs *headers = &tc_fltr->outer_headers; + struct ice_vsi_vlan_ops *vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); + enum ice_sw_fwd_act_type act = tc_fltr->action.fltr_act; + u16 vlan_id = be16_to_cpu(headers->vlan_hdr.vlan_id); + const u8 *dst_mac = headers->l2_key.dst_mac; + int err; + + switch (tc_fltr->flags) { + case ICE_TC_FLWR_FLTR_FLAGS_DST_MAC: + if (add) { + err = ice_fltr_add_mac(vsi, dst_mac, act); + if (err) + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Could not add MAC filters"); + } else { + err = ice_fltr_remove_mac(vsi, dst_mac, act); + if (err) + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Could not remove MAC filters"); + } + break; + case ICE_TC_FLWR_FLTR_FLAGS_VLAN: + if (add) { + struct ice_vlan vlan = + ICE_VLAN(ETH_P_8021Q, vlan_id, 0, act); + err = vlan_ops->add_vlan(vsi, &vlan); + if (err) + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Could not add VLAN filters"); + } else { + struct ice_vlan vlan = + ICE_VLAN(ETH_P_8021Q, vlan_id, 0, act); + err = vlan_ops->del_vlan(vsi, &vlan); + if (err) + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Could not delete VLAN filters"); + } + break; + case ICE_TC_FLWR_FLTR_FLAGS_DST_MAC_VLAN: + if (add) { + err = ice_fltr_add_mac_vlan(vsi, dst_mac, vlan_id, act); + if (err) + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Could not add MAC VLAN filters"); + } else { + err = ice_fltr_remove_mac_vlan(vsi, dst_mac, vlan_id, + act); + if (err) + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Could not remove MAC VLAN filters"); + } + break; + default: + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Not a default filter type"); + err = -EOPNOTSUPP; + break; + } + return err; +} + +/** + * ice_add_switch_fltr - Add TC flower filters + * @vsi: Pointer to VSI + * @fltr: Pointer to struct ice_tc_flower_fltr + * + * Add filter in HW switch block + */ +static int +ice_add_switch_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) +{ + if (ice_is_eswitch_mode_switchdev(vsi->back)) + return ice_eswitch_add_tc_fltr(vsi, fltr); + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + if (fltr->action.fltr_act == ICE_FWD_TO_QGRP) + return -EOPNOTSUPP; +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + if (fltr->flags == 
ICE_TC_FLWR_FLTR_FLAGS_DST_MAC || + fltr->flags == ICE_TC_FLWR_FLTR_FLAGS_VLAN || + fltr->flags == ICE_TC_FLWR_FLTR_FLAGS_DST_MAC_VLAN) + return ice_add_remove_tc_flower_dflt_fltr(vsi, fltr, true); +#ifdef HAVE_TC_SETUP_CLSFLOWER + return ice_add_tc_flower_adv_fltr(vsi, fltr); +#else + return -EOPNOTSUPP; +#endif /* HAVE_TC_SETUP_CLSFLOWER */ +} + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +/** + * ice_handle_tclass_action - Support directing to a traffic class + * @vsi: Pointer to VSI + * @cls_flower: Pointer to TC flower offload structure + * @fltr: Pointer to TC flower filter structure + * + * Support directing traffic to a traffic class + */ +static int +ice_handle_tclass_action(struct ice_vsi *vsi, + struct flow_cls_offload *cls_flower, + struct ice_tc_flower_fltr *fltr) +{ + int tc = tc_classid_to_hwtc(vsi->netdev, cls_flower->classid); + struct ice_vsi *main_vsi; + + if (tc < 0) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unable to add filter because specified destination is invalid"); + return -EINVAL; + } + if (!tc) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unable to add filter because of invalid destination"); + return -EINVAL; + } + + if (!(vsi->all_enatc & BIT(tc))) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unable to add filter because of non-existence destination"); + return -EINVAL; + } + + /* Redirect to a TC class or Queue Group */ + main_vsi = ice_get_main_vsi(vsi->back); + if (!main_vsi || !main_vsi->netdev) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unable to add filter because of invalid netdevice"); + return -EINVAL; + } + + if ((fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID) && + (fltr->flags & (ICE_TC_FLWR_FIELD_DST_MAC | + ICE_TC_FLWR_FIELD_SRC_MAC))) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unable to add filter because filter using tunnel key and inner MAC is unsupported combination"); + return -EOPNOTSUPP; + } + + /* For ADQ, filter must include dest MAC address, otherwise unwanted + * packets with unrelated MAC address get delivered to ADQ VSIs as long + * as remaining filter criteria is satisfied such as dest IP address + * and dest/src L4 port. Following code is trying to handle: + * 1. For non-tunnel, if user specify MAC addresses, use them (means + * this code won't do anything + * 2. For non-tunnel, if user didn't specify MAC address, add implicit + * dest MAC to be lower netdev's active unicast MAC address + * 3. For tunnel, as of now tc-filter thru flower classifier doesn't + * have provision for user to specify outer DMAC, hence driver to + * implicitly add outer dest MAC to be lower netdev's active unicast + * MAC address. + */ + if (fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID) { + if (!(fltr->flags & ICE_TC_FLWR_FIELD_ENC_DST_MAC)) { + ether_addr_copy(fltr->outer_headers.l2_key.dst_mac, + main_vsi->netdev->dev_addr); + eth_broadcast_addr(fltr->outer_headers.l2_mask.dst_mac); + fltr->flags |= ICE_TC_FLWR_FIELD_ENC_DST_MAC; + } + } else if (!(fltr->flags & ICE_TC_FLWR_FIELD_DST_MAC)) { + ether_addr_copy(fltr->outer_headers.l2_key.dst_mac, + main_vsi->netdev->dev_addr); + eth_broadcast_addr(fltr->outer_headers.l2_mask.dst_mac); + fltr->flags |= ICE_TC_FLWR_FIELD_DST_MAC; + } + + /* validate specified dest MAC address, make sure either it belongs to + * lower netdev or any of non-offloaded MACVLAN. Non-offloaded MACVLANs + * MAC address are added as unicast MAC filter destined to main VSI. 
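The requirement spelled out above, that every ADQ filter carries an exact destination MAC supplied implicitly from the lower netdev when the user did not match on one, reduces to a small fixup. Below is a standalone sketch of that fixup, not driver code; pin_dst_mac() and struct l2_match are illustrative names:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ETH_ALEN 6

/* Illustrative stand-in for the key/mask pair kept in outer_headers */
struct l2_match {
	uint8_t dst_mac[ETH_ALEN];
	uint8_t dst_mask[ETH_ALEN];
};

/* If the user did not match on a destination MAC, pin the filter to the
 * port's own unicast address with an all-ones (exact) mask so that stray
 * traffic cannot be steered into the ADQ channel.
 */
static void pin_dst_mac(struct l2_match *m, const uint8_t *port_mac,
			int user_gave_dmac)
{
	if (user_gave_dmac)
		return;
	memcpy(m->dst_mac, port_mac, ETH_ALEN);
	memset(m->dst_mask, 0xff, ETH_ALEN);
}

int main(void)
{
	const uint8_t port_mac[ETH_ALEN] = { 0x02, 0x00, 0x00, 0xaa, 0xbb, 0xcc };
	struct l2_match m = { { 0 }, { 0 } };

	pin_dst_mac(&m, port_mac, 0);
	printf("key %02x:..:%02x mask %02x\n",
	       m.dst_mac[0], m.dst_mac[5], m.dst_mask[0]);
	return 0;
}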
+ */ + if (!ice_mac_fltr_exist(&main_vsi->back->hw, + fltr->outer_headers.l2_key.dst_mac, + main_vsi->idx)) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unable to add filter because legacy MAC filter for specified destination doesn't exist"); + return -EINVAL; + } + + /* Make sure VLAN is already added to main VSI, before allowing ADQ to + * add a VLAN based filter such as MAC + VLAN + L4 port. + */ + if (fltr->flags & ICE_TC_FLWR_FIELD_VLAN) { + u16 vlan_id = be16_to_cpu(fltr->outer_headers.vlan_hdr.vlan_id); + + if (!ice_vlan_fltr_exist(&main_vsi->back->hw, vlan_id, + main_vsi->idx)) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unable to add filter because legacy VLAN filter for specified destination doesn't exist"); + return -EINVAL; + } + } + fltr->action.fltr_act = ICE_FWD_TO_VSI; + fltr->action.tc_class = tc; + + return 0; +} +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + +/** + * ice_parse_tc_flower_actions - Parse the actions for a TC filter + * @vsi: Pointer to VSI + * @cls_flower: Pointer to TC flower offload structure + * @fltr: Pointer to TC flower filter structure + * + * Parse the actions for a TC filter + */ +static int +ice_parse_tc_flower_actions(struct ice_vsi *vsi, + struct flow_cls_offload *cls_flower, + struct ice_tc_flower_fltr *fltr) +{ +#ifdef HAVE_TC_FLOW_RULE_INFRASTRUCTURE + struct flow_rule *rule = flow_cls_offload_flow_rule(cls_flower); + struct flow_action *flow_action = &rule->action; + struct flow_action_entry *act; + int i; +#else + struct tcf_exts *exts = cls_flower->exts; + struct tc_action *tc_act; +#if defined(HAVE_TCF_EXTS_FOR_EACH_ACTION) + int i; +#else + struct tc_action *temp; + LIST_HEAD(tc_actions); +#endif +#endif /* HAVE_TC_FLOW_RULE_INFRASTRUCTURE */ + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + if (cls_flower->classid) + return ice_handle_tclass_action(vsi, cls_flower, fltr); +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + +#ifdef HAVE_TC_FLOW_RULE_INFRASTRUCTURE + if (!flow_action_has_entries(flow_action)) +#elif defined(HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV) + if (!tcf_exts_has_actions(exts)) +#else + if (tc_no_actions(exts)) +#endif + return -EINVAL; + +#ifdef HAVE_TC_FLOW_RULE_INFRASTRUCTURE + flow_action_for_each(i, act, flow_action) { +#elif defined(HAVE_TCF_EXTS_FOR_EACH_ACTION) + tcf_exts_for_each_action(i, tc_act, exts) { +#elif defined(HAVE_TCF_EXTS_TO_LIST) + tcf_exts_to_list(exts, &tc_actions); + + list_for_each_entry_safe(tc_act, temp, &tc_actions, list) { +#else + list_for_each_entry_safe(tc_act, temp, &(exts)->actions, list) { +#endif /* HAVE_TCF_EXTS_TO_LIST */ +#ifdef HAVE_TC_FLOW_RULE_INFRASTRUCTURE + if (ice_is_eswitch_mode_switchdev(vsi->back)) { + int err = ice_eswitch_tc_parse_action(fltr, act); + + if (err) + return err; + } +#else + if (ice_is_eswitch_mode_switchdev(vsi->back)) + return -EINVAL; +#endif /* HAVE_TC_FLOW_RULE_INFRASTRUCTURE */ + /* Allow only one rule per filter */ + + /* Drop action */ +#ifdef HAVE_TC_FLOW_RULE_INFRASTRUCTURE + if (act->id == FLOW_ACTION_DROP) { +#else + if (is_tcf_gact_shot(tc_act)) { +#endif + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unsupported action DROP"); + return -EINVAL; + } + fltr->action.fltr_act = ICE_FWD_TO_VSI; + } + return 0; +} + +/** + * ice_del_tc_fltr - deletes a filter from HW table + * @vsi: Pointer to VSI + * @fltr: Pointer to struct ice_tc_flower_fltr + * + * This function deletes a filter from HW table and manages book-keeping + */ +static int ice_del_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) +{ + struct ice_pf *pf = vsi->back; + int err; + + if 
(fltr->flags == ICE_TC_FLWR_FLTR_FLAGS_DST_MAC || + fltr->flags == ICE_TC_FLWR_FLTR_FLAGS_VLAN || + fltr->flags == ICE_TC_FLWR_FLTR_FLAGS_DST_MAC_VLAN) { + err = ice_add_remove_tc_flower_dflt_fltr(vsi, fltr, false); + } else { + struct ice_rule_query_data rule_rem; + + rule_rem.rid = fltr->rid; + rule_rem.rule_id = fltr->rule_id; + rule_rem.vsi_handle = fltr->dest_id; + err = ice_rem_adv_rule_by_id(&pf->hw, &rule_rem); + } + + if (err) { + if (err == ICE_ERR_DOES_NOT_EXIST) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "filter does not exist\n"); + return -ENOENT; + } + NL_SET_ERR_MSG_MOD(fltr->extack, + "Failed to delete TC flower filter"); + return -EIO; + } + + /* update advanced switch filter count for destination + * VSI if filter destination was VSI + */ + if (fltr->dest_vsi) { + if (fltr->dest_vsi->type == ICE_VSI_CHNL) { + struct ice_channel *ch = fltr->dest_vsi->ch; + + fltr->dest_vsi->num_chnl_fltr--; + + /* reset filter type for channel if channel filter + * count reaches zero + */ + if (!fltr->dest_vsi->num_chnl_fltr && ch) + ch->fltr_type = ICE_CHNL_FLTR_TYPE_INVALID; + + /* keeps track of channel filters for PF VSI */ + if (vsi->type == ICE_VSI_PF && + (fltr->flags & (ICE_TC_FLWR_FIELD_DST_MAC | + ICE_TC_FLWR_FIELD_ENC_DST_MAC))) + pf->num_dmac_chnl_fltrs--; + } + } + return 0; +} + +/** + * ice_add_tc_fltr - adds a TC flower filter + * @netdev: Pointer to netdev + * @vsi: Pointer to VSI + * @f: Pointer to flower offload structure + * @__fltr: Pointer to struct ice_tc_flower_fltr + * + * This function parses tc-flower input fields, parses action, + * and adds a filter. + */ +#ifdef HAVE_TC_INDIR_BLOCK +static int +ice_add_tc_fltr(struct net_device *netdev, struct ice_vsi *vsi, + struct flow_cls_offload *f, + struct ice_tc_flower_fltr **__fltr) +#else +static int +ice_add_tc_fltr(struct net_device *netdev, struct ice_vsi *vsi, + struct tc_cls_flower_offload *f, + struct ice_tc_flower_fltr **__fltr) +#endif /* HAVE_TC_INDIR_BLOCK */ +{ + struct ice_tc_flower_fltr *fltr; + int err; + + /* by default, set output to be INVALID */ + *__fltr = NULL; + + fltr = kzalloc(sizeof(*fltr), GFP_KERNEL); + if (!fltr) + return -ENOMEM; + + fltr->cookie = f->cookie; +#ifdef HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK + fltr->extack = f->common.extack; +#endif + fltr->src_vsi = vsi; + INIT_HLIST_NODE(&fltr->tc_flower_node); + + err = ice_parse_cls_flower(netdev, vsi, f, fltr); + if (err < 0) + goto err; + + err = ice_parse_tc_flower_actions(vsi, f, fltr); + if (err < 0) + goto err; + + err = ice_add_switch_fltr(vsi, fltr); + if (err < 0) + goto err; + + /* return the newly created filter */ + *__fltr = fltr; + + return 0; +err: + kfree(fltr); + return err; +} + +/** + * ice_find_tc_flower_fltr - Find the TC flower filter in the list + * @pf: Pointer to PF + * @cookie: filter specific cookie + */ +static struct ice_tc_flower_fltr * +ice_find_tc_flower_fltr(struct ice_pf *pf, unsigned long cookie) +{ + struct ice_tc_flower_fltr *fltr; + + hlist_for_each_entry(fltr, &pf->tc_flower_fltr_list, tc_flower_node) + if (cookie == fltr->cookie) + return fltr; + + return NULL; +} + +/** + * ice_add_cls_flower - add TC flower filters + * @netdev: Pointer to filter device + * @vsi: Pointer to VSI + * @cls_flower: Pointer to flower offload structure + */ +int +#ifdef HAVE_TC_INDIR_BLOCK +ice_add_cls_flower(struct net_device *netdev, struct ice_vsi *vsi, + struct flow_cls_offload *cls_flower) +#else +ice_add_cls_flower(struct net_device __always_unused *netdev, + struct ice_vsi *vsi, + struct tc_cls_flower_offload 
*cls_flower) +#endif /* HAVE_TC_INDIR_BLOCK */ +{ +#ifdef HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK + struct netlink_ext_ack *extack = cls_flower->common.extack; +#endif /* HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK */ + struct net_device *vsi_netdev = vsi->netdev; + struct ice_tc_flower_fltr *fltr; + struct ice_pf *pf = vsi->back; + int err = 0; + + if (ice_is_reset_in_progress(pf->state)) + return -EBUSY; + if (test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags)) + return -EINVAL; + +#ifdef HAVE_TC_FLOW_INDIR_DEV + if ((ice_tc_tun_get_type(netdev, NULL) == TNL_LAST) && + ice_is_port_repr_netdev(netdev)) + vsi_netdev = netdev; +#else + if (ice_is_port_repr_netdev(netdev)) + vsi_netdev = netdev; +#endif /* HAVE_TC_FLOW_INDIR_DEV */ + + if (!(vsi_netdev->features & NETIF_F_HW_TC) && + !test_bit(ICE_FLAG_CLS_FLOWER, pf->flags)) { +#ifdef HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK +#ifdef HAVE_TC_INDIR_BLOCK + /* Based on TC indirect notifications from kernel, all ice + * devices get an instance of rule from higher level device. + * Avoid triggering explicit error in this case. + */ + if (netdev == vsi_netdev) + NL_SET_ERR_MSG_MOD(extack, + "can't apply TC flower filters, turn ON hw-tc-offload and try again"); +#else + NL_SET_ERR_MSG_MOD(extack, + "can't apply TC flower filters, turn ON hw-tc-offload and try again"); +#endif /* HAVE_TC_INDIR_BLOCK */ +#else /* !HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK */ + netdev_err(vsi_netdev, + "can't apply TC flower filters, turn ON hw-tc-offload and try again\n"); +#endif /* HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK */ + return -EINVAL; + } + + /* avoid duplicate entries, if exists - return error */ + fltr = ice_find_tc_flower_fltr(pf, cls_flower->cookie); + if (fltr) { +#ifdef HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "filter cookie already exists, ignoring"); +#else + netdev_warn(vsi_netdev, + "filter cookie %lx already exists, ignoring\n", + fltr->cookie); +#endif /* HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK */ + return -EEXIST; + } + + /* prep and add tc-flower filter in HW */ + err = ice_add_tc_fltr(netdev, vsi, cls_flower, &fltr); + if (err) + return err; + + /* add filter into an ordered list */ + hlist_add_head(&fltr->tc_flower_node, &pf->tc_flower_fltr_list); + return 0; +} + +/** + * ice_del_cls_flower - delete TC flower filters + * @vsi: Pointer to VSI + * @cls_flower: Pointer to struct flow_cls_offload + */ +int +ice_del_cls_flower(struct ice_vsi *vsi, struct flow_cls_offload *cls_flower) +{ + struct ice_tc_flower_fltr *fltr; + struct ice_pf *pf = vsi->back; + int err; + + /* find filter */ + fltr = ice_find_tc_flower_fltr(pf, cls_flower->cookie); + if (!fltr) { + /* when egress qdisc is deleted, driver deletes all channel + * filters so that there are no stale filters left in + * HW (as per design) because deleting egress qdisc means, + * deleting all channel VSIs, hence no reason to keep filters + * destined to those channel VSIs. But software (OS) still + * sees those filters being offloaded in HW. In this situation + * user can try to delete those filters or OS will try to + * delete them one by one when ingress qdisc is deleted from + * given interace (ethX) and driver won't find those filters in + * its list of filters, hence don't return error. 
Return the + * error only when there are still active channel(s) and can't + * find requested filter and/or failed to delet the filter, + * otherwise return success + */ + /* means no channels are configured or channels are deleted and + * channel filter list is empty + */ + if (!test_bit(ICE_FLAG_TC_MQPRIO, pf->flags) && + hlist_empty(&pf->tc_flower_fltr_list)) + return 0; + +#ifdef HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK + NL_SET_ERR_MSG_MOD(cls_flower->common.extack, + "failed to delete TC flower filter because unable to find it"); +#else + dev_err(ice_pf_to_dev(pf), + "failed to delete TC flower filter because unable to find it\n"); +#endif + return -EINVAL; + } + +#ifdef HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK + fltr->extack = cls_flower->common.extack; +#endif + /* delete filter from HW */ + err = ice_del_tc_fltr(vsi, fltr); + if (err) + return err; + + /* delete filter from an ordered list */ + hlist_del(&fltr->tc_flower_node); + + /* free the filter node */ + kfree(fltr); + + return 0; +} + +/** + * ice_replay_tc_fltrs - replay tc filters + * @pf: pointer to PF struct + */ +void ice_replay_tc_fltrs(struct ice_pf *pf) +{ + struct ice_tc_flower_fltr *fltr; + struct hlist_node *node; + + hlist_for_each_entry_safe(fltr, node, + &pf->tc_flower_fltr_list, + tc_flower_node) { + fltr->extack = NULL; + ice_add_switch_fltr(fltr->src_vsi, fltr); + } +} +#endif /* HAVE_TC_SETUP_CLSFLOWER */ diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.h b/drivers/net/ethernet/intel/ice/ice_tc_lib.h new file mode 100644 index 000000000000..6a6f4566e1fb --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.h @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_TC_LIB_H_ +#define _ICE_TC_LIB_H_ + +#ifdef HAVE_TC_SETUP_CLSFLOWER +#define ICE_TC_FLWR_FIELD_DST_MAC BIT(0) +#define ICE_TC_FLWR_FIELD_SRC_MAC BIT(1) +#define ICE_TC_FLWR_FIELD_VLAN BIT(2) +#define ICE_TC_FLWR_FIELD_DEST_IPV4 BIT(3) +#define ICE_TC_FLWR_FIELD_SRC_IPV4 BIT(4) +#define ICE_TC_FLWR_FIELD_DEST_IPV6 BIT(5) +#define ICE_TC_FLWR_FIELD_SRC_IPV6 BIT(6) +#define ICE_TC_FLWR_FIELD_DEST_L4_PORT BIT(7) +#define ICE_TC_FLWR_FIELD_SRC_L4_PORT BIT(8) +#define ICE_TC_FLWR_FIELD_TENANT_ID BIT(9) +#define ICE_TC_FLWR_FIELD_ENC_DEST_IPV4 BIT(10) +#define ICE_TC_FLWR_FIELD_ENC_SRC_IPV4 BIT(11) +#define ICE_TC_FLWR_FIELD_ENC_DEST_IPV6 BIT(12) +#define ICE_TC_FLWR_FIELD_ENC_SRC_IPV6 BIT(13) +#define ICE_TC_FLWR_FIELD_ENC_DEST_L4_PORT BIT(14) +#define ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT BIT(15) +#define ICE_TC_FLWR_FIELD_ENC_DST_MAC BIT(16) +#define ICE_TC_FLWR_FIELD_ETH_TYPE_ID BIT(17) + +/* TC flower supported filter match */ +#define ICE_TC_FLWR_FLTR_FLAGS_DST_MAC ICE_TC_FLWR_FIELD_DST_MAC +#define ICE_TC_FLWR_FLTR_FLAGS_VLAN ICE_TC_FLWR_FIELD_VLAN +#define ICE_TC_FLWR_FLTR_FLAGS_DST_MAC_VLAN (ICE_TC_FLWR_FIELD_DST_MAC | \ + ICE_TC_FLWR_FIELD_VLAN) +#define ICE_TC_FLWR_FLTR_FLAGS_IPV4_DST_PORT (ICE_TC_FLWR_FIELD_DEST_IPV4 | \ + ICE_TC_FLWR_FIELD_DEST_L4_PORT) +#define ICE_TC_FLWR_FLTR_FLAGS_IPV4_SRC_PORT (ICE_TC_FLWR_FIELD_DEST_IPV4 | \ + ICE_TC_FLWR_FIELD_SRC_L4_PORT) +#define ICE_TC_FLWR_FLTR_FLAGS_IPV6_DST_PORT (ICE_TC_FLWR_FIELD_DEST_IPV6 | \ + ICE_TC_FLWR_FIELD_DEST_L4_PORT) +#define ICE_TC_FLWR_FLTR_FLAGS_IPV6_SRC_PORT (ICE_TC_FLWR_FIELD_DEST_IPV6 | \ + ICE_TC_FLWR_FIELD_SRC_L4_PORT) + +#define ICE_TC_FLOWER_MASK_32 0xFFFFFFFF +#define ICE_TC_FLOWER_MASK_16 0xFFFF +#define ICE_TC_FLOWER_VNI_MAX 0xFFFFFFU + +#ifdef HAVE_TC_INDIR_BLOCK +struct ice_indr_block_priv { + struct 
net_device *netdev; + struct ice_netdev_priv *np; + struct list_head list; +}; +#endif /* HAVE_TC_INDIR_BLOCK */ + +struct ice_tc_flower_action { + u32 tc_class; + enum ice_sw_fwd_act_type fltr_act; +}; + +struct ice_tc_vlan_hdr { + __be16 vlan_id; /* Only last 12 bits valid */ +#ifdef HAVE_FLOW_DISSECTOR_VLAN_PRIO + u16 vlan_prio; /* Only last 3 bits valid (valid values: 0..7) */ +#endif +}; + +struct ice_tc_l2_hdr { + u8 dst_mac[ETH_ALEN]; + u8 src_mac[ETH_ALEN]; + __be16 n_proto; /* Ethernet Protocol */ +}; + +struct ice_tc_l3_hdr { + u8 ip_proto; /* IPPROTO value */ + union { + struct { + struct in_addr dst_ip; + struct in_addr src_ip; + } v4; + struct { + struct in6_addr dst_ip6; + struct in6_addr src_ip6; + } v6; + } ip; +#define dst_ipv6 ip.v6.dst_ip6.s6_addr32 +#define dst_ipv6_addr ip.v6.dst_ip6.s6_addr +#define src_ipv6 ip.v6.src_ip6.s6_addr32 +#define src_ipv6_addr ip.v6.src_ip6.s6_addr +#define dst_ipv4 ip.v4.dst_ip.s_addr +#define src_ipv4 ip.v4.src_ip.s_addr + + u8 tos; + u8 ttl; +}; + +struct ice_tc_l4_hdr { + __be16 dst_port; + __be16 src_port; +}; + +struct ice_tc_flower_lyr_2_4_hdrs { + /* L2 layer fields with their mask */ + struct ice_tc_l2_hdr l2_key; + struct ice_tc_l2_hdr l2_mask; + struct ice_tc_vlan_hdr vlan_hdr; + /* L3 (IPv4[6]) layer fields with their mask */ + struct ice_tc_l3_hdr l3_key; + struct ice_tc_l3_hdr l3_mask; + + /* L4 layer fields with their mask */ + struct ice_tc_l4_hdr l4_key; + struct ice_tc_l4_hdr l4_mask; +}; + +enum ice_eswitch_fltr_direction { + ICE_ESWITCH_FLTR_INGRESS, + ICE_ESWITCH_FLTR_EGRESS, +}; + +struct ice_tc_flower_fltr { + struct hlist_node tc_flower_node; + + /* cookie becomes filter_rule_id if rule is added successfully */ + unsigned long cookie; + + /* add_adv_rule returns information like recipe ID, rule_id. Store + * those values since they are needed to remove advanced rule + */ + u16 rid; + u16 rule_id; + /* this could be queue/vsi_idx (sw handle)/queue_group, depending upon + * destination type + */ + u16 dest_id; + /* if dest_id is vsi_idx, then need to store destination VSI ptr */ + struct ice_vsi *dest_vsi; + /* direction of fltr for eswitch use case */ + enum ice_eswitch_fltr_direction direction; + + /* Parsed TC flower configuration params */ + struct ice_tc_flower_lyr_2_4_hdrs outer_headers; + struct ice_tc_flower_lyr_2_4_hdrs inner_headers; + struct ice_vsi *src_vsi; + __be32 tenant_id; + u32 flags; +#define ICE_TC_FLWR_TNL_TYPE_NONE 0xff + u8 tunnel_type; + struct ice_tc_flower_action action; + + /* cache ptr which is used wherever needed to communicate netlink + * messages + */ + struct netlink_ext_ack *extack; +}; + +/** + * ice_is_chnl_fltr - is this a valid channel filter + * @f: Pointer to tc-flower filter + * + * Criteria to determine of given filter is valid channel filter + * or not is based on its "destination". 
If destination is hw_tc (aka tc_class) + * and it is non-zero, then it is valid channel (aka ADQ) filter + */ +static inline bool ice_is_chnl_fltr(struct ice_tc_flower_fltr *f) +{ + return !!f->action.tc_class; +} + +/** + * ice_chnl_dmac_fltr_cnt - DMAC based CHNL filter count + * @pf: Pointer to PF + */ +static inline int ice_chnl_dmac_fltr_cnt(struct ice_pf *pf) +{ + return pf->num_dmac_chnl_fltrs; +} +int +ice_add_tc_flower_adv_fltr(struct ice_vsi *vsi, + struct ice_tc_flower_fltr *tc_fltr); +#if defined(HAVE_TC_FLOWER_ENC) && defined(HAVE_TC_INDIR_BLOCK) +int +ice_tc_tun_get_type(struct net_device *tunnel_dev, + struct flow_rule *rule); +#endif /* HAVE_TC_FLOWER_ENC && HAVE_TC_INDIR_BLOCK */ +int +#ifdef HAVE_TC_INDIR_BLOCK +ice_add_cls_flower(struct net_device *netdev, struct ice_vsi *vsi, + struct flow_cls_offload *cls_flower); +#else +ice_add_cls_flower(struct net_device __always_unused *netdev, + struct ice_vsi *vsi, + struct tc_cls_flower_offload *cls_flower); +#endif /* HAVE_TC_INDIR_BLOCK */ +int +ice_del_cls_flower(struct ice_vsi *vsi, struct flow_cls_offload *cls_flower); +void ice_replay_tc_fltrs(struct ice_pf *pf); +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + +#endif /* _ICE_TC_LIB_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_trace.h b/drivers/net/ethernet/intel/ice/ice_trace.h new file mode 100644 index 000000000000..3e50808fe490 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_trace.h @@ -0,0 +1,276 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#if !IS_ENABLED(CONFIG_TRACEPOINTS) || defined(__CHECKER__) +#if !defined(_ICE_TRACE_H_) +#define _ICE_TRACE_H_ +/* If the Linux kernel tracepoints are not available then the ice_trace* + * macros become nops. + */ + +#define ice_trace(trace_name, args...) +#define ice_trace_enabled(trace_name) (0) +#endif /* !defined(_ICE_TRACE_H_) */ +#else /* CONFIG_TRACEPOINTS */ +/* + * Modeled on trace-events-sample.h + */ + +/* + * The trace subsystem name for ice will be "ice". + * + * This file is named ice_trace.h. + * + * Since this include file's name is different from the trace + * subsystem name, we'll have to define TRACE_INCLUDE_FILE at the end + * of this file. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ice + +/* + * See trace-events-sample.h for a detailed description of why this + * guard clause is different from most normal include files. + */ +#if !defined(_ICE_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) +#define _ICE_TRACE_H_ + +#include "ice_txrx.h" +#include + +/** + * ice_trace() enables trace points + * like: + * + * trace_ice_example(args...) + * + * ... as: + * + * ice_trace(example, args...) + * + * ... to resolve to the PF version of the tracepoint without + * ifdefs, and to allow tracepoints to be disabled entirely at build + * time. + * + * Trace point should always be referred to in the driver via this + * macro. + * + * Similarly, ice_trace_enabled(trace_name) wraps references to + * trace_ice__enabled() functions. + * @trace_name: name of tracepoint + */ +#define _ICE_TRACE_NAME(trace_name) (trace_##ice##_##trace_name) +#define ICE_TRACE_NAME(trace_name) _ICE_TRACE_NAME(trace_name) + +#define ice_trace(trace_name, args...) ICE_TRACE_NAME(trace_name)(args) + +#define ice_trace_enabled(trace_name) ICE_TRACE_NAME(trace_name##_enabled)() + +/* + * This is for events common to PF. Corresponding versions will be named + * trace_ice_*. The ice_trace() macro above will select the right trace point + * name for the driver. 
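+ * For example, the Tx clean path added later in this patch calls + * ice_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf), which resolves to + * trace_ice_clean_tx_irq(tx_ring, tx_desc, tx_buf); similarly, + * ice_trace_enabled(clean_tx_irq) resolves to + * trace_ice_clean_tx_irq_enabled().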
+ */ + +/* Begin tracepoints */ + +/* Global tracepoints */ +DECLARE_EVENT_CLASS(ice_print_msg, + TP_PROTO(char *msg), + + TP_ARGS(msg), + + TP_STRUCT__entry(__string(msg, msg)), + + TP_fast_assign(__assign_str(msg, msg);), + + TP_printk("%s", __get_str(msg)) +); + +#define DEFINE_PRINT_MSG_EVENT(name) \ +DEFINE_EVENT(ice_print_msg, name, \ + TP_PROTO(char *msg), \ + TP_ARGS(msg)) + +DEFINE_PRINT_MSG_EVENT(ice_print_err); +DEFINE_PRINT_MSG_EVENT(ice_print_warn); +DEFINE_PRINT_MSG_EVENT(ice_print_adminq_msg); +DEFINE_PRINT_MSG_EVENT(ice_print_adminq_desc); +DEFINE_PRINT_MSG_EVENT(ice_print_netdev_err); +DEFINE_PRINT_MSG_EVENT(ice_print_netdev_warn); +DEFINE_PRINT_MSG_EVENT(ice_print_netdev_info); +DEFINE_PRINT_MSG_EVENT(ice_print_peer_err); + +/* Events related to DIM, q_vectors and ring containers */ +DECLARE_EVENT_CLASS(ice_rx_dim_template, + TP_PROTO(struct ice_q_vector *q_vector, struct dim *dim), + TP_ARGS(q_vector, dim), + TP_STRUCT__entry(__field(struct ice_q_vector *, q_vector) + __field(struct dim *, dim) + __string(devname, q_vector->rx.ring->netdev->name)), + + TP_fast_assign(__entry->q_vector = q_vector; + __entry->dim = dim; + __assign_str(devname, q_vector->rx.ring->netdev->name);), + + TP_printk("netdev: %s Rx-Q: %d dim-state: %d dim-profile: %d dim-tune: %d dim-st-right: %d dim-st-left: %d dim-tired: %d", + __get_str(devname), + __entry->q_vector->rx.ring->q_index, + __entry->dim->state, + __entry->dim->profile_ix, + __entry->dim->tune_state, + __entry->dim->steps_right, + __entry->dim->steps_left, + __entry->dim->tired) +); + +DEFINE_EVENT(ice_rx_dim_template, ice_rx_dim_work, + TP_PROTO(struct ice_q_vector *q_vector, struct dim *dim), + TP_ARGS(q_vector, dim) +); + +DECLARE_EVENT_CLASS(ice_tx_dim_template, + TP_PROTO(struct ice_q_vector *q_vector, struct dim *dim), + TP_ARGS(q_vector, dim), + TP_STRUCT__entry(__field(struct ice_q_vector *, q_vector) + __field(struct dim *, dim) + __string(devname, q_vector->tx.ring->netdev->name)), + + TP_fast_assign(__entry->q_vector = q_vector; + __entry->dim = dim; + __assign_str(devname, q_vector->tx.ring->netdev->name);), + + TP_printk("netdev: %s Tx-Q: %d dim-state: %d dim-profile: %d dim-tune: %d dim-st-right: %d dim-st-left: %d dim-tired: %d", + __get_str(devname), + __entry->q_vector->rx.ring->q_index, + __entry->dim->state, + __entry->dim->profile_ix, + __entry->dim->tune_state, + __entry->dim->steps_right, + __entry->dim->steps_left, + __entry->dim->tired) +); + +DEFINE_EVENT(ice_tx_dim_template, ice_tx_dim_work, + TP_PROTO(struct ice_q_vector *q_vector, struct dim *dim), + TP_ARGS(q_vector, dim) +); + +/* Events related to a vsi & ring */ +DECLARE_EVENT_CLASS(ice_tx_template, + TP_PROTO(struct ice_ring *ring, struct ice_tx_desc *desc, + struct ice_tx_buf *buf), + + TP_ARGS(ring, desc, buf), + TP_STRUCT__entry(__field(void *, ring) + __field(void *, desc) + __field(void *, buf) + __string(devname, ring->netdev->name)), + + TP_fast_assign(__entry->ring = ring; + __entry->desc = desc; + __entry->buf = buf; + __assign_str(devname, ring->netdev->name);), + + TP_printk("netdev: %s ring: %pK desc: %pK buf %pK", __get_str(devname), + __entry->ring, __entry->desc, __entry->buf) +); + +#define DEFINE_TX_TEMPLATE_OP_EVENT(name) \ +DEFINE_EVENT(ice_tx_template, name, \ + TP_PROTO(struct ice_ring *ring, \ + struct ice_tx_desc *desc, \ + struct ice_tx_buf *buf), \ + TP_ARGS(ring, desc, buf)) + +DEFINE_TX_TEMPLATE_OP_EVENT(ice_clean_tx_irq); +DEFINE_TX_TEMPLATE_OP_EVENT(ice_clean_tx_irq_unmap); 
+DEFINE_TX_TEMPLATE_OP_EVENT(ice_clean_tx_irq_unmap_eop); + +DECLARE_EVENT_CLASS(ice_rx_template, + TP_PROTO(struct ice_ring *ring, union ice_32b_rx_flex_desc *desc), + + TP_ARGS(ring, desc), + + TP_STRUCT__entry(__field(void *, ring) + __field(void *, desc) + __string(devname, ring->netdev->name)), + + TP_fast_assign(__entry->ring = ring; + __entry->desc = desc; + __assign_str(devname, ring->netdev->name);), + + TP_printk("netdev: %s ring: %pK desc: %pK", __get_str(devname), + __entry->ring, __entry->desc) +); +DEFINE_EVENT(ice_rx_template, ice_clean_rx_irq, + TP_PROTO(struct ice_ring *ring, union ice_32b_rx_flex_desc *desc), + TP_ARGS(ring, desc) +); + +DECLARE_EVENT_CLASS(ice_rx_indicate_template, + TP_PROTO(struct ice_ring *ring, union ice_32b_rx_flex_desc *desc, + struct sk_buff *skb), + + TP_ARGS(ring, desc, skb), + + TP_STRUCT__entry(__field(void *, ring) + __field(void *, desc) + __field(void *, skb) + __string(devname, ring->netdev->name)), + + TP_fast_assign(__entry->ring = ring; + __entry->desc = desc; + __entry->skb = skb; + __assign_str(devname, ring->netdev->name);), + + TP_printk("netdev: %s ring: %pK desc: %pK skb %pK", __get_str(devname), + __entry->ring, __entry->desc, __entry->skb) +); + +DEFINE_EVENT(ice_rx_indicate_template, ice_clean_rx_irq_indicate, + TP_PROTO(struct ice_ring *ring, union ice_32b_rx_flex_desc *desc, + struct sk_buff *skb), + TP_ARGS(ring, desc, skb) +); + +DECLARE_EVENT_CLASS(ice_xmit_template, + TP_PROTO(struct ice_ring *ring, struct sk_buff *skb), + + TP_ARGS(ring, skb), + + TP_STRUCT__entry(__field(void *, ring) + __field(void *, skb) + __string(devname, ring->netdev->name)), + + TP_fast_assign(__entry->ring = ring; + __entry->skb = skb; + __assign_str(devname, ring->netdev->name);), + + TP_printk("netdev: %s skb: %pK ring: %pK", __get_str(devname), + __entry->skb, __entry->ring) +); + +#define DEFINE_XMIT_TEMPLATE_OP_EVENT(name) \ +DEFINE_EVENT(ice_xmit_template, name, \ + TP_PROTO(struct ice_ring *ring, struct sk_buff *skb), \ + TP_ARGS(ring, skb)) + +DEFINE_XMIT_TEMPLATE_OP_EVENT(ice_xmit_frame_ring); +DEFINE_XMIT_TEMPLATE_OP_EVENT(ice_xmit_frame_ring_drop); + +/* End tracepoints */ + +#endif /* _ICE_TRACE_H_ */ +/* This must be outside ifdef _ICE_TRACE_H */ + +/* This trace include file is not located in the .../include/trace + * with the kernel tracepoint definitions, because we're a loadable + * module. + */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE ice_trace +#include +#endif /* CONFIG_TRACEPOINTS */ diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 33dd103035dc..c3c2746fe977 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1,15 +1,116 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ /* The driver transmit and receive code */ #include #include +#include "ice_txrx_lib.h" +#include "ice_lib.h" #include "ice.h" #include "ice_dcb_lib.h" +#ifdef HAVE_XDP_SUPPORT +#ifdef HAVE_AF_XDP_ZC_SUPPORT +#include "ice_xsk.h" +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ +#include +#ifdef HAVE_XDP_BUFF_IN_XDP_H +#include +#else +#include +#endif /* HAVE_XDP_BUFF_IN_XDP_H */ +#endif /* HAVE_XDP_SUPPORT */ +#include "ice_eswitch.h" +#include #define ICE_RX_HDR_SIZE 256 + +#define FDIR_DESC_RXDID 0x40 +#define ICE_FDIR_CLEAN_DELAY 10 + +/** + * ice_prgm_fdir_fltr - Program a Flow Director filter + * @vsi: VSI to send dummy packet + * @fdir_desc: flow director descriptor + * @raw_packet: allocated buffer for flow director + */ +int +ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc, + u8 *raw_packet) +{ + struct ice_tx_buf *tx_buf, *first; + struct ice_fltr_desc *f_desc; + struct ice_tx_desc *tx_desc; + struct ice_ring *tx_ring; + struct device *dev; + dma_addr_t dma; + u32 td_cmd; + u16 i; + + /* VSI and Tx ring */ + if (!vsi) + return -ENOENT; + tx_ring = vsi->tx_rings[0]; + if (!tx_ring || !tx_ring->desc) + return -ENOENT; + dev = tx_ring->dev; + + /* we are using two descriptors to add/del a filter and we can wait */ + for (i = ICE_FDIR_CLEAN_DELAY; ICE_DESC_UNUSED(tx_ring) < 2; i--) { + if (!i) + return -EAGAIN; + msleep_interruptible(1); + } + + dma = dma_map_single(dev, raw_packet, ICE_FDIR_MAX_RAW_PKT_SIZE, + DMA_TO_DEVICE); + + if (dma_mapping_error(dev, dma)) + return -EINVAL; + + /* grab the next descriptor */ + i = tx_ring->next_to_use; + first = &tx_ring->tx_buf[i]; + f_desc = ICE_TX_FDIRDESC(tx_ring, i); + memcpy(f_desc, fdir_desc, sizeof(*f_desc)); + + i++; + i = (i < tx_ring->count) ? i : 0; + + tx_desc = ICE_TX_DESC(tx_ring, i); + tx_buf = &tx_ring->tx_buf[i]; + + i++; + tx_ring->next_to_use = (i < tx_ring->count) ? i : 0; + + memset(tx_buf, 0, sizeof(*tx_buf)); + dma_unmap_len_set(tx_buf, len, ICE_FDIR_MAX_RAW_PKT_SIZE); + dma_unmap_addr_set(tx_buf, dma, dma); + + tx_desc->buf_addr = cpu_to_le64(dma); + td_cmd = ICE_TXD_LAST_DESC_CMD | ICE_TX_DESC_CMD_DUMMY | + ICE_TX_DESC_CMD_RE; + + tx_buf->tx_flags = ICE_TX_FLAGS_DUMMY_PKT; + tx_buf->raw_buf = (void *)raw_packet; + + tx_desc->cmd_type_offset_bsz = + ice_build_ctob(td_cmd, 0, ICE_FDIR_MAX_RAW_PKT_SIZE, 0); + + /* Force memory write to complete before letting h/w know + * there are new descriptors to fetch. 
+ */ + wmb(); + + /* mark the data descriptor to be watched */ + first->next_to_watch = tx_desc; + + writel(tx_ring->next_to_use, tx_ring->tail); + + return 0; +} + /** * ice_unmap_and_free_tx_buf - Release a Tx buffer * @ring: the ring that owns the buffer @@ -18,13 +119,27 @@ static void ice_unmap_and_free_tx_buf(struct ice_ring *ring, struct ice_tx_buf *tx_buf) { + struct ice_vsi *vsi = ring->vsi; + if (tx_buf->skb) { - dev_kfree_skb_any(tx_buf->skb); + if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT) + devm_kfree(ring->dev, tx_buf->raw_buf); +#ifdef HAVE_XDP_SUPPORT + else if (ice_ring_is_xdp(ring)) + page_frag_free(tx_buf->raw_buf); +#endif /* HAVE_XDP_SUPPORT */ + else + dev_kfree_skb_any(tx_buf->skb); if (dma_unmap_len(tx_buf, len)) dma_unmap_single(ring->dev, dma_unmap_addr(tx_buf, dma), dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); + if (unlikely(tx_buf->tx_flags & ICE_TX_FLAGS_TSYN)) { + dev_kfree_skb_any(vsi->ptp_tx_skb[tx_buf->ptp_ts_idx]); + vsi->ptp_tx_skb[tx_buf->ptp_ts_idx] = NULL; + tx_buf->ptp_ts_idx = -1; + } } else if (dma_unmap_len(tx_buf, len)) { dma_unmap_page(ring->dev, dma_unmap_addr(tx_buf, dma), @@ -51,6 +166,13 @@ void ice_clean_tx_ring(struct ice_ring *tx_ring) { u16 i; +#ifdef HAVE_AF_XDP_ZC_SUPPORT + if (ice_ring_is_xdp(tx_ring) && tx_ring->xsk_pool) { + ice_xsk_clean_xdp_ring(tx_ring); + goto tx_skip_free; + } + +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ /* ring already cleared, nothing to do */ if (!tx_ring->tx_buf) return; @@ -59,6 +181,9 @@ void ice_clean_tx_ring(struct ice_ring *tx_ring) for (i = 0; i < tx_ring->count; i++) ice_unmap_and_free_tx_buf(tx_ring, &tx_ring->tx_buf[i]); +#ifdef HAVE_AF_XDP_ZC_SUPPORT +tx_skip_free: +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ memset(tx_ring->tx_buf, 0, sizeof(*tx_ring->tx_buf) * tx_ring->count); /* Zero out the descriptor ring */ @@ -124,6 +249,7 @@ static bool ice_clean_tx_irq(struct ice_ring *tx_ring, int napi_budget) smp_rmb(); /* prevent any other reads prior to eop_desc */ + ice_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf); /* if the descriptor isn't done, no work yet to do */ if (!(eop_desc->cmd_type_offset_bsz & cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE))) @@ -136,8 +262,16 @@ static bool ice_clean_tx_irq(struct ice_ring *tx_ring, int napi_budget) total_bytes += tx_buf->bytecount; total_pkts += tx_buf->gso_segs; +#ifdef HAVE_XDP_SUPPORT + if (ice_ring_is_xdp(tx_ring)) + page_frag_free(tx_buf->raw_buf); + else + /* free the skb */ + napi_consume_skb(tx_buf->skb, napi_budget); +#else /* free the skb */ napi_consume_skb(tx_buf->skb, napi_budget); +#endif /* HAVE_XDP_SUPPORT */ /* unmap skb header data */ dma_unmap_single(tx_ring->dev, @@ -151,6 +285,7 @@ static bool ice_clean_tx_irq(struct ice_ring *tx_ring, int napi_budget) /* unmap remaining buffers */ while (tx_desc != eop_desc) { + ice_trace(clean_tx_irq_unmap, tx_ring, tx_desc, tx_buf); tx_buf++; tx_desc++; i++; @@ -169,6 +304,7 @@ static bool ice_clean_tx_irq(struct ice_ring *tx_ring, int napi_budget) dma_unmap_len_set(tx_buf, len, 0); } } + ice_trace(clean_tx_irq_unmap_eop, tx_ring, tx_desc, tx_buf); /* move us one more past the eop_desc for start of next pkt */ tx_buf++; @@ -188,12 +324,13 @@ static bool ice_clean_tx_irq(struct ice_ring *tx_ring, int napi_budget) i += tx_ring->count; tx_ring->next_to_clean = i; - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->stats.bytes += total_bytes; - tx_ring->stats.pkts += total_pkts; - u64_stats_update_end(&tx_ring->syncp); - tx_ring->q_vector->tx.total_bytes += total_bytes; - tx_ring->q_vector->tx.total_pkts += total_pkts; 
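The open-coded ring and vector statistics update removed above is replaced by a call to ice_update_tx_ring_stats(), whose body is outside this hunk (presumably in the new ice_txrx_lib.c). A minimal sketch of such a helper, assuming it simply wraps the removed lines (the real helper may differ in naming and types), could look like:

static void ice_update_tx_ring_stats(struct ice_ring *tx_ring,
				     unsigned int total_pkts,
				     unsigned int total_bytes)
{
	/* publish per-ring counters under the u64_stats sequence lock */
	u64_stats_update_begin(&tx_ring->syncp);
	tx_ring->stats.pkts += total_pkts;
	tx_ring->stats.bytes += total_bytes;
	u64_stats_update_end(&tx_ring->syncp);

	/* feed the per-vector totals used for interrupt moderation */
	tx_ring->q_vector->tx.total_pkts += total_pkts;
	tx_ring->q_vector->tx.total_bytes += total_bytes;
}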
+ + ice_update_tx_ring_stats(tx_ring, total_pkts, total_bytes); + +#ifdef HAVE_XDP_SUPPORT + if (ice_ring_is_xdp(tx_ring)) + return !!budget; +#endif /* HAVE_XDP_SUPPORT */ netdev_tx_completed_queue(txring_txq(tx_ring), total_pkts, total_bytes); @@ -207,7 +344,7 @@ static bool ice_clean_tx_irq(struct ice_ring *tx_ring, int napi_budget) smp_mb(); if (__netif_subqueue_stopped(tx_ring->netdev, tx_ring->q_index) && - !test_bit(__ICE_DOWN, vsi->state)) { + !test_bit(ICE_VSI_DOWN, vsi->state)) { netif_wake_subqueue(tx_ring->netdev, tx_ring->q_index); ++tx_ring->tx_stats.restart_q; @@ -273,6 +410,13 @@ void ice_clean_rx_ring(struct ice_ring *rx_ring) if (!rx_ring->rx_buf) return; +#ifdef HAVE_AF_XDP_ZC_SUPPORT + if (rx_ring->xsk_pool) { + ice_xsk_clean_rx_ring(rx_ring); + goto rx_skip_free; + } + +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ /* Free all the Rx ring sk_buffs */ for (i = 0; i < rx_ring->count; i++) { struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[i]; @@ -289,17 +433,26 @@ void ice_clean_rx_ring(struct ice_ring *rx_ring) */ dma_sync_single_range_for_cpu(dev, rx_buf->dma, rx_buf->page_offset, - ICE_RXBUF_2048, DMA_FROM_DEVICE); + rx_ring->rx_buf_len, + DMA_FROM_DEVICE); +#ifndef HAVE_STRUCT_DMA_ATTRS /* free resources associated with mapping */ - dma_unmap_page_attrs(dev, rx_buf->dma, PAGE_SIZE, + dma_unmap_page_attrs(dev, rx_buf->dma, ice_rx_pg_size(rx_ring), DMA_FROM_DEVICE, ICE_RX_DMA_ATTR); +#else + dma_unmap_page(dev, rx_buf->dma, ice_rx_pg_size(rx_ring), + DMA_FROM_DEVICE); +#endif __page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias); rx_buf->page = NULL; rx_buf->page_offset = 0; } +#ifdef HAVE_AF_XDP_ZC_SUPPORT +rx_skip_free: +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ memset(rx_ring->rx_buf, 0, sizeof(*rx_ring->rx_buf) * rx_ring->count); /* Zero out the descriptor ring */ @@ -319,6 +472,14 @@ void ice_clean_rx_ring(struct ice_ring *rx_ring) void ice_free_rx_ring(struct ice_ring *rx_ring) { ice_clean_rx_ring(rx_ring); +#ifdef HAVE_XDP_SUPPORT +#ifdef HAVE_XDP_BUFF_RXQ + if (rx_ring->vsi->type == ICE_VSI_PF) + if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); +#endif /* HAVE_XDP_BUFF_RXQ */ + rx_ring->xdp_prog = NULL; +#endif /* HAVE_XDP_SUPPORT */ devm_kfree(rx_ring->dev, rx_ring->rx_buf); rx_ring->rx_buf = NULL; @@ -363,6 +524,19 @@ int ice_setup_rx_ring(struct ice_ring *rx_ring) rx_ring->next_to_use = 0; rx_ring->next_to_clean = 0; + +#ifdef HAVE_XDP_SUPPORT + if (ice_is_xdp_ena_vsi(rx_ring->vsi)) + WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog); + +#ifdef HAVE_XDP_BUFF_RXQ + if (rx_ring->vsi->type == ICE_VSI_PF && + !xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) + if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, + rx_ring->q_index, rx_ring->q_vector->napi.napi_id)) + goto err; +#endif /* HAVE_XDP_BUFF_RXQ */ +#endif /* HAVE_XDP_SUPPORT */ return 0; err: @@ -372,35 +546,206 @@ int ice_setup_rx_ring(struct ice_ring *rx_ring) } /** - * ice_release_rx_desc - Store the new tail and head values - * @rx_ring: ring to bump - * @val: new head index + * ice_rx_offset - Return expected offset into page to access data + * @rx_ring: Ring we are requesting offset of + * + * Returns the offset value for ring into the data buffer. 
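+ * (ICE_SKB_PAD when the ring uses build_skb, XDP_PACKET_HEADROOM when an + * XDP program is attached, otherwise 0; see the function body below.)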
*/ -static void ice_release_rx_desc(struct ice_ring *rx_ring, u32 val) +static unsigned int ice_rx_offset(struct ice_ring *rx_ring) { - u16 prev_ntu = rx_ring->next_to_use; + if (ice_ring_uses_build_skb(rx_ring)) + return ICE_SKB_PAD; +#ifdef HAVE_XDP_SUPPORT + else if (ice_is_xdp_ena_vsi(rx_ring->vsi)) + return XDP_PACKET_HEADROOM; +#endif /* HAVE_XDP_SUPPORT */ - rx_ring->next_to_use = val; + return 0; +} - /* update next to alloc since we have filled the ring */ - rx_ring->next_to_alloc = val; +#ifdef HAVE_XDP_BUFF_FRAME_SZ +/** + * ice_rx_frame_truesize - Returns an actual size of Rx frame in memory + * @rx_ring: Rx ring we are requesting the frame size of + * @size: Packet length from rx_desc + * + * Returns an actual size of Rx frame in memory, considering page size + * and SKB data alignment. + */ +static unsigned int +ice_rx_frame_truesize(struct ice_ring *rx_ring, unsigned int __maybe_unused size) +{ + unsigned int truesize; - /* QRX_TAIL will be updated with any tail value, but hardware ignores - * the lower 3 bits. This makes it so we only bump tail on meaningful - * boundaries. Also, this allows us to bump tail on intervals of 8 up to - * the budget depending on the current traffic load. - */ - val &= ~0x7; - if (prev_ntu != val) { - /* Force memory writes to complete before letting h/w - * know there are new descriptors to fetch. (Only - * applicable for weak-ordered memory model archs, - * such as IA-64). - */ - wmb(); - writel(val, rx_ring->tail); +#if (PAGE_SIZE < 8192) + truesize = ice_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */ +#else + truesize = ice_rx_offset(rx_ring) ? + SKB_DATA_ALIGN(ice_rx_offset(rx_ring) + size) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : + SKB_DATA_ALIGN(size); +#endif + return truesize; +} +#endif /* HAVE_XDP_BUFF_FRAME_SZ */ + +#ifdef HAVE_XDP_SUPPORT +/** + * ice_run_xdp - Executes an XDP program on initialized xdp_buff + * @rx_ring: Rx ring + * @xdp: xdp_buff used as input to the XDP program + * @xdp_prog: XDP program to run + * + * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR} + */ +static int +ice_run_xdp(struct ice_ring *rx_ring, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + int err, result = ICE_XDP_PASS; + struct ice_ring *xdp_ring; + u32 act; +#ifdef ICE_ADD_PROBES + u64 rx_bytes = (u64)(xdp->data_end - xdp->data); + + rx_ring->xdp_stats.xdp_rx_pkts++; + rx_ring->xdp_stats.xdp_rx_bytes += rx_bytes; +#endif + + act = bpf_prog_run_xdp(xdp_prog, xdp); + switch (act) { + case XDP_PASS: +#ifdef ICE_ADD_PROBES + rx_ring->xdp_stats.xdp_pass++; +#endif + break; + case XDP_TX: + xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->q_index]; + result = ice_xmit_xdp_buff(xdp, xdp_ring); +#ifdef ICE_ADD_PROBES + if (result == ICE_XDP_TX) + rx_ring->xdp_stats.xdp_tx++; + else + rx_ring->xdp_stats.xdp_tx_fail++; +#endif + break; + case XDP_REDIRECT: + err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); + result = !err ? 
ICE_XDP_REDIR : ICE_XDP_CONSUMED; +#ifdef ICE_ADD_PROBES + if (!err) + rx_ring->xdp_stats.xdp_redirect++; + else + rx_ring->xdp_stats.xdp_redirect_fail++; +#endif + break; + default: + bpf_warn_invalid_xdp_action(act); + /* fallthrough -- not supported action */ + case XDP_ABORTED: + trace_xdp_exception(rx_ring->netdev, xdp_prog, act); +#ifdef ICE_ADD_PROBES + rx_ring->xdp_stats.xdp_unknown++; +#endif + /* fallthrough -- handle aborts by dropping frame */ + case XDP_DROP: + result = ICE_XDP_CONSUMED; +#ifdef ICE_ADD_PROBES + rx_ring->xdp_stats.xdp_drop++; +#endif + break; + } + + return result; +} + +#ifdef HAVE_XDP_FRAME_STRUCT +/** + * ice_xdp_xmit - submit packets to XDP ring for transmission + * @dev: netdev + * @n: number of XDP frames to be transmitted + * @frames: XDP frames to be transmitted + * @flags: transmit flags + * + * Returns number of frames successfully sent. Frames that fail are + * free'ed via XDP return API. + * For error cases, a negative errno code is returned and no-frames + * are transmitted (caller must handle freeing frames). + */ +int +ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, + u32 flags) +#else +int ice_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) +#endif /* HAVE_XDP_FRAME_STRUCT */ +{ + struct ice_netdev_priv *np = netdev_priv(dev); + unsigned int queue_index = smp_processor_id(); + struct ice_vsi *vsi = np->vsi; + struct ice_ring *xdp_ring; +#ifdef HAVE_XDP_FRAME_STRUCT + int drops = 0, i; +#else + int err; +#endif /* HAVE_XDP_FRAME_STRUCT */ + + if (test_bit(ICE_VSI_DOWN, vsi->state)) + return -ENETDOWN; + + if (!ice_is_xdp_ena_vsi(vsi) || queue_index >= vsi->num_xdp_txq) + return -ENXIO; + +#ifdef HAVE_XDP_FRAME_STRUCT + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; +#endif + + xdp_ring = vsi->xdp_rings[queue_index]; +#ifdef HAVE_XDP_FRAME_STRUCT + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + int err; + + err = ice_xmit_xdp_ring(xdpf->data, xdpf->len, xdp_ring); + if (err != ICE_XDP_TX) { + xdp_return_frame_rx_napi(xdpf); + drops++; + } } + + if (unlikely(flags & XDP_XMIT_FLUSH)) + ice_xdp_ring_update_tail(xdp_ring); + + return n - drops; +#else + err = ice_xmit_xdp_ring(xdp->data, + (u8 *)xdp->data_end - (u8 *)xdp->data, + xdp_ring); + return err == ICE_XDP_TX ? 
0 : -EFAULT; +#endif /* HAVE_XDP_FRAME_STRUCT */ +} + +#ifndef NO_NDO_XDP_FLUSH +/** + * ice_xdp_flush - flush XDP ring and transmit all submitted packets + * @dev: netdev + */ +void ice_xdp_flush(struct net_device *dev) +{ + struct ice_netdev_priv *np = netdev_priv(dev); + unsigned int queue_index = smp_processor_id(); + struct ice_vsi *vsi = np->vsi; + + if (test_bit(ICE_VSI_DOWN, vsi->state)) + return; + + if (!ice_is_xdp_ena_vsi(vsi) || queue_index >= vsi->num_xdp_txq) + return; + + ice_xdp_ring_update_tail(vsi->xdp_rings[queue_index]); } +#endif /* !NO_NDO_XDP_FLUSH */ +#endif /* HAVE_XDP_SUPPORT */ /** * ice_alloc_mapped_page - recycle or make a new page @@ -418,35 +763,46 @@ ice_alloc_mapped_page(struct ice_ring *rx_ring, struct ice_rx_buf *bi) /* since we are recycling buffers we should seldom need to alloc */ if (likely(page)) { - rx_ring->rx_stats.page_reuse_count++; +#ifdef ICE_ADD_PROBES + rx_ring->rx_stats.page_reuse++; +#endif /* ICE_ADD_PROBES */ return true; } /* alloc new page for storage */ - page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); + page = dev_alloc_pages(ice_rx_pg_order(rx_ring)); if (unlikely(!page)) { rx_ring->rx_stats.alloc_page_failed++; return false; } /* map page for use */ - dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE, +#ifndef HAVE_STRUCT_DMA_ATTRS + dma = dma_map_page_attrs(rx_ring->dev, page, 0, ice_rx_pg_size(rx_ring), DMA_FROM_DEVICE, ICE_RX_DMA_ATTR); +#else + dma = dma_map_page(rx_ring->dev, page, 0, ice_rx_pg_size(rx_ring), + DMA_FROM_DEVICE); +#endif /* if mapping failed free memory back to system since * there isn't much point in holding memory we can't use */ if (dma_mapping_error(rx_ring->dev, dma)) { - __free_pages(page, 0); + __free_pages(page, ice_rx_pg_order(rx_ring)); rx_ring->rx_stats.alloc_page_failed++; return false; } bi->dma = dma; bi->page = page; - bi->page_offset = 0; + bi->page_offset = ice_rx_offset(rx_ring); +#ifdef HAVE_PAGE_COUNT_BULK_UPDATE page_ref_add(page, USHRT_MAX - 1); bi->pagecnt_bias = USHRT_MAX; +#else + bi->pagecnt_bias = 1; +#endif return true; } @@ -471,7 +827,8 @@ bool ice_alloc_rx_bufs(struct ice_ring *rx_ring, u16 cleaned_count) struct ice_rx_buf *bi; /* do nothing if no valid netdev defined */ - if (!rx_ring->netdev || !cleaned_count) + if ((!rx_ring->netdev && rx_ring->vsi->type != ICE_VSI_CTRL) || + !cleaned_count) return false; /* get the Rx descriptor and buffer based on next_to_use */ @@ -486,14 +843,13 @@ bool ice_alloc_rx_bufs(struct ice_ring *rx_ring, u16 cleaned_count) /* sync the buffer for use by the device */ dma_sync_single_range_for_device(rx_ring->dev, bi->dma, bi->page_offset, - ICE_RXBUF_2048, + rx_ring->rx_buf_len, DMA_FROM_DEVICE); /* Refresh the desc even if buffer_addrs didn't change * because each write-back erases this info. */ rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset); - rx_desc++; bi++; ntu++; @@ -532,7 +888,7 @@ static bool ice_page_is_reserved(struct page *page) * Update the offset within page so that Rx buf will be ready to be reused. 
* For systems with PAGE_SIZE < 8192 this function will flip the page offset * so the second half of page assigned to Rx buffer will be used, otherwise - * the offset is moved by the @size bytes + * the offset is moved by "size" bytes */ static void ice_rx_buf_adjust_pg_offset(struct ice_rx_buf *rx_buf, unsigned int size) @@ -557,9 +913,6 @@ ice_rx_buf_adjust_pg_offset(struct ice_rx_buf *rx_buf, unsigned int size) */ static bool ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf) { -#if (PAGE_SIZE >= 8192) - unsigned int last_offset = PAGE_SIZE - ICE_RXBUF_2048; -#endif unsigned int pagecnt_bias = rx_buf->pagecnt_bias; struct page *page = rx_buf->page; @@ -572,7 +925,9 @@ static bool ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf) if (unlikely((page_count(page) - pagecnt_bias) > 1)) return false; #else - if (rx_buf->page_offset > last_offset) +#define ICE_LAST_OFFSET \ + (SKB_WITH_OVERHEAD(PAGE_SIZE) - ICE_RXBUF_2048) + if (rx_buf->page_offset > ICE_LAST_OFFSET) return false; #endif /* PAGE_SIZE < 8192) */ @@ -580,16 +935,24 @@ static bool ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf) * the pagecnt_bias and page count so that we fully restock the * number of references the driver holds. */ +#ifdef HAVE_PAGE_COUNT_BULK_UPDATE if (unlikely(pagecnt_bias == 1)) { page_ref_add(page, USHRT_MAX - 1); rx_buf->pagecnt_bias = USHRT_MAX; } +#else + if (likely(!pagecnt_bias)) { + get_page(page); + rx_buf->pagecnt_bias = 1; + } +#endif return true; } /** * ice_add_rx_frag - Add contents of Rx buffer to sk_buff as a frag + * @rx_ring: Rx descriptor ring to transact packets on * @rx_buf: buffer containing page to add * @skb: sk_buff to place the data into * @size: packet length from rx_desc @@ -599,13 +962,13 @@ static bool ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf) * The function will then update the page offset. */ static void -ice_add_rx_frag(struct ice_rx_buf *rx_buf, struct sk_buff *skb, - unsigned int size) +ice_add_rx_frag(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, + struct sk_buff *skb, unsigned int size) { #if (PAGE_SIZE >= 8192) - unsigned int truesize = SKB_DATA_ALIGN(size); + unsigned int truesize = SKB_DATA_ALIGN(size + ice_rx_offset(rx_ring)); #else - unsigned int truesize = ICE_RXBUF_2048; + unsigned int truesize = ice_rx_pg_size(rx_ring) / 2; #endif if (!size) @@ -678,11 +1041,70 @@ ice_get_rx_buf(struct ice_ring *rx_ring, struct sk_buff **skb, return rx_buf; } +/** + * ice_build_skb - Build skb around an existing buffer + * @rx_ring: Rx descriptor ring to transact packets on + * @rx_buf: Rx buffer to pull data from + * @xdp: xdp_buff pointing to the data + * + * This function builds an skb around an existing Rx buffer, taking care + * to set up the skb correctly and avoid any memcpy overhead. + */ +static struct sk_buff * +ice_build_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, + struct xdp_buff *xdp) +{ +#ifdef HAVE_XDP_BUFF_DATA_META + u8 metasize = xdp->data - xdp->data_meta; +#endif /* HAVE_XDP_BUFF_DATA_META */ +#if (PAGE_SIZE < 8192) + unsigned int truesize = ice_rx_pg_size(rx_ring) / 2; +#else + unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + + SKB_DATA_ALIGN(xdp->data_end - + xdp->data_hard_start); +#endif + struct sk_buff *skb; + +#ifdef HAVE_XDP_BUFF_DATA_META + /* Prefetch first cache line of first page. If xdp->data_meta + * is unused, this points exactly as xdp->data, otherwise we + * likely have a consumer accessing first few bytes of meta + * data, and then actual data. 
+ */ + net_prefetch(xdp->data_meta); +#else + net_prefetch(xdp->data); +#endif /* HAVE_XDP_BUFF_DATA_META */ + /* build an skb around the page buffer */ + skb = build_skb(xdp->data_hard_start, truesize); + if (unlikely(!skb)) + return NULL; + + /* must to record Rx queue, otherwise OS features such as + * symmetric queue won't work + */ + skb_record_rx_queue(skb, rx_ring->q_index); + + /* update pointers within the skb to store the data */ + skb_reserve(skb, xdp->data - xdp->data_hard_start); + __skb_put(skb, xdp->data_end - xdp->data); +#ifdef HAVE_XDP_BUFF_DATA_META + if (metasize) + skb_metadata_set(skb, metasize); +#endif /* HAVE_XDP_BUFF_DATA_META */ + + /* buffer is used by skb, update page_offset */ + ice_rx_buf_adjust_pg_offset(rx_buf, truesize); + + return skb; +} + /** * ice_construct_skb - Allocate skb and populate it * @rx_ring: Rx descriptor ring to transact packets on * @rx_buf: Rx buffer to pull data from - * @size: the length of the packet + * @xdp: xdp_buff pointing to the data * * This function allocates an skb. It then populates it with the page * data from the current receive descriptor, taking care to set up the @@ -690,17 +1112,14 @@ ice_get_rx_buf(struct ice_ring *rx_ring, struct sk_buff **skb, */ static struct sk_buff * ice_construct_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, - unsigned int size) + struct xdp_buff *xdp) { - void *va = page_address(rx_buf->page) + rx_buf->page_offset; + unsigned int size = xdp->data_end - xdp->data; unsigned int headlen; struct sk_buff *skb; /* prefetch first cache line of first page */ - prefetch(va); -#if L1_CACHE_BYTES < 128 - prefetch((u8 *)va + L1_CACHE_BYTES); -#endif /* L1_CACHE_BYTES */ + net_prefetch(xdp->data); /* allocate a skb to store the frags */ skb = __napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE, @@ -712,10 +1131,11 @@ ice_construct_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, /* Determine available headroom for copy */ headlen = size; if (headlen > ICE_RX_HDR_SIZE) - headlen = eth_get_headlen(skb->dev, va, ICE_RX_HDR_SIZE); + headlen = eth_get_headlen(skb->dev, xdp->data, ICE_RX_HDR_SIZE); /* align pull length to size of long to optimize memcpy performance */ - memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long))); + memcpy(__skb_put(skb, headlen), xdp->data, ALIGN(headlen, + sizeof(long))); /* if we exhaust the linear part then add what is left as a frag */ size -= headlen; @@ -723,7 +1143,7 @@ ice_construct_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, #if (PAGE_SIZE >= 8192) unsigned int truesize = SKB_DATA_ALIGN(size); #else - unsigned int truesize = ICE_RXBUF_2048; + unsigned int truesize = ice_rx_pg_size(rx_ring) / 2; #endif skb_add_rx_frag(skb, 0, rx_buf->page, rx_buf->page_offset + headlen, size, truesize); @@ -745,22 +1165,37 @@ ice_construct_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, * @rx_ring: Rx descriptor ring to transact packets on * @rx_buf: Rx buffer to pull data from * - * This function will clean up the contents of the rx_buf. It will - * either recycle the buffer or unmap it and free the associated resources. + * This function will update next_to_clean and then clean up the contents + * of the rx_buf. It will either recycle the buffer or unmap it and free + * the associated resources. */ static void ice_put_rx_buf(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) { + u16 ntc = rx_ring->next_to_clean + 1; + + /* fetch, update, and store next to clean */ + ntc = (ntc < rx_ring->count) ? 
ntc : 0; + rx_ring->next_to_clean = ntc; + if (!rx_buf) return; if (ice_can_reuse_rx_page(rx_buf)) { /* hand second half of page back to the ring */ ice_reuse_rx_page(rx_ring, rx_buf); - rx_ring->rx_stats.page_reuse_count++; +#ifdef ICE_ADD_PROBES + rx_ring->rx_stats.page_reuse++; +#endif /* ICE_ADD_PROBES */ } else { /* we are not reusing the buffer so unmap it */ - dma_unmap_page_attrs(rx_ring->dev, rx_buf->dma, PAGE_SIZE, - DMA_FROM_DEVICE, ICE_RX_DMA_ATTR); +#ifndef HAVE_STRUCT_DMA_ATTRS + dma_unmap_page_attrs(rx_ring->dev, rx_buf->dma, + ice_rx_pg_size(rx_ring), DMA_FROM_DEVICE, + ICE_RX_DMA_ATTR); +#else + dma_unmap_page(rx_ring->dev, rx_buf->dma, + ice_rx_pg_size(rx_ring), DMA_FROM_DEVICE); +#endif __page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias); } @@ -769,245 +1204,292 @@ static void ice_put_rx_buf(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) rx_buf->skb = NULL; } -/** - * ice_cleanup_headers - Correct empty headers - * @skb: pointer to current skb being fixed - * - * Also address the case where we are pulling data in on pages only - * and as such no data is present in the skb header. - * - * In addition if skb is not at least 60 bytes we need to pad it so that - * it is large enough to qualify as a valid Ethernet frame. - * - * Returns true if an error was encountered and skb was freed. - */ -static bool ice_cleanup_headers(struct sk_buff *skb) -{ - /* if eth_skb_pad returns an error the skb was freed */ - if (eth_skb_pad(skb)) - return true; - - return false; -} - -/** - * ice_test_staterr - tests bits in Rx descriptor status and error fields - * @rx_desc: pointer to receive descriptor (in le64 format) - * @stat_err_bits: value to mask - * - * This function does some fast chicanery in order to return the - * value of the mask which is really only used for boolean tests. - * The status_error_len doesn't need to be shifted because it begins - * at offset zero. - */ -static bool -ice_test_staterr(union ice_32b_rx_flex_desc *rx_desc, const u16 stat_err_bits) -{ - return !!(rx_desc->wb.status_error0 & - cpu_to_le16(stat_err_bits)); -} - /** * ice_is_non_eop - process handling of non-EOP buffers * @rx_ring: Rx ring being processed * @rx_desc: Rx descriptor for current buffer * @skb: Current socket buffer containing buffer in progress * - * This function updates next to clean. If the buffer is an EOP buffer - * this function exits returning false, otherwise it will place the - * sk_buff in the next buffer to be chained and return true indicating - * that this is in fact a non-EOP buffer. + * If the buffer is an EOP buffer, this function exits returning false, + * otherwise return true indicating that this is in fact a non-EOP buffer. */ static bool ice_is_non_eop(struct ice_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc, struct sk_buff *skb) { - u32 ntc = rx_ring->next_to_clean + 1; - - /* fetch, update, and store next to clean */ - ntc = (ntc < rx_ring->count) ? 
ntc : 0; - rx_ring->next_to_clean = ntc; - - prefetch(ICE_RX_DESC(rx_ring, ntc)); - /* if we are the last buffer then there is nothing else to do */ #define ICE_RXD_EOF BIT(ICE_RX_FLEX_DESC_STATUS0_EOF_S) - if (likely(ice_test_staterr(rx_desc, ICE_RXD_EOF))) + if (likely(ice_test_staterr(rx_desc->wb.status_error0, ICE_RXD_EOF))) return false; /* place skb in next buffer to be received */ - rx_ring->rx_buf[ntc].skb = skb; + rx_ring->rx_buf[rx_ring->next_to_clean].skb = skb; rx_ring->rx_stats.non_eop_descs++; return true; } /** - * ice_ptype_to_htype - get a hash type - * @ptype: the ptype value from the descriptor + * ice_detect_dis_inline_fd_usage - detect and disable usage of inline-fd + * @ch_vsi: ptr to channel VSI * - * Returns a hash type to be used by skb_set_hash + * This function detects the FD table full condition and, if so, + * returns true, otherwise false */ -static enum pkt_hash_types ice_ptype_to_htype(u8 __always_unused ptype) +static bool +ice_detect_dis_inline_fd_usage(struct ice_vsi *ch_vsi) { - return PKT_HASH_TYPE_NONE; + int total_fd_allowed = ch_vsi->num_gfltr + ch_vsi->num_bfltr; + int inline_fd_active; + + /* detect if transitioned to RSS mode, if so return true */ + if (test_bit(ICE_SWITCH_TO_RSS, ch_vsi->adv_state)) + return true; + + /* for some reason if channel VSI doesn't have any FD resources + * reserved (from guaranteed or best effort pool), stay in RSS + */ + if (!total_fd_allowed) { + set_bit(ICE_SWITCH_TO_RSS, ch_vsi->adv_state); + return true; + } + + /* inline_fd_active_cnt is decremented from ice_chnl_inline_fd + * function when evicting FD entry upon FIN/RST transmit + */ + inline_fd_active = atomic_inc_return(&ch_vsi->inline_fd_active_cnt) - 1; + if (inline_fd_active >= total_fd_allowed) { + set_bit(ICE_SWITCH_TO_RSS, ch_vsi->adv_state); + return true; + } + + return false; } +/* Rx desc:flexi_flags are bits 15:10 applicable when RXDID=2 as defined + * by package + */ +#define ICE_RX_FLEXI_FLAGS_ACK BIT(2) +#define ICE_RX_FLEXI_FLAGS_FIN BIT(3) +#define ICE_RX_FLEXI_FLAGS_SYN BIT(4) +#define ICE_RX_FLEXI_FLAGS_RST BIT(5) + +/* Rx desc:flexi_flags2, applicable when RXDID=2 */ +#define ICE_RX_FLEXI_FLAGS2_TNL_0 BIT(5) +#define ICE_RX_FLEXI_FLAGS2_TNL_1 BIT(6) +#define ICE_RX_SUPPORTED_TNL_FLEXI_FLAGS (ICE_RX_FLEXI_FLAGS2_TNL_0 | \ + ICE_RX_FLEXI_FLAGS2_TNL_1) + /** - * ice_rx_hash - set the hash value in the skb - * @rx_ring: descriptor ring - * @rx_desc: specific descriptor - * @skb: pointer to current skb - * @rx_ptype: the ptype value from the descriptor + * ice_is_ctrl_pkt - determine if given packet is control/data packet + * @skb: receive buffer + * @rx_ring: ptr to Rx ring + * @rx_desc: ptr to Rx desc + * @ptype: packet type + * @flags: value of flexi_flags0 from Rx desc + * + * Determine if the given packet is a control or data packet. A control + * packet is one that has any of the SYN/FIN/RST flags set; otherwise it is + * a data packet. This check is applicable only for TCP/IPv4[6]. This + * function is expected to work correctly even for tunnels if the inner + * protocol is TCP/IPv4[6], as long as the device parser recognizes it as a + * known packet. + * + * Returns true if this is a control packet, and false if the packet is + * classified as a data packet. For all error conditions it returns true so + * that the packet gets treated as a control packet. This control versus data + * packet logic feeds into deferring interrupt enablement from napi_poll + * when busy_poll:stop is called.
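+ * + * For example, a TCP segment with only SYN set in flexi_flags0 is treated + * as a control packet (and, with ACK clear, may trigger + * ice_rx_queue_override() below), while a segment with only ACK set is + * treated as a data packet and causes ICE_CHNL_PREV_DATA_PKT_RECV to be + * set on the owning q_vector.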
*/ -static void -ice_rx_hash(struct ice_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc, - struct sk_buff *skb, u8 rx_ptype) +static bool +ice_is_ctrl_pkt(struct sk_buff *skb, struct ice_ring *rx_ring, + union ice_32b_rx_flex_desc *rx_desc, u16 ptype, u16 *flags) { struct ice_32b_rx_flex_desc_nic *nic_mdid; - u32 hash; + struct ice_rx_ptype_decoded decoded; + u16 flexi_flags; - if (!(rx_ring->netdev->features & NETIF_F_RXHASH)) - return; + *flags = 0; + /* RXDID must be set to FLEX, otherwise no gurantee that "flags" + * will be available in Rx desc.flexi_flags0 + */ if (rx_desc->wb.rxdid != ICE_RXDID_FLEX_NIC) - return; + return true; + + /* process PTYPE from Rx desc */ + decoded = ice_decode_rx_desc_ptype(ptype); + if (!decoded.known) + return true; + + /* Make sure packet is L4 and L4 proto (inner most) is TCP */ + if (!(decoded.payload_layer == ICE_RX_PTYPE_PAYLOAD_LAYER_PAY4 && + decoded.inner_prot == ICE_RX_PTYPE_INNER_PROT_TCP)) + return true; nic_mdid = (struct ice_32b_rx_flex_desc_nic *)rx_desc; - hash = le32_to_cpu(nic_mdid->rss_hash); - skb_set_hash(skb, hash, ice_ptype_to_htype(rx_ptype)); + if (decoded.tunnel_type != ICE_RX_PTYPE_TUNNEL_NONE) { + /* Determine which tunnel to support and allow only + * well known tunnel as determined thru' Rx desc + * either PTYPE (this needs additional support for + * tunnels like GTP, PTYPE 325 onwards) or via + * flexi_flag2:TNL_2..0 + * TNL_0 : VxLAN + * TNL_1 and TNL_0 : Geneve + */ + /* This only takes care of VxLAN and Geneve */ + if (!(nic_mdid->flexi_flags2 & + ICE_RX_SUPPORTED_TNL_FLEXI_FLAGS)) + return true; + } + + flexi_flags = le16_to_cpu(nic_mdid->ptype_flexi_flags0) >> + ICE_RX_FLEX_DESC_FLEXI_FLAGS0_S; + +#ifdef ADQ_PERF_COUNTERS + if (flexi_flags & ICE_RX_FLEXI_FLAGS_FIN) + rx_ring->ch_q_stats.rx.num_tcp_flags_fin++; + else if (flexi_flags & ICE_RX_FLEXI_FLAGS_RST) + rx_ring->ch_q_stats.rx.num_tcp_flags_rst++; + else if (flexi_flags & ICE_RX_FLEXI_FLAGS_SYN) + rx_ring->ch_q_stats.rx.num_tcp_flags_syn++; +#endif /* ADQ_PERF_COUNTERS */ + + /* return the flexi_flags to caller */ + *flags = flexi_flags; + + /* Packet is ctrl_pkt : if SYN|FIN|RST|SYN+ACK|FIN+ACK set */ + if (flexi_flags & (ICE_RX_FLEXI_FLAGS_FIN | ICE_RX_FLEXI_FLAGS_RST | + ICE_RX_FLEXI_FLAGS_SYN)) + return true; + + /* if reached here, means packet is DATA packet */ + return false; } /** - * ice_rx_csum - Indicate in skb if checksum is good - * @ring: the ring we care about - * @skb: skb currently being received and modified - * @rx_desc: the receive descriptor - * @ptype: the packet type decoded by hardware + * ice_rx_queue_override - override Rx queue if needed and update skb + * @skb: receive buffer + * @rx_ring: ptr to Rx ring + * @flags: value of flexi_flags (such as TCP flags) * - * skb->protocol must be set before this function is called + * Override Rx queue if packet being processed is SYN only and records + * new Rx queue in skb. This is applicable only for TCP/IPv4[6]. 
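+ * + * The new queue is picked round-robin from the channel's queue region. + * For example, with ch->base_q = 8 and ch->num_rxq = 4 (illustrative + * values only), successive qualifying SYN packets are recorded against + * queues 8, 9, 10, 11 and then back to 8.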
*/ static void -ice_rx_csum(struct ice_ring *ring, struct sk_buff *skb, - union ice_32b_rx_flex_desc *rx_desc, u8 ptype) +ice_rx_queue_override(struct sk_buff *skb, struct ice_ring *rx_ring, + u16 flags) { - struct ice_rx_ptype_decoded decoded; - u32 rx_error, rx_status; - bool ipv4, ipv6; + struct ice_channel *ch = rx_ring->ch; + struct ice_vsi *vsi = rx_ring->vsi; + struct ice_ring *ring; /* selected ring for override */ + int queue_to_use; - rx_status = le16_to_cpu(rx_desc->wb.status_error0); - rx_error = rx_status; + /* make sure ring is channel enabled before proceeding with Rx queue + * override logic + */ + if (!ice_ring_ch_enabled(rx_ring)) + return; + /* SYN must be set to proceed */ + if (!(flags & ICE_RX_FLEXI_FLAGS_SYN)) + return; + /* ACK must not be set to proceed */ + if (flags & ICE_RX_FLEXI_FLAGS_ACK) + return; - decoded = ice_decode_rx_desc_ptype(ptype); - /* Start with CHECKSUM_NONE and by default csum_level = 0 */ - skb->ip_summed = CHECKSUM_NONE; - skb_checksum_none_assert(skb); + /* proceed only when filter type for channel is of type dest + * port or src+dest port or tunnel + */ + if (!(ch->fltr_type == ICE_CHNL_FLTR_TYPE_DEST_PORT || + ch->fltr_type == ICE_CHNL_FLTR_TYPE_SRC_DEST_PORT || + ch->fltr_type == ICE_CHNL_FLTR_TYPE_TENANT_ID)) + return; - /* check if Rx checksum is enabled */ - if (!(ring->netdev->features & NETIF_F_RXCSUM)) + /* make sure channel VSI is FD capable and enabled for + * inline flow-director usage + */ + if (!ice_vsi_fd_ena(ch->ch_vsi) || + !ice_vsi_inline_fd_ena(ch->ch_vsi)) return; - /* check if HW has decoded the packet and checksum */ - if (!(rx_status & BIT(ICE_RX_FLEX_DESC_STATUS0_L3L4P_S))) + /* Detection logic to check if HW table is about to get full, + * if so, switch to RSS mode, means don't change Rx queue + */ + if (ice_detect_dis_inline_fd_usage(ch->ch_vsi)) { +#ifdef ADQ_PERF_COUNTERS + rx_ring->ch_q_stats.rx.num_rx_queue_bailouts++; +#endif /* ADQ_PERF_COUNTERS */ return; + } + + /* Pick the Rx queue based on round-robin policy for the + * connection, limited to queue region of specific channel + */ + queue_to_use = (atomic_inc_return(&ch->fd_queue) - 1) % + ch->num_rxq; - if (!(decoded.known && decoded.outer_ip)) + /* adjust the queue based on channel's base_queue, so that + * correct Rx queue number is recorded in skb + */ + queue_to_use += ch->base_q; + + /* Get the selected ring ptr */ + ring = vsi->rx_rings[queue_to_use]; + if (!ring || !ring->q_vector) return; - ipv4 = (decoded.outer_ip == ICE_RX_PTYPE_OUTER_IP) && - (decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV4); - ipv6 = (decoded.outer_ip == ICE_RX_PTYPE_OUTER_IP) && - (decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV6); + /* re-record selected queue as Rx queue in SKB */ + skb_record_rx_queue(skb, queue_to_use); - if (ipv4 && (rx_error & (BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_IPE_S) | - BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EIPE_S)))) - goto checksum_fail; - else if (ipv6 && (rx_status & - (BIT(ICE_RX_FLEX_DESC_STATUS0_IPV6EXADD_S)))) - goto checksum_fail; +#ifdef ADQ_PERF_COUNTERS + ring->ch_q_stats.rx.num_rx_queue_set++; +#endif /* ADQ_PERF_COUNTERS */ - /* check for L4 errors and handle packets that were not able to be - * checksummed due to arrival speed + /* mark selected queue:vector for inline filter usage by + * incrementing atomic variable, it can't be flag + * because during ATR eviction, this needs to be + * decremented */ - if (rx_error & BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_L4E_S)) - goto checksum_fail; - - /* Only report checksum unnecessary for TCP, UDP, or SCTP 
*/ - switch (decoded.inner_prot) { - case ICE_RX_PTYPE_INNER_PROT_TCP: - case ICE_RX_PTYPE_INNER_PROT_UDP: - case ICE_RX_PTYPE_INNER_PROT_SCTP: - skb->ip_summed = CHECKSUM_UNNECESSARY; - default: - break; - } - return; + atomic_inc(&ring->q_vector->inline_fd_cnt); -checksum_fail: - ring->vsi->back->hw_csum_rx_error++; + return; } /** - * ice_process_skb_fields - Populate skb header fields from Rx descriptor - * @rx_ring: Rx descriptor ring packet is being transacted on - * @rx_desc: pointer to the EOP Rx descriptor - * @skb: pointer to current skb being populated - * @ptype: the packet type decoded by hardware + * ice_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf + * @rx_ring: Rx descriptor ring to transact packets on + * @budget: Total limit on number of packets to process + * + * This function provides a "bounce buffer" approach to Rx interrupt + * processing. The advantage to this is that on systems that have + * expensive overhead for IOMMU access this provides a means of avoiding + * it by maintaining the mapping of the page to the system. * - * This function checks the ring, descriptor, and packet information in - * order to populate the hash, checksum, VLAN, protocol, and - * other fields within the skb. + * Returns amount of work completed */ -static void -ice_process_skb_fields(struct ice_ring *rx_ring, - union ice_32b_rx_flex_desc *rx_desc, - struct sk_buff *skb, u8 ptype) -{ - ice_rx_hash(rx_ring, rx_desc, skb, ptype); - - /* modifies the skb - consumes the enet header */ - skb->protocol = eth_type_trans(skb, rx_ring->netdev); - - ice_rx_csum(rx_ring, skb, rx_desc, ptype); -} - -/** - * ice_receive_skb - Send a completed packet up the stack - * @rx_ring: Rx ring in play - * @skb: packet to send up - * @vlan_tag: VLAN tag for packet - * - * This function sends the completed packet (via. skb) up the stack using - * gro receive functions (with/without VLAN tag) - */ -static void -ice_receive_skb(struct ice_ring *rx_ring, struct sk_buff *skb, u16 vlan_tag) -{ - if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) && - (vlan_tag & VLAN_VID_MASK)) - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); - napi_gro_receive(&rx_ring->q_vector->napi, skb); -} - -/** - * ice_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf - * @rx_ring: Rx descriptor ring to transact packets on - * @budget: Total limit on number of packets to process - * - * This function provides a "bounce buffer" approach to Rx interrupt - * processing. The advantage to this is that on systems that have - * expensive overhead for IOMMU access this provides a means of avoiding - * it by maintaining the mapping of the page to the system. 
- * - * Returns amount of work completed - */ -static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) +int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) { unsigned int total_rx_bytes = 0, total_rx_pkts = 0; u16 cleaned_count = ICE_DESC_UNUSED(rx_ring); +#ifdef HAVE_XDP_SUPPORT + unsigned int xdp_res, xdp_xmit = 0; + struct bpf_prog *xdp_prog = NULL; +#endif /* HAVE_XDP_SUPPORT */ + struct xdp_buff xdp; bool failure; +#ifdef HAVE_XDP_SUPPORT +#ifdef HAVE_XDP_BUFF_RXQ + xdp.rxq = &rx_ring->xdp_rxq; +#endif /* HAVE_XDP_BUFF_RXQ */ +#endif /* HAVE_XDP_SUPPORT */ +#ifdef HAVE_XDP_BUFF_FRAME_SZ + /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ +#if (PAGE_SIZE < 8192) + xdp.frame_sz = ice_rx_frame_truesize(rx_ring, 0); +#endif +#endif /* HAVE_XDP_BUFF_FRAME_SZ */ + /* start the loop to process Rx packets bounded by 'budget' */ while (likely(total_rx_pkts < (unsigned int)budget)) { union ice_32b_rx_flex_desc *rx_desc; @@ -1016,7 +1498,7 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) unsigned int size; u16 stat_err_bits; u16 vlan_tag = 0; - u8 rx_ptype; + u16 rx_ptype; /* get the Rx desc from Rx ring based on 'next_to_clean' */ rx_desc = ICE_RX_DESC(rx_ring, rx_ring->next_to_clean); @@ -1027,7 +1509,7 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) * hardware wrote DD then it will be non-zero */ stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S); - if (!ice_test_staterr(rx_desc, stat_err_bits)) + if (!ice_test_staterr(rx_desc->wb.status_error0, stat_err_bits)) break; /* This memory barrier is needed to keep us from reading @@ -1036,17 +1518,95 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) */ dma_rmb(); + ice_trace(clean_rx_irq, rx_ring, rx_desc); + if (rx_desc->wb.rxdid == FDIR_DESC_RXDID || !rx_ring->netdev) { + struct ice_vsi *ctrl_vsi = rx_ring->vsi; + + if (rx_desc->wb.rxdid == FDIR_DESC_RXDID && + ctrl_vsi->vf_id != ICE_INVAL_VFID) + ice_vc_fdir_irq_handler(ctrl_vsi, rx_desc); + ice_put_rx_buf(rx_ring, NULL); + cleaned_count++; + continue; + } + size = le16_to_cpu(rx_desc->wb.pkt_len) & ICE_RX_FLX_DESC_PKT_LEN_M; /* retrieve a buffer from the ring */ rx_buf = ice_get_rx_buf(rx_ring, &skb, size); - if (skb) - ice_add_rx_frag(rx_buf, skb, size); - else - skb = ice_construct_skb(rx_ring, rx_buf, size); + if (!size) { + xdp.data = NULL; + xdp.data_end = NULL; + xdp.data_hard_start = NULL; +#ifdef HAVE_XDP_BUFF_DATA_META + xdp.data_meta = NULL; +#endif /* HAVE_XDP_BUFF_DATA_META */ + goto construct_skb; + } + + xdp.data = page_address(rx_buf->page) + rx_buf->page_offset; + xdp.data_hard_start = xdp.data - ice_rx_offset(rx_ring); +#ifdef HAVE_XDP_BUFF_DATA_META + xdp.data_meta = xdp.data; +#endif /* HAVE_XDP_BUFF_DATA_META */ + xdp.data_end = xdp.data + size; +#ifdef HAVE_XDP_BUFF_FRAME_SZ +#if (PAGE_SIZE > 4096) + /* At larger PAGE_SIZE, frame_sz depend on len size */ + xdp.frame_sz = ice_rx_frame_truesize(rx_ring, size); +#endif +#endif /* HAVE_XDP_BUFF_FRAME_SZ */ + +#ifdef HAVE_XDP_SUPPORT + rcu_read_lock(); + xdp_prog = READ_ONCE(rx_ring->xdp_prog); + if (!xdp_prog) { + rcu_read_unlock(); + goto construct_skb; + } + xdp_res = ice_run_xdp(rx_ring, &xdp, xdp_prog); + rcu_read_unlock(); + if (!xdp_res) + goto construct_skb; + if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) { +#ifndef HAVE_XDP_BUFF_FRAME_SZ + unsigned int truesize; + +#if (PAGE_SIZE < 8192) + truesize = ice_rx_pg_size(rx_ring) / 2; +#else + truesize = SKB_DATA_ALIGN(ice_rx_offset(rx_ring) + + size); +#endif +#endif /* HAVE_XDP_BUFF_FRAME_SZ 
*/ + xdp_xmit |= xdp_res; +#ifdef HAVE_XDP_BUFF_FRAME_SZ + ice_rx_buf_adjust_pg_offset(rx_buf, xdp.frame_sz); +#else + ice_rx_buf_adjust_pg_offset(rx_buf, truesize); +#endif /* HAVE_XDP_BUFF_FRAME_SZ */ + } else { + rx_buf->pagecnt_bias++; + } + total_rx_bytes += size; + total_rx_pkts++; + + cleaned_count++; + ice_put_rx_buf(rx_ring, rx_buf); + continue; +#endif /* HAVE_XDP_SUPPORT */ +construct_skb: + if (skb) { + ice_add_rx_frag(rx_ring, rx_buf, skb, size); + } else if (likely(xdp.data)) { + if (ice_ring_uses_build_skb(rx_ring)) + skb = ice_build_skb(rx_ring, rx_buf, &xdp); + else + skb = ice_construct_skb(rx_ring, rx_buf, &xdp); + } /* exit if we failed to retrieve a buffer */ if (!skb) { rx_ring->rx_stats.alloc_buf_failed++; @@ -1063,19 +1623,16 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) continue; stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S); - if (unlikely(ice_test_staterr(rx_desc, stat_err_bits))) { + if (unlikely(ice_test_staterr(rx_desc->wb.status_error0, + stat_err_bits))) { dev_kfree_skb_any(skb); continue; } - stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_L2TAG1P_S); - if (ice_test_staterr(rx_desc, stat_err_bits)) - vlan_tag = le16_to_cpu(rx_desc->wb.l2tag1); + vlan_tag = ice_get_vlan_tag_from_rx_desc(rx_desc); - /* correct empty headers and pad skb if needed (to make valid - * ethernet frame - */ - if (ice_cleanup_headers(skb)) { + /* pad the skb if needed, to make a valid ethernet frame */ + if (eth_skb_pad(skb)) { skb = NULL; continue; } @@ -1089,6 +1646,20 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) ice_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype); + if (ice_ring_ch_enabled(rx_ring)) { + bool ctrl_pkt; + u16 flags; + + ctrl_pkt = ice_is_ctrl_pkt(skb, rx_ring, rx_desc, + rx_ptype, &flags); + if (!ctrl_pkt) + rx_ring->q_vector->state_flags |= + ICE_CHNL_PREV_DATA_PKT_RECV; + else + ice_rx_queue_override(skb, rx_ring, flags); + } + + ice_trace(clean_rx_irq_indicate, rx_ring, rx_desc, skb); /* send completed skb up the stack */ ice_receive_skb(rx_ring, skb, vlan_tag); @@ -1099,230 +1670,68 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) /* return up to cleaned_count buffers to hardware */ failure = ice_alloc_rx_bufs(rx_ring, cleaned_count); - /* update queue and vector specific stats */ - u64_stats_update_begin(&rx_ring->syncp); - rx_ring->stats.pkts += total_rx_pkts; - rx_ring->stats.bytes += total_rx_bytes; - u64_stats_update_end(&rx_ring->syncp); - rx_ring->q_vector->rx.total_pkts += total_rx_pkts; - rx_ring->q_vector->rx.total_bytes += total_rx_bytes; +#ifdef HAVE_XDP_SUPPORT + if (xdp_prog) + ice_finalize_xdp_rx(rx_ring, xdp_xmit); +#endif /* HAVE_XDP_SUPPORT */ + + ice_update_rx_ring_stats(rx_ring, total_rx_pkts, total_rx_bytes); /* guarantee a trip back through this routine if there was a failure */ return failure ? budget : (int)total_rx_pkts; } -/** - * ice_adjust_itr_by_size_and_speed - Adjust ITR based on current traffic - * @port_info: port_info structure containing the current link speed - * @avg_pkt_size: average size of Tx or Rx packets based on clean routine - * @itr: ITR value to update - * - * Calculate how big of an increment should be applied to the ITR value passed - * in based on wmem_default, SKB overhead, Ethernet overhead, and the current - * link speed. 
- * - * The following is a calculation derived from: - * wmem_default / (size + overhead) = desired_pkts_per_int - * rate / bits_per_byte / (size + Ethernet overhead) = pkt_rate - * (desired_pkt_rate / pkt_rate) * usecs_per_sec = ITR value - * - * Assuming wmem_default is 212992 and overhead is 640 bytes per - * packet, (256 skb, 64 headroom, 320 shared info), we can reduce the - * formula down to: - * - * wmem_default * bits_per_byte * usecs_per_sec pkt_size + 24 - * ITR = -------------------------------------------- * -------------- - * rate pkt_size + 640 - */ -static unsigned int -ice_adjust_itr_by_size_and_speed(struct ice_port_info *port_info, - unsigned int avg_pkt_size, - unsigned int itr) +static void __ice_update_sample(struct ice_q_vector *q_vector, + struct ice_ring_container *rc, + struct dim_sample *sample) { - switch (port_info->phy.link_info.link_speed) { - case ICE_AQ_LINK_SPEED_100GB: - itr += DIV_ROUND_UP(17 * (avg_pkt_size + 24), - avg_pkt_size + 640); - break; - case ICE_AQ_LINK_SPEED_50GB: - itr += DIV_ROUND_UP(34 * (avg_pkt_size + 24), - avg_pkt_size + 640); - break; - case ICE_AQ_LINK_SPEED_40GB: - itr += DIV_ROUND_UP(43 * (avg_pkt_size + 24), - avg_pkt_size + 640); - break; - case ICE_AQ_LINK_SPEED_25GB: - itr += DIV_ROUND_UP(68 * (avg_pkt_size + 24), - avg_pkt_size + 640); - break; - case ICE_AQ_LINK_SPEED_20GB: - itr += DIV_ROUND_UP(85 * (avg_pkt_size + 24), - avg_pkt_size + 640); - break; - case ICE_AQ_LINK_SPEED_10GB: - /* fall through */ - default: - itr += DIV_ROUND_UP(170 * (avg_pkt_size + 24), - avg_pkt_size + 640); - break; - } + u64 packets = 0, bytes = 0; + struct ice_ring *ring; - if ((itr & ICE_ITR_MASK) > ICE_ITR_ADAPTIVE_MAX_USECS) { - itr &= ICE_ITR_ADAPTIVE_LATENCY; - itr += ICE_ITR_ADAPTIVE_MAX_USECS; + ice_for_each_ring(ring, *rc) { + packets += ring->stats.pkts; + bytes += ring->stats.bytes; } - return itr; + dim_update_sample(q_vector->total_events, packets, bytes, sample); + sample->comp_ctr = 0; + + /* if dim settings get stale, like when not updated for 1 + * second or longer, force it to start again. This addresses the + * freqent case of an idle queue being switched to by the + * scheduler. The 1,000 here means 1,000 milliseconds. + */ + if (ktime_ms_delta(sample->time, rc->dim.start_sample.time) >= 1000) + rc->dim.state = DIM_START_MEASURE; } /** - * ice_update_itr - update the adaptive ITR value based on statistics - * @q_vector: structure containing interrupt and ring information - * @rc: structure containing ring performance data + * ice_net_dim - Update net DIM algorithm + * @q_vector: the vector associated with the interrupt * - * Stores a new ITR value based on packets and byte - * counts during the last interrupt. The advantage of per interrupt - * computation is faster updates and more accurate ITR for the current - * traffic pattern. Constants in this function were computed - * based on theoretical maximum wire speed and thresholds were set based - * on testing data as well as attempting to minimize response time - * while increasing bulk throughput. + * Create a DIM sample and notify net_dim() so that it can possibly decide + * a new ITR value based on incoming packets, bytes, and interrupts. + * + * This function is a no-op if the ring is not configured to dynamic ITR. 
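The new __ice_update_sample() above folds the per-ring packet and byte counters of a container into one sample for the net_dim library, and forces the algorithm back to DIM_START_MEASURE when the previous sample is a second or more old (the idle-queue case mentioned in the comment). A minimal userspace sketch of that aggregation and staleness check; the struct layout and millisecond bookkeeping here are illustrative, only the 1000 ms threshold is taken from the code above.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ring_stats { uint64_t pkts, bytes; };

struct sample {
	uint64_t time_ms;	/* when the sample was taken */
	uint64_t pkts, bytes;
	uint16_t events;	/* interrupt events seen so far */
};

/* Fold all rings of a container into one sample, as __ice_update_sample does. */
static void update_sample(const struct ring_stats *rings, int n_rings,
			  uint16_t total_events, uint64_t now_ms, struct sample *s)
{
	uint64_t pkts = 0, bytes = 0;

	for (int i = 0; i < n_rings; i++) {
		pkts += rings[i].pkts;
		bytes += rings[i].bytes;
	}
	s->time_ms = now_ms;
	s->pkts = pkts;
	s->bytes = bytes;
	s->events = total_events;
}

/* Restart the measurement if the previous sample is >= 1000 ms old. */
static bool dim_needs_restart(uint64_t now_ms, uint64_t start_sample_ms)
{
	return now_ms - start_sample_ms >= 1000;
}

int main(void)
{
	struct ring_stats rings[2] = { { 100, 64000 }, { 50, 3000 } };
	struct sample s;

	update_sample(rings, 2, 7, 5000, &s);
	assert(s.pkts == 150 && s.bytes == 67000);
	/* queue sat idle for 1.5 s: force DIM back to its start state */
	assert(dim_needs_restart(6500, 5000));
	printf("sample: %llu pkts, %llu bytes\n",
	       (unsigned long long)s.pkts, (unsigned long long)s.bytes);
	return 0;
}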
*/ -static void -ice_update_itr(struct ice_q_vector *q_vector, struct ice_ring_container *rc) +static void ice_net_dim(struct ice_q_vector *q_vector) { - unsigned long next_update = jiffies; - unsigned int packets, bytes, itr; - bool container_is_rx; + struct ice_ring_container *tx = &q_vector->tx; + struct ice_ring_container *rx = &q_vector->rx; - if (!rc->ring || !ITR_IS_DYNAMIC(rc->itr_setting)) - return; + if (ITR_IS_DYNAMIC(tx)) { + struct dim_sample dim_sample; - /* If itr_countdown is set it means we programmed an ITR within - * the last 4 interrupt cycles. This has a side effect of us - * potentially firing an early interrupt. In order to work around - * this we need to throw out any data received for a few - * interrupts following the update. - */ - if (q_vector->itr_countdown) { - itr = rc->target_itr; - goto clear_counts; + __ice_update_sample(q_vector, tx, &dim_sample); + net_dim(&tx->dim, dim_sample); } - container_is_rx = (&q_vector->rx == rc); - /* For Rx we want to push the delay up and default to low latency. - * for Tx we want to pull the delay down and default to high latency. - */ - itr = container_is_rx ? - ICE_ITR_ADAPTIVE_MIN_USECS | ICE_ITR_ADAPTIVE_LATENCY : - ICE_ITR_ADAPTIVE_MAX_USECS | ICE_ITR_ADAPTIVE_LATENCY; - - /* If we didn't update within up to 1 - 2 jiffies we can assume - * that either packets are coming in so slow there hasn't been - * any work, or that there is so much work that NAPI is dealing - * with interrupt moderation and we don't need to do anything. - */ - if (time_after(next_update, rc->next_update)) - goto clear_counts; - - prefetch(q_vector->vsi->port_info); - - packets = rc->total_pkts; - bytes = rc->total_bytes; + if (ITR_IS_DYNAMIC(rx)) { + struct dim_sample dim_sample; - if (container_is_rx) { - /* If Rx there are 1 to 4 packets and bytes are less than - * 9000 assume insufficient data to use bulk rate limiting - * approach unless Tx is already in bulk rate limiting. We - * are likely latency driven. - */ - if (packets && packets < 4 && bytes < 9000 && - (q_vector->tx.target_itr & ICE_ITR_ADAPTIVE_LATENCY)) { - itr = ICE_ITR_ADAPTIVE_LATENCY; - goto adjust_by_size_and_speed; - } - } else if (packets < 4) { - /* If we have Tx and Rx ITR maxed and Tx ITR is running in - * bulk mode and we are receiving 4 or fewer packets just - * reset the ITR_ADAPTIVE_LATENCY bit for latency mode so - * that the Rx can relax. - */ - if (rc->target_itr == ICE_ITR_ADAPTIVE_MAX_USECS && - (q_vector->rx.target_itr & ICE_ITR_MASK) == - ICE_ITR_ADAPTIVE_MAX_USECS) - goto clear_counts; - } else if (packets > 32) { - /* If we have processed over 32 packets in a single interrupt - * for Tx assume we need to switch over to "bulk" mode. - */ - rc->target_itr &= ~ICE_ITR_ADAPTIVE_LATENCY; + __ice_update_sample(q_vector, rx, &dim_sample); + net_dim(&rx->dim, dim_sample); } - - /* We have no packets to actually measure against. This means - * either one of the other queues on this vector is active or - * we are a Tx queue doing TSO with too high of an interrupt rate. - * - * Between 4 and 56 we can assume that our current interrupt delay - * is only slightly too low. As such we should increase it by a small - * fixed amount. 
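For contrast with what net_dim replaces: the deleted heuristic fed the average packet size into ice_adjust_itr_by_size_and_speed(), whose increment reduces to DIV_ROUND_UP(K * (size + 24), size + 640) with K ranging from 17 at 100G to 170 at 10G, per the removed switch above. A small arithmetic sketch of that legacy formula, purely to illustrate the code being removed.

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* K is the per-link-speed constant from the deleted switch statement */
static unsigned int legacy_itr_inc(unsigned int k, unsigned int avg_pkt_size)
{
	return DIV_ROUND_UP(k * (avg_pkt_size + 24), avg_pkt_size + 640);
}

int main(void)
{
	/* 100G (K=17) vs 10G (K=170), for 1500-byte and 64-byte averages */
	printf("100G/1500B: +%u usecs\n", legacy_itr_inc(17, 1500));  /* 13  */
	printf("100G/64B:   +%u usecs\n", legacy_itr_inc(17, 64));    /* 3   */
	printf("10G/1500B:  +%u usecs\n", legacy_itr_inc(170, 1500)); /* 122 */
	printf("10G/64B:    +%u usecs\n", legacy_itr_inc(170, 64));   /* 22  */
	return 0;
}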
- */ - if (packets < 56) { - itr = rc->target_itr + ICE_ITR_ADAPTIVE_MIN_INC; - if ((itr & ICE_ITR_MASK) > ICE_ITR_ADAPTIVE_MAX_USECS) { - itr &= ICE_ITR_ADAPTIVE_LATENCY; - itr += ICE_ITR_ADAPTIVE_MAX_USECS; - } - goto clear_counts; - } - - if (packets <= 256) { - itr = min(q_vector->tx.current_itr, q_vector->rx.current_itr); - itr &= ICE_ITR_MASK; - - /* Between 56 and 112 is our "goldilocks" zone where we are - * working out "just right". Just report that our current - * ITR is good for us. - */ - if (packets <= 112) - goto clear_counts; - - /* If packet count is 128 or greater we are likely looking - * at a slight overrun of the delay we want. Try halving - * our delay to see if that will cut the number of packets - * in half per interrupt. - */ - itr >>= 1; - itr &= ICE_ITR_MASK; - if (itr < ICE_ITR_ADAPTIVE_MIN_USECS) - itr = ICE_ITR_ADAPTIVE_MIN_USECS; - - goto clear_counts; - } - - /* The paths below assume we are dealing with a bulk ITR since - * number of packets is greater than 256. We are just going to have - * to compute a value and try to bring the count under control, - * though for smaller packet sizes there isn't much we can do as - * NAPI polling will likely be kicking in sooner rather than later. - */ - itr = ICE_ITR_ADAPTIVE_BULK; - -adjust_by_size_and_speed: - - /* based on checks above packets cannot be 0 so division is safe */ - itr = ice_adjust_itr_by_size_and_speed(q_vector->vsi->port_info, - bytes / packets, itr); - -clear_counts: - /* write back value */ - rc->target_itr = itr; - - /* next update should occur within next jiffy */ - rc->next_update = next_update + 1; - - rc->total_bytes = 0; - rc->total_pkts = 0; } /** @@ -1346,85 +1755,311 @@ static u32 ice_buildreg_itr(u16 itr_idx, u16 itr) (itr << (GLINT_DYN_CTL_INTERVAL_S - ICE_ITR_GRAN_S)); } -/* The act of updating the ITR will cause it to immediately trigger. In order - * to prevent this from throwing off adaptive update statistics we defer the - * update so that it can only happen so often. So after either Tx or Rx are - * updated we make the adaptive scheme wait until either the ITR completely - * expires via the next_update expiration or we have been through at least - * 3 interrupts. - */ -#define ITR_COUNTDOWN_START 3 - /** - * ice_update_ena_itr - Update ITR and re-enable MSIX interrupt - * @q_vector: q_vector for which ITR is being updated and interrupt enabled + * ice_enable_interrupt - re-enable MSI-X interrupt + * @q_vector: the vector associated with the interrupt to enable + * + * If the VSI is down, the interrupt will not be re-enabled. Also, + * when enabling the interrupt always reset the wb_on_itr to false + * and trigger a software interrupt to clean out internal state. 
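ice_buildreg_itr(), referenced in the hunk header above, composes the GLINT_DYN_CTL doorbell value from the interrupt-enable/ack bits, the ITR index and the interval, where the interval field is expressed in 2-usec hardware granularity (hence the ICE_ITR_GRAN_S adjustment in the shift). A hedged sketch of that packing with made-up bit positions; the real _S/_M definitions live in ice_hw_autogen.h.

#include <stdint.h>
#include <stdio.h>

/* Illustrative bit layout only -- not the real register definition */
#define DYN_CTL_INTENA_M	(1u << 0)
#define DYN_CTL_CLEARPBA_M	(1u << 1)
#define DYN_CTL_ITR_INDX_S	3
#define DYN_CTL_INTERVAL_S	5
#define ITR_GRAN_S		1	/* interval counted in 2-usec units */

static uint32_t buildreg_itr(uint16_t itr_idx, uint16_t itr_usecs)
{
	return DYN_CTL_INTENA_M | DYN_CTL_CLEARPBA_M |
	       ((uint32_t)itr_idx << DYN_CTL_ITR_INDX_S) |
	       ((uint32_t)itr_usecs << (DYN_CTL_INTERVAL_S - ITR_GRAN_S));
}

int main(void)
{
	/* ITR index 0, 50 usecs -> an interval field of 25 two-usec ticks */
	printf("reg = 0x%x\n", buildreg_itr(0, 50));
	return 0;
}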
*/ -static void ice_update_ena_itr(struct ice_q_vector *q_vector) +static void ice_enable_interrupt(struct ice_q_vector *q_vector) { - struct ice_ring_container *tx = &q_vector->tx; - struct ice_ring_container *rx = &q_vector->rx; struct ice_vsi *vsi = q_vector->vsi; + bool wb_en = q_vector->wb_on_itr; u32 itr_val; - /* when exiting WB_ON_ITR lets set a low ITR value and trigger - * interrupts to expire right away in case we have more work ready to go - * already - */ - if (q_vector->itr_countdown == ICE_IN_WB_ON_ITR_MODE) { - itr_val = ice_buildreg_itr(rx->itr_idx, ICE_WB_ON_ITR_USECS); - wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), itr_val); - /* set target back to last user set value */ - rx->target_itr = rx->itr_setting; - /* set current to what we just wrote and dynamic if needed */ - rx->current_itr = ICE_WB_ON_ITR_USECS | - (rx->itr_setting & ICE_ITR_DYNAMIC); - /* allow normal interrupt flow to start */ - q_vector->itr_countdown = 0; + if (test_bit(ICE_DOWN, vsi->state)) return; + + /* When exiting WB_ON_ITR, let ITR resume its normal + * interrupts-enabled path. + */ + if (wb_en) + q_vector->wb_on_itr = false; + + itr_val = ice_buildreg_itr(ICE_ITR_NONE, 0); + /* trigger an immediate software interrupt when exiting + * busy poll, to make sure to catch any pending cleanups + * that might have been missed due to interrupt state + * transition. + */ + if (wb_en) { + itr_val |= GLINT_DYN_CTL_SWINT_TRIG_M | + GLINT_DYN_CTL_SW_ITR_INDX_M | + GLINT_DYN_CTL_SW_ITR_INDX_ENA_M; } + wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), itr_val); +} - /* This will do nothing if dynamic updates are not enabled */ - ice_update_itr(q_vector, tx); - ice_update_itr(q_vector, rx); +/** + * ice_refresh_bp_state - refresh state machine + * @napi: ptr to NAPI struct + * @budget: NAPI budget + * + * Update ADQ state machine, and depending on whether this was called from + * busy poll, enable interrupts and update ITR + */ +static void ice_refresh_bp_state(struct napi_struct *napi, int budget) +{ + struct ice_q_vector *q_vector = + container_of(napi, struct ice_q_vector, napi); + + if (ice_vsi_pkt_process_bp_stop_ena(q_vector->ch->ch_vsi) && + (q_vector->state_flags & ICE_CHNL_WD_EQUALS_BP)) { + /* Manage the internal state in such a way that, napi_poll + * can decide when to perform Rx cleanup. When internal + * state indicates that vector is transition from busy_poll to + * interrupt, napi_poll avoid cleaning Rx rings and that + * eventually translates to whether driver will return + * "budget" or not. 
+ * + * Keep internal state as it is "budget" specified + * is not equal to "napi->weight), hence "skip + * When napi weight is equal to budget, and reached the + * value of tunable (max_limit_process_rx_queues), follow + * the NAPI state as seen by OS, otherwise skip internal + * state update (which will allow to keep the vector + * internal state to be whatever it was, in this case) + */ + if (napi->weight == budget && + q_vector->process_rx_queues == + q_vector->max_limit_process_rx_queues) { + /* reached the point, keep internal state to be in + * sync with NAPI state as seen by OS + */ + goto state_update; + } else { +#ifdef ADQ_PERF_COUNTERS + if (napi->weight == budget) + q_vector->ch_stats.keep_state_bp_budget64++; + else + q_vector->ch_stats.keep_state_bp_budget8++; +#endif /* ADQ_PERF_COUNTERS */ + /* keep internal state of vector as it is, do not + * perform state update + */ + goto skip_state_update; + } + } - /* This block of logic allows us to get away with only updating - * one ITR value with each interrupt. The idea is to perform a - * pseudo-lazy update with the following criteria. - * - * 1. Rx is given higher priority than Tx if both are in same state - * 2. If we must reduce an ITR that is given highest priority. - * 3. We then give priority to increasing ITR based on amount. +state_update: + /* cache previous state of vector */ + if (q_vector->state_flags & ICE_CHNL_IN_BP) + q_vector->state_flags |= ICE_CHNL_PREV_IN_BP; + else + q_vector->state_flags &= ~ICE_CHNL_PREV_IN_BP; + +#ifdef HAVE_NAPI_STATE_IN_BUSY_POLL + /* update current state of vector */ + if (test_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state)) + q_vector->state_flags |= ICE_CHNL_IN_BP; + else + q_vector->state_flags &= ~ICE_CHNL_IN_BP; + +#endif /* HAVE_STATE_IN_BUSY_POLL */ + +skip_state_update: + if (q_vector->state_flags & ICE_CHNL_IN_BP) { + q_vector->jiffy = jiffies; + /* triffer force_wb by setting WB_ON_ITR only when + * - vector is transitioning from INTR->BUSY_POLL + * - once_in_bp is false, this is to prevent from doing it + * every time whenever vector state is changing from + * INTR->BUSY_POLL because that could be due to legit + * busy_poll stop + */ + if (!(q_vector->state_flags & ICE_CHNL_ONCE_IN_BP) && + ice_vector_intr_busypoll(q_vector)) + ice_force_wb(&q_vector->vsi->back->hw, q_vector); + + q_vector->state_flags |= ICE_CHNL_ONCE_IN_BP; +#ifdef ADQ_PERF_COUNTERS + q_vector->ch_stats.in_bp++; + /* state transition : INTERRUPT --> BUSY_POLL */ + if (!(q_vector->state_flags & ICE_CHNL_PREV_IN_BP)) + q_vector->ch_stats.real_int_to_bp++; + else + q_vector->ch_stats.real_bp_to_bp++; + } else { + q_vector->ch_stats.in_int++; + /* state transition : BUSY_POLL --> INTERRUPT */ + if (q_vector->state_flags & ICE_CHNL_PREV_IN_BP) + q_vector->ch_stats.real_bp_to_int++; + else + q_vector->ch_stats.real_int_to_int++; +#endif /* ADQ_PERF_COUNTERS */ + } +} + +/** + * ice_handle_chnl_vector - handle channel enabled vector + * @q_vector: ptr to q_vector + * @unlikely_cb_bp: will comeback to busy_poll or not + * + * This function eithers triggers software interrupt (when unlikely_cb_bp is + * true) or enable interrupt normally. unlikely_cb_bp gets determined based + * on state machine and packet parsing logic. 
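The state bookkeeping in ice_refresh_bp_state() boils down to comparing the cached ICE_CHNL_PREV_IN_BP flag with the current busy-poll state and classifying the transition, which is what the real_int_to_bp / real_bp_to_bp / real_bp_to_int / real_int_to_int counters record. A compact userspace model of that classification; the flag and counter names here are stand-ins for q_vector->state_flags and ch_stats.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

enum bp_transition { INT_TO_INT, INT_TO_BP, BP_TO_BP, BP_TO_INT };

struct bp_state {
	bool prev_in_bp;	/* cached state from the previous napi_poll */
	unsigned int cnt[4];	/* transition counters, like ch_stats.real_* */
};

/* Classify the transition, bump its counter and cache the new state. */
static enum bp_transition refresh_bp_state(struct bp_state *s, bool now_in_bp)
{
	enum bp_transition t;

	if (now_in_bp)
		t = s->prev_in_bp ? BP_TO_BP : INT_TO_BP;
	else
		t = s->prev_in_bp ? BP_TO_INT : INT_TO_INT;

	s->cnt[t]++;
	s->prev_in_bp = now_in_bp;
	return t;
}

int main(void)
{
	struct bp_state s = { 0 };

	assert(refresh_bp_state(&s, true) == INT_TO_BP);  /* interrupt -> busy poll */
	assert(refresh_bp_state(&s, true) == BP_TO_BP);   /* stays in busy poll */
	assert(refresh_bp_state(&s, false) == BP_TO_INT); /* busy poll stopped */
	printf("int->bp=%u bp->bp=%u bp->int=%u\n",
	       s.cnt[INT_TO_BP], s.cnt[BP_TO_BP], s.cnt[BP_TO_INT]);
	return 0;
}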
+ */ +static void +ice_handle_chnl_vector(struct ice_q_vector *q_vector, bool unlikely_cb_bp) +{ +#ifdef ADQ_PERF_COUNTERS + struct ice_q_vector_ch_stats *stats = &q_vector->ch_stats; +#endif /* ADQ_PERF_COUNTERS */ + struct ice_vsi *ch_vsi = q_vector->ch->ch_vsi; + struct ice_vsi *vsi = q_vector->vsi; + + + /* caller of this function deteremines next occurrence/execution context + * of napi_poll (means next time whether napi_poll will be invoked from + * busy_poll or SOFT IRQ context). Please refer to the caller of this + * function to see logic for "unlikely_cb_bp" (aka, re-occurrence to + * busy_poll or not). + * If logic determines that, next occurrence of napi_poll will not be + * from busy_poll context, trigger software initiated interrupt on + * channel enabled vector to revive queue(s) processing, otherwise if + * in true interrupt state - just enable interrupt. */ - if (rx->target_itr < rx->current_itr) { - /* Rx ITR needs to be reduced, this is highest priority */ - itr_val = ice_buildreg_itr(rx->itr_idx, rx->target_itr); - rx->current_itr = rx->target_itr; - q_vector->itr_countdown = ITR_COUNTDOWN_START; - } else if ((tx->target_itr < tx->current_itr) || - ((rx->target_itr - rx->current_itr) < - (tx->target_itr - tx->current_itr))) { - /* Tx ITR needs to be reduced, this is second priority - * Tx ITR needs to be increased more than Rx, fourth priority + if (unlikely_cb_bp) { +#ifdef ADQ_PERF_COUNTERS + stats->unlikely_cb_to_bp++; + if (q_vector->state_flags & ICE_CHNL_ONCE_IN_BP) + stats->ucb_o_bp++; +#endif /* ADQ_PERF_COUNTERS */ + + /* if once_in_bp is set and pkt inspection based optimization + * is off, do not trigger SW interrupt (simply bailout). + * No change in logic from service_task based software + * triggred interrupt - to revive the queue based on jiffy logic + */ + if (ch_vsi && (q_vector->state_flags & ICE_CHNL_ONCE_IN_BP)) { + if (!ice_vsi_pkt_inspect_opt_ena(ch_vsi)) { +#ifdef ADQ_PERF_COUNTERS + stats->num_no_sw_intr_opt_off++; +#endif /* ADQ_PERF_COUNTERS */ + return; + } + } + + /* Since this real BP -> INT transition, + * reset Jiffies snapshot */ - itr_val = ice_buildreg_itr(tx->itr_idx, tx->target_itr); - tx->current_itr = tx->target_itr; - q_vector->itr_countdown = ITR_COUNTDOWN_START; - } else if (rx->current_itr != rx->target_itr) { - /* Rx ITR needs to be increased, third priority */ - itr_val = ice_buildreg_itr(rx->itr_idx, rx->target_itr); - rx->current_itr = rx->target_itr; - q_vector->itr_countdown = ITR_COUNTDOWN_START; + q_vector->jiffy = 0; + + /* Likewise for real BP -> INT, trigger + * SW interrupt, so that vector is put back + * in sane state, trigger sw interrupt to revive the queue + */ +#ifdef ADQ_PERF_COUNTERS + ice_sw_intr_cntr(q_vector, true); +#endif /* ADQ_PERF_COUNTERS */ + ice_adq_trigger_sw_intr(&vsi->back->hw, q_vector); + } else if (!(q_vector->state_flags & ICE_CHNL_ONCE_IN_BP)) { +#ifdef ADQ_PERF_COUNTERS + stats->once_bp_false++; +#endif /* ADQ_PERF_COUNTERS */ + ice_enable_interrupt(q_vector); + } +} + +#ifdef HAVE_NAPI_STATE_IN_BUSY_POLL +/** + * ice_chnl_vector_bypass_clean_complete + * @napi: ptr to napi + * @budget: value of budget (it could be napi:weight or BUSY_POLL_BUDGET) + * @work_done: amount of work_done (number of packets cleaned) + * + * This function returns true upon following condition: + * - state of NAPI is IN_BUSY_POLL (this is subject to change) + * - priv-flag "channel-pkt-clean-bp-stop" is disabled - means user turned + * off such optimization (this is high level knob for user) + * - vector state is 
set (workdone == budget) and napi:weight == budget (means + * invoked from napi_schedule coe path) and limit of optimization is + * reached + * + * When this function returns true, caller of this function (napi_poll) + * do not allow napi_poll to return "budget". This is to prevent OS calling + * us upto 2 msec or 10 times (softirq.c:__do_softirq) + */ +static bool +ice_chnl_vector_bypass_clean_complete(struct napi_struct *napi, int budget, + int work_done) +{ + struct ice_q_vector *qv = container_of(napi, struct ice_q_vector, napi); + + if (!ice_vector_ever_in_busypoll(qv)) + return false; + + if (!ice_vsi_pkt_process_bp_stop_ena(qv->ch->ch_vsi)) + return true; /* like what it was before */ + +#ifdef ADQ_PERF_COUNTERS + if (napi->weight == budget) /* napi_schedule */ + qv->ch_stats.pkt_bp_stop_napi_budget += work_done; + else /* busy_poll_stop */ + qv->ch_stats.pkt_bp_stop_bp_budget += work_done; +#endif /* ADQ_PERF_COUNTERS */ + + if ((qv->state_flags & ICE_CHNL_WD_EQUALS_BP) && + napi->weight == budget) { + qv->process_rx_queues++; + if (qv->process_rx_queues == qv->max_limit_process_rx_queues) + return true; } else { - /* Still have to re-enable the interrupts */ - itr_val = ice_buildreg_itr(ICE_ITR_NONE, 0); - if (q_vector->itr_countdown) - q_vector->itr_countdown--; + qv->process_rx_queues = 0; } - if (!test_bit(__ICE_DOWN, q_vector->vsi->state)) - wr32(&q_vector->vsi->back->hw, - GLINT_DYN_CTL(q_vector->reg_idx), - itr_val); + return false; +} +#endif /* HAVE_NAPI_STATE_IN_BUSY_POLL */ + +/** + * ice_chnl_vector_wd_eq_budget - detect workdone equals budget and set bit + * @napi: ptr to napi + * @budget: value of budget (it could be napi:weight or BUSY_POLL_BUDGET) + * @clean_complete: value of clean_complete as computed by napi_poll + * @cleaned_any_data_pkt: this function detects true of cleaned any data pkt + * + * Based on value of "clean_complete", set/reset per vector state + * bit indicating that workdone == budget condition has reached and + * increment specific stats based on value of "budget" + */ +static void +ice_chnl_vector_wd_eq_budget(struct napi_struct *napi, int budget, + bool clean_complete, bool *cleaned_any_data_pkt) +{ + struct ice_q_vector *qv = container_of(napi, struct ice_q_vector, napi); + + if ((qv->state_flags & ICE_CHNL_IN_BP) && + ice_vsi_pkt_process_bp_stop_ena(qv->ch->ch_vsi)) { + if (qv->state_flags & ICE_CHNL_PREV_DATA_PKT_RECV) { + qv->state_flags &= ~ICE_CHNL_PREV_DATA_PKT_RECV; + *cleaned_any_data_pkt = true; +#ifdef ADQ_PERF_COUNTERS + qv->ch_stats.cleaned_any_data_pkt++; +#endif /* ADQ_PERF_COUNTERS */ + } + /* Take snapshot if work_done == budget, which is used + * when busy_poll_stop is called, to decide it internal + * state machine to keep in BUSY_POLL or not. + * see ice_refresh_bp_state function for details. + */ + if (!clean_complete) + qv->state_flags |= ICE_CHNL_WD_EQUALS_BP; + else + qv->state_flags &= ~ICE_CHNL_WD_EQUALS_BP; +#ifdef ADQ_PERF_COUNTERS + if (qv->state_flags & ICE_CHNL_WD_EQUALS_BP) { + if (napi->weight == budget) + qv->ch_stats.bp_wd_equals_budget64++; + else + qv->ch_stats.bp_wd_equals_budget8++; + } +#endif /* ADQ_PERF_COUNTERS */ + } else { + qv->state_flags &= ~ICE_CHNL_WD_EQUALS_BP; + } } /** @@ -1434,32 +2069,31 @@ static void ice_update_ena_itr(struct ice_q_vector *q_vector) * We need to tell hardware to write-back completed descriptors even when * interrupts are disabled. 
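ice_chnl_vector_bypass_clean_complete() above is essentially a saturating streak counter: every napi_schedule-driven poll that consumes its whole budget bumps process_rx_queues, and once the streak reaches max_limit_process_rx_queues the function reports true so napi_poll stops returning the full budget. A small sketch of that counting rule under simplified assumptions (single condition, illustrative names); the real function also folds in the busy-poll state and priv-flag checks shown above.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct vec_state {
	unsigned int full_budget_polls;		/* like q_vector->process_rx_queues */
	unsigned int max_full_budget_polls;	/* the tunable limit */
};

/* Return true once enough consecutive full-budget polls have happened that
 * the driver should stop reporting "more work" and let NAPI complete.
 */
static bool bypass_clean_complete(struct vec_state *s, bool wd_equals_budget)
{
	if (wd_equals_budget) {
		if (++s->full_budget_polls == s->max_full_budget_polls)
			return true;
	} else {
		s->full_budget_polls = 0;	/* streak broken, start over */
	}
	return false;
}

int main(void)
{
	struct vec_state s = { .max_full_budget_polls = 3 };

	assert(!bypass_clean_complete(&s, true));
	assert(!bypass_clean_complete(&s, true));
	assert(bypass_clean_complete(&s, true));   /* third full budget in a row */
	assert(!bypass_clean_complete(&s, false)); /* resets the streak */
	printf("streak counter now %u\n", s.full_budget_polls);
	return 0;
}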
Descriptors will be written back on cache line * boundaries without WB_ON_ITR enabled, but if we don't enable WB_ON_ITR - * descriptors may not be written back if they don't fill a cache line until the - * next interrupt. + * descriptors may not be written back if they don't fill a cache line until + * the next interrupt. * - * This sets the write-back frequency to 2 microseconds as that is the minimum - * value that's not 0 due to ITR granularity. Also, set the INTENA_MSK bit to - * make sure hardware knows we aren't meddling with the INTENA_M bit. + * This sets the write-back frequency to whatever was set previously for the + * ITR indices. Also, set the INTENA_MSK bit to make sure hardware knows we + * aren't meddling with the INTENA_M bit. */ static void ice_set_wb_on_itr(struct ice_q_vector *q_vector) { struct ice_vsi *vsi = q_vector->vsi; - /* already in WB_ON_ITR mode no need to change it */ - if (q_vector->itr_countdown == ICE_IN_WB_ON_ITR_MODE) + /* already in wb_on_itr mode no need to change it */ + if (q_vector->wb_on_itr) return; - if (q_vector->num_ring_rx) - wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), - ICE_GLINT_DYN_CTL_WB_ON_ITR(ICE_WB_ON_ITR_USECS, - ICE_RX_ITR)); - - if (q_vector->num_ring_tx) - wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), - ICE_GLINT_DYN_CTL_WB_ON_ITR(ICE_WB_ON_ITR_USECS, - ICE_TX_ITR)); + /* use previously set ITR values for all of the ITR indices by + * specifying ICE_ITR_NONE, which will vary in adaptive (AIM) mode and + * be static in non-adaptive mode (user configured) + */ + wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), + ((ICE_ITR_NONE << GLINT_DYN_CTL_ITR_INDX_S) & + GLINT_DYN_CTL_ITR_INDX_M) | GLINT_DYN_CTL_INTENA_MSK_M | + GLINT_DYN_CTL_WB_ON_ITR_M); - q_vector->itr_countdown = ICE_IN_WB_ON_ITR_MODE; + q_vector->wb_on_itr = true; } /** @@ -1475,29 +2109,103 @@ int ice_napi_poll(struct napi_struct *napi, int budget) { struct ice_q_vector *q_vector = container_of(napi, struct ice_q_vector, napi); + bool cleaned_any_data_pkt = false; + bool unlikely_cb_bp = false; bool clean_complete = true; struct ice_ring *ring; int budget_per_ring; int work_done = 0; + bool ch_enabled; + + /* determine once if vector needs to be processed differently */ + ch_enabled = ice_vector_ch_enabled(q_vector); + if (ch_enabled) { + /* Refresh state machine */ + ice_refresh_bp_state(napi, budget); + + /* check during previous run of napi_poll whether at least one + * data packets is processed or not. If processed at least one + * data packet, set the local flag 'cleaned_any_data_pkt' + * which is used later in this function to determine if + * interrupt should be enabled or deferred (this is applicable + * only in case when busy_poll stop is invoked, means previous + * state of vector is in busy_poll and current state is not + * (aka BUSY_POLL -> INTR)) + */ + if (q_vector->state_flags & ICE_CHNL_PREV_DATA_PKT_RECV) { + q_vector->state_flags &= ~ICE_CHNL_PREV_DATA_PKT_RECV; + /* It is important to check and cache correct + * information (cleaned any data packets or not) in + * local variable before napi_complete_done is finished. + * Once napi_complete_done is returned, napi_poll + * can get invoked again (means re-entrant) which can + * potentially results to incorrect decision making + * w.r.t. 
whether interrupt should be enabled or + * deferred) + */ + if (ice_vector_busypoll_intr(q_vector)) { + cleaned_any_data_pkt = true; +#ifdef ADQ_PERF_COUNTERS + q_vector->ch_stats.cleaned_any_data_pkt++; +#endif /* ADQ_PERF_COUNTERS */ + } + } + } /* Since the actual Tx work is minimal, we can give the Tx a larger * budget and be more aggressive about cleaning up the Tx descriptors. */ - ice_for_each_ring(ring, q_vector->tx) - if (!ice_clean_tx_irq(ring, budget)) + ice_for_each_ring(ring, q_vector->tx) { +#ifdef HAVE_AF_XDP_ZC_SUPPORT + bool wd = ring->xsk_pool ? + ice_clean_tx_irq_zc(ring) : + ice_clean_tx_irq(ring, budget); +#else + bool wd = ice_clean_tx_irq(ring, budget); +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + +#ifdef ICE_ADD_PROBES + if (!wd) { + struct ice_q_stats *stats = &ring->stats; + + /* if we are reporting that we are not done, then we + * know napi is going to continue so increment the + * count + */ + stats->napi_poll_cnt++; + clean_complete = false; + } +#else /* ICE_ADD_PROBES */ + if (!wd) clean_complete = false; +#endif /* !ICE_ADD_PROBES */ + } /* Handle case where we are called by netpoll with a budget of 0 */ if (unlikely(budget <= 0)) return budget; + /* state transitioning from BUSY_POLL --> INTERRUPT. This can happen + * due to several reason when stack calls busy_poll_stop + * 1. during last execution of napi_poll returned non-zero packets + * 2. busy_loop ended + * 3. need re-sched set + * driver keeps track of packets were cleaned during last run and if + * that is zero, means most likely napi_poll won't be invoked from + * busy_poll context; in that situation bypass processing of Rx queues + * and enable interrupt and let subsequent run of napi_poll from + * interrupt path handle cleanup of Rx queues + */ + if (ch_enabled && ice_vector_busypoll_intr(q_vector)) + goto bypass; + /* normally we have 1 Rx ring per q_vector */ if (unlikely(q_vector->num_ring_rx > 1)) /* We attempt to distribute budget to each Rx queue fairly, but * don't allow the budget to go below 1 because that would exit * polling early. */ - budget_per_ring = max(budget / q_vector->num_ring_rx, 1); + budget_per_ring = max_t(int, budget / q_vector->num_ring_rx, 1); else /* Max of 1 Rx ring in this q_vector so give it the budget */ budget_per_ring = budget; @@ -1505,37 +2213,146 @@ int ice_napi_poll(struct napi_struct *napi, int budget) ice_for_each_ring(ring, q_vector->rx) { int cleaned; +#ifdef HAVE_AF_XDP_ZC_SUPPORT + /* A dedicated path for zero-copy allows making a single + * comparison in the irq context instead of many inside the + * ice_clean_rx_irq function and makes the codebase cleaner. + */ + cleaned = ring->xsk_pool ? 
+ ice_clean_rx_irq_zc(ring, budget_per_ring) : + ice_clean_rx_irq(ring, budget_per_ring); +#else cleaned = ice_clean_rx_irq(ring, budget_per_ring); +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ work_done += cleaned; /* if we clean as many as budgeted, we must not be done */ +#ifdef ICE_ADD_PROBES + if (cleaned >= budget_per_ring) { + struct ice_q_stats *stats = &ring->stats; + + /* if we are reporting that we are not done, then we + * know napi is going to continue so increment the + * count + */ + stats->napi_poll_cnt++; + clean_complete = false; + } +#else /* ICE_ADD_PROBES */ if (cleaned >= budget_per_ring) clean_complete = false; +#endif /* !ICE_ADD_PROBES */ + + if (ch_enabled) + ice_chnl_vector_wd_eq_budget(napi, budget, + clean_complete, + &cleaned_any_data_pkt); + } /* end for ice_for_each_ring */ + +#ifdef HAVE_NAPI_STATE_IN_BUSY_POLL + if (ch_enabled && + (!test_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state))) { + if (ice_chnl_vector_bypass_clean_complete(napi, budget, + work_done)) + goto bypass; } +#endif /* HAVE_NAPI_STATE_IN_BUSY_POLL */ /* If work not completed, return budget and polling will return */ - if (!clean_complete) + if (!clean_complete) { + /* Set the writeback on ITR so partial completions of + * cache-lines will still continue even if we're polling. + */ + if (!ch_enabled) + ice_set_wb_on_itr(q_vector); return budget; + } - /* Exit the polling mode, but don't re-enable interrupts if stack might - * poll us due to busy-polling +bypass: + /* reset the counter if code flow reached here because this function + * determined that it is not going to return budget and will + * end up calling napi_complete_done followed by return value < budget */ - if (likely(napi_complete_done(napi, work_done))) - ice_update_ena_itr(q_vector); - else - ice_set_wb_on_itr(q_vector); + q_vector->process_rx_queues = 0; + +#ifdef ADQ_PERF_COUNTERS + /* Following block is only for stats */ + if (ch_enabled && ice_vector_busypoll_intr(q_vector)) { + struct ice_q_vector_ch_stats *stats; + + stats = &q_vector->ch_stats; + if (unlikely(need_resched())) { + stats->num_need_resched_bp_stop++; + if (!cleaned_any_data_pkt) + stats->num_l_c_data_pkt++; + } else { + /* here , means actually because of 2 reason + * - busy_poll timeout expired + * - last time, cleaned data packets, hence + * stack asked to stop busy_poll so that packet + * can be processed by consumer + */ + stats->num_timeout_bp_stop++; + if (!cleaned_any_data_pkt) + stats->num_l_c_data_pkt1++; + } + } +#endif /* ADQ_PERF_COUNTERS */ - return min_t(int, work_done, budget - 1); -} + /* if state transition from busy_poll to interrupt and during + * last run: did not cleanup TCP data packets - + * then application unlikely to comeback to busy_poll + */ + if (ch_enabled && ice_vector_busypoll_intr(q_vector) && + !cleaned_any_data_pkt) { + /* for now, if need_resched is true (it can be either + * due to voluntary/in-voluntary context switches), + * do not trigger SW interrupt. 
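The surrounding poll routine follows the usual NAPI contract: split the budget across Rx rings without ever handing a ring zero descriptors, return the full budget when any ring used up its share (so the softirq keeps polling), and otherwise complete and return strictly less than the budget, which is where the min_t(int, work_done, budget - 1) at the end of the function comes from. A minimal model of that return-value logic; napi_complete_done() handling itself is elided.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Split the NAPI budget across Rx rings without ever handing out 0. */
static int budget_per_ring(int budget, int num_ring_rx)
{
	int per_ring = budget / num_ring_rx;

	return per_ring > 0 ? per_ring : 1;
}

/* Decide what napi_poll should return given the per-ring results. */
static int napi_poll_ret(int budget, int work_done, bool clean_complete)
{
	if (!clean_complete)
		return budget;			/* keep polling */
	/* completed: must return < budget, hence the budget - 1 clamp */
	return work_done < budget - 1 ? work_done : budget - 1;
}

int main(void)
{
	assert(budget_per_ring(64, 4) == 16);
	assert(budget_per_ring(64, 128) == 1);	/* never drops to zero */
	assert(napi_poll_ret(64, 64, false) == 64);
	assert(napi_poll_ret(64, 10, true) == 10);
	assert(napi_poll_ret(64, 64, true) == 63);
	printf("ok\n");
	return 0;
}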
+ * if need_resched is not set, safely assuming, it is due + * to possible timeout and unlikely that application/context + * will return to busy_poll, hence set 'unlikely_cb_bp' to + * true which will cause software triggered interrupt + * to reviev the queue/vector + */ + if (unlikely(need_resched())) + unlikely_cb_bp = false; + else + unlikely_cb_bp = true; + } -/* helper function for building cmd/type/offset */ -static __le64 -build_ctob(u64 td_cmd, u64 td_offset, unsigned int size, u64 td_tag) -{ - return cpu_to_le64(ICE_TX_DESC_DTYPE_DATA | - (td_cmd << ICE_TXD_QW1_CMD_S) | - (td_offset << ICE_TXD_QW1_OFFSET_S) | - ((u64)size << ICE_TXD_QW1_TX_BUF_SZ_S) | - (td_tag << ICE_TXD_QW1_L2TAG1_S)); + /* Work is done so exit the polling mode and re-enable the interrupt */ + if (likely(napi_complete_done(napi, work_done))) { + /* napi_ret : false (means vector is still in POLLING mode + * true (means out of POLLING) + * NOTE: Generally if napi_ret is TRUE, enable device interrupt + * but there are condition/optimization, where it can be + * optimized. Basically, if napi_complete_done returns true. + * But if it is last time Rx packets were cleaned, + * then most likely, consumer thread will come back to do + * busy_polling where cleaning of Tx/Rx queue will happen + * normally. Hence no reason to arm the interrupt. + * + * If for some reason, consumer thread/context doesn't comeback + * to busy_poll:napi_poll, there is bail-out mechanism to kick + * start the state machine thru' SW triggered interrupt from + * service task. + */ + if (ch_enabled) { + /* current state of NAPI is INTERRUPT */ + ice_handle_chnl_vector(q_vector, unlikely_cb_bp); + } else { + /* vector is not channel enabled and NAPI is not in + * BUSY_POLL, always enable interrupt + */ + ice_net_dim(q_vector); + ice_enable_interrupt(q_vector); + } + + } else { + if (!ch_enabled) + ice_set_wb_on_itr(q_vector); + } + + return min_t(int, work_done, budget - 1); } /** @@ -1592,11 +2409,11 @@ ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first, { u64 td_offset, td_tag, td_cmd; u16 i = tx_ring->next_to_use; - skb_frag_t *frag; unsigned int data_len, size; struct ice_tx_desc *tx_desc; struct ice_tx_buf *tx_buf; struct sk_buff *skb; + skb_frag_t *frag; dma_addr_t dma; td_tag = off->td_l2tag1; @@ -1607,6 +2424,7 @@ ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first, data_len = skb->data_len; size = skb_headlen(skb); + tx_desc = ICE_TX_DESC(tx_ring, i); if (first->tx_flags & ICE_TX_FLAGS_HW_VLAN) { @@ -1638,7 +2456,8 @@ ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first, */ while (unlikely(size > ICE_MAX_DATA_PER_TXD)) { tx_desc->cmd_type_offset_bsz = - build_ctob(td_cmd, td_offset, max_data, td_tag); + ice_build_ctob(td_cmd, td_offset, max_data, + td_tag); tx_desc++; i++; @@ -1658,8 +2477,8 @@ ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first, if (likely(!data_len)) break; - tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset, - size, td_tag); + tx_desc->cmd_type_offset_bsz = ice_build_ctob(td_cmd, td_offset, + size, td_tag); tx_desc++; i++; @@ -1684,15 +2503,14 @@ ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first, /* record SW timestamp if HW timestamp is not available */ skb_tx_timestamp(first->skb); + /* write last descriptor with RS and EOP bits */ + td_cmd |= (u64)ICE_TXD_LAST_DESC_CMD; + tx_desc->cmd_type_offset_bsz = + ice_build_ctob(td_cmd, td_offset, size, td_tag); i++; if (i == tx_ring->count) i = 0; - /* write last descriptor with RS and EOP bits */ - td_cmd |= 
(u64)(ICE_TX_DESC_CMD_EOP | ICE_TX_DESC_CMD_RS); - tx_desc->cmd_type_offset_bsz = - build_ctob(td_cmd, td_offset, size, td_tag); - /* Force memory writes to complete before letting h/w know there * are new descriptors to fetch. * @@ -1709,9 +2527,20 @@ ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first, ice_maybe_stop_tx(tx_ring, DESC_NEEDED); /* notify HW of packet */ +#ifdef HAVE_SKB_XMIT_MORE if (netif_xmit_stopped(txring_txq(tx_ring)) || !netdev_xmit_more()) { - writel(i, tx_ring->tail); +#endif /* HAVE_SKB_XMIT_MORE */ + writel_relaxed(i, tx_ring->tail); +#ifndef SPIN_UNLOCK_IMPLIES_MMIOWB + + /* we need this if more than one processor can write to our tail + * at a time, it synchronizes IO on IA64/Altix systems + */ + mmiowb(); +#endif /* SPIN_UNLOCK_IMPLIES_MMIOWB */ +#ifdef HAVE_SKB_XMIT_MORE } +#endif /* HAVE_SKB_XMIT_MORE */ return; @@ -1730,6 +2559,7 @@ ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first, tx_ring->next_to_use = i; } + /** * ice_tx_csum - Enable Tx checksum offloads * @first: pointer to the first descriptor @@ -1740,6 +2570,9 @@ ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first, static int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off) { +#ifdef ICE_ADD_PROBES + struct ice_ring *tx_ring = off->tx_ring; +#endif u32 l4_len = 0, l3_len = 0, l2_len = 0; struct sk_buff *skb = first->skb; union { @@ -1766,22 +2599,117 @@ int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off) l2_len = ip.hdr - skb->data; offset = (l2_len / 2) << ICE_TX_DESC_LEN_MACLEN_S; - if (skb->encapsulation) - return -1; + protocol = vlan_get_protocol(skb); + + if (protocol == htons(ETH_P_IP)) + first->tx_flags |= ICE_TX_FLAGS_IPV4; + else if (protocol == htons(ETH_P_IPV6)) + first->tx_flags |= ICE_TX_FLAGS_IPV6; + + if (skb->encapsulation) { + bool gso_ena = false; + u32 tunnel = 0; + + /* define outer network header type */ + if (first->tx_flags & ICE_TX_FLAGS_IPV4) { + tunnel |= (first->tx_flags & ICE_TX_FLAGS_TSO) ? 
+ ICE_TX_CTX_EIPT_IPV4 : + ICE_TX_CTX_EIPT_IPV4_NO_CSUM; + l4_proto = ip.v4->protocol; + } else if (first->tx_flags & ICE_TX_FLAGS_IPV6) { + int ret; + + tunnel |= ICE_TX_CTX_EIPT_IPV6; + exthdr = ip.hdr + sizeof(*ip.v6); + l4_proto = ip.v6->nexthdr; + ret = ipv6_skip_exthdr(skb, exthdr - skb->data, + &l4_proto, &frag_off); + if (ret < 0) + return -1; + } + + /* define outer transport */ + switch (l4_proto) { + case IPPROTO_UDP: + tunnel |= ICE_TXD_CTX_UDP_TUNNELING; + first->tx_flags |= ICE_TX_FLAGS_TUNNEL; + break; + case IPPROTO_GRE: + tunnel |= ICE_TXD_CTX_GRE_TUNNELING; + first->tx_flags |= ICE_TX_FLAGS_TUNNEL; + break; + case IPPROTO_IPIP: + case IPPROTO_IPV6: + first->tx_flags |= ICE_TX_FLAGS_TUNNEL; + l4.hdr = skb_inner_network_header(skb); + break; + default: + if (first->tx_flags & ICE_TX_FLAGS_TSO) + return -1; + + skb_checksum_help(skb); + return 0; + } + +#ifdef ICE_ADD_PROBES + if (protocol == htons(ETH_P_IP)) + tx_ring->vsi->back->tx_ip4_cso++; +#endif + /* compute outer L3 header size */ + tunnel |= ((l4.hdr - ip.hdr) / 4) << + ICE_TXD_CTX_QW0_EIPLEN_S; + + /* switch IP header pointer from outer to inner header */ + ip.hdr = skb_inner_network_header(skb); + + /* compute tunnel header size */ + tunnel |= ((ip.hdr - l4.hdr) / 2) << + ICE_TXD_CTX_QW0_NATLEN_S; + +#ifdef NETIF_F_GSO_PARTIAL + gso_ena = skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL; +#endif + /* indicate if we need to offload outer UDP header */ + if ((first->tx_flags & ICE_TX_FLAGS_TSO) && !gso_ena && + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) + tunnel |= ICE_TXD_CTX_QW0_L4T_CS_M; + + /* record tunnel offload values */ + off->cd_tunnel_params |= tunnel; + + /* set DTYP=1 to indicate that it's an Tx context descriptor + * in IPsec tunnel mode with Tx offloads in Quad word 1 + */ + off->cd_qw1 |= (u64)ICE_TX_DESC_DTYPE_CTX; + + /* switch L4 header pointer from outer to inner */ + l4.hdr = skb_inner_transport_header(skb); + l4_proto = 0; + + /* reset type as we transition from outer to inner headers */ + first->tx_flags &= ~(ICE_TX_FLAGS_IPV4 | ICE_TX_FLAGS_IPV6); + if (ip.v4->version == 4) + first->tx_flags |= ICE_TX_FLAGS_IPV4; + if (ip.v6->version == 6) + first->tx_flags |= ICE_TX_FLAGS_IPV6; + } /* Enable IP checksum offloads */ - protocol = vlan_get_protocol(skb); - if (protocol == htons(ETH_P_IP)) { + if (first->tx_flags & ICE_TX_FLAGS_IPV4) { l4_proto = ip.v4->protocol; /* the stack computes the IP header already, the only time we * need the hardware to recompute it is in the case of TSO. 
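The checksum path encodes header lengths into descriptor fields in hardware units: the MAC header length in 2-byte words, the IP and L4 header lengths in 4-byte words, and for tunnels the outer IP length (EIPLEN, 4-byte words) plus the tunnel header length (NATLEN, 2-byte words), each shifted into place as shown above. A sketch of the non-tunnel packing with placeholder shift values; the real ICE_TX_DESC_LEN_* constants come from ice_lan_tx_rx.h.

#include <stdint.h>
#include <stdio.h>

/* Placeholder field positions -- illustration only */
#define LEN_MACLEN_S	0	/* MAC header, units of 2 bytes */
#define LEN_IPLEN_S	7	/* IP header, units of 4 bytes  */
#define LEN_L4_LEN_S	14	/* L4 header, units of 4 bytes  */

static uint32_t csum_offset(uint32_t mac_hdr_len, uint32_t ip_hdr_len,
			    uint32_t l4_hdr_len)
{
	return ((mac_hdr_len / 2) << LEN_MACLEN_S) |
	       ((ip_hdr_len / 4) << LEN_IPLEN_S) |
	       ((l4_hdr_len / 4) << LEN_L4_LEN_S);
}

int main(void)
{
	/* plain Ethernet + IPv4 + TCP without options: 14 / 20 / 20 bytes */
	uint32_t off = csum_offset(14, 20, 20);

	printf("MACLEN=%u words, IPLEN=%u words, L4LEN=%u words (0x%x)\n",
	       (off >> LEN_MACLEN_S) & 0x7f, (off >> LEN_IPLEN_S) & 0x7f,
	       (off >> LEN_L4_LEN_S) & 0xf, off);
	return 0;
}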
*/ + +#ifdef ICE_ADD_PROBES + tx_ring->vsi->back->tx_ip4_cso++; +#endif if (first->tx_flags & ICE_TX_FLAGS_TSO) cmd |= ICE_TX_DESC_CMD_IIPT_IPV4_CSUM; else cmd |= ICE_TX_DESC_CMD_IIPT_IPV4; - } else if (protocol == htons(ETH_P_IPV6)) { + } else if (first->tx_flags & ICE_TX_FLAGS_IPV6) { cmd |= ICE_TX_DESC_CMD_IIPT_IPV6; exthdr = ip.hdr + sizeof(*ip.v6); l4_proto = ip.v6->nexthdr; @@ -1789,6 +2717,9 @@ int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off) ipv6_skip_exthdr(skb, exthdr - skb->data, &l4_proto, &frag_off); } else { +#ifdef ICE_ADD_PROBES + tx_ring->vsi->back->tx_l3_cso_err++; +#endif return -1; } @@ -1803,21 +2734,33 @@ int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off) cmd |= ICE_TX_DESC_CMD_L4T_EOFT_TCP; l4_len = l4.tcp->doff; offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S; +#ifdef ICE_ADD_PROBES + tx_ring->vsi->back->tx_tcp_cso++; +#endif break; case IPPROTO_UDP: /* enable UDP checksum offload */ cmd |= ICE_TX_DESC_CMD_L4T_EOFT_UDP; l4_len = (sizeof(struct udphdr) >> 2); offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S; +#ifdef ICE_ADD_PROBES + tx_ring->vsi->back->tx_udp_cso++; +#endif break; case IPPROTO_SCTP: /* enable SCTP checksum offload */ cmd |= ICE_TX_DESC_CMD_L4T_EOFT_SCTP; l4_len = sizeof(struct sctphdr) >> 2; offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S; +#ifdef ICE_ADD_PROBES + tx_ring->vsi->back->tx_sctp_cso++; +#endif break; default: +#ifdef ICE_ADD_PROBES + tx_ring->vsi->back->tx_l4_cso_err++; +#endif if (first->tx_flags & ICE_TX_FLAGS_TSO) return -1; skb_checksum_help(skb); @@ -1836,50 +2779,59 @@ int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off) * * Checks the skb and set up correspondingly several generic transmit flags * related to VLAN tagging for the HW, such as VLAN, DCB, etc. - * - * Returns error code indicate the frame should be dropped upon error and the - * otherwise returns 0 to indicate the flags has been set properly. */ -static int +static void ice_tx_prepare_vlan_flags(struct ice_ring *tx_ring, struct ice_tx_buf *first) { struct sk_buff *skb = first->skb; - __be16 protocol = skb->protocol; - - if (protocol == htons(ETH_P_8021Q) && - !(tx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) { - /* when HW VLAN acceleration is turned off by the user the - * stack sets the protocol to 8021q so that the driver - * can take any steps required to support the SW only - * VLAN handling. In our case the driver doesn't need - * to take any further steps so just set the protocol - * to the encapsulated ethertype. 
- */ - skb->protocol = vlan_get_protocol(skb); - return 0; - } - /* if we have a HW VLAN tag being added, default to the HW one */ + /* nothing left to do, software offloaded VLAN */ + if (!skb_vlan_tag_present(skb) && eth_type_vlan(skb->protocol)) + return; + + /* the VLAN ethertype/tpid is determined by VSI configuration and netdev + * feature flags, which the driver only allows either 802.1Q or 802.1ad + * VLAN offloads exclusively so we only care about the VLAN ID here + */ if (skb_vlan_tag_present(skb)) { first->tx_flags |= skb_vlan_tag_get(skb) << ICE_TX_FLAGS_VLAN_S; - first->tx_flags |= ICE_TX_FLAGS_HW_VLAN; - } else if (protocol == htons(ETH_P_8021Q)) { - struct vlan_hdr *vhdr, _vhdr; - - /* for SW VLAN, check the next protocol and store the tag */ - vhdr = (struct vlan_hdr *)skb_header_pointer(skb, ETH_HLEN, - sizeof(_vhdr), - &_vhdr); - if (!vhdr) - return -EINVAL; - - first->tx_flags |= ntohs(vhdr->h_vlan_TCI) << - ICE_TX_FLAGS_VLAN_S; - first->tx_flags |= ICE_TX_FLAGS_SW_VLAN; + if (tx_ring->flags & ICE_TX_FLAGS_VLAN_TAG_LOC_L2TAG2) + first->tx_flags |= ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN; + else + first->tx_flags |= ICE_TX_FLAGS_HW_VLAN; + +#ifdef ICE_ADD_PROBES + if (tx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_TX) + tx_ring->vsi->back->tx_q_vlano++; + else + tx_ring->vsi->back->tx_ad_vlano++; +#endif } - return ice_tx_prepare_vlan_flags_dcb(tx_ring, first); + ice_tx_prepare_vlan_flags_dcb(tx_ring, first); +} + +#ifdef ICE_ADD_PROBES +/** + * ice_update_gso_cntr - update TSO/USO counter + * @tx_buf: Tx buffer with necessary data to update counter + */ +static void ice_update_gso_cntr(struct ice_tx_buf *tx_buf) +{ + struct sk_buff *skb = tx_buf->skb; + struct ice_netdev_priv *np = + netdev_priv(skb->dev); + +#ifdef NETIF_F_GSO_UDP_L4 + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) + np->vsi->back->udp_segs += tx_buf->gso_segs; + else + np->vsi->back->tcp_segs += tx_buf->gso_segs; +#else + np->vsi->back->tcp_segs += tx_buf->gso_segs; +#endif /* NETIF_F_GSO_UDP_L4 */ } +#endif /* ICE_ADD_PROBES */ /** * ice_tso - computes mss and TSO length to prepare for TSO @@ -1899,10 +2851,12 @@ int ice_tso(struct ice_tx_buf *first, struct ice_tx_offload_params *off) } ip; union { struct tcphdr *tcp; + struct udphdr *udp; unsigned char *hdr; } l4; u64 cd_mss, cd_tso_len; - u32 paylen, l4_start; + u32 paylen; + u8 l4_start; int err; if (skb->ip_summed != CHECKSUM_PARTIAL) @@ -1915,7 +2869,6 @@ int ice_tso(struct ice_tx_buf *first, struct ice_tx_offload_params *off) if (err < 0) return err; - /* cppcheck-suppress unreadVariable */ ip.hdr = skb_network_header(skb); l4.hdr = skb_transport_header(skb); @@ -1927,15 +2880,74 @@ int ice_tso(struct ice_tx_buf *first, struct ice_tx_offload_params *off) ip.v6->payload_len = 0; } + if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | +#ifdef NETIF_F_GSO_PARTIAL + SKB_GSO_GRE_CSUM | +#endif +#ifdef NETIF_F_GSO_IPXIP4 + SKB_GSO_IPXIP4 | + SKB_GSO_IPXIP6 | +#else +#ifdef NETIF_F_GSO_IPIP + SKB_GSO_IPIP | + SKB_GSO_SIT | +#endif +#endif /* NETIF_F_GSO_IPXIP4 */ + SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM)) { +#ifndef NETIF_F_GSO_PARTIAL + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM) { +#else + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) && + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) { +#endif + l4.udp->len = 0; + + /* determine offset of outer transport header */ + l4_start = (u8)(l4.hdr - skb->data); + + /* remove payload length from outer checksum */ + paylen = skb->len - l4_start; + csum_replace_by_diff(&l4.udp->check, 
+ (__force __wsum)htonl(paylen)); + } + + /* reset pointers to inner headers */ + + ip.hdr = skb_inner_network_header(skb); + l4.hdr = skb_inner_transport_header(skb); + + /* initialize inner IP header fields */ + if (ip.v4->version == 4) { + ip.v4->tot_len = 0; + ip.v4->check = 0; + } else { + ip.v6->payload_len = 0; + } + } + /* determine offset of transport header */ - l4_start = l4.hdr - skb->data; + l4_start = (u8)(l4.hdr - skb->data); /* remove payload length from checksum */ paylen = skb->len - l4_start; - csum_replace_by_diff(&l4.tcp->check, (__force __wsum)htonl(paylen)); - /* compute length of segmentation header */ - off->header_len = (l4.tcp->doff * 4) + l4_start; +#ifdef NETIF_F_GSO_UDP_L4 + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) { + csum_replace_by_diff(&l4.udp->check, + (__force __wsum)htonl(paylen)); + /* compute length of UDP segmentation header */ + off->header_len = (u8)sizeof(l4.udp) + l4_start; + } else { + csum_replace_by_diff(&l4.tcp->check, + (__force __wsum)htonl(paylen)); + /* compute length of TCP segmentation header */ + off->header_len = (u8)((l4.tcp->doff * 4) + l4_start); + } +#else + csum_replace_by_diff(&l4.tcp->check, (__force __wsum)htonl(paylen)); + off->header_len = (u8)((l4.tcp->doff * 4) + l4_start); +#endif /* NETIF_F_GSO_UDP_L4 */ /* update gso_segs and bytecount */ first->gso_segs = skb_shinfo(skb)->gso_segs; @@ -1950,6 +2962,9 @@ int ice_tso(struct ice_tx_buf *first, struct ice_tx_offload_params *off) (cd_tso_len << ICE_TXD_CTX_QW1_TSO_LEN_S) | (cd_mss << ICE_TXD_CTX_QW1_MSS_S)); first->tx_flags |= ICE_TX_FLAGS_TSO; +#ifdef ICE_ADD_PROBES + ice_update_gso_cntr(first); +#endif return 1; } @@ -2040,7 +3055,7 @@ static bool __ice_chk_linearize(struct sk_buff *skb) frag = &skb_shinfo(skb)->frags[0]; /* Initialize size to the negative value of gso_size minus 1. We - * use this as the worst case scenerio in which the frag ahead + * use this as the worst case scenario in which the frag ahead * of us only provides one byte which is why we are limited to 6 * descriptors for a single transmit as the header and previous * fragment are already consuming 2 descriptors. @@ -2057,10 +3072,30 @@ static bool __ice_chk_linearize(struct sk_buff *skb) /* Walk through fragments adding latest fragment, testing it, and * then removing stale fragments from the sum. */ - stale = &skb_shinfo(skb)->frags[0]; - for (;;) { + for (stale = &skb_shinfo(skb)->frags[0];; stale++) { + int stale_size = skb_frag_size(stale); + sum += skb_frag_size(frag++); + /* The stale fragment may present us with a smaller + * descriptor than the actual fragment size. To account + * for that we need to remove all the data on the front and + * figure out what the remainder would be in the last + * descriptor associated with the fragment. 
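ice_tso() finishes by assembling a context descriptor: the descriptor type, the TSO command, the total TSO payload length and the MSS are each shifted into quad word 1, as the cd_qw1 expression above shows. A hedged sketch of that quad-word assembly with invented shift positions; the real ICE_TXD_CTX_QW1_* layout is defined in ice_lan_tx_rx.h.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Invented layout for illustration -- not the hardware definition */
#define CTX_DTYPE_CTX		0x1ULL
#define CTX_QW1_CMD_S		4
#define CTX_CMD_TSO		0x1ULL
#define CTX_QW1_TSO_LEN_S	30
#define CTX_QW1_MSS_S		50

static uint64_t build_tso_ctx_qw1(uint64_t tso_len, uint64_t mss)
{
	return CTX_DTYPE_CTX |
	       (CTX_CMD_TSO << CTX_QW1_CMD_S) |
	       (tso_len << CTX_QW1_TSO_LEN_S) |
	       (mss << CTX_QW1_MSS_S);
}

int main(void)
{
	/* 64 KiB GSO skb minus a 54-byte header, cut into 1448-byte segments */
	uint64_t qw1 = build_tso_ctx_qw1(65536 - 54, 1448);

	printf("ctx qw1 = 0x%016" PRIx64 "\n", qw1);
	return 0;
}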
+ */ + if (stale_size > ICE_MAX_DATA_PER_TXD) { + int align_pad = -(skb_frag_off(stale)) & + (ICE_MAX_READ_REQ_SIZE - 1); + + sum -= align_pad; + stale_size -= align_pad; + + do { + sum -= ICE_MAX_DATA_PER_TXD_ALIGNED; + stale_size -= ICE_MAX_DATA_PER_TXD_ALIGNED; + } while (stale_size > ICE_MAX_DATA_PER_TXD); + } + /* if sum is negative we failed to make sufficient progress */ if (sum < 0) return true; @@ -2068,7 +3103,7 @@ static bool __ice_chk_linearize(struct sk_buff *skb) if (!nr_frags--) break; - sum -= skb_frag_size(stale++); + sum -= stale_size; } return false; @@ -2096,6 +3131,317 @@ static bool ice_chk_linearize(struct sk_buff *skb, unsigned int count) return count != ICE_MAX_BUF_TXD; } +/** + * ice_get_queue_based_on_mark - determine the Tx queue based on mark value + * @vsi: pointer to VSI + * @mark: mark value (skb->mark) + * @queue: return the Tx queue number + * + * Based on mark value (which comes form skb->mark as a result of SO_MARK + * socket option), determine the Tx queue, which gets used to align flow + * to HW queue. + */ +static bool ice_get_queue_based_on_mark(struct ice_vsi *vsi, u32 mark, + u16 *queue) +{ + int v_idx; + + ice_for_each_q_vector(vsi, v_idx) { + struct ice_q_vector *q_vector = vsi->q_vectors[v_idx]; + struct ice_ring *tx_ring; + + if (!q_vector) + continue; + if (q_vector->napi.napi_id != mark) + continue; + + /* Now we located matching "q_vector:napi_struct" based + * on "mark (as napi_id) + */ + + /* for now use first tx_ring:q_index */ + ice_for_each_ring(tx_ring, q_vector->tx) { + *queue = tx_ring->q_index; + return true; + } + } + return false; +} + +/** + * ice_chnl_inline_fd - Add a Flow director ATR filter + * @tx_ring: ring to add programming descriptor + * @skb: send buffer + * @tx_flags: Tx flags + */ +static void ice_chnl_inline_fd(struct ice_ring *tx_ring, struct sk_buff *skb, + u32 tx_flags) +{ + struct ice_q_vector *qv = tx_ring->q_vector; + struct ice_fd_fltr_desc_ctx fd_ctx = { 0 }; + struct ice_channel *ch = tx_ring->ch; + struct ice_fltr_desc *fdir_desc; + union { + unsigned char *network; + struct iphdr *ipv4; + } hdr; + struct tcphdr *th; + unsigned int hlen; + u16 q_index = 0; + u16 i, vsi_num; + u8 l4_proto; + + /* Currently only IPv4/IPv6 with TCP is supported */ + if (!(tx_flags & (ICE_TX_FLAGS_IPV4 | ICE_TX_FLAGS_IPV6))) + return; + + /* make sure channel VSI is valid and vector is channel enabled */ + if (!ch->ch_vsi || !qv->ch) + return; + + /* do not support inline-FD usage for queues which are + * not in range of channel's queue region. + */ + if (tx_ring->q_index < ch->base_q) + return; + + /* make sure channel VSI is FD capable and enabled for + * inline flow-director usage + */ + if (!ice_vsi_fd_ena(ch->ch_vsi) || + !ice_vsi_inline_fd_ena(ch->ch_vsi)) + return; + + /* snag network header to get L4 type and address */ + hdr.network = (tx_flags & ICE_TX_FLAGS_TUNNEL) ? 
+ skb_inner_network_header(skb) : skb_network_header(skb); + + if (tx_flags & ICE_TX_FLAGS_IPV4) { + /* access ihl as u8 to avoid unaligned access on ia64 */ + hlen = (hdr.network[0] & 0x0F) << 2; + hdr.ipv4 = ip_hdr(skb); + l4_proto = hdr.ipv4->protocol; + } else if (tx_flags & ICE_TX_FLAGS_IPV6) { + /* find the start of the innermost ipv6 header */ + unsigned int inner_hlen = hdr.network - skb->data; + unsigned int h_offset = inner_hlen; + + /* this function updates h_offset to the end of the header */ + l4_proto = ipv6_find_hdr(skb, &h_offset, IPPROTO_TCP, NULL, + NULL); + hlen = h_offset - inner_hlen; + } else { + return; /* Unsupported protocol */ + } + + /* Currently ATR is supported only for TCP */ + if (l4_proto != IPPROTO_TCP) + return; + + th = (struct tcphdr *)(hdr.network + hlen); + + if (ice_vsi_inline_fd_mark_ena(ch->ch_vsi)) { + /* proceed only for MARK, SYN, SYN+ACK, RST, FIN packets */ + if (!skb->mark && !th->syn && !th->rst && !th->fin) + return; + } else { + /* proceed only for SYN, SYN+ACK, RST, FIN packets */ + if (!th->syn && !th->rst && !th->fin) + return; + } + + /* update queue as needed using channel's base_q, this queue number + * gets programmed in filter descriptor while adding inline-FD entry + */ + if (skb->mark && ice_vsi_inline_fd_mark_ena(ch->ch_vsi)) { +#ifdef HAVE_MIN_NAPI_ID + if (skb->mark < MIN_NAPI_ID) + return; +#endif /* HAVE_MIN_NAPI_ID */ + + if (!skb->sk) + return; + + /* skb->mark is part of union {mark, reserved_tailroom}. + * Hence explicit check (to avoid false positive) to make + * sure it is (skb->mark) is same as sk->mark. + */ + if (skb->mark != skb->sk->sk_mark) + return; + + /* if current vector/queue is already aligned (as indicated + * by skb->mark (napi_id), no action needed. + */ + if (skb->mark == qv->napi.napi_id) + return; + + /* Unsupported config for now */ + if (qv->num_ring_tx > 1) + return; + /* now locate ring/queue using based on skb->mark as napi_id */ + if (!ice_get_queue_based_on_mark(qv->vsi, skb->mark, &q_index)) + return; + + /* all checks are passed, proceed with inline-FD programming */ + q_index -= ch->base_q; + } else if (th->ack || th->fin || th->rst) { + /* server side connection setup || connection_termination */ + q_index = tx_ring->q_index - ch->base_q; + } else if (th->syn) { + /* just SYN, client side connection establishment. + * since channel's num_txq and num_rxq has to be same, + * using either num_rxq or num_txq is OK, but for readability + * perspective, using 'num_txq' since this is transmit flow + */ + q_index = (atomic_inc_return(&ch->fd_queue) - 1) % ch->num_txq; + } else { + /* dont proceed */ + return; + } + + /* use channel specific HW VSI number */ + vsi_num = ch->ch_vsi->vsi_num; + + if (th->syn && th->ack) { + /* server side connection establishment, hence SYN+ACK. + * proceed only when filter type for channel is of type dest + * port or src+dest port. 
This is to handle server (target) + * side use case where server side filter is either + * based on dest port or src+dest port + */ + if (!(ch->fltr_type == ICE_CHNL_FLTR_TYPE_DEST_PORT || + ch->fltr_type == ICE_CHNL_FLTR_TYPE_SRC_DEST_PORT || + ch->fltr_type == ICE_CHNL_FLTR_TYPE_TENANT_ID)) + return; + + if (atomic_dec_if_positive(&qv->inline_fd_cnt) < 0) { + /* bailout */ +#ifdef ADQ_PERF_COUNTERS + tx_ring->ch_q_stats.tx.num_atr_bailouts++; +#endif /* ADQ_PERF_COUNTERS */ + return; + } +#ifdef ADQ_PERF_COUNTERS + tx_ring->ch_q_stats.tx.num_atr_setup++; +#endif /* ADQ_PERF_COUNTERS */ + } else if (th->syn) { +#ifdef ADQ_PERF_COUNTERS + struct ice_ring *ch_tx_ring; +#endif /* ADQ_PERF_COUNTERS */ + /* client side doing active connect, hence SYN. + * proceed only when filter type for channel is of type src + * port or src+dest port. This is to handle client (initiator) + * side, where filter type would be either based on + * src port or src+dest port. + */ + if (!(ch->fltr_type == ICE_CHNL_FLTR_TYPE_SRC_PORT || + ch->fltr_type == ICE_CHNL_FLTR_TYPE_TENANT_ID || + ch->fltr_type == ICE_CHNL_FLTR_TYPE_SRC_DEST_PORT)) + return; + +#ifdef ADQ_PERF_COUNTERS + ch_tx_ring = qv->vsi->tx_rings[q_index + ch->base_q]; + if (ch_tx_ring) + ch_tx_ring->ch_q_stats.tx.num_atr_setup++; +#endif /* ADQ_PERF_COUNTERS */ + } else if (th->fin || th->rst) { +#ifdef ADQ_PERF_COUNTERS + tx_ring->ch_q_stats.tx.num_atr_evict++; +#endif /* ADQ_PERF_COUNTERS */ + } else { + /* This case is due to skb-mark, no need to check again, + * It is handled previously + */ + + /* filter type must be valid, SO_MARK based FD programming + * is agnostic to client/server type connection, hence + * not checking specific type of filter + */ + if (ch->fltr_type == ICE_CHNL_FLTR_TYPE_INVALID || + ch->fltr_type == ICE_CHNL_FLTR_TYPE_LAST) + return; +#ifdef ADQ_PERF_COUNTERS + struct ice_ring *ch_tx_ring; + + ch_tx_ring = qv->vsi->tx_rings[q_index + ch->base_q]; + if (ch_tx_ring) + ch_tx_ring->ch_q_stats.tx.num_mark_atr_setup++; +#endif /* ADQ_PERF_COUNTERS */ + } + + /* grab the next descriptor */ + i = tx_ring->next_to_use; + fdir_desc = ICE_TX_FDIRDESC(tx_ring, i); + + i++; + tx_ring->next_to_use = (i < tx_ring->count) ? i : 0; + + ice_set_dflt_val_fd_desc(&fd_ctx); + + + /* set report completion to NONE, means flow-director programming + * status won't be informed to SW. + */ + fd_ctx.comp_report = ICE_FXD_FLTR_QW0_COMP_REPORT_NONE; + + /* Do not want auto-eviction of filter due to FIN/RST, eviction + * is managed by SW, to avoid possible problems with TCP half-close + * OR TCP simultaneous close from both side. + */ + fd_ctx.evict_ena = ICE_FXD_FLTR_QW0_EVICT_ENA_FALSE; + fd_ctx.qindex = q_index; + fd_ctx.cnt_index = tx_ring->ch_inline_fd_cnt_index; + fd_ctx.cnt_ena = ICE_FXD_FLTR_QW0_STAT_ENA_PKTS; + fd_ctx.pcmd = (th->fin || th->rst) ? 
+ ICE_FXD_FLTR_QW1_PCMD_REMOVE : + ICE_FXD_FLTR_QW1_PCMD_ADD; + fd_ctx.fd_vsi = vsi_num; + ice_set_fd_desc_val(&fd_ctx, fdir_desc); +} + +/** + * ice_tsyn - set up the tsyn context descriptor + * @tx_ring: ptr to the ring to send + * @skb: ptr to the skb we're sending + * @first: Tx buffer + * @off: Quad Word 1 + * @ptp_idx: ptp index to be filled in + * + * Returns NETDEV_TX_BUSY if not index avail, else OK + */ +static netdev_tx_t +ice_tsyn(struct ice_ring *tx_ring, struct sk_buff *skb, + struct ice_tx_buf *first, + struct ice_tx_offload_params *off, int *ptp_idx) +{ + struct ice_vsi *vsi = tx_ring->vsi; + int idx; + + if (!vsi->ptp_tx) + return NETDEV_TX_BUSY; + + /* Tx timestamps cannot be sampled when doing TSO */ + if (first->tx_flags & ICE_TX_FLAGS_TSO) + return NETDEV_TX_BUSY; + + idx = ice_ptp_get_ts_idx(vsi); + if (idx >= 0) { + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + vsi->ptp_tx_skb[idx] = skb_get(skb); + *ptp_idx = idx; + } else { + vsi->tx_hwtstamp_skipped++; + return NETDEV_TX_BUSY; + } + + off->cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX | + (ICE_TX_CTX_DESC_TSYN << ICE_TXD_CTX_QW1_CMD_S) | + ((u64)idx << ICE_TXD_CTX_QW1_TSO_LEN_S)); + first->tx_flags |= ICE_TX_FLAGS_TSYN; + + return NETDEV_TX_OK; +} + /** * ice_xmit_frame_ring - Sends buffer on Tx ring * @skb: send buffer @@ -2109,8 +3455,13 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_ring *tx_ring) struct ice_tx_offload_params offload = { 0 }; struct ice_vsi *vsi = tx_ring->vsi; struct ice_tx_buf *first; + struct ethhdr *eth; unsigned int count; + bool tsyn = true; int tso, csum; + int idx = -1; + + ice_trace(xmit_frame_ring, tx_ring, skb); count = ice_xmit_desc_count(skb); if (ice_chk_linearize(skb, count)) { @@ -2142,8 +3493,14 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_ring *tx_ring) first->tx_flags = 0; /* prepare the VLAN tagging flags for Tx */ - if (ice_tx_prepare_vlan_flags(tx_ring, first)) - goto out_drop; + ice_tx_prepare_vlan_flags(tx_ring, first); + if (first->tx_flags & ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN) { + offload.cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX | + (ICE_TX_CTX_DESC_IL2TAG2 << + ICE_TXD_CTX_QW1_CMD_S)); + offload.cd_l2tag2 = (first->tx_flags & ICE_TX_FLAGS_VLAN_M) >> + ICE_TX_FLAGS_VLAN_S; + } /* set up TSO offload */ tso = ice_tso(first, &offload); @@ -2156,16 +3513,34 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_ring *tx_ring) goto out_drop; /* allow CONTROL frames egress from main VSI if FW LLDP disabled */ - if (unlikely(skb->priority == TC_PRIO_CONTROL && + eth = (struct ethhdr *)skb_mac_header(skb); + if (unlikely((skb->priority == TC_PRIO_CONTROL || + eth->h_proto == htons(ETH_P_LLDP)) && + (!(tx_ring->ch && tx_ring->ch->ch_vsi)) && vsi->type == ICE_VSI_PF && - vsi->port_info->is_sw_lldp)) + vsi->port_info->qos_cfg.is_sw_lldp)) offload.cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX | ICE_TX_CTX_DESC_SWTCH_UPLINK << ICE_TXD_CTX_QW1_CMD_S); + /* only timestamp the outbound packet if the user has requested it */ +#ifdef SKB_SHARED_TX_IS_UNION + if (likely(!(skb_tx(skb)->hardware))) +#else + if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))) +#endif /* SKB_SHARED_TX_IS_UNION */ + tsyn = false; + + if (tsyn && + ice_tsyn(tx_ring, skb, first, &offload, &idx) == NETDEV_TX_BUSY) + goto out_ptp_drop; +#if IS_ENABLED(CONFIG_NET_DEVLINK) + if (ice_is_switchdev_running(vsi->back)) + ice_eswitch_set_target_vsi(skb, &offload); +#endif /* CONFIG_NET_DEVLINK */ if (offload.cd_qw1 & ICE_TX_DESC_DTYPE_CTX) { struct ice_tx_ctx_desc *cdesc; - int i = tx_ring->next_to_use; + u16 
i = tx_ring->next_to_use; /* grab the next descriptor */ cdesc = ICE_TX_CTX_DESC(tx_ring, i); @@ -2179,12 +3554,21 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_ring *tx_ring) cdesc->qw1 = cpu_to_le64(offload.cd_qw1); } + + if (ice_ring_ch_enabled(tx_ring)) + ice_chnl_inline_fd(tx_ring, skb, first->tx_flags); + + first->ptp_ts_idx = idx; + ice_tx_map(tx_ring, first, &offload); return NETDEV_TX_OK; out_drop: + ice_trace(xmit_frame_ring_drop, tx_ring, skb); dev_kfree_skb_any(skb); return NETDEV_TX_OK; +out_ptp_drop: + return NETDEV_TX_BUSY; } /** @@ -2210,3 +3594,86 @@ netdev_tx_t ice_start_xmit(struct sk_buff *skb, struct net_device *netdev) return ice_xmit_frame_ring(skb, tx_ring); } + +/** + * ice_clean_ctrl_tx_irq - interrupt handler for flow director Tx queue + * @tx_ring: tx_ring to clean + */ +void ice_clean_ctrl_tx_irq(struct ice_ring *tx_ring) +{ + struct ice_vsi *vsi = tx_ring->vsi; + s16 i = tx_ring->next_to_clean; + int budget = ICE_DFLT_IRQ_WORK; + struct ice_tx_desc *tx_desc; + struct ice_tx_buf *tx_buf; + + tx_buf = &tx_ring->tx_buf[i]; + tx_desc = ICE_TX_DESC(tx_ring, i); + i -= tx_ring->count; + + do { + struct ice_tx_desc *eop_desc = tx_buf->next_to_watch; + + /* if next_to_watch is not set then there is no pending work */ + if (!eop_desc) + break; + + /* prevent any other reads prior to eop_desc */ + smp_rmb(); + + /* if the descriptor isn't done, no work to do */ + if (!(eop_desc->cmd_type_offset_bsz & + cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE))) + break; + + /* clear next_to_watch to prevent false hangs */ + tx_buf->next_to_watch = NULL; + tx_desc->buf_addr = 0; + tx_desc->cmd_type_offset_bsz = 0; + + /* move past filter desc */ + tx_buf++; + tx_desc++; + i++; + if (unlikely(!i)) { + i -= tx_ring->count; + tx_buf = tx_ring->tx_buf; + tx_desc = ICE_TX_DESC(tx_ring, 0); + } + + /* unmap the data header */ + if (dma_unmap_len(tx_buf, len)) + dma_unmap_single(tx_ring->dev, + dma_unmap_addr(tx_buf, dma), + dma_unmap_len(tx_buf, len), + DMA_TO_DEVICE); + if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT) + devm_kfree(tx_ring->dev, tx_buf->raw_buf); + + /* clear next_to_watch to prevent false hangs */ + tx_buf->raw_buf = NULL; + tx_buf->tx_flags = 0; + tx_buf->next_to_watch = NULL; + dma_unmap_len_set(tx_buf, len, 0); + tx_desc->buf_addr = 0; + tx_desc->cmd_type_offset_bsz = 0; + + /* move past eop_desc for start of next FD desc */ + tx_buf++; + tx_desc++; + i++; + if (unlikely(!i)) { + i -= tx_ring->count; + tx_buf = tx_ring->tx_buf; + tx_desc = ICE_TX_DESC(tx_ring, 0); + } + + budget--; + } while (likely(budget)); + + i += tx_ring->count; + tx_ring->next_to_clean = i; + + /* re-enable interrupt if needed */ + ice_irq_dynamic_ena(&vsi->back->hw, vsi, vsi->q_vectors[0]); +} diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index 94a9280193e2..054982dbf1c3 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -1,11 +1,15 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ #ifndef _ICE_TXRX_H_ #define _ICE_TXRX_H_ +#include "ice_type.h" + #define ICE_DFLT_IRQ_WORK 256 +#define ICE_RXBUF_3072 3072 #define ICE_RXBUF_2048 2048 +#define ICE_RXBUF_1536 1536 #define ICE_MAX_CHAINED_RX_BUFS 5 #define ICE_MAX_BUF_TXD 8 #define ICE_MIN_TX_LEN 17 @@ -22,6 +26,72 @@ #define ICE_RX_BUF_WRITE 16 /* Must be power of 2 */ #define ICE_MAX_TXQ_PER_TXQG 128 +/* Attempt to maximize the headroom available for incoming frames. We use a 2K + * buffer for MTUs <= 1500 and need 1536/1534 to store the data for the frame. + * This leaves us with 512 bytes of room. From that we need to deduct the + * space needed for the shared info and the padding needed to IP align the + * frame. + * + * Note: For cache line sizes 256 or larger this value is going to end + * up negative. In these cases we should fall back to the legacy + * receive path. + */ +#if (PAGE_SIZE < 8192) +#define ICE_2K_TOO_SMALL_WITH_PADDING \ + ((unsigned int)(NET_SKB_PAD + ICE_RXBUF_1536) > \ + SKB_WITH_OVERHEAD(ICE_RXBUF_2048)) + +/** + * ice_compute_pad - compute the padding + * @rx_buf_len: buffer length + * + * Figure out the size of half page based on given buffer length and + * then subtract the skb_shared_info followed by subtraction of the + * actual buffer length; this in turn results in the actual space that + * is left for padding usage + */ +static inline int ice_compute_pad(int rx_buf_len) +{ + int half_page_size; + + half_page_size = ALIGN(rx_buf_len, PAGE_SIZE / 2); + return SKB_WITH_OVERHEAD(half_page_size) - rx_buf_len; +} + +/** + * ice_skb_pad - determine the padding that we can supply + * + * Figure out the right Rx buffer size and based on that calculate the + * padding + */ +static inline int ice_skb_pad(void) +{ + int rx_buf_len; + + /* If a 2K buffer cannot handle a standard Ethernet frame then + * optimize padding for a 3K buffer instead of a 1.5K buffer. + * + * For a 3K buffer we need to add enough padding to allow for + * tailroom due to NET_IP_ALIGN possibly shifting us out of + * cache-line alignment. + */ + if (ICE_2K_TOO_SMALL_WITH_PADDING) + rx_buf_len = ICE_RXBUF_3072 + SKB_DATA_ALIGN(NET_IP_ALIGN); + else + rx_buf_len = ICE_RXBUF_1536; + + /* if needed make room for NET_IP_ALIGN */ + rx_buf_len -= NET_IP_ALIGN; + + return ice_compute_pad(rx_buf_len); +} + +#define ICE_SKB_PAD ice_skb_pad() +#else +#define ICE_2K_TOO_SMALL_WITH_PADDING false +#define ICE_SKB_PAD (NET_SKB_PAD + NET_IP_ALIGN) +#endif + /* We are assuming that the cache line is always 64 Bytes here for ice. * In order to make sure that is a correct assumption there is a check in probe * to print a warning if the read from GLPCI_CNF2 tells us that the cache line @@ -38,28 +108,53 @@ #define DESC_NEEDED (MAX_SKB_FRAGS + ICE_DESCS_FOR_CTX_DESC + \ ICE_DESCS_PER_CACHE_LINE + ICE_DESCS_FOR_SKB_DATA_PTR) #define ICE_DESC_UNUSED(R) \ - ((((R)->next_to_clean > (R)->next_to_use) ? 0 : (R)->count) + \ - (R)->next_to_clean - (R)->next_to_use - 1) + (u16)((((R)->next_to_clean > (R)->next_to_use) ? 0 : (R)->count) + \ + (R)->next_to_clean - (R)->next_to_use - 1) #define ICE_TX_FLAGS_TSO BIT(0) #define ICE_TX_FLAGS_HW_VLAN BIT(1) #define ICE_TX_FLAGS_SW_VLAN BIT(2) +/* ICE_TX_FLAGS_DUMMY_PKT is used to mark dummy packets that should be + * freed instead of returned like skb packets. 
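+ * For illustration: the flow director Tx clean routine,
+ * ice_clean_ctrl_tx_irq(), tests this flag and releases the buffer with
+ * devm_kfree(tx_ring->dev, tx_buf->raw_buf) rather than dev_kfree_skb(),
+ * since a dummy programming packet never has an skb attached.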
+ */ +#define ICE_TX_FLAGS_DUMMY_PKT BIT(3) + +#define ICE_TX_FLAGS_TSYN BIT(4) +#define ICE_TX_FLAGS_IPV4 BIT(5) +#define ICE_TX_FLAGS_IPV6 BIT(6) +#define ICE_TX_FLAGS_TUNNEL BIT(7) +#define ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN BIT(8) #define ICE_TX_FLAGS_VLAN_M 0xffff0000 #define ICE_TX_FLAGS_VLAN_PR_M 0xe0000000 #define ICE_TX_FLAGS_VLAN_PR_S 29 #define ICE_TX_FLAGS_VLAN_S 16 +#ifdef HAVE_XDP_SUPPORT + +#define ICE_XDP_PASS 0 +#define ICE_XDP_CONSUMED BIT(0) +#define ICE_XDP_TX BIT(1) +#define ICE_XDP_REDIR BIT(2) +#endif /* HAVE_XDP_SUPPORT */ #define ICE_RX_DMA_ATTR \ (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) +#define ICE_ETH_PKT_HDR_PAD (ETH_HLEN + ETH_FCS_LEN + (VLAN_HLEN * 2)) + +#define ICE_TXD_LAST_DESC_CMD (ICE_TX_DESC_CMD_EOP | ICE_TX_DESC_CMD_RS) + struct ice_tx_buf { struct ice_tx_desc *next_to_watch; - struct sk_buff *skb; + union { + struct sk_buff *skb; + void *raw_buf; /* used for XDP */ + }; unsigned int bytecount; unsigned short gso_segs; u32 tx_flags; DEFINE_DMA_UNMAP_LEN(len); DEFINE_DMA_UNMAP_ADDR(dma); + int ptp_ts_idx; }; struct ice_tx_offload_params { @@ -76,14 +171,66 @@ struct ice_tx_offload_params { struct ice_rx_buf { struct sk_buff *skb; dma_addr_t dma; - struct page *page; - unsigned int page_offset; - u16 pagecnt_bias; + union { + struct { + struct page *page; + unsigned int page_offset; + u16 pagecnt_bias; + }; +#ifdef HAVE_AF_XDP_ZC_SUPPORT + struct { + union { + struct xdp_buff *xdp; + void *addr; + }; +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + u64 handle; +#endif + }; +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + }; +}; + +#ifdef ADQ_PERF_COUNTERS +struct ice_ch_tx_q_stats { + u64 num_atr_setup; /* How many times, ATR is setup */ + u64 num_mark_atr_setup; /* How many times, ATR is setup via skb->mark */ + u64 num_atr_evict; /* How many times, ATR is teardown */ + u64 num_atr_bailouts; /* How many times, bailout from ATR setup */ +}; + +struct ice_ch_rx_q_stats { + u64 num_rx_queue_set; /* times, Rx queue set for ATR */ + u64 num_rx_queue_bailouts; /* times bailed out from ATR setup */ + u64 num_tcp_ctrl_pkts; + u64 num_only_ctrl_pkts; + u64 num_no_data_pkt_bp; + u64 num_tcp_flags_fin; + u64 num_tcp_flags_rst; + u64 num_tcp_flags_syn; +}; + +struct ice_ch_q_poll_stats { + /* general packet counters for busy_poll versus napi_poll */ + u64 bp_packets; + u64 np_packets; }; +struct ice_ch_q_stats { + struct ice_ch_q_poll_stats poll; + union { + struct ice_ch_tx_q_stats tx; + struct ice_ch_rx_q_stats rx; + }; +}; +#endif /* ADQ_PERF_COUNTERS */ + struct ice_q_stats { u64 pkts; u64 bytes; +#ifdef ICE_ADD_PROBES + u64 napi_poll_cnt; +#endif /* ICE_ADD_PROBES */ }; struct ice_txq_stats { @@ -97,7 +244,14 @@ struct ice_rxq_stats { u64 non_eop_descs; u64 alloc_page_failed; u64 alloc_buf_failed; - u64 page_reuse_count; +#ifdef ICE_ADD_PROBES + u64 page_reuse; +#endif /* ICE_ADD_PROBES */ +}; + +enum ice_ring_state_t { + ICE_TX_XPS_INIT_DONE, + ICE_TX_NBITS, }; /* this enum matches hardware bits and is meant to be used by DYN_CTLN @@ -124,27 +278,23 @@ enum ice_rx_dtype { #define ICE_TX_ITR ICE_IDX_ITR1 #define ICE_ITR_8K 124 #define ICE_ITR_20K 50 -#define ICE_ITR_MAX 8160 -#define ICE_DFLT_TX_ITR (ICE_ITR_20K | ICE_ITR_DYNAMIC) -#define ICE_DFLT_RX_ITR (ICE_ITR_20K | ICE_ITR_DYNAMIC) -#define ICE_ITR_DYNAMIC 0x8000 /* used as flag for itr_setting */ -#define ITR_IS_DYNAMIC(setting) (!!((setting) & ICE_ITR_DYNAMIC)) -#define ITR_TO_REG(setting) ((setting) & ~ICE_ITR_DYNAMIC) +#define ICE_ITR_MAX 8160 /* 0x1FE0 */ +#define ICE_DFLT_TX_ITR ICE_ITR_20K +#define 
ICE_DFLT_RX_ITR ICE_ITR_20K +enum ice_dynamic_itr { + ITR_STATIC = 0, + ITR_DYNAMIC = 1 +}; + +#define ITR_IS_DYNAMIC(rc) ((rc)->itr_mode == ITR_DYNAMIC) #define ICE_ITR_GRAN_S 1 /* ITR granularity is always 2us */ #define ICE_ITR_GRAN_US BIT(ICE_ITR_GRAN_S) #define ICE_ITR_MASK 0x1FFE /* ITR register value alignment mask */ -#define ITR_REG_ALIGN(setting) __ALIGN_MASK(setting, ~ICE_ITR_MASK) - -#define ICE_ITR_ADAPTIVE_MIN_INC 0x0002 -#define ICE_ITR_ADAPTIVE_MIN_USECS 0x0002 -#define ICE_ITR_ADAPTIVE_MAX_USECS 0x00FA -#define ICE_ITR_ADAPTIVE_LATENCY 0x8000 -#define ICE_ITR_ADAPTIVE_BULK 0x0000 +#define ITR_REG_ALIGN(setting) ((setting) & ICE_ITR_MASK) #define ICE_DFLT_INTRL 0 #define ICE_MAX_INTRL 236 -#define ICE_WB_ON_ITR_USECS 2 #define ICE_IN_WB_ON_ITR_MODE 255 /* Sets WB_ON_ITR and assumes INTENA bit is already cleared, which allows * setting the MSK_M bit to tell hardware to ignore the INTENA_M bit. Also, @@ -161,6 +311,22 @@ enum ice_rx_dtype { #define ICE_TX_ADVANCED 0 #define ICE_TX_LEGACY 1 +#ifdef HAVE_XDP_SUPPORT +#ifdef ICE_ADD_PROBES +struct ice_xdp_stats { + u64 xdp_rx_pkts; + u64 xdp_rx_bytes; + u64 xdp_pass; + u64 xdp_drop; + u64 xdp_tx; + u64 xdp_tx_fail; + u64 xdp_unknown; + u64 xdp_redirect; + u64 xdp_redirect_fail; +}; +#endif +#endif + /* descriptor ring, associated with a VSI */ struct ice_ring { /* CL1 - 1st cacheline starts here */ @@ -179,15 +345,21 @@ struct ice_ring { u16 q_index; /* Queue number of ring */ u16 q_handle; /* Queue handle per TC */ - u8 ring_active:1; /* is ring online or not */ - u16 count; /* Number of descriptors */ u16 reg_idx; /* HW register index of the ring */ /* used in interrupt processing */ u16 next_to_use; u16 next_to_clean; - u16 next_to_alloc; + union { + u16 next_to_alloc; + u16 next_rs_idx; + }; +#ifdef HAVE_XDP_SUPPORT +#ifdef HAVE_AF_XDP_ZC_SUPPORT + u16 xdp_tx_active; +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ +#endif /* HAVE_XDP_SUPPORT */ /* stats structs */ struct ice_q_stats stats; @@ -198,38 +370,119 @@ struct ice_ring { }; struct rcu_head rcu; /* to avoid race on free */ + DECLARE_BITMAP(xps_state, ICE_TX_NBITS); /* XPS Config State */ + struct ice_channel *ch; +#ifdef HAVE_XDP_SUPPORT + struct bpf_prog *xdp_prog; +#ifdef ICE_ADD_PROBES + struct ice_xdp_stats xdp_stats; +#endif +#ifdef HAVE_AF_XDP_ZC_SUPPORT +#ifdef HAVE_NETDEV_BPF_XSK_POOL + struct xsk_buff_pool *xsk_pool; +#else + struct xdp_umem *xsk_pool; +#endif +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + struct zero_copy_allocator zca; +#endif +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + /* CL3 - 3rd cacheline starts here */ +#ifdef HAVE_XDP_BUFF_RXQ + struct xdp_rxq_info xdp_rxq; +#endif /* HAVE_XDP_BUFF_RXQ */ +#endif /* HAVE_XDP_SUPPORT */ + /* CLX - the below items are only accessed infrequently and should be * in their own cache line if possible */ +#ifdef HAVE_XDP_SUPPORT +#define ICE_TX_FLAGS_RING_XDP BIT(0) +#endif /* HAVE_XDP_SUPPORT */ +#define ICE_RX_FLAGS_RING_BUILD_SKB BIT(1) +#define ICE_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1 BIT(2) +#define ICE_TX_FLAGS_VLAN_TAG_LOC_L2TAG2 BIT(3) + u8 flags; dma_addr_t dma; /* physical address of ring */ unsigned int size; /* length of descriptor ring in bytes */ u32 txq_teid; /* Added Tx queue TEID */ u16 rx_buf_len; -#ifdef CONFIG_DCB + u8 rx_crc_strip_dis; u8 dcb_tc; /* Traffic class of ring */ -#endif /* CONFIG_DCB */ + u64 cached_systime; + u8 ptp_rx:1; + u32 ch_inline_fd_cnt_index; +#ifdef ADQ_PERF_COUNTERS + struct ice_ch_q_stats ch_q_stats; +#endif /* ADQ_PERF_COUNTERS */ } ____cacheline_internodealigned_in_smp; +static inline bool 
ice_ring_uses_build_skb(struct ice_ring *ring) +{ + return !!(ring->flags & ICE_RX_FLAGS_RING_BUILD_SKB); +} + +static inline void ice_set_ring_build_skb_ena(struct ice_ring *ring) +{ + ring->flags |= ICE_RX_FLAGS_RING_BUILD_SKB; +} + +static inline void ice_clear_ring_build_skb_ena(struct ice_ring *ring) +{ + ring->flags &= ~ICE_RX_FLAGS_RING_BUILD_SKB; +} + +static inline bool ice_ring_ch_enabled(struct ice_ring *ring) +{ + return !!(ring->ch); +} + +#ifdef HAVE_XDP_SUPPORT +static inline bool ice_ring_is_xdp(struct ice_ring *ring) +{ + return !!(ring->flags & ICE_TX_FLAGS_RING_XDP); +} +#endif /* HAVE_XDP_SUPPORT */ + struct ice_ring_container { /* head of linked-list of rings */ struct ice_ring *ring; - unsigned long next_update; /* jiffies value of next queue update */ - unsigned int total_bytes; /* total bytes processed this int */ - unsigned int total_pkts; /* total packets processed this int */ + struct dim dim; /* data for net_dim algorithm */ u16 itr_idx; /* index in the interrupt vector */ - u16 target_itr; /* value in usecs divided by the hw->itr_gran */ - u16 current_itr; /* value in usecs divided by the hw->itr_gran */ - /* high bit set means dynamic ITR, rest is used to store user - * readable ITR value in usecs and must be converted before programming - * to a register. + /* this matches the maximum number of ITR bits, but in usec + * values, so it is shifted left one bit (bit zero is ignored) */ - u16 itr_setting; + u16 itr_setting:13; + u16 itr_reserved:2; + u16 itr_mode:1; +}; + +struct ice_coalesce_stored { + u16 itr_tx; + u16 itr_rx; + u8 intrl; + u8 tx_valid; + u8 rx_valid; }; /* iterator for handling rings in ring container */ #define ice_for_each_ring(pos, head) \ for (pos = (head).ring; pos; pos = pos->next) +static inline unsigned int ice_rx_pg_order(struct ice_ring *ring) +{ +#if (PAGE_SIZE < 8192) + if (ring->rx_buf_len > (PAGE_SIZE / 2)) + return 1; +#endif + return 0; +} + +#define ice_rx_pg_size(_ring) (PAGE_SIZE << ice_rx_pg_order(_ring)) + + +union ice_32b_rx_flex_desc; + bool ice_alloc_rx_bufs(struct ice_ring *rxr, u16 cleaned_count); netdev_tx_t ice_start_xmit(struct sk_buff *skb, struct net_device *netdev); void ice_clean_tx_ring(struct ice_ring *tx_ring); @@ -239,5 +492,9 @@ int ice_setup_rx_ring(struct ice_ring *rx_ring); void ice_free_tx_ring(struct ice_ring *tx_ring); void ice_free_rx_ring(struct ice_ring *rx_ring); int ice_napi_poll(struct napi_struct *napi, int budget); - +int +ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc, + u8 *raw_packet); +int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget); +void ice_clean_ctrl_tx_irq(struct ice_ring *tx_ring); #endif /* _ICE_TXRX_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c new file mode 100644 index 000000000000..261a8635e8b6 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c @@ -0,0 +1,391 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice_txrx_lib.h" +#include "ice_eswitch.h" + +/** + * ice_release_rx_desc - Store the new tail and head values + * @rx_ring: ring to bump + * @val: new head index + */ +void ice_release_rx_desc(struct ice_ring *rx_ring, u16 val) +{ + u16 prev_ntu = rx_ring->next_to_use & ~0x7; + + rx_ring->next_to_use = val; + + /* update next to alloc since we have filled the ring */ + rx_ring->next_to_alloc = val; + + /* QRX_TAIL will be updated with any tail value, but hardware ignores + * the lower 3 bits. 
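+ * (Worked example with assumed values: if next_to_use was 18, prev_ntu is
+ * 18 & ~0x7 = 16; a new val of 21 also masks down to 16, so no tail write
+ * happens, while a val of 24 masks to 24 and does reach the
+ * writel_relaxed() below.)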
This makes it so we only bump tail on meaningful + * boundaries. Also, this allows us to bump tail on intervals of 8 up to + * the budget depending on the current traffic load. + */ + val &= ~0x7; + if (prev_ntu != val) { + /* Force memory writes to complete before letting h/w + * know there are new descriptors to fetch. (Only + * applicable for weak-ordered memory model archs, + * such as IA-64). + */ + wmb(); + writel_relaxed(val, rx_ring->tail); + } +} + +/** + * ice_ptype_to_htype - get a hash type + * @ptype: the ptype value from the descriptor + * + * Returns appropriate hash type (such as PKT_HASH_TYPE_L2/L3/L4) to be used by + * skb_set_hash based on PTYPE as parsed by HW Rx pipeline and is part of + * Rx desc. + */ +static enum pkt_hash_types ice_ptype_to_htype(u16 ptype) +{ + struct ice_rx_ptype_decoded decoded = ice_decode_rx_desc_ptype(ptype); + + if (!decoded.known) + return PKT_HASH_TYPE_NONE; + if (decoded.payload_layer == ICE_RX_PTYPE_PAYLOAD_LAYER_PAY4) + return PKT_HASH_TYPE_L4; + if (decoded.payload_layer == ICE_RX_PTYPE_PAYLOAD_LAYER_PAY3) + return PKT_HASH_TYPE_L3; + if (decoded.outer_ip == ICE_RX_PTYPE_OUTER_L2) + return PKT_HASH_TYPE_L2; + + return PKT_HASH_TYPE_NONE; +} + +/** + * ice_rx_hash - set the hash value in the skb + * @rx_ring: descriptor ring + * @rx_desc: specific descriptor + * @skb: pointer to current skb + * @rx_ptype: the ptype value from the descriptor + */ +static void +ice_rx_hash(struct ice_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc, + struct sk_buff *skb, u16 rx_ptype) +{ + struct ice_32b_rx_flex_desc_nic *nic_mdid; + u32 hash; + + if (!(rx_ring->netdev->features & NETIF_F_RXHASH)) + return; + + if (rx_desc->wb.rxdid != ICE_RXDID_FLEX_NIC) + return; + + nic_mdid = (struct ice_32b_rx_flex_desc_nic *)rx_desc; + hash = le32_to_cpu(nic_mdid->rss_hash); + skb_set_hash(skb, hash, ice_ptype_to_htype(rx_ptype)); +} + +#ifdef ICE_ADD_PROBES +/** + * ice_rx_extra_counters - Update Rx csum counters + * @vsi: the VSI we care about + * @rx_status: status bits extracted from desc qword + * @inner_prot: inner protocol of decoded ptype + * @is_ipv4: if ptype is ipv4 or not + */ +static void +ice_rx_extra_counters(struct ice_vsi *vsi, u16 rx_status, u32 inner_prot, + bool is_ipv4) +{ + if (is_ipv4) { + vsi->back->rx_ip4_cso++; + if (rx_status & (BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_IPE_S) | + BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EIPE_S))) + vsi->back->rx_ip4_cso_err++; + } + + switch (inner_prot) { + case ICE_RX_PTYPE_INNER_PROT_TCP: + vsi->back->rx_tcp_cso++; + if (rx_status & BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_L4E_S)) + vsi->back->rx_tcp_cso_err++; + break; + case ICE_RX_PTYPE_INNER_PROT_UDP: + vsi->back->rx_udp_cso++; + if (rx_status & BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_L4E_S)) + vsi->back->rx_udp_cso_err++; + break; + case ICE_RX_PTYPE_INNER_PROT_SCTP: + vsi->back->rx_sctp_cso++; + if (rx_status & BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_L4E_S)) + vsi->back->rx_sctp_cso_err++; + break; + default: + break; + } +} +#endif /* ICE_ADD_PROBES */ + +/** + * ice_rx_csum - Indicate in skb if checksum is good + * @ring: the ring we care about + * @skb: skb currently being received and modified + * @rx_desc: the receive descriptor + * @ptype: the packet type decoded by hardware + * + * skb->protocol must be set before this function is called + */ +static void +ice_rx_csum(struct ice_ring *ring, struct sk_buff *skb, + union ice_32b_rx_flex_desc *rx_desc, u16 ptype) +{ + struct ice_rx_ptype_decoded decoded; + u16 rx_status0, rx_status1; + bool ipv4, ipv6; + + rx_status0 = 
le16_to_cpu(rx_desc->wb.status_error0); + rx_status1 = le16_to_cpu(rx_desc->wb.status_error1); + + decoded = ice_decode_rx_desc_ptype(ptype); + + + /* Start with CHECKSUM_NONE and by default csum_level = 0 */ + skb->ip_summed = CHECKSUM_NONE; + skb_checksum_none_assert(skb); + + /* check if Rx checksum is enabled */ + if (!(ring->netdev->features & NETIF_F_RXCSUM)) + return; + + + /* check if HW has decoded the packet and checksum */ + if (!(rx_status0 & BIT(ICE_RX_FLEX_DESC_STATUS0_L3L4P_S))) + return; + + if (!(decoded.known && decoded.outer_ip)) + return; + + ipv4 = (decoded.outer_ip == ICE_RX_PTYPE_OUTER_IP) && + (decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV4); + ipv6 = (decoded.outer_ip == ICE_RX_PTYPE_OUTER_IP) && + (decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV6); + +#ifdef ICE_ADD_PROBES + ice_rx_extra_counters(ring->vsi, rx_status0, decoded.inner_prot, ipv4); +#endif + + if (ipv4 && (rx_status0 & (BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_IPE_S) | + BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EIPE_S)))) + goto checksum_fail; + + if (ipv6 && (rx_status0 & (BIT(ICE_RX_FLEX_DESC_STATUS0_IPV6EXADD_S)))) + goto checksum_fail; + + /* check for L4 errors and handle packets that were not able to be + * checksummed due to arrival speed + */ + if (rx_status0 & BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_L4E_S)) + goto checksum_fail; + + /* check for outer UDP checksum error in tunneled packets */ + if ((rx_status1 & BIT(ICE_RX_FLEX_DESC_STATUS1_NAT_S)) && + (rx_status0 & BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EUDPE_S))) + goto checksum_fail; + + /* If there is an outer header present that might contain a checksum + * we need to bump the checksum level by 1 to reflect the fact that + * we are indicating we validated the inner checksum. + */ + if (decoded.tunnel_type >= ICE_RX_PTYPE_TUNNEL_IP_GRENAT) +#ifdef HAVE_SKBUFF_CSUM_LEVEL + skb->csum_level = 1; +#else + skb->encapsulation = 1; +#endif + + /* Only report checksum unnecessary for TCP, UDP, or SCTP */ + switch (decoded.inner_prot) { + case ICE_RX_PTYPE_INNER_PROT_TCP: + case ICE_RX_PTYPE_INNER_PROT_UDP: + case ICE_RX_PTYPE_INNER_PROT_SCTP: + skb->ip_summed = CHECKSUM_UNNECESSARY; + default: + break; + } + return; + +checksum_fail: + ring->vsi->back->hw_csum_rx_error++; +} + +/** + * ice_process_skb_fields - Populate skb header fields from Rx descriptor + * @rx_ring: Rx descriptor ring packet is being transacted on + * @rx_desc: pointer to the EOP Rx descriptor + * @skb: pointer to current skb being populated + * @ptype: the packet type decoded by hardware + * + * This function checks the ring, descriptor, and packet information in + * order to populate the hash, checksum, VLAN, protocol, and + * other fields within the skb. 
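+ *
+ * Illustrative call order from an Rx clean loop (a sketch with assumed local
+ * names; the real loop is ice_clean_rx_irq()):
+ *
+ *	u16 vlan_tag = ice_get_vlan_tag_from_rx_desc(rx_desc);
+ *
+ *	ice_process_skb_fields(rx_ring, rx_desc, skb, ptype);
+ *	ice_receive_skb(rx_ring, skb, vlan_tag);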
+ */ +void +ice_process_skb_fields(struct ice_ring *rx_ring, + union ice_32b_rx_flex_desc *rx_desc, + struct sk_buff *skb, u16 ptype) +{ + ice_rx_hash(rx_ring, rx_desc, skb, ptype); + + /* modifies the skb - consumes the enet header */ +#if IS_ENABLED(CONFIG_NET_DEVLINK) + skb->protocol = eth_type_trans(skb, ice_eswitch_get_target_netdev + (rx_ring, rx_desc)); +#else + skb->protocol = eth_type_trans(skb, rx_ring->netdev); +#endif /* CONFIG_NET_DEVLINK */ + + ice_rx_csum(rx_ring, skb, rx_desc, ptype); + if (rx_ring->ptp_rx) + ice_ptp_rx_hwtstamp(rx_ring, rx_desc, skb); + +#ifdef HAVE_NETDEV_SB_DEV + if (!netif_is_ice(rx_ring->netdev)) + macvlan_count_rx((const struct macvlan_dev *)netdev_priv(rx_ring->netdev), + skb->len + ETH_HLEN, true, false); +#endif /* HAVE_NETDEV_SB_DEV */ +} + +/** + * ice_receive_skb - Send a completed packet up the stack + * @rx_ring: Rx ring in play + * @skb: packet to send up + * @vlan_tag: VLAN tag for packet + * + * This function sends the completed packet (via. skb) up the stack using + * gro receive functions (with/without VLAN tag) + */ +void +ice_receive_skb(struct ice_ring *rx_ring, struct sk_buff *skb, u16 vlan_tag) +{ + netdev_features_t features = rx_ring->netdev->features; + bool non_zero_vlan = vlan_tag & VLAN_VID_MASK; + +#ifdef ICE_ADD_PROBES + if ((features & NETIF_F_HW_VLAN_CTAG_RX) && non_zero_vlan) { + rx_ring->vsi->back->rx_q_vlano++; + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); + } else if ((features & NETIF_F_HW_VLAN_STAG_RX) && non_zero_vlan) { + rx_ring->vsi->back->rx_ad_vlano++; + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD), vlan_tag); + } +#else + if ((features & NETIF_F_HW_VLAN_CTAG_RX) && non_zero_vlan) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); + else if ((features & NETIF_F_HW_VLAN_STAG_RX) && non_zero_vlan) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD), vlan_tag); +#endif /* ICE_ADD_PROBES */ + + napi_gro_receive(&rx_ring->q_vector->napi, skb); +} + +#ifdef HAVE_XDP_SUPPORT +/** + * ice_xmit_xdp_ring - submit single packet to XDP ring for transmission + * @data: packet data pointer + * @size: packet data size + * @xdp_ring: XDP ring for transmission + */ +int ice_xmit_xdp_ring(void *data, u16 size, struct ice_ring *xdp_ring) +{ + u16 i = xdp_ring->next_to_use; + struct ice_tx_desc *tx_desc; + struct ice_tx_buf *tx_buf; + dma_addr_t dma; + + if (!unlikely(ICE_DESC_UNUSED(xdp_ring))) { + xdp_ring->tx_stats.tx_busy++; + return ICE_XDP_CONSUMED; + } + + dma = dma_map_single(xdp_ring->dev, data, size, DMA_TO_DEVICE); + if (dma_mapping_error(xdp_ring->dev, dma)) + return ICE_XDP_CONSUMED; + + tx_buf = &xdp_ring->tx_buf[i]; + tx_buf->bytecount = size; + tx_buf->gso_segs = 1; + tx_buf->raw_buf = data; + + /* record length, and DMA address */ + dma_unmap_len_set(tx_buf, len, size); + dma_unmap_addr_set(tx_buf, dma, dma); + + tx_desc = ICE_TX_DESC(xdp_ring, i); + tx_desc->buf_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TXD_LAST_DESC_CMD, 0, + size, 0); + + /* Make certain all of the status bits have been updated + * before next_to_watch is written. 
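+ * (This barrier pairs with the smp_rmb() the Tx clean path issues after it
+ * reads a non-NULL next_to_watch, as ice_clean_ctrl_tx_irq() does, so the
+ * cleaner never observes a descriptor whose fields are still being written.)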
+ */ + smp_wmb(); + +#ifdef HAVE_XDP_SUPPORT +#ifdef HAVE_AF_XDP_ZC_SUPPORT + xdp_ring->xdp_tx_active++; +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ +#endif /* HAVE_XDP_SUPPORT */ + i++; + if (i == xdp_ring->count) + i = 0; + + tx_buf->next_to_watch = tx_desc; + xdp_ring->next_to_use = i; + + return ICE_XDP_TX; +} + +/** + * ice_xmit_xdp_buff - convert an XDP buffer to an XDP frame and send it + * @xdp: XDP buffer + * @xdp_ring: XDP Tx ring + * + * Returns negative on failure, 0 on success. + */ +int ice_xmit_xdp_buff(struct xdp_buff *xdp, struct ice_ring *xdp_ring) +{ +#ifdef HAVE_XDP_FRAME_STRUCT + struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp); + + if (unlikely(!xdpf)) + return ICE_XDP_CONSUMED; + + return ice_xmit_xdp_ring(xdpf->data, xdpf->len, xdp_ring); +#else + return ice_xmit_xdp_ring(xdp->data, + xdp->data_end - xdp->data, + xdp_ring); +#endif /* HAVE_XDP_FRAME_STRUCT */ +} + +/** + * ice_finalize_xdp_rx - Bump XDP Tx tail and/or flush redirect map + * @rx_ring: Rx ring + * @xdp_res: Result of the receive batch + * + * This function bumps XDP Tx tail and/or flush redirect map, and + * should be called when a batch of packets has been processed in the + * napi loop. + */ +void ice_finalize_xdp_rx(struct ice_ring *rx_ring, unsigned int xdp_res) +{ + if (xdp_res & ICE_XDP_REDIR) + xdp_do_flush_map(); + + if (xdp_res & ICE_XDP_TX) { + struct ice_ring *xdp_ring = + rx_ring->vsi->xdp_rings[rx_ring->q_index]; + + ice_xdp_ring_update_tail(xdp_ring); + } +} +#endif /* HAVE_XDP_SUPPORT */ diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h new file mode 100644 index 000000000000..09a2e80525cd --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_TXRX_LIB_H_ +#define _ICE_TXRX_LIB_H_ +#include "ice.h" + +/** + * ice_test_staterr - tests bits in Rx descriptor status and error fields + * @status_err_n: Rx descriptor status_error0 or status_error1 bits + * @stat_err_bits: value to mask + * + * This function does some fast chicanery in order to return the + * value of the mask which is really only used for boolean tests. + * The status_error_len doesn't need to be shifted because it begins + * at offset zero. + */ +static inline bool +ice_test_staterr(__le16 status_err_n, const u16 stat_err_bits) +{ + return !!(status_err_n & cpu_to_le16(stat_err_bits)); +} + +static inline __le64 +ice_build_ctob(u64 td_cmd, u64 td_offset, unsigned int size, u64 td_tag) +{ + return cpu_to_le64(ICE_TX_DESC_DTYPE_DATA | + (td_cmd << ICE_TXD_QW1_CMD_S) | + (td_offset << ICE_TXD_QW1_OFFSET_S) | + ((u64)size << ICE_TXD_QW1_TX_BUF_SZ_S) | + (td_tag << ICE_TXD_QW1_L2TAG1_S)); +} + +/** + * ice_get_vlan_from_rx_desc - get VLAN from Rx flex descriptor + * @rx_desc: Rx 32b flex descriptor with RXDID=2 + * + * The OS and current PF implementation only support stripping a single VLAN tag + * at a time, so there should only ever be 0 or 1 tags in the l2tag* fields. If + * one is found return the tag, else return 0 to mean no VLAN tag was found. 
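+ *
+ * Illustrative use: the Rx clean path hands the returned value straight to
+ * ice_receive_skb(), which only pushes a tag into the skb when
+ * (vlan_tag & VLAN_VID_MASK) is non-zero, so a return of 0 simply means no
+ * VLAN is reported to the stack.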
+ */ +static inline u16 +ice_get_vlan_tag_from_rx_desc(union ice_32b_rx_flex_desc *rx_desc) +{ + u16 stat_err_bits; + + stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_L2TAG1P_S); + if (ice_test_staterr(rx_desc->wb.status_error0, stat_err_bits)) + return le16_to_cpu(rx_desc->wb.l2tag1); + + stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS1_L2TAG2P_S); + if (ice_test_staterr(rx_desc->wb.status_error1, stat_err_bits)) + return le16_to_cpu(rx_desc->wb.l2tag2_2nd); + + return 0; +} + +#ifdef HAVE_XDP_SUPPORT +/** + * ice_xdp_ring_update_tail - Updates the XDP Tx ring tail register + * @xdp_ring: XDP Tx ring + * + * This function updates the XDP Tx ring tail register. + */ +static inline void ice_xdp_ring_update_tail(struct ice_ring *xdp_ring) +{ + /* Force memory writes to complete before letting h/w + * know there are new descriptors to fetch. + */ + wmb(); + writel_relaxed(xdp_ring->next_to_use, xdp_ring->tail); +} + +void ice_finalize_xdp_rx(struct ice_ring *rx_ring, unsigned int xdp_res); +int ice_xmit_xdp_buff(struct xdp_buff *xdp, struct ice_ring *xdp_ring); +int ice_xmit_xdp_ring(void *data, u16 size, struct ice_ring *xdp_ring); +#endif /* HAVE_XDP_SUPPORT */ +void ice_release_rx_desc(struct ice_ring *rx_ring, u16 val); +void +ice_process_skb_fields(struct ice_ring *rx_ring, + union ice_32b_rx_flex_desc *rx_desc, + struct sk_buff *skb, u16 ptype); +void +ice_receive_skb(struct ice_ring *rx_ring, struct sk_buff *skb, u16 vlan_tag); +#endif /* !_ICE_TXRX_LIB_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h index 6667d17a4206..eeb2de1a152e 100644 --- a/drivers/net/ethernet/intel/ice/ice_type.h +++ b/drivers/net/ethernet/intel/ice/ice_type.h @@ -1,42 +1,105 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_TYPE_H_ #define _ICE_TYPE_H_ + + + + + #define ICE_BYTES_PER_WORD 2 #define ICE_BYTES_PER_DWORD 4 +#define ICE_MAX_TRAFFIC_CLASS 8 +#define ICE_CHNL_MAX_TC 16 + + + + #include "ice_status.h" #include "ice_hw_autogen.h" +#include "ice_devids.h" #include "ice_osdep.h" #include "ice_controlq.h" #include "ice_lan_tx_rx.h" #include "ice_flex_type.h" +#include "ice_protocol_type.h" +#include "ice_sbq_cmd.h" +#include "ice_vlan_mode.h" +#include "ice_fwlog.h" + + static inline bool ice_is_tc_ena(unsigned long bitmap, u8 tc) { return test_bit(tc, &bitmap); } + +static inline u64 round_up_64bit(u64 a, u32 b) +{ + return div64_long(((a) + (b) / 2), (b)); +} + +static inline u32 ice_round_to_num(u32 N, u32 R) +{ + return ((((N) % (R)) < ((R) / 2)) ? (((N) / (R)) * (R)) : + ((((N) + (R) - 1) / (R)) * (R))); +} + /* Driver always calls main vsi_handle first */ #define ICE_MAIN_VSI_HANDLE 0 +/* Switch from ms to the 1usec global time (this is the GTIME resolution) */ +#define ICE_MS_TO_GTIME(time) ((time) * 1000) + +/* Data type manipulation macros. 
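+ * For illustration, ICE_HI_WORD(0x12345678) evaluates to 0x1234 and
+ * ICE_LO_WORD(0x12345678) to 0x5678; likewise ICE_MS_TO_GTIME(5) above is
+ * 5000 (1 usec units), and ice_round_to_num(1300, 500) rounds up to 1500
+ * while ice_round_to_num(1200, 500) rounds down to 1000.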
*/ +#define ICE_HI_WORD(x) ((u16)(((x) >> 16) & 0xFFFF)) +#define ICE_LO_WORD(x) ((u16)((x) & 0xFFFF)) + /* debug masks - set these bits in hw->debug_mask to control output */ #define ICE_DBG_INIT BIT_ULL(1) +#define ICE_DBG_RELEASE BIT_ULL(2) #define ICE_DBG_FW_LOG BIT_ULL(3) #define ICE_DBG_LINK BIT_ULL(4) #define ICE_DBG_PHY BIT_ULL(5) #define ICE_DBG_QCTX BIT_ULL(6) #define ICE_DBG_NVM BIT_ULL(7) #define ICE_DBG_LAN BIT_ULL(8) +#define ICE_DBG_FLOW BIT_ULL(9) +#define ICE_DBG_DCB BIT_ULL(10) +#define ICE_DBG_DIAG BIT_ULL(11) +#define ICE_DBG_FD BIT_ULL(12) #define ICE_DBG_SW BIT_ULL(13) #define ICE_DBG_SCHED BIT_ULL(14) + +#define ICE_DBG_RDMA BIT_ULL(15) #define ICE_DBG_PKG BIT_ULL(16) #define ICE_DBG_RES BIT_ULL(17) +#define ICE_DBG_ACL BIT_ULL(18) +#define ICE_DBG_PTP BIT_ULL(19) #define ICE_DBG_AQ_MSG BIT_ULL(24) +#define ICE_DBG_AQ_DESC BIT_ULL(25) +#define ICE_DBG_AQ_DESC_BUF BIT_ULL(26) #define ICE_DBG_AQ_CMD BIT_ULL(27) +#define ICE_DBG_AQ (ICE_DBG_AQ_MSG | \ + ICE_DBG_AQ_DESC | \ + ICE_DBG_AQ_DESC_BUF | \ + ICE_DBG_AQ_CMD) + #define ICE_DBG_USER BIT_ULL(31) +#define ICE_DBG_ALL 0xFFFFFFFFFFFFFFFFULL + +#ifndef __always_unused +#define __always_unused +#endif + + + + + enum ice_aq_res_ids { ICE_NVM_RES_ID = 1, @@ -72,6 +135,12 @@ enum ice_fc_mode { ICE_FC_DFLT }; +enum ice_phy_cache_mode { + ICE_FC_MODE = 0, + ICE_SPEED_MODE, + ICE_FEC_MODE +}; + enum ice_fec_mode { ICE_FEC_NONE = 0, ICE_FEC_RS, @@ -79,6 +148,14 @@ enum ice_fec_mode { ICE_FEC_AUTO }; +struct ice_phy_cache_mode_data { + union { + enum ice_fec_mode curr_user_fec_req; + enum ice_fc_mode curr_user_fc_req; + u16 curr_user_speed_req; + } data; +}; + enum ice_set_fc_aq_failures { ICE_SET_FC_AQ_FAIL_NONE = 0, ICE_SET_FC_AQ_FAIL_GET, @@ -86,12 +163,16 @@ enum ice_set_fc_aq_failures { ICE_SET_FC_AQ_FAIL_UPDATE }; -/* Various MAC types */ +/* These are structs for managing the hardware information and the operations */ +/* MAC types */ enum ice_mac_type { ICE_MAC_UNKNOWN = 0, + ICE_MAC_VF, + ICE_MAC_E810, ICE_MAC_GENERIC, }; + /* Media Types */ enum ice_media_type { ICE_MEDIA_UNKNOWN = 0, @@ -99,12 +180,19 @@ enum ice_media_type { ICE_MEDIA_BASET, ICE_MEDIA_BACKPLANE, ICE_MEDIA_DA, + ICE_MEDIA_AUI, }; +/* Software VSI types. */ enum ice_vsi_type { ICE_VSI_PF = 0, - ICE_VSI_VF, + ICE_VSI_VF = 1, + ICE_VSI_VMDQ2 = 2, + ICE_VSI_CTRL = 3, /* equates to ICE_VSI_PF with 1 queue pair */ + ICE_VSI_CHNL = 4, + ICE_VSI_OFFLOAD_MACVLAN = 5, ICE_VSI_LB = 6, + ICE_VSI_SWITCHDEV_CTRL = 7, }; struct ice_link_status { @@ -115,6 +203,7 @@ struct ice_link_status { u16 max_frame_size; u16 link_speed; u16 req_speeds; + u8 link_cfg_err; u8 lse_ena; /* Link Status Event notification */ u8 link_info; u8 an_info; @@ -127,6 +216,15 @@ struct ice_link_status { u8 module_type[ICE_MODULE_TYPE_TOTAL_BYTE]; }; +/* Different data queue types: These are mainly for SW consumption. 
*/ +enum ice_q { + ICE_DATA_Q_DOORBELL, + ICE_DATA_Q_CMPL, + ICE_DATA_Q_QUANTA, + ICE_DATA_Q_RX, + ICE_DATA_Q_TX, +}; + /* Different reset sources for which a disable queue AQ call has to be made in * order to clean the Tx scheduler as a part of the reset */ @@ -144,36 +242,260 @@ struct ice_phy_info { u64 phy_type_high; enum ice_media_type media_type; u8 get_link_info; + /* Please refer to struct ice_aqc_get_link_status_data to get + * detail of enable bit in curr_user_speed_req + */ + u16 curr_user_speed_req; + enum ice_fec_mode curr_user_fec_req; + enum ice_fc_mode curr_user_fc_req; + struct ice_aqc_set_phy_cfg_data curr_user_phy_cfg; +}; + +#define ICE_MAX_NUM_MIRROR_RULES 64 + +/* protocol enumeration for filters */ +enum ice_fltr_ptype { + /* NONE - used for undef/error */ + ICE_FLTR_PTYPE_NONF_NONE = 0, + ICE_FLTR_PTYPE_NONF_IPV4_UDP, + ICE_FLTR_PTYPE_NONF_IPV4_TCP, + ICE_FLTR_PTYPE_NONF_IPV4_SCTP, + ICE_FLTR_PTYPE_NONF_IPV4_OTHER, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP, + ICE_FLTR_PTYPE_NONF_IPV6_GTPU, + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH, + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_DW, + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_UP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_UDP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_TCP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6_UDP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6_TCP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_UDP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_TCP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4_UDP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4_TCP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4_UDP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4_TCP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_ICMP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_OTHER, + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_IPV6_OTHER, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_OTHER, + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_IPV6_OTHER, + ICE_FLTR_PTYPE_NONF_IPV4_L2TPV3, + ICE_FLTR_PTYPE_NONF_IPV6_L2TPV3, + ICE_FLTR_PTYPE_NONF_IPV4_ESP, + ICE_FLTR_PTYPE_NONF_IPV6_ESP, + ICE_FLTR_PTYPE_NONF_IPV4_AH, + ICE_FLTR_PTYPE_NONF_IPV6_AH, + ICE_FLTR_PTYPE_NONF_IPV4_NAT_T_ESP, + ICE_FLTR_PTYPE_NONF_IPV6_NAT_T_ESP, + ICE_FLTR_PTYPE_NONF_IPV4_PFCP_NODE, + ICE_FLTR_PTYPE_NONF_IPV4_PFCP_SESSION, + ICE_FLTR_PTYPE_NONF_IPV6_PFCP_NODE, + ICE_FLTR_PTYPE_NONF_IPV6_PFCP_SESSION, + ICE_FLTR_PTYPE_NON_IP_L2, + ICE_FLTR_PTYPE_NONF_ECPRI_TP0, + ICE_FLTR_PTYPE_NONF_IPV4_UDP_ECPRI_TP0, + ICE_FLTR_PTYPE_FRAG_IPV4, + ICE_FLTR_PTYPE_FRAG_IPV6, + ICE_FLTR_PTYPE_NONF_IPV6_UDP, + ICE_FLTR_PTYPE_NONF_IPV6_TCP, + ICE_FLTR_PTYPE_NONF_IPV6_SCTP, + ICE_FLTR_PTYPE_NONF_IPV6_OTHER, + ICE_FLTR_PTYPE_NONF_IPV4_UDP_VXLAN, + ICE_FLTR_PTYPE_MAX, +}; + +enum ice_fd_hw_seg { + ICE_FD_HW_SEG_NON_TUN = 0, + ICE_FD_HW_SEG_TUN, + ICE_FD_HW_SEG_MAX, +}; + +/* 1 ICE_VSI_PF + 1 ICE_VSI_CTRL + ICE_CHNL_MAX_TC */ +#define ICE_MAX_FDIR_VSI_PER_FILTER (2 + ICE_CHNL_MAX_TC) + +struct ice_fd_hw_prof { + struct ice_flow_seg_info *fdir_seg[ICE_FD_HW_SEG_MAX]; + int cnt; + u64 entry_h[ICE_MAX_FDIR_VSI_PER_FILTER][ICE_FD_HW_SEG_MAX]; + u16 vsi_h[ICE_MAX_FDIR_VSI_PER_FILTER]; }; /* Common HW capabilities for SW use */ struct ice_hw_common_caps { + /* Write CSR protection */ + u64 wr_csr_prot; + u32 switching_mode; + /* switching mode supported - EVB switching (including cloud) */ +#define 
ICE_NVM_IMAGE_TYPE_EVB 0x0 + + /* Manageablity mode & supported protocols over MCTP */ + u32 mgmt_mode; +#define ICE_MGMT_MODE_PASS_THRU_MODE_M 0xF +#define ICE_MGMT_MODE_CTL_INTERFACE_M 0xF0 +#define ICE_MGMT_MODE_REDIR_SB_INTERFACE_M 0xF00 + + u32 mgmt_protocols_mctp; +#define ICE_MGMT_MODE_PROTO_RSVD BIT(0) +#define ICE_MGMT_MODE_PROTO_PLDM BIT(1) +#define ICE_MGMT_MODE_PROTO_OEM BIT(2) +#define ICE_MGMT_MODE_PROTO_NC_SI BIT(3) + + u32 os2bmc; u32 valid_functions; /* DCB capabilities */ u32 active_tc_bitmap; u32 maxtc; + /* RSS related capabilities */ + u32 rss_table_size; /* 512 for PFs and 64 for VFs */ + u32 rss_table_entry_width; /* RSS Entry width in bits */ + /* Tx/Rx queues */ - u16 num_rxq; /* Number/Total Rx queues */ - u16 rxq_first_id; /* First queue ID for Rx queues */ - u16 num_txq; /* Number/Total Tx queues */ - u16 txq_first_id; /* First queue ID for Tx queues */ + u32 num_rxq; /* Number/Total Rx queues */ + u32 rxq_first_id; /* First queue ID for Rx queues */ + u32 num_txq; /* Number/Total Tx queues */ + u32 txq_first_id; /* First queue ID for Tx queues */ /* MSI-X vectors */ - u16 num_msix_vectors; - u16 msix_vector_first_id; + u32 num_msix_vectors; + u32 msix_vector_first_id; /* Max MTU for function or device */ - u16 max_mtu; + u32 max_mtu; - /* Virtualization support */ + /* WOL related */ + u32 num_wol_proxy_fltr; + u32 wol_proxy_vsi_seid; + + /* LED/SDP pin count */ + u32 led_pin_num; + u32 sdp_pin_num; + + /* LED/SDP - Supports up to 12 LED pins and 8 SDP signals */ +#define ICE_MAX_SUPPORTED_GPIO_LED 12 +#define ICE_MAX_SUPPORTED_GPIO_SDP 8 + u8 led[ICE_MAX_SUPPORTED_GPIO_LED]; + u8 sdp[ICE_MAX_SUPPORTED_GPIO_SDP]; + + /* SR-IOV virtualization */ u8 sr_iov_1_1; /* SR-IOV enabled */ - /* RSS related capabilities */ - u16 rss_table_size; /* 512 for PFs and 64 for VFs */ - u8 rss_table_entry_width; /* RSS Entry width in bits */ + /* VMDQ */ + u8 vmdq; /* VMDQ supported */ + + /* EVB capabilities */ + u8 evb_802_1_qbg; /* Edge Virtual Bridging */ + u8 evb_802_1_qbh; /* Bridge Port Extension */ u8 dcb; + u8 iscsi; + u8 ieee_1588; + u8 mgmt_cem; + u8 iwarp; + + /* WoL and APM support */ +#define ICE_WOL_SUPPORT_M BIT(0) +#define ICE_ACPI_PROG_MTHD_M BIT(1) +#define ICE_PROXY_SUPPORT_M BIT(2) + u8 apm_wol_support; + u8 acpi_prog_mthd; + u8 proxy_support; + bool nvm_update_pending_nvm; + bool nvm_update_pending_orom; + bool nvm_update_pending_netlist; +#define ICE_NVM_PENDING_NVM_IMAGE BIT(0) +#define ICE_NVM_PENDING_OROM BIT(1) +#define ICE_NVM_PENDING_NETLIST BIT(2) + bool sec_rev_disabled; + bool update_disabled; + bool nvm_unified_update; +#define ICE_NVM_MGMT_SEC_REV_DISABLED BIT(0) +#define ICE_NVM_MGMT_UPDATE_DISABLED BIT(1) +#define ICE_NVM_MGMT_UNIFIED_UPD_SUPPORT BIT(3) + + /* External topology device images within the NVM */ +#define ICE_EXT_TOPO_DEV_IMG_COUNT 4 + u32 ext_topo_dev_img_ver_high[ICE_EXT_TOPO_DEV_IMG_COUNT]; + u32 ext_topo_dev_img_ver_low[ICE_EXT_TOPO_DEV_IMG_COUNT]; + u8 ext_topo_dev_img_part_num[ICE_EXT_TOPO_DEV_IMG_COUNT]; +#define ICE_EXT_TOPO_DEV_IMG_PART_NUM_S 8 +#define ICE_EXT_TOPO_DEV_IMG_PART_NUM_M \ + ICE_M(0xFF, ICE_EXT_TOPO_DEV_IMG_PART_NUM_S) + bool ext_topo_dev_img_load_en[ICE_EXT_TOPO_DEV_IMG_COUNT]; +#define ICE_EXT_TOPO_DEV_IMG_LOAD_EN BIT(0) + bool ext_topo_dev_img_prog_en[ICE_EXT_TOPO_DEV_IMG_COUNT]; +#define ICE_EXT_TOPO_DEV_IMG_PROG_EN BIT(1) +}; + +/* IEEE 1588 TIME_SYNC specific info */ +/* Function specific definitions */ +#define ICE_TS_FUNC_ENA_M BIT(0) +#define ICE_TS_SRC_TMR_OWND_M BIT(1) +#define ICE_TS_TMR_ENA_M 
BIT(2) +#define ICE_TS_TMR_IDX_OWND_S 4 +#define ICE_TS_TMR_IDX_OWND_M BIT(4) +#define ICE_TS_CLK_FREQ_S 16 +#define ICE_TS_CLK_FREQ_M ICE_M(0x7, ICE_TS_CLK_FREQ_S) +#define ICE_TS_CLK_SRC_S 20 +#define ICE_TS_CLK_SRC_M BIT(20) +#define ICE_TS_TMR_IDX_ASSOC_S 24 +#define ICE_TS_TMR_IDX_ASSOC_M BIT(24) + +/* TIME_REF clock rate specification */ +enum ice_time_ref_freq { + ICE_TIME_REF_FREQ_25_000 = 0, + ICE_TIME_REF_FREQ_122_880 = 1, + ICE_TIME_REF_FREQ_125_000 = 2, + ICE_TIME_REF_FREQ_153_600 = 3, + ICE_TIME_REF_FREQ_156_250 = 4, + ICE_TIME_REF_FREQ_245_760 = 5, + + NUM_ICE_TIME_REF_FREQ +}; + +struct ice_ts_func_info { + /* Function specific info */ + enum ice_time_ref_freq time_ref; + u8 clk_freq; + u8 clk_src; + u8 tmr_index_assoc; + u8 ena; + u8 tmr_index_owned; + u8 src_tmr_owned; + u8 tmr_ena; +}; + +/* Device specific definitions */ +#define ICE_TS_TMR0_OWNR_M 0x7 +#define ICE_TS_TMR0_OWND_M BIT(3) +#define ICE_TS_TMR1_OWNR_S 4 +#define ICE_TS_TMR1_OWNR_M ICE_M(0x7, ICE_TS_TMR1_OWNR_S) +#define ICE_TS_TMR1_OWND_M BIT(7) +#define ICE_TS_DEV_ENA_M BIT(24) +#define ICE_TS_TMR0_ENA_M BIT(25) +#define ICE_TS_TMR1_ENA_M BIT(26) + +struct ice_ts_dev_info { + /* Device specific info */ + u32 ena_ports; + u32 tmr_own_map; + u32 tmr0_owner; + u32 tmr1_owner; + u8 tmr0_owned; + u8 tmr1_owned; + u8 ena; + u8 tmr0_ena; + u8 tmr1_ena; }; /* Function specific capabilities */ @@ -182,6 +504,9 @@ struct ice_hw_func_caps { u32 num_allocd_vfs; /* Number of allocated VFs */ u32 vf_base_id; /* Logical ID of the first VF */ u32 guar_num_vsi; + u32 fd_fltr_guar; /* Number of filters guaranteed */ + u32 fd_fltr_best_effort; /* Number of best effort filters */ + struct ice_ts_func_info ts_func_info; }; /* Device wide capabilities */ @@ -189,12 +514,48 @@ struct ice_hw_dev_caps { struct ice_hw_common_caps common_cap; u32 num_vfs_exposed; /* Total number of VFs exposed */ u32 num_vsi_allocd_to_host; /* Excluding EMP VSI */ + u32 num_flow_director_fltr; /* Number of FD filters available */ + struct ice_ts_dev_info ts_dev_info; + u32 num_funcs; }; -/* MAC info */ + +/* Information about MAC such as address, etc... */ struct ice_mac_info { u8 lan_addr[ETH_ALEN]; u8 perm_addr[ETH_ALEN]; + u8 port_addr[ETH_ALEN]; + u8 wol_addr[ETH_ALEN]; +}; + +/* PCI bus types */ +enum ice_bus_type { + ice_bus_unknown = 0, + ice_bus_pci_express, + ice_bus_embedded, /* Is device Embedded versus card */ + ice_bus_reserved +}; + +/* PCI bus speeds */ +enum ice_pcie_bus_speed { + ice_pcie_speed_unknown = 0xff, + ice_pcie_speed_2_5GT = 0x14, + ice_pcie_speed_5_0GT = 0x15, + ice_pcie_speed_8_0GT = 0x16, + ice_pcie_speed_16_0GT = 0x17 +}; + +/* PCI bus widths */ +enum ice_pcie_link_width { + ice_pcie_lnk_width_resrv = 0x00, + ice_pcie_lnk_x1 = 0x01, + ice_pcie_lnk_x2 = 0x02, + ice_pcie_lnk_x4 = 0x04, + ice_pcie_lnk_x8 = 0x08, + ice_pcie_lnk_x12 = 0x0C, + ice_pcie_lnk_x16 = 0x10, + ice_pcie_lnk_x32 = 0x20, + ice_pcie_lnk_width_unknown = 0xff, }; /* Reset types used to determine which kind of reset was requested. 
These @@ -213,8 +574,13 @@ enum ice_reset_req { /* Bus parameters */ struct ice_bus_info { + enum ice_pcie_bus_speed speed; + enum ice_pcie_link_width width; + enum ice_bus_type type; + u16 domain_num; u16 device; u8 func; + u8 bus_num; }; /* Flow control (FC) parameters */ @@ -223,25 +589,117 @@ struct ice_fc_info { enum ice_fc_mode req_mode; /* FC mode requested by caller */ }; -/* NVM Information */ +/* Option ROM version information */ +struct ice_orom_info { + u8 major; /* Major version of OROM */ + u8 patch; /* Patch version of OROM */ + u16 build; /* Build version of OROM */ + u32 srev; /* Security revision */ +}; + +/* NVM version information */ struct ice_nvm_info { - u32 eetrack; /* NVM data version */ - u32 oem_ver; /* OEM version info */ - u16 sr_words; /* Shadow RAM size in words */ - u16 ver; /* NVM package version */ - u8 blank_nvm_mode; /* is NVM empty (no FW present) */ + u32 eetrack; + u32 srev; + u8 major; + u8 minor; +}; + +/* Minimum Security Revision information */ +struct ice_minsrev_info { + u32 nvm; + u32 orom; + u8 nvm_valid : 1; + u8 orom_valid : 1; +}; + +/* netlist version information */ +struct ice_netlist_info { + u32 major; /* major high/low */ + u32 minor; /* minor high/low */ + u32 type; /* type high/low */ + u32 rev; /* revision high/low */ + u32 hash; /* SHA-1 hash word */ + u16 cust_ver; /* customer version */ +}; + +/* Enumeration of possible flash banks for the NVM, OROM, and Netlist modules + * of the flash image. + */ +enum ice_flash_bank { + ICE_INVALID_FLASH_BANK, + ICE_1ST_FLASH_BANK, + ICE_2ND_FLASH_BANK, +}; + +/* Enumeration of which flash bank is desired to read from, either the active + * bank or the inactive bank. Used to abstract 1st and 2nd bank notion from + * code which just wants to read the active or inactive flash bank. 
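+ * For illustration: when banks.nvm_bank below reports ICE_1ST_FLASH_BANK as
+ * the active NVM bank, a read asking for ICE_ACTIVE_FLASH_BANK resolves to
+ * the first bank and ICE_INACTIVE_FLASH_BANK to the second; the mapping
+ * flips when the second bank is active.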
+ */ +enum ice_bank_select { + ICE_ACTIVE_FLASH_BANK, + ICE_INACTIVE_FLASH_BANK, +}; + +/* information for accessing NVM, OROM, and Netlist flash banks */ +struct ice_bank_info { + u32 nvm_ptr; /* Pointer to 1st NVM bank */ + u32 nvm_size; /* Size of NVM bank */ + u32 orom_ptr; /* Pointer to 1st OROM bank */ + u32 orom_size; /* Size of OROM bank */ + u32 netlist_ptr; /* Pointer to 1st Netlist bank */ + u32 netlist_size; /* Size of Netlist bank */ + enum ice_flash_bank nvm_bank; /* Active NVM bank */ + enum ice_flash_bank orom_bank; /* Active OROM bank */ + enum ice_flash_bank netlist_bank; /* Active Netlist bank */ +}; + +/* Flash Chip Information */ +struct ice_flash_info { + struct ice_orom_info orom; /* Option ROM version info */ + struct ice_nvm_info nvm; /* NVM version information */ + struct ice_netlist_info netlist;/* Netlist version info */ + struct ice_bank_info banks; /* Flash Bank information */ + u16 sr_words; /* Shadow RAM size in words */ + u32 flash_size; /* Size of available flash in bytes */ + u8 blank_nvm_mode; /* is NVM empty (no FW present) */ +}; + +struct ice_link_default_override_tlv { + u8 options; +#define ICE_LINK_OVERRIDE_OPT_M 0x3F +#define ICE_LINK_OVERRIDE_STRICT_MODE BIT(0) +#define ICE_LINK_OVERRIDE_EPCT_DIS BIT(1) +#define ICE_LINK_OVERRIDE_PORT_DIS BIT(2) +#define ICE_LINK_OVERRIDE_EN BIT(3) +#define ICE_LINK_OVERRIDE_AUTO_LINK_DIS BIT(4) +#define ICE_LINK_OVERRIDE_EEE_EN BIT(5) + u8 phy_config; +#define ICE_LINK_OVERRIDE_PHY_CFG_S 8 +#define ICE_LINK_OVERRIDE_PHY_CFG_M (0xC3 << ICE_LINK_OVERRIDE_PHY_CFG_S) +#define ICE_LINK_OVERRIDE_PAUSE_M 0x3 +#define ICE_LINK_OVERRIDE_LESM_EN BIT(6) +#define ICE_LINK_OVERRIDE_AUTO_FEC_EN BIT(7) + u8 fec_options; +#define ICE_LINK_OVERRIDE_FEC_OPT_M 0xFF + u8 rsvd1; + u64 phy_type_low; + u64 phy_type_high; }; #define ICE_NVM_VER_LEN 32 /* Max number of port to queue branches w.r.t topology */ -#define ICE_MAX_TRAFFIC_CLASS 8 #define ICE_TXSCHED_MAX_BRANCHES ICE_MAX_TRAFFIC_CLASS #define ice_for_each_traffic_class(_i) \ for ((_i) = 0; (_i) < ICE_MAX_TRAFFIC_CLASS; (_i)++) +/* ICE_DFLT_AGG_ID means that all new VM(s)/VSI node connects + * to driver defined policy for default aggregator + */ #define ICE_INVAL_TEID 0xFFFFFFFF +#define ICE_DFLT_AGG_ID 0 struct ice_sched_node { struct ice_sched_node *parent; @@ -256,38 +714,136 @@ struct ice_sched_node { u8 tc_num; u8 owner; #define ICE_SCHED_NODE_OWNER_LAN 0 +#define ICE_SCHED_NODE_OWNER_AE 1 +#define ICE_SCHED_NODE_OWNER_RDMA 2 }; /* Access Macros for Tx Sched Elements data */ #define ICE_TXSCHED_GET_NODE_TEID(x) le32_to_cpu((x)->info.node_teid) +#define ICE_TXSCHED_GET_PARENT_TEID(x) le32_to_cpu((x)->info.parent_teid) +#define ICE_TXSCHED_GET_CIR_RL_ID(x) \ + le16_to_cpu((x)->info.cir_bw.bw_profile_idx) +#define ICE_TXSCHED_GET_EIR_RL_ID(x) \ + le16_to_cpu((x)->info.eir_bw.bw_profile_idx) +#define ICE_TXSCHED_GET_SRL_ID(x) le16_to_cpu((x)->info.srl_id) +#define ICE_TXSCHED_GET_CIR_BWALLOC(x) \ + le16_to_cpu((x)->info.cir_bw.bw_alloc) +#define ICE_TXSCHED_GET_EIR_BWALLOC(x) \ + le16_to_cpu((x)->info.eir_bw.bw_alloc) + +struct ice_sched_rl_profile { + u32 rate; /* In Kbps */ + struct ice_aqc_rl_profile_elem info; +}; /* The aggregator type determines if identifier is for a VSI group, * aggregator group, aggregator of queues, or queue group. 
*/ enum ice_agg_type { ICE_AGG_TYPE_UNKNOWN = 0, - ICE_AGG_TYPE_VSI, + ICE_AGG_TYPE_TC, ICE_AGG_TYPE_AGG, /* aggregator */ - ICE_AGG_TYPE_Q, - ICE_AGG_TYPE_QG + ICE_AGG_TYPE_VSI, + ICE_AGG_TYPE_QG, + ICE_AGG_TYPE_Q +}; + +/* Rate limit types */ +enum ice_rl_type { + ICE_UNKNOWN_BW = 0, + ICE_MIN_BW, /* for CIR profile */ + ICE_MAX_BW, /* for EIR profile */ + ICE_SHARED_BW /* for shared profile */ }; +#define ICE_SCHED_MIN_BW 500 /* in Kbps */ +#define ICE_SCHED_MAX_BW 100000000 /* in Kbps */ +#define ICE_SCHED_DFLT_BW 0xFFFFFFFF /* unlimited */ +#define ICE_SCHED_NO_PRIORITY 0 +#define ICE_SCHED_NO_BW_WT 0 #define ICE_SCHED_DFLT_RL_PROF_ID 0 -#define ICE_SCHED_DFLT_BW_WT 1 +#define ICE_SCHED_NO_SHARED_RL_PROF_ID 0xFFFF +#define ICE_SCHED_DFLT_BW_WT 4 +#define ICE_SCHED_INVAL_PROF_ID 0xFFFF +#define ICE_SCHED_DFLT_BURST_SIZE (15 * 1024) /* in bytes (15k) */ + +/* Access Macros for Tx Sched RL Profile data */ +#define ICE_TXSCHED_GET_RL_PROF_ID(p) le16_to_cpu((p)->info.profile_id) +#define ICE_TXSCHED_GET_RL_MBS(p) le16_to_cpu((p)->info.max_burst_size) +#define ICE_TXSCHED_GET_RL_MULTIPLIER(p) le16_to_cpu((p)->info.rl_multiply) +#define ICE_TXSCHED_GET_RL_WAKEUP_MV(p) le16_to_cpu((p)->info.wake_up_calc) +#define ICE_TXSCHED_GET_RL_ENCODE(p) le16_to_cpu((p)->info.rl_encode) + +#define ICE_MAX_PORT_PER_PCI_DEV 8 + +/* The following tree example shows the naming conventions followed under + * ice_port_info struct for default scheduler tree topology. + * + * A tree on a port + * * ---> root node + * (TC0)/ / / / \ \ \ \(TC7) ---> num_branches (range:1- 8) + * * * * * * * * * | + * / | + * * | + * / |-> num_elements (range:1 - 9) + * * | implies num_of_layers + * / | + * (a)* | + * + * (a) is the last_node_teid(not of type Leaf). A leaf node is created under + * (a) as child node where queues get added, add Tx/Rx queue admin commands; + * need TEID of (a) to add queues. + * + * This tree + * -> has 8 branches (one for each TC) + * -> First branch (TC0) has 4 elements + * -> has 4 layers + * -> (a) is the topmost layer node created by firmware on branch 0 + * + * Note: Above asterisk tree covers only basic terminology and scenario. + * Refer to the documentation for more info. 
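+ *
+ * For scale, the rate-limit bounds defined above are expressed in Kbps:
+ * ICE_SCHED_MIN_BW is 500 Kbps, ICE_SCHED_MAX_BW is 100000000 Kbps
+ * (100 Gbps), and ICE_SCHED_DFLT_BW (0xFFFFFFFF) means unlimited.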
+ */ -/* VSI type list entry to locate corresponding VSI/ag nodes */ + /* Data structure for saving BW information */ +enum ice_bw_type { + ICE_BW_TYPE_PRIO, + ICE_BW_TYPE_CIR, + ICE_BW_TYPE_CIR_WT, + ICE_BW_TYPE_EIR, + ICE_BW_TYPE_EIR_WT, + ICE_BW_TYPE_SHARED, + ICE_BW_TYPE_CNT /* This must be last */ +}; + +struct ice_bw { + u32 bw; + u16 bw_alloc; +}; + +struct ice_bw_type_info { + DECLARE_BITMAP(bw_t_bitmap, ICE_BW_TYPE_CNT); + u8 generic; + struct ice_bw cir_bw; + struct ice_bw eir_bw; + u32 shared_bw; +}; + +/* VSI queue context structure for given TC */ +struct ice_q_ctx { + u16 q_handle; + u32 q_teid; + /* bw_t_info saves queue BW information */ + struct ice_bw_type_info bw_t_info; +}; + +/* VSI type list entry to locate corresponding VSI/aggregator nodes */ struct ice_sched_vsi_info { struct ice_sched_node *vsi_node[ICE_MAX_TRAFFIC_CLASS]; struct ice_sched_node *ag_node[ICE_MAX_TRAFFIC_CLASS]; - struct list_head list_entry; u16 max_lanq[ICE_MAX_TRAFFIC_CLASS]; -}; - -/* driver defines the policy */ -struct ice_sched_tx_policy { - u16 max_num_vsis; - u8 max_num_lan_qs_per_tc[ICE_MAX_TRAFFIC_CLASS]; - u8 rdma_ena; + u16 max_rdmaq[ICE_MAX_TRAFFIC_CLASS]; + /* bw_t_info saves VSI BW information */ + struct ice_bw_type_info bw_t_info[ICE_MAX_TRAFFIC_CLASS]; }; /* CEE or IEEE 802.1Qaz ETS Configuration data */ @@ -315,19 +871,18 @@ struct ice_dcb_app_priority_table { u8 selector; }; -#define ICE_MAX_USER_PRIORITY 8 -#define ICE_DCBX_MAX_APPS 32 -#define ICE_LLDPDU_SIZE 1500 -#define ICE_TLV_STATUS_OPER 0x1 -#define ICE_TLV_STATUS_SYNC 0x2 -#define ICE_TLV_STATUS_ERR 0x4 -#define ICE_APP_PROT_ID_FCOE 0x8906 -#define ICE_APP_PROT_ID_ISCSI 0x0cbc -#define ICE_APP_PROT_ID_FIP 0x8914 -#define ICE_APP_SEL_ETHTYPE 0x1 -#define ICE_APP_SEL_TCPIP 0x2 -#define ICE_CEE_APP_SEL_ETHTYPE 0x0 -#define ICE_CEE_APP_SEL_TCPIP 0x1 +#define ICE_MAX_USER_PRIORITY 8 +#define ICE_DCBX_MAX_APPS 64 +#define ICE_DSCP_NUM_VAL 64 +#define ICE_LLDPDU_SIZE 1500 +#define ICE_TLV_STATUS_OPER 0x1 +#define ICE_TLV_STATUS_SYNC 0x2 +#define ICE_TLV_STATUS_ERR 0x4 +#define ICE_APP_PROT_ID_ISCSI_860 0x035c +#define ICE_APP_SEL_ETHTYPE 0x1 +#define ICE_APP_SEL_TCPIP 0x2 +#define ICE_CEE_APP_SEL_ETHTYPE 0x0 +#define ICE_CEE_APP_SEL_TCPIP 0x1 struct ice_dcbx_cfg { u32 numapps; @@ -335,7 +890,14 @@ struct ice_dcbx_cfg { struct ice_dcb_ets_cfg etscfg; struct ice_dcb_ets_cfg etsrec; struct ice_dcb_pfc_cfg pfc; +#define ICE_QOS_MODE_VLAN 0x0 +#define ICE_QOS_MODE_DSCP 0x1 + u8 pfc_mode; struct ice_dcb_app_priority_table app[ICE_DCBX_MAX_APPS]; + /* when DSCP mapping defined by user set its bit to 1 */ + DECLARE_BITMAP(dscp_mapped, ICE_DSCP_NUM_VAL); + /* array holding DSCP -> UP/TC values for DSCP L3 QoS mode */ + u8 dscp_map[ICE_DSCP_NUM_VAL]; u8 dcbx_mode; #define ICE_DCBX_MODE_CEE 0x1 #define ICE_DCBX_MODE_IEEE 0x2 @@ -343,6 +905,14 @@ struct ice_dcbx_cfg { #define ICE_DCBX_APPS_NON_WILLING 0x1 }; +struct ice_qos_cfg { + struct ice_dcbx_cfg local_dcbx_cfg; /* Oper/Local Cfg */ + struct ice_dcbx_cfg desired_dcbx_cfg; /* CEE Desired Cfg */ + struct ice_dcbx_cfg remote_dcbx_cfg; /* Peer Cfg */ + u8 dcbx_status : 3; /* see ICE_DCBX_STATUS_DIS */ + u8 is_sw_lldp : 1; +}; + struct ice_port_info { struct ice_sched_node *root; /* Root Node per Port */ struct ice_hw *hw; /* back pointer to HW instance */ @@ -364,37 +934,94 @@ struct ice_port_info { struct mutex sched_lock; /* protect access to TXSched tree */ struct ice_sched_node * sib_head[ICE_MAX_TRAFFIC_CLASS][ICE_AQC_TOPO_MAX_LEVEL_NUM]; - struct ice_dcbx_cfg local_dcbx_cfg; /* 
Oper/Local Cfg */ - /* DCBX info */ - struct ice_dcbx_cfg remote_dcbx_cfg; /* Peer Cfg */ - struct ice_dcbx_cfg desired_dcbx_cfg; /* CEE Desired Cfg */ - /* LLDP/DCBX Status */ - u8 dcbx_status:3; /* see ICE_DCBX_STATUS_DIS */ - u8 is_sw_lldp:1; + struct ice_bw_type_info root_node_bw_t_info; + struct ice_bw_type_info tc_node_bw_t_info[ICE_MAX_TRAFFIC_CLASS]; + struct ice_qos_cfg qos_cfg; u8 is_vf:1; }; struct ice_switch_info { struct list_head vsi_list_map_head; struct ice_sw_recipe *recp_list; + u16 prof_res_bm_init; + u16 max_used_prof_index; + + DECLARE_BITMAP(prof_res_bm[ICE_MAX_NUM_PROFILES], ICE_MAX_FV_WORDS); +}; + + +/* Enum defining the different states of the mailbox snapshot in the + * PF-VF mailbox overflow detection algorithm. The snapshot can be in + * states: + * 1. ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT - generate a new static snapshot + * within the mailbox buffer. + * 2. ICE_MAL_VF_DETECT_STATE_TRAVERSE - iterate through the mailbox snaphot + * 3. ICE_MAL_VF_DETECT_STATE_DETECT - track the messages sent per VF via the + * mailbox and mark any VFs sending more messages than the threshold limit set. + * 4. ICE_MAL_VF_DETECT_STATE_INVALID - Invalid mailbox state set to 0xFFFFFFFF. + */ +enum ice_mbx_snapshot_state { + ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT = 0, + ICE_MAL_VF_DETECT_STATE_TRAVERSE, + ICE_MAL_VF_DETECT_STATE_DETECT, + ICE_MAL_VF_DETECT_STATE_INVALID = 0xFFFFFFFF, }; -/* FW logging configuration */ -struct ice_fw_log_evnt { - u8 cfg : 4; /* New event enables to configure */ - u8 cur : 4; /* Current/active event enables */ +/* Structure to hold information of the static snapshot and the mailbox + * buffer data used to generate and track the snapshot. + * 1. state: the state of the mailbox snapshot in the malicious VF + * detection state handler ice_mbx_vf_state_handler() + * 2. head : head of the mailbox snapshot in a circular mailbox buffer + * 3. tail : tail of the mailbox snapshot in a circular mailbox buffer + * 4. num_iterations: number of messages traversed in circular mailbox buffer + * 5. num_msg_proc: number of messages processed in mailbox + * 6. num_pending_arq: number of pending asynchronous messages + * 7. max_num_msgs_mbx: maximum messages in mailbox for currently + * serviced work item or interrupt. + */ +struct ice_mbx_snap_buffer_data { + enum ice_mbx_snapshot_state state; + u32 head; + u32 tail; + u32 num_iterations; + u16 num_msg_proc; + u16 num_pending_arq; + u16 max_num_msgs_mbx; +}; + +/* Structure to track messages sent by VFs on mailbox: + * 1. vf_cntr : a counter array of VFs to track the number of + * asynchronous messages sent by each VF + * 2. vfcntr_len : number of entries in VF counter array + */ +struct ice_mbx_vf_counter { + u32 *vf_cntr; + u32 vfcntr_len; }; -struct ice_fw_log_cfg { - u8 cq_en : 1; /* FW logging is enabled via the control queue */ - u8 uart_en : 1; /* FW logging is enabled via UART for all PFs */ - u8 actv_evnts; /* Cumulation of currently enabled log events */ +/* Structure to hold data relevant to the captured static snapshot + * of the PF-VF mailbox. 
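+ *
+ * Illustrative flow (simplified): on each mailbox interrupt the caller fills
+ * a struct ice_mbx_data (defined further below) with the processed/pending
+ * message counts and a watermark, ice_mbx_vf_state_handler() steps through
+ * the snapshot states above, and mbx_vf.vf_cntr[vf_id] is bumped as that
+ * VF's messages are traversed; a VF whose counter crosses the threshold is
+ * flagged as potentially malicious.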
+ */ +struct ice_mbx_snapshot { + struct ice_mbx_snap_buffer_data mbx_buf; + struct ice_mbx_vf_counter mbx_vf; +}; -#define ICE_FW_LOG_EVNT_INFO (ICE_AQC_FW_LOG_INFO_EN >> ICE_AQC_FW_LOG_EN_S) -#define ICE_FW_LOG_EVNT_INIT (ICE_AQC_FW_LOG_INIT_EN >> ICE_AQC_FW_LOG_EN_S) -#define ICE_FW_LOG_EVNT_FLOW (ICE_AQC_FW_LOG_FLOW_EN >> ICE_AQC_FW_LOG_EN_S) -#define ICE_FW_LOG_EVNT_ERR (ICE_AQC_FW_LOG_ERR_EN >> ICE_AQC_FW_LOG_EN_S) - struct ice_fw_log_evnt evnts[ICE_AQC_FW_LOG_ID_MAX]; +/* Structure to hold data to be used for capturing or updating a + * static snapshot. + * 1. num_msg_proc: number of messages processed in mailbox + * 2. num_pending_arq: number of pending asynchronous messages + * 3. max_num_msgs_mbx: maximum messages in mailbox for currently + * serviced work item or interrupt. + * 4. async_watermark_val: An upper threshold set by caller to determine + * if the pending arq count is large enough to assume that there is + * the possibility of a mailicious VF. + */ +struct ice_mbx_data { + u16 num_msg_proc; + u16 num_pending_arq; + u16 max_num_msgs_mbx; + u16 async_watermark_val; }; /* Port hardware description */ @@ -403,9 +1030,16 @@ struct ice_hw { void *back; struct ice_aqc_layer_props *layer_info; struct ice_port_info *port_info; - u64 debug_mask; /* bitmap for debug mask */ + /* 2D Array for each Tx Sched RL Profile type */ + struct ice_sched_rl_profile **cir_profiles; + struct ice_sched_rl_profile **eir_profiles; + struct ice_sched_rl_profile **srl_profiles; + /* PSM clock frequency for calculating RL profile params */ + u32 psm_clk_freq; + u64 debug_mask; /* BITMAP for debug mask */ enum ice_mac_type mac_type; + u16 fd_ctr_base; /* FD counter base index */ /* pci info */ u16 device_id; u16 vendor_id; @@ -415,20 +1049,23 @@ struct ice_hw { u8 pf_id; /* device profile info */ + u16 max_burst_size; /* driver sets this value */ + /* Tx Scheduler values */ - u16 num_tx_sched_layers; - u16 num_tx_sched_phys_layers; + u8 num_tx_sched_layers; + u8 num_tx_sched_phys_layers; u8 flattened_layers; u8 max_cgds; u8 sw_entry_point_layer; u16 max_children[ICE_AQC_TOPO_MAX_LEVEL_NUM]; struct list_head agg_list; /* lists all aggregator */ - + /* List contain profile ID(s) and other params per layer */ + struct list_head rl_prof_list[ICE_AQC_TOPO_MAX_LEVEL_NUM]; struct ice_vsi_ctx *vsi_ctx[ICE_MAX_VSI]; u8 evb_veb; /* true for VEB, false for VEPA */ u8 reset_ongoing; /* true if HW is in reset, false otherwise */ struct ice_bus_info bus; - struct ice_nvm_info nvm; + struct ice_flash_info flash; struct ice_hw_dev_caps dev_caps; /* device capabilities */ struct ice_hw_func_caps func_caps; /* function capabilities */ @@ -436,8 +1073,11 @@ struct ice_hw { /* Control Queue info */ struct ice_ctl_q_info adminq; + struct ice_ctl_q_info sbq; struct ice_ctl_q_info mailboxq; - +#define DCF_ACL_CAP 0x01 /* DCF ACL capability */ +#define DCF_UDP_TUNNEL_CAP 0x02 /* DCF UDP Tunnel capability */ + u8 dcf_caps; u8 api_branch; /* API branch version */ u8 api_maj_ver; /* API major version */ u8 api_min_ver; /* API minor version */ @@ -448,10 +1088,11 @@ struct ice_hw { u8 fw_patch; /* firmware patch version */ u32 fw_build; /* firmware build number */ - struct ice_fw_log_cfg fw_log; + struct ice_fwlog_cfg fwlog_cfg; + bool fwlog_support_ena; /* does hardware support FW logging? */ /* Device max aggregate bandwidths corresponding to the GL_PWR_MODE_CTL - * register. Used for determining the ITR/intrl granularity during + * register. Used for determining the ITR/INTRL granularity during * initialization. 
*/ #define ICE_MAX_AGG_BW_200G 0x0 @@ -469,22 +1110,33 @@ struct ice_hw { /* INTRL granularity in 1 us */ u8 intrl_gran; - u8 ucast_shared; /* true if VSIs can share unicast addr */ + /* true if VSIs can share unicast MAC addr */ + u8 umac_shared; + +#define ICE_PHY_PER_NAC 1 +#define ICE_MAX_QUAD 2 +#define ICE_NUM_QUAD_TYPE 2 +#define ICE_PORTS_PER_QUAD 4 +#define ICE_PHY_0_LAST_QUAD 1 +#define ICE_PORTS_PER_PHY 8 +#define ICE_NUM_EXTERNAL_PORTS ICE_PORTS_PER_PHY + /* Active package version (currently active) */ struct ice_pkg_ver active_pkg_ver; + u32 active_track_id; u8 active_pkg_name[ICE_PKG_NAME_SIZE]; u8 active_pkg_in_nvm; enum ice_aq_err pkg_dwnld_status; - /* Driver's package ver - (from the Metadata seg) */ + /* Driver's package ver - (from the Ice Metadata section) */ struct ice_pkg_ver pkg_ver; u8 pkg_name[ICE_PKG_NAME_SIZE]; - /* Driver's Ice package version (from the Ice seg) */ - struct ice_pkg_ver ice_pkg_ver; - u8 ice_pkg_name[ICE_PKG_NAME_SIZE]; + /* Driver's Ice segment format version and id (from the Ice seg) */ + struct ice_pkg_ver ice_seg_fmt_ver; + u8 ice_seg_id[ICE_SEG_ID_SIZE]; /* Pointer to the ice segment */ struct ice_seg *seg; @@ -493,8 +1145,40 @@ struct ice_hw { u8 *pkg_copy; u32 pkg_size; + /* tunneling info */ + struct mutex tnl_lock; + struct ice_tunnel_table tnl; + + /* dvm boost update information */ + struct ice_dvm_table dvm_upd; + + struct ice_acl_tbl *acl_tbl; + struct ice_fd_hw_prof **acl_prof; + u16 acl_fltr_cnt[ICE_FLTR_PTYPE_MAX]; /* HW block tables */ struct ice_blk_info blk[ICE_BLK_COUNT]; + struct mutex fl_profs_locks[ICE_BLK_COUNT]; /* lock fltr profiles */ + struct list_head fl_profs[ICE_BLK_COUNT]; + /* Flow Director filter info */ + int fdir_active_fltr; + + struct mutex fdir_fltr_lock; /* protect Flow Director */ + struct list_head fdir_list_head; + + /* Book-keeping of side-band filter count per flow-type. + * This is used to detect and handle input set changes for + * respective flow-type. 
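The book-keeping comment above is the reason a per-flow-type side-band filter count exists: the input set of a flow profile can only be changed while no filters of that flow type are installed, which is the same rule ice_vc_fdir_write_flow_prof() later enforces when it reports an input set conflict for a non-zero count. A minimal standalone sketch of that guard follows; the example_/EXAMPLE_ names stand in for the driver's types and are not part of the patch.

#include <stdbool.h>
#include <string.h>

#define EXAMPLE_FLTR_PTYPE_MAX 64   /* stand-in for ICE_FLTR_PTYPE_MAX */
#define EXAMPLE_INSET_WORDS 8       /* stand-in for a profile's field-vector words */

struct example_fdir_state {
        unsigned int fltr_cnt[EXAMPLE_FLTR_PTYPE_MAX];
        unsigned long long inset[EXAMPLE_FLTR_PTYPE_MAX][EXAMPLE_INSET_WORDS];
};

/* A request whose input set matches the programmed profile is always fine;
 * a different input set is only acceptable while no filters of that flow
 * type are installed, otherwise the existing rules would be broken.
 */
static bool example_inset_change_allowed(const struct example_fdir_state *s,
                                         unsigned int flow,
                                         const unsigned long long *new_inset)
{
        if (!memcmp(s->inset[flow], new_inset, sizeof(s->inset[flow])))
                return true;

        return s->fltr_cnt[flow] == 0;
}

int main(void)
{
        static struct example_fdir_state s;
        unsigned long long new_inset[EXAMPLE_INSET_WORDS] = { 0x1 };

        s.fltr_cnt[5] = 2;      /* two filters of flow type 5 already installed */
        return example_inset_change_allowed(&s, 5, new_inset) ? 1 : 0;
}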
+ */ + u16 fdir_fltr_cnt[ICE_FLTR_PTYPE_MAX]; + + struct ice_fd_hw_prof **fdir_prof; + DECLARE_BITMAP(fdir_perfect_fltr, ICE_FLTR_PTYPE_MAX); + struct mutex rss_locks; /* protect RSS configuration */ + struct list_head rss_list_head; + struct ice_mbx_snapshot mbx_snapshot; + DECLARE_BITMAP(hw_ptype, ICE_FLOW_PTYPE_MAX); + u8 dvm_ena; + __le16 io_expander_handle; }; /* Statistics collected by each port, VSI, VEB, and S-channel */ @@ -513,6 +1197,16 @@ struct ice_eth_stats { u64 tx_errors; /* tepc */ }; +#define ICE_MAX_UP 8 + +/* Statistics collected per VEB per User Priority (UP) for up to 8 UPs */ +struct ice_veb_up_stats { + u64 up_rx_pkts[ICE_MAX_UP]; + u64 up_rx_bytes[ICE_MAX_UP]; + u64 up_tx_pkts[ICE_MAX_UP]; + u64 up_tx_bytes[ICE_MAX_UP]; +}; + /* Statistics collected by the MAC */ struct ice_hw_port_stats { /* eth stats collected by the port */ @@ -552,26 +1246,207 @@ struct ice_hw_port_stats { u64 tx_size_1023; /* ptc1023 */ u64 tx_size_1522; /* ptc1522 */ u64 tx_size_big; /* ptc9522 */ + u64 mac_short_pkt_dropped; /* mspdc */ + /* EEE LPI */ + u32 tx_lpi_status; + u32 rx_lpi_status; + u64 tx_lpi_count; /* etlpic */ + u64 rx_lpi_count; /* erlpic */ + /* flow director stats */ + u32 fd_sb_status; + u64 fd_sb_match; + u64 ch_atr_match; +#ifdef ICE_ADD_PROBES + u64 arfs_tcpv4_match; + u64 arfs_tcpv6_match; + u64 arfs_udpv4_match; + u64 arfs_udpv6_match; +#endif /* ICE_ADD_PROBES */ +}; + +enum ice_sw_fwd_act_type { + ICE_FWD_TO_VSI = 0, + ICE_FWD_TO_VSI_LIST, /* Do not use this when adding filter */ + ICE_FWD_TO_Q, + ICE_FWD_TO_QGRP, + ICE_DROP_PACKET, + ICE_INVAL_ACT +}; + +struct ice_aq_get_set_rss_lut_params { + u16 vsi_handle; /* software VSI handle */ + u16 lut_size; /* size of the LUT buffer */ + u8 lut_type; /* type of the LUT (i.e. VSI, PF, Global) */ + u8 *lut; /* input RSS LUT for set and output RSS LUT for get */ + u8 global_lut_id; /* only valid when lut_type is global */ }; /* Checksum and Shadow RAM pointers */ -#define ICE_SR_NVM_DEV_STARTER_VER 0x18 -#define ICE_SR_NVM_EETRACK_LO 0x2D -#define ICE_SR_NVM_EETRACK_HI 0x2E -#define ICE_NVM_VER_LO_SHIFT 0 -#define ICE_NVM_VER_LO_MASK (0xff << ICE_NVM_VER_LO_SHIFT) -#define ICE_NVM_VER_HI_SHIFT 12 -#define ICE_NVM_VER_HI_MASK (0xf << ICE_NVM_VER_HI_SHIFT) -#define ICE_OEM_VER_PATCH_SHIFT 0 -#define ICE_OEM_VER_PATCH_MASK (0xff << ICE_OEM_VER_PATCH_SHIFT) -#define ICE_OEM_VER_BUILD_SHIFT 8 -#define ICE_OEM_VER_BUILD_MASK (0xffff << ICE_OEM_VER_BUILD_SHIFT) -#define ICE_OEM_VER_SHIFT 24 -#define ICE_OEM_VER_MASK (0xff << ICE_OEM_VER_SHIFT) +#define ICE_SR_NVM_CTRL_WORD 0x00 +#define ICE_SR_PHY_ANALOG_PTR 0x04 +#define ICE_SR_OPTION_ROM_PTR 0x05 +#define ICE_SR_RO_PCIR_REGS_AUTO_LOAD_PTR 0x06 +#define ICE_SR_AUTO_GENERATED_POINTERS_PTR 0x07 +#define ICE_SR_PCIR_REGS_AUTO_LOAD_PTR 0x08 +#define ICE_SR_EMP_GLOBAL_MODULE_PTR 0x09 +#define ICE_SR_EMP_IMAGE_PTR 0x0B +#define ICE_SR_PE_IMAGE_PTR 0x0C +#define ICE_SR_CSR_PROTECTED_LIST_PTR 0x0D +#define ICE_SR_MNG_CFG_PTR 0x0E +#define ICE_SR_EMP_MODULE_PTR 0x0F +#define ICE_SR_PBA_BLOCK_PTR 0x16 +#define ICE_SR_BOOT_CFG_PTR 0x132 +#define ICE_SR_NVM_WOL_CFG 0x19 +#define ICE_NVM_OROM_VER_OFF 0x02 +#define ICE_SR_NVM_DEV_STARTER_VER 0x18 +#define ICE_SR_ALTERNATE_SAN_MAC_ADDR_PTR 0x27 +#define ICE_SR_PERMANENT_SAN_MAC_ADDR_PTR 0x28 +#define ICE_SR_NVM_MAP_VER 0x29 +#define ICE_SR_NVM_IMAGE_VER 0x2A +#define ICE_SR_NVM_STRUCTURE_VER 0x2B +#define ICE_SR_NVM_EETRACK_LO 0x2D +#define ICE_SR_NVM_EETRACK_HI 0x2E +#define ICE_NVM_VER_LO_SHIFT 0 +#define ICE_NVM_VER_LO_MASK (0xff << 
ICE_NVM_VER_LO_SHIFT) +#define ICE_NVM_VER_HI_SHIFT 12 +#define ICE_NVM_VER_HI_MASK (0xf << ICE_NVM_VER_HI_SHIFT) +#define ICE_OEM_EETRACK_ID 0xffffffff +#define ICE_OROM_VER_PATCH_SHIFT 0 +#define ICE_OROM_VER_PATCH_MASK (0xff << ICE_OROM_VER_PATCH_SHIFT) +#define ICE_OROM_VER_BUILD_SHIFT 8 +#define ICE_OROM_VER_BUILD_MASK (0xffff << ICE_OROM_VER_BUILD_SHIFT) +#define ICE_OROM_VER_SHIFT 24 +#define ICE_OROM_VER_MASK (0xff << ICE_OROM_VER_SHIFT) +#define ICE_SR_VPD_PTR 0x2F +#define ICE_SR_PXE_SETUP_PTR 0x30 +#define ICE_SR_PXE_CFG_CUST_OPTIONS_PTR 0x31 +#define ICE_SR_NVM_ORIGINAL_EETRACK_LO 0x34 +#define ICE_SR_NVM_ORIGINAL_EETRACK_HI 0x35 +#define ICE_SR_VLAN_CFG_PTR 0x37 +#define ICE_SR_POR_REGS_AUTO_LOAD_PTR 0x38 +#define ICE_SR_EMPR_REGS_AUTO_LOAD_PTR 0x3A +#define ICE_SR_GLOBR_REGS_AUTO_LOAD_PTR 0x3B +#define ICE_SR_CORER_REGS_AUTO_LOAD_PTR 0x3C +#define ICE_SR_PHY_CFG_SCRIPT_PTR 0x3D +#define ICE_SR_PCIE_ALT_AUTO_LOAD_PTR 0x3E +#define ICE_SR_SW_CHECKSUM_WORD 0x3F +#define ICE_SR_PFA_PTR 0x40 +#define ICE_SR_1ST_SCRATCH_PAD_PTR 0x41 +#define ICE_SR_1ST_NVM_BANK_PTR 0x42 +#define ICE_SR_NVM_BANK_SIZE 0x43 +#define ICE_SR_1ST_OROM_BANK_PTR 0x44 +#define ICE_SR_OROM_BANK_SIZE 0x45 +#define ICE_SR_NETLIST_BANK_PTR 0x46 +#define ICE_SR_NETLIST_BANK_SIZE 0x47 +#define ICE_SR_EMP_SR_SETTINGS_PTR 0x48 +#define ICE_SR_CONFIGURATION_METADATA_PTR 0x4D +#define ICE_SR_IMMEDIATE_VALUES_PTR 0x4E +#define ICE_SR_LINK_DEFAULT_OVERRIDE_PTR 0x134 +#define ICE_SR_POR_REGISTERS_AUTOLOAD_PTR 0x118 + +/* CSS Header words */ +#define ICE_NVM_CSS_SREV_L 0x14 +#define ICE_NVM_CSS_SREV_H 0x15 + +/* Length of CSS header section in words */ +#define ICE_CSS_HEADER_LENGTH 330 + +/* Offset of Shadow RAM copy in the NVM bank area. */ +#define ICE_NVM_SR_COPY_WORD_OFFSET roundup(ICE_CSS_HEADER_LENGTH, 32) + +/* Size in bytes of Option ROM trailer */ +#define ICE_NVM_OROM_TRAILER_LENGTH (2 * ICE_CSS_HEADER_LENGTH) + +/* The Link Topology Netlist section is stored as a series of words. It is + * stored in the NVM as a TLV, with the first two words containing the type + * and length. + */ +#define ICE_NETLIST_LINK_TOPO_MOD_ID 0x011B +#define ICE_NETLIST_TYPE_OFFSET 0x0000 +#define ICE_NETLIST_LEN_OFFSET 0x0001 + +/* The Link Topology section follows the TLV header. When reading the netlist + * using ice_read_netlist_module, we need to account for the 2-word TLV + * header. + */ +#define ICE_NETLIST_LINK_TOPO_OFFSET(n) ((n) + 2) + +#define ICE_LINK_TOPO_MODULE_LEN ICE_NETLIST_LINK_TOPO_OFFSET(0x0000) +#define ICE_LINK_TOPO_NODE_COUNT ICE_NETLIST_LINK_TOPO_OFFSET(0x0001) + +#define ICE_LINK_TOPO_NODE_COUNT_M ICE_M(0x3FF, 0) + +/* The Netlist ID Block is located after all of the Link Topology nodes. 
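The TLV framing described above is simple enough to show concretely: word 0 of the module holds the type (ICE_NETLIST_LINK_TOPO_MOD_ID, 0x011B), word 1 holds the length, and the payload starts at word 2, which is exactly the "+ 2" folded into ICE_NETLIST_LINK_TOPO_OFFSET(). Below is a small self-contained sketch with made-up payload words; the EX_* names are illustrative mirrors of the defines above, not driver symbols.

#include <assert.h>
#include <stdint.h>

#define EX_NETLIST_TYPE_OFFSET 0x0000
#define EX_NETLIST_LEN_OFFSET  0x0001
#define EX_LINK_TOPO_OFFSET(n) ((n) + 2)   /* skip the 2-word TLV header */

int main(void)
{
        /* hypothetical module: type 0x011B, length 3 words, payload a/b/c */
        uint16_t words[] = { 0x011B, 3, 0xAAAA, 0xBBBB, 0xCCCC };

        assert(words[EX_NETLIST_TYPE_OFFSET] == 0x011B);
        assert(words[EX_NETLIST_LEN_OFFSET] == 3);
        assert(words[EX_LINK_TOPO_OFFSET(0)] == 0xAAAA); /* first payload word */
        return 0;
}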
*/ +#define ICE_NETLIST_ID_BLK_SIZE 0x30 +#define ICE_NETLIST_ID_BLK_OFFSET(n) ICE_NETLIST_LINK_TOPO_OFFSET(0x0004 + 2 * (n)) + +/* netlist ID block field offsets (word offsets) */ +#define ICE_NETLIST_ID_BLK_MAJOR_VER_LOW 0x02 +#define ICE_NETLIST_ID_BLK_MAJOR_VER_HIGH 0x03 +#define ICE_NETLIST_ID_BLK_MINOR_VER_LOW 0x04 +#define ICE_NETLIST_ID_BLK_MINOR_VER_HIGH 0x05 +#define ICE_NETLIST_ID_BLK_TYPE_LOW 0x06 +#define ICE_NETLIST_ID_BLK_TYPE_HIGH 0x07 +#define ICE_NETLIST_ID_BLK_REV_LOW 0x08 +#define ICE_NETLIST_ID_BLK_REV_HIGH 0x09 +#define ICE_NETLIST_ID_BLK_SHA_HASH_WORD(n) (0x0A + (n)) +#define ICE_NETLIST_ID_BLK_CUST_VER 0x2F + +/* Auxiliary field, mask and shift definition for Shadow RAM and NVM Flash */ +#define ICE_SR_VPD_SIZE_WORDS 512 +#define ICE_SR_PCIE_ALT_SIZE_WORDS 512 +#define ICE_SR_CTRL_WORD_1_S 0x06 +#define ICE_SR_CTRL_WORD_1_M (0x03 << ICE_SR_CTRL_WORD_1_S) +#define ICE_SR_CTRL_WORD_VALID 0x1 +#define ICE_SR_CTRL_WORD_OROM_BANK BIT(3) +#define ICE_SR_CTRL_WORD_NETLIST_BANK BIT(4) +#define ICE_SR_CTRL_WORD_NVM_BANK BIT(5) + +#define ICE_SR_NVM_PTR_4KB_UNITS BIT(15) + +/* Shadow RAM related */ #define ICE_SR_SECTOR_SIZE_IN_WORDS 0x800 +#define ICE_SR_BUF_ALIGNMENT 4096 #define ICE_SR_WORDS_IN_1KB 512 - +/* Checksum should be calculated such that after adding all the words, + * including the checksum word itself, the sum should be 0xBABA. + */ +#define ICE_SR_SW_CHECKSUM_BASE 0xBABA + +/* Link override related */ +#define ICE_SR_PFA_LINK_OVERRIDE_WORDS 10 +#define ICE_SR_PFA_LINK_OVERRIDE_PHY_WORDS 4 +#define ICE_SR_PFA_LINK_OVERRIDE_OFFSET 2 +#define ICE_SR_PFA_LINK_OVERRIDE_FEC_OFFSET 1 +#define ICE_SR_PFA_LINK_OVERRIDE_PHY_OFFSET 2 +#define ICE_FW_API_LINK_OVERRIDE_MAJ 1 +#define ICE_FW_API_LINK_OVERRIDE_MIN 5 +#define ICE_FW_API_LINK_OVERRIDE_PATCH 2 + +#define ICE_PBA_FLAG_DFLT 0xFAFA /* Hash redirection LUT for VSI - maximum array size */ #define ICE_VSIQF_HLUT_ARRAY_SIZE ((VSIQF_HLUT_MAX_INDEX + 1) * 4) +/* + * Defines for values in the VF_PE_DB_SIZE bits in the GLPCI_LBARCTRL register. + * This is needed to determine the BAR0 space for the VFs + */ +#define GLPCI_LBARCTRL_VF_PE_DB_SIZE_0KB 0x0 +#define GLPCI_LBARCTRL_VF_PE_DB_SIZE_8KB 0x1 +#define GLPCI_LBARCTRL_VF_PE_DB_SIZE_64KB 0x2 + +/* AQ API version for LLDP_FILTER_CONTROL */ +#define ICE_FW_API_LLDP_FLTR_MAJ 1 +#define ICE_FW_API_LLDP_FLTR_MIN 7 +#define ICE_FW_API_LLDP_FLTR_PATCH 1 + +/* AQ API version for report default configuration */ +#define ICE_FW_API_REPORT_DFLT_CFG_MAJ 1 +#define ICE_FW_API_REPORT_DFLT_CFG_MIN 7 +#define ICE_FW_API_REPORT_DFLT_CFG_PATCH 3 + +/* AQ API version for FW health reports */ +#define ICE_FW_API_HEALTH_REPORT_MAJ 1 +#define ICE_FW_API_HEALTH_REPORT_MIN 7 +#define ICE_FW_API_HEALTH_REPORT_PATCH 6 #endif /* _ICE_TYPE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c new file mode 100644 index 000000000000..bd1254d01b80 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c @@ -0,0 +1,255 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice_vsi_vlan_ops.h" +#include "ice_vsi_vlan_lib.h" +#include "ice_vlan_mode.h" +#include "ice.h" +#include "ice_vf_vsi_vlan_ops.h" +#include "ice_virtchnl_pf.h" +#include "ice_lib.h" + +static int +noop_vlan_arg(struct ice_vsi __always_unused *vsi, + struct ice_vlan * __always_unused vlan) +{ + return 0; +} + +static int +noop_vlan(struct ice_vsi __always_unused *vsi) +{ + return 0; +} + +/** + * ice_vf_vsi_init_vlan_ops - Initialize default VSI VLAN ops for VF VSI + * @vsi: VF's VSI being configured + * + * If Double VLAN Mode (DVM) is enabled, assume that the VF supports the new + * VIRTCHNL_VF_VLAN_OFFLOAD_V2 capability and set up the VLAN ops accordingly. + * If SVM is enabled maintain the same level of VLAN support previous to + * VIRTCHNL_VF_VLAN_OFFLOAD_V2. + */ +void ice_vf_vsi_init_vlan_ops(struct ice_vsi *vsi) +{ + struct ice_vsi_vlan_ops *vlan_ops; + struct ice_pf *pf = vsi->back; + struct ice_vf *vf; + + vf = &pf->vf[vsi->vf_id]; + + if (ice_is_dvm_ena(&pf->hw)) { + vlan_ops = &vsi->outer_vlan_ops; + + /* outer VLAN ops regardless of port VLAN config */ + vlan_ops->add_vlan = ice_vsi_add_vlan; + vlan_ops->dis_rx_filtering = ice_vsi_dis_rx_vlan_filtering; + vlan_ops->ena_tx_filtering = ice_vsi_ena_tx_vlan_filtering; + vlan_ops->dis_tx_filtering = ice_vsi_dis_tx_vlan_filtering; + + if (ice_vf_is_port_vlan_ena(vf)) { + /* setup outer VLAN ops */ + vlan_ops->set_port_vlan = ice_vsi_set_outer_port_vlan; + vlan_ops->ena_rx_filtering = + ice_vsi_ena_rx_vlan_filtering; + + /* setup inner VLAN ops */ + vlan_ops = &vsi->inner_vlan_ops; + vlan_ops->add_vlan = noop_vlan_arg; + vlan_ops->del_vlan = noop_vlan_arg; + vlan_ops->ena_stripping = ice_vsi_ena_inner_stripping; + vlan_ops->dis_stripping = ice_vsi_dis_inner_stripping; + vlan_ops->ena_insertion = ice_vsi_ena_inner_insertion; + vlan_ops->dis_insertion = ice_vsi_dis_inner_insertion; + } else { + if (test_bit(ICE_FLAG_VF_VLAN_PRUNE_DIS, pf->flags)) + vlan_ops->ena_rx_filtering = noop_vlan; + else + vlan_ops->ena_rx_filtering = + ice_vsi_ena_rx_vlan_filtering; + + vlan_ops->del_vlan = ice_vsi_del_vlan; + vlan_ops->ena_stripping = ice_vsi_ena_outer_stripping; + vlan_ops->dis_stripping = ice_vsi_dis_outer_stripping; + vlan_ops->ena_insertion = ice_vsi_ena_outer_insertion; + vlan_ops->dis_insertion = ice_vsi_dis_outer_insertion; + + /* setup inner VLAN ops */ + vlan_ops = &vsi->inner_vlan_ops; + + vlan_ops->ena_stripping = ice_vsi_ena_inner_stripping; + vlan_ops->dis_stripping = ice_vsi_dis_inner_stripping; + vlan_ops->ena_insertion = ice_vsi_ena_inner_insertion; + vlan_ops->dis_insertion = ice_vsi_dis_inner_insertion; + } + } else { + vlan_ops = &vsi->inner_vlan_ops; + + /* inner VLAN ops regardless of port VLAN config */ + vlan_ops->add_vlan = ice_vsi_add_vlan; + vlan_ops->dis_rx_filtering = ice_vsi_dis_rx_vlan_filtering; + vlan_ops->ena_tx_filtering = ice_vsi_ena_tx_vlan_filtering; + vlan_ops->dis_tx_filtering = ice_vsi_dis_tx_vlan_filtering; + + if (ice_vf_is_port_vlan_ena(vf)) { + vlan_ops->set_port_vlan = ice_vsi_set_inner_port_vlan; + vlan_ops->ena_rx_filtering = + ice_vsi_ena_rx_vlan_filtering; + } else { + if (test_bit(ICE_FLAG_VF_VLAN_PRUNE_DIS, pf->flags)) + vlan_ops->ena_rx_filtering = noop_vlan; + else + vlan_ops->ena_rx_filtering = + ice_vsi_ena_rx_vlan_filtering; + + vlan_ops->del_vlan = ice_vsi_del_vlan; + vlan_ops->ena_stripping = ice_vsi_ena_inner_stripping; + vlan_ops->dis_stripping = ice_vsi_dis_inner_stripping; + vlan_ops->ena_insertion = ice_vsi_ena_inner_insertion; + vlan_ops->dis_insertion = 
ice_vsi_dis_inner_insertion; + } + } +} + +/** + * ice_vf_vsi_cfg_dvm_legacy_vlan_mode - Config VLAN mode for old VFs in DVM + * @vsi: VF's VSI being configured + * + * This should only be called when Double VLAN Mode (DVM) is enabled, there + * is not a port VLAN enabled on this VF, and the VF negotiates + * VIRTCHNL_VF_OFFLOAD_VLAN. + * + * This function sets up the VF VSI's inner and outer ice_vsi_vlan_ops and also + * initializes software only VLAN mode (i.e. allow all VLANs). Also, use no-op + * implementations for any functions that may be called during the lifetime of + * the VF so these methods do nothing and succeed. + */ +void ice_vf_vsi_cfg_dvm_legacy_vlan_mode(struct ice_vsi *vsi) +{ + struct ice_vf *vf = &vsi->back->vf[vsi->vf_id]; + struct ice_vsi_vlan_ops *vlan_ops; + struct device *dev = ice_pf_to_dev(vf->pf); + + if (!ice_is_dvm_ena(&vsi->back->hw) || ice_vf_is_port_vlan_ena(vf)) + return; + + vlan_ops = &vsi->outer_vlan_ops; + + /* Rx VLAN filtering always disabled to allow software offloaded VLANs + * for VFs that only support VIRTCHNL_VF_OFFLOAD_VLAN and don't have a + * port VLAN configured + */ + vlan_ops->dis_rx_filtering = ice_vsi_dis_rx_vlan_filtering; + /* Don't fail when attempting to enable Rx VLAN filtering */ + vlan_ops->ena_rx_filtering = noop_vlan; + + /* Tx VLAN filtering always disabled to allow software offloaded VLANs + * for VFs that only support VIRTCHNL_VF_OFFLOAD_VLAN and don't have a + * port VLAN configured + */ + vlan_ops->dis_tx_filtering = ice_vsi_dis_tx_vlan_filtering; + /* Don't fail when attempting to enable Tx VLAN filtering */ + vlan_ops->ena_tx_filtering = noop_vlan; + + if (vlan_ops->dis_rx_filtering(vsi)) + dev_dbg(dev, "Failed to disable Rx VLAN filtering for old VF without VIRTCHNL_VF_OFFLOAD_VLAN_V2 support\n"); + if (vlan_ops->dis_tx_filtering(vsi)) + dev_dbg(dev, "Failed to disable Tx VLAN filtering for old VF without VIRTHCNL_VF_OFFLOAD_VLAN_V2 support\n"); + + /* All outer VLAN offloads must be disabled */ + vlan_ops->dis_stripping = ice_vsi_dis_outer_stripping; + vlan_ops->dis_insertion = ice_vsi_dis_outer_insertion; + + if (vlan_ops->dis_stripping(vsi)) + dev_dbg(dev, "Failed to disable outer VLAN stripping for old VF without VIRTCHNL_VF_OFFLOAD_VLAN_V2 support\n"); + + if (vlan_ops->dis_insertion(vsi)) + dev_dbg(dev, "Failed to disable outer VLAN insertion for old VF without VIRTCHNL_VF_OFFLOAD_VLAN_V2 support\n"); + + /* All inner VLAN offloads must be disabled */ + vlan_ops = &vsi->inner_vlan_ops; + + vlan_ops->dis_stripping = ice_vsi_dis_outer_stripping; + vlan_ops->dis_insertion = ice_vsi_dis_outer_insertion; + + if (vlan_ops->dis_stripping(vsi)) + dev_dbg(dev, "Failed to disable inner VLAN stripping for old VF without VIRTCHNL_VF_OFFLOAD_VLAN_V2 support\n"); + + if (vlan_ops->dis_insertion(vsi)) + dev_dbg(dev, "Failed to disable inner VLAN insertion for old VF without VIRTCHNL_VF_OFFLOAD_VLAN_V2 support\n"); +} + +/** + * ice_vf_vsi_cfg_svm_legacy_vlan_mode - Config VLAN mode for old VFs in SVM + * @vsi: VF's VSI being configured + * + * This should only be called when Single VLAN Mode (SVM) is enabled, there is + * not a port VLAN enabled on this VF, and the VF negotiates + * VIRTCHNL_VF_OFFLOAD_VLAN. + * + * All of the normal SVM VLAN ops are identical for this case. However, by + * default Rx VLAN filtering should be turned off by default in this case. 
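Both legacy-mode helpers above lean on the same design: the per-VSI VLAN operations are a table of function pointers, and any operation that must never fail for an old VF is pointed at a no-op that simply reports success, while operations that really have to touch hardware keep their real handlers. A reduced, self-contained sketch of that pattern follows; the types and names are illustrative, not the driver's.

#include <stdio.h>

struct example_vsi;

struct example_vlan_ops {
        int (*ena_rx_filtering)(struct example_vsi *vsi);
        int (*dis_rx_filtering)(struct example_vsi *vsi);
};

static int example_real_dis_rx_filtering(struct example_vsi *vsi)
{
        (void)vsi;
        printf("programming HW: Rx VLAN filtering off\n");
        return 0;
}

/* No-op used when the operation must silently succeed for legacy VFs. */
static int example_noop(struct example_vsi *vsi)
{
        (void)vsi;
        return 0;
}

static void example_init_legacy_ops(struct example_vlan_ops *ops)
{
        ops->dis_rx_filtering = example_real_dis_rx_filtering;
        ops->ena_rx_filtering = example_noop;   /* "don't fail" on enable */
}

int main(void)
{
        struct example_vlan_ops ops;

        example_init_legacy_ops(&ops);
        ops.dis_rx_filtering(NULL);
        return ops.ena_rx_filtering(NULL);      /* succeeds without touching HW */
}

This is the same idea that lets the DVM path above install noop_vlan/noop_vlan_arg once, instead of sprinkling capability checks through every caller.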
+ */ +void ice_vf_vsi_cfg_svm_legacy_vlan_mode(struct ice_vsi *vsi) +{ + struct ice_vf *vf = &vsi->back->vf[vsi->vf_id]; + + if (ice_is_dvm_ena(&vsi->back->hw) || ice_vf_is_port_vlan_ena(vf)) + return; + + if (vsi->inner_vlan_ops.dis_rx_filtering(vsi)) + dev_dbg(ice_pf_to_dev(vf->pf), "Failed to disable Rx VLAN filtering for old VF with VIRTCHNL_VF_OFFLOAD_VLAN support\n"); +} + +/** + * ice_vf_vsi_dcf_set_outer_port_vlan - Config outer port VLAN for VF in DVM + * @vsi: VF's VSI being configured + * @vlan: ice_vlan structure used to set the port VLAN + */ +int ice_vf_vsi_dcf_set_outer_port_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + struct ice_vf *vf = &vsi->back->vf[vsi->vf_id]; + int err; + + if (!ice_is_dvm_ena(&vsi->back->hw) || ice_vf_is_port_vlan_ena(vf)) + return -EOPNOTSUPP; + + err = ice_vsi_set_outer_port_vlan(vsi, vlan); + if (err) + return err; + + err = ice_vsi_add_vlan(vsi, vlan); + if (err) + return err; + + vsi->outer_vlan_ops.add_vlan = noop_vlan_arg; + + return 0; +} + +/** + * ice_vf_vsi_dcf_ena_outer_vlan_stripping - Enable outer VLAN stripping for VF in DVM + * @vsi: VF's VSI being configured + * @tpid: TPID to enable outer VLAN stripping for + */ +int ice_vf_vsi_dcf_ena_outer_vlan_stripping(struct ice_vsi *vsi, u16 tpid) +{ + struct ice_vf *vf = &vsi->back->vf[vsi->vf_id]; + int err; + + if (!ice_is_dvm_ena(&vsi->back->hw) || ice_vf_is_port_vlan_ena(vf)) + return -EOPNOTSUPP; + + err = ice_vsi_add_vlan_zero(vsi); + if (err) + return err; + + err = ice_vsi_ena_outer_stripping(vsi, tpid); + if (err) + return err; + + vsi->outer_vlan_ops.add_vlan = noop_vlan_arg; + + return 0; +} diff --git a/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.h b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.h new file mode 100644 index 000000000000..c02936e6aa31 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_VF_VSI_VLAN_OPS_H_ +#define _ICE_VF_VSI_VLAN_OPS_H_ + +#include "ice_vsi_vlan_ops.h" + +struct ice_vsi; + +void ice_vf_vsi_cfg_dvm_legacy_vlan_mode(struct ice_vsi *vsi); +void ice_vf_vsi_cfg_svm_legacy_vlan_mode(struct ice_vsi *vsi); +int ice_vf_vsi_dcf_set_outer_port_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan); +int ice_vf_vsi_dcf_ena_outer_vlan_stripping(struct ice_vsi *vsi, u16 tpid); +#ifdef CONFIG_PCI_IOV +void ice_vf_vsi_init_vlan_ops(struct ice_vsi *vsi); +#else +static inline void ice_vf_vsi_init_vlan_ops(struct ice_vsi *vsi) { } +#endif /* CONFIG_PCI_IOV */ + +#endif /* _ICE_PF_VSI_VLAN_OPS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c new file mode 100644 index 000000000000..ffbbe0bfc3d0 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice_virtchnl_allowlist.h" + +/* Purpose of this file is to share functionality to allowlist or denylist + * opcodes used in PF <-> VF communication. 
Group of opcodes: + * - default -> should be always allowed after creating VF, + * default_allowlist_opcodes + * - opcodes needed by VF to work correctly, but not associated with caps -> + * should be allowed after successful VF resources allocation, + * working_allowlist_opcodes + * - opcodes needed by VF when caps are activated + * + * Caps that don't use new opcodes (no opcodes should be allowed): + * - VIRTCHNL_VF_OFFLOAD_RSS_AQ + * - VIRTCHNL_VF_OFFLOAD_RSS_REG + * - VIRTCHNL_VF_OFFLOAD_WB_ON_ITR + * - VIRTCHNL_VF_OFFLOAD_CRC + * - VIRTCHNL_VF_OFFLOAD_RX_POLLING + * - VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2 + * - VIRTCHNL_VF_OFFLOAD_ENCAP + * - VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM + * - VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM + * - VIRTCHNL_VF_OFFLOAD_USO + */ + +/* default opcodes to communicate with VF */ +static const u32 default_allowlist_opcodes[] = { + VIRTCHNL_OP_GET_VF_RESOURCES, VIRTCHNL_OP_VERSION, VIRTCHNL_OP_RESET_VF, +}; + +/* opcodes supported after successful VIRTCHNL_OP_GET_VF_RESOURCES */ +static const u32 working_allowlist_opcodes[] = { + VIRTCHNL_OP_CONFIG_TX_QUEUE, VIRTCHNL_OP_CONFIG_RX_QUEUE, + VIRTCHNL_OP_CONFIG_VSI_QUEUES, VIRTCHNL_OP_CONFIG_IRQ_MAP, + VIRTCHNL_OP_ENABLE_QUEUES, VIRTCHNL_OP_DISABLE_QUEUES, + VIRTCHNL_OP_GET_STATS, VIRTCHNL_OP_EVENT, +}; + +/* VIRTCHNL_VF_OFFLOAD_L2 */ +static const u32 l2_allowlist_opcodes[] = { + VIRTCHNL_OP_ADD_ETH_ADDR, VIRTCHNL_OP_DEL_ETH_ADDR, + VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE, +}; + +/* VIRTCHNL_VF_CAP_RDMA */ +static const u32 rdma_allowlist_opcodes[] = { + VIRTCHNL_OP_RDMA, VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP, + VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP, +}; + +/* VIRTCHNL_VF_OFFLOAD_REQ_QUEUES */ +static const u32 req_queues_allowlist_opcodes[] = { + VIRTCHNL_OP_REQUEST_QUEUES, +}; + +/* VIRTCHNL_VF_OFFLOAD_VLAN */ +static const u32 vlan_allowlist_opcodes[] = { + VIRTCHNL_OP_ADD_VLAN, VIRTCHNL_OP_DEL_VLAN, + VIRTCHNL_OP_ENABLE_VLAN_STRIPPING, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING, +}; + +/* VIRTCHNL_VF_OFFLOAD_VLAN_V2 */ +static const u32 vlan_v2_allowlist_opcodes[] = { + VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS, VIRTCHNL_OP_ADD_VLAN_V2, + VIRTCHNL_OP_DEL_VLAN_V2, VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2, + VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2, + VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2, + VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2, +}; + +/* VIRTCHNL_VF_OFFLOAD_RSS_PF */ +static const u32 rss_pf_allowlist_opcodes[] = { + VIRTCHNL_OP_CONFIG_RSS_KEY, VIRTCHNL_OP_CONFIG_RSS_LUT, + VIRTCHNL_OP_GET_RSS_HENA_CAPS, VIRTCHNL_OP_SET_RSS_HENA, +}; + +/* VIRTCHNL_VF_OFFLOAD_ADQ */ +static const u32 adq_allowlist_opcodes[] = { + VIRTCHNL_OP_ENABLE_CHANNELS, VIRTCHNL_OP_DISABLE_CHANNELS, + VIRTCHNL_OP_ADD_CLOUD_FILTER, VIRTCHNL_OP_DEL_CLOUD_FILTER, +}; + +/* VIRTCHNL_VF_OFFLOAD_ADQ_V2 */ +static const u32 adq_v2_allowlist_opcodes[] = { + VIRTCHNL_OP_ENABLE_CHANNELS, VIRTCHNL_OP_DISABLE_CHANNELS, + VIRTCHNL_OP_ADD_CLOUD_FILTER, VIRTCHNL_OP_DEL_CLOUD_FILTER, +}; + +/* VIRTCHNL_VF_CAP_DCF */ +static const u32 cap_dcf_allowlist_opcodes[] = { + VIRTCHNL_OP_DCF_VLAN_OFFLOAD, + VIRTCHNL_OP_DCF_CMD_DESC, VIRTCHNL_OP_DCF_CMD_BUFF, + VIRTCHNL_OP_DCF_DISABLE, VIRTCHNL_OP_DCF_GET_VSI_MAP, + VIRTCHNL_OP_DCF_GET_PKG_INFO, + VIRTCHNL_OP_DCF_RULE_FLUSH, +}; + +/* VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC */ +static const u32 rx_flex_desc_allowlist_opcodes[] = { + VIRTCHNL_OP_GET_SUPPORTED_RXDIDS, +}; + +/* VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF */ +static const u32 adv_rss_pf_allowlist_opcodes[] = { + VIRTCHNL_OP_ADD_RSS_CFG, VIRTCHNL_OP_DEL_RSS_CFG, +}; + +/* VIRTCHNL_VF_OFFLOAD_FDIR_PF */ +static const u32 
fdir_pf_allowlist_opcodes[] = { + VIRTCHNL_OP_ADD_FDIR_FILTER, VIRTCHNL_OP_DEL_FDIR_FILTER, +}; + + +static const u32 large_num_qpairs_allowlist_opcodes[] = { + VIRTCHNL_OP_GET_MAX_RSS_QREGION, + VIRTCHNL_OP_ENABLE_QUEUES_V2, + VIRTCHNL_OP_DISABLE_QUEUES_V2, + VIRTCHNL_OP_MAP_QUEUE_VECTOR, +}; + +struct allowlist_opcode_info { + const u32 *opcodes; + size_t size; +}; + +#define BIT_INDEX(caps) (HWEIGHT((caps) - 1)) +#define ALLOW_ITEM(caps, list) \ + [BIT_INDEX(caps)] = { \ + .opcodes = list, \ + .size = ARRAY_SIZE(list) \ + } +static const struct allowlist_opcode_info allowlist_opcodes[] = { + ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_L2, l2_allowlist_opcodes), + ALLOW_ITEM(VIRTCHNL_VF_CAP_RDMA, rdma_allowlist_opcodes), + ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_REQ_QUEUES, req_queues_allowlist_opcodes), + ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_VLAN, vlan_allowlist_opcodes), + ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_RSS_PF, rss_pf_allowlist_opcodes), + ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_ADQ, adq_allowlist_opcodes), + ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_ADQ_V2, adq_v2_allowlist_opcodes), + ALLOW_ITEM(VIRTCHNL_VF_CAP_DCF, cap_dcf_allowlist_opcodes), + ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC, rx_flex_desc_allowlist_opcodes), + ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF, adv_rss_pf_allowlist_opcodes), + ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_FDIR_PF, fdir_pf_allowlist_opcodes), + ALLOW_ITEM(VIRTCHNL_VF_LARGE_NUM_QPAIRS, large_num_qpairs_allowlist_opcodes), + ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_VLAN_V2, vlan_v2_allowlist_opcodes), +}; + +/** + * ice_vc_is_opcode_allowed - check if this opcode is allowed on this VF + * @vf: pointer to VF structure + * @opcode: virtchnl opcode + * + * Return true if message is allowed on this VF + */ +bool ice_vc_is_opcode_allowed(struct ice_vf *vf, u32 opcode) +{ + if (opcode >= VIRTCHNL_OP_MAX) + return false; + + return test_bit(opcode, vf->opcodes_allowlist); +} + +/** + * ice_vc_allowlist_opcodes - allowlist selected opcodes + * @vf: pointer to VF structure + * @opcodes: array of opocodes to allowlist + * @size: size of opcodes array + * + * Function should be called to allowlist opcodes on VF. + */ +static void +ice_vc_allowlist_opcodes(struct ice_vf *vf, const u32 *opcodes, size_t size) +{ + unsigned int i; + + for (i = 0; i < size; i++) + set_bit(opcodes[i], vf->opcodes_allowlist); +} + +/** + * ice_vc_clear_allowlist - clear all allowlist opcodes + * @vf: pointer to VF structure + */ +static void ice_vc_clear_allowlist(struct ice_vf *vf) +{ + bitmap_zero(vf->opcodes_allowlist, VIRTCHNL_OP_MAX); +} + +/** + * ice_vc_set_default_allowlist - allowlist default opcodes for VF + * @vf: pointer to VF structure + */ +void ice_vc_set_default_allowlist(struct ice_vf *vf) +{ + ice_vc_clear_allowlist(vf); + ice_vc_allowlist_opcodes(vf, default_allowlist_opcodes, + ARRAY_SIZE(default_allowlist_opcodes)); +} + +/** + * ice_vc_set_working_allowlist - allowlist opcodes needed to by VF to work + * @vf: pointer to VF structure + * + * Allowlist opcodes that aren't associated with specific caps, but + * are needed by VF to work. 
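The ALLOW_ITEM()/BIT_INDEX() helpers above place each opcode list at the index of its capability bit: for a single-bit flag, HWEIGHT(cap - 1) counts all of the lower bits and therefore equals the bit position, which is the same index for_each_set_bit() walks in ice_vc_set_caps_allowlist(). A tiny standalone check of that identity follows; __builtin_popcount() (GCC/Clang) stands in for the kernel's HWEIGHT, and the EX_* names are illustrative.

#include <assert.h>
#include <stdint.h>

#define EX_BIT(n)         (1u << (n))
#define EX_BIT_INDEX(cap) ((unsigned int)__builtin_popcount((cap) - 1u))

int main(void)
{
        unsigned int n;

        /* e.g. EX_BIT(5) - 1 == 0b11111, whose population count is 5 */
        for (n = 0; n < 32; n++) {
                uint32_t cap = EX_BIT(n);

                assert(EX_BIT_INDEX(cap) == n);
        }
        return 0;
}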
+ */ +void ice_vc_set_working_allowlist(struct ice_vf *vf) +{ + ice_vc_allowlist_opcodes(vf, working_allowlist_opcodes, + ARRAY_SIZE(working_allowlist_opcodes)); +} + +/** + * ice_vc_set_caps_allowlist - allowlist VF opcodes according caps + * @vf: pointer to VF structure + */ +void ice_vc_set_caps_allowlist(struct ice_vf *vf) +{ + unsigned long caps = vf->driver_caps; + unsigned int i; + + for_each_set_bit(i, &caps, ARRAY_SIZE(allowlist_opcodes)) + ice_vc_allowlist_opcodes(vf, allowlist_opcodes[i].opcodes, + allowlist_opcodes[i].size); +} diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.h new file mode 100644 index 000000000000..c33bc6ac3f54 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_VIRTCHNL_ALLOWLIST_H_ +#define _ICE_VIRTCHNL_ALLOWLIST_H_ +#include "ice.h" + +bool ice_vc_is_opcode_allowed(struct ice_vf *vf, u32 opcode); + +void ice_vc_set_default_allowlist(struct ice_vf *vf); +void ice_vc_set_working_allowlist(struct ice_vf *vf); +void ice_vc_set_caps_allowlist(struct ice_vf *vf); +#endif diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c new file mode 100644 index 000000000000..a02ebef5772a --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c @@ -0,0 +1,2267 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice.h" +#include "ice_base.h" +#include "ice_lib.h" + +#define to_fltr_conf_from_desc(p) \ + container_of(p, struct virtchnl_fdir_fltr_conf, input) + +#define ICE_FLOW_PROF_TYPE_S 0 +#define ICE_FLOW_PROF_TYPE_M (0xFFFFFFFFULL << ICE_FLOW_PROF_TYPE_S) +#define ICE_FLOW_PROF_VSI_S 32 +#define ICE_FLOW_PROF_VSI_M (0xFFFFFFFFULL << ICE_FLOW_PROF_VSI_S) + +/* Flow profile ID format: + * [0:31] - flow type, flow + tun_offs + * [32:63] - VSI index + */ +#define ICE_FLOW_PROF_FD(vsi, flow, tun_offs) \ + (u64)(((((flow) + (tun_offs)) & ICE_FLOW_PROF_TYPE_M)) | \ + (((u64)(vsi) << ICE_FLOW_PROF_VSI_S) & ICE_FLOW_PROF_VSI_M)) + +#define GTPU_TEID_OFFSET 4 +#define GTPU_EH_QFI_OFFSET 1 +#define GTPU_EH_QFI_MASK 0x3F +#define PFCP_S_OFFSET 0 +#define PFCP_S_MASK 0x1 +#define PFCP_PORT_NR 8805 + +#define FDIR_INSET_FLAG_ESP_S 0 +#define FDIR_INSET_FLAG_ESP_M BIT_ULL(FDIR_INSET_FLAG_ESP_S) +#define FDIR_INSET_FLAG_ESP_UDP BIT_ULL(FDIR_INSET_FLAG_ESP_S) +#define FDIR_INSET_FLAG_ESP_IPSEC (0ULL << FDIR_INSET_FLAG_ESP_S) + +#define FDIR_INSET_FLAG_ECPRI_S 1 +#define FDIR_INSET_FLAG_ECPRI_M BIT_ULL(FDIR_INSET_FLAG_ECPRI_S) +#define FDIR_INSET_FLAG_ECPRI_UDP BIT_ULL(FDIR_INSET_FLAG_ECPRI_S) +#define FDIR_INSET_FLAG_ECPRI_MAC (0ULL << FDIR_INSET_FLAG_ECPRI_S) + +enum ice_fdir_tunnel_type { + ICE_FDIR_TUNNEL_TYPE_NONE = 0, + ICE_FDIR_TUNNEL_TYPE_GTPU, + ICE_FDIR_TUNNEL_TYPE_GTPU_EH, + ICE_FDIR_TUNNEL_TYPE_ECPRI, +}; + +struct virtchnl_fdir_fltr_conf { + struct ice_fdir_fltr input; + enum ice_fdir_tunnel_type ttype; + u64 inset_flag; + u32 flow_id; +}; + +struct virtchnl_fdir_inset_map { + enum virtchnl_proto_hdr_field field; + enum ice_flow_field fld; + u64 flag; + u64 mask; +}; + +static const struct virtchnl_fdir_inset_map fdir_inset_map[] = { + {VIRTCHNL_PROTO_HDR_ETH_ETHERTYPE, ICE_FLOW_FIELD_IDX_ETH_TYPE, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV4_SRC, ICE_FLOW_FIELD_IDX_IPV4_SA, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV4_DST, 
ICE_FLOW_FIELD_IDX_IPV4_DA, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV4_DSCP, ICE_FLOW_FIELD_IDX_IPV4_DSCP, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV4_TTL, ICE_FLOW_FIELD_IDX_IPV4_TTL, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV4_PROT, ICE_FLOW_FIELD_IDX_IPV4_PROT, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV6_SRC, ICE_FLOW_FIELD_IDX_IPV6_SA, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV6_DST, ICE_FLOW_FIELD_IDX_IPV6_DA, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV6_TC, ICE_FLOW_FIELD_IDX_IPV6_DSCP, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV6_HOP_LIMIT, ICE_FLOW_FIELD_IDX_IPV6_TTL, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV6_PROT, ICE_FLOW_FIELD_IDX_IPV6_PROT, + 0, 0}, + {VIRTCHNL_PROTO_HDR_UDP_SRC_PORT, ICE_FLOW_FIELD_IDX_UDP_SRC_PORT, + 0, 0}, + {VIRTCHNL_PROTO_HDR_UDP_DST_PORT, ICE_FLOW_FIELD_IDX_UDP_DST_PORT, + 0, 0}, + {VIRTCHNL_PROTO_HDR_TCP_SRC_PORT, ICE_FLOW_FIELD_IDX_TCP_SRC_PORT, + 0, 0}, + {VIRTCHNL_PROTO_HDR_TCP_DST_PORT, ICE_FLOW_FIELD_IDX_TCP_DST_PORT, + 0, 0}, + {VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT, ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT, + 0, 0}, + {VIRTCHNL_PROTO_HDR_SCTP_DST_PORT, ICE_FLOW_FIELD_IDX_SCTP_DST_PORT, + 0, 0}, + {VIRTCHNL_PROTO_HDR_GTPU_IP_TEID, ICE_FLOW_FIELD_IDX_GTPU_IP_TEID, + 0, 0}, + {VIRTCHNL_PROTO_HDR_GTPU_EH_QFI, ICE_FLOW_FIELD_IDX_GTPU_EH_QFI, + 0, 0}, + {VIRTCHNL_PROTO_HDR_ESP_SPI, ICE_FLOW_FIELD_IDX_ESP_SPI, + FDIR_INSET_FLAG_ESP_IPSEC, FDIR_INSET_FLAG_ESP_M}, + {VIRTCHNL_PROTO_HDR_ESP_SPI, ICE_FLOW_FIELD_IDX_NAT_T_ESP_SPI, + FDIR_INSET_FLAG_ESP_UDP, FDIR_INSET_FLAG_ESP_M}, + {VIRTCHNL_PROTO_HDR_AH_SPI, ICE_FLOW_FIELD_IDX_AH_SPI, + 0, 0}, + {VIRTCHNL_PROTO_HDR_L2TPV3_SESS_ID, ICE_FLOW_FIELD_IDX_L2TPV3_SESS_ID, + 0, 0}, + {VIRTCHNL_PROTO_HDR_PFCP_S_FIELD, ICE_FLOW_FIELD_IDX_UDP_DST_PORT, + 0, 0}, + { + VIRTCHNL_PROTO_HDR_ECPRI_PC_RTC_ID, + ICE_FLOW_FIELD_IDX_ECPRI_TP0_PC_ID, + FDIR_INSET_FLAG_ECPRI_MAC, + FDIR_INSET_FLAG_ECPRI_M + }, + { + VIRTCHNL_PROTO_HDR_ECPRI_PC_RTC_ID, + ICE_FLOW_FIELD_IDX_UDP_ECPRI_TP0_PC_ID, + FDIR_INSET_FLAG_ECPRI_UDP, + FDIR_INSET_FLAG_ECPRI_M + }, +}; + +/** + * ice_vc_fdir_param_check + * @vf: pointer to the VF structure + * @vsi_id: VF relative VSI ID + * + * Check for the valid VSI ID, PF's state and VF's state + * + * Return: 0 on success, and -EINVAL on error. + */ +static int +ice_vc_fdir_param_check(struct ice_vf *vf, u16 vsi_id) +{ + struct ice_pf *pf = vf->pf; + + if (!test_bit(ICE_FLAG_FD_ENA, pf->flags)) + return -EINVAL; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) + return -EINVAL; + + if (!(vf->driver_caps & VIRTCHNL_VF_OFFLOAD_FDIR_PF)) + return -EINVAL; + + if (vsi_id != vf->lan_vsi_num) + return -EINVAL; + + if (!ice_vc_isvalid_vsi_id(vf, vsi_id)) + return -EINVAL; + + if (!ice_get_vf_vsi(vf)) + return -EINVAL; + + return 0; +} + +/** + * ice_vf_start_ctrl_vsi + * @vf: pointer to the VF structure + * + * Allocate ctrl_vsi for the first time and open the ctrl_vsi port for VF + * + * Return: 0 on success, and other on error. 
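The flow profile IDs used throughout this file follow the packing documented near the top of ice_virtchnl_fdir.c (bits 0:31 carry the flow type plus the tunnel offset, bits 32:63 carry the VSI index), mirroring what ICE_FLOW_PROF_FD() builds. Here is a small worked example with made-up vsi/flow/tunnel values; the example_ helper is an illustration of the layout, not the driver macro itself.

#include <assert.h>
#include <stdint.h>

static uint64_t example_flow_prof_fd(uint32_t vsi, uint32_t flow,
                                     uint32_t tun_offs)
{
        return (uint64_t)((flow + tun_offs) & 0xFFFFFFFFu) |
               ((uint64_t)vsi << 32);
}

int main(void)
{
        /* e.g. VSI index 0x0A, flow type 3, non-tunnel (offset 0) */
        uint64_t id = example_flow_prof_fd(0x0A, 3, 0);

        assert(id == 0x0000000A00000003ULL);
        return 0;
}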
+ */ +static int ice_vf_start_ctrl_vsi(struct ice_vf *vf) +{ + struct ice_pf *pf = vf->pf; + struct ice_vsi *ctrl_vsi; + struct device *dev; + int err; + + dev = ice_pf_to_dev(pf); + if (vf->ctrl_vsi_idx != ICE_NO_VSI) + return -EEXIST; + + ctrl_vsi = ice_vf_ctrl_vsi_setup(vf); + if (!ctrl_vsi) { + dev_dbg(dev, "Could not setup control VSI for VF %d\n", + vf->vf_id); + return -ENOMEM; + } + + err = ice_vsi_open_ctrl(ctrl_vsi); + if (err) { + dev_dbg(dev, "Could not open control VSI for VF %d\n", + vf->vf_id); + goto err_vsi_open; + } + + return 0; + +err_vsi_open: + ice_vsi_release(ctrl_vsi); + if (vf->ctrl_vsi_idx != ICE_NO_VSI) { + pf->vsi[vf->ctrl_vsi_idx] = NULL; + vf->ctrl_vsi_idx = ICE_NO_VSI; + } + return err; +} + +/** + * ice_vc_fdir_alloc_prof - allocate profile for this filter flow type + * @vf: pointer to the VF structure + * @flow: filter flow type + * + * Return: 0 on success, and other on error. + */ +static int +ice_vc_fdir_alloc_prof(struct ice_vf *vf, enum ice_fltr_ptype flow) +{ + struct ice_vf_fdir *fdir = &vf->fdir; + + if (!fdir->fdir_prof) { + fdir->fdir_prof = kcalloc(ICE_FLTR_PTYPE_MAX, + sizeof(*fdir->fdir_prof), + GFP_KERNEL); + if (!fdir->fdir_prof) + return -ENOMEM; + } + + if (!fdir->fdir_prof[flow]) { + fdir->fdir_prof[flow] = kzalloc(sizeof(**fdir->fdir_prof), + GFP_KERNEL); + if (!fdir->fdir_prof[flow]) + return -ENOMEM; + } + + return 0; +} + +/** + * ice_vc_fdir_free_prof - free profile for this filter flow type + * @vf: pointer to the VF structure + * @flow: filter flow type + */ +static void +ice_vc_fdir_free_prof(struct ice_vf *vf, enum ice_fltr_ptype flow) +{ + struct ice_vf_fdir *fdir = &vf->fdir; + + if (!fdir->fdir_prof) + return; + + if (!fdir->fdir_prof[flow]) + return; + + kfree(fdir->fdir_prof[flow]); + fdir->fdir_prof[flow] = NULL; +} + +/** + * ice_vc_fdir_free_prof_all - free all the profile for this VF + * @vf: pointer to the VF structure + */ +void ice_vc_fdir_free_prof_all(struct ice_vf *vf) +{ + struct ice_vf_fdir *fdir = &vf->fdir; + enum ice_fltr_ptype flow; + + if (!fdir->fdir_prof) + return; + + for (flow = ICE_FLTR_PTYPE_NONF_NONE; flow < ICE_FLTR_PTYPE_MAX; flow++) + ice_vc_fdir_free_prof(vf, flow); + + kfree(fdir->fdir_prof); + fdir->fdir_prof = NULL; +} + +/** + * ice_vc_fdir_parse_flow_fld + * @vf: pointer to the VF structure + * @proto_hdr: virtual channel protocol filter header + * @conf: FDIR configuration for each filter + * @fld: field type array + * @fld_cnt: field counter + * + * Parse the virtual channel filter header and store them into field type array + * + * Return: 0 on success, and other on error. 
+ */ +static int +ice_vc_fdir_parse_flow_fld(struct ice_vf *vf, + struct virtchnl_proto_hdr *proto_hdr, + struct virtchnl_fdir_fltr_conf *conf, + enum ice_flow_field *fld, + int *fld_cnt) +{ + struct virtchnl_proto_hdr hdr; + u32 i; + + memcpy(&hdr, proto_hdr, sizeof(hdr)); + + for (i = 0; (i < ARRAY_SIZE(fdir_inset_map)) && + VIRTCHNL_GET_PROTO_HDR_FIELD(&hdr); i++) { + if (VIRTCHNL_TEST_PROTO_HDR(&hdr, fdir_inset_map[i].field)) { + if (fdir_inset_map[i].mask && + ((fdir_inset_map[i].mask & conf->inset_flag) + != fdir_inset_map[i].flag)) + continue; + + fld[*fld_cnt] = fdir_inset_map[i].fld; + *fld_cnt += 1; + if (*fld_cnt >= ICE_FLOW_FIELD_IDX_MAX) + return -EINVAL; + VIRTCHNL_DEL_PROTO_HDR_FIELD(&hdr, + fdir_inset_map[i].field); + } + } + + return 0; +} + +/** + * ice_vc_fdir_set_flow_fld + * @vf: pointer to the VF structure + * @fltr: virtual channel add cmd buffer + * @conf: FDIR configuration for each filter + * @seg: array of one or more packet segments that describe the flow + * + * Parse the virtual channel add msg buffer's field vector and store them into + * flow's packet segment field + * + * Return: 0 on success, and other on error. + */ +static int +ice_vc_fdir_set_flow_fld(struct ice_vf *vf, + struct virtchnl_fdir_add *fltr, + struct virtchnl_fdir_fltr_conf *conf, + struct ice_flow_seg_info *seg) +{ + struct virtchnl_fdir_rule *rule = &fltr->rule_cfg; + enum ice_flow_field fld[ICE_FLOW_FIELD_IDX_MAX]; + struct device *dev = ice_pf_to_dev(vf->pf); + struct virtchnl_proto_hdrs *proto; + int fld_cnt = 0; + int i; + + proto = &rule->proto_hdrs; + for (i = 0; i < proto->count; i++) { + struct virtchnl_proto_hdr *hdr = &proto->proto_hdr[i]; + int ret; + + ret = ice_vc_fdir_parse_flow_fld(vf, hdr, conf, fld, &fld_cnt); + if (ret) + return ret; + } + + if (fld_cnt == 0) { + dev_dbg(dev, "Empty input set for VF %d\n", vf->vf_id); + return -EINVAL; + } + + for (i = 0; i < fld_cnt; i++) { + ice_flow_set_fld(seg, fld[i], + ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + } + + return 0; +} + +/** + * ice_vc_fdir_set_flow_hdr - config the flow's packet segment header + * @vf: pointer to the VF structure + * @conf: FDIR configuration for each filter + * @seg: array of one or more packet segments that describe the flow + * + * Return: 0 on success, and other on error. 
+ */ +static int +ice_vc_fdir_set_flow_hdr(struct ice_vf *vf, + struct virtchnl_fdir_fltr_conf *conf, + struct ice_flow_seg_info *seg) +{ + enum ice_fltr_ptype flow = conf->input.flow_type; + enum ice_fdir_tunnel_type ttype = conf->ttype; + struct device *dev = ice_pf_to_dev(vf->pf); + + switch (flow) { + case ICE_FLTR_PTYPE_NON_IP_L2: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_ETH_NON_IP); + break; + case ICE_FLTR_PTYPE_NONF_ECPRI_TP0: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_ECPRI_TP0); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_UDP_ECPRI_TP0: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_UDP_ECPRI_TP0 | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_L2TPV3: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_L2TPV3 | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_ESP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_ESP | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_AH: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_AH | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_NAT_T_ESP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_NAT_T_ESP | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_PFCP_NODE: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_PFCP_NODE | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_PFCP_SESSION: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_PFCP_SESSION | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_OTHER: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_TCP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_TCP | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_UDP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_UDP | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_TCP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_ICMP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_OTHER: + if (ttype == ICE_FDIR_TUNNEL_TYPE_GTPU) { + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_GTPU_IP | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + } else { + dev_dbg(dev, "Invalid tunnel type 0x%x for VF %d\n", + flow, vf->vf_id); + return -EINVAL; + } + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_OTHER: + if (ttype == ICE_FDIR_TUNNEL_TYPE_GTPU_EH) { + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_GTPU_EH | + ICE_FLOW_SEG_HDR_GTPU_IP | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + } else { + dev_dbg(dev, "Invalid tunnel type 0x%x for VF %d\n", + flow, vf->vf_id); + return -EINVAL; + } + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW: + if (ttype == ICE_FDIR_TUNNEL_TYPE_GTPU_EH) { + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_GTPU_DWN | + ICE_FLOW_SEG_HDR_GTPU_IP | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + } else { + dev_dbg(dev, "Invalid tunnel type 0x%x for VF %d\n", + flow, vf->vf_id); + return -EINVAL; + } + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP: + if (ttype == ICE_FDIR_TUNNEL_TYPE_GTPU_EH) { + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_GTPU_UP | + ICE_FLOW_SEG_HDR_GTPU_IP | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + } else { + dev_dbg(dev, 
"Invalid tunnel type 0x%x for VF %d\n", + flow, vf->vf_id); + return -EINVAL; + } + break; + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU: + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_IPV6_OTHER: + if (ttype == ICE_FDIR_TUNNEL_TYPE_GTPU) { + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_GTPU_IP | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + } else { + dev_dbg(dev, "Invalid tunnel type 0x%x for VF %d\n", + flow, vf->vf_id); + return -EINVAL; + } + break; + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH: + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_IPV6_OTHER: + if (ttype == ICE_FDIR_TUNNEL_TYPE_GTPU_EH) { + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_GTPU_EH | + ICE_FLOW_SEG_HDR_GTPU_IP | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + } else { + dev_dbg(dev, "Invalid tunnel type 0x%x for VF %d\n", + flow, vf->vf_id); + return -EINVAL; + } + break; + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_DW: + if (ttype == ICE_FDIR_TUNNEL_TYPE_GTPU_EH) { + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_GTPU_DWN | + ICE_FLOW_SEG_HDR_GTPU_IP | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + } else { + dev_dbg(dev, "Invalid tunnel type 0x%x for VF %d\n", + flow, vf->vf_id); + return -EINVAL; + } + break; + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_UP: + if (ttype == ICE_FDIR_TUNNEL_TYPE_GTPU_EH) { + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_GTPU_UP | + ICE_FLOW_SEG_HDR_GTPU_IP | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + } else { + dev_dbg(dev, "Invalid tunnel type 0x%x for VF %d\n", + flow, vf->vf_id); + return -EINVAL; + } + break; + case ICE_FLTR_PTYPE_NONF_IPV4_SCTP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_SCTP | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_L2TPV3: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_L2TPV3 | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_ESP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_ESP | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_AH: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_AH | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_NAT_T_ESP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_NAT_T_ESP | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_PFCP_NODE: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_PFCP_NODE | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_PFCP_SESSION: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_PFCP_SESSION | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_OTHER: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_TCP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_TCP | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_UDP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_UDP | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_SCTP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_SCTP | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + default: + dev_dbg(dev, "Invalid flow type 0x%x for VF %d failed\n", + flow, vf->vf_id); + return -EINVAL; + } + + return 0; +} + +/** + * ice_vc_fdir_rem_prof - remove profile for this filter flow type + * @vf: pointer to the VF structure + * @flow: filter flow type + * @tun: 0 implies non-tunnel type filter, 1 implies tunnel 
type filter + */ +static void +ice_vc_fdir_rem_prof(struct ice_vf *vf, enum ice_fltr_ptype flow, int tun) +{ + struct ice_vf_fdir *fdir = &vf->fdir; + struct ice_fd_hw_prof *vf_prof; + struct ice_pf *pf = vf->pf; + struct ice_vsi *vf_vsi; + struct device *dev; + struct ice_hw *hw; + u64 prof_id; + int i; + + dev = ice_pf_to_dev(pf); + hw = &pf->hw; + if (!fdir->fdir_prof || !fdir->fdir_prof[flow]) + return; + + vf_prof = fdir->fdir_prof[flow]; + + vf_vsi = ice_get_vf_vsi(vf); + if (!vf_vsi) { + dev_dbg(dev, "NULL vf %d vsi pointer\n", vf->vf_id); + return; + } + + if (!fdir->prof_entry_cnt[flow][tun]) + return; + + prof_id = ICE_FLOW_PROF_FD(vf_vsi->vsi_num, + flow, tun ? ICE_FLTR_PTYPE_MAX : 0); + + for (i = 0; i < fdir->prof_entry_cnt[flow][tun]; i++) { + if (vf_prof->entry_h[i][tun]) { + u16 vsi_num = ice_get_hw_vsi_num(hw, vf_prof->vsi_h[i]); + + ice_rem_prof_id_flow(hw, ICE_BLK_FD, vsi_num, prof_id); + ice_flow_rem_entry(hw, ICE_BLK_FD, + vf_prof->entry_h[i][tun]); + vf_prof->entry_h[i][tun] = 0; + } + } + + ice_flow_rem_prof(hw, ICE_BLK_FD, prof_id); + kfree(vf_prof->fdir_seg[tun]); + vf_prof->fdir_seg[tun] = NULL; + + for (i = 0; i < vf_prof->cnt; i++) + vf_prof->vsi_h[i] = 0; + + fdir->prof_entry_cnt[flow][tun] = 0; +} + +/** + * ice_vc_fdir_rem_prof_all - remove profile for this VF + * @vf: pointer to the VF structure + */ +void ice_vc_fdir_rem_prof_all(struct ice_vf *vf) +{ + enum ice_fltr_ptype flow; + + for (flow = ICE_FLTR_PTYPE_NONF_NONE; + flow < ICE_FLTR_PTYPE_MAX; flow++) { + ice_vc_fdir_rem_prof(vf, flow, 0); + ice_vc_fdir_rem_prof(vf, flow, 1); + } +} + +/** + * ice_vc_fdir_write_flow_prof + * @vf: pointer to the VF structure + * @flow: filter flow type + * @seg: array of one or more packet segments that describe the flow + * @tun: 0 implies non-tunnel type filter, 1 implies tunnel type filter + * + * Write the flow's profile config and packet segment into the hardware + * + * Return: 0 on success, and other on error. + */ +static int +ice_vc_fdir_write_flow_prof(struct ice_vf *vf, + enum ice_fltr_ptype flow, + struct ice_flow_seg_info *seg, + int tun) +{ + struct ice_vf_fdir *fdir = &vf->fdir; + struct ice_vsi *vf_vsi, *ctrl_vsi; + struct ice_flow_seg_info *old_seg; + struct ice_flow_prof *prof = NULL; + struct ice_fd_hw_prof *vf_prof; + enum ice_status status; + struct device *dev; + struct ice_pf *pf; + struct ice_hw *hw; + u64 entry1_h = 0; + u64 entry2_h = 0; + u64 prof_id; + int ret; + + pf = vf->pf; + dev = ice_pf_to_dev(pf); + hw = &pf->hw; + vf_vsi = ice_get_vf_vsi(vf); + if (!vf_vsi) + return -EINVAL; + + ctrl_vsi = pf->vsi[vf->ctrl_vsi_idx]; + if (!ctrl_vsi) + return -EINVAL; + + vf_prof = fdir->fdir_prof[flow]; + old_seg = vf_prof->fdir_seg[tun]; + if (old_seg) { + if (!memcmp(old_seg, seg, sizeof(*seg) * ICE_FD_HW_SEG_MAX)) { + dev_dbg(dev, "Duplicated profile for VF %d!\n", + vf->vf_id); + return -EEXIST; + } + + if (fdir->fdir_fltr_cnt[flow][tun]) { + ret = -EINVAL; + dev_dbg(dev, "Input set conflicts for VF %d\n", + vf->vf_id); + goto err_exit; + } + + /* remove previously allocated profile */ + ice_vc_fdir_rem_prof(vf, flow, tun); + } + + prof_id = ICE_FLOW_PROF_FD(vf_vsi->vsi_num, + flow, tun ? 
ICE_FLTR_PTYPE_MAX : 0); + + status = ice_flow_add_prof(hw, ICE_BLK_FD, ICE_FLOW_RX, prof_id, seg, + tun + 1, NULL, 0, &prof); + ret = ice_status_to_errno(status); + if (ret) { + dev_dbg(dev, "Could not add VSI flow 0x%x for VF %d\n", + flow, vf->vf_id); + goto err_exit; + } + + status = ice_flow_add_entry(hw, ICE_BLK_FD, prof_id, vf_vsi->idx, + vf_vsi->idx, ICE_FLOW_PRIO_NORMAL, + seg, NULL, 0, &entry1_h); + ret = ice_status_to_errno(status); + if (ret) { + dev_dbg(dev, "Could not add flow 0x%x VSI entry for VF %d\n", + flow, vf->vf_id); + goto err_prof; + } + + status = ice_flow_add_entry(hw, ICE_BLK_FD, prof_id, vf_vsi->idx, + ctrl_vsi->idx, ICE_FLOW_PRIO_NORMAL, + seg, NULL, 0, &entry2_h); + ret = ice_status_to_errno(status); + if (ret) { + dev_dbg(dev, + "Could not add flow 0x%x Ctrl VSI entry for VF %d\n", + flow, vf->vf_id); + goto err_entry_1; + } + + vf_prof->fdir_seg[tun] = seg; + vf_prof->cnt = 0; + fdir->prof_entry_cnt[flow][tun] = 0; + + vf_prof->entry_h[vf_prof->cnt][tun] = entry1_h; + vf_prof->vsi_h[vf_prof->cnt] = vf_vsi->idx; + vf_prof->cnt++; + fdir->prof_entry_cnt[flow][tun]++; + + vf_prof->entry_h[vf_prof->cnt][tun] = entry2_h; + vf_prof->vsi_h[vf_prof->cnt] = ctrl_vsi->idx; + vf_prof->cnt++; + fdir->prof_entry_cnt[flow][tun]++; + + return 0; + +err_entry_1: + ice_rem_prof_id_flow(hw, ICE_BLK_FD, + ice_get_hw_vsi_num(hw, vf_vsi->idx), prof_id); + ice_flow_rem_entry(hw, ICE_BLK_FD, entry1_h); +err_prof: + ice_flow_rem_prof(hw, ICE_BLK_FD, prof_id); +err_exit: + return ret; +} + +/** + * ice_vc_fdir_has_prof_conflict + * @vf: pointer to the VF info + * @conf: FDIR configuration for each filter + * + * Check if @conf has conflicting profile with existing profiles + * + * Return: true on success, and false on error. + */ +static bool +ice_vc_fdir_has_prof_conflict(struct ice_vf *vf, + struct virtchnl_fdir_fltr_conf *conf) +{ + struct ice_fdir_fltr *desc; + + list_for_each_entry(desc, &vf->fdir.fdir_rule_list, fltr_node) { + struct virtchnl_fdir_fltr_conf *existing_conf = + to_fltr_conf_from_desc(desc); + struct ice_fdir_fltr *a = &existing_conf->input; + struct ice_fdir_fltr *b = &conf->input; + + enum ice_fltr_ptype flow_type_a = a->flow_type; + enum ice_fltr_ptype flow_type_b = b->flow_type; + + /* No need to compare two rules with different tunnel type */ + if (existing_conf->ttype != conf->ttype) + continue; + + /* No need to compare two rules with same protocol */ + if (flow_type_a == flow_type_b) + continue; + + switch (flow_type_a) { + case ICE_FLTR_PTYPE_NONF_IPV4_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_TCP: + case ICE_FLTR_PTYPE_NONF_IPV4_SCTP: + if (flow_type_b == ICE_FLTR_PTYPE_NONF_IPV4_OTHER) + return true; + break; + case ICE_FLTR_PTYPE_NONF_IPV4_OTHER: + if (flow_type_b == ICE_FLTR_PTYPE_NONF_IPV4_UDP || + flow_type_b == ICE_FLTR_PTYPE_NONF_IPV4_TCP || + flow_type_b == ICE_FLTR_PTYPE_NONF_IPV4_SCTP) + return true; + break; + case ICE_FLTR_PTYPE_NONF_IPV6_UDP: + case ICE_FLTR_PTYPE_NONF_IPV6_TCP: + case ICE_FLTR_PTYPE_NONF_IPV6_SCTP: + if (flow_type_b == ICE_FLTR_PTYPE_NONF_IPV6_OTHER) + return true; + break; + case ICE_FLTR_PTYPE_NONF_IPV6_OTHER: + if (flow_type_b == ICE_FLTR_PTYPE_NONF_IPV6_UDP || + flow_type_b == ICE_FLTR_PTYPE_NONF_IPV6_TCP || + flow_type_b == ICE_FLTR_PTYPE_NONF_IPV6_SCTP) + return true; + break; + default: + break; + } + } + + return false; +} + +/** + * ice_vc_fdir_config_input_set + * @vf: pointer to the VF structure + * @fltr: virtual channel add cmd buffer + * @conf: FDIR configuration for each filter + * @tun: 0 implies non-tunnel type 
filter, 1 implies tunnel type filter + * + * Config the input set type and value for virtual channel add msg buffer + * + * Return: 0 on success, and other on error. + */ +static int +ice_vc_fdir_config_input_set(struct ice_vf *vf, + struct virtchnl_fdir_add *fltr, + struct virtchnl_fdir_fltr_conf *conf, + int tun) +{ + struct ice_fdir_fltr *input = &conf->input; + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_flow_seg_info *seg; + enum ice_fltr_ptype flow; + int ret; + + ret = ice_vc_fdir_has_prof_conflict(vf, conf); + if (ret) { + dev_dbg(dev, "Found flow prof conflict for VF %d\n", vf->vf_id); + return ret; + } + + flow = input->flow_type; + ret = ice_vc_fdir_alloc_prof(vf, flow); + if (ret) { + dev_dbg(dev, "Alloc flow prof for VF %d failed\n", vf->vf_id); + return ret; + } + + seg = kcalloc(ICE_FD_HW_SEG_MAX, sizeof(*seg), GFP_KERNEL); + if (!seg) + return -ENOMEM; + + ret = ice_vc_fdir_set_flow_fld(vf, fltr, conf, &seg[tun]); + if (ret) { + dev_dbg(dev, "Set flow field for VF %d failed\n", vf->vf_id); + goto err_exit; + } + + ret = ice_vc_fdir_set_flow_hdr(vf, conf, &seg[tun]); + if (ret) { + dev_dbg(dev, "Set flow hdr for VF %d failed\n", vf->vf_id); + goto err_exit; + } + + ret = ice_vc_fdir_write_flow_prof(vf, flow, seg, tun); + if (ret == -EEXIST) { + kfree(seg); + } else if (ret) { + dev_dbg(dev, "Write flow profile for VF %d failed\n", + vf->vf_id); + goto err_exit; + } + + return 0; + +err_exit: + kfree(seg); + return ret; +} + +/** + * ice_vc_fdir_parse_pattern + * @vf: pointer to the VF info + * @fltr: virtual channel add cmd buffer + * @conf: FDIR configuration for each filter + * + * Parse the virtual channel filter's pattern and store them into @conf + * + * Return: 0 on success, and other on error. + */ +static int +ice_vc_fdir_parse_pattern(struct ice_vf *vf, + struct virtchnl_fdir_add *fltr, + struct virtchnl_fdir_fltr_conf *conf) +{ + struct virtchnl_proto_hdrs *proto = &fltr->rule_cfg.proto_hdrs; + enum virtchnl_proto_hdr_type l3 = VIRTCHNL_PROTO_HDR_NONE; + enum virtchnl_proto_hdr_type l4 = VIRTCHNL_PROTO_HDR_NONE; + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_fdir_fltr *input = &conf->input; + int i; + + if (proto->count > VIRTCHNL_MAX_NUM_PROTO_HDRS) { + dev_dbg(dev, "Invalid protocol count:0x%x for VF %d\n", + proto->count, vf->vf_id); + return -EINVAL; + } + + for (i = 0; i < proto->count; i++) { + struct virtchnl_proto_hdr *hdr = &proto->proto_hdr[i]; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah; + struct sctphdr *sctph; + struct ipv6hdr *ip6h; + struct udphdr *udph; + struct tcphdr *tcph; + struct ethhdr *eth; + struct iphdr *iph; + u8 msg_type; + u8 s_field; + u8 *rawh; + + switch (hdr->type) { + case VIRTCHNL_PROTO_HDR_ETH: + eth = (struct ethhdr *)hdr->buffer; + input->flow_type = ICE_FLTR_PTYPE_NON_IP_L2; + + if (hdr->field_selector) + input->ext_data.ether_type = eth->h_proto; + break; + case VIRTCHNL_PROTO_HDR_IPV4: + iph = (struct iphdr *)hdr->buffer; + l3 = VIRTCHNL_PROTO_HDR_IPV4; + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV4_OTHER; + + if (hdr->field_selector) { + input->ip.v4.src_ip = iph->saddr; + input->ip.v4.dst_ip = iph->daddr; + input->ip.v4.ttl = iph->ttl; + input->ip.v4.tos = iph->tos; + input->ip.v4.proto = iph->protocol; + } + break; + case VIRTCHNL_PROTO_HDR_IPV6: + ip6h = (struct ipv6hdr *)hdr->buffer; + l3 = VIRTCHNL_PROTO_HDR_IPV6; + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV6_OTHER; + + if (hdr->field_selector) { + memcpy(input->ip.v6.src_ip, + ip6h->saddr.in6_u.u6_addr8, + sizeof(ip6h->saddr)); + 
memcpy(input->ip.v6.dst_ip, + ip6h->daddr.in6_u.u6_addr8, + sizeof(ip6h->daddr)); + input->ip.v6.hlim = ip6h->hop_limit; + input->ip.v6.tc = ((u8)(ip6h->priority) << 4) | + (ip6h->flow_lbl[0] >> 4); + input->ip.v6.proto = ip6h->nexthdr; + } + break; + case VIRTCHNL_PROTO_HDR_TCP: + tcph = (struct tcphdr *)hdr->buffer; + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV4_TCP; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV6_TCP; + + if (hdr->field_selector) { + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) { + input->ip.v4.src_port = tcph->source; + input->ip.v4.dst_port = tcph->dest; + } else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) { + input->ip.v6.src_port = tcph->source; + input->ip.v6.dst_port = tcph->dest; + } + } + break; + case VIRTCHNL_PROTO_HDR_UDP: + udph = (struct udphdr *)hdr->buffer; + l4 = VIRTCHNL_PROTO_HDR_UDP; + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV4_UDP; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV6_UDP; + + if (hdr->field_selector) { + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) { + input->ip.v4.src_port = udph->source; + input->ip.v4.dst_port = udph->dest; + } else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) { + input->ip.v6.src_port = udph->source; + input->ip.v6.dst_port = udph->dest; + } + } + break; + case VIRTCHNL_PROTO_HDR_SCTP: + sctph = (struct sctphdr *)hdr->buffer; + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_SCTP; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV6_SCTP; + + if (hdr->field_selector) { + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) { + input->ip.v4.src_port = sctph->source; + input->ip.v4.dst_port = sctph->dest; + } else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) { + input->ip.v6.src_port = sctph->source; + input->ip.v6.dst_port = sctph->dest; + } + } + break; + case VIRTCHNL_PROTO_HDR_L2TPV3: + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_L2TPV3; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV6_L2TPV3; + + if (hdr->field_selector) + input->l2tpv3_data.session_id = + *((__force __be32 *)hdr->buffer); + break; + case VIRTCHNL_PROTO_HDR_ESP: + esph = (struct ip_esp_hdr *)hdr->buffer; + if (l3 == VIRTCHNL_PROTO_HDR_IPV4 && + l4 == VIRTCHNL_PROTO_HDR_UDP) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_NAT_T_ESP; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6 && + l4 == VIRTCHNL_PROTO_HDR_UDP) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV6_NAT_T_ESP; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV4 && + l4 == VIRTCHNL_PROTO_HDR_NONE) + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV4_ESP; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6 && + l4 == VIRTCHNL_PROTO_HDR_NONE) + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV6_ESP; + + if (l4 == VIRTCHNL_PROTO_HDR_UDP) + conf->inset_flag |= FDIR_INSET_FLAG_ESP_UDP; + else + conf->inset_flag |= FDIR_INSET_FLAG_ESP_IPSEC; + + if (hdr->field_selector) { + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) + input->ip.v4.sec_parm_idx = esph->spi; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) + input->ip.v6.sec_parm_idx = esph->spi; + } + break; + case VIRTCHNL_PROTO_HDR_AH: + ah = (struct ip_auth_hdr *)hdr->buffer; + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV4_AH; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV6_AH; + + if (hdr->field_selector) { + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) + input->ip.v4.sec_parm_idx = ah->spi; + else 
if (l3 == VIRTCHNL_PROTO_HDR_IPV6) + input->ip.v6.sec_parm_idx = ah->spi; + } + break; + case VIRTCHNL_PROTO_HDR_PFCP: + rawh = (u8 *)hdr->buffer; + s_field = (rawh[0] >> PFCP_S_OFFSET) & PFCP_S_MASK; + if (l3 == VIRTCHNL_PROTO_HDR_IPV4 && s_field == 0) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_PFCP_NODE; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV4 && s_field == 1) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_PFCP_SESSION; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6 && s_field == 0) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV6_PFCP_NODE; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6 && s_field == 1) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV6_PFCP_SESSION; + + if (hdr->field_selector) { + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) + input->ip.v4.dst_port = + cpu_to_be16(PFCP_PORT_NR); + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) + input->ip.v6.dst_port = + cpu_to_be16(PFCP_PORT_NR); + } + break; + case VIRTCHNL_PROTO_HDR_GTPU_IP: + rawh = (u8 *)hdr->buffer; + if (input->flow_type == ICE_FLTR_PTYPE_NONF_IPV4_UDP) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_GTPU; + else + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV6_GTPU; + if (hdr->field_selector) + input->gtpu_data.teid = + *(__force __be32 *)(&rawh[GTPU_TEID_OFFSET]); + conf->ttype = ICE_FDIR_TUNNEL_TYPE_GTPU; + break; + case VIRTCHNL_PROTO_HDR_GTPU_EH: + rawh = (u8 *)hdr->buffer; + if (input->flow_type == ICE_FLTR_PTYPE_NONF_IPV4_GTPU) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH; + else + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH; + + if (hdr->field_selector) + input->gtpu_data.qfi = + rawh[GTPU_EH_QFI_OFFSET] & + GTPU_EH_QFI_MASK; + conf->ttype = ICE_FDIR_TUNNEL_TYPE_GTPU_EH; + break; + case VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_DWN: + rawh = (u8 *)hdr->buffer; + if (input->flow_type == ICE_FLTR_PTYPE_NONF_IPV4_GTPU) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW; + else + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_DW; + + if (hdr->field_selector) + input->gtpu_data.qfi = + rawh[GTPU_EH_QFI_OFFSET] & + GTPU_EH_QFI_MASK; + conf->ttype = ICE_FDIR_TUNNEL_TYPE_GTPU_EH; + break; + case VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_UP: + rawh = (u8 *)hdr->buffer; + if (input->flow_type == ICE_FLTR_PTYPE_NONF_IPV4_GTPU) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP; + else + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_UP; + + if (hdr->field_selector) + input->gtpu_data.qfi = + rawh[GTPU_EH_QFI_OFFSET] & + GTPU_EH_QFI_MASK; + conf->ttype = ICE_FDIR_TUNNEL_TYPE_GTPU_EH; + break; + case VIRTCHNL_PROTO_HDR_ECPRI: + rawh = (u8 *)hdr->buffer; + msg_type = rawh[1]; + if (l3 == VIRTCHNL_PROTO_HDR_NONE && + l4 == VIRTCHNL_PROTO_HDR_NONE && + msg_type == 0) { + input->flow_type = + ICE_FLTR_PTYPE_NONF_ECPRI_TP0; + conf->inset_flag |= FDIR_INSET_FLAG_ECPRI_MAC; + } else if ((l3 == VIRTCHNL_PROTO_HDR_IPV4) && + (l4 == VIRTCHNL_PROTO_HDR_UDP) && + (msg_type == 0)) { + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_UDP_ECPRI_TP0; + conf->inset_flag |= FDIR_INSET_FLAG_ECPRI_UDP; + conf->ttype = ICE_FDIR_TUNNEL_TYPE_ECPRI; + } else { + return -EINVAL; + } + + if (hdr->field_selector) + input->ecpri_data.pc_id = + *(__force __be16 *)(&rawh[4]); + break; + default: + dev_dbg(dev, "Invalid header type 0x:%x for VF %d\n", + hdr->type, vf->vf_id); + return -EINVAL; + } + } + + return 0; +} + +/** + * ice_vc_fdir_parse_action + * @vf: pointer to the VF info + * @fltr: virtual channel add cmd buffer + * @conf: FDIR configuration for each filter + * + * Parse the virtual channel filter's action and store them into @conf + 
* + * Return: 0 on success, and other on error. + */ +static int +ice_vc_fdir_parse_action(struct ice_vf *vf, + struct virtchnl_fdir_add *fltr, + struct virtchnl_fdir_fltr_conf *conf) +{ + struct virtchnl_filter_action_set *as = &fltr->rule_cfg.action_set; + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_fdir_fltr *input = &conf->input; + u32 dest_num = 0; + u32 mark_num = 0; + int i; + + if (as->count > VIRTCHNL_MAX_NUM_ACTIONS) { + dev_dbg(dev, "Invalid action numbers:0x%x for VF %d\n", + as->count, vf->vf_id); + return -EINVAL; + } + + for (i = 0; i < as->count; i++) { + struct virtchnl_filter_action *action = &as->actions[i]; + + switch (action->type) { + case VIRTCHNL_ACTION_PASSTHRU: + dest_num++; + input->dest_ctl = + ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_OTHER; + break; + case VIRTCHNL_ACTION_DROP: + dest_num++; + input->dest_ctl = + ICE_FLTR_PRGM_DESC_DEST_DROP_PKT; + break; + case VIRTCHNL_ACTION_QUEUE: + dest_num++; + input->dest_ctl = + ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_QINDEX; + input->q_index = action->act_conf.queue.index; + break; + case VIRTCHNL_ACTION_Q_REGION: + dest_num++; + input->dest_ctl = + ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_QGROUP; + input->q_index = action->act_conf.queue.index; + input->q_region = action->act_conf.queue.region; + break; + case VIRTCHNL_ACTION_MARK: + mark_num++; + input->fltr_id = action->act_conf.mark_id; + input->fdid_prio = ICE_FXD_FLTR_QW1_FDID_PRI_THREE; + break; + default: + dev_dbg(dev, "Invalid action type:0x%x for VF %d\n", + action->type, vf->vf_id); + return -EINVAL; + } + } + + if (dest_num == 0 || dest_num >= 2) { + dev_dbg(dev, "Invalid destination action for VF %d\n", + vf->vf_id); + return -EINVAL; + } + + if (mark_num >= 2) { + dev_dbg(dev, "Too many mark actions for VF %d\n", vf->vf_id); + return -EINVAL; + } + + return 0; +} + +/** + * ice_vc_validate_fdir_fltr - validate the virtual channel filter + * @vf: pointer to the VF info + * @fltr: virtual channel add cmd buffer + * @conf: FDIR configuration for each filter + * + * Return: 0 on success, and other on error. + */ +static int +ice_vc_validate_fdir_fltr(struct ice_vf *vf, + struct virtchnl_fdir_add *fltr, + struct virtchnl_fdir_fltr_conf *conf) +{ + struct virtchnl_proto_hdrs *proto = &fltr->rule_cfg.proto_hdrs; + int ret; + + if (!ice_vc_validate_pattern(vf, proto)) + return -EINVAL; + + ret = ice_vc_fdir_parse_pattern(vf, fltr, conf); + if (ret) + return ret; + + ret = ice_vc_fdir_parse_action(vf, fltr, conf); + if (ret) + return ret; + + return 0; +} + +/** + * ice_vc_fdir_comp_rules - compare if two filter rules have the same value + * @conf_a: FDIR configuration for filter a + * @conf_b: FDIR configuration for filter b + * + * Return: 0 on success, and other on error. 
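For reference, the two parsers above consume a request whose layout comes from the virtchnl definitions added elsewhere in this patch. Below is a minimal sketch, not part of the diff, of how a requester could fill such a message for an IPv4/UDP rule steered to one Rx queue; the helper name is invented, and FIELD_SELECTOR() (the PF-side macro defined later in ice_virtchnl_pf.c) is reused here only for brevity.

/* Illustrative sketch only (not in the patch): build a virtchnl_fdir_add
 * request that ice_vc_fdir_parse_pattern()/ice_vc_fdir_parse_action()
 * above would accept.  Assumes the same headers the parsers use
 * (linux/ip.h, linux/udp.h) plus the virtchnl structures from this patch.
 */
static void example_fill_ipv4_udp_rule(struct virtchnl_fdir_add *fltr,
					u16 vsi_id, __be32 dst_ip,
					__be16 dst_port, u32 rx_queue)
{
	struct virtchnl_proto_hdrs *proto = &fltr->rule_cfg.proto_hdrs;
	struct virtchnl_filter_action_set *as = &fltr->rule_cfg.action_set;
	struct udphdr *udph;
	struct iphdr *iph;

	memset(fltr, 0, sizeof(*fltr));
	fltr->vsi_id = vsi_id;

	/* pattern: ETH / IPV4(dst addr) / UDP(dst port) */
	proto->count = 3;
	proto->proto_hdr[0].type = VIRTCHNL_PROTO_HDR_ETH;

	proto->proto_hdr[1].type = VIRTCHNL_PROTO_HDR_IPV4;
	proto->proto_hdr[1].field_selector =
		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST);
	iph = (struct iphdr *)proto->proto_hdr[1].buffer;
	iph->daddr = dst_ip;
	iph->protocol = IPPROTO_UDP;

	proto->proto_hdr[2].type = VIRTCHNL_PROTO_HDR_UDP;
	proto->proto_hdr[2].field_selector =
		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT);
	udph = (struct udphdr *)proto->proto_hdr[2].buffer;
	udph->dest = dst_port;

	/* action: direct matching packets to a single Rx queue */
	as->count = 1;
	as->actions[0].type = VIRTCHNL_ACTION_QUEUE;
	as->actions[0].act_conf.queue.index = rx_queue;
}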
+ */ +static bool +ice_vc_fdir_comp_rules(struct virtchnl_fdir_fltr_conf *conf_a, + struct virtchnl_fdir_fltr_conf *conf_b) +{ + struct ice_fdir_fltr *a = &conf_a->input; + struct ice_fdir_fltr *b = &conf_b->input; + + if (conf_a->ttype != conf_b->ttype) + return false; + if (a->flow_type != b->flow_type) + return false; + if (memcmp(&a->ip, &b->ip, sizeof(a->ip))) + return false; + if (memcmp(&a->mask, &b->mask, sizeof(a->mask))) + return false; + if (memcmp(&a->gtpu_data, &b->gtpu_data, sizeof(a->gtpu_data))) + return false; + if (memcmp(&a->gtpu_mask, &b->gtpu_mask, sizeof(a->gtpu_mask))) + return false; + if (memcmp(&a->l2tpv3_data, &b->l2tpv3_data, sizeof(a->l2tpv3_data))) + return false; + if (memcmp(&a->l2tpv3_mask, &b->l2tpv3_mask, sizeof(a->l2tpv3_mask))) + return false; + if (memcmp(&a->ext_data, &b->ext_data, sizeof(a->ext_data))) + return false; + if (memcmp(&a->ext_mask, &b->ext_mask, sizeof(a->ext_mask))) + return false; + if (memcmp(&a->ecpri_data, &b->ecpri_data, sizeof(a->ecpri_data))) + return false; + if (memcmp(&a->ecpri_mask, &b->ecpri_mask, sizeof(a->ecpri_mask))) + return false; + + return true; +} + +/** + * ice_vc_fdir_is_dup_fltr + * @vf: pointer to the VF info + * @conf: FDIR configuration for each filter + * + * Check if there is duplicated rule with same @conf value + * + * Return: 0 true success, and false on error. + */ +static bool +ice_vc_fdir_is_dup_fltr(struct ice_vf *vf, + struct virtchnl_fdir_fltr_conf *conf) +{ + struct ice_fdir_fltr *desc; + + list_for_each_entry(desc, &vf->fdir.fdir_rule_list, fltr_node) { + struct virtchnl_fdir_fltr_conf *node = + to_fltr_conf_from_desc(desc); + + if (ice_vc_fdir_comp_rules(node, conf)) + return true; + } + + return false; +} + +/** + * ice_vc_fdir_insert_entry + * @vf: pointer to the VF info + * @conf: FDIR configuration for each filter + * @id: pointer to ID value allocated by driver + * + * Insert FDIR conf entry into list and allocate ID for this filter + * + * Return: 0 true success, and other on error. + */ +static int +ice_vc_fdir_insert_entry(struct ice_vf *vf, + struct virtchnl_fdir_fltr_conf *conf, + u32 *id) +{ + struct ice_fdir_fltr *input = &conf->input; + int i; + + /* alloc ID corresponding with conf */ + i = idr_alloc(&vf->fdir.fdir_rule_idr, conf, 0, + ICE_FDIR_MAX_FLTRS, GFP_KERNEL); + if (i < 0) + return -EINVAL; + *id = i; + + list_add(&input->fltr_node, &vf->fdir.fdir_rule_list); + return 0; +} + +/** + * ice_vc_fdir_remove_entry - remove FDIR conf entry by ID value + * @vf: pointer to the VF info + * @conf: FDIR configuration for each filter + * @id: filter rule's ID + */ +static void +ice_vc_fdir_remove_entry(struct ice_vf *vf, + struct virtchnl_fdir_fltr_conf *conf, + u32 id) +{ + struct ice_fdir_fltr *input = &conf->input; + + idr_remove(&vf->fdir.fdir_rule_idr, id); + list_del(&input->fltr_node); +} + +/** + * ice_vc_fdir_lookup_entry - lookup FDIR conf entry by ID value + * @vf: pointer to the VF info + * @id: filter rule's ID + * + * Return: NULL on error, and other on success. 
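The insert/remove/lookup helpers above are a straightforward use of the kernel idr allocator to hand out flow IDs bounded by ICE_FDIR_MAX_FLTRS. A self-contained sketch of the same lifecycle with hypothetical demo_* names, for readers unfamiliar with the API:

#include <linux/gfp.h>
#include <linux/idr.h>
#include <linux/types.h>

struct demo_rule {
	int data;
};

static DEFINE_IDR(demo_idr);

/* allocate an ID in [0, 128), mirroring ice_vc_fdir_insert_entry() */
static int demo_insert(struct demo_rule *rule, u32 *id)
{
	int i = idr_alloc(&demo_idr, rule, 0, 128, GFP_KERNEL);

	if (i < 0)
		return i;
	*id = i;
	return 0;
}

/* idr_find() returns the pointer stored under @id, or NULL */
static struct demo_rule *demo_lookup(u32 id)
{
	return idr_find(&demo_idr, id);
}

/* drop the ID-to-pointer mapping; the object itself is freed by the caller */
static void demo_remove(u32 id)
{
	idr_remove(&demo_idr, id);
}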
+ */ +static struct virtchnl_fdir_fltr_conf * +ice_vc_fdir_lookup_entry(struct ice_vf *vf, u32 id) +{ + return idr_find(&vf->fdir.fdir_rule_idr, id); +} + +/** + * ice_vc_fdir_flush_entry - remove all FDIR conf entry + * @vf: pointer to the VF info + */ +static void ice_vc_fdir_flush_entry(struct ice_vf *vf) +{ + struct ice_fdir_fltr *desc, *temp; + + list_for_each_entry_safe(desc, temp, + &vf->fdir.fdir_rule_list, fltr_node) { + struct virtchnl_fdir_fltr_conf *conf = + to_fltr_conf_from_desc(desc); + + list_del(&desc->fltr_node); + kfree(conf); + } +} + +/** + * ice_vc_fdir_write_fltr - write filter rule into hardware + * @vf: pointer to the VF info + * @conf: FDIR configuration for each filter + * @add: true implies add rule, false implies del rules + * @is_tun: false implies non-tunnel type filter, true implies tunnel filter + * + * Return: 0 on success, and other on error. + */ +static int ice_vc_fdir_write_fltr(struct ice_vf *vf, + struct virtchnl_fdir_fltr_conf *conf, + bool add, + bool is_tun) +{ + struct ice_fdir_fltr *input = &conf->input; + struct ice_vsi *vsi, *ctrl_vsi; + struct ice_fltr_desc desc; + enum ice_status status; + struct device *dev; + struct ice_pf *pf; + struct ice_hw *hw; + int ret; + u8 *pkt; + + pf = vf->pf; + dev = ice_pf_to_dev(pf); + hw = &pf->hw; + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + dev_dbg(dev, "Invalid vsi for VF %d\n", vf->vf_id); + return -EINVAL; + } + + input->dest_vsi = vsi->idx; + input->comp_report = ICE_FXD_FLTR_QW0_COMP_REPORT_SW; + + ctrl_vsi = pf->vsi[vf->ctrl_vsi_idx]; + if (!ctrl_vsi) { + dev_dbg(dev, "Invalid ctrl_vsi for VF %d\n", vf->vf_id); + return -EINVAL; + } + + pkt = devm_kzalloc(dev, ICE_FDIR_MAX_RAW_PKT_SIZE, GFP_KERNEL); + if (!pkt) + return -ENOMEM; + + ice_fdir_get_prgm_desc(hw, input, &desc, add); + status = ice_fdir_get_gen_prgm_pkt(hw, input, pkt, false, is_tun); + ret = ice_status_to_errno(status); + if (ret) { + dev_dbg(dev, "Gen training pkt for VF %d ptype %d failed\n", + vf->vf_id, input->flow_type); + goto err_free_pkt; + } + + ret = ice_prgm_fdir_fltr(ctrl_vsi, &desc, pkt); + if (ret) + goto err_free_pkt; + + return 0; + +err_free_pkt: + devm_kfree(dev, pkt); + return ret; +} + +/** + * ice_vf_fdir_timer - FDIR program waiting timer interrupt handler + * @t: pointer to timer_list + */ +static void ice_vf_fdir_timer(struct timer_list *t) +{ + struct ice_vf_fdir_ctx *ctx_irq = from_timer(ctx_irq, t, rx_tmr); + struct ice_vf_fdir_ctx *ctx_done; + struct ice_vf_fdir *fdir; + unsigned long flags; + struct ice_vf *vf; + struct ice_pf *pf; + + fdir = container_of(ctx_irq, struct ice_vf_fdir, ctx_irq); + vf = container_of(fdir, struct ice_vf, fdir); + ctx_done = &fdir->ctx_done; + pf = vf->pf; + spin_lock_irqsave(&fdir->ctx_lock, flags); + if (!(ctx_irq->flags & ICE_VF_FDIR_CTX_VALID)) { + spin_unlock_irqrestore(&fdir->ctx_lock, flags); + WARN_ON_ONCE(1); + return; + } + + ctx_irq->flags &= ~ICE_VF_FDIR_CTX_VALID; + + ctx_done->flags |= ICE_VF_FDIR_CTX_VALID; + ctx_done->conf = ctx_irq->conf; + ctx_done->stat = ICE_FDIR_CTX_TIMEOUT; + ctx_done->v_opcode = ctx_irq->v_opcode; + spin_unlock_irqrestore(&fdir->ctx_lock, flags); + + set_bit(ICE_FD_VF_FLUSH_CTX, pf->state); + ice_service_task_schedule(pf); +} + +/** + * ice_vc_fdir_irq_handler - ctrl_vsi Rx queue interrupt handler + * @ctrl_vsi: pointer to a VF's CTRL VSI + * @rx_desc: pointer to FDIR Rx queue descriptor + */ +void +ice_vc_fdir_irq_handler(struct ice_vsi *ctrl_vsi, + union ice_32b_rx_flex_desc *rx_desc) +{ + struct ice_pf *pf = ctrl_vsi->back; + struct 
ice_vf_fdir_ctx *ctx_done; + struct ice_vf_fdir_ctx *ctx_irq; + struct ice_vf_fdir *fdir; + unsigned long flags; + struct device *dev; + struct ice_vf *vf; + int ret; + + vf = &pf->vf[ctrl_vsi->vf_id]; + + fdir = &vf->fdir; + ctx_done = &fdir->ctx_done; + ctx_irq = &fdir->ctx_irq; + dev = ice_pf_to_dev(pf); + spin_lock_irqsave(&fdir->ctx_lock, flags); + if (!(ctx_irq->flags & ICE_VF_FDIR_CTX_VALID)) { + spin_unlock_irqrestore(&fdir->ctx_lock, flags); + WARN_ON_ONCE(1); + return; + } + + ctx_irq->flags &= ~ICE_VF_FDIR_CTX_VALID; + + ctx_done->flags |= ICE_VF_FDIR_CTX_VALID; + ctx_done->conf = ctx_irq->conf; + ctx_done->stat = ICE_FDIR_CTX_IRQ; + ctx_done->v_opcode = ctx_irq->v_opcode; + memcpy(&ctx_done->rx_desc, rx_desc, sizeof(*rx_desc)); + spin_unlock_irqrestore(&fdir->ctx_lock, flags); + + ret = del_timer(&ctx_irq->rx_tmr); + if (!ret) + dev_err(dev, "VF %d: Unexpected inactive timer!\n", vf->vf_id); + + set_bit(ICE_FD_VF_FLUSH_CTX, pf->state); + ice_service_task_schedule(pf); +} + +/** + * ice_vf_fdir_dump_info - dump FDIR information for diagnosis + * @vf: pointer to the VF info + */ +static void ice_vf_fdir_dump_info(struct ice_vf *vf) +{ + struct ice_vsi *vf_vsi; + u32 fd_size, fd_cnt; + struct device *dev; + struct ice_pf *pf; + struct ice_hw *hw; + u16 vsi_num; + + pf = vf->pf; + hw = &pf->hw; + dev = ice_pf_to_dev(pf); + vf_vsi = ice_get_vf_vsi(vf); + vsi_num = ice_get_hw_vsi_num(hw, vf_vsi->idx); + + fd_size = rd32(hw, VSIQF_FD_SIZE(vsi_num)); + fd_cnt = rd32(hw, VSIQF_FD_CNT(vsi_num)); + dev_dbg(dev, "VF %d: space allocated: guar:0x%x, be:0x%x, space consumed: guar:0x%x, be:0x%x", + vf->vf_id, + (fd_size & VSIQF_FD_CNT_FD_GCNT_M) >> VSIQF_FD_CNT_FD_GCNT_S, + (fd_size & VSIQF_FD_CNT_FD_BCNT_M) >> VSIQF_FD_CNT_FD_BCNT_S, + (fd_cnt & VSIQF_FD_CNT_FD_GCNT_M) >> VSIQF_FD_CNT_FD_GCNT_S, + (fd_cnt & VSIQF_FD_CNT_FD_BCNT_M) >> VSIQF_FD_CNT_FD_BCNT_S); +} + +/** + * ice_vf_verify_rx_desc - verify received FDIR programming status descriptor + * @vf: pointer to the VF info + * @ctx: FDIR context info for post processing + * @status: virtchnl FDIR program status + * + * Return: 0 on success, and other on error. 
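Both completion paths above, the ctrl VSI Rx interrupt handler and the programming timer, perform the same handoff: the pending request moves from ctx_irq to ctx_done under ctx_lock, the completion reason is recorded, and the service task is kicked to send the virtchnl reply. That logic is open-coded in each function; the sketch below is only a condensed restatement of the shared step, not a helper that exists in the patch.

/* Illustrative sketch only: the common ctx_irq -> ctx_done handoff that
 * ice_vc_fdir_irq_handler() and ice_vf_fdir_timer() both open-code
 * (the IRQ path additionally copies the writeback descriptor).
 */
static void example_fdir_complete(struct ice_vf *vf, enum ice_fdir_ctx_stat how)
{
	struct ice_vf_fdir *fdir = &vf->fdir;
	unsigned long flags;

	spin_lock_irqsave(&fdir->ctx_lock, flags);
	if (!(fdir->ctx_irq.flags & ICE_VF_FDIR_CTX_VALID)) {
		/* no request in flight; both real paths WARN here */
		spin_unlock_irqrestore(&fdir->ctx_lock, flags);
		return;
	}
	fdir->ctx_irq.flags &= ~ICE_VF_FDIR_CTX_VALID;

	fdir->ctx_done.conf = fdir->ctx_irq.conf;
	fdir->ctx_done.v_opcode = fdir->ctx_irq.v_opcode;
	fdir->ctx_done.stat = how;	/* ICE_FDIR_CTX_IRQ or ICE_FDIR_CTX_TIMEOUT */
	fdir->ctx_done.flags |= ICE_VF_FDIR_CTX_VALID;
	spin_unlock_irqrestore(&fdir->ctx_lock, flags);

	/* the virtchnl reply itself is deferred to ice_flush_fdir_ctx()
	 * in the service task
	 */
	set_bit(ICE_FD_VF_FLUSH_CTX, vf->pf->state);
	ice_service_task_schedule(vf->pf);
}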
+ */ +static int +ice_vf_verify_rx_desc(struct ice_vf *vf, + struct ice_vf_fdir_ctx *ctx, + enum virtchnl_fdir_prgm_status *status) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + u32 stat_err, error, prog_id; + int ret; + + stat_err = le16_to_cpu(ctx->rx_desc.wb.status_error0); + if (((stat_err & ICE_FXD_FLTR_WB_QW1_DD_M) >> + ICE_FXD_FLTR_WB_QW1_DD_S) != ICE_FXD_FLTR_WB_QW1_DD_YES) { + *status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE; + dev_err(dev, "VF %d: Desc Done not set\n", vf->vf_id); + ret = -EINVAL; + goto err_exit; + } + + prog_id = (stat_err & ICE_FXD_FLTR_WB_QW1_PROG_ID_M) >> + ICE_FXD_FLTR_WB_QW1_PROG_ID_S; + if (prog_id == ICE_FXD_FLTR_WB_QW1_PROG_ADD && + ctx->v_opcode != VIRTCHNL_OP_ADD_FDIR_FILTER) { + dev_err(dev, "VF %d: Desc show add, but ctx not", + vf->vf_id); + *status = VIRTCHNL_FDIR_FAILURE_RULE_INVALID; + ret = -EINVAL; + goto err_exit; + } + + if (prog_id == ICE_FXD_FLTR_WB_QW1_PROG_DEL && + ctx->v_opcode != VIRTCHNL_OP_DEL_FDIR_FILTER) { + dev_err(dev, "VF %d: Desc show del, but ctx not", + vf->vf_id); + *status = VIRTCHNL_FDIR_FAILURE_RULE_INVALID; + ret = -EINVAL; + goto err_exit; + } + + error = (stat_err & ICE_FXD_FLTR_WB_QW1_FAIL_M) >> + ICE_FXD_FLTR_WB_QW1_FAIL_S; + if (error == ICE_FXD_FLTR_WB_QW1_FAIL_YES) { + if (prog_id == ICE_FXD_FLTR_WB_QW1_PROG_ADD) { + dev_err(dev, "VF %d, Failed to add FDIR rule due to no space in the table", + vf->vf_id); + *status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE; + } else { + dev_err(dev, "VF %d, Failed to remove FDIR rule, attempt to remove non-existent entry", + vf->vf_id); + *status = VIRTCHNL_FDIR_FAILURE_RULE_NONEXIST; + } + ret = -EINVAL; + goto err_exit; + } + + error = (stat_err & ICE_FXD_FLTR_WB_QW1_FAIL_PROF_M) >> + ICE_FXD_FLTR_WB_QW1_FAIL_PROF_S; + if (error == ICE_FXD_FLTR_WB_QW1_FAIL_PROF_YES) { + dev_err(dev, "VF %d: Profile matching error", vf->vf_id); + *status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE; + ret = -EINVAL; + goto err_exit; + } + + *status = VIRTCHNL_FDIR_SUCCESS; + + return 0; + +err_exit: + ice_vf_fdir_dump_info(vf); + return ret; +} + +static int ice_fdir_is_tunnel(enum ice_fdir_tunnel_type ttype) +{ + return ttype == ICE_FDIR_TUNNEL_TYPE_ECPRI; +} + +/** + * ice_vc_add_fdir_fltr_post + * @vf: pointer to the VF structure + * @ctx: FDIR context info for post processing + * @status: virtchnl FDIR program status + * @success: true implies success, false implies failure + * + * Post process for flow director add command. If success, then do post process + * and send back success msg by virtchnl. Otherwise, do context reversion and + * send back failure msg by virtchnl. + * + * Return: 0 on success, and other on error. 
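Each check in ice_vf_verify_rx_desc() above is the same mask-and-shift extraction on the 16-bit status_error0 word of the programming-status writeback descriptor, using the ICE_FXD_FLTR_WB_QW1_*_M/_S pairs. A trivial sketch of that idiom with an invented helper name, assuming the driver's usual headers:

/* Illustrative sketch only: pull one field out of the FDIR
 * programming-status writeback word, as done above for the DD bit,
 * the programming ID and the two failure indications.
 */
static u32 example_get_wb_field(__le16 status_error0, u32 mask, u32 shift)
{
	u32 stat_err = le16_to_cpu(status_error0);

	return (stat_err & mask) >> shift;
}

/* usage, e.g. for the descriptor-done bit:
 *	dd = example_get_wb_field(rx_desc->wb.status_error0,
 *				  ICE_FXD_FLTR_WB_QW1_DD_M,
 *				  ICE_FXD_FLTR_WB_QW1_DD_S);
 */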
+ */ +static int +ice_vc_add_fdir_fltr_post(struct ice_vf *vf, + struct ice_vf_fdir_ctx *ctx, + enum virtchnl_fdir_prgm_status status, + bool success) +{ + struct virtchnl_fdir_fltr_conf *conf = ctx->conf; + struct device *dev = ice_pf_to_dev(vf->pf); + enum virtchnl_status_code v_ret; + struct virtchnl_fdir_add *resp; + int ret, len, is_tun; + + v_ret = VIRTCHNL_STATUS_SUCCESS; + len = sizeof(*resp); + resp = kzalloc(sizeof(*resp), GFP_KERNEL); + if (!resp) { + len = 0; + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + dev_dbg(dev, "VF %d: Alloc resp buf fail", vf->vf_id); + goto err_exit; + } + + if (!success) + goto err_exit; + + is_tun = ice_fdir_is_tunnel(conf->ttype); + resp->status = status; + resp->flow_id = conf->flow_id; + vf->fdir.fdir_fltr_cnt[conf->input.flow_type][is_tun]++; + + ret = ice_vc_send_msg_to_vf(vf, ctx->v_opcode, v_ret, + (u8 *)resp, len); + kfree(resp); + + dev_dbg(dev, "VF %d: flow_id:0x%X, FDIR %s success!\n", + vf->vf_id, conf->flow_id, + (ctx->v_opcode == VIRTCHNL_OP_ADD_FDIR_FILTER) ? + "add" : "del"); + return ret; + +err_exit: + if (resp) + resp->status = status; + ice_vc_fdir_remove_entry(vf, conf, conf->flow_id); + kfree(conf); + + ret = ice_vc_send_msg_to_vf(vf, ctx->v_opcode, v_ret, + (u8 *)resp, len); + kfree(resp); + return ret; +} + +/** + * ice_vc_del_fdir_fltr_post + * @vf: pointer to the VF structure + * @ctx: FDIR context info for post processing + * @status: virtchnl FDIR program status + * @success: true implies success, false implies failure + * + * Post process for flow director del command. If success, then do post process + * and send back success msg by virtchnl. Otherwise, do context reversion and + * send back failure msg by virtchnl. + * + * Return: 0 on success, and other on error. + */ +static int +ice_vc_del_fdir_fltr_post(struct ice_vf *vf, + struct ice_vf_fdir_ctx *ctx, + enum virtchnl_fdir_prgm_status status, + bool success) +{ + struct virtchnl_fdir_fltr_conf *conf = ctx->conf; + struct device *dev = ice_pf_to_dev(vf->pf); + enum virtchnl_status_code v_ret; + struct virtchnl_fdir_del *resp; + int ret, len, is_tun; + + v_ret = VIRTCHNL_STATUS_SUCCESS; + len = sizeof(*resp); + resp = kzalloc(sizeof(*resp), GFP_KERNEL); + if (!resp) { + len = 0; + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + dev_dbg(dev, "VF %d: Alloc resp buf fail", vf->vf_id); + goto err_exit; + } + + if (!success) + goto err_exit; + + is_tun = ice_fdir_is_tunnel(conf->ttype); + resp->status = status; + ice_vc_fdir_remove_entry(vf, conf, conf->flow_id); + vf->fdir.fdir_fltr_cnt[conf->input.flow_type][is_tun]--; + + ret = ice_vc_send_msg_to_vf(vf, ctx->v_opcode, v_ret, + (u8 *)resp, len); + kfree(resp); + + dev_dbg(dev, "VF %d: flow_id:0x%X, FDIR %s success!\n", + vf->vf_id, conf->flow_id, + (ctx->v_opcode == VIRTCHNL_OP_ADD_FDIR_FILTER) ? + "add" : "del"); + kfree(conf); + return ret; + +err_exit: + if (resp) + resp->status = status; + if (success) + kfree(conf); + + ret = ice_vc_send_msg_to_vf(vf, ctx->v_opcode, v_ret, + (u8 *)resp, len); + kfree(resp); + return ret; +} + +/** + * ice_flush_fdir_ctx + * @pf: pointer to the PF structure + * + * Flush all the pending event on ctx_done list and process them. 
+ */ +void ice_flush_fdir_ctx(struct ice_pf *pf) +{ + int i; + + if (!test_and_clear_bit(ICE_FD_VF_FLUSH_CTX, pf->state)) + return; + + ice_for_each_vf(pf, i) { + struct device *dev = ice_pf_to_dev(pf); + enum virtchnl_fdir_prgm_status status; + struct ice_vf *vf = &pf->vf[i]; + struct ice_vf_fdir_ctx *ctx; + unsigned long flags; + int ret; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) + continue; + + if (vf->ctrl_vsi_idx == ICE_NO_VSI) + continue; + + ctx = &vf->fdir.ctx_done; + spin_lock_irqsave(&vf->fdir.ctx_lock, flags); + if (!(ctx->flags & ICE_VF_FDIR_CTX_VALID)) { + spin_unlock_irqrestore(&vf->fdir.ctx_lock, flags); + continue; + } + spin_unlock_irqrestore(&vf->fdir.ctx_lock, flags); + + WARN_ON(ctx->stat == ICE_FDIR_CTX_READY); + if (ctx->stat == ICE_FDIR_CTX_TIMEOUT) { + status = VIRTCHNL_FDIR_FAILURE_RULE_TIMEOUT; + dev_err(dev, "VF %d: ctrl_vsi irq timeout\n", + vf->vf_id); + goto err_exit; + } + + ret = ice_vf_verify_rx_desc(vf, ctx, &status); + if (ret) + goto err_exit; + + if (ctx->v_opcode == VIRTCHNL_OP_ADD_FDIR_FILTER) + ice_vc_add_fdir_fltr_post(vf, ctx, status, true); + else if (ctx->v_opcode == VIRTCHNL_OP_DEL_FDIR_FILTER) + ice_vc_del_fdir_fltr_post(vf, ctx, status, true); + else + dev_err(dev, "VF %d: Unsupported opcode\n", vf->vf_id); + + spin_lock_irqsave(&vf->fdir.ctx_lock, flags); + ctx->flags &= ~ICE_VF_FDIR_CTX_VALID; + spin_unlock_irqrestore(&vf->fdir.ctx_lock, flags); + continue; +err_exit: + if (ctx->v_opcode == VIRTCHNL_OP_ADD_FDIR_FILTER) + ice_vc_add_fdir_fltr_post(vf, ctx, status, false); + else if (ctx->v_opcode == VIRTCHNL_OP_DEL_FDIR_FILTER) + ice_vc_del_fdir_fltr_post(vf, ctx, status, false); + else + dev_err(dev, "VF %d: Unsupported opcode\n", vf->vf_id); + + spin_lock_irqsave(&vf->fdir.ctx_lock, flags); + ctx->flags &= ~ICE_VF_FDIR_CTX_VALID; + spin_unlock_irqrestore(&vf->fdir.ctx_lock, flags); + } +} + +/** + * ice_vc_fdir_set_irq_ctx - set FDIR context info for later irq handler + * @vf: pointer to the VF structure + * @conf: FDIR configuration for each filter + * @v_opcode: virtual channel operation code + * + * Return: 0 on success, and other on error. + */ +static int +ice_vc_fdir_set_irq_ctx(struct ice_vf *vf, + struct virtchnl_fdir_fltr_conf *conf, + enum virtchnl_ops v_opcode) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_vf_fdir_ctx *ctx; + unsigned long flags; + + ctx = &vf->fdir.ctx_irq; + spin_lock_irqsave(&vf->fdir.ctx_lock, flags); + if ((vf->fdir.ctx_irq.flags & ICE_VF_FDIR_CTX_VALID) || + (vf->fdir.ctx_done.flags & ICE_VF_FDIR_CTX_VALID)) { + spin_unlock_irqrestore(&vf->fdir.ctx_lock, flags); + dev_dbg(dev, "VF %d: Last request is still in progress\n", + vf->vf_id); + return -EBUSY; + } + ctx->flags |= ICE_VF_FDIR_CTX_VALID; + spin_unlock_irqrestore(&vf->fdir.ctx_lock, flags); + + ctx->conf = conf; + ctx->v_opcode = v_opcode; + ctx->stat = ICE_FDIR_CTX_READY; + timer_setup(&ctx->rx_tmr, ice_vf_fdir_timer, 0); + + mod_timer(&ctx->rx_tmr, + round_jiffies(msecs_to_jiffies(10) + jiffies)); + + return 0; +} + +/** + * ice_vc_fdir_clear_irq_ctx - clear FDIR context info for irq handler + * @vf: pointer to the VF structure + * + * Return: 0 on success, and other on error. 
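ice_vc_fdir_set_irq_ctx() above arms a roughly 10 ms one-shot watchdog so that a programming request which never produces a writeback descriptor still completes, via ice_vf_fdir_timer() with ICE_FDIR_CTX_TIMEOUT. A self-contained sketch of that timer idiom with hypothetical demo_* names:

#include <linux/jiffies.h>
#include <linux/printk.h>
#include <linux/timer.h>

struct demo_req {
	struct timer_list tmr;
};

/* runs only if no completion disarmed the watchdog in time */
static void demo_timeout(struct timer_list *t)
{
	struct demo_req *req = from_timer(req, t, tmr);

	pr_debug("request %p timed out\n", req);
}

static void demo_arm(struct demo_req *req)
{
	timer_setup(&req->tmr, demo_timeout, 0);
	/* one-shot ~10 ms watchdog, as armed in ice_vc_fdir_set_irq_ctx() */
	mod_timer(&req->tmr, round_jiffies(jiffies + msecs_to_jiffies(10)));
}

/* del_timer() returns 0 if the timer had already fired, i.e. the
 * timeout path won the race with the completion
 */
static void demo_complete(struct demo_req *req)
{
	if (!del_timer(&req->tmr))
		pr_debug("completion raced with timeout\n");
}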
+ */ +static void ice_vc_fdir_clear_irq_ctx(struct ice_vf *vf) +{ + struct ice_vf_fdir_ctx *ctx = &vf->fdir.ctx_irq; + unsigned long flags; + + del_timer(&ctx->rx_tmr); + spin_lock_irqsave(&vf->fdir.ctx_lock, flags); + ctx->flags &= ~ICE_VF_FDIR_CTX_VALID; + spin_unlock_irqrestore(&vf->fdir.ctx_lock, flags); +} + +/** + * ice_vc_add_fdir_fltr - add a FDIR filter for VF by the msg buffer + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * Return: 0 on success, and other on error. + */ +int ice_vc_add_fdir_fltr(struct ice_vf *vf, u8 *msg) +{ + struct virtchnl_fdir_add *fltr = (struct virtchnl_fdir_add *)msg; + struct virtchnl_fdir_add *stat = NULL; + struct virtchnl_fdir_fltr_conf *conf; + enum virtchnl_status_code v_ret; + struct device *dev; + struct ice_pf *pf; + int is_tun = 0; + int len = 0; + int ret; + + pf = vf->pf; + dev = ice_pf_to_dev(pf); + ret = ice_vc_fdir_param_check(vf, fltr->vsi_id); + if (ret) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + dev_dbg(dev, "Parameter check for VF %d failed\n", vf->vf_id); + goto err_exit; + } + + ret = ice_vf_start_ctrl_vsi(vf); + if (ret && (ret != -EEXIST)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + dev_err(dev, "Init FDIR for VF %d failed, ret:%d\n", + vf->vf_id, ret); + goto err_exit; + } + + stat = kzalloc(sizeof(*stat), GFP_KERNEL); + if (!stat) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + dev_dbg(dev, "Alloc stat for VF %d failed\n", vf->vf_id); + goto err_exit; + } + + conf = kzalloc(sizeof(*conf), GFP_KERNEL); + if (!conf) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + dev_dbg(dev, "Alloc conf for VF %d failed\n", vf->vf_id); + goto err_exit; + } + + len = sizeof(*stat); + ret = ice_vc_validate_fdir_fltr(vf, fltr, conf); + if (ret) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_INVALID; + dev_dbg(dev, "Invalid FDIR filter from VF %d\n", vf->vf_id); + goto err_free_conf; + } + + if (fltr->validate_only) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_SUCCESS; + kfree(conf); + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_FDIR_FILTER, + v_ret, (u8 *)stat, len); + goto exit; + } + + is_tun = ice_fdir_is_tunnel(conf->ttype); + ret = ice_vc_fdir_config_input_set(vf, fltr, conf, is_tun); + if (ret) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_CONFLICT; + dev_err(dev, "VF %d: FDIR input set configure failed, ret:%d\n", + vf->vf_id, ret); + goto err_free_conf; + } + + ret = ice_vc_fdir_is_dup_fltr(vf, conf); + if (ret) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_EXIST; + dev_dbg(dev, "VF %d: duplicated FDIR rule detected\n", + vf->vf_id); + goto err_free_conf; + } + + ret = ice_vc_fdir_insert_entry(vf, conf, &conf->flow_id); + if (ret) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE; + dev_dbg(dev, "VF %d: insert FDIR list failed\n", vf->vf_id); + goto err_free_conf; + } + + ret = ice_vc_fdir_set_irq_ctx(vf, conf, VIRTCHNL_OP_ADD_FDIR_FILTER); + if (ret) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE; + dev_dbg(dev, "VF %d: set FDIR context failed\n", vf->vf_id); + goto err_rem_entry; + } + + ret = ice_vc_fdir_write_fltr(vf, conf, true, is_tun); + if (ret) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE; + dev_err(dev, "VF %d: writing FDIR rule failed, ret:%d\n", + vf->vf_id, ret); + goto err_clr_irq; + } + +exit: + kfree(stat); + return ret; + +err_clr_irq: + 
ice_vc_fdir_clear_irq_ctx(vf); +err_rem_entry: + ice_vc_fdir_remove_entry(vf, conf, conf->flow_id); +err_free_conf: + kfree(conf); +err_exit: + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_FDIR_FILTER, v_ret, + (u8 *)stat, len); + kfree(stat); + return ret; +} + +/** + * ice_vc_del_fdir_fltr - delete a FDIR filter for VF by the msg buffer + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * Return: 0 on success, and other on error. + */ +int ice_vc_del_fdir_fltr(struct ice_vf *vf, u8 *msg) +{ + struct virtchnl_fdir_del *fltr = (struct virtchnl_fdir_del *)msg; + struct virtchnl_fdir_del *stat = NULL; + struct virtchnl_fdir_fltr_conf *conf; + struct ice_vf_fdir *fdir = &vf->fdir; + enum virtchnl_status_code v_ret; + struct ice_fdir_fltr *input; + enum ice_fltr_ptype flow; + struct device *dev; + struct ice_pf *pf; + int is_tun = 0; + int len = 0; + int ret; + + pf = vf->pf; + dev = ice_pf_to_dev(pf); + ret = ice_vc_fdir_param_check(vf, fltr->vsi_id); + if (ret) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + dev_dbg(dev, "Parameter check for VF %d failed\n", vf->vf_id); + goto err_exit; + } + + stat = kzalloc(sizeof(*stat), GFP_KERNEL); + if (!stat) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + dev_dbg(dev, "Alloc stat for VF %d failed\n", vf->vf_id); + goto err_exit; + } + + len = sizeof(*stat); + + conf = ice_vc_fdir_lookup_entry(vf, fltr->flow_id); + if (!conf) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NONEXIST; + dev_dbg(dev, "VF %d: FDIR invalid flow_id:0x%X\n", + vf->vf_id, fltr->flow_id); + goto err_exit; + } + + /* Just return failure when ctrl_vsi idx is invalid */ + if (vf->ctrl_vsi_idx == ICE_NO_VSI) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE; + dev_err(dev, "Invalid FDIR ctrl_vsi for VF %d\n", vf->vf_id); + goto err_exit; + } + + ret = ice_vc_fdir_set_irq_ctx(vf, conf, VIRTCHNL_OP_DEL_FDIR_FILTER); + if (ret) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE; + dev_dbg(dev, "VF %d: set FDIR context failed\n", vf->vf_id); + goto err_exit; + } + + is_tun = ice_fdir_is_tunnel(conf->ttype); + ret = ice_vc_fdir_write_fltr(vf, conf, false, is_tun); + if (ret) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE; + dev_err(dev, "VF %d: writing FDIR rule failed, ret:%d\n", + vf->vf_id, ret); + goto err_del_tmr; + } + + /* Remove unused profiles to avoid unexpected behaviors */ + input = &conf->input; + flow = input->flow_type; + if (fdir->fdir_fltr_cnt[flow][is_tun] == 1) + ice_vc_fdir_rem_prof(vf, flow, is_tun); + + kfree(stat); + + return ret; + +err_del_tmr: + ice_vc_fdir_clear_irq_ctx(vf); +err_exit: + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DEL_FDIR_FILTER, v_ret, + (u8 *)stat, len); + kfree(stat); + return ret; +} + +/** + * ice_vf_fdir_init - init FDIR resource for VF + * @vf: pointer to the VF info + */ +void ice_vf_fdir_init(struct ice_vf *vf) +{ + struct ice_vf_fdir *fdir = &vf->fdir; + + idr_init(&fdir->fdir_rule_idr); + INIT_LIST_HEAD(&fdir->fdir_rule_list); + + spin_lock_init(&fdir->ctx_lock); + fdir->ctx_irq.flags = 0; + fdir->ctx_done.flags = 0; +} + +/** + * ice_vf_fdir_exit - destroy FDIR resource for VF + * @vf: pointer to the VF info + */ +void ice_vf_fdir_exit(struct ice_vf *vf) +{ + ice_vc_fdir_flush_entry(vf); + idr_destroy(&vf->fdir.fdir_rule_idr); + ice_vc_fdir_rem_prof_all(vf); + ice_vc_fdir_free_prof_all(vf); +} diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h 
b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h new file mode 100644 index 000000000000..eaf5d8359c70 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_VIRTCHNL_FDIR_H_ +#define _ICE_VIRTCHNL_FDIR_H_ + +struct ice_vf; +struct ice_pf; + +enum ice_fdir_ctx_stat { + ICE_FDIR_CTX_READY, + ICE_FDIR_CTX_IRQ, + ICE_FDIR_CTX_TIMEOUT, +}; + +struct ice_vf_fdir_ctx { + struct timer_list rx_tmr; + enum virtchnl_ops v_opcode; + enum ice_fdir_ctx_stat stat; + union ice_32b_rx_flex_desc rx_desc; +#define ICE_VF_FDIR_CTX_VALID BIT(0) + u32 flags; + + void *conf; +}; + +/* VF FDIR information structure */ +struct ice_vf_fdir { + u16 fdir_fltr_cnt[ICE_FLTR_PTYPE_MAX][ICE_FD_HW_SEG_MAX]; + int prof_entry_cnt[ICE_FLTR_PTYPE_MAX][ICE_FD_HW_SEG_MAX]; + struct ice_fd_hw_prof **fdir_prof; + + struct idr fdir_rule_idr; + struct list_head fdir_rule_list; + + spinlock_t ctx_lock; /* protects FDIR context info */ + struct ice_vf_fdir_ctx ctx_irq; + struct ice_vf_fdir_ctx ctx_done; +}; + +#ifdef CONFIG_PCI_IOV +int ice_vc_add_fdir_fltr(struct ice_vf *vf, u8 *msg); +int ice_vc_del_fdir_fltr(struct ice_vf *vf, u8 *msg); +void ice_vc_fdir_free_prof_all(struct ice_vf *vf); +void ice_vc_fdir_rem_prof_all(struct ice_vf *vf); +void ice_vf_fdir_init(struct ice_vf *vf); +void ice_vf_fdir_exit(struct ice_vf *vf); +void +ice_vc_fdir_irq_handler(struct ice_vsi *ctrl_vsi, + union ice_32b_rx_flex_desc *rx_desc); +void ice_flush_fdir_ctx(struct ice_pf *pf); +#else +static inline +void ice_vc_fdir_irq_handler(struct ice_vsi *ctrl_vsi, union ice_32b_rx_flex_desc *rx_desc) { } +static inline void ice_flush_fdir_ctx(struct ice_pf *pf) { } +#endif /* CONFIG_PCI_IOV */ +#endif /* _ICE_VIRTCHNL_FDIR_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index e92a00a61755..52fd9c37fa38 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -1,11 +1,301 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ #include "ice.h" +#include "ice_base.h" #include "ice_lib.h" +#include "ice_fltr.h" +#include "ice_dcb_lib.h" +#include "ice_eswitch.h" +#include "ice_virtchnl_allowlist.h" +#include "ice_vf_vsi_vlan_ops.h" +#include "ice_vlan.h" +#include "ice_flex_pipe.h" +#include "ice_tc_lib.h" + +#define FIELD_SELECTOR(proto_hdr_field) \ + BIT((proto_hdr_field) & PROTO_HDR_FIELD_MASK) + +struct ice_vc_hdr_match_type { + s32 vc_hdr; /* virtchnl headers (VIRTCHNL_PROTO_HDR_XXX) */ + u32 ice_hdr; /* ice headers (ICE_FLOW_SEG_HDR_XXX) */ +}; + +static const struct ice_vc_hdr_match_type ice_vc_hdr_list[] = { + {VIRTCHNL_PROTO_HDR_NONE, ICE_FLOW_SEG_HDR_NONE}, + {VIRTCHNL_PROTO_HDR_ETH, ICE_FLOW_SEG_HDR_ETH}, + {VIRTCHNL_PROTO_HDR_S_VLAN, ICE_FLOW_SEG_HDR_VLAN}, + {VIRTCHNL_PROTO_HDR_C_VLAN, ICE_FLOW_SEG_HDR_VLAN}, + {VIRTCHNL_PROTO_HDR_IPV4, ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER}, + {VIRTCHNL_PROTO_HDR_IPV6, ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER}, + {VIRTCHNL_PROTO_HDR_TCP, ICE_FLOW_SEG_HDR_TCP}, + {VIRTCHNL_PROTO_HDR_UDP, ICE_FLOW_SEG_HDR_UDP}, + {VIRTCHNL_PROTO_HDR_SCTP, ICE_FLOW_SEG_HDR_SCTP}, + {VIRTCHNL_PROTO_HDR_PPPOE, ICE_FLOW_SEG_HDR_PPPOE}, + {VIRTCHNL_PROTO_HDR_GTPU_IP, ICE_FLOW_SEG_HDR_GTPU_IP}, + {VIRTCHNL_PROTO_HDR_GTPU_EH, ICE_FLOW_SEG_HDR_GTPU_EH}, + {VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_DWN, + ICE_FLOW_SEG_HDR_GTPU_DWN}, + {VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_UP, + ICE_FLOW_SEG_HDR_GTPU_UP}, + {VIRTCHNL_PROTO_HDR_L2TPV3, ICE_FLOW_SEG_HDR_L2TPV3}, + {VIRTCHNL_PROTO_HDR_ESP, ICE_FLOW_SEG_HDR_ESP}, + {VIRTCHNL_PROTO_HDR_AH, ICE_FLOW_SEG_HDR_AH}, + {VIRTCHNL_PROTO_HDR_PFCP, ICE_FLOW_SEG_HDR_PFCP_SESSION}, + {VIRTCHNL_PROTO_HDR_GTPC, ICE_FLOW_SEG_HDR_GTPC}, + {VIRTCHNL_PROTO_HDR_ECPRI, ICE_FLOW_SEG_HDR_ECPRI_TP0 | + ICE_FLOW_SEG_HDR_UDP_ECPRI_TP0}, + {VIRTCHNL_PROTO_HDR_L2TPV2, ICE_FLOW_SEG_HDR_L2TPV2}, + {VIRTCHNL_PROTO_HDR_PPP, ICE_FLOW_SEG_HDR_PPP}, +}; + +struct ice_vc_hash_field_match_type { + s32 vc_hdr; /* virtchnl headers + * (VIRTCHNL_PROTO_HDR_XXX) + */ + u32 vc_hash_field; /* virtchnl hash fields selector + * FIELD_SELECTOR((VIRTCHNL_PROTO_HDR_ETH_XXX)) + */ + u64 ice_hash_field; /* ice hash fields + * (BIT_ULL(ICE_FLOW_FIELD_IDX_XXX)) + */ +}; + +static const struct +ice_vc_hash_field_match_type ice_vc_hash_field_list[] = { + {VIRTCHNL_PROTO_HDR_ETH, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_SRC), + BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_SA)}, + {VIRTCHNL_PROTO_HDR_ETH, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_DST), + BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_DA)}, + {VIRTCHNL_PROTO_HDR_ETH, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_DST), + ICE_FLOW_HASH_ETH}, + {VIRTCHNL_PROTO_HDR_ETH, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_ETHERTYPE), + BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_TYPE)}, + {VIRTCHNL_PROTO_HDR_S_VLAN, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_S_VLAN_ID), + BIT_ULL(ICE_FLOW_FIELD_IDX_S_VLAN)}, + {VIRTCHNL_PROTO_HDR_C_VLAN, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_C_VLAN_ID), + BIT_ULL(ICE_FLOW_FIELD_IDX_C_VLAN)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST), + ICE_FLOW_HASH_IPV4}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) | + 
BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + ICE_FLOW_HASH_IPV4 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_DA)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST), + ICE_FLOW_HASH_IPV6}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_DA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + ICE_FLOW_HASH_IPV6 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST), + ICE_FLOW_HASH_IPV6_PRE64}, + {VIRTCHNL_PROTO_HDR_IPV6, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_SA)}, + {VIRTCHNL_PROTO_HDR_IPV6, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_DA)}, + {VIRTCHNL_PROTO_HDR_IPV6, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + ICE_FLOW_HASH_IPV6_PRE64 | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_SA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_DA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_TCP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_SRC_PORT)}, + {VIRTCHNL_PROTO_HDR_TCP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_DST_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_DST_PORT)}, + {VIRTCHNL_PROTO_HDR_TCP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_DST_PORT), + ICE_FLOW_HASH_TCP_PORT}, + {VIRTCHNL_PROTO_HDR_UDP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_SRC_PORT)}, + {VIRTCHNL_PROTO_HDR_UDP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_DST_PORT)}, + {VIRTCHNL_PROTO_HDR_UDP, + 
FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT), + ICE_FLOW_HASH_UDP_PORT}, + {VIRTCHNL_PROTO_HDR_SCTP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT)}, + {VIRTCHNL_PROTO_HDR_SCTP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_DST_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_DST_PORT)}, + {VIRTCHNL_PROTO_HDR_SCTP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_DST_PORT), + ICE_FLOW_HASH_SCTP_PORT}, + {VIRTCHNL_PROTO_HDR_PPPOE, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_PPPOE_SESS_ID), + BIT_ULL(ICE_FLOW_FIELD_IDX_PPPOE_SESS_ID)}, + {VIRTCHNL_PROTO_HDR_GTPU_IP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_GTPU_IP_TEID), + BIT_ULL(ICE_FLOW_FIELD_IDX_GTPU_IP_TEID)}, + {VIRTCHNL_PROTO_HDR_L2TPV3, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_L2TPV3_SESS_ID), + BIT_ULL(ICE_FLOW_FIELD_IDX_L2TPV3_SESS_ID)}, + {VIRTCHNL_PROTO_HDR_ESP, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ESP_SPI), + BIT_ULL(ICE_FLOW_FIELD_IDX_ESP_SPI)}, + {VIRTCHNL_PROTO_HDR_AH, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_AH_SPI), + BIT_ULL(ICE_FLOW_FIELD_IDX_AH_SPI)}, + {VIRTCHNL_PROTO_HDR_PFCP, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_PFCP_SEID), + BIT_ULL(ICE_FLOW_FIELD_IDX_PFCP_SEID)}, + {VIRTCHNL_PROTO_HDR_GTPC, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_GTPC_TEID), + BIT_ULL(ICE_FLOW_FIELD_IDX_GTPC_TEID)}, + {VIRTCHNL_PROTO_HDR_ECPRI, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ECPRI_PC_RTC_ID), + BIT_ULL(ICE_FLOW_FIELD_IDX_ECPRI_TP0_PC_ID) | + BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_ECPRI_TP0_PC_ID)}, +}; + +/** + * ice_get_vf_vsi - get VF's VSI based on the stored index + * @vf: VF used to get VSI + */ +struct ice_vsi *ice_get_vf_vsi(struct ice_vf *vf) +{ + return vf->pf->vsi[vf->lan_vsi_idx]; +} + +static struct ice_vsi *ice_get_vf_adq_vsi(struct ice_vf *vf, u8 tc) +{ + return vf->pf->vsi[vf->ch[tc].vsi_idx]; +} + +/** + * ice_is_vf_adq_ena - is VF ADQ enabled + * @vf: pointer to the VF info + * + * This function returns true if VF ADQ is enabled. 
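The ice_vc_hdr_list and ice_vc_hash_field_list tables earlier in this hunk translate virtchnl header types and field selectors into the flow engine's ICE_FLOW_SEG_HDR_*/ICE_FLOW_FIELD_IDX_* space. The code that consumes them is not part of this hunk; presumably it is a linear scan, roughly as in this sketch (the helper name is invented):

/* Illustrative sketch only: how ice_vc_hdr_list is typically consulted
 * to map a virtchnl header type to flow-engine header flags.  Returns 0
 * when the requested header is not in the table.
 */
static u32 example_vc_hdr_to_ice_hdr(s32 vc_hdr)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(ice_vc_hdr_list); i++) {
		const struct ice_vc_hdr_match_type *m = &ice_vc_hdr_list[i];

		if (m->vc_hdr == vc_hdr)
			return m->ice_hdr;
	}

	return 0;
}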
It is must to check + * VF's num_tc as well, it must be more than ICE_VF_CHNL_START_TC for + * valid ADQ configuration + */ +static bool ice_is_vf_adq_ena(struct ice_vf *vf) +{ + return vf->adq_enabled && (vf->num_tc > ICE_VF_CHNL_START_TC); +} + +/** + * ice_vf_adq_vsi_stop_rings - stops the VF ADQ VSI rings + * @vf: pointer to the VF info + * @tc: VF ADQ TC number + * + * This function stops Tx and Rx ring specific to VF ADQ VSI + */ +static void ice_vf_adq_vsi_stop_rings(struct ice_vf *vf, int tc) +{ + struct ice_vsi *vsi = ice_get_vf_adq_vsi(vf, tc); + + if (!vsi) + return; + ice_vsi_stop_lan_tx_rings(vsi, ICE_NO_RESET, vf->vf_id); + ice_vsi_stop_all_rx_rings(vsi); +} + +/** + * ice_vf_adq_vsi_disable_txqs - disable Tx queues for VF ADQ + * @vf: pointer to the VF info + * @tc: VF ADQ TC number + * + * This function disabled Tx queues specific to VF ADQ VSI + */ +static void ice_vf_adq_vsi_disable_txqs(struct ice_vf *vf, int tc) +{ + struct ice_vsi *vsi = ice_get_vf_adq_vsi(vf, tc); + + if (!vsi) + return; + ice_dis_vsi_txq(vsi->port_info, vf->ch[tc].vsi_idx, 0, 0, NULL, NULL, + NULL, ICE_VF_RESET, vf->vf_id, NULL); +} + +/** + * ice_validate_vf_id - helper to check if VF ID is valid + * @pf: pointer to the PF structure + * @vf_id: the ID of the VF to check + */ +static int ice_validate_vf_id(struct ice_pf *pf, u16 vf_id) +{ + /* vf_id range is only valid for 0-255, and should always be unsigned */ + if (vf_id >= pf->num_alloc_vfs) { + dev_err(ice_pf_to_dev(pf), "Invalid VF ID: %u\n", vf_id); + return -EINVAL; + } + return 0; +} /** - * ice_err_to_virt err - translate errors for VF return code + * ice_check_vf_init - helper to check if VF init complete + * @pf: pointer to the PF structure + * @vf: the pointer to the VF to check + */ +static int ice_check_vf_init(struct ice_pf *pf, struct ice_vf *vf) +{ + if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states)) { + dev_err(ice_pf_to_dev(pf), "VF ID: %u in reset. 
Try again.\n", + vf->vf_id); + return -EBUSY; + } + return 0; +} + +/** + * ice_err_to_virt_err - translate errors for VF return code * @ice_err: error return code */ static enum virtchnl_status_code ice_err_to_virt_err(enum ice_status ice_err) @@ -48,10 +338,11 @@ ice_vc_vf_broadcast(struct ice_pf *pf, enum virtchnl_ops v_opcode, enum virtchnl_status_code v_retval, u8 *msg, u16 msglen) { struct ice_hw *hw = &pf->hw; - struct ice_vf *vf = pf->vf; - int i; + unsigned int i; + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; - for (i = 0; i < pf->num_alloc_vfs; i++, vf++) { /* Not all vfs are enabled so skip the ones that are not */ if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states) && !test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) @@ -91,23 +382,49 @@ ice_set_pfe_link(struct ice_vf *vf, struct virtchnl_pf_event *pfe, } /** - * ice_set_pfe_link_forced - Force the virtchnl_pf_event link speed/status - * @vf: pointer to the VF structure - * @pfe: pointer to the virtchnl_pf_event to set link speed/status for - * @link_up: whether or not to set the link up/down + * ice_vf_has_no_qs_ena - check if the VF has any Rx or Tx queues enabled + * @vf: the VF to check + * + * Returns true if the VF has no Rx and no Tx queues enabled and returns false + * otherwise */ -static void -ice_set_pfe_link_forced(struct ice_vf *vf, struct virtchnl_pf_event *pfe, - bool link_up) +static bool ice_vf_has_no_qs_ena(struct ice_vf *vf) { - u16 link_speed; + return (!bitmap_weight(vf->rxq_ena, ICE_MAX_QS_PER_VF) && + !bitmap_weight(vf->txq_ena, ICE_MAX_QS_PER_VF)); +} - if (link_up) - link_speed = ICE_AQ_LINK_SPEED_100GB; - else - link_speed = ICE_AQ_LINK_SPEED_UNKNOWN; +/** + * ice_vf_get_port_info - Get the VF's port info structure + * @vf: VF used to get the port info structure for + */ +static struct ice_port_info *ice_vf_get_port_info(struct ice_vf *vf) +{ + return vf->pf->hw.port_info; +} + +/** + * ice_is_vf_link_up - check if the VF's link is up + * @vf: VF to check if link is up + */ +static bool ice_is_vf_link_up(struct ice_vf *vf) +{ + struct ice_port_info *pi = ice_vf_get_port_info(vf); + struct ice_pf *pf = vf->pf; + + if (ice_check_vf_init(pf, vf)) + return false; + + if (test_bit(ICE_BAD_EEPROM, pf->state)) + return false; - ice_set_pfe_link(vf, pfe, link_speed, link_up); + if (ice_vf_has_no_qs_ena(vf)) + return false; + else if (vf->link_forced) + return vf->link_up; + else + return pi->phy.link_info.link_info & + ICE_AQ_LINK_UP; } /** @@ -116,33 +433,237 @@ ice_set_pfe_link_forced(struct ice_vf *vf, struct virtchnl_pf_event *pfe, * * send a link status message to a single VF */ -static void ice_vc_notify_vf_link_state(struct ice_vf *vf) +void ice_vc_notify_vf_link_state(struct ice_vf *vf) { struct virtchnl_pf_event pfe = { 0 }; - struct ice_link_status *ls; - struct ice_pf *pf = vf->pf; - struct ice_hw *hw; + struct ice_hw *hw = &vf->pf->hw; + struct ice_port_info *pi; + + pi = ice_vf_get_port_info(vf); - hw = &pf->hw; - ls = &hw->port_info->phy.link_info; pfe.event = VIRTCHNL_EVENT_LINK_CHANGE; pfe.severity = PF_EVENT_SEVERITY_INFO; - /* Always report link is down if the VF queues aren't enabled */ - if (!vf->num_qs_ena) - ice_set_pfe_link(vf, &pfe, ICE_AQ_LINK_SPEED_UNKNOWN, false); - else if (vf->link_forced) - ice_set_pfe_link_forced(vf, &pfe, vf->link_up); + if (ice_is_vf_link_up(vf)) + ice_set_pfe_link(vf, &pfe, pi->phy.link_info.link_speed, true); else - ice_set_pfe_link(vf, &pfe, ls->link_speed, ls->link_info & - ICE_AQ_LINK_UP); + ice_set_pfe_link(vf, &pfe, ICE_AQ_LINK_SPEED_UNKNOWN, 
false); ice_aq_send_msg_to_vf(hw, vf->vf_id, VIRTCHNL_OP_EVENT, VIRTCHNL_STATUS_SUCCESS, (u8 *)&pfe, sizeof(pfe), NULL); } +/** + * ice_vf_invalidate_vsi - invalidate vsi_idx/vsi_num to remove VSI access + * @vf: VF to remove access to VSI for + */ +static void ice_vf_invalidate_vsi(struct ice_vf *vf) +{ + vf->lan_vsi_idx = ICE_NO_VSI; + vf->lan_vsi_num = ICE_NO_VSI; +} + +/** + * ice_vf_vsi_release - invalidate the VF's VSI after freeing it + * @vf: invalidate this VF's VSI after freeing it + */ +static void ice_vf_vsi_release(struct ice_vf *vf) +{ + ice_vsi_release(ice_get_vf_vsi(vf)); + ice_vf_invalidate_vsi(vf); +} + +/** + * ice_vf_adq_invalidate_vsi - invalidate vsi_idx/vsi_num to remove VSI access + * @vf: VF that ADQ VSI is being invalidated on + * @tc: TC used to access channel specific vsi_idx/vsi_num + */ +static void ice_vf_adq_invalidate_vsi(struct ice_vf *vf, u8 tc) +{ + vf->ch[tc].vsi_idx = ICE_NO_VSI; + vf->ch[tc].vsi_num = ICE_NO_VSI; +} + +/** + * ice_vf_adq_vsi_valid - is ADQ VSI valid? + * @vf: VF that ADQ VSI is being validated + * @tc: TC used to access channel specific vsi_idx/vsi_num + * + * vsi_idx must be non-zero, and vsi_idx and vsi_num must not be ICE_NO_VSI + */ +static bool ice_vf_adq_vsi_valid(struct ice_vf *vf, u8 tc) +{ + return (vf->ch[tc].vsi_idx && vf->ch[tc].vsi_idx != ICE_NO_VSI && + vf->ch[tc].vsi_num != ICE_NO_VSI); +} + +/** + * ice_vf_adq_vsi_release - release VF ADQ VSI resources + * @vf: VF that ADQ VSI is being released on + * @tc: TC used to access channel specific VSI + * + * This function stops Tx and Rx queues if specified, disables Tx queues if + * specified, releases VSI resources, and invalidates it + * + */ +static void ice_vf_adq_vsi_release(struct ice_vf *vf, u8 tc) +{ + ice_vsi_release(ice_get_vf_adq_vsi(vf, tc)); + ice_vf_adq_invalidate_vsi(vf, tc); +} + +/** + * ice_vf_adq_cfg_cleanup - invalidate the VF's channel software info + * @vf: VF that ADQ VSI is being released on + * @tc: TC used to access channel specific VSI + * + * This function invalidates software data structures specific to channel + * such as num_qps, tx_rate, etc... This is called from places like: + * when ADQ VSI is released either from rebuild path "ice_vf_adq_release" + * or during rebuild ADQ config if failed to create/setup VF ADQ VSIs + */ +static void ice_vf_adq_cfg_cleanup(struct ice_vf *vf, u8 tc) +{ + vf->ch[tc].num_qps = 0; + vf->ch[tc].offset = 0; + vf->ch[tc].max_tx_rate = 0; + /* since this function is called from places where + * VF ADQ VSI are cleanup from HW, it's OK to clear + * VF ADQ filter_type to be INVALID. + * Remember VF ADQ filter are replayed by VF driver + * as needed + */ + vf->ch[tc].fltr_type = ICE_CHNL_FLTR_TYPE_INVALID; +} + +#ifdef HAVE_TC_SETUP_CLSFLOWER +/** + * ice_del_all_adv_switch_fltr + * @vf: pointer to the VF info + * + * This function deletes all advanced switch filters specific to the VF and + * releases filter memory and updates all book-keeping. 
This function to be + * used when delete channel message is received before deleting channel VSIs + */ +static void ice_del_all_adv_switch_fltr(struct ice_vf *vf) +{ + struct ice_rule_query_data rule; + struct ice_tc_flower_fltr *f; + struct ice_pf *pf = vf->pf; + struct hlist_node *node; + struct device *dev; + int err; + int i; + + dev = ice_pf_to_dev(pf); + hlist_for_each_entry_safe(f, node, &vf->tc_flower_fltr_list, + tc_flower_node) { + if (!f->dest_vsi) + continue; + + /* Deleting TC filter */ + rule.rid = f->rid; + rule.rule_id = f->rule_id; + rule.vsi_handle = f->dest_id; + err = ice_rem_adv_rule_by_id(&pf->hw, &rule); + if (err) { + if (err == ICE_ERR_DOES_NOT_EXIST) + dev_dbg(dev, "VF %d: filter (rule_id %u) for dest VSI %u DOES NOT EXIST in hw table\n", + vf->vf_id, f->rule_id, f->dest_id); + else + dev_err(dev, "VF %d: Failed to delete switch filter for VSI handle %u, err %d\n", + vf->vf_id, f->dest_id, err); + } + + /* book-keeping and update filter type if filter count + * reached zero + */ + f->dest_vsi->num_chnl_fltr--; + hlist_del(&f->tc_flower_node); + devm_kfree(dev, f); + vf->num_dmac_chnl_fltrs--; + } + + /* Reset VF channel filter type to be INVALID */ + for (i = 1; i < vf->num_tc; i++) + vf->ch[i].fltr_type = ICE_CHNL_FLTR_TYPE_INVALID; +} +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + +/** + * ice_vf_adq_release - perform VF ADQ resource cleanup only + * @vf: pointer to the VF structure + * + * Delete all VF ADQ filters, release VF ADQ VSIs, cleanup internal data + * structues which keeps track of per TC infor including TC0. This function + * is invoked only when VFLR based VF Reset. + */ +static void ice_vf_adq_release(struct ice_vf *vf) +{ + u8 tc; + + /* no ADQ configured, nothing to do */ + if (!ice_is_vf_adq_ena(vf)) + return; + +#ifdef HAVE_TC_SETUP_CLSFLOWER + /* release VF ADQ specific filters and eventually VF driver + * will trigger replay of VF ADQ filters as needed, just like + * other MAC, VLAN filters + */ + ice_del_all_adv_switch_fltr(vf); +#endif + + for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) { + if (!ice_vf_adq_vsi_valid(vf, tc)) + continue; + /* Tx queues are disabled before VF reset is scheduled as part + * of VFLR flow. Disabling TX queues again causes error + * such as EINVAL from admin command because underlying + * scheduler configs are cleared as part of disabling once + */ + if (test_bit(ICE_VF_STATE_QS_ENA, vf->vf_states)) + ice_vf_adq_vsi_stop_rings(vf, tc); + ice_vf_adq_vsi_release(vf, tc); + /* clear per TC info to avoid stale information such as + * num_qps, tx_rate, etc... 
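ice_del_all_adv_switch_fltr() above walks the VF's flower-filter list with hlist_for_each_entry_safe() so that each node can be unlinked and freed while iterating. A minimal self-contained sketch of that idiom with hypothetical demo_* names:

#include <linux/list.h>
#include <linux/slab.h>

struct demo_fltr {
	struct hlist_node node;
	int id;
};

/* Illustrative sketch only: the delete-while-iterating idiom used by
 * ice_del_all_adv_switch_fltr().  The _safe variant caches the next
 * element so the current one may be unlinked and freed.
 */
static void demo_flush(struct hlist_head *head)
{
	struct hlist_node *tmp;
	struct demo_fltr *f;

	hlist_for_each_entry_safe(f, tmp, head, node) {
		hlist_del(&f->node);
		kfree(f);
	}
}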
+ */ + ice_vf_adq_cfg_cleanup(vf, tc); + } + + /* to avoid rebuilding of VF ADQ VSIs by mistake */ + vf->adq_enabled = false; + vf->num_tc = 0; + + /* main VF VSI should be built with default, hence clear related + * data structures otherwise vf->ch[0].num_qps and tx_rate will + * still have stale information as stored from "add channel" + * virtchnl message + */ + ice_vf_adq_cfg_cleanup(vf, 0); +} + +/** + * ice_vf_ctrl_invalidate_vsi - invalidate ctrl_vsi_idx to remove VSI access + * @vf: VF that control VSI is being invalidated on + */ +static void ice_vf_ctrl_invalidate_vsi(struct ice_vf *vf) +{ + vf->ctrl_vsi_idx = ICE_NO_VSI; +} + +/** + * ice_vf_ctrl_vsi_release - invalidate the VF's control VSI after freeing it + * @vf: VF that control VSI is being released on + */ +static void ice_vf_ctrl_vsi_release(struct ice_vf *vf) +{ + ice_vsi_release(vf->pf->vsi[vf->ctrl_vsi_idx]); + ice_vf_ctrl_invalidate_vsi(vf); +} + /** * ice_free_vf_res - Free a VF's resources * @vf: pointer to the VF info @@ -156,16 +677,25 @@ static void ice_free_vf_res(struct ice_vf *vf) * accessing the VF's VSI after it's freed or invalidated. */ clear_bit(ICE_VF_STATE_INIT, vf->vf_states); + ice_vf_fdir_exit(vf); + /* free VF control VSI */ + if (vf->ctrl_vsi_idx != ICE_NO_VSI) + ice_vf_ctrl_vsi_release(vf); /* free VSI and disconnect it from the parent uplink */ - if (vf->lan_vsi_idx) { - ice_vsi_release(pf->vsi[vf->lan_vsi_idx]); - vf->lan_vsi_idx = 0; - vf->lan_vsi_num = 0; + if (vf->lan_vsi_idx != ICE_NO_VSI) { + ice_vf_vsi_release(vf); vf->num_mac = 0; } - last_vector_idx = vf->first_vector_idx + pf->num_vf_msix - 1; + last_vector_idx = vf->first_vector_idx + pf->num_msix_per_vf - 1; + + /* clear VF MDD event information */ + memset(&vf->mdd_tx_events, 0, sizeof(vf->mdd_tx_events)); + memset(&vf->mdd_rx_events, 0, sizeof(vf->mdd_rx_events)); + + ice_vf_adq_release(vf); + /* Disable interrupts so that VF starts in a known state */ for (i = vf->first_vector_idx; i <= last_vector_idx; i++) { wr32(&pf->hw, GLINT_DYN_CTL(i), GLINT_DYN_CTL_CLEARPBA_M); @@ -184,17 +714,19 @@ static void ice_dis_vf_mappings(struct ice_vf *vf) { struct ice_pf *pf = vf->pf; struct ice_vsi *vsi; + struct device *dev; int first, last, v; struct ice_hw *hw; hw = &pf->hw; - vsi = pf->vsi[vf->lan_vsi_idx]; + vsi = ice_get_vf_vsi(vf); + dev = ice_pf_to_dev(pf); wr32(hw, VPINT_ALLOC(vf->vf_id), 0); wr32(hw, VPINT_ALLOC_PCI(vf->vf_id), 0); first = vf->first_vector_idx; - last = first + pf->num_vf_msix - 1; + last = first + pf->num_msix_per_vf - 1; for (v = first; v <= last; v++) { u32 reg; @@ -208,25 +740,19 @@ static void ice_dis_vf_mappings(struct ice_vf *vf) if (vsi->tx_mapping_mode == ICE_VSI_MAP_CONTIG) wr32(hw, VPLAN_TX_QBASE(vf->vf_id), 0); else - dev_err(&pf->pdev->dev, - "Scattered mode for VF Tx queues is not yet implemented\n"); + dev_err(dev, "Scattered mode for VF Tx queues is not yet implemented\n"); if (vsi->rx_mapping_mode == ICE_VSI_MAP_CONTIG) wr32(hw, VPLAN_RX_QBASE(vf->vf_id), 0); else - dev_err(&pf->pdev->dev, - "Scattered mode for VF Rx queues is not yet implemented\n"); + dev_err(dev, "Scattered mode for VF Rx queues is not yet implemented\n"); } /** * ice_sriov_free_msix_res - Reset/free any used MSIX resources * @pf: pointer to the PF structure * - * If MSIX entries from the pf->irq_tracker were needed then we need to - * reset the irq_tracker->end and give back the entries we needed to - * num_avail_sw_msix. 
- * - * If no MSIX entries were taken from the pf->irq_tracker then just clear + * Since no MSIX entries are taken from the pf->irq_tracker then just clear * the pf->sriov_base_vector. * * Returns 0 on success, and -EINVAL on error. @@ -243,11 +769,7 @@ static int ice_sriov_free_msix_res(struct ice_pf *pf) return -EINVAL; /* give back irq_tracker resources used */ - if (pf->sriov_base_vector < res->num_entries) { - res->end = res->num_entries; - pf->num_avail_sw_msix += - res->num_entries - pf->sriov_base_vector; - } + WARN_ON(pf->sriov_base_vector < res->num_entries); pf->sriov_base_vector = 0; @@ -261,9 +783,8 @@ static int ice_sriov_free_msix_res(struct ice_pf *pf) void ice_set_vf_state_qs_dis(struct ice_vf *vf) { /* Clear Rx/Tx enabled queues flag */ - bitmap_zero(vf->txq_ena, ICE_MAX_BASE_QS_PER_VF); - bitmap_zero(vf->rxq_ena, ICE_MAX_BASE_QS_PER_VF); - vf->num_qs_ena = 0; + bitmap_zero(vf->txq_ena, ICE_MAX_QS_PER_VF); + bitmap_zero(vf->rxq_ena, ICE_MAX_QS_PER_VF); clear_bit(ICE_VF_STATE_QS_ENA, vf->vf_states); } @@ -273,13 +794,22 @@ void ice_set_vf_state_qs_dis(struct ice_vf *vf) */ static void ice_dis_vf_qs(struct ice_vf *vf) { - struct ice_pf *pf = vf->pf; - struct ice_vsi *vsi; - - vsi = pf->vsi[vf->lan_vsi_idx]; + struct ice_vsi *vsi = ice_get_vf_vsi(vf); ice_vsi_stop_lan_tx_rings(vsi, ICE_NO_RESET, vf->vf_id); - ice_vsi_stop_rx_rings(vsi); + ice_vsi_stop_all_rx_rings(vsi); + /* Likewise if VF ADQ is enabled, stop Tx and Rx rings of VF ADQ VSI */ + if (ice_is_vf_adq_ena(vf)) { + int tc; + + for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) { + if (!ice_vf_adq_vsi_valid(vf, tc)) + continue; + vsi = ice_get_vf_adq_vsi(vf, tc); + ice_vsi_stop_lan_tx_rings(vsi, ICE_NO_RESET, vf->vf_id); + ice_vsi_stop_all_rx_rings(vsi); + } + } ice_set_vf_state_qs_dis(vf); } @@ -289,19 +819,18 @@ static void ice_dis_vf_qs(struct ice_vf *vf) */ void ice_free_vfs(struct ice_pf *pf) { + struct device *dev = ice_pf_to_dev(pf); struct ice_hw *hw = &pf->hw; - int tmp, i; + unsigned int tmp, i; if (!pf->vf) return; - while (test_and_set_bit(__ICE_VF_DIS, pf->state)) + ice_eswitch_release(pf); + + while (test_and_set_bit(ICE_VF_DIS, pf->state)) usleep_range(1000, 2000); - /* Avoid wait time by stopping all VFs at the same time */ - for (i = 0; i < pf->num_alloc_vfs; i++) - if (test_bit(ICE_VF_STATE_QS_ENA, pf->vf[i].vf_states)) - ice_dis_vf_qs(&pf->vf[i]); /* Disable IOV before freeing resources. 
This lets any VF drivers * running in the host get themselves cleaned up before we yank @@ -310,10 +839,21 @@ void ice_free_vfs(struct ice_pf *pf) if (!pci_vfs_assigned(pf->pdev)) pci_disable_sriov(pf->pdev); else - dev_warn(&pf->pdev->dev, "VFs are assigned - not disabling SR-IOV\n"); + dev_warn(dev, "VFs are assigned - not disabling SR-IOV\n"); + + if (ice_dcf_get_state(pf) != ICE_DCF_STATE_OFF) { + ice_rm_all_dcf_sw_rules(pf); + ice_dcf_set_state(pf, ICE_DCF_STATE_OFF); + pf->dcf.vf = NULL; + } + + /* Avoid wait time by stopping all VFs at the same time */ + ice_for_each_vf(pf, i) + if (test_bit(ICE_VF_STATE_QS_ENA, pf->vf[i].vf_states)) + ice_dis_vf_qs(&pf->vf[i]); tmp = pf->num_alloc_vfs; - pf->num_vf_qps = 0; + pf->num_qps_per_vf = 0; pf->num_alloc_vfs = 0; for (i = 0; i < tmp; i++) { if (test_bit(ICE_VF_STATE_INIT, pf->vf[i].vf_states)) { @@ -325,10 +865,9 @@ void ice_free_vfs(struct ice_pf *pf) } if (ice_sriov_free_msix_res(pf)) - dev_err(&pf->pdev->dev, - "Failed to free MSIX resources used by SR-IOV\n"); + dev_err(dev, "Failed to free MSIX resources used by SR-IOV\n"); - devm_kfree(&pf->pdev->dev, pf->vf); + devm_kfree(dev, pf->vf); pf->vf = NULL; /* This check is for when the driver is unloaded while VFs are @@ -336,7 +875,7 @@ void ice_free_vfs(struct ice_pf *pf) * before this function ever gets called. */ if (!pci_vfs_assigned(pf->pdev)) { - int vf_id; + unsigned int vf_id; /* Acknowledge VFLR for all VFs. Without this, VFs will fail to * work correctly when SR-IOV gets re-enabled. @@ -349,7 +888,13 @@ void ice_free_vfs(struct ice_pf *pf) wr32(hw, GLGEN_VFLRSTAT(reg_idx), BIT(bit_idx)); } } - clear_bit(__ICE_VF_DIS, pf->state); + + /* clear malicious info if the VFs are getting released */ + for (i = 0; i < tmp; i++) + if (ice_mbx_clear_malvf(&hw->mbx_snapshot, pf->malvfs, ICE_MAX_VF_COUNT, i)) + dev_dbg(dev, "failed to clear malicious VF state for VF %u\n", i); + + clear_bit(ICE_VF_DIS, pf->state); clear_bit(ICE_FLAG_SRIOV_ENA, pf->flags); } @@ -367,9 +912,11 @@ static void ice_trigger_vf_reset(struct ice_vf *vf, bool is_vflr, bool is_pfr) { struct ice_pf *pf = vf->pf; u32 reg, reg_idx, bit_idx; + unsigned int vf_abs_id, i; + struct device *dev; struct ice_hw *hw; - int vf_abs_id, i; + dev = ice_pf_to_dev(pf); hw = &pf->hw; vf_abs_id = vf->vf_id + hw->func_caps.vf_base_id; @@ -377,20 +924,19 @@ static void ice_trigger_vf_reset(struct ice_vf *vf, bool is_vflr, bool is_pfr) clear_bit(ICE_VF_STATE_ACTIVE, vf->vf_states); /* Disable VF's configuration API during reset. The flag is re-enabled - * in ice_alloc_vf_res(), when it's safe again to access VF's VSI. - * It's normally disabled in ice_free_vf_res(), but it's safer - * to do it earlier to give some time to finish to any VF config - * functions that may still be running at this point. + * when it's safe again to access VF's VSI. */ clear_bit(ICE_VF_STATE_INIT, vf->vf_states); - /* VF_MBX_ARQLEN is cleared by PFR, so the driver needs to clear it - * in the case of VFR. If this is done for PFR, it can mess up VF - * resets because the VF driver may already have started cleanup - * by the time we get here. + /* VF_MBX_ARQLEN and VF_MBX_ATQLEN are cleared by PFR, so the driver + * needs to clear them in the case of VFR/VFLR. If this is done for + * PFR, it can mess up VF resets because the VF driver may already + * have started cleanup by the time we get here. 
*/ - if (!is_pfr) - wr32(hw, VF_MBX_ARQLEN(vf_abs_id), 0); + if (!is_pfr) { + wr32(hw, VF_MBX_ARQLEN(vf->vf_id), 0); + wr32(hw, VF_MBX_ATQLEN(vf->vf_id), 0); + } /* In the case of a VFLR, the HW has already reset the VF and we * just need to clean up, so don't hit the VFRTRIG register. @@ -415,92 +961,84 @@ static void ice_trigger_vf_reset(struct ice_vf *vf, bool is_vflr, bool is_pfr) if ((reg & VF_TRANS_PENDING_M) == 0) break; - dev_err(&pf->pdev->dev, - "VF %d PCI transactions stuck\n", vf->vf_id); + dev_err(dev, "VF %u PCI transactions stuck\n", vf->vf_id); udelay(ICE_PCI_CIAD_WAIT_DELAY_US); } } /** - * ice_vsi_set_pvid_fill_ctxt - Set VSI ctxt for add PVID - * @ctxt: the VSI ctxt to fill - * @vid: the VLAN ID to set as a PVID + * ice_vf_vsi_setup - Set up a VF VSI + * @vf: VF to setup VSI for + * + * Returns pointer to the successfully allocated VSI struct on success, + * otherwise returns NULL on failure. */ -static void ice_vsi_set_pvid_fill_ctxt(struct ice_vsi_ctx *ctxt, u16 vid) +static struct ice_vsi *ice_vf_vsi_setup(struct ice_vf *vf) { - ctxt->info.vlan_flags = (ICE_AQ_VSI_VLAN_MODE_UNTAGGED | - ICE_AQ_VSI_PVLAN_INSERT_PVID | - ICE_AQ_VSI_VLAN_EMOD_STR); - ctxt->info.pvid = cpu_to_le16(vid); - ctxt->info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; - ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID | - ICE_AQ_VSI_PROP_SW_VALID); -} + struct ice_port_info *pi = ice_vf_get_port_info(vf); + struct ice_pf *pf = vf->pf; + struct ice_vsi *vsi; -/** - * ice_vsi_kill_pvid_fill_ctxt - Set VSI ctx for remove PVID - * @ctxt: the VSI ctxt to fill - */ -static void ice_vsi_kill_pvid_fill_ctxt(struct ice_vsi_ctx *ctxt) -{ - ctxt->info.vlan_flags = ICE_AQ_VSI_VLAN_EMOD_NOTHING; - ctxt->info.vlan_flags |= ICE_AQ_VSI_VLAN_MODE_ALL; - ctxt->info.sw_flags2 &= ~ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; - ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID | - ICE_AQ_VSI_PROP_SW_VALID); + vsi = ice_vsi_setup(pf, pi, ICE_VSI_VF, vf->vf_id, NULL, 0); + + if (!vsi) { + dev_err(ice_pf_to_dev(pf), "Failed to create VF VSI\n"); + ice_vf_invalidate_vsi(vf); + return NULL; + } + + vf->lan_vsi_idx = vsi->idx; + vf->lan_vsi_num = vsi->vsi_num; + + return vsi; } /** - * ice_vsi_manage_pvid - Enable or disable port VLAN for VSI - * @vsi: the VSI to update - * @vid: the VLAN ID to set as a PVID - * @enable: true for enable PVID false for disable + * ice_vf_adq_vsi_setup - Set up a VF channel VSI + * @vf: VF to setup VSI for + * @tc: TC to setup the channel VSI for */ -static int ice_vsi_manage_pvid(struct ice_vsi *vsi, u16 vid, bool enable) +static struct ice_vsi *ice_vf_adq_vsi_setup(struct ice_vf *vf, u8 tc) { - struct device *dev = &vsi->back->pdev->dev; - struct ice_hw *hw = &vsi->back->hw; - struct ice_vsi_ctx *ctxt; - enum ice_status status; - int ret = 0; - - ctxt = devm_kzalloc(dev, sizeof(*ctxt), GFP_KERNEL); - if (!ctxt) - return -ENOMEM; - - ctxt->info = vsi->info; - if (enable) - ice_vsi_set_pvid_fill_ctxt(ctxt, vid); - else - ice_vsi_kill_pvid_fill_ctxt(ctxt); + struct ice_port_info *pi = ice_vf_get_port_info(vf); + struct ice_pf *pf = vf->pf; + struct ice_vsi *vsi; - status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); - if (status) { - dev_info(dev, "update VSI for port VLAN failed, err %d aq_err %d\n", - status, hw->adminq.sq_last_status); - ret = -EIO; - goto out; + vsi = ice_vsi_setup(pf, pi, ICE_VSI_VF, vf->vf_id, NULL, tc); + if (!vsi) { + dev_err(ice_pf_to_dev(pf), "Failed to create VF ADQ VSI for TC %d\n", + tc); + ice_vf_adq_invalidate_vsi(vf, tc); + 
return NULL; } - vsi->info = ctxt->info; -out: - devm_kfree(dev, ctxt); - return ret; + vf->ch[tc].vsi_idx = vsi->idx; + vf->ch[tc].vsi_num = vsi->vsi_num; + + return vsi; } /** - * ice_vf_vsi_setup - Set up a VF VSI - * @pf: board private structure - * @pi: pointer to the port_info instance - * @vf_id: defines VF ID to which this VSI connects. + * ice_vf_ctrl_vsi_setup - Set up a VF control VSI + * @vf: VF to setup control VSI for * * Returns pointer to the successfully allocated VSI struct on success, * otherwise returns NULL on failure. */ -static struct ice_vsi * -ice_vf_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, u16 vf_id) +struct ice_vsi *ice_vf_ctrl_vsi_setup(struct ice_vf *vf) { - return ice_vsi_setup(pf, pi, ICE_VSI_VF, vf_id); + struct ice_port_info *pi = ice_vf_get_port_info(vf); + struct ice_pf *pf = vf->pf; + struct ice_vsi *vsi; + + vsi = ice_vsi_setup(pf, pi, ICE_VSI_CTRL, vf->vf_id, NULL, 0); + + if (!vsi) { + dev_err(ice_pf_to_dev(pf), "Failed to create VF control VSI\n"); + ice_vf_ctrl_invalidate_vsi(vf); + } + + return vsi; } /** @@ -517,160 +1055,418 @@ ice_vf_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, u16 vf_id) */ static int ice_calc_vf_first_vector_idx(struct ice_pf *pf, struct ice_vf *vf) { - return pf->sriov_base_vector + vf->vf_id * pf->num_vf_msix; + return pf->sriov_base_vector + vf->vf_id * pf->num_msix_per_vf; } /** - * ice_alloc_vsi_res - Setup VF VSI and its resources - * @vf: pointer to the VF structure + * ice_vf_rebuild_host_tx_rate_cfg - re-apply the Tx rate limiting configuration + * @vf: VF to re-apply the configuration for * - * Returns 0 on success, negative value on failure + * Called after a VF VSI has been re-added/rebuild during reset. The PF driver + * needs to re-apply the host configured Tx rate limiting configuration. 
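ice_vf_rebuild_host_tx_rate_cfg(), whose body follows, keeps the per-VF limits in Mbps and multiplies by 1000 before handing them to the bandwidth-limit helpers, skipping anything still at 0 (meaning "not configured"). A self-contained sketch of just that unit handling, with made-up rate values:

#include <stdint.h>
#include <stdio.h>

/* Model of the Mbps -> Kbps conversion used when re-applying host
 * configured VF rate limits after reset; 0 means "not configured".
 */
static uint64_t vf_rate_to_kbps(unsigned int rate_mbps)
{
	return (uint64_t)rate_mbps * 1000;
}

int main(void)
{
	unsigned int min_tx_rate = 0;     /* example: no minimum set */
	unsigned int max_tx_rate = 25000; /* example: 25 Gbps cap */

	if (min_tx_rate)
		printf("min limit: %llu Kbps\n",
		       (unsigned long long)vf_rate_to_kbps(min_tx_rate));
	if (max_tx_rate)
		printf("max limit: %llu Kbps\n",
		       (unsigned long long)vf_rate_to_kbps(max_tx_rate));
	return 0;
}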
*/ -static int ice_alloc_vsi_res(struct ice_vf *vf) +static int ice_vf_rebuild_host_tx_rate_cfg(struct ice_vf *vf) { - struct ice_pf *pf = vf->pf; - LIST_HEAD(tmp_add_list); - u8 broadcast[ETH_ALEN]; - struct ice_vsi *vsi; - int status = 0; - - /* first vector index is the VFs OICR index */ - vf->first_vector_idx = ice_calc_vf_first_vector_idx(pf, vf); + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_vsi *vsi = ice_get_vf_vsi(vf); + int err; - vsi = ice_vf_vsi_setup(pf, pf->hw.port_info, vf->vf_id); - if (!vsi) { - dev_err(&pf->pdev->dev, "Failed to create VF VSI\n"); - return -ENOMEM; + if (vf->min_tx_rate) { + err = ice_set_min_bw_limit(vsi, (u64)vf->min_tx_rate * 1000); + if (err) { + dev_err(dev, "failed to set min Tx rate to %d Mbps for VF %u, error %d\n", + vf->min_tx_rate, vf->vf_id, err); + return err; + } } - vf->lan_vsi_idx = vsi->idx; - vf->lan_vsi_num = vsi->vsi_num; - - /* Check if port VLAN exist before, and restore it accordingly */ - if (vf->port_vlan_id) { - ice_vsi_manage_pvid(vsi, vf->port_vlan_id, true); - ice_vsi_add_vlan(vsi, vf->port_vlan_id & ICE_VLAN_M); + if (vf->max_tx_rate) { + err = ice_set_max_bw_limit(vsi, (u64)vf->max_tx_rate * 1000); + if (err) { + dev_err(dev, "failed to set max Tx rate to %d Mbps for VF %u, error %d\n", + vf->max_tx_rate, vf->vf_id, err); + return err; + } } - eth_broadcast_addr(broadcast); + return 0; +} - status = ice_add_mac_to_list(vsi, &tmp_add_list, broadcast); - if (status) - goto ice_alloc_vsi_res_exit; +static u16 ice_vf_get_port_vlan_id(struct ice_vf *vf) +{ + return vf->port_vlan_info.vid; +} - if (is_valid_ether_addr(vf->dflt_lan_addr.addr)) { - status = ice_add_mac_to_list(vsi, &tmp_add_list, - vf->dflt_lan_addr.addr); - if (status) - goto ice_alloc_vsi_res_exit; +static u8 ice_vf_get_port_vlan_prio(struct ice_vf *vf) +{ + return vf->port_vlan_info.prio; +} + +bool ice_vf_is_port_vlan_ena(struct ice_vf *vf) +{ + return (ice_vf_get_port_vlan_id(vf) || ice_vf_get_port_vlan_prio(vf)); +} + +static u16 ice_vf_get_port_vlan_tpid(struct ice_vf *vf) +{ + return vf->port_vlan_info.tpid; +} + +/** + * ice_vf_rebuild_host_vlan_cfg - add VLAN 0 filter or rebuild the Port VLAN + * @vf: VF to add MAC filters for + * @vsi: Pointer to VSI + * + * Called after a VF VSI has been re-added/rebuilt during reset. The PF driver + * always re-adds either a VLAN 0 or port VLAN based filter after reset. + */ +static int ice_vf_rebuild_host_vlan_cfg(struct ice_vf *vf, struct ice_vsi *vsi) +{ + struct ice_vsi_vlan_ops *vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); + struct device *dev = ice_pf_to_dev(vf->pf); + int err; + + if (ice_vf_is_port_vlan_ena(vf)) { + err = vlan_ops->set_port_vlan(vsi, &vf->port_vlan_info); + if (err) { + dev_err(dev, "failed to configure port VLAN via VSI parameters for VF %u, error %d\n", + vf->vf_id, err); + return err; + } + + err = vlan_ops->add_vlan(vsi, &vf->port_vlan_info); + } else { + err = ice_vsi_add_vlan_zero(vsi); } - status = ice_add_mac(&pf->hw, &tmp_add_list); - if (status) - dev_err(&pf->pdev->dev, - "could not add mac filters error %d\n", status); + if (err) { + dev_err(dev, "failed to add VLAN %u filter for VF %u during VF rebuild, error %d\n", + ice_vf_is_port_vlan_ena(vf) ? 
+ ice_vf_get_port_vlan_id(vf) : 0, vf->vf_id, err); + return err; + } + + err = vlan_ops->ena_rx_filtering(vsi); + if (err) { + dev_warn(dev, "failed to enable Rx VLAN filtering for VF %d VSI %d during VF rebuild, error %d\n", + vf->vf_id, vsi->idx, err); + } + + return 0; +} + +/** + * ice_vf_rebuild_dcf_vlan_cfg - Config DCF outer VLAN for VF + * @vf: VF to add outer VLAN for + * @vsi: Pointer to VSI + */ +static int ice_vf_rebuild_dcf_vlan_cfg(struct ice_vf *vf, struct ice_vsi *vsi) +{ + struct ice_dcf_vlan_info *dcf_vlan = &vf->dcf_vlan_info; + struct device *dev = ice_pf_to_dev(vf->pf); + int err; + + if (!ice_is_dcf_enabled(vf->pf) || !dcf_vlan->applying) { + memset(dcf_vlan, 0, sizeof(*dcf_vlan)); + return 0; + } + + dcf_vlan->applying = 0; + + if (dcf_vlan->outer_port_vlan.vid) { + err = ice_vf_vsi_dcf_set_outer_port_vlan(vsi, &dcf_vlan->outer_port_vlan); + if (err) { + dev_err(dev, "failed to configure outer port VLAN via DCF for VF %u, error %d\n", + vf->vf_id, err); + return err; + } + } + + if (dcf_vlan->outer_stripping_ena) { + err = ice_vf_vsi_dcf_ena_outer_vlan_stripping(vsi, dcf_vlan->outer_stripping_tpid); + if (err) { + dev_err(dev, "failed to enable outer VLAN stripping via DCF for VF %u, error %d\n", + vf->vf_id, err); + return err; + } + } + + return 0; +} + +static int ice_cfg_mac_antispoof(struct ice_vsi *vsi, bool enable) +{ + struct ice_vsi_ctx *ctx; + enum ice_status status; + int err = 0; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->info.sec_flags = vsi->info.sec_flags; + ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID); + + if (enable) + ctx->info.sec_flags |= ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF; else - vf->num_mac = 1; + ctx->info.sec_flags &= ~ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF; - /* Clear this bit after VF initialization since we shouldn't reclaim - * and reassign interrupts for synchronous or asynchronous VFR events. - * We don't want to reconfigure interrupts since AVF driver doesn't - * expect vector assignment to be changed unless there is a request for - * more vectors. - */ -ice_alloc_vsi_res_exit: - ice_free_fltr_list(&pf->pdev->dev, &tmp_add_list); - return status; + status = ice_update_vsi(&vsi->back->hw, vsi->idx, ctx, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "Failed to configure Tx MAC anti-spoof %s for VSI %d, error %s\n", + enable ? 
"ON" : "OFF", vsi->vsi_num, + ice_stat_str(status)); + err = ice_status_to_errno(status); + } else { + vsi->info.sec_flags = ctx->info.sec_flags; + } + + kfree(ctx); + + return err; } /** - * ice_alloc_vf_res - Allocate VF resources - * @vf: pointer to the VF structure + * ice_vsi_ena_spoofchk - enable Tx spoof checking for this VSI + * @vsi: VSI to enable Tx spoof checking for */ -static int ice_alloc_vf_res(struct ice_vf *vf) +static int ice_vsi_ena_spoofchk(struct ice_vsi *vsi) { - struct ice_pf *pf = vf->pf; - int tx_rx_queue_left; - int status; + struct ice_vsi_vlan_ops *vlan_ops; + int err; - /* Update number of VF queues, in case VF had requested for queue - * changes - */ - tx_rx_queue_left = min_t(int, ice_get_avail_txq_count(pf), - ice_get_avail_rxq_count(pf)); - tx_rx_queue_left += ICE_DFLT_QS_PER_VF; - if (vf->num_req_qs && vf->num_req_qs <= tx_rx_queue_left && - vf->num_req_qs != vf->num_vf_qs) - vf->num_vf_qs = vf->num_req_qs; + vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); - /* setup VF VSI and necessary resources */ - status = ice_alloc_vsi_res(vf); - if (status) - goto ice_alloc_vf_res_exit; + err = vlan_ops->ena_tx_filtering(vsi); + if (err) + return err; - if (vf->trusted) - set_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); + err = ice_cfg_mac_antispoof(vsi, true); + if (err) + return err; + + return 0; +} + +/** + * ice_vsi_dis_spoofchk - disable Tx spoof checking for this VSI + * @vsi: VSI to disable Tx spoof checking for + */ +static int ice_vsi_dis_spoofchk(struct ice_vsi *vsi) +{ + struct ice_vsi_vlan_ops *vlan_ops; + int err; + + vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); + + err = vlan_ops->dis_tx_filtering(vsi); + if (err) + return err; + + err = ice_cfg_mac_antispoof(vsi, false); + if (err) + return err; + + return 0; +} + +/** + * ice_vf_set_spoofchk_cfg - apply Tx spoof checking setting + * @vf: VF set spoofchk for + * @vsi: VSI associated to the VF + */ +static int +ice_vf_set_spoofchk_cfg(struct ice_vf *vf, struct ice_vsi *vsi) +{ + int err; + + if (vf->spoofchk) + err = ice_vsi_ena_spoofchk(vsi); else - clear_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); + err = ice_vsi_dis_spoofchk(vsi); - /* VF is now completely initialized */ - set_bit(ICE_VF_STATE_INIT, vf->vf_states); + return err; +} - return status; +/** + * ice_vf_rebuild_adq_port_vlan_cfg - set the port VLAN for VF ADQ VSIs + * @vf: VF to add MAC filters for + * + * Called after a VF ADQ VSI has been re-added/rebuilt during reset. + */ +static int ice_vf_rebuild_adq_port_vlan_cfg(struct ice_vf *vf) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + int err, tc; -ice_alloc_vf_res_exit: - ice_free_vf_res(vf); - return status; + for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) { + struct ice_vsi *vsi; + + if (!ice_vf_adq_vsi_valid(vf, tc)) + continue; + + vsi = ice_get_vf_adq_vsi(vf, tc); + err = ice_vf_rebuild_host_vlan_cfg(vf, vsi); + if (err) { + dev_err(dev, "failed to configure port VLAN via VSI parameters for VF %u, ADQ VSI(num %u), error %d\n", + vf->vf_id, vsi->vsi_num, err); + return err; + } + } + return 0; } /** - * ice_ena_vf_mappings - * @vf: pointer to the VF structure + * ice_vf_rebuild_adq_spoofchk_cfg - set the spoofchk config for VF ADQ VSIs + * @vf: VF to set spoofchk for * - * Enable VF vectors and queues allocation by writing the details into - * respective registers. + * Called after a VF ADQ VSI has been re-added/rebuilt during reset. 
*/ -static void ice_ena_vf_mappings(struct ice_vf *vf) +static int ice_vf_rebuild_adq_spoofchk_cfg(struct ice_vf *vf) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + int err, tc; + + for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) { + struct ice_vsi *vsi; + + if (!ice_vf_adq_vsi_valid(vf, tc)) + continue; + + vsi = ice_get_vf_adq_vsi(vf, tc); + err = ice_vf_set_spoofchk_cfg(vf, vsi); + if (err) { + dev_err(dev, "failed to configure spoofchk via VSI parameters for VF %u, ADQ VSI(num %u), error %d\n", + vf->vf_id, vsi->vsi_num, err); + return err; + } + } + return 0; +} + +/** + * ice_vf_rebuild_host_mac_cfg - add broadcast and the VF's perm_addr/LAA + * @vf: VF to add MAC filters for + * + * Called after a VF VSI has been re-added/rebuilt during reset. The PF driver + * always re-adds a broadcast filter and the VF's perm_addr/LAA after reset. + */ +static int ice_vf_rebuild_host_mac_cfg(struct ice_vf *vf) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_vsi *vsi = ice_get_vf_vsi(vf); + enum ice_status status; + u8 broadcast[ETH_ALEN]; + + if (ice_is_eswitch_mode_switchdev(vf->pf)) + return 0; + + eth_broadcast_addr(broadcast); + status = ice_fltr_add_mac(vsi, broadcast, ICE_FWD_TO_VSI); + if (status) { + dev_err(dev, "failed to add broadcast MAC filter for VF %u, error %s\n", + vf->vf_id, ice_stat_str(status)); + return ice_status_to_errno(status); + } + + vf->num_mac++; + + if (is_valid_ether_addr(vf->hw_lan_addr.addr)) { + status = ice_fltr_add_mac(vsi, vf->hw_lan_addr.addr, + ICE_FWD_TO_VSI); + if (status) { + dev_err(dev, "failed to add default unicast MAC filter %pM for VF %u, error %s\n", + &vf->hw_lan_addr.addr[0], vf->vf_id, + ice_stat_str(status)); + return ice_status_to_errno(status); + } + vf->num_mac++; + + ether_addr_copy(vf->dev_lan_addr.addr, vf->hw_lan_addr.addr); + } + + return 0; +} + +/** + * ice_vf_set_host_trust_cfg - set trust setting based on pre-reset value + * @vf: VF to configure trust setting for + */ +static void ice_vf_set_host_trust_cfg(struct ice_vf *vf) +{ + if (vf->trusted) + set_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); + else + clear_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); +} + +/** + * ice_ena_vf_msix_mappings - enable VF MSIX mappings in hardware + * @vf: VF to enable MSIX mappings for + * + * Some of the registers need to be indexed/configured using hardware global + * device values and other registers need 0-based values, which represent PF + * based values. 
+ */ +static void ice_ena_vf_msix_mappings(struct ice_vf *vf) { - int abs_vf_id, abs_first, abs_last; + int device_based_first_msix, device_based_last_msix; + int pf_based_first_msix, pf_based_last_msix, v; struct ice_pf *pf = vf->pf; - struct ice_vsi *vsi; - int first, last, v; + int device_based_vf_id; struct ice_hw *hw; u32 reg; hw = &pf->hw; - vsi = pf->vsi[vf->lan_vsi_idx]; - first = vf->first_vector_idx; - last = (first + pf->num_vf_msix) - 1; - abs_first = first + pf->hw.func_caps.common_cap.msix_vector_first_id; - abs_last = (abs_first + pf->num_vf_msix) - 1; - abs_vf_id = vf->vf_id + hw->func_caps.vf_base_id; - - /* VF Vector allocation */ - reg = (((abs_first << VPINT_ALLOC_FIRST_S) & VPINT_ALLOC_FIRST_M) | - ((abs_last << VPINT_ALLOC_LAST_S) & VPINT_ALLOC_LAST_M) | - VPINT_ALLOC_VALID_M); + + pf_based_first_msix = vf->first_vector_idx; + pf_based_last_msix = (pf_based_first_msix + pf->num_msix_per_vf) - 1; + + device_based_first_msix = pf_based_first_msix + + pf->hw.func_caps.common_cap.msix_vector_first_id; + device_based_last_msix = + (device_based_first_msix + pf->num_msix_per_vf) - 1; + device_based_vf_id = vf->vf_id + hw->func_caps.vf_base_id; + + reg = (((device_based_first_msix << VPINT_ALLOC_FIRST_S) & + VPINT_ALLOC_FIRST_M) | + ((device_based_last_msix << VPINT_ALLOC_LAST_S) & + VPINT_ALLOC_LAST_M) | VPINT_ALLOC_VALID_M); wr32(hw, VPINT_ALLOC(vf->vf_id), reg); - reg = (((abs_first << VPINT_ALLOC_PCI_FIRST_S) + reg = (((device_based_first_msix << VPINT_ALLOC_PCI_FIRST_S) & VPINT_ALLOC_PCI_FIRST_M) | - ((abs_last << VPINT_ALLOC_PCI_LAST_S) & VPINT_ALLOC_PCI_LAST_M) | - VPINT_ALLOC_PCI_VALID_M); + ((device_based_last_msix << VPINT_ALLOC_PCI_LAST_S) & + VPINT_ALLOC_PCI_LAST_M) | VPINT_ALLOC_PCI_VALID_M); wr32(hw, VPINT_ALLOC_PCI(vf->vf_id), reg); + /* map the interrupts to its functions */ - for (v = first; v <= last; v++) { - reg = (((abs_vf_id << GLINT_VECT2FUNC_VF_NUM_S) & + for (v = pf_based_first_msix; v <= pf_based_last_msix; v++) { + reg = (((device_based_vf_id << GLINT_VECT2FUNC_VF_NUM_S) & GLINT_VECT2FUNC_VF_NUM_M) | ((hw->pf_id << GLINT_VECT2FUNC_PF_NUM_S) & GLINT_VECT2FUNC_PF_NUM_M)); wr32(hw, GLINT_VECT2FUNC(v), reg); } - /* Map mailbox interrupt. We put an explicit 0 here to remind us that - * VF admin queue interrupts will go to VF MSI-X vector 0. 
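For reference, the VPINT_ALLOC writes above are plain shift-and-mask packing of the first/last device-based vector plus a valid bit. A tiny model of the pattern; the shifts and masks below are placeholders, not the real hardware layout (that comes from the driver's autogenerated *_S/*_M register macros):

#include <stdint.h>
#include <stdio.h>

/* placeholder field layout, NOT the real VPINT_ALLOC definition */
#define EX_FIRST_S 0
#define EX_FIRST_M (0x7FFu << EX_FIRST_S)
#define EX_LAST_S  12
#define EX_LAST_M  (0x7FFu << EX_LAST_S)
#define EX_VALID_M (1u << 31)

int main(void)
{
	uint32_t first = 165, last = 181; /* example device-based vector range */
	uint32_t reg = ((first << EX_FIRST_S) & EX_FIRST_M) |
		       ((last << EX_LAST_S) & EX_LAST_M) | EX_VALID_M;

	printf("packed VPINT_ALLOC-style value: 0x%08x\n", (unsigned int)reg);
	return 0;
}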
- */ - wr32(hw, VPINT_MBX_CTL(abs_vf_id), VPINT_MBX_CTL_CAUSE_ENA_M | 0); + /* Map mailbox interrupt to VF VSI VF MSI-X vector 0 */ + wr32(hw, VPINT_MBX_CTL(device_based_vf_id), VPINT_MBX_CTL_CAUSE_ENA_M); +} + +/** + * ice_ena_vf_q_mappings - enable Rx/Tx queue mappings for a VF + * @vf: VF to enable the mappings for + * @max_txq: max Tx queues allowed on the VF's VSI + * @max_rxq: max Rx queues allowed on the VF's VSI + */ +static void ice_ena_vf_q_mappings(struct ice_vf *vf, u16 max_txq, u16 max_rxq) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_vsi *vsi = ice_get_vf_vsi(vf); + struct ice_hw *hw = &vf->pf->hw; + u32 reg; + /* set regardless of mapping mode */ wr32(hw, VPLAN_TXQ_MAPENA(vf->vf_id), VPLAN_TXQ_MAPENA_TX_ENA_M); @@ -682,12 +1478,11 @@ static void ice_ena_vf_mappings(struct ice_vf *vf) */ reg = (((vsi->txq_map[0] << VPLAN_TX_QBASE_VFFIRSTQ_S) & VPLAN_TX_QBASE_VFFIRSTQ_M) | - (((vsi->alloc_txq - 1) << VPLAN_TX_QBASE_VFNUMQ_S) & + (((max_txq - 1) << VPLAN_TX_QBASE_VFNUMQ_S) & VPLAN_TX_QBASE_VFNUMQ_M)); wr32(hw, VPLAN_TX_QBASE(vf->vf_id), reg); } else { - dev_err(&pf->pdev->dev, - "Scattered mode for VF Tx queues is not yet implemented\n"); + dev_err(dev, "Scattered mode for VF Tx queues is not yet implemented\n"); } /* set regardless of mapping mode */ @@ -701,13 +1496,38 @@ static void ice_ena_vf_mappings(struct ice_vf *vf) */ reg = (((vsi->rxq_map[0] << VPLAN_RX_QBASE_VFFIRSTQ_S) & VPLAN_RX_QBASE_VFFIRSTQ_M) | - (((vsi->alloc_txq - 1) << VPLAN_RX_QBASE_VFNUMQ_S) & + (((max_rxq - 1) << VPLAN_RX_QBASE_VFNUMQ_S) & VPLAN_RX_QBASE_VFNUMQ_M)); wr32(hw, VPLAN_RX_QBASE(vf->vf_id), reg); } else { - dev_err(&pf->pdev->dev, - "Scattered mode for VF Rx queues is not yet implemented\n"); + dev_err(dev, "Scattered mode for VF Rx queues is not yet implemented\n"); + } +} + +/** + * ice_ena_vf_mappings - enable VF MSIX and queue mapping + * @vf: pointer to the VF structure + */ +static void ice_ena_vf_mappings(struct ice_vf *vf) +{ + struct ice_vsi *vsi = ice_get_vf_vsi(vf); + u16 max_txq, max_rxq; + + ice_ena_vf_msix_mappings(vf); + + if (ice_is_vf_adq_ena(vf)) { + u16 offset, num_qps; + + offset = vf->ch[vf->num_tc - 1].offset; + num_qps = vf->ch[vf->num_tc - 1].num_qps; + max_txq = offset + num_qps; + max_rxq = offset + num_qps; + } else { + max_txq = vsi->alloc_txq; + max_rxq = vsi->alloc_rxq; } + + ice_ena_vf_q_mappings(vf, max_txq, max_rxq); } /** @@ -753,19 +1573,26 @@ ice_determine_res(struct ice_pf *pf, u16 avail_res, u16 max_res, u16 min_res) * ice_calc_vf_reg_idx - Calculate the VF's register index in the PF space * @vf: VF to calculate the register index for * @q_vector: a q_vector associated to the VF + * @tc: Traffic class number for VF ADQ */ -int ice_calc_vf_reg_idx(struct ice_vf *vf, struct ice_q_vector *q_vector) +int ice_calc_vf_reg_idx(struct ice_vf *vf, struct ice_q_vector *q_vector, + u8 __maybe_unused tc) { struct ice_pf *pf; + u32 reg_idx; if (!vf || !q_vector) return -EINVAL; pf = vf->pf; - /* always add one to account for the OICR being the first MSIX */ - return pf->sriov_base_vector + pf->num_vf_msix * vf->vf_id + - q_vector->v_idx + 1; + reg_idx = pf->sriov_base_vector + pf->num_msix_per_vf * vf->vf_id + + q_vector->v_idx + 1; + + if (tc && ice_is_vf_adq_ena(vf)) + return reg_idx + vf->ch[tc].offset; + else + return reg_idx; } /** @@ -797,260 +1624,551 @@ static int ice_get_max_valid_res_idx(struct ice_res_tracker *res) * @num_msix_needed: number of MSIX vectors needed for all SR-IOV VFs * * This function allows SR-IOV resources to be taken from 
the end of the PF's - * allowed HW MSIX vectors so in many cases the irq_tracker will not - * be needed. In these cases we just set the pf->sriov_base_vector and return - * success. + * allowed HW MSIX vectors so that the irq_tracker will not be affected. We + * just set the pf->sriov_base_vector and return success. * - * If SR-IOV needs to use any pf->irq_tracker entries it updates the - * irq_tracker->end based on the first entry needed for SR-IOV. This makes it - * so any calls to ice_get_res() using the irq_tracker will not try to use - * resources at or beyond the newly set value. + * If there are not enough resources available, return an error. This should + * always be caught by ice_set_per_vf_res(). * - * Return 0 on success, and -EINVAL when there are not enough MSIX vectors in + * Return 0 on success, and -EINVAL when there are not enough MSIX vectors * in the PF's space available for SR-IOV. */ static int ice_sriov_set_msix_res(struct ice_pf *pf, u16 num_msix_needed) { - int max_valid_res_idx = ice_get_max_valid_res_idx(pf->irq_tracker); - u16 pf_total_msix_vectors = - pf->hw.func_caps.common_cap.num_msix_vectors; - struct ice_res_tracker *res = pf->irq_tracker; + u16 total_vectors = pf->hw.func_caps.common_cap.num_msix_vectors; + int vectors_used = pf->irq_tracker->num_entries; int sriov_base_vector; - if (max_valid_res_idx < 0) - return max_valid_res_idx; - - sriov_base_vector = pf_total_msix_vectors - num_msix_needed; + sriov_base_vector = total_vectors - num_msix_needed; /* make sure we only grab irq_tracker entries from the list end and * that we have enough available MSIX vectors */ - if (sriov_base_vector <= max_valid_res_idx) + if (sriov_base_vector < vectors_used) return -EINVAL; pf->sriov_base_vector = sriov_base_vector; - /* dip into irq_tracker entries and update used resources */ - if (num_msix_needed > (pf_total_msix_vectors - res->num_entries)) { - pf->num_avail_sw_msix -= - res->num_entries - pf->sriov_base_vector; - res->end = pf->sriov_base_vector; - } - return 0; } /** - * ice_check_avail_res - check if vectors and queues are available + * ice_set_per_vf_res - check if vectors and queues are available * @pf: pointer to the PF structure * - * This function is where we calculate actual number of resources for VF VSIs, - * we don't reserve ahead of time during probe. Returns success if vectors and - * queues resources are available, otherwise returns error code + * First, determine HW interrupts from common pool. If we allocate fewer VFs, we + * get more vectors and can enable more queues per VF. Note that this does not + * grab any vectors from the SW pool already allocated. Also note, that all + * vector counts include one for each VF's miscellaneous interrupt vector + * (i.e. OICR). + * + * Minimum VFs - 2 vectors, 1 queue pair + * Small VFs - 5 vectors, 4 queue pairs + * Medium VFs - 17 vectors, 16 queue pairs + * + * While more vectors can be assigned to a VF, the RSS LUT + * is only 4 bits wide, so we can only do 16 queues of RSS + * per VF. + * + * ADQ sizes: + * Small ADQ VFs - 5 vectors, 4 TCs, 16 queue pairs (4 queue pairs/int) + * Medium ADQ VFs - 17 vectors, 4 TCs, 16 queue pairs (1 queue pairs/int) + * + * Second, determine number of queue pairs per VF by starting with a pre-defined + * maximum each VF supports. If this is not possible, then we adjust based on + * queue pairs available on the device. + * + * Lastly, set queue and MSI-X VF variables tracked by the PF so it can be used + * by each VF during VF initialization and reset. 
*/ -static int ice_check_avail_res(struct ice_pf *pf) +static int ice_set_per_vf_res(struct ice_pf *pf) { int max_valid_res_idx = ice_get_max_valid_res_idx(pf->irq_tracker); - u16 num_msix, num_txq, num_rxq, num_avail_msix; + int msix_avail_per_vf, msix_avail_for_sriov; + struct device *dev = ice_pf_to_dev(pf); + u16 num_msix_per_vf, num_txq, num_rxq; if (!pf->num_alloc_vfs || max_valid_res_idx < 0) return -EINVAL; - /* add 1 to max_valid_res_idx to account for it being 0-based */ - num_avail_msix = pf->hw.func_caps.common_cap.num_msix_vectors - - (max_valid_res_idx + 1); - - /* Grab from HW interrupts common pool - * Note: By the time the user decides it needs more vectors in a VF - * its already too late since one must decide this prior to creating the - * VF interface. So the best we can do is take a guess as to what the - * user might want. - * - * We have two policies for vector allocation: - * 1. if num_alloc_vfs is from 1 to 16, then we consider this as small - * number of NFV VFs used for NFV appliances, since this is a special - * case, we try to assign maximum vectors per VF (65) as much as - * possible, based on determine_resources algorithm. - * 2. if num_alloc_vfs is from 17 to 256, then its large number of - * regular VFs which are not used for any special purpose. Hence try to - * grab default interrupt vectors (5 as supported by AVF driver). - */ - if (pf->num_alloc_vfs <= 16) { - num_msix = ice_determine_res(pf, num_avail_msix, - ICE_MAX_INTR_PER_VF, - ICE_MIN_INTR_PER_VF); - } else if (pf->num_alloc_vfs <= ICE_MAX_VF_COUNT) { - num_msix = ice_determine_res(pf, num_avail_msix, - ICE_DFLT_INTR_PER_VF, - ICE_MIN_INTR_PER_VF); + /* determine MSI-X resources per VF */ + msix_avail_for_sriov = pf->hw.func_caps.common_cap.num_msix_vectors - + pf->irq_tracker->num_entries; + msix_avail_per_vf = msix_avail_for_sriov / pf->num_alloc_vfs; + if (msix_avail_per_vf >= ICE_NUM_VF_MSIX_MAX) { + num_msix_per_vf = ICE_NUM_VF_MSIX_MAX; + } else if (msix_avail_per_vf >= ICE_NUM_VF_MSIX_LARGE) { + num_msix_per_vf = ICE_NUM_VF_MSIX_LARGE; + } else if (msix_avail_per_vf >= ICE_NUM_VF_MSIX_MED) { + num_msix_per_vf = ICE_NUM_VF_MSIX_MED; + } else if (msix_avail_per_vf >= ICE_NUM_VF_MSIX_SMALL) { + num_msix_per_vf = ICE_NUM_VF_MSIX_SMALL; + } else if (msix_avail_per_vf >= ICE_NUM_VF_MSIX_MULTIQ_MIN) { + num_msix_per_vf = ICE_NUM_VF_MSIX_MULTIQ_MIN; + } else if (msix_avail_per_vf >= ICE_MIN_INTR_PER_VF) { + num_msix_per_vf = ICE_MIN_INTR_PER_VF; } else { - dev_err(&pf->pdev->dev, - "Number of VFs %d exceeds max VF count %d\n", - pf->num_alloc_vfs, ICE_MAX_VF_COUNT); + dev_err(dev, "Only %d MSI-X interrupts available for SR-IOV. Not enough to support minimum of %d MSI-X interrupts per VF for %d VFs\n", + msix_avail_for_sriov, ICE_MIN_INTR_PER_VF, + pf->num_alloc_vfs); return -EIO; } - if (!num_msix) - return -EIO; - - /* Grab from the common pool - * start by requesting Default queues (4 as supported by AVF driver), - * Note that, the main difference between queues and vectors is, latter - * can only be reserved at init time but queues can be requested by VF - * at runtime through Virtchnl, that is the reason we start by reserving - * few queues. 
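The tier selection just above is an integer division followed by a descending ladder; the 65/17/5/2 figures are the ones quoted in the comments of this patch, while the LARGE and MULTIQ_MIN tiers are left out of the sketch because their values are not spelled out here. A runnable model:

#include <stdio.h>

/* tier sizes taken from the comments in this hunk (65 max, 17 medium,
 * 5 small, 2 minimum); intermediate tiers omitted
 */
static int pick_msix_per_vf(int msix_avail_for_sriov, int num_vfs)
{
	int per_vf = msix_avail_for_sriov / num_vfs;

	if (per_vf >= 65)
		return 65;
	if (per_vf >= 17)
		return 17;
	if (per_vf >= 5)
		return 5;
	if (per_vf >= 2)
		return 2;
	return -1; /* not enough for the minimum per-VF allocation */
}

int main(void)
{
	/* example: 928 vectors left for SR-IOV, 64 VFs requested -> 5 per VF */
	printf("MSI-X per VF: %d\n", pick_msix_per_vf(928, 64));
	return 0;
}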
- */ + /* determine queue resources per VF */ num_txq = ice_determine_res(pf, ice_get_avail_txq_count(pf), - ICE_DFLT_QS_PER_VF, ICE_MIN_QS_PER_VF); + min_t(u16, + num_msix_per_vf - ICE_NONQ_VECS_VF, + ICE_MAX_DFLT_QS_PER_VF), + ICE_MIN_QS_PER_VF); num_rxq = ice_determine_res(pf, ice_get_avail_rxq_count(pf), - ICE_DFLT_QS_PER_VF, ICE_MIN_QS_PER_VF); - - if (!num_txq || !num_rxq) + min_t(u16, + num_msix_per_vf - ICE_NONQ_VECS_VF, + ICE_MAX_DFLT_QS_PER_VF), + ICE_MIN_QS_PER_VF); + + if (!num_txq || !num_rxq) { + dev_err(dev, "Not enough queues to support minimum of %d queue pairs per VF for %d VFs\n", + ICE_MIN_QS_PER_VF, pf->num_alloc_vfs); return -EIO; + } - if (ice_sriov_set_msix_res(pf, num_msix * pf->num_alloc_vfs)) + if (ice_sriov_set_msix_res(pf, num_msix_per_vf * pf->num_alloc_vfs)) { + dev_err(dev, "Unable to set MSI-X resources for %d VFs\n", + pf->num_alloc_vfs); return -EINVAL; + } - /* since AVF driver works with only queue pairs which means, it expects - * to have equal number of Rx and Tx queues, so take the minimum of - * available Tx or Rx queues - */ - pf->num_vf_qps = min_t(int, num_txq, num_rxq); - pf->num_vf_msix = num_msix; + /* only allow equal Tx/Rx queue count (i.e. queue pairs) */ + pf->num_qps_per_vf = min_t(int, num_txq, num_rxq); + pf->num_msix_per_vf = num_msix_per_vf; + dev_info(dev, "Enabling %d VFs with %d vectors and %d queues per VF\n", + pf->num_alloc_vfs, pf->num_msix_per_vf, pf->num_qps_per_vf); return 0; } /** - * ice_cleanup_and_realloc_vf - Clean up VF and reallocate resources after reset - * @vf: pointer to the VF structure - * - * Cleanup a VF after the hardware reset is finished. Expects the caller to - * have verified whether the reset is finished properly, and ensure the - * minimum amount of wait time has passed. Reallocate VF resources back to make - * VF state active + * ice_clear_vf_reset_trigger - enable VF to access hardware + * @vf: VF to enabled hardware access for */ -static void ice_cleanup_and_realloc_vf(struct ice_vf *vf) +static void ice_clear_vf_reset_trigger(struct ice_vf *vf) { - struct ice_pf *pf = vf->pf; - struct ice_hw *hw; + struct ice_hw *hw = &vf->pf->hw; u32 reg; - hw = &pf->hw; - - /* PF software completes the flow by notifying VF that reset flow is - * completed. This is done by enabling hardware by clearing the reset - * bit in the VPGEN_VFRTRIG reg and setting VFR_STATE in the VFGEN_RSTAT - * register to VFR completed (done at the end of this function) - * By doing this we allow HW to access VF memory at any point. If we - * did it any sooner, HW could access memory while it was being freed - * in ice_free_vf_res(), causing an IOMMU fault. - * - * On the other hand, this needs to be done ASAP, because the VF driver - * is waiting for this to happen and may report a timeout. It's - * harmless, but it gets logged into Guest OS kernel log, so best avoid - * it. 
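The queue sizing at the top of this hunk follows directly from the vector count chosen above: one vector is reserved for the VF's miscellaneous/OICR interrupt (ICE_NONQ_VECS_VF), the 4-bit RSS LUT caps the useful count at 16 queues per VF, and Tx/Rx are forced to an equal pair count. ice_determine_res() also enforces a minimum and steps the request down when the pool is short; the model below only covers the common case where enough queues are available, and the constants 1 and 16 are assumed from the comments here:

#include <stdio.h>

static int min_int(int a, int b) { return a < b ? a : b; }

int main(void)
{
	int num_msix_per_vf = 17;
	int nonq_vecs_vf = 1;        /* OICR/mailbox vector, assumed */
	int max_dflt_qs_per_vf = 16; /* RSS LUT is 4 bits wide */
	int avail_txq = 200, avail_rxq = 180; /* example device-wide leftovers */

	int want = min_int(num_msix_per_vf - nonq_vecs_vf, max_dflt_qs_per_vf);
	int num_txq = min_int(want, avail_txq);
	int num_rxq = min_int(want, avail_rxq);
	int num_qps_per_vf = min_int(num_txq, num_rxq);

	printf("queue pairs per VF: %d\n", num_qps_per_vf);
	return 0;
}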
- */ reg = rd32(hw, VPGEN_VFRTRIG(vf->vf_id)); reg &= ~VPGEN_VFRTRIG_VFSWR_M; wr32(hw, VPGEN_VFRTRIG(vf->vf_id), reg); + ice_flush(hw); +} - /* reallocate VF resources to finish resetting the VSI state */ - if (!ice_alloc_vf_res(vf)) { - ice_ena_vf_mappings(vf); - set_bit(ICE_VF_STATE_ACTIVE, vf->vf_states); - clear_bit(ICE_VF_STATE_DIS, vf->vf_states); - vf->num_vlan = 0; +static int ice_vf_set_vsi_promisc(struct ice_vf *vf, struct ice_vsi *vsi, u8 promisc_m) +{ + struct ice_hw *hw = &vsi->back->hw; + u8 lport = vsi->port_info->lport; + enum ice_status status; + + if (ice_vf_is_port_vlan_ena(vf)) + status = ice_fltr_set_vsi_promisc(hw, vsi->idx, promisc_m, + ice_vf_get_port_vlan_id(vf), + lport); + else if (ice_vsi_has_non_zero_vlans(vsi)) + status = ice_fltr_set_vlan_vsi_promisc(hw, vsi, promisc_m); + else + status = ice_fltr_set_vsi_promisc(hw, vsi->idx, promisc_m, 0, lport); + + if (status && status != ICE_ERR_ALREADY_EXISTS) { + dev_err(ice_pf_to_dev(vsi->back), "enable Tx/Rx filter promiscuous mode on VF-%u failed, error: %s\n", + vf->vf_id, ice_stat_str(status)); + return ice_status_to_errno(status); } - /* Tell the VF driver the reset is done. This needs to be done only - * after VF has been fully initialized, because the VF driver may - * request resources immediately after setting this flag. - */ - wr32(hw, VFGEN_RSTAT(vf->vf_id), VIRTCHNL_VFR_VFACTIVE); + return 0; } -/** - * ice_vf_set_vsi_promisc - set given VF VSI to given promiscuous mode(s) - * @vf: pointer to the VF info - * @vsi: the VSI being configured - * @promisc_m: mask of promiscuous config bits - * @rm_promisc: promisc flag request from the VF to remove or add filter - * - * This function configures VF VSI promiscuous mode, based on the VF requests, - * for Unicast, Multicast and VLAN - */ -static enum ice_status -ice_vf_set_vsi_promisc(struct ice_vf *vf, struct ice_vsi *vsi, u8 promisc_m, - bool rm_promisc) +static int ice_vf_clear_vsi_promisc(struct ice_vf *vf, struct ice_vsi *vsi, u8 promisc_m) { - struct ice_pf *pf = vf->pf; - enum ice_status status = 0; - struct ice_hw *hw; + struct ice_hw *hw = &vsi->back->hw; + u8 lport = vsi->port_info->lport; + enum ice_status status; - hw = &pf->hw; - if (vf->num_vlan) { - status = ice_set_vlan_vsi_promisc(hw, vsi->idx, promisc_m, - rm_promisc); - } else if (vf->port_vlan_id) { - if (rm_promisc) - status = ice_clear_vsi_promisc(hw, vsi->idx, promisc_m, - vf->port_vlan_id); - else - status = ice_set_vsi_promisc(hw, vsi->idx, promisc_m, - vf->port_vlan_id); - } else { - if (rm_promisc) - status = ice_clear_vsi_promisc(hw, vsi->idx, promisc_m, - 0); - else - status = ice_set_vsi_promisc(hw, vsi->idx, promisc_m, - 0); + if (ice_vf_is_port_vlan_ena(vf)) + status = ice_fltr_clear_vsi_promisc(hw, vsi->idx, promisc_m, + ice_vf_get_port_vlan_id(vf), + lport); + else if (ice_vsi_has_non_zero_vlans(vsi)) + status = ice_fltr_clear_vlan_vsi_promisc(hw, vsi, promisc_m); + else + status = ice_fltr_clear_vsi_promisc(hw, vsi->idx, promisc_m, 0, lport); + + if (status && status != ICE_ERR_DOES_NOT_EXIST) { + dev_err(ice_pf_to_dev(vsi->back), "disable Tx/Rx filter promiscuous mode on VF-%u failed, error: %s\n", + vf->vf_id, ice_stat_str(status)); + return ice_status_to_errno(status); } - return status; + return 0; } -/** - * ice_config_res_vfs - Finalize allocation of VFs resources in one go - * @pf: pointer to the PF structure +static void ice_vf_clear_counters(struct ice_vf *vf) +{ + struct ice_vsi *vsi = ice_get_vf_vsi(vf); + + vf->num_mac = 0; + vsi->num_vlan = 0; + 
memset(&vf->mdd_tx_events, 0, sizeof(vf->mdd_tx_events)); + memset(&vf->mdd_rx_events, 0, sizeof(vf->mdd_rx_events)); +} + +/** + * ice_vf_pre_vsi_rebuild - tasks to be done prior to VSI rebuild + * @vf: VF to perfrom pre VSI rebuild tasks * - * This function is being called as last part of resetting all VFs, or when - * configuring VFs for the first time, where there is no resource to be freed - * Returns true if resources were properly allocated for all VFs, and false - * otherwise. + * These tasks are items that don't need to be amortized since they are most + * likely called in a for loop with all VF(s) in the reset_all_vfs() case. */ -static bool ice_config_res_vfs(struct ice_pf *pf) +static void ice_vf_pre_vsi_rebuild(struct ice_vf *vf) { - struct ice_hw *hw = &pf->hw; - int v; + /* Remove switch rules associated with the reset VF */ + ice_rm_dcf_sw_vsi_rule(vf->pf, vf->lan_vsi_num); - if (ice_check_avail_res(pf)) { - dev_err(&pf->pdev->dev, - "Cannot allocate VF resources, try with fewer number of VFs\n"); - return false; + if (ice_is_vf_dcf(vf)) { + if (vf->pf->hw.dcf_caps & DCF_ACL_CAP) + ice_acl_destroy_tbl(&vf->pf->hw); + ice_clear_dcf_udp_tunnel_cfg(vf->pf); } - /* rearm global interrupts */ - if (test_and_clear_bit(__ICE_OICR_INTR_DIS, pf->state)) - ice_irq_dynamic_ena(hw, NULL, NULL); + ice_vf_clear_counters(vf); + ice_clear_vf_reset_trigger(vf); +} + +/** + * ice_vf_rebuild_aggregator_node_cfg - rebuild aggregator node config + * @vsi: Pointer to VSI + * + * This function moves VSI into corresponding scheduler aggregator node + * based on cached value of "aggregator node info" per VSI + */ +static void ice_vf_rebuild_aggregator_node_cfg(struct ice_vsi *vsi) +{ + struct ice_pf *pf = vsi->back; + enum ice_status status; + struct device *dev; - /* Finish resetting each VF and allocate resources */ - for (v = 0; v < pf->num_alloc_vfs; v++) { - struct ice_vf *vf = &pf->vf[v]; + if (!vsi->agg_node) + return; - vf->num_vf_qs = pf->num_vf_qps; - dev_dbg(&pf->pdev->dev, - "VF-id %d has %d queues configured\n", - vf->vf_id, vf->num_vf_qs); - ice_cleanup_and_realloc_vf(vf); + dev = ice_pf_to_dev(pf); + if (vsi->agg_node->num_vsis == ICE_MAX_VSIS_IN_AGG_NODE) { + dev_dbg(dev, + "agg_id %u already has reached max_num_vsis %u\n", + vsi->agg_node->agg_id, vsi->agg_node->num_vsis); + return; } - ice_flush(hw); - clear_bit(__ICE_VF_DIS, pf->state); + status = ice_move_vsi_to_agg(pf->hw.port_info, vsi->agg_node->agg_id, + vsi->idx, (u8)vsi->tc_cfg.ena_tc); + if (status) + dev_dbg(dev, "unable to move VSI idx %u into aggregator %u node", + vsi->idx, vsi->agg_node->agg_id); + else + vsi->agg_node->num_vsis++; +} - return true; +/** + * ice_vf_rebuild_host_cfg - host admin configuration is persistent across reset + * @vf: VF to rebuild host configuration on + */ +static void ice_vf_rebuild_host_cfg(struct ice_vf *vf) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_vsi *vsi = ice_get_vf_vsi(vf); + + ice_vf_set_host_trust_cfg(vf); + + if (ice_vf_rebuild_host_mac_cfg(vf)) + dev_err(dev, "failed to rebuild default MAC configuration for VF %d\n", + vf->vf_id); + + if (ice_vf_rebuild_dcf_vlan_cfg(vf, vsi)) + dev_err(dev, "failed to rebuild DCF VLAN configuration for VF %u\n", + vf->vf_id); + + if (ice_vf_rebuild_host_vlan_cfg(vf, vsi)) + dev_err(dev, "failed to rebuild VLAN configuration for VF %u\n", + vf->vf_id); + + if (ice_vf_rebuild_host_tx_rate_cfg(vf)) + dev_err(dev, "failed to rebuild Tx rate limiting configuration for VF %u\n", + vf->vf_id); + + if (ice_vf_set_spoofchk_cfg(vf, 
vsi)) + dev_err(dev, "failed to rebuild spoofchk configuration for VF %d\n", + vf->vf_id); + + /* rebuild aggregator node config for main VF VSI */ + ice_vf_rebuild_aggregator_node_cfg(vsi); +} + +/** + * ice_vf_rebuild_adq_aggregator_node - move ADQ VSIs into aggregator node + * @vf: VF to rebuild ADQ VSI(s) Tx rate configuration on + * + * If VF ADQ is enabled, replay scheduler aggregator node config + */ +static void ice_vf_rebuild_adq_aggregator_node(struct ice_vf *vf) +{ + int tc; + + if (!ice_is_vf_adq_ena(vf)) + return; + + for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) { + struct ice_vsi *vsi; + + if (!ice_vf_adq_vsi_valid(vf, tc)) + continue; + vsi = ice_get_vf_adq_vsi(vf, tc); + ice_vf_rebuild_aggregator_node_cfg(vsi); + } +} + +/** + * ice_vf_rebuild_adq_tx_rate_cfg - rebuild ADQ VSI(s) Tx rate configuration + * @vf: VF to rebuild ADQ VSI(s) Tx rate configuration on + */ +static void ice_vf_rebuild_adq_tx_rate_cfg(struct ice_vf *vf) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_vsi *vsi; + u64 max_tx_rate; + u8 tc; + + if (!ice_is_vf_adq_ena(vf)) + return; + + /* Host may have set Tx rate for VF, but use the TC0's specified + * max Tx rate for main VF VSI. + * Iterate thru' all VSI (hence for loop starts with zero) shared by + * given VF and set the BW limit if specified as part of + * VF ADQ TC config + */ + for (tc = 0; tc < vf->num_tc; tc++) { + if (!ice_vf_adq_vsi_valid(vf, tc)) + continue; + + max_tx_rate = vf->ch[tc].max_tx_rate; + if (!max_tx_rate) + continue; + + if (!tc && vf->max_tx_rate) + dev_dbg(dev, "Host managed VF rate limit %u for VF %d are being changed to %llu\n", + vf->max_tx_rate, vf->vf_id, max_tx_rate); + + vsi = ice_get_vf_adq_vsi(vf, tc); + if (ice_set_max_bw_limit(vsi, max_tx_rate * 1000)) + dev_err(dev, "Unable to set Tx rate %llu in Mbps for VF %u TC %d\n", + max_tx_rate, vf->vf_id, tc); + } +} + +/** + * ice_vf_rebuild_adq_host_cfg - host admin config is persistent across reset + * @vf: VF to rebuild ADQ host configuration on + */ +static void ice_vf_rebuild_adq_host_cfg(struct ice_vf *vf) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + + ice_vf_rebuild_adq_aggregator_node(vf); + ice_vf_rebuild_adq_tx_rate_cfg(vf); + if (ice_vf_rebuild_adq_port_vlan_cfg(vf)) + dev_err(dev, "failed to rebuild port VLAN configuration for ADQ enabled VF %u\n", + vf->vf_id); + if (ice_vf_rebuild_adq_spoofchk_cfg(vf)) + dev_err(dev, "failed to rebuild spoofchk configuration for ADQ enabled VF %u\n", + vf->vf_id); +} + +/** + * ice_vf_rebuild_adq_vsi_with_release - release and setup each ADQ VSI + * @vf: VF to re-apply ADQ configuration for + * + * This is only called when a single VF is being reset (i.e. VFR, VFLR, host VF + * configuration change, etc.). + * + * This cannot be called for the reset all VFs case as ice_vf_adq_vsi_release() + * will fail because there are no VF VSI(s) in firmware at this point. 
+ */ +static int ice_vf_rebuild_adq_vsi_with_release(struct ice_vf *vf) +{ + u8 tc; + + if (!ice_is_vf_adq_ena(vf)) + return 0; + + for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) { + if (ice_vf_adq_vsi_valid(vf, tc)) { + ice_vf_adq_vsi_stop_rings(vf, tc); + ice_vf_adq_vsi_disable_txqs(vf, tc); + ice_vf_adq_vsi_release(vf, tc); + } + + if (!ice_vf_adq_vsi_setup(vf, tc)) { + dev_err(ice_pf_to_dev(vf->pf), "failed to setup ADQ VSI for VF %u, TC %d, disabling VF ADQ VSI\n", + vf->vf_id, tc); + goto adq_cfg_failed; + } + } + + /* must to store away TC0's info because it is used later */ + vf->ch[0].vsi_idx = vf->lan_vsi_idx; + vf->ch[0].vsi_num = vf->lan_vsi_num; + + return 0; + +adq_cfg_failed: + /* perform VSI release for ADQ VSI if some of them were + * created successfully. + */ + for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) { + if (ice_vf_adq_vsi_valid(vf, tc)) { + ice_vf_adq_vsi_disable_txqs(vf, tc); + ice_vf_adq_vsi_release(vf, tc); + } + ice_vf_adq_cfg_cleanup(vf, tc); + } + vf->adq_enabled = false; + vf->num_tc = 0; + /* Upon failure also clean up tc=0 specific info from + * software data structs, to avoid having stale info + */ + ice_vf_adq_invalidate_vsi(vf, 0); + ice_vf_adq_cfg_cleanup(vf, 0); + return -ENOMEM; +} + +/** + * ice_vf_rebuild_adq_vsi - rebuild ADQ VSI(s) on the VF + * @vf: VF to rebuild ADQ VSI(s) on + */ +static int ice_vf_rebuild_adq_vsi(struct ice_vf *vf) +{ + struct ice_pf *pf = vf->pf; + int tc; + + /* no ADQ configured, nothing to do */ + if (!ice_is_vf_adq_ena(vf)) + return 0; + + for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) { + struct ice_vsi *vsi; + int ret; + + if (!ice_vf_adq_vsi_valid(vf, tc)) + continue; + + vsi = ice_get_vf_adq_vsi(vf, tc); + ret = ice_vsi_rebuild(vsi, true); + if (ret) { + dev_err(ice_pf_to_dev(pf), "failed to rebuild ADQ VSI for VF %u, disabling VF ADQ VSI\n", + vf->vf_id); + vf->adq_enabled = false; + ice_vf_adq_invalidate_vsi(vf, tc); + return ret; + } + + vsi->vsi_num = ice_get_hw_vsi_num(&pf->hw, vsi->idx); + vf->ch[tc].vsi_num = vsi->vsi_num; + vf->ch[tc].vsi_idx = vsi->idx; + } + + /* must to store away TC0's info because it is use later */ + vf->ch[0].vsi_idx = vf->lan_vsi_idx; + vf->ch[0].vsi_num = vf->lan_vsi_num; + + return 0; +} + +/** + * ice_vf_rebuild_vsi_with_release - release and setup the VF's VSI + * @vf: VF to release and setup the VSI for + * + * This is only called when a single VF is being reset (i.e. VFR, VFLR, host VF + * configuration change, etc.). + */ +static int ice_vf_rebuild_vsi_with_release(struct ice_vf *vf) +{ + ice_vf_vsi_release(vf); + if (!ice_vf_vsi_setup(vf)) + return -ENOMEM; + + ice_vf_rebuild_adq_vsi_with_release(vf); + + return 0; +} + +/** + * ice_vf_rebuild_vsi - rebuild the VF's VSI + * @vf: VF to rebuild the VSI for + * + * This is only called when all VF(s) are being reset (i.e. PCIe Reset on the + * host, PFR, CORER, etc.). 
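To keep the two rebuild flavours straight: a single-VF reset (VFR, VFLR, host configuration change) goes through the release-and-recreate helper, while the reset-all path uses the in-place rebuild helper that follows, because firmware has already dropped the VF VSIs at that point. An illustrative fragment of how a reset path would pick between them, assuming a caller-provided flag for the reset scope (the real callers are ice_reset_vf() and ice_reset_all_vfs() elsewhere in this patch):

/* illustrative only: single-VF resets recreate the VSI, the
 * reset-all path rebuilds it in place (see the kdocs above/below)
 */
static int example_rebuild_vf_vsi(struct ice_vf *vf, bool single_vf_reset)
{
	if (single_vf_reset)
		return ice_vf_rebuild_vsi_with_release(vf);

	return ice_vf_rebuild_vsi(vf);
}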
+ */ +static int ice_vf_rebuild_vsi(struct ice_vf *vf) +{ + struct ice_vsi *vsi = ice_get_vf_vsi(vf); + struct ice_pf *pf = vf->pf; + + if (ice_vsi_rebuild(vsi, true)) { + dev_err(ice_pf_to_dev(pf), "failed to rebuild VF %d VSI\n", + vf->vf_id); + return -EIO; + } + /* vsi->idx will remain the same in this case so don't update + * vf->lan_vsi_idx + */ + vsi->vsi_num = ice_get_hw_vsi_num(&pf->hw, vsi->idx); + vf->lan_vsi_num = vsi->vsi_num; + + if (ice_vf_rebuild_adq_vsi(vf)) { + dev_err(ice_pf_to_dev(pf), "failed to rebuild ADQ configuration for VF %d\n", + vf->vf_id); + return -EIO; + } + + return 0; +} + +/** + * ice_vf_set_initialized - VF is ready for VIRTCHNL communication + * @vf: VF to set in initialized state + * + * After this function the VF will be ready to receive/handle the + * VIRTCHNL_OP_GET_VF_RESOURCES message + */ +static void ice_vf_set_initialized(struct ice_vf *vf) +{ + ice_set_vf_state_qs_dis(vf); + clear_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states); + clear_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states); + clear_bit(ICE_VF_STATE_DIS, vf->vf_states); + set_bit(ICE_VF_STATE_INIT, vf->vf_states); + memset(&vf->vlan_v2_caps, 0, sizeof(vf->vlan_v2_caps)); +} + +/** + * ice_vf_post_vsi_rebuild - tasks to do after the VF's VSI have been rebuilt + * @vf: VF to perform tasks on + */ +static void ice_vf_post_vsi_rebuild(struct ice_vf *vf) +{ + ice_vf_rebuild_host_cfg(vf); + ice_vf_rebuild_adq_host_cfg(vf); + ice_vf_set_initialized(vf); + ice_ena_vf_mappings(vf); + wr32(&vf->pf->hw, VFGEN_RSTAT(vf->vf_id), VIRTCHNL_VFR_VFACTIVE); } /** @@ -1067,6 +2185,7 @@ static bool ice_config_res_vfs(struct ice_pf *pf) */ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr) { + struct device *dev = ice_pf_to_dev(pf); struct ice_hw *hw = &pf->hw; struct ice_vf *vf; int v, i; @@ -1075,25 +2194,23 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr) if (!pf->num_alloc_vfs) return false; + /* clear all malicious info if the VFs are getting reset */ + ice_for_each_vf(pf, i) + if (ice_mbx_clear_malvf(&hw->mbx_snapshot, pf->malvfs, ICE_MAX_VF_COUNT, i)) + dev_dbg(dev, "failed to clear malicious VF state for VF %u\n", i); + /* If VFs have been disabled, there is no need to reset */ - if (test_and_set_bit(__ICE_VF_DIS, pf->state)) + if (test_and_set_bit(ICE_VF_DIS, pf->state)) return false; + ice_clear_dcf_acl_cfg(pf); + ice_clear_dcf_udp_tunnel_cfg(pf); + pf->hw.dcf_caps &= ~(DCF_ACL_CAP | DCF_UDP_TUNNEL_CAP); + /* Begin reset on all VFs at once */ - for (v = 0; v < pf->num_alloc_vfs; v++) + ice_for_each_vf(pf, v) ice_trigger_vf_reset(&pf->vf[v], is_vflr, true); - for (v = 0; v < pf->num_alloc_vfs; v++) { - struct ice_vsi *vsi; - - vf = &pf->vf[v]; - vsi = pf->vsi[vf->lan_vsi_idx]; - if (test_bit(ICE_VF_STATE_QS_ENA, vf->vf_states)) - ice_dis_vf_qs(vf); - ice_dis_vsi_txq(vsi->port_info, vsi->idx, 0, 0, NULL, NULL, - NULL, ICE_VF_RESET, vf->vf_id, NULL); - } - /* HW requires some time to make sure it can flush the FIFO for a VF * when it resets it. Poll the VPGEN_VFRSTAT register for each VF in * sequence to make sure that it has completed. We'll keep track of @@ -1101,7 +2218,6 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr) * finished resetting. */ for (i = 0, v = 0; i < 10 && v < pf->num_alloc_vfs; i++) { - /* Check each VF in sequence */ while (v < pf->num_alloc_vfs) { u32 reg; @@ -1121,69 +2237,168 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr) } } + /* Display a warning if at least one VF didn't manage to reset in * time, but continue on with the operation. 
*/ if (v < pf->num_alloc_vfs) - dev_warn(&pf->pdev->dev, "VF reset check timeout\n"); + dev_warn(dev, "VF reset check timeout\n"); + /* free VF resources to begin resetting the VSI state */ - for (v = 0; v < pf->num_alloc_vfs; v++) { + ice_for_each_vf(pf, v) { vf = &pf->vf[v]; - ice_free_vf_res(vf); + vf->driver_caps = 0; + ice_vc_set_default_allowlist(vf); + +#ifdef HAVE_TC_SETUP_CLSFLOWER + /* always release VF ADQ filters since those filters will be + * replayed by VF driver. This is needed to avoid stale + * filters in software internal data structues + */ + ice_del_all_adv_switch_fltr(vf); +#endif - /* Free VF queues as well, and reallocate later. - * If a given VF has different number of queues - * configured, the request for update will come - * via mailbox communication. + ice_vf_fdir_exit(vf); + ice_vf_fdir_init(vf); + /* clean VF control VSI when resetting VFs since it should be + * setup only when iAVF creates its first FDIR rule. */ - vf->num_vf_qs = 0; + if (vf->ctrl_vsi_idx != ICE_NO_VSI) + ice_vf_ctrl_invalidate_vsi(vf); + + ice_vf_pre_vsi_rebuild(vf); + ice_vf_rebuild_vsi(vf); + ice_vf_post_vsi_rebuild(vf); } - if (ice_sriov_free_msix_res(pf)) - dev_err(&pf->pdev->dev, - "Failed to free MSIX resources used by SR-IOV\n"); + if (ice_is_eswitch_mode_switchdev(pf)) + if (ice_eswitch_rebuild(pf)) + dev_warn(dev, "eswitch rebuild failed\n"); - if (!ice_config_res_vfs(pf)) - return false; + ice_flush(hw); + clear_bit(ICE_VF_DIS, pf->state); return true; } +/** + * ice_is_vf_disabled + * @vf: pointer to the VF info + * + * Returns true if the PF or VF is disabled, false otherwise. + */ +static bool ice_is_vf_disabled(struct ice_vf *vf) +{ + struct ice_pf *pf = vf->pf; + + /* If the PF has been disabled, there is no need resetting VF until + * PF is active again. Similarly, if the VF has been disabled, this + * means something else is resetting the VF, so we shouldn't continue. + * Otherwise, set disable VF state bit for actual reset, and continue. + */ + return (test_bit(ICE_VF_DIS, pf->state) || + test_bit(ICE_VF_STATE_DIS, vf->vf_states)); +} + +/** + * ice_vf_get_glint_ceqctl_idx - get the GLINT_CEQCTL index relative to the PF + * @vf: VF used to get the index + * @ceq_idx: 0-based index from the VF + * + * Use the VF relative (0-based) CEQ index plus the first PF MSI-X index + * assigned to this VF (relative to the PF's MSIX space) to determine the index + * of the GLINT_CEQCTL register + */ +static u16 ice_vf_get_glint_ceqctl_idx(struct ice_vf *vf, u16 ceq_idx) +{ + return vf->first_vector_idx + ceq_idx; +} + +/** + * ice_vf_clear_ceq_irq_map - clear the CEQ IRQ mapping + * @vf: VF used to clear the mapping + * @ceq_idx: VF relative (0-based) CEQ index + */ +static void ice_vf_clear_ceq_irq_map(struct ice_vf *vf, u16 ceq_idx) +{ + u16 glint_ceqctl_idx = ice_vf_get_glint_ceqctl_idx(vf, ceq_idx); + + wr32(&vf->pf->hw, GLINT_CEQCTL(glint_ceqctl_idx), 0); +} + +/** + * ice_vf_clear_aeq_irq_map - clear the AEQ IRQ mapping + * @vf: VF used to clear the mapping + */ +static void ice_vf_clear_aeq_irq_map(struct ice_vf *vf) +{ + wr32(&vf->pf->hw, VPINT_AEQCTL(vf->vf_id), 0); +} + +/** + * ice_vf_clear_rdma_irq_map - clear the RDMA IRQ mapping + * @vf: VF used to clear the mapping + * + * Clear any RDMA IRQ mapping that a VF might have requested. Since the number + * of CEQ indices are never greater than the num_msix_per_vf just clear all CEQ + * indices that are possibly associated to this VF. Also clear the AEQ for this + * VF. 
Doing it this way prevents the need to cache the configuration received + * on VIRTCHNL_OP_CONFIG_RMDA_IRQ_MAP since VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP is + * designed to clear the entire RDMA IRQ mapping configuration. + */ +static void ice_vf_clear_rdma_irq_map(struct ice_vf *vf) +{ + u16 i; + + for (i = 0; i < vf->pf->num_msix_per_vf; i++) + ice_vf_clear_ceq_irq_map(vf, i); + + ice_vf_clear_aeq_irq_map(vf); +} + /** * ice_reset_vf - Reset a particular VF * @vf: pointer to the VF structure * @is_vflr: true if VFLR was issued, false if not * - * Returns true if the VF is reset, false otherwise. + * Returns true if the VF is currently in reset, resets successfully, or resets + * are disabled and false otherwise. */ -static bool ice_reset_vf(struct ice_vf *vf, bool is_vflr) +bool ice_reset_vf(struct ice_vf *vf, bool is_vflr) { struct ice_pf *pf = vf->pf; struct ice_vsi *vsi; + struct device *dev; struct ice_hw *hw; bool rsd = false; u8 promisc_m; u32 reg; int i; - /* If the PF has been disabled, there is no need resetting VF until - * PF is active again. - */ - if (test_bit(__ICE_VF_DIS, pf->state)) - return false; + dev = ice_pf_to_dev(pf); - /* If the VF has been disabled, this means something else is - * resetting the VF, so we shouldn't continue. Otherwise, set - * disable VF state bit for actual reset, and continue. - */ - if (test_and_set_bit(ICE_VF_STATE_DIS, vf->vf_states)) - return false; + if (test_bit(ICE_VF_RESETS_DISABLED, pf->state)) { + dev_dbg(dev, "Trying to reset VF %d, but all VF resets are disabled\n", + vf->vf_id); + return true; + } + + if (ice_is_vf_disabled(vf)) { + dev_dbg(dev, "VF is already disabled, there is no need for resetting it, telling VM, all is fine %d\n", + vf->vf_id); + return true; + } + /* Set VF disable bit state here, before triggering reset */ + set_bit(ICE_VF_STATE_DIS, vf->vf_states); ice_trigger_vf_reset(vf, is_vflr, false); - vsi = pf->vsi[vf->lan_vsi_idx]; + if (ice_dcf_get_state(pf) == ICE_DCF_STATE_ON) + ice_dcf_set_state(pf, ICE_DCF_STATE_BUSY); + + vsi = ice_get_vf_vsi(vf); if (test_bit(ICE_VF_STATE_QS_ENA, vf->vf_states)) ice_dis_vf_qs(vf); @@ -1193,6 +2408,21 @@ static bool ice_reset_vf(struct ice_vf *vf, bool is_vflr) */ ice_dis_vsi_txq(vsi->port_info, vsi->idx, 0, 0, NULL, NULL, NULL, ICE_VF_RESET, vf->vf_id, NULL); + /* Likewise Disable LAN Tx queues for VF ADQ VSIs */ + if (ice_is_vf_adq_ena(vf)) { + int tc; + + for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) { + if (!ice_vf_adq_vsi_valid(vf, tc)) + continue; + ice_dis_vsi_txq(vsi->port_info, vf->ch[tc].vsi_idx, 0, + 0, NULL, NULL, NULL, ICE_VF_RESET, + vf->vf_id, NULL); + } + } + + if (vf->driver_caps & VIRTCHNL_VF_CAP_RDMA) + ice_vf_clear_rdma_irq_map(vf); hw = &pf->hw; /* poll VPGEN_VFRSTAT reg to make sure @@ -1213,34 +2443,67 @@ static bool ice_reset_vf(struct ice_vf *vf, bool is_vflr) usleep_range(10, 20); } + vf->driver_caps = 0; + ice_vc_set_default_allowlist(vf); + /* Display a warning if VF didn't manage to reset in time, but need to * continue on with the operation. 
*/ if (!rsd) - dev_warn(&pf->pdev->dev, "VF reset check timeout on VF %d\n", - vf->vf_id); + dev_warn(dev, "VF reset check timeout on VF %d\n", vf->vf_id); /* disable promiscuous modes in case they were enabled * ignore any error if disabling process failed */ if (test_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states) || test_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) { - if (vf->port_vlan_id || vf->num_vlan) + if (ice_vf_is_port_vlan_ena(vf) || vsi->num_vlan) promisc_m = ICE_UCAST_VLAN_PROMISC_BITS; else promisc_m = ICE_UCAST_PROMISC_BITS; - vsi = pf->vsi[vf->lan_vsi_idx]; - if (ice_vf_set_vsi_promisc(vf, vsi, promisc_m, true)) - dev_err(&pf->pdev->dev, "disabling promiscuous mode failed\n"); + if (ice_vf_clear_vsi_promisc(vf, vsi, promisc_m)) + dev_err(dev, "disabling promiscuous mode failed\n"); } - /* free VF resources to begin resetting the VSI state */ - ice_free_vf_res(vf); +#ifdef HAVE_TC_SETUP_CLSFLOWER + /* always release VF ADQ filters since those filters will be + * replayed by VF driver. This is needed to avoid stale filters in + * software internal data structures + */ + ice_del_all_adv_switch_fltr(vf); +#endif + /* VF driver gets reloaded on VFLR, so clear ADQ configuration */ + if (is_vflr) + ice_vf_adq_release(vf); - ice_cleanup_and_realloc_vf(vf); - ice_flush(hw); + ice_vf_fdir_exit(vf); + ice_vf_fdir_init(vf); + /* clean VF control VSI when resetting VF since it should be setup + * only when iAVF creates its first FDIR rule. + */ + if (vf->ctrl_vsi_idx != ICE_NO_VSI) + ice_vf_ctrl_vsi_release(vf); + + ice_vf_pre_vsi_rebuild(vf); + + if (ice_vf_rebuild_vsi_with_release(vf)) { + dev_err(dev, "Failed to release and setup the VF%u's VSI\n", vf->vf_id); + return false; + } + + ice_vf_post_vsi_rebuild(vf); + vsi = ice_get_vf_vsi(vf); + ice_eswitch_update_repr(vsi); + + if (ice_dcf_get_state(pf) == ICE_DCF_STATE_BUSY) { + ice_dcf_set_state(pf, ICE_DCF_STATE_ON); + } + + /* if the VF has been reset allow it to come up again */ + if (ice_mbx_clear_malvf(&hw->mbx_snapshot, pf->malvfs, ICE_MAX_VF_COUNT, vf->vf_id)) + dev_dbg(dev, "failed to clear malicious VF state for VF %u\n", i); return true; } @@ -1253,7 +2516,7 @@ void ice_vc_notify_link_state(struct ice_pf *pf) { int i; - for (i = 0; i < pf->num_alloc_vfs; i++) + ice_for_each_vf(pf, i) ice_vc_notify_vf_link_state(&pf->vf[i]); } @@ -1276,6 +2539,28 @@ void ice_vc_notify_reset(struct ice_pf *pf) (u8 *)&pfe, sizeof(struct virtchnl_pf_event)); } +/** + * ice_vc_notify_dcf_vf_info - Send DCF VF information to the VF + * @old_dcf_vf: pointer to the previous DCF VF structure + * @cur_dcf_vf: pointer to the current DCF VF structure + */ +static void ice_vc_notify_dcf_vf_info(struct ice_vf *old_dcf_vf, struct ice_vf *cur_dcf_vf) +{ + struct ice_pf *pf = cur_dcf_vf->pf; + struct virtchnl_pf_event pfe = { 0 }; + + if (!old_dcf_vf || !cur_dcf_vf) + return; + + pfe.event = VIRTCHNL_EVENT_DCF_VSI_INFO; + pfe.event_data.vf_vsi_map.vf_id = cur_dcf_vf->vf_id; + pfe.event_data.vf_vsi_map.vsi_id = cur_dcf_vf->lan_vsi_num; + + ice_aq_send_msg_to_vf(&pf->hw, old_dcf_vf->vf_id, VIRTCHNL_OP_EVENT, + VIRTCHNL_STATUS_SUCCESS, (u8 *)&pfe, sizeof(pfe), NULL); + return; +} + /** * ice_vc_notify_vf_reset - Notify VF of a reset event * @vf: pointer to the VF structure @@ -1283,13 +2568,17 @@ void ice_vc_notify_reset(struct ice_pf *pf) static void ice_vc_notify_vf_reset(struct ice_vf *vf) { struct virtchnl_pf_event pfe; + struct ice_pf *pf; - /* validate the request */ - if (!vf || vf->vf_id >= vf->pf->num_alloc_vfs) + if (!vf) return; - /* Bail out if VF is 
in disabled state, neither initialized, nor active - * state - otherwise proceed with notifications + pf = vf->pf; + if (ice_validate_vf_id(pf, vf->vf_id)) + return; + + /* Bail out if VF is in disabled state, neither initialized, nor active + * state - otherwise proceed with notifications */ if ((!test_bit(ICE_VF_STATE_INIT, vf->vf_states) && !test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) || @@ -1298,173 +2587,351 @@ static void ice_vc_notify_vf_reset(struct ice_vf *vf) pfe.event = VIRTCHNL_EVENT_RESET_IMPENDING; pfe.severity = PF_EVENT_SEVERITY_CERTAIN_DOOM; - ice_aq_send_msg_to_vf(&vf->pf->hw, vf->vf_id, VIRTCHNL_OP_EVENT, + ice_aq_send_msg_to_vf(&pf->hw, vf->vf_id, VIRTCHNL_OP_EVENT, VIRTCHNL_STATUS_SUCCESS, (u8 *)&pfe, sizeof(pfe), NULL); } + /** - * ice_alloc_vfs - Allocate and set up VFs resources - * @pf: pointer to the PF structure - * @num_alloc_vfs: number of VFs to allocate + * ice_init_vf_vsi_res - initialize/setup VF VSI resources + * @vf: VF to initialize/setup the VSI for + * + * This function creates a VSI for the VF, adds a VLAN 0 filter, and sets up the + * VF VSI's broadcast filter and is only used during initial VF creation. + */ +static int ice_init_vf_vsi_res(struct ice_vf *vf) +{ + struct ice_vsi_vlan_ops *vlan_ops; + struct ice_pf *pf = vf->pf; + u8 broadcast[ETH_ALEN]; + enum ice_status status; + struct ice_vsi *vsi; + struct device *dev; + int err; + + vf->first_vector_idx = ice_calc_vf_first_vector_idx(pf, vf); + + dev = ice_pf_to_dev(pf); + vsi = ice_vf_vsi_setup(vf); + if (!vsi) + return -ENOMEM; + + err = ice_vsi_add_vlan_zero(vsi); + if (err) { + dev_warn(dev, "Failed to add VLAN 0 filter for VF %d\n", + vf->vf_id); + goto release_vsi; + } + + vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); + err = vlan_ops->ena_rx_filtering(vsi); + if (err) { + dev_warn(dev, "Failed to enable Rx VLAN filtering for VF %d\n", + vf->vf_id); + goto release_vsi; + } + + eth_broadcast_addr(broadcast); + status = ice_fltr_add_mac(vsi, broadcast, ICE_FWD_TO_VSI); + if (status) { + dev_err(dev, "Failed to add broadcast MAC filter for VF %d, status %s\n", + vf->vf_id, ice_stat_str(status)); + err = ice_status_to_errno(status); + goto release_vsi; + } + + err = ice_vf_set_spoofchk_cfg(vf, vsi); + if (err) { + dev_warn(dev, "Failed to initialize spoofchk setting for VF %d\n", + vf->vf_id); + goto release_vsi; + } + + + vf->num_mac = 1; + + return 0; + +release_vsi: + ice_vf_vsi_release(vf); + return err; +} + +/** + * ice_start_vfs - start VFs so they are ready to be used by SR-IOV + * @pf: PF the VFs are associated with */ -static int ice_alloc_vfs(struct ice_pf *pf, u16 num_alloc_vfs) +static int ice_start_vfs(struct ice_pf *pf) { struct ice_hw *hw = &pf->hw; + int retval, i; + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; + + ice_clear_vf_reset_trigger(vf); + + retval = ice_init_vf_vsi_res(vf); + if (retval) { + dev_err(ice_pf_to_dev(pf), "Failed to initialize VSI resources for VF %d, error %d\n", + vf->vf_id, retval); + goto teardown; + } + + set_bit(ICE_VF_STATE_INIT, vf->vf_states); + ice_ena_vf_mappings(vf); + wr32(hw, VFGEN_RSTAT(vf->vf_id), VIRTCHNL_VFR_VFACTIVE); + } + + ice_flush(hw); + return 0; + +teardown: + for (i = i - 1; i >= 0; i--) { + struct ice_vf *vf = &pf->vf[i]; + + ice_dis_vf_mappings(vf); + ice_vf_vsi_release(vf); + } + + return retval; +} + +static void +ice_vf_hash_ctx_init(struct ice_vf *vf) +{ + memset(&vf->hash_ctx, 0, sizeof(vf->hash_ctx)); +} + +/** + * ice_set_dflt_settings_vfs - set VF defaults during initialization/creation + * @pf: PF 
holding reference to all VFs for default configuration + */ +static void ice_set_dflt_settings_vfs(struct ice_pf *pf) +{ + int i; + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; + vf->vf_sw_id = pf->first_sw; + vf->pf = pf; + vf->vf_id = i; + /* assign default capabilities */ + set_bit(ICE_VIRTCHNL_VF_CAP_L2, &vf->vf_caps); + vf->spoofchk = true; + vf->num_vf_qs = pf->num_qps_per_vf; + ice_vc_set_default_allowlist(vf); + + /* ctrl_vsi_idx will be set to a valid value only when iAVF + * creates its first fdir rule. + */ + ice_vf_ctrl_invalidate_vsi(vf); + ice_vf_fdir_init(vf); + + ice_vf_hash_ctx_init(vf); + + ice_vc_set_dflt_vf_ops(&vf->vc_ops); + } +} + +/** + * ice_alloc_vfs - allocate num_vfs in the PF structure + * @pf: PF to store the allocated VFs in + * @num_vfs: number of VFs to allocate + */ +static int ice_alloc_vfs(struct ice_pf *pf, int num_vfs) +{ struct ice_vf *vfs; - int i, ret; + + vfs = devm_kcalloc(ice_pf_to_dev(pf), num_vfs, sizeof(*vfs), + GFP_KERNEL); + if (!vfs) + return -ENOMEM; + + pf->vf = vfs; + pf->num_alloc_vfs = num_vfs; + + return 0; +} + +/** + * ice_ena_vfs - enable VFs so they are ready to be used + * @pf: pointer to the PF structure + * @num_vfs: number of VFs to enable + */ +static int ice_ena_vfs(struct ice_pf *pf, u16 num_vfs) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + int ret; /* Disable global interrupt 0 so we don't try to handle the VFLR. */ wr32(hw, GLINT_DYN_CTL(pf->oicr_idx), ICE_ITR_NONE << GLINT_DYN_CTL_ITR_INDX_S); - set_bit(__ICE_OICR_INTR_DIS, pf->state); + set_bit(ICE_OICR_INTR_DIS, pf->state); ice_flush(hw); - ret = pci_enable_sriov(pf->pdev, num_alloc_vfs); + ret = pci_enable_sriov(pf->pdev, num_vfs); if (ret) { pf->num_alloc_vfs = 0; goto err_unroll_intr; } - /* allocate memory */ - vfs = devm_kcalloc(&pf->pdev->dev, num_alloc_vfs, sizeof(*vfs), - GFP_KERNEL); - if (!vfs) { - ret = -ENOMEM; + + ret = ice_alloc_vfs(pf, num_vfs); + if (ret) goto err_pci_disable_sriov; - } - pf->vf = vfs; - /* apply default profile */ - for (i = 0; i < num_alloc_vfs; i++) { - vfs[i].pf = pf; - vfs[i].vf_sw_id = pf->first_sw; - vfs[i].vf_id = i; + ice_dcf_init_sw_rule_mgmt(pf); - /* assign default capabilities */ - set_bit(ICE_VIRTCHNL_VF_CAP_L2, &vfs[i].vf_caps); - vfs[i].spoofchk = true; + if (ice_set_per_vf_res(pf)) { + dev_err(dev, "Not enough resources for %d VFs, try with fewer number of VFs\n", + num_vfs); + ret = -ENOSPC; + goto err_unroll_sriov; } - pf->num_alloc_vfs = num_alloc_vfs; - /* VF resources get allocated with initialization */ - if (!ice_config_res_vfs(pf)) { - ret = -EIO; + ice_set_dflt_settings_vfs(pf); + + if (ice_start_vfs(pf)) { + dev_err(dev, "Failed to start VF(s)\n"); + ret = -EAGAIN; goto err_unroll_sriov; } - return ret; + clear_bit(ICE_VF_DIS, pf->state); + + if (ice_eswitch_configure(pf)) + goto err_unroll_sriov; + + /* rearm global interrupts */ + if (test_and_clear_bit(ICE_OICR_INTR_DIS, pf->state)) + ice_irq_dynamic_ena(hw, NULL, NULL); + + return 0; err_unroll_sriov: + devm_kfree(dev, pf->vf); pf->vf = NULL; - devm_kfree(&pf->pdev->dev, vfs); - vfs = NULL; pf->num_alloc_vfs = 0; err_pci_disable_sriov: pci_disable_sriov(pf->pdev); err_unroll_intr: /* rearm interrupts here */ ice_irq_dynamic_ena(hw, NULL, NULL); - clear_bit(__ICE_OICR_INTR_DIS, pf->state); + clear_bit(ICE_OICR_INTR_DIS, pf->state); return ret; } -/** - * ice_pf_state_is_nominal - checks the PF for nominal state - * @pf: pointer to PF to check - * - * Check the PF's state for a collection of bits that would 
indicate - * the PF is in a state that would inhibit normal operation for - * driver functionality. - * - * Returns true if PF is in a nominal state. - * Returns false otherwise - */ -static bool ice_pf_state_is_nominal(struct ice_pf *pf) -{ - DECLARE_BITMAP(check_bits, __ICE_STATE_NBITS) = { 0 }; - - if (!pf) - return false; - - bitmap_set(check_bits, 0, __ICE_STATE_NOMINAL_CHECK_BITS); - if (bitmap_intersects(pf->state, check_bits, __ICE_STATE_NBITS)) - return false; - - return true; -} - /** * ice_pci_sriov_ena - Enable or change number of VFs * @pf: pointer to the PF structure * @num_vfs: number of VFs to allocate + * + * Returns 0 on success and negative on failure */ static int ice_pci_sriov_ena(struct ice_pf *pf, int num_vfs) { int pre_existing_vfs = pci_num_vf(pf->pdev); - struct device *dev = &pf->pdev->dev; + struct device *dev = ice_pf_to_dev(pf); int err; - if (!ice_pf_state_is_nominal(pf)) { - dev_err(dev, "Cannot enable SR-IOV, device not ready\n"); - return -EBUSY; - } - - if (!test_bit(ICE_FLAG_SRIOV_CAPABLE, pf->flags)) { - dev_err(dev, "This device is not capable of SR-IOV\n"); - return -ENODEV; - } - if (pre_existing_vfs && pre_existing_vfs != num_vfs) ice_free_vfs(pf); else if (pre_existing_vfs && pre_existing_vfs == num_vfs) - return num_vfs; + return 0; if (num_vfs > pf->num_vfs_supported) { dev_err(dev, "Can't enable %d VFs, max VFs supported is %d\n", num_vfs, pf->num_vfs_supported); - return -ENOTSUPP; + return -EOPNOTSUPP; } - dev_info(dev, "Allocating %d VFs\n", num_vfs); - err = ice_alloc_vfs(pf, num_vfs); + dev_info(dev, "Enabling %d VFs\n", num_vfs); + err = ice_ena_vfs(pf, num_vfs); if (err) { dev_err(dev, "Failed to enable SR-IOV: %d\n", err); return err; } set_bit(ICE_FLAG_SRIOV_ENA, pf->flags); - return num_vfs; + return 0; +} + + +/** + * ice_check_sriov_allowed - check if SR-IOV is allowed based on various checks + * @pf: PF to enabled SR-IOV on + */ +static int ice_check_sriov_allowed(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + + if (!test_bit(ICE_FLAG_SRIOV_CAPABLE, pf->flags)) { + dev_err(dev, "This device is not capable of SR-IOV\n"); + return -EOPNOTSUPP; + } + + if (test_bit(ICE_RECOVERY_MODE, pf->state)) { + dev_err(dev, "SR-IOV cannot be configured - Device is in Recovery Mode\n"); + return -EOPNOTSUPP; + } + + if (ice_is_safe_mode(pf)) { + dev_err(dev, "SR-IOV cannot be configured - Device is in Safe Mode\n"); + return -EOPNOTSUPP; + } + + if (!ice_pf_state_is_nominal(pf)) { + dev_err(dev, "Cannot enable SR-IOV, device not ready\n"); + return -EBUSY; + } + + return 0; } /** * ice_sriov_configure - Enable or change number of VFs via sysfs * @pdev: pointer to a pci_dev structure - * @num_vfs: number of VFs to allocate + * @num_vfs: number of VFs to allocate or 0 to free VFs * - * This function is called when the user updates the number of VFs in sysfs. + * This function is called when the user updates the number of VFs in sysfs. On + * success return whatever num_vfs was set to by the caller. Return negative on + * failure. 
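ice_sriov_configure() below is the entry point for the standard PCI sriov_numvfs sysfs attribute; typical usage looks like the following (the PCI address is only an example):

    /*
     * # create 4 VFs
     * echo 4 > /sys/bus/pci/devices/0000:3b:00.0/sriov_numvfs
     *
     * # destroy all VFs (returns -EBUSY if any VF is still assigned to a VM)
     * echo 0 > /sys/bus/pci/devices/0000:3b:00.0/sriov_numvfs
     */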
*/ int ice_sriov_configure(struct pci_dev *pdev, int num_vfs) { struct ice_pf *pf = pci_get_drvdata(pdev); + struct device *dev = ice_pf_to_dev(pf); + enum ice_status status; + int err; - if (ice_is_safe_mode(pf)) { - dev_err(&pf->pdev->dev, - "SR-IOV cannot be configured - Device is in Safe Mode\n"); - return -EOPNOTSUPP; - } + err = ice_check_sriov_allowed(pf); + if (err) + return err; - if (num_vfs) - return ice_pci_sriov_ena(pf, num_vfs); + if (!num_vfs) { + if (!pci_vfs_assigned(pdev)) { + ice_mbx_deinit_snapshot(&pf->hw); + ice_free_vfs(pf); + return 0; + } - if (!pci_vfs_assigned(pdev)) { - ice_free_vfs(pf); - } else { - dev_err(&pf->pdev->dev, - "can't free VFs because some are assigned to VMs.\n"); + dev_err(dev, "can't free VFs because some are assigned to VMs.\n"); return -EBUSY; } - return 0; + status = ice_mbx_init_snapshot(&pf->hw, num_vfs); + if (status) + return ice_status_to_errno(status); + + err = ice_pci_sriov_ena(pf, num_vfs); + if (err) { + ice_mbx_deinit_snapshot(&pf->hw); + return err; + } + + return num_vfs; } /** @@ -1477,14 +2944,14 @@ int ice_sriov_configure(struct pci_dev *pdev, int num_vfs) void ice_process_vflr_event(struct ice_pf *pf) { struct ice_hw *hw = &pf->hw; - int vf_id; + unsigned int vf_id; u32 reg; - if (!test_and_clear_bit(__ICE_VFLR_EVENT_PENDING, pf->state) || + if (!test_and_clear_bit(ICE_VFLR_EVENT_PENDING, pf->state) || !pf->num_alloc_vfs) return; - for (vf_id = 0; vf_id < pf->num_alloc_vfs; vf_id++) { + ice_for_each_vf(pf, vf_id) { struct ice_vf *vf = &pf->vf[vf_id]; u32 reg_idx, bit_idx; @@ -1499,17 +2966,81 @@ void ice_process_vflr_event(struct ice_pf *pf) } /** - * ice_vc_dis_vf - Disable a given VF via SW reset + * ice_vc_reset_vf - Perform software reset on the VF after informing the AVF * @vf: pointer to the VF info - * - * Disable the VF through a SW reset */ -static void ice_vc_dis_vf(struct ice_vf *vf) +static void ice_vc_reset_vf(struct ice_vf *vf) { ice_vc_notify_vf_reset(vf); ice_reset_vf(vf, false); } +/** + * ice_get_vf_from_pfq - get the VF who owns the PF space queue passed in + * @pf: PF used to index all VFs + * @pfq: queue index relative to the PF's function space + * + * If no VF is found who owns the pfq then return NULL, otherwise return a + * pointer to the VF who owns the pfq + */ +static struct ice_vf *ice_get_vf_from_pfq(struct ice_pf *pf, u16 pfq) +{ + unsigned int vf_id; + + ice_for_each_vf(pf, vf_id) { + struct ice_vf *vf = &pf->vf[vf_id]; + struct ice_vsi *vsi; + u16 rxq_idx; + + vsi = ice_get_vf_vsi(vf); + + ice_for_each_rxq(vsi, rxq_idx) + if (vsi->rxq_map[rxq_idx] == pfq) + return vf; + } + + return NULL; +} + +/** + * ice_globalq_to_pfq - convert from global queue index to PF space queue index + * @pf: PF used for conversion + * @globalq: global queue index used to convert to PF space queue index + */ +static u32 ice_globalq_to_pfq(struct ice_pf *pf, u32 globalq) +{ + return globalq - pf->hw.func_caps.common_cap.rxq_first_id; +} + +/** + * ice_vf_lan_overflow_event - handle LAN overflow event for a VF + * @pf: PF that the LAN overflow event happened on + * @event: structure holding the event information for the LAN overflow event + * + * Determine if the LAN overflow event was caused by a VF queue. If it was not + * caused by a VF, do nothing. If a VF caused this LAN overflow event trigger a + * reset on the offending VF. 
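As a worked example with illustrative numbers: if the device's rxq_first_id is 128 and the overflow event reports global Rx queue 135, the PF-space queue is 7, and the VF whose rxq_map[] contains 7 is the one that gets reset:

    /* illustrative values only */
    u32 globalq = 135;                              /* RXQNUM field decoded from GLDCB_RTCTQ */
    u16 pfq = ice_globalq_to_pfq(pf, globalq);      /* 135 - 128 = 7 */
    struct ice_vf *vf = ice_get_vf_from_pfq(pf, pfq);

    if (vf)
            ice_vc_reset_vf(vf);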
+ */ +void +ice_vf_lan_overflow_event(struct ice_pf *pf, struct ice_rq_event_info *event) +{ + u32 gldcb_rtctq, queue; + struct ice_vf *vf; + + gldcb_rtctq = le32_to_cpu(event->desc.params.lan_overflow.prtdcb_ruptq); + dev_dbg(ice_pf_to_dev(pf), "GLDCB_RTCTQ: 0x%08x\n", gldcb_rtctq); + + /* event returns device global Rx queue number */ + queue = (gldcb_rtctq & GLDCB_RTCTQ_RXQNUM_M) >> + GLDCB_RTCTQ_RXQNUM_S; + + vf = ice_get_vf_from_pfq(pf, ice_globalq_to_pfq(pf, queue)); + if (!vf) + return; + + ice_vc_reset_vf(vf); +} + /** * ice_vc_send_msg_to_vf - Send message to VF * @vf: pointer to the VF info @@ -1520,29 +3051,32 @@ static void ice_vc_dis_vf(struct ice_vf *vf) * * send msg to VF */ -static int +int ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode, enum virtchnl_status_code v_retval, u8 *msg, u16 msglen) { enum ice_status aq_ret; + struct device *dev; struct ice_pf *pf; - /* validate the request */ - if (!vf || vf->vf_id >= vf->pf->num_alloc_vfs) + if (!vf) return -EINVAL; pf = vf->pf; + if (ice_validate_vf_id(pf, vf->vf_id)) + return -EINVAL; + + dev = ice_pf_to_dev(pf); /* single place to detect unsuccessful return values */ if (v_retval) { vf->num_inval_msgs++; - dev_info(&pf->pdev->dev, "VF %d failed opcode %d, retval: %d\n", - vf->vf_id, v_opcode, v_retval); + dev_info(dev, "VF %d failed opcode %d, retval: %d\n", vf->vf_id, + v_opcode, v_retval); if (vf->num_inval_msgs > ICE_DFLT_NUM_INVAL_MSGS_ALLOWED) { - dev_err(&pf->pdev->dev, - "Number of invalid messages exceeded for VF %d\n", + dev_err(dev, "Number of invalid messages exceeded for VF %d\n", vf->vf_id); - dev_err(&pf->pdev->dev, "Use PF Control I/F to enable the VF\n"); + dev_err(dev, "Use PF Control I/F to enable the VF\n"); set_bit(ICE_VF_STATE_DIS, vf->vf_states); return -EIO; } @@ -1555,9 +3089,9 @@ ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode, aq_ret = ice_aq_send_msg_to_vf(&pf->hw, vf->vf_id, v_opcode, v_retval, msg, msglen, NULL); if (aq_ret && pf->hw.mailboxq.sq_last_status != ICE_AQ_RC_ENOSYS) { - dev_info(&pf->pdev->dev, - "Unable to send the message to VF %d ret %d aq_err %d\n", - vf->vf_id, aq_ret, pf->hw.mailboxq.sq_last_status); + dev_info(dev, "Unable to send the message to VF %d ret %s aq_err %s\n", + vf->vf_id, ice_stat_str(aq_ret), + ice_aq_str(pf->hw.mailboxq.sq_last_status)); return -EIO; } @@ -1587,6 +3121,28 @@ static int ice_vc_get_ver_msg(struct ice_vf *vf, u8 *msg) sizeof(struct virtchnl_version_info)); } +/** + * ice_vc_get_max_frame_size - get max frame size allowed for VF + * @vf: VF used to determine max frame size + * + * Max frame size is determined based on the current port's max frame size and + * whether a port VLAN is configured on this VF. The VF is not aware whether + * it's in a port VLAN so the PF needs to account for this in max frame size + * checks and sending the max frame size to the VF. 
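A concrete example with illustrative numbers: for a port max frame of 9728 bytes and a port VLAN configured on the VF, the VF is advertised 9724 bytes, since VLAN_HLEN (4 bytes, from <linux/if_vlan.h>) is reserved for the tag the PF handles on the VF's behalf:

    /* illustrative arithmetic only */
    u16 port_max = 9728;                    /* pi->phy.link_info.max_frame_size */
    u16 vf_max = port_max - VLAN_HLEN;      /* 9728 - 4 = 9724 with a port VLAN */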
+ */ +static u16 ice_vc_get_max_frame_size(struct ice_vf *vf) +{ + struct ice_port_info *pi = ice_vf_get_port_info(vf); + u16 max_frame_size; + + max_frame_size = pi->phy.link_info.max_frame_size; + + if (ice_vf_is_port_vlan_ena(vf)) + max_frame_size -= VLAN_HLEN; + + return max_frame_size; +} + /** * ice_vc_get_vf_res_msg * @vf: pointer to the VF info @@ -1603,14 +3159,14 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) int len = 0; int ret; - if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states)) { + if (ice_check_vf_init(pf, vf)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto err; } len = sizeof(struct virtchnl_vf_resource); - vfres = devm_kzalloc(&pf->pdev->dev, len, GFP_KERNEL); + vfres = kzalloc(len, GFP_KERNEL); if (!vfres) { v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; len = 0; @@ -1624,14 +3180,39 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) VIRTCHNL_VF_OFFLOAD_VLAN; vfres->vf_cap_flags = VIRTCHNL_VF_OFFLOAD_L2; - vsi = pf->vsi[vf->lan_vsi_idx]; + vsi = ice_get_vf_vsi(vf); if (!vsi) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto err; } - if (!vsi->info.pvid) - vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_VLAN; + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_VLAN_V2) { + /* VLAN offloads based on current device configuration */ + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_VLAN_V2; + } else if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_VLAN) { + /* allow VF to negotiate VIRTCHNL_VF_OFFLOAD explicitly for + * these two conditions, which amounts to guest VLAN filtering + * and offloads being based on the inner VLAN or the + * inner/single VLAN respectively and don't allow VF to + * negotiate VIRTCHNL_VF_OFFLOAD in any other cases + */ + if (ice_is_dvm_ena(&pf->hw) && ice_vf_is_port_vlan_ena(vf)) { + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_VLAN; + } else if (!ice_is_dvm_ena(&pf->hw) && + !ice_vf_is_port_vlan_ena(vf)) { + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_VLAN; + /* configure backward compatible support for VFs that + * only support VIRTCHNL_VF_OFFLOAD_VLAN, the PF is + * configured in SVM, and no port VLAN is configured + */ + ice_vf_vsi_cfg_svm_legacy_vlan_mode(vsi); + } else if (ice_is_dvm_ena(&pf->hw)) { + /* configure software offloaded VLAN support when DVM + * is enabled, but no port VLAN is enabled + */ + ice_vf_vsi_cfg_dvm_legacy_vlan_mode(vsi); + } + } if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RSS_PF) { vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_PF; @@ -1642,6 +3223,12 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_REG; } + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC; + + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_FDIR_PF) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_FDIR_PF; + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2) vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2; @@ -1660,21 +3247,84 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_REQ_QUEUES) vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_REQ_QUEUES; + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_CRC) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_CRC; + if (vf->driver_caps & VIRTCHNL_VF_CAP_ADV_LINK_SPEED) vfres->vf_cap_flags |= VIRTCHNL_VF_CAP_ADV_LINK_SPEED; + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF; +#ifdef __TC_MQPRIO_MODE_MAX + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ && !ice_is_eswitch_mode_switchdev(pf)) + 
vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_ADQ; + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ_V2 && !ice_is_eswitch_mode_switchdev(pf)) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_ADQ_V2; +#endif /* __TC_MQPRIO_MODE_MAX */ + + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_USO) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_USO; + + if (vf->driver_caps & VIRTCHNL_VF_LARGE_NUM_QPAIRS) + vfres->vf_cap_flags |= VIRTCHNL_VF_LARGE_NUM_QPAIRS; + + /* Negotiate DCF capability. */ + if (vf->driver_caps & VIRTCHNL_VF_CAP_DCF) { + if (!ice_is_vf_dcf(vf)) { + if (!ice_check_dcf_allowed(vf)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + if (!ice_is_vf_dcf(vf)) + ice_vc_notify_dcf_vf_info(pf->dcf.vf, vf); + pf->dcf.vf = vf; + dev_info(ice_pf_to_dev(pf), "Grant request for DCF functionality to VF%d\n", + vf->vf_id); + if (!ice_is_tunnel_empty(&pf->hw)) { + dev_info(ice_pf_to_dev(pf), "Failed to grant UDP tunnel capability to VF%d as UDP tunnel rules already exist\n", + vf->vf_id); + pf->hw.dcf_caps &= ~DCF_UDP_TUNNEL_CAP; + } + } + + vfres->vf_cap_flags |= VIRTCHNL_VF_CAP_DCF; + ice_dcf_set_state(pf, ICE_DCF_STATE_ON); + } else if (ice_is_vf_dcf(vf) && + ice_dcf_get_state(pf) != ICE_DCF_STATE_OFF) { + /* If a designated DCF requests AVF functionality from the + * same VF without the DCF gracefully relinquishing the DCF + * functionality first, remove ALL switch filters that were + * added by the DCF. + */ + dev_info(ice_pf_to_dev(pf), "DCF is not in the OFF state, removing all filters that were added by the DCF\n"); + ice_rm_all_dcf_sw_rules(pf); + ice_clear_dcf_acl_cfg(pf); + ice_clear_dcf_udp_tunnel_cfg(pf); + pf->hw.dcf_caps &= ~(DCF_ACL_CAP | DCF_UDP_TUNNEL_CAP); + ice_dcf_set_state(pf, ICE_DCF_STATE_OFF); + pf->dcf.vf = NULL; + ice_reset_vf(vf, false); + } + vfres->num_vsis = 1; /* Tx and Rx queue are equal for VF */ vfres->num_queue_pairs = vsi->num_txq; - vfres->max_vectors = pf->num_vf_msix; + vfres->max_vectors = pf->num_msix_per_vf; vfres->rss_key_size = ICE_VSIQF_HKEY_ARRAY_SIZE; - vfres->rss_lut_size = ICE_VSIQF_HLUT_ARRAY_SIZE; + vfres->rss_lut_size = vsi->rss_table_size; + vfres->max_mtu = ice_vc_get_max_frame_size(vf); vfres->vsi_res[0].vsi_id = vf->lan_vsi_num; vfres->vsi_res[0].vsi_type = VIRTCHNL_VSI_SRIOV; vfres->vsi_res[0].num_queue_pairs = vsi->num_txq; ether_addr_copy(vfres->vsi_res[0].default_mac_addr, - vf->dflt_lan_addr.addr); + vf->hw_lan_addr.addr); + + /* match guest capabilities */ + vf->driver_caps = vfres->vf_cap_flags; + + ice_vc_set_caps_allowlist(vf); + ice_vc_set_working_allowlist(vf); set_bit(ICE_VF_STATE_ACTIVE, vf->vf_states); @@ -1683,7 +3333,7 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_VF_RESOURCES, v_ret, (u8 *)vfres, len); - devm_kfree(&pf->pdev->dev, vfres); + kfree(vfres); return ret; } @@ -1697,7 +3347,7 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) */ static void ice_vc_reset_vf_msg(struct ice_vf *vf) { - if (test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) + if (test_bit(ICE_VF_STATE_INIT, vf->vf_states)) ice_reset_vf(vf, false); } @@ -1726,7 +3376,7 @@ static struct ice_vsi *ice_find_vsi_from_id(struct ice_pf *pf, u16 id) * * check for the valid VSI ID */ -static bool ice_vc_isvalid_vsi_id(struct ice_vf *vf, u16 vsi_id) +bool ice_vc_isvalid_vsi_id(struct ice_vf *vf, u16 vsi_id) { struct ice_pf *pf = vf->pf; struct ice_vsi *vsi; @@ -1766,474 +3416,5309 @@ static bool ice_vc_isvalid_ring_len(u16 ring_len) !(ring_len % ICE_REQ_DESC_MULTIPLE)); } -/** - * 
ice_vc_config_rss_key - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * +static enum virtchnl_status_code ice_vc_rss_hash_update(struct ice_hw *hw, + struct ice_vsi *vsi, + u8 hash_type) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi_ctx *ctx; + enum ice_status status; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return VIRTCHNL_STATUS_ERR_NO_MEMORY; + + + /* clear previous hash_type */ + ctx->info.q_opt_rss = vsi->info.q_opt_rss & + ~(ICE_AQ_VSI_Q_OPT_RSS_HASH_M); + /* hash_type is passed in as ICE_AQ_VSI_Q_OPT_RSS_* */ + ctx->info.q_opt_rss |= hash_type; + + /* Preserve existing queueing option setting */ + ctx->info.q_opt_tc = vsi->info.q_opt_tc; + ctx->info.q_opt_flags = vsi->info.q_opt_flags; + + ctx->info.valid_sections = + cpu_to_le16(ICE_AQ_VSI_PROP_Q_OPT_VALID); + + status = ice_update_vsi(hw, vsi->idx, ctx, NULL); + if (status) { + dev_err(ice_hw_to_dev(hw), + "update VSI for rss failed, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + } else { + vsi->info.q_opt_rss = ctx->info.q_opt_rss; + } + + kfree(ctx); + + return v_ret; +} + +/** + * ice_pkg_name_to_type - translate DDP package name string to type + * @hw: pointer to the hardware + * + * This is a helper function to translate the DDP package name string + * to ice_pkg_type, in order to select the correct hash list. + */ +enum ice_pkg_type ice_pkg_name_to_type(struct ice_hw *hw) +{ + uint16_t i; + static const struct { + char name[ICE_PKG_NAME_SIZE]; + enum ice_pkg_type pkg_type; + } ice_pkg_type_list[] = { + {"ICE OS Default Package", ICE_PKG_TYPE_OS_DEFAULT}, + {"ICE COMMS Package", ICE_PKG_TYPE_COMMS}, + {"ICE Wireless Edge Package", ICE_PKG_TYPE_WIRELESS_EDGE}, + {"ICE GTP over GRE Package", ICE_PKG_TYPE_GTP_OVER_GRE}, + {"ICE Tencent GRE Package", ICE_PKG_TYPE_OS_DEFAULT}, + }; + + for (i = 0; i < ARRAY_SIZE(ice_pkg_type_list); i++) { + if (!strcmp(ice_pkg_type_list[i].name, + (const char *)hw->active_pkg_name)) + return ice_pkg_type_list[i].pkg_type; + } + + return ICE_PKG_TYPE_UNKNOWN; +}; + +/** + * ice_vc_validate_pattern + * @vf: pointer to the VF info + * @proto: virtchnl protocol headers + * + * validate the pattern is supported or not. + * + * Return: true on success, false on error.
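For example, a pattern of ETH / IPV4 / UDP resolves to ICE_PTYPE_IPV4_UDP_PAY in the switch below and is accepted only if that packet type is enabled by the active DDP package; the final check reduces to:

    /* illustrative: ETH -> IPV4 -> UDP */
    u16 ptype = ICE_PTYPE_IPV4_UDP_PAY;
    bool supported = ice_hw_ptype_ena(&vf->pf->hw, ptype);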
+ */ +bool +ice_vc_validate_pattern(struct ice_vf *vf, struct virtchnl_proto_hdrs *proto) +{ + bool is_l2tpv2 = false; + bool is_ipv4 = false; + bool is_ipv6 = false; + bool is_udp = false; + u16 ptype = -1; + int i = 0; + + while (i < proto->count && + proto->proto_hdr[i].type != VIRTCHNL_PROTO_HDR_NONE) { + switch (proto->proto_hdr[i].type) { + case VIRTCHNL_PROTO_HDR_ETH: + ptype = ICE_PTYPE_MAC_PAY; + break; + case VIRTCHNL_PROTO_HDR_IPV4: + ptype = ICE_PTYPE_IPV4_PAY; + is_ipv4 = true; + break; + case VIRTCHNL_PROTO_HDR_IPV6: + ptype = ICE_PTYPE_IPV6_PAY; + is_ipv6 = true; + break; + case VIRTCHNL_PROTO_HDR_UDP: + if (is_ipv4) + ptype = ICE_PTYPE_IPV4_UDP_PAY; + else if (is_ipv6) + ptype = ICE_PTYPE_IPV6_UDP_PAY; + is_udp = true; + break; + case VIRTCHNL_PROTO_HDR_TCP: + if (is_ipv4) + ptype = ICE_PTYPE_IPV4_TCP_PAY; + else if (is_ipv6) + ptype = ICE_PTYPE_IPV6_TCP_PAY; + break; + case VIRTCHNL_PROTO_HDR_SCTP: + if (is_ipv4) + ptype = ICE_PTYPE_IPV4_SCTP_PAY; + else if (is_ipv6) + ptype = ICE_PTYPE_IPV6_SCTP_PAY; + break; + case VIRTCHNL_PROTO_HDR_L2TPV2: + if (is_ipv4) + ptype = ICE_MAC_IPV4_L2TPV2; + else if (is_ipv6) + ptype = ICE_MAC_IPV6_L2TPV2; + is_l2tpv2 = true; + break; + case VIRTCHNL_PROTO_HDR_GTPU_IP: + case VIRTCHNL_PROTO_HDR_GTPU_EH: + case VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_DWN: + case VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_UP: + if (is_ipv4) + ptype = ICE_MAC_IPV4_GTPU; + else if (is_ipv6) + ptype = ICE_MAC_IPV6_GTPU; + goto out; + case VIRTCHNL_PROTO_HDR_L2TPV3: + if (is_ipv4) + ptype = ICE_MAC_IPV4_L2TPV3; + else if (is_ipv6) + ptype = ICE_MAC_IPV6_L2TPV3; + goto out; + case VIRTCHNL_PROTO_HDR_ESP: + if (is_ipv4) + ptype = is_udp ? ICE_MAC_IPV4_NAT_T_ESP : + ICE_MAC_IPV4_ESP; + else if (is_ipv6) + ptype = is_udp ? ICE_MAC_IPV6_NAT_T_ESP : + ICE_MAC_IPV6_ESP; + goto out; + case VIRTCHNL_PROTO_HDR_AH: + if (is_ipv4) + ptype = ICE_MAC_IPV4_AH; + else if (is_ipv6) + ptype = ICE_MAC_IPV6_AH; + goto out; + case VIRTCHNL_PROTO_HDR_PFCP: + if (is_ipv4) + ptype = ICE_MAC_IPV4_PFCP_SESSION; + else if (is_ipv6) + ptype = ICE_MAC_IPV6_PFCP_SESSION; + goto out; + case VIRTCHNL_PROTO_HDR_ECPRI: + if (is_ipv4) + ptype = ICE_PTYPE_IPV4_UDP_PAY; + else if (is_ipv6) + ptype = ICE_PTYPE_IPV6_UDP_PAY; + goto out; + case VIRTCHNL_PROTO_HDR_PPP: + if (is_ipv4 && is_l2tpv2) + ptype = ICE_MAC_IPV4_PPPOL2TPV2; + else if (is_ipv6 && is_l2tpv2) + ptype = ICE_MAC_IPV6_PPPOL2TPV2; + goto out; + default: + break; + } + i++; + } + +out: + return ice_hw_ptype_ena(&vf->pf->hw, ptype); +} + +/** + * ice_vc_parse_rss_cfg - parses hash fields and headers from + * a specific virtchnl RSS cfg + * @hw: pointer to the hardware + * @rss_cfg: pointer to the virtchnl rss cfg + * @hash_cfg: pointer to the HW hash configuration + * + * Return true if all the protocol header and hash fields in the rss cfg could + * be parsed, else return false + * + * This function parses the virtchnl rss cfg to be the intended + * hash fields and the intended header for RSS configuration + */ +static bool ice_vc_parse_rss_cfg(struct ice_hw *hw, + struct virtchnl_rss_cfg *rss_cfg, + struct ice_rss_hash_cfg *hash_cfg) +{ + const struct ice_vc_hash_field_match_type *hf_list; + const struct ice_vc_hdr_match_type *hdr_list; + int i, hf_list_len, hdr_list_len; + bool outer_ipv4 = false; + bool outer_ipv6 = false; + bool inner_hdr = false; + + u32 *addl_hdrs = &hash_cfg->addl_hdrs; + u64 *hash_flds = &hash_cfg->hash_flds; + /* set outer layer RSS as default */ + hash_cfg->hdr_type = ICE_RSS_OUTER_HEADERS; + + hf_list = ice_vc_hash_field_list; + 
hf_list_len = ARRAY_SIZE(ice_vc_hash_field_list); + hdr_list = ice_vc_hdr_list; + hdr_list_len = ARRAY_SIZE(ice_vc_hdr_list); + + for (i = 0; i < rss_cfg->proto_hdrs.count; i++) { + struct virtchnl_proto_hdr *proto_hdr = + &rss_cfg->proto_hdrs.proto_hdr[i]; + u32 hdr_found = 0; + int j; + + /* find matched ice headers according to virtchnl headers. + * Also figure out the outer type of GTPU headers. + */ + for (j = 0; j < hdr_list_len; j++) { + struct ice_vc_hdr_match_type hdr_map = + hdr_list[j]; + + if (proto_hdr->type == hdr_map.vc_hdr) + hdr_found = hdr_map.ice_hdr; + } + + if (!hdr_found) + return false; + + if (proto_hdr->type == VIRTCHNL_PROTO_HDR_IPV4 && !inner_hdr) + outer_ipv4 = true; + else if (proto_hdr->type == VIRTCHNL_PROTO_HDR_IPV6 && + !inner_hdr) + outer_ipv6 = true; + /* for GTPU and L2TPv2, take inner header as input set if no + * any field is selected from outer headers. + */ + else if ((proto_hdr->type == VIRTCHNL_PROTO_HDR_L2TPV2 || + proto_hdr->type == VIRTCHNL_PROTO_HDR_GTPU_IP || + proto_hdr->type == VIRTCHNL_PROTO_HDR_GTPU_EH || + proto_hdr->type == VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_DWN || + proto_hdr->type == VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_UP) && + *hash_flds == 0) { + /* set inner_hdr flag, and clean up outer header */ + inner_hdr = true; + + /* clear outer headers */ + *addl_hdrs = 0; + + if (outer_ipv4 && outer_ipv6) + return false; + + if (outer_ipv4) + hash_cfg->hdr_type = ICE_RSS_INNER_HEADERS_W_OUTER_IPV4; + else if (outer_ipv6) + hash_cfg->hdr_type = ICE_RSS_INNER_HEADERS_W_OUTER_IPV6; + else + hash_cfg->hdr_type = ICE_RSS_INNER_HEADERS; + } + + *addl_hdrs |= hdr_found; + + /* Find matched ice hash fields according to + * virtchnl hash fields. + */ + for (j = 0; j < hf_list_len; j++) { + struct ice_vc_hash_field_match_type hf_map = + hf_list[j]; + + if (proto_hdr->type == hf_map.vc_hdr && + proto_hdr->field_selector == + hf_map.vc_hash_field) { + *hash_flds |= hf_map.ice_hash_field; + break; + } + } + } + + /* refine gtpu header if we take outer as input set for a no inner + * ip gtpu flow. + */ + if (hash_cfg->hdr_type == ICE_RSS_OUTER_HEADERS && + *addl_hdrs & ICE_FLOW_SEG_HDR_GTPU_IP) { + *addl_hdrs &= ~(ICE_FLOW_SEG_HDR_GTPU_IP); + *addl_hdrs |= ICE_FLOW_SEG_HDR_GTPU_NON_IP; + } + + /* refine hash field for esp and nat-t-esp. */ + if ((*addl_hdrs & ICE_FLOW_SEG_HDR_UDP) && + (*addl_hdrs & ICE_FLOW_SEG_HDR_ESP)) { + *addl_hdrs &= ~(ICE_FLOW_SEG_HDR_ESP | ICE_FLOW_SEG_HDR_UDP); + *addl_hdrs |= ICE_FLOW_SEG_HDR_NAT_T_ESP; + *hash_flds &= ~(BIT_ULL(ICE_FLOW_FIELD_IDX_ESP_SPI)); + *hash_flds |= BIT_ULL(ICE_FLOW_FIELD_IDX_NAT_T_ESP_SPI); + } + + /* refine hash hdrs for L4 udp/tcp/sctp. 
*/ + if (*addl_hdrs & (ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_UDP | + ICE_FLOW_SEG_HDR_SCTP) && + *addl_hdrs & ICE_FLOW_SEG_HDR_IPV_OTHER) + *addl_hdrs &= ~ICE_FLOW_SEG_HDR_IPV_OTHER; + + /* refine hash field for ecpri over mac or udp */ + if ((*addl_hdrs & ICE_FLOW_SEG_HDR_ECPRI_TP0) && + (*addl_hdrs & ICE_FLOW_SEG_HDR_UDP)) { + *addl_hdrs &= ~ICE_FLOW_SEG_HDR_ECPRI_TP0; + *hash_flds &= ~(BIT_ULL(ICE_FLOW_FIELD_IDX_ECPRI_TP0_PC_ID)); + } else if (*addl_hdrs & ICE_FLOW_SEG_HDR_ECPRI_TP0) { + *addl_hdrs &= ~ICE_FLOW_SEG_HDR_UDP_ECPRI_TP0; + *hash_flds &= + ~(BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_ECPRI_TP0_PC_ID)); + } + + return true; +} + +/** + * ice_vf_adv_rss_offload_ena - determine if capabilities support advanced + * rss offloads + * @caps: VF driver negotiated capabilities + * + * Return true if VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF capability is set, + * else return false + */ +static bool ice_vf_adv_rss_offload_ena(u32 caps) +{ + return !!(caps & VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF); +} + +/** + * is_hash_cfg_valid - check if the hash context is valid + * @cfg: pointer to the RSS hash configuration + * + * This function will return true if the hash context is valid, otherwise + * return false. + */ +static bool is_hash_cfg_valid(struct ice_rss_hash_cfg *cfg) +{ + return (cfg->hash_flds != 0 && cfg->addl_hdrs != 0) ? + true : false; +} + +/** + * hash_cfg_reset - reset the hash context + * @cfg: pointer to the RSS hash configuration + * + * This function will reset the hash context which stores the valid rule info. + */ +static void hash_cfg_reset(struct ice_rss_hash_cfg *cfg) +{ + cfg->hash_flds = 0; + cfg->addl_hdrs = 0; + cfg->hdr_type = ICE_RSS_OUTER_HEADERS; + cfg->symm = 0; +} + +/** + * hash_cfg_record - record the hash context + * @ctx: pointer to the global RSS hash configuration + * @cfg: pointer to the RSS hash configuration to be recorded + * + * This function will record the hash context which stores the valid rule info. + */ +static void hash_cfg_record(struct ice_rss_hash_cfg *ctx, + struct ice_rss_hash_cfg *cfg) +{ + ctx->hash_flds = cfg->hash_flds; + ctx->addl_hdrs = cfg->addl_hdrs; + ctx->hdr_type = cfg->hdr_type; + ctx->symm = cfg->symm; +} + +/** + * ice_hash_moveout - delete a RSS configuration + * @vf: pointer to the VF info + * @cfg: pointer to the RSS hash configuration + * + * This function will delete an existing RSS hash configuration but not delete + * the hash context which stores the rule info. + */ +static int +ice_hash_moveout(struct ice_vf *vf, struct ice_rss_hash_cfg *cfg) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + enum ice_status status = 0; + struct ice_hw *hw = &vf->pf->hw; + + if (!is_hash_cfg_valid(cfg)) + return -ENOENT; + + status = ice_rem_rss_cfg(hw, vf->lan_vsi_idx, cfg); + if (status && status != ICE_ERR_DOES_NOT_EXIST) { + dev_err(dev, "ice_rem_rss_cfg failed for VSI:%d, error:%s\n", + vf->lan_vsi_num, ice_stat_str(status)); + return -EBUSY; + } + + return 0; +} + +/** + * ice_hash_moveback - add an RSS configuration + * @vf: pointer to the VF info + * @cfg: pointer to the RSS hash configuration + * + * The function will add a RSS hash configuration if the hash context is valid. 
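Together, moveout/remove/moveback implement the rollback described here: moveout deletes the HW rule but keeps the recorded context, remove deletes both, and moveback re-adds a rule from a kept context. For example, when a GTPU_UP rule is added while GTPU_EH rules exist, ice_add_rss_cfg_pre_gtpu()/ice_add_rss_cfg_post_gtpu() below drive roughly this sequence (error handling omitted):

    /* pre-processing: temporarily pull the GTPU_EH rules out of HW */
    ice_hash_moveout(vf, &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP]);
    ice_hash_moveout(vf, &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_UDP]);
    ice_hash_moveout(vf, &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_TCP]);

    /* ... the new GTPU_UP rule is programmed here ... */

    /* post-processing: replay the GTPU_EH rules so they land after it */
    ice_hash_moveback(vf, &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP]);
    ice_hash_moveback(vf, &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_UDP]);
    ice_hash_moveback(vf, &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_TCP]);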
+ */ +static int +ice_hash_moveback(struct ice_vf *vf, struct ice_rss_hash_cfg *cfg) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + enum ice_status status = 0; + struct ice_hw *hw = &vf->pf->hw; + + if (!is_hash_cfg_valid(cfg)) + return -ENOENT; + + status = ice_add_rss_cfg(hw, vf->lan_vsi_idx, cfg); + if (status) { + dev_err(dev, "ice_add_rss_cfg failed for VSI:%d, error:%s\n", + vf->lan_vsi_num, ice_stat_str(status)); + return -EBUSY; + } + + return 0; +} + +/** + * ice_hash_remove - remove a RSS configuration + * @vf: pointer to the VF info + * @cfg: pointer to the RSS hash configuration + * + * This function will delete a RSS hash configuration and also delete the + * hash context which stores the rule info. + */ +static int +ice_hash_remove(struct ice_vf *vf, struct ice_rss_hash_cfg *cfg) +{ + int ret; + + ret = ice_hash_moveout(vf, cfg); + if (ret && (ret != -ENOENT)) + return ret; + + hash_cfg_reset(cfg); + + return 0; +} + +/** + * ice_add_rss_cfg_pre_gtpu - pre-process the GTPU RSS configuration + * @vf: pointer to the VF info + * @ctx: pointer to the context of the GTPU hash + * @ctx_idx: The index of the hash context + * + * This function pre-process the GTPU hash configuration before adding a hash + * config, it will remove or rotate some prior hash configs which will cause + * conflicts. For example, if a GTPU_UP/DWN rule be configured after a GTPU_EH + * rule, the GTPU_EH hash will be hit at first due to TCAM write sequence from + * top to down, and the hash hit sequence also from top to down. So the + * GTPU_EH rule need roolback to the later of the GTPU_UP/DWN rule. On the + * other hand, when a GTPU_EH rule be configured after a GTPU_UP/DWN rule, + * just need to remove the GTPU_DWN/UP rules. + */ +static int +ice_add_rss_cfg_pre_gtpu(struct ice_vf *vf, struct ice_vf_hash_gtpu_ctx *ctx, + u32 ctx_idx) +{ + int ret; + + switch (ctx_idx) { + case ICE_HASH_GTPU_CTX_EH_IP: + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + break; + case ICE_HASH_GTPU_CTX_EH_IP_UDP: + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + 
&ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + break; + case ICE_HASH_GTPU_CTX_EH_IP_TCP: + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + break; + case ICE_HASH_GTPU_CTX_UP_IP: + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + break; + case ICE_HASH_GTPU_CTX_UP_IP_UDP: + case ICE_HASH_GTPU_CTX_UP_IP_TCP: + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + break; + case ICE_HASH_GTPU_CTX_DW_IP: + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + break; + case ICE_HASH_GTPU_CTX_DW_IP_UDP: + case ICE_HASH_GTPU_CTX_DW_IP_TCP: + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + break; + default: + break; + } + + return 0; +} + +/** + * ice_add_rss_cfg_pre_ip - pre-process the IP RSS configuration + * @vf: pointer to the VF info + * @ctx: pointer to the context of the IP L4 hash + * + * This function will remove all covered and recorded IP RSS configurations, + * including IP with ESP/UDP_ESP/AH/L2TPV3/PFCP and UDP/TCP/SCTP. 
+ */ +static int +ice_add_rss_cfg_pre_ip(struct ice_vf *vf, struct ice_vf_hash_ip_ctx *ctx) +{ + int i, ret; + + for (i = 1; i < ICE_HASH_IP_CTX_MAX; i++) + if (is_hash_cfg_valid(&ctx->ctx[i])) { + ret = ice_hash_remove(vf, &ctx->ctx[i]); + + if (ret) + return ret; + } + + return 0; +} + +/** + * calc_gtpu_ctx_idx - calculate the index of the GTPU hash context + * @hdrs: the protocol headers prefix with ICE_FLOW_SEG_HDR_XXX. + * + * The GTPU hash context use the index to classify for IPV4/IPV6 and + * GTPU_EH/GTPU_UP/GTPU_DWN, this function used to calculate the index + * by the protocol headers. + */ +static u32 calc_gtpu_ctx_idx(u32 hdrs) +{ + u32 eh_idx, ip_idx; + + if (hdrs & ICE_FLOW_SEG_HDR_GTPU_EH) + eh_idx = 0; + else if (hdrs & ICE_FLOW_SEG_HDR_GTPU_UP) + eh_idx = 1; + else if (hdrs & ICE_FLOW_SEG_HDR_GTPU_DWN) + eh_idx = 2; + else + return ICE_HASH_GTPU_CTX_MAX; + + ip_idx = 0; + if (hdrs & ICE_FLOW_SEG_HDR_UDP) + ip_idx = 1; + else if (hdrs & ICE_FLOW_SEG_HDR_TCP) + ip_idx = 2; + + if (hdrs & (ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV6)) + return eh_idx * 3 + ip_idx; + else + return ICE_HASH_GTPU_CTX_MAX; +} + +/** + * ice_map_ip_ctx_idx - map the index of the IP L4 hash context + * @hdrs: protocol headers prefix with ICE_FLOW_SEG_HDR_XXX. + * + * The IP L4 hash context use the index to classify for IPv4/IPv6 with + * ESP/UDP_ESP/AH/L2TPV3/PFCP and non-tunnel UDP/TCP/SCTP + * this function map the index based on the protocol headers. + */ +static u8 ice_map_ip_ctx_idx(u32 hdrs) +{ + u8 i; + + static struct { + u32 hdrs; + u8 ctx_idx; + } ip_ctx_idx_map[] = { + { ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_ESP, + ICE_HASH_IP_CTX_IP_ESP }, + { ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_NAT_T_ESP, + ICE_HASH_IP_CTX_IP_UDP_ESP }, + { ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_AH, + ICE_HASH_IP_CTX_IP_AH }, + { ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_L2TPV3, + ICE_HASH_IP_CTX_IP_L2TPV3 }, + { ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_PFCP_SESSION, + ICE_HASH_IP_CTX_IP_PFCP }, + { ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_VLAN | + ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_UDP, + ICE_HASH_IP_CTX_IP_UDP }, + { ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_VLAN | + ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_TCP, + ICE_HASH_IP_CTX_IP_TCP }, + { ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_VLAN | + ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_SCTP, + ICE_HASH_IP_CTX_IP_SCTP }, + { ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_VLAN | + ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV_OTHER, + ICE_HASH_IP_CTX_IP }, + { ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_ESP, + ICE_HASH_IP_CTX_IP_ESP }, + { ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_NAT_T_ESP, + ICE_HASH_IP_CTX_IP_UDP_ESP }, + { ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_AH, + ICE_HASH_IP_CTX_IP_AH }, + { ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_L2TPV3, + ICE_HASH_IP_CTX_IP_L2TPV3 }, + { ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_PFCP_SESSION, + ICE_HASH_IP_CTX_IP_PFCP }, + { ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_VLAN | + ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_UDP, + ICE_HASH_IP_CTX_IP_UDP }, + { ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_VLAN | + ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_TCP, + ICE_HASH_IP_CTX_IP_TCP }, + { ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_VLAN | 
+ ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_SCTP, + ICE_HASH_IP_CTX_IP_SCTP }, + { ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_VLAN | + ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_IPV_OTHER, + ICE_HASH_IP_CTX_IP }, + /* the remaining mappings are used for default RSS */ + { ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_UDP, + ICE_HASH_IP_CTX_IP_UDP }, + { ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_TCP, + ICE_HASH_IP_CTX_IP_TCP }, + { ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_SCTP, + ICE_HASH_IP_CTX_IP_SCTP }, + { ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV_OTHER, + ICE_HASH_IP_CTX_IP }, + { ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_UDP, + ICE_HASH_IP_CTX_IP_UDP }, + { ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_TCP, + ICE_HASH_IP_CTX_IP_TCP }, + { ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_SCTP, + ICE_HASH_IP_CTX_IP_SCTP }, + { ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_IPV_OTHER, + ICE_HASH_IP_CTX_IP }, + }; + + for (i = 0; i < ARRAY_SIZE(ip_ctx_idx_map); i++) { + if (hdrs == ip_ctx_idx_map[i].hdrs) + return ip_ctx_idx_map[i].ctx_idx; + } + + return ICE_HASH_IP_CTX_MAX; +} + +static int +ice_add_rss_cfg_pre(struct ice_vf *vf, struct ice_rss_hash_cfg *cfg) +{ + u32 ice_gtpu_ctx_idx = calc_gtpu_ctx_idx(cfg->addl_hdrs); + + u8 ip_ctx_idx = ice_map_ip_ctx_idx(cfg->addl_hdrs); + + if (ip_ctx_idx == ICE_HASH_IP_CTX_IP) { + int ret = 0; + + if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV4) + ret = ice_add_rss_cfg_pre_ip(vf, &vf->hash_ctx.v4); + else if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV6) + ret = ice_add_rss_cfg_pre_ip(vf, &vf->hash_ctx.v6); + + if (ret) + return ret; + } + + if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV4) { + return ice_add_rss_cfg_pre_gtpu(vf, &vf->hash_ctx.ipv4, + ice_gtpu_ctx_idx); + } else if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV6) { + return ice_add_rss_cfg_pre_gtpu(vf, &vf->hash_ctx.ipv6, + ice_gtpu_ctx_idx); + } + + return 0; +} + +/** + * ice_add_rss_cfg_post_gtpu - A wrap function of deleting an RSS configuration + * @vf: pointer to the VF info + * @ctx: pointer to the context of the GTPU hash + * @cfg: pointer to the rss hash configuration + * @ctx_idx: The index of the hash context + * + * This function post process the hash configuration after the hash config is + * successfully adding, it will re-configure the prior hash config which was + * moveout but need to moveback again. 
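A worked example for calc_gtpu_ctx_idx() above: headers GTPU_EH | IPV4 | UDP give eh_idx = 0 and ip_idx = 1, so the context slot is 0 * 3 + 1 = 1 (assuming the ICE_HASH_GTPU_CTX_* enumerators follow the eh * 3 + ip ordering implied by that helper):

    /* illustrative only: GTPU_EH over IPv4/UDP */
    u32 hdrs = ICE_FLOW_SEG_HDR_GTPU_EH | ICE_FLOW_SEG_HDR_IPV4 |
               ICE_FLOW_SEG_HDR_UDP;
    u32 idx = calc_gtpu_ctx_idx(hdrs);  /* 0 * 3 + 1 == ICE_HASH_GTPU_CTX_EH_IP_UDP */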
+ */ +static int +ice_add_rss_cfg_post_gtpu(struct ice_vf *vf, struct ice_vf_hash_gtpu_ctx *ctx, + struct ice_rss_hash_cfg *cfg, u32 ctx_idx) +{ + int ret; + + if (ctx_idx < ICE_HASH_GTPU_CTX_MAX) { + ctx->ctx[ctx_idx].addl_hdrs = cfg->addl_hdrs; + ctx->ctx[ctx_idx].hash_flds = cfg->hash_flds; + ctx->ctx[ctx_idx].hdr_type = cfg->hdr_type; + ctx->ctx[ctx_idx].symm = cfg->symm; + } + + switch (ctx_idx) { + case ICE_HASH_GTPU_CTX_EH_IP: + break; + case ICE_HASH_GTPU_CTX_EH_IP_UDP: + ret = ice_hash_moveback(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveback(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveback(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveback(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + break; + case ICE_HASH_GTPU_CTX_EH_IP_TCP: + ret = ice_hash_moveback(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveback(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveback(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveback(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + break; + case ICE_HASH_GTPU_CTX_UP_IP: + case ICE_HASH_GTPU_CTX_UP_IP_UDP: + case ICE_HASH_GTPU_CTX_UP_IP_TCP: + case ICE_HASH_GTPU_CTX_DW_IP: + case ICE_HASH_GTPU_CTX_DW_IP_UDP: + case ICE_HASH_GTPU_CTX_DW_IP_TCP: + ret = ice_hash_moveback(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveback(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveback(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + break; + default: + break; + } + + return 0; +} + +static int +ice_add_rss_cfg_post(struct ice_vf *vf, struct ice_rss_hash_cfg *cfg) +{ + u32 ice_gtpu_ctx_idx = calc_gtpu_ctx_idx(cfg->addl_hdrs); + + u8 ip_ctx_idx = ice_map_ip_ctx_idx(cfg->addl_hdrs); + + if (ip_ctx_idx && ip_ctx_idx < ICE_HASH_IP_CTX_MAX) { + if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV4) + hash_cfg_record(&vf->hash_ctx.v4.ctx[ip_ctx_idx], cfg); + else if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV6) + hash_cfg_record(&vf->hash_ctx.v6.ctx[ip_ctx_idx], cfg); + } + + if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV4) { + return ice_add_rss_cfg_post_gtpu(vf, &vf->hash_ctx.ipv4, + cfg, ice_gtpu_ctx_idx); + } else if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV6) { + return ice_add_rss_cfg_post_gtpu(vf, &vf->hash_ctx.ipv6, + cfg, ice_gtpu_ctx_idx); + } + + return 0; +} + +/** + * ice_rem_rss_cfg_post - post-process the RSS configuration + * @vf: pointer to the VF info + * @cfg: pointer to the RSS hash configuration + * + * This function post-process the RSS hash configuration after deleting a hash + * config. Such as, it will reset the hash context for the GTPU hash. 
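+ * The context slot to clear is looked up with ice_map_ip_ctx_idx() and + * calc_gtpu_ctx_idx(), and reset with hash_cfg_reset().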
+ */ +static void +ice_rem_rss_cfg_post(struct ice_vf *vf, struct ice_rss_hash_cfg *cfg) +{ + u32 ice_gtpu_ctx_idx = calc_gtpu_ctx_idx(cfg->addl_hdrs); + + u8 ip_ctx_idx = ice_map_ip_ctx_idx(cfg->addl_hdrs); + + if (ip_ctx_idx && ip_ctx_idx < ICE_HASH_IP_CTX_MAX) { + if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV4) + hash_cfg_reset(&vf->hash_ctx.v4.ctx[ip_ctx_idx]); + else if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV6) + hash_cfg_reset(&vf->hash_ctx.v6.ctx[ip_ctx_idx]); + } + + if (ice_gtpu_ctx_idx >= ICE_HASH_GTPU_CTX_MAX) + return; + + if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV4) + hash_cfg_reset(&vf->hash_ctx.ipv4.ctx[ice_gtpu_ctx_idx]); + else if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV6) + hash_cfg_reset(&vf->hash_ctx.ipv6.ctx[ice_gtpu_ctx_idx]); +} + +/** + * ice_rem_rss_cfg_wrap - wrapper function for deleting an RSS configuration + * @vf: pointer to the VF info + * @cfg: pointer to the RSS hash configuration + * + * Wrapper function to delete a flow profile based on an RSS configuration, and + * also post-process the hash context based on the rollback mechanism used by + * ice_add_rss_cfg_wrap to handle rule conflicts. + */ +static enum ice_status +ice_rem_rss_cfg_wrap(struct ice_vf *vf, struct ice_rss_hash_cfg *cfg) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + enum ice_status status = 0; + struct ice_hw *hw = &vf->pf->hw; + + status = ice_rem_rss_cfg(hw, vf->lan_vsi_idx, cfg); + /* We just ignore ICE_ERR_DOES_NOT_EXIST because if two configurations + * share the same profile, removing one of them actually removes both, + * since the profile is deleted. + */ + if (status && status != ICE_ERR_DOES_NOT_EXIST) { + dev_err(dev, "ice_rem_rss_cfg failed for VSI:%d, error:%s\n", + vf->lan_vsi_num, ice_stat_str(status)); + goto error; + } + + ice_rem_rss_cfg_post(vf, cfg); + +error: + return status; +} + +/** + * ice_add_rss_cfg_wrap - wrapper function for adding an RSS configuration + * @vf: pointer to the VF info + * @cfg: pointer to the RSS hash configuration + * + * Wrapper function to add a flow profile based on an RSS configuration, and + * also use a rollback mechanism to handle rule conflicts caused by the + * top-to-bottom TCAM write sequence. + */ +static enum ice_status +ice_add_rss_cfg_wrap(struct ice_vf *vf, struct ice_rss_hash_cfg *cfg) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + enum ice_status status = 0; + struct ice_hw *hw = &vf->pf->hw; + + if (ice_add_rss_cfg_pre(vf, cfg)) + return ICE_ERR_PARAM; + + status = ice_add_rss_cfg(hw, vf->lan_vsi_idx, cfg); + if (status) { + dev_err(dev, "ice_add_rss_cfg failed for VSI:%d, error:%s\n", + vf->lan_vsi_num, ice_stat_str(status)); + goto error; + } + + if (ice_add_rss_cfg_post(vf, cfg)) + status = ICE_ERR_PARAM; + +error: + return status; +} + +/** + * ice_vc_handle_rss_cfg + * @vf: pointer to the VF info + * @msg: pointer to the message buffer + * @add: add an RSS config if true, otherwise delete an RSS config + * + * This function adds/deletes an RSS config + */ +static int ice_vc_handle_rss_cfg(struct ice_vf *vf, u8 *msg, bool add) +{ + struct virtchnl_rss_cfg *rss_cfg = (struct virtchnl_rss_cfg *)msg; + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + u32 v_opcode = add ?
VIRTCHNL_OP_ADD_RSS_CFG : + VIRTCHNL_OP_DEL_RSS_CFG; + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_hw *hw = &vf->pf->hw; + struct ice_vsi *vsi; + + if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) { + dev_dbg(dev, "VF %d attempting to configure RSS, but RSS is not supported by the PF\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto error_param; + } + + if (!ice_vf_adv_rss_offload_ena(vf->driver_caps)) { + dev_dbg(dev, "VF %d attempting to configure RSS, but Advanced rss offload is not supported\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (rss_cfg->proto_hdrs.count > VIRTCHNL_MAX_NUM_PROTO_HDRS || + rss_cfg->rss_algorithm < VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC || + rss_cfg->rss_algorithm > VIRTCHNL_RSS_ALG_XOR_SYMMETRIC) { + dev_dbg(dev, "VF %d attempting to configure RSS, but RSS configuration is not valid\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_validate_pattern(vf, &rss_cfg->proto_hdrs)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (rss_cfg->rss_algorithm == VIRTCHNL_RSS_ALG_R_ASYMMETRIC) { + u8 hash_type = add ? ICE_AQ_VSI_Q_OPT_RSS_XOR : ICE_AQ_VSI_Q_OPT_RSS_TPLZ; + + v_ret = ice_vc_rss_hash_update(hw, vsi, hash_type); + } else { + struct ice_rss_hash_cfg cfg; + u8 hash_type; + + cfg.addl_hdrs = ICE_FLOW_SEG_HDR_NONE; + cfg.hash_flds = ICE_HASH_INVALID; + cfg.hdr_type = ICE_RSS_ANY_HEADERS; + + hash_type = add ? ICE_AQ_VSI_Q_OPT_RSS_SYM_TPLZ : + ICE_AQ_VSI_Q_OPT_RSS_TPLZ; + + v_ret = ice_vc_rss_hash_update(hw, vsi, hash_type); + if (v_ret) + goto error_param; + + if (!ice_vc_parse_rss_cfg(hw, rss_cfg, &cfg)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (add) { + if (rss_cfg->rss_algorithm == + VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC) + cfg.symm = true; + else + cfg.symm = false; + + if (ice_add_rss_cfg_wrap(vf, &cfg)) + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + } else { + if (ice_rem_rss_cfg_wrap(vf, &cfg)) + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + } + } + +error_param: + return ice_vc_send_msg_to_vf(vf, v_opcode, v_ret, NULL, 0); +} + +/** + * ice_vc_config_rss_key + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * * Configure the VF's RSS key */ -static int ice_vc_config_rss_key(struct ice_vf *vf, u8 *msg) +static int ice_vc_config_rss_key(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_rss_key *vrk = + (struct virtchnl_rss_key *)msg; + struct ice_vsi *vsi; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_isvalid_vsi_id(vf, vrk->vsi_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (vrk->key_len != ICE_VSIQF_HKEY_ARRAY_SIZE) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (ice_set_rss_key(vsi, vrk->key)) + v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; +error_param: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_RSS_KEY, v_ret, + NULL, 0); +} + +/** + * 
ice_vc_config_rss_lut + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * Configure the VF's RSS LUT + */ +static int ice_vc_config_rss_lut(struct ice_vf *vf, u8 *msg) +{ + struct virtchnl_rss_lut *vrl = (struct virtchnl_rss_lut *)msg; + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_isvalid_vsi_id(vf, vrl->vsi_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (vrl->lut_entries != vsi->rss_table_size) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (ice_set_rss_lut(vsi, vrl->lut, vrl->lut_entries)) + v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; +error_param: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_RSS_LUT, v_ret, + NULL, 0); +} + +/** + * ice_wait_on_vf_reset - poll to make sure a given VF is ready after reset + * @vf: the VF being reset + * + * The max poll time is about ~800ms, which is about the maximum time it takes + * for a VF to be reset and/or a VF driver to be removed. + */ +static void ice_wait_on_vf_reset(struct ice_vf *vf) +{ + int i; + + for (i = 0; i < ICE_MAX_VF_RESET_TRIES; i++) { + if (test_bit(ICE_VF_STATE_INIT, vf->vf_states)) + break; + msleep(ICE_MAX_VF_RESET_SLEEP_MS); + } +} + +/** + * ice_check_vf_ready_for_cfg - check if VF is ready to be configured/queried + * @vf: VF to check if it's ready to be configured/queried + * + * The purpose of this function is to make sure the VF is not in reset, not + * disabled, and initialized so it can be configured and/or queried by a host + * administrator. + */ +int ice_check_vf_ready_for_cfg(struct ice_vf *vf) +{ + struct ice_pf *pf; + + ice_wait_on_vf_reset(vf); + + if (ice_is_vf_disabled(vf)) + return -EINVAL; + + pf = vf->pf; + if (ice_check_vf_init(pf, vf)) + return -EBUSY; + + return 0; +} + +/** + * ice_set_vf_spoofchk + * @netdev: network interface device structure + * @vf_id: VF identifier + * @ena: flag to enable or disable feature + * + * Enable or disable VF spoof checking + */ +int ice_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool ena) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_pf *pf = np->vsi->back; + struct ice_vsi *vf_vsi; + struct device *dev; + struct ice_vf *vf; + int ret; + + dev = ice_pf_to_dev(pf); + if (ice_validate_vf_id(pf, vf_id)) + return -EINVAL; + + vf = &pf->vf[vf_id]; + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; + + vf_vsi = ice_get_vf_vsi(vf); + if (!vf_vsi) { + netdev_err(netdev, "VSI %d for VF %d is null\n", + vf->lan_vsi_idx, vf->vf_id); + return -EINVAL; + } + + if (vf_vsi->type != ICE_VSI_VF) { + netdev_err(netdev, "Type %d of VSI %d for VF %d is not ICE_VSI_VF\n", + vf_vsi->type, vf_vsi->vsi_num, vf->vf_id); + return -ENODEV; + } + + if (ena == vf->spoofchk) { + dev_dbg(dev, "VF spoofchk already %s\n", ena ? "ON" : "OFF"); + return 0; + } + + if (ena) + ret = ice_vsi_ena_spoofchk(vf_vsi); + else + ret = ice_vsi_dis_spoofchk(vf_vsi); + if (ret) + dev_err(dev, "Failed to set spoofchk %s for VF %d VSI %d, error %d\n", + ena ?
"ON" : "OFF", vf->vf_id, vf_vsi->vsi_num, ret); + else + vf->spoofchk = ena; + + return ret; +} + +/** + * ice_is_any_vf_in_promisc - check if any VF(s) are in promiscuous mode + * @pf: PF structure for accessing VF(s) + * + * Return false if no VF(s) are in unicast and/or multicast promiscuous mode, + * else return true + */ +bool ice_is_any_vf_in_promisc(struct ice_pf *pf) +{ + int vf_idx; + + ice_for_each_vf(pf, vf_idx) { + struct ice_vf *vf = &pf->vf[vf_idx]; + + /* found a VF that has promiscuous mode configured */ + if (test_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states) || + test_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) + return true; + } + + return false; +} + +/** + * ice_vc_cfg_promiscuous_mode_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * called from the VF to configure VF VSIs promiscuous mode + */ +static int ice_vc_cfg_promiscuous_mode_msg(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + bool rm_promisc, alluni = false, allmulti = false; + struct virtchnl_promisc_info *info = + (struct virtchnl_promisc_info *)msg; + struct ice_vsi_vlan_ops *vlan_ops; + int mcast_err = 0, ucast_err = 0; + struct ice_pf *pf = vf->pf; + struct ice_vsi *vsi; + struct device *dev; + int ret = 0; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_isvalid_vsi_id(vf, info->vsi_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + dev = ice_pf_to_dev(pf); + if (!test_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) { + dev_err(dev, "Unprivileged VF %d is attempting to configure promiscuous mode\n", + vf->vf_id); + /* Leave v_ret alone, lie to the VF on purpose. */ + goto error_param; + } + + if (info->flags & FLAG_VF_UNICAST_PROMISC) + alluni = true; + + if (info->flags & FLAG_VF_MULTICAST_PROMISC) + allmulti = true; + + rm_promisc = !allmulti && !alluni; + + vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); + if (rm_promisc) + ret = vlan_ops->ena_rx_filtering(vsi); + else + ret = vlan_ops->dis_rx_filtering(vsi); + if (ret) { + dev_err(dev, "Failed to configure VLAN pruning in promiscuous mode\n"); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!test_bit(ICE_FLAG_VF_TRUE_PROMISC_ENA, pf->flags)) { + bool set_dflt_vsi = alluni || allmulti; + + if (set_dflt_vsi && !ice_is_dflt_vsi_in_use(vsi->vsw)) + /* only attempt to set the default forwarding VSI if + * it's not currently set + */ + ret = ice_set_dflt_vsi(vsi->vsw, vsi); + else if (!set_dflt_vsi && + ice_is_vsi_dflt_vsi(vsi->vsw, vsi)) + /* only attempt to free the default forwarding VSI if we + * are the owner + */ + ret = ice_clear_dflt_vsi(vsi->vsw); + + if (ret) { + dev_err(dev, "%sable VF %d as the default VSI failed, error %d\n", + set_dflt_vsi ? 
"en" : "dis", vf->vf_id, ret); + v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; + goto error_param; + } + } else { + u8 mcast_m, ucast_m; + + if (ice_vf_is_port_vlan_ena(vf) || + ice_vsi_has_non_zero_vlans(vsi)) { + mcast_m = ICE_MCAST_VLAN_PROMISC_BITS; + ucast_m = ICE_UCAST_VLAN_PROMISC_BITS; + } else { + mcast_m = ICE_MCAST_PROMISC_BITS; + ucast_m = ICE_UCAST_PROMISC_BITS; + } + + if (alluni) + ucast_err = ice_vf_set_vsi_promisc(vf, vsi, ucast_m); + else + ucast_err = ice_vf_clear_vsi_promisc(vf, vsi, ucast_m); + + if (allmulti) + mcast_err = ice_vf_set_vsi_promisc(vf, vsi, mcast_m); + else + mcast_err = ice_vf_clear_vsi_promisc(vf, vsi, mcast_m); + } + + if (!mcast_err) { + if (allmulti && + !test_and_set_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) + dev_info(dev, "VF %u successfully set multicast promiscuous mode\n", + vf->vf_id); + else if (!allmulti && test_and_clear_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) + dev_info(dev, "VF %u successfully unset multicast promiscuous mode\n", + vf->vf_id); + } + + if (!ucast_err) { + if (alluni && !test_and_set_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states)) + dev_info(dev, "VF %u successfully set unicast promiscuous mode\n", + vf->vf_id); + else if (!alluni && test_and_clear_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states)) + dev_info(dev, "VF %u successfully unset unicast promiscuous mode\n", + vf->vf_id); + } + +error_param: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE, + v_ret, NULL, 0); +} + +/** + * ice_vc_get_stats_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * called from the VF to get VSI stats + */ +static int ice_vc_get_stats_msg(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_queue_select *vqs = + (struct virtchnl_queue_select *)msg; + struct ice_eth_stats stats = { 0 }; + struct ice_vsi *vsi; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_isvalid_vsi_id(vf, vqs->vsi_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + ice_update_eth_stats(vsi); + + stats = vsi->eth_stats; + +error_param: + /* send the response to the VF */ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_STATS, v_ret, + (u8 *)&stats, sizeof(stats)); +} + +/** + * ice_vf_get_tc_based_qid - get the updated QID based on offset + * @qid: queue ID + * @offset : TC specific queue offset + * + * This function returns updated queueID based on offset. This is + * meant to be used only with VF ADQ. Queue ID will always be + * 0-based from the specified offset + */ +static u16 ice_vf_get_tc_based_qid(u16 qid, u16 offset) +{ + return (qid >= offset) ? (qid - offset) : qid; +} + +/** + * ice_vf_q_id_get_vsi_q_id + * @vf: pointer to the VF info + * @vf_q_id: VF relative queue ID + * @t_tc: traffic class for indexing the VSIs + * @vqs: the VFs virtual queue selection + * @vsi_p: pointer to VSI pointer, which changes based on TC for ADQ + * @vsi_id: VSI ID specific to desired queue ID + * @q_id: queue ID of the VSI + * + * provides ADQ queue enablement support by mapping the VF queue ID and TC to + * VSI ID and queue ID. call while iterating through VF queue IDs, VF VSIs and + * TCs. 
+ */ +static void ice_vf_q_id_get_vsi_q_id(struct ice_vf *vf, u16 vf_q_id, u16 *t_tc, + struct virtchnl_queue_select *vqs, + struct ice_vsi **vsi_p, u16 *vsi_id, + u16 *q_id) +{ + struct ice_vsi *vsi = *vsi_p; + u32 max_chnl_tc; + u16 tc = *t_tc; + + max_chnl_tc = ice_vc_get_max_chnl_tc_allowed(vf); + + /* Update the VSI and TC based on per TC queue region and offset */ + if (tc + 1U < max_chnl_tc && vf_q_id == vf->ch[tc + 1].offset && + tc < vf->num_tc && ice_is_vf_adq_ena(vf)) { + vsi = vf->pf->vsi[vf->ch[tc + 1].vsi_idx]; + tc++; + } + + /* Update vsi_id and queue_id based on TC if TC is VF ADQ TC, then + * use VF ADQ VSI otherwise main VF VSI + */ + if (tc >= ICE_VF_CHNL_START_TC && ice_is_vf_adq_ena(vf)) { + *vsi_id = vsi->vsi_num; + *q_id = ice_vf_get_tc_based_qid(vf_q_id, vf->ch[tc].offset); + } else { + *vsi_id = vqs->vsi_id; + *q_id = vf_q_id; + } + + *vsi_p = vsi; + *t_tc = tc; +} + +/** + * ice_vc_validate_vqs_bitmaps - validate Rx/Tx queue bitmaps from VIRTCHNL + * @vqs: virtchnl_queue_select structure containing bitmaps to validate + * + * Return true on successful validation, else false + */ +static bool ice_vc_validate_vqs_bitmaps(struct virtchnl_queue_select *vqs) +{ + if ((!vqs->rx_queues && !vqs->tx_queues) || + vqs->rx_queues >= BIT(ICE_MAX_DFLT_QS_PER_VF) || + vqs->tx_queues >= BIT(ICE_MAX_DFLT_QS_PER_VF)) + return false; + + return true; +} + +/** + * ice_vf_ena_txq_interrupt - enable Tx queue interrupt via QINT_TQCTL + * @vsi: VSI of the VF to configure + * @q_idx: VF queue index used to determine the queue in the PF's space + */ +static void ice_vf_ena_txq_interrupt(struct ice_vsi *vsi, u32 q_idx) +{ + struct ice_hw *hw = &vsi->back->hw; + u32 pfq = vsi->txq_map[q_idx]; + u32 reg; + + reg = rd32(hw, QINT_TQCTL(pfq)); + + /* MSI-X index 0 in the VF's space is always for the OICR, which means + * this is most likely a poll mode VF driver, so don't enable an + * interrupt that was never configured via VIRTCHNL_OP_CONFIG_IRQ_MAP + */ + if (!(reg & QINT_TQCTL_MSIX_INDX_M)) + return; + + wr32(hw, QINT_TQCTL(pfq), reg | QINT_TQCTL_CAUSE_ENA_M); +} + +/** + * ice_vf_ena_rxq_interrupt - enable Rx queue interrupt via QINT_RQCTL + * @vsi: VSI of the VF to configure + * @q_idx: VF queue index used to determine the queue in the PF's space + */ +static void ice_vf_ena_rxq_interrupt(struct ice_vsi *vsi, u32 q_idx) +{ + struct ice_hw *hw = &vsi->back->hw; + u32 pfq = vsi->rxq_map[q_idx]; + u32 reg; + + reg = rd32(hw, QINT_RQCTL(pfq)); + + /* MSI-X index 0 in the VF's space is always for the OICR, which means + * this is most likely a poll mode VF driver, so don't enable an + * interrupt that was never configured via VIRTCHNL_OP_CONFIG_IRQ_MAP + */ + if (!(reg & QINT_RQCTL_MSIX_INDX_M)) + return; + + wr32(hw, QINT_RQCTL(pfq), reg | QINT_RQCTL_CAUSE_ENA_M); +} + +/** + * ice_vf_vsi_ena_single_rxq - enable single Rx queue based on relative q_id + * @vf: VF to enable queue for + * @vsi: VSI for the VF + * @q_id: VSI relative (0-based) queue ID + * @vf_q_id: VF relative (0-based) queue ID + * + * Attempt to enable the Rx queue passed in. If the Rx queue was successfully enabled then set + * the vf_q_id bit in the enabled queues bitmap and return success. Otherwise return error.
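+ * If the queue is already marked as enabled in the VF's rxq_ena bitmap, 0 is + * returned without touching the hardware.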
+ */ +static int ice_vf_vsi_ena_single_rxq(struct ice_vf *vf, struct ice_vsi *vsi, u16 q_id, u16 vf_q_id) +{ + int err; + + if (test_bit(vf_q_id, vf->rxq_ena)) + return 0; + + err = ice_vsi_ctrl_one_rx_ring(vsi, true, q_id, true); + if (err) { + dev_err(ice_pf_to_dev(vsi->back), "Failed to enable Rx ring %d on VSI %d\n", + q_id, vsi->vsi_num); + return err; + } + + ice_vf_ena_rxq_interrupt(vsi, q_id); + set_bit(vf_q_id, vf->rxq_ena); + + return 0; +} + +/** + * ice_vf_vsi_ena_single_txq - enable single Tx queue based on relative q_id + * @vf: VF to enable queue for + * @vsi: VSI for the VF + * @q_id: VSI relative (0-based) queue ID + * @vf_q_id: VF relative (0-based) queue ID + * + * Enable the Tx queue's interrupt then set the q_id bit in the enabled queues bitmap. Note that the + * Tx queue(s) should have already been configurated/enabled in VIRTCHNL_OP_CONFIG_QUEUES so this + * function only enables the interrupt associated with the q_id. + */ +static void ice_vf_vsi_ena_single_txq(struct ice_vf *vf, struct ice_vsi *vsi, u16 q_id, u16 vf_q_id) +{ + if (test_bit(vf_q_id, vf->txq_ena)) + return; + + ice_vf_ena_txq_interrupt(vsi, q_id); + set_bit(vf_q_id, vf->txq_ena); +} + +/** + * ice_vc_ena_qs_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * called from the VF to enable all or specific queue(s) + */ +static int ice_vc_ena_qs_msg(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_queue_select *vqs = + (struct virtchnl_queue_select *)msg; + struct ice_vsi *vsi; + unsigned long q_map; + u16 vf_q_id = 0; + u16 tc = 0; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_isvalid_vsi_id(vf, vqs->vsi_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_validate_vqs_bitmaps(vqs)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + /* Enable only Rx rings, Tx rings were enabled by the FW when the + * Tx queue group list was configured and the context bits were + * programmed using ice_vsi_cfg_txqs + */ + q_map = vqs->rx_queues; + for_each_set_bit(vf_q_id, &q_map, ICE_MAX_DFLT_QS_PER_VF) { + u16 vsi_id, q_id; + + ice_vf_q_id_get_vsi_q_id(vf, vf_q_id, &tc, vqs, &vsi, + &vsi_id, &q_id); + if (ice_is_vf_adq_ena(vf) && tc >= ICE_VF_CHNL_START_TC) { + if (!ice_vf_adq_vsi_valid(vf, tc)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } + + if (!ice_vc_isvalid_q_id(vf, vsi_id, q_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (ice_vf_vsi_ena_single_rxq(vf, vsi, q_id, vf_q_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } + + tc = 0; + vsi = ice_get_vf_vsi(vf); + q_map = vqs->tx_queues; + for_each_set_bit(vf_q_id, &q_map, ICE_MAX_DFLT_QS_PER_VF) { + u16 vsi_id, q_id; + + ice_vf_q_id_get_vsi_q_id(vf, vf_q_id, &tc, vqs, &vsi, + &vsi_id, &q_id); + if (ice_is_vf_adq_ena(vf) && tc >= ICE_VF_CHNL_START_TC) { + if (!ice_vf_adq_vsi_valid(vf, tc)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } + + if (!ice_vc_isvalid_q_id(vf, vsi_id, q_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + ice_vf_vsi_ena_single_txq(vf, vsi, q_id, vf_q_id); + } + + /* Set flag to indicate that queues are enabled */ + if (v_ret == VIRTCHNL_STATUS_SUCCESS) { + set_bit(ICE_VF_STATE_QS_ENA, vf->vf_states); + if 
(vf->repr) + netif_carrier_on(vf->repr->netdev); + } + +error_param: + /* send the response to the VF */ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_QUEUES, v_ret, + NULL, 0); +} + +/** + * ice_vf_vsi_dis_single_txq - disable a single Tx queue for the VF based on relative queue ID + * @vf: VF to disable queue for + * @vsi: VSI for the VF + * @q_id: VSI relative (0-based) queue ID + * @vf_q_id: VF relative (0-based) queue ID + * + * Attempt to disable the Tx queue passed in. If the Tx queue was successfully disabled then clear + * q_id bit in the enabled queues bitmap and return success. Otherwise return error. + */ +static int ice_vf_vsi_dis_single_txq(struct ice_vf *vf, struct ice_vsi *vsi, u16 q_id, u16 vf_q_id) +{ + struct ice_txq_meta txq_meta = { 0 }; + struct ice_ring *ring; + int err; + + /* Skip queue if not enabled */ + if (!test_bit(vf_q_id, vf->txq_ena)) + return 0; + + ring = vsi->tx_rings[q_id]; + if (!ring) + return -EINVAL; + + ice_fill_txq_meta(vsi, ring, &txq_meta); + + err = ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, vf->vf_id, ring, &txq_meta); + if (err) { + dev_err(ice_pf_to_dev(vsi->back), "Failed to stop Tx ring %d on VSI %d\n", + q_id, vsi->vsi_num); + return err; + } + + /* Clear enabled queues flag */ + clear_bit(vf_q_id, vf->txq_ena); + + return 0; +} + +/** + * ice_vf_vsi_dis_single_rxq - disable a Rx queue for VF on relative queue ID + * @vf: VF to disable queue for + * @vsi: VSI for the VF + * @q_id: VSI relative (0-based) queue ID + * @vf_q_id: VF relative (0-based) queue ID + * + * Attempt to disable the Rx queue passed in. If the Rx queue was successfully + * disabled then clear q_id bit in the enabled queues bitmap and return success. + * Otherwise return error. + */ + +static int ice_vf_vsi_dis_single_rxq(struct ice_vf *vf, struct ice_vsi *vsi, u16 q_id, u16 vf_q_id) +{ + int err; + + if (!test_bit(vf_q_id, vf->rxq_ena)) + return 0; + + err = ice_vsi_ctrl_one_rx_ring(vsi, false, q_id, true); + if (err) { + dev_err(ice_pf_to_dev(vsi->back), "Failed to stop Rx ring %d on VSI %d\n", + q_id, vsi->vsi_num); + return err; + } + + /* Clear enabled queues flag */ + clear_bit(vf_q_id, vf->rxq_ena); + + return 0; +} + +/** + * ice_vc_dis_qs_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * called from the VF to disable all or specific + * queue(s) + */ +static int ice_vc_dis_qs_msg(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_queue_select *vqs = + (struct virtchnl_queue_select *)msg; + struct ice_vsi *vsi; + unsigned long q_map; + u16 vf_q_id = 0; + u16 tc = 0; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states) && + !test_bit(ICE_VF_STATE_QS_ENA, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_isvalid_vsi_id(vf, vqs->vsi_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_validate_vqs_bitmaps(vqs)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (vqs->tx_queues) { + q_map = vqs->tx_queues; + + for_each_set_bit(vf_q_id, &q_map, ICE_MAX_DFLT_QS_PER_VF) { + u16 vsi_id, q_id; + + ice_vf_q_id_get_vsi_q_id(vf, vf_q_id, &tc, vqs, &vsi, + &vsi_id, &q_id); + if (ice_is_vf_adq_ena(vf) && + tc >= ICE_VF_CHNL_START_TC) { + if (!ice_vf_adq_vsi_valid(vf, tc)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } + + if (!ice_vc_isvalid_q_id(vf, vsi_id, q_id)) { 
+ v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (ice_vf_vsi_dis_single_txq(vf, vsi, q_id, vf_q_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } + } + + q_map = vqs->rx_queues; + tc = 0; + /* Reset VSI pointer as it was assigned to ADQ VSIs */ + vsi = ice_get_vf_vsi(vf); + /* speed up Rx queue disable by batching them if possible */ + if (q_map && + bitmap_equal(&q_map, vf->rxq_ena, ICE_MAX_DFLT_QS_PER_VF)) { + if (ice_vsi_stop_all_rx_rings(vsi)) { + dev_err(ice_pf_to_dev(vsi->back), "Failed to stop all Rx rings on VSI %d\n", + vsi->vsi_num); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + if (ice_is_vf_adq_ena(vf)) { + for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) { + vsi = ice_get_vf_adq_vsi(vf, tc); + if (ice_vsi_stop_all_rx_rings(vsi)) { + dev_err(ice_pf_to_dev(vsi->back), + "Failed to stop all Rx rings on VF ADQ VSI %d\n", + vsi->vsi_num); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } + } + bitmap_zero(vf->rxq_ena, ICE_MAX_DFLT_QS_PER_VF); + } else if (q_map) { + for_each_set_bit(vf_q_id, &q_map, ICE_MAX_DFLT_QS_PER_VF) { + u16 vsi_id, q_id; + + ice_vf_q_id_get_vsi_q_id(vf, vf_q_id, &tc, vqs, &vsi, + &vsi_id, &q_id); + if (ice_is_vf_adq_ena(vf) && + tc >= ICE_VF_CHNL_START_TC) { + if (!ice_vf_adq_vsi_valid(vf, tc)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } + if (!ice_vc_isvalid_q_id(vf, vsi_id, q_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (ice_vf_vsi_dis_single_rxq(vf, vsi, q_id, vf_q_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } + } + + /* Clear enabled queues flag */ + if (v_ret == VIRTCHNL_STATUS_SUCCESS && ice_vf_has_no_qs_ena(vf)) { + clear_bit(ICE_VF_STATE_QS_ENA, vf->vf_states); + if (vf->repr) + netif_carrier_off(vf->repr->netdev); + } + +error_param: + /* send the response to the VF */ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_QUEUES, v_ret, + NULL, 0); +} + +/** + * ice_cfg_interrupt + * @vf: pointer to the VF info + * @vsi: the VSI being configured + * @vector_id: vector ID + * @tc: traffic class number for ADQ + * @map: vector map for mapping vectors to queues + * @q_vector: structure for interrupt vector + * configure the IRQ to queue map + */ +static int +ice_cfg_interrupt(struct ice_vf *vf, struct ice_vsi *vsi, u16 vector_id, + u8 __maybe_unused tc, struct virtchnl_vector_map *map, + struct ice_q_vector *q_vector) +{ + unsigned long qmap; + u16 vsi_q_id_idx; + + q_vector->num_ring_rx = 0; + q_vector->num_ring_tx = 0; + + qmap = map->rxq_map; + for_each_set_bit(vsi_q_id_idx, &qmap, ICE_MAX_DFLT_QS_PER_VF) { + u16 vsi_q_id = vsi_q_id_idx; + + if (tc && ice_is_vf_adq_ena(vf)) + vsi_q_id = ice_vf_get_tc_based_qid(vsi_q_id_idx, + vf->ch[tc].offset); + + if (!ice_vc_isvalid_q_id(vf, vsi->vsi_num, vsi_q_id)) + return VIRTCHNL_STATUS_ERR_PARAM; + + q_vector->num_ring_rx++; + q_vector->rx.itr_idx = map->rxitr_idx; + vsi->rx_rings[vsi_q_id]->q_vector = q_vector; + ice_cfg_rxq_interrupt(vsi, vsi_q_id, vector_id, + q_vector->rx.itr_idx); + } + + qmap = map->txq_map; + for_each_set_bit(vsi_q_id_idx, &qmap, ICE_MAX_DFLT_QS_PER_VF) { + u16 vsi_q_id = vsi_q_id_idx; + + if (tc && ice_is_vf_adq_ena(vf)) + vsi_q_id = ice_vf_get_tc_based_qid(vsi_q_id_idx, + vf->ch[tc].offset); + + if (!ice_vc_isvalid_q_id(vf, vsi->vsi_num, vsi_q_id)) + return VIRTCHNL_STATUS_ERR_PARAM; + + q_vector->num_ring_tx++; + q_vector->tx.itr_idx = map->txitr_idx; + vsi->tx_rings[vsi_q_id]->q_vector = q_vector; + 
ice_cfg_txq_interrupt(vsi, vsi_q_id, vector_id, + q_vector->tx.itr_idx); + } + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_vc_cfg_irq_map_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * called from the VF to configure the IRQ to queue map + */ +static int ice_vc_cfg_irq_map_msg(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_irq_map_info *irqmap_info; + struct virtchnl_vector_map *map; + struct ice_pf *pf = vf->pf; + u16 num_q_vectors_mapped; + struct ice_vsi *vsi; + u16 vector_id_ch; + u16 tc = 0; + int i; + + irqmap_info = (struct virtchnl_irq_map_info *)msg; + num_q_vectors_mapped = irqmap_info->num_vectors; + + /* Check to make sure number of VF vectors mapped is not greater than + * number of VF vectors originally allocated, and check that + * there is actually at least a single VF queue vector mapped + */ + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states) || + pf->num_msix_per_vf < num_q_vectors_mapped || + !num_q_vectors_mapped) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + for (i = 0; i < num_q_vectors_mapped; i++) { + struct ice_q_vector *q_vector; + u16 vsi_id, vector_id; + + map = &irqmap_info->vecmap[i]; + + vector_id = map->vector_id; + vsi_id = map->vsi_id; + if (ice_is_vf_adq_ena(vf) && tc >= ICE_VF_CHNL_START_TC) { + if (!ice_vf_adq_vsi_valid(vf, tc)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + vsi_id = vsi->vsi_num; + } + /* vector_id is always 0-based for each VF, and can never be + * larger than or equal to the max allowed interrupts per VF + */ + if (!(vector_id < pf->num_msix_per_vf) || + !ice_vc_isvalid_vsi_id(vf, vsi_id) || + (!vector_id && (map->rxq_map || map->txq_map))) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + /* No need to map VF miscellaneous or rogue vector */ + if (!vector_id) + continue; + + /* Subtract non queue vector from vector_id passed by VF + * to get actual number of VSI queue vector array index + */ + if (tc && ice_is_vf_adq_ena(vf)) + vector_id_ch = vector_id - vf->ch[tc].offset; + else + vector_id_ch = vector_id; + + /* if ADQ enablement failed, the main VF VSI could have been + * reconfigured (based on TC0 information - means main + * VF VSI queues and vectors are equal to TC0.num_qps and + * not equal to "num_q_vectors" which is part of + * irq_cfg virtchnl message) so prevent using invalid vector ID + */ + if ((vector_id_ch - ICE_NONQ_VECS_VF) >= vsi->num_q_vectors) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + q_vector = vsi->q_vectors[vector_id_ch - ICE_NONQ_VECS_VF]; + if (!q_vector) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + /* lookout for the invalid queue index */ + + v_ret = (enum virtchnl_status_code) + ice_cfg_interrupt(vf, vsi, vector_id, tc, map, + q_vector); + if (v_ret) + goto error_param; + + /* Update VSI and TC only when ADQ is configured */ + if (ice_is_vf_adq_ena(vf) && + vector_id == vf->ch[tc + 1].offset) { + vsi = pf->vsi[vf->ch[tc + 1].vsi_idx]; + tc++; + } + } + +error_param: + /* send the response to the VF */ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_IRQ_MAP, v_ret, + NULL, 0); +} + +/** + * ice_vc_get_max_allowed_qpairs - get max allowed queue pairs based on negotiated capabilities + * @vf: VF used to get max queue pairs allowed + * + * The maximum allowed queues is determined based on whether 
VIRTCHNL_VF_LARGE_NUM_QPAIRS was + * negotiated. + */ +static int ice_vc_get_max_allowed_qpairs(struct ice_vf *vf) +{ + if (vf->driver_caps & VIRTCHNL_VF_LARGE_NUM_QPAIRS) + return ICE_MAX_QS_PER_VF; + + return ICE_MAX_DFLT_QS_PER_VF; +} + +/** + * ice_vc_cfg_qs_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * called from the VF to configure the Rx/Tx queues + */ +static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_vsi_queue_config_info *qci = + (struct virtchnl_vsi_queue_config_info *)msg; + struct virtchnl_queue_pair_info *qpi; + struct ice_pf *pf = vf->pf; + struct ice_vsi *vsi; + u16 queue_id_tmp, tc; + int i, q_idx; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_isvalid_vsi_id(vf, qci->vsi_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + /* check for number of queues is done in ice_alloc_vf_res() function + * for ADQ + */ + if (ice_is_vf_adq_ena(vf)) + goto skip_num_queues_check; + + if (qci->num_queue_pairs > ice_vc_get_max_allowed_qpairs(vf) || + qci->num_queue_pairs > min_t(u16, vsi->alloc_txq, vsi->alloc_rxq)) { + dev_err(ice_pf_to_dev(pf), "VF-%d trying to configure more than allocated number of queues: %d\n", + vf->vf_id, min_t(u16, vsi->alloc_txq, vsi->alloc_rxq)); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + +skip_num_queues_check: + queue_id_tmp = 0; + tc = 0; + for (i = 0; i < qci->num_queue_pairs; i++) { + if (!qci->qpair[i].rxq.crc_disable) + continue; + + if (!(vf->driver_caps & VIRTCHNL_VF_OFFLOAD_CRC) || + vf->dcf_vlan_info.outer_stripping_ena || + vf->vlan_strip_ena) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } + for (i = 0; i < qci->num_queue_pairs; i++) { + qpi = &qci->qpair[i]; + if (ice_is_vf_adq_ena(vf)) + goto skip_non_adq_checks; + + if (qpi->txq.vsi_id != qci->vsi_id || + qpi->rxq.vsi_id != qci->vsi_id || + qpi->rxq.queue_id != qpi->txq.queue_id || + qpi->txq.headwb_enabled || + !ice_vc_isvalid_ring_len(qpi->txq.ring_len) || + !ice_vc_isvalid_ring_len(qpi->rxq.ring_len) || + !ice_vc_isvalid_q_id(vf, qci->vsi_id, + qpi->txq.queue_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + +skip_non_adq_checks: + if (ice_is_vf_adq_ena(vf)) { + q_idx = queue_id_tmp; + vsi = ice_find_vsi_from_id(vf->pf, vf->ch[tc].vsi_num); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } else { + q_idx = qpi->rxq.queue_id; + } + + /* make sure selected "q_idx" is in valid range of queues + * for selected "vsi" (which could be main VF VSI or + * VF ADQ VSI + */ + if (q_idx >= vsi->alloc_txq || q_idx >= vsi->alloc_rxq) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + /* copy Tx queue info from VF into VSI */ + if (qpi->txq.ring_len > 0) { + vsi->tx_rings[q_idx]->dma = qpi->txq.dma_ring_addr; + vsi->tx_rings[q_idx]->count = qpi->txq.ring_len; + if (ice_vsi_cfg_single_txq(vsi, vsi->tx_rings, q_idx)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } + + /* copy Rx queue info from VF into VSI */ + if (qpi->rxq.ring_len > 0) { + u16 max_frame_size = ice_vc_get_max_frame_size(vf); + u32 rxdid; + + vsi->rx_rings[q_idx]->dma = qpi->rxq.dma_ring_addr; + vsi->rx_rings[q_idx]->count = qpi->rxq.ring_len; + + vsi->rx_rings[q_idx]->rx_crc_strip_dis = 
qpi->rxq.crc_disable; + + if (qpi->rxq.databuffer_size != 0 && + (qpi->rxq.databuffer_size > ((16 * 1024) - 128) || + qpi->rxq.databuffer_size < 1024)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + vsi->rx_buf_len = qpi->rxq.databuffer_size; + vsi->rx_rings[q_idx]->rx_buf_len = vsi->rx_buf_len; + if (qpi->rxq.max_pkt_size > max_frame_size || + qpi->rxq.max_pkt_size < 64) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi->max_frame = qpi->rxq.max_pkt_size; + /* add space for the port VLAN since the VF driver is not + * expected to account for it in the MTU calculation + */ + if (ice_vf_is_port_vlan_ena(vf)) + vsi->max_frame += VLAN_HLEN; + + if (ice_vsi_cfg_single_rxq(vsi, q_idx)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + /* If Rx flex desc is supported, select RXDID for Rx queues. + * Otherwise, use legacy 32byte descriptor format. + * Legacy 16byte descriptor is not supported. If this RXDID + * is selected, return error. + */ + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) { + rxdid = qpi->rxq.rxdid; + if (!(BIT(rxdid) & pf->supported_rxdids)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } else { + rxdid = ICE_RXDID_LEGACY_1; + } + + ice_write_qrxflxp_cntxt(&vsi->back->hw, vsi->rxq_map[q_idx], rxdid, 0x03, + false); + } + + /* For ADQ there can be up to 4 VSIs with max 4 queues each. + * VF does not know about these additional VSIs and all + * it cares is about its own queues. PF configures these queues + * to its appropriate VSIs based on TC mapping + */ + if (ice_is_vf_adq_ena(vf)) { + if (queue_id_tmp == (vf->ch[tc].num_qps - 1)) { + tc++; + /* reset the queue num */ + queue_id_tmp = 0; + } else { + queue_id_tmp++; + } + } + } + +error_param: + /* send the response to the VF */ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_VSI_QUEUES, v_ret, + NULL, 0); +} + +/** + * ice_is_vf_trusted + * @vf: pointer to the VF info + */ +static bool ice_is_vf_trusted(struct ice_vf *vf) +{ + return test_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); +} + +/** + * ice_can_vf_change_mac + * @vf: pointer to the VF info + * + * Return true if the VF is allowed to change its MAC filters, false otherwise + */ +static bool ice_can_vf_change_mac(struct ice_vf *vf) +{ + /* If the VF MAC address has been set administratively (via the + * ndo_set_vf_mac command), then deny permission to the VF to + * add/delete unicast MAC addresses, unless the VF is trusted + */ + if (vf->pf_set_mac && !ice_is_vf_trusted(vf)) + return false; + + return true; +} + +/** + * ice_vc_ether_addr_type - get type of virtchnl_ether_addr + * @vc_ether_addr: used to extract the type + */ +static u8 +ice_vc_ether_addr_type(struct virtchnl_ether_addr *vc_ether_addr) +{ + return (vc_ether_addr->type & VIRTCHNL_ETHER_ADDR_TYPE_MASK); +} + +/** + * ice_is_vc_addr_legacy - check if the MAC address is from an older VF + * @vc_ether_addr: VIRTCHNL structure that contains MAC and type + */ +static bool +ice_is_vc_addr_legacy(struct virtchnl_ether_addr __maybe_unused *vc_ether_addr) +{ + u8 type = ice_vc_ether_addr_type(vc_ether_addr); + + return (type == VIRTCHNL_ETHER_ADDR_LEGACY); +} + +/** + * ice_is_vc_addr_primary - check if the MAC address is the VF's primary MAC + * @vc_ether_addr: VIRTCHNL structure that contains MAC and type + * + * This function should only be called when the MAC address in + * virtchnl_ether_addr is a valid unicast MAC + */ +static bool +ice_is_vc_addr_primary(struct virtchnl_ether_addr __maybe_unused *vc_ether_addr) 
+{ + u8 type = ice_vc_ether_addr_type(vc_ether_addr); + + return (type == VIRTCHNL_ETHER_ADDR_PRIMARY); +} + +/** + * ice_vfhw_mac_add - update the VF's cached hardware MAC if allowed + * @vf: VF to update + * @vc_ether_addr: structure from VIRTCHNL with MAC to add + */ +static void +ice_vfhw_mac_add(struct ice_vf *vf, struct virtchnl_ether_addr *vc_ether_addr) +{ + u8 *mac_addr = vc_ether_addr->addr; + + if (!is_valid_ether_addr(mac_addr)) + return; + + /* only allow legacy VF drivers to set the device and hardware MAC if it + * is zero and allow new VF drivers to set the hardware MAC if the type + * was correctly specified over VIRTCHNL + */ + if ((ice_is_vc_addr_legacy(vc_ether_addr) && + is_zero_ether_addr(vf->hw_lan_addr.addr)) || + ice_is_vc_addr_primary(vc_ether_addr)) { + ether_addr_copy(vf->dev_lan_addr.addr, mac_addr); + ether_addr_copy(vf->hw_lan_addr.addr, mac_addr); + } + + /* hardware and device MACs are already set, but its possible that the + * VF driver sent the VIRTCHNL_OP_ADD_ETH_ADDR message before the + * VIRTCHNL_OP_DEL_ETH_ADDR when trying to update its MAC, so save it + * away for the legacy VF driver case as it will be updated in the + * delete flow for this case + */ + if (ice_is_vc_addr_legacy(vc_ether_addr)) { + ether_addr_copy(vf->legacy_last_added_umac.addr, + mac_addr); + vf->legacy_last_added_umac.time_modified = jiffies; + } +} + +/** + * ice_vc_add_mac_addr - attempt to add the MAC address passed in + * @vf: pointer to the VF info + * @vsi: pointer to the VF's VSI + * @vc_ether_addr: VIRTCHNL MAC address structure used to add MAC + */ +static int +ice_vc_add_mac_addr(struct ice_vf *vf, struct ice_vsi *vsi, + struct virtchnl_ether_addr *vc_ether_addr) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + u8 *mac_addr = vc_ether_addr->addr; + enum ice_status status; + int ret = 0; + + /* device MAC already added */ + if (!ether_addr_equal(mac_addr, vf->dev_lan_addr.addr)) { + if (is_unicast_ether_addr(mac_addr) && + !ice_can_vf_change_mac(vf)) { + dev_err(dev, "VF attempting to override administratively set MAC address, bring down and up the VF interface to resume normal operation\n"); + return -EPERM; + } + + status = ice_fltr_add_mac(vsi, mac_addr, ICE_FWD_TO_VSI); + if (status == ICE_ERR_ALREADY_EXISTS) { + dev_dbg(dev, "MAC %pM already exists for VF %d\n", mac_addr, + vf->vf_id); + /* don’t return since we might need to update + * the primary MAC in ice_vfhw_mac_add() below + */ + ret = -EEXIST; + } else if (status) { + dev_err(dev, "Failed to add MAC %pM for VF %d\n, error %s\n", mac_addr, + vf->vf_id, ice_stat_str(status)); + return -EIO; + } else { + vf->num_mac++; + } + + ice_vfhw_mac_add(vf, vc_ether_addr); + } + + return ret; +} + +/** + * ice_is_legacy_umac_expired - check if last added legacy unicast MAC expired + * @last_added_umac: structure used to check expiration + */ +static bool ice_is_legacy_umac_expired(struct ice_time_mac *last_added_umac) +{ +#define ICE_LEGACY_VF_MAC_CHANGE_EXPIRE_TIME msecs_to_jiffies(3000) + return time_is_before_jiffies(last_added_umac->time_modified + + ICE_LEGACY_VF_MAC_CHANGE_EXPIRE_TIME); +} + +/** + * ice_vfhw_mac_del - update the VF's cached hardware MAC if allowed + * @vf: VF to update + * @vc_ether_addr: structure from VIRTCHNL with MAC to delete + */ +static void +ice_vfhw_mac_del(struct ice_vf *vf, struct virtchnl_ether_addr *vc_ether_addr) +{ + u8 *mac_addr = vc_ether_addr->addr; + + if (!is_valid_ether_addr(mac_addr) || + !ether_addr_equal(vf->dev_lan_addr.addr, mac_addr)) + return; + + /* allow the device 
MAC to be repopulated in the add flow and don't + * clear the hardware MAC (i.e. hw_lan_addr.addr) here as that is meant + * to be persistent on VM reboot and across driver unload/load, which + * won't work if we clear the hardware MAC here + */ + eth_zero_addr(vf->dev_lan_addr.addr); + + /* only update cached hardware MAC for legacy VF drivers on delete + * because we cannot guarantee order/type of MAC from the VF driver + */ + if (ice_is_vc_addr_legacy(vc_ether_addr) && + !ice_is_legacy_umac_expired(&vf->legacy_last_added_umac)) { + ether_addr_copy(vf->dev_lan_addr.addr, + vf->legacy_last_added_umac.addr); + ether_addr_copy(vf->hw_lan_addr.addr, + vf->legacy_last_added_umac.addr); + } +} + +/** + * ice_vc_del_mac_addr - attempt to delete the MAC address passed in + * @vf: pointer to the VF info + * @vsi: pointer to the VF's VSI + * @vc_ether_addr: VIRTCHNL MAC address structure used to delete MAC + */ +static int +ice_vc_del_mac_addr(struct ice_vf *vf, struct ice_vsi *vsi, + struct virtchnl_ether_addr *vc_ether_addr) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + u8 *mac_addr = vc_ether_addr->addr; + enum ice_status status; + + if (!ice_can_vf_change_mac(vf) && + ether_addr_equal(vf->dev_lan_addr.addr, mac_addr)) + return 0; + + status = ice_fltr_remove_mac(vsi, mac_addr, ICE_FWD_TO_VSI); + if (status == ICE_ERR_DOES_NOT_EXIST) { + dev_err(dev, "MAC %pM does not exist for VF %d\n", mac_addr, + vf->vf_id); + return -ENOENT; + } else if (status) { + dev_err(dev, "Failed to delete MAC %pM for VF %d, error %s\n", + mac_addr, vf->vf_id, ice_stat_str(status)); + return -EIO; + } + + + ice_vfhw_mac_del(vf, vc_ether_addr); + + vf->num_mac--; + + return 0; +} + +/** + * ice_vc_handle_mac_addr_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @set: true if MAC filters are being set, false otherwise + * + * add guest MAC address filter + */ +static int +ice_vc_handle_mac_addr_msg(struct ice_vf *vf, u8 *msg, bool set) +{ + int (*ice_vc_cfg_mac) + (struct ice_vf *vf, struct ice_vsi *vsi, + struct virtchnl_ether_addr *virtchnl_ether_addr); + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_ether_addr_list *al = + (struct virtchnl_ether_addr_list *)msg; + struct ice_pf *pf = vf->pf; + enum virtchnl_ops vc_op; + struct ice_vsi *vsi; + int i; + + if (set) { + vc_op = VIRTCHNL_OP_ADD_ETH_ADDR; + ice_vc_cfg_mac = ice_vc_add_mac_addr; + } else { + vc_op = VIRTCHNL_OP_DEL_ETH_ADDR; + ice_vc_cfg_mac = ice_vc_del_mac_addr; + } + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states) || + !ice_vc_isvalid_vsi_id(vf, al->vsi_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto handle_mac_exit; + } + + /* If this VF is not privileged, then we can't add more than a + * limited number of addresses. Check to make sure that the + * additions do not push us over the limit. 
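+ * The limit for an untrusted VF is ICE_MAX_MACADDR_PER_VF filters.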
+ */ + if (set && !ice_is_vf_trusted(vf) && + (vf->num_mac + al->num_elements) > ICE_MAX_MACADDR_PER_VF) { + dev_err(ice_pf_to_dev(pf), "Can't add more MAC addresses, because VF-%d is not trusted, switch the VF to trusted mode in order to add more functionalities\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto handle_mac_exit; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto handle_mac_exit; + } + + for (i = 0; i < al->num_elements; i++) { + u8 *mac_addr = al->list[i].addr; + int result; + + if (is_broadcast_ether_addr(mac_addr) || + is_zero_ether_addr(mac_addr)) + continue; + + result = ice_vc_cfg_mac(vf, vsi, &al->list[i]); + if (result == -EEXIST || result == -ENOENT) { + continue; + } else if (result) { + v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; + goto handle_mac_exit; + } + } + +handle_mac_exit: + /* send the response to the VF */ + return ice_vc_send_msg_to_vf(vf, vc_op, v_ret, NULL, 0); +} + +/** + * ice_vc_add_mac_addr_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * add guest MAC address filter + */ +static int ice_vc_add_mac_addr_msg(struct ice_vf *vf, u8 *msg) +{ + return ice_vc_handle_mac_addr_msg(vf, msg, true); +} + +/** + * ice_vc_del_mac_addr_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * remove guest MAC address filter + */ +static int ice_vc_del_mac_addr_msg(struct ice_vf *vf, u8 *msg) +{ + return ice_vc_handle_mac_addr_msg(vf, msg, false); +} + +/** + * ice_vc_request_qs_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * VFs get a default number of queues but can use this message to request a + * different number. If the request is successful, PF will reset the VF and + * return 0. If unsuccessful, PF will send message informing VF of number of + * available queue pairs via virtchnl message response to VF. + */ +static int ice_vc_request_qs_msg(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_vf_res_request *vfres = + (struct virtchnl_vf_res_request *)msg; + u16 max_avail_vf_qps, max_allowed_vf_qps; + u16 req_queues = vfres->num_queue_pairs; + struct ice_pf *pf = vf->pf; + u16 tx_rx_queue_left; + struct device *dev; + u16 cur_queues; + + dev = ice_pf_to_dev(pf); + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + max_allowed_vf_qps = ice_vc_get_max_allowed_qpairs(vf); + + cur_queues = vf->num_vf_qs; + tx_rx_queue_left = min_t(u16, ice_get_avail_txq_count(pf), + ice_get_avail_rxq_count(pf)); + max_avail_vf_qps = tx_rx_queue_left + cur_queues; + if (!req_queues) { + dev_err(dev, "VF %d tried to request 0 queues. 
Ignoring.\n", + vf->vf_id); + } else if (req_queues > max_allowed_vf_qps) { + dev_err(dev, "VF %d tried to request more than %d queues.\n", + vf->vf_id, max_allowed_vf_qps); + vfres->num_queue_pairs = max_allowed_vf_qps; + } else if (req_queues > cur_queues && + req_queues - cur_queues > tx_rx_queue_left) { + dev_warn(dev, "VF %d requested %u more queues, but only %u left.\n", + vf->vf_id, req_queues - cur_queues, tx_rx_queue_left); + vfres->num_queue_pairs = min_t(u16, max_avail_vf_qps, max_allowed_vf_qps); + } else { + /* request is successful, then reset VF */ + vf->num_req_qs = req_queues; + ice_vc_reset_vf(vf); + dev_info(dev, "VF %d granted request of %u queues.\n", + vf->vf_id, req_queues); + return 0; + } + +error_param: + /* send the response to the VF */ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_REQUEST_QUEUES, + v_ret, (u8 *)vfres, sizeof(*vfres)); +} + +/** + * ice_is_supported_port_vlan_proto - make sure the vlan_proto is supported + * @hw: hardware structure used to check the VLAN mode + * @vlan_proto: VLAN TPID being checked + * + * If the device is configured in Double VLAN Mode (DVM), then both ETH_P_8021Q + * and ETH_P_8021AD are supported. If the device is configured in Single VLAN + * Mode (SVM), then only ETH_P_8021Q is supported. + */ +static bool +ice_is_supported_port_vlan_proto(struct ice_hw *hw, u16 vlan_proto) +{ + bool is_supported = false; + + switch (vlan_proto) { + case ETH_P_8021Q: + is_supported = true; + break; + case ETH_P_8021AD: + if (ice_is_dvm_ena(hw)) + is_supported = true; + break; + } + + return is_supported; +} + +#ifdef IFLA_VF_VLAN_INFO_MAX +/** + * ice_set_vf_port_vlan + * @netdev: network interface device structure + * @vf_id: VF identifier + * @vlan_id: VLAN ID being set + * @qos: priority setting + * @vlan_proto: VLAN protocol + * + * program VF Port VLAN ID and/or QoS + */ +int +ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos, + __be16 vlan_proto) +#else +int +ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos) +#endif /* IFLA_VF_VLAN_INFO_MAX */ +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); +#ifdef IFLA_VF_VLAN_INFO_MAX + u16 local_vlan_proto = ntohs(vlan_proto); +#else + u16 local_vlan_proto = ETH_P_8021Q; +#endif + struct device *dev; + struct ice_vf *vf; + int ret; + + dev = ice_pf_to_dev(pf); + if (ice_validate_vf_id(pf, vf_id)) + return -EINVAL; + + if (vlan_id >= VLAN_N_VID || qos > 7) { + dev_err(dev, "Invalid Port VLAN parameters for VF %d, ID %d, QoS %d\n", + vf_id, vlan_id, qos); + return -EINVAL; + } + + if (!ice_is_supported_port_vlan_proto(&pf->hw, local_vlan_proto)) { + dev_err(dev, "VF VLAN protocol 0x%04x is not supported\n", + local_vlan_proto); + return -EPROTONOSUPPORT; + } + + vf = &pf->vf[vf_id]; + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; + + if (ice_vf_get_port_vlan_prio(vf) == qos && + ice_vf_get_port_vlan_tpid(vf) == local_vlan_proto && + ice_vf_get_port_vlan_id(vf) == vlan_id) { + /* duplicate request, so just return success */ + dev_dbg(dev, "Duplicate port VLAN %u, QoS %u, TPID 0x%04x request\n", + vlan_id, qos, local_vlan_proto); + return 0; + } + + vf->port_vlan_info = + ICE_VLAN(local_vlan_proto, vlan_id, qos, ICE_FWD_TO_VSI); + if (ice_vf_is_port_vlan_ena(vf)) + dev_info(dev, "Setting VLAN %u, QoS %u, TPID 0x%04x on VF %d\n", + vlan_id, qos, local_vlan_proto, vf_id); + else + dev_info(dev, "Clearing port VLAN on VF %d\n", vf_id); + + ice_vc_reset_vf(vf); + + return 0; +} + +/** + * ice_vf_vlan_offload_ena - determine if 
capabilities support VLAN offloads + * @caps: VF driver negotiated capabilities + * + * Return true if VIRTCHNL_VF_OFFLOAD_VLAN capability is set, else return false + */ +static bool ice_vf_vlan_offload_ena(u32 caps) +{ + return !!(caps & VIRTCHNL_VF_OFFLOAD_VLAN); +} + +/** + * ice_is_vlan_promisc_allowed - check if VLAN promiscuous config is allowed + * @vf: VF used to determine if VLAN promiscuous config is allowed + */ +static bool ice_is_vlan_promisc_allowed(struct ice_vf *vf) +{ + if ((test_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states) || + test_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) && + test_bit(ICE_FLAG_VF_TRUE_PROMISC_ENA, vf->pf->flags)) + return true; + + return false; +} + +/** + * ice_vf_ena_vlan_promisc - Enable Tx/Rx VLAN promiscuous for the VLAN + * @vsi: VF's VSI used to enable VLAN promiscuous mode + * @vlan: VLAN used to enable VLAN promiscuous + * + * This function should only be called if VLAN promiscuous mode is allowed, + * which can be determined via ice_is_vlan_promisc_allowed(). + */ +static int ice_vf_ena_vlan_promisc(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + u8 promisc_m = ICE_PROMISC_VLAN_TX | ICE_PROMISC_VLAN_RX; + enum ice_status status; + + status = ice_fltr_set_vsi_promisc(&vsi->back->hw, vsi->idx, promisc_m, + vlan->vid, vsi->port_info->lport); + if (status && status != ICE_ERR_ALREADY_EXISTS) + return ice_status_to_errno(status); + + return 0; +} + +/** + * ice_vf_dis_vlan_promisc - Disable Tx/Rx VLAN promiscuous for the VLAN + * @vsi: VF's VSI used to disable VLAN promiscuous mode for + * @vlan: VLAN used to disable VLAN promiscuous + * + * This function should only be called if VLAN promiscuous mode is allowed, + * which can be determined via ice_is_vlan_promisc_allowed(). + */ +static int ice_vf_dis_vlan_promisc(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + u8 promisc_m = ICE_PROMISC_VLAN_TX | ICE_PROMISC_VLAN_RX; + enum ice_status status; + + status = ice_fltr_clear_vsi_promisc(&vsi->back->hw, vsi->idx, promisc_m, + vlan->vid, vsi->port_info->lport); + if (status && status != ICE_ERR_DOES_NOT_EXIST) + return ice_status_to_errno(status); + + return 0; +} + +/** + * ice_vf_has_max_vlans - check if VF already has the max allowed VLAN filters + * @vf: VF to check against + * @vsi: VF's VSI + * + * If the VF is trusted then the VF is allowed to add as many VLANs as it + * wants to, so return false. + * + * When the VF is untrusted compare the number of non-zero VLANs + 1 to the max + * allowed VLANs for an untrusted VF. Return the result of this comparison. 
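As a worked example of the comparison described above, the standalone sketch below reproduces the arithmetic with an assumed cap of 8 VLAN filters per untrusted VF; the real ICE_MAX_VLAN_PER_VF value comes from the driver headers and may differ.

#include <stdbool.h>
#include <stdio.h>

/* Illustration only: assume an untrusted VF may own at most 8 VLAN filters. */
#define MAX_VLAN_PER_VF        8
#define ADDED_VLAN_ZERO_FLTRS  1   /* VLAN 0 is always programmed for the VF */

/* Returns true when the VF is already at its VLAN filter cap. */
static bool vf_has_max_vlans(bool trusted, int num_non_zero_vlans)
{
        if (trusted)
                return false;   /* trusted VFs are not capped */

        return (num_non_zero_vlans + ADDED_VLAN_ZERO_FLTRS) >= MAX_VLAN_PER_VF;
}

int main(void)
{
        /* With a cap of 8 and the implicit VLAN 0 filter, an untrusted VF can
         * end up with at most 7 non-zero VLANs: once it holds 7, the check
         * reports "at cap" and the next add request is rejected.
         */
        for (int n = 5; n <= 8; n++)
                printf("non-zero VLANs=%d -> at cap: %s\n", n,
                       vf_has_max_vlans(false, n) ? "yes" : "no");
        return 0;
}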
+ */ +static bool ice_vf_has_max_vlans(struct ice_vf *vf, struct ice_vsi *vsi) +{ + if (ice_is_vf_trusted(vf)) + return false; + +#define ICE_VF_ADDED_VLAN_ZERO_FLTRS 1 + return ((ice_vsi_num_non_zero_vlans(vsi) + + ICE_VF_ADDED_VLAN_ZERO_FLTRS) >= ICE_MAX_VLAN_PER_VF); +} + +/** + * ice_vc_process_vlan_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @add_v: Add VLAN if true, otherwise delete VLAN + * + * Process virtchnl op to add or remove programmed guest VLAN ID + */ +static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_vlan_filter_list *vfl = + (struct virtchnl_vlan_filter_list *)msg; + struct ice_pf *pf = vf->pf; + bool vlan_promisc = false; + struct ice_vsi *vsi; + struct device *dev; + int status = 0; + int i; + + dev = ice_pf_to_dev(pf); + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vf_vlan_offload_ena(vf->driver_caps)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_isvalid_vsi_id(vf, vfl->vsi_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + for (i = 0; i < vfl->num_elements; i++) { + if (vfl->vlan_id[i] >= VLAN_N_VID) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + dev_err(dev, "invalid VF VLAN id %d\n", + vfl->vlan_id[i]); + goto error_param; + } + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (add_v && ice_vf_has_max_vlans(vf, vsi)) { + dev_info(dev, "VF-%d is not trusted, switch the VF to trusted mode, in order to add more VLAN addresses\n", + vf->vf_id); + /* There is no need to let VF know about being not trusted, + * so we can just return success message here + */ + goto error_param; + } + + /* in DVM a VF can add/delete inner VLAN filters when + * VIRTCHNL_VF_OFFLOAD_VLAN is negotiated, so only reject in SVM + */ + if (ice_vf_is_port_vlan_ena(vf) && !ice_is_dvm_ena(&pf->hw)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + /* in DVM VLAN promiscuous is based on the outer VLAN, which would be + * the port VLAN if VIRTCHNL_VF_OFFLOAD_VLAN was negotiated, so only + * allow vlan_promisc = true in SVM and if no port VLAN is configured + */ + vlan_promisc = ice_is_vlan_promisc_allowed(vf) && + !ice_is_dvm_ena(&pf->hw) && + !ice_vf_is_port_vlan_ena(vf); + + if (add_v) { + for (i = 0; i < vfl->num_elements; i++) { + u16 vid = vfl->vlan_id[i]; + struct ice_vlan vlan; + + if (ice_vf_has_max_vlans(vf, vsi)) { + dev_info(dev, "VF-%d is not trusted, switch the VF to trusted mode, in order to add more VLAN addresses\n", + vf->vf_id); + /* There is no need to let VF know about being + * not trusted, so we can just return success + * message here as well. 
+ */ + goto error_param; + } + + /* we add VLAN 0 by default for each VF so we can enable + * Tx VLAN anti-spoof without triggering MDD events so + * we don't need to add it again here + */ + if (!vid) + continue; + + vlan = ICE_VLAN(ETH_P_8021Q, vid, 0, ICE_FWD_TO_VSI); + status = vsi->inner_vlan_ops.add_vlan(vsi, &vlan); + if (status) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + /* Enable VLAN filtering on first non-zero VLAN */ + if (!vlan_promisc && vid && !ice_is_dvm_ena(&pf->hw)) { + if (vsi->inner_vlan_ops.ena_rx_filtering(vsi)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + dev_err(dev, "Enable VLAN pruning on VLAN ID: %d failed error-%d\n", + vid, status); + goto error_param; + } + } else if (vlan_promisc) { + status = ice_vf_ena_vlan_promisc(vsi, &vlan); + if (status) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + dev_err(dev, "Enable Unicast/multicast promiscuous mode on VLAN ID:%d failed error-%d\n", + vid, status); + } + } + } + } else { + /* In case of non_trusted VF, number of VLAN elements passed + * to PF for removal might be greater than number of VLANs + * filter programmed for that VF - So, use actual number of + * VLANS added earlier with add VLAN opcode. In order to avoid + * removing VLAN that doesn't exist, which result to sending + * erroneous failed message back to the VF + */ + int num_vf_vlan; + + num_vf_vlan = vsi->num_vlan; + for (i = 0; i < vfl->num_elements && i < num_vf_vlan; i++) { + u16 vid = vfl->vlan_id[i]; + struct ice_vlan vlan; + + /* we add VLAN 0 by default for each VF so we can enable + * Tx VLAN anti-spoof without triggering MDD events so + * we don't want a VIRTCHNL request to remove it + */ + if (!vid) + continue; + + vlan = ICE_VLAN(ETH_P_8021Q, vid, 0, ICE_FWD_TO_VSI); + status = vsi->inner_vlan_ops.del_vlan(vsi, &vlan); + if (status) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + /* Disable VLAN filtering when only VLAN 0 is left */ + if (!ice_vsi_has_non_zero_vlans(vsi)) + vsi->inner_vlan_ops.dis_rx_filtering(vsi); + + if (vlan_promisc) + ice_vf_dis_vlan_promisc(vsi, &vlan); + } + } + +error_param: + /* send the response to the VF */ + if (add_v) + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_VLAN, v_ret, + NULL, 0); + else + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DEL_VLAN, v_ret, + NULL, 0); +} + +/** + * ice_vc_add_vlan_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * Add and program guest VLAN ID + */ +static int ice_vc_add_vlan_msg(struct ice_vf *vf, u8 *msg) +{ + return ice_vc_process_vlan_msg(vf, msg, true); +} + +/** + * ice_vc_remove_vlan_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * remove programmed guest VLAN ID + */ +static int ice_vc_remove_vlan_msg(struct ice_vf *vf, u8 *msg) +{ + return ice_vc_process_vlan_msg(vf, msg, false); +} + +/** + * ice_vsi_is_rxq_crc_strip_dis - check if Rx queue CRC strip is disabled or not + * @vsi: pointer to the VF VSI info + */ +static bool ice_vsi_is_rxq_crc_strip_dis(struct ice_vsi *vsi) +{ + u16 i; + + for (i = 0; i < vsi->alloc_rxq; i++) + if (vsi->rx_rings[i]->rx_crc_strip_dis) + return true; + + return false; +} + +/** + * ice_vc_ena_vlan_stripping + * @vf: pointer to the VF info + * + * Enable VLAN header stripping for a given VF + */ +static int ice_vc_ena_vlan_stripping(struct ice_vf *vf) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto 
error_param; + } + + if (!ice_vf_vlan_offload_ena(vf->driver_caps)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (ice_vsi_is_rxq_crc_strip_dis(vsi)) { + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto error_param; + } + if (vsi->inner_vlan_ops.ena_stripping(vsi, ETH_P_8021Q)) + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + else + vf->vlan_strip_ena |= ICE_INNER_VLAN_STRIP_ENA; + +error_param: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_VLAN_STRIPPING, + v_ret, NULL, 0); +} + +/** + * ice_vc_dis_vlan_stripping + * @vf: pointer to the VF info + * + * Disable VLAN header stripping for a given VF + */ +static int ice_vc_dis_vlan_stripping(struct ice_vf *vf) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vf_vlan_offload_ena(vf->driver_caps)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (vsi->inner_vlan_ops.dis_stripping(vsi)) + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + else + vf->vlan_strip_ena &= ~ICE_INNER_VLAN_STRIP_ENA; + +error_param: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING, + v_ret, NULL, 0); +} + +/** + * ice_vc_get_rss_hena - return the RSS HENA bits allowed by the hardware + * @vf: pointer to the VF info + */ +static int ice_vc_get_rss_hena(struct ice_vf *vf) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_rss_hena *vrh = NULL; + int len = 0, ret; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) { + dev_err(ice_pf_to_dev(vf->pf), "RSS not supported by PF\n"); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + len = sizeof(struct virtchnl_rss_hena); + vrh = kzalloc(len, GFP_KERNEL); + if (!vrh) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + len = 0; + goto err; + } + + vrh->hena = ICE_DEFAULT_RSS_HENA; +err: + /* send the response back to the VF */ + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_RSS_HENA_CAPS, v_ret, + (u8 *)vrh, len); + kfree(vrh); + return ret; +} + +#ifdef HAVE_TC_SETUP_CLSFLOWER +/** + * ice_validate_cloud_filter + * @vf: pointer to the VF info + * @tc_filter: pointer to virtchnl_filter + * + * This function validates cloud filter programmed as TC filter for ADQ + */ +static int +ice_validate_cloud_filter(struct ice_vf *vf, struct virtchnl_filter *tc_filter) +{ + struct virtchnl_l4_spec mask = tc_filter->mask.tcp_spec; + struct virtchnl_l4_spec data = tc_filter->data.tcp_spec; + struct ice_pf *pf = vf->pf; + struct device *dev; + + dev = ice_pf_to_dev(pf); + if (!tc_filter->action) { + dev_err(dev, "VF %d: Currently ADQ doesn't support Drop Action\n", + vf->vf_id); + return -EOPNOTSUPP; + } + + /* Check filter if it's programmed for advanced mode or basic mode. + * There are two ADQ modes (for VF only), + * 1. Basic mode: intended to allow as many filter options as possible + * to be added to a VF in Non-trusted mode. Main goal is + * to add filters to its own MAC and VLAN ID. + * 2. Advanced mode: is for allowing filters to be applied other than + * its own MAC or VLAN. This mode requires the VF to be + * Trusted. 
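The basic/advanced split can be reduced to a small trust decision. The sketch below is an illustration only, with simplified fields; the real ice_validate_cloud_filter() also checks IP addresses, L4 ports and VLAN ownership as shown next.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Simplified model of the ADQ filter-mode decision: an untrusted VF may only
 * steer traffic destined to its own MAC address ("basic mode"); any other
 * match criteria require a trusted VF ("advanced mode").
 */
static bool adq_filter_allowed(bool vf_trusted,
                               const unsigned char dst_mac[6],
                               const unsigned char vf_mac[6],
                               bool has_other_fields)
{
        bool targets_own_mac = memcmp(dst_mac, vf_mac, 6) == 0;

        if (targets_own_mac && !has_other_fields)
                return true;            /* basic mode: always allowed */

        return vf_trusted;              /* advanced mode: trusted VFs only */
}

int main(void)
{
        unsigned char vf_mac[6] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
        unsigned char other[6]  = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x02 };

        printf("untrusted, own MAC only: %d\n",
               adq_filter_allowed(false, vf_mac, vf_mac, false));
        printf("untrusted, foreign MAC : %d\n",
               adq_filter_allowed(false, other, vf_mac, false));
        printf("trusted,   foreign MAC : %d\n",
               adq_filter_allowed(true, other, vf_mac, true));
        return 0;
}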
+ */ + if (mask.dst_mac[0] && !mask.dst_ip[0]) { + /* As of now supporting, MAC filter if MAC address is the + * default LAN addr for this VF + */ + if (!ice_mac_fltr_exist(&pf->hw, data.dst_mac, + vf->lan_vsi_idx)) { + dev_err(dev, "Destination MAC %pM doesn't belong to VF %d\n", + data.dst_mac, vf->vf_id); + return -EINVAL; + } + } else if (!test_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) { + /* Check if VF is trusted */ + dev_err(dev, "VF %d not trusted, make VF trusted to add ADQ filters\n", + vf->vf_id); + return -EOPNOTSUPP; + } + + if (mask.dst_mac[0] & data.dst_mac[0]) { + if (is_broadcast_ether_addr(data.dst_mac) || + is_zero_ether_addr(data.dst_mac)) { + dev_err(dev, "VF %d: Invalid Dest MAC addr %pM\n", + vf->vf_id, data.dst_mac); + return -EINVAL; + } + } + + if (mask.src_mac[0] & data.src_mac[0]) { + if (is_broadcast_ether_addr(data.src_mac) || + is_zero_ether_addr(data.src_mac)) { + dev_err(dev, "VF %d: Invalid Source MAC addr %pM\n", + vf->vf_id, data.src_mac); + return -EINVAL; + } + } + + if (mask.dst_port & data.dst_port) { + if (!data.dst_port) { + dev_err(dev, "VF %d: Invalid Dest port\n", vf->vf_id); + return -EINVAL; + } + } + + if (mask.src_port & data.src_port) { + if (!data.src_port) { + dev_err(dev, "VF %d: Invalid Source port\n", vf->vf_id); + return -EINVAL; + } + } + + if (mask.vlan_id & data.vlan_id) { + if (ntohs(data.vlan_id) >= VLAN_N_VID) { + dev_err(dev, "VF %d: invalid VLAN ID\n", vf->vf_id); + return -EINVAL; + } + /* Validate VLAN for the VF the same way we do for the PF */ + if (!ice_vlan_fltr_exist(&pf->hw, ntohs(data.vlan_id), + vf->lan_vsi_idx)) { + dev_err(dev, "specified VLAN %u doesn't belong to this VF %d\n", + ntohs(data.vlan_id), vf->vf_id); + return -EINVAL; + } + } + + return 0; +} + +/** + * ice_get_tc_flower_fltr - locate the TC flower filter + * @vf: pointer to the VF info + * @fltr: pointer to the tc_flower filter + * @mask: ptr to filter mask (representing filter data specification) + * + * This function is used to locate specific filter in filter list. 
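The lookup below compares only the fields whose mask bits the VF actually set. A minimal standalone version of that idiom, with placeholder field names rather than the driver's structures, looks like this:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct l4_tuple {
        uint32_t dst_ip;
        uint16_t dst_port;
};

/* A candidate entry only has to match the fields the caller masked;
 * unmasked fields are ignored.
 */
static bool tuple_matches(const struct l4_tuple *key,
                          const struct l4_tuple *mask,
                          const struct l4_tuple *entry)
{
        if (mask->dst_ip && key->dst_ip != entry->dst_ip)
                return false;
        if (mask->dst_port && key->dst_port != entry->dst_port)
                return false;
        return true;    /* every masked field matched */
}

int main(void)
{
        struct l4_tuple entry        = { .dst_ip = 0x0a000001, .dst_port = 80 };
        struct l4_tuple key          = { .dst_ip = 0x0a000001, .dst_port = 443 };
        struct l4_tuple mask_ip_only = { .dst_ip = 0xffffffff, .dst_port = 0 };
        struct l4_tuple mask_ip_port = { .dst_ip = 0xffffffff, .dst_port = 0xffff };

        printf("match on IP only : %d\n", tuple_matches(&key, &mask_ip_only, &entry));
        printf("match on IP+port : %d\n", tuple_matches(&key, &mask_ip_port, &entry));
        return 0;
}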
It returns + * NULL if unable to locate such filter otherwise returns found filter + */ +static struct ice_tc_flower_fltr * +ice_get_tc_flower_fltr(struct ice_vf *vf, struct ice_tc_flower_fltr *fltr, + struct virtchnl_l4_spec *mask) +{ + struct ice_tc_flower_lyr_2_4_hdrs *hdrs; + struct ice_tc_l2_hdr *l2_key; + struct ice_tc_l3_hdr *l3_key; + struct ice_tc_l4_hdr *l4_key; + struct ice_tc_flower_fltr *f; + struct hlist_node *node; + + hdrs = &fltr->outer_headers; + if (!hdrs) + return NULL; + + l2_key = &hdrs->l2_key; + l3_key = &hdrs->l3_key; + l4_key = &hdrs->l4_key; + + hlist_for_each_entry_safe(f, node, + &vf->tc_flower_fltr_list, tc_flower_node) { + struct ice_tc_flower_lyr_2_4_hdrs *f_hdrs; + + if (!f->dest_vsi || fltr->dest_vsi != f->dest_vsi || + fltr->dest_vsi->idx != f->dest_vsi->idx) + continue; + + f_hdrs = &f->outer_headers; + + /* handle L2 fields if specified and do not match */ + if ((mask->src_mac[0] && + !ether_addr_equal(l2_key->src_mac, + f_hdrs->l2_key.src_mac)) || + (mask->dst_mac[0] && + !ether_addr_equal(l2_key->dst_mac, + f_hdrs->l2_key.dst_mac))) + continue; + + /* handle VLAN if specified and do not match */ + if (mask->vlan_id && hdrs->vlan_hdr.vlan_id != + f_hdrs->vlan_hdr.vlan_id) + continue; + + /* handle L3 IPv4 if specified and do not match + * for ipv4 data to be valid, check only first dword of mask + */ + if (l2_key->n_proto == ETH_P_IP) + if ((mask->dst_ip[0] && + l3_key->dst_ipv4 != f_hdrs->l3_key.dst_ipv4) || + (mask->src_ip[0] && + l3_key->src_ipv4 != f_hdrs->l3_key.src_ipv4)) + continue; + + /* handle L3 IPv6 if specified and do not match + * for ipv6 to be valid, last dword from mask must be valid + * hence check only last dword of mask + */ + if (l2_key->n_proto == ETH_P_IPV6 && mask->dst_ip[3]) + if (memcmp(&l3_key->ip.v6.dst_ip6, + &f_hdrs->l3_key.ip.v6.dst_ip6, + sizeof(l3_key->ip.v6.dst_ip6))) + continue; + if (l2_key->n_proto == ETH_P_IPV6 && mask->src_ip[3]) + if (memcmp(&l3_key->ip.v6.src_ip6, + &f_hdrs->l3_key.ip.v6.src_ip6, + sizeof(l3_key->ip.v6.src_ip6))) + continue; + + /* make sure "ip_proto" is same */ + if (l3_key->ip_proto != f_hdrs->l3_key.ip_proto) + continue; + + /* handle L4 fields if specified and do not match */ + if ((mask->dst_port && + l4_key->dst_port != f_hdrs->l4_key.dst_port) || + (mask->src_port && + l4_key->src_port != f_hdrs->l4_key.src_port)) + continue; + + /* if reached here, means found matching filter entry */ + return f; + } + + return NULL; +} + +/** + * ice_vc_chnl_fltr_state_verify - verify general state of VF + * @vf: pointer to the VF info + * @vcf: pointer to virtchannel_filter + * + * This function performs general validation including validation of filter + * message and content + */ +static enum virtchnl_status_code +ice_vc_chnl_fltr_state_verify(struct ice_vf *vf, struct virtchnl_filter *vcf) +{ + struct ice_pf *pf = vf->pf; + struct ice_vsi *vsi; + u32 max_tc_allowed; + struct device *dev; + + dev = ice_pf_to_dev(pf); + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) + return VIRTCHNL_STATUS_ERR_PARAM; + + if (!ice_is_vf_adq_ena(vf)) { + dev_err(dev, "VF %d: ADQ is not enabled, can't apply switch filter\n", + vf->vf_id); + return VIRTCHNL_STATUS_ERR_PARAM; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + dev_err(dev, "VF %d: No corresponding VF VSI\n", vf->vf_id); + return VIRTCHNL_STATUS_ERR_PARAM; + } + + max_tc_allowed = ice_vc_get_max_chnl_tc_allowed(vf); + if (vcf->action == VIRTCHNL_ACTION_TC_REDIRECT && + vcf->action_meta >= max_tc_allowed) { + dev_err(dev, "VF %d: Err: 
action(%u)_meta(TC): %u >= max_tc_allowed (%u)\n", + vf->vf_id, vcf->action, vcf->action_meta, + max_tc_allowed); + return VIRTCHNL_STATUS_ERR_PARAM; + } + + /* enforce supported flow_type based on negotiated capability */ + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ_V2) { + if (!(vcf->flow_type == VIRTCHNL_TCP_V4_FLOW || + vcf->flow_type == VIRTCHNL_TCP_V6_FLOW || + vcf->flow_type == VIRTCHNL_UDP_V4_FLOW || + vcf->flow_type == VIRTCHNL_UDP_V6_FLOW)) { + dev_err(ice_pf_to_dev(pf), "VF %d: Invalid input/s, unsupported flow_type %u\n", + vf->vf_id, vcf->flow_type); + return VIRTCHNL_STATUS_ERR_PARAM; + } + } else { + if (!(vcf->flow_type == VIRTCHNL_TCP_V4_FLOW || + vcf->flow_type == VIRTCHNL_TCP_V6_FLOW)){ + dev_err(ice_pf_to_dev(pf), "VF %d: Invalid input/s, unsupported flow_type %u\n", + vf->vf_id, vcf->flow_type); + return VIRTCHNL_STATUS_ERR_PARAM; + } + } + + if (ice_validate_cloud_filter(vf, vcf)) { + dev_err(dev, "VF %d: Invalid input/s, can't apply switch filter\n", + vf->vf_id); + return VIRTCHNL_STATUS_ERR_PARAM; + } + + /* filter state fully verified, return SUCCESS */ + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_setup_fltr - populate fields in TC flower filter structure + * @vf: ptr to VF + * @vcf: ptr to virt channel message + * @fltr: pointer to the TC filter structure + * @dest_vsi: pointer to destination VSI for filter + * @tc_class: TC number when action type to FWD_TO_VSI, counter index when + * action is count, queue number when action is FWD_TO_QUEUE, + * queue group ID when action is FWD_TO_QGRP + */ +static void +ice_setup_fltr(struct ice_vf *vf, struct ice_tc_flower_fltr *fltr, + struct virtchnl_filter *vcf, struct ice_vsi *dest_vsi, + int tc_class) +{ + struct virtchnl_l4_spec *mask = &vcf->mask.tcp_spec; + struct virtchnl_l4_spec *tcf = &vcf->data.tcp_spec; + struct ice_tc_flower_lyr_2_4_hdrs *hdrs; + + memset(fltr, 0, sizeof(*fltr)); + + hdrs = &fltr->outer_headers; + if (!hdrs) + return; + + /* copy L2 MAC address and MAC mask */ + ether_addr_copy(hdrs->l2_key.dst_mac, tcf->dst_mac); + ether_addr_copy(hdrs->l2_mask.dst_mac, mask->dst_mac); + if (!is_zero_ether_addr(hdrs->l2_key.dst_mac)) + fltr->flags |= ICE_TC_FLWR_FIELD_DST_MAC; + + /* copy L2 source address and MAC mask */ + ether_addr_copy(hdrs->l2_key.src_mac, tcf->src_mac); + ether_addr_copy(hdrs->l2_mask.src_mac, mask->src_mac); + if (!is_zero_ether_addr(hdrs->l2_key.src_mac)) + fltr->flags |= ICE_TC_FLWR_FIELD_SRC_MAC; + + /* copy VLAN info */ + hdrs->vlan_hdr.vlan_id = mask->vlan_id & tcf->vlan_id; + if (hdrs->vlan_hdr.vlan_id) + fltr->flags |= ICE_TC_FLWR_FIELD_VLAN; + + /* copy L4 fields */ + hdrs->l4_key.dst_port = mask->dst_port & tcf->dst_port; + hdrs->l4_mask.dst_port = mask->dst_port; + if (hdrs->l4_key.dst_port) + fltr->flags |= ICE_TC_FLWR_FIELD_DEST_L4_PORT; + + hdrs->l4_key.src_port = mask->src_port & tcf->src_port; + hdrs->l4_mask.src_port = mask->src_port; + if (hdrs->l4_key.src_port) + fltr->flags |= ICE_TC_FLWR_FIELD_SRC_L4_PORT; + + /* copy L3 fields, IPv4[6] */ + if (vcf->flow_type == VIRTCHNL_TCP_V4_FLOW || + vcf->flow_type == VIRTCHNL_UDP_V4_FLOW) { + struct ice_tc_l3_hdr *key, *msk; + + key = &hdrs->l3_key; + msk = &hdrs->l3_mask; + + /* set n_proto based on flow_type */ + hdrs->l2_key.n_proto = ETH_P_IP; + if (mask->dst_ip[0] & tcf->dst_ip[0]) { + key->dst_ipv4 = tcf->dst_ip[0]; + msk->dst_ipv4 = mask->dst_ip[0]; + fltr->flags |= ICE_TC_FLWR_FIELD_DEST_IPV4; + } + if (mask->src_ip[0] & tcf->src_ip[0]) { + key->src_ipv4 = tcf->src_ip[0]; + msk->src_ipv4 = mask->src_ip[0]; + 
fltr->flags |= ICE_TC_FLWR_FIELD_SRC_IPV4; + } + } else if (vcf->flow_type == VIRTCHNL_TCP_V6_FLOW || + vcf->flow_type == VIRTCHNL_UDP_V6_FLOW) { + struct ice_tc_l3_hdr *key, *msk; + + key = &hdrs->l3_key; + msk = &hdrs->l3_mask; + + /* set n_proto based on flow_type */ + hdrs->l2_key.n_proto = ETH_P_IPV6; + if (mask->dst_ip[3] & tcf->dst_ip[3]) { + memcpy(&key->ip.v6.dst_ip6, tcf->dst_ip, + sizeof(key->ip.v6.dst_ip6)); + memcpy(&msk->ip.v6.dst_ip6, mask->dst_ip, + sizeof(msk->ip.v6.dst_ip6)); + fltr->flags |= ICE_TC_FLWR_FIELD_DEST_IPV6; + } + if (mask->src_ip[3] & tcf->src_ip[3]) { + memcpy(&key->ip.v6.src_ip6, tcf->src_ip, + sizeof(key->ip.v6.src_ip6)); + memcpy(&msk->ip.v6.src_ip6, mask->src_ip, + sizeof(msk->ip.v6.src_ip6)); + fltr->flags |= ICE_TC_FLWR_FIELD_SRC_IPV6; + } + } + + /* get the VSI to which the TC belongs to */ + fltr->dest_vsi = dest_vsi; + if (vcf->action == VIRTCHNL_ACTION_TC_REDIRECT) + fltr->action.fltr_act = ICE_FWD_TO_VSI; + else + fltr->action.fltr_act = ICE_DROP_PACKET; + + /* make sure to include VF's MAC address when adding ADQ filter */ + if ((!(fltr->flags & ICE_TC_FLWR_FIELD_DST_MAC)) && + fltr->action.fltr_act == ICE_FWD_TO_VSI) { + fltr->flags |= ICE_TC_FLWR_FIELD_DST_MAC; + ether_addr_copy(hdrs->l2_key.dst_mac, vf->dev_lan_addr.addr); + eth_broadcast_addr(hdrs->l2_mask.dst_mac); + } + + /* 'tc_class' could be TC/QUEUE/QUEUE_GRP number */ + fltr->action.tc_class = tc_class; + + /* must to set the tunnel_type to be INVALID, otherwise if left as zero, + * it gets treated as VxLAN tunnel since definition of VxLAN tunnel + * type is zero + */ + fltr->tunnel_type = TNL_LAST; + + /* set ip_proto in headers based on flow_type which is part of VIRTCHNL + * message, "add filter" + */ + if (vcf->flow_type == VIRTCHNL_TCP_V4_FLOW || + vcf->flow_type == VIRTCHNL_TCP_V6_FLOW) + hdrs->l3_key.ip_proto = IPPROTO_TCP; + else + hdrs->l3_key.ip_proto = IPPROTO_UDP; +} + +/** + * ice_vc_del_switch_filter + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * This function deletes a cloud filter programmed as TC filter for ADQ + */ +static int ice_vc_del_switch_filter(struct ice_vf *vf, u8 *msg) +{ + struct virtchnl_filter *vcf = (struct virtchnl_filter *)msg; + struct virtchnl_l4_spec *mask = &vcf->mask.tcp_spec; + struct ice_rule_query_data rule; + enum virtchnl_status_code v_ret; + struct ice_tc_flower_fltr fltr; + struct ice_tc_flower_fltr *f; + struct ice_pf *pf = vf->pf; + struct ice_vsi *dest_vsi; + struct device *dev; + int err; + + dev = ice_pf_to_dev(pf); + /* Advanced switch filters and DCF are mutually exclusive. */ + if (ice_is_dcf_enabled(pf)) { + dev_err(dev, "Device Control Functionality is currently enabled. 
Advanced switch filters cannot be deleted.\n"); + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto err; + } + + v_ret = ice_vc_chnl_fltr_state_verify(vf, vcf); + if (v_ret) { + dev_err(dev, "VF %d: failed to verify ADQ state during filter message processing\n", + vf->vf_id); + goto err; + } + + dest_vsi = pf->vsi[vf->ch[vcf->action_meta].vsi_idx]; + + /* prepare the TC flower filter based on input */ + ice_setup_fltr(vf, &fltr, vcf, dest_vsi, vcf->action_meta); + + /* locate the filter in VF tc_flower filter list */ + f = ice_get_tc_flower_fltr(vf, &fltr, mask); + if (!f) { + dev_err(dev, "VF %d: Invalid input/s, unable to locate filter due to mismatch\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* Deleting TC filter */ + rule.rid = f->rid; + rule.rule_id = f->rule_id; + rule.vsi_handle = f->dest_id; + err = ice_rem_adv_rule_by_id(&pf->hw, &rule); + if (err) { + dev_err(dev, "VF %d: Failed to delete switch filter for tc %u, err %d\n", + vf->vf_id, vcf->action_meta, err); + v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; + goto err; + } + + /* book-keeping and update filter type if filter count reached zero */ + dest_vsi->num_chnl_fltr--; + + /* reset filter type for channel if channel filter + * count reaches zero + */ + if (!dest_vsi->num_chnl_fltr) + vf->ch[vcf->action_meta].fltr_type = ICE_CHNL_FLTR_TYPE_INVALID; + + hlist_del(&f->tc_flower_node); + devm_kfree(dev, f); + if (f->flags & ICE_TC_FLWR_FIELD_DST_MAC) + vf->num_dmac_chnl_fltrs--; + v_ret = VIRTCHNL_STATUS_SUCCESS; +err: + /* send the response back to the VF */ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DEL_CLOUD_FILTER, v_ret, + NULL, 0); +} + +/** + * ice_vc_add_switch_filter + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * This function adds a switch filter programmed as TC filter for ADQ + * + * General info about filtering mode: + * VF ADQ has two different modes when it comes to applying the switch + * filters + * 1. basic mode: only dst MAC and dst VLAN filters supported + * 2. advanced mode: all combination of filters including dst MAC and + * dst VLAN ex: + * a. dst IP + dst PORT + * b. dst MAC + src PORT + * c. dst MAC + dst PORT + * basic mode is for 'untrusted VFs' and advanced mode is only for + * 'trusted VFs'. When a VF is toggled from being 'trusted' to + * 'untrusted' we remove all filters irrespective if it's basic or + * advanced. + * when ADQ is enabled we need to do ice_down irrespective if VF is + * 'trusted' or not and delete switch filters only if a 'trusted' VF + * is made 'untrusted'. + */ +static int ice_vc_add_switch_filter(struct ice_vf *vf, u8 *msg) +{ + struct virtchnl_filter *vcf = (struct virtchnl_filter *)msg; + struct ice_tc_flower_fltr *fltr = NULL; + enum virtchnl_status_code v_ret; + struct ice_pf *pf = vf->pf; + struct ice_vsi *dest_vsi; + struct device *dev; + int ret; + + dev = ice_pf_to_dev(pf); + /* Advanced switch filters and DCF are mutually exclusive. */ + if (ice_is_dcf_enabled(pf)) { + dev_err(dev, "Device Control Functionality is currently enabled. 
Advanced switch filters cannot be added\n"); + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto err; + } + + v_ret = ice_vc_chnl_fltr_state_verify(vf, vcf); + if (v_ret) { + dev_err(dev, "VF %d: failed to verify ADQ state during filter message processing\n", + vf->vf_id); + goto err; + } + + dest_vsi = pf->vsi[vf->ch[vcf->action_meta].vsi_idx]; + + fltr = devm_kzalloc(dev, sizeof(*fltr), GFP_KERNEL); + if (!fltr) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + goto err; + } + + /* prepare the TC flower filter based on input */ + ice_setup_fltr(vf, fltr, vcf, dest_vsi, vcf->action_meta); + + /* call function which adds advanced switch filter */ + ret = ice_add_tc_flower_adv_fltr(ice_get_vf_vsi(vf), fltr); + if (ret) { + dev_err(dev, "Failed to add TC Flower filter using advance filter recipe\n"); + v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; + devm_kfree(dev, fltr); + goto err; + } + + INIT_HLIST_NODE(&fltr->tc_flower_node); + hlist_add_head(&fltr->tc_flower_node, &vf->tc_flower_fltr_list); + if (fltr->flags & ICE_TC_FLWR_FIELD_DST_MAC) + vf->num_dmac_chnl_fltrs++; + + v_ret = VIRTCHNL_STATUS_SUCCESS; + vf->adq_fltr_ena = true; + +err: + /* send the response back to the VF */ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_CLOUD_FILTER, v_ret, + NULL, 0); +} + +/** + * ice_conv_virtchnl_speed_to_mbps + * @virt_speed: virt speed that needs to be converted from + * + * convert virt channel speeds to mbps, return link speed on success, + * '0' otherwise + */ +static u32 ice_conv_virtchnl_speed_to_mbps(u16 virt_speed) +{ + u32 speed, link_speed; + + speed = ice_conv_link_speed_to_virtchnl(false, virt_speed); + + /* get link speed in MB to validate rate limit */ + switch (speed) { + case VIRTCHNL_LINK_SPEED_100MB: + link_speed = SPEED_100; + break; + case VIRTCHNL_LINK_SPEED_1GB: + link_speed = SPEED_1000; + break; + case VIRTCHNL_LINK_SPEED_10GB: + link_speed = SPEED_10000; + break; + case VIRTCHNL_LINK_SPEED_20GB: + link_speed = SPEED_20000; + break; + case VIRTCHNL_LINK_SPEED_25GB: + link_speed = SPEED_25000; + break; + case VIRTCHNL_LINK_SPEED_40GB: + link_speed = SPEED_40000; + break; + default: + /* on failure to detect link speed the expectation of the caller + * to this function is '0'. + */ + link_speed = 0; + break; + } + + return link_speed; +} + +/** + * ice_vc_add_qch_msg: Add queue channel and enable ADQ + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + */ +static int ice_vc_add_qch_msg(struct ice_vf *vf, u8 *msg) +{ + struct virtchnl_tc_info *tci = + (struct virtchnl_tc_info *)msg; + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct ice_pf *pf = vf->pf; + int adq_request_qps = 0; + struct ice_link_status *ls; + u16 available_vsis = 0; + u64 total_max_rate = 0; + u32 max_tc_allowed; + struct device *dev; + u16 total_qs = 0; + u32 link_speed; + unsigned int i; + + dev = ice_pf_to_dev(pf); + ls = &pf->hw.port_info->phy.link_info; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* check if VF has negotiated this capability before anything else */ + if (!(vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ)) { + dev_dbg(dev, "VF %d attempting to enable ADQ, but hasn't properly negotiated that capability\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* Currently ADQ and DCB are mutually exclusive and keeping in sync + * with PF, don't allow VF ADQ configuration when DCB Firmware LLDP + * agent is already running/enabled. 
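To see how the Mbps value returned by this helper is consumed, here is a standalone sketch of the rate-budget check performed later in ice_vc_add_qch_msg(); all numbers are invented for the example.

#include <stdint.h>
#include <stdio.h>

/* The ADQ request is rejected when the sum of the per-TC max_tx_rate values
 * exceeds the detected link speed, or a host-configured per-VF cap if one
 * is set.
 */
static int check_tc_rate_budget(const uint64_t *tc_rate_mbps, int num_tc,
                                uint32_t link_mbps, uint32_t host_cap_mbps)
{
        uint64_t total = 0;
        int i;

        for (i = 0; i < num_tc; i++)
                total += tc_rate_mbps[i];

        if (total > link_mbps)
                return -1;              /* exceeds the physical link */
        if (host_cap_mbps && total > host_cap_mbps)
                return -1;              /* exceeds the host-set VF cap */
        return 0;
}

int main(void)
{
        uint64_t rates[4] = { 5000, 5000, 10000, 4000 };        /* 24000 Mbps total */

        printf("25G link, no cap : %s\n",
               check_tc_rate_budget(rates, 4, 25000, 0) ? "reject" : "ok");
        printf("25G link, 20G cap: %s\n",
               check_tc_rate_budget(rates, 4, 25000, 20000) ? "reject" : "ok");
        return 0;
}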
+ */ + if (test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags)) { + dev_err(dev, "FW LLDP is enabled, cannot enable ADQ on VF %d\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* VF ADQ and DCF are mutually exclusive. */ + if (ice_is_dcf_enabled(pf)) { + dev_err(dev, "Device Control Functionality is currently enabled. VF ADQ cannot be enabled\n"); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* ADQ cannot be applied if spoof check is ON */ + if (vf->spoofchk) { + dev_err(dev, "Spoof check is ON, turn it OFF to enable ADQ\n"); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + ice_for_each_vsi(pf, i) { + if (!pf->vsi[i]) + ++available_vsis; + } + + if (available_vsis < tci->num_tc - 1) { + dev_err(dev, "Not enough VSIs left to enable ADQ on VF %d\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + max_tc_allowed = ice_vc_get_max_chnl_tc_allowed(vf); + /* max number of traffic classes for VF currently capped at 4 for legacy + * ADQ and 16 for ADQ V2. + */ + if (!tci->num_tc || tci->num_tc > max_tc_allowed) { + dev_dbg(dev, "VF %d trying to set %u TCs, valid range 1-%u TCs per VF\n", + vf->vf_id, tci->num_tc, max_tc_allowed); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* validate queues for each TC */ + for (i = 0; i < tci->num_tc; i++) { + if (!tci->list[i].count) { + dev_err(dev, "VF %d: TC %d trying to set %u queues, should be > 0 per TC\n", + vf->vf_id, i, tci->list[i].count); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + total_qs += tci->list[i].count; + } + + if (total_qs > ICE_MAX_DFLT_QS_PER_VF) { + dev_err(dev, "VF %d: Total number of queues of all TCs cannot exceed %u\n", + vf->vf_id, ICE_MAX_DFLT_QS_PER_VF); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* Speed in Mbps */ + if (vf->driver_caps & VIRTCHNL_VF_CAP_ADV_LINK_SPEED) + link_speed = ice_conv_link_speed_to_virtchnl(true, + ls->link_speed); + else + link_speed = ice_conv_virtchnl_speed_to_mbps(ls->link_speed); + + if (!link_speed) { + dev_err(dev, "Cannot detect link speed on VF %d\n", vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + for (i = 0; i < tci->num_tc; i++) + if (tci->list[i].max_tx_rate) + total_max_rate += tci->list[i].max_tx_rate; + + if (total_max_rate > link_speed) { + dev_err(dev, "Invalid tx rate specified for ADQ on VF %d\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (vf->max_tx_rate && total_max_rate > vf->max_tx_rate) { + dev_err(dev, "Invalid tx rate specified for ADQ on VF %d, total_max_rate %llu Mpbs > host set max_tx_rate %u Mbps\n", + vf->vf_id, total_max_rate, vf->max_tx_rate); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* need Max VF queues but already have default number of queues */ + adq_request_qps = ICE_MAX_DFLT_QS_PER_VF - pf->num_qps_per_vf; + + if (ice_get_avail_txq_count(pf) < adq_request_qps) { + dev_err(dev, "No queues left to allocate to VF %d\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + goto err; + } else { + /* we need to allocate max VF queues to enable ADQ so as to + * make sure ADQ enabled VF always gets back queues when it + * goes through a reset. 
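The queue accounting for ADQ can be summarized with a short worked example; the value 16 below merely stands in for ICE_MAX_DFLT_QS_PER_VF and is an assumption for illustration.

#include <stdio.h>

#define MAX_QS_PER_VF 16

int main(void)
{
        int cur_vf_qs   = 4;    /* queues the VF already owns */
        int pf_avail_qs = 20;   /* free Tx queues left on the PF */

        /* extra queues that must be reserved so the VF always comes back
         * with the full set after a reset
         */
        int adq_request_qps = MAX_QS_PER_VF - cur_vf_qs;

        if (pf_avail_qs < adq_request_qps)
                printf("reject: need %d more queues, only %d available\n",
                       adq_request_qps, pf_avail_qs);
        else
                printf("grant: reserve %d extra queues, VF now owns %d\n",
                       adq_request_qps, MAX_QS_PER_VF);
        return 0;
}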
+ */ + vf->num_vf_qs = ICE_MAX_DFLT_QS_PER_VF; + } + + /* parse data from the queue channel info */ + vf->num_tc = tci->num_tc; + + for (i = 0; i < vf->num_tc; i++) { + if (tci->list[i].max_tx_rate) + vf->ch[i].max_tx_rate = tci->list[i].max_tx_rate; + + vf->ch[i].num_qps = tci->list[i].count; + vf->ch[i].offset = tci->list[i].offset; + } + + /* set this flag only after making sure all inputs are sane */ + vf->adq_enabled = true; + /* initialize filter enable flag, set it only if filters are applied */ + vf->adq_fltr_ena = false; + + /* reset the VF in order to allocate resources. Don't reset if ADQ_V2 + * capability is negotiated, since in that case AVF driver will request + * for a reset. + */ + if (!(vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ_V2)) { + ice_vc_notify_vf_reset(vf); + ice_reset_vf(vf, false); + } + /* send the response to the VF */ +err: + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ_V2) + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_CHANNELS, + v_ret, (u8 *)tci, sizeof(*tci)); + else + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_CHANNELS, + v_ret, NULL, 0); +} + +/** + * ice_vc_del_qch_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * delete the additional VSIs which are created as part of ADQ + */ +static int ice_vc_del_qch_msg(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct ice_pf *pf = vf->pf; + struct ice_vsi *vsi; + struct device *dev; + u8 tc; + + dev = ice_pf_to_dev(pf); + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* VF ADQ and DCF are mutually exclusive. */ + if (ice_is_dcf_enabled(pf)) { + dev_err(dev, "Device Control Functionality is currently enabled. VF ADQ cannot be enabled\n"); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (ice_is_vf_adq_ena(vf)) { + /* if ADQ_V2 is set, perform inline cleanup of ADQ resources and + * return success and eventually VF driver will initiate reset + * as per design + */ + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ_V2) { + dev_info(ice_pf_to_dev(pf), + "Deleting Queue Channels for ADQ on VF %d and ADQ_V2 is set\n", + vf->vf_id); + + /* release VF ADQ filters and VSIs inline */ + ice_vf_adq_release(vf); + v_ret = VIRTCHNL_STATUS_SUCCESS; + goto err; + } + +#ifdef HAVE_TC_SETUP_CLSFLOWER + /* delete all ADQ filters for given VF */ + ice_del_all_adv_switch_fltr(vf); +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + + /* stop all Tx/Rx rings and clean them before deleting the ADQ + * resources, if not it will throw fail to set the LAN Tx queue + * context error. This is needed irrespective of ADQ_V2. Channel + * related TC starts at 1. Don't down the VSI and related + * resources for TC 0 because it is primary VF VSI and downing + * that VSI is handled somewhere else. + */ + for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) { + vsi = ice_get_vf_adq_vsi(vf, tc); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + if (vf->ch[tc].vsi_num) + ice_down(vsi); + } + + /* this order of code is very important, if num_tc is not + * cleared, VF again rebuilds as ADQ enabled clearly contrary + * to what we're trying to do. Also clearing num_tc before + * deleting ADQ filters leads to the condition where the code + * will try to delete filters when none are configured. 
+ */ + vf->num_tc = 0; + dev_info(ice_pf_to_dev(pf), "Deleting Queue Channels for ADQ on VF %d\n", + vf->vf_id); + + /* reset needs to happen first, before we clear the adq_enabled + * flag, since freeing up of ADQ resources happens based off of + * this flag in reset path. Doing a reset after clearing the + * flag will leave the ADQ resources in zombie state which in + * turn creates undesired problems such as system lock up, stack + * trace etc., + * Also we shouldn't be doing a reset if ADQ flag is cleared in + * some other place, hence sending the failure response back to + * the VF. + */ + ice_vc_notify_vf_reset(vf); + ice_reset_vf(vf, false); + if (ice_is_vf_link_up(vf)) { + /* bring the VSI 0 back up again */ + vsi = ice_get_vf_adq_vsi(vf, 0); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + ice_up(vsi); + } + + vf->adq_enabled = false; + } else { + dev_info(dev, "VF %d trying to delete queue channels but ADQ isn't enabled\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + } + + /* send the response to the VF */ +err: + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ_V2) + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_CHANNELS, + v_ret, msg, + sizeof(struct virtchnl_tc_info)); + else + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_CHANNELS, + v_ret, NULL, 0); +} +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + +/** + * ice_vc_set_rss_hena - set RSS HENA bits for the VF + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + */ +static int ice_vc_set_rss_hena(struct ice_vf *vf, u8 *msg) +{ + struct virtchnl_rss_hena *vrh = (struct virtchnl_rss_hena *)msg; + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct ice_pf *pf = vf->pf; + enum ice_status status; + struct ice_vsi *vsi; + struct device *dev; + + dev = ice_pf_to_dev(pf); + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { + dev_err(dev, "RSS not supported by PF\n"); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* clear all previously programmed RSS configuration to allow VF drivers + * the ability to customize the RSS configuration and/or completely + * disable RSS + */ + status = ice_rem_vsi_rss_cfg(&pf->hw, vsi->idx); + if (status && !vrh->hena) { + /* only report failure to clear the current RSS configuration if + * that was clearly the VF's intention (i.e. vrh->hena = 0) + */ + v_ret = ice_err_to_virt_err(status); + goto err; + } else if (status) { + /* allow the VF to update the RSS configuration even on failure + * to clear the current RSS confguration in an attempt to keep + * RSS in a working state + */ + dev_warn(dev, "Failed to clear the RSS configuration for VF %u\n", + vf->vf_id); + } + + if (vrh->hena) { + status = ice_add_avf_rss_cfg(&pf->hw, vsi->idx, vrh->hena); + v_ret = ice_err_to_virt_err(status); + } + + /* send the response to the VF */ +err: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_SET_RSS_HENA, v_ret, + NULL, 0); +} + +/** + * ice_vc_rdma_msg - send msg to RDMA PF from VF + * @vf: pointer to VF info + * @msg: pointer to msg buffer + * @len: length of the message + * + * This function is called indirectly from the AQ clean function. 
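For context on the hena word handled by these RSS opcodes, the sketch below treats it as a plain bitmap of flow types; the bit positions are placeholders, not the hardware's real assignments.

#include <stdint.h>
#include <stdio.h>

#define DEMO_HENA_IPV4_TCP  (1ULL << 0)
#define DEMO_HENA_IPV4_UDP  (1ULL << 1)
#define DEMO_HENA_IPV6_TCP  (1ULL << 2)
#define DEMO_HENA_IPV6_UDP  (1ULL << 3)

int main(void)
{
        /* default: hash every flow type in this toy example */
        uint64_t hena = DEMO_HENA_IPV4_TCP | DEMO_HENA_IPV4_UDP |
                        DEMO_HENA_IPV6_TCP | DEMO_HENA_IPV6_UDP;

        /* a VF that only wants TCP traffic spread by RSS clears the UDP bits;
         * sending hena == 0 would disable RSS for the VSI entirely
         */
        hena &= ~(DEMO_HENA_IPV4_UDP | DEMO_HENA_IPV6_UDP);

        printf("requested hena = 0x%llx\n", (unsigned long long)hena);
        printf("IPv4 UDP hashed: %s\n",
               (hena & DEMO_HENA_IPV4_UDP) ? "yes" : "no");
        return 0;
}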
+ */ +static int ice_vc_rdma_msg(struct ice_vf *vf, u8 *msg, u16 len) +{ + struct ice_peer_obj *rdma_peer; + int ret; + + rdma_peer = vf->pf->rdma_peer; + if (!rdma_peer) { + pr_err("Invalid RDMA peer attempted to send message to peer\n"); + return -EIO; + } + + if (!rdma_peer->peer_ops || !rdma_peer->peer_ops->vc_receive) { + pr_err("Incomplete RMDA peer attempting to send msg\n"); + return -EINVAL; + } + + ret = rdma_peer->peer_ops->vc_receive(rdma_peer, vf->vf_id, msg, len); + if (ret) + pr_err("Failed to send message to RDMA peer, error %d\n", ret); + + return ret; +} + +/** + * ice_vf_cfg_rdma_ceq_irq_map - configure the CEQ IRQ mapping + * @vf: VF structure associated to the VF that requested the mapping + * @qv_info: RDMA queue vector mapping information + * + * Configure the CEQ index for the passed in VF. This will result in the CEQ + * being able to generate interrupts + */ +static void +ice_vf_cfg_rdma_ceq_irq_map(struct ice_vf *vf, + struct virtchnl_rdma_qv_info *qv_info) +{ + u16 glint_ceqctl_idx = ice_vf_get_glint_ceqctl_idx(vf, + qv_info->ceq_idx); + + u32 regval = (qv_info->v_idx & GLINT_CEQCTL_MSIX_INDX_M) | + ((qv_info->itr_idx << GLINT_CEQCTL_ITR_INDX_S) & + GLINT_CEQCTL_ITR_INDX_M) | GLINT_CEQCTL_CAUSE_ENA_M; + + wr32(&vf->pf->hw, GLINT_CEQCTL(glint_ceqctl_idx), regval); +} + +/** + * ice_vf_cfg_rdma_aeq_irq_map - configure the AEQ IRQ mapping + * @vf: VF structure associated to the VF that requested the mapping + * @qv_info: RDMA queue vector mapping information + * + * Configure the AEQ for the passed in VF. This will result in the AEQ being + * able to generate interrupts + */ +static void +ice_vf_cfg_rdma_aeq_irq_map(struct ice_vf *vf, + struct virtchnl_rdma_qv_info *qv_info) +{ + u32 regval = (qv_info->v_idx & PFINT_AEQCTL_MSIX_INDX_M) | + ((qv_info->itr_idx << VPINT_AEQCTL_ITR_INDX_S) & + VPINT_AEQCTL_ITR_INDX_M) | VPINT_AEQCTL_CAUSE_ENA_M; + + wr32(&vf->pf->hw, VPINT_AEQCTL(vf->vf_id), regval); +} + +/** + * ice_vc_cfg_rdma_irq_map_msg - MSIX mapping of RDMA control queue interrupts + * @vf: VF structure associated to the VF that requested the mapping + * @msg: Message from the VF used to configure the RDMA mapping + * + * Handler for the VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP opcode in virtchnl. This + * causes the specified control queues to be mapped to the specified MSIX + * indices and ITR indices. Also, the control queue's interrupt will be + * enabled. 
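The register writes above compose a value from several bit fields. A standalone sketch of that mask-and-shift packing is shown here; the field widths and positions are invented for the example, and the real GLINT_CEQCTL/VPINT_AEQCTL layouts come from the hardware register definitions.

#include <stdint.h>
#include <stdio.h>

#define DEMO_MSIX_INDX_M   0x000007FFu          /* bits 10:0  */
#define DEMO_ITR_INDX_S    11
#define DEMO_ITR_INDX_M    0x00001800u          /* bits 12:11 */
#define DEMO_CAUSE_ENA_M   0x40000000u          /* bit 30     */

/* Pack an MSIX vector index, an ITR index and the cause-enable bit into one
 * 32-bit control register value.
 */
static uint32_t pack_ceq_ctl(uint32_t msix_idx, uint32_t itr_idx)
{
        return (msix_idx & DEMO_MSIX_INDX_M) |
               ((itr_idx << DEMO_ITR_INDX_S) & DEMO_ITR_INDX_M) |
               DEMO_CAUSE_ENA_M;
}

int main(void)
{
        /* MSIX vector 5, ITR index 2, interrupt cause enabled */
        printf("regval = 0x%08x\n", pack_ceq_ctl(5, 2));
        return 0;
}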
+ */ +static int ice_vc_cfg_rdma_irq_map_msg(struct ice_vf *vf, u8 *msg) { enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct virtchnl_rss_key *vrk = - (struct virtchnl_rss_key *)msg; + struct virtchnl_rdma_qvlist_info *qvlist = + (struct virtchnl_rdma_qvlist_info *)msg; + u16 num_msix_per_vf; + u32 i; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + num_msix_per_vf = vf->pf->num_msix_per_vf; + if (qvlist->num_vectors > num_msix_per_vf) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + for (i = 0; i < qvlist->num_vectors; i++) { + struct virtchnl_rdma_qv_info *qv_info = &qvlist->qv_info[i]; + + if (qv_info->v_idx >= num_msix_per_vf) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (qv_info->ceq_idx == VIRTCHNL_RDMA_INVALID_QUEUE_IDX && + qv_info->aeq_idx == VIRTCHNL_RDMA_INVALID_QUEUE_IDX) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (qv_info->ceq_idx != VIRTCHNL_RDMA_INVALID_QUEUE_IDX && + qv_info->ceq_idx >= num_msix_per_vf) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (qv_info->aeq_idx != VIRTCHNL_RDMA_INVALID_QUEUE_IDX && + qv_info->aeq_idx >= num_msix_per_vf) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + } + + for (i = 0; i < qvlist->num_vectors; i++) { + struct virtchnl_rdma_qv_info *qv_info = &qvlist->qv_info[i]; + + if (qv_info->ceq_idx != VIRTCHNL_RDMA_INVALID_QUEUE_IDX) + ice_vf_cfg_rdma_ceq_irq_map(vf, qv_info); + + if (qv_info->aeq_idx != VIRTCHNL_RDMA_INVALID_QUEUE_IDX) + ice_vf_cfg_rdma_aeq_irq_map(vf, qv_info); + } + +err: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP, + v_ret, NULL, 0); +} + +/** + * ice_vc_clear_rdma_irq_map - clear mapped RDMA control queue interrupts + * @vf: VF structure associated to the VF that requested to release the mapping + * + * Handler for the VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP opcode in virtchnl. This + * causes all of the MSIX mapping of all the RDMA control queues to be cleared + * and disabled. + */ +static int ice_vc_clear_rdma_irq_map(struct ice_vf *vf) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + ice_vf_clear_rdma_irq_map(vf); + +err: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP, + v_ret, NULL, 0); +} + +/** + * ice_vc_query_rxdid - query RXDID supported by DDP package + * @vf: pointer to VF info + * + * Called from VF to query a bitmap of supported flexible + * descriptor RXDIDs of a DDP package. + */ +static int ice_vc_query_rxdid(struct ice_vf *vf) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_supported_rxdids *rxdid = NULL; + struct ice_hw *hw = &vf->pf->hw; struct ice_pf *pf = vf->pf; - struct ice_vsi *vsi = NULL; + int len = 0; + int ret, i; + u32 regval; if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto err; } - if (!ice_vc_isvalid_vsi_id(vf, vrk->vsi_id)) { + if (!(vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto err; } - if (vrk->key_len != ICE_VSIQF_HKEY_ARRAY_SIZE) { + len = sizeof(struct virtchnl_supported_rxdids); + rxdid = kzalloc(len, GFP_KERNEL); + if (!rxdid) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + len = 0; + goto err; + } + + /* RXDIDs supported by DDP package can be read from the register + * to get the supported RXDID bitmap. 
But the legacy 32byte RXDID + * is not listed in DDP package, add it in the bitmap manually. + * Legacy 16byte descriptor is not supported. + */ + rxdid->supported_rxdids |= BIT(ICE_RXDID_LEGACY_1); + + for (i = ICE_RXDID_FLEX_NIC; i < ICE_FLEX_DESC_RXDID_MAX_NUM; i++) { + regval = rd32(hw, GLFLXP_RXDID_FLAGS(i, 0)); + if ((regval >> GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_S) + & GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_M) + rxdid->supported_rxdids |= BIT(i); + } + + pf->supported_rxdids = rxdid->supported_rxdids; + +err: + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_SUPPORTED_RXDIDS, + v_ret, (u8 *)rxdid, len); + kfree(rxdid); + return ret; +} + +/** + * ice_vf_init_vlan_stripping - enable/disable VLAN stripping on initialization + * @vf: VF to enable/disable VLAN stripping for on initialization + * + * Set the default for VLAN stripping based on whether a port VLAN is configured + * and the current VLAN mode of the device. + */ +static int ice_vf_init_vlan_stripping(struct ice_vf *vf) +{ + struct ice_vsi *vsi = ice_get_vf_vsi(vf); + + vf->vlan_strip_ena = 0; + + if (!vsi) + return -EINVAL; + + /* don't modify stripping if port VLAN is configured in SVM since the + * port VLAN is based on the inner/single VLAN in SVM + */ + if (ice_vf_is_port_vlan_ena(vf) && !ice_is_dvm_ena(&vsi->back->hw)) + return 0; + + if (ice_vf_vlan_offload_ena(vf->driver_caps)) { + int err = vsi->inner_vlan_ops.ena_stripping(vsi, ETH_P_8021Q); + + if (!err) + vf->vlan_strip_ena |= ICE_INNER_VLAN_STRIP_ENA; + + return err; + } + + return vsi->inner_vlan_ops.dis_stripping(vsi); +} + +/** + * ice_validate_tpid - validate the VLAN TPID + * @tpid: VLAN TPID + */ +static int ice_validate_tpid(u16 tpid) +{ + if (tpid == ETH_P_8021Q || + tpid == ETH_P_8021AD || + tpid == ETH_P_QINQ1) + return 0; + + return -EINVAL; +} + +/** + * ice_vc_dcf_vlan_offload_msg - send msg to handle VLAN offload from DCF + * @vf: pointer to VF info + * @msg: pointer to msg buffer + */ +static int ice_vc_dcf_vlan_offload_msg(struct ice_vf *vf, u8 *msg) +{ + struct virtchnl_dcf_vlan_offload *offload = (struct virtchnl_dcf_vlan_offload *)msg; + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct ice_dcf_vlan_info *dcf_vlan; + struct ice_pf *pf = vf->pf; + struct ice_vsi *target_vsi; + struct ice_vf *target_vf; + u16 insert_mode; + u16 strip_mode; + u16 vlan_flags; + u16 vlan_type; + + if (!ice_is_dvm_ena(&pf->hw) || !(vf->driver_caps & VIRTCHNL_VF_OFFLOAD_VLAN_V2) || + !ice_is_vf_dcf(vf) || ice_dcf_get_state(pf) != ICE_DCF_STATE_ON) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto err; } - if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) { + vlan_flags = offload->vlan_flags; + insert_mode = (vlan_flags & VIRTCHNL_DCF_VLAN_INSERT_MODE_M) >> + VIRTCHNL_DCF_VLAN_INSERT_MODE_S; + strip_mode = (vlan_flags & VIRTCHNL_DCF_VLAN_STRIP_MODE_M) >> + VIRTCHNL_DCF_VLAN_STRIP_MODE_S; + vlan_type = (vlan_flags & VIRTCHNL_DCF_VLAN_TYPE_M) >> + VIRTCHNL_DCF_VLAN_TYPE_S; + + if (ice_validate_vf_id(pf, offload->vf_id) || ice_validate_tpid(offload->tpid) || + (!insert_mode && !strip_mode) || vlan_type != VIRTCHNL_DCF_VLAN_TYPE_OUTER || + offload->vlan_id >= VLAN_N_VID) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto err; } - vsi = pf->vsi[vf->lan_vsi_idx]; - if (!vsi) { + target_vf = &pf->vf[offload->vf_id]; + if (ice_check_vf_ready_for_cfg(target_vf)) { + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto err; + } + + target_vsi = ice_get_vf_vsi(target_vf); + if (!target_vsi) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + 
goto err; } - if (ice_set_rss(vsi, vrk->key, NULL, 0)) - v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; -error_param: - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_RSS_KEY, v_ret, - NULL, 0); + if (ice_vf_is_port_vlan_ena(target_vf) || ice_vsi_has_non_zero_vlans(target_vsi)) { + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto err; + } + + dcf_vlan = &target_vf->dcf_vlan_info; + + if (insert_mode == VIRTCHNL_DCF_VLAN_INSERT_DISABLE) { + if (dcf_vlan->outer_port_vlan.vid) { + dcf_vlan->outer_port_vlan.vid = 0; + dcf_vlan->applying = 1; + } + } else if (insert_mode == VIRTCHNL_DCF_VLAN_INSERT_PORT_BASED) { + if (strip_mode) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (dcf_vlan->outer_port_vlan.tpid != offload->tpid || + dcf_vlan->outer_port_vlan.vid != offload->vlan_id) { + dcf_vlan->outer_port_vlan.tpid = offload->tpid; + dcf_vlan->outer_port_vlan.vid = offload->vlan_id; + dcf_vlan->outer_port_vlan.prio = 0; + dcf_vlan->outer_port_vlan.fwd_act = ICE_FWD_TO_VSI; + dcf_vlan->applying = 1; + } + } else if (insert_mode) { + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto err; + } + + if (strip_mode == VIRTCHNL_DCF_VLAN_STRIP_DISABLE) { + if (dcf_vlan->outer_stripping_ena) { + dcf_vlan->outer_stripping_ena = 0; + dcf_vlan->applying = 1; + } + } else if (strip_mode == VIRTCHNL_DCF_VLAN_STRIP_INTO_RX_DESC) { + if (dcf_vlan->outer_stripping_tpid != offload->tpid || + !dcf_vlan->outer_stripping_ena) { + if (ice_vsi_is_rxq_crc_strip_dis(target_vsi)) { + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto err; + } + dcf_vlan->outer_stripping_tpid = offload->tpid; + dcf_vlan->outer_stripping_ena = 1; + dcf_vlan->applying = 1; + } + } else if (strip_mode) { + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto err; + } + + if (dcf_vlan->applying) + ice_vc_reset_vf(target_vf); + +err: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DCF_VLAN_OFFLOAD, + v_ret, NULL, 0); } /** - * ice_vc_config_rss_lut + * ice_dcf_handle_aq_cmd - handle the AdminQ command from DCF to FW * @vf: pointer to the VF info - * @msg: pointer to the msg buffer + * @aq_desc: the AdminQ command descriptor + * @aq_buf: the AdminQ command buffer if aq_buf_size is non-zero + * @aq_buf_size: the AdminQ command buffer size * - * Configure the VF's RSS LUT + * The VF splits the AdminQ command into two parts: one is the descriptor of + * AdminQ command, the other is the buffer of AdminQ command (the descriptor + * has BUF flag set). When both of them are received by PF, this function will + * forward them to firmware once to get the AdminQ's response. And also, the + * filled descriptor and buffer of the response will be sent back to VF one by + * one through the virtchnl. 
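A minimal sketch of the descriptor/buffer pairing described above follows; the types, opcodes and the forwarding step are stand-ins for the real virtchnl and AdminQ plumbing.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct demo_desc {
        uint16_t opcode;
        bool     has_buf;       /* models the ICE_AQ_FLAG_BUF flag */
};

struct demo_state {
        struct demo_desc pending;
        bool             desc_received;
};

static void forward_to_fw(const struct demo_desc *d, const void *buf, int len)
{
        (void)buf;      /* contents not inspected in this sketch */
        printf("forward opcode 0x%04x with %d byte buffer\n", d->opcode, len);
}

static void on_desc_msg(struct demo_state *st, const struct demo_desc *d)
{
        if (d->has_buf) {
                st->pending = *d;               /* wait for the buffer message */
                st->desc_received = true;
                return;
        }
        forward_to_fw(d, NULL, 0);              /* buffer-less command: send now */
}

static void on_buf_msg(struct demo_state *st, const void *buf, int len)
{
        if (!st->desc_received) {
                printf("drop: buffer arrived without a descriptor\n");
                return;
        }
        st->desc_received = false;
        forward_to_fw(&st->pending, buf, len);
}

int main(void)
{
        struct demo_state st = { 0 };
        struct demo_desc get_ver = { .opcode = 0x0001, .has_buf = false };
        struct demo_desc add_sw  = { .opcode = 0x02a0, .has_buf = true };
        uint8_t rule[32] = { 0 };

        on_desc_msg(&st, &get_ver);             /* forwarded immediately */
        on_desc_msg(&st, &add_sw);              /* parked, waiting for buffer */
        on_buf_msg(&st, rule, sizeof(rule));    /* now forwarded as a pair */
        return 0;
}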
*/ -static int ice_vc_config_rss_lut(struct ice_vf *vf, u8 *msg) +static int +ice_dcf_handle_aq_cmd(struct ice_vf *vf, struct ice_aq_desc *aq_desc, + u8 *aq_buf, u16 aq_buf_size) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct ice_pf *pf = vf->pf; + enum virtchnl_ops v_op; + enum ice_status aq_ret; + u16 v_msg_len = 0; + u8 *v_msg = NULL; + int ret; + + pf->dcf.aq_desc_received = false; + + if ((aq_buf && !aq_buf_size) || (!aq_buf && aq_buf_size)) + return -EINVAL; + + if (ice_dcf_is_acl_aq_cmd(aq_desc) && !ice_dcf_is_acl_capable(&pf->hw)) + return 0; + + if (ice_dcf_is_udp_tunnel_aq_cmd(aq_desc, aq_buf) && + !(pf->hw.dcf_caps & DCF_UDP_TUNNEL_CAP) && + !ice_is_tunnel_empty(&pf->hw)) + return 0; + + if (ice_dcf_pre_aq_send_cmd(vf, aq_desc, aq_buf, aq_buf_size)) { + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DCF_CMD_DESC, + VIRTCHNL_STATUS_SUCCESS, + (u8 *)aq_desc, sizeof(*aq_desc)); + if (ret || !aq_buf_size) + return ret; + + v_op = VIRTCHNL_OP_DCF_CMD_BUFF; + v_ret = VIRTCHNL_STATUS_SUCCESS; + goto err; + } + + aq_ret = ice_aq_send_cmd(&pf->hw, aq_desc, aq_buf, aq_buf_size, NULL); + /* It needs to send back the AQ response message if ICE_ERR_AQ_ERROR + * returns, some AdminQ handlers will use the error code filled by FW + * to do exception handling. + */ + if (aq_ret && aq_ret != ICE_ERR_AQ_ERROR) { + v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; + v_op = VIRTCHNL_OP_DCF_CMD_DESC; + goto err; + } + + if (aq_ret != ICE_ERR_AQ_ERROR) { + v_ret = ice_dcf_post_aq_send_cmd(pf, aq_desc, aq_buf); + if (v_ret != VIRTCHNL_STATUS_SUCCESS) { + v_op = VIRTCHNL_OP_DCF_CMD_DESC; + goto err; + } + + v_ret = ice_dcf_update_acl_rule_info(pf, aq_desc, aq_buf); + if (v_ret != VIRTCHNL_STATUS_SUCCESS) { + v_op = VIRTCHNL_OP_DCF_CMD_DESC; + goto err; + } + } + + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DCF_CMD_DESC, v_ret, + (u8 *)aq_desc, sizeof(*aq_desc)); + /* Bail out so we don't send the VIRTCHNL_OP_DCF_CMD_BUFF message + * below if failure happens or no AdminQ command buffer. + */ + if (ret || !aq_buf_size) + return ret; + + v_op = VIRTCHNL_OP_DCF_CMD_BUFF; + v_msg_len = le16_to_cpu(aq_desc->datalen); + + /* buffer is not updated if data length exceeds buffer size */ + if (v_msg_len > aq_buf_size) + v_msg_len = 0; + else if (v_msg_len) + v_msg = aq_buf; + + /* send the response back to the VF */ +err: + return ice_vc_send_msg_to_vf(vf, v_op, v_ret, v_msg, v_msg_len); +} + +/** + * ice_vc_dcf_cmd_desc_msg - handle the DCF AdminQ command descriptor + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer which holds the command descriptor + * @len: length of the message + */ +static int ice_vc_dcf_cmd_desc_msg(struct ice_vf *vf, u8 *msg, u16 len) +{ + struct ice_aq_desc *aq_desc = (struct ice_aq_desc *)msg; + struct ice_pf *pf = vf->pf; + + if (!ice_is_vf_dcf(vf) || ice_dcf_get_state(pf) != ICE_DCF_STATE_ON) + goto err; + + if (len != sizeof(*aq_desc) || !ice_dcf_aq_cmd_permitted(aq_desc)) { + /* In case to avoid the VIRTCHNL_OP_DCF_CMD_DESC message with + * the ICE_AQ_FLAG_BUF set followed by another bad message + * VIRTCHNL_OP_DCF_CMD_DESC. + */ + pf->dcf.aq_desc_received = false; + goto err; + } + + /* The AdminQ descriptor needs to be stored for use when the followed + * VIRTCHNL_OP_DCF_CMD_BUFF is received. 
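The parked descriptor is only honored for a bounded time. The sketch below models that staleness guard with a simple tick counter and a made-up 100-tick window in place of jiffies and ICE_DCF_AQ_DESC_TIMEOUT.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_DESC_TIMEOUT 100

static uint64_t now;                    /* stands in for jiffies */

struct demo_pending {
        bool     valid;
        uint64_t expires;
};

static void park_descriptor(struct demo_pending *p)
{
        p->valid = true;
        p->expires = now + DEMO_DESC_TIMEOUT;
}

static bool accept_buffer(struct demo_pending *p)
{
        if (!p->valid || now > p->expires)
                return false;           /* no descriptor, or it went stale */
        p->valid = false;
        return true;
}

int main(void)
{
        struct demo_pending p = { 0 };

        now = 1000;
        park_descriptor(&p);
        now = 1050;                     /* within the window */
        printf("buffer at +50 ticks : %s\n", accept_buffer(&p) ? "paired" : "dropped");

        park_descriptor(&p);
        now = 1300;                     /* past the window */
        printf("buffer at +250 ticks: %s\n", accept_buffer(&p) ? "paired" : "dropped");
        return 0;
}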
+ */ + if (aq_desc->flags & cpu_to_le16(ICE_AQ_FLAG_BUF)) { + pf->dcf.aq_desc = *aq_desc; + pf->dcf.aq_desc_received = true; + pf->dcf.aq_desc_expires = jiffies + ICE_DCF_AQ_DESC_TIMEOUT; + return 0; + } + + return ice_dcf_handle_aq_cmd(vf, aq_desc, NULL, 0); + + /* send the response back to the VF */ +err: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DCF_CMD_DESC, + VIRTCHNL_STATUS_ERR_PARAM, NULL, 0); +} + +/** + * ice_vc_dcf_cmd_buff_msg - handle the DCF AdminQ command buffer + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer which holds the command buffer + * @len: length of the message + */ +static int ice_vc_dcf_cmd_buff_msg(struct ice_vf *vf, u8 *msg, u16 len) +{ + struct ice_pf *pf = vf->pf; + + if (!ice_is_vf_dcf(vf) || ice_dcf_get_state(pf) != ICE_DCF_STATE_ON || + !len || !pf->dcf.aq_desc_received || + time_is_before_jiffies(pf->dcf.aq_desc_expires)) + goto err; + + return ice_dcf_handle_aq_cmd(vf, &pf->dcf.aq_desc, msg, len); + + /* send the response back to the VF */ +err: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DCF_CMD_BUFF, + VIRTCHNL_STATUS_ERR_PARAM, NULL, 0); +} + +static int ice_vc_flush_dcf_rule(struct ice_vf *vf) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct ice_pf *pf = vf->pf; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (!ice_is_vf_dcf(vf)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + ice_rm_all_dcf_sw_rules(pf); + +err: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DCF_RULE_FLUSH, + v_ret, NULL, 0); +} + +/** + * ice_vc_dis_dcf_cap - disable DCF capability for the VF + * @vf: pointer to the VF + */ +static int ice_vc_dis_dcf_cap(struct ice_vf *vf) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (!ice_is_vf_dcf(vf)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (vf->driver_caps & VIRTCHNL_VF_CAP_DCF) { +#ifndef HAVE_NDO_SET_VF_TRUST + ice_set_vf_trust(ice_get_main_vsi(vf->pf)->netdev, vf->vf_id, false); +#endif /* !HAVE_NOD_SET_VF_TRUST */ + vf->driver_caps &= ~VIRTCHNL_VF_CAP_DCF; + ice_rm_all_dcf_sw_rules(vf->pf); + ice_clear_dcf_acl_cfg(vf->pf); + ice_clear_dcf_udp_tunnel_cfg(vf->pf); + vf->pf->hw.dcf_caps &= ~(DCF_ACL_CAP | DCF_UDP_TUNNEL_CAP); + ice_dcf_set_state(vf->pf, ICE_DCF_STATE_OFF); + vf->pf->dcf.vf = NULL; + } +err: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DCF_DISABLE, + v_ret, NULL, 0); +} + +/** + * ice_vc_dcf_get_vsi_map - get VSI mapping table + * @vf: pointer to the VF info + */ +static int ice_vc_dcf_get_vsi_map(struct ice_vf *vf) { - struct virtchnl_rss_lut *vrl = (struct virtchnl_rss_lut *)msg; enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_dcf_vsi_map *vsi_map = NULL; struct ice_pf *pf = vf->pf; - struct ice_vsi *vsi = NULL; + struct ice_vsi *pf_vsi; + u16 len = 0; + int vf_id; + int ret; if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto err; } - if (!ice_vc_isvalid_vsi_id(vf, vrl->vsi_id)) { + if (!ice_is_vf_dcf(vf) || ice_dcf_get_state(pf) != ICE_DCF_STATE_ON) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto err; } - if (vrl->lut_entries != ICE_VSIQF_HLUT_ARRAY_SIZE) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + len = struct_size(vsi_map, vf_vsi, pf->num_alloc_vfs - 1); + vsi_map = kzalloc(len, GFP_KERNEL); + if (!vsi_map) { + 
v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + len = 0; + goto err; } - if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) { + pf_vsi = ice_get_main_vsi(pf); + if (!pf_vsi) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + len = 0; + goto err; } - vsi = pf->vsi[vf->lan_vsi_idx]; - if (!vsi) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + vsi_map->pf_vsi = pf_vsi->vsi_num; + vsi_map->num_vfs = pf->num_alloc_vfs; + + ice_for_each_vf(pf, vf_id) { + struct ice_vf *tmp_vf = &pf->vf[vf_id]; + + if (!ice_is_vf_disabled(tmp_vf) && + test_bit(ICE_VF_STATE_INIT, tmp_vf->vf_states)) + vsi_map->vf_vsi[vf_id] = tmp_vf->lan_vsi_num | + VIRTCHNL_DCF_VF_VSI_VALID; } - if (ice_set_rss(vsi, NULL, vrl->lut, ICE_VSIQF_HLUT_ARRAY_SIZE)) - v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; -error_param: - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_RSS_LUT, v_ret, - NULL, 0); +err: + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DCF_GET_VSI_MAP, v_ret, + (u8 *)vsi_map, len); + kfree(vsi_map); + return ret; } /** - * ice_vc_get_stats_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer + * ice_vc_dcf_query_pkg_info - query DDP package info from PF + * @vf: pointer to VF info * - * called from the VF to get VSI stats + * Called from VF to query DDP package information loaded in PF, + * including track ID, package name, version and device serial + * number. */ -static int ice_vc_get_stats_msg(struct ice_vf *vf, u8 *msg) +static int ice_vc_dcf_query_pkg_info(struct ice_vf *vf) { enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct virtchnl_queue_select *vqs = - (struct virtchnl_queue_select *)msg; - struct ice_eth_stats stats = { 0 }; + struct virtchnl_pkg_info *pkg_info = NULL; + struct ice_hw *hw = &vf->pf->hw; struct ice_pf *pf = vf->pf; - struct ice_vsi *vsi; + int len = 0; + int ret; if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto err; } - if (!ice_vc_isvalid_vsi_id(vf, vqs->vsi_id)) { + if (!ice_is_vf_dcf(vf) || ice_dcf_get_state(pf) != ICE_DCF_STATE_ON) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto err; } - vsi = pf->vsi[vf->lan_vsi_idx]; - if (!vsi) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + len = sizeof(struct virtchnl_pkg_info); + pkg_info = kzalloc(len, GFP_KERNEL); + if (!pkg_info) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + len = 0; + goto err; } - ice_update_eth_stats(vsi); - - stats = vsi->eth_stats; + pkg_info->track_id = hw->active_track_id; + memcpy(&pkg_info->pkg_ver, &hw->active_pkg_ver, + sizeof(pkg_info->pkg_ver)); + memcpy(pkg_info->pkg_name, hw->active_pkg_name, + sizeof(pkg_info->pkg_name)); + memcpy(pkg_info->dsn, pf->dcf.dsn, sizeof(pkg_info->dsn)); -error_param: - /* send the response to the VF */ - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_STATS, v_ret, - (u8 *)&stats, sizeof(stats)); +err: + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DCF_GET_PKG_INFO, + v_ret, (u8 *)pkg_info, len); + kfree(pkg_info); + return ret; } /** - * ice_vc_ena_qs_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * - * called from the VF to enable all or specific queue(s) + * ice_vc_get_max_rss_qregion - message handling for VIRTCHNL_OP_GET_MAX_RSS_QREGION + * @vf: source of the request */ -static int ice_vc_ena_qs_msg(struct ice_vf *vf, u8 *msg) +static int ice_vc_get_max_rss_qregion(struct ice_vf *vf) { enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct virtchnl_queue_select *vqs = - (struct 
virtchnl_queue_select *)msg; - struct ice_pf *pf = vf->pf; + struct virtchnl_max_rss_qregion *max_rss_qregion = NULL; struct ice_vsi *vsi; - unsigned long q_map; - u16 vf_q_id; + int err, len = 0; if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - if (!ice_vc_isvalid_vsi_id(vf, vqs->vsi_id)) { + vsi = vf->pf->vsi[vf->lan_vsi_idx]; + if (!vsi) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - if (!vqs->rx_queues && !vqs->tx_queues) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; + max_rss_qregion = kzalloc(sizeof(*max_rss_qregion), GFP_KERNEL); + if (!max_rss_qregion) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; goto error_param; } - if (vqs->rx_queues > ICE_MAX_BASE_QS_PER_VF || - vqs->tx_queues > ICE_MAX_BASE_QS_PER_VF) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } + len = sizeof(*max_rss_qregion); - vsi = pf->vsi[vf->lan_vsi_idx]; - if (!vsi) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } + max_rss_qregion->vport_id = vf->lan_vsi_num; + max_rss_qregion->qregion_width = ilog2(ICE_MAX_RSS_QS_PER_VF); + if (vsi->global_lut_id) + max_rss_qregion->qregion_width = ilog2(ICE_MAX_RSS_QS_PER_LARGE_VF); - /* Enable only Rx rings, Tx rings were enabled by the FW when the - * Tx queue group list was configured and the context bits were - * programmed using ice_vsi_cfg_txqs - */ - q_map = vqs->rx_queues; - for_each_set_bit(vf_q_id, &q_map, ICE_MAX_BASE_QS_PER_VF) { - if (!ice_vc_isvalid_q_id(vf, vqs->vsi_id, vf_q_id)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } +error_param: + err = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_MAX_RSS_QREGION, v_ret, + (u8 *)max_rss_qregion, len); + kfree(max_rss_qregion); + return err; +} - /* Skip queue if enabled */ - if (test_bit(vf_q_id, vf->rxq_ena)) - continue; +static bool ice_vc_supported_queue_type(s32 queue_type) +{ + return (queue_type == VIRTCHNL_QUEUE_TYPE_RX || queue_type == VIRTCHNL_QUEUE_TYPE_TX); +} - if (ice_vsi_ctrl_rx_ring(vsi, true, vf_q_id)) { - dev_err(&vsi->back->pdev->dev, - "Failed to enable Rx ring %d on VSI %d\n", - vf_q_id, vsi->vsi_num); - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } +/** + * ice_vc_validate_qs_v2_msg - validate all parameters sent in the qs_msg structure + * @vf: VF the message was received from + * @qs_msg: contents of the message from the VF + * + * Used to validate both the VIRTCHNL_OP_ENABLE_QUEUES_V2 and VIRTCHNL_OP_DISABLE_QUEUES_V2 + * messages. This should always be called before attempting to enable and/or disable queues on + * behalf of a VF in response to the preivously mentioned opcodes. If all checks succeed, then + * return success indicating to the caller that the qs_msg is valid. Otherwise return false, + * indicating to the caller that the qs_msg is invalid. 
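 *
 * For reference, a request that passes these checks could look like the
 * following (all values purely illustrative; vport_id must match the VF's
 * lan_vsi_num): one Rx chunk and one Tx chunk, each describing a contiguous
 * queue ID range that must stay below vf->num_vf_qs, so start_queue_id = 0
 * with num_queues = 4 covers queue IDs 0 through 3:
 *
 *        struct virtchnl_del_ena_dis_queues *msg;
 *
 *        msg->vport_id = vf_lan_vsi_num;
 *        msg->chunks.num_chunks = 2;
 *        msg->chunks.chunks[0] = (struct virtchnl_queue_chunk) {
 *                .type = VIRTCHNL_QUEUE_TYPE_RX,
 *                .start_queue_id = 0,
 *                .num_queues = 4,
 *        };
 *        msg->chunks.chunks[1] = (struct virtchnl_queue_chunk) {
 *                .type = VIRTCHNL_QUEUE_TYPE_TX,
 *                .start_queue_id = 0,
 *                .num_queues = 4,
 *        };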
+ */ +static bool ice_vc_validate_qs_v2_msg(struct ice_vf *vf, struct virtchnl_del_ena_dis_queues *qs_msg) +{ + struct virtchnl_queue_chunks *chunks = &qs_msg->chunks; + int i; + + if (qs_msg->vport_id != vf->lan_vsi_num) + return false; + + if (!chunks->num_chunks) + return false; - set_bit(vf_q_id, vf->rxq_ena); - vf->num_qs_ena++; + for (i = 0; i < chunks->num_chunks; i++) { + if (!ice_vc_supported_queue_type(chunks->chunks[i].type)) + return false; + + if (!chunks->chunks[i].num_queues) + return false; + + if (chunks->chunks[i].start_queue_id + chunks->chunks[i].num_queues > vf->num_vf_qs) + return false; } - vsi = pf->vsi[vf->lan_vsi_idx]; - q_map = vqs->tx_queues; - for_each_set_bit(vf_q_id, &q_map, ICE_MAX_BASE_QS_PER_VF) { - if (!ice_vc_isvalid_q_id(vf, vqs->vsi_id, vf_q_id)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } + return true; +} - /* Skip queue if enabled */ - if (test_bit(vf_q_id, vf->txq_ena)) - continue; +#define ice_for_each_q_in_chunk(chunk, q_id) \ + for ((q_id) = (chunk)->start_queue_id; \ + (q_id) < (chunk)->start_queue_id + (chunk)->num_queues; \ + (q_id)++) + +static int ice_vc_ena_rxq_chunk(struct ice_vf *vf, struct virtchnl_queue_chunk *chunk) +{ + struct ice_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + int q_id; + + if (!vsi) + return -EINVAL; + + ice_for_each_q_in_chunk(chunk, q_id) { + int err = ice_vf_vsi_ena_single_rxq(vf, vsi, q_id, q_id); - set_bit(vf_q_id, vf->txq_ena); - vf->num_qs_ena++; + if (err) + return err; } - /* Set flag to indicate that queues are enabled */ - if (v_ret == VIRTCHNL_STATUS_SUCCESS) - set_bit(ICE_VF_STATE_QS_ENA, vf->vf_states); + return 0; +} -error_param: - /* send the response to the VF */ - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_QUEUES, v_ret, - NULL, 0); +static int ice_vc_ena_txq_chunk(struct ice_vf *vf, struct virtchnl_queue_chunk *chunk) +{ + struct ice_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + int q_id; + + if (!vsi) + return -EINVAL; + + ice_for_each_q_in_chunk(chunk, q_id) + ice_vf_vsi_ena_single_txq(vf, vsi, q_id, q_id); + + return 0; } /** - * ice_vc_dis_qs_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * - * called from the VF to disable all or specific - * queue(s) + * ice_vc_ena_qs_v2_msg - message handling for VIRTCHNL_OP_ENABLE_QUEUES_V2 + * @vf: source of the request + * @msg: message to handle */ -static int ice_vc_dis_qs_msg(struct ice_vf *vf, u8 *msg) +static int ice_vc_ena_qs_v2_msg(struct ice_vf *vf, u8 *msg) { + struct virtchnl_del_ena_dis_queues *ena_qs_msg = (struct virtchnl_del_ena_dis_queues *)msg; + struct virtchnl_queue_chunks *chunks = &ena_qs_msg->chunks; enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct virtchnl_queue_select *vqs = - (struct virtchnl_queue_select *)msg; - struct ice_pf *pf = vf->pf; - struct ice_vsi *vsi; - unsigned long q_map; - u16 vf_q_id; + int i; - if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states) && - !test_bit(ICE_VF_STATE_QS_ENA, vf->vf_states)) { + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - if (!ice_vc_isvalid_vsi_id(vf, vqs->vsi_id)) { + if (!ice_vc_isvalid_vsi_id(vf, ena_qs_msg->vport_id)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - if (!vqs->rx_queues && !vqs->tx_queues) { + if (!ice_vc_validate_qs_v2_msg(vf, ena_qs_msg)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - if (vqs->rx_queues > ICE_MAX_BASE_QS_PER_VF || - vqs->tx_queues > ICE_MAX_BASE_QS_PER_VF) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto 
error_param; - } + for (i = 0; i < chunks->num_chunks; i++) { + struct virtchnl_queue_chunk *chunk = &chunks->chunks[i]; - vsi = pf->vsi[vf->lan_vsi_idx]; - if (!vsi) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } + if (chunk->type == VIRTCHNL_QUEUE_TYPE_RX && ice_vc_ena_rxq_chunk(vf, chunk)) + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + else if (chunk->type == VIRTCHNL_QUEUE_TYPE_TX && ice_vc_ena_txq_chunk(vf, chunk)) + v_ret = VIRTCHNL_STATUS_ERR_PARAM; - if (vqs->tx_queues) { - q_map = vqs->tx_queues; + if (v_ret != VIRTCHNL_STATUS_SUCCESS) + goto error_param; + } - for_each_set_bit(vf_q_id, &q_map, ICE_MAX_BASE_QS_PER_VF) { - struct ice_ring *ring = vsi->tx_rings[vf_q_id]; - struct ice_txq_meta txq_meta = { 0 }; + set_bit(ICE_VF_STATE_QS_ENA, vf->vf_states); - if (!ice_vc_isvalid_q_id(vf, vqs->vsi_id, vf_q_id)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } +error_param: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_QUEUES_V2, v_ret, NULL, 0); +} - /* Skip queue if not enabled */ - if (!test_bit(vf_q_id, vf->txq_ena)) - continue; +static int ice_vc_dis_rxq_chunk(struct ice_vf *vf, struct virtchnl_queue_chunk *chunk) +{ + struct ice_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + u16 q_id; - ice_fill_txq_meta(vsi, ring, &txq_meta); + if (!vsi) + return -EINVAL; - if (ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, vf->vf_id, - ring, &txq_meta)) { - dev_err(&vsi->back->pdev->dev, - "Failed to stop Tx ring %d on VSI %d\n", - vf_q_id, vsi->vsi_num); - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } + ice_for_each_q_in_chunk(chunk, q_id) { + int err; - /* Clear enabled queues flag */ - clear_bit(vf_q_id, vf->txq_ena); - vf->num_qs_ena--; - } + err = ice_vf_vsi_dis_single_rxq(vf, vsi, q_id, q_id); + if (err) + return err; } - if (vqs->rx_queues) { - q_map = vqs->rx_queues; + return 0; +} - for_each_set_bit(vf_q_id, &q_map, ICE_MAX_BASE_QS_PER_VF) { - if (!ice_vc_isvalid_q_id(vf, vqs->vsi_id, vf_q_id)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } +static int ice_vc_dis_txq_chunk(struct ice_vf *vf, struct virtchnl_queue_chunk *chunk) +{ + struct ice_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + u16 q_id; - /* Skip queue if not enabled */ - if (!test_bit(vf_q_id, vf->rxq_ena)) - continue; + if (!vsi) + return -EINVAL; - if (ice_vsi_ctrl_rx_ring(vsi, false, vf_q_id)) { - dev_err(&vsi->back->pdev->dev, - "Failed to stop Rx ring %d on VSI %d\n", - vf_q_id, vsi->vsi_num); - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } + ice_for_each_q_in_chunk(chunk, q_id) { + int err; - /* Clear enabled queues flag */ - clear_bit(vf_q_id, vf->rxq_ena); - vf->num_qs_ena--; - } + err = ice_vf_vsi_dis_single_txq(vf, vsi, q_id, q_id); + if (err) + return err; } - /* Clear enabled queues flag */ - if (v_ret == VIRTCHNL_STATUS_SUCCESS && !vf->num_qs_ena) - clear_bit(ICE_VF_STATE_QS_ENA, vf->vf_states); - -error_param: - /* send the response to the VF */ - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_QUEUES, v_ret, - NULL, 0); + return 0; } /** - * ice_vc_cfg_irq_map_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * - * called from the VF to configure the IRQ to queue map + * ice_vc_dis_qs_v2_msg - message handling for VIRTCHNL_OP_DISABLE_QUEUES_V2 + * @vf: source of the request + * @msg: message to handle */ -static int ice_vc_cfg_irq_map_msg(struct ice_vf *vf, u8 *msg) +static int ice_vc_dis_qs_v2_msg(struct ice_vf *vf, u8 *msg) { + struct virtchnl_del_ena_dis_queues *dis_qs_msg = (struct virtchnl_del_ena_dis_queues *)msg; + 
struct virtchnl_queue_chunks *chunks = &dis_qs_msg->chunks; enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct virtchnl_irq_map_info *irqmap_info; - u16 vsi_id, vsi_q_id, vector_id; - struct virtchnl_vector_map *map; - struct ice_pf *pf = vf->pf; - u16 num_q_vectors_mapped; - struct ice_vsi *vsi; - unsigned long qmap; int i; - irqmap_info = (struct virtchnl_irq_map_info *)msg; - num_q_vectors_mapped = irqmap_info->num_vectors; - - /* Check to make sure number of VF vectors mapped is not greater than - * number of VF vectors originally allocated, and check that - * there is actually at least a single VF queue vector mapped - */ - if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states) || - pf->num_vf_msix < num_q_vectors_mapped || - !irqmap_info->num_vectors) { + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - vsi = pf->vsi[vf->lan_vsi_idx]; - if (!vsi) { + if (!ice_vc_isvalid_vsi_id(vf, dis_qs_msg->vport_id)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - for (i = 0; i < num_q_vectors_mapped; i++) { - struct ice_q_vector *q_vector; + if (!ice_vc_validate_qs_v2_msg(vf, dis_qs_msg)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } - map = &irqmap_info->vecmap[i]; + for (i = 0; i < chunks->num_chunks; i++) { + struct virtchnl_queue_chunk *chunk = &chunks->chunks[i]; - vector_id = map->vector_id; - vsi_id = map->vsi_id; - /* validate msg params */ - if (!(vector_id < pf->hw.func_caps.common_cap - .num_msix_vectors) || !ice_vc_isvalid_vsi_id(vf, vsi_id) || - (!vector_id && (map->rxq_map || map->txq_map))) { + if (chunk->type == VIRTCHNL_QUEUE_TYPE_RX && ice_vc_dis_rxq_chunk(vf, chunk)) + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + else if (chunk->type == VIRTCHNL_QUEUE_TYPE_TX && ice_vc_dis_txq_chunk(vf, chunk)) v_ret = VIRTCHNL_STATUS_ERR_PARAM; + + if (v_ret != VIRTCHNL_STATUS_SUCCESS) goto error_param; - } + } - /* No need to map VF miscellaneous or rogue vector */ - if (!vector_id) - continue; + if (ice_vf_has_no_qs_ena(vf)) + clear_bit(ICE_VF_STATE_QS_ENA, vf->vf_states); - /* Subtract non queue vector from vector_id passed by VF - * to get actual number of VSI queue vector array index - */ - q_vector = vsi->q_vectors[vector_id - ICE_NONQ_VECS_VF]; - if (!q_vector) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } +error_param: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_QUEUES_V2, v_ret, NULL, 0); +} - /* lookout for the invalid queue index */ - qmap = map->rxq_map; - q_vector->num_ring_rx = 0; - for_each_set_bit(vsi_q_id, &qmap, ICE_MAX_BASE_QS_PER_VF) { - if (!ice_vc_isvalid_q_id(vf, vsi_id, vsi_q_id)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } - q_vector->num_ring_rx++; - q_vector->rx.itr_idx = map->rxitr_idx; - vsi->rx_rings[vsi_q_id]->q_vector = q_vector; - ice_cfg_rxq_interrupt(vsi, vsi_q_id, vector_id, - q_vector->rx.itr_idx); - } +/** + * ice_vc_validate_qv_maps - validate parameters sent in the qs_msg structure + * @vf: VF the message was received from + * @qv_maps: contents of the message from the VF + * + * Used to validate VIRTCHNL_OP_MAP_VECTOR messages. This should always be called before attempting + * map interrupts to queues. If all checks succeed, then return success indicating to the caller + * that the qv_maps are valid. Otherwise return false, indicating to the caller that the qv_maps + * are invalid. 
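 *
 * Example of a single map that passes these checks (values illustrative
 * only): queue IDs must stay below vf->num_vf_qs and vector IDs below
 * vsi->num_q_vectors + ICE_NONQ_VECS_VF, because vector 0 is the VF's
 * non-queue (miscellaneous) vector and queue vectors start after it:
 *
 *        struct virtchnl_queue_vector_maps *maps;
 *
 *        maps->vport_id = vf_lan_vsi_num;
 *        maps->num_qv_maps = 1;
 *        maps->qv_maps[0] = (struct virtchnl_queue_vector) {
 *                .queue_type = VIRTCHNL_QUEUE_TYPE_RX,
 *                .queue_id = 0,
 *                .vector_id = 1,
 *                .itr_idx = 0,
 *        };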
+ */ +static bool ice_vc_validate_qv_maps(struct ice_vf *vf, struct virtchnl_queue_vector_maps *qv_maps) +{ + struct ice_vsi *vsi; + int i; + + vsi = vf->pf->vsi[vf->lan_vsi_idx]; + if (!vsi) + return false; + + if (!qv_maps->num_qv_maps) + return false; - qmap = map->txq_map; - q_vector->num_ring_tx = 0; - for_each_set_bit(vsi_q_id, &qmap, ICE_MAX_BASE_QS_PER_VF) { - if (!ice_vc_isvalid_q_id(vf, vsi_id, vsi_q_id)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } - q_vector->num_ring_tx++; - q_vector->tx.itr_idx = map->txitr_idx; - vsi->tx_rings[vsi_q_id]->q_vector = q_vector; - ice_cfg_txq_interrupt(vsi, vsi_q_id, vector_id, - q_vector->tx.itr_idx); - } + for (i = 0; i < qv_maps->num_qv_maps; i++) { + if (!ice_vc_supported_queue_type(qv_maps->qv_maps[i].queue_type)) + return false; + + if (qv_maps->qv_maps[i].queue_id >= vf->num_vf_qs) + return false; + + if (qv_maps->qv_maps[i].vector_id >= (vsi->num_q_vectors + ICE_NONQ_VECS_VF)) + return false; } -error_param: - /* send the response to the VF */ - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_IRQ_MAP, v_ret, - NULL, 0); + return true; } /** - * ice_vc_cfg_qs_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * - * called from the VF to configure the Rx/Tx queues + * ice_vc_map_q_vector_msg - message handling for VIRTCHNL_OP_MAP_QUEUE_VECTOR + * @vf: source of the request + * @msg: message to handle */ -static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) +static int ice_vc_map_q_vector_msg(struct ice_vf *vf, u8 *msg) { + struct virtchnl_queue_vector_maps *qv_maps = (struct virtchnl_queue_vector_maps *)msg; enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct virtchnl_vsi_queue_config_info *qci = - (struct virtchnl_vsi_queue_config_info *)msg; - struct virtchnl_queue_pair_info *qpi; - u16 num_rxq = 0, num_txq = 0; - struct ice_pf *pf = vf->pf; struct ice_vsi *vsi; int i; @@ -2242,655 +8727,1156 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) goto error_param; } - if (!ice_vc_isvalid_vsi_id(vf, qci->vsi_id)) { + if (!ice_vc_isvalid_vsi_id(vf, qv_maps->vport_id)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - vsi = pf->vsi[vf->lan_vsi_idx]; - if (!vsi) { + if (!ice_vc_validate_qv_maps(vf, qv_maps)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - if (qci->num_queue_pairs > ICE_MAX_BASE_QS_PER_VF || - qci->num_queue_pairs > min_t(u16, vsi->alloc_txq, vsi->alloc_rxq)) { - dev_err(&pf->pdev->dev, - "VF-%d requesting more than supported number of queues: %d\n", - vf->vf_id, min_t(u16, vsi->alloc_txq, vsi->alloc_rxq)); + vsi = vf->pf->vsi[vf->lan_vsi_idx]; + if (!vsi) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - for (i = 0; i < qci->num_queue_pairs; i++) { - qpi = &qci->qpair[i]; - if (qpi->txq.vsi_id != qci->vsi_id || - qpi->rxq.vsi_id != qci->vsi_id || - qpi->rxq.queue_id != qpi->txq.queue_id || - qpi->txq.headwb_enabled || - !ice_vc_isvalid_ring_len(qpi->txq.ring_len) || - !ice_vc_isvalid_ring_len(qpi->rxq.ring_len) || - !ice_vc_isvalid_q_id(vf, qci->vsi_id, qpi->txq.queue_id)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } - /* copy Tx queue info from VF into VSI */ - if (qpi->txq.ring_len > 0) { - num_txq++; - vsi->tx_rings[i]->dma = qpi->txq.dma_ring_addr; - vsi->tx_rings[i]->count = qpi->txq.ring_len; - } + for (i = 0; i < qv_maps->num_qv_maps; i++) { + struct virtchnl_queue_vector *qv_map = &qv_maps->qv_maps[i]; - /* copy Rx queue info from VF into VSI */ - if (qpi->rxq.ring_len > 0) { - num_rxq++; - 
vsi->rx_rings[i]->dma = qpi->rxq.dma_ring_addr; - vsi->rx_rings[i]->count = qpi->rxq.ring_len; + if (qv_map->queue_type == VIRTCHNL_QUEUE_TYPE_RX) + ice_cfg_rxq_interrupt(vsi, qv_map->queue_id, qv_map->vector_id, + qv_map->itr_idx); + else if (qv_map->queue_type == VIRTCHNL_QUEUE_TYPE_TX) + ice_cfg_txq_interrupt(vsi, qv_map->queue_id, qv_map->vector_id, + qv_map->itr_idx); + } - if (qpi->rxq.databuffer_size != 0 && - (qpi->rxq.databuffer_size > ((16 * 1024) - 128) || - qpi->rxq.databuffer_size < 1024)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } - vsi->rx_buf_len = qpi->rxq.databuffer_size; - vsi->rx_rings[i]->rx_buf_len = vsi->rx_buf_len; - if (qpi->rxq.max_pkt_size >= (16 * 1024) || - qpi->rxq.max_pkt_size < 64) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } - } +error_param: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_MAP_QUEUE_VECTOR, v_ret, NULL, 0); +} + +static u16 ice_vc_get_max_vlan_fltrs(struct ice_vf *vf) +{ + if (vf->trusted) + return VLAN_N_VID; + else + return ICE_MAX_VLAN_PER_VF; +} + +/** + * ice_vf_outer_vlan_not_allowed - check outer VLAN can be used when the device is in DVM + * @vf: VF that being checked for + */ +static bool ice_vf_outer_vlan_not_allowed(struct ice_vf *vf) +{ + if (ice_vf_is_port_vlan_ena(vf)) + return true; + + if (vf->dcf_vlan_info.outer_port_vlan.vid || + vf->dcf_vlan_info.outer_stripping_ena) + return true; + + return false; +} + +/** + * ice_vc_set_dvm_caps - set VLAN capabilities when the device is in DVM + * @vf: VF that capabilities are being set for + * @caps: VLAN capabilities to populate + * + * Determine VLAN capabilities support based on whether a port VLAN is + * configured. If a port VLAN is configured then the VF should use the inner + * filtering/offload capabilities since the port VLAN is using the outer VLAN + * capabilies. 
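 *
 * A VF driver is expected to check these bits before issuing VLAN_V2
 * requests, since ice_vc_valid_vlan_setting_msg() rejects anything that was
 * not negotiated. A sketch of such a check on the VF side, where caps is
 * the VF's copy of the virtchnl_vlan_caps returned by the PF:
 *
 *        u32 outer = caps->offloads.stripping_support.outer;
 *        bool can_strip_88a8 = (outer & VIRTCHNL_VLAN_TOGGLE) &&
 *                              (outer & VIRTCHNL_VLAN_ETHERTYPE_88A8);
 *
 * and only send VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 with
 * VIRTCHNL_VLAN_ETHERTYPE_88A8 in outer_ethertype_setting when that check
 * passes.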
+ */ +static void +ice_vc_set_dvm_caps(struct ice_vf *vf, struct virtchnl_vlan_caps *caps) +{ + struct virtchnl_vlan_supported_caps *supported_caps; - vsi->max_frame = qpi->rxq.max_pkt_size; + if (ice_vf_outer_vlan_not_allowed(vf)) { + /* until support for inner VLAN filtering is added when a port + * VLAN is configured, only support software offloaded inner + * VLANs when a port VLAN is confgured in DVM + */ + supported_caps = &caps->filtering.filtering_support; + supported_caps->inner = VIRTCHNL_VLAN_UNSUPPORTED; + + supported_caps = &caps->offloads.stripping_support; + supported_caps->inner = VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_TOGGLE | + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1; + supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED; + + supported_caps = &caps->offloads.insertion_support; + supported_caps->inner = VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_TOGGLE | + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1; + supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED; + + caps->offloads.ethertype_init = VIRTCHNL_VLAN_ETHERTYPE_8100; + caps->offloads.ethertype_match = + VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION; + } else { + supported_caps = &caps->filtering.filtering_support; + supported_caps->inner = VIRTCHNL_VLAN_UNSUPPORTED; + supported_caps->outer = VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_ETHERTYPE_88A8 | + VIRTCHNL_VLAN_ETHERTYPE_9100 | + VIRTCHNL_VLAN_ETHERTYPE_AND; + caps->filtering.ethertype_init = VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_ETHERTYPE_88A8 | + VIRTCHNL_VLAN_ETHERTYPE_9100; + + supported_caps = &caps->offloads.stripping_support; + supported_caps->inner = VIRTCHNL_VLAN_TOGGLE | + VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1; + supported_caps->outer = VIRTCHNL_VLAN_TOGGLE | + VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_ETHERTYPE_88A8 | + VIRTCHNL_VLAN_ETHERTYPE_9100 | + VIRTCHNL_VLAN_ETHERTYPE_XOR | + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2; + + supported_caps = &caps->offloads.insertion_support; + supported_caps->inner = VIRTCHNL_VLAN_TOGGLE | + VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1; + supported_caps->outer = VIRTCHNL_VLAN_TOGGLE | + VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_ETHERTYPE_88A8 | + VIRTCHNL_VLAN_ETHERTYPE_9100 | + VIRTCHNL_VLAN_ETHERTYPE_XOR | + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2; + + caps->offloads.ethertype_init = VIRTCHNL_VLAN_ETHERTYPE_8100; + + caps->offloads.ethertype_match = + VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION; } - /* VF can request to configure less than allocated queues - * or default allocated queues. So update the VSI with new number - */ - vsi->num_txq = num_txq; - vsi->num_rxq = num_rxq; - /* All queues of VF VSI are in TC 0 */ - vsi->tc_cfg.tc_info[0].qcount_tx = num_txq; - vsi->tc_cfg.tc_info[0].qcount_rx = num_rxq; + caps->filtering.max_filters = ice_vc_get_max_vlan_fltrs(vf); +} - if (ice_vsi_cfg_lan_txqs(vsi) || ice_vsi_cfg_rxqs(vsi)) - v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; +/** + * ice_vc_set_svm_caps - set VLAN capabilities when the device is in SVM + * @vf: VF that capabilities are being set for + * @caps: VLAN capabilities to populate + * + * Determine VLAN capabilities support based on whether a port VLAN is + * configured. If a port VLAN is configured then the VF does not have any VLAN + * filtering or offload capabilities since the port VLAN is using the inner VLAN + * capabilities in single VLAN mode (SVM). Otherwise allow the VF to use inner + * VLAN fitlering and offload capabilities. 
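 *
 * On the VF side the port VLAN case is easy to recognize from the reply
 * alone (an illustrative check, not a required implementation):
 *
 *        bool vlan_v2_allowed = caps->filtering.max_filters != 0;
 *
 * i.e. a max_filters of 0 tells the VF not to issue
 * VIRTCHNL_OP_ADD_VLAN_V2 at all, since any added filter would exceed the
 * limit and be rejected.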
+ */ +static void +ice_vc_set_svm_caps(struct ice_vf *vf, struct virtchnl_vlan_caps *caps) +{ + struct virtchnl_vlan_supported_caps *supported_caps; -error_param: - /* send the response to the VF */ - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_VSI_QUEUES, v_ret, - NULL, 0); + if (ice_vf_is_port_vlan_ena(vf)) { + supported_caps = &caps->filtering.filtering_support; + supported_caps->inner = VIRTCHNL_VLAN_UNSUPPORTED; + supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED; + + supported_caps = &caps->offloads.stripping_support; + supported_caps->inner = VIRTCHNL_VLAN_UNSUPPORTED; + supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED; + + supported_caps = &caps->offloads.insertion_support; + supported_caps->inner = VIRTCHNL_VLAN_UNSUPPORTED; + supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED; + + caps->offloads.ethertype_init = VIRTCHNL_VLAN_UNSUPPORTED; + caps->offloads.ethertype_match = VIRTCHNL_VLAN_UNSUPPORTED; + caps->filtering.max_filters = 0; + } else { + supported_caps = &caps->filtering.filtering_support; + supported_caps->inner = VIRTCHNL_VLAN_ETHERTYPE_8100; + supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED; + caps->filtering.ethertype_init = VIRTCHNL_VLAN_ETHERTYPE_8100; + + supported_caps = &caps->offloads.stripping_support; + supported_caps->inner = VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_TOGGLE | + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1; + supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED; + + supported_caps = &caps->offloads.insertion_support; + supported_caps->inner = VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_TOGGLE | + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1; + supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED; + + caps->offloads.ethertype_init = VIRTCHNL_VLAN_ETHERTYPE_8100; + caps->offloads.ethertype_match = + VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION; + caps->filtering.max_filters = ice_vc_get_max_vlan_fltrs(vf); + } } /** - * ice_is_vf_trusted - * @vf: pointer to the VF info + * ice_vc_get_offload_vlan_v2_caps - determine VF's VLAN capabilities + * @vf: VF to determine VLAN capabilities for + * + * This will only be called if the VF and PF successfully negotiated + * VIRTCHNL_VF_OFFLOAD_VLAN_V2. + * + * Set VLAN capabilities based on the current VLAN mode and whether a port VLAN + * is configured or not. 
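 *
 * The expected ordering on the VF side is to negotiate
 * VIRTCHNL_VF_OFFLOAD_VLAN_V2 first, then issue this opcode and cache the
 * reply before sending any other *_VLAN_V2 request, e.g. (sketch only;
 * vf_send_to_pf() is an assumed helper and vf_cap_flags is the capability
 * field of the standard virtchnl_vf_resource reply):
 *
 *        if (vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN_V2)
 *                vf_send_to_pf(VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS, NULL, 0);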
*/ -static bool ice_is_vf_trusted(struct ice_vf *vf) +static int ice_vc_get_offload_vlan_v2_caps(struct ice_vf *vf) { - return test_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_vlan_caps *caps = NULL; + int err, len = 0; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + caps = kzalloc(sizeof(*caps), GFP_KERNEL); + if (!caps) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + goto out; + } + len = sizeof(*caps); + + if (ice_is_dvm_ena(&vf->pf->hw)) + ice_vc_set_dvm_caps(vf, caps); + else + ice_vc_set_svm_caps(vf, caps); + + /* store negotiated caps to prevent invalid VF messages */ + memcpy(&vf->vlan_v2_caps, caps, sizeof(*caps)); + +out: + err = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS, + v_ret, (u8 *)caps, len); + kfree(caps); + return err; } /** - * ice_can_vf_change_mac - * @vf: pointer to the VF info + * ice_vc_validate_vlan_tpid - validate VLAN TPID + * @filtering_caps: negotiated/supported VLAN filtering capabilities + * @tpid: VLAN TPID used for validation * - * Return true if the VF is allowed to change its MAC filters, false otherwise + * Convert the VLAN TPID to a VIRTCHNL_VLAN_ETHERTYPE_* and then compare against + * the negotiated/supported filtering caps to see if the VLAN TPID is valid. */ -static bool ice_can_vf_change_mac(struct ice_vf *vf) +static bool ice_vc_validate_vlan_tpid(u16 filtering_caps, u16 tpid) { - /* If the VF MAC address has been set administratively (via the - * ndo_set_vf_mac command), then deny permission to the VF to - * add/delete unicast MAC addresses, unless the VF is trusted - */ - if (vf->pf_set_mac && !ice_is_vf_trusted(vf)) + enum virtchnl_vlan_support vlan_ethertype = VIRTCHNL_VLAN_UNSUPPORTED; + + switch (tpid) { + case ETH_P_8021Q: + vlan_ethertype = VIRTCHNL_VLAN_ETHERTYPE_8100; + break; + case ETH_P_8021AD: + vlan_ethertype = VIRTCHNL_VLAN_ETHERTYPE_88A8; + break; + case ETH_P_QINQ1: + vlan_ethertype = VIRTCHNL_VLAN_ETHERTYPE_9100; + break; + } + + if (!(filtering_caps & vlan_ethertype)) return false; return true; } /** - * ice_vc_handle_mac_addr_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * @set: true if MAC filters are being set, false otherwise + * ice_vc_is_valid_vlan - validate the virtchnl_vlan + * @vc_vlan: virtchnl_vlan to validate * - * add guest MAC address filter + * If the VLAN TCI and VLAN TPID are 0, then this filter is invalid, so return + * false. Otherwise return true. + */ +static bool ice_vc_is_valid_vlan(struct virtchnl_vlan *vc_vlan) +{ + if (!vc_vlan->tci || !vc_vlan->tpid) + return false; + + return true; +} + +/** + * ice_vc_validate_vlan_filter_list - validate the filter list from the VF + * @vfc: negotiated/supported VLAN filtering capabilities + * @vfl: VLAN filter list from VF to validate + * + * Validate all of the filters in the VLAN filter list from the VF. If any of + * the checks fail then return false. Otherwise return true. 
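 *
 * As a concrete example (values illustrative), a list with a single
 * outer-VLAN filter for VID 100, priority 3, 802.1Q TPID would be encoded
 * as:
 *
 *        vfl->num_elements = 1;
 *        vfl->filters[0].outer = (struct virtchnl_vlan) {
 *                .tci = (3 << VLAN_PRIO_SHIFT) | 100,
 *                .tpid = ETH_P_8021Q,
 *        };
 *
 * and passes only when the negotiated filtering_support.outer advertises
 * VIRTCHNL_VLAN_ETHERTYPE_8100 and, because the PCP bits are non-zero,
 * VIRTCHNL_VLAN_PRIO as well.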
+ */
+static bool
+ice_vc_validate_vlan_filter_list(struct virtchnl_vlan_filtering_caps *vfc,
+                                 struct virtchnl_vlan_filter_list_v2 *vfl)
+{
+        u16 i;
+
+        if (!vfl->num_elements)
+                return false;
+
+        for (i = 0; i < vfl->num_elements; i++) {
+                struct virtchnl_vlan_supported_caps *filtering_support =
+                        &vfc->filtering_support;
+                struct virtchnl_vlan_filter *vlan_fltr = &vfl->filters[i];
+                struct virtchnl_vlan *outer = &vlan_fltr->outer;
+                struct virtchnl_vlan *inner = &vlan_fltr->inner;
+
+                if ((ice_vc_is_valid_vlan(outer) &&
+                     filtering_support->outer == VIRTCHNL_VLAN_UNSUPPORTED) ||
+                    (ice_vc_is_valid_vlan(inner) &&
+                     filtering_support->inner == VIRTCHNL_VLAN_UNSUPPORTED))
+                        return false;
+
+                if ((outer->tci_mask &&
+                     !(filtering_support->outer & VIRTCHNL_VLAN_FILTER_MASK)) ||
+                    (inner->tci_mask &&
+                     !(filtering_support->inner & VIRTCHNL_VLAN_FILTER_MASK)))
+                        return false;
+
+                if (((outer->tci & VLAN_PRIO_MASK) &&
+                     !(filtering_support->outer & VIRTCHNL_VLAN_PRIO)) ||
+                    ((inner->tci & VLAN_PRIO_MASK) &&
+                     !(filtering_support->inner & VIRTCHNL_VLAN_PRIO)))
+                        return false;
+
+                if ((ice_vc_is_valid_vlan(outer) &&
+                     !ice_vc_validate_vlan_tpid(filtering_support->outer, outer->tpid)) ||
+                    (ice_vc_is_valid_vlan(inner) &&
+                     !ice_vc_validate_vlan_tpid(filtering_support->inner, inner->tpid)))
+                        return false;
+        }
+
+        return true;
+}
+
+/**
+ * ice_vc_to_vlan - transform from struct virtchnl_vlan to struct ice_vlan
+ * @vc_vlan: struct virtchnl_vlan to transform
+ */
+static struct ice_vlan ice_vc_to_vlan(struct virtchnl_vlan *vc_vlan)
+{
+        struct ice_vlan vlan = { 0 };
+
+        vlan.prio = (vc_vlan->tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+        vlan.vid = vc_vlan->tci & VLAN_VID_MASK;
+        vlan.tpid = vc_vlan->tpid;
+
+        return vlan;
+}
+
+/**
+ * ice_vc_vlan_action - action to perform on the virtchnl_vlan
+ * @vsi: VF's VSI used to perform the action
+ * @vlan_action: function to perform the action with (i.e.
add/del) + * @vlan: VLAN filter to perform the action with */ static int -ice_vc_handle_mac_addr_msg(struct ice_vf *vf, u8 *msg, bool set) +ice_vc_vlan_action(struct ice_vsi *vsi, + int (*vlan_action)(struct ice_vsi *, struct ice_vlan *), + struct ice_vlan *vlan) +{ + int err; + + err = vlan_action(vsi, vlan); + if (err) + return err; + + return 0; +} + +/** + * ice_vc_del_vlans - delete VLAN(s) from the virtchnl filter list + * @vf: VF used to delete the VLAN(s) + * @vsi: VF's VSI used to delete the VLAN(s) + * @vfl: virthchnl filter list used to delete the filters + */ +static int +ice_vc_del_vlans(struct ice_vf *vf, struct ice_vsi *vsi, + struct virtchnl_vlan_filter_list_v2 *vfl) +{ + bool vlan_promisc = ice_is_vlan_promisc_allowed(vf); + int err; + u16 i; + + for (i = 0; i < vfl->num_elements; i++) { + struct virtchnl_vlan_filter *vlan_fltr = &vfl->filters[i]; + struct virtchnl_vlan *vc_vlan; + + vc_vlan = &vlan_fltr->outer; + if (ice_vc_is_valid_vlan(vc_vlan)) { + struct ice_vlan vlan = ice_vc_to_vlan(vc_vlan); + + err = ice_vc_vlan_action(vsi, + vsi->outer_vlan_ops.del_vlan, + &vlan); + if (err) + return err; + + if (vlan_promisc) + ice_vf_dis_vlan_promisc(vsi, &vlan); + } + + vc_vlan = &vlan_fltr->inner; + if (ice_vc_is_valid_vlan(vc_vlan)) { + struct ice_vlan vlan = ice_vc_to_vlan(vc_vlan); + + err = ice_vc_vlan_action(vsi, + vsi->inner_vlan_ops.del_vlan, + &vlan); + if (err) + return err; + + /* no support for VLAN promiscuous on inner VLAN unless + * we are in Single VLAN Mode (SVM) + */ + if (!ice_is_dvm_ena(&vsi->back->hw) && vlan_promisc) + ice_vf_dis_vlan_promisc(vsi, &vlan); + } + } + + return 0; +} + +/** + * ice_vc_remove_vlan_v2_msg - virtchnl handler for VIRTCHNL_OP_DEL_VLAN_V2 + * @vf: VF the message was received from + * @msg: message received from the VF + */ +static int ice_vc_remove_vlan_v2_msg(struct ice_vf *vf, u8 *msg) { + struct virtchnl_vlan_filter_list_v2 *vfl = + (struct virtchnl_vlan_filter_list_v2 *)msg; enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct virtchnl_ether_addr_list *al = - (struct virtchnl_ether_addr_list *)msg; - struct ice_pf *pf = vf->pf; - enum virtchnl_ops vc_op; - enum ice_status status; struct ice_vsi *vsi; - int mac_count = 0; - int i; - - if (set) - vc_op = VIRTCHNL_OP_ADD_ETH_ADDR; - else - vc_op = VIRTCHNL_OP_DEL_ETH_ADDR; - if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states) || - !ice_vc_isvalid_vsi_id(vf, al->vsi_id)) { + if (!ice_vc_validate_vlan_filter_list(&vf->vlan_v2_caps.filtering, + vfl)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto handle_mac_exit; + goto out; } - if (set && !ice_is_vf_trusted(vf) && - (vf->num_mac + al->num_elements) > ICE_MAX_MACADDR_PER_VF) { - dev_err(&pf->pdev->dev, - "Can't add more MAC addresses, because VF-%d is not trusted, switch the VF to trusted mode in order to add more functionalities\n", - vf->vf_id); - /* There is no need to let VF know about not being trusted - * to add more MAC addr, so we can just return success message. 
- */ + if (!ice_vc_isvalid_vsi_id(vf, vfl->vport_id)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto handle_mac_exit; + goto out; } - vsi = pf->vsi[vf->lan_vsi_idx]; + vsi = ice_get_vf_vsi(vf); if (!vsi) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto handle_mac_exit; + goto out; } - for (i = 0; i < al->num_elements; i++) { - u8 *maddr = al->list[i].addr; + if (ice_vc_del_vlans(vf, vsi, vfl)) + v_ret = VIRTCHNL_STATUS_ERR_PARAM; - if (ether_addr_equal(maddr, vf->dflt_lan_addr.addr) || - is_broadcast_ether_addr(maddr)) { - if (set) { - /* VF is trying to add filters that the PF - * already added. Just continue. - */ - dev_info(&pf->pdev->dev, - "MAC %pM already set for VF %d\n", - maddr, vf->vf_id); - continue; - } else { - /* VF can't remove dflt_lan_addr/bcast MAC */ - dev_err(&pf->pdev->dev, - "VF can't remove default MAC address or MAC %pM programmed by PF for VF %d\n", - maddr, vf->vf_id); - continue; - } - } +out: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DEL_VLAN_V2, v_ret, NULL, + 0); +} - /* check for the invalid cases and bail if necessary */ - if (is_zero_ether_addr(maddr)) { - dev_err(&pf->pdev->dev, - "invalid MAC %pM provided for VF %d\n", - maddr, vf->vf_id); - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto handle_mac_exit; - } +/** + * ice_vc_add_vlans - add VLAN(s) from the virtchnl filter list + * @vf: VF used to add the VLAN(s) + * @vsi: VF's VSI used to add the VLAN(s) + * @vfl: virthchnl filter list used to add the filters + */ +static int +ice_vc_add_vlans(struct ice_vf *vf, struct ice_vsi *vsi, + struct virtchnl_vlan_filter_list_v2 *vfl) +{ + bool vlan_promisc = ice_is_vlan_promisc_allowed(vf); + int err; + u16 i; - if (is_unicast_ether_addr(maddr) && - !ice_can_vf_change_mac(vf)) { - dev_err(&pf->pdev->dev, - "can't change unicast MAC for untrusted VF %d\n", - vf->vf_id); - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto handle_mac_exit; - } + for (i = 0; i < vfl->num_elements; i++) { + struct virtchnl_vlan_filter *vlan_fltr = &vfl->filters[i]; + struct virtchnl_vlan *vc_vlan; - /* program the updated filter list */ - status = ice_vsi_cfg_mac_fltr(vsi, maddr, set); - if (status == ICE_ERR_DOES_NOT_EXIST || - status == ICE_ERR_ALREADY_EXISTS) { - dev_info(&pf->pdev->dev, - "can't %s MAC filters %pM for VF %d, error %d\n", - set ? "add" : "remove", maddr, vf->vf_id, - status); - } else if (status) { - dev_err(&pf->pdev->dev, - "can't %s MAC filters for VF %d, error %d\n", - set ? 
"add" : "remove", vf->vf_id, status); - v_ret = ice_err_to_virt_err(status); - goto handle_mac_exit; + vc_vlan = &vlan_fltr->outer; + if (ice_vc_is_valid_vlan(vc_vlan)) { + struct ice_vlan vlan = ice_vc_to_vlan(vc_vlan); + + err = ice_vc_vlan_action(vsi, + vsi->outer_vlan_ops.add_vlan, + &vlan); + if (err) + return err; + + if (vlan_promisc) { + err = ice_vf_ena_vlan_promisc(vsi, &vlan); + if (err) + return err; + } } - mac_count++; - } + vc_vlan = &vlan_fltr->inner; + if (ice_vc_is_valid_vlan(vc_vlan)) { + struct ice_vlan vlan = ice_vc_to_vlan(vc_vlan); - /* Track number of MAC filters programmed for the VF VSI */ - if (set) - vf->num_mac += mac_count; - else - vf->num_mac -= mac_count; + err = ice_vc_vlan_action(vsi, + vsi->inner_vlan_ops.add_vlan, + &vlan); + if (err) + return err; -handle_mac_exit: - /* send the response to the VF */ - return ice_vc_send_msg_to_vf(vf, vc_op, v_ret, NULL, 0); -} + /* no support for VLAN promiscuous on inner VLAN unless + * we are in Single VLAN Mode (SVM) + */ + if (!ice_is_dvm_ena(&vsi->back->hw) && vlan_promisc) { + err = ice_vf_ena_vlan_promisc(vsi, &vlan); + if (err) + return err; + } + } + } -/** - * ice_vc_add_mac_addr_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * - * add guest MAC address filter - */ -static int ice_vc_add_mac_addr_msg(struct ice_vf *vf, u8 *msg) -{ - return ice_vc_handle_mac_addr_msg(vf, msg, true); + return 0; } /** - * ice_vc_del_mac_addr_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer + * ice_vc_validate_add_vlan_filter_list - validate add filter list from the VF + * @vsi: VF VSI used to get number of existing VLAN filters + * @vfc: negotiated/supported VLAN filtering capabilities + * @vfl: VLAN filter list from VF to validate * - * remove guest MAC address filter + * Validate all of the filters in the VLAN filter list from the VF during the + * VIRTCHNL_OP_ADD_VLAN_V2 opcode. If any of the checks fail then return false. + * Otherwise return true. */ -static int ice_vc_del_mac_addr_msg(struct ice_vf *vf, u8 *msg) +static bool +ice_vc_validate_add_vlan_filter_list(struct ice_vsi *vsi, + struct virtchnl_vlan_filtering_caps *vfc, + struct virtchnl_vlan_filter_list_v2 *vfl) { - return ice_vc_handle_mac_addr_msg(vf, msg, false); + u16 num_requested_filters = vsi->num_vlan + vfl->num_elements; + + if (num_requested_filters > vfc->max_filters) + return false; + + return ice_vc_validate_vlan_filter_list(vfc, vfl); } /** - * ice_vc_request_qs_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * - * VFs get a default number of queues but can use this message to request a - * different number. If the request is successful, PF will reset the VF and - * return 0. If unsuccessful, PF will send message informing VF of number of - * available queue pairs via virtchnl message response to VF. 
+ * ice_vc_add_vlan_v2_msg - virtchnl handler for VIRTCHNL_OP_ADD_VLAN_V2 + * @vf: VF the message was received from + * @msg: message received from the VF */ -static int ice_vc_request_qs_msg(struct ice_vf *vf, u8 *msg) +static int ice_vc_add_vlan_v2_msg(struct ice_vf *vf, u8 *msg) { enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct virtchnl_vf_res_request *vfres = - (struct virtchnl_vf_res_request *)msg; - u16 req_queues = vfres->num_queue_pairs; - struct ice_pf *pf = vf->pf; - u16 max_allowed_vf_queues; - u16 tx_rx_queue_left; - u16 cur_queues; + struct virtchnl_vlan_filter_list_v2 *vfl = + (struct virtchnl_vlan_filter_list_v2 *)msg; + struct ice_vsi *vsi; if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto out; } - cur_queues = vf->num_vf_qs; - tx_rx_queue_left = min_t(u16, ice_get_avail_txq_count(pf), - ice_get_avail_rxq_count(pf)); - max_allowed_vf_queues = tx_rx_queue_left + cur_queues; - if (!req_queues) { - dev_err(&pf->pdev->dev, - "VF %d tried to request 0 queues. Ignoring.\n", - vf->vf_id); - } else if (req_queues > ICE_MAX_BASE_QS_PER_VF) { - dev_err(&pf->pdev->dev, - "VF %d tried to request more than %d queues.\n", - vf->vf_id, ICE_MAX_BASE_QS_PER_VF); - vfres->num_queue_pairs = ICE_MAX_BASE_QS_PER_VF; - } else if (req_queues > cur_queues && - req_queues - cur_queues > tx_rx_queue_left) { - dev_warn(&pf->pdev->dev, - "VF %d requested %u more queues, but only %u left.\n", - vf->vf_id, req_queues - cur_queues, tx_rx_queue_left); - vfres->num_queue_pairs = min_t(u16, max_allowed_vf_queues, - ICE_MAX_BASE_QS_PER_VF); - } else { - /* request is successful, then reset VF */ - vf->num_req_qs = req_queues; - ice_vc_dis_vf(vf); - dev_info(&pf->pdev->dev, - "VF %d granted request of %u queues.\n", - vf->vf_id, req_queues); - return 0; + if (!ice_vc_isvalid_vsi_id(vf, vfl->vport_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; } -error_param: - /* send the response to the VF */ - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_REQUEST_QUEUES, - v_ret, (u8 *)vfres, sizeof(*vfres)); + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + if (!ice_vc_validate_add_vlan_filter_list(vsi, + &vf->vlan_v2_caps.filtering, + vfl)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + if (ice_vc_add_vlans(vf, vsi, vfl)) + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + +out: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_VLAN_V2, v_ret, NULL, + 0); } /** - * ice_set_vf_port_vlan - * @netdev: network interface device structure - * @vf_id: VF identifier - * @vlan_id: VLAN ID being set - * @qos: priority setting - * @vlan_proto: VLAN protocol + * ice_vc_valid_vlan_setting - validate VLAN setting + * @negotiated_settings: negotiated VLAN settings during VF init + * @ethertype_setting: ethertype(s) requested for the VLAN setting + */ +static bool +ice_vc_valid_vlan_setting(u32 negotiated_settings, u32 ethertype_setting) +{ + if (ethertype_setting && !(negotiated_settings & ethertype_setting)) + return false; + + /* only allow a single VIRTCHNL_VLAN_ETHERTYPE if + * VIRTHCNL_VLAN_ETHERTYPE_AND is not negotiated/supported + */ + if (!(negotiated_settings & VIRTCHNL_VLAN_ETHERTYPE_AND) && + hweight32(ethertype_setting) > 1) + return false; + + /* ability to modify the VLAN setting was not negotiated */ + if (!(negotiated_settings & VIRTCHNL_VLAN_TOGGLE)) + return false; + + return true; +} + +/** + * ice_vc_valid_vlan_setting_msg - validate the VLAN setting message + * @caps: 
negotiated VLAN settings during VF init + * @msg: message to validate * - * program VF Port VLAN ID and/or QoS + * Used to validate any VLAN virtchnl message sent as a + * virtchnl_vlan_setting structure. Validates the message against the + * negotiated/supported caps during VF driver init. */ -int -ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos, - __be16 vlan_proto) +static bool +ice_vc_valid_vlan_setting_msg(struct virtchnl_vlan_supported_caps *caps, + struct virtchnl_vlan_setting *msg) { - u16 vlanprio = vlan_id | (qos << ICE_VLAN_PRIORITY_S); - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_pf *pf = np->vsi->back; - struct ice_vsi *vsi; - struct ice_vf *vf; - int ret = 0; + if ((!msg->outer_ethertype_setting && + !msg->inner_ethertype_setting) || + (!caps->outer && !caps->inner)) + return false; - /* validate the request */ - if (vf_id >= pf->num_alloc_vfs) { - dev_err(&pf->pdev->dev, "invalid VF id: %d\n", vf_id); - return -EINVAL; - } + if (msg->outer_ethertype_setting && + !ice_vc_valid_vlan_setting(caps->outer, + msg->outer_ethertype_setting)) + return false; - if (vlan_id > ICE_MAX_VLANID || qos > 7) { - dev_err(&pf->pdev->dev, "Invalid VF Parameters\n"); + if (msg->inner_ethertype_setting && + !ice_vc_valid_vlan_setting(caps->inner, + msg->inner_ethertype_setting)) + return false; + + return true; +} + +/** + * ice_vc_get_tpid - transform from VIRTCHNL_VLAN_ETHERTYPE_* to VLAN TPID + * @ethertype_setting: VIRTCHNL_VLAN_ETHERTYPE_* used to get VLAN TPID + * @tpid: VLAN TPID to populate + */ +static int ice_vc_get_tpid(u32 ethertype_setting, u16 *tpid) +{ + switch (ethertype_setting) { + case VIRTCHNL_VLAN_ETHERTYPE_8100: + *tpid = ETH_P_8021Q; + break; + case VIRTCHNL_VLAN_ETHERTYPE_88A8: + *tpid = ETH_P_8021AD; + break; + case VIRTCHNL_VLAN_ETHERTYPE_9100: + *tpid = ETH_P_QINQ1; + break; + default: + *tpid = 0; return -EINVAL; } - if (vlan_proto != htons(ETH_P_8021Q)) { - dev_err(&pf->pdev->dev, "VF VLAN protocol is not supported\n"); - return -EPROTONOSUPPORT; - } + return 0; +} - vf = &pf->vf[vf_id]; - vsi = pf->vsi[vf->lan_vsi_idx]; - if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states)) { - dev_err(&pf->pdev->dev, "VF %d in reset. 
Try again.\n", vf_id); - return -EBUSY; - } +/** + * ice_vc_ena_vlan_offload - enable VLAN offload based on the ethertype_setting + * @vsi: VF's VSI used to enable the VLAN offload + * @ena_offload: function used to enable the VLAN offload + * @ethertype_setting: VIRTCHNL_VLAN_ETHERTYPE_* to enable offloads for + */ +static int +ice_vc_ena_vlan_offload(struct ice_vsi *vsi, + int (*ena_offload)(struct ice_vsi *vsi, u16 tpid), + u32 ethertype_setting) +{ + u16 tpid; + int err; - if (le16_to_cpu(vsi->info.pvid) == vlanprio) { - /* duplicate request, so just return success */ - dev_info(&pf->pdev->dev, - "Duplicate pvid %d request\n", vlanprio); - return ret; - } + err = ice_vc_get_tpid(ethertype_setting, &tpid); + if (err) + return err; - /* If PVID, then remove all filters on the old VLAN */ - if (vsi->info.pvid) - ice_vsi_kill_vlan(vsi, (le16_to_cpu(vsi->info.pvid) & - VLAN_VID_MASK)); + err = ena_offload(vsi, tpid); + if (err) + return err; - if (vlan_id || qos) { - ret = ice_vsi_manage_pvid(vsi, vlanprio, true); - if (ret) - goto error_set_pvid; - } else { - ice_vsi_manage_pvid(vsi, 0, false); - vsi->info.pvid = 0; - } + return 0; +} - if (vlan_id) { - dev_info(&pf->pdev->dev, "Setting VLAN %d, QOS 0x%x on VF %d\n", - vlan_id, qos, vf_id); +#define ICE_L2TSEL_QRX_CONTEXT_REG_IDX 3 +#define ICE_L2TSEL_BIT_OFFSET 23 +enum ice_l2tsel { + ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND, + ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG1, +}; - /* add new VLAN filter for each MAC */ - ret = ice_vsi_add_vlan(vsi, vlan_id); - if (ret) - goto error_set_pvid; - } +/** + * ice_vsi_update_l2tsel - update l2tsel field for all Rx rings on this VSI + * @vsi: VSI used to update l2tsel on + * @l2tsel: l2tsel setting requested + * + * Use the l2tsel setting to update all of the Rx queue context bits for l2tsel. + * This will modify which descriptor field the first offloaded VLAN will be + * stripped into. + */ +static void ice_vsi_update_l2tsel(struct ice_vsi *vsi, enum ice_l2tsel l2tsel) +{ + struct ice_hw *hw = &vsi->back->hw; + u32 l2tsel_bit; + int i; - /* The Port VLAN needs to be saved across resets the same as the - * default LAN MAC address. 
- */ - vf->port_vlan_id = le16_to_cpu(vsi->info.pvid); + if (l2tsel == ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND) + l2tsel_bit = 0; + else + l2tsel_bit = BIT(ICE_L2TSEL_BIT_OFFSET); -error_set_pvid: - return ret; + for (i = 0; i < vsi->alloc_rxq; i++) { + u16 pfq = vsi->rxq_map[i]; + u32 qrx_context_offset; + u32 regval; + + qrx_context_offset = + QRX_CONTEXT(ICE_L2TSEL_QRX_CONTEXT_REG_IDX, pfq); + + regval = rd32(hw, qrx_context_offset); + regval &= ~BIT(ICE_L2TSEL_BIT_OFFSET); + regval |= l2tsel_bit; + wr32(hw, qrx_context_offset, regval); + } } /** - * ice_vc_process_vlan_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * @add_v: Add VLAN if true, otherwise delete VLAN + * ice_vc_ena_vlan_stripping_v2_msg + * @vf: VF the message was received from + * @msg: message received from the VF * - * Process virtchnl op to add or remove programmed guest VLAN ID + * virthcnl handler for VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 */ -static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v) +static int ice_vc_ena_vlan_stripping_v2_msg(struct ice_vf *vf, u8 *msg) { enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct virtchnl_vlan_filter_list *vfl = - (struct virtchnl_vlan_filter_list *)msg; - struct ice_pf *pf = vf->pf; - bool vlan_promisc = false; + struct virtchnl_vlan_supported_caps *stripping_support; + struct virtchnl_vlan_setting *strip_msg = + (struct virtchnl_vlan_setting *)msg; + u32 ethertype_setting; struct ice_vsi *vsi; - struct ice_hw *hw; - int status = 0; - u8 promisc_m; - int i; if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto out; } - if (!ice_vc_isvalid_vsi_id(vf, vfl->vsi_id)) { + if (!ice_vc_isvalid_vsi_id(vf, strip_msg->vport_id)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } - - if (add_v && !ice_is_vf_trusted(vf) && - vf->num_vlan >= ICE_MAX_VLAN_PER_VF) { - dev_info(&pf->pdev->dev, - "VF-%d is not trusted, switch the VF to trusted mode, in order to add more VLAN addresses\n", - vf->vf_id); - /* There is no need to let VF know about being not trusted, - * so we can just return success message here - */ - goto error_param; - } - - for (i = 0; i < vfl->num_elements; i++) { - if (vfl->vlan_id[i] > ICE_MAX_VLANID) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - dev_err(&pf->pdev->dev, - "invalid VF VLAN id %d\n", vfl->vlan_id[i]); - goto error_param; - } + goto out; } - hw = &pf->hw; - vsi = pf->vsi[vf->lan_vsi_idx]; + vsi = ice_get_vf_vsi(vf); if (!vsi) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto out; } - if (vsi->info.pvid) { + stripping_support = &vf->vlan_v2_caps.offloads.stripping_support; + if (!ice_vc_valid_vlan_setting_msg(stripping_support, strip_msg)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto out; } - if (ice_vsi_manage_vlan_stripping(vsi, add_v)) { - dev_err(&pf->pdev->dev, - "%sable VLAN stripping failed for VSI %i\n", - add_v ? 
"en" : "dis", vsi->vsi_num); - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + if (ice_vsi_is_rxq_crc_strip_dis(vsi)) { + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto out; } - if (test_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states) || - test_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) - vlan_promisc = true; + ethertype_setting = strip_msg->outer_ethertype_setting; + if (ethertype_setting) { + if (ice_vc_ena_vlan_offload(vsi, + vsi->outer_vlan_ops.ena_stripping, + ethertype_setting)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } else { + enum ice_l2tsel l2tsel = + ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND; + + /* PF tells the VF that the outer VLAN tag is always + * extracted to VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2 and + * inner is always extracted to + * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1. This is needed to + * support outer stripping so the first tag always ends + * up in L2TAG2_2ND and the second/inner tag, if + * enabled, is extracted in L2TAG1. + */ + ice_vsi_update_l2tsel(vsi, l2tsel); - if (add_v) { - for (i = 0; i < vfl->num_elements; i++) { - u16 vid = vfl->vlan_id[i]; + vf->vlan_strip_ena |= ICE_OUTER_VLAN_STRIP_ENA; + } + } - if (!ice_is_vf_trusted(vf) && - vf->num_vlan >= ICE_MAX_VLAN_PER_VF) { - dev_info(&pf->pdev->dev, - "VF-%d is not trusted, switch the VF to trusted mode, in order to add more VLAN addresses\n", - vf->vf_id); - /* There is no need to let VF know about being - * not trusted, so we can just return success - * message here as well. - */ - goto error_param; - } + ethertype_setting = strip_msg->inner_ethertype_setting; + if (ethertype_setting && + ice_vc_ena_vlan_offload(vsi, vsi->inner_vlan_ops.ena_stripping, + ethertype_setting)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } - if (ice_vsi_add_vlan(vsi, vid)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } + if (ethertype_setting) + vf->vlan_strip_ena |= ICE_INNER_VLAN_STRIP_ENA; - vf->num_vlan++; - /* Enable VLAN pruning when VLAN is added */ - if (!vlan_promisc) { - status = ice_cfg_vlan_pruning(vsi, true, false); - if (status) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - dev_err(&pf->pdev->dev, - "Enable VLAN pruning on VLAN ID: %d failed error-%d\n", - vid, status); - goto error_param; - } - } else { - /* Enable Ucast/Mcast VLAN promiscuous mode */ - promisc_m = ICE_PROMISC_VLAN_TX | - ICE_PROMISC_VLAN_RX; +out: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2, v_ret, NULL, 0); +} - status = ice_set_vsi_promisc(hw, vsi->idx, - promisc_m, vid); - if (status) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - dev_err(&pf->pdev->dev, - "Enable Unicast/multicast promiscuous mode on VLAN ID:%d failed error-%d\n", - vid, status); - } - } - } - } else { - /* In case of non_trusted VF, number of VLAN elements passed - * to PF for removal might be greater than number of VLANs - * filter programmed for that VF - So, use actual number of - * VLANS added earlier with add VLAN opcode. 
In order to avoid - * removing VLAN that doesn't exist, which result to sending - * erroneous failed message back to the VF - */ - int num_vf_vlan; +/** + * ice_vc_dis_vlan_stripping_v2_msg + * @vf: VF the message was received from + * @msg: message received from the VF + * + * virthcnl handler for VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 + */ +static int ice_vc_dis_vlan_stripping_v2_msg(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_vlan_supported_caps *stripping_support; + struct virtchnl_vlan_setting *strip_msg = + (struct virtchnl_vlan_setting *)msg; + u32 ethertype_setting; + struct ice_vsi *vsi; - num_vf_vlan = vf->num_vlan; - for (i = 0; i < vfl->num_elements && i < num_vf_vlan; i++) { - u16 vid = vfl->vlan_id[i]; + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } - /* Make sure ice_vsi_kill_vlan is successful before - * updating VLAN information - */ - if (ice_vsi_kill_vlan(vsi, vid)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } + if (!ice_vc_isvalid_vsi_id(vf, strip_msg->vport_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } - vf->num_vlan--; - /* Disable VLAN pruning when the last VLAN is removed */ - if (!vf->num_vlan) - ice_cfg_vlan_pruning(vsi, false, false); + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } - /* Disable Unicast/Multicast VLAN promiscuous mode */ - if (vlan_promisc) { - promisc_m = ICE_PROMISC_VLAN_TX | - ICE_PROMISC_VLAN_RX; + stripping_support = &vf->vlan_v2_caps.offloads.stripping_support; + if (!ice_vc_valid_vlan_setting_msg(stripping_support, strip_msg)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } - ice_clear_vsi_promisc(hw, vsi->idx, - promisc_m, vid); - } + ethertype_setting = strip_msg->outer_ethertype_setting; + if (ethertype_setting) { + if (vsi->outer_vlan_ops.dis_stripping(vsi)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } else { + enum ice_l2tsel l2tsel = + ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG1; + + /* PF tells the VF that the outer VLAN tag is always + * extracted to VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2 and + * inner is always extracted to + * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1. This is needed to + * support inner stripping while outer stripping is + * disabled so that the first and only tag is extracted + * in L2TAG1. 
+ */ + ice_vsi_update_l2tsel(vsi, l2tsel); + + vf->vlan_strip_ena &= ~ICE_OUTER_VLAN_STRIP_ENA; } } -error_param: - /* send the response to the VF */ - if (add_v) - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_VLAN, v_ret, - NULL, 0); - else - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DEL_VLAN, v_ret, - NULL, 0); -} + ethertype_setting = strip_msg->inner_ethertype_setting; + if (ethertype_setting && vsi->inner_vlan_ops.dis_stripping(vsi)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } -/** - * ice_vc_add_vlan_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * - * Add and program guest VLAN ID - */ -static int ice_vc_add_vlan_msg(struct ice_vf *vf, u8 *msg) -{ - return ice_vc_process_vlan_msg(vf, msg, true); + if (ethertype_setting) + vf->vlan_strip_ena &= ~ICE_INNER_VLAN_STRIP_ENA; + +out: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2, v_ret, NULL, 0); } /** - * ice_vc_remove_vlan_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer + * ice_vc_ena_vlan_insertion_v2_msg + * @vf: VF the message was received from + * @msg: message received from the VF * - * remove programmed guest VLAN ID + * virtchnl handler for VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 */ -static int ice_vc_remove_vlan_msg(struct ice_vf *vf, u8 *msg) +static int ice_vc_ena_vlan_insertion_v2_msg(struct ice_vf *vf, u8 *msg) { - return ice_vc_process_vlan_msg(vf, msg, false); + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_vlan_supported_caps *insertion_support; + struct virtchnl_vlan_setting *insertion_msg = + (struct virtchnl_vlan_setting *)msg; + u32 ethertype_setting; + struct ice_vsi *vsi; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + if (!ice_vc_isvalid_vsi_id(vf, insertion_msg->vport_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + insertion_support = &vf->vlan_v2_caps.offloads.insertion_support; + if (!ice_vc_valid_vlan_setting_msg(insertion_support, insertion_msg)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + ethertype_setting = insertion_msg->outer_ethertype_setting; + if (ethertype_setting && + ice_vc_ena_vlan_offload(vsi, vsi->outer_vlan_ops.ena_insertion, + ethertype_setting)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + ethertype_setting = insertion_msg->inner_ethertype_setting; + if (ethertype_setting && + ice_vc_ena_vlan_offload(vsi, vsi->inner_vlan_ops.ena_insertion, + ethertype_setting)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + +out: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2, v_ret, NULL, 0); } /** - * ice_vc_ena_vlan_stripping - * @vf: pointer to the VF info + * ice_vc_dis_vlan_insertion_v2_msg + * @vf: VF the message was received from + * @msg: message received from the VF * - * Enable VLAN header stripping for a given VF + * virtchnl handler for VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2 */ -static int ice_vc_ena_vlan_stripping(struct ice_vf *vf) +static int ice_vc_dis_vlan_insertion_v2_msg(struct ice_vf *vf, u8 *msg) { enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct ice_pf *pf = vf->pf; + struct virtchnl_vlan_supported_caps *insertion_support; + struct virtchnl_vlan_setting *insertion_msg = + (struct virtchnl_vlan_setting *)msg; + u32 ethertype_setting; struct ice_vsi *vsi; if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret
= VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto out; + } + + if (!ice_vc_isvalid_vsi_id(vf, insertion_msg->vport_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; } - vsi = pf->vsi[vf->lan_vsi_idx]; - if (ice_vsi_manage_vlan_stripping(vsi, true)) + vsi = ice_get_vf_vsi(vf); + if (!vsi) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } -error_param: - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_VLAN_STRIPPING, - v_ret, NULL, 0); + insertion_support = &vf->vlan_v2_caps.offloads.insertion_support; + if (!ice_vc_valid_vlan_setting_msg(insertion_support, insertion_msg)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + ethertype_setting = insertion_msg->outer_ethertype_setting; + if (ethertype_setting && vsi->outer_vlan_ops.dis_insertion(vsi)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + ethertype_setting = insertion_msg->inner_ethertype_setting; + if (ethertype_setting && vsi->inner_vlan_ops.dis_insertion(vsi)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + +out: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2, v_ret, NULL, 0); +} + +static struct ice_vc_vf_ops ice_vc_vf_dflt_ops = { + .get_ver_msg = ice_vc_get_ver_msg, + .get_vf_res_msg = ice_vc_get_vf_res_msg, + .reset_vf = ice_vc_reset_vf_msg, + .add_mac_addr_msg = ice_vc_add_mac_addr_msg, + .del_mac_addr_msg = ice_vc_del_mac_addr_msg, + .cfg_qs_msg = ice_vc_cfg_qs_msg, + .ena_qs_msg = ice_vc_ena_qs_msg, + .dis_qs_msg = ice_vc_dis_qs_msg, + .request_qs_msg = ice_vc_request_qs_msg, + .cfg_irq_map_msg = ice_vc_cfg_irq_map_msg, + .config_rss_key = ice_vc_config_rss_key, + .config_rss_lut = ice_vc_config_rss_lut, + .get_stats_msg = ice_vc_get_stats_msg, + .cfg_promiscuous_mode_msg = ice_vc_cfg_promiscuous_mode_msg, + .add_vlan_msg = ice_vc_add_vlan_msg, + .remove_vlan_msg = ice_vc_remove_vlan_msg, + .query_rxdid = ice_vc_query_rxdid, + .get_rss_hena = ice_vc_get_rss_hena, + .set_rss_hena_msg = ice_vc_set_rss_hena, + .ena_vlan_stripping = ice_vc_ena_vlan_stripping, + .dis_vlan_stripping = ice_vc_dis_vlan_stripping, +#ifdef HAVE_TC_SETUP_CLSFLOWER + .add_qch_msg = ice_vc_add_qch_msg, + .add_switch_filter_msg = ice_vc_add_switch_filter, + .del_switch_filter_msg = ice_vc_del_switch_filter, + .del_qch_msg = ice_vc_del_qch_msg, +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + .rdma_msg = ice_vc_rdma_msg, + .cfg_rdma_irq_map_msg = ice_vc_cfg_rdma_irq_map_msg, + .clear_rdma_irq_map = ice_vc_clear_rdma_irq_map, + .dcf_vlan_offload_msg = ice_vc_dcf_vlan_offload_msg, + .dcf_cmd_desc_msg = ice_vc_dcf_cmd_desc_msg, + .dcf_cmd_buff_msg = ice_vc_dcf_cmd_buff_msg, + .dis_dcf_cap = ice_vc_dis_dcf_cap, + .dcf_get_vsi_map = ice_vc_dcf_get_vsi_map, + .dcf_query_pkg_info = ice_vc_dcf_query_pkg_info, + .handle_rss_cfg_msg = ice_vc_handle_rss_cfg, + .add_fdir_fltr_msg = ice_vc_add_fdir_fltr, + .del_fdir_fltr_msg = ice_vc_del_fdir_fltr, + .get_max_rss_qregion = ice_vc_get_max_rss_qregion, + .ena_qs_v2_msg = ice_vc_ena_qs_v2_msg, + .dis_qs_v2_msg = ice_vc_dis_qs_v2_msg, + .map_q_vector_msg = ice_vc_map_q_vector_msg, + .get_offload_vlan_v2_caps = ice_vc_get_offload_vlan_v2_caps, + .add_vlan_v2_msg = ice_vc_add_vlan_v2_msg, + .remove_vlan_v2_msg = ice_vc_remove_vlan_v2_msg, + .ena_vlan_stripping_v2_msg = ice_vc_ena_vlan_stripping_v2_msg, + .dis_vlan_stripping_v2_msg = ice_vc_dis_vlan_stripping_v2_msg, + .ena_vlan_insertion_v2_msg = ice_vc_ena_vlan_insertion_v2_msg, + .dis_vlan_insertion_v2_msg = ice_vc_dis_vlan_insertion_v2_msg, +}; + +void ice_vc_set_dflt_vf_ops(struct ice_vc_vf_ops *ops) +{ + 
*ops = ice_vc_vf_dflt_ops; +} + +static int ice_vc_repr_no_action_msg(struct ice_vf __always_unused *vf, + u8 __always_unused *msg) +{ + return 0; } /** - * ice_vc_dis_vlan_stripping - * @vf: pointer to the VF info + * ice_vc_repr_add_mac + * @vf: pointer to VF + * @msg: virtchannel message * - * Disable VLAN header stripping for a given VF + * When port representors are created, we do not add MAC rule + * to firmware, we store it so that PF could report same + * MAC as VF. */ -static int ice_vc_dis_vlan_stripping(struct ice_vf *vf) +static int ice_vc_repr_add_mac(struct ice_vf *vf, u8 *msg) { enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct ice_pf *pf = vf->pf; + struct virtchnl_ether_addr_list *al = + (struct virtchnl_ether_addr_list *)msg; struct ice_vsi *vsi; + struct ice_pf *pf; + int i; - if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states) || + !ice_vc_isvalid_vsi_id(vf, al->vsi_id)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto handle_mac_exit; } - vsi = pf->vsi[vf->lan_vsi_idx]; + pf = vf->pf; + + vsi = ice_get_vf_vsi(vf); if (!vsi) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto handle_mac_exit; } - if (ice_vsi_manage_vlan_stripping(vsi, false)) - v_ret = VIRTCHNL_STATUS_ERR_PARAM; + for (i = 0; i < al->num_elements; i++) { + u8 *mac_addr = al->list[i].addr; -error_param: - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING, + if (!is_unicast_ether_addr(mac_addr) || + ether_addr_equal(mac_addr, vf->hw_lan_addr.addr)) + continue; + + if (vf->pf_set_mac) { + dev_err(ice_pf_to_dev(pf), + "VF attempting to override administratively set MAC address\n"); + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto handle_mac_exit; + } + + + ice_vfhw_mac_add(vf, &al->list[i]); + vf->num_mac++; + break; + } + +handle_mac_exit: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_ETH_ADDR, v_ret, NULL, 0); } +/** + * ice_vc_repr_del_mac - response with success for deleting MAC + * @vf: pointer to VF + * @msg: virtchannel message + * + * Respond with success to not break normal VF flow. + */ +static int ice_vc_repr_del_mac(struct ice_vf __always_unused *vf, + u8 __always_unused *msg) +{ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DEL_ETH_ADDR, + VIRTCHNL_STATUS_SUCCESS, NULL, 0); +} + +static int ice_vc_repr_no_action(struct ice_vf __always_unused *vf) +{ + return 0; +} + +void ice_vc_change_ops_to_repr(struct ice_vc_vf_ops *ops) +{ + ops->add_mac_addr_msg = ice_vc_repr_add_mac; + ops->del_mac_addr_msg = ice_vc_repr_del_mac; + ops->add_vlan_msg = ice_vc_repr_no_action_msg; + ops->remove_vlan_msg = ice_vc_repr_no_action_msg; + ops->ena_vlan_stripping = ice_vc_repr_no_action; + ops->dis_vlan_stripping = ice_vc_repr_no_action; + ops->cfg_promiscuous_mode_msg = ice_vc_repr_no_action_msg; +} + /** * ice_vc_process_vf_msg - Process request from VF * @pf: pointer to the PF structure @@ -2904,23 +9890,27 @@ void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event) u32 v_opcode = le32_to_cpu(event->desc.cookie_high); s16 vf_id = le16_to_cpu(event->desc.retval); u16 msglen = event->msg_len; + struct ice_vc_vf_ops *ops; u8 *msg = event->msg_buf; struct ice_vf *vf = NULL; + struct device *dev; int err = 0; - if (vf_id >= pf->num_alloc_vfs) { + dev = ice_pf_to_dev(pf); + if (ice_validate_vf_id(pf, vf_id)) { err = -EINVAL; goto error_handler; } vf = &pf->vf[vf_id]; - /* Check if VF is disabled. 
*/ if (test_bit(ICE_VF_STATE_DIS, vf->vf_states)) { err = -EPERM; goto error_handler; } + ops = &vf->vc_ops; + /* Perform basic checks on the msg */ err = virtchnl_vc_validate_vf_msg(&vf->vf_ver, v_opcode, msg, msglen); if (err) { @@ -2930,73 +9920,185 @@ void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event) err = -EINVAL; } + if (!ice_vc_is_opcode_allowed(vf, v_opcode)) { + ice_vc_send_msg_to_vf(vf, v_opcode, + VIRTCHNL_STATUS_ERR_NOT_SUPPORTED, NULL, + 0); + return; + } + error_handler: if (err) { ice_vc_send_msg_to_vf(vf, v_opcode, VIRTCHNL_STATUS_ERR_PARAM, NULL, 0); - dev_err(&pf->pdev->dev, "Invalid message from VF %d, opcode %d, len %d, error %d\n", + dev_err(dev, "Invalid message from VF %d, opcode %d, len %d, error %d\n", vf_id, v_opcode, msglen, err); return; } switch (v_opcode) { case VIRTCHNL_OP_VERSION: - err = ice_vc_get_ver_msg(vf, msg); + err = ops->get_ver_msg(vf, msg); break; case VIRTCHNL_OP_GET_VF_RESOURCES: - err = ice_vc_get_vf_res_msg(vf, msg); + err = ops->get_vf_res_msg(vf, msg); + if (ice_vf_init_vlan_stripping(vf)) + dev_dbg(dev, "Failed to initialize VLAN stripping for VF %d\n", + vf->vf_id); ice_vc_notify_vf_link_state(vf); break; case VIRTCHNL_OP_RESET_VF: - ice_vc_reset_vf_msg(vf); + ops->reset_vf(vf); break; case VIRTCHNL_OP_ADD_ETH_ADDR: - err = ice_vc_add_mac_addr_msg(vf, msg); + err = ops->add_mac_addr_msg(vf, msg); break; case VIRTCHNL_OP_DEL_ETH_ADDR: - err = ice_vc_del_mac_addr_msg(vf, msg); + err = ops->del_mac_addr_msg(vf, msg); break; case VIRTCHNL_OP_CONFIG_VSI_QUEUES: - err = ice_vc_cfg_qs_msg(vf, msg); + err = ops->cfg_qs_msg(vf, msg); break; case VIRTCHNL_OP_ENABLE_QUEUES: - err = ice_vc_ena_qs_msg(vf, msg); + err = ops->ena_qs_msg(vf, msg); ice_vc_notify_vf_link_state(vf); break; case VIRTCHNL_OP_DISABLE_QUEUES: - err = ice_vc_dis_qs_msg(vf, msg); + err = ops->dis_qs_msg(vf, msg); break; case VIRTCHNL_OP_REQUEST_QUEUES: - err = ice_vc_request_qs_msg(vf, msg); + err = ops->request_qs_msg(vf, msg); break; case VIRTCHNL_OP_CONFIG_IRQ_MAP: - err = ice_vc_cfg_irq_map_msg(vf, msg); + err = ops->cfg_irq_map_msg(vf, msg); break; case VIRTCHNL_OP_CONFIG_RSS_KEY: - err = ice_vc_config_rss_key(vf, msg); + err = ops->config_rss_key(vf, msg); break; case VIRTCHNL_OP_CONFIG_RSS_LUT: - err = ice_vc_config_rss_lut(vf, msg); + err = ops->config_rss_lut(vf, msg); break; case VIRTCHNL_OP_GET_STATS: - err = ice_vc_get_stats_msg(vf, msg); + err = ops->get_stats_msg(vf, msg); + break; + case VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE: + err = ops->cfg_promiscuous_mode_msg(vf, msg); break; case VIRTCHNL_OP_ADD_VLAN: - err = ice_vc_add_vlan_msg(vf, msg); + err = ops->add_vlan_msg(vf, msg); break; case VIRTCHNL_OP_DEL_VLAN: - err = ice_vc_remove_vlan_msg(vf, msg); + err = ops->remove_vlan_msg(vf, msg); + break; + case VIRTCHNL_OP_GET_SUPPORTED_RXDIDS: + err = ops->query_rxdid(vf); + break; + case VIRTCHNL_OP_GET_RSS_HENA_CAPS: + err = ops->get_rss_hena(vf); + break; + case VIRTCHNL_OP_SET_RSS_HENA: + err = ops->set_rss_hena_msg(vf, msg); break; case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: - err = ice_vc_ena_vlan_stripping(vf); + err = ops->ena_vlan_stripping(vf); break; case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: - err = ice_vc_dis_vlan_stripping(vf); + err = ops->dis_vlan_stripping(vf); + break; +#ifdef HAVE_TC_SETUP_CLSFLOWER + case VIRTCHNL_OP_ENABLE_CHANNELS: + err = ops->add_qch_msg(vf, msg); + break; + case VIRTCHNL_OP_ADD_CLOUD_FILTER: + err = ops->add_switch_filter_msg(vf, msg); + break; + case VIRTCHNL_OP_DEL_CLOUD_FILTER: + err = 
ops->del_switch_filter_msg(vf, msg); + break; + case VIRTCHNL_OP_DISABLE_CHANNELS: + err = ops->del_qch_msg(vf, msg); + break; +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + case VIRTCHNL_OP_RDMA: + err = ops->rdma_msg(vf, msg, msglen); + break; + case VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP: + err = ops->cfg_rdma_irq_map_msg(vf, msg); + break; + case VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP: + err = ops->clear_rdma_irq_map(vf); + break; + case VIRTCHNL_OP_DCF_VLAN_OFFLOAD: + err = ops->dcf_vlan_offload_msg(vf, msg); + break; + case VIRTCHNL_OP_DCF_CMD_DESC: + err = ops->dcf_cmd_desc_msg(vf, msg, msglen); + break; + case VIRTCHNL_OP_DCF_CMD_BUFF: + err = ops->dcf_cmd_buff_msg(vf, msg, msglen); + break; + case VIRTCHNL_OP_DCF_RULE_FLUSH: + err = ice_vc_flush_dcf_rule(vf); + break; + case VIRTCHNL_OP_DCF_DISABLE: + err = ops->dis_dcf_cap(vf); + break; + case VIRTCHNL_OP_DCF_GET_VSI_MAP: + err = ops->dcf_get_vsi_map(vf); + break; + case VIRTCHNL_OP_DCF_GET_PKG_INFO: + err = ops->dcf_query_pkg_info(vf); + break; + case VIRTCHNL_OP_ADD_RSS_CFG: + err = ops->handle_rss_cfg_msg(vf, msg, true); + break; + case VIRTCHNL_OP_DEL_RSS_CFG: + err = ops->handle_rss_cfg_msg(vf, msg, false); + break; + case VIRTCHNL_OP_ADD_FDIR_FILTER: + err = ops->add_fdir_fltr_msg(vf, msg); + break; + case VIRTCHNL_OP_DEL_FDIR_FILTER: + err = ops->del_fdir_fltr_msg(vf, msg); + break; + case VIRTCHNL_OP_GET_MAX_RSS_QREGION: + err = ops->get_max_rss_qregion(vf); + break; + case VIRTCHNL_OP_ENABLE_QUEUES_V2: + err = ops->ena_qs_v2_msg(vf, msg); + ice_vc_notify_vf_link_state(vf); + break; + case VIRTCHNL_OP_DISABLE_QUEUES_V2: + err = ops->dis_qs_v2_msg(vf, msg); + break; + case VIRTCHNL_OP_MAP_QUEUE_VECTOR: + err = ops->map_q_vector_msg(vf, msg); + break; + case VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS: + err = ops->get_offload_vlan_v2_caps(vf); + break; + case VIRTCHNL_OP_ADD_VLAN_V2: + err = ops->add_vlan_v2_msg(vf, msg); + break; + case VIRTCHNL_OP_DEL_VLAN_V2: + err = ops->remove_vlan_v2_msg(vf, msg); + break; + case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2: + err = ops->ena_vlan_stripping_v2_msg(vf, msg); + break; + case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2: + err = ops->dis_vlan_stripping_v2_msg(vf, msg); + break; + case VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2: + err = ops->ena_vlan_insertion_v2_msg(vf, msg); + break; + case VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2: + err = ops->dis_vlan_insertion_v2_msg(vf, msg); break; case VIRTCHNL_OP_UNKNOWN: default: - dev_err(&pf->pdev->dev, "Unsupported opcode %d from VF %d\n", - v_opcode, vf_id); + dev_err(dev, "Unsupported opcode %d from VF %d\n", v_opcode, + vf_id); err = ice_vc_send_msg_to_vf(vf, v_opcode, VIRTCHNL_STATUS_ERR_NOT_SUPPORTED, NULL, 0); @@ -3006,8 +10108,7 @@ void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event) /* Helper function cares less about error return values here * as it is busy with pending work.
*/ - dev_info(&pf->pdev->dev, - "PF failed to honor VF %d, opcode %d, error %d\n", + dev_info(dev, "PF failed to honor VF %d, opcode %d, error %d\n", vf_id, v_opcode, err); } } @@ -3023,107 +10124,48 @@ void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event) int ice_get_vf_cfg(struct net_device *netdev, int vf_id, struct ifla_vf_info *ivi) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - struct ice_pf *pf = vsi->back; - struct ice_vf *vf; - - /* validate the request */ - if (vf_id >= pf->num_alloc_vfs) { - netdev_err(netdev, "invalid VF id: %d\n", vf_id); - return -EINVAL; - } - - vf = &pf->vf[vf_id]; - vsi = pf->vsi[vf->lan_vsi_idx]; - - if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states)) { - netdev_err(netdev, "VF %d in reset. Try again.\n", vf_id); - return -EBUSY; - } - - ivi->vf = vf_id; - ether_addr_copy(ivi->mac, vf->dflt_lan_addr.addr); - - /* VF configuration for VLAN and applicable QoS */ - ivi->vlan = le16_to_cpu(vsi->info.pvid) & ICE_VLAN_M; - ivi->qos = (le16_to_cpu(vsi->info.pvid) & ICE_PRIORITY_M) >> - ICE_VLAN_PRIORITY_S; - - ivi->trusted = vf->trusted; - ivi->spoofchk = vf->spoofchk; - if (!vf->link_forced) - ivi->linkstate = IFLA_VF_LINK_STATE_AUTO; - else if (vf->link_up) - ivi->linkstate = IFLA_VF_LINK_STATE_ENABLE; - else - ivi->linkstate = IFLA_VF_LINK_STATE_DISABLE; - ivi->max_tx_rate = vf->tx_rate; - ivi->min_tx_rate = 0; - return 0; -} - -/** - * ice_set_vf_spoofchk - * @netdev: network interface device structure - * @vf_id: VF identifier - * @ena: flag to enable or disable feature - * - * Enable or disable VF spoof checking - */ -int ice_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool ena) -{ - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - struct ice_pf *pf = vsi->back; - struct ice_vsi_ctx *ctx; - enum ice_status status; + struct ice_pf *pf = ice_netdev_to_pf(netdev); struct ice_vf *vf; - int ret = 0; - - /* validate the request */ - if (vf_id >= pf->num_alloc_vfs) { - netdev_err(netdev, "invalid VF id: %d\n", vf_id); - return -EINVAL; - } - - vf = &pf->vf[vf_id]; - if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states)) { - netdev_err(netdev, "VF %d in reset. Try again.\n", vf_id); - return -EBUSY; - } - - if (ena == vf->spoofchk) { - dev_dbg(&pf->pdev->dev, "VF spoofchk already %s\n", - ena ? 
"ON" : "OFF"); - return 0; - } - - ctx = devm_kzalloc(&pf->pdev->dev, sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; + int ret; - ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID); + if (ice_validate_vf_id(pf, vf_id)) + return -EINVAL; - if (ena) { - ctx->info.sec_flags |= ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF; - ctx->info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_M; - } + vf = &pf->vf[vf_id]; + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; - status = ice_update_vsi(&pf->hw, vsi->idx, ctx, NULL); - if (status) { - dev_dbg(&pf->pdev->dev, - "Error %d, failed to update VSI* parameters\n", status); - ret = -EIO; - goto out; - } + ivi->vf = vf_id; + ether_addr_copy(ivi->mac, vf->hw_lan_addr.addr); - vf->spoofchk = ena; - vsi->info.sec_flags = ctx->info.sec_flags; - vsi->info.sw_flags2 = ctx->info.sw_flags2; -out: - devm_kfree(&pf->pdev->dev, ctx); - return ret; + /* VF configuration for VLAN and applicable QoS */ + ivi->vlan = ice_vf_get_port_vlan_id(vf); + ivi->qos = ice_vf_get_port_vlan_prio(vf); +#ifdef IFLA_VF_VLAN_INFO_MAX + if (ice_vf_is_port_vlan_ena(vf)) + ivi->vlan_proto = cpu_to_be16(ice_vf_get_port_vlan_tpid(vf)); +#endif /* IFLA_VF_VLAN_INFO_MAX */ + +#ifdef HAVE_NDO_SET_VF_TRUST + ivi->trusted = vf->trusted; +#endif /* HAVE_NDO_SET_VF_TRUST */ + ivi->spoofchk = vf->spoofchk; +#ifdef HAVE_NDO_SET_VF_LINK_STATE + if (!vf->link_forced) + ivi->linkstate = IFLA_VF_LINK_STATE_AUTO; + else if (vf->link_up) + ivi->linkstate = IFLA_VF_LINK_STATE_ENABLE; + else + ivi->linkstate = IFLA_VF_LINK_STATE_DISABLE; +#endif +#ifdef HAVE_NDO_SET_VF_MIN_MAX_TX_RATE + ivi->max_tx_rate = vf->max_tx_rate; + ivi->min_tx_rate = vf->min_tx_rate; +#else + ivi->tx_rate = vf->max_tx_rate; +#endif + return 0; } /** @@ -3136,42 +10178,54 @@ int ice_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool ena) */ int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - struct ice_pf *pf = vsi->back; + struct ice_pf *pf = ice_netdev_to_pf(netdev); struct ice_vf *vf; - int ret = 0; + int ret; - /* validate the request */ - if (vf_id >= pf->num_alloc_vfs) { - netdev_err(netdev, "invalid VF id: %d\n", vf_id); + if (ice_validate_vf_id(pf, vf_id)) + return -EINVAL; + + if (is_multicast_ether_addr(mac)) { + netdev_err(netdev, "%pM not a valid unicast address\n", mac); return -EINVAL; } vf = &pf->vf[vf_id]; - if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states)) { - netdev_err(netdev, "VF %d in reset. Try again.\n", vf_id); - return -EBUSY; - } + /* nothing left to do, unicast MAC already set */ + if (ether_addr_equal(vf->dev_lan_addr.addr, mac) && + ether_addr_equal(vf->hw_lan_addr.addr, mac)) + return 0; - if (is_zero_ether_addr(mac) || is_multicast_ether_addr(mac)) { - netdev_err(netdev, "%pM not a valid unicast address\n", mac); - return -EINVAL; + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; + + if (ice_vf_chnl_dmac_fltr_cnt(vf)) { + netdev_err(netdev, + "can't set mac %pM. VF %d has tc-flower filters, delete them and try again\n", + mac, vf_id); + return -EAGAIN; } - /* copy MAC into dflt_lan_addr and trigger a VF reset. The reset - * flow will use the updated dflt_lan_addr and add a MAC filter - * using ice_add_mac. Also set pf_set_mac to indicate that the PF has - * set the MAC address for this VF. 
+ /* VF is notified of its new MAC via the PF's response to the + * VIRTCHNL_OP_GET_VF_RESOURCES message after the VF has been reset */ - ether_addr_copy(vf->dflt_lan_addr.addr, mac); - vf->pf_set_mac = true; - netdev_info(netdev, - "MAC on VF %d set to %pM. VF driver will be reinitialized\n", - vf_id, mac); + ether_addr_copy(vf->dev_lan_addr.addr, mac); + ether_addr_copy(vf->hw_lan_addr.addr, mac); + if (is_zero_ether_addr(mac)) { + /* VF will send VIRTCHNL_OP_ADD_ETH_ADDR message with its MAC */ + vf->pf_set_mac = false; + netdev_info(netdev, "Removing MAC on VF %d. VF driver will be reinitialized\n", + vf->vf_id); + } else { + /* PF will add MAC rule for the VF */ + vf->pf_set_mac = true; + netdev_info(netdev, "Setting MAC %pM on VF %d. VF driver will be reinitialized\n", + mac, vf_id); + } - ice_vc_dis_vf(vf); - return ret; + ice_vc_reset_vf(vf); + return 0; } /** @@ -3184,35 +10238,55 @@ int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) */ int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool trusted) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - struct ice_pf *pf = vsi->back; + struct ice_pf *pf = ice_netdev_to_pf(netdev); struct ice_vf *vf; + int ret; - /* validate the request */ - if (vf_id >= pf->num_alloc_vfs) { - dev_err(&pf->pdev->dev, "invalid VF id: %d\n", vf_id); - return -EINVAL; + if (ice_is_eswitch_mode_switchdev(pf)) { + dev_info(ice_pf_to_dev(pf), + "Trusted VF is forbidden in switchdev mode\n"); + return -EOPNOTSUPP; } + if (ice_validate_vf_id(pf, vf_id)) + return -EINVAL; + vf = &pf->vf[vf_id]; - if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states)) { - dev_err(&pf->pdev->dev, "VF %d in reset. Try again.\n", vf_id); - return -EBUSY; - } + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; /* Check if already trusted */ if (trusted == vf->trusted) return 0; +#ifdef HAVE_NDO_SET_VF_TRUST + /* If the trust mode of a given DCF is taken away without the DCF + * gracefully relinquishing the DCF functionality, remove ALL switch + * filters that were added by the DCF and treat this VF as any other + * untrusted AVF. + */ + if (ice_is_vf_dcf(vf) && !trusted && + ice_dcf_get_state(pf) != ICE_DCF_STATE_OFF) { + ice_rm_all_dcf_sw_rules(pf); + ice_clear_dcf_acl_cfg(pf); + ice_clear_dcf_udp_tunnel_cfg(pf); + pf->hw.dcf_caps &= ~(DCF_ACL_CAP | DCF_UDP_TUNNEL_CAP); + ice_dcf_set_state(pf, ICE_DCF_STATE_OFF); + pf->dcf.vf = NULL; + vf->driver_caps &= ~VIRTCHNL_VF_CAP_DCF; + } + + ice_vc_reset_vf(vf); +#endif /* HAVE_NDO_SET_VF_TRUST */ vf->trusted = trusted; - ice_vc_dis_vf(vf); - dev_info(&pf->pdev->dev, "VF %u is now %strusted\n", - vf_id, trusted ? "" : "un"); + dev_info(ice_pf_to_dev(pf), "VF %u is now %strusted\n", vf_id, + trusted ? 
"" : "un"); return 0; } +#ifdef HAVE_NDO_SET_VF_LINK_STATE /** * ice_set_vf_link_state * @netdev: network interface device structure @@ -3223,34 +10297,25 @@ int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool trusted) */ int ice_set_vf_link_state(struct net_device *netdev, int vf_id, int link_state) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_pf *pf = np->vsi->back; - struct virtchnl_pf_event pfe = { 0 }; - struct ice_link_status *ls; + struct ice_pf *pf = ice_netdev_to_pf(netdev); struct ice_vf *vf; - struct ice_hw *hw; + int ret; - if (vf_id >= pf->num_alloc_vfs) { - dev_err(&pf->pdev->dev, "Invalid VF Identifier %d\n", vf_id); + if (ice_validate_vf_id(pf, vf_id)) return -EINVAL; - } vf = &pf->vf[vf_id]; - hw = &pf->hw; - ls = &pf->hw.port_info->phy.link_info; - - if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states)) { - dev_err(&pf->pdev->dev, "vf %d in reset. Try again.\n", vf_id); - return -EBUSY; - } + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; - pfe.event = VIRTCHNL_EVENT_LINK_CHANGE; - pfe.severity = PF_EVENT_SEVERITY_INFO; + /* disallow link state change if eeprom is corrupted */ + if (test_bit(ICE_BAD_EEPROM, pf->state)) + return -EOPNOTSUPP; switch (link_state) { case IFLA_VF_LINK_STATE_AUTO: vf->link_forced = false; - vf->link_up = ls->link_info & ICE_AQ_LINK_UP; break; case IFLA_VF_LINK_STATE_ENABLE: vf->link_forced = true; @@ -3264,15 +10329,466 @@ int ice_set_vf_link_state(struct net_device *netdev, int vf_id, int link_state) return -EINVAL; } - if (vf->link_forced) - ice_set_pfe_link_forced(vf, &pfe, vf->link_up); - else - ice_set_pfe_link(vf, &pfe, ls->link_speed, vf->link_up); + if (vf->repr) { + struct net_device *pr_netdev = vf->repr->netdev; + unsigned int flags = pr_netdev->flags; - /* Notify the VF of its new link state */ - ice_aq_send_msg_to_vf(hw, vf->vf_id, VIRTCHNL_OP_EVENT, - VIRTCHNL_STATUS_SUCCESS, (u8 *)&pfe, - sizeof(pfe), NULL); + flags = vf->link_up ? 
flags | IFF_UP : flags & ~IFF_UP; + dev_change_flags(pr_netdev, flags, NULL); + } + + ice_vc_notify_vf_link_state(vf); + + return 0; +} +#endif /* HAVE_NDO_SET_VF_LINK_STATE */ + +#ifdef HAVE_NDO_SET_VF_MIN_MAX_TX_RATE +/** + * ice_calc_all_vfs_min_tx_rate - calculate cumulative min Tx rate on all VFs + * @pf: PF associated with VFs + */ +static int ice_calc_all_vfs_min_tx_rate(struct ice_pf *pf) +{ + int rate = 0, i; + + ice_for_each_vf(pf, i) + rate += pf->vf[i].min_tx_rate; + + return rate; +} + +/** + * ice_min_tx_rate_oversubscribed - check if min Tx rate causes oversubscription + * @vf: VF trying to configure min_tx_rate + * @min_tx_rate: min Tx rate in Mbps + * + * Check if the min_tx_rate being passed in will cause oversubscription of total + * min_tx_rate based on the current link speed and all other VFs configured + * min_tx_rate + * + * Return true if the passed min_tx_rate would cause oversubscription, else + * return false + */ +static bool +ice_min_tx_rate_oversubscribed(struct ice_vf *vf, int min_tx_rate) +{ + int link_speed_mbps = ice_get_link_speed_mbps(ice_get_vf_vsi(vf)); + int all_vfs_min_tx_rate = ice_calc_all_vfs_min_tx_rate(vf->pf); + + /* this VF's previous rate is being overwritten */ + all_vfs_min_tx_rate -= vf->min_tx_rate; + + if (all_vfs_min_tx_rate + min_tx_rate > link_speed_mbps) { + dev_err(ice_pf_to_dev(vf->pf), "min_tx_rate of %d Mbps on VF %u would cause oversubscription of %d Mbps based on the current link speed %d Mbps\n", + min_tx_rate, vf->vf_id, + all_vfs_min_tx_rate + min_tx_rate - link_speed_mbps, + link_speed_mbps); + return true; + } + + return false; +} +#endif /* HAVE_NDO_SET_VF_MIN_MAX_TX_RATE */ + +#ifdef HAVE_TC_SETUP_CLSFLOWER +/** + * ice_vf_adq_total_max_tx_rate - cumulative max_tx_rate when VF ADQ is enabled + * @vf: Pointer to VF + * + * This function returns the cumulative max Tx rate of all TCs if VF ADQ is enabled + */ +static u64 ice_vf_adq_total_max_tx_rate(struct ice_vf *vf) +{ + u64 cummulative_max_tx_rate = 0; + int i; + + if (!ice_is_vf_adq_ena(vf)) + return 0; + + for (i = 0; i < vf->num_tc; i++) + cummulative_max_tx_rate += vf->ch[i].max_tx_rate; + + return cummulative_max_tx_rate; +} +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + +/** + * ice_set_vf_bw - set min/max VF bandwidth + * @netdev: network interface device structure + * @vf_id: VF identifier + * @min_tx_rate: Minimum Tx rate in Mbps + * @max_tx_rate: Maximum Tx rate in Mbps + */ +#ifdef HAVE_NDO_SET_VF_MIN_MAX_TX_RATE +int +ice_set_vf_bw(struct net_device *netdev, int vf_id, int min_tx_rate, + int max_tx_rate) +#else +int ice_set_vf_bw(struct net_device *netdev, int vf_id, int max_tx_rate) +#endif +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_vsi *vsi; + struct device *dev; + struct ice_vf *vf; + int ret; + + dev = ice_pf_to_dev(pf); + if (ice_validate_vf_id(pf, vf_id)) + return -EINVAL; + + vf = &pf->vf[vf_id]; + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; + + vsi = ice_get_vf_vsi(vf); + +#ifdef HAVE_NDO_SET_VF_MIN_MAX_TX_RATE + /* when max_tx_rate is zero that means no max Tx rate limiting, so only + * check if max_tx_rate is non-zero + */ + if (max_tx_rate && min_tx_rate > max_tx_rate) { + dev_err(dev, "Cannot set min Tx rate %d Mbps greater than max Tx rate %d Mbps\n", + min_tx_rate, max_tx_rate); + return -EINVAL; + } + +#ifdef NETIF_F_HW_TC + if (min_tx_rate && ice_is_adq_active(pf)) { + dev_err(dev, "ADQ on PF is currently enabled.
VF min Tx rate limiting not allowed on this PF.\n"); + return -EOPNOTSUPP; + } +#endif /* NETIF_F_HW_TC */ + + if (min_tx_rate && ice_is_dcb_active(pf)) { + dev_err(dev, "DCB on PF is currently enabled. VF min Tx rate limiting not allowed on this PF.\n"); + return -EOPNOTSUPP; + } + + if (ice_min_tx_rate_oversubscribed(vf, min_tx_rate)) + return -EINVAL; + + if (vf->min_tx_rate != (unsigned int)min_tx_rate) { + ret = ice_set_min_bw_limit(vsi, (u64)min_tx_rate * 1000); + if (ret) { + dev_err(dev, "Unable to set min-tx-rate for VF %d\n", + vf->vf_id); + return ret; + } + + vf->min_tx_rate = min_tx_rate; + } + +#endif /* HAVE_NDO_SET_VF_MIN_MAX_TX_RATE */ + if (vf->max_tx_rate != (unsigned int)max_tx_rate) { +#ifdef HAVE_TC_SETUP_CLSFLOWER + u64 adq_max_tx_rate; +#endif + ret = ice_set_max_bw_limit(vsi, (u64)max_tx_rate * 1000); + if (ret) { + dev_err(dev, "Unable to set max-tx-rate for VF %d\n", + vf->vf_id); + return ret; + } + + vf->max_tx_rate = max_tx_rate; +#ifdef HAVE_TC_SETUP_CLSFLOWER + adq_max_tx_rate = ice_vf_adq_total_max_tx_rate(vf); + if (vf->max_tx_rate < adq_max_tx_rate) + dev_warn(dev, "Host managed max_tx_rate %u Mbps for VF %d is less than VF ADQ cumulative max_tx_rate %llu Mbps\n", + vf->max_tx_rate, vf->vf_id, adq_max_tx_rate); +#endif + } + + return 0; +} + +#ifdef HAVE_VF_STATS +/** + * ice_get_vf_stats - populate some stats for the VF + * @netdev: the netdev of the PF + * @vf_id: the host OS identifier (0-255) + * @vf_stats: pointer to the OS memory to be initialized + */ +int ice_get_vf_stats(struct net_device *netdev, int vf_id, + struct ifla_vf_stats *vf_stats) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_eth_stats *stats; + struct ice_vsi *vsi; + struct ice_vf *vf; + int ret; + + if (ice_validate_vf_id(pf, vf_id)) + return -EINVAL; + + vf = &pf->vf[vf_id]; + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; + + vsi = ice_get_vf_vsi(vf); + if (!vsi) + return -EINVAL; + + ice_update_eth_stats(vsi); + stats = &vsi->eth_stats; + + memset(vf_stats, 0, sizeof(*vf_stats)); + + vf_stats->rx_packets = stats->rx_unicast + stats->rx_broadcast + + stats->rx_multicast; + vf_stats->tx_packets = stats->tx_unicast + stats->tx_broadcast + + stats->tx_multicast; + vf_stats->rx_bytes = stats->rx_bytes; + vf_stats->tx_bytes = stats->tx_bytes; + vf_stats->broadcast = stats->rx_broadcast; + vf_stats->multicast = stats->rx_multicast; +#ifdef HAVE_VF_STATS_DROPPED + vf_stats->rx_dropped = stats->rx_discards; + vf_stats->tx_dropped = stats->tx_discards; +#endif return 0; } +#endif /* HAVE_VF_STATS */ + +/** + * ice_print_vf_rx_mdd_event - print VF Rx malicious driver detect event + * @vf: pointer to the VF structure + */ +void ice_print_vf_rx_mdd_event(struct ice_vf *vf) +{ + struct ice_pf *pf = vf->pf; + struct device *dev; + + dev = ice_pf_to_dev(pf); + + dev_info(dev, "%d Rx Malicious Driver Detection events detected on PF %d VF %d MAC %pM. mdd-auto-reset-vfs=%s\n", + vf->mdd_rx_events.count, pf->hw.pf_id, vf->vf_id, + vf->dev_lan_addr.addr, + test_bit(ICE_FLAG_MDD_AUTO_RESET_VF, pf->flags) + ? "on" : "off"); +} + +/** + * ice_print_vfs_mdd_events - print VFs malicious driver detect event + * @pf: pointer to the PF structure + * + * Called from ice_handle_mdd_event to rate limit and print VFs MDD events.
+ */ +void ice_print_vfs_mdd_events(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + int i; + + /* check that there are pending MDD events to print */ + if (!test_and_clear_bit(ICE_MDD_VF_PRINT_PENDING, pf->state)) + return; + + /* VF MDD event logs are rate limited to one second intervals */ + if (time_is_after_jiffies(pf->last_printed_mdd_jiffies + HZ * 1)) + return; + + pf->last_printed_mdd_jiffies = jiffies; + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; + + /* only print Rx MDD event message if there are new events */ + if (vf->mdd_rx_events.count != vf->mdd_rx_events.last_printed) { + vf->mdd_rx_events.last_printed = + vf->mdd_rx_events.count; + ice_print_vf_rx_mdd_event(vf); + } + + /* only print Tx MDD event message if there are new events */ + if (vf->mdd_tx_events.count != vf->mdd_tx_events.last_printed) { + vf->mdd_tx_events.last_printed = + vf->mdd_tx_events.count; + + dev_info(dev, "%d Tx Malicious Driver Detection events detected on PF %d VF %d MAC %pM.\n", + vf->mdd_tx_events.count, hw->pf_id, i, + vf->dev_lan_addr.addr); + } + } +} + +/** + * ice_restore_all_vfs_msi_state - restore VF MSI state after PF FLR + * @pdev: pointer to a pci_dev structure + * + * Called when recovering from a PF FLR to restore interrupt capability to + * the VFs. + */ +void ice_restore_all_vfs_msi_state(struct pci_dev *pdev) +{ + u16 vf_id; + int pos; + + if (!pci_num_vf(pdev)) + return; + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV); + if (pos) { + struct pci_dev *vfdev; + + pci_read_config_word(pdev, pos + PCI_SRIOV_VF_DID, + &vf_id); + vfdev = pci_get_device(pdev->vendor, vf_id, NULL); + while (vfdev) { + if (vfdev->is_virtfn && vfdev->physfn == pdev) + pci_restore_msi_state(vfdev); + vfdev = pci_get_device(pdev->vendor, vf_id, + vfdev); + } + } +} + +/** + * ice_is_malicious_vf - helper function to detect a malicious VF + * @pf: ptr to struct ice_pf + * @event: pointer to the AQ event + * @num_msg_proc: the number of messages processed so far + * @num_msg_pending: the number of messages pending in admin queue + */ +bool +ice_is_malicious_vf(struct ice_pf *pf, struct ice_rq_event_info *event, + u16 num_msg_proc, u16 num_msg_pending) +{ + s16 vf_id = le16_to_cpu(event->desc.retval); + struct device *dev = ice_pf_to_dev(pf); + struct ice_mbx_data mbxdata; + enum ice_status status; + bool malvf = false; + struct ice_vf *vf; + + if (ice_validate_vf_id(pf, vf_id)) + return false; + + vf = &pf->vf[vf_id]; + /* Check if VF is disabled. */ + if (test_bit(ICE_VF_STATE_DIS, vf->vf_states)) + return false; + + mbxdata.num_msg_proc = num_msg_proc; + mbxdata.num_pending_arq = num_msg_pending; + mbxdata.max_num_msgs_mbx = pf->hw.mailboxq.num_rq_entries; +#define ICE_MBX_OVERFLOW_WATERMARK 64 + mbxdata.async_watermark_val = ICE_MBX_OVERFLOW_WATERMARK; + + /* check to see if we have a malicious VF */ + status = ice_mbx_vf_state_handler(&pf->hw, &mbxdata, vf_id, &malvf); + if (status) + return false; + + if (malvf) { + bool report_vf = false; + + /* if the VF is malicious and we haven't let the user + * know about it, then let them know now + */ + status = ice_mbx_report_malvf(&pf->hw, pf->malvfs, + ICE_MAX_VF_COUNT, vf_id, + &report_vf); + if (status) + dev_dbg(dev, "Error reporting malicious VF\n"); + + if (report_vf) { + struct ice_vsi *pf_vsi = ice_get_main_vsi(pf); + + if (pf_vsi) + dev_warn(dev, "VF MAC %pM on PF MAC %pM is generating asynchronous messages and may be overflowing the PF message queue.
Please see the Adapter User Guide for more information\n", + &vf->dev_lan_addr.addr[0], + pf_vsi->netdev->dev_addr); + } + + return true; + } + + /* if there was an error in detection or the VF is not malicious then + * return false + */ + return false; +} + +static void ice_dump_vf(struct ice_vf *vf) +{ + struct ice_vsi *vsi; + struct device *dev; + struct ice_pf *pf; + + if (!vf) + return; + + pf = vf->pf; + vsi = ice_get_vf_vsi(vf); + if (!vsi) + return; + + dev = ice_pf_to_dev(pf); + dev_info(dev, "VF[%d]:\n", vf->vf_id); + dev_info(dev, "\tvf_ver.major = %d vf_ver.minor = %d\n", + vf->vf_ver.major, vf->vf_ver.minor); + dev_info(dev, "\tdriver_caps = 0x%08x\n", vf->driver_caps); + dev_info(dev, "\tvf_caps = 0x%08lx\n", vf->vf_caps); + dev_info(dev, "\tvf_states:\n"); + if (test_bit(ICE_VF_STATE_INIT, vf->vf_states)) + dev_info(dev, "\t\tICE_VF_STATE_INIT\n"); + if (test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) + dev_info(dev, "\t\tICE_VF_STATE_ACTIVE\n"); + if (test_bit(ICE_VF_STATE_QS_ENA, vf->vf_states)) + dev_info(dev, "\t\tICE_VF_STATE_QS_ENA\n"); + if (test_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) + dev_info(dev, "\t\tICE_VF_STATE_MC_PROMISC\n"); + if (test_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states)) + dev_info(dev, "\t\tICE_VF_STATE_UC_PROMISC\n"); + dev_info(dev, "\tvsi = %pK, vsi->idx = %d, vsi->vsi_num = %d\n", + vsi, vsi->idx, vsi->vsi_num); + dev_info(dev, "\tlan_vsi_idx = %d\n", vf->lan_vsi_idx); + dev_info(dev, "\tlan_vsi_num = %d\n", vf->lan_vsi_num); + dev_info(dev, "\tnum_mac = %d\n", vf->num_mac); + dev_info(dev, "\tdev_lan_addr = %pM\n", &vf->dev_lan_addr.addr[0]); + dev_info(dev, "\thw_lan_addr = %pM\n", &vf->hw_lan_addr.addr[0]); + dev_info(dev, "\tnum_req_qs = %d\n", vf->num_req_qs); + dev_info(dev, "\trxq_ena = 0x%lx\n", *vf->rxq_ena); + dev_info(dev, "\ttxq_ena = 0x%lx\n", *vf->txq_ena); + dev_info(dev, "\tPort VLAN status: %s\n", + ice_vf_is_port_vlan_ena(vf) ? "enabled" : "disabled"); + dev_info(dev, "\t\tPort VLAN ID = %d\n", ice_vf_get_port_vlan_id(vf)); + dev_info(dev, "\t\tQoS = %d\n", ice_vf_get_port_vlan_prio(vf)); + dev_info(dev, "\t\tTPID = 0x%x", ice_vf_get_port_vlan_tpid(vf)); + dev_info(dev, "\tpf_set_mac = %s\n", vf->pf_set_mac ? "true" : "false"); + dev_info(dev, "\ttrusted = %s\n", vf->trusted ? "true" : "false"); + dev_info(dev, "\tspoofchk = %s\n", vf->spoofchk ? "true" : "false"); +#ifdef HAVE_NDO_SET_VF_LINK_STATE + dev_info(dev, "\tlink_forced = %s, link_up (only valid when link_forced is true) = %s\n", + vf->link_forced ? "true" : "false", + vf->link_up ? "true" : "false"); +#endif + dev_info(dev, "\tmax_tx_rate = %d\n", vf->max_tx_rate); + dev_info(dev, "\tmin_tx_rate = %d\n", vf->min_tx_rate); + dev_info(dev, "\tnum_inval_msgs = %lld\n", vf->num_inval_msgs); + dev_info(dev, "\tnum_valid_msgs = %lld\n", vf->num_valid_msgs); + dev_info(dev, "\tmdd_rx_events = %u\n", vf->mdd_rx_events.count); + dev_info(dev, "\tmdd_tx_events = %u\n", vf->mdd_tx_events.count); + dev_info(dev, "\tfirst_vector_idx = %d\n", vf->first_vector_idx); + dev_info(dev, "\tvf_sw_id = %pK\n", vf->vf_sw_id); + dev_info(dev, "\tadq_enabled = %s\n", + vf->adq_enabled ? "true" : "false"); + dev_info(dev, "\tadq_fltr_ena = %s\n", + vf->adq_fltr_ena ? 
"true" : "false"); + dev_info(dev, "\tnum_tc = %u\n", vf->num_tc); + dev_info(dev, "\tnum_dmac_chnl_fltrs = %u\n", vf->num_dmac_chnl_fltrs); +} + +void ice_dump_all_vfs(struct ice_pf *pf) +{ + u16 v; + + ice_for_each_vf(pf, v) + ice_dump_vf(&pf->vf[v]); +} diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h index 0d9880c8bba3..ba727a4c986e 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h @@ -1,18 +1,21 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_VIRTCHNL_PF_H_ #define _ICE_VIRTCHNL_PF_H_ #include "ice.h" +#include "ice_virtchnl_fdir.h" +#include "ice_dcf.h" +#include "ice_vsi_vlan_ops.h" -#define ICE_MAX_VLANID 4095 -#define ICE_VLAN_PRIORITY_S 12 -#define ICE_VLAN_M 0xFFF -#define ICE_PRIORITY_M 0x7000 +#define ICE_VIRTCHNL_SUPPORTED_QTYPES 2 /* Restrict number of MAC Addr and VLAN that non-trusted VF can programmed */ #define ICE_MAX_VLAN_PER_VF 8 -#define ICE_MAX_MACADDR_PER_VF 12 +/* MAC filters: 1 is reserved for the VF's default/perm_addr/LAA MAC, 1 for + * broadcast, and 16 for additional unicast/multicast filters + */ +#define ICE_MAX_MACADDR_PER_VF 18 /* Malicious Driver Detection */ #define ICE_DFLT_NUM_INVAL_MSGS_ALLOWED 10 @@ -26,18 +29,31 @@ #define ICE_PCI_CIAD_WAIT_COUNT 100 #define ICE_PCI_CIAD_WAIT_DELAY_US 1 -/* VF resources default values and limitation */ +/* VF resource constraints */ #define ICE_MAX_VF_COUNT 256 -#define ICE_MAX_QS_PER_VF 256 +#define ICE_MAX_QS_PER_VF 256 +/* Maximum number of queue pairs to configure by default for a VF */ +#define ICE_MAX_DFLT_QS_PER_VF 16 #define ICE_MIN_QS_PER_VF 1 -#define ICE_DFLT_QS_PER_VF 4 #define ICE_NONQ_VECS_VF 1 #define ICE_MAX_SCATTER_QS_PER_VF 16 -#define ICE_MAX_BASE_QS_PER_VF 16 -#define ICE_MAX_INTR_PER_VF 65 -#define ICE_MAX_POLICY_INTR_PER_VF 33 +#define ICE_MAX_RSS_QS_PER_LARGE_VF 64 +#define ICE_MAX_RSS_QS_PER_VF 16 +#define ICE_NUM_VF_MSIX_MAX 65 +#define ICE_NUM_VF_MSIX_LARGE 33 +#define ICE_NUM_VF_MSIX_MED 17 +#define ICE_NUM_VF_MSIX_SMALL 5 +#define ICE_NUM_VF_MSIX_MULTIQ_MIN 3 #define ICE_MIN_INTR_PER_VF (ICE_MIN_QS_PER_VF + 1) -#define ICE_DFLT_INTR_PER_VF (ICE_DFLT_QS_PER_VF + 1) +#define ICE_MAX_VF_RESET_TRIES 40 +#define ICE_MAX_VF_RESET_SLEEP_MS 20 +#define ICE_MAX_IPSEC_CAPABLE_VF_ID 127 + +#define ice_for_each_vf(pf, i) \ + for ((i) = 0; (i) < (pf)->num_alloc_vfs; (i)++) + +/* Max number of flexible descriptor rxdid */ +#define ICE_FLEX_DESC_RXDID_MAX_NUM 64 /* Specific VF states */ enum ice_vf_states { @@ -56,46 +72,240 @@ enum ice_virtchnl_cap { ICE_VIRTCHNL_VF_CAP_PRIVILEGE, }; +/* DDP package type */ +enum ice_pkg_type { + ICE_PKG_TYPE_UNKNOWN = 0, + ICE_PKG_TYPE_OS_DEFAULT, + ICE_PKG_TYPE_COMMS, + ICE_PKG_TYPE_WIRELESS_EDGE, + ICE_PKG_TYPE_GTP_OVER_GRE, + ICE_PKG_TYPE_END, +}; + +/* In ADQ, max 4 VSI's can be allocated per VF including primary VF VSI. + * These variables are used to store indices, ID's and number of queues + * for each VSI including that of primary VF VSI. Each Traffic class is + * termed as channel and each channel can in-turn have 4 queues which + * means max 16 queues overall per VF. 
+ */ +struct ice_channel_vf { + u16 vsi_idx; /* index in PF struct for all channel VSIs */ + u16 vsi_num; /* HW (absolute) index of this VSI */ + u16 num_qps; /* number of queue pairs requested by user */ + u16 offset; + u64 max_tx_rate; /* Tx rate limiting for channels */ + + /* type of filter: dest/src/dest+src port */ + u32 fltr_type; +}; + +struct ice_time_mac { + unsigned long time_modified; + u8 addr[ETH_ALEN]; +}; + +/* VF MDD events print structure */ +struct ice_mdd_vf_events { + u16 count; /* total count of Rx|Tx events */ + /* count number of the last printed event */ + u16 last_printed; +}; + +/* The VF VLAN information controlled by DCF */ +struct ice_dcf_vlan_info { + struct ice_vlan outer_port_vlan; + u16 outer_stripping_tpid; + u8 outer_stripping_ena:1; + u8 applying:1; +}; + +#define ICE_HASH_IP_CTX_IP 0 +#define ICE_HASH_IP_CTX_IP_ESP 1 +#define ICE_HASH_IP_CTX_IP_UDP_ESP 2 +#define ICE_HASH_IP_CTX_IP_AH 3 +#define ICE_HASH_IP_CTX_IP_L2TPV3 4 +#define ICE_HASH_IP_CTX_IP_PFCP 5 +#define ICE_HASH_IP_CTX_IP_UDP 6 +#define ICE_HASH_IP_CTX_IP_TCP 7 +#define ICE_HASH_IP_CTX_IP_SCTP 8 +#define ICE_HASH_IP_CTX_MAX 9 + +struct ice_vf_hash_ip_ctx { + struct ice_rss_hash_cfg ctx[ICE_HASH_IP_CTX_MAX]; +}; + +#define ICE_HASH_GTPU_CTX_EH_IP 0 +#define ICE_HASH_GTPU_CTX_EH_IP_UDP 1 +#define ICE_HASH_GTPU_CTX_EH_IP_TCP 2 +#define ICE_HASH_GTPU_CTX_UP_IP 3 +#define ICE_HASH_GTPU_CTX_UP_IP_UDP 4 +#define ICE_HASH_GTPU_CTX_UP_IP_TCP 5 +#define ICE_HASH_GTPU_CTX_DW_IP 6 +#define ICE_HASH_GTPU_CTX_DW_IP_UDP 7 +#define ICE_HASH_GTPU_CTX_DW_IP_TCP 8 +#define ICE_HASH_GTPU_CTX_MAX 9 + +struct ice_vf_hash_gtpu_ctx { + struct ice_rss_hash_cfg ctx[ICE_HASH_GTPU_CTX_MAX]; +}; + +struct ice_vf_hash_ctx { + struct ice_vf_hash_ip_ctx v4; + struct ice_vf_hash_ip_ctx v6; + struct ice_vf_hash_gtpu_ctx ipv4; + struct ice_vf_hash_gtpu_ctx ipv6; +}; + +struct ice_vf; + +struct ice_vc_vf_ops { + int (*get_ver_msg)(struct ice_vf *vf, u8 *msg); + int (*get_vf_res_msg)(struct ice_vf *vf, u8 *msg); + void (*reset_vf)(struct ice_vf *vf); + int (*add_mac_addr_msg)(struct ice_vf *vf, u8 *msg); + int (*del_mac_addr_msg)(struct ice_vf *vf, u8 *msg); + int (*cfg_qs_msg)(struct ice_vf *vf, u8 *msg); + int (*ena_qs_msg)(struct ice_vf *vf, u8 *msg); + int (*dis_qs_msg)(struct ice_vf *vf, u8 *msg); + int (*request_qs_msg)(struct ice_vf *vf, u8 *msg); + int (*cfg_irq_map_msg)(struct ice_vf *vf, u8 *msg); + int (*config_rss_key)(struct ice_vf *vf, u8 *msg); + int (*config_rss_lut)(struct ice_vf *vf, u8 *msg); + int (*get_stats_msg)(struct ice_vf *vf, u8 *msg); + int (*cfg_promiscuous_mode_msg)(struct ice_vf *vf, u8 *msg); + int (*add_vlan_msg)(struct ice_vf *vf, u8 *msg); + int (*remove_vlan_msg)(struct ice_vf *vf, u8 *msg); + int (*query_rxdid)(struct ice_vf *vf); + int (*get_rss_hena)(struct ice_vf *vf); + int (*set_rss_hena_msg)(struct ice_vf *vf, u8 *msg); + int (*ena_vlan_stripping)(struct ice_vf *vf); + int (*dis_vlan_stripping)(struct ice_vf *vf); +#ifdef HAVE_TC_SETUP_CLSFLOWER + int (*add_qch_msg)(struct ice_vf *vf, u8 *msg); + int (*add_switch_filter_msg)(struct ice_vf *vf, u8 *msg); + int (*del_switch_filter_msg)(struct ice_vf *vf, u8 *msg); + int (*del_qch_msg)(struct ice_vf *vf, u8 *msg); +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + int (*rdma_msg)(struct ice_vf *vf, u8 *msg, u16 msglen); + int (*cfg_rdma_irq_map_msg)(struct ice_vf *vf, u8 *msg); + int (*clear_rdma_irq_map)(struct ice_vf *vf); + int (*dcf_vlan_offload_msg)(struct ice_vf *vf, u8 *msg); + int (*dcf_cmd_desc_msg)(struct ice_vf *vf, u8 *msg, u16 
msglen); + int (*dcf_cmd_buff_msg)(struct ice_vf *vf, u8 *msg, u16 msglen); + int (*dis_dcf_cap)(struct ice_vf *vf); + int (*dcf_get_vsi_map)(struct ice_vf *vf); + int (*dcf_query_pkg_info)(struct ice_vf *vf); + int (*handle_rss_cfg_msg)(struct ice_vf *vf, u8 *msg, bool add); + int (*add_fdir_fltr_msg)(struct ice_vf *vf, u8 *msg); + int (*del_fdir_fltr_msg)(struct ice_vf *vf, u8 *msg); + int (*get_max_rss_qregion)(struct ice_vf *vf); + int (*ena_qs_v2_msg)(struct ice_vf *vf, u8 *msg); + int (*dis_qs_v2_msg)(struct ice_vf *vf, u8 *msg); + int (*map_q_vector_msg)(struct ice_vf *vf, u8 *msg); + int (*get_offload_vlan_v2_caps)(struct ice_vf *vf); + int (*add_vlan_v2_msg)(struct ice_vf *vf, u8 *msg); + int (*remove_vlan_v2_msg)(struct ice_vf *vf, u8 *msg); + int (*ena_vlan_stripping_v2_msg)(struct ice_vf *vf, u8 *msg); + int (*dis_vlan_stripping_v2_msg)(struct ice_vf *vf, u8 *msg); + int (*ena_vlan_insertion_v2_msg)(struct ice_vf *vf, u8 *msg); + int (*dis_vlan_insertion_v2_msg)(struct ice_vf *vf, u8 *msg); +}; + /* VF information structure */ struct ice_vf { struct ice_pf *pf; - s16 vf_id; /* VF ID in the PF space */ + + u16 vf_id; /* VF ID in the PF space */ u16 lan_vsi_idx; /* index into PF struct */ + u16 ctrl_vsi_idx; + struct ice_vf_fdir fdir; + struct ice_vf_hash_ctx hash_ctx; /* first vector index of this VF in the PF space */ int first_vector_idx; struct ice_sw *vf_sw_id; /* switch ID the VF VSIs connect to */ struct virtchnl_version_info vf_ver; u32 driver_caps; /* reported by VF driver */ - struct virtchnl_ether_addr dflt_lan_addr; - DECLARE_BITMAP(txq_ena, ICE_MAX_BASE_QS_PER_VF); - DECLARE_BITMAP(rxq_ena, ICE_MAX_BASE_QS_PER_VF); - u16 port_vlan_id; + u16 stag; /* VF Port Extender (PE) stag if used */ + struct virtchnl_ether_addr dev_lan_addr; + struct virtchnl_ether_addr hw_lan_addr; + struct ice_time_mac legacy_last_added_umac; + DECLARE_BITMAP(txq_ena, ICE_MAX_QS_PER_VF); + DECLARE_BITMAP(rxq_ena, ICE_MAX_QS_PER_VF); + struct ice_vlan port_vlan_info; /* Port VLAN ID, QoS, and TPID */ + struct virtchnl_vlan_caps vlan_v2_caps; + struct ice_dcf_vlan_info dcf_vlan_info; u8 pf_set_mac:1; /* VF MAC address set by VMM admin */ u8 trusted:1; u8 spoofchk:1; +#ifdef HAVE_NDO_SET_VF_LINK_STATE u8 link_forced:1; u8 link_up:1; /* only valid if VF link is forced */ +#endif /* VSI indices - actual VSI pointers are maintained in the PF structure * When assigned, these will be non-zero, because VSI 0 is always * the main LAN VSI for the PF. */ u16 lan_vsi_num; /* ID as used by firmware */ - unsigned int tx_rate; /* Tx bandwidth limit in Mbps */ + unsigned int min_tx_rate; /* Minimum Tx bandwidth limit in Mbps */ + unsigned int max_tx_rate; /* Maximum Tx bandwidth limit in Mbps */ DECLARE_BITMAP(vf_states, ICE_VF_STATES_NBITS); /* VF runtime states */ - u64 num_mdd_events; /* number of MDD events detected */ u64 num_inval_msgs; /* number of continuous invalid msgs */ u64 num_valid_msgs; /* number of valid msgs detected */ unsigned long vf_caps; /* VF's adv. 
capabilities */ - u8 num_req_qs; /* num of queue pairs requested by VF */ + u16 num_req_qs; /* num of queue pairs requested by VF */ u16 num_mac; - u16 num_vlan; u16 num_vf_qs; /* num of queue configured per VF */ - u16 num_qs_ena; /* total num of Tx/Rx queue enabled */ + u8 vlan_strip_ena; /* Outer and Inner VLAN strip enable */ +#define ICE_INNER_VLAN_STRIP_ENA BIT(0) +#define ICE_OUTER_VLAN_STRIP_ENA BIT(1) + /* ADQ related variables */ + u8 adq_enabled; /* flag to enable ADQ */ + u8 adq_fltr_ena; /* flag to denote that ADQ filters are applied */ + u8 num_tc; + u16 num_dmac_chnl_fltrs; + struct ice_channel_vf ch[VIRTCHNL_MAX_ADQ_V2_CHANNELS]; + struct hlist_head tc_flower_fltr_list; + struct ice_mdd_vf_events mdd_rx_events; + struct ice_mdd_vf_events mdd_tx_events; + struct ice_repr *repr; + DECLARE_BITMAP(opcodes_allowlist, VIRTCHNL_OP_MAX); + struct ice_vc_vf_ops vc_ops; + +#if IS_ENABLED(CONFIG_NET_DEVLINK) + /* devlink port data */ + struct devlink_port devlink_port; +#endif /* CONFIG_NET_DEVLINK */ }; +/** + * ice_vc_get_max_chnl_tc_allowed + * @vf: pointer to the VF info + * + * This function returns max channel TC allowed depends upon "driver_caps" + */ +static inline u32 ice_vc_get_max_chnl_tc_allowed(struct ice_vf *vf) +{ + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ_V2) + return VIRTCHNL_MAX_ADQ_V2_CHANNELS; + else + return VIRTCHNL_MAX_ADQ_CHANNELS; +} + +/** + * ice_vf_chnl_dmac_fltr_cnt - number of dmac based channel filters + * @vf: pointer to the VF info + */ +static inline u16 ice_vf_chnl_dmac_fltr_cnt(struct ice_vf *vf) +{ + return vf->num_dmac_chnl_fltrs; +} + + #ifdef CONFIG_PCI_IOV +void ice_dump_all_vfs(struct ice_pf *pf); +struct ice_vsi *ice_get_vf_vsi(struct ice_vf *vf); void ice_process_vflr_event(struct ice_pf *pf); int ice_sriov_configure(struct pci_dev *pdev, int num_vfs); int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac); @@ -104,30 +314,106 @@ ice_get_vf_cfg(struct net_device *netdev, int vf_id, struct ifla_vf_info *ivi); void ice_free_vfs(struct ice_pf *pf); void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event); + +/* VF configuration related iplink handlers */ void ice_vc_notify_link_state(struct ice_pf *pf); void ice_vc_notify_reset(struct ice_pf *pf); +void ice_vc_notify_vf_link_state(struct ice_vf *vf); +void ice_vc_change_ops_to_repr(struct ice_vc_vf_ops *ops); +void ice_vc_set_dflt_vf_ops(struct ice_vc_vf_ops *ops); bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr); +bool ice_reset_vf(struct ice_vf *vf, bool is_vflr); +void ice_restore_all_vfs_msi_state(struct pci_dev *pdev); +bool +ice_is_malicious_vf(struct ice_pf *pf, struct ice_rq_event_info *event, + u16 num_msg_proc, u16 num_msg_pending); + +#ifdef IFLA_VF_VLAN_INFO_MAX int ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos, __be16 vlan_proto); +#else +int +ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos); +#endif + +#ifdef HAVE_NDO_SET_VF_MIN_MAX_TX_RATE +int +ice_set_vf_bw(struct net_device *netdev, int vf_id, int min_tx_rate, + int max_tx_rate); +#else +int ice_set_vf_bw(struct net_device *netdev, int vf_id, int tx_rate); +#endif int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool trusted); +#ifdef HAVE_NDO_SET_VF_LINK_STATE int ice_set_vf_link_state(struct net_device *netdev, int vf_id, int link_state); +#endif + +int ice_check_vf_ready_for_cfg(struct ice_vf *vf); int ice_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool ena); -int ice_calc_vf_reg_idx(struct 
ice_vf *vf, struct ice_q_vector *q_vector); +int ice_calc_vf_reg_idx(struct ice_vf *vf, struct ice_q_vector *q_vector, + u8 tc); void ice_set_vf_state_qs_dis(struct ice_vf *vf); +#ifdef HAVE_VF_STATS +int +ice_get_vf_stats(struct net_device *netdev, int vf_id, + struct ifla_vf_stats *vf_stats); +#endif /* HAVE_VF_STATS */ +bool ice_is_any_vf_in_promisc(struct ice_pf *pf); +void +ice_vf_lan_overflow_event(struct ice_pf *pf, struct ice_rq_event_info *event); +void ice_print_vfs_mdd_events(struct ice_pf *pf); +void ice_print_vf_rx_mdd_event(struct ice_vf *vf); +enum ice_pkg_type ice_pkg_name_to_type(struct ice_hw *hw); +bool ice_vc_validate_pattern(struct ice_vf *vf, + struct virtchnl_proto_hdrs *proto); +struct ice_vsi *ice_vf_ctrl_vsi_setup(struct ice_vf *vf); +int +ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode, + enum virtchnl_status_code v_retval, u8 *msg, u16 msglen); +bool ice_vc_isvalid_vsi_id(struct ice_vf *vf, u16 vsi_id); +bool ice_vf_is_port_vlan_ena(struct ice_vf *vf); #else /* CONFIG_PCI_IOV */ -#define ice_process_vflr_event(pf) do {} while (0) -#define ice_free_vfs(pf) do {} while (0) -#define ice_vc_process_vf_msg(pf, event) do {} while (0) -#define ice_vc_notify_link_state(pf) do {} while (0) -#define ice_vc_notify_reset(pf) do {} while (0) -#define ice_set_vf_state_qs_dis(vf) do {} while (0) +#if IS_ENABLED(CONFIG_NET_DEVLINK) +static inline struct ice_vsi *ice_get_vf_vsi(struct ice_vf *vf) +{ + return NULL; +} +#endif /* CONFIG_NET_DEVLINK */ +static inline void ice_dump_all_vfs(struct ice_pf *pf) { } +static inline void ice_process_vflr_event(struct ice_pf *pf) { } +static inline void ice_free_vfs(struct ice_pf *pf) { } +static inline +void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event) { } +static inline void ice_vc_notify_link_state(struct ice_pf *pf) { } +static inline void ice_vc_notify_reset(struct ice_pf *pf) { } +static inline void ice_vc_notify_vf_link_state(struct ice_vf *vf) { } +static inline void ice_vc_change_ops_to_repr(struct ice_vc_vf_ops *ops) { } +static inline int ice_check_vf_ready_for_cfg(struct ice_vf *vf) +{ + return -EOPNOTSUPP; +} +static inline void ice_vc_set_dflt_vf_ops(struct ice_vc_vf_ops *ops) { } +static inline void ice_set_vf_state_qs_dis(struct ice_vf *vf) { } +static inline +void ice_vf_lan_overflow_event(struct ice_pf *pf, struct ice_rq_event_info *event) { } +static inline void ice_print_vfs_mdd_events(struct ice_pf *pf) { } +static inline void ice_print_vf_rx_mdd_event(struct ice_vf *vf) { } +static inline void ice_restore_all_vfs_msi_state(struct pci_dev *pdev) { } +static inline bool +ice_is_malicious_vf(struct ice_pf __always_unused *pf, + struct ice_rq_event_info __always_unused *event, + u16 __always_unused num_msg_proc, + u16 __always_unused num_msg_pending) +{ + return false; +} static inline bool ice_reset_all_vfs(struct ice_pf __always_unused *pf, @@ -136,6 +422,12 @@ ice_reset_all_vfs(struct ice_pf __always_unused *pf, return true; } +static inline bool +ice_reset_vf(struct ice_vf __always_unused *vf, bool __always_unused is_vflr) +{ + return true; +} + static inline int ice_sriov_configure(struct pci_dev __always_unused *pdev, int __always_unused num_vfs) @@ -158,13 +450,16 @@ ice_get_vf_cfg(struct net_device __always_unused *netdev, return -EOPNOTSUPP; } +#ifdef HAVE_NDO_SET_VF_TRUST static inline int ice_set_vf_trust(struct net_device __always_unused *netdev, int __always_unused vf_id, bool __always_unused trusted) { return -EOPNOTSUPP; } +#endif /* HAVE_NDO_SET_VF_TRUST */ +#ifdef 
IFLA_VF_VLAN_INFO_MAX static inline int ice_set_vf_port_vlan(struct net_device __always_unused *netdev, int __always_unused vf_id, u16 __always_unused vid, @@ -172,6 +467,15 @@ ice_set_vf_port_vlan(struct net_device __always_unused *netdev, { return -EOPNOTSUPP; } +#else +static inline int +ice_set_vf_port_vlan(struct net_device __always_unused *netdev, + int __always_unused vf_id, u16 __always_unused vid, + u8 __always_unused qos) +{ + return -EOPNOTSUPP; +} +#endif /* IFLA_VF_VLAN_INFO_MAX */ static inline int ice_set_vf_spoofchk(struct net_device __always_unused *netdev, @@ -180,18 +484,77 @@ ice_set_vf_spoofchk(struct net_device __always_unused *netdev, return -EOPNOTSUPP; } +#ifdef HAVE_NDO_SET_VF_LINK_STATE static inline int ice_set_vf_link_state(struct net_device __always_unused *netdev, int __always_unused vf_id, int __always_unused link_state) { return -EOPNOTSUPP; } +#endif /* HAVE_NDO_SET_VF_LINK_STATE */ + +#ifdef HAVE_NDO_SET_VF_MIN_MAX_TX_RATE +static inline int +ice_set_vf_bw(struct net_device __always_unused *netdev, + int __always_unused vf_id, int __always_unused min_tx_rate, + int __always_unused max_tx_rate) +#else +static inline int +ice_set_vf_bw(struct net_device __always_unused *netdev, + int __always_unused vf_id, int __always_unused max_tx_rate) +#endif +{ + return -EOPNOTSUPP; +} static inline int ice_calc_vf_reg_idx(struct ice_vf __always_unused *vf, - struct ice_q_vector __always_unused *q_vector) + struct ice_q_vector __always_unused *q_vector, + u8 __always_unused tc) { return 0; } + +#ifdef HAVE_VF_STATS +static inline int +ice_get_vf_stats(struct net_device __always_unused *netdev, + int __always_unused vf_id, + struct ifla_vf_stats __always_unused *vf_stats) +{ + return -EOPNOTSUPP; +} +#endif /* HAVE_VF_STATS */ + +static inline bool ice_is_any_vf_in_promisc(struct ice_pf __always_unused *pf) +{ + return false; +} + +static inline enum ice_pkg_type ice_pkg_name_to_type(struct ice_hw *hw) +{ + return ICE_PKG_TYPE_UNKNOWN; +} + +static inline struct ice_vsi * +ice_vf_ctrl_vsi_setup(struct ice_vf __always_unused *vf) +{ + return NULL; +} + +static inline int +ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode, + enum virtchnl_status_code v_retval, u8 *msg, u16 msglen) +{ + return 0; +} + +static inline bool ice_vc_isvalid_vsi_id(struct ice_vf *vf, u16 vsi_id) +{ + return 0; +} +static inline bool ice_vf_is_port_vlan_ena(struct ice_vf __always_unused *vf) +{ + return false; +} #endif /* CONFIG_PCI_IOV */ #endif /* _ICE_VIRTCHNL_PF_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_vlan.h b/drivers/net/ethernet/intel/ice/ice_vlan.h new file mode 100644 index 000000000000..69b78750c1f5 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_vlan.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_VLAN_H_ +#define _ICE_VLAN_H_ + +#include +#include "ice_type.h" + +struct ice_vlan { + u16 tpid; + u16 vid; + u8 prio; + enum ice_sw_fwd_act_type fwd_act; +}; + +#endif /* _ICE_VLAN_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_vlan_mode.c b/drivers/net/ethernet/intel/ice/ice_vlan_mode.c new file mode 100644 index 000000000000..5cddf9ec042f --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_vlan_mode.c @@ -0,0 +1,443 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice_common.h" + +/** + * ice_pkg_get_supported_vlan_mode - chk if DDP supports Double VLAN mode (DVM) + * @hw: pointer to the HW struct + * @dvm: output variable to determine if DDP supports DVM(true) or SVM(false) + */ +static enum ice_status +ice_pkg_get_supported_vlan_mode(struct ice_hw *hw, bool *dvm) +{ + u16 meta_init_size = sizeof(struct ice_meta_init_section); + struct ice_meta_init_section *sect; + struct ice_buf_build *bld; + enum ice_status status; + + /* if anything fails, we assume there is no DVM support */ + *dvm = false; + + bld = ice_pkg_buf_alloc_single_section(hw, + ICE_SID_RXPARSER_METADATA_INIT, + meta_init_size, (void **)§); + if (!bld) + return ICE_ERR_NO_MEMORY; + + /* only need to read a single section */ + sect->count = cpu_to_le16(1); + sect->offset = cpu_to_le16(ICE_META_VLAN_MODE_ENTRY); + + status = ice_aq_upload_section(hw, + (struct ice_buf_hdr *)ice_pkg_buf(bld), + ICE_PKG_BUF_SIZE, NULL); + if (!status) { + DECLARE_BITMAP(entry, ICE_META_INIT_BITS); + u32 arr[ICE_META_INIT_DW_CNT]; + u16 i; + + /* convert to host bitmap format */ + for (i = 0; i < ICE_META_INIT_DW_CNT; i++) + arr[i] = le32_to_cpu(sect->entry[0].bm[i]); + + bitmap_from_arr32(entry, arr, (u16)ICE_META_INIT_BITS); + + /* check if DVM is supported */ + *dvm = test_bit(ICE_META_VLAN_MODE_BIT, entry); + } + + ice_pkg_buf_free(hw, bld); + + return status; +} + +/** + * ice_aq_get_vlan_mode - get the VLAN mode of the device + * @hw: pointer to the HW structure + * @get_params: structure FW fills in based on the current VLAN mode config + * + * Get VLAN Mode Parameters (0x020D) + */ +static enum ice_status +ice_aq_get_vlan_mode(struct ice_hw *hw, + struct ice_aqc_get_vlan_mode *get_params) +{ + struct ice_aq_desc desc; + + if (!get_params) + return ICE_ERR_PARAM; + + ice_fill_dflt_direct_cmd_desc(&desc, + ice_aqc_opc_get_vlan_mode_parameters); + + return ice_aq_send_cmd(hw, &desc, get_params, sizeof(*get_params), + NULL); +} + +/** + * ice_aq_is_dvm_ena - query FW to check if double VLAN mode is enabled + * @hw: pointer to the HW structure + * + * Returns true if the hardware/firmware is configured in double VLAN mode, + * else return false signaling that the hardware/firmware is configured in + * single VLAN mode. + * + * Also, return false if this call fails for any reason (i.e. firmware doesn't + * support this AQ call). + */ +static bool ice_aq_is_dvm_ena(struct ice_hw *hw) +{ + struct ice_aqc_get_vlan_mode get_params = { 0 }; + enum ice_status status; + + status = ice_aq_get_vlan_mode(hw, &get_params); + if (status) { + ice_debug(hw, ICE_DBG_AQ, "Failed to get VLAN mode, status %d\n", + status); + return false; + } + + return (get_params.vlan_mode & ICE_AQ_VLAN_MODE_DVM_ENA); +} + +/** + * ice_is_dvm_ena - check if double VLAN mode is enabled + * @hw: pointer to the HW structure + * + * The device is configured in single or double VLAN mode on initialization and + * this cannot be dynamically changed during runtime. Based on this there is no + * need to make an AQ call every time the driver needs to know the VLAN mode. + * Instead, use the cached VLAN mode. + */ +bool ice_is_dvm_ena(struct ice_hw *hw) +{ + return hw->dvm_ena; +} + +/** + * ice_cache_vlan_mode - cache VLAN mode after DDP is downloaded + * @hw: pointer to the HW structure + * + * This is only called after downloading the DDP and after the global + * configuration lock has been released because all ports on a device need to + * cache the VLAN mode. 
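+ *
+ * Each PF reaches this via ice_post_pkg_dwnld_vlan_mode_cfg() so that later
+ * ice_is_dvm_ena() calls can use the cached hw->dvm_ena value instead of
+ * issuing another Get VLAN Mode Parameters AQ command.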
+ */ +static void ice_cache_vlan_mode(struct ice_hw *hw) +{ + hw->dvm_ena = ice_aq_is_dvm_ena(hw) ? true : false; +} + +/** + * ice_pkg_supports_dvm - find out if DDP supports DVM + * @hw: pointer to the HW structure + */ +static bool ice_pkg_supports_dvm(struct ice_hw *hw) +{ + enum ice_status status; + bool pkg_supports_dvm; + + status = ice_pkg_get_supported_vlan_mode(hw, &pkg_supports_dvm); + if (status) { + ice_debug(hw, ICE_DBG_PKG, "Failed to get supported VLAN mode, status %d\n", + status); + return false; + } + + return pkg_supports_dvm; +} + +/** + * ice_fw_supports_dvm - find out if FW supports DVM + * @hw: pointer to the HW structure + */ +static bool ice_fw_supports_dvm(struct ice_hw *hw) +{ + struct ice_aqc_get_vlan_mode get_vlan_mode = { 0 }; + enum ice_status status; + + /* If firmware returns success, then it supports DVM, else it only + * supports SVM + */ + status = ice_aq_get_vlan_mode(hw, &get_vlan_mode); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to get VLAN mode, status %d\n", + status); + return false; + } + + return true; +} + +/** + * ice_is_dvm_supported - check if Double VLAN Mode is supported + * @hw: pointer to the hardware structure + * + * Returns true if Double VLAN Mode (DVM) is supported and false if only Single + * VLAN Mode (SVM) is supported. In order for DVM to be supported the DDP and + * firmware must support it, otherwise only SVM is supported. This function + * should only be called while the global config lock is held and after the + * package has been successfully downloaded. + */ +static bool ice_is_dvm_supported(struct ice_hw *hw) +{ + if (!ice_pkg_supports_dvm(hw)) { + ice_debug(hw, ICE_DBG_PKG, "DDP doesn't support DVM\n"); + return false; + } + + if (!ice_fw_supports_dvm(hw)) { + ice_debug(hw, ICE_DBG_PKG, "FW doesn't support DVM\n"); + return false; + } + + return true; +} + +#define ICE_EXTERNAL_VLAN_ID_FV_IDX 11 +#define ICE_SW_LKUP_VLAN_LOC_LKUP_IDX 1 +#define ICE_SW_LKUP_VLAN_PKT_FLAGS_LKUP_IDX 2 +#define ICE_SW_LKUP_PROMISC_VLAN_LOC_LKUP_IDX 2 +#define ICE_PKT_FLAGS_0_TO_15_FV_IDX 1 +#define ICE_PKT_FLAGS_0_TO_15_VLAN_FLAGS_MASK 0xD000 +static struct ice_update_recipe_lkup_idx_params ice_dvm_dflt_recipes[] = { + { + /* Update recipe ICE_SW_LKUP_VLAN to filter based on the + * outer/single VLAN in DVM + */ + .rid = ICE_SW_LKUP_VLAN, + .fv_idx = ICE_EXTERNAL_VLAN_ID_FV_IDX, + .ignore_valid = true, + .mask = 0, + .mask_valid = false, /* use pre-existing mask */ + .lkup_idx = ICE_SW_LKUP_VLAN_LOC_LKUP_IDX, + }, + { + /* Update recipe ICE_SW_LKUP_VLAN to filter based on the VLAN + * packet flags to support VLAN filtering on multiple VLAN + * ethertypes (i.e. 
0x8100 and 0x88a8) in DVM + */ + .rid = ICE_SW_LKUP_VLAN, + .fv_idx = ICE_PKT_FLAGS_0_TO_15_FV_IDX, + .ignore_valid = false, + .mask = ICE_PKT_FLAGS_0_TO_15_VLAN_FLAGS_MASK, + .mask_valid = true, + .lkup_idx = ICE_SW_LKUP_VLAN_PKT_FLAGS_LKUP_IDX, + }, + { + /* Update recipe ICE_SW_LKUP_PROMISC_VLAN to filter based on the + * outer/single VLAN in DVM + */ + .rid = ICE_SW_LKUP_PROMISC_VLAN, + .fv_idx = ICE_EXTERNAL_VLAN_ID_FV_IDX, + .ignore_valid = true, + .mask = 0, + .mask_valid = false, /* use pre-existing mask */ + .lkup_idx = ICE_SW_LKUP_PROMISC_VLAN_LOC_LKUP_IDX, + }, +}; + +/** + * ice_dvm_update_dflt_recipes - update default switch recipes in DVM + * @hw: hardware structure used to update the recipes + */ +static enum ice_status ice_dvm_update_dflt_recipes(struct ice_hw *hw) +{ + unsigned long i; + + for (i = 0; i < ARRAY_SIZE(ice_dvm_dflt_recipes); i++) { + struct ice_update_recipe_lkup_idx_params *params; + enum ice_status status; + + params = &ice_dvm_dflt_recipes[i]; + + status = ice_update_recipe_lkup_idx(hw, params); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to update RID %d lkup_idx %d fv_idx %d mask_valid %s mask 0x%04x\n", + params->rid, params->lkup_idx, params->fv_idx, + params->mask_valid ? "true" : "false", + params->mask); + return status; + } + } + + return 0; +} + +/** + * ice_aq_set_vlan_mode - set the VLAN mode of the device + * @hw: pointer to the HW structure + * @set_params: requested VLAN mode configuration + * + * Set VLAN Mode Parameters (0x020C) + */ +static enum ice_status +ice_aq_set_vlan_mode(struct ice_hw *hw, + struct ice_aqc_set_vlan_mode *set_params) +{ + u8 rdma_packet, mng_vlan_prot_id; + struct ice_aq_desc desc; + + if (!set_params) + return ICE_ERR_PARAM; + + if (set_params->l2tag_prio_tagging > ICE_AQ_VLAN_PRIO_TAG_MAX) + return ICE_ERR_PARAM; + + rdma_packet = set_params->rdma_packet; + if (rdma_packet != ICE_AQ_SVM_VLAN_RDMA_PKT_FLAG_SETTING && + rdma_packet != ICE_AQ_DVM_VLAN_RDMA_PKT_FLAG_SETTING) + return ICE_ERR_PARAM; + + mng_vlan_prot_id = set_params->mng_vlan_prot_id; + if (mng_vlan_prot_id != ICE_AQ_VLAN_MNG_PROTOCOL_ID_OUTER && + mng_vlan_prot_id != ICE_AQ_VLAN_MNG_PROTOCOL_ID_INNER) + return ICE_ERR_PARAM; + + ice_fill_dflt_direct_cmd_desc(&desc, + ice_aqc_opc_set_vlan_mode_parameters); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + return ice_aq_send_cmd(hw, &desc, set_params, sizeof(*set_params), + NULL); +} + +/** + * ice_set_dvm - sets up software and hardware for double VLAN mode + * @hw: pointer to the hardware structure + */ +static enum ice_status ice_set_dvm(struct ice_hw *hw) +{ + struct ice_aqc_set_vlan_mode params = { 0 }; + enum ice_status status; + + params.l2tag_prio_tagging = ICE_AQ_VLAN_PRIO_TAG_OUTER_CTAG; + params.rdma_packet = ICE_AQ_DVM_VLAN_RDMA_PKT_FLAG_SETTING; + params.mng_vlan_prot_id = ICE_AQ_VLAN_MNG_PROTOCOL_ID_OUTER; + + status = ice_aq_set_vlan_mode(hw, ¶ms); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to set double VLAN mode parameters, status %d\n", + status); + return status; + } + + status = ice_dvm_update_dflt_recipes(hw); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to update default recipes for double VLAN mode, status %d\n", + status); + return status; + } + + status = ice_aq_set_port_params(hw->port_info, 0, false, false, true, + NULL); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to set port in double VLAN mode, status %d\n", + status); + return status; + } + + status = ice_set_dvm_boost_entries(hw); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed 
to set boost TCAM entries for double VLAN mode, status %d\n", + status); + return status; + } + + return 0; +} + +/** + * ice_set_svm - set single VLAN mode + * @hw: pointer to the HW structure + */ +static enum ice_status ice_set_svm(struct ice_hw *hw) +{ + struct ice_aqc_set_vlan_mode *set_params; + enum ice_status status; + + status = ice_aq_set_port_params(hw->port_info, 0, false, false, false, NULL); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to set port parameters for single VLAN mode\n"); + return status; + } + + set_params = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*set_params), + GFP_KERNEL); + if (!set_params) + return ICE_ERR_NO_MEMORY; + + /* default configuration for SVM configurations */ + set_params->l2tag_prio_tagging = ICE_AQ_VLAN_PRIO_TAG_INNER_CTAG; + set_params->rdma_packet = ICE_AQ_SVM_VLAN_RDMA_PKT_FLAG_SETTING; + set_params->mng_vlan_prot_id = ICE_AQ_VLAN_MNG_PROTOCOL_ID_INNER; + + status = ice_aq_set_vlan_mode(hw, set_params); + if (status) + ice_debug(hw, ICE_DBG_INIT, "Failed to configure port in single VLAN mode\n"); + + devm_kfree(ice_hw_to_dev(hw), set_params); + return status; +} + +/** + * ice_set_vlan_mode + * @hw: pointer to the HW structure + */ +enum ice_status ice_set_vlan_mode(struct ice_hw *hw) +{ + if (!ice_is_dvm_supported(hw)) + return 0; + + if (!ice_set_dvm(hw)) + return 0; + + return ice_set_svm(hw); +} + +/** + * ice_print_dvm_not_supported - print if DDP and/or FW doesn't support DVM + * @hw: pointer to the HW structure + * + * The purpose of this function is to print that QinQ is not supported due to + * incompatibilty from the DDP and/or FW. This will give a hint to the user to + * update one and/or both components if they expect QinQ functionality. + */ +static void ice_print_dvm_not_supported(struct ice_hw *hw) +{ + bool pkg_supports_dvm = ice_pkg_supports_dvm(hw); + bool fw_supports_dvm = ice_fw_supports_dvm(hw); + + if (!fw_supports_dvm && !pkg_supports_dvm) + dev_info(ice_hw_to_dev(hw), + "QinQ functionality cannot be enabled on this device. Update your DDP package and NVM to versions that support QinQ.\n"); + else if (!pkg_supports_dvm) + dev_info(ice_hw_to_dev(hw), + "QinQ functionality cannot be enabled on this device. Update your DDP package to a version that supports QinQ.\n"); + else if (!fw_supports_dvm) + dev_info(ice_hw_to_dev(hw), + "QinQ functionality cannot be enabled on this device. Update your NVM to a version that supports QinQ.\n"); +} + +/** + * ice_post_pkg_dwnld_vlan_mode_cfg - configure VLAN mode after DDP download + * @hw: pointer to the HW structure + * + * This function is meant to configure any VLAN mode specific functionality + * after the global configuration lock has been released and the DDP has been + * downloaded. + * + * Since only one PF downloads the DDP and configures the VLAN mode there needs + * to be a way to configure the other PFs after the DDP has been downloaded and + * the global configuration lock has been released. All such code should go in + * this function. + */ +void ice_post_pkg_dwnld_vlan_mode_cfg(struct ice_hw *hw) +{ + ice_cache_vlan_mode(hw); + + if (ice_is_dvm_ena(hw)) + ice_change_proto_id_to_dvm(); + else + ice_print_dvm_not_supported(hw); +} diff --git a/drivers/net/ethernet/intel/ice/ice_vlan_mode.h b/drivers/net/ethernet/intel/ice/ice_vlan_mode.h new file mode 100644 index 000000000000..5072529c1e03 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_vlan_mode.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#ifndef _ICE_VLAN_MODE_H_ +#define _ICE_VLAN_MODE_H_ + +#include "ice_osdep.h" + +struct ice_hw; + +bool ice_is_dvm_ena(struct ice_hw *hw); +enum ice_status ice_set_vlan_mode(struct ice_hw *hw); +void ice_post_pkg_dwnld_vlan_mode_cfg(struct ice_hw *hw); + +#endif /* _ICE_VLAN_MODE_H */ diff --git a/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c new file mode 100644 index 000000000000..705e496b7f74 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c @@ -0,0 +1,744 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice_vsi_vlan_lib.h" +#include "ice_lib.h" +#include "ice_fltr.h" +#include "ice.h" + +static void print_invalid_tpid(struct ice_vsi *vsi, u16 tpid) +{ + dev_err(ice_pf_to_dev(vsi->back), "%s %d specified invalid VLAN tpid 0x%04x\n", + ice_vsi_type_str(vsi->type), vsi->idx, tpid); +} + +/** + * validate_vlan - check if the ice_vlan passed in is valid + * @vsi: VSI used for printing error message + * @vlan: ice_vlan structure to validate + * + * Return true if the VLAN TPID is valid or if the VLAN TPID is 0 and the VLAN + * VID is 0, which allows for non-zero VLAN filters with the specified VLAN TPID + * and untagged VLAN 0 filtersto be added to the prune list respectively. + */ +static bool validate_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + if (vlan->tpid != ETH_P_8021Q && vlan->tpid != ETH_P_8021AD && + vlan->tpid != ETH_P_QINQ1 && (vlan->tpid || vlan->vid)) { + print_invalid_tpid(vsi, vlan->tpid); + return false; + } + + return true; +} + +/** + * ice_vsi_add_vlan - default add VLAN implementation for all VSI types + * @vsi: VSI being configured + * @vlan: VLAN filter to add + */ +int ice_vsi_add_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + enum ice_status status; + int err = 0; + + if (!validate_vlan(vsi, vlan)) + return -EINVAL; + + status = ice_fltr_add_vlan(vsi, vlan); + if (status && status != ICE_ERR_ALREADY_EXISTS) { + err = -ENODEV; + dev_err(ice_pf_to_dev(vsi->back), "Failure Adding VLAN %d on VSI %i, status %s\n", + vlan->vid, vsi->vsi_num, ice_stat_str(status)); + } else { + vsi->num_vlan++; + } + + return err; +} + +/** + * ice_vsi_del_vlan - default del VLAN implementation for all VSI types + * @vsi: VSI being configured + * @vlan: VLAN filter to delete + */ +int ice_vsi_del_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + struct ice_pf *pf = vsi->back; + enum ice_status status; + struct device *dev; + int err = 0; + + dev = ice_pf_to_dev(pf); + + if (!validate_vlan(vsi, vlan)) + return -EINVAL; + + status = ice_fltr_remove_vlan(vsi, vlan); + if (!status) { + vsi->num_vlan--; + } else if (status != ICE_ERR_DOES_NOT_EXIST && + status != ICE_ERR_RESET_ONGOING) { + dev_err(dev, "Error removing VLAN %d on VSI %i error: %s\n", + vlan->vid, vsi->vsi_num, ice_stat_str(status)); + err = ice_status_to_errno(status); + } + + return err; +} + +/** + * ice_vsi_manage_vlan_insertion - Manage VLAN insertion for the VSI for Tx + * @vsi: the VSI being changed + */ +static int ice_vsi_manage_vlan_insertion(struct ice_vsi *vsi) +{ + struct ice_hw *hw = &vsi->back->hw; + struct ice_vsi_ctx *ctxt; + enum ice_status status; + int err = 0; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + /* Here we are configuring the VSI to let the driver add VLAN tags by + * setting inner_vlan_flags to ICE_AQ_VSI_INNER_VLAN_TX_MODE_ALL. The actual VLAN tag + * insertion happens in the Tx hot path, in ice_tx_map. 
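+	 * There the tag taken from the skb is written into the L2TAG1 field of
+	 * the Tx data descriptor.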
+ */ + ctxt->info.inner_vlan_flags = ICE_AQ_VSI_INNER_VLAN_TX_MODE_ALL; + + /* Preserve existing VLAN strip setting */ + ctxt->info.inner_vlan_flags |= (vsi->info.inner_vlan_flags & + ICE_AQ_VSI_INNER_VLAN_EMODE_M); + + ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID); + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "update VSI for VLAN insert failed, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + err = -EIO; + goto out; + } + + vsi->info.inner_vlan_flags = ctxt->info.inner_vlan_flags; +out: + kfree(ctxt); + return err; +} + +/** + * ice_vsi_manage_vlan_stripping - Manage VLAN stripping for the VSI for Rx + * @vsi: the VSI being changed + * @ena: boolean value indicating if this is a enable or disable request + */ +static int ice_vsi_manage_vlan_stripping(struct ice_vsi *vsi, bool ena) +{ + struct ice_hw *hw = &vsi->back->hw; + struct ice_vsi_ctx *ctxt; + enum ice_status status; + int err = 0; + + /* do not allow modifying VLAN stripping when a port VLAN is configured + * on this VSI + */ + if (vsi->info.port_based_inner_vlan) + return 0; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + /* Here we are configuring what the VSI should do with the VLAN tag in + * the Rx packet. We can either leave the tag in the packet or put it in + * the Rx descriptor. + */ + if (ena) + /* Strip VLAN tag from Rx packet and put it in the desc */ + ctxt->info.inner_vlan_flags = ICE_AQ_VSI_INNER_VLAN_EMODE_STR_BOTH; + else + /* Disable stripping. Leave tag in packet */ + ctxt->info.inner_vlan_flags = ICE_AQ_VSI_INNER_VLAN_EMODE_NOTHING; + + /* Allow all packets untagged/tagged */ + ctxt->info.inner_vlan_flags |= ICE_AQ_VSI_INNER_VLAN_TX_MODE_ALL; + + ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID); + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "update VSI for VLAN strip failed, ena = %d err %s aq_err %s\n", + ena, ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + err = -EIO; + goto out; + } + + vsi->info.inner_vlan_flags = ctxt->info.inner_vlan_flags; +out: + kfree(ctxt); + return err; +} + +int ice_vsi_ena_inner_stripping(struct ice_vsi *vsi, const u16 tpid) +{ + if (tpid != ETH_P_8021Q) { + print_invalid_tpid(vsi, tpid); + return -EINVAL; + } + + return ice_vsi_manage_vlan_stripping(vsi, true); +} + +int ice_vsi_dis_inner_stripping(struct ice_vsi *vsi) +{ + return ice_vsi_manage_vlan_stripping(vsi, false); +} + +int ice_vsi_ena_inner_insertion(struct ice_vsi *vsi, const u16 tpid) +{ + if (tpid != ETH_P_8021Q) { + print_invalid_tpid(vsi, tpid); + return -EINVAL; + } + + return ice_vsi_manage_vlan_insertion(vsi); +} + +int ice_vsi_dis_inner_insertion(struct ice_vsi *vsi) +{ + return ice_vsi_manage_vlan_insertion(vsi); +} + +/** + * __ice_vsi_set_inner_port_vlan - set port VLAN VSI context settings to enable a port VLAN + * @vsi: the VSI to update + * @pvid_info: VLAN ID and QoS used to set the PVID VSI context field + */ +static int __ice_vsi_set_inner_port_vlan(struct ice_vsi *vsi, u16 pvid_info) +{ + struct ice_hw *hw = &vsi->back->hw; + struct ice_aqc_vsi_props *info; + struct ice_vsi_ctx *ctxt; + enum ice_status status; + int ret = 0; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + ctxt->info = vsi->info; + info = &ctxt->info; + info->inner_vlan_flags = ICE_AQ_VSI_INNER_VLAN_TX_MODE_ACCEPTUNTAGGED | + 
ICE_AQ_VSI_INNER_VLAN_INSERT_PVID | + ICE_AQ_VSI_INNER_VLAN_EMODE_STR; + info->sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; + + info->port_based_inner_vlan = cpu_to_le16(pvid_info); + info->valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID | + ICE_AQ_VSI_PROP_SW_VALID); + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_info(ice_hw_to_dev(hw), "update VSI for port VLAN failed, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + ret = -EIO; + goto out; + } + + vsi->info.inner_vlan_flags = info->inner_vlan_flags; + vsi->info.sw_flags2 = info->sw_flags2; + vsi->info.port_based_inner_vlan = info->port_based_inner_vlan; +out: + kfree(ctxt); + return ret; +} + +int ice_vsi_set_inner_port_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + u16 port_vlan_info; + + if (vlan->tpid != ETH_P_8021Q) + return -EINVAL; + + if (vlan->prio > 7) + return -EINVAL; + + port_vlan_info = vlan->vid | (vlan->prio << VLAN_PRIO_SHIFT); + + return __ice_vsi_set_inner_port_vlan(vsi, port_vlan_info); +} + +/** + * ice_cfg_vlan_pruning - enable or disable VLAN pruning on the VSI + * @vsi: VSI to enable or disable VLAN pruning on + * @ena: set to true to enable VLAN pruning and false to disable it + * + * returns 0 if VSI is updated, negative otherwise + */ +static int ice_cfg_vlan_pruning(struct ice_vsi *vsi, bool ena) +{ + struct ice_vsi_ctx *ctxt; + enum ice_status status; + struct ice_pf *pf; + + if (!vsi) + return -EINVAL; + + /* Don't enable VLAN pruning if the netdev is currently in promiscuous + * mode. VLAN pruning will be enabled when the interface exits + * promiscuous mode if any VLAN filters are active. + */ + if (vsi->netdev && vsi->netdev->flags & IFF_PROMISC && ena) + return 0; + + pf = vsi->back; + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + ctxt->info = vsi->info; + + if (ena) + ctxt->info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; + else + ctxt->info.sw_flags2 &= ~ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; + + ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SW_VALID); + + status = ice_update_vsi(&pf->hw, vsi->idx, ctxt, NULL); + if (status) { + netdev_err(vsi->netdev, "%sabling VLAN pruning on VSI handle: %d, VSI HW ID: %d failed, err = %s, aq_err = %s\n", + ena ? "En" : "Dis", vsi->idx, vsi->vsi_num, + ice_stat_str(status), + ice_aq_str(pf->hw.adminq.sq_last_status)); + goto err_out; + } + + vsi->info.sw_flags2 = ctxt->info.sw_flags2; + + kfree(ctxt); + return 0; + +err_out: + kfree(ctxt); + return -EIO; +} + +int ice_vsi_ena_rx_vlan_filtering(struct ice_vsi *vsi) +{ + return ice_cfg_vlan_pruning(vsi, true); +} + +int ice_vsi_dis_rx_vlan_filtering(struct ice_vsi *vsi) +{ + return ice_cfg_vlan_pruning(vsi, false); +} + +static int ice_cfg_vlan_antispoof(struct ice_vsi *vsi, bool enable) +{ + struct ice_vsi_ctx *ctx; + enum ice_status status; + int err = 0; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->info.sec_flags = vsi->info.sec_flags; + ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID); + + if (enable) + ctx->info.sec_flags |= ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << + ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S; + else + ctx->info.sec_flags &= ~(ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << + ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S); + + status = ice_update_vsi(&vsi->back->hw, vsi->idx, ctx, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "Failed to configure Tx VLAN anti-spoof %s for VSI %d, error %s\n", + enable ? 
"ON" : "OFF", vsi->vsi_num, + ice_stat_str(status)); + err = ice_status_to_errno(status); + } else { + vsi->info.sec_flags = ctx->info.sec_flags; + } + + kfree(ctx); + + return err; +} + +int ice_vsi_ena_tx_vlan_filtering(struct ice_vsi *vsi) +{ + return ice_cfg_vlan_antispoof(vsi, true); +} + +int ice_vsi_dis_tx_vlan_filtering(struct ice_vsi *vsi) +{ + return ice_cfg_vlan_antispoof(vsi, false); +} + +/** + * tpid_to_vsi_outer_vlan_type - convert from TPID to VSI context based tag_type + * @tpid: tpid used to translate into VSI context based tag_type + * @tag_type: output variable to hold the VSI context based tag type + */ +static int tpid_to_vsi_outer_vlan_type(u16 tpid, u8 *tag_type) +{ + switch (tpid) { + case ETH_P_8021Q: + *tag_type = ICE_AQ_VSI_OUTER_TAG_VLAN_8100; + break; + case ETH_P_8021AD: + *tag_type = ICE_AQ_VSI_OUTER_TAG_STAG; + break; + case ETH_P_QINQ1: + *tag_type = ICE_AQ_VSI_OUTER_TAG_VLAN_9100; + break; + default: + *tag_type = 0; + return -EINVAL; + } + + return 0; +} + +/** + * ice_vsi_ena_outer_stripping - enable outer VLAN stripping + * @vsi: VSI to configure + * @tpid: TPID to enable outer VLAN stripping for + * + * Enable outer VLAN stripping via VSI context. This function should only be + * used if DVM is supported. Also, this function should never be called directly + * as it should be part of ice_vsi_vlan_ops if it's needed. + * + * Since the VSI context only supports a single TPID for insertion and + * stripping, setting the TPID for stripping will affect the TPID for insertion. + * Callers need to be aware of this limitation. + * + * Only modify outer VLAN stripping settings and the VLAN TPID. Outer VLAN + * insertion settings are unmodified. + * + * This enables hardware to strip a VLAN tag with the specified TPID to be + * stripped from the packet and placed in the receive descriptor. + */ +int ice_vsi_ena_outer_stripping(struct ice_vsi *vsi, u16 tpid) +{ + struct ice_hw *hw = &vsi->back->hw; + struct ice_vsi_ctx *ctxt; + enum ice_status status; + u8 tag_type; + int err = 0; + + /* do not allow modifying VLAN stripping when a port VLAN is configured + * on this VSI + */ + if (vsi->info.port_based_outer_vlan) + return 0; + + if (tpid_to_vsi_outer_vlan_type(tpid, &tag_type)) + return -EINVAL; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + ctxt->info.valid_sections = + cpu_to_le16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID); + /* clear current outer VLAN strip settings */ + ctxt->info.outer_vlan_flags = vsi->info.outer_vlan_flags & + ~(ICE_AQ_VSI_OUTER_VLAN_EMODE_M | ICE_AQ_VSI_OUTER_TAG_TYPE_M); + ctxt->info.outer_vlan_flags |= + ((ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW_BOTH << + ICE_AQ_VSI_OUTER_VLAN_EMODE_S) | + ((tag_type << ICE_AQ_VSI_OUTER_TAG_TYPE_S) & + ICE_AQ_VSI_OUTER_TAG_TYPE_M)); + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "update VSI for enabling outer VLAN stripping failed, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + err = -EIO; + } else { + vsi->info.outer_vlan_flags = ctxt->info.outer_vlan_flags; + } + + kfree(ctxt); + return err; +} + +/** + * ice_vsi_dis_outer_stripping - disable outer VLAN stripping + * @vsi: VSI to configure + * + * Disable outer VLAN stripping via VSI context. This function should only be + * used if DVM is supported. Also, this function should never be called directly + * as it should be part of ice_vsi_vlan_ops if it's needed. + * + * Only modify the outer VLAN stripping settings. 
The VLAN TPID and outer VLAN + * insertion settings are unmodified. + * + * This tells the hardware to not strip any VLAN tagged packets, thus leaving + * them in the packet. This enables software offloaded VLAN stripping and + * disables hardware offloaded VLAN stripping. + */ +int ice_vsi_dis_outer_stripping(struct ice_vsi *vsi) +{ + struct ice_hw *hw = &vsi->back->hw; + struct ice_vsi_ctx *ctxt; + enum ice_status status; + int err = 0; + + if (vsi->info.port_based_outer_vlan) + return 0; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + ctxt->info.valid_sections = + cpu_to_le16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID); + /* clear current outer VLAN strip settings */ + ctxt->info.outer_vlan_flags = vsi->info.outer_vlan_flags & + ~ICE_AQ_VSI_OUTER_VLAN_EMODE_M; + ctxt->info.outer_vlan_flags |= ICE_AQ_VSI_OUTER_VLAN_EMODE_NOTHING << + ICE_AQ_VSI_OUTER_VLAN_EMODE_S; + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "update VSI for disabling outer VLAN stripping failed, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + err = -EIO; + } else { + vsi->info.outer_vlan_flags = ctxt->info.outer_vlan_flags; + } + + kfree(ctxt); + return err; +} + +/** + * ice_vsi_ena_outer_insertion - enable outer VLAN insertion + * @vsi: VSI to configure + * @tpid: TPID to enable outer VLAN insertion for + * + * Enable outer VLAN insertion via VSI context. This function should only be + * used if DVM is supported. Also, this function should never be called directly + * as it should be part of ice_vsi_vlan_ops if it's needed. + * + * Since the VSI context only supports a single TPID for insertion and + * stripping, setting the TPID for insertion will affect the TPID for stripping. + * Callers need to be aware of this limitation. + * + * Only modify outer VLAN insertion settings and the VLAN TPID. Outer VLAN + * stripping settings are unmodified. + * + * This allows a VLAN tag with the specified TPID to be inserted in the transmit + * descriptor. 
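+ *
+ * Callers normally reach this through the VSI VLAN ops table rather than
+ * directly, e.g. (illustrative):
+ *
+ *	vsi->outer_vlan_ops.ena_insertion(vsi, ETH_P_8021Q);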
+ */ +int ice_vsi_ena_outer_insertion(struct ice_vsi *vsi, u16 tpid) +{ + struct ice_hw *hw = &vsi->back->hw; + struct ice_vsi_ctx *ctxt; + enum ice_status status; + u8 tag_type; + int err = 0; + + if (vsi->info.port_based_outer_vlan) + return 0; + + if (tpid_to_vsi_outer_vlan_type(tpid, &tag_type)) + return -EINVAL; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + ctxt->info.valid_sections = + cpu_to_le16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID); + /* clear current outer VLAN insertion settings */ + ctxt->info.outer_vlan_flags = vsi->info.outer_vlan_flags & + ~(ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT | + ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC | + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M | + ICE_AQ_VSI_OUTER_TAG_TYPE_M); + ctxt->info.outer_vlan_flags |= + ((ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ALL << + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) & + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M) | + ((tag_type << ICE_AQ_VSI_OUTER_TAG_TYPE_S) & + ICE_AQ_VSI_OUTER_TAG_TYPE_M); + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "update VSI for enabling outer VLAN insertion failed, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + err = -EIO; + } else { + vsi->info.outer_vlan_flags = ctxt->info.outer_vlan_flags; + } + + kfree(ctxt); + return err; +} + +/** + * ice_vsi_dis_outer_insertion - disable outer VLAN insertion + * @vsi: VSI to configure + * + * Disable outer VLAN insertion via VSI context. This function should only be + * used if DVM is supported. Also, this function should never be called directly + * as it should be part of ice_vsi_vlan_ops if it's needed. + * + * Only modify the outer VLAN insertion settings. The VLAN TPID and outer VLAN + * settings are unmodified. + * + * This tells the hardware to not allow any VLAN tagged packets in the transmit + * descriptor. This enables software offloaded VLAN insertion and disables + * hardware offloaded VLAN insertion. + */ +int ice_vsi_dis_outer_insertion(struct ice_vsi *vsi) +{ + struct ice_hw *hw = &vsi->back->hw; + struct ice_vsi_ctx *ctxt; + enum ice_status status; + int err = 0; + + if (vsi->info.port_based_outer_vlan) + return 0; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + ctxt->info.valid_sections = + cpu_to_le16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID); + /* clear current outer VLAN insertion settings */ + ctxt->info.outer_vlan_flags = vsi->info.outer_vlan_flags & + ~(ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT | + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M); + ctxt->info.outer_vlan_flags |= + ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC | + ((ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ALL << + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) & + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M); + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "update VSI for disabling outer VLAN insertion failed, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + err = -EIO; + } else { + vsi->info.outer_vlan_flags = ctxt->info.outer_vlan_flags; + } + + kfree(ctxt); + return err; +} + +/** + * __ice_vsi_set_outer_port_vlan - set the outer port VLAN and related settings + * @vsi: VSI to configure + * @vlan_info: packed u16 that contains the VLAN prio and ID + * @tpid: TPID of the port VLAN + * + * Set the port VLAN prio, ID, and TPID. + * + * Enable VLAN pruning so the VSI doesn't receive any traffic that doesn't match + * a VLAN prune rule. 
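+ * (Rx VLAN pruning drops received packets whose VLAN tag is not in the
+ * VSI's VLAN filter list.)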
The caller should take care to add a VLAN prune rule that + * matches the port VLAN ID and TPID. + * + * Tell hardware to strip outer VLAN tagged packets on receive and don't put + * them in the receive descriptor. VSI(s) in port VLANs should not be aware of + * the port VLAN ID or TPID they are assigned to. + * + * Tell hardware to prevent outer VLAN tag insertion on transmit and only allow + * untagged outer packets from the transmit descriptor. + * + * Also, tell the hardware to insert the port VLAN on transmit. + */ +static int +__ice_vsi_set_outer_port_vlan(struct ice_vsi *vsi, u16 vlan_info, u16 tpid) +{ + struct ice_hw *hw = &vsi->back->hw; + struct ice_vsi_ctx *ctxt; + enum ice_status status; + u8 tag_type; + int err = 0; + + if (tpid_to_vsi_outer_vlan_type(tpid, &tag_type)) + return -EINVAL; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + ctxt->info = vsi->info; + + ctxt->info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; + + ctxt->info.port_based_outer_vlan = cpu_to_le16(vlan_info); + ctxt->info.outer_vlan_flags = + (ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW << + ICE_AQ_VSI_OUTER_VLAN_EMODE_S) | + ((tag_type << ICE_AQ_VSI_OUTER_TAG_TYPE_S) & + ICE_AQ_VSI_OUTER_TAG_TYPE_M) | + ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC | + (ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ACCEPTUNTAGGED << + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) | + ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT; + + ctxt->info.valid_sections = + cpu_to_le16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID | + ICE_AQ_VSI_PROP_SW_VALID); + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "update VSI for setting outer port based VLAN failed, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + err = -EIO; + } else { + vsi->info.port_based_outer_vlan = ctxt->info.port_based_outer_vlan; + vsi->info.outer_vlan_flags = ctxt->info.outer_vlan_flags; + vsi->info.sw_flags2 = ctxt->info.sw_flags2; + } + + kfree(ctxt); + return err; +} + +/** + * ice_vsi_set_outer_port_vlan - public version of __ice_vsi_set_outer_port_vlan + * @vsi: VSI to configure + * @vlan: ice_vlan structure used to set the port VLAN + * + * Set the outer port VLAN via VSI context. This function should only be + * used if DVM is supported. Also, this function should never be called directly + * as it should be part of ice_vsi_vlan_ops if it's needed. + * + * This function does not support clearing the port VLAN as there is currently + * no use case for this. + * + * Use the ice_vlan structure passed in to set this VSI in a port VLAN. + */ +int ice_vsi_set_outer_port_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + u16 port_vlan_info; + + if (vlan->prio > (VLAN_PRIO_MASK >> VLAN_PRIO_SHIFT)) + return -EINVAL; + + port_vlan_info = vlan->vid | (vlan->prio << VLAN_PRIO_SHIFT); + + return __ice_vsi_set_outer_port_vlan(vsi, port_vlan_info, vlan->tpid); +} diff --git a/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.h b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.h new file mode 100644 index 000000000000..60925e920fac --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#ifndef _ICE_VSI_VLAN_LIB_H_ +#define _ICE_VSI_VLAN_LIB_H_ + +#include +#include "ice_vlan.h" + +struct ice_vsi; + +#define ICE_VLAN(tpid, vid, prio, fwd_action) \ + ((struct ice_vlan){ tpid, vid, prio, fwd_action }) + +int ice_vsi_add_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan); +int ice_vsi_del_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan); + +int ice_vsi_ena_inner_stripping(struct ice_vsi *vsi, u16 tpid); +int ice_vsi_dis_inner_stripping(struct ice_vsi *vsi); +int ice_vsi_ena_inner_insertion(struct ice_vsi *vsi, u16 tpid); +int ice_vsi_dis_inner_insertion(struct ice_vsi *vsi); +int ice_vsi_set_inner_port_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan); + +int ice_vsi_ena_rx_vlan_filtering(struct ice_vsi *vsi); +int ice_vsi_dis_rx_vlan_filtering(struct ice_vsi *vsi); +int ice_vsi_ena_tx_vlan_filtering(struct ice_vsi *vsi); +int ice_vsi_dis_tx_vlan_filtering(struct ice_vsi *vsi); + +int ice_vsi_ena_outer_stripping(struct ice_vsi *vsi, u16 tpid); +int ice_vsi_dis_outer_stripping(struct ice_vsi *vsi); +int ice_vsi_ena_outer_insertion(struct ice_vsi *vsi, u16 tpid); +int ice_vsi_dis_outer_insertion(struct ice_vsi *vsi); +int ice_vsi_set_outer_port_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan); +#endif /* _ICE_VSI_VLAN_LIB_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.c b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.c new file mode 100644 index 000000000000..466c691d6848 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice_pf_vsi_vlan_ops.h" +#include "ice_vf_vsi_vlan_ops.h" +#include "ice_lib.h" +#include "ice.h" + +static int +op_unsupported_vlan_arg(struct ice_vsi * __always_unused vsi, + struct ice_vlan * __always_unused vlan) +{ + return -EOPNOTSUPP; +} + +static int +op_unsupported_tpid_arg(struct ice_vsi *__always_unused vsi, + u16 __always_unused tpid) +{ + return -EOPNOTSUPP; +} + +static int op_unsupported(struct ice_vsi *__always_unused vsi) +{ + return -EOPNOTSUPP; +} + +/* If any new ops are added to the VSI VLAN ops interface then an unsupported + * implementation should be set here. + */ +static struct ice_vsi_vlan_ops ops_unsupported = { + .add_vlan = op_unsupported_vlan_arg, + .del_vlan = op_unsupported_vlan_arg, + .ena_stripping = op_unsupported_tpid_arg, + .dis_stripping = op_unsupported, + .ena_insertion = op_unsupported_tpid_arg, + .dis_insertion = op_unsupported, + .ena_rx_filtering = op_unsupported, + .dis_rx_filtering = op_unsupported, + .ena_tx_filtering = op_unsupported, + .dis_tx_filtering = op_unsupported, + .set_port_vlan = op_unsupported_vlan_arg, +}; + +/** + * ice_vsi_init_unsupported_vlan_ops - init all VSI VLAN ops to unsupported + * @vsi: VSI to initialize VSI VLAN ops to unsupported for + * + * By default all inner and outer VSI VLAN ops return -EOPNOTSUPP. This was done + * as oppsed to leaving the ops null to prevent unexpected crashes. Instead if + * an unsupported VSI VLAN op is called it will just return -EOPNOTSUPP. + * + */ +static void ice_vsi_init_unsupported_vlan_ops(struct ice_vsi *vsi) +{ + vsi->outer_vlan_ops = ops_unsupported; + vsi->inner_vlan_ops = ops_unsupported; +} + +/** + * ice_vsi_init_vlan_ops - initialize type specific VSI VLAN ops + * @vsi: VSI to initialize ops for + * + * If any VSI types are added and/or require different ops than the PF or VF VSI + * then they will have to add a case here to handle that. 
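+ * (ICE_VSI_CHNL and ICE_VSI_SWITCHDEV_CTRL currently reuse the PF ops.)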
Also, VSI type + * specific files should be added in the same manner that was done for PF VSI. + */ +void ice_vsi_init_vlan_ops(struct ice_vsi *vsi) +{ + /* Initialize all VSI types to have unsupported VSI VLAN ops */ + ice_vsi_init_unsupported_vlan_ops(vsi); + + switch (vsi->type) { + case ICE_VSI_PF: + case ICE_VSI_CHNL: + case ICE_VSI_SWITCHDEV_CTRL: + ice_pf_vsi_init_vlan_ops(vsi); + break; + case ICE_VSI_VF: + ice_vf_vsi_init_vlan_ops(vsi); + break; + default: + dev_dbg(ice_pf_to_dev(vsi->back), "%s does not support VLAN operations\n", + ice_vsi_type_str(vsi->type)); + break; + } +} + +/** + * ice_get_compat_vsi_vlan_ops - Get VSI VLAN ops based on VLAN mode + * @vsi: VSI used to get the VSI VLAN ops + * + * This function is meant to be used when the caller doesn't know which VLAN ops + * to use (i.e. inner or outer). This allows backward compatibility for VLANs + * since most of the Outer VSI VLAN functins are not supported when + * the device is configured in Single VLAN Mode (SVM). + */ +struct ice_vsi_vlan_ops *ice_get_compat_vsi_vlan_ops(struct ice_vsi *vsi) +{ + if (ice_is_dvm_ena(&vsi->back->hw)) + return &vsi->outer_vlan_ops; + else + return &vsi->inner_vlan_ops; +} diff --git a/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.h b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.h new file mode 100644 index 000000000000..70ff0e289d0c --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_VSI_VLAN_OPS_H_ +#define _ICE_VSI_VLAN_OPS_H_ + +#include "ice_type.h" +#include "ice_vsi_vlan_lib.h" + +struct ice_vsi; + +struct ice_vsi_vlan_ops { + int (*add_vlan)(struct ice_vsi *vsi, struct ice_vlan *vlan); + int (*del_vlan)(struct ice_vsi *vsi, struct ice_vlan *vlan); + int (*ena_stripping)(struct ice_vsi *vsi, const u16 tpid); + int (*dis_stripping)(struct ice_vsi *vsi); + int (*ena_insertion)(struct ice_vsi *vsi, const u16 tpid); + int (*dis_insertion)(struct ice_vsi *vsi); + int (*ena_rx_filtering)(struct ice_vsi *vsi); + int (*dis_rx_filtering)(struct ice_vsi *vsi); + int (*ena_tx_filtering)(struct ice_vsi *vsi); + int (*dis_tx_filtering)(struct ice_vsi *vsi); + int (*set_port_vlan)(struct ice_vsi *vsi, struct ice_vlan *vlan); +}; + +void ice_vsi_init_vlan_ops(struct ice_vsi *vsi); +struct ice_vsi_vlan_ops *ice_get_compat_vsi_vlan_ops(struct ice_vsi *vsi); + +#endif /* _ICE_VSI_VLAN_OPS_H_ */ + diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c new file mode 100644 index 000000000000..43012bb9b115 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -0,0 +1,1393 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include +#include +#include +#include +#include "ice.h" +#include "ice_lib.h" +#include "ice_base.h" +#include "ice_type.h" +#include "ice_xsk.h" +#include "ice_txrx.h" +#include "ice_txrx_lib.h" +#ifdef HAVE_AF_XDP_ZC_SUPPORT + +/** + * ice_qp_reset_stats - Resets all stats for rings of given index + * @vsi: VSI that contains rings of interest + * @q_idx: ring index in array + */ +static void ice_qp_reset_stats(struct ice_vsi *vsi, u16 q_idx) +{ + memset(&vsi->rx_rings[q_idx]->rx_stats, 0, + sizeof(vsi->rx_rings[q_idx]->rx_stats)); + memset(&vsi->tx_rings[q_idx]->stats, 0, + sizeof(vsi->tx_rings[q_idx]->stats)); + if (ice_is_xdp_ena_vsi(vsi)) + memset(&vsi->xdp_rings[q_idx]->stats, 0, + sizeof(vsi->xdp_rings[q_idx]->stats)); +} + +/** + * ice_qp_clean_rings - Cleans all the rings of a given index + * @vsi: VSI that contains rings of interest + * @q_idx: ring index in array + */ +static void ice_qp_clean_rings(struct ice_vsi *vsi, u16 q_idx) +{ + ice_clean_tx_ring(vsi->tx_rings[q_idx]); + if (ice_is_xdp_ena_vsi(vsi)) + ice_clean_tx_ring(vsi->xdp_rings[q_idx]); + ice_clean_rx_ring(vsi->rx_rings[q_idx]); +} + +/** + * ice_qvec_toggle_napi - Enables/disables NAPI for a given q_vector + * @vsi: VSI that has netdev + * @q_vector: q_vector that has NAPI context + * @enable: true for enable, false for disable + */ +static void +ice_qvec_toggle_napi(struct ice_vsi *vsi, struct ice_q_vector *q_vector, + bool enable) +{ + if (!vsi->netdev || !q_vector) + return; + + if (enable) + napi_enable(&q_vector->napi); + else + napi_disable(&q_vector->napi); +} + +/** + * ice_qvec_dis_irq - Mask off queue interrupt generation on given ring + * @vsi: the VSI that contains queue vector being un-configured + * @rx_ring: Rx ring that will have its IRQ disabled + * @q_vector: queue vector + */ +static void +ice_qvec_dis_irq(struct ice_vsi *vsi, struct ice_ring *rx_ring, + struct ice_q_vector *q_vector) +{ + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + int base = vsi->base_vector; + u16 reg; + u32 val; + + /* QINT_TQCTL is being cleared in ice_vsi_stop_tx_ring, so handle + * here only QINT_RQCTL + */ + reg = rx_ring->reg_idx; + val = rd32(hw, QINT_RQCTL(reg)); + val &= ~QINT_RQCTL_CAUSE_ENA_M; + wr32(hw, QINT_RQCTL(reg), val); + + if (q_vector) { + u16 v_idx = q_vector->v_idx; + + wr32(hw, GLINT_DYN_CTL(q_vector->reg_idx), 0); + + ice_flush(hw); + synchronize_irq(pf->msix_entries[v_idx + base].vector); + } +} + +/** + * ice_qvec_cfg_msix - Enable IRQ for given queue vector + * @vsi: the VSI that contains queue vector + * @q_vector: queue vector + */ +static void +ice_qvec_cfg_msix(struct ice_vsi *vsi, struct ice_q_vector *q_vector) +{ + u16 reg_idx = q_vector->reg_idx; + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + struct ice_ring *ring; + + ice_cfg_itr(hw, q_vector); + + ice_for_each_ring(ring, q_vector->tx) + ice_cfg_txq_interrupt(vsi, ring->reg_idx, reg_idx, + q_vector->tx.itr_idx); + + ice_for_each_ring(ring, q_vector->rx) + ice_cfg_rxq_interrupt(vsi, ring->reg_idx, reg_idx, + q_vector->rx.itr_idx); + + ice_flush(hw); +} + +/** + * ice_qvec_ena_irq - Enable IRQ for given queue vector + * @vsi: the VSI that contains queue vector + * @q_vector: queue vector + */ +static void ice_qvec_ena_irq(struct ice_vsi *vsi, struct ice_q_vector *q_vector) +{ + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + + ice_irq_dynamic_ena(hw, vsi, q_vector); + + ice_flush(hw); +} + +/** + * ice_qp_dis - Disables a queue pair + * @vsi: VSI of interest + * @q_idx: ring index in 
array + * + * Returns 0 on success, negative on failure. + */ +static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) +{ + struct ice_txq_meta txq_meta = { }; + struct ice_ring *tx_ring, *rx_ring; + struct ice_q_vector *q_vector; + int timeout = 50; + int err; + + if (q_idx >= vsi->num_rxq || q_idx >= vsi->num_txq) + return -EINVAL; + + tx_ring = vsi->tx_rings[q_idx]; + rx_ring = vsi->rx_rings[q_idx]; + q_vector = rx_ring->q_vector; + + while (test_and_set_bit(ICE_CFG_BUSY, vsi->state)) { + timeout--; + if (!timeout) + return -EBUSY; + usleep_range(1000, 2000); + } + netif_tx_stop_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); + + ice_qvec_dis_irq(vsi, rx_ring, q_vector); + + ice_fill_txq_meta(vsi, tx_ring, &txq_meta); + err = ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, 0, tx_ring, &txq_meta); + if (err) + return err; + if (ice_is_xdp_ena_vsi(vsi)) { + struct ice_ring *xdp_ring = vsi->xdp_rings[q_idx]; + + memset(&txq_meta, 0, sizeof(txq_meta)); + ice_fill_txq_meta(vsi, xdp_ring, &txq_meta); + err = ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, 0, xdp_ring, + &txq_meta); + if (err) + return err; + } + err = ice_vsi_ctrl_one_rx_ring(vsi, false, q_idx, true); + if (err) + return err; + + ice_qvec_toggle_napi(vsi, q_vector, false); + ice_qp_clean_rings(vsi, q_idx); + ice_qp_reset_stats(vsi, q_idx); + + return 0; +} + +/** + * ice_qp_ena - Enables a queue pair + * @vsi: VSI of interest + * @q_idx: ring index in array + * + * Returns 0 on success, negative on failure. + */ +static int ice_qp_ena(struct ice_vsi *vsi, u16 q_idx) +{ + struct ice_aqc_add_tx_qgrp *qg_buf; + struct ice_ring *tx_ring, *rx_ring; + struct ice_q_vector *q_vector; + u16 size; + int err; + + if (q_idx >= vsi->num_rxq || q_idx >= vsi->num_txq) + return -EINVAL; + + size = struct_size(qg_buf, txqs, 1); + qg_buf = kzalloc(size, GFP_KERNEL); + if (!qg_buf) + return -ENOMEM; + + qg_buf->num_txqs = 1; + + tx_ring = vsi->tx_rings[q_idx]; + rx_ring = vsi->rx_rings[q_idx]; + q_vector = rx_ring->q_vector; + + err = ice_vsi_cfg_txq(vsi, tx_ring, qg_buf); + if (err) + goto free_buf; + + if (ice_is_xdp_ena_vsi(vsi)) { + struct ice_ring *xdp_ring = vsi->xdp_rings[q_idx]; + + memset(qg_buf, 0, size); + qg_buf->num_txqs = 1; + err = ice_vsi_cfg_txq(vsi, xdp_ring, qg_buf); + if (err) + goto free_buf; + ice_set_ring_xdp(xdp_ring); + xdp_ring->xsk_pool = ice_xsk_umem(xdp_ring); + } + + err = ice_vsi_cfg_rxq(rx_ring); + if (err) + goto free_buf; + + ice_qvec_cfg_msix(vsi, q_vector); + + err = ice_vsi_ctrl_one_rx_ring(vsi, true, q_idx, true); + if (err) + goto free_buf; + + clear_bit(ICE_CFG_BUSY, vsi->state); + ice_qvec_toggle_napi(vsi, q_vector, true); + ice_qvec_ena_irq(vsi, q_vector); + + netif_tx_start_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); +free_buf: + kfree(qg_buf); + return err; +} + +#ifndef HAVE_AF_XDP_NETDEV_UMEM +/** + * ice_xsk_alloc_umems - allocate a UMEM region for an XDP socket + * @vsi: VSI to allocate the UMEM on + * + * Returns 0 on success, negative on error + */ +static int ice_xsk_alloc_umems(struct ice_vsi *vsi) +{ + if (vsi->xsk_umems) + return 0; +#ifdef HAVE_NETDEV_BPF_XSK_POOL + vsi->xsk_umems = kcalloc(vsi->num_xsk_umems, sizeof(*vsi->xsk_umems), + GFP_KERNEL); +#else + vsi->xsk_umems = kcalloc(vsi->num_xsk_umems, sizeof(*vsi->xsk_umems), + GFP_KERNEL); +#endif /* HAVE_NETDEV_BPF_XSK_POOL */ + + if (!vsi->xsk_umems) { + vsi->num_xsk_umems = 0; + return -ENOMEM; + } + + return 0; +} + +/** + * ice_xsk_remove_umem - Remove an UMEM for a certain ring/qid + * @vsi: VSI from which the VSI will be removed + * @qid: 
Ring/qid associated with the UMEM + */ +static void ice_xsk_remove_umem(struct ice_vsi *vsi, u16 qid) +{ + vsi->xsk_umems[qid] = NULL; + vsi->num_xsk_umems_used--; + + if (vsi->num_xsk_umems_used == 0) { + kfree(vsi->xsk_umems); + vsi->xsk_umems = NULL; + vsi->num_xsk_umems = 0; + } +} +#endif /* !HAVE_AF_XDP_NETDEV_UMEM */ + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL +/** + * ice_xsk_umem_dma_map - DMA map UMEM region for XDP sockets + * @vsi: VSI to map the UMEM region + * @umem: UMEM to map + * + * Returns 0 on success, negative on error + */ +static int ice_xsk_umem_dma_map(struct ice_vsi *vsi, struct xdp_umem *umem) +{ + struct ice_pf *pf = vsi->back; + struct device *dev; + unsigned int i; + + dev = ice_pf_to_dev(pf); + for (i = 0; i < umem->npgs; i++) { + dma_addr_t dma = dma_map_page_attrs(dev, umem->pgs[i], 0, + PAGE_SIZE, + DMA_BIDIRECTIONAL, + ICE_RX_DMA_ATTR); + if (dma_mapping_error(dev, dma)) { + dev_dbg(dev, "XSK UMEM DMA mapping error on page num %d/n", + i); + goto out_unmap; + } + + umem->pages[i].dma = dma; + } + + return 0; + +out_unmap: + for (; i > 0; i--) { + dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE, + DMA_BIDIRECTIONAL, ICE_RX_DMA_ATTR); + umem->pages[i].dma = 0; + } + + return -EFAULT; +} + +/** + * ice_xsk_umem_dma_unmap - DMA unmap UMEM region for XDP sockets + * @vsi: VSI from which the UMEM will be unmapped + * @umem: UMEM to unmap + */ +static void ice_xsk_umem_dma_unmap(struct ice_vsi *vsi, struct xdp_umem *umem) +{ + struct ice_pf *pf = vsi->back; + struct device *dev; + unsigned int i; + + dev = ice_pf_to_dev(pf); + for (i = 0; i < umem->npgs; i++) { + dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE, + DMA_BIDIRECTIONAL, ICE_RX_DMA_ATTR); + + umem->pages[i].dma = 0; + } +} +#endif + +/** + * ice_xsk_umem_disable - disable a UMEM region + * @vsi: Current VSI + * @qid: queue ID + * + * Returns 0 on success, negative on failure + */ +static int ice_xsk_umem_disable(struct ice_vsi *vsi, u16 qid) +{ +#ifdef HAVE_AF_XDP_NETDEV_UMEM +#ifdef HAVE_NETDEV_BPF_XSK_POOL + struct xsk_buff_pool *umem = xsk_get_pool_from_qid(vsi->netdev, qid); +#else + struct xdp_umem *umem = xsk_get_pool_from_qid(vsi->netdev, qid); +#endif +#else + struct xdp_umem *umem; + + if (!vsi->xsk_umems || qid >= vsi->num_xsk_umems) + return -EINVAL; + + umem = vsi->xsk_umems[qid]; +#endif + + if (!umem) + return -EINVAL; + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + ice_xsk_umem_dma_unmap(vsi, umem); +#else + xsk_pool_dma_unmap(umem, ICE_RX_DMA_ATTR); +#endif + +#ifndef HAVE_AF_XDP_NETDEV_UMEM + ice_xsk_remove_umem(vsi, qid); +#endif + + return 0; +} + +/** + * ice_xsk_umem_enable - enable a UMEM region + * @vsi: Current VSI + * @umem: pointer to a requested UMEM region + * @qid: queue ID + * + * Returns 0 on success, negative on failure + */ +static int +#ifdef HAVE_NETDEV_BPF_XSK_POOL +ice_xsk_umem_enable(struct ice_vsi *vsi, struct xsk_buff_pool *umem, u16 qid) +#else +ice_xsk_umem_enable(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid) +#endif /* HAVE_NETDEV_BPF_XSK_POOL */ +{ +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + struct xdp_umem_fq_reuse *reuseq; +#endif +#ifndef HAVE_AF_XDP_NETDEV_UMEM + int err; +#endif + + if (vsi->type != ICE_VSI_PF) + return -EINVAL; + +#ifndef HAVE_AF_XDP_NETDEV_UMEM + if (!vsi->num_xsk_umems) + vsi->num_xsk_umems = min_t(u16, vsi->num_rxq, vsi->num_txq); + if (qid >= vsi->num_xsk_umems) + return -EINVAL; + + err = ice_xsk_alloc_umems(vsi); + if (err) + return err; + + if (vsi->xsk_umems && vsi->xsk_umems[qid]) + return -EBUSY; + + vsi->xsk_umems[qid] 
= umem; + vsi->num_xsk_umems_used++; +#else + if (qid >= vsi->netdev->real_num_rx_queues || + qid >= vsi->netdev->real_num_tx_queues) + return -EINVAL; +#endif /* !HAVE_AF_XDP_NETDEV_UMEM */ + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + reuseq = xsk_reuseq_prepare(vsi->rx_rings[0]->count); + if (!reuseq) + return -ENOMEM; + + xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq)); + + return ice_xsk_umem_dma_map(vsi, umem); +#else + return xsk_pool_dma_map(umem, ice_pf_to_dev(vsi->back), + ICE_RX_DMA_ATTR); +#endif +} + +/** + * ice_xsk_umem_setup - enable/disable a UMEM region depending on its state + * @vsi: Current VSI + * @umem: UMEM to enable/associate to a ring, NULL to disable + * @qid: queue ID + * + * Returns 0 on success, negative on failure + */ +#ifdef HAVE_NETDEV_BPF_XSK_POOL +int ice_xsk_umem_setup(struct ice_vsi *vsi, struct xsk_buff_pool *umem, u16 qid) +#else +int ice_xsk_umem_setup(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid) +#endif +{ + bool if_running, umem_present = !!umem; + int ret = 0, umem_failure = 0; + + if_running = netif_running(vsi->netdev) && ice_is_xdp_ena_vsi(vsi); + + if (if_running) { + ret = ice_qp_dis(vsi, qid); + if (ret) { + netdev_err(vsi->netdev, "ice_qp_dis error = %d\n", ret); + goto xsk_umem_if_up; + } + } + + umem_failure = umem_present ? ice_xsk_umem_enable(vsi, umem, qid) : + ice_xsk_umem_disable(vsi, qid); + +xsk_umem_if_up: + if (if_running) { + ret = ice_qp_ena(vsi, qid); + if (!ret && umem_present) + napi_schedule(&vsi->xdp_rings[qid]->q_vector->napi); + else if (ret) + netdev_err(vsi->netdev, "ice_qp_ena error = %d\n", ret); + } + + if (umem_failure) { + netdev_err(vsi->netdev, "Could not %sable UMEM, error = %d\n", + umem_present ? "en" : "dis", umem_failure); + return umem_failure; + } + + return ret; +} + +#ifndef NO_XDP_QUERY_XSK_UMEM +/** + * ice_xsk_umem_query - queries a certain ring/qid for its UMEM + * @vsi: Current VSI + * @umem: UMEM associated to the ring, if any + * @qid: queue ID + * + * Returns 0 on success, negative on failure + */ +int ice_xsk_umem_query(struct ice_vsi *vsi, struct xdp_umem **umem, u16 qid) +{ +#ifndef HAVE_AF_XDP_NETDEV_UMEM + if (vsi->type != ICE_VSI_PF) + return -EINVAL; + + if (qid >= min_t(u16, vsi->num_rxq, vsi->num_txq)) + return -EINVAL; + + if (vsi->xsk_umems) { + if (qid >= vsi->num_xsk_umems) + return -EINVAL; + *umem = vsi->xsk_umems[qid]; + return 0; + } + + *umem = NULL; +#else + struct net_device *netdev = vsi->netdev; + struct xdp_umem *queried_umem; + + if (vsi->type != ICE_VSI_PF) + return -EINVAL; + + queried_umem = xsk_get_pool_from_qid(netdev, qid); + if (!queried_umem) + return -EINVAL; + + *umem = queried_umem; +#endif /* !HAVE_AF_XDP_NETDEV_UMEM */ + + return 0; +} +#endif /* NO_XDP_QUERY_XSK_UMEM */ + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL +/** + * ice_zca_free - Callback for MEM_TYPE_ZERO_COPY allocations + * @zca: zero-cpoy allocator + * @handle: Buffer handle + */ +void ice_zca_free(struct zero_copy_allocator *zca, unsigned long handle) +{ + struct ice_rx_buf *rx_buf; + struct ice_ring *rx_ring; + struct xdp_umem *umem; + u64 hr, mask; + u16 nta; + + rx_ring = container_of(zca, struct ice_ring, zca); + umem = rx_ring->xsk_pool; + hr = umem->headroom + XDP_PACKET_HEADROOM; + +#ifndef HAVE_XDP_UMEM_PROPS + mask = umem->chunk_mask; +#else + mask = umem->props.chunk_mask; +#endif + + nta = rx_ring->next_to_alloc; + rx_buf = &rx_ring->rx_buf[nta]; + + nta++; + rx_ring->next_to_alloc = (nta < rx_ring->count) ? 
nta : 0; + + handle &= mask; + + rx_buf->dma = xdp_umem_get_dma(umem, handle); + rx_buf->dma += hr; + + rx_buf->addr = xdp_umem_get_data(umem, handle); + rx_buf->addr += hr; + + rx_buf->handle = (u64)handle + umem->headroom; +} + +/** + * ice_alloc_buf_fast_zc - Retrieve buffer address from XDP umem + * @rx_ring: ring with an xdp_umem bound to it + * @rx_buf: buffer to which xsk page address will be assigned + * + * This function allocates an Rx buffer in the hot path. + * The buffer can come from fill queue or recycle queue. + * + * Returns true if an assignment was successful, false if not. + */ +static __always_inline bool +ice_alloc_buf_fast_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) +{ + struct xdp_umem *umem = rx_ring->xsk_pool; + void *addr = rx_buf->addr; + u64 handle, hr; + + if (addr) { +#ifdef ICE_ADD_PROBES + rx_ring->rx_stats.page_reuse++; +#endif /* ICE_ADD_PROBES */ + return true; + } + + if (!xsk_umem_peek_addr(umem, &handle)) { + rx_ring->rx_stats.alloc_page_failed++; + return false; + } + + hr = umem->headroom + XDP_PACKET_HEADROOM; + + rx_buf->dma = xdp_umem_get_dma(umem, handle); + rx_buf->dma += hr; + + rx_buf->addr = xdp_umem_get_data(umem, handle); + rx_buf->addr += hr; + + rx_buf->handle = handle + umem->headroom; + + xsk_umem_release_addr(umem); + return true; +} + +/** + * ice_alloc_buf_slow_zc - Retrieve buffer address from XDP umem + * @rx_ring: ring with an xdp_umem bound to it + * @rx_buf: buffer to which xsk page address will be assigned + * + * This function allocates an Rx buffer in the slow path. + * The buffer can come from fill queue or recycle queue. + * + * Returns true if an assignment was successful, false if not. + */ +static __always_inline bool +ice_alloc_buf_slow_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) +{ + struct xdp_umem *umem = rx_ring->xsk_pool; + u64 handle, headroom; + + if (!xsk_umem_peek_addr_rq(umem, &handle)) { + rx_ring->rx_stats.alloc_page_failed++; + return false; + } + + handle &= umem->chunk_mask; + headroom = umem->headroom + XDP_PACKET_HEADROOM; + + rx_buf->dma = xdp_umem_get_dma(umem, handle); + rx_buf->dma += headroom; + + rx_buf->addr = xdp_umem_get_data(umem, handle); + rx_buf->addr += headroom; + + rx_buf->handle = handle + umem->headroom; + + xsk_umem_release_addr_rq(umem); + return true; +} +#endif /* !HAVE_MEM_TYPE_XSK_BUFF_POOL */ + +/* + * ice_alloc_rx_bufs_zc - allocate a number of Rx buffers + * @rx_ring: Rx ring + * @count: The number of buffers to allocate + * @alloc: the function pointer to call for allocation + * + * This function allocates a number of Rx buffers from the fill ring + * or the internal recycle mechanism and places them on the Rx ring. + * + * Returns false if all allocations were successful, true if any fail. + * NOTE: this function header description doesn't do kdoc style + * because of the function pointer creating problems. 
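+ *
+ * On kernels without MEM_TYPE_XSK_BUFF_POOL the @alloc callback is either
+ * ice_alloc_buf_fast_zc() or ice_alloc_buf_slow_zc(), supplied by the two
+ * wrappers that follow; on newer kernels buffers come straight from
+ * xsk_buff_alloc() and no callback parameter exists.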
+ */ +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL +static bool +ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count, + bool (*alloc)(struct ice_ring *, struct ice_rx_buf *)) +#else +bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count) +#endif +{ + union ice_32b_rx_flex_desc *rx_desc; + u16 ntu = rx_ring->next_to_use; + struct ice_rx_buf *rx_buf; + bool ret = false; +#ifdef HAVE_MEM_TYPE_XSK_BUFF_POOL + dma_addr_t dma; +#endif + + if (!count) + return false; + + rx_desc = ICE_RX_DESC(rx_ring, ntu); + rx_buf = &rx_ring->rx_buf[ntu]; + + do { +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + if (!alloc(rx_ring, rx_buf)) { + ret = true; + break; + } + + dma_sync_single_range_for_device(rx_ring->dev, rx_buf->dma, 0, + rx_ring->rx_buf_len, + DMA_BIDIRECTIONAL); + + rx_desc->read.pkt_addr = cpu_to_le64(rx_buf->dma); +#else + rx_buf->xdp = xsk_buff_alloc(rx_ring->xsk_pool); + if (!rx_buf->xdp) { + ret = true; + break; + } + + dma = xsk_buff_xdp_get_dma(rx_buf->xdp); + rx_desc->read.pkt_addr = cpu_to_le64(dma); +#endif + rx_desc->wb.status_error0 = 0; + + rx_desc++; + rx_buf++; + ntu++; + + if (unlikely(ntu == rx_ring->count)) { + rx_desc = ICE_RX_DESC(rx_ring, 0); + rx_buf = rx_ring->rx_buf; + ntu = 0; + } + } while (--count); + + if (rx_ring->next_to_use != ntu) + ice_release_rx_desc(rx_ring, ntu); + + return ret; +} + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL +/** + * ice_alloc_rx_bufs_fast_zc - allocate zero copy bufs in the hot path + * @rx_ring: Rx ring + * @count: number of bufs to allocate + * + * Returns false on success, true on failure. + */ +static bool ice_alloc_rx_bufs_fast_zc(struct ice_ring *rx_ring, u16 count) +{ + return ice_alloc_rx_bufs_zc(rx_ring, count, + ice_alloc_buf_fast_zc); +} + +/** + * ice_alloc_rx_bufs_slow_zc - allocate zero copy bufs in the slow path + * @rx_ring: Rx ring + * @count: number of bufs to allocate + * + * Returns false on success, true on failure. + */ +bool ice_alloc_rx_bufs_slow_zc(struct ice_ring *rx_ring, u16 count) +{ + return ice_alloc_rx_bufs_zc(rx_ring, count, + ice_alloc_buf_slow_zc); +} +#endif + +/** + * ice_bump_ntc - Bump the next_to_clean counter of an Rx ring + * @rx_ring: Rx ring + */ +static void ice_bump_ntc(struct ice_ring *rx_ring) +{ + int ntc = rx_ring->next_to_clean + 1; + + ntc = (ntc < rx_ring->count) ? ntc : 0; + rx_ring->next_to_clean = ntc; + prefetch(ICE_RX_DESC(rx_ring, ntc)); +} + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL +/** + * ice_get_rx_buf_zc - Fetch the current Rx buffer + * @rx_ring: Rx ring + * @size: size of a buffer + * + * This function returns the current, received Rx buffer and does + * DMA synchronization. + * + * Returns a pointer to the received Rx buffer. + */ +static struct ice_rx_buf *ice_get_rx_buf_zc(struct ice_ring *rx_ring, int size) +{ + struct ice_rx_buf *rx_buf; + + rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean]; + + dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma, 0, + size, DMA_BIDIRECTIONAL); + + return rx_buf; +} + +/** + * ice_reuse_rx_buf_zc - reuse an Rx buffer + * @rx_ring: Rx ring + * @old_buf: The buffer to recycle + * + * This function recycles a finished Rx buffer, and places it on the recycle + * queue (next_to_alloc). 
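+ *
+ * The recycled buffer's DMA address, virtual address and handle are
+ * re-derived from the UMEM chunk mask plus headroom before being stored at
+ * next_to_alloc, and @old_buf->addr is cleared so the hot-path allocator
+ * sees the old slot as empty.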
+ */ +static void +ice_reuse_rx_buf_zc(struct ice_ring *rx_ring, struct ice_rx_buf *old_buf) +{ +#ifndef HAVE_XDP_UMEM_PROPS + unsigned long mask = (unsigned long)rx_ring->xsk_pool->chunk_mask; +#else + unsigned long mask = (unsigned long)rx_ring->xsk_pool->props.chunk_mask; +#endif /* NO_XDP_UMEM_PROPS */ + u64 hr = rx_ring->xsk_pool->headroom + XDP_PACKET_HEADROOM; + u16 nta = rx_ring->next_to_alloc; + struct ice_rx_buf *new_buf; + + new_buf = &rx_ring->rx_buf[nta++]; + rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; + + new_buf->dma = old_buf->dma & mask; + new_buf->dma += hr; + + new_buf->addr = (void *)((unsigned long)old_buf->addr & mask); + new_buf->addr += hr; + + new_buf->handle = old_buf->handle & mask; + new_buf->handle += rx_ring->xsk_pool->headroom; + + old_buf->addr = NULL; +} +#endif + +/** + * ice_construct_skb_zc - Create an sk_buff from zero-copy buffer + * @rx_ring: Rx ring + * @rx_buf: zero-copy Rx buffer + * @xdp: xdp buffer + * + * This function allocates a new skb from a zero-copy Rx buffer. + * + * Returns the skb on success, NULL on failure. + */ +static struct sk_buff * +ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, + struct xdp_buff *xdp) +{ + unsigned int metasize = xdp->data - xdp->data_meta; + unsigned int datasize = xdp->data_end - xdp->data; + unsigned int datasize_hard = xdp->data_end - + xdp->data_hard_start; + struct sk_buff *skb; + + skb = __napi_alloc_skb(&rx_ring->q_vector->napi, datasize_hard, + GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, xdp->data - xdp->data_hard_start); + memcpy(__skb_put(skb, datasize), xdp->data, datasize); + + if (metasize) + skb_metadata_set(skb, metasize); + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + ice_reuse_rx_buf_zc(rx_ring, rx_buf); +#else + xsk_buff_free(rx_buf->xdp); + rx_buf->xdp = NULL; +#endif + + return skb; +} + +/** + * ice_run_xdp_zc - Executes an XDP program in zero-copy path + * @rx_ring: Rx ring + * @xdp: xdp_buff used as input to the XDP program + * + * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR} + */ +static int +ice_run_xdp_zc(struct ice_ring *rx_ring, struct xdp_buff *xdp) +{ + int err, result = ICE_XDP_PASS; + struct bpf_prog *xdp_prog; + struct ice_ring *xdp_ring; + u32 act; + + rcu_read_lock(); + xdp_prog = READ_ONCE(rx_ring->xdp_prog); + if (!xdp_prog) { + rcu_read_unlock(); + return ICE_XDP_PASS; + } + + act = bpf_prog_run_xdp(xdp_prog, xdp); +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + xdp->handle += xdp->data - xdp->data_hard_start; +#endif + switch (act) { + case XDP_PASS: + break; + case XDP_TX: + xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->q_index]; + result = ice_xmit_xdp_buff(xdp, xdp_ring); + break; + case XDP_REDIRECT: + err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); + result = !err ? ICE_XDP_REDIR : ICE_XDP_CONSUMED; + break; + default: + bpf_warn_invalid_xdp_action(act); + /* fallthrough -- not supported action */ + case XDP_ABORTED: + trace_xdp_exception(rx_ring->netdev, xdp_prog, act); + /* fallthrough -- handle aborts by dropping frame */ + case XDP_DROP: + result = ICE_XDP_CONSUMED; + break; + } + + rcu_read_unlock(); + return result; +} + +/** + * ice_clean_rx_irq_zc - consumes packets from the hardware ring + * @rx_ring: AF_XDP Rx ring + * @budget: NAPI budget + * + * Returns number of processed packets on success, remaining budget on failure. 
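+ *
+ * The ring is refilled once at least ICE_RX_BUF_WRITE descriptors have been
+ * consumed, each descriptor is validated via its DD bit (after a dma_rmb()
+ * barrier) before use, and an skb is only constructed on the XDP_PASS path.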
+ */ +int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget) +{ + unsigned int total_rx_bytes = 0, total_rx_packets = 0; + u16 cleaned_count = ICE_DESC_UNUSED(rx_ring); + unsigned int xdp_xmit = 0; + bool failure = false; +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + struct xdp_buff xdp; + + xdp.rxq = &rx_ring->xdp_rxq; + +#endif + while (likely(total_rx_packets < (unsigned int)budget)) { + union ice_32b_rx_flex_desc *rx_desc; + unsigned int size, xdp_res = 0; + struct ice_rx_buf *rx_buf; + struct sk_buff *skb; + u16 stat_err_bits; + u16 vlan_tag = 0; + u16 rx_ptype; + + if (cleaned_count >= ICE_RX_BUF_WRITE) { +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + failure |= ice_alloc_rx_bufs_fast_zc(rx_ring, + cleaned_count); +#else + failure |= ice_alloc_rx_bufs_zc(rx_ring, cleaned_count); +#endif + cleaned_count = 0; + } + + rx_desc = ICE_RX_DESC(rx_ring, rx_ring->next_to_clean); + + stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S); + if (!ice_test_staterr(rx_desc->wb.status_error0, stat_err_bits)) + break; + + /* This memory barrier is needed to keep us from reading + * any other fields out of the rx_desc until we have + * verified the descriptor has been written back. + */ + dma_rmb(); + + size = le16_to_cpu(rx_desc->wb.pkt_len) & + ICE_RX_FLX_DESC_PKT_LEN_M; + if (!size) + break; + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + rx_buf = ice_get_rx_buf_zc(rx_ring, size); + if (!rx_buf->addr) + break; + + xdp.data = rx_buf->addr; + xdp.data_meta = xdp.data; + xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; + xdp.data_end = xdp.data + size; + xdp.handle = rx_buf->handle; + + xdp_res = ice_run_xdp_zc(rx_ring, &xdp); +#else + rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean]; + if (!rx_buf->xdp) + break; + + rx_buf->xdp->data_end = rx_buf->xdp->data + size; + xsk_buff_dma_sync_for_cpu(rx_buf->xdp, rx_ring->xsk_pool); + + xdp_res = ice_run_xdp_zc(rx_ring, rx_buf->xdp); +#endif + if (xdp_res) { +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) { + xdp_xmit |= xdp_res; + rx_buf->addr = NULL; + } else { + ice_reuse_rx_buf_zc(rx_ring, rx_buf); + } +#else + if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) + xdp_xmit |= xdp_res; + else + xsk_buff_free(rx_buf->xdp); + rx_buf->xdp = NULL; +#endif + total_rx_bytes += size; + total_rx_packets++; + cleaned_count++; + + ice_bump_ntc(rx_ring); + continue; + } + + /* XDP_PASS path */ +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + skb = ice_construct_skb_zc(rx_ring, rx_buf, &xdp); +#else + skb = ice_construct_skb_zc(rx_ring, rx_buf, rx_buf->xdp); +#endif + if (!skb) { + rx_ring->rx_stats.alloc_buf_failed++; + break; + } + + cleaned_count++; + ice_bump_ntc(rx_ring); + + if (eth_skb_pad(skb)) { + skb = NULL; + continue; + } + + total_rx_bytes += skb->len; + total_rx_packets++; + + vlan_tag = ice_get_vlan_tag_from_rx_desc(rx_desc); + + rx_ptype = le16_to_cpu(rx_desc->wb.ptype_flex_flags0) & + ICE_RX_FLEX_DESC_PTYPE_M; + + ice_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype); + ice_receive_skb(rx_ring, skb, vlan_tag); + } + + ice_finalize_xdp_rx(rx_ring, xdp_xmit); + ice_update_rx_ring_stats(rx_ring, total_rx_packets, total_rx_bytes); + +#ifdef HAVE_NDO_XSK_WAKEUP + if (xsk_uses_need_wakeup(rx_ring->xsk_pool)) { + if (failure || rx_ring->next_to_clean == rx_ring->next_to_use || + (ice_ring_ch_enabled(rx_ring) && + !ice_vsi_pkt_inspect_opt_ena(rx_ring->vsi))) + xsk_set_rx_need_wakeup(rx_ring->xsk_pool); + else + xsk_clear_rx_need_wakeup(rx_ring->xsk_pool); + + return (int)total_rx_packets; + } + +#endif /* HAVE_NDO_XSK_WAKEUP */ + return failure ? 
budget : (int)total_rx_packets; +} + +/** + * ice_xmit_zc - Completes AF_XDP entries, and cleans XDP entries + * @xdp_ring: XDP Tx ring + * @budget: max number of frames to xmit + * + * Returns true if cleanup/transmission is done. + */ +static bool ice_xmit_zc(struct ice_ring *xdp_ring, int budget) +{ + unsigned int sent_frames = 0, total_bytes = 0; + struct ice_tx_desc *tx_desc = NULL; + u16 ntu = xdp_ring->next_to_use; +#ifdef XSK_UMEM_RETURNS_XDP_DESC + struct xdp_desc desc; +#endif /* XSK_UMEM_RETURNS_XDP_DESC */ + dma_addr_t dma; +#ifndef XSK_UMEM_RETURNS_XDP_DESC + u32 len; +#endif /* !XSK_UMEM_RETURNS_XDP_DESC */ + + while (likely(budget-- > 0)) { + struct ice_tx_buf *tx_buf; + + tx_buf = &xdp_ring->tx_buf[ntu]; + +#ifdef XSK_UMEM_RETURNS_XDP_DESC + if (!xsk_tx_peek_desc(xdp_ring->xsk_pool, &desc)) + break; + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + dma = xdp_umem_get_dma(xdp_ring->xsk_pool, desc.addr); + + dma_sync_single_for_device(xdp_ring->dev, dma, desc.len, + DMA_BIDIRECTIONAL); +#else + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc.addr); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, + desc.len); +#endif + tx_buf->bytecount = desc.len; +#else + if (!xsk_tx_peek_desc(xdp_ring->xsk_pool, &dma, &len)) + break; + + dma_sync_single_for_device(xdp_ring->dev, dma, len, + DMA_BIDIRECTIONAL); + + tx_buf->bytecount = len; +#endif /* XSK_UMEM_RETURNS_XDP_DESC */ + + tx_desc = ICE_TX_DESC(xdp_ring, ntu); + tx_desc->buf_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = +#ifdef XSK_UMEM_RETURNS_XDP_DESC + ice_build_ctob(ICE_TX_DESC_CMD_EOP, 0, desc.len, 0); +#else + ice_build_ctob(ICE_TX_DESC_CMD_EOP, 0, len, 0); +#endif /* XSK_UMEM_RETURNS_XDP_DESC */ + + xdp_ring->next_rs_idx = ntu; + ntu++; + if (ntu == xdp_ring->count) + ntu = 0; + sent_frames++; + total_bytes += tx_buf->bytecount; + } + + if (tx_desc) { + xdp_ring->next_to_use = ntu; + /* Set RS bit for the last frame and bump tail ptr */ + tx_desc->cmd_type_offset_bsz |= + cpu_to_le64(ICE_TX_DESC_CMD_RS << ICE_TXD_QW1_CMD_S); + ice_xdp_ring_update_tail(xdp_ring); + xsk_tx_release(xdp_ring->xsk_pool); + ice_update_tx_ring_stats(xdp_ring, sent_frames, total_bytes); + } + + return budget > 0; +} + +/** + * ice_clean_xdp_tx_buf - Free and unmap XDP Tx buffer + * @xdp_ring: XDP Tx ring + * @tx_buf: Tx buffer to clean + */ +static void +ice_clean_xdp_tx_buf(struct ice_ring *xdp_ring, struct ice_tx_buf *tx_buf) +{ + xdp_return_frame((struct xdp_frame *)tx_buf->raw_buf); + xdp_ring->xdp_tx_active--; + dma_unmap_single(xdp_ring->dev, dma_unmap_addr(tx_buf, dma), + dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); + dma_unmap_len_set(tx_buf, len, 0); +} + +/** + * ice_clean_tx_irq_zc - Completes AF_XDP entries, and cleans XDP entries + * @xdp_ring: XDP Tx ring + * + * Returns true if cleanup/transmission is done.
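+ *
+ * Completion is tracked through the descriptor at next_rs_idx (the last one
+ * stamped with the RS bit by ice_xmit_zc()); once hardware marks it done,
+ * every frame up to that index is cleaned: XDP_TX frames are unmapped and
+ * returned, the rest are reported via xsk_tx_completed(). A fresh transmit
+ * budget of at most a quarter of the ring is then handed to ice_xmit_zc().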
+ */ +bool ice_clean_tx_irq_zc(struct ice_ring *xdp_ring) +{ + u16 next_rs_idx = xdp_ring->next_rs_idx; + u16 ntc = xdp_ring->next_to_clean; + u16 frames_ready = 0, send_budget; + struct ice_tx_desc *next_rs_desc; + struct ice_tx_buf *tx_buf; + u32 xsk_frames = 0; + u16 i; + + next_rs_desc = ICE_TX_DESC(xdp_ring, next_rs_idx); + if (next_rs_desc->cmd_type_offset_bsz & + cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE)) { + if (next_rs_idx >= ntc) + frames_ready = next_rs_idx - ntc; + else + frames_ready = next_rs_idx + xdp_ring->count - ntc; + } + + if (!frames_ready) + goto out_xmit; + + if (likely(!xdp_ring->xdp_tx_active)) { + xsk_frames = frames_ready; + goto skip; + } + + for (i = 0; i < frames_ready; i++) { + tx_buf = &xdp_ring->tx_buf[ntc]; + + if (tx_buf->raw_buf) { + ice_clean_xdp_tx_buf(xdp_ring, tx_buf); + tx_buf->raw_buf = NULL; + } else { + xsk_frames++; + } + + ++ntc; + if (ntc >= xdp_ring->count) + ntc = 0; + } + +skip: + xdp_ring->next_to_clean += frames_ready; + if (unlikely(xdp_ring->next_to_clean >= xdp_ring->count)) + xdp_ring->next_to_clean -= xdp_ring->count; + + if (xsk_frames) + xsk_tx_completed(xdp_ring->xsk_pool, xsk_frames); + +out_xmit: +#ifdef HAVE_NDO_XSK_WAKEUP + if (xsk_uses_need_wakeup(xdp_ring->xsk_pool)) + xsk_set_tx_need_wakeup(xdp_ring->xsk_pool); +#endif /* HAVE_NDO_XSK_WAKEUP */ + send_budget = ICE_DESC_UNUSED(xdp_ring); + send_budget = min_t(u16, send_budget, xdp_ring->count >> 2); + return ice_xmit_zc(xdp_ring, send_budget); +} + +#ifdef HAVE_NDO_XSK_WAKEUP +/** + * ice_xsk_wakeup - Implements ndo_xsk_wakeup + * @netdev: net_device + * @queue_id: queue to wake up + * @flags: ignored in our case, since we have Rx and Tx in the same NAPI + * + * Returns negative on error, zero otherwise. + */ +int +ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, + u32 __always_unused flags) +#else +int ice_xsk_async_xmit(struct net_device *netdev, u32 queue_id) +#endif /* HAVE_NDO_XSK_WAKEUP */ +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_q_vector *q_vector; + struct ice_vsi *vsi = np->vsi; + struct ice_ring *ring; + + if (test_bit(ICE_VSI_DOWN, vsi->state)) + return -ENETDOWN; + + if (!ice_is_xdp_ena_vsi(vsi)) + return -ENXIO; + + if (queue_id >= vsi->num_txq) + return -ENXIO; + + if (!vsi->xdp_rings[queue_id]->xsk_pool) + return -ENXIO; + + ring = vsi->xdp_rings[queue_id]; + + /* The idea here is that if NAPI is running, mark a miss, so + * it will run again. If not, trigger an interrupt and + * schedule the NAPI from interrupt context. If NAPI would be + * scheduled here, the interrupt affinity would not be + * honored. 
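+ *
+ * When the Rx ring is channel-enabled and packet-inspection optimization is
+ * not enabled on the VSI, a bounded napi_busy_loop() is used instead of
+ * raising a software interrupt.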
+ */ + q_vector = ring->q_vector; + if (!napi_if_scheduled_mark_missed(&q_vector->napi)) { + if (ice_ring_ch_enabled(vsi->rx_rings[queue_id]) && + !ice_vsi_pkt_inspect_opt_ena(vsi)) +#define ICE_BUSY_POLL_BUDGET 8 + napi_busy_loop(q_vector->napi.napi_id, NULL, NULL, + false, ICE_BUSY_POLL_BUDGET); + else + ice_trigger_sw_intr(&vsi->back->hw, q_vector); + } + + return 0; +} + +/** + * ice_xsk_any_rx_ring_ena - Checks if Rx rings have AF_XDP UMEM attached + * @vsi: VSI to be checked + * + * Returns true if any of the Rx rings has an AF_XDP UMEM attached + */ +bool ice_xsk_any_rx_ring_ena(struct ice_vsi *vsi) +{ + int i; + +#ifndef HAVE_AF_XDP_NETDEV_UMEM + if (!vsi->xsk_umems) + return false; + + for (i = 0; i < vsi->num_xsk_umems; i++) { + if (vsi->xsk_umems[i]) + return true; + } +#else + ice_for_each_rxq(vsi, i) { + if (xsk_get_pool_from_qid(vsi->netdev, i)) + return true; + } +#endif /* HAVE_AF_XDP_NETDEV_UMEM */ + + return false; +} + +/** + * ice_xsk_clean_rx_ring - clean UMEM queues connected to a given Rx ring + * @rx_ring: ring to be cleaned + */ +void ice_xsk_clean_rx_ring(struct ice_ring *rx_ring) +{ + u16 i; + + for (i = 0; i < rx_ring->count; i++) { + struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[i]; + + if (!rx_buf->addr) + continue; + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + xsk_umem_fq_reuse(rx_ring->xsk_pool, rx_buf->handle); +#endif + rx_buf->addr = NULL; + } +} + +/** + * ice_xsk_clean_xdp_ring - Clean the XDP Tx ring and its UMEM queues + * @xdp_ring: XDP_Tx ring + */ +void ice_xsk_clean_xdp_ring(struct ice_ring *xdp_ring) +{ + u16 ntc = xdp_ring->next_to_clean, ntu = xdp_ring->next_to_use; + u32 xsk_frames = 0; + + while (ntc != ntu) { + struct ice_tx_buf *tx_buf = &xdp_ring->tx_buf[ntc]; + + if (tx_buf->raw_buf) + ice_clean_xdp_tx_buf(xdp_ring, tx_buf); + else + xsk_frames++; + + tx_buf->raw_buf = NULL; + + ntc++; + if (ntc >= xdp_ring->count) + ntc = 0; + } + + if (xsk_frames) + xsk_tx_completed(xdp_ring->xsk_pool, xsk_frames); +} +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.h b/drivers/net/ethernet/intel/ice/ice_xsk.h new file mode 100644 index 000000000000..63599776bac1 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_xsk.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#ifndef _ICE_XSK_H_ +#define _ICE_XSK_H_ +#include "ice_txrx.h" +#include "ice.h" +#ifdef HAVE_MEM_TYPE_XSK_BUFF_POOL +#include +#endif + +struct ice_vsi; + +#ifdef HAVE_AF_XDP_ZC_SUPPORT +#ifdef CONFIG_XDP_SOCKETS +#ifdef HAVE_NETDEV_BPF_XSK_POOL +int ice_xsk_umem_setup(struct ice_vsi *vsi, struct xsk_buff_pool *umem, + u16 qid); +#else +int ice_xsk_umem_setup(struct ice_vsi *vsi, struct xdp_umem *umem, + u16 qid); +#endif +int ice_xsk_umem_query(struct ice_vsi *vsi, struct xdp_umem **umem, u16 qid); +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL +void ice_zca_free(struct zero_copy_allocator *zca, unsigned long handle); +#endif +int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget); +bool ice_clean_tx_irq_zc(struct ice_ring *xdp_ring); +#ifdef HAVE_NDO_XSK_WAKEUP +int ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags); +#else +int ice_xsk_async_xmit(struct net_device *netdev, u32 queue_id); +#endif /* HAVE_NDO_XSK_WAKEUP */ +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL +bool ice_alloc_rx_bufs_slow_zc(struct ice_ring *rx_ring, u16 count); +#else +bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count); +#endif +bool ice_xsk_any_rx_ring_ena(struct ice_vsi *vsi); +void ice_xsk_clean_rx_ring(struct ice_ring *rx_ring); +void ice_xsk_clean_xdp_ring(struct ice_ring *xdp_ring); +#else +static inline int +ice_xsk_umem_setup(struct ice_vsi __always_unused *vsi, +#ifdef HAVE_NETDEV_BPF_XSK_POOL + struct xsk_buff_pool __always_unused *pool, +#else + struct xdp_umem __always_unused *umem, +#endif + u16 __always_unused qid) +{ + return -EOPNOTSUPP; +} + +static inline int +ice_xsk_umem_query(struct ice_vsi __always_unused *vsi, + struct xdp_umem __always_unused **umem, + u16 __always_unused qid) +{ + return -EOPNOTSUPP; +} + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL +static inline void +ice_zca_free(struct zero_copy_allocator __always_unused *zca, + unsigned long __always_unused handle) +{ +} +#endif + +static inline int +ice_clean_rx_irq_zc(struct ice_ring __always_unused *rx_ring, + int __always_unused budget) +{ + return 0; +} + +static inline bool +ice_clean_tx_irq_zc(struct ice_ring __always_unused *xdp_ring) +{ + return false; +} + +static inline bool +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL +ice_alloc_rx_bufs_slow_zc(struct ice_ring __always_unused *rx_ring, + u16 __always_unused count) +#else +ice_alloc_rx_bufs_zc(struct ice_ring __always_unused *rx_ring, + u16 __always_unused count) +#endif +{ + return false; +} + +static inline bool ice_xsk_any_rx_ring_ena(struct ice_vsi __always_unused *vsi) +{ + return false; +} + +#ifdef HAVE_NDO_XSK_WAKEUP +static inline int +ice_xsk_wakeup(struct net_device __always_unused *netdev, + u32 __always_unused queue_id, u32 __always_unused flags) +{ + return -EOPNOTSUPP; +} +#else +static inline int +ice_xsk_async_xmit(struct net_device __always_unused *netdev, + u32 __always_unused queue_id) +{ + return -EOPNOTSUPP; +} +#endif /* HAVE_NDO_XSK_WAKEUP */ + +static inline void ice_xsk_clean_rx_ring(struct ice_ring *rx_ring) { } +static inline void ice_xsk_clean_xdp_ring(struct ice_ring *xdp_ring) { } +#endif /* CONFIG_XDP_SOCKETS */ +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ +#endif /* !_ICE_XSK_H_ */ diff --git a/drivers/net/ethernet/intel/ice/kcompat.c b/drivers/net/ethernet/intel/ice/kcompat.c new file mode 100644 index 000000000000..e5f8068c875c --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat.c @@ -0,0 +1,1360 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "kcompat.h" + + + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) ) +#ifdef HAVE_FDB_OPS +#ifdef USE_CONST_DEV_UC_CHAR +int __kc_ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, const unsigned char *addr, + u16 flags) +#else +int __kc_ndo_dflt_fdb_add(struct ndmsg *ndm, struct net_device *dev, + unsigned char *addr, u16 flags) +#endif +{ + int err = -EINVAL; + + /* If aging addresses are supported device will need to + * implement its own handler for this. + */ + if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { + pr_info("%s: FDB only supports static addresses\n", dev->name); + return err; + } + + if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) + err = dev_uc_add_excl(dev, addr); + else if (is_multicast_ether_addr(addr)) + err = dev_mc_add_excl(dev, addr); + + /* Only return duplicate errors if NLM_F_EXCL is set */ + if (err == -EEXIST && !(flags & NLM_F_EXCL)) + err = 0; + + return err; +} + +#ifdef USE_CONST_DEV_UC_CHAR +#ifdef HAVE_FDB_DEL_NLATTR +int __kc_ndo_dflt_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, const unsigned char *addr) +#else +int __kc_ndo_dflt_fdb_del(struct ndmsg *ndm, struct net_device *dev, + const unsigned char *addr) +#endif +#else +int __kc_ndo_dflt_fdb_del(struct ndmsg *ndm, struct net_device *dev, + unsigned char *addr) +#endif +{ + int err = -EINVAL; + + /* If aging addresses are supported device will need to + * implement its own handler for this. + */ + if (!(ndm->ndm_state & NUD_PERMANENT)) { + pr_info("%s: FDB only supports static addresses\n", dev->name); + return err; + } + + if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) + err = dev_uc_del(dev, addr); + else if (is_multicast_ether_addr(addr)) + err = dev_mc_del(dev, addr); + + return err; +} + +#endif /* HAVE_FDB_OPS */ +#ifdef CONFIG_PCI_IOV +int __kc_pci_vfs_assigned(struct pci_dev __maybe_unused *dev) +{ + unsigned int vfs_assigned = 0; +#ifdef HAVE_PCI_DEV_FLAGS_ASSIGNED + int pos; + struct pci_dev *vfdev; + unsigned short dev_id; + + /* only search if we are a PF */ + if (!dev->is_physfn) + return 0; + + /* find SR-IOV capability */ + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV); + if (!pos) + return 0; + + /* + * determine the device ID for the VFs, the vendor ID will be the + * same as the PF so there is no need to check for that one + */ + pci_read_config_word(dev, pos + PCI_SRIOV_VF_DID, &dev_id); + + /* loop through all the VFs to see if we own any that are assigned */ + vfdev = pci_get_device(dev->vendor, dev_id, NULL); + while (vfdev) { + /* + * It is considered assigned if it is a virtual function with + * our dev as the physical function and the assigned bit is set + */ + if (vfdev->is_virtfn && (vfdev->physfn == dev) && + (vfdev->dev_flags & PCI_DEV_FLAGS_ASSIGNED)) + vfs_assigned++; + + vfdev = pci_get_device(dev->vendor, dev_id, vfdev); + } + +#endif /* HAVE_PCI_DEV_FLAGS_ASSIGNED */ + return vfs_assigned; +} + +#endif /* CONFIG_PCI_IOV */ +#endif /* 3.10.0 */ + +static const unsigned char __maybe_unused pcie_link_speed[] = { + PCI_SPEED_UNKNOWN, /* 0 */ + PCIE_SPEED_2_5GT, /* 1 */ + PCIE_SPEED_5_0GT, /* 2 */ + PCIE_SPEED_8_0GT, /* 3 */ + PCIE_SPEED_16_0GT, /* 4 */ + PCI_SPEED_UNKNOWN, /* 5 */ + PCI_SPEED_UNKNOWN, /* 6 */ + PCI_SPEED_UNKNOWN, /* 7 */ + PCI_SPEED_UNKNOWN, /* 8 */ + PCI_SPEED_UNKNOWN, /* 9 */ + PCI_SPEED_UNKNOWN, /* A */ + PCI_SPEED_UNKNOWN, /* B */ + 
PCI_SPEED_UNKNOWN, /* C */ + PCI_SPEED_UNKNOWN, /* D */ + PCI_SPEED_UNKNOWN, /* E */ + PCI_SPEED_UNKNOWN /* F */ +}; + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,12,0) ) +int __kc_pcie_get_minimum_link(struct pci_dev *dev, enum pci_bus_speed *speed, + enum pcie_link_width *width) +{ + + *speed = PCI_SPEED_UNKNOWN; + *width = PCIE_LNK_WIDTH_UNKNOWN; + + while (dev) { + u16 lnksta; + enum pci_bus_speed next_speed; + enum pcie_link_width next_width; + int ret = pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta); + + if (ret) + return ret; + + next_speed = pcie_link_speed[lnksta & PCI_EXP_LNKSTA_CLS]; + next_width = (lnksta & PCI_EXP_LNKSTA_NLW) >> + PCI_EXP_LNKSTA_NLW_SHIFT; + + if (next_speed < *speed) + *speed = next_speed; + + if (next_width < *width) + *width = next_width; + + dev = dev->bus->self; + } + + return 0; +} + +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(6,7)) +int _kc_pci_wait_for_pending_transaction(struct pci_dev *dev) +{ + int i; + u16 status; + + /* Wait for Transaction Pending bit clean */ + for (i = 0; i < 4; i++) { + if (i) + msleep((1 << (i - 1)) * 100); + + pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &status); + if (!(status & PCI_EXP_DEVSTA_TRPND)) + return 1; + } + + return 0; +} +#endif /* crs_timeout) { + printk(KERN_WARNING "pci %04x:%02x:%02x.%d: not " + "responding\n", pci_domain_nr(bus), + bus->number, PCI_SLOT(devfn), + PCI_FUNC(devfn)); + return false; + } + } + + return true; +} + +bool _kc_pci_device_is_present(struct pci_dev *pdev) +{ + u32 v; + + return _kc_pci_bus_read_dev_vendor_id(pdev->bus, pdev->devfn, &v, 0); +} +#endif /* nexthdr; + bool found; + +#define __KC_IP6_FH_F_FRAG BIT(0) +#define __KC_IP6_FH_F_AUTH BIT(1) +#define __KC_IP6_FH_F_SKIP_RH BIT(2) + + if (fragoff) + *fragoff = 0; + + if (*offset) { + struct ipv6hdr _ip6, *ip6; + + ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6); + if (!ip6 || (ip6->version != 6)) { + printk(KERN_ERR "IPv6 header not found\n"); + return -EBADMSG; + } + start = *offset + sizeof(struct ipv6hdr); + nexthdr = ip6->nexthdr; + } + + do { + struct ipv6_opt_hdr _hdr, *hp; + unsigned int hdrlen; + found = (nexthdr == target); + + if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) { + if (target < 0 || found) + break; + return -ENOENT; + } + + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); + if (!hp) + return -EBADMSG; + + if (nexthdr == NEXTHDR_ROUTING) { + struct ipv6_rt_hdr _rh, *rh; + + rh = skb_header_pointer(skb, start, sizeof(_rh), + &_rh); + if (!rh) + return -EBADMSG; + + if (flags && (*flags & __KC_IP6_FH_F_SKIP_RH) && + rh->segments_left == 0) + found = false; + } + + if (nexthdr == NEXTHDR_FRAGMENT) { + unsigned short _frag_off; + __be16 *fp; + + if (flags) /* Indicate that this is a fragment */ + *flags |= __KC_IP6_FH_F_FRAG; + fp = skb_header_pointer(skb, + start+offsetof(struct frag_hdr, + frag_off), + sizeof(_frag_off), + &_frag_off); + if (!fp) + return -EBADMSG; + + _frag_off = ntohs(*fp) & ~0x7; + if (_frag_off) { + if (target < 0 && + ((!ipv6_ext_hdr(hp->nexthdr)) || + hp->nexthdr == NEXTHDR_NONE)) { + if (fragoff) + *fragoff = _frag_off; + return hp->nexthdr; + } + return -ENOENT; + } + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) { + if (flags && (*flags & __KC_IP6_FH_F_AUTH) && (target < 0)) + break; + hdrlen = (hp->hdrlen + 2) << 2; + } else + hdrlen = ipv6_optlen(hp); + + if (!found) { + nexthdr = hp->nexthdr; + start += hdrlen; + } + } while (!found); + + *offset = 
start; + return nexthdr; +} + +int __kc_pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, + int minvec, int maxvec) +{ + int nvec = maxvec; + int rc; + + if (maxvec < minvec) + return -ERANGE; + + do { + rc = pci_enable_msix(dev, entries, nvec); + if (rc < 0) { + return rc; + } else if (rc > 0) { + if (rc < minvec) + return -ENOSPC; + nvec = rc; + } + } while (rc); + + return nvec; +} +#endif /* 3.14.0 */ + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,15,0)) +char *_kc_devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) +{ + size_t size; + char *buf; + + if (!s) + return NULL; + + size = strlen(s) + 1; + buf = devm_kzalloc(dev, size, gfp); + if (buf) + memcpy(buf, s, size); + return buf; +} + +void __kc_netdev_rss_key_fill(void *buffer, size_t len) +{ + /* Set of random keys generated using kernel random number generator */ + static const u8 seed[NETDEV_RSS_KEY_LEN] = {0xE6, 0xFA, 0x35, 0x62, + 0x95, 0x12, 0x3E, 0xA3, 0xFB, 0x46, 0xC1, 0x5F, + 0xB1, 0x43, 0x82, 0x5B, 0x6A, 0x49, 0x50, 0x95, + 0xCD, 0xAB, 0xD8, 0x11, 0x8F, 0xC5, 0xBD, 0xBC, + 0x6A, 0x4A, 0xB2, 0xD4, 0x1F, 0xFE, 0xBC, 0x41, + 0xBF, 0xAC, 0xB2, 0x9A, 0x8F, 0x70, 0xE9, 0x2A, + 0xD7, 0xB2, 0x80, 0xB6, 0x5B, 0xAA, 0x9D, 0x20}; + + BUG_ON(len > NETDEV_RSS_KEY_LEN); + memcpy(buffer, seed, len); +} +#endif /* 3.15.0 */ + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0) ) +#ifdef HAVE_SET_RX_MODE +#ifdef NETDEV_HW_ADDR_T_UNICAST +int __kc_hw_addr_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, const unsigned char *), + int (*unsync)(struct net_device *, const unsigned char *)) +{ + struct netdev_hw_addr *ha, *tmp; + int err; + + /* first go through and flush out any stale entries */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) ) + if (!ha->synced || ha->refcount != 1) +#else + if (!ha->sync_cnt || ha->refcount != 1) +#endif + continue; + + if (unsync && unsync(dev, ha->addr)) + continue; + + list_del_rcu(&ha->list); + kfree_rcu(ha, rcu_head); + list->count--; + } + + /* go through and sync new entries to the list */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) ) + if (ha->synced) +#else + if (ha->sync_cnt) +#endif + continue; + + err = sync(dev, ha->addr); + if (err) + return err; +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) ) + ha->synced = true; +#else + ha->sync_cnt++; +#endif + ha->refcount++; + } + + return 0; +} + +void __kc_hw_addr_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, const unsigned char *)) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) ) + if (!ha->synced) +#else + if (!ha->sync_cnt) +#endif + continue; + + if (unsync && unsync(dev, ha->addr)) + continue; + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) ) + ha->synced = false; +#else + ha->sync_cnt--; +#endif + if (--ha->refcount) + continue; + + list_del_rcu(&ha->list); + kfree_rcu(ha, rcu_head); + list->count--; + } +} + +#endif /* NETDEV_HW_ADDR_T_UNICAST */ +#ifndef NETDEV_HW_ADDR_T_MULTICAST +int __kc_dev_addr_sync_dev(struct dev_addr_list **list, int *count, + struct net_device *dev, + int (*sync)(struct net_device *, const unsigned char *), + int (*unsync)(struct net_device *, const unsigned char *)) +{ + struct dev_addr_list *da, **next = list; + int err; + + /* first go through 
and flush out any stale entries */ + while ((da = *next) != NULL) { + if (da->da_synced && da->da_users == 1) { + if (!unsync || !unsync(dev, da->da_addr)) { + *next = da->next; + kfree(da); + (*count)--; + continue; + } + } + next = &da->next; + } + + /* go through and sync new entries to the list */ + for (da = *list; da != NULL; da = da->next) { + if (da->da_synced) + continue; + + err = sync(dev, da->da_addr); + if (err) + return err; + + da->da_synced++; + da->da_users++; + } + + return 0; +} + +void __kc_dev_addr_unsync_dev(struct dev_addr_list **list, int *count, + struct net_device *dev, + int (*unsync)(struct net_device *, const unsigned char *)) +{ + struct dev_addr_list *da; + + while ((da = *list) != NULL) { + if (da->da_synced) { + if (!unsync || !unsync(dev, da->da_addr)) { + da->da_synced--; + if (--da->da_users == 0) { + *list = da->next; + kfree(da); + (*count)--; + continue; + } + } + } + list = &da->next; + } +} +#endif /* NETDEV_HW_ADDR_T_MULTICAST */ +#endif /* HAVE_SET_RX_MODE */ +void *__kc_devm_kmemdup(struct device *dev, const void *src, size_t len, + gfp_t gfp) +{ + void *p; + + p = devm_kzalloc(dev, len, gfp); + if (p) + memcpy(p, src, len); + + return p; +} +#endif /* 3.16.0 */ + +/******************************************************************************/ +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,5))) +int _kc_param_set_ullong(const char *val, const struct kernel_param *kp) +{ + return kstrtoull(val, 0, (unsigned long long *)kp->arg); +} +int _kc_param_get_ullong(char *buffer, const struct kernel_param *kp) +{ + return scnprintf(buffer, PAGE_SIZE, "%llu", + *((unsigned long long *)kp->arg)); +} +const struct kernel_param_ops _kc_param_ops_ullong = { + .set = _kc_param_set_ullong, + .get = _kc_param_get_ullong, +}; +#endif /* <3.17.0 && RHEL_RELEASE_CODE < RHEL7.5 */ + +/******************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) ) +static void __kc_sock_efree(struct sk_buff *skb) +{ + sock_put(skb->sk); +} + +struct sk_buff *__kc_skb_clone_sk(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct sk_buff *clone; + + if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt)) + return NULL; + + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) { + sock_put(sk); + return NULL; + } + + clone->sk = sk; + clone->destructor = __kc_sock_efree; + + return clone; +} + +void __kc_skb_complete_tx_timestamp(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock_exterr_skb *serr; + struct sock *sk = skb->sk; + int err; + + sock_hold(sk); + + *skb_hwtstamps(skb) = *hwtstamps; + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + + err = sock_queue_err_skb(sk, skb); + if (err) + kfree_skb(skb); + + sock_put(sk); +} + +/* include headers needed for get_headlen function */ +#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) +#include +#endif +#ifdef HAVE_SCTP +#include +#endif + +u32 __kc_eth_get_headlen(const struct net_device __always_unused *dev, + unsigned char *data, unsigned int max_len) +{ + union { + unsigned char *network; + /* l2 headers */ + struct ethhdr *eth; + struct vlan_hdr *vlan; + /* l3 headers */ + struct iphdr *ipv4; + struct ipv6hdr *ipv6; + } hdr; + __be16 proto; + u8 nexthdr = 0; /* default to not TCP */ + u8 hlen; + + /* this should never happen, but better safe than sorry */ + if (max_len < ETH_HLEN) + 
return max_len; + + /* initialize network frame pointer */ + hdr.network = data; + + /* set first protocol and move network header forward */ + proto = hdr.eth->h_proto; + hdr.network += ETH_HLEN; + +again: + switch (proto) { + /* handle any vlan tag if present */ + case __constant_htons(ETH_P_8021AD): + case __constant_htons(ETH_P_8021Q): + if ((hdr.network - data) > (max_len - VLAN_HLEN)) + return max_len; + + proto = hdr.vlan->h_vlan_encapsulated_proto; + hdr.network += VLAN_HLEN; + goto again; + /* handle L3 protocols */ + case __constant_htons(ETH_P_IP): + if ((hdr.network - data) > (max_len - sizeof(struct iphdr))) + return max_len; + + /* access ihl as a u8 to avoid unaligned access on ia64 */ + hlen = (hdr.network[0] & 0x0F) << 2; + + /* verify hlen meets minimum size requirements */ + if (hlen < sizeof(struct iphdr)) + return hdr.network - data; + + /* record next protocol if header is present */ + if (!(hdr.ipv4->frag_off & htons(IP_OFFSET))) + nexthdr = hdr.ipv4->protocol; + + hdr.network += hlen; + break; +#ifdef NETIF_F_TSO6 + case __constant_htons(ETH_P_IPV6): + if ((hdr.network - data) > (max_len - sizeof(struct ipv6hdr))) + return max_len; + + /* record next protocol */ + nexthdr = hdr.ipv6->nexthdr; + hdr.network += sizeof(struct ipv6hdr); + break; +#endif /* NETIF_F_TSO6 */ +#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) + case __constant_htons(ETH_P_FCOE): + hdr.network += FCOE_HEADER_LEN; + break; +#endif + default: + return hdr.network - data; + } + + /* finally sort out L4 */ + switch (nexthdr) { + case IPPROTO_TCP: + if ((hdr.network - data) > (max_len - sizeof(struct tcphdr))) + return max_len; + + /* access doff as a u8 to avoid unaligned access on ia64 */ + hdr.network += max_t(u8, sizeof(struct tcphdr), + (hdr.network[12] & 0xF0) >> 2); + + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + hdr.network += sizeof(struct udphdr); + break; +#ifdef HAVE_SCTP + case IPPROTO_SCTP: + hdr.network += sizeof(struct sctphdr); + break; +#endif + } + + /* + * If everything has gone correctly hdr.network should be the + * data section of the packet and will be the end of the header. + * If not then it probably represents the end of the last recognized + * header. + */ + return min_t(unsigned int, hdr.network - data, max_len); +} + +#endif /* < 3.18.0 */ + +/******************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) ) +#ifdef HAVE_NET_GET_RANDOM_ONCE +static u8 __kc_netdev_rss_key[NETDEV_RSS_KEY_LEN]; + +void __kc_netdev_rss_key_fill(void *buffer, size_t len) +{ + BUG_ON(len > sizeof(__kc_netdev_rss_key)); + net_get_random_once(__kc_netdev_rss_key, sizeof(__kc_netdev_rss_key)); + memcpy(buffer, __kc_netdev_rss_key, len); +} +#endif + +int _kc_bitmap_print_to_pagebuf(bool list, char *buf, + const unsigned long *maskp, + int nmaskbits) +{ + ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf - 2; + int n = 0; + + if (len > 1) { + n = list ? 
bitmap_scnlistprintf(buf, len, maskp, nmaskbits) : + bitmap_scnprintf(buf, len, maskp, nmaskbits); + buf[n++] = '\n'; + buf[n] = '\0'; + } + return n; +} +#endif + +/******************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0) ) +#if !((RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6,8) && RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0)) && \ + (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2)) && \ + (SLE_VERSION_CODE > SLE_VERSION(12,1,0))) +unsigned int _kc_cpumask_local_spread(unsigned int i, int node) +{ + int cpu; + + /* Wrap: we always want a cpu. */ + i %= num_online_cpus(); + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) ) + /* Kernels prior to 2.6.28 do not have for_each_cpu or + * cpumask_of_node, so just use for_each_online_cpu() + */ + for_each_online_cpu(cpu) + if (i-- == 0) + return cpu; + + return 0; +#else + if (node == -1) { + for_each_cpu(cpu, cpu_online_mask) + if (i-- == 0) + return cpu; + } else { + /* NUMA first. */ + for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask) + if (i-- == 0) + return cpu; + + for_each_cpu(cpu, cpu_online_mask) { + /* Skip NUMA nodes, done above. */ + if (cpumask_test_cpu(cpu, cpumask_of_node(node))) + continue; + + if (i-- == 0) + return cpu; + } + } +#endif /* KERNEL_VERSION >= 2.6.28 */ + BUG(); +} +#endif +#endif + +/******************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,3,0)) +#if (!(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) && \ + !(SLE_VERSION_CODE >= SLE_VERSION(12,2,0))) +/** + * _kc_skb_flow_dissect_flow_keys - parse SKB to fill _kc_flow_keys + * @skb: SKB used to fill _kc_flow_keys + * @flow: _kc_flow_keys to set with SKB fields + * @flags: currently unused flags + * + * The purpose of using kcompat for this function is so the caller doesn't have + * to care about which kernel version they are on, which prevents a larger than + * normal #ifdef mess created by using a HAVE_* flag for this case. This is also + * done for 4.2 kernels to simplify calling skb_flow_dissect_flow_keys() + * because in 4.2 kernels skb_flow_dissect_flow_keys() exists, but only has 2 + * arguments. Recent kernels have skb_flow_dissect_flow_keys() that has 3 + * arguments. + * + * The caller needs to understand that this function was only implemented as a + * bare-minimum replacement for recent versions of skb_flow_dissect_flow_keys() + * and this function is in no way similar to skb_flow_dissect_flow_keys(). An + * example use can be found in the ice driver, specifically ice_arfs.c. + * + * This function is treated as an allowlist of supported fields the SKB can + * parse. If new functionality is added make sure to keep this format (i.e. only + * check for fields that are explicitly wanted). + * + * Current allowlist: + * + * TCPv4, TCPv6, UDPv4, UDPv6 + * + * If any unexpected protocol or other field is found this function memsets the + * flow passed in back to 0 and returns false. Otherwise the flow is populated + * and true is returned.
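+ *
+ * A minimal usage sketch (build_arfs_rule_from() is purely illustrative and
+ * stands in for driver code that consumes the basic, addrs and ports
+ * members):
+ *
+ *	struct _kc_flow_keys keys;
+ *
+ *	if (_kc_skb_flow_dissect_flow_keys(skb, &keys, 0))
+ *		build_arfs_rule_from(&keys);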
+ */ +bool +_kc_skb_flow_dissect_flow_keys(const struct sk_buff *skb, + struct _kc_flow_keys *flow, + unsigned int __always_unused flags) +{ + memset(flow, 0, sizeof(*flow)); + + flow->basic.n_proto = skb->protocol; + switch (flow->basic.n_proto) { + case htons(ETH_P_IP): + flow->basic.ip_proto = ip_hdr(skb)->protocol; + flow->addrs.v4addrs.src = ip_hdr(skb)->saddr; + flow->addrs.v4addrs.dst = ip_hdr(skb)->daddr; + break; + case htons(ETH_P_IPV6): + flow->basic.ip_proto = ipv6_hdr(skb)->nexthdr; + memcpy(&flow->addrs.v6addrs.src, &ipv6_hdr(skb)->saddr, + sizeof(struct in6_addr)); + memcpy(&flow->addrs.v6addrs.dst, &ipv6_hdr(skb)->daddr, + sizeof(struct in6_addr)); + break; + default: + netdev_dbg(skb->dev, "%s: Unsupported/unimplemented layer 3 protocol %04x\n", __func__, htons(flow->basic.n_proto)); + goto unsupported; + } + + switch (flow->basic.ip_proto) { + case IPPROTO_TCP: + { + struct tcphdr *tcph; + + tcph = tcp_hdr(skb); + flow->ports.src = tcph->source; + flow->ports.dst = tcph->dest; + break; + } + case IPPROTO_UDP: + { + struct udphdr *udph; + + udph = udp_hdr(skb); + flow->ports.src = udph->source; + flow->ports.dst = udph->dest; + break; + } + default: + netdev_dbg(skb->dev, "%s: Unsupported/unimplemented layer 4 protocol %02x\n", __func__, flow->basic.ip_proto); + return false; + } + + return true; + +unsupported: + memset(flow, 0, sizeof(*flow)); + return false; +} +#endif /* ! >= RHEL7.4 && ! >= SLES12.2 */ +#endif /* 4.3.0 */ + +/******************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0) ) +#if (!(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3))) +#ifdef CONFIG_SPARC +#include +#include +#endif +int _kc_eth_platform_get_mac_address(struct device *dev __maybe_unused, + u8 *mac_addr __maybe_unused) +{ +#if (((LINUX_VERSION_CODE < KERNEL_VERSION(3,1,0)) && defined(CONFIG_OF) && \ + !defined(HAVE_STRUCT_DEVICE_OF_NODE) || !defined(CONFIG_OF)) && \ + !defined(CONFIG_SPARC)) + return -ENODEV; +#else + const unsigned char *addr; + struct device_node *dp; + + if (dev_is_pci(dev)) + dp = pci_device_to_OF_node(to_pci_dev(dev)); + else +#if defined(HAVE_STRUCT_DEVICE_OF_NODE) && defined(CONFIG_OF) + dp = dev->of_node; +#else + dp = NULL; +#endif + + addr = NULL; + if (dp) + addr = of_get_mac_address(dp); +#ifdef CONFIG_SPARC + /* Kernel hasn't implemented arch_get_platform_mac_address, but we + * should handle the SPARC case here since it was supported + * originally. This is replaced by arch_get_platform_mac_address() + * upstream. 
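+ *
+ * With this fallback in place the lookup order is: OF node derived from the
+ * PCI device (or dev->of_node), then the SPARC idprom, and finally -ENODEV
+ * when neither provides an address.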
+ */ + if (!addr) + addr = idprom->id_ethaddr; +#endif + if (!addr) + return -ENODEV; + + ether_addr_copy(mac_addr, addr); + return 0; +#endif +} +#endif /* !(RHEL_RELEASE >= 7.3) */ +#endif /* < 4.5.0 */ + +/*****************************************************************************/ +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0)) || \ + (SLE_VERSION_CODE && (SLE_VERSION_CODE <= SLE_VERSION(12,3,0))) || \ + (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7,5)))) +const char *_kc_phy_speed_to_str(int speed) +{ + switch (speed) { + case SPEED_10: + return "10Mbps"; + case SPEED_100: + return "100Mbps"; + case SPEED_1000: + return "1Gbps"; + case SPEED_2500: + return "2.5Gbps"; + case SPEED_5000: + return "5Gbps"; + case SPEED_10000: + return "10Gbps"; + case SPEED_14000: + return "14Gbps"; + case SPEED_20000: + return "20Gbps"; + case SPEED_25000: + return "25Gbps"; + case SPEED_40000: + return "40Gbps"; + case SPEED_50000: + return "50Gbps"; + case SPEED_56000: + return "56Gbps"; +#ifdef SPEED_100000 + case SPEED_100000: + return "100Gbps"; +#endif + case SPEED_UNKNOWN: + return "Unknown"; + default: + return "Unsupported (update phy-core.c)"; + } +} +#endif /* (LINUX < 4.14.0) || (SLES <= 12.3.0) || (RHEL <= 7.5) */ + +/******************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(4,15,0) ) +void _kc_ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst, + struct ethtool_link_ksettings *src) +{ + unsigned int size = BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS); + unsigned int idx = 0; + + for (; idx < size; idx++) { + dst->link_modes.supported[idx] &= + src->link_modes.supported[idx]; + dst->link_modes.advertising[idx] &= + src->link_modes.advertising[idx]; + } +} +#endif /* 4.15.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,16,0)) +#if !(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,0)) && \ + !(SLE_VERSION_CODE >= SLE_VERSION(12,5,0) && \ + SLE_VERSION_CODE < SLE_VERSION(15,0,0) || \ + SLE_VERSION_CODE >= SLE_VERSION(15,1,0)) +#if BITS_PER_LONG == 64 +/** + * bitmap_from_arr32 - copy the contents of u32 array of bits to bitmap + * @bitmap: array of unsigned longs, the destination bitmap + * @buf: array of u32 (in host byte order), the source bitmap + * @nbits: number of bits in @bitmap + */ +void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits) +{ + unsigned int i, halfwords; + + halfwords = DIV_ROUND_UP(nbits, 32); + for (i = 0; i < halfwords; i++) { + bitmap[i/2] = (unsigned long) buf[i]; + if (++i < halfwords) + bitmap[i/2] |= ((unsigned long) buf[i]) << 32; + } + + /* Clear tail bits in last word beyond nbits. */ + if (nbits % BITS_PER_LONG) + bitmap[(halfwords - 1) / 2] &= BITMAP_LAST_WORD_MASK(nbits); +} +#endif /* BITS_PER_LONG == 64 */ +#endif /* !(RHEL >= 8.0) && !(SLES >= 12.5 && SLES < 15.0 || SLES >= 15.1) */ +#endif /* 4.16.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,17,0)) +/* PCIe link information */ +#define PCIE_SPEED2STR(speed) \ + ((speed) == PCIE_SPEED_16_0GT ? "16 GT/s" : \ + (speed) == PCIE_SPEED_8_0GT ? "8 GT/s" : \ + (speed) == PCIE_SPEED_5_0GT ? "5 GT/s" : \ + (speed) == PCIE_SPEED_2_5GT ? "2.5 GT/s" : \ + "Unknown speed") + +/* PCIe speed to Mb/s reduced by encoding overhead */ +#define PCIE_SPEED2MBS_ENC(speed) \ + ((speed) == PCIE_SPEED_16_0GT ? 
16000*128/130 : \ + (speed) == PCIE_SPEED_8_0GT ? 8000*128/130 : \ + (speed) == PCIE_SPEED_5_0GT ? 5000*8/10 : \ + (speed) == PCIE_SPEED_2_5GT ? 2500*8/10 : \ + 0) + +static u32 +_kc_pcie_bandwidth_available(struct pci_dev *dev, + struct pci_dev **limiting_dev, + enum pci_bus_speed *speed, + enum pcie_link_width *width) +{ + u16 lnksta; + enum pci_bus_speed next_speed; + enum pcie_link_width next_width; + u32 bw, next_bw; + + if (speed) + *speed = PCI_SPEED_UNKNOWN; + if (width) + *width = PCIE_LNK_WIDTH_UNKNOWN; + + bw = 0; + + while (dev) { + pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta); + + next_speed = pcie_link_speed[lnksta & PCI_EXP_LNKSTA_CLS]; + next_width = (lnksta & PCI_EXP_LNKSTA_NLW) >> + PCI_EXP_LNKSTA_NLW_SHIFT; + + next_bw = next_width * PCIE_SPEED2MBS_ENC(next_speed); + + /* Check if current device limits the total bandwidth */ + if (!bw || next_bw <= bw) { + bw = next_bw; + + if (limiting_dev) + *limiting_dev = dev; + if (speed) + *speed = next_speed; + if (width) + *width = next_width; + } + + dev = pci_upstream_bridge(dev); + } + + return bw; +} + +static enum pci_bus_speed _kc_pcie_get_speed_cap(struct pci_dev *dev) +{ + u32 lnkcap2, lnkcap; + + /* + * PCIe r4.0 sec 7.5.3.18 recommends using the Supported Link + * Speeds Vector in Link Capabilities 2 when supported, falling + * back to Max Link Speed in Link Capabilities otherwise. + */ + pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &lnkcap2); + if (lnkcap2) { /* PCIe r3.0-compliant */ + if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_16_0GB) + return PCIE_SPEED_16_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB) + return PCIE_SPEED_8_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_5_0GB) + return PCIE_SPEED_5_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_2_5GB) + return PCIE_SPEED_2_5GT; + return PCI_SPEED_UNKNOWN; + } + + pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap); + if (lnkcap) { + if (lnkcap & PCI_EXP_LNKCAP_SLS_16_0GB) + return PCIE_SPEED_16_0GT; + else if (lnkcap & PCI_EXP_LNKCAP_SLS_8_0GB) + return PCIE_SPEED_8_0GT; + else if (lnkcap & PCI_EXP_LNKCAP_SLS_5_0GB) + return PCIE_SPEED_5_0GT; + else if (lnkcap & PCI_EXP_LNKCAP_SLS_2_5GB) + return PCIE_SPEED_2_5GT; + } + + return PCI_SPEED_UNKNOWN; +} + +static enum pcie_link_width _kc_pcie_get_width_cap(struct pci_dev *dev) +{ + u32 lnkcap; + + pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap); + if (lnkcap) + return (lnkcap & PCI_EXP_LNKCAP_MLW) >> 4; + + return PCIE_LNK_WIDTH_UNKNOWN; +} + +static u32 +_kc_pcie_bandwidth_capable(struct pci_dev *dev, enum pci_bus_speed *speed, + enum pcie_link_width *width) +{ + *speed = _kc_pcie_get_speed_cap(dev); + *width = _kc_pcie_get_width_cap(dev); + + if (*speed == PCI_SPEED_UNKNOWN || *width == PCIE_LNK_WIDTH_UNKNOWN) + return 0; + + return *width * PCIE_SPEED2MBS_ENC(*speed); +} + +void _kc_pcie_print_link_status(struct pci_dev *dev) { + enum pcie_link_width width, width_cap; + enum pci_bus_speed speed, speed_cap; + struct pci_dev *limiting_dev = NULL; + u32 bw_avail, bw_cap; + + bw_cap = _kc_pcie_bandwidth_capable(dev, &speed_cap, &width_cap); + bw_avail = _kc_pcie_bandwidth_available(dev, &limiting_dev, &speed, + &width); + + if (bw_avail >= bw_cap) + pci_info(dev, "%u.%03u Gb/s available PCIe bandwidth (%s x%d link)\n", + bw_cap / 1000, bw_cap % 1000, + PCIE_SPEED2STR(speed_cap), width_cap); + else + pci_info(dev, "%u.%03u Gb/s available PCIe bandwidth, limited by %s x%d link at %s (capable of %u.%03u Gb/s with %s x%d link)\n", + bw_avail / 1000, bw_avail % 1000, + PCIE_SPEED2STR(speed), width, + 
limiting_dev ? pci_name(limiting_dev) : "", + bw_cap / 1000, bw_cap % 1000, + PCIE_SPEED2STR(speed_cap), width_cap); +} +#endif /* 4.17.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,1,0)) || (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,1))) +#ifdef HAVE_TC_SETUP_CLSFLOWER +#define FLOW_DISSECTOR_MATCH(__rule, __type, __out) \ + const struct flow_match *__m = &(__rule)->match; \ + struct flow_dissector *__d = (__m)->dissector; \ + \ + (__out)->key = skb_flow_dissector_target(__d, __type, (__m)->key); \ + (__out)->mask = skb_flow_dissector_target(__d, __type, (__m)->mask); \ + +void flow_rule_match_basic(const struct flow_rule *rule, + struct flow_match_basic *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_BASIC, out); +} + +void flow_rule_match_control(const struct flow_rule *rule, + struct flow_match_control *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CONTROL, out); +} + +void flow_rule_match_eth_addrs(const struct flow_rule *rule, + struct flow_match_eth_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ETH_ADDRS, out); +} + +#ifdef HAVE_TC_FLOWER_ENC +void flow_rule_match_enc_keyid(const struct flow_rule *rule, + struct flow_match_enc_keyid *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_KEYID, out); +} + +void flow_rule_match_enc_ports(const struct flow_rule *rule, + struct flow_match_ports *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_PORTS, out); +} + +void flow_rule_match_enc_control(const struct flow_rule *rule, + struct flow_match_control *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL, out); +} + +void flow_rule_match_enc_ipv4_addrs(const struct flow_rule *rule, + struct flow_match_ipv4_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, out); +} + +void flow_rule_match_enc_ipv6_addrs(const struct flow_rule *rule, + struct flow_match_ipv6_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, out); +} +#endif + +#ifndef HAVE_TC_FLOWER_VLAN_IN_TAGS +void flow_rule_match_vlan(const struct flow_rule *rule, + struct flow_match_vlan *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_VLAN, out); +} +#endif + +void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, + struct flow_match_ipv4_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV4_ADDRS, out); +} + +void flow_rule_match_ipv6_addrs(const struct flow_rule *rule, + struct flow_match_ipv6_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV6_ADDRS, out); +} + +void flow_rule_match_ports(const struct flow_rule *rule, + struct flow_match_ports *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PORTS, out); +} +#endif /* HAVE_TC_SETUP_CLSFLOWER */ +#endif /* 5.1.0 || (RHEL && RHEL < 8.1) */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,3,0)) +#if (!(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,2)))) +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +int _kc_flow_block_cb_setup_simple(struct flow_block_offload *f, + struct list_head __always_unused *driver_list, + tc_setup_cb_t *cb, + void *cb_ident, void *cb_priv, + bool ingress_only) +{ + if (ingress_only && + f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) + return -EOPNOTSUPP; + + /* Note: Upstream has driver_block_list, but older kernels do not */ + switch (f->command) { + case TC_BLOCK_BIND: 
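+		/* BIND registers the cb callback against the TC block (using
+		 * the extack variant where the kernel provides it); UNBIND
+		 * below simply unregisters the same callback.
+		 */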
+#ifdef HAVE_TCF_BLOCK_CB_REGISTER_EXTACK + return tcf_block_cb_register(f->block, cb, cb_ident, cb_priv, + f->extack); +#else + return tcf_block_cb_register(f->block, cb, cb_ident, cb_priv); +#endif + case TC_BLOCK_UNBIND: + tcf_block_cb_unregister(f->block, cb, cb_ident); + return 0; + default: + return -EOPNOTSUPP; + } +} +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ +#endif /* !RHEL >= 8.2 */ +#endif /* 5.3.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,7,0)) +u64 _kc_pci_get_dsn(struct pci_dev *dev) +{ + u32 dword; + u64 dsn; + int pos; + + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DSN); + if (!pos) + return 0; + + /* + * The Device Serial Number is two dwords offset 4 bytes from the + * capability position. The specification says that the first dword is + * the lower half, and the second dword is the upper half. + */ + pos += 4; + pci_read_config_dword(dev, pos, &dword); + dsn = (u64)dword; + pci_read_config_dword(dev, pos + 4, &dword); + dsn |= ((u64)dword) << 32; + + return dsn; +} +#endif /* 5.7.0 */ diff --git a/drivers/net/ethernet/intel/ice/kcompat.h b/drivers/net/ethernet/intel/ice/kcompat.h new file mode 100644 index 000000000000..6549382150a6 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat.h @@ -0,0 +1,3558 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _KCOMPAT_H_ +#define _KCOMPAT_H_ + +#ifndef LINUX_VERSION_CODE +#include +#else +#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c)) +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef GCC_VERSION +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) +#endif /* GCC_VERSION */ + +#ifndef IEEE_8021QAZ_APP_SEL_DSCP +#define IEEE_8021QAZ_APP_SEL_DSCP 5 +#endif + +/* Backport macros for controlling GCC diagnostics */ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(4,18,0) ) + +/* Compilers before gcc-4.6 do not understand "#pragma GCC diagnostic push" */ +#if GCC_VERSION >= 40600 +#define __diag_str1(s) #s +#define __diag_str(s) __diag_str1(s) +#define __diag(s) _Pragma(__diag_str(GCC diagnostic s)) +#else +#define __diag(s) +#endif /* GCC_VERSION >= 4.6 */ +#define __diag_push() __diag(push) +#define __diag_pop() __diag(pop) +#endif /* LINUX_VERSION < 4.18.0 */ + +#ifndef NSEC_PER_MSEC +#define NSEC_PER_MSEC 1000000L +#endif +#include +/* UTS_RELEASE is in a different header starting in kernel 2.6.18 */ +#ifndef UTS_RELEASE +/* utsrelease.h changed locations in 2.6.33 */ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) ) +#include +#else +#include +#endif +#endif + + +#define adapter_struct ice_pf +#define adapter_q_vector ice_q_vector + + +/* Dynamic LTR and deeper C-State support disable/enable */ + +/* packet split disable/enable */ +#ifdef DISABLE_PACKET_SPLIT +#endif /* DISABLE_PACKET_SPLIT */ + +/* MSI compatibility code for all kernels and drivers */ +#ifdef DISABLE_PCI_MSI +#undef CONFIG_PCI_MSI +#endif +#ifndef CONFIG_PCI_MSI +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8) ) +struct msix_entry { + u16 vector; /* kernel uses to write allocated vector */ + u16 entry; /* driver uses to specify entry, OS writes */ +}; +#endif +#undef pci_enable_msi +#define pci_enable_msi(a) -ENOTSUPP +#undef 
pci_disable_msi +#define pci_disable_msi(a) do {} while (0) +#undef pci_enable_msix +#define pci_enable_msix(a, b, c) -ENOTSUPP +#undef pci_disable_msix +#define pci_disable_msix(a) do {} while (0) +#define msi_remove_pci_irq_vectors(a) do {} while (0) +#endif /* CONFIG_PCI_MSI */ +#ifdef DISABLE_PM +#undef CONFIG_PM +#endif + +#ifdef DISABLE_NET_POLL_CONTROLLER +#undef CONFIG_NET_POLL_CONTROLLER +#endif + +#ifndef PMSG_SUSPEND +#define PMSG_SUSPEND 3 +#endif + +/* generic boolean compatibility */ +#undef TRUE +#undef FALSE +#define TRUE true +#define FALSE false +#ifdef GCC_VERSION +#if ( GCC_VERSION < 3000 ) +#define _Bool char +#endif +#else +#define _Bool char +#endif + + +#undef __always_unused +#define __always_unused __attribute__((__unused__)) + +#undef __maybe_unused +#define __maybe_unused __attribute__((__unused__)) + +/* kernels less than 2.4.14 don't have this */ +#ifndef ETH_P_8021Q +#define ETH_P_8021Q 0x8100 +#endif + +#ifndef module_param +#define module_param(v,t,p) MODULE_PARM(v, "i"); +#endif + +#ifndef DMA_64BIT_MASK +#define DMA_64BIT_MASK 0xffffffffffffffffULL +#endif + +#ifndef DMA_32BIT_MASK +#define DMA_32BIT_MASK 0x00000000ffffffffULL +#endif + +#ifndef PCI_CAP_ID_EXP +#define PCI_CAP_ID_EXP 0x10 +#endif + +#ifndef uninitialized_var +#define uninitialized_var(x) x = x +#endif + +#ifndef PCIE_LINK_STATE_L0S +#define PCIE_LINK_STATE_L0S 1 +#endif +#ifndef PCIE_LINK_STATE_L1 +#define PCIE_LINK_STATE_L1 2 +#endif + +#ifndef SET_NETDEV_DEV +#define SET_NETDEV_DEV(net, pdev) +#endif + +#if !defined(HAVE_FREE_NETDEV) && ( LINUX_VERSION_CODE < KERNEL_VERSION(3,1,0) ) +#define free_netdev(x) kfree(x) +#endif + +#ifdef HAVE_POLL_CONTROLLER +#define CONFIG_NET_POLL_CONTROLLER +#endif + +#ifndef SKB_DATAREF_SHIFT +/* if we do not have the infrastructure to detect if skb_header is cloned + just return false in all cases */ +#define skb_header_cloned(x) 0 +#endif + +#ifndef NETIF_F_GSO +#define gso_size tso_size +#define gso_segs tso_segs +#endif + +#ifndef NETIF_F_GRO +#define vlan_gro_receive(_napi, _vlgrp, _vlan, _skb) \ + vlan_hwaccel_receive_skb(_skb, _vlgrp, _vlan) +#define napi_gro_receive(_napi, _skb) netif_receive_skb(_skb) +#endif + +#ifndef NETIF_F_SCTP_CSUM +#define NETIF_F_SCTP_CSUM 0 +#endif + +#ifndef NETIF_F_LRO +#define NETIF_F_LRO BIT(15) +#endif + +#ifndef NETIF_F_NTUPLE +#define NETIF_F_NTUPLE BIT(27) +#endif + +#ifndef NETIF_F_ALL_FCOE +#define NETIF_F_ALL_FCOE (NETIF_F_FCOE_CRC | NETIF_F_FCOE_MTU | \ + NETIF_F_FSO) +#endif + +#ifndef IPPROTO_SCTP +#define IPPROTO_SCTP 132 +#endif + +#ifndef IPPROTO_UDPLITE +#define IPPROTO_UDPLITE 136 +#endif + +#ifndef CHECKSUM_PARTIAL +#define CHECKSUM_PARTIAL CHECKSUM_HW +#define CHECKSUM_COMPLETE CHECKSUM_HW +#endif + +#ifndef __read_mostly +#define __read_mostly +#endif + +#ifndef MII_RESV1 +#define MII_RESV1 0x17 /* Reserved... 
*/ +#endif + +#ifndef unlikely +#define unlikely(_x) _x +#define likely(_x) _x +#endif + +#ifndef WARN_ON +#define WARN_ON(x) ({0;}) +#endif + +#ifndef PCI_DEVICE +#define PCI_DEVICE(vend,dev) \ + .vendor = (vend), .device = (dev), \ + .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID +#endif + +#ifndef node_online +#define node_online(node) ((node) == 0) +#endif + +#ifndef _LINUX_RANDOM_H +#include +#endif + +#ifndef BITS_PER_TYPE +#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) +#endif + +#ifndef BITS_TO_LONGS +#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG) +#endif + +#ifndef DECLARE_BITMAP +#define DECLARE_BITMAP(name,bits) long name[BITS_TO_LONGS(bits)] +#endif + +#ifndef VLAN_HLEN +#define VLAN_HLEN 4 +#endif + +#ifndef VLAN_ETH_HLEN +#define VLAN_ETH_HLEN 18 +#endif + +#ifndef VLAN_ETH_FRAME_LEN +#define VLAN_ETH_FRAME_LEN 1518 +#endif + +#ifndef DCA_GET_TAG_TWO_ARGS +#define dca3_get_tag(a,b) dca_get_tag(b) +#endif + +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +#if defined(__i386__) || defined(__x86_64__) +#define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +#endif +#endif + +/* taken from 2.6.24 definition in linux/kernel.h */ +#ifndef IS_ALIGNED +#define IS_ALIGNED(x,a) (((x) % ((typeof(x))(a))) == 0) +#endif + +#ifdef IS_ENABLED +#undef IS_ENABLED +#undef __ARG_PLACEHOLDER_1 +#undef config_enabled +#undef _config_enabled +#undef __config_enabled +#undef ___config_enabled +#endif + +#define __ARG_PLACEHOLDER_1 0, +#define config_enabled(cfg) _config_enabled(cfg) +#ifdef __CHECKER__ +/* cppcheck-suppress preprocessorErrorDirective */ +#endif /* __CHECKER__ */ +#define _config_enabled(value) __config_enabled(__ARG_PLACEHOLDER_##value) +#define __config_enabled(arg1_or_junk) ___config_enabled(arg1_or_junk 1, 0) +#define ___config_enabled(__ignored, val, ...) 
val + +#define IS_ENABLED(option) \ + (config_enabled(option) || config_enabled(option##_MODULE)) + +#if !defined(NETIF_F_HW_VLAN_TX) && !defined(NETIF_F_HW_VLAN_CTAG_TX) +struct _kc_vlan_ethhdr { + unsigned char h_dest[ETH_ALEN]; + unsigned char h_source[ETH_ALEN]; + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; +#define vlan_ethhdr _kc_vlan_ethhdr +struct _kc_vlan_hdr { + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; +#define vlan_hdr _kc_vlan_hdr +#define vlan_tx_tag_present(_skb) 0 +#define vlan_tx_tag_get(_skb) 0 +#endif /* NETIF_F_HW_VLAN_TX && NETIF_F_HW_VLAN_CTAG_TX */ + +#ifndef VLAN_PRIO_SHIFT +#define VLAN_PRIO_SHIFT 13 +#endif + +#ifndef PCI_EXP_LNKSTA_CLS_2_5GB +#define PCI_EXP_LNKSTA_CLS_2_5GB 0x0001 +#endif + +#ifndef PCI_EXP_LNKSTA_CLS_5_0GB +#define PCI_EXP_LNKSTA_CLS_5_0GB 0x0002 +#endif + +#ifndef PCI_EXP_LNKSTA_CLS_8_0GB +#define PCI_EXP_LNKSTA_CLS_8_0GB 0x0003 +#endif + +#ifndef PCI_EXP_LNKSTA_NLW_X1 +#define PCI_EXP_LNKSTA_NLW_X1 0x0010 +#endif + +#ifndef PCI_EXP_LNKSTA_NLW_X2 +#define PCI_EXP_LNKSTA_NLW_X2 0x0020 +#endif + +#ifndef PCI_EXP_LNKSTA_NLW_X4 +#define PCI_EXP_LNKSTA_NLW_X4 0x0040 +#endif + +#ifndef PCI_EXP_LNKSTA_NLW_X8 +#define PCI_EXP_LNKSTA_NLW_X8 0x0080 +#endif + + +#ifndef __GFP_COLD +#define __GFP_COLD 0 +#endif + +#ifndef __GFP_COMP +#define __GFP_COMP 0 +#endif + +#ifndef IP_OFFSET +#define IP_OFFSET 0x1FFF /* "Fragment Offset" part */ +#endif + +/*****************************************************************************/ +/* Installations with ethtool version without eeprom, adapter id, or statistics + * support */ + +#ifndef ETH_GSTRING_LEN +#define ETH_GSTRING_LEN 32 +#endif + +#ifndef ETHTOOL_GSTATS +#define ETHTOOL_GSTATS 0x1d +#undef ethtool_drvinfo +#define ethtool_drvinfo k_ethtool_drvinfo +struct k_ethtool_drvinfo { + u32 cmd; + char driver[32]; + char version[32]; + char fw_version[32]; + char bus_info[32]; + char reserved1[32]; + char reserved2[16]; + u32 n_stats; + u32 testinfo_len; + u32 eedump_len; + u32 regdump_len; +}; + +struct ethtool_stats { + u32 cmd; + u32 n_stats; + u64 data[0]; +}; +#endif /* ETHTOOL_GSTATS */ + +#ifndef ETHTOOL_PHYS_ID +#define ETHTOOL_PHYS_ID 0x1c +#endif /* ETHTOOL_PHYS_ID */ + +#ifndef ETHTOOL_GSTRINGS +#define ETHTOOL_GSTRINGS 0x1b +enum ethtool_stringset { + ETH_SS_TEST = 0, + ETH_SS_STATS, +}; +struct ethtool_gstrings { + u32 cmd; /* ETHTOOL_GSTRINGS */ + u32 string_set; /* string set id e.c. 
ETH_SS_TEST, etc*/ + u32 len; /* number of strings in the string set */ + u8 data[0]; +}; +#endif /* ETHTOOL_GSTRINGS */ + +#ifndef ETHTOOL_TEST +#define ETHTOOL_TEST 0x1a +enum ethtool_test_flags { + ETH_TEST_FL_OFFLINE = BIT(0), + ETH_TEST_FL_FAILED = BIT(1), +}; +struct ethtool_test { + u32 cmd; + u32 flags; + u32 reserved; + u32 len; + u64 data[0]; +}; +#endif /* ETHTOOL_TEST */ + +#ifndef ETHTOOL_GEEPROM +#define ETHTOOL_GEEPROM 0xb +#undef ETHTOOL_GREGS +struct ethtool_eeprom { + u32 cmd; + u32 magic; + u32 offset; + u32 len; + u8 data[0]; +}; + +struct ethtool_value { + u32 cmd; + u32 data; +}; +#endif /* ETHTOOL_GEEPROM */ + +#ifndef ETHTOOL_GLINK +#define ETHTOOL_GLINK 0xa +#endif /* ETHTOOL_GLINK */ + +#ifndef ETHTOOL_GWOL +#define ETHTOOL_GWOL 0x5 +#define ETHTOOL_SWOL 0x6 +#define SOPASS_MAX 6 +struct ethtool_wolinfo { + u32 cmd; + u32 supported; + u32 wolopts; + u8 sopass[SOPASS_MAX]; /* SecureOn(tm) password */ +}; +#endif /* ETHTOOL_GWOL */ + +#ifndef ETHTOOL_GREGS +#define ETHTOOL_GREGS 0x00000004 /* Get NIC registers */ +#define ethtool_regs _kc_ethtool_regs +/* for passing big chunks of data */ +struct _kc_ethtool_regs { + u32 cmd; + u32 version; /* driver-specific, indicates different chips/revs */ + u32 len; /* bytes */ + u8 data[0]; +}; +#endif /* ETHTOOL_GREGS */ + +#ifndef ETHTOOL_GMSGLVL +#define ETHTOOL_GMSGLVL 0x00000007 /* Get driver message level */ +#endif +#ifndef ETHTOOL_SMSGLVL +#define ETHTOOL_SMSGLVL 0x00000008 /* Set driver msg level, priv. */ +#endif +#ifndef ETHTOOL_NWAY_RST +#define ETHTOOL_NWAY_RST 0x00000009 /* Restart autonegotiation, priv */ +#endif +#ifndef ETHTOOL_GLINK +#define ETHTOOL_GLINK 0x0000000a /* Get link status */ +#endif +#ifndef ETHTOOL_GEEPROM +#define ETHTOOL_GEEPROM 0x0000000b /* Get EEPROM data */ +#endif +#ifndef ETHTOOL_SEEPROM +#define ETHTOOL_SEEPROM 0x0000000c /* Set EEPROM data */ +#endif +#ifndef ETHTOOL_GCOALESCE +#define ETHTOOL_GCOALESCE 0x0000000e /* Get coalesce config */ +/* for configuring coalescing parameters of chip */ +#define ethtool_coalesce _kc_ethtool_coalesce +struct _kc_ethtool_coalesce { + u32 cmd; /* ETHTOOL_{G,S}COALESCE */ + + /* How many usecs to delay an RX interrupt after + * a packet arrives. If 0, only rx_max_coalesced_frames + * is used. + */ + u32 rx_coalesce_usecs; + + /* How many packets to delay an RX interrupt after + * a packet arrives. If 0, only rx_coalesce_usecs is + * used. It is illegal to set both usecs and max frames + * to zero as this would cause RX interrupts to never be + * generated. + */ + u32 rx_max_coalesced_frames; + + /* Same as above two parameters, except that these values + * apply while an IRQ is being serviced by the host. Not + * all cards support this feature and the values are ignored + * in that case. + */ + u32 rx_coalesce_usecs_irq; + u32 rx_max_coalesced_frames_irq; + + /* How many usecs to delay a TX interrupt after + * a packet is sent. If 0, only tx_max_coalesced_frames + * is used. + */ + u32 tx_coalesce_usecs; + + /* How many packets to delay a TX interrupt after + * a packet is sent. If 0, only tx_coalesce_usecs is + * used. It is illegal to set both usecs and max frames + * to zero as this would cause TX interrupts to never be + * generated. + */ + u32 tx_max_coalesced_frames; + + /* Same as above two parameters, except that these values + * apply while an IRQ is being serviced by the host. Not + * all cards support this feature and the values are ignored + * in that case. 
+ */ + u32 tx_coalesce_usecs_irq; + u32 tx_max_coalesced_frames_irq; + + /* How many usecs to delay in-memory statistics + * block updates. Some drivers do not have an in-memory + * statistic block, and in such cases this value is ignored. + * This value must not be zero. + */ + u32 stats_block_coalesce_usecs; + + /* Adaptive RX/TX coalescing is an algorithm implemented by + * some drivers to improve latency under low packet rates and + * improve throughput under high packet rates. Some drivers + * only implement one of RX or TX adaptive coalescing. Anything + * not implemented by the driver causes these values to be + * silently ignored. + */ + u32 use_adaptive_rx_coalesce; + u32 use_adaptive_tx_coalesce; + + /* When the packet rate (measured in packets per second) + * is below pkt_rate_low, the {rx,tx}_*_low parameters are + * used. + */ + u32 pkt_rate_low; + u32 rx_coalesce_usecs_low; + u32 rx_max_coalesced_frames_low; + u32 tx_coalesce_usecs_low; + u32 tx_max_coalesced_frames_low; + + /* When the packet rate is below pkt_rate_high but above + * pkt_rate_low (both measured in packets per second) the + * normal {rx,tx}_* coalescing parameters are used. + */ + + /* When the packet rate is (measured in packets per second) + * is above pkt_rate_high, the {rx,tx}_*_high parameters are + * used. + */ + u32 pkt_rate_high; + u32 rx_coalesce_usecs_high; + u32 rx_max_coalesced_frames_high; + u32 tx_coalesce_usecs_high; + u32 tx_max_coalesced_frames_high; + + /* How often to do adaptive coalescing packet rate sampling, + * measured in seconds. Must not be zero. + */ + u32 rate_sample_interval; +}; +#endif /* ETHTOOL_GCOALESCE */ + +#ifndef ETHTOOL_SCOALESCE +#define ETHTOOL_SCOALESCE 0x0000000f /* Set coalesce config. */ +#endif +#ifndef ETHTOOL_GRINGPARAM +#define ETHTOOL_GRINGPARAM 0x00000010 /* Get ring parameters */ +/* for configuring RX/TX ring parameters */ +#define ethtool_ringparam _kc_ethtool_ringparam +struct _kc_ethtool_ringparam { + u32 cmd; /* ETHTOOL_{G,S}RINGPARAM */ + + /* Read only attributes. These indicate the maximum number + * of pending RX/TX ring entries the driver will allow the + * user to set. + */ + u32 rx_max_pending; + u32 rx_mini_max_pending; + u32 rx_jumbo_max_pending; + u32 tx_max_pending; + + /* Values changeable by the user. The valid values are + * in the range 1 to the "*_max_pending" counterpart above. + */ + u32 rx_pending; + u32 rx_mini_pending; + u32 rx_jumbo_pending; + u32 tx_pending; +}; +#endif /* ETHTOOL_GRINGPARAM */ + +#ifndef ETHTOOL_SRINGPARAM +#define ETHTOOL_SRINGPARAM 0x00000011 /* Set ring parameters, priv. */ +#endif +#ifndef ETHTOOL_GPAUSEPARAM +#define ETHTOOL_GPAUSEPARAM 0x00000012 /* Get pause parameters */ +/* for configuring link flow control parameters */ +#define ethtool_pauseparam _kc_ethtool_pauseparam +struct _kc_ethtool_pauseparam { + u32 cmd; /* ETHTOOL_{G,S}PAUSEPARAM */ + + /* If the link is being auto-negotiated (via ethtool_cmd.autoneg + * being true) the user may set 'autoneg' here non-zero to have the + * pause parameters be auto-negotiated too. In such a case, the + * {rx,tx}_pause values below determine what capabilities are + * advertised. + * + * If 'autoneg' is zero or the link is not being auto-negotiated, + * then {rx,tx}_pause force the driver to use/not-use pause + * flow control. + */ + u32 autoneg; + u32 rx_pause; + u32 tx_pause; +}; +#endif /* ETHTOOL_GPAUSEPARAM */ + +#ifndef ETHTOOL_SPAUSEPARAM +#define ETHTOOL_SPAUSEPARAM 0x00000013 /* Set pause parameters. 
*/ +#endif +#ifndef ETHTOOL_GRXCSUM +#define ETHTOOL_GRXCSUM 0x00000014 /* Get RX hw csum enable (ethtool_value) */ +#endif +#ifndef ETHTOOL_SRXCSUM +#define ETHTOOL_SRXCSUM 0x00000015 /* Set RX hw csum enable (ethtool_value) */ +#endif +#ifndef ETHTOOL_GTXCSUM +#define ETHTOOL_GTXCSUM 0x00000016 /* Get TX hw csum enable (ethtool_value) */ +#endif +#ifndef ETHTOOL_STXCSUM +#define ETHTOOL_STXCSUM 0x00000017 /* Set TX hw csum enable (ethtool_value) */ +#endif +#ifndef ETHTOOL_GSG +#define ETHTOOL_GSG 0x00000018 /* Get scatter-gather enable + * (ethtool_value) */ +#endif +#ifndef ETHTOOL_SSG +#define ETHTOOL_SSG 0x00000019 /* Set scatter-gather enable + * (ethtool_value). */ +#endif +#ifndef ETHTOOL_TEST +#define ETHTOOL_TEST 0x0000001a /* execute NIC self-test, priv. */ +#endif +#ifndef ETHTOOL_GSTRINGS +#define ETHTOOL_GSTRINGS 0x0000001b /* get specified string set */ +#endif +#ifndef ETHTOOL_PHYS_ID +#define ETHTOOL_PHYS_ID 0x0000001c /* identify the NIC */ +#endif +#ifndef ETHTOOL_GSTATS +#define ETHTOOL_GSTATS 0x0000001d /* get NIC-specific statistics */ +#endif +#ifndef ETHTOOL_GTSO +#define ETHTOOL_GTSO 0x0000001e /* Get TSO enable (ethtool_value) */ +#endif +#ifndef ETHTOOL_STSO +#define ETHTOOL_STSO 0x0000001f /* Set TSO enable (ethtool_value) */ +#endif + +#ifndef ETHTOOL_BUSINFO_LEN +#define ETHTOOL_BUSINFO_LEN 32 +#endif + +#ifndef WAKE_FILTER +#define WAKE_FILTER BIT(7) +#endif + +#ifndef SPEED_2500 +#define SPEED_2500 2500 +#endif +#ifndef SPEED_5000 +#define SPEED_5000 5000 +#endif +#ifndef SPEED_14000 +#define SPEED_14000 14000 +#endif +#ifndef SPEED_25000 +#define SPEED_25000 25000 +#endif +#ifndef SPEED_50000 +#define SPEED_50000 50000 +#endif +#ifndef SPEED_56000 +#define SPEED_56000 56000 +#endif +#ifndef SPEED_100000 +#define SPEED_100000 100000 +#endif + +#ifndef RHEL_RELEASE_VERSION +#define RHEL_RELEASE_VERSION(a,b) (((a) << 8) + (b)) +#endif +#ifndef AX_RELEASE_VERSION +#define AX_RELEASE_VERSION(a,b) (((a) << 8) + (b)) +#endif + +#ifndef AX_RELEASE_CODE +#define AX_RELEASE_CODE 0 +#endif + +#if (AX_RELEASE_CODE && AX_RELEASE_CODE == AX_RELEASE_VERSION(3,0)) +#define RHEL_RELEASE_CODE RHEL_RELEASE_VERSION(5,0) +#elif (AX_RELEASE_CODE && AX_RELEASE_CODE == AX_RELEASE_VERSION(3,1)) +#define RHEL_RELEASE_CODE RHEL_RELEASE_VERSION(5,1) +#elif (AX_RELEASE_CODE && AX_RELEASE_CODE == AX_RELEASE_VERSION(3,2)) +#define RHEL_RELEASE_CODE RHEL_RELEASE_VERSION(5,3) +#endif + +#ifndef RHEL_RELEASE_CODE +/* NOTE: RHEL_RELEASE_* introduced in RHEL4.5 */ +#define RHEL_RELEASE_CODE 0 +#endif + +/* RHEL 7 didn't backport the parameter change in + * create_singlethread_workqueue. + * If/when RH corrects this we will want to tighten up the version check. + */ +#if (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,0)) +#undef create_singlethread_workqueue +#define create_singlethread_workqueue(name) \ + alloc_ordered_workqueue("%s", WQ_MEM_RECLAIM, name) +#endif + +/* Ubuntu Release ABI is the 4th digit of their kernel version. You can find + * it in /usr/src/linux/$(uname -r)/include/generated/utsrelease.h for new + * enough versions of Ubuntu. Otherwise you can simply see it in the output of + * uname as the 4th digit of the kernel. The UTS_UBUNTU_RELEASE_ABI is not in + * the linux-source package, but in the linux-headers package. It begins to + * appear in later releases of 14.04 and 14.10. 
+ * + * Ex: + * + * $uname -r + * 3.13.0-45-generic + * ABI is 45 + * + * + * $uname -r + * 3.16.0-23-generic + * ABI is 23 + */ +#ifndef UTS_UBUNTU_RELEASE_ABI +#define UTS_UBUNTU_RELEASE_ABI 0 +#define UBUNTU_VERSION_CODE 0 +#else +/* Ubuntu does not provide actual release version macro, so we use the kernel + * version plus the ABI to generate a unique version code specific to Ubuntu. + * In addition, we mask the lower 8 bits of LINUX_VERSION_CODE in order to + * ignore differences in sublevel which are not important since we have the + * ABI value. Otherwise, it becomes impossible to correlate ABI to version for + * ordering checks. + * + * This also lets us store an ABI value up to 65535, since it can take the + * space that would use the lower byte of the Linux version code. + */ +#define UBUNTU_VERSION_CODE (((~0xFF & LINUX_VERSION_CODE) << 8) + \ + UTS_UBUNTU_RELEASE_ABI) + +#if UTS_UBUNTU_RELEASE_ABI > 65535 +#error UTS_UBUNTU_RELEASE_ABI is larger than 65535... +#endif /* UTS_UBUNTU_RELEASE_ABI > 65535 */ + +#if ( LINUX_VERSION_CODE <= KERNEL_VERSION(3,0,0) ) +/* Our version code scheme does not make sense for non 3.x or newer kernels, + * and we have no support in kcompat for this scenario. Thus, treat this as a + * non-Ubuntu kernel. Possibly might be better to error here. + */ +#define UTS_UBUNTU_RELEASE_ABI 0 +#define UBUNTU_VERSION_CODE 0 +#endif /* <= 3.0.0 */ +#endif /* !UTS_UBUNTU_RELEASE_ABI */ + +/* We ignore the 3rd digit since we want to give precedence to the additional + * ABI value provided by Ubuntu. + */ +#define UBUNTU_VERSION(a,b,c,d) (((a) << 24) + ((b) << 16) + (d)) + +/* SLE_VERSION is used to generate a 3-digit encoding that can order SLE + * kernels based on their major release, service pack, and a possible + * maintenance release. + */ +#define SLE_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c)) + +/* The SLE_LOCALVERSION_CODE comes from a 3-digit code added as part of the + * Linux kernel version. It is extracted by the driver Makefile. This macro is + * used to generate codes for making comparisons below. + */ +#define SLE_LOCALVERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c)) + +#ifdef CONFIG_SUSE_KERNEL +/* Starting since at least SLE 12sp4 and SLE 15, the SUSE kernels have + * provided CONFIG_SUSE_VERSION, CONFIG_SUSE_PATCHLEVEL and + * CONFIG_SUSE_AUXRELEASE. Use these to generate SLE_VERSION if available. + * Only fall back to the manual table otherwise. We expect all future versions + * of SLE kernels to include these values, so the table will remain only for + * the older releases. + */ +#ifdef CONFIG_SUSE_VERSION +#ifndef CONFIG_SUSE_PATCHLEVEL +#error "CONFIG_SUSE_VERSION exists but CONFIG_SUSE_PATCHLEVEL is missing" +#endif +#ifndef CONFIG_SUSE_AUXRELEASE +#error "CONFIG_SUSE_VERSION exists but CONFIG_SUSE_AUXRELEASE is missing" +#endif +#define SLE_VERSION_CODE SLE_VERSION(CONFIG_SUSE_VERSION, CONFIG_SUSE_PATCHLEVEL, CONFIG_SUSE_AUXRELEASE) +#else +/* If we do not have the CONFIG_SUSE_VERSION configuration values, fall back + * to the following table for older releases. 
+ */ +#if ( LINUX_VERSION_CODE == KERNEL_VERSION(2,6,27) ) +/* SLES11 GA is 2.6.27 based */ +#define SLE_VERSION_CODE SLE_VERSION(11,0,0) +#elif ( LINUX_VERSION_CODE == KERNEL_VERSION(2,6,32) ) +/* SLES11 SP1 is 2.6.32 based */ +#define SLE_VERSION_CODE SLE_VERSION(11,1,0) +#elif ( LINUX_VERSION_CODE == KERNEL_VERSION(3,0,13) ) +/* SLES11 SP2 GA is 3.0.13-0.27 */ +#define SLE_VERSION_CODE SLE_VERSION(11,2,0) +#elif ((LINUX_VERSION_CODE == KERNEL_VERSION(3,0,76))) +/* SLES11 SP3 GA is 3.0.76-0.11 */ +#define SLE_VERSION_CODE SLE_VERSION(11,3,0) +#elif (LINUX_VERSION_CODE == KERNEL_VERSION(3,0,101)) + #if (SLE_LOCALVERSION_CODE < SLE_LOCALVERSION(0,8,0)) + /* some SLES11sp2 update kernels up to 3.0.101-0.7.x */ + #define SLE_VERSION_CODE SLE_VERSION(11,2,0) + #elif (SLE_LOCALVERSION_CODE < SLE_LOCALVERSION(63,0,0)) + /* most SLES11sp3 update kernels */ + #define SLE_VERSION_CODE SLE_VERSION(11,3,0) + #else + /* SLES11 SP4 GA (3.0.101-63) and update kernels 3.0.101-63+ */ + #define SLE_VERSION_CODE SLE_VERSION(11,4,0) + #endif +#elif (LINUX_VERSION_CODE == KERNEL_VERSION(3,12,28)) +/* SLES12 GA is 3.12.28-4 + * kernel updates 3.12.xx-<33 through 52>[.yy] */ +#define SLE_VERSION_CODE SLE_VERSION(12,0,0) +#elif (LINUX_VERSION_CODE == KERNEL_VERSION(3,12,49)) +/* SLES12 SP1 GA is 3.12.49-11 + * updates 3.12.xx-60.yy where xx={51..} */ +#define SLE_VERSION_CODE SLE_VERSION(12,1,0) +#elif ((LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,21) && \ + (LINUX_VERSION_CODE <= KERNEL_VERSION(4,4,59))) || \ + (LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,74) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0) && \ + SLE_LOCALVERSION_CODE >= KERNEL_VERSION(92,0,0) && \ + SLE_LOCALVERSION_CODE < KERNEL_VERSION(93,0,0))) +/* SLES12 SP2 GA is 4.4.21-69. + * SLES12 SP2 updates before SLES12 SP3 are: 4.4.{21,38,49,59} + * SLES12 SP2 updates after SLES12 SP3 are: 4.4.{74,90,103,114,120} + * but they all use a SLE_LOCALVERSION_CODE matching 92.nn.y */ +#define SLE_VERSION_CODE SLE_VERSION(12,2,0) +#elif ((LINUX_VERSION_CODE == KERNEL_VERSION(4,4,73) || \ + LINUX_VERSION_CODE == KERNEL_VERSION(4,4,82) || \ + LINUX_VERSION_CODE == KERNEL_VERSION(4,4,92)) || \ + (LINUX_VERSION_CODE == KERNEL_VERSION(4,4,103) && \ + (SLE_LOCALVERSION_CODE == KERNEL_VERSION(6,33,0) || \ + SLE_LOCALVERSION_CODE == KERNEL_VERSION(6,38,0))) || \ + (LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,114) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0) && \ + SLE_LOCALVERSION_CODE >= KERNEL_VERSION(94,0,0) && \ + SLE_LOCALVERSION_CODE < KERNEL_VERSION(95,0,0)) ) +/* SLES12 SP3 GM is 4.4.73-5 and update kernels are 4.4.82-6.3. + * SLES12 SP3 updates not conflicting with SP2 are: 4.4.{82,92} + * SLES12 SP3 updates conflicting with SP2 are: + * - 4.4.103-6.33.1, 4.4.103-6.38.1 + * - 4.4.{114,120}-94.nn.y */ +#define SLE_VERSION_CODE SLE_VERSION(12,3,0) +#else +#error "This looks like a SUSE kernel, but it has an unrecognized local version code." +#endif /* LINUX_VERSION_CODE == KERNEL_VERSION(x,y,z) */ +#endif /* !CONFIG_SUSE_VERSION */ +#endif /* CONFIG_SUSE_KERNEL */ +#ifndef SLE_VERSION_CODE +#define SLE_VERSION_CODE 0 +#endif /* SLE_VERSION_CODE */ +#ifndef SLE_LOCALVERSION_CODE +#define SLE_LOCALVERSION_CODE 0 +#endif /* SLE_LOCALVERSION_CODE */ + +/* + * Include the definitions file for HAVE/NEED flags for the standard upstream + * kernels. + * + * Then, based on the distribution we detect, load the distribution specific + * definitions file that customizes the definitions for the target + * distribution. 
+ */ +#include "kcompat_std_defs.h" + +#ifdef CONFIG_SUSE_KERNEL +#include "kcompat_sles_defs.h" +#elif UBUNTU_VERSION_CODE +#include "kcompat_ubuntu_defs.h" +#elif RHEL_RELEASE_CODE +#include "kcompat_rhel_defs.h" +#endif + +/* + * ADQ depends on __TC_MQPRIO_MODE_MAX and related kernel code + * added around 4.15. Some distributions (e.g. Oracle Linux 7.7) + * have done a partial back-port of that to their kernels based + * on older mainline kernels that did not include all the necessary + * kernel enablement to support ADQ. + * Undefine __TC_MQPRIO_MODE_MAX for all OSV distributions with + * kernels based on mainline kernels older than 4.15 except for + * RHEL, SLES and Ubuntu which are known to have good back-ports. + */ +#if (!RHEL_RELEASE_CODE && !SLE_VERSION_CODE && !UBUNTU_VERSION_CODE) + #if (LINUX_VERSION_CODE < KERNEL_VERSION(4,15,0)) + #undef __TC_MQPRIO_MODE_MAX + #endif /* LINUX_VERSION_CODE == KERNEL_VERSION(4,15,0) */ +#endif /* if (NOT RHEL && NOT SLES && NOT UBUNTU) */ + + +#ifdef __KLOCWORK__ + */ +#ifdef ARRAY_SIZE +#undef ARRAY_SIZE +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#endif + +#define memcpy(dest, src, len) memcpy_s(dest, len, src, len) +#define memset(dest, ch, len) memset_s(dest, len, ch, len) + +static inline int _kc_test_and_clear_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + unsigned long old; + unsigned long flags = 0; + + _atomic_spin_lock_irqsave(p, flags); + old = *p; + *p = old & ~mask; + _atomic_spin_unlock_irqrestore(p, flags); + + return (old & mask) != 0; +} +#define test_and_clear_bit(nr, addr) _kc_test_and_clear_bit(nr, addr) + +static inline int _kc_test_and_set_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + unsigned long old; + unsigned long flags = 0; + + _atomic_spin_lock_irqsave(p, flags); + old = *p; + *p = old | mask; + _atomic_spin_unlock_irqrestore(p, flags); + + return (old & mask) != 0; +} +#define test_and_set_bit(nr, addr) _kc_test_and_set_bit(nr, addr) + +#ifdef CONFIG_DYNAMIC_DEBUG +#undef dev_dbg +#define dev_dbg(dev, format, arg...) dev_printk(KERN_DEBUG, dev, format, ##arg) +#undef pr_debug +#define pr_debug(format, arg...) printk(KERN_DEBUG format, ##arg) +#endif /* CONFIG_DYNAMIC_DEBUG */ + + +#undef hlist_for_each_entry_safe +#define hlist_for_each_entry_safe(pos, n, head, member) \ + for (n = NULL, pos = hlist_entry_safe((head)->first, typeof(*(pos)), \ + member); \ + pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +#ifdef uninitialized_var +#undef uninitialized_var +#define uninitialized_var(x) x = *(&(x)) +#endif + +#ifdef WRITE_ONCE +#undef WRITE_ONCE +#define WRITE_ONCE(x, val) ((x) = (val)) +#endif /* WRITE_ONCE */ + +#ifdef wait_event_interruptible_timeout +#undef wait_event_interruptible_timeout +#define wait_event_interruptible_timeout(wq_head, condition, timeout) ({ \ + long ret; \ + if ((condition)) \ + ret = timeout; \ + else \ + ret = 0; \ + ret; \ +}) +#endif /* wait_event_interruptible_timeout */ + +#ifdef max_t +#undef max_t +#define max_t(type, x, y) ({ \ +type __x = (x); \ +type __y = (y); \ +__x > __y ? __x : __y; \ +}) +#endif /* max_t */ + +#ifdef min_t +#undef min_t +#define min_t(type, x, y) ({ \ +type __x = (x); \ +type __y = (y); \ +__x < __y ? 
__x : __y; \ +}) +#endif /* min_t */ +#endif /* __KLOCWORK__ */ + + +/* Older versions of GCC will trigger -Wformat-nonliteral warnings for const + * char * strings. Unfortunately, the implementation of do_trace_printk does + * this, in order to add a storage attribute to the memory. This was fixed in + * GCC 5.1, but we still use older distributions built with GCC 4.x. + * + * The string pointer is only passed as a const char * to the __trace_bprintk + * function. Since that function has the __printf attribute, it will trigger + * the warnings. We can't remove the attribute, so instead we'll use the + * __diag macro to disable -Wformat-nonliteral around the call to + * __trace_bprintk. + */ +#if GCC_VERSION < 50100 +#define __trace_bprintk(ip, fmt, args...) ({ \ + int err; \ + __diag_push(); \ + __diag(ignored "-Wformat-nonliteral"); \ + err = __trace_bprintk(ip, fmt, ##args); \ + __diag_pop(); \ + err; \ +}) +#endif /* GCC_VERSION < 5.1.0 */ + +/* Newer kernels removed */ +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(5,4,0)) && \ + (!(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,3)) && \ + !(SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(15,3,0))))) +#define HAVE_PCI_ASPM_H +#endif + +#include +#include +#include +#include +#define HAVE_SET_RX_MODE +#define HAVE_STRUCT_DEVICE_OF_NODE +#define HAVE_BRIDGE_FILTER + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) ) +#ifndef NAPI_POLL_WEIGHT +#define NAPI_POLL_WEIGHT 64 +#endif +#ifdef CONFIG_PCI_IOV +int __kc_pci_vfs_assigned(struct pci_dev *dev); +#else +static inline int __kc_pci_vfs_assigned(struct pci_dev __always_unused *dev) +{ + return 0; +} +#endif +#define pci_vfs_assigned(dev) __kc_pci_vfs_assigned(dev) + +#ifndef list_first_entry_or_null +#define list_first_entry_or_null(ptr, type, member) \ + (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL) +#endif + +#ifndef VLAN_TX_COOKIE_MAGIC +static inline struct sk_buff *__kc__vlan_hwaccel_put_tag(struct sk_buff *skb, + u16 vlan_tci) +{ +#ifdef VLAN_TAG_PRESENT + vlan_tci |= VLAN_TAG_PRESENT; +#endif + skb->vlan_tci = vlan_tci; + return skb; +} +#define __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci) \ + __kc__vlan_hwaccel_put_tag(skb, vlan_tci) +#endif + +#ifdef HAVE_FDB_OPS +#if defined(HAVE_NDO_FDB_ADD_NLATTR) +int __kc_ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, u16 flags); +#elif defined(USE_CONST_DEV_UC_CHAR) +int __kc_ndo_dflt_fdb_add(struct ndmsg *ndm, struct net_device *dev, + const unsigned char *addr, u16 flags); +#else +int __kc_ndo_dflt_fdb_add(struct ndmsg *ndm, struct net_device *dev, + unsigned char *addr, u16 flags); +#endif /* HAVE_NDO_FDB_ADD_NLATTR */ +#if defined(HAVE_FDB_DEL_NLATTR) +int __kc_ndo_dflt_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr); +#elif defined(USE_CONST_DEV_UC_CHAR) +int __kc_ndo_dflt_fdb_del(struct ndmsg *ndm, struct net_device *dev, + const unsigned char *addr); +#else +int __kc_ndo_dflt_fdb_del(struct ndmsg *ndm, struct net_device *dev, + unsigned char *addr); +#endif /* HAVE_FDB_DEL_NLATTR */ +#define ndo_dflt_fdb_add __kc_ndo_dflt_fdb_add +#define ndo_dflt_fdb_del __kc_ndo_dflt_fdb_del +#endif /* HAVE_FDB_OPS */ + +#ifndef PCI_DEVID +#define PCI_DEVID(bus, devfn) ((((u16)(bus)) << 8) | (devfn)) +#endif + +/* The definitions for these functions when CONFIG_OF_NET is defined are + * pulled in from . 
For kernels older than 3.5 we already have + * backports for when CONFIG_OF_NET is true. These are separated and + * duplicated in order to cover all cases so that all kernels get either the + * real definitions (when CONFIG_OF_NET is defined) or the stub definitions + * (when CONFIG_OF_NET is not defined, or the kernel is too old to have real + * definitions). + */ +#ifndef CONFIG_OF_NET +static inline int of_get_phy_mode(struct device_node __always_unused *np) +{ + return -ENODEV; +} + +static inline const void * +of_get_mac_address(struct device_node __always_unused *np) +{ + return NULL; +} +#endif + +#else /* >= 3.10.0 */ +#define HAVE_ENCAP_TSO_OFFLOAD +#define USE_DEFAULT_FDB_DEL_DUMP +#define HAVE_SKB_INNER_NETWORK_HEADER + +#if (RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,0))) +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,0)) +#define HAVE_RHEL7_PCI_DRIVER_RH +#if (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) +#define HAVE_RHEL7_PCI_RESET_NOTIFY +#endif /* RHEL >= 7.2 */ +#if (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3)) +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,5)) +#define HAVE_GENEVE_RX_OFFLOAD +#endif /* RHEL < 7.5 */ +#define HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC +#define HAVE_RHEL7_NET_DEVICE_OPS_EXT +#if !defined(HAVE_UDP_ENC_TUNNEL) && IS_ENABLED(CONFIG_GENEVE) +#define HAVE_UDP_ENC_TUNNEL +#endif /* !HAVE_UDP_ENC_TUNNEL && CONFIG_GENEVE */ +#endif /* RHEL >= 7.3 */ + +/* new hooks added to net_device_ops_extended in RHEL7.4 */ +#if (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) +#define HAVE_RHEL7_NETDEV_OPS_EXT_NDO_SET_VF_VLAN +#define HAVE_RHEL7_NETDEV_OPS_EXT_NDO_UDP_TUNNEL +#define HAVE_UDP_ENC_RX_OFFLOAD +#endif /* RHEL >= 7.4 */ +#else /* RHEL >= 8.0 */ +#define HAVE_TCF_BLOCK_CB_REGISTER_EXTACK +#define NO_NETDEV_BPF_PROG_ATTACHED +#define HAVE_NDO_SELECT_QUEUE_SB_DEV +#endif /* RHEL >= 8.0 */ +#endif /* RHEL >= 7.0 */ +#endif /* >= 3.10.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0) ) +#define netdev_notifier_info_to_dev(ptr) ptr +#ifndef time_in_range64 +#define time_in_range64(a, b, c) \ + (time_after_eq64(a, b) && \ + time_before_eq64(a, c)) +#endif /* time_in_range64 */ +#if ((RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,6)) ||\ + (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(11,4,0))) +#define HAVE_NDO_SET_VF_LINK_STATE +#endif +#if RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2)) +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK +#endif +#else /* >= 3.11.0 */ +#define HAVE_NDO_SET_VF_LINK_STATE +#define HAVE_SKB_INNER_PROTOCOL +#define HAVE_MPLS_FEATURES +#endif /* >= 3.11.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,12,0) ) +int __kc_pcie_get_minimum_link(struct pci_dev *dev, enum pci_bus_speed *speed, + enum pcie_link_width *width); +#ifndef pcie_get_minimum_link +#define pcie_get_minimum_link(_p, _s, _w) __kc_pcie_get_minimum_link(_p, _s, _w) +#endif + +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(6,7)) +int _kc_pci_wait_for_pending_transaction(struct pci_dev *dev); +#define pci_wait_for_pending_transaction _kc_pci_wait_for_pending_transaction +#endif /* = 3.12.0 */ +#if ( SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK +#endif +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) ) +#define HAVE_VXLAN_RX_OFFLOAD +#if 
!defined(HAVE_UDP_ENC_TUNNEL) && IS_ENABLED(CONFIG_VXLAN) +#define HAVE_UDP_ENC_TUNNEL +#endif +#endif /* < 4.8.0 */ +#define HAVE_NDO_GET_PHYS_PORT_ID +#define HAVE_NETIF_SET_XPS_QUEUE_CONST_MASK +#endif /* >= 3.12.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) ) +#define dma_set_mask_and_coherent(_p, _m) __kc_dma_set_mask_and_coherent(_p, _m) +int __kc_dma_set_mask_and_coherent(struct device *dev, u64 mask); +#ifndef u64_stats_init +#define u64_stats_init(a) do { } while(0) +#endif +#undef BIT_ULL +#define BIT_ULL(n) (1ULL << (n)) + +#if (!(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,0))) +static inline struct pci_dev *pci_upstream_bridge(struct pci_dev *dev) +{ + dev = pci_physfn(dev); + if (pci_is_root_bus(dev->bus)) + return NULL; + + return dev->bus->self; +} +#endif + +#if (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,1,0)) +#undef HAVE_STRUCT_PAGE_PFMEMALLOC +#define HAVE_DCBNL_OPS_SETAPP_RETURN_INT +#endif +#ifndef list_next_entry +#define list_next_entry(pos, member) \ + list_entry((pos)->member.next, typeof(*(pos)), member) +#endif +#ifndef list_prev_entry +#define list_prev_entry(pos, member) \ + list_entry((pos)->member.prev, typeof(*(pos)), member) +#endif + +#if ( LINUX_VERSION_CODE > KERNEL_VERSION(2,6,20) ) +#define devm_kcalloc(dev, cnt, size, flags) \ + devm_kzalloc(dev, (cnt) * (size), flags) +#endif /* > 2.6.20 */ + +#if (!(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2))) +#define list_last_entry(ptr, type, member) list_entry((ptr)->prev, type, member) +#endif + +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0)) +bool _kc_pci_device_is_present(struct pci_dev *pdev); +#define pci_device_is_present _kc_pci_device_is_present +#endif /* = 3.13.0 */ +#define HAVE_VXLAN_CHECKS +#if (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,24)) +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK +#else +#define HAVE_NDO_SELECT_QUEUE_ACCEL +#endif +#define HAVE_HWMON_DEVICE_REGISTER_WITH_GROUPS +#endif + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0) ) + +#ifndef U16_MAX +#define U16_MAX ((u16)~0U) +#endif + +#ifndef U32_MAX +#define U32_MAX ((u32)~0U) +#endif + +#ifndef U64_MAX +#define U64_MAX ((u64)~0ULL) +#endif + +#if (!(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2))) +#define dev_consume_skb_any(x) dev_kfree_skb_any(x) +#define dev_consume_skb_irq(x) dev_kfree_skb_irq(x) +#endif + +#if (!(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,0)) && \ + !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0))) + +/* it isn't expected that this would be a #define unless we made it so */ +#ifndef skb_set_hash + +#define PKT_HASH_TYPE_NONE 0 +#define PKT_HASH_TYPE_L2 1 +#define PKT_HASH_TYPE_L3 2 +#define PKT_HASH_TYPE_L4 3 + +enum _kc_pkt_hash_types { + _KC_PKT_HASH_TYPE_NONE = PKT_HASH_TYPE_NONE, + _KC_PKT_HASH_TYPE_L2 = PKT_HASH_TYPE_L2, + _KC_PKT_HASH_TYPE_L3 = PKT_HASH_TYPE_L3, + _KC_PKT_HASH_TYPE_L4 = PKT_HASH_TYPE_L4, +}; +#define pkt_hash_types _kc_pkt_hash_types + +#define skb_set_hash __kc_skb_set_hash +static inline void __kc_skb_set_hash(struct sk_buff __maybe_unused *skb, + u32 __maybe_unused hash, + int __maybe_unused type) +{ +#ifdef HAVE_SKB_L4_RXHASH + skb->l4_rxhash = (type == PKT_HASH_TYPE_L4); +#endif +#ifdef NETIF_F_RXHASH + skb->rxhash 
= hash; +#endif +} +#endif /* !skb_set_hash */ + +#else /* RHEL_RELEASE_CODE >= 7.0 || SLE_VERSION_CODE >= 12.0 */ + +#if ((RHEL_RELEASE_CODE && RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7,0)) ||\ + (SLE_VERSION_CODE && SLE_VERSION_CODE <= SLE_VERSION(12,1,0))) +/* GPLv2 code taken from 5.10-rc2 kernel source include/linux/pci.h, Copyright + * original authors. + */ +static inline int pci_enable_msix_exact(struct pci_dev *dev, + struct msix_entry *entries, int nvec) +{ + int rc = pci_enable_msix_range(dev, entries, nvec, nvec); + if (rc < 0) + return rc; + return 0; +} +#endif /* <=EL7.0 || <=SLES 12.1 */ +#if (!(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5))) +#ifndef HAVE_VXLAN_RX_OFFLOAD +#define HAVE_VXLAN_RX_OFFLOAD +#endif /* HAVE_VXLAN_RX_OFFLOAD */ +#endif + +#if !defined(HAVE_UDP_ENC_TUNNEL) && IS_ENABLED(CONFIG_VXLAN) +#define HAVE_UDP_ENC_TUNNEL +#endif + +#ifndef HAVE_VXLAN_CHECKS +#define HAVE_VXLAN_CHECKS +#endif /* HAVE_VXLAN_CHECKS */ +#endif /* !(RHEL_RELEASE_CODE >= 7.0 && SLE_VERSION_CODE >= 12.0) */ + +#if ((RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3)) ||\ + (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0))) +#define HAVE_NDO_DFWD_OPS +#endif + +#ifndef pci_enable_msix_range +int __kc_pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, + int minvec, int maxvec); +#define pci_enable_msix_range __kc_pci_enable_msix_range +#endif + +#ifndef ether_addr_copy +#define ether_addr_copy __kc_ether_addr_copy +static inline void __kc_ether_addr_copy(u8 *dst, const u8 *src) +{ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) + *(u32 *)dst = *(const u32 *)src; + *(u16 *)(dst + 4) = *(const u16 *)(src + 4); +#else + u16 *a = (u16 *)dst; + const u16 *b = (const u16 *)src; + + a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; +#endif +} +#endif /* ether_addr_copy */ +int __kc_ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, + int target, unsigned short *fragoff, int *flags); +#define ipv6_find_hdr(a, b, c, d, e) __kc_ipv6_find_hdr((a), (b), (c), (d), (e)) + +#ifndef OPTIMIZE_HIDE_VAR +#ifdef __GNUC__ +#define OPTIMIZER_HIDE_VAR(var) __asm__ ("" : "=r" (var) : "0" (var)) +#else +#include +#define OPTIMIZE_HIDE_VAR(var) barrier() +#endif +#endif + +#if (!(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,0)) && \ + !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(10,4,0))) +static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) +{ +#ifdef NETIF_F_RXHASH + return skb->rxhash; +#else + return 0; +#endif /* NETIF_F_RXHASH */ +} +#endif /* !RHEL > 5.9 && !SLES >= 10.4 */ + +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,5)) +#define request_firmware_direct request_firmware +#endif /* !RHEL || RHEL < 7.5 */ + +#else /* >= 3.14.0 */ + +/* for ndo_dfwd_ ops add_station, del_station and _start_xmit */ +#ifndef HAVE_NDO_DFWD_OPS +#define HAVE_NDO_DFWD_OPS +#endif +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK +#endif /* 3.14.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,15,0) ) +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35) ) +#define HAVE_SKBUFF_RXHASH +#endif /* >= 2.6.35 */ +#if (!(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,1)) && \ + !(UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,30))) +#define u64_stats_fetch_begin_irq u64_stats_fetch_begin_bh +#define u64_stats_fetch_retry_irq u64_stats_fetch_retry_bh +#endif + +char *_kc_devm_kstrdup(struct 
device *dev, const char *s, gfp_t gfp); +#define devm_kstrdup(dev, s, gfp) _kc_devm_kstrdup(dev, s, gfp) + +#else /* >= 3.15.0 */ +#define HAVE_NET_GET_RANDOM_ONCE +#define HAVE_PTP_1588_CLOCK_PINS +#define HAVE_NETDEV_PORT +#endif /* 3.15.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0) ) +#ifndef smp_mb__before_atomic +#define smp_mb__before_atomic() smp_mb() +#define smp_mb__after_atomic() smp_mb() +#endif +#ifndef __dev_uc_sync +#ifdef HAVE_SET_RX_MODE +#ifdef NETDEV_HW_ADDR_T_UNICAST +int __kc_hw_addr_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, const unsigned char *), + int (*unsync)(struct net_device *, const unsigned char *)); +void __kc_hw_addr_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, const unsigned char *)); +#endif +#ifndef NETDEV_HW_ADDR_T_MULTICAST +int __kc_dev_addr_sync_dev(struct dev_addr_list **list, int *count, + struct net_device *dev, + int (*sync)(struct net_device *, const unsigned char *), + int (*unsync)(struct net_device *, const unsigned char *)); +void __kc_dev_addr_unsync_dev(struct dev_addr_list **list, int *count, + struct net_device *dev, + int (*unsync)(struct net_device *, const unsigned char *)); +#endif +#endif /* HAVE_SET_RX_MODE */ + +static inline int __kc_dev_uc_sync(struct net_device __maybe_unused *dev, + int __maybe_unused (*sync)(struct net_device *, const unsigned char *), + int __maybe_unused (*unsync)(struct net_device *, const unsigned char *)) +{ +#ifdef NETDEV_HW_ADDR_T_UNICAST + return __kc_hw_addr_sync_dev(&dev->uc, dev, sync, unsync); +#elif defined(HAVE_SET_RX_MODE) + return __kc_dev_addr_sync_dev(&dev->uc_list, &dev->uc_count, + dev, sync, unsync); +#else + return 0; +#endif +} +#define __dev_uc_sync __kc_dev_uc_sync + +static inline void __kc_dev_uc_unsync(struct net_device __maybe_unused *dev, + int __maybe_unused (*unsync)(struct net_device *, const unsigned char *)) +{ +#ifdef HAVE_SET_RX_MODE +#ifdef NETDEV_HW_ADDR_T_UNICAST + __kc_hw_addr_unsync_dev(&dev->uc, dev, unsync); +#else /* NETDEV_HW_ADDR_T_MULTICAST */ + __kc_dev_addr_unsync_dev(&dev->uc_list, &dev->uc_count, dev, unsync); +#endif /* NETDEV_HW_ADDR_T_UNICAST */ +#endif /* HAVE_SET_RX_MODE */ +} +#define __dev_uc_unsync __kc_dev_uc_unsync + +static inline int __kc_dev_mc_sync(struct net_device __maybe_unused *dev, + int __maybe_unused (*sync)(struct net_device *, const unsigned char *), + int __maybe_unused (*unsync)(struct net_device *, const unsigned char *)) +{ +#ifdef NETDEV_HW_ADDR_T_MULTICAST + return __kc_hw_addr_sync_dev(&dev->mc, dev, sync, unsync); +#elif defined(HAVE_SET_RX_MODE) + return __kc_dev_addr_sync_dev(&dev->mc_list, &dev->mc_count, + dev, sync, unsync); +#else + return 0; +#endif + +} +#define __dev_mc_sync __kc_dev_mc_sync + +static inline void __kc_dev_mc_unsync(struct net_device __maybe_unused *dev, + int __maybe_unused (*unsync)(struct net_device *, const unsigned char *)) +{ +#ifdef HAVE_SET_RX_MODE +#ifdef NETDEV_HW_ADDR_T_MULTICAST + __kc_hw_addr_unsync_dev(&dev->mc, dev, unsync); +#else /* NETDEV_HW_ADDR_T_MULTICAST */ + __kc_dev_addr_unsync_dev(&dev->mc_list, &dev->mc_count, dev, unsync); +#endif /* NETDEV_HW_ADDR_T_MULTICAST */ +#endif /* HAVE_SET_RX_MODE */ +} +#define __dev_mc_unsync __kc_dev_mc_unsync +#endif /* __dev_uc_sync */ + +#if RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,1)) +#define 
HAVE_NDO_SET_VF_MIN_MAX_TX_RATE +#endif + +#ifndef NETIF_F_GSO_UDP_TUNNEL_CSUM +/* if someone backports this, hopefully they backport as a #define. + * declare it as zero on older kernels so that if it get's or'd in + * it won't effect anything, therefore preventing core driver changes + */ +#define NETIF_F_GSO_UDP_TUNNEL_CSUM 0 +#define SKB_GSO_UDP_TUNNEL_CSUM 0 +#endif +void *__kc_devm_kmemdup(struct device *dev, const void *src, size_t len, + gfp_t gfp); +#define devm_kmemdup __kc_devm_kmemdup + +#else +#if ( ( LINUX_VERSION_CODE < KERNEL_VERSION(4,13,0) ) && \ + ! ( SLE_VERSION_CODE && ( SLE_VERSION_CODE >= SLE_VERSION(12,4,0)) ) ) +#define HAVE_PCI_ERROR_HANDLER_RESET_NOTIFY +#endif /* >= 3.16.0 && < 4.13.0 && !(SLES >= 12sp4) */ +#define HAVE_NDO_SET_VF_MIN_MAX_TX_RATE +#endif /* 3.16.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0) ) +#if !(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,8) && \ + RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0)) && \ + !(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) +#ifndef timespec64 +#define timespec64 timespec +static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) +{ + return ts; +} +static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) +{ + return ts64; +} +#define timespec64_equal timespec_equal +#define timespec64_compare timespec_compare +#define set_normalized_timespec64 set_normalized_timespec +#define timespec64_add_safe timespec_add_safe +#define timespec64_add timespec_add +#define timespec64_sub timespec_sub +#define timespec64_valid timespec_valid +#define timespec64_valid_strict timespec_valid_strict +#define timespec64_to_ns timespec_to_ns +#define ns_to_timespec64 ns_to_timespec +#define ktime_to_timespec64 ktime_to_timespec +#define ktime_get_ts64 ktime_get_ts +#define ktime_get_real_ts64 ktime_get_real_ts +#define timespec64_add_ns timespec_add_ns +#endif /* timespec64 */ +#endif /* !(RHEL6.8= RHEL_RELEASE_VERSION(6,8) && \ + RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0)) +static inline void ktime_get_real_ts64(struct timespec64 *ts) +{ + *ts = ktime_to_timespec64(ktime_get_real()); +} + +static inline void ktime_get_ts64(struct timespec64 *ts) +{ + *ts = ktime_to_timespec64(ktime_get()); +} +#endif + +#if !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) +#define hlist_add_behind(_a, _b) hlist_add_after(_b, _a) +#endif + +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,5)) +#define param_ops_ullong _kc_param_ops_ullong +extern const struct kernel_param_ops _kc_param_ops_ullong; +#define param_set_ullong _kc_param_set_ullong +int _kc_param_set_ullong(const char *val, const struct kernel_param *kp); +#define param_get_ullong _kc_param_get_ullong +int _kc_param_get_ullong(char *buffer, const struct kernel_param *kp); +#define param_check_ullong(name, p) __param_check(name, p, unsigned long long) +#endif /* RHEL_RELEASE_CODE < RHEL7.5 */ + +#if RHEL_RELEASE_CODE && \ + RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6,3) && \ + RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,3) +static inline u64 ktime_get_ns(void) +{ + return ktime_to_ns(ktime_get()); +} + +static inline u64 ktime_get_real_ns(void) +{ + return ktime_to_ns(ktime_get_real()); +} + +static inline u64 ktime_get_boot_ns(void) +{ + return ktime_to_ns(ktime_get_boottime()); +} +#endif /* RHEL < 7.3 */ + +#else +#define HAVE_DCBNL_OPS_SETAPP_RETURN_INT +#include +#define HAVE_RHASHTABLE +#endif /* 3.17.0 */ + 
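For illustration only (this sketch is not part of the patch): the 3.17.0 block above aliases struct timespec64 and the ktime_get_*_ts64() helpers onto their older 32-bit counterparts, so time-handling code in the driver can be written once against the 64-bit API and still build on pre-3.17 kernels. A minimal sketch under that assumption, using a hypothetical helper name:

#include <linux/ktime.h>

/* Hypothetical helper, shown only to illustrate the compat aliases: on
 * kernels older than 3.17 the #defines above turn struct timespec64 into
 * struct timespec and ktime_get_real_ts64() into ktime_get_real_ts(), so
 * this compiles unchanged on both sides of the compatibility boundary.
 */
static void example_read_wallclock(struct timespec64 *ts)
{
	ktime_get_real_ts64(ts);
}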
+/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) ) +#include +struct sk_buff *__kc_skb_clone_sk(struct sk_buff *skb); +void __kc_skb_complete_tx_timestamp(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps); +#define skb_clone_sk __kc_skb_clone_sk +#define skb_complete_tx_timestamp __kc_skb_complete_tx_timestamp +#if (!(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,2)))) +u32 __kc_eth_get_headlen(const struct net_device *dev, unsigned char *data, + unsigned int max_len); +#else +unsigned int __kc_eth_get_headlen(unsigned char *data, unsigned int max_len); +#endif /* !RHEL >= 8.2 */ + +#define eth_get_headlen __kc_eth_get_headlen +#ifndef ETH_P_XDSA +#define ETH_P_XDSA 0x00F8 +#endif +/* RHEL 7.1 backported csum_level, but SLES 12 and 12-SP1 did not */ +#if RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,1)) +#define HAVE_SKBUFF_CSUM_LEVEL +#endif /* >= RH 7.1 */ + +/* RHEL 7.3 backported xmit_more */ +#if (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3)) +#define HAVE_SKB_XMIT_MORE +#endif /* >= RH 7.3 */ + +#undef GENMASK +#define GENMASK(h, l) \ + (((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h)))) +#undef GENMASK_ULL +#define GENMASK_ULL(h, l) \ + (((~0ULL) << (l)) & (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h)))) + +#else /* 3.18.0 */ +#define HAVE_SKBUFF_CSUM_LEVEL +#define HAVE_SKB_XMIT_MORE +#define HAVE_SKB_INNER_PROTOCOL_TYPE +#endif /* 3.18.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,18,4) ) +#else +#define HAVE_NDO_FEATURES_CHECK +#endif /* 3.18.4 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,18,13) ) +#ifndef WRITE_ONCE +#define WRITE_ONCE(x, val) ({ ACCESS_ONCE(x) = (val); }) +#endif +#endif /* 3.18.13 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) ) +/* netdev_phys_port_id renamed to netdev_phys_item_id */ +#define netdev_phys_item_id netdev_phys_port_id + +static inline void _kc_napi_complete_done(struct napi_struct *napi, + int __always_unused work_done) { + napi_complete(napi); +} +/* don't use our backport if the distro kernels already have it */ +#if (SLE_VERSION_CODE && (SLE_VERSION_CODE < SLE_VERSION(12,3,0))) || \ + (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,5))) +#define napi_complete_done _kc_napi_complete_done +#endif + +int _kc_bitmap_print_to_pagebuf(bool list, char *buf, + const unsigned long *maskp, int nmaskbits); +#define bitmap_print_to_pagebuf _kc_bitmap_print_to_pagebuf + +#ifndef NETDEV_RSS_KEY_LEN +#define NETDEV_RSS_KEY_LEN (13 * 4) +#endif +#if (!(RHEL_RELEASE_CODE && \ + ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,7) && RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0)) || \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2))))) +#define netdev_rss_key_fill(buffer, len) __kc_netdev_rss_key_fill(buffer, len) +#endif /* RHEL_RELEASE_CODE */ +void __kc_netdev_rss_key_fill(void *buffer, size_t len); +#define SPEED_20000 20000 +#define SPEED_40000 40000 +#ifndef dma_rmb +#define dma_rmb() rmb() +#endif +#ifndef dev_alloc_pages +#ifndef NUMA_NO_NODE +#define NUMA_NO_NODE -1 +#endif +#define dev_alloc_pages(_order) alloc_pages_node(NUMA_NO_NODE, (GFP_ATOMIC | __GFP_COLD | __GFP_COMP | __GFP_MEMALLOC), (_order)) +#endif +#ifndef 
dev_alloc_page +#define dev_alloc_page() dev_alloc_pages(0) +#endif +#if !defined(eth_skb_pad) && !defined(skb_put_padto) +/** + * __kc_skb_put_padto - increase size and pad an skbuff up to a minimal size + * @skb: buffer to pad + * @len: minimal length + * + * Pads up a buffer to ensure the trailing bytes exist and are + * blanked. If the buffer already contains sufficient data it + * is untouched. Otherwise it is extended. Returns zero on + * success. The skb is freed on error. + */ +static inline int __kc_skb_put_padto(struct sk_buff *skb, unsigned int len) +{ + unsigned int size = skb->len; + + if (unlikely(size < len)) { + len -= size; + if (skb_pad(skb, len)) + return -ENOMEM; + __skb_put(skb, len); + } + return 0; +} +#define skb_put_padto(skb, len) __kc_skb_put_padto(skb, len) + +static inline int __kc_eth_skb_pad(struct sk_buff *skb) +{ + return __kc_skb_put_padto(skb, ETH_ZLEN); +} +#define eth_skb_pad(skb) __kc_eth_skb_pad(skb) +#endif /* eth_skb_pad && skb_put_padto */ + +#ifndef SKB_ALLOC_NAPI +/* RHEL 7.2 backported napi_alloc_skb and friends */ +static inline struct sk_buff *__kc_napi_alloc_skb(struct napi_struct *napi, unsigned int length) +{ + return netdev_alloc_skb_ip_align(napi->dev, length); +} +#define napi_alloc_skb(napi,len) __kc_napi_alloc_skb(napi,len) +#define __napi_alloc_skb(napi,len,mask) __kc_napi_alloc_skb(napi,len) +#endif /* SKB_ALLOC_NAPI */ +#define HAVE_CONFIG_PM_RUNTIME +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6,7)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) +#define HAVE_RXFH_HASHFUNC +#endif /* 6.7 < RHEL < 7.0 */ +#if RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,1)) +#define HAVE_RXFH_HASHFUNC +#define NDO_DFLT_BRIDGE_GETLINK_HAS_BRFLAGS +#endif /* RHEL > 7.1 */ +#ifndef napi_schedule_irqoff +#define napi_schedule_irqoff napi_schedule +#endif +#ifndef READ_ONCE +#define READ_ONCE(_x) ACCESS_ONCE(_x) +#endif +#if RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2)) +#define HAVE_NDO_FDB_ADD_VID +#endif +#ifndef ETH_MODULE_SFF_8636 +#define ETH_MODULE_SFF_8636 0x3 +#endif +#ifndef ETH_MODULE_SFF_8636_LEN +#define ETH_MODULE_SFF_8636_LEN 256 +#endif +#ifndef ETH_MODULE_SFF_8436 +#define ETH_MODULE_SFF_8436 0x4 +#endif +#ifndef ETH_MODULE_SFF_8436_LEN +#define ETH_MODULE_SFF_8436_LEN 256 +#endif +#ifndef writel_relaxed +#define writel_relaxed writel +#endif +#else /* 3.19.0 */ +#define HAVE_NDO_FDB_ADD_VID +#define HAVE_RXFH_HASHFUNC +#define NDO_DFLT_BRIDGE_GETLINK_HAS_BRFLAGS +#endif /* 3.19.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,20,0) ) +/* vlan_tx_xx functions got renamed to skb_vlan */ +#ifndef skb_vlan_tag_get +#define skb_vlan_tag_get vlan_tx_tag_get +#endif +#ifndef skb_vlan_tag_present +#define skb_vlan_tag_present vlan_tx_tag_present +#endif +#if RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,1)) +#define HAVE_INCLUDE_LINUX_TIMECOUNTER_H +#endif +#if RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2)) +#define HAVE_NDO_BRIDGE_SET_DEL_LINK_FLAGS +#endif +#else +#define HAVE_INCLUDE_LINUX_TIMECOUNTER_H +#define HAVE_NDO_BRIDGE_SET_DEL_LINK_FLAGS +#endif /* 3.20.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0) ) +/* Definition for CONFIG_OF was introduced earlier */ +#if !defined(CONFIG_OF) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE > 
RHEL_RELEASE_VERSION(7,2)) +static inline struct device_node * +pci_device_to_OF_node(const struct pci_dev __always_unused *pdev) { return NULL; } +#else /* !CONFIG_OF && RHEL < 7.3 */ +#define HAVE_DDP_PROFILE_UPLOAD_SUPPORT +#endif /* !CONFIG_OF && RHEL < 7.3 */ +#else /* < 4.0 */ +#define HAVE_DDP_PROFILE_UPLOAD_SUPPORT +#endif /* < 4.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0) ) +#ifdef HAVE_INCLUDE_LINUX_TIMECOUNTER_H +#include +#else +#include +#endif +static inline void __kc_timecounter_adjtime(struct timecounter *tc, s64 delta) +{ + tc->nsec += delta; +} + +static inline struct net_device * +of_find_net_device_by_node(struct device_node __always_unused *np) +{ + return NULL; +} + +#define timecounter_adjtime __kc_timecounter_adjtime +#if ((RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2))) || \ + (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,2,0)))) +#define HAVE_NDO_SET_VF_RSS_QUERY_EN +#endif +#if RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2)) +#define HAVE_NDO_BRIDGE_GETLINK_NLFLAGS +#define HAVE_RHEL7_EXTENDED_NDO_SET_TX_MAXRATE +#define HAVE_NDO_SET_TX_MAXRATE +#endif +#if !((RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6,8) && RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0)) && \ + (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2)) && \ + (SLE_VERSION_CODE > SLE_VERSION(12,1,0))) +unsigned int _kc_cpumask_local_spread(unsigned int i, int node); +#define cpumask_local_spread _kc_cpumask_local_spread +#endif +#ifdef HAVE_RHASHTABLE +#define rhashtable_loopup_fast(ht, key, params) \ + do { \ + (void)params; \ + rhashtable_lookup((ht), (key)); \ + } while (0) + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) ) +#define rhashtable_insert_fast(ht, obj, params) \ + do { \ + (void)params; \ + rhashtable_insert((ht), (obj), GFP_KERNEL); \ + } while (0) + +#define rhashtable_remove_fast(ht, obj, params) \ + do { \ + (void)params; \ + rhashtable_remove((ht), (obj), GFP_KERNEL); \ + } while (0) + +#else /* >= 3,19,0 */ +#define rhashtable_insert_fast(ht, obj, params) \ + do { \ + (void)params; \ + rhashtable_insert((ht), (obj)); \ + } while (0) + +#define rhashtable_remove_fast(ht, obj, params) \ + do { \ + (void)params; \ + rhashtable_remove((ht), (obj)); \ + } while (0) + +#endif /* 3,19,0 */ +#endif /* HAVE_RHASHTABLE */ +#else /* >= 4,1,0 */ +#define HAVE_NDO_GET_PHYS_PORT_NAME +#define HAVE_PTP_CLOCK_INFO_GETTIME64 +#define HAVE_NDO_BRIDGE_GETLINK_NLFLAGS +#define HAVE_PASSTHRU_FEATURES_CHECK +#define HAVE_NDO_SET_VF_RSS_QUERY_EN +#define HAVE_NDO_SET_TX_MAXRATE +#endif /* 4,1,0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,1,9)) +#if (!(RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2)) && \ + !((SLE_VERSION_CODE == SLE_VERSION(11,3,0)) && \ + (SLE_LOCALVERSION_CODE >= SLE_LOCALVERSION(0,47,71))) && \ + !((SLE_VERSION_CODE == SLE_VERSION(11,4,0)) && \ + (SLE_LOCALVERSION_CODE >= SLE_LOCALVERSION(65,0,0))) && \ + !(SLE_VERSION_CODE >= SLE_VERSION(12,1,0))) +static inline bool page_is_pfmemalloc(struct page __maybe_unused *page) +{ +#ifdef HAVE_STRUCT_PAGE_PFMEMALLOC + return page->pfmemalloc; +#else + return false; +#endif +} +#endif /* !RHEL7.2+ && !SLES11sp3(3.0.101-0.47.71+ update) && !SLES11sp4(3.0.101-65+ update) & !SLES12sp1+ */ +#else +#undef HAVE_STRUCT_PAGE_PFMEMALLOC +#endif /* 4.1.9 */ + 
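+/* Illustrative use of the 4.1-era backports above (hypothetical snippet, not
+ * part of the original driver): cpumask_local_spread() returns the i-th CPU,
+ * preferring CPUs on the given NUMA node, which is handy for affinity hints:
+ *
+ *	cpu = cpumask_local_spread(q_idx, dev_to_node(dev));
+ *	irq_set_affinity_hint(irq, get_cpu_mask(cpu));
+ */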
+/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0)) +#if (!(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) && \ + !(SLE_VERSION_CODE >= SLE_VERSION(12,1,0))) +#define ETHTOOL_RX_FLOW_SPEC_RING 0x00000000FFFFFFFFULL +#define ETHTOOL_RX_FLOW_SPEC_RING_VF 0x000000FF00000000ULL +#define ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF 32 +static inline __u64 ethtool_get_flow_spec_ring(__u64 ring_cookie) +{ + return ETHTOOL_RX_FLOW_SPEC_RING & ring_cookie; +}; + +static inline __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie) +{ + return (ETHTOOL_RX_FLOW_SPEC_RING_VF & ring_cookie) >> + ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF; +}; +#endif /* ! RHEL >= 7.2 && ! SLES >= 12.1 */ +#if (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) +#define HAVE_NDO_DFLT_BRIDGE_GETLINK_VLAN_SUPPORT +#endif + +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,27)) +#if (!((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,8) && \ + RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0)) || \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2))) +static inline bool pci_ari_enabled(struct pci_bus *bus) +{ + return bus->self && bus->self->ari_enabled; +} +#if (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) +#define HAVE_VF_STATS +#endif /* (RHEL7.2+) */ +#endif /* !(RHEL6.8+ || RHEL7.2+) */ +#else +static inline bool pci_ari_enabled(struct pci_bus *bus) +{ + return false; +} +#endif /* 2.6.27 */ +#else +#define HAVE_NDO_DFLT_BRIDGE_GETLINK_VLAN_SUPPORT +#define HAVE_VF_STATS +#endif /* 4.2.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,3,0)) +#if (!(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) && \ + !(SLE_VERSION_CODE >= SLE_VERSION(12,2,0))) +/** + * _kc_flow_dissector_key_ipv4_addrs: + * @src: source ip address + * @dst: destination ip address + */ +struct _kc_flow_dissector_key_ipv4_addrs { + __be32 src; + __be32 dst; +}; + +/** + * _kc_flow_dissector_key_ipv6_addrs: + * @src: source ip address + * @dst: destination ip address + */ +struct _kc_flow_dissector_key_ipv6_addrs { + struct in6_addr src; + struct in6_addr dst; +}; + +/** + * _kc_flow_dissector_key_addrs: + * @v4addrs: IPv4 addresses + * @v6addrs: IPv6 addresses + */ +struct _kc_flow_dissector_key_addrs { + union { + struct _kc_flow_dissector_key_ipv4_addrs v4addrs; + struct _kc_flow_dissector_key_ipv6_addrs v6addrs; + }; +}; + +/** + * _kc_flow_dissector_key_tp_ports: + * @ports: port numbers of Transport header + * src: source port number + * dst: destination port number + */ +struct _kc_flow_dissector_key_ports { + union { + __be32 ports; + struct { + __be16 src; + __be16 dst; + }; + }; +}; + +/** + * _kc_flow_dissector_key_basic: + * @n_proto: Network header protocol (eg. IPv4/IPv6) + * @ip_proto: Transport header protocol (eg. TCP/UDP) + * @padding: padding for alignment + */ +struct _kc_flow_dissector_key_basic { + __be16 n_proto; + u8 ip_proto; + u8 padding; +}; + +struct _kc_flow_keys { + struct _kc_flow_dissector_key_basic basic; + struct _kc_flow_dissector_key_ports ports; + struct _kc_flow_dissector_key_addrs addrs; +}; + +/* These are all the include files for kernels inside this #ifdef block that + * have any reference to the in kernel definition of struct flow_keys. The + * reason for putting them here is to make 100% sure that these files do not get + * included after re-defining flow_keys to _kc_flow_keys. 
This is done to + * prevent any possible ABI issues that this structure re-definition could case. + */ +#if ((LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0)) || \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,7) || \ + SLE_VERSION_CODE >= SLE_VERSION(11,4,0)) +#include +#endif /* (>= 3.3.0 && < 4.2.0) || >= RHEL 6.7 || >= SLE 11.4 */ +#if (LINUX_VERSION_CODE == KERNEL_VERSION(4,2,0)) +#include +#endif /* 4.2.0 */ +#include +#include +#include +#include + +#define flow_keys _kc_flow_keys +bool +_kc_skb_flow_dissect_flow_keys(const struct sk_buff *skb, + struct flow_keys *flow, + unsigned int __always_unused flags); +#define skb_flow_dissect_flow_keys _kc_skb_flow_dissect_flow_keys +#endif /* ! >= RHEL 7.4 && ! >= SLES 12.2 */ + +#if ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3)) || \ + (SLE_VERSION_CODE >= SLE_VERSION(12,2,0))) +#include +#endif /* >= RHEL7.3 || >= SLE12sp2 */ +#else /* >= 4.3.0 */ +#include +#endif /* 4.3.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,4,0)) +#if (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3)) +#define HAVE_NDO_SET_VF_TRUST +#endif /* (RHEL_RELEASE >= 7.3) */ +#ifndef CONFIG_64BIT +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0)) +#include /* 32-bit readq/writeq */ +#else /* 3.3.0 => 4.3.x */ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)) +#include +#endif /* 2.6.26 => 3.3.0 */ +#ifndef readq +static inline __u64 readq(const volatile void __iomem *addr) +{ + const volatile u32 __iomem *p = addr; + u32 low, high; + + low = readl(p); + high = readl(p + 1); + + return low + ((u64)high << 32); +} +#define readq readq +#endif + +#ifndef writeq +static inline void writeq(__u64 val, volatile void __iomem *addr) +{ + writel(val, addr); + writel(val >> 32, (u8 *)addr + 4); +} +#define writeq writeq +#endif +#endif /* < 3.3.0 */ +#endif /* !CONFIG_64BIT */ +#else /* < 4.4.0 */ +#define HAVE_NDO_SET_VF_TRUST + +#ifndef CONFIG_64BIT +#include /* 32-bit readq/writeq */ +#endif /* !CONFIG_64BIT */ +#endif /* 4.4.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0)) +/* protect against a likely backport */ +#ifndef NETIF_F_CSUM_MASK +#define NETIF_F_CSUM_MASK NETIF_F_ALL_CSUM +#endif /* NETIF_F_CSUM_MASK */ +#ifndef NETIF_F_SCTP_CRC +#define NETIF_F_SCTP_CRC NETIF_F_SCTP_CSUM +#endif /* NETIF_F_SCTP_CRC */ +#if (!(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3))) +#define eth_platform_get_mac_address _kc_eth_platform_get_mac_address +int _kc_eth_platform_get_mac_address(struct device *dev __maybe_unused, + u8 *mac_addr __maybe_unused); +#endif /* !(RHEL_RELEASE >= 7.3) */ +#else /* 4.5.0 */ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) ) +#define HAVE_GENEVE_RX_OFFLOAD +#if !defined(HAVE_UDP_ENC_TUNNEL) && IS_ENABLED(CONFIG_GENEVE) +#define HAVE_UDP_ENC_TUNNEL +#endif +#endif /* < 4.8.0 */ +#define HAVE_NETIF_NAPI_ADD_CALLS_NAPI_HASH_ADD +#define HAVE_NETDEV_UPPER_INFO +#endif /* 4.5.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0)) +#if !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,3)) +static inline unsigned char *skb_checksum_start(const struct sk_buff *skb) +{ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)) + return skb->head + skb->csum_start; +#else /* < 2.6.22 */ + return skb_transport_header(skb); +#endif +} 
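+/* Backport note: skb_checksum_start() returns the first byte covered by a
+ * CHECKSUM_PARTIAL checksum (skb->head + skb->csum_start); on pre-2.6.22
+ * kernels, which lack csum_start, the transport header is used instead.
+ */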
+#endif + +#if !(UBUNTU_VERSION_CODE && \ + UBUNTU_VERSION_CODE >= UBUNTU_VERSION(4,4,0,21)) && \ + !(RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2))) && \ + !(SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,3,0))) +static inline void napi_consume_skb(struct sk_buff *skb, + int __always_unused budget) +{ + dev_consume_skb_any(skb); +} + +#endif /* UBUNTU 4,4,0,21, RHEL 7.2, SLES12 SP3 */ +#if !(SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,3,0))) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) +static inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) +{ + * sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); +} +#endif +#if !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2))) && \ + !(SLE_VERSION_CODE && (SLE_VERSION_CODE > SLE_VERSION(12,3,0))) +static inline void page_ref_inc(struct page *page) +{ + get_page(page); +} +#else +#define HAVE_PAGE_COUNT_BULK_UPDATE +#endif +#ifndef IPV4_USER_FLOW +#define IPV4_USER_FLOW 0x0d /* spec only (usr_ip4_spec) */ +#endif + +#if (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) +#define HAVE_TC_SETUP_CLSFLOWER +#define HAVE_TC_FLOWER_ENC +#endif + +#if ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,7)) || \ + (SLE_VERSION_CODE >= SLE_VERSION(12,2,0))) +#define HAVE_TC_SETUP_CLSU32 +#endif + +#if (SLE_VERSION_CODE >= SLE_VERSION(12,2,0)) +#define HAVE_TC_SETUP_CLSFLOWER +#endif + +#else /* >= 4.6.0 */ +#define HAVE_PAGE_COUNT_BULK_UPDATE +#define HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC +#define HAVE_PTP_CROSSTIMESTAMP +#define HAVE_TC_SETUP_CLSFLOWER +#define HAVE_TC_SETUP_CLSU32 +#endif /* 4.6.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0)) +#if ((SLE_VERSION_CODE >= SLE_VERSION(12,3,0)) ||\ + (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4))) +#define HAVE_NETIF_TRANS_UPDATE +#endif /* SLES12sp3+ || RHEL7.4+ */ +#if ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3)) ||\ + (SLE_VERSION_CODE >= SLE_VERSION(12,3,0))) +#define HAVE_ETHTOOL_25G_BITS +#define HAVE_ETHTOOL_50G_BITS +#define HAVE_ETHTOOL_100G_BITS +#endif /* RHEL7.3+ || SLES12sp3+ */ +#else /* 4.7.0 */ +#define HAVE_NETIF_TRANS_UPDATE +#define HAVE_ETHTOOL_CONVERT_U32_AND_LINK_MODE +#define HAVE_ETHTOOL_25G_BITS +#define HAVE_ETHTOOL_50G_BITS +#define HAVE_ETHTOOL_100G_BITS +#define HAVE_TCF_MIRRED_REDIRECT +#endif /* 4.7.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0)) +#if !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) +enum udp_parsable_tunnel_type { + UDP_TUNNEL_TYPE_VXLAN, + UDP_TUNNEL_TYPE_GENEVE, +}; +struct udp_tunnel_info { + unsigned short type; + sa_family_t sa_family; + __be16 port; +}; +#endif + +#if (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE < UBUNTU_VERSION(4,8,0,0)) +#define tc_no_actions(_exts) true +#define tc_for_each_action(_a, _exts) while (0) +#endif +#if !(SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,3,0))) &&\ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) +static inline int +#ifdef HAVE_NON_CONST_PCI_DRIVER_NAME +pci_request_io_regions(struct pci_dev *pdev, char *name) +#else +pci_request_io_regions(struct pci_dev *pdev, const char *name) +#endif +{ + return pci_request_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_IO), name); +} + +static inline void +pci_release_io_regions(struct pci_dev *pdev) +{ + 
return pci_release_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_IO)); +} + +static inline int +#ifdef HAVE_NON_CONST_PCI_DRIVER_NAME +pci_request_mem_regions(struct pci_dev *pdev, char *name) +#else +pci_request_mem_regions(struct pci_dev *pdev, const char *name) +#endif +{ + return pci_request_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_MEM), name); +} + +static inline void +pci_release_mem_regions(struct pci_dev *pdev) +{ + return pci_release_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_MEM)); +} +#endif /* !SLE_VERSION(12,3,0) */ +#if ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) ||\ + (SLE_VERSION_CODE >= SLE_VERSION(12,3,0))) +#define HAVE_ETHTOOL_NEW_50G_BITS +#endif /* RHEL7.4+ || SLES12sp3+ */ +#else +#define HAVE_UDP_ENC_RX_OFFLOAD +#define HAVE_ETHTOOL_NEW_50G_BITS +#endif /* 4.8.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,9,0)) +#ifdef HAVE_TC_SETUP_CLSFLOWER +#if (!(RHEL_RELEASE_CODE) && !(SLE_VERSION_CODE) || \ + (SLE_VERSION_CODE && (SLE_VERSION_CODE < SLE_VERSION(12,3,0)))) +#define HAVE_TC_FLOWER_VLAN_IN_TAGS +#endif /* !RHEL_RELEASE_CODE && !SLE_VERSION_CODE || = RHEL_RELEASE_VERSION(7,4)) +#define HAVE_ETHTOOL_NEW_1G_BITS +#define HAVE_ETHTOOL_NEW_10G_BITS +#endif /* RHEL7.4+ */ +#if (!(SLE_VERSION_CODE) && !(RHEL_RELEASE_CODE)) || \ + SLE_VERSION_CODE && (SLE_VERSION_CODE <= SLE_VERSION(12,3,0)) || \ + RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7,5)) +#define time_is_before_jiffies64(a) time_after64(get_jiffies_64(), a) +#endif /* !SLE_VERSION_CODE && !RHEL_RELEASE_CODE || (SLES <= 12.3.0) || (RHEL <= 7.5) */ +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,4)) +static inline void bitmap_from_u64(unsigned long *dst, u64 mask) +{ + dst[0] = mask & ULONG_MAX; + + if (sizeof(mask) > sizeof(unsigned long)) + dst[1] = mask >> 32; +} +#endif /* = RHEL_RELEASE_VERSION(7,4)) && \ + !(SLE_VERSION_CODE >= SLE_VERSION(12,3,0)) && \ + !(UBUNTU_VERSION_CODE >= UBUNTU_VERSION(4,13,0,16))) +static inline bool eth_type_vlan(__be16 ethertype) +{ + switch (ethertype) { + case htons(ETH_P_8021Q): +#ifdef ETH_P_8021AD + case htons(ETH_P_8021AD): +#endif + return true; + default: + return false; + } +} +#endif /* Linux < 4.9 || RHEL < 7.4 || SLES < 12.3 || Ubuntu < 4.3.0-16 */ +#else /* >=4.9 */ +#define HAVE_FLOW_DISSECTOR_KEY_VLAN_PRIO +#define HAVE_ETHTOOL_NEW_1G_BITS +#define HAVE_ETHTOOL_NEW_10G_BITS +#endif /* KERNEL_VERSION(4.9.0) */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,10,0)) +/* SLES 12.3 and RHEL 7.5 backported this interface */ +#if (!SLE_VERSION_CODE && !RHEL_RELEASE_CODE) || \ + (SLE_VERSION_CODE && (SLE_VERSION_CODE < SLE_VERSION(12,3,0))) || \ + (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,5))) +static inline bool _kc_napi_complete_done2(struct napi_struct *napi, + int __always_unused work_done) +{ + /* it was really hard to get napi_complete_done to be safe to call + * recursively without running into our own kcompat, so just use + * napi_complete + */ + napi_complete(napi); + + /* true means that the stack is telling the driver to go-ahead and + * re-enable interrupts + */ + return true; +} + +#ifdef napi_complete_done +#undef napi_complete_done +#endif +#define napi_complete_done _kc_napi_complete_done2 +#endif /* sles and rhel exclusion for < 4.10 */ +#if (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) +#define 
HAVE_DEV_WALK_API +#define HAVE_ETHTOOL_NEW_2500MB_BITS +#define HAVE_ETHTOOL_5G_BITS +#endif /* RHEL7.4+ */ +#if (SLE_VERSION_CODE && (SLE_VERSION_CODE == SLE_VERSION(12,3,0))) +#define HAVE_STRUCT_DMA_ATTRS +#endif /* (SLES == 12.3.0) */ +#if (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,3,0))) +#define HAVE_NETDEVICE_MIN_MAX_MTU +#endif /* (SLES >= 12.3.0) */ +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5))) +#define HAVE_STRUCT_DMA_ATTRS +#define HAVE_RHEL7_EXTENDED_MIN_MAX_MTU +#define HAVE_NETDEVICE_MIN_MAX_MTU +#endif +#if (!(SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,3,0))) && \ + !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5)))) +#ifndef dma_map_page_attrs +#define dma_map_page_attrs __kc_dma_map_page_attrs +static inline dma_addr_t __kc_dma_map_page_attrs(struct device *dev, + struct page *page, + size_t offset, size_t size, + enum dma_data_direction dir, + unsigned long __always_unused attrs) +{ + return dma_map_page(dev, page, offset, size, dir); +} +#endif + +#ifndef dma_unmap_page_attrs +#define dma_unmap_page_attrs __kc_dma_unmap_page_attrs +static inline void __kc_dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + unsigned long __always_unused attrs) +{ + dma_unmap_page(dev, addr, size, dir); +} +#endif + +static inline void __page_frag_cache_drain(struct page *page, + unsigned int count) +{ +#ifdef HAVE_PAGE_COUNT_BULK_UPDATE + if (!page_ref_sub_and_test(page, count)) + return; + + init_page_count(page); +#else + BUG_ON(count > 1); + if (!count) + return; +#endif + __free_pages(page, compound_order(page)); +} +#endif /* !SLE_VERSION(12,3,0) && !RHEL_VERSION(7,5) */ +#if ((SLE_VERSION_CODE && (SLE_VERSION_CODE > SLE_VERSION(12,3,0))) ||\ + (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5))) +#define HAVE_SWIOTLB_SKIP_CPU_SYNC +#endif + +#if ((SLE_VERSION_CODE && (SLE_VERSION_CODE < SLE_VERSION(15,0,0))) ||\ + (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7,4)))) +#define page_frag_free __free_page_frag +#endif +#ifndef ETH_MIN_MTU +#define ETH_MIN_MTU 68 +#endif /* ETH_MIN_MTU */ + +/* If kernel is older than 4.10 but distro is RHEL >= 7.5 || SLES > 12SP4, + * it does have support for NAPI_STATE + */ +#if ((RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5))) ||\ + (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,4,0)))) +#define HAVE_NAPI_STATE_IN_BUSY_POLL +#endif /* RHEL >= 7.5 || SLES >=12.4 */ +#else /* >= 4.10 */ +#define HAVE_TC_FLOWER_ENC +#define HAVE_NETDEVICE_MIN_MAX_MTU +#define HAVE_SWIOTLB_SKIP_CPU_SYNC +#define HAVE_NETDEV_TC_RESETS_XPS +#define HAVE_XPS_QOS_SUPPORT +#define HAVE_DEV_WALK_API +#define HAVE_ETHTOOL_NEW_2500MB_BITS +#define HAVE_ETHTOOL_5G_BITS +/* kernel 4.10 onwards, as part of busy_poll rewrite, new state were added + * which is part of NAPI:state. 
If NAPI:state=NAPI_STATE_IN_BUSY_POLL, + * it means napi_poll is invoked in busy_poll context + */ +#define HAVE_NAPI_STATE_IN_BUSY_POLL +#define HAVE_TCF_MIRRED_EGRESS_REDIRECT +#define HAVE_PTP_CLOCK_INFO_ADJFINE +#endif /* 4.10.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0)) +#ifdef CONFIG_NET_RX_BUSY_POLL +#define HAVE_NDO_BUSY_POLL +#endif /* CONFIG_NET_RX_BUSY_POLL */ +#if ((SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,3,0))) || \ + (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5)))) +#define HAVE_VOID_NDO_GET_STATS64 +#endif /* (SLES >= 12.3.0) && (RHEL >= 7.5) */ + +static inline void _kc_dev_kfree_skb_irq(struct sk_buff *skb) +{ + if (!skb) + return; + dev_kfree_skb_irq(skb); +} + +#undef dev_kfree_skb_irq +#define dev_kfree_skb_irq _kc_dev_kfree_skb_irq + +static inline void _kc_dev_consume_skb_irq(struct sk_buff *skb) +{ + if (!skb) + return; + dev_consume_skb_irq(skb); +} + +#undef dev_consume_skb_irq +#define dev_consume_skb_irq _kc_dev_consume_skb_irq + +static inline void _kc_dev_kfree_skb_any(struct sk_buff *skb) +{ + if (!skb) + return; + dev_kfree_skb_any(skb); +} + +#undef dev_kfree_skb_any +#define dev_kfree_skb_any _kc_dev_kfree_skb_any + +static inline void _kc_dev_consume_skb_any(struct sk_buff *skb) +{ + if (!skb) + return; + dev_consume_skb_any(skb); +} + +#undef dev_consume_skb_any +#define dev_consume_skb_any _kc_dev_consume_skb_any + +#else /* > 4.11 */ +#define HAVE_VOID_NDO_GET_STATS64 +#define HAVE_VM_OPS_FAULT_NO_VMA +#endif /* 4.11.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,12,0)) +#if (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,7) && \ + RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,0)) +/* The RHEL 7.7+ NL_SET_ERR_MSG_MOD triggers unused parameter warnings */ +#undef NL_SET_ERR_MSG_MOD +#endif +/* If kernel is older than 4.12 but distro is RHEL >= 7.5 || SLES > 12SP4, + * it does have support for MIN_NAPI_ID + */ +#if ((RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5))) || \ + (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,4,0)))) +#define HAVE_MIN_NAPI_ID +#endif /* RHEL >= 7.5 || SLES >= 12.4 */ +#ifndef NL_SET_ERR_MSG_MOD +#define NL_SET_ERR_MSG_MOD(extack, msg) \ + do { \ + uninitialized_var(extack); \ + pr_err(KBUILD_MODNAME ": " msg); \ + } while (0) +#endif /* !NL_SET_ERR_MSG_MOD */ +#else /* >= 4.12 */ +#define HAVE_NAPI_BUSY_LOOP +#define HAVE_MIN_NAPI_ID +#endif /* 4.12 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,13,0)) +#if ((SLE_VERSION_CODE && (SLE_VERSION_CODE > SLE_VERSION(12,3,0))) || \ + (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5))) +#define HAVE_TCF_EXTS_HAS_ACTION +#endif +#define PCI_EXP_LNKCAP_SLS_8_0GB 0x00000003 /* LNKCAP2 SLS Vector bit 2 */ +#if (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,4,0))) +#define HAVE_PCI_ERROR_HANDLER_RESET_PREPARE +#endif /* SLES >= 12sp4 */ +#if (!(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5)) && \ + !(SLE_VERSION_CODE >= SLE_VERSION(12,4,0))) +#define UUID_SIZE 16 +typedef struct { + __u8 b[UUID_SIZE]; +} uuid_t; +#define UUID_INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ +((uuid_t) \ +{{ ((a) >> 24) & 0xff, ((a) >> 16) & 0xff, ((a) >> 8) & 0xff, (a) & 0xff, \ + ((b) >> 8) & 0xff, (b) & 0xff, \ + ((c) >> 8) & 0xff, (c) & 0xff, \ + (d0), 
(d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) + +static inline bool uuid_equal(const uuid_t *u1, const uuid_t *u2) +{ + return memcmp(u1, u2, sizeof(uuid_t)) == 0; +} +#else +#define HAVE_METADATA_PORT_INFO +#endif /* !(RHEL >= 7.5) && !(SLES >= 12.4) */ +#else /* > 4.13 */ +#define HAVE_METADATA_PORT_INFO +#define HAVE_HWTSTAMP_FILTER_NTP_ALL +#define HAVE_NDO_SETUP_TC_CHAIN_INDEX +#define HAVE_PCI_ERROR_HANDLER_RESET_PREPARE +#define HAVE_PTP_CLOCK_DO_AUX_WORK +#endif /* 4.13.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0)) +#ifdef ETHTOOL_GLINKSETTINGS +#ifndef ethtool_link_ksettings_del_link_mode +#define ethtool_link_ksettings_del_link_mode(ptr, name, mode) \ + __clear_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name) +#endif +#endif /* ETHTOOL_GLINKSETTINGS */ +#if (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,4,0))) +#define HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV +#endif + +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5))) +#define HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV +#define HAVE_RHEL7_NETDEV_OPS_EXT_NDO_SETUP_TC +#endif + +#define TIMER_DATA_TYPE unsigned long +#define TIMER_FUNC_TYPE void (*)(TIMER_DATA_TYPE) + +#define timer_setup(timer, callback, flags) \ + __setup_timer((timer), (TIMER_FUNC_TYPE)(callback), \ + (TIMER_DATA_TYPE)(timer), (flags)) + +#define from_timer(var, callback_timer, timer_fieldname) \ + container_of(callback_timer, typeof(*var), timer_fieldname) + +#ifndef xdp_do_flush_map +#define xdp_do_flush_map() do {} while (0) +#endif +struct _kc_xdp_buff { + void *data; + void *data_end; + void *data_hard_start; +}; +#define xdp_buff _kc_xdp_buff +struct _kc_bpf_prog { +}; +#define bpf_prog _kc_bpf_prog +#ifndef DIV_ROUND_DOWN_ULL +#define DIV_ROUND_DOWN_ULL(ll, d) \ + ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; }) +#endif /* DIV_ROUND_DOWN_ULL */ +#else /* > 4.14 */ +#define HAVE_XDP_SUPPORT +#define HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV +#define HAVE_TCF_EXTS_HAS_ACTION +#endif /* 4.14.0 */ + +/*****************************************************************************/ +#ifndef ETHTOOL_GLINKSETTINGS + +#define __ETHTOOL_LINK_MODE_MASK_NBITS 32 +#define ETHTOOL_LINK_MASK_SIZE BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS) + +/** + * struct ethtool_link_ksettings + * @link_modes: supported and advertising, single item arrays + * @link_modes.supported: bitmask of supported link speeds + * @link_modes.advertising: bitmask of currently advertised speeds + * @base: base link details + * @base.speed: current link speed + * @base.port: current port type + * @base.duplex: current duplex mode + * @base.autoneg: current autonegotiation settings + * + * This struct and the following macros provide a way to support the old + * ethtool get/set_settings API on older kernels, but in the style of the new + * GLINKSETTINGS API. In this way, the same code can be used to support both + * APIs as seemlessly as possible. + * + * It should be noted the old API only has support up to the first 32 bits. 
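+ *
+ * Illustrative use of the compat macros below (hypothetical driver code, not
+ * part of this backport); only modes that exist in the legacy SUPPORTED_xxx
+ * and ADVERTISED_xxx bit space can be set here:
+ *
+ *	ethtool_link_ksettings_zero_link_mode(ks, supported);
+ *	ethtool_link_ksettings_add_link_mode(ks, supported, 10000baseT_Full);
+ *	ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg);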
+ */ +struct ethtool_link_ksettings { + struct { + u32 speed; + u8 port; + u8 duplex; + u8 autoneg; + } base; + struct { + unsigned long supported[ETHTOOL_LINK_MASK_SIZE]; + unsigned long advertising[ETHTOOL_LINK_MASK_SIZE]; + } link_modes; +}; + +#define ETHTOOL_LINK_NAME_advertising(mode) ADVERTISED_ ## mode +#define ETHTOOL_LINK_NAME_supported(mode) SUPPORTED_ ## mode +#define ETHTOOL_LINK_NAME(name) ETHTOOL_LINK_NAME_ ## name +#define ETHTOOL_LINK_CONVERT(name, mode) ETHTOOL_LINK_NAME(name)(mode) + +/** + * ethtool_link_ksettings_zero_link_mode + * @ptr: ptr to ksettings struct + * @name: supported or advertising + */ +#define ethtool_link_ksettings_zero_link_mode(ptr, name)\ + (*((ptr)->link_modes.name) = 0x0) + +/** + * ethtool_link_ksettings_add_link_mode + * @ptr: ptr to ksettings struct + * @name: supported or advertising + * @mode: link mode to add + */ +#define ethtool_link_ksettings_add_link_mode(ptr, name, mode)\ + (*((ptr)->link_modes.name) |= (typeof(*((ptr)->link_modes.name)))ETHTOOL_LINK_CONVERT(name, mode)) + +/** + * ethtool_link_ksettings_del_link_mode + * @ptr: ptr to ksettings struct + * @name: supported or advertising + * @mode: link mode to delete + */ +#define ethtool_link_ksettings_del_link_mode(ptr, name, mode)\ + (*((ptr)->link_modes.name) &= ~(typeof(*((ptr)->link_modes.name)))ETHTOOL_LINK_CONVERT(name, mode)) + +/** + * ethtool_link_ksettings_test_link_mode + * @ptr: ptr to ksettings struct + * @name: supported or advertising + * @mode: link mode to add + */ +#define ethtool_link_ksettings_test_link_mode(ptr, name, mode)\ + (!!(*((ptr)->link_modes.name) & ETHTOOL_LINK_CONVERT(name, mode))) + +/** + * _kc_ethtool_ksettings_to_cmd - Convert ethtool_link_ksettings to ethtool_cmd + * @ks: ethtool_link_ksettings struct + * @cmd: ethtool_cmd struct + * + * Convert an ethtool_link_ksettings structure into the older ethtool_cmd + * structure. We provide this in kcompat.h so that drivers can easily + * implement the older .{get|set}_settings as wrappers around the new api. + * Hence, we keep it prefixed with _kc_ to make it clear this isn't actually + * a real function in the kernel. 
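+ *
+ * Illustrative wrapper (hypothetical driver code, not part of this file),
+ * assuming a foo_get_link_ksettings() helper that fills @ks:
+ *
+ *	static int foo_get_settings(struct net_device *netdev,
+ *				    struct ethtool_cmd *cmd)
+ *	{
+ *		struct ethtool_link_ksettings ks;
+ *
+ *		foo_get_link_ksettings(netdev, &ks);
+ *		_kc_ethtool_ksettings_to_cmd(&ks, cmd);
+ *		return 0;
+ *	}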
+ */ +static inline void +_kc_ethtool_ksettings_to_cmd(struct ethtool_link_ksettings *ks, + struct ethtool_cmd *cmd) +{ + cmd->supported = (u32)ks->link_modes.supported[0]; + cmd->advertising = (u32)ks->link_modes.advertising[0]; + ethtool_cmd_speed_set(cmd, ks->base.speed); + cmd->duplex = ks->base.duplex; + cmd->autoneg = ks->base.autoneg; + cmd->port = ks->base.port; +} + +#endif /* !ETHTOOL_GLINKSETTINGS */ + +/*****************************************************************************/ +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0)) || \ + (SLE_VERSION_CODE && (SLE_VERSION_CODE <= SLE_VERSION(12,3,0))) || \ + (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7,5)))) +#define phy_speed_to_str _kc_phy_speed_to_str +const char *_kc_phy_speed_to_str(int speed); +#else /* (LINUX >= 4.14.0) || (SLES > 12.3.0) || (RHEL > 7.5) */ +#include +#endif /* (LINUX < 4.14.0) || (SLES <= 12.3.0) || (RHEL <= 7.5) */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,15,0)) +#if ((RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,6))) || \ + (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(15,1,0)))) +#define HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +#define HAVE_TCF_BLOCK +#else /* RHEL >= 7.6 || SLES >= 15.1 */ +#endif /* !(RHEL >= 7.6) && !(SLES >= 15.1) */ +void _kc_ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst, + struct ethtool_link_ksettings *src); +#define ethtool_intersect_link_masks _kc_ethtool_intersect_link_masks +#else /* >= 4.15 */ +#define HAVE_NDO_BPF +#define HAVE_XDP_BUFF_DATA_META +#define HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +#define HAVE_TCF_BLOCK +#endif /* 4.15.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,16,0)) +#if (!(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,7)) && \ + !(SLE_VERSION_CODE >= SLE_VERSION(12,4,0) && \ + SLE_VERSION_CODE < SLE_VERSION(15,0,0)) && \ + !(SLE_VERSION_CODE >= SLE_VERSION(15,1,0))) +/* The return value of the strscpy() and strlcpy() functions is different. + * This could be potentially hazard for the future. + * To avoid this the void result is forced. + * So it is not possible use this function with the return value. + * Return value is required in kernel 4.3 through 4.15 + */ +#define strscpy(...) (void)(strlcpy(__VA_ARGS__)) +#endif /* !RHEL >= 7.7 && !SLES12sp4+ && !SLES15sp1+ */ + +#define pci_printk(level, pdev, fmt, arg...) \ + dev_printk(level, &(pdev)->dev, fmt, ##arg) +#define pci_emerg(pdev, fmt, arg...) dev_emerg(&(pdev)->dev, fmt, ##arg) +#define pci_alert(pdev, fmt, arg...) dev_alert(&(pdev)->dev, fmt, ##arg) +#define pci_crit(pdev, fmt, arg...) dev_crit(&(pdev)->dev, fmt, ##arg) +#define pci_err(pdev, fmt, arg...) dev_err(&(pdev)->dev, fmt, ##arg) +#define pci_warn(pdev, fmt, arg...) dev_warn(&(pdev)->dev, fmt, ##arg) +#define pci_notice(pdev, fmt, arg...) dev_notice(&(pdev)->dev, fmt, ##arg) +#define pci_info(pdev, fmt, arg...) dev_info(&(pdev)->dev, fmt, ##arg) +#define pci_dbg(pdev, fmt, arg...) dev_dbg(&(pdev)->dev, fmt, ##arg) + +#ifndef array_index_nospec +static inline unsigned long _kc_array_index_mask_nospec(unsigned long index, + unsigned long size) +{ + /* + * Always calculate and emit the mask even if the compiler + * thinks the mask is not needed. The compiler does not take + * into account the value of @index under speculation. 
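+	 *
+	 * The result is ~0UL when index < size and 0 otherwise, so callers
+	 * can clamp a possibly mispredicted index, e.g. (illustrative only):
+	 *	idx = array_index_nospec(idx, ARRAY_SIZE(table));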
+ */ + OPTIMIZER_HIDE_VAR(index); + return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1); +} + +#define array_index_nospec(index, size) \ +({ \ + typeof(index) _i = (index); \ + typeof(size) _s = (size); \ + unsigned long _mask = _kc_array_index_mask_nospec(_i, _s); \ + \ + BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \ + BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \ + \ + (typeof(_i)) (_i & _mask); \ +}) +#endif /* array_index_nospec */ +#ifndef sizeof_field +#define sizeof_field(TYPE, MEMBER) (sizeof((((TYPE *)0)->MEMBER))) +#endif /* sizeof_field */ +#if !(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,0)) && \ + !(SLE_VERSION_CODE >= SLE_VERSION(12,5,0) && \ + SLE_VERSION_CODE < SLE_VERSION(15,0,0) || \ + SLE_VERSION_CODE >= SLE_VERSION(15,1,0)) +/* + * Copy bitmap and clear tail bits in last word. + */ +static inline void +bitmap_copy_clear_tail(unsigned long *dst, const unsigned long *src, unsigned int nbits) +{ + bitmap_copy(dst, src, nbits); + if (nbits % BITS_PER_LONG) + dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits); +} + +/* + * On 32-bit systems bitmaps are represented as u32 arrays internally, and + * therefore conversion is not needed when copying data from/to arrays of u32. + */ +#if BITS_PER_LONG == 64 +void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits); +#else +#define bitmap_from_arr32(bitmap, buf, nbits) \ + bitmap_copy_clear_tail((unsigned long *) (bitmap), \ + (const unsigned long *) (buf), (nbits)) +#endif /* BITS_PER_LONG == 64 */ +#endif /* !(RHEL >= 8.0) && !(SLES >= 12.5 && SLES < 15.0 || SLES >= 15.1) */ +#else /* >= 4.16 */ +#include +#define HAVE_XDP_BUFF_RXQ +#define HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK +#define HAVE_TCF_MIRRED_DEV +#define HAVE_VF_STATS_DROPPED +#endif /* 4.16.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,17,0)) +#include +#include +#define PCIE_SPEED_16_0GT 0x17 +#define PCI_EXP_LNKCAP_SLS_16_0GB 0x00000004 /* LNKCAP2 SLS Vector bit 3 */ +#define PCI_EXP_LNKSTA_CLS_16_0GB 0x0004 /* Current Link Speed 16.0GT/s */ +#define PCI_EXP_LNKCAP2_SLS_16_0GB 0x00000010 /* Supported Speed 16GT/s */ +void _kc_pcie_print_link_status(struct pci_dev *dev); +#define pcie_print_link_status _kc_pcie_print_link_status +#else /* >= 4.17.0 */ +#define HAVE_XDP_BUFF_IN_XDP_H +#endif /* 4.17.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,18,0)) +#include "kcompat_overflow.h" + +#if (SLE_VERSION_CODE < SLE_VERSION(15,1,0)) +#define firmware_request_nowarn request_firmware_direct +#endif /* SLES < 15.1 */ + +#else +#include +#include +#define HAVE_XDP_FRAME_STRUCT +#define HAVE_XDP_SOCK +#define HAVE_NDO_XDP_XMIT_BULK_AND_FLAGS +#define NO_NDO_XDP_FLUSH +#define HAVE_AF_XDP_SUPPORT +#endif /* 4.18.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0)) +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,0)) && \ + (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(8,2))) +#define HAVE_DEVLINK_REGIONS +#endif /* RHEL >= 8.0 && RHEL <= 8.2 */ +#define bitmap_alloc(nbits, flags) \ + kmalloc_array(BITS_TO_LONGS(nbits), sizeof(unsigned long), flags) +#define bitmap_zalloc(nbits, flags) bitmap_alloc(nbits, ((flags) | __GFP_ZERO)) +#define bitmap_free(bitmap) kfree(bitmap) +#ifdef ETHTOOL_GLINKSETTINGS +#define ethtool_ks_clear(ptr, name) \ + 
ethtool_link_ksettings_zero_link_mode(ptr, name) +#define ethtool_ks_add_mode(ptr, name, mode) \ + ethtool_link_ksettings_add_link_mode(ptr, name, mode) +#define ethtool_ks_del_mode(ptr, name, mode) \ + ethtool_link_ksettings_del_link_mode(ptr, name, mode) +#define ethtool_ks_test(ptr, name, mode) \ + ethtool_link_ksettings_test_link_mode(ptr, name, mode) +#endif /* ETHTOOL_GLINKSETTINGS */ +#define HAVE_NETPOLL_CONTROLLER +#define REQUIRE_PCI_CLEANUP_AER_ERROR_STATUS +#if (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(15,1,0))) +#define HAVE_TCF_MIRRED_DEV +#define HAVE_NDO_SELECT_QUEUE_SB_DEV +#define HAVE_TCF_BLOCK_CB_REGISTER_EXTACK +#endif + +static inline void __kc_metadata_dst_free(void *md_dst) +{ + kfree(md_dst); +} + +#define metadata_dst_free(md_dst) __kc_metadata_dst_free(md_dst) +#else /* >= 4.19.0 */ +#define HAVE_TCF_BLOCK_CB_REGISTER_EXTACK +#define NO_NETDEV_BPF_PROG_ATTACHED +#define HAVE_NDO_SELECT_QUEUE_SB_DEV +#define HAVE_NETDEV_SB_DEV +#define HAVE_TCF_VLAN_TPID +#define HAVE_RHASHTABLE_TYPES +#define HAVE_DEVLINK_REGIONS +#define HAVE_DEVLINK_PARAMS +#endif /* 4.19.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,20,0)) +#define HAVE_XDP_UMEM_PROPS +#ifdef HAVE_AF_XDP_SUPPORT +#ifndef napi_if_scheduled_mark_missed +static inline bool __kc_napi_if_scheduled_mark_missed(struct napi_struct *n) +{ + unsigned long val, new; + + do { + val = READ_ONCE(n->state); + if (val & NAPIF_STATE_DISABLE) + return true; + + if (!(val & NAPIF_STATE_SCHED)) + return false; + + new = val | NAPIF_STATE_MISSED; + } while (cmpxchg(&n->state, val, new) != val); + + return true; +} + +#define napi_if_scheduled_mark_missed __kc_napi_if_scheduled_mark_missed +#endif /* !napi_if_scheduled_mark_missed */ +#endif /* HAVE_AF_XDP_SUPPORT */ +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,0))) +#define HAVE_DEVLINK_ESWITCH_OPS_EXTACK +#endif /* RHEL >= 8.0 */ +#if ((SLE_VERSION_CODE >= SLE_VERSION(12,5,0) && \ + SLE_VERSION_CODE < SLE_VERSION(15,0,0)) || \ + (SLE_VERSION_CODE >= SLE_VERSION(15,1,0))) +#define HAVE_DEVLINK_ESWITCH_OPS_EXTACK +#endif /* SLE == 12sp5 || SLE >= 15sp1 */ +#else /* >= 4.20.0 */ +#define HAVE_DEVLINK_ESWITCH_OPS_EXTACK +#define HAVE_AF_XDP_ZC_SUPPORT +#define HAVE_VXLAN_TYPE +#define HAVE_ETF_SUPPORT /* Earliest TxTime First */ +#endif /* 4.20.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,0,0)) +#if (!(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(8,0))) +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,12,0)) +#define NETLINK_MAX_COOKIE_LEN 20 +struct netlink_ext_ack { + const char *_msg; + const struct nlattr *bad_attr; + u8 cookie[NETLINK_MAX_COOKIE_LEN]; + u8 cookie_len; +}; + +#endif /* < 4.12 */ +static inline int _kc_dev_open(struct net_device *netdev, + struct netlink_ext_ack __always_unused *extack) +{ + return dev_open(netdev); +} + +#define dev_open _kc_dev_open + +static inline int +_kc_dev_change_flags(struct net_device *netdev, unsigned int flags, + struct netlink_ext_ack __always_unused *extack) +{ + return dev_change_flags(netdev, flags); +} + +#define dev_change_flags _kc_dev_change_flags +#endif /* !(RHEL_RELEASE_CODE && RHEL > RHEL(8,0)) */ +#if (RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,7) && \ + RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,0)) || \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,1))) +#define 
HAVE_PTP_SYS_OFFSET_EXTENDED_IOCTL +#define HAVE_PTP_CLOCK_INFO_GETTIMEX64 +#else /* RHEL >= 7.7 && RHEL < 8.0 || RHEL >= 8.1 */ +struct ptp_system_timestamp { + struct timespec64 pre_ts; + struct timespec64 post_ts; +}; + +static inline void +ptp_read_system_prets(struct ptp_system_timestamp __always_unused *sts) +{ + ; +} + +static inline void +ptp_read_system_postts(struct ptp_system_timestamp __always_unused *sts) +{ + ; +} +#endif /* !(RHEL >= 7.7 && RHEL != 8.0) */ +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,1))) +#define HAVE_NDO_BRIDGE_SETLINK_EXTACK +#endif /* RHEL 8.1 */ +#if (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,2)) +#define HAVE_TC_INDIR_BLOCK +#endif /* RHEL 8.2 */ +#define INDIRECT_CALLABLE_DECLARE(x) x +#else /* >= 5.0.0 */ +#define HAVE_PTP_SYS_OFFSET_EXTENDED_IOCTL +#define HAVE_PTP_CLOCK_INFO_GETTIMEX64 +#define HAVE_NDO_BRIDGE_SETLINK_EXTACK +#define HAVE_DMA_ALLOC_COHERENT_ZEROES_MEM +#define HAVE_GENEVE_TYPE +#define HAVE_TC_INDIR_BLOCK +#define HAVE_INDIRECT_CALL_WRAPPER_HEADER +#endif /* 5.0.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,1,0)) +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,1))) +#define HAVE_TC_FLOW_RULE_INFRASTRUCTURE +#define HAVE_NDO_FDB_ADD_EXTACK +#define HAVE_DEVLINK_INFO_GET +#define HAVE_DEVLINK_FLASH_UPDATE +#else /* RHEL < 8.1 */ +#ifdef HAVE_TC_SETUP_CLSFLOWER +#include + +struct flow_match { + struct flow_dissector *dissector; + void *mask; + void *key; +}; + +struct flow_match_basic { + struct flow_dissector_key_basic *key, *mask; +}; + +struct flow_match_control { + struct flow_dissector_key_control *key, *mask; +}; + +struct flow_match_eth_addrs { + struct flow_dissector_key_eth_addrs *key, *mask; +}; + +#ifdef HAVE_TC_FLOWER_ENC +struct flow_match_enc_keyid { + struct flow_dissector_key_keyid *key, *mask; +}; +#endif + +#ifndef HAVE_TC_FLOWER_VLAN_IN_TAGS +struct flow_match_vlan { + struct flow_dissector_key_vlan *key, *mask; +}; +#endif + +struct flow_match_ipv4_addrs { + struct flow_dissector_key_ipv4_addrs *key, *mask; +}; + +struct flow_match_ipv6_addrs { + struct flow_dissector_key_ipv6_addrs *key, *mask; +}; + +struct flow_match_ports { + struct flow_dissector_key_ports *key, *mask; +}; + +struct flow_rule { + struct flow_match match; +}; + +void flow_rule_match_basic(const struct flow_rule *rule, + struct flow_match_basic *out); +void flow_rule_match_control(const struct flow_rule *rule, + struct flow_match_control *out); +void flow_rule_match_eth_addrs(const struct flow_rule *rule, + struct flow_match_eth_addrs *out); +#ifndef HAVE_TC_FLOWER_VLAN_IN_TAGS +void flow_rule_match_vlan(const struct flow_rule *rule, + struct flow_match_vlan *out); +#endif +void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, + struct flow_match_ipv4_addrs *out); +void flow_rule_match_ipv6_addrs(const struct flow_rule *rule, + struct flow_match_ipv6_addrs *out); +void flow_rule_match_ports(const struct flow_rule *rule, + struct flow_match_ports *out); +#ifdef HAVE_TC_FLOWER_ENC +void flow_rule_match_enc_ports(const struct flow_rule *rule, + struct flow_match_ports *out); +void flow_rule_match_enc_control(const struct flow_rule *rule, + struct flow_match_control *out); +void flow_rule_match_enc_ipv4_addrs(const struct flow_rule *rule, + struct flow_match_ipv4_addrs *out); +void flow_rule_match_enc_ipv6_addrs(const struct flow_rule *rule, + struct flow_match_ipv6_addrs *out); +void 
flow_rule_match_enc_keyid(const struct flow_rule *rule, + struct flow_match_enc_keyid *out); +#endif + +static inline struct flow_rule * +tc_cls_flower_offload_flow_rule(struct tc_cls_flower_offload *tc_flow_cmd) +{ + return (struct flow_rule *)&tc_flow_cmd->dissector; +} + +static inline bool flow_rule_match_key(const struct flow_rule *rule, + enum flow_dissector_key_id key) +{ + return dissector_uses_key(rule->match.dissector, key); +} +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + +#endif /* RHEL < 8.1 */ + +#if (!(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,1))) +#define devlink_params_publish(devlink) do { } while (0) +#define devlink_params_unpublish(devlink) do { } while (0) +#endif + +#else /* >= 5.1.0 */ +#define HAVE_NDO_FDB_ADD_EXTACK +#define NO_XDP_QUERY_XSK_UMEM +#define HAVE_AF_XDP_NETDEV_UMEM +#define HAVE_TC_FLOW_RULE_INFRASTRUCTURE +#define HAVE_TC_FLOWER_ENC_IP +#define HAVE_DEVLINK_INFO_GET +#define HAVE_DEVLINK_FLASH_UPDATE +#define HAVE_DEVLINK_PORT_PARAMS +#endif /* 5.1.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,2,0)) +#if (defined HAVE_SKB_XMIT_MORE) && \ +(!(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,2)))) +#define netdev_xmit_more() (skb->xmit_more) +#else +#define netdev_xmit_more() (0) +#endif + +#if (!(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,2)))) +#ifndef eth_get_headlen +static inline u32 +__kc_eth_get_headlen(const struct net_device __always_unused *dev, void *data, + unsigned int len) +{ + return eth_get_headlen(data, len); +} + +#define eth_get_headlen(dev, data, len) __kc_eth_get_headlen(dev, data, len) +#endif /* !eth_get_headlen */ +#endif /* !RHEL >= 8.2 */ + +#ifndef mmiowb +#ifdef CONFIG_IA64 +#define mmiowb() asm volatile ("mf.a" ::: "memory") +#else +#define mmiowb() +#endif +#endif /* mmiowb */ + +#if (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(8,1)) +#define HAVE_NDO_GET_DEVLINK_PORT +#endif /* RHEL > 8.1 */ + +#else /* >= 5.2.0 */ +#define HAVE_NDO_SELECT_QUEUE_FALLBACK_REMOVED +#define SPIN_UNLOCK_IMPLIES_MMIOWB +#define HAVE_NDO_GET_DEVLINK_PORT +#endif /* 5.2.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,3,0)) +#if (!(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,2))) +#define flow_block_offload tc_block_offload +#define flow_block_command tc_block_command +#define flow_cls_offload tc_cls_flower_offload +#define flow_block_binder_type tcf_block_binder_type +#define flow_cls_common_offload tc_cls_common_offload +#define flow_cls_offload_flow_rule tc_cls_flower_offload_flow_rule +#define FLOW_CLS_REPLACE TC_CLSFLOWER_REPLACE +#define FLOW_CLS_DESTROY TC_CLSFLOWER_DESTROY +#define FLOW_CLS_STATS TC_CLSFLOWER_STATS +#define FLOW_CLS_TMPLT_CREATE TC_CLSFLOWER_TMPLT_CREATE +#define FLOW_CLS_TMPLT_DESTROY TC_CLSFLOWER_TMPLT_DESTROY +#define FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS \ + TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS +#define FLOW_BLOCK_BIND TC_BLOCK_BIND +#define FLOW_BLOCK_UNBIND TC_BLOCK_UNBIND + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +#include + +int _kc_flow_block_cb_setup_simple(struct flow_block_offload *f, + struct list_head *driver_list, + tc_setup_cb_t *cb, + void *cb_ident, void *cb_priv, + bool ingress_only); + +#define flow_block_cb_setup_simple(f, driver_list, cb, cb_ident, cb_priv, \ + ingress_only) \ + _kc_flow_block_cb_setup_simple(f, driver_list, cb, cb_ident, cb_priv, \ + ingress_only) +#endif /* 
HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ +#else /* RHEL >= 8.2 */ +#define HAVE_FLOW_BLOCK_API +#define HAVE_DEVLINK_PORT_ATTR_PCI_VF +#endif /* RHEL >= 8.2 */ + +#ifndef ETH_P_LLDP +#define ETH_P_LLDP 0x88CC +#endif /* !ETH_P_LLDP */ + +#else /* >= 5.3.0 */ +#define XSK_UMEM_RETURNS_XDP_DESC +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,8,0)) +#if !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(15,3,0)) +#define HAVE_XSK_UMEM_HAS_ADDRS +#endif /* SLE < 15.3 */ +#endif /* < 5.8.0*/ +#define HAVE_FLOW_BLOCK_API +#define HAVE_DEVLINK_PORT_ATTR_PCI_VF +#if IS_ENABLED(CONFIG_DIMLIB) +#define HAVE_CONFIG_DIMLIB +#endif +#endif /* 5.3.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,4,0)) +#if (SLE_VERSION_CODE >= SLE_VERSION(15,2,0)) +#define HAVE_NDO_XSK_WAKEUP +#endif /* SLES15sp2 */ +#else /* >= 5.4.0 */ +#define HAVE_NDO_XSK_WAKEUP +#endif /* 5.4.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,5,0)) +static inline unsigned long _kc_bitmap_get_value8(const unsigned long *map, + unsigned long start) +{ + const size_t index = BIT_WORD(start); + const unsigned long offset = start % BITS_PER_LONG; + + return (map[index] >> offset) & 0xFF; +} +#define bitmap_get_value8 _kc_bitmap_get_value8 + +static inline void _kc_bitmap_set_value8(unsigned long *map, + unsigned long value, + unsigned long start) +{ + const size_t index = BIT_WORD(start); + const unsigned long offset = start % BITS_PER_LONG; + + map[index] &= ~(0xFFUL << offset); + map[index] |= value << offset; +} +#define bitmap_set_value8 _kc_bitmap_set_value8 + +#endif /* 5.5.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,6,0)) +#ifdef HAVE_AF_XDP_SUPPORT +#define xsk_umem_release_addr xsk_umem_discard_addr +#define xsk_umem_release_addr_rq xsk_umem_discard_addr_rq +#endif /* HAVE_AF_XDP_SUPPORT */ +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,3)) || \ + (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(15,3,0))) +#define HAVE_TX_TIMEOUT_TXQUEUE +#endif +#else /* >= 5.6.0 */ +#define HAVE_TX_TIMEOUT_TXQUEUE +#endif /* 5.6.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,7,0)) +u64 _kc_pci_get_dsn(struct pci_dev *dev); +#define pci_get_dsn(dev) _kc_pci_get_dsn(dev) +#if !(SLE_VERSION_CODE > SLE_VERSION(15,2,0)) && \ + !((LINUX_VERSION_CODE == KERNEL_VERSION(5,3,18)) && \ + (SLE_LOCALVERSION_CODE >= KERNEL_VERSION(14,0,0))) && \ + !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,3))) +#define pci_aer_clear_nonfatal_status pci_cleanup_aer_uncorrect_error_status +#endif + +#define cpu_latency_qos_update_request pm_qos_update_request +#define cpu_latency_qos_add_request(arg1, arg2) pm_qos_add_request(arg1, PM_QOS_CPU_DMA_LATENCY, arg2) +#define cpu_latency_qos_remove_request pm_qos_remove_request + +#ifndef DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID +#define DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID "fw.bundle_id" +#endif +#else /* >= 5.7.0 */ +#define HAVE_DEVLINK_REGION_OPS_SNAPSHOT +#define HAVE_ETHTOOL_COALESCE_PARAMS_SUPPORT +#endif /* 5.7.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,8,0)) +#if !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,4))) && \ + 
!(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(15,3,0)) +#define xdp_convert_buff_to_frame convert_to_xdp_frame +#endif /* (RHEL < 8.4) || (SLE < 15.3) */ +#define flex_array_size(p, member, count) \ + array_size(count, sizeof(*(p)->member) + __must_be_array((p)->member)) +#if (!(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(15,3,0))) +#ifdef HAVE_AF_XDP_ZC_SUPPORT +#ifndef xsk_umem_get_rx_frame_size +static inline u32 _xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +{ + return umem->chunk_size_nohr - XDP_PACKET_HEADROOM; +} + +#define xsk_umem_get_rx_frame_size _xsk_umem_get_rx_frame_size +#endif /* xsk_umem_get_rx_frame_size */ +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ +#else /* SLE >= 15.3 */ +#define HAVE_XDP_BUFF_FRAME_SZ +#define HAVE_MEM_TYPE_XSK_BUFF_POOL +#endif /* SLE >= 15.3 */ +#else /* >= 5.8.0 */ +#define HAVE_TC_FLOW_INDIR_DEV +#define HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP +#define HAVE_XDP_BUFF_FRAME_SZ +#define HAVE_MEM_TYPE_XSK_BUFF_POOL +#endif /* 5.8.0 */ +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,3))) +#define HAVE_TC_FLOW_INDIR_DEV +#endif +#if (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(15,3,0))) +#define HAVE_TC_FLOW_INDIR_DEV +#endif /* SLE_VERSION_CODE && SLE_VERSION_CODE >= SLES15SP3 */ + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,4))) +#define HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP +#endif /* (RHEL >= 8.4) */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,9,0)) +#else /* >= 5.9.0 */ +#define HAVE_FLOW_INDIR_BLOCK_QDISC +#define HAVE_UDP_TUNNEL_NIC_INFO +#endif /* 5.9.0 */ +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(8,3))) +#define HAVE_FLOW_INDIR_BLOCK_QDISC +#endif +#if (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(15,3,0))) +#define HAVE_FLOW_INDIR_BLOCK_QDISC +#endif /* SLE_VERSION_CODE && SLE_VERSION_CODE >= SLES15SP3 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,10,0)) +#if (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(15,3,0)) +#define HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS +#define HAVE_DEVLINK_FLASH_UPDATE_PARAMS +#else /* SLE >= 15.3 */ +struct devlink_flash_update_params { + const char *file_name; + const char *component; + u32 overwrite_mask; +}; + +#ifndef DEVLINK_FLASH_OVERWRITE_SETTINGS +#define DEVLINK_FLASH_OVERWRITE_SETTINGS BIT(0) +#endif + +#ifndef DEVLINK_FLASH_OVERWRITE_IDENTIFIERS +#define DEVLINK_FLASH_OVERWRITE_IDENTIFIERS BIT(1) +#endif +#endif /* !(SLE >= 15.3) */ + +#if (!(SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(15,3,0)))) +#define XDP_SETUP_XSK_POOL XDP_SETUP_XSK_UMEM +#define xsk_get_pool_from_qid xdp_get_umem_from_qid +#define xsk_pool_get_rx_frame_size xsk_umem_get_rx_frame_size +#define xsk_pool_set_rxq_info xsk_buff_set_rxq_info +#define xsk_pool_dma_unmap xsk_buff_dma_unmap +#define xsk_pool_dma_map xsk_buff_dma_map +#define xsk_tx_peek_desc xsk_umem_consume_tx +#define xsk_tx_release xsk_umem_consume_tx_done +#define xsk_tx_completed xsk_umem_complete_tx +#define xsk_uses_need_wakeup xsk_umem_uses_need_wakeup +#ifdef HAVE_MEM_TYPE_XSK_BUFF_POOL +#include +static inline void +_kc_xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, + void __always_unused *pool) +{ + xsk_buff_dma_sync_for_cpu(xdp); +} + +#define xsk_buff_dma_sync_for_cpu(xdp, pool) \ + 
_kc_xsk_buff_dma_sync_for_cpu(xdp, pool) +#endif /* HAVE_MEM_TYPE_XSK_BUFF_POOL */ +#else /* SLE >= 15.3 */ +#define HAVE_NETDEV_BPF_XSK_POOL +#endif /* SLE >= 15.3 */ +#else /* >= 5.10.0 */ +#define HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS +#define HAVE_DEVLINK_FLASH_UPDATE_PARAMS +#define HAVE_NETDEV_BPF_XSK_POOL +#endif /* 5.10.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,11,0)) +#ifdef HAVE_XDP_BUFF_RXQ +#include +static inline int +_kc_xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, + u32 queue_index, unsigned int __always_unused napi_id) +{ + return xdp_rxq_info_reg(xdp_rxq, dev, queue_index); +} + +#define xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id) \ + _kc_xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id) +#endif /* HAVE_XDP_BUFF_RXQ */ +#ifdef HAVE_NAPI_BUSY_LOOP +#ifdef CONFIG_NET_RX_BUSY_POLL +#include +static inline void +_kc_napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), void *loop_end_arg, + bool __always_unused prefer_busy_poll, + u16 __always_unused budget) +{ + napi_busy_loop(napi_id, loop_end, loop_end_arg); +} + +#define napi_busy_loop(napi_id, loop_end, loop_end_arg, prefer_busy_poll, budget) \ + _kc_napi_busy_loop(napi_id, loop_end, loop_end_arg, prefer_busy_poll, budget) +#endif /* CONFIG_NET_RX_BUSY_POLL */ +#endif /* HAVE_NAPI_BUSY_LOOP */ +#define HAVE_DEVLINK_FLASH_UPDATE_BEGIN_END_NOTIFY +#else /* >= 5.11.0 */ +#define HAVE_DEVLINK_FLASH_UPDATE_PARAMS_FW +#endif /* 5.11.0 */ + +/* + * Load the implementations file which actually defines kcompat backports. + * Legacy backports still exist in this file, but all new backports must be + * implemented using kcompat_*defs.h and kcompat_impl.h + */ +#include "kcompat_impl.h" + +#endif /* _KCOMPAT_H_ */ diff --git a/drivers/net/ethernet/intel/ice/kcompat_dim.c b/drivers/net/ethernet/intel/ice/kcompat_dim.c new file mode 100644 index 000000000000..f5228e9718e4 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat_dim.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. 
+ */ + +#include "kcompat.h" +#include "kcompat_dim.h" + +bool dim_on_top(struct dim *dim) +{ + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + case DIM_PARKING_TIRED: + return true; + case DIM_GOING_RIGHT: + return (dim->steps_left > 1) && (dim->steps_right == 1); + default: /* DIM_GOING_LEFT */ + return (dim->steps_right > 1) && (dim->steps_left == 1); + } +} + +void dim_turn(struct dim *dim) +{ + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + case DIM_PARKING_TIRED: + break; + case DIM_GOING_RIGHT: + dim->tune_state = DIM_GOING_LEFT; + dim->steps_left = 0; + break; + case DIM_GOING_LEFT: + dim->tune_state = DIM_GOING_RIGHT; + dim->steps_right = 0; + break; + } +} + +void dim_park_on_top(struct dim *dim) +{ + dim->steps_right = 0; + dim->steps_left = 0; + dim->tired = 0; + dim->tune_state = DIM_PARKING_ON_TOP; +} + +void dim_park_tired(struct dim *dim) +{ + dim->steps_right = 0; + dim->steps_left = 0; + dim->tune_state = DIM_PARKING_TIRED; +} + +void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, + struct dim_stats *curr_stats) +{ + /* u32 holds up to 71 minutes, should be enough */ + u32 delta_us = ktime_us_delta(end->time, start->time); + u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr); + u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr, + start->byte_ctr); + u32 ncomps = BIT_GAP(BITS_PER_TYPE(u32), end->comp_ctr, + start->comp_ctr); + + if (!delta_us) + return; + + curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us); + curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us); + curr_stats->epms = DIV_ROUND_UP(DIM_NEVENTS * USEC_PER_MSEC, + delta_us); + curr_stats->cpms = DIV_ROUND_UP(ncomps * USEC_PER_MSEC, delta_us); + if (curr_stats->epms != 0) + curr_stats->cpe_ratio = DIV_ROUND_DOWN_ULL( + curr_stats->cpms * 100, curr_stats->epms); + else + curr_stats->cpe_ratio = 0; + +} diff --git a/drivers/net/ethernet/intel/ice/kcompat_dim.h b/drivers/net/ethernet/intel/ice/kcompat_dim.h new file mode 100644 index 000000000000..ab8da64c8a83 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat_dim.h @@ -0,0 +1,339 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#ifndef _KCOMPAT_DIM_H_ +#define _KCOMPAT_DIM_H_ + +#include +#include +#include +#include +#include + +/* + * Number of events between DIM iterations. + * Causes a moderation of the algorithm run. + */ +#define DIM_NEVENTS 64 + +/* + * Is a difference between values justifies taking an action. + * We consider 10% difference as significant. + */ +#define IS_SIGNIFICANT_DIFF(val, ref) \ + (((100UL * abs((val) - (ref))) / (ref)) > 10) + +/* + * Calculate the gap between two values. + * Take wrap-around and variable size into consideration. + */ +#define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) \ + & (BIT_ULL(bits) - 1)) + +/** + * struct dim_cq_moder - Structure for CQ moderation values. + * Used for communications between DIM and its consumer. + * + * @usec: CQ timer suggestion (by DIM) + * @pkts: CQ packet counter suggestion (by DIM) + * @comps: Completion counter + * @cq_period_mode: CQ period count mode (from CQE/EQE) + */ +struct dim_cq_moder { + u16 usec; + u16 pkts; + u16 comps; + u8 cq_period_mode; +}; + +/** + * struct dim_sample - Structure for DIM sample data. + * Used for communications between DIM and its consumer. 
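+ * The packet, byte and completion counters are free-running and may wrap;
+ * dim_calc_stats() accounts for this by computing deltas with BIT_GAP().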
+ * + * @time: Sample timestamp + * @pkt_ctr: Number of packets + * @byte_ctr: Number of bytes + * @event_ctr: Number of events + * @comp_ctr: Current completion counter + */ +struct dim_sample { + ktime_t time; + u32 pkt_ctr; + u32 byte_ctr; + u16 event_ctr; + u32 comp_ctr; +}; + +/** + * struct dim_stats - Structure for DIM stats. + * Used for holding current measured rates. + * + * @ppms: Packets per msec + * @bpms: Bytes per msec + * @epms: Events per msec + * @cpms: Completions per msec + * @cpe_ratio: Ratio of completions to events + */ +struct dim_stats { + int ppms; /* packets per msec */ + int bpms; /* bytes per msec */ + int epms; /* events per msec */ + int cpms; /* completions per msec */ + int cpe_ratio; /* ratio of completions to events */ +}; + +/** + * struct dim - Main structure for dynamic interrupt moderation (DIM). + * Used for holding all information about a specific DIM instance. + * + * @state: Algorithm state (see below) + * @prev_stats: Measured rates from previous iteration (for comparison) + * @start_sample: Sampled data at start of current iteration + * @measuring_sample: A &dim_sample that is used to update the current events + * @work: Work to perform on action required + * @priv: A pointer to the struct that points to dim + * @profile_ix: Current moderation profile + * @mode: CQ period count mode + * @tune_state: Algorithm tuning state (see below) + * @steps_right: Number of steps taken towards higher moderation + * @steps_left: Number of steps taken towards lower moderation + * @tired: Parking depth counter + */ +struct dim { + u8 state; + struct dim_stats prev_stats; + struct dim_sample start_sample; + struct dim_sample measuring_sample; + struct work_struct work; + void *priv; + u8 profile_ix; + u8 mode; + u8 tune_state; + u8 steps_right; + u8 steps_left; + u8 tired; +}; + +/** + * enum dim_cq_period_mode - Modes for CQ period count + * + * @DIM_CQ_PERIOD_MODE_START_FROM_EQE: Start counting from EQE + * @DIM_CQ_PERIOD_MODE_START_FROM_CQE: Start counting from CQE (implies timer reset) + * @DIM_CQ_PERIOD_NUM_MODES: Number of modes + */ +enum dim_cq_period_mode { + DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0, + DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1, + DIM_CQ_PERIOD_NUM_MODES +}; + +/** + * enum dim_state - DIM algorithm states + * + * These will determine if the algorithm is in a valid state to start an iteration. + * + * @DIM_START_MEASURE: This is the first iteration (also after applying a new profile) + * @DIM_MEASURE_IN_PROGRESS: Algorithm is already in progress - check if + * need to perform an action + * @DIM_APPLY_NEW_PROFILE: DIM consumer is currently applying a profile - no need to measure + */ +enum dim_state { + DIM_START_MEASURE, + DIM_MEASURE_IN_PROGRESS, + DIM_APPLY_NEW_PROFILE, +}; + +/** + * enum dim_tune_state - DIM algorithm tune states + * + * These will determine which action the algorithm should perform. + * + * @DIM_PARKING_ON_TOP: Algorithm found a local top point - exit on significant difference + * @DIM_PARKING_TIRED: Algorithm found a deep top point - don't exit if tired > 0 + * @DIM_GOING_RIGHT: Algorithm is currently trying higher moderation levels + * @DIM_GOING_LEFT: Algorithm is currently trying lower moderation levels + */ +enum dim_tune_state { + DIM_PARKING_ON_TOP, + DIM_PARKING_TIRED, + DIM_GOING_RIGHT, + DIM_GOING_LEFT, +}; + +/** + * enum dim_stats_state - DIM algorithm statistics states + * + * These will determine the verdict of current iteration. 
+ * + * @DIM_STATS_WORSE: Current iteration shows worse performance than before + * @DIM_STATS_SAME: Current iteration shows same performance than before + * @DIM_STATS_BETTER: Current iteration shows better performance than before + */ +enum dim_stats_state { + DIM_STATS_WORSE, + DIM_STATS_SAME, + DIM_STATS_BETTER, +}; + +/** + * enum dim_step_result - DIM algorithm step results + * + * These describe the result of a step. + * + * @DIM_STEPPED: Performed a regular step + * @DIM_TOO_TIRED: Same kind of step was done multiple times - should go to + * tired parking + * @DIM_ON_EDGE: Stepped to the most left/right profile + */ +enum dim_step_result { + DIM_STEPPED, + DIM_TOO_TIRED, + DIM_ON_EDGE, +}; + +/** + * dim_on_top - check if current state is a good place to stop (top location) + * @dim: DIM context + * + * Check if current profile is a good place to park at. + * This will result in reducing the DIM checks frequency as we assume we + * shouldn't probably change profiles, unless traffic pattern wasn't changed. + */ +bool dim_on_top(struct dim *dim); + +/** + * dim_turn - change profile altering direction + * @dim: DIM context + * + * Go left if we were going right and vice-versa. + * Do nothing if currently parking. + */ +void dim_turn(struct dim *dim); + +/** + * dim_park_on_top - enter a parking state on a top location + * @dim: DIM context + * + * Enter parking state. + * Clear all movement history. + */ +void dim_park_on_top(struct dim *dim); + +/** + * dim_park_tired - enter a tired parking state + * @dim: DIM context + * + * Enter parking state. + * Clear all movement history and cause DIM checks frequency to reduce. + */ +void dim_park_tired(struct dim *dim); + +/** + * dim_calc_stats - calculate the difference between two samples + * @start: start sample + * @end: end sample + * @curr_stats: delta between samples + * + * Calculate the delta between two samples (in data rates). + * Takes into consideration counter wrap-around. 
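+ *
+ * As an illustrative worked example of the wrap-around handling, a 16-bit
+ * counter pair that wrapped from 0xFFFE to 2 still yields the expected
+ * delta of 4:
+ *
+ *   BIT_GAP(16, 2, 0xFFFE) == ((2 - 0xFFFE) + BIT_ULL(16)) & (BIT_ULL(16) - 1)
+ *                          == 4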
+ */ +void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, + struct dim_stats *curr_stats); + +/** + * dim_update_sample - set a sample's fields with given values + * @event_ctr: number of events to set + * @packets: number of packets to set + * @bytes: number of bytes to set + * @s: DIM sample + */ +static inline void +dim_update_sample(u16 event_ctr, u64 packets, u64 bytes, struct dim_sample *s) +{ + s->time = ktime_get(); + s->pkt_ctr = packets; + s->byte_ctr = bytes; + s->event_ctr = event_ctr; +} + +/** + * dim_update_sample_with_comps - set a sample's fields with given + * values including the completion parameter + * @event_ctr: number of events to set + * @packets: number of packets to set + * @bytes: number of bytes to set + * @comps: number of completions to set + * @s: DIM sample + */ +static inline void +dim_update_sample_with_comps(u16 event_ctr, u64 packets, u64 bytes, u64 comps, + struct dim_sample *s) +{ + dim_update_sample(event_ctr, packets, bytes, s); + s->comp_ctr = comps; +} + +/* Net DIM */ + +/** + * net_dim_get_rx_moderation - provide a CQ moderation object for the given RX profile + * @cq_period_mode: CQ period mode + * @ix: Profile index + */ +struct dim_cq_moder net_dim_get_rx_moderation(u8 cq_period_mode, int ix); + +/** + * net_dim_get_def_rx_moderation - provide the default RX moderation + * @cq_period_mode: CQ period mode + */ +struct dim_cq_moder net_dim_get_def_rx_moderation(u8 cq_period_mode); + +/** + * net_dim_get_tx_moderation - provide a CQ moderation object for the given TX profile + * @cq_period_mode: CQ period mode + * @ix: Profile index + */ +struct dim_cq_moder net_dim_get_tx_moderation(u8 cq_period_mode, int ix); + +/** + * net_dim_get_def_tx_moderation - provide the default TX moderation + * @cq_period_mode: CQ period mode + */ +struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode); + +/** + * net_dim - main DIM algorithm entry point + * @dim: DIM instance information + * @end_sample: Current data measurement + * + * Called by the consumer. + * This is the main logic of the algorithm, where data is processed in order + * to decide on next required action. + */ +void net_dim(struct dim *dim, struct dim_sample end_sample); + +/* RDMA DIM */ + +/* + * RDMA DIM profile: + * profile size must be of RDMA_DIM_PARAMS_NUM_PROFILES. + */ +#define RDMA_DIM_PARAMS_NUM_PROFILES 9 +#define RDMA_DIM_START_PROFILE 0 + +/** + * rdma_dim - Runs the adaptive moderation. + * @dim: The moderation struct. + * @completions: The number of completions collected in this round. + * + * Each call to rdma_dim takes the latest amount of completions that + * have been collected and counts them as a new event. + * Once enough events have been collected the algorithm decides a new + * moderation level. + */ +void rdma_dim(struct dim *dim, u64 completions); + +#endif /* DIM_H */ diff --git a/drivers/net/ethernet/intel/ice/kcompat_impl.h b/drivers/net/ethernet/intel/ice/kcompat_impl.h new file mode 100644 index 000000000000..266415ca7a96 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat_impl.h @@ -0,0 +1,498 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _KCOMPAT_IMPL_H_ +#define _KCOMPAT_IMPL_H_ + +/* This file contains implementations of backports from various kernels. It + * must rely only on NEED_ and HAVE_ checks. It must not make any + * checks to determine the kernel version when deciding whether to include an + * implementation. 
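+ *
+ * A backport in this file therefore follows roughly this sketch, where
+ * NEED_FOO stands in for a hypothetical flag defined by the kcompat_*defs.h
+ * headers (illustration only, not an actual backport):
+ *
+ *   #ifdef NEED_FOO
+ *   static inline void foo(void)
+ *   {
+ *           (implementation of the missing upstream helper)
+ *   }
+ *   #endif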
+ * + * All new implementations must go in this file, and legacy implementations + * should be migrated to the new format over time. + */ + +/* + * generic network stack functions + */ + +/* NEED_NET_PREFETCH + * + * net_prefetch was introduced by commit f468f21b7af0 ("net: Take common + * prefetch code structure into a function") + * + * This function is trivial to re-implement in full. + */ +#ifdef NEED_NET_PREFETCH +static inline void net_prefetch(void *p) +{ + prefetch(p); +#if L1_CACHE_BYTES < 128 + prefetch((u8 *)p + L1_CACHE_BYTES); +#endif +} +#endif /* NEED_NET_PREFETCH */ + +/* NEED_SKB_FRAG_OFF and NEED_SKB_FRAG_OFF_ADD + * + * skb_frag_off and skb_frag_off_add were added in upstream commit + * 7240b60c98d6 ("linux: Add skb_frag_t page_offset accessors") + * + * Implementing the wrappers directly for older kernels which still have the + * old implementation of skb_frag_t is trivial. + * + * LTS 4.19 backported the define for skb_frag_off in 4.19.201. + * d94d95ae0dd0 ("gro: ensure frag0 meets IP header alignment") + * Need to exclude defining skb_frag_off for 4.19.X where X > 200 + */ +#ifdef NEED_SKB_FRAG_OFF +static inline unsigned int skb_frag_off(const skb_frag_t *frag) +{ + return frag->page_offset; +} +#endif /* NEED_SKB_FRAG_OFF */ +#ifdef NEED_SKB_FRAG_OFF_ADD +static inline void skb_frag_off_add(skb_frag_t *frag, int delta) +{ + frag->page_offset += delta; +} +#endif /* NEED_SKB_FRAG_OFF_ADD */ + +/* + * NETIF_F_HW_L2FW_DOFFLOAD related functions + * + * Support for NETIF_F_HW_L2FW_DOFFLOAD was first introduced upstream by + * commit a6cc0cfa72e0 ("net: Add layer 2 hardware acceleration operations for + * macvlan devices") + */ +#ifdef NETIF_F_HW_L2FW_DOFFLOAD + +#include + +/* NEED_MACVLAN_ACCEL_PRIV + * + * macvlan_accel_priv is an accessor function that replaced direct access to + * the macvlan->fwd_priv variable. It was introduced in commit 7d775f63470c + * ("macvlan: Rename fwd_priv to accel_priv and add accessor function") + * + * Implement the new wrapper name by simply accessing the older + * macvlan->fwd_priv name. + */ +#ifdef NEED_MACVLAN_ACCEL_PRIV +static inline void *macvlan_accel_priv(struct net_device *dev) +{ + struct macvlan_dev *macvlan = netdev_priv(dev); + + return macvlan->fwd_priv; +} +#endif /* NEED_MACVLAN_ACCEL_PRIV */ + +/* NEED_MACVLAN_RELEASE_L2FW_OFFLOAD + * + * macvlan_release_l2fw_offload was introduced upstream by commit 53cd4d8e4dfb + * ("macvlan: Provide function for interfaces to release HW offload") + * + * Implementing this is straight forward, but we must be careful to use + * fwd_priv instead of accel_priv. Note that both the change to accel_priv and + * introduction of this function happened in the same release. + */ +#ifdef NEED_MACVLAN_RELEASE_L2FW_OFFLOAD +static inline int macvlan_release_l2fw_offload(struct net_device *dev) +{ + struct macvlan_dev *macvlan = netdev_priv(dev); + + macvlan->fwd_priv = NULL; + return dev_uc_add(macvlan->lowerdev, dev->dev_addr); +} +#endif /* NEED_MACVLAN_RELEASE_L2FW_OFFLOAD */ + +/* NEED_MACVLAN_SUPPORTS_DEST_FILTER + * + * macvlan_supports_dest_filter was introduced upstream by commit 6cb1937d4eff + * ("macvlan: Add function to test for destination filtering support") + * + * The implementation doesn't rely on anything new and is trivial to backport + * for kernels that have NETIF_F_HW_L2FW_DOFFLOAD support. 
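+ *
+ * An illustrative caller (not taken from this driver) would simply gate its
+ * destination-filter dependent offload setup on the helper:
+ *
+ *   if (!macvlan_supports_dest_filter(vdev))
+ *           return -EOPNOTSUPP;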
+ */ +#ifdef NEED_MACVLAN_SUPPORTS_DEST_FILTER +static inline bool macvlan_supports_dest_filter(struct net_device *dev) +{ + struct macvlan_dev *macvlan = netdev_priv(dev); + + return macvlan->mode == MACVLAN_MODE_PRIVATE || + macvlan->mode == MACVLAN_MODE_VEPA || + macvlan->mode == MACVLAN_MODE_BRIDGE; +} +#endif /* NEED_MACVLAN_SUPPORTS_DEST_FILTER */ + +#endif /* NETIF_F_HW_L2FW_DOFFLOAD */ + +/* + * tc functions + */ + +/* NEED_FLOW_INDR_BLOCK_CB_REGISTER + * + * __flow_indr_block_cb_register and __flow_indr_block_cb_unregister were + * added in upstream commit 4e481908c51b ("flow_offload: move tc indirect + * block to flow offload") + * + * This was a simple rename so we can just translate from the old + * naming scheme with a macro. + */ +#ifdef NEED_FLOW_INDR_BLOCK_CB_REGISTER +#define __flow_indr_block_cb_register __tc_indr_block_cb_register +#define __flow_indr_block_cb_unregister __tc_indr_block_cb_unregister +#endif + +/* + * devlink support + */ +#if IS_ENABLED(CONFIG_NET_DEVLINK) + +#include + +#ifdef HAVE_DEVLINK_REGIONS +/* NEED_DEVLINK_REGION_CREATE_OPS + * + * The ops parameter to devlink_region_create was added by commit e8937681797c + * ("devlink: prepare to support region operations") + * + * For older kernels, define _kc_devlink_region_create that takes an ops + * parameter, and calls the old implementation function by extracting the name + * from the structure. + */ +#ifdef NEED_DEVLINK_REGION_CREATE_OPS +struct devlink_region_ops { + const char *name; + void (*destructor)(const void *data); +}; + +static inline struct devlink_region * +_kc_devlink_region_create(struct devlink *devlink, + const struct devlink_region_ops *ops, + u32 region_max_snapshots, u64 region_size) +{ + return devlink_region_create(devlink, ops->name, region_max_snapshots, + region_size); +} + +#define devlink_region_create _kc_devlink_region_create +#endif /* NEED_DEVLINK_REGION_CREATE_OPS */ +#endif /* HAVE_DEVLINK_REGIONS */ + +/* NEED_DEVLINK_FLASH_UPDATE_STATUS_NOTIFY + * + * devlink_flash_update_status_notify, _begin_notify, and _end_notify were + * added by upstream commit 191ed2024de9 ("devlink: allow driver to update + * progress of flash update") + * + * For older kernels that lack the netlink messages, convert the functions + * into no-ops. + */ +#ifdef NEED_DEVLINK_FLASH_UPDATE_STATUS_NOTIFY +static inline void +devlink_flash_update_begin_notify(struct devlink __always_unused *devlink) +{ +} + +static inline void +devlink_flash_update_end_notify(struct devlink __always_unused *devlink) +{ +} + +static inline void +devlink_flash_update_status_notify(struct devlink __always_unused *devlink, + const char __always_unused *status_msg, + const char __always_unused *component, + unsigned long __always_unused done, + unsigned long __always_unused total) +{ +} +#endif /* NEED_DEVLINK_FLASH_UPDATE_STATUS_NOTIFY */ + +/* NEED_DEVLINK_FLASH_UPDATE_TIMEOUT_NOTIFY + * + * devlink_flash_update_timeout_notify was added by upstream commit + * f92970c694b3 ("devlink: add timeout information to status_notify"). + * + * For older kernels, just convert timeout notifications into regular status + * notification messages without timeout information. 
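+ *
+ * Callers can therefore issue the same call on every kernel; for example
+ * (message and timeout values are illustrative only):
+ *
+ *   devlink_flash_update_timeout_notify(devlink, "Preparing to flash",
+ *                                       NULL, 30);
+ *
+ * which degrades to a plain status notification without timeout data on
+ * older kernels.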
+ */ +#ifdef NEED_DEVLINK_FLASH_UPDATE_TIMEOUT_NOTIFY +static inline void +devlink_flash_update_timeout_notify(struct devlink *devlink, + const char *status_msg, + const char *component, + unsigned long __always_unused timeout) +{ + devlink_flash_update_status_notify(devlink, status_msg, component, 0, 0); +} +#endif /* NEED_DEVLINK_FLASH_UPDATE_TIMEOUT_NOTIFY */ + +/* + * NEED_DEVLINK_PORT_ATTRS_SET_STRUCT + * + * HAVE_DEVLINK_PORT_ATTRS_SET_PORT_FLAVOUR + * HAVE_DEVLINK_PORT_ATTRS_SET_SWITCH_ID + * + * devlink_port_attrs_set was introduced by commit b9ffcbaf56d3 ("devlink: + * introduce devlink_port_attrs_set") + * + * It's function signature has changed multiple times over several kernel + * releases: + * + * commit 5ec1380a21bb ("devlink: extend attrs_set for setting port + * flavours") added the ability to set port flavour. (Note that there is no + * official kernel release with devlink_port_attrs_set without the flavour + * argument, as they were introduced in the same series.) + * + * commit bec5267cded2 ("net: devlink: extend port attrs for switch ID") added + * the ability to set the switch ID (HAVE_DEVLINK_PORT_ATTRS_SET_SWITCH_ID) + * + * Finally commit 71ad8d55f8e5 ("devlink: Replace devlink_port_attrs_set + * parameters with a struct") refactored to pass devlink_port_attrs struct + * instead of individual parameters. (!NEED_DEVLINK_PORT_ATTRS_SET_STRUCT) + * + * We want core drivers to just use the latest form that takes + * a devlink_port_attrs structure. Note that this structure did exist as part + * of but was never used directly by driver code prior to the + * function parameter change. For this reason, the implementation always + * relies on _kc_devlink_port_attrs instead of what was defined in the kernel. + */ +#ifdef NEED_DEVLINK_PORT_ATTRS_SET_STRUCT + +#ifndef HAVE_DEVLINK_PORT_ATTRS_SET_PORT_FLAVOUR +enum devlink_port_flavour { + DEVLINK_PORT_FLAVOUR_PHYSICAL, + DEVLINK_PORT_FLAVOUR_CPU, + DEVLINK_PORT_FLAVOUR_DSA, + DEVLINK_PORT_FLAVOUR_PCI_PF, + DEVLINK_PORT_FLAVOUR_PCI_VF, +}; +#endif + +struct _kc_devlink_port_phys_attrs { + u32 port_number; + u32 split_subport_number; +}; + +struct _kc_devlink_port_pci_pf_attrs { + u16 pf; +}; + +struct _kc_devlink_port_pci_vf_attrs { + u16 pf; + u16 vf; +}; + +struct _kc_devlink_port_attrs { + u8 split:1, + splittable:1; + u32 lanes; + enum devlink_port_flavour flavour; + struct netdev_phys_item_id switch_id; + union { + struct _kc_devlink_port_phys_attrs phys; + struct _kc_devlink_port_pci_pf_attrs pci_pf; + struct _kc_devlink_port_pci_vf_attrs pci_vf; + }; +}; + +#define devlink_port_attrs _kc_devlink_port_attrs + +static inline void +_kc_devlink_port_attrs_set(struct devlink_port *devlink_port, + struct _kc_devlink_port_attrs *attrs) +{ +#if defined(HAVE_DEVLINK_PORT_ATTRS_SET_SWITCH_ID) + devlink_port_attrs_set(devlink_port, attrs->flavour, attrs->phys.port_number, + attrs->split, attrs->phys.split_subport_number, + attrs->switch_id.id, attrs->switch_id.id_len); +#elif defined(HAVE_DEVLINK_PORT_ATTRS_SET_PORT_FLAVOUR) + devlink_port_attrs_set(devlink_port, attrs->flavour, attrs->phys.port_number, + attrs->split, attrs->phys.split_subport_number); +#else + if (attrs->split) + devlink_port_split_set(devlink_port, attrs->phys.port_number); +#endif +} + +#define devlink_port_attrs_set _kc_devlink_port_attrs_set + +#endif /* NEED_DEVLINK_PORT_ATTRS_SET_STRUCT */ + +#endif /* CONFIG_NET_DEVLINK */ + +#ifdef NEED_IDA_ALLOC_MIN_MAX_RANGE_FREE +/* ida_alloc(), ida_alloc_min(), ida_alloc_max(), ida_alloc_range(), and + * ida_free() 
were added in commit 5ade60dda43c ("ida: add new API"). + * + * Also, using "0" as the "end" argument (3rd argument) to ida_simple_get() is + * considered the max value, which is why it's used in ida_alloc() and + * ida_alloc_min(). + */ +static inline int ida_alloc(struct ida *ida, gfp_t gfp) +{ + return ida_simple_get(ida, 0, 0, gfp); +} + +static inline int ida_alloc_min(struct ida *ida, unsigned int min, gfp_t gfp) +{ + return ida_simple_get(ida, min, 0, gfp); +} + +static inline int ida_alloc_max(struct ida *ida, unsigned int max, gfp_t gfp) +{ + return ida_simple_get(ida, 0, max, gfp); +} + +static inline int +ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max, gfp_t gfp) +{ + return ida_simple_get(ida, min, max, gfp); +} + +static inline void ida_free(struct ida *ida, unsigned int id) +{ + ida_simple_remove(ida, id); +} +#endif /* NEED_IDA_ALLOC_MIN_MAX_RANGE_FREE */ + +/* + * dev_printk implementations + */ + +/* NEED_DEV_PRINTK_ONCE + * + * The dev_*_once family of printk functions was introduced by commit + * e135303bd5be ("device: Add dev__once variants") + * + * The implementation is very straight forward so we will just implement them + * as-is here. + */ +#ifdef NEED_DEV_PRINTK_ONCE +#ifdef CONFIG_PRINTK +#define dev_level_once(dev_level, dev, fmt, ...) \ +do { \ + static bool __print_once __read_mostly; \ + \ + if (!__print_once) { \ + __print_once = true; \ + dev_level(dev, fmt, ##__VA_ARGS__); \ + } \ +} while (0) +#else +#define dev_level_once(dev_level, dev, fmt, ...) \ +do { \ + if (0) \ + dev_level(dev, fmt, ##__VA_ARGS__); \ +} while (0) +#endif + +#define dev_emerg_once(dev, fmt, ...) \ + dev_level_once(dev_emerg, dev, fmt, ##__VA_ARGS__) +#define dev_alert_once(dev, fmt, ...) \ + dev_level_once(dev_alert, dev, fmt, ##__VA_ARGS__) +#define dev_crit_once(dev, fmt, ...) \ + dev_level_once(dev_crit, dev, fmt, ##__VA_ARGS__) +#define dev_err_once(dev, fmt, ...) \ + dev_level_once(dev_err, dev, fmt, ##__VA_ARGS__) +#define dev_warn_once(dev, fmt, ...) \ + dev_level_once(dev_warn, dev, fmt, ##__VA_ARGS__) +#define dev_notice_once(dev, fmt, ...) \ + dev_level_once(dev_notice, dev, fmt, ##__VA_ARGS__) +#define dev_info_once(dev, fmt, ...) \ + dev_level_once(dev_info, dev, fmt, ##__VA_ARGS__) +#define dev_dbg_once(dev, fmt, ...) \ + dev_level_once(dev_dbg, dev, fmt, ##__VA_ARGS__) +#endif /* NEED_DEV_PRINTK_ONCE */ + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + +/* NEED_TC_CLS_CAN_OFFLOAD_AND_CHAIN0 + * + * tc_cls_can_offload_and_chain0 was added by upstream commit + * 878db9f0f26d ("pkt_cls: add new tc cls helper to check offload flag and + * chain index"). + * + * This patch backports this function for older kernels by calling + * tc_can_offload() directly. + */ +#ifdef NEED_TC_CLS_CAN_OFFLOAD_AND_CHAIN0 +#include +static inline bool +tc_cls_can_offload_and_chain0(const struct net_device *dev, + struct tc_cls_common_offload *common) +{ + if (!tc_can_offload(dev)) + return false; + if (common->chain_index) + return false; + + return true; +} +#endif /* NEED_TC_CLS_CAN_OFFLOAD_AND_CHAIN0 */ +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + +/* NEED_TC_SETUP_QDISC_MQPRIO + * + * TC_SETUP_QDISC_MQPRIO was added by upstream commit + * 575ed7d39e2f ("net_sch: mqprio: Change TC_SETUP_MQPRIO to + * TC_SETUP_QDISC_MQPRIO"). 
+ * + * For older kernels which are using TC_SETUP_MQPRIO + */ +#ifdef NEED_TC_SETUP_QDISC_MQPRIO +#define TC_SETUP_QDISC_MQPRIO TC_SETUP_MQPRIO +#endif /* NEED_TC_SETUP_QDISC_MQPRIO */ + +/* + * ART/TSC functions + */ +#ifdef HAVE_PTP_CROSSTIMESTAMP +/* NEED_CONVERT_ART_NS_TO_TSC + * + * convert_art_ns_to_tsc was added by upstream commit fc804f65d462 ("x86/tsc: + * Convert ART in nanoseconds to TSC"). + * + * This function is similar to convert_art_to_tsc, but expects the input in + * terms of nanoseconds, rather than ART cycles. We implement this by + * accessing the tsc_khz value and performing the proper calculation. In order + * to access the correct clock object on returning, we use the function + * convert_art_to_tsc, because the art_related_clocksource is inaccessible. + */ +#ifdef NEED_CONVERT_ART_NS_TO_TSC +#ifdef CONFIG_X86 +#include + +static inline struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns) +{ + struct system_counterval_t system; + u64 tmp, res, rem; + + rem = do_div(art_ns, USEC_PER_SEC); + + res = art_ns * tsc_khz; + tmp = rem * tsc_khz; + + do_div(tmp, USEC_PER_SEC); + res += tmp; + + system = convert_art_to_tsc(art_ns); + system.cycles = res; + + return system; +} +#else /* CONFIG_X86 */ +static inline struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns) +{ + WARN_ONCE(1, "%s is only supported on X86", __func__); + return (struct system_counterval_t){}; +} +#endif /* !CONFIG_X86 */ +#endif /* NEED_CONVERT_ART_NS_TO_TSC */ +#endif /* HAVE_PTP_CROSSTIMESTAMP */ + +#endif /* _KCOMPAT_IMPL_H_ */ diff --git a/drivers/net/ethernet/intel/ice/kcompat_net_dim.c b/drivers/net/ethernet/intel/ice/kcompat_net_dim.c new file mode 100644 index 000000000000..244c87634aba --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat_net_dim.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + */ + +#include "kcompat.h" +#include "kcompat_dim.h" + +/* + * Net DIM profiles: + * There are different set of profiles for each CQ period mode. + * There are different set of profiles for RX/TX CQs. 
+ * Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES + */ +#define NET_DIM_PARAMS_NUM_PROFILES 5 +#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256 +#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128 +#define NET_DIM_DEF_PROFILE_CQE 1 +#define NET_DIM_DEF_PROFILE_EQE 1 + +#define NET_DIM_RX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ +} + +#define NET_DIM_RX_CQE_PROFILES { \ + {2, 256}, \ + {8, 128}, \ + {16, 64}, \ + {32, 64}, \ + {64, 64} \ +} + +#define NET_DIM_TX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {32, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE} \ +} + +#define NET_DIM_TX_CQE_PROFILES { \ + {5, 128}, \ + {8, 64}, \ + {16, 32}, \ + {32, 32}, \ + {64, 32} \ +} + +static const struct dim_cq_moder +rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_RX_EQE_PROFILES, + NET_DIM_RX_CQE_PROFILES, +}; + +static const struct dim_cq_moder +tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_TX_EQE_PROFILES, + NET_DIM_TX_CQE_PROFILES, +}; + +struct dim_cq_moder +net_dim_get_rx_moderation(u8 cq_period_mode, int ix) +{ + struct dim_cq_moder cq_moder = rx_profile[cq_period_mode][ix]; + + cq_moder.cq_period_mode = cq_period_mode; + return cq_moder; +} + +struct dim_cq_moder +net_dim_get_def_rx_moderation(u8 cq_period_mode) +{ + u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ? +#ifdef __CHECKER__ + /* cppcheck-suppress duplicateValueTernary */ +#endif /* __CHECKER__ */ + NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE; + + return net_dim_get_rx_moderation(cq_period_mode, profile_ix); +} + +struct dim_cq_moder +net_dim_get_tx_moderation(u8 cq_period_mode, int ix) +{ + struct dim_cq_moder cq_moder = tx_profile[cq_period_mode][ix]; + + cq_moder.cq_period_mode = cq_period_mode; + return cq_moder; +} + +struct dim_cq_moder +net_dim_get_def_tx_moderation(u8 cq_period_mode) +{ + u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ? +#ifdef __CHECKER__ + /* cppcheck-suppress duplicateValueTernary */ +#endif /* __CHECKER__ */ + NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE; + + return net_dim_get_tx_moderation(cq_period_mode, profile_ix); +} + +static int net_dim_step(struct dim *dim) +{ + if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2)) + return DIM_TOO_TIRED; + + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + case DIM_PARKING_TIRED: + break; + case DIM_GOING_RIGHT: + if (dim->profile_ix == (NET_DIM_PARAMS_NUM_PROFILES - 1)) + return DIM_ON_EDGE; + dim->profile_ix++; + dim->steps_right++; + break; + case DIM_GOING_LEFT: + if (dim->profile_ix == 0) + return DIM_ON_EDGE; + dim->profile_ix--; + dim->steps_left++; + break; + } + + dim->tired++; + return DIM_STEPPED; +} + +static void net_dim_exit_parking(struct dim *dim) +{ + dim->tune_state = dim->profile_ix ? DIM_GOING_LEFT : DIM_GOING_RIGHT; + net_dim_step(dim); +} + +static int net_dim_stats_compare(struct dim_stats *curr, + struct dim_stats *prev) +{ + if (!prev->bpms) + return curr->bpms ? 
DIM_STATS_BETTER : DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms)) + return (curr->bpms > prev->bpms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + if (!prev->ppms) + return curr->ppms ? DIM_STATS_BETTER : + DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms)) + return (curr->ppms > prev->ppms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + if (!prev->epms) + return DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms)) + return (curr->epms < prev->epms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + return DIM_STATS_SAME; +} + +static bool net_dim_decision(struct dim_stats *curr_stats, struct dim *dim) +{ + int prev_state = dim->tune_state; + int prev_ix = dim->profile_ix; + int stats_res; + int step_res; + + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + stats_res = net_dim_stats_compare(curr_stats, + &dim->prev_stats); + if (stats_res != DIM_STATS_SAME) + net_dim_exit_parking(dim); + break; + + case DIM_PARKING_TIRED: + dim->tired--; + if (!dim->tired) + net_dim_exit_parking(dim); + break; + + case DIM_GOING_RIGHT: + case DIM_GOING_LEFT: + stats_res = net_dim_stats_compare(curr_stats, + &dim->prev_stats); + if (stats_res != DIM_STATS_BETTER) + dim_turn(dim); + + if (dim_on_top(dim)) { + dim_park_on_top(dim); + break; + } + + step_res = net_dim_step(dim); + switch (step_res) { + case DIM_ON_EDGE: + dim_park_on_top(dim); + break; + case DIM_TOO_TIRED: + dim_park_tired(dim); + break; + } + + break; + } + + if (prev_state != DIM_PARKING_ON_TOP || + dim->tune_state != DIM_PARKING_ON_TOP) + dim->prev_stats = *curr_stats; + + return dim->profile_ix != prev_ix; +} + +void net_dim(struct dim *dim, struct dim_sample end_sample) +{ + struct dim_stats curr_stats; + u16 nevents; + + switch (dim->state) { + case DIM_MEASURE_IN_PROGRESS: + nevents = BIT_GAP(BITS_PER_TYPE(u16), + end_sample.event_ctr, + dim->start_sample.event_ctr); + if (nevents < DIM_NEVENTS) + break; + dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats); + if (net_dim_decision(&curr_stats, dim)) { + dim->state = DIM_APPLY_NEW_PROFILE; + schedule_work(&dim->work); + break; + } + /* fall through */ + case DIM_START_MEASURE: + dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr, + end_sample.byte_ctr, &dim->start_sample); + dim->state = DIM_MEASURE_IN_PROGRESS; + break; + case DIM_APPLY_NEW_PROFILE: + break; + } +} diff --git a/drivers/net/ethernet/intel/ice/kcompat_overflow.h b/drivers/net/ethernet/intel/ice/kcompat_overflow.h new file mode 100644 index 000000000000..5e80d6e01d4b --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat_overflow.h @@ -0,0 +1,319 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +#ifndef __LINUX_OVERFLOW_H +#define __LINUX_OVERFLOW_H + +#include + +/* + * In the fallback code below, we need to compute the minimum and + * maximum values representable in a given type. These macros may also + * be useful elsewhere, so we provide them outside the + * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block. + * + * It would seem more obvious to do something like + * + * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0) + * #define type_max(T) (T)(is_signed_type(T) ? 
((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0) + * + * Unfortunately, the middle expressions, strictly speaking, have + * undefined behaviour, and at least some versions of gcc warn about + * the type_max expression (but not if -fsanitize=undefined is in + * effect; in that case, the warning is deferred to runtime...). + * + * The slightly excessive casting in type_min is to make sure the + * macros also produce sensible values for the exotic type _Bool. [The + * overflow checkers only almost work for _Bool, but that's + * a-feature-not-a-bug, since people shouldn't be doing arithmetic on + * _Bools. Besides, the gcc builtins don't allow _Bool* as third + * argument.] + * + * Idea stolen from + * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - + * credit to Christian Biere. + */ +/* The is_signed_type macro is redefined in a few places in various kernel + * headers. If this header is included at the same time as one of those, we + * will generate compilation warnings. Since we can't fix every old kernel, + * rename is_signed_type for this file to _kc_is_signed_type. This prevents + * the macro name collision, and should be safe since our drivers do not + * directly call the macro. + */ +#define _kc_is_signed_type(type) (((type)(-1)) < (type)1) +#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - _kc_is_signed_type(type))) +#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) +#define type_min(T) ((T)((T)-type_max(T)-(T)1)) + + +#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW +/* + * For simplicity and code hygiene, the fallback code below insists on + * a, b and *d having the same type (similar to the min() and max() + * macros), whereas gcc's type-generic overflow checkers accept + * different types. Hence we don't just make check_add_overflow an + * alias for __builtin_add_overflow, but add type checks similar to + * below. + */ +#define check_add_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_add_overflow(__a, __b, __d); \ +}) + +#define check_sub_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_sub_overflow(__a, __b, __d); \ +}) + +#define check_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_mul_overflow(__a, __b, __d); \ +}) + +#else + + +/* Checking for unsigned overflow is relatively easy without causing UB. */ +#define __unsigned_add_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a + __b; \ + *__d < __a; \ +}) +#define __unsigned_sub_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a - __b; \ + __a < __b; \ +}) +/* + * If one of a or b is a compile-time constant, this avoids a division. + */ +#define __unsigned_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a * __b; \ + __builtin_constant_p(__b) ? 
\ + __b > 0 && __a > type_max(typeof(__a)) / __b : \ + __a > 0 && __b > type_max(typeof(__b)) / __a; \ +}) + +/* + * For signed types, detecting overflow is much harder, especially if + * we want to avoid UB. But the interface of these macros is such that + * we must provide a result in *d, and in fact we must produce the + * result promised by gcc's builtins, which is simply the possibly + * wrapped-around value. Fortunately, we can just formally do the + * operations in the widest relevant unsigned type (u64) and then + * truncate the result - gcc is smart enough to generate the same code + * with and without the (u64) casts. + */ + +/* + * Adding two signed integers can overflow only if they have the same + * sign, and overflow has happened iff the result has the opposite + * sign. + */ +#define __signed_add_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (u64)__a + (u64)__b; \ + (((~(__a ^ __b)) & (*__d ^ __a)) \ + & type_min(typeof(__a))) != 0; \ +}) + +/* + * Subtraction is similar, except that overflow can now happen only + * when the signs are opposite. In this case, overflow has happened if + * the result has the opposite sign of a. + */ +#define __signed_sub_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (u64)__a - (u64)__b; \ + ((((__a ^ __b)) & (*__d ^ __a)) \ + & type_min(typeof(__a))) != 0; \ +}) + +/* + * Signed multiplication is rather hard. gcc always follows C99, so + * division is truncated towards 0. This means that we can write the + * overflow check like this: + * + * (a > 0 && (b > MAX/a || b < MIN/a)) || + * (a < -1 && (b > MIN/a || b < MAX/a) || + * (a == -1 && b == MIN) + * + * The redundant casts of -1 are to silence an annoying -Wtype-limits + * (included in -Wextra) warning: When the type is u8 or u16, the + * __b_c_e in check_mul_overflow obviously selects + * __unsigned_mul_overflow, but unfortunately gcc still parses this + * code and warns about the limited range of __b. + */ + +#define __signed_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + typeof(a) __tmax = type_max(typeof(a)); \ + typeof(a) __tmin = type_min(typeof(a)); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (u64)__a * (u64)__b; \ + (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \ + (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \ + (__b == (typeof(__b))-1 && __a == __tmin); \ +}) + + +#define check_add_overflow(a, b, d) \ + __builtin_choose_expr(_kc_is_signed_type(typeof(a)), \ + __signed_add_overflow(a, b, d), \ + __unsigned_add_overflow(a, b, d)) + +#define check_sub_overflow(a, b, d) \ + __builtin_choose_expr(_kc_is_signed_type(typeof(a)), \ + __signed_sub_overflow(a, b, d), \ + __unsigned_sub_overflow(a, b, d)) + +#define check_mul_overflow(a, b, d) \ + __builtin_choose_expr(_kc_is_signed_type(typeof(a)), \ + __signed_mul_overflow(a, b, d), \ + __unsigned_mul_overflow(a, b, d)) + + +#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */ + +/** check_shl_overflow() - Calculate a left-shifted value and check overflow + * + * @a: Value to be shifted + * @s: How many bits left to shift + * @d: Pointer to where to store the result + * + * Computes *@d = (@a << @s) + * + * Returns true if '*d' cannot hold the result or when 'a << s' doesn't + * make sense. 
Example conditions: + * - 'a << s' causes bits to be lost when stored in *d. + * - 's' is garbage (e.g. negative) or so large that the result of + * 'a << s' is guaranteed to be 0. + * - 'a' is negative. + * - 'a << s' sets the sign bit, if any, in '*d'. + * + * '*d' will hold the results of the attempted shift, but is not + * considered "safe for use" if false is returned. + */ +#define check_shl_overflow(a, s, d) ({ \ + typeof(a) _a = a; \ + typeof(s) _s = s; \ + typeof(d) _d = d; \ + u64 _a_full = _a; \ + unsigned int _to_shift = \ + _s >= 0 && _s < 8 * sizeof(*d) ? _s : 0; \ + *_d = (_a_full << _to_shift); \ + (_to_shift != _s || *_d < 0 || _a < 0 || \ + (*_d >> _to_shift) != _a); \ +}) + +/** + * array_size() - Calculate size of 2-dimensional array. + * + * @a: dimension one + * @b: dimension two + * + * Calculates size of 2-dimensional array: @a * @b. + * + * Returns: number of bytes needed to represent the array or SIZE_MAX on + * overflow. + */ +static inline __must_check size_t array_size(size_t a, size_t b) +{ + size_t bytes; + + if (check_mul_overflow(a, b, &bytes)) + return SIZE_MAX; + + return bytes; +} + +/** + * array3_size() - Calculate size of 3-dimensional array. + * + * @a: dimension one + * @b: dimension two + * @c: dimension three + * + * Calculates size of 3-dimensional array: @a * @b * @c. + * + * Returns: number of bytes needed to represent the array or SIZE_MAX on + * overflow. + */ +static inline __must_check size_t array3_size(size_t a, size_t b, size_t c) +{ + size_t bytes; + + if (check_mul_overflow(a, b, &bytes)) + return SIZE_MAX; + if (check_mul_overflow(bytes, c, &bytes)) + return SIZE_MAX; + + return bytes; +} + +static inline __must_check size_t __ab_c_size(size_t n, size_t size, size_t c) +{ + size_t bytes; + + if (check_mul_overflow(n, size, &bytes)) + return SIZE_MAX; + if (check_add_overflow(bytes, c, &bytes)) + return SIZE_MAX; + + return bytes; +} + +/** + * struct_size() - Calculate size of structure with trailing array. + * @p: Pointer to the structure. + * @member: Name of the array member. + * @n: Number of elements in the array. + * + * Calculates size of memory needed for structure @p followed by an + * array of @n @member elements. + * + * Return: number of bytes needed or SIZE_MAX on overflow. + */ +#define struct_size(p, member, n) \ + __ab_c_size(n, \ + sizeof(*(p)->member) + __must_be_array((p)->member),\ + sizeof(*(p))) + +#endif /* __LINUX_OVERFLOW_H */ diff --git a/drivers/net/ethernet/intel/ice/kcompat_pldmfw.c b/drivers/net/ethernet/intel/ice/kcompat_pldmfw.c new file mode 100644 index 000000000000..181700796b4e --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat_pldmfw.c @@ -0,0 +1,1117 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2020, Intel Corporation. */ + +/* This is taken from upstream "lib/pldmfw/pldmfw.c" */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kcompat.h" +#include "kcompat_pldmfw.h" + +#if IS_ENABLED(CONFIG_PLDMFW) +#error "CONFIG_PLDMFW is enabled, use builtin lib/pldmfw instead." +#endif + +/* This section is taken from upstream "lib/pldmfw/pldmfw_private.h" */ + +/* The following data structures define the layout of a firmware binary + * following the "PLDM For Firmware Update Specification", DMTF standard + * #DSP0267. 
+ * + * pldmfw.c uses these structures to implement a simple engine that will parse + * a fw binary file in this format and perform a firmware update for a given + * device. + * + * Due to the variable sized data layout, alignment of fields within these + * structures is not guaranteed when reading. For this reason, all multi-byte + * field accesses should be done using the unaligned access macros. + * Additionally, the standard specifies that multi-byte fields are in + * LittleEndian format. + * + * The structure definitions are not made public, in order to keep direct + * accesses within code that is prepared to deal with the limitation of + * unaligned access. + */ + +/* UUID for PLDM firmware packages: f018878c-cb7d-4943-9800-a02f059aca02 */ +static const uuid_t pldm_firmware_header_id = + UUID_INIT(0xf018878c, 0xcb7d, 0x4943, + 0x98, 0x00, 0xa0, 0x2f, 0x05, 0x9a, 0xca, 0x02); + +/* Revision number of the PLDM header format this code supports */ +#define PACKAGE_HEADER_FORMAT_REVISION 0x01 + +/* timestamp104 structure defined in PLDM Base specification */ +#define PLDM_TIMESTAMP_SIZE 13 +struct __pldm_timestamp { + u8 b[PLDM_TIMESTAMP_SIZE]; +}; + +/* Package Header Information */ +struct __pldm_header { + uuid_t id; /* PackageHeaderIdentifier */ + u8 revision; /* PackageHeaderFormatRevision */ + __le16 size; /* PackageHeaderSize */ + struct __pldm_timestamp release_date; /* PackageReleaseDateTime */ + __le16 component_bitmap_len; /* ComponentBitmapBitLength */ + u8 version_type; /* PackageVersionStringType */ + u8 version_len; /* PackageVersionStringLength */ + + /* + * DSP0267 also includes the following variable length fields at the + * end of this structure: + * + * PackageVersionString, length is version_len. + * + * The total size of this section is + * sizeof(pldm_header) + version_len; + */ + u8 version_string[]; /* PackageVersionString */ +} __packed __aligned(1); + +/* Firmware Device ID Record */ +struct __pldmfw_record_info { + __le16 record_len; /* RecordLength */ + u8 descriptor_count; /* DescriptorCount */ + __le32 device_update_flags; /* DeviceUpdateOptionFlags */ + u8 version_type; /* ComponentImageSetVersionType */ + u8 version_len; /* ComponentImageSetVersionLength */ + __le16 package_data_len; /* FirmwareDevicePackageDataLength */ + + /* + * DSP0267 also includes the following variable length fields at the + * end of this structure: + * + * ApplicableComponents, length is component_bitmap_len from header + * ComponentImageSetVersionString, length is version_len + * RecordDescriptors, a series of TLVs with 16bit type and length + * FirmwareDevicePackageData, length is package_data_len + * + * The total size of each record is + * sizeof(pldmfw_record_info) + + * component_bitmap_len (converted to bytes!) 
+ + * version_len + + * + + * package_data_len + */ + u8 variable_record_data[]; +} __packed __aligned(1); + +/* Firmware Descriptor Definition */ +struct __pldmfw_desc_tlv { + __le16 type; /* DescriptorType */ + __le16 size; /* DescriptorSize */ + u8 data[]; /* DescriptorData */ +} __aligned(1); + +/* Firmware Device Identification Area */ +struct __pldmfw_record_area { + u8 record_count; /* DeviceIDRecordCount */ + /* This is not a struct type because the size of each record varies */ + u8 records[]; +} __aligned(1); + +/* Individual Component Image Information */ +struct __pldmfw_component_info { + __le16 classification; /* ComponentClassfication */ + __le16 identifier; /* ComponentIdentifier */ + __le32 comparison_stamp; /* ComponentComparisonStamp */ + __le16 options; /* componentOptions */ + __le16 activation_method; /* RequestedComponentActivationMethod */ + __le32 location_offset; /* ComponentLocationOffset */ + __le32 size; /* ComponentSize */ + u8 version_type; /* ComponentVersionStringType */ + u8 version_len; /* ComponentVersionStringLength */ + + /* + * DSP0267 also includes the following variable length fields at the + * end of this structure: + * + * ComponentVersionString, length is version_len + * + * The total size of this section is + * sizeof(pldmfw_component_info) + version_len; + */ + u8 version_string[]; /* ComponentVersionString */ +} __packed __aligned(1); + +/* Component Image Information Area */ +struct __pldmfw_component_area { + __le16 component_image_count; + /* This is not a struct type because the component size varies */ + u8 components[]; +} __aligned(1); + +/** + * pldm_first_desc_tlv + * @start: byte offset of the start of the descriptor TLVs + * + * Converts the starting offset of the descriptor TLVs into a pointer to the + * first descriptor. + */ +#define pldm_first_desc_tlv(start) \ + ((const struct __pldmfw_desc_tlv *)(start)) + +/** + * pldm_next_desc_tlv + * @desc: pointer to a descriptor TLV + * + * Finds the pointer to the next descriptor following a given descriptor + */ +#define pldm_next_desc_tlv(desc) \ + ((const struct __pldmfw_desc_tlv *)((desc)->data + \ + get_unaligned_le16(&(desc)->size))) + +/** + * pldm_for_each_desc_tlv + * @i: variable to store descriptor index + * @desc: variable to store descriptor pointer + * @start: byte offset of the start of the descriptors + * @count: the number of descriptors + * + * for loop macro to iterate over all of the descriptors of a given PLDM + * record. + */ +#define pldm_for_each_desc_tlv(i, desc, start, count) \ + for ((i) = 0, (desc) = pldm_first_desc_tlv(start); \ + (i) < (count); \ + (i)++, (desc) = pldm_next_desc_tlv(desc)) + +/** + * pldm_first_record + * @start: byte offset of the start of the PLDM records + * + * Converts a starting offset of the PLDM records into a pointer to the first + * record. + */ +#define pldm_first_record(start) \ + ((const struct __pldmfw_record_info *)(start)) + +/** + * pldm_next_record + * @record: pointer to a PLDM record + * + * Finds a pointer to the next record following a given record + */ +#define pldm_next_record(record) \ + ((const struct __pldmfw_record_info *) \ + ((const u8 *)(record) + get_unaligned_le16(&(record)->record_len))) + +/** + * pldm_for_each_record + * @i: variable to store record index + * @record: variable to store record pointer + * @start: byte offset of the start of the records + * @count: the number of records + * + * for loop macro to iterate over all of the records of a PLDM file. 
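+ *
+ * Example usage, mirroring pldm_parse_records() below:
+ *
+ *   const struct __pldmfw_record_info *record;
+ *   u8 i;
+ *
+ *   pldm_for_each_record(i, record, data->record_start, data->record_count) {
+ *           err = pldm_parse_one_record(data, record);
+ *           if (err)
+ *                   return err;
+ *   }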
+ */ +#define pldm_for_each_record(i, record, start, count) \ + for ((i) = 0, (record) = pldm_first_record(start); \ + (i) < (count); \ + (i)++, (record) = pldm_next_record(record)) + +/** + * pldm_first_component + * @start: byte offset of the start of the PLDM components + * + * Convert a starting offset of the PLDM components into a pointer to the + * first component + */ +#define pldm_first_component(start) \ + ((const struct __pldmfw_component_info *)(start)) + +/** + * pldm_next_component + * @component: pointer to a PLDM component + * + * Finds a pointer to the next component following a given component + */ +#define pldm_next_component(component) \ + ((const struct __pldmfw_component_info *)((component)->version_string + \ + (component)->version_len)) + +/** + * pldm_for_each_component + * @i: variable to store component index + * @component: variable to store component pointer + * @start: byte offset to the start of the first component + * @count: the number of components + * + * for loop macro to iterate over all of the components of a PLDM file. + */ +#define pldm_for_each_component(i, component, start, count) \ + for ((i) = 0, (component) = pldm_first_component(start); \ + (i) < (count); \ + (i)++, (component) = pldm_next_component(component)) + +/* End of lib/pldmfw/pldmfw_private.h */ + +/* Internal structure used to store details about the PLDM image file as it is + * being validated and processed. + */ +struct pldmfw_priv { + struct pldmfw *context; + const struct firmware *fw; + + /* current offset of firmware image */ + size_t offset; + + struct list_head records; + struct list_head components; + + /* PLDM Firmware Package Header */ + const struct __pldm_header *header; + u16 total_header_size; + + /* length of the component bitmap */ + u16 component_bitmap_len; + u16 bitmap_size; + + /* Start of the component image information */ + u16 component_count; + const u8 *component_start; + + /* Start pf the firmware device id records */ + const u8 *record_start; + u8 record_count; + + /* The CRC at the end of the package header */ + u32 header_crc; + + struct pldmfw_record *matching_record; +}; + +/** + * pldm_check_fw_space - Verify that the firmware image has space left + * @data: pointer to private data + * @offset: offset to start from + * @length: length to check for + * + * Verify that the firmware data can hold a chunk of bytes with the specified + * offset and length. + * + * Returns: zero on success, or -EFAULT if the image does not have enough + * space left to fit the expected length. + */ +static int +pldm_check_fw_space(struct pldmfw_priv *data, size_t offset, size_t length) +{ + size_t expected_size = offset + length; + struct device *dev = data->context->dev; + + if (data->fw->size < expected_size) { + dev_dbg(dev, "Firmware file size smaller than expected. Got %zu bytes, needed %zu bytes\n", + data->fw->size, expected_size); + return -EFAULT; + } + + return 0; +} + +/** + * pldm_move_fw_offset - Move the current firmware offset forward + * @data: pointer to private data + * @bytes_to_move: number of bytes to move the offset forward by + * + * Check that there is enough space past the current offset, and then move the + * offset forward by this ammount. + * + * Returns: zero on success, or -EFAULT if the image is too small to fit the + * expected length. 
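+ *
+ * The typical pattern in the parsers below is to reserve space for a
+ * structure before reading it, as pldm_parse_header() does:
+ *
+ *   err = pldm_move_fw_offset(data, sizeof(*header));
+ *   if (err)
+ *           return err;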
+ */ +static int +pldm_move_fw_offset(struct pldmfw_priv *data, size_t bytes_to_move) +{ + int err; + + err = pldm_check_fw_space(data, data->offset, bytes_to_move); + if (err) + return err; + + data->offset += bytes_to_move; + + return 0; +} + +/** + * pldm_parse_header - Validate and extract details about the PLDM header + * @data: pointer to private data + * + * Performs initial basic verification of the PLDM image, up to the first + * firmware record. + * + * This includes the following checks and extractions + * + * * Verify that the UUID at the start of the header matches the expected + * value as defined in the DSP0267 PLDM specification + * * Check that the revision is 0x01 + * * Extract the total header_size and verify that the image is large enough + * to contain at least the length of this header + * * Extract the size of the component bitmap length + * * Extract a pointer to the start of the record area + * + * Returns: zero on success, or a negative error code on failure. + */ +static int pldm_parse_header(struct pldmfw_priv *data) +{ + const struct __pldmfw_record_area *record_area; + struct device *dev = data->context->dev; + const struct __pldm_header *header; + size_t header_size; + int err; + + err = pldm_move_fw_offset(data, sizeof(*header)); + if (err) + return err; + + header = (const struct __pldm_header *)data->fw->data; + data->header = header; + + if (!uuid_equal(&header->id, &pldm_firmware_header_id)) { + dev_dbg(dev, "Invalid package header identifier. Expected UUID %pUB, but got %pUB\n", + &pldm_firmware_header_id, &header->id); + return -EINVAL; + } + + if (header->revision != PACKAGE_HEADER_FORMAT_REVISION) { + dev_dbg(dev, "Invalid package header revision. Expected revision %u but got %u\n", + PACKAGE_HEADER_FORMAT_REVISION, header->revision); + return -EOPNOTSUPP; + } + + data->total_header_size = get_unaligned_le16(&header->size); + header_size = data->total_header_size - sizeof(*header); + + err = pldm_check_fw_space(data, data->offset, header_size); + if (err) + return err; + + data->component_bitmap_len = + get_unaligned_le16(&header->component_bitmap_len); + + if (data->component_bitmap_len % 8 != 0) { + dev_dbg(dev, "Invalid component bitmap length. The length is %u, which is not a multiple of 8\n", + data->component_bitmap_len); + return -EINVAL; + } + + data->bitmap_size = data->component_bitmap_len / 8; + + err = pldm_move_fw_offset(data, header->version_len); + if (err) + return err; + + /* extract a pointer to the record area, which just follows the main + * PLDM header data. + */ + record_area = (const struct __pldmfw_record_area *)(data->fw->data + + data->offset); + + err = pldm_move_fw_offset(data, sizeof(*record_area)); + if (err) + return err; + + data->record_count = record_area->record_count; + data->record_start = record_area->records; + + return 0; +} + +/** + * pldm_check_desc_tlv_len - Check that the length matches expectation + * @data: pointer to image details + * @type: the descriptor type + * @size: the length from the descriptor header + * + * If the descriptor type is one of the documented descriptor types according + * to the standard, verify that the provided length matches. + * + * If the type is not recognized or is VENDOR_DEFINED, return zero. + * + * Returns: zero on success, or -EINVAL if the specified size of a standard + * TLV does not match the expected value defined for that TLV. 
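+ *
+ * For example, a PLDM_DESC_ID_PCI_VENDOR_ID descriptor must carry exactly
+ * two bytes of descriptor data, so (illustrative calls):
+ *
+ *   pldm_check_desc_tlv_len(data, PLDM_DESC_ID_PCI_VENDOR_ID, 2) == 0
+ *   pldm_check_desc_tlv_len(data, PLDM_DESC_ID_PCI_VENDOR_ID, 4) == -EINVAL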
+ */ +static int +pldm_check_desc_tlv_len(struct pldmfw_priv *data, u16 type, u16 size) +{ + struct device *dev = data->context->dev; + u16 expected_size; + + switch (type) { + case PLDM_DESC_ID_PCI_VENDOR_ID: + case PLDM_DESC_ID_PCI_DEVICE_ID: + case PLDM_DESC_ID_PCI_SUBVENDOR_ID: + case PLDM_DESC_ID_PCI_SUBDEV_ID: + expected_size = 2; + break; + case PLDM_DESC_ID_PCI_REVISION_ID: + expected_size = 1; + break; + case PLDM_DESC_ID_PNP_VENDOR_ID: + expected_size = 3; + break; + case PLDM_DESC_ID_IANA_ENTERPRISE_ID: + case PLDM_DESC_ID_ACPI_VENDOR_ID: + case PLDM_DESC_ID_PNP_PRODUCT_ID: + case PLDM_DESC_ID_ACPI_PRODUCT_ID: + expected_size = 4; + break; + case PLDM_DESC_ID_UUID: + expected_size = 16; + break; + case PLDM_DESC_ID_VENDOR_DEFINED: + return 0; + default: + /* Do not report an error on an unexpected TLV */ + dev_dbg(dev, "Found unrecognized TLV type 0x%04x\n", type); + return 0; + } + + if (size != expected_size) { + dev_dbg(dev, "Found TLV type 0x%04x with unexpected length. Got %u bytes, but expected %u bytes\n", + type, size, expected_size); + return -EINVAL; + } + + return 0; +} + +/** + * pldm_parse_desc_tlvs - Check and skip past a number of TLVs + * @data: pointer to private data + * @record: pointer to the record this TLV belongs too + * @desc_count: descriptor count + * + * From the current offset, read and extract the descriptor TLVs, updating the + * current offset each time. + * + * Returns: zero on success, or a negative error code on failure. + */ +static int +pldm_parse_desc_tlvs(struct pldmfw_priv *data, struct pldmfw_record *record, u8 desc_count) +{ + const struct __pldmfw_desc_tlv *__desc; + const u8 *desc_start; + u8 i; + + desc_start = data->fw->data + data->offset; + + pldm_for_each_desc_tlv(i, __desc, desc_start, desc_count) { + struct pldmfw_desc_tlv *desc; + int err; + u16 type, size; + + err = pldm_move_fw_offset(data, sizeof(*__desc)); + if (err) + return err; + + type = get_unaligned_le16(&__desc->type); + + /* According to DSP0267, this only includes the data field */ + size = get_unaligned_le16(&__desc->size); + + err = pldm_check_desc_tlv_len(data, type, size); + if (err) + return err; + + /* check that we have space and move the offset forward */ + err = pldm_move_fw_offset(data, size); + if (err) + return err; + + desc = (struct pldmfw_desc_tlv *)kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) + return -ENOMEM; + + desc->type = type; + desc->size = size; + desc->data = __desc->data; + + list_add_tail(&desc->entry, &record->descs); + } + + return 0; +} + +/** + * pldm_parse_one_record - Verify size of one PLDM record + * @data: pointer to image details + * @__record: pointer to the record to check + * + * This function checks that the record size does not exceed either the size + * of the firmware file or the total length specified in the header section. + * + * It also verifies that the recorded length of the start of the record + * matches the size calculated by adding the static structure length, the + * component bitmap length, the version string length, the length of all + * descriptor TLVs, and the length of the package data. + * + * Returns: zero on success, or a negative error code on failure. 
+ */ +static int +pldm_parse_one_record(struct pldmfw_priv *data, + const struct __pldmfw_record_info *__record) +{ + struct pldmfw_record *record; + size_t measured_length; + int err; + const u8 *bitmap_ptr; + u16 record_len; + int i; + + /* Make a copy and insert it into the record list */ + record = (struct pldmfw_record *)kzalloc(sizeof(*record), GFP_KERNEL); + if (!record) + return -ENOMEM; + + INIT_LIST_HEAD(&record->descs); + list_add_tail(&record->entry, &data->records); + + /* Then check that we have space and move the offset */ + err = pldm_move_fw_offset(data, sizeof(*__record)); + if (err) + return err; + + record_len = get_unaligned_le16(&__record->record_len); + record->package_data_len = get_unaligned_le16(&__record->package_data_len); + record->version_len = __record->version_len; + record->version_type = __record->version_type; + + bitmap_ptr = data->fw->data + data->offset; + + /* check that we have space for the component bitmap length */ + err = pldm_move_fw_offset(data, data->bitmap_size); + if (err) + return err; + + record->component_bitmap_len = data->component_bitmap_len; + record->component_bitmap = bitmap_zalloc(record->component_bitmap_len, + GFP_KERNEL); + if (!record->component_bitmap) + return -ENOMEM; + + for (i = 0; i < data->bitmap_size; i++) + bitmap_set_value8(record->component_bitmap, bitmap_ptr[i], i * 8); + + record->version_string = data->fw->data + data->offset; + + err = pldm_move_fw_offset(data, record->version_len); + if (err) + return err; + + /* Scan through the descriptor TLVs and find the end */ + err = pldm_parse_desc_tlvs(data, record, __record->descriptor_count); + if (err) + return err; + + record->package_data = data->fw->data + data->offset; + + err = pldm_move_fw_offset(data, record->package_data_len); + if (err) + return err; + + measured_length = data->offset - ((const u8 *)__record - data->fw->data); + if (measured_length != record_len) { + dev_dbg(data->context->dev, "Unexpected record length. Measured record length is %zu bytes, expected length is %u bytes\n", + measured_length, record_len); + return -EFAULT; + } + + return 0; +} + +/** + * pldm_parse_records - Locate the start of the component area + * @data: pointer to private data + * + * Extract the record count, and loop through each record, searching for the + * component area. + * + * Returns: zero on success, or a negative error code on failure. + */ +static int pldm_parse_records(struct pldmfw_priv *data) +{ + const struct __pldmfw_component_area *component_area; + const struct __pldmfw_record_info *record; + int err; + u8 i; + + pldm_for_each_record(i, record, data->record_start, data->record_count) { + err = pldm_parse_one_record(data, record); + if (err) + return err; + } + + /* Extract a pointer to the component area, which just follows the + * PLDM device record data. + */ + component_area = (const struct __pldmfw_component_area *)(data->fw->data + data->offset); + + err = pldm_move_fw_offset(data, sizeof(*component_area)); + if (err) + return err; + + data->component_count = + get_unaligned_le16(&component_area->component_image_count); + data->component_start = component_area->components; + + return 0; +} + +/** + * pldm_parse_components - Locate the CRC header checksum + * @data: pointer to private data + * + * Extract the component count, and find the pointer to the component area. + * Scan through each component searching for the end, which should point to + * the package header checksum. + * + * Extract the package header CRC and save it for verification. 
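The byte-for-byte copy of the applicable-component bitmap in pldm_parse_one_record() above can be hard to visualize. The following self-contained sketch (values and the example_ name invented for the illustration) decodes a two-byte bitmap the same way and shows which component indices end up set.

#include <linux/bitmap.h>
#include <linux/slab.h>

/* Hypothetical example: decode a 2-byte applicable-component bitmap the same
 * way pldm_parse_one_record() does, then test which components are selected.
 */
static int example_decode_component_bitmap(void)
{
        static const u8 raw[2] = { 0x05, 0x01 };        /* bits 0, 2 and 8 */
        unsigned long *bitmap;
        int i;

        bitmap = bitmap_zalloc(16, GFP_KERNEL);
        if (!bitmap)
                return -ENOMEM;

        for (i = 0; i < 2; i++)
                bitmap_set_value8(bitmap, raw[i], i * 8);

        /* test_bit(0, bitmap), test_bit(2, bitmap) and test_bit(8, bitmap)
         * are now true; every other bit is clear.
         */
        bitmap_free(bitmap);
        return 0;
}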
+ * + * Returns: zero on success, or a negative error code on failure. + */ +static int pldm_parse_components(struct pldmfw_priv *data) +{ + const struct __pldmfw_component_info *__component; + struct device *dev = data->context->dev; + const u8 *header_crc_ptr; + int err; + u8 i; + + pldm_for_each_component(i, __component, data->component_start, data->component_count) { + struct pldmfw_component *component; + u32 offset, size; + + err = pldm_move_fw_offset(data, sizeof(*__component)); + if (err) + return err; + + err = pldm_move_fw_offset(data, __component->version_len); + if (err) + return err; + + offset = get_unaligned_le32(&__component->location_offset); + size = get_unaligned_le32(&__component->size); + + err = pldm_check_fw_space(data, offset, size); + if (err) + return err; + + component = (struct pldmfw_component *)kzalloc(sizeof(*component), GFP_KERNEL); + if (!component) + return -ENOMEM; + + component->index = i; + component->classification = get_unaligned_le16(&__component->classification); + component->identifier = get_unaligned_le16(&__component->identifier); + component->comparison_stamp = get_unaligned_le32(&__component->comparison_stamp); + component->options = get_unaligned_le16(&__component->options); + component->activation_method = get_unaligned_le16(&__component->activation_method); + component->version_type = __component->version_type; + component->version_len = __component->version_len; + component->version_string = __component->version_string; + component->component_data = data->fw->data + offset; + component->component_size = size; + + list_add_tail(&component->entry, &data->components); + } + + header_crc_ptr = data->fw->data + data->offset; + + err = pldm_move_fw_offset(data, sizeof(data->header_crc)); + if (err) + return err; + + /* Make sure that we reached the expected offset */ + if (data->offset != data->total_header_size) { + dev_dbg(dev, "Invalid firmware header size. Expected %u but got %zu\n", + data->total_header_size, data->offset); + return -EFAULT; + } + + data->header_crc = get_unaligned_le32(header_crc_ptr); + + return 0; +} + +/** + * pldm_verify_header_crc - Verify that the CRC in the header matches + * @data: pointer to private data + * + * Calculates the 32-bit CRC using the standard IEEE 802.3 CRC polynomial and + * compares it to the value stored in the header. + * + * Returns: zero on success if the CRC matches, or -EBADMSG on an invalid CRC. + */ +static int pldm_verify_header_crc(struct pldmfw_priv *data) +{ + struct device *dev = data->context->dev; + u32 calculated_crc; + size_t length; + + /* Calculate the 32-bit CRC of the header header contents up to but + * not including the checksum. Note that the Linux crc32_le function + * does not perform an expected final XOR. + */ + length = data->offset - sizeof(data->header_crc); + calculated_crc = crc32_le(~0, data->fw->data, length) ^ ~0; + + if (calculated_crc != data->header_crc) { + dev_dbg(dev, "Invalid CRC in firmware header. Got 0x%08x but expected 0x%08x\n", + calculated_crc, data->header_crc); + return -EBADMSG; + } + + return 0; +} + +/** + * pldmfw_free_priv - Free memory allocated while parsing the PLDM image + * @data: pointer to the PLDM data structure + * + * Loops through and clears all allocated memory associated with each + * allocated descriptor, record, and component. 
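For reference, the checksum that pldm_verify_header_crc() computes can be reproduced in user space when debugging a rejected image. The sketch below relies on zlib's crc32(), which uses the same reflected IEEE 802.3 polynomial and applies internally the ~0 seed and final XOR that the kernel call spells out explicitly, so the two results should agree; the example_ name and the caller-supplied offset are assumptions of the sketch.

/* User-space sketch, not part of the driver: recompute the PLDM package
 * header checksum of an image already loaded into memory. Build with -lz.
 */
#include <stdint.h>
#include <stdio.h>
#include <zlib.h>

/* crc_offset is the offset of the 4-byte header checksum field, i.e.
 * total_header_size - 4 in the driver's terms.
 */
static int example_check_header_crc(const uint8_t *image, size_t crc_offset)
{
        uint32_t stored, calculated;

        /* the stored checksum is little endian, like the rest of the header */
        stored = (uint32_t)image[crc_offset] |
                 ((uint32_t)image[crc_offset + 1] << 8) |
                 ((uint32_t)image[crc_offset + 2] << 16) |
                 ((uint32_t)image[crc_offset + 3] << 24);

        calculated = (uint32_t)crc32(0L, image, (uInt)crc_offset);

        if (calculated != stored) {
                fprintf(stderr, "header CRC mismatch: got 0x%08x, expected 0x%08x\n",
                        calculated, stored);
                return -1;
        }

        return 0;
}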
+ */ +static void pldmfw_free_priv(struct pldmfw_priv *data) +{ + struct pldmfw_component *component, *c_safe; + struct pldmfw_record *record, *r_safe; + struct pldmfw_desc_tlv *desc, *d_safe; + + list_for_each_entry_safe(component, c_safe, &data->components, entry) { + list_del(&component->entry); + kfree(component); + } + + list_for_each_entry_safe(record, r_safe, &data->records, entry) { + list_for_each_entry_safe(desc, d_safe, &record->descs, entry) { + list_del(&desc->entry); + kfree(desc); + } + + if (record->component_bitmap) { + bitmap_free(record->component_bitmap); + record->component_bitmap = NULL; + } + + list_del(&record->entry); + kfree(record); + } +} + +/** + * pldm_parse_image - parse and extract details from PLDM image + * @data: pointer to private data + * + * Verify that the firmware file contains valid data for a PLDM firmware + * file. Extract useful pointers and data from the firmware file and store + * them in the data structure. + * + * The PLDM firmware file format is defined in DMTF DSP0267 1.0.0. Care + * should be taken to use get_unaligned_le* when accessing data from the + * pointers in data. + * + * Returns: zero on success, or a negative error code on failure. + */ +static int pldm_parse_image(struct pldmfw_priv *data) +{ + int err; + + if (WARN_ON(!(data->context->dev && data->fw->data && data->fw->size))) + return -EINVAL; + + err = pldm_parse_header(data); + if (err) + return err; + + err = pldm_parse_records(data); + if (err) + return err; + + err = pldm_parse_components(data); + if (err) + return err; + + return pldm_verify_header_crc(data); +} + +/* these are u32 so that we can store PCI_ANY_ID */ +struct pldm_pci_record_id { + int vendor; + int device; + int subsystem_vendor; + int subsystem_device; +}; + +/** + * pldmfw_op_pci_match_record - Check if a PCI device matches the record + * @context: PLDM fw update structure + * @record: list of records extracted from the PLDM image + * + * Determine of the PCI device associated with this device matches the record + * data provided. + * + * Searches the descriptor TLVs and extracts the relevant descriptor data into + * a pldm_pci_record_id. This is then compared against the PCI device ID + * information. + * + * Returns: true if the device matches the record, false otherwise. + */ +bool pldmfw_op_pci_match_record(struct pldmfw *context, struct pldmfw_record *record) +{ + struct pci_dev *pdev = to_pci_dev(context->dev); + struct pldm_pci_record_id id = { + .vendor = PCI_ANY_ID, + .device = PCI_ANY_ID, + .subsystem_vendor = PCI_ANY_ID, + .subsystem_device = PCI_ANY_ID, + }; + struct pldmfw_desc_tlv *desc; + + list_for_each_entry(desc, &record->descs, entry) { + u16 value; + int *ptr; + + switch (desc->type) { + case PLDM_DESC_ID_PCI_VENDOR_ID: + ptr = &id.vendor; + break; + case PLDM_DESC_ID_PCI_DEVICE_ID: + ptr = &id.device; + break; + case PLDM_DESC_ID_PCI_SUBVENDOR_ID: + ptr = &id.subsystem_vendor; + break; + case PLDM_DESC_ID_PCI_SUBDEV_ID: + ptr = &id.subsystem_device; + break; + default: + /* Skip unrelated TLVs */ + continue; + } + + value = get_unaligned_le16(desc->data); + /* A value of zero for one of the descriptors is sometimes + * used when the record should ignore this field when matching + * device. For example if the record applies to any subsystem + * device or vendor. 
+ */ + if (value) + *ptr = (int)value; + else + *ptr = PCI_ANY_ID; + } + + if ((id.vendor == PCI_ANY_ID || id.vendor == pdev->vendor) && + (id.device == PCI_ANY_ID || id.device == pdev->device) && + (id.subsystem_vendor == PCI_ANY_ID || id.subsystem_vendor == pdev->subsystem_vendor) && + (id.subsystem_device == PCI_ANY_ID || id.subsystem_device == pdev->subsystem_device)) + return true; + else + return false; +} + +/** + * pldm_find_matching_record - Find the first matching PLDM record + * @data: pointer to private data + * + * Search through PLDM records and find the first matching entry. It is + * expected that only one entry matches. + * + * Store a pointer to the matching record, if found. + * + * Returns: zero on success, or -ENOENT if no matching record is found. + */ +static int pldm_find_matching_record(struct pldmfw_priv *data) +{ + struct pldmfw_record *record; + + list_for_each_entry(record, &data->records, entry) { + if (data->context->ops->match_record(data->context, record)) { + data->matching_record = record; + return 0; + } + } + + return -ENOENT; +} + +/** + * pldm_send_package_data - Send firmware the package data for the record + * @data: pointer to private data + * + * Send the package data associated with the matching record to the firmware, + * using the send_pkg_data operation. + * + * Returns: zero on success, or a negative error code on failure. + */ +static int +pldm_send_package_data(struct pldmfw_priv *data) +{ + struct pldmfw_record *record = data->matching_record; + const struct pldmfw_ops *ops = data->context->ops; + + return ops->send_package_data(data->context, record->package_data, + record->package_data_len); +} + +/** + * pldm_send_component_tables - Send component table information to firmware + * @data: pointer to private data + * + * Loop over each component, sending the applicable components to the firmware + * via the send_component_table operation. + * + * Returns: zero on success, or a negative error code on failure. + */ +static int +pldm_send_component_tables(struct pldmfw_priv *data) +{ + unsigned long *bitmap = data->matching_record->component_bitmap; + struct pldmfw_component *component; + int err; + + list_for_each_entry(component, &data->components, entry) { + u8 index = component->index, transfer_flag = 0; + + /* Skip components which are not intended for this device */ + if (!test_bit(index, bitmap)) + continue; + + /* determine whether this is the start, middle, end, or both + * the start and end of the component tables + */ + if (index == find_first_bit(bitmap, data->component_bitmap_len)) + transfer_flag |= PLDM_TRANSFER_FLAG_START; + if (index == find_last_bit(bitmap, data->component_bitmap_len)) + transfer_flag |= PLDM_TRANSFER_FLAG_END; + if (!transfer_flag) + transfer_flag = PLDM_TRANSFER_FLAG_MIDDLE; + + err = data->context->ops->send_component_table(data->context, + component, + transfer_flag); + if (err) + return err; + } + + return 0; +} + +/** + * pldm_flash_components - Program each component to device flash + * @data: pointer to private data + * + * Loop through each component that is active for the matching device record, + * and send it to the device driver for flashing. + * + * Returns: zero on success, or a negative error code on failure. 
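Taken together, the helpers above are driven by a device driver through the ops table declared in kcompat_pldmfw.h, which is added below. A hedged sketch of such a consumer, with every example_ name invented and the device-specific work stubbed out:

#include <linux/device.h>
#include <linux/firmware.h>

#include "kcompat_pldmfw.h"     /* or the in-kernel <linux/pldmfw.h> where available */

struct example_priv {
        struct pldmfw context;
        /* device-private state used by the callbacks would live here */
};

static int example_send_package_data(struct pldmfw *ctx, const u8 *data,
                                     u16 length)
{
        /* forward the matching record's package data to device firmware */
        return 0;
}

static int example_send_component_table(struct pldmfw *ctx,
                                        struct pldmfw_component *component,
                                        u8 transfer_flag)
{
        /* transfer_flag is PLDM_TRANSFER_FLAG_START/MIDDLE/END, as set by
         * pldm_send_component_tables() above
         */
        return 0;
}

static int example_flash_component(struct pldmfw *ctx,
                                   struct pldmfw_component *component)
{
        /* write component->component_size bytes from component->component_data */
        return 0;
}

static const struct pldmfw_ops example_ops = {
        .match_record           = pldmfw_op_pci_match_record,
        .send_package_data      = example_send_package_data,
        .send_component_table   = example_send_component_table,
        .flash_component        = example_flash_component,
        /* .finalize_update is optional and omitted here */
};

static int example_flash(struct example_priv *priv, struct device *dev)
{
        const struct firmware *fw;
        int err;

        priv->context.ops = &example_ops;
        priv->context.dev = dev;

        err = request_firmware(&fw, "example-update.pldm", dev);
        if (err)
                return err;

        err = pldmfw_flash_image(&priv->context, fw);

        release_firmware(fw);
        return err;
}

Because pldmfw_op_pci_match_record() is reused as the stock matcher, only the data-transfer callbacks need device-specific code.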
+ */ +static int pldm_flash_components(struct pldmfw_priv *data) +{ + unsigned long *bitmap = data->matching_record->component_bitmap; + struct pldmfw_component *component; + int err; + + list_for_each_entry(component, &data->components, entry) { + u8 index = component->index; + + /* Skip components which are not intended for this device */ + if (!test_bit(index, bitmap)) + continue; + + err = data->context->ops->flash_component(data->context, component); + if (err) + return err; + } + + return 0; +} + +/** + * pldm_finalize_update - Finalize the device flash update + * @data: pointer to private data + * + * Tell the device driver to perform any remaining logic to complete the + * device update. + * + * Returns: zero on success, or a PLFM_FWU error indicating the reason for + * failure. + */ +static int pldm_finalize_update(struct pldmfw_priv *data) +{ + if (data->context->ops->finalize_update) + return data->context->ops->finalize_update(data->context); + + return 0; +} + +/** + * pldmfw_flash_image - Write a PLDM-formatted firmware image to the device + * @context: ops and data for firmware update + * @fw: firmware object pointing to the relevant firmware file to program + * + * Parse the data for a given firmware file, verifying that it is a valid PLDM + * formatted image that matches this device. + * + * Extract the device record Package Data and Component Tables and send them + * to the device firmware. Extract and write the flash data for each of the + * components indicated in the firmware file. + * + * Returns: zero on success, or a negative error code on failure. + */ +int pldmfw_flash_image(struct pldmfw *context, const struct firmware *fw) +{ + struct pldmfw_priv *data; + int err; + + data = (struct pldmfw_priv *)kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + INIT_LIST_HEAD(&data->records); + INIT_LIST_HEAD(&data->components); + + data->fw = fw; + data->context = context; + + err = pldm_parse_image(data); + if (err) + goto out_release_data; + + err = pldm_find_matching_record(data); + if (err) + goto out_release_data; + + err = pldm_send_package_data(data); + if (err) + goto out_release_data; + + err = pldm_send_component_tables(data); + if (err) + goto out_release_data; + + err = pldm_flash_components(data); + if (err) + goto out_release_data; + + err = pldm_finalize_update(data); + +out_release_data: + pldmfw_free_priv(data); + kfree(data); + + return err; +} diff --git a/drivers/net/ethernet/intel/ice/kcompat_pldmfw.h b/drivers/net/ethernet/intel/ice/kcompat_pldmfw.h new file mode 100644 index 000000000000..fa425cd7ec70 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat_pldmfw.h @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2020, Intel Corporation. 
*/ + +/* This is taken from upstream */ + +#ifndef _KCOMPAT_PLDMFW_H_ +#define _KCOMPAT_PLDMFW_H_ + +#ifdef _PLDMFW_H_ +#error "Do not include both kcompat_pldmfw.h and " +#endif + +#if IS_ENABLED(CONFIG_PLDMFW) +#error "CONFIG_PLDMFW is enabled, use " +#endif + +#include +#include + +#define PLDM_DEVICE_UPDATE_CONTINUE_AFTER_FAIL BIT(0) + +#define PLDM_STRING_TYPE_UNKNOWN 0 +#define PLDM_STRING_TYPE_ASCII 1 +#define PLDM_STRING_TYPE_UTF8 2 +#define PLDM_STRING_TYPE_UTF16 3 +#define PLDM_STRING_TYPE_UTF16LE 4 +#define PLDM_STRING_TYPE_UTF16BE 5 + +struct pldmfw_record { + struct list_head entry; + + /* List of descriptor TLVs */ + struct list_head descs; + + /* Component Set version string*/ + const u8 *version_string; + u8 version_type; + u8 version_len; + + /* Package Data length */ + u16 package_data_len; + + /* Bitfield of Device Update Flags */ + u32 device_update_flags; + + /* Package Data block */ + const u8 *package_data; + + /* Bitmap of components applicable to this record */ + unsigned long *component_bitmap; + u16 component_bitmap_len; +}; + +/* Standard descriptor TLV identifiers */ +#define PLDM_DESC_ID_PCI_VENDOR_ID 0x0000 +#define PLDM_DESC_ID_IANA_ENTERPRISE_ID 0x0001 +#define PLDM_DESC_ID_UUID 0x0002 +#define PLDM_DESC_ID_PNP_VENDOR_ID 0x0003 +#define PLDM_DESC_ID_ACPI_VENDOR_ID 0x0004 +#define PLDM_DESC_ID_PCI_DEVICE_ID 0x0100 +#define PLDM_DESC_ID_PCI_SUBVENDOR_ID 0x0101 +#define PLDM_DESC_ID_PCI_SUBDEV_ID 0x0102 +#define PLDM_DESC_ID_PCI_REVISION_ID 0x0103 +#define PLDM_DESC_ID_PNP_PRODUCT_ID 0x0104 +#define PLDM_DESC_ID_ACPI_PRODUCT_ID 0x0105 +#define PLDM_DESC_ID_VENDOR_DEFINED 0xFFFF + +struct pldmfw_desc_tlv { + struct list_head entry; + + const u8 *data; + u16 type; + u16 size; +}; + +#define PLDM_CLASSIFICATION_UNKNOWN 0x0000 +#define PLDM_CLASSIFICATION_OTHER 0x0001 +#define PLDM_CLASSIFICATION_DRIVER 0x0002 +#define PLDM_CLASSIFICATION_CONFIG_SW 0x0003 +#define PLDM_CLASSIFICATION_APP_SW 0x0004 +#define PLDM_CLASSIFICATION_INSTRUMENTATION 0x0005 +#define PLDM_CLASSIFICATION_BIOS 0x0006 +#define PLDM_CLASSIFICATION_DIAGNOSTIC_SW 0x0007 +#define PLDM_CLASSIFICATION_OS 0x0008 +#define PLDM_CLASSIFICATION_MIDDLEWARE 0x0009 +#define PLDM_CLASSIFICATION_FIRMWARE 0x000A +#define PLDM_CLASSIFICATION_CODE 0x000B +#define PLDM_CLASSIFICATION_SERVICE_PACK 0x000C +#define PLDM_CLASSIFICATION_SOFTWARE_BUNDLE 0x000D + +#define PLDM_ACTIVATION_METHOD_AUTO BIT(0) +#define PLDM_ACTIVATION_METHOD_SELF_CONTAINED BIT(1) +#define PLDM_ACTIVATION_METHOD_MEDIUM_SPECIFIC BIT(2) +#define PLDM_ACTIVATION_METHOD_REBOOT BIT(3) +#define PLDM_ACTIVATION_METHOD_DC_CYCLE BIT(4) +#define PLDM_ACTIVATION_METHOD_AC_CYCLE BIT(5) + +#define PLDMFW_COMPONENT_OPTION_FORCE_UPDATE BIT(0) +#define PLDMFW_COMPONENT_OPTION_USE_COMPARISON_STAMP BIT(1) + +struct pldmfw_component { + struct list_head entry; + + /* component identifier */ + u16 classification; + u16 identifier; + + u16 options; + u16 activation_method; + + u32 comparison_stamp; + + u32 component_size; + const u8 *component_data; + + /* Component version string */ + const u8 *version_string; + u8 version_type; + u8 version_len; + + /* component index */ + u8 index; + +}; + +/* Transfer flag used for sending components to the firmware */ +#define PLDM_TRANSFER_FLAG_START BIT(0) +#define PLDM_TRANSFER_FLAG_MIDDLE BIT(1) +#define PLDM_TRANSFER_FLAG_END BIT(2) + +struct pldmfw_ops; + +/* Main entry point to the PLDM firmware update engine. 
Device drivers + * should embed this in a private structure and use container_of to obtain + * a pointer to their own data, used to implement the device specific + * operations. + */ +struct pldmfw { + const struct pldmfw_ops *ops; + struct device *dev; +}; + +bool pldmfw_op_pci_match_record(struct pldmfw *context, struct pldmfw_record *record); + +/* Operations invoked by the generic PLDM firmware update engine. Used to + * implement device specific logic. + * + * @match_record: check if the device matches the given record. For + * convenience, a standard implementation is provided for PCI devices. + * + * @send_package_data: send the package data associated with the matching + * record to firmware. + * + * @send_component_table: send the component data associated with a given + * component to firmware. Called once for each applicable component. + * + * @flash_component: Flash the data for a given component to the device. + * Called once for each applicable component, after all component tables have + * been sent. + * + * @finalize_update: (optional) Finish the update. Called after all components + * have been flashed. + */ +struct pldmfw_ops { + bool (*match_record)(struct pldmfw *context, struct pldmfw_record *record); + int (*send_package_data)(struct pldmfw *context, const u8 *data, u16 length); + int (*send_component_table)(struct pldmfw *context, struct pldmfw_component *component, + u8 transfer_flag); + int (*flash_component)(struct pldmfw *context, struct pldmfw_component *component); + int (*finalize_update)(struct pldmfw *context); +}; + +int pldmfw_flash_image(struct pldmfw *context, const struct firmware *fw); + +#endif diff --git a/drivers/net/ethernet/intel/ice/kcompat_rhel_defs.h b/drivers/net/ethernet/intel/ice/kcompat_rhel_defs.h new file mode 100644 index 000000000000..e5ddbe16e6a0 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat_rhel_defs.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _KCOMPAT_RHEL_DEFS_H_ +#define _KCOMPAT_RHEL_DEFS_H_ + +/* This is the RedHat Enterprise Linux distribution specific definitions file. + * It defines what features need backports for a given version of the RHEL + * kernel. + * + * It checks the RHEL_RELEASE_CODE and RHEL_RELEASE_VERSION macros to decide + * what support the target kernel has. + * + * It assumes that kcompat_std_defs.h has already been processed, and will + * #define or #undef any flags that have changed based on backports done by + * RHEL. 
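For readers who have not worked with the RHEL compatibility macros: RHEL_RELEASE_CODE is a single integer provided by RHEL's linux/version.h, and RHEL_RELEASE_VERSION(a,b) packs a major/minor pair into the same encoding (on current RHEL kernels this is (((a) << 8) + (b))), so the gates below reduce to plain integer comparisons. A hypothetical gate in the same style as this file, with an invented flag name:

/* Hypothetical example in the style used throughout this file: RHEL 8.2
 * backported the feature, so drop the NEED_ flag that kcompat_std_defs.h
 * set for the base kernel version.
 */
#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,2))
#else /* >= 8.2 */
#undef NEED_EXAMPLE_BACKPORT    /* invented flag name, illustration only */
#endif /* 8.2 */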
+ */ + +#if !RHEL_RELEASE_CODE +#error "RHEL_RELEASE_CODE is 0 or undefined" +#endif + +#ifndef RHEL_RELEASE_VERSION +#error "RHEL_RELEASE_VERSION is undefined" +#endif + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,3)) +#else /* >= 7.3 */ +#undef NEED_DEV_PRINTK_ONCE +#endif /* 7.3 */ + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,5)) +#else /* >= 7.5 */ +#define HAVE_TCF_EXTS_TO_LIST +#endif /* 7.5 */ + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,6)) +#else /* >= 7.6 */ +#undef NEED_TC_CLS_CAN_OFFLOAD_AND_CHAIN0 +#undef NEED_TC_SETUP_QDISC_MQPRIO +#endif /* 7.6 */ + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,7)) +#else /* >= 7.7 */ +#define HAVE_DEVLINK_PORT_ATTRS_SET_PORT_FLAVOUR +#endif /* 7.7 */ + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,0)) +#else /* >= 8.0 */ +#undef HAVE_TCF_EXTS_TO_LIST +#define HAVE_TCF_EXTS_FOR_EACH_ACTION +#endif /* 7.5 */ + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,1)) +#define NEED_IDA_ALLOC_MIN_MAX_RANGE_FREE +#else /* >= 8.1 */ +#undef NEED_IDA_ALLOC_MIN_MAX_RANGE_FREE +#endif /* 8.1 */ + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,2)) +#else /* >= 8.2 */ +#undef NEED_DEVLINK_FLASH_UPDATE_STATUS_NOTIFY +#undef NEED_SKB_FRAG_OFF +#undef NEED_SKB_FRAG_OFF_ADD +#undef NEED_FLOW_INDR_BLOCK_CB_REGISTER +#define HAVE_DEVLINK_PORT_ATTRS_SET_SWITCH_ID +#endif /* 8.2 */ + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,4)) +#else /* >= 8.4 */ +#undef NEED_DEVLINK_PORT_ATTRS_SET_STRUCT +#undef NEED_NET_PREFETCH +#undef NEED_DEVLINK_FLASH_UPDATE_TIMEOUT_NOTIFY +#undef HAVE_XDP_QUERY_PROG +#endif /* 8.4 */ + +#endif /* _KCOMPAT_RHEL_DEFS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/kcompat_sles_defs.h b/drivers/net/ethernet/intel/ice/kcompat_sles_defs.h new file mode 100644 index 000000000000..5ee6563993d5 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat_sles_defs.h @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _KCOMPAT_SLES_DEFS_H_ +#define _KCOMPAT_SLES_DEFS_H_ + +/* This is the SUSE Linux Enterprise distribution specific definitions file. + * It defines what features need backports for a given version of the SUSE + * Linux Enterprise kernel. + * + * It checks a combination of the LINUX_VERSION code and the + * SLE_LOCALVERSION_CODE to determine what support the kernel has. + * + * It assumes that kcompat_std_defs.h has already been processed, and will + * #define or #undef any flags that have changed based on backports done by + * SUSE. 
+ */ + +#ifndef LINUX_VERSION_CODE +#error "LINUX_VERSION_CODE is undefined" +#endif + +#ifndef KERNEL_VERSION +#error "KERNEL_VERSION is undefined" +#endif + +#if !SLE_KERNEL_REVISION +#error "SLE_KERNEL_REVISION is 0 or undefined" +#endif + +#if SLE_KERNEL_REVISION > 65535 +#error "SLE_KERNEL_REVISION is unexpectedly large" +#endif + +/* SLE kernel versions are a combination of the LINUX_VERSION_CODE along with + * an extra digit that indicates the SUSE specific revision of that kernel. + * This value is found in the CONFIG_LOCALVERSION of the SUSE kernel, which is + * extracted by common.mk and placed into SLE_KERNEL_REVISION_CODE. + * + * We combine the value of SLE_KERNEL_REVISION along with the LINUX_VERSION code + * to generate the useful value that determines what specific kernel we're + * dealing with. + * + * Just in case the SLE_KERNEL_REVISION ever goes above 255, we reserve 16 bits + * instead of 8 for this value. + */ +#define SLE_KERNEL_CODE ((LINUX_VERSION_CODE << 16) + SLE_KERNEL_REVISION) +#define SLE_KERNEL_VERSION(a,b,c,d) ((KERNEL_VERSION(a,b,c) << 16) + (d)) + +/* Unlike RHEL, SUSE kernels are not always tied to a single service pack. For + * example, 4.12.14 was used as the base for SLE 15 SP1, SLE 12 SP4, and SLE 12 + * SP5. + * + * You can find the patches that SUSE applied to the kernel tree at + * https://github.com/SUSE/kernel-source. + * + * You can find the correct kernel version for a check by using steps similar + * to the following + * + * 1) download the kernel-source repo + * 2) checkout the relevant branch, i.e SLE15-SP3 + * 3) find the relevant backport you're interested in the patches.suse + * directory + * 4) git log to locate the commit that introduced the backport + * 5) git describe --contains to find the relevant tag that includes that + * commit, i.e. rpm-5.3.18-37 + * 6) those digits represent the SLE kernel that introduced that backport. + * + * Try to keep the checks in SLE_KERNEL_CODE order and condense where + * possible. + */ + +/*****************************************************************************/ +#if (SLE_KERNEL_CODE > SLE_KERNEL_VERSION(4,12,14,23) && \ + SLE_KERNEL_CODE < SLE_KERNEL_VERSION(4,12,14,94)) +/* + * 4.12.14 is used as the base for SLE 12 SP4, SLE 12 SP5, SLE 15, and SLE 15 + * SP1. Unfortunately the revision codes do not line up cleanly. SLE 15 + * launched with 4.12.14-23. It appears that SLE 12 SP4 and SLE 15 SP1 both + * diverged from this point, with SLE 12 SP4 kernels starting around + * 4.12.14-94. A few backports for SLE 15 SP1 landed in some alpha and beta + * kernels tagged between 4.12.14-25 up to 4.12.14-32. These changes did not + * make it into SLE 12 SP4. This was cleaned up with SLE 12 SP5 by an apparent + * merge in 4.12.14-111. The official launch of SLE 15 SP1 ended up with + * version 4.12.14-195. + * + * Because of this inconsistency and because all of these kernels appear to be + * alpha or beta kernel releases for SLE 15 SP1, we do not rely on version + * checks between this range. Issue a warning to indicate that we do not + * support these. 
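Reading the encoding above concretely, and assuming the usual KERNEL_VERSION(a,b,c) == ((a) << 16) + ((b) << 8) + (c) packing from linux/version.h, a gate such as the 4.12.14-111 check used later in this file compares plain integers:

/* Illustration only, using the macros defined above:
 *
 *   SLE_KERNEL_VERSION(4, 12, 14, 111)
 *       == (KERNEL_VERSION(4, 12, 14) << 16) + 111
 *       == ((((4) << 16) + ((12) << 8) + (14)) << 16) + 111
 *
 * SLE_KERNEL_CODE is packed the same way from LINUX_VERSION_CODE and
 * SLE_KERNEL_REVISION, so a backport that first shipped in 4.12.14-111 is
 * gated like this (the flag name is invented for the example):
 */
#if (SLE_KERNEL_CODE < SLE_KERNEL_VERSION(4, 12, 14, 111))
#define NEED_EXAMPLE_BACKPORT
#else /* >= 4.12.14-111 */
#undef NEED_EXAMPLE_BACKPORT
#endif /* 4.12.14-111 */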
+ */ +#warning "SLE kernel versions between 4.12.14-23 and 4.12.14-94 are not supported" +#endif + +/*****************************************************************************/ +#if (SLE_KERNEL_CODE < SLE_KERNEL_VERSION(4,12,14,100)) +#else /* >= 4.12.14-100 */ +#undef HAVE_TCF_EXTS_TO_LIST +#define HAVE_TCF_EXTS_FOR_EACH_ACTION +#endif /* 4.12.14-100 */ + +/*****************************************************************************/ +#if (SLE_KERNEL_CODE < SLE_KERNEL_VERSION(4,12,14,111)) +#define NEED_IDA_ALLOC_MIN_MAX_RANGE_FREE +#else /* >= 4.12.14-111 */ +#define HAVE_DEVLINK_PORT_ATTRS_SET_PORT_FLAVOUR +#undef NEED_MACVLAN_ACCEL_PRIV +#undef NEED_MACVLAN_RELEASE_L2FW_OFFLOAD +#undef NEED_MACVLAN_SUPPORTS_DEST_FILTER +#undef NEED_IDA_ALLOC_MIN_MAX_RANGE_FREE +#endif /* 4.12.14-111 */ + +/*****************************************************************************/ +#if (SLE_KERNEL_CODE < SLE_KERNEL_VERSION(4,12,14,120)) +#else /* >= 4.12.14-120 */ +#define HAVE_NDO_SELECT_QUEUE_SB_DEV +#define HAVE_TCF_MIRRED_DEV +#define HAVE_TCF_BLOCK +#define HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +#define HAVE_TCF_BLOCK_CB_REGISTER_EXTACK +#undef NEED_TC_SETUP_QDISC_MQPRIO +#undef NEED_TC_CLS_CAN_OFFLOAD_AND_CHAIN0 +#endif /* 4.12.14-120 */ + +/*****************************************************************************/ +#if (SLE_KERNEL_CODE < SLE_KERNEL_VERSION(5,3,8,2)) +#else /* >= 5.3.8-2 */ +#undef NEED_FLOW_INDR_BLOCK_CB_REGISTER +#undef NEED_SKB_FRAG_OFF +#undef NEED_SKB_FRAG_OFF_ADD +#endif /* 5.3.8-2 */ + +/*****************************************************************************/ +#if (SLE_KERNEL_CODE < SLE_KERNEL_VERSION(5,3,18,34)) +#else /* >= 5.3.18-34 */ +#undef NEED_DEVLINK_REGION_CREATE_OPS +#undef NEED_DEVLINK_PORT_ATTRS_SET_STRUCT +#endif /* 5.3.18-34 */ + +/*****************************************************************************/ +#if (SLE_KERNEL_CODE < SLE_KERNEL_VERSION(5,3,18,37)) +#else /* >= 5.3.18-37 */ +#undef NEED_NET_PREFETCH +#endif /* 5.3.18-37 */ + +/*****************************************************************************/ +#if (SLE_KERNEL_CODE < SLE_KERNEL_VERSION(5,3,18,38)) +#else /* >= 5.3.18-38 */ +#undef NEED_DEVLINK_FLASH_UPDATE_TIMEOUT_NOTIFY +#endif /* 5.3.18-38 */ + +#endif /* _KCOMPAT_SLES_DEFS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/kcompat_std_defs.h b/drivers/net/ethernet/intel/ice/kcompat_std_defs.h new file mode 100644 index 000000000000..1d657809ff9e --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat_std_defs.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _KCOMPAT_STD_DEFS_H_ +#define _KCOMPAT_STD_DEFS_H_ + +/* This file contains the definitions for what kernel features need backports + * for a given kernel. It targets only the standard stable kernel releases. + * It must check only LINUX_VERSION_CODE and assume the kernel is a standard + * release, and not a custom distribution. + * + * It must define HAVE_ and NEED_ for features. It must not + * implement any backports, instead leaving the implementation to the + * kcompat_impl.h header. + * + * If a feature can be easily implemented as a replacement macro or fully + * backported, use a NEED_ to indicate that the feature needs + * a backport. (If NEED_ is undefined, then no backport for that feature + * is needed). 
+ * + * If a feature cannot be easily implemented in kcompat directly, but + * requires drivers to make specific changes such as stripping out an entire + * feature or modifying a function pointer prototype, use a HAVE_. + */ + +#ifndef LINUX_VERSION_CODE +#error "LINUX_VERSION_CODE is undefined" +#endif + +#ifndef KERNEL_VERSION +#error "KERNEL_VERSION is undefined" +#endif + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0)) +#define NEED_DEV_PRINTK_ONCE +#else /* >= 3,19,0 */ +#endif /* 3,19,0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0)) +#else /* >= 4,8,0 */ +#define HAVE_TCF_EXTS_TO_LIST +#endif /* 4,8,0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,15,0)) +#define NEED_TC_SETUP_QDISC_MQPRIO +#else /* >= 4,15,0 */ +#define HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +#endif /* 4,15,0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,16,0)) +#define NEED_TC_CLS_CAN_OFFLOAD_AND_CHAIN0 +#else /* >= 4,16,0 */ +#endif /* 4,16,0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,17,0)) +#define NEED_CONVERT_ART_NS_TO_TSC +#else /* >= 4,17,0 */ +#endif /* 4,17,0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,18,0)) +#define NEED_MACVLAN_ACCEL_PRIV +#define NEED_MACVLAN_RELEASE_L2FW_OFFLOAD +#define NEED_MACVLAN_SUPPORTS_DEST_FILTER +#else /* >= 4,18,0 */ +#define HAVE_DEVLINK_PORT_ATTRS_SET_PORT_FLAVOUR +#endif /* 4,18,0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0)) +#define NEED_IDA_ALLOC_MIN_MAX_RANGE_FREE +#else /* >= 4,19,0 */ +#undef HAVE_TCF_EXTS_TO_LIST +#define HAVE_TCF_EXTS_FOR_EACH_ACTION +#endif /* 4,19,0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,2,0)) +#else /* >= 5.2.0 */ +#define HAVE_DEVLINK_PORT_ATTRS_SET_SWITCH_ID +#endif /* 5.2.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,3,0)) +#define NEED_DEVLINK_FLASH_UPDATE_STATUS_NOTIFY +#else /* >= 5.3.0 */ +#endif /* 5.3.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,7,0)) +#define NEED_DEVLINK_REGION_CREATE_OPS +#else /* >= 5.7.0 */ +#endif /* 5.7.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,4,0)) +#define NEED_SKB_FRAG_OFF_ADD +#define NEED_SKB_FRAG_OFF +#if (LINUX_VERSION_CODE > KERNEL_VERSION(4,19,200) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(4,20,0)) +#undef NEED_SKB_FRAG_OFF +#endif /* 4.19.X for X > 201 */ + +#define NEED_FLOW_INDR_BLOCK_CB_REGISTER +#else /* >= 5.4.0 */ +#endif /* 5.4.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,9,0)) +#define NEED_DEVLINK_PORT_ATTRS_SET_STRUCT +#define HAVE_XDP_QUERY_PROG +#else /* >= 5.9.0 */ +#endif /* 5.9.0 */ + +/*****************************************************************************/ 
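The NEED_/HAVE_ split described in the header comment means this file only sets flags; the fallbacks themselves live in kcompat_impl.h, which is not shown in this hunk. As a sketch of what such a backport typically looks like, assuming the usual pre-5.4 skb_frag_t layout, NEED_SKB_FRAG_OFF would be satisfied by something along these lines:

#include <linux/skbuff.h>

/* Sketch only: the kind of shim kcompat_impl.h is expected to provide when
 * this file leaves NEED_SKB_FRAG_OFF defined (kernels before 5.4, where
 * skb_frag_t still exposed page_offset directly).
 */
#ifdef NEED_SKB_FRAG_OFF
static inline unsigned int skb_frag_off(const skb_frag_t *frag)
{
        return frag->page_offset;
}
#endif /* NEED_SKB_FRAG_OFF */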
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,10,0)) +#define NEED_NET_PREFETCH +#define NEED_DEVLINK_FLASH_UPDATE_TIMEOUT_NOTIFY +#else /* >= 5.10.0 */ +#endif /* 5.10.0 */ + +#endif /* _KCOMPAT_STD_DEFS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/kcompat_ubuntu_defs.h b/drivers/net/ethernet/intel/ice/kcompat_ubuntu_defs.h new file mode 100644 index 000000000000..9da611f64f21 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat_ubuntu_defs.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _KCOMPAT_UBUNTU_DEFS_H_ +#define _KCOMPAT_UBUNTU_DEFS_H_ + +/* This file contains the definitions for the Ubuntu specific distribution of + * the Linux kernel. + * + * It checks the UBUNTU_VERSION_CODE to decide which features are available in + * the target kernel. It assumes that kcompat_std_defs.h has already been + * processed, and will #define or #undef the relevant flags based on what + * features were backported by Ubuntu. + */ + +#if !UTS_UBUNTU_RELEASE_ABI +#error "UTS_UBUNTU_RELEASE_ABI is 0 or undefined" +#endif + +#if !UBUNTU_VERSION_CODE +#error "UBUNTU_VERSION_CODE is 0 or undefined" +#endif + +#ifndef UBUNTU_VERSION +#error "UBUNTU_VERSION is undefined" +#endif + +#endif /* _KCOMPAT_UBUNTU_DEFS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/virtchnl.h b/drivers/net/ethernet/intel/ice/virtchnl.h new file mode 100644 index 000000000000..dd7fade36bd3 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/virtchnl.h @@ -0,0 +1,2232 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + + +#ifndef _VIRTCHNL_H_ +#define _VIRTCHNL_H_ + +/* Description: + * This header file describes the Virtual Function (VF) - Physical Function + * (PF) communication protocol used by the drivers for all devices starting + * from our 40G product line + * + * Admin queue buffer usage: + * desc->opcode is always aqc_opc_send_msg_to_pf + * flags, retval, datalen, and data addr are all used normally. + * The Firmware copies the cookie fields when sending messages between the + * PF and VF, but uses all other fields internally. Due to this limitation, + * we must send all messages as "indirect", i.e. using an external buffer. + * + * All the VSI indexes are relative to the VF. Each VF can have maximum of + * three VSIs. All the queue indexes are relative to the VSI. Each VF can + * have a maximum of sixteen queues for all of its VSIs. + * + * The PF is required to return a status code in v_retval for all messages + * except RESET_VF, which does not require any response. The returned value + * is of virtchnl_status_code type, defined in the shared type.h. + * + * In general, VF driver initialization should roughly follow the order of + * these opcodes. The VF driver must first validate the API version of the + * PF driver, then request a reset, then get resources, then configure + * queues and interrupts. After these operations are complete, the VF + * driver may start its queues, optionally add MAC and VLAN filters, and + * process traffic. 
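The initialization order described above maps onto the opcodes defined just below in this header. Purely as an illustration (this array is not part of the protocol), a minimal VF driver would issue roughly this sequence:

/* Illustration only: the rough order of requests a VF driver sends during
 * bring-up, per the description above.
 */
static const enum virtchnl_ops example_vf_bringup_sequence[] = {
        VIRTCHNL_OP_VERSION,            /* negotiate the API version first */
        VIRTCHNL_OP_RESET_VF,           /* request a reset (no reply) */
        VIRTCHNL_OP_GET_VF_RESOURCES,   /* learn VSIs, queues, capabilities */
        VIRTCHNL_OP_CONFIG_VSI_QUEUES,  /* program Tx/Rx queue parameters */
        VIRTCHNL_OP_CONFIG_IRQ_MAP,     /* map queues to interrupt vectors */
        VIRTCHNL_OP_ENABLE_QUEUES,      /* start traffic */
        VIRTCHNL_OP_ADD_ETH_ADDR,       /* optional: add MAC filters */
        VIRTCHNL_OP_ADD_VLAN,           /* optional: add VLAN filters */
};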
+ */ + +/* START GENERIC DEFINES + * Need to ensure the following enums and defines hold the same meaning and + * value in current and future projects + */ + + +/* Error Codes */ +enum virtchnl_status_code { + VIRTCHNL_STATUS_SUCCESS = 0, + VIRTCHNL_STATUS_ERR_PARAM = -5, + VIRTCHNL_STATUS_ERR_NO_MEMORY = -18, + VIRTCHNL_STATUS_ERR_OPCODE_MISMATCH = -38, + VIRTCHNL_STATUS_ERR_CQP_COMPL_ERROR = -39, + VIRTCHNL_STATUS_ERR_INVALID_VF_ID = -40, + VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR = -53, + VIRTCHNL_STATUS_ERR_NOT_SUPPORTED = -64, +}; + +/* Backward compatibility */ +#define VIRTCHNL_ERR_PARAM VIRTCHNL_STATUS_ERR_PARAM +#define VIRTCHNL_STATUS_NOT_SUPPORTED VIRTCHNL_STATUS_ERR_NOT_SUPPORTED + +#define VIRTCHNL_LINK_SPEED_2_5GB_SHIFT 0x0 +#define VIRTCHNL_LINK_SPEED_100MB_SHIFT 0x1 +#define VIRTCHNL_LINK_SPEED_1000MB_SHIFT 0x2 +#define VIRTCHNL_LINK_SPEED_10GB_SHIFT 0x3 +#define VIRTCHNL_LINK_SPEED_40GB_SHIFT 0x4 +#define VIRTCHNL_LINK_SPEED_20GB_SHIFT 0x5 +#define VIRTCHNL_LINK_SPEED_25GB_SHIFT 0x6 +#define VIRTCHNL_LINK_SPEED_5GB_SHIFT 0x7 + +enum virtchnl_link_speed { + VIRTCHNL_LINK_SPEED_UNKNOWN = 0, + VIRTCHNL_LINK_SPEED_100MB = BIT(VIRTCHNL_LINK_SPEED_100MB_SHIFT), + VIRTCHNL_LINK_SPEED_1GB = BIT(VIRTCHNL_LINK_SPEED_1000MB_SHIFT), + VIRTCHNL_LINK_SPEED_10GB = BIT(VIRTCHNL_LINK_SPEED_10GB_SHIFT), + VIRTCHNL_LINK_SPEED_40GB = BIT(VIRTCHNL_LINK_SPEED_40GB_SHIFT), + VIRTCHNL_LINK_SPEED_20GB = BIT(VIRTCHNL_LINK_SPEED_20GB_SHIFT), + VIRTCHNL_LINK_SPEED_25GB = BIT(VIRTCHNL_LINK_SPEED_25GB_SHIFT), + VIRTCHNL_LINK_SPEED_2_5GB = BIT(VIRTCHNL_LINK_SPEED_2_5GB_SHIFT), + VIRTCHNL_LINK_SPEED_5GB = BIT(VIRTCHNL_LINK_SPEED_5GB_SHIFT), +}; + +/* for hsplit_0 field of Rx HMC context */ +/* deprecated with AVF 1.0 */ +enum virtchnl_rx_hsplit { + VIRTCHNL_RX_HSPLIT_NO_SPLIT = 0, + VIRTCHNL_RX_HSPLIT_SPLIT_L2 = 1, + VIRTCHNL_RX_HSPLIT_SPLIT_IP = 2, + VIRTCHNL_RX_HSPLIT_SPLIT_TCP_UDP = 4, + VIRTCHNL_RX_HSPLIT_SPLIT_SCTP = 8, +}; + +/* END GENERIC DEFINES */ + +/* Opcodes for VF-PF communication. These are placed in the v_opcode field + * of the virtchnl_msg structure. + */ +enum virtchnl_ops { +/* The PF sends status change events to VFs using + * the VIRTCHNL_OP_EVENT opcode. + * VFs send requests to the PF using the other ops. + * Use of "advanced opcode" features must be negotiated as part of capabilities + * exchange and are not considered part of base mode feature set. 
+ */ + VIRTCHNL_OP_UNKNOWN = 0, + VIRTCHNL_OP_VERSION = 1, /* must ALWAYS be 1 */ + VIRTCHNL_OP_RESET_VF = 2, + VIRTCHNL_OP_GET_VF_RESOURCES = 3, + VIRTCHNL_OP_CONFIG_TX_QUEUE = 4, + VIRTCHNL_OP_CONFIG_RX_QUEUE = 5, + VIRTCHNL_OP_CONFIG_VSI_QUEUES = 6, + VIRTCHNL_OP_CONFIG_IRQ_MAP = 7, + VIRTCHNL_OP_ENABLE_QUEUES = 8, + VIRTCHNL_OP_DISABLE_QUEUES = 9, + VIRTCHNL_OP_ADD_ETH_ADDR = 10, + VIRTCHNL_OP_DEL_ETH_ADDR = 11, + VIRTCHNL_OP_ADD_VLAN = 12, + VIRTCHNL_OP_DEL_VLAN = 13, + VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE = 14, + VIRTCHNL_OP_GET_STATS = 15, + VIRTCHNL_OP_RSVD = 16, + VIRTCHNL_OP_EVENT = 17, /* must ALWAYS be 17 */ + /* opcode 19 is reserved */ + VIRTCHNL_OP_IWARP = 20, /* advanced opcode */ + VIRTCHNL_OP_RDMA = VIRTCHNL_OP_IWARP, + VIRTCHNL_OP_CONFIG_IWARP_IRQ_MAP = 21, /* advanced opcode */ + VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP = VIRTCHNL_OP_CONFIG_IWARP_IRQ_MAP, + VIRTCHNL_OP_RELEASE_IWARP_IRQ_MAP = 22, /* advanced opcode */ + VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP = VIRTCHNL_OP_RELEASE_IWARP_IRQ_MAP, + VIRTCHNL_OP_CONFIG_RSS_KEY = 23, + VIRTCHNL_OP_CONFIG_RSS_LUT = 24, + VIRTCHNL_OP_GET_RSS_HENA_CAPS = 25, + VIRTCHNL_OP_SET_RSS_HENA = 26, + VIRTCHNL_OP_ENABLE_VLAN_STRIPPING = 27, + VIRTCHNL_OP_DISABLE_VLAN_STRIPPING = 28, + VIRTCHNL_OP_REQUEST_QUEUES = 29, + VIRTCHNL_OP_ENABLE_CHANNELS = 30, + VIRTCHNL_OP_DISABLE_CHANNELS = 31, + VIRTCHNL_OP_ADD_CLOUD_FILTER = 32, + VIRTCHNL_OP_DEL_CLOUD_FILTER = 33, + /* opcode 34 is reserved */ + VIRTCHNL_OP_DCF_VLAN_OFFLOAD = 38, + VIRTCHNL_OP_DCF_CMD_DESC = 39, + VIRTCHNL_OP_DCF_CMD_BUFF = 40, + VIRTCHNL_OP_DCF_DISABLE = 41, + VIRTCHNL_OP_DCF_GET_VSI_MAP = 42, + VIRTCHNL_OP_DCF_GET_PKG_INFO = 43, + VIRTCHNL_OP_GET_SUPPORTED_RXDIDS = 44, + VIRTCHNL_OP_ADD_RSS_CFG = 45, + VIRTCHNL_OP_DEL_RSS_CFG = 46, + VIRTCHNL_OP_ADD_FDIR_FILTER = 47, + VIRTCHNL_OP_DEL_FDIR_FILTER = 48, + VIRTCHNL_OP_GET_MAX_RSS_QREGION = 50, + VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS = 51, + VIRTCHNL_OP_ADD_VLAN_V2 = 52, + VIRTCHNL_OP_DEL_VLAN_V2 = 53, + VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 = 54, + VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 = 55, + VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 = 56, + VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2 = 57, + VIRTCHNL_OP_ENABLE_VLAN_FILTERING_V2 = 58, + VIRTCHNL_OP_DISABLE_VLAN_FILTERING_V2 = 59, + /* opcodes 60 through 69 are reserved */ + VIRTCHNL_OP_ENABLE_QUEUES_V2 = 107, + VIRTCHNL_OP_DISABLE_QUEUES_V2 = 108, + VIRTCHNL_OP_MAP_QUEUE_VECTOR = 111, + VIRTCHNL_OP_DCF_RULE_FLUSH = 6000, + VIRTCHNL_OP_MAX, +}; + +static inline const char *virtchnl_op_str(enum virtchnl_ops v_opcode) +{ + switch (v_opcode) { + case VIRTCHNL_OP_UNKNOWN: + return "VIRTCHNL_OP_UNKNOWN"; + case VIRTCHNL_OP_VERSION: + return "VIRTCHNL_OP_VERSION"; + case VIRTCHNL_OP_RESET_VF: + return "VIRTCHNL_OP_RESET_VF"; + case VIRTCHNL_OP_GET_VF_RESOURCES: + return "VIRTCHNL_OP_GET_VF_RESOURCES"; + case VIRTCHNL_OP_CONFIG_TX_QUEUE: + return "VIRTCHNL_OP_CONFIG_TX_QUEUE"; + case VIRTCHNL_OP_CONFIG_RX_QUEUE: + return "VIRTCHNL_OP_CONFIG_RX_QUEUE"; + case VIRTCHNL_OP_CONFIG_VSI_QUEUES: + return "VIRTCHNL_OP_CONFIG_VSI_QUEUES"; + case VIRTCHNL_OP_CONFIG_IRQ_MAP: + return "VIRTCHNL_OP_CONFIG_IRQ_MAP"; + case VIRTCHNL_OP_ENABLE_QUEUES: + return "VIRTCHNL_OP_ENABLE_QUEUES"; + case VIRTCHNL_OP_DISABLE_QUEUES: + return "VIRTCHNL_OP_DISABLE_QUEUES"; + case VIRTCHNL_OP_ADD_ETH_ADDR: + return "VIRTCHNL_OP_ADD_ETH_ADDR"; + case VIRTCHNL_OP_DEL_ETH_ADDR: + return "VIRTCHNL_OP_DEL_ETH_ADDR"; + case VIRTCHNL_OP_ADD_VLAN: + return "VIRTCHNL_OP_ADD_VLAN"; + case VIRTCHNL_OP_DEL_VLAN: + return "VIRTCHNL_OP_DEL_VLAN"; + 
case VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE: + return "VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE"; + case VIRTCHNL_OP_GET_STATS: + return "VIRTCHNL_OP_GET_STATS"; + case VIRTCHNL_OP_RSVD: + return "VIRTCHNL_OP_RSVD"; + case VIRTCHNL_OP_EVENT: + return "VIRTCHNL_OP_EVENT"; + case VIRTCHNL_OP_RDMA: + return "VIRTCHNL_OP_RDMA"; + case VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP: + return "VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP:"; + case VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP: + return "VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP"; + case VIRTCHNL_OP_CONFIG_RSS_KEY: + return "VIRTCHNL_OP_CONFIG_RSS_KEY"; + case VIRTCHNL_OP_CONFIG_RSS_LUT: + return "VIRTCHNL_OP_CONFIG_RSS_LUT"; + case VIRTCHNL_OP_GET_RSS_HENA_CAPS: + return "VIRTCHNL_OP_GET_RSS_HENA_CAPS"; + case VIRTCHNL_OP_SET_RSS_HENA: + return "VIRTCHNL_OP_SET_RSS_HENA"; + case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: + return "VIRTCHNL_OP_ENABLE_VLAN_STRIPPING"; + case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: + return "VIRTCHNL_OP_DISABLE_VLAN_STRIPPING"; + case VIRTCHNL_OP_REQUEST_QUEUES: + return "VIRTCHNL_OP_REQUEST_QUEUES"; + case VIRTCHNL_OP_ENABLE_CHANNELS: + return "VIRTCHNL_OP_ENABLE_CHANNELS"; + case VIRTCHNL_OP_DISABLE_CHANNELS: + return "VIRTCHNL_OP_DISABLE_CHANNELS"; + case VIRTCHNL_OP_ADD_CLOUD_FILTER: + return "VIRTCHNL_OP_ADD_CLOUD_FILTER"; + case VIRTCHNL_OP_DEL_CLOUD_FILTER: + return "VIRTCHNL_OP_DEL_CLOUD_FILTER"; + case VIRTCHNL_OP_DCF_CMD_DESC: + return "VIRTCHNL_OP_DCF_CMD_DESC"; + case VIRTCHNL_OP_DCF_CMD_BUFF: + return "VIRTCHHNL_OP_DCF_CMD_BUFF"; + case VIRTCHNL_OP_DCF_DISABLE: + return "VIRTCHNL_OP_DCF_DISABLE"; + case VIRTCHNL_OP_DCF_GET_VSI_MAP: + return "VIRTCHNL_OP_DCF_GET_VSI_MAP"; + case VIRTCHNL_OP_GET_SUPPORTED_RXDIDS: + return "VIRTCHNL_OP_GET_SUPPORTED_RXDIDS"; + case VIRTCHNL_OP_ADD_RSS_CFG: + return "VIRTCHNL_OP_ADD_RSS_CFG"; + case VIRTCHNL_OP_DEL_RSS_CFG: + return "VIRTCHNL_OP_DEL_RSS_CFG"; + case VIRTCHNL_OP_ADD_FDIR_FILTER: + return "VIRTCHNL_OP_ADD_FDIR_FILTER"; + case VIRTCHNL_OP_DEL_FDIR_FILTER: + return "VIRTCHNL_OP_DEL_FDIR_FILTER"; + case VIRTCHNL_OP_GET_MAX_RSS_QREGION: + return "VIRTCHNL_OP_GET_MAX_RSS_QREGION"; + case VIRTCHNL_OP_ENABLE_QUEUES_V2: + return "VIRTCHNL_OP_ENABLE_QUEUES_V2"; + case VIRTCHNL_OP_DISABLE_QUEUES_V2: + return "VIRTCHNL_OP_DISABLE_QUEUES_V2"; + case VIRTCHNL_OP_MAP_QUEUE_VECTOR: + return "VIRTCHNL_OP_MAP_QUEUE_VECTOR"; + case VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS: + return "VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS"; + case VIRTCHNL_OP_ADD_VLAN_V2: + return "VIRTCHNL_OP_ADD_VLAN_V2"; + case VIRTCHNL_OP_DEL_VLAN_V2: + return "VIRTCHNL_OP_DEL_VLAN_V2"; + case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2: + return "VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2"; + case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2: + return "VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2"; + case VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2: + return "VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2"; + case VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2: + return "VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2"; + case VIRTCHNL_OP_ENABLE_VLAN_FILTERING_V2: + return "VIRTCHNL_OP_ENABLE_VLAN_FILTERING_V2"; + case VIRTCHNL_OP_DISABLE_VLAN_FILTERING_V2: + return "VIRTCHNL_OP_DISABLE_VLAN_FILTERING_V2"; + case VIRTCHNL_OP_MAX: + return "VIRTCHNL_OP_MAX"; + default: + return "Unsupported (update virtchnl.h)"; + } +} + +/* These macros are used to generate compilation errors if a structure/union + * is not exactly the correct length. It gives a divide by zero error if the + * structure/union is not of the correct size, otherwise it creates an enum + * that is never used. 
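Concretely, given the macro defined just below, a check such as VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_version_info), used later in this header, expands to the following, so a size mismatch becomes a compile-time divide-by-zero rather than a silent ABI break:

/* Expansion of VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_version_info), shown
 * for illustration; the enumerator is never referenced.
 */
enum virtchnl_static_assert_enum_virtchnl_version_info {
        virtchnl_static_assert_virtchnl_version_info =
                (8) / ((sizeof(struct virtchnl_version_info) == (8)) ? 1 : 0)
};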
+ */ +#define VIRTCHNL_CHECK_STRUCT_LEN(n, X) enum virtchnl_static_assert_enum_##X \ + { virtchnl_static_assert_##X = (n)/((sizeof(struct X) == (n)) ? 1 : 0) } +#define VIRTCHNL_CHECK_UNION_LEN(n, X) enum virtchnl_static_asset_enum_##X \ + { virtchnl_static_assert_##X = (n)/((sizeof(union X) == (n)) ? 1 : 0) } + +/* Message descriptions and data structures. */ + +/* VIRTCHNL_OP_VERSION + * VF posts its version number to the PF. PF responds with its version number + * in the same format, along with a return code. + * Reply from PF has its major/minor versions also in param0 and param1. + * If there is a major version mismatch, then the VF cannot operate. + * If there is a minor version mismatch, then the VF can operate but should + * add a warning to the system log. + * + * This enum element MUST always be specified as == 1, regardless of other + * changes in the API. The PF must always respond to this message without + * error regardless of version mismatch. + */ +#define VIRTCHNL_VERSION_MAJOR 1 +#define VIRTCHNL_VERSION_MINOR 1 +#define VIRTCHNL_VERSION_MAJOR_2 2 +#define VIRTCHNL_VERSION_MINOR_0 0 +#define VIRTCHNL_VERSION_MINOR_NO_VF_CAPS 0 + +struct virtchnl_version_info { + u32 major; + u32 minor; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_version_info); + +#define VF_IS_V10(_ver) (((_ver)->major == 1) && ((_ver)->minor == 0)) +#define VF_IS_V11(_ver) (((_ver)->major == 1) && ((_ver)->minor == 1)) +#define VF_IS_V20(_ver) (((_ver)->major == 2) && ((_ver)->minor == 0)) + +/* VIRTCHNL_OP_RESET_VF + * VF sends this request to PF with no parameters + * PF does NOT respond! VF driver must delay then poll VFGEN_RSTAT register + * until reset completion is indicated. The admin queue must be reinitialized + * after this operation. + * + * When reset is complete, PF must ensure that all queues in all VSIs associated + * with the VF are stopped, all queue configurations in the HMC are set to 0, + * and all MAC and VLAN filters (except the default MAC address) on all VSIs + * are cleared. + */ + +/* VSI types that use VIRTCHNL interface for VF-PF communication. VSI_SRIOV + * vsi_type should always be 6 for backward compatibility. Add other fields + * as needed. + */ +enum virtchnl_vsi_type { + VIRTCHNL_VSI_TYPE_INVALID = 0, + VIRTCHNL_VSI_SRIOV = 6, +}; + +/* VIRTCHNL_OP_GET_VF_RESOURCES + * Version 1.0 VF sends this request to PF with no parameters + * Version 1.1 VF sends this request to PF with u32 bitmap of its capabilities + * PF responds with an indirect message containing + * virtchnl_vf_resource and one or more + * virtchnl_vsi_resource structures. + */ + +struct virtchnl_vsi_resource { + u16 vsi_id; + u16 num_queue_pairs; + + /* see enum virtchnl_vsi_type */ + s32 vsi_type; + u16 qset_handle; + u8 default_mac_addr[ETH_ALEN]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); + +/* VF capability flags + * VIRTCHNL_VF_OFFLOAD_L2 flag is inclusive of base mode L2 offloads including + * TX/RX Checksum offloading and TSO for non-tunnelled packets. 
+ */ +#define VIRTCHNL_VF_OFFLOAD_L2 BIT(0) +#define VIRTCHNL_VF_OFFLOAD_IWARP BIT(1) +#define VIRTCHNL_VF_CAP_RDMA VIRTCHNL_VF_OFFLOAD_IWARP +#define VIRTCHNL_VF_OFFLOAD_RSS_AQ BIT(3) +#define VIRTCHNL_VF_OFFLOAD_RSS_REG BIT(4) +#define VIRTCHNL_VF_OFFLOAD_WB_ON_ITR BIT(5) +#define VIRTCHNL_VF_OFFLOAD_REQ_QUEUES BIT(6) +/* used to negotiate communicating link speeds in Mbps */ +#define VIRTCHNL_VF_CAP_ADV_LINK_SPEED BIT(7) + /* BIT(8) is reserved */ +#define VIRTCHNL_VF_LARGE_NUM_QPAIRS BIT(9) +#define VIRTCHNL_VF_OFFLOAD_CRC BIT(10) +#define VIRTCHNL_VF_OFFLOAD_VLAN_V2 BIT(15) +#define VIRTCHNL_VF_OFFLOAD_VLAN BIT(16) +#define VIRTCHNL_VF_OFFLOAD_RX_POLLING BIT(17) +#define VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2 BIT(18) +#define VIRTCHNL_VF_OFFLOAD_RSS_PF BIT(19) +#define VIRTCHNL_VF_OFFLOAD_ENCAP BIT(20) +#define VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM BIT(21) +#define VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM BIT(22) +#define VIRTCHNL_VF_OFFLOAD_ADQ BIT(23) +#define VIRTCHNL_VF_OFFLOAD_ADQ_V2 BIT(24) +#define VIRTCHNL_VF_OFFLOAD_USO BIT(25) +#define VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC BIT(26) +#define VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF BIT(27) +#define VIRTCHNL_VF_OFFLOAD_FDIR_PF BIT(28) +#define VIRTCHNL_VF_CAP_DCF BIT(30) + /* BIT(31) is reserved */ + +#define VF_BASE_MODE_OFFLOADS (VIRTCHNL_VF_OFFLOAD_L2 | \ + VIRTCHNL_VF_OFFLOAD_VLAN | \ + VIRTCHNL_VF_OFFLOAD_RSS_PF) + +struct virtchnl_vf_resource { + u16 num_vsis; + u16 num_queue_pairs; + u16 max_vectors; + u16 max_mtu; + + u32 vf_cap_flags; + u32 rss_key_size; + u32 rss_lut_size; + + struct virtchnl_vsi_resource vsi_res[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(36, virtchnl_vf_resource); + +/* VIRTCHNL_OP_CONFIG_TX_QUEUE + * VF sends this message to set up parameters for one TX queue. + * External data buffer contains one instance of virtchnl_txq_info. + * PF configures requested queue and returns a status code. + */ + +/* Tx queue config info */ +struct virtchnl_txq_info { + u16 vsi_id; + u16 queue_id; + u16 ring_len; /* number of descriptors, multiple of 8 */ + u16 headwb_enabled; /* deprecated with AVF 1.0 */ + u64 dma_ring_addr; + u64 dma_headwb_addr; /* deprecated with AVF 1.0 */ +}; + +VIRTCHNL_CHECK_STRUCT_LEN(24, virtchnl_txq_info); + +/* RX descriptor IDs (range from 0 to 63) */ +enum virtchnl_rx_desc_ids { + VIRTCHNL_RXDID_0_16B_BASE = 0, + /* 32B_BASE and FLEX_SPLITQ share desc ids as default descriptors + * because they can be differentiated based on queue model; e.g. single + * queue model can only use 32B_BASE and split queue model can only use + * FLEX_SPLITQ. Having these as 1 allows them to be used as default + * descriptors without negotiation. 
+ */ + VIRTCHNL_RXDID_1_32B_BASE = 1, + VIRTCHNL_RXDID_1_FLEX_SPLITQ = 1, + VIRTCHNL_RXDID_2_FLEX_SQ_NIC = 2, + VIRTCHNL_RXDID_3_FLEX_SQ_SW = 3, + VIRTCHNL_RXDID_4_FLEX_SQ_NIC_VEB = 4, + VIRTCHNL_RXDID_5_FLEX_SQ_NIC_ACL = 5, + VIRTCHNL_RXDID_6_FLEX_SQ_NIC_2 = 6, + VIRTCHNL_RXDID_7_HW_RSVD = 7, + /* 9 through 15 are reserved */ + VIRTCHNL_RXDID_16_COMMS_GENERIC = 16, + VIRTCHNL_RXDID_17_COMMS_AUX_VLAN = 17, + VIRTCHNL_RXDID_18_COMMS_AUX_IPV4 = 18, + VIRTCHNL_RXDID_19_COMMS_AUX_IPV6 = 19, + VIRTCHNL_RXDID_20_COMMS_AUX_FLOW = 20, + VIRTCHNL_RXDID_21_COMMS_AUX_TCP = 21, + /* 22 through 63 are reserved */ +}; + +/* RX descriptor ID bitmasks */ +enum virtchnl_rx_desc_id_bitmasks { + VIRTCHNL_RXDID_0_16B_BASE_M = BIT(VIRTCHNL_RXDID_0_16B_BASE), + VIRTCHNL_RXDID_1_32B_BASE_M = BIT(VIRTCHNL_RXDID_1_32B_BASE), + VIRTCHNL_RXDID_1_FLEX_SPLITQ_M = BIT(VIRTCHNL_RXDID_1_FLEX_SPLITQ), + VIRTCHNL_RXDID_2_FLEX_SQ_NIC_M = BIT(VIRTCHNL_RXDID_2_FLEX_SQ_NIC), + VIRTCHNL_RXDID_3_FLEX_SQ_SW_M = BIT(VIRTCHNL_RXDID_3_FLEX_SQ_SW), + VIRTCHNL_RXDID_4_FLEX_SQ_NIC_VEB_M = BIT(VIRTCHNL_RXDID_4_FLEX_SQ_NIC_VEB), + VIRTCHNL_RXDID_5_FLEX_SQ_NIC_ACL_M = BIT(VIRTCHNL_RXDID_5_FLEX_SQ_NIC_ACL), + VIRTCHNL_RXDID_6_FLEX_SQ_NIC_2_M = BIT(VIRTCHNL_RXDID_6_FLEX_SQ_NIC_2), + VIRTCHNL_RXDID_7_HW_RSVD_M = BIT(VIRTCHNL_RXDID_7_HW_RSVD), + /* 9 through 15 are reserved */ + VIRTCHNL_RXDID_16_COMMS_GENERIC_M = BIT(VIRTCHNL_RXDID_16_COMMS_GENERIC), + VIRTCHNL_RXDID_17_COMMS_AUX_VLAN_M = BIT(VIRTCHNL_RXDID_17_COMMS_AUX_VLAN), + VIRTCHNL_RXDID_18_COMMS_AUX_IPV4_M = BIT(VIRTCHNL_RXDID_18_COMMS_AUX_IPV4), + VIRTCHNL_RXDID_19_COMMS_AUX_IPV6_M = BIT(VIRTCHNL_RXDID_19_COMMS_AUX_IPV6), + VIRTCHNL_RXDID_20_COMMS_AUX_FLOW_M = BIT(VIRTCHNL_RXDID_20_COMMS_AUX_FLOW), + VIRTCHNL_RXDID_21_COMMS_AUX_TCP_M = BIT(VIRTCHNL_RXDID_21_COMMS_AUX_TCP), + /* 22 through 63 are reserved */ +}; + +/* VIRTCHNL_OP_CONFIG_RX_QUEUE + * VF sends this message to set up parameters for one RX queue. + * External data buffer contains one instance of virtchnl_rxq_info. + * PF configures requested queue and returns a status code. The + * crc_disable flag disables CRC stripping on the VF. Setting + * the crc_disable flag to 1 will disable CRC stripping for each + * queue in the VF where the flag is set. The VIRTCHNL_VF_OFFLOAD_CRC + * offload must have been set prior to sending this info or the PF + * will ignore the request. This flag should be set the same for + * all of the queues for a VF. + */ + +/* Rx queue config info */ +struct virtchnl_rxq_info { + u16 vsi_id; + u16 queue_id; + u32 ring_len; /* number of descriptors, multiple of 32 */ + u16 hdr_size; + u16 splithdr_enabled; /* deprecated with AVF 1.0 */ + u32 databuffer_size; + u32 max_pkt_size; + u8 crc_disable; + /* see enum virtchnl_rx_desc_ids; + * only used when VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC is supported. Note + * that when the offload is not supported, the descriptor format aligns + * with VIRTCHNL_RXDID_1_32B_BASE. + */ + u8 rxdid; + u8 pad1[2]; + u64 dma_ring_addr; + + /* see enum virtchnl_rx_hsplit; deprecated with AVF 1.0 */ + s32 rx_split_pos; + u32 pad2; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(40, virtchnl_rxq_info); + +/* VIRTCHNL_OP_CONFIG_VSI_QUEUES + * VF sends this message to set parameters for active TX and RX queues + * associated with the specified VSI. + * PF configures queues and returns status. + * If the number of queues specified is greater than the number of queues + * associated with the VSI, an error is returned and no queues are configured. 
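As a short illustration of the rxdid rules described in virtchnl_rxq_info above (the example_ helper is invented, and a real driver would also consult the bitmap returned by VIRTCHNL_OP_GET_SUPPORTED_RXDIDS):

/* Illustration only: choose an Rx descriptor ID for virtchnl_rxq_info.rxdid
 * based on the capabilities returned by VIRTCHNL_OP_GET_VF_RESOURCES.
 */
static void example_set_rxdid(struct virtchnl_rxq_info *rxq,
                              const struct virtchnl_vf_resource *res)
{
        if (res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
                rxq->rxdid = VIRTCHNL_RXDID_2_FLEX_SQ_NIC;      /* flex format */
        else
                rxq->rxdid = VIRTCHNL_RXDID_1_32B_BASE;         /* default */
}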
+ * NOTE: The VF is not required to configure all queues in a single request. + * It may send multiple messages. PF drivers must correctly handle all VF + * requests. + */ +struct virtchnl_queue_pair_info { + /* NOTE: vsi_id and queue_id should be identical for both queues. */ + struct virtchnl_txq_info txq; + struct virtchnl_rxq_info rxq; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(64, virtchnl_queue_pair_info); + +struct virtchnl_vsi_queue_config_info { + u16 vsi_id; + u16 num_queue_pairs; + u32 pad; + struct virtchnl_queue_pair_info qpair[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(72, virtchnl_vsi_queue_config_info); + +/* VIRTCHNL_OP_REQUEST_QUEUES + * VF sends this message to request the PF to allocate additional queues to + * this VF. Each VF gets a guaranteed number of queues on init but asking for + * additional queues must be negotiated. This is a best effort request as it + * is possible the PF does not have enough queues left to support the request. + * If the PF cannot support the number requested it will respond with the + * maximum number it is able to support. If the request is successful, PF will + * then reset the VF to institute required changes. + */ + +/* VF resource request */ +struct virtchnl_vf_res_request { + u16 num_queue_pairs; +}; + +/* VIRTCHNL_OP_CONFIG_IRQ_MAP + * VF uses this message to map vectors to queues. + * The rxq_map and txq_map fields are bitmaps used to indicate which queues + * are to be associated with the specified vector. + * The "other" causes are always mapped to vector 0. The VF may not request + * that vector 0 be used for traffic. + * PF configures interrupt mapping and returns status. + * NOTE: due to hardware requirements, all active queues (both TX and RX) + * should be mapped to interrupts, even if the driver intends to operate + * only in polling mode. In this case the interrupt may be disabled, but + * the ITR timer will still run to trigger writebacks. + */ +struct virtchnl_vector_map { + u16 vsi_id; + u16 vector_id; + u16 rxq_map; + u16 txq_map; + u16 rxitr_idx; + u16 txitr_idx; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_vector_map); + +struct virtchnl_irq_map_info { + u16 num_vectors; + struct virtchnl_vector_map vecmap[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(14, virtchnl_irq_map_info); + +/* VIRTCHNL_OP_ENABLE_QUEUES + * VIRTCHNL_OP_DISABLE_QUEUES + * VF sends these message to enable or disable TX/RX queue pairs. + * The queues fields are bitmaps indicating which queues to act upon. + * (Currently, we only support 16 queues per VF, but we make the field + * u32 to allow for expansion.) + * PF performs requested action and returns status. + * NOTE: The VF is not required to enable/disable all queues in a single + * request. It may send multiple messages. + * PF drivers must correctly handle all VF requests. + */ +struct virtchnl_queue_select { + u16 vsi_id; + u16 pad; + u32 rx_queues; + u32 tx_queues; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_queue_select); + +/* VIRTCHNL_OP_GET_MAX_RSS_QREGION + * + * if VIRTCHNL_VF_LARGE_NUM_QPAIRS was negotiated in VIRTCHNL_OP_GET_VF_RESOURCES + * then this op must be supported. + * + * VF sends this message in order to query the max RSS queue region + * size supported by PF, when VIRTCHNL_VF_LARGE_NUM_QPAIRS is enabled. + * This information should be used when configuring the RSS LUT and/or + * configuring queue region based filters. + * + * The maximum RSS queue region is 2^qregion_width. So, a qregion_width + * of 6 would inform the VF that the PF supports a maximum RSS queue region + * of 64. 
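For the VIRTCHNL_OP_CONFIG_IRQ_MAP message above, rxq_map and txq_map are per-queue bitmaps; a hedged sketch of mapping RX/TX queue 0 to one traffic vector (the ITR indices are arbitrary example choices):

static void map_queue0(struct virtchnl_vector_map *vmap, u16 vsi_id, u16 vector_id)
{
	vmap->vsi_id = vsi_id;
	vmap->vector_id = vector_id;	/* vector 0 is reserved for "other" causes */
	vmap->rxq_map = BIT(0);		/* RX queue 0 serviced by this vector */
	vmap->txq_map = BIT(0);		/* TX queue 0 serviced by this vector */
	vmap->rxitr_idx = 0;
	vmap->txitr_idx = 1;
}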
+ * + * A queue region represents a range of queues that can be used to configure + * a RSS LUT. For example, if a VF is given 64 queues, but only a max queue + * region size of 16 (i.e. 2^qregion_width = 16) then it will only be able + * to configure the RSS LUT with queue indices from 0 to 15. However, other + * filters can be used to direct packets to queues >15 via specifying a queue + * base/offset and queue region width. + */ +struct virtchnl_max_rss_qregion { + u16 vport_id; + u16 qregion_width; + u8 pad[4]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_max_rss_qregion); + +/* VIRTCHNL_OP_ADD_ETH_ADDR + * VF sends this message in order to add one or more unicast or multicast + * address filters for the specified VSI. + * PF adds the filters and returns status. + */ + +/* VIRTCHNL_OP_DEL_ETH_ADDR + * VF sends this message in order to remove one or more unicast or multicast + * filters for the specified VSI. + * PF removes the filters and returns status. + */ + +/* VIRTCHNL_ETHER_ADDR_LEGACY + * Prior to adding the @type member to virtchnl_ether_addr, there were 2 pad + * bytes. Moving forward all VF drivers should not set type to + * VIRTCHNL_ETHER_ADDR_LEGACY. This is only here to not break previous/legacy + * behavior. The control plane function (i.e. PF) can use a best effort method + * of tracking the primary/device unicast in this case, but there is no + * guarantee and functionality depends on the implementation of the PF. + */ + +/* VIRTCHNL_ETHER_ADDR_PRIMARY + * All VF drivers should set @type to VIRTCHNL_ETHER_ADDR_PRIMARY for the + * primary/device unicast MAC address filter for VIRTCHNL_OP_ADD_ETH_ADDR and + * VIRTCHNL_OP_DEL_ETH_ADDR. This allows for the underlying control plane + * function (i.e. PF) to accurately track and use this MAC address for + * displaying on the host and for VM/function reset. + */ + +/* VIRTCHNL_ETHER_ADDR_EXTRA + * All VF drivers should set @type to VIRTCHNL_ETHER_ADDR_EXTRA for any extra + * unicast and/or multicast filters that are being added/deleted via + * VIRTCHNL_OP_DEL_ETH_ADDR/VIRTCHNL_OP_ADD_ETH_ADDR respectively. + */ +struct virtchnl_ether_addr { + u8 addr[ETH_ALEN]; + u8 type; +#define VIRTCHNL_ETHER_ADDR_LEGACY 0 +#define VIRTCHNL_ETHER_ADDR_PRIMARY 1 +#define VIRTCHNL_ETHER_ADDR_EXTRA 2 +#define VIRTCHNL_ETHER_ADDR_TYPE_MASK 3 /* first two bits of type are valid */ + u8 pad; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_ether_addr); + +struct virtchnl_ether_addr_list { + u16 vsi_id; + u16 num_elements; + struct virtchnl_ether_addr list[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_ether_addr_list); + +/* VIRTCHNL_OP_ADD_VLAN + * VF sends this message to add one or more VLAN tag filters for receives. + * PF adds the filters and returns status. + * If a port VLAN is configured by the PF, this operation will return an + * error to the VF. + */ + +/* VIRTCHNL_OP_DEL_VLAN + * VF sends this message to remove one or more VLAN tag filters for receives. + * PF removes the filters and returns status. + * If a port VLAN is configured by the PF, this operation will return an + * error to the VF. + */ + +struct virtchnl_vlan_filter_list { + u16 vsi_id; + u16 num_elements; + u16 vlan_id[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_vlan_filter_list); + +/* This enum is used for all of the VIRTCHNL_VF_OFFLOAD_VLAN_V2_CAPS related + * structures and opcodes. + * + * VIRTCHNL_VLAN_UNSUPPORTED - This field is not supported and if a VF driver + * populates it the PF should return VIRTCHNL_STATUS_ERR_NOT_SUPPORTED. 
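A small sketch (helper name invented) of tagging the device's own unicast address as PRIMARY for VIRTCHNL_OP_ADD_ETH_ADDR so the PF can track it across VM/function resets; any additional unicast or multicast filters would use VIRTCHNL_ETHER_ADDR_EXTRA instead:

static void fill_primary_mac(struct virtchnl_ether_addr *entry, const u8 *mac)
{
	memcpy(entry->addr, mac, ETH_ALEN);	/* usual kernel memcpy/ETH_ALEN */
	entry->type = VIRTCHNL_ETHER_ADDR_PRIMARY;
	entry->pad = 0;
}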
+ * + * VIRTCHNL_VLAN_ETHERTYPE_8100 - This field supports 0x8100 ethertype. + * VIRTCHNL_VLAN_ETHERTYPE_88A8 - This field supports 0x88A8 ethertype. + * VIRTCHNL_VLAN_ETHERTYPE_9100 - This field supports 0x9100 ethertype. + * + * VIRTCHNL_VLAN_ETHERTYPE_AND - Used when multiple ethertypes can be supported + * by the PF concurrently. For example, if the PF can support + * VIRTCHNL_VLAN_ETHERTYPE_8100 AND VIRTCHNL_VLAN_ETHERTYPE_88A8 filters it + * would OR the following bits: + * + * VIRTHCNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_AND; + * + * The VF would interpret this as VLAN filtering can be supported on both 0x8100 + * and 0x88A8 VLAN ethertypes. + * + * VIRTCHNL_ETHERTYPE_XOR - Used when only a single ethertype can be supported + * by the PF concurrently. For example if the PF can support + * VIRTCHNL_VLAN_ETHERTYPE_8100 XOR VIRTCHNL_VLAN_ETHERTYPE_88A8 stripping + * offload it would OR the following bits: + * + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_XOR; + * + * The VF would interpret this as VLAN stripping can be supported on either + * 0x8100 or 0x88a8 VLAN ethertypes. So when requesting VLAN stripping via + * VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 the specified ethertype will override + * the previously set value. + * + * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1 - Used to tell the VF to insert and/or + * strip the VLAN tag using the L2TAG1 field of the Tx/Rx descriptors. + * + * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2 - Used to tell the VF to insert hardware + * offloaded VLAN tags using the L2TAG2 field of the Tx descriptor. + * + * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2 - Used to tell the VF to strip hardware + * offloaded VLAN tags using the L2TAG2_2 field of the Rx descriptor. + * + * VIRTCHNL_VLAN_PRIO - This field supports VLAN priority bits. This is used for + * VLAN filtering if the underlying PF supports it. + * + * VIRTCHNL_VLAN_TOGGLE_ALLOWED - This field is used to say whether a + * certain VLAN capability can be toggled. For example if the underlying PF/CP + * allows the VF to toggle VLAN filtering, stripping, and/or insertion it should + * set this bit along with the supported ethertypes. + */ +enum virtchnl_vlan_support { + VIRTCHNL_VLAN_UNSUPPORTED = 0, + VIRTCHNL_VLAN_ETHERTYPE_8100 = 0x00000001, + VIRTCHNL_VLAN_ETHERTYPE_88A8 = 0x00000002, + VIRTCHNL_VLAN_ETHERTYPE_9100 = 0x00000004, + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1 = 0x00000100, + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2 = 0x00000200, + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2 = 0x00000400, + VIRTCHNL_VLAN_PRIO = 0x01000000, + VIRTCHNL_VLAN_FILTER_MASK = 0x10000000, + VIRTCHNL_VLAN_ETHERTYPE_AND = 0x20000000, + VIRTCHNL_VLAN_ETHERTYPE_XOR = 0x40000000, + VIRTCHNL_VLAN_TOGGLE = 0x80000000 +}; + +/* This structure is used as part of the VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS + * for filtering, insertion, and stripping capabilities. + * + * If only outer capabilities are supported (for filtering, insertion, and/or + * stripping) then this refers to the outer most or single VLAN from the VF's + * perspective. + * + * If only inner capabilities are supported (for filtering, insertion, and/or + * stripping) then this refers to the outer most or single VLAN from the VF's + * perspective. Functionally this is the same as if only outer capabilities are + * supported. The VF driver is just forced to use the inner fields when + * adding/deleting filters and enabling/disabling offloads (if supported). 
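To make the AND/XOR semantics concrete, here is a hedged helper that reports whether a PF-supplied capability word allows 0x8100 and 0x88A8 to be enabled at the same time (only true when VIRTCHNL_VLAN_ETHERTYPE_AND accompanies both ethertype bits):

static bool both_tpids_concurrent(u32 caps)
{
	return (caps & VIRTCHNL_VLAN_ETHERTYPE_8100) &&
	       (caps & VIRTCHNL_VLAN_ETHERTYPE_88A8) &&
	       (caps & VIRTCHNL_VLAN_ETHERTYPE_AND);
}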
+ * + * If both outer and inner capabilities are supported (for filtering, insertion, + * and/or stripping) then outer refers to the outer most or single VLAN and + * inner refers to the second VLAN, if it exists, in the packet. + * + * There is no support for tunneled VLAN offloads, so outer or inner are never + * referring to a tunneled packet from the VF's perspective. + */ +struct virtchnl_vlan_supported_caps { + u32 outer; + u32 inner; +}; + +/* The PF populates these fields based on the supported VLAN filtering. If a + * field is VIRTCHNL_VLAN_UNSUPPORTED then it's not supported and the PF will + * reject any VIRTCHNL_OP_ADD_VLAN_V2 or VIRTCHNL_OP_DEL_VLAN_V2 messages using + * the unsupported fields. + * + * Also, a VF is only allowed to toggle its VLAN filtering setting if the + * VIRTCHNL_VLAN_TOGGLE bit is set. + * + * The ethertype(s) specified in the ethertype_init field are the ethertypes + * enabled for VLAN filtering. VLAN filtering in this case refers to the outer + * most VLAN from the VF's perspective. If both inner and outer filtering are + * allowed then ethertype_init only refers to the outer most VLAN as only + * VLAN ethertype supported for inner VLAN filtering is + * VIRTCHNL_VLAN_ETHERTYPE_8100. By default, inner VLAN filtering is disabled + * when both inner and outer filtering are allowed. + * + * The max_filters field tells the VF how many VLAN filters it's allowed to have + * at any one time. If it exceeds this amount and tries to add another filter, + * then the request will be rejected by the PF. To prevent failures, the VF + * should keep track of how many VLAN filters it has added and not attempt to + * add more than max_filters. + */ +struct virtchnl_vlan_filtering_caps { + struct virtchnl_vlan_supported_caps filtering_support; + u32 ethertype_init; + u16 max_filters; + u8 pad[2]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vlan_filtering_caps); + +/* This enum is used for the virtchnl_vlan_offload_caps structure to specify + * if the PF supports a different ethertype for stripping and insertion. + * + * VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION - The ethertype(s) specified + * for stripping affect the ethertype(s) specified for insertion and visa versa + * as well. If the VF tries to configure VLAN stripping via + * VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 with VIRTCHNL_VLAN_ETHERTYPE_8100 then + * that will be the ethertype for both stripping and insertion. + * + * VIRTCHNL_ETHERTYPE_MATCH_NOT_REQUIRED - The ethertype(s) specified for + * stripping do not affect the ethertype(s) specified for insertion and visa + * versa. + */ +enum virtchnl_vlan_ethertype_match { + VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION = 0, + VIRTCHNL_ETHERTYPE_MATCH_NOT_REQUIRED = 1, +}; + +/* The PF populates these fields based on the supported VLAN offloads. If a + * field is VIRTCHNL_VLAN_UNSUPPORTED then it's not supported and the PF will + * reject any VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 or + * VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 messages using the unsupported fields. + * + * Also, a VF is only allowed to toggle its VLAN offload setting if the + * VIRTCHNL_VLAN_TOGGLE_ALLOWED bit is set. + * + * The VF driver needs to be aware of how the tags are stripped by hardware and + * inserted by the VF driver based on the level of offload support. The PF will + * populate these fields based on where the VLAN tags are expected to be + * offloaded via the VIRTHCNL_VLAN_TAG_LOCATION_* bits. The VF will need to + * interpret these fields. 
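A sketch of the bookkeeping suggested above for virtchnl_vlan_filtering_caps.max_filters; active_filters is a counter the VF driver is assumed to maintain itself:

static bool can_add_vlan_filter(const struct virtchnl_vlan_filtering_caps *caps,
				u16 active_filters)
{
	/* adding beyond max_filters would be rejected by the PF anyway */
	return active_filters < caps->max_filters;
}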
See the definition of the + * VIRTCHNL_VLAN_TAG_LOCATION_* bits above the virtchnl_vlan_support + * enumeration. + */ +struct virtchnl_vlan_offload_caps { + struct virtchnl_vlan_supported_caps stripping_support; + struct virtchnl_vlan_supported_caps insertion_support; + u32 ethertype_init; + u8 ethertype_match; + u8 pad[3]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(24, virtchnl_vlan_offload_caps); + +/* VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS + * VF sends this message to determine its VLAN capabilities. + * + * PF will mark which capabilities it supports based on hardware support and + * current configuration. For example, if a port VLAN is configured the PF will + * not allow outer VLAN filtering, stripping, or insertion to be configured so + * it will block these features from the VF. + * + * The VF will need to cross reference its capabilities with the PFs + * capabilities in the response message from the PF to determine the VLAN + * support. + */ +struct virtchnl_vlan_caps { + struct virtchnl_vlan_filtering_caps filtering; + struct virtchnl_vlan_offload_caps offloads; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(40, virtchnl_vlan_caps); + +struct virtchnl_vlan { + u16 tci; /* tci[15:13] = PCP and tci[11:0] = VID */ + u16 tci_mask; /* only valid if VIRTCHNL_VLAN_FILTER_MASK set in + * filtering caps + */ + u16 tpid; /* 0x8100, 0x88a8, etc. and only type(s) set in + * filtering caps. Note that tpid here does not refer to + * VIRTCHNL_VLAN_ETHERTYPE_*, but it refers to the + * actual 2-byte VLAN TPID + */ + u8 pad[2]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_vlan); + +struct virtchnl_vlan_filter { + struct virtchnl_vlan inner; + struct virtchnl_vlan outer; + u8 pad[16]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(32, virtchnl_vlan_filter); + +/* VIRTCHNL_OP_ADD_VLAN_V2 + * VIRTCHNL_OP_DEL_VLAN_V2 + * + * VF sends these messages to add/del one or more VLAN tag filters for Rx + * traffic. + * + * The PF attempts to add the filters and returns status. + * + * The VF should only ever attempt to add/del virtchnl_vlan_filter(s) using the + * supported fields negotiated via VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS. + */ +struct virtchnl_vlan_filter_list_v2 { + u16 vport_id; + u16 num_elements; + u8 pad[4]; + struct virtchnl_vlan_filter filters[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(40, virtchnl_vlan_filter_list_v2); + +/* VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 + * VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 + * VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 + * VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2 + * + * VF sends this message to enable or disable VLAN stripping or insertion. It + * also needs to specify an ethertype. The VF knows which VLAN ethertypes are + * allowed and whether or not it's allowed to enable/disable the specific + * offload via the VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS message. The VF needs to + * parse the virtchnl_vlan_caps.offloads fields to determine which offload + * messages are allowed. + * + * For example, if the PF populates the virtchnl_vlan_caps.offloads in the + * following manner the VF will be allowed to enable and/or disable 0x8100 inner + * VLAN insertion and/or stripping via the opcodes listed above. Inner in this + * case means the outer most or single VLAN from the VF's perspective. This is + * because no outer offloads are supported. See the comments above the + * virtchnl_vlan_supported_caps structure for more details. 
+ * + * virtchnl_vlan_caps.offloads.stripping_support.inner = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100; + * + * virtchnl_vlan_caps.offloads.insertion_support.inner = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100; + * + * In order to enable inner (again note that in this case inner is the outer + * most or single VLAN from the VF's perspective) VLAN stripping for 0x8100 + * VLANs, the VF would populate the virtchnl_vlan_setting structure in the + * following manner and send the VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 message. + * + * virtchnl_vlan_setting.inner_ethertype_setting = + * VIRTCHNL_VLAN_ETHERTYPE_8100; + * + * virtchnl_vlan_setting.vport_id = vport_id or vsi_id assigned to the VF on + * initialization. + * + * The reason that VLAN TPID(s) are not being used for the + * outer_ethertype_setting and inner_ethertype_setting fields is because it's + * possible a device could support VLAN insertion and/or stripping offload on + * multiple ethertypes concurrently, so this method allows a VF to request + * multiple ethertypes in one message using the virtchnl_vlan_support + * enumeration. + * + * For example, if the PF populates the virtchnl_vlan_caps.offloads in the + * following manner the VF will be allowed to enable 0x8100 and 0x88a8 outer + * VLAN insertion and stripping simultaneously. The + * virtchnl_vlan_caps.offloads.ethertype_match field will also have to be + * populated based on what the PF can support. + * + * virtchnl_vlan_caps.offloads.stripping_support.outer = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_AND; + * + * virtchnl_vlan_caps.offloads.insertion_support.outer = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_AND; + * + * In order to enable outer VLAN stripping for 0x8100 and 0x88a8 VLANs, the VF + * would populate the virthcnl_vlan_offload_structure in the following manner + * and send the VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 message. + * + * virtchnl_vlan_setting.outer_ethertype_setting = + * VIRTHCNL_VLAN_ETHERTYPE_8100 | + * VIRTHCNL_VLAN_ETHERTYPE_88A8; + * + * virtchnl_vlan_setting.vport_id = vport_id or vsi_id assigned to the VF on + * initialization. + * + * There is also the case where a PF and the underlying hardware can support + * VLAN offloads on multiple ethertypes, but not concurrently. For example, if + * the PF populates the virtchnl_vlan_caps.offloads in the following manner the + * VF will be allowed to enable and/or disable 0x8100 XOR 0x88a8 outer VLAN + * offloads. The ethertypes must match for stripping and insertion. + * + * virtchnl_vlan_caps.offloads.stripping_support.outer = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_XOR; + * + * virtchnl_vlan_caps.offloads.insertion_support.outer = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_XOR; + * + * virtchnl_vlan_caps.offloads.ethertype_match = + * VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION; + * + * In order to enable outer VLAN stripping for 0x88a8 VLANs, the VF would + * populate the virtchnl_vlan_setting structure in the following manner and send + * the VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2. Also, this will change the + * ethertype for VLAN insertion if it's enabled. 
So, for completeness, a + * VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 with the same ethertype should be sent. + * + * virtchnl_vlan_setting.outer_ethertype_setting = VIRTHCNL_VLAN_ETHERTYPE_88A8; + * + * virtchnl_vlan_setting.vport_id = vport_id or vsi_id assigned to the VF on + * initialization. + * + * VIRTCHNL_OP_ENABLE_VLAN_FILTERING_V2 + * VIRTCHNL_OP_DISABLE_VLAN_FILTERING_V2 + * + * VF sends this message to enable or disable VLAN filtering. It also needs to + * specify an ethertype. The VF knows which VLAN ethertypes are allowed and + * whether or not it's allowed to enable/disable filtering via the + * VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS message. The VF needs to + * parse the virtchnl_vlan_caps.filtering fields to determine which, if any, + * filtering messages are allowed. + * + * For example, if the PF populates the virtchnl_vlan_caps.filtering in the + * following manner the VF will be allowed to enable/disable 0x8100 and 0x88a8 + * outer VLAN filtering together. Note, that the VIRTCHNL_VLAN_ETHERTYPE_AND + * means that all filtering ethertypes will to be enabled and disabled together + * regardless of the request from the VF. This means that the underlying + * hardware only supports VLAN filtering for all VLAN the specified ethertypes + * or none of them. + * + * virtchnl_vlan_caps.filtering.filtering_support.outer = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTHCNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_9100 | + * VIRTCHNL_VLAN_ETHERTYPE_AND; + * + * In order to enable outer VLAN filtering for 0x88a8 and 0x8100 VLANs (0x9100 + * VLANs aren't supported by the VF driver), the VF would populate the + * virtchnl_vlan_setting structure in the following manner and send the + * VIRTCHNL_OP_ENABLE_VLAN_FILTERING_V2. The same message format would be used + * to disable outer VLAN filtering for 0x88a8 and 0x8100 VLANs, but the + * VIRTCHNL_OP_DISABLE_VLAN_FILTERING_V2 opcode is used. + * + * virtchnl_vlan_setting.outer_ethertype_setting = + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8; + * + */ +struct virtchnl_vlan_setting { + u32 outer_ethertype_setting; + u32 inner_ethertype_setting; + u16 vport_id; + u8 pad[6]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vlan_setting); + +/* VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE + * VF sends VSI id and flags. + * PF returns status code in retval. + * Note: we assume that broadcast accept mode is always enabled. + */ +struct virtchnl_promisc_info { + u16 vsi_id; + u16 flags; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(4, virtchnl_promisc_info); + +#define FLAG_VF_UNICAST_PROMISC 0x00000001 +#define FLAG_VF_MULTICAST_PROMISC 0x00000002 + +/* VIRTCHNL_OP_GET_STATS + * VF sends this message to request stats for the selected VSI. VF uses + * the virtchnl_queue_select struct to specify the VSI. The queue_id + * field is ignored by the PF. + * + * PF replies with struct virtchnl_eth_stats in an external buffer. + */ + +struct virtchnl_eth_stats { + u64 rx_bytes; /* received bytes */ + u64 rx_unicast; /* received unicast pkts */ + u64 rx_multicast; /* received multicast pkts */ + u64 rx_broadcast; /* received broadcast pkts */ + u64 rx_discards; + u64 rx_unknown_protocol; + u64 tx_bytes; /* transmitted bytes */ + u64 tx_unicast; /* transmitted unicast pkts */ + u64 tx_multicast; /* transmitted multicast pkts */ + u64 tx_broadcast; /* transmitted broadcast pkts */ + u64 tx_discards; + u64 tx_errors; +}; + +/* VIRTCHNL_OP_CONFIG_RSS_KEY + * VIRTCHNL_OP_CONFIG_RSS_LUT + * VF sends these messages to configure RSS. 
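Mirroring the stripping examples above, a minimal sketch of filling virtchnl_vlan_setting before sending VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 for 0x8100 outer tags (vport_id comes from the VF's resource information):

static void fill_strip_8100(struct virtchnl_vlan_setting *s, u16 vport_id)
{
	*s = (struct virtchnl_vlan_setting) {
		.outer_ethertype_setting = VIRTCHNL_VLAN_ETHERTYPE_8100,
		.vport_id = vport_id,
	};
}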
Only supported if both PF + * and VF drivers set the VIRTCHNL_VF_OFFLOAD_RSS_PF bit during + * configuration negotiation. If this is the case, then the RSS fields in + * the VF resource struct are valid. + * Both the key and LUT are initialized to 0 by the PF, meaning that + * RSS is effectively disabled until set up by the VF. + */ +struct virtchnl_rss_key { + u16 vsi_id; + u16 key_len; + u8 key[1]; /* RSS hash key, packed bytes */ +}; + +VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_rss_key); + +struct virtchnl_rss_lut { + u16 vsi_id; + u16 lut_entries; + u8 lut[1]; /* RSS lookup table */ +}; + +VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_rss_lut); + +/* VIRTCHNL_OP_GET_RSS_HENA_CAPS + * VIRTCHNL_OP_SET_RSS_HENA + * VF sends these messages to get and set the hash filter enable bits for RSS. + * By default, the PF sets these to all possible traffic types that the + * hardware supports. The VF can query this value if it wants to change the + * traffic types that are hashed by the hardware. + */ +struct virtchnl_rss_hena { + u64 hena; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_rss_hena); + +/* Type of RSS algorithm */ +enum virtchnl_rss_algorithm { + VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC = 0, + VIRTCHNL_RSS_ALG_R_ASYMMETRIC = 1, + VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC = 2, + VIRTCHNL_RSS_ALG_XOR_SYMMETRIC = 3, +}; + +/* This is used by PF driver to enforce how many channels can be supported. + * When ADQ_V2 capability is negotiated, it will allow 16 channels otherwise + * PF driver will allow only max 4 channels + */ +#define VIRTCHNL_MAX_ADQ_CHANNELS 4 +#define VIRTCHNL_MAX_ADQ_V2_CHANNELS 16 + +/* VIRTCHNL_OP_ENABLE_CHANNELS + * VIRTCHNL_OP_DISABLE_CHANNELS + * VF sends these messages to enable or disable channels based on + * the user specified queue count and queue offset for each traffic class. + * This struct encompasses all the information that the PF needs from + * VF to create a channel. + */ +struct virtchnl_channel_info { + u16 count; /* number of queues in a channel */ + u16 offset; /* queues in a channel start from 'offset' */ + u32 pad; + u64 max_tx_rate; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_channel_info); + +struct virtchnl_tc_info { + u32 num_tc; + u32 pad; + struct virtchnl_channel_info list[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(24, virtchnl_tc_info); + +/* VIRTCHNL_ADD_CLOUD_FILTER + * VIRTCHNL_DEL_CLOUD_FILTER + * VF sends these messages to add or delete a cloud filter based on the + * user specified match and action filters. These structures encompass + * all the information that the PF needs from the VF to add/delete a + * cloud filter. 
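Because key[1] is a one-element placeholder for a variable-length tail, the message size for VIRTCHNL_OP_CONFIG_RSS_KEY works out to the struct size plus key_len - 1, which matches the length check in virtchnl_vc_validate_vf_msg() further down; a sketch:

static u16 rss_key_msg_len(u16 key_len)
{
	/* key_len of 0 is also accepted by the PF (see the validation code) */
	return (u16)(sizeof(struct virtchnl_rss_key) + key_len - 1);
}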
+ */ + +struct virtchnl_l4_spec { + u8 src_mac[ETH_ALEN]; + u8 dst_mac[ETH_ALEN]; + /* vlan_prio is part of this 16 bit field even from OS perspective + * vlan_id:12 is actual vlan_id, then vlanid:bit14..12 is vlan_prio + * in future, when decided to offload vlan_prio, pass that information + * as part of the "vlan_id" field, Bit14..12 + */ + __be16 vlan_id; + __be16 pad; /* reserved for future use */ + __be32 src_ip[4]; + __be32 dst_ip[4]; + __be16 src_port; + __be16 dst_port; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(52, virtchnl_l4_spec); + +union virtchnl_flow_spec { + struct virtchnl_l4_spec tcp_spec; + u8 buffer[128]; /* reserved for future use */ +}; + +VIRTCHNL_CHECK_UNION_LEN(128, virtchnl_flow_spec); + +enum virtchnl_action { + /* action types */ + VIRTCHNL_ACTION_DROP = 0, + VIRTCHNL_ACTION_TC_REDIRECT, + VIRTCHNL_ACTION_PASSTHRU, + VIRTCHNL_ACTION_QUEUE, + VIRTCHNL_ACTION_Q_REGION, + VIRTCHNL_ACTION_MARK, + VIRTCHNL_ACTION_COUNT, +}; + +enum virtchnl_flow_type { + /* flow types */ + VIRTCHNL_TCP_V4_FLOW = 0, + VIRTCHNL_TCP_V6_FLOW, + VIRTCHNL_UDP_V4_FLOW, + VIRTCHNL_UDP_V6_FLOW, +}; + +struct virtchnl_filter { + union virtchnl_flow_spec data; + union virtchnl_flow_spec mask; + + /* see enum virtchnl_flow_type */ + s32 flow_type; + + /* see enum virtchnl_action */ + s32 action; + u32 action_meta; + u8 field_flags; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(272, virtchnl_filter); + +/* VIRTCHNL_OP_DCF_GET_VSI_MAP + * VF sends this message to get VSI mapping table. + * PF responds with an indirect message containing VF's + * HW VSI IDs. + * The index of vf_vsi array is the logical VF ID, the + * value of vf_vsi array is the VF's HW VSI ID with its + * valid configuration. + */ +struct virtchnl_dcf_vsi_map { + u16 pf_vsi; /* PF's HW VSI ID */ + u16 num_vfs; /* The actual number of VFs allocated */ +#define VIRTCHNL_DCF_VF_VSI_ID_S 0 +#define VIRTCHNL_DCF_VF_VSI_ID_M (0xFFF << VIRTCHNL_DCF_VF_VSI_ID_S) +#define VIRTCHNL_DCF_VF_VSI_VALID BIT(15) + u16 vf_vsi[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_dcf_vsi_map); + +#define PKG_NAME_SIZE 32 +#define DSN_SIZE 8 + +struct pkg_version { + u8 major; + u8 minor; + u8 update; + u8 draft; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(4, pkg_version); + +struct virtchnl_pkg_info { + struct pkg_version pkg_ver; + u32 track_id; + char pkg_name[PKG_NAME_SIZE]; + u8 dsn[DSN_SIZE]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(48, virtchnl_pkg_info); + +/* VIRTCHNL_OP_DCF_VLAN_OFFLOAD + * DCF negotiates the VIRTCHNL_VF_OFFLOAD_VLAN_V2 capability firstly to get + * the double VLAN configuration, then DCF sends this message to configure the + * outer or inner VLAN offloads (insertion and strip) for the target VF. 
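A sketch of decoding one entry of the VSI map returned by VIRTCHNL_OP_DCF_GET_VSI_MAP using the masks defined above (helper name invented):

static bool dcf_vf_hw_vsi_id(const struct virtchnl_dcf_vsi_map *map,
			     u16 logical_vf_id, u16 *hw_vsi_id)
{
	u16 entry;

	if (logical_vf_id >= map->num_vfs)
		return false;
	entry = map->vf_vsi[logical_vf_id];
	if (!(entry & VIRTCHNL_DCF_VF_VSI_VALID))
		return false;	/* VF has no valid configuration yet */
	*hw_vsi_id = (entry & VIRTCHNL_DCF_VF_VSI_ID_M) >> VIRTCHNL_DCF_VF_VSI_ID_S;
	return true;
}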
+ */ +struct virtchnl_dcf_vlan_offload { + u16 vf_id; + u16 tpid; + u16 vlan_flags; +#define VIRTCHNL_DCF_VLAN_TYPE_S 0 +#define VIRTCHNL_DCF_VLAN_TYPE_M \ + (0x1 << VIRTCHNL_DCF_VLAN_TYPE_S) +#define VIRTCHNL_DCF_VLAN_TYPE_INNER 0x0 +#define VIRTCHNL_DCF_VLAN_TYPE_OUTER 0x1 +#define VIRTCHNL_DCF_VLAN_INSERT_MODE_S 1 +#define VIRTCHNL_DCF_VLAN_INSERT_MODE_M \ + (0x7 << VIRTCHNL_DCF_VLAN_INSERT_MODE_S) +#define VIRTCHNL_DCF_VLAN_INSERT_DISABLE 0x1 +#define VIRTCHNL_DCF_VLAN_INSERT_PORT_BASED 0x2 +#define VIRTCHNL_DCF_VLAN_INSERT_VIA_TX_DESC 0x3 +#define VIRTCHNL_DCF_VLAN_STRIP_MODE_S 4 +#define VIRTCHNL_DCF_VLAN_STRIP_MODE_M \ + (0x7 << VIRTCHNL_DCF_VLAN_STRIP_MODE_S) +#define VIRTCHNL_DCF_VLAN_STRIP_DISABLE 0x1 +#define VIRTCHNL_DCF_VLAN_STRIP_ONLY 0x2 +#define VIRTCHNL_DCF_VLAN_STRIP_INTO_RX_DESC 0x3 + u16 vlan_id; + u16 pad[4]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_dcf_vlan_offload); + +struct virtchnl_supported_rxdids { + /* see enum virtchnl_rx_desc_id_bitmasks */ + u64 supported_rxdids; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_supported_rxdids); + +/* VIRTCHNL_OP_EVENT + * PF sends this message to inform the VF driver of events that may affect it. + * No direct response is expected from the VF, though it may generate other + * messages in response to this one. + */ +enum virtchnl_event_codes { + VIRTCHNL_EVENT_UNKNOWN = 0, + VIRTCHNL_EVENT_LINK_CHANGE, + VIRTCHNL_EVENT_RESET_IMPENDING, + VIRTCHNL_EVENT_PF_DRIVER_CLOSE, + VIRTCHNL_EVENT_DCF_VSI_MAP_UPDATE, + VIRTCHNL_EVENT_DCF_VSI_INFO = 1000, +}; + +#define PF_EVENT_SEVERITY_INFO 0 +#define PF_EVENT_SEVERITY_CERTAIN_DOOM 255 + +struct virtchnl_pf_event { + /* see enum virtchnl_event_codes */ + s32 event; + union { + /* If the PF driver does not support the new speed reporting + * capabilities then use link_event else use link_event_adv to + * get the speed and link information. The ability to understand + * new speeds is indicated by setting the capability flag + * VIRTCHNL_VF_CAP_ADV_LINK_SPEED in vf_cap_flags parameter + * in virtchnl_vf_resource struct and can be used to determine + * which link event struct to use below. + */ + struct { + enum virtchnl_link_speed link_speed; + bool link_status; + u8 pad[3]; + } link_event; + struct { + /* link_speed provided in Mbps */ + u32 link_speed; + u8 link_status; + u8 pad[3]; + } link_event_adv; + struct { + /* link_speed provided in Mbps */ + u32 link_speed; + u16 vport_id; + u8 link_status; + u8 pad; + } link_event_adv_vport; + struct { + u16 vf_id; + u16 vsi_id; + u32 pad; + } vf_vsi_map; + } event_data; + + s32 severity; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_pf_event); + +/* used to specify if a ceq_idx or aeq_idx is invalid */ +#define VIRTCHNL_RDMA_INVALID_QUEUE_IDX 0xFFFF +/* VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP + * VF uses this message to request PF to map RDMA vectors to RDMA queues. + * The request for this originates from the VF RDMA driver through + * a client interface between VF LAN and VF RDMA driver. + * A vector could have an AEQ and CEQ attached to it although + * there is a single AEQ per VF RDMA instance in which case + * most vectors will have an VIRTCHNL_RDMA_INVALID_QUEUE_IDX for aeq and valid + * idx for ceqs There will never be a case where there will be multiple CEQs + * attached to a single vector. + * PF configures interrupt mapping and returns status. 
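A hedged sketch of picking the correct link-speed field from a VIRTCHNL_EVENT_LINK_CHANGE event: the advanced format reports Mbps directly, while the legacy format reports an enum virtchnl_link_speed value that the VF must translate itself (translation not shown):

static u32 link_speed_mbps(const struct virtchnl_pf_event *ev, u32 vf_cap_flags)
{
	if (vf_cap_flags & VIRTCHNL_VF_CAP_ADV_LINK_SPEED)
		return ev->event_data.link_event_adv.link_speed;
	return 0;	/* caller must decode link_event.link_speed instead */
}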
+ */ +#define virtchnl_iwarp_qv_info virtchnl_rdma_qv_info +struct virtchnl_rdma_qv_info { + u32 v_idx; /* msix_vector */ + u16 ceq_idx; /* set to VIRTCHNL_RDMA_INVALID_QUEUE_IDX if invalid */ + u16 aeq_idx; /* set to VIRTCHNL_RDMA_INVALID_QUEUE_IDX if invalid */ + u8 itr_idx; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_rdma_qv_info); + +#define virtchnl_iwarp_qvlist_info virtchnl_rdma_qvlist_info +struct virtchnl_rdma_qvlist_info { + u32 num_vectors; + struct virtchnl_rdma_qv_info qv_info[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_rdma_qvlist_info); + + +/* VF reset states - these are written into the RSTAT register: + * VFGEN_RSTAT on the VF + * When the PF initiates a reset, it writes 0 + * When the reset is complete, it writes 1 + * When the PF detects that the VF has recovered, it writes 2 + * VF checks this register periodically to determine if a reset has occurred, + * then polls it to know when the reset is complete. + * If either the PF or VF reads the register while the hardware + * is in a reset state, it will return DEADBEEF, which, when masked + * will result in 3. + */ +enum virtchnl_vfr_states { + VIRTCHNL_VFR_INPROGRESS = 0, + VIRTCHNL_VFR_COMPLETED, + VIRTCHNL_VFR_VFACTIVE, +}; + +#define VIRTCHNL_MAX_NUM_PROTO_HDRS 32 +#define PROTO_HDR_SHIFT 5 +#define PROTO_HDR_FIELD_START(proto_hdr_type) \ + (proto_hdr_type << PROTO_HDR_SHIFT) +#define PROTO_HDR_FIELD_MASK ((1UL << PROTO_HDR_SHIFT) - 1) + +/* VF use these macros to configure each protocol header. + * Specify which protocol headers and protocol header fields base on + * virtchnl_proto_hdr_type and virtchnl_proto_hdr_field. + * @param hdr: a struct of virtchnl_proto_hdr + * @param hdr_type: ETH/IPV4/TCP, etc + * @param field: SRC/DST/TEID/SPI, etc + */ +#define VIRTCHNL_ADD_PROTO_HDR_FIELD(hdr, field) \ + ((hdr)->field_selector |= BIT((field) & PROTO_HDR_FIELD_MASK)) +#define VIRTCHNL_DEL_PROTO_HDR_FIELD(hdr, field) \ + ((hdr)->field_selector &= ~BIT((field) & PROTO_HDR_FIELD_MASK)) +#define VIRTCHNL_TEST_PROTO_HDR_FIELD(hdr, val) \ + ((hdr)->field_selector & BIT((val) & PROTO_HDR_FIELD_MASK)) +#define VIRTCHNL_GET_PROTO_HDR_FIELD(hdr) ((hdr)->field_selector) + +#define VIRTCHNL_ADD_PROTO_HDR_FIELD_BIT(hdr, hdr_type, field) \ + (VIRTCHNL_ADD_PROTO_HDR_FIELD(hdr, \ + VIRTCHNL_PROTO_HDR_ ## hdr_type ## _ ## field)) +#define VIRTCHNL_DEL_PROTO_HDR_FIELD_BIT(hdr, hdr_type, field) \ + (VIRTCHNL_DEL_PROTO_HDR_FIELD(hdr, \ + VIRTCHNL_PROTO_HDR_ ## hdr_type ## _ ## field)) + +#define VIRTCHNL_SET_PROTO_HDR_TYPE(hdr, hdr_type) \ + ((hdr)->type = VIRTCHNL_PROTO_HDR_ ## hdr_type) +#define VIRTCHNL_GET_PROTO_HDR_TYPE(hdr) \ + (((hdr)->type) >> PROTO_HDR_SHIFT) +#define VIRTCHNL_TEST_PROTO_HDR_TYPE(hdr, val) \ + ((hdr)->type == ((s32)((val) >> PROTO_HDR_SHIFT))) +#define VIRTCHNL_TEST_PROTO_HDR(hdr, val) \ + (VIRTCHNL_TEST_PROTO_HDR_TYPE(hdr, val) && \ + VIRTCHNL_TEST_PROTO_HDR_FIELD(hdr, val)) + +/* Protocol header type within a packet segment. A segment consists of one or + * more protocol headers that make up a logical group of protocol headers. Each + * logical group of protocol headers encapsulates or is encapsulated using/by + * tunneling or encapsulation protocols for network virtualization. 
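As a usage sketch for the field-selection macros above (relying on struct virtchnl_proto_hdr and the IPV4 enum entries defined a little further down), selecting the outer IPv4 source and destination addresses for a hash or filter rule could look like:

static void select_ipv4_addrs(struct virtchnl_proto_hdr *hdr)
{
	VIRTCHNL_SET_PROTO_HDR_TYPE(hdr, IPV4);
	VIRTCHNL_ADD_PROTO_HDR_FIELD_BIT(hdr, IPV4, SRC);
	VIRTCHNL_ADD_PROTO_HDR_FIELD_BIT(hdr, IPV4, DST);
}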
+ */ +enum virtchnl_proto_hdr_type { + VIRTCHNL_PROTO_HDR_NONE, + VIRTCHNL_PROTO_HDR_ETH, + VIRTCHNL_PROTO_HDR_S_VLAN, + VIRTCHNL_PROTO_HDR_C_VLAN, + VIRTCHNL_PROTO_HDR_IPV4, + VIRTCHNL_PROTO_HDR_IPV6, + VIRTCHNL_PROTO_HDR_TCP, + VIRTCHNL_PROTO_HDR_UDP, + VIRTCHNL_PROTO_HDR_SCTP, + VIRTCHNL_PROTO_HDR_GTPU_IP, + VIRTCHNL_PROTO_HDR_GTPU_EH, + VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_DWN, + VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_UP, + VIRTCHNL_PROTO_HDR_PPPOE, + VIRTCHNL_PROTO_HDR_L2TPV3, + VIRTCHNL_PROTO_HDR_ESP, + VIRTCHNL_PROTO_HDR_AH, + VIRTCHNL_PROTO_HDR_PFCP, + VIRTCHNL_PROTO_HDR_GTPC, + VIRTCHNL_PROTO_HDR_ECPRI, + VIRTCHNL_PROTO_HDR_L2TPV2, + VIRTCHNL_PROTO_HDR_PPP, + /* IPv4 and IPv6 Fragment header types are only associated to + * VIRTCHNL_PROTO_HDR_IPV4 and VIRTCHNL_PROTO_HDR_IPV6 respectively, + * cannot be used independently. + */ + VIRTCHNL_PROTO_HDR_IPV4_FRAG, + VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG, +}; + +/* Protocol header field within a protocol header. */ +enum virtchnl_proto_hdr_field { + /* ETHER */ + VIRTCHNL_PROTO_HDR_ETH_SRC = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_ETH), + VIRTCHNL_PROTO_HDR_ETH_DST, + VIRTCHNL_PROTO_HDR_ETH_ETHERTYPE, + /* S-VLAN */ + VIRTCHNL_PROTO_HDR_S_VLAN_ID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_S_VLAN), + /* C-VLAN */ + VIRTCHNL_PROTO_HDR_C_VLAN_ID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_C_VLAN), + /* IPV4 */ + VIRTCHNL_PROTO_HDR_IPV4_SRC = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV4), + VIRTCHNL_PROTO_HDR_IPV4_DST, + VIRTCHNL_PROTO_HDR_IPV4_DSCP, + VIRTCHNL_PROTO_HDR_IPV4_TTL, + VIRTCHNL_PROTO_HDR_IPV4_PROT, + /* IPV6 */ + VIRTCHNL_PROTO_HDR_IPV6_SRC = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV6), + VIRTCHNL_PROTO_HDR_IPV6_DST, + VIRTCHNL_PROTO_HDR_IPV6_TC, + VIRTCHNL_PROTO_HDR_IPV6_HOP_LIMIT, + VIRTCHNL_PROTO_HDR_IPV6_PROT, + /* IPV6 Prefix */ + VIRTCHNL_PROTO_HDR_IPV6_PREFIX32_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX32_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX40_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX40_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX48_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX48_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX56_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX56_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX96_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX96_DST, + /* TCP */ + VIRTCHNL_PROTO_HDR_TCP_SRC_PORT = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_TCP), + VIRTCHNL_PROTO_HDR_TCP_DST_PORT, + /* UDP */ + VIRTCHNL_PROTO_HDR_UDP_SRC_PORT = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_UDP), + VIRTCHNL_PROTO_HDR_UDP_DST_PORT, + /* SCTP */ + VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_SCTP), + VIRTCHNL_PROTO_HDR_SCTP_DST_PORT, + /* GTPU_IP */ + VIRTCHNL_PROTO_HDR_GTPU_IP_TEID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPU_IP), + /* GTPU_EH */ + VIRTCHNL_PROTO_HDR_GTPU_EH_PDU = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPU_EH), + VIRTCHNL_PROTO_HDR_GTPU_EH_QFI, + /* PPPOE */ + VIRTCHNL_PROTO_HDR_PPPOE_SESS_ID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_PPPOE), + /* L2TPV3 */ + VIRTCHNL_PROTO_HDR_L2TPV3_SESS_ID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_L2TPV3), + /* ESP */ + VIRTCHNL_PROTO_HDR_ESP_SPI = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_ESP), + /* AH */ + VIRTCHNL_PROTO_HDR_AH_SPI = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_AH), + /* PFCP */ + VIRTCHNL_PROTO_HDR_PFCP_S_FIELD = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_PFCP), + VIRTCHNL_PROTO_HDR_PFCP_SEID, + /* GTPC */ + VIRTCHNL_PROTO_HDR_GTPC_TEID = + 
PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPC), + /* ECPRI */ + VIRTCHNL_PROTO_HDR_ECPRI_MSG_TYPE = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_ECPRI), + VIRTCHNL_PROTO_HDR_ECPRI_PC_RTC_ID, + /* IPv4 Dummy Fragment */ + VIRTCHNL_PROTO_HDR_IPV4_FRAG_PKID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV4_FRAG), + /* IPv6 Extension Fragment */ + VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG_PKID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG), +}; + +struct virtchnl_proto_hdr { + /* see enum virtchnl_proto_hdr_type */ + s32 type; + u32 field_selector; /* a bit mask to select field for header type */ + u8 buffer[64]; + /** + * binary buffer in network order for specific header type. + * For example, if type = VIRTCHNL_PROTO_HDR_IPV4, a IPv4 + * header is expected to be copied into the buffer. + */ +}; + +VIRTCHNL_CHECK_STRUCT_LEN(72, virtchnl_proto_hdr); + +struct virtchnl_proto_hdrs { + u8 tunnel_level; + /** + * specify where protocol header start from. + * 0 - from the outer layer + * 1 - from the first inner layer + * 2 - from the second inner layer + * .... + **/ + int count; /* the proto layers must < VIRTCHNL_MAX_NUM_PROTO_HDRS */ + struct virtchnl_proto_hdr proto_hdr[VIRTCHNL_MAX_NUM_PROTO_HDRS]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(2312, virtchnl_proto_hdrs); + +struct virtchnl_rss_cfg { + struct virtchnl_proto_hdrs proto_hdrs; /* protocol headers */ + + /* see enum virtchnl_rss_algorithm; rss algorithm type */ + s32 rss_algorithm; + u8 reserved[128]; /* reserve for future */ +}; + +VIRTCHNL_CHECK_STRUCT_LEN(2444, virtchnl_rss_cfg); + +/* action configuration for FDIR */ +struct virtchnl_filter_action { + /* see enum virtchnl_action type */ + s32 type; + union { + /* used for queue and qgroup action */ + struct { + u16 index; + u8 region; + } queue; + /* used for count action */ + struct { + /* share counter ID with other flow rules */ + u8 shared; + u32 id; /* counter ID */ + } count; + /* used for mark action */ + u32 mark_id; + u8 reserve[32]; + } act_conf; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(36, virtchnl_filter_action); + +#define VIRTCHNL_MAX_NUM_ACTIONS 8 + +struct virtchnl_filter_action_set { + /* action number must be less then VIRTCHNL_MAX_NUM_ACTIONS */ + int count; + struct virtchnl_filter_action actions[VIRTCHNL_MAX_NUM_ACTIONS]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(292, virtchnl_filter_action_set); + +/* pattern and action for FDIR rule */ +struct virtchnl_fdir_rule { + struct virtchnl_proto_hdrs proto_hdrs; + struct virtchnl_filter_action_set action_set; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(2604, virtchnl_fdir_rule); + +/* Status returned to VF after VF requests FDIR commands + * VIRTCHNL_FDIR_SUCCESS + * VF FDIR related request is successfully done by PF + * The request can be OP_ADD/DEL/QUERY_FDIR_FILTER. + * + * VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE + * OP_ADD_FDIR_FILTER request is failed due to no Hardware resource. + * + * VIRTCHNL_FDIR_FAILURE_RULE_EXIST + * OP_ADD_FDIR_FILTER request is failed due to the rule is already existed. + * + * VIRTCHNL_FDIR_FAILURE_RULE_CONFLICT + * OP_ADD_FDIR_FILTER request is failed due to conflict with existing rule. + * + * VIRTCHNL_FDIR_FAILURE_RULE_NONEXIST + * OP_DEL_FDIR_FILTER request is failed due to this rule doesn't exist. + * + * VIRTCHNL_FDIR_FAILURE_RULE_INVALID + * OP_ADD_FDIR_FILTER request is failed due to parameters validation + * or HW doesn't support. + * + * VIRTCHNL_FDIR_FAILURE_RULE_TIMEOUT + * OP_ADD/DEL_FDIR_FILTER request is failed due to timing out + * for programming. 
+ * + * VIRTCHNL_FDIR_FAILURE_QUERY_INVALID + * OP_QUERY_FDIR_FILTER request is failed due to parameters validation, + * for example, VF query counter of a rule who has no counter action. + */ +enum virtchnl_fdir_prgm_status { + VIRTCHNL_FDIR_SUCCESS = 0, + VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE, + VIRTCHNL_FDIR_FAILURE_RULE_EXIST, + VIRTCHNL_FDIR_FAILURE_RULE_CONFLICT, + VIRTCHNL_FDIR_FAILURE_RULE_NONEXIST, + VIRTCHNL_FDIR_FAILURE_RULE_INVALID, + VIRTCHNL_FDIR_FAILURE_RULE_TIMEOUT, + VIRTCHNL_FDIR_FAILURE_QUERY_INVALID, +}; + +/* VIRTCHNL_OP_ADD_FDIR_FILTER + * VF sends this request to PF by filling out vsi_id, + * validate_only and rule_cfg. PF will return flow_id + * if the request is successfully done and return add_status to VF. + */ +struct virtchnl_fdir_add { + u16 vsi_id; /* INPUT */ + /* + * 1 for validating a fdir rule, 0 for creating a fdir rule. + * Validate and create share one ops: VIRTCHNL_OP_ADD_FDIR_FILTER. + */ + u16 validate_only; /* INPUT */ + u32 flow_id; /* OUTPUT */ + struct virtchnl_fdir_rule rule_cfg; /* INPUT */ + + /* see enum virtchnl_fdir_prgm_status; OUTPUT */ + s32 status; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(2616, virtchnl_fdir_add); + +/* VIRTCHNL_OP_DEL_FDIR_FILTER + * VF sends this request to PF by filling out vsi_id + * and flow_id. PF will return del_status to VF. + */ +struct virtchnl_fdir_del { + u16 vsi_id; /* INPUT */ + u16 pad; + u32 flow_id; /* INPUT */ + + /* see enum virtchnl_fdir_prgm_status; OUTPUT */ + s32 status; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_fdir_del); + +/* TX and RX queue types are valid in legacy as well as split queue models. + * With Split Queue model, 2 additional types are introduced - TX_COMPLETION + * and RX_BUFFER. In split queue model, RX corresponds to the queue where HW + * posts completions. + */ +enum virtchnl_queue_type { + VIRTCHNL_QUEUE_TYPE_TX = 0, + VIRTCHNL_QUEUE_TYPE_RX = 1, + VIRTCHNL_QUEUE_TYPE_TX_COMPLETION = 2, + VIRTCHNL_QUEUE_TYPE_RX_BUFFER = 3, + VIRTCHNL_QUEUE_TYPE_CONFIG_TX = 4, + VIRTCHNL_QUEUE_TYPE_CONFIG_RX = 5 +}; + + +/* structure to specify a chunk of contiguous queues */ +struct virtchnl_queue_chunk { + /* see enum virtchnl_queue_type */ + s32 type; + u16 start_queue_id; + u16 num_queues; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_queue_chunk); + +/* structure to specify several chunks of contiguous queues */ +struct virtchnl_queue_chunks { + u16 num_chunks; + u16 rsvd; + struct virtchnl_queue_chunk chunks[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_queue_chunks); + + +/* VIRTCHNL_OP_ENABLE_QUEUES_V2 + * VIRTCHNL_OP_DISABLE_QUEUES_V2 + * VIRTCHNL_OP_DEL_QUEUES + * + * If VIRTCHNL version was negotiated in VIRTCHNL_OP_VERSION as 2.0 + * then all of these ops are available. + * + * If VIRTCHNL_VF_LARGE_NUM_QPAIRS was negotiated in VIRTCHNL_OP_GET_VF_RESOURCES + * then VIRTCHNL_OP_ENABLE_QUEUES_V2 and VIRTCHNL_OP_DISABLE_QUEUES_V2 are + * available. + * + * PF sends these messages to enable, disable or delete queues specified in + * chunks. PF sends virtchnl_del_ena_dis_queues struct to specify the queues + * to be enabled/disabled/deleted. Also applicable to single queue RX or + * TX. CP performs requested action and returns status. 
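A sketch of describing one contiguous range of TX queues for VIRTCHNL_OP_ENABLE_QUEUES_V2 / VIRTCHNL_OP_DISABLE_QUEUES_V2 (the start queue and count are arbitrary example values):

static void fill_tx_chunk(struct virtchnl_queue_chunk *chunk)
{
	chunk->type = VIRTCHNL_QUEUE_TYPE_TX;
	chunk->start_queue_id = 8;	/* example: queues 8..11 */
	chunk->num_queues = 4;
}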
+ */ +struct virtchnl_del_ena_dis_queues { + u16 vport_id; + u16 pad; + struct virtchnl_queue_chunks chunks; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_del_ena_dis_queues); + +/* Virtchannel interrupt throttling rate index */ +enum virtchnl_itr_idx { + VIRTCHNL_ITR_IDX_0 = 0, + VIRTCHNL_ITR_IDX_1 = 1, + VIRTCHNL_ITR_IDX_NO_ITR = 3, +}; + +/* Queue to vector mapping */ +struct virtchnl_queue_vector { + u16 queue_id; + u16 vector_id; + u8 pad[4]; + + /* see enum virtchnl_itr_idx */ + s32 itr_idx; + + /* see enum virtchnl_queue_type */ + s32 queue_type; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_queue_vector); + +/* VIRTCHNL_OP_MAP_QUEUE_VECTOR + * + * If VIRTCHNL_VF_LARGE_NUM_QPAIRS was negotiated in VIRTCHNL_OP_GET_VF_RESOURCES + * then only VIRTCHNL_OP_MAP_QUEUE_VECTOR is available. + * + * PF sends this message to map or unmap queues to vectors and ITR index + * registers. External data buffer contains virtchnl_queue_vector_maps structure + * that contains num_qv_maps of virtchnl_queue_vector structures. + * CP maps the requested queue vector maps after validating the queue and vector + * ids and returns a status code. + */ +struct virtchnl_queue_vector_maps { + u16 vport_id; + u16 num_qv_maps; + u8 pad[4]; + struct virtchnl_queue_vector qv_maps[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(24, virtchnl_queue_vector_maps); + + + +/* Since VF messages are limited by u16 size, precalculate the maximum possible + * values of nested elements in virtchnl structures that virtual channel can + * possibly handle in a single message. + */ +enum virtchnl_vector_limits { + VIRTCHNL_OP_CONFIG_VSI_QUEUES_MAX = + ((u16)(~0) - sizeof(struct virtchnl_vsi_queue_config_info)) / + sizeof(struct virtchnl_queue_pair_info), + + VIRTCHNL_OP_CONFIG_IRQ_MAP_MAX = + ((u16)(~0) - sizeof(struct virtchnl_irq_map_info)) / + sizeof(struct virtchnl_vector_map), + + VIRTCHNL_OP_ADD_DEL_ETH_ADDR_MAX = + ((u16)(~0) - sizeof(struct virtchnl_ether_addr_list)) / + sizeof(struct virtchnl_ether_addr), + + VIRTCHNL_OP_ADD_DEL_VLAN_MAX = + ((u16)(~0) - sizeof(struct virtchnl_vlan_filter_list)) / + sizeof(u16), + + VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP_MAX = + ((u16)(~0) - sizeof(struct virtchnl_rdma_qvlist_info)) / + sizeof(struct virtchnl_rdma_qv_info), + + VIRTCHNL_OP_ENABLE_CHANNELS_MAX = + ((u16)(~0) - sizeof(struct virtchnl_tc_info)) / + sizeof(struct virtchnl_channel_info), + + VIRTCHNL_OP_ENABLE_DISABLE_DEL_QUEUES_V2_MAX = + ((u16)(~0) - sizeof(struct virtchnl_del_ena_dis_queues)) / + sizeof(struct virtchnl_queue_chunk), + + VIRTCHNL_OP_MAP_UNMAP_QUEUE_VECTOR_MAX = + ((u16)(~0) - sizeof(struct virtchnl_queue_vector_maps)) / + sizeof(struct virtchnl_queue_vector), + + VIRTCHNL_OP_ADD_DEL_VLAN_V2_MAX = + ((u16)(~0) - sizeof(struct virtchnl_vlan_filter_list_v2)) / + sizeof(struct virtchnl_vlan_filter), +}; + +/** + * virtchnl_vc_validate_vf_msg + * @ver: Virtchnl version info + * @v_opcode: Opcode for the message + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * validate msg format against struct for each opcode + */ +static inline int +virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, + u8 *msg, u16 msglen) +{ + bool err_msg_format = false; + u32 valid_len = 0; + + /* Validate message length. 
*/ + switch (v_opcode) { + case VIRTCHNL_OP_VERSION: + valid_len = sizeof(struct virtchnl_version_info); + break; + case VIRTCHNL_OP_RESET_VF: + break; + case VIRTCHNL_OP_GET_VF_RESOURCES: + if (VF_IS_V11(ver)) + valid_len = sizeof(u32); + break; + case VIRTCHNL_OP_CONFIG_TX_QUEUE: + valid_len = sizeof(struct virtchnl_txq_info); + break; + case VIRTCHNL_OP_CONFIG_RX_QUEUE: + valid_len = sizeof(struct virtchnl_rxq_info); + break; + case VIRTCHNL_OP_CONFIG_VSI_QUEUES: + valid_len = sizeof(struct virtchnl_vsi_queue_config_info); + if (msglen >= valid_len) { + struct virtchnl_vsi_queue_config_info *vqc = + (struct virtchnl_vsi_queue_config_info *)msg; + + if (vqc->num_queue_pairs == 0 || vqc->num_queue_pairs > + VIRTCHNL_OP_CONFIG_VSI_QUEUES_MAX) { + err_msg_format = true; + break; + } + + valid_len += (vqc->num_queue_pairs * + sizeof(struct + virtchnl_queue_pair_info)); + } + break; + case VIRTCHNL_OP_CONFIG_IRQ_MAP: + valid_len = sizeof(struct virtchnl_irq_map_info); + if (msglen >= valid_len) { + struct virtchnl_irq_map_info *vimi = + (struct virtchnl_irq_map_info *)msg; + + if (vimi->num_vectors == 0 || vimi->num_vectors > + VIRTCHNL_OP_CONFIG_IRQ_MAP_MAX) { + err_msg_format = true; + break; + } + + valid_len += (vimi->num_vectors * + sizeof(struct virtchnl_vector_map)); + } + break; + case VIRTCHNL_OP_ENABLE_QUEUES: + case VIRTCHNL_OP_DISABLE_QUEUES: + valid_len = sizeof(struct virtchnl_queue_select); + break; + case VIRTCHNL_OP_GET_MAX_RSS_QREGION: + break; + case VIRTCHNL_OP_ADD_ETH_ADDR: + case VIRTCHNL_OP_DEL_ETH_ADDR: + valid_len = sizeof(struct virtchnl_ether_addr_list); + if (msglen >= valid_len) { + struct virtchnl_ether_addr_list *veal = + (struct virtchnl_ether_addr_list *)msg; + + if (veal->num_elements == 0 || veal->num_elements > + VIRTCHNL_OP_ADD_DEL_ETH_ADDR_MAX) { + err_msg_format = true; + break; + } + + valid_len += veal->num_elements * + sizeof(struct virtchnl_ether_addr); + } + break; + case VIRTCHNL_OP_ADD_VLAN: + case VIRTCHNL_OP_DEL_VLAN: + valid_len = sizeof(struct virtchnl_vlan_filter_list); + if (msglen >= valid_len) { + struct virtchnl_vlan_filter_list *vfl = + (struct virtchnl_vlan_filter_list *)msg; + + if (vfl->num_elements == 0 || vfl->num_elements > + VIRTCHNL_OP_ADD_DEL_VLAN_MAX) { + err_msg_format = true; + break; + } + + valid_len += vfl->num_elements * sizeof(u16); + } + break; + case VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE: + valid_len = sizeof(struct virtchnl_promisc_info); + break; + case VIRTCHNL_OP_GET_STATS: + valid_len = sizeof(struct virtchnl_queue_select); + break; + case VIRTCHNL_OP_RDMA: + /* These messages are opaque to us and will be validated in + * the RDMA client code. We just need to check for nonzero + * length. The firmware will enforce max length restrictions. 
+ */ + if (msglen) + valid_len = msglen; + else + err_msg_format = true; + break; + case VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP: + break; + case VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP: + valid_len = sizeof(struct virtchnl_rdma_qvlist_info); + if (msglen >= valid_len) { + struct virtchnl_rdma_qvlist_info *qv = + (struct virtchnl_rdma_qvlist_info *)msg; + + if (qv->num_vectors == 0 || qv->num_vectors > + VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP_MAX) { + err_msg_format = true; + break; + } + + valid_len += ((qv->num_vectors - 1) * + sizeof(struct virtchnl_rdma_qv_info)); + } + break; + case VIRTCHNL_OP_CONFIG_RSS_KEY: + valid_len = sizeof(struct virtchnl_rss_key); + if (msglen >= valid_len) { + struct virtchnl_rss_key *vrk = + (struct virtchnl_rss_key *)msg; + + if (vrk->key_len == 0) { + /* zero length is allowed as input */ + break; + } + + valid_len += vrk->key_len - 1; + } + break; + case VIRTCHNL_OP_CONFIG_RSS_LUT: + valid_len = sizeof(struct virtchnl_rss_lut); + if (msglen >= valid_len) { + struct virtchnl_rss_lut *vrl = + (struct virtchnl_rss_lut *)msg; + + if (vrl->lut_entries == 0) { + /* zero entries is allowed as input */ + break; + } + + valid_len += vrl->lut_entries - 1; + } + break; + case VIRTCHNL_OP_GET_RSS_HENA_CAPS: + break; + case VIRTCHNL_OP_SET_RSS_HENA: + valid_len = sizeof(struct virtchnl_rss_hena); + break; + case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: + case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: + break; + case VIRTCHNL_OP_REQUEST_QUEUES: + valid_len = sizeof(struct virtchnl_vf_res_request); + break; + case VIRTCHNL_OP_ENABLE_CHANNELS: + valid_len = sizeof(struct virtchnl_tc_info); + if (msglen >= valid_len) { + struct virtchnl_tc_info *vti = + (struct virtchnl_tc_info *)msg; + + if (vti->num_tc == 0 || vti->num_tc > + VIRTCHNL_OP_ENABLE_CHANNELS_MAX) { + err_msg_format = true; + break; + } + + valid_len += (vti->num_tc - 1) * + sizeof(struct virtchnl_channel_info); + } + break; + case VIRTCHNL_OP_DISABLE_CHANNELS: + break; + case VIRTCHNL_OP_ADD_CLOUD_FILTER: + case VIRTCHNL_OP_DEL_CLOUD_FILTER: + valid_len = sizeof(struct virtchnl_filter); + break; + case VIRTCHNL_OP_DCF_VLAN_OFFLOAD: + valid_len = sizeof(struct virtchnl_dcf_vlan_offload); + break; + case VIRTCHNL_OP_DCF_CMD_DESC: + case VIRTCHNL_OP_DCF_CMD_BUFF: + /* These two opcodes are specific to handle the AdminQ command, + * so the validation needs to be done in PF's context. 
+ */ + valid_len = msglen; + break; + case VIRTCHNL_OP_DCF_RULE_FLUSH: + case VIRTCHNL_OP_DCF_DISABLE: + case VIRTCHNL_OP_DCF_GET_VSI_MAP: + case VIRTCHNL_OP_DCF_GET_PKG_INFO: + break; + case VIRTCHNL_OP_GET_SUPPORTED_RXDIDS: + break; + case VIRTCHNL_OP_ADD_RSS_CFG: + case VIRTCHNL_OP_DEL_RSS_CFG: + valid_len = sizeof(struct virtchnl_rss_cfg); + break; + case VIRTCHNL_OP_ADD_FDIR_FILTER: + valid_len = sizeof(struct virtchnl_fdir_add); + break; + case VIRTCHNL_OP_DEL_FDIR_FILTER: + valid_len = sizeof(struct virtchnl_fdir_del); + break; + case VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS: + break; + case VIRTCHNL_OP_ADD_VLAN_V2: + case VIRTCHNL_OP_DEL_VLAN_V2: + valid_len = sizeof(struct virtchnl_vlan_filter_list_v2); + if (msglen >= valid_len) { + struct virtchnl_vlan_filter_list_v2 *vfl = + (struct virtchnl_vlan_filter_list_v2 *)msg; + + if (vfl->num_elements == 0 || vfl->num_elements > + VIRTCHNL_OP_ADD_DEL_VLAN_V2_MAX) { + err_msg_format = true; + break; + } + + valid_len += (vfl->num_elements - 1) * + sizeof(struct virtchnl_vlan_filter); + } + break; + case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2: + case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2: + case VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2: + case VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2: + case VIRTCHNL_OP_ENABLE_VLAN_FILTERING_V2: + case VIRTCHNL_OP_DISABLE_VLAN_FILTERING_V2: + valid_len = sizeof(struct virtchnl_vlan_setting); + break; + case VIRTCHNL_OP_ENABLE_QUEUES_V2: + case VIRTCHNL_OP_DISABLE_QUEUES_V2: + valid_len = sizeof(struct virtchnl_del_ena_dis_queues); + if (msglen >= valid_len) { + struct virtchnl_del_ena_dis_queues *qs = + (struct virtchnl_del_ena_dis_queues *)msg; + if (qs->chunks.num_chunks == 0 || + qs->chunks.num_chunks > VIRTCHNL_OP_ENABLE_DISABLE_DEL_QUEUES_V2_MAX) { + err_msg_format = true; + break; + } + valid_len += (qs->chunks.num_chunks - 1) * + sizeof(struct virtchnl_queue_chunk); + } + break; + case VIRTCHNL_OP_MAP_QUEUE_VECTOR: + valid_len = sizeof(struct virtchnl_queue_vector_maps); + if (msglen >= valid_len) { + struct virtchnl_queue_vector_maps *v_qp = + (struct virtchnl_queue_vector_maps *)msg; + if (v_qp->num_qv_maps == 0 || + v_qp->num_qv_maps > VIRTCHNL_OP_MAP_UNMAP_QUEUE_VECTOR_MAX) { + err_msg_format = true; + break; + } + valid_len += (v_qp->num_qv_maps - 1) * + sizeof(struct virtchnl_queue_vector); + } + break; + /* These are always errors coming from the VF. */ + case VIRTCHNL_OP_EVENT: + case VIRTCHNL_OP_UNKNOWN: + default: + return VIRTCHNL_STATUS_ERR_PARAM; + } + /* few more checks */ + if (err_msg_format || valid_len != msglen) + return VIRTCHNL_STATUS_ERR_OPCODE_MISMATCH; + + return 0; +} +#endif /* _VIRTCHNL_H_ */ diff --git a/drivers/net/ethernet/intel/ice/virtchnl_inline_ipsec.h b/drivers/net/ethernet/intel/ice/virtchnl_inline_ipsec.h new file mode 100644 index 000000000000..eec608dde607 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/virtchnl_inline_ipsec.h @@ -0,0 +1,548 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + + +#ifndef _VIRTCHNL_INLINE_IPSEC_H_ +#define _VIRTCHNL_INLINE_IPSEC_H_ + +#define VIRTCHNL_IPSEC_MAX_CRYPTO_CAP_NUM 3 +#define VIRTCHNL_IPSEC_MAX_ALGO_CAP_NUM 16 +#define VIRTCHNL_IPSEC_MAX_TX_DESC_NUM 128 +#define VIRTCHNL_IPSEC_MAX_CRYPTO_ITEM_NUMBER 2 +#define VIRTCHNL_IPSEC_MAX_KEY_LEN 128 +#define VIRTCHNL_IPSEC_MAX_SA_DESTROY_NUM 8 +#define VIRTCHNL_IPSEC_SA_DESTROY 0 +#define VIRTCHNL_IPSEC_BROADCAST_VFID 0xFFFFFFFF +#define VIRTCHNL_IPSEC_INVALID_REQ_ID 0xFFFF +#define VIRTCHNL_IPSEC_INVALID_SA_CFG_RESP 0xFFFFFFFF +#define VIRTCHNL_IPSEC_INVALID_SP_CFG_RESP 0xFFFFFFFF + +/* crypto type */ +#define VIRTCHNL_AUTH 1 +#define VIRTCHNL_CIPHER 2 +#define VIRTCHNL_AEAD 3 + +/* caps enabled */ +#define VIRTCHNL_IPSEC_ESN_ENA BIT(0) +#define VIRTCHNL_IPSEC_UDP_ENCAP_ENA BIT(1) +#define VIRTCHNL_IPSEC_SA_INDEX_SW_ENA BIT(2) +#define VIRTCHNL_IPSEC_AUDIT_ENA BIT(3) +#define VIRTCHNL_IPSEC_BYTE_LIMIT_ENA BIT(4) +#define VIRTCHNL_IPSEC_DROP_ON_AUTH_FAIL_ENA BIT(5) +#define VIRTCHNL_IPSEC_ARW_CHECK_ENA BIT(6) +#define VIRTCHNL_IPSEC_24BIT_SPI_ENA BIT(7) + +/* algorithm type */ +/* Hash Algorithm */ +#define VIRTCHNL_HASH_NO_ALG 0 /* NULL algorithm */ +#define VIRTCHNL_AES_CBC_MAC 1 /* AES-CBC-MAC algorithm */ +#define VIRTCHNL_AES_CMAC 2 /* AES CMAC algorithm */ +#define VIRTCHNL_AES_GMAC 3 /* AES GMAC algorithm */ +#define VIRTCHNL_AES_XCBC_MAC 4 /* AES XCBC algorithm */ +#define VIRTCHNL_MD5_HMAC 5 /* HMAC using MD5 algorithm */ +#define VIRTCHNL_SHA1_HMAC 6 /* HMAC using 128 bit SHA algorithm */ +#define VIRTCHNL_SHA224_HMAC 7 /* HMAC using 224 bit SHA algorithm */ +#define VIRTCHNL_SHA256_HMAC 8 /* HMAC using 256 bit SHA algorithm */ +#define VIRTCHNL_SHA384_HMAC 9 /* HMAC using 384 bit SHA algorithm */ +#define VIRTCHNL_SHA512_HMAC 10 /* HMAC using 512 bit SHA algorithm */ +#define VIRTCHNL_SHA3_224_HMAC 11 /* HMAC using 224 bit SHA3 algorithm */ +#define VIRTCHNL_SHA3_256_HMAC 12 /* HMAC using 256 bit SHA3 algorithm */ +#define VIRTCHNL_SHA3_384_HMAC 13 /* HMAC using 384 bit SHA3 algorithm */ +#define VIRTCHNL_SHA3_512_HMAC 14 /* HMAC using 512 bit SHA3 algorithm */ +/* Cipher Algorithm */ +#define VIRTCHNL_CIPHER_NO_ALG 15 /* NULL algorithm */ +#define VIRTCHNL_3DES_CBC 16 /* Triple DES algorithm in CBC mode */ +#define VIRTCHNL_AES_CBC 17 /* AES algorithm in CBC mode */ +#define VIRTCHNL_AES_CTR 18 /* AES algorithm in Counter mode */ +/* AEAD Algorithm */ +#define VIRTCHNL_AES_CCM 19 /* AES algorithm in CCM mode */ +#define VIRTCHNL_AES_GCM 20 /* AES algorithm in GCM mode */ +#define VIRTCHNL_CHACHA20_POLY1305 21 /* algorithm of ChaCha20-Poly1305 */ + +/* protocol type */ +#define VIRTCHNL_PROTO_ESP 1 +#define VIRTCHNL_PROTO_AH 2 +#define VIRTCHNL_PROTO_RSVD1 3 + +/* sa mode */ +#define VIRTCHNL_SA_MODE_TRANSPORT 1 +#define VIRTCHNL_SA_MODE_TUNNEL 2 +#define VIRTCHNL_SA_MODE_TRAN_TUN 3 +#define VIRTCHNL_SA_MODE_UNKNOWN 4 + +/* sa direction */ +#define VIRTCHNL_DIR_INGRESS 1 +#define VIRTCHNL_DIR_EGRESS 2 +#define VIRTCHNL_DIR_INGRESS_EGRESS 3 + +/* sa termination */ +#define VIRTCHNL_TERM_SOFTWARE 1 +#define VIRTCHNL_TERM_HARDWARE 2 + +/* sa ip type */ +#define VIRTCHNL_IPV4 1 +#define VIRTCHNL_IPV6 2 + +/* for virtchnl_ipsec_resp */ +enum inline_ipsec_resp { + INLINE_IPSEC_SUCCESS = 0, + INLINE_IPSEC_FAIL = -1, + INLINE_IPSEC_ERR_FIFO_FULL = -2, + INLINE_IPSEC_ERR_NOT_READY = -3, + INLINE_IPSEC_ERR_VF_DOWN = -4, + INLINE_IPSEC_ERR_INVALID_PARAMS = -5, + INLINE_IPSEC_ERR_NO_MEM = -6, +}; + +/* Detailed opcodes for DPDK and IPsec use */ +enum inline_ipsec_ops { + 
INLINE_IPSEC_OP_GET_CAP = 0, + INLINE_IPSEC_OP_GET_STATUS = 1, + INLINE_IPSEC_OP_SA_CREATE = 2, + INLINE_IPSEC_OP_SA_UPDATE = 3, + INLINE_IPSEC_OP_SA_DESTROY = 4, + INLINE_IPSEC_OP_SP_CREATE = 5, + INLINE_IPSEC_OP_SP_DESTROY = 6, + INLINE_IPSEC_OP_SA_READ = 7, + INLINE_IPSEC_OP_EVENT = 8, + INLINE_IPSEC_OP_RESP = 9, +}; + +/* Not all valid, if certain field is invalid, set 1 for all bits */ +struct virtchnl_algo_cap { + u32 algo_type; + + u16 block_size; + + u16 min_key_size; + u16 max_key_size; + u16 inc_key_size; + + u16 min_iv_size; + u16 max_iv_size; + u16 inc_iv_size; + + u16 min_digest_size; + u16 max_digest_size; + u16 inc_digest_size; + + u16 min_aad_size; + u16 max_aad_size; + u16 inc_aad_size; +} __packed; + +/* vf record the capability of crypto from the virtchnl */ +struct virtchnl_sym_crypto_cap { + u8 crypto_type; + u8 algo_cap_num; + struct virtchnl_algo_cap algo_cap_list[VIRTCHNL_IPSEC_MAX_ALGO_CAP_NUM]; +}; + +/* VIRTCHNL_OP_GET_IPSEC_CAP + * VF pass virtchnl_ipsec_cap to PF + * and PF return capability of ipsec from virtchnl. + */ +struct virtchnl_ipsec_cap { + /* max number of SA per VF */ + u16 max_sa_num; + + /* IPsec SA Protocol - value ref VIRTCHNL_PROTO_XXX */ + u8 virtchnl_protocol_type; + + /* IPsec SA Mode - value ref VIRTCHNL_SA_MODE_XXX */ + u8 virtchnl_sa_mode; + + /* IPSec SA Direction - value ref VIRTCHNL_DIR_XXX */ + u8 virtchnl_direction; + + /* termination mode - value ref VIRTCHNL_TERM_XXX */ + u8 termination_mode; + + /* number of supported crypto capability */ + u8 crypto_cap_num; + + /* descriptor ID */ + u16 desc_id; + + /* capabilities enabled - value ref VIRTCHNL_IPSEC_XXX_ENA */ + u32 caps_enabled; + + /* crypto capabilities */ + struct virtchnl_sym_crypto_cap cap[VIRTCHNL_IPSEC_MAX_CRYPTO_CAP_NUM]; +} __packed; + +/* configuration of crypto function */ +struct virtchnl_ipsec_crypto_cfg_item { + u8 crypto_type; + + u32 algo_type; + + /* Length of valid IV data. */ + u16 iv_len; + + /* Length of digest */ + u16 digest_len; + + /* SA salt */ + u32 salt; + + /* The length of the symmetric key */ + u16 key_len; + + /* key data buffer */ + u8 key_data[VIRTCHNL_IPSEC_MAX_KEY_LEN]; +} __packed; + +struct virtchnl_ipsec_sym_crypto_cfg { + struct virtchnl_ipsec_crypto_cfg_item + items[VIRTCHNL_IPSEC_MAX_CRYPTO_ITEM_NUMBER]; +}; + +/* VIRTCHNL_OP_IPSEC_SA_CREATE + * VF send this SA configuration to PF using virtchnl; + * PF create SA as configuration and PF driver will return + * an unique index (sa_idx) for the created SA. + */ +struct virtchnl_ipsec_sa_cfg { + /* IPsec SA Protocol - AH/ESP */ + u8 virtchnl_protocol_type; + + /* termination mode - value ref VIRTCHNL_TERM_XXX */ + u8 virtchnl_termination; + + /* type of outer IP - IPv4/IPv6 */ + u8 virtchnl_ip_type; + + /* type of esn - !0:enable/0:disable */ + u8 esn_enabled; + + /* udp encap - !0:enable/0:disable */ + u8 udp_encap_enabled; + + /* IPSec SA Direction - value ref VIRTCHNL_DIR_XXX */ + u8 virtchnl_direction; + + /* reserved */ + u8 reserved1; + + /* SA security parameter index */ + u32 spi; + + /* outer src ip address */ + u8 src_addr[16]; + + /* outer dst ip address */ + u8 dst_addr[16]; + + /* SPD reference. Used to link an SA with its policy. + * PF drivers may ignore this field. 
+ */ + u16 spd_ref; + + /* high 32 bits of esn */ + u32 esn_hi; + + /* low 32 bits of esn */ + u32 esn_low; + + /* When enabled, sa_index must be valid */ + u8 sa_index_en; + + /* SA index when sa_index_en is true */ + u32 sa_index; + + /* auditing mode - enable/disable */ + u8 audit_en; + + /* lifetime byte limit - enable/disable + * When enabled, byte_limit_hard and byte_limit_soft + * must be valid. + */ + u8 byte_limit_en; + + /* hard byte limit count */ + u64 byte_limit_hard; + + /* soft byte limit count */ + u64 byte_limit_soft; + + /* drop on authentication failure - enable/disable */ + u8 drop_on_auth_fail_en; + + /* anti-reply window check - enable/disable + * When enabled, arw_size must be valid. + */ + u8 arw_check_en; + + /* size of arw window, offset by 1. Setting to 0 + * represents ARW window size of 1. Setting to 127 + * represents ARW window size of 128 + */ + u8 arw_size; + + /* no ip offload mode - enable/disable + * When enabled, ip type and address must not be valid. + */ + u8 no_ip_offload_en; + + /* SA Domain. Used to logical separate an SADB into groups. + * PF drivers supporting a single group ignore this field. + */ + u16 sa_domain; + + /* crypto configuration */ + struct virtchnl_ipsec_sym_crypto_cfg crypto_cfg; +} __packed; + +/* VIRTCHNL_OP_IPSEC_SA_UPDATE + * VF send configuration of index of SA to PF + * PF will update SA according to configuration + */ +struct virtchnl_ipsec_sa_update { + u32 sa_index; /* SA to update */ + u32 esn_hi; /* high 32 bits of esn */ + u32 esn_low; /* low 32 bits of esn */ +}; + +/* VIRTCHNL_OP_IPSEC_SA_DESTROY + * VF send configuration of index of SA to PF + * PF will destroy SA according to configuration + * flag bitmap indicate all SA or just selected SA will + * be destroyed + */ +struct virtchnl_ipsec_sa_destroy { + /* All zero bitmap indicates all SA will be destroyed. + * Non-zero bitmap indicates the selected SA in + * array sa_index will be destroyed. + */ + u8 flag; + + /* selected SA index */ + u32 sa_index[VIRTCHNL_IPSEC_MAX_SA_DESTROY_NUM]; +} __packed; + +/* VIRTCHNL_OP_IPSEC_SA_READ + * VF send this SA configuration to PF using virtchnl; + * PF read SA and will return configuration for the created SA. + */ +struct virtchnl_ipsec_sa_read { + /* SA valid - invalid/valid */ + u8 valid; + + /* SA active - inactive/active */ + u8 active; + + /* SA SN rollover - not_rollover/rollover */ + u8 sn_rollover; + + /* IPsec SA Protocol - AH/ESP */ + u8 virtchnl_protocol_type; + + /* termination mode - value ref VIRTCHNL_TERM_XXX */ + u8 virtchnl_termination; + + /* auditing mode - enable/disable */ + u8 audit_en; + + /* lifetime byte limit - enable/disable + * When set to limit, byte_limit_hard and byte_limit_soft + * must be valid. + */ + u8 byte_limit_en; + + /* hard byte limit count */ + u64 byte_limit_hard; + + /* soft byte limit count */ + u64 byte_limit_soft; + + /* drop on authentication failure - enable/disable */ + u8 drop_on_auth_fail_en; + + /* anti-replay window check - enable/disable + * When set to check, arw_size, arw_top, and arw must be valid + */ + u8 arw_check_en; + + /* size of arw window, offset by 1. Setting to 0 + * represents ARW window size of 1. 
Setting to 127 + * represents ARW window size of 128 + */ + u8 arw_size; + + /* reserved */ + u8 reserved1; + + /* top of anti-replay-window */ + u64 arw_top; + + /* anti-replay-window */ + u8 arw[16]; + + /* packets processed */ + u64 packets_processed; + + /* bytes processed */ + u64 bytes_processed; + + /* packets dropped */ + u32 packets_dropped; + + /* authentication failures */ + u32 auth_fails; + + /* ARW check failures */ + u32 arw_fails; + + /* type of esn - enable/disable */ + u8 esn; + + /* IPSec SA Direction - value ref VIRTCHNL_DIR_XXX */ + u8 virtchnl_direction; + + /* SA security parameter index */ + u32 spi; + + /* SA salt */ + u32 salt; + + /* high 32 bits of esn */ + u32 esn_hi; + + /* low 32 bits of esn */ + u32 esn_low; + + /* SA Domain. Used to logical separate an SADB into groups. + * PF drivers supporting a single group ignore this field. + */ + u16 sa_domain; + + /* SPD reference. Used to link an SA with its policy. + * PF drivers may ignore this field. + */ + u16 spd_ref; + + /* crypto configuration. Salt and keys are set to 0 */ + struct virtchnl_ipsec_sym_crypto_cfg crypto_cfg; +} __packed; + +/* Add allowlist entry in IES */ +struct virtchnl_ipsec_sp_cfg { + u32 spi; + u32 dip[4]; + + /* Drop frame if true or redirect to QAT if false. */ + u8 drop; + + /* Congestion domain. For future use. */ + u8 cgd; + + /* 0 for IPv4 table, 1 for IPv6 table. */ + u8 table_id; + + /* Set TC (congestion domain) if true. For future use. */ + u8 set_tc; +}; + +/* Delete allowlist entry in IES */ +struct virtchnl_ipsec_sp_destroy { + /* 0 for IPv4 table, 1 for IPv6 table. */ + u8 table_id; + u32 rule_id; +} __packed; + +/* Response from IES to allowlist operations */ +struct virtchnl_ipsec_sp_cfg_resp { + u32 rule_id; +}; + +struct virtchnl_ipsec_sa_cfg_resp { + u32 sa_handle; +}; + +#define INLINE_IPSEC_EVENT_RESET 0x1 +#define INLINE_IPSEC_EVENT_CRYPTO_ON 0x2 +#define INLINE_IPSEC_EVENT_CRYPTO_OFF 0x4 + +struct virtchnl_ipsec_event { + u32 ipsec_event_data; +}; + +#define INLINE_IPSEC_STATUS_AVAILABLE 0x1 +#define INLINE_IPSEC_STATUS_UNAVAILABLE 0x2 + +struct virtchnl_ipsec_status { + u32 status; +}; + +struct virtchnl_ipsec_resp { + u32 resp; +}; + +/* Internal message descriptor for VF <-> IPsec communication */ +struct inline_ipsec_msg { + u16 ipsec_opcode; + u16 req_id; + + union { + /* IPsec request */ + struct virtchnl_ipsec_sa_cfg sa_cfg[0]; + struct virtchnl_ipsec_sp_cfg sp_cfg[0]; + struct virtchnl_ipsec_sa_update sa_update[0]; + struct virtchnl_ipsec_sa_destroy sa_destroy[0]; + struct virtchnl_ipsec_sp_destroy sp_destroy[0]; + + /* IPsec response */ + struct virtchnl_ipsec_sa_cfg_resp sa_cfg_resp[0]; + struct virtchnl_ipsec_sp_cfg_resp sp_cfg_resp[0]; + struct virtchnl_ipsec_cap ipsec_cap[0]; + struct virtchnl_ipsec_status ipsec_status[0]; + /* response to del_sa, del_sp, update_sa */ + struct virtchnl_ipsec_resp ipsec_resp[0]; + + /* IPsec event (no req_id is required) */ + struct virtchnl_ipsec_event event[0]; + + /* Reserved */ + struct virtchnl_ipsec_sa_read sa_read[0]; + } ipsec_data; +}; + +static inline u16 virtchnl_inline_ipsec_val_msg_len(u16 opcode) +{ + u16 valid_len = sizeof(struct inline_ipsec_msg); + + switch (opcode) { + case INLINE_IPSEC_OP_GET_CAP: + case INLINE_IPSEC_OP_GET_STATUS: + break; + case INLINE_IPSEC_OP_SA_CREATE: + valid_len += sizeof(struct virtchnl_ipsec_sa_cfg); + break; + case INLINE_IPSEC_OP_SP_CREATE: + valid_len += sizeof(struct virtchnl_ipsec_sp_cfg); + break; + case INLINE_IPSEC_OP_SA_UPDATE: + valid_len += sizeof(struct 
virtchnl_ipsec_sa_update); + break; + case INLINE_IPSEC_OP_SA_DESTROY: + valid_len += sizeof(struct virtchnl_ipsec_sa_destroy); + break; + case INLINE_IPSEC_OP_SP_DESTROY: + valid_len += sizeof(struct virtchnl_ipsec_sp_destroy); + break; + /* Only for msg length caculation of response to VF in case of + * inline ipsec failure. + */ + case INLINE_IPSEC_OP_RESP: + valid_len += sizeof(struct virtchnl_ipsec_resp); + break; + default: + valid_len = 0; + break; + } + + return valid_len; +} + +#endif /* _VIRTCHNL_INLINE_IPSEC_H_ */ diff --git a/drivers/net/ethernet/intel/ice/virtchnl_lan_desc.h b/drivers/net/ethernet/intel/ice/virtchnl_lan_desc.h new file mode 100644 index 000000000000..5ac587008da7 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/virtchnl_lan_desc.h @@ -0,0 +1,528 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +/* + * Copyright (C) 2019 Intel Corporation + * + * For licensing information, see the file 'LICENSE' in the root folder + */ +#ifndef _VIRTCHNL_LAN_DESC_H_ +#define _VIRTCHNL_LAN_DESC_H_ + +/* Rx */ +/* For virtchnl_splitq_base_rx_flex desc members */ +#define VIRTCHNL_RXD_FLEX_RXDID_S 0 +#define VIRTCHNL_RXD_FLEX_RXDID_M \ + ICE_M(0xFUL, VIRTCHNL_RXD_FLEX_RXDID_S) +#define VIRTCHNL_RXD_FLEX_PTYPE_S 0 +#define VIRTCHNL_RXD_FLEX_PTYPE_M \ + ICE_M(0x3FFUL, VIRTCHNL_RXD_FLEX_PTYPE_S) +#define VIRTCHNL_RXD_FLEX_UMBCAST_S 10 +#define VIRTCHNL_RXD_FLEX_UMBCAST_M \ + ICE_M(0x3UL, VIRTCHNL_RXD_FLEX_UMBCAST_S) +#define VIRTCHNL_RXD_FLEX_FF0_S 12 +#define VIRTCHNL_RXD_FLEX_FF0_M ICE_M(0xFUL, VIRTCHNL_RXD_FLEX_FF0_S) +#define VIRTCHNL_RXD_FLEX_LEN_PBUF_S 0 +#define VIRTCHNL_RXD_FLEX_LEN_PBUF_M \ + ICE_M(0x3FFFUL, VIRTCHNL_RXD_FLEX_LEN_PBUF_S) +#define VIRTCHNL_RXD_FLEX_GEN_S 14 +#define VIRTCHNL_RXD_FLEX_GEN_M BIT_ULL(VIRTCHNL_RXD_FLEX_GEN_S) +#define VIRTCHNL_RXD_FLEX_BUFQ_ID_S 15 +#define VIRTCHNL_RXD_FLEX_BUFQ_ID_M \ + BIT_ULL(VIRTCHNL_RXD_FLEX_BUFQ_ID_S) +#define VIRTCHNL_RXD_FLEX_LEN_HDR_S 0 +#define VIRTCHNL_RXD_FLEX_LEN_HDR_M \ + ICE_M(0x3FFUL, VIRTCHNL_RXD_FLEX_LEN_HDR_S) +#define VIRTCHNL_RXD_FLEX_RSC_S 10 +#define VIRTCHNL_RXD_FLEX_RSC_M BIT_ULL(VIRTCHNL_RXD_FLEX_RSC_S) +#define VIRTCHNL_RXD_FLEX_SPH_S 11 +#define VIRTCHNL_RXD_FLEX_SPH_M BIT_ULL(VIRTCHNL_RXD_FLEX_SPH_S) +#define VIRTCHNL_RXD_FLEX_MISS_S 12 +#define VIRTCHNL_RXD_FLEX_MISS_M \ + BIT_ULL(VIRTCHNL_RXD_FLEX_MISS_S) +#define VIRTCHNL_RXD_FLEX_FF1_S 13 +#define VIRTCHNL_RXD_FLEX_FF1_M ICE_M(0x7UL, VIRTCHNL_RXD_FLEX_FF1_M) + +/* For virtchnl_singleq_base_rx_legacy desc members */ +#define VIRTCHNL_RXD_QW1_LEN_SPH_S 63 +#define VIRTCHNL_RXD_QW1_LEN_SPH_M BIT_ULL(VIRTCHNL_RXD_QW1_LEN_SPH_S) +#define VIRTCHNL_RXD_QW1_LEN_HBUF_S 52 +#define VIRTCHNL_RXD_QW1_LEN_HBUF_M \ + ICE_M(0x7FFULL, VIRTCHNL_RXD_QW1_LEN_HBUF_S) +#define VIRTCHNL_RXD_QW1_LEN_PBUF_S 38 +#define VIRTCHNL_RXD_QW1_LEN_PBUF_M \ + ICE_M(0x3FFFULL, VIRTCHNL_RXD_QW1_LEN_PBUF_S) +#define VIRTCHNL_RXD_QW1_PTYPE_S 30 +#define VIRTCHNL_RXD_QW1_PTYPE_M \ + ICE_M(0xFFULL, VIRTCHNL_RXD_QW1_PTYPE_S) +#define VIRTCHNL_RXD_QW1_ERROR_S 19 +#define VIRTCHNL_RXD_QW1_ERROR_M \ + ICE_M(0xFFUL, VIRTCHNL_RXD_QW1_ERROR_S) +#define VIRTCHNL_RXD_QW1_STATUS_S 0 +#define VIRTCHNL_RXD_QW1_STATUS_M \ + ICE_M(0x7FFFFUL, VIRTCHNL_RXD_QW1_STATUS_S) + +enum virtchnl_rx_flex_desc_status_error_0_qw1_bits { + /* Note: These are predefined bit offsets */ + VIRTCHNL_RX_FLEX_DESC_STATUS0_DD_S = 0, + VIRTCHNL_RX_FLEX_DESC_STATUS0_EOF_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_HBO_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_L3L4P_S, + 
VIRTCHNL_RX_FLEX_DESC_STATUS0_XSUM_IPE_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_XSUM_L4E_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_XSUM_EIPE_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_XSUM_EUDPE_S, +}; + +enum virtchnl_rx_flex_desc_status_error_0_qw0_bits { + VIRTCHNL_RX_FLEX_DESC_STATUS0_LPBK_S = 0, + VIRTCHNL_RX_FLEX_DESC_STATUS0_IPV6EXADD_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_RXE_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_CRCP_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_RSS_VALID_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_L2TAG1P_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_XTRMD0_VALID_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_XTRMD1_VALID_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_LAST /* this entry must be last!!! */ +}; + +enum virtchnl_rx_flex_desc_status_error_1_bits { + /* Note: These are predefined bit offsets */ + VIRTCHNL_RX_FLEX_DESC_STATUS1_RSVD_S = 0, /* 2 bits */ + VIRTCHNL_RX_FLEX_DESC_STATUS1_ATRAEFAIL_S = 2, + VIRTCHNL_RX_FLEX_DESC_STATUS1_L2TAG2P_S = 3, + VIRTCHNL_RX_FLEX_DESC_STATUS1_XTRMD2_VALID_S = 4, + VIRTCHNL_RX_FLEX_DESC_STATUS1_XTRMD3_VALID_S = 5, + VIRTCHNL_RX_FLEX_DESC_STATUS1_XTRMD4_VALID_S = 6, + VIRTCHNL_RX_FLEX_DESC_STATUS1_XTRMD5_VALID_S = 7, + VIRTCHNL_RX_FLEX_DESC_STATUS1_LAST /* this entry must be last!!! */ +}; + +enum virtchnl_rx_base_desc_status_bits { + /* Note: These are predefined bit offsets */ + VIRTCHNL_RX_BASE_DESC_STATUS_DD_S = 0, + VIRTCHNL_RX_BASE_DESC_STATUS_EOF_S = 1, + VIRTCHNL_RX_BASE_DESC_STATUS_L2TAG1P_S = 2, + VIRTCHNL_RX_BASE_DESC_STATUS_L3L4P_S = 3, + VIRTCHNL_RX_BASE_DESC_STATUS_CRCP_S = 4, + VIRTCHNL_RX_BASE_DESC_STATUS_RSVD_S = 5, /* 3 BITS */ + VIRTCHNL_RX_BASE_DESC_STATUS_EXT_UDP_0_S = 8, + VIRTCHNL_RX_BASE_DESC_STATUS_UMBCAST_S = 9, /* 2 BITS */ + VIRTCHNL_RX_BASE_DESC_STATUS_FLM_S = 11, + VIRTCHNL_RX_BASE_DESC_STATUS_FLTSTAT_S = 12, /* 2 BITS */ + VIRTCHNL_RX_BASE_DESC_STATUS_LPBK_S = 14, + VIRTCHNL_RX_BASE_DESC_STATUS_IPV6EXADD_S = 15, + VIRTCHNL_RX_BASE_DESC_STATUS_RSVD1_S = 16, /* 2 BITS */ + VIRTCHNL_RX_BASE_DESC_STATUS_INT_UDP_0_S = 18, + VIRTCHNL_RX_BASE_DESC_STATUS_LAST /* this entry must be last!!! */ +}; + +enum virtchnl_rx_desc_fltstat_values { + VIRTCHNL_RX_DESC_FLTSTAT_NO_DATA = 0, + VIRTCHNL_RX_DESC_FLTSTAT_RSV_FD_ID = 1, /* 16byte desc? 
FD_ID : RSV */ + VIRTCHNL_RX_DESC_FLTSTAT_RSV = 2, + VIRTCHNL_RX_DESC_FLTSTAT_RSS_HASH = 3, +}; + +enum virtchnl_rx_base_desc_error_bits { + /* Note: These are predefined bit offsets */ + VIRTCHNL_RX_BASE_DESC_ERROR_RXE_S = 0, + VIRTCHNL_RX_BASE_DESC_ERROR_ATRAEFAIL_S = 1, + VIRTCHNL_RX_BASE_DESC_ERROR_HBO_S = 2, + VIRTCHNL_RX_BASE_DESC_ERROR_L3L4E_S = 3, /* 3 BITS */ + VIRTCHNL_RX_BASE_DESC_ERROR_IPE_S = 3, + VIRTCHNL_RX_BASE_DESC_ERROR_L4E_S = 4, + VIRTCHNL_RX_BASE_DESC_ERROR_EIPE_S = 5, + VIRTCHNL_RX_BASE_DESC_ERROR_OVERSIZE_S = 6, + VIRTCHNL_RX_BASE_DESC_ERROR_RSVD_S = 7 +}; + +/* Receive Descriptors */ +/* splitq buf + | 16| 0| + ---------------------------------------------------------------- + | RSV | Buffer ID | + ---------------------------------------------------------------- + | Rx packet buffer adresss | + ---------------------------------------------------------------- + | Rx header buffer adresss | + ---------------------------------------------------------------- + | RSV | + ---------------------------------------------------------------- + | 0| + */ +struct virtchnl_splitq_rx_buf_desc { + struct { + __le16 buf_id; /* Buffer Identifier */ + __le16 rsvd0; + __le32 rsvd1; + } qword0; + __le64 pkt_addr; /* Packet buffer address */ + __le64 hdr_addr; /* Header buffer address */ + __le64 rsvd2; +}; /* read used with buffer queues*/ + +/* singleq buf + | 0| + ---------------------------------------------------------------- + | Rx packet buffer adresss | + ---------------------------------------------------------------- + | Rx header buffer adresss | + ---------------------------------------------------------------- + | RSV | + ---------------------------------------------------------------- + | RSV | + ---------------------------------------------------------------- + | 0| + */ +struct virtchnl_singleq_rx_buf_desc { + __le64 pkt_addr; /* Packet buffer address */ + __le64 hdr_addr; /* Header buffer address */ + __le64 rsvd1; + __le64 rsvd2; +}; /* read used with buffer queues*/ + +union virtchnl_rx_buf_desc { + struct virtchnl_singleq_rx_buf_desc read; + struct virtchnl_splitq_rx_buf_desc split_rd; +}; + +/* (0x00) singleq wb(compl) */ +struct virtchnl_singleq_base_rx_desc { + struct { + struct { + __le16 mirroring_status; + __le16 l2tag1; + } lo_dword; + union { + __le32 rss; /* RSS Hash */ + __le32 fd_id; /* Flow Director filter id */ + } hi_dword; + } qword0; + struct { + /* status/error/PTYPE/length */ + __le64 status_error_ptype_len; + } qword1; + struct { + __le16 ext_status; /* extended status */ + __le16 rsvd; + __le16 l2tag2_1; + __le16 l2tag2_2; + } qword2; + struct { + __le32 reserved; + __le32 fd_id; + } qword3; +}; /* writeback */ + +/* (0x01) singleq flex compl */ +struct virtchnl_rx_flex_desc { + /* Qword 0 */ + u8 rxdid; /* descriptor builder profile id */ + u8 mir_id_umb_cast; /* mirror=[5:0], umb=[7:6] */ + __le16 ptype_flex_flags0; /* ptype=[9:0], ff0=[15:10] */ + __le16 pkt_len; /* [15:14] are reserved */ + __le16 hdr_len_sph_flex_flags1; /* header=[10:0] */ + /* sph=[11:11] */ + /* ff1/ext=[15:12] */ + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le16 flex_meta0; + __le16 flex_meta1; + + /* Qword 2 */ + __le16 status_error1; + u8 flex_flags2; + u8 time_stamp_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le16 flex_meta2; + __le16 flex_meta3; + union { + struct { + __le16 flex_meta4; + __le16 flex_meta5; + } flex; + __le32 ts_high; + } flex_ts; +}; + +/* (0x02) */ +struct virtchnl_rx_flex_desc_nic { + /* Qword 0 */ + u8 
rxdid; + u8 mir_id_umb_cast; + __le16 ptype_flexi_flags0; + __le16 pkt_len; + __le16 hdr_len_sph_flex_flags1; + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le32 rss_hash; + + /* Qword 2 */ + __le16 status_error1; + u8 flexi_flags2; + u8 ts_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le32 flow_id; + union { + struct { + __le16 rsvd; + __le16 flow_id_ipv6; + } flex; + __le32 ts_high; + } flex_ts; +}; + +/* Rx Flex Descriptor Switch Profile + * RxDID Profile Id 3 + * Flex-field 0: Source Vsi + */ +struct virtchnl_rx_flex_desc_sw { + /* Qword 0 */ + u8 rxdid; + u8 mir_id_umb_cast; + __le16 ptype_flexi_flags0; + __le16 pkt_len; + __le16 hdr_len_sph_flex_flags1; + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le16 src_vsi; /* [10:15] are reserved */ + __le16 flex_md1_rsvd; + + /* Qword 2 */ + __le16 status_error1; + u8 flex_flags2; + u8 ts_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le32 rsvd; /* flex words 2-3 are reserved */ + __le32 ts_high; +}; + +/* Rx Flex Descriptor NIC VEB Profile + * RxDID Profile Id 4 + * Flex-field 0: Destination Vsi + */ +struct virtchnl_rx_flex_desc_nic_veb_dbg { + /* Qword 0 */ + u8 rxdid; + u8 mir_id_umb_cast; + __le16 ptype_flexi_flags0; + __le16 pkt_len; + __le16 hdr_len_sph_flex_flags1; + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le16 dst_vsi; /* [0:12]: destination vsi */ + /* 13: vsi valid bit */ + /* [14:15] are reserved */ + __le16 flex_field_1; + + /* Qword 2 */ + __le16 status_error1; + u8 flex_flags2; + u8 ts_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le32 rsvd; /* flex words 2-3 are reserved */ + __le32 ts_high; +}; + +/* Rx Flex Descriptor NIC ACL Profile + * RxDID Profile Id 5 + * Flex-field 0: ACL Counter 0 + * Flex-field 1: ACL Counter 1 + * Flex-field 2: ACL Counter 2 + */ +struct virtchnl_rx_flex_desc_nic_acl_dbg { + /* Qword 0 */ + u8 rxdid; + u8 mir_id_umb_cast; + __le16 ptype_flexi_flags0; + __le16 pkt_len; + __le16 hdr_len_sph_flex_flags1; + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le16 acl_ctr0; + __le16 acl_ctr1; + + /* Qword 2 */ + __le16 status_error1; + u8 flex_flags2; + u8 ts_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le16 acl_ctr2; + __le16 rsvd; /* flex words 2-3 are reserved */ + __le32 ts_high; +}; + +/* Rx Flex Descriptor NIC Profile + * RxDID Profile Id 6 + * Flex-field 0: RSS hash lower 16-bits + * Flex-field 1: RSS hash upper 16-bits + * Flex-field 2: Flow Id lower 16-bits + * Flex-field 3: Source Vsi + * Flex-field 4: reserved, Vlan id taken from L2Tag + */ +struct virtchnl_rx_flex_desc_nic_2 { + /* Qword 0 */ + u8 rxdid; + u8 mir_id_umb_cast; + __le16 ptype_flexi_flags0; + __le16 pkt_len; + __le16 hdr_len_sph_flex_flags1; + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le32 rss_hash; + + /* Qword 2 */ + __le16 status_error1; + u8 flexi_flags2; + u8 ts_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le16 flow_id; + __le16 src_vsi; + union { + struct { + __le16 rsvd; + __le16 flow_id_ipv6; + } flex; + __le32 ts_high; + } flex_ts; +}; + +/* Rx Flex Descriptor Advanced (Split Queue Model) + * RxDID Profile Id 7 + */ +struct virtchnl_rx_flex_desc_adv { + /* Qword 0 */ + u8 rxdid_ucast; /* profile_id=[3:0] */ + /* rsvd=[5:4] */ + /* ucast=[7:6] */ + u8 status_err0_qw0; + __le16 ptype_err_fflags0; /* ptype=[9:0] */ + /* ip_hdr_err=[10:10] */ + /* udp_len_err=[11:11] */ + /* ff0=[15:12] */ + __le16 pktlen_gen_bufq_id; /* 
plen=[13:0] */ + /* gen=[14:14] only in splitq */ + /* bufq_id=[15:15] only in splitq */ + __le16 hdrlen_flags; /* header=[9:0] */ + /* rsc=[10:10] only in splitq */ + /* sph=[11:11] only in splitq */ + /* ext_udp_0=[12:12] */ + /* int_udp_0=[13:13] */ + /* trunc_mirr=[14:14] */ + /* miss_prepend=[15:15] */ + /* Qword 1 */ + u8 status_err0_qw1; + u8 status_err1; + u8 fflags1; + u8 ts_low; + __le16 fmd0; + __le16 fmd1; + /* Qword 2 */ + __le16 fmd2; + u8 fflags2; + u8 hash3; + __le16 fmd3; + __le16 fmd4; + /* Qword 3 */ + __le16 fmd5; + __le16 fmd6; + __le16 fmd7_0; + __le16 fmd7_1; +}; /* writeback */ + +/* Rx Flex Descriptor Advanced (Split Queue Model) NIC Profile + * RxDID Profile Id 8 + * Flex-field 0: BufferID + * Flex-field 1: Raw checksum/L2TAG1/RSC Seg Len (determined by HW) + * Flex-field 2: Hash[15:0] + * Flex-flags 2: Hash[23:16] + * Flex-field 3: L2TAG2 + * Flex-field 5: L2TAG1 + * Flex-field 7: Timestamp (upper 32 bits) + */ +struct virtchnl_rx_flex_desc_adv_nic_3 { + /* Qword 0 */ + u8 rxdid_ucast; /* profile_id=[3:0] */ + /* rsvd=[5:4] */ + /* ucast=[7:6] */ + u8 status_err0_qw0; + __le16 ptype_err_fflags0; /* ptype=[9:0] */ + /* ip_hdr_err=[10:10] */ + /* udp_len_err=[11:11] */ + /* ff0=[15:12] */ + __le16 pktlen_gen_bufq_id; /* plen=[13:0] */ + /* gen=[14:14] only in splitq */ + /* bufq_id=[15:15] only in splitq */ + __le16 hdrlen_flags; /* header=[9:0] */ + /* rsc=[10:10] only in splitq */ + /* sph=[11:11] only in splitq */ + /* ext_udp_0=[12:12] */ + /* int_udp_0=[13:13] */ + /* trunc_mirr=[14:14] */ + /* miss_prepend=[15:15] */ + /* Qword 1 */ + u8 status_err0_qw1; + u8 status_err1; + u8 fflags1; + u8 ts_low; + __le16 buf_id; /* only in splitq */ + union { + __le16 raw_cs; + __le16 l2tag1; + __le16 rscseglen; + } misc; + /* Qword 2 */ + __le16 hash1; + union { + u8 fflags2; + u8 mirrorid; + u8 hash2; + } ff2_mirrid_hash2; + u8 hash3; + __le16 l2tag2; + __le16 fmd4; + /* Qword 3 */ + __le16 l2tag1; + __le16 fmd6; + __le32 ts_high; +}; /* writeback */ + +union virtchnl_rx_desc { + struct virtchnl_singleq_rx_buf_desc read; + struct virtchnl_singleq_base_rx_desc base_wb; + struct virtchnl_rx_flex_desc flex_wb; + struct virtchnl_rx_flex_desc_adv flex_wb_adv; +}; + +#endif /* _VIRTCHNL_LAN_DESC_H_ */ diff --git a/package/default/config.default_kasan b/package/default/config.default_kasan index 4590481565e2..3bf8deb99f58 100644 --- a/package/default/config.default_kasan +++ b/package/default/config.default_kasan @@ -2346,6 +2346,7 @@ CONFIG_I40E=m # CONFIG_I40E_DCB is not set CONFIG_IAVF=m CONFIG_I40EVF=m +CONFIG_ICE=m # CONFIG_ICE is not set # CONFIG_FM10K is not set # CONFIG_IGC is not set -- Gitee From b59fce6d7c0295de07c4f8d335e297891ca82471 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 12 Apr 2021 19:52:33 +0800 Subject: [PATCH 0002/2161] fix:Don't reclaim kvm mmu pages in mmu_shrink commit opencloudos. 
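[ Usage note, not part of the upstream backport: the knobs this patch
  exposes land under /proc/kvm_control/ via kvm_proc_interface_init()
  below. A minimal userspace sketch for inspecting the counters and
  raising the reclaim threshold, assuming those paths, might be: ]

#include <stdio.h>

static long read_knob(const char *path)
{
        long val = -1;
        FILE *f = fopen(path, "r");

        if (f) {
                if (fscanf(f, "%ld", &val) != 1)
                        val = -1;
                fclose(f);
        }
        return val;
}

int main(void)
{
        /* Threshold (in MMU pages) below which mmu_shrink_count() reports 0. */
        printf("limit_nr:      %ld\n", read_knob("/proc/kvm_control/kvmmmu_limit_nr"));
        /* How often the shrinker asked vs. how often pages were offered. */
        printf("try_times:     %ld\n", read_knob("/proc/kvm_control/kvmmmu_reclaim_try_times"));
        printf("reclaim_times: %ld\n", read_knob("/proc/kvm_control/kvmmmu_reclaim_times"));

        /* The write handler parses a plain decimal integer. */
        FILE *f = fopen("/proc/kvm_control/kvmmmu_limit_nr", "w");
        if (f) {
                fprintf(f, "%d\n", 389120);
                fclose(f);
        }
        return 0;
}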
Backport from c199f9f620e38a866f2b8f4f601995fee4a8dd9b Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/mmu.c | 26 ++++++++- include/linux/kvm_host.h | 4 ++ virt/kvm/kvm_main.c | 112 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 141 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 47c27c6e3842..5d097fb46dc9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -247,6 +247,22 @@ static struct kmem_cache *pte_list_desc_cache; static struct kmem_cache *mmu_page_header_cache; static struct percpu_counter kvm_total_used_mmu_pages; +/* +1) Each ept page table entry requires 8 bytes, and a physical page + can store 512 (4096/8) entries. Therefore, for a scenario with 4K pages, + the number of mmu pages required to map 760G physical memory is : + (760*2^30)/4096/512=389120. +2)The reason for mapping 760G is that the max physical memory + of host is 768G. +3)In fact, since THP is enabled, this number maybe never be reached. +*/ +u32 kvm_mmu_limit_nr = 389120; +EXPORT_SYMBOL(kvm_mmu_limit_nr); +u32 kvm_mmu_reclaim_try_times = 0; +EXPORT_SYMBOL(kvm_mmu_reclaim_try_times); +u32 kvm_mmu_reclaim_times = 0; +EXPORT_SYMBOL(kvm_mmu_reclaim_times); + static u64 __read_mostly shadow_nx_mask; static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ static u64 __read_mostly shadow_user_mask; @@ -6240,7 +6256,15 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) static unsigned long mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - return percpu_counter_read_positive(&kvm_total_used_mmu_pages); + unsigned long shrink_count = + percpu_counter_read_positive(&kvm_total_used_mmu_pages); + + kvm_mmu_reclaim_try_times++; + if (shrink_count < kvm_mmu_limit_nr) + return 0; + kvm_mmu_reclaim_times++; + + return shrink_count; } static struct shrinker mmu_shrinker = { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0ec4ec428b9b..3d0aee229958 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1245,6 +1245,10 @@ extern unsigned int halt_poll_ns_grow; extern unsigned int halt_poll_ns_grow_start; extern unsigned int halt_poll_ns_shrink; +extern u32 kvm_mmu_limit_nr; +extern u32 kvm_mmu_reclaim_try_times; +extern u32 kvm_mmu_reclaim_times; + struct kvm_device { struct kvm_device_ops *ops; struct kvm *kvm; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f83fa0aeeb45..4176f370ee2b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -4412,6 +4413,113 @@ static void check_processor_compat(void *rtn) *(int *)rtn = kvm_arch_check_processor_compat(); } +static int kvmmmu_limit_nr_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", kvm_mmu_limit_nr); + return 0; +} + +static int kvmmmu_limit_nr_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, kvmmmu_limit_nr_proc_show, NULL); +} + +static ssize_t kvmmmu_limit_nr_proc_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + char str[16] = {0}; + if (count > 1) { + int value; + int size = min((int)count - 1, 10); + + if (copy_from_user(str, buf, size)) + goto out; + + if (!kstrtoint(str, 10, &value)) { + if (value < 0) + value = 0; + kvm_mmu_limit_nr = value; + } + } + +out: + return count; +} + +static const struct file_operations kvmmmu_limit_nr_proc_fops = { + .open = kvmmmu_limit_nr_proc_open, + .read = seq_read, + .write = 
kvmmmu_limit_nr_proc_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int kvmmmu_reclaim_try_times_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", kvm_mmu_reclaim_try_times); + return 0; +} + +static int kvmmmu_reclaim_try_times_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, kvmmmu_reclaim_try_times_proc_show, NULL); +} + +static const struct file_operations kvmmmu_reclaim_try_times_proc_fops = { + .open = kvmmmu_reclaim_try_times_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int kvmmmu_reclaim_times_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", kvm_mmu_reclaim_times); + return 0; +} + +static int kvmmmu_reclaim_times_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, kvmmmu_reclaim_times_proc_show, NULL); +} + +static const struct file_operations kvmmmu_reclaim_times_proc_fops = { + .open = kvmmmu_reclaim_times_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void kvm_proc_interface_uninit(void) +{ + remove_proc_entry("kvm_control/kvmmmu_reclaim_times", NULL); + remove_proc_entry("kvm_control/kvmmmu_reclaim_try_times", NULL); + remove_proc_entry("kvm_control/kvmmmu_limit_nr", NULL); + remove_proc_entry("kvm_control", NULL); +} + +int kvm_proc_interface_init(void) +{ + int ret = -ENOMEM; + if (!proc_mkdir("kvm_control", NULL)) + goto err; + + if (!proc_create("kvm_control/kvmmmu_limit_nr", 0, NULL, &kvmmmu_limit_nr_proc_fops)) + goto err; + + if (!proc_create("kvm_control/kvmmmu_reclaim_try_times", 0, NULL, &kvmmmu_reclaim_try_times_proc_fops)) + goto err; + + if (!proc_create("kvm_control/kvmmmu_reclaim_times", 0, NULL, &kvmmmu_reclaim_times_proc_fops)) + goto err; + + return 0; +err: + kvm_proc_interface_uninit(); + return ret; +} + + int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, struct module *module) { @@ -4492,6 +4600,9 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, r = kvm_vfio_ops_init(); WARN_ON(r); + if (kvm_proc_interface_init()) + pr_err("kvm: create kvm proc control interface failed\n"); + return 0; out_unreg: @@ -4530,6 +4641,7 @@ void kvm_exit(void) kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); kvm_vfio_ops_exit(); + kvm_proc_interface_uninit(); } EXPORT_SYMBOL_GPL(kvm_exit); -- Gitee From fe47b2ca97015dca6db2db48e38257022899705f Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 12 Apr 2021 20:19:07 +0800 Subject: [PATCH 0003/2161] KVM: check pending exception before injecting APF commit opencloudos. Backport from dc20775f5680d6e49c7cd4e038f3e1267a386f4c Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 153659e8f403..4b369b518749 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -527,6 +527,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, prev_nr = vcpu->arch.exception.nr; if (prev_nr == DF_VECTOR) { /* triple fault -> shutdown */ + pr_info("inject a TF to VM, current_nr:%x\n", nr); + WARN_ON(1); kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -539,6 +541,9 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, * exception.pending = true so that the double fault * can trigger a nested vmexit. 
*/ + pr_info("inject a DF to VM, prev_nr:%x, current_nr:%x\n", + prev_nr, nr); + WARN_ON(1); vcpu->arch.exception.pending = true; vcpu->arch.exception.injected = false; vcpu->arch.exception.has_error_code = true; -- Gitee From d113f5d5d22aefcf8b5fc68be65c22123eeba875 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 13 Apr 2021 20:35:32 +0800 Subject: [PATCH 0004/2161] KVM: VMX & SVM: FIXED+PHYSICAL mode single target IPI fastpath commit opencloudos. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 10 ++++-- arch/x86/kvm/irq_comm.c | 6 ++-- arch/x86/kvm/lapic.c | 10 ++---- arch/x86/kvm/lapic.h | 8 +++++ arch/x86/kvm/svm.c | 25 ++++++++++++--- arch/x86/kvm/vmx/vmx.c | 26 ++++++++++----- arch/x86/kvm/x86.c | 56 +++++++++++++++++++++++++++++++-- arch/x86/kvm/x86.h | 2 ++ 8 files changed, 117 insertions(+), 26 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c52b7073a5ab..b7e643ee685b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -177,6 +177,11 @@ enum { VCPU_SREG_LDTR, }; +enum exit_fastpath_completion { + EXIT_FASTPATH_NONE, + EXIT_FASTPATH_SKIP_EMUL_INS, +}; + #include #define KVM_NR_MEM_OBJS 40 @@ -1073,8 +1078,9 @@ struct kvm_x86_ops { */ void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); - void (*run)(struct kvm_vcpu *vcpu); - int (*handle_exit)(struct kvm_vcpu *vcpu); + enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu); + int (*handle_exit)(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 5ddcaacef291..9b92a6bb277b 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -52,15 +52,15 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; unsigned int dest_vcpus = 0; + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) + return r; + if (irq->dest_mode == 0 && irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); irq->delivery_mode = APIC_DM_FIXED; } - if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) - return r; - memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); kvm_for_each_vcpu(i, vcpu, kvm) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3f6b866c644d..74044fecfccb 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -56,15 +56,9 @@ #define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) #define LAPIC_MMIO_LENGTH (1 << 12) /* followed define is not in apicdef.h */ -#define APIC_SHORT_MASK 0xc0000 -#define APIC_DEST_NOSHORT 0x0 -#define APIC_DEST_MASK 0x800 #define MAX_APIC_VECTOR 256 #define APIC_VECTORS_PER_REG 32 -#define APIC_BROADCAST 0xFF -#define X2APIC_BROADCAST 0xFFFFFFFFul - static bool lapic_timer_advance_dynamic __read_mostly; #define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */ #define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */ @@ -1195,7 +1189,7 @@ void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) } EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); -static void apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) +void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) { 
struct kvm_lapic_irq irq; @@ -1909,7 +1903,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_ICR: /* No delay here, so we always clear the pending bit */ val &= ~(1 << 12); - apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2)); + kvm_apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2)); kvm_lapic_set_reg(apic, APIC_ICR, val); break; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 1f5014852e20..f086829b87fe 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -10,12 +10,19 @@ #define KVM_APIC_SIPI 1 #define KVM_APIC_LVT_NUM 6 +#define APIC_SHORT_MASK 0xc0000 +#define APIC_DEST_NOSHORT 0x0 +#define APIC_DEST_MASK 0x800 + #define KVM_APIC_SHORT_MASK 0xc0000 #define KVM_APIC_DEST_MASK 0x800 #define APIC_BUS_CYCLE_NS 1 #define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS) +#define APIC_BROADCAST 0xFF +#define X2APIC_BROADCAST 0xFFFFFFFFul + enum lapic_mode { LAPIC_MODE_DISABLED = 0, LAPIC_MODE_INVALID = X2APIC_ENABLE, @@ -111,6 +118,7 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); +void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high); int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b9d14fdbd2d8..81af3f7e7068 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4978,7 +4978,8 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = control->exit_info_2; } -static int handle_exit(struct kvm_vcpu *vcpu) +static int handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; @@ -5036,7 +5037,10 @@ static int handle_exit(struct kvm_vcpu *vcpu) __func__, svm->vmcb->control.exit_int_info, exit_code); - if (exit_code >= ARRAY_SIZE(svm_exit_handlers) + if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { + skip_emulated_instruction(vcpu); + return 1; + } else if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || !svm_exit_handlers[exit_code]) { vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code); dump_vmcb(vcpu); @@ -5653,8 +5657,19 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); } -static void svm_vcpu_run(struct kvm_vcpu *vcpu) +static enum exit_fastpath_completion svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) +{ + if (!is_guest_mode(vcpu) && + to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + to_svm(vcpu)->vmcb->control.exit_info_1) + return handle_fastpath_set_msr_irqoff(vcpu); + + return EXIT_FASTPATH_NONE; +} + +static enum exit_fastpath_completion svm_vcpu_run(struct kvm_vcpu *vcpu) { + enum exit_fastpath_completion exit_fastpath; struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; @@ -5666,7 +5681,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) * again. */ if (unlikely(svm->nested.exit_required)) - return; + return EXIT_FASTPATH_NONE; /* * Disable singlestep if we're injecting an interrupt/exception. 
@@ -5845,6 +5860,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) stgi(); /* Any pending NMI will happen here */ + exit_fastpath = svm_exit_handlers_fastpath(vcpu); if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_after_interrupt(&svm->vcpu); @@ -5873,6 +5889,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) svm_handle_mce(svm); mark_all_clean(svm->vmcb); + return exit_fastpath; } STACK_FRAME_NON_STANDARD(svm_vcpu_run); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0564c05c1ce0..b5e865934b03 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5855,7 +5855,8 @@ void dump_vmcs(void) * The guest has exited. See if we can fix it or if we need userspace * assistance. */ -static int vmx_handle_exit(struct kvm_vcpu *vcpu) +static int vmx_handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exit_reason = vmx->exit_reason; @@ -5942,10 +5943,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) } } - if (exit_reason < kvm_vmx_max_exit_handlers - && kvm_vmx_exit_handlers[exit_reason]) + if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { + skip_emulated_instruction(vcpu); + return 1; + } else if (exit_reason < kvm_vmx_max_exit_handlers + && kvm_vmx_exit_handlers[exit_reason]) { return kvm_vmx_exit_handlers[exit_reason](vcpu); - else { + } else { vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason); dump_vmcs(); @@ -6478,10 +6482,11 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); -static void vmx_vcpu_run(struct kvm_vcpu *vcpu) +static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; + enum exit_fastpath_completion exit_fastpath; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && @@ -6491,7 +6496,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ if (vmx->emulation_required) - return; + return EXIT_FASTPATH_NONE; if (vmx->ple_window_dirty) { vmx->ple_window_dirty = false; @@ -6628,13 +6633,20 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) kvm_machine_check(); if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) - return; + return EXIT_FASTPATH_NONE; vmx->loaded_vmcs->launched = 1; vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); + + if (!is_guest_mode(vcpu) && vmx->exit_reason == EXIT_REASON_MSR_WRITE) + exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); + else + exit_fastpath = EXIT_FASTPATH_NONE; + + return exit_fastpath; } static struct kvm *vmx_vm_alloc(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4b369b518749..f76c6e140ce3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1611,6 +1611,57 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); +/* + * The fast path for frequent and performance sensitive wrmsr emulation, + * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces + * the latency of virtual IPI by avoiding the expensive bits of transitioning + * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the + * other cases which must be called after interrupts are enabled on the host. 
+ */ +static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) +{ + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) + return 1; + + if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && + ((data & KVM_APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && + ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && + ((u32)(data >> 32) != X2APIC_BROADCAST)) { + + data &= ~(1 << 12); + kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32)); + kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32)); + kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data); + trace_kvm_apic_write(APIC_ICR, (u32)data); + return 0; + } + return 1; +} + +enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) +{ + u32 msr = kvm_register_read(vcpu, VCPU_REGS_RCX); + u64 data; + int ret = 0; + + switch (msr) { + case APIC_BASE_MSR + (APIC_ICR >> 4): + data = kvm_read_edx_eax(vcpu); + ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); + break; + default: + return EXIT_FASTPATH_NONE; + } + + if (!ret) { + trace_kvm_msr_write(msr, data); + return EXIT_FASTPATH_SKIP_EMUL_INS; + } + + return EXIT_FASTPATH_NONE; +} +EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); + /* * Adapt set_msr() to msr_io()'s calling convention */ @@ -8057,6 +8108,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_int_win = dm_request_for_irq_injection(vcpu) && kvm_cpu_accept_dm_intr(vcpu); + enum exit_fastpath_completion exit_fastpath; bool req_immediate_exit = false; @@ -8275,7 +8327,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } - kvm_x86_ops->run(vcpu); + exit_fastpath = kvm_x86_ops->run(vcpu); /* * Do this here before restoring debug registers on the host. And @@ -8351,7 +8403,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_lapic_sync_from_vapic(vcpu); vcpu->arch.gpa_available = false; - r = kvm_x86_ops->handle_exit(vcpu); + r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath); return r; cancel_injection: diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c520d373790a..31cb84d1dbfe 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -289,6 +289,8 @@ bool kvm_vector_hashing_enabled(void); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); +enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); + #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ -- Gitee From 71c30408d357b023a0b21eb50bb889d215c55816 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 13 Apr 2021 20:58:04 +0800 Subject: [PATCH 0005/2161] Disable fastpath in svm commit opencloudos. 
Backport from 34332740e5cb1aff16dbbc86080eef46b316c09a Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/svm.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 81af3f7e7068..360844440c8d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5659,11 +5659,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static enum exit_fastpath_completion svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) - return handle_fastpath_set_msr_irqoff(vcpu); - return EXIT_FASTPATH_NONE; } -- Gitee From cf12ed35455669ede5d7f2309241145a4264ca6f Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 20 May 2021 19:27:11 +0800 Subject: [PATCH 0006/2161] KVM: x86: Move bit() helper to cpuid.h commit a0a2260c12d8658e522f21ed8ece72bbdede58fd upstream Move bit() to cpuid.h in preparation for incorporating the reverse_cpuid array in bit() build-time assertions. Opportunistically use the BIT() macro instead of open-coding the shift. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.h | 5 +++++ arch/x86/kvm/emulate.c | 1 + arch/x86/kvm/x86.h | 5 ----- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 7dec43b2c420..eefa7f88f71a 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -55,6 +55,11 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, }; +static inline u32 bit(int bitno) +{ + return BIT(bitno & 31); +} + static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) { unsigned x86_leaf = x86_feature / 32; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 60c8dcb907a5..3fb94199fb20 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -26,6 +26,7 @@ #include #include +#include "cpuid.h" #include "x86.h" #include "tss.h" #include "mmu.h" diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 31cb84d1dbfe..c1854a8ac6a4 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -144,11 +144,6 @@ static inline bool is_pae_paging(struct kvm_vcpu *vcpu) return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu); } -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu) { return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48; -- Gitee From 592a841e1297aad1d5f5ad3b3353adf0262eb911 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 17 Dec 2019 13:32:40 -0800 Subject: [PATCH 0007/2161] KVM: x86: Add CPUID_7_1_EAX to the reverse CPUID table commit daa0d8c3a48732e5f64c69cca4c597cab1dfd455 upstream Add an entry for CPUID_7_1_EAX in the reserve_cpuid array in preparation for incorporating the array in bit() build-time assertions, specifically to avoid an assertion on F(AVX512_BF16) in do_cpuid_7_mask(). No functional change intended. 
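[ Editor's illustration, not part of the upstream commit: a standalone
  sketch, with simplified types and made-up word numbers, of how the
  reverse_cpuid table maps an X86_FEATURE_*-style value (word in bits
  31:5, bit number in bits 4:0) back to a CPUID leaf/register/bit: ]

#include <stdint.h>
#include <stdio.h>

enum reg { R_EAX, R_EBX, R_ECX, R_EDX };

struct cpuid_reg {
        uint32_t function;      /* CPUID leaf */
        uint32_t index;         /* sub-leaf */
        enum reg reg;
};

/* Reverse table indexed by feature "word"; contents are illustrative only. */
static const struct cpuid_reg reverse_cpuid[] = {
        [0] = { 0x1, 0, R_EDX },
        [1] = { 0x7, 0, R_EBX },
        [2] = { 0x7, 1, R_EAX },
};

/* Mirrors __feature_bit(): the bit number lives in the low 5 bits. */
static uint32_t feature_mask(unsigned int x86_feature)
{
        return 1u << (x86_feature & 31);
}

/* Mirrors x86_feature_cpuid(): the word selects the CPUID leaf/register. */
static struct cpuid_reg feature_leaf(unsigned int x86_feature)
{
        return reverse_cpuid[x86_feature / 32];
}

int main(void)
{
        unsigned int fake_feature = 1 * 32 + 20;        /* word 1, bit 20 */
        struct cpuid_reg leaf = feature_leaf(fake_feature);

        printf("test CPUID.(EAX=%#x,ECX=%u) reg %d against mask %#x\n",
               leaf.function, leaf.index, leaf.reg, feature_mask(fake_feature));
        return 0;
}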
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index eefa7f88f71a..f2e1cc91609d 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -53,6 +53,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_ECX] = { 7, 0, CPUID_ECX}, [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX}, [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, + [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX}, }; static inline u32 bit(int bitno) -- Gitee From 54fd49e3f9869916b19b7cb2c7e160bf29a9bfe8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 17 Dec 2019 13:32:41 -0800 Subject: [PATCH 0008/2161] KVM: x86: Expand build-time assertion on reverse CPUID usage commit a7c48c3f56db8d18d15ee6d8c2c5e2da447c77cc upstream Add build-time checks to ensure KVM isn't trying to do a reverse CPUID lookup on Linux-defined feature bits, along with comments to explain the gory details of X86_FEATUREs and bit(). No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 3 ++- arch/x86/kvm/cpuid.h | 33 ++++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6fa946f983c9..cab56fa74fab 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -281,8 +281,9 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, return r; } -static void cpuid_mask(u32 *word, int wordnum) +static __always_inline void cpuid_mask(u32 *word, int wordnum) { + reverse_cpuid_check(wordnum); *word &= boot_cpu_data.x86_capability[wordnum]; } diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index f2e1cc91609d..5bfeacf1c998 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -56,18 +56,41 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX}, }; -static inline u32 bit(int bitno) +/* + * Reverse CPUID and its derivatives can only be used for hardware-defined + * feature words, i.e. words whose bits directly correspond to a CPUID leaf. + * Retrieving a feature bit or masking guest CPUID from a Linux-defined word + * is nonsensical as the bit number/mask is an arbitrary software-defined value + * and can't be used by KVM to query/control guest capabilities. And obviously + * the leaf being queried must have an entry in the lookup table. + */ +static __always_inline void reverse_cpuid_check(unsigned x86_leaf) { - return BIT(bitno & 31); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_1); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_2); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_3); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_4); + BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); + BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); +} + +/* + * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain + * the hardware defined bit number (stored in bits 4:0) and a software defined + * "word" (stored in bits 31:5). The word is used to index into arrays of + * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has(). 
+ */ +static __always_inline u32 bit(int x86_feature) +{ + reverse_cpuid_check(x86_feature / 32); + return 1 << (x86_feature & 31); } static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) { unsigned x86_leaf = x86_feature / 32; - BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); - BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); - + reverse_cpuid_check(x86_leaf); return reverse_cpuid[x86_leaf]; } -- Gitee From 372704e055a34623fe24fc0c2cd842aba0984b86 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 20 May 2021 19:52:39 +0800 Subject: [PATCH 0009/2161] KVM: x86: Refactor and rename bit() to feature_bit() macro commit 87382003e3555926017228452dae7e7064b0f915 upstream Rename bit() to __feature_bit() to give it a more descriptive name, and add a macro, feature_bit(), to stuff the X68_FEATURE_ prefix to keep line lengths manageable for code that hardcodes the bit to be retrieved. No functional change intended. Cc: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/cpuid.h | 8 +++++--- arch/x86/kvm/emulate.c | 4 ++-- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 40 ++++++++++++++++++++-------------------- arch/x86/kvm/x86.c | 2 +- 6 files changed, 31 insertions(+), 29 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index cab56fa74fab..cb3332ea72e3 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -62,7 +62,7 @@ u64 kvm_supported_xcr0(void) return xcr0; } -#define F(x) bit(X86_FEATURE_##x) +#define F feature_bit int kvm_update_cpuid(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 5bfeacf1c998..072a7a1e7624 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -80,12 +80,14 @@ static __always_inline void reverse_cpuid_check(unsigned x86_leaf) * "word" (stored in bits 31:5). The word is used to index into arrays of * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has(). 
*/ -static __always_inline u32 bit(int x86_feature) +static __always_inline u32 __feature_bit(int x86_feature) { reverse_cpuid_check(x86_feature / 32); return 1 << (x86_feature & 31); } +#define feature_bit(name) __feature_bit(X86_FEATURE_##name) + static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) { unsigned x86_leaf = x86_feature / 32; @@ -130,7 +132,7 @@ static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_ if (!reg) return false; - return *reg & bit(x86_feature); + return *reg & __feature_bit(x86_feature); } static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x86_feature) @@ -139,7 +141,7 @@ static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x8 reg = guest_cpuid_get_register(vcpu, x86_feature); if (reg) - *reg &= ~bit(x86_feature); + *reg &= ~__feature_bit(x86_feature); } static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 3fb94199fb20..795be3d7f8f1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2383,7 +2383,7 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) eax = 0x80000001; ecx = 0; ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); - return edx & bit(X86_FEATURE_LM); + return edx & feature_bit(LM); #else return false; #endif @@ -3652,7 +3652,7 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -#define FFL(x) bit(X86_FEATURE_##x) +#define FFL(x) __feature_bit(X86_FEATURE_##x) static int em_movbe(struct x86_emulate_ctxt *ctxt) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 360844440c8d..191ed75b7457 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5971,14 +5971,14 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC); } -#define F(x) bit(X86_FEATURE_##x) +#define F feature_bit static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { switch (func) { case 0x1: if (avic) - entry->ecx &= ~bit(X86_FEATURE_X2APIC); + entry->ecx &= ~F(X2APIC); break; case 0x80000001: if (nested) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b5e865934b03..0f1550b7f631 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6961,27 +6961,27 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) } while (0) entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); - cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME)); - cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME)); - cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC)); - cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE)); - cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE)); - cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE)); - cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE)); - cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE)); - cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR)); - cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM)); - cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX)); - cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX)); - cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID)); - cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE)); + cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); + cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); + cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); + cr4_fixed1_update(X86_CR4_DE, edx, 
feature_bit(DE)); + cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE)); + cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE)); + cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE)); + cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE)); + cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR)); + cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM)); + cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX)); + cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX)); + cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); + cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); - cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE)); - cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); - cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); - cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); - cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP)); + cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); + cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); + cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); + cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); + cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); #undef cr4_fixed1_update } @@ -7101,7 +7101,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { if (func == 1 && nested) - entry->ecx |= bit(X86_FEATURE_VMX); + entry->ecx |= feature_bit(VMX); } static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f76c6e140ce3..a4b128c1c091 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -936,7 +936,7 @@ static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) reserved_bits |= X86_CR4_PKE; if (!cpu_has(c, X86_FEATURE_LA57) && - !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57))) + !(cpuid_ecx(0x7) & feature_bit(LA57))) reserved_bits |= X86_CR4_LA57; if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated()) -- Gitee From 89b4c47a3eb1f0ffb3f553f731634fe1914323ae Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:04 -0800 Subject: [PATCH 0010/2161] KVM: x86: Return -E2BIG when KVM_GET_SUPPORTED_CPUID hits max entries commit 68c9a46e9ee8e9f673c7e0d1bbf483a289b5a86f upstream Fix a long-standing bug that causes KVM to return 0 instead of -E2BIG when userspace's array is insufficiently sized. This technically breaks backwards compatibility, e.g. a userspace with a hardcoded cpuid->nent could theoretically be broken as it would see an error instead of success if cpuid->nent is less than the number of entries required to fully enumerate the host CPU. But, the lowest known cpuid->nent hardcoded by a VMM is 100 (lkvm and selftests), and the limit for current processors on Intel and AMD is well under a 100. E.g. Intel's Icelake server with all the bells and whistles tops out at ~60 entries (variable due to SGX sub-leafs), and AMD's CPUID documentation allows for less than 50. CPUID 0xD sub-leaves on current kernels are capped by the value of KVM_SUPPORTED_XCR0, and therefore so many subleaves cannot have appeared on current kernels. Note, while the Fixes: tag is accurate with respect to the immediate bug, it's likely that similar bugs in KVM_GET_SUPPORTED_CPUID existed prior to the refactoring, e.g. Qemu contains a workaround for the broken KVM_GET_SUPPORTED_CPUID behavior that predates the buggy commit by over two years. 
The Qemu workaround is also likely the main reason the bug has gone unreported for so long. Qemu hack: commit 76ae317f7c16aec6b469604b1764094870a75470 Author: Mark McLoughlin Date: Tue May 19 18:55:21 2009 +0100 kvm: work around supported cpuid ioctl() brokenness KVM_GET_SUPPORTED_CPUID has been known to fail to return -E2BIG when it runs out of entries. Detect this by always trying again with a bigger table if the ioctl() fills the table. Fixes: 831bf664e9c1f ("KVM: Refactor and simplify kvm_dev_ioctl_get_supported_cpuid") Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index cb3332ea72e3..aa0a0102bb7c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -905,9 +905,14 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, goto out_free; limit = cpuid_entries[nent - 1].eax; - for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func) + for (func = ent->func + 1; func <= limit && r == 0; ++func) { + if (nent >= cpuid->nent) { + r = -E2BIG; + goto out_free; + } r = do_cpuid_func(&cpuid_entries[nent], func, &nent, cpuid->nent, type); + } if (r) goto out_free; -- Gitee From 737055f7dcfbdf3c4ec628ba567eedf7eba89006 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:05 -0800 Subject: [PATCH 0011/2161] KVM: x86: Refactor loop around do_cpuid_func() to separate helper commit 619a17f11069ce3d338e00a621f57dd8dec164c2 upstream Move the guts of kvm_dev_ioctl_get_cpuid()'s CPUID func loop to a separate helper to improve code readability and pave the way for future cleanup. No functional change intended. 
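The refactor keeps the -E2BIG contract described above intact: userspace that undersizes its array gets an error and is expected to retry with more room, which is what the quoted Qemu workaround does by hand. A minimal user-space sketch of that calling pattern follows; the ioctl and structures are the standard KVM UAPI, while the starting size and error handling here are illustrative only.

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Query KVM_GET_SUPPORTED_CPUID, growing the array until it fits. */
    static struct kvm_cpuid2 *get_supported_cpuid(int kvm_fd)
    {
        struct kvm_cpuid2 *cpuid;
        int nent = 64;          /* illustrative starting size */

        for (;;) {
            cpuid = calloc(1, sizeof(*cpuid) +
                              nent * sizeof(struct kvm_cpuid_entry2));
            if (!cpuid)
                return NULL;
            cpuid->nent = nent;

            if (ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid) == 0)
                return cpuid;   /* cpuid->nent now holds the real count */

            free(cpuid);
            if (errno != E2BIG) /* with this fix, "too small" is -E2BIG */
                return NULL;
            nent *= 2;          /* try again with a bigger table */
        }
    }

    int main(void)
    {
        int kvm_fd = open("/dev/kvm", O_RDWR);
        struct kvm_cpuid2 *cpuid;

        if (kvm_fd < 0)
            return 1;
        cpuid = get_supported_cpuid(kvm_fd);
        if (cpuid)
            printf("supported CPUID entries: %u\n", cpuid->nent);
        free(cpuid);
        return 0;
    }

With the fix in place, the loop terminates either with a correctly sized table or with a hard error, never with a silently truncated one.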
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 45 ++++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index aa0a0102bb7c..4df017308577 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -836,6 +836,29 @@ static bool is_centaur_cpu(const struct kvm_cpuid_param *param) return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; } +static int get_cpuid_func(struct kvm_cpuid_entry2 *entries, u32 func, + int *nent, int maxnent, unsigned int type) +{ + u32 limit; + int r; + + r = do_cpuid_func(&entries[*nent], func, nent, maxnent, type); + if (r) + return r; + + limit = entries[*nent - 1].eax; + for (func = func + 1; func <= limit; ++func) { + if (*nent >= maxnent) + return -E2BIG; + + r = do_cpuid_func(&entries[*nent], func, nent, maxnent, type); + if (r) + break; + } + + return r; +} + static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries, __u32 num_entries, unsigned int ioctl_type) { @@ -868,8 +891,8 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, unsigned int type) { struct kvm_cpuid_entry2 *cpuid_entries; - int limit, nent = 0, r = -E2BIG, i; - u32 func; + int nent = 0, r = -E2BIG, i; + static const struct kvm_cpuid_param param[] = { { .func = 0 }, { .func = 0x80000000 }, @@ -898,22 +921,8 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, if (ent->qualifier && !ent->qualifier(ent)) continue; - r = do_cpuid_func(&cpuid_entries[nent], ent->func, - &nent, cpuid->nent, type); - - if (r) - goto out_free; - - limit = cpuid_entries[nent - 1].eax; - for (func = ent->func + 1; func <= limit && r == 0; ++func) { - if (nent >= cpuid->nent) { - r = -E2BIG; - goto out_free; - } - r = do_cpuid_func(&cpuid_entries[nent], func, - &nent, cpuid->nent, type); - } - + r = get_cpuid_func(cpuid_entries, ent->func, &nent, + cpuid->nent, type); if (r) goto out_free; } -- Gitee From 25021294e7175e5e8b307ac2a05b0296a00e9b45 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 19 May 2021 10:46:53 +0800 Subject: [PATCH 0012/2161] KVM: x86: Simplify handling of Centaur CPUID leafs commit 8b86079cc339a7f1f94e92d8469d7fb69f8879c2 upstream Refactor the handling of the Centaur-only CPUID leaf to detect the leaf via a runtime query instead of adding a one-off callback in the static array. When the callback was introduced, there were additional fields in the array's structs, and more importantly, retpoline wasn't a thing. No functional change intended. 
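Reduced to its essentials, the change swaps a per-entry qualifier callback for a flat list of leaves plus one direct vendor test, which avoids an indirect call on the lookup path. A stand-alone sketch with simplified stand-ins (the vendor probe and printout are placeholders, not kernel code):

    #include <stdbool.h>
    #include <stdio.h>

    #define CENTAUR_CPUID_SIGNATURE 0xC0000000u

    /* Illustrative stand-in for the boot CPU vendor check. */
    static bool host_is_centaur(void)
    {
        return false;
    }

    /* After the refactor: a flat list of leaves... */
    static const unsigned int funcs[] = {
        0, 0x80000000u, CENTAUR_CPUID_SIGNATURE,
    };

    /* ...and one direct, predictable test instead of a callback. */
    static int get_cpuid_func(unsigned int func)
    {
        if (func == CENTAUR_CPUID_SIGNATURE && !host_is_centaur())
            return 0;       /* silently skip the Centaur-only leaf */

        printf("enumerate leaf 0x%x\n", func);
        return 0;
    }

    int main(void)
    {
        for (unsigned int i = 0; i < sizeof(funcs) / sizeof(funcs[0]); i++)
            get_cpuid_func(funcs[i]);
        return 0;
    }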
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 4df017308577..997921483b96 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -826,15 +826,7 @@ static int do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 func, #undef F -struct kvm_cpuid_param { - u32 func; - bool (*qualifier)(const struct kvm_cpuid_param *param); -}; - -static bool is_centaur_cpu(const struct kvm_cpuid_param *param) -{ - return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; -} +#define CENTAUR_CPUID_SIGNATURE 0xC0000000 static int get_cpuid_func(struct kvm_cpuid_entry2 *entries, u32 func, int *nent, int maxnent, unsigned int type) @@ -842,6 +834,10 @@ static int get_cpuid_func(struct kvm_cpuid_entry2 *entries, u32 func, u32 limit; int r; + if (func == CENTAUR_CPUID_SIGNATURE && + boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR) + return 0; + r = do_cpuid_func(&entries[*nent], func, nent, maxnent, type); if (r) return r; @@ -893,11 +889,8 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *cpuid_entries; int nent = 0, r = -E2BIG, i; - static const struct kvm_cpuid_param param[] = { - { .func = 0 }, - { .func = 0x80000000 }, - { .func = 0xC0000000, .qualifier = is_centaur_cpu }, - { .func = KVM_CPUID_SIGNATURE }, + static const u32 funcs[] = { + 0, 0x80000000, CENTAUR_CPUID_SIGNATURE, KVM_CPUID_SIGNATURE, }; if (cpuid->nent < 1) @@ -915,14 +908,9 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, goto out; r = 0; - for (i = 0; i < ARRAY_SIZE(param); i++) { - const struct kvm_cpuid_param *ent = ¶m[i]; - - if (ent->qualifier && !ent->qualifier(ent)) - continue; - - r = get_cpuid_func(cpuid_entries, ent->func, &nent, - cpuid->nent, type); + for (i = 0; i < ARRAY_SIZE(funcs); i++) { + r = get_cpuid_func(cpuid_entries, funcs[i], &nent, cpuid->nent, + type); if (r) goto out_free; } -- Gitee From bb62de80fd2b936c53004450283c80a014d15212 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:07 -0800 Subject: [PATCH 0013/2161] KVM: x86: Clean up error handling in kvm_dev_ioctl_get_cpuid() commit d5a661d19df15cc618bb69e43cab3edb06d2021f upstream Clean up the error handling in kvm_dev_ioctl_get_cpuid(), which has gotten a bit crusty as the function has evolved over the years. Opportunistically hoist the static @funcs declaration to the top of the function to make it more obvious that it's a "static const". No functional change intended. 
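The cleanup follows a common idiom that is easy to state in isolation: fail fast before anything is allocated, and keep goto-based unwinding only for the one resource that was actually acquired. A schematic sketch with toy types, not the KVM code:

    #include <stdio.h>
    #include <stdlib.h>

    struct entry { unsigned int function, index; };

    /* Placeholder for the real enumeration step. */
    static int fill_entries(struct entry *e, int maxnent, int *nent)
    {
        if (maxnent < 1)
            return -1;
        e[0].function = 0;
        e[0].index = 0;
        *nent = 1;
        return 0;
    }

    static int get_entries(struct entry *out, int maxnent)
    {
        struct entry *entries;
        int nent = 0, r;

        if (maxnent < 1)            /* precondition: return before allocating */
            return -1;

        entries = calloc(maxnent, sizeof(*entries));
        if (!entries)
            return -1;

        r = fill_entries(entries, maxnent, &nent);
        if (r)
            goto out_free;          /* one label, one resource to release */

        for (int i = 0; i < nent; i++)
            out[i] = entries[i];

    out_free:
        free(entries);
        return r;
    }

    int main(void)
    {
        struct entry out[4];

        printf("get_entries: %d\n", get_entries(out, 4));
        return 0;
    }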
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 997921483b96..b0c166ed2a55 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -886,45 +886,40 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type) { - struct kvm_cpuid_entry2 *cpuid_entries; - int nent = 0, r = -E2BIG, i; - static const u32 funcs[] = { 0, 0x80000000, CENTAUR_CPUID_SIGNATURE, KVM_CPUID_SIGNATURE, }; + struct kvm_cpuid_entry2 *cpuid_entries; + int nent = 0, r, i; + if (cpuid->nent < 1) - goto out; + return -E2BIG; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) cpuid->nent = KVM_MAX_CPUID_ENTRIES; if (sanity_check_entries(entries, cpuid->nent, type)) return -EINVAL; - r = -ENOMEM; cpuid_entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2), cpuid->nent)); if (!cpuid_entries) - goto out; + return -ENOMEM; - r = 0; for (i = 0; i < ARRAY_SIZE(funcs); i++) { r = get_cpuid_func(cpuid_entries, funcs[i], &nent, cpuid->nent, type); if (r) goto out_free; } + cpuid->nent = nent; - r = -EFAULT; if (copy_to_user(entries, cpuid_entries, nent * sizeof(struct kvm_cpuid_entry2))) - goto out_free; - cpuid->nent = nent; - r = 0; + r = -EFAULT; out_free: vfree(cpuid_entries); -out: return r; } -- Gitee From 9981a718c2720cedd9a013ff8c54690f7ba80faf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:08 -0800 Subject: [PATCH 0014/2161] KVM: x86: Check userspace CPUID array size after validating sub-leaf commit 0fc62671876c29a80c16d41cbce782ff10795bef upstream Verify that the next sub-leaf of CPUID 0x4 (or 0x8000001d) is valid before rejecting the entire KVM_GET_SUPPORTED_CPUID due to insufficent space in the userspace array. Note, although this is technically a bug, it's not visible to userspace as KVM_GET_SUPPORTED_CPUID is guaranteed to fail on KVM_CPUID_SIGNATURE, which is hardcoded to be added after the affected leafs. The real motivation for the change is to tightly couple the nent/maxnent and do_host_cpuid() sequences in preparation for future cleanup. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b0c166ed2a55..d9b4d5c0feb7 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -552,12 +552,12 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, /* read more entries until cache_type is zero */ for (i = 1; ; ++i) { - if (*nent >= maxnent) - goto out; - cache_type = entry[i - 1].eax & 0x1f; if (!cache_type) break; + + if (*nent >= maxnent) + goto out; do_host_cpuid(&entry[i], function, i); ++*nent; } -- Gitee From 6178cf7bb7a0bcedd42e082fa98b5183f9a6f75a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:09 -0800 Subject: [PATCH 0015/2161] KVM: x86: Move CPUID 0xD.1 handling out of the index>0 loop commit 3dc4a9cf05e53c358816f733e383ec42c86f3d4c upstream Mov the sub-leaf 1 handling for CPUID 0xD out of the index>0 loop so that the loop only handles index>2. Sub-leafs 2+ have identical semantics, whereas sub-leaf 1 is effectively a feature sub-leaf. 
Moving sub-leaf 1 out of the loop does duplicate a bit of code, but the nent/maxnent code will be consolidated in a future patch, and duplicating the clear of ECX/EDX is arguably a good thing as the reasons for clearing said registers are completely different. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index d9b4d5c0feb7..ae6d8362f0e4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -653,26 +653,33 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, if (!supported) break; - for (idx = 1, i = 1; idx < 64; ++idx) { + if (*nent >= maxnent) + goto out; + + do_host_cpuid(&entry[1], function, 1); + ++*nent; + + entry[1].eax &= kvm_cpuid_D_1_eax_x86_features; + cpuid_mask(&entry[1].eax, CPUID_D_1_EAX); + if (entry[1].eax & (F(XSAVES)|F(XSAVEC))) + entry[1].ebx = xstate_required_size(supported, true); + else + entry[1].ebx = 0; + /* Saving XSS controlled state via XSAVES isn't supported. */ + entry[1].ecx = 0; + entry[1].edx = 0; + + for (idx = 2, i = 2; idx < 64; ++idx) { u64 mask = ((u64)1 << idx); + if (*nent >= maxnent) goto out; do_host_cpuid(&entry[i], function, idx); - if (idx == 1) { - entry[i].eax &= kvm_cpuid_D_1_eax_x86_features; - cpuid_mask(&entry[i].eax, CPUID_D_1_EAX); - entry[i].ebx = 0; - if (entry[i].eax & (F(XSAVES)|F(XSAVEC))) - entry[i].ebx = - xstate_required_size(supported, - true); - } else { - if (entry[i].eax == 0 || !(supported & mask)) - continue; - if (WARN_ON_ONCE(entry[i].ecx & 1)) - continue; - } + if (entry[i].eax == 0 || !(supported & mask)) + continue; + if (WARN_ON_ONCE(entry[i].ecx & 1)) + continue; entry[i].ecx = 0; entry[i].edx = 0; ++*nent; -- Gitee From 4a85ccd75a3f53609043876a42533566841850b8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:10 -0800 Subject: [PATCH 0016/2161] KVM: x86: Check for CPUID 0xD.N support before validating array size commit 1893c9415ae8cad20da863c41bdf308e56a2dd67 upstream Now that sub-leaf 1 is handled separately, verify the next sub-leaf is needed before rejecting KVM_GET_SUPPORTED_CPUID due to an insufficiently sized userspace array. Note, although this is technically a bug, it's not visible to userspace as KVM_GET_SUPPORTED_CPUID is guaranteed to fail on KVM_CPUID_SIGNATURE, which is hardcoded to be added after leaf 0xD. The real motivation for the change is to tightly couple the nent/maxnent and do_host_cpuid() sequences in preparation for future cleanup. 
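The ordering this establishes — consult the supported mask first, charge the output array second — can be shown with a synthetic mask and a fixed-size array in place of the real XCR0 plumbing:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_OUT 4

    int main(void)
    {
        uint64_t supported = (1ULL << 2) | (1ULL << 5) | (1ULL << 9);
        unsigned int out[MAX_OUT];
        int nent = 0;

        for (int idx = 2; idx < 64; ++idx) {
            if (!(supported & (1ULL << idx)))
                continue;               /* free: no array slot consumed */

            if (nent >= MAX_OUT) {      /* only now does capacity matter */
                fprintf(stderr, "out of room at sub-leaf %d\n", idx);
                return 1;
            }
            out[nent++] = idx;
        }

        printf("emitted %d sub-leaves\n", nent);
        return 0;
    }

Because unsupported sub-leaves are skipped before the capacity check, the caller's array is never rejected on account of entries that would not have been written anyway.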
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ae6d8362f0e4..59203a505047 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -670,13 +670,14 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, entry[1].edx = 0; for (idx = 2, i = 2; idx < 64; ++idx) { - u64 mask = ((u64)1 << idx); + if (!(supported & BIT_ULL(idx))) + continue; if (*nent >= maxnent) goto out; do_host_cpuid(&entry[i], function, idx); - if (entry[i].eax == 0 || !(supported & mask)) + if (entry[i].eax == 0) continue; if (WARN_ON_ONCE(entry[i].ecx & 1)) continue; -- Gitee From 459b25e59b6d4bc14111eee0740167276290f4ef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:11 -0800 Subject: [PATCH 0017/2161] KVM: x86: Warn on zero-size save state for valid CPUID 0xD.N sub-leaf commit 91001d403ad39b9c800a3b805bf48c5b027ecac5 upstream WARN if the save state size for a valid XCR0-managed sub-leaf is zero, which would indicate a KVM or CPU bug. Add a comment to explain why KVM WARNs so the reader doesn't have to tease out the relevant bits from Intel's SDM and KVM's XCR0/XSS code. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 59203a505047..5ee86f907242 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -677,10 +677,17 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, goto out; do_host_cpuid(&entry[i], function, idx); - if (entry[i].eax == 0) - continue; - if (WARN_ON_ONCE(entry[i].ecx & 1)) + + /* + * The @supported check above should have filtered out + * invalid sub-leafs as well as sub-leafs managed by + * IA32_XSS MSR. Only XCR0-managed sub-leafs should + * reach this point, and they should have a non-zero + * save state size. + */ + if (WARN_ON_ONCE(!entry[i].eax || (entry[i].ecx & 1))) continue; + entry[i].ecx = 0; entry[i].edx = 0; ++*nent; -- Gitee From 16a89ea09efa5bb6361c1334164c88e7985ee5a7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:12 -0800 Subject: [PATCH 0018/2161] KVM: x86: Refactor CPUID 0xD.N sub-leaf entry creation commit 8b2fc445a7619fec55740d0a5785aae4bd1058f3 upstream Increment the number of CPUID entries immediately after do_host_cpuid() in preparation for moving the logic into do_host_cpuid(). Handle the rare/impossible case of encountering a bogus sub-leaf by decrementing the number entries on failure. 
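The bookkeeping amounts to a small pattern: claim the slot as soon as it is written and hand it back in the unlikely branch that rejects the entry. A minimal sketch with a synthetic validity test standing in for the real WARN_ON_ONCE condition:

    #include <stdio.h>

    struct entry { int idx; int size; };

    int main(void)
    {
        struct entry entries[8];
        int nent = 0;

        for (int idx = 2; idx < 10; ++idx) {
            struct entry *e = &entries[nent];

            e->idx = idx;
            e->size = (idx == 7) ? 0 : idx * 16;    /* 0 = bogus sub-leaf */
            ++nent;                                  /* claim the slot */

            if (e->size == 0) {
                --nent;                              /* rare case: give it back */
                continue;
            }
        }

        printf("kept %d entries\n", nent);
        return 0;
    }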
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 5ee86f907242..d7f2fb086e08 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -677,6 +677,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, goto out; do_host_cpuid(&entry[i], function, idx); + ++*nent; /* * The @supported check above should have filtered out @@ -685,12 +686,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, * reach this point, and they should have a non-zero * save state size. */ - if (WARN_ON_ONCE(!entry[i].eax || (entry[i].ecx & 1))) + if (WARN_ON_ONCE(!entry[i].eax || (entry[i].ecx & 1))) { + --*nent; continue; + } entry[i].ecx = 0; entry[i].edx = 0; - ++*nent; ++i; } break; -- Gitee From 137228dba78212f08012a85c0d2e5598d70443eb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:13 -0800 Subject: [PATCH 0019/2161] KVM: x86: Clean up CPUID 0x7 sub-leaf loop commit 87849b1ccbd404eff78b54d8e4dd831e4a905cbb upstream Refactor the sub-leaf loop for CPUID 0x7 to move the main leaf out of said loop. The emitted code savings is basically a mirage, as the handling of the main leaf can easily be split to its own helper to avoid code bloat. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index d7f2fb086e08..c54f7bfbed78 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -573,16 +573,16 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, case 7: { int i; - for (i = 0; ; ) { - do_cpuid_7_mask(&entry[i], i); - if (i == entry->eax) - break; + do_cpuid_7_mask(entry, 0); + + for (i = 1; i <= entry->eax; i++) { if (*nent >= maxnent) goto out; - ++i; do_host_cpuid(&entry[i], function, i); ++*nent; + + do_cpuid_7_mask(&entry[i], i); } break; } -- Gitee From 28ddec1f7ea07bf561806448460a8e2744f9c71d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:14 -0800 Subject: [PATCH 0020/2161] KVM: x86: Drop the explicit @index from do_cpuid_7_mask() commit aceac6e5700f4bdfb35fd558ae60176be24aa482 upstream Drop the index param from do_cpuid_7_mask() and instead switch on the entry's index, which is guaranteed to be set by do_host_cpuid(). No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c54f7bfbed78..350490943757 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -346,7 +346,7 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, return 0; } -static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) +static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) { unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; unsigned f_mpx = kvm_mpx_supported() ? 
F(MPX) : 0; @@ -380,7 +380,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) const u32 kvm_cpuid_7_1_eax_x86_features = F(AVX512_BF16); - switch (index) { + switch (entry->index) { case 0: entry->eax = min(entry->eax, 1u); entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; @@ -573,7 +573,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, case 7: { int i; - do_cpuid_7_mask(entry, 0); + do_cpuid_7_mask(entry); for (i = 1; i <= entry->eax; i++) { if (*nent >= maxnent) @@ -582,7 +582,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, do_host_cpuid(&entry[i], function, i); ++*nent; - do_cpuid_7_mask(&entry[i], i); + do_cpuid_7_mask(&entry[i]); } break; } -- Gitee From 667f7abc74a140ddabb88eb73b687c59f9df0e08 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:15 -0800 Subject: [PATCH 0021/2161] KVM: x86: Drop redundant boot cpu checks on SSBD feature bits commit acfad336ecf97ccad43346d7a82c60c3b21dbde0 upstream Drop redundant checks when "emulating" SSBD feature across vendors, i.e. advertising the AMD variant when running on an Intel CPU and vice versa. Both SPEC_CTRL_SSBD and AMD_SSBD are already defined in the leaf-specific feature masks and are *not* forcefully set by the kernel, i.e. will already be set in the entry when supported by the host. Functionally, this changes nothing, but the redundant check is confusing, especially when considering future patches that will further differentiate between "real" and "emulated" feature bits. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 350490943757..5df29937e557 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -405,8 +405,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) entry->edx |= F(SPEC_CTRL); if (boot_cpu_has(X86_FEATURE_STIBP)) entry->edx |= F(INTEL_STIBP); - if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - boot_cpu_has(X86_FEATURE_AMD_SSBD)) + if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) entry->edx |= F(SPEC_CTRL_SSBD); /* * We emulate ARCH_CAPABILITIES in software even @@ -780,8 +779,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, entry->ebx |= F(AMD_IBRS); if (boot_cpu_has(X86_FEATURE_STIBP)) entry->ebx |= F(AMD_STIBP); - if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - boot_cpu_has(X86_FEATURE_AMD_SSBD)) + if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD)) entry->ebx |= F(AMD_SSBD); if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) entry->ebx |= F(AMD_SSB_NO); -- Gitee From fa55fefc6efbef496d9eab8d4c311a9b3f0a908a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:16 -0800 Subject: [PATCH 0022/2161] KVM: x86: Consolidate CPUID array max num entries checking commit aa10a7dc8858f6cbd1ef5cb86693592896ee27c7 upstream Move the nent vs. maxnent check and nent increment into do_host_cpuid() to consolidate what is now identical code. To signal success vs. failure, return the entry and NULL respectively. A future patch will build on this to also move the entry retrieval into do_host_cpuid(). No functional change intended. 
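Stripped down, the consolidation means the capacity check and the count increment live in one helper that returns either the next slot or NULL, so every caller collapses to a NULL test. A sketch with a toy array type rather than the kernel structures:

    #include <stddef.h>
    #include <stdio.h>

    struct entry { unsigned int function, index; };

    struct entry_array {
        struct entry entries[4];
        int maxnent;
        int nent;
    };

    /* One place owns the bounds check and the increment. */
    static struct entry *next_entry(struct entry_array *a,
                                    unsigned int function, unsigned int index)
    {
        struct entry *e;

        if (a->nent >= a->maxnent)
            return NULL;

        e = &a->entries[a->nent++];
        e->function = function;
        e->index = index;
        return e;
    }

    int main(void)
    {
        struct entry_array a = { .maxnent = 4 };

        for (unsigned int i = 0; i < 10; i++) {
            if (!next_entry(&a, 7, i)) {        /* callers just NULL-check */
                fprintf(stderr, "array full at index %u\n", i);
                break;
            }
        }
        printf("filled %d of %d entries\n", a.nent, a.maxnent);
        return 0;
    }

The kernel helper additionally fills function, index and flags in the same place, which is what lets the callers drop their local bookkeeping entirely.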
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 49 +++++++++++++++----------------------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 5df29937e557..c54d31c0e62a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -287,9 +287,14 @@ static __always_inline void cpuid_mask(u32 *word, int wordnum) *word &= boot_cpu_data.x86_capability[wordnum]; } -static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index) +static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_entry2 *entry, + int *nent, int maxnent, + u32 function, u32 index) { + if (*nent >= maxnent) + return NULL; + ++*nent; + entry->function = function; entry->index = index; entry->flags = 0; @@ -316,6 +321,8 @@ static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; break; } + + return entry; } static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, @@ -507,12 +514,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, r = -E2BIG; - if (WARN_ON(*nent >= maxnent)) + if (WARN_ON(!do_host_cpuid(entry, nent, maxnent, function, 0))) goto out; - do_host_cpuid(entry, function, 0); - ++*nent; - switch (function) { case 0: /* Limited to the highest leaf implemented in KVM. */ @@ -536,11 +540,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; for (t = 1; t < times; ++t) { - if (*nent >= maxnent) + if (!do_host_cpuid(&entry[t], nent, maxnent, function, 0)) goto out; - - do_host_cpuid(&entry[t], function, 0); - ++*nent; } break; } @@ -555,10 +556,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, if (!cache_type) break; - if (*nent >= maxnent) + if (!do_host_cpuid(&entry[i], nent, maxnent, function, i)) goto out; - do_host_cpuid(&entry[i], function, i); - ++*nent; } break; } @@ -575,12 +574,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, do_cpuid_7_mask(entry); for (i = 1; i <= entry->eax; i++) { - if (*nent >= maxnent) + if (!do_host_cpuid(&entry[i], nent, maxnent, function, i)) goto out; - do_host_cpuid(&entry[i], function, i); - ++*nent; - do_cpuid_7_mask(&entry[i]); } break; @@ -633,11 +629,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, * added entry is zero. 
*/ for (i = 1; entry[i - 1].ecx & 0xff00; ++i) { - if (*nent >= maxnent) + if (!do_host_cpuid(&entry[i], nent, maxnent, function, i)) goto out; - - do_host_cpuid(&entry[i], function, i); - ++*nent; } break; } @@ -652,12 +645,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, if (!supported) break; - if (*nent >= maxnent) + if (!do_host_cpuid(&entry[1], nent, maxnent, function, 1)) goto out; - do_host_cpuid(&entry[1], function, 1); - ++*nent; - entry[1].eax &= kvm_cpuid_D_1_eax_x86_features; cpuid_mask(&entry[1].eax, CPUID_D_1_EAX); if (entry[1].eax & (F(XSAVES)|F(XSAVEC))) @@ -672,12 +662,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, if (!(supported & BIT_ULL(idx))) continue; - if (*nent >= maxnent) + if (!do_host_cpuid(&entry[i], nent, maxnent, function, idx)) goto out; - do_host_cpuid(&entry[i], function, idx); - ++*nent; - /* * The @supported check above should have filtered out * invalid sub-leafs as well as sub-leafs managed by @@ -704,10 +691,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, break; for (t = 1; t <= times; ++t) { - if (*nent >= maxnent) + if (!do_host_cpuid(&entry[t], nent, maxnent, function, t)) goto out; - do_host_cpuid(&entry[t], function, t); - ++*nent; } break; } -- Gitee From ba4616ef45ea8ac97008e347b1244d62acb7586a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:17 -0800 Subject: [PATCH 0023/2161] KVM: x86: Hoist loop counter and terminator to top of __do_cpuid_func() commit 74fa0bc7f083fbcce1ee0b9c3c2716fcb068fce4 upstream Declare "i" and "max_idx" at the top of __do_cpuid_func() to consolidate a handful of declarations in various case statements. More importantly, establish the pattern of using max_idx instead of e.g. entry->eax as the loop terminator in preparation for refactoring how entry is handled in __do_cpuid_func(). No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c54d31c0e62a..0e34887e6130 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -439,7 +439,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, int *nent, int maxnent) { - int r; + int r, i, max_idx; unsigned f_nx = is_efer_nx() ? F(NX) : 0; #ifdef CONFIG_X86_64 unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) @@ -535,20 +535,18 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, * may return different values. This forces us to get_cpu() before * issuing the first command, and also to emulate this annoying behavior * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ - case 2: { - int t, times = entry->eax & 0xff; - + case 2: entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - for (t = 1; t < times; ++t) { - if (!do_host_cpuid(&entry[t], nent, maxnent, function, 0)) + + for (i = 1, max_idx = entry->eax & 0xff; i < max_idx; ++i) { + if (!do_host_cpuid(&entry[i], nent, maxnent, function, 0)) goto out; } break; - } /* functions 4 and 0x8000001d have additional index. 
*/ case 4: case 0x8000001d: { - int i, cache_type; + int cache_type; /* read more entries until cache_type is zero */ for (i = 1; ; ++i) { @@ -568,19 +566,16 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, entry->edx = 0; break; /* function 7 has additional index. */ - case 7: { - int i; - + case 7: do_cpuid_7_mask(entry); - for (i = 1; i <= entry->eax; i++) { + for (i = 1, max_idx = entry->eax; i <= max_idx; i++) { if (!do_host_cpuid(&entry[i], nent, maxnent, function, i)) goto out; do_cpuid_7_mask(&entry[i]); } break; - } case 9: break; case 0xa: { /* Architectural Performance Monitoring */ @@ -617,9 +612,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, * thus they can be handled by common code. */ case 0x1f: - case 0xb: { - int i; - + case 0xb: /* * We filled in entry[0] for CPUID(EAX=, * ECX=00H) above. If its level type (ECX[15:8]) is @@ -633,9 +626,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, goto out; } break; - } case 0xd: { - int idx, i; + int idx; u64 supported = kvm_supported_xcr0(); entry->eax &= supported; @@ -684,18 +676,15 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, break; } /* Intel PT */ - case 0x14: { - int t, times = entry->eax; - + case 0x14: if (!f_intel_pt) break; - for (t = 1; t <= times; ++t) { - if (!do_host_cpuid(&entry[t], nent, maxnent, function, t)) + for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) { + if (!do_host_cpuid(&entry[i], nent, maxnent, function, i)) goto out; } break; - } case KVM_CPUID_SIGNATURE: { static const char signature[12] = "KVMKVMKVM\0\0"; const u32 *sigptr = (const u32 *)signature; -- Gitee From a65df670e329052e13f6a5132f1af12f50336b1c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:18 -0800 Subject: [PATCH 0024/2161] KVM: x86: Refactor CPUID 0x4 and 0x8000001d handling commit c862903963bbbf8dfcbb46204584f2330bb8df42 upstream Refactoring the sub-leaf handling for CPUID 0x4/0x8000001d to eliminate a one-off variable and its associated brackets. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0e34887e6130..6ccb98838f31 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -545,20 +545,16 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, break; /* functions 4 and 0x8000001d have additional index. */ case 4: - case 0x8000001d: { - int cache_type; - - /* read more entries until cache_type is zero */ - for (i = 1; ; ++i) { - cache_type = entry[i - 1].eax & 0x1f; - if (!cache_type) - break; - + case 0x8000001d: + /* + * Read entries until the cache type in the previous entry is + * zero, i.e. indicates an invalid entry. 
+ */ + for (i = 1; entry[i - 1].eax & 0x1f; ++i) { if (!do_host_cpuid(&entry[i], nent, maxnent, function, i)) goto out; } break; - } case 6: /* Thermal management */ entry->eax = 0x4; /* allow ARAT */ entry->ebx = 0; -- Gitee From 69d32797a42ba9562f98e1427e3c024705044a65 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 17 May 2021 21:12:51 +0800 Subject: [PATCH 0025/2161] KVM: x86: Encapsulate CPUID entries and metadata in struct commit e53c95e8d41ef9927d1d5ff031db1fd6989ec16b upstream Add a struct to hold the array of CPUID entries and its associated metadata when handling KVM_GET_SUPPORTED_CPUID. Lookup and provide the correct entry in do_host_cpuid(), which eliminates the majority of array indexing shenanigans, e.g. entries[i -1], and generally makes the code more readable. The last array indexing holdout is kvm_get_cpuid(), which can't really be avoided without throwing the baby out with the bathwater. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 138 ++++++++++++++++++++++++------------------- 1 file changed, 76 insertions(+), 62 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6ccb98838f31..8bedb9ec4302 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -287,13 +287,21 @@ static __always_inline void cpuid_mask(u32 *word, int wordnum) *word &= boot_cpu_data.x86_capability[wordnum]; } -static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_entry2 *entry, - int *nent, int maxnent, +struct kvm_cpuid_array { + struct kvm_cpuid_entry2 *entries; + const int maxnent; + int nent; +}; + +static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, u32 function, u32 index) { - if (*nent >= maxnent) + struct kvm_cpuid_entry2 *entry; + + if (array->nent >= array->maxnent) return NULL; - ++*nent; + + entry = &array->entries[array->nent++]; entry->function = function; entry->index = index; @@ -325,9 +333,10 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_entry2 *entry, return entry; } -static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, - u32 func, int *nent, int maxnent) +static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) { + struct kvm_cpuid_entry2 *entry = &array->entries[array->nent]; + entry->function = func; entry->index = 0; entry->flags = 0; @@ -335,17 +344,17 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, switch (func) { case 0: entry->eax = 7; - ++*nent; + ++array->nent; break; case 1: entry->ecx = F(MOVBE); - ++*nent; + ++array->nent; break; case 7: entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; entry->eax = 0; entry->ecx = F(RDPID); - ++*nent; + ++array->nent; default: break; } @@ -436,9 +445,9 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) } } -static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, - int *nent, int maxnent) +static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) { + struct kvm_cpuid_entry2 *entry; int r, i, max_idx; unsigned f_nx = is_efer_nx() ? 
F(NX) : 0; #ifdef CONFIG_X86_64 @@ -514,7 +523,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, r = -E2BIG; - if (WARN_ON(!do_host_cpuid(entry, nent, maxnent, function, 0))) + entry = do_host_cpuid(array, function, 0); + if (WARN_ON(!entry)) goto out; switch (function) { @@ -539,7 +549,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; for (i = 1, max_idx = entry->eax & 0xff; i < max_idx; ++i) { - if (!do_host_cpuid(&entry[i], nent, maxnent, function, 0)) + entry = do_host_cpuid(array, function, 0); + if (!entry) goto out; } break; @@ -550,8 +561,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, * Read entries until the cache type in the previous entry is * zero, i.e. indicates an invalid entry. */ - for (i = 1; entry[i - 1].eax & 0x1f; ++i) { - if (!do_host_cpuid(&entry[i], nent, maxnent, function, i)) + for (i = 1; entry->eax & 0x1f; ++i) { + entry = do_host_cpuid(array, function, i); + if (!entry) goto out; } break; @@ -566,10 +578,11 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, do_cpuid_7_mask(entry); for (i = 1, max_idx = entry->eax; i <= max_idx; i++) { - if (!do_host_cpuid(&entry[i], nent, maxnent, function, i)) + entry = do_host_cpuid(array, function, i); + if (!entry) goto out; - do_cpuid_7_mask(&entry[i]); + do_cpuid_7_mask(entry); } break; case 9: @@ -610,15 +623,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, case 0x1f: case 0xb: /* - * We filled in entry[0] for CPUID(EAX=, - * ECX=00H) above. If its level type (ECX[15:8]) is - * zero, then the leaf is unimplemented, and we're - * done. Otherwise, continue to populate entries - * until the level type (ECX[15:8]) of the previously - * added entry is zero. + * Populate entries until the level type (ECX[15:8]) of the + * previous entry is zero. Note, CPUID EAX.{0x1f,0xb}.0 is + * the starting entry, filled by the primary do_host_cpuid(). */ - for (i = 1; entry[i - 1].ecx & 0xff00; ++i) { - if (!do_host_cpuid(&entry[i], nent, maxnent, function, i)) + for (i = 1; entry->ecx & 0xff00; ++i) { + entry = do_host_cpuid(array, function, i); + if (!entry) goto out; } break; @@ -633,24 +644,26 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, if (!supported) break; - if (!do_host_cpuid(&entry[1], nent, maxnent, function, 1)) + entry = do_host_cpuid(array, function, 1); + if (!entry) goto out; - entry[1].eax &= kvm_cpuid_D_1_eax_x86_features; - cpuid_mask(&entry[1].eax, CPUID_D_1_EAX); - if (entry[1].eax & (F(XSAVES)|F(XSAVEC))) - entry[1].ebx = xstate_required_size(supported, true); + entry->eax &= kvm_cpuid_D_1_eax_x86_features; + cpuid_mask(&entry->eax, CPUID_D_1_EAX); + if (entry->eax & (F(XSAVES)|F(XSAVEC))) + entry->ebx = xstate_required_size(supported, true); else - entry[1].ebx = 0; + entry->ebx = 0; /* Saving XSS controlled state via XSAVES isn't supported. */ - entry[1].ecx = 0; - entry[1].edx = 0; + entry->ecx = 0; + entry->edx = 0; - for (idx = 2, i = 2; idx < 64; ++idx) { + for (idx = 2; idx < 64; ++idx) { if (!(supported & BIT_ULL(idx))) continue; - if (!do_host_cpuid(&entry[i], nent, maxnent, function, idx)) + entry = do_host_cpuid(array, function, idx); + if (!entry) goto out; /* @@ -660,14 +673,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, * reach this point, and they should have a non-zero * save state size. 
*/ - if (WARN_ON_ONCE(!entry[i].eax || (entry[i].ecx & 1))) { - --*nent; + if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 1))) { + --array->nent; continue; } - entry[i].ecx = 0; - entry[i].edx = 0; - ++i; + entry->ecx = 0; + entry->edx = 0; } break; } @@ -677,7 +689,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, break; for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) { - if (!do_host_cpuid(&entry[i], nent, maxnent, function, i)) + if (!do_host_cpuid(array, function, i)) goto out; } break; @@ -797,24 +809,24 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, return r; } -static int do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 func, - int *nent, int maxnent, unsigned int type) +static int do_cpuid_func(struct kvm_cpuid_array *array, u32 func, + unsigned int type) { - if (*nent >= maxnent) + if (array->nent >= array->maxnent) return -E2BIG; if (type == KVM_GET_EMULATED_CPUID) - return __do_cpuid_func_emulated(entry, func, nent, maxnent); + return __do_cpuid_func_emulated(array, func); - return __do_cpuid_func(entry, func, nent, maxnent); + return __do_cpuid_func(array, func); } #undef F #define CENTAUR_CPUID_SIGNATURE 0xC0000000 -static int get_cpuid_func(struct kvm_cpuid_entry2 *entries, u32 func, - int *nent, int maxnent, unsigned int type) +static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func, + unsigned int type) { u32 limit; int r; @@ -823,16 +835,16 @@ static int get_cpuid_func(struct kvm_cpuid_entry2 *entries, u32 func, boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR) return 0; - r = do_cpuid_func(&entries[*nent], func, nent, maxnent, type); + r = do_cpuid_func(array, func, type); if (r) return r; - limit = entries[*nent - 1].eax; + limit = array->entries[array->nent - 1].eax; for (func = func + 1; func <= limit; ++func) { - if (*nent >= maxnent) + if (array->nent >= array->maxnent) return -E2BIG; - r = do_cpuid_func(&entries[*nent], func, nent, maxnent, type); + r = do_cpuid_func(array, func, type); if (r) break; } @@ -875,8 +887,11 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, 0, 0x80000000, CENTAUR_CPUID_SIGNATURE, KVM_CPUID_SIGNATURE, }; - struct kvm_cpuid_entry2 *cpuid_entries; - int nent = 0, r, i; + struct kvm_cpuid_array array = { + .nent = 0, + .maxnent = cpuid->nent, + }; + int r, i; if (cpuid->nent < 1) return -E2BIG; @@ -886,25 +901,24 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, if (sanity_check_entries(entries, cpuid->nent, type)) return -EINVAL; - cpuid_entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2), + array.entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2), cpuid->nent)); - if (!cpuid_entries) + if (!array.entries) return -ENOMEM; for (i = 0; i < ARRAY_SIZE(funcs); i++) { - r = get_cpuid_func(cpuid_entries, funcs[i], &nent, cpuid->nent, - type); + r = get_cpuid_func(&array, funcs[i], type); if (r) goto out_free; } - cpuid->nent = nent; + cpuid->nent = array.nent; - if (copy_to_user(entries, cpuid_entries, - nent * sizeof(struct kvm_cpuid_entry2))) + if (copy_to_user(entries, array.entries, + array.nent * sizeof(struct kvm_cpuid_entry2))) r = -EFAULT; out_free: - vfree(cpuid_entries); + vfree(array.entries); return r; } -- Gitee From 58784d7d15986ee37989a489e09b3e496084d2d3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:20 -0800 Subject: [PATCH 0026/2161] KVM: x86: Drop redundant array size check commit 695538aa21c057578a0c70e1aa22fd97fd407930 upstream Drop a "nent >= maxnent" check in kvm_get_cpuid() 
that's fully redundant now that kvm_get_cpuid() isn't indexing the array to pass an entry to do_cpuid_func(). Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8bedb9ec4302..00362815cd81 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -841,9 +841,6 @@ static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func, limit = array->entries[array->nent - 1].eax; for (func = func + 1; func <= limit; ++func) { - if (array->nent >= array->maxnent) - return -E2BIG; - r = do_cpuid_func(array, func, type); if (r) break; -- Gitee From f599ad4cfd433b9f677dbbe7a14f56a4f0554839 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 18 May 2021 16:29:09 +0800 Subject: [PATCH 0027/2161] KVM: x86: Clear output regs for CPUID 0x14 if PT isn't exposed to guest commit 7392079c4e740572a5af72a7223ef5e162e442c9 upstream Clear the output regs for the main CPUID 0x14 leaf (index=0) if Intel PT isn't exposed to the guest. Leaf 0x14 enumerates Intel PT capabilities and should return zeroes if PT is not supported. Incorrectly reporting PT capabilities is essentially a cosmetic error, i.e. doesn't negatively affect any known userspace/kernel, as the existence of PT itself is correctly enumerated via CPUID 0x7. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 00362815cd81..2dfa8c4333d0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -685,8 +685,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } /* Intel PT */ case 0x14: - if (!f_intel_pt) + if (!f_intel_pt) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; + } for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) { if (!do_host_cpuid(array, function, i)) -- Gitee From bcbdf202f608d97e6b4d4b8ca0d8a118dfe5264d Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 20 May 2021 16:19:46 +0800 Subject: [PATCH 0028/2161] KVM: x86: Drop explicit @func param from ->set_supported_cpuid() commit 160b486f65ff89be7f90ff9297bb4bb0da446d91 upstream Drop the explicit @func param from ->set_supported_cpuid() and instead pull the CPUID function from the relevant entry. This sets the stage for hardening guest CPUID updates in future patches, e.g. allows adding run-time assertions that the CPUID feature being changed is actually a bit in the referenced CPUID entry. No functional change intended. 
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b7e643ee685b..b88321b66daa 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1117,7 +1117,7 @@ struct kvm_x86_ops { void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); - void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); + void (*set_supported_cpuid)(struct kvm_cpuid_entry2 *entry); bool (*has_wbinvd_exit)(void); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2dfa8c4333d0..5e399a577e2a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -801,7 +801,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; } - kvm_x86_ops->set_supported_cpuid(function, entry); + kvm_x86_ops->set_supported_cpuid(entry); r = 0; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 191ed75b7457..53357152809e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5973,9 +5973,9 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) #define F feature_bit -static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +static void svm_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) { - switch (func) { + switch (entry->function) { case 0x1: if (avic) entry->ecx &= ~F(X2APIC); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0f1550b7f631..b21281fdbb8f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7098,9 +7098,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) update_intel_pt_cfg(vcpu); } -static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +static void vmx_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) { - if (func == 1 && nested) + if (entry->function == 1 && nested) entry->ecx |= feature_bit(VMX); } -- Gitee From 256b09e94ffa3a7a0dc6f590f81d793f71211f6e Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 18 May 2021 17:15:23 +0800 Subject: [PATCH 0029/2161] KVM: x86: Use u32 for holding CPUID register value in helpers commit 3be5a60b454a19979963a246a6286255ce965fcf upstream Change the intermediate CPUID output register values from "int" to "u32" to match both hardware and the storage type in struct cpuid_reg. No functional change intended. 
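The signature change itself is small enough to show in isolation; in this sketch the hook name, entry layout and the X2APIC bit manipulation are simplified placeholders for the real vmx/svm hooks:

    #include <stdio.h>

    struct cpuid_entry { unsigned int function, eax, ebx, ecx, edx; };

    /* Old shape: void (*hook)(unsigned int func, struct cpuid_entry *e);
     * New shape: the function number travels with the entry itself. */
    static void set_supported(struct cpuid_entry *e)
    {
        switch (e->function) {
        case 0x1:
            e->ecx &= ~(1u << 21);      /* e.g. hide X2APIC (CPUID.1:ECX bit 21) */
            break;
        default:
            break;
        }
    }

    int main(void)
    {
        struct cpuid_entry e = { .function = 0x1, .ecx = ~0u };

        set_supported(&e);
        printf("leaf 0x%x ecx = 0x%08x\n", e.function, e.ecx);
        return 0;
    }

With the function number coming from the entry, a later assertion can verify that a feature bit being cleared really belongs to that entry's leaf and register.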
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 072a7a1e7624..08e038997a9b 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -96,7 +96,7 @@ static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) return reverse_cpuid[x86_leaf]; } -static __always_inline int *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsigned x86_feature) +static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsigned x86_feature) { struct kvm_cpuid_entry2 *entry; const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); @@ -122,7 +122,7 @@ static __always_inline int *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsi static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_feature) { - int *reg; + u32 *reg; if (x86_feature == X86_FEATURE_XSAVE && !static_cpu_has(X86_FEATURE_XSAVE)) @@ -137,7 +137,7 @@ static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_ static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x86_feature) { - int *reg; + u32 *reg; reg = guest_cpuid_get_register(vcpu, x86_feature); if (reg) -- Gitee From d43549bcaf75813fcfe391fc7e41402819077a88 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:29 -0800 Subject: [PATCH 0030/2161] KVM: x86: Replace bare "unsigned" with "unsigned int" in cpuid helpers commit 5e12b2bb34e9fb81d6942f9d300e6d259cc15431 upstream Replace "unsigned" with "unsigned int" to make checkpatch and people everywhere a little bit happier, and to avoid propagating the filth when future patches add more cpuid helpers that work with unsigned (ints). No functional change intended. Suggested-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 08e038997a9b..388d97a354a4 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -64,7 +64,7 @@ static const struct cpuid_reg reverse_cpuid[] = { * and can't be used by KVM to query/control guest capabilities. And obviously * the leaf being queried must have an entry in the lookup table. 
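The type change is cosmetic in the kernel, but the unsigned width is the natural choice for a register word: once bit 31 is set, a signed int and a u32 widen differently. A small, generic demonstration (not KVM code; the signed conversion is implementation-defined but wraps on two's-complement targets):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t reg = 0x80000000u;     /* a CPUID output word with bit 31 set */
        int as_int = (int)reg;          /* implementation-defined wrap,
                                           two's complement in practice */

        /* Widening to 64 bits sign-extends the int, not the u32. */
        printf("as u32: 0x%016llx\n", (unsigned long long)(uint64_t)reg);
        printf("as int: 0x%016llx\n", (unsigned long long)(uint64_t)as_int);
        return 0;
    }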
*/ -static __always_inline void reverse_cpuid_check(unsigned x86_leaf) +static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) { BUILD_BUG_ON(x86_leaf == CPUID_LNX_1); BUILD_BUG_ON(x86_leaf == CPUID_LNX_2); @@ -88,15 +88,16 @@ static __always_inline u32 __feature_bit(int x86_feature) #define feature_bit(name) __feature_bit(X86_FEATURE_##name) -static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) +static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature) { - unsigned x86_leaf = x86_feature / 32; + unsigned int x86_leaf = x86_feature / 32; reverse_cpuid_check(x86_leaf); return reverse_cpuid[x86_leaf]; } -static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsigned x86_feature) +static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, + unsigned int x86_feature) { struct kvm_cpuid_entry2 *entry; const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); @@ -120,7 +121,8 @@ static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsi } } -static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_feature) +static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, + unsigned int x86_feature) { u32 *reg; @@ -135,7 +137,8 @@ static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_ return *reg & __feature_bit(x86_feature); } -static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x86_feature) +static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, + unsigned int x86_feature) { u32 *reg; -- Gitee From 7a4f9dcfb2e1f0c92b0999314668c8c93ccff224 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:30 -0800 Subject: [PATCH 0031/2161] KVM: x86: Introduce cpuid_entry_{get,has}() accessors commit 4c61534aaae2a170bf0abd72676624fd3fabc655 upstream Introduce accessors to retrieve feature bits from CPUID entries and use the new accessors where applicable. Using the accessors eliminates the need to manually specify the register to be queried at no extra cost (binary output is identical) and will allow adding runtime consistency checks on the function and index in a future patch. No functional change intended. 
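The shape of these accessors can be reproduced outside the kernel. A minimal standalone sketch, with simplified types; entry_has(), FEAT_DEMO and the single-row lookup table are invented for the example and are not the kernel's definitions:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

enum reg { R_EAX, R_EBX, R_ECX, R_EDX };

/* Simplified stand-ins for kvm_cpuid_entry2 and cpuid_reg. */
struct entry     { uint32_t function, index, eax, ebx, ecx, edx; };
struct cpuid_reg { uint32_t function, index; enum reg reg; };

/* Feature numbers encode "32 * word + bit", mirroring X86_FEATURE_*. */
#define FEAT(word, bit)	((word) * 32 + (bit))
#define FEAT_DEMO	FEAT(0, 5)	/* hypothetical feature: word 0, bit 5 */

/* One table row per feature word (here: just word 0 -> leaf 1, ECX). */
static const struct cpuid_reg reverse_cpuid[] = {
	[0] = { .function = 1, .index = 0, .reg = R_ECX },
};

static uint32_t *entry_get_reg(struct entry *e, const struct cpuid_reg *r)
{
	switch (r->reg) {
	case R_EAX: return &e->eax;
	case R_EBX: return &e->ebx;
	case R_ECX: return &e->ecx;
	default:    return &e->edx;
	}
}

static bool entry_has(struct entry *e, unsigned int feature)
{
	const struct cpuid_reg *r = &reverse_cpuid[feature / 32];
	uint32_t *reg = entry_get_reg(e, r);

	return *reg & (UINT32_C(1) << (feature % 32));
}

int main(void)
{
	struct entry e = { .function = 1, .ecx = UINT32_C(1) << 5 };

	printf("has demo feature: %d\n", entry_has(&e, FEAT_DEMO));
	return 0;
}

The point of the indirection is that callers name a feature once and the lookup table supplies the (function, index, register) triple, which is what later allows consistency checks to be added in a single place.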
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 9 +++++---- arch/x86/kvm/cpuid.h | 48 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 43 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 5e399a577e2a..3016333f06cb 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -85,7 +85,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) best->edx |= F(APIC); if (apic) { - if (best->ecx & F(TSC_DEADLINE_TIMER)) + if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; else apic->lapic_timer.timer_mode_mask = 1 << 17; @@ -114,7 +114,8 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) } best = kvm_find_cpuid_entry(vcpu, 0xD, 1); - if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) + if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || + cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); /* @@ -173,7 +174,7 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) break; } } - if (entry && (entry->edx & F(NX)) && !is_efer_nx()) { + if (entry && cpuid_entry_has(entry, X86_FEATURE_NX) && !is_efer_nx()) { entry->edx &= ~F(NX); printk(KERN_INFO "kvm: guest NX capability removed\n"); } @@ -405,7 +406,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) entry->ebx |= F(TSC_ADJUST); entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; - f_la57 = entry->ecx & F(LA57); + f_la57 = cpuid_entry_get(entry, X86_FEATURE_LA57); cpuid_mask(&entry->ecx, CPUID_7_ECX); /* Set LA57 based on hardware capability. */ entry->ecx |= f_la57; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 388d97a354a4..03d7eb4efd1b 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -96,17 +96,10 @@ static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_featu return reverse_cpuid[x86_leaf]; } -static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, - unsigned int x86_feature) +static __always_inline u32 *__cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, + const struct cpuid_reg *cpuid) { - struct kvm_cpuid_entry2 *entry; - const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); - - entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index); - if (!entry) - return NULL; - - switch (cpuid.reg) { + switch (cpuid->reg) { case CPUID_EAX: return &entry->eax; case CPUID_EBX: @@ -121,6 +114,41 @@ static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, } } +static __always_inline u32 *cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); + + return __cpuid_entry_get_reg(entry, &cpuid); +} + +static __always_inline u32 cpuid_entry_get(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + u32 *reg = cpuid_entry_get_reg(entry, x86_feature); + + return *reg & __feature_bit(x86_feature); +} + +static __always_inline bool cpuid_entry_has(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + return cpuid_entry_get(entry, x86_feature); +} + +static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, + unsigned int x86_feature) +{ + const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); + struct kvm_cpuid_entry2 *entry; + + entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index); + if (!entry) + return NULL; + + return __cpuid_entry_get_reg(entry, 
&cpuid); +} + static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned int x86_feature) { -- Gitee From 51717c26de6cf948d6a08c2f315edffd516b5ad6 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 21 May 2021 11:09:49 +0800 Subject: [PATCH 0032/2161] KVM: x86: Introduce cpuid_entry_{change,set,clear}() mutators commit b32666b13a72a1fd9f5078d8142bd7325022520f upstream Introduce mutators to modify feature bits in CPUID entries and use the new mutators where applicable. Using the mutators eliminates the need to manually specify the register to modify query at no extra cost and will allow adding runtime consistency checks on the function/index in a future patch. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 62 +++++++++++++++++++------------------------- arch/x86/kvm/cpuid.h | 32 +++++++++++++++++++++++ arch/x86/kvm/svm.c | 13 ++++------ 3 files changed, 63 insertions(+), 44 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 3016333f06cb..ca23b29bb729 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -74,15 +74,12 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) return 0; /* Update OSXSAVE bit */ - if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1) { - best->ecx &= ~F(OSXSAVE); - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) - best->ecx |= F(OSXSAVE); - } + if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1) + cpuid_entry_change(best, X86_FEATURE_OSXSAVE, + kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)); - best->edx &= ~F(APIC); - if (vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE) - best->edx |= F(APIC); + cpuid_entry_change(best, X86_FEATURE_APIC, + vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); if (apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) @@ -92,14 +89,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) } best = kvm_find_cpuid_entry(vcpu, 7, 0); - if (best) { - /* Update OSPKE bit */ - if (boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) { - best->ecx &= ~F(OSPKE); - if (kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) - best->ecx |= F(OSPKE); - } - } + if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) + cpuid_entry_change(best, X86_FEATURE_OSPKE, + kvm_read_cr4_bits(vcpu, X86_CR4_PKE)); best = kvm_find_cpuid_entry(vcpu, 0xD, 0); if (!best) { @@ -137,12 +129,10 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { best = kvm_find_cpuid_entry(vcpu, 0x1, 0); - if (best) { - if (vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT) - best->ecx |= F(MWAIT); - else - best->ecx &= ~F(MWAIT); - } + if (best) + cpuid_entry_change(best, X86_FEATURE_MWAIT, + vcpu->arch.ia32_misc_enable_msr & + MSR_IA32_MISC_ENABLE_MWAIT); } /* Update physical-address width */ @@ -175,7 +165,7 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) } } if (entry && cpuid_entry_has(entry, X86_FEATURE_NX) && !is_efer_nx()) { - entry->edx &= ~F(NX); + cpuid_entry_clear(entry, X86_FEATURE_NX); printk(KERN_INFO "kvm: guest NX capability removed\n"); } } @@ -403,7 +393,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; cpuid_mask(&entry->ebx, CPUID_7_0_EBX); /* TSC_ADJUST is emulated */ - entry->ebx |= F(TSC_ADJUST); + cpuid_entry_set(entry, X86_FEATURE_TSC_ADJUST); entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; f_la57 = cpuid_entry_get(entry, 
X86_FEATURE_LA57); @@ -414,21 +404,21 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) entry->ecx |= f_pku; /* PKU is not yet implemented for shadow paging. */ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) - entry->ecx &= ~F(PKU); + cpuid_entry_clear(entry, X86_FEATURE_PKU); entry->edx &= kvm_cpuid_7_0_edx_x86_features; cpuid_mask(&entry->edx, CPUID_7_EDX); if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS)) - entry->edx |= F(SPEC_CTRL); + cpuid_entry_set(entry, X86_FEATURE_SPEC_CTRL); if (boot_cpu_has(X86_FEATURE_STIBP)) - entry->edx |= F(INTEL_STIBP); + cpuid_entry_set(entry, X86_FEATURE_INTEL_STIBP); if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) - entry->edx |= F(SPEC_CTRL_SSBD); + cpuid_entry_set(entry, X86_FEATURE_SPEC_CTRL_SSBD); /* * We emulate ARCH_CAPABILITIES in software even * if the host doesn't support it. */ - entry->edx |= F(ARCH_CAPABILITIES); + cpuid_entry_set(entry, X86_FEATURE_ARCH_CAPABILITIES); break; case 1: entry->eax &= kvm_cpuid_7_1_eax_x86_features; @@ -540,7 +530,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) cpuid_mask(&entry->ecx, CPUID_1_ECX); /* we support x2apic emulation even if host does not support * it since we emulate x2apic in software */ - entry->ecx |= F(X2APIC); + cpuid_entry_set(entry, X86_FEATURE_X2APIC); break; /* function 2 entries are STATEFUL. That is, repeated cpuid commands * may return different values. This forces us to get_cpu() before @@ -759,22 +749,22 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * record that in cpufeatures so use them. */ if (boot_cpu_has(X86_FEATURE_IBPB)) - entry->ebx |= F(AMD_IBPB); + cpuid_entry_set(entry, X86_FEATURE_AMD_IBPB); if (boot_cpu_has(X86_FEATURE_IBRS)) - entry->ebx |= F(AMD_IBRS); + cpuid_entry_set(entry, X86_FEATURE_AMD_IBRS); if (boot_cpu_has(X86_FEATURE_STIBP)) - entry->ebx |= F(AMD_STIBP); + cpuid_entry_set(entry, X86_FEATURE_AMD_STIBP); if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD)) - entry->ebx |= F(AMD_SSBD); + cpuid_entry_set(entry, X86_FEATURE_AMD_SSBD); if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - entry->ebx |= F(AMD_SSB_NO); + cpuid_entry_set(entry, X86_FEATURE_AMD_SSB_NO); /* * The preference is to use SPEC CTRL MSR instead of the * VIRT_SPEC MSR. */ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) && !boot_cpu_has(X86_FEATURE_AMD_SSBD)) - entry->ebx |= F(VIRT_SSBD); + cpuid_entry_set(entry, X86_FEATURE_VIRT_SSBD); break; } case 0x80000019: diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 03d7eb4efd1b..cbc04d78461b 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -136,6 +136,38 @@ static __always_inline bool cpuid_entry_has(struct kvm_cpuid_entry2 *entry, return cpuid_entry_get(entry, x86_feature); } +static __always_inline void cpuid_entry_clear(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + u32 *reg = cpuid_entry_get_reg(entry, x86_feature); + + *reg &= ~__feature_bit(x86_feature); +} + +static __always_inline void cpuid_entry_set(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + u32 *reg = cpuid_entry_get_reg(entry, x86_feature); + + *reg |= __feature_bit(x86_feature); +} + +static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature, + bool set) +{ + u32 *reg = cpuid_entry_get_reg(entry, x86_feature); + + /* + * Open coded instead of using cpuid_entry_{clear,set}() to coerce the + * compiler into using CMOV instead of Jcc when possible. 
+ */ + if (set) + *reg |= __feature_bit(x86_feature); + else + *reg &= ~__feature_bit(x86_feature); +} + static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsigned int x86_feature) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 53357152809e..a8c9c5d9a919 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5971,23 +5971,21 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC); } -#define F feature_bit - static void svm_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) { switch (entry->function) { case 0x1: if (avic) - entry->ecx &= ~F(X2APIC); + cpuid_entry_clear(entry, X86_FEATURE_X2APIC); break; case 0x80000001: if (nested) - entry->ecx |= (1 << 2); /* Set SVM bit */ + cpuid_entry_set(entry, X86_FEATURE_SVM); break; case 0x80000008: if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) - entry->ebx |= F(VIRT_SSBD); + cpuid_entry_set(entry, X86_FEATURE_VIRT_SSBD); break; case 0x8000000A: entry->eax = 1; /* SVM revision 1 */ @@ -5999,12 +5997,11 @@ static void svm_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) /* Support next_rip if host supports it */ if (boot_cpu_has(X86_FEATURE_NRIPS)) - entry->edx |= F(NRIPS); + cpuid_entry_set(entry, X86_FEATURE_NRIPS); /* Support NPT for the guest if enabled */ if (npt_enabled) - entry->edx |= F(NPT); - + cpuid_entry_set(entry, X86_FEATURE_NPT); break; case 0x8000001F: /* Support memory encryption cpuid if host supports it */ -- Gitee From 62d2e40e428777821723f0a271af157e82dbe90b Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 19 May 2021 17:53:34 +0800 Subject: [PATCH 0033/2161] KVM: x86: Refactor cpuid_mask() to auto-retrieve the register commit e745e37d49771b8a06955f18c7a435830c0a4a5c upstream Use the recently introduced cpuid_entry_get_reg() to automatically get the appropriate register when masking a CPUID entry. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 26 ++++++++++---------------- arch/x86/kvm/cpuid.h | 8 ++++++++ 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ca23b29bb729..aaed6a62fea0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -272,12 +272,6 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, return r; } -static __always_inline void cpuid_mask(u32 *word, int wordnum) -{ - reverse_cpuid_check(wordnum); - *word &= boot_cpu_data.x86_capability[wordnum]; -} - struct kvm_cpuid_array { struct kvm_cpuid_entry2 *entries; const int maxnent; @@ -391,13 +385,13 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) case 0: entry->eax = min(entry->eax, 1u); entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; - cpuid_mask(&entry->ebx, CPUID_7_0_EBX); + cpuid_entry_mask(entry, CPUID_7_0_EBX); /* TSC_ADJUST is emulated */ cpuid_entry_set(entry, X86_FEATURE_TSC_ADJUST); entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; f_la57 = cpuid_entry_get(entry, X86_FEATURE_LA57); - cpuid_mask(&entry->ecx, CPUID_7_ECX); + cpuid_entry_mask(entry, CPUID_7_ECX); /* Set LA57 based on hardware capability. 
*/ entry->ecx |= f_la57; entry->ecx |= f_umip; @@ -407,7 +401,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) cpuid_entry_clear(entry, X86_FEATURE_PKU); entry->edx &= kvm_cpuid_7_0_edx_x86_features; - cpuid_mask(&entry->edx, CPUID_7_EDX); + cpuid_entry_mask(entry, CPUID_7_EDX); if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS)) cpuid_entry_set(entry, X86_FEATURE_SPEC_CTRL); if (boot_cpu_has(X86_FEATURE_STIBP)) @@ -525,9 +519,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; case 1: entry->edx &= kvm_cpuid_1_edx_x86_features; - cpuid_mask(&entry->edx, CPUID_1_EDX); + cpuid_entry_mask(entry, CPUID_1_EDX); entry->ecx &= kvm_cpuid_1_ecx_x86_features; - cpuid_mask(&entry->ecx, CPUID_1_ECX); + cpuid_entry_mask(entry, CPUID_1_ECX); /* we support x2apic emulation even if host does not support * it since we emulate x2apic in software */ cpuid_entry_set(entry, X86_FEATURE_X2APIC); @@ -640,7 +634,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) goto out; entry->eax &= kvm_cpuid_D_1_eax_x86_features; - cpuid_mask(&entry->eax, CPUID_D_1_EAX); + cpuid_entry_mask(entry, CPUID_D_1_EAX); if (entry->eax & (F(XSAVES)|F(XSAVEC))) entry->ebx = xstate_required_size(supported, true); else @@ -721,9 +715,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; case 0x80000001: entry->edx &= kvm_cpuid_8000_0001_edx_x86_features; - cpuid_mask(&entry->edx, CPUID_8000_0001_EDX); + cpuid_entry_mask(entry, CPUID_8000_0001_EDX); entry->ecx &= kvm_cpuid_8000_0001_ecx_x86_features; - cpuid_mask(&entry->ecx, CPUID_8000_0001_ECX); + cpuid_entry_mask(entry, CPUID_8000_0001_ECX); break; case 0x80000007: /* Advanced power management */ /* invariant TSC is CPUID.80000007H:EDX[8] */ @@ -742,7 +736,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = g_phys_as | (virt_as << 8); entry->edx = 0; entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; - cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); + cpuid_entry_mask(entry, CPUID_8000_0008_EBX); /* * AMD has separate bits for each SPEC_CTRL bit. 
* arch/x86/kernel/cpu/bugs.c is kind enough to @@ -780,7 +774,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; case 0xC0000001: entry->edx &= kvm_cpuid_C000_0001_edx_x86_features; - cpuid_mask(&entry->edx, CPUID_C000_0001_EDX); + cpuid_entry_mask(entry, CPUID_C000_0001_EDX); break; case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index cbc04d78461b..b688ecc9d1fb 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -168,6 +168,14 @@ static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry, *reg &= ~__feature_bit(x86_feature); } +static __always_inline void cpuid_entry_mask(struct kvm_cpuid_entry2 *entry, + enum cpuid_leafs leaf) +{ + u32 *reg = cpuid_entry_get_reg(entry, leaf * 32); + + *reg &= boot_cpu_data.x86_capability[leaf]; +} + static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsigned int x86_feature) { -- Gitee From e02d2d9aff9a61d913c8aa7c42f720be2c893d1f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:33 -0800 Subject: [PATCH 0034/2161] KVM: x86: Handle MPX CPUID adjustment in VMX code commit 6c7ea4b56bfe7a11ecbaef9c521a7974fc1171cf upstream Move the MPX CPUID adjustments into VMX to eliminate an instance of the undesirable "unsigned f_* = *_supported ? F(*) : 0" pattern in the common CPUID handling code. Note, to maintain existing behavior, VMX must manually check for kernel support for MPX by querying boot_cpu_has(X86_FEATURE_MPX). Previously, do_cpuid_7_mask() masked MPX based on boot_cpu_data by invoking cpuid_mask() on the associated cpufeatures word, but cpuid_mask() runs prior to executing vmx_set_supported_cpuid(). No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 3 +-- arch/x86/kvm/vmx/vmx.c | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index aaed6a62fea0..7ac921f71bf4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -350,7 +350,6 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) { unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; - unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? 
F(INTEL_PT) : 0; unsigned f_la57; @@ -359,7 +358,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) /* cpuid 7.0.ebx */ const u32 kvm_cpuid_7_0_ebx_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | + F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | 0 /*MPX*/ | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b21281fdbb8f..6df0f54661e6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7100,8 +7100,18 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) static void vmx_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) { - if (entry->function == 1 && nested) - entry->ecx |= feature_bit(VMX); + switch (entry->function) { + case 0x1: + if (nested) + cpuid_entry_set(entry, X86_FEATURE_VMX); + break; + case 0x7: + if (boot_cpu_has(X86_FEATURE_MPX) && kvm_mpx_supported()) + cpuid_entry_set(entry, X86_FEATURE_MPX); + break; + default: + break; + } } static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) -- Gitee From 882e638d021a6e0558ee51d08320e4da8a066566 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:34 -0800 Subject: [PATCH 0035/2161] KVM: x86: Handle INVPCID CPUID adjustment in VMX code commit 5ffec6f910dc8998c9da9320550ffddebe2e7afc upstream Move the INVPCID CPUID adjustments into VMX to eliminate an instance of the undesirable "unsigned f_* = *_supported ? F(*) : 0" pattern in the common CPUID handling code. Drop ->invpcid_supported(), CPUID adjustment was the only user. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/cpuid.c | 3 +-- arch/x86/kvm/svm.c | 6 ------ arch/x86/kvm/vmx/vmx.c | 10 +++------- 4 files changed, 4 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b88321b66daa..2a4a8091a8e5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1113,7 +1113,6 @@ struct kvm_x86_ops { u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); - bool (*invpcid_supported)(void); void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7ac921f71bf4..e1c3c3cd28e0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -349,7 +349,6 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) { - unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? 
F(INTEL_PT) : 0; unsigned f_la57; @@ -358,7 +357,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) /* cpuid 7.0.ebx */ const u32 kvm_cpuid_7_0_ebx_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | 0 /*MPX*/ | F(RDSEED) | + F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a8c9c5d9a919..bc681bd198eb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6022,11 +6022,6 @@ static bool svm_rdtscp_supported(void) return boot_cpu_has(X86_FEATURE_RDTSCP); } -static bool svm_invpcid_supported(void) -{ - return false; -} - static bool svm_mpx_supported(void) { return false; @@ -7345,7 +7340,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpuid_update = svm_cpuid_update, .rdtscp_supported = svm_rdtscp_supported, - .invpcid_supported = svm_invpcid_supported, .mpx_supported = svm_mpx_supported, .xsaves_supported = svm_xsaves_supported, .umip_emulated = svm_umip_emulated, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6df0f54661e6..9200788a09c7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1631,11 +1631,6 @@ static bool vmx_rdtscp_supported(void) return cpu_has_vmx_rdtscp(); } -static bool vmx_invpcid_supported(void) -{ - return cpu_has_vmx_invpcid(); -} - /* * Swap MSR entry in host/guest MSR entry array. */ @@ -4072,7 +4067,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } } - if (vmx_invpcid_supported()) { + if (cpu_has_vmx_invpcid()) { /* Exposing INVPCID only when PCID is exposed */ bool invpcid_enabled = guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) && @@ -7108,6 +7103,8 @@ static void vmx_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) case 0x7: if (boot_cpu_has(X86_FEATURE_MPX) && kvm_mpx_supported()) cpuid_entry_set(entry, X86_FEATURE_MPX); + if (boot_cpu_has(X86_FEATURE_INVPCID) && cpu_has_vmx_invpcid()) + cpuid_entry_set(entry, X86_FEATURE_INVPCID); break; default: break; @@ -7900,7 +7897,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpuid_update = vmx_cpuid_update, .rdtscp_supported = vmx_rdtscp_supported, - .invpcid_supported = vmx_invpcid_supported, .set_supported_cpuid = vmx_set_supported_cpuid, -- Gitee From 02d80059dc358ddbcaafe0c712564947162aa9f5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:35 -0800 Subject: [PATCH 0036/2161] KVM: x86: Handle UMIP emulation CPUID adjustment in VMX code commit e574768f841ba600009da06b3027d9413dba868f upstream Move the CPUID adjustment for UMIP emulation into VMX code to eliminate an instance of the undesirable "unsigned f_* = *_supported ? F(*) : 0" pattern in the common CPUID handling code. No functional change intended. 
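The resulting split, where common code stays silent about the feature and the vendor hook advertises it only when it can actually be provided (here, emulated), can be sketched in standalone C. The names below (vmx_can_emulate_umip(), the bit position, the hook signature) are illustrative stand-ins, not the real KVM symbols:

#include <stdint.h>
#include <stdbool.h>

#define F_UMIP	(UINT32_C(1) << 2)	/* bit position chosen for the sketch */

struct cpuid_entry { uint32_t function, index, eax, ebx, ecx, edx; };

/* Stand-in for vmx_umip_emulated(): true when the feature can be emulated
 * (e.g. via descriptor-table exiting) even if the host CPU lacks it. */
static bool vmx_can_emulate_umip(void)
{
	return true;
}

/* Vendor hook in the style of ->set_supported_cpuid(). */
static void vendor_set_supported_cpuid(struct cpuid_entry *entry)
{
	if (entry->function == 7 && vmx_can_emulate_umip())
		entry->ecx |= F_UMIP;
}

int main(void)
{
	/* Common code built the leaf 7 entry without the bit ... */
	struct cpuid_entry e = { .function = 7, .index = 0, .ecx = 0 };

	/* ... the vendor hook sets it because it can emulate the feature. */
	vendor_set_supported_cpuid(&e);
	return (e.ecx & F_UMIP) ? 0 : 1;
}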
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 2 -- arch/x86/kvm/vmx/vmx.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e1c3c3cd28e0..7052616bf6aa 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -349,7 +349,6 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) { - unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; unsigned f_la57; unsigned f_pku = kvm_x86_ops->pku_supported() ? F(PKU) : 0; @@ -392,7 +391,6 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) cpuid_entry_mask(entry, CPUID_7_ECX); /* Set LA57 based on hardware capability. */ entry->ecx |= f_la57; - entry->ecx |= f_umip; entry->ecx |= f_pku; /* PKU is not yet implemented for shadow paging. */ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9200788a09c7..f8ca2cd49440 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7105,6 +7105,8 @@ static void vmx_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) cpuid_entry_set(entry, X86_FEATURE_MPX); if (boot_cpu_has(X86_FEATURE_INVPCID) && cpu_has_vmx_invpcid()) cpuid_entry_set(entry, X86_FEATURE_INVPCID); + if (vmx_umip_emulated()) + cpuid_entry_set(entry, X86_FEATURE_UMIP); break; default: break; -- Gitee From 06e81cd2b9e63f5a809ffe266f78dd5c0df4b058 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:36 -0800 Subject: [PATCH 0037/2161] KVM: x86: Handle PKU CPUID adjustment in VMX code commit d64d83d1e026f9fea9c8f18bf97b9529f7e4189c upstream Move the setting of the PKU CPUID bit into VMX to eliminate an instance of the undesirable "unsigned f_* = *_supported ? F(*) : 0" pattern in the common CPUID handling code. Drop ->pku_supported(), CPUID adjustment was the only user. Note, some AMD CPUs now support PKU, but SVM doesn't yet support exposing it to a guest. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/cpuid.c | 5 ----- arch/x86/kvm/svm.c | 6 ------ arch/x86/kvm/vmx/capabilities.h | 5 ----- arch/x86/kvm/vmx/vmx.c | 6 +++++- 5 files changed, 5 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2a4a8091a8e5..8a4d8e98767c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1134,7 +1134,6 @@ struct kvm_x86_ops { bool (*xsaves_supported)(void); bool (*umip_emulated)(void); bool (*pt_supported)(void); - bool (*pku_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu); void (*request_immediate_exit)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7052616bf6aa..95f170edd5f4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -351,7 +351,6 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) { unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; unsigned f_la57; - unsigned f_pku = kvm_x86_ops->pku_supported() ? 
F(PKU) : 0; /* cpuid 7.0.ebx */ const u32 kvm_cpuid_7_0_ebx_x86_features = @@ -391,10 +390,6 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) cpuid_entry_mask(entry, CPUID_7_ECX); /* Set LA57 based on hardware capability. */ entry->ecx |= f_la57; - entry->ecx |= f_pku; - /* PKU is not yet implemented for shadow paging. */ - if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) - cpuid_entry_clear(entry, X86_FEATURE_PKU); entry->edx &= kvm_cpuid_7_0_edx_x86_features; cpuid_entry_mask(entry, CPUID_7_EDX); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bc681bd198eb..ba39c3502839 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6047,11 +6047,6 @@ static bool svm_has_wbinvd_exit(void) return true; } -static bool svm_pku_supported(void) -{ - return false; -} - #define PRE_EX(exit) { .exit_code = (exit), \ .stage = X86_ICPT_PRE_EXCEPT, } #define POST_EX(exit) { .exit_code = (exit), \ @@ -7344,7 +7339,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .xsaves_supported = svm_xsaves_supported, .umip_emulated = svm_umip_emulated, .pt_supported = svm_pt_supported, - .pku_supported = svm_pku_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index f486e2606247..100cacde88fc 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -146,11 +146,6 @@ static inline bool vmx_umip_emulated(void) SECONDARY_EXEC_DESC; } -static inline bool vmx_pku_supported(void) -{ - return boot_cpu_has(X86_FEATURE_PKU); -} - static inline bool cpu_has_vmx_rdtscp(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f8ca2cd49440..0b8f76ba3f1b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7107,6 +7107,11 @@ static void vmx_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) cpuid_entry_set(entry, X86_FEATURE_INVPCID); if (vmx_umip_emulated()) cpuid_entry_set(entry, X86_FEATURE_UMIP); + + /* PKU is not yet implemented for shadow paging. */ + if (enable_ept && boot_cpu_has(X86_FEATURE_PKU) && + boot_cpu_has(X86_FEATURE_OSPKE)) + cpuid_entry_set(entry, X86_FEATURE_PKU); break; default: break; @@ -7915,7 +7920,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .xsaves_supported = vmx_xsaves_supported, .umip_emulated = vmx_umip_emulated, .pt_supported = vmx_pt_supported, - .pku_supported = vmx_pku_supported, .request_immediate_exit = vmx_request_immediate_exit, -- Gitee From 883d67ef4e965785cd59669c5aa3a72c502f44f2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:37 -0800 Subject: [PATCH 0038/2161] KVM: x86: Handle RDTSCP CPUID adjustment in VMX code commit 733deafc00df1dda5130fc14f87a1d3993913243 upstream Move the clearing of the RDTSCP CPUID bit into VMX, which has a separate VMCS control to enable RDTSCP in non-root, to eliminate an instance of the undesirable "unsigned f_* = *_supported ? F(*) : 0" pattern in the common CPUID handling code. Drop ->rdtscp_supported() since CPUID adjustment was the last remaining user. No functional change intended. 
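Outside the kernel, the inverted arrangement looks roughly like the sketch below; the predicate and bit position are invented stand-ins for cpu_has_vmx_rdtscp() and the CPUID layout. The common mask advertises the bit unconditionally, and the vendor side only ever clears it:

#include <stdint.h>
#include <stdbool.h>

#define F_RDTSCP	(UINT32_C(1) << 27)	/* bit position chosen for the sketch */

/* Stand-in for the VMCS secondary execution control check. */
static bool has_rdtscp_exec_control(void)
{
	return false;	/* pretend the control is absent */
}

int main(void)
{
	/* Common mask: RDTSCP advertised unconditionally ... */
	uint32_t edx = F_RDTSCP /* | other 0x80000001.EDX bits */;

	/* ... vendor adjustment: clear it when it cannot be enabled for the guest. */
	if (!has_rdtscp_exec_control())
		edx &= ~F_RDTSCP;

	return (edx & F_RDTSCP) ? 1 : 0;	/* expect 0: the bit was stripped */
}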
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 3 +-- arch/x86/kvm/vmx/vmx.c | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 95f170edd5f4..8ebe6327e900 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -434,7 +434,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) unsigned f_gbpages = 0; unsigned f_lm = 0; #endif - unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; @@ -456,7 +455,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | F(PAT) | F(PSE36) | 0 /* Reserved */ | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | + F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_cpuid_1_ecx_x86_features = diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0b8f76ba3f1b..cb7024c048f5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7113,6 +7113,10 @@ static void vmx_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) boot_cpu_has(X86_FEATURE_OSPKE)) cpuid_entry_set(entry, X86_FEATURE_PKU); break; + case 0x80000001: + if (!cpu_has_vmx_rdtscp()) + cpuid_entry_clear(entry, X86_FEATURE_RDTSCP); + break; default: break; } -- Gitee From 217a033557fa0d8a190283682a5d135439d16e1b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:38 -0800 Subject: [PATCH 0039/2161] KVM: x86: Handle Intel PT CPUID adjustment in VMX code commit dbd068040c64162fbbfa278eb63ef64704190612 upstream Move the Processor Trace CPUID adjustment into VMX code to eliminate an instance of the undesirable "unsigned f_* = *_supported ? F(*) : 0" pattern in the common CPUID handling code, and to pave the way toward eventually removing ->pt_supported(). No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 3 +-- arch/x86/kvm/vmx/vmx.c | 3 +++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8ebe6327e900..57f70bf4f01c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -349,7 +349,6 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) { - unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? 
F(INTEL_PT) : 0; unsigned f_la57; /* cpuid 7.0.ebx */ @@ -358,7 +357,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; + F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/; /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cb7024c048f5..bed726750fcd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7105,6 +7105,9 @@ static void vmx_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) cpuid_entry_set(entry, X86_FEATURE_MPX); if (boot_cpu_has(X86_FEATURE_INVPCID) && cpu_has_vmx_invpcid()) cpuid_entry_set(entry, X86_FEATURE_INVPCID); + if (boot_cpu_has(X86_FEATURE_INTEL_PT) && + vmx_pt_supported()) + cpuid_entry_set(entry, X86_FEATURE_INTEL_PT); if (vmx_umip_emulated()) cpuid_entry_set(entry, X86_FEATURE_UMIP); -- Gitee From 617ef7008fdc742f130db554480e94383598cca7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 3 Mar 2020 15:54:39 +0100 Subject: [PATCH 0040/2161] KVM: x86: handle GBPAGE CPUID adjustment for EPT with generic code commit fb7d4377d513145303c1d0a192cb4b33d72be2d9 upstream The clearing of the GBPAGE CPUID bit for VMX is wrong; support for 1GB pages in EPT has no relationship to whether 1GB pages should be marked as supported in CPUID. This has no ill effect because we're only clearing the bit, but we're not marking 1GB pages as available when EPT is disabled (even though they are actually supported thanks to shadowing). Instead, forcibly enable 1GB pages in the shadow paging case. This also eliminates an instance of the undesirable "unsigned f_* = *_supported ? F(*) : 0" pattern in the common CPUID handling code, and paves the way toward eliminating ->get_lpage_level(). Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 57f70bf4f01c..50916699a5c8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -426,8 +426,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) int r, i, max_idx; unsigned f_nx = is_efer_nx() ? F(NX) : 0; #ifdef CONFIG_X86_64 - unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) - ? F(GBPAGES) : 0; + unsigned f_gbpages = F(GBPAGES); unsigned f_lm = F(LM); #else unsigned f_gbpages = 0; @@ -705,6 +704,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) case 0x80000001: entry->edx &= kvm_cpuid_8000_0001_edx_x86_features; cpuid_entry_mask(entry, CPUID_8000_0001_EDX); + if (!tdp_enabled) + cpuid_entry_set(entry, X86_FEATURE_GBPAGES); entry->ecx &= kvm_cpuid_8000_0001_ecx_x86_features; cpuid_entry_mask(entry, CPUID_8000_0001_ECX); break; -- Gitee From dbaea01161f7dd52e7fa22fb9ae4606b9a9195b5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:40 -0800 Subject: [PATCH 0041/2161] KVM: x86: Refactor handling of XSAVES CPUID adjustment commit 9e6d01c2d9088efb8326997cafa8580295a49435 upstream Invert the handling of XSAVES, i.e. set it based on boot_cpu_has() by default, in preparation for adding KVM cpu caps, which will generate the mask at load time before ->xsaves_supported() is ready. No functional change intended. 
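The ordering problem the inversion solves can be shown with a standalone sketch: if the default mask is computed at module load, before the vendor callback is installed, it can only consult the host flags, and the callback check has to happen later as a clear-only adjustment. Everything below (names, bit position, the function-pointer plumbing) is illustrative, not the KVM implementation:

#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

#define F_XSAVES	(UINT32_C(1) << 3)	/* bit position chosen for the sketch */

/* Vendor callback, installed only once "hardware setup" has run. */
static bool (*xsaves_supported)(void) = NULL;

static bool host_has_xsaves(void) { return true; }	/* boot_cpu_has() stand-in */

/* Load time: the callback may not exist yet, so the default mask can only
 * look at what the host CPU advertises. */
static uint32_t default_d_1_eax(void)
{
	return host_has_xsaves() ? F_XSAVES : 0;
}

/* Runtime: the callback is installed by now and may veto the bit. */
static uint32_t adjust_d_1_eax(uint32_t eax)
{
	if (!xsaves_supported || !xsaves_supported())
		eax &= ~F_XSAVES;
	return eax;
}

static bool vendor_xsaves_supported(void) { return true; }	/* vendor stub */

int main(void)
{
	uint32_t eax = default_d_1_eax();		/* computed before the hook exists */

	xsaves_supported = vendor_xsaves_supported;	/* "hardware setup" installs it */
	eax = adjust_d_1_eax(eax);

	return (eax & F_XSAVES) ? 0 : 1;		/* expect 0: the bit survives */
}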
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 50916699a5c8..226b1698008e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -432,7 +432,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) unsigned f_gbpages = 0; unsigned f_lm = 0; #endif - unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; /* cpuid 1.edx */ @@ -489,7 +488,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = - F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves; + F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -623,6 +622,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax &= kvm_cpuid_D_1_eax_x86_features; cpuid_entry_mask(entry, CPUID_D_1_EAX); + + if (!kvm_x86_ops->xsaves_supported()) + cpuid_entry_clear(entry, X86_FEATURE_XSAVES); + if (entry->eax & (F(XSAVES)|F(XSAVEC))) entry->ebx = xstate_required_size(supported, true); else -- Gitee From 8827d2ceaa85b760eb9bc1bc9b613a1134bda95e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:41 -0800 Subject: [PATCH 0042/2161] KVM: x86: Introduce kvm_cpu_caps to replace runtime CPUID masking commit 66a6950f99950c77e2898e48d668eca1dac10a1e upstream Calculate the CPUID masks for KVM_GET_SUPPORTED_CPUID at load time using what is effectively a KVM-adjusted copy of boot_cpu_data, or more precisely, the x86_capability array in boot_cpu_data. In terms of KVM support, the vast majority of CPUID feature bits are constant, and *all* feature support is known at KVM load time. Rather than apply boot_cpu_data, which is effectively read-only after init, at runtime, copy it into a KVM-specific array and use *that* to mask CPUID registers. In additional to consolidating the masking, kvm_cpu_caps can be adjusted by SVM/VMX at load time and thus eliminate all feature bit manipulation in ->set_supported_cpuid(). Opportunistically clean up a few warts: - Replace bare "unsigned" with "unsigned int" when a feature flag is captured in a local variable, e.g. f_nx. - Sort the CPUID masks by function, index and register (alphabetically for registers, i.e. EBX comes before ECX/EDX). - Remove the superfluous /* cpuid 7.0.ecx */ comments. No functional change intended. Signed-off-by: Sean Christopherson [Call kvm_set_cpu_caps from kvm_x86_ops->hardware_setup due to fixed GBPAGES patch. - Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 233 ++++++++++++++++++++++------------------- arch/x86/kvm/cpuid.h | 22 +++- arch/x86/kvm/svm.c | 2 + arch/x86/kvm/vmx/vmx.c | 2 + 4 files changed, 151 insertions(+), 108 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 226b1698008e..7a44c304d951 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -24,6 +24,13 @@ #include "trace.h" #include "pmu.h" +/* + * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be + * aligned to sizeof(unsigned long) because it's not accessed via bitops. 
+ */ +u32 kvm_cpu_caps[NCAPINTS] __read_mostly; +EXPORT_SYMBOL_GPL(kvm_cpu_caps); + static u32 xstate_required_size(u64 xstate_bv, bool compacted) { int feature_bit = 0; @@ -272,6 +279,123 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, return r; } +static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) +{ + reverse_cpuid_check(leaf); + kvm_cpu_caps[leaf] &= mask; +} + +void kvm_set_cpu_caps(void) +{ + unsigned int f_nx = is_efer_nx() ? F(NX) : 0; +#ifdef CONFIG_X86_64 + unsigned int f_gbpages = F(GBPAGES); + unsigned int f_lm = F(LM); +#else + unsigned int f_gbpages = 0; + unsigned int f_lm = 0; +#endif + + BUILD_BUG_ON(sizeof(kvm_cpu_caps) > + sizeof(boot_cpu_data.x86_capability)); + + memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability, + sizeof(kvm_cpu_caps)); + + kvm_cpu_cap_mask(CPUID_1_ECX, + /* + * NOTE: MONITOR (and MWAIT) are emulated as NOP, but *not* + * advertised to guests via CPUID! + */ + F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | + 0 /* DS-CPL, VMX, SMX, EST */ | + 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | + F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | + F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | + F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | + 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | + F(F16C) | F(RDRAND) + ); + + kvm_cpu_cap_mask(CPUID_1_EDX, + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) | + 0 /* Reserved, DS, ACPI */ | F(MMX) | + F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | + 0 /* HTT, TM, Reserved, PBE */ + ); + + kvm_cpu_cap_mask(CPUID_7_0_EBX, + F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | + F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) | + F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | + F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | + F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/ + ); + + kvm_cpu_cap_mask(CPUID_7_ECX, + F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) | + F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | + F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ + ); + /* Set LA57 based on hardware capability. 
*/ + if (cpuid_ecx(7) & F(LA57)) + kvm_cpu_cap_set(X86_FEATURE_LA57); + + kvm_cpu_cap_mask(CPUID_7_EDX, + F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | + F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | + F(MD_CLEAR) + ); + + kvm_cpu_cap_mask(CPUID_7_1_EAX, + F(AVX512_BF16) + ); + + kvm_cpu_cap_mask(CPUID_D_1_EAX, + F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) + ); + + kvm_cpu_cap_mask(CPUID_8000_0001_ECX, + F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | + F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | + F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | + 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) | + F(TOPOEXT) | F(PERFCTR_CORE) + ); + + kvm_cpu_cap_mask(CPUID_8000_0001_EDX, + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* Reserved */ | + f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | + F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) | + 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW) + ); + + if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64)) + kvm_cpu_cap_set(X86_FEATURE_GBPAGES); + + kvm_cpu_cap_mask(CPUID_8000_0008_EBX, + F(CLZERO) | F(XSAVEERPTR) | + F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | + F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) + ); + + kvm_cpu_cap_mask(CPUID_C000_0001_EDX, + F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | + F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | + F(PMM) | F(PMM_EN) + ); +} +EXPORT_SYMBOL_GPL(kvm_set_cpu_caps); + struct kvm_cpuid_array { struct kvm_cpuid_entry2 *entries; const int maxnent; @@ -349,48 +473,13 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) { - unsigned f_la57; - - /* cpuid 7.0.ebx */ - const u32 kvm_cpuid_7_0_ebx_x86_features = - F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) | - F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | - F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/; - - /* cpuid 7.0.ecx*/ - const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) | - F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | - F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/; - - /* cpuid 7.0.edx*/ - const u32 kvm_cpuid_7_0_edx_x86_features = - F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | - F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | - F(MD_CLEAR); - - /* cpuid 7.1.eax */ - const u32 kvm_cpuid_7_1_eax_x86_features = - F(AVX512_BF16); - switch (entry->index) { case 0: entry->eax = min(entry->eax, 1u); - entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; cpuid_entry_mask(entry, CPUID_7_0_EBX); /* TSC_ADJUST is emulated */ cpuid_entry_set(entry, X86_FEATURE_TSC_ADJUST); - - entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; - f_la57 = cpuid_entry_get(entry, X86_FEATURE_LA57); cpuid_entry_mask(entry, CPUID_7_ECX); - /* Set LA57 based on hardware capability. 
*/ - entry->ecx |= f_la57; - - entry->edx &= kvm_cpuid_7_0_edx_x86_features; cpuid_entry_mask(entry, CPUID_7_EDX); if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS)) cpuid_entry_set(entry, X86_FEATURE_SPEC_CTRL); @@ -405,7 +494,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) cpuid_entry_set(entry, X86_FEATURE_ARCH_CAPABILITIES); break; case 1: - entry->eax &= kvm_cpuid_7_1_eax_x86_features; + cpuid_entry_mask(entry, CPUID_7_1_EAX); entry->ebx = 0; entry->ecx = 0; entry->edx = 0; @@ -424,72 +513,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) { struct kvm_cpuid_entry2 *entry; int r, i, max_idx; - unsigned f_nx = is_efer_nx() ? F(NX) : 0; -#ifdef CONFIG_X86_64 - unsigned f_gbpages = F(GBPAGES); - unsigned f_lm = F(LM); -#else - unsigned f_gbpages = 0; - unsigned f_lm = 0; -#endif unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; - /* cpuid 1.edx */ - const u32 kvm_cpuid_1_edx_x86_features = - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) | - 0 /* Reserved, DS, ACPI */ | F(MMX) | - F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | - 0 /* HTT, TM, Reserved, PBE */; - /* cpuid 0x80000001.edx */ - const u32 kvm_cpuid_8000_0001_edx_x86_features = - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* Reserved */ | - f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) | - 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); - /* cpuid 1.ecx */ - const u32 kvm_cpuid_1_ecx_x86_features = - /* NOTE: MONITOR (and MWAIT) are emulated as NOP, - * but *not* advertised to guests via CPUID ! 
*/ - F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | - 0 /* DS-CPL, VMX, SMX, EST */ | - 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | - F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | - F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | - F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | - F(F16C) | F(RDRAND); - /* cpuid 0x80000001.ecx */ - const u32 kvm_cpuid_8000_0001_ecx_x86_features = - F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | - F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | - 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) | - F(TOPOEXT) | F(PERFCTR_CORE); - - /* cpuid 0x80000008.ebx */ - const u32 kvm_cpuid_8000_0008_ebx_x86_features = - F(CLZERO) | F(XSAVEERPTR) | - F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | - F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON); - - /* cpuid 0xC0000001.edx */ - const u32 kvm_cpuid_C000_0001_edx_x86_features = - F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | - F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | - F(PMM) | F(PMM_EN); - - /* cpuid 0xD.1.eax */ - const u32 kvm_cpuid_D_1_eax_x86_features = - F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES); - /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -505,9 +530,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = min(entry->eax, 0x1fU); break; case 1: - entry->edx &= kvm_cpuid_1_edx_x86_features; cpuid_entry_mask(entry, CPUID_1_EDX); - entry->ecx &= kvm_cpuid_1_ecx_x86_features; cpuid_entry_mask(entry, CPUID_1_ECX); /* we support x2apic emulation even if host does not support * it since we emulate x2apic in software */ @@ -620,7 +643,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) if (!entry) goto out; - entry->eax &= kvm_cpuid_D_1_eax_x86_features; cpuid_entry_mask(entry, CPUID_D_1_EAX); if (!kvm_x86_ops->xsaves_supported()) @@ -705,11 +727,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = min(entry->eax, 0x8000001f); break; case 0x80000001: - entry->edx &= kvm_cpuid_8000_0001_edx_x86_features; cpuid_entry_mask(entry, CPUID_8000_0001_EDX); + /* Add it manually because it may not be in host CPUID. */ if (!tdp_enabled) cpuid_entry_set(entry, X86_FEATURE_GBPAGES); - entry->ecx &= kvm_cpuid_8000_0001_ecx_x86_features; cpuid_entry_mask(entry, CPUID_8000_0001_ECX); break; case 0x80000007: /* Advanced power management */ @@ -728,7 +749,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); entry->edx = 0; - entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; cpuid_entry_mask(entry, CPUID_8000_0008_EBX); /* * AMD has separate bits for each SPEC_CTRL bit. 
@@ -766,7 +786,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = min(entry->eax, 0xC0000004); break; case 0xC0000001: - entry->edx &= kvm_cpuid_C000_0001_edx_x86_features; cpuid_entry_mask(entry, CPUID_C000_0001_EDX); break; case 3: /* Processor serial number */ diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index b688ecc9d1fb..b79060b817f0 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -6,6 +6,9 @@ #include #include +extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly; +void kvm_set_cpu_caps(void); + int kvm_update_cpuid(struct kvm_vcpu *vcpu); bool kvm_mpx_supported(void); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, @@ -173,7 +176,8 @@ static __always_inline void cpuid_entry_mask(struct kvm_cpuid_entry2 *entry, { u32 *reg = cpuid_entry_get_reg(entry, leaf * 32); - *reg &= boot_cpu_data.x86_capability[leaf]; + BUILD_BUG_ON(leaf >= ARRAY_SIZE(kvm_cpu_caps)); + *reg &= kvm_cpu_caps[leaf]; } static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, @@ -281,4 +285,20 @@ static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu) MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; } +static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) +{ + unsigned int x86_leaf = x86_feature / 32; + + reverse_cpuid_check(x86_leaf); + kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature); +} + +static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) +{ + unsigned int x86_leaf = x86_feature / 32; + + reverse_cpuid_check(x86_leaf); + kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature); +} + #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ba39c3502839..f9d4d4d09568 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1450,6 +1450,8 @@ static __init int svm_hardware_setup(void) pr_info("Virtual GIF supported\n"); } + kvm_set_cpu_caps(); + return 0; err: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bed726750fcd..033462795526 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7801,6 +7801,8 @@ static __init int hardware_setup(void) return r; } + kvm_set_cpu_caps(); + r = alloc_kvm_area(); if (r) nested_vmx_hardware_unsetup(); -- Gitee From 4e13ebce6003f3bd9357288b7cf426a764bb35b5 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 24 May 2021 11:51:13 +0800 Subject: [PATCH 0043/2161] KVM: SVM: Convert feature updates from CPUID to KVM cpu caps commit 9b58b9857f221e4f7149a22727ef61d0c141f56b upstream Use the recently introduced KVM CPU caps to propagate SVM-only (kernel) settings to supported CPUID flags. Note, there are a few subtleties: - Setting a flag based on a *different* feature is effectively emulation, and must be done at runtime via ->set_supported_cpuid(). - CPUID 0x8000000A.EDX is a feature leaf that was previously not adjusted by kvm_cpu_cap_mask() because all features are hidden by default. Opportunistically add a technically unnecessary break and fix an indentation issue in svm_set_supported_cpuid(). No functional change intended. 
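Stripped of the kernel plumbing, the cap-array idea both vendor conversions rely on reduces to the sketch below: one u32 per capability word, seeded from the host at load time, ANDed with what common code supports, then nudged by vendor code before any guest entry is built. NWORDS, the word index and the bit positions are invented for the example and do not match the kernel's values:

#include <stdint.h>
#include <string.h>

#define NWORDS		4			/* stand-in for NCAPINTS */
#define WORD_7_0_EBX	1			/* which word holds leaf 7.0 EBX here */
#define F_SMEP		(UINT32_C(1) << 7)	/* bit positions chosen for the sketch */
#define F_MPX		(UINT32_C(1) << 14)

static uint32_t host_caps[NWORDS];		/* boot_cpu_data.x86_capability stand-in */
static uint32_t cpu_caps[NWORDS];		/* kvm_cpu_caps stand-in */

/* Load time: seed from the host, then AND with what common code supports. */
static void set_cpu_caps(void)
{
	memcpy(cpu_caps, host_caps, sizeof(cpu_caps));
	cpu_caps[WORD_7_0_EBX] &= (F_SMEP /* | other commonly supported bits */);
}

/* Vendor code can still force a bit on (emulated/virtualized) or off. */
static void cap_set(int word, uint32_t bit)   { cpu_caps[word] |= bit; }
static void cap_clear(int word, uint32_t bit) { cpu_caps[word] &= ~bit; }

/* Runtime: masking a guest CPUID register is now a single AND per word. */
static uint32_t mask_entry_reg(int word, uint32_t reg)
{
	return reg & cpu_caps[word];
}

int main(void)
{
	host_caps[WORD_7_0_EBX] = F_SMEP | F_MPX;

	set_cpu_caps();				/* MPX dropped by the common mask */
	cap_set(WORD_7_0_EBX, F_MPX);		/* vendor re-enables it when it can virtualize it */
	(void)cap_clear;			/* clear-side helper, unused in this run */

	return mask_entry_reg(WORD_7_0_EBX, F_SMEP | F_MPX) ==
	       (F_SMEP | F_MPX) ? 0 : 1;
}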
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 6 ++++++ arch/x86/kvm/svm.c | 45 ++++++++++++++++++++++++++++---------------- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7a44c304d951..371e1756dc18 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -388,6 +388,12 @@ void kvm_set_cpu_caps(void) F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) ); + /* + * Hide all SVM features by default, SVM will set the cap bits for + * features it emulates and/or exposes for L1. + */ + kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0); + kvm_cpu_cap_mask(CPUID_C000_0001_EDX, F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f9d4d4d09568..1f00cf462e11 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1340,6 +1340,23 @@ static __init void svm_adjust_mmio_mask(void) kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); } +static __init void svm_set_cpu_caps(void) +{ + kvm_set_cpu_caps(); + + /* CPUID 0x80000001 */ + if (nested) + kvm_cpu_cap_set(X86_FEATURE_SVM); + + /* CPUID 0x8000000A */ + /* Support next_rip if host supports it */ + if (boot_cpu_has(X86_FEATURE_NRIPS)) + kvm_cpu_cap_set(X86_FEATURE_NRIPS); + + if (npt_enabled) + kvm_cpu_cap_set(X86_FEATURE_NPT); +} + static __init int svm_hardware_setup(void) { int cpu; @@ -1450,7 +1467,7 @@ static __init int svm_hardware_setup(void) pr_info("Virtual GIF supported\n"); } - kvm_set_cpu_caps(); + svm_set_cpu_caps(); return 0; @@ -5973,6 +5990,14 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC); } +/* + * Vendor specific emulation must be handled via ->set_supported_cpuid(), not + * svm_set_cpu_caps(), as capabilities configured during hardware_setup() are + * masked against hardware/kernel support, i.e. they'd be lost. + * + * Note, setting a flag based on a *different* feature, e.g. setting VIRT_SSBD + * if LS_CFG_SSBD or AMD_SSBD is supported, is effectively emulation. + */ static void svm_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) { switch (entry->function) { @@ -5980,13 +6005,9 @@ static void svm_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) if (avic) cpuid_entry_clear(entry, X86_FEATURE_X2APIC); break; - case 0x80000001: - if (nested) - cpuid_entry_set(entry, X86_FEATURE_SVM); - break; case 0x80000008: if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || - boot_cpu_has(X86_FEATURE_AMD_SSBD)) + boot_cpu_has(X86_FEATURE_AMD_SSBD)) cpuid_entry_set(entry, X86_FEATURE_VIRT_SSBD); break; case 0x8000000A: @@ -5994,16 +6015,8 @@ static void svm_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper ASID emulation to nested SVM */ entry->ecx = 0; /* Reserved */ - entry->edx = 0; /* Per default do not support any - additional features */ - - /* Support next_rip if host supports it */ - if (boot_cpu_has(X86_FEATURE_NRIPS)) - cpuid_entry_set(entry, X86_FEATURE_NRIPS); - - /* Support NPT for the guest if enabled */ - if (npt_enabled) - cpuid_entry_set(entry, X86_FEATURE_NPT); + /* Note, 0x8000000A.EDX is managed via kvm_cpu_caps. 
*/; + cpuid_entry_mask(entry, CPUID_8000_000A_EDX); break; case 0x8000001F: /* Support memory encryption cpuid if host supports it */ -- Gitee From 5a409265e32c89fdd22941673c9ad8d7b4a3d0d4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:43 -0800 Subject: [PATCH 0044/2161] KVM: VMX: Convert feature updates from CPUID to KVM cpu caps commit 3ec6fd8cf0ba6bb3ded5cdee88319c9af90e14c8 upstream Use the recently introduced KVM CPU caps to propagate VMX-only (kernel) settings to supported CPUID flags. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmx.c | 54 ++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 033462795526..ab1c28e72aa0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7093,38 +7093,50 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) update_intel_pt_cfg(vcpu); } +/* + * Vendor specific emulation must be handled via ->set_supported_cpuid(), not + * vmx_set_cpu_caps(), as capabilities configured during hardware_setup() are + * masked against hardware/kernel support, i.e. they'd be lost. + */ static void vmx_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) { switch (entry->function) { - case 0x1: - if (nested) - cpuid_entry_set(entry, X86_FEATURE_VMX); - break; case 0x7: - if (boot_cpu_has(X86_FEATURE_MPX) && kvm_mpx_supported()) - cpuid_entry_set(entry, X86_FEATURE_MPX); - if (boot_cpu_has(X86_FEATURE_INVPCID) && cpu_has_vmx_invpcid()) - cpuid_entry_set(entry, X86_FEATURE_INVPCID); - if (boot_cpu_has(X86_FEATURE_INTEL_PT) && - vmx_pt_supported()) - cpuid_entry_set(entry, X86_FEATURE_INTEL_PT); if (vmx_umip_emulated()) cpuid_entry_set(entry, X86_FEATURE_UMIP); - - /* PKU is not yet implemented for shadow paging. */ - if (enable_ept && boot_cpu_has(X86_FEATURE_PKU) && - boot_cpu_has(X86_FEATURE_OSPKE)) - cpuid_entry_set(entry, X86_FEATURE_PKU); - break; - case 0x80000001: - if (!cpu_has_vmx_rdtscp()) - cpuid_entry_clear(entry, X86_FEATURE_RDTSCP); break; default: break; } } +static __init void vmx_set_cpu_caps(void) +{ + kvm_set_cpu_caps(); + + /* CPUID 0x1 */ + if (nested) + kvm_cpu_cap_set(X86_FEATURE_VMX); + + /* CPUID 0x7 */ + if (boot_cpu_has(X86_FEATURE_MPX) && kvm_mpx_supported()) + kvm_cpu_cap_set(X86_FEATURE_MPX); + if (boot_cpu_has(X86_FEATURE_INVPCID) && cpu_has_vmx_invpcid()) + kvm_cpu_cap_set(X86_FEATURE_INVPCID); + if (boot_cpu_has(X86_FEATURE_INTEL_PT) && + vmx_pt_supported()) + kvm_cpu_cap_set(X86_FEATURE_INTEL_PT); + + /* PKU is not yet implemented for shadow paging. 
*/ + if (enable_ept && boot_cpu_has(X86_FEATURE_PKU) && + boot_cpu_has(X86_FEATURE_OSPKE)) + kvm_cpu_cap_set(X86_FEATURE_PKU); + + /* CPUID 0x80000001 */ + if (!cpu_has_vmx_rdtscp()) + kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); +} + static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) { to_vmx(vcpu)->req_immediate_exit = true; @@ -7801,7 +7813,7 @@ static __init int hardware_setup(void) return r; } - kvm_set_cpu_caps(); + vmx_set_cpu_caps(); r = alloc_kvm_area(); if (r) -- Gitee From f3193aeb1bf69128a7d67c815ba26b8e615551a5 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 25 May 2021 18:01:34 +0800 Subject: [PATCH 0045/2161] KVM: x86: Use KVM cpu caps to mark CR4.LA57 as not-reserved commit c10398b6d0ddb9c8234890828ab83341e11f9840 upstream Add accessor(s) for KVM cpu caps and use said accessor to detect hardware support for LA57 instead of manually querying CPUID. Note, the explicit conversion to bool via '!!' in kvm_cpu_cap_has() is technically unnecessary, but it gives people a warm fuzzy feeling. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.h | 13 +++++++++++++ arch/x86/kvm/x86.c | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index b79060b817f0..90927d8a1b55 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -301,4 +301,17 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature); } +static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature) +{ + unsigned int x86_leaf = x86_feature / 32; + + reverse_cpuid_check(x86_leaf); + return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature); +} + +static __always_inline bool kvm_cpu_cap_has(unsigned int x86_feature) +{ + return !!kvm_cpu_cap_get(x86_feature); +} + #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a4b128c1c091..e722a876b154 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -936,7 +936,7 @@ static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) reserved_bits |= X86_CR4_PKE; if (!cpu_has(c, X86_FEATURE_LA57) && - !(cpuid_ecx(0x7) & feature_bit(LA57))) + !kvm_cpu_cap_has(X86_FEATURE_LA57)) reserved_bits |= X86_CR4_LA57; if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated()) -- Gitee From b1fc9c992f9a88bd63081ee342f6ae5b867e1e02 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 24 May 2021 17:08:01 +0800 Subject: [PATCH 0046/2161] KVM: x86: Use KVM cpu caps to track UMIP emulation commit 90d2f60f41f73b90768554e5a30b1cfedd167731 upstream Set UMIP in kvm_cpu_caps when it is emulated by VMX, even though the bit will effectively be dropped by do_host_cpuid(). This allows checking for UMIP emulation via kvm_cpu_caps instead of a dedicated kvm_x86_ops callback. No functional change intended. 
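(Illustrative sketch, not part of this patch: the caller-side effect, condensed from the x86.c hunk below -- the dedicated ->umip_emulated() callback goes away in favor of a kvm_cpu_caps query.)

    /* before */
    if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated())
            reserved_bits |= X86_CR4_UMIP;

    /* after */
    if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_cpu_cap_has(X86_FEATURE_UMIP))
            reserved_bits |= X86_CR4_UMIP;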
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/svm.c | 6 ------ arch/x86/kvm/vmx/vmx.c | 8 +++++++- arch/x86/kvm/x86.c | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8a4d8e98767c..88c4abeb83b8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1132,7 +1132,6 @@ struct kvm_x86_ops { void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); bool (*mpx_supported)(void); bool (*xsaves_supported)(void); - bool (*umip_emulated)(void); bool (*pt_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1f00cf462e11..68befcd76f18 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6047,11 +6047,6 @@ static bool svm_xsaves_supported(void) return false; } -static bool svm_umip_emulated(void) -{ - return false; -} - static bool svm_pt_supported(void) { return false; @@ -7352,7 +7347,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .rdtscp_supported = svm_rdtscp_supported, .mpx_supported = svm_mpx_supported, .xsaves_supported = svm_xsaves_supported, - .umip_emulated = svm_umip_emulated, .pt_supported = svm_pt_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ab1c28e72aa0..b7bd919a1c18 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7102,6 +7102,10 @@ static void vmx_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) { switch (entry->function) { case 0x7: + /* + * UMIP needs to be manually set even though vmx_set_cpu_caps() + * also sets UMIP since do_host_cpuid() will drop it. + */ if (vmx_umip_emulated()) cpuid_entry_set(entry, X86_FEATURE_UMIP); break; @@ -7132,6 +7136,9 @@ static __init void vmx_set_cpu_caps(void) boot_cpu_has(X86_FEATURE_OSPKE)) kvm_cpu_cap_set(X86_FEATURE_PKU); + if (vmx_umip_emulated()) + kvm_cpu_cap_set(X86_FEATURE_UMIP); + /* CPUID 0x80000001 */ if (!cpu_has_vmx_rdtscp()) kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); @@ -7939,7 +7946,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .handle_exit_irqoff = vmx_handle_exit_irqoff, .mpx_supported = vmx_mpx_supported, .xsaves_supported = vmx_xsaves_supported, - .umip_emulated = vmx_umip_emulated, .pt_supported = vmx_pt_supported, .request_immediate_exit = vmx_request_immediate_exit, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e722a876b154..6e1c6d9e7afd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -939,7 +939,7 @@ static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) !kvm_cpu_cap_has(X86_FEATURE_LA57)) reserved_bits |= X86_CR4_LA57; - if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated()) + if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_cpu_cap_has(X86_FEATURE_UMIP)) reserved_bits |= X86_CR4_UMIP; return reserved_bits; -- Gitee From 09cc38ffe8a568bffbee203b4fb35dd7e3d595ba Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:48 -0800 Subject: [PATCH 0047/2161] KVM: x86: Fold CPUID 0x7 masking back into __do_cpuid_func() commit 09f628a0b49cf489faada00254569813a2143894 upstream Move the CPUID 0x7 masking back into __do_cpuid_func() now that the size of the code has been trimmed down significantly. 
Tweak the WARN case, which is impossible to hit unless the CPU is completely broken, to break the loop before creating the bogus entry. Opportunustically reorder the cpuid_entry_set() calls and shorten the comment about emulation to further reduce the footprint of CPUID 0x7. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 62 ++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 40 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 371e1756dc18..8f094431a4f2 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -477,44 +477,6 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) return 0; } -static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry) -{ - switch (entry->index) { - case 0: - entry->eax = min(entry->eax, 1u); - cpuid_entry_mask(entry, CPUID_7_0_EBX); - /* TSC_ADJUST is emulated */ - cpuid_entry_set(entry, X86_FEATURE_TSC_ADJUST); - cpuid_entry_mask(entry, CPUID_7_ECX); - cpuid_entry_mask(entry, CPUID_7_EDX); - if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS)) - cpuid_entry_set(entry, X86_FEATURE_SPEC_CTRL); - if (boot_cpu_has(X86_FEATURE_STIBP)) - cpuid_entry_set(entry, X86_FEATURE_INTEL_STIBP); - if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) - cpuid_entry_set(entry, X86_FEATURE_SPEC_CTRL_SSBD); - /* - * We emulate ARCH_CAPABILITIES in software even - * if the host doesn't support it. - */ - cpuid_entry_set(entry, X86_FEATURE_ARCH_CAPABILITIES); - break; - case 1: - cpuid_entry_mask(entry, CPUID_7_1_EAX); - entry->ebx = 0; - entry->ecx = 0; - entry->edx = 0; - break; - default: - WARN_ON_ONCE(1); - entry->eax = 0; - entry->ebx = 0; - entry->ecx = 0; - entry->edx = 0; - break; - } -} - static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) { struct kvm_cpuid_entry2 *entry; @@ -576,14 +538,34 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; /* function 7 has additional index. */ case 7: - do_cpuid_7_mask(entry); + entry->eax = min(entry->eax, 1u); + cpuid_entry_mask(entry, CPUID_7_0_EBX); + cpuid_entry_mask(entry, CPUID_7_ECX); + cpuid_entry_mask(entry, CPUID_7_EDX); + + /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */ + cpuid_entry_set(entry, X86_FEATURE_TSC_ADJUST); + cpuid_entry_set(entry, X86_FEATURE_ARCH_CAPABILITIES); + + if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS)) + cpuid_entry_set(entry, X86_FEATURE_SPEC_CTRL); + if (boot_cpu_has(X86_FEATURE_STIBP)) + cpuid_entry_set(entry, X86_FEATURE_INTEL_STIBP); + if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) + cpuid_entry_set(entry, X86_FEATURE_SPEC_CTRL_SSBD); for (i = 1, max_idx = entry->eax; i <= max_idx; i++) { + if (WARN_ON_ONCE(i > 1)) + break; + entry = do_host_cpuid(array, function, i); if (!entry) goto out; - do_cpuid_7_mask(entry); + cpuid_entry_mask(entry, CPUID_7_1_EAX); + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; } break; case 9: -- Gitee From 4b08785a7d4f8f4983fc6c13c296776d88d1a152 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:52 -0800 Subject: [PATCH 0048/2161] KVM: x86: Do host CPUID at load time to mask KVM cpu caps commit d8577a4c238f8bd3089bfb428fece97f14eaabad upstream Mask kvm_cpu_caps based on host CPUID in preparation for overriding the CPUID results during KVM_GET_SUPPORTED_CPUID instead of doing the masking at runtime. 
Note, masking may or may not be necessary, e.g. the kernel rarely, if ever, sets real CPUID bits that are not supported by hardware. But, the code is cheap and only runs once at load, so an abundance of caution is warranted. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8f094431a4f2..c82d69c1dae6 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -281,8 +281,16 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) { + const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32); + struct kvm_cpuid_entry2 entry; + reverse_cpuid_check(leaf); kvm_cpu_caps[leaf] &= mask; + + cpuid_count(cpuid.function, cpuid.index, + &entry.eax, &entry.ebx, &entry.ecx, &entry.edx); + + kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, &cpuid); } void kvm_set_cpu_caps(void) -- Gitee From 3f133a32be153f9f715beabd95b45d70a5f841e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:53 -0800 Subject: [PATCH 0049/2161] KVM: x86: Override host CPUID results with kvm_cpu_caps commit bd79199990477bf0c316b32bfcbd9862dc0f08ec upstream Override CPUID entries with kvm_cpu_caps during KVM_GET_SUPPORTED_CPUID instead of masking the host CPUID result, which is redundant now that the host CPUID is incorporated into kvm_cpu_caps at runtime. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 25 +++++++++++-------------- arch/x86/kvm/cpuid.h | 6 +++--- arch/x86/kvm/svm.c | 3 +-- arch/x86/kvm/vmx/vmx.c | 12 ------------ 4 files changed, 15 insertions(+), 31 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c82d69c1dae6..b6fcd321c739 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -506,8 +506,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = min(entry->eax, 0x1fU); break; case 1: - cpuid_entry_mask(entry, CPUID_1_EDX); - cpuid_entry_mask(entry, CPUID_1_ECX); + cpuid_entry_override(entry, CPUID_1_EDX); + cpuid_entry_override(entry, CPUID_1_ECX); /* we support x2apic emulation even if host does not support * it since we emulate x2apic in software */ cpuid_entry_set(entry, X86_FEATURE_X2APIC); @@ -547,9 +547,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) /* function 7 has additional index. */ case 7: entry->eax = min(entry->eax, 1u); - cpuid_entry_mask(entry, CPUID_7_0_EBX); - cpuid_entry_mask(entry, CPUID_7_ECX); - cpuid_entry_mask(entry, CPUID_7_EDX); + cpuid_entry_override(entry, CPUID_7_0_EBX); + cpuid_entry_override(entry, CPUID_7_ECX); + cpuid_entry_override(entry, CPUID_7_EDX); /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. 
*/ cpuid_entry_set(entry, X86_FEATURE_TSC_ADJUST); @@ -570,7 +570,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) if (!entry) goto out; - cpuid_entry_mask(entry, CPUID_7_1_EAX); + cpuid_entry_override(entry, CPUID_7_1_EAX); entry->ebx = 0; entry->ecx = 0; entry->edx = 0; @@ -639,7 +639,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) if (!entry) goto out; - cpuid_entry_mask(entry, CPUID_D_1_EAX); + cpuid_entry_override(entry, CPUID_D_1_EAX); if (!kvm_x86_ops->xsaves_supported()) cpuid_entry_clear(entry, X86_FEATURE_XSAVES); @@ -723,11 +723,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = min(entry->eax, 0x8000001f); break; case 0x80000001: - cpuid_entry_mask(entry, CPUID_8000_0001_EDX); - /* Add it manually because it may not be in host CPUID. */ - if (!tdp_enabled) - cpuid_entry_set(entry, X86_FEATURE_GBPAGES); - cpuid_entry_mask(entry, CPUID_8000_0001_ECX); + cpuid_entry_override(entry, CPUID_8000_0001_EDX); + cpuid_entry_override(entry, CPUID_8000_0001_ECX); break; case 0x80000007: /* Advanced power management */ /* invariant TSC is CPUID.80000007H:EDX[8] */ @@ -745,7 +742,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); entry->edx = 0; - cpuid_entry_mask(entry, CPUID_8000_0008_EBX); + cpuid_entry_override(entry, CPUID_8000_0008_EBX); /* * AMD has separate bits for each SPEC_CTRL bit. * arch/x86/kernel/cpu/bugs.c is kind enough to @@ -782,7 +779,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = min(entry->eax, 0xC0000004); break; case 0xC0000001: - cpuid_entry_mask(entry, CPUID_C000_0001_EDX); + cpuid_entry_override(entry, CPUID_C000_0001_EDX); break; case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 90927d8a1b55..db0548d707d6 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -171,13 +171,13 @@ static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry, *reg &= ~__feature_bit(x86_feature); } -static __always_inline void cpuid_entry_mask(struct kvm_cpuid_entry2 *entry, - enum cpuid_leafs leaf) +static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry, + enum cpuid_leafs leaf) { u32 *reg = cpuid_entry_get_reg(entry, leaf * 32); BUILD_BUG_ON(leaf >= ARRAY_SIZE(kvm_cpu_caps)); - *reg &= kvm_cpu_caps[leaf]; + *reg = kvm_cpu_caps[leaf]; } static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 68befcd76f18..f39e92408f30 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6015,8 +6015,7 @@ static void svm_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper ASID emulation to nested SVM */ entry->ecx = 0; /* Reserved */ - /* Note, 0x8000000A.EDX is managed via kvm_cpu_caps. 
*/; - cpuid_entry_mask(entry, CPUID_8000_000A_EDX); + cpuid_entry_override(entry, CPUID_8000_000A_EDX); break; case 0x8000001F: /* Support memory encryption cpuid if host supports it */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b7bd919a1c18..32a27848f08f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7100,18 +7100,6 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) */ static void vmx_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) { - switch (entry->function) { - case 0x7: - /* - * UMIP needs to be manually set even though vmx_set_cpu_caps() - * also sets UMIP since do_host_cpuid() will drop it. - */ - if (vmx_umip_emulated()) - cpuid_entry_set(entry, X86_FEATURE_UMIP); - break; - default: - break; - } } static __init void vmx_set_cpu_caps(void) -- Gitee From 19d3ce1dfd5224ac94c8534500d0f8d41a57e312 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 25 Mar 2020 12:12:59 -0700 Subject: [PATCH 0050/2161] KVM: x86: Fix BUILD_BUG() in __cpuid_entry_get_reg() w/ CONFIG_UBSAN=y commit 855c7e9b9c2c39fb8d108ac70d0eed530f80a2aa upstream Take the target reg in __cpuid_entry_get_reg() instead of a pointer to a struct cpuid_reg. When building with -fsanitize=alignment (enabled by CONFIG_UBSAN=y), some versions of gcc get tripped up on the pointer and trigger the BUILD_BUG(). Reported-by: Randy Dunlap Fixes: d8577a4c238f8 ("KVM: x86: Do host CPUID at load time to mask KVM cpu caps") Fixes: 4c61534aaae2a ("KVM: x86: Introduce cpuid_entry_{get,has}() accessors") Signed-off-by: Sean Christopherson Message-Id: <20200325191259.23559-1-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/cpuid.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b6fcd321c739..dc7f0d7fca1d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -290,7 +290,7 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) cpuid_count(cpuid.function, cpuid.index, &entry.eax, &entry.ebx, &entry.ecx, &entry.edx); - kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, &cpuid); + kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg); } void kvm_set_cpu_caps(void) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index db0548d707d6..9938797b6273 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -100,9 +100,9 @@ static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_featu } static __always_inline u32 *__cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, - const struct cpuid_reg *cpuid) + u32 reg) { - switch (cpuid->reg) { + switch (reg) { case CPUID_EAX: return &entry->eax; case CPUID_EBX: @@ -122,7 +122,7 @@ static __always_inline u32 *cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, { const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); - return __cpuid_entry_get_reg(entry, &cpuid); + return __cpuid_entry_get_reg(entry, cpuid.reg); } static __always_inline u32 cpuid_entry_get(struct kvm_cpuid_entry2 *entry, @@ -190,7 +190,7 @@ static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, if (!entry) return NULL; - return __cpuid_entry_get_reg(entry, &cpuid); + return __cpuid_entry_get_reg(entry, cpuid.reg); } static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, -- Gitee From 6b013fce72f9870dfead42e9ea8e19f71ef63f37 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 20 Mar 2020 14:28:01 
-0700 Subject: [PATCH 0051/2161] KVM: x86: Export kvm_propagate_fault() (as kvm_inject_emulated_page_fault) commit 53b3d8e9d57753295b33065f80b1e2fb4fcb946d upstream Export the page fault propagation helper so that VMX can use it to correctly emulate TLB invalidation on page faults in an upcoming patch. In the (hopefully) not-too-distant future, SGX virtualization will also want access to the helper for injecting page faults to the correct level (L1 vs. L2) when emulating ENCLS instructions. Rename the function to kvm_inject_emulated_page_fault() to clarify that it is (a) injecting a fault and (b) only for page faults. WARN if it's invoked with an exception other than PF_VECTOR. Signed-off-by: Sean Christopherson Message-Id: <20200320212833.3507-6-sean.j.christopherson@intel.com> Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 88c4abeb83b8..f62356a9bc48 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1405,6 +1405,8 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); +bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault); int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t gfn, void *data, int offset, int len, u32 access); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6e1c6d9e7afd..7207b4b25105 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -609,8 +609,11 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); -static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) +bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) { + WARN_ON_ONCE(fault->vector != PF_VECTOR); + if (mmu_is_nested(vcpu) && !fault->nested_page_fault) vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); else @@ -618,6 +621,7 @@ static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fau return fault->nested_page_fault; } +EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); void kvm_inject_nmi(struct kvm_vcpu *vcpu) { @@ -6422,7 +6426,7 @@ static bool inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; if (ctxt->exception.vector == PF_VECTOR) - return kvm_propagate_fault(vcpu, &ctxt->exception); + return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); if (ctxt->exception.error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception.vector, -- Gitee From b8ad901edf92b2ad132e3c1bb2b55221cb39b6d5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 17 Apr 2020 12:21:06 -0400 Subject: [PATCH 0052/2161] KVM: SVM: avoid infinite loop on NPF from bad address commit e72436bc3a5206f95bb384e741154166ddb3202e upstream When a nested page fault is taken from an address that does not have a memslot associated to it, kvm_mmu_do_page_fault returns RET_PF_EMULATE (via mmu_set_spte) and kvm_mmu_page_fault then invokes svm_need_emulation_on_page_fault. 
The default answer there is to return false, but in this case this just causes the page fault to be retried ad libitum. Since this is not a fast path, and the only other case where it is taken is an erratum, just stick a kvm_vcpu_gfn_to_memslot check in there to detect the common case where the erratum is not happening. This fixes an infinite loop in the new set_memory_region_test. Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/svm.c | 7 +++++++ virt/kvm/kvm_main.c | 1 + 2 files changed, 8 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f39e92408f30..f6e536070465 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -7183,6 +7183,13 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) bool smap = cr4 & X86_CR4_SMAP; bool is_user = svm_get_cpl(vcpu) == 3; + /* + * If RIP is invalid, go ahead with emulation which will cause an + * internal error exit. + */ + if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) + return true; + /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. * diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4176f370ee2b..238734754260 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1390,6 +1390,7 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn { return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn); } +EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { -- Gitee From 2e7b95d18fdb1850720d3d3f1807ec2ed863f10f Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 26 May 2021 18:05:08 +0800 Subject: [PATCH 0053/2161] KVM: x86: Add kvm_x86_ops hook to short circuit emulation commit 09e3e2a1cc8d8069085785f1236a64c72707e7f2 upstream Replace the existing kvm_x86_ops.need_emulation_on_page_fault() with a more generic is_emulatable(), and unconditionally call the new function in x86_emulate_instruction(). KVM will use the generic hook to support multiple security related technologies that prevent emulation in one way or another. Similar to the existing AMD #NPF case where emulation of the current instruction is not possible due to lack of information, AMD's SEV-ES and Intel's SGX and TDX will introduce scenarios where emulation is impossible due to the guest's register state being inaccessible. And again similar to the existing #NPF case, emulation can be initiated by kvm_mmu_page_fault(), i.e. outside of the control of vendor-specific code. While the cause and architecturally visible behavior of the various cases are different, e.g. SGX will inject a #UD, AMD #NPF is a clean resume or complete shutdown, and SEV-ES and TDX "return" an error, the impact on the common emulation code is identical: KVM must stop emulation immediately and resume the guest. Query is_emulatable() in handle_ud() as well so that the force_emulation_prefix code doesn't incorrectly modify RIP before calling emulate_instruction() in the absurdly unlikely scenario that KVM encounters forced emulation in conjunction with "do not emulate". 
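(Illustrative sketch, not part of this patch: how a vendor whose guest register state is inaccessible might implement the new hook. The sev_es_guest() check is a placeholder for whatever predicate a future technology provides; this patch itself only adds the hook and wires up the existing SVM/VMX behavior. Returning false makes handle_ud() and x86_emulate_instruction() bail out immediately and resume the guest.)

    static bool example_can_emulate_instruction(struct kvm_vcpu *vcpu,
                                                void *insn, int insn_len)
    {
            /* Guest registers cannot be read, so emulation is impossible. */
            if (sev_es_guest(vcpu->kvm))
                    return false;

            return true;
    }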
Cc: Tom Lendacky Signed-off-by: Sean Christopherson Message-Id: <20200915232702.15945-1-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 12 ------------ arch/x86/kvm/svm.c | 31 ++++++++++++++++++------------- arch/x86/kvm/vmx/vmx.c | 12 ++++++------ arch/x86/kvm/x86.c | 9 ++++++++- 5 files changed, 33 insertions(+), 33 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f62356a9bc48..b90f9047541d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1217,7 +1217,7 @@ struct kvm_x86_ops { uint16_t *vmcs_version); uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu); - bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); + bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, void *insn, int insn_len); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5d097fb46dc9..7a73eb05d616 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5618,18 +5618,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) emulation_type = EMULTYPE_ALLOW_RETRY; emulate: - /* - * On AMD platforms, under certain conditions insn_len may be zero on #NPF. - * This can happen if a guest gets a page-fault on data access but the HW - * table walker is not able to read the instruction page (e.g instruction - * page is not present in memory). In those cases we simply restart the - * guest, with the exception of AMD Erratum 1096 which is unrecoverable. - */ - if (unlikely(insn && !insn_len)) { - if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu)) - return 1; - } - return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f6e536070465..291fcca1fee4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -7176,19 +7176,10 @@ static int svm_unregister_enc_region(struct kvm *kvm, return ret; } -static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) +static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) { - unsigned long cr4 = kvm_read_cr4(vcpu); - bool smep = cr4 & X86_CR4_SMEP; - bool smap = cr4 & X86_CR4_SMAP; - bool is_user = svm_get_cpl(vcpu) == 3; - - /* - * If RIP is invalid, go ahead with emulation which will cause an - * internal error exit. - */ - if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) - return true; + bool smep, smap, is_user; + unsigned long cr4; /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. @@ -7230,6 +7221,20 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) * instruction pointer so we will not able to workaround it. Lets * print the error and request to kill the guest. */ + if (likely(!insn || insn_len)) + return true; + + /* + * If RIP is invalid, go ahead with emulation which will cause an + * internal error exit. 
+ */ + if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) + return true; + + cr4 = kvm_read_cr4(vcpu); + smep = cr4 & X86_CR4_SMEP; + smap = cr4 & X86_CR4_SMAP; + is_user = svm_get_cpl(vcpu) == 3; if (smap && (!smep || is_user)) { if (!sev_guest(vcpu->kvm)) return true; @@ -7389,7 +7394,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .nested_enable_evmcs = NULL, .nested_get_evmcs_version = NULL, - .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, + .can_emulate_instruction = svm_can_emulate_instruction, .apic_init_signal_blocked = svm_apic_init_signal_blocked, }; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 32a27848f08f..8dcef0edab64 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1539,6 +1539,11 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } +static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) +{ + return true; +} + static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip, orig_rip; @@ -7637,11 +7642,6 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) return 0; } -static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) -{ - return false; -} - static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.vmxon; @@ -7971,7 +7971,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .get_vmcs12_pages = NULL, .nested_enable_evmcs = NULL, .nested_get_evmcs_version = NULL, - .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, + .can_emulate_instruction = vmx_can_emulate_instruction, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7207b4b25105..50cdef361eb5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5630,6 +5630,9 @@ int handle_ud(struct kvm_vcpu *vcpu) char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; + if (unlikely(!kvm_x86_ops->can_emulate_instruction(vcpu, NULL, 0))) + return 1; + if (force_emulation_prefix && kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu), sig, sizeof(sig), &e) == 0 && @@ -6777,7 +6780,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int r; struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; bool writeback = true; - bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; + bool write_fault_to_spt; + + if (unlikely(!kvm_x86_ops->can_emulate_instruction(vcpu, insn, insn_len))) + return 1; vcpu->arch.l1tf_flush_l1d = true; @@ -6785,6 +6791,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * Clear write_fault_to_shadow_pgtable here to ensure it is * never reused. */ + write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; vcpu->arch.write_fault_to_shadow_pgtable = false; kvm_clear_exception_queue(vcpu); -- Gitee From 5dd3f08a379660351642636118752d0b7cb8f286 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 26 May 2021 15:16:31 +0800 Subject: [PATCH 0054/2161] KVM: VMX: Convert vcpu_vmx.exit_reason to a union commit 8e53324021645f820a01bf8aa745711c802c8542 upstream Convert vcpu_vmx.exit_reason from a u32 to a union (of size u32). The full VM_EXIT_REASON field is comprised of a 16-bit basic exit reason in bits 15:0, and single-bit modifiers in bits 31:16. Historically, KVM has only had to worry about handling the "failed VM-Entry" modifier, which could only be set in very specific flows and required dedicated handling. I.e. 
manually stripping the FAILED_VMENTRY bit was a somewhat viable approach. But even with only a single bit to worry about, KVM has had several bugs related to comparing a basic exit reason against the full exit reason store in vcpu_vmx. Upcoming Intel features, e.g. SGX, will add new modifier bits that can be set on more or less any VM-Exit, as opposed to the significantly more restricted FAILED_VMENTRY, i.e. correctly handling everything in one-off flows isn't scalable. Tracking exit reason in a union forces code to explicitly choose between consuming the full exit reason and the basic exit, and is a convenient way to document and access the modifiers. No functional change intended. Cc: Xiaoyao Li Signed-off-by: Sean Christopherson Signed-off-by: Chenyi Qiang Message-Id: <20201106090315.18606-2-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/nested.c | 28 +++++++++++++++------- arch/x86/kvm/vmx/nested.h | 9 +++---- arch/x86/kvm/vmx/vmx.c | 49 +++++++++++++++++++++------------------ arch/x86/kvm/vmx/vmx.h | 25 +++++++++++++++++++- 4 files changed, 74 insertions(+), 37 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 3f63bd7421ac..7a853b13131c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3058,7 +3058,10 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); bool evaluate_pending_interrupts; - u32 exit_reason = EXIT_REASON_INVALID_STATE; + union vmx_exit_reason exit_reason = { + .basic = EXIT_REASON_INVALID_STATE, + .failed_vmentry = 1, + }; u32 exit_qual; evaluate_pending_interrupts = exec_controls_get(vmx) & @@ -3118,7 +3121,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, goto vmentry_fail_vmexit_guest_mode; if (from_vmentry) { - exit_reason = EXIT_REASON_MSR_LOAD_FAIL; + exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; exit_qual = nested_vmx_load_msr(vcpu, vmcs12->vm_entry_msr_load_addr, vmcs12->vm_entry_msr_load_count); @@ -3186,7 +3189,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, return NVMX_VMENTRY_VMEXIT; load_vmcs12_host_state(vcpu, vmcs12); - vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; + vmcs12->vm_exit_reason = exit_reason.full; vmcs12->exit_qualification = exit_qual; if (enable_shadow_vmcs || vmx->nested.hv_evmcs) vmx->nested.need_vmcs12_to_shadow_sync = true; @@ -5113,7 +5116,12 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); fail: - nested_vmx_vmexit(vcpu, vmx->exit_reason, + /* + * This is effectively a reflected VM-Exit, as opposed to a synthesized + * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode + * EXIT_REASON_VMFUNC as the exit reason. + */ + nested_vmx_vmexit(vcpu, vmx->exit_reason.full, vmcs_read32(VM_EXIT_INTR_INFO), vmcs_readl(EXIT_QUALIFICATION)); return 1; @@ -5181,7 +5189,8 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 
*/ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, u32 exit_reason) + struct vmcs12 *vmcs12, + union vmx_exit_reason exit_reason) { u32 msr_index = kvm_rcx_read(vcpu); gpa_t bitmap; @@ -5195,7 +5204,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, * First we need to figure out which of the four to use: */ bitmap = vmcs12->msr_bitmap; - if (exit_reason == EXIT_REASON_MSR_WRITE) + if (exit_reason.basic == EXIT_REASON_MSR_WRITE) bitmap += 2048; if (msr_index >= 0xc0000000) { msr_index -= 0xc0000000; @@ -5325,7 +5334,8 @@ static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, * should handle it ourselves in L0 (and then continue L2). Only call this * when in is_guest_mode (L2). */ -bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) +bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, + union vmx_exit_reason exit_reason) { u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5354,14 +5364,14 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) */ nested_mark_vmcs12_pages_dirty(vcpu); - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, + trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason.full, vmcs_readl(EXIT_QUALIFICATION), vmx->idt_vectoring_info, intr_info, vmcs_read32(VM_EXIT_INTR_ERROR_CODE), KVM_ISA_VMX); - switch ((u16)exit_reason) { + switch ((u16)exit_reason.basic) { case EXIT_REASON_EXCEPTION_NMI: if (is_nmi(intr_info)) return false; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index b8521c451bb0..7e960f084c52 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -24,7 +24,8 @@ void nested_vmx_vcpu_setup(void); void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); -bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason); +bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, + union vmx_exit_reason exit_reason); void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, u32 exit_intr_info, unsigned long exit_qualification); void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu); @@ -74,7 +75,7 @@ static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) * Reflect a VM Exit into L1. */ static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, - u32 exit_reason) + union vmx_exit_reason exit_reason) { u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); @@ -83,7 +84,7 @@ static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT * we need to query the in-kernel LAPIC. */ - WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT); + WARN_ON(exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT); if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { @@ -93,7 +94,7 @@ static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, vmcs_read32(VM_EXIT_INTR_ERROR_CODE); } - nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, + nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, vmcs_readl(EXIT_QUALIFICATION)); return 1; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8dcef0edab64..6565a26f7214 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1557,7 +1557,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) * i.e. 
we end up advancing IP with some random value. */ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || - to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) { + to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { orig_rip = kvm_rip_read(vcpu); rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); #ifdef CONFIG_X86_64 @@ -5859,10 +5859,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exit_reason = vmx->exit_reason; + union vmx_exit_reason exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; + u16 exit_handler_index; - trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); + trace_kvm_exit(exit_reason.full, vcpu, KVM_ISA_VMX); /* * Flush logged GPAs PML buffer, this will make dirty_bitmap more @@ -5881,11 +5882,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason)) return nested_vmx_reflect_vmexit(vcpu, exit_reason); - if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { + if (exit_reason.failed_vmentry) { dump_vmcs(); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason - = exit_reason; + = exit_reason.full; return 0; } @@ -5905,18 +5906,18 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, * will cause infinite loop. */ if ((vectoring_info & VECTORING_INFO_VALID_MASK) && - (exit_reason != EXIT_REASON_EXCEPTION_NMI && - exit_reason != EXIT_REASON_EPT_VIOLATION && - exit_reason != EXIT_REASON_PML_FULL && - exit_reason != EXIT_REASON_APIC_ACCESS && - exit_reason != EXIT_REASON_TASK_SWITCH)) { + (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI && + exit_reason.basic != EXIT_REASON_EPT_VIOLATION && + exit_reason.basic != EXIT_REASON_PML_FULL && + exit_reason.basic != EXIT_REASON_APIC_ACCESS && + exit_reason.basic != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; vcpu->run->internal.ndata = 3; vcpu->run->internal.data[0] = vectoring_info; - vcpu->run->internal.data[1] = exit_reason; + vcpu->run->internal.data[1] = exit_reason.full; vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; - if (exit_reason == EXIT_REASON_EPT_MISCONFIG) { + if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) { vcpu->run->internal.ndata++; vcpu->run->internal.data[3] = vmcs_read64(GUEST_PHYSICAL_ADDRESS); @@ -5943,21 +5944,23 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, } } + exit_handler_index = array_index_nospec((u16)exit_reason.basic, + kvm_vmx_max_exit_handlers); if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { skip_emulated_instruction(vcpu); return 1; - } else if (exit_reason < kvm_vmx_max_exit_handlers - && kvm_vmx_exit_handlers[exit_reason]) { - return kvm_vmx_exit_handlers[exit_reason](vcpu); + } else if (exit_reason.basic < kvm_vmx_max_exit_handlers + && kvm_vmx_exit_handlers[exit_handler_index]) { + return kvm_vmx_exit_handlers[exit_handler_index](vcpu); } else { vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", - exit_reason); + exit_reason.full); dump_vmcs(); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; vcpu->run->internal.ndata = 1; - vcpu->run->internal.data[0] = exit_reason; + vcpu->run->internal.data[0] = exit_reason.full; return 0; } } @@ -6284,9 +6287,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if 
(vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu); - else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI) + else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) handle_exception_nmi_irqoff(vmx); } @@ -6628,11 +6631,11 @@ static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->nested.nested_run_pending = 0; vmx->idt_vectoring_info = 0; - vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON); - if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) + vmx->exit_reason.full = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON); + if ((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY) kvm_machine_check(); - if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + if (vmx->fail || (vmx->exit_reason.failed_vmentry)) return EXIT_FASTPATH_NONE; vmx->loaded_vmcs->launched = 1; @@ -6641,7 +6644,7 @@ static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); - if (!is_guest_mode(vcpu) && vmx->exit_reason == EXIT_REASON_MSR_WRITE) + if (!is_guest_mode(vcpu) && vmx->exit_reason.basic == EXIT_REASON_MSR_WRITE) exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); else exit_fastpath = EXIT_FASTPATH_NONE; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a1919ec7fd10..a80e25495f0f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -85,6 +85,29 @@ struct pt_desc { struct pt_ctx guest; }; +union vmx_exit_reason { + struct { + u32 basic : 16; + u32 reserved16 : 1; + u32 reserved17 : 1; + u32 reserved18 : 1; + u32 reserved19 : 1; + u32 reserved20 : 1; + u32 reserved21 : 1; + u32 reserved22 : 1; + u32 reserved23 : 1; + u32 reserved24 : 1; + u32 reserved25 : 1; + u32 reserved26 : 1; + u32 enclave_mode : 1; + u32 smi_pending_mtf : 1; + u32 smi_from_vmx_root : 1; + u32 reserved30 : 1; + u32 failed_vmentry : 1; + }; + u32 full; +}; + /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -245,7 +268,7 @@ struct vcpu_vmx { int vpid; bool emulation_required; - u32 exit_reason; + union vmx_exit_reason exit_reason; /* Posted interrupt descriptor */ struct pi_desc pi_desc; -- Gitee From f46d461d0e81b5306c79bce3fd306e101355f437 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:35 +1200 Subject: [PATCH 0055/2161] KVM: x86: Add support for reverse CPUID lookup of scattered features commit 4e66c0cb79b732b01b82e094b21b8e22a20dff83 upstream Introduce a scheme that allows KVM's CPUID magic to support features that are scattered in the kernel's feature words. To advertise and/or query guest support for CPUID-based features, KVM requires the bit number of an X86_FEATURE_* to match the bit number in its associated CPUID entry. For scattered features, this does not hold true. Add a framework to allow defining KVM-only words, stored in kvm_cpu_caps after the shared kernel caps, that can be used to gather the scattered feature bits by translating X86_FEATURE_* flags into their KVM-defined feature. Note, because reverse_cpuid_check() effectively forces kvm_cpu_caps lookups to be resolved at compile time, there is no runtime cost for translating from kernel-defined to kvm-defined features. 
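(Illustrative sketch, not part of this patch: how a follow-up change would plug a scattered feature into this framework. The CPUID_12_EAX leaf, KVM_X86_FEATURE_SGX1, and the kernel's X86_FEATURE_SGX1 bit are assumptions used purely for illustration; the diff below only introduces the empty KVM-only range and an identity __feature_translate().)

    /* A KVM-only word appended after the kernel's feature words. */
    enum kvm_only_cpuid_leafs {
            CPUID_12_EAX = NCAPINTS,
            NR_KVM_CPU_CAPS,
            NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
    };

    #define KVM_X86_FEATURE_SGX1    X86_KVM_FEATURE(CPUID_12_EAX, 0)

    static __always_inline u32 __feature_translate(int x86_feature)
    {
            /* Redirect the kernel's scattered bit onto the KVM-only word. */
            if (x86_feature == X86_FEATURE_SGX1)
                    return KVM_X86_FEATURE_SGX1;

            return x86_feature;
    }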
More details here: https://lkml.kernel.org/r/X/jxCOLG+HUO4QlZ@google.com Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: <16cad8d00475f67867fb36701fc7fb7c1ec86ce1.1618196135.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 32 +++++++++++++++++++++++++++----- arch/x86/kvm/cpuid.h | 39 ++++++++++++++++++++++++++++++++++----- 2 files changed, 61 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index dc7f0d7fca1d..30d903596e37 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -28,7 +28,7 @@ * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be * aligned to sizeof(unsigned long) because it's not accessed via bitops. */ -u32 kvm_cpu_caps[NCAPINTS] __read_mostly; +u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_GPL(kvm_cpu_caps); static u32 xstate_required_size(u64 xstate_bv, bool compacted) @@ -70,6 +70,7 @@ u64 kvm_supported_xcr0(void) } #define F feature_bit +#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0) int kvm_update_cpuid(struct kvm_vcpu *vcpu) { @@ -279,13 +280,13 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, return r; } -static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) +/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */ +static __always_inline void __kvm_cpu_cap_mask(enum cpuid_leafs leaf) { const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32); struct kvm_cpuid_entry2 entry; reverse_cpuid_check(leaf); - kvm_cpu_caps[leaf] &= mask; cpuid_count(cpuid.function, cpuid.index, &entry.eax, &entry.ebx, &entry.ecx, &entry.edx); @@ -293,6 +294,26 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg); } +static __always_inline void kvm_cpu_cap_init_scattered(enum cpuid_leafs leaf, u32 mask) +{ + /* Use kvm_cpu_cap_mask for non-scattered leafs. */ + BUILD_BUG_ON(leaf < NCAPINTS); + + kvm_cpu_caps[leaf] = mask; + + __kvm_cpu_cap_mask(leaf); +} + +static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) +{ + /* Use kvm_cpu_cap_init_scattered for scattered leafs. */ + BUILD_BUG_ON(leaf >= NCAPINTS); + + kvm_cpu_caps[leaf] &= mask; + + __kvm_cpu_cap_mask(leaf); +} + void kvm_set_cpu_caps(void) { unsigned int f_nx = is_efer_nx() ? F(NX) : 0; @@ -303,12 +324,13 @@ void kvm_set_cpu_caps(void) unsigned int f_gbpages = 0; unsigned int f_lm = 0; #endif + memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps)); - BUILD_BUG_ON(sizeof(kvm_cpu_caps) > + BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) > sizeof(boot_cpu_data.x86_capability)); memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability, - sizeof(kvm_cpu_caps)); + sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps))); kvm_cpu_cap_mask(CPUID_1_ECX, /* diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 9938797b6273..2485944c6888 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -6,7 +6,20 @@ #include #include -extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly; +/* + * Hardware-defined CPUID leafs that are scattered in the kernel, but need to + * be directly used by KVM. Note, these word values conflict with the kernel's + * "bug" caps, but KVM doesn't use those. 
+ */ +enum kvm_only_cpuid_leafs { + NR_KVM_CPU_CAPS = NCAPINTS, + + NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, +}; + +#define X86_KVM_FEATURE(w, f) ((w)*32 + (f)) + +extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; void kvm_set_cpu_caps(void); int kvm_update_cpuid(struct kvm_vcpu *vcpu); @@ -77,6 +90,20 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); } +/* + * Translate feature bits that are scattered in the kernel's cpufeatures word + * into KVM feature words that align with hardware's definitions. + */ +static __always_inline u32 __feature_translate(int x86_feature) +{ + return x86_feature; +} + +static __always_inline u32 __feature_leaf(int x86_feature) +{ + return __feature_translate(x86_feature) / 32; +} + /* * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain * the hardware defined bit number (stored in bits 4:0) and a software defined @@ -85,6 +112,8 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) */ static __always_inline u32 __feature_bit(int x86_feature) { + x86_feature = __feature_translate(x86_feature); + reverse_cpuid_check(x86_feature / 32); return 1 << (x86_feature & 31); } @@ -93,7 +122,7 @@ static __always_inline u32 __feature_bit(int x86_feature) static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature) { - unsigned int x86_leaf = x86_feature / 32; + unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); return reverse_cpuid[x86_leaf]; @@ -287,7 +316,7 @@ static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu) static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) { - unsigned int x86_leaf = x86_feature / 32; + unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature); @@ -295,7 +324,7 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) { - unsigned int x86_leaf = x86_feature / 32; + unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature); @@ -303,7 +332,7 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature) { - unsigned int x86_leaf = x86_feature / 32; + unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature); -- Gitee From f477b941363af83c928fd657427c7bb7637de517 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 1 Nov 2019 15:38:50 +0300 Subject: [PATCH 0056/2161] x86/fpu: Update stale variable name in comment commit 58db103784994e9be5322237df7ef0cf4c0afc39 upstream. When the fpu code was reworked pcntxt_mask was renamed to xfeatures_mask. Reflect it in the comment as well. 
Signed-off-by: Cyrill Gorcunov Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20191101123850.GE1615@uranus.lan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 735d1f1bbabc..08b2a2aee8e0 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -840,7 +840,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) /* * We should not ever be requesting features that we - * have not enabled. Remember that pcntxt_mask is + * have not enabled. Remember that xfeatures_mask is * what we write to the XCR0 register. */ WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)), -- Gitee From 59da2f0074f97c1d309f336c1b199d77b8238fb6 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 1 Nov 2019 15:42:28 +0300 Subject: [PATCH 0057/2161] x86/fpu: Shrink space allocated for xstate_comp_offsets commit c08550510ca26bd57eabfe912281635e382193e5 upstream. commit 8ff925e10f2c ("x86/xsaves: Clean up code in xstate offsets computation in xsave area") introduced an allocation of 64 entries for xstate_comp_offsets while the code only handles up to XFEATURE_MAX entries. For this reason xstate_offsets and xstate_sizes are already defined with the explicit XFEATURE_MAX limit. Do the same for compressed format for consistency sake. As the changelog of that commit is not giving any information it's assumed that the main idea was to cover all possible bits in xfeatures_mask, but this doesn't explain why other variables such as the non-compacted offsets and sizes are explicitely limited to XFEATURE_MAX. For consistency it's better to use the XFEATURE_MAX limit everywhere and extend it on demand when new features get implemented at the hardware level and subsequently supported by the kernel. Signed-off-by: Cyrill Gorcunov Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20191101124228.GF1615@uranus.lan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 08b2a2aee8e0..4ea2db018a6c 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -60,7 +60,7 @@ u64 xfeatures_mask __read_mostly; static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; +static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; /* * The XSAVE area of kernel can be in standard or compacted format; @@ -342,7 +342,7 @@ static int xfeature_is_aligned(int xfeature_nr) */ static void __init setup_xstate_comp(void) { - unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8]; + unsigned int xstate_comp_sizes[XFEATURE_MAX]; int i; /* -- Gitee From 28a652c76790d079481671cc963c95d4535ea48d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 1 Nov 2019 16:01:53 +0300 Subject: [PATCH 0058/2161] x86/fpu: Use XFEATURE_FP/SSE enum values instead of hardcoded numbers commit 446e693ca30b7c7c2aaeaf09e90ec224c7538fec upstream. When setting up sizes and offsets for legacy header entries the code uses hardcoded 0/1 instead of the corresponding enum values XFEATURE_FP and XFEATURE_SSE. 
Replace the hardcoded numbers which enhances readability of the code and also makes this code the first user of those enum values.. Signed-off-by: Cyrill Gorcunov Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20191101130153.GG1615@uranus.lan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 4ea2db018a6c..7593bb309f78 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -254,10 +254,13 @@ static void __init setup_xstate_features(void) * in the fixed offsets in the xsave area in either compacted form * or standard form. */ - xstate_offsets[0] = 0; - xstate_sizes[0] = offsetof(struct fxregs_state, xmm_space); - xstate_offsets[1] = xstate_sizes[0]; - xstate_sizes[1] = FIELD_SIZEOF(struct fxregs_state, xmm_space); + xstate_offsets[XFEATURE_FP] = 0; + xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, + xmm_space); + + xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; + xstate_sizes[XFEATURE_SSE] = FIELD_SIZEOF(struct fxregs_state, + xmm_space); for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if (!xfeature_enabled(i)) @@ -350,8 +353,9 @@ static void __init setup_xstate_comp(void) * in the fixed offsets in the xsave area in either compacted form * or standard form. */ - xstate_comp_offsets[0] = 0; - xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space); + xstate_comp_offsets[XFEATURE_FP] = 0; + xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state, + xmm_space); if (!boot_cpu_has(X86_FEATURE_XSAVES)) { for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { -- Gitee From 660e3516def764a2d36df99f1c1b0b70eb3526e3 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 26 Oct 2021 16:36:25 +0800 Subject: [PATCH 0059/2161] treewide: Use sizeof_field() macro commit c593642c8be046915ca3a4a300243a68077cd207 upstream. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 7593bb309f78..4399af6d445a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -259,7 +259,7 @@ static void __init setup_xstate_features(void) xmm_space); xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; - xstate_sizes[XFEATURE_SSE] = FIELD_SIZEOF(struct fxregs_state, + xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { -- Gitee From 14d13be73746d3dbf6ce42eadcc1172ade9ce6b0 Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Thu, 12 Dec 2019 13:08:53 -0800 Subject: [PATCH 0060/2161] x86/fpu/xstate: Fix small issues commit 8c9e607376968865456b33d9a2efdee2c7e1030d upstream. In response to earlier comments, fix small issues before introducing XSAVES supervisor states: - Fix comments of xfeature_is_supervisor(). - Replace ((u64)1 << 63) with XCOMP_BV_COMPACTED_FORMAT. No functional changes. Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Tony Luck Acked-by: Sebastian Andrzej Siewior Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "Ravi V. 
Shankar" Cc: Rik van Riel Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20191212210855.19260-2-yu-cheng.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 4399af6d445a..b21fceb66391 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -110,12 +110,9 @@ EXPORT_SYMBOL_GPL(cpu_has_xfeatures); static int xfeature_is_supervisor(int xfeature_nr) { /* - * We currently do not support supervisor states, but if - * we did, we could find out like this. - * - * SDM says: If state component 'i' is a user state component, - * ECX[0] return 0; if state component i is a supervisor - * state component, ECX[0] returns 1. + * Extended State Enumeration Sub-leaves (EAX = 0DH, ECX = n, n > 1) + * returns ECX[0] set to (1) for a supervisor state, and cleared (0) + * for a user state. */ u32 eax, ebx, ecx, edx; @@ -419,7 +416,8 @@ static void __init setup_init_fpu_buf(void) print_xstate_features(); if (boot_cpu_has(X86_FEATURE_XSAVES)) - init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; + init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | + xfeatures_mask; /* * Init all the features state with header.xfeatures being 0x0 -- Gitee From 4425a0451365a0fd9a8e82417d6c1aa743e338d1 Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Thu, 12 Dec 2019 13:08:54 -0800 Subject: [PATCH 0061/2161] x86/fpu/xstate: Make xfeature_is_supervisor()/xfeature_is_user() return bool commit 158e2ee61f22b878d61de92bea5aad3d2df1c146 upstream. Have both xfeature_is_supervisor()/xfeature_is_user() return bool because they are used only in boolean context. Suggested-by: Borislav Petkov Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Acked-by: Sebastian Andrzej Siewior Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "Ravi V. Shankar" Cc: Rik van Riel Cc: Thomas Gleixner Cc: Tony Luck Cc: x86-ml Link: https://lkml.kernel.org/r/20191212210855.19260-3-yu-cheng.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b21fceb66391..7595fc47f110 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -107,7 +107,7 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) } EXPORT_SYMBOL_GPL(cpu_has_xfeatures); -static int xfeature_is_supervisor(int xfeature_nr) +static bool xfeature_is_supervisor(int xfeature_nr) { /* * Extended State Enumeration Sub-leaves (EAX = 0DH, ECX = n, n > 1) @@ -117,10 +117,10 @@ static int xfeature_is_supervisor(int xfeature_nr) u32 eax, ebx, ecx, edx; cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); - return !!(ecx & 1); + return ecx & 1; } -static int xfeature_is_user(int xfeature_nr) +static bool xfeature_is_user(int xfeature_nr) { return !xfeature_is_supervisor(xfeature_nr); } -- Gitee From aff35934737c2d80270278484c51f4114bbd7b69 Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Thu, 9 Jan 2020 13:14:50 -0800 Subject: [PATCH 0062/2161] x86/fpu/xstate: Fix last_good_offset in setup_xstate_features() commit c12e13dcd814023a903399ec5ac2e7082d664b8b upstream. The function setup_xstate_features() uses CPUID to find each xfeature's standard-format offset and size. 
Since XSAVES always uses the compacted format, supervisor xstates are *NEVER* in the standard-format and their offsets are left as -1's. However, they are still being tracked as last_good_offset. Fix it by tracking only user xstate offsets. [ bp: Use xfeature_is_supervisor() and save an indentation level. Drop now unused xfeature_is_user(). ] Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20200109211452.27369-2-yu-cheng.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 7595fc47f110..b21c3d766a35 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -120,11 +120,6 @@ static bool xfeature_is_supervisor(int xfeature_nr) return ecx & 1; } -static bool xfeature_is_user(int xfeature_nr) -{ - return !xfeature_is_supervisor(xfeature_nr); -} - /* * When executing XSAVEOPT (or other optimized XSAVE instructions), if * a processor implementation detects that an FPU state component is still @@ -265,21 +260,25 @@ static void __init setup_xstate_features(void) cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + xstate_sizes[i] = eax; + /* - * If an xfeature is supervisor state, the offset - * in EBX is invalid. We leave it to -1. + * If an xfeature is supervisor state, the offset in EBX is + * invalid, leave it to -1. */ - if (xfeature_is_user(i)) - xstate_offsets[i] = ebx; + if (xfeature_is_supervisor(i)) + continue; + + xstate_offsets[i] = ebx; - xstate_sizes[i] = eax; /* - * In our xstate size checks, we assume that the - * highest-numbered xstate feature has the - * highest offset in the buffer. Ensure it does. + * In our xstate size checks, we assume that the highest-numbered + * xstate feature has the highest offset in the buffer. Ensure + * it does. */ WARN_ONCE(last_good_offset > xstate_offsets[i], - "x86/fpu: misordered xstate at %d\n", last_good_offset); + "x86/fpu: misordered xstate at %d\n", last_good_offset); + last_good_offset = xstate_offsets[i]; } } -- Gitee From 4e528c80ea9e7ced6fba31a028d939d593fc3be9 Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Thu, 9 Jan 2020 13:14:51 -0800 Subject: [PATCH 0063/2161] x86/fpu/xstate: Fix XSAVES offsets in setup_xstate_comp() commit 49a91d61aed1db01097b51a24c77137eb348a0bf upstream. In setup_xstate_comp(), each XSAVES component offset starts from the end of its preceding component plus alignment. A disabled feature does not take space and its offset should be set to the end of its preceding one with no alignment. However, in this case, alignment is incorrectly added to the offset, which can cause the next component to have a wrong offset. This problem has not been visible because currently there is no xfeature requiring alignment. Fix it by tracking the next starting offset only from enabled xfeatures. To make it clear, also change the function name to setup_xstate_comp_offsets(). [ bp: Fix a typo in the comment above it, while at it. 
] Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20200109211452.27369-3-yu-cheng.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b21c3d766a35..74aa5d5fe57b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -337,11 +337,11 @@ static int xfeature_is_aligned(int xfeature_nr) /* * This function sets up offsets and sizes of all extended states in * xsave area. This supports both standard format and compacted format - * of the xsave aread. + * of the xsave area. */ -static void __init setup_xstate_comp(void) +static void __init setup_xstate_comp_offsets(void) { - unsigned int xstate_comp_sizes[XFEATURE_MAX]; + unsigned int next_offset; int i; /* @@ -355,31 +355,23 @@ static void __init setup_xstate_comp(void) if (!boot_cpu_has(X86_FEATURE_XSAVES)) { for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (xfeature_enabled(i)) { + if (xfeature_enabled(i)) xstate_comp_offsets[i] = xstate_offsets[i]; - xstate_comp_sizes[i] = xstate_sizes[i]; - } } return; } - xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] = - FXSAVE_SIZE + XSAVE_HDR_SIZE; + next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (xfeature_enabled(i)) - xstate_comp_sizes[i] = xstate_sizes[i]; - else - xstate_comp_sizes[i] = 0; + if (!xfeature_enabled(i)) + continue; - if (i > FIRST_EXTENDED_XFEATURE) { - xstate_comp_offsets[i] = xstate_comp_offsets[i-1] - + xstate_comp_sizes[i-1]; + if (xfeature_is_aligned(i)) + next_offset = ALIGN(next_offset, 64); - if (xfeature_is_aligned(i)) - xstate_comp_offsets[i] = - ALIGN(xstate_comp_offsets[i], 64); - } + xstate_comp_offsets[i] = next_offset; + next_offset += xstate_sizes[i]; } } @@ -773,7 +765,7 @@ void __init fpu__init_system_xstate(void) fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); - setup_xstate_comp(); + setup_xstate_comp_offsets(); print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", -- Gitee From 746c7017e44553ecbcd4330a2488b10aa163e8f3 Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Thu, 9 Jan 2020 13:14:52 -0800 Subject: [PATCH 0064/2161] x86/fpu/xstate: Warn when checking alignment of disabled xfeatures commit e70b100806d63fb79775858ea92e1a716da46186 upstream. An XSAVES component's alignment/offset is meaningful only when the feature is enabled. Return zero and WARN_ONCE on checking alignment of disabled features. 
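As a side note for readers following the two offset patches above, the layout rule they implement can be modelled in a few lines of standalone C (an illustration only, not the kernel code: the component names, sizes, and alignment flags below are invented): only enabled components consume space in the compacted buffer, and the 64-byte alignment question is only ever asked of enabled components.

#include <stdio.h>

#define FXSAVE_SIZE     512
#define XSAVE_HDR_SIZE  64
#define NR_FEATURES     5

struct feature {
	const char *name;
	unsigned int size;
	int enabled;
	int aligned;	/* models CPUID.(EAX=0DH,ECX=n):ECX[1] */
};

int main(void)
{
	/* Hypothetical extended components; sizes are illustrative. */
	struct feature f[NR_FEATURES] = {
		{ "YMM",       256, 1, 0 },
		{ "BNDREGS",    64, 0, 0 },	/* disabled: takes no space */
		{ "BNDCSR",     64, 0, 0 },	/* disabled: takes no space */
		{ "OPMASK",     64, 1, 1 },	/* enabled and 64-byte aligned */
		{ "ZMM_Hi256", 512, 1, 0 },
	};
	unsigned int next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
	int i;

	for (i = 0; i < NR_FEATURES; i++) {
		if (!f[i].enabled)
			continue;		/* disabled features get no offset */
		if (f[i].aligned)
			next_offset = (next_offset + 63) & ~63u;
		printf("%-10s offset %4u size %4u\n",
		       f[i].name, next_offset, f[i].size);
		next_offset += f[i].size;
	}
	printf("total compacted size: %u\n", next_offset);
	return 0;
}

With these made-up inputs the disabled BNDREGS/BNDCSR entries contribute nothing, and OPMASK is the only component for which the alignment check is even consulted.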
Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20200109211452.27369-4-yu-cheng.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 74aa5d5fe57b..9cf40a7ff7ae 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -325,6 +325,13 @@ static int xfeature_is_aligned(int xfeature_nr) u32 eax, ebx, ecx, edx; CHECK_XFEATURE(xfeature_nr); + + if (!xfeature_enabled(xfeature_nr)) { + WARN_ONCE(1, "Checking alignment of disabled xfeature %d\n", + xfeature_nr); + return 0; + } + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); /* * The value returned by ECX[1] indicates the alignment -- Gitee From 314ac5496810fb00c8971f7b95874564de96aa03 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 12 May 2020 07:54:35 -0700 Subject: [PATCH 0065/2161] x86/fpu/xstate: Rename validate_xstate_header() to validate_user_xstate_header() commit 5274e6c172c47241534e970df26a522497086624 upstream. The function validate_xstate_header() validates an xstate header coming from userspace (PTRACE or sigreturn). To make it clear, rename it to validate_user_xstate_header(). Suggested-by: Dave Hansen Signed-off-by: Fenghua Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Tony Luck Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200512145444.15483-2-yu-cheng.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 2 +- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/fpu/xstate.c | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index c6136d79f8c0..fc4db51f3b53 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -56,6 +56,6 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -extern int validate_xstate_header(const struct xstate_header *hdr); +int validate_user_xstate_header(const struct xstate_header *hdr); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index d652b939ccfb..bd1d0649f8ce 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -139,7 +139,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); if (!ret) - ret = validate_xstate_header(&xsave->header); + ret = validate_user_xstate_header(&xsave->header); } /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 400a05e1c1c5..585e3651b98f 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -366,7 +366,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); if (!ret && state_size > offsetof(struct xregs_state, header)) - ret = validate_xstate_header(&fpu->state.xsave.header); + ret = validate_user_xstate_header(&fpu->state.xsave.header); } if (ret) goto err_out; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 
9cf40a7ff7ae..7e32693a8df4 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -472,7 +472,7 @@ int using_compacted_format(void) } /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -int validate_xstate_header(const struct xstate_header *hdr) +int validate_user_xstate_header(const struct xstate_header *hdr) { /* No unknown or supervisor features may be set */ if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR)) @@ -1157,7 +1157,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) memcpy(&hdr, kbuf + offset, size); - if (validate_xstate_header(&hdr)) + if (validate_user_xstate_header(&hdr)) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { @@ -1211,7 +1211,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) if (__copy_from_user(&hdr, ubuf + offset, size)) return -EFAULT; - if (validate_xstate_header(&hdr)) + if (validate_user_xstate_header(&hdr)) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { -- Gitee From 1ed1df0fbcd20e8520ca8188fd4d122313cf0b38 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Jun 2021 16:18:25 +0200 Subject: [PATCH 0066/2161] x86/fpu: Make init_fpstate correct with optimized XSAVE commit f9dfb5e390fab2df9f7944bb91e7705aba14cd26 upstream. The XSAVE init code initializes all enabled and supported components with XRSTOR(S) to init state. Then it XSAVEs the state of the components back into init_fpstate which is used in several places to fill in the init state of components. This works correctly with XSAVE, but not with XSAVEOPT and XSAVES because those use the init optimization and skip writing state of components which are in init state. So init_fpstate.xsave still contains all zeroes after this operation. There are two ways to solve that: 1) Use XSAVE unconditionally, but that requires to reshuffle the buffer when XSAVES is enabled because XSAVES uses compacted format. 2) Save the components which are known to have a non-zero init state by other means. Looking deeper, #2 is the right thing to do because all components the kernel supports have all-zeroes init state except the legacy features (FP, SSE). Those cannot be hard coded because the states are not identical on all CPUs, but they can be saved with FXSAVE which avoids all conditionals. Use FXSAVE to save the legacy FP/SSE components in init_fpstate along with a BUILD_BUG_ON() which reminds developers to validate that a newly added component has all zeroes init state. As a bonus remove the now unused copy_xregs_to_kernel_booting() crutch. The XSAVE and reshuffle method can still be implemented in the unlikely case that components are added which have a non-zero init state and no other means to save them. For now, FXSAVE is just simple and good enough. [ bp: Fix a typo or two in the text. 
] Fixes: 6bad06b76892 ("x86, xsave: Use xsaveopt in context-switch path when supported") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210618143444.587311343@linutronix.de Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 30 ++++++----------------- arch/x86/kernel/fpu/xstate.c | 38 ++++++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 00eac7f1529b..d6c2cb549fed 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -203,6 +203,14 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave)); } +static inline void fxsave(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); + else + asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); +} + /* These macros all use (%edi)/(%rdi) as the single memory argument. */ #define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" #define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" @@ -267,28 +275,6 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) -{ - u64 mask = -1; - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - XSTATE_OP(XSAVES, xstate, lmask, hmask, err); - else - XSTATE_OP(XSAVE, xstate, lmask, hmask, err); - - /* We should never fault when copying to a kernel buffer: */ - WARN_ON_FPU(err); -} - /* * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 7e32693a8df4..4ecdc2d48893 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -397,6 +397,24 @@ static void __init print_xstate_offset_size(void) } } +/* + * All supported features have either init state all zeros or are + * handled in setup_init_fpu() individually. This is an explicit + * feature list and does not use XFEATURE_MASK*SUPPORTED to catch + * newly added supported features at build time and make people + * actually look at the init state for the new feature. + */ +#define XFEATURES_INIT_FPSTATE_HANDLED \ + (XFEATURE_MASK_FP | \ + XFEATURE_MASK_SSE | \ + XFEATURE_MASK_YMM | \ + XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_PKRU | \ + XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR) + /* * setup the xstate image representing the init state */ @@ -404,6 +422,8 @@ static void __init setup_init_fpu_buf(void) { static int on_boot_cpu __initdata = 1; + BUILD_BUG_ON(XCNTXT_MASK != XFEATURES_INIT_FPSTATE_HANDLED); + WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -423,10 +443,22 @@ static void __init setup_init_fpu_buf(void) copy_kernel_to_xregs_booting(&init_fpstate.xsave); /* - * Dump the init state again. This is to identify the init state - * of any feature which is not represented by all zero's. + * All components are now in init state. 
Read the state back so + * that init_fpstate contains all non-zero init state. This only + * works with XSAVE, but not with XSAVEOPT and XSAVES because + * those use the init optimization which skips writing data for + * components in init state. + * + * XSAVE could be used, but that would require to reshuffle the + * data when XSAVES is available because XSAVES uses xstate + * compaction. But doing so is a pointless exercise because most + * components have an all zeros init state except for the legacy + * ones (FP and SSE). Those can be saved with FXSAVE into the + * legacy area. Adding new features requires to ensure that init + * state is all zeroes or if not to add the necessary handling + * here. */ - copy_xregs_to_kernel_booting(&init_fpstate.xsave); + fxsave(&init_fpstate.fxsave); } static int xfeature_uncompacted_offset(int xfeature_nr) -- Gitee From 32e1510f8975e1a6ca929b593bf240388b8e3394 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 12 May 2020 07:54:36 -0700 Subject: [PATCH 0067/2161] x86/fpu/xstate: Define new macros for supervisor and user xstates commit 8ab22804efefea9ecf3c68aa00f1fa69c70fcfad upstream. XCNTXT_MASK is 'all supported xfeatures' before introducing supervisor xstates. Rename it to XFEATURE_MASK_USER_SUPPORTED to make clear that these are user xstates. Replace XFEATURE_MASK_SUPERVISOR with the following: - XFEATURE_MASK_SUPERVISOR_SUPPORTED: Currently nothing. ENQCMD and Control-flow Enforcement Technology (CET) will be introduced in separate series. - XFEATURE_MASK_SUPERVISOR_UNSUPPORTED: Currently only Processor Trace. - XFEATURE_MASK_SUPERVISOR_ALL: the combination of above. Co-developed-by: Yu-cheng Yu Signed-off-by: Fenghua Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20200512145444.15483-3-yu-cheng.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 36 ++++++++++++++++++++----------- arch/x86/kernel/fpu/init.c | 3 ++- arch/x86/kernel/fpu/xstate.c | 28 ++++++++++++------------ 3 files changed, 39 insertions(+), 28 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index fc4db51f3b53..b08fa823425f 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -21,19 +21,29 @@ #define XSAVE_YMM_SIZE 256 #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) -/* Supervisor features */ -#define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT) - -/* All currently supported features */ -#define XCNTXT_MASK (XFEATURE_MASK_FP | \ - XFEATURE_MASK_SSE | \ - XFEATURE_MASK_YMM | \ - XFEATURE_MASK_OPMASK | \ - XFEATURE_MASK_ZMM_Hi256 | \ - XFEATURE_MASK_Hi16_ZMM | \ - XFEATURE_MASK_PKRU | \ - XFEATURE_MASK_BNDREGS | \ - XFEATURE_MASK_BNDCSR) +/* All currently supported user features */ +#define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \ + XFEATURE_MASK_SSE | \ + XFEATURE_MASK_YMM | \ + XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_PKRU | \ + XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR) + +/* All currently supported supervisor features */ +#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (0) + +/* + * Unsupported supervisor features. When a supervisor feature in this mask is + * supported in the future, move it to the supported supervisor feature mask. 
+ */ +#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT) + +/* All supervisor states including supported and unsupported states. */ +#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ + XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) #ifdef CONFIG_X86_64 #define REX_PREFIX "0x48, " diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index b271da0fa219..f8ff895aaf7e 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -224,7 +224,8 @@ static void __init fpu__init_system_xstate_size_legacy(void) */ u64 __init fpu__get_supported_xfeatures_mask(void) { - return XCNTXT_MASK; + return XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED; } /* Legacy code to initialize eager fpu mode. */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 4ecdc2d48893..2f9f5ed5d2b5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -208,14 +208,13 @@ void fpu__init_cpu_xstate(void) if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask) return; /* - * Make it clear that XSAVES supervisor states are not yet - * implemented should anyone expect it to work by changing - * bits in XFEATURE_MASK_* macros and XCR0. + * Unsupported supervisor xstates should not be found in + * the xfeatures mask. */ - WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR), - "x86/fpu: XSAVES supervisor states are not yet implemented.\n"); + WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR_UNSUPPORTED), + "x86/fpu: Found unsupported supervisor xstates.\n"); - xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR; + xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; cr4_set_bits(X86_CR4_OSXSAVE); xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); @@ -422,7 +421,7 @@ static void __init setup_init_fpu_buf(void) { static int on_boot_cpu __initdata = 1; - BUILD_BUG_ON(XCNTXT_MASK != XFEATURES_INIT_FPSTATE_HANDLED); + BUILD_BUG_ON(XFEATURE_MASK_USER_SUPPORTED != XFEATURES_INIT_FPSTATE_HANDLED); WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -470,7 +469,7 @@ static int xfeature_uncompacted_offset(int xfeature_nr) * format. Checking a supervisor state's uncompacted offset is * an error. 
*/ - if (XFEATURE_MASK_SUPERVISOR & BIT_ULL(xfeature_nr)) { + if (XFEATURE_MASK_SUPERVISOR_ALL & BIT_ULL(xfeature_nr)) { WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr); return -1; } @@ -507,7 +506,7 @@ int using_compacted_format(void) int validate_user_xstate_header(const struct xstate_header *hdr) { /* No unknown or supervisor features may be set */ - if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR)) + if (hdr->xfeatures & ~(xfeatures_mask & XFEATURE_MASK_USER_SUPPORTED)) return -EINVAL; /* Userspace must use the uncompacted format */ @@ -800,7 +799,8 @@ void __init fpu__init_system_xstate(void) * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ - update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR); + update_regset_xstate_info(fpu_user_xstate_size, + xfeatures_mask & XFEATURE_MASK_USER_SUPPORTED); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); @@ -1042,7 +1042,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of */ memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + header.xfeatures &= XFEATURE_MASK_USER_SUPPORTED; if (header.xfeatures & XFEATURE_MASK_FP) copy_part(0, off_mxcsr, @@ -1122,7 +1122,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i */ memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + header.xfeatures &= XFEATURE_MASK_USER_SUPPORTED; /* * Copy xregs_state->header: @@ -1215,7 +1215,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': */ - xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; /* * Add back in the features that came in from userspace: @@ -1271,7 +1271,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': */ - xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; /* * Add back in the features that came in from userspace: -- Gitee From f95c9825a47bfec55fa64e847702a8896ed4f606 Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Tue, 12 May 2020 07:54:37 -0700 Subject: [PATCH 0068/2161] x86/fpu/xstate: Separate user and supervisor xfeatures mask commit 524bb73bc15c56f5587e33c817e103a259b019d2 upstream. Before the introduction of XSAVES supervisor states, 'xfeatures_mask' is used at various places to determine XSAVE buffer components and XCR0 bits. It contains only user xstates. To support supervisor xstates, it is necessary to separate user and supervisor xstates: - First, change 'xfeatures_mask' to 'xfeatures_mask_all', which represents the full set of bits that should ever be set in a kernel XSAVE buffer. - Introduce xfeatures_mask_supervisor() and xfeatures_mask_user() to extract relevant xfeatures from xfeatures_mask_all. 
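The split described above is easy to picture with a small standalone sketch (not kernel code; the detected feature set and the choice of PASID as the lone supervisor example are assumptions for illustration): one combined mask is filtered through two constant masks, the user half destined for XCR0 and the supervisor half for IA32_XSS.

#include <stdio.h>
#include <stdint.h>

#define BIT_ULL(n)  (1ULL << (n))

#define MASK_FP     BIT_ULL(0)
#define MASK_SSE    BIT_ULL(1)
#define MASK_YMM    BIT_ULL(2)
#define MASK_PKRU   BIT_ULL(9)
#define MASK_PASID  BIT_ULL(10)		/* example supervisor state */

#define MASK_USER_SUPPORTED        (MASK_FP | MASK_SSE | MASK_YMM | MASK_PKRU)
#define MASK_SUPERVISOR_SUPPORTED  (MASK_PASID)

/* Full set of bits that may appear in a kernel XSAVE buffer. */
static uint64_t xfeatures_mask_all;

static uint64_t mask_user(void)
{
	return xfeatures_mask_all & MASK_USER_SUPPORTED;
}

static uint64_t mask_supervisor(void)
{
	return xfeatures_mask_all & MASK_SUPERVISOR_SUPPORTED;
}

int main(void)
{
	/* Pretend enumeration reported all five features. */
	xfeatures_mask_all = MASK_FP | MASK_SSE | MASK_YMM | MASK_PKRU | MASK_PASID;

	printf("XCR0     <- %#llx\n", (unsigned long long)mask_user());
	printf("IA32_XSS <- %#llx\n", (unsigned long long)mask_supervisor());
	return 0;
}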
Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20200512145444.15483-4-yu-cheng.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/include/asm/fpu/xstate.h | 13 ++++- arch/x86/kernel/fpu/signal.c | 16 +++++-- arch/x86/kernel/fpu/xstate.c | 73 +++++++++++++++++------------ 4 files changed, 67 insertions(+), 37 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index d6c2cb549fed..744e0f385604 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -92,7 +92,7 @@ static inline void fpstate_init_xstate(struct xregs_state *xsave) * XRSTORS requires these bits set in xcomp_bv, or it will * trigger #GP: */ - xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask; + xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all; } static inline void fpstate_init_fxstate(struct fxregs_state *fx) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index b08fa823425f..92104b298d77 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -51,7 +51,18 @@ #define REX_PREFIX #endif -extern u64 xfeatures_mask; +extern u64 xfeatures_mask_all; + +static inline u64 xfeatures_mask_supervisor(void) +{ + return xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_SUPPORTED; +} + +static inline u64 xfeatures_mask_user(void) +{ + return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED; +} + extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 585e3651b98f..3df0cfae535f 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -252,13 +252,17 @@ sanitize_restored_xstate(union fpregs_state *state, */ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) { + u64 init_bv; + if (use_xsave()) { if (fx_only) { - u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; + init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); return copy_user_to_fxregs(buf); } else { - u64 init_bv = xfeatures_mask & ~xbv; + init_bv = xfeatures_mask_user() & ~xbv; + if (unlikely(init_bv)) copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); return copy_user_to_xregs(buf, xbv); @@ -358,7 +362,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (use_xsave() && !fx_only) { - u64 init_bv = xfeatures_mask & ~xfeatures; + u64 init_bv = xfeatures_mask_user() & ~xfeatures; if (using_compacted_format()) { ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); @@ -389,7 +393,9 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpregs_lock(); if (use_xsave()) { - u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; + u64 init_bv; + + init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); } @@ -465,7 +471,7 @@ void fpu__init_prepare_fx_sw_frame(void) fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; - fx_sw_reserved.xfeatures = xfeatures_mask; + fx_sw_reserved.xfeatures = xfeatures_mask_user(); fx_sw_reserved.xstate_size = fpu_user_xstate_size; if (IS_ENABLED(CONFIG_IA32_EMULATION) || diff --git 
a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 2f9f5ed5d2b5..916753d083a0 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -54,9 +54,10 @@ static short xsave_cpuid_features[] __initdata = { }; /* - * Mask of xstate features supported by the CPU and the kernel: + * This represents the full set of bits that should ever be set in a kernel + * XSAVE buffer, both supervisor and user xstates. */ -u64 xfeatures_mask __read_mostly; +u64 xfeatures_mask_all __read_mostly; static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; @@ -76,7 +77,7 @@ unsigned int fpu_user_xstate_size; */ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) { - u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask; + u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_all; if (unlikely(feature_name)) { long xfeature_idx, max_idx; @@ -150,7 +151,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) * None of the feature bits are in init state. So nothing else * to do for us, as the memory layout is up to date. */ - if ((xfeatures & xfeatures_mask) == xfeatures_mask) + if ((xfeatures & xfeatures_mask_all) == xfeatures_mask_all) return; /* @@ -177,7 +178,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) * in a special way already: */ feature_bit = 0x2; - xfeatures = (xfeatures_mask & ~xfeatures) >> 2; + xfeatures = (xfeatures_mask_user() & ~xfeatures) >> 2; /* * Update all the remaining memory layouts according to their @@ -205,19 +206,28 @@ void fpstate_sanitize_xstate(struct fpu *fpu) */ void fpu__init_cpu_xstate(void) { - if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask) + u64 unsup_bits; + + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all) return; /* * Unsupported supervisor xstates should not be found in * the xfeatures mask. */ - WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR_UNSUPPORTED), - "x86/fpu: Found unsupported supervisor xstates.\n"); + unsup_bits = xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; + WARN_ONCE(unsup_bits, "x86/fpu: Found unsupported supervisor xstates: 0x%llx\n", + unsup_bits); - xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; + xfeatures_mask_all &= ~XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; cr4_set_bits(X86_CR4_OSXSAVE); - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); + + /* + * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features + * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user + * states can be set here. + */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); } /* @@ -225,9 +235,9 @@ void fpu__init_cpu_xstate(void) * functions here: one for user xstates and the other for * system xstates. For now, they are the same. 
*/ -static int xfeature_enabled(enum xfeature xfeature) +static bool xfeature_enabled(enum xfeature xfeature) { - return !!(xfeatures_mask & (1UL << xfeature)); + return xfeatures_mask_all & BIT_ULL(xfeature); } /* @@ -434,7 +444,7 @@ static void __init setup_init_fpu_buf(void) if (boot_cpu_has(X86_FEATURE_XSAVES)) init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | - xfeatures_mask; + xfeatures_mask_all; /* * Init all the features state with header.xfeatures being 0x0 @@ -506,7 +516,7 @@ int using_compacted_format(void) int validate_user_xstate_header(const struct xstate_header *hdr) { /* No unknown or supervisor features may be set */ - if (hdr->xfeatures & ~(xfeatures_mask & XFEATURE_MASK_USER_SUPPORTED)) + if (hdr->xfeatures & ~xfeatures_mask_user()) return -EINVAL; /* Userspace must use the uncompacted format */ @@ -641,7 +651,7 @@ static void do_extra_xstate_size_checks(void) /* - * Get total size of enabled xstates in XCR0/xfeatures_mask. + * Get total size of enabled xstates in XCR0 | IA32_XSS. * * Note the SDM's wording here. "sub-function 0" only enumerates * the size of the *user* states. If we use it to size a buffer @@ -731,7 +741,7 @@ static int __init init_xstate_size(void) */ static void fpu__init_disable_system_xstate(void) { - xfeatures_mask = 0; + xfeatures_mask_all = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); } @@ -766,16 +776,21 @@ void __init fpu__init_system_xstate(void) return; } + /* + * Find user xstates supported by the processor. + */ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xfeatures_mask = eax + ((u64)edx << 32); + xfeatures_mask_all = eax + ((u64)edx << 32); - if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + /* Place supervisor features in xfeatures_mask_all here */ + if ((xfeatures_mask_user() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue * booting without it. This is too early to BUG(). */ - pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask); + pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", + xfeatures_mask_all); goto out_disable; } @@ -784,10 +799,10 @@ void __init fpu__init_system_xstate(void) */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { if (!boot_cpu_has(xsave_cpuid_features[i])) - xfeatures_mask &= ~BIT(i); + xfeatures_mask_all &= ~BIT_ULL(i); } - xfeatures_mask &= fpu__get_supported_xfeatures_mask(); + xfeatures_mask_all &= fpu__get_supported_xfeatures_mask(); /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); @@ -799,8 +814,7 @@ void __init fpu__init_system_xstate(void) * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ - update_regset_xstate_info(fpu_user_xstate_size, - xfeatures_mask & XFEATURE_MASK_USER_SUPPORTED); + update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_user()); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); @@ -808,7 +822,7 @@ void __init fpu__init_system_xstate(void) print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", - xfeatures_mask, + xfeatures_mask_all, fpu_kernel_xstate_size, boot_cpu_has(X86_FEATURE_XSAVES) ? 
"compacted" : "standard"); return; @@ -827,7 +841,7 @@ void fpu__resume_cpu(void) * Restore XCR0 on xsave capable CPUs: */ if (boot_cpu_has(X86_FEATURE_XSAVE)) - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); } /* @@ -872,10 +886,9 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) /* * We should not ever be requesting features that we - * have not enabled. Remember that xfeatures_mask is - * what we write to the XCR0 register. + * have not enabled. */ - WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)), + WARN_ONCE(!(xfeatures_mask_all & BIT_ULL(xfeature_nr)), "get of unsupported state"); /* * This assumes the last 'xsave*' instruction to @@ -1042,7 +1055,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of */ memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= XFEATURE_MASK_USER_SUPPORTED; + header.xfeatures &= xfeatures_mask_user(); if (header.xfeatures & XFEATURE_MASK_FP) copy_part(0, off_mxcsr, @@ -1122,7 +1135,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i */ memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= XFEATURE_MASK_USER_SUPPORTED; + header.xfeatures &= xfeatures_mask_user(); /* * Copy xregs_state->header: -- Gitee From 04dec08dc8cbc0d054a1f9c2845780e0259049a0 Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Tue, 12 May 2020 07:54:38 -0700 Subject: [PATCH 0069/2161] x86/fpu/xstate: Introduce XSAVES supervisor states commit 71581eefd7a0a81b1af7d7c93641925a01d70a9a upstream. Enable XSAVES supervisor states by setting MSR_IA32_XSS bits according to CPUID enumeration results. Also revise comments at various places. Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20200512145444.15483-5-yu-cheng.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 916753d083a0..7c4c716fcb92 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -228,13 +228,14 @@ void fpu__init_cpu_xstate(void) * states can be set here. */ xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); + + /* + * MSR_IA32_XSS sets supervisor states managed by XSAVES. + */ + if (boot_cpu_has(X86_FEATURE_XSAVES)) + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); } -/* - * Note that in the future we will likely need a pair of - * functions here: one for user xstates and the other for - * system xstates. For now, they are the same. - */ static bool xfeature_enabled(enum xfeature xfeature) { return xfeatures_mask_all & BIT_ULL(xfeature); @@ -657,9 +658,6 @@ static void do_extra_xstate_size_checks(void) * the size of the *user* states. If we use it to size a buffer * that we use 'XSAVES' on, we could potentially overflow the * buffer because 'XSAVES' saves system states too. - * - * Note that we do not currently set any bits on IA32_XSS so - * 'XCR0 | IA32_XSS == XCR0' for now. 
*/ static unsigned int __init get_xsaves_size(void) { @@ -782,7 +780,12 @@ void __init fpu__init_system_xstate(void) cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); xfeatures_mask_all = eax + ((u64)edx << 32); - /* Place supervisor features in xfeatures_mask_all here */ + /* + * Find supervisor xstates supported by the processor. + */ + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + xfeatures_mask_all |= ecx + ((u64)edx << 32); + if ((xfeatures_mask_user() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened @@ -842,6 +845,13 @@ void fpu__resume_cpu(void) */ if (boot_cpu_has(X86_FEATURE_XSAVE)) xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); + + /* + * Restore IA32_XSS. The same CPUID bit enumerates support + * of XSAVES and MSR_IA32_XSS. + */ + if (boot_cpu_has(X86_FEATURE_XSAVES)) + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); } /* -- Gitee From 26cf8b3e99769939fb7f780ce56a6451cca067ca Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 12 May 2020 07:54:39 -0700 Subject: [PATCH 0070/2161] x86/fpu/xstate: Define new functions for clearing fpregs and xstates commit b860eb8dce5906b14e3a7f3c771e0b3d6ef61b94 upstream. Currently, fpu__clear() clears all fpregs and xstates. Once XSAVES supervisor states are introduced, supervisor settings (e.g. CET xstates) must remain active for signals; It is necessary to have separate functions: - Create fpu__clear_user_states(): clear only user settings for signals; - Create fpu__clear_all(): clear both user and supervisor settings in flush_thread(). Also modify copy_init_fpstate_to_fpregs() to take a mask from above two functions. Remove obvious side-comment in fpu__clear(), while at it. [ bp: Make the second argument of fpu__clear() bool after requesting it a bunch of times during review. - Add a comment about copy_init_fpstate_to_fpregs() locking needs. 
] Co-developed-by: Yu-cheng Yu Signed-off-by: Fenghua Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20200512145444.15483-6-yu-cheng.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 3 +- arch/x86/kernel/fpu/core.c | 53 +++++++++++++++++++---------- arch/x86/kernel/fpu/signal.c | 4 +-- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/signal.c | 2 +- 5 files changed, 41 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 744e0f385604..36639b1abab6 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -31,7 +31,8 @@ extern void fpu__save(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); extern int fpu__copy(struct task_struct *dst, struct task_struct *src); -extern void fpu__clear(struct fpu *fpu); +extern void fpu__clear_user_states(struct fpu *fpu); +extern void fpu__clear_all(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 8c9b202f3e6d..9591cdfcf5b6 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -298,15 +298,13 @@ void fpu__drop(struct fpu *fpu) } /* - * Clear FPU registers by setting them up from - * the init fpstate: + * Clear FPU registers by setting them up from the init fpstate. + * Caller must do fpregs_[un]lock() around it. */ -static inline void copy_init_fpstate_to_fpregs(void) +static inline void copy_init_fpstate_to_fpregs(u64 features_mask) { - fpregs_lock(); - if (use_xsave()) - copy_kernel_to_xregs(&init_fpstate.xsave, -1); + copy_kernel_to_xregs(&init_fpstate.xsave, features_mask); else if (static_cpu_has(X86_FEATURE_FXSR)) copy_kernel_to_fxregs(&init_fpstate.fxsave); else @@ -314,9 +312,6 @@ static inline void copy_init_fpstate_to_fpregs(void) if (boot_cpu_has(X86_FEATURE_OSPKE)) copy_init_pkru_to_fpregs(); - - fpregs_mark_activate(); - fpregs_unlock(); } /* @@ -325,18 +320,40 @@ static inline void copy_init_fpstate_to_fpregs(void) * Called by sys_execve(), by the signal handler code and by various * error paths. */ -void fpu__clear(struct fpu *fpu) +static void fpu__clear(struct fpu *fpu, bool user_only) { - WARN_ON_FPU(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */ + WARN_ON_FPU(fpu != ¤t->thread.fpu); - fpu__drop(fpu); + if (!static_cpu_has(X86_FEATURE_FPU)) { + fpu__drop(fpu); + fpu__initialize(fpu); + return; + } - /* - * Make sure fpstate is cleared and initialized. 
- */ - fpu__initialize(fpu); - if (static_cpu_has(X86_FEATURE_FPU)) - copy_init_fpstate_to_fpregs(); + fpregs_lock(); + + if (user_only) { + if (!fpregs_state_valid(fpu, smp_processor_id()) && + xfeatures_mask_supervisor()) + copy_kernel_to_xregs(&fpu->state.xsave, + xfeatures_mask_supervisor()); + copy_init_fpstate_to_fpregs(xfeatures_mask_user()); + } else { + copy_init_fpstate_to_fpregs(xfeatures_mask_all); + } + + fpregs_mark_activate(); + fpregs_unlock(); +} + +void fpu__clear_user_states(struct fpu *fpu) +{ + fpu__clear(fpu, true); +} + +void fpu__clear_all(struct fpu *fpu) +{ + fpu__clear(fpu, false); } /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 3df0cfae535f..cd6eafba12da 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -289,7 +289,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) IS_ENABLED(CONFIG_IA32_EMULATION)); if (!buf) { - fpu__clear(fpu); + fpu__clear_user_states(fpu); return 0; } @@ -416,7 +416,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) err_out: if (ret) - fpu__clear(fpu); + fpu__clear_user_states(fpu); return ret; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 0105fd785e9a..5db6ee4894e5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -241,7 +241,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - fpu__clear(&tsk->thread.fpu); + fpu__clear_all(&tsk->thread.fpu); } void disable_TSC(void) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 2fdbf5ef8c39..3c7db2541998 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -763,7 +763,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. */ - fpu__clear(fpu); + fpu__clear_user_states(fpu); } signal_setup_done(failed, ksig, stepping); } -- Gitee From 4c0dd4b69e75adc74fa0a2d9d3b095a37b9bd35c Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Tue, 12 May 2020 07:54:40 -0700 Subject: [PATCH 0071/2161] x86/fpu/xstate: Update sanitize_restored_xstate() for supervisor xstates commit 5d6b6a6f9b5ce7ac42273efd75d61ec63b463c18 upstream. The function sanitize_restored_xstate() sanitizes user xstates of an XSAVE buffer by clearing bits not in the input 'xfeatures' from the buffer's header->xfeatures, effectively resetting those features back to the init state. When supervisor xstates are introduced, it is necessary to make sure only user xstates are sanitized. Ensure supervisor bits in header->xfeatures stay set and supervisor states are not modified. To make names clear, also: - Rename the function to sanitize_restored_user_xstate(). - Rename input parameter 'xfeatures' to 'user_xfeatures'. - In __fpu__restore_sig(), rename 'xfeatures' to 'user_xfeatures'. 
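A minimal standalone model of the sanitizing rule spelled out above (the mask values are invented and this is not the kernel routine): user feature bits that the signal frame did not carry are cleared back to init state, while supervisor bits already tracked in the kernel buffer are left alone.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t supervisor_mask  = 0x400;	/* one supervisor state, bit 10 */
	uint64_t kernel_xfeatures = 0x40f;	/* user bits 0-3 plus the supervisor bit */
	uint64_t user_xfeatures   = 0x003;	/* the signal frame carried only FP|SSE */
	uint64_t sanitized;

	/* The rule from the patch: keep frame-provided user bits and all supervisor bits. */
	sanitized = kernel_xfeatures & (user_xfeatures | supervisor_mask);

	printf("before %#llx after %#llx\n",
	       (unsigned long long)kernel_xfeatures,
	       (unsigned long long)sanitized);
	return 0;
}

Compiled and run, it prints before 0x40f after 0x403: the user-only bits 2 and 3 drop back to init state, while FP/SSE and the supervisor bit survive.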
Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20200512145444.15483-7-yu-cheng.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 37 +++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index cd6eafba12da..77e5c2e34ee6 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -211,9 +211,9 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) } static inline void -sanitize_restored_xstate(union fpregs_state *state, - struct user_i387_ia32_struct *ia32_env, - u64 xfeatures, int fx_only) +sanitize_restored_user_xstate(union fpregs_state *state, + struct user_i387_ia32_struct *ia32_env, + u64 user_xfeatures, int fx_only) { struct xregs_state *xsave = &state->xsave; struct xstate_header *header = &xsave->header; @@ -226,13 +226,22 @@ sanitize_restored_xstate(union fpregs_state *state, */ /* - * Init the state that is not present in the memory - * layout and not enabled by the OS. + * 'user_xfeatures' might have bits clear which are + * set in header->xfeatures. This represents features that + * were in init state prior to a signal delivery, and need + * to be reset back to the init state. Clear any user + * feature bits which are set in the kernel buffer to get + * them back to the init state. + * + * Supervisor state is unchanged by input from userspace. + * Ensure supervisor state bits stay set and supervisor + * state is not modified. */ if (fx_only) header->xfeatures = XFEATURE_MASK_FPSSE; else - header->xfeatures &= xfeatures; + header->xfeatures &= user_xfeatures | + xfeatures_mask_supervisor(); } if (use_fxsr()) { @@ -281,7 +290,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; - u64 xfeatures = 0; + u64 user_xfeatures = 0; int fx_only = 0; int ret = 0; @@ -314,7 +323,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) trace_x86_fpu_xstate_check_failed(fpu); } else { state_size = fx_sw_user.xstate_size; - xfeatures = fx_sw_user.xfeatures; + user_xfeatures = fx_sw_user.xfeatures; } } @@ -349,7 +358,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ fpregs_lock(); pagefault_disable(); - ret = copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only); + ret = copy_user_to_fpregs_zeroing(buf_fx, user_xfeatures, fx_only); pagefault_enable(); if (!ret) { fpregs_mark_activate(); @@ -362,7 +371,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (use_xsave() && !fx_only) { - u64 init_bv = xfeatures_mask_user() & ~xfeatures; + u64 init_bv = xfeatures_mask_user() & ~user_xfeatures; if (using_compacted_format()) { ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); @@ -375,12 +384,13 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (ret) goto err_out; - sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); + sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, + fx_only); fpregs_lock(); if (unlikely(init_bv)) copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - ret = copy_kernel_to_xregs_err(&fpu->state.xsave, xfeatures); + ret = copy_kernel_to_xregs_err(&fpu->state.xsave, user_xfeatures); } else if (use_fxsr()) { ret = 
__copy_from_user(&fpu->state.fxsave, buf_fx, state_size); @@ -389,7 +399,8 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) goto err_out; } - sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); + sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, + fx_only); fpregs_lock(); if (use_xsave()) { -- Gitee From 184ccd486f5263f6218d16af0519d6a58aaa6e05 Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Tue, 12 May 2020 07:54:41 -0700 Subject: [PATCH 0072/2161] x86/fpu/xstate: Update copy_kernel_to_xregs_err() for supervisor states commit c95473e175dd1234b7440daa6eb2670ebf529653 upstream. The function copy_kernel_to_xregs_err() uses XRSTOR which can work with standard or compacted format without supervisor xstates. However, when supervisor xstates are present, XRSTORS must be used. Fix it by using XRSTORS when supervisor state handling is enabled. I also considered if there were additional cases where XRSTOR might be mistakenly called instead of XRSTORS. There are only three XRSTOR sites in the kernel: 1. copy_kernel_to_xregs_booting(), already switches between XRSTOR and XRSTORS based on X86_FEATURE_XSAVES. 2. copy_user_to_xregs(), which *needs* XRSTOR because it is copying from userspace and must never copy supervisor state with XRSTORS. 3. copy_kernel_to_xregs_err() mistakenly used XRSTOR only. Fix it. [ bp: Massage commit message. ] Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20200512145444.15483-8-yu-cheng.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 36639b1abab6..f5706c0b5cfe 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -386,7 +386,10 @@ static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask) u32 hmask = mask >> 32; int err; - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + if (static_cpu_has(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); return err; } -- Gitee From 2e2405c25d8be270be54e337075fddbba35665ea Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Tue, 12 May 2020 07:54:42 -0700 Subject: [PATCH 0073/2161] x86/fpu: Introduce copy_supervisor_to_kernel() commit eeedf1533687b8e81865fdbde79eddf7c4b76c9a upstream. The XSAVES instruction takes a mask and saves only the features specified in that mask. The kernel normally specifies that all features be saved. XSAVES also unconditionally uses the "compacted format" which means that all specified features are saved next to each other in memory. If a feature is removed from the mask, all the features after it will "move up" into earlier locations in the buffer. Introduce copy_supervisor_to_kernel(), which saves only supervisor states and then moves those states into the standard location where they are normally found. 
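A minimal stand-alone sketch of that relocation step follows; the offsets, sizes and component count are invented stand-ins for xstate_comp_offsets[], xstate_supervisor_only_offsets[] and xstate_sizes[]. The point is only the back-to-front memmove() order, which guarantees no component is overwritten before it has been moved.

#include <stdio.h>
#include <string.h>

#define NCOMP 3

/* Invented layouts: packed (supervisor-only save) vs. target (normal) offsets. */
static const size_t packed_off[NCOMP] = { 0, 8, 16 };
static const size_t normal_off[NCOMP] = { 0, 32, 64 };
static const size_t comp_size[NCOMP]  = { 8, 8, 8 };

/* Walk from the last component down so later moves never clobber earlier data. */
static void relocate(unsigned char *buf)
{
	int i;

	for (i = NCOMP - 1; i >= 0; i--)
		memmove(buf + normal_off[i], buf + packed_off[i], comp_size[i]);
}

int main(void)
{
	unsigned char buf[128] = { 0 };

	memset(buf + 0,  0xaa, 8);	/* component 0 */
	memset(buf + 8,  0xbb, 8);	/* component 1, packed right after 0 */
	memset(buf + 16, 0xcc, 8);	/* component 2 */

	relocate(buf);
	printf("component 1 at 32: %#x, component 2 at 64: %#x\n",
	       (unsigned)buf[32], (unsigned)buf[64]);
	return 0;
}

Running it shows component 1 landing at offset 32 and component 2 at offset 64, untouched by the earlier moves.
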
Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200512145444.15483-9-yu-cheng.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 1 + arch/x86/kernel/fpu/xstate.c | 84 +++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 92104b298d77..422d8369012a 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -75,6 +75,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); +void copy_supervisor_to_kernel(struct xregs_state *xsave); /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ int validate_user_xstate_header(const struct xstate_header *hdr); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 7c4c716fcb92..b90f6053a826 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -62,6 +62,7 @@ u64 xfeatures_mask_all __read_mostly; static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; /* * The XSAVE area of kernel can be in standard or compacted format; @@ -392,6 +393,33 @@ static void __init setup_xstate_comp_offsets(void) } } +/* + * Setup offsets of a supervisor-state-only XSAVES buffer: + * + * The offsets stored in xstate_comp_offsets[] only work for one specific + * value of the Requested Feature BitMap (RFBM). In cases where a different + * RFBM value is used, a different set of offsets is required. This set of + * offsets is for when RFBM=xfeatures_mask_supervisor(). + */ +static void __init setup_supervisor_only_offsets(void) +{ + unsigned int next_offset; + int i; + + next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i) || !xfeature_is_supervisor(i)) + continue; + + if (xfeature_is_aligned(i)) + next_offset = ALIGN(next_offset, 64); + + xstate_supervisor_only_offsets[i] = next_offset; + next_offset += xstate_sizes[i]; + } +} + /* * Print out xstate component offsets and sizes */ @@ -822,6 +850,7 @@ void __init fpu__init_system_xstate(void) fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp_offsets(); + setup_supervisor_only_offsets(); print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", @@ -1304,6 +1333,61 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) return 0; } +/* + * Save only supervisor states to the kernel buffer. This blows away all + * old states, and is intended to be used only in __fpu__restore_sig(), where + * user states are restored from the user buffer. 
+ */ +void copy_supervisor_to_kernel(struct xregs_state *xstate) +{ + struct xstate_header *header; + u64 max_bit, min_bit; + u32 lmask, hmask; + int err, i; + + if (WARN_ON(!boot_cpu_has(X86_FEATURE_XSAVES))) + return; + + if (!xfeatures_mask_supervisor()) + return; + + max_bit = __fls(xfeatures_mask_supervisor()); + min_bit = __ffs(xfeatures_mask_supervisor()); + + lmask = xfeatures_mask_supervisor(); + hmask = xfeatures_mask_supervisor() >> 32; + XSTATE_OP(XSAVES, xstate, lmask, hmask, err); + + /* We should never fault when copying to a kernel buffer: */ + if (WARN_ON_FPU(err)) + return; + + /* + * At this point, the buffer has only supervisor states and must be + * converted back to normal kernel format. + */ + header = &xstate->header; + header->xcomp_bv |= xfeatures_mask_all; + + /* + * This only moves states up in the buffer. Start with + * the last state and move backwards so that states are + * not overwritten until after they are moved. Note: + * memmove() allows overlapping src/dst buffers. + */ + for (i = max_bit; i >= min_bit; i--) { + u8 *xbuf = (u8 *)xstate; + + if (!((header->xfeatures >> i) & 1)) + continue; + + /* Move xfeature 'i' into its normal location */ + memmove(xbuf + xstate_comp_offsets[i], + xbuf + xstate_supervisor_only_offsets[i], + xstate_sizes[i]); + } +} + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 -- Gitee From 8402c1f27b69a08baf3bd5145295457d6d04169f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Jun 2021 21:18:00 +0200 Subject: [PATCH 0074/2161] x86/fpu: Reset state for all signal restore failures commit efa165504943f2128d50f63de0c02faf6dcceb0d upstream. If access_ok() or fpregs_soft_set() fails in __fpu__restore_sig() then the function just returns but does not clear the FPU state as it does for all other fatal failures. Clear the FPU state for these failures as well. 
Fixes: 72a671ced66d ("x86, fpu: Unify signal handling code paths for x86 and x86_64 kernels") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/87mtryyhhz.ffs@nanos.tec.linutronix.de Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 77e5c2e34ee6..5e914d82debd 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -302,13 +302,17 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) return 0; } - if (!access_ok(buf, size)) - return -EACCES; + if (!access_ok(buf, size)) { + ret = -EACCES; + goto out; + } - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_set(current, NULL, - 0, sizeof(struct user_i387_ia32_struct), - NULL, buf) != 0; + if (!static_cpu_has(X86_FEATURE_FPU)) { + ret = fpregs_soft_set(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), + NULL, buf); + goto out; + } if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; @@ -346,7 +350,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (ia32_fxstate) { ret = __copy_from_user(&env, buf, sizeof(env)); if (ret) - goto err_out; + goto out; envp = &env; } else { /* @@ -382,7 +386,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) ret = validate_user_xstate_header(&fpu->state.xsave.header); } if (ret) - goto err_out; + goto out; sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, fx_only); @@ -396,7 +400,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); if (ret) { ret = -EFAULT; - goto err_out; + goto out; } sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, @@ -414,7 +418,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } else { ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size); if (ret) - goto err_out; + goto out; fpregs_lock(); ret = copy_kernel_to_fregs_err(&fpu->state.fsave); @@ -425,7 +429,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpregs_deactivate(fpu); fpregs_unlock(); -err_out: +out: if (ret) fpu__clear_user_states(fpu); return ret; -- Gitee From 193da881a0d45775b4ef73ba0473557be7eadb5a Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Tue, 12 May 2020 07:54:43 -0700 Subject: [PATCH 0075/2161] x86/fpu/xstate: Preserve supervisor states for the slow path in __fpu__restore_sig() commit 98265c17efa9f2279c59262cd27679aca12e0bb8 upstream. The signal return code is responsible for taking an XSAVE buffer present in user memory and loading it into the hardware registers. This operation only affects user XSAVE state and never affects supervisor state. The fast path through this code simply points XRSTOR directly at the user buffer. However, since user memory is not guaranteed to be always mapped, this XRSTOR can fail. If it fails, the signal return code falls back to a slow path which can tolerate page faults. That slow path copies the xfeatures one by one out of the user buffer into the task's fpu state area. However, by being in a context where it can handle page faults, the code can also schedule. 
The lazy-fpu-load code would think it has an up-to-date fpstate and would fail to save the supervisor state when scheduling the task out. When scheduling back in, it would likely restore stale supervisor state. To fix that, preserve supervisor state before the slow path. Modify copy_user_to_fpregs_zeroing() so that if it fails, fpregs are not zeroed, and there is no need for fpregs_deactivate() and supervisor states are preserved. Move set_thread_flag(TIF_NEED_FPU_LOAD) to the slow path. Without doing this, the fast path also needs supervisor states to be saved first. Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200512145444.15483-10-yu-cheng.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 53 +++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 5e914d82debd..9f1b76b23b67 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -262,19 +262,23 @@ sanitize_restored_user_xstate(union fpregs_state *state, static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) { u64 init_bv; + int r; if (use_xsave()) { if (fx_only) { init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - return copy_user_to_fxregs(buf); + r = copy_user_to_fxregs(buf); + if (!r) + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + return r; } else { init_bv = xfeatures_mask_user() & ~xbv; - if (unlikely(init_bv)) + r = copy_user_to_xregs(buf, xbv); + if (!r && unlikely(init_bv)) copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - return copy_user_to_xregs(buf, xbv); + return r; } } else if (use_fxsr()) { return copy_user_to_fxregs(buf); @@ -331,28 +335,10 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } } - /* - * The current state of the FPU registers does not matter. By setting - * TIF_NEED_FPU_LOAD unconditionally it is ensured that the our xstate - * is not modified on context switch and that the xstate is considered - * to be loaded again on return to userland (overriding last_cpu avoids - * the optimisation). - */ - set_thread_flag(TIF_NEED_FPU_LOAD); - __fpu_invalidate_fpregs_state(fpu); - if ((unsigned long)buf_fx % 64) fx_only = 1; - /* - * For 32-bit frames with fxstate, copy the fxstate so it can be - * reconstructed later. - */ - if (ia32_fxstate) { - ret = __copy_from_user(&env, buf, sizeof(env)); - if (ret) - goto out; - envp = &env; - } else { + + if (!ia32_fxstate) { /* * Attempt to restore the FPU registers directly from user * memory. For that to succeed, the user access cannot cause @@ -369,10 +355,27 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpregs_unlock(); return 0; } - fpregs_deactivate(fpu); fpregs_unlock(); + } else { + /* + * For 32-bit frames with fxstate, copy the fxstate so it can + * be reconstructed later. + */ + ret = __copy_from_user(&env, buf, sizeof(env)); + if (ret) + goto out; + envp = &env; } + /* + * The current state of the FPU registers does not matter. By setting + * TIF_NEED_FPU_LOAD unconditionally it is ensured that the our xstate + * is not modified on context switch and that the xstate is considered + * to be loaded again on return to userland (overriding last_cpu avoids + * the optimisation). 
+ */ + set_thread_flag(TIF_NEED_FPU_LOAD); + __fpu_invalidate_fpregs_state(fpu); if (use_xsave() && !fx_only) { u64 init_bv = xfeatures_mask_user() & ~user_xfeatures; -- Gitee From 2489c17c0a7147831ea76e8e1b91d8ab4898fb4f Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Tue, 12 May 2020 07:54:44 -0700 Subject: [PATCH 0076/2161] x86/fpu/xstate: Restore supervisor states for signal return commit 55e00fb66fd5048f4a3ee357018fd26fc527abca upstream. The signal return fast path directly restores user states from the user buffer. Once that succeeds, restore supervisor states (but only when they are not yet restored). For the slow path, save supervisor states to preserve them across context switches, and restore after the user states are restored. The previous version has the overhead of an XSAVES in both the fast and the slow paths. It is addressed as the following: - In the fast path, only do an XRSTORS. - In the slow path, do a supervisor-state-only XSAVES, and relocate the buffer contents. Some thoughts in the implementation: - In the slow path, can any supervisor state become stale between save/restore? Answer: set_thread_flag(TIF_NEED_FPU_LOAD) protects the xstate buffer. - In the slow path, can any code reference a stale supervisor state register between save/restore? Answer: In the current lazy-restore scheme, any reference to xstate registers needs fpregs_lock()/fpregs_unlock() and __fpregs_load_activate(). - Are there other options? One other option is eagerly restoring all supervisor states. Currently, CET user-mode states and ENQCMD's PASID do not need to be eagerly restored. The upcoming CET kernel-mode states (24 bytes) need to be eagerly restored. To me, eagerly restoring all supervisor states adds more overhead then benefit at this point. Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20200512145444.15483-11-yu-cheng.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 44 ++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 9f1b76b23b67..238e5741c895 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -351,6 +351,23 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) ret = copy_user_to_fpregs_zeroing(buf_fx, user_xfeatures, fx_only); pagefault_enable(); if (!ret) { + + /* + * Restore supervisor states: previous context switch + * etc has done XSAVES and saved the supervisor states + * in the kernel buffer from which they can be restored + * now. + * + * We cannot do a single XRSTORS here - which would + * be nice - because the rest of the FPU registers are + * being restored from a user buffer directly. The + * single XRSTORS happens below, when the user buffer + * has been copied to the kernel one. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD) && + xfeatures_mask_supervisor()) + copy_kernel_to_xregs(&fpu->state.xsave, + xfeatures_mask_supervisor()); fpregs_mark_activate(); fpregs_unlock(); return 0; @@ -368,14 +385,25 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } /* - * The current state of the FPU registers does not matter. 
By setting - * TIF_NEED_FPU_LOAD unconditionally it is ensured that the our xstate - * is not modified on context switch and that the xstate is considered + * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is + * not modified on context switch and that the xstate is considered * to be loaded again on return to userland (overriding last_cpu avoids * the optimisation). */ - set_thread_flag(TIF_NEED_FPU_LOAD); + fpregs_lock(); + + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + + /* + * Supervisor states are not modified by user space input. Save + * current supervisor states first and invalidate the FPU regs. + */ + if (xfeatures_mask_supervisor()) + copy_supervisor_to_kernel(&fpu->state.xsave); + set_thread_flag(TIF_NEED_FPU_LOAD); + } __fpu_invalidate_fpregs_state(fpu); + fpregs_unlock(); if (use_xsave() && !fx_only) { u64 init_bv = xfeatures_mask_user() & ~user_xfeatures; @@ -397,7 +425,13 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpregs_lock(); if (unlikely(init_bv)) copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - ret = copy_kernel_to_xregs_err(&fpu->state.xsave, user_xfeatures); + + /* + * Restore previously saved supervisor xstates along with + * copied-in user xstates. + */ + ret = copy_kernel_to_xregs_err(&fpu->state.xsave, + user_xfeatures | xfeatures_mask_supervisor()); } else if (use_fxsr()) { ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); -- Gitee From 848ee36f386a8246283ce0c6667545e9d54f7e89 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Feb 2020 00:11:55 -0500 Subject: [PATCH 0077/2161] x86: copy_fpstate_to_sigframe(): have fpregs_soft_get() use kernel buffer commit 36c8673f90c8e244b10d1905eb08f4f61c033a25 upstream. ... then copy_to_user() the results Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 238e5741c895..20d453cc79d1 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -170,14 +170,14 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); + if (!static_cpu_has(X86_FEATURE_FPU)) { + struct user_i387_ia32_struct fp; + fpregs_soft_get(current, NULL, 0, sizeof(fp), &fp, NULL); + return copy_to_user(buf, &fp, sizeof(fp)) ? -EFAULT : 0; + } + if (!access_ok(buf, size)) return -EACCES; - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_get(current, NULL, 0, - sizeof(struct user_i387_ia32_struct), NULL, - (struct _fpstate_32 __user *) buf) ? -1 : 1; - retry: /* * Load the FPU registers if they are not valid for the current task. -- Gitee From 556fed41be8bbd7140f057336587e4b9acd9d78b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 7 Jun 2020 11:48:25 -0400 Subject: [PATCH 0078/2161] x86: kill dump_fpu() commit 4dfa103e82aefe15ac525c0d4853653a774be494 upstream. dead since the removal of aout coredump support... 
Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 1 - arch/x86/kernel/fpu/regset.c | 16 ---------------- 2 files changed, 17 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index f5706c0b5cfe..a24a2843c397 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -34,7 +34,6 @@ extern int fpu__copy(struct task_struct *dst, struct task_struct *src); extern void fpu__clear_user_states(struct fpu *fpu); extern void fpu__clear_all(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); -extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate); /* * Boot time FPU initialization functions: diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index bd1d0649f8ce..91f80aca27fb 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -356,20 +356,4 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, return ret; } -/* - * FPU state for core dumps. - * This is only used for a.out dumps now. - * It is declared generically using elf_fpregset_t (which is - * struct user_i387_struct) but is in fact only used for 32-bit - * dumps, so on 64-bit it is really struct user_i387_ia32_struct. - */ -int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) -{ - struct task_struct *tsk = current; - - return !fpregs_get(tsk, NULL, 0, sizeof(struct user_i387_ia32_struct), - ufpu, NULL); -} -EXPORT_SYMBOL(dump_fpu); - #endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ -- Gitee From 746e6b5ce2fc77bbec163a77220ac930d0d56d19 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:07 -0700 Subject: [PATCH 0079/2161] x86/cpufeatures: Add Architectural LBRs feature bit commit bd657aa3dd8514e62486ce7f90b5e484c18d684d upstream. CPUID.(EAX=07H, ECX=0):EDX[19] indicates whether an Intel CPU supports Architectural LBRs. The "X86_FEATURE_..., word 18" is already mirrored from CPUID "0x00000007:0 (EDX)". Add X86_FEATURE_ARCH_LBR under the "word 18" section. The feature will appear as "arch_lbr" in /proc/cpuinfo. The Architectural Last Branch Records (LBR) feature enables recording of software path history by logging taken branches and other control flows. The feature will be supported in the perf_events subsystem. 
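For reference, the enumeration bit named above can be checked from user space with the compiler-provided <cpuid.h> helpers; this is an illustration only and not part of the patch:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID.(EAX=07H, ECX=0):EDX[19] enumerates Architectural LBRs. */
	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 7 not available");
		return 1;
	}
	printf("arch_lbr: %s\n", (edx & (1u << 19)) ? "yes" : "no");
	return 0;
}
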
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-2-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index f885908107f5..6b3ea62eb62d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -364,6 +364,7 @@ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ +#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ -- Gitee From 9b4318d04d65163d4e75f4ed7f4e0ac5fc3e5e2d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:25 -0700 Subject: [PATCH 0080/2161] x86/fpu: Use proper mask to replace full instruction mask commit a063bf249b9f8d8004f282031781322c1b527d13 upstream. When saving xstate to a kernel/user XSAVE area with the XSAVE family of instructions, the current code applies the 'full' instruction mask (-1), which tries to XSAVE all possible features. This method relies on hardware to trim 'all possible' down to what is enabled in the hardware. The code works well for now. However, there will be a problem, if some features are enabled in hardware, but are not suitable to be saved into all kernel XSAVE buffers, like task->fpu, due to performance consideration. One such example is the Last Branch Records (LBR) state. The LBR state only contains valuable information when LBR is explicitly enabled by the perf subsystem, and the size of an LBR state is large (808 bytes for now). To avoid both CPU overhead and space overhead at each context switch, the LBR state should not be saved into task->fpu like other state components. It should be saved/restored on demand when LBR is enabled in the perf subsystem. Current copy_xregs_to_* will trigger a buffer overflow for such cases. Three sites use the '-1' instruction mask which must be updated. Two are saving/restoring the xstate to/from a kernel-allocated XSAVE buffer and can use 'xfeatures_mask_all', which will save/restore all of the features present in a normal task FPU buffer. The last one saves the register state directly to a user buffer. It could also use 'xfeatures_mask_all'. Just as it was with the '-1' argument, any supervisor states in the mask will be filtered out by the hardware and not saved to the buffer. But, to be more explicit about what is expected to be saved, use xfeatures_mask_user() for the instruction mask. KVM includes the header file fpu/internal.h. To avoid 'undefined xfeatures_mask_all' compiling issue, move copy_fpregs_to_fpstate() to fpu/core.c and export it, because: - The xfeatures_mask_all is indirectly used via copy_fpregs_to_fpstate() by KVM. The function which is directly used by other modules should be exported. - The copy_fpregs_to_fpstate() is a function, while xfeatures_mask_all is a variable for the "internal" FPU state. It's safer to export a function than a variable, which may be implicitly changed by others. - The copy_fpregs_to_fpstate() is a big function with many checks. 
The removal of the inline keyword should not impact the performance. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-20-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 45 ++++------------------------- arch/x86/kernel/fpu/core.c | 39 +++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 39 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index a24a2843c397..90bdf15d2efe 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -305,7 +305,7 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) */ static inline void copy_xregs_to_kernel(struct xregs_state *xstate) { - u64 mask = -1; + u64 mask = xfeatures_mask_all; u32 lmask = mask; u32 hmask = mask >> 32; int err; @@ -341,6 +341,9 @@ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) */ static inline int copy_xregs_to_user(struct xregs_state __user *buf) { + u64 mask = xfeatures_mask_user(); + u32 lmask = mask; + u32 hmask = mask >> 32; int err; /* @@ -352,7 +355,7 @@ static inline int copy_xregs_to_user(struct xregs_state __user *buf) return -EFAULT; stac(); - XSTATE_OP(XSAVE, buf, -1, -1, err); + XSTATE_OP(XSAVE, buf, lmask, hmask, err); clac(); return err; @@ -393,43 +396,7 @@ static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask) return err; } -/* - * These must be called with preempt disabled. Returns - * 'true' if the FPU state is still intact and we can - * keep registers active. - * - * The legacy FNSAVE instruction cleared all FPU state - * unconditionally, so registers are essentially destroyed. - * Modern FPU state can be kept in registers, if there are - * no pending FP exceptions. - */ -static inline int copy_fpregs_to_fpstate(struct fpu *fpu) -{ - if (likely(use_xsave())) { - copy_xregs_to_kernel(&fpu->state.xsave); - - /* - * AVX512 state is tracked here because its use is - * known to slow the max clock speed of the core. - */ - if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) - fpu->avx512_timestamp = jiffies; - return 1; - } - - if (likely(use_fxsr())) { - copy_fxregs_to_kernel(fpu); - return 1; - } - - /* - * Legacy FPU register saving, FNSAVE always clears FPU registers, - * so we have to mark them inactive: - */ - asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); - - return 0; -} +extern int copy_fpregs_to_fpstate(struct fpu *fpu); static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) { diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 9591cdfcf5b6..571220ac8bea 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -82,6 +82,45 @@ bool irq_fpu_usable(void) } EXPORT_SYMBOL(irq_fpu_usable); +/* + * These must be called with preempt disabled. Returns + * 'true' if the FPU state is still intact and we can + * keep registers active. + * + * The legacy FNSAVE instruction cleared all FPU state + * unconditionally, so registers are essentially destroyed. + * Modern FPU state can be kept in registers, if there are + * no pending FP exceptions. + */ +int copy_fpregs_to_fpstate(struct fpu *fpu) +{ + if (likely(use_xsave())) { + copy_xregs_to_kernel(&fpu->state.xsave); + + /* + * AVX512 state is tracked here because its use is + * known to slow the max clock speed of the core. 
+ */ + if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) + fpu->avx512_timestamp = jiffies; + return 1; + } + + if (likely(use_fxsr())) { + copy_fxregs_to_kernel(fpu); + return 1; + } + + /* + * Legacy FPU register saving, FNSAVE always clears FPU registers, + * so we have to mark them inactive: + */ + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); + + return 0; +} +EXPORT_SYMBOL(copy_fpregs_to_fpstate); + void kernel_fpu_begin_mask(unsigned int kfpu_mask) { preempt_disable(); -- Gitee From ab0645f2903e598edd6a7329e4ce49174d96b7bb Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:26 -0700 Subject: [PATCH 0081/2161] x86/fpu/xstate: Support dynamic supervisor feature for LBR commit f0dccc9da4c0fda049e99326f85db8c242fd781f upstream. Last Branch Records (LBR) registers are used to log taken branches and other control flows. In perf with call stack mode, LBR information is used to reconstruct a call stack. To get the complete call stack, perf has to save/restore all LBR registers during a context switch. Due to the large number of the LBR registers, e.g., the current platform has 96 LBR registers, this process causes a high CPU overhead. To reduce the CPU overhead during a context switch, an LBR state component that contains all the LBR related registers is introduced in hardware. All LBR registers can be saved/restored together using one XSAVES/XRSTORS instruction. However, the kernel should not save/restore the LBR state component at each context switch, like other state components, because of the following unique features of LBR: - The LBR state component only contains valuable information when LBR is enabled in the perf subsystem, but for most of the time, LBR is disabled. - The size of the LBR state component is huge. For the current platform, it's 808 bytes. If the kernel saves/restores the LBR state at each context switch, for most of the time, it is just a waste of space and cycles. To efficiently support the LBR state component, it is desired to have: - only context-switch the LBR when the LBR feature is enabled in perf. - only allocate an LBR-specific XSAVE buffer on demand. (Besides the LBR state, a legacy region and an XSAVE header have to be included in the buffer as well. There is a total of (808+576) byte overhead for the LBR-specific XSAVE buffer. The overhead only happens when the perf is actively using LBRs. There is still a space-saving, on average, when it replaces the constant 808 bytes of overhead for every task, all the time on the systems that support architectural LBR.) - be able to use XSAVES/XRSTORS for accessing LBR at run time. However, the IA32_XSS should not be adjusted at run time. (The XCR0 | IA32_XSS are used to determine the requested-feature bitmap (RFBM) of XSAVES.) A solution, called dynamic supervisor feature, is introduced to address this issue, which - does not allocate a buffer in each task->fpu; - does not save/restore a state component at each context switch; - sets the bit corresponding to the dynamic supervisor feature in IA32_XSS at boot time, and avoids setting it at run time. - dynamically allocates a specific buffer for a state component on demand, e.g. only allocates LBR-specific XSAVE buffer when LBR is enabled in perf. (Note: The buffer has to include the LBR state component, a legacy region and a XSAVE header space.) (Implemented in a later patch) - saves/restores a state component on demand, e.g. 
manually invokes the XSAVES/XRSTORS instruction to save/restore the LBR state to/from the buffer when perf is active and a call stack is required. (Implemented in a later patch) A new mask XFEATURE_MASK_DYNAMIC and a helper xfeatures_mask_dynamic() are introduced to indicate the dynamic supervisor feature. For the systems which support the Architecture LBR, LBR is the only dynamic supervisor feature for now. For the previous systems, there is no dynamic supervisor feature available. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-21-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/types.h | 7 +++++++ arch/x86/include/asm/fpu/xstate.h | 30 ++++++++++++++++++++++++++++++ arch/x86/kernel/fpu/xstate.c | 15 ++++++++++----- 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index f098f6cab94b..132e9cc26d60 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -114,6 +114,12 @@ enum xfeature { XFEATURE_Hi16_ZMM, XFEATURE_PT_UNIMPLEMENTED_SO_FAR, XFEATURE_PKRU, + XFEATURE_RSRVD_COMP_10, + XFEATURE_RSRVD_COMP_11, + XFEATURE_RSRVD_COMP_12, + XFEATURE_RSRVD_COMP_13, + XFEATURE_RSRVD_COMP_14, + XFEATURE_LBR, XFEATURE_MAX, }; @@ -128,6 +134,7 @@ enum xfeature { #define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) #define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR) #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) +#define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) #define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) #define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \ diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 422d8369012a..040c4d49bfcb 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -35,6 +35,27 @@ /* All currently supported supervisor features */ #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (0) +/* + * A supervisor state component may not always contain valuable information, + * and its size may be huge. Saving/restoring such supervisor state components + * at each context switch can cause high CPU and space overhead, which should + * be avoided. Such supervisor state components should only be saved/restored + * on demand. The on-demand dynamic supervisor features are set in this mask. + * + * Unlike the existing supported supervisor features, a dynamic supervisor + * feature does not allocate a buffer in task->fpu, and the corresponding + * supervisor state component cannot be saved/restored at each context switch. + * + * To support a dynamic supervisor feature, a developer should follow the + * dos and don'ts as below: + * - Do dynamically allocate a buffer for the supervisor state component. + * - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the + * state component to/from the buffer. + * - Don't set the bit corresponding to the dynamic supervisor feature in + * IA32_XSS at run time, since it has been set at boot time. + */ +#define XFEATURE_MASK_DYNAMIC (XFEATURE_MASK_LBR) + /* * Unsupported supervisor features. When a supervisor feature in this mask is * supported in the future, move it to the supported supervisor feature mask. @@ -43,6 +64,7 @@ /* All supervisor states including supported and unsupported states. 
*/ #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ + XFEATURE_MASK_DYNAMIC | \ XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) #ifdef CONFIG_X86_64 @@ -63,6 +85,14 @@ static inline u64 xfeatures_mask_user(void) return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED; } +static inline u64 xfeatures_mask_dynamic(void) +{ + if (!boot_cpu_has(X86_FEATURE_ARCH_LBR)) + return XFEATURE_MASK_DYNAMIC & ~XFEATURE_MASK_LBR; + + return XFEATURE_MASK_DYNAMIC; +} + extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b90f6053a826..38521a8baaf1 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -233,8 +233,10 @@ void fpu__init_cpu_xstate(void) /* * MSR_IA32_XSS sets supervisor states managed by XSAVES. */ - if (boot_cpu_has(X86_FEATURE_XSAVES)) - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + xfeatures_mask_dynamic()); + } } static bool xfeature_enabled(enum xfeature xfeature) @@ -630,7 +632,8 @@ static void check_xstate_against_struct(int nr) */ if ((nr < XFEATURE_YMM) || (nr >= XFEATURE_MAX) || - (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) { + (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || + ((nr >= XFEATURE_RSRVD_COMP_10) && (nr <= XFEATURE_LBR))) { WARN_ONCE(1, "no structure for xstate: %d\n", nr); XSTATE_WARN_ON(1); } @@ -879,8 +882,10 @@ void fpu__resume_cpu(void) * Restore IA32_XSS. The same CPUID bit enumerates support * of XSAVES and MSR_IA32_XSS. */ - if (boot_cpu_has(X86_FEATURE_XSAVES)) - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + xfeatures_mask_dynamic()); + } } /* -- Gitee From e6f11d9330fd36e700d9021d9c9c5e96e32d4bc0 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:27 -0700 Subject: [PATCH 0082/2161] x86/fpu/xstate: Add helpers for LBR dynamic supervisor feature commit 50f408d96d4d1a945d2c50c5fd8ed400883edf0e upstream. The perf subsystem will only need to save/restore the LBR state. However, the existing helpers save all supported supervisor states to a kernel buffer, which will be unnecessary. Two helpers are introduced to only save/restore requested dynamic supervisor states. The supervisor features in XFEATURE_MASK_SUPERVISOR_SUPPORTED and XFEATURE_MASK_SUPERVISOR_UNSUPPORTED mask cannot be saved/restored using these helpers. The helpers will be used in the following patch. 
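The mask handling in both helpers reduces to filtering the caller's mask against the dynamic-feature mask and splitting the result into the EDX:EAX pair consumed by XSAVES/XRSTORS. A stand-alone sketch of just that arithmetic (bit 15 is where XFEATURE_LBR lands in the enum added earlier in this series; everything else here is invented):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical dynamic-feature mask; in the kernel this comes from
 * xfeatures_mask_dynamic(). Bit 15 corresponds to XFEATURE_LBR. */
#define EX_DYNAMIC_MASK (1ULL << 15)

int main(void)
{
	uint64_t requested = (1ULL << 15) | (1ULL << 2);	/* caller's mask */
	uint64_t dynamic   = EX_DYNAMIC_MASK & requested;	/* only dynamic bits kept */
	uint32_t lmask = (uint32_t)dynamic;			/* low half, goes in EAX */
	uint32_t hmask = (uint32_t)(dynamic >> 32);		/* high half, goes in EDX */

	printf("dynamic %#llx lmask %#x hmask %#x\n",
	       (unsigned long long)dynamic, lmask, hmask);
	return 0;
}
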
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-22-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 3 ++ arch/x86/kernel/fpu/xstate.c | 72 +++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 040c4d49bfcb..c029fce627cf 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -106,6 +106,9 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); void copy_supervisor_to_kernel(struct xregs_state *xsave); +void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); +void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask); + /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ int validate_user_xstate_header(const struct xstate_header *hdr); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 38521a8baaf1..88dd9be53eb1 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1393,6 +1393,78 @@ void copy_supervisor_to_kernel(struct xregs_state *xstate) } } +/** + * copy_dynamic_supervisor_to_kernel() - Save dynamic supervisor states to + * an xsave area + * @xstate: A pointer to an xsave area + * @mask: Represent the dynamic supervisor features saved into the xsave area + * + * Only the dynamic supervisor states sets in the mask are saved into the xsave + * area (See the comment in XFEATURE_MASK_DYNAMIC for the details of dynamic + * supervisor feature). Besides the dynamic supervisor states, the legacy + * region and XSAVE header are also saved into the xsave area. The supervisor + * features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and + * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not saved. + * + * The xsave area must be 64-bytes aligned. + */ +void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask) +{ + u64 dynamic_mask = xfeatures_mask_dynamic() & mask; + u32 lmask, hmask; + int err; + + if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) + return; + + if (WARN_ON_FPU(!dynamic_mask)) + return; + + lmask = dynamic_mask; + hmask = dynamic_mask >> 32; + + XSTATE_OP(XSAVES, xstate, lmask, hmask, err); + + /* Should never fault when copying to a kernel buffer */ + WARN_ON_FPU(err); +} + +/** + * copy_kernel_to_dynamic_supervisor() - Restore dynamic supervisor states from + * an xsave area + * @xstate: A pointer to an xsave area + * @mask: Represent the dynamic supervisor features restored from the xsave area + * + * Only the dynamic supervisor states sets in the mask are restored from the + * xsave area (See the comment in XFEATURE_MASK_DYNAMIC for the details of + * dynamic supervisor feature). Besides the dynamic supervisor states, the + * legacy region and XSAVE header are also restored from the xsave area. The + * supervisor features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and + * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not restored. + * + * The xsave area must be 64-bytes aligned. 
+ */ +void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask) +{ + u64 dynamic_mask = xfeatures_mask_dynamic() & mask; + u32 lmask, hmask; + int err; + + if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) + return; + + if (WARN_ON_FPU(!dynamic_mask)) + return; + + lmask = dynamic_mask; + hmask = dynamic_mask >> 32; + + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + + /* Should never fault when copying from a kernel buffer */ + WARN_ON_FPU(err); +} + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 -- Gitee From 77fa0d55ee062414b15348295ff7d87a599dc8d5 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 26 Oct 2021 20:49:25 +0800 Subject: [PATCH 0083/2161] perf/x86/intel/lbr: Support XSAVES/XRSTORS for LBR context switch commit ce711ea3cab9ad325d849792d442848e553095b8 upstream. We have skipped changes for pmu lbr, because it will cause much commits for intel_pmu_arch_lbr_init() to merge. We skipped here and will check perf function for lbr to aviod crash while perfing lbr. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 3 +++ arch/x86/kernel/fpu/xstate.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index c029fce627cf..1559554af931 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -21,6 +21,8 @@ #define XSAVE_YMM_SIZE 256 #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) +#define XSAVE_ALIGNMENT 64 + /* All currently supported user features */ #define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \ XFEATURE_MASK_SSE | \ @@ -101,6 +103,7 @@ extern void __init update_regset_xstate_info(unsigned int size, void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); const void *get_xsave_field_ptr(int xfeature_nr); int using_compacted_format(void); +int xfeature_size(int xfeature_nr); int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 88dd9be53eb1..7dfc710e6a17 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -520,7 +520,7 @@ static int xfeature_uncompacted_offset(int xfeature_nr) return ebx; } -static int xfeature_size(int xfeature_nr) +int xfeature_size(int xfeature_nr) { u32 eax, ebx, ecx, edx; -- Gitee From b0319efb3017b7226ca9440eeb08b21c04a765e8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 1 Jun 2020 19:42:40 -0400 Subject: [PATCH 0084/2161] introduction of regset ->get() wrappers, switching ELF coredumps to those commit b4e9c9549f62329d2412f899635fddc5212b9cd4 upstream. Two new helpers: given a process and regset, dump into a buffer. regset_get() takes a buffer and size, regset_get_alloc() takes size and allocates a buffer. Return value in both cases is the amount of data actually dumped in case of success or -E... on error. In both cases the size is capped by regset->n * regset->size, so ->get() is called with offset 0 and size no more than what regset expects. binfmt_elf.c callers of ->get() are switched to using those; the other caller (copy_regset_to_user()) will need some preparations to switch. 
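A stand-alone toy model of that contract, with invented names and simplified types (no task_struct, no regset tables), just to show the size capping at n * size, the allocation, and the return-value convention:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy stand-in for a regset: n entries of a fixed size plus a dumper. */
struct toy_regset {
	unsigned int n;
	unsigned int size;
	int (*get)(const struct toy_regset *rs, void *buf, unsigned int size);
};

/* Cap the request, allocate a buffer, dump into it, return bytes or -E... */
static int toy_regset_get_alloc(const struct toy_regset *rs,
				unsigned int size, void **data)
{
	void *p;
	int ret;

	if (size > rs->n * rs->size)
		size = rs->n * rs->size;
	p = calloc(1, size);
	if (!p)
		return -ENOMEM;
	ret = rs->get(rs, p, size);
	if (ret < 0) {
		free(p);
		return ret;
	}
	*data = p;
	return size;
}

static int dump_counters(const struct toy_regset *rs, void *buf, unsigned int size)
{
	(void)rs;
	memset(buf, 0x5a, size);	/* pretend these are register values */
	return 0;
}

int main(void)
{
	struct toy_regset rs = { .n = 4, .size = 8, .get = dump_counters };
	void *data;
	int ret = toy_regset_get_alloc(&rs, ~0U, &data);

	printf("dumped %d bytes\n", ret);	/* request capped at 4 * 8 = 32 */
	if (ret > 0)
		free(data);
	return 0;
}
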
Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- fs/binfmt_elf.c | 54 ++++++++++++++++++++---------------------- include/linux/regset.h | 9 +++++++ kernel/Makefile | 2 +- kernel/regset.c | 54 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 90 insertions(+), 29 deletions(-) create mode 100644 kernel/regset.c diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7ce3cfd965d2..d3b2acbfedd5 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1701,7 +1701,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, long signr, size_t *total) { unsigned int i; - unsigned int regset0_size = regset_size(t->task, &view->regsets[0]); + int regset0_size; /* * NT_PRSTATUS is the one special case, because the regset data @@ -1710,8 +1710,10 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, * We assume that regset 0 is NT_PRSTATUS. */ fill_prstatus(&t->prstatus, t->task, signr); - (void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset0_size, - &t->prstatus.pr_reg, NULL); + regset0_size = regset_get(t->task, &view->regsets[0], + sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); + if (regset0_size < 0) + return 0; fill_note(&t->notes[0], "CORE", NT_PRSTATUS, PRSTATUS_SIZE(t->prstatus, regset0_size), &t->prstatus); @@ -1726,32 +1728,28 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, */ for (i = 1; i < view->n; ++i) { const struct user_regset *regset = &view->regsets[i]; + int note_type = regset->core_note_type; + bool is_fpreg = note_type == NT_PRFPREG; + void *data; + int ret; + do_thread_regset_writeback(t->task, regset); - if (regset->core_note_type && regset->get && - (!regset->active || regset->active(t->task, regset) > 0)) { - int ret; - size_t size = regset_size(t->task, regset); - void *data = kzalloc(size, GFP_KERNEL); - if (unlikely(!data)) - return 0; - ret = regset->get(t->task, regset, - 0, size, data, NULL); - if (unlikely(ret)) - kfree(data); - else { - if (regset->core_note_type != NT_PRFPREG) - fill_note(&t->notes[i], "LINUX", - regset->core_note_type, - size, data); - else { - SET_PR_FPVALID(&t->prstatus, - 1, regset0_size); - fill_note(&t->notes[i], "CORE", - NT_PRFPREG, size, data); - } - *total += notesize(&t->notes[i]); - } - } + if (!note_type) // not for coredumps + continue; + if (regset->active && regset->active(t->task, regset) <= 0) + continue; + + ret = regset_get_alloc(t->task, regset, ~0U, &data); + if (ret < 0) + continue; + + if (is_fpreg) + SET_PR_FPVALID(&t->prstatus, 1, regset0_size); + + fill_note(&t->notes[i], is_fpreg ? 
"CORE" : "LINUX", + note_type, ret, data); + + *total += notesize(&t->notes[i]); } return 1; diff --git a/include/linux/regset.h b/include/linux/regset.h index bf0243779738..d7ba81f1fd18 100644 --- a/include/linux/regset.h +++ b/include/linux/regset.h @@ -353,6 +353,15 @@ static inline int user_regset_copyin_ignore(unsigned int *pos, return 0; } +extern int regset_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, void *data); + +extern int regset_get_alloc(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, + void **data); + /** * copy_regset_to_user - fetch a thread's user_regset data into user memory * @target: thread to be examined diff --git a/kernel/Makefile b/kernel/Makefile index cd6db723bc1e..1f2f8bda63bc 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o smpboot.o ucount.o + async.o range.o smpboot.o ucount.o regset.o obj-y += tkernel/ diff --git a/kernel/regset.c b/kernel/regset.c new file mode 100644 index 000000000000..6b39fa0993ec --- /dev/null +++ b/kernel/regset.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include + +static int __regset_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, + void **data) +{ + void *p = *data, *to_free = NULL; + int res; + + if (!regset->get) + return -EOPNOTSUPP; + if (size > regset->n * regset->size) + size = regset->n * regset->size; + if (!p) { + to_free = p = kzalloc(size, GFP_KERNEL); + if (!p) + return -ENOMEM; + } + res = regset->get(target, regset, 0, size, p, NULL); + if (unlikely(res < 0)) { + kfree(to_free); + return res; + } + *data = p; + if (regset->get_size) { // arm64-only kludge, will go away + unsigned max_size = regset->get_size(target, regset); + if (size > max_size) + size = max_size; + } + return size; +} + +int regset_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, + void *data) +{ + return __regset_get(target, regset, size, &data); +} +EXPORT_SYMBOL(regset_get); + +int regset_get_alloc(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, + void **data) +{ + *data = NULL; + return __regset_get(target, regset, size, data); +} +EXPORT_SYMBOL(regset_get_alloc); -- Gitee From 7e66787679301e8a53499942f2d1cc63fdad6b54 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 17 Feb 2020 12:25:14 -0500 Subject: [PATCH 0085/2161] copy_regset_to_user(): do all copyout at once. commit dc12d7968f9c9540494deb1285854b18ca4465ec upstream. Turn copy_regset_to_user() into regset_get_alloc() + copy_to_user(). Now all ->get() calls have a kernel buffer as destination. Note that we'd already eliminated the callers of copy_regset_to_user() with non-zero offset; now that argument is simply unused. Uninlined, while we are at it. 
Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/regset.h | 29 ++++------------------------- kernel/regset.c | 26 ++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/include/linux/regset.h b/include/linux/regset.h index d7ba81f1fd18..4a7225ee6645 100644 --- a/include/linux/regset.h +++ b/include/linux/regset.h @@ -362,31 +362,10 @@ extern int regset_get_alloc(struct task_struct *target, unsigned int size, void **data); -/** - * copy_regset_to_user - fetch a thread's user_regset data into user memory - * @target: thread to be examined - * @view: &struct user_regset_view describing user thread machine state - * @setno: index in @view->regsets - * @offset: offset into the regset data, in bytes - * @size: amount of data to copy, in bytes - * @data: user-mode pointer to copy into - */ -static inline int copy_regset_to_user(struct task_struct *target, - const struct user_regset_view *view, - unsigned int setno, - unsigned int offset, unsigned int size, - void __user *data) -{ - const struct user_regset *regset = &view->regsets[setno]; - - if (!regset->get) - return -EOPNOTSUPP; - - if (!access_ok(data, size)) - return -EFAULT; - - return regset->get(target, regset, offset, size, NULL, data); -} +extern int copy_regset_to_user(struct task_struct *target, + const struct user_regset_view *view, + unsigned int setno, unsigned int offset, + unsigned int size, void __user *data); /** * copy_regset_from_user - store into thread's user_regset data from user memory diff --git a/kernel/regset.c b/kernel/regset.c index 6b39fa0993ec..0a610983ce43 100644 --- a/kernel/regset.c +++ b/kernel/regset.c @@ -52,3 +52,29 @@ int regset_get_alloc(struct task_struct *target, return __regset_get(target, regset, size, data); } EXPORT_SYMBOL(regset_get_alloc); + +/** + * copy_regset_to_user - fetch a thread's user_regset data into user memory + * @target: thread to be examined + * @view: &struct user_regset_view describing user thread machine state + * @setno: index in @view->regsets + * @offset: offset into the regset data, in bytes + * @size: amount of data to copy, in bytes + * @data: user-mode pointer to copy into + */ +int copy_regset_to_user(struct task_struct *target, + const struct user_regset_view *view, + unsigned int setno, + unsigned int offset, unsigned int size, + void __user *data) +{ + const struct user_regset *regset = &view->regsets[setno]; + void *buf; + int ret; + + ret = regset_get_alloc(target, regset, size, &buf); + if (ret > 0) + ret = copy_to_user(data, buf, ret) ? -EFAULT : 0; + kfree(buf); + return ret; +} -- Gitee From c3030c394592b96efaf15658ed71b1ea1b2ba2cc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 20 Feb 2020 20:48:16 -0500 Subject: [PATCH 0086/2161] regset: new method and helpers for it commit 7717cb9bdd0421faa432a4e0d499fdba6e2394c8 upstream. ->regset_get() takes task+regset+buffer, returns the amount of free space left in the buffer on success and -E... on error. buffer is represented as struct membuf - a pair of (kernel) pointer and amount of space left Primitives for writing to such: * membuf_write(buf, data, size) * membuf_zero(buf, size) * membuf_store(buf, value) These are implemented as inlines (in case of membuf_store - a macro). All writes are sequential; they become no-ops when there's no space left. Return value of all primitives is the amount of space left after the operation, so they can be used as return values of ->regset_get(). 
Example of use: // stores pt_regs of task + 64 bytes worth of zeroes + 32bit PID of task int foo_get(struct task_struct *task, const struct regset *regset, struct membuf to) { membuf_write(&to, task_pt_regs(task), sizeof(struct pt_regs)); membuf_zero(&to, 64); return membuf_store(&to, (u32)task_tgid_vnr(task)); } regset_get()/regset_get_alloc() taught to use that thing if present. By the end of the series all users of ->get() will be converted; then ->get() and ->get_size() can go. Note that unlike ->get() this thing always starts at offset 0 and, since it only writes to kernel buffer, can't fail on copyout. It can, of course, fail for other reasons, but those tend to be less numerous. The caller guarantees that the buffer size won't be bigger than regset->n * regset->size. That simplifies life for quite a few instances. Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/regset.h | 51 ++++++++++++++++++++++++++++++++++++++++++ kernel/regset.c | 12 +++++++++- 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/include/linux/regset.h b/include/linux/regset.h index 4a7225ee6645..8f1cde3faaba 100644 --- a/include/linux/regset.h +++ b/include/linux/regset.h @@ -17,6 +17,52 @@ struct task_struct; struct user_regset; +struct membuf { + void *p; + size_t left; +}; + +static inline int membuf_zero(struct membuf *s, size_t size) +{ + if (s->left) { + if (size > s->left) + size = s->left; + memset(s->p, 0, size); + s->p += size; + s->left -= size; + } + return s->left; +} + +static inline int membuf_write(struct membuf *s, const void *v, size_t size) +{ + if (s->left) { + if (size > s->left) + size = s->left; + memcpy(s->p, v, size); + s->p += size; + s->left -= size; + } + return s->left; +} + +/* current s->p must be aligned for v; v must be a scalar */ +#define membuf_store(s, v) \ +({ \ + struct membuf *__s = (s); \ + if (__s->left) { \ + typeof(v) __v = (v); \ + size_t __size = sizeof(__v); \ + if (unlikely(__size > __s->left)) { \ + __size = __s->left; \ + memcpy(__s->p, &__v, __size); \ + } else { \ + *(typeof(__v + 0) *)__s->p = __v; \ + } \ + __s->p += __size; \ + __s->left -= __size; \ + } \ + __s->left;}) /** * user_regset_active_fn - type of @active function in &struct user_regset @@ -57,6 +103,10 @@ typedef int user_regset_get_fn(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf); +typedef int user_regset_get2_fn(struct task_struct *target, + const struct user_regset *regset, + struct membuf to); + /** * user_regset_set_fn - type of @set function in &struct user_regset * @target: thread being examined @@ -186,6 +236,7 @@ typedef unsigned int user_regset_get_size_fn(struct task_struct *target, */ struct user_regset { user_regset_get_fn *get; + user_regset_get2_fn *regset_get; user_regset_set_fn *set; user_regset_active_fn *active; user_regset_writeback_fn *writeback; diff --git a/kernel/regset.c b/kernel/regset.c index 0a610983ce43..eaeaefbbd39e 100644 --- a/kernel/regset.c +++ b/kernel/regset.c @@ -11,7 +11,7 @@ static int __regset_get(struct task_struct *target, void *p = *data, *to_free = NULL; int res; - if (!regset->get) + if (!regset->get && !regset->regset_get) return -EOPNOTSUPP; if (size > regset->n * regset->size) size = regset->n * regset->size; @@ -20,6 +20,16 @@ static int __regset_get(struct task_struct *target, if (!p) return -ENOMEM; } + if (regset->regset_get) { + res = regset->regset_get(target, regset, + (struct membuf){.p = p, .left = size}); + if (res < 0) { + 
kfree(to_free); + return res; + } + *data = p; + return size - res; + } res = regset->get(target, regset, 0, size, p, NULL); if (unlikely(res < 0)) { kfree(to_free); -- Gitee From 91ba17987a9f9503da0553d2f576cae5929d42a2 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 27 Oct 2021 17:58:06 +0800 Subject: [PATCH 0087/2161] x86/ptrace: change coding style to make patch happy commit opencloudos. Change code style of ioperm_get(), so than porting from 0557d64d983e3dedead2d4b4e8abc49620d5f5d2 will more easily. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/ptrace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2366dbeaf263..b3a7a7e6c8cc 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -701,12 +701,13 @@ static int ioperm_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - if (!target->thread.io_bitmap_ptr) + unsigned long *iobm = target->thread.io_bitmap_ptr; + + if (!iobm) return -ENXIO; return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - target->thread.io_bitmap_ptr, - 0, IO_BITMAP_BYTES); + iobm, 0, IO_BITMAP_BYTES); } /* -- Gitee From 072adcc114a09c04ceba59043684afdc1f91c6e8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 18 Feb 2020 12:14:34 -0500 Subject: [PATCH 0088/2161] x86: switch to ->regset_get() commit 0557d64d983e3dedead2d4b4e8abc49620d5f5d2 upstream. All instances of ->get() in arch/x86 switched; that might or might not be worth splitting up. Notes: * for xstateregs_get() the amount we want to store is determined at the boot time; see init_xstate_size() and update_regset_xstate_info() for details. task->thread.fpu.state.xsave ends with a flexible array member and the amount of data in it depends upon the FPU features supported/enabled. * fpregs_get() writes slightly less than full ->thread.fpu.state.fsave (the last word is not copied); we pass the full size of state.fsave and let membuf_write() trim to the amount declared by regset - __regset_get() will make sure that the space in buffer is no more than that. * copy_xstate_to_user() and its helpers are gone now. * fpregs_soft_get() was getting user_regset_copyout() arguments wrong. Since "x86: x86 user_regset math_emu" back in 2008... I really doubt that it's worth splitting out for -stable, though - you need a 486SX box for that to trigger... 
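[Editorial note, not part of the patch: for context on what these converted ->regset_get() instances ultimately serve, the user-space sketch below reads a stopped child's NT_PRSTATUS regset via PTRACE_GETREGSET, the path that ends up in genregs_get(). It assumes x86-64 with glibc and keeps error handling minimal.]

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <elf.h>		/* NT_PRSTATUS */
#include <sys/ptrace.h>
#include <sys/uio.h>
#include <sys/user.h>		/* struct user_regs_struct */
#include <sys/wait.h>

int main(void)
{
	pid_t child = fork();

	if (child == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);			/* let the parent inspect us */
		_exit(0);
	}

	waitpid(child, NULL, 0);		/* child is now stopped */

	struct user_regs_struct regs;
	struct iovec iov = { .iov_base = &regs, .iov_len = sizeof(regs) };

	if (ptrace(PTRACE_GETREGSET, child, (void *)NT_PRSTATUS, &iov) == -1) {
		perror("PTRACE_GETREGSET");
		return 1;
	}
	printf("child rip=%#llx rsp=%#llx (%zu bytes returned)\n",
	       (unsigned long long)regs.rip, (unsigned long long)regs.rsp,
	       iov.iov_len);

	ptrace(PTRACE_CONT, child, NULL, NULL);
	waitpid(child, NULL, 0);
	return 0;
}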
[Kevin's braino fix for copy_xstate_to_kernel() essentially duplicated here] Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/regset.h | 4 +- arch/x86/include/asm/fpu/xstate.h | 4 +- arch/x86/kernel/fpu/regset.c | 39 +++---- arch/x86/kernel/fpu/signal.c | 3 +- arch/x86/kernel/fpu/xstate.c | 164 +++++------------------------- arch/x86/kernel/ptrace.c | 75 +++++--------- arch/x86/kernel/tls.c | 32 ++---- arch/x86/kernel/tls.h | 2 +- arch/x86/math-emu/fpu_entry.c | 19 +--- 9 files changed, 83 insertions(+), 259 deletions(-) diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h index d5bdffb9d27f..4f928d6a367b 100644 --- a/arch/x86/include/asm/fpu/regset.h +++ b/arch/x86/include/asm/fpu/regset.h @@ -8,8 +8,8 @@ #include extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active; -extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, - xstateregs_get; +extern user_regset_get2_fn fpregs_get, xfpregs_get, fpregs_soft_get, + xstateregs_get; extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, xstateregs_set; diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 1559554af931..14ab815132d4 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -104,8 +104,8 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); const void *get_xsave_field_ptr(int xfeature_nr); int using_compacted_format(void); int xfeature_size(int xfeature_nr); -int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); -int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); +struct membuf; +void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave); int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); void copy_supervisor_to_kernel(struct xregs_state *xsave); diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 91f80aca27fb..c413756ba89f 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -27,8 +27,7 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r } int xfpregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct fpu *fpu = &target->thread.fpu; @@ -38,8 +37,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, fpu__prepare_read(fpu); fpstate_sanitize_xstate(fpu); - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &fpu->state.fxsave, 0, -1); + return membuf_write(&to, &fpu->state.fxsave, sizeof(struct fxregs_state)); } int xfpregs_set(struct task_struct *target, const struct user_regset *regset, @@ -74,12 +72,10 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, } int xstateregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct fpu *fpu = &target->thread.fpu; struct xregs_state *xsave; - int ret; if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -ENODEV; @@ -89,10 +85,8 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, fpu__prepare_read(fpu); if (using_compacted_format()) { - if (kbuf) - ret = 
copy_xstate_to_kernel(kbuf, xsave, pos, count); - else - ret = copy_xstate_to_user(ubuf, xsave, pos, count); + copy_xstate_to_kernel(to, xsave); + return 0; } else { fpstate_sanitize_xstate(fpu); /* @@ -105,9 +99,8 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, /* * Copy the xstate memory layout. */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + return membuf_write(&to, xsave, fpu_user_xstate_size); } - return ret; } int xstateregs_set(struct task_struct *target, const struct user_regset *regset, @@ -293,8 +286,7 @@ void convert_to_fxsr(struct fxregs_state *fxsave, } int fpregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct fpu *fpu = &target->thread.fpu; struct user_i387_ia32_struct env; @@ -302,23 +294,22 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, fpu__prepare_read(fpu); if (!boot_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); + return fpregs_soft_get(target, regset, to); - if (!boot_cpu_has(X86_FEATURE_FXSR)) - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &fpu->state.fsave, 0, - -1); + if (!boot_cpu_has(X86_FEATURE_FXSR)) { + return membuf_write(&to, &fpu->state.fsave, + sizeof(struct fregs_state)); + } fpstate_sanitize_xstate(fpu); - if (kbuf && pos == 0 && count == sizeof(env)) { - convert_from_fxsr(kbuf, target); + if (to.left == sizeof(env)) { + convert_from_fxsr(to.p, target); return 0; } convert_from_fxsr(&env, target); - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1); + return membuf_write(&to, &env, sizeof(env)); } int fpregs_set(struct task_struct *target, const struct user_regset *regset, diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 20d453cc79d1..5eb54f1bfd7d 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -172,7 +172,8 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) if (!static_cpu_has(X86_FEATURE_FPU)) { struct user_i387_ia32_struct fp; - fpregs_soft_get(current, NULL, 0, sizeof(fp), &fp, NULL); + fpregs_soft_get(current, NULL, (struct membuf){.p = &fp, + .left = sizeof(fp)}); return copy_to_user(buf, &fp, sizeof(fp)) ? 
-EFAULT : 0; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 7dfc710e6a17..46348f3e3b94 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1046,32 +1046,20 @@ static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) return true; } -static void fill_gap(unsigned to, void **kbuf, unsigned *pos, unsigned *count) +static void fill_gap(struct membuf *to, unsigned *last, unsigned offset) { - if (*pos < to) { - unsigned size = to - *pos; - - if (size > *count) - size = *count; - memcpy(*kbuf, (void *)&init_fpstate.xsave + *pos, size); - *kbuf += size; - *pos += size; - *count -= size; - } + if (*last >= offset) + return; + membuf_write(to, (void *)&init_fpstate.xsave + *last, offset - *last); + *last = offset; } -static void copy_part(unsigned offset, unsigned size, void *from, - void **kbuf, unsigned *pos, unsigned *count) +static void copy_part(struct membuf *to, unsigned *last, unsigned offset, + unsigned size, void *from) { - fill_gap(offset, kbuf, pos, count); - if (size > *count) - size = *count; - if (size) { - memcpy(*kbuf, from, size); - *kbuf += size; - *pos += size; - *count -= size; - } + fill_gap(to, last, offset); + membuf_write(to, from, size); + *last = offset + size; } /* @@ -1081,19 +1069,14 @@ static void copy_part(unsigned offset, unsigned size, void *from, * It supports partial copy but pos always starts from zero. This is called * from xstateregs_get() and there we check the CPU has XSAVES. */ -int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) +void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) { struct xstate_header header; const unsigned off_mxcsr = offsetof(struct fxregs_state, mxcsr); - unsigned count = size_total; + unsigned size = to.left; + unsigned last = 0; int i; - /* - * Currently copy_regset_to_user() starts from pos 0: - */ - if (unlikely(offset_start != 0)) - return -EFAULT; - /* * The destination is a ptrace buffer; we put in only user xstates: */ @@ -1102,27 +1085,26 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of header.xfeatures &= xfeatures_mask_user(); if (header.xfeatures & XFEATURE_MASK_FP) - copy_part(0, off_mxcsr, - &xsave->i387, &kbuf, &offset_start, &count); + copy_part(&to, &last, 0, off_mxcsr, &xsave->i387); if (header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM)) - copy_part(off_mxcsr, MXCSR_AND_FLAGS_SIZE, - &xsave->i387.mxcsr, &kbuf, &offset_start, &count); + copy_part(&to, &last, off_mxcsr, + MXCSR_AND_FLAGS_SIZE, &xsave->i387.mxcsr); if (header.xfeatures & XFEATURE_MASK_FP) - copy_part(offsetof(struct fxregs_state, st_space), 128, - &xsave->i387.st_space, &kbuf, &offset_start, &count); + copy_part(&to, &last, offsetof(struct fxregs_state, st_space), + 128, &xsave->i387.st_space); if (header.xfeatures & XFEATURE_MASK_SSE) - copy_part(xstate_offsets[XFEATURE_SSE], 256, - &xsave->i387.xmm_space, &kbuf, &offset_start, &count); + copy_part(&to, &last, xstate_offsets[XFEATURE_SSE], + 256, &xsave->i387.xmm_space); /* * Fill xsave->i387.sw_reserved value for ptrace frame: */ - copy_part(offsetof(struct fxregs_state, sw_reserved), 48, - xstate_fx_sw_bytes, &kbuf, &offset_start, &count); + copy_part(&to, &last, offsetof(struct fxregs_state, sw_reserved), + 48, xstate_fx_sw_bytes); /* * Copy xregs_state->header: */ - copy_part(offsetof(struct xregs_state, header), sizeof(header), - &header, &kbuf, &offset_start, &count); + copy_part(&to, &last, offsetof(struct 
xregs_state, header), + sizeof(header), &header); for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { /* @@ -1131,104 +1113,12 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of if ((header.xfeatures >> i) & 1) { void *src = __raw_xsave_addr(xsave, i); - copy_part(xstate_offsets[i], xstate_sizes[i], - src, &kbuf, &offset_start, &count); + copy_part(&to, &last, xstate_offsets[i], + xstate_sizes[i], src); } } - fill_gap(size_total, &kbuf, &offset_start, &count); - - return 0; -} - -static inline int -__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, unsigned int size_total) -{ - if (!size) - return 0; - - if (offset < size_total) { - unsigned int copy = min(size, size_total - offset); - - if (__copy_to_user(ubuf + offset, data, copy)) - return -EFAULT; - } - return 0; -} - -/* - * Convert from kernel XSAVES compacted format to standard format and copy - * to a user-space buffer. It supports partial copy but pos always starts from - * zero. This is called from xstateregs_get() and there we check the CPU - * has XSAVES. - */ -int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) -{ - unsigned int offset, size; - int ret, i; - struct xstate_header header; - - /* - * Currently copy_regset_to_user() starts from pos 0: - */ - if (unlikely(offset_start != 0)) - return -EFAULT; - - /* - * The destination is a ptrace buffer; we put in only user xstates: - */ - memset(&header, 0, sizeof(header)); - header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= xfeatures_mask_user(); - - /* - * Copy xregs_state->header: - */ - offset = offsetof(struct xregs_state, header); - size = sizeof(header); - - ret = __copy_xstate_to_user(ubuf, &header, offset, size, size_total); - if (ret) - return ret; - - for (i = 0; i < XFEATURE_MAX; i++) { - /* - * Copy only in-use xstates: - */ - if ((header.xfeatures >> i) & 1) { - void *src = __raw_xsave_addr(xsave, i); - - offset = xstate_offsets[i]; - size = xstate_sizes[i]; - - /* The next component has to fit fully into the output buffer: */ - if (offset + size > size_total) - break; - - ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total); - if (ret) - return ret; - } - - } - - if (xfeatures_mxcsr_quirk(header.xfeatures)) { - offset = offsetof(struct fxregs_state, mxcsr); - size = MXCSR_AND_FLAGS_SIZE; - __copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, size_total); - } - - /* - * Fill xsave->i387.sw_reserved value for ptrace frame: - */ - offset = offsetof(struct fxregs_state, sw_reserved); - size = sizeof(xstate_fx_sw_bytes); - - ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, size_total); - if (ret) - return ret; - - return 0; + fill_gap(&to, &last, size); } /* diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b3a7a7e6c8cc..1f9d856feb24 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -417,26 +417,12 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset) static int genregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - if (kbuf) { - unsigned long *k = kbuf; - while (count >= sizeof(*k)) { - *k++ = getreg(target, pos); - count -= sizeof(*k); - pos += sizeof(*k); - } - } else { - unsigned long __user *u = ubuf; - while (count >= sizeof(*u)) { - if (__put_user(getreg(target, pos), 
u++)) - return -EFAULT; - count -= sizeof(*u); - pos += sizeof(*u); - } - } + int reg; + for (reg = 0; to.left; reg++) + membuf_store(&to, getreg(target, reg * sizeof(unsigned long))); return 0; } @@ -698,16 +684,14 @@ static int ioperm_active(struct task_struct *target, static int ioperm_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { unsigned long *iobm = target->thread.io_bitmap_ptr; if (!iobm) return -ENXIO; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - iobm, 0, IO_BITMAP_BYTES); + return membuf_write(&to, iobm, IO_BITMAP_BYTES); } /* @@ -1010,28 +994,15 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val) static int genregs32_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - if (kbuf) { - compat_ulong_t *k = kbuf; - while (count >= sizeof(*k)) { - getreg32(target, pos, k++); - count -= sizeof(*k); - pos += sizeof(*k); - } - } else { - compat_ulong_t __user *u = ubuf; - while (count >= sizeof(*u)) { - compat_ulong_t word; - getreg32(target, pos, &word); - if (__put_user(word, u++)) - return -EFAULT; - count -= sizeof(*u); - pos += sizeof(*u); - } - } + int reg; + for (reg = 0; to.left; reg++) { + u32 val; + getreg32(target, reg * 4, &val); + membuf_store(&to, val); + } return 0; } @@ -1241,25 +1212,25 @@ static struct user_regset x86_64_regsets[] __ro_after_init = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct) / sizeof(long), .size = sizeof(long), .align = sizeof(long), - .get = genregs_get, .set = genregs_set + .regset_get = genregs_get, .set = genregs_set }, [REGSET_FP] = { .core_note_type = NT_PRFPREG, .n = sizeof(struct user_i387_struct) / sizeof(long), .size = sizeof(long), .align = sizeof(long), - .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set + .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, [REGSET_XSTATE] = { .core_note_type = NT_X86_XSTATE, .size = sizeof(u64), .align = sizeof(u64), - .active = xstateregs_active, .get = xstateregs_get, + .active = xstateregs_active, .regset_get = xstateregs_get, .set = xstateregs_set }, [REGSET_IOPERM64] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_LONGS, .size = sizeof(long), .align = sizeof(long), - .active = ioperm_active, .get = ioperm_get + .active = ioperm_active, .regset_get = ioperm_get }, }; @@ -1282,24 +1253,24 @@ static struct user_regset x86_32_regsets[] __ro_after_init = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct32) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .get = genregs32_get, .set = genregs32_set + .regset_get = genregs32_get, .set = genregs32_set }, [REGSET_FP] = { .core_note_type = NT_PRFPREG, .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .active = regset_fpregs_active, .get = fpregs_get, .set = fpregs_set + .active = regset_fpregs_active, .regset_get = fpregs_get, .set = fpregs_set }, [REGSET_XFP] = { .core_note_type = NT_PRXFPREG, .n = sizeof(struct user32_fxsr_struct) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set + .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, [REGSET_XSTATE] = { .core_note_type = NT_X86_XSTATE, .size = sizeof(u64), 
.align = sizeof(u64), - .active = xstateregs_active, .get = xstateregs_get, + .active = xstateregs_active, .regset_get = xstateregs_get, .set = xstateregs_set }, [REGSET_TLS] = { @@ -1308,13 +1279,13 @@ static struct user_regset x86_32_regsets[] __ro_after_init = { .size = sizeof(struct user_desc), .align = sizeof(struct user_desc), .active = regset_tls_active, - .get = regset_tls_get, .set = regset_tls_set + .regset_get = regset_tls_get, .set = regset_tls_set }, [REGSET_IOPERM32] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_BYTES / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .active = ioperm_active, .get = ioperm_get + .active = ioperm_active, .regset_get = ioperm_get }, }; diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 71d3fef1edc9..64a496a0687f 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -256,36 +256,16 @@ int regset_tls_active(struct task_struct *target, } int regset_tls_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { const struct desc_struct *tls; + struct user_desc v; + int pos; - if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || - (pos % sizeof(struct user_desc)) != 0 || - (count % sizeof(struct user_desc)) != 0) - return -EINVAL; - - pos /= sizeof(struct user_desc); - count /= sizeof(struct user_desc); - - tls = &target->thread.tls_array[pos]; - - if (kbuf) { - struct user_desc *info = kbuf; - while (count-- > 0) - fill_user_desc(info++, GDT_ENTRY_TLS_MIN + pos++, - tls++); - } else { - struct user_desc __user *u_info = ubuf; - while (count-- > 0) { - struct user_desc info; - fill_user_desc(&info, GDT_ENTRY_TLS_MIN + pos++, tls++); - if (__copy_to_user(u_info++, &info, sizeof(info))) - return -EFAULT; - } + for (pos = 0, tls = target->thread.tls_array; to.left; pos++, tls++) { + fill_user_desc(&v, GDT_ENTRY_TLS_MIN + pos, tls); + membuf_write(&to, &v, sizeof(v)); } - return 0; } diff --git a/arch/x86/kernel/tls.h b/arch/x86/kernel/tls.h index 3a76e1d3535e..fc39447a0c1a 100644 --- a/arch/x86/kernel/tls.h +++ b/arch/x86/kernel/tls.h @@ -12,7 +12,7 @@ #include extern user_regset_active_fn regset_tls_active; -extern user_regset_get_fn regset_tls_get; +extern user_regset_get2_fn regset_tls_get; extern user_regset_set_fn regset_tls_set; #endif /* _ARCH_X86_KERNEL_TLS_H */ diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index a873da6b46d6..8679a9d6c47f 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -689,12 +689,10 @@ int fpregs_soft_set(struct task_struct *target, int fpregs_soft_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct swregs_state *s387 = &target->thread.fpu.state.soft; const void *space = s387->st_space; - int ret; int offset = (S387->ftop & 7) * 10, other = 80 - offset; RE_ENTRANT_CHECK_OFF; @@ -709,18 +707,11 @@ int fpregs_soft_get(struct task_struct *target, S387->fos |= 0xffff0000; #endif /* PECULIAR_486 */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, s387, 0, - offsetof(struct swregs_state, st_space)); - - /* Copy all registers in stack order. 
*/ - if (!ret) - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - space + offset, 0, other); - if (!ret) - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - space, 0, offset); + membuf_write(&to, s387, offsetof(struct swregs_state, st_space)); + membuf_write(&to, space + offset, other); + membuf_write(&to, space, offset); RE_ENTRANT_CHECK_ON; - return ret; + return 0; } -- Gitee From d6db400d1393352882711ba779f45cb46bf2d3a9 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 20 Jul 2020 06:50:51 -0700 Subject: [PATCH 0089/2161] x86/fpu/xstate: Fix an xstate size check warning with architectural LBRs commit 76d10256a97a7cab72b123d54b766a3c17da658c upstream. An xstate size check warning is triggered on machines which support Architectural LBRs. XSAVE consistency problem, dumping leaves WARNING: CPU: 0 PID: 0 at arch/x86/kernel/fpu/xstate.c:649 fpu__init_system_xstate+0x4d4/0xd0e Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted intel-arch_lbr+ RIP: 0010:fpu__init_system_xstate+0x4d4/0xd0e The xstate size check routine, init_xstate_size(), compares the size retrieved from the hardware with the size of task->fpu, which is calculated by the software. The size from the hardware is the total size of the enabled xstates in XCR0 | IA32_XSS. Architectural LBR state is a dynamic supervisor feature, which sets the corresponding bit in the IA32_XSS at boot time. The size from the hardware includes the size of the Architectural LBR state. However, a dynamic supervisor feature doesn't allocate a buffer in the task->fpu. The size of task->fpu doesn't include the size of the Architectural LBR state. The mismatch will trigger the warning. Three options as below were considered to fix the issue: - Correct the size from the hardware by subtracting the size of the dynamic supervisor features. The purpose of the check is to compare the size CPU told with the size of the XSAVE buffer, which is calculated by the software. If the software mucks with the number from hardware, it removes the value of the check. This option is not a good option. - Prevent the hardware from counting the size of the dynamic supervisor feature by temporarily removing the corresponding bits in IA32_XSS. Two extra MSR writes are required to flip the IA32_XSS. The option is not pretty, but it is workable. The check is only called once at early boot time. The synchronization or context-switching doesn't need to be worried. This option is implemented here. - Remove the check entirely, because the check hasn't found any real problems. The option may be an alternative as option 2. This option is not implemented here. Add a new function, get_xsaves_size_no_dynamic(), which retrieves the total size without the dynamic supervisor features from the hardware. The size will be used to compare with the size of task->fpu. Fixes: f0dccc9da4c0 ("x86/fpu/xstate: Support dynamic supervisor feature for LBR") Reported-by: Chang S. 
Bae Signed-off-by: Kan Liang Signed-off-by: Ingo Molnar Reviewed-by: Dave Hansen Link: https://lore.kernel.org/r/1595253051-75374-1-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 46348f3e3b94..66f88a1b522c 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -643,6 +643,10 @@ static void check_xstate_against_struct(int nr) * This essentially double-checks what the cpu told us about * how large the XSAVE buffer needs to be. We are recalculating * it to be safe. + * + * Dynamic XSAVE features allocate their own buffers and are not + * covered by these checks. Only the size of the buffer for task->fpu + * is checked here. */ static void do_extra_xstate_size_checks(void) { @@ -705,6 +709,33 @@ static unsigned int __init get_xsaves_size(void) return ebx; } +/* + * Get the total size of the enabled xstates without the dynamic supervisor + * features. + */ +static unsigned int __init get_xsaves_size_no_dynamic(void) +{ + u64 mask = xfeatures_mask_dynamic(); + unsigned int size; + + if (!mask) + return get_xsaves_size(); + + /* Disable dynamic features. */ + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); + + /* + * Ask the hardware what size is required of the buffer. + * This is the size required for the task->fpu buffer. + */ + size = get_xsaves_size(); + + /* Re-enable dynamic features so XSAVES will work on them again. */ + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); + + return size; +} + static unsigned int __init get_xsave_size(void) { unsigned int eax, ebx, ecx, edx; @@ -742,7 +773,7 @@ static int __init init_xstate_size(void) xsave_size = get_xsave_size(); if (boot_cpu_has(X86_FEATURE_XSAVES)) - possible_xstate_size = get_xsaves_size(); + possible_xstate_size = get_xsaves_size_no_dynamic(); else possible_xstate_size = xsave_size; -- Gitee From 77454d8d902a1235111d1efee57938f4561f5821 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 15 Sep 2020 09:30:08 -0700 Subject: [PATCH 0090/2161] x86/cpufeatures: Enumerate ENQCMD and ENQCMDS instructions commit ff4f82816dff28ffaaff96d1409bb3811d345514 upstream. Work submission instruction comes in two flavors. ENQCMD can be called both in ring 3 and ring 0 and always uses the contents of a PASID MSR when shipping the command to the device. ENQCMDS allows a kernel driver to submit commands on behalf of a user process. The driver supplies the PASID value in ENQCMDS. There isn't any usage of ENQCMD in the kernel as of now. The CPU feature flag is shown as "enqcmd" in /proc/cpuinfo. 
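[Editorial note, not part of the patch: as a quick illustration of where that flag surfaces, the short user-space check below scans /proc/cpuinfo for "enqcmd". It is only a sketch; note that a later patch in this series force-disables X86_FEATURE_ENQCMD, so the flag is not expected to appear on kernels carrying that change.]

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[4096];
	FILE *f = fopen("/proc/cpuinfo", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* Only the "flags" lines list CPU feature names. */
		if (!strncmp(line, "flags", 5) && strstr(line, " enqcmd")) {
			puts("enqcmd advertised by the kernel");
			fclose(f);
			return 0;
		}
	}
	fclose(f);
	puts("enqcmd not advertised");
	return 0;
}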
Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/1600187413-163670-5-git-send-email-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/cpuid-deps.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 6b3ea62eb62d..0ec98ca7e8d8 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -348,6 +348,7 @@ #define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ #define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ #define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ +#define X86_FEATURE_ENQCMD (16*32+29) /* ENQCMD and ENQCMDS instructions */ #define X86_FEATURE_SGX_LC (16*32+30) /* Software Guard Extensions Launch Control */ /* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 3cbe24ca80ab..3a02707c1f4d 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -69,6 +69,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, + { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, {} }; -- Gitee From 997faad87c941be59635b1a730a44d30c84c5047 Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Tue, 15 Sep 2020 09:30:09 -0700 Subject: [PATCH 0091/2161] x86/fpu/xstate: Add supervisor PASID state for ENQCMD commit b454feb9abc1a9ee876fb84bfea0fc8d726f5bc4 upstream. The ENQCMD instruction reads a PASID from the IA32_PASID MSR. The MSR is stored in the task's supervisor XSAVE* PASID state and is context-switched by XSAVES/XRSTORS. [ bp: Add (in-)definite articles and massage. ] Signed-off-by: Yu-cheng Yu Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/1600187413-163670-6-git-send-email-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/types.h | 11 ++++++++++- arch/x86/include/asm/fpu/xstate.h | 2 +- arch/x86/kernel/fpu/xstate.c | 6 +++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 132e9cc26d60..1c7231ce9856 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -114,7 +114,7 @@ enum xfeature { XFEATURE_Hi16_ZMM, XFEATURE_PT_UNIMPLEMENTED_SO_FAR, XFEATURE_PKRU, - XFEATURE_RSRVD_COMP_10, + XFEATURE_PASID, XFEATURE_RSRVD_COMP_11, XFEATURE_RSRVD_COMP_12, XFEATURE_RSRVD_COMP_13, @@ -134,6 +134,7 @@ enum xfeature { #define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) #define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR) #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) +#define XFEATURE_MASK_PASID (1 << XFEATURE_PASID) #define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) #define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) @@ -236,6 +237,14 @@ struct pkru_state { u32 pad; } __packed; +/* + * State component 10 is supervisor state used for context-switching the + * PASID state. 
+ */ +struct ia32_pasid_state { + u64 pasid; +} __packed; + struct xstate_header { u64 xfeatures; u64 xcomp_bv; diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 14ab815132d4..47a92232d595 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -35,7 +35,7 @@ XFEATURE_MASK_BNDCSR) /* All currently supported supervisor features */ -#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (0) +#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID) /* * A supervisor state component may not always contain valuable information, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 66f88a1b522c..81a8ab8f4208 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -37,6 +37,7 @@ static const char *xfeature_names[] = "AVX-512 ZMM_Hi256" , "Processor Trace (unused)" , "Protection Keys User registers", + "PASID state", "unknown xstate feature" , }; @@ -51,6 +52,7 @@ static short xsave_cpuid_features[] __initdata = { X86_FEATURE_AVX512F, X86_FEATURE_INTEL_PT, X86_FEATURE_PKU, + X86_FEATURE_ENQCMD, }; /* @@ -318,6 +320,7 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); print_xstate_feature(XFEATURE_MASK_PKRU); + print_xstate_feature(XFEATURE_MASK_PASID); } /* @@ -624,6 +627,7 @@ static void check_xstate_against_struct(int nr) XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); + XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state); /* * Make *SURE* to add any feature numbers in below if @@ -633,7 +637,7 @@ static void check_xstate_against_struct(int nr) if ((nr < XFEATURE_YMM) || (nr >= XFEATURE_MAX) || (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || - ((nr >= XFEATURE_RSRVD_COMP_10) && (nr <= XFEATURE_LBR))) { + ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_LBR))) { WARN_ONCE(1, "no structure for xstate: %d\n", nr); XSTATE_WARN_ON(1); } -- Gitee From 80131822fae99552c827262d0ca4347f099ebef2 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 15 Sep 2020 09:30:12 -0700 Subject: [PATCH 0092/2161] x86/cpufeatures: Mark ENQCMD as disabled when configured out commit 1478b99a76534b6c244cfe24fa616280a9441118 upstream. Currently, the ENQCMD feature depends on CONFIG_IOMMU_SUPPORT. Add X86_FEATURE_ENQCMD to the disabled features mask so that it gets disabled when the IOMMU config option above is not enabled. 
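[Editorial note, not part of the patch: to show the idea behind the disabled-features mask in isolation, here is a toy stand-alone model — not the kernel's cpu_feature_enabled()/DISABLED_MASK machinery. When the config-dependent bit is set in the compile-time disabled mask, the feature check constant-folds to false regardless of what the CPU reports. HAVE_IOMMU_SUPPORT is a hypothetical stand-in for CONFIG_IOMMU_SUPPORT.]

#include <stdint.h>
#include <stdio.h>

#define FEATURE_ENQCMD		29	/* bit within the modelled CPUID word */

#ifdef HAVE_IOMMU_SUPPORT		/* hypothetical stand-in for CONFIG_IOMMU_SUPPORT */
# define DISABLE_ENQCMD		0u
#else
# define DISABLE_ENQCMD		(1u << FEATURE_ENQCMD)
#endif

#define DISABLED_MASK		(DISABLE_ENQCMD)

static uint32_t cpuid_word;		/* pretend this came from CPUID */

static int feature_enabled(unsigned int bit)
{
	/* Constant-folds to "always 0" when the bit is compile-time disabled. */
	if (DISABLED_MASK & (1u << bit))
		return 0;
	return (cpuid_word >> bit) & 1;
}

int main(void)
{
	cpuid_word = 1u << FEATURE_ENQCMD;	/* the CPU advertises the feature */
	printf("enqcmd usable: %d\n", feature_enabled(FEATURE_ENQCMD));
	return 0;
}

[Built without -DHAVE_IOMMU_SUPPORT this prints 0 even though the modelled CPUID word advertises the feature, which is the effect the DISABLE_ENQCMD bit has on X86_FEATURE_ENQCMD.]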
Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/1600187413-163670-9-git-send-email-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/disabled-features.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 8473c786632d..3a8d9c71e630 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -62,6 +62,12 @@ # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) #endif +#ifdef CONFIG_IOMMU_SUPPORT +# define DISABLE_ENQCMD 0 +#else +# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) +#endif + #ifdef CONFIG_X86_SGX # define DISABLE_SGX 0 #else @@ -87,7 +93,8 @@ #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 -#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP) +#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ + DISABLE_ENQCMD) #define DISABLED_MASK17 0 #define DISABLED_MASK18 0 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) -- Gitee From a466b4e49546449dcb75e1cbb7f909b3c75fc1ec Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 15 Sep 2020 09:30:10 -0700 Subject: [PATCH 0093/2161] x86/msr-index: Define an IA32_PASID MSR commit f0f2f9feb4ee6f28729e5388da3c03ce1dac077a upstream. The IA32_PASID MSR (0xd93) contains the Process Address Space Identifier (PASID), a 20-bit value. Bit 31 must be set to indicate the value programmed in the MSR is valid. Hardware uses the PASID to identify a process address space and direct responses to the right address space. Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/1600187413-163670-7-git-send-email-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/msr-index.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6a80c597b09f..aba7a12c522b 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -228,6 +228,9 @@ #define MSR_IA32_LASTINTFROMIP 0x000001dd #define MSR_IA32_LASTINTTOIP 0x000001de +#define MSR_IA32_PASID 0x00000d93 +#define MSR_IA32_PASID_VALID BIT_ULL(31) + /* DEBUGCTLMSR bits (others vary by model): */ #define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ #define DEBUGCTLMSR_BTF_SHIFT 1 -- Gitee From 43fd1bd1c860ddc680ca0365e7447dc1d6b2a490 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 15 Sep 2020 09:30:11 -0700 Subject: [PATCH 0094/2161] mm: Add a pasid member to struct mm_struct commit 52ad9bc64c74167466e70e0df4b99ee5ccef0078 upstream. A PASID is shared by all threads in a process. So the logical place to keep track of it is in the mm_struct. Both ARM and x86 would use this PASID. [ bp: Massage commit message. 
] Suggested-by: Christoph Hellwig Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/1600187413-163670-8-git-send-email-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/mm_types.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 34f390f45bd2..cb1e045256c4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -531,6 +531,10 @@ struct mm_struct { atomic_long_t hugetlb_usage; #endif struct work_struct async_put_work; + +#ifdef CONFIG_IOMMU_SUPPORT + u32 pasid; +#endif } __randomize_layout; KABI_RESERVE(1); -- Gitee From b24698c1f8f37ccbbd77380ebf85c8b45216640c Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 15 Sep 2020 09:30:13 -0700 Subject: [PATCH 0095/2161] x86/mmu: Allocate/free a PASID commit 20f0afd1fb3d7d44f4a3db5a4b6e904410862140 upstream. A PASID is allocated for an "mm" the first time any thread binds to an SVA-capable device and is freed from the "mm" when the SVA is unbound by the last thread. It's possible for the "mm" to have different PASID values in different binding/unbinding SVA cycles. The mm's PASID (non-zero for valid PASID or 0 for invalid PASID) is propagated to a per-thread PASID MSR for all threads within the mm through IPI, context switch, or inherited. This is done to ensure that a running thread has the right PASID in the MSR matching the mm's PASID. [ bp: s/SVM/SVA/g; massage. ] Suggested-by: Andy Lutomirski Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/1600187413-163670-10-git-send-email-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 12 ++++++ arch/x86/include/asm/fpu/internal.h | 7 ++++ arch/x86/kernel/fpu/xstate.c | 57 +++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 06e767bca0c1..38f4936045ab 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -73,4 +73,16 @@ extern void switch_fpu_return(void); */ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); +/* + * Tasks that are not using SVA have mm->pasid set to zero to note that they + * will not have the valid bit set in MSR_IA32_PASID while they are running. + */ +#define PASID_DISABLED 0 + +#ifdef CONFIG_IOMMU_SUPPORT +/* Update current's PASID MSR/state by mm's PASID. */ +void update_pasid(void); +#else +static inline void update_pasid(void) { } +#endif #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 90bdf15d2efe..26adefffa39f 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -569,6 +569,13 @@ static inline void switch_fpu_finish(struct fpu *new_fpu) pkru_val = pk->pkru; } __write_pkru(pkru_val); + + /* + * Expensive PASID MSR write will be avoided in update_pasid() because + * TIF_NEED_FPU_LOAD was set. And the PASID state won't be updated + * unless it's different from mm->pasid to reduce overhead. 
+ */ + update_pasid(); } /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 81a8ab8f4208..ca742239978a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1434,3 +1434,60 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, return 0; } #endif /* CONFIG_PROC_PID_ARCH_STATUS */ + +#ifdef CONFIG_IOMMU_SUPPORT +void update_pasid(void) +{ + u64 pasid_state; + u32 pasid; + + if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) + return; + + if (!current->mm) + return; + + pasid = READ_ONCE(current->mm->pasid); + /* Set the valid bit in the PASID MSR/state only for valid pasid. */ + pasid_state = pasid == PASID_DISABLED ? + pasid : pasid | MSR_IA32_PASID_VALID; + + /* + * No need to hold fregs_lock() since the task's fpstate won't + * be changed by others (e.g. ptrace) while the task is being + * switched to or is in IPI. + */ + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + /* The MSR is active and can be directly updated. */ + wrmsrl(MSR_IA32_PASID, pasid_state); + } else { + struct fpu *fpu = ¤t->thread.fpu; + struct ia32_pasid_state *ppasid_state; + struct xregs_state *xsave; + + /* + * The CPU's xstate registers are not currently active. Just + * update the PASID state in the memory buffer here. The + * PASID MSR will be loaded when returning to user mode. + */ + xsave = &fpu->state.xsave; + xsave->header.xfeatures |= XFEATURE_MASK_PASID; + ppasid_state = get_xsave_addr(xsave, XFEATURE_PASID); + /* + * Since XFEATURE_MASK_PASID is set in xfeatures, ppasid_state + * won't be NULL and no need to check its value. + * + * Only update the task's PASID state when it's different + * from the mm's pasid. + */ + if (ppasid_state->pasid != pasid_state) { + /* + * Invalid fpregs so that state restoring will pick up + * the PASID state. + */ + __fpu_invalidate_fpregs_state(fpu); + ppasid_state->pasid = pasid_state; + } + } +} +#endif /* CONFIG_IOMMU_SUPPORT */ -- Gitee From 9e7194b30a28a3a616ad8c5557f1249eab0b380f Mon Sep 17 00:00:00 2001 From: Yejune Deng Date: Fri, 22 Jan 2021 15:19:25 +0800 Subject: [PATCH 0096/2161] x86/fpu/xstate: Use sizeof() instead of a constant commit 0a74d61c7d842b583f33f74d7a9e93201826f4c5 upstream. Use sizeof() instead of a constant in fpstate_sanitize_xstate(). Remove use of the address of the 0th array element of ->st_space and ->xmm_space which is equivalent to the array address itself: No code changed: # arch/x86/kernel/fpu/xstate.o: text data bss dec hex filename 9694 899 4 10597 2965 xstate.o.before 9694 899 4 10597 2965 xstate.o.after md5: 5a43fc70bad8e2a1784f67f01b71aabb xstate.o.before.asm 5a43fc70bad8e2a1784f67f01b71aabb xstate.o.after.asm [ bp: Massage commit message. 
] Signed-off-by: Yejune Deng Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210122071925.41285-1-yejune.deng@gmail.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index ca742239978a..1c2be1411449 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -167,14 +167,14 @@ void fpstate_sanitize_xstate(struct fpu *fpu) fx->fop = 0; fx->rip = 0; fx->rdp = 0; - memset(&fx->st_space[0], 0, 128); + memset(fx->st_space, 0, sizeof(fx->st_space)); } /* * SSE is in init state */ if (!(xfeatures & XFEATURE_MASK_SSE)) - memset(&fx->xmm_space[0], 0, 256); + memset(fx->xmm_space, 0, sizeof(fx->xmm_space)); /* * First two features are FPU and SSE, which above we handled -- Gitee From 2ed3f193c7342611a33e79e7b5a6dcbd569eb5e5 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 29 Oct 2021 11:29:17 +0800 Subject: [PATCH 0097/2161] x86: Fix various typos in comments commit d9f6e12fb0b7fcded0bac34b8293ec46f80dfc33 upstream. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/kernel/fpu/xstate.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8ec156923ebb..b3bfa90125da 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -480,7 +480,7 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c) if (pk) pk->pkru = init_pkru_value; /* - * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE + * Setting X86_CR4_PKE will cause the X86_FEATURE_OSPKE * cpuid bit to be set. We need to ensure that we * update that bit in this CPU's "cpu_info". */ @@ -1367,7 +1367,7 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c) * where GS is unused by the prev and next threads. * * Since neither vendor documents this anywhere that I can see, - * detect it directly instead of hardcoding the choice by + * detect it directly instead of hard-coding the choice by * vendor. * * I've designated AMD's behavior as the "bug" because it's diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1c2be1411449..77c12e9cc045 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -253,7 +253,7 @@ static bool xfeature_enabled(enum xfeature xfeature) static void __init setup_xstate_features(void) { u32 eax, ebx, ecx, edx, i; - /* start at the beginnning of the "extended state" */ + /* start at the beginning of the "extended state" */ unsigned int last_good_offset = offsetof(struct xregs_state, extended_state_area); /* -- Gitee From 5e1d1b68776fd557f1eac7ef6df216c3188b9974 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 18 May 2021 13:03:16 -0700 Subject: [PATCH 0098/2161] x86/signal: Introduce helpers to get the maximum signal frame size commit 939ef713297df2cc910592305aa26af0e87f28ac upstream. Signal frames do not have a fixed format and can vary in size when a number of things change: supported XSAVE features, 32 vs. 64-bit apps, etc. Add support for a runtime method for userspace to dynamically discover how large a signal stack needs to be. Introduce a new variable, max_frame_size, and helper functions for the calculation to be used in a new user interface. Set max_frame_size to a system-wide worst-case value, instead of storing multiple app-specific values. Signed-off-by: Chang S. 
Bae Signed-off-by: Borislav Petkov Reviewed-by: Len Brown Acked-by: Thomas Gleixner Acked-by: H.J. Lu Link: https://lkml.kernel.org/r/20210518200320.17239-3-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/signal.h | 2 ++ arch/x86/include/asm/sigframe.h | 2 ++ arch/x86/kernel/cpu/common.c | 3 ++ arch/x86/kernel/fpu/signal.c | 19 ++++++++++ arch/x86/kernel/signal.c | 59 +++++++++++++++++++++++++++++-- 5 files changed, 83 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 7fb516b6893a..8b6631dffefd 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -29,6 +29,8 @@ unsigned long fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size); +unsigned long fpu__get_fpstate_size(void); + extern void fpu__init_prepare_fx_sw_frame(void); #endif /* _ASM_X86_FPU_SIGNAL_H */ diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index f176114c04d4..e30aa05905f5 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -89,4 +89,6 @@ struct rt_sigframe_x32 { #endif /* CONFIG_X86_64 */ +void __init init_sigframe_size(void); + #endif /* _ASM_X86_SIGFRAME_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b3bfa90125da..d11939181c75 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -57,6 +57,7 @@ #ifdef CONFIG_X86_LOCAL_APIC #include #endif +#include #ifdef CONFIG_NUMA_AWARE_SPINLOCKS #include @@ -1297,6 +1298,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) fpu__init_system(c); + init_sigframe_size(); + #ifdef CONFIG_X86_32 /* * Regardless of whether PCID is enumerated, the SDM says diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 5eb54f1bfd7d..c84386ef54be 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -511,6 +511,25 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, return sp; } + +unsigned long fpu__get_fpstate_size(void) +{ + unsigned long ret = xstate_sigframe_size(); + + /* + * This space is needed on (most) 32-bit kernels, or when a 32-bit + * app is running on a 64-bit kernel. To keep things simple, just + * assume the worst case and always include space for 'freg_state', + * even for 64-bit apps on 64-bit kernels. This wastes a bit of + * space, but keeps the code simple. + */ + if ((IS_ENABLED(CONFIG_IA32_EMULATION) || + IS_ENABLED(CONFIG_X86_32)) && use_fxsr()) + ret += sizeof(struct fregs_state); + + return ret; +} + /* * Prepare the SW reserved portion of the fxsave memory layout, indicating * the presence of the extended state information in the memory layout diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 3c7db2541998..c747468c183a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -219,6 +219,11 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, * Set up a signal frame. */ +/* x86 ABI requires 16-byte alignment */ +#define FRAME_ALIGNMENT 16UL + +#define MAX_FRAME_PADDING (FRAME_ALIGNMENT - 1) + /* * Determine which stack to use.. */ @@ -229,9 +234,9 @@ static unsigned long align_sigframe(unsigned long sp) * Align the stack pointer according to the i386 ABI, * i.e. so that on function entry ((sp + 4) & 15) == 0. 
*/ - sp = ((sp + 4) & -16ul) - 4; + sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4; #else /* !CONFIG_X86_32 */ - sp = round_down(sp, 16) - 8; + sp = round_down(sp, FRAME_ALIGNMENT) - 8; #endif return sp; } @@ -667,6 +672,56 @@ SYSCALL_DEFINE0(rt_sigreturn) return 0; } +/* + * There are four different struct types for signal frame: sigframe_ia32, + * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case + * -- the largest size. It means the size for 64-bit apps is a bit more + * than needed, but this keeps the code simple. + */ +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +# define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct sigframe_ia32) +#else +# define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct rt_sigframe) +#endif + +/* + * The FP state frame contains an XSAVE buffer which must be 64-byte aligned. + * If a signal frame starts at an unaligned address, extra space is required. + * This is the max alignment padding, conservatively. + */ +#define MAX_XSAVE_PADDING 63UL + +/* + * The frame data is composed of the following areas and laid out as: + * + * ------------------------- + * | alignment padding | + * ------------------------- + * | (f)xsave frame | + * ------------------------- + * | fsave header | + * ------------------------- + * | alignment padding | + * ------------------------- + * | siginfo + ucontext | + * ------------------------- + */ + +/* max_frame_size tells userspace the worst case signal stack size. */ +static unsigned long __ro_after_init max_frame_size; + +void __init init_sigframe_size(void) +{ + max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING; + + max_frame_size += fpu__get_fpstate_size() + MAX_XSAVE_PADDING; + + /* Userspace expects an aligned size. */ + max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT); + + pr_info("max sigframe size: %lu\n", max_frame_size); +} + static inline int is_ia32_compat_frame(struct ksignal *ksig) { return IS_ENABLED(CONFIG_IA32_EMULATION) && -- Gitee From 0c2460f7e0abea3ec8c2a0482783a2d0d9509dfd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 May 2021 11:17:30 +0200 Subject: [PATCH 0099/2161] x86/cpufeatures: Force disable X86_FEATURE_ENQCMD and remove update_pasid() commit 9bfecd05833918526cc7357d55e393393440c5fa upstream. While digesting the XSAVE-related horrors which got introduced with the supervisor/user split, the recent addition of ENQCMD-related functionality got on the radar and turned out to be similarly broken. update_pasid(), which is only required when X86_FEATURE_ENQCMD is available, is invoked from two places: 1) From switch_to() for the incoming task 2) Via a SMP function call from the IOMMU/SMV code #1 is half-ways correct as it hacks around the brokenness of get_xsave_addr() by enforcing the state to be 'present', but all the conditionals in that code are completely pointless for that. Also the invocation is just useless overhead because at that point it's guaranteed that TIF_NEED_FPU_LOAD is set on the incoming task and all of this can be handled at return to user space. #2 is broken beyond repair. The comment in the code claims that it is safe to invoke this in an IPI, but that's just wishful thinking. FPU state of a running task is protected by fregs_lock() which is nothing else than a local_bh_disable(). As BH-disabled regions run usually with interrupts enabled the IPI can hit a code section which modifies FPU state and there is absolutely no guarantee that any of the assumptions which are made for the IPI case is true. 
Also the IPI is sent to all CPUs in mm_cpumask(mm), but the IPI is invoked with a NULL pointer argument, so it can hit a completely unrelated task and unconditionally force an update for nothing. Worse, it can hit a kernel thread which operates on a user space address space and set a random PASID for it. The offending commit does not cleanly revert, but it's sufficient to force disable X86_FEATURE_ENQCMD and to remove the broken update_pasid() code to make this dysfunctional all over the place. Anything more complex would require more surgery and none of the related functions outside of the x86 core code are blatantly wrong, so removing those would be overkill. As nothing enables the PASID bit in the IA32_XSS MSR yet, which is required to make this actually work, this cannot result in a regression except for related out of tree train-wrecks, but they are broken already today. Fixes: 20f0afd1fb3d ("x86/mmu: Allocate/free a PASID") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/87mtsd6gr9.ffs@nanos.tec.linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/disabled-features.h | 7 +-- arch/x86/include/asm/fpu/api.h | 6 +-- arch/x86/include/asm/fpu/internal.h | 7 --- arch/x86/kernel/fpu/xstate.c | 57 -------------------- mm/memory.c | 67 ++++++++++++++++++++++++ 5 files changed, 70 insertions(+), 74 deletions(-) diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 3a8d9c71e630..0c6adf09fae5 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -62,11 +62,8 @@ # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) #endif -#ifdef CONFIG_IOMMU_SUPPORT -# define DISABLE_ENQCMD 0 -#else -# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) -#endif +/* Force disable because it's broken beyond repair */ +#define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) #ifdef CONFIG_X86_SGX # define DISABLE_SGX 0 diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 38f4936045ab..8b9bfaad6e66 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -79,10 +79,6 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); */ #define PASID_DISABLED 0 -#ifdef CONFIG_IOMMU_SUPPORT -/* Update current's PASID MSR/state by mm's PASID. */ -void update_pasid(void); -#else static inline void update_pasid(void) { } -#endif + #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 26adefffa39f..90bdf15d2efe 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -569,13 +569,6 @@ static inline void switch_fpu_finish(struct fpu *new_fpu) pkru_val = pk->pkru; } __write_pkru(pkru_val); - - /* - * Expensive PASID MSR write will be avoided in update_pasid() because - * TIF_NEED_FPU_LOAD was set. And the PASID state won't be updated - * unless it's different from mm->pasid to reduce overhead. 
- */ - update_pasid(); } /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 77c12e9cc045..48f8a6e7d80a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1434,60 +1434,3 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, return 0; } #endif /* CONFIG_PROC_PID_ARCH_STATUS */ - -#ifdef CONFIG_IOMMU_SUPPORT -void update_pasid(void) -{ - u64 pasid_state; - u32 pasid; - - if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) - return; - - if (!current->mm) - return; - - pasid = READ_ONCE(current->mm->pasid); - /* Set the valid bit in the PASID MSR/state only for valid pasid. */ - pasid_state = pasid == PASID_DISABLED ? - pasid : pasid | MSR_IA32_PASID_VALID; - - /* - * No need to hold fregs_lock() since the task's fpstate won't - * be changed by others (e.g. ptrace) while the task is being - * switched to or is in IPI. - */ - if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { - /* The MSR is active and can be directly updated. */ - wrmsrl(MSR_IA32_PASID, pasid_state); - } else { - struct fpu *fpu = ¤t->thread.fpu; - struct ia32_pasid_state *ppasid_state; - struct xregs_state *xsave; - - /* - * The CPU's xstate registers are not currently active. Just - * update the PASID state in the memory buffer here. The - * PASID MSR will be loaded when returning to user mode. - */ - xsave = &fpu->state.xsave; - xsave->header.xfeatures |= XFEATURE_MASK_PASID; - ppasid_state = get_xsave_addr(xsave, XFEATURE_PASID); - /* - * Since XFEATURE_MASK_PASID is set in xfeatures, ppasid_state - * won't be NULL and no need to check its value. - * - * Only update the task's PASID state when it's different - * from the mm's pasid. - */ - if (ppasid_state->pasid != pasid_state) { - /* - * Invalid fpregs so that state restoring will pick up - * the PASID state. 
- */ - __fpu_invalidate_fpregs_state(fpu); - ppasid_state->pasid = pasid_state; - } - } -} -#endif /* CONFIG_IOMMU_SUPPORT */ diff --git a/mm/memory.c b/mm/memory.c index 465c6b6bbeb0..8eda1dc25d81 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4325,6 +4325,73 @@ int follow_pte(struct mm_struct *mm, unsigned long address, } EXPORT_SYMBOL_GPL(follow_pte); +int follow_pte_pud_lockless(struct mm_struct *mm, unsigned long address, + pte_t *ptep, pmd_t *pmdp, pud_t *pudp) +{ + pgd_t *pgd, pgdval; + p4d_t *p4d, p4dval; + pud_t *pud, pudval; + pmd_t *pmd, pmdval; + pte_t *pte, pteval; + + pgd = pgd_offset(mm, address); + pgdval = *pgd; + if (pgd_none(pgdval) || unlikely(pgd_bad(pgdval))) + goto out; + + p4d = p4d_offset(&pgdval, address); + p4dval = *p4d; + if (p4d_none(p4dval) || unlikely(p4d_bad(p4dval))) + goto out; + + pud = pud_offset(&p4dval, address); + pudval = *pud; + if (!pud_present(pudval)) + goto out; + BUG_ON(pud_trans_huge(pudval)); + + if (pud_huge(pudval)) { + if (!pudp) + goto out; + + *pudp = pudval; + return 0; + } + + if (pud_none(pudval) || unlikely(pud_bad(pudval))) + goto out; + + pmd = pmd_offset(&pudval, address); + pmdval = *pmd; + if (!pmd_present(pmdval)) + goto out; + BUG_ON(pmd_trans_huge(pmdval)); + + if (pmd_huge(pmdval)) { + if (!pmdp) + goto out; + + *pmdp = pmdval; + return 0; + } + + if (pmd_none(pmdval) || unlikely(pmd_bad(pmdval))) + goto out; + + pte = pte_offset_map(&pmdval, address); + if (!pte) + goto out; + pteval = *pte; + pte_unmap(pte); + if (!pte_present(pteval)) + goto out; + *ptep = pteval; + return 0; +out: + return -EINVAL; +} +EXPORT_SYMBOL(follow_pte_pud_lockless); + /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping -- Gitee From 21a30e4d15ec4937839d50456638f8507a12d3d2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Jun 2021 16:36:18 +0200 Subject: [PATCH 0100/2161] x86/fpu: Prevent state corruption in __fpu__restore_sig() commit 484cea4f362e1eeb5c869abbfb5f90eae6421b38 upstream. The non-compacted slowpath uses __copy_from_user() and copies the entire user buffer into the kernel buffer, verbatim. This means that the kernel buffer may now contain entirely invalid state on which XRSTOR will #GP. validate_user_xstate_header() can detect some of that corruption, but that leaves the onus on callers to clear the buffer. Prior to XSAVES support, it was possible just to reinitialize the buffer, completely, but with supervisor states that is not longer possible as the buffer clearing code split got it backwards. Fixing that is possible but not corrupting the state in the first place is more robust. Avoid corruption of the kernel XSAVE buffer by using copy_user_to_xstate() which validates the XSAVE header contents before copying the actual states to the kernel. copy_user_to_xstate() was previously only called for compacted-format kernel buffers, but it works for both compacted and non-compacted forms. Using it for the non-compacted form is slower because of multiple __copy_from_user() operations, but that cost is less important than robust code in an already slow path. 
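As an aside on the pattern this patch switches to, here is a minimal user-space sketch of "validate the header before any user-supplied bytes reach the live state buffer". It is not kernel code; the struct layout, the DEMO_SUPPORTED mask and the reserved-bit rule are invented for illustration, and memcpy() stands in for copy_from_user().

/* Illustrative only: reject bad input before it can land in @dst. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct demo_header { uint64_t features; uint64_t reserved[2]; };
struct demo_state  { struct demo_header hdr; uint8_t payload[64]; };

#define DEMO_SUPPORTED 0x7ULL            /* assumption: only three feature bits exist */

static int demo_validate_header(const struct demo_header *hdr)
{
    if (hdr->features & ~DEMO_SUPPORTED)
        return -1;                       /* unknown feature bits */
    if (hdr->reserved[0] || hdr->reserved[1])
        return -1;                       /* reserved bits must be zero */
    return 0;
}

/* Copy caller-provided bytes into @dst only after the header checks out. */
static int demo_restore(struct demo_state *dst, const void *ubuf, size_t len)
{
    struct demo_header hdr;

    if (len < sizeof(*dst))
        return -1;
    memcpy(&hdr, ubuf, sizeof(hdr));     /* stands in for copy_from_user() */
    if (demo_validate_header(&hdr))
        return -1;                       /* dst is still untouched here */
    memcpy(dst, ubuf, sizeof(*dst));
    return 0;
}

int main(void)
{
    struct demo_state live = { 0 };
    struct demo_state good = { .hdr = { .features = 0x3 } };
    struct demo_state bad  = { .hdr = { .features = 0xff } };

    printf("good: %d\n", demo_restore(&live, &good, sizeof(good))); /* 0 */
    printf("bad:  %d\n", demo_restore(&live, &bad, sizeof(bad)));   /* -1, live unchanged */
    return 0;
}

The point mirrors the commit message: checking after the verbatim copy leaves the caller holding a corrupted buffer, whereas checking first keeps the target state clean on every error path.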
[ Changelog polished by Dave Hansen ] Fixes: b860eb8dce59 ("x86/fpu/xstate: Define new functions for clearing fpregs and xstates") Reported-by: syzbot+2067e764dbcd10721e2e@syzkaller.appspotmail.com Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Acked-by: Dave Hansen Acked-by: Rik van Riel Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210608144345.611833074@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index c84386ef54be..256cb5885c79 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -409,14 +409,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (use_xsave() && !fx_only) { u64 init_bv = xfeatures_mask_user() & ~user_xfeatures; - if (using_compacted_format()) { - ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); - } else { - ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); - - if (!ret && state_size > offsetof(struct xregs_state, header)) - ret = validate_user_xstate_header(&fpu->state.xsave.header); - } + ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); if (ret) goto out; -- Gitee From 00a383e396f1e41b1893018908fe94d5e39f2742 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 8 Jun 2021 16:36:19 +0200 Subject: [PATCH 0101/2161] x86/fpu: Invalidate FPU state after a failed XRSTOR from a user buffer commit d8778e393afa421f1f117471144f8ce6deb6953a upstream. Both Intel and AMD consider it to be architecturally valid for XRSTOR to fail with #PF but nonetheless change the register state. The actual conditions under which this might occur are unclear [1], but it seems plausible that this might be triggered if one sibling thread unmaps a page and invalidates the shared TLB while another sibling thread is executing XRSTOR on the page in question. __fpu__restore_sig() can execute XRSTOR while the hardware registers are preserved on behalf of a different victim task (using the fpu_fpregs_owner_ctx mechanism), and, in theory, XRSTOR could fail but modify the registers. If this happens, then there is a window in which __fpu__restore_sig() could schedule out and the victim task could schedule back in without reloading its own FPU registers. This would result in part of the FPU state that __fpu__restore_sig() was attempting to load leaking into the victim task's user-visible state. Invalidate preserved FPU registers on XRSTOR failure to prevent this situation from corrupting any state. [1] Frequent readers of the errata lists might imagine "complex microarchitectural conditions". 
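To make the invalidation idea concrete, the following is a deliberately crude user-space caricature of the fpu_fpregs_owner_ctx mechanism, not the kernel implementation: a cached "owner" pointer for a fake register file, where a failed restore from an untrusted buffer drops the cached owner so the real owner reloads its saved copy instead of trusting possibly-clobbered contents. All names and the simulated failure are invented.

#include <stdio.h>
#include <string.h>

struct regs { int r[4]; };
struct task { const char *name; struct regs saved; };

static struct regs hw_regs;                 /* stands in for the CPU registers */
static struct task *hw_owner;               /* whose state hw_regs currently hold */

static int restore_from_buffer(const struct regs *buf)
{
    /* Pretend the restore can fail *after* partially writing hw_regs. */
    hw_regs.r[0] = buf->r[0];
    if (buf->r[1] < 0)
        return -1;                          /* simulated exception mid-restore */
    memcpy(&hw_regs, buf, sizeof(hw_regs));
    return 0;
}

static void switch_in(struct task *t)
{
    if (hw_owner != t) {                    /* cache miss: reload saved state */
        memcpy(&hw_regs, &t->saved, sizeof(hw_regs));
        hw_owner = t;
    }                                        /* cache hit: trust hw_regs as-is */
}

int main(void)
{
    struct task victim = { "victim", { { 1, 2, 3, 4 } } };
    struct regs untrusted = { { 99, -1, 0, 0 } };

    switch_in(&victim);
    if (restore_from_buffer(&untrusted) < 0)
        hw_owner = NULL;                    /* invalidate: forces a reload */
    switch_in(&victim);                     /* reloads instead of keeping 99 */
    printf("r0=%d (expect 1)\n", hw_regs.r[0]);
    return 0;
}

Without the invalidation line, the second switch_in() is a cache hit and the leaked value 99 would be visible to the "victim", which is the leak scenario the patch closes.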
Fixes: 1d731e731c4c ("x86/fpu: Add a fastpath to __fpu__restore_sig()") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Acked-by: Rik van Riel Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210608144345.758116583@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 256cb5885c79..2d597f1f34aa 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -373,6 +373,25 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpregs_unlock(); return 0; } + + /* + * The above did an FPU restore operation, restricted to + * the user portion of the registers, and failed, but the + * microcode might have modified the FPU registers + * nevertheless. + * + * If the FPU registers do not belong to current, then + * invalidate the FPU register state otherwise the task might + * preempt current and return to user space with corrupted + * FPU registers. + * + * In case current owns the FPU registers then no further + * action is required. The fixup below will handle it + * correctly. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + __cpu_invalidate_fpregs_state(); + fpregs_unlock(); } else { /* -- Gitee From 08799c579b36292fdb2478810837b541fb59faf0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 8 Jun 2021 16:36:22 +0200 Subject: [PATCH 0102/2161] x86/fpu: Add address range checks to copy_user_to_xstate() commit f72a249b0ba85564c6bfa94d609a70567485a061 upstream. copy_user_to_xstate() uses __copy_from_user(), which provides a negligible speedup. Fortunately, both call sites are at least almost correct. __fpu__restore_sig() checks access_ok() with xstate_sigframe_size() length and ptrace regset access uses fpu_user_xstate_size. These should be valid upper bounds on the length, so, at worst, this would cause spurious failures and not accesses to kernel memory. Nonetheless, this is far more fragile than necessary and none of these callers are in a hotpath. Use copy_from_user() instead. 
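The change above is simply "prefer the range-checked copy helper over the unchecked one". A rough user-space sketch of that shape, with an invented fixed window standing in for the user address range and an invented checked_copy() helper:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

static uint8_t user_window[256];            /* stands in for the user address range */

/* Refuse to read outside the designated source window, unlike a raw memcpy. */
static int checked_copy(void *dst, const void *src, size_t len)
{
    uintptr_t s  = (uintptr_t)src;
    uintptr_t lo = (uintptr_t)user_window;
    uintptr_t hi = lo + sizeof(user_window);

    if (len > hi - lo || s < lo || s > hi - len)
        return -1;                           /* reject out-of-window reads */
    memcpy(dst, src, len);
    return 0;
}

int main(void)
{
    uint8_t dst[16];

    memset(user_window, 0xab, sizeof(user_window));
    printf("in window:   %d\n", checked_copy(dst, user_window + 8, sizeof(dst)));
    printf("off the end: %d\n", checked_copy(dst, user_window + 250, sizeof(dst)));
    return 0;
}

As in the patch, the checked variant costs a little on a cold path and removes the dependency on every caller having computed a correct upper bound beforehand.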
Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Acked-by: Rik van Riel Link: https://lkml.kernel.org/r/20210608144346.140254130@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 48f8a6e7d80a..0caa2f7bcdb3 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1222,7 +1222,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) offset = offsetof(struct xregs_state, header); size = sizeof(hdr); - if (__copy_from_user(&hdr, ubuf + offset, size)) + if (copy_from_user(&hdr, ubuf + offset, size)) return -EFAULT; if (validate_user_xstate_header(&hdr)) @@ -1237,7 +1237,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) offset = xstate_offsets[i]; size = xstate_sizes[i]; - if (__copy_from_user(dst, ubuf + offset, size)) + if (copy_from_user(dst, ubuf + offset, size)) return -EFAULT; } } @@ -1245,7 +1245,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { offset = offsetof(struct fxregs_state, mxcsr); size = MXCSR_AND_FLAGS_SIZE; - if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) + if (copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) return -EFAULT; } -- Gitee From 7504079e482e1a1abc317354dd058d2acd752ab4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Jun 2021 16:18:24 +0200 Subject: [PATCH 0103/2161] x86/fpu: Preserve supervisor states in sanitize_restored_user_xstate() commit 9301982c424a003c0095bf157154a85bf5322bd0 upstream. sanitize_restored_user_xstate() preserves the supervisor states only when the fx_only argument is zero, which allows unprivileged user space to put supervisor states back into init state. Preserve them unconditionally. [ bp: Fix a typo or two in the text. ] Fixes: 5d6b6a6f9b5c ("x86/fpu/xstate: Update sanitize_restored_xstate() for supervisor xstates") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210618143444.438635017@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 2d597f1f34aa..995339a43c98 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -221,28 +221,18 @@ sanitize_restored_user_xstate(union fpregs_state *state, if (use_xsave()) { /* - * Note: we don't need to zero the reserved bits in the - * xstate_header here because we either didn't copy them at all, - * or we checked earlier that they aren't set. + * Clear all feature bits which are not set in + * user_xfeatures and clear all extended features + * for fx_only mode. */ + u64 mask = fx_only ? XFEATURE_MASK_FPSSE : user_xfeatures; /* - * 'user_xfeatures' might have bits clear which are - * set in header->xfeatures. This represents features that - * were in init state prior to a signal delivery, and need - * to be reset back to the init state. Clear any user - * feature bits which are set in the kernel buffer to get - * them back to the init state. - * - * Supervisor state is unchanged by input from userspace. 
- * Ensure supervisor state bits stay set and supervisor - * state is not modified. + * Supervisor state has to be preserved. The sigframe + * restore can only modify user features, i.e. @mask + * cannot contain them. */ - if (fx_only) - header->xfeatures = XFEATURE_MASK_FPSSE; - else - header->xfeatures &= user_xfeatures | - xfeatures_mask_supervisor(); + header->xfeatures &= mask | xfeatures_mask_supervisor(); } if (use_fxsr()) { -- Gitee From b8a8f68efa669280b1b3262e9c4e1924121d7a4b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:28 +0200 Subject: [PATCH 0104/2161] x86/fpu: Fix copy_xstate_to_kernel() gap handling commit 9625895011d130033d1bc7aac0d77a9bf68ff8a6 upstream. The gap handling in copy_xstate_to_kernel() is wrong when XSAVES is in use. Using init_fpstate for copying the init state of features which are not set in the xstate header is only correct for the legacy area, but not for the extended features area because when XSAVES is in use then init_fpstate is in compacted form which means the xstate offsets which are used to copy from init_fpstate are not valid. Fortunately, this is not a real problem today because all extended features in use have an all-zeros init state, but it is wrong nevertheless and with a potentially dynamically sized init_fpstate this would result in an access outside of the init_fpstate. Fix this by keeping track of the last copied state in the target buffer and explicitly zero it when there is a feature or alignment gap. Use the compacted offset when accessing the extended feature space in init_fpstate. As this is not a functional issue on older kernels this is intentionally not tagged for stable. Fixes: b8be15d58806 ("x86/fpu/xstate: Re-enable XSAVES") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121451.294282032@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 105 ++++++++++++++++++++--------------- 1 file changed, 61 insertions(+), 44 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0caa2f7bcdb3..cc64a4a0ae09 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1081,20 +1081,10 @@ static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) return true; } -static void fill_gap(struct membuf *to, unsigned *last, unsigned offset) +static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, + void *init_xstate, unsigned int size) { - if (*last >= offset) - return; - membuf_write(to, (void *)&init_fpstate.xsave + *last, offset - *last); - *last = offset; -} - -static void copy_part(struct membuf *to, unsigned *last, unsigned offset, - unsigned size, void *from) -{ - fill_gap(to, last, offset); - membuf_write(to, from, size); - *last = offset + size; + membuf_write(to, from_xstate ? 
xstate : init_xstate, size); } /* @@ -1106,10 +1096,10 @@ static void copy_part(struct membuf *to, unsigned *last, unsigned offset, */ void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) { + const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); + struct xregs_state *xinit = &init_fpstate.xsave; struct xstate_header header; - const unsigned off_mxcsr = offsetof(struct fxregs_state, mxcsr); - unsigned size = to.left; - unsigned last = 0; + unsigned int zerofrom; int i; /* @@ -1119,41 +1109,68 @@ void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) header.xfeatures = xsave->header.xfeatures; header.xfeatures &= xfeatures_mask_user(); - if (header.xfeatures & XFEATURE_MASK_FP) - copy_part(&to, &last, 0, off_mxcsr, &xsave->i387); - if (header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM)) - copy_part(&to, &last, off_mxcsr, - MXCSR_AND_FLAGS_SIZE, &xsave->i387.mxcsr); - if (header.xfeatures & XFEATURE_MASK_FP) - copy_part(&to, &last, offsetof(struct fxregs_state, st_space), - 128, &xsave->i387.st_space); - if (header.xfeatures & XFEATURE_MASK_SSE) - copy_part(&to, &last, xstate_offsets[XFEATURE_SSE], - 256, &xsave->i387.xmm_space); - /* - * Fill xsave->i387.sw_reserved value for ptrace frame: - */ - copy_part(&to, &last, offsetof(struct fxregs_state, sw_reserved), - 48, xstate_fx_sw_bytes); - /* - * Copy xregs_state->header: - */ - copy_part(&to, &last, offsetof(struct xregs_state, header), - sizeof(header), &header); + /* Copy FP state up to MXCSR */ + copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387, + &xinit->i387, off_mxcsr); + + /* Copy MXCSR when SSE or YMM are set in the feature mask */ + copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM), + &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr, + MXCSR_AND_FLAGS_SIZE); + + /* Copy the remaining FP state */ + copy_feature(header.xfeatures & XFEATURE_MASK_FP, + &to, &xsave->i387.st_space, &xinit->i387.st_space, + sizeof(xsave->i387.st_space)); + + /* Copy the SSE state - shared with YMM, but independently managed */ + copy_feature(header.xfeatures & XFEATURE_MASK_SSE, + &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space, + sizeof(xsave->i387.xmm_space)); + + /* Zero the padding area */ + membuf_zero(&to, sizeof(xsave->i387.padding)); + + /* Copy xsave->i387.sw_reserved */ + membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved)); + + /* Copy the user space relevant state of @xsave->header */ + membuf_write(&to, &header, sizeof(header)); + + zerofrom = offsetof(struct xregs_state, extended_state_area); for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { /* - * Copy only in-use xstates: + * The ptrace buffer is in non-compacted XSAVE format. + * In non-compacted format disabled features still occupy + * state space, but there is no state to copy from in the + * compacted init_fpstate. The gap tracking will zero this + * later. */ - if ((header.xfeatures >> i) & 1) { - void *src = __raw_xsave_addr(xsave, i); + if (!(xfeatures_mask_user() & BIT_ULL(i))) + continue; - copy_part(&to, &last, xstate_offsets[i], - xstate_sizes[i], src); - } + /* + * If there was a feature or alignment gap, zero the space + * in the destination buffer. 
+ */ + if (zerofrom < xstate_offsets[i]) + membuf_zero(&to, xstate_offsets[i] - zerofrom); + + copy_feature(header.xfeatures & BIT_ULL(i), &to, + __raw_xsave_addr(xsave, i), + __raw_xsave_addr(xinit, i), + xstate_sizes[i]); + /* + * Keep track of the last copied state in the non-compacted + * target buffer for gap zeroing. + */ + zerofrom = xstate_offsets[i] + xstate_sizes[i]; } - fill_gap(&to, &last, size); + + if (to.left) + membuf_zero(&to, to.left); } /* -- Gitee From bd96636b28eb6fb03e1d6009e13a3154b40a4833 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:29 +0200 Subject: [PATCH 0105/2161] x86/pkeys: Revert a5eff7259790 ("x86/pkeys: Add PKRU value to init_fpstate") commit b3607269ff57fd3c9690cb25962c5e4b91a0fd3b upstream. This cannot work and it's unclear how that ever made a difference. init_fpstate.xsave.header.xfeatures is always 0 so get_xsave_addr() will always return a NULL pointer, which will prevent storing the default PKRU value in init_fpstate. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121451.451391598@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/common.c | 5 ----- arch/x86/mm/pkeys.c | 6 ------ 2 files changed, 11 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d11939181c75..2823db1138a8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -465,8 +465,6 @@ static bool pku_disabled; static __always_inline void setup_pku(struct cpuinfo_x86 *c) { - struct pkru_state *pk; - /* check the boot processor, plus compile options for PKU: */ if (!cpu_feature_enabled(X86_FEATURE_PKU)) return; @@ -477,9 +475,6 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c) return; cr4_set_bits(X86_CR4_PKE); - pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); - if (pk) - pk->pkru = init_pkru_value; /* * Setting X86_CR4_PKE will cause the X86_FEATURE_OSPKE * cpuid bit to be set. We need to ensure that we diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index c6f84c0b5d7a..ca77af96033b 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -10,7 +10,6 @@ #include /* boot_cpu_has, ... */ #include /* vma_pkey() */ -#include /* init_fpstate */ int __execute_only_pkey(struct mm_struct *mm) { @@ -154,7 +153,6 @@ static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf, static ssize_t init_pkru_write_file(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { - struct pkru_state *pk; char buf[32]; ssize_t len; u32 new_init_pkru; @@ -177,10 +175,6 @@ static ssize_t init_pkru_write_file(struct file *file, return -EINVAL; WRITE_ONCE(init_pkru_value, new_init_pkru); - pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); - if (!pk) - return -EINVAL; - pk->pkru = new_init_pkru; return count; } -- Gitee From 45ba93cda5bf8fbabce64afc719d91d7eb6bc346 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:30 +0200 Subject: [PATCH 0106/2161] x86/fpu: Mark various FPU state variables __ro_after_init commit ce578f16348b003675c928a1992498b33b515f18 upstream. Nothing modifies these after booting. 
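A rough user-space analogue of the __ro_after_init idea used in the next few patches, assuming Linux/POSIX mmap and mprotect: data written once during setup and then sealed read-only, so a later buggy write traps instead of silently corrupting state. The struct and values are placeholders.

#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

struct settings { unsigned int mask; unsigned int size; };

int main(void)
{
    long pagesz = sysconf(_SC_PAGESIZE);
    struct settings *cfg = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
                                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (cfg == MAP_FAILED)
        return 1;

    cfg->mask = 0x1f;                        /* "boot time" initialization */
    cfg->size = 512;

    if (mprotect(cfg, pagesz, PROT_READ))    /* seal it: read-only after init */
        return 1;

    printf("mask=%#x size=%u\n", cfg->mask, cfg->size);
    /* cfg->mask = 0; would now SIGSEGV rather than go unnoticed */
    munmap(cfg, pagesz);
    return 0;
}

The kernel attribute works at link/boot time rather than via mprotect(), but the intent is the same: once nothing legitimately modifies a variable after init, make illegitimate modification loud.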
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Reviewed-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20210623121451.611751529@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/init.c | 4 ++-- arch/x86/kernel/fpu/xstate.c | 14 +++++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index f8ff895aaf7e..2c36e4454c3d 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -90,7 +90,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) /* * Boot time FPU feature detection code: */ -unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; +unsigned int mxcsr_feature_mask __ro_after_init = 0xffffffffu; EXPORT_SYMBOL_GPL(mxcsr_feature_mask); static void __init fpu__init_system_mxcsr(void) @@ -136,7 +136,7 @@ static void __init fpu__init_system_generic(void) * This is inherent to the XSAVE architecture which puts all state * components into a single, continuous memory block: */ -unsigned int fpu_kernel_xstate_size; +unsigned int fpu_kernel_xstate_size __ro_after_init; EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size); /* Get alignment of the TYPE. */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index cc64a4a0ae09..229674116fb0 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -61,17 +61,21 @@ static short xsave_cpuid_features[] __initdata = { */ u64 xfeatures_mask_all __read_mostly; -static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_comp_offsets[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; /* * The XSAVE area of kernel can be in standard or compacted format; * it is always in standard format for user mode. This is the user * mode standard format size used for signal and ptrace frames. */ -unsigned int fpu_user_xstate_size; +unsigned int fpu_user_xstate_size __ro_after_init; /* * Return whether the system supports a given xfeature. -- Gitee From dfec410ff050b1b497a2929620a575422ccd57d6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:31 +0200 Subject: [PATCH 0107/2161] x86/fpu: Make xfeatures_mask_all __ro_after_init commit 4e8e4313cf81add679e1c57677d689c02e382a67 upstream. Nothing has to modify this after init. But of course there is code which unconditionally masks xfeatures_mask_all on CPU hotplug. This goes unnoticed during boot hotplug because at that point the variable is still RW mapped. This is broken in several ways: 1) Masking this in post init CPU hotplug means that any modification of this state goes unnoticed until actual hotplug happens. 2) If that ever happens then these bogus feature bits are already populated all over the place and the system is in inconsistent state vs. the compacted XSTATE offsets. 
If at all then this has to panic the machine because the inconsistency cannot be undone anymore. Make this a one-time paranoia check in xstate init code and disable xsave when this happens. Reported-by: Kan Liang Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121451.712803952@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 229674116fb0..183a6ae565f9 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -59,7 +59,7 @@ static short xsave_cpuid_features[] __initdata = { * This represents the full set of bits that should ever be set in a kernel * XSAVE buffer, both supervisor and user xstates. */ -u64 xfeatures_mask_all __read_mostly; +u64 xfeatures_mask_all __ro_after_init; static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; @@ -213,19 +213,8 @@ void fpstate_sanitize_xstate(struct fpu *fpu) */ void fpu__init_cpu_xstate(void) { - u64 unsup_bits; - if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all) return; - /* - * Unsupported supervisor xstates should not be found in - * the xfeatures mask. - */ - unsup_bits = xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; - WARN_ONCE(unsup_bits, "x86/fpu: Found unsupported supervisor xstates: 0x%llx\n", - unsup_bits); - - xfeatures_mask_all &= ~XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; cr4_set_bits(X86_CR4_OSXSAVE); @@ -822,6 +811,7 @@ void __init fpu__init_system_xstate(void) { unsigned int eax, ebx, ecx, edx; static int on_boot_cpu __initdata = 1; + u64 xfeatures; int err; int i; @@ -876,6 +866,8 @@ void __init fpu__init_system_xstate(void) } xfeatures_mask_all &= fpu__get_supported_xfeatures_mask(); + /* Store it for paranoia check at the end */ + xfeatures = xfeatures_mask_all; /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); @@ -893,8 +885,18 @@ void __init fpu__init_system_xstate(void) setup_init_fpu_buf(); setup_xstate_comp_offsets(); setup_supervisor_only_offsets(); - print_xstate_offset_size(); + /* + * Paranoia check whether something in the setup modified the + * xfeatures mask. + */ + if (xfeatures != xfeatures_mask_all) { + pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", + xfeatures, xfeatures_mask_all); + goto out_disable; + } + + print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", xfeatures_mask_all, fpu_kernel_xstate_size, -- Gitee From 41c0e59e53161d8d348400b157226be116592ba9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:32 +0200 Subject: [PATCH 0108/2161] x86/fpu: Get rid of fpu__get_supported_xfeatures_mask() commit bae54dc4f353653bbd3e3081754adad05da1d4dd upstream. This function is really not doing what the comment advertises: "Find supported xfeatures based on cpu features and command-line input. This must be called after fpu__init_parse_early_param() is called and xfeatures_mask is enumerated." fpu__init_parse_early_param() does not exist anymore and the function just returns a constant. Remove it and fix the caller and get rid of further references to fpu__init_parse_early_param(). 
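The snapshot-and-verify shape added in the 0107 diff above can be shown in isolation. This is a minimal sketch, not the kernel code: the setup steps, mask value, and the deliberately "buggy" step that clears a bit exist only to trigger the check.

#include <stdint.h>
#include <stdio.h>

static uint64_t features_all = 0x2f;        /* assumed initial mask */

static void setup_offsets(void)  { /* legitimate setup work */ }
static void setup_init_buf(void) { features_all &= ~0x20ULL; /* injected bug */ }

static int feature_init(void)
{
    uint64_t snapshot = features_all;       /* store it for the paranoia check */

    setup_offsets();
    setup_init_buf();

    if (features_all != snapshot) {
        fprintf(stderr, "mask changed from %#llx to %#llx during init, disabling\n",
                (unsigned long long)snapshot,
                (unsigned long long)features_all);
        return -1;                          /* refuse to run inconsistently */
    }
    return 0;
}

int main(void)
{
    printf("init: %d\n", feature_init());   /* warns, then returns -1 */
    return 0;
}

Catching the modification once, at init time, is what lets the kernel drop the repeated masking on every CPU hotplug path.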
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121451.816404717@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 1 - arch/x86/kernel/cpu/common.c | 5 ++--- arch/x86/kernel/fpu/init.c | 11 ----------- arch/x86/kernel/fpu/xstate.c | 4 +++- 4 files changed, 5 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 90bdf15d2efe..4c1e949c1bc5 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -44,7 +44,6 @@ extern void fpu__init_cpu_xstate(void); extern void fpu__init_system(struct cpuinfo_x86 *c); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); -extern u64 fpu__get_supported_xfeatures_mask(void); /* * Debugging facility: diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2823db1138a8..49e40b5fb195 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1704,9 +1704,8 @@ void print_cpu_info(struct cpuinfo_x86 *c) } /* - * clearcpuid= was already parsed in fpu__init_parse_early_param. - * But we need to keep a dummy __setup around otherwise it would - * show up as an environment variable for init. + * clearcpuid= was already parsed in cpu_parse_early_param(). This dummy + * function prevents it from becoming an environment variable for init. */ static __init int setup_clearcpuid(char *arg) { diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 2c36e4454c3d..d708b6a1c077 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -217,17 +217,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) fpu_user_xstate_size = fpu_kernel_xstate_size; } -/* - * Find supported xfeatures based on cpu features and command-line input. - * This must be called after fpu__init_parse_early_param() is called and - * xfeatures_mask is enumerated. - */ -u64 __init fpu__get_supported_xfeatures_mask(void) -{ - return XFEATURE_MASK_USER_SUPPORTED | - XFEATURE_MASK_SUPERVISOR_SUPPORTED; -} - /* Legacy code to initialize eager fpu mode. */ static void __init fpu__init_system_ctx_switch(void) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 183a6ae565f9..ce214b3f385f 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -865,7 +865,9 @@ void __init fpu__init_system_xstate(void) xfeatures_mask_all &= ~BIT_ULL(i); } - xfeatures_mask_all &= fpu__get_supported_xfeatures_mask(); + xfeatures_mask_all &= XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED; + /* Store it for paranoia check at the end */ xfeatures = xfeatures_mask_all; -- Gitee From e505604938a7213410f5681ab6fb86f90a8ff3c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:34 +0200 Subject: [PATCH 0109/2161] x86/fpu: Move inlines where they belong commit e68524456c855e500f0a636adb1aa977e1e0b4d8 upstream. They are only used in fpstate_init() and there is no point to have them in a header just to make reading the code harder. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.023118522@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 14 -------------- arch/x86/kernel/fpu/core.c | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 4c1e949c1bc5..dbc7d05b25a3 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -85,20 +85,6 @@ extern void fpstate_init_soft(struct swregs_state *soft); static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif -static inline void fpstate_init_xstate(struct xregs_state *xsave) -{ - /* - * XRSTORS requires these bits set in xcomp_bv, or it will - * trigger #GP: - */ - xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all; -} - -static inline void fpstate_init_fxstate(struct fxregs_state *fx) -{ - fx->cwd = 0x37f; - fx->mxcsr = MXCSR_DEFAULT; -} extern void fpstate_sanitize_xstate(struct fpu *fpu); #define user_insn(insn, output, input...) \ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 571220ac8bea..9ff467bb4894 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -181,6 +181,21 @@ void fpu__save(struct fpu *fpu) fpregs_unlock(); } +static inline void fpstate_init_xstate(struct xregs_state *xsave) +{ + /* + * XRSTORS requires these bits set in xcomp_bv, or it will + * trigger #GP: + */ + xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all; +} + +static inline void fpstate_init_fxstate(struct fxregs_state *fx) +{ + fx->cwd = 0x37f; + fx->mxcsr = MXCSR_DEFAULT; +} + /* * Legacy x87 fpstate state init: */ -- Gitee From 2b3e67e7ab8047ce431af4a40328c8100394b52c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:36 +0200 Subject: [PATCH 0110/2161] x86/fpu: Sanitize xstateregs_set() commit 43be46e89698a41dbf4fff81a322f4c2ae21b5e2 upstream. xstateregs_set() operates on a stopped task and tries to copy the provided buffer into the task's fpu.state.xsave buffer. Any error while copying or invalid state detected after copying results in wiping the target task's FPU state completely including supervisor states. That's just wrong. The caller supplied invalid data or has a problem with unmapped memory, so there is absolutely no justification to corrupt the target state. Fix this with the following modifications: 1) If data has to be copied from userspace, allocate a buffer and copy from user first. 2) Use copy_kernel_to_xstate() unconditionally so that header checking works correctly. 3) Return on error without corrupting the target state. This prevents corrupting states and lets the caller deal with the problem it caused in the first place. 
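The xstateregs_set() rework boils down to "stage user input in a temporary, then run one validated commit path that never touches the target on error". A minimal sketch of that shape, with invented names and an invented validation rule; memcpy() again stands in for copy_from_user() and the kbuf-or-tmp selection mirrors the kbuf ?: tmpbuf pattern in the diff:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct blob { unsigned int magic; unsigned char data[32]; };

static int commit_validated(struct blob *dst, const struct blob *src)
{
    if (src->magic != 0xfeedu)
        return -1;                          /* reject without touching dst */
    memcpy(dst, src, sizeof(*dst));
    return 0;
}

static int set_state(struct blob *target, const struct blob *kbuf,
                     const void *ubuf, size_t count)
{
    struct blob *tmp = NULL;
    int ret;

    if (count != sizeof(*target))
        return -1;
    if (!kbuf) {
        tmp = malloc(count);
        if (!tmp)
            return -1;
        memcpy(tmp, ubuf, count);           /* stands in for copy_from_user() */
    }
    ret = commit_validated(target, kbuf ? kbuf : tmp);
    free(tmp);
    return ret;
}

int main(void)
{
    struct blob live = { 0 };
    struct blob good = { .magic = 0xfeedu };
    struct blob bad  = { .magic = 1 };

    printf("good: %d\n", set_state(&live, NULL, &good, sizeof(good)));
    printf("bad:  %d (live.magic=%#x)\n",
           set_state(&live, NULL, &bad, sizeof(bad)), live.magic);
    return 0;
}

Because the only writer of the target is the validated commit routine, a copy failure or bad header can no longer wipe or corrupt the existing state, which is exactly the behavior change described above.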
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.214903673@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 4 --- arch/x86/kernel/fpu/regset.c | 42 +++++++++++++------------------ arch/x86/kernel/fpu/xstate.c | 14 ++++++----- 3 files changed, 25 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 47a92232d595..f4c5bad2a89b 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -112,8 +112,4 @@ void copy_supervisor_to_kernel(struct xregs_state *xsave); void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask); - -/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -int validate_user_xstate_header(const struct xstate_header *hdr); - #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index c413756ba89f..7e3886e43e23 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -2,11 +2,13 @@ /* * FPU register's regset abstraction, for ptrace, core dumps, etc. */ +#include +#include + #include #include #include #include -#include /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, @@ -108,10 +110,10 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, const void *kbuf, const void __user *ubuf) { struct fpu *fpu = &target->thread.fpu; - struct xregs_state *xsave; + struct xregs_state *tmpbuf = NULL; int ret; - if (!boot_cpu_has(X86_FEATURE_XSAVE)) + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) return -ENODEV; /* @@ -120,32 +122,22 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if ((pos != 0) || (count < fpu_user_xstate_size)) return -EFAULT; - xsave = &fpu->state.xsave; - - fpu__prepare_write(fpu); + if (!kbuf) { + tmpbuf = vmalloc(count); + if (!tmpbuf) + return -ENOMEM; - if (using_compacted_format()) { - if (kbuf) - ret = copy_kernel_to_xstate(xsave, kbuf); - else - ret = copy_user_to_xstate(xsave, ubuf); - } else { - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - if (!ret) - ret = validate_user_xstate_header(&xsave->header); + if (copy_from_user(tmpbuf, ubuf, count)) { + ret = -EFAULT; + goto out; + } } - /* - * mxcsr reserved bits must be masked to zero for security reasons. 
- */ - xsave->i387.mxcsr &= mxcsr_feature_mask; - - /* - * In case of failure, mark all states as init: - */ - if (ret) - fpstate_init(&fpu->state); + fpu__prepare_write(fpu); + ret = copy_kernel_to_xstate(&fpu->state.xsave, kbuf ?: tmpbuf); +out: + vfree(tmpbuf); return ret; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index ce214b3f385f..8c3d3695e1cd 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -540,7 +540,7 @@ int using_compacted_format(void) } /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -int validate_user_xstate_header(const struct xstate_header *hdr) +static int validate_user_xstate_header(const struct xstate_header *hdr) { /* No unknown or supervisor features may be set */ if (hdr->xfeatures & ~xfeatures_mask_user()) @@ -1182,7 +1182,7 @@ void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) } /* - * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format + * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] format * and copy to the target thread. This is called from xstateregs_set(). */ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) @@ -1229,14 +1229,16 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) */ xsave->header.xfeatures |= hdr.xfeatures; + /* mxcsr reserved bits must be masked to zero for historical reasons. */ + xsave->i387.mxcsr &= mxcsr_feature_mask; + return 0; } /* - * Convert from a ptrace or sigreturn standard-format user-space buffer to - * kernel XSAVES format and copy to the target thread. This is called from - * xstateregs_set(), as well as potentially from the sigreturn() and - * rt_sigreturn() system calls. + * Convert from a sigreturn standard-format user-space buffer to kernel + * XSAVE[S] format and copy to the target thread. This is called from the + * sigreturn() and rt_sigreturn() system calls. */ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) { -- Gitee From d25e2b21fc93d541c8a6afee2d49135448fa7890 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:37 +0200 Subject: [PATCH 0111/2161] x86/fpu: Reject invalid MXCSR values in copy_kernel_to_xstate() commit 947f4947cf00ea1e6d319eb182c64ea51ba4de8d upstream. Instead of masking out reserved bits, check them and reject the provided state as invalid if not zero. Suggested-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.308388343@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 8c3d3695e1cd..6835de93eddd 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1181,6 +1181,19 @@ void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) membuf_zero(&to, to.left); } +static inline bool mxcsr_valid(struct xstate_header *hdr, const u32 *mxcsr) +{ + u64 mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; + + /* Only check if it is in use */ + if (hdr->xfeatures & mask) { + /* Reserved bits in MXCSR must be zero. */ + if (*mxcsr & ~mxcsr_feature_mask) + return false; + } + return true; +} + /* * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] format * and copy to the target thread. 
This is called from xstateregs_set(). @@ -1199,6 +1212,9 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) if (validate_user_xstate_header(&hdr)) return -EINVAL; + if (!mxcsr_valid(&hdr, kbuf + offsetof(struct fxregs_state, mxcsr))) + return -EINVAL; + for (i = 0; i < XFEATURE_MAX; i++) { u64 mask = ((u64)1 << i); @@ -1229,9 +1245,6 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) */ xsave->header.xfeatures |= hdr.xfeatures; - /* mxcsr reserved bits must be masked to zero for historical reasons. */ - xsave->i387.mxcsr &= mxcsr_feature_mask; - return 0; } -- Gitee From 5b4feccb4574c256b885bf1aa427af49f4c3143e Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 23 Jun 2021 14:01:38 +0200 Subject: [PATCH 0112/2161] x86/fpu: Simplify PTRACE_GETREGS code commit 3a3351126ee8f1f1c86c4c79c60a650c1da89733 upstream. ptrace() has interfaces that let a ptracer inspect a ptracee's register state. This includes XSAVE state. The ptrace() ABI includes a hardware-format XSAVE buffer for both the SETREGS and GETREGS interfaces. In the old days, the kernel buffer and the ptrace() ABI buffer were the same boring non-compacted format. But, since the advent of supervisor states and the compacted format, the kernel buffer has diverged from the format presented in the ABI. This leads to two paths in the kernel: 1. Effectively a verbatim copy_to_user() which just copies the kernel buffer out to userspace. This is used when the kernel buffer is kept in the non-compacted form which means that it shares a format with the ptrace ABI. 2. A one-state-at-a-time path: copy_xstate_to_kernel(). This is theoretically slower since it does a bunch of piecemeal copies. Remove the verbatim copy case. Speed probably does not matter in this path, and the vast majority of new hardware will use the one-state-at-a-time path anyway. This ensures greater testing for the "slow" path. This also makes enabling PKRU in this interface easier since a single path can be patched instead of two. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Andy Lutomirski Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.408457100@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/regset.c | 24 +++--------------------- arch/x86/kernel/fpu/xstate.c | 6 +++--- 2 files changed, 6 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 7e3886e43e23..dcdb07f4bdf0 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -77,32 +77,14 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct fpu *fpu = &target->thread.fpu; - struct xregs_state *xsave; - if (!boot_cpu_has(X86_FEATURE_XSAVE)) + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) return -ENODEV; - xsave = &fpu->state.xsave; - fpu__prepare_read(fpu); - if (using_compacted_format()) { - copy_xstate_to_kernel(to, xsave); - return 0; - } else { - fpstate_sanitize_xstate(fpu); - /* - * Copy the 48 bytes defined by the software into the xsave - * area in the thread struct, so that we can copy the whole - * area to user using one user_regset_copyout(). - */ - memcpy(&xsave->i387.sw_reserved, xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); - - /* - * Copy the xstate memory layout. 
- */ - return membuf_write(&to, xsave, fpu_user_xstate_size); - } + copy_xstate_to_kernel(to, &fpu->state.xsave); + return 0; } int xstateregs_set(struct task_struct *target, const struct user_regset *regset, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 6835de93eddd..b008838d430e 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1096,11 +1096,11 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, } /* - * Convert from kernel XSAVES compacted format to standard format and copy - * to a kernel-space ptrace buffer. + * Convert from kernel XSAVE or XSAVES compacted format to UABI + * non-compacted format and copy to a kernel-space ptrace buffer. * * It supports partial copy but pos always starts from zero. This is called - * from xstateregs_get() and there we check the CPU has XSAVES. + * from xstateregs_get() and there we check the CPU has XSAVE. */ void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) { -- Gitee From fc1fc7db0bfdbb2979c1b4c98768f89cec9d9dea Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 23 Jun 2021 14:01:39 +0200 Subject: [PATCH 0113/2161] x86/fpu: Rewrite xfpregs_set() commit 6164331d15f7d912fb9369245368e9564ea49813 upstream. xfpregs_set() was incomprehensible. Almost all of the complexity was due to trying to support nonsensically sized writes or -EFAULT errors that would have partially or completely overwritten the destination before failing. Nonsensically sized input would only have been possible using PTRACE_SETREGSET on REGSET_XFP. Fortunately, it appears (based on Debian code search results) that no one uses that API at all, let alone with the wrong sized buffer. Failed user access can be handled more cleanly by first copying to kernel memory. Just rewrite it to require sensible input. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.504234607@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/regset.c | 37 ++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index dcdb07f4bdf0..682c3f5218cd 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -47,30 +47,39 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, const void *kbuf, const void __user *ubuf) { struct fpu *fpu = &target->thread.fpu; + struct user32_fxsr_struct newstate; int ret; - if (!boot_cpu_has(X86_FEATURE_FXSR)) + BUILD_BUG_ON(sizeof(newstate) != sizeof(struct fxregs_state)); + + if (!cpu_feature_enabled(X86_FEATURE_FXSR)) return -ENODEV; + /* No funny business with partial or oversized writes is permitted. */ + if (pos != 0 || count != sizeof(newstate)) + return -EINVAL; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate, 0, -1); + if (ret) + return ret; + + /* Mask invalid MXCSR bits (for historical reasons). */ + newstate.mxcsr &= mxcsr_feature_mask; + fpu__prepare_write(fpu); - fpstate_sanitize_xstate(fpu); - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &fpu->state.fxsave, 0, -1); + /* Copy the state */ + memcpy(&fpu->state.fxsave, &newstate, sizeof(newstate)); - /* - * mxcsr reserved bits must be masked to zero for security reasons. 
- */ - fpu->state.fxsave.mxcsr &= mxcsr_feature_mask; + /* Clear xmm8..15 */ + BUILD_BUG_ON(sizeof(fpu->state.fxsave.xmm_space) != 16 * 16); + memset(&fpu->state.fxsave.xmm_space[8], 0, 8 * 16); - /* - * update the header bits in the xsave header, indicating the - * presence of FP and SSE state. - */ - if (boot_cpu_has(X86_FEATURE_XSAVE)) + /* Mark FP and SSE as in use when XSAVE is enabled */ + if (use_xsave()) fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; - return ret; + return 0; } int xstateregs_get(struct task_struct *target, const struct user_regset *regset, -- Gitee From 0c095400b2b618f0132046d67f1ee2708cec290e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 23 Jun 2021 14:01:40 +0200 Subject: [PATCH 0114/2161] x86/fpu: Fail ptrace() requests that try to set invalid MXCSR values commit 145e9e0d8c6fada4a40f9fc65b34658077874d9c upstream. There is no benefit from accepting and silently changing an invalid MXCSR value supplied via ptrace(). Instead, return -EINVAL on invalid input. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.613614842@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/regset.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 682c3f5218cd..24d923b72dc1 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -63,8 +63,9 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - /* Mask invalid MXCSR bits (for historical reasons). */ - newstate.mxcsr &= mxcsr_feature_mask; + /* Do not allow an invalid MXCSR value. */ + if (newstate.mxcsr & ~mxcsr_feature_mask) + return -EINVAL; fpu__prepare_write(fpu); -- Gitee From 6379cb1db382240c4aadec771ccc650da789b546 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 23 Jun 2021 14:01:41 +0200 Subject: [PATCH 0115/2161] x86/fpu: Clean up fpregs_set() commit da53f60bb86e60830932926cf1093953a811912c upstream. fpregs_set() has unnecessary complexity to support short or nonzero-offset writes and to handle the case in which a copy from userspace overwrites some of the target buffer and then fails. Support for partial writes is useless -- just require that the write has offset 0 and the correct size, and copy into a temporary kernel buffer to avoid clobbering the state if the user access fails. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.710467587@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/regset.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 24d923b72dc1..2d15f381186d 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -304,31 +304,32 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - fpu__prepare_write(fpu); - fpstate_sanitize_xstate(fpu); + /* No funny business with partial or oversized writes is permitted. 
*/ + if (pos != 0 || count != sizeof(struct user_i387_ia32_struct)) + return -EINVAL; - if (!boot_cpu_has(X86_FEATURE_FPU)) + if (!cpu_feature_enabled(X86_FEATURE_FPU)) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); - if (!boot_cpu_has(X86_FEATURE_FXSR)) - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &fpu->state.fsave, 0, - -1); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); + if (ret) + return ret; - if (pos > 0 || count < sizeof(env)) - convert_from_fxsr(&env, target); + fpu__prepare_write(fpu); - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); - if (!ret) - convert_to_fxsr(&target->thread.fpu.state.fxsave, &env); + if (cpu_feature_enabled(X86_FEATURE_FXSR)) + convert_to_fxsr(&fpu->state.fxsave, &env); + else + memcpy(&fpu->state.fsave, &env, sizeof(env)); /* - * update the header bit in the xsave header, indicating the + * Update the header bit in the xsave header, indicating the * presence of FP. */ - if (boot_cpu_has(X86_FEATURE_XSAVE)) + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP; - return ret; + + return 0; } #endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ -- Gitee From 51446e3509890c71dcb2163e575fa024ef72e564 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:42 +0200 Subject: [PATCH 0116/2161] x86/fpu: Make copy_xstate_to_kernel() usable for [x]fpregs_get() commit eb6f51723f03c9a1c098ed196a31a03e626b9fb6 upstream. When xsave with init state optimization is used then a component's state in the task's xsave buffer can be stale when the corresponding feature bit is not set. fpregs_get() and xfpregs_get() invoke fpstate_sanitize_xstate() to update the task's xsave buffer before retrieving the FX or FP state. That's just duplicated code as copy_xstate_to_kernel() already handles this correctly. Add a copy mode argument to the function which allows to restrict the state copy to the FP and SSE features. Also rename the function to copy_xstate_to_uabi_buf() so the name reflects what it is doing. 
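The copy-mode idea introduced for copy_xstate_to_uabi_buf() can be reduced to a tiny sketch: one serialization routine, with an enum selecting how much of the feature mask is exposed. The feature bits and mode names below are invented stand-ins, not the kernel's definitions.

#include <stdint.h>
#include <stdio.h>

#define F_FP   (1u << 0)
#define F_SSE  (1u << 1)
#define F_EXT  (1u << 2)   /* stands in for the extended user features */

enum copy_mode { COPY_FP, COPY_FX, COPY_FULL };

static uint32_t mask_for_mode(uint32_t features, enum copy_mode mode)
{
    switch (mode) {
    case COPY_FP:   return features & F_FP;
    case COPY_FX:   return features & (F_FP | F_SSE);
    case COPY_FULL: return features & (F_FP | F_SSE | F_EXT);
    }
    return 0;
}

int main(void)
{
    uint32_t features = F_FP | F_SSE | F_EXT;

    printf("FP:   %#x\n", mask_for_mode(features, COPY_FP));
    printf("FX:   %#x\n", mask_for_mode(features, COPY_FX));
    printf("FULL: %#x\n", mask_for_mode(features, COPY_FULL));
    return 0;
}

Restricting the mask up front is what lets the FP and FX regset paths reuse the same copy routine instead of each re-sanitizing the buffer.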
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.805327286@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 12 +++++++-- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/xstate.c | 42 +++++++++++++++++++++++-------- 3 files changed, 42 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index f4c5bad2a89b..f03e6ed3ecca 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -104,12 +104,20 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); const void *get_xsave_field_ptr(int xfeature_nr); int using_compacted_format(void); int xfeature_size(int xfeature_nr); -struct membuf; -void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave); int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); void copy_supervisor_to_kernel(struct xregs_state *xsave); void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask); +enum xstate_copy_mode { + XSTATE_COPY_FP, + XSTATE_COPY_FX, + XSTATE_COPY_XSAVE, +}; + +struct membuf; +void copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, + enum xstate_copy_mode mode); + #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 2d15f381186d..fbf1de5d5302 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -93,7 +93,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, fpu__prepare_read(fpu); - copy_xstate_to_kernel(to, &fpu->state.xsave); + copy_xstate_to_uabi_buf(to, &fpu->state.xsave, XSTATE_COPY_XSAVE); return 0; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b008838d430e..069f95fecd3e 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1095,14 +1095,20 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, membuf_write(to, from_xstate ? xstate : init_xstate, size); } -/* - * Convert from kernel XSAVE or XSAVES compacted format to UABI - * non-compacted format and copy to a kernel-space ptrace buffer. +/** + * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer + * @to: membuf descriptor + * @xsave: The kernel xstate buffer to copy from + * @copy_mode: The requested copy mode + * + * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming + * format, i.e. from the kernel internal hardware dependent storage format + * to the requested @mode. UABI XSTATE is always uncompacted! * - * It supports partial copy but pos always starts from zero. This is called - * from xstateregs_get() and there we check the CPU has XSAVE. + * It supports partial copy but @to.pos always starts from zero. 
*/ -void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) +void copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, + enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); struct xregs_state *xinit = &init_fpstate.xsave; @@ -1110,12 +1116,22 @@ void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) unsigned int zerofrom; int i; - /* - * The destination is a ptrace buffer; we put in only user xstates: - */ - memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= xfeatures_mask_user(); + + /* Mask out the feature bits depending on copy mode */ + switch (copy_mode) { + case XSTATE_COPY_FP: + header.xfeatures &= XFEATURE_MASK_FP; + break; + + case XSTATE_COPY_FX: + header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE; + break; + + case XSTATE_COPY_XSAVE: + header.xfeatures &= xfeatures_mask_user(); + break; + } /* Copy FP state up to MXCSR */ copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387, @@ -1136,6 +1152,9 @@ void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space, sizeof(xsave->i387.xmm_space)); + if (copy_mode != XSTATE_COPY_XSAVE) + goto out; + /* Zero the padding area */ membuf_zero(&to, sizeof(xsave->i387.padding)); @@ -1177,6 +1196,7 @@ void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) zerofrom = xstate_offsets[i] + xstate_sizes[i]; } +out: if (to.left) membuf_zero(&to, to.left); } -- Gitee From 072b19f008bc6e03ac5166bc922c6afd4530d16d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:43 +0200 Subject: [PATCH 0117/2161] x86/fpu: Use copy_xstate_to_uabi_buf() in xfpregs_get() commit adc997b3d66d1cfa8c15a7dbafdaef239a51b5db upstream. Use the new functionality of copy_xstate_to_uabi_buf() to retrieve the FX state when XSAVE* is in use. This avoids overwriting the FPU state buffer with fpstate_sanitize_xstate() which is error prone and duplicated code. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.901736860@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/regset.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index fbf1de5d5302..9c320763dca7 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -33,13 +33,18 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, { struct fpu *fpu = &target->thread.fpu; - if (!boot_cpu_has(X86_FEATURE_FXSR)) + if (!cpu_feature_enabled(X86_FEATURE_FXSR)) return -ENODEV; fpu__prepare_read(fpu); - fpstate_sanitize_xstate(fpu); - return membuf_write(&to, &fpu->state.fxsave, sizeof(struct fxregs_state)); + if (!use_xsave()) { + return membuf_write(&to, &fpu->state.fxsave, + sizeof(fpu->state.fxsave)); + } + + copy_xstate_to_uabi_buf(to, &fpu->state.xsave, XSTATE_COPY_FX); + return 0; } int xfpregs_set(struct task_struct *target, const struct user_regset *regset, -- Gitee From 025bb3ac2cfe05884ba20e39cecf8f56dc54d4a6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:44 +0200 Subject: [PATCH 0118/2161] x86/fpu: Use copy_xstate_to_uabi_buf() in fpregs_get() commit adc997b3d66d1cfa8c15a7dbafdaef239a51b5db upstream. 
Use the new functionality of copy_xstate_to_uabi_buf() to retrieve the FX state when XSAVE* is in use. This avoids to overwrite the FPU state buffer with fpstate_sanitize_xstate() which is error prone and duplicated code. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.014441775@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/regset.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 9c320763dca7..3bbb6c0e6f45 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -210,10 +210,10 @@ static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave) * FXSR floating point environment conversions. */ -void -convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) +static void __convert_from_fxsr(struct user_i387_ia32_struct *env, + struct task_struct *tsk, + struct fxregs_state *fxsave) { - struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave; struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; int i; @@ -247,6 +247,12 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) memcpy(&to[i], &from[i], sizeof(to[0])); } +void +convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) +{ + __convert_from_fxsr(env, tsk, &tsk->thread.fpu.state.fxsave); +} + void convert_to_fxsr(struct fxregs_state *fxsave, const struct user_i387_ia32_struct *env) @@ -279,25 +285,29 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, { struct fpu *fpu = &target->thread.fpu; struct user_i387_ia32_struct env; + struct fxregs_state fxsave, *fx; fpu__prepare_read(fpu); - if (!boot_cpu_has(X86_FEATURE_FPU)) + if (!cpu_feature_enabled(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, to); - if (!boot_cpu_has(X86_FEATURE_FXSR)) { + if (!cpu_feature_enabled(X86_FEATURE_FXSR)) { return membuf_write(&to, &fpu->state.fsave, sizeof(struct fregs_state)); } - fpstate_sanitize_xstate(fpu); + if (use_xsave()) { + struct membuf mb = { .p = &fxsave, .left = sizeof(fxsave) }; - if (to.left == sizeof(env)) { - convert_from_fxsr(to.p, target); - return 0; + /* Handle init state optimized xstate correctly */ + copy_xstate_to_uabi_buf(mb, &fpu->state.xsave, XSTATE_COPY_FP); + fx = &fxsave; + } else { + fx = &fpu->state.fxsave; } - convert_from_fxsr(&env, target); + __convert_from_fxsr(&env, target, fx); return membuf_write(&to, &env, sizeof(env)); } -- Gitee From be7204e0836d19e339a68dbdab432078da769c99 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:30 +0200 Subject: [PATCH 0119/2161] x86/fpu: Return proper error codes from user access functions commit aee8c67a4faa40a8df4e79316dbfc92d123989c1 upstream. When *RSTOR from user memory raises an exception, there is no way to differentiate them. That's bad because it forces the slow path even when the failure was not a fault. If the operation raised eg. #GP then going through the slow path is pointless. Use _ASM_EXTABLE_FAULT() which stores the trap number and let the exception fixup return the negated trap number as error. This allows to separate the fast path and let it handle faults directly and avoid the slow path for all other exceptions. 
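To make the new error convention concrete, here is a minimal stand-alone user-space sketch (an illustration only, not the kernel's user_insn()/XSTATE_OP() macros) of how a caller can tell #PF apart from other traps once the fixup hands back the negated trap number; note that -X86_TRAP_PF happens to equal -EFAULT (-14), which is what makes the convention convenient:

#include <errno.h>
#include <stdio.h>

#define X86_TRAP_GP 13 /* general protection fault */
#define X86_TRAP_PF 14 /* page fault */

/* Stand-in for the fixed-up *RSTOR: 0 on success, else the negated trap number. */
static int fake_xrstor_from_user(int raised_trap)
{
	return raised_trap ? -raised_trap : 0;
}

static int restore_from_sigframe(int raised_trap)
{
	int err = fake_xrstor_from_user(raised_trap);

	if (err == -X86_TRAP_PF)
		return -EFAULT; /* a genuine fault: retrying via the slow path makes sense */
	if (err)
		return -EINVAL; /* e.g. #GP: the slow path cannot fix this, fail fast */
	return 0;
}

int main(void)
{
	printf("#PF -> %d, #GP -> %d, success -> %d\n",
	       restore_from_sigframe(X86_TRAP_PF),
	       restore_from_sigframe(X86_TRAP_GP),
	       restore_from_sigframe(0));
	return 0;
}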
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121457.601480369@linutronix.de Signed-off-by: Sasha Levin Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index dbc7d05b25a3..d77fea20e272 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -87,6 +87,7 @@ static inline void fpstate_init_soft(struct swregs_state *soft) {} extern void fpstate_sanitize_xstate(struct fpu *fpu); +/* Returns 0 or the negated trap number, which results in -EFAULT for #PF */ #define user_insn(insn, output, input...) \ ({ \ int err; \ @@ -94,14 +95,14 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu); might_fault(); \ \ asm volatile(ASM_STAC "\n" \ - "1:" #insn "\n\t" \ + "1: " #insn "\n" \ "2: " ASM_CLAC "\n" \ ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ + "3: negl %%eax\n" \ " jmp 2b\n" \ ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ + _ASM_EXTABLE_FAULT(1b, 3b) \ + : [err] "=a" (err), output \ : "0"(0), input); \ err; \ }) @@ -203,16 +204,20 @@ static inline void fxsave(struct fxregs_state *fx) #define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" #define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" +/* + * After this @err contains 0 on success or the negated trap number when + * the operation raises an exception. For faults this results in -EFAULT. + */ #define XSTATE_OP(op, st, lmask, hmask, err) \ asm volatile("1:" op "\n\t" \ "xor %[err], %[err]\n" \ "2:\n\t" \ ".pushsection .fixup,\"ax\"\n\t" \ - "3: movl $-2,%[err]\n\t" \ + "3: negl %%eax\n\t" \ "jmp 2b\n\t" \ ".popsection\n\t" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err) \ + _ASM_EXTABLE_FAULT(1b, 3b) \ + : [err] "=a" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") -- Gitee From b3ab566e6141f04d23588d5b01f86f7adcdee585 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:45 +0200 Subject: [PATCH 0120/2161] x86/fpu: Remove fpstate_sanitize_xstate() commit afac9e894364418731d1d7e66c1118b31fd130e8 upstream. No more users. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.124819167@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 2 - arch/x86/kernel/fpu/xstate.c | 79 ----------------------------- 2 files changed, 81 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index d77fea20e272..72ee335bc6fa 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -85,8 +85,6 @@ extern void fpstate_init_soft(struct swregs_state *soft); static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif -extern void fpstate_sanitize_xstate(struct fpu *fpu); - /* Returns 0 or the negated trap number, which results in -EFAULT for #PF */ #define user_insn(insn, output, input...) 
\ ({ \ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 069f95fecd3e..109c3108c6e9 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -128,85 +128,6 @@ static bool xfeature_is_supervisor(int xfeature_nr) return ecx & 1; } -/* - * When executing XSAVEOPT (or other optimized XSAVE instructions), if - * a processor implementation detects that an FPU state component is still - * (or is again) in its initialized state, it may clear the corresponding - * bit in the header.xfeatures field, and can skip the writeout of registers - * to the corresponding memory layout. - * - * This means that when the bit is zero, the state component might still contain - * some previous - non-initialized register state. - * - * Before writing xstate information to user-space we sanitize those components, - * to always ensure that the memory layout of a feature will be in the init state - * if the corresponding header bit is zero. This is to ensure that user-space doesn't - * see some stale state in the memory layout during signal handling, debugging etc. - */ -void fpstate_sanitize_xstate(struct fpu *fpu) -{ - struct fxregs_state *fx = &fpu->state.fxsave; - int feature_bit; - u64 xfeatures; - - if (!use_xsaveopt()) - return; - - xfeatures = fpu->state.xsave.header.xfeatures; - - /* - * None of the feature bits are in init state. So nothing else - * to do for us, as the memory layout is up to date. - */ - if ((xfeatures & xfeatures_mask_all) == xfeatures_mask_all) - return; - - /* - * FP is in init state - */ - if (!(xfeatures & XFEATURE_MASK_FP)) { - fx->cwd = 0x37f; - fx->swd = 0; - fx->twd = 0; - fx->fop = 0; - fx->rip = 0; - fx->rdp = 0; - memset(fx->st_space, 0, sizeof(fx->st_space)); - } - - /* - * SSE is in init state - */ - if (!(xfeatures & XFEATURE_MASK_SSE)) - memset(fx->xmm_space, 0, sizeof(fx->xmm_space)); - - /* - * First two features are FPU and SSE, which above we handled - * in a special way already: - */ - feature_bit = 0x2; - xfeatures = (xfeatures_mask_user() & ~xfeatures) >> 2; - - /* - * Update all the remaining memory layouts according to their - * standard xstate layout, if their header bit is in the init - * state: - */ - while (xfeatures) { - if (xfeatures & 0x1) { - int offset = xstate_comp_offsets[feature_bit]; - int size = xstate_sizes[feature_bit]; - - memcpy((void *)fx + offset, - (void *)&init_fpstate.xsave + offset, - size); - } - - xfeatures >>= 1; - feature_bit++; - } -} - /* * Enable the extended processor state save/restore feature. * Called once per CPU onlining. -- Gitee From c9ff3066fc48f319a64426a04d89ed7b80fcd98c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:46 +0200 Subject: [PATCH 0121/2161] x86/fpu/regset: Move fpu__read_begin() into regset commit 5a32fac8dbe8adc08c10e2c8770c95aebfc627cd upstream. The function can only be used from the regset get() callbacks safely. So there is no reason to have it globally exposed. 
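Since the series keeps routing these regset get() callbacks through a struct membuf destination, a stand-alone re-implementation of that little helper may help with orientation. This is an illustrative sketch only; the real struct membuf and its helpers live in the kernel's regset header:

#include <stdio.h>
#include <string.h>

/* Simplified membuf: a destination cursor plus the space left behind it. */
struct membuf {
	void *p;
	size_t left;
};

static void membuf_write(struct membuf *to, const void *src, size_t size)
{
	if (size > to->left)
		size = to->left; /* clamp: a partial copy, never an overflow */
	memcpy(to->p, src, size);
	to->p = (char *)to->p + size;
	to->left -= size;
}

static void membuf_zero(struct membuf *to, size_t size)
{
	if (size > to->left)
		size = to->left;
	memset(to->p, 0, size);
	to->p = (char *)to->p + size;
	to->left -= size;
}

int main(void)
{
	char buf[16];
	struct membuf to = { .p = buf, .left = sizeof(buf) };
	const char state[8] = "FPSTATE";

	membuf_write(&to, state, sizeof(state)); /* copy the "register state" */
	membuf_zero(&to, to.left);               /* pad the remainder, as the regset code does */
	printf("wrote %zu bytes, %zu left\n", sizeof(buf) - to.left, to.left);
	return 0;
}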
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.234942936@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 1 - arch/x86/kernel/fpu/core.c | 20 -------------------- arch/x86/kernel/fpu/regset.c | 22 +++++++++++++++++++--- 3 files changed, 19 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 72ee335bc6fa..da81acee5151 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -25,7 +25,6 @@ /* * High level FPU state handling functions: */ -extern void fpu__prepare_read(struct fpu *fpu); extern void fpu__prepare_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 9ff467bb4894..d390644e5fa1 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -281,26 +281,6 @@ static void fpu__initialize(struct fpu *fpu) trace_x86_fpu_init_state(fpu); } -/* - * This function must be called before we read a task's fpstate. - * - * There's two cases where this gets called: - * - * - for the current task (when coredumping), in which case we have - * to save the latest FPU registers into the fpstate, - * - * - or it's called for stopped tasks (ptrace), in which case the - * registers were already saved by the context-switch code when - * the task scheduled out. - * - * If the task has used the FPU before then save it. - */ -void fpu__prepare_read(struct fpu *fpu) -{ - if (fpu == &current->thread.fpu) - fpu__save(fpu); -} - /* * This function must be called before we write a task's fpstate. * diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 3bbb6c0e6f45..52ba8f56367e 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -28,6 +28,22 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r return 0; } +/* + * The regset get() functions are invoked from: + * + * - coredump to dump the current task's fpstate. If the current task + * owns the FPU then the memory state has to be synchronized and the + * FPU register state preserved. Otherwise fpstate is already in sync. + * + * - ptrace to dump fpstate of a stopped task, in which case the registers + * have already been saved to fpstate on context switch.
+ */ +static void sync_fpstate(struct fpu *fpu) +{ + if (fpu == &current->thread.fpu) + fpu__save(fpu); +} + int xfpregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { @@ -36,7 +52,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_feature_enabled(X86_FEATURE_FXSR)) return -ENODEV; - fpu__prepare_read(fpu); + sync_fpstate(fpu); if (!use_xsave()) { return membuf_write(&to, &fpu->state.fxsave, @@ -96,7 +112,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) return -ENODEV; - fpu__prepare_read(fpu); + sync_fpstate(fpu); copy_xstate_to_uabi_buf(to, &fpu->state.xsave, XSTATE_COPY_XSAVE); return 0; @@ -287,7 +303,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; struct fxregs_state fxsave, *fx; - fpu__prepare_read(fpu); + sync_fpstate(fpu); if (!cpu_feature_enabled(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, to); -- Gitee From d3c6f635290bc2278cbbebebb1d68a246a2a395f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:47 +0200 Subject: [PATCH 0122/2161] x86/fpu: Move fpu__write_begin() to regset commit dbb60ac764581e62f2116c5a6b8926ba3a872dd4 upstream. The only usecase for fpu__write_begin is the set() callback of regset, so the function is pointlessly global. Move it to the regset code and rename it to fpu_force_restore() which is exactly describing what the function does. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.328652975@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 1 - arch/x86/kernel/fpu/core.c | 24 ------------------------ arch/x86/kernel/fpu/regset.c | 25 ++++++++++++++++++++++++--- 3 files changed, 22 insertions(+), 28 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index da81acee5151..6a7e7f94738b 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -25,7 +25,6 @@ /* * High level FPU state handling functions: */ -extern void fpu__prepare_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index d390644e5fa1..22437011f4f0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -281,30 +281,6 @@ static void fpu__initialize(struct fpu *fpu) trace_x86_fpu_init_state(fpu); } -/* - * This function must be called before we write a task's fpstate. - * - * Invalidate any cached FPU registers. - * - * After this function call, after registers in the fpstate are - * modified and the child task has woken up, the child task will - * restore the modified FPU state from the modified context. If we - * didn't clear its cached status here then the cached in-registers - * state pending on its former CPU could be restored, corrupting - * the modifications. - */ -void fpu__prepare_write(struct fpu *fpu) -{ - /* - * Only stopped child tasks can be used to modify the FPU - * state in the fpstate buffer: - */ - WARN_ON_FPU(fpu == &current->thread.fpu); - - /* Invalidate any cached state: */ - __fpu_invalidate_fpregs_state(fpu); -} - /* * Drops current FPU state: deactivates the fpregs and * the fpstate.
NOTE: it still leaves previous contents diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 52ba8f56367e..bdbfd4d7a272 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -44,6 +44,25 @@ static void sync_fpstate(struct fpu *fpu) fpu__save(fpu); } +/* + * Invalidate cached FPU registers before modifying the stopped target + * task's fpstate. + * + * This forces the target task on resume to restore the FPU registers from + * modified fpstate. Otherwise the task might skip the restore and operate + * with the cached FPU registers which discards the modifications. + */ +static void fpu_force_restore(struct fpu *fpu) +{ + /* + * Only stopped child tasks can be used to modify the FPU + * state in the fpstate buffer: + */ + WARN_ON_FPU(fpu == &current->thread.fpu); + + __fpu_invalidate_fpregs_state(fpu); +} + int xfpregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { @@ -88,7 +107,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (newstate.mxcsr & ~mxcsr_feature_mask) return -EINVAL; - fpu__prepare_write(fpu); + fpu_force_restore(fpu); /* Copy the state */ memcpy(&fpu->state.fxsave, &newstate, sizeof(newstate)); @@ -146,7 +165,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, } } - fpu__prepare_write(fpu); + fpu_force_restore(fpu); ret = copy_kernel_to_xstate(&fpu->state.xsave, kbuf ?: tmpbuf); out: @@ -346,7 +365,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - fpu__prepare_write(fpu); + fpu_force_restore(fpu); if (cpu_feature_enabled(X86_FEATURE_FXSR)) convert_to_fxsr(&fpu->state.fxsave, &env); -- Gitee From cd377da3dd7625ad75f0203d2855da08d5d732e4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:48 +0200 Subject: [PATCH 0123/2161] x86/fpu: Get rid of using_compacted_format() commit 02b93c0b00df222b9ccf7a1fbd0eb59353d0a58c upstream. This function is pointlessly global and a complete misnomer because its usage is related to both supervisor state checks and compacted format checks. Remove it and just make the conditions check the XSAVES feature. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.425493349@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 1 - arch/x86/kernel/fpu/xstate.c | 22 ++++------------------ 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index f03e6ed3ecca..ddf1e35a4454 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -102,7 +102,6 @@ extern void __init update_regset_xstate_info(unsigned int size, void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); const void *get_xsave_field_ptr(int xfeature_nr); -int using_compacted_format(void); int xfeature_size(int xfeature_nr); int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 109c3108c6e9..0ce8e32f22d4 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -446,20 +446,6 @@ int xfeature_size(int xfeature_nr) return eax; } -/* - * 'XSAVES' implies two different things: - * 1. saving of supervisor/system state - * 2.
using the compacted format - * - * Use this function when dealing with the compacted format so - * that it is obvious which aspect of 'XSAVES' is being handled - * by the calling code. - */ -int using_compacted_format(void) -{ - return boot_cpu_has(X86_FEATURE_XSAVES); -} - /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ static int validate_user_xstate_header(const struct xstate_header *hdr) { @@ -578,9 +564,9 @@ static void do_extra_xstate_size_checks(void) check_xstate_against_struct(i); /* * Supervisor state components can be managed only by - * XSAVES, which is compacted-format only. + * XSAVES. */ - if (!using_compacted_format()) + if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) XSTATE_WARN_ON(xfeature_is_supervisor(i)); /* Align from the end of the previous feature */ @@ -590,9 +576,9 @@ static void do_extra_xstate_size_checks(void) * The offset of a given state in the non-compacted * format is given to us in a CPUID leaf. We check * them for being ordered (increasing offsets) in - * setup_xstate_features(). + * setup_xstate_features(). XSAVES uses compacted format. */ - if (!using_compacted_format()) + if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) paranoid_xstate_size = xfeature_uncompacted_offset(i); /* * The compacted-format offset always depends on where -- Gitee From fa7d3b99d5d05503c6e04da3d79b0a3be5328d4c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 23 Jun 2021 14:01:49 +0200 Subject: [PATCH 0124/2161] x86/kvm: Avoid looking up PKRU in XSAVE buffer commit 71ef453355a9197fcfd8ff22391a4ad7861d79e6 upstream. PKRU is being removed from the kernel XSAVE/FPU buffers. This removal will probably include warnings for code that looks up PKRU in those buffers. KVM currently looks up the location of PKRU but doesn't even use the pointer that it gets back. Rework the code to avoid calling get_xsave_addr() except in cases where its result is actually used. This makes the code more clear and also avoids the inevitable PKRU warnings. This is probably a good cleanup and could go upstream independently of any PKRU rework.
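For readers following the hunks below, the loops being reworked walk the guest's xstate_bv bitmap one component at a time with the classic lowest-set-bit idiom (valid & -valid, then fls64() - 1). A stand-alone sketch of just that walk, outside any KVM context and with a made-up bitmap:

#include <stdio.h>
#include <stdint.h>

/* 1-based index of the highest set bit, 0 for no bits (mimics fls64()). */
static int fls64_local(uint64_t x)
{
	return x ? 64 - __builtin_clzll(x) : 0;
}

int main(void)
{
	uint64_t valid = 0x2c6; /* arbitrary example bitmap: features 1, 2, 6, 7, 9 */

	while (valid) {
		uint64_t xfeature_mask = valid & -valid; /* isolate the lowest set bit */
		int xfeature_nr = fls64_local(xfeature_mask) - 1;

		printf("process feature %d\n", xfeature_nr);
		valid -= xfeature_mask; /* clear it and move on */
	}
	return 0;
}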
Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.541037562@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 50cdef361eb5..c43e5d018e24 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4039,20 +4039,21 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { + u32 size, offset, ecx, edx; u64 xfeature_mask = valid & -valid; int xfeature_nr = fls64(xfeature_mask) - 1; - void *src = get_xsave_addr(xsave, xfeature_nr); - - if (src) { - u32 size, offset, ecx, edx; - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - if (xfeature_nr == XFEATURE_PKRU) - memcpy(dest + offset, &vcpu->arch.pkru, - sizeof(vcpu->arch.pkru)); - else - memcpy(dest + offset, src, size); + void *src; + + cpuid_count(XSTATE_CPUID, xfeature_nr, + &size, &offset, &ecx, &edx); + if (xfeature_nr == XFEATURE_PKRU) { + memcpy(dest + offset, &vcpu->arch.pkru, + sizeof(vcpu->arch.pkru)); + } else { + src = get_xsave_addr(xsave, xfeature_nr); + if (src) + memcpy(dest + offset, src, size); } valid -= xfeature_mask; @@ -4082,18 +4083,20 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { + u32 size, offset, ecx, edx; u64 xfeature_mask = valid & -valid; int xfeature_nr = fls64(xfeature_mask) - 1; - void *dest = get_xsave_addr(xsave, xfeature_nr); - - if (dest) { - u32 size, offset, ecx, edx; - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - if (xfeature_nr == XFEATURE_PKRU) - memcpy(&vcpu->arch.pkru, src + offset, - sizeof(vcpu->arch.pkru)); - else + + cpuid_count(XSTATE_CPUID, xfeature_nr, + &size, &offset, &ecx, &edx); + + if (xfeature_nr == XFEATURE_PKRU) { + memcpy(&vcpu->arch.pkru, src + offset, + sizeof(vcpu->arch.pkru)); + } else { + void *dest = get_xsave_addr(xsave, xfeature_nr); + + if (dest) memcpy(dest, src + offset, size); } -- Gitee From c3a777915a0d365c3d95aae9db573ed104cd8cde Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:50 +0200 Subject: [PATCH 0125/2161] x86/fpu: Cleanup arch_set_user_pkey_access() commit 9fe8a6f5eed8fff6b2d7dbc99b911334e311732d upstream. The function does a sanity check with a WARN_ON_ONCE() but happily proceeds when the pkey argument is out of range. Clean it up. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.635764326@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0ce8e32f22d4..11b3e9d1b6dc 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -939,11 +939,10 @@ const void *get_xsave_field_ptr(int xfeature_nr) * rights for @pkey to @init_val. */ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val) + unsigned long init_val) { - u32 old_pkru; - int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); - u32 new_pkru_bits = 0; + u32 old_pkru, new_pkru_bits = 0; + int pkey_shift; /* * This check implies XSAVE support. 
OSPKE only gets @@ -957,7 +956,8 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, * values originating from in-kernel users. Complain * if a bad value is observed. */ - WARN_ON_ONCE(pkey >= arch_max_pkey()); + if (WARN_ON_ONCE(pkey >= arch_max_pkey())) + return -EINVAL; /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) @@ -966,6 +966,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, new_pkru_bits |= PKRU_WD_BIT; /* Shift the bits in to the correct place in PKRU for pkey: */ + pkey_shift = pkey * PKRU_BITS_PER_PKEY; new_pkru_bits <<= pkey_shift; /* Get old PKRU and mask off any old bits in place: */ -- Gitee From 8345c4dd6630ec7d2e5193586fb82984d9861472 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:51 +0200 Subject: [PATCH 0126/2161] x86/fpu: Get rid of copy_supervisor_to_kernel() commit 1f3171252dc586745bb548d48f3bcedfea34b58d upstream. If the fast path of restoring the FPU state on sigreturn fails or is not taken and the current task's FPU is active then the FPU has to be deactivated for the slow path to allow a safe update of the tasks FPU memory state. With supervisor states enabled, this requires to save the supervisor state in the memory state first. Supervisor states require XSAVES so saving only the supervisor state requires to reshuffle the memory buffer because XSAVES uses the compacted format and therefore stores the supervisor states at the beginning of the memory state. That's just an overengineered optimization. Get rid of it and save the full state for this case. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Andy Lutomirski Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.734561971@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 1 - arch/x86/kernel/fpu/signal.c | 13 +++++--- arch/x86/kernel/fpu/xstate.c | 55 ------------------------------- 3 files changed, 8 insertions(+), 61 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index ddf1e35a4454..5f9d4acd14a0 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -105,7 +105,6 @@ const void *get_xsave_field_ptr(int xfeature_nr); int xfeature_size(int xfeature_nr); int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); -void copy_supervisor_to_kernel(struct xregs_state *xsave); void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 995339a43c98..6fcc56d303b2 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -401,15 +401,18 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) * the optimisation). */ fpregs_lock(); - if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { - /* - * Supervisor states are not modified by user space input. Save - * current supervisor states first and invalidate the FPU regs. + * If supervisor states are available then save the + * hardware state in current's fpstate so that the + * supervisor state is preserved. Save the full state for + * simplicity. There is no point in optimizing this by only + * saving the supervisor states and then shuffle them to + * the right place in memory. 
This is the slow path and the + * above XRSTOR failed or ia32_fxstate is true. Shrug. */ if (xfeatures_mask_supervisor()) - copy_supervisor_to_kernel(&fpu->state.xsave); + copy_xregs_to_kernel(&fpu->state.xsave); set_thread_flag(TIF_NEED_FPU_LOAD); } __fpu_invalidate_fpregs_state(fpu); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 11b3e9d1b6dc..dc70b749280e 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1231,61 +1231,6 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) return 0; } -/* - * Save only supervisor states to the kernel buffer. This blows away all - * old states, and is intended to be used only in __fpu__restore_sig(), where - * user states are restored from the user buffer. - */ -void copy_supervisor_to_kernel(struct xregs_state *xstate) -{ - struct xstate_header *header; - u64 max_bit, min_bit; - u32 lmask, hmask; - int err, i; - - if (WARN_ON(!boot_cpu_has(X86_FEATURE_XSAVES))) - return; - - if (!xfeatures_mask_supervisor()) - return; - - max_bit = __fls(xfeatures_mask_supervisor()); - min_bit = __ffs(xfeatures_mask_supervisor()); - - lmask = xfeatures_mask_supervisor(); - hmask = xfeatures_mask_supervisor() >> 32; - XSTATE_OP(XSAVES, xstate, lmask, hmask, err); - - /* We should never fault when copying to a kernel buffer: */ - if (WARN_ON_FPU(err)) - return; - - /* - * At this point, the buffer has only supervisor states and must be - * converted back to normal kernel format. - */ - header = &xstate->header; - header->xcomp_bv |= xfeatures_mask_all; - - /* - * This only moves states up in the buffer. Start with - * the last state and move backwards so that states are - * not overwritten until after they are moved. Note: - * memmove() allows overlapping src/dst buffers. - */ - for (i = max_bit; i >= min_bit; i--) { - u8 *xbuf = (u8 *)xstate; - - if (!((header->xfeatures >> i) & 1)) - continue; - - /* Move xfeature 'i' into its normal location */ - memmove(xbuf + xstate_comp_offsets[i], - xbuf + xstate_supervisor_only_offsets[i], - xstate_sizes[i]); - } -} - /** * copy_dynamic_supervisor_to_kernel() - Save dynamic supervisor states to * an xsave area -- Gitee From 8d874fcae35417e014d3c3ebf783b23638235f91 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:52 +0200 Subject: [PATCH 0127/2161] x86/fpu: Rename copy_xregs_to_kernel() and copy_kernel_to_xregs() commit b16313f71c1050ad5c92548925e0e9cec26989ab upstream. The function names for xsave[s]/xrstor[s] operations are horribly named and a permanent source of confusion. Rename: copy_xregs_to_kernel() to os_xsave() copy_kernel_to_xregs() to os_xrstor() These are truly low level wrappers around the actual instructions XSAVE[OPT]/XRSTOR and XSAVES/XRSTORS with the twist that the selection based on the available CPU features happens with an alternative to avoid conditionals all over the place and to provide the best performance for hot paths. The os_ prefix tells that this is the OS selected mechanism. No functional change. 
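One detail these wrappers share, visible in the hunks that follow, is the lmask/hmask pair: the XSAVE family of instructions takes the 64-bit requested-feature bitmap split across EDX:EAX. A tiny stand-alone sketch of just that split (illustration only, not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Example mask: FP | SSE | YMM (bits 0-2) plus an arbitrary high bit. */
	uint64_t mask = 0x8000000000000007ull;
	uint32_t lmask = (uint32_t)mask;         /* goes into EAX */
	uint32_t hmask = (uint32_t)(mask >> 32); /* goes into EDX */

	printf("eax=%#010x edx=%#010x\n", lmask, hmask);
	return 0;
}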
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.830239347@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 17 +++++++++++------ arch/x86/kernel/fpu/core.c | 7 +++---- arch/x86/kernel/fpu/signal.c | 21 +++++++++++---------- arch/x86/kernel/fpu/xstate.c | 2 +- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 6a7e7f94738b..f05a8ea5777c 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -265,7 +265,7 @@ static inline void fxsave(struct fxregs_state *fx) * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ -static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) +static inline void os_xrstor_booting(struct xregs_state *xstate) { u64 mask = -1; u32 lmask = mask; @@ -288,8 +288,11 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) /* * Save processor xstate to xsave area. + * + * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features + * and command line options. The choice is permanent until the next reboot. */ -static inline void copy_xregs_to_kernel(struct xregs_state *xstate) +static inline void os_xsave(struct xregs_state *xstate) { u64 mask = xfeatures_mask_all; u32 lmask = mask; @@ -306,8 +309,10 @@ static inline void copy_xregs_to_kernel(struct xregs_state *xstate) /* * Restore processor xstate from xsave area. + * + * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. */ -static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) +static inline void os_xrstor(struct xregs_state *xstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; @@ -368,13 +373,13 @@ static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) * Restore xstate from kernel space xsave area, return an error code instead of * an exception. 
*/ -static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask) +static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; int err; - if (static_cpu_has(X86_FEATURE_XSAVES)) + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); @@ -387,7 +392,7 @@ extern int copy_fpregs_to_fpstate(struct fpu *fpu); static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) { if (use_xsave()) { - copy_kernel_to_xregs(&fpstate->xsave, mask); + os_xrstor(&fpstate->xsave, mask); } else { if (use_fxsr()) copy_kernel_to_fxregs(&fpstate->fxsave); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 22437011f4f0..bfdcf7fd63e3 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -95,7 +95,7 @@ EXPORT_SYMBOL(irq_fpu_usable); int copy_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { - copy_xregs_to_kernel(&fpu->state.xsave); + os_xsave(&fpu->state.xsave); /* * AVX512 state is tracked here because its use is @@ -314,7 +314,7 @@ void fpu__drop(struct fpu *fpu) static inline void copy_init_fpstate_to_fpregs(u64 features_mask) { if (use_xsave()) - copy_kernel_to_xregs(&init_fpstate.xsave, features_mask); + os_xrstor(&init_fpstate.xsave, features_mask); else if (static_cpu_has(X86_FEATURE_FXSR)) copy_kernel_to_fxregs(&init_fpstate.fxsave); else @@ -345,8 +345,7 @@ static void fpu__clear(struct fpu *fpu, bool user_only) if (user_only) { if (!fpregs_state_valid(fpu, smp_processor_id()) && xfeatures_mask_supervisor()) - copy_kernel_to_xregs(&fpu->state.xsave, - xfeatures_mask_supervisor()); + os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); copy_init_fpstate_to_fpregs(xfeatures_mask_user()); } else { copy_init_fpstate_to_fpregs(xfeatures_mask_all); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 6fcc56d303b2..d39432f1961c 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -261,14 +261,14 @@ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) r = copy_user_to_fxregs(buf); if (!r) - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + os_xrstor(&init_fpstate.xsave, init_bv); return r; } else { init_bv = xfeatures_mask_user() & ~xbv; r = copy_user_to_xregs(buf, xbv); if (!r && unlikely(init_bv)) - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + os_xrstor(&init_fpstate.xsave, init_bv); return r; } } else if (use_fxsr()) { @@ -356,9 +356,10 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) * has been copied to the kernel one. */ if (test_thread_flag(TIF_NEED_FPU_LOAD) && - xfeatures_mask_supervisor()) - copy_kernel_to_xregs(&fpu->state.xsave, - xfeatures_mask_supervisor()); + xfeatures_mask_supervisor()) { + os_xrstor(&fpu->state.xsave, + xfeatures_mask_supervisor()); + } fpregs_mark_activate(); fpregs_unlock(); return 0; @@ -412,7 +413,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) * above XRSTOR failed or ia32_fxstate is true. Shrug. 
*/ if (xfeatures_mask_supervisor()) - copy_xregs_to_kernel(&fpu->state.xsave); + os_xsave(&fpu->state.xsave); set_thread_flag(TIF_NEED_FPU_LOAD); } __fpu_invalidate_fpregs_state(fpu); @@ -430,14 +431,14 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpregs_lock(); if (unlikely(init_bv)) - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + os_xrstor(&init_fpstate.xsave, init_bv); /* * Restore previously saved supervisor xstates along with * copied-in user xstates. */ - ret = copy_kernel_to_xregs_err(&fpu->state.xsave, - user_xfeatures | xfeatures_mask_supervisor()); + ret = os_xrstor_safe(&fpu->state.xsave, + user_xfeatures | xfeatures_mask_supervisor()); } else if (use_fxsr()) { ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); @@ -454,7 +455,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) u64 init_bv; init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + os_xrstor(&init_fpstate.xsave, init_bv); } ret = copy_kernel_to_fxregs_err(&fpu->state.fxsave); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index dc70b749280e..ead3771d9744 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -397,7 +397,7 @@ static void __init setup_init_fpu_buf(void) /* * Init all the features state with header.xfeatures being 0x0 */ - copy_kernel_to_xregs_booting(&init_fpstate.xsave); + os_xrstor_booting(&init_fpstate.xsave); /* * All components are now in init state. Read the state back so -- Gitee From 594ca770304d652ffad98ee927d41f722e679836 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:53 +0200 Subject: [PATCH 0128/2161] x86/fpu: Rename copy_user_to_xregs() and copy_xregs_to_user() commit 6b862ba1821441e6083cf061404694d33a841526 upstream. The function names for xsave[s]/xrstor[s] operations are horribly named and a permanent source of confusion. Rename: copy_xregs_to_user() to xsave_to_user_sigframe() copy_user_to_xregs() to xrstor_from_user_sigframe() so it's entirely clear what this is about. This is also a clear indicator of the potentially different storage format because this is user ABI and cannot use compacted format. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.924266705@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 4 ++-- arch/x86/kernel/fpu/signal.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index f05a8ea5777c..2c61034c4434 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -330,7 +330,7 @@ static inline void os_xrstor(struct xregs_state *xstate, u64 mask) * backward compatibility for old applications which don't understand * compacted format of xsave area. */ -static inline int copy_xregs_to_user(struct xregs_state __user *buf) +static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) { u64 mask = xfeatures_mask_user(); u32 lmask = mask; @@ -355,7 +355,7 @@ static inline int copy_xregs_to_user(struct xregs_state __user *buf) /* * Restore xstate from user space xsave area. 
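The "cannot use compacted format" remark is the crux: in the standard (UABI) layout every component sits at a fixed, CPUID-reported offset regardless of which features are in use, while the compacted layout packs only the present components back to back, so offsets shift with the feature set. A stand-alone sketch with made-up sizes and offsets (illustration only, not the kernel's layout code):

#include <stdio.h>
#include <stdint.h>

#define NFEAT 4

/* Made-up component sizes and standard-format offsets, for illustration only. */
static const unsigned int feat_size[NFEAT]  = { 160, 256, 64, 128 };
static const unsigned int std_offset[NFEAT] = { 576, 736, 992, 1056 };

/* Compacted: a component starts right after all lower-numbered *present* ones. */
static unsigned int compacted_offset(uint64_t present, int nr, unsigned int base)
{
	unsigned int off = base;
	int i;

	for (i = 0; i < nr; i++)
		if (present & (1ull << i))
			off += feat_size[i];
	return off;
}

int main(void)
{
	uint64_t present = 0x9; /* only components 0 and 3 are in use */

	printf("component 3: standard offset %u, compacted offset %u\n",
	       std_offset[3], compacted_offset(present, 3, 576));
	return 0;
}

That shifting is why a buffer exposed to user space, such as the signal frame, has to stick to the fixed standard layout.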
*/ -static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) +static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask) { struct xregs_state *xstate = ((__force struct xregs_state *)buf); u32 lmask = mask; diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index d39432f1961c..7fc82c716580 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -129,7 +129,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) int err; if (use_xsave()) - err = copy_xregs_to_user(buf); + err = xsave_to_user_sigframe(buf); else if (use_fxsr()) err = copy_fxregs_to_user((struct fxregs_state __user *) buf); else @@ -266,7 +266,7 @@ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) } else { init_bv = xfeatures_mask_user() & ~xbv; - r = copy_user_to_xregs(buf, xbv); + r = xrstor_from_user_sigframe(buf, xbv); if (!r && unlikely(init_bv)) os_xrstor(&init_fpstate.xsave, init_bv); return r; -- Gitee From 04a985433dcc255740dd51ca09af18dbe8965c74 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:54 +0200 Subject: [PATCH 0129/2161] x86/fpu: Rename fxregs-related copy functions commit 16dcf4385933a02bb21d0af86a04439d151ad42a upstream. The function names for fxsave/fxrstor operations are horribly named and a permanent source of confusion. Rename: copy_fxregs_to_kernel() to fxsave() copy_kernel_to_fxregs() to fxrstor() copy_fxregs_to_user() to fxsave_to_user_sigframe() copy_user_to_fxregs() to fxrstor_from_user_sigframe() so it's clear what these are doing. All these functions are really low level wrappers around the equally named instructions, so mapping to the documentation is just natural. While at it, replace the static_cpu_has(X86_FEATURE_FXSR) with use_fxsr() to be consistent with the rest of the code. No functional change. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121454.017863494@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 18 +++++------------- arch/x86/kernel/fpu/core.c | 6 +++--- arch/x86/kernel/fpu/signal.c | 10 +++++----- 3 files changed, 13 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 2c61034c4434..43e85f0240e7 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -129,7 +129,7 @@ static inline int copy_fregs_to_user(struct fregs_state __user *fx) return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); } -static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) +static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); @@ -138,7 +138,7 @@ static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) } -static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) +static inline void fxrstor(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); @@ -146,7 +146,7 @@ static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } -static inline int copy_kernel_to_fxregs_err(struct fxregs_state *fx) +static inline int fxrstor_safe(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); @@ -154,7 +154,7 @@ static inline int copy_kernel_to_fxregs_err(struct fxregs_state *fx) return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } -static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) +static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); @@ -177,14 +177,6 @@ static inline int copy_user_to_fregs(struct fregs_state __user *fx) return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } -static inline void copy_fxregs_to_kernel(struct fpu *fpu) -{ - if (IS_ENABLED(CONFIG_X86_32)) - asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave)); - else - asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave)); -} - static inline void fxsave(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) @@ -395,7 +387,7 @@ static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask os_xrstor(&fpstate->xsave, mask); } else { if (use_fxsr()) - copy_kernel_to_fxregs(&fpstate->fxsave); + fxrstor(&fpstate->fxsave); else copy_kernel_to_fregs(&fpstate->fsave); } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index bfdcf7fd63e3..035487dcb29e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -107,7 +107,7 @@ int copy_fpregs_to_fpstate(struct fpu *fpu) } if (likely(use_fxsr())) { - copy_fxregs_to_kernel(fpu); + fxsave(&fpu->state.fxsave); return 1; } @@ -315,8 +315,8 @@ static inline void copy_init_fpstate_to_fpregs(u64 features_mask) { if (use_xsave()) os_xrstor(&init_fpstate.xsave, features_mask); - else if (static_cpu_has(X86_FEATURE_FXSR)) - copy_kernel_to_fxregs(&init_fpstate.fxsave); + else if (use_fxsr()) + fxrstor(&init_fpstate.fxsave); else copy_kernel_to_fregs(&init_fpstate.fsave); diff --git a/arch/x86/kernel/fpu/signal.c 
b/arch/x86/kernel/fpu/signal.c index 7fc82c716580..e76fb0570c9e 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -64,7 +64,7 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - copy_fxregs_to_kernel(&tsk->thread.fpu); + fxsave(&tsk->thread.fpu.state.fxsave); fpregs_unlock(); convert_from_fxsr(&env, tsk); @@ -131,7 +131,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) if (use_xsave()) err = xsave_to_user_sigframe(buf); else if (use_fxsr()) - err = copy_fxregs_to_user((struct fxregs_state __user *) buf); + err = fxsave_to_user_sigframe((struct fxregs_state __user *) buf); else err = copy_fregs_to_user((struct fregs_state __user *) buf); @@ -259,7 +259,7 @@ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) if (fx_only) { init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; - r = copy_user_to_fxregs(buf); + r = fxrstor_from_user_sigframe(buf); if (!r) os_xrstor(&init_fpstate.xsave, init_bv); return r; @@ -272,7 +272,7 @@ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) return r; } } else if (use_fxsr()) { - return copy_user_to_fxregs(buf); + return fxrstor_from_user_sigframe(buf); } else return copy_user_to_fregs(buf); } @@ -458,7 +458,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) os_xrstor(&init_fpstate.xsave, init_bv); } - ret = copy_kernel_to_fxregs_err(&fpu->state.fxsave); + ret = fxrstor_safe(&fpu->state.fxsave); } else { ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size); if (ret) -- Gitee From d7b0a50fdd66d4fb7c6ba88eefc387f171cf5d24 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:55 +0200 Subject: [PATCH 0130/2161] x86/math-emu: Rename frstor() commit 872c65dbf669b3b471b3d8656391a6b4f736d22b upstream. This is in the way of renaming the low level hardware accessors to match the instruction name. Prepend it with FPU_ which is consistent vs. the rest of the emulation code. No functional change. 
[ bp: Correct the Reported-by: ] Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121454.111665161@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/math-emu/fpu_proto.h | 2 +- arch/x86/math-emu/load_store.c | 2 +- arch/x86/math-emu/reg_ld_str.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h index 70d35c200945..94c4023092f3 100644 --- a/arch/x86/math-emu/fpu_proto.h +++ b/arch/x86/math-emu/fpu_proto.h @@ -144,7 +144,7 @@ extern int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d); extern int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d); extern int FPU_round_to_int(FPU_REG *r, u_char tag); extern u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s); -extern void frstor(fpu_addr_modes addr_modes, u_char __user *data_address); +extern void FPU_frstor(fpu_addr_modes addr_modes, u_char __user *data_address); extern u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d); extern void fsave(fpu_addr_modes addr_modes, u_char __user *data_address); extern int FPU_tagof(FPU_REG *ptr); diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c index f15263e158e8..4092df79de4f 100644 --- a/arch/x86/math-emu/load_store.c +++ b/arch/x86/math-emu/load_store.c @@ -240,7 +240,7 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, fix-up operations. */ return 1; case 022: /* frstor m94/108byte */ - frstor(addr_modes, (u_char __user *) data_address); + FPU_frstor(addr_modes, (u_char __user *) data_address); /* Ensure that the values just loaded are not changed by fix-up operations. */ return 1; diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c index fe6246ff9887..2de1094ed4d7 100644 --- a/arch/x86/math-emu/reg_ld_str.c +++ b/arch/x86/math-emu/reg_ld_str.c @@ -1117,7 +1117,7 @@ u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s) return s; } -void frstor(fpu_addr_modes addr_modes, u_char __user *data_address) +void FPU_frstor(fpu_addr_modes addr_modes, u_char __user *data_address) { int i, regnr; u_char __user *s = fldenv(addr_modes, data_address); -- Gitee From 08b13d870795f59881bbf94200a63ea566329719 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:56 +0200 Subject: [PATCH 0131/2161] x86/fpu: Rename fregs-related copy functions commit 6fdc908cb56123591baa4259400cfb0787582b11 upstream. The function names for fnsave/fnrstor operations are horribly named and a permanent source of confusion. Rename: copy_kernel_to_fregs() to frstor() copy_fregs_to_user() to fnsave_to_user_sigframe() copy_user_to_fregs() to frstor_from_user_sigframe() so it's clear what these are doing. All these functions are really low level wrappers around the equally named instructions, so mapping to the documentation is just natural. No functional change. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121454.223594101@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 10 +++++----- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/signal.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 43e85f0240e7..ae1ad4a43efc 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -124,7 +124,7 @@ static inline void fpstate_init_soft(struct swregs_state *soft) {} _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \ : output : input) -static inline int copy_fregs_to_user(struct fregs_state __user *fx) +static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) { return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); } @@ -162,17 +162,17 @@ static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } -static inline void copy_kernel_to_fregs(struct fregs_state *fx) +static inline void frstor(struct fregs_state *fx) { kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } -static inline int copy_kernel_to_fregs_err(struct fregs_state *fx) +static inline int frstor_safe(struct fregs_state *fx) { return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } -static inline int copy_user_to_fregs(struct fregs_state __user *fx) +static inline int frstor_from_user_sigframe(struct fregs_state __user *fx) { return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } @@ -389,7 +389,7 @@ static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask if (use_fxsr()) fxrstor(&fpstate->fxsave); else - copy_kernel_to_fregs(&fpstate->fsave); + frstor(&fpstate->fsave); } } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 035487dcb29e..1d2587607a7f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -318,7 +318,7 @@ static inline void copy_init_fpstate_to_fpregs(u64 features_mask) else if (use_fxsr()) fxrstor(&init_fpstate.fxsave); else - copy_kernel_to_fregs(&init_fpstate.fsave); + frstor(&init_fpstate.fsave); if (boot_cpu_has(X86_FEATURE_OSPKE)) copy_init_pkru_to_fpregs(); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index e76fb0570c9e..082d4ff6ea86 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -133,7 +133,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) else if (use_fxsr()) err = fxsave_to_user_sigframe((struct fxregs_state __user *) buf); else - err = copy_fregs_to_user((struct fregs_state __user *) buf); + err = fnsave_to_user_sigframe((struct fregs_state __user *) buf); if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size)) err = -EFAULT; @@ -274,7 +274,7 @@ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) } else if (use_fxsr()) { return fxrstor_from_user_sigframe(buf); } else - return copy_user_to_fregs(buf); + return frstor_from_user_sigframe(buf); } static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) @@ -465,7 +465,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) goto out; fpregs_lock(); - ret = copy_kernel_to_fregs_err(&fpu->state.fsave); + ret = frstor_safe(&fpu->state.fsave); } if (!ret) fpregs_mark_activate(); -- 
Gitee From 8e9755752a1a4541904ee1c3847f0b0470952842 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:57 +0200 Subject: [PATCH 0132/2161] x86/fpu: Rename xstate copy functions which are related to UABI commit 1cc34413ff3f18c30e5df89fefd95cc0f3b3292e upstream. Rename them to reflect that these functions deal with user space format XSAVE buffers. copy_kernel_to_xstate() -> copy_uabi_from_kernel_to_xstate() copy_user_to_xstate() -> copy_sigframe_from_user_to_xstate() Again a clear statement that these functions deal with user space ABI. Suggested-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121454.318485015@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 4 ++-- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/fpu/xstate.c | 5 +++-- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 5f9d4acd14a0..e92c820d3d44 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -103,8 +103,8 @@ extern void __init update_regset_xstate_info(unsigned int size, void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); const void *get_xsave_field_ptr(int xfeature_nr); int xfeature_size(int xfeature_nr); -int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); -int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); +int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); +int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask); diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index bdbfd4d7a272..36d0a68d361f 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -166,7 +166,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, } fpu_force_restore(fpu); - ret = copy_kernel_to_xstate(&fpu->state.xsave, kbuf ?: tmpbuf); + ret = copy_uabi_from_kernel_to_xstate(&fpu->state.xsave, kbuf ?: tmpbuf); out: vfree(tmpbuf); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 082d4ff6ea86..f754996d9046 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -422,7 +422,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (use_xsave() && !fx_only) { u64 init_bv = xfeatures_mask_user() & ~user_xfeatures; - ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); + ret = copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx); if (ret) goto out; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index ead3771d9744..7c5c9b4ef4c4 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1126,7 +1126,7 @@ static inline bool mxcsr_valid(struct xstate_header *hdr, const u32 *mxcsr) * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] format * and copy to the target thread. This is called from xstateregs_set(). 
*/ -int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) +int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) { unsigned int offset, size; int i; @@ -1181,7 +1181,8 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) * XSAVE[S] format and copy to the target thread. This is called from the * sigreturn() and rt_sigreturn() system calls. */ -int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) +int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, + const void __user *ubuf) { unsigned int offset, size; int i; -- Gitee From cb40723f650bd3d8ef57a713284b9fd9bf68a611 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:58 +0200 Subject: [PATCH 0133/2161] x86/fpu: Deduplicate copy_uabi_from_user/kernel_to_xstate() commit 522e92743b35351bda1b6a9136560f833a9c2490 upstream. copy_uabi_from_user_to_xstate() and copy_uabi_from_kernel_to_xstate() are almost identical except for the copy function. Unify them. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20210623121454.414215896@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 137 ++++++++++++----------------------- 1 file changed, 47 insertions(+), 90 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 7c5c9b4ef4c4..dfea31d4a009 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -980,23 +980,6 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, } #endif /* ! CONFIG_ARCH_HAS_PKEYS */ -/* - * Weird legacy quirk: SSE and YMM states store information in the - * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP - * area is marked as unused in the xfeatures header, we need to copy - * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use. - */ -static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) -{ - if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM))) - return false; - - if (xfeatures & XFEATURE_MASK_FP) - return false; - - return true; -} - static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, void *init_xstate, unsigned int size) { @@ -1109,39 +1092,53 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, membuf_zero(&to, to.left); } -static inline bool mxcsr_valid(struct xstate_header *hdr, const u32 *mxcsr) +static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, + const void *kbuf, const void __user *ubuf) { - u64 mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; - - /* Only check if it is in use */ - if (hdr->xfeatures & mask) { - /* Reserved bits in MXCSR must be zero. */ - if (*mxcsr & ~mxcsr_feature_mask) - return false; + if (kbuf) { + memcpy(dst, kbuf + offset, size); + } else { + if (copy_from_user(dst, ubuf + offset, size)) + return -EFAULT; } - return true; + return 0; } -/* - * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] format - * and copy to the target thread. This is called from xstateregs_set(). 
- */ -int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) + +static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, + const void __user *ubuf) { unsigned int offset, size; - int i; struct xstate_header hdr; + u64 mask; + int i; offset = offsetof(struct xregs_state, header); - size = sizeof(hdr); - - memcpy(&hdr, kbuf + offset, size); + if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) + return -EFAULT; if (validate_user_xstate_header(&hdr)) return -EINVAL; - if (!mxcsr_valid(&hdr, kbuf + offsetof(struct fxregs_state, mxcsr))) - return -EINVAL; + /* Validate MXCSR when any of the related features is in use */ + mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; + if (hdr.xfeatures & mask) { + u32 mxcsr[2]; + + offset = offsetof(struct fxregs_state, mxcsr); + if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf)) + return -EFAULT; + + /* Reserved bits in MXCSR must be zero. */ + if (mxcsr[0] & ~mxcsr_feature_mask) + return -EINVAL; + + /* SSE and YMM require MXCSR even when FP is not in use. */ + if (!(hdr.xfeatures & XFEATURE_MASK_FP)) { + xsave->i387.mxcsr = mxcsr[0]; + xsave->i387.mxcsr_mask = mxcsr[1]; + } + } for (i = 0; i < XFEATURE_MAX; i++) { u64 mask = ((u64)1 << i); @@ -1152,16 +1149,11 @@ int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) offset = xstate_offsets[i]; size = xstate_sizes[i]; - memcpy(dst, kbuf + offset, size); + if (copy_from_buffer(dst, offset, size, kbuf, ubuf)) + return -EFAULT; } } - if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { - offset = offsetof(struct fxregs_state, mxcsr); - size = MXCSR_AND_FLAGS_SIZE; - memcpy(&xsave->i387.mxcsr, kbuf + offset, size); - } - /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': @@ -1176,6 +1168,16 @@ int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) return 0; } +/* + * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] + * format and copy to the target thread. This is called from + * xstateregs_set(). + */ +int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) +{ + return copy_uabi_to_xstate(xsave, kbuf, NULL); +} + /* * Convert from a sigreturn standard-format user-space buffer to kernel * XSAVE[S] format and copy to the target thread. This is called from the @@ -1184,52 +1186,7 @@ int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) { - unsigned int offset, size; - int i; - struct xstate_header hdr; - - offset = offsetof(struct xregs_state, header); - size = sizeof(hdr); - - if (copy_from_user(&hdr, ubuf + offset, size)) - return -EFAULT; - - if (validate_user_xstate_header(&hdr)) - return -EINVAL; - - for (i = 0; i < XFEATURE_MAX; i++) { - u64 mask = ((u64)1 << i); - - if (hdr.xfeatures & mask) { - void *dst = __raw_xsave_addr(xsave, i); - - offset = xstate_offsets[i]; - size = xstate_sizes[i]; - - if (copy_from_user(dst, ubuf + offset, size)) - return -EFAULT; - } - } - - if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { - offset = offsetof(struct fxregs_state, mxcsr); - size = MXCSR_AND_FLAGS_SIZE; - if (copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) - return -EFAULT; - } - - /* - * The state that came in from userspace was user-state only. 
- * Mask all the user states out of 'xfeatures': - */ - xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; - - /* - * Add back in the features that came in from userspace: - */ - xsave->header.xfeatures |= hdr.xfeatures; - - return 0; + return copy_uabi_to_xstate(xsave, NULL, ubuf); } /** -- Gitee From 701450e0df35a635b8d24fd6a0262fec16facd89 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Jun 2021 16:36:20 +0200 Subject: [PATCH 0134/2161] x86/process: Check PF_KTHREAD and not current->mm for kernel threads commit 12f7764ac61200e32c916f038bdc08f884b0b604 upstream. switch_fpu_finish() checks current->mm as indicator for kernel threads. That's wrong because kernel threads can temporarily use a mm of a user process via kthread_use_mm(). Check the task flags for PF_KTHREAD instead. Fixes: 0cecca9d03c9 ("x86/fpu: Eager switch PKRU state") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Acked-by: Rik van Riel Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210608144345.912645927@linutronix.de Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index ae1ad4a43efc..3eadbd6cd4c8 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -546,7 +546,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu) * PKRU state is switched eagerly because it needs to be valid before we * return to userland e.g. for a copy_to_user() operation. */ - if (current->mm) { + if (!(current->flags & PF_KTHREAD)) { pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU); if (pk) pkru_val = pk->pkru; -- Gitee From 890eacb22c6043ece1252ba2d8d5521a85967fb6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Jun 2021 16:36:21 +0200 Subject: [PATCH 0135/2161] x86/pkru: Write hardware init value to PKRU when xstate is init commit 510b80a6a0f1a0d114c6e33bcea64747d127973c upstream. When user space brings PKRU into init state, then the kernel handling is broken: T1 user space xsave(state) state.header.xfeatures &= ~XFEATURE_MASK_PKRU; xrstor(state) T1 -> kernel schedule() XSAVE(S) -> T1->xsave.header.xfeatures[PKRU] == 0 T1->flags |= TIF_NEED_FPU_LOAD; wrpkru(); schedule() ... pk = get_xsave_addr(&T1->fpu->state.xsave, XFEATURE_PKRU); if (pk) wrpkru(pk->pkru); else wrpkru(DEFAULT_PKRU); Because the xfeatures bit is 0 and therefore the value in the xsave storage is not valid, get_xsave_addr() returns NULL and switch_to() writes the default PKRU. -> FAIL #1! So that wrecks any copy_to/from_user() on the way back to user space which hits memory which is protected by the default PKRU value. Assumed that this does not fail (pure luck) then T1 goes back to user space and because TIF_NEED_FPU_LOAD is set it ends up in switch_fpu_return() __fpregs_load_activate() if (!fpregs_state_valid()) { load_XSTATE_from_task(); } But if nothing touched the FPU between T1 scheduling out and back in, then the fpregs_state is still valid which means switch_fpu_return() does nothing and just clears TIF_NEED_FPU_LOAD. Back to user space with DEFAULT_PKRU loaded. -> FAIL #2! The fix is simple: if get_xsave_addr() returns NULL then set the PKRU value to 0 instead of the restrictive default PKRU value in init_pkru_value. [ bp: Massage in minor nitpicks from folks. 
] Fixes: 0cecca9d03c9 ("x86/fpu: Eager switch PKRU state") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Acked-by: Rik van Riel Tested-by: Babu Moger Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210608144346.045616965@linutronix.de Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 3eadbd6cd4c8..568262160cc1 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -547,9 +547,16 @@ static inline void switch_fpu_finish(struct fpu *new_fpu) * return to userland e.g. for a copy_to_user() operation. */ if (!(current->flags & PF_KTHREAD)) { + /* + * If the PKRU bit in xsave.header.xfeatures is not set, + * then the PKRU component was in init state, which means + * XRSTOR will set PKRU to 0. If the bit is not set then + * get_xsave_addr() will return NULL because the PKRU value + * in memory is not valid. This means pkru_val has to be + * set to 0 and not to init_pkru_value. + */ pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU); - if (pk) - pkru_val = pk->pkru; + pkru_val = pk ? pk->pkru : 0; } __write_pkru(pkru_val); } -- Gitee From 00db18e5d6f5224bf3c544b00dbbaae035e1e8ef Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Tue, 15 Feb 2022 11:22:33 -0800 Subject: [PATCH 0136/2161] x86/fpu: Correct pkru/xstate inconsistency commit opencloudos. When eagerly switching PKRU in switch_fpu_finish() it checks that current is not a kernel thread as kernel threads will never use PKRU. It's possible that this_cpu_read_stable() on current_task (ie. get_current()) is returning an old cached value. To resolve this reference next_p directly rather than relying on current. As written it's possible when switching from a kernel thread to a userspace thread to observe a cached PF_KTHREAD flag and never restore the PKRU. And as a result this issue only occurs when switching from a kernel thread to a userspace thread, switching from a non kernel thread works perfectly fine because all that is considered in that situation are the flags from some other non kernel task and the next fpu is passed in to switch_fpu_finish(). This behavior only exists between 5.2 and 5.13 when it was fixed by a rewrite decoupling PKRU from xstate, in: commit 954436989cc5 ("x86/fpu: Remove PKRU handling from switch_fpu_finish()") Unfortunately backporting the fix from 5.13 is probably not realistic as it's part of a 60+ patch series which rewrites most of the PKRU handling. Fixes: 0cecca9d03c9 ("x86/fpu: Eager switch PKRU state") Signed-off-by: Brian Geffon Signed-off-by: Willis Kung Tested-by: Willis Kung Cc: # v5.4.x Cc: # v5.10.x Acked-by: Dave Hansen Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 13 ++++++++----- arch/x86/kernel/process_32.c | 6 ++---- arch/x86/kernel/process_64.c | 6 ++---- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 568262160cc1..4853a8f3de08 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -508,9 +508,11 @@ static inline void __fpregs_load_activate(void) * The FPU context is only stored/restored for a user task and * PF_KTHREAD is used to distinguish between kernel and user threads. 
*/ -static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) +static inline void switch_fpu_prepare(struct task_struct *prev, int cpu) { - if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) { + struct fpu *old_fpu = &prev->thread.fpu; + + if (static_cpu_has(X86_FEATURE_FPU) && !(prev->flags & PF_KTHREAD)) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else @@ -529,10 +531,11 @@ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) * Load PKRU from the FPU context if available. Delay loading of the * complete FPU state until the return to userland. */ -static inline void switch_fpu_finish(struct fpu *new_fpu) +static inline void switch_fpu_finish(struct task_struct *next) { u32 pkru_val = init_pkru_value; struct pkru_state *pk; + struct fpu *next_fpu = &next->thread.fpu; if (!static_cpu_has(X86_FEATURE_FPU)) return; @@ -546,7 +549,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu) * PKRU state is switched eagerly because it needs to be valid before we * return to userland e.g. for a copy_to_user() operation. */ - if (!(current->flags & PF_KTHREAD)) { + if (!(next->flags & PF_KTHREAD)) { /* * If the PKRU bit in xsave.header.xfeatures is not set, * then the PKRU component was in init state, which means @@ -555,7 +558,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu) * in memory is not valid. This means pkru_val has to be * set to 0 and not to init_pkru_value. */ - pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU); + pk = get_xsave_addr(&next_fpu->state.xsave, XFEATURE_PKRU); pkru_val = pk ? pk->pkru : 0; } __write_pkru(pkru_val); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 6c7d90527156..4e1972cc7cae 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -161,14 +161,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; - struct fpu *prev_fpu = &prev->fpu; - struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_prepare(prev_fpu, cpu); + switch_fpu_prepare(prev_p, cpu); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -224,7 +222,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); - switch_fpu_finish(next_fpu); + switch_fpu_finish(next_p); /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 5db6a0737e6d..a521b564e090 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -537,15 +537,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; - struct fpu *prev_fpu = &prev->fpu; - struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(irq_count) != -1); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_prepare(prev_fpu, cpu); + switch_fpu_prepare(prev_p, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). 
@@ -597,7 +595,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); - switch_fpu_finish(next_fpu); + switch_fpu_finish(next_p); /* Reload sp0. */ update_task_stack(next_p); -- Gitee From 2ff7be7eb57ab2eb3a268cdfb00d091a249a7e96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:59 +0200 Subject: [PATCH 0137/2161] x86/fpu: Rename copy_fpregs_to_fpstate() to save_fpregs_to_fpstate() commit ebe7234b08a42d69bae94c4062a84777ea26ef99 upstream. A copy is guaranteed to leave the source intact, which is not the case when FNSAVE is used as that reinitilizes the registers. Save does not make such guarantees and it matches what this is about, i.e. to save the state for a later restore. Rename it to save_fpregs_to_fpstate(). Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121454.508853062@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 4 ++-- arch/x86/kernel/fpu/core.c | 10 +++++----- arch/x86/kvm/x86.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 4853a8f3de08..652feb5ac43e 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -379,7 +379,7 @@ static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) return err; } -extern int copy_fpregs_to_fpstate(struct fpu *fpu); +extern int save_fpregs_to_fpstate(struct fpu *fpu); static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) { @@ -513,7 +513,7 @@ static inline void switch_fpu_prepare(struct task_struct *prev, int cpu) struct fpu *old_fpu = &prev->thread.fpu; if (static_cpu_has(X86_FEATURE_FPU) && !(prev->flags & PF_KTHREAD)) { - if (!copy_fpregs_to_fpstate(old_fpu)) + if (!save_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else old_fpu->last_cpu = cpu; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1d2587607a7f..dcf4d6b58180 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -92,7 +92,7 @@ EXPORT_SYMBOL(irq_fpu_usable); * Modern FPU state can be kept in registers, if there are * no pending FP exceptions. */ -int copy_fpregs_to_fpstate(struct fpu *fpu) +int save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { os_xsave(&fpu->state.xsave); @@ -119,7 +119,7 @@ int copy_fpregs_to_fpstate(struct fpu *fpu) return 0; } -EXPORT_SYMBOL(copy_fpregs_to_fpstate); +EXPORT_SYMBOL(save_fpregs_to_fpstate); void kernel_fpu_begin_mask(unsigned int kfpu_mask) { @@ -137,7 +137,7 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask) * Ignore return value -- we don't care if reg state * is clobbered. 
*/ - copy_fpregs_to_fpstate(¤t->thread.fpu); + save_fpregs_to_fpstate(¤t->thread.fpu); } __cpu_invalidate_fpregs_state(); @@ -172,7 +172,7 @@ void fpu__save(struct fpu *fpu) trace_x86_fpu_before_save(fpu); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { - if (!copy_fpregs_to_fpstate(fpu)) { + if (!save_fpregs_to_fpstate(fpu)) { copy_kernel_to_fpregs(&fpu->state); } } @@ -255,7 +255,7 @@ int fpu__copy(struct task_struct *dst, struct task_struct *src) if (test_thread_flag(TIF_NEED_FPU_LOAD)) memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size); - else if (!copy_fpregs_to_fpstate(dst_fpu)) + else if (!save_fpregs_to_fpstate(dst_fpu)) copy_kernel_to_fpregs(&dst_fpu->state); fpregs_unlock(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c43e5d018e24..0f0fb7f1e470 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8611,7 +8611,7 @@ static void kvm_save_current_fpu(struct fpu *fpu) memcpy(&fpu->state, ¤t->thread.fpu.state, fpu_kernel_xstate_size); else - copy_fpregs_to_fpstate(fpu); + save_fpregs_to_fpstate(fpu); } /* Swap (qemu) user FPU context for the guest FPU context. */ -- Gitee From 8d0c4dac20d88456ab7b49d351db6b0c76777c6c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:00 +0200 Subject: [PATCH 0138/2161] x86/fpu: Get rid of the FNSAVE optimization commit 08ded2cd18a09749e67a14426aa7fd1b04ab1dc0 upstream. The FNSAVE support requires conditionals in quite some call paths because FNSAVE reinitializes the FPU hardware. If the save has to preserve the FPU register state then the caller has to conditionally restore it from memory when FNSAVE is in use. This also requires a conditional in context switch because the restore avoidance optimization cannot work with FNSAVE. As this only affects 20+ years old CPUs there is really no reason to keep this optimization effective for FNSAVE. It's about time to not optimize for antiques anymore. Just unconditionally FRSTOR the save content to the registers and clean up the conditionals all over the place. Suggested-by: Dave Hansen Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121454.617369268@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 18 ++++++---- arch/x86/kernel/fpu/core.c | 54 ++++++++++++----------------- 2 files changed, 34 insertions(+), 38 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 652feb5ac43e..c6bc90bd1208 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -82,6 +82,7 @@ extern void fpstate_init_soft(struct swregs_state *soft); #else static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif +extern void save_fpregs_to_fpstate(struct fpu *fpu); /* Returns 0 or the negated trap number, which results in -EFAULT for #PF */ #define user_insn(insn, output, input...) 
\ @@ -379,8 +380,6 @@ static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) return err; } -extern int save_fpregs_to_fpstate(struct fpu *fpu); - static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) { if (use_xsave()) { @@ -513,12 +512,17 @@ static inline void switch_fpu_prepare(struct task_struct *prev, int cpu) struct fpu *old_fpu = &prev->thread.fpu; if (static_cpu_has(X86_FEATURE_FPU) && !(prev->flags & PF_KTHREAD)) { - if (!save_fpregs_to_fpstate(old_fpu)) - old_fpu->last_cpu = -1; - else - old_fpu->last_cpu = cpu; + save_fpregs_to_fpstate(old_fpu); + /* + * The save operation preserved register state, so the + * fpu_fpregs_owner_ctx is still @old_fpu. Store the + * current CPU number in @old_fpu, so the next return + * to user space can avoid the FPU register restore + * when is returns on the same CPU and still owns the + * context. + */ + old_fpu->last_cpu = cpu; - /* But leave fpu_fpregs_owner_ctx! */ trace_x86_fpu_regs_deactivated(old_fpu); } } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index dcf4d6b58180..c290ba27ffef 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -83,16 +83,20 @@ bool irq_fpu_usable(void) EXPORT_SYMBOL(irq_fpu_usable); /* - * These must be called with preempt disabled. Returns - * 'true' if the FPU state is still intact and we can - * keep registers active. + * Save the FPU register state in fpu->state. The register state is + * preserved. * - * The legacy FNSAVE instruction cleared all FPU state - * unconditionally, so registers are essentially destroyed. - * Modern FPU state can be kept in registers, if there are - * no pending FP exceptions. + * Must be called with fpregs_lock() held. + * + * The legacy FNSAVE instruction clears all FPU state unconditionally, so + * register state has to be reloaded. That might be a pointless exercise + * when the FPU is going to be used by another task right after that. But + * this only affects 20+ years old 32bit systems and avoids conditionals all + * over the place. + * + * FXSAVE and all XSAVE variants preserve the FPU register state. */ -int save_fpregs_to_fpstate(struct fpu *fpu) +void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { os_xsave(&fpu->state.xsave); @@ -103,21 +107,20 @@ int save_fpregs_to_fpstate(struct fpu *fpu) */ if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) fpu->avx512_timestamp = jiffies; - return 1; + return; } if (likely(use_fxsr())) { fxsave(&fpu->state.fxsave); - return 1; + return; } /* * Legacy FPU register saving, FNSAVE always clears FPU registers, - * so we have to mark them inactive: + * so we have to reload them from the memory state. */ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); - - return 0; + frstor(&fpu->state.fsave); } EXPORT_SYMBOL(save_fpregs_to_fpstate); @@ -133,10 +136,6 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask) if (!(current->flags & PF_KTHREAD) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); - /* - * Ignore return value -- we don't care if reg state - * is clobbered. 
- */ save_fpregs_to_fpstate(¤t->thread.fpu); } __cpu_invalidate_fpregs_state(); @@ -171,11 +170,8 @@ void fpu__save(struct fpu *fpu) fpregs_lock(); trace_x86_fpu_before_save(fpu); - if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { - if (!save_fpregs_to_fpstate(fpu)) { - copy_kernel_to_fpregs(&fpu->state); - } - } + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + save_fpregs_to_fpstate(fpu); trace_x86_fpu_after_save(fpu); fpregs_unlock(); @@ -244,20 +240,16 @@ int fpu__copy(struct task_struct *dst, struct task_struct *src) memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); /* - * If the FPU registers are not current just memcpy() the state. - * Otherwise save current FPU registers directly into the child's FPU - * context, without any memory-to-memory copying. - * - * ( The function 'fails' in the FNSAVE case, which destroys - * register contents so we have to load them back. ) + * If the FPU registers are not owned by current just memcpy() the + * state. Otherwise save the FPU registers directly into the + * child's FPU context, without any memory-to-memory copying. */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size); - else if (!save_fpregs_to_fpstate(dst_fpu)) - copy_kernel_to_fpregs(&dst_fpu->state); - + else + save_fpregs_to_fpstate(dst_fpu); fpregs_unlock(); set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); -- Gitee From 070efcbae1a85a3cf82b530300a730bb0caf1292 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:01 +0200 Subject: [PATCH 0139/2161] x86/fpu: Rename copy_kernel_to_fpregs() to restore_fpregs_from_fpstate() commit 1c61fada304c125c3f8a2b8eb1896406e4098a05 upstream. This is not a copy functionality. It restores the register state from the supplied kernel buffer. No functional changes. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121454.716058365@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 8 ++++---- arch/x86/kvm/x86.c | 4 ++-- arch/x86/mm/extable.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index c6bc90bd1208..e5147ec5b138 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -380,7 +380,7 @@ static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) return err; } -static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) +static inline void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) { if (use_xsave()) { os_xrstor(&fpstate->xsave, mask); @@ -392,7 +392,7 @@ static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask } } -static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) +static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate) { /* * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is @@ -407,7 +407,7 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) : : [addr] "m" (fpstate)); } - __copy_kernel_to_fpregs(fpstate, -1); + __restore_fpregs_from_fpstate(fpstate, -1); } extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); @@ -478,7 +478,7 @@ static inline void __fpregs_load_activate(void) return; if (!fpregs_state_valid(fpu, cpu)) { - copy_kernel_to_fpregs(&fpu->state); + restore_fpregs_from_fpstate(&fpu->state); fpregs_activate(fpu); fpu->last_cpu = cpu; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0f0fb7f1e470..608cf94d1d5b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8622,7 +8622,7 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) kvm_save_current_fpu(vcpu->arch.user_fpu); /* PKRU is separately restored in kvm_x86_ops->run. */ - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, + __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state, ~XFEATURE_MASK_PKRU); fpregs_mark_activate(); @@ -8638,7 +8638,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) kvm_save_current_fpu(vcpu->arch.guest_fpu); - copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state); + restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state); fpregs_mark_activate(); fpregs_unlock(); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 3626bae87d40..de0b5158563d 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -113,7 +113,7 @@ __visible bool ex_handler_fprestore(const struct exception_table_entry *fixup, WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", (void *)instruction_pointer(regs)); - __copy_kernel_to_fpregs(&init_fpstate, -1); + __restore_fpregs_from_fpstate(&init_fpstate, -1); return true; } EXPORT_SYMBOL_GPL(ex_handler_fprestore); -- Gitee From 336419ba248d72bc2647b1fbb803320dc1b2d94b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:02 +0200 Subject: [PATCH 0140/2161] x86/fpu: Rename initstate copy functions commit b76411b1b568311bfd89d03acc587ffc1548c26f upstream. Again this not a copy. It's restoring register state from kernel memory. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121454.816581630@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c290ba27ffef..4a59e0fbcfd8 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -303,7 +303,7 @@ void fpu__drop(struct fpu *fpu) * Clear FPU registers by setting them up from the init fpstate. * Caller must do fpregs_[un]lock() around it. */ -static inline void copy_init_fpstate_to_fpregs(u64 features_mask) +static inline void restore_fpregs_from_init_fpstate(u64 features_mask) { if (use_xsave()) os_xrstor(&init_fpstate.xsave, features_mask); @@ -338,9 +338,9 @@ static void fpu__clear(struct fpu *fpu, bool user_only) if (!fpregs_state_valid(fpu, smp_processor_id()) && xfeatures_mask_supervisor()) os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); - copy_init_fpstate_to_fpregs(xfeatures_mask_user()); + restore_fpregs_from_init_fpstate(xfeatures_mask_user()); } else { - copy_init_fpstate_to_fpregs(xfeatures_mask_all); + restore_fpregs_from_init_fpstate(xfeatures_mask_all); } fpregs_mark_activate(); -- Gitee From 62e71083a037b4f88f1714579e43afef22af050c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 23 Jun 2021 14:02:03 +0200 Subject: [PATCH 0141/2161] x86/fpu: Rename "dynamic" XSTATEs to "independent" commit 01707b66535872f7a0d87f66078fd018d1814be0 upstream. The salient feature of "dynamic" XSTATEs is that they are not part of the main task XSTATE buffer. The fact that they are dynamically allocated is irrelevant and will become quite confusing when user math XSTATEs start being dynamically allocated. Rename them to "independent" because they are independent of the main XSTATE code. This is just a search-and-replace with some whitespace updates to keep things aligned. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/1eecb0e4f3e07828ebe5d737ec77dc3b708fad2d.1623388344.git.luto@kernel.org Link: https://lkml.kernel.org/r/20210623121454.911450390@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 22 +++++------ arch/x86/kernel/fpu/xstate.c | 62 +++++++++++++++---------------- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index e92c820d3d44..6e158305e453 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -42,21 +42,21 @@ * and its size may be huge. Saving/restoring such supervisor state components * at each context switch can cause high CPU and space overhead, which should * be avoided. Such supervisor state components should only be saved/restored - * on demand. The on-demand dynamic supervisor features are set in this mask. + * on demand. The on-demand supervisor features are set in this mask. * - * Unlike the existing supported supervisor features, a dynamic supervisor + * Unlike the existing supported supervisor features, an independent supervisor * feature does not allocate a buffer in task->fpu, and the corresponding * supervisor state component cannot be saved/restored at each context switch. 
* - * To support a dynamic supervisor feature, a developer should follow the + * To support an independent supervisor feature, a developer should follow the * dos and don'ts as below: * - Do dynamically allocate a buffer for the supervisor state component. * - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the * state component to/from the buffer. - * - Don't set the bit corresponding to the dynamic supervisor feature in + * - Don't set the bit corresponding to the independent supervisor feature in * IA32_XSS at run time, since it has been set at boot time. */ -#define XFEATURE_MASK_DYNAMIC (XFEATURE_MASK_LBR) +#define XFEATURE_MASK_INDEPENDENT (XFEATURE_MASK_LBR) /* * Unsupported supervisor features. When a supervisor feature in this mask is @@ -66,7 +66,7 @@ /* All supervisor states including supported and unsupported states. */ #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ - XFEATURE_MASK_DYNAMIC | \ + XFEATURE_MASK_INDEPENDENT | \ XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) #ifdef CONFIG_X86_64 @@ -87,12 +87,12 @@ static inline u64 xfeatures_mask_user(void) return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED; } -static inline u64 xfeatures_mask_dynamic(void) +static inline u64 xfeatures_mask_independent(void) { if (!boot_cpu_has(X86_FEATURE_ARCH_LBR)) - return XFEATURE_MASK_DYNAMIC & ~XFEATURE_MASK_LBR; + return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; - return XFEATURE_MASK_DYNAMIC; + return XFEATURE_MASK_INDEPENDENT; } extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; @@ -105,8 +105,8 @@ const void *get_xsave_field_ptr(int xfeature_nr); int xfeature_size(int xfeature_nr); int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); -void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); -void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask); +void copy_independent_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); +void copy_kernel_to_independent_supervisor(struct xregs_state *xstate, u64 mask); enum xstate_copy_mode { XSTATE_COPY_FP, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index dfea31d4a009..63e6182ec0b7 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -151,7 +151,7 @@ void fpu__init_cpu_xstate(void) */ if (boot_cpu_has(X86_FEATURE_XSAVES)) { wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | - xfeatures_mask_dynamic()); + xfeatures_mask_independent()); } } @@ -548,7 +548,7 @@ static void check_xstate_against_struct(int nr) * how large the XSAVE buffer needs to be. We are recalculating * it to be safe. * - * Dynamic XSAVE features allocate their own buffers and are not + * Independent XSAVE features allocate their own buffers and are not * covered by these checks. Only the size of the buffer for task->fpu * is checked here. */ @@ -614,18 +614,18 @@ static unsigned int __init get_xsaves_size(void) } /* - * Get the total size of the enabled xstates without the dynamic supervisor + * Get the total size of the enabled xstates without the independent supervisor * features. */ -static unsigned int __init get_xsaves_size_no_dynamic(void) +static unsigned int __init get_xsaves_size_no_independent(void) { - u64 mask = xfeatures_mask_dynamic(); + u64 mask = xfeatures_mask_independent(); unsigned int size; if (!mask) return get_xsaves_size(); - /* Disable dynamic features. 
*/ + /* Disable independent features. */ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); /* @@ -634,7 +634,7 @@ static unsigned int __init get_xsaves_size_no_dynamic(void) */ size = get_xsaves_size(); - /* Re-enable dynamic features so XSAVES will work on them again. */ + /* Re-enable independent features so XSAVES will work on them again. */ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); return size; @@ -677,7 +677,7 @@ static int __init init_xstate_size(void) xsave_size = get_xsave_size(); if (boot_cpu_has(X86_FEATURE_XSAVES)) - possible_xstate_size = get_xsaves_size_no_dynamic(); + possible_xstate_size = get_xsaves_size_no_independent(); else possible_xstate_size = xsave_size; @@ -834,7 +834,7 @@ void fpu__resume_cpu(void) */ if (boot_cpu_has(X86_FEATURE_XSAVES)) { wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | - xfeatures_mask_dynamic()); + xfeatures_mask_independent()); } } @@ -1190,34 +1190,34 @@ int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, } /** - * copy_dynamic_supervisor_to_kernel() - Save dynamic supervisor states to - * an xsave area + * copy_independent_supervisor_to_kernel() - Save independent supervisor states to + * an xsave area * @xstate: A pointer to an xsave area - * @mask: Represent the dynamic supervisor features saved into the xsave area + * @mask: Represent the independent supervisor features saved into the xsave area * - * Only the dynamic supervisor states sets in the mask are saved into the xsave - * area (See the comment in XFEATURE_MASK_DYNAMIC for the details of dynamic - * supervisor feature). Besides the dynamic supervisor states, the legacy + * Only the independent supervisor states sets in the mask are saved into the xsave + * area (See the comment in XFEATURE_MASK_INDEPENDENT for the details of independent + * supervisor feature). Besides the independent supervisor states, the legacy * region and XSAVE header are also saved into the xsave area. The supervisor * features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not saved. * * The xsave area must be 64-bytes aligned. */ -void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask) +void copy_independent_supervisor_to_kernel(struct xregs_state *xstate, u64 mask) { - u64 dynamic_mask = xfeatures_mask_dynamic() & mask; + u64 independent_mask = xfeatures_mask_independent() & mask; u32 lmask, hmask; int err; if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) return; - if (WARN_ON_FPU(!dynamic_mask)) + if (WARN_ON_FPU(!independent_mask)) return; - lmask = dynamic_mask; - hmask = dynamic_mask >> 32; + lmask = independent_mask; + hmask = independent_mask >> 32; XSTATE_OP(XSAVES, xstate, lmask, hmask, err); @@ -1226,34 +1226,34 @@ void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask) } /** - * copy_kernel_to_dynamic_supervisor() - Restore dynamic supervisor states from - * an xsave area + * copy_kernel_to_independent_supervisor() - Restore independent supervisor states from + * an xsave area * @xstate: A pointer to an xsave area - * @mask: Represent the dynamic supervisor features restored from the xsave area + * @mask: Represent the independent supervisor features restored from the xsave area * - * Only the dynamic supervisor states sets in the mask are restored from the - * xsave area (See the comment in XFEATURE_MASK_DYNAMIC for the details of - * dynamic supervisor feature). 
Besides the dynamic supervisor states, the + * Only the independent supervisor states sets in the mask are restored from the + * xsave area (See the comment in XFEATURE_MASK_INDEPENDENT for the details of + * independent supervisor feature). Besides the independent supervisor states, the * legacy region and XSAVE header are also restored from the xsave area. The * supervisor features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not restored. * * The xsave area must be 64-bytes aligned. */ -void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask) +void copy_kernel_to_independent_supervisor(struct xregs_state *xstate, u64 mask) { - u64 dynamic_mask = xfeatures_mask_dynamic() & mask; + u64 independent_mask = xfeatures_mask_independent() & mask; u32 lmask, hmask; int err; if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) return; - if (WARN_ON_FPU(!dynamic_mask)) + if (WARN_ON_FPU(!independent_mask)) return; - lmask = dynamic_mask; - hmask = dynamic_mask >> 32; + lmask = independent_mask; + hmask = independent_mask >> 32; XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); -- Gitee From 9a8cddd77821564fc06bb8b6602a1b9a03dc5135 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:04 +0200 Subject: [PATCH 0142/2161] x86/fpu/xstate: Sanitize handling of independent features commit a75c52896b6d42d6600db4d4dd9f7e4bde9218db upstream. The copy functions for the independent features are horribly named and the supervisor and independent part is just overengineered. The point is that the supplied mask has either to be a subset of the independent features or a subset of the task->fpu.xstate managed features. Rewrite it so it checks for invalid overlaps of these areas in the caller supplied feature mask. Rename it so it follows the new naming convention for these operations. Mop up the function documentation. This allows to use that function for other purposes as well. 
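The rewritten check is easy to see in isolation: the caller-supplied mask must be non-empty and must fall entirely inside either the independent feature set or the task-fpstate-managed set, never straddle both. A standalone sketch with placeholder masks (the constants below are illustrative stand-ins, not the kernel's xfeatures_mask_*() values) behaves like the validate_xsaves_xrstors() added in the hunk below:

/* Hedged sketch of the overlap check; masks are placeholders, not kernel values. */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool mask_is_valid(uint64_t mask, uint64_t independent, uint64_t fpstate_managed)
{
        /* Pick the set the mask appears to target, then reject anything outside it. */
        uint64_t outside = (mask & independent) ? ~independent : ~fpstate_managed;

        return mask && !(mask & outside);
}

int main(void)
{
        const uint64_t lbr  = 1ull << 15;       /* stand-in for the independent LBR component */
        const uint64_t task = 0x2ff;            /* stand-in for the task-managed feature mask */

        assert(mask_is_valid(lbr, lbr, task));          /* purely independent: ok   */
        assert(mask_is_valid(0x3, lbr, task));          /* purely task-managed: ok  */
        assert(!mask_is_valid(lbr | 0x3, lbr, task));   /* straddles both: rejected */
        assert(!mask_is_valid(0, lbr, task));           /* empty mask: rejected     */
        return 0;
}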
Suggested-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Tested-by: Kan Liang Link: https://lkml.kernel.org/r/20210623121455.004880675@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 5 +- arch/x86/kernel/fpu/xstate.c | 98 +++++++++++++++---------------- 2 files changed, 51 insertions(+), 52 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 6e158305e453..9588059a1fbe 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -105,8 +105,9 @@ const void *get_xsave_field_ptr(int xfeature_nr); int xfeature_size(int xfeature_nr); int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); -void copy_independent_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); -void copy_kernel_to_independent_supervisor(struct xregs_state *xstate, u64 mask); + +void xsaves(struct xregs_state *xsave, u64 mask); +void xrstors(struct xregs_state *xsave, u64 mask); enum xstate_copy_mode { XSTATE_COPY_FP, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 63e6182ec0b7..a59d6d7bc3af 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1189,76 +1189,74 @@ int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, return copy_uabi_to_xstate(xsave, NULL, ubuf); } +static bool validate_xsaves_xrstors(u64 mask) +{ + u64 xchk; + + if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) + return false; + /* + * Validate that this is either a task->fpstate related component + * subset or an independent one. + */ + if (mask & xfeatures_mask_independent()) + xchk = ~xfeatures_mask_independent(); + else + xchk = ~xfeatures_mask_all; + + if (WARN_ON_ONCE(!mask || mask & xchk)) + return false; + + return true; +} + /** - * copy_independent_supervisor_to_kernel() - Save independent supervisor states to - * an xsave area - * @xstate: A pointer to an xsave area - * @mask: Represent the independent supervisor features saved into the xsave area + * xsaves - Save selected components to a kernel xstate buffer + * @xstate: Pointer to the buffer + * @mask: Feature mask to select the components to save * - * Only the independent supervisor states sets in the mask are saved into the xsave - * area (See the comment in XFEATURE_MASK_INDEPENDENT for the details of independent - * supervisor feature). Besides the independent supervisor states, the legacy - * region and XSAVE header are also saved into the xsave area. The supervisor - * features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and - * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not saved. + * The @xstate buffer must be 64 byte aligned and correctly initialized as + * XSAVES does not write the full xstate header. Before first use the + * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer + * can #GP. * - * The xsave area must be 64-bytes aligned. + * The feature mask must either be a subset of the independent features or + * a subset of the task->fpstate related features. 
*/ -void copy_independent_supervisor_to_kernel(struct xregs_state *xstate, u64 mask) +void xsaves(struct xregs_state *xstate, u64 mask) { - u64 independent_mask = xfeatures_mask_independent() & mask; - u32 lmask, hmask; int err; - if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) + if (!validate_xsaves_xrstors(mask)) return; - if (WARN_ON_FPU(!independent_mask)) - return; - - lmask = independent_mask; - hmask = independent_mask >> 32; - - XSTATE_OP(XSAVES, xstate, lmask, hmask, err); - - /* Should never fault when copying to a kernel buffer */ - WARN_ON_FPU(err); + XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); + WARN_ON_ONCE(err); } /** - * copy_kernel_to_independent_supervisor() - Restore independent supervisor states from - * an xsave area - * @xstate: A pointer to an xsave area - * @mask: Represent the independent supervisor features restored from the xsave area + * xrstors - Restore selected components from a kernel xstate buffer + * @xstate: Pointer to the buffer + * @mask: Feature mask to select the components to restore + * + * The @xstate buffer must be 64 byte aligned and correctly initialized + * otherwise XRSTORS from that buffer can #GP. * - * Only the independent supervisor states sets in the mask are restored from the - * xsave area (See the comment in XFEATURE_MASK_INDEPENDENT for the details of - * independent supervisor feature). Besides the independent supervisor states, the - * legacy region and XSAVE header are also restored from the xsave area. The - * supervisor features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and - * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not restored. + * Proper usage is to restore the state which was saved with + * xsaves() into @xstate. * - * The xsave area must be 64-bytes aligned. + * The feature mask must either be a subset of the independent features or + * a subset of the task->fpstate related features. */ -void copy_kernel_to_independent_supervisor(struct xregs_state *xstate, u64 mask) +void xrstors(struct xregs_state *xstate, u64 mask) { - u64 independent_mask = xfeatures_mask_independent() & mask; - u32 lmask, hmask; int err; - if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) + if (!validate_xsaves_xrstors(mask)) return; - if (WARN_ON_FPU(!independent_mask)) - return; - - lmask = independent_mask; - hmask = independent_mask >> 32; - - XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); - - /* Should never fault when copying from a kernel buffer */ - WARN_ON_FPU(err); + XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); + WARN_ON_ONCE(err); } #ifdef CONFIG_PROC_PID_ARCH_STATUS -- Gitee From 35b690add9bb250788fe4b28e28d1613e5a6ac12 Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Thu, 16 Dec 2021 00:08:56 +0000 Subject: [PATCH 0143/2161] x86/pkey: Fix undefined behaviour with PKRU_WD_BIT commit 57690554abe135fee81d6ac33cc94d75a7e224bb upstream. Both __pkru_allows_write() and arch_set_user_pkey_access() shift PKRU_WD_BIT (a signed constant) by up to 30 bits, hitting the sign bit. Use unsigned constants instead. Clearly pkey 15 has not been used in combination with UBSAN yet. Noticed by code inspection only. I can't actually provoke the compiler into generating incorrect logic as far as this shift is concerned. [ dhansen: add stable@ tag, plus minor changelog massaging, For anyone doing backports, these #defines were in arch/x86/include/asm/pgtable.h before 784a46618f6. 
] Fixes: 33a709b25a76 ("mm/gup, x86/mm/pkeys: Check VMAs and PTEs for protection keys") Signed-off-by: Andrew Cooper Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20211216000856.4480-1-andrew.cooper3@citrix.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- arch/x86/include/asm/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index ea85f23d9e22..1ae7c20f5469 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1375,8 +1375,8 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) #endif #endif -#define PKRU_AD_BIT 0x1 -#define PKRU_WD_BIT 0x2 +#define PKRU_AD_BIT 0x1u +#define PKRU_WD_BIT 0x2u #define PKRU_BITS_PER_PKEY 2 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -- Gitee From d4bae646f60ca2cb657e5ca84b5a0aa33769ef4c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 23 Jun 2021 14:02:05 +0200 Subject: [PATCH 0144/2161] x86/pkeys: Move read_pkru() and write_pkru() commit 784a46618f634973a17535b7d3d03cd4ebc0ccbd upstream. write_pkru() was originally used just to write to the PKRU register. It was mercifully short and sweet and was not out of place in pgtable.h with some other pkey-related code. But, later work included a requirement to also modify the task XSAVE buffer when updating the register. This really is more related to the XSAVE architecture than to paging. Move the read/write_pkru() to asm/pkru.h. pgtable.h won't miss them. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121455.102647114@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 1 + arch/x86/include/asm/pgtable.h | 57 +--------------------------- arch/x86/include/asm/pkru.h | 62 +++++++++++++++++++++++++++++++ arch/x86/kernel/process_64.c | 1 + arch/x86/kvm/x86.c | 1 + arch/x86/mm/pkeys.c | 1 + 6 files changed, 67 insertions(+), 56 deletions(-) create mode 100644 arch/x86/include/asm/pkru.h diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 9588059a1fbe..879f2c19cd6b 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -6,6 +6,7 @@ #include #include +#include #include /* Bit 63 of XCR0 is reserved for future expansion */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1ae7c20f5469..515787ace19d 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -23,7 +23,7 @@ #ifndef __ASSEMBLY__ #include -#include +#include #include extern pgd_t early_top_pgt[PTRS_PER_PGD]; @@ -125,35 +125,6 @@ static inline int pte_dirty(pte_t pte) return pte_flags(pte) & _PAGE_DIRTY; } - -static inline u32 read_pkru(void) -{ - if (boot_cpu_has(X86_FEATURE_OSPKE)) - return rdpkru(); - return 0; -} - -static inline void write_pkru(u32 pkru) -{ - struct pkru_state *pk; - - if (!boot_cpu_has(X86_FEATURE_OSPKE)) - return; - - pk = get_xsave_addr(¤t->thread.fpu.state.xsave, XFEATURE_PKRU); - - /* - * The PKRU value in xstate needs to be in sync with the value that is - * written to the CPU. The FPU restore on return to userland would - * otherwise load the previous value again. 
- */ - fpregs_lock(); - if (pk) - pk->pkru = pkru; - __write_pkru(pkru); - fpregs_unlock(); -} - static inline int pte_young(pte_t pte) { return pte_flags(pte) & _PAGE_ACCESSED; @@ -1375,32 +1346,6 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) #endif #endif -#define PKRU_AD_BIT 0x1u -#define PKRU_WD_BIT 0x2u -#define PKRU_BITS_PER_PKEY 2 - -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -extern u32 init_pkru_value; -#else -#define init_pkru_value 0 -#endif - -static inline bool __pkru_allows_read(u32 pkru, u16 pkey) -{ - int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; - return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits)); -} - -static inline bool __pkru_allows_write(u32 pkru, u16 pkey) -{ - int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; - /* - * Access-disable disables writes too so we need to check - * both bits here. - */ - return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits)); -} - static inline u16 pte_flags_pkey(unsigned long pte_flags) { #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h new file mode 100644 index 000000000000..02393db3aa95 --- /dev/null +++ b/arch/x86/include/asm/pkru.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_PKRU_H +#define _ASM_X86_PKRU_H + +#include + +#define PKRU_AD_BIT 0x1u +#define PKRU_WD_BIT 0x2u +#define PKRU_BITS_PER_PKEY 2 + +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +extern u32 init_pkru_value; +#else +#define init_pkru_value 0 +#endif + +static inline bool __pkru_allows_read(u32 pkru, u16 pkey) +{ + int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; + return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits)); +} + +static inline bool __pkru_allows_write(u32 pkru, u16 pkey) +{ + int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; + /* + * Access-disable disables writes too so we need to check + * both bits here. + */ + return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits)); +} + + +static inline u32 read_pkru(void) +{ + if (boot_cpu_has(X86_FEATURE_OSPKE)) + return rdpkru(); + return 0; +} + +static inline void write_pkru(u32 pkru) +{ + struct pkru_state *pk; + + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return; + + pk = get_xsave_addr(¤t->thread.fpu.state.xsave, XFEATURE_PKRU); + + /* + * The PKRU value in xstate needs to be in sync with the value that is + * written to the CPU. The FPU restore on return to userland would + * otherwise load the previous value again. + */ + fpregs_lock(); + if (pk) + pk->pkru = pkru; + __write_pkru(pkru); + fpregs_unlock(); +} + +#endif diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a521b564e090..704e7aaafc30 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -42,6 +42,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 608cf94d1d5b..ceea868d0981 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include /* Ugh! */ #include diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index ca77af96033b..5d0d571ab4ad 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -10,6 +10,7 @@ #include /* boot_cpu_has, ... 
*/ #include /* vma_pkey() */ +#include /* read/write_pkru() */ int __execute_only_pkey(struct mm_struct *mm) { -- Gitee From be177183dfc120c4610a39e8ceab44fb8b29a5c8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:06 +0200 Subject: [PATCH 0145/2161] x86/fpu: Rename and sanitize fpu__save/copy() commit b2681e791dbcee6acb1dca7a5076a0285109ac4c upstream. Both function names are a misnomer. fpu__save() is actually about synchronizing the hardware register state into the task's memory state so that either coredump or a math exception handler can inspect the state at the time where the problem happens. The function guarantees to preserve the register state, while "save" is a common terminology for saving the current state so it can be modified and restored later. This is clearly not the case here. Rename it to fpu_sync_fpstate(). fpu__copy() is used to clone the current task's FPU state when duplicating task_struct. While the register state is a copy the rest of the FPU state is not. Name it accordingly and remove the really pointless @src argument along with the warning which comes along with it. Nothing can ever copy the FPU state of a non-current task. It's clearly just a consequence of arch_dup_task_struct(), but it makes no sense to proliferate that further. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121455.196727450@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 6 ++++-- arch/x86/kernel/fpu/core.c | 17 ++++++++--------- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/process.c | 3 +-- arch/x86/kernel/traps.c | 5 +++-- 5 files changed, 17 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index e5147ec5b138..9f31f1541eb1 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -25,14 +25,16 @@ /* * High level FPU state handling functions: */ -extern void fpu__save(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); -extern int fpu__copy(struct task_struct *dst, struct task_struct *src); extern void fpu__clear_user_states(struct fpu *fpu); extern void fpu__clear_all(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); +extern void fpu_sync_fpstate(struct fpu *fpu); + +extern int fpu_clone(struct task_struct *dst); + /* * Boot time FPU initialization functions: */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 4a59e0fbcfd8..8762b1a8966a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -159,11 +159,10 @@ void kernel_fpu_end(void) EXPORT_SYMBOL_GPL(kernel_fpu_end); /* - * Save the FPU state (mark it for reload if necessary): - * - * This only ever gets called for the current task. + * Sync the FPU register state to current's memory register state when the + * current task owns the FPU. The hardware register state is preserved. 
*/ -void fpu__save(struct fpu *fpu) +void fpu_sync_fpstate(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); @@ -221,18 +220,18 @@ void fpstate_init(union fpregs_state *state) } EXPORT_SYMBOL_GPL(fpstate_init); -int fpu__copy(struct task_struct *dst, struct task_struct *src) +/* Clone current's FPU state on fork */ +int fpu_clone(struct task_struct *dst) { + struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; - struct fpu *src_fpu = &src->thread.fpu; + /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; - if (!static_cpu_has(X86_FEATURE_FPU)) + if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; - WARN_ON_FPU(src_fpu != ¤t->thread.fpu); - /* * Don't let 'init optimized' areas of the XSAVE area * leak into the child task: diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 36d0a68d361f..b4be376da5a1 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -41,7 +41,7 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r static void sync_fpstate(struct fpu *fpu) { if (fpu == ¤t->thread.fpu) - fpu__save(fpu); + fpu_sync_fpstate(fpu); } /* diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5db6ee4894e5..24131a417e0b 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -100,8 +100,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif - - return fpu__copy(dst, src); + return fpu_clone(dst); } /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 18648baeccf7..f1fe995309a4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -767,9 +767,10 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) } /* - * Save the info for the exception handler and clear the error. + * Synchronize the FPU register state to the memory register state + * if necessary. This allows the exception handler to inspect it. */ - fpu__save(fpu); + fpu_sync_fpstate(fpu); task->thread.trap_nr = trapnr; task->thread.error_code = error_code; -- Gitee From 482dda64f74dccc26ab3b39b3fa5fe4d657cb416 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 2 Nov 2021 16:13:52 +0800 Subject: [PATCH 0146/2161] x86/fpu: Change fpu__save() to fpu_sync_fpstate() called in get_xsave_field_ptr() commit opencloudos. Since fpu__save() is renamed to fpu_sync_fpstate() in "x86/fpu: Rename and sanitize fpu__save/copy()", here exchange it. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a59d6d7bc3af..fa1620e0eca3 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -927,7 +927,7 @@ const void *get_xsave_field_ptr(int xfeature_nr) * fpu__save() takes the CPU's xstate registers * and saves them off to the 'fpu memory buffer. */ - fpu__save(fpu); + fpu_sync_fpstate(fpu); return get_xsave_addr(&fpu->state.xsave, xfeature_nr); } -- Gitee From d7deb63407321c7e1ef18b4739487fe076c42b57 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:07 +0200 Subject: [PATCH 0147/2161] x86/cpu: Sanitize X86_FEATURE_OSPKE commit 8a1dc55a3f3ef0a723c3c117a567e7b5dd2c1793 upstream. X86_FEATURE_OSPKE is enabled first on the boot CPU and the feature flag is set. Secondary CPUs have to enable CR4.PKE as well and set their per CPU feature flag. 
That's ineffective because all call sites have checks for boot_cpu_data. Make it smarter and force the feature flag when PKU is enabled on the boot cpu which allows then to use cpu_feature_enabled(X86_FEATURE_OSPKE) all over the place. That either compiles the code out when PKEY support is disabled in Kconfig or uses a static_cpu_has() for the feature check which makes a significant difference in hotpaths, e.g. context switch. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121455.305113644@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/pkeys.h | 8 ++++---- arch/x86/include/asm/pkru.h | 4 ++-- arch/x86/kernel/cpu/common.c | 24 +++++++++++------------- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/xstate.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/mm/fault.c | 2 +- 7 files changed, 21 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 2ff9b98812b7..4128f647c755 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -9,14 +9,14 @@ * will be necessary to ensure that the types that store key * numbers and masks have sufficient capacity. */ -#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1) +#define arch_max_pkey() (cpu_feature_enabled(X86_FEATURE_OSPKE) ? 16 : 1) extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); static inline bool arch_pkeys_enabled(void) { - return boot_cpu_has(X86_FEATURE_OSPKE); + return cpu_feature_enabled(X86_FEATURE_OSPKE); } /* @@ -26,7 +26,7 @@ static inline bool arch_pkeys_enabled(void) extern int __execute_only_pkey(struct mm_struct *mm); static inline int execute_only_pkey(struct mm_struct *mm) { - if (!boot_cpu_has(X86_FEATURE_OSPKE)) + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return ARCH_DEFAULT_PKEY; return __execute_only_pkey(mm); @@ -37,7 +37,7 @@ extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma, static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey) { - if (!boot_cpu_has(X86_FEATURE_OSPKE)) + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return 0; return __arch_override_mprotect_pkey(vma, prot, pkey); diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h index 02393db3aa95..875ef03d228f 100644 --- a/arch/x86/include/asm/pkru.h +++ b/arch/x86/include/asm/pkru.h @@ -33,7 +33,7 @@ static inline bool __pkru_allows_write(u32 pkru, u16 pkey) static inline u32 read_pkru(void) { - if (boot_cpu_has(X86_FEATURE_OSPKE)) + if (cpu_feature_enabled(X86_FEATURE_OSPKE)) return rdpkru(); return 0; } @@ -42,7 +42,7 @@ static inline void write_pkru(u32 pkru) { struct pkru_state *pk; - if (!boot_cpu_has(X86_FEATURE_OSPKE)) + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return; pk = get_xsave_addr(¤t->thread.fpu.state.xsave, XFEATURE_PKRU); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 49e40b5fb195..25f766ee8c68 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -465,22 +465,20 @@ static bool pku_disabled; static __always_inline void setup_pku(struct cpuinfo_x86 *c) { - /* check the boot processor, plus compile options for PKU: */ - if (!cpu_feature_enabled(X86_FEATURE_PKU)) - return; - /* checks the actual processor's cpuid bits: */ - if (!cpu_has(c, X86_FEATURE_PKU)) - return; - if (pku_disabled) + if (c == &boot_cpu_data) { + if (pku_disabled || 
!cpu_feature_enabled(X86_FEATURE_PKU)) + return; + /* + * Setting CR4.PKE will cause the X86_FEATURE_OSPKE cpuid + * bit to be set. Enforce it. + */ + setup_force_cpu_cap(X86_FEATURE_OSPKE); + + } else if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) { return; + } cr4_set_bits(X86_CR4_PKE); - /* - * Setting X86_CR4_PKE will cause the X86_FEATURE_OSPKE - * cpuid bit to be set. We need to ensure that we - * update that bit in this CPU's "cpu_info". - */ - set_cpu_cap(c, X86_FEATURE_OSPKE); } #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 8762b1a8966a..3866954354a4 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -311,7 +311,7 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) else frstor(&init_fpstate.fsave); - if (boot_cpu_has(X86_FEATURE_OSPKE)) + if (cpu_feature_enabled(X86_FEATURE_OSPKE)) copy_init_pkru_to_fpregs(); } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index fa1620e0eca3..64193bf65038 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -948,7 +948,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, * This check implies XSAVE support. OSPKE only gets * set if we enable XSAVE and we enable PKU in XCR0. */ - if (!boot_cpu_has(X86_FEATURE_OSPKE)) + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return -EINVAL; /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 704e7aaafc30..befd6174467c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -138,7 +138,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) d3, d6, d7); } - if (boot_cpu_has(X86_FEATURE_OSPKE)) + if (cpu_feature_enabled(X86_FEATURE_OSPKE)) printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru()); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 61562a6c310c..b1cfb26d06c7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -971,7 +971,7 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code, /* This code is always called on the current mm */ bool foreign = false; - if (!boot_cpu_has(X86_FEATURE_OSPKE)) + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return false; if (error_code & X86_PF_PK) return true; -- Gitee From 61c177ab8ad2c4b2c17ac7c399c83ca3a30e0c91 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:08 +0200 Subject: [PATCH 0148/2161] x86/pkru: Provide pkru_get_init_value() commit 739e2eec0f4849eb411567407d61491f923db405 upstream. When CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS is disabled then the following code fails to compile: if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { u32 pkru = READ_ONCE(init_pkru_value); .. } because init_pkru_value is defined as '0' which makes READ_ONCE() upset. Provide an accessor macro to avoid #ifdeffery all over the place. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121455.404880646@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/pkru.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h index 875ef03d228f..485d06e8be44 100644 --- a/arch/x86/include/asm/pkru.h +++ b/arch/x86/include/asm/pkru.h @@ -10,8 +10,10 @@ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS extern u32 init_pkru_value; +#define pkru_get_init_value() READ_ONCE(init_pkru_value) #else #define init_pkru_value 0 +#define pkru_get_init_value() 0 #endif static inline bool __pkru_allows_read(u32 pkru, u16 pkey) -- Gitee From eb6aed6ea01f6dac2de1daacad33c5a047704bd2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:09 +0200 Subject: [PATCH 0149/2161] x86/pkru: Provide pkru_write_default() commit ff7ebff47c595e747aa1bb10d8a30b2acb7d425b upstream. Provide a simple and trivial helper which just writes the PKRU default value without trying to fiddle with the task's xsave buffer. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121455.513729794@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/pkru.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h index 485d06e8be44..07d510ea5c13 100644 --- a/arch/x86/include/asm/pkru.h +++ b/arch/x86/include/asm/pkru.h @@ -61,4 +61,12 @@ static inline void write_pkru(u32 pkru) fpregs_unlock(); } +static inline void pkru_write_default(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return; + + wrpkru(pkru_get_init_value()); +} + #endif -- Gitee From 4e03e32be0e5c2a4c67abe0236a216e444b4ec35 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:10 +0200 Subject: [PATCH 0150/2161] x86/cpu: Write the default PKRU value when enabling PKE commit fa8c84b77a54bf3cf351c8b4b26a5aca27a14013 upstream. In preparation of making the PKRU management more independent from XSTATES, write the default PKRU value into the hardware right after enabling PKRU in CR4. This ensures that switch_to() and copy_thread() have the correct setting for init task and the per CPU idle threads right away. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121455.622983906@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/common.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 25f766ee8c68..4898078deede 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -479,6 +479,8 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c) } cr4_set_bits(X86_CR4_PKE); + /* Load the default PKRU value */ + pkru_write_default(); } #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -- Gitee From b6635bee10a9d66bca99b70c5abbea761ee6557f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:11 +0200 Subject: [PATCH 0151/2161] x86/fpu: Use pkru_write_default() in copy_init_fpstate_to_fpregs() commit 371071131cd1032c1e9172c51234a2a324841cab upstream. There is no point in using copy_init_pkru_to_fpregs() which in turn calls write_pkru(). 
write_pkru() tries to fiddle with the task's xstate buffer for nothing because the XRSTOR[S](init_fpstate) just cleared the xfeature flag in the xstate header which makes get_xsave_addr() fail. It's a useless exercise anyway because the reinitialization activates the FPU so before the task's xstate buffer can be used again a XRSTOR[S] must happen which in turn dumps the PKRU value. Get rid of the now unused copy_init_pkru_to_fpregs(). Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121455.732508792@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/pkeys.h | 1 - arch/x86/kernel/fpu/core.c | 3 +-- arch/x86/mm/pkeys.c | 17 ----------------- include/linux/pkeys.h | 4 ---- 4 files changed, 1 insertion(+), 24 deletions(-) diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 4128f647c755..5c7bcaa79623 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -124,7 +124,6 @@ extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); -extern void copy_init_pkru_to_fpregs(void); static inline int vma_pkey(struct vm_area_struct *vma) { diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 3866954354a4..fedadcb04ba2 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -311,8 +311,7 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) else frstor(&init_fpstate.fsave); - if (cpu_feature_enabled(X86_FEATURE_OSPKE)) - copy_init_pkru_to_fpregs(); + pkru_write_default(); } /* diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 5d0d571ab4ad..b5945b62d92a 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -10,7 +10,6 @@ #include /* boot_cpu_has, ... */ #include /* vma_pkey() */ -#include /* read/write_pkru() */ int __execute_only_pkey(struct mm_struct *mm) { @@ -125,22 +124,6 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) | PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15); -/* - * Called from the FPU code when creating a fresh set of FPU - * registers. This is called from a very specific context where - * we know the FPU regstiers are safe for use and we can use PKRU - * directly. - */ -void copy_init_pkru_to_fpregs(void) -{ - u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value); - /* - * Override the PKRU state that came from 'init_fpstate' - * with the baseline from the process. - */ - write_pkru(init_pkru_value_snapshot); -} - static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h index 2955ba976048..6beb26b7151d 100644 --- a/include/linux/pkeys.h +++ b/include/linux/pkeys.h @@ -44,10 +44,6 @@ static inline bool arch_pkeys_enabled(void) return false; } -static inline void copy_init_pkru_to_fpregs(void) -{ -} - #endif /* ! CONFIG_ARCH_HAS_PKEYS */ #endif /* _LINUX_PKEYS_H */ -- Gitee From 19893d3cc4a9ca5bdfc428dd2b5ed27b78e04ef7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:12 +0200 Subject: [PATCH 0152/2161] x86/fpu: Rename fpu__clear_all() to fpu_flush_thread() commit e7ecad17c84d0f6bef635c20d02bbe4096eea700 upstream. Make it clear what the function is about. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121455.827979263@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 3 ++- arch/x86/kernel/fpu/core.c | 4 ++-- arch/x86/kernel/process.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 9f31f1541eb1..f56827127c3d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -28,12 +28,13 @@ extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); extern void fpu__clear_user_states(struct fpu *fpu); -extern void fpu__clear_all(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern void fpu_sync_fpstate(struct fpu *fpu); +/* Clone and exit operations */ extern int fpu_clone(struct task_struct *dst); +extern void fpu_flush_thread(void); /* * Boot time FPU initialization functions: diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index fedadcb04ba2..4b69be9bea55 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -350,9 +350,9 @@ void fpu__clear_user_states(struct fpu *fpu) fpu__clear(fpu, true); } -void fpu__clear_all(struct fpu *fpu) +void fpu_flush_thread(void) { - fpu__clear(fpu, false); + fpu__clear(¤t->thread.fpu, false); } /* diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 24131a417e0b..6cf94d174540 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -240,7 +240,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - fpu__clear_all(&tsk->thread.fpu); + fpu_flush_thread(); } void disable_TSC(void) -- Gitee From f54c27de74ff70c39f00a36a90a0da66be462324 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 23 Jun 2021 14:02:13 +0200 Subject: [PATCH 0153/2161] x86/fpu: Clean up the fpu__clear() variants commit 33344368cb08f8d6bf55a32aa052318d3a69ea84 upstream. fpu__clear() currently resets both register state and kernel XSAVE buffer state. It has two modes: one for all state (supervisor and user) and another for user state only. fpu__clear_all() uses the "all state" (user_only=0) mode, while a number of signal paths use the user_only=1 mode. Make fpu__clear() work only for user state (user_only=1) and remove the "all state" (user_only=0) code. Rename it to match so it can be used by the signal paths. Replace the "all state" (user_only=0) fpu__clear() functionality. Use the TIF_NEED_FPU_LOAD functionality instead of making any actual hardware registers changes in this path. Instead of invoking fpu__initialize() just memcpy() init_fpstate into the task's FPU state because that has already the correct format and in case of PKRU also contains the default PKRU value. Move the actual PKRU write out into flush_thread() where it belongs and where it will end up anyway when PKRU and XSTATE have been untangled. For bisectability a workaround is required which stores the PKRU value in the xstate memory until PKRU is untangled from XSTATE for context switching and return to user. 
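A toy model of the deferred-reload scheme (ordinary C with invented names standing in for the fpstate image and TIF_NEED_FPU_LOAD) shows why resetting the memory copy plus a flag is sufficient:

/*
 * Flushing only touches the in-memory image and sets a "need reload"
 * flag; the expensive register restore happens lazily, once, on the
 * next return to user mode. Not kernel code, just the shape of it.
 */
#include <stdio.h>
#include <string.h>
#include <stdbool.h>

struct regs { unsigned int data[4]; };

static const struct regs init_state = { { 0, 0, 0, 0 } };
static struct regs hw_regs   = { { 7, 7, 7, 7 } };	/* live registers */
static struct regs mem_image = { { 7, 7, 7, 7 } };	/* task's saved copy */
static bool need_fpu_load;

static void flush_thread_fpu(void)
{
	/* Cheap: no register writes here, just fix up the memory image. */
	memcpy(&mem_image, &init_state, sizeof(mem_image));
	need_fpu_load = true;
}

static void return_to_user(void)
{
	/* One restore right before user code runs, if anything changed. */
	if (need_fpu_load) {
		memcpy(&hw_regs, &mem_image, sizeof(hw_regs));
		need_fpu_load = false;
	}
}

int main(void)
{
	flush_thread_fpu();
	return_to_user();
	printf("hw after reload: %u\n", hw_regs.data[0]);	/* prints 0 */
	return 0;
}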
[ Dave Hansen: Polished changelog ] [ tglx: Fixed the PKRU fallout ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121455.922729522@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/core.c | 113 +++++++++++++++++++++++++------------ arch/x86/kernel/process.c | 10 ++++ 2 files changed, 86 insertions(+), 37 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 4b69be9bea55..aa7e808b9d1e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -259,19 +259,6 @@ int fpu_clone(struct task_struct *dst) return 0; } -/* - * Activate the current task's in-memory FPU context, - * if it has not been used before: - */ -static void fpu__initialize(struct fpu *fpu) -{ - WARN_ON_FPU(fpu != ¤t->thread.fpu); - - set_thread_flag(TIF_NEED_FPU_LOAD); - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); -} - /* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents @@ -314,47 +301,99 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) pkru_write_default(); } +static inline unsigned int init_fpstate_copy_size(void) +{ + if (!use_xsave()) + return fpu_kernel_xstate_size; + + /* XSAVE(S) just needs the legacy and the xstate header part */ + return sizeof(init_fpstate.xsave); +} + +/* Temporary workaround. Will be removed once PKRU and XSTATE are untangled. */ +static inline void pkru_set_default_in_xstate(struct xregs_state *xsave) +{ + struct pkru_state *pk; + + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return; + /* + * Force XFEATURE_PKRU to be set in the header otherwise + * get_xsave_addr() does not work and it also needs to be set to + * make XRSTOR(S) load it. + */ + xsave->header.xfeatures |= XFEATURE_MASK_PKRU; + pk = get_xsave_addr(xsave, XFEATURE_PKRU); + pk->pkru = pkru_get_init_value(); +} + /* - * Clear the FPU state back to init state. - * - * Called by sys_execve(), by the signal handler code and by various - * error paths. + * Reset current->fpu memory state to the init values. + */ +static void fpu_reset_fpstate(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + fpregs_lock(); + fpu__drop(fpu); + /* + * This does not change the actual hardware registers. It just + * resets the memory image and sets TIF_NEED_FPU_LOAD so a + * subsequent return to usermode will reload the registers from the + * task's memory image. + * + * Do not use fpstate_init() here. Just copy init_fpstate which has + * the correct content already except for PKRU. + */ + memcpy(&fpu->state, &init_fpstate, init_fpstate_copy_size()); + pkru_set_default_in_xstate(&fpu->state.xsave); + set_thread_flag(TIF_NEED_FPU_LOAD); + fpregs_unlock(); +} + +/* + * Reset current's user FPU states to the init states. current's + * supervisor states, if any, are not modified by this function. The + * caller guarantees that the XSTATE header in memory is intact. 
*/ -static void fpu__clear(struct fpu *fpu, bool user_only) +void fpu__clear_user_states(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); - if (!static_cpu_has(X86_FEATURE_FPU)) { - fpu__drop(fpu); - fpu__initialize(fpu); + fpregs_lock(); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { + fpu_reset_fpstate(); + fpregs_unlock(); return; } - fpregs_lock(); - - if (user_only) { - if (!fpregs_state_valid(fpu, smp_processor_id()) && - xfeatures_mask_supervisor()) - os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); - restore_fpregs_from_init_fpstate(xfeatures_mask_user()); - } else { - restore_fpregs_from_init_fpstate(xfeatures_mask_all); + /* + * Ensure that current's supervisor states are loaded into their + * corresponding registers. + */ + if (xfeatures_mask_supervisor() && + !fpregs_state_valid(fpu, smp_processor_id())) { + os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); } + /* Reset user states in registers. */ + restore_fpregs_from_init_fpstate(xfeatures_mask_user()); + + /* + * Now all FPU registers have their desired values. Inform the FPU + * state machine that current's FPU registers are in the hardware + * registers. The memory image does not need to be updated because + * any operation relying on it has to save the registers first when + * current's FPU is marked active. + */ fpregs_mark_activate(); fpregs_unlock(); } -void fpu__clear_user_states(struct fpu *fpu) -{ - fpu__clear(fpu, true); -} - void fpu_flush_thread(void) { - fpu__clear(¤t->thread.fpu, false); + fpu_reset_fpstate(); } - /* * Load FPU context before returning to userspace. */ diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 6cf94d174540..90d8ef86e2c3 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -131,6 +131,15 @@ void exit_thread(struct task_struct *tsk) fpu__drop(fpu); } +static void pkru_flush_thread(void) +{ + /* + * If PKRU is enabled the default PKRU value has to be loaded into + * the hardware right here (similar to context switch). + */ + pkru_write_default(); +} + static int set_new_tls(struct task_struct *p, unsigned long tls) { struct user_desc __user *utls = (struct user_desc __user *)tls; @@ -241,6 +250,7 @@ void flush_thread(void) memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); fpu_flush_thread(); + pkru_flush_thread(); } void disable_TSC(void) -- Gitee From 2950c89afec6919cf92d9078405d0e5a2704e4bc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:14 +0200 Subject: [PATCH 0154/2161] x86/fpu: Rename __fpregs_load_activate() to fpregs_restore_userregs() commit 727d01100e15b18c67f05fb697779ad2a6c99b63 upstream. Rename it so that it becomes entirely clear what this function is about. It's purpose is to restore the FPU registers to the state which was saved in the task's FPU memory state either at context switch or by an in kernel FPU user. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.018867925@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 6 ++---- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/signal.c | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index f56827127c3d..c12d3fb3136a 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -469,10 +469,8 @@ static inline void fpregs_activate(struct fpu *fpu) trace_x86_fpu_regs_activated(fpu); } -/* - * Internal helper, do not use directly. Use switch_fpu_return() instead. - */ -static inline void __fpregs_load_activate(void) +/* Internal helper for switch_fpu_return() and signal frame setup */ +static inline void fpregs_restore_userregs(void) { struct fpu *fpu = ¤t->thread.fpu; int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index aa7e808b9d1e..6babf1876389 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -402,7 +402,7 @@ void switch_fpu_return(void) if (!static_cpu_has(X86_FEATURE_FPU)) return; - __fpregs_load_activate(); + fpregs_restore_userregs(); } EXPORT_SYMBOL_GPL(switch_fpu_return); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index f754996d9046..8a4382487404 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -188,7 +188,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) - __fpregs_load_activate(); + fpregs_restore_userregs(); pagefault_disable(); ret = copy_fpregs_to_sigframe(buf_fx); -- Gitee From b8aab2bfa78319c5bbca4c705d5bcb677ccc7dda Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:15 +0200 Subject: [PATCH 0155/2161] x86/fpu: Move FXSAVE_LEAK quirk info __copy_kernel_to_fpregs() commit 1d9bffab116fadfe1594f5fea2b50ab280d81d30 upstream. copy_kernel_to_fpregs() restores all xfeatures but it is also the place where the AMD FXSAVE_LEAK bug is handled. That prevents fpregs_restore_userregs() to limit the restored features, which is required to untangle PKRU and XSTATE handling and also for the upcoming supervisor state management. Move the FXSAVE_LEAK quirk into __copy_kernel_to_fpregs() and deinline that function which has become rather fat. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.114271278@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 25 +------------------------ arch/x86/kernel/fpu/core.c | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index c12d3fb3136a..6ff631b59ecd 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -383,33 +383,10 @@ static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) return err; } -static inline void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) -{ - if (use_xsave()) { - os_xrstor(&fpstate->xsave, mask); - } else { - if (use_fxsr()) - fxrstor(&fpstate->fxsave); - else - frstor(&fpstate->fsave); - } -} +extern void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate) { - /* - * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is - * pending. Clear the x87 state here by setting it to fixed values. - * "m" is a random variable that should be in L1. - */ - if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { - asm volatile( - "fnclex\n\t" - "emms\n\t" - "fildl %P[addr]" /* set F?P to defined value */ - : : [addr] "m" (fpstate)); - } - __restore_fpregs_from_fpstate(fpstate, -1); } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 6babf1876389..afd0deee8cfd 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -124,6 +124,33 @@ void save_fpregs_to_fpstate(struct fpu *fpu) } EXPORT_SYMBOL(save_fpregs_to_fpstate); +void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) +{ + /* + * AMD K7/K8 and later CPUs up to Zen don't save/restore + * FDP/FIP/FOP unless an exception is pending. Clear the x87 state + * here by setting it to fixed values. "m" is a random variable + * that should be in L1. + */ + if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { + asm volatile( + "fnclex\n\t" + "emms\n\t" + "fildl %P[addr]" /* set F?P to defined value */ + : : [addr] "m" (fpstate)); + } + + if (use_xsave()) { + os_xrstor(&fpstate->xsave, mask); + } else { + if (use_fxsr()) + fxrstor(&fpstate->fxsave); + else + frstor(&fpstate->fsave); + } +} +EXPORT_SYMBOL_GPL(__restore_fpregs_from_fpstate); + void kernel_fpu_begin_mask(unsigned int kfpu_mask) { preempt_disable(); -- Gitee From 7fd3bdef1fd33e0811a7df6e0dc173ba024932ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:16 +0200 Subject: [PATCH 0156/2161] x86/fpu: Rename xfeatures_mask_user() to xfeatures_mask_uabi() commit 65e952102122bf89f0e4f1bebf8664e32587aaed upstream. Rename it so it's clear that this is about user ABI features which can differ from the feature set which the kernel saves and restores because the kernel handles e.g. PKRU differently. But the user ABI (ptrace, signal frame) expects it to be there. 
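A rough sketch of the distinction (illustrative bit values except XFEATURE_PKRU, which really is bit 9; the helpers only mimic the kernel ones): the UABI mask keeps PKRU so ptrace and signal frames still carry it, while the set the kernel restores later in this series leaves it out:

#include <stdio.h>
#include <stdint.h>

#define XFEAT_FP    (1ULL << 0)
#define XFEAT_SSE   (1ULL << 1)
#define XFEAT_PKRU  (1ULL << 9)

static const uint64_t xfeatures_enabled = XFEAT_FP | XFEAT_SSE | XFEAT_PKRU;

static uint64_t xfeatures_mask_uabi(void)
{
	return xfeatures_enabled;			/* exposed via ptrace/signals */
}

static uint64_t xfeatures_mask_restore_user(void)
{
	return xfeatures_enabled & ~XFEAT_PKRU;		/* XRSTOR path skips PKRU */
}

int main(void)
{
	printf("uabi:    %#llx\n", (unsigned long long)xfeatures_mask_uabi());
	printf("restore: %#llx\n", (unsigned long long)xfeatures_mask_restore_user());
	return 0;
}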
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.211585137@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 7 ++++++- arch/x86/include/asm/fpu/xstate.h | 6 +++++- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/signal.c | 10 +++++----- arch/x86/kernel/fpu/xstate.c | 18 +++++++++--------- 5 files changed, 26 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 6ff631b59ecd..6d3ab778b2a5 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -328,7 +328,12 @@ static inline void os_xrstor(struct xregs_state *xstate, u64 mask) */ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) { - u64 mask = xfeatures_mask_user(); + /* + * Include the features which are not xsaved/rstored by the kernel + * internally, e.g. PKRU. That's user space ABI and also required + * to allow the signal handler to modify PKRU. + */ + u64 mask = xfeatures_mask_uabi(); u32 lmask = mask; u32 hmask = mask >> 32; int err; diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 879f2c19cd6b..ac917dce531e 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -83,7 +83,11 @@ static inline u64 xfeatures_mask_supervisor(void) return xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_SUPPORTED; } -static inline u64 xfeatures_mask_user(void) +/* + * The xfeatures which are enabled in XCR0 and expected to be in ptrace + * buffers and signal frames. + */ +static inline u64 xfeatures_mask_uabi(void) { return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED; } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index afd0deee8cfd..12437383ff79 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -404,7 +404,7 @@ void fpu__clear_user_states(struct fpu *fpu) } /* Reset user states in registers. */ - restore_fpregs_from_init_fpstate(xfeatures_mask_user()); + restore_fpregs_from_init_fpstate(xfeatures_mask_uabi()); /* * Now all FPU registers have their desired values. 
Inform the FPU diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 8a4382487404..82e605df0529 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -257,14 +257,14 @@ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) if (use_xsave()) { if (fx_only) { - init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; + init_bv = xfeatures_mask_uabi() & ~XFEATURE_MASK_FPSSE; r = fxrstor_from_user_sigframe(buf); if (!r) os_xrstor(&init_fpstate.xsave, init_bv); return r; } else { - init_bv = xfeatures_mask_user() & ~xbv; + init_bv = xfeatures_mask_uabi() & ~xbv; r = xrstor_from_user_sigframe(buf, xbv); if (!r && unlikely(init_bv)) @@ -420,7 +420,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpregs_unlock(); if (use_xsave() && !fx_only) { - u64 init_bv = xfeatures_mask_user() & ~user_xfeatures; + u64 init_bv = xfeatures_mask_uabi() & ~user_xfeatures; ret = copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx); if (ret) @@ -454,7 +454,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (use_xsave()) { u64 init_bv; - init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; + init_bv = xfeatures_mask_uabi() & ~XFEATURE_MASK_FPSSE; os_xrstor(&init_fpstate.xsave, init_bv); } @@ -549,7 +549,7 @@ void fpu__init_prepare_fx_sw_frame(void) fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; - fx_sw_reserved.xfeatures = xfeatures_mask_user(); + fx_sw_reserved.xfeatures = xfeatures_mask_uabi(); fx_sw_reserved.xstate_size = fpu_user_xstate_size; if (IS_ENABLED(CONFIG_IA32_EMULATION) || diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 64193bf65038..88a529ca270f 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -144,7 +144,7 @@ void fpu__init_cpu_xstate(void) * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user * states can be set here. */ - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi()); /* * MSR_IA32_XSS sets supervisor states managed by XSAVES. @@ -450,7 +450,7 @@ int xfeature_size(int xfeature_nr) static int validate_user_xstate_header(const struct xstate_header *hdr) { /* No unknown or supervisor features may be set */ - if (hdr->xfeatures & ~xfeatures_mask_user()) + if (hdr->xfeatures & ~xfeatures_mask_uabi()) return -EINVAL; /* Userspace must use the uncompacted format */ @@ -753,7 +753,7 @@ void __init fpu__init_system_xstate(void) cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); xfeatures_mask_all |= ecx + ((u64)edx << 32); - if ((xfeatures_mask_user() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + if ((xfeatures_mask_uabi() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. 
Disable XSAVE and try to continue @@ -788,7 +788,7 @@ void __init fpu__init_system_xstate(void) * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ - update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_user()); + update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_uabi()); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); @@ -825,14 +825,14 @@ void fpu__resume_cpu(void) /* * Restore XCR0 on xsave capable CPUs: */ - if (boot_cpu_has(X86_FEATURE_XSAVE)) - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi()); /* * Restore IA32_XSS. The same CPUID bit enumerates support * of XSAVES and MSR_IA32_XSS. */ - if (boot_cpu_has(X86_FEATURE_XSAVES)) { + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } @@ -1020,7 +1020,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, break; case XSTATE_COPY_XSAVE: - header.xfeatures &= xfeatures_mask_user(); + header.xfeatures &= xfeatures_mask_uabi(); break; } @@ -1065,7 +1065,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, * compacted init_fpstate. The gap tracking will zero this * later. */ - if (!(xfeatures_mask_user() & BIT_ULL(i))) + if (!(xfeatures_mask_uabi() & BIT_ULL(i))) continue; /* -- Gitee From 82a318d98a42539dc802642033de6baf8c701b95 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:17 +0200 Subject: [PATCH 0157/2161] x86/fpu: Dont restore PKRU in fpregs_restore_userspace() commit 2ebe81c6d800576e1213f9d7cf0068017ae610c1 upstream. switch_to() and flush_thread() write the task's PKRU value eagerly so the PKRU value of current is always valid in the hardware. That means there is no point in restoring PKRU on exit to user or when reactivating the task's FPU registers in the signal frame setup path. This allows to remove all the xstate buffer updates with PKRU values once the PKRU state is stored in thread struct while a task is scheduled out. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.303919033@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 16 +++++++++++++++- arch/x86/include/asm/fpu/xstate.h | 19 +++++++++++++++++++ arch/x86/kernel/fpu/core.c | 2 +- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 6d3ab778b2a5..ae0d4bef9088 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -461,7 +461,21 @@ static inline void fpregs_restore_userregs(void) return; if (!fpregs_state_valid(fpu, cpu)) { - restore_fpregs_from_fpstate(&fpu->state); + u64 mask; + + /* + * This restores _all_ xstate which has not been + * established yet. + * + * If PKRU is enabled, then the PKRU value is already + * correct because it was either set in switch_to() or in + * flush_thread(). So it is excluded because it might be + * not up to date in current->thread.fpu.xsave state. 
+ */ + mask = xfeatures_mask_restore_user() | + xfeatures_mask_supervisor(); + __restore_fpregs_from_fpstate(&fpu->state, mask); + fpregs_activate(fpu); fpu->last_cpu = cpu; } diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index ac917dce531e..934badad75b2 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -35,6 +35,14 @@ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR) +/* + * Features which are restored when returning to user space. + * PKRU is not restored on return to user space because PKRU + * is switched eagerly in switch_to() and flush_thread() + */ +#define XFEATURE_MASK_USER_RESTORE \ + (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU) + /* All currently supported supervisor features */ #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID) @@ -92,6 +100,17 @@ static inline u64 xfeatures_mask_uabi(void) return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED; } +/* + * The xfeatures which are restored by the kernel when returning to user + * mode. This is not necessarily the same as xfeatures_mask_uabi() as the + * kernel does not manage all XCR0 enabled features via xsave/xrstor as + * some of them have to be switched eagerly on context switch and exec(). + */ +static inline u64 xfeatures_mask_restore_user(void) +{ + return xfeatures_mask_all & XFEATURE_MASK_USER_RESTORE; +} + static inline u64 xfeatures_mask_independent(void) { if (!boot_cpu_has(X86_FEATURE_ARCH_LBR)) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 12437383ff79..470576ced907 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -404,7 +404,7 @@ void fpu__clear_user_states(struct fpu *fpu) } /* Reset user states in registers. */ - restore_fpregs_from_init_fpstate(xfeatures_mask_uabi()); + restore_fpregs_from_init_fpstate(xfeatures_mask_restore_user()); /* * Now all FPU registers have their desired values. Inform the FPU -- Gitee From 2d66fa9b13e19790658733b74d22187da2e38b79 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 1 Nov 2021 20:26:51 +0800 Subject: [PATCH 0158/2161] x86/fpu: Add PKRU storage outside of task XSAVE buffer commit 9782a712eb971ce483442076e79eb1d8d608646e upstream. PKRU is currently partly XSAVE-managed and partly not. It has space in the task XSAVE buffer and is context-switched by XSAVE/XRSTOR. However, it is switched more eagerly than FPU because there may be a need for PKRU to be up-to-date for things like copy_to/from_user() since PKRU affects user-permission memory accesses, not just accesses from userspace itself. This leaves PKRU in a very odd position. XSAVE brings very little value to the table for how Linux uses PKRU except for signal related XSTATE handling. Prepare to move PKRU away from being XSAVE-managed. Allocate space in the thread_struct for it and save/restore it in the context-switch path separately from the XSAVE-managed features. task->thread_struct.pkru is only valid when the task is scheduled out. For the current task the authoritative source is the hardware, i.e. it has to be retrieved via rdpkru(). Leave the XSAVE code in place for now to ensure bisectability. 
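The "hardware is authoritative for current" rule is visible even from plain user space; the following sketch (an illustration only, assuming a PKU-capable CPU and glibc 2.27+ for pkey_alloc(); the .byte sequence encodes RDPKRU) reads the live register right after the kernel has updated it:

/* cc -O2 pkru_read.c && ./a.out  -- fails gracefully without PKU support. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

static inline unsigned int rdpkru(void)
{
	unsigned int eax, edx, ecx = 0;

	/* RDPKRU: ECX must be 0, returns the current PKRU value in EAX. */
	asm volatile(".byte 0x0f,0x01,0xee" : "=a"(eax), "=d"(edx) : "c"(ecx));
	return eax;
}

int main(void)
{
	int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);

	if (pkey < 0) {
		perror("pkey_alloc");	/* no hardware or kernel pkeys support */
		return 1;
	}
	/* For the running task, PKRU lives in the register, not in memory. */
	printf("PKRU after pkey_alloc(): %#x (allocated pkey %d)\n",
	       rdpkru(), pkey);
	pkey_free(pkey);
	return 0;
}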
Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.399107624@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/alpha/kernel/process.c | 8 ++++++++ arch/x86/include/asm/processor.h | 9 +++++++++ arch/x86/kernel/process_64.c | 25 +++++++++++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 48b81d015d8a..8b1524b7af77 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -253,6 +253,7 @@ copy_thread(unsigned long clone_flags, unsigned long usp, if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ + p->thread.pkru = pkru_get_init_value(); memset(childstack, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs)); childstack->r26 = (unsigned long) ret_from_kernel_thread; @@ -262,6 +263,13 @@ copy_thread(unsigned long clone_flags, unsigned long usp, childti->pcb.usp = 0; return 0; } + + /* + * Clone current's PKRU value from hardware. tsk->thread.pkru + * is only valid when scheduled out. + */ + p->thread.pkru = read_pkru(); + /* Note: if CLONE_SETTLS is not set, then we must inherit the value from the parent, which will have been set by the block copy in dup_task_struct. This is non-intuitive, but is diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6574bf1c9dc8..798079909a73 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -488,6 +488,15 @@ struct thread_struct { unsigned int sig_on_uaccess_err:1; unsigned int uaccess_err:1; /* uaccess failed */ + /* + * Protection Keys Register for Userspace. Loaded immediately on + * context switch. Store it in thread_struct to avoid a lookup in + * the tasks's FPU xstate buffer. This value is only valid when a + * task is scheduled out. For 'current' the authoritative source of + * PKRU is the hardware itself. + */ + u32 pkru; + /* Floating point and extended processor state */ struct fpu fpu; /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index befd6174467c..8855d69bab35 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -341,6 +341,29 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, } } +/* + * Store prev's PKRU value and load next's PKRU value if they differ. PKRU + * is not XSTATE managed on context switch because that would require a + * lookup in the task's FPU xsave buffer and require to keep that updated + * in various places. + */ +static __always_inline void x86_pkru_load(struct thread_struct *prev, + struct thread_struct *next) +{ + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return; + + /* Stash the prev task's value: */ + prev->pkru = rdpkru(); + + /* + * PKRU writes are slightly expensive. Avoid them when not + * strictly necessary: + */ + if (prev->pkru != next->pkru) + wrpkru(next->pkru); +} + static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, struct thread_struct *next) { @@ -590,6 +613,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) x86_fsgsbase_load(prev, next); + x86_pkru_load(prev, next); + /* * Switch the PDA and FPU contexts. */ -- Gitee From 62d9549d328a15a0f61b837ab3feb46bb69b6892 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 23 Jun 2021 14:02:19 +0200 Subject: [PATCH 0159/2161] x86/fpu: Hook up PKRU into ptrace() commit e84ba47e313dbc097bf859bb6e4f9219883d5f78 upstream. 
One nice thing about having PKRU be XSAVE-managed is that it gets naturally exposed into the XSAVE-using ABIs. Now that XSAVE will not be used to manage PKRU, these ABIs need to be manually enabled to deal with PKRU. ptrace() uses copy_uabi_xstate_to_kernel() to collect the tracee's XSTATE. As PKRU is not in the task's XSTATE buffer, use task->thread.pkru for filling in up the ptrace buffer. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.508770763@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 2 +- arch/x86/kernel/fpu/regset.c | 10 ++++------ arch/x86/kernel/fpu/xstate.c | 25 ++++++++++++++++++------- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 934badad75b2..b283bb8422a4 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -140,7 +140,7 @@ enum xstate_copy_mode { }; struct membuf; -void copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, +void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode mode); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index b4be376da5a1..d71fd3adfee0 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -78,7 +78,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, sizeof(fpu->state.fxsave)); } - copy_xstate_to_uabi_buf(to, &fpu->state.xsave, XSTATE_COPY_FX); + copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_FX); return 0; } @@ -126,14 +126,12 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, int xstateregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct fpu *fpu = &target->thread.fpu; - if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) return -ENODEV; - sync_fpstate(fpu); + sync_fpstate(&target->thread.fpu); - copy_xstate_to_uabi_buf(to, &fpu->state.xsave, XSTATE_COPY_XSAVE); + copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_XSAVE); return 0; } @@ -336,7 +334,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf mb = { .p = &fxsave, .left = sizeof(fxsave) }; /* Handle init state optimized xstate correctly */ - copy_xstate_to_uabi_buf(mb, &fpu->state.xsave, XSTATE_COPY_FP); + copy_xstate_to_uabi_buf(mb, target, XSTATE_COPY_FP); fx = &fxsave; } else { fx = &fpu->state.fxsave; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 88a529ca270f..0daea92e6847 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -989,7 +989,7 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, /** * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor - * @xsave: The kernel xstate buffer to copy from + * @tsk: The task from which to copy the saved xstate * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming @@ -998,10 +998,11 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, * * It supports partial copy but @to.pos always starts from zero. 
*/ -void copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, +void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); + struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; struct xregs_state *xinit = &init_fpstate.xsave; struct xstate_header header; unsigned int zerofrom; @@ -1075,11 +1076,21 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, if (zerofrom < xstate_offsets[i]) membuf_zero(&to, xstate_offsets[i] - zerofrom); - copy_feature(header.xfeatures & BIT_ULL(i), &to, - __raw_xsave_addr(xsave, i), - __raw_xsave_addr(xinit, i), - xstate_sizes[i]); - + if (i == XFEATURE_PKRU) { + struct pkru_state pkru = {0}; + /* + * PKRU is not necessarily up to date in the + * thread's XSAVE buffer. Fill this part from the + * per-thread storage. + */ + pkru.pkru = tsk->thread.pkru; + membuf_write(&to, &pkru, sizeof(pkru)); + } else { + copy_feature(header.xfeatures & BIT_ULL(i), &to, + __raw_xsave_addr(xsave, i), + __raw_xsave_addr(xinit, i), + xstate_sizes[i]); + } /* * Keep track of the last copied state in the non-compacted * target buffer for gap zeroing. -- Gitee From 50b555ab3e45ac83b26ae20a804c8d8cfac3c98b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:20 +0200 Subject: [PATCH 0160/2161] x86/fpu: Mask PKRU from kernel XRSTOR[S] operations commit opencloudos. Port from 30a304a138738d71a09c730ca8044e9662de0dbf As the PKRU state is managed separately restoring it from the xstate buffer would be counterproductive as it might either restore a stale value or reinit the PKRU state to 0. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.606745195@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 4 ++-- arch/x86/include/asm/fpu/xstate.h | 10 ++++++++++ arch/x86/kernel/fpu/xstate.c | 1 + arch/x86/mm/extable.c | 2 +- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index ae0d4bef9088..bd20ffca7f25 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -263,7 +263,7 @@ static inline void fxsave(struct fxregs_state *fx) */ static inline void os_xrstor_booting(struct xregs_state *xstate) { - u64 mask = -1; + u64 mask = xfeatures_mask_fpstate(); u32 lmask = mask; u32 hmask = mask >> 32; int err; @@ -392,7 +392,7 @@ extern void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate) { - __restore_fpregs_from_fpstate(fpstate, -1); + __restore_fpregs_from_fpstate(fpstate, xfeatures_mask_fpstate()); } extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index b283bb8422a4..2df5bd667a43 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -111,6 +111,16 @@ static inline u64 xfeatures_mask_restore_user(void) return xfeatures_mask_all & XFEATURE_MASK_USER_RESTORE; } +/* + * Like xfeatures_mask_restore_user() but additionally restors the + * supported supervisor states. 
+ */ +static inline u64 xfeatures_mask_fpstate(void) +{ + return xfeatures_mask_all & \ + (XFEATURE_MASK_USER_RESTORE | XFEATURE_MASK_SUPERVISOR_SUPPORTED); +} + static inline u64 xfeatures_mask_independent(void) { if (!boot_cpu_has(X86_FEATURE_ARCH_LBR)) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0daea92e6847..b106ddf6262f 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -60,6 +60,7 @@ static short xsave_cpuid_features[] __initdata = { * XSAVE buffer, both supervisor and user xstates. */ u64 xfeatures_mask_all __ro_after_init; +EXPORT_SYMBOL_GPL(xfeatures_mask_all); static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index de0b5158563d..e91d45fc78fa 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -113,7 +113,7 @@ __visible bool ex_handler_fprestore(const struct exception_table_entry *fixup, WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", (void *)instruction_pointer(regs)); - __restore_fpregs_from_fpstate(&init_fpstate, -1); + __restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); return true; } EXPORT_SYMBOL_GPL(ex_handler_fprestore); -- Gitee From 72522544e16c36ff2052e8bf5b1c49fa688647ec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:21 +0200 Subject: [PATCH 0161/2161] x86/fpu: Remove PKRU handling from switch_fpu_finish() commit 954436989cc550dd91aab98363240c9c0a4b7e23 upstream. PKRU is already updated and the xstate is not longer the proper source of information. [ bp: Use cpu_feature_enabled() ] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.708180184@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 35 ++++------------------------- 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index bd20ffca7f25..b73c5cec7ff5 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -529,40 +529,13 @@ static inline void switch_fpu_prepare(struct task_struct *prev, int cpu) */ /* - * Load PKRU from the FPU context if available. Delay loading of the - * complete FPU state until the return to userland. + * Delay loading of the complete FPU state until the return to userland. + * PKRU is handled separately. */ static inline void switch_fpu_finish(struct task_struct *next) { - u32 pkru_val = init_pkru_value; - struct pkru_state *pk; - struct fpu *next_fpu = &next->thread.fpu; - - if (!static_cpu_has(X86_FEATURE_FPU)) - return; - - set_thread_flag(TIF_NEED_FPU_LOAD); - - if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) - return; - - /* - * PKRU state is switched eagerly because it needs to be valid before we - * return to userland e.g. for a copy_to_user() operation. - */ - if (!(next->flags & PF_KTHREAD)) { - /* - * If the PKRU bit in xsave.header.xfeatures is not set, - * then the PKRU component was in init state, which means - * XRSTOR will set PKRU to 0. If the bit is not set then - * get_xsave_addr() will return NULL because the PKRU value - * in memory is not valid. This means pkru_val has to be - * set to 0 and not to init_pkru_value. - */ - pk = get_xsave_addr(&next_fpu->state.xsave, XFEATURE_PKRU); - pkru_val = pk ? 
pk->pkru : 0; - } - __write_pkru(pkru_val); + if (cpu_feature_enabled(X86_FEATURE_FPU)) + set_thread_flag(TIF_NEED_FPU_LOAD); } /* -- Gitee From 66ae589e00af8a613241495beecac225d89cc3b3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:22 +0200 Subject: [PATCH 0162/2161] x86/fpu: Don't store PKRU in xstate in fpu_reset_fpstate() commit 0e8c54f6b2c8b1037cef9276e451522ee90ed969 upstream. PKRU for a task is stored in task->thread.pkru when the task is scheduled out. For 'current' the authoritative source of PKRU is the hardware. fpu_reset_fpstate() has two callers: 1) fpu__clear_user_states() for !FPU systems. For those PKRU is irrelevant 2) fpu_flush_thread() which is invoked from flush_thread(). flush_thread() resets the hardware to the kernel restrictive default value. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.802850233@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/core.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 470576ced907..5295cbafc92e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -337,23 +337,6 @@ static inline unsigned int init_fpstate_copy_size(void) return sizeof(init_fpstate.xsave); } -/* Temporary workaround. Will be removed once PKRU and XSTATE are untangled. */ -static inline void pkru_set_default_in_xstate(struct xregs_state *xsave) -{ - struct pkru_state *pk; - - if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) - return; - /* - * Force XFEATURE_PKRU to be set in the header otherwise - * get_xsave_addr() does not work and it also needs to be set to - * make XRSTOR(S) load it. - */ - xsave->header.xfeatures |= XFEATURE_MASK_PKRU; - pk = get_xsave_addr(xsave, XFEATURE_PKRU); - pk->pkru = pkru_get_init_value(); -} - /* * Reset current->fpu memory state to the init values. */ @@ -371,9 +354,12 @@ static void fpu_reset_fpstate(void) * * Do not use fpstate_init() here. Just copy init_fpstate which has * the correct content already except for PKRU. + * + * PKRU handling does not rely on the xstate when restoring for + * user space as PKRU is eagerly written in switch_to() and + * flush_thread(). */ memcpy(&fpu->state, &init_fpstate, init_fpstate_copy_size()); - pkru_set_default_in_xstate(&fpu->state.xsave); set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); } -- Gitee From 65e46afac205522e6e55d0477f089d67cbbc804c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:23 +0200 Subject: [PATCH 0163/2161] x86/pkru: Remove xstate fiddling from write_pkru() commit 72a6c08c44e4460e39315ca828f60b8d5afd6b19 upstream. The PKRU value of a task is stored in task->thread.pkru when the task is scheduled out. PKRU is restored on schedule in from there. So keeping the XSAVE buffer up to date is a pointless exercise. Remove the xstate fiddling and cleanup all related functions. 
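For illustration, the PKRU lifecycle that this and the surrounding patches converge on can be sketched roughly as below. The helper name is invented for the example; the real code lives in the x86 context-switch and pkru helpers touched by this series.

        /*
         * Illustrative sketch only, sketch_switch_pkru() is not a real
         * kernel function: PKRU is saved into task->thread.pkru when a
         * task is scheduled out and written back eagerly when it is
         * scheduled in, so the XSAVE buffer is never consulted for PKRU.
         */
        static void sketch_switch_pkru(struct task_struct *prev,
                                       struct task_struct *next)
        {
                prev->thread.pkru = rdpkru();   /* save the outgoing value */
                write_pkru(next->thread.pkru);  /* WRPKRU only if it changes */
        }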
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.897372712@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/pkru.h | 17 ++++------------- arch/x86/include/asm/special_insns.h | 14 +------------- 2 files changed, 5 insertions(+), 26 deletions(-) diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h index 07d510ea5c13..4bb2a34ef498 100644 --- a/arch/x86/include/asm/pkru.h +++ b/arch/x86/include/asm/pkru.h @@ -42,23 +42,14 @@ static inline u32 read_pkru(void) static inline void write_pkru(u32 pkru) { - struct pkru_state *pk; - if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return; - - pk = get_xsave_addr(&current->thread.fpu.state.xsave, XFEATURE_PKRU); - /* - * The PKRU value in xstate needs to be in sync with the value that is - * written to the CPU. The FPU restore on return to userland would - * otherwise load the previous value again. + * WRPKRU is relatively expensive compared to RDPKRU. + * Avoid WRPKRU when it would not change the value. */ - fpregs_lock(); - if (pk) - pk->pkru = pkru; - __write_pkru(pkru); - fpregs_unlock(); + if (pkru != rdpkru()) + wrpkru(pkru); } static inline void pkru_write_default(void) diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 2e0cdc64cb50..0ae84a5898a0 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -103,25 +103,13 @@ static inline void wrpkru(u32 pkru) : : "a" (pkru), "c"(ecx), "d"(edx)); } -static inline void __write_pkru(u32 pkru) -{ - /* - * WRPKRU is relatively expensive compared to RDPKRU. - * Avoid WRPKRU when it would not change the value. - */ - if (pkru == rdpkru()) - return; - - wrpkru(pkru); -} - #else static inline u32 rdpkru(void) { return 0; } -static inline void __write_pkru(u32 pkru) +static inline void wrpkru(u32 pkru) { } #endif -- Gitee From f185755ca71e6bf86670b1b64256224630efc9c8 Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Wed, 14 Sep 2022 15:29:50 +0800 Subject: [PATCH 0164/2161] Change __write_pkru to write_pkru commit opencloudos. Since __write_pkru() was removed by commit af6fa1a1327f36828cb64f2bc4e2ff7529b5f6f4 ("x86/pkru: Remove xstate fiddling from write_pkru()"), change the remaining __write_pkru() callers in x86.c to write_pkru(). Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ceea868d0981..68d58541307b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -848,7 +848,7 @@ void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) && vcpu->arch.pkru != vcpu->arch.host_pkru) - __write_pkru(vcpu->arch.pkru); + write_pkru(vcpu->arch.pkru); } EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0); @@ -859,7 +859,7 @@ void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) { vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vcpu->arch.host_pkru) - __write_pkru(vcpu->arch.host_pkru); + write_pkru(vcpu->arch.host_pkru); } if (vcpu->guest_xcr0_loaded) { -- Gitee From 82f698a36f37b4d49f62638b05514ddd44121a48 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:24 +0200 Subject: [PATCH 0165/2161] x86/fpu: Mark init_fpstate __ro_after_init commit bf68a7d98922e1665019b8bf0c4791500837c857 upstream. Port from bf68a7d98922e1665019b8bf0c4791500837c857. Nothing has to write into that state after init.
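As a quick illustration of what the annotation buys (the symbol below is hypothetical and not part of this patch): __ro_after_init data is writable while the kernel boots, but its section is mapped read-only once init completes, so any later write faults.

        /* Hypothetical example, not from this patch. */
        static unsigned int example_tuning __ro_after_init = 16;

        static int __init example_setup(void)
        {
                example_tuning = 32;    /* fine: still during init */
                return 0;
        }
        early_initcall(example_setup);

After boot has finished, a write to example_tuning would trap, which is exactly the protection wanted for init_fpstate here.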
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.992342060@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 5295cbafc92e..7ada7bd03a32 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -23,7 +23,7 @@ * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: */ -union fpregs_state init_fpstate __read_mostly; +union fpregs_state init_fpstate __ro_after_init; /* * Track whether the kernel is using the FPU state -- Gitee From 4c2437ff7c3a845461877fae22f56c3cb005b819 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:25 +0200 Subject: [PATCH 0166/2161] x86/fpu/signal: Move initial checks into fpu__restore_sig() commit 99a5901951b70251965b0d1542d4a8c616842a99 upstream. __fpu__restore_sig() is convoluted and some of the basic checks can trivially be done in the calling function as well as the final error handling of clearing user state. [ bp: Fixup typos. ] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121457.086336154@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 76 +++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 82e605df0529..9ed7cc244139 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -277,11 +277,11 @@ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) return frstor_from_user_sigframe(buf); } -static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) +static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, + bool ia32_fxstate) { struct user_i387_ia32_struct *envp = NULL; int state_size = fpu_kernel_xstate_size; - int ia32_fxstate = (buf != buf_fx); struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; @@ -289,26 +289,6 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) int fx_only = 0; int ret = 0; - ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || - IS_ENABLED(CONFIG_IA32_EMULATION)); - - if (!buf) { - fpu__clear_user_states(fpu); - return 0; - } - - if (!access_ok(buf, size)) { - ret = -EACCES; - goto out; - } - - if (!static_cpu_has(X86_FEATURE_FPU)) { - ret = fpregs_soft_set(current, NULL, 0, - sizeof(struct user_i387_ia32_struct), - NULL, buf); - goto out; - } - if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { @@ -391,7 +371,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ ret = __copy_from_user(&env, buf, sizeof(env)); if (ret) - goto out; + return ret; envp = &env; } @@ -424,7 +404,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) ret = copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx); if (ret) - goto out; + return ret; sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, fx_only); @@ -442,10 +422,8 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } else if (use_fxsr()) { ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); - if (ret) { - ret = -EFAULT; 
- goto out; - } + if (ret) + return -EFAULT; sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, fx_only); @@ -462,7 +440,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } else { ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size); if (ret) - goto out; + return ret; fpregs_lock(); ret = frstor_safe(&fpu->state.fsave); @@ -472,10 +450,6 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) else fpregs_deactivate(fpu); fpregs_unlock(); - -out: - if (ret) - fpu__clear_user_states(fpu); return ret; } @@ -490,15 +464,47 @@ static inline int xstate_sigframe_size(void) */ int fpu__restore_sig(void __user *buf, int ia32_frame) { + unsigned int size = xstate_sigframe_size(); + struct fpu *fpu = ¤t->thread.fpu; void __user *buf_fx = buf; - int size = xstate_sigframe_size(); + bool ia32_fxstate = false; + int ret; + + if (unlikely(!buf)) { + fpu__clear_user_states(fpu); + return 0; + } + ia32_frame &= (IS_ENABLED(CONFIG_X86_32) || + IS_ENABLED(CONFIG_IA32_EMULATION)); + + /* + * Only FXSR enabled systems need the FX state quirk. + * FRSTOR does not need it and can use the fast path. + */ if (ia32_frame && use_fxsr()) { buf_fx = buf + sizeof(struct fregs_state); size += sizeof(struct fregs_state); + ia32_fxstate = true; } - return __fpu__restore_sig(buf, buf_fx, size); + if (!access_ok(buf, size)) { + ret = -EACCES; + goto out; + } + + if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) { + ret = fpregs_soft_set(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), + NULL, buf); + } else { + ret = __fpu_restore_sig(buf, buf_fx, ia32_fxstate); + } + +out: + if (unlikely(ret)) + fpu__clear_user_states(fpu); + return ret; } unsigned long -- Gitee From c7bc15bd3ed67f216d056d388e0d78d85f3820be Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:26 +0200 Subject: [PATCH 0167/2161] x86/fpu/signal: Remove the legacy alignment check commit 9ba589f9cdbd8906465b108bc7ec0fc1519a06d3 upstream. Checking for the XSTATE buffer being 64-byte aligned, and if not, deciding just to restore the FXSR state is daft. If user space provides an unaligned math frame and has the extended state magic set in the FX software reserved bytes, then it really can keep the pieces. If the frame is unaligned and the FX software magic is not set, then fx_only is already set and the restore will use fxrstor. Remove it. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121457.184149902@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 9ed7cc244139..243aef653516 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -305,9 +305,6 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, user_xfeatures = fx_sw_user.xfeatures; } } - - if ((unsigned long)buf_fx % 64) - fx_only = 1; if (!ia32_fxstate) { /* -- Gitee From 3b39c834c47c75a4e5192c4b5108c4cd20c9e2cf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:27 +0200 Subject: [PATCH 0168/2161] x86/fpu/signal: Sanitize the xstate check on sigframe commit 1258a8c896044564514c1b53795ba3033b1e9fd6 upstream. 
Utilize the check for the extended state magic in the FX software reserved bytes and set the parameters for restoring fx_only in the relevant members of fw_sw_user. This allows further cleanups on top because the data is consistent. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121457.277738268@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 70 +++++++++++++++++------------------- 1 file changed, 33 insertions(+), 37 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 243aef653516..ec345180a35a 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -15,29 +15,30 @@ #include #include -static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; +static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init; +static struct _fpx_sw_bytes fx_sw_reserved_ia32 __ro_after_init; /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. */ -static inline int check_for_xstate(struct fxregs_state __user *buf, - void __user *fpstate, - struct _fpx_sw_bytes *fx_sw) +static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, + struct _fpx_sw_bytes *fx_sw) { int min_xstate_size = sizeof(struct fxregs_state) + sizeof(struct xstate_header); + void __user *fpstate = fxbuf; unsigned int magic2; - if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) - return -1; + if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw))) + return -EFAULT; /* Check for the first magic field and other error scenarios. */ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || fx_sw->xstate_size < min_xstate_size || fx_sw->xstate_size > fpu_user_xstate_size || fx_sw->xstate_size > fx_sw->extended_size) - return -1; + goto setfx; /* * Check for the presence of second magic word at the end of memory @@ -45,10 +46,18 @@ static inline int check_for_xstate(struct fxregs_state __user *buf, * fpstate layout with out copying the extended state information * in the memory layout. */ - if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) - || magic2 != FP_XSTATE_MAGIC2) - return -1; + if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))) + return -EFAULT; + if (likely(magic2 == FP_XSTATE_MAGIC2)) + return 0; +setfx: + trace_x86_fpu_xstate_check_failed(¤t->thread.fpu); + + /* Set the parameters for fx only state */ + fx_sw->magic1 = 0; + fx_sw->xstate_size = sizeof(struct fxregs_state); + fx_sw->xfeatures = XFEATURE_MASK_FPSSE; return 0; } @@ -213,21 +222,15 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) static inline void sanitize_restored_user_xstate(union fpregs_state *state, - struct user_i387_ia32_struct *ia32_env, - u64 user_xfeatures, int fx_only) + struct user_i387_ia32_struct *ia32_env, u64 mask) { struct xregs_state *xsave = &state->xsave; struct xstate_header *header = &xsave->header; if (use_xsave()) { /* - * Clear all feature bits which are not set in - * user_xfeatures and clear all extended features - * for fx_only mode. - */ - u64 mask = fx_only ? XFEATURE_MASK_FPSSE : user_xfeatures; - - /* + * Clear all feature bits which are not set in mask. + * * Supervisor state has to be preserved. The sigframe * restore can only modify user features, i.e. @mask * cannot contain them. 
@@ -286,24 +289,19 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; u64 user_xfeatures = 0; - int fx_only = 0; + bool fx_only = false; int ret = 0; if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; - if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { - /* - * Couldn't find the extended state information in the - * memory layout. Restore just the FP/SSE and init all - * the other extended state. - */ - state_size = sizeof(struct fxregs_state); - fx_only = 1; - trace_x86_fpu_xstate_check_failed(fpu); - } else { - state_size = fx_sw_user.xstate_size; - user_xfeatures = fx_sw_user.xfeatures; - } + + ret = check_xstate_in_sigframe(buf_fx, &fx_sw_user); + if (unlikely(ret)) + return ret; + + fx_only = !fx_sw_user.magic1; + state_size = fx_sw_user.xstate_size; + user_xfeatures = fx_sw_user.xfeatures; } if (!ia32_fxstate) { @@ -403,8 +401,7 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, if (ret) return ret; - sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, - fx_only); + sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures); fpregs_lock(); if (unlikely(init_bv)) @@ -422,8 +419,7 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, if (ret) return -EFAULT; - sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, - fx_only); + sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures); fpregs_lock(); if (use_xsave()) { -- Gitee From a0a1dcb7329c724da250fef0bcc67f0ed48a9832 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:28 +0200 Subject: [PATCH 0169/2161] x86/fpu/signal: Sanitize copy_user_to_fpregs_zeroing() commit 1258a8c896044564514c1b53795ba3033b1e9fd6 upstream. Now that user_xfeatures is correctly set when xsave is enabled, remove the duplicated initialization of components. Rename the function while at it. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121457.377341297@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index ec345180a35a..1c33e71fbd4d 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -251,33 +251,27 @@ sanitize_restored_user_xstate(union fpregs_state *state, } /* - * Restore the extended state if present. Otherwise, restore the FP/SSE state. + * Restore the FPU state directly from the userspace signal frame. 
*/ -static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) +static int restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only) { - u64 init_bv; - int r; - if (use_xsave()) { - if (fx_only) { - init_bv = xfeatures_mask_uabi() & ~XFEATURE_MASK_FPSSE; + u64 init_bv = xfeatures_mask_uabi() & ~xrestore; + int ret; - r = fxrstor_from_user_sigframe(buf); - if (!r) - os_xrstor(&init_fpstate.xsave, init_bv); - return r; - } else { - init_bv = xfeatures_mask_uabi() & ~xbv; - - r = xrstor_from_user_sigframe(buf, xbv); - if (!r && unlikely(init_bv)) - os_xrstor(&init_fpstate.xsave, init_bv); - return r; - } + if (likely(!fx_only)) + ret = xrstor_from_user_sigframe(buf, xrestore); + else + ret = fxrstor_from_user_sigframe(buf); + + if (!ret && unlikely(init_bv)) + os_xrstor(&init_fpstate.xsave, init_bv); + return ret; } else if (use_fxsr()) { return fxrstor_from_user_sigframe(buf); - } else + } else { return frstor_from_user_sigframe(buf); + } } static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, @@ -314,7 +308,7 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, */ fpregs_lock(); pagefault_disable(); - ret = copy_user_to_fpregs_zeroing(buf_fx, user_xfeatures, fx_only); + ret = restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only); pagefault_enable(); if (!ret) { -- Gitee From 861b5fae087f05ed38734bdaba7e5819455487d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:29 +0200 Subject: [PATCH 0170/2161] x86/fpu/signal: Split out the direct restore code commit 0a6c2e9ec91c96bde1e8ce063180ac6e05e680f7 upstream. Prepare for smarter failure handling of the direct restore. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121457.493455414@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 112 ++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 54 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 1c33e71fbd4d..2e8302dc52b5 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -250,10 +250,8 @@ sanitize_restored_user_xstate(union fpregs_state *state, } } -/* - * Restore the FPU state directly from the userspace signal frame. - */ -static int restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only) +static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, + bool fx_only) { if (use_xsave()) { u64 init_bv = xfeatures_mask_uabi() & ~xrestore; @@ -274,6 +272,57 @@ static int restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only } } +static int restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only) +{ + struct fpu *fpu = ¤t->thread.fpu; + int ret; + + fpregs_lock(); + pagefault_disable(); + ret = __restore_fpregs_from_user(buf, xrestore, fx_only); + pagefault_enable(); + + if (unlikely(ret)) { + /* + * The above did an FPU restore operation, restricted to + * the user portion of the registers, and failed, but the + * microcode might have modified the FPU registers + * nevertheless. + * + * If the FPU registers do not belong to current, then + * invalidate the FPU register state otherwise the task + * might preempt current and return to user space with + * corrupted FPU registers. + * + * In case current owns the FPU registers then no further + * action is required. The fixup in the slow path will + * handle it correctly. 
+ */ + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + __cpu_invalidate_fpregs_state(); + fpregs_unlock(); + return ret; + } + + /* + * Restore supervisor states: previous context switch etc has done + * XSAVES and saved the supervisor states in the kernel buffer from + * which they can be restored now. + * + * It would be optimal to handle this with a single XRSTORS, but + * this does not work because the rest of the FPU registers have + * been restored from a user buffer directly. The single XRSTORS + * happens below, when the user buffer has been copied to the + * kernel one. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor()) + os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); + + fpregs_mark_activate(); + fpregs_unlock(); + return 0; +} + static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, bool ia32_fxstate) { @@ -298,61 +347,16 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, user_xfeatures = fx_sw_user.xfeatures; } - if (!ia32_fxstate) { + if (likely(!ia32_fxstate)) { /* * Attempt to restore the FPU registers directly from user - * memory. For that to succeed, the user access cannot cause - * page faults. If it does, fall back to the slow path below, - * going through the kernel buffer with the enabled pagefault - * handler. + * memory. For that to succeed, the user access cannot cause page + * faults. If it does, fall back to the slow path below, going + * through the kernel buffer with the enabled pagefault handler. */ - fpregs_lock(); - pagefault_disable(); ret = restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only); - pagefault_enable(); - if (!ret) { - - /* - * Restore supervisor states: previous context switch - * etc has done XSAVES and saved the supervisor states - * in the kernel buffer from which they can be restored - * now. - * - * We cannot do a single XRSTORS here - which would - * be nice - because the rest of the FPU registers are - * being restored from a user buffer directly. The - * single XRSTORS happens below, when the user buffer - * has been copied to the kernel one. - */ - if (test_thread_flag(TIF_NEED_FPU_LOAD) && - xfeatures_mask_supervisor()) { - os_xrstor(&fpu->state.xsave, - xfeatures_mask_supervisor()); - } - fpregs_mark_activate(); - fpregs_unlock(); + if (likely(!ret)) return 0; - } - - /* - * The above did an FPU restore operation, restricted to - * the user portion of the registers, and failed, but the - * microcode might have modified the FPU registers - * nevertheless. - * - * If the FPU registers do not belong to current, then - * invalidate the FPU register state otherwise the task might - * preempt current and return to user space with corrupted - * FPU registers. - * - * In case current owns the FPU registers then no further - * action is required. The fixup below will handle it - * correctly. - */ - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - __cpu_invalidate_fpregs_state(); - - fpregs_unlock(); } else { /* * For 32-bit frames with fxstate, copy the fxstate so it can -- Gitee From 041986ef3ec1155144a9f6b8cf2dbc45718287b9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:31 +0200 Subject: [PATCH 0171/2161] x86/fpu/signal: Handle #PF in the direct restore path commit fcb3635f5018e53024c6be3c3213737f469f74ff upstream. If *RSTOR raises an exception, then the slow path is taken. That's wrong because if the reason was not #PF then going through the slow path is waste of time because that will end up with the same conclusion that the data is invalid. 
Now that the wrapper around *RSTOR return an negative error code, which is the negated trap number, it's possible to differentiate. If the *RSTOR raised #PF then handle it directly in the fast path and if it was some other exception, e.g. #GP, then give up and do not try the fast path. This removes the legacy frame FRSTOR code from the slow path because FRSTOR is not a ia32_fxstate frame and is therefore handled in the fast path. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121457.696022863@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 67 ++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 2e8302dc52b5..a0f144880d87 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -272,11 +272,17 @@ static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, } } -static int restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only) +/* + * Attempt to restore the FPU registers directly from user memory. + * Pagefaults are handled and any errors returned are fatal. + */ +static int restore_fpregs_from_user(void __user *buf, u64 xrestore, + bool fx_only, unsigned int size) { struct fpu *fpu = ¤t->thread.fpu; int ret; +retry: fpregs_lock(); pagefault_disable(); ret = __restore_fpregs_from_user(buf, xrestore, fx_only); @@ -293,14 +299,18 @@ static int restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only * invalidate the FPU register state otherwise the task * might preempt current and return to user space with * corrupted FPU registers. - * - * In case current owns the FPU registers then no further - * action is required. The fixup in the slow path will - * handle it correctly. */ if (test_thread_flag(TIF_NEED_FPU_LOAD)) __cpu_invalidate_fpregs_state(); fpregs_unlock(); + + /* Try to handle #PF, but anything else is fatal. */ + if (ret != -EFAULT) + return -EINVAL; + + ret = fault_in_pages_readable(buf, size); + if (!ret) + goto retry; return ret; } @@ -311,9 +321,7 @@ static int restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only * * It would be optimal to handle this with a single XRSTORS, but * this does not work because the rest of the FPU registers have - * been restored from a user buffer directly. The single XRSTORS - * happens below, when the user buffer has been copied to the - * kernel one. + * been restored from a user buffer directly. */ if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor()) os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); @@ -326,14 +334,13 @@ static int restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, bool ia32_fxstate) { - struct user_i387_ia32_struct *envp = NULL; int state_size = fpu_kernel_xstate_size; struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; u64 user_xfeatures = 0; bool fx_only = false; - int ret = 0; + int ret; if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; @@ -354,20 +361,19 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, * faults. If it does, fall back to the slow path below, going * through the kernel buffer with the enabled pagefault handler. 
*/ - ret = restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only); - if (likely(!ret)) - return 0; - } else { - /* - * For 32-bit frames with fxstate, copy the fxstate so it can - * be reconstructed later. - */ - ret = __copy_from_user(&env, buf, sizeof(env)); - if (ret) - return ret; - envp = &env; + return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, + state_size); } + /* + * Copy the legacy state because the FP portion of the FX frame has + * to be ignored for histerical raisins. The legacy state is folded + * in once the larger state has been copied. + */ + ret = __copy_from_user(&env, buf, sizeof(env)); + if (ret) + return ret; + /* * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is * not modified on context switch and that the xstate is considered @@ -382,8 +388,7 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, * supervisor state is preserved. Save the full state for * simplicity. There is no point in optimizing this by only * saving the supervisor states and then shuffle them to - * the right place in memory. This is the slow path and the - * above XRSTOR failed or ia32_fxstate is true. Shrug. + * the right place in memory. It's ia32 mode. Shrug. */ if (xfeatures_mask_supervisor()) os_xsave(&fpu->state.xsave); @@ -399,7 +404,7 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, if (ret) return ret; - sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures); + sanitize_restored_user_xstate(&fpu->state, &env, user_xfeatures); fpregs_lock(); if (unlikely(init_bv)) @@ -412,12 +417,12 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, ret = os_xrstor_safe(&fpu->state.xsave, user_xfeatures | xfeatures_mask_supervisor()); - } else if (use_fxsr()) { + } else { ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); if (ret) return -EFAULT; - sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures); + sanitize_restored_user_xstate(&fpu->state, &env, user_xfeatures); fpregs_lock(); if (use_xsave()) { @@ -428,14 +433,8 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, } ret = fxrstor_safe(&fpu->state.fxsave); - } else { - ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size); - if (ret) - return ret; - - fpregs_lock(); - ret = frstor_safe(&fpu->state.fsave); } + if (!ret) fpregs_mark_activate(); else -- Gitee From 8a8ae1c932b592c227782c759a0066b571397599 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:32 +0200 Subject: [PATCH 0172/2161] x86/fpu/signal: Let xrstor handle the features to init commit 6f9866a166cd1ad3ebb2dcdb3874aa8fee8dea2f upstream. There is no reason to do an extra XRSTOR from init_fpstate for feature bits which have been cleared by user space in the FX magic xfeatures storage. Just clear them in the task's XSTATE header and do a full restore which will put these cleared features into init state. There is no real difference in performance because the current code already does a full restore when the xfeatures bits are preserved as the signal frame setup has stored them, which is the full UABI feature set. [ bp: Use the negated mxcsr_feature_mask in the MXCSR check. 
] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121457.804115017@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 89 +++++++++++++----------------------- 1 file changed, 31 insertions(+), 58 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index a0f144880d87..ce26ce1a9f3e 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -220,36 +220,6 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) return 0; } -static inline void -sanitize_restored_user_xstate(union fpregs_state *state, - struct user_i387_ia32_struct *ia32_env, u64 mask) -{ - struct xregs_state *xsave = &state->xsave; - struct xstate_header *header = &xsave->header; - - if (use_xsave()) { - /* - * Clear all feature bits which are not set in mask. - * - * Supervisor state has to be preserved. The sigframe - * restore can only modify user features, i.e. @mask - * cannot contain them. - */ - header->xfeatures &= mask | xfeatures_mask_supervisor(); - } - - if (use_fxsr()) { - /* - * mscsr reserved bits must be masked to zero for security - * reasons. - */ - xsave->i387.mxcsr &= mxcsr_feature_mask; - - if (ia32_env) - convert_to_fxsr(&state->fxsave, ia32_env); - } -} - static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only) { @@ -352,6 +322,8 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, fx_only = !fx_sw_user.magic1; state_size = fx_sw_user.xstate_size; user_xfeatures = fx_sw_user.xfeatures; + } else { + user_xfeatures = XFEATURE_MASK_FPSSE; } if (likely(!ia32_fxstate)) { @@ -395,54 +367,55 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, set_thread_flag(TIF_NEED_FPU_LOAD); } __fpu_invalidate_fpregs_state(fpu); + __cpu_invalidate_fpregs_state(); fpregs_unlock(); if (use_xsave() && !fx_only) { - u64 init_bv = xfeatures_mask_uabi() & ~user_xfeatures; - ret = copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx); if (ret) return ret; + } else { + if (__copy_from_user(&fpu->state.fxsave, buf_fx, + sizeof(fpu->state.fxsave))) + return -EFAULT; - sanitize_restored_user_xstate(&fpu->state, &env, user_xfeatures); + /* Reject invalid MXCSR values. */ + if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask) + return -EINVAL; - fpregs_lock(); - if (unlikely(init_bv)) - os_xrstor(&init_fpstate.xsave, init_bv); + /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */ + if (use_xsave()) + fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + } + + /* Fold the legacy FP storage */ + convert_to_fxsr(&fpu->state.fxsave, &env); + fpregs_lock(); + if (use_xsave()) { /* - * Restore previously saved supervisor xstates along with - * copied-in user xstates. + * Remove all UABI feature bits not set in user_xfeatures + * from the memory xstate header which makes the full + * restore below bring them into init state. This works for + * fx_only mode as well because that has only FP and SSE + * set in user_xfeatures. + * + * Preserve supervisor states! 
*/ - ret = os_xrstor_safe(&fpu->state.xsave, - user_xfeatures | xfeatures_mask_supervisor()); + u64 mask = user_xfeatures | xfeatures_mask_supervisor(); + fpu->state.xsave.header.xfeatures &= mask; + ret = os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all); } else { - ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); - if (ret) - return -EFAULT; - - sanitize_restored_user_xstate(&fpu->state, &env, user_xfeatures); - - fpregs_lock(); - if (use_xsave()) { - u64 init_bv; - - init_bv = xfeatures_mask_uabi() & ~XFEATURE_MASK_FPSSE; - os_xrstor(&init_fpstate.xsave, init_bv); - } - ret = fxrstor_safe(&fpu->state.fxsave); } - if (!ret) + if (likely(!ret)) fpregs_mark_activate(); - else - fpregs_deactivate(fpu); + fpregs_unlock(); return ret; } - static inline int xstate_sigframe_size(void) { return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE : -- Gitee From 4ecf5745a20a4d4e49acbf9dfcd93f25e3c863bb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 24 Jun 2021 17:09:18 +0200 Subject: [PATCH 0173/2161] x86/fpu/xstate: Clear xstate header in copy_xstate_to_uabi_buf() again commit 93c2cdc975aab53c222472c5b96c2d41dbeb350c upstream. The change which made copy_xstate_to_uabi_buf() usable for [x]fpregs_get() removed the zeroing of the header which means the header, which is copied to user space later, contains except for the xfeatures member, random stack content. Add the memset() back to zero it before usage. Fixes: eb6f51723f03 ("x86/fpu: Make copy_xstate_to_kernel() usable for [x]fpregs_get()") Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/875yy3wb8h.ffs@nanos.tec.linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b106ddf6262f..33006fbb0351 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1009,6 +1009,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, unsigned int zerofrom; int i; + memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; /* Mask out the feature bits depending on copy mode */ -- Gitee From 3782131146b279b467c5ac17a277182be2ef295e Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 10 Nov 2021 15:55:52 +0800 Subject: [PATCH 0174/2161] iommu: Add gfp parameter to iommu_ops::map commit 781ca2de89bae1b1d2c96df9ef33e9a324415995 upstream. Add a gfp_t parameter to the iommu_ops::map function. Remove the needless locking in the AMD iommu driver. The iommu_ops::map function (or the iommu_map function which calls it) was always supposed to be sleepable (according to Joerg's comment in this thread: https://lore.kernel.org/patchwork/patch/977520/ ) and so should probably have had a "might_sleep()" since it was written. However currently the dma-iommu api can call iommu_map in an atomic context, which it shouldn't do. This doesn't cause any problems because any iommu driver which uses the dma-iommu api uses gfp_atomic in it's iommu_ops::map function. But doing this wastes the memory allocators atomic pools. 
Signed-off-by: Tom Murphy Reviewed-by: Robin Murphy Reviewed-by: Christoph Hellwig Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 3 ++- drivers/iommu/intel-iommu.c | 2 +- drivers/iommu/iommu.c | 43 +++++++++++++++++++++++++++++++------ include/linux/iommu.h | 21 +++++++++++++++++- 4 files changed, 60 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index c392930253a3..5e269f3dca3d 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3098,7 +3098,8 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, } static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, - phys_addr_t paddr, size_t page_size, int iommu_prot) + phys_addr_t paddr, size_t page_size, int iommu_prot, + gfp_t gfp) { struct protection_domain *domain = to_pdomain(dom); int prot = 0; diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 953d86ca6d2b..e17c3ca8a0d9 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5452,7 +5452,7 @@ static void intel_iommu_aux_detach_device(struct iommu_domain *domain, static int intel_iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t hpa, - size_t size, int iommu_prot) + size_t size, int iommu_prot, gfp_t gfp) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); u64 max_addr; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 3d7448e7cec9..7e6d0286146c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1868,8 +1868,8 @@ static size_t iommu_pgsize(struct iommu_domain *domain, return pgsize; } -int iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot) +int __iommu_map(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { const struct iommu_ops *ops = domain->ops; unsigned long orig_iova = iova; @@ -1906,8 +1906,8 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova, pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx\n", iova, &paddr, pgsize); + ret = ops->map(domain, iova, paddr, pgsize, prot, gfp); - ret = ops->map(domain, iova, paddr, pgsize, prot); if (ret) break; @@ -1927,8 +1927,22 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova, return ret; } + +int iommu_map(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot) +{ + might_sleep(); + return __iommu_map(domain, iova, paddr, size, prot, GFP_KERNEL); +} EXPORT_SYMBOL_GPL(iommu_map); +int iommu_map_atomic(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot) +{ + return __iommu_map(domain, iova, paddr, size, prot, GFP_ATOMIC); +} +EXPORT_SYMBOL_GPL(iommu_map_atomic); + static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather) @@ -2005,8 +2019,9 @@ size_t iommu_unmap_fast(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_unmap_fast); -size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, - struct scatterlist *sg, unsigned int nents, int prot) +size_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova, + struct scatterlist *sg, unsigned int nents, int prot, + gfp_t gfp) { size_t len = 0, mapped = 0; phys_addr_t start; @@ -2017,7 +2032,9 @@ size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, phys_addr_t s_phys = sg_phys(sg); if (len && s_phys != start + len) { - 
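From a caller's point of view the split looks roughly like the sketch below; example_map_buffer() is made up for illustration. Sleepable paths keep using iommu_map(), which may now sleep and lets the driver allocate page tables with GFP_KERNEL, while atomic paths have to use the new _atomic variant.

        /* Illustrative only, example_map_buffer() is not part of this patch. */
        static int example_map_buffer(struct iommu_domain *domain,
                                      unsigned long iova, phys_addr_t paddr,
                                      size_t size, bool in_atomic)
        {
                int prot = IOMMU_READ | IOMMU_WRITE;

                if (in_atomic)
                        return iommu_map_atomic(domain, iova, paddr, size, prot);

                return iommu_map(domain, iova, paddr, size, prot); /* may sleep */
        }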
ret = iommu_map(domain, iova + mapped, start, len, prot); + ret = __iommu_map(domain, iova + mapped, start, + len, prot, gfp); + if (ret) goto out_err; @@ -2045,8 +2062,22 @@ size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, return 0; } + +size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, + struct scatterlist *sg, unsigned int nents, int prot) +{ + might_sleep(); + return __iommu_map_sg(domain, iova, sg, nents, prot, GFP_KERNEL); +} EXPORT_SYMBOL_GPL(iommu_map_sg); +size_t iommu_map_sg_atomic(struct iommu_domain *domain, unsigned long iova, + struct scatterlist *sg, unsigned int nents, int prot) +{ + return __iommu_map_sg(domain, iova, sg, nents, prot, GFP_ATOMIC); +} +EXPORT_SYMBOL_GPL(iommu_map_sg_atomic); + int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t paddr, u64 size, int prot) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 87ea30c5b39d..7e54dfa53087 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -256,7 +256,7 @@ struct iommu_ops { int (*attach_dev)(struct iommu_domain *domain, struct device *dev); void (*detach_dev)(struct iommu_domain *domain, struct device *dev); int (*map)(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot); + phys_addr_t paddr, size_t size, int prot, gfp_t gfp); size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); void (*flush_iotlb_all)(struct iommu_domain *domain); @@ -427,6 +427,8 @@ extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot); +extern int iommu_map_atomic(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot); extern size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size); extern size_t iommu_unmap_fast(struct iommu_domain *domain, @@ -434,6 +436,9 @@ extern size_t iommu_unmap_fast(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather); extern size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg,unsigned int nents, int prot); +extern size_t iommu_map_sg_atomic(struct iommu_domain *domain, + unsigned long iova, struct scatterlist *sg, + unsigned int nents, int prot); extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova); extern void iommu_set_fault_handler(struct iommu_domain *domain, iommu_fault_handler_t handler, void *token); @@ -668,6 +673,13 @@ static inline int iommu_map(struct iommu_domain *domain, unsigned long iova, return -ENODEV; } +static inline int iommu_map_atomic(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t size, int prot) +{ + return -ENODEV; +} + static inline size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) { @@ -688,6 +700,13 @@ static inline size_t iommu_map_sg(struct iommu_domain *domain, return 0; } +static inline size_t iommu_map_sg_atomic(struct iommu_domain *domain, + unsigned long iova, struct scatterlist *sg, + unsigned int nents, int prot) +{ + return 0; +} + static inline void iommu_flush_tlb_all(struct iommu_domain *domain) { } -- Gitee From 7181de135523e218fc7a4c05d6e72ca6d15cc521 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 21 Sep 2019 15:06:44 +0800 Subject: [PATCH 0175/2161] iommu/vt-d: 
Refactor find_domain() helper commit 1ee0186b9a128a872887e16e2d1520ea37a95dc4 upstream. Current find_domain() helper checks and does the deferred domain attachment and return the domain in use. This isn't always the use case for the callers. Some callers only want to retrieve the current domain in use. This refactors find_domain() into two helpers: 1) find_domain() only returns the domain in use; 2) deferred_attach_domain() does the deferred domain attachment if required and return the domain in use. Cc: Ashok Raj Cc: Jacob Pan Cc: Kevin Tian Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index e17c3ca8a0d9..03c69a96027a 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2427,14 +2427,24 @@ static void domain_remove_dev_info(struct dmar_domain *domain) spin_unlock_irqrestore(&device_domain_lock, flags); } -/* - * find_domain - * Note: we use struct device->archdata.iommu stores the info - */ static struct dmar_domain *find_domain(struct device *dev) { struct device_domain_info *info; + if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO || + dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)) + return NULL; + + /* No lock here, assumes no domain exit in normal case */ + info = dev->archdata.iommu; + if (likely(info)) + return info->domain; + + return NULL; +} + +static struct dmar_domain *deferred_attach_domain(struct device *dev) +{ if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) { struct iommu_domain *domain; @@ -2444,12 +2454,7 @@ static struct dmar_domain *find_domain(struct device *dev) intel_iommu_attach_device(domain, dev); } - /* No lock here, assumes no domain exit in normal case */ - info = dev->archdata.iommu; - - if (likely(info)) - return info->domain; - return NULL; + return find_domain(dev); } static inline struct device_domain_info * @@ -3515,7 +3520,7 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, BUG_ON(dir == DMA_NONE); - domain = find_domain(dev); + domain = deferred_attach_domain(dev); if (!domain) return DMA_MAPPING_ERROR; @@ -3735,7 +3740,7 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele if (!iommu_need_mapping(dev)) return dma_direct_map_sg(dev, sglist, nelems, dir, attrs); - domain = find_domain(dev); + domain = deferred_attach_domain(dev); if (!domain) return 0; @@ -3830,7 +3835,7 @@ bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size, int prot = 0; int ret; - domain = find_domain(dev); + domain = deferred_attach_domain(dev); if (WARN_ON(dir == DMA_NONE || !domain)) return DMA_MAPPING_ERROR; -- Gitee From 553e9f29d0849510d8d9688a607046dd8d592663 Mon Sep 17 00:00:00 2001 From: Yi L Liu Date: Wed, 2 Oct 2019 12:42:40 -0700 Subject: [PATCH 0176/2161] iommu: Introduce cache_invalidate API commit 4c7c171f85b261f91270d405b7c7390aa6ddfb60 upstream. In any virtualization use case, when the first translation stage is "owned" by the guest OS, the host IOMMU driver has no knowledge of caching structure updates unless the guest invalidation activities are trapped by the virtualizer and passed down to the host. Since the invalidation data can be obtained from user space and will be written into physical IOMMU, we must allow security check at various layers. 
Therefore, generic invalidation data format are proposed here, model specific IOMMU drivers need to convert them into their own format. Signed-off-by: Yi L Liu Signed-off-by: Jacob Pan Signed-off-by: Ashok Raj Signed-off-by: Eric Auger Signed-off-by: Jean-Philippe Brucker Reviewed-by: Jean-Philippe Brucker Reviewed-by: Eric Auger Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 10 ++++ include/linux/iommu.h | 14 +++++ include/uapi/linux/iommu.h | 110 +++++++++++++++++++++++++++++++++++++ 3 files changed, 134 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 7e6d0286146c..a6721f3c3bed 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1679,6 +1679,16 @@ int iommu_attach_device(struct iommu_domain *domain, struct device *dev) } EXPORT_SYMBOL_GPL(iommu_attach_device); +int iommu_cache_invalidate(struct iommu_domain *domain, struct device *dev, + struct iommu_cache_invalidate_info *inv_info) +{ + if (unlikely(!domain->ops->cache_invalidate)) + return -ENODEV; + + return domain->ops->cache_invalidate(domain, dev, inv_info); +} +EXPORT_SYMBOL_GPL(iommu_cache_invalidate); + static void __iommu_detach_device(struct iommu_domain *domain, struct device *dev) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7e54dfa53087..0b09fad57a77 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -244,6 +244,7 @@ struct iommu_iotlb_gather { * @sva_unbind: Unbind process address space from device * @sva_get_pasid: Get PASID associated to a SVA handle * @page_response: handle page request response + * @cache_invalidate: invalidate translation caches * @pgsize_bitmap: bitmap of all possible supported page sizes */ struct iommu_ops { @@ -306,6 +307,8 @@ struct iommu_ops { int (*page_response)(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg); + int (*cache_invalidate)(struct iommu_domain *domain, struct device *dev, + struct iommu_cache_invalidate_info *inv_info); unsigned long pgsize_bitmap; @@ -423,6 +426,9 @@ extern int iommu_attach_device(struct iommu_domain *domain, struct device *dev); extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); +extern int iommu_cache_invalidate(struct iommu_domain *domain, + struct device *dev, + struct iommu_cache_invalidate_info *inv_info); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, @@ -1030,6 +1036,14 @@ static inline int iommu_sva_get_pasid(struct iommu_sva *handle) return IOMMU_PASID_INVALID; } +static inline int +iommu_cache_invalidate(struct iommu_domain *domain, + struct device *dev, + struct iommu_cache_invalidate_info *inv_info) +{ + return -ENODEV; +} + #endif /* CONFIG_IOMMU_API */ #ifdef CONFIG_IOMMU_DEBUGFS diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index fc00c5d4741b..f3e96214df8e 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -152,4 +152,114 @@ struct iommu_page_response { __u32 code; }; +/* defines the granularity of the invalidation */ +enum iommu_inv_granularity { + IOMMU_INV_GRANU_DOMAIN, /* domain-selective invalidation */ + IOMMU_INV_GRANU_PASID, /* PASID-selective invalidation */ + IOMMU_INV_GRANU_ADDR, /* page-selective invalidation */ + IOMMU_INV_GRANU_NR, /* number of invalidation granularities */ +}; + +/** + * struct iommu_inv_addr_info - 
Address Selective Invalidation Structure + * + * @flags: indicates the granularity of the address-selective invalidation + * - If the PASID bit is set, the @pasid field is populated and the invalidation + * relates to cache entries tagged with this PASID and matching the address + * range. + * - If ARCHID bit is set, @archid is populated and the invalidation relates + * to cache entries tagged with this architecture specific ID and matching + * the address range. + * - Both PASID and ARCHID can be set as they may tag different caches. + * - If neither PASID or ARCHID is set, global addr invalidation applies. + * - The LEAF flag indicates whether only the leaf PTE caching needs to be + * invalidated and other paging structure caches can be preserved. + * @pasid: process address space ID + * @archid: architecture-specific ID + * @addr: first stage/level input address + * @granule_size: page/block size of the mapping in bytes + * @nb_granules: number of contiguous granules to be invalidated + */ +struct iommu_inv_addr_info { +#define IOMMU_INV_ADDR_FLAGS_PASID (1 << 0) +#define IOMMU_INV_ADDR_FLAGS_ARCHID (1 << 1) +#define IOMMU_INV_ADDR_FLAGS_LEAF (1 << 2) + __u32 flags; + __u32 archid; + __u64 pasid; + __u64 addr; + __u64 granule_size; + __u64 nb_granules; +}; + +/** + * struct iommu_inv_pasid_info - PASID Selective Invalidation Structure + * + * @flags: indicates the granularity of the PASID-selective invalidation + * - If the PASID bit is set, the @pasid field is populated and the invalidation + * relates to cache entries tagged with this PASID and matching the address + * range. + * - If the ARCHID bit is set, the @archid is populated and the invalidation + * relates to cache entries tagged with this architecture specific ID and + * matching the address range. + * - Both PASID and ARCHID can be set as they may tag different caches. + * - At least one of PASID or ARCHID must be set. + * @pasid: process address space ID + * @archid: architecture-specific ID + */ +struct iommu_inv_pasid_info { +#define IOMMU_INV_PASID_FLAGS_PASID (1 << 0) +#define IOMMU_INV_PASID_FLAGS_ARCHID (1 << 1) + __u32 flags; + __u32 archid; + __u64 pasid; +}; + +/** + * struct iommu_cache_invalidate_info - First level/stage invalidation + * information + * @version: API version of this structure + * @cache: bitfield that allows to select which caches to invalidate + * @granularity: defines the lowest granularity used for the invalidation: + * domain > PASID > addr + * @padding: reserved for future use (should be zero) + * @pasid_info: invalidation data when @granularity is %IOMMU_INV_GRANU_PASID + * @addr_info: invalidation data when @granularity is %IOMMU_INV_GRANU_ADDR + * + * Not all the combinations of cache/granularity are valid: + * + * +--------------+---------------+---------------+---------------+ + * | type / | DEV_IOTLB | IOTLB | PASID | + * | granularity | | | cache | + * +==============+===============+===============+===============+ + * | DOMAIN | N/A | Y | Y | + * +--------------+---------------+---------------+---------------+ + * | PASID | Y | Y | Y | + * +--------------+---------------+---------------+---------------+ + * | ADDR | Y | Y | N/A | + * +--------------+---------------+---------------+---------------+ + * + * Invalidations by %IOMMU_INV_GRANU_DOMAIN don't take any argument other than + * @version and @cache. + * + * If multiple cache types are invalidated simultaneously, they all + * must support the used granularity. 
+ */ +struct iommu_cache_invalidate_info { +#define IOMMU_CACHE_INVALIDATE_INFO_VERSION_1 1 + __u32 version; +/* IOMMU paging structure cache */ +#define IOMMU_CACHE_INV_TYPE_IOTLB (1 << 0) /* IOMMU IOTLB */ +#define IOMMU_CACHE_INV_TYPE_DEV_IOTLB (1 << 1) /* Device IOTLB */ +#define IOMMU_CACHE_INV_TYPE_PASID (1 << 2) /* PASID cache */ +#define IOMMU_CACHE_INV_TYPE_NR (3) + __u8 cache; + __u8 granularity; + __u8 padding[2]; + union { + struct iommu_inv_pasid_info pasid_info; + struct iommu_inv_addr_info addr_info; + }; +}; + #endif /* _UAPI_IOMMU_H */ -- Gitee From a3a6957dc4747954c72397a4053eee497882ebd0 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 2 Oct 2019 12:42:41 -0700 Subject: [PATCH 0177/2161] iommu: Add I/O ASID allocator commit fa83433c92e340822a056a610a4fa2063a3db304 upstream. Some devices might support multiple DMA address spaces, in particular those that have the PCI PASID feature. PASID (Process Address Space ID) allows to share process address spaces with devices (SVA), partition a device into VM-assignable entities (VFIO mdev) or simply provide multiple DMA address space to kernel drivers. Add a global PASID allocator usable by different drivers at the same time. Name it I/O ASID to avoid confusion with ASIDs allocated by arch code, which are usually a separate ID space. The IOASID space is global. Each device can have its own PASID space, but by convention the IOMMU ended up having a global PASID space, so that with SVA, each mm_struct is associated to a single PASID. The allocator is primarily used by IOMMU subsystem but in rare occasions drivers would like to allocate PASIDs for devices that aren't managed by an IOMMU, using the same ID space as IOMMU. Signed-off-by: Jean-Philippe Brucker Signed-off-by: Jacob Pan Reviewed-by: Jean-Philippe Brucker Reviewed-by: Eric Auger Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/Kconfig | 4 ++ drivers/iommu/Makefile | 1 + drivers/iommu/ioasid.c | 151 +++++++++++++++++++++++++++++++++++++++++ include/linux/ioasid.h | 48 +++++++++++++ 4 files changed, 204 insertions(+) create mode 100644 drivers/iommu/ioasid.c create mode 100644 include/linux/ioasid.h diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 2aa340c63939..5c8bbb2eac89 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -3,6 +3,10 @@ config IOMMU_IOVA tristate +# The IOASID library may also be used by non-IOMMU_API users +config IOASID + tristate + # IOMMU_API always gets selected by whoever wants it. config IOMMU_API bool diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 4f405f926e73..35d17094fe3b 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o +obj-$(CONFIG_IOASID) += ioasid.o obj-$(CONFIG_IOMMU_IOVA) += iova.o obj-$(CONFIG_OF_IOMMU) += of_iommu.o obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c new file mode 100644 index 000000000000..aadf500c281c --- /dev/null +++ b/drivers/iommu/ioasid.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * I/O Address Space ID allocator. There is one global IOASID space, split into + * subsets. Users create a subset with DECLARE_IOASID_SET, then allocate and + * free IOASIDs with ioasid_alloc and ioasid_free. 
+ */ +#include +#include +#include +#include +#include + +struct ioasid_data { + ioasid_t id; + struct ioasid_set *set; + void *private; + struct rcu_head rcu; +}; + +static DEFINE_XARRAY_ALLOC(ioasid_xa); + +/** + * ioasid_set_data - Set private data for an allocated ioasid + * @ioasid: the ID to set data + * @data: the private data + * + * For IOASID that is already allocated, private data can be set + * via this API. Future lookup can be done via ioasid_find. + */ +int ioasid_set_data(ioasid_t ioasid, void *data) +{ + struct ioasid_data *ioasid_data; + int ret = 0; + + xa_lock(&ioasid_xa); + ioasid_data = xa_load(&ioasid_xa, ioasid); + if (ioasid_data) + rcu_assign_pointer(ioasid_data->private, data); + else + ret = -ENOENT; + xa_unlock(&ioasid_xa); + + /* + * Wait for readers to stop accessing the old private data, so the + * caller can free it. + */ + if (!ret) + synchronize_rcu(); + + return ret; +} +EXPORT_SYMBOL_GPL(ioasid_set_data); + +/** + * ioasid_alloc - Allocate an IOASID + * @set: the IOASID set + * @min: the minimum ID (inclusive) + * @max: the maximum ID (inclusive) + * @private: data private to the caller + * + * Allocate an ID between @min and @max. The @private pointer is stored + * internally and can be retrieved with ioasid_find(). + * + * Return: the allocated ID on success, or %INVALID_IOASID on failure. + */ +ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, + void *private) +{ + struct ioasid_data *data; + ioasid_t id; + + data = kzalloc(sizeof(*data), GFP_ATOMIC); + if (!data) + return INVALID_IOASID; + + data->set = set; + data->private = private; + + if (xa_alloc(&ioasid_xa, &id, data, XA_LIMIT(min, max), GFP_KERNEL)) { + pr_err("Failed to alloc ioasid from %d to %d\n", min, max); + goto exit_free; + } + data->id = id; + + return id; +exit_free: + kfree(data); + return INVALID_IOASID; +} +EXPORT_SYMBOL_GPL(ioasid_alloc); + +/** + * ioasid_free - Free an IOASID + * @ioasid: the ID to remove + */ +void ioasid_free(ioasid_t ioasid) +{ + struct ioasid_data *ioasid_data; + + ioasid_data = xa_erase(&ioasid_xa, ioasid); + + kfree_rcu(ioasid_data, rcu); +} +EXPORT_SYMBOL_GPL(ioasid_free); + +/** + * ioasid_find - Find IOASID data + * @set: the IOASID set + * @ioasid: the IOASID to find + * @getter: function to call on the found object + * + * The optional getter function allows to take a reference to the found object + * under the rcu lock. The function can also check if the object is still valid: + * if @getter returns false, then the object is invalid and NULL is returned. + * + * If the IOASID exists, return the private pointer passed to ioasid_alloc. + * Private data can be NULL if not set. Return an error if the IOASID is not + * found, or if @set is not NULL and the IOASID does not belong to the set. 
+ */ +void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, + bool (*getter)(void *)) +{ + void *priv; + struct ioasid_data *ioasid_data; + + rcu_read_lock(); + ioasid_data = xa_load(&ioasid_xa, ioasid); + if (!ioasid_data) { + priv = ERR_PTR(-ENOENT); + goto unlock; + } + if (set && ioasid_data->set != set) { + /* data found but does not belong to the set */ + priv = ERR_PTR(-EACCES); + goto unlock; + } + /* Now IOASID and its set is verified, we can return the private data */ + priv = rcu_dereference(ioasid_data->private); + if (getter && !getter(priv)) + priv = NULL; +unlock: + rcu_read_unlock(); + + return priv; +} +EXPORT_SYMBOL_GPL(ioasid_find); + +MODULE_AUTHOR("Jean-Philippe Brucker "); +MODULE_AUTHOR("Jacob Pan "); +MODULE_DESCRIPTION("IO Address Space ID (IOASID) allocator"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h new file mode 100644 index 000000000000..17337a13f06e --- /dev/null +++ b/include/linux/ioasid.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_IOASID_H +#define __LINUX_IOASID_H + +#include +#include + +#define INVALID_IOASID ((ioasid_t)-1) +typedef unsigned int ioasid_t; + +struct ioasid_set { + int dummy; +}; + +#define DECLARE_IOASID_SET(name) struct ioasid_set name = { 0 } + +#if IS_ENABLED(CONFIG_IOASID) +ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, + void *private); +void ioasid_free(ioasid_t ioasid); +void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, + bool (*getter)(void *)); +int ioasid_set_data(ioasid_t ioasid, void *data); + +#else /* !CONFIG_IOASID */ +static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, + ioasid_t max, void *private) +{ + return INVALID_IOASID; +} + +static inline void ioasid_free(ioasid_t ioasid) +{ +} + +static inline void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, + bool (*getter)(void *)) +{ + return NULL; +} + +static inline int ioasid_set_data(ioasid_t ioasid, void *data) +{ + return -ENOTSUPP; +} + +#endif /* CONFIG_IOASID */ +#endif /* __LINUX_IOASID_H */ -- Gitee From 36bcfd51b15abd80a19947c0e70bacb65775738f Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 2 Oct 2019 12:42:42 -0700 Subject: [PATCH 0178/2161] iommu/ioasid: Add custom allocators commit e5c0bd7f2206cd288029edb6afbfde93c73b4048 upstream. IOASID allocation may rely on platform specific methods. One use case is that when running in the guest, in order to obtain system wide global IOASIDs, emulated allocation interface is needed to communicate with the host. Here we call these platform specific allocators custom allocators. Custom IOASID allocators can be registered at runtime and take precedence over the default XArray allocator. They have these attributes: - provides platform specific alloc()/free() functions with private data. - allocation results lookup are not provided by the allocator, lookup request must be done by the IOASID framework by its own XArray. - allocators can be unregistered at runtime, either fallback to the next custom allocator or to the default allocator. - custom allocators can share the same set of alloc()/free() helpers, in this case they also share the same IOASID space, thus the same XArray. - switching between allocators requires all outstanding IOASIDs to be freed unless the two allocators share the same alloc()/free() helpers. 
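As an illustration of the interface described above, a minimal sketch of registering a custom allocator follows; struct my_iommu and the my_hw_*() helpers are hypothetical stand-ins for a platform-specific (for example paravirtualized) PASID interface and are not part of this patch.

#include <linux/ioasid.h>

/* Hypothetical custom allocator backed by a platform/paravirt interface. */
static ioasid_t my_alloc(ioasid_t min, ioasid_t max, void *data)
{
	struct my_iommu *iommu = data;	/* handed back via ops->pdata */
	int id = my_hw_alloc_pasid(iommu, min, max);

	return (id < 0) ? INVALID_IOASID : (ioasid_t)id;
}

static void my_free(ioasid_t ioasid, void *data)
{
	my_hw_free_pasid(data, ioasid);
}

static struct ioasid_allocator_ops my_ioasid_ops = {
	.alloc = my_alloc,
	.free  = my_free,
};

static int my_iommu_probe(struct my_iommu *iommu)
{
	my_ioasid_ops.pdata = iommu;
	/* Takes precedence over the default XArray allocator once registered. */
	return ioasid_register_allocator(&my_ioasid_ops);
}

static void my_iommu_remove(struct my_iommu *iommu)
{
	/* All IOASIDs from this allocator must have been freed by now. */
	ioasid_unregister_allocator(&my_ioasid_ops);
}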
Signed-off-by: Jean-Philippe Brucker Signed-off-by: Jacob Pan Link: https://lkml.org/lkml/2019/4/26/462 Reviewed-by: Jean-Philippe Brucker Reviewed-by: Eric Auger Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid.c | 289 +++++++++++++++++++++++++++++++++++++++-- include/linux/ioasid.h | 28 ++++ 2 files changed, 308 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index aadf500c281c..0f8dd377aada 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -17,7 +17,245 @@ struct ioasid_data { struct rcu_head rcu; }; -static DEFINE_XARRAY_ALLOC(ioasid_xa); +/* + * struct ioasid_allocator_data - Internal data structure to hold information + * about an allocator. There are two types of allocators: + * + * - Default allocator always has its own XArray to track the IOASIDs allocated. + * - Custom allocators may share allocation helpers with different private data. + * Custom allocators that share the same helper functions also share the same + * XArray. + * Rules: + * 1. Default allocator is always available, not dynamically registered. This is + * to prevent race conditions with early boot code that want to register + * custom allocators or allocate IOASIDs. + * 2. Custom allocators take precedence over the default allocator. + * 3. When all custom allocators sharing the same helper functions are + * unregistered (e.g. due to hotplug), all outstanding IOASIDs must be + * freed. Otherwise, outstanding IOASIDs will be lost and orphaned. + * 4. When switching between custom allocators sharing the same helper + * functions, outstanding IOASIDs are preserved. + * 5. When switching between custom allocator and default allocator, all IOASIDs + * must be freed to ensure unadulterated space for the new allocator. 
+ * + * @ops: allocator helper functions and its data + * @list: registered custom allocators + * @slist: allocators share the same ops but different data + * @flags: attributes of the allocator + * @xa: xarray holds the IOASID space + * @rcu: used for kfree_rcu when unregistering allocator + */ +struct ioasid_allocator_data { + struct ioasid_allocator_ops *ops; + struct list_head list; + struct list_head slist; +#define IOASID_ALLOCATOR_CUSTOM BIT(0) /* Needs framework to track results */ + unsigned long flags; + struct xarray xa; + struct rcu_head rcu; +}; + +static DEFINE_SPINLOCK(ioasid_allocator_lock); +static LIST_HEAD(allocators_list); + +static ioasid_t default_alloc(ioasid_t min, ioasid_t max, void *opaque); +static void default_free(ioasid_t ioasid, void *opaque); + +static struct ioasid_allocator_ops default_ops = { + .alloc = default_alloc, + .free = default_free, +}; + +static struct ioasid_allocator_data default_allocator = { + .ops = &default_ops, + .flags = 0, + .xa = XARRAY_INIT(ioasid_xa, XA_FLAGS_ALLOC), +}; + +static struct ioasid_allocator_data *active_allocator = &default_allocator; + +static ioasid_t default_alloc(ioasid_t min, ioasid_t max, void *opaque) +{ + ioasid_t id; + + if (xa_alloc(&default_allocator.xa, &id, opaque, XA_LIMIT(min, max), GFP_ATOMIC)) { + pr_err("Failed to alloc ioasid from %d to %d\n", min, max); + return INVALID_IOASID; + } + + return id; +} + +static void default_free(ioasid_t ioasid, void *opaque) +{ + struct ioasid_data *ioasid_data; + + ioasid_data = xa_erase(&default_allocator.xa, ioasid); + kfree_rcu(ioasid_data, rcu); +} + +/* Allocate and initialize a new custom allocator with its helper functions */ +static struct ioasid_allocator_data *ioasid_alloc_allocator(struct ioasid_allocator_ops *ops) +{ + struct ioasid_allocator_data *ia_data; + + ia_data = kzalloc(sizeof(*ia_data), GFP_ATOMIC); + if (!ia_data) + return NULL; + + xa_init_flags(&ia_data->xa, XA_FLAGS_ALLOC); + INIT_LIST_HEAD(&ia_data->slist); + ia_data->flags |= IOASID_ALLOCATOR_CUSTOM; + ia_data->ops = ops; + + /* For tracking custom allocators that share the same ops */ + list_add_tail(&ops->list, &ia_data->slist); + + return ia_data; +} + +static bool use_same_ops(struct ioasid_allocator_ops *a, struct ioasid_allocator_ops *b) +{ + return (a->free == b->free) && (a->alloc == b->alloc); +} + +/** + * ioasid_register_allocator - register a custom allocator + * @ops: the custom allocator ops to be registered + * + * Custom allocators take precedence over the default xarray based allocator. + * Private data associated with the IOASID allocated by the custom allocators + * are managed by IOASID framework similar to data stored in xa by default + * allocator. + * + * There can be multiple allocators registered but only one is active. In case + * of runtime removal of a custom allocator, the next one is activated based + * on the registration ordering. + * + * Multiple allocators can share the same alloc() function, in this case the + * IOASID space is shared. + */ +int ioasid_register_allocator(struct ioasid_allocator_ops *ops) +{ + struct ioasid_allocator_data *ia_data; + struct ioasid_allocator_data *pallocator; + int ret = 0; + + spin_lock(&ioasid_allocator_lock); + + ia_data = ioasid_alloc_allocator(ops); + if (!ia_data) { + ret = -ENOMEM; + goto out_unlock; + } + + /* + * No particular preference, we activate the first one and keep + * the later registered allocators in a list in case the first one gets + * removed due to hotplug. 
+ */ + if (list_empty(&allocators_list)) { + WARN_ON(active_allocator != &default_allocator); + /* Use this new allocator if default is not active */ + if (xa_empty(&active_allocator->xa)) { + rcu_assign_pointer(active_allocator, ia_data); + list_add_tail(&ia_data->list, &allocators_list); + goto out_unlock; + } + pr_warn("Default allocator active with outstanding IOASID\n"); + ret = -EAGAIN; + goto out_free; + } + + /* Check if the allocator is already registered */ + list_for_each_entry(pallocator, &allocators_list, list) { + if (pallocator->ops == ops) { + pr_err("IOASID allocator already registered\n"); + ret = -EEXIST; + goto out_free; + } else if (use_same_ops(pallocator->ops, ops)) { + /* + * If the new allocator shares the same ops, + * then they will share the same IOASID space. + * We should put them under the same xarray. + */ + list_add_tail(&ops->list, &pallocator->slist); + goto out_free; + } + } + list_add_tail(&ia_data->list, &allocators_list); + + spin_unlock(&ioasid_allocator_lock); + return 0; +out_free: + kfree(ia_data); +out_unlock: + spin_unlock(&ioasid_allocator_lock); + return ret; +} +EXPORT_SYMBOL_GPL(ioasid_register_allocator); + +/** + * ioasid_unregister_allocator - Remove a custom IOASID allocator ops + * @ops: the custom allocator to be removed + * + * Remove an allocator from the list, activate the next allocator in + * the order it was registered. Or revert to default allocator if all + * custom allocators are unregistered without outstanding IOASIDs. + */ +void ioasid_unregister_allocator(struct ioasid_allocator_ops *ops) +{ + struct ioasid_allocator_data *pallocator; + struct ioasid_allocator_ops *sops; + + spin_lock(&ioasid_allocator_lock); + if (list_empty(&allocators_list)) { + pr_warn("No custom IOASID allocators active!\n"); + goto exit_unlock; + } + + list_for_each_entry(pallocator, &allocators_list, list) { + if (!use_same_ops(pallocator->ops, ops)) + continue; + + if (list_is_singular(&pallocator->slist)) { + /* No shared helper functions */ + list_del(&pallocator->list); + /* + * All IOASIDs should have been freed before + * the last allocator that shares the same ops + * is unregistered. 
+ */ + WARN_ON(!xa_empty(&pallocator->xa)); + if (list_empty(&allocators_list)) { + pr_info("No custom IOASID allocators, switch to default.\n"); + rcu_assign_pointer(active_allocator, &default_allocator); + } else if (pallocator == active_allocator) { + rcu_assign_pointer(active_allocator, + list_first_entry(&allocators_list, + struct ioasid_allocator_data, list)); + pr_info("IOASID allocator changed"); + } + kfree_rcu(pallocator, rcu); + break; + } + /* + * Find the matching shared ops to delete, + * but keep outstanding IOASIDs + */ + list_for_each_entry(sops, &pallocator->slist, list) { + if (sops == ops) { + list_del(&ops->list); + break; + } + } + break; + } + +exit_unlock: + spin_unlock(&ioasid_allocator_lock); +} +EXPORT_SYMBOL_GPL(ioasid_unregister_allocator); /** * ioasid_set_data - Set private data for an allocated ioasid @@ -32,13 +270,13 @@ int ioasid_set_data(ioasid_t ioasid, void *data) struct ioasid_data *ioasid_data; int ret = 0; - xa_lock(&ioasid_xa); - ioasid_data = xa_load(&ioasid_xa, ioasid); + spin_lock(&ioasid_allocator_lock); + ioasid_data = xa_load(&active_allocator->xa, ioasid); if (ioasid_data) rcu_assign_pointer(ioasid_data->private, data); else ret = -ENOENT; - xa_unlock(&ioasid_xa); + spin_unlock(&ioasid_allocator_lock); /* * Wait for readers to stop accessing the old private data, so the @@ -67,6 +305,7 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, void *private) { struct ioasid_data *data; + void *adata; ioasid_t id; data = kzalloc(sizeof(*data), GFP_ATOMIC); @@ -76,14 +315,31 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, data->set = set; data->private = private; - if (xa_alloc(&ioasid_xa, &id, data, XA_LIMIT(min, max), GFP_KERNEL)) { - pr_err("Failed to alloc ioasid from %d to %d\n", min, max); + /* + * Custom allocator needs allocator data to perform platform specific + * operations. + */ + spin_lock(&ioasid_allocator_lock); + adata = active_allocator->flags & IOASID_ALLOCATOR_CUSTOM ? 
active_allocator->ops->pdata : data; + id = active_allocator->ops->alloc(min, max, adata); + if (id == INVALID_IOASID) { + pr_err("Failed ASID allocation %lu\n", active_allocator->flags); + goto exit_free; + } + + if ((active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) && + xa_alloc(&active_allocator->xa, &id, data, XA_LIMIT(id, id), GFP_ATOMIC)) { + /* Custom allocator needs framework to store and track allocation results */ + pr_err("Failed to alloc ioasid from %d\n", id); + active_allocator->ops->free(id, active_allocator->ops->pdata); goto exit_free; } data->id = id; + spin_unlock(&ioasid_allocator_lock); return id; exit_free: + spin_unlock(&ioasid_allocator_lock); kfree(data); return INVALID_IOASID; } @@ -97,9 +353,22 @@ void ioasid_free(ioasid_t ioasid) { struct ioasid_data *ioasid_data; - ioasid_data = xa_erase(&ioasid_xa, ioasid); + spin_lock(&ioasid_allocator_lock); + ioasid_data = xa_load(&active_allocator->xa, ioasid); + if (!ioasid_data) { + pr_err("Trying to free unknown IOASID %u\n", ioasid); + goto exit_unlock; + } - kfree_rcu(ioasid_data, rcu); + active_allocator->ops->free(ioasid, active_allocator->ops->pdata); + /* Custom allocator needs additional steps to free the xa element */ + if (active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) { + ioasid_data = xa_erase(&active_allocator->xa, ioasid); + kfree_rcu(ioasid_data, rcu); + } + +exit_unlock: + spin_unlock(&ioasid_allocator_lock); } EXPORT_SYMBOL_GPL(ioasid_free); @@ -122,9 +391,11 @@ void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, { void *priv; struct ioasid_data *ioasid_data; + struct ioasid_allocator_data *idata; rcu_read_lock(); - ioasid_data = xa_load(&ioasid_xa, ioasid); + idata = rcu_dereference(active_allocator); + ioasid_data = xa_load(&idata->xa, ioasid); if (!ioasid_data) { priv = ERR_PTR(-ENOENT); goto unlock; diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index 17337a13f06e..6f000d7a0ddc 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -7,11 +7,28 @@ #define INVALID_IOASID ((ioasid_t)-1) typedef unsigned int ioasid_t; +typedef ioasid_t (*ioasid_alloc_fn_t)(ioasid_t min, ioasid_t max, void *data); +typedef void (*ioasid_free_fn_t)(ioasid_t ioasid, void *data); struct ioasid_set { int dummy; }; +/** + * struct ioasid_allocator_ops - IOASID allocator helper functions and data + * + * @alloc: helper function to allocate IOASID + * @free: helper function to free IOASID + * @list: for tracking ops that share helper functions but not data + * @pdata: data belong to the allocator, provided when calling alloc() + */ +struct ioasid_allocator_ops { + ioasid_alloc_fn_t alloc; + ioasid_free_fn_t free; + struct list_head list; + void *pdata; +}; + #define DECLARE_IOASID_SET(name) struct ioasid_set name = { 0 } #if IS_ENABLED(CONFIG_IOASID) @@ -20,6 +37,8 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, void ioasid_free(ioasid_t ioasid); void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, bool (*getter)(void *)); +int ioasid_register_allocator(struct ioasid_allocator_ops *allocator); +void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator); int ioasid_set_data(ioasid_t ioasid, void *data); #else /* !CONFIG_IOASID */ @@ -39,6 +58,15 @@ static inline void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, return NULL; } +static inline int ioasid_register_allocator(struct ioasid_allocator_ops *allocator) +{ + return -ENOTSUPP; +} + +static inline void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator) +{ +} + 
static inline int ioasid_set_data(ioasid_t ioasid, void *data) { return -ENOTSUPP; -- Gitee From acdb8a39f256d0a7af86d3e8c8726b3ecbc8b768 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 2 Oct 2019 12:42:43 -0700 Subject: [PATCH 0179/2161] iommu: Introduce guest PASID bind function commit 808be0aae53a3675337fad9cde616e086bdc8287 upstream. Guest shared virtual address (SVA) may require host to shadow guest PASID tables. Guest PASID can also be allocated from the host via enlightened interfaces. In this case, guest needs to bind the guest mm, i.e. cr3 in guest physical address to the actual PASID table in the host IOMMU. Nesting will be turned on such that guest virtual address can go through a two level translation: - 1st level translates GVA to GPA - 2nd level translates GPA to HPA This patch introduces APIs to bind guest PASID data to the assigned device entry in the physical IOMMU. See the diagram below for usage explanation. .-------------. .---------------------------. | vIOMMU | | Guest process mm, FL only | | | '---------------------------' .----------------/ | PASID Entry |--- PASID cache flush - '-------------' | | | V | | GP '-------------' Guest ------| Shadow |----------------------- GP->HP* --------- v v | Host v .-------------. .----------------------. | pIOMMU | | Bind FL for GVA-GPA | | | '----------------------' .----------------/ | | PASID Entry | V (Nested xlate) '----------------\.---------------------. | | |Set SL to GPA-HPA | | | '---------------------' '-------------' Where: - FL = First level/stage one page tables - SL = Second level/stage two page tables - GP = Guest PASID - HP = Host PASID * Conversion needed if non-identity GP-HP mapping option is chosen. Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Reviewed-by: Jean-Philippe Brucker Reviewed-by: Jean-Philippe Brucker Reviewed-by: Eric Auger Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 20 +++++++++++++ include/linux/iommu.h | 22 ++++++++++++++ include/uapi/linux/iommu.h | 59 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 101 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index a6721f3c3bed..ec7992b5cfde 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1689,6 +1689,26 @@ int iommu_cache_invalidate(struct iommu_domain *domain, struct device *dev, } EXPORT_SYMBOL_GPL(iommu_cache_invalidate); +int iommu_sva_bind_gpasid(struct iommu_domain *domain, + struct device *dev, struct iommu_gpasid_bind_data *data) +{ + if (unlikely(!domain->ops->sva_bind_gpasid)) + return -ENODEV; + + return domain->ops->sva_bind_gpasid(domain, dev, data); +} +EXPORT_SYMBOL_GPL(iommu_sva_bind_gpasid); + +int iommu_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid) +{ + if (unlikely(!domain->ops->sva_unbind_gpasid)) + return -ENODEV; + + return domain->ops->sva_unbind_gpasid(dev, pasid); +} +EXPORT_SYMBOL_GPL(iommu_sva_unbind_gpasid); + static void __iommu_detach_device(struct iommu_domain *domain, struct device *dev) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0b09fad57a77..9f17b963a4f4 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #define IOMMU_READ (1 << 0) @@ -246,6 +247,8 @@ struct iommu_iotlb_gather { * @page_response: handle page request response * @cache_invalidate: invalidate translation caches * @pgsize_bitmap: bitmap of all possible supported page sizes + * @sva_bind_gpasid: 
bind guest pasid and mm + * @sva_unbind_gpasid: unbind guest pasid and mm */ struct iommu_ops { bool (*capable)(enum iommu_cap); @@ -309,6 +312,10 @@ struct iommu_ops { struct iommu_page_response *msg); int (*cache_invalidate)(struct iommu_domain *domain, struct device *dev, struct iommu_cache_invalidate_info *inv_info); + int (*sva_bind_gpasid)(struct iommu_domain *domain, + struct device *dev, struct iommu_gpasid_bind_data *data); + + int (*sva_unbind_gpasid)(struct device *dev, int pasid); unsigned long pgsize_bitmap; @@ -429,6 +436,10 @@ extern void iommu_detach_device(struct iommu_domain *domain, extern int iommu_cache_invalidate(struct iommu_domain *domain, struct device *dev, struct iommu_cache_invalidate_info *inv_info); +extern int iommu_sva_bind_gpasid(struct iommu_domain *domain, + struct device *dev, struct iommu_gpasid_bind_data *data); +extern int iommu_sva_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, @@ -1043,6 +1054,17 @@ iommu_cache_invalidate(struct iommu_domain *domain, { return -ENODEV; } +static inline int iommu_sva_bind_gpasid(struct iommu_domain *domain, + struct device *dev, struct iommu_gpasid_bind_data *data) +{ + return -ENODEV; +} + +static inline int iommu_sva_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, int pasid) +{ + return -ENODEV; +} #endif /* CONFIG_IOMMU_API */ diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index f3e96214df8e..4ad3496e5c43 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -262,4 +262,63 @@ struct iommu_cache_invalidate_info { }; }; +/** + * struct iommu_gpasid_bind_data_vtd - Intel VT-d specific data on device and guest + * SVA binding. + * + * @flags: VT-d PASID table entry attributes + * @pat: Page attribute table data to compute effective memory type + * @emt: Extended memory type + * + * Only guest vIOMMU selectable and effective options are passed down to + * the host IOMMU. + */ +struct iommu_gpasid_bind_data_vtd { +#define IOMMU_SVA_VTD_GPASID_SRE (1 << 0) /* supervisor request */ +#define IOMMU_SVA_VTD_GPASID_EAFE (1 << 1) /* extended access enable */ +#define IOMMU_SVA_VTD_GPASID_PCD (1 << 2) /* page-level cache disable */ +#define IOMMU_SVA_VTD_GPASID_PWT (1 << 3) /* page-level write through */ +#define IOMMU_SVA_VTD_GPASID_EMTE (1 << 4) /* extended mem type enable */ +#define IOMMU_SVA_VTD_GPASID_CD (1 << 5) /* PASID-level cache disable */ + __u64 flags; + __u32 pat; + __u32 emt; +}; + +/** + * struct iommu_gpasid_bind_data - Information about device and guest PASID binding + * @version: Version of this data structure + * @format: PASID table entry format + * @flags: Additional information on guest bind request + * @gpgd: Guest page directory base of the guest mm to bind + * @hpasid: Process address space ID used for the guest mm in host IOMMU + * @gpasid: Process address space ID used for the guest mm in guest IOMMU + * @addr_width: Guest virtual address width + * @padding: Reserved for future use (should be zero) + * @vtd: Intel VT-d specific data + * + * Guest to host PASID mapping can be an identity or non-identity, where guest + * has its own PASID space. For non-identify mapping, guest to host PASID lookup + * is needed when VM programs guest PASID into an assigned device. 
VMM may + * trap such PASID programming then request host IOMMU driver to convert guest + * PASID to host PASID based on this bind data. + */ +struct iommu_gpasid_bind_data { +#define IOMMU_GPASID_BIND_VERSION_1 1 + __u32 version; +#define IOMMU_PASID_FORMAT_INTEL_VTD 1 + __u32 format; +#define IOMMU_SVA_GPASID_VAL (1 << 0) /* guest PASID valid */ + __u64 flags; + __u64 gpgd; + __u64 hpasid; + __u64 gpasid; + __u32 addr_width; + __u8 padding[12]; + /* Vendor specific data */ + union { + struct iommu_gpasid_bind_data_vtd vtd; + }; +}; + #endif /* _UAPI_IOMMU_H */ -- Gitee From 6b2166226f9d3f72bd3ba7f2fd00048ebdbbc133 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 9 Oct 2019 15:14:05 -0500 Subject: [PATCH 0180/2161] iommu/vt-d: Select PCI_PRI for INTEL_IOMMU_SVM commit fd872843ecd52820654dcacf85e680d8101a9fde upstream. Previously intel-iommu.c depended on CONFIG_AMD_IOMMU in an undesirable way. When CONFIG_INTEL_IOMMU_SVM=y, iommu_enable_dev_iotlb() calls PRI interfaces (pci_reset_pri() and pci_enable_pri()), but those are only implemented when CONFIG_PCI_PRI is enabled. The INTEL_IOMMU_SVM Kconfig did nothing with PCI_PRI, but AMD_IOMMU selects PCI_PRI. So if AMD_IOMMU was enabled, intel-iommu.c got the full PRI interfaces, but if AMD_IOMMU was not enabled, it got the PRI stubs. Make the iommu_enable_dev_iotlb() behavior independent of AMD_IOMMU by having INTEL_IOMMU_SVM select PCI_PRI so iommu_enable_dev_iotlb() always uses the full implementations of PRI interfaces. Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Jerry Snitselaar Reviewed-by: Joerg Roedel Acked-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 5c8bbb2eac89..36ee7b44c4fd 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -211,6 +211,7 @@ config INTEL_IOMMU_SVM bool "Support for Shared Virtual Memory with Intel IOMMU" depends on INTEL_IOMMU && X86_64 select PCI_PASID + select PCI_PRI select MMU_NOTIFIER help Shared Virtual Memory (SVM) provides a facility for devices -- Gitee From 680b5fa0979d77ea9eeb6ce1ee9e09ca234f935f Mon Sep 17 00:00:00 2001 From: Yian Chen Date: Thu, 17 Oct 2019 04:39:19 -0700 Subject: [PATCH 0181/2161] iommu/vt-d: Check VT-d RMRR region in BIOS is reported as reserved commit f036c7fa0ab60b7ea47560c32c78e435eb1cd214 upstream. VT-d RMRR (Reserved Memory Region Reporting) regions are reserved for device use only and should not be part of allocable memory pool of OS. BIOS e820_table reports complete memory map to OS, including OS usable memory ranges and BIOS reserved memory ranges etc. x86 BIOS may not be trusted to include RMRR regions as reserved type of memory in its e820 memory map, hence validate every RMRR entry with the e820 memory map to make sure the RMRR regions will not be used by OS for any other purposes. 
ia64 EFI is working fine so implement RMRR validation as a dummy function Reviewed-by: Lu Baolu Reviewed-by: Sohil Mehta Signed-off-by: Yian Chen Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/iommu.h | 18 ++++++++++++++++++ drivers/iommu/intel-iommu.c | 8 +++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index b91623d521d9..bf1ed2ddc74b 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -2,10 +2,28 @@ #ifndef _ASM_X86_IOMMU_H #define _ASM_X86_IOMMU_H +#include + +#include + extern int force_iommu, no_iommu; extern int iommu_detected; /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) +static inline int __init +arch_rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) +{ + u64 start = rmrr->base_address; + u64 end = rmrr->end_address + 1; + + if (e820__mapped_all(start, end, E820_TYPE_RESERVED)) + return 0; + + pr_err(FW_BUG "No firmware reserved region can cover this RMRR [%#018Lx-%#018Lx], contact BIOS vendor for fixes\n", + start, end - 1); + return -EINVAL; +} + #endif /* _ASM_X86_IOMMU_H */ diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 03c69a96027a..e89f02826b57 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4323,13 +4323,19 @@ int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) { struct acpi_dmar_reserved_memory *rmrr; struct dmar_rmrr_unit *rmrru; + int ret; + + rmrr = (struct acpi_dmar_reserved_memory *)header; + ret = arch_rmrr_sanity_check(rmrr); + if (ret) + return ret; rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); if (!rmrru) goto out; rmrru->hdr = header; - rmrr = (struct acpi_dmar_reserved_memory *)header; + rmrru->base_address = rmrr->base_address; rmrru->end_address = rmrr->end_address; -- Gitee From b6a3e686504321729abe73bce79952a133c6d145 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sun, 10 Nov 2019 09:27:44 -0800 Subject: [PATCH 0182/2161] iommu/vt-d: Turn off translations at shutdown commit 6c3a44ed3c553c324845744f30bcd1d3b07d61fd upstream. The intel-iommu driver assumes that the iommu state is cleaned up at the start of the new kernel. But, when we try to kexec boot something other than the Linux kernel, the cleanup cannot be relied upon. Hence, cleanup before we go down for reboot. Keeping the cleanup at initialization also, in case BIOS leaves the IOMMU enabled. I considered turning off iommu only during kexec reboot, but a clean shutdown seems always a good idea. But if someone wants to make it conditional, such as VMM live update, we can do that. There doesn't seem to be such a condition at this time. Tested that before, the info message 'DMAR: Translation was enabled for but we are not in kdump mode' would be reported for each iommu. The message will not appear when the DMA-remapping is not enabled on entry to the kernel. 
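For context, a rough sketch of how the new hook is reached at reboot; the call site shown is an assumption for illustration only, since this patch itself only installs the hook in detect_intel_iommu() and implements intel_iommu_shutdown().

/*
 * Hypothetical sketch: the x86 shutdown path invokes whichever
 * iommu_shutdown hook was installed, which this patch points at
 * intel_iommu_shutdown().
 */
static void example_machine_shutdown(void)
{
	/* ... stop secondary CPUs, quiesce devices ... */
	if (x86_platform.iommu_shutdown)
		x86_platform.iommu_shutdown();	/* -> intel_iommu_shutdown() */
}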
Signed-off-by: Deepa Dinamani Acked-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dmar.c | 5 ++++- drivers/iommu/intel-iommu.c | 20 ++++++++++++++++++++ include/linux/dmar.h | 2 ++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 1b9795743276..1c952d24af36 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -906,8 +906,11 @@ int __init detect_intel_iommu(void) } #ifdef CONFIG_X86 - if (!ret) + if (!ret) { x86_init.iommu.iommu_init = intel_iommu_init; + x86_platform.iommu_shutdown = intel_iommu_shutdown; + } + #endif if (dmar_tbl) { diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index e89f02826b57..aa6f37b2c92e 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4775,6 +4775,26 @@ static void intel_disable_iommus(void) iommu_disable_translation(iommu); } +void intel_iommu_shutdown(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = NULL; + + if (no_iommu || dmar_disabled) + return; + + down_write(&dmar_global_lock); + + /* Disable PMRs explicitly here. */ + for_each_iommu(iommu, drhd) + iommu_disable_protect_mem_regions(iommu); + + /* Make sure the IOMMUs are switched off */ + intel_disable_iommus(); + + up_write(&dmar_global_lock); +} + static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) { struct iommu_device *iommu_dev = dev_to_iommu_device(dev); diff --git a/include/linux/dmar.h b/include/linux/dmar.h index f397e52c2d9d..d7bf029df737 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -133,6 +133,7 @@ static inline int dmar_res_noop(struct acpi_dmar_header *hdr, void *arg) #ifdef CONFIG_INTEL_IOMMU extern int iommu_detected, no_iommu; extern int intel_iommu_init(void); +extern void intel_iommu_shutdown(void); extern int dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg); extern int dmar_parse_one_atsr(struct acpi_dmar_header *header, void *arg); extern int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg); @@ -141,6 +142,7 @@ extern int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert); extern int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info); #else /* !CONFIG_INTEL_IOMMU: */ static inline int intel_iommu_init(void) { return -ENODEV; } +static inline void intel_iommu_shutdown(void) { } #define dmar_parse_one_rmrr dmar_res_noop #define dmar_parse_one_atsr dmar_res_noop -- Gitee From 6f2943d9eb592a3cc517b2001a266a9e136a9943 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 19 Dec 2019 12:03:37 +0000 Subject: [PATCH 0183/2161] drivers/iommu: Export core IOMMU API symbols to permit modular drivers commit a7ba5c3d008dd78d881a1658eae5a2275ebd5087 upstream. Building IOMMU drivers as modules requires that the core IOMMU API symbols are exported as GPL symbols. 
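To make the need for these exports concrete, here is a hedged sketch of the init/exit skeleton of a modular IOMMU driver; the "my-iommu" name and my_iommu_dev object are illustrative, and a real driver would additionally set its ops and register per-bus and per-device hooks.

#include <linux/iommu.h>
#include <linux/module.h>

static struct iommu_device my_iommu_dev;	/* hypothetical instance */

static int __init my_iommu_driver_init(void)
{
	int ret;

	/* Both calls below only link from a module if the core symbols are exported. */
	ret = iommu_device_sysfs_add(&my_iommu_dev, NULL, NULL, "my-iommu");
	if (ret)
		return ret;

	ret = iommu_device_register(&my_iommu_dev);
	if (ret)
		iommu_device_sysfs_remove(&my_iommu_dev);
	return ret;
}
module_init(my_iommu_driver_init);

static void __exit my_iommu_driver_exit(void)
{
	iommu_device_unregister(&my_iommu_dev);
	iommu_device_sysfs_remove(&my_iommu_dev);
}
module_exit(my_iommu_driver_exit);

MODULE_LICENSE("GPL v2");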
Signed-off-by: Will Deacon Tested-by: John Garry # smmu v3 Reviewed-by: Greg Kroah-Hartman Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu-sysfs.c | 5 +++++ drivers/iommu/iommu.c | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/drivers/iommu/iommu-sysfs.c b/drivers/iommu/iommu-sysfs.c index e436ff813e7e..99869217fbec 100644 --- a/drivers/iommu/iommu-sysfs.c +++ b/drivers/iommu/iommu-sysfs.c @@ -87,6 +87,7 @@ int iommu_device_sysfs_add(struct iommu_device *iommu, put_device(iommu->dev); return ret; } +EXPORT_SYMBOL_GPL(iommu_device_sysfs_add); void iommu_device_sysfs_remove(struct iommu_device *iommu) { @@ -94,6 +95,8 @@ void iommu_device_sysfs_remove(struct iommu_device *iommu) device_unregister(iommu->dev); iommu->dev = NULL; } +EXPORT_SYMBOL_GPL(iommu_device_sysfs_remove); + /* * IOMMU drivers can indicate a device is managed by a given IOMMU using * this interface. A link to the device will be created in the "devices" @@ -119,6 +122,7 @@ int iommu_device_link(struct iommu_device *iommu, struct device *link) return ret; } +EXPORT_SYMBOL_GPL(iommu_device_link); void iommu_device_unlink(struct iommu_device *iommu, struct device *link) { @@ -128,3 +132,4 @@ void iommu_device_unlink(struct iommu_device *iommu, struct device *link) sysfs_remove_link(&link->kobj, "iommu"); sysfs_remove_link_from_group(&iommu->dev->kobj, "devices", dev_name(link)); } +EXPORT_SYMBOL_GPL(iommu_device_unlink); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index ec7992b5cfde..238dc7363d70 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -141,6 +141,7 @@ int iommu_device_register(struct iommu_device *iommu) spin_unlock(&iommu_device_lock); return 0; } +EXPORT_SYMBOL_GPL(iommu_device_register); void iommu_device_unregister(struct iommu_device *iommu) { @@ -148,6 +149,7 @@ void iommu_device_unregister(struct iommu_device *iommu) list_del(&iommu->list); spin_unlock(&iommu_device_lock); } +EXPORT_SYMBOL_GPL(iommu_device_unregister); static struct iommu_param *iommu_get_dev_param(struct device *dev) { @@ -887,6 +889,7 @@ struct iommu_group *iommu_group_ref_get(struct iommu_group *group) kobject_get(group->devices_kobj); return group; } +EXPORT_SYMBOL_GPL(iommu_group_ref_get); /** * iommu_group_put - Decrement group reference @@ -1260,6 +1263,7 @@ struct iommu_group *generic_device_group(struct device *dev) { return iommu_group_alloc(); } +EXPORT_SYMBOL_GPL(generic_device_group); /* * Use standard PCI bus topology, isolation features, and DMA alias quirks @@ -1327,6 +1331,7 @@ struct iommu_group *pci_device_group(struct device *dev) /* No shared group found, allocate new */ return iommu_group_alloc(); } +EXPORT_SYMBOL_GPL(pci_device_group); /* Get the IOMMU group for device on fsl-mc bus */ struct iommu_group *fsl_mc_device_group(struct device *dev) @@ -1339,6 +1344,7 @@ struct iommu_group *fsl_mc_device_group(struct device *dev) group = iommu_group_alloc(); return group; } +EXPORT_SYMBOL_GPL(fsl_mc_device_group); /** * iommu_group_get_for_dev - Find or create the IOMMU group for a device @@ -1420,6 +1426,7 @@ struct iommu_group *iommu_group_get_for_dev(struct device *dev) return group; } +EXPORT_SYMBOL(iommu_group_get_for_dev); struct iommu_domain *iommu_group_default_domain(struct iommu_group *group) { @@ -2260,6 +2267,7 @@ struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start, region->type = type; return region; } +EXPORT_SYMBOL_GPL(iommu_alloc_resv_region); static int request_default_domain_for_dev(struct 
device *dev, unsigned long type) -- Gitee From bb19a5d3aeff8d192a61b518852abb77e71d98fe Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 19 Dec 2019 12:03:41 +0000 Subject: [PATCH 0184/2161] drivers/iommu: Take a ref to the IOMMU driver prior to ->add_device() commit 25f003de987aed630db265ceae9cd978537a3f80 upstream. To avoid accidental removal of an active IOMMU driver module, take a reference to the driver module in 'iommu_probe_device()' immediately prior to invoking the '->add_device()' callback and hold it until after the device has been removed by '->remove_device()'. Suggested-by: Joerg Roedel Signed-off-by: Will Deacon Tested-by: John Garry # smmu v3 Reviewed-by: Greg Kroah-Hartman Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 19 +++++++++++++++++-- include/linux/iommu.h | 4 +++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 238dc7363d70..0ab7e85778c8 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -22,6 +22,7 @@ #include #include #include +#include #include static struct kset *iommu_group_kset; @@ -185,10 +186,21 @@ int iommu_probe_device(struct device *dev) if (!iommu_get_dev_param(dev)) return -ENOMEM; + if (!try_module_get(ops->owner)) { + ret = -EINVAL; + goto err_free_dev_param; + } + ret = ops->add_device(dev); if (ret) - iommu_free_dev_param(dev); + goto err_module_put; + + return 0; +err_module_put: + module_put(ops->owner); +err_free_dev_param: + iommu_free_dev_param(dev); return ret; } @@ -199,7 +211,10 @@ void iommu_release_device(struct device *dev) if (dev->iommu_group) ops->remove_device(dev); - iommu_free_dev_param(dev); + if (dev->iommu_param) { + module_put(ops->owner); + iommu_free_dev_param(dev); + } } static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 9f17b963a4f4..15837ee3e27c 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -246,9 +246,10 @@ struct iommu_iotlb_gather { * @sva_get_pasid: Get PASID associated to a SVA handle * @page_response: handle page request response * @cache_invalidate: invalidate translation caches - * @pgsize_bitmap: bitmap of all possible supported page sizes * @sva_bind_gpasid: bind guest pasid and mm * @sva_unbind_gpasid: unbind guest pasid and mm + * @pgsize_bitmap: bitmap of all possible supported page sizes + * @owner: Driver module providing these ops */ struct iommu_ops { bool (*capable)(enum iommu_cap); @@ -318,6 +319,7 @@ struct iommu_ops { int (*sva_unbind_gpasid)(struct device *dev, int pasid); unsigned long pgsize_bitmap; + struct module *owner; #ifdef CONFIG_SMMU_BYPASS_DEV #ifndef __GENKSYMS__ -- Gitee From 5cba8d90c04cc14aca6a7ca4f180b86dcd992f43 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 19 Dec 2019 12:03:43 +0000 Subject: [PATCH 0185/2161] drivers/iommu: Allow IOMMU bus ops to be unregistered commit 4312cf7f16c8d43e154bf2a6eea6d1e9347c922c upstream. 'bus_set_iommu()' allows IOMMU drivers to register their ops for a given bus type. Unfortunately, it then doesn't allow them to be removed, which is necessary for modular drivers to shut down cleanly so that they can be reloaded later on. Allow 'bus_set_iommu()' to take a NULL 'ops' argument, which clears the ops pointer for the selected bus_type.
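A short sketch of the teardown this enables for a modular driver; my_iommu_ops and the use of platform_bus_type are illustrative assumptions, not taken from this patch.

#include <linux/iommu.h>
#include <linux/platform_device.h>

static const struct iommu_ops my_iommu_ops = {
	/* ... driver callbacks ... */
};

static int __init my_iommu_init(void)
{
	return bus_set_iommu(&platform_bus_type, &my_iommu_ops);
}

static void __exit my_iommu_exit(void)
{
	/* Clear the bus ops so the module can be unloaded and later reloaded. */
	bus_set_iommu(&platform_bus_type, NULL);
}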
Signed-off-by: Will Deacon Tested-by: John Garry # smmu v3 Reviewed-by: Greg Kroah-Hartman Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 0ab7e85778c8..cacb1cf7acb1 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1572,6 +1572,11 @@ int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops) { int err; + if (ops == NULL) { + bus->iommu_ops = NULL; + return 0; + } + if (bus->iommu_ops != NULL) return -EBUSY; -- Gitee From a96f152522faf36d92458ecbd17430c390b23c2c Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 21 Nov 2019 04:19:30 +0100 Subject: [PATCH 0186/2161] iommu: Fix Kconfig indentation commit d0432345b4b57680eb441aa014fbb91d710b86ee upstream. Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Signed-off-by: Krzysztof Kozlowski Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 36ee7b44c4fd..12499d32e690 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -82,7 +82,7 @@ config IOMMU_DEBUGFS config IOMMU_DEFAULT_PASSTHROUGH bool "IOMMU passthrough by default" depends on IOMMU_API - help + help Enable passthrough by default, removing the need to pass in iommu.passthrough=on or iommu=pt through command line. If this is enabled, you can still disable with iommu.passthrough=off @@ -91,8 +91,8 @@ config IOMMU_DEFAULT_PASSTHROUGH If unsure, say N here. config OF_IOMMU - def_bool y - depends on OF && IOMMU_API + def_bool y + depends on OF && IOMMU_API # IOMMU-agnostic DMA-mapping layer config IOMMU_DMA -- Gitee From bf75e9c8b5e92f47c120f81aa0a60ed9ad67058e Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 18 Dec 2019 14:42:01 +0100 Subject: [PATCH 0187/2161] iommu: Implement generic_iommu_put_resv_regions() commit f9f6971ebb75f5bc302d77e3380dd6363cc1a0f6 upstream. Implement a generic function for removing reserved regions. This can be used by drivers that don't do anything fancy with these regions other than allocating memory for them. Signed-off-by: Thierry Reding Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 19 +++++++++++++++++++ include/linux/iommu.h | 2 ++ 2 files changed, 21 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index cacb1cf7acb1..fd9bc4aa515e 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2270,6 +2270,25 @@ void iommu_put_resv_regions(struct device *dev, struct list_head *list) ops->put_resv_regions(dev, list); } +/** + * generic_iommu_put_resv_regions - Reserved region driver helper + * @dev: device for which to free reserved regions + * @list: reserved region list for device + * + * IOMMU drivers can use this to implement their .put_resv_regions() callback + * for simple reservations. Memory allocated for each reserved region will be + * freed. If an IOMMU driver allocates additional resources per region, it is + * going to have to implement a custom callback. 
+ */ +void generic_iommu_put_resv_regions(struct device *dev, struct list_head *list) +{ + struct iommu_resv_region *entry, *next; + + list_for_each_entry_safe(entry, next, list, list) + kfree(entry); +} +EXPORT_SYMBOL(generic_iommu_put_resv_regions); + struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start, size_t length, int prot, enum iommu_resv_type type) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 15837ee3e27c..010b88dad109 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -464,6 +464,8 @@ extern void iommu_set_fault_handler(struct iommu_domain *domain, extern void iommu_get_resv_regions(struct device *dev, struct list_head *list); extern void iommu_put_resv_regions(struct device *dev, struct list_head *list); +extern void generic_iommu_put_resv_regions(struct device *dev, + struct list_head *list); extern int iommu_request_dm_for_dev(struct device *dev); extern int iommu_request_dma_domain_for_dev(struct device *dev); extern void iommu_set_default_passthrough(bool cmd_line); -- Gitee From 427985d2059fa73909cb7d91f5484904cca5cf2c Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 18 Dec 2019 14:42:04 +0100 Subject: [PATCH 0188/2161] iommu: intel: Use generic_iommu_put_resv_regions() commit 0ecdebb7dad586f15c3a5a98bb0cf431ca2fc1da upstream. Use the new standard function instead of open-coding it. Cc: David Woodhouse Signed-off-by: Thierry Reding Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index aa6f37b2c92e..4d2e1dc6ac37 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5765,15 +5765,6 @@ static void intel_iommu_get_resv_regions(struct device *device, list_add_tail(®->list, head); } -static void intel_iommu_put_resv_regions(struct device *dev, - struct list_head *head) -{ - struct iommu_resv_region *entry, *next; - - list_for_each_entry_safe(entry, next, head, list) - kfree(entry); -} - int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) { struct device_domain_info *info; @@ -6032,7 +6023,7 @@ const struct iommu_ops intel_iommu_ops = { .add_device = intel_iommu_add_device, .remove_device = intel_iommu_remove_device, .get_resv_regions = intel_iommu_get_resv_regions, - .put_resv_regions = intel_iommu_put_resv_regions, + .put_resv_regions = generic_iommu_put_resv_regions, .apply_resv_region = intel_iommu_apply_resv_region, .device_group = intel_iommu_device_group, .dev_has_feat = intel_iommu_dev_has_feat, -- Gitee From 8d85510bdaa43ac5dcab94764f253777eb2a7863 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 2 Jan 2020 08:18:02 +0800 Subject: [PATCH 0189/2161] iommu/vt-d: Add Kconfig option to enable/disable scalable mode commit 046182525db611964da0db113dde9d3a2969085c upstream. This adds Kconfig option INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON to make it easier for distributions to enable or disable the Intel IOMMU scalable mode by default during kernel build. 
Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/Kconfig | 12 ++++++++++++ drivers/iommu/intel-iommu.c | 7 ++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 12499d32e690..7186deee16eb 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -247,6 +247,18 @@ config INTEL_IOMMU_FLOPPY_WA workaround will setup a 1:1 mapping for the first 16MiB to make floppy (an ISA device) work. +config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON + bool "Enable Intel IOMMU scalable mode by default" + depends on INTEL_IOMMU + help + Selecting this option will enable by default the scalable mode if + hardware presents the capability. The scalable mode is defined in + VT-d 3.0. The scalable mode capability could be checked by reading + /sys/devices/virtual/iommu/dmar*/intel-iommu/ecap. If this option + is not selected, scalable mode support could also be enabled by + passing intel_iommu=sm_on to the kernel. If not sure, please use + the default value. + config IRQ_REMAP bool "Support for Interrupt Remapping" depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 4d2e1dc6ac37..fca6834fd736 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -355,9 +355,14 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, int dmar_disabled = 0; #else int dmar_disabled = 1; -#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/ +#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */ +#ifdef INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON +int intel_iommu_sm = 1; +#else int intel_iommu_sm; +#endif /* INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */ + int intel_iommu_enabled = 0; EXPORT_SYMBOL_GPL(intel_iommu_enabled); -- Gitee From 4ed4239ac43d1e6bef47a2834e98c30e7093ee11 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 2 Jan 2020 08:18:03 +0800 Subject: [PATCH 0190/2161] iommu/vt-d: Fix CPU and IOMMU SVM feature matching checks commit ff3dc6521f78132eaaf62a842c3ece9060dcde26 upstream. Shared Virtual Memory(SVM) is based on a collective set of hardware features detected at runtime. There are requirements for matching CPU and IOMMU capabilities. The current code checks CPU and IOMMU feature set for SVM support but the result is never stored nor used. Therefore, SVM can still be used even when these checks failed. The consequences can be: 1. CPU uses 5-level paging mode for virtual address of 57 bits, but IOMMU can only support 4-level paging mode with 48 bits address for DMA. 2. 1GB page size is used by CPU but IOMMU does not support it. VT-d unrecoverable faults may be generated. The best solution to fix these problems is to prevent them in the first place. This patch consolidates code for checking PASID, CPU vs. IOMMU paging mode compatibility, as well as provides specific error messages for each failed checks. On sane hardware configurations, these error message shall never appear in kernel log. 
Signed-off-by: Jacob Pan Reviewed-by: Eric Auger Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 11 ++-------- drivers/iommu/intel-svm.c | 40 +++++++++++++++++++++++++------------ include/linux/intel-iommu.h | 5 ++++- 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index fca6834fd736..4d8359712508 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -3300,11 +3300,7 @@ static int __init init_dmars(void) pr_info("Disable batched IOTLB flush due to virtualization"); intel_iommu_strict = 1; } - -#ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_supported(iommu)) - intel_svm_init(iommu); -#endif + intel_svm_check(iommu); } /* @@ -4500,10 +4496,7 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) if (ret) goto out; -#ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_supported(iommu)) - intel_svm_init(iommu); -#endif + intel_svm_check(iommu); if (dmaru->ignored) { /* diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index a3739f626629..6323b287da3a 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -23,19 +23,6 @@ static irqreturn_t prq_event_thread(int irq, void *d); -int intel_svm_init(struct intel_iommu *iommu) -{ - if (cpu_feature_enabled(X86_FEATURE_GBPAGES) && - !cap_fl1gp_support(iommu->cap)) - return -EINVAL; - - if (cpu_feature_enabled(X86_FEATURE_LA57) && - !cap_5lp_support(iommu->cap)) - return -EINVAL; - - return 0; -} - #define PRQ_ORDER 0 int intel_svm_enable_prq(struct intel_iommu *iommu) @@ -153,6 +140,33 @@ static void __flush_svm_range_dev(struct intel_svm *svm, } } +static inline bool intel_svm_capable(struct intel_iommu *iommu) +{ + return iommu->flags & VTD_FLAG_SVM_CAPABLE; +} + +void intel_svm_check(struct intel_iommu *iommu) +{ + if (!pasid_supported(iommu)) + return; + + if (cpu_feature_enabled(X86_FEATURE_GBPAGES) && + !cap_fl1gp_support(iommu->cap)) { + pr_err("%s SVM disabled, incompatible 1GB page capability\n", + iommu->name); + return; + } + + if (cpu_feature_enabled(X86_FEATURE_LA57) && + !cap_5lp_support(iommu->cap)) { + pr_err("%s SVM disabled, incompatible paging mode\n", + iommu->name); + return; + } + + iommu->flags |= VTD_FLAG_SVM_CAPABLE; +} + static void intel_flush_svm_range_dev(struct intel_svm *svm, struct intel_svm_dev *sdev, unsigned long address, diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 88ac8edf44e3..370520ef3659 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -437,6 +437,7 @@ enum { #define VTD_FLAG_TRANS_PRE_ENABLED (1 << 0) #define VTD_FLAG_IRQ_REMAP_PRE_ENABLED (1 << 1) +#define VTD_FLAG_SVM_CAPABLE (1 << 2) extern int intel_iommu_sm; @@ -662,7 +663,7 @@ void iommu_flush_write_buffer(struct intel_iommu *iommu); int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev); #ifdef CONFIG_INTEL_IOMMU_SVM -int intel_svm_init(struct intel_iommu *iommu); +extern void intel_svm_check(struct intel_iommu *iommu); extern int intel_svm_enable_prq(struct intel_iommu *iommu); extern int intel_svm_finish_prq(struct intel_iommu *iommu); @@ -690,6 +691,8 @@ struct intel_svm { }; extern struct intel_iommu *intel_svm_device_to_iommu(struct device *dev); +#else +static inline void intel_svm_check(struct intel_iommu *iommu) {} #endif #ifdef CONFIG_INTEL_IOMMU_DEBUGFS -- Gitee From 5f4d9b2d94147a68300d320eb40ec519b0e4b32b Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: 
Thu, 2 Jan 2020 08:18:05 +0800 Subject: [PATCH 0191/2161] iommu/vt-d: Reject SVM bind for failed capability check commit 6eba09a4b5d583ad18a2a1fd1b82aee8cc3ac542 upstream. Add a check during SVM bind to ensure CPU and IOMMU hardware capabilities are met. Signed-off-by: Jacob Pan Reviewed-by: Eric Auger Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-svm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 6323b287da3a..bebf3cb3d3dd 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -252,6 +252,9 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ if (!iommu || dmar_disabled) return -EINVAL; + if (!intel_svm_capable(iommu)) + return -ENOTSUPP; + if (dev_is_pci(dev)) { pasid_max = pci_max_pasids(to_pci_dev(dev)); if (pasid_max < 0) -- Gitee From 71a1fc96baccd327e1dfe4063e6cee23818dadbf Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 2 Jan 2020 08:18:06 +0800 Subject: [PATCH 0192/2161] iommu/vt-d: Avoid duplicated code for PASID setup commit d62efd4fa62be50fa1ff4433a83b14c51d320a27 upstream. After each setup for PASID entry, related translation caches must be flushed. We can combine duplicated code into one function which is less error prone. Signed-off-by: Jacob Pan Reviewed-by: Eric Auger Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-pasid.c | 48 ++++++++++++++----------------------- 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index e7cb0b8a7332..732bfee228df 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -465,6 +465,21 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, devtlb_invalidation_with_pasid(iommu, dev, pasid); } +static void pasid_flush_caches(struct intel_iommu *iommu, + struct pasid_entry *pte, + int pasid, u16 did) +{ + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + if (cap_caching_mode(iommu->cap)) { + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + iotlb_invalidation_with_pasid(iommu, did, pasid); + } else { + iommu_flush_write_buffer(iommu); + } +} + /* * Set up the scalable mode pasid table entry for first only * translation type. 
@@ -518,16 +533,7 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, /* Setup Present and PASID Granular Transfer Type: */ pasid_set_translation_type(pte, 1); pasid_set_present(pte); - - if (!ecap_coherent(iommu->ecap)) - clflush_cache_range(pte, sizeof(*pte)); - - if (cap_caching_mode(iommu->cap)) { - pasid_cache_invalidation_with_pasid(iommu, did, pasid); - iotlb_invalidation_with_pasid(iommu, did, pasid); - } else { - iommu_flush_write_buffer(iommu); - } + pasid_flush_caches(iommu, pte, pasid, did); return 0; } @@ -591,16 +597,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, */ pasid_set_sre(pte); pasid_set_present(pte); - - if (!ecap_coherent(iommu->ecap)) - clflush_cache_range(pte, sizeof(*pte)); - - if (cap_caching_mode(iommu->cap)) { - pasid_cache_invalidation_with_pasid(iommu, did, pasid); - iotlb_invalidation_with_pasid(iommu, did, pasid); - } else { - iommu_flush_write_buffer(iommu); - } + pasid_flush_caches(iommu, pte, pasid, did); return 0; } @@ -634,16 +631,7 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, */ pasid_set_sre(pte); pasid_set_present(pte); - - if (!ecap_coherent(iommu->ecap)) - clflush_cache_range(pte, sizeof(*pte)); - - if (cap_caching_mode(iommu->cap)) { - pasid_cache_invalidation_with_pasid(iommu, did, pasid); - iotlb_invalidation_with_pasid(iommu, did, pasid); - } else { - iommu_flush_write_buffer(iommu); - } + pasid_flush_caches(iommu, pte, pasid, did); return 0; } -- Gitee From f2711a842cc16f7b9ad6381a560e6dfdaa3b7e6e Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 2 Jan 2020 08:18:08 +0800 Subject: [PATCH 0193/2161] iommu/vt-d: Replace Intel specific PASID allocator with IOASID commit 59a623374dc38f7724457c59753c3f02fac6aed6 upstream. Make use of generic IOASID code to manage PASID allocation, free, and lookup. Replace Intel specific code. 
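The generic allocator is used through three calls that appear in the hunks below: ioasid_alloc() (returns INVALID_IOASID on failure), ioasid_find() (returns the private pointer registered at allocation time, NULL, or an ERR_PTR()), and ioasid_free(). A condensed sketch of that flow follows; the example_* wrappers and struct example_ctx are illustrative only and not part of the patch.

/* Condensed view of the IOASID allocate/lookup/free flow used below. */
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/ioasid.h>

struct example_ctx {
        ioasid_t pasid;
};

static int example_alloc(struct example_ctx *ctx, ioasid_t min, ioasid_t max)
{
        /* PASID 0 stays reserved for RID-to-PASID translation */
        ctx->pasid = ioasid_alloc(NULL, min, max, ctx);
        if (ctx->pasid == INVALID_IOASID)
                return -ENOSPC;
        return 0;
}

static struct example_ctx *example_lookup(ioasid_t pasid)
{
        struct example_ctx *ctx = ioasid_find(NULL, pasid, NULL);

        return IS_ERR_OR_NULL(ctx) ? NULL : ctx;
}

static void example_free(struct example_ctx *ctx)
{
        ioasid_free(ctx->pasid);
}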
Signed-off-by: Jacob Pan Reviewed-by: Eric Auger Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/Kconfig | 1 + drivers/iommu/intel-iommu.c | 13 +++++++------ drivers/iommu/intel-pasid.c | 36 ------------------------------------ drivers/iommu/intel-svm.c | 36 ++++++++++++++++++++++-------------- 4 files changed, 30 insertions(+), 56 deletions(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 7186deee16eb..c7f3bb83e9c1 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -213,6 +213,7 @@ config INTEL_IOMMU_SVM select PCI_PASID select PCI_PRI select MMU_NOTIFIER + select IOASID help Shared Virtual Memory (SVM) provides a facility for devices to access DMA resources through process address space by diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 4d8359712508..10c9b8361b27 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5293,7 +5293,7 @@ static void auxiliary_unlink_device(struct dmar_domain *domain, domain->auxd_refcnt--; if (!domain->auxd_refcnt && domain->default_pasid > 0) - intel_pasid_free_id(domain->default_pasid); + ioasid_free(domain->default_pasid); } static int aux_domain_add_dev(struct dmar_domain *domain, @@ -5311,10 +5311,11 @@ static int aux_domain_add_dev(struct dmar_domain *domain, if (domain->default_pasid <= 0) { int pasid; - pasid = intel_pasid_alloc_id(domain, PASID_MIN, - pci_max_pasids(to_pci_dev(dev)), - GFP_KERNEL); - if (pasid <= 0) { + /* No private data needed for the default pasid */ + pasid = ioasid_alloc(NULL, PASID_MIN, + pci_max_pasids(to_pci_dev(dev)) - 1, + NULL); + if (pasid == INVALID_IOASID) { pr_err("Can't allocate default pasid\n"); return -ENODEV; } @@ -5350,7 +5351,7 @@ static int aux_domain_add_dev(struct dmar_domain *domain, spin_unlock(&iommu->lock); spin_unlock_irqrestore(&device_domain_lock, flags); if (!domain->auxd_refcnt && domain->default_pasid > 0) - intel_pasid_free_id(domain->default_pasid); + ioasid_free(domain->default_pasid); return ret; } diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index 732bfee228df..3cb569e76642 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -26,42 +26,6 @@ */ static DEFINE_SPINLOCK(pasid_lock); u32 intel_pasid_max_id = PASID_MAX; -static DEFINE_IDR(pasid_idr); - -int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp) -{ - int ret, min, max; - - min = max_t(int, start, PASID_MIN); - max = min_t(int, end, intel_pasid_max_id); - - WARN_ON(in_interrupt()); - idr_preload(gfp); - spin_lock(&pasid_lock); - ret = idr_alloc(&pasid_idr, ptr, min, max, GFP_ATOMIC); - spin_unlock(&pasid_lock); - idr_preload_end(); - - return ret; -} - -void intel_pasid_free_id(int pasid) -{ - spin_lock(&pasid_lock); - idr_remove(&pasid_idr, pasid); - spin_unlock(&pasid_lock); -} - -void *intel_pasid_lookup_id(int pasid) -{ - void *p; - - spin_lock(&pasid_lock); - p = idr_find(&pasid_idr, pasid); - spin_unlock(&pasid_lock); - - return p; -} /* * Per device pasid table management: diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index bebf3cb3d3dd..1adb4acb3d6b 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "intel-pasid.h" @@ -349,16 +350,15 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ if (pasid_max > intel_pasid_max_id) pasid_max = intel_pasid_max_id; - /* Do not use 
PASID 0 in caching mode (virtualised IOMMU) */ - ret = intel_pasid_alloc_id(svm, - !!cap_caching_mode(iommu->cap), - pasid_max, GFP_KERNEL); - if (ret < 0) { + /* Do not use PASID 0, reserved for RID to PASID */ + svm->pasid = ioasid_alloc(NULL, PASID_MIN, + pasid_max - 1, svm); + if (svm->pasid == INVALID_IOASID) { kfree(svm); kfree(sdev); + ret = -ENOSPC; goto out; } - svm->pasid = ret; svm->notifier.ops = &intel_mmuops; svm->mm = mm; svm->flags = flags; @@ -368,7 +368,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ if (mm) { ret = mmu_notifier_register(&svm->notifier, mm); if (ret) { - intel_pasid_free_id(svm->pasid); + ioasid_free(svm->pasid); kfree(svm); kfree(sdev); goto out; @@ -384,7 +384,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ if (ret) { if (mm) mmu_notifier_unregister(&svm->notifier, mm); - intel_pasid_free_id(svm->pasid); + ioasid_free(svm->pasid); kfree(svm); kfree(sdev); goto out; @@ -432,10 +432,15 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) if (!iommu) goto out; - svm = intel_pasid_lookup_id(pasid); + svm = ioasid_find(NULL, pasid, NULL); if (!svm) goto out; + if (IS_ERR(svm)) { + ret = PTR_ERR(svm); + goto out; + } + list_for_each_entry(sdev, &svm->devs, list) { if (dev == sdev->dev) { ret = 0; @@ -454,7 +459,7 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { - intel_pasid_free_id(svm->pasid); + ioasid_free(svm->pasid); if (svm->mm) mmu_notifier_unregister(&svm->notifier, svm->mm); @@ -489,10 +494,14 @@ int intel_svm_is_pasid_valid(struct device *dev, int pasid) if (!iommu) goto out; - svm = intel_pasid_lookup_id(pasid); + svm = ioasid_find(NULL, pasid, NULL); if (!svm) goto out; + if (IS_ERR(svm)) { + ret = PTR_ERR(svm); + goto out; + } /* init_mm is used in this case */ if (!svm->mm) ret = 1; @@ -599,13 +608,12 @@ static irqreturn_t prq_event_thread(int irq, void *d) if (!svm || svm->pasid != req->pasid) { rcu_read_lock(); - svm = intel_pasid_lookup_id(req->pasid); + svm = ioasid_find(NULL, req->pasid, NULL); /* It *can't* go away, because the driver is not permitted * to unbind the mm while any page faults are outstanding. * So we only need RCU to protect the internal idr code. */ rcu_read_unlock(); - - if (!svm) { + if (IS_ERR_OR_NULL(svm)) { pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n", iommu->name, req->pasid, ((unsigned long long *)req)[0], ((unsigned long long *)req)[1]); -- Gitee From b2847279655648c7a2542a99b81f168c3a2adc74 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 2 Jan 2020 08:18:10 +0800 Subject: [PATCH 0194/2161] iommu/vt-d: Misc macro clean up for SVM commit 034d473109e90784868297870f456c9687f84058 upstream. Use combined macros for_each_svm_dev() to simplify SVM device iteration and error checking. 
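The idiom behind for_each_svm_dev() is a wrapper around list_for_each_entry() that filters entries with "if (cond) {} else", which keeps the macro safe to combine with an if/else written by the caller. A small user-space analog (all names invented) shows the shape:

#include <stdio.h>
#include <string.h>

struct dev { const char *owner; int users; };

/* Iterate an array, but only visit entries owned by "who". */
#define for_each_owned_dev(d, devs, n, who)                     \
        for ((d) = (devs); (d) < (devs) + (n); (d)++)           \
                if (strcmp((d)->owner, (who)) != 0) {} else

int main(void)
{
        struct dev devs[] = { { "svm0", 1 }, { "svm1", 2 }, { "svm0", 3 } };
        struct dev *d;

        for_each_owned_dev(d, devs, 3, "svm0")
                printf("owner=%s users=%d\n", d->owner, d->users);
        return 0;
}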
Suggested-by: Andy Shevchenko Signed-off-by: Jacob Pan Reviewed-by: Eric Auger Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-svm.c | 79 +++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 1adb4acb3d6b..ccfa03b2cfb8 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -240,6 +240,10 @@ static const struct mmu_notifier_ops intel_mmuops = { static DEFINE_MUTEX(pasid_mutex); static LIST_HEAD(global_svm_list); +#define for_each_svm_dev(sdev, svm, d) \ + list_for_each_entry((sdev), &(svm)->devs, list) \ + if ((d) != (sdev)->dev) {} else + int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops) { struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); @@ -288,15 +292,14 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ goto out; } - list_for_each_entry(sdev, &svm->devs, list) { - if (dev == sdev->dev) { - if (sdev->ops != ops) { - ret = -EBUSY; - goto out; - } - sdev->users++; - goto success; + /* Find the matching device in svm list */ + for_each_svm_dev(sdev, svm, dev) { + if (sdev->ops != ops) { + ret = -EBUSY; + goto out; } + sdev->users++; + goto success; } break; @@ -441,40 +444,36 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) goto out; } - list_for_each_entry(sdev, &svm->devs, list) { - if (dev == sdev->dev) { - ret = 0; - sdev->users--; - if (!sdev->users) { - list_del_rcu(&sdev->list); - /* Flush the PASID cache and IOTLB for this device. - * Note that we do depend on the hardware *not* using - * the PASID any more. Just as we depend on other - * devices never using PASIDs that they have no right - * to use. We have a *shared* PASID table, because it's - * large and has to be physically contiguous. So it's - * hard to be as defensive as we might like. */ - intel_pasid_tear_down_entry(iommu, dev, svm->pasid); - intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); - kfree_rcu(sdev, rcu); - - if (list_empty(&svm->devs)) { - ioasid_free(svm->pasid); - if (svm->mm) - mmu_notifier_unregister(&svm->notifier, svm->mm); - - list_del(&svm->list); - - /* We mandate that no page faults may be outstanding - * for the PASID when intel_svm_unbind_mm() is called. - * If that is not obeyed, subtle errors will happen. - * Let's make them less subtle... */ - memset(svm, 0x6b, sizeof(*svm)); - kfree(svm); - } + for_each_svm_dev(sdev, svm, dev) { + ret = 0; + sdev->users--; + if (!sdev->users) { + list_del_rcu(&sdev->list); + /* Flush the PASID cache and IOTLB for this device. + * Note that we do depend on the hardware *not* using + * the PASID any more. Just as we depend on other + * devices never using PASIDs that they have no right + * to use. We have a *shared* PASID table, because it's + * large and has to be physically contiguous. So it's + * hard to be as defensive as we might like. */ + intel_pasid_tear_down_entry(iommu, dev, svm->pasid); + intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); + kfree_rcu(sdev, rcu); + + if (list_empty(&svm->devs)) { + ioasid_free(svm->pasid); + if (svm->mm) + mmu_notifier_unregister(&svm->notifier, svm->mm); + list_del(&svm->list); + /* We mandate that no page faults may be outstanding + * for the PASID when intel_svm_unbind_mm() is called. + * If that is not obeyed, subtle errors will happen. + * Let's make them less subtle... 
*/ + memset(svm, 0x6b, sizeof(*svm)); + kfree(svm); } - break; } + break; } out: mutex_unlock(&pasid_mutex); -- Gitee From d63c6af38eb2ff5914b2e82c1e2940529a1b3606 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 2 Jan 2020 08:18:11 +0800 Subject: [PATCH 0195/2161] iommu/vt-d: trace: Extend map_sg trace event commit 984d03adc9bdfb1bea408f9cc41a67214ccdffd3 upstream. Current map_sg stores trace message in a coarse manner. This extends it so that more detailed messages could be traced. The map_sg trace message looks like: map_sg: dev=0000:00:17.0 [1/9] dev_addr=0xf8f90000 phys_addr=0x158051000 size=4096 map_sg: dev=0000:00:17.0 [2/9] dev_addr=0xf8f91000 phys_addr=0x15a858000 size=4096 map_sg: dev=0000:00:17.0 [3/9] dev_addr=0xf8f92000 phys_addr=0x15aa13000 size=4096 map_sg: dev=0000:00:17.0 [4/9] dev_addr=0xf8f93000 phys_addr=0x1570f1000 size=8192 map_sg: dev=0000:00:17.0 [5/9] dev_addr=0xf8f95000 phys_addr=0x15c6d0000 size=4096 map_sg: dev=0000:00:17.0 [6/9] dev_addr=0xf8f96000 phys_addr=0x157194000 size=4096 map_sg: dev=0000:00:17.0 [7/9] dev_addr=0xf8f97000 phys_addr=0x169552000 size=4096 map_sg: dev=0000:00:17.0 [8/9] dev_addr=0xf8f98000 phys_addr=0x169dde000 size=4096 map_sg: dev=0000:00:17.0 [9/9] dev_addr=0xf8f99000 phys_addr=0x148351000 size=4096 Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 7 +++-- include/trace/events/intel_iommu.h | 48 ++++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 10c9b8361b27..94f562ab6c96 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -3778,8 +3778,8 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele return 0; } - trace_map_sg(dev, iova_pfn << PAGE_SHIFT, - sg_phys(sglist), size << VTD_PAGE_SHIFT); + for_each_sg(sglist, sg, nelems, i) + trace_map_sg(dev, i + 1, nelems, sg); return nelems; } @@ -3991,6 +3991,9 @@ bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, sg_dma_len(sg) = sg->length; } + for_each_sg(sglist, sg, nelems, i) + trace_bounce_map_sg(dev, i + 1, nelems, sg); + return nelems; out_unmap: diff --git a/include/trace/events/intel_iommu.h b/include/trace/events/intel_iommu.h index 54e61d456cdf..112bd06487bf 100644 --- a/include/trace/events/intel_iommu.h +++ b/include/trace/events/intel_iommu.h @@ -49,12 +49,6 @@ DEFINE_EVENT(dma_map, map_single, TP_ARGS(dev, dev_addr, phys_addr, size) ); -DEFINE_EVENT(dma_map, map_sg, - TP_PROTO(struct device *dev, dma_addr_t dev_addr, phys_addr_t phys_addr, - size_t size), - TP_ARGS(dev, dev_addr, phys_addr, size) -); - DEFINE_EVENT(dma_map, bounce_map_single, TP_PROTO(struct device *dev, dma_addr_t dev_addr, phys_addr_t phys_addr, size_t size), @@ -99,6 +93,48 @@ DEFINE_EVENT(dma_unmap, bounce_unmap_single, TP_ARGS(dev, dev_addr, size) ); +DECLARE_EVENT_CLASS(dma_map_sg, + TP_PROTO(struct device *dev, int index, int total, + struct scatterlist *sg), + + TP_ARGS(dev, index, total, sg), + + TP_STRUCT__entry( + __string(dev_name, dev_name(dev)) + __field(dma_addr_t, dev_addr) + __field(phys_addr_t, phys_addr) + __field(size_t, size) + __field(int, index) + __field(int, total) + ), + + TP_fast_assign( + __assign_str(dev_name, dev_name(dev)); + __entry->dev_addr = sg->dma_address; + __entry->phys_addr = sg_phys(sg); + __entry->size = sg->dma_length; + __entry->index = index; + __entry->total = total; + ), + + TP_printk("dev=%s 
[%d/%d] dev_addr=0x%llx phys_addr=0x%llx size=%zu", + __get_str(dev_name), __entry->index, __entry->total, + (unsigned long long)__entry->dev_addr, + (unsigned long long)__entry->phys_addr, + __entry->size) +); + +DEFINE_EVENT(dma_map_sg, map_sg, + TP_PROTO(struct device *dev, int index, int total, + struct scatterlist *sg), + TP_ARGS(dev, index, total, sg) +); + +DEFINE_EVENT(dma_map_sg, bounce_map_sg, + TP_PROTO(struct device *dev, int index, int total, + struct scatterlist *sg), + TP_ARGS(dev, index, total, sg) +); #endif /* _TRACE_INTEL_IOMMU_H */ /* This part must be outside protection */ -- Gitee From d865213bb13e254b0fab2dd6bdc68d71bcde031c Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 2 Jan 2020 08:18:12 +0800 Subject: [PATCH 0196/2161] iommu/vt-d: Avoid iova flush queue in strict mode commit 10f8008f0f884c3f8b6c8ea8f0f84540beda86be upstream. If Intel IOMMU strict mode is enabled by users, it's unnecessary to create the iova flush queue. Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 94f562ab6c96..852a3cd68188 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1864,10 +1864,12 @@ static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu, init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN); - err = init_iova_flush_queue(&domain->iovad, - iommu_flush_iova, iova_entry_free); - if (err) - return err; + if (!intel_iommu_strict) { + err = init_iova_flush_queue(&domain->iovad, + iommu_flush_iova, iova_entry_free); + if (err) + return err; + } domain_reserve_special_ranges(domain); @@ -5210,6 +5212,7 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) { struct dmar_domain *dmar_domain; struct iommu_domain *domain; + int ret; switch (type) { case IOMMU_DOMAIN_DMA: @@ -5226,11 +5229,14 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) return NULL; } - if (type == IOMMU_DOMAIN_DMA && - init_iova_flush_queue(&dmar_domain->iovad, - iommu_flush_iova, iova_entry_free)) { - pr_warn("iova flush queue initialization failed\n"); - intel_iommu_strict = 1; + if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) { + ret = init_iova_flush_queue(&dmar_domain->iovad, + iommu_flush_iova, + iova_entry_free); + if (ret) { + pr_warn("iova flush queue initialization failed\n"); + intel_iommu_strict = 1; + } } domain_update_iommu_cap(dmar_domain); -- Gitee From 4784ed082728369cfe00cf6c3d3993b2646399bc Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 2 Jan 2020 08:18:13 +0800 Subject: [PATCH 0197/2161] iommu/vt-d: Loose requirement for flush queue initializaton commit 8e3391cfdcd40eb4034014f8e6f8e9cbeb5335e7 upstream. Currently if flush queue initialization fails, we return error or enforce the system-wide strict mode. These are unnecessary because we always check the existence of a flush queue before queuing any iova's for lazy flushing. Printing a informational message is enough. 
Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 852a3cd68188..ef24e3bc9eac 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1860,15 +1860,15 @@ static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu, { int adjust_width, agaw; unsigned long sagaw; - int err; + int ret; init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN); if (!intel_iommu_strict) { - err = init_iova_flush_queue(&domain->iovad, + ret = init_iova_flush_queue(&domain->iovad, iommu_flush_iova, iova_entry_free); - if (err) - return err; + if (ret) + pr_info("iova flush queue initialization failed\n"); } domain_reserve_special_ranges(domain); @@ -5233,10 +5233,8 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) ret = init_iova_flush_queue(&dmar_domain->iovad, iommu_flush_iova, iova_entry_free); - if (ret) { - pr_warn("iova flush queue initialization failed\n"); - intel_iommu_strict = 1; - } + if (ret) + pr_info("iova flush queue initialization failed\n"); } domain_update_iommu_cap(dmar_domain); -- Gitee From 81e62c9572a195cd679b5d590bdaaf80f378a704 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 2 Jan 2020 08:18:14 +0800 Subject: [PATCH 0198/2161] iommu/vt-d: Identify domains using first level page table commit a1948f2e0a9d7ba6b9af883c673986752d9da52d upstream. This checks whether a domain should use the first level page table for map/unmap and marks it in the domain structure. Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 39 +++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index ef24e3bc9eac..93ff8564adae 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -307,6 +307,14 @@ static int hw_pass_through = 1; */ #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1) +/* + * When VT-d works in the scalable mode, it allows DMA translation to + * happen through either first level or second level page table. This + * bit marks that the DMA translation for the domain goes through the + * first level page table, otherwise, it goes through the second level. + */ +#define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(2) + #define for_each_domain_iommu(idx, domain) \ for (idx = 0; idx < g_num_of_iommus; idx++) \ if (domain->iommu_refcnt[idx]) @@ -1720,6 +1728,35 @@ static void free_dmar_iommu(struct intel_iommu *iommu) #endif } +/* + * Check and return whether first level is used by default for + * DMA translation. Currently, we make it off by setting + * first_level_support = 0, and will change it to -1 after all + * map/unmap paths support first level page table. 
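The policy implemented below is conservative: first-level translation is used only when every active IOMMU supports scalable mode with first-level translation, so a single legacy unit keeps the whole system on second-level tables. A minimal user-space sketch of that all-or-nothing probe (illustrative names only, not the kernel code):

#include <stdbool.h>
#include <stdio.h>

struct unit { bool scalable_mode; bool first_level_ts; };

static bool all_support_first_level(const struct unit *u, int n)
{
        for (int i = 0; i < n; i++)
                if (!u[i].scalable_mode || !u[i].first_level_ts)
                        return false;   /* one legacy unit disables it */
        return true;
}

int main(void)
{
        struct unit units[] = { { true, true }, { true, false } };

        printf("use first level: %d\n", all_support_first_level(units, 2));
        return 0;
}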
+ */ +static bool first_level_by_default(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + static int first_level_support = 0; + + if (likely(first_level_support != -1)) + return first_level_support; + + first_level_support = 1; + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) { + first_level_support = 0; + break; + } + } + rcu_read_unlock(); + + return first_level_support; +} + static struct dmar_domain *alloc_domain(int flags) { struct dmar_domain *domain; @@ -1731,6 +1768,8 @@ static struct dmar_domain *alloc_domain(int flags) memset(domain, 0, sizeof(*domain)); domain->nid = NUMA_NO_NODE; domain->flags = flags; + if (first_level_by_default()) + domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; domain->has_iotlb_device = false; INIT_LIST_HEAD(&domain->devices); -- Gitee From bfceda8709e8d32fce01d855775992fb312d2add Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 2 Jan 2020 08:18:15 +0800 Subject: [PATCH 0199/2161] iommu/vt-d: Add set domain DOMAIN_ATTR_NESTING attr commit 2cd1311a26673d45ffa8b7c8f46a8c7023601491 upstream. This adds the Intel VT-d specific callback of setting DOMAIN_ATTR_NESTING domain attribution. It is necessary to let the VT-d driver know that the domain represents a virtual machine which requires the IOMMU hardware to support nested translation mode. Return success if the IOMMU hardware suports nested mode, otherwise failure. Signed-off-by: Yi Sun Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 56 +++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 93ff8564adae..4bf441960138 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -315,6 +315,12 @@ static int hw_pass_through = 1; */ #define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(2) +/* + * Domain represents a virtual machine which demands iommu nested + * translation mode support. 
+ */ +#define DOMAIN_FLAG_NESTING_MODE BIT(3) + #define for_each_domain_iommu(idx, domain) \ for (idx = 0; idx < g_num_of_iommus; idx++) \ if (domain->iommu_refcnt[idx]) @@ -5653,6 +5659,24 @@ static inline bool iommu_pasid_support(void) return ret; } +static inline bool nested_mode_support(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + bool ret = true; + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) { + ret = false; + break; + } + } + rcu_read_unlock(); + + return ret; +} + static bool intel_iommu_capable(enum iommu_cap cap) { if (cap == IOMMU_CAP_CACHE_COHERENCY) @@ -6053,10 +6077,42 @@ static bool risky_device(struct pci_dev *pdev) return false; } +static int +intel_iommu_domain_set_attr(struct iommu_domain *domain, + enum iommu_attr attr, void *data) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + unsigned long flags; + int ret = 0; + + if (domain->type != IOMMU_DOMAIN_UNMANAGED) + return -EINVAL; + + switch (attr) { + case DOMAIN_ATTR_NESTING: + spin_lock_irqsave(&device_domain_lock, flags); + if (nested_mode_support() && + list_empty(&dmar_domain->devices)) { + dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; + dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; + } else { + ret = -ENODEV; + } + spin_unlock_irqrestore(&device_domain_lock, flags); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .domain_alloc = intel_iommu_domain_alloc, .domain_free = intel_iommu_domain_free, + .domain_set_attr = intel_iommu_domain_set_attr, .attach_dev = intel_iommu_attach_device, .detach_dev = intel_iommu_detach_device, .aux_attach_dev = intel_iommu_aux_attach_device, -- Gitee From c0749e4cf7554739797fb7ac24844876df3abd90 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 2 Jan 2020 08:18:16 +0800 Subject: [PATCH 0200/2161] iommu/vt-d: Add PASID_FLAG_FL5LP for first-level pasid setup commit 87208f22a4d942ce880e7bf092158eecd6ffa293 upstream. Current intel_pasid_setup_first_level() use 5-level paging for first level translation if CPUs use 5-level paging mode too. This makes sense for SVA usages since the page table is shared between CPUs and IOMMUs. But it makes no sense if we only want to use first level for IOVA translation. Add PASID_FLAG_FL5LP bit in the flags which indicates whether the 5-level paging mode should be used. 
Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-pasid.c | 7 ++----- drivers/iommu/intel-pasid.h | 6 ++++++ drivers/iommu/intel-svm.c | 8 ++++++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index 3cb569e76642..22b30f10b396 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -477,18 +477,15 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, pasid_set_sre(pte); } -#ifdef CONFIG_X86 - /* Both CPU and IOMMU paging mode need to match */ - if (cpu_feature_enabled(X86_FEATURE_LA57)) { + if (flags & PASID_FLAG_FL5LP) { if (cap_5lp_support(iommu->cap)) { pasid_set_flpm(pte, 1); } else { - pr_err("VT-d has no 5-level paging support for CPU\n"); + pr_err("No 5-level paging support for first-level\n"); pasid_clear_entry(pte); return -EINVAL; } } -#endif /* CONFIG_X86 */ pasid_set_domain_id(pte, did); pasid_set_address_width(pte, iommu->agaw); diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h index fc8cd8f17de1..92de6df24ccb 100644 --- a/drivers/iommu/intel-pasid.h +++ b/drivers/iommu/intel-pasid.h @@ -37,6 +37,12 @@ */ #define PASID_FLAG_SUPERVISOR_MODE BIT(0) +/* + * The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first- + * level translation, otherwise, 4-level paging will be used. + */ +#define PASID_FLAG_FL5LP BIT(1) + struct pasid_dir_entry { u64 val; }; diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index ccfa03b2cfb8..517ed2cd5eaa 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -382,7 +382,9 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ ret = intel_pasid_setup_first_level(iommu, dev, mm ? mm->pgd : init_mm.pgd, svm->pasid, FLPT_DEFAULT_DID, - mm ? 0 : PASID_FLAG_SUPERVISOR_MODE); + (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) | + (cpu_feature_enabled(X86_FEATURE_LA57) ? + PASID_FLAG_FL5LP : 0)); spin_unlock(&iommu->lock); if (ret) { if (mm) @@ -403,7 +405,9 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ ret = intel_pasid_setup_first_level(iommu, dev, mm ? mm->pgd : init_mm.pgd, svm->pasid, FLPT_DEFAULT_DID, - mm ? 0 : PASID_FLAG_SUPERVISOR_MODE); + (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) | + (cpu_feature_enabled(X86_FEATURE_LA57) ? + PASID_FLAG_FL5LP : 0)); spin_unlock(&iommu->lock); if (ret) { kfree(sdev); -- Gitee From 5d14e69a226dcc94db4e07393af30ecdb87f65dd Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 2 Jan 2020 08:18:17 +0800 Subject: [PATCH 0201/2161] iommu/vt-d: Setup pasid entries for iova over first level commit ddf09b6d43ece8e4d5591e4957e89c4fe7714792 upstream. Intel VT-d in scalable mode supports two types of page tables for IOVA translation: first level and second level. The IOMMU driver can choose one from both for IOVA translation according to the use case. This sets up the pasid entry if a domain is selected to use the first-level page table for iova translation. 
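One non-obvious step in domain_setup_first_level() below is skipping top page-table levels when the IOMMU supports a smaller address width than the domain's table provides. The rough idea, as a user-space analog with invented types (a sketch of the concept, not the kernel walk):

#include <stdio.h>

struct node { struct node *down; };

static struct node *skip_top_levels(struct node *root, int sw_levels,
                                    int hw_levels)
{
        while (sw_levels > hw_levels) {
                if (!root->down)
                        return NULL;    /* top directory not populated yet */
                root = root->down;      /* descend one level */
                sw_levels--;
        }
        return root;    /* root of a tree the hardware can walk */
}

int main(void)
{
        struct node l3 = { 0 }, l4 = { &l3 }, l5 = { &l4 };

        /* 5-level software table, 4-level-capable hardware */
        printf("%p -> %p\n", (void *)&l5, (void *)skip_top_levels(&l5, 5, 4));
        return 0;
}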
Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 57 +++++++++++++++++++++++++++++++++---- include/linux/intel-iommu.h | 16 +++++++---- 2 files changed, 62 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 4bf441960138..813256e694fb 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -571,6 +571,11 @@ static inline int domain_type_is_si(struct dmar_domain *domain) return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY; } +static inline bool domain_use_first_level(struct dmar_domain *domain) +{ + return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL; +} + static inline int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) { @@ -938,6 +943,8 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; + if (domain_use_first_level(domain)) + pteval |= DMA_FL_PTE_XD; if (cmpxchg64(&pte->val, 0ULL, pteval)) /* Someone else set it while we were thinking; use theirs. */ free_pgtable_page(tmp_page); @@ -2288,17 +2295,20 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, unsigned long sg_res = 0; unsigned int largepage_lvl = 0; unsigned long lvl_pages = 0; + u64 attr; BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) return -EINVAL; - prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP; + attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); + if (domain_use_first_level(domain)) + attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD; if (!sg) { sg_res = nr_pages; - pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot; + pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; } while (nr_pages > 0) { @@ -2310,7 +2320,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, sg_res = aligned_nrpages(sg->offset, sg->length); sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff; sg->dma_length = sg->length; - pteval = (sg_phys(sg) - pgoff) | prot; + pteval = (sg_phys(sg) - pgoff) | attr; phys_pfn = pteval >> VTD_PAGE_SHIFT; } @@ -2522,6 +2532,36 @@ dmar_search_domain_by_dev_info(int segment, int bus, int devfn) return NULL; } +static int domain_setup_first_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, + int pasid) +{ + int flags = PASID_FLAG_SUPERVISOR_MODE; + struct dma_pte *pgd = domain->pgd; + int agaw, level; + + /* + * Skip top levels of page tables for iommu which has + * less agaw than default. Unnecessary for PT mode. + */ + for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { + pgd = phys_to_virt(dma_pte_addr(pgd)); + if (!dma_pte_present(pgd)) + return -ENOMEM; + } + + level = agaw_to_level(agaw); + if (level != 4 && level != 5) + return -EINVAL; + + flags |= (level == 5) ? 
PASID_FLAG_FL5LP : 0; + + return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, + domain->iommu_did[iommu->seq_id], + flags); +} + static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, int bus, int devfn, struct device *dev, @@ -2621,6 +2661,9 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, if (hw_pass_through && domain_type_is_si(domain)) ret = intel_pasid_setup_pass_through(iommu, domain, dev, PASID_RID2PASID); + else if (domain_use_first_level(domain)) + ret = domain_setup_first_level(iommu, domain, dev, + PASID_RID2PASID); else ret = intel_pasid_setup_second_level(iommu, domain, dev, PASID_RID2PASID); @@ -5385,8 +5428,12 @@ static int aux_domain_add_dev(struct dmar_domain *domain, goto attach_failed; /* Setup the PASID entry for mediated devices: */ - ret = intel_pasid_setup_second_level(iommu, domain, dev, - domain->default_pasid); + if (domain_use_first_level(domain)) + ret = domain_setup_first_level(iommu, domain, dev, + domain->default_pasid); + else + ret = intel_pasid_setup_second_level(iommu, domain, dev, + domain->default_pasid); if (ret) goto table_failed; spin_unlock(&iommu->lock); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 370520ef3659..b81f6467108c 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -34,10 +34,13 @@ #define VTD_STRIDE_SHIFT (9) #define VTD_STRIDE_MASK (((u64)-1) << VTD_STRIDE_SHIFT) -#define DMA_PTE_READ (1) -#define DMA_PTE_WRITE (2) -#define DMA_PTE_LARGE_PAGE (1 << 7) -#define DMA_PTE_SNP (1 << 11) +#define DMA_PTE_READ BIT_ULL(0) +#define DMA_PTE_WRITE BIT_ULL(1) +#define DMA_PTE_LARGE_PAGE BIT_ULL(7) +#define DMA_PTE_SNP BIT_ULL(11) + +#define DMA_FL_PTE_PRESENT BIT_ULL(0) +#define DMA_FL_PTE_XD BIT_ULL(63) #define CONTEXT_TT_MULTI_LEVEL 0 #define CONTEXT_TT_DEV_IOTLB 1 @@ -614,10 +617,11 @@ static inline void dma_clear_pte(struct dma_pte *pte) static inline u64 dma_pte_addr(struct dma_pte *pte) { #ifdef CONFIG_64BIT - return pte->val & VTD_PAGE_MASK; + return pte->val & VTD_PAGE_MASK & (~DMA_FL_PTE_XD); #else /* Must have a full atomic 64-bit read */ - return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK; + return __cmpxchg64(&pte->val, 0ULL, 0ULL) & + VTD_PAGE_MASK & (~DMA_FL_PTE_XD); #endif } -- Gitee From 91f0f8e9d5d5b659691bd65fcc397cca97c034b5 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 2 Jan 2020 08:18:18 +0800 Subject: [PATCH 0202/2161] iommu/vt-d: Flush PASID-based iotlb for iova over first level commit 33cd6e642d6a76c1d338ce25cba5fd79a5029011 upstream. When software has changed first-level tables, it should invalidate the affected IOTLB and the paging-structure-caches using the PASID- based-IOTLB Invalidate Descriptor defined in spec 6.5.2.4. 
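The page-selective form of this descriptor encodes the range as an address-mask (AM): the page count is rounded up to a power of two and the target address must be naturally aligned to that size. A stand-alone illustration of the arithmetic used by qi_flush_piotlb() in the hunk below (not kernel code):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

static unsigned int am_for_npages(unsigned long npages)
{
        unsigned int mask = 0;

        while ((1UL << mask) < npages)  /* ilog2(roundup_pow_of_two(npages)) */
                mask++;
        return mask;
}

int main(void)
{
        unsigned long npages = 9;               /* 9 pages -> rounds up to 16 */
        uint64_t addr = 0xf8f93000;
        unsigned int mask = am_for_npages(npages);
        uint64_t align = 1ULL << (PAGE_SHIFT + mask);

        addr &= ~(align - 1);                   /* natural alignment of the region */
        printf("AM=%u, aligned addr=0x%llx\n", mask,
               (unsigned long long)addr);       /* AM=4, addr=0xf8f90000 */
        return 0;
}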
Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dmar.c | 41 +++++++++++++++++++++++++++ drivers/iommu/intel-iommu.c | 56 +++++++++++++++++++++++++++---------- include/linux/intel-iommu.h | 2 ++ 3 files changed, 84 insertions(+), 15 deletions(-) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 1c952d24af36..d82ac6d09ab7 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1398,6 +1398,47 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, qi_submit_sync(&desc, iommu); } +/* PASID-based IOTLB invalidation */ +void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, + unsigned long npages, bool ih) +{ + struct qi_desc desc = {.qw2 = 0, .qw3 = 0}; + + /* + * npages == -1 means a PASID-selective invalidation, otherwise, + * a positive value for Page-selective-within-PASID invalidation. + * 0 is not a valid input. + */ + if (WARN_ON(!npages)) { + pr_err("Invalid input npages = %ld\n", npages); + return; + } + + if (npages == -1) { + desc.qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | + QI_EIOTLB_TYPE; + desc.qw1 = 0; + } else { + int mask = ilog2(__roundup_pow_of_two(npages)); + unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask)); + + if (WARN_ON_ONCE(!ALIGN(addr, align))) + addr &= ~(align - 1); + + desc.qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | + QI_EIOTLB_TYPE; + desc.qw1 = QI_EIOTLB_ADDR(addr) | + QI_EIOTLB_IH(ih) | + QI_EIOTLB_AM(mask); + } + + qi_submit_sync(&desc, iommu); +} + /* * Disable Queued Invalidation interface. */ diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 813256e694fb..41a2822d9070 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1515,6 +1515,20 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain, spin_unlock_irqrestore(&device_domain_lock, flags); } +static void domain_flush_piotlb(struct intel_iommu *iommu, + struct dmar_domain *domain, + u64 addr, unsigned long npages, bool ih) +{ + u16 did = domain->iommu_did[iommu->seq_id]; + + if (domain->default_pasid) + qi_flush_piotlb(iommu, did, domain->default_pasid, + addr, npages, ih); + + if (!list_empty(&domain->devices)) + qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih); +} + static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, struct dmar_domain *domain, unsigned long pfn, unsigned int pages, @@ -1528,18 +1542,23 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, if (ih) ih = 1 << 6; - /* - * Fallback to domain selective flush if no PSI support or the size is - * too big. - * PSI requires page size to be 2 ^ x, and the base address is naturally - * aligned to the size - */ - if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap)) - iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH); - else - iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, - DMA_TLB_PSI_FLUSH); + + if (domain_use_first_level(domain)) { + domain_flush_piotlb(iommu, domain, addr, pages, ih); + } else { + /* + * Fallback to domain selective flush if no PSI support or + * the size is too big. PSI requires page size to be 2 ^ x, + * and the base address is naturally aligned to the size. 
+ */ + if (!cap_pgsel_inv(iommu->cap) || + mask > cap_max_amask_val(iommu->cap)) + iommu->flush.flush_iotlb(iommu, did, 0, 0, + DMA_TLB_DSI_FLUSH); + else + iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, + DMA_TLB_PSI_FLUSH); + } /* * In caching mode, changes of pages from non-present to present require @@ -1554,8 +1573,11 @@ static inline void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain, unsigned long pfn, unsigned int pages) { - /* It's a non-present to present mapping. Only flush if caching mode */ - if (cap_caching_mode(iommu->cap)) + /* + * It's a non-present to present mapping. Only flush if caching mode + * and second level. + */ + if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain)) iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); else iommu_flush_write_buffer(iommu); @@ -1572,7 +1594,11 @@ static void iommu_flush_iova(struct iova_domain *iovad) struct intel_iommu *iommu = g_iommus[idx]; u16 did = domain->iommu_did[iommu->seq_id]; - iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); + if (domain_use_first_level(domain)) + domain_flush_piotlb(iommu, domain, 0, -1, 0); + else + iommu->flush.flush_iotlb(iommu, did, 0, 0, + DMA_TLB_DSI_FLUSH); if (!cap_caching_mode(iommu->cap)) iommu_flush_dev_iotlb(get_iommu_domain(iommu, did), diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index b81f6467108c..1029c10352b0 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -654,6 +654,8 @@ extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type); extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, u16 qdep, u64 addr, unsigned mask); +void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, + unsigned long npages, bool ih); extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); extern int dmar_ir_support(void); -- Gitee From 4254a0a0348daf30eb3c4f3f5bf4fb8c996a65a6 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 2 Jan 2020 08:18:19 +0800 Subject: [PATCH 0203/2161] iommu/vt-d: Make first level IOVA canonical commit cb8b892dcecf279004967daab7791deb921cbcbe upstream. First-level translation restricts the input-address to a canonical address (i.e., address bits 63:N have the same value as address bit [N-1], where N is 48-bits with 4-level paging and 57-bits with 5-level paging). (section 3.6 in the spec) This makes first level IOVA canonical by using IOVA with bit [N-1] always cleared. Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 41a2822d9070..9f1e873cea05 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -3506,8 +3506,21 @@ static unsigned long intel_alloc_iova(struct device *dev, { unsigned long iova_pfn; - /* Restrict dma_mask to the width that the iommu can handle */ - dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask); + /* + * Restrict dma_mask to the width that the iommu can handle. + * First-level translation restricts the input-address to a + * canonical address (i.e., address bits 63:N have the same + * value as address bit [N-1], where N is 48-bits with 4-level + * paging and 57-bits with 5-level paging). Hence, skip bit + * [N-1]. 
+ */ + if (domain_use_first_level(domain)) + dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1), + dma_mask); + else + dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), + dma_mask); + /* Ensure we reserve the whole size-aligned region */ nrpages = __roundup_pow_of_two(nrpages); -- Gitee From 472209ff56d427ccdb6956d50fcb37cd6f69686e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 2 Jan 2020 08:18:20 +0800 Subject: [PATCH 0204/2161] iommu/vt-d: Update first level super page capability commit 64229e8f37cf0fc460e611dacc9f14561ddfd018 upstream. First-level translation may map input addresses to 4-KByte pages, 2-MByte pages, or 1-GByte pages. Support for 4-KByte pages and 2-Mbyte pages are mandatory for first-level translation. Hardware support for 1-GByte page is reported through the FL1GP field in the Capability Register. Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 9f1e873cea05..30e96ac775c1 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -691,11 +691,12 @@ static int domain_update_iommu_snooping(struct intel_iommu *skip) return ret; } -static int domain_update_iommu_superpage(struct intel_iommu *skip) +static int domain_update_iommu_superpage(struct dmar_domain *domain, + struct intel_iommu *skip) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; - int mask = 0xf; + int mask = 0x3; if (!intel_iommu_superpage) { return 0; @@ -705,7 +706,13 @@ static int domain_update_iommu_superpage(struct intel_iommu *skip) rcu_read_lock(); for_each_active_iommu(iommu, drhd) { if (iommu != skip) { - mask &= cap_super_page_val(iommu->cap); + if (domain && domain_use_first_level(domain)) { + if (!cap_fl1gp_support(iommu->cap)) + mask = 0x1; + } else { + mask &= cap_super_page_val(iommu->cap); + } + if (!mask) break; } @@ -720,7 +727,7 @@ static void domain_update_iommu_cap(struct dmar_domain *domain) { domain_update_iommu_coherency(domain); domain->iommu_snooping = domain_update_iommu_snooping(NULL); - domain->iommu_superpage = domain_update_iommu_superpage(NULL); + domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); } struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, @@ -4608,7 +4615,7 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) iommu->name); return -ENXIO; } - sp = domain_update_iommu_superpage(iommu) - 1; + sp = domain_update_iommu_superpage(NULL, iommu) - 1; if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { pr_warn("%s: Doesn't support large page.\n", iommu->name); -- Gitee From 0f6ae9d689f454343a884ba3607942b7770cc034 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 2 Jan 2020 08:18:21 +0800 Subject: [PATCH 0205/2161] iommu/vt-d: Use iova over first level commit b802d070a52a1565b47daaa808872cfbd4a17b01 upstream. After we make all map/unmap paths support first level page table. Let's turn it on if hardware supports scalable mode. 
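Concretely, the hunk below only flips first_level_support's initializer from 0 (feature forced off) to -1 (not probed yet), so the per-IOMMU scan added earlier now runs once and its result is cached. A minimal illustration of that tri-state static-cache pattern (names invented):

#include <stdbool.h>
#include <stdio.h>

static bool probe_hardware(void)
{
        return true;    /* stand-in for scanning every active IOMMU once */
}

static bool feature_by_default(void)
{
        static int cached = -1;         /* -1: unknown, 0: off, 1: on */

        if (cached == -1)
                cached = probe_hardware();
        return cached;
}

int main(void)
{
        printf("%d %d\n", feature_by_default(), feature_by_default());
        return 0;
}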
Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 30e96ac775c1..f5222eac5a10 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1776,15 +1776,13 @@ static void free_dmar_iommu(struct intel_iommu *iommu) /* * Check and return whether first level is used by default for - * DMA translation. Currently, we make it off by setting - * first_level_support = 0, and will change it to -1 after all - * map/unmap paths support first level page table. + * DMA translation. */ static bool first_level_by_default(void) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; - static int first_level_support = 0; + static int first_level_support = -1; if (likely(first_level_support != -1)) return first_level_support; -- Gitee From 2e88605f5ae77a5bde6d4b08f973767a1cf51485 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 2 Jan 2020 08:18:22 +0800 Subject: [PATCH 0206/2161] iommu/vt-d: debugfs: Add support to show page table internals commit e2726daea583d81e447b71e09b79e67f618d6152 upstream. Export page table internals of the domain attached to each device. Example of such dump on a Skylake machine: $ sudo cat /sys/kernel/debug/iommu/intel/domain_translation_struct [ ... ] Device 0000:00:14.0 with pasid 0 @0x15f3d9000 IOVA_PFN PML5E PML4E 0x000000008ced0 | 0x0000000000000000 0x000000015f3da003 0x000000008ced1 | 0x0000000000000000 0x000000015f3da003 0x000000008ced2 | 0x0000000000000000 0x000000015f3da003 0x000000008ced3 | 0x0000000000000000 0x000000015f3da003 0x000000008ced4 | 0x0000000000000000 0x000000015f3da003 0x000000008ced5 | 0x0000000000000000 0x000000015f3da003 0x000000008ced6 | 0x0000000000000000 0x000000015f3da003 0x000000008ced7 | 0x0000000000000000 0x000000015f3da003 0x000000008ced8 | 0x0000000000000000 0x000000015f3da003 0x000000008ced9 | 0x0000000000000000 0x000000015f3da003 PDPE PDE PTE 0x000000015f3db003 0x000000015f3dc003 0x000000008ced0003 0x000000015f3db003 0x000000015f3dc003 0x000000008ced1003 0x000000015f3db003 0x000000015f3dc003 0x000000008ced2003 0x000000015f3db003 0x000000015f3dc003 0x000000008ced3003 0x000000015f3db003 0x000000015f3dc003 0x000000008ced4003 0x000000015f3db003 0x000000015f3dc003 0x000000008ced5003 0x000000015f3db003 0x000000015f3dc003 0x000000008ced6003 0x000000015f3db003 0x000000015f3dc003 0x000000008ced7003 0x000000015f3db003 0x000000015f3dc003 0x000000008ced8003 0x000000015f3db003 0x000000015f3dc003 0x000000008ced9003 [ ... 
] Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu-debugfs.c | 75 +++++++++++++++++++++++++++++ drivers/iommu/intel-iommu.c | 4 +- include/linux/intel-iommu.h | 2 + 3 files changed, 79 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel-iommu-debugfs.c b/drivers/iommu/intel-iommu-debugfs.c index bdf095e9dbe0..3eb1fe240fb0 100644 --- a/drivers/iommu/intel-iommu-debugfs.c +++ b/drivers/iommu/intel-iommu-debugfs.c @@ -5,6 +5,7 @@ * Authors: Gayatri Kammela * Sohil Mehta * Jacob Pan + * Lu Baolu */ #include @@ -300,6 +301,77 @@ static int dmar_translation_struct_show(struct seq_file *m, void *unused) } DEFINE_SHOW_ATTRIBUTE(dmar_translation_struct); +static inline unsigned long level_to_directory_size(int level) +{ + return BIT_ULL(VTD_PAGE_SHIFT + VTD_STRIDE_SHIFT * (level - 1)); +} + +static inline void +dump_page_info(struct seq_file *m, unsigned long iova, u64 *path) +{ + seq_printf(m, "0x%013lx |\t0x%016llx\t0x%016llx\t0x%016llx\t0x%016llx\t0x%016llx\n", + iova >> VTD_PAGE_SHIFT, path[5], path[4], + path[3], path[2], path[1]); +} + +static void pgtable_walk_level(struct seq_file *m, struct dma_pte *pde, + int level, unsigned long start, + u64 *path) +{ + int i; + + if (level > 5 || level < 1) + return; + + for (i = 0; i < BIT_ULL(VTD_STRIDE_SHIFT); + i++, pde++, start += level_to_directory_size(level)) { + if (!dma_pte_present(pde)) + continue; + + path[level] = pde->val; + if (dma_pte_superpage(pde) || level == 1) + dump_page_info(m, start, path); + else + pgtable_walk_level(m, phys_to_virt(dma_pte_addr(pde)), + level - 1, start, path); + path[level] = 0; + } +} + +static int show_device_domain_translation(struct device *dev, void *data) +{ + struct dmar_domain *domain = find_domain(dev); + struct seq_file *m = data; + u64 path[6] = { 0 }; + + if (!domain) + return 0; + + seq_printf(m, "Device %s with pasid %d @0x%llx\n", + dev_name(dev), domain->default_pasid, + (u64)virt_to_phys(domain->pgd)); + seq_puts(m, "IOVA_PFN\t\tPML5E\t\t\tPML4E\t\t\tPDPE\t\t\tPDE\t\t\tPTE\n"); + + pgtable_walk_level(m, domain->pgd, domain->agaw + 2, 0, path); + seq_putc(m, '\n'); + + return 0; +} + +static int domain_translation_struct_show(struct seq_file *m, void *unused) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&device_domain_lock, flags); + ret = bus_for_each_dev(&pci_bus_type, NULL, m, + show_device_domain_translation); + spin_unlock_irqrestore(&device_domain_lock, flags); + + return ret; +} +DEFINE_SHOW_ATTRIBUTE(domain_translation_struct); + #ifdef CONFIG_IRQ_REMAP static void ir_tbl_remap_entry_show(struct seq_file *m, struct intel_iommu *iommu) @@ -415,6 +487,9 @@ void __init intel_iommu_debugfs_init(void) &iommu_regset_fops); debugfs_create_file("dmar_translation_struct", 0444, intel_iommu_debug, NULL, &dmar_translation_struct_fops); + debugfs_create_file("domain_translation_struct", 0444, + intel_iommu_debug, NULL, + &domain_translation_struct_fops); #ifdef CONFIG_IRQ_REMAP debugfs_create_file("ir_translation_struct", 0444, intel_iommu_debug, NULL, &ir_translation_struct_fops); diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index f5222eac5a10..911a8ecdfc6a 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -396,7 +396,7 @@ EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1)) #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2)) -static 
DEFINE_SPINLOCK(device_domain_lock); +DEFINE_SPINLOCK(device_domain_lock); static LIST_HEAD(device_domain_list); #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \ @@ -2520,7 +2520,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain) spin_unlock_irqrestore(&device_domain_lock, flags); } -static struct dmar_domain *find_domain(struct device *dev) +struct dmar_domain *find_domain(struct device *dev) { struct device_domain_info *info; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 1029c10352b0..13ff71a56305 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -443,6 +443,7 @@ enum { #define VTD_FLAG_SVM_CAPABLE (1 << 2) extern int intel_iommu_sm; +extern spinlock_t device_domain_lock; #define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap)) #define pasid_supported(iommu) (sm_supported(iommu) && \ @@ -667,6 +668,7 @@ int for_each_device_domain(int (*fn)(struct device_domain_info *info, void *data), void *data); void iommu_flush_write_buffer(struct intel_iommu *iommu); int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev); +struct dmar_domain *find_domain(struct device *dev); #ifdef CONFIG_INTEL_IOMMU_SVM extern void intel_svm_check(struct intel_iommu *iommu); -- Gitee From 78653453a4bdd67b524c85dde50245d73f479dc0 Mon Sep 17 00:00:00 2001 From: jimyan Date: Wed, 15 Jan 2020 11:03:55 +0800 Subject: [PATCH 0207/2161] iommu/vt-d: Don't reject Host Bridge due to scope mismatch commit 53291622e2607db53ded78b534874b8fb515ba57 upstream. On a system with two host bridges(0000:00:00.0,0000:80:00.0), iommu initialization fails with DMAR: Device scope type does not match for 0000:80:00.0 This is because the DMAR table reports this device as having scope 2 (ACPI_DMAR_SCOPE_TYPE_BRIDGE): but the device has a type 0 PCI header: 80:00.0 Class 0600: Device 8086:2020 (rev 06) 00: 86 80 20 20 47 05 10 00 06 00 00 06 10 00 00 00 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 20: 00 00 00 00 00 00 00 00 00 00 00 00 86 80 00 00 30: 00 00 00 00 90 00 00 00 00 00 00 00 00 01 00 00 VT-d works perfectly on this system, so there's no reason to bail out on initialization due to this apparent scope mismatch. Add the class 0x06 ("PCI_BASE_CLASS_BRIDGE") as a heuristic for allowing DMAR initialization for non-bridge PCI devices listed with scope bridge. Signed-off-by: jimyan Reviewed-by: Jerry Snitselaar Reviewed-by: Roland Dreier Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dmar.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index d82ac6d09ab7..0645f44227bb 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -252,7 +252,7 @@ int dmar_insert_dev_scope(struct dmar_pci_notify_info *info, info->dev->hdr_type != PCI_HEADER_TYPE_NORMAL) || (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE && (info->dev->hdr_type == PCI_HEADER_TYPE_NORMAL && - info->dev->class >> 8 != PCI_CLASS_BRIDGE_OTHER))) { + info->dev->class >> 16 != PCI_BASE_CLASS_BRIDGE))) { pr_warn("Device scope type does not match for %s\n", pci_name(info->dev)); return -EINVAL; -- Gitee From 5d367fed9f7a9cda0f07237c51fcbe4f74df47d0 Mon Sep 17 00:00:00 2001 From: Barret Rhoden Date: Wed, 15 Jan 2020 11:03:56 +0800 Subject: [PATCH 0208/2161] iommu/vt-d: Mark firmware tainted if RMRR fails sanity check commit f5a68bb0752e0cf77c06f53f72258e7beb41381b upstream. 
RMRR entries describe memory regions that are DMA targets for devices outside the kernel's control. RMRR entries that fail the sanity check point to regions of memory that the firmware did not tell the kernel are reserved or otherwise should not be used. Instead of aborting DMAR processing, this commit marks the firmware as tainted. These RMRRs will still be identity mapped; otherwise some devices, e.g. graphics devices, will not work during boot. Signed-off-by: Barret Rhoden Signed-off-by: Lu Baolu Fixes: f036c7fa0ab60 ("iommu/vt-d: Check VT-d RMRR region in BIOS is reported as reserved") Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 911a8ecdfc6a..8ba3c5bfff5a 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4461,12 +4461,16 @@ int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) { struct acpi_dmar_reserved_memory *rmrr; struct dmar_rmrr_unit *rmrru; - int ret; rmrr = (struct acpi_dmar_reserved_memory *)header; - ret = arch_rmrr_sanity_check(rmrr); - if (ret) - return ret; + if (arch_rmrr_sanity_check(rmrr)) + WARN_TAINT(1, TAINT_FIRMWARE_WORKAROUND, + "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" + "BIOS vendor: %s; Ver: %s; Product Version: %s\n", + rmrr->base_address, rmrr->end_address, + dmi_get_system_info(DMI_BIOS_VENDOR), + dmi_get_system_info(DMI_BIOS_VERSION), + dmi_get_system_info(DMI_PRODUCT_VERSION)); rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); if (!rmrru) -- Gitee From 110233f547c47c7ca6c9ef2b3dbeab7c8b115757 Mon Sep 17 00:00:00 2001 From: Barret Rhoden Date: Wed, 15 Jan 2020 11:03:57 +0800 Subject: [PATCH 0209/2161] iommu/vt-d: Add RMRR base and end addresses sanity check commit ce4cc52b51dfaefff65c6d2f31f169122a8659d4 upstream. The VT-d spec specifies requirements for the RMRR entries' base and end (called 'Limit' in the docs) addresses. This commit causes the DMAR processing to mark the firmware as tainted for any RMRR entries that do not meet these requirements.
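
As a worked example of those requirements (the addresses below are hypothetical, 4 KiB pages are assumed, and the arch-specific part of the real check is left out), the rule boils down to:

    /*
     * base 0x7c000000, end 0x7c7fffff -> accepted: base and end + 1
     *     (0x7c800000) are both page aligned and end > base
     * base 0x7c000000, end 0x7c7ff000 -> rejected: end + 1 (0x7c7ff001)
     *     is not page aligned
     */
    static bool rmrr_range_ok(u64 base, u64 end)
    {
            return IS_ALIGNED(base, PAGE_SIZE) &&
                   IS_ALIGNED(end + 1, PAGE_SIZE) &&
                   end > base;
    }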
Signed-off-by: Barret Rhoden Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 8ba3c5bfff5a..57532feefa86 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4457,13 +4457,24 @@ static void __init init_iommu_pm_ops(void) static inline void init_iommu_pm_ops(void) {} #endif /* CONFIG_PM */ +static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) +{ + if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || + !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || + rmrr->end_address <= rmrr->base_address || + arch_rmrr_sanity_check(rmrr)) + return -EINVAL; + + return 0; +} + int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) { struct acpi_dmar_reserved_memory *rmrr; struct dmar_rmrr_unit *rmrru; rmrr = (struct acpi_dmar_reserved_memory *)header; - if (arch_rmrr_sanity_check(rmrr)) + if (rmrr_sanity_check(rmrr)) WARN_TAINT(1, TAINT_FIRMWARE_WORKAROUND, "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" "BIOS vendor: %s; Ver: %s; Product Version: %s\n", -- Gitee From 2b7e76218daed2fedffc63f783a7957a3485c9ce Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 15 Jan 2020 11:03:59 +0800 Subject: [PATCH 0210/2161] iommu/vt-d: Unnecessary to handle default identity domain commit b89b6605b85f13b0cf41a047aa5759ea0a5663ec upstream. The iommu default domain framework has been designed to take care of setting identity default domain type. It's unnecessary to handle this again in the VT-d driver. Hence, remove it. Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 57532feefa86..575ca63c95b4 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -387,7 +387,6 @@ static int intel_iommu_superpage = 1; static int iommu_identity_mapping; static int intel_no_bounce; -#define IDENTMAP_ALL 1 #define IDENTMAP_GFX 2 #define IDENTMAP_AZALIA 4 @@ -3086,8 +3085,7 @@ static int device_def_domain_type(struct device *dev) return IOMMU_DOMAIN_DMA; } - return (iommu_identity_mapping & IDENTMAP_ALL) ? - IOMMU_DOMAIN_IDENTITY : 0; + return 0; } static void intel_iommu_init_qi(struct intel_iommu *iommu) @@ -3436,9 +3434,6 @@ static int __init init_dmars(void) iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); } - if (iommu_default_passthrough()) - iommu_identity_mapping |= IDENTMAP_ALL; - #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA dmar_map_gfx = 0; #endif @@ -5053,7 +5048,7 @@ static int __init platform_optin_force_iommu(void) * map for all devices except those marked as being untrusted. */ if (dmar_disabled) - iommu_identity_mapping |= IDENTMAP_ALL; + iommu_set_default_passthrough(false); dmar_disabled = 0; no_iommu = 0; -- Gitee From 7be9b612e3d45502d72315c5211dff0ce58488a7 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Tue, 21 Jan 2020 06:37:48 -0700 Subject: [PATCH 0211/2161] iommu/vt-d: Use pci_real_dma_dev() for mapping commit 2b0140c69637e6f3cf9f8a0b7629567de9645680 upstream. The PCI device may have a DMA requester on another bus, such as VMD subdevices needing to use the VMD endpoint. 
This case requires the real DMA device for the IOMMU mapping, so use pci_real_dma_dev() to find that device. Link: https://lore.kernel.org/r/1579613871-301529-5-git-send-email-jonathan.derrick@intel.com Signed-off-by: Jon Derrick Signed-off-by: Bjorn Helgaas Acked-by: Lu Baolu Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 575ca63c95b4..e5ee8ee5cb8a 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -818,6 +818,8 @@ static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devf return NULL; #endif + pdev = pci_real_dma_dev(pdev); + /* VFs aren't listed in scope tables; we need to look up * the PF instead to find the IOMMU. */ pf_pdev = pci_physfn(pdev); @@ -2527,6 +2529,9 @@ struct dmar_domain *find_domain(struct device *dev) dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)) return NULL; + if (dev_is_pci(dev)) + dev = &pci_real_dma_dev(to_pci_dev(dev))->dev; + /* No lock here, assumes no domain exit in normal case */ info = dev->archdata.iommu; if (likely(info)) -- Gitee From 203e1a61ff26fbf6ffe0f950b2b5cccd2bf635d2 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Tue, 21 Jan 2020 06:37:49 -0700 Subject: [PATCH 0212/2161] iommu/vt-d: Remove VMD child device sanity check commit e3560ee4cfb29e232ea99ff9adfaa8ac5b414345 upstream. Remove the sanity check required for VMD child devices. The new pci_real_dma_dev() DMA alias mechanism places them in the same IOMMU group as the VMD endpoint. Assignment of the group would require assigning the VMD endpoint, where unbinding the VMD endpoint removes the child device domain from the hierarchy. Link: https://lore.kernel.org/r/1579613871-301529-6-git-send-email-jonathan.derrick@intel.com Signed-off-by: Jon Derrick Signed-off-by: Bjorn Helgaas Acked-by: Lu Baolu Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index e5ee8ee5cb8a..a330be8371ed 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -810,15 +810,7 @@ static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devf if (dev_is_pci(dev)) { struct pci_dev *pf_pdev; - pdev = to_pci_dev(dev); - -#ifdef CONFIG_X86 - /* VMD child devices currently cannot be handled individually */ - if (is_vmd(pdev->bus)) - return NULL; -#endif - - pdev = pci_real_dma_dev(pdev); + pdev = pci_real_dma_dev(to_pci_dev(dev)); /* VFs aren't listed in scope tables; we need to look up * the PF instead to find the IOMMU. */ -- Gitee From c0bf87dce68173ea1aeb09c2d34cbab30f249151 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 17 Feb 2020 17:12:37 +0100 Subject: [PATCH 0213/2161] iommu/vt-d: Add attach_deferred() helper commit 1d4615978f525b769990a4a4ef22fb1b9a04cdf1 upstream. Implement a helper function to check whether a device's attach process is deferred. 
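
For reference while reading this and the following patches: dev->archdata.iommu is effectively a tagged pointer, and the sentinel values defined earlier in the series give it four states; the new helper tests just one of them (sketch mirroring the hunk below):

    /*
     * dev->archdata.iommu states used by the VT-d driver:
     *   NULL                       - no per-device IOMMU data yet
     *   DUMMY_DEVICE_DOMAIN_INFO   - device is ignored by the IOMMU
     *   DEFER_DEVICE_DOMAIN_INFO   - attach deferred until first use
     *   any other value            - a valid struct device_domain_info *
     */
    static bool attach_deferred(struct device *dev)
    {
            return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
    }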
Fixes: 1ee0186b9a12 ("iommu/vt-d: Refactor find_domain() helper") Cc: stable@vger.kernel.org # v5.5 Reviewed-by: Jerry Snitselaar Acked-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index a330be8371ed..a33e77fa016b 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -768,6 +768,11 @@ static int iommu_dummy(struct device *dev) return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO; } +static bool attach_deferred(struct device *dev) +{ + return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO; +} + /** * is_downstream_to_pci_bridge - test if a device belongs to the PCI * sub-hierarchy of a candidate PCI-PCI bridge @@ -2517,8 +2522,7 @@ struct dmar_domain *find_domain(struct device *dev) { struct device_domain_info *info; - if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO || - dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)) + if (unlikely(attach_deferred(dev) || iommu_dummy(dev))) return NULL; if (dev_is_pci(dev)) @@ -2534,7 +2538,7 @@ struct dmar_domain *find_domain(struct device *dev) static struct dmar_domain *deferred_attach_domain(struct device *dev) { - if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) { + if (unlikely(attach_deferred(dev))) { struct iommu_domain *domain; dev->archdata.iommu = NULL; @@ -6155,7 +6159,7 @@ intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain, struct device *dev) { - return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO; + return attach_deferred(dev); } /* -- Gitee From f4fa2e62de3610b3951933b728a12b01bc203083 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 17 Feb 2020 17:16:19 +0100 Subject: [PATCH 0214/2161] iommu/vt-d: Move deferred device attachment into helper function commit 034d98cc0cdcde2415c6f598fa9125e3eaa02569 upstream. Move the code that does the deferred device attachment into a separate helper function. 
Fixes: 1ee0186b9a12 ("iommu/vt-d: Refactor find_domain() helper") Cc: stable@vger.kernel.org # v5.5 Reviewed-by: Jerry Snitselaar Acked-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index a33e77fa016b..e36d955a2c52 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2536,16 +2536,20 @@ struct dmar_domain *find_domain(struct device *dev) return NULL; } -static struct dmar_domain *deferred_attach_domain(struct device *dev) +static void do_deferred_attach(struct device *dev) { - if (unlikely(attach_deferred(dev))) { - struct iommu_domain *domain; + struct iommu_domain *domain; - dev->archdata.iommu = NULL; - domain = iommu_get_domain_for_dev(dev); - if (domain) - intel_iommu_attach_device(domain, dev); - } + dev->archdata.iommu = NULL; + domain = iommu_get_domain_for_dev(dev); + if (domain) + intel_iommu_attach_device(domain, dev); +} + +static struct dmar_domain *deferred_attach_domain(struct device *dev) +{ + if (unlikely(attach_deferred(dev))) + do_deferred_attach(dev); return find_domain(dev); } -- Gitee From 94730f2c34f45f4bda12516cf44fea15008ab669 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 17 Feb 2020 17:20:59 +0100 Subject: [PATCH 0215/2161] iommu/vt-d: Do deferred attachment in iommu_need_mapping() commit a11bfde9c77df1fd350ea27169ab921f511bf5d0 upstream. The attachment of deferred devices needs to happen before the check whether the device is identity mapped or not. Otherwise the check will return wrong results, cause warnings boot failures in kdump kernels, like WARNING: CPU: 0 PID: 318 at ../drivers/iommu/intel-iommu.c:592 domain_get_iommu+0x61/0x70 [...] Call Trace: __intel_map_single+0x55/0x190 intel_alloc_coherent+0xac/0x110 dmam_alloc_attrs+0x50/0xa0 ahci_port_start+0xfb/0x1f0 [libahci] ata_host_start.part.39+0x104/0x1e0 [libata] With the earlier check the kdump boot succeeds and a crashdump is written. 
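
Put differently, the fast path has to complete any deferred attach before it asks whether the device ended up identity mapped. A simplified sketch of the resulting order (the dma_mask handling of the real function is omitted):

    static bool iommu_need_mapping(struct device *dev)
    {
            if (iommu_dummy(dev))
                    return false;

            /* 1) finish the deferred attach first ... */
            if (unlikely(attach_deferred(dev)))
                    do_deferred_attach(dev);

            /* 2) ... so this check sees the domain the device really got */
            if (identity_mapping(dev))
                    return false;   /* direct DMA, no IOMMU mapping needed */

            return true;
    }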
Fixes: 1ee0186b9a12 ("iommu/vt-d: Refactor find_domain() helper") Cc: stable@vger.kernel.org # v5.5 Reviewed-by: Jerry Snitselaar Acked-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index e36d955a2c52..67628e4e0137 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2548,9 +2548,6 @@ static void do_deferred_attach(struct device *dev) static struct dmar_domain *deferred_attach_domain(struct device *dev) { - if (unlikely(attach_deferred(dev))) - do_deferred_attach(dev); - return find_domain(dev); } @@ -3607,6 +3604,9 @@ static bool iommu_need_mapping(struct device *dev) if (iommu_dummy(dev)) return false; + if (unlikely(attach_deferred(dev))) + do_deferred_attach(dev); + ret = identity_mapping(dev); if (ret) { u64 dma_mask = *dev->dma_mask; @@ -3970,7 +3970,11 @@ bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size, int prot = 0; int ret; + if (unlikely(attach_deferred(dev))) + do_deferred_attach(dev); + domain = deferred_attach_domain(dev); + if (WARN_ON(dir == DMA_NONE || !domain)) return DMA_MAPPING_ERROR; -- Gitee From 43888282b6f417a6dbdcf130f8ecbda16dff35b1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 17 Feb 2020 17:27:44 +0100 Subject: [PATCH 0216/2161] iommu/vt-d: Remove deferred_attach_domain() commit 96d170f3b1a607612caf3618c534d5c64fc2d61b upstream. The function is now only a wrapper around find_domain(). Remove the function and call find_domain() directly at the call-sites. Fixes: 1ee0186b9a12 ("iommu/vt-d: Refactor find_domain() helper") Cc: stable@vger.kernel.org # v5.5 Reviewed-by: Jerry Snitselaar Acked-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 67628e4e0137..b254ebca12b8 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2546,11 +2546,6 @@ static void do_deferred_attach(struct device *dev) intel_iommu_attach_device(domain, dev); } -static struct dmar_domain *deferred_attach_domain(struct device *dev) -{ - return find_domain(dev); -} - static inline struct device_domain_info * dmar_search_domain_by_dev_info(int segment, int bus, int devfn) { @@ -3655,7 +3650,7 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, BUG_ON(dir == DMA_NONE); - domain = deferred_attach_domain(dev); + domain = find_domain(dev); if (!domain) return DMA_MAPPING_ERROR; @@ -3875,7 +3870,7 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele if (!iommu_need_mapping(dev)) return dma_direct_map_sg(dev, sglist, nelems, dir, attrs); - domain = deferred_attach_domain(dev); + domain = find_domain(dev); if (!domain) return 0; @@ -3973,7 +3968,7 @@ bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size, if (unlikely(attach_deferred(dev))) do_deferred_attach(dev); - domain = deferred_attach_domain(dev); + domain = find_domain(dev); if (WARN_ON(dir == DMA_NONE || !domain)) return DMA_MAPPING_ERROR; -- Gitee From 2264936e1b5391850338c889a21bdfdfa2cc2c4b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 17 Feb 2020 17:29:55 +0100 Subject: [PATCH 0217/2161] iommu/vt-d: Simplify check in identity_mapping() commit 
1ddb32da4a629fa7f87873d0b6836c2e1feb7518 upstream. The function only has one call-site and there it is never called with dummy or deferred devices. Simplify the check in the function to account for that. Fixes: 1ee0186b9a12 ("iommu/vt-d: Refactor find_domain() helper") Cc: stable@vger.kernel.org # v5.5 Reviewed-by: Jerry Snitselaar Acked-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index b254ebca12b8..52ae8ed9a839 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2923,7 +2923,7 @@ static int identity_mapping(struct device *dev) struct device_domain_info *info; info = dev->archdata.iommu; - if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO) + if (info) return (info->domain == si_domain); return 0; -- Gitee From 3f355bd21293ca226e1b6078d8f659e70ed05cb8 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 13 Feb 2020 14:00:21 +0000 Subject: [PATCH 0218/2161] iommu: Use C99 flexible array in fwspec commit 098accf2da940189f4d62d3514d17f8bb05dc6e1 upstream. Although the 1-element array was a typical pre-C99 way to implement variable-length structures, and indeed is a fundamental construct in the APIs of certain other popular platforms, there's no good reason for it here (and in particular the sizeof() trick is far too "clever" for its own good). We can just as easily implement iommu_fwspec's preallocation behaviour using a standard flexible array member, so let's make it look the way most readers would expect. Signed-off-by: Robin Murphy Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 15 ++++++++------- include/linux/iommu.h | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index fd9bc4aa515e..8a9614c36ca4 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2418,7 +2418,8 @@ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, if (fwspec) return ops == fwspec->ops ? 
0 : -EINVAL; - fwspec = kzalloc(sizeof(*fwspec), GFP_KERNEL); + /* Preallocate for the overwhelmingly common case of 1 ID */ + fwspec = kzalloc(struct_size(fwspec, ids, 1), GFP_KERNEL); if (!fwspec) return -ENOMEM; @@ -2445,15 +2446,15 @@ EXPORT_SYMBOL_GPL(iommu_fwspec_free); int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - size_t size; - int i; + int i, new_num; if (!fwspec) return -EINVAL; - size = offsetof(struct iommu_fwspec, ids[fwspec->num_ids + num_ids]); - if (size > sizeof(*fwspec)) { - fwspec = krealloc(fwspec, size, GFP_KERNEL); + new_num = fwspec->num_ids + num_ids; + if (new_num > 1) { + fwspec = krealloc(fwspec, struct_size(fwspec, ids, new_num), + GFP_KERNEL); if (!fwspec) return -ENOMEM; @@ -2463,7 +2464,7 @@ int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids) for (i = 0; i < num_ids; i++) fwspec->ids[fwspec->num_ids + i] = ids[i]; - fwspec->num_ids += num_ids; + fwspec->num_ids = new_num; return 0; } EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 010b88dad109..e0d32f91324c 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -589,7 +589,7 @@ struct iommu_fwspec { void *iommu_priv; u32 flags; unsigned int num_ids; - u32 ids[1]; + u32 ids[]; }; /* ATS is supported */ -- Gitee From 88b9bc5b3cef39d55493d1ee0d06f045c7fb37fb Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 9 Mar 2020 15:01:38 +0100 Subject: [PATCH 0219/2161] iommu/vt-d: dmar_parse_one_rmrr: replace WARN_TAINT with pr_warn + add_taint commit 96788c7a7f1e7206519d4d736f89a2072dcfe0fc upstream. Quoting from the comment describing the WARN functions in include/asm-generic/bug.h: * WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report * significant kernel issues that need prompt attention if they should ever * appear at runtime. * * Do not use these macros when checking for invalid external inputs The (buggy) firmware tables which the dmar code was calling WARN_TAINT for really are invalid external inputs. They are not under the kernel's control and the issues in them cannot be fixed by a kernel update. So logging a backtrace, which invites bug reports to be filed about this, is not helpful. Some distros, e.g. Fedora, have tools watching for the kernel backtraces logged by the WARN macros and offer the user an option to file a bug for this when these are encountered. The WARN_TAINT in dmar_parse_one_rmrr + another iommu WARN_TAINT, addressed in another patch, have lead to over a 100 bugs being filed this way. This commit replaces the WARN_TAINT("...") call, with a pr_warn(FW_BUG "...") + add_taint(TAINT_FIRMWARE_WORKAROUND, ...) call avoiding the backtrace and thus also avoiding bug-reports being filed about this against the kernel. 
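
The replacement pattern, stripped of the VT-d specifics (sketch only; the exact strings are in the hunk below):

    static void report_firmware_bug_example(void)
    {
            /* New: log the problem and taint, but do not dump a backtrace. */
            pr_warn(FW_BUG "broken firmware table ...\n");
            add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);

            /*
             * Old: WARN_TAINT(1, TAINT_FIRMWARE_WORKAROUND, "...") also
             * printed a stack trace, which invited bogus bug reports.
             */
    }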
Fixes: f5a68bb0752e ("iommu/vt-d: Mark firmware tainted if RMRR fails sanity check") Signed-off-by: Hans de Goede Signed-off-by: Joerg Roedel Acked-by: Lu Baolu Cc: stable@vger.kernel.org Cc: Barret Rhoden Link: https://lore.kernel.org/r/20200309140138.3753-3-hdegoede@redhat.com BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1808874 Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 52ae8ed9a839..d80dbd05f44c 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4473,14 +4473,16 @@ int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) struct dmar_rmrr_unit *rmrru; rmrr = (struct acpi_dmar_reserved_memory *)header; - if (rmrr_sanity_check(rmrr)) - WARN_TAINT(1, TAINT_FIRMWARE_WORKAROUND, + if (rmrr_sanity_check(rmrr)) { + pr_warn(FW_BUG "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" "BIOS vendor: %s; Ver: %s; Product Version: %s\n", rmrr->base_address, rmrr->end_address, dmi_get_system_info(DMI_BIOS_VENDOR), dmi_get_system_info(DMI_BIOS_VERSION), dmi_get_system_info(DMI_PRODUCT_VERSION)); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + } rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); if (!rmrru) -- Gitee From bdb5c0237d538b927fe0de02976787540f266389 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 17 Mar 2020 11:03:26 -0400 Subject: [PATCH 0220/2161] iommu/vt-d: Silence RCU-list debugging warning in dmar_find_atsr() commit c6f4ebdeba4cff590594df931ff1ee610c426431 upstream. dmar_find_atsr() calls list_for_each_entry_rcu() outside of an RCU read side critical section but with dmar_global_lock held. Silence this false positive. drivers/iommu/intel-iommu.c:4504 RCU-list traversed in non-reader section!! 1 lock held by swapper/0/1: #0: ffffffff9755bee8 (dmar_global_lock){+.+.}, at: intel_iommu_init+0x1a6/0xe19 Call Trace: dump_stack+0xa4/0xfe lockdep_rcu_suspicious+0xeb/0xf5 dmar_find_atsr+0x1ab/0x1c0 dmar_parse_one_atsr+0x64/0x220 dmar_walk_remapping_entries+0x130/0x380 dmar_table_init+0x166/0x243 intel_iommu_init+0x1ab/0xe19 pci_iommu_init+0x1a/0x44 do_one_initcall+0xae/0x4d0 kernel_init_freeable+0x412/0x4c5 kernel_init+0x19/0x193 Signed-off-by: Qian Cai Acked-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index d80dbd05f44c..3d502e5c578f 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4751,7 +4751,8 @@ int dmar_find_matched_atsr_unit(struct pci_dev *dev) } rcu_read_lock(); - list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { + list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, + dmar_rcu_check()) { atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); if (atsr->segment != pci_domain_nr(dev->bus)) continue; -- Gitee From 0b11fdcd1ca6b198d59dd68a8dfe646dccb39aa7 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 27 May 2020 10:56:15 -0600 Subject: [PATCH 0221/2161] iommu/vt-d: Only clear real DMA device's context entries commit 8038bdb8553313ad53bfcffcf8294dd0ab44618f upstream. Domain context mapping can encounter issues with sub-devices of a real DMA device. A sub-device cannot have a valid context entry due to it potentially aliasing another device's 16-bit ID. 
It's expected that sub-devices of the real DMA device uses the real DMA device's requester when context mapping. This is an issue when a sub-device is removed where the context entry is cleared for all aliases. Other sub-devices are still valid, resulting in those sub-devices being stranded without valid context entries. The correct approach is to use the real DMA device when programming the context entries. The insertion path is correct because device_to_iommu() will return the bus and devfn of the real DMA device. The removal path needs to only operate on the real DMA device, otherwise the entire context entry would be cleared for all sub-devices of the real DMA device. This patch also adds a helper to determine if a struct device is a sub-device of a real DMA device. Fixes: 2b0140c69637e ("iommu/vt-d: Use pci_real_dma_dev() for mapping") Cc: stable@vger.kernel.org # v5.6+ Signed-off-by: Jon Derrick Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20200527165617.297470-2-jonathan.derrick@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 3d502e5c578f..d0997fc4b4ea 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2589,6 +2589,12 @@ static int domain_setup_first_level(struct intel_iommu *iommu, flags); } +static bool dev_is_real_dma_subdevice(struct device *dev) +{ + return dev && dev_is_pci(dev) && + pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); +} + static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, int bus, int devfn, struct device *dev, @@ -5300,7 +5306,8 @@ static void __dmar_remove_one_dev_info(struct device_domain_info *info) PASID_RID2PASID); iommu_disable_dev_iotlb(info); - domain_context_clear(iommu, info->dev); + if (!dev_is_real_dma_subdevice(info->dev)) + domain_context_clear(iommu, info->dev); intel_pasid_free_table(info->dev); } -- Gitee From c42c027de484fec9ba06496baac5f14b04d9aa52 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 27 May 2020 10:56:16 -0600 Subject: [PATCH 0222/2161] iommu/vt-d: Allocate domain info for real DMA sub-devices commit 4fda230ecddc2573ed88632e98b69b0b9b68c0ad upstream. Sub-devices of a real DMA device might exist on a separate segment than the real DMA device and its IOMMU. These devices should still have a valid device_domain_info, but the current dma alias model won't allocate info for the subdevice. This patch adds a segment member to struct device_domain_info and uses the sub-device's BDF so that these sub-devices won't alias to other devices. 
Fixes: 2b0140c69637e ("iommu/vt-d: Use pci_real_dma_dev() for mapping") Cc: stable@vger.kernel.org # v5.6+ Signed-off-by: Jon Derrick Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20200527165617.297470-3-jonathan.derrick@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 19 +++++++++++++++---- include/linux/intel-iommu.h | 1 + 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index d0997fc4b4ea..a92df35be8a9 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2552,7 +2552,7 @@ dmar_search_domain_by_dev_info(int segment, int bus, int devfn) struct device_domain_info *info; list_for_each_entry(info, &device_domain_list, global) - if (info->iommu->segment == segment && info->bus == bus && + if (info->segment == segment && info->bus == bus && info->devfn == devfn) return info; @@ -2609,8 +2609,18 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, if (!info) return NULL; - info->bus = bus; - info->devfn = devfn; + if (!dev_is_real_dma_subdevice(dev)) { + info->bus = bus; + info->devfn = devfn; + info->segment = iommu->segment; + } else { + struct pci_dev *pdev = to_pci_dev(dev); + + info->bus = pdev->bus->number; + info->devfn = pdev->devfn; + info->segment = pci_domain_nr(pdev->bus); + } + info->ats_supported = info->pasid_supported = info->pri_supported = 0; info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0; info->ats_qdep = 0; @@ -2650,7 +2660,8 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, if (!found) { struct device_domain_info *info2; - info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn); + info2 = dmar_search_domain_by_dev_info(info->segment, info->bus, + info->devfn); if (info2) { found = info2->domain; info2->dev = dev; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 13ff71a56305..a7c6e63a39f0 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -573,6 +573,7 @@ struct device_domain_info { struct list_head auxiliary_domains; /* auxiliary domains * attached to this device */ + u32 segment; /* PCI segment number */ u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ u16 pfsid; /* SRIOV physical function source ID */ -- Gitee From a05ddcc7109abad029261c5f2a2a76b893bddc87 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 27 May 2020 10:56:17 -0600 Subject: [PATCH 0223/2161] iommu/vt-d: Remove real DMA lookup in find_domain commit bba9cc2cf82840bd3c9b3f4f7edac2dc8329c241 upstream. By removing the real DMA indirection in find_domain(), we can allow sub-devices of a real DMA device to have their own valid device_domain_info. The dmar lookup and context entry removal paths have been fixed to account for sub-devices. 
Fixes: 2b0140c69637 ("iommu/vt-d: Use pci_real_dma_dev() for mapping") Signed-off-by: Jon Derrick Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20200527165617.297470-4-jonathan.derrick@intel.com Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=207575 Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index a92df35be8a9..0e55a074abee 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2525,9 +2525,6 @@ struct dmar_domain *find_domain(struct device *dev) if (unlikely(attach_deferred(dev) || iommu_dummy(dev))) return NULL; - if (dev_is_pci(dev)) - dev = &pci_real_dma_dev(to_pci_dev(dev))->dev; - /* No lock here, assumes no domain exit in normal case */ info = dev->archdata.iommu; if (likely(info)) -- Gitee From 443672193b9ea629d69153e69a61fb4ba3384894 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Tue, 21 Jan 2020 06:37:47 -0700 Subject: [PATCH 0224/2161] PCI: Introduce pci_real_dma_dev() commit 2856ba6020fc5cbf051d5a75b2abb3046072c144 upstream. The current DMA alias implementation requires the aliased device be on the same PCI bus as the requester ID. Add an arch-specific mechanism to point to another PCI device when doing mapping and PCI DMA alias search. The default case returns the actual device. Link: https://lore.kernel.org/r/1579613871-301529-4-git-send-email-jonathan.derrick@intel.com Signed-off-by: Jon Derrick Signed-off-by: Bjorn Helgaas Cc: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/pci/common.c | 10 ++++++++++ drivers/pci/pci.c | 19 ++++++++++++++++++- drivers/pci/search.c | 6 ++++++ include/linux/pci.h | 1 + 4 files changed, 35 insertions(+), 1 deletion(-) diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index b339ce5e7f59..a0b44fe0d2a4 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -738,3 +738,13 @@ int pci_ext_cfg_avail(void) else return 0; } + +#if IS_ENABLED(CONFIG_VMD) +struct pci_dev *pci_real_dma_dev(struct pci_dev *dev) +{ + if (is_vmd(dev->bus)) + return to_pci_sysdata(dev->bus)->vmd_dev; + + return dev; +} +#endif diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 436bedbaf40b..a73e7cabca94 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -6059,7 +6059,9 @@ bool pci_devs_are_dma_aliases(struct pci_dev *dev1, struct pci_dev *dev2) return (dev1->dma_alias_mask && test_bit(dev2->devfn, dev1->dma_alias_mask)) || (dev2->dma_alias_mask && - test_bit(dev1->devfn, dev2->dma_alias_mask)); + test_bit(dev1->devfn, dev2->dma_alias_mask)) || + pci_real_dma_dev(dev1) == dev2 || + pci_real_dma_dev(dev2) == dev1; } bool pci_device_is_present(struct pci_dev *pdev) @@ -6083,6 +6085,21 @@ void pci_ignore_hotplug(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_ignore_hotplug); +/** + * pci_real_dma_dev - Get PCI DMA device for PCI device + * @dev: the PCI device that may have a PCI DMA alias + * + * Permits the platform to provide architecture-specific functionality to + * devices needing to alias DMA to another PCI device on another PCI bus. If + * the PCI device is on the same bus, it is recommended to use + * pci_add_dma_alias(). This is the default implementation. Architecture + * implementations can override this. 
+ */ +struct pci_dev __weak *pci_real_dma_dev(struct pci_dev *dev) +{ + return dev; +} + resource_size_t __weak pcibios_default_alignment(void) { return 0; diff --git a/drivers/pci/search.c b/drivers/pci/search.c index e4dbdef5aef0..2061672954ee 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -32,6 +32,12 @@ int pci_for_each_dma_alias(struct pci_dev *pdev, struct pci_bus *bus; int ret; + /* + * The device may have an explicit alias requester ID for DMA where the + * requester is on another PCI bus. + */ + pdev = pci_real_dma_dev(pdev); + ret = fn(pdev, pci_dev_id(pdev), data); if (ret) return ret; diff --git a/include/linux/pci.h b/include/linux/pci.h index 2665076fc842..b70d1e0565a7 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1221,6 +1221,7 @@ int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size); int pci_select_bars(struct pci_dev *dev, unsigned long flags); bool pci_device_is_present(struct pci_dev *pdev); void pci_ignore_hotplug(struct pci_dev *dev); +struct pci_dev *pci_real_dma_dev(struct pci_dev *dev); int __printf(6, 7) pci_request_irq(struct pci_dev *dev, unsigned int nr, irq_handler_t handler, irq_handler_t thread_fn, void *dev_id, -- Gitee From e56eccbb1eace16231eae0b697e943437edb9e46 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jan 2020 06:37:45 -0700 Subject: [PATCH 0225/2161] x86/PCI: Add to_pci_sysdata() helper commit aad6aa0cd674b7935957354666f82e5b2cdc813d upstream. Various helpers need the pci_sysdata just to dereference a single field in it. Add a little helper that returns the properly typed sysdata pointer to require a little less boilerplate code. [jonathan.derrick: to_pci_sysdata const argument] Link: https://lore.kernel.org/r/1579613871-301529-2-git-send-email-jonathan.derrick@intel.com Signed-off-by: Christoph Hellwig Signed-off-by: Jon Derrick Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/pci.h | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index e662f987dfa2..fa852642bb2c 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -37,12 +37,15 @@ extern int noioapicreroute; #ifdef CONFIG_PCI +static inline struct pci_sysdata *to_pci_sysdata(const struct pci_bus *bus) +{ + return bus->sysdata; +} + #ifdef CONFIG_PCI_DOMAINS static inline int pci_domain_nr(struct pci_bus *bus) { - struct pci_sysdata *sd = bus->sysdata; - - return sd->domain; + return to_pci_sysdata(bus)->domain; } static inline int pci_proc_domain(struct pci_bus *bus) @@ -54,24 +57,20 @@ static inline int pci_proc_domain(struct pci_bus *bus) #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN static inline void *_pci_root_bus_fwnode(struct pci_bus *bus) { - struct pci_sysdata *sd = bus->sysdata; - - return sd->fwnode; + return to_pci_sysdata(bus)->fwnode; } #define pci_root_bus_fwnode _pci_root_bus_fwnode #endif +#if IS_ENABLED(CONFIG_VMD) static inline bool is_vmd(struct pci_bus *bus) { -#if IS_ENABLED(CONFIG_VMD) - struct pci_sysdata *sd = bus->sysdata; - - return sd->vmd_domain; -#else - return false; -#endif + return to_pci_sysdata(bus)->vmd_domain; } +#else +#define is_vmd(bus) false +#endif /* CONFIG_VMD */ /* Can be used to override the logic in pci_scan_bus for skipping already-configured bus numbers - to be used for buggy BIOSes @@ -131,9 +130,7 @@ void native_restore_msi_irqs(struct pci_dev *dev); /* Returns the node based on pci bus */ static inline int 
__pcibus_to_node(const struct pci_bus *bus) { - const struct pci_sysdata *sd = bus->sysdata; - - return sd->node; + return to_pci_sysdata(bus)->node; } static inline const struct cpumask * -- Gitee From 4746812e7e1673e0ead13201978c94650b8986c7 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Tue, 21 Jan 2020 06:37:46 -0700 Subject: [PATCH 0226/2161] x86/PCI: Expose VMD's pci_dev in struct pci_sysdata commit 34067c56fa177d3582695da8e3e162ef78cd0371 upstream. Expose VMD's pci_dev pointer in struct pci_sysdata. This will be used indirectly by intel-iommu.c to find the correct domain. Link: https://lore.kernel.org/r/1579613871-301529-3-git-send-email-jonathan.derrick@intel.com [bhelgaas: commit log] Signed-off-by: Jon Derrick Signed-off-by: Bjorn Helgaas Cc: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/pci.h | 4 ++-- drivers/pci/controller/vmd.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index fa852642bb2c..394e0f708986 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -27,7 +27,7 @@ struct pci_sysdata { void *fwnode; /* IRQ domain for MSI assignment */ #endif #if IS_ENABLED(CONFIG_VMD) - bool vmd_domain; /* True if in Intel VMD domain */ + struct pci_dev *vmd_dev; /* VMD Device if in Intel VMD domain */ #endif }; @@ -66,7 +66,7 @@ static inline void *_pci_root_bus_fwnode(struct pci_bus *bus) #if IS_ENABLED(CONFIG_VMD) static inline bool is_vmd(struct pci_bus *bus) { - return to_pci_sysdata(bus)->vmd_domain; + return to_pci_sysdata(bus)->vmd_dev != NULL; } #else #define is_vmd(bus) false diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 9966dcf1d112..5a1827342b97 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -667,7 +667,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) .parent = res, }; - sd->vmd_domain = true; + sd->vmd_dev = vmd->dev; sd->domain = vmd_find_free_domain(); if (sd->domain < 0) return sd->domain; -- Gitee From d0e7ae3c654ce69d0ca332dfa08feab222a3a8ac Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 19 Mar 2020 21:32:31 -0700 Subject: [PATCH 0227/2161] iommu/vt-d: Add build dependency on IOASID commit 4a663dae47316ae8b97d5b77025fe7dfd9d3487f upstream. IOASID code is needed by VT-d scalable mode for PASID allocation. Add explicit dependency such that IOASID is built-in whenever Intel IOMMU is enabled. Otherwise, aux domain code will fail when IOMMU is built-in and IOASID is compiled as a module. Fixes: 59a623374dc38 ("iommu/vt-d: Replace Intel specific PASID allocator with IOASID") Signed-off-by: Jacob Pan Acked-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index c7f3bb83e9c1..800686499301 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -187,6 +187,7 @@ config INTEL_IOMMU select NEED_DMA_MAP_STATE select DMAR_TABLE select SWIOTLB + select IOASID help DMA remapping (DMAR) devices support enables independent address translations for Direct Memory Access (DMA) from devices. -- Gitee From 8fbdf9b1f4f37a087b253c66c9e6547709dadf12 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 26 Mar 2020 16:08:26 +0100 Subject: [PATCH 0228/2161] iommu: Define dev_iommu_fwspec_get() for !CONFIG_IOMMU_API commit 0008d0c3b1ab03b046b04b7bd9d70df1e2fffbfc upstream. 
There are users outside of the IOMMU code that need to call that function. Define it for !CONFIG_IOMMU_API too so that compilation does not break. Reported-by: kbuild test robot Signed-off-by: Joerg Roedel Reviewed-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20200326150841.10083-2-joro@8bytes.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/iommu.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e0d32f91324c..b41c5d6dd1e4 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1070,6 +1070,10 @@ static inline int iommu_sva_unbind_gpasid(struct iommu_domain *domain, return -ENODEV; } +static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) +{ + return NULL; +} #endif /* CONFIG_IOMMU_API */ #ifdef CONFIG_IOMMU_DEBUGFS -- Gitee From 7271fcfbbb790e416d5014d0dd3c63479d03928a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 26 Mar 2020 16:08:30 +0100 Subject: [PATCH 0229/2161] iommu: Rename struct iommu_param to dev_iommu commit 045a70426067d6a22e3e5745b55efc18fa75aabf upstream. The term dev_iommu aligns better with other existing structures and their accessor functions. Signed-off-by: Joerg Roedel Tested-by: Will Deacon # arm-smmu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Greg Kroah-Hartman Cc: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20200326150841.10083-6-joro@8bytes.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 28 ++++++++++++++-------------- include/linux/device.h | 6 +++--- include/linux/iommu.h | 4 ++-- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 8a9614c36ca4..34ca6def5663 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -152,9 +152,9 @@ void iommu_device_unregister(struct iommu_device *iommu) } EXPORT_SYMBOL_GPL(iommu_device_unregister); -static struct iommu_param *iommu_get_dev_param(struct device *dev) +static struct dev_iommu *dev_iommu_get(struct device *dev) { - struct iommu_param *param = dev->iommu_param; + struct dev_iommu *param = dev->iommu; if (param) return param; @@ -164,14 +164,14 @@ static struct iommu_param *iommu_get_dev_param(struct device *dev) return NULL; mutex_init(¶m->lock); - dev->iommu_param = param; + dev->iommu = param; return param; } -static void iommu_free_dev_param(struct device *dev) +static void dev_iommu_free(struct device *dev) { - kfree(dev->iommu_param); - dev->iommu_param = NULL; + kfree(dev->iommu); + dev->iommu = NULL; } int iommu_probe_device(struct device *dev) @@ -183,7 +183,7 @@ int iommu_probe_device(struct device *dev) if (!ops) return -EINVAL; - if (!iommu_get_dev_param(dev)) + if (!dev_iommu_get(dev)) return -ENOMEM; if (!try_module_get(ops->owner)) { @@ -200,7 +200,7 @@ int iommu_probe_device(struct device *dev) err_module_put: module_put(ops->owner); err_free_dev_param: - iommu_free_dev_param(dev); + dev_iommu_free(dev); return ret; } @@ -211,9 +211,9 @@ void iommu_release_device(struct device *dev) if (dev->iommu_group) ops->remove_device(dev); - if (dev->iommu_param) { + if (dev->iommu) { module_put(ops->owner); - iommu_free_dev_param(dev); + dev_iommu_free(dev); } } @@ -972,7 +972,7 @@ int iommu_register_device_fault_handler(struct device *dev, iommu_dev_fault_handler_t handler, void *data) { - struct iommu_param *param = dev->iommu_param; + struct dev_iommu *param = dev->iommu; int ret = 0; if (!param) @@ -1015,7 +1015,7 @@ 
EXPORT_SYMBOL_GPL(iommu_register_device_fault_handler); */ int iommu_unregister_device_fault_handler(struct device *dev) { - struct iommu_param *param = dev->iommu_param; + struct dev_iommu *param = dev->iommu; int ret = 0; if (!param) @@ -1055,7 +1055,7 @@ EXPORT_SYMBOL_GPL(iommu_unregister_device_fault_handler); */ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) { - struct iommu_param *param = dev->iommu_param; + struct dev_iommu *param = dev->iommu; struct iommu_fault_event *evt_pending = NULL; struct iommu_fault_param *fparam; int ret = 0; @@ -1104,7 +1104,7 @@ int iommu_page_response(struct device *dev, int ret = -EINVAL; struct iommu_fault_event *evt; struct iommu_fault_page_request *prm; - struct iommu_param *param = dev->iommu_param; + struct dev_iommu *param = dev->iommu; struct iommu_domain *domain = iommu_get_domain_for_dev(dev); if (!domain || !domain->ops->page_response) diff --git a/include/linux/device.h b/include/linux/device.h index 5add0c2deb87..ff36b66b8a95 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -43,7 +43,7 @@ struct iommu_ops; struct iommu_group; struct iommu_fwspec; struct dev_pin_info; -struct iommu_param; +struct dev_iommu; struct bus_attribute { struct attribute attr; @@ -1212,7 +1212,7 @@ struct dev_links_info { * device (i.e. the bus driver that discovered the device). * @iommu_group: IOMMU group the device belongs to. * @iommu_fwspec: IOMMU-specific properties supplied by firmware. - * @iommu_param: Per device generic IOMMU runtime data + * @iommu: Per device generic IOMMU runtime data * * @offline_disabled: If set, the device is permanently online. * @offline: Set after successful invocation of bus type's .offline(). @@ -1309,7 +1309,7 @@ struct device { void (*release)(struct device *dev); struct iommu_group *iommu_group; struct iommu_fwspec *iommu_fwspec; - struct iommu_param *iommu_param; + struct dev_iommu *iommu; bool offline_disabled:1; bool offline:1; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index b41c5d6dd1e4..9065c8759bb7 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -371,7 +371,7 @@ struct iommu_fault_param { }; /** - * struct iommu_param - collection of per-device IOMMU data + * struct dev_iommu - Collection of per-device IOMMU data * * @fault_param: IOMMU detected device fault reporting data * @@ -379,7 +379,7 @@ struct iommu_fault_param { * struct iommu_group *iommu_group; * struct iommu_fwspec *iommu_fwspec; */ -struct iommu_param { +struct dev_iommu { struct mutex lock; struct iommu_fault_param *fault_param; }; -- Gitee From 3eb78573e2d92e6a4cdff5809dd4f84768922650 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 26 Mar 2020 16:08:31 +0100 Subject: [PATCH 0230/2161] iommu: Move iommu_fwspec to struct dev_iommu commit 72acd9df18f12420001f901493c54b7364f34d60 upstream. Move the iommu_fwspec pointer in struct device into struct dev_iommu. This is a step in the effort to reduce the iommu related pointers in struct device to one. 
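
Since callers already go through the accessor added earlier in this series, the move does not change any call sites; an illustrative (hypothetical) user keeps doing:

    static const struct iommu_ops *example_fwspec_ops(struct device *dev)
    {
            struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);

            /*
             * Works the same whether fwspec hangs off struct device or
             * struct dev_iommu - only the accessor knows the location.
             */
            return fwspec ? fwspec->ops : NULL;
    }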
Signed-off-by: Joerg Roedel Tested-by: Will Deacon # arm-smmu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Greg Kroah-Hartman Cc: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20200326150841.10083-7-joro@8bytes.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 3 +++ include/linux/device.h | 3 --- include/linux/iommu.h | 12 ++++++++---- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 34ca6def5663..ac384f893e3d 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2418,6 +2418,9 @@ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, if (fwspec) return ops == fwspec->ops ? 0 : -EINVAL; + if (!dev_iommu_get(dev)) + return -ENOMEM; + /* Preallocate for the overwhelmingly common case of 1 ID */ fwspec = kzalloc(struct_size(fwspec, ids, 1), GFP_KERNEL); if (!fwspec) diff --git a/include/linux/device.h b/include/linux/device.h index ff36b66b8a95..e9850ad0cdf2 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -41,7 +41,6 @@ struct device_node; struct fwnode_handle; struct iommu_ops; struct iommu_group; -struct iommu_fwspec; struct dev_pin_info; struct dev_iommu; @@ -1211,7 +1210,6 @@ struct dev_links_info { * gone away. This should be set by the allocator of the * device (i.e. the bus driver that discovered the device). * @iommu_group: IOMMU group the device belongs to. - * @iommu_fwspec: IOMMU-specific properties supplied by firmware. * @iommu: Per device generic IOMMU runtime data * * @offline_disabled: If set, the device is permanently online. @@ -1308,7 +1306,6 @@ struct device { void (*release)(struct device *dev); struct iommu_group *iommu_group; - struct iommu_fwspec *iommu_fwspec; struct dev_iommu *iommu; bool offline_disabled:1; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 9065c8759bb7..82730a489846 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -374,14 +374,15 @@ struct iommu_fault_param { * struct dev_iommu - Collection of per-device IOMMU data * * @fault_param: IOMMU detected device fault reporting data + * @fwspec: IOMMU fwspec data * * TODO: migrate other per device data pointers under iommu_dev_data, e.g. * struct iommu_group *iommu_group; - * struct iommu_fwspec *iommu_fwspec; */ struct dev_iommu { struct mutex lock; - struct iommu_fault_param *fault_param; + struct iommu_fault_param *fault_param; + struct iommu_fwspec *fwspec; }; int iommu_device_register(struct iommu_device *iommu); @@ -611,13 +612,16 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode); static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) { - return dev->iommu_fwspec; + if (dev->iommu) + return dev->iommu->fwspec; + else + return NULL; } static inline void dev_iommu_fwspec_set(struct device *dev, struct iommu_fwspec *fwspec) { - dev->iommu_fwspec = fwspec; + dev->iommu->fwspec = fwspec; } int iommu_probe_device(struct device *dev); -- Gitee From 7c6b6e9c7d121adead5c58e2768a700d8c84ecac Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 26 Mar 2020 16:08:33 +0100 Subject: [PATCH 0231/2161] iommu: Introduce accessors for iommu private data commit f9867f416ee721141e1664810516b8ebc2563cdd upstream. Add dev_iommu_priv_get/set() functions to access per-device iommu private data. This makes it easier to move the pointer to a different location. 
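
A before/after sketch of a hypothetical driver call site (the accessors themselves are defined in the hunk below):

    static void example_store_priv(struct device *dev, void *drv_data)
    {
            /* Before: dev_iommu_fwspec_get(dev)->iommu_priv = drv_data; */
            dev_iommu_priv_set(dev, drv_data);
    }

    static void *example_load_priv(struct device *dev)
    {
            /* Before: return dev_iommu_fwspec_get(dev)->iommu_priv; */
            return dev_iommu_priv_get(dev);
    }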
Signed-off-by: Joerg Roedel Tested-by: Will Deacon # arm-smmu Reviewed-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20200326150841.10083-9-joro@8bytes.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/iommu.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 82730a489846..51c61199c6f3 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -624,6 +624,16 @@ static inline void dev_iommu_fwspec_set(struct device *dev, dev->iommu->fwspec = fwspec; } +static inline void *dev_iommu_priv_get(struct device *dev) +{ + return dev->iommu->fwspec->iommu_priv; +} + +static inline void dev_iommu_priv_set(struct device *dev, void *priv) +{ + dev->iommu->fwspec->iommu_priv = priv; +} + int iommu_probe_device(struct device *dev); void iommu_release_device(struct device *dev); -- Gitee From 4a9714aa8fd6f64bfb6f213bf613d012ca810daa Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 26 Mar 2020 16:08:41 +0100 Subject: [PATCH 0232/2161] iommu: Move fwspec->iommu_priv to struct dev_iommu commit 986d5ecc56999800a5d112a70e88522d9212aefd upstream. Move the pointer for iommu private data from struct iommu_fwspec to struct dev_iommu. Signed-off-by: Joerg Roedel Tested-by: Will Deacon # arm-smmu Reviewed-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20200326150841.10083-17-joro@8bytes.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/iommu.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 51c61199c6f3..7954803056fd 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -375,6 +375,7 @@ struct iommu_fault_param { * * @fault_param: IOMMU detected device fault reporting data * @fwspec: IOMMU fwspec data + * @priv: IOMMU Driver private data * * TODO: migrate other per device data pointers under iommu_dev_data, e.g. * struct iommu_group *iommu_group; @@ -383,6 +384,7 @@ struct dev_iommu { struct mutex lock; struct iommu_fault_param *fault_param; struct iommu_fwspec *fwspec; + void *priv; }; int iommu_device_register(struct iommu_device *iommu); @@ -587,7 +589,6 @@ struct iommu_group *fsl_mc_device_group(struct device *dev); struct iommu_fwspec { const struct iommu_ops *ops; struct fwnode_handle *iommu_fwnode; - void *iommu_priv; u32 flags; unsigned int num_ids; u32 ids[]; @@ -626,12 +627,12 @@ static inline void dev_iommu_fwspec_set(struct device *dev, static inline void *dev_iommu_priv_get(struct device *dev) { - return dev->iommu->fwspec->iommu_priv; + return dev->iommu->priv; } static inline void dev_iommu_priv_set(struct device *dev, void *priv) { - dev->iommu->fwspec->iommu_priv = priv; + dev->iommu->priv = priv; } int iommu_probe_device(struct device *dev); -- Gitee From 7e82c86ef4c9993e3c1fb9331eb69f8d4bf11e37 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Thu, 2 Apr 2020 22:37:49 +0800 Subject: [PATCH 0233/2161] iommu: Fix the memory leak in dev_iommu_free() commit 5375e874c7634f0e1795ec4b37260b724d481e86 upstream. In iommu_probe_device(), we would invoke dev_iommu_free() to free the dev->iommu after the ->add_device() returns failure. But after commit 72acd9df18f1 ("iommu: Move iommu_fwspec to struct dev_iommu"), we also need to free the iommu_fwspec before the dev->iommu is freed. 
This fixes the following memory leak reported by kmemleak: unreferenced object 0xffff000bc836c700 (size 128): comm "swapper/0", pid 1, jiffies 4294896304 (age 782.120s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 d8 cd 9b ff 0b 00 ff ff ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000df34077b>] kmem_cache_alloc_trace+0x244/0x4b0 [<000000000e560ac0>] iommu_fwspec_init+0x7c/0xb0 [<0000000075eda275>] of_iommu_xlate+0x80/0xe8 [<00000000728d6bf9>] of_pci_iommu_init+0xb0/0xb8 [<00000000d001fe6f>] pci_for_each_dma_alias+0x48/0x190 [<000000006db6bbce>] of_iommu_configure+0x1ac/0x1d0 [<00000000634745f8>] of_dma_configure+0xdc/0x220 [<000000002cbc8ba0>] pci_dma_configure+0x50/0x78 [<00000000cdf6e193>] really_probe+0x8c/0x340 [<00000000fddddc46>] driver_probe_device+0x60/0xf8 [<0000000061bcdb51>] __device_attach_driver+0x8c/0xd0 [<000000009b9ff58e>] bus_for_each_drv+0x80/0xd0 [<000000004b9c8aa3>] __device_attach+0xec/0x148 [<00000000a5c13bf3>] device_attach+0x1c/0x28 [<000000005071e151>] pci_bus_add_device+0x58/0xd0 [<000000002d4f87d1>] pci_bus_add_devices+0x40/0x90 Fixes: 72acd9df18f1 ("iommu: Move iommu_fwspec to struct dev_iommu") Signed-off-by: Kevin Hao Link: https://lore.kernel.org/r/20200402143749.40500-1-haokexin@gmail.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index ac384f893e3d..771e2ef37721 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -170,6 +170,7 @@ static struct dev_iommu *dev_iommu_get(struct device *dev) static void dev_iommu_free(struct device *dev) { + iommu_fwspec_free(dev); kfree(dev->iommu); dev->iommu = NULL; } -- Gitee From 192adba7ef37780e7e153427d0ccbc0046aa5fb1 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 1 May 2020 15:24:27 +0800 Subject: [PATCH 0234/2161] iommu/vt-d: Use right Kconfig option name commit ba61c3da00f4a5bf8805aeca1ba5ac3c9bd82e96 upstream. The CONFIG_ prefix should be added in the code. Fixes: 046182525db61 ("iommu/vt-d: Add Kconfig option to enable/disable scalable mode") Reported-and-tested-by: Kumar, Sanjay K Signed-off-by: Lu Baolu Cc: Ashok Raj Link: https://lore.kernel.org/r/20200501072427.14265-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 0e55a074abee..2de914fae540 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -371,11 +371,11 @@ int dmar_disabled = 0; int dmar_disabled = 1; #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */ -#ifdef INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON +#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON int intel_iommu_sm = 1; #else int intel_iommu_sm; -#endif /* INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */ +#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */ int intel_iommu_enabled = 0; EXPORT_SYMBOL_GPL(intel_iommu_enabled); -- Gitee From 82f8874d65f37f3d220b1cab53e7f9999b4afacd Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 30 Apr 2020 14:01:20 +0200 Subject: [PATCH 0235/2161] iommu: Properly export iommu_group_get_for_dev() commit ae74c19faa7d7996e857e13165bd40fc4a285e0d upstream. 
In commit a7ba5c3d008d ("drivers/iommu: Export core IOMMU API symbols to permit modular drivers") a bunch of iommu symbols were exported, all with _GPL markings except iommu_group_get_for_dev(). That export should also be _GPL like the others. Fixes: a7ba5c3d008d ("drivers/iommu: Export core IOMMU API symbols to permit modular drivers") Signed-off-by: Greg Kroah-Hartman Acked-by: Will Deacon Cc: Joerg Roedel Cc: John Garry Cc: Will Deacon Link: https://lore.kernel.org/r/20200430120120.2948448-1-gregkh@linuxfoundation.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 771e2ef37721..f6055508f84d 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1442,7 +1442,7 @@ struct iommu_group *iommu_group_get_for_dev(struct device *dev) return group; } -EXPORT_SYMBOL(iommu_group_get_for_dev); +EXPORT_SYMBOL_GPL(iommu_group_get_for_dev); struct iommu_domain *iommu_group_default_domain(struct iommu_group *group) { -- Gitee From cedd5c0f8cc26487b833418bc136e7f05c9ccdc1 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 17 Nov 2021 21:03:03 +0800 Subject: [PATCH 0236/2161] iommu: Move default domain allocation to separate function commit ff2a08b39bcede1b08d84d8b5c8ee1336a39c5df upstream. Move the code out of iommu_group_get_for_dev() into a separate function. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-2-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 104 +++++++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 43 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index f6055508f84d..77f8c1454a54 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1362,6 +1362,56 @@ struct iommu_group *fsl_mc_device_group(struct device *dev) } EXPORT_SYMBOL_GPL(fsl_mc_device_group); +static int iommu_alloc_default_domain(struct device *dev, + struct iommu_group *group) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + struct iommu_domain *dom; +#ifdef CONFIG_SMMU_BYPASS_DEV + unsigned int type = iommu_def_domain_type; +#endif + + if (group->default_domain) + return 0; + +#ifdef CONFIG_SMMU_BYPASS_DEV + /* direct allocate required default domain type for some specific devices. 
*/ + if (ops->device_domain_type != NULL) { + if (ops->device_domain_type(dev, &type)) + type = iommu_def_domain_type; + } + + dom = __iommu_domain_alloc(dev->bus, type); + if (!dom && type != IOMMU_DOMAIN_DMA) { +#else + dom = __iommu_domain_alloc(dev->bus, iommu_def_domain_type); + if (!dom && iommu_def_domain_type != IOMMU_DOMAIN_DMA) { +#endif + dom = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_DMA); + if (dom) { + dev_warn(dev, + "failed to allocate default IOMMU domain of type %u; falling back to IOMMU_DOMAIN_DMA", + iommu_def_domain_type); + } + } + + if (!dom) + return -ENOMEM; + + group->default_domain = dom; + if (!group->domain) + group->domain = dom; + + if (!iommu_dma_strict) { + int attr = 1; + iommu_domain_set_attr(dom, + DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, + &attr); + } + + return 0; +} + /** * iommu_group_get_for_dev - Find or create the IOMMU group for a device * @dev: target device @@ -1377,9 +1427,7 @@ struct iommu_group *iommu_group_get_for_dev(struct device *dev) const struct iommu_ops *ops = dev->bus->iommu_ops; struct iommu_group *group; int ret; -#ifdef CONFIG_SMMU_BYPASS_DEV - unsigned int type = iommu_def_domain_type; -#endif + group = iommu_group_get(dev); if (group) return group; @@ -1396,51 +1444,21 @@ struct iommu_group *iommu_group_get_for_dev(struct device *dev) /* * Try to allocate a default domain - needs support from the - * IOMMU driver. + * IOMMU driver. There are still some drivers which don't support + * default domains, so the return value is not yet checked. */ - if (!group->default_domain) { - struct iommu_domain *dom; - -#ifdef CONFIG_SMMU_BYPASS_DEV - /* direct allocate required default domain type for some specific devices. */ - if (ops->device_domain_type != NULL) { - if (ops->device_domain_type(dev, &type)) - type = iommu_def_domain_type; - } - - dom = __iommu_domain_alloc(dev->bus, type); - if (!dom && type != IOMMU_DOMAIN_DMA) { -#else - dom = __iommu_domain_alloc(dev->bus, iommu_def_domain_type); - if (!dom && iommu_def_domain_type != IOMMU_DOMAIN_DMA) { -#endif - dom = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_DMA); - if (dom) { - dev_warn(dev, - "failed to allocate default IOMMU domain of type %u; falling back to IOMMU_DOMAIN_DMA", - iommu_def_domain_type); - } - } - - group->default_domain = dom; - if (!group->domain) - group->domain = dom; - - if (dom && !iommu_dma_strict) { - int attr = 1; - iommu_domain_set_attr(dom, - DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, - &attr); - } - } + iommu_alloc_default_domain(dev, group); ret = iommu_group_add_device(group, dev); - if (ret) { - iommu_group_put(group); - return ERR_PTR(ret); - } + if (ret) + goto out_put_group; return group; + +out_put_group: + iommu_group_put(group); + + return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(iommu_group_get_for_dev); -- Gitee From 1c89ddd04a245605bd4d31f0a277aa621c2b60e2 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 17 Nov 2021 21:06:04 +0800 Subject: [PATCH 0237/2161] iommu: Add def_domain_type() callback in iommu_ops commit 4cbf38511a007867def958872203ae8adb8e2351 upstream. Some devices are reqired to use a specific type (identity or dma) of default domain when they are used with a vendor iommu. When the system level default domain type is different from it, the vendor iommu driver has to request a new default domain with iommu_request_dma_domain_for_dev() and iommu_request_dm_for_dev() in the add_dev() callback. 
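The helper factored out above keeps the existing policy: try the preferred default-domain type first, then fall back to a DMA domain with a warning. A minimal userspace sketch of that policy, with invented enum values and helpers; the stubbed allocator pretends identity domains cannot be allocated so the fallback path is exercised.

#include <stdio.h>
#include <stdlib.h>

enum domain_type { DOMAIN_IDENTITY = 1, DOMAIN_DMA = 2 };

struct domain { enum domain_type type; };

/* stand-in for the real domain allocator; identity allocation is made to
 * fail so the fallback runs */
static struct domain *domain_alloc(enum domain_type type)
{
    struct domain *dom;

    if (type == DOMAIN_IDENTITY)
        return NULL;
    dom = malloc(sizeof(*dom));
    if (dom)
        dom->type = type;
    return dom;
}

static struct domain *alloc_default_domain(enum domain_type preferred)
{
    struct domain *dom = domain_alloc(preferred);

    if (!dom && preferred != DOMAIN_DMA) {
        dom = domain_alloc(DOMAIN_DMA);
        if (dom)
            fprintf(stderr, "falling back from type %d to DMA domain\n",
                    preferred);
    }
    return dom;
}

int main(void)
{
    struct domain *dom = alloc_default_domain(DOMAIN_IDENTITY);

    if (!dom)
        return 1;
    printf("got domain type %d\n", dom->type);
    free(dom);
    return 0;
}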
Unfortunately, these two helpers only work when the group hasn't been assigned to any other devices, hence, some vendor iommu driver has to use a private domain if it fails to request a new default one. This adds def_domain_type() callback in the iommu_ops, so that any special requirement of default domain for a device could be aware by the iommu generic layer. Signed-off-by: Sai Praneeth Prakhya Signed-off-by: Lu Baolu [ jroedel@suse.de: Added iommu_get_def_domain_type() function and use it to allocate the default domain ] Co-developed-by: Joerg Roedel Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-3-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 22 +++++++++++++++------- include/linux/iommu.h | 6 ++++++ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 77f8c1454a54..125bae7f4b41 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1362,14 +1362,23 @@ struct iommu_group *fsl_mc_device_group(struct device *dev) } EXPORT_SYMBOL_GPL(fsl_mc_device_group); +static int iommu_get_def_domain_type(struct device *dev) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + unsigned int type = 0; + + if (ops->def_domain_type) + type = ops->def_domain_type(dev); + + return (type == 0) ? iommu_def_domain_type : type; +} + static int iommu_alloc_default_domain(struct device *dev, struct iommu_group *group) { const struct iommu_ops *ops = dev->bus->iommu_ops; struct iommu_domain *dom; -#ifdef CONFIG_SMMU_BYPASS_DEV unsigned int type = iommu_def_domain_type; -#endif if (group->default_domain) return 0; @@ -1380,18 +1389,17 @@ static int iommu_alloc_default_domain(struct device *dev, if (ops->device_domain_type(dev, &type)) type = iommu_def_domain_type; } +#else + type = iommu_get_def_domain_type(dev); +#endif dom = __iommu_domain_alloc(dev->bus, type); if (!dom && type != IOMMU_DOMAIN_DMA) { -#else - dom = __iommu_domain_alloc(dev->bus, iommu_def_domain_type); - if (!dom && iommu_def_domain_type != IOMMU_DOMAIN_DMA) { -#endif dom = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_DMA); if (dom) { dev_warn(dev, "failed to allocate default IOMMU domain of type %u; falling back to IOMMU_DOMAIN_DMA", - iommu_def_domain_type); + type); } } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7954803056fd..effd8d172600 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -248,6 +248,10 @@ struct iommu_iotlb_gather { * @cache_invalidate: invalidate translation caches * @sva_bind_gpasid: bind guest pasid and mm * @sva_unbind_gpasid: unbind guest pasid and mm + * @def_domain_type: device default domain type, return value: + * - IOMMU_DOMAIN_IDENTITY: must use an identity domain + * - IOMMU_DOMAIN_DMA: must use a dma domain + * - 0: use the default setting * @pgsize_bitmap: bitmap of all possible supported page sizes * @owner: Driver module providing these ops */ @@ -318,6 +322,8 @@ struct iommu_ops { int (*sva_unbind_gpasid)(struct device *dev, int pasid); + int (*def_domain_type)(struct device *dev); + unsigned long pgsize_bitmap; struct module *owner; -- Gitee From 0f47320634629068dab41b350d652c740b728690 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:41 +0200 Subject: [PATCH 0238/2161] iommu/amd: Implement iommu_ops->def_domain_type call-back commit bdf4a7c4c77dcb91bd64b53b70d9faf3184e88d8 upstream. 
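A minimal sketch of the def_domain_type() contract added above, in standalone C with invented names: the optional driver call-back may demand identity or DMA, and 0 means no preference, in which case the system-wide default applies (compare the AMD call-back in the next patch).

#include <stdio.h>

enum domain_type { DOMAIN_ANY = 0, DOMAIN_IDENTITY = 1, DOMAIN_DMA = 2 };

struct device { int wants_identity; };

struct iommu_ops {
    /* optional: return a required type for this device, or 0 */
    enum domain_type (*def_domain_type)(struct device *dev);
};

static enum domain_type system_default = DOMAIN_DMA;

static enum domain_type get_def_domain_type(const struct iommu_ops *ops,
                                            struct device *dev)
{
    enum domain_type type = DOMAIN_ANY;

    if (ops->def_domain_type)
        type = ops->def_domain_type(dev);

    return (type == DOMAIN_ANY) ? system_default : type;
}

/* example driver call-back: some devices require an identity domain */
static enum domain_type demo_def_domain_type(struct device *dev)
{
    return dev->wants_identity ? DOMAIN_IDENTITY : DOMAIN_ANY;
}

int main(void)
{
    const struct iommu_ops ops = { .def_domain_type = demo_def_domain_type };
    struct device a = { .wants_identity = 1 };
    struct device b = { .wants_identity = 0 };

    printf("a -> %d, b -> %d\n",
           get_def_domain_type(&ops, &a), get_def_domain_type(&ops, &b));
    return 0;
}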
Implement the new def_domain_type call-back for the AMD IOMMU driver. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-4-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 5e269f3dca3d..e71f79c11f7d 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3274,6 +3274,20 @@ static void amd_iommu_iotlb_sync(struct iommu_domain *domain, amd_iommu_flush_iotlb_all(domain); } +static int amd_iommu_def_domain_type(struct device *dev) +{ + struct iommu_dev_data *dev_data; + + dev_data = get_dev_data(dev); + if (!dev_data) + return 0; + + if (dev_data->iommu_v2) + return IOMMU_DOMAIN_IDENTITY; + + return 0; +} + const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .domain_alloc = amd_iommu_domain_alloc, @@ -3293,6 +3307,7 @@ const struct iommu_ops amd_iommu_ops = { .pgsize_bitmap = AMD_IOMMU_PGSIZES, .flush_iotlb_all = amd_iommu_flush_iotlb_all, .iotlb_sync = amd_iommu_iotlb_sync, + .def_domain_type = amd_iommu_def_domain_type, }; /***************************************************************************** -- Gitee From 38ef8d41077111c922f4388c2ba60d7c1abf749e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:42 +0200 Subject: [PATCH 0239/2161] iommu/vt-d: Wire up iommu_ops->def_domain_type commit 7039d11b3e4af77cf5ac1f689ae395b2a183bd25 upstream. The Intel VT-d driver already has a matching function to determine the default domain type for a device. Wire it up in intel_iommu_ops. Signed-off-by: Joerg Roedel Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20200429133712.31431-5-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 2de914fae540..0ecd9b674d0a 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -6255,6 +6255,7 @@ const struct iommu_ops intel_iommu_ops = { .dev_enable_feat = intel_iommu_dev_enable_feat, .dev_disable_feat = intel_iommu_dev_disable_feat, .is_attach_deferred = intel_iommu_is_attach_deferred, + .def_domain_type = device_def_domain_type, .pgsize_bitmap = INTEL_IOMMU_PGSIZES, }; -- Gitee From 8337d44116ef4711d7ead1891f9317dcbc7f9073 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:45 +0200 Subject: [PATCH 0240/2161] iommu: Add probe_device() and release_device() call-backs commit a6a4c7e2c5b8b981d1c546a393ff21f2112468c3 upstream. Add call-backs to 'struct iommu_ops' as an alternative to the add_device() and remove_device() call-backs, which will be removed when all drivers are converted. The new call-backs will not setup IOMMU groups and domains anymore, so also add a probe_finalize() call-back where the IOMMU driver can do per-device setup work which require the device to be set up with a group and a domain. 
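A minimal sketch, in standalone C with invented types, of the ordering this split establishes: the driver's probe_device() only claims the device, the core then performs the group and domain setup, and probe_finalize() runs last, once the device is fully set up.

#include <stdio.h>

struct device { const char *name; int in_group; int attached; };

struct iommu_ops {
    int  (*probe_device)(struct device *dev);
    void (*release_device)(struct device *dev);
    void (*probe_finalize)(struct device *dev);
};

static int demo_probe(struct device *dev)
{
    printf("%s: claimed by driver\n", dev->name);
    return 0;
}

static void demo_finalize(struct device *dev)
{
    /* runs only after the core gave the device a group and a domain */
    printf("%s: finalize (in_group=%d attached=%d)\n",
           dev->name, dev->in_group, dev->attached);
}

static int core_probe(const struct iommu_ops *ops, struct device *dev)
{
    int ret = ops->probe_device(dev);

    if (ret)
        return ret;

    dev->in_group = 1;    /* core-side group setup */
    dev->attached = 1;    /* core-side default-domain attach */

    if (ops->probe_finalize)
        ops->probe_finalize(dev);
    return 0;
}

int main(void)
{
    const struct iommu_ops ops = {
        .probe_device   = demo_probe,
        .probe_finalize = demo_finalize,
    };
    struct device dev = { .name = "dev0" };

    return core_probe(&ops, &dev);
}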
Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-8-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 63 ++++++++++++++++++++++++++++++++++++++----- include/linux/iommu.h | 9 +++++++ 2 files changed, 66 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 125bae7f4b41..77305e5a0854 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -175,6 +175,36 @@ static void dev_iommu_free(struct device *dev) dev->iommu = NULL; } +static int __iommu_probe_device(struct device *dev) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + struct iommu_device *iommu_dev; + struct iommu_group *group; + int ret; + + iommu_dev = ops->probe_device(dev); + if (IS_ERR(iommu_dev)) + return PTR_ERR(iommu_dev); + + dev->iommu->iommu_dev = iommu_dev; + + group = iommu_group_get_for_dev(dev); + if (!IS_ERR(group)) { + ret = PTR_ERR(group); + goto out_release; + } + iommu_group_put(group); + + iommu_device_link(iommu_dev, dev); + + return 0; + +out_release: + ops->release_device(dev); + + return ret; +} + int iommu_probe_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; @@ -192,10 +222,17 @@ int iommu_probe_device(struct device *dev) goto err_free_dev_param; } - ret = ops->add_device(dev); + if (ops->probe_device) + ret = __iommu_probe_device(dev); + else + ret = ops->add_device(dev); + if (ret) goto err_module_put; + if (ops->probe_finalize) + ops->probe_finalize(dev); + return 0; err_module_put: @@ -205,17 +242,31 @@ int iommu_probe_device(struct device *dev) return ret; } +static void __iommu_release_device(struct device *dev) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + + iommu_device_unlink(dev->iommu->iommu_dev, dev); + + iommu_group_remove_device(dev); + + ops->release_device(dev); +} + void iommu_release_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; - if (dev->iommu_group) + if (!dev->iommu) + return; + + if (ops->release_device) + __iommu_release_device(dev); + else if (dev->iommu_group) ops->remove_device(dev); - if (dev->iommu) { - module_put(ops->owner); - dev_iommu_free(dev); - } + module_put(ops->owner); + dev_iommu_free(dev); } static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index effd8d172600..078cac609362 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -225,6 +225,10 @@ struct iommu_iotlb_gather { * @iova_to_phys: translate iova to physical address * @add_device: add device to iommu grouping * @remove_device: remove device from iommu grouping + * @probe_device: Add device to iommu driver handling + * @release_device: Remove device from iommu driver handling + * @probe_finalize: Do final setup work after the device is added to an IOMMU + * group and attached to the groups domain * @device_group: find iommu group for a particular device * @domain_get_attr: Query domain attributes * @domain_set_attr: Change domain attributes @@ -275,6 +279,9 @@ struct iommu_ops { phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); int (*add_device)(struct device *dev); void (*remove_device)(struct device *dev); + struct iommu_device *(*probe_device)(struct device *dev); + void (*release_device)(struct device *dev); + void (*probe_finalize)(struct device *dev); struct iommu_group *(*device_group)(struct device 
*dev); int (*domain_get_attr)(struct iommu_domain *domain, enum iommu_attr attr, void *data); @@ -381,6 +388,7 @@ struct iommu_fault_param { * * @fault_param: IOMMU detected device fault reporting data * @fwspec: IOMMU fwspec data + * @iommu_dev: IOMMU device this device is linked to * @priv: IOMMU Driver private data * * TODO: migrate other per device data pointers under iommu_dev_data, e.g. @@ -390,6 +398,7 @@ struct dev_iommu { struct mutex lock; struct iommu_fault_param *fault_param; struct iommu_fwspec *fwspec; + struct iommu_device *iommu_dev; void *priv; }; -- Gitee From 0bc428c7d1e5bce76204a9d22ab4e3d529b774af Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 17 Nov 2021 21:14:42 +0800 Subject: [PATCH 0241/2161] iommu: Move default domain allocation to iommu_probe_device() commit ff2a08b39bcede1b08d84d8b5c8ee1336a39c5df upstream. Well, not really. The call to iommu_alloc_default_domain() in iommu_group_get_for_dev() has to stay around as long as there are IOMMU drivers using the add/remove_device() call-backs instead of probe/release_device(). Those drivers expect that iommu_group_get_for_dev() returns the device attached to a group and the group set up with a default domain (and the device attached to the groups current domain). But when all drivers are converted this compatability mess can be removed. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-9-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 120 ++++++++++++++++++++++++++++-------------- 1 file changed, 80 insertions(+), 40 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 77305e5a0854..0fa93606726c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -79,6 +79,16 @@ static bool iommu_cmd_line_dma_api(void) return !!(iommu_cmd_line & IOMMU_CMD_LINE_DMA_API); } +static int iommu_alloc_default_domain(struct device *dev); +static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, + unsigned type); +static int __iommu_attach_device(struct iommu_domain *domain, + struct device *dev); +static int __iommu_attach_group(struct iommu_domain *domain, + struct iommu_group *group); +static void __iommu_detach_group(struct iommu_domain *domain, + struct iommu_group *group); + #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \ struct iommu_group_attribute iommu_group_attr_##_name = \ __ATTR(_name, _mode, _show, _store) @@ -222,10 +232,29 @@ int iommu_probe_device(struct device *dev) goto err_free_dev_param; } - if (ops->probe_device) + if (ops->probe_device) { + struct iommu_group *group; + ret = __iommu_probe_device(dev); - else + + /* + * Try to allocate a default domain - needs support from the + * IOMMU driver. There are still some drivers which don't + * support default domains, so the return value is not yet + * checked. 
+ */ + if (!ret) + iommu_alloc_default_domain(dev); + + group = iommu_group_get(dev); + if (group && group->default_domain) { + ret = __iommu_attach_device(group->default_domain, dev); + iommu_group_put(group); + } + + } else { ret = ops->add_device(dev); + } if (ret) goto err_module_put; @@ -269,15 +298,6 @@ void iommu_release_device(struct device *dev) dev_iommu_free(dev); } -static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, - unsigned type); -static int __iommu_attach_device(struct iommu_domain *domain, - struct device *dev); -static int __iommu_attach_group(struct iommu_domain *domain, - struct iommu_group *group); -static void __iommu_detach_group(struct iommu_domain *domain, - struct iommu_group *group); - static int __init iommu_set_def_domain_type(char *str) { bool pt; @@ -1424,34 +1444,18 @@ static int iommu_get_def_domain_type(struct device *dev) return (type == 0) ? iommu_def_domain_type : type; } -static int iommu_alloc_default_domain(struct device *dev, - struct iommu_group *group) +static int iommu_group_alloc_default_domain(struct bus_type *bus, + struct iommu_group *group, + unsigned int type) { - const struct iommu_ops *ops = dev->bus->iommu_ops; struct iommu_domain *dom; - unsigned int type = iommu_def_domain_type; - - if (group->default_domain) - return 0; - -#ifdef CONFIG_SMMU_BYPASS_DEV - /* direct allocate required default domain type for some specific devices. */ - if (ops->device_domain_type != NULL) { - if (ops->device_domain_type(dev, &type)) - type = iommu_def_domain_type; - } -#else - type = iommu_get_def_domain_type(dev); -#endif - dom = __iommu_domain_alloc(dev->bus, type); + dom = __iommu_domain_alloc(bus, type); if (!dom && type != IOMMU_DOMAIN_DMA) { - dom = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_DMA); - if (dom) { - dev_warn(dev, - "failed to allocate default IOMMU domain of type %u; falling back to IOMMU_DOMAIN_DMA", - type); - } + dom = __iommu_domain_alloc(bus, IOMMU_DOMAIN_DMA); + if (dom) + pr_warn("Failed to allocate default IOMMU domain of type %u for group %s - Falling back to IOMMU_DOMAIN_DMA", + type, group->name); } if (!dom) @@ -1471,6 +1475,32 @@ static int iommu_alloc_default_domain(struct device *dev, return 0; } +static int iommu_alloc_default_domain(struct device *dev) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + struct iommu_group *group; + unsigned int type = iommu_def_domain_type; + + group = iommu_group_get(dev); + if (!group) + return -ENODEV; + + if (group->default_domain) + return 0; + +#ifdef CONFIG_SMMU_BYPASS_DEV + /* direct allocate required default domain type for some specific devices. */ + if (ops->device_domain_type != NULL) { + if (ops->device_domain_type(dev, &type)) + type = iommu_def_domain_type; + } +#else + type = iommu_get_def_domain_type(dev); +#endif + + return iommu_group_alloc_default_domain(dev->bus, group, type); +} + /** * iommu_group_get_for_dev - Find or create the IOMMU group for a device * @dev: target device @@ -1501,16 +1531,26 @@ struct iommu_group *iommu_group_get_for_dev(struct device *dev) if (IS_ERR(group)) return group; + ret = iommu_group_add_device(group, dev); + if (ret) + goto out_put_group; + /* * Try to allocate a default domain - needs support from the * IOMMU driver. There are still some drivers which don't support - * default domains, so the return value is not yet checked. + * default domains, so the return value is not yet checked. Only + * allocate the domain here when the driver still has the + * add_device/remove_device call-backs implemented. 
*/ - iommu_alloc_default_domain(dev, group); + if (!ops->probe_device) { + iommu_alloc_default_domain(dev); - ret = iommu_group_add_device(group, dev); - if (ret) - goto out_put_group; + if (group->default_domain) + ret = __iommu_attach_device(group->default_domain, dev); + + if (ret) + goto out_put_group; + } return group; -- Gitee From 112b89b5c7ba819798593173c8c8ae4f0dba125a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:47 +0200 Subject: [PATCH 0242/2161] iommu: Keep a list of allocated groups in __iommu_probe_device() commit 41df6dcc0a3ff4fb654c3d969ab96ba9c4f0e796 upstream. This is needed to defer default_domain allocation for new IOMMU groups until all devices have been added to the group. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-10-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 0fa93606726c..dbadb1ab00b6 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -44,6 +44,7 @@ struct iommu_group { int id; struct iommu_domain *default_domain; struct iommu_domain *domain; + struct list_head entry; }; struct group_device { @@ -185,7 +186,7 @@ static void dev_iommu_free(struct device *dev) dev->iommu = NULL; } -static int __iommu_probe_device(struct device *dev) +static int __iommu_probe_device(struct device *dev, struct list_head *group_list) { const struct iommu_ops *ops = dev->bus->iommu_ops; struct iommu_device *iommu_dev; @@ -205,6 +206,9 @@ static int __iommu_probe_device(struct device *dev) } iommu_group_put(group); + if (group_list && !group->default_domain && list_empty(&group->entry)) + list_add_tail(&group->entry, group_list); + iommu_device_link(iommu_dev, dev); return 0; @@ -235,7 +239,7 @@ int iommu_probe_device(struct device *dev) if (ops->probe_device) { struct iommu_group *group; - ret = __iommu_probe_device(dev); + ret = __iommu_probe_device(dev, NULL); /* * Try to allocate a default domain - needs support from the @@ -568,6 +572,7 @@ struct iommu_group *iommu_group_alloc(void) group->kobj.kset = iommu_group_kset; mutex_init(&group->mutex); INIT_LIST_HEAD(&group->devices); + INIT_LIST_HEAD(&group->entry); BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); ret = ida_simple_get(&iommu_group_ida, 0, 0, GFP_KERNEL); -- Gitee From b063e56d9665e02ab57577afbc931580f9783797 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:48 +0200 Subject: [PATCH 0243/2161] iommu: Move new probe_device path to separate function commit cf193888bfbd3d57e03a511e49d26f7d9c6f76df upstream. This makes it easier to remove to old code-path when all drivers are converted. As a side effect that it also fixes the error cleanup path. 
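One point of the refactoring above is the error path: with a dedicated helper, each failure releases exactly the state set up before it, in reverse order, via goto labels. A minimal standalone sketch of that shape; the stubs are invented, and the attach step is made to fail so the unwind path runs.

#include <stdio.h>

static int claim_device(void)    { return 0; }   /* stands in for probe_device() */
static int attach_default(void)  { return -1; }  /* pretend the attach fails     */
static void release_device(void) { puts("released device"); }

static int probe_device_helper(void)
{
    int ret;

    ret = claim_device();
    if (ret)
        goto err_out;          /* nothing to undo yet */

    ret = attach_default();
    if (ret)
        goto err_release;      /* undo the claim */

    puts("probe complete");
    return 0;

err_release:
    release_device();
err_out:
    return ret;
}

int main(void)
{
    return probe_device_helper() ? 1 : 0;
}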
Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-11-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 69 ++++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index dbadb1ab00b6..e23e467ff988 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -219,12 +219,55 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list return ret; } +static int __iommu_probe_device_helper(struct device *dev) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + struct iommu_group *group; + int ret; + + ret = __iommu_probe_device(dev, NULL); + if (ret) + goto err_out; + + /* + * Try to allocate a default domain - needs support from the + * IOMMU driver. There are still some drivers which don't + * support default domains, so the return value is not yet + * checked. + */ + iommu_alloc_default_domain(dev); + + group = iommu_group_get(dev); + if (!group) + goto err_release; + + if (group->default_domain) + ret = __iommu_attach_device(group->default_domain, dev); + + iommu_group_put(group); + + if (ret) + goto err_release; + + if (ops->probe_finalize) + ops->probe_finalize(dev); + + return 0; + +err_release: + iommu_release_device(dev); +err_out: + return ret; + +} + int iommu_probe_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; int ret; WARN_ON(dev->iommu_group); + if (!ops) return -EINVAL; @@ -236,30 +279,10 @@ int iommu_probe_device(struct device *dev) goto err_free_dev_param; } - if (ops->probe_device) { - struct iommu_group *group; - - ret = __iommu_probe_device(dev, NULL); - - /* - * Try to allocate a default domain - needs support from the - * IOMMU driver. There are still some drivers which don't - * support default domains, so the return value is not yet - * checked. - */ - if (!ret) - iommu_alloc_default_domain(dev); - - group = iommu_group_get(dev); - if (group && group->default_domain) { - ret = __iommu_attach_device(group->default_domain, dev); - iommu_group_put(group); - } - - } else { - ret = ops->add_device(dev); - } + if (ops->probe_device) + return __iommu_probe_device_helper(dev); + ret = ops->add_device(dev); if (ret) goto err_module_put; -- Gitee From bb2f02a4b8ee1a2a7b1ff87d7ea1b09a3081dbf7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:49 +0200 Subject: [PATCH 0244/2161] iommu: Split off default domain allocation from group assignment commit deac0b3bed26bb5d04486696b1071d8ec3851100 upstream. When a bus is initialized with iommu-ops, all devices on the bus are scanned and iommu-groups are allocated for them, and each groups will also get a default domain allocated. Until now this happened as soon as the group was created and the first device added to it. When other devices with different default domain requirements were added to the group later on, the default domain was re-allocated, if possible. This resulted in some back and forth and unnecessary allocations, so change the flow to defer default domain allocation until all devices have been added to their respective IOMMU groups. The default domains are allocated for newly allocated groups after each device on the bus is handled and was probed by the IOMMU driver. 
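A minimal standalone sketch of the deferred flow described above: every device on the bus is probed first and its newly created group is collected, and only afterwards is one default domain allocated per group, so a late-added group member can still influence the type. The structures are simplified stand-ins; the real code also warns when members disagree, as the hunks below show.

#include <stdio.h>

enum domain_type { DOMAIN_ANY = 0, DOMAIN_IDENTITY = 1, DOMAIN_DMA = 2 };

struct group {
    const char *name;
    enum domain_type default_domain;   /* 0 = not allocated yet */
    enum domain_type members_want[4];
    int nr_members;
};

static void probe_device(struct group *grp, enum domain_type want)
{
    /* phase 1: only record the device in its group */
    grp->members_want[grp->nr_members++] = want;
}

static void alloc_group_default_domain(struct group *grp)
{
    /* phase 2: resolve one type for the whole group; here the last stated
     * requirement wins, whereas the kernel code warns on conflicts */
    enum domain_type type = DOMAIN_ANY;
    int i;

    for (i = 0; i < grp->nr_members; i++)
        if (grp->members_want[i] != DOMAIN_ANY)
            type = grp->members_want[i];

    grp->default_domain = (type == DOMAIN_ANY) ? DOMAIN_DMA : type;
    printf("%s: default domain type %d\n", grp->name, grp->default_domain);
}

int main(void)
{
    struct group grp = { .name = "group0" };

    /* all devices are added before any domain is allocated */
    probe_device(&grp, DOMAIN_ANY);
    probe_device(&grp, DOMAIN_IDENTITY);
    alloc_group_default_domain(&grp);   /* picks identity, not DMA */
    return 0;
}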
Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-12-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 154 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 151 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index e23e467ff988..5141735d192f 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -200,7 +200,7 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list dev->iommu->iommu_dev = iommu_dev; group = iommu_group_get_for_dev(dev); - if (!IS_ERR(group)) { + if (IS_ERR(group)) { ret = PTR_ERR(group); goto out_release; } @@ -1609,6 +1609,37 @@ static int add_iommu_group(struct device *dev, void *data) return ret; } +static int probe_iommu_group(struct device *dev, void *data) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + struct list_head *group_list = data; + int ret; + + if (!dev_iommu_get(dev)) + return -ENOMEM; + + if (!try_module_get(ops->owner)) { + ret = -EINVAL; + goto err_free_dev_iommu; + } + + ret = __iommu_probe_device(dev, group_list); + if (ret) + goto err_module_put; + + return 0; + +err_module_put: + module_put(ops->owner); +err_free_dev_iommu: + dev_iommu_free(dev); + + if (ret == -ENODEV) + ret = 0; + + return ret; +} + static int remove_iommu_group(struct device *dev, void *data) { iommu_release_device(dev); @@ -1668,10 +1699,127 @@ static int iommu_bus_notifier(struct notifier_block *nb, return 0; } +struct __group_domain_type { + struct device *dev; + unsigned int type; +}; + +static int probe_get_default_domain_type(struct device *dev, void *data) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + struct __group_domain_type *gtype = data; + unsigned int type = 0; + + if (ops->def_domain_type) + type = ops->def_domain_type(dev); + + if (type) { + if (gtype->type && gtype->type != type) { + dev_warn(dev, "Device needs domain type %s, but device %s in the same iommu group requires type %s - using default\n", + iommu_domain_type_str(type), + dev_name(gtype->dev), + iommu_domain_type_str(gtype->type)); + gtype->type = 0; + } + + if (!gtype->dev) { + gtype->dev = dev; + gtype->type = type; + } + } + + return 0; +} + +static void probe_alloc_default_domain(struct bus_type *bus, + struct iommu_group *group) +{ + struct __group_domain_type gtype; + + memset(>ype, 0, sizeof(gtype)); + + /* Ask for default domain requirements of all devices in the group */ + __iommu_group_for_each_dev(group, >ype, + probe_get_default_domain_type); + + if (!gtype.type) + gtype.type = iommu_def_domain_type; + + iommu_group_alloc_default_domain(bus, group, gtype.type); +} + +static int iommu_group_do_dma_attach(struct device *dev, void *data) +{ + struct iommu_domain *domain = data; + const struct iommu_ops *ops; + int ret; + + ret = __iommu_attach_device(domain, dev); + + ops = domain->ops; + + if (ret == 0 && ops->probe_finalize) + ops->probe_finalize(dev); + + return ret; +} + +static int __iommu_group_dma_attach(struct iommu_group *group) +{ + return __iommu_group_for_each_dev(group, group->default_domain, + iommu_group_do_dma_attach); +} + +static int bus_iommu_probe(struct bus_type *bus) +{ + const struct iommu_ops *ops = bus->iommu_ops; + int ret; + + if (ops->probe_device) { + struct iommu_group *group, *next; + LIST_HEAD(group_list); + + /* + * This code-path does not allocate the default domain when + * creating the iommu 
group, so do it after the groups are + * created. + */ + ret = bus_for_each_dev(bus, NULL, &group_list, probe_iommu_group); + if (ret) + return ret; + + list_for_each_entry_safe(group, next, &group_list, entry) { + /* Remove item from the list */ + list_del_init(&group->entry); + + mutex_lock(&group->mutex); + + /* Try to allocate default domain */ + probe_alloc_default_domain(bus, group); + + if (!group->default_domain) { + mutex_unlock(&group->mutex); + continue; + } + + ret = __iommu_group_dma_attach(group); + + mutex_unlock(&group->mutex); + + if (ret) + break; + } + } else { + ret = bus_for_each_dev(bus, NULL, NULL, add_iommu_group); + } + + return ret; +} + static int iommu_bus_init(struct bus_type *bus, const struct iommu_ops *ops) { - int err; struct notifier_block *nb; + int err; nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL); if (!nb) @@ -1683,7 +1831,7 @@ static int iommu_bus_init(struct bus_type *bus, const struct iommu_ops *ops) if (err) goto out_free; - err = bus_for_each_dev(bus, NULL, NULL, add_iommu_group); + err = bus_iommu_probe(bus); if (err) goto out_err; -- Gitee From 75b6a0303c2b53bc06e214e4c139ec4121151c55 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:50 +0200 Subject: [PATCH 0245/2161] iommu: Move iommu_group_create_direct_mappings() out of iommu_group_add_device() commit ce574c27ae275bc51b6437883fc9cd1c46b498e5 upstream. After the previous changes the iommu group may not have a default domain when iommu_group_add_device() is called. With no default domain iommu_group_create_direct_mappings() will do nothing and no direct mappings will be created. Rename iommu_group_create_direct_mappings() to iommu_create_device_direct_mappings() to better reflect that the function creates direct mappings only for one device and not for all devices in the group. Then move the call to the places where a default domain actually exists. 
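A minimal sketch of the per-device step this patch isolates, in standalone C with invented types: walk one device's reserved regions and identity-map them into the group's default domain, and do nothing when no default domain has been allocated yet.

#include <stdio.h>

struct region { unsigned long start, length; };

struct domain { const char *name; };

struct group { struct domain *default_domain; };

static void map_identity(struct domain *dom, unsigned long start,
                         unsigned long length)
{
    printf("%s: map 0x%lx+0x%lx -> 0x%lx\n", dom->name, start, length, start);
}

static void create_device_direct_mappings(struct group *grp,
                                          const struct region *regions,
                                          int nr_regions)
{
    int i;

    if (!grp->default_domain)
        return;            /* allocation is deferred: nothing to map into yet */

    for (i = 0; i < nr_regions; i++)
        map_identity(grp->default_domain, regions[i].start,
                     regions[i].length);
}

int main(void)
{
    struct domain dom = { .name = "default" };
    struct group grp = { .default_domain = &dom };
    const struct region regions[] = {
        { 0xfee00000, 0x1000 },    /* e.g. an MSI window */
    };

    create_device_direct_mappings(&grp, regions, 1);
    return 0;
}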
Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-13-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5141735d192f..cf91b3468ebe 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -89,6 +89,8 @@ static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group); static void __iommu_detach_group(struct iommu_domain *domain, struct iommu_group *group); +static int iommu_create_device_direct_mappings(struct iommu_group *group, + struct device *dev); #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \ struct iommu_group_attribute iommu_group_attr_##_name = \ @@ -244,6 +246,8 @@ static int __iommu_probe_device_helper(struct device *dev) if (group->default_domain) ret = __iommu_attach_device(group->default_domain, dev); + iommu_create_device_direct_mappings(group, dev); + iommu_group_put(group); if (ret) @@ -264,6 +268,7 @@ static int __iommu_probe_device_helper(struct device *dev) int iommu_probe_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; + struct iommu_group *group; int ret; WARN_ON(dev->iommu_group); @@ -286,6 +291,10 @@ int iommu_probe_device(struct device *dev) if (ret) goto err_module_put; + group = iommu_group_get(dev); + iommu_create_device_direct_mappings(group, dev); + iommu_group_put(group); + if (ops->probe_finalize) ops->probe_finalize(dev); @@ -737,8 +746,8 @@ int iommu_group_set_name(struct iommu_group *group, const char *name) } EXPORT_SYMBOL_GPL(iommu_group_set_name); -static int iommu_group_create_direct_mappings(struct iommu_group *group, - struct device *dev) +static int iommu_create_device_direct_mappings(struct iommu_group *group, + struct device *dev) { struct iommu_domain *domain = group->default_domain; struct iommu_resv_region *entry; @@ -842,8 +851,6 @@ int iommu_group_add_device(struct iommu_group *group, struct device *dev) dev->iommu_group = group; - iommu_group_create_direct_mappings(group, dev); - mutex_lock(&group->mutex); list_add_tail(&device->list, &group->devices); if (group->domain) @@ -1746,6 +1753,7 @@ static void probe_alloc_default_domain(struct bus_type *bus, gtype.type = iommu_def_domain_type; iommu_group_alloc_default_domain(bus, group, gtype.type); + } static int iommu_group_do_dma_attach(struct device *dev, void *data) @@ -1770,6 +1778,21 @@ static int __iommu_group_dma_attach(struct iommu_group *group) iommu_group_do_dma_attach); } +static int iommu_do_create_direct_mappings(struct device *dev, void *data) +{ + struct iommu_group *group = data; + + iommu_create_device_direct_mappings(group, dev); + + return 0; +} + +static int iommu_group_create_direct_mappings(struct iommu_group *group) +{ + return __iommu_group_for_each_dev(group, group, + iommu_do_create_direct_mappings); +} + static int bus_iommu_probe(struct bus_type *bus) { const struct iommu_ops *ops = bus->iommu_ops; @@ -1802,6 +1825,8 @@ static int bus_iommu_probe(struct bus_type *bus) continue; } + iommu_group_create_direct_mappings(group); + ret = __iommu_group_dma_attach(group); mutex_unlock(&group->mutex); @@ -2642,7 +2667,7 @@ request_default_domain_for_dev(struct device *dev, unsigned long type) iommu_domain_free(group->default_domain); group->default_domain = domain; - 
iommu_group_create_direct_mappings(group, dev); + iommu_create_device_direct_mappings(group, dev); dev_info(dev, "Using iommu %s mapping\n", type == IOMMU_DOMAIN_DMA ? "dma" : "direct"); -- Gitee From 8057226e47ea16c54e4cb11aa3c31720d17de657 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:51 +0200 Subject: [PATCH 0246/2161] iommu: Export bus_iommu_probe() and make is safe for re-probing commit 5012c3968537e2ffecbdb2eba3479bf9fb9e5597 upstream. Add a check to the bus_iommu_probe() call-path to make sure it ignores devices which have already been successfully probed. Then export the bus_iommu_probe() function so it can be used by IOMMU drivers. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-14-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 10 +++++++++- include/linux/iommu.h | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index cf91b3468ebe..6e180185ccff 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1620,11 +1620,19 @@ static int probe_iommu_group(struct device *dev, void *data) { const struct iommu_ops *ops = dev->bus->iommu_ops; struct list_head *group_list = data; + struct iommu_group *group; int ret; if (!dev_iommu_get(dev)) return -ENOMEM; + /* Device is probed already if in a group */ + group = iommu_group_get(dev); + if (group) { + iommu_group_put(group); + return 0; + } + if (!try_module_get(ops->owner)) { ret = -EINVAL; goto err_free_dev_iommu; @@ -1793,7 +1801,7 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group) iommu_do_create_direct_mappings); } -static int bus_iommu_probe(struct bus_type *bus) +int bus_iommu_probe(struct bus_type *bus) { const struct iommu_ops *ops = bus->iommu_ops; int ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 078cac609362..9f271799a831 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -444,6 +444,7 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather) #define IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER 6 /* Post Driver unbind */ extern int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops); +extern int bus_iommu_probe(struct bus_type *bus); extern bool iommu_present(struct bus_type *bus); extern bool iommu_capable(struct bus_type *bus, enum iommu_cap cap); extern struct iommu_domain *iommu_domain_alloc(struct bus_type *bus); -- Gitee From a3da370445bfe08989650b6ae0e06cb0d9a67e02 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 18 Nov 2021 11:34:55 +0800 Subject: [PATCH 0247/2161] iommu/amd: Convert to probe/release_device() call-backs commit dce8d6964ebdb333383bacf5e7ab8c27df151218 upstream. Convert the AMD IOMMU Driver to use the probe_device() and release_device() call-backs of iommu_ops, so that the iommu core code does the group and sysfs setup. 
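A minimal standalone sketch of the re-probe guard added when bus_iommu_probe() was exported above: devices that already belong to a group are simply skipped, so an IOMMU driver can walk the bus a second time without disturbing devices that were set up earlier. Types are simplified stand-ins.

#include <stdio.h>

struct device { const char *name; int group; };   /* 0 = no group yet */

static int probe_one(struct device *dev)
{
    if (dev->group) {
        printf("%s: already probed, skipping\n", dev->name);
        return 0;
    }
    dev->group = 1;
    printf("%s: probed\n", dev->name);
    return 0;
}

static void bus_probe(struct device *devs, int n)
{
    int i;

    for (i = 0; i < n; i++)
        probe_one(&devs[i]);
}

int main(void)
{
    struct device devs[] = { { "dev0", 0 }, { "dev1", 0 } };

    bus_probe(devs, 2);   /* first pass probes everything     */
    bus_probe(devs, 2);   /* second pass is a harmless no-op  */
    return 0;
}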
Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-16-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 70 +++++++++++++-------------------------- 1 file changed, 23 insertions(+), 47 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index e71f79c11f7d..18389dbe37a1 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -385,21 +385,9 @@ static bool check_device(struct device *dev) return true; } -static void init_iommu_group(struct device *dev) -{ - struct iommu_group *group; - - group = iommu_group_get_for_dev(dev); - if (IS_ERR(group)) - return; - - iommu_group_put(group); -} - static int iommu_init_device(struct device *dev) { struct iommu_dev_data *dev_data; - struct amd_iommu *iommu; int devid; if (dev->archdata.iommu) @@ -409,8 +397,6 @@ static int iommu_init_device(struct device *dev) if (devid < 0) return devid; - iommu = amd_iommu_rlookup_table[devid]; - dev_data = find_dev_data(devid); if (!dev_data) return -ENOMEM; @@ -433,8 +419,6 @@ static int iommu_init_device(struct device *dev) dev->archdata.iommu = dev_data; - iommu_device_link(&iommu->iommu, dev); - return 0; } @@ -452,7 +436,7 @@ static void iommu_ignore_device(struct device *dev) setup_aliases(dev); } -static void iommu_uninit_device(struct device *dev) +static void amd_iommu_uninit_device(struct device *dev) { struct iommu_dev_data *dev_data; struct amd_iommu *iommu; @@ -471,13 +455,6 @@ static void iommu_uninit_device(struct device *dev) if (dev_data->domain) detach_device(dev); - iommu_device_unlink(&iommu->iommu, dev); - - iommu_group_remove_device(dev); - - /* Remove dma-ops */ - dev->dma_ops = NULL; - /* * We keep dev_data around for unplugged devices and reuse it when the * device is re-plugged - not doing so would introduce a ton of races. 
@@ -2261,39 +2238,42 @@ static void detach_device(struct device *dev) spin_unlock_irqrestore(&domain->lock, flags); } -static int amd_iommu_add_device(struct device *dev) +static struct iommu_device *amd_iommu_probe_device(struct device *dev) { - struct iommu_dev_data *dev_data; - struct iommu_domain *domain; + struct iommu_device *iommu_dev; struct amd_iommu *iommu; int ret, devid; - if (!check_device(dev) || get_dev_data(dev)) - return 0; + if (!check_device(dev)) + return ERR_PTR(-ENODEV); devid = get_device_id(dev); if (devid < 0) - return devid; + return ERR_PTR(devid); iommu = amd_iommu_rlookup_table[devid]; + if (get_dev_data(dev)) + return &iommu->iommu; + ret = iommu_init_device(dev); if (ret) { if (ret != -ENOTSUPP) dev_err(dev, "Failed to initialize - trying to proceed anyway\n"); - + iommu_dev = ERR_PTR(ret); iommu_ignore_device(dev); - dev->dma_ops = NULL; - goto out; + } else { + iommu_dev = &iommu->iommu; } - init_iommu_group(dev); - dev_data = get_dev_data(dev); + iommu_completion_wait(iommu); - BUG_ON(!dev_data); + return iommu_dev; +} - if (dev_data->iommu_v2) - iommu_request_dm_for_dev(dev); +static void amd_iommu_probe_finalize(struct device *dev) +{ + struct iommu_domain *domain; /* Domains are initialized for this device - have a look what we ended up with */ domain = iommu_get_domain_for_dev(dev); @@ -2301,14 +2281,9 @@ static int amd_iommu_add_device(struct device *dev) dev_data->passthrough = true; else dev->dma_ops = &amd_iommu_dma_ops; - -out: - iommu_completion_wait(iommu); - - return 0; } -static void amd_iommu_remove_device(struct device *dev) +static void amd_iommu_release_device(struct device *dev) { struct amd_iommu *iommu; int devid; @@ -2322,7 +2297,7 @@ static void amd_iommu_remove_device(struct device *dev) iommu = amd_iommu_rlookup_table[devid]; - iommu_uninit_device(dev); + amd_iommu_uninit_device(dev); iommu_completion_wait(iommu); } @@ -3297,8 +3272,9 @@ const struct iommu_ops amd_iommu_ops = { .map = amd_iommu_map, .unmap = amd_iommu_unmap, .iova_to_phys = amd_iommu_iova_to_phys, - .add_device = amd_iommu_add_device, - .remove_device = amd_iommu_remove_device, + .probe_device = amd_iommu_probe_device, + .release_device = amd_iommu_release_device, + .probe_finalize = amd_iommu_probe_finalize, .device_group = amd_iommu_device_group, .get_resv_regions = amd_iommu_get_resv_regions, .put_resv_regions = amd_iommu_put_resv_regions, -- Gitee From 07fa45a37620bc952407c8f262ecdee7b541e069 Mon Sep 17 00:00:00 2001 From: Tom Murphy Date: Sun, 8 Sep 2019 09:56:41 -0700 Subject: [PATCH 0248/2161] iommu/amd: Convert AMD iommu driver to the dma-iommu api commit be62dbf554c5b50718a54a359372c148cd9975c7 upstream. Convert the AMD iommu driver to the dma-iommu api. Remove the iova handling and reserve region code from the AMD iommu driver. 
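A minimal standalone sketch of what the conversion above amounts to for the driver: instead of carrying a private IOVA allocator and its own map_page()/map_sg()/alloc_coherent() paths, it only chooses a DMA-ops table at probe_finalize() time, direct ops for identity domains and the shared dma-iommu ops otherwise. The ops tables and names below are illustrative only.

#include <stdio.h>

enum domain_type { DOMAIN_IDENTITY = 1, DOMAIN_DMA = 2 };

struct dma_ops { const char *name; };

/* stand-ins for the direct ops and the shared dma-iommu ops */
static const struct dma_ops direct_dma_ops = { "direct (passthrough)" };
static const struct dma_ops iommu_dma_ops  = { "generic dma-iommu" };

struct device {
    enum domain_type domain_type;
    const struct dma_ops *dma_ops;
};

static void probe_finalize(struct device *dev)
{
    if (dev->domain_type == DOMAIN_IDENTITY)
        dev->dma_ops = &direct_dma_ops;
    else
        dev->dma_ops = &iommu_dma_ops;   /* replaces __map_single() and friends */
}

int main(void)
{
    struct device a = { .domain_type = DOMAIN_IDENTITY };
    struct device b = { .domain_type = DOMAIN_DMA };

    probe_finalize(&a);
    probe_finalize(&b);
    printf("a uses %s, b uses %s\n", a.dma_ops->name, b.dma_ops->name);
    return 0;
}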
Signed-off-by: Tom Murphy Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/Kconfig | 1 + drivers/iommu/amd_iommu.c | 692 ++++---------------------------------- 2 files changed, 68 insertions(+), 625 deletions(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 800686499301..e44a1f868ad0 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -142,6 +142,7 @@ config AMD_IOMMU select PCI_PASID select IOMMU_API select IOMMU_IOVA + select IOMMU_DMA depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE ---help--- With this option you can enable support for AMD IOMMU hardware in diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 18389dbe37a1..7e282e758f46 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -88,8 +89,6 @@ const struct iommu_ops amd_iommu_ops; static ATOMIC_NOTIFIER_HEAD(ppr_notifier); int amd_iommu_max_glx_val = -1; -static const struct dma_map_ops amd_iommu_dma_ops; - /* * general struct to manage commands send to an IOMMU */ @@ -102,21 +101,6 @@ struct kmem_cache *amd_iommu_irq_cache; static void update_domain(struct protection_domain *domain); static int protection_domain_init(struct protection_domain *domain); static void detach_device(struct device *dev); -static void iova_domain_flush_tlb(struct iova_domain *iovad); - -/* - * Data container for a dma_ops specific protection domain - */ -struct dma_ops_domain { - /* generic protection domain information */ - struct protection_domain domain; - - /* IOVA RB-Tree */ - struct iova_domain iovad; -}; - -static struct iova_domain reserved_iova_ranges; -static struct lock_class_key reserved_rbtree_key; /**************************************************************************** * @@ -187,12 +171,6 @@ static struct protection_domain *to_pdomain(struct iommu_domain *dom) return container_of(dom, struct protection_domain, domain); } -static struct dma_ops_domain* to_dma_ops_domain(struct protection_domain *domain) -{ - BUG_ON(domain->flags != PD_DMA_OPS_MASK); - return container_of(domain, struct dma_ops_domain, domain); -} - static struct iommu_dev_data *alloc_dev_data(u16 devid) { struct iommu_dev_data *dev_data; @@ -1277,12 +1255,6 @@ static void domain_flush_pages(struct protection_domain *domain, __domain_flush_pages(domain, address, size, 0); } -/* Flush the whole IO/TLB for a given protection domain */ -static void domain_flush_tlb(struct protection_domain *domain) -{ - __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); -} - /* Flush the whole IO/TLB for a given protection domain - including PDE */ static void domain_flush_tlb_pde(struct protection_domain *domain) { @@ -1730,43 +1702,6 @@ static unsigned long iommu_unmap_page(struct protection_domain *dom, return unmapped; } -/**************************************************************************** - * - * The next functions belong to the address allocator for the dma_ops - * interface functions. 
- * - ****************************************************************************/ - - -static unsigned long dma_ops_alloc_iova(struct device *dev, - struct dma_ops_domain *dma_dom, - unsigned int pages, u64 dma_mask) -{ - unsigned long pfn = 0; - - pages = __roundup_pow_of_two(pages); - - if (dma_mask > DMA_BIT_MASK(32)) - pfn = alloc_iova_fast(&dma_dom->iovad, pages, - IOVA_PFN(DMA_BIT_MASK(32)), false); - - if (!pfn) - pfn = alloc_iova_fast(&dma_dom->iovad, pages, - IOVA_PFN(dma_mask), true); - - return (pfn << PAGE_SHIFT); -} - -static void dma_ops_free_iova(struct dma_ops_domain *dma_dom, - unsigned long address, - unsigned int pages) -{ - pages = __roundup_pow_of_two(pages); - address >>= PAGE_SHIFT; - - free_iova_fast(&dma_dom->iovad, address, pages); -} - /**************************************************************************** * * The next functions belong to the domain allocation. A domain is @@ -1843,42 +1778,23 @@ static void free_gcr3_table(struct protection_domain *domain) free_page((unsigned long)domain->gcr3_tbl); } -static void dma_ops_domain_flush_tlb(struct dma_ops_domain *dom) -{ - unsigned long flags; - - spin_lock_irqsave(&dom->domain.lock, flags); - domain_flush_tlb(&dom->domain); - domain_flush_complete(&dom->domain); - spin_unlock_irqrestore(&dom->domain.lock, flags); -} - -static void iova_domain_flush_tlb(struct iova_domain *iovad) -{ - struct dma_ops_domain *dom; - - dom = container_of(iovad, struct dma_ops_domain, iovad); - - dma_ops_domain_flush_tlb(dom); -} - /* * Free a domain, only used if something went wrong in the * allocation path and we need to free an already allocated page table */ -static void dma_ops_domain_free(struct dma_ops_domain *dom) +static void dma_ops_domain_free(struct protection_domain *domain) { - if (!dom) + if (!domain) return; - put_iova_domain(&dom->iovad); + iommu_put_dma_cookie(&domain->domain); - free_pagetable(&dom->domain); + free_pagetable(domain); - if (dom->domain.id) - domain_id_free(dom->domain.id); + if (domain->id) + domain_id_free(domain->id); - kfree(dom); + kfree(domain); } /* @@ -1886,35 +1802,30 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) * It also initializes the page table and the address allocator data * structures required for the dma_ops interface */ -static struct dma_ops_domain *dma_ops_domain_alloc(void) +static struct protection_domain *dma_ops_domain_alloc(void) { - struct dma_ops_domain *dma_dom; + struct protection_domain *domain; - dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); - if (!dma_dom) + domain = kzalloc(sizeof(struct protection_domain), GFP_KERNEL); + if (!domain) return NULL; - if (protection_domain_init(&dma_dom->domain)) - goto free_dma_dom; - - dma_dom->domain.mode = PAGE_MODE_3_LEVEL; - dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); - dma_dom->domain.flags = PD_DMA_OPS_MASK; - if (!dma_dom->domain.pt_root) - goto free_dma_dom; - - init_iova_domain(&dma_dom->iovad, PAGE_SIZE, IOVA_START_PFN); + if (protection_domain_init(domain)) + goto free_domain; - if (init_iova_flush_queue(&dma_dom->iovad, iova_domain_flush_tlb, NULL)) - goto free_dma_dom; + domain->mode = PAGE_MODE_3_LEVEL; + domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); + domain->flags = PD_DMA_OPS_MASK; + if (!domain->pt_root) + goto free_domain; - /* Initialize reserved ranges */ - copy_reserved_iova(&reserved_iova_ranges, &dma_dom->iovad); + if (iommu_get_dma_cookie(&domain->domain) == -ENOMEM) + goto free_domain; - return dma_dom; + return domain; -free_dma_dom: - 
dma_ops_domain_free(dma_dom); +free_domain: + dma_ops_domain_free(domain); return NULL; } @@ -2279,8 +2190,8 @@ static void amd_iommu_probe_finalize(struct device *dev) domain = iommu_get_domain_for_dev(dev); if (domain->type == IOMMU_DOMAIN_IDENTITY) dev_data->passthrough = true; - else - dev->dma_ops = &amd_iommu_dma_ops; + else if (domain->type == IOMMU_DOMAIN_DMA) + iommu_setup_dma_ops(dev, IOVA_START_PFN << PAGE_SHIFT, 0); } static void amd_iommu_release_device(struct device *dev) @@ -2309,43 +2220,32 @@ static struct iommu_group *amd_iommu_device_group(struct device *dev) return acpihid_device_group(dev); } +static int amd_iommu_domain_get_attr(struct iommu_domain *domain, + enum iommu_attr attr, void *data) +{ + switch (domain->type) { + case IOMMU_DOMAIN_UNMANAGED: + return -ENODEV; + case IOMMU_DOMAIN_DMA: + switch (attr) { + case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: + *(int *)data = !amd_iommu_unmap_flush; + return 0; + default: + return -ENODEV; + } + break; + default: + return -EINVAL; + } +} + /***************************************************************************** * * The next functions belong to the dma_ops mapping/unmapping code. * *****************************************************************************/ -/* - * In the dma_ops path we only have the struct device. This function - * finds the corresponding IOMMU, the protection domain and the - * requestor id for a given device. - * If the device is not yet associated with a domain this is also done - * in this function. - */ -static struct protection_domain *get_domain(struct device *dev) -{ - struct protection_domain *domain; - struct iommu_domain *io_domain; - - if (!check_device(dev)) - return ERR_PTR(-EINVAL); - - domain = get_dev_data(dev)->domain; - if (domain == NULL && get_dev_data(dev)->defer_attach) { - get_dev_data(dev)->defer_attach = false; - io_domain = iommu_get_domain_for_dev(dev); - domain = to_pdomain(io_domain); - attach_device(dev, domain); - } - if (domain == NULL) - return ERR_PTR(-EBUSY); - - if (!dma_ops_domain(domain)) - return ERR_PTR(-EBUSY); - - return domain; -} - static void update_device_table(struct protection_domain *domain) { struct iommu_dev_data *dev_data; @@ -2366,458 +2266,6 @@ static void update_domain(struct protection_domain *domain) domain_flush_complete(domain); } -static int dir2prot(enum dma_data_direction direction) -{ - if (direction == DMA_TO_DEVICE) - return IOMMU_PROT_IR; - else if (direction == DMA_FROM_DEVICE) - return IOMMU_PROT_IW; - else if (direction == DMA_BIDIRECTIONAL) - return IOMMU_PROT_IW | IOMMU_PROT_IR; - else - return 0; -} - -/* - * This function contains common code for mapping of a physically - * contiguous memory region into DMA address space. It is used by all - * mapping functions provided with this IOMMU driver. - * Must be called with the domain lock held. 
- */ -static dma_addr_t __map_single(struct device *dev, - struct dma_ops_domain *dma_dom, - phys_addr_t paddr, - size_t size, - enum dma_data_direction direction, - u64 dma_mask) -{ - dma_addr_t offset = paddr & ~PAGE_MASK; - dma_addr_t address, start, ret; - unsigned long flags; - unsigned int pages; - int prot = 0; - int i; - - pages = iommu_num_pages(paddr, size, PAGE_SIZE); - paddr &= PAGE_MASK; - - address = dma_ops_alloc_iova(dev, dma_dom, pages, dma_mask); - if (!address) - goto out; - - prot = dir2prot(direction); - - start = address; - for (i = 0; i < pages; ++i) { - ret = iommu_map_page(&dma_dom->domain, start, paddr, - PAGE_SIZE, prot, GFP_ATOMIC); - if (ret) - goto out_unmap; - - paddr += PAGE_SIZE; - start += PAGE_SIZE; - } - address += offset; - - domain_flush_np_cache(&dma_dom->domain, address, size); - -out: - return address; - -out_unmap: - - for (--i; i >= 0; --i) { - start -= PAGE_SIZE; - iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE); - } - - spin_lock_irqsave(&dma_dom->domain.lock, flags); - domain_flush_tlb(&dma_dom->domain); - domain_flush_complete(&dma_dom->domain); - spin_unlock_irqrestore(&dma_dom->domain.lock, flags); - - dma_ops_free_iova(dma_dom, address, pages); - - return DMA_MAPPING_ERROR; -} - -/* - * Does the reverse of the __map_single function. Must be called with - * the domain lock held too - */ -static void __unmap_single(struct dma_ops_domain *dma_dom, - dma_addr_t dma_addr, - size_t size, - int dir) -{ - dma_addr_t i, start; - unsigned int pages; - - pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); - dma_addr &= PAGE_MASK; - start = dma_addr; - - for (i = 0; i < pages; ++i) { - iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE); - start += PAGE_SIZE; - } - - if (amd_iommu_unmap_flush) { - unsigned long flags; - - spin_lock_irqsave(&dma_dom->domain.lock, flags); - domain_flush_tlb(&dma_dom->domain); - domain_flush_complete(&dma_dom->domain); - spin_unlock_irqrestore(&dma_dom->domain.lock, flags); - dma_ops_free_iova(dma_dom, dma_addr, pages); - } else { - pages = __roundup_pow_of_two(pages); - queue_iova(&dma_dom->iovad, dma_addr >> PAGE_SHIFT, pages, 0); - } -} - -/* - * The exported map_single function for dma_ops. - */ -static dma_addr_t map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - phys_addr_t paddr = page_to_phys(page) + offset; - struct protection_domain *domain; - struct dma_ops_domain *dma_dom; - u64 dma_mask; - - domain = get_domain(dev); - if (PTR_ERR(domain) == -EINVAL) - return (dma_addr_t)paddr; - else if (IS_ERR(domain)) - return DMA_MAPPING_ERROR; - - dma_mask = *dev->dma_mask; - dma_dom = to_dma_ops_domain(domain); - - return __map_single(dev, dma_dom, paddr, size, dir, dma_mask); -} - -/* - * The exported unmap_single function for dma_ops. - */ -static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - struct protection_domain *domain; - struct dma_ops_domain *dma_dom; - - domain = get_domain(dev); - if (IS_ERR(domain)) - return; - - dma_dom = to_dma_ops_domain(domain); - - __unmap_single(dma_dom, dma_addr, size, dir); -} - -static int sg_num_pages(struct device *dev, - struct scatterlist *sglist, - int nelems) -{ - unsigned long mask, boundary_size; - struct scatterlist *s; - int i, npages = 0; - - mask = dma_get_seg_boundary(dev); - boundary_size = mask + 1 ? 
ALIGN(mask + 1, PAGE_SIZE) >> PAGE_SHIFT : - 1UL << (BITS_PER_LONG - PAGE_SHIFT); - - for_each_sg(sglist, s, nelems, i) { - int p, n; - - s->dma_address = npages << PAGE_SHIFT; - p = npages % boundary_size; - n = iommu_num_pages(sg_phys(s), s->length, PAGE_SIZE); - if (p + n > boundary_size) - npages += boundary_size - p; - npages += n; - } - - return npages; -} - -/* - * The exported map_sg function for dma_ops (handles scatter-gather - * lists). - */ -static int map_sg(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction direction, - unsigned long attrs) -{ - int mapped_pages = 0, npages = 0, prot = 0, i; - struct protection_domain *domain; - struct dma_ops_domain *dma_dom; - struct scatterlist *s; - unsigned long address; - u64 dma_mask; - int ret; - - domain = get_domain(dev); - if (IS_ERR(domain)) - return 0; - - dma_dom = to_dma_ops_domain(domain); - dma_mask = *dev->dma_mask; - - npages = sg_num_pages(dev, sglist, nelems); - - address = dma_ops_alloc_iova(dev, dma_dom, npages, dma_mask); - if (!address) - goto out_err; - - prot = dir2prot(direction); - - /* Map all sg entries */ - for_each_sg(sglist, s, nelems, i) { - int j, pages = iommu_num_pages(sg_phys(s), s->length, PAGE_SIZE); - - for (j = 0; j < pages; ++j) { - unsigned long bus_addr, phys_addr; - - bus_addr = address + s->dma_address + (j << PAGE_SHIFT); - phys_addr = (sg_phys(s) & PAGE_MASK) + (j << PAGE_SHIFT); - ret = iommu_map_page(domain, bus_addr, phys_addr, - PAGE_SIZE, prot, - GFP_ATOMIC | __GFP_NOWARN); - if (ret) - goto out_unmap; - - mapped_pages += 1; - } - } - - /* Everything is mapped - write the right values into s->dma_address */ - for_each_sg(sglist, s, nelems, i) { - /* - * Add in the remaining piece of the scatter-gather offset that - * was masked out when we were determining the physical address - * via (sg_phys(s) & PAGE_MASK) earlier. - */ - s->dma_address += address + (s->offset & ~PAGE_MASK); - s->dma_length = s->length; - } - - if (s) - domain_flush_np_cache(domain, s->dma_address, s->dma_length); - - return nelems; - -out_unmap: - dev_err(dev, "IOMMU mapping error in map_sg (io-pages: %d reason: %d)\n", - npages, ret); - - for_each_sg(sglist, s, nelems, i) { - int j, pages = iommu_num_pages(sg_phys(s), s->length, PAGE_SIZE); - - for (j = 0; j < pages; ++j) { - unsigned long bus_addr; - - bus_addr = address + s->dma_address + (j << PAGE_SHIFT); - iommu_unmap_page(domain, bus_addr, PAGE_SIZE); - - if (--mapped_pages == 0) - goto out_free_iova; - } - } - -out_free_iova: - free_iova_fast(&dma_dom->iovad, address >> PAGE_SHIFT, npages); - -out_err: - return 0; -} - -/* - * The exported map_sg function for dma_ops (handles scatter-gather - * lists). - */ -static void unmap_sg(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction dir, - unsigned long attrs) -{ - struct protection_domain *domain; - struct dma_ops_domain *dma_dom; - unsigned long startaddr; - int npages; - - domain = get_domain(dev); - if (IS_ERR(domain)) - return; - - startaddr = sg_dma_address(sglist) & PAGE_MASK; - dma_dom = to_dma_ops_domain(domain); - npages = sg_num_pages(dev, sglist, nelems); - - __unmap_single(dma_dom, startaddr, npages << PAGE_SHIFT, dir); -} - -/* - * The exported alloc_coherent function for dma_ops. 
- */ -static void *alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t flag, - unsigned long attrs) -{ - u64 dma_mask = dev->coherent_dma_mask; - struct protection_domain *domain; - struct dma_ops_domain *dma_dom; - struct page *page; - - domain = get_domain(dev); - if (PTR_ERR(domain) == -EINVAL) { - page = alloc_pages(flag, get_order(size)); - *dma_addr = page_to_phys(page); - return page_address(page); - } else if (IS_ERR(domain)) - return NULL; - - dma_dom = to_dma_ops_domain(domain); - size = PAGE_ALIGN(size); - dma_mask = dev->coherent_dma_mask; - flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - flag |= __GFP_ZERO; - - page = alloc_pages(flag | __GFP_NOWARN, get_order(size)); - if (!page) { - if (!gfpflags_allow_blocking(flag)) - return NULL; - - page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, - get_order(size), flag & __GFP_NOWARN); - if (!page) - return NULL; - } - - if (!dma_mask) - dma_mask = *dev->dma_mask; - - *dma_addr = __map_single(dev, dma_dom, page_to_phys(page), - size, DMA_BIDIRECTIONAL, dma_mask); - - if (*dma_addr == DMA_MAPPING_ERROR) - goto out_free; - - return page_address(page); - -out_free: - - if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) - __free_pages(page, get_order(size)); - - return NULL; -} - -/* - * The exported free_coherent function for dma_ops. - */ -static void free_coherent(struct device *dev, size_t size, - void *virt_addr, dma_addr_t dma_addr, - unsigned long attrs) -{ - struct protection_domain *domain; - struct dma_ops_domain *dma_dom; - struct page *page; - - page = virt_to_page(virt_addr); - size = PAGE_ALIGN(size); - - domain = get_domain(dev); - if (IS_ERR(domain)) - goto free_mem; - - dma_dom = to_dma_ops_domain(domain); - - __unmap_single(dma_dom, dma_addr, size, DMA_BIDIRECTIONAL); - -free_mem: - if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) - __free_pages(page, get_order(size)); -} - -/* - * This function is called by the DMA layer to find out if we can handle a - * particular device. It is part of the dma_ops. 
- */ -static int amd_iommu_dma_supported(struct device *dev, u64 mask) -{ - if (!dma_direct_supported(dev, mask)) - return 0; - return check_device(dev); -} - -static const struct dma_map_ops amd_iommu_dma_ops = { - .alloc = alloc_coherent, - .free = free_coherent, - .map_page = map_page, - .unmap_page = unmap_page, - .map_sg = map_sg, - .unmap_sg = unmap_sg, - .dma_supported = amd_iommu_dma_supported, - .mmap = dma_common_mmap, - .get_sgtable = dma_common_get_sgtable, -}; - -static int init_reserved_iova_ranges(void) -{ - struct pci_dev *pdev = NULL; - struct iova *val; - - init_iova_domain(&reserved_iova_ranges, PAGE_SIZE, IOVA_START_PFN); - - lockdep_set_class(&reserved_iova_ranges.iova_rbtree_lock, - &reserved_rbtree_key); - - /* MSI memory range */ - val = reserve_iova(&reserved_iova_ranges, - IOVA_PFN(MSI_RANGE_START), IOVA_PFN(MSI_RANGE_END)); - if (!val) { - pr_err("Reserving MSI range failed\n"); - return -ENOMEM; - } - - /* HT memory range */ - val = reserve_iova(&reserved_iova_ranges, - IOVA_PFN(HT_RANGE_START), IOVA_PFN(HT_RANGE_END)); - if (!val) { - pr_err("Reserving HT range failed\n"); - return -ENOMEM; - } - - /* - * Memory used for PCI resources - * FIXME: Check whether we can reserve the PCI-hole completly - */ - for_each_pci_dev(pdev) { - int i; - - for (i = 0; i < PCI_NUM_RESOURCES; ++i) { - struct resource *r = &pdev->resource[i]; - - if (!(r->flags & IORESOURCE_MEM)) - continue; - - val = reserve_iova(&reserved_iova_ranges, - IOVA_PFN(r->start), - IOVA_PFN(r->end)); - if (!val) { - pci_err(pdev, "Reserve pci-resource range %pR failed\n", r); - return -ENOMEM; - } - } - } - - return 0; -} - int __init amd_iommu_init_api(void) { int ret, err = 0; @@ -2826,10 +2274,6 @@ int __init amd_iommu_init_api(void) if (ret) return ret; - ret = init_reserved_iova_ranges(); - if (ret) - return ret; - err = bus_set_iommu(&pci_bus_type, &amd_iommu_ops); if (err) return err; @@ -2931,7 +2375,6 @@ static struct protection_domain *protection_domain_alloc(void) static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) { struct protection_domain *pdomain; - struct dma_ops_domain *dma_domain; switch (type) { case IOMMU_DOMAIN_UNMANAGED: @@ -2952,12 +2395,11 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) break; case IOMMU_DOMAIN_DMA: - dma_domain = dma_ops_domain_alloc(); - if (!dma_domain) { + pdomain = dma_ops_domain_alloc(); + if (!pdomain) { pr_err("Failed to allocate\n"); return NULL; } - pdomain = &dma_domain->domain; break; case IOMMU_DOMAIN_IDENTITY: pdomain = protection_domain_alloc(); @@ -2976,7 +2418,6 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) static void amd_iommu_domain_free(struct iommu_domain *dom) { struct protection_domain *domain; - struct dma_ops_domain *dma_dom; domain = to_pdomain(dom); @@ -2991,8 +2432,7 @@ static void amd_iommu_domain_free(struct iommu_domain *dom) switch (dom->type) { case IOMMU_DOMAIN_DMA: /* Now release the domain */ - dma_dom = to_dma_ops_domain(domain); - dma_ops_domain_free(dma_dom); + dma_ops_domain_free(domain); break; default: if (domain->mode != PAGE_MODE_NONE) @@ -3048,6 +2488,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, return -EINVAL; dev_data = dev->archdata.iommu; + dev_data->defer_attach = false; iommu = amd_iommu_rlookup_table[dev_data->devid]; if (!iommu) @@ -3212,19 +2653,6 @@ static void amd_iommu_put_resv_regions(struct device *dev, kfree(entry); } -static void amd_iommu_apply_resv_region(struct device *dev, - struct iommu_domain *domain, - struct 
iommu_resv_region *region) -{ - struct dma_ops_domain *dma_dom = to_dma_ops_domain(to_pdomain(domain)); - unsigned long start, end; - - start = IOVA_PFN(region->start); - end = IOVA_PFN(region->start + region->length - 1); - - WARN_ON_ONCE(reserve_iova(&dma_dom->iovad, start, end) == NULL); -} - static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain, struct device *dev) { @@ -3276,9 +2704,9 @@ const struct iommu_ops amd_iommu_ops = { .release_device = amd_iommu_release_device, .probe_finalize = amd_iommu_probe_finalize, .device_group = amd_iommu_device_group, + .domain_get_attr = amd_iommu_domain_get_attr, .get_resv_regions = amd_iommu_get_resv_regions, .put_resv_regions = amd_iommu_put_resv_regions, - .apply_resv_region = amd_iommu_apply_resv_region, .is_attach_deferred = amd_iommu_is_attach_deferred, .pgsize_bitmap = AMD_IOMMU_PGSIZES, .flush_iotlb_all = amd_iommu_flush_iotlb_all, @@ -3591,9 +3019,23 @@ EXPORT_SYMBOL(amd_iommu_complete_ppr); struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev) { struct protection_domain *pdomain; + struct iommu_domain *io_domain; + struct device *dev = &pdev->dev; + + if (!check_device(dev)) + return NULL; + + pdomain = get_dev_data(dev)->domain; + if (pdomain == NULL && get_dev_data(dev)->defer_attach) { + get_dev_data(dev)->defer_attach = false; + io_domain = iommu_get_domain_for_dev(dev); + pdomain = to_pdomain(io_domain); + attach_device(dev, pdomain); + } + if (pdomain == NULL) + return NULL; - pdomain = get_domain(&pdev->dev); - if (IS_ERR(pdomain)) + if (!dma_ops_domain(pdomain)) return NULL; /* Only return IOMMUv2 domains */ -- Gitee From 4bd608259638c75e1785c71d47253fa0f3889ca3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:52 +0200 Subject: [PATCH 0249/2161] iommu/amd: Remove dev_data->passthrough commit 57f9842e48840684f596db1de936d7c6d44cd087 upstream. Make use of generic IOMMU infrastructure to gather the same information carried in dev_data->passthrough and remove the struct member. 
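In essence, the cached flag is replaced by asking the IOMMU core which default
domain the device ended up with. A minimal sketch of the replacement check (not
a verbatim hunk; the real changes live in attach_device() and
amd_iommu_probe_finalize() in the diff below):

	/* IOMMUv2 setup is only valid for identity-mapped devices, so query
	 * the core-managed default domain instead of dev_data->passthrough.
	 */
	struct iommu_domain *def_domain = iommu_get_dma_domain(dev);

	if (def_domain->type != IOMMU_DOMAIN_IDENTITY)
		return -EINVAL;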
Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-15-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 10 +++++----- drivers/iommu/amd_iommu_types.h | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 7e282e758f46..eeedb0a690d3 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2046,8 +2046,8 @@ static int pdev_iommuv2_enable(struct pci_dev *pdev) static int attach_device(struct device *dev, struct protection_domain *domain) { - struct pci_dev *pdev; struct iommu_dev_data *dev_data; + struct pci_dev *pdev; unsigned long flags; int ret; @@ -2066,8 +2066,10 @@ static int attach_device(struct device *dev, pdev = to_pci_dev(dev); if (domain->flags & PD_IOMMUV2_MASK) { + struct iommu_domain *def_domain = iommu_get_dma_domain(dev); + ret = -EINVAL; - if (!dev_data->passthrough) + if (def_domain->type != IOMMU_DOMAIN_IDENTITY) goto out; if (dev_data->iommu_v2) { @@ -2188,9 +2190,7 @@ static void amd_iommu_probe_finalize(struct device *dev) /* Domains are initialized for this device - have a look what we ended up with */ domain = iommu_get_domain_for_dev(dev); - if (domain->type == IOMMU_DOMAIN_IDENTITY) - dev_data->passthrough = true; - else if (domain->type == IOMMU_DOMAIN_DMA) + if (domain->type == IOMMU_DOMAIN_DMA) iommu_setup_dma_ops(dev, IOVA_START_PFN << PAGE_SHIFT, 0); } diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 76e9d3e2f9f2..36e4aa460bb3 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -646,7 +646,6 @@ struct iommu_dev_data { struct pci_dev *pdev; u16 devid; /* PCI Device ID */ bool iommu_v2; /* Device can make use of IOMMUv2 */ - bool passthrough; /* Device is identity mapped */ struct { bool enabled; int qdep; -- Gitee From 231587724e4763f821db84b75d01cf10377af7cf Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:54 +0200 Subject: [PATCH 0250/2161] iommu/vt-d: Convert to probe/release_device() call-backs commit e5d1841f18b2401c8b449c024817cd243e363934 upstream. Convert the Intel IOMMU driver to use the probe_device() and release_device() call-backs of iommu_ops, so that the iommu core code does the group and sysfs setup. 
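The shape of the conversion, reduced to its core (a simplified sketch; the real
call-backs in the diff below also handle deferred attach and the bounce-page
dma_ops): the driver only reports which IOMMU instance owns the device, and the
core takes care of group allocation and the sysfs links.

	static struct iommu_device *intel_iommu_probe_device(struct device *dev)
	{
		struct intel_iommu *iommu;
		u8 bus, devfn;

		iommu = device_to_iommu(dev, &bus, &devfn);
		if (!iommu)
			return ERR_PTR(-ENODEV);

		return &iommu->iommu;	/* core links dev to this IOMMU */
	}

	static void intel_iommu_release_device(struct device *dev)
	{
		/* group removal and sysfs unlinking are now done by the core */
		dmar_remove_one_dev_info(dev);
	}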
Signed-off-by: Joerg Roedel Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20200429133712.31431-17-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 67 ++++--------------------------------- 1 file changed, 6 insertions(+), 61 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 0ecd9b674d0a..a61fa16a861f 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5810,78 +5810,27 @@ static bool intel_iommu_capable(enum iommu_cap cap) return false; } -static int intel_iommu_add_device(struct device *dev) +static struct iommu_device *intel_iommu_probe_device(struct device *dev) { - struct dmar_domain *dmar_domain; - struct iommu_domain *domain; struct intel_iommu *iommu; - struct iommu_group *group; u8 bus, devfn; - int ret; iommu = device_to_iommu(dev, &bus, &devfn); if (!iommu) - return -ENODEV; - - iommu_device_link(&iommu->iommu, dev); + return ERR_PTR(-ENODEV); if (translation_pre_enabled(iommu)) dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO; - group = iommu_group_get_for_dev(dev); - - if (IS_ERR(group)) { - ret = PTR_ERR(group); - goto unlink; - } - - iommu_group_put(group); - - domain = iommu_get_domain_for_dev(dev); - dmar_domain = to_dmar_domain(domain); - if (domain->type == IOMMU_DOMAIN_DMA) { - if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) { - ret = iommu_request_dm_for_dev(dev); - if (ret) { - dmar_remove_one_dev_info(dev); - dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN; - domain_add_dev_info(si_domain, dev); - dev_info(dev, - "Device uses a private identity domain.\n"); - } - } - } else { - if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) { - ret = iommu_request_dma_domain_for_dev(dev); - if (ret) { - dmar_remove_one_dev_info(dev); - dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN; - if (!get_private_domain_for_dev(dev)) { - dev_warn(dev, - "Failed to get a private domain.\n"); - ret = -ENOMEM; - goto unlink; - } - - dev_info(dev, - "Device uses a private dma domain.\n"); - } - } - } - if (device_needs_bounce(dev)) { dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n"); set_dma_ops(dev, &bounce_dma_ops); } - return 0; - -unlink: - iommu_device_unlink(&iommu->iommu, dev); - return ret; + return &iommu->iommu; } -static void intel_iommu_remove_device(struct device *dev) +static void intel_iommu_release_device(struct device *dev) { struct intel_iommu *iommu; u8 bus, devfn; @@ -5892,10 +5841,6 @@ static void intel_iommu_remove_device(struct device *dev) dmar_remove_one_dev_info(dev); - iommu_group_remove_device(dev); - - iommu_device_unlink(&iommu->iommu, dev); - if (device_needs_bounce(dev)) set_dma_ops(dev, NULL); } @@ -6244,8 +6189,8 @@ const struct iommu_ops intel_iommu_ops = { .map = intel_iommu_map, .unmap = intel_iommu_unmap, .iova_to_phys = intel_iommu_iova_to_phys, - .add_device = intel_iommu_add_device, - .remove_device = intel_iommu_remove_device, + .probe_device = intel_iommu_probe_device, + .release_device = intel_iommu_release_device, .get_resv_regions = intel_iommu_get_resv_regions, .put_resv_regions = generic_iommu_put_resv_regions, .apply_resv_region = intel_iommu_apply_resv_region, -- Gitee From 467d26ffd4700621d0ec01c7370af5f0e521a00d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:10 +0200 Subject: [PATCH 0251/2161] iommu: Remove add_device()/remove_device() code-paths commit 3eeeb45c6d0444b368cdeba9bdafa8bbcf5370d1 upstream. 
All drivers are converted to use the probe/release_device() call-backs, so the add_device/remove_device() pointers are unused and the code using them can be removed. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-33-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 158 ++++++++++-------------------------------- include/linux/iommu.h | 4 -- 2 files changed, 38 insertions(+), 124 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 6e180185ccff..5208ca55bbcd 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -221,12 +221,20 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list return ret; } -static int __iommu_probe_device_helper(struct device *dev) +int iommu_probe_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; struct iommu_group *group; int ret; + if (!dev_iommu_get(dev)) + return -ENOMEM; + + if (!try_module_get(ops->owner)) { + ret = -EINVAL; + goto err_out; + } + ret = __iommu_probe_device(dev, NULL); if (ret) goto err_out; @@ -260,75 +268,23 @@ static int __iommu_probe_device_helper(struct device *dev) err_release: iommu_release_device(dev); + err_out: return ret; } -int iommu_probe_device(struct device *dev) +void iommu_release_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; - struct iommu_group *group; - int ret; - - WARN_ON(dev->iommu_group); - - if (!ops) - return -EINVAL; - - if (!dev_iommu_get(dev)) - return -ENOMEM; - - if (!try_module_get(ops->owner)) { - ret = -EINVAL; - goto err_free_dev_param; - } - - if (ops->probe_device) - return __iommu_probe_device_helper(dev); - - ret = ops->add_device(dev); - if (ret) - goto err_module_put; - group = iommu_group_get(dev); - iommu_create_device_direct_mappings(group, dev); - iommu_group_put(group); - - if (ops->probe_finalize) - ops->probe_finalize(dev); - - return 0; - -err_module_put: - module_put(ops->owner); -err_free_dev_param: - dev_iommu_free(dev); - return ret; -} - -static void __iommu_release_device(struct device *dev) -{ - const struct iommu_ops *ops = dev->bus->iommu_ops; + if (!dev->iommu) + return; iommu_device_unlink(dev->iommu->iommu_dev, dev); - iommu_group_remove_device(dev); ops->release_device(dev); -} - -void iommu_release_device(struct device *dev) -{ - const struct iommu_ops *ops = dev->bus->iommu_ops; - - if (!dev->iommu) - return; - - if (ops->release_device) - __iommu_release_device(dev); - else if (dev->iommu_group) - ops->remove_device(dev); module_put(ops->owner); dev_iommu_free(dev); @@ -1570,23 +1526,6 @@ struct iommu_group *iommu_group_get_for_dev(struct device *dev) if (ret) goto out_put_group; - /* - * Try to allocate a default domain - needs support from the - * IOMMU driver. There are still some drivers which don't support - * default domains, so the return value is not yet checked. Only - * allocate the domain here when the driver still has the - * add_device/remove_device call-backs implemented. 
- */ - if (!ops->probe_device) { - iommu_alloc_default_domain(dev); - - if (group->default_domain) - ret = __iommu_attach_device(group->default_domain, dev); - - if (ret) - goto out_put_group; - } - return group; out_put_group: @@ -1601,21 +1540,6 @@ struct iommu_domain *iommu_group_default_domain(struct iommu_group *group) return group->default_domain; } -static int add_iommu_group(struct device *dev, void *data) -{ - int ret = iommu_probe_device(dev); - - /* - * We ignore -ENODEV errors for now, as they just mean that the - * device is not translated by an IOMMU. We still care about - * other errors and fail to initialize when they happen. - */ - if (ret == -ENODEV) - ret = 0; - - return ret; -} - static int probe_iommu_group(struct device *dev, void *data) { const struct iommu_ops *ops = dev->bus->iommu_ops; @@ -1803,47 +1727,41 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group) int bus_iommu_probe(struct bus_type *bus) { - const struct iommu_ops *ops = bus->iommu_ops; + struct iommu_group *group, *next; + LIST_HEAD(group_list); int ret; - if (ops->probe_device) { - struct iommu_group *group, *next; - LIST_HEAD(group_list); - - /* - * This code-path does not allocate the default domain when - * creating the iommu group, so do it after the groups are - * created. - */ - ret = bus_for_each_dev(bus, NULL, &group_list, probe_iommu_group); - if (ret) - return ret; + /* + * This code-path does not allocate the default domain when + * creating the iommu group, so do it after the groups are + * created. + */ + ret = bus_for_each_dev(bus, NULL, &group_list, probe_iommu_group); + if (ret) + return ret; - list_for_each_entry_safe(group, next, &group_list, entry) { - /* Remove item from the list */ - list_del_init(&group->entry); + list_for_each_entry_safe(group, next, &group_list, entry) { + /* Remove item from the list */ + list_del_init(&group->entry); - mutex_lock(&group->mutex); + mutex_lock(&group->mutex); - /* Try to allocate default domain */ - probe_alloc_default_domain(bus, group); + /* Try to allocate default domain */ + probe_alloc_default_domain(bus, group); - if (!group->default_domain) { - mutex_unlock(&group->mutex); - continue; - } + if (!group->default_domain) { + mutex_unlock(&group->mutex); + continue; + } - iommu_group_create_direct_mappings(group); + iommu_group_create_direct_mappings(group); - ret = __iommu_group_dma_attach(group); + ret = __iommu_group_dma_attach(group); - mutex_unlock(&group->mutex); + mutex_unlock(&group->mutex); - if (ret) - break; - } - } else { - ret = bus_for_each_dev(bus, NULL, NULL, add_iommu_group); + if (ret) + break; } return ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 9f271799a831..fe01a1de4061 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -223,8 +223,6 @@ struct iommu_iotlb_gather { * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush * queue * @iova_to_phys: translate iova to physical address - * @add_device: add device to iommu grouping - * @remove_device: remove device from iommu grouping * @probe_device: Add device to iommu driver handling * @release_device: Remove device from iommu driver handling * @probe_finalize: Do final setup work after the device is added to an IOMMU @@ -277,8 +275,6 @@ struct iommu_ops { void (*iotlb_sync)(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather); phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); - int (*add_device)(struct device *dev); - void (*remove_device)(struct 
device *dev); struct iommu_device *(*probe_device)(struct device *dev); void (*release_device)(struct device *dev); void (*probe_finalize)(struct device *dev); -- Gitee From 6931e1ff54719afe6856edc1fca23369dabfbbc9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:11 +0200 Subject: [PATCH 0252/2161] iommu: Move more initialization to __iommu_probe_device() commit 4e8906f0d84d1a7d3cf82a30a701b0fb5d48977c upstream. Move the calls to dev_iommu_get() and try_module_get() into __iommu_probe_device(), so that the callers don't have to do it on their own. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-34-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 47 +++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5208ca55bbcd..5b03cbecea62 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -195,9 +195,19 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list struct iommu_group *group; int ret; + if (!dev_iommu_get(dev)) + return -ENOMEM; + + if (!try_module_get(ops->owner)) { + ret = -EINVAL; + goto err_free; + } + iommu_dev = ops->probe_device(dev); - if (IS_ERR(iommu_dev)) - return PTR_ERR(iommu_dev); + if (IS_ERR(iommu_dev)) { + ret = PTR_ERR(iommu_dev); + goto out_module_put; + } dev->iommu->iommu_dev = iommu_dev; @@ -218,6 +228,12 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list out_release: ops->release_device(dev); +out_module_put: + module_put(ops->owner); + +err_free: + dev_iommu_free(dev); + return ret; } @@ -227,14 +243,6 @@ int iommu_probe_device(struct device *dev) struct iommu_group *group; int ret; - if (!dev_iommu_get(dev)) - return -ENOMEM; - - if (!try_module_get(ops->owner)) { - ret = -EINVAL; - goto err_out; - } - ret = __iommu_probe_device(dev, NULL); if (ret) goto err_out; @@ -1542,14 +1550,10 @@ struct iommu_domain *iommu_group_default_domain(struct iommu_group *group) static int probe_iommu_group(struct device *dev, void *data) { - const struct iommu_ops *ops = dev->bus->iommu_ops; struct list_head *group_list = data; struct iommu_group *group; int ret; - if (!dev_iommu_get(dev)) - return -ENOMEM; - /* Device is probed already if in a group */ group = iommu_group_get(dev); if (group) { @@ -1557,22 +1561,7 @@ static int probe_iommu_group(struct device *dev, void *data) return 0; } - if (!try_module_get(ops->owner)) { - ret = -EINVAL; - goto err_free_dev_iommu; - } - ret = __iommu_probe_device(dev, group_list); - if (ret) - goto err_module_put; - - return 0; - -err_module_put: - module_put(ops->owner); -err_free_dev_iommu: - dev_iommu_free(dev); - if (ret == -ENODEV) ret = 0; -- Gitee From 2b329afd399ec52f2e6c849793fac5142456e5b6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:12 +0200 Subject: [PATCH 0253/2161] iommu: Unexport iommu_group_get_for_dev() commit 1b032ec1ecbce6047af7d11c9db432e237cb17d8 upstream. The function is now only used in IOMMU core code and shouldn't be used outside of it anyway, so remove the export for it. 
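Any remaining user outside the core is expected to rely on the group the core
already created at probe time, e.g. via the refcounted lookup (illustrative
only, not part of this patch):

	struct iommu_group *group = iommu_group_get(dev);

	if (group) {
		/* ... use the existing group ... */
		iommu_group_put(group);
	}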
Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-35-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 4 ++-- include/linux/iommu.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5b03cbecea62..c628e3bcaa7a 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -91,6 +91,7 @@ static void __iommu_detach_group(struct iommu_domain *domain, struct iommu_group *group); static int iommu_create_device_direct_mappings(struct iommu_group *group, struct device *dev); +static struct iommu_group *iommu_group_get_for_dev(struct device *dev); #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \ struct iommu_group_attribute iommu_group_attr_##_name = \ @@ -1510,7 +1511,7 @@ static int iommu_alloc_default_domain(struct device *dev) * to the returned IOMMU group, which will already include the provided * device. The reference should be released with iommu_group_put(). */ -struct iommu_group *iommu_group_get_for_dev(struct device *dev) +static struct iommu_group *iommu_group_get_for_dev(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; struct iommu_group *group; @@ -1541,7 +1542,6 @@ struct iommu_group *iommu_group_get_for_dev(struct device *dev) return ERR_PTR(ret); } -EXPORT_SYMBOL_GPL(iommu_group_get_for_dev); struct iommu_domain *iommu_group_default_domain(struct iommu_group *group) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index fe01a1de4061..64610192a5cd 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -526,7 +526,6 @@ extern int iommu_page_response(struct device *dev, struct iommu_page_response *msg); extern int iommu_group_id(struct iommu_group *group); -extern struct iommu_group *iommu_group_get_for_dev(struct device *dev); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr, -- Gitee From 391ef649080b1f0dd64e16ee6a39cfd8544860d8 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 11 May 2020 18:10:00 +0200 Subject: [PATCH 0254/2161] iommu: Do not probe devices on IOMMU-less busses commit f38338cf0691b5fae5f9a46d188eef92ab9e6296 upstream. Port form f38338cf0691b5fae5f9a46d188eef92ab9e6296 The host1x bus implemented on Tegra SoCs is primarily an abstraction to create logical device from multiple platform devices. Since the devices in such a setup are typically hierarchical, DMA setup still needs to be done so that DMA masks can be properly inherited, but we don't actually want to attach the host1x logical devices to any IOMMU. The platform devices that make up the logical device are responsible for memory bus transactions, so it is them that will need to be attached to the IOMMU. Add a check to __iommu_probe_device() that aborts IOMMU setup early for busses that don't have the IOMMU operations pointer set since they will cause a crash otherwise. 
Signed-off-by: Thierry Reding Link: https://lore.kernel.org/r/20200511161000.3853342-1-thierry.reding@gmail.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index c628e3bcaa7a..260ea10626e4 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -196,6 +196,9 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list struct iommu_group *group; int ret; + if (!ops) + return -ENODEV; + if (!dev_iommu_get(dev)) return -ENOMEM; -- Gitee From ba4155b8e2dde59170f4216498c9b59871bbd30b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 7 May 2020 19:18:03 +0300 Subject: [PATCH 0255/2161] iommu/iova: Unify format of the printed messages commit 3a0ce12e3b8e3cb7d54569a42aec743cc93f4f0d upstream. Unify format of the printed messages, i.e. replace printk(LEVEL ... ) with pr_level(...). Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200507161804.13275-2-andriy.shevchenko@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iova.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 612cbf668adf..45a251da5453 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -253,7 +253,7 @@ int iova_cache_get(void) SLAB_HWCACHE_ALIGN, NULL); if (!iova_cache) { mutex_unlock(&iova_cache_mutex); - printk(KERN_ERR "Couldn't create iova cache\n"); + pr_err("Couldn't create iova cache\n"); return -ENOMEM; } } @@ -718,8 +718,8 @@ copy_reserved_iova(struct iova_domain *from, struct iova_domain *to) new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi); if (!new_iova) - printk(KERN_ERR "Reserve iova range %lx@%lx failed\n", - iova->pfn_lo, iova->pfn_lo); + pr_err("Reserve iova range %lx@%lx failed\n", + iova->pfn_lo, iova->pfn_lo); } spin_unlock_irqrestore(&from->iova_rbtree_lock, flags); } -- Gitee From 753b748c8809e53ff1331594d7c60cc55129c45c Mon Sep 17 00:00:00 2001 From: Saeed Mirzamohammadi Date: Mon, 16 Aug 2021 19:39:32 +0800 Subject: [PATCH 0256/2161] iommu/vt-d: Fix agaw for a supported 48 bit guest address width commit 327d5b2fee91c404a3956c324193892cf2cc9528 upstream. The IOMMU driver calculates the guest addressability for a DMA request based on the value of the mgaw reported from the IOMMU. However, this is a fused value and as mentioned in the spec, the guest width should be calculated based on the minimum of supported adjusted guest address width (SAGAW) and MGAW. This is from specification: "Guest addressability for a given DMA request is limited to the minimum of the value reported through this field and the adjusted guest address width of the corresponding page-table structure. (Adjusted guest address widths supported by hardware are reported through the SAGAW field)." 
This causes domain initialization to fail and following errors appear for EHCI PCI driver: [ 2.486393] ehci-pci 0000:01:00.4: EHCI Host Controller [ 2.486624] ehci-pci 0000:01:00.4: new USB bus registered, assigned bus number 1 [ 2.489127] ehci-pci 0000:01:00.4: DMAR: Allocating domain failed [ 2.489350] ehci-pci 0000:01:00.4: DMAR: 32bit DMA uses non-identity mapping [ 2.489359] ehci-pci 0000:01:00.4: can't setup: -12 [ 2.489531] ehci-pci 0000:01:00.4: USB bus 1 deregistered [ 2.490023] ehci-pci 0000:01:00.4: init 0000:01:00.4 fail, -12 [ 2.490358] ehci-pci: probe of 0000:01:00.4 failed with error -12 This issue happens when the value of the sagaw corresponds to a 48-bit agaw. This fix updates the calculation of the agaw based on the minimum of IOMMU's sagaw value and MGAW. This issue happens on the code path of getting a private domain for a device. A private domain was needed when the domain of an iommu group couldn't meet the requirement of a device. The IOMMU core has been evolved to eliminate the need for private domain, hence this code path has alreay been removed from the upstream since commit 327d5b2fee91c ("iommu/vt-d: Allow 32bit devices to uses DMA domain"). Instead of back porting all patches that are required for removing the private domain, this simply fixes it in the affected stable kernel between v4.16 and v5.7. [baolu: The orignal patch could be found here https://lore.kernel.org/linux-iommu/20210412202736.70765-1-saeed.mirzamohammadi@oracle.com/. I added commit message according to Greg's comments at https://lore.kernel.org/linux-iommu/YHZ%2FT9x7Xjf1r6fI@kroah.com/.] Cc: Joerg Roedel Cc: Ashok Raj Cc: stable@vger.kernel.org #v4.16+ Signed-off-by: Saeed Mirzamohammadi Tested-by: Camille Lu Signed-off-by: Lu Baolu Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index a61fa16a861f..92142824d94d 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1939,7 +1939,7 @@ static inline int guestwidth_to_adjustwidth(int gaw) static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu, int guest_width) { - int adjust_width, agaw; + int adjust_width, agaw, cap_width; unsigned long sagaw; int ret; @@ -1955,8 +1955,9 @@ static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu, domain_reserve_special_ranges(domain); /* calculate AGAW */ - if (guest_width > cap_mgaw(iommu->cap)) - guest_width = cap_mgaw(iommu->cap); + cap_width = min_t(int, cap_mgaw(iommu->cap), agaw_to_width(iommu->agaw)); + if (guest_width > cap_width) + guest_width = cap_width; domain->gaw = guest_width; adjust_width = guestwidth_to_adjustwidth(guest_width); agaw = width_to_agaw(adjust_width); -- Gitee From 7bddd3318a2c370558f7009331803b818362fb42 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 6 May 2020 09:59:45 +0800 Subject: [PATCH 0257/2161] iommu/vt-d: Allow 32bit devices to uses DMA domain commit 327d5b2fee91c404a3956c324193892cf2cc9528 upstream. Currently, if a 32bit device initially uses an identity domain, Intel IOMMU driver will convert it forcibly to a DMA one if its address capability is not enough for the whole system memory. The motivation was to overcome the overhead caused by possible bounced buffer. Unfortunately, this improvement has led to many problems. 
For example, some 32bit devices are required to use an identity domain, forcing them to use DMA domain will cause the device not to work anymore. On the other hand, the VMD sub-devices share a domain but each sub-device might have different address capability. Forcing a VMD sub-device to use DMA domain blindly will impact the operation of other sub-devices without any notification. Further more, PCI aliased devices (PCI bridge and all devices beneath it, VMD devices and various devices quirked with pci_add_dma_alias()) must use the same domain. Forcing one device to switch to DMA domain during runtime will cause in-fligh DMAs for other devices to abort or target to other memory which might cause undefind system behavior. With the last private domain usage in iommu_need_mapping() removed, all private domain helpers are also cleaned in this patch. Otherwise, the compiler will complain that some functions are defined but not used. Signed-off-by: Lu Baolu Tested-by: Daniel Drake Reviewed-by: Jon Derrick Reviewed-by: Jerry Snitselaar Cc: Daniel Drake Cc: Derrick Jonathan Cc: Jerry Snitselaar Link: https://lore.kernel.org/r/20200506015947.28662-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 292 +----------------------------------- 1 file changed, 1 insertion(+), 291 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 92142824d94d..33c270927686 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -355,11 +355,6 @@ static void domain_exit(struct dmar_domain *domain); static void domain_remove_dev_info(struct dmar_domain *domain); static void dmar_remove_one_dev_info(struct device *dev); static void __dmar_remove_one_dev_info(struct device_domain_info *info); -static void domain_context_clear(struct intel_iommu *iommu, - struct device *dev); -static int domain_detach_iommu(struct dmar_domain *domain, - struct intel_iommu *iommu); -static bool device_is_rmrr_locked(struct device *dev); static int intel_iommu_attach_device(struct iommu_domain *domain, struct device *dev); static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, @@ -1936,66 +1931,6 @@ static inline int guestwidth_to_adjustwidth(int gaw) return agaw; } -static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu, - int guest_width) -{ - int adjust_width, agaw, cap_width; - unsigned long sagaw; - int ret; - - init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN); - - if (!intel_iommu_strict) { - ret = init_iova_flush_queue(&domain->iovad, - iommu_flush_iova, iova_entry_free); - if (ret) - pr_info("iova flush queue initialization failed\n"); - } - - domain_reserve_special_ranges(domain); - - /* calculate AGAW */ - cap_width = min_t(int, cap_mgaw(iommu->cap), agaw_to_width(iommu->agaw)); - if (guest_width > cap_width) - guest_width = cap_width; - domain->gaw = guest_width; - adjust_width = guestwidth_to_adjustwidth(guest_width); - agaw = width_to_agaw(adjust_width); - sagaw = cap_sagaw(iommu->cap); - if (!test_bit(agaw, &sagaw)) { - /* hardware doesn't support it, choose a bigger one */ - pr_debug("Hardware doesn't support agaw %d\n", agaw); - agaw = find_next_bit(&sagaw, 5, agaw); - if (agaw >= 5) - return -ENODEV; - } - domain->agaw = agaw; - - if (ecap_coherent(iommu->ecap)) - domain->iommu_coherency = 1; - else - domain->iommu_coherency = 0; - - if (ecap_sc_support(iommu->ecap)) - domain->iommu_snooping = 1; - else - domain->iommu_snooping 
= 0; - - if (intel_iommu_superpage) - domain->iommu_superpage = fls(cap_super_page_val(iommu->cap)); - else - domain->iommu_superpage = 0; - - domain->nid = iommu->node; - - /* always allocate the top pgd */ - domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid); - if (!domain->pgd) - return -ENOMEM; - __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE); - return 0; -} - static void domain_exit(struct dmar_domain *domain) { @@ -2726,94 +2661,6 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, return domain; } -static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque) -{ - *(u16 *)opaque = alias; - return 0; -} - -static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw) -{ - struct device_domain_info *info; - struct dmar_domain *domain = NULL; - struct intel_iommu *iommu; - u16 dma_alias; - unsigned long flags; - u8 bus, devfn; - - iommu = device_to_iommu(dev, &bus, &devfn); - if (!iommu) - return NULL; - - if (dev_is_pci(dev)) { - struct pci_dev *pdev = to_pci_dev(dev); - - pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias); - - spin_lock_irqsave(&device_domain_lock, flags); - info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus), - PCI_BUS_NUM(dma_alias), - dma_alias & 0xff); - if (info) { - iommu = info->iommu; - domain = info->domain; - } - spin_unlock_irqrestore(&device_domain_lock, flags); - - /* DMA alias already has a domain, use it */ - if (info) - goto out; - } - - /* Allocate and initialize new domain for the device */ - domain = alloc_domain(0); - if (!domain) - return NULL; - if (domain_init(domain, iommu, gaw)) { - domain_exit(domain); - return NULL; - } - -out: - return domain; -} - -static struct dmar_domain *set_domain_for_dev(struct device *dev, - struct dmar_domain *domain) -{ - struct intel_iommu *iommu; - struct dmar_domain *tmp; - u16 req_id, dma_alias; - u8 bus, devfn; - - iommu = device_to_iommu(dev, &bus, &devfn); - if (!iommu) - return NULL; - - req_id = ((u16)bus << 8) | devfn; - - if (dev_is_pci(dev)) { - struct pci_dev *pdev = to_pci_dev(dev); - - pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias); - - /* register PCI DMA alias device */ - if (req_id != dma_alias) { - tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias), - dma_alias & 0xff, NULL, domain); - - if (!tmp || tmp != domain) - return tmp; - } - } - - tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain); - if (!tmp || tmp != domain) - return tmp; - - return domain; -} - static int iommu_domain_identity_map(struct dmar_domain *domain, unsigned long long start, unsigned long long end) @@ -2839,45 +2686,6 @@ static int iommu_domain_identity_map(struct dmar_domain *domain, DMA_PTE_READ|DMA_PTE_WRITE); } -static int domain_prepare_identity_map(struct device *dev, - struct dmar_domain *domain, - unsigned long long start, - unsigned long long end) -{ - /* For _hardware_ passthrough, don't bother. 
But for software - passthrough, we do it anyway -- it may indicate a memory - range which is reserved in E820, so which didn't get set - up to start with in si_domain */ - if (domain == si_domain && hw_pass_through) { - dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n", - start, end); - return 0; - } - - dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end); - - if (end < start) { - WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n" - "BIOS vendor: %s; Ver: %s; Product Version: %s\n", - dmi_get_system_info(DMI_BIOS_VENDOR), - dmi_get_system_info(DMI_BIOS_VERSION), - dmi_get_system_info(DMI_PRODUCT_VERSION)); - return -EIO; - } - - if (end >> agaw_to_width(domain->agaw)) { - WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n" - "BIOS vendor: %s; Ver: %s; Product Version: %s\n", - agaw_to_width(domain->agaw), - dmi_get_system_info(DMI_BIOS_VENDOR), - dmi_get_system_info(DMI_BIOS_VERSION), - dmi_get_system_info(DMI_PRODUCT_VERSION)); - return -EIO; - } - - return iommu_domain_identity_map(domain, start, end); -} - static int md_domain_init(struct dmar_domain *domain, int guest_width); static int __init si_domain_init(int hw) @@ -3558,98 +3366,16 @@ static unsigned long intel_alloc_iova(struct device *dev, return iova_pfn; } -static struct dmar_domain *get_private_domain_for_dev(struct device *dev) -{ - struct dmar_domain *domain, *tmp; - struct dmar_rmrr_unit *rmrr; - struct device *i_dev; - int i, ret; - - /* Device shouldn't be attached by any domains. */ - domain = find_domain(dev); - if (domain) - return NULL; - - domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH); - if (!domain) - goto out; - - /* We have a new domain - setup possible RMRRs for the device */ - rcu_read_lock(); - for_each_rmrr_units(rmrr) { - for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, - i, i_dev) { - if (i_dev != dev) - continue; - - ret = domain_prepare_identity_map(dev, domain, - rmrr->base_address, - rmrr->end_address); - if (ret) - dev_err(dev, "Mapping reserved region failed\n"); - } - } - rcu_read_unlock(); - - tmp = set_domain_for_dev(dev, domain); - if (!tmp || domain != tmp) { - domain_exit(domain); - domain = tmp; - } - -out: - if (!domain) - dev_err(dev, "Allocating domain failed\n"); - else - domain->domain.type = IOMMU_DOMAIN_DMA; - - return domain; -} - /* Check if the dev needs to go through non-identity map and unmap process.*/ static bool iommu_need_mapping(struct device *dev) { - int ret; - if (iommu_dummy(dev)) return false; if (unlikely(attach_deferred(dev))) do_deferred_attach(dev); - ret = identity_mapping(dev); - if (ret) { - u64 dma_mask = *dev->dma_mask; - - if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask) - dma_mask = dev->coherent_dma_mask; - - if (dma_mask >= dma_direct_get_required_mask(dev)) - return false; - - /* - * 32 bit DMA is removed from si_domain and fall back to - * non-identity mapping. 
- */ - dmar_remove_one_dev_info(dev); - ret = iommu_request_dma_domain_for_dev(dev); - if (ret) { - struct iommu_domain *domain; - struct dmar_domain *dmar_domain; - - domain = iommu_get_domain_for_dev(dev); - if (domain) { - dmar_domain = to_dmar_domain(domain); - dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN; - } - dmar_remove_one_dev_info(dev); - get_private_domain_for_dev(dev); - } - - dev_info(dev, "32bit DMA uses non-identity mapping\n"); - } - - return true; + return !identity_mapping(dev); } static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, @@ -5215,16 +4941,6 @@ int __init intel_iommu_init(void) } up_write(&dmar_global_lock); -#if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB) - /* - * If the system has no untrusted device or the user has decided - * to disable the bounce page mechanisms, we don't need swiotlb. - * Mark this and the pre-allocated bounce pages will be released - * later. - */ - if (!has_untrusted_dev() || intel_no_bounce) - swiotlb = 0; -#endif dma_ops = &intel_dma_ops; init_iommu_pm_ops(); @@ -5326,12 +5042,6 @@ static void __dmar_remove_one_dev_info(struct device_domain_info *info) domain_detach_iommu(domain, iommu); spin_unlock_irqrestore(&iommu->lock, flags); - /* free the private domain */ - if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN && - !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) && - list_empty(&domain->devices)) - domain_exit(info->domain); - free_devinfo_mem(info); } -- Gitee From 49f0c9079070b91ca4b8a610cec47c55bfce3994 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 6 May 2020 09:59:46 +0800 Subject: [PATCH 0258/2161] iommu/vt-d: Allow PCI sub-hierarchy to use DMA domain commit 14b3526d5909f01e1d1baa05f50952788bb7418e upstream. Before commit fa954e6831789 ("iommu/vt-d: Delegate the dma domain to upper layer"), Intel IOMMU started off with all devices in the identity domain, and took them out later if it found they couldn't access all of memory. This required devices behind a PCI bridge to use a DMA domain at the beginning because all PCI devices behind the bridge use the same source-id in their transactions and the domain couldn't be changed at run-time. Intel IOMMU driver is now aligned with the default domain framework, there's no need to keep this requirement anymore. Signed-off-by: Lu Baolu Tested-by: Daniel Drake Reviewed-by: Jon Derrick Reviewed-by: Jerry Snitselaar Link: https://lore.kernel.org/r/20200506015947.28662-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 33c270927686..9bfdec192600 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2878,31 +2878,6 @@ static int device_def_domain_type(struct device *dev) if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) return IOMMU_DOMAIN_IDENTITY; - - /* - * We want to start off with all devices in the 1:1 domain, and - * take them out later if we find they can't access all of memory. - * - * However, we can't do this for PCI devices behind bridges, - * because all PCI devices behind the same bridge will end up - * with the same source-id on their transactions. - * - * Practically speaking, we can't change things around for these - * devices at run-time, because we can't be sure there'll be no - * DMA transactions in flight for any of their siblings. 
- * - * So PCI devices (unless they're on the root bus) as well as - * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of - * the 1:1 domain, just in _case_ one of their siblings turns out - * not to be able to map all of memory. - */ - if (!pci_is_pcie(pdev)) { - if (!pci_is_root_bus(pdev->bus)) - return IOMMU_DOMAIN_DMA; - if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI) - return IOMMU_DOMAIN_DMA; - } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE) - return IOMMU_DOMAIN_DMA; } return 0; -- Gitee From d85e5f70eb58a8137f005b3815454a955646e9ee Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 6 May 2020 09:59:47 +0800 Subject: [PATCH 0259/2161] iommu/vt-d: Apply per-device dma_ops commit 6fc7020cf298aaec343df423746b44d99c6efaa5 upstream. Current Intel IOMMU driver sets the system level dma_ops. This causes each dma API to go through the IOMMU driver even the devices are using identity mapped domains. This sets per-device dma_ops only if a device is using a DMA domain. Otherwise, use the default system level dma_ops for direct dma. Signed-off-by: Lu Baolu Tested-by: Daniel Drake Reviewed-by: Jon Derrick Reviewed-by: Jerry Snitselaar Link: https://lore.kernel.org/r/20200506015947.28662-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 82 ++++++++++++------------------------- 1 file changed, 26 insertions(+), 56 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 9bfdec192600..316b5d9192ca 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2741,17 +2741,6 @@ static int __init si_domain_init(int hw) return 0; } -static int identity_mapping(struct device *dev) -{ - struct device_domain_info *info; - - info = dev->archdata.iommu; - if (info) - return (info->domain == si_domain); - - return 0; -} - static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) { struct dmar_domain *ndomain; @@ -3341,18 +3330,6 @@ static unsigned long intel_alloc_iova(struct device *dev, return iova_pfn; } -/* Check if the dev needs to go through non-identity map and unmap process.*/ -static bool iommu_need_mapping(struct device *dev) -{ - if (iommu_dummy(dev)) - return false; - - if (unlikely(attach_deferred(dev))) - do_deferred_attach(dev); - - return !identity_mapping(dev); -} - static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir, u64 dma_mask) { @@ -3366,6 +3343,9 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, BUG_ON(dir == DMA_NONE); + if (unlikely(attach_deferred(dev))) + do_deferred_attach(dev); + domain = find_domain(dev); if (!domain) return DMA_MAPPING_ERROR; @@ -3417,20 +3397,15 @@ static dma_addr_t intel_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, unsigned long attrs) { - if (iommu_need_mapping(dev)) - return __intel_map_single(dev, page_to_phys(page) + offset, - size, dir, *dev->dma_mask); - return dma_direct_map_page(dev, page, offset, size, dir, attrs); + return __intel_map_single(dev, page_to_phys(page) + offset, + size, dir, *dev->dma_mask); } static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - if (iommu_need_mapping(dev)) - return __intel_map_single(dev, phys_addr, size, dir, - *dev->dma_mask); - return dma_direct_map_resource(dev, phys_addr, size, dir, attrs); + return __intel_map_single(dev, phys_addr, 
size, dir, *dev->dma_mask); } static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size) @@ -3481,17 +3456,13 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - if (iommu_need_mapping(dev)) - intel_unmap(dev, dev_addr, size); - else - dma_direct_unmap_page(dev, dev_addr, size, dir, attrs); + intel_unmap(dev, dev_addr, size); } static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - if (iommu_need_mapping(dev)) - intel_unmap(dev, dev_addr, size); + intel_unmap(dev, dev_addr, size); } static void *intel_alloc_coherent(struct device *dev, size_t size, @@ -3501,8 +3472,8 @@ static void *intel_alloc_coherent(struct device *dev, size_t size, struct page *page = NULL; int order; - if (!iommu_need_mapping(dev)) - return dma_direct_alloc(dev, size, dma_handle, flags, attrs); + if (unlikely(attach_deferred(dev))) + do_deferred_attach(dev); size = PAGE_ALIGN(size); order = get_order(size); @@ -3537,9 +3508,6 @@ static void intel_free_coherent(struct device *dev, size_t size, void *vaddr, int order; struct page *page = virt_to_page(vaddr); - if (!iommu_need_mapping(dev)) - return dma_direct_free(dev, size, vaddr, dma_handle, attrs); - size = PAGE_ALIGN(size); order = get_order(size); @@ -3557,9 +3525,6 @@ static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist, struct scatterlist *sg; int i; - if (!iommu_need_mapping(dev)) - return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs); - for_each_sg(sglist, sg, nelems, i) { nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg)); } @@ -3583,8 +3548,9 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele struct intel_iommu *iommu; BUG_ON(dir == DMA_NONE); - if (!iommu_need_mapping(dev)) - return dma_direct_map_sg(dev, sglist, nelems, dir, attrs); + + if (unlikely(attach_deferred(dev))) + do_deferred_attach(dev); domain = find_domain(dev); if (!domain) @@ -3631,8 +3597,6 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele static u64 intel_get_required_mask(struct device *dev) { - if (!iommu_need_mapping(dev)) - return dma_direct_get_required_mask(dev); return DMA_BIT_MASK(32); } @@ -4916,8 +4880,6 @@ int __init intel_iommu_init(void) } up_write(&dmar_global_lock); - dma_ops = &intel_dma_ops; - init_iommu_pm_ops(); down_read(&dmar_global_lock); @@ -5508,11 +5470,6 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) if (translation_pre_enabled(iommu)) dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO; - if (device_needs_bounce(dev)) { - dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n"); - set_dma_ops(dev, &bounce_dma_ops); - } - return &iommu->iommu; } @@ -5527,7 +5484,19 @@ static void intel_iommu_release_device(struct device *dev) dmar_remove_one_dev_info(dev); + set_dma_ops(dev, NULL); +} + +static void intel_iommu_probe_finalize(struct device *dev) +{ + struct iommu_domain *domain; + + domain = iommu_get_domain_for_dev(dev); if (device_needs_bounce(dev)) + set_dma_ops(dev, &bounce_dma_ops); + else if (domain && domain->type == IOMMU_DOMAIN_DMA) + set_dma_ops(dev, &intel_dma_ops); + else set_dma_ops(dev, NULL); } @@ -5876,6 +5845,7 @@ const struct iommu_ops intel_iommu_ops = { .unmap = intel_iommu_unmap, .iova_to_phys = intel_iommu_iova_to_phys, .probe_device = intel_iommu_probe_device, + .probe_finalize = intel_iommu_probe_finalize, .release_device = 
intel_iommu_release_device, .get_resv_regions = intel_iommu_get_resv_regions, .put_resv_regions = generic_iommu_put_resv_regions, -- Gitee From ffc23e34ad8b485cebb2605d88f63abbf4bc38c9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 7 May 2020 19:18:02 +0300 Subject: [PATCH 0260/2161] iommu/vt-d: Unify format of the printed messages commit 8627892af6cb80f80acab066e26851d7b5fb3ca6 upstream. Unify format of the printed messages, i.e. replace printk(LEVEL ... ) with pr_level(...). Signed-off-by: Andy Shevchenko Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20200507161804.13275-1-andriy.shevchenko@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 316b5d9192ca..d0f3cd743828 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -475,8 +475,7 @@ static int __init intel_iommu_setup(char *str) pr_info("Intel-IOMMU: scalable mode supported\n"); intel_iommu_sm = 1; } else if (!strncmp(str, "tboot_noforce", 13)) { - printk(KERN_INFO - "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); + pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); intel_iommu_tboot_noforce = 1; } else if (!strncmp(str, "nobounce", 8)) { pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n"); -- Gitee From 9a7bdb813f9bcfd28ca658ddeddedcdd65abc4fc Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 7 May 2020 19:18:04 +0300 Subject: [PATCH 0261/2161] iommu/amd: Unify format of the printed messages commit 555fb5ae0f39962417d35e02d77ee9b2c14a5428 upstream. Unify format of the printed messages, i.e. replace printk(LEVEL ... ) with pr_level(...). Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200507161804.13275-3-andriy.shevchenko@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu_types.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 36e4aa460bb3..36007799800d 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -395,10 +395,10 @@ #define PD_IOMMUV2_MASK (1UL << 3) /* domain has gcr3 table */ extern bool amd_iommu_dump; -#define DUMP_printk(format, arg...) \ - do { \ - if (amd_iommu_dump) \ - printk(KERN_INFO "AMD-Vi: " format, ## arg); \ +#define DUMP_printk(format, arg...) \ + do { \ + if (amd_iommu_dump) \ + pr_info("AMD-Vi: " format, ## arg); \ } while(0); /* global flag if IOMMUs cache non-present entries */ -- Gitee From 11df26f38afc34198a237727ebd1172ff595ca51 Mon Sep 17 00:00:00 2001 From: Sai Praneeth Prakhya Date: Wed, 13 May 2020 15:47:21 -0700 Subject: [PATCH 0262/2161] iommu: Remove functions that support private domain commit 69cf449166987d9a041020be6422ee7bf94a7228 upstream. After moving iommu_group setup to iommu core code [1][2] and removing private domain support in vt-d [3], there are no users for functions such as iommu_request_dm_for_dev(), iommu_request_dma_domain_for_dev() and request_default_domain_for_dev(). So, remove these functions. 
[1] commit dce8d6964ebd ("iommu/amd: Convert to probe/release_device() call-backs") [2] commit e5d1841f18b2 ("iommu/vt-d: Convert to probe/release_device() call-backs") [3] commit 327d5b2fee91 ("iommu/vt-d: Allow 32bit devices to uses DMA domain") Signed-off-by: Sai Praneeth Prakhya Cc: Joerg Roedel Cc: Lu Baolu Link: https://lore.kernel.org/r/20200513224721.20504-1-sai.praneeth.prakhya@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 65 ------------------------------------------- include/linux/iommu.h | 12 -------- 2 files changed, 77 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 260ea10626e4..9e831d328afa 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2545,71 +2545,6 @@ struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start, } EXPORT_SYMBOL_GPL(iommu_alloc_resv_region); -static int -request_default_domain_for_dev(struct device *dev, unsigned long type) -{ - struct iommu_domain *domain; - struct iommu_group *group; - int ret; - - /* Device must already be in a group before calling this function */ - group = iommu_group_get(dev); - if (!group) - return -EINVAL; - - mutex_lock(&group->mutex); - - ret = 0; - if (group->default_domain && group->default_domain->type == type) - goto out; - - /* Don't change mappings of existing devices */ - ret = -EBUSY; - if (iommu_group_device_count(group) != 1) - goto out; - - ret = -ENOMEM; - domain = __iommu_domain_alloc(dev->bus, type); - if (!domain) - goto out; - - /* Attach the device to the domain */ - ret = __iommu_attach_group(domain, group); - if (ret) { - iommu_domain_free(domain); - goto out; - } - - /* Make the domain the default for this group */ - if (group->default_domain) - iommu_domain_free(group->default_domain); - group->default_domain = domain; - - iommu_create_device_direct_mappings(group, dev); - - dev_info(dev, "Using iommu %s mapping\n", - type == IOMMU_DOMAIN_DMA ? 
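The helper being moved is a plain container_of() conversion from the generic iommu_domain to the driver-private dmar_domain. A minimal stand-alone illustration of the idiom, with simplified stand-in types (not the kernel's definitions):

#include <stddef.h>
#include <stdio.h>

/* Simplified stand-ins for struct iommu_domain / struct dmar_domain. */
struct iommu_domain { int type; };

struct dmar_domain {
	int nid;
	struct iommu_domain domain;	/* generic part embedded in the private struct */
};

/* Same idiom as to_dmar_domain(): recover the enclosing private struct
 * from a pointer to its embedded generic member. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
{
	return container_of(dom, struct dmar_domain, domain);
}

int main(void)
{
	struct dmar_domain d = { .nid = 1 };
	struct iommu_domain *generic = &d.domain;

	/* Prints 1: we recovered the private wrapper around 'generic'. */
	printf("%d\n", to_dmar_domain(generic)->nid);
	return 0;
}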
"dma" : "direct"); - - ret = 0; -out: - mutex_unlock(&group->mutex); - iommu_group_put(group); - - return ret; -} - -/* Request that a device is direct mapped by the IOMMU */ -int iommu_request_dm_for_dev(struct device *dev) -{ - return request_default_domain_for_dev(dev, IOMMU_DOMAIN_IDENTITY); -} - -/* Request that a device can't be direct mapped by the IOMMU */ -int iommu_request_dma_domain_for_dev(struct device *dev) -{ - return request_default_domain_for_dev(dev, IOMMU_DOMAIN_DMA); -} - void iommu_set_default_passthrough(bool cmd_line) { if (cmd_line) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 64610192a5cd..96acc9731fd7 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -481,8 +481,6 @@ extern void iommu_get_resv_regions(struct device *dev, struct list_head *list); extern void iommu_put_resv_regions(struct device *dev, struct list_head *list); extern void generic_iommu_put_resv_regions(struct device *dev, struct list_head *list); -extern int iommu_request_dm_for_dev(struct device *dev); -extern int iommu_request_dma_domain_for_dev(struct device *dev); extern void iommu_set_default_passthrough(bool cmd_line); extern void iommu_set_default_translated(bool cmd_line); extern bool iommu_default_passthrough(void); @@ -801,16 +799,6 @@ static inline int iommu_get_group_resv_regions(struct iommu_group *group, return -ENODEV; } -static inline int iommu_request_dm_for_dev(struct device *dev) -{ - return -ENODEV; -} - -static inline int iommu_request_dma_domain_for_dev(struct device *dev) -{ - return -ENODEV; -} - static inline void iommu_set_default_passthrough(bool cmd_line) { } -- Gitee From 95ae5fcf5a4774f9468a018815b5f71b3cf0d413 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:44 +0800 Subject: [PATCH 0263/2161] iommu/vt-d: Move domain helper to header commit 3db9983e4327f773c490de2a8c66d6000561d88a upstream. Move domain helper to header to be used by SVA code. 
Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200516062101.29541-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 6 ------ include/linux/intel-iommu.h | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index d0f3cd743828..a643cc081b86 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -441,12 +441,6 @@ static void init_translation_status(struct intel_iommu *iommu) iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; } -/* Convert generic 'struct iommu_domain to private struct dmar_domain */ -static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom) -{ - return container_of(dom, struct dmar_domain, domain); -} - static int __init intel_iommu_setup(char *str) { if (!str) diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index a7c6e63a39f0..a136bb504a60 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -598,6 +598,12 @@ static inline void __iommu_flush_cache( clflush_cache_range(addr, size); } +/* Convert generic struct iommu_domain to private struct dmar_domain */ +static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom) +{ + return container_of(dom, struct dmar_domain, domain); +} + /* * 0: readable * 1: writable -- Gitee From 9955bf3fcdb7109d77bbb9dd33f90bd10cad8e7c Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:45 +0800 Subject: [PATCH 0264/2161] iommu/vt-d: Use a helper function to skip agaw for SL commit 3aef9ca6a42aca8c797f867e554ad297b446439f upstream. An Intel iommu domain uses 5-level page table by default. If the iommu that the domain tries to attach supports less page levels, the top level page tables should be skipped. Add a helper to do this so that it could be used in other places. Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200516062101.29541-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-pasid.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index 22b30f10b396..d9cea3011b58 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -499,6 +499,25 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, return 0; } +/* + * Skip top levels of page tables for iommu which has less agaw + * than default. Unnecessary for PT mode. + */ +static inline int iommu_skip_agaw(struct dmar_domain *domain, + struct intel_iommu *iommu, + struct dma_pte **pgd) +{ + int agaw; + + for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { + *pgd = phys_to_virt(dma_pte_addr(*pgd)); + if (!dma_pte_present(*pgd)) + return -EINVAL; + } + + return agaw; +} + /* * Set up the scalable mode pasid entry for second only translation type. */ @@ -522,17 +541,11 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return -EINVAL; } - /* - * Skip top levels of page tables for iommu which has less agaw - * than default. Unnecessary for PT mode. 
- */ pgd = domain->pgd; - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) { - dev_err(dev, "Invalid domain page table\n"); - return -EINVAL; - } + agaw = iommu_skip_agaw(domain, iommu, &pgd); + if (agaw < 0) { + dev_err(dev, "Invalid domain page table\n"); + return -EINVAL; } pgd_val = virt_to_phys(pgd); -- Gitee From 93253538255c47d9ad863e9a01e0b3a8b739bddc Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:46 +0800 Subject: [PATCH 0265/2161] iommu/vt-d: Add nested translation helper function commit b0d1f8741b812352fe0e5f3b2381427085f23e19 upstream. Nested translation mode is supported in VT-d 3.0 Spec.CH 3.8. With PASID granular translation type set to 0x11b, translation result from the first level(FL) also subject to a second level(SL) page table translation. This mode is used for SVA virtualization, where FL performs guest virtual to guest physical translation and SL performs guest physical to host physical translation. This patch adds a helper function for setting up nested translation where second level comes from a domain and first level comes from a guest PGD. Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200516062101.29541-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 25 ------ drivers/iommu/intel-pasid.c | 174 +++++++++++++++++++++++++++++++++++- drivers/iommu/intel-pasid.h | 10 +++ include/linux/intel-iommu.h | 20 +++++ include/uapi/linux/iommu.h | 5 ++ 5 files changed, 206 insertions(+), 28 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index a643cc081b86..d2792cd5c02a 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -296,31 +296,6 @@ static inline void context_clear_entry(struct context_entry *context) static struct dmar_domain *si_domain; static int hw_pass_through = 1; -/* si_domain contains mulitple devices */ -#define DOMAIN_FLAG_STATIC_IDENTITY BIT(0) - -/* - * This is a DMA domain allocated through the iommu domain allocation - * interface. But one or more devices belonging to this domain have - * been chosen to use a private domain. We should avoid to use the - * map/unmap/iova_to_phys APIs on it. - */ -#define DOMAIN_FLAG_LOSE_CHILDREN BIT(1) - -/* - * When VT-d works in the scalable mode, it allows DMA translation to - * happen through either first level or second level page table. This - * bit marks that the DMA translation for the domain goes through the - * first level page table, otherwise, it goes through the second level. - */ -#define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(2) - -/* - * Domain represents a virtual machine which demands iommu nested - * translation mode support. - */ -#define DOMAIN_FLAG_NESTING_MODE BIT(3) - #define for_each_domain_iommu(idx, domain) \ for (idx = 0; idx < g_num_of_iommus; idx++) \ if (domain->iommu_refcnt[idx]) diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index d9cea3011b58..c7fa1b79eaf7 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -359,6 +359,16 @@ pasid_set_flpm(struct pasid_entry *pe, u64 value) pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2); } +/* + * Setup the Extended Access Flag Enable (EAFE) field (Bit 135) + * of a scalable mode PASID entry. 
+ */ +static inline void +pasid_set_eafe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7); +} + static void pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, int pasid) @@ -492,7 +502,7 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); /* Setup Present and PASID Granular Transfer Type: */ - pasid_set_translation_type(pte, 1); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); pasid_set_present(pte); pasid_flush_caches(iommu, pte, pasid, did); @@ -561,7 +571,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, pasid_set_domain_id(pte, did); pasid_set_slptr(pte, pgd_val); pasid_set_address_width(pte, agaw); - pasid_set_translation_type(pte, 2); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); @@ -595,7 +605,7 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, pasid_clear_entry(pte); pasid_set_domain_id(pte, did); pasid_set_address_width(pte, iommu->agaw); - pasid_set_translation_type(pte, 4); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); @@ -609,3 +619,161 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return 0; } + +static int +intel_pasid_setup_bind_data(struct intel_iommu *iommu, struct pasid_entry *pte, + struct iommu_gpasid_bind_data_vtd *pasid_data) +{ + /* + * Not all guest PASID table entry fields are passed down during bind, + * here we only set up the ones that are dependent on guest settings. + * Execution related bits such as NXE, SMEP are not supported. + * Other fields, such as snoop related, are set based on host needs + * regardless of guest settings. + */ + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_SRE) { + if (!ecap_srs(iommu->ecap)) { + pr_err_ratelimited("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; + } + pasid_set_sre(pte); + } + + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_EAFE) { + if (!ecap_eafs(iommu->ecap)) { + pr_err_ratelimited("No extended access flag support on %s\n", + iommu->name); + return -EINVAL; + } + pasid_set_eafe(pte); + } + + /* + * Memory type is only applicable to devices inside processor coherent + * domain. Will add MTS support once coherent devices are available. + */ + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_MTS_MASK) { + pr_warn_ratelimited("No memory type support %s\n", + iommu->name); + return -EINVAL; + } + + return 0; +} + +/** + * intel_pasid_setup_nested() - Set up PASID entry for nested translation. + * This could be used for guest shared virtual address. In this case, the + * first level page tables are used for GVA-GPA translation in the guest, + * second level page tables are used for GPA-HPA translation. 
+ * + * @iommu: IOMMU which the device belong to + * @dev: Device to be set up for translation + * @gpgd: FLPTPTR: First Level Page translation pointer in GPA + * @pasid: PASID to be programmed in the device PASID table + * @pasid_data: Additional PASID info from the guest bind request + * @domain: Domain info for setting up second level page tables + * @addr_width: Address width of the first level (guest) + */ +int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, + pgd_t *gpgd, int pasid, + struct iommu_gpasid_bind_data_vtd *pasid_data, + struct dmar_domain *domain, int addr_width) +{ + struct pasid_entry *pte; + struct dma_pte *pgd; + int ret = 0; + u64 pgd_val; + int agaw; + u16 did; + + if (!ecap_nest(iommu->ecap)) { + pr_err_ratelimited("IOMMU: %s: No nested translation support\n", + iommu->name); + return -EINVAL; + } + + if (!(domain->flags & DOMAIN_FLAG_NESTING_MODE)) { + pr_err_ratelimited("Domain is not in nesting mode, %x\n", + domain->flags); + return -EINVAL; + } + + pte = intel_pasid_get_entry(dev, pasid); + if (WARN_ON(!pte)) + return -EINVAL; + + /* + * Caller must ensure PASID entry is not in use, i.e. not bind the + * same PASID to the same device twice. + */ + if (pasid_pte_is_present(pte)) + return -EBUSY; + + pasid_clear_entry(pte); + + /* Sanity checking performed by caller to make sure address + * width matching in two dimensions: + * 1. CPU vs. IOMMU + * 2. Guest vs. Host. + */ + switch (addr_width) { +#ifdef CONFIG_X86 + case ADDR_WIDTH_5LEVEL: + if (!cpu_feature_enabled(X86_FEATURE_LA57) || + !cap_5lp_support(iommu->cap)) { + dev_err_ratelimited(dev, + "5-level paging not supported\n"); + return -EINVAL; + } + + pasid_set_flpm(pte, 1); + break; +#endif + case ADDR_WIDTH_4LEVEL: + pasid_set_flpm(pte, 0); + break; + default: + dev_err_ratelimited(dev, "Invalid guest address width %d\n", + addr_width); + return -EINVAL; + } + + /* First level PGD is in GPA, must be supported by the second level */ + if ((unsigned long long)gpgd > domain->max_addr) { + dev_err_ratelimited(dev, + "Guest PGD %llx not supported, max %llx\n", + (unsigned long long)gpgd, domain->max_addr); + return -EINVAL; + } + pasid_set_flptr(pte, (u64)gpgd); + + ret = intel_pasid_setup_bind_data(iommu, pte, pasid_data); + if (ret) + return ret; + + /* Setup the second level based on the given domain */ + pgd = domain->pgd; + + agaw = iommu_skip_agaw(domain, iommu, &pgd); + if (agaw < 0) { + dev_err_ratelimited(dev, "Invalid domain page table\n"); + return -EINVAL; + } + pgd_val = virt_to_phys(pgd); + pasid_set_slptr(pte, pgd_val); + pasid_set_fault_enable(pte); + + did = domain->iommu_did[iommu->seq_id]; + pasid_set_domain_id(pte, did); + + pasid_set_address_width(pte, agaw); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); + pasid_set_present(pte); + pasid_flush_caches(iommu, pte, pasid, did); + + return ret; +} diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h index 92de6df24ccb..ccd50c2ae75c 100644 --- a/drivers/iommu/intel-pasid.h +++ b/drivers/iommu/intel-pasid.h @@ -36,6 +36,7 @@ * to vmalloc or even module mappings. 
*/ #define PASID_FLAG_SUPERVISOR_MODE BIT(0) +#define PASID_FLAG_NESTED BIT(1) /* * The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first- @@ -51,6 +52,11 @@ struct pasid_entry { u64 val[8]; }; +#define PASID_ENTRY_PGTT_FL_ONLY (1) +#define PASID_ENTRY_PGTT_SL_ONLY (2) +#define PASID_ENTRY_PGTT_NESTED (3) +#define PASID_ENTRY_PGTT_PT (4) + /* The representative of a PASID table */ struct pasid_table { void *table; /* pasid table pointer */ @@ -99,6 +105,10 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, int pasid); +int intel_pasid_setup_nested(struct intel_iommu *iommu, + struct device *dev, pgd_t *pgd, int pasid, + struct iommu_gpasid_bind_data_vtd *pasid_data, + struct dmar_domain *domain, int addr_width); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, int pasid); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index a136bb504a60..eba6c818dbf2 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -42,6 +42,9 @@ #define DMA_FL_PTE_PRESENT BIT_ULL(0) #define DMA_FL_PTE_XD BIT_ULL(63) +#define ADDR_WIDTH_5LEVEL (57) +#define ADDR_WIDTH_4LEVEL (48) + #define CONTEXT_TT_MULTI_LEVEL 0 #define CONTEXT_TT_DEV_IOTLB 1 #define CONTEXT_TT_PASS_THROUGH 2 @@ -480,6 +483,23 @@ struct context_entry { u64 hi; }; +/* si_domain contains mulitple devices */ +#define DOMAIN_FLAG_STATIC_IDENTITY BIT(0) + +/* + * When VT-d works in the scalable mode, it allows DMA translation to + * happen through either first level or second level page table. This + * bit marks that the DMA translation for the domain goes through the + * first level page table, otherwise, it goes through the second level. + */ +#define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(1) + +/* + * Domain represents a virtual machine which demands iommu nested + * translation mode support. + */ +#define DOMAIN_FLAG_NESTING_MODE BIT(2) + struct dmar_domain { int nid; /* node id */ diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 4ad3496e5c43..e907b7091a46 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -285,6 +285,11 @@ struct iommu_gpasid_bind_data_vtd { __u32 emt; }; +#define IOMMU_SVA_VTD_GPASID_MTS_MASK (IOMMU_SVA_VTD_GPASID_CD | \ + IOMMU_SVA_VTD_GPASID_EMTE | \ + IOMMU_SVA_VTD_GPASID_PCD | \ + IOMMU_SVA_VTD_GPASID_PWT) + /** * struct iommu_gpasid_bind_data - Information about device and guest PASID binding * @version: Version of this data structure -- Gitee From 5ce8702391cb1d2a5efa38d026f07330c86548f1 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:47 +0800 Subject: [PATCH 0266/2161] iommu/vt-d: Add bind guest PASID support commit 56722a4398a306585ca3ed39ff54fc907af98618 upstream. When supporting guest SVA with emulated IOMMU, the guest PASID table is shadowed in VMM. Updates to guest vIOMMU PASID table will result in PASID cache flush which will be passed down to the host as bind guest PASID calls. For the SL page tables, it will be harvested from device's default domain (request w/o PASID), or aux domain in case of mediated device. .-------------. .---------------------------. | vIOMMU | | Guest process CR3, FL only| | | '---------------------------' .----------------/ | PASID Entry |--- PASID cache flush - '-------------' | | | V | | CR3 in GPA '-------------' Guest ------| Shadow |--------------------------|-------- v v v Host .-------------. 
.----------------------. | pIOMMU | | Bind FL for GVA-GPA | | | '----------------------' .----------------/ | | PASID Entry | V (Nested xlate) '----------------\.------------------------------. | | |SL for GPA-HPA, default domain| | | '------------------------------' '-------------' Where: - FL = First level/stage one page tables - SL = Second level/stage two page tables Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 4 + drivers/iommu/intel-svm.c | 200 ++++++++++++++++++++++++++++++++++++ include/linux/intel-iommu.h | 6 +- include/linux/intel-svm.h | 12 +++ 4 files changed, 221 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index d2792cd5c02a..96857b3c47b8 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5826,6 +5826,10 @@ const struct iommu_ops intel_iommu_ops = { .is_attach_deferred = intel_iommu_is_attach_deferred, .def_domain_type = device_def_domain_type, .pgsize_bitmap = INTEL_IOMMU_PGSIZES, +#ifdef CONFIG_INTEL_IOMMU_SVM + .sva_bind_gpasid = intel_svm_bind_gpasid, + .sva_unbind_gpasid = intel_svm_unbind_gpasid, +#endif }; static void quirk_iommu_igfx(struct pci_dev *dev) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 517ed2cd5eaa..953963783f46 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -244,6 +244,206 @@ static LIST_HEAD(global_svm_list); list_for_each_entry((sdev), &(svm)->devs, list) \ if ((d) != (sdev)->dev) {} else +int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, + struct iommu_gpasid_bind_data *data) +{ + struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); + struct dmar_domain *dmar_domain; + struct intel_svm_dev *sdev; + struct intel_svm *svm; + int ret = 0; + + if (WARN_ON(!iommu) || !data) + return -EINVAL; + + if (data->version != IOMMU_GPASID_BIND_VERSION_1 || + data->format != IOMMU_PASID_FORMAT_INTEL_VTD) + return -EINVAL; + + if (!dev_is_pci(dev)) + return -ENOTSUPP; + + /* VT-d supports devices with full 20 bit PASIDs only */ + if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX) + return -EINVAL; + + /* + * We only check host PASID range, we have no knowledge to check + * guest PASID range. + */ + if (data->hpasid <= 0 || data->hpasid >= PASID_MAX) + return -EINVAL; + + dmar_domain = to_dmar_domain(domain); + + mutex_lock(&pasid_mutex); + svm = ioasid_find(NULL, data->hpasid, NULL); + if (IS_ERR(svm)) { + ret = PTR_ERR(svm); + goto out; + } + + if (svm) { + /* + * If we found svm for the PASID, there must be at + * least one device bond, otherwise svm should be freed. + */ + if (WARN_ON(list_empty(&svm->devs))) { + ret = -EINVAL; + goto out; + } + + for_each_svm_dev(sdev, svm, dev) { + /* + * For devices with aux domains, we should allow + * multiple bind calls with the same PASID and pdev. + */ + if (iommu_dev_feature_enabled(dev, + IOMMU_DEV_FEAT_AUX)) { + sdev->users++; + } else { + dev_warn_ratelimited(dev, + "Already bound with PASID %u\n", + svm->pasid); + ret = -EBUSY; + } + goto out; + } + } else { + /* We come here when PASID has never been bond to a device. */ + svm = kzalloc(sizeof(*svm), GFP_KERNEL); + if (!svm) { + ret = -ENOMEM; + goto out; + } + /* REVISIT: upper layer/VFIO can track host process that bind + * the PASID. 
ioasid_set = mm might be sufficient for vfio to + * check pasid VMM ownership. We can drop the following line + * once VFIO and IOASID set check is in place. + */ + svm->mm = get_task_mm(current); + svm->pasid = data->hpasid; + if (data->flags & IOMMU_SVA_GPASID_VAL) { + svm->gpasid = data->gpasid; + svm->flags |= SVM_FLAG_GUEST_PASID; + } + ioasid_set_data(data->hpasid, svm); + INIT_LIST_HEAD_RCU(&svm->devs); + mmput(svm->mm); + } + sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); + if (!sdev) { + ret = -ENOMEM; + goto out; + } + sdev->dev = dev; + + /* Only count users if device has aux domains */ + if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)) + sdev->users = 1; + + /* Set up device context entry for PASID if not enabled already */ + ret = intel_iommu_enable_pasid(iommu, sdev->dev); + if (ret) { + dev_err_ratelimited(dev, "Failed to enable PASID capability\n"); + kfree(sdev); + goto out; + } + + /* + * PASID table is per device for better security. Therefore, for + * each bind of a new device even with an existing PASID, we need to + * call the nested mode setup function here. + */ + spin_lock(&iommu->lock); + ret = intel_pasid_setup_nested(iommu, dev, (pgd_t *)data->gpgd, + data->hpasid, &data->vtd, dmar_domain, + data->addr_width); + spin_unlock(&iommu->lock); + if (ret) { + dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n", + data->hpasid, ret); + /* + * PASID entry should be in cleared state if nested mode + * set up failed. So we only need to clear IOASID tracking + * data such that free call will succeed. + */ + kfree(sdev); + goto out; + } + + svm->flags |= SVM_FLAG_GUEST_MODE; + + init_rcu_head(&sdev->rcu); + list_add_rcu(&sdev->list, &svm->devs); + out: + if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) { + ioasid_set_data(data->hpasid, NULL); + kfree(svm); + } + + mutex_unlock(&pasid_mutex); + return ret; +} + +int intel_svm_unbind_gpasid(struct device *dev, int pasid) +{ + struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); + struct intel_svm_dev *sdev; + struct intel_svm *svm; + int ret = -EINVAL; + + if (WARN_ON(!iommu)) + return -EINVAL; + + mutex_lock(&pasid_mutex); + svm = ioasid_find(NULL, pasid, NULL); + if (!svm) { + ret = -EINVAL; + goto out; + } + + if (IS_ERR(svm)) { + ret = PTR_ERR(svm); + goto out; + } + + for_each_svm_dev(sdev, svm, dev) { + ret = 0; + if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)) + sdev->users--; + if (!sdev->users) { + list_del_rcu(&sdev->list); + intel_pasid_tear_down_entry(iommu, dev, svm->pasid); + intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); + /* TODO: Drain in flight PRQ for the PASID since it + * may get reused soon, we don't want to + * confuse with its previous life. + * intel_svm_drain_prq(dev, pasid); + */ + kfree_rcu(sdev, rcu); + + if (list_empty(&svm->devs)) { + /* + * We do not free the IOASID here in that + * IOMMU driver did not allocate it. + * Unlike native SVM, IOASID for guest use was + * allocated prior to the bind call. + * In any case, if the free call comes before + * the unbind, IOMMU driver will get notified + * and perform cleanup. 
+ */ + ioasid_set_data(pasid, NULL); + kfree(svm); + } + } + break; + } +out: + mutex_unlock(&pasid_mutex); + return ret; +} + int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops) { struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index eba6c818dbf2..33257afcff99 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -701,7 +701,9 @@ struct dmar_domain *find_domain(struct device *dev); extern void intel_svm_check(struct intel_iommu *iommu); extern int intel_svm_enable_prq(struct intel_iommu *iommu); extern int intel_svm_finish_prq(struct intel_iommu *iommu); - +int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, + struct iommu_gpasid_bind_data *data); +int intel_svm_unbind_gpasid(struct device *dev, int pasid); struct svm_dev_ops; struct intel_svm_dev { @@ -718,9 +720,11 @@ struct intel_svm_dev { struct intel_svm { struct mmu_notifier notifier; struct mm_struct *mm; + struct intel_iommu *iommu; int flags; int pasid; + int gpasid; /* In case that guest PASID is different from host PASID */ struct list_head devs; struct list_head list; }; diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index d7c403d0dd27..1b47ca46373e 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -44,6 +44,18 @@ struct svm_dev_ops { * do such IOTLB flushes automatically. */ #define SVM_FLAG_SUPERVISOR_MODE (1<<1) +/* + * The SVM_FLAG_GUEST_MODE flag is used when a PASID bind is for guest + * processes. Compared to the host bind, the primary differences are: + * 1. mm life cycle management + * 2. fault reporting + */ +#define SVM_FLAG_GUEST_MODE (1<<2) +/* + * The SVM_FLAG_GUEST_PASID flag is used when a guest has its own PASID space, + * which requires guest and host PASID translation at both directions. + */ +#define SVM_FLAG_GUEST_PASID (1<<3) #ifdef CONFIG_INTEL_IOMMU_SVM -- Gitee From 6960c74df5ce38af4edc73e5fb3925334f63f1eb Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:48 +0800 Subject: [PATCH 0267/2161] iommu/vt-d: Support flushing more translation cache types commit 61a06a16e36d830f7811fbf931668d87197d95b7 upstream. When Shared Virtual Memory is exposed to a guest via vIOMMU, scalable IOTLB invalidation may be passed down from outside IOMMU subsystems. This patch adds invalidation functions that can be used for additional translation cache types. 
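As context for the address/size encoding used by the qi_flush_dev_iotlb_pasid() helper added below: for a multi-page range the S bit is set and bit (12 + size_order - 1) of the address is cleared to mark the range. A stand-alone sketch of that folding (constants copied from the driver; encode_eiotlb_addr() is an illustrative name, not a driver function):

#include <stdint.h>
#include <stdio.h>

#define VTD_PAGE_SHIFT		12
#define QI_DEV_EIOTLB_SIZE	(1ULL << 11)	/* the "S" bit of the descriptor */

/* Mirror of the qw1 address/size encoding done in qi_flush_dev_iotlb_pasid():
 * size_order 0 flushes a single 4 KiB page with S clear; size_order > 0 sets S
 * and clears bit (VTD_PAGE_SHIFT + size_order - 1) of the address. */
static uint64_t encode_eiotlb_addr(uint64_t addr, unsigned int size_order)
{
	uint64_t qw1 = 0;
	unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1);

	qw1 |= addr & ~mask;
	if (size_order)
		qw1 |= QI_DEV_EIOTLB_SIZE;

	return qw1;
}

int main(void)
{
	/* Same page-aligned address as a 4 KiB (order 0) and an 8 KiB (order 1) flush. */
	printf("%llx\n", (unsigned long long)encode_eiotlb_addr(0x1234000ULL, 0));
	printf("%llx\n", (unsigned long long)encode_eiotlb_addr(0x1234000ULL, 1));
	return 0;
}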
Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200516062101.29541-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dmar.c | 39 +++++++++++++++++++++++++++++++++++++ drivers/iommu/intel-pasid.c | 3 ++- include/linux/intel-iommu.h | 21 ++++++++++++++++---- 3 files changed, 58 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 0645f44227bb..2f4b16a8822f 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1439,6 +1439,45 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, qi_submit_sync(&desc, iommu); } +/* PASID-based device IOTLB Invalidate */ +void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u32 pasid, u16 qdep, u64 addr, + unsigned int size_order, u64 granu) +{ + unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1); + struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; + + desc.qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) | + QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE | + QI_DEV_IOTLB_PFSID(pfsid); + desc.qw1 = QI_DEV_EIOTLB_GLOB(granu); + + /* + * If S bit is 0, we only flush a single page. If S bit is set, + * The least significant zero bit indicates the invalidation address + * range. VT-d spec 6.5.2.6. + * e.g. address bit 12[0] indicates 8KB, 13[0] indicates 16KB. + * size order = 0 is PAGE_SIZE 4KB + * Max Invs Pending (MIP) is set to 0 for now until we have DIT in + * ECAP. + */ + desc.qw1 |= addr & ~mask; + if (size_order) + desc.qw1 |= QI_DEV_EIOTLB_SIZE; + + qi_submit_sync(&desc, iommu); +} + +void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, + u64 granu, int pasid) +{ + struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; + + desc.qw0 = QI_PC_PASID(pasid) | QI_PC_DID(did) | + QI_PC_GRAN(granu) | QI_PC_TYPE; + qi_submit_sync(&desc, iommu); +} + /* * Disable Queued Invalidation interface. 
*/ diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index c7fa1b79eaf7..5d9d9ff49334 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -375,7 +375,8 @@ pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, { struct qi_desc desc; - desc.qw0 = QI_PC_DID(did) | QI_PC_PASID_SEL | QI_PC_PASID(pasid); + desc.qw0 = QI_PC_DID(did) | QI_PC_GRAN(QI_PC_PASID_SEL) | + QI_PC_PASID(pasid) | QI_PC_TYPE; desc.qw1 = 0; desc.qw2 = 0; desc.qw3 = 0; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 33257afcff99..c815821a2768 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -334,7 +334,7 @@ enum { #define QI_IOTLB_GRAN(gran) (((u64)gran) >> (DMA_TLB_FLUSH_GRANU_OFFSET-4)) #define QI_IOTLB_ADDR(addr) (((u64)addr) & VTD_PAGE_MASK) #define QI_IOTLB_IH(ih) (((u64)ih) << 6) -#define QI_IOTLB_AM(am) (((u8)am)) +#define QI_IOTLB_AM(am) (((u8)am) & 0x3f) #define QI_CC_FM(fm) (((u64)fm) << 48) #define QI_CC_SID(sid) (((u64)sid) << 32) @@ -353,16 +353,21 @@ enum { #define QI_PC_DID(did) (((u64)did) << 16) #define QI_PC_GRAN(gran) (((u64)gran) << 4) -#define QI_PC_ALL_PASIDS (QI_PC_TYPE | QI_PC_GRAN(0)) -#define QI_PC_PASID_SEL (QI_PC_TYPE | QI_PC_GRAN(1)) +/* PASID cache invalidation granu */ +#define QI_PC_ALL_PASIDS 0 +#define QI_PC_PASID_SEL 1 #define QI_EIOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) #define QI_EIOTLB_IH(ih) (((u64)ih) << 6) -#define QI_EIOTLB_AM(am) (((u64)am)) +#define QI_EIOTLB_AM(am) (((u64)am) & 0x3f) #define QI_EIOTLB_PASID(pasid) (((u64)pasid) << 32) #define QI_EIOTLB_DID(did) (((u64)did) << 16) #define QI_EIOTLB_GRAN(gran) (((u64)gran) << 4) +/* QI Dev-IOTLB inv granu */ +#define QI_DEV_IOTLB_GRAN_ALL 1 +#define QI_DEV_IOTLB_GRAN_PASID_SEL 0 + #define QI_DEV_EIOTLB_ADDR(a) ((u64)(a) & VTD_PAGE_MASK) #define QI_DEV_EIOTLB_SIZE (((u64)1) << 11) #define QI_DEV_EIOTLB_GLOB(g) ((u64)(g) & 0x1) @@ -682,8 +687,16 @@ extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type); extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, u16 qdep, u64 addr, unsigned mask); + void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, unsigned long npages, bool ih); + +void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u32 pasid, u16 qdep, u64 addr, + unsigned int size_order, u64 granu); +void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, + int pasid); + extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); extern int dmar_ir_support(void); -- Gitee From 677c88feed0c93075c30fa2c6a2d5985f8e415da Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:49 +0800 Subject: [PATCH 0268/2161] iommu/vt-d: Add svm/sva invalidate function commit 6ee1b77ba3ac0a79fc6f3273f3b27b13240a355e upstream. When Shared Virtual Address (SVA) is enabled for a guest OS via vIOMMU, we need to provide invalidation support at IOMMU API and driver level. This patch adds Intel VT-d specific function to implement iommu passdown invalidate API for shared virtual address. The use case is for supporting caching structure invalidation of assigned SVM capable devices. Emulated IOMMU exposes queue invalidation capability and passes down all descriptors from the guest to the physical IOMMU. The assumption is that guest to host device ID mapping should be resolved prior to calling IOMMU driver. 
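A note on the size conversion this patch introduces further down (to_vtd_size()): the generic cache-invalidate API describes a range as granule size in bytes times granule count, while VT-d descriptors want a power-of-two order of 4 KiB pages (0 for 4 KiB, 9 for 2 MiB). A stand-alone model, with order_base_2() re-implemented naively for illustration:

#include <stdint.h>
#include <stdio.h>

#define VTD_PAGE_SHIFT 12

/* Naive stand-in for the kernel's order_base_2(): smallest order such that
 * (1 << order) >= n, assuming n >= 1. */
static unsigned int order_base_2_u64(uint64_t n)
{
	unsigned int order = 0;

	while ((1ULL << order) < n)
		order++;
	return order;
}

/* Model of to_vtd_size(): bytes-per-granule times granule count, converted
 * to a 2^order count of 4 KiB pages. */
static uint64_t to_vtd_size(uint64_t granu_size, uint64_t nr_granules)
{
	uint64_t nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;

	return order_base_2_u64(nr_pages);
}

int main(void)
{
	/* 512 x 4 KiB granules == 2 MiB -> size order 9. */
	printf("%llu\n", (unsigned long long)to_vtd_size(4096, 512));
	return 0;
}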
Based on the device handle, host IOMMU driver can replace certain fields before submit to the invalidation queue. Signed-off-by: Liu Yi L Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200516062101.29541-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 171 ++++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 96857b3c47b8..ca7593174c05 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5271,6 +5271,176 @@ static void intel_iommu_aux_detach_device(struct iommu_domain *domain, aux_domain_remove_dev(to_dmar_domain(domain), dev); } +/* + * 2D array for converting and sanitizing IOMMU generic TLB granularity to + * VT-d granularity. Invalidation is typically included in the unmap operation + * as a result of DMA or VFIO unmap. However, for assigned devices guest + * owns the first level page tables. Invalidations of translation caches in the + * guest are trapped and passed down to the host. + * + * vIOMMU in the guest will only expose first level page tables, therefore + * we do not support IOTLB granularity for request without PASID (second level). + * + * For example, to find the VT-d granularity encoding for IOTLB + * type and page selective granularity within PASID: + * X: indexed by iommu cache type + * Y: indexed by enum iommu_inv_granularity + * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR] + */ + +const static int +inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = { + /* + * PASID based IOTLB invalidation: PASID selective (per PASID), + * page selective (address granularity) + */ + {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID}, + /* PASID based dev TLBs */ + {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL}, + /* PASID cache */ + {-EINVAL, -EINVAL, -EINVAL} +}; + +static inline int to_vtd_granularity(int type, int granu) +{ + return inv_type_granu_table[type][granu]; +} + +static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules) +{ + u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT; + + /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc. + * IOMMU cache invalidate API passes granu_size in bytes, and number of + * granu size in contiguous memory. 
+ */ + return order_base_2(nr_pages); +} + +#ifdef CONFIG_INTEL_IOMMU_SVM +static int +intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, + struct iommu_cache_invalidate_info *inv_info) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct device_domain_info *info; + struct intel_iommu *iommu; + unsigned long flags; + int cache_type; + u8 bus, devfn; + u16 did, sid; + int ret = 0; + u64 size = 0; + + if (!inv_info || !dmar_domain || + inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1) + return -EINVAL; + + if (!dev || !dev_is_pci(dev)) + return -ENODEV; + + iommu = device_to_iommu(dev, &bus, &devfn); + if (!iommu) + return -ENODEV; + + if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE)) + return -EINVAL; + + spin_lock_irqsave(&device_domain_lock, flags); + spin_lock(&iommu->lock); + info = dev->archdata.iommu; + if (!info) { + ret = -EINVAL; + goto out_unlock; + } + did = dmar_domain->iommu_did[iommu->seq_id]; + sid = PCI_DEVID(bus, devfn); + + /* Size is only valid in address selective invalidation */ + if (inv_info->granularity != IOMMU_INV_GRANU_PASID) + size = to_vtd_size(inv_info->addr_info.granule_size, + inv_info->addr_info.nb_granules); + + for_each_set_bit(cache_type, + (unsigned long *)&inv_info->cache, + IOMMU_CACHE_INV_TYPE_NR) { + int granu = 0; + u64 pasid = 0; + + granu = to_vtd_granularity(cache_type, inv_info->granularity); + if (granu == -EINVAL) { + pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n", + cache_type, inv_info->granularity); + break; + } + + /* + * PASID is stored in different locations based on the + * granularity. + */ + if (inv_info->granularity == IOMMU_INV_GRANU_PASID && + (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) + pasid = inv_info->pasid_info.pasid; + else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && + (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) + pasid = inv_info->addr_info.pasid; + + switch (BIT(cache_type)) { + case IOMMU_CACHE_INV_TYPE_IOTLB: + if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && + size && + (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { + pr_err_ratelimited("Address out of range, 0x%llx, size order %llu\n", + inv_info->addr_info.addr, size); + ret = -ERANGE; + goto out_unlock; + } + + /* + * If granu is PASID-selective, address is ignored. + * We use npages = -1 to indicate that. + */ + qi_flush_piotlb(iommu, did, pasid, + mm_to_dma_pfn(inv_info->addr_info.addr), + (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size, + inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); + + /* + * Always flush device IOTLB if ATS is enabled. vIOMMU + * in the guest may assume IOTLB flush is inclusive, + * which is more efficient. 
+ */ + if (info->ats_enabled) + qi_flush_dev_iotlb_pasid(iommu, sid, + info->pfsid, pasid, + info->ats_qdep, + inv_info->addr_info.addr, + size, granu); + break; + case IOMMU_CACHE_INV_TYPE_DEV_IOTLB: + if (info->ats_enabled) + qi_flush_dev_iotlb_pasid(iommu, sid, + info->pfsid, pasid, + info->ats_qdep, + inv_info->addr_info.addr, + size, granu); + else + pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n"); + break; + default: + dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n", + cache_type); + ret = -EINVAL; + } + } +out_unlock: + spin_unlock(&iommu->lock); + spin_unlock_irqrestore(&device_domain_lock, flags); + + return ret; +} +#endif + static int intel_iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t hpa, size_t size, int iommu_prot, gfp_t gfp) @@ -5827,6 +5997,7 @@ const struct iommu_ops intel_iommu_ops = { .def_domain_type = device_def_domain_type, .pgsize_bitmap = INTEL_IOMMU_PGSIZES, #ifdef CONFIG_INTEL_IOMMU_SVM + .cache_invalidate = intel_iommu_sva_invalidate, .sva_bind_gpasid = intel_svm_bind_gpasid, .sva_unbind_gpasid = intel_svm_unbind_gpasid, #endif -- Gitee From 31d2f616857d07ed0a98518c30d42722cf552776 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:50 +0800 Subject: [PATCH 0269/2161] iommu/vt-d: Enlightened PASID allocation commit 24f27d32ab6b71dedcbbeeab8f9bdc143b539ac0 upstream. Enabling IOMMU in a guest requires communication with the host driver for certain aspects. Use of PASID ID to enable Shared Virtual Addressing (SVA) requires managing PASID's in the host. VT-d 3.0 spec provides a Virtual Command Register (VCMD) to facilitate this. Writes to this register in the guest are trapped by vIOMMU which proxies the call to the host driver. This virtual command interface consists of a capability register, a virtual command register, and a virtual response register. Refer to section 10.4.42, 10.4.43, 10.4.44 for more information. This patch adds the enlightened PASID allocation/free interfaces via the virtual command interface. 
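As context for the virtual command handshake described here: the guest writes a command to VCMD, polls the virtual response register until the In Progress bit clears, then decodes a status code and, on success, the allocated PASID. A user-space model of that final decode step, using the response-register field macros this patch adds (decode_vcmd_alloc() is an illustrative name, not a driver function):

#include <stdint.h>
#include <stdio.h>

/* Field encodings of the Virtual Command Response Register, copied from the
 * macros added to intel-pasid.h by this patch. */
#define VCMD_VRSP_IP			0x1ULL			/* command still In Progress */
#define VCMD_VRSP_SC(e)			(((e) >> 1) & 0x3)	/* status code */
#define VCMD_VRSP_SC_SUCCESS		0
#define VCMD_VRSP_SC_NO_PASID_AVAIL	1
#define VCMD_VRSP_RESULT_PASID(e)	(((e) >> 8) & 0xfffff)

/* Decode a (simulated) response the way vcmd_alloc_pasid() does once its
 * poll on the IP bit has completed. */
static int decode_vcmd_alloc(uint64_t vcrsp, uint32_t *pasid)
{
	if (vcrsp & VCMD_VRSP_IP)
		return -1;		/* still in progress, keep polling */

	switch (VCMD_VRSP_SC(vcrsp)) {
	case VCMD_VRSP_SC_SUCCESS:
		*pasid = VCMD_VRSP_RESULT_PASID(vcrsp);
		return 0;
	case VCMD_VRSP_SC_NO_PASID_AVAIL:
		return -28;		/* the driver returns -ENOSPC here */
	default:
		return -19;		/* unexpected status code: -ENODEV */
	}
}

int main(void)
{
	uint32_t pasid = 0;
	/* Simulated successful response: IP clear, SC = 0, PASID 0x42 in bits 27:8. */
	uint64_t vcrsp = (uint64_t)0x42 << 8;

	if (!decode_vcmd_alloc(vcrsp, &pasid))
		printf("allocated PASID %#x\n", pasid);
	return 0;
}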
Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Signed-off-by: Jacob Pan Reviewed-by: Eric Auger Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200516062101.29541-8-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-pasid.c | 57 +++++++++++++++++++++++++++++++++++++ drivers/iommu/intel-pasid.h | 13 ++++++++- include/linux/intel-iommu.h | 1 + 3 files changed, 70 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index 5d9d9ff49334..ea8f4ef4e295 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -27,6 +27,63 @@ static DEFINE_SPINLOCK(pasid_lock); u32 intel_pasid_max_id = PASID_MAX; +int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid) +{ + unsigned long flags; + u8 status_code; + int ret = 0; + u64 res; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writeq(iommu->reg + DMAR_VCMD_REG, VCMD_CMD_ALLOC); + IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, + !(res & VCMD_VRSP_IP), res); + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + status_code = VCMD_VRSP_SC(res); + switch (status_code) { + case VCMD_VRSP_SC_SUCCESS: + *pasid = VCMD_VRSP_RESULT_PASID(res); + break; + case VCMD_VRSP_SC_NO_PASID_AVAIL: + pr_info("IOMMU: %s: No PASID available\n", iommu->name); + ret = -ENOSPC; + break; + default: + ret = -ENODEV; + pr_warn("IOMMU: %s: Unexpected error code %d\n", + iommu->name, status_code); + } + + return ret; +} + +void vcmd_free_pasid(struct intel_iommu *iommu, unsigned int pasid) +{ + unsigned long flags; + u8 status_code; + u64 res; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writeq(iommu->reg + DMAR_VCMD_REG, + VCMD_CMD_OPERAND(pasid) | VCMD_CMD_FREE); + IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, + !(res & VCMD_VRSP_IP), res); + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + status_code = VCMD_VRSP_SC(res); + switch (status_code) { + case VCMD_VRSP_SC_SUCCESS: + break; + case VCMD_VRSP_SC_INVALID_PASID: + pr_info("IOMMU: %s: Invalid PASID\n", iommu->name); + break; + default: + pr_warn("IOMMU: %s: Unexpected error code %d\n", + iommu->name, status_code); + } +} + /* * Per device pasid table management: */ diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h index ccd50c2ae75c..a41b09b3ffde 100644 --- a/drivers/iommu/intel-pasid.h +++ b/drivers/iommu/intel-pasid.h @@ -23,6 +23,16 @@ #define is_pasid_enabled(entry) (((entry)->lo >> 3) & 0x1) #define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7)) +/* Virtual command interface for enlightened pasid management. */ +#define VCMD_CMD_ALLOC 0x1 +#define VCMD_CMD_FREE 0x2 +#define VCMD_VRSP_IP 0x1 +#define VCMD_VRSP_SC(e) (((e) >> 1) & 0x3) +#define VCMD_VRSP_SC_SUCCESS 0 +#define VCMD_VRSP_SC_NO_PASID_AVAIL 1 +#define VCMD_VRSP_SC_INVALID_PASID 1 +#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 8) & 0xfffff) +#define VCMD_CMD_OPERAND(e) ((e) << 8) /* * Domain ID reserved for pasid entries programmed for first-level * only and pass-through transfer modes. 
@@ -111,5 +121,6 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct dmar_domain *domain, int addr_width); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, int pasid); - +int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid); +void vcmd_free_pasid(struct intel_iommu *iommu, unsigned int pasid); #endif /* __INTEL_PASID_H */ diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index c815821a2768..4c963cc867cb 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -169,6 +169,7 @@ #define ecap_smpwc(e) (((e) >> 48) & 0x1) #define ecap_flts(e) (((e) >> 47) & 0x1) #define ecap_slts(e) (((e) >> 46) & 0x1) +#define ecap_vcs(e) (((e) >> 44) & 0x1) #define ecap_smts(e) (((e) >> 43) & 0x1) #define ecap_dit(e) ((e >> 41) & 0x1) #define ecap_pasid(e) ((e >> 40) & 0x1) -- Gitee From 094c83ec4636d3bb2a359f4da4978aca3f554e2c Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:51 +0800 Subject: [PATCH 0270/2161] iommu/vt-d: Add custom allocator for IOASID commit 3375303e82877552f3b2b42309e8233fe715fd9f upstream. When VT-d driver runs in the guest, PASID allocation must be performed via virtual command interface. This patch registers a custom IOASID allocator which takes precedence over the default XArray based allocator. The resulting IOASID allocation will always come from the host. This ensures that PASID namespace is system- wide. Virtual command registers are used in the guest only, to prevent vmexit cost, we cache the capability and store it during initialization. Signed-off-by: Liu, Yi L Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200516062101.29541-9-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dmar.c | 1 + drivers/iommu/intel-iommu.c | 85 +++++++++++++++++++++++++++++++++++++ include/linux/intel-iommu.h | 7 +++ 3 files changed, 93 insertions(+) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 2f4b16a8822f..0a6870f9c912 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -964,6 +964,7 @@ static int map_iommu(struct intel_iommu *iommu, u64 phys_addr) warn_invalid_dmar(phys_addr, " returns all ones"); goto unmap; } + iommu->vccap = dmar_readq(iommu->reg + DMAR_VCCAP_REG); /* the registers might be more than one page */ map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap), diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index ca7593174c05..f92bf5cbf4a4 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1732,6 +1732,9 @@ static void free_dmar_iommu(struct intel_iommu *iommu) if (ecap_prs(iommu->ecap)) intel_svm_finish_prq(iommu); } + if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap)) + ioasid_unregister_allocator(&iommu->pasid_allocator); + #endif } @@ -3059,6 +3062,85 @@ static int copy_translation_tables(struct intel_iommu *iommu) return ret; } +#ifdef CONFIG_INTEL_IOMMU_SVM +static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) +{ + struct intel_iommu *iommu = data; + ioasid_t ioasid; + + if (!iommu) + return INVALID_IOASID; + /* + * VT-d virtual command interface always uses the full 20 bit + * PASID range. Host can partition guest PASID range based on + * policies but it is out of guest's control. 
+ */ + if (min < PASID_MIN || max > intel_pasid_max_id) + return INVALID_IOASID; + + if (vcmd_alloc_pasid(iommu, &ioasid)) + return INVALID_IOASID; + + return ioasid; +} + +static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) +{ + struct intel_iommu *iommu = data; + + if (!iommu) + return; + /* + * Sanity check the ioasid owner is done at upper layer, e.g. VFIO + * We can only free the PASID when all the devices are unbound. + */ + if (ioasid_find(NULL, ioasid, NULL)) { + pr_alert("Cannot free active IOASID %d\n", ioasid); + return; + } + vcmd_free_pasid(iommu, ioasid); +} + +static void register_pasid_allocator(struct intel_iommu *iommu) +{ + /* + * If we are running in the host, no need for custom allocator + * in that PASIDs are allocated from the host system-wide. + */ + if (!cap_caching_mode(iommu->cap)) + return; + + if (!sm_supported(iommu)) { + pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); + return; + } + + /* + * Register a custom PASID allocator if we are running in a guest, + * guest PASID must be obtained via virtual command interface. + * There can be multiple vIOMMUs in each guest but only one allocator + * is active. All vIOMMU allocators will eventually be calling the same + * host allocator. + */ + if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap)) + return; + + pr_info("Register custom PASID allocator\n"); + iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; + iommu->pasid_allocator.free = intel_vcmd_ioasid_free; + iommu->pasid_allocator.pdata = (void *)iommu; + if (ioasid_register_allocator(&iommu->pasid_allocator)) { + pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); + /* + * Disable scalable mode on this IOMMU if there + * is no custom allocator. Mixing SM capable vIOMMU + * and non-SM vIOMMU are not supported. + */ + intel_iommu_sm = 0; + } +} +#endif + static int __init init_dmars(void) { struct dmar_drhd_unit *drhd; @@ -3181,6 +3263,9 @@ static int __init init_dmars(void) */ for_each_active_iommu(iommu, drhd) { iommu_flush_write_buffer(iommu); +#ifdef CONFIG_INTEL_IOMMU_SVM + register_pasid_allocator(iommu); +#endif iommu_set_root_entry(iommu); iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 4c963cc867cb..a7568d08378e 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -195,6 +196,9 @@ #define ecap_max_handle_mask(e) ((e >> 20) & 0xf) #define ecap_sc_support(e) ((e >> 7) & 0x1) /* Snooping Control */ +/* Virtual command interface capability */ +#define vccap_pasid(v) (((v) & DMA_VCS_PAS)) /* PASID allocation */ + /* IOTLB_REG */ #define DMA_TLB_FLUSH_GRANU_OFFSET 60 #define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60) @@ -288,6 +292,7 @@ /* PRS_REG */ #define DMA_PRS_PPR ((u32)1) +#define DMA_VCS_PAS ((u64)1) #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \ do { \ @@ -555,6 +560,7 @@ struct intel_iommu { u64 reg_size; /* size of hw register set */ u64 cap; u64 ecap; + u64 vccap; u32 gcmd; /* Holds TE, EAFL. 
Don't need SRTP, SFL, WBF */ raw_spinlock_t register_lock; /* protect register handling */ int seq_id; /* sequence id of the iommu */ @@ -575,6 +581,7 @@ struct intel_iommu { #ifdef CONFIG_INTEL_IOMMU_SVM struct page_req_dsc *prq; unsigned char prq_name[16]; /* Name for PRQ interrupt */ + struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */ #endif struct q_inval *qi; /* Queued invalidation info */ u32 *iommu_state; /* Store iommu states between suspend and resume.*/ -- Gitee From b80914d036940ac62aba56c4bb209776a272d0cf Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:52 +0800 Subject: [PATCH 0271/2161] iommu/vt-d: Add get_domain_info() helper commit e85bb99b79ca5ad2681612a7bb22f94cc2c71866 upstream. Add a get_domain_info() helper to retrieve the valid per-device iommu private data. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-10-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 40 +++++++++++++++++++++++++------------ drivers/iommu/intel-pasid.c | 12 +++++------ drivers/iommu/intel-svm.c | 2 +- include/linux/intel-iommu.h | 1 + 4 files changed, 35 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index f92bf5cbf4a4..b4f5929d99cb 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -365,6 +365,21 @@ EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1)) #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2)) +struct device_domain_info *get_domain_info(struct device *dev) +{ + struct device_domain_info *info; + + if (!dev) + return NULL; + + info = dev->archdata.iommu; + if (unlikely(info == DUMMY_DEVICE_DOMAIN_INFO || + info == DEFER_DEVICE_DOMAIN_INFO)) + return NULL; + + return info; +} + DEFINE_SPINLOCK(device_domain_lock); static LIST_HEAD(device_domain_list); @@ -2433,7 +2448,7 @@ struct dmar_domain *find_domain(struct device *dev) return NULL; /* No lock here, assumes no domain exit in normal case */ - info = dev->archdata.iommu; + info = get_domain_info(dev); if (likely(info)) return info->domain; @@ -5041,9 +5056,8 @@ static void dmar_remove_one_dev_info(struct device *dev) unsigned long flags; spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; - if (info && info != DEFER_DEVICE_DOMAIN_INFO - && info != DUMMY_DEVICE_DOMAIN_INFO) + info = get_domain_info(dev); + if (info) __dmar_remove_one_dev_info(info); spin_unlock_irqrestore(&device_domain_lock, flags); } @@ -5133,7 +5147,7 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) static inline bool is_aux_domain(struct device *dev, struct iommu_domain *domain) { - struct device_domain_info *info = dev->archdata.iommu; + struct device_domain_info *info = get_domain_info(dev); return info && info->auxd_enabled && domain->type == IOMMU_DOMAIN_UNMANAGED; @@ -5142,7 +5156,7 @@ is_aux_domain(struct device *dev, struct iommu_domain *domain) static void auxiliary_link_device(struct dmar_domain *domain, struct device *dev) { - struct device_domain_info *info = dev->archdata.iommu; + struct device_domain_info *info = get_domain_info(dev); assert_spin_locked(&device_domain_lock); if (WARN_ON(!info)) @@ -5155,7 +5169,7 @@ static void auxiliary_link_device(struct dmar_domain *domain, static void auxiliary_unlink_device(struct dmar_domain *domain, struct device *dev) { - struct device_domain_info 
*info = dev->archdata.iommu; + struct device_domain_info *info = get_domain_info(dev); assert_spin_locked(&device_domain_lock); if (WARN_ON(!info)) @@ -5243,7 +5257,7 @@ static void aux_domain_remove_dev(struct dmar_domain *domain, return; spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; + info = get_domain_info(dev); iommu = info->iommu; auxiliary_unlink_device(domain, dev); @@ -5433,7 +5447,7 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, spin_lock_irqsave(&device_domain_lock, flags); spin_lock(&iommu->lock); - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info) { ret = -EINVAL; goto out_unlock; @@ -5797,7 +5811,7 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) spin_lock(&iommu->lock); ret = -EINVAL; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !info->pasid_supported) goto out; @@ -5893,7 +5907,7 @@ static int intel_iommu_enable_auxd(struct device *dev) return -ENODEV; spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; + info = get_domain_info(dev); info->auxd_enabled = 1; spin_unlock_irqrestore(&device_domain_lock, flags); @@ -5906,7 +5920,7 @@ static int intel_iommu_disable_auxd(struct device *dev) unsigned long flags; spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!WARN_ON(!info)) info->auxd_enabled = 0; spin_unlock_irqrestore(&device_domain_lock, flags); @@ -5983,7 +5997,7 @@ intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) static bool intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat) { - struct device_domain_info *info = dev->archdata.iommu; + struct device_domain_info *info = get_domain_info(dev); if (feat == IOMMU_DEV_FEAT_AUX) return scalable_mode_support() && info && info->auxd_enabled; diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index ea8f4ef4e295..c46a068142b9 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -151,7 +151,7 @@ int intel_pasid_alloc_table(struct device *dev) int size; might_sleep(); - info = dev->archdata.iommu; + info = get_domain_info(dev); if (WARN_ON(!info || !dev_is_pci(dev) || info->pasid_table)) return -EINVAL; @@ -198,7 +198,7 @@ void intel_pasid_free_table(struct device *dev) struct pasid_entry *table; int i, max_pde; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !dev_is_pci(dev) || !info->pasid_table) return; @@ -224,7 +224,7 @@ struct pasid_table *intel_pasid_get_table(struct device *dev) { struct device_domain_info *info; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info) return NULL; @@ -235,7 +235,7 @@ int intel_pasid_get_dev_max_id(struct device *dev) { struct device_domain_info *info; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !info->pasid_table) return 0; @@ -256,7 +256,7 @@ struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid) return NULL; dir = pasid_table->table; - info = dev->archdata.iommu; + info = get_domain_info(dev); dir_index = pasid >> PASID_PDE_SHIFT; index = pasid & PASID_PTE_MASK; @@ -462,7 +462,7 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu, struct device_domain_info *info; u16 sid, qdep, pfsid; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !info->ats_enabled) return; diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 
953963783f46..e3c752dd7a1b 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -521,7 +521,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ goto out; } - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !info->pasid_supported) { kfree(sdev); goto out; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index a7568d08378e..e7ef3b866cb0 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -717,6 +717,7 @@ int for_each_device_domain(int (*fn)(struct device_domain_info *info, void iommu_flush_write_buffer(struct intel_iommu *iommu); int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev); struct dmar_domain *find_domain(struct device *dev); +struct device_domain_info *get_domain_info(struct device *dev); #ifdef CONFIG_INTEL_IOMMU_SVM extern void intel_svm_check(struct intel_iommu *iommu); -- Gitee From 2fa056fb052d3b1516579926ef5a0fa75dcf8fcb Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:53 +0800 Subject: [PATCH 0272/2161] iommu/vt-d: Report SVA feature with generic flag commit 76fdd6c59532630559a2c63e8645a7033f9623c4 upstream. Query Shared Virtual Address/Memory capability is a generic feature. SVA feature check is the required first step before calling iommu_sva_bind_device(). VT-d checks SVA feature enabling at per IOMMU level during this step, SVA bind device will check and enable PCI ATS, PRS, and PASID capabilities at device level. This patch reports Intel SVM as SVA feature such that generic code (e.g. Uacce [1]) can use it. [1] https://lkml.org/lkml/2020/1/15/604 Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-11-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index b4f5929d99cb..e6c84ca0df1f 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5973,6 +5973,14 @@ intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) return !!siov_find_pci_dvsec(to_pci_dev(dev)); } + if (feat == IOMMU_DEV_FEAT_SVA) { + struct device_domain_info *info = get_domain_info(dev); + + return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) && + info->pasid_supported && info->pri_supported && + info->ats_supported; + } + return false; } @@ -5982,6 +5990,16 @@ intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) if (feat == IOMMU_DEV_FEAT_AUX) return intel_iommu_enable_auxd(dev); + if (feat == IOMMU_DEV_FEAT_SVA) { + struct device_domain_info *info = get_domain_info(dev); + + if (!info) + return -EINVAL; + + if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) + return 0; + } + return -ENODEV; } -- Gitee From 6b673a3ee71935c8a4b1b1c491776bfa156cb2a9 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:54 +0800 Subject: [PATCH 0273/2161] iommu/vt-d: Replace intel SVM APIs with generic SVA APIs commit 064a57d7ddfc46ada02b477b91c478001b03bfa3 upstream. 
This patch is an initial step to replace Intel SVM code with the following IOMMU SVA ops: intel_svm_bind_mm() => iommu_sva_bind_device() intel_svm_unbind_mm() => iommu_sva_unbind_device() intel_svm_is_pasid_valid() => iommu_sva_get_pasid() The features below will continue to work but are not included in this patch in that they are handled mostly within the IOMMU subsystem. - IO page fault - mmu notifier Consolidation of the above will come after merging generic IOMMU sva code[1]. There should not be any changes needed for SVA users such as accelerator device drivers during this time. [1] http://jpbrucker.net/sva/ Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-12-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 3 + drivers/iommu/intel-svm.c | 124 ++++++++++++++++++++---------------- include/linux/intel-iommu.h | 6 ++ include/linux/intel-svm.h | 86 ------------------------- 4 files changed, 78 insertions(+), 141 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index e6c84ca0df1f..dcd87e911c68 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -6117,6 +6117,9 @@ const struct iommu_ops intel_iommu_ops = { .cache_invalidate = intel_iommu_sva_invalidate, .sva_bind_gpasid = intel_svm_bind_gpasid, .sva_unbind_gpasid = intel_svm_unbind_gpasid, + .sva_bind = intel_svm_bind, + .sva_unbind = intel_svm_unbind, + .sva_get_pasid = intel_svm_get_pasid, #endif }; diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index e3c752dd7a1b..dfabe6610b32 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -444,13 +444,15 @@ int intel_svm_unbind_gpasid(struct device *dev, int pasid) return ret; } -int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops) +/* Caller must hold pasid_mutex, mm reference */ +static int +intel_svm_bind_mm(struct device *dev, int flags, struct svm_dev_ops *ops, + struct mm_struct *mm, struct intel_svm_dev **sd) { struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); struct device_domain_info *info; struct intel_svm_dev *sdev; struct intel_svm *svm = NULL; - struct mm_struct *mm = NULL; int pasid_max; int ret; @@ -467,16 +469,15 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ } else pasid_max = 1 << 20; + /* Bind supervisor PASID shuld have mm = NULL */ if (flags & SVM_FLAG_SUPERVISOR_MODE) { - if (!ecap_srs(iommu->ecap)) + if (!ecap_srs(iommu->ecap) || mm) { + pr_err("Supervisor PASID with user provided mm.\n"); return -EINVAL; - } else if (pasid) { - mm = get_task_mm(current); - BUG_ON(!mm); + } } - mutex_lock(&pasid_mutex); - if (pasid && !(flags & SVM_FLAG_PRIVATE_PASID)) { + if (!(flags & SVM_FLAG_PRIVATE_PASID)) { struct intel_svm *t; list_for_each_entry(t, &global_svm_list, list) { @@ -514,9 +515,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ sdev->dev = dev; ret = intel_iommu_enable_pasid(iommu, dev); - if (ret || !pasid) { - /* If they don't actually want to assign a PASID, this is - * just an enabling check/preparation. 
*/ + if (ret) { kfree(sdev); goto out; } @@ -615,18 +614,17 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ } } list_add_rcu(&sdev->list, &svm->devs); - - success: - *pasid = svm->pasid; +success: + sdev->pasid = svm->pasid; + sdev->sva.dev = dev; + if (sd) + *sd = sdev; ret = 0; out: - mutex_unlock(&pasid_mutex); - if (mm) - mmput(mm); return ret; } -EXPORT_SYMBOL_GPL(intel_svm_bind_mm); +/* Caller must hold pasid_mutex */ int intel_svm_unbind_mm(struct device *dev, int pasid) { struct intel_svm_dev *sdev; @@ -634,7 +632,6 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) struct intel_svm *svm; int ret = -EINVAL; - mutex_lock(&pasid_mutex); iommu = intel_svm_device_to_iommu(dev); if (!iommu) goto out; @@ -680,45 +677,9 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) break; } out: - mutex_unlock(&pasid_mutex); return ret; } -EXPORT_SYMBOL_GPL(intel_svm_unbind_mm); - -int intel_svm_is_pasid_valid(struct device *dev, int pasid) -{ - struct intel_iommu *iommu; - struct intel_svm *svm; - int ret = -EINVAL; - - mutex_lock(&pasid_mutex); - iommu = intel_svm_device_to_iommu(dev); - if (!iommu) - goto out; - - svm = ioasid_find(NULL, pasid, NULL); - if (!svm) - goto out; - - if (IS_ERR(svm)) { - ret = PTR_ERR(svm); - goto out; - } - /* init_mm is used in this case */ - if (!svm->mm) - ret = 1; - else if (atomic_read(&svm->mm->mm_users) > 0) - ret = 1; - else - ret = 0; - - out: - mutex_unlock(&pasid_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(intel_svm_is_pasid_valid); /* Page request queue descriptor */ struct page_req_dsc { @@ -912,3 +873,56 @@ static irqreturn_t prq_event_thread(int irq, void *d) return IRQ_RETVAL(handled); } + +#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva) +struct iommu_sva * +intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) +{ + struct iommu_sva *sva = ERR_PTR(-EINVAL); + struct intel_svm_dev *sdev = NULL; + int flags = 0; + int ret; + + /* + * TODO: Consolidate with generic iommu-sva bind after it is merged. + * It will require shared SVM data structures, i.e. combine io_mm + * and intel_svm etc. 
+ */ + if (drvdata) + flags = *(int *)drvdata; + mutex_lock(&pasid_mutex); + ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev); + if (ret) + sva = ERR_PTR(ret); + else if (sdev) + sva = &sdev->sva; + else + WARN(!sdev, "SVM bind succeeded with no sdev!\n"); + + mutex_unlock(&pasid_mutex); + + return sva; +} + +void intel_svm_unbind(struct iommu_sva *sva) +{ + struct intel_svm_dev *sdev; + + mutex_lock(&pasid_mutex); + sdev = to_intel_svm_dev(sva); + intel_svm_unbind_mm(sdev->dev, sdev->pasid); + mutex_unlock(&pasid_mutex); +} + +int intel_svm_get_pasid(struct iommu_sva *sva) +{ + struct intel_svm_dev *sdev; + int pasid; + + mutex_lock(&pasid_mutex); + sdev = to_intel_svm_dev(sva); + pasid = sdev->pasid; + mutex_unlock(&pasid_mutex); + + return pasid; +} diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index e7ef3b866cb0..628aa211687d 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -726,6 +726,10 @@ extern int intel_svm_finish_prq(struct intel_iommu *iommu); int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, struct iommu_gpasid_bind_data *data); int intel_svm_unbind_gpasid(struct device *dev, int pasid); +struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, + void *drvdata); +void intel_svm_unbind(struct iommu_sva *handle); +int intel_svm_get_pasid(struct iommu_sva *handle); struct svm_dev_ops; struct intel_svm_dev { @@ -733,6 +737,8 @@ struct intel_svm_dev { struct rcu_head rcu; struct device *dev; struct svm_dev_ops *ops; + struct iommu_sva sva; + int pasid; int users; u16 did; u16 dev_iotlb:1; diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 1b47ca46373e..c9e7e601950d 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -21,7 +21,6 @@ struct svm_dev_ops { #define SVM_REQ_EXEC (1<<1) #define SVM_REQ_PRIV (1<<0) - /* * The SVM_FLAG_PRIVATE_PASID flag requests a PASID which is *not* the "main" * PASID for the current process. Even if a PASID already exists, a new one @@ -57,89 +56,4 @@ struct svm_dev_ops { */ #define SVM_FLAG_GUEST_PASID (1<<3) -#ifdef CONFIG_INTEL_IOMMU_SVM - -/** - * intel_svm_bind_mm() - Bind the current process to a PASID - * @dev: Device to be granted access - * @pasid: Address for allocated PASID - * @flags: Flags. Later for requesting supervisor mode, etc. - * @ops: Callbacks to device driver - * - * This function attempts to enable PASID support for the given device. - * If the @pasid argument is non-%NULL, a PASID is allocated for access - * to the MM of the current process. - * - * By using a %NULL value for the @pasid argument, this function can - * be used to simply validate that PASID support is available for the - * given device — i.e. that it is behind an IOMMU which has the - * requisite support, and is enabled. - * - * Page faults are handled transparently by the IOMMU code, and there - * should be no need for the device driver to be involved. If a page - * fault cannot be handled (i.e. is an invalid address rather than - * just needs paging in), then the page request will be completed by - * the core IOMMU code with appropriate status, and the device itself - * can then report the resulting fault to its driver via whatever - * mechanism is appropriate. - * - * Multiple calls from the same process may result in the same PASID - * being re-used. A reference count is kept. 
- */ -extern int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, - struct svm_dev_ops *ops); - -/** - * intel_svm_unbind_mm() - Unbind a specified PASID - * @dev: Device for which PASID was allocated - * @pasid: PASID value to be unbound - * - * This function allows a PASID to be retired when the device no - * longer requires access to the address space of a given process. - * - * If the use count for the PASID in question reaches zero, the - * PASID is revoked and may no longer be used by hardware. - * - * Device drivers are required to ensure that no access (including - * page requests) is currently outstanding for the PASID in question, - * before calling this function. - */ -extern int intel_svm_unbind_mm(struct device *dev, int pasid); - -/** - * intel_svm_is_pasid_valid() - check if pasid is valid - * @dev: Device for which PASID was allocated - * @pasid: PASID value to be checked - * - * This function checks if the specified pasid is still valid. A - * valid pasid means the backing mm is still having a valid user. - * For kernel callers init_mm is always valid. for other mm, if mm->mm_users - * is non-zero, it is valid. - * - * returns -EINVAL if invalid pasid, 0 if pasid ref count is invalid - * 1 if pasid is valid. - */ -extern int intel_svm_is_pasid_valid(struct device *dev, int pasid); - -#else /* CONFIG_INTEL_IOMMU_SVM */ - -static inline int intel_svm_bind_mm(struct device *dev, int *pasid, - int flags, struct svm_dev_ops *ops) -{ - return -ENOSYS; -} - -static inline int intel_svm_unbind_mm(struct device *dev, int pasid) -{ - BUG(); -} - -static inline int intel_svm_is_pasid_valid(struct device *dev, int pasid) -{ - return -EINVAL; -} -#endif /* CONFIG_INTEL_IOMMU_SVM */ - -#define intel_svm_available(dev) (!intel_svm_bind_mm((dev), NULL, 0, NULL)) - #endif /* __INTEL_SVM_H__ */ -- Gitee From ce8dccf1af2e9d1677de8f8223d6eb42ec7948d0 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:55 +0800 Subject: [PATCH 0274/2161] iommu/vt-d: Multiple descriptors per qi_submit_sync() commit 8a1d824625402b3ef3c3e5965663354ff0394d86 upstream. Current qi_submit_sync() only supports single invalidation descriptor per submission and appends wait descriptor after each submission to poll the hardware completion. This extends the qi_submit_sync() helper to support multiple descriptors, and add an option so that the caller could specify the Page-request Drain (PD) bit in the wait descriptor. 
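As an illustration of the extended interface, a hypothetical caller (function name and values invented for this sketch, not code added by the patch) can now queue two PASID-based IOTLB invalidations in one submission and have the appended wait descriptor request a page-request drain:

#include <linux/intel-iommu.h>

static void example_flush_two_pasids(struct intel_iommu *iommu, u16 did,
				     u32 pasid0, u32 pasid1)
{
	struct qi_desc desc[2] = {};

	/* Flush all pages of each PASID; qw1..qw3 stay zero. */
	desc[0].qw0 = QI_EIOTLB_PASID(pasid0) | QI_EIOTLB_DID(did) |
		      QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | QI_EIOTLB_TYPE;
	desc[1].qw0 = QI_EIOTLB_PASID(pasid1) | QI_EIOTLB_DID(did) |
		      QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | QI_EIOTLB_TYPE;

	/* One wait descriptor is appended and polled for the whole batch. */
	qi_submit_sync(iommu, desc, 2, QI_OPT_WAIT_DRAIN);
}

Existing single-descriptor callers simply pass count = 1 and options = 0.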
Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200516062101.29541-13-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dmar.c | 63 +++++++++++++++++------------ drivers/iommu/intel-pasid.c | 4 +- drivers/iommu/intel-svm.c | 6 +-- drivers/iommu/intel_irq_remapping.c | 2 +- include/linux/intel-iommu.h | 9 ++++- 5 files changed, 52 insertions(+), 32 deletions(-) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 0a6870f9c912..1d406f2a81ff 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1175,12 +1175,11 @@ static inline void reclaim_free_desc(struct q_inval *qi) } } -static int qi_check_fault(struct intel_iommu *iommu, int index) +static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) { u32 fault; int head, tail; struct q_inval *qi = iommu->qi; - int wait_index = (index + 1) % QI_LENGTH; int shift = qi_shift(iommu); if (qi->desc_status[wait_index] == QI_ABORT) @@ -1243,17 +1242,21 @@ static int qi_check_fault(struct intel_iommu *iommu, int index) } /* - * Submit the queued invalidation descriptor to the remapping - * hardware unit and wait for its completion. + * Function to submit invalidation descriptors of all types to the queued + * invalidation interface(QI). Multiple descriptors can be submitted at a + * time, a wait descriptor will be appended to each submission to ensure + * hardware has completed the invalidation before return. Wait descriptors + * can be part of the submission but it will not be polled for completion. */ -int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) +int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, + unsigned int count, unsigned long options) { - int rc; struct q_inval *qi = iommu->qi; - int offset, shift, length; struct qi_desc wait_desc; int wait_index, index; unsigned long flags; + int offset, shift; + int rc, i; if (!qi) return 0; @@ -1262,32 +1265,41 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) rc = 0; raw_spin_lock_irqsave(&qi->q_lock, flags); - while (qi->free_cnt < 3) { + /* + * Check if we have enough empty slots in the queue to submit, + * the calculation is based on: + * # of desc + 1 wait desc + 1 space between head and tail + */ + while (qi->free_cnt < count + 2) { raw_spin_unlock_irqrestore(&qi->q_lock, flags); cpu_relax(); raw_spin_lock_irqsave(&qi->q_lock, flags); } index = qi->free_head; - wait_index = (index + 1) % QI_LENGTH; + wait_index = (index + count) % QI_LENGTH; shift = qi_shift(iommu); - length = 1 << shift; - qi->desc_status[index] = qi->desc_status[wait_index] = QI_IN_USE; + for (i = 0; i < count; i++) { + offset = ((index + i) % QI_LENGTH) << shift; + memcpy(qi->desc + offset, &desc[i], 1 << shift); + qi->desc_status[(index + i) % QI_LENGTH] = QI_IN_USE; + } + qi->desc_status[wait_index] = QI_IN_USE; - offset = index << shift; - memcpy(qi->desc + offset, desc, length); wait_desc.qw0 = QI_IWD_STATUS_DATA(QI_DONE) | QI_IWD_STATUS_WRITE | QI_IWD_TYPE; + if (options & QI_OPT_WAIT_DRAIN) + wait_desc.qw0 |= QI_IWD_PRQ_DRAIN; wait_desc.qw1 = virt_to_phys(&qi->desc_status[wait_index]); wait_desc.qw2 = 0; wait_desc.qw3 = 0; offset = wait_index << shift; - memcpy(qi->desc + offset, &wait_desc, length); + memcpy(qi->desc + offset, &wait_desc, 1 << shift); - qi->free_head = (qi->free_head + 2) % QI_LENGTH; - qi->free_cnt -= 2; + qi->free_head = (qi->free_head + count + 1) % QI_LENGTH; + qi->free_cnt 
-= count + 1; /* * update the HW tail register indicating the presence of @@ -1303,7 +1315,7 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) * a deadlock where the interrupt context can wait indefinitely * for free slots in the queue. */ - rc = qi_check_fault(iommu, index); + rc = qi_check_fault(iommu, index, wait_index); if (rc) break; @@ -1312,7 +1324,8 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) raw_spin_lock(&qi->q_lock); } - qi->desc_status[index] = QI_DONE; + for (i = 0; i < count; i++) + qi->desc_status[(index + i) % QI_LENGTH] = QI_DONE; reclaim_free_desc(qi); raw_spin_unlock_irqrestore(&qi->q_lock, flags); @@ -1336,7 +1349,7 @@ void qi_global_iec(struct intel_iommu *iommu) desc.qw3 = 0; /* should never fail */ - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, @@ -1350,7 +1363,7 @@ void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, @@ -1374,7 +1387,7 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, @@ -1396,7 +1409,7 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } /* PASID-based IOTLB invalidation */ @@ -1437,7 +1450,7 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, QI_EIOTLB_AM(mask); } - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } /* PASID-based device IOTLB Invalidate */ @@ -1466,7 +1479,7 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, if (size_order) desc.qw1 |= QI_DEV_EIOTLB_SIZE; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, @@ -1476,7 +1489,7 @@ void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, desc.qw0 = QI_PC_PASID(pasid) | QI_PC_DID(did) | QI_PC_GRAN(granu) | QI_PC_TYPE; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } /* diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index c46a068142b9..45e9b5b291bc 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -438,7 +438,7 @@ pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } static void @@ -452,7 +452,7 @@ iotlb_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid) desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } static void diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index dfabe6610b32..080280a655af 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -113,7 +113,7 @@ static void __flush_svm_range_dev(struct intel_svm *svm, } desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, svm->iommu); + qi_submit_sync(svm->iommu, &desc, 1, 0); if (sdev->dev_iotlb) { desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) | @@ -137,7 +137,7 @@ static void __flush_svm_range_dev(struct intel_svm *svm, } desc.qw2 = 0; desc.qw3 = 0; - 
qi_submit_sync(&desc, svm->iommu); + qi_submit_sync(svm->iommu, &desc, 1, 0); } } @@ -864,7 +864,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) sizeof(req->priv_data)); resp.qw2 = 0; resp.qw3 = 0; - qi_submit_sync(&resp, iommu); + qi_submit_sync(iommu, &resp, 1, 0); } head = (head + sizeof(*req)) & PRQ_RING_MASK; } diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 5dcc81b1df62..a3fe281a3a5b 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -151,7 +151,7 @@ static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask) desc.qw2 = 0; desc.qw3 = 0; - return qi_submit_sync(&desc, iommu); + return qi_submit_sync(iommu, &desc, 1, 0); } static int modify_irte(struct irq_2_iommu *irq_iommu, diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 628aa211687d..3c3de7869e1d 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -333,6 +333,7 @@ enum { #define QI_IWD_STATUS_DATA(d) (((u64)d) << 32) #define QI_IWD_STATUS_WRITE (((u64)1) << 5) +#define QI_IWD_PRQ_DRAIN (((u64)1) << 7) #define QI_IOTLB_DID(did) (((u64)did) << 16) #define QI_IOTLB_DR(dr) (((u64)dr) << 7) @@ -705,7 +706,13 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, int pasid); -extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); +int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, + unsigned int count, unsigned long options); +/* + * Options used in qi_submit_sync: + * QI_OPT_WAIT_DRAIN - Wait for PRQ drain completion, spec 6.5.2.8. + */ +#define QI_OPT_WAIT_DRAIN BIT(0) extern int dmar_ir_support(void); -- Gitee From 5f41b84c39669c856caa2811c6096733078d4cc9 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:56 +0800 Subject: [PATCH 0275/2161] iommu/vt-d: debugfs: Add support to show inv queue internals commit 4c0fa5bfca7eba479002f0a1ecd1bf7631b2f5da upstream. Export invalidation queue internals of each iommu device through the debugfs. 
Example of such dump on a Skylake machine: $ sudo cat /sys/kernel/debug/iommu/intel/invalidation_queue Invalidation queue on IOMMU: dmar1 Base: 0x1672c9000 Head: 80 Tail: 80 Index qw0 qw1 status 0 0000000000000004 0000000000000000 0000000000000000 1 0000000200000025 00000001672be804 0000000000000000 2 0000000000000011 0000000000000000 0000000000000000 3 0000000200000025 00000001672be80c 0000000000000000 4 00000000000000d2 0000000000000000 0000000000000000 5 0000000200000025 00000001672be814 0000000000000000 6 0000000000000014 0000000000000000 0000000000000000 7 0000000200000025 00000001672be81c 0000000000000000 8 0000000000000014 0000000000000000 0000000000000000 9 0000000200000025 00000001672be824 0000000000000000 Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200516062101.29541-14-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu-debugfs.c | 62 +++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/drivers/iommu/intel-iommu-debugfs.c b/drivers/iommu/intel-iommu-debugfs.c index 3eb1fe240fb0..cf1ebb98e418 100644 --- a/drivers/iommu/intel-iommu-debugfs.c +++ b/drivers/iommu/intel-iommu-debugfs.c @@ -372,6 +372,66 @@ static int domain_translation_struct_show(struct seq_file *m, void *unused) } DEFINE_SHOW_ATTRIBUTE(domain_translation_struct); +static void invalidation_queue_entry_show(struct seq_file *m, + struct intel_iommu *iommu) +{ + int index, shift = qi_shift(iommu); + struct qi_desc *desc; + int offset; + + if (ecap_smts(iommu->ecap)) + seq_puts(m, "Index\t\tqw0\t\t\tqw1\t\t\tqw2\t\t\tqw3\t\t\tstatus\n"); + else + seq_puts(m, "Index\t\tqw0\t\t\tqw1\t\t\tstatus\n"); + + for (index = 0; index < QI_LENGTH; index++) { + offset = index << shift; + desc = iommu->qi->desc + offset; + if (ecap_smts(iommu->ecap)) + seq_printf(m, "%5d\t%016llx\t%016llx\t%016llx\t%016llx\t%016x\n", + index, desc->qw0, desc->qw1, + desc->qw2, desc->qw3, + iommu->qi->desc_status[index]); + else + seq_printf(m, "%5d\t%016llx\t%016llx\t%016x\n", + index, desc->qw0, desc->qw1, + iommu->qi->desc_status[index]); + } +} + +static int invalidation_queue_show(struct seq_file *m, void *unused) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + unsigned long flags; + struct q_inval *qi; + int shift; + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + qi = iommu->qi; + shift = qi_shift(iommu); + + if (!qi || !ecap_qis(iommu->ecap)) + continue; + + seq_printf(m, "Invalidation queue on IOMMU: %s\n", iommu->name); + + raw_spin_lock_irqsave(&qi->q_lock, flags); + seq_printf(m, " Base: 0x%llx\tHead: %lld\tTail: %lld\n", + (u64)virt_to_phys(qi->desc), + dmar_readq(iommu->reg + DMAR_IQH_REG) >> shift, + dmar_readq(iommu->reg + DMAR_IQT_REG) >> shift); + invalidation_queue_entry_show(m, iommu); + raw_spin_unlock_irqrestore(&qi->q_lock, flags); + seq_putc(m, '\n'); + } + rcu_read_unlock(); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(invalidation_queue); + #ifdef CONFIG_IRQ_REMAP static void ir_tbl_remap_entry_show(struct seq_file *m, struct intel_iommu *iommu) @@ -490,6 +550,8 @@ void __init intel_iommu_debugfs_init(void) debugfs_create_file("domain_translation_struct", 0444, intel_iommu_debug, NULL, &domain_translation_struct_fops); + debugfs_create_file("invalidation_queue", 0444, intel_iommu_debug, + NULL, &invalidation_queue_fops); #ifdef CONFIG_IRQ_REMAP debugfs_create_file("ir_translation_struct", 0444, intel_iommu_debug, NULL, &ir_translation_struct_fops); -- Gitee From 
ed5787624ffabd93683a4694f8124cbd8b5279bd Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:57 +0800 Subject: [PATCH 0276/2161] iommu/vt-d: Disable non-recoverable fault processing before unbind commit 37e91bd4b39922b31ca4e6c4eabb0d7140b14e74 upstream. When a PASID is used for SVA by the device, it's possible that the PASID entry is cleared before the device flushes all ongoing DMA requests. The IOMMU should tolerate and ignore the non-recoverable faults caused by the untranslated requests from this device. For example, when an exception happens, the process terminates before the device driver stops DMA and call IOMMU driver to unbind PASID. The flow of process exist is as follows: do_exit() { exit_mm() { mm_put(); exit_mmap() { intel_invalidate_range() //mmu notifier tlb_finish_mmu() mmu_notifier_release(mm) { intel_iommu_release() { [2] intel_iommu_teardown_pasid(); intel_iommu_flush_tlbs(); } } unmap_vmas(); free_pgtables(); }; } exit_files(tsk) { close_files() { dsa_close(); [1] dsa_stop_dma(); intel_svm_unbind_pasid(); } } } Care must be taken on VT-d to avoid unrecoverable faults between the time window of [1] and [2]. [Process exist flow was contributed by Jacob Pan.] Intel VT-d provides such function through the FPD bit of the PASID entry. This sets FPD bit when PASID entry is changing from present to nonpresent in the mm notifier and will clear it when the pasid is unbound. Signed-off-by: Lu Baolu Reviewed-by: Jacob Pan Link: https://lore.kernel.org/r/20200516062101.29541-15-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 4 ++-- drivers/iommu/intel-pasid.c | 26 +++++++++++++++++++++----- drivers/iommu/intel-pasid.h | 4 +++- drivers/iommu/intel-svm.c | 9 ++++++--- 4 files changed, 32 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index dcd87e911c68..9c194106d720 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5033,7 +5033,7 @@ static void __dmar_remove_one_dev_info(struct device_domain_info *info) if (info->dev) { if (dev_is_pci(info->dev) && sm_supported(iommu)) intel_pasid_tear_down_entry(iommu, info->dev, - PASID_RID2PASID); + PASID_RID2PASID, false); iommu_disable_dev_iotlb(info); if (!dev_is_real_dma_subdevice(info->dev)) @@ -5263,7 +5263,7 @@ static void aux_domain_remove_dev(struct dmar_domain *domain, auxiliary_unlink_device(domain, dev); spin_lock(&iommu->lock); - intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid); + intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false); domain_detach_iommu(domain, iommu); spin_unlock(&iommu->lock); diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index 45e9b5b291bc..25d749830500 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -292,7 +292,20 @@ static inline void pasid_clear_entry(struct pasid_entry *pe) WRITE_ONCE(pe->val[7], 0); } -static void intel_pasid_clear_entry(struct device *dev, int pasid) +static inline void pasid_clear_entry_with_fpd(struct pasid_entry *pe) +{ + WRITE_ONCE(pe->val[0], PASID_PTE_FPD); + WRITE_ONCE(pe->val[1], 0); + WRITE_ONCE(pe->val[2], 0); + WRITE_ONCE(pe->val[3], 0); + WRITE_ONCE(pe->val[4], 0); + WRITE_ONCE(pe->val[5], 0); + WRITE_ONCE(pe->val[6], 0); + WRITE_ONCE(pe->val[7], 0); +} + +static void +intel_pasid_clear_entry(struct device *dev, int pasid, bool fault_ignore) { struct pasid_entry *pe; @@ -300,7 +313,10 @@ static void 
intel_pasid_clear_entry(struct device *dev, int pasid) if (WARN_ON(!pe)) return; - pasid_clear_entry(pe); + if (fault_ignore && pasid_pte_is_present(pe)) + pasid_clear_entry_with_fpd(pe); + else + pasid_clear_entry(pe); } static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits) @@ -473,8 +489,8 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu, qi_flush_dev_iotlb(iommu, sid, pfsid, qdep, 0, 64 - VTD_PAGE_SHIFT); } -void intel_pasid_tear_down_entry(struct intel_iommu *iommu, - struct device *dev, int pasid) +void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, + int pasid, bool fault_ignore) { struct pasid_entry *pte; u16 did; @@ -484,7 +500,7 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, return; did = pasid_get_domain_id(pte); - intel_pasid_clear_entry(dev, pasid); + intel_pasid_clear_entry(dev, pasid, fault_ignore); if (!ecap_coherent(iommu->ecap)) clflush_cache_range(pte, sizeof(*pte)); diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h index a41b09b3ffde..c5318d40e0fa 100644 --- a/drivers/iommu/intel-pasid.h +++ b/drivers/iommu/intel-pasid.h @@ -15,6 +15,7 @@ #define PASID_MAX 0x100000 #define PASID_PTE_MASK 0x3F #define PASID_PTE_PRESENT 1 +#define PASID_PTE_FPD 2 #define PDE_PFN_MASK PAGE_MASK #define PASID_PDE_SHIFT 6 #define MAX_NR_PASID_BITS 20 @@ -120,7 +121,8 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct iommu_gpasid_bind_data_vtd *pasid_data, struct dmar_domain *domain, int addr_width); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, - struct device *dev, int pasid); + struct device *dev, int pasid, + bool fault_ignore); int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid); void vcmd_free_pasid(struct intel_iommu *iommu, unsigned int pasid); #endif /* __INTEL_PASID_H */ diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 080280a655af..99073e71cfa5 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -225,7 +225,8 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) */ rcu_read_lock(); list_for_each_entry_rcu(sdev, &svm->devs, list) { - intel_pasid_tear_down_entry(svm->iommu, sdev->dev, svm->pasid); + intel_pasid_tear_down_entry(svm->iommu, sdev->dev, + svm->pasid, true); intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); } rcu_read_unlock(); @@ -414,7 +415,8 @@ int intel_svm_unbind_gpasid(struct device *dev, int pasid) sdev->users--; if (!sdev->users) { list_del_rcu(&sdev->list); - intel_pasid_tear_down_entry(iommu, dev, svm->pasid); + intel_pasid_tear_down_entry(iommu, dev, + svm->pasid, false); intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); /* TODO: Drain in flight PRQ for the PASID since it * may get reused soon, we don't want to @@ -657,7 +659,8 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) * to use. We have a *shared* PASID table, because it's * large and has to be physically contiguous. So it's * hard to be as defensive as we might like. */ - intel_pasid_tear_down_entry(iommu, dev, svm->pasid); + intel_pasid_tear_down_entry(iommu, dev, + svm->pasid, false); intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); kfree_rcu(sdev, rcu); -- Gitee From ed4c8b75e7fb92b501e0ef399ceed0050a786e51 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:58 +0800 Subject: [PATCH 0277/2161] iommu/vt-d: Add page request draining support commit 66ac4db36f4c12a3b02db864ccb0801cd938b6de upstream. 
When a PASID is stopped or terminated, there can be pending PRQs (requests that haven't received responses) in remapping hardware. This adds the interface to drain page requests and call it when a PASID is terminated. Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-16-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-svm.c | 107 ++++++++++++++++++++++++++++++++++-- include/linux/intel-iommu.h | 4 ++ 2 files changed, 106 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 99073e71cfa5..b8ab4a8656cc 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -23,6 +23,7 @@ #include "intel-pasid.h" static irqreturn_t prq_event_thread(int irq, void *d); +static void intel_svm_drain_prq(struct device *dev, int pasid); #define PRQ_ORDER 0 @@ -66,6 +67,8 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); + init_completion(&iommu->prq_complete); + return 0; } @@ -417,12 +420,8 @@ int intel_svm_unbind_gpasid(struct device *dev, int pasid) list_del_rcu(&sdev->list); intel_pasid_tear_down_entry(iommu, dev, svm->pasid, false); + intel_svm_drain_prq(dev, svm->pasid); intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); - /* TODO: Drain in flight PRQ for the PASID since it - * may get reused soon, we don't want to - * confuse with its previous life. - * intel_svm_drain_prq(dev, pasid); - */ kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { @@ -661,6 +660,7 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) * hard to be as defensive as we might like. */ intel_pasid_tear_down_entry(iommu, dev, svm->pasid, false); + intel_svm_drain_prq(dev, svm->pasid); intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); kfree_rcu(sdev, rcu); @@ -739,6 +739,93 @@ static bool is_canonical_address(u64 addr) return (((saddr << shift) >> shift) == saddr); } +/** + * intel_svm_drain_prq - Drain page requests and responses for a pasid + * @dev: target device + * @pasid: pasid for draining + * + * Drain all pending page requests and responses related to @pasid in both + * software and hardware. This is supposed to be called after the device + * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB + * and DevTLB have been invalidated. + * + * It waits until all pending page requests for @pasid in the page fault + * queue are completed by the prq handling thread. Then follow the steps + * described in VT-d spec CH7.10 to drain all page requests and page + * responses pending in the hardware. + */ +static void intel_svm_drain_prq(struct device *dev, int pasid) +{ + struct device_domain_info *info; + struct dmar_domain *domain; + struct intel_iommu *iommu; + struct qi_desc desc[3]; + struct pci_dev *pdev; + int head, tail; + u16 sid, did; + int qdep; + + info = get_domain_info(dev); + if (WARN_ON(!info || !dev_is_pci(dev))) + return; + + if (!info->pri_enabled) + return; + + iommu = info->iommu; + domain = info->domain; + pdev = to_pci_dev(dev); + sid = PCI_DEVID(info->bus, info->devfn); + did = domain->iommu_did[iommu->seq_id]; + qdep = pci_ats_queue_depth(pdev); + + /* + * Check and wait until all pending page requests in the queue are + * handled by the prq handling thread. 
+ */ +prq_retry: + reinit_completion(&iommu->prq_complete); + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + while (head != tail) { + struct page_req_dsc *req; + + req = &iommu->prq[head / sizeof(*req)]; + if (!req->pasid_present || req->pasid != pasid) { + head = (head + sizeof(*req)) & PRQ_RING_MASK; + continue; + } + + wait_for_completion(&iommu->prq_complete); + goto prq_retry; + } + + /* + * Perform steps described in VT-d spec CH7.10 to drain page + * requests and responses in hardware. + */ + memset(desc, 0, sizeof(desc)); + desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) | + QI_IWD_FENCE | + QI_IWD_TYPE; + desc[1].qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | + QI_EIOTLB_TYPE; + desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) | + QI_DEV_EIOTLB_SID(sid) | + QI_DEV_EIOTLB_QDEP(qdep) | + QI_DEIOTLB_TYPE | + QI_DEV_IOTLB_PFSID(info->pfsid); +qi_retry: + reinit_completion(&iommu->prq_complete); + qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { + wait_for_completion(&iommu->prq_complete); + goto qi_retry; + } +} + static irqreturn_t prq_event_thread(int irq, void *d) { struct intel_iommu *iommu = d; @@ -874,6 +961,16 @@ static irqreturn_t prq_event_thread(int irq, void *d) dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); + /* + * Clear the page request overflow bit and wake up all threads that + * are waiting for the completion of this handling. + */ + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) + writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); + + if (!completion_done(&iommu->prq_complete)) + complete(&iommu->prq_complete); + return IRQ_RETVAL(handled); } diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 3c3de7869e1d..45eab051de56 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -292,6 +292,8 @@ /* PRS_REG */ #define DMA_PRS_PPR ((u32)1) +#define DMA_PRS_PRO ((u32)2) + #define DMA_VCS_PAS ((u64)1) #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \ @@ -333,6 +335,7 @@ enum { #define QI_IWD_STATUS_DATA(d) (((u64)d) << 32) #define QI_IWD_STATUS_WRITE (((u64)1) << 5) +#define QI_IWD_FENCE (((u64)1) << 6) #define QI_IWD_PRQ_DRAIN (((u64)1) << 7) #define QI_IOTLB_DID(did) (((u64)did) << 16) @@ -582,6 +585,7 @@ struct intel_iommu { #ifdef CONFIG_INTEL_IOMMU_SVM struct page_req_dsc *prq; unsigned char prq_name[16]; /* Name for PRQ interrupt */ + struct completion prq_complete; struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */ #endif struct q_inval *qi; /* Queued invalidation info */ -- Gitee From de2ff9a5249fd9bf5df55152ad89e85e26e731ef Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:59 +0800 Subject: [PATCH 0278/2161] iommu/vt-d: Remove redundant IOTLB flush commit 81ebd91a436b87158b2ab6c71a51395316b147dc upstream. IOTLB flush already included in the PASID tear down and the page request drain process. There is no need to flush again. 
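The resulting teardown order in the unbind paths, sketched for reference (the invalidations listed are the ones already issued by the called helpers, per the description above):

/*
 *   intel_pasid_tear_down_entry()  - clears the PASID entry and flushes
 *                                    the caches for that PASID
 *   intel_svm_drain_prq()          - drains in-flight page requests and
 *                                    submits IOTLB/DevTLB invalidations
 *                                    as part of the spec CH7.10 drain
 *
 * so the separate intel_flush_svm_range_dev() call becomes redundant.
 */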
Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200516062101.29541-17-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-svm.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index b8ab4a8656cc..6652f0f76e41 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -227,11 +227,9 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) * *has* to handle gracefully without affecting other processes. */ rcu_read_lock(); - list_for_each_entry_rcu(sdev, &svm->devs, list) { + list_for_each_entry_rcu(sdev, &svm->devs, list) intel_pasid_tear_down_entry(svm->iommu, sdev->dev, svm->pasid, true); - intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); - } rcu_read_unlock(); } @@ -421,7 +419,6 @@ int intel_svm_unbind_gpasid(struct device *dev, int pasid) intel_pasid_tear_down_entry(iommu, dev, svm->pasid, false); intel_svm_drain_prq(dev, svm->pasid); - intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { @@ -661,7 +658,6 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) intel_pasid_tear_down_entry(iommu, dev, svm->pasid, false); intel_svm_drain_prq(dev, svm->pasid); - intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { -- Gitee From 3c627ff4fe3fc18e41f6e730e111f6d2b3912e6f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:21:00 +0800 Subject: [PATCH 0279/2161] iommu/vt-d: Remove duplicated check in intel_svm_bind_mm() commit 7482fd59259a7f23e191c14b8a126a0f6981b3e4 upstream. The info and info->pasid_support have already been checked in previous intel_iommu_enable_pasid() call. No need to check again. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-18-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-svm.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 6652f0f76e41..8594d878bdb8 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -519,11 +519,6 @@ intel_svm_bind_mm(struct device *dev, int flags, struct svm_dev_ops *ops, } info = get_domain_info(dev); - if (!info || !info->pasid_supported) { - kfree(sdev); - goto out; - } - sdev->did = FLPT_DEFAULT_DID; sdev->sid = PCI_DEVID(info->bus, info->devfn); if (info->ats_enabled) { -- Gitee From 459f33744b0c716626a74fe1ef4a5e237a496a68 Mon Sep 17 00:00:00 2001 From: Tom Murphy Date: Sat, 16 May 2020 14:21:01 +0800 Subject: [PATCH 0280/2161] iommu/vt-d: Remove IOVA handling code from the non-dma_ops path commit e70b081c6f376471d7a9fee69e12e8f05ac2925d upstream. There's no need for the non-dma_ops path to keep track of IOVAs. The whole point of the non-dma_ops path is that it allows the IOVAs to be handled separately. The IOVA handling code removed in this patch is pointless. 
Signed-off-by: Tom Murphy Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-19-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 95 +++++++++++++------------------------ 1 file changed, 32 insertions(+), 63 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 9c194106d720..b91f9a1d5b76 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1898,11 +1898,6 @@ static int dmar_init_reserved_ranges(void) return 0; } -static void domain_reserve_special_ranges(struct dmar_domain *domain) -{ - copy_reserved_iova(&reserved_iova_list, &domain->iovad); -} - static inline int guestwidth_to_adjustwidth(int gaw) { int agaw; @@ -1924,7 +1919,8 @@ static void domain_exit(struct dmar_domain *domain) domain_remove_dev_info(domain); /* destroy iovas */ - put_iova_domain(&domain->iovad); + if (domain->domain.type == IOMMU_DOMAIN_DMA) + put_iova_domain(&domain->iovad); if (domain->pgd) { struct page *freelist; @@ -2648,19 +2644,9 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, } static int iommu_domain_identity_map(struct dmar_domain *domain, - unsigned long long start, - unsigned long long end) + unsigned long first_vpfn, + unsigned long last_vpfn) { - unsigned long first_vpfn = start >> VTD_PAGE_SHIFT; - unsigned long last_vpfn = end >> VTD_PAGE_SHIFT; - - if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn), - dma_to_mm_pfn(last_vpfn))) { - pr_err("Reserving iova failed\n"); - return -ENOMEM; - } - - pr_debug("Mapping reserved region %llx-%llx\n", start, end); /* * RMRR range might have overlap with physical memory range, * clear it first @@ -2698,7 +2684,8 @@ static int __init si_domain_init(int hw) for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { ret = iommu_domain_identity_map(si_domain, - PFN_PHYS(start_pfn), PFN_PHYS(end_pfn)); + mm_to_dma_pfn(start_pfn), + mm_to_dma_pfn(end_pfn)); if (ret) return ret; } @@ -4574,58 +4561,37 @@ static int intel_iommu_memory_notifier(struct notifier_block *nb, unsigned long val, void *v) { struct memory_notify *mhp = v; - unsigned long long start, end; - unsigned long start_vpfn, last_vpfn; + unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); + unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + + mhp->nr_pages - 1); switch (val) { case MEM_GOING_ONLINE: - start = mhp->start_pfn << PAGE_SHIFT; - end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1; - if (iommu_domain_identity_map(si_domain, start, end)) { - pr_warn("Failed to build identity map for [%llx-%llx]\n", - start, end); + if (iommu_domain_identity_map(si_domain, + start_vpfn, last_vpfn)) { + pr_warn("Failed to build identity map for [%lx-%lx]\n", + start_vpfn, last_vpfn); return NOTIFY_BAD; } break; case MEM_OFFLINE: case MEM_CANCEL_ONLINE: - start_vpfn = mm_to_dma_pfn(mhp->start_pfn); - last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1); - while (start_vpfn <= last_vpfn) { - struct iova *iova; + { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; struct page *freelist; - iova = find_iova(&si_domain->iovad, start_vpfn); - if (iova == NULL) { - pr_debug("Failed get IOVA for PFN %lx\n", - start_vpfn); - break; - } - - iova = split_and_remove_iova(&si_domain->iovad, iova, - start_vpfn, last_vpfn); - if (iova == NULL) { - pr_warn("Failed to split IOVA PFN [%lx-%lx]\n", - start_vpfn, last_vpfn); - return NOTIFY_BAD; - } - - freelist = 
domain_unmap(si_domain, iova->pfn_lo, - iova->pfn_hi); + freelist = domain_unmap(si_domain, + start_vpfn, last_vpfn); rcu_read_lock(); for_each_active_iommu(iommu, drhd) iommu_flush_iotlb_psi(iommu, si_domain, - iova->pfn_lo, iova_size(iova), + start_vpfn, mhp->nr_pages, !freelist, 0); rcu_read_unlock(); dma_free_pagelist(freelist); - - start_vpfn = iova->pfn_hi + 1; - free_iova_mem(iova); } break; } @@ -4653,8 +4619,9 @@ static void free_all_cpu_cached_iovas(unsigned int cpu) for (did = 0; did < cap_ndoms(iommu->cap); did++) { domain = get_iommu_domain(iommu, (u16)did); - if (!domain) + if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA) continue; + free_cpu_cached_iovas(cpu, &domain->iovad); } } @@ -5066,9 +5033,6 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) { int adjust_width; - init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN); - domain_reserve_special_ranges(domain); - /* calculate AGAW */ domain->gaw = guest_width; adjust_width = guestwidth_to_adjustwidth(guest_width); @@ -5087,11 +5051,21 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) return 0; } +static void intel_init_iova_domain(struct dmar_domain *dmar_domain) +{ + init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN); + copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad); + + if (!intel_iommu_strict && + init_iova_flush_queue(&dmar_domain->iovad, + iommu_flush_iova, iova_entry_free)) + pr_info("iova flush queue initialization failed\n"); +} + static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) { struct dmar_domain *dmar_domain; struct iommu_domain *domain; - int ret; switch (type) { case IOMMU_DOMAIN_DMA: @@ -5108,13 +5082,8 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) return NULL; } - if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) { - ret = init_iova_flush_queue(&dmar_domain->iovad, - iommu_flush_iova, - iova_entry_free); - if (ret) - pr_info("iova flush queue initialization failed\n"); - } + if (type == IOMMU_DOMAIN_DMA) + intel_init_iova_domain(dmar_domain); domain_update_iommu_cap(dmar_domain); -- Gitee From a3dd06ffd583ba5364f126cdaa163181f37125ea Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 May 2020 15:03:40 +0200 Subject: [PATCH 0281/2161] iommu: Fix deferred domain attachment commit bd421264ed307dd296eab036851221b225071a32 upstream. The IOMMU core code has support for deferring the attachment of a domain to a device. This is needed in kdump kernels where the new domain must not be attached to a device before the device driver takes it over. When the AMD IOMMU driver got converted to use the dma-iommu implementation, the deferred attaching got lost. The code in dma-iommu.c has support for deferred attaching, but it calls into iommu_attach_device() to actually do it. But iommu_attach_device() will check if the device should be deferred in it code-path and do nothing, breaking deferred attachment. Move the is_deferred_attach() check out of the attach_device path and into iommu_group_add_device() to make deferred attaching work from the dma-iommu code. 
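In short (function names as used in this patch and in dma-iommu.c):

/*
 * Before: the deferred path in dma-iommu called iommu_attach_device(),
 *         but __iommu_attach_device() itself checked
 *         is_attach_deferred() and returned early, so the real attach
 *         never happened.
 *
 * After:  iommu_group_add_device() skips the initial attach when
 *         iommu_is_attach_deferred() is true, and the later
 *         iommu_attach_device() call from dma-iommu performs the
 *         actual attach.
 */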
Fixes: 795bbbb9b6f8 ("iommu/dma-iommu: Handle deferred devices") Reported-by: Jerry Snitselaar Suggested-by: Robin Murphy Signed-off-by: Joerg Roedel Tested-by: Jerry Snitselaar Cc: Jerry Snitselaar Cc: Tom Murphy Cc: Robin Murphy Link: https://lore.kernel.org/r/20200519130340.14564-1-joro@8bytes.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 9e831d328afa..3c3bedc1a365 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -769,6 +769,15 @@ static int iommu_create_device_direct_mappings(struct iommu_group *group, return ret; } +static bool iommu_is_attach_deferred(struct iommu_domain *domain, + struct device *dev) +{ + if (domain->ops->is_attach_deferred) + return domain->ops->is_attach_deferred(domain, dev); + + return false; +} + /** * iommu_group_add_device - add a device to an iommu group * @group: the group into which to add the device (reference should be held) @@ -821,7 +830,7 @@ int iommu_group_add_device(struct iommu_group *group, struct device *dev) mutex_lock(&group->mutex); list_add_tail(&device->list, &group->devices); - if (group->domain) + if (group->domain && !iommu_is_attach_deferred(group->domain, dev)) ret = __iommu_attach_device(group->domain, dev); mutex_unlock(&group->mutex); if (ret) @@ -1902,9 +1911,6 @@ static int __iommu_attach_device(struct iommu_domain *domain, struct device *dev) { int ret; - if ((domain->ops->is_attach_deferred != NULL) && - domain->ops->is_attach_deferred(domain, dev)) - return 0; if (unlikely(domain->ops->attach_dev == NULL)) return -ENODEV; @@ -1976,8 +1982,7 @@ EXPORT_SYMBOL_GPL(iommu_sva_unbind_gpasid); static void __iommu_detach_device(struct iommu_domain *domain, struct device *dev) { - if ((domain->ops->is_attach_deferred != NULL) && - domain->ops->is_attach_deferred(domain, dev)) + if (iommu_is_attach_deferred(domain, dev)) return; if (unlikely(domain->ops->detach_dev == NULL)) -- Gitee From aee73801727c34e84572f0f9e8c418f1ae1427a8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 May 2020 15:28:24 +0200 Subject: [PATCH 0282/2161] iommu: Don't call .probe_finalize() under group->mutex commit 70b8170e55d3ca9503a53211967faee6b5f18b19 upstream. The .probe_finalize() call-back of some IOMMU drivers calls into arm_iommu_attach_device(). This function will call back into the IOMMU core code, where it tries to take group->mutex again, resulting in a deadlock. As there is no reason why .probe_finalize() needs to be called under that mutex, move it after the lock has been released to fix the deadlock. 
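The deadlock, as a call-chain sketch (arm_iommu_attach_device() is the example named above; other probe_finalize implementations that re-enter the core behave the same way):

/*
 *   bus_iommu_probe()
 *     mutex_lock(&group->mutex)
 *       __iommu_group_dma_attach()
 *         ops->probe_finalize(dev)
 *           arm_iommu_attach_device()
 *             ... back into the IOMMU core, which tries to take
 *             group->mutex again -> deadlock
 *
 * With this change probe_finalize() runs from
 * __iommu_group_dma_finalize(), after group->mutex has been dropped.
 */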
Fixes: deac0b3bed26 ("iommu: Split off default domain allocation from group assignment") Reported-by: Yong Wu Tested-by: Yong Wu Signed-off-by: Joerg Roedel Cc: Yong Wu Link: https://lore.kernel.org/r/20200519132824.15163-1-joro@8bytes.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 3c3bedc1a365..cfe63cc50522 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1692,17 +1692,8 @@ static void probe_alloc_default_domain(struct bus_type *bus, static int iommu_group_do_dma_attach(struct device *dev, void *data) { struct iommu_domain *domain = data; - const struct iommu_ops *ops; - int ret; - - ret = __iommu_attach_device(domain, dev); - - ops = domain->ops; - - if (ret == 0 && ops->probe_finalize) - ops->probe_finalize(dev); - return ret; + return __iommu_attach_device(domain, dev); } static int __iommu_group_dma_attach(struct iommu_group *group) @@ -1711,6 +1702,21 @@ static int __iommu_group_dma_attach(struct iommu_group *group) iommu_group_do_dma_attach); } +static int iommu_group_do_probe_finalize(struct device *dev, void *data) +{ + struct iommu_domain *domain = data; + + if (domain->ops->probe_finalize) + domain->ops->probe_finalize(dev); + + return 0; +} + +static void __iommu_group_dma_finalize(struct iommu_group *group) +{ + __iommu_group_for_each_dev(group, group->default_domain, + iommu_group_do_probe_finalize); +} static int iommu_do_create_direct_mappings(struct device *dev, void *data) { struct iommu_group *group = data; @@ -1763,6 +1769,8 @@ int bus_iommu_probe(struct bus_type *bus) if (ret) break; + + __iommu_group_dma_finalize(group); } return ret; -- Gitee From 1b6e6f8bc114627b4ab96278da686b5409ed36ba Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 19 May 2020 09:34:23 +0800 Subject: [PATCH 0283/2161] iommu/vt-d: Fix pointer cast warnings on 32 bit commit bfe6240dfe4f16c20db94bc7c0ab9ffa316fb926 upstream. Pointers should be casted to unsigned long to avoid "cast from pointer to integer of different size" warnings. 
drivers/iommu/intel-pasid.c:818:6: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] drivers/iommu/intel-pasid.c:821:9: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] drivers/iommu/intel-pasid.c:824:23: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] drivers/iommu/intel-svm.c:343:45: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] Fixes: b0d1f8741b81 ("iommu/vt-d: Add nested translation helper function") Fixes: 56722a4398a3 ("iommu/vt-d: Add bind guest PASID support") Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200519013423.11971-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-pasid.c | 8 ++++---- drivers/iommu/intel-svm.c | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index 25d749830500..c81f0f17c6ba 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -815,13 +815,13 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, } /* First level PGD is in GPA, must be supported by the second level */ - if ((unsigned long long)gpgd > domain->max_addr) { + if ((uintptr_t)gpgd > domain->max_addr) { dev_err_ratelimited(dev, - "Guest PGD %llx not supported, max %llx\n", - (unsigned long long)gpgd, domain->max_addr); + "Guest PGD %lx not supported, max %llx\n", + (uintptr_t)gpgd, domain->max_addr); return -EINVAL; } - pasid_set_flptr(pte, (u64)gpgd); + pasid_set_flptr(pte, (uintptr_t)gpgd); ret = intel_pasid_setup_bind_data(iommu, pte, pasid_data); if (ret) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 8594d878bdb8..05092ddcef89 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -358,7 +358,8 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, * call the nested mode setup function here. */ spin_lock(&iommu->lock); - ret = intel_pasid_setup_nested(iommu, dev, (pgd_t *)data->gpgd, + ret = intel_pasid_setup_nested(iommu, dev, + (pgd_t *)(uintptr_t)data->gpgd, data->hpasid, &data->vtd, dmar_domain, data->addr_width); spin_unlock(&iommu->lock); -- Gitee From 4f9f3fc3bb06f7393b5aaa0d7c5b75a1d6dd2441 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 21 May 2020 17:50:30 -0400 Subject: [PATCH 0284/2161] iommu/vt-d: fix a GCC warning commit 7809c4d5805b1d331acfe0047dd9695691d428d4 upstream. 
The commit 6ee1b77ba3ac ("iommu/vt-d: Add svm/sva invalidate function") introduced a GCC warning, drivers/iommu/intel-iommu.c:5330:1: warning: 'static' is not at beginning of declaration [-Wold-style-declaration] const static int ^~~~~ Fixes: 6ee1b77ba3ac0 ("iommu/vt-d: Add svm/sva invalidate function") Signed-off-by: Qian Cai Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20200521215030.16938-1-cai@lca.pw Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index b91f9a1d5b76..42dd0fd8d5d4 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5356,7 +5356,7 @@ static void intel_iommu_aux_detach_device(struct iommu_domain *domain, * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR] */ -const static int +static const int inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = { /* * PASID based IOTLB invalidation: PASID selective (per PASID), -- Gitee From a49d7f588a78ae48b76a2a5b6990e957f8e19534 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 18 Nov 2021 17:33:52 +0800 Subject: [PATCH 0285/2161] iommu: Don't take group reference in iommu_alloc_default_domain() commit 79659190ee972c05498c338e48d80cb45490c533 upstream. The iommu_alloc_default_domain() function takes a reference to an IOMMU group without releasing it. This causes the group to never be released, with undefined side effects. The function has only one call-site, which takes a group reference on its own, so to fix this leak, do not take another reference in iommu_alloc_default_domain() and pass the group as a function parameter instead. Fixes: 6e1aa2049154 ("iommu: Move default domain allocation to iommu_probe_device()") Reported-by: Sai Prakash Ranjan Signed-off-by: Joerg Roedel Tested-by: Sai Prakash Ranjan Cc: Sai Prakash Ranjan Link: https://lore.kernel.org/r/20200525130122.380-1-joro@8bytes.org Reference: https://lore.kernel.org/lkml/20200522130145.30067-1-saiprakash.ranjan@codeaurora.org/ Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index cfe63cc50522..acd5f575946d 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -80,7 +80,8 @@ static bool iommu_cmd_line_dma_api(void) return !!(iommu_cmd_line & IOMMU_CMD_LINE_DMA_API); } -static int iommu_alloc_default_domain(struct device *dev); +static int iommu_alloc_default_domain(struct iommu_group *group, + struct device *dev); static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, unsigned type); static int __iommu_attach_device(struct iommu_domain *domain, @@ -251,17 +252,17 @@ int iommu_probe_device(struct device *dev) if (ret) goto err_out; + group = iommu_group_get(dev); + if (!group) + goto err_release; + /* * Try to allocate a default domain - needs support from the * IOMMU driver. There are still some drivers which don't * support default domains, so the return value is not yet * checked. 
*/ - iommu_alloc_default_domain(dev); - - group = iommu_group_get(dev); - if (!group) - goto err_release; + iommu_alloc_default_domain(group, dev); if (group->default_domain) ret = __iommu_attach_device(group->default_domain, dev); @@ -1487,16 +1488,12 @@ static int iommu_group_alloc_default_domain(struct bus_type *bus, return 0; } -static int iommu_alloc_default_domain(struct device *dev) +static int iommu_alloc_default_domain(struct iommu_group *group, + struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; - struct iommu_group *group; unsigned int type = iommu_def_domain_type; - group = iommu_group_get(dev); - if (!group) - return -ENODEV; - if (group->default_domain) return 0; -- Gitee From 617b274212707a4b4d981c66e46dcc49b95408ea Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 20 May 2020 17:22:01 +0200 Subject: [PATCH 0286/2161] iommu/amd: Use pci_ats_supported() commit 7a441b2110527851f630144ec76ab8409e1d6c61 upstream. The pci_ats_supported() function checks if a device supports ATS and is allowed to use it. In addition to checking that the device has an ATS capability and that the global pci=noats is not set (pci_ats_disabled()), it also checks if a device is untrusted. A device is untrusted if it is plugged into an external-facing port such as Thunderbolt and could be spoofing an existing device to exploit weaknesses in the IOMMU configuration. By calling pci_ats_supported() we keep DTE[I]=0 for untrusted devices and abort transactions with Pretranslated Addresses. Signed-off-by: Jean-Philippe Brucker Reviewed-by: Joerg Roedel Link: https://lore.kernel.org/r/20200520152201.3309416-3-jean-philippe@linaro.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index eeedb0a690d3..8c7ddbf63e3d 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -311,16 +311,15 @@ static struct iommu_group *acpihid_device_group(struct device *dev) static bool pci_iommuv2_capable(struct pci_dev *pdev) { static const int caps[] = { - PCI_EXT_CAP_ID_ATS, PCI_EXT_CAP_ID_PRI, PCI_EXT_CAP_ID_PASID, }; int i, pos; - if (pci_ats_disabled()) + if (!pci_ats_supported(pdev)) return false; - for (i = 0; i < 3; ++i) { + for (i = 0; i < 2; ++i) { pos = pci_find_ext_capability(pdev, caps[i]); if (pos == 0) return false; @@ -3072,11 +3071,8 @@ int amd_iommu_device_info(struct pci_dev *pdev, memset(info, 0, sizeof(*info)); - if (!pci_ats_disabled()) { - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS); - if (pos) - info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; - } + if (pci_ats_supported(pdev)) + info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); if (pos) -- Gitee From f32bbc218c3cd8895ee0d29ce90fa44335f561eb Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 20 May 2020 17:22:03 +0200 Subject: [PATCH 0287/2161] iommu/vt-d: Use pci_ats_supported() commit da656a042568ffbc30881c43a832277f275eea4a upstream. The pci_ats_supported() helper checks if a device supports ATS and is allowed to use it. By checking the ATS capability it also integrates the pci_ats_disabled() check from pci_ats_init(). Simplify the vt-d checks. 
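As a hedged sketch of the call-site simplification this helper enables (the wrapper below is invented for illustration; only pci_ats_supported() and pci_enable_ats() come from the series):

/* Illustrative wrapper, not from the series: the ATS capability check,
 * the global pci=noats check and the "untrusted" check are all folded
 * into pci_ats_supported(), so a driver no longer open-codes them
 * before enabling ATS. */
static bool example_try_enable_ats(struct pci_dev *pdev, int page_shift)
{
        if (!pci_ats_supported(pdev))
                return false;   /* no ATS cap, pci=noats, or untrusted device */

        return pci_enable_ats(pdev, page_shift) == 0;
}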
Signed-off-by: Jean-Philippe Brucker Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20200520152201.3309416-5-jean-philippe@linaro.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-iommu.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 42dd0fd8d5d4..bcd14c3adf49 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1438,8 +1438,7 @@ static void iommu_enable_dev_iotlb(struct device_domain_info *info) !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) info->pri_enabled = 1; #endif - if (!pdev->untrusted && info->ats_supported && - pci_ats_page_aligned(pdev) && + if (info->ats_supported && pci_ats_page_aligned(pdev) && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { info->ats_enabled = 1; domain_update_iotlb(info->domain); @@ -2549,10 +2548,8 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, if (dev && dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(info->dev); - if (!pdev->untrusted && - !pci_ats_disabled() && - ecap_dev_iotlb_support(iommu->ecap) && - pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) && + if (ecap_dev_iotlb_support(iommu->ecap) && + pci_ats_supported(pdev) && dmar_find_matched_atsr_unit(pdev)) info->ats_supported = 1; -- Gitee From 933f1fbc3b6b47fdb5159b2a464871c3b4e51c7f Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 20 May 2020 17:22:00 +0200 Subject: [PATCH 0288/2161] PCI/ATS: Only enable ATS for trusted devices commit 521376741b2c26fe53a1ec24d02da24d477eb739 upstream. Add pci_ats_supported(), which checks whether a device has an ATS capability, and whether it is trusted. A device is untrusted if it is plugged into an external-facing port such as Thunderbolt and could be spoofing an existing device to exploit weaknesses in the IOMMU configuration. PCIe ATS is one such weaknesses since it allows endpoints to cache IOMMU translations and emit transactions with 'Translated' Address Type (10b) that partially bypass the IOMMU translation. The SMMUv3 and VT-d IOMMU drivers already disallow ATS and transactions with 'Translated' Address Type for untrusted devices. Add the check to pci_enable_ats() to let other drivers (AMD IOMMU for now) benefit from it. By checking ats_cap, the pci_ats_supported() helper also returns whether ATS was globally disabled with pci=noats, and could later include more things, for example whether the whole PCIe hierarchy down to the endpoint supports ATS. Signed-off-by: Jean-Philippe Brucker Reviewed-by: Joerg Roedel Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20200520152201.3309416-2-jean-philippe@linaro.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/ats.c | 18 +++++++++++++++++- include/linux/pci.h | 3 +++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index e18499243f84..20e59b971bc7 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -30,6 +30,22 @@ void pci_ats_init(struct pci_dev *dev) dev->ats_cap = pos; } +/** + * pci_ats_supported - check if the device can use ATS + * @dev: the PCI device + * + * Returns true if the device supports ATS and is allowed to use it, false + * otherwise. 
+ */ +bool pci_ats_supported(struct pci_dev *dev) +{ + if (!dev->ats_cap) + return false; + + return (dev->untrusted == 0); +} +EXPORT_SYMBOL_GPL(pci_ats_supported); + /** * pci_enable_ats - enable the ATS capability * @dev: the PCI device @@ -42,7 +58,7 @@ int pci_enable_ats(struct pci_dev *dev, int ps) u16 ctrl; struct pci_dev *pdev; - if (!dev->ats_cap) + if (!pci_ats_supported(dev)) return -EINVAL; if (WARN_ON(dev->ats_enabled)) diff --git a/include/linux/pci.h b/include/linux/pci.h index b70d1e0565a7..0811885de1c9 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1794,11 +1794,14 @@ pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, #ifdef CONFIG_PCI_ATS /* Address Translation Service */ +bool pci_ats_supported(struct pci_dev *dev); int pci_enable_ats(struct pci_dev *dev, int ps); void pci_disable_ats(struct pci_dev *dev); int pci_ats_queue_depth(struct pci_dev *dev); int pci_ats_page_aligned(struct pci_dev *dev); #else +static inline bool pci_ats_supported(struct pci_dev *d) +{ return false; } static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; } static inline void pci_disable_ats(struct pci_dev *d) { } static inline int pci_ats_queue_depth(struct pci_dev *d) { return -ENODEV; } -- Gitee From e755e6087e952ee2e778927c1a9d7fa354a1b972 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 20 May 2020 17:22:02 +0200 Subject: [PATCH 0289/2161] iommu/arm-smmu-v3: Use pci_ats_supported() commit 0b2527a654190a987d45e2cc9e5c6946eea11fc5 upstream. The new pci_ats_supported() function checks if a device supports ATS and is allowed to use it. Signed-off-by: Jean-Philippe Brucker Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200520152201.3309416-4-jean-philippe@linaro.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/arm-smmu-v3.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 4736d8612d30..2870790e125c 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -2370,26 +2370,20 @@ static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master) } } -#ifdef CONFIG_PCI_ATS static bool arm_smmu_ats_supported(struct arm_smmu_master *master) { - struct pci_dev *pdev; + struct device *dev = master->dev; struct arm_smmu_device *smmu = master->smmu; - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(master->dev); + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - if (!(smmu->features & ARM_SMMU_FEAT_ATS) || !dev_is_pci(master->dev) || - !(fwspec->flags & IOMMU_FWSPEC_PCI_RC_ATS) || pci_ats_disabled()) + if (!(smmu->features & ARM_SMMU_FEAT_ATS)) return false; - pdev = to_pci_dev(master->dev); - return !pdev->untrusted && pdev->ats_cap; -} -#else -static bool arm_smmu_ats_supported(struct arm_smmu_master *master) -{ - return false; + if (!(fwspec->flags & IOMMU_FWSPEC_PCI_RC_ATS)) + return false; + + return dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)); } -#endif static void arm_smmu_enable_ats(struct arm_smmu_master *master) { -- Gitee From 9071fe328135100c92cf61d7ce0105b45e3b228e Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 23 Apr 2020 14:53:30 +0200 Subject: [PATCH 0290/2161] iommu: Remove iommu_sva_ops::mm_exit() commit edcc40d2ab5f47f205c2dd2a9aeedd8c77de050a upstream. After binding a device to an mm, device drivers currently need to register a mm_exit handler. 
This function is called when the mm exits, to gracefully stop DMA targeting the address space and flush page faults to the IOMMU. This is deemed too complex for the MMU release() notifier, which may be triggered by any mmput() invocation, from about 120 callsites [1]. The upcoming SVA module has an example of such complexity: the I/O Page Fault handler would need to call mmput_async() instead of mmput() after handling an IOPF, to avoid triggering the release() notifier which would in turn drain the IOPF queue and lock up. Another concern is the DMA stop function taking too long, up to several minutes [2]. For some mmput() callers this may disturb other users. For example, if the OOM killer picks the mm bound to a device as the victim and that mm's memory is locked, if the release() takes too long, it might choose additional innocent victims to kill. To simplify the MMU release notifier, don't forward the notification to device drivers. Since they don't stop DMA on mm exit anymore, the PASID lifetime is extended: (1) The device driver calls bind(). A PASID is allocated. Here any DMA fault is handled by mm, and on error we don't print anything to dmesg. Userspace can easily trigger errors by issuing DMA on unmapped buffers. (2) exit_mmap(), for example the process took a SIGKILL. This step doesn't happen during normal operations. Remove the pgd from the PASID table, since the page tables are about to be freed. Invalidate the IOTLBs. Here the device may still perform DMA on the address space. Incoming transactions are aborted but faults aren't printed out. ATS Translation Requests return Successful Translation Completions with R=W=0. PRI Page Requests return with Invalid Request. (3) The device driver stops DMA, possibly following release of a fd, and calls unbind(). PASID table is cleared, IOTLB invalidated if necessary. The page fault queues are drained, and the PASID is freed. If DMA for that PASID is still running here, something went seriously wrong and errors should be reported. For now remove iommu_sva_ops entirely. We might need to re-introduce them at some point, for example to notify device drivers of unhandled IOPF. 
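A hedged sketch of the resulting driver-side lifecycle (the my_dev_* hooks are hypothetical; only the iommu_sva_* calls are part of the API touched here): bind and program the PASID, and at teardown stop DMA before unbinding, with no mm_exit callback registered anywhere.

#include <linux/err.h>
#include <linux/iommu.h>

/* Hypothetical device-specific hooks, assumed only for this sketch. */
static void my_dev_start_dma(struct device *dev, int pasid);
static void my_dev_stop_dma(struct device *dev);

static struct iommu_sva *example_sva_start(struct device *dev,
                                           struct mm_struct *mm)
{
        struct iommu_sva *handle;

        handle = iommu_sva_bind_device(dev, mm, NULL);  /* step (1): no ops */
        if (IS_ERR(handle))
                return handle;

        my_dev_start_dma(dev, iommu_sva_get_pasid(handle));
        return handle;
}

static void example_sva_stop(struct device *dev, struct iommu_sva *handle)
{
        my_dev_stop_dma(dev);            /* step (3): quiesce DMA first...   */
        iommu_sva_unbind_device(handle); /* ...then drain faults, free PASID */
}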
[1] https://lore.kernel.org/linux-iommu/20200306174239.GM31668@ziepe.ca/ [2] https://lore.kernel.org/linux-iommu/4d68da96-0ad5-b412-5987-2f7a6aa796c3@amd.com/ Signed-off-by: Jean-Philippe Brucker Acked-by: Jacob Pan Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20200423125329.782066-3-jean-philippe@linaro.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 11 ----------- include/linux/iommu.h | 30 ------------------------------ 2 files changed, 41 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index acd5f575946d..3bc0574f53f8 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2832,17 +2832,6 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) } EXPORT_SYMBOL_GPL(iommu_sva_unbind_device); -int iommu_sva_set_ops(struct iommu_sva *handle, - const struct iommu_sva_ops *sva_ops) -{ - if (handle->ops && handle->ops != sva_ops) - return -EEXIST; - - handle->ops = sva_ops; - return 0; -} -EXPORT_SYMBOL_GPL(iommu_sva_set_ops); - int iommu_sva_get_pasid(struct iommu_sva *handle) { const struct iommu_ops *ops = handle->dev->bus->iommu_ops; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 96acc9731fd7..eb76974d2d18 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -53,8 +53,6 @@ struct iommu_fault_event; typedef int (*iommu_fault_handler_t)(struct iommu_domain *, struct device *, unsigned long, int, void *); -typedef int (*iommu_mm_exit_handler_t)(struct device *dev, struct iommu_sva *, - void *); typedef int (*iommu_dev_fault_handler_t)(struct iommu_fault *, void *); struct iommu_domain_geometry { @@ -171,25 +169,6 @@ enum iommu_dev_features { #define IOMMU_PASID_INVALID (-1U) -/** - * struct iommu_sva_ops - device driver callbacks for an SVA context - * - * @mm_exit: called when the mm is about to be torn down by exit_mmap. After - * @mm_exit returns, the device must not issue any more transaction - * with the PASID given as argument. - * - * The @mm_exit handler is allowed to sleep. Be careful about the - * locks taken in @mm_exit, because they might lead to deadlocks if - * they are also held when dropping references to the mm. Consider the - * following call chain: - * mutex_lock(A); mmput(mm) -> exit_mm() -> @mm_exit() -> mutex_lock(A) - * Using mmput_async() prevents this scenario. 
- * - */ -struct iommu_sva_ops { - iommu_mm_exit_handler_t mm_exit; -}; - #ifdef CONFIG_IOMMU_API /** @@ -611,7 +590,6 @@ struct iommu_fwspec { */ struct iommu_sva { struct device *dev; - const struct iommu_sva_ops *ops; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, @@ -659,8 +637,6 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, void *drvdata); void iommu_sva_unbind_device(struct iommu_sva *handle); -int iommu_sva_set_ops(struct iommu_sva *handle, - const struct iommu_sva_ops *ops); int iommu_sva_get_pasid(struct iommu_sva *handle); #else /* CONFIG_IOMMU_API */ @@ -1054,12 +1030,6 @@ static inline void iommu_sva_unbind_device(struct iommu_sva *handle) { } -static inline int iommu_sva_set_ops(struct iommu_sva *handle, - const struct iommu_sva_ops *ops) -{ - return -EINVAL; -} - static inline int iommu_sva_get_pasid(struct iommu_sva *handle) { return IOMMU_PASID_INVALID; -- Gitee From 435d80f53c68fcfb170d50a1582bcb690203d2ad Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 28 May 2020 11:03:51 -0700 Subject: [PATCH 0291/2161] iommu/vt-d: Fix compile warning commit e7598fac323aad0e502415edeffd567315994dd6 upstream. Make intel_svm_unbind_mm() a static function. Fixes: 064a57d7ddfc ("iommu/vt-d: Replace intel SVM APIs with generic SVA APIs") Reported-by: kbuild test robot Signed-off-by: Jacob Pan Acked-by: Lu Baolu Link: https://lore.kernel.org/r/1590689031-79318-1-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel-svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 05092ddcef89..b444f45a6b17 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -619,7 +619,7 @@ intel_svm_bind_mm(struct device *dev, int flags, struct svm_dev_ops *ops, } /* Caller must hold pasid_mutex */ -int intel_svm_unbind_mm(struct device *dev, int pasid) +static int intel_svm_unbind_mm(struct device *dev, int pasid) { struct intel_svm_dev *sdev; struct intel_iommu *iommu; -- Gitee From ad52f4cd2d31e511c5633b3588f4989dbde901b2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 4 Jun 2020 11:19:44 +0200 Subject: [PATCH 0292/2161] iommu: Check for deferred attach in iommu_group_do_dma_attach() commit 431275afdc7155415254aef4bd3816a1b8a2ead0 upstream. The iommu_group_do_dma_attach() must not attach devices which have deferred_attach set. Otherwise devices could cause IOMMU faults when re-initialized in a kdump kernel. 
Fixes: deac0b3bed26 ("iommu: Split off default domain allocation from group assignment") Reported-by: Jerry Snitselaar Signed-off-by: Joerg Roedel Tested-by: Jerry Snitselaar Reviewed-by: Jerry Snitselaar Link: https://lore.kernel.org/r/20200604091944.26402-1-joro@8bytes.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 3bc0574f53f8..a203223b382d 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1689,8 +1689,12 @@ static void probe_alloc_default_domain(struct bus_type *bus, static int iommu_group_do_dma_attach(struct device *dev, void *data) { struct iommu_domain *domain = data; + int ret = 0; - return __iommu_attach_device(domain, dev); + if (!iommu_is_attach_deferred(domain, dev)) + ret = __iommu_attach_device(domain, dev); + + return ret; } static int __iommu_group_dma_attach(struct iommu_group *group) -- Gitee From b40e3f2cc6ddb263fcdfe9a228cb5af9945073bd Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 9 Jun 2020 15:03:03 +0200 Subject: [PATCH 0293/2161] iommu/vt-d: Move Intel IOMMU driver into subdirectory commit 672cf6df9b8a3a6d70a6a5c30397f76fa40d3178 upstream. Move all files related to the Intel IOMMU driver into its own subdirectory. Signed-off-by: Joerg Roedel Reviewed-by: Jerry Snitselaar Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20200609130303.26974-3-joro@8bytes.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- MAINTAINERS | 3 +-- drivers/iommu/Makefile | 12 ++++++------ .../iommu/{intel-iommu-debugfs.c => intel/debugfs.c} | 0 drivers/iommu/{ => intel}/dmar.c | 2 +- drivers/iommu/{ => intel}/intel-pasid.h | 0 drivers/iommu/{intel-iommu.c => intel/iommu.c} | 2 +- .../{intel_irq_remapping.c => intel/irq_remapping.c} | 2 +- drivers/iommu/{intel-pasid.c => intel/pasid.c} | 0 drivers/iommu/{intel-svm.c => intel/svm.c} | 0 drivers/iommu/{intel-trace.c => intel/trace.c} | 0 10 files changed, 10 insertions(+), 11 deletions(-) rename drivers/iommu/{intel-iommu-debugfs.c => intel/debugfs.c} (100%) rename drivers/iommu/{ => intel}/dmar.c (99%) rename drivers/iommu/{ => intel}/intel-pasid.h (100%) rename drivers/iommu/{intel-iommu.c => intel/iommu.c} (99%) rename drivers/iommu/{intel_irq_remapping.c => intel/irq_remapping.c} (99%) rename drivers/iommu/{intel-pasid.c => intel/pasid.c} (100%) rename drivers/iommu/{intel-svm.c => intel/svm.c} (100%) rename drivers/iommu/{intel-trace.c => intel/trace.c} (100%) diff --git a/MAINTAINERS b/MAINTAINERS index 974064e82ca9..a48b00bf7639 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8305,8 +8305,7 @@ M: Lu Baolu L: iommu@lists.linux-foundation.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git S: Supported -F: drivers/iommu/dmar.c -F: drivers/iommu/intel*.[ch] +F: drivers/iommu/intel/ F: include/linux/intel-iommu.h F: include/linux/intel-svm.h diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 35d17094fe3b..43af91354c82 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -16,13 +16,13 @@ obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += amd_iommu_debugfs.o obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o obj-$(CONFIG_ARM_SMMU) += arm-smmu.o arm-smmu-impl.o obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o -obj-$(CONFIG_DMAR_TABLE) += dmar.o -obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o intel-pasid.o -obj-$(CONFIG_INTEL_IOMMU) += intel-trace.o -obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += intel-iommu-debugfs.o 
-obj-$(CONFIG_INTEL_IOMMU_SVM) += intel-svm.o +obj-$(CONFIG_DMAR_TABLE) += intel/dmar.o +obj-$(CONFIG_INTEL_IOMMU) += intel/iommu.o intel/pasid.o +obj-$(CONFIG_INTEL_IOMMU) += intel/trace.o +obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += intel/debugfs.o +obj-$(CONFIG_INTEL_IOMMU_SVM) += intel/svm.o obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o -obj-$(CONFIG_IRQ_REMAP) += intel_irq_remapping.o irq_remapping.o +obj-$(CONFIG_IRQ_REMAP) += intel/irq_remapping.o irq_remapping.o obj-$(CONFIG_MTK_IOMMU) += mtk_iommu.o obj-$(CONFIG_MTK_IOMMU_V1) += mtk_iommu_v1.o obj-$(CONFIG_OMAP_IOMMU) += omap-iommu.o diff --git a/drivers/iommu/intel-iommu-debugfs.c b/drivers/iommu/intel/debugfs.c similarity index 100% rename from drivers/iommu/intel-iommu-debugfs.c rename to drivers/iommu/intel/debugfs.c diff --git a/drivers/iommu/dmar.c b/drivers/iommu/intel/dmar.c similarity index 99% rename from drivers/iommu/dmar.c rename to drivers/iommu/intel/dmar.c index 1d406f2a81ff..c008c4a6eaa5 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -32,7 +32,7 @@ #include #include -#include "irq_remapping.h" +#include "../irq_remapping.h" typedef int (*dmar_res_handler_t)(struct acpi_dmar_header *, void *); struct dmar_res_callback { diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel/intel-pasid.h similarity index 100% rename from drivers/iommu/intel-pasid.h rename to drivers/iommu/intel/intel-pasid.h diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel/iommu.c similarity index 99% rename from drivers/iommu/intel-iommu.c rename to drivers/iommu/intel/iommu.c index bcd14c3adf49..b61caf38859f 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -47,7 +47,7 @@ #include #include -#include "irq_remapping.h" +#include "../irq_remapping.h" #include "intel-pasid.h" #define ROOT_SIZE VTD_PAGE_SIZE diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel/irq_remapping.c similarity index 99% rename from drivers/iommu/intel_irq_remapping.c rename to drivers/iommu/intel/irq_remapping.c index a3fe281a3a5b..33f70c448599 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -21,7 +21,7 @@ #include #include -#include "irq_remapping.h" +#include "../irq_remapping.h" enum irq_mode { IRQ_REMAPPING, diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel/pasid.c similarity index 100% rename from drivers/iommu/intel-pasid.c rename to drivers/iommu/intel/pasid.c diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel/svm.c similarity index 100% rename from drivers/iommu/intel-svm.c rename to drivers/iommu/intel/svm.c diff --git a/drivers/iommu/intel-trace.c b/drivers/iommu/intel/trace.c similarity index 100% rename from drivers/iommu/intel-trace.c rename to drivers/iommu/intel/trace.c -- Gitee From 14d7d3d86ec4fbd71f050d5ad2156c9b6fb4d207 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 23 Jun 2020 07:13:41 +0800 Subject: [PATCH 0294/2161] iommu/vt-d: Set U/S bit in first level page table by default commit 16ecf10e815d70d11d2300243f4a3b4c7c5acac7 upstream. When using first-level translation for IOVA, currently the U/S bit in the page table is cleared which implies DMA requests with user privilege are blocked. 
As the result, following error messages might be observed when passing through a device to user level: DMAR: DRHD: handling fault status reg 3 DMAR: [DMA Read] Request device [41:00.0] PASID 1 fault addr 7ecdcd000 [fault reason 129] SM: U/S set 0 for first-level translation with user privilege This fixes it by setting U/S bit in the first level page table and makes IOVA over first level compatible with previous second-level translation. Fixes: b802d070a52a1 ("iommu/vt-d: Use iova over first level") Reported-by: Xin Zeng Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200622231345.29722-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 5 ++--- include/linux/intel-iommu.h | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index b61caf38859f..30eb5de66e93 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -927,7 +927,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; if (domain_use_first_level(domain)) - pteval |= DMA_FL_PTE_XD; + pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US; if (cmpxchg64(&pte->val, 0ULL, pteval)) /* Someone else set it while we were thinking; use theirs. */ free_pgtable_page(tmp_page); @@ -1957,7 +1957,6 @@ static inline void context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) { context->hi |= pasid & ((1 << 20) - 1); - context->hi |= (1 << 20); } /* @@ -2250,7 +2249,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); if (domain_use_first_level(domain)) - attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD; + attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US; if (!sg) { sg_res = nr_pages; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 45eab051de56..545427a4fc48 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -41,6 +41,7 @@ #define DMA_PTE_SNP BIT_ULL(11) #define DMA_FL_PTE_PRESENT BIT_ULL(0) +#define DMA_FL_PTE_US BIT_ULL(2) #define DMA_FL_PTE_XD BIT_ULL(63) #define ADDR_WIDTH_5LEVEL (57) -- Gitee From 925cf2dbff2b218b4f6582bd2f8c2e6dbabd7633 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 23 Jun 2020 07:13:45 +0800 Subject: [PATCH 0295/2161] iommu/vt-d: Fix misuse of iommu_domain_identity_map() commit 48f0bcfb7aad2c6eb4c1e66476b58475aa14393e upstream. The iommu_domain_identity_map() helper takes start/end PFN as arguments. Fix a misuse case where the start and end addresses are passed. 
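A quick worked example of the unit mix-up (hedged; the numbers are invented, assuming 4 KiB pages so PAGE_SHIFT == 12):

    start = 0x00100000, end = 0x001fffff      /* a 1 MiB physical range */
    start >> PAGE_SHIFT = 0x100               /* first PFN expected     */
    end   >> PAGE_SHIFT = 0x1ff               /* last PFN expected      */

mm_to_dma_pfn() then rescales these MM page frame numbers to VT-d 4 KiB frames. Passing the byte addresses straight through, as the buggy call did, asks iommu_domain_identity_map() for PFNs 0x00100000..0x001fffff instead, a range 4096 times too high and too wide.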
Fixes: e70b081c6f376 ("iommu/vt-d: Remove IOVA handling code from the non-dma_ops path") Reported-by: Alex Williamson Signed-off-by: Lu Baolu Reviewed-by: Jerry Snitselaar Cc: Tom Murphy Link: https://lore.kernel.org/r/20200622231345.29722-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 30eb5de66e93..6ab0f7c66a52 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2701,7 +2701,9 @@ static int __init si_domain_init(int hw) end >> agaw_to_width(si_domain->agaw))) continue; - ret = iommu_domain_identity_map(si_domain, start, end); + ret = iommu_domain_identity_map(si_domain, + mm_to_dma_pfn(start >> PAGE_SHIFT), + mm_to_dma_pfn(end >> PAGE_SHIFT)); if (ret) return ret; } -- Gitee From a2eb93680add5424c810adcf1284c66c3cc274d7 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 16 Jun 2020 16:47:14 +0200 Subject: [PATCH 0296/2161] iommu: Allow page responses without PASID commit 970471914c67b70df24def6b2a30cc42acbebded upstream. Some PCIe devices do not expect a PASID value in PRI Page Responses. If the "PRG Response PASID Required" bit in the PRI capability is zero, then the OS should not set the PASID field. Similarly on Arm SMMU, responses to stall events do not have a PASID. Currently iommu_page_response() systematically checks that the PASID in the page response corresponds to the one in the page request. This can't work with virtualization because a page response coming from a guest OS won't have a PASID if the passed-through device does not require one. Add a flag to page requests that declares whether the corresponding response needs to have a PASID. When this flag isn't set, allow page responses without PASID. Reported-by: Shameerali Kolothum Thodi Signed-off-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20200616144712.748818-1-jean-philippe@linaro.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 23 +++++++++++++++++------ include/uapi/linux/iommu.h | 6 +++++- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index a203223b382d..143bca80c023 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1185,11 +1185,12 @@ EXPORT_SYMBOL_GPL(iommu_report_device_fault); int iommu_page_response(struct device *dev, struct iommu_page_response *msg) { - bool pasid_valid; + bool needs_pasid; int ret = -EINVAL; struct iommu_fault_event *evt; struct iommu_fault_page_request *prm; struct dev_iommu *param = dev->iommu; + bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID; struct iommu_domain *domain = iommu_get_domain_for_dev(dev); if (!domain || !domain->ops->page_response) @@ -1214,14 +1215,24 @@ int iommu_page_response(struct device *dev, */ list_for_each_entry(evt, ¶m->fault_param->faults, list) { prm = &evt->fault.prm; - pasid_valid = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + if (prm->grpid != msg->grpid) + continue; - if ((pasid_valid && prm->pasid != msg->pasid) || - prm->grpid != msg->grpid) + /* + * If the PASID is required, the corresponding request is + * matched using the group ID, the PASID valid bit and the PASID + * value. Otherwise only the group ID matches request and + * response. 
+ */ + needs_pasid = prm->flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; + if (needs_pasid && (!has_pasid || msg->pasid != prm->pasid)) continue; - /* Sanitize the reply */ - msg->flags = pasid_valid ? IOMMU_PAGE_RESP_PASID_VALID : 0; + if (!needs_pasid && has_pasid) { + /* No big deal, just clear it. */ + msg->flags &= ~IOMMU_PAGE_RESP_PASID_VALID; + msg->pasid = 0; + } ret = domain->ops->page_response(dev, evt, msg); list_del(&evt->list); diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index e907b7091a46..c2b2caf9ed41 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -81,7 +81,10 @@ struct iommu_fault_unrecoverable { /** * struct iommu_fault_page_request - Page Request data * @flags: encodes whether the corresponding fields are valid and whether this - * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values) + * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values). + * When IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID is set, the page response + * must have the same PASID value as the page request. When it is clear, + * the page response should not have a PASID. * @pasid: Process Address Space ID * @grpid: Page Request Group Index * @perm: requested page permissions (IOMMU_FAULT_PERM_* values) @@ -92,6 +95,7 @@ struct iommu_fault_page_request { #define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID (1 << 0) #define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1) #define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2) +#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID (1 << 3) __u32 flags; __u32 pasid; __u32 grpid; -- Gitee From 5834fe0e8f170572fa6a239d65246192557de2d6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 25 Jun 2020 15:08:25 +0200 Subject: [PATCH 0297/2161] iommu/vt-d: Use dev_iommu_priv_get/set() commit 01b9d4e21148c89fdbab3d6b3705f9791314b631 upstream. Remove the use of dev->archdata.iommu and use the private per-device pointer provided by IOMMU core code instead. 
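For context, a sketch of the accessors' likely shape (not part of this diff, but consistent with the i915 mock-device hunk below, which pokes iommu.priv directly): thin wrappers around the per-device dev->iommu->priv pointer.

/* Hedged sketch; the exact definitions live in the core IOMMU header. */
static inline void *dev_iommu_priv_get(struct device *dev)
{
        return dev->iommu->priv;
}

static inline void dev_iommu_priv_set(struct device *dev, void *priv)
{
        dev->iommu->priv = priv;
}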
Signed-off-by: Joerg Roedel Reviewed-by: Jerry Snitselaar Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20200625130836.1916-3-joro@8bytes.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../gpu/drm/i915/selftests/mock_gem_device.c | 10 ++++++++-- drivers/iommu/intel/iommu.c | 18 +++++++++--------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c index 01a89c071bf5..8e63dd7a2f3e 100644 --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c @@ -24,6 +24,7 @@ #include #include +#include #include "gt/intel_gt.h" #include "gt/mock_engine.h" @@ -132,6 +133,9 @@ struct drm_i915_private *mock_gem_device(void) { struct drm_i915_private *i915; struct pci_dev *pdev; +#if IS_ENABLED(CONFIG_IOMMU_API) && defined(CONFIG_INTEL_IOMMU) + struct dev_iommu iommu; +#endif int err; pdev = kzalloc(sizeof(*pdev) + sizeof(*i915), GFP_KERNEL); @@ -145,8 +149,10 @@ struct drm_i915_private *mock_gem_device(void) dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); #if IS_ENABLED(CONFIG_IOMMU_API) && defined(CONFIG_INTEL_IOMMU) - /* hack to disable iommu for the fake device; force identity mapping */ - pdev->dev.archdata.iommu = (void *)-1; + /* HACK HACK HACK to disable iommu for the fake device; force identity mapping */ + memset(&iommu, 0, sizeof(iommu)); + iommu.priv = (void *)-1; + pdev->dev.iommu = &iommu; #endif i915 = (struct drm_i915_private *)(pdev + 1); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 6ab0f7c66a52..5a46295fad8c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -372,7 +372,7 @@ struct device_domain_info *get_domain_info(struct device *dev) if (!dev) return NULL; - info = dev->archdata.iommu; + info = dev_iommu_priv_get(dev); if (unlikely(info == DUMMY_DEVICE_DOMAIN_INFO || info == DEFER_DEVICE_DOMAIN_INFO)) return NULL; @@ -743,12 +743,12 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, static int iommu_dummy(struct device *dev) { - return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO; + return dev_iommu_priv_get(dev) == DUMMY_DEVICE_DOMAIN_INFO; } static bool attach_deferred(struct device *dev) { - return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO; + return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO; } /** @@ -2420,7 +2420,7 @@ static inline void unlink_domain_info(struct device_domain_info *info) list_del(&info->link); list_del(&info->global); if (info->dev) - info->dev->archdata.iommu = NULL; + dev_iommu_priv_set(info->dev, NULL); } static void domain_remove_dev_info(struct dmar_domain *domain) @@ -2453,7 +2453,7 @@ static void do_deferred_attach(struct device *dev) { struct iommu_domain *domain; - dev->archdata.iommu = NULL; + dev_iommu_priv_set(dev, NULL); domain = iommu_get_domain_for_dev(dev); if (domain) intel_iommu_attach_device(domain, dev); @@ -2599,7 +2599,7 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, list_add(&info->link, &domain->devices); list_add(&info->global, &device_domain_list); if (dev) - dev->archdata.iommu = info; + dev_iommu_priv_set(dev, info); spin_unlock_irqrestore(&device_domain_lock, flags); /* PASID table is mandatory for a PCI device in scalable mode. 
*/ @@ -4009,7 +4009,7 @@ static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev) if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); - pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO; + dev_iommu_priv_set(&pdev->dev, DUMMY_DEVICE_DOMAIN_INFO); } } DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu); @@ -4048,7 +4048,7 @@ static void __init init_no_remapping_devices(void) drhd->ignored = 1; for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) - dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO; + dev_iommu_priv_set(dev, DUMMY_DEVICE_DOMAIN_INFO); } } } @@ -5672,7 +5672,7 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) return ERR_PTR(-ENODEV); if (translation_pre_enabled(iommu)) - dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO; + dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO); return &iommu->iommu; } -- Gitee From 29e26376d49a40436242d0520897c6a2fe882254 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Fri, 3 Jul 2020 20:10:03 -0400 Subject: [PATCH 0298/2161] iommu: Fix use-after-free in iommu_release_device commit 9ac8545199a1b711f5643f535b82981faa0b4bf1 upstream. In pci_disable_sriov(), i.e., # echo 0 > /sys/class/net/enp11s0f1np1/device/sriov_numvfs iommu_release_device iommu_group_remove_device arm_smmu_domain_free kfree(smmu_domain) Later, iommu_release_device arm_smmu_release_device arm_smmu_detach_dev spin_lock_irqsave(&smmu_domain->devices_lock, would trigger an use-after-free. Fixed it by call arm_smmu_release_device() first before iommu_group_remove_device(). BUG: KASAN: use-after-free in __lock_acquire+0x3458/0x4440 __lock_acquire at kernel/locking/lockdep.c:4250 Read of size 8 at addr ffff0089df1a6f68 by task bash/3356 CPU: 5 PID: 3356 Comm: bash Not tainted 5.8.0-rc3-next-20200630 #2 Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.11 06/18/2019 Call trace: dump_backtrace+0x0/0x398 show_stack+0x14/0x20 dump_stack+0x140/0x1b8 print_address_description.isra.12+0x54/0x4a8 kasan_report+0x134/0x1b8 __asan_report_load8_noabort+0x2c/0x50 __lock_acquire+0x3458/0x4440 lock_acquire+0x204/0xf10 _raw_spin_lock_irqsave+0xf8/0x180 arm_smmu_detach_dev+0xd8/0x4a0 arm_smmu_detach_dev at drivers/iommu/arm-smmu-v3.c:2776 arm_smmu_release_device+0xb4/0x1c8 arm_smmu_disable_pasid at drivers/iommu/arm-smmu-v3.c:2754 (inlined by) arm_smmu_release_device at drivers/iommu/arm-smmu-v3.c:3000 iommu_release_device+0xc0/0x178 iommu_release_device at drivers/iommu/iommu.c:302 iommu_bus_notifier+0x118/0x160 notifier_call_chain+0xa4/0x128 __blocking_notifier_call_chain+0x70/0xa8 blocking_notifier_call_chain+0x14/0x20 device_del+0x618/0xa00 pci_remove_bus_device+0x108/0x2d8 pci_stop_and_remove_bus_device+0x1c/0x28 pci_iov_remove_virtfn+0x228/0x368 sriov_disable+0x8c/0x348 pci_disable_sriov+0x5c/0x70 mlx5_core_sriov_configure+0xd8/0x260 [mlx5_core] sriov_numvfs_store+0x240/0x318 dev_attr_store+0x38/0x68 sysfs_kf_write+0xdc/0x128 kernfs_fop_write+0x23c/0x448 __vfs_write+0x54/0xe8 vfs_write+0x124/0x3f0 ksys_write+0xe8/0x1b8 __arm64_sys_write+0x68/0x98 do_el0_svc+0x124/0x220 el0_sync_handler+0x260/0x408 el0_sync+0x140/0x180 Allocated by task 3356: save_stack+0x24/0x50 __kasan_kmalloc.isra.13+0xc4/0xe0 kasan_kmalloc+0xc/0x18 kmem_cache_alloc_trace+0x1ec/0x318 arm_smmu_domain_alloc+0x54/0x148 iommu_group_alloc_default_domain+0xc0/0x440 
iommu_probe_device+0x1c0/0x308 iort_iommu_configure+0x434/0x518 acpi_dma_configure+0xf0/0x128 pci_dma_configure+0x114/0x160 really_probe+0x124/0x6d8 driver_probe_device+0xc4/0x180 __device_attach_driver+0x184/0x1e8 bus_for_each_drv+0x114/0x1a0 __device_attach+0x19c/0x2a8 device_attach+0x10/0x18 pci_bus_add_device+0x70/0xf8 pci_iov_add_virtfn+0x7b4/0xb40 sriov_enable+0x5c8/0xc30 pci_enable_sriov+0x64/0x80 mlx5_core_sriov_configure+0x58/0x260 [mlx5_core] sriov_numvfs_store+0x1c0/0x318 dev_attr_store+0x38/0x68 sysfs_kf_write+0xdc/0x128 kernfs_fop_write+0x23c/0x448 __vfs_write+0x54/0xe8 vfs_write+0x124/0x3f0 ksys_write+0xe8/0x1b8 __arm64_sys_write+0x68/0x98 do_el0_svc+0x124/0x220 el0_sync_handler+0x260/0x408 el0_sync+0x140/0x180 Freed by task 3356: save_stack+0x24/0x50 __kasan_slab_free+0x124/0x198 kasan_slab_free+0x10/0x18 slab_free_freelist_hook+0x110/0x298 kfree+0x128/0x668 arm_smmu_domain_free+0xf4/0x1a0 iommu_group_release+0xec/0x160 kobject_put+0xf4/0x238 kobject_del+0x110/0x190 kobject_put+0x1e4/0x238 iommu_group_remove_device+0x394/0x938 iommu_release_device+0x9c/0x178 iommu_release_device at drivers/iommu/iommu.c:300 iommu_bus_notifier+0x118/0x160 notifier_call_chain+0xa4/0x128 __blocking_notifier_call_chain+0x70/0xa8 blocking_notifier_call_chain+0x14/0x20 device_del+0x618/0xa00 pci_remove_bus_device+0x108/0x2d8 pci_stop_and_remove_bus_device+0x1c/0x28 pci_iov_remove_virtfn+0x228/0x368 sriov_disable+0x8c/0x348 pci_disable_sriov+0x5c/0x70 mlx5_core_sriov_configure+0xd8/0x260 [mlx5_core] sriov_numvfs_store+0x240/0x318 dev_attr_store+0x38/0x68 sysfs_kf_write+0xdc/0x128 kernfs_fop_write+0x23c/0x448 __vfs_write+0x54/0xe8 vfs_write+0x124/0x3f0 ksys_write+0xe8/0x1b8 __arm64_sys_write+0x68/0x98 do_el0_svc+0x124/0x220 el0_sync_handler+0x260/0x408 el0_sync+0x140/0x180 The buggy address belongs to the object at ffff0089df1a6e00 which belongs to the cache kmalloc-512 of size 512 The buggy address is located 360 bytes inside of 512-byte region [ffff0089df1a6e00, ffff0089df1a7000) The buggy address belongs to the page: page:ffffffe02257c680 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff0089df1a1400 flags: 0x7ffff800000200(slab) raw: 007ffff800000200 ffffffe02246b8c8 ffffffe02257ff88 ffff000000320680 raw: ffff0089df1a1400 00000000002a000e 00000001ffffffff ffff0089df1a5001 page dumped because: kasan: bad access detected page->mem_cgroup:ffff0089df1a5001 Memory state around the buggy address: ffff0089df1a6e00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff0089df1a6e80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff0089df1a6f00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff0089df1a6f80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff0089df1a7000: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc Fixes: a6a4c7e2c5b8 ("iommu: Add probe_device() and release_device() call-backs") Signed-off-by: Qian Cai Link: https://lore.kernel.org/r/20200704001003.2303-1-cai@lca.pw Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 143bca80c023..84ac68bcfbba 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -295,10 +295,10 @@ void iommu_release_device(struct device *dev) return; iommu_device_unlink(dev->iommu->iommu_dev, dev); - iommu_group_remove_device(dev); ops->release_device(dev); + iommu_group_remove_device(dev); module_put(ops->owner); dev_iommu_free(dev); } -- Gitee From 
4a0d988a7eaf323a1730c716ee23493dcd457bd4 Mon Sep 17 00:00:00 2001 From: Rajat Jain Date: Tue, 7 Jul 2020 15:46:03 -0700 Subject: [PATCH 0299/2161] PCI: Treat "external-facing" devices themselves as internal commit 99b50be9d8ec9ef319cc7d5de07f4d405fac7764 upstream. "External-facing" devices are internal devices that expose PCIe hierarchies such as Thunderbolt outside the platform [1]. Previously these internal devices were marked as "untrusted" the same as devices downstream from them. Use the ACPI or DT information to identify external-facing devices, but only mark the devices *downstream* from them as "untrusted" [2]. The external-facing device itself is no longer marked as untrusted. [1] https://docs.microsoft.com/en-us/windows-hardware/drivers/pci/dsd-for-pcie-root-ports#identifying-externally-exposed-pcie-root-ports [2] https://lore.kernel.org/linux-pci/20200610230906.GA1528594@bjorn-Precision-5520/ Link: https://lore.kernel.org/r/20200707224604.3737893-3-rajatja@google.com Signed-off-by: Rajat Jain Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 6 +++--- drivers/pci/of.c | 2 +- drivers/pci/pci-acpi.c | 9 ++++----- drivers/pci/probe.c | 2 +- include/linux/pci.h | 6 ++++++ 5 files changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 5a46295fad8c..6600e91fb143 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4744,12 +4744,12 @@ const struct attribute_group *intel_iommu_groups[] = { NULL, }; -static inline bool has_untrusted_dev(void) +static inline bool has_external_pci(void) { struct pci_dev *pdev = NULL; for_each_pci_dev(pdev) - if (pdev->untrusted) + if (pdev->external_facing) return true; return false; @@ -4757,7 +4757,7 @@ static inline bool has_untrusted_dev(void) static int __init platform_optin_force_iommu(void) { - if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev()) + if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) return 0; if (no_iommu || dmar_disabled) diff --git a/drivers/pci/of.c b/drivers/pci/of.c index 36891e7deee3..f74b082b8e4a 100644 --- a/drivers/pci/of.c +++ b/drivers/pci/of.c @@ -42,7 +42,7 @@ void pci_set_bus_of_node(struct pci_bus *bus) } else { node = of_node_get(bus->self->dev.of_node); if (node && of_property_read_bool(node, "external-facing")) - bus->self->untrusted = true; + bus->self->external_facing = true; } bus->dev.of_node = node; diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index e30c2a78a88f..3a87b5b7f9b0 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -1223,7 +1223,7 @@ static void pci_acpi_optimize_delay(struct pci_dev *pdev, ACPI_FREE(obj); } -static void pci_acpi_set_untrusted(struct pci_dev *dev) +static void pci_acpi_set_external_facing(struct pci_dev *dev) { u8 val; @@ -1234,11 +1234,10 @@ static void pci_acpi_set_untrusted(struct pci_dev *dev) /* * These root ports expose PCIe (including DMA) outside of the - * system so make sure we treat them and everything behind as - * untrusted. + * system. Everything downstream from them is external. 
*/ if (val) - dev->untrusted = 1; + dev->external_facing = 1; } static void pci_acpi_setup(struct device *dev) @@ -1250,7 +1249,7 @@ static void pci_acpi_setup(struct device *dev) return; pci_acpi_optimize_delay(pci_dev, adev->handle); - pci_acpi_set_untrusted(pci_dev); + pci_acpi_set_external_facing(pci_dev); pci_acpi_add_pm_notifier(adev, pci_dev); if (!adev->wakeup.flags.valid) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 89c261c159f1..8891935fcb31 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1509,7 +1509,7 @@ static void set_pcie_untrusted(struct pci_dev *dev) * untrusted as well. */ parent = pci_upstream_bridge(dev); - if (parent && parent->untrusted) + if (parent && (parent->untrusted || parent->external_facing)) dev->untrusted = true; } diff --git a/include/linux/pci.h b/include/linux/pci.h index 0811885de1c9..aa99fb56f7dd 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -415,6 +415,12 @@ struct pci_dev { * mappings to make sure they cannot access arbitrary memory. */ unsigned int untrusted:1; + /* + * Info from the platform, e.g., ACPI or device tree, may mark a + * device as "external-facing". An external-facing device is + * itself internal but devices downstream from it are external. + */ + unsigned int external_facing:1; unsigned int __aer_firmware_first_valid:1; unsigned int __aer_firmware_first:1; unsigned int broken_intx_masking:1; /* INTx masking can't be used */ -- Gitee From 7637d7beae3675c4f29aaf71396b621bdd024df0 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 19 Nov 2021 12:32:22 +0800 Subject: [PATCH 0300/2161] treewide: Remove uninitialized_var() usage commit 3f649ab728cda8038259d8f14492fe400fbab911 upstream. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 6600e91fb143..6cc9cd51a624 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2236,7 +2236,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, unsigned long nr_pages, int prot) { struct dma_pte *first_pte = NULL, *pte = NULL; - phys_addr_t uninitialized_var(pteval); + phys_addr_t pteval; unsigned long sg_res = 0; unsigned int largepage_lvl = 0; unsigned long lvl_pages = 0; -- Gitee From c877f56a9f57392752e662932db1f25e74851ab5 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 24 Jul 2020 09:49:15 +0800 Subject: [PATCH 0301/2161] iommu/vt-d: Remove global page support in devTLB flush commit 78df6c86f0691f5b6e325006aeb470de443351ea upstream. Global pages support is removed from VT-d spec 3.0 for dev TLB invalidation. This patch is to remove the bits for vSVA. Similar change already made for the native SVA. See the link below. 
Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/linux-iommu/20190830142919.GE11578@8bytes.org/T/ Link: https://lore.kernel.org/r/20200724014925.15523-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 4 +--- drivers/iommu/intel/iommu.c | 4 ++-- include/linux/intel-iommu.h | 3 +-- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index c008c4a6eaa5..cfbb8911e8ea 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1455,8 +1455,7 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, /* PASID-based device IOTLB Invalidate */ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, - u32 pasid, u16 qdep, u64 addr, - unsigned int size_order, u64 granu) + u32 pasid, u16 qdep, u64 addr, unsigned int size_order) { unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1); struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; @@ -1464,7 +1463,6 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, desc.qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) | QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid); - desc.qw1 = QI_DEV_EIOTLB_GLOB(granu); /* * If S bit is 0, we only flush a single page. If S bit is set, diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 6cc9cd51a624..df9daee5171f 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5481,7 +5481,7 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, info->pfsid, pasid, info->ats_qdep, inv_info->addr_info.addr, - size, granu); + size); break; case IOMMU_CACHE_INV_TYPE_DEV_IOTLB: if (info->ats_enabled) @@ -5489,7 +5489,7 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, info->pfsid, pasid, info->ats_qdep, inv_info->addr_info.addr, - size, granu); + size); else pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n"); break; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 545427a4fc48..c102c58c6649 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -381,7 +381,6 @@ enum { #define QI_DEV_EIOTLB_ADDR(a) ((u64)(a) & VTD_PAGE_MASK) #define QI_DEV_EIOTLB_SIZE (((u64)1) << 11) -#define QI_DEV_EIOTLB_GLOB(g) ((u64)(g) & 0x1) #define QI_DEV_EIOTLB_PASID(p) ((u64)((p) & 0xfffff) << 32) #define QI_DEV_EIOTLB_SID(sid) ((u64)((sid) & 0xffff) << 16) #define QI_DEV_EIOTLB_QDEP(qd) ((u64)((qd) & 0x1f) << 4) @@ -707,7 +706,7 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, u32 pasid, u16 qdep, u64 addr, - unsigned int size_order, u64 granu); + unsigned int size_order); void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, int pasid); -- Gitee From e51007d7e676fcc00e8907ca88efa032de03c4ee Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 24 Jul 2020 09:49:16 +0800 Subject: [PATCH 0302/2161] iommu/vt-d: Fix PASID devTLB invalidation commit e7e69461a83264dbce2b4ff480f858f3f1454db7 upstream. DevTLB flush can be used for both DMA request with and without PASIDs. The former uses PASID#0 (RID2PASID), latter uses non-zero PASID for SVA usage. This patch adds a check for PASID value such that devTLB flush with PASID is used for SVA case. 
This is more efficient in that multiple PASIDs can be used by a single device, when tearing down a PASID entry we shall flush only the devTLB specific to a PASID. Fixes: 6f7db75e1c46 ("iommu/vt-d: Add second level page table") Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200724014925.15523-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index c81f0f17c6ba..fa0154cce537 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -486,7 +486,16 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu, qdep = info->ats_qdep; pfsid = info->pfsid; - qi_flush_dev_iotlb(iommu, sid, pfsid, qdep, 0, 64 - VTD_PAGE_SHIFT); + /* + * When PASID 0 is used, it indicates RID2PASID(DMA request w/o PASID), + * devTLB flush w/o PASID should be used. For non-zero PASID under + * SVA usage, device could do DMA with multiple PASIDs. It is more + * efficient to flush devTLB specific to the PASID. + */ + if (pasid == PASID_RID2PASID) + qi_flush_dev_iotlb(iommu, sid, pfsid, qdep, 0, 64 - VTD_PAGE_SHIFT); + else + qi_flush_dev_iotlb_pasid(iommu, sid, pfsid, pasid, qdep, 0, 64 - VTD_PAGE_SHIFT); } void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, -- Gitee From ea3d472a993fddae9744fc32fcfd73b48b28da02 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Fri, 24 Jul 2020 09:49:17 +0800 Subject: [PATCH 0303/2161] iommu/vt-d: Handle non-page aligned address commit 288d08e78008828416ffaa85ef274b4e29ef3dae upstream. Address information for device TLB invalidation comes from userspace when device is directly assigned to a guest with vIOMMU support. VT-d requires page aligned address. This patch checks and enforce address to be page aligned, otherwise reserved bits can be set in the invalidation descriptor. Unrecoverable fault will be reported due to non-zero value in the reserved bits. Fixes: 61a06a16e36d8 ("iommu/vt-d: Support flushing more translation cache types") Signed-off-by: Liu Yi L Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200724014925.15523-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index cfbb8911e8ea..503e95e5cc11 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1473,9 +1473,26 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, * Max Invs Pending (MIP) is set to 0 for now until we have DIT in * ECAP. */ - desc.qw1 |= addr & ~mask; - if (size_order) + if (addr & GENMASK_ULL(size_order + VTD_PAGE_SHIFT, 0)) + pr_warn_ratelimited("Invalidate non-aligned address %llx, order %d\n", + addr, size_order); + + /* Take page address */ + desc.qw1 = QI_DEV_EIOTLB_ADDR(addr); + + if (size_order) { + /* + * Existing 0s in address below size_order may be the least + * significant bit, we must set them to 1s to avoid having + * smaller size than desired. 
+ */ + desc.qw1 |= GENMASK_ULL(size_order + VTD_PAGE_SHIFT - 1, + VTD_PAGE_SHIFT); + /* Clear size_order bit to indicate size */ + desc.qw1 &= ~mask; + /* Set the S bit to indicate flushing more than 1 page */ desc.qw1 |= QI_DEV_EIOTLB_SIZE; + } qi_submit_sync(iommu, &desc, 1, 0); } -- Gitee From 41641b7cda34c38676c7ef53a2e29ae979327ca4 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Fri, 24 Jul 2020 09:49:18 +0800 Subject: [PATCH 0304/2161] iommu/vt-d: Fix devTLB flush for vSVA commit 0fa1a15fa9b37be3899d7fec552c77b67baa8ac7 upstream. For guest SVA usage, in order to optimize for less VMEXIT, guest request of IOTLB flush also includes device TLB. On the host side, IOMMU driver performs IOTLB and implicit devTLB invalidation. When PASID-selective granularity is requested by the guest we need to derive the equivalent address range for devTLB instead of using the address information in the UAPI data. The reason for that is, unlike IOTLB flush, devTLB flush does not support PASID-selective granularity. This is to say, we need to set the following in the PASID based devTLB invalidation descriptor: - entire 64 bit range in address ~(0x1 << 63) - S bit = 1 (VT-d CH 6.5.2.6). Without this fix, device TLB flush range is not set properly for PASID selective granularity. This patch also merged devTLB flush code for both implicit and explicit cases. Fixes: 6ee1b77ba3ac ("iommu/vt-d: Add svm/sva invalidate function") Signed-off-by: Liu Yi L Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200724014925.15523-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index df9daee5171f..c5b5618276d1 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5423,7 +5423,7 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, sid = PCI_DEVID(bus, devfn); /* Size is only valid in address selective invalidation */ - if (inv_info->granularity != IOMMU_INV_GRANU_PASID) + if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) size = to_vtd_size(inv_info->addr_info.granule_size, inv_info->addr_info.nb_granules); @@ -5432,6 +5432,7 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, IOMMU_CACHE_INV_TYPE_NR) { int granu = 0; u64 pasid = 0; + u64 addr = 0; granu = to_vtd_granularity(cache_type, inv_info->granularity); if (granu == -EINVAL) { @@ -5471,24 +5472,34 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size, inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); + if (!info->ats_enabled) + break; /* * Always flush device IOTLB if ATS is enabled. vIOMMU * in the guest may assume IOTLB flush is inclusive, * which is more efficient. */ - if (info->ats_enabled) - qi_flush_dev_iotlb_pasid(iommu, sid, - info->pfsid, pasid, - info->ats_qdep, - inv_info->addr_info.addr, - size); - break; + fallthrough; case IOMMU_CACHE_INV_TYPE_DEV_IOTLB: + /* + * PASID based device TLB invalidation does not support + * IOMMU_INV_GRANU_PASID granularity but only supports + * IOMMU_INV_GRANU_ADDR. + * The equivalent of that is we set the size to be the + * entire range of 64 bit. User only provides PASID info + * without address info. So we set addr to 0. 
+ */ + if (inv_info->granularity == IOMMU_INV_GRANU_PASID) { + size = 64 - VTD_PAGE_SHIFT; + addr = 0; + } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) { + addr = inv_info->addr_info.addr; + } + if (info->ats_enabled) qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid, pasid, - info->ats_qdep, - inv_info->addr_info.addr, + info->ats_qdep, addr, size); else pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n"); -- Gitee From ae24ebfad65259ce0025e8f1791d2b18490937e7 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 24 Jul 2020 09:49:19 +0800 Subject: [PATCH 0305/2161] iommu/vt-d: Warn on out-of-range invalidation address commit 1ff00279655d95ae9c285c39878aedf9ff008d25 upstream. For guest requested IOTLB invalidation, address and mask are provided as part of the invalidation data. VT-d HW silently ignores any address bits below the mask. SW shall also allow such case but give warning if address does not align with the mask. This patch relax the fault handling from error to warning and proceed with invalidation request with the given mask. Fixes: 6ee1b77ba3ac0 ("iommu/vt-d: Add svm/sva invalidate function") Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200724014925.15523-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index c5b5618276d1..732f599504c5 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5454,13 +5454,12 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, switch (BIT(cache_type)) { case IOMMU_CACHE_INV_TYPE_IOTLB: + /* HW will ignore LSB bits based on address mask */ if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && size && (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { - pr_err_ratelimited("Address out of range, 0x%llx, size order %llu\n", + pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n", inv_info->addr_info.addr, size); - ret = -ERANGE; - goto out_unlock; } /* -- Gitee From c0ff9372568d0f97fdd7cbd204183ed5f8bcc055 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 24 Jul 2020 09:49:20 +0800 Subject: [PATCH 0306/2161] iommu/vt-d: Disable multiple GPASID-dev bind commit d315e9e684d1efd4cb2e8cd70b8d71dec02fcf1f upstream. For the unlikely use case where multiple aux domains from the same pdev are attached to a single guest and then bound to a single process (thus same PASID) within that guest, we cannot easily support this case by refcounting the number of users. As there is only one SL page table per PASID while we have multiple aux domains thus multiple SL page tables for the same PASID. Extra unbinding guest PASID can happen due to race between normal and exception cases. Termination of one aux domain may affect others unless we actively track and switch aux domains to ensure the validity of SL page tables and TLB states in the shared PASID entry. Support for sharing second level PGDs across domains can reduce the complexity but this is not available due to the limitations on VFIO container architecture. We can revisit this decision once sharing PGDs are available. Overall, the complexity and potential glitch do not warrant this unlikely use case thereby removed by this patch. 
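As a caller-facing illustration, a minimal sketch assuming the generic iommu_sva_bind_gpasid() wrapper present in this tree; the function and variable names below are placeholders, not part of this patch. Binding the same device/PASID pair twice now fails instead of taking a reference:

#include <linux/iommu.h>

/* Sketch only: a second bind of the same device/PASID pair is rejected
 * with -EBUSY instead of bumping a reference count. */
static int example_double_bind(struct iommu_domain *domain, struct device *dev,
                               struct iommu_gpasid_bind_data *data)
{
        int ret;

        ret = iommu_sva_bind_gpasid(domain, dev, data); /* first bind: 0 */
        if (ret)
                return ret;

        ret = iommu_sva_bind_gpasid(domain, dev, data); /* second bind */
        WARN_ON(ret != -EBUSY);
        return ret;
}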
Fixes: 56722a4398a30 ("iommu/vt-d: Add bind guest PASID support") Signed-off-by: Liu Yi L Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Cc: Kevin Tian Link: https://lore.kernel.org/r/20200724014925.15523-8-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index b444f45a6b17..b291e9b63949 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -295,20 +295,16 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, goto out; } + /* + * Do not allow multiple bindings of the same device-PASID since + * there is only one SL page tables per PASID. We may revisit + * once sharing PGD across domains are supported. + */ for_each_svm_dev(sdev, svm, dev) { - /* - * For devices with aux domains, we should allow - * multiple bind calls with the same PASID and pdev. - */ - if (iommu_dev_feature_enabled(dev, - IOMMU_DEV_FEAT_AUX)) { - sdev->users++; - } else { - dev_warn_ratelimited(dev, - "Already bound with PASID %u\n", - svm->pasid); - ret = -EBUSY; - } + dev_warn_ratelimited(dev, + "Already bound with PASID %u\n", + svm->pasid); + ret = -EBUSY; goto out; } } else { -- Gitee From 8ec038ac7aee7e4af73760ec4694451d3ac600fa Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 24 Jul 2020 09:49:21 +0800 Subject: [PATCH 0307/2161] iommu/vt-d: Refactor device_to_iommu() helper commit dd6692f1b883bac46036000a1e3a0b3785f89e87 upstream. It is refactored in two ways: - Make it global so that it could be used in other files. - Make bus/devfn optional so that callers could ignore these two returned values when they only want to get the coresponding iommu pointer. 
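A short usage sketch of the two calling styles this refactor enables, modeled on the call sites converted below; the wrapper function and its return convention are illustrative only:

#include <linux/pci.h>
#include <linux/intel-iommu.h>

/* Sketch: callers that only need the IOMMU pointer pass NULL for the
 * out-parameters; callers that also need the source-id keep them. */
static u16 example_lookup_sid(struct device *dev)
{
        struct intel_iommu *iommu;
        u8 bus, devfn;

        iommu = device_to_iommu(dev, NULL, NULL);       /* pointer only */
        if (!iommu)
                return 0;

        iommu = device_to_iommu(dev, &bus, &devfn);     /* pointer + source-id */
        if (!iommu)
                return 0;

        return PCI_DEVID(bus, devfn);
}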
Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200724014925.15523-9-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 55 +++++++++++-------------------------- drivers/iommu/intel/svm.c | 8 +++--- include/linux/intel-iommu.h | 3 +- 3 files changed, 21 insertions(+), 45 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 732f599504c5..25a1c564ace7 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -778,16 +778,16 @@ is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) return false; } -static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) +struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) { struct dmar_drhd_unit *drhd = NULL; + struct pci_dev *pdev = NULL; struct intel_iommu *iommu; struct device *tmp; - struct pci_dev *pdev = NULL; u16 segment = 0; int i; - if (iommu_dummy(dev)) + if (!dev || iommu_dummy(dev)) return NULL; if (dev_is_pci(dev)) { @@ -818,8 +818,10 @@ static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devf if (pdev && pdev->is_virtfn) goto got_pdev; - *bus = drhd->devices[i].bus; - *devfn = drhd->devices[i].devfn; + if (bus && devfn) { + *bus = drhd->devices[i].bus; + *devfn = drhd->devices[i].devfn; + } goto out; } @@ -829,8 +831,10 @@ static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devf if (pdev && drhd->include_all) { got_pdev: - *bus = pdev->bus->number; - *devfn = pdev->devfn; + if (bus && devfn) { + *bus = pdev->bus->number; + *devfn = pdev->devfn; + } goto out; } } @@ -5153,11 +5157,10 @@ static int aux_domain_add_dev(struct dmar_domain *domain, struct device *dev) { int ret; - u8 bus, devfn; unsigned long flags; struct intel_iommu *iommu; - iommu = device_to_iommu(dev, &bus, &devfn); + iommu = device_to_iommu(dev, NULL, NULL); if (!iommu) return -ENODEV; @@ -5243,9 +5246,8 @@ static int prepare_domain_attach_device(struct iommu_domain *domain, struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu; int addr_width; - u8 bus, devfn; - iommu = device_to_iommu(dev, &bus, &devfn); + iommu = device_to_iommu(dev, NULL, NULL); if (!iommu) return -ENODEV; @@ -5675,9 +5677,8 @@ static bool intel_iommu_capable(enum iommu_cap cap) static struct iommu_device *intel_iommu_probe_device(struct device *dev) { struct intel_iommu *iommu; - u8 bus, devfn; - iommu = device_to_iommu(dev, &bus, &devfn); + iommu = device_to_iommu(dev, NULL, NULL); if (!iommu) return ERR_PTR(-ENODEV); @@ -5690,9 +5691,8 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) static void intel_iommu_release_device(struct device *dev) { struct intel_iommu *iommu; - u8 bus, devfn; - iommu = device_to_iommu(dev, &bus, &devfn); + iommu = device_to_iommu(dev, NULL, NULL); if (!iommu) return; @@ -5842,37 +5842,14 @@ static struct iommu_group *intel_iommu_device_group(struct device *dev) return generic_device_group(dev); } -#ifdef CONFIG_INTEL_IOMMU_SVM -struct intel_iommu *intel_svm_device_to_iommu(struct device *dev) -{ - struct intel_iommu *iommu; - u8 bus, devfn; - - if (iommu_dummy(dev)) { - dev_warn(dev, - "No IOMMU translation for device; cannot enable SVM\n"); - return NULL; - } - - iommu = device_to_iommu(dev, &bus, &devfn); - if ((!iommu)) { - dev_err(dev, "No IOMMU for device; cannot enable SVM\n"); - return NULL; - } - - return iommu; -} -#endif /* 
CONFIG_INTEL_IOMMU_SVM */ - static int intel_iommu_enable_auxd(struct device *dev) { struct device_domain_info *info; struct intel_iommu *iommu; unsigned long flags; - u8 bus, devfn; int ret; - iommu = device_to_iommu(dev, &bus, &devfn); + iommu = device_to_iommu(dev, NULL, NULL); if (!iommu || dmar_disabled) return -EINVAL; diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index b291e9b63949..0a06955e8580 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -249,7 +249,7 @@ static LIST_HEAD(global_svm_list); int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, struct iommu_gpasid_bind_data *data) { - struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); + struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); struct dmar_domain *dmar_domain; struct intel_svm_dev *sdev; struct intel_svm *svm; @@ -387,7 +387,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, int intel_svm_unbind_gpasid(struct device *dev, int pasid) { - struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); + struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); struct intel_svm_dev *sdev; struct intel_svm *svm; int ret = -EINVAL; @@ -444,7 +444,7 @@ static int intel_svm_bind_mm(struct device *dev, int flags, struct svm_dev_ops *ops, struct mm_struct *mm, struct intel_svm_dev **sd) { - struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); + struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); struct device_domain_info *info; struct intel_svm_dev *sdev; struct intel_svm *svm = NULL; @@ -622,7 +622,7 @@ static int intel_svm_unbind_mm(struct device *dev, int pasid) struct intel_svm *svm; int ret = -EINVAL; - iommu = intel_svm_device_to_iommu(dev); + iommu = device_to_iommu(dev, NULL, NULL); if (!iommu) goto out; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index c102c58c6649..59394cd63f4a 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -729,6 +729,7 @@ void iommu_flush_write_buffer(struct intel_iommu *iommu); int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev); struct dmar_domain *find_domain(struct device *dev); struct device_domain_info *get_domain_info(struct device *dev); +struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); #ifdef CONFIG_INTEL_IOMMU_SVM extern void intel_svm_check(struct intel_iommu *iommu); @@ -767,8 +768,6 @@ struct intel_svm { struct list_head devs; struct list_head list; }; - -extern struct intel_iommu *intel_svm_device_to_iommu(struct device *dev); #else static inline void intel_svm_check(struct intel_iommu *iommu) {} #endif -- Gitee From 0167dff406cfea8a7737c98b29f058fd27ff68df Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 24 Jul 2020 09:49:22 +0800 Subject: [PATCH 0308/2161] iommu/vt-d: Add a helper to get svm and sdev for pasid commit 19abcf70c2b16834a607db515b26a32b233c79eb upstream. There are several places in the code that need to get the pointers of svm and sdev according to a pasid and device. Add a helper to achieve this for code consolidation and readability. 
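A sketch of the calling convention the helper establishes, modeled on the converted call sites below; the wrapper function is illustrative, and pasid_mutex must already be held, exactly as the real callers do:

/* Sketch of a converted call site (it would live in svm.c next to the
 * helper, since pasid_to_svm_sdev() is static); pasid_mutex protects
 * the svm device lists. */
static int example_find_binding(struct device *dev, unsigned int pasid)
{
        struct intel_svm_dev *sdev = NULL;
        struct intel_svm *svm = NULL;
        int ret;

        mutex_lock(&pasid_mutex);
        ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
        if (ret)                /* invalid PASID or stale ioasid entry */
                goto out;
        if (!svm || !sdev)      /* PASID unbound, or bound to another device */
                ret = -ENODEV;
        /* else: svm and sdev stay valid until pasid_mutex is released */
out:
        mutex_unlock(&pasid_mutex);
        return ret;
}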
Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200724014925.15523-10-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 115 +++++++++++++++++++++----------------- 1 file changed, 65 insertions(+), 50 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 0a06955e8580..a254afd1799a 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -246,13 +246,57 @@ static LIST_HEAD(global_svm_list); list_for_each_entry((sdev), &(svm)->devs, list) \ if ((d) != (sdev)->dev) {} else +static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid, + struct intel_svm **rsvm, + struct intel_svm_dev **rsdev) +{ + struct intel_svm_dev *d, *sdev = NULL; + struct intel_svm *svm; + + /* The caller should hold the pasid_mutex lock */ + if (WARN_ON(!mutex_is_locked(&pasid_mutex))) + return -EINVAL; + + if (pasid == INVALID_IOASID || pasid >= PASID_MAX) + return -EINVAL; + + svm = ioasid_find(NULL, pasid, NULL); + if (IS_ERR(svm)) + return PTR_ERR(svm); + + if (!svm) + goto out; + + /* + * If we found svm for the PASID, there must be at least one device + * bond. + */ + if (WARN_ON(list_empty(&svm->devs))) + return -EINVAL; + + rcu_read_lock(); + list_for_each_entry_rcu(d, &svm->devs, list) { + if (d->dev == dev) { + sdev = d; + break; + } + } + rcu_read_unlock(); + +out: + *rsvm = svm; + *rsdev = sdev; + + return 0; +} + int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, struct iommu_gpasid_bind_data *data) { struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); + struct intel_svm_dev *sdev = NULL; struct dmar_domain *dmar_domain; - struct intel_svm_dev *sdev; - struct intel_svm *svm; + struct intel_svm *svm = NULL; int ret = 0; if (WARN_ON(!iommu) || !data) @@ -279,35 +323,23 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, dmar_domain = to_dmar_domain(domain); mutex_lock(&pasid_mutex); - svm = ioasid_find(NULL, data->hpasid, NULL); - if (IS_ERR(svm)) { - ret = PTR_ERR(svm); + ret = pasid_to_svm_sdev(dev, data->hpasid, &svm, &sdev); + if (ret) goto out; - } - - if (svm) { - /* - * If we found svm for the PASID, there must be at - * least one device bond, otherwise svm should be freed. - */ - if (WARN_ON(list_empty(&svm->devs))) { - ret = -EINVAL; - goto out; - } + if (sdev) { /* * Do not allow multiple bindings of the same device-PASID since * there is only one SL page tables per PASID. We may revisit * once sharing PGD across domains are supported. */ - for_each_svm_dev(sdev, svm, dev) { - dev_warn_ratelimited(dev, - "Already bound with PASID %u\n", - svm->pasid); - ret = -EBUSY; - goto out; - } - } else { + dev_warn_ratelimited(dev, "Already bound with PASID %u\n", + svm->pasid); + ret = -EBUSY; + goto out; + } + + if (!svm) { /* We come here when PASID has never been bond to a device. 
*/ svm = kzalloc(sizeof(*svm), GFP_KERNEL); if (!svm) { @@ -390,25 +422,17 @@ int intel_svm_unbind_gpasid(struct device *dev, int pasid) struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); struct intel_svm_dev *sdev; struct intel_svm *svm; - int ret = -EINVAL; + int ret; if (WARN_ON(!iommu)) return -EINVAL; mutex_lock(&pasid_mutex); - svm = ioasid_find(NULL, pasid, NULL); - if (!svm) { - ret = -EINVAL; - goto out; - } - - if (IS_ERR(svm)) { - ret = PTR_ERR(svm); + ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev); + if (ret) goto out; - } - for_each_svm_dev(sdev, svm, dev) { - ret = 0; + if (sdev) { if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)) sdev->users--; if (!sdev->users) { @@ -432,7 +456,6 @@ int intel_svm_unbind_gpasid(struct device *dev, int pasid) kfree(svm); } } - break; } out: mutex_unlock(&pasid_mutex); @@ -610,7 +633,7 @@ intel_svm_bind_mm(struct device *dev, int flags, struct svm_dev_ops *ops, if (sd) *sd = sdev; ret = 0; - out: +out: return ret; } @@ -626,17 +649,11 @@ static int intel_svm_unbind_mm(struct device *dev, int pasid) if (!iommu) goto out; - svm = ioasid_find(NULL, pasid, NULL); - if (!svm) - goto out; - - if (IS_ERR(svm)) { - ret = PTR_ERR(svm); + ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev); + if (ret) goto out; - } - for_each_svm_dev(sdev, svm, dev) { - ret = 0; + if (sdev) { sdev->users--; if (!sdev->users) { list_del_rcu(&sdev->list); @@ -665,10 +682,8 @@ static int intel_svm_unbind_mm(struct device *dev, int pasid) kfree(svm); } } - break; } - out: - +out: return ret; } -- Gitee From c52f9063d62e258a91fea83f388d0b806e4e91bf Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 24 Jul 2020 09:49:23 +0800 Subject: [PATCH 0309/2161] iommu/vt-d: Report page request faults for guest SVA commit eb8d93ea3c1d35b0baf693dd0b7c87ec62358fc9 upstream. A pasid might be bound to a page table from a VM guest via the iommu ops.sva_bind_gpasid. In this case, when a DMA page fault is detected on the physical IOMMU, we need to inject the page fault request into the guest. After the guest completes handling the page fault, a page response need to be sent back via the iommu ops.page_response(). This adds support to report a page request fault. Any external module which is interested in handling this fault should regiester a notifier with iommu_register_device_fault_handler(). 
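A minimal consumer-side sketch of that registration, assuming the generic fault reporting API already in this tree (iommu_register_device_fault_handler() and struct iommu_fault); the handler and context names are placeholders, not part of this patch:

#include <linux/iommu.h>

/* Placeholder handler: a real consumer (e.g. a VFIO backend) would queue
 * the request for the guest and answer later through the page_response op. */
static int example_prq_handler(struct iommu_fault *fault, void *data)
{
        if (fault->type != IOMMU_FAULT_PAGE_REQ)
                return -EOPNOTSUPP;

        /* fault->prm carries the pasid, addr, grpid, perm and flag bits
         * that intel_svm_prq_report() below fills in. */
        return 0;
}

static int example_register(struct device *dev, void *ctx)
{
        return iommu_register_device_fault_handler(dev, example_prq_handler, ctx);
}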
Co-developed-by: Jacob Pan Co-developed-by: Liu Yi L Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200724014925.15523-11-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 103 +++++++++++++++++++++++++++++++------- 1 file changed, 85 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index a254afd1799a..009188f3a5f8 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -829,8 +829,63 @@ static void intel_svm_drain_prq(struct device *dev, int pasid) } } +static int prq_to_iommu_prot(struct page_req_dsc *req) +{ + int prot = 0; + + if (req->rd_req) + prot |= IOMMU_FAULT_PERM_READ; + if (req->wr_req) + prot |= IOMMU_FAULT_PERM_WRITE; + if (req->exe_req) + prot |= IOMMU_FAULT_PERM_EXEC; + if (req->pm_req) + prot |= IOMMU_FAULT_PERM_PRIV; + + return prot; +} + +static int +intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc) +{ + struct iommu_fault_event event; + + if (!dev || !dev_is_pci(dev)) + return -ENODEV; + + /* Fill in event data for device specific processing */ + memset(&event, 0, sizeof(struct iommu_fault_event)); + event.fault.type = IOMMU_FAULT_PAGE_REQ; + event.fault.prm.addr = desc->addr; + event.fault.prm.pasid = desc->pasid; + event.fault.prm.grpid = desc->prg_index; + event.fault.prm.perm = prq_to_iommu_prot(desc); + + if (desc->lpig) + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; + if (desc->pasid_present) { + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; + } + if (desc->priv_data_present) { + /* + * Set last page in group bit if private data is present, + * page response is required as it does for LPIG. + * iommu_report_device_fault() doesn't understand this vendor + * specific requirement thus we set last_page as a workaround. + */ + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA; + memcpy(event.fault.prm.private_data, desc->priv_data, + sizeof(desc->priv_data)); + } + + return iommu_report_device_fault(dev, &event); +} + static irqreturn_t prq_event_thread(int irq, void *d) { + struct intel_svm_dev *sdev = NULL; struct intel_iommu *iommu = d; struct intel_svm *svm = NULL; int head, tail, handled = 0; @@ -842,7 +897,6 @@ static irqreturn_t prq_event_thread(int irq, void *d) tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; while (head != tail) { - struct intel_svm_dev *sdev; struct vm_area_struct *vma; struct page_req_dsc *req; struct qi_desc resp; @@ -878,6 +932,20 @@ static irqreturn_t prq_event_thread(int irq, void *d) } } + if (!sdev || sdev->sid != req->rid) { + struct intel_svm_dev *t; + + sdev = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(t, &svm->devs, list) { + if (t->sid == req->rid) { + sdev = t; + break; + } + } + rcu_read_unlock(); + } + result = QI_RESP_INVALID; /* Since we're using init_mm.pgd directly, we should never take * any faults on kernel addresses. */ @@ -888,6 +956,17 @@ static irqreturn_t prq_event_thread(int irq, void *d) if (!is_canonical_address(address)) goto bad_req; + /* + * If prq is to be handled outside iommu driver via receiver of + * the fault notifiers, we skip the page response here. 
+ */ + if (svm->flags & SVM_FLAG_GUEST_MODE) { + if (sdev && !intel_svm_prq_report(sdev->dev, req)) + goto prq_advance; + else + goto bad_req; + } + /* If the mm is already defunct, don't handle faults. */ if (!mmget_not_zero(svm->mm)) goto bad_req; @@ -906,24 +985,11 @@ static irqreturn_t prq_event_thread(int irq, void *d) goto invalid; result = QI_RESP_SUCCESS; - invalid: +invalid: up_read(&svm->mm->mmap_sem); mmput(svm->mm); - bad_req: - /* Accounting for major/minor faults? */ - rcu_read_lock(); - list_for_each_entry_rcu(sdev, &svm->devs, list) { - if (sdev->sid == req->rid) - break; - } - /* Other devices can go away, but the drivers are not permitted - * to unbind while any page faults might be in flight. So it's - * OK to drop the 'lock' here now we have it. */ - rcu_read_unlock(); - - if (WARN_ON(&sdev->list == &svm->devs)) - sdev = NULL; - +bad_req: + WARN_ON(!sdev); if (sdev && sdev->ops && sdev->ops->fault_cb) { int rwxp = (req->rd_req << 3) | (req->wr_req << 2) | (req->exe_req << 1) | (req->pm_req); @@ -934,7 +1000,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) and these can be NULL. Do not use them below this point! */ sdev = NULL; svm = NULL; - no_pasid: +no_pasid: if (req->lpig || req->priv_data_present) { /* * Per VT-d spec. v3.0 ch7.7, system software must @@ -959,6 +1025,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) resp.qw3 = 0; qi_submit_sync(iommu, &resp, 1, 0); } +prq_advance: head = (head + sizeof(*req)) & PRQ_RING_MASK; } -- Gitee From 0746b9afb628307c70d4b9b45e66d8c53d50f205 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 24 Jul 2020 09:49:24 +0800 Subject: [PATCH 0310/2161] iommu/vt-d: Add page response ops support commit 8b73712115ebd603b75876f7d96d59e41d9107ad upstream. After page requests are handled, software must respond to the device which raised the page request with the result. This is done through the iommu ops.page_response if the request was reported to outside of vendor iommu driver through iommu_report_device_fault(). This adds the VT-d implementation of page_response ops. 
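For reference, a hedged sketch of the consumer side of this path: the generic iommu_page_response() entry point is what lands in the intel_svm_page_response() implementation added below. Field names follow the uapi struct iommu_page_response of this era, and the wrapper function is illustrative only:

#include <linux/iommu.h>

/* Sketch: complete a previously reported page request 'evt' with success. */
static int example_complete_prq(struct device *dev, struct iommu_fault_event *evt)
{
        struct iommu_page_response resp = {
                .version = IOMMU_PAGE_RESP_VERSION_1,
                .flags   = IOMMU_PAGE_RESP_PASID_VALID,
                .pasid   = evt->fault.prm.pasid,
                .grpid   = evt->fault.prm.grpid,
                .code    = IOMMU_PAGE_RESP_SUCCESS, /* or IOMMU_PAGE_RESP_INVALID */
        };

        return iommu_page_response(dev, &resp);
}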
Co-developed-by: Jacob Pan Co-developed-by: Liu Yi L Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200724014925.15523-12-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 1 + drivers/iommu/intel/svm.c | 99 +++++++++++++++++++++++++++++++++++++ include/linux/intel-iommu.h | 3 ++ 3 files changed, 103 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 25a1c564ace7..1592a19b7a4c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -6074,6 +6074,7 @@ const struct iommu_ops intel_iommu_ops = { .sva_bind = intel_svm_bind, .sva_unbind = intel_svm_unbind, .sva_get_pasid = intel_svm_get_pasid, + .page_response = intel_svm_page_response, #endif }; diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 009188f3a5f8..7b9588169265 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -1096,3 +1096,102 @@ int intel_svm_get_pasid(struct iommu_sva *sva) return pasid; } + +int intel_svm_page_response(struct device *dev, + struct iommu_fault_event *evt, + struct iommu_page_response *msg) +{ + struct iommu_fault_page_request *prm; + struct intel_svm_dev *sdev = NULL; + struct intel_svm *svm = NULL; + struct intel_iommu *iommu; + bool private_present; + bool pasid_present; + bool last_page; + u8 bus, devfn; + int ret = 0; + u16 sid; + + if (!dev || !dev_is_pci(dev)) + return -ENODEV; + + iommu = device_to_iommu(dev, &bus, &devfn); + if (!iommu) + return -ENODEV; + + if (!msg || !evt) + return -EINVAL; + + mutex_lock(&pasid_mutex); + + prm = &evt->fault.prm; + sid = PCI_DEVID(bus, devfn); + pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA; + last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; + + if (!pasid_present) { + ret = -EINVAL; + goto out; + } + + if (prm->pasid == 0 || prm->pasid >= PASID_MAX) { + ret = -EINVAL; + goto out; + } + + ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev); + if (ret || !sdev) { + ret = -ENODEV; + goto out; + } + + /* + * For responses from userspace, need to make sure that the + * pasid has been bound to its mm. + */ + if (svm->flags & SVM_FLAG_GUEST_MODE) { + struct mm_struct *mm; + + mm = get_task_mm(current); + if (!mm) { + ret = -EINVAL; + goto out; + } + + if (mm != svm->mm) { + ret = -ENODEV; + mmput(mm); + goto out; + } + + mmput(mm); + } + + /* + * Per VT-d spec. v3.0 ch7.7, system software must respond + * with page group response if private data is present (PDP) + * or last page in group (LPIG) bit is set. This is an + * additional VT-d requirement beyond PCI ATS spec. 
+ */ + if (last_page || private_present) { + struct qi_desc desc; + + desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | + QI_PGRP_PASID_P(pasid_present) | + QI_PGRP_PDP(private_present) | + QI_PGRP_RESP_CODE(msg->code) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); + desc.qw2 = 0; + desc.qw3 = 0; + if (private_present) + memcpy(&desc.qw2, prm->private_data, + sizeof(prm->private_data)); + + qi_submit_sync(iommu, &desc, 1, 0); + } +out: + mutex_unlock(&pasid_mutex); + return ret; +} diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 59394cd63f4a..bdf80d7a70fb 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -742,6 +742,9 @@ struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata); void intel_svm_unbind(struct iommu_sva *handle); int intel_svm_get_pasid(struct iommu_sva *handle); +int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, + struct iommu_page_response *msg); + struct svm_dev_ops; struct intel_svm_dev { -- Gitee From 6af21939f58197f1b205df7d68c45b72af4bbdc5 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 24 Jul 2020 09:49:25 +0800 Subject: [PATCH 0311/2161] iommu/vt-d: Rename intel-pasid.h to pasid.h commit 02f3effddfd04f3f08a24d23a82d1c1c6d89b777 upstream. As Intel VT-d files have been moved to its own subdirectory, the prefix makes no sense. No functional changes. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200724014925.15523-13-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/debugfs.c | 2 +- drivers/iommu/intel/iommu.c | 2 +- drivers/iommu/intel/pasid.c | 2 +- drivers/iommu/intel/{intel-pasid.h => pasid.h} | 2 +- drivers/iommu/intel/svm.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) rename drivers/iommu/intel/{intel-pasid.h => pasid.h} (98%) diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index cf1ebb98e418..efea7f02abd9 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -15,7 +15,7 @@ #include -#include "intel-pasid.h" +#include "pasid.h" struct tbl_walk { u16 bus; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 1592a19b7a4c..6ffcf512dea3 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -48,7 +48,7 @@ #include #include "../irq_remapping.h" -#include "intel-pasid.h" +#include "pasid.h" #define ROOT_SIZE VTD_PAGE_SIZE #define CONTEXT_SIZE VTD_PAGE_SIZE diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index fa0154cce537..e6faedf42fd4 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -19,7 +19,7 @@ #include #include -#include "intel-pasid.h" +#include "pasid.h" /* * Intel IOMMU system wide PASID name space: diff --git a/drivers/iommu/intel/intel-pasid.h b/drivers/iommu/intel/pasid.h similarity index 98% rename from drivers/iommu/intel/intel-pasid.h rename to drivers/iommu/intel/pasid.h index c5318d40e0fa..c9850766c3a9 100644 --- a/drivers/iommu/intel/intel-pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * intel-pasid.h - PASID idr, table and entry header + * pasid.h - PASID idr, table and entry header * * Copyright (C) 2018 Intel Corporation * diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 7b9588169265..45c661453639 100644 --- a/drivers/iommu/intel/svm.c +++ 
b/drivers/iommu/intel/svm.c @@ -20,7 +20,7 @@ #include #include -#include "intel-pasid.h" +#include "pasid.h" static irqreturn_t prq_event_thread(int irq, void *d); static void intel_svm_drain_prq(struct device *dev, int pasid); -- Gitee From b060e7f0b9ba928543d3320a867ab1c63164a81a Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 19 Nov 2021 17:13:54 +0800 Subject: [PATCH 0312/2161] PCI/ATS: Add pci_pri_supported() to check device or associated PF commit 3f9a7a13fe4cb6e119e4e4745fbf975d30bfac9b upstream. For SR-IOV, the PF PRI is shared between the PF and any associated VFs, and the PRI Capability is allowed for PFs but not for VFs. Searching for the PRI Capability on a VF always fails, even if its associated PF supports PRI. Add pci_pri_supported() to check whether device or its associated PF supports PRI. [bhelgaas: commit log, avoid "!!"] Fixes: b16d0cb9e2fc ("iommu/vt-d: Always enable PASID/PRI PCI capabilities before ATS") Link: https://lore.kernel.org/r/1595543849-19692-1-git-send-email-ashok.raj@intel.com Signed-off-by: Ashok Raj Signed-off-by: Bjorn Helgaas Reviewed-by: Lu Baolu Acked-by: Joerg Roedel Cc: stable@vger.kernel.org # v4.4+ Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 29 ++++++++++++++++++++++++++++- drivers/pci/ats.c | 15 +++++++++++++++ include/linux/dmar.h | 1 + include/linux/pci-ats.h | 5 +++-- 4 files changed, 47 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 6ffcf512dea3..7e57122dc1a5 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -356,6 +356,7 @@ static int intel_iommu_strict; static int intel_iommu_superpage = 1; static int iommu_identity_mapping; static int intel_no_bounce; +static int iommu_skip_te_disable; #define IDENTMAP_GFX 2 #define IDENTMAP_AZALIA 4 @@ -1633,6 +1634,10 @@ static void iommu_disable_translation(struct intel_iommu *iommu) u32 sts; unsigned long flag; + if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && + (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) + return; + raw_spin_lock_irqsave(&iommu->register_lock, flag); iommu->gcmd &= ~DMA_GCMD_TE; writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); @@ -2564,7 +2569,7 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, } if (info->ats_supported && ecap_prs(iommu->ecap) && - pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI)) + pci_pri_supported(pdev)) info->pri_supported = 1; } } @@ -4048,6 +4053,7 @@ static void __init init_no_remapping_devices(void) /* This IOMMU has *only* gfx devices. 
Either bypass it or set the gfx_mapped flag, as appropriate */ + drhd->gfx_dedicated = 1; if (!dmar_map_gfx) { drhd->ignored = 1; for_each_active_dev_scope(drhd->devices, @@ -6177,6 +6183,27 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_g DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); +static void quirk_igfx_skip_te_disable(struct pci_dev *dev) +{ + unsigned short ver; + + if (!IS_GFX_DEVICE(dev)) + return; + + ver = (dev->device >> 8) & 0xff; + if (ver != 0x45 && ver != 0x46 && ver != 0x4c && + ver != 0x4e && ver != 0x8a && ver != 0x98 && + ver != 0x9a) + return; + + if (risky_device(dev)) + return; + + pci_info(dev, "Skip IOMMU disabling for graphics\n"); + iommu_skip_te_disable = 1; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); + /* On Tylersburg chipsets, some BIOSes have been known to enable the ISOCH DMAR unit for the Azalia sound device, but not give it any TLB entries, which causes it to deadlock. Check for that. We do diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 20e59b971bc7..9191d223e311 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -296,6 +296,21 @@ int pci_reset_pri(struct pci_dev *pdev) return 0; } EXPORT_SYMBOL_GPL(pci_reset_pri); + +/** + * pci_pri_supported - Check if PRI is supported. + * @pdev: PCI device structure + * + * Returns true if PRI capability is present, false otherwise. + */ +bool pci_pri_supported(struct pci_dev *pdev) +{ + /* VFs share the PF PRI */ + if (pci_physfn(pdev)->pri_cap) + return true; + return false; +} +EXPORT_SYMBOL_GPL(pci_pri_supported); #endif /* CONFIG_PCI_PRI */ #ifdef CONFIG_PCI_PASID diff --git a/include/linux/dmar.h b/include/linux/dmar.h index d7bf029df737..65565820328a 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -48,6 +48,7 @@ struct dmar_drhd_unit { u16 segment; /* PCI domain */ u8 ignored:1; /* ignore drhd */ u8 include_all:1; + u8 gfx_dedicated:1; /* graphic dedicated */ struct intel_iommu *iommu; }; diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h index 1ebb88e7c184..8be85e1372eb 100644 --- a/include/linux/pci-ats.h +++ b/include/linux/pci-ats.h @@ -10,9 +10,10 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs); void pci_disable_pri(struct pci_dev *pdev); void pci_restore_pri_state(struct pci_dev *pdev); int pci_reset_pri(struct pci_dev *pdev); - +bool pci_pri_supported(struct pci_dev *pdev); #else /* CONFIG_PCI_PRI */ - +static inline bool pci_pri_supported(struct pci_dev *pdev) +{ return false; } static inline int pci_enable_pri(struct pci_dev *pdev, u32 reqs) { return -ENODEV; -- Gitee From ad13d604e48fa2dd0efcca20496c9740bbeff7bc Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 5 Sep 2019 14:31:45 -0500 Subject: [PATCH 0313/2161] PCI/ATS: Cache PRI Capability offset commit c065190bbcd4fb54ce9c5fd34fcad71acf2a0ea4 upstream. Previously each PRI interface searched for the PRI Capability. Cache the capability offset the first time we use it instead of searching each time. 
[bhelgaas: commit log, reorder patch to later, call pci_pri_init() from pci_init_capabilities()] Link: https://lore.kernel.org/r/0c5495d376faf6dbb8eb2165204c474438aaae65.156 7029860.git.sathyanarayanan.kuppuswamy@linux.intel.com Link: https://lore.kernel.org/r/20190905193146.90250-5-helgaas@kernel.org Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/ats.c | 51 +++++++++++++++++++++++---------------------- drivers/pci/pci.h | 6 ++++++ drivers/pci/probe.c | 3 +++ include/linux/pci.h | 1 + 4 files changed, 36 insertions(+), 25 deletions(-) diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 9191d223e311..40ad10716b2a 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -186,6 +186,11 @@ int pci_ats_page_aligned(struct pci_dev *pdev) EXPORT_SYMBOL_GPL(pci_ats_page_aligned); #ifdef CONFIG_PCI_PRI +void pci_pri_init(struct pci_dev *pdev) +{ + pdev->pri_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); +} + /** * pci_enable_pri - Enable PRI capability * @ pdev: PCI device structure @@ -196,26 +201,25 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs) { u16 control, status; u32 max_requests; - int pos; + int pri = pdev->pri_cap; if (WARN_ON(pdev->pri_enabled)) return -EBUSY; - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); - if (!pos) + if (!pri) return -EINVAL; - pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status); + pci_read_config_word(pdev, pri + PCI_PRI_STATUS, &status); if (!(status & PCI_PRI_STATUS_STOPPED)) return -EBUSY; - pci_read_config_dword(pdev, pos + PCI_PRI_MAX_REQ, &max_requests); + pci_read_config_dword(pdev, pri + PCI_PRI_MAX_REQ, &max_requests); reqs = min(max_requests, reqs); pdev->pri_reqs_alloc = reqs; - pci_write_config_dword(pdev, pos + PCI_PRI_ALLOC_REQ, reqs); + pci_write_config_dword(pdev, pri + PCI_PRI_ALLOC_REQ, reqs); control = PCI_PRI_CTRL_ENABLE; - pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control); + pci_write_config_word(pdev, pri + PCI_PRI_CTRL, control); pdev->pri_enabled = 1; @@ -232,18 +236,17 @@ EXPORT_SYMBOL_GPL(pci_enable_pri); void pci_disable_pri(struct pci_dev *pdev) { u16 control; - int pos; + int pri = pdev->pri_cap; if (WARN_ON(!pdev->pri_enabled)) return; - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); - if (!pos) + if (!pri) return; - pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control); + pci_read_config_word(pdev, pri + PCI_PRI_CTRL, &control); control &= ~PCI_PRI_CTRL_ENABLE; - pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control); + pci_write_config_word(pdev, pri + PCI_PRI_CTRL, control); pdev->pri_enabled = 0; } @@ -257,17 +260,16 @@ void pci_restore_pri_state(struct pci_dev *pdev) { u16 control = PCI_PRI_CTRL_ENABLE; u32 reqs = pdev->pri_reqs_alloc; - int pos; + int pri = pdev->pri_cap; if (!pdev->pri_enabled) return; - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); - if (!pos) + if (!pri) return; - pci_write_config_dword(pdev, pos + PCI_PRI_ALLOC_REQ, reqs); - pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control); + pci_write_config_dword(pdev, pri + PCI_PRI_ALLOC_REQ, reqs); + pci_write_config_word(pdev, pri + PCI_PRI_CTRL, control); } EXPORT_SYMBOL_GPL(pci_restore_pri_state); @@ -281,17 +283,16 @@ EXPORT_SYMBOL_GPL(pci_restore_pri_state); int pci_reset_pri(struct pci_dev *pdev) { u16 control; - int pos; + int pri = pdev->pri_cap; if (WARN_ON(pdev->pri_enabled)) return -EBUSY; - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); - if (!pos) + if (!pri) return -EINVAL; 
control = PCI_PRI_CTRL_RESET; - pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control); + pci_write_config_word(pdev, pri + PCI_PRI_CTRL, control); return 0; } @@ -441,13 +442,13 @@ EXPORT_SYMBOL_GPL(pci_pasid_features); int pci_prg_resp_pasid_required(struct pci_dev *pdev) { u16 status; - int pos; + int pri; - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); - if (!pos) + pri = pdev->pri_cap; + if (!pri) return 0; - pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status); + pci_read_config_word(pdev, pri + PCI_PRI_STATUS, &status); if (status & PCI_PRI_STATUS_PASID) return 1; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index b66ffea134dc..638e83f8085c 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -466,6 +466,12 @@ static inline void pci_ats_init(struct pci_dev *d) { } static inline void pci_restore_ats_state(struct pci_dev *dev) { } #endif /* CONFIG_PCI_ATS */ +#ifdef CONFIG_PCI_PRI +void pci_pri_init(struct pci_dev *dev); +#else +static inline void pci_pri_init(struct pci_dev *dev) { } +#endif + #ifdef CONFIG_PCI_IOV int pci_iov_init(struct pci_dev *dev); void pci_iov_release(struct pci_dev *dev); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 8891935fcb31..58bf772157ec 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2371,6 +2371,9 @@ static void pci_init_capabilities(struct pci_dev *dev) /* Address Translation Services */ pci_ats_init(dev); + /* Page Request Interface */ + pci_pri_init(dev); + /* Enable ACS P2P upstream forwarding */ pci_enable_acs(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index aa99fb56f7dd..136dcfac14cd 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -463,6 +463,7 @@ struct pci_dev { atomic_t ats_ref_cnt; /* Number of VFs with ATS enabled */ #endif #ifdef CONFIG_PCI_PRI + u16 pri_cap; /* PRI Capability offset */ u32 pri_reqs_alloc; /* Number of PRI requests allocated */ #endif #ifdef CONFIG_PCI_PASID -- Gitee From 386d3efc0667d197d34110f6de94cd34e8909d11 Mon Sep 17 00:00:00 2001 From: Jerry Snitselaar Date: Tue, 30 Jun 2020 13:06:35 -0700 Subject: [PATCH 0314/2161] iommu/vt-d: Move Kconfig and Makefile bits down into intel directory commit ab65ba57e3acb55920999f96a6152228b52a2f49 upstream. Move Intel Kconfig and Makefile bits down into intel directory with the rest of the Intel specific files. Signed-off-by: Jerry Snitselaar Reviewed-by: Lu Baolu Cc: Joerg Roedel Cc: Lu Baolu Link: https://lore.kernel.org/r/20200630200636.48600-2-jsnitsel@redhat.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/Kconfig | 86 +----------------------------------- drivers/iommu/Makefile | 8 +--- drivers/iommu/intel/Kconfig | 86 ++++++++++++++++++++++++++++++++++++ drivers/iommu/intel/Makefile | 7 +++ 4 files changed, 96 insertions(+), 91 deletions(-) create mode 100644 drivers/iommu/intel/Kconfig create mode 100644 drivers/iommu/intel/Makefile diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index e44a1f868ad0..eea7d376162b 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -176,91 +176,7 @@ config AMD_IOMMU_DEBUGFS This option is -NOT- intended for production environments, and should not generally be enabled. 
-# Intel IOMMU support -config DMAR_TABLE - bool - -config INTEL_IOMMU - bool "Support for Intel IOMMU using DMA Remapping Devices" - depends on PCI_MSI && ACPI && (X86 || IA64) - select IOMMU_API - select IOMMU_IOVA - select NEED_DMA_MAP_STATE - select DMAR_TABLE - select SWIOTLB - select IOASID - help - DMA remapping (DMAR) devices support enables independent address - translations for Direct Memory Access (DMA) from devices. - These DMA remapping devices are reported via ACPI tables - and include PCI device scope covered by these DMA - remapping devices. - -config INTEL_IOMMU_DEBUGFS - bool "Export Intel IOMMU internals in Debugfs" - depends on INTEL_IOMMU && IOMMU_DEBUGFS - help - !!!WARNING!!! - - DO NOT ENABLE THIS OPTION UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!!! - - Expose Intel IOMMU internals in Debugfs. - - This option is -NOT- intended for production environments, and should - only be enabled for debugging Intel IOMMU. - -config INTEL_IOMMU_SVM - bool "Support for Shared Virtual Memory with Intel IOMMU" - depends on INTEL_IOMMU && X86_64 - select PCI_PASID - select PCI_PRI - select MMU_NOTIFIER - select IOASID - help - Shared Virtual Memory (SVM) provides a facility for devices - to access DMA resources through process address space by - means of a Process Address Space ID (PASID). - -config INTEL_IOMMU_DEFAULT_ON - def_bool y - prompt "Enable Intel DMA Remapping Devices by default" - depends on INTEL_IOMMU - help - Selecting this option will enable a DMAR device at boot time if - one is found. If this option is not selected, DMAR support can - be enabled by passing intel_iommu=on to the kernel. - -config INTEL_IOMMU_BROKEN_GFX_WA - bool "Workaround broken graphics drivers (going away soon)" - depends on INTEL_IOMMU && BROKEN && X86 - ---help--- - Current Graphics drivers tend to use physical address - for DMA and avoid using DMA APIs. Setting this config - option permits the IOMMU driver to set a unity map for - all the OS-visible memory. Hence the driver can continue - to use physical addresses for DMA, at least until this - option is removed in the 2.6.32 kernel. - -config INTEL_IOMMU_FLOPPY_WA - def_bool y - depends on INTEL_IOMMU && X86 - ---help--- - Floppy disk drivers are known to bypass DMA API calls - thereby failing to work when IOMMU is enabled. This - workaround will setup a 1:1 mapping for the first - 16MiB to make floppy (an ISA device) work. - -config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON - bool "Enable Intel IOMMU scalable mode by default" - depends on INTEL_IOMMU - help - Selecting this option will enable by default the scalable mode if - hardware presents the capability. The scalable mode is defined in - VT-d 3.0. The scalable mode capability could be checked by reading - /sys/devices/virtual/iommu/dmar*/intel-iommu/ecap. If this option - is not selected, scalable mode support could also be enabled by - passing intel_iommu=sm_on to the kernel. If not sure, please use - the default value. 
+source "drivers/iommu/intel/Kconfig" config IRQ_REMAP bool "Support for Interrupt Remapping" diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 43af91354c82..3ba27a3a6bef 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 +obj-y += intel/ obj-$(CONFIG_IOMMU_API) += iommu.o obj-$(CONFIG_IOMMU_API) += iommu-traces.o obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o @@ -16,13 +17,8 @@ obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += amd_iommu_debugfs.o obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o obj-$(CONFIG_ARM_SMMU) += arm-smmu.o arm-smmu-impl.o obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o -obj-$(CONFIG_DMAR_TABLE) += intel/dmar.o -obj-$(CONFIG_INTEL_IOMMU) += intel/iommu.o intel/pasid.o -obj-$(CONFIG_INTEL_IOMMU) += intel/trace.o -obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += intel/debugfs.o -obj-$(CONFIG_INTEL_IOMMU_SVM) += intel/svm.o obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o -obj-$(CONFIG_IRQ_REMAP) += intel/irq_remapping.o irq_remapping.o +obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o obj-$(CONFIG_MTK_IOMMU) += mtk_iommu.o obj-$(CONFIG_MTK_IOMMU_V1) += mtk_iommu_v1.o obj-$(CONFIG_OMAP_IOMMU) += omap-iommu.o diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig new file mode 100644 index 000000000000..877beec9d987 --- /dev/null +++ b/drivers/iommu/intel/Kconfig @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Intel IOMMU support +config DMAR_TABLE + bool + +config INTEL_IOMMU + bool "Support for Intel IOMMU using DMA Remapping Devices" + depends on PCI_MSI && ACPI && (X86 || IA64) + select IOMMU_API + select IOMMU_IOVA + select NEED_DMA_MAP_STATE + select DMAR_TABLE + select SWIOTLB + select IOASID + help + DMA remapping (DMAR) devices support enables independent address + translations for Direct Memory Access (DMA) from devices. + These DMA remapping devices are reported via ACPI tables + and include PCI device scope covered by these DMA + remapping devices. + +config INTEL_IOMMU_DEBUGFS + bool "Export Intel IOMMU internals in Debugfs" + depends on INTEL_IOMMU && IOMMU_DEBUGFS + help + !!!WARNING!!! + + DO NOT ENABLE THIS OPTION UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!!! + + Expose Intel IOMMU internals in Debugfs. + + This option is -NOT- intended for production environments, and should + only be enabled for debugging Intel IOMMU. + +config INTEL_IOMMU_SVM + bool "Support for Shared Virtual Memory with Intel IOMMU" + depends on INTEL_IOMMU && X86_64 + select PCI_PASID + select PCI_PRI + select MMU_NOTIFIER + select IOASID + help + Shared Virtual Memory (SVM) provides a facility for devices + to access DMA resources through process address space by + means of a Process Address Space ID (PASID). + +config INTEL_IOMMU_DEFAULT_ON + def_bool y + prompt "Enable Intel DMA Remapping Devices by default" + depends on INTEL_IOMMU + help + Selecting this option will enable a DMAR device at boot time if + one is found. If this option is not selected, DMAR support can + be enabled by passing intel_iommu=on to the kernel. + +config INTEL_IOMMU_BROKEN_GFX_WA + bool "Workaround broken graphics drivers (going away soon)" + depends on INTEL_IOMMU && BROKEN && X86 + help + Current Graphics drivers tend to use physical address + for DMA and avoid using DMA APIs. Setting this config + option permits the IOMMU driver to set a unity map for + all the OS-visible memory. Hence the driver can continue + to use physical addresses for DMA, at least until this + option is removed in the 2.6.32 kernel. 
+ +config INTEL_IOMMU_FLOPPY_WA + def_bool y + depends on INTEL_IOMMU && X86 + help + Floppy disk drivers are known to bypass DMA API calls + thereby failing to work when IOMMU is enabled. This + workaround will setup a 1:1 mapping for the first + 16MiB to make floppy (an ISA device) work. + +config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON + bool "Enable Intel IOMMU scalable mode by default" + depends on INTEL_IOMMU + help + Selecting this option will enable by default the scalable mode if + hardware presents the capability. The scalable mode is defined in + VT-d 3.0. The scalable mode capability could be checked by reading + /sys/devices/virtual/iommu/dmar*/intel-iommu/ecap. If this option + is not selected, scalable mode support could also be enabled by + passing intel_iommu=sm_on to the kernel. If not sure, please use + the default value. diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile new file mode 100644 index 000000000000..fb8e1e8c8029 --- /dev/null +++ b/drivers/iommu/intel/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_DMAR_TABLE) += dmar.o +obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o +obj-$(CONFIG_INTEL_IOMMU) += trace.o +obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o +obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o +obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o -- Gitee From 404c6c087dcbca0276067c42463e71ae44f581c9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 6 Aug 2020 14:34:32 +0200 Subject: [PATCH 0315/2161] x86/headers: Remove APIC headers from commit 13c01139b17163c9b2aa543a9c39f8bbc875b625 upstream. The APIC headers are relatively complex and bring in additional header dependencies - while smp.h is a relatively simple header included from high level headers. Remove the dependency and add in the missing #include's in .c files where they gained it indirectly before. 
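Illustrative conversion pattern (a sketch; the header names are taken from the upstream change being backported, and any given file may need only a subset of them):

/* Before: APIC declarations arrived indirectly through the smp header. */
#include <linux/smp.h>

/* After: each translation unit states the dependency it actually uses. */
#include <linux/smp.h>
#include <asm/apic.h>           /* local APIC users */
#include <asm/io_apic.h>        /* only where IO-APIC symbols are needed */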
Signed-off-by: Ingo Molnar Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/smp.h | 10 ---------- arch/x86/include/asm/tsc.h | 1 + arch/x86/kernel/apic/apic.c | 1 + arch/x86/kernel/apic/bigsmp_32.c | 1 + arch/x86/kernel/apic/ipi.c | 1 + arch/x86/kernel/apic/local.h | 1 + arch/x86/kernel/apic/probe_32.c | 1 + arch/x86/kernel/devicetree.c | 1 + arch/x86/kernel/irqinit.c | 2 ++ arch/x86/kernel/jailhouse.c | 1 + arch/x86/kernel/mpparse.c | 2 ++ arch/x86/kernel/setup.c | 1 + arch/x86/kernel/topology.c | 1 + arch/x86/xen/apic.c | 1 + arch/x86/xen/enlighten_hvm.c | 1 + arch/x86/xen/smp_pv.c | 1 + drivers/iommu/intel/irq_remapping.c | 1 + 17 files changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index e15f364efbcc..c0538f82c9a2 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -5,16 +5,6 @@ #include #include -/* - * We need the APIC definitions automatically as part of 'smp.h' - */ -#ifdef CONFIG_X86_LOCAL_APIC -# include -# include -# ifdef CONFIG_X86_IO_APIC -# include -# endif -#endif #include #include diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 8a0c25c6bf09..db5977174ce7 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -6,6 +6,7 @@ #define _ASM_X86_TSC_H #include +#include #define NS_SCALE 10 /* 2^10, carefully chosen */ #define US_SCALE 32 /* 2^32, arbitralrily chosen */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7fafa859e9f2..c9a70e084d52 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 38b5b51d42f6..98d015a4405a 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -9,6 +9,7 @@ #include #include +#include #include "local.h" diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 6ca0f91372fd..387154e39e08 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -2,6 +2,7 @@ #include #include +#include #include "local.h" diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h index 04797f05ce94..a997d849509a 100644 --- a/arch/x86/kernel/apic/local.h +++ b/arch/x86/kernel/apic/local.h @@ -10,6 +10,7 @@ #include +#include #include /* APIC flat 64 */ diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 67b33d67002f..7bda71def557 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -10,6 +10,7 @@ #include #include +#include #include #include diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 8d85e00bb40a..a0e8fc7d85f1 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 16919a9671fa..41df22cdb99b 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 3ad34f01de2a..20f73d3e47e4 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git 
a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index afac7ccce72f..db509e1134ce 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -19,6 +19,8 @@ #include #include +#include +#include #include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 366875a828b3..b2a50dd90874 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -79,6 +79,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index be5bc2e47c71..f3cad0d8c6a8 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -31,6 +31,7 @@ #include #include #include +#include #include static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 5e53bfbe5823..2df7d089ad54 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -3,6 +3,7 @@ #include #include +#include #include #include diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 6024fafed164..3c4d0b6431ca 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 0cebe5db691d..38d1cd4b2088 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 33f70c448599..acaf98ddadbd 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include -- Gitee From 7ad553962b28f623099b6a392f4b80c3c731d58f Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 28 Jul 2020 19:08:58 +0200 Subject: [PATCH 0316/2161] iommu/vt-d: Drop kerneldoc marker from regular comment commit 3207fa325ad77f00e83aaaeca5ca79fa234528f1 upstream. Fix W=1 compile warnings (invalid kerneldoc): drivers/iommu/intel/dmar.c:389: warning: Function parameter or member 'header' not described in 'dmar_parse_one_drhd' Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200728170859.28143-2-krzk@kernel.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 503e95e5cc11..f0b614122446 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -380,7 +380,7 @@ dmar_find_dmaru(struct acpi_dmar_hardware_unit *drhd) return NULL; } -/** +/* * dmar_parse_one_drhd - parses exactly one DMA remapping hardware definition * structure which uniquely represent one DMA remapping hardware unit * present in the platform -- Gitee From cd49c303f4fd4bc8aa961a026a9b398a2d0c90a9 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 3 Sep 2020 14:51:32 +0800 Subject: [PATCH 0317/2161] iommu/vt-d: Fix NULL pointer dereference in dev_iommu_priv_set() commit 2d33b7d631d9dc81c78bb71368645cf7f0e68cb1 upstream. The dev_iommu_priv_set() must be called after probe_device(). This fixes a NULL pointer deference bug when booting a system with kernel cmdline "intel_iommu=on,igfx_off", where the dev_iommu_priv_set() is abused. The following stacktrace was produced: Command line: BOOT_IMAGE=/isolinux/bzImage console=tty1 intel_iommu=on,igfx_off ... 
DMAR: Host address width 39 DMAR: DRHD base: 0x000000fed90000 flags: 0x0 DMAR: dmar0: reg_base_addr fed90000 ver 1:0 cap 1c0000c40660462 ecap 19e2ff0505e DMAR: DRHD base: 0x000000fed91000 flags: 0x1 DMAR: dmar1: reg_base_addr fed91000 ver 1:0 cap d2008c40660462 ecap f050da DMAR: RMRR base: 0x0000009aa9f000 end: 0x0000009aabefff DMAR: RMRR base: 0x0000009d000000 end: 0x0000009f7fffff DMAR: No ATSR found BUG: kernel NULL pointer dereference, address: 0000000000000038 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#1] SMP PTI CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.9.0-devel+ #2 Hardware name: LENOVO 20HGS0TW00/20HGS0TW00, BIOS N1WET46S (1.25s ) 03/30/2018 RIP: 0010:intel_iommu_init+0xed0/0x1136 Code: fe e9 61 02 00 00 bb f4 ff ff ff e9 57 02 00 00 48 63 d1 48 c1 e2 04 48 03 50 20 48 8b 12 48 85 d2 74 0b 48 8b 92 d0 02 00 00 48 89 7a 38 ff c1 e9 15 f5 ff ff 48 c7 c7 60 99 ac a7 49 c7 c7 a0 RSP: 0000:ffff96d180073dd0 EFLAGS: 00010282 RAX: ffff8c91037a7d20 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffffffffff RBP: ffff96d180073e90 R08: 0000000000000001 R09: ffff8c91039fe3c0 R10: 0000000000000226 R11: 0000000000000226 R12: 000000000000000b R13: ffff8c910367c650 R14: ffffffffa8426d60 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8c9107480000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000038 CR3: 00000004b100a001 CR4: 00000000003706e0 Call Trace: ? _raw_spin_unlock_irqrestore+0x1f/0x30 ? call_rcu+0x10e/0x320 ? trace_hardirqs_on+0x2c/0xd0 ? rdinit_setup+0x2c/0x2c ? e820__memblock_setup+0x8b/0x8b pci_iommu_init+0x16/0x3f do_one_initcall+0x46/0x1e4 kernel_init_freeable+0x169/0x1b2 ? 
rest_init+0x9f/0x9f kernel_init+0xa/0x101 ret_from_fork+0x22/0x30 Modules linked in: CR2: 0000000000000038 ---[ end trace 3653722a6f936f18 ]--- Fixes: 01b9d4e21148c ("iommu/vt-d: Use dev_iommu_priv_get/set()") Reported-by: Torsten Hilbrich Reported-by: Wendy Wang Signed-off-by: Lu Baolu Tested-by: Torsten Hilbrich Link: https://lore.kernel.org/linux-iommu/96717683-70be-7388-3d2f-61131070a96a@secunet.com/ Link: https://lore.kernel.org/r/20200903065132.16879-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 100 ++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 45 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 7e57122dc1a5..d960133638f8 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -364,7 +364,6 @@ static int iommu_skip_te_disable; int intel_iommu_gfx_mapped; EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); -#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1)) #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2)) struct device_domain_info *get_domain_info(struct device *dev) { @@ -374,8 +373,7 @@ struct device_domain_info *get_domain_info(struct device *dev) return NULL; info = dev_iommu_priv_get(dev); - if (unlikely(info == DUMMY_DEVICE_DOMAIN_INFO || - info == DEFER_DEVICE_DOMAIN_INFO)) + if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO)) return NULL; return info; @@ -742,11 +740,6 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, return &context[devfn]; } -static int iommu_dummy(struct device *dev) -{ - return dev_iommu_priv_get(dev) == DUMMY_DEVICE_DOMAIN_INFO; -} - static bool attach_deferred(struct device *dev) { return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO; @@ -779,6 +772,53 @@ is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) return false; } +static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) +{ + struct dmar_drhd_unit *drhd; + u32 vtbar; + int rc; + + /* We know that this device on this chipset has its own IOMMU. + * If we find it under a different IOMMU, then the BIOS is lying + * to us. Hope that the IOMMU for this device is actually + * disabled, and it needs no translation... 
+ */ + rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); + if (rc) { + /* "can't" happen */ + dev_info(&pdev->dev, "failed to run vt-d quirk\n"); + return false; + } + vtbar &= 0xffff0000; + + /* we know that the this iommu should be at offset 0xa000 from vtbar */ + drhd = dmar_find_matched_drhd_unit(pdev); + if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { + pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + return true; + } + + return false; +} + +static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) +{ + if (!iommu || iommu->drhd->ignored) + return true; + + if (dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + + if (pdev->vendor == PCI_VENDOR_ID_INTEL && + pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && + quirk_ioat_snb_local_iommu(pdev)) + return true; + } + + return false; +} + struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) { struct dmar_drhd_unit *drhd = NULL; @@ -788,7 +828,7 @@ struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) u16 segment = 0; int i; - if (!dev || iommu_dummy(dev)) + if (!dev) return NULL; if (dev_is_pci(dev)) { @@ -805,7 +845,7 @@ struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) dev = &ACPI_COMPANION(dev)->dev; rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { + for_each_iommu(iommu, drhd) { if (pdev && segment != drhd->segment) continue; @@ -841,6 +881,9 @@ struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) } iommu = NULL; out: + if (iommu_is_dummy(iommu, dev)) + iommu = NULL; + rcu_read_unlock(); return iommu; @@ -2447,7 +2490,7 @@ struct dmar_domain *find_domain(struct device *dev) { struct device_domain_info *info; - if (unlikely(attach_deferred(dev) || iommu_dummy(dev))) + if (unlikely(attach_deferred(dev))) return NULL; /* No lock here, assumes no domain exit in normal case */ @@ -3994,35 +4037,6 @@ static void __init iommu_exit_mempool(void) iova_cache_put(); } -static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev) -{ - struct dmar_drhd_unit *drhd; - u32 vtbar; - int rc; - - /* We know that this device on this chipset has its own IOMMU. - * If we find it under a different IOMMU, then the BIOS is lying - * to us. Hope that the IOMMU for this device is actually - * disabled, and it needs no translation... - */ - rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); - if (rc) { - /* "can't" happen */ - dev_info(&pdev->dev, "failed to run vt-d quirk\n"); - return; - } - vtbar &= 0xffff0000; - - /* we know that the this iommu should be at offset 0xa000 from vtbar */ - drhd = dmar_find_matched_drhd_unit(pdev); - if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { - pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); - add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); - dev_iommu_priv_set(&pdev->dev, DUMMY_DEVICE_DOMAIN_INFO); - } -} -DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu); - static void __init init_no_remapping_devices(void) { struct dmar_drhd_unit *drhd; @@ -4054,12 +4068,8 @@ static void __init init_no_remapping_devices(void) /* This IOMMU has *only* gfx devices. 
Either bypass it or set the gfx_mapped flag, as appropriate */ drhd->gfx_dedicated = 1; - if (!dmar_map_gfx) { + if (!dmar_map_gfx) drhd->ignored = 1; - for_each_active_dev_scope(drhd->devices, - drhd->devices_cnt, i, dev) - dev_iommu_priv_set(dev, DUMMY_DEVICE_DOMAIN_INFO); - } } } -- Gitee From 0a6fb2fbbbde852d1d99ae22e03e4cf69cf3a21b Mon Sep 17 00:00:00 2001 From: Yuqi Jin Date: Thu, 27 Aug 2020 16:43:54 +0800 Subject: [PATCH 0318/2161] iommu/iova: Replace cmpxchg with xchg in queue_iova commit ba328f82613260a098de7b4aff58743cd8733507 upstream. The performance of the atomic_xchg is better than atomic_cmpxchg because no comparison is required. While the value of @fq_timer_on can only be 0 or 1. Let's use atomic_xchg instead of atomic_cmpxchg here because we only need to check that the value changes from 0 to 1 or from 1 to 1. Signed-off-by: Yuqi Jin Signed-off-by: Shaokun Zhang Reviewed-by: Robin Murphy Cc: Joerg Roedel Link: https://lore.kernel.org/r/1598517834-30275-1-git-send-email-zhangshaokun@hisilicon.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iova.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 45a251da5453..30d969a4c5fd 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -579,7 +579,7 @@ void queue_iova(struct iova_domain *iovad, /* Avoid false sharing as much as possible. */ if (!atomic_read(&iovad->fq_timer_on) && - !atomic_cmpxchg(&iovad->fq_timer_on, 0, 1)) + !atomic_xchg(&iovad->fq_timer_on, 1)) mod_timer(&iovad->fq_timer, jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT)); } -- Gitee From ca02daf509f56128fa4a3168473864531b1de618 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 22 Nov 2021 17:05:19 +0800 Subject: [PATCH 0319/2161] dma-direct: rename and cleanup __phys_to_dma commit 5ceda74093a5c1c3f42a02b894df031f3bbc9af1 upstream. The __phys_to_dma vs phys_to_dma distinction isn't exactly obvious. Try to improve the situation by renaming __phys_to_dma to phys_to_dma_unencryped, and not forcing architectures that want to override phys_to_dma to actually provide __phys_to_dma. 
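As a rough sketch of the intended split after the rename (it mirrors the dma-direct helper touched below; the example_ name is a placeholder, not a kernel API):

#include <linux/dma-direct.h>

/*
 * phys_to_dma() applies the SME encryption bit; phys_to_dma_unencrypted()
 * does not, and is meant for memory that must stay unencrypted, such as
 * the swiotlb bounce buffer.
 */
static inline dma_addr_t example_phys_to_dma(struct device *dev,
					     phys_addr_t phys)
{
	if (force_dma_unencrypted(dev))
		return phys_to_dma_unencrypted(dev, phys);
	return phys_to_dma(dev, phys);
}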
Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/arm/include/asm/dma-direct.h | 2 +- drivers/iommu/intel/iommu.c | 2 +- include/linux/dma-direct.h | 28 ++++++++++++++++------------ kernel/dma/direct.c | 10 +++++----- kernel/dma/swiotlb.c | 4 ++-- 5 files changed, 25 insertions(+), 21 deletions(-) diff --git a/arch/arm/include/asm/dma-direct.h b/arch/arm/include/asm/dma-direct.h index b67e5fc1fe43..c9f4e70a4fe2 100644 --- a/arch/arm/include/asm/dma-direct.h +++ b/arch/arm/include/asm/dma-direct.h @@ -2,7 +2,7 @@ #ifndef ASM_ARM_DMA_DIRECT_H #define ASM_ARM_DMA_DIRECT_H 1 -static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { unsigned int offset = paddr & ~PAGE_MASK; return pfn_to_dma(dev, __phys_to_pfn(paddr)) + offset; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index d960133638f8..3d7952414b01 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3784,7 +3784,7 @@ bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size, */ if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) { tlb_addr = swiotlb_tbl_map_single(dev, - __phys_to_dma(dev, io_tlb_start), + phys_to_dma_unencrypted(dev, io_tlb_start), paddr, size, aligned_size, dir, attrs); if (tlb_addr == DMA_MAPPING_ERROR) { goto swiotlb_error; diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 6a18a97b76a8..a56ea8cd399b 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -10,14 +10,29 @@ static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr); #ifdef CONFIG_ARCH_HAS_PHYS_TO_DMA #include +#ifndef phys_to_dma_unencrypted +#define phys_to_dma_unencrypted phys_to_dma +#endif #else -static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) +static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, + phys_addr_t paddr) { dma_addr_t dev_addr = (dma_addr_t)paddr; return dev_addr - ((dma_addr_t)dev->dma_pfn_offset << PAGE_SHIFT); } +/* + * If memory encryption is supported, phys_to_dma will set the memory encryption + * bit in the DMA address, and dma_to_phys will clear it. + * phys_to_dma_unencrypted is for use on special unencrypted memory like swiotlb + * buffers. + */ +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +{ + return __sme_set(phys_to_dma_unencrypted(dev, paddr)); +} + static inline phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dev_addr) { phys_addr_t paddr = (phys_addr_t)dev_addr; @@ -49,17 +64,6 @@ static inline bool force_dma_unencrypted(struct device *dev) } #endif /* CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED */ -/* - * If memory encryption is supported, phys_to_dma will set the memory encryption - * bit in the DMA address, and dma_to_phys will clear it. The raw __phys_to_dma - * and __dma_to_phys versions should only be used on non-encrypted memory for - * special occasions like DMA coherent buffers. 
- */ -static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) -{ - return __sme_set(__phys_to_dma(dev, paddr)); -} - static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) { return __sme_clr(__dma_to_phys(dev, daddr)); diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 0a093a675b63..86baa9a12ef4 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -39,7 +39,7 @@ static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) { if (force_dma_unencrypted(dev)) - return __phys_to_dma(dev, phys); + return phys_to_dma_unencrypted(dev, phys); return phys_to_dma(dev, phys); } @@ -161,7 +161,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, ret = page_address(page); if (force_dma_unencrypted(dev)) { set_memory_decrypted((unsigned long)ret, 1 << get_order(size)); - *dma_handle = __phys_to_dma(dev, page_to_phys(page)); + *dma_handle = phys_to_dma_unencrypted(dev, page_to_phys(page)); } else { *dma_handle = phys_to_dma(dev, page_to_phys(page)); } @@ -403,11 +403,11 @@ int dma_direct_supported(struct device *dev, u64 mask) min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT); /* - * This check needs to be against the actual bit mask value, so - * use __phys_to_dma() here so that the SME encryption mask isn't + * This check needs to be against the actual bit mask value, so use + * phys_to_dma_unencrypted() here so that the SME encryption mask isn't * part of the check. */ - return mask >= __phys_to_dma(dev, min_mask); + return mask >= phys_to_dma_unencrypted(dev, min_mask); } size_t dma_direct_max_mapping_size(struct device *dev) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f99b79d7e123..c2874f44f7ba 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -675,13 +675,13 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, } /* Oh well, have to allocate and map a bounce buffer. */ - *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), + *phys = swiotlb_tbl_map_single(dev, phys_to_dma_unencrypted(dev, io_tlb_start), *phys, size, size, dir, attrs); if (*phys == (phys_addr_t)DMA_MAPPING_ERROR) return false; /* Ensure that the address returned is DMA'ble */ - *dma_addr = __phys_to_dma(dev, *phys); + *dma_addr = phys_to_dma_unencrypted(dev, *phys); if (unlikely(!dma_capable(dev, *dma_addr, size))) { swiotlb_tbl_unmap_single(dev, *phys, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); -- Gitee From 0dcc2973faec5bcb672ac55b7b503d4533386aa2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:32 +0200 Subject: [PATCH 0320/2161] genirq/chip: Use the first chip in irq_chip_compose_msi_msg() commit 13b90cadfc294718dd5a89e1fcf103477b01eb50 upstream. The documentation of irq_chip_compose_msi_msg() claims that with hierarchical irq domains the first chip in the hierarchy which has an irq_compose_msi_msg() callback is chosen. But the code just keeps iterating after it finds a chip with a compose callback. The x86 HPET MSI implementation relies on that behaviour, but that does not make it more correct. The message should always be composed at the domain which manages the underlying resource (e.g. APIC or remap table) because that domain knows about the required layout of the message. On X86 the following hierarchies exist: 1) vector -------- PCI/MSI 2) vector -- IR -- PCI/MSI The vector domain has a different message format than the IR (remapping) domain. 
So obviously the PCI/MSI domain can't compose the message without having knowledge about the parent domain, which is exactly the opposite of what hierarchical domains want to achieve. X86 actually has two different PCI/MSI chips where #1 has a compose callback and #2 does not. #2 delegates the composition to the remap domain where it belongs, but #1 does it at the PCI/MSI level. For the upcoming device MSI support it's necessary to change this and just let the first domain which can compose the message take care of it. That way the top level chip does not have to worry about it and the device MSI code does not need special knowledge about topologies. It just sets the compose callback to NULL and lets the hierarchy pick the first chip which has one. Due to that the attempt to move the compose callback from the direct delivery PCI/MSI domain to the vector domain made the system fail to boot with interrupt remapping enabled because in the remapping case irq_chip_compose_msi_msg() keeps iterating and choses the compose callback of the vector domain which obviously creates the wrong format for the remap table. Break out of the loop when the first irq chip with a compose callback is found and fixup the HPET code temporarily. That workaround will be removed once the direct delivery compose callback is moved to the place where it belongs in the vector domain. Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20200826112331.047917603@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/apic/msi.c | 7 +++++-- kernel/irq/chip.c | 9 ++++----- kernel/irq/internals.h | 9 +++++++++ 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index a20873bbbed6..9ff7d6e53a74 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -478,10 +478,13 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) info.type = X86_IRQ_ALLOC_TYPE_HPET; info.hpet_id = hpet_id; parent = irq_remapping_get_ir_irq_domain(&info); - if (parent == NULL) + if (parent == NULL) { parent = x86_vector_domain; - else + } else { hpet_msi_controller.name = "IR-HPET-MSI"; + /* Temporary fix: Will go away */ + hpet_msi_controller.irq_compose_msi_msg = NULL; + } fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, hpet_id); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 41d8818ee84f..c36a17b08dc8 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1509,18 +1509,17 @@ EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent); */ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { - struct irq_data *pos = NULL; + struct irq_data *pos; -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - for (; data; data = data->parent_data) -#endif + for (pos = NULL; !pos && data; data = irqd_get_parent_data(data)) { if (data->chip && data->chip->irq_compose_msi_msg) pos = data; + } + if (!pos) return -ENOSYS; pos->chip->irq_compose_msi_msg(pos, msg); - return 0; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 7db284b10ac9..54363527feea 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -473,6 +473,15 @@ static inline void irq_domain_deactivate_irq(struct irq_data *data) } #endif +static inline struct irq_data *irqd_get_parent_data(struct irq_data *irqd) +{ +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + return irqd->parent_data; +#else + return NULL; +#endif +} + #ifdef CONFIG_GENERIC_IRQ_DEBUGFS #include -- Gitee From 
b4bbd4d84537e9071861a99cad6ac831a340ff1d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:33 +0200 Subject: [PATCH 0321/2161] x86/msi: Move compose message callback where it belongs commit b0a19555efd098183db0ee3ad52a3cd3bfbd1ba2 upstream. Composing the MSI message at the MSI chip level is wrong because the underlying parent domain is the one which knows how the message should be composed for the direct vector delivery or the interrupt remapping table entry. The interrupt remapping aware PCI/MSI chip does that already. Make the direct delivery chip do the same and move the composition of the direct delivery MSI message to the vector domain irq chip. This prepares for the upcoming device MSI support to avoid having architecture specific knowledge in the device MSI domain irq chips. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200826112331.157603198@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/apic.h | 8 ++++++++ arch/x86/kernel/apic/msi.c | 12 +++--------- arch/x86/kernel/apic/vector.c | 1 + 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 5bef1575708d..c4895b3897eb 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -510,6 +510,14 @@ static inline bool apic_id_is_primary_thread(unsigned int id) { return false; } static inline void apic_smt_update(void) { } #endif +struct msi_msg; + +#ifdef CONFIG_PCI_MSI +void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg); +#else +# define x86_vector_msi_compose_msg NULL +#endif + extern void irq_enter(void); extern void irq_exit(void); diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9ff7d6e53a74..018e4fb03af3 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -45,7 +45,7 @@ static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) MSI_DATA_VECTOR(cfg->vector); } -static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) { __irq_msi_compose_msg(irqd_cfg(data), msg); } @@ -176,7 +176,6 @@ static struct irq_chip pci_msi_controller = { .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_compose_msi_msg = irq_msi_compose_msg, .irq_set_affinity = msi_set_affinity, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -320,7 +319,6 @@ static struct irq_chip dmar_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_compose_msi_msg = irq_msi_compose_msg, .irq_write_msi_msg = dmar_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -418,7 +416,6 @@ static struct irq_chip hpet_msi_controller __ro_after_init = { .irq_ack = irq_chip_ack_parent, .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_compose_msi_msg = irq_msi_compose_msg, .irq_write_msi_msg = hpet_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -478,13 +475,10 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) info.type = X86_IRQ_ALLOC_TYPE_HPET; info.hpet_id = hpet_id; parent = irq_remapping_get_ir_irq_domain(&info); - if (parent == NULL) { + if (parent == NULL) parent = x86_vector_domain; - } else { + else hpet_msi_controller.name = "IR-HPET-MSI"; - /* Temporary fix: Will go away */ - hpet_msi_controller.irq_compose_msi_msg 
= NULL; - } fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, hpet_id); diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index bf6662d37a33..1c2cdcb42880 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -821,6 +821,7 @@ static struct irq_chip lapic_controller = { .name = "APIC", .irq_ack = apic_ack_edge, .irq_set_affinity = apic_set_affinity, + .irq_compose_msi_msg = x86_vector_msi_compose_msg, .irq_retrigger = apic_retrigger_irq, }; -- Gitee From b3fa9f011ad1edd21a4326ea0300fc8688409538 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 25 Mar 2022 15:11:08 +0800 Subject: [PATCH 0322/2161] x86: Fix __apicid_to_node missing commit opencloudos. Include Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/hygon.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 62e9a982adaf..dc0840aae26c 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include -- Gitee From 4f00fedec266d9cf4509d605b35fbb6cabf105a8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:35 +0200 Subject: [PATCH 0323/2161] x86_irq_Rename_X86_IRQ_ALLOC_TYPE_MSI_to_reflect_PCI_dependency commit a83722f45c5baa350693795b0dc3f04ab5201a6b upstream. No functional change. Signed-off-by: Thomas Gleixner Acked-by: Joerg Roedel Link: https://lore.kernel.org/r/20200826112331.343103175@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/hw_irq.h | 4 ++-- arch/x86/kernel/apic/msi.c | 6 +++--- drivers/iommu/amd_iommu.c | 24 ++++++++++++------------ drivers/iommu/intel/irq_remapping.c | 18 +++++++++--------- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 4154bc5f6a4e..65da8a70d3af 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -58,8 +58,8 @@ struct msi_desc; enum irq_alloc_type { X86_IRQ_ALLOC_TYPE_IOAPIC = 1, X86_IRQ_ALLOC_TYPE_HPET, - X86_IRQ_ALLOC_TYPE_MSI, - X86_IRQ_ALLOC_TYPE_MSIX, + X86_IRQ_ALLOC_TYPE_PCI_MSI, + X86_IRQ_ALLOC_TYPE_PCI_MSIX, X86_IRQ_ALLOC_TYPE_DMAR, X86_IRQ_ALLOC_TYPE_UV, }; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 018e4fb03af3..ab8b7b47c359 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -186,7 +186,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) struct irq_alloc_info info; init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_MSI; + info.type = X86_IRQ_ALLOC_TYPE_PCI_MSI; info.msi_dev = dev; domain = irq_remapping_get_irq_domain(&info); @@ -218,9 +218,9 @@ int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, init_irq_alloc_info(arg, NULL); arg->msi_dev = pdev; if (desc->msi_attrib.is_msix) { - arg->type = X86_IRQ_ALLOC_TYPE_MSIX; + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; } else { - arg->type = X86_IRQ_ALLOC_TYPE_MSI; + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; } diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 8c7ddbf63e3d..f76a357bed5a 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3536,8 +3536,8 @@ static int get_devid(struct irq_alloc_info *info) case X86_IRQ_ALLOC_TYPE_HPET: devid = get_hpet_devid(info->hpet_id); break; - case X86_IRQ_ALLOC_TYPE_MSI: - case 
X86_IRQ_ALLOC_TYPE_MSIX: + case X86_IRQ_ALLOC_TYPE_PCI_MSI: + case X86_IRQ_ALLOC_TYPE_PCI_MSIX: devid = get_device_id(&info->msi_dev->dev); break; default: @@ -3575,8 +3575,8 @@ static struct irq_domain *get_irq_domain(struct irq_alloc_info *info) return NULL; switch (info->type) { - case X86_IRQ_ALLOC_TYPE_MSI: - case X86_IRQ_ALLOC_TYPE_MSIX: + case X86_IRQ_ALLOC_TYPE_PCI_MSI: + case X86_IRQ_ALLOC_TYPE_PCI_MSIX: devid = get_device_id(&info->msi_dev->dev); if (devid < 0) return NULL; @@ -3637,8 +3637,8 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, break; case X86_IRQ_ALLOC_TYPE_HPET: - case X86_IRQ_ALLOC_TYPE_MSI: - case X86_IRQ_ALLOC_TYPE_MSIX: + case X86_IRQ_ALLOC_TYPE_PCI_MSI: + case X86_IRQ_ALLOC_TYPE_PCI_MSIX: msg->address_hi = MSI_ADDR_BASE_HI; msg->address_lo = MSI_ADDR_BASE_LO; msg->data = irte_info->index; @@ -3682,15 +3682,15 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, if (!info) return -EINVAL; - if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_MSI && - info->type != X86_IRQ_ALLOC_TYPE_MSIX) + if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI && + info->type != X86_IRQ_ALLOC_TYPE_PCI_MSIX) return -EINVAL; /* * With IRQ remapping enabled, don't need contiguous CPU vectors * to support multiple MSI interrupts. */ - if (info->type == X86_IRQ_ALLOC_TYPE_MSI) + if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; devid = get_devid(info); @@ -3722,9 +3722,9 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, } else { index = -ENOMEM; } - } else if (info->type == X86_IRQ_ALLOC_TYPE_MSI || - info->type == X86_IRQ_ALLOC_TYPE_MSIX) { - bool align = (info->type == X86_IRQ_ALLOC_TYPE_MSI); + } else if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI || + info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX) { + bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI); index = alloc_irq_index(devid, nr_irqs, align, info->msi_dev); } else { diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index acaf98ddadbd..2eb1ab5bc1f5 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1121,8 +1121,8 @@ static struct irq_domain *intel_get_ir_irq_domain(struct irq_alloc_info *info) case X86_IRQ_ALLOC_TYPE_HPET: iommu = map_hpet_to_ir(info->hpet_id); break; - case X86_IRQ_ALLOC_TYPE_MSI: - case X86_IRQ_ALLOC_TYPE_MSIX: + case X86_IRQ_ALLOC_TYPE_PCI_MSI: + case X86_IRQ_ALLOC_TYPE_PCI_MSIX: iommu = map_dev_to_ir(info->msi_dev); break; default: @@ -1141,8 +1141,8 @@ static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info) return NULL; switch (info->type) { - case X86_IRQ_ALLOC_TYPE_MSI: - case X86_IRQ_ALLOC_TYPE_MSIX: + case X86_IRQ_ALLOC_TYPE_PCI_MSI: + case X86_IRQ_ALLOC_TYPE_PCI_MSIX: iommu = map_dev_to_ir(info->msi_dev); if (iommu) return iommu->ir_msi_domain; @@ -1312,8 +1312,8 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, break; case X86_IRQ_ALLOC_TYPE_HPET: - case X86_IRQ_ALLOC_TYPE_MSI: - case X86_IRQ_ALLOC_TYPE_MSIX: + case X86_IRQ_ALLOC_TYPE_PCI_MSI: + case X86_IRQ_ALLOC_TYPE_PCI_MSIX: if (info->type == X86_IRQ_ALLOC_TYPE_HPET) set_hpet_sid(irte, info->hpet_id); else @@ -1368,15 +1368,15 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain, if (!info || !iommu) return -EINVAL; - if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_MSI && - info->type != X86_IRQ_ALLOC_TYPE_MSIX) + if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI && 
+ info->type != X86_IRQ_ALLOC_TYPE_PCI_MSIX) return -EINVAL; /* * With IRQ remapping enabled, don't need contiguous CPU vectors * to support multiple MSI interrupts. */ - if (info->type == X86_IRQ_ALLOC_TYPE_MSI) + if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); -- Gitee From a81b277bdfda6987e978e7f6cbc17f0c08e2f8dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:36 +0200 Subject: [PATCH 0324/2161] x86/irq: Add allocation type for parent domain retrieval commit b4c364da32cf3b85297648ff5563de2c47d9e32f upstream. irq_remapping_ir_irq_domain() is used to retrieve the remapping parent domain for an allocation type. irq_remapping_irq_domain() is for retrieving the actual device domain for allocating interrupts for a device. The two functions are similar and can be unified by using explicit modes for parent irq domain retrieval. Add X86_IRQ_ALLOC_TYPE_IOAPIC/HPET_GET_PARENT and use it in the iommu implementations. Drop the parent domain retrieval for PCI_MSI/X as that is unused. Signed-off-by: Thomas Gleixner Acked-by: Joerg Roedel Link: https://lore.kernel.org/r/20200826112331.436350257@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/hw_irq.h | 2 ++ arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/apic/msi.c | 2 +- drivers/iommu/amd_iommu.c | 8 ++++++++ drivers/iommu/hyperv-iommu.c | 2 +- drivers/iommu/intel/irq_remapping.c | 8 ++------ 6 files changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 65da8a70d3af..c8be2c0e3b32 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -62,6 +62,8 @@ enum irq_alloc_type { X86_IRQ_ALLOC_TYPE_PCI_MSIX, X86_IRQ_ALLOC_TYPE_DMAR, X86_IRQ_ALLOC_TYPE_UV, + X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT, + X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT, }; struct irq_alloc_info { diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 0edcf69659ee..c710efaa231d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2320,7 +2320,7 @@ static int mp_irqdomain_create(int ioapic) return 0; init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_IOAPIC; + info.type = X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT; info.ioapic_id = mpc_ioapic_id(ioapic); parent = irq_remapping_get_ir_irq_domain(&info); if (!parent) diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index ab8b7b47c359..3abd48e166bf 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -472,7 +472,7 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) domain_info->data = (void *)(long)hpet_id; init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.type = X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT; info.hpet_id = hpet_id; parent = irq_remapping_get_ir_irq_domain(&info); if (parent == NULL) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index f76a357bed5a..0a6a91b0e740 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3556,6 +3556,14 @@ static struct irq_domain *get_ir_irq_domain(struct irq_alloc_info *info) if (!info) return NULL; + switch (info->type) { + case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT: + case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT: + break; + default: + return NULL; + } + devid = get_devid(info); if (devid >= 0) { iommu = amd_iommu_rlookup_table[devid]; diff --git 
a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index f0fe5030acd3..a8c7c457c2d4 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -184,7 +184,7 @@ static int __init hyperv_enable_irq_remapping(void) static struct irq_domain *hyperv_get_ir_irq_domain(struct irq_alloc_info *info) { - if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) + if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT) return ioapic_ir_domain; else return NULL; diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 2eb1ab5bc1f5..8bd3a803b5b2 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1115,16 +1115,12 @@ static struct irq_domain *intel_get_ir_irq_domain(struct irq_alloc_info *info) return NULL; switch (info->type) { - case X86_IRQ_ALLOC_TYPE_IOAPIC: + case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT: iommu = map_ioapic_to_ir(info->ioapic_id); break; - case X86_IRQ_ALLOC_TYPE_HPET: + case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT: iommu = map_hpet_to_ir(info->hpet_id); break; - case X86_IRQ_ALLOC_TYPE_PCI_MSI: - case X86_IRQ_ALLOC_TYPE_PCI_MSIX: - iommu = map_dev_to_ir(info->msi_dev); - break; default: BUG_ON(1); break; -- Gitee From a4494f1e9a315046312d24f2e16e14c546ad6268 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:37 +0200 Subject: [PATCH 0325/2161] iommu/vt-d: Consolidate irq domain getter commit 60e5a9397c0c4b7cecf05fec1aef8fe2ae5c9f3c upstream. The irq domain request mode is now indicated in irq_alloc_info::type. Consolidate the two getter functions into one. Signed-off-by: Thomas Gleixner Acked-by: Joerg Roedel Link: https://lore.kernel.org/r/20200826112331.530546013@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/irq_remapping.c | 67 +++++++++++------------------ 1 file changed, 24 insertions(+), 43 deletions(-) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 8bd3a803b5b2..e9b61023c044 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -204,35 +204,40 @@ static int modify_irte(struct irq_2_iommu *irq_iommu, return rc; } -static struct intel_iommu *map_hpet_to_ir(u8 hpet_id) +static struct irq_domain *map_hpet_to_ir(u8 hpet_id) { int i; - for (i = 0; i < MAX_HPET_TBS; i++) + for (i = 0; i < MAX_HPET_TBS; i++) { if (ir_hpet[i].id == hpet_id && ir_hpet[i].iommu) - return ir_hpet[i].iommu; + return ir_hpet[i].iommu->ir_domain; + } return NULL; } -static struct intel_iommu *map_ioapic_to_ir(int apic) +static struct intel_iommu *map_ioapic_to_iommu(int apic) { int i; - for (i = 0; i < MAX_IO_APICS; i++) + for (i = 0; i < MAX_IO_APICS; i++) { if (ir_ioapic[i].id == apic && ir_ioapic[i].iommu) return ir_ioapic[i].iommu; + } return NULL; } -static struct intel_iommu *map_dev_to_ir(struct pci_dev *dev) +static struct irq_domain *map_ioapic_to_ir(int apic) { - struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = map_ioapic_to_iommu(apic); - drhd = dmar_find_matched_drhd_unit(dev); - if (!drhd) - return NULL; + return iommu ? iommu->ir_domain : NULL; +} - return drhd->iommu; +static struct irq_domain *map_dev_to_ir(struct pci_dev *dev) +{ + struct dmar_drhd_unit *drhd = dmar_find_matched_drhd_unit(dev); + + return drhd ? 
drhd->iommu->ir_msi_domain : NULL; } static int clear_entries(struct irq_2_iommu *irq_iommu) @@ -1002,7 +1007,7 @@ static int __init parse_ioapics_under_ir(void) for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) { int ioapic_id = mpc_ioapic_id(ioapic_idx); - if (!map_ioapic_to_ir(ioapic_id)) { + if (!map_ioapic_to_iommu(ioapic_id)) { pr_err(FW_BUG "ioapic %d has no mapping iommu, " "interrupt remapping will be disabled\n", ioapic_id); @@ -1107,47 +1112,23 @@ static void prepare_irte(struct irte *irte, int vector, unsigned int dest) irte->redir_hint = 1; } -static struct irq_domain *intel_get_ir_irq_domain(struct irq_alloc_info *info) +static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info) { - struct intel_iommu *iommu = NULL; - if (!info) return NULL; switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT: - iommu = map_ioapic_to_ir(info->ioapic_id); - break; + return map_ioapic_to_ir(info->ioapic_id); case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT: - iommu = map_hpet_to_ir(info->hpet_id); - break; - default: - BUG_ON(1); - break; - } - - return iommu ? iommu->ir_domain : NULL; -} - -static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info) -{ - struct intel_iommu *iommu; - - if (!info) - return NULL; - - switch (info->type) { + return map_hpet_to_ir(info->hpet_id); case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: - iommu = map_dev_to_ir(info->msi_dev); - if (iommu) - return iommu->ir_msi_domain; - break; + return map_dev_to_ir(info->msi_dev); default: - break; + WARN_ON_ONCE(1); + return NULL; } - - return NULL; } struct irq_remap_ops intel_irq_remap_ops = { @@ -1156,7 +1137,7 @@ struct irq_remap_ops intel_irq_remap_ops = { .disable = disable_irq_remapping, .reenable = reenable_irq_remapping, .enable_faulting = enable_drhd_fault_handling, - .get_ir_irq_domain = intel_get_ir_irq_domain, + .get_ir_irq_domain = intel_get_irq_domain, .get_irq_domain = intel_get_irq_domain, }; -- Gitee From d39446a401c5a4311a860a7ecacfce3086d1a2f3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:38 +0200 Subject: [PATCH 0326/2161] iommu/amd: Consolidate irq domain getter commit 192a99f4bd9d1d546ca276e058761a79af575744 upstream. The irq domain request mode is now indicated in irq_alloc_info::type. Consolidate the two getter functions into one. 
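Schematically, the consolidated getter keys the answer purely off the allocation type; a simplified restatement of the code added below (the example_ name is a placeholder):

static struct irq_domain *example_domain_for_type(struct amd_iommu *iommu,
						  struct irq_alloc_info *info)
{
	switch (info->type) {
	case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT:
	case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT:
		return iommu->ir_domain;	/* remapping parent domain */
	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
		return iommu->msi_domain;	/* per-IOMMU PCI/MSI domain */
	default:
		return NULL;
	}
}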
Signed-off-by: Thomas Gleixner Acked-by: Joerg Roedel Link: https://lore.kernel.org/r/20200826112331.634777249@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 65 +++++++++++++-------------------------- 1 file changed, 21 insertions(+), 44 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 0a6a91b0e740..c53a02da0f7c 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3527,77 +3527,54 @@ static void irte_ga_clear_allocated(struct irq_remap_table *table, int index) static int get_devid(struct irq_alloc_info *info) { - int devid = -1; - switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC: - devid = get_ioapic_devid(info->ioapic_id); - break; + case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT: + return get_ioapic_devid(info->ioapic_id); case X86_IRQ_ALLOC_TYPE_HPET: - devid = get_hpet_devid(info->hpet_id); - break; + case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT: + return get_hpet_devid(info->hpet_id); case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: - devid = get_device_id(&info->msi_dev->dev); - break; + return get_device_id(&info->msi_dev->dev); default: - BUG_ON(1); - break; + WARN_ON_ONCE(1); + return -1; } - - return devid; } -static struct irq_domain *get_ir_irq_domain(struct irq_alloc_info *info) +static struct irq_domain *get_irq_domain_for_devid(struct irq_alloc_info *info, + int devid) { - struct amd_iommu *iommu; - int devid; + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; - if (!info) + if (!iommu) return NULL; switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT: case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT: - break; + return iommu->ir_domain; + case X86_IRQ_ALLOC_TYPE_PCI_MSI: + case X86_IRQ_ALLOC_TYPE_PCI_MSIX: + return iommu->msi_domain; default: + WARN_ON_ONCE(1); return NULL; } - - devid = get_devid(info); - if (devid >= 0) { - iommu = amd_iommu_rlookup_table[devid]; - if (iommu) - return iommu->ir_domain; - } - - return NULL; } static struct irq_domain *get_irq_domain(struct irq_alloc_info *info) { - struct amd_iommu *iommu; int devid; if (!info) return NULL; - switch (info->type) { - case X86_IRQ_ALLOC_TYPE_PCI_MSI: - case X86_IRQ_ALLOC_TYPE_PCI_MSIX: - devid = get_device_id(&info->msi_dev->dev); - if (devid < 0) - return NULL; - - iommu = amd_iommu_rlookup_table[devid]; - if (iommu) - return iommu->msi_domain; - break; - default: - break; - } - - return NULL; + devid = get_devid(info); + if (devid < 0) + return NULL; + return get_irq_domain_for_devid(info, devid); } struct irq_remap_ops amd_iommu_irq_ops = { @@ -3606,7 +3583,7 @@ struct irq_remap_ops amd_iommu_irq_ops = { .disable = amd_iommu_disable, .reenable = amd_iommu_reenable, .enable_faulting = amd_iommu_enable_faulting, - .get_ir_irq_domain = get_ir_irq_domain, + .get_ir_irq_domain = get_irq_domain, .get_irq_domain = get_irq_domain, }; -- Gitee From 5d1f87d6dfcb4077eec3070956c0628538e0c210 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:39 +0200 Subject: [PATCH 0327/2161] iommu/irq_remapping: Consolidate irq domain lookup commit 6b6256e616f7e10c4434cfcd32371fc33ca94e48 upstream. Now that the iommu implementations handle the X86_*_GET_PARENT_DOMAIN types, consolidate the two getter functions. 
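On the caller side nothing changes except the entry point; a minimal sketch of the lookup convention used by the IOAPIC setup path after this patch (field names as they exist at this point in the series; the example_ wrapper is hypothetical):

static struct irq_domain *example_ioapic_parent_domain(int ioapic)
{
	struct irq_alloc_info info;
	struct irq_domain *parent;

	init_irq_alloc_info(&info, NULL);
	info.type = X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT;
	info.ioapic_id = mpc_ioapic_id(ioapic);

	parent = irq_remapping_get_irq_domain(&info);
	return parent ? parent : x86_vector_domain;	/* no remapping: plain vector domain */
}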
Signed-off-by: Thomas Gleixner Acked-by: Joerg Roedel Link: https://lore.kernel.org/r/20200826112331.741909337@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/irq_remapping.h | 8 -------- arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/apic/msi.c | 2 +- drivers/iommu/amd_iommu.c | 1 - drivers/iommu/hyperv-iommu.c | 4 ++-- drivers/iommu/intel/irq_remapping.c | 1 - drivers/iommu/irq_remapping.c | 23 +---------------------- drivers/iommu/irq_remapping.h | 5 +---- 8 files changed, 6 insertions(+), 40 deletions(-) diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 4bc985f1e2e4..af4a151d70b3 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -44,8 +44,6 @@ extern int irq_remapping_reenable(int); extern int irq_remap_enable_fault_handling(void); extern void panic_if_irq_remap(const char *msg); -extern struct irq_domain * -irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info); extern struct irq_domain * irq_remapping_get_irq_domain(struct irq_alloc_info *info); @@ -73,12 +71,6 @@ static inline void panic_if_irq_remap(const char *msg) { } -static inline struct irq_domain * -irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info) -{ - return NULL; -} - static inline struct irq_domain * irq_remapping_get_irq_domain(struct irq_alloc_info *info) { diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c710efaa231d..33a945848032 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2322,7 +2322,7 @@ static int mp_irqdomain_create(int ioapic) init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT; info.ioapic_id = mpc_ioapic_id(ioapic); - parent = irq_remapping_get_ir_irq_domain(&info); + parent = irq_remapping_get_irq_domain(&info); if (!parent) parent = x86_vector_domain; else diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 3abd48e166bf..88d718bbf76d 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -474,7 +474,7 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT; info.hpet_id = hpet_id; - parent = irq_remapping_get_ir_irq_domain(&info); + parent = irq_remapping_get_irq_domain(&info); if (parent == NULL) parent = x86_vector_domain; else diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index c53a02da0f7c..403278bd6c76 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3583,7 +3583,6 @@ struct irq_remap_ops amd_iommu_irq_ops = { .disable = amd_iommu_disable, .reenable = amd_iommu_reenable, .enable_faulting = amd_iommu_enable_faulting, - .get_ir_irq_domain = get_irq_domain, .get_irq_domain = get_irq_domain, }; diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index a8c7c457c2d4..167a12a7f6f5 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -182,7 +182,7 @@ static int __init hyperv_enable_irq_remapping(void) return IRQ_REMAP_X2APIC_MODE; } -static struct irq_domain *hyperv_get_ir_irq_domain(struct irq_alloc_info *info) +static struct irq_domain *hyperv_get_irq_domain(struct irq_alloc_info *info) { if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT) return ioapic_ir_domain; @@ -193,7 +193,7 @@ static struct irq_domain *hyperv_get_ir_irq_domain(struct irq_alloc_info *info) struct irq_remap_ops hyperv_irq_remap_ops = { .prepare = 
hyperv_prepare_irq_remapping, .enable = hyperv_enable_irq_remapping, - .get_ir_irq_domain = hyperv_get_ir_irq_domain, + .get_irq_domain = hyperv_get_irq_domain, }; #endif diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index e9b61023c044..04eb2d64201d 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1137,7 +1137,6 @@ struct irq_remap_ops intel_irq_remap_ops = { .disable = disable_irq_remapping, .reenable = reenable_irq_remapping, .enable_faulting = enable_drhd_fault_handling, - .get_ir_irq_domain = intel_get_irq_domain, .get_irq_domain = intel_get_irq_domain, }; diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 83f36f61416e..2d84b1ed205e 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -159,34 +159,13 @@ void panic_if_irq_remap(const char *msg) panic(msg); } -/** - * irq_remapping_get_ir_irq_domain - Get the irqdomain associated with the IOMMU - * device serving request @info - * @info: interrupt allocation information, used to identify the IOMMU device - * - * It's used to get parent irqdomain for HPET and IOAPIC irqdomains. - * Returns pointer to IRQ domain, or NULL on failure. - */ -struct irq_domain * -irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info) -{ - if (!remap_ops || !remap_ops->get_ir_irq_domain) - return NULL; - - return remap_ops->get_ir_irq_domain(info); -} - /** * irq_remapping_get_irq_domain - Get the irqdomain serving the request @info * @info: interrupt allocation information, used to identify the IOMMU device * - * There will be one PCI MSI/MSIX irqdomain associated with each interrupt - * remapping device, so this interface is used to retrieve the PCI MSI/MSIX - * irqdomain serving request @info. * Returns pointer to IRQ domain, or NULL on failure. */ -struct irq_domain * -irq_remapping_get_irq_domain(struct irq_alloc_info *info) +struct irq_domain *irq_remapping_get_irq_domain(struct irq_alloc_info *info) { if (!remap_ops || !remap_ops->get_irq_domain) return NULL; diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h index 6a190d504eb6..1661b3d75920 100644 --- a/drivers/iommu/irq_remapping.h +++ b/drivers/iommu/irq_remapping.h @@ -43,10 +43,7 @@ struct irq_remap_ops { /* Enable fault handling */ int (*enable_faulting)(void); - /* Get the irqdomain associated the IOMMU device */ - struct irq_domain *(*get_ir_irq_domain)(struct irq_alloc_info *); - - /* Get the MSI irqdomain associated with the IOMMU device */ + /* Get the irqdomain associated to IOMMU device */ struct irq_domain *(*get_irq_domain)(struct irq_alloc_info *); }; -- Gitee From cbd5f7e5d454e3649cefd97318c3f6583beac34d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:40 +0200 Subject: [PATCH 0328/2161] x86/irq: Prepare consolidation of irq_alloc_info commit 874d9b3a958897e914982962fa25b3bb9dc07988 upstream. struct irq_alloc_info is a horrible zoo of unnamed structs in a union. Many of the struct fields can be generic and don't have to be type specific like hpet_id, ioapic_id... Provide a generic set of members to prepare for the consolidation. The goal is to make irq_alloc_info have the same basic member as the generic msi_alloc_info so generic MSI domain ops can be reused and yet more mess can be avoided when (non-PCI) device MSI support comes along. 
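As an example of where this is heading (it previews the HPET conversion in the following patch of this series), allocation sites stop touching type-specific union members and fill the generic fields instead; the example_ wrapper is hypothetical:

static int example_hpet_assign_irq(struct irq_domain *domain,
				   struct hpet_channel *hc, int dev_num)
{
	struct irq_alloc_info info;

	init_irq_alloc_info(&info, NULL);
	info.type  = X86_IRQ_ALLOC_TYPE_HPET;
	info.devid = hpet_dev_id(domain);	/* was info.hpet_id    */
	info.hwirq = dev_num;			/* was info.hpet_index */
	info.data  = hc;			/* was info.hpet_data  */

	return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info);
}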
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200826112331.849577844@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/hw_irq.h | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index c8be2c0e3b32..9a75b21dc72c 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -66,10 +66,25 @@ enum irq_alloc_type { X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT, }; +/** + * irq_alloc_info - X86 specific interrupt allocation info + * @type: X86 specific allocation type + * @flags: Flags for allocation tweaks + * @devid: Device ID for allocations + * @hwirq: Associated hw interrupt number in the domain + * @mask: CPU mask for vector allocation + * @desc: Pointer to msi descriptor + * @data: Allocation specific data + */ struct irq_alloc_info { enum irq_alloc_type type; u32 flags; - const struct cpumask *mask; /* CPU mask for vector allocation */ + u32 devid; + irq_hw_number_t hwirq; + const struct cpumask *mask; + struct msi_desc *desc; + void *data; + union { int unused; #ifdef CONFIG_HPET_TIMER @@ -109,11 +124,6 @@ struct irq_alloc_info { unsigned long uv_offset; char *uv_name; }; -#endif -#if IS_ENABLED(CONFIG_VMD) - struct { - struct msi_desc *desc; - }; #endif }; }; -- Gitee From 76400b2e4b9868e0efb02dda857733576c95aa97 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:41 +0200 Subject: [PATCH 0329/2161] x86/msi: Consolidate HPET allocation commit 2bf1e7bcedb8802cb4fc65757b229edfe112a4bb upstream. None of the magic HPET fields are required in any way. Signed-off-by: Thomas Gleixner Acked-by: Joerg Roedel Link: https://lore.kernel.org/r/20200826112331.943993771@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/hw_irq.h | 7 ------- arch/x86/kernel/apic/msi.c | 14 +++++++------- drivers/iommu/amd_iommu.c | 2 +- drivers/iommu/intel/irq_remapping.c | 4 ++-- 4 files changed, 10 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 9a75b21dc72c..1bd14aa9cb50 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -87,13 +87,6 @@ struct irq_alloc_info { union { int unused; -#ifdef CONFIG_HPET_TIMER - struct { - int hpet_id; - int hpet_index; - void *hpet_data; - }; -#endif #ifdef CONFIG_PCI_MSI struct { struct pci_dev *msi_dev; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 88d718bbf76d..f488868c8789 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -423,7 +423,7 @@ static struct irq_chip hpet_msi_controller __ro_after_init = { static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info, msi_alloc_info_t *arg) { - return arg->hpet_index; + return arg->hwirq; } static int hpet_msi_init(struct irq_domain *domain, @@ -431,8 +431,8 @@ static int hpet_msi_init(struct irq_domain *domain, irq_hw_number_t hwirq, msi_alloc_info_t *arg) { irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); - irq_domain_set_info(domain, virq, arg->hpet_index, info->chip, NULL, - handle_edge_irq, arg->hpet_data, "edge"); + irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL, + handle_edge_irq, arg->data, "edge"); return 0; } @@ -473,7 +473,7 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT; - info.hpet_id = hpet_id; + info.devid = hpet_id; parent = 
irq_remapping_get_irq_domain(&info); if (parent == NULL) parent = x86_vector_domain; @@ -502,9 +502,9 @@ int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_HPET; - info.hpet_data = hc; - info.hpet_id = hpet_dev_id(domain); - info.hpet_index = dev_num; + info.data = hc; + info.devid = hpet_dev_id(domain); + info.hwirq = dev_num; return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); } diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 403278bd6c76..d250e956fa8c 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3533,7 +3533,7 @@ static int get_devid(struct irq_alloc_info *info) return get_ioapic_devid(info->ioapic_id); case X86_IRQ_ALLOC_TYPE_HPET: case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT: - return get_hpet_devid(info->hpet_id); + return get_hpet_devid(info->devid); case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: return get_device_id(&info->msi_dev->dev); diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 04eb2d64201d..4cbfccafc7aa 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1121,7 +1121,7 @@ static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info) case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT: return map_ioapic_to_ir(info->ioapic_id); case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT: - return map_hpet_to_ir(info->hpet_id); + return map_hpet_to_ir(info->devid); case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: return map_dev_to_ir(info->msi_dev); @@ -1291,7 +1291,7 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: if (info->type == X86_IRQ_ALLOC_TYPE_HPET) - set_hpet_sid(irte, info->hpet_id); + set_hpet_sid(irte, info->devid); else set_msi_sid(irte, info->msi_dev); -- Gitee From abe9befa5a44bb359f9a45f0a0c73372b5c1a0ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Aug 2020 15:30:40 +0200 Subject: [PATCH 0330/2161] drm/nouveau/gk20a: stop setting DMA_ATTR_NON_CONSISTENT commit e0ec8a4d6432104bf0e985ccfa475f8742c78cc6 upstream. DMA_ATTR_NON_CONSISTENT is a no-op except on PA-RISC and a few MIPS configs, so don't set it in this ARM specific driver part. Signed-off-by: Christoph Hellwig --- drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c index 985f2990ab0d..13d4d7ac0697 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c @@ -594,8 +594,7 @@ gk20a_instmem_new(struct nvkm_device *device, int index, nvkm_info(&imem->base.subdev, "using IOMMU\n"); } else { - imem->attrs = DMA_ATTR_NON_CONSISTENT | - DMA_ATTR_WEAK_ORDERING | + imem->attrs = DMA_ATTR_WEAK_ORDERING | DMA_ATTR_WRITE_COMBINE; nvkm_info(&imem->base.subdev, "using DMA API\n"); -- Gitee From ee093479b8d8542487507209c84c2c6803033907 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:42 +0200 Subject: [PATCH 0331/2161] x86_ioapic_Consolidate_IOAPIC_allocation commit 33a65ba470c2b7031e513f7b165e68f51cfc55eb upstream. Move the IOAPIC specific fields into their own struct and reuse the common devid. 
Get rid of the #ifdeffery as it does not matter at all whether the alloc info is a couple of bytes longer or not. Signed-off-by: Thomas Gleixner Acked-by: Wei Liu Acked-by: Joerg Roedel Link: https://lore.kernel.org/r/20200826112332.054367732@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/hw_irq.h | 23 +++++----- arch/x86/kernel/apic/io_apic.c | 70 ++++++++++++++--------------- arch/x86/kernel/devicetree.c | 4 +- drivers/iommu/amd_iommu.c | 14 +++--- drivers/iommu/hyperv-iommu.c | 2 +- drivers/iommu/intel/irq_remapping.c | 18 ++++---- 6 files changed, 66 insertions(+), 65 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 1bd14aa9cb50..ecdf420807a8 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -66,6 +66,15 @@ enum irq_alloc_type { X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT, }; +struct ioapic_alloc_info { + int pin; + int node; + u32 trigger : 1; + u32 polarity : 1; + u32 valid : 1; + struct IO_APIC_route_entry *entry; +}; + /** * irq_alloc_info - X86 specific interrupt allocation info * @type: X86 specific allocation type @@ -75,6 +84,8 @@ enum irq_alloc_type { * @mask: CPU mask for vector allocation * @desc: Pointer to msi descriptor * @data: Allocation specific data + * + * @ioapic: IOAPIC specific allocation data */ struct irq_alloc_info { enum irq_alloc_type type; @@ -86,6 +97,7 @@ struct irq_alloc_info { void *data; union { + struct ioapic_alloc_info ioapic; int unused; #ifdef CONFIG_PCI_MSI struct { @@ -93,17 +105,6 @@ struct irq_alloc_info { irq_hw_number_t msi_hwirq; }; #endif -#ifdef CONFIG_X86_IO_APIC - struct { - int ioapic_id; - int ioapic_pin; - int ioapic_node; - u32 ioapic_trigger : 1; - u32 ioapic_polarity : 1; - u32 ioapic_valid : 1; - struct IO_APIC_route_entry *ioapic_entry; - }; -#endif #ifdef CONFIG_DMAR_TABLE struct { int dmar_id; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 33a945848032..4672b9455e84 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -873,10 +873,10 @@ void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, { init_irq_alloc_info(info, NULL); info->type = X86_IRQ_ALLOC_TYPE_IOAPIC; - info->ioapic_node = node; - info->ioapic_trigger = trigger; - info->ioapic_polarity = polarity; - info->ioapic_valid = 1; + info->ioapic.node = node; + info->ioapic.trigger = trigger; + info->ioapic.polarity = polarity; + info->ioapic.valid = 1; } #ifndef CONFIG_ACPI @@ -891,32 +891,32 @@ static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, copy_irq_alloc_info(dst, src); dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC; - dst->ioapic_id = mpc_ioapic_id(ioapic_idx); - dst->ioapic_pin = pin; - dst->ioapic_valid = 1; - if (src && src->ioapic_valid) { - dst->ioapic_node = src->ioapic_node; - dst->ioapic_trigger = src->ioapic_trigger; - dst->ioapic_polarity = src->ioapic_polarity; + dst->devid = mpc_ioapic_id(ioapic_idx); + dst->ioapic.pin = pin; + dst->ioapic.valid = 1; + if (src && src->ioapic.valid) { + dst->ioapic.node = src->ioapic.node; + dst->ioapic.trigger = src->ioapic.trigger; + dst->ioapic.polarity = src->ioapic.polarity; } else { - dst->ioapic_node = NUMA_NO_NODE; + dst->ioapic.node = NUMA_NO_NODE; if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) { - dst->ioapic_trigger = trigger; - dst->ioapic_polarity = polarity; + dst->ioapic.trigger = trigger; + dst->ioapic.polarity = polarity; } else { /* * PCI interrupts are always active low level * triggered. 
*/ - dst->ioapic_trigger = IOAPIC_LEVEL; - dst->ioapic_polarity = IOAPIC_POL_LOW; + dst->ioapic.trigger = IOAPIC_LEVEL; + dst->ioapic.polarity = IOAPIC_POL_LOW; } } } static int ioapic_alloc_attr_node(struct irq_alloc_info *info) { - return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE; + return (info && info->ioapic.valid) ? info->ioapic.node : NUMA_NO_NODE; } static void mp_register_handler(unsigned int irq, unsigned long trigger) @@ -946,14 +946,14 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) * pin with real trigger and polarity attributes. */ if (irq < nr_legacy_irqs() && data->count == 1) { - if (info->ioapic_trigger != data->trigger) - mp_register_handler(irq, info->ioapic_trigger); - data->entry.trigger = data->trigger = info->ioapic_trigger; - data->entry.polarity = data->polarity = info->ioapic_polarity; + if (info->ioapic.trigger != data->trigger) + mp_register_handler(irq, info->ioapic.trigger); + data->entry.trigger = data->trigger = info->ioapic.trigger; + data->entry.polarity = data->polarity = info->ioapic.polarity; } - return data->trigger == info->ioapic_trigger && - data->polarity == info->ioapic_polarity; + return data->trigger == info->ioapic.trigger && + data->polarity == info->ioapic.polarity; } static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, @@ -1015,7 +1015,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, if (!mp_check_pin_attr(irq, info)) return -EBUSY; if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic, - info->ioapic_pin)) + info->ioapic.pin)) return -ENOMEM; } else { info->flags |= X86_IRQ_ALLOC_LEGACY; @@ -2115,8 +2115,8 @@ static int mp_alloc_timer_irq(int ioapic, int pin) struct irq_alloc_info info; ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); - info.ioapic_id = mpc_ioapic_id(ioapic); - info.ioapic_pin = pin; + info.devid = mpc_ioapic_id(ioapic); + info.ioapic.pin = pin; mutex_lock(&ioapic_mutex); irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info); mutex_unlock(&ioapic_mutex); @@ -2321,7 +2321,7 @@ static int mp_irqdomain_create(int ioapic) init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT; - info.ioapic_id = mpc_ioapic_id(ioapic); + info.devid = mpc_ioapic_id(ioapic); parent = irq_remapping_get_irq_domain(&info); if (!parent) parent = x86_vector_domain; @@ -2956,9 +2956,9 @@ int mp_ioapic_registered(u32 gsi_base) static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, struct irq_alloc_info *info) { - if (info && info->ioapic_valid) { - data->trigger = info->ioapic_trigger; - data->polarity = info->ioapic_polarity; + if (info && info->ioapic.valid) { + data->trigger = info->ioapic.trigger; + data->polarity = info->ioapic.polarity; } else if (acpi_get_override_irq(gsi, &data->trigger, &data->polarity) < 0) { /* PCI interrupts are always active low level triggered. 
*/ @@ -3004,7 +3004,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return -EINVAL; ioapic = mp_irqdomain_ioapic_idx(domain); - pin = info->ioapic_pin; + pin = info->ioapic.pin; if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0) return -EEXIST; @@ -3012,7 +3012,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, if (!data) return -ENOMEM; - info->ioapic_entry = &data->entry; + info->ioapic.entry = &data->entry; ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); if (ret < 0) { kfree(data); @@ -3020,7 +3020,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, } INIT_LIST_HEAD(&data->irq_2_pin); - irq_data->hwirq = info->ioapic_pin; + irq_data->hwirq = info->ioapic.pin; irq_data->chip = (domain->parent == x86_vector_domain) ? &ioapic_chip : &ioapic_ir_chip; irq_data->chip_data = data; @@ -3030,8 +3030,8 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); local_irq_save(flags); - if (info->ioapic_entry) - mp_setup_entry(cfg, data, info->ioapic_entry); + if (info->ioapic.entry) + mp_setup_entry(cfg, data, info->ioapic.entry); mp_register_handler(virq, data->trigger); if (virq < nr_legacy_irqs()) legacy_pic->mask(virq); diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index a0e8fc7d85f1..ddffd80f5c52 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -229,8 +229,8 @@ static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, it = &of_ioapic_type[type_index]; ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity); - tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); - tmp.ioapic_pin = fwspec->param[0]; + tmp.devid = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); + tmp.ioapic.pin = fwspec->param[0]; return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp); } diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index d250e956fa8c..4c86431e07aa 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3530,7 +3530,7 @@ static int get_devid(struct irq_alloc_info *info) switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC: case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT: - return get_ioapic_devid(info->ioapic_id); + return get_ioapic_devid(info->devid); case X86_IRQ_ALLOC_TYPE_HPET: case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT: return get_hpet_devid(info->devid); @@ -3608,15 +3608,15 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC: /* Setup IOAPIC entry */ - entry = info->ioapic_entry; - info->ioapic_entry = NULL; + entry = info->ioapic.entry; + info->ioapic.entry = NULL; memset(entry, 0, sizeof(*entry)); entry->vector = index; entry->mask = 0; - entry->trigger = info->ioapic_trigger; - entry->polarity = info->ioapic_polarity; + entry->trigger = info->ioapic.trigger; + entry->polarity = info->ioapic.polarity; /* Mask level triggered irqs. 
*/ - if (info->ioapic_trigger) + if (info->ioapic.trigger) entry->mask = 1; break; @@ -3702,7 +3702,7 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, iommu->irte_ops->set_allocated(table, i); } WARN_ON(table->min_index != 32); - index = info->ioapic_pin; + index = info->ioapic.pin; } else { index = -ENOMEM; } diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index 167a12a7f6f5..51c83e2cfafc 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -101,7 +101,7 @@ static int hyperv_irq_remapping_alloc(struct irq_domain *domain, * in the chip_data and hyperv_irq_remapping_activate()/hyperv_ir_set_ * affinity() set vector and dest_apicid directly into IO-APIC entry. */ - irq_data->chip_data = info->ioapic_entry; + irq_data->chip_data = info->ioapic.entry; /* * Hypver-V IO APIC irq affinity should be in the scope of diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 4cbfccafc7aa..beb72d675aa2 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1119,7 +1119,7 @@ static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info) switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT: - return map_ioapic_to_ir(info->ioapic_id); + return map_ioapic_to_ir(info->devid); case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT: return map_hpet_to_ir(info->devid); case X86_IRQ_ALLOC_TYPE_PCI_MSI: @@ -1260,16 +1260,16 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC: /* Set source-id of interrupt request */ - set_ioapic_sid(irte, info->ioapic_id); + set_ioapic_sid(irte, info->devid); apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n", - info->ioapic_id, irte->present, irte->fpd, + info->devid, irte->present, irte->fpd, irte->dst_mode, irte->redir_hint, irte->trigger_mode, irte->dlvry_mode, irte->avail, irte->vector, irte->dest_id, irte->sid, irte->sq, irte->svt); - entry = (struct IR_IO_APIC_route_entry *)info->ioapic_entry; - info->ioapic_entry = NULL; + entry = (struct IR_IO_APIC_route_entry *)info->ioapic.entry; + info->ioapic.entry = NULL; memset(entry, 0, sizeof(*entry)); entry->index2 = (index >> 15) & 0x1; entry->zero = 0; @@ -1279,11 +1279,11 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, * IO-APIC RTE will be configured with virtual vector. * irq handler will do the explicit EOI to the io-apic. */ - entry->vector = info->ioapic_pin; + entry->vector = info->ioapic.pin; entry->mask = 0; /* enable IRQ */ - entry->trigger = info->ioapic_trigger; - entry->polarity = info->ioapic_polarity; - if (info->ioapic_trigger) + entry->trigger = info->ioapic.trigger; + entry->polarity = info->ioapic.polarity; + if (info->ioapic.trigger) entry->mask = 1; /* Mask level triggered irqs. */ break; -- Gitee From e0a4dc20bad94741506a89db074ae08be3c0922d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:43 +0200 Subject: [PATCH 0332/2161] x86/irq: Consolidate DMAR irq allocation commit 55e039157281f9d8ee7d595c2529a3fd4e790b52 upstream. None of the DMAR specific fields are required. 
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200826112332.163462706@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/hw_irq.h | 6 ------ arch/x86/kernel/apic/msi.c | 10 +++++----- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index ecdf420807a8..e1382b7c6a01 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -105,12 +105,6 @@ struct irq_alloc_info { irq_hw_number_t msi_hwirq; }; #endif -#ifdef CONFIG_DMAR_TABLE - struct { - int dmar_id; - void *dmar_data; - }; -#endif #ifdef CONFIG_X86_UV struct { int uv_limit; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index f488868c8789..7680b1ba3aab 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -326,15 +326,15 @@ static struct irq_chip dmar_msi_controller = { static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info, msi_alloc_info_t *arg) { - return arg->dmar_id; + return arg->hwirq; } static int dmar_msi_init(struct irq_domain *domain, struct msi_domain_info *info, unsigned int virq, irq_hw_number_t hwirq, msi_alloc_info_t *arg) { - irq_domain_set_info(domain, virq, arg->dmar_id, info->chip, NULL, - handle_edge_irq, arg->dmar_data, "edge"); + irq_domain_set_info(domain, virq, arg->devid, info->chip, NULL, + handle_edge_irq, arg->data, "edge"); return 0; } @@ -381,8 +381,8 @@ int dmar_alloc_hwirq(int id, int node, void *arg) init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_DMAR; - info.dmar_id = id; - info.dmar_data = arg; + info.devid = id; + info.data = arg; return irq_domain_alloc_irqs(domain, 1, node, &info); } -- Gitee From 7040f1b62c03b886b18cc6304a14fb5f1d4bae21 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:44 +0200 Subject: [PATCH 0333/2161] x86/irq: Consolidate UV domain allocation commit 0f5cbdaf203e201f151c2e44a49f6165a7d2c2f9 upstream. Move the UV specific fields into their own struct for readability sake. Get rid of the #ifdeffery as it does not matter at all whether the alloc info is a couple of bytes longer or not. 
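To make the "a couple of bytes longer" argument concrete, here is a small standalone check (the struct layouts are invented stand-ins loosely modeled on the members in this series, not the real kernel definitions): a union is only as large as its largest member, so keeping both alternatives unconditionally present costs at most a handful of bytes in a short-lived on-stack structure.

#include <stdio.h>

struct ioapic_like {
	int pin;
	int node;
	unsigned int trigger : 1, polarity : 1, valid : 1;
	void *entry;
};

struct uv_like {
	int limit;
	int blade;
	unsigned long offset;
	char *name;
};

union alloc_variants {		/* no #ifdef: both members always exist */
	struct ioapic_like ioapic;
	struct uv_like uv;
};

int main(void)
{
	printf("ioapic %zu, uv %zu, union %zu bytes\n",
	       sizeof(struct ioapic_like), sizeof(struct uv_like),
	       sizeof(union alloc_variants));
	return 0;
}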
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200826112332.255792469@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/hw_irq.h | 21 ++++++++++++--------- arch/x86/platform/uv/uv_irq.c | 16 ++++++++-------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index e1382b7c6a01..408abac74f04 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -75,6 +75,14 @@ struct ioapic_alloc_info { struct IO_APIC_route_entry *entry; }; +struct uv_alloc_info { + int limit; + int blade; + unsigned long offset; + char *name; + +}; + /** * irq_alloc_info - X86 specific interrupt allocation info * @type: X86 specific allocation type @@ -86,7 +94,8 @@ struct ioapic_alloc_info { * @data: Allocation specific data * * @ioapic: IOAPIC specific allocation data - */ + * @uv: UV specific allocation data +*/ struct irq_alloc_info { enum irq_alloc_type type; u32 flags; @@ -98,20 +107,14 @@ struct irq_alloc_info { union { struct ioapic_alloc_info ioapic; + struct uv_alloc_info uv; + int unused; #ifdef CONFIG_PCI_MSI struct { struct pci_dev *msi_dev; irq_hw_number_t msi_hwirq; }; -#endif -#ifdef CONFIG_X86_UV - struct { - int uv_limit; - int uv_blade; - unsigned long uv_offset; - char *uv_name; - }; #endif }; }; diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index abb6075397f0..18ca2261cc9a 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -90,15 +90,15 @@ static int uv_domain_alloc(struct irq_domain *domain, unsigned int virq, ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); if (ret >= 0) { - if (info->uv_limit == UV_AFFINITY_CPU) + if (info->uv.limit == UV_AFFINITY_CPU) irq_set_status_flags(virq, IRQ_NO_BALANCING); else irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); - chip_data->pnode = uv_blade_to_pnode(info->uv_blade); - chip_data->offset = info->uv_offset; + chip_data->pnode = uv_blade_to_pnode(info->uv.blade); + chip_data->offset = info->uv.offset; irq_domain_set_info(domain, virq, virq, &uv_irq_chip, chip_data, - handle_percpu_irq, NULL, info->uv_name); + handle_percpu_irq, NULL, info->uv.name); } else { kfree(chip_data); } @@ -193,10 +193,10 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, init_irq_alloc_info(&info, cpumask_of(cpu)); info.type = X86_IRQ_ALLOC_TYPE_UV; - info.uv_limit = limit; - info.uv_blade = mmr_blade; - info.uv_offset = mmr_offset; - info.uv_name = irq_name; + info.uv.limit = limit; + info.uv.blade = mmr_blade; + info.uv.offset = mmr_offset; + info.uv.name = irq_name; return irq_domain_alloc_irqs(domain, 1, uv_blade_to_memory_nid(mmr_blade), &info); -- Gitee From a56c34f3d7e3ab7550688c10d306e7b6c8b2e322 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:45 +0200 Subject: [PATCH 0334/2161] PCI/MSI: Rework pci_msi_domain_calc_hwirq() commit dfb9eb7cf6cd0c0b0f2a1111fcc47b0a297b097d upstream. Retrieve the PCI device from the msi descriptor instead of doing so at the call sites. 
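The hunk below keeps the existing ID encoding and only changes where the pci_dev comes from. As a worked illustration of that encoding (the helper name and the example values here are made up), pci_msi_domain_calc_hwirq() packs the MSI entry number into the low bits, the 16-bit bus/devfn requester ID shifted left by 11, and the PCI domain number shifted left by 27:

#include <stdio.h>
#include <stdint.h>

static uint64_t calc_hwirq(uint32_t entry_nr, uint32_t rid, uint32_t domain_nr)
{
	return (uint64_t)entry_nr |
	       (uint64_t)rid << 11 |
	       ((uint64_t)domain_nr & 0xFFFFFFFF) << 27;
}

int main(void)
{
	/* domain 0000, bus 0x3a, devfn 0x00, MSI-X entry 5 */
	printf("hwirq = 0x%llx\n",
	       (unsigned long long)calc_hwirq(5, 0x3a00, 0));
	/* prints: hwirq = 0x1d00005 */
	return 0;
}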
Signed-off-by: Thomas Gleixner Acked-by: Bjorn Helgaas Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20200826112332.352583299@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/apic/msi.c | 2 +- drivers/pci/msi.c | 9 ++++----- include/linux/msi.h | 3 +-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 7680b1ba3aab..38c094f70bf6 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -230,7 +230,7 @@ EXPORT_SYMBOL_GPL(pci_msi_prepare); void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) { - arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc); + arg->msi_hwirq = pci_msi_domain_calc_hwirq(desc); } EXPORT_SYMBOL_GPL(pci_msi_set_desc); diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 771041784e64..5daca441c84c 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1366,14 +1366,14 @@ void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg *msg) /** * pci_msi_domain_calc_hwirq - Generate a unique ID for an MSI source - * @dev: Pointer to the PCI device * @desc: Pointer to the MSI descriptor * * The ID number is only used within the irqdomain. */ -irq_hw_number_t pci_msi_domain_calc_hwirq(struct pci_dev *dev, - struct msi_desc *desc) +irq_hw_number_t pci_msi_domain_calc_hwirq(struct msi_desc *desc) { + struct pci_dev *dev = msi_desc_to_pci_dev(desc); + return (irq_hw_number_t)desc->msi_attrib.entry_nr | pci_dev_id(dev) << 11 | (pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 27; @@ -1426,8 +1426,7 @@ static void pci_msi_domain_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) { arg->desc = desc; - arg->hwirq = pci_msi_domain_calc_hwirq(msi_desc_to_pci_dev(desc), - desc); + arg->hwirq = pci_msi_domain_calc_hwirq(desc); } #else #define pci_msi_domain_set_desc NULL diff --git a/include/linux/msi.h b/include/linux/msi.h index d695e2eb2092..51da403ad134 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -375,8 +375,7 @@ void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg *msg); struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent); -irq_hw_number_t pci_msi_domain_calc_hwirq(struct pci_dev *dev, - struct msi_desc *desc); +irq_hw_number_t pci_msi_domain_calc_hwirq(struct msi_desc *desc); int pci_msi_domain_check_cap(struct irq_domain *domain, struct msi_domain_info *info, struct device *dev); u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev); -- Gitee From 116665a6fba6ce529d8b70f21b3e14fa51fc3869 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:46 +0200 Subject: [PATCH 0335/2161] x86/msi: Consolidate MSI allocation commit 3b9c1d377d67072d1d8a2373b4969103cca00dab upstream. Convert the interrupt remap drivers to retrieve the pci device from the msi descriptor and use info::hwirq. This is the first step to prepare x86 for using the generic MSI domain ops. 
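A minimal standalone sketch of the "derive it from the descriptor" pattern this conversion relies on (the structs and desc_to_dev() below are invented stand-ins for msi_desc, pci_dev and msi_desc_to_pci_dev(); they are not kernel code): once the device can be derived from the descriptor, the duplicated msi_dev pointer in the allocation info has nothing left to do.

#include <stdio.h>

struct device { const char *name; };
struct msi_desc { struct device *dev; unsigned int entry_nr; };

/* old shape: the device is duplicated next to the descriptor and the
 * two could, in principle, disagree */
struct alloc_info_old { struct msi_desc *desc; struct device *msi_dev; };

/* new shape: only the descriptor is carried; the device is derived
 * on demand */
struct alloc_info_new { struct msi_desc *desc; };

static struct device *desc_to_dev(struct msi_desc *desc)
{
	return desc->dev;
}

int main(void)
{
	struct device pci = { "0000:3a:00.0" };
	struct msi_desc desc = { &pci, 0 };
	struct alloc_info_new info = { &desc };

	printf("device for this MSI: %s\n", desc_to_dev(info.desc)->name);
	return 0;
}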
Signed-off-by: Thomas Gleixner Acked-by: Wei Liu Acked-by: Joerg Roedel Link: https://lore.kernel.org/r/20200826112332.466405395@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/hw_irq.h | 8 -------- arch/x86/kernel/apic/msi.c | 7 +++---- drivers/iommu/amd_iommu.c | 5 +++-- drivers/iommu/intel/irq_remapping.c | 4 ++-- drivers/pci/controller/pci-hyperv.c | 2 +- 5 files changed, 9 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 408abac74f04..80df0ba2a45b 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -108,14 +108,6 @@ struct irq_alloc_info { union { struct ioapic_alloc_info ioapic; struct uv_alloc_info uv; - - int unused; -#ifdef CONFIG_PCI_MSI - struct { - struct pci_dev *msi_dev; - irq_hw_number_t msi_hwirq; - }; -#endif }; }; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 38c094f70bf6..6a6cb95e6296 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -187,7 +187,6 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_PCI_MSI; - info.msi_dev = dev; domain = irq_remapping_get_irq_domain(&info); if (domain == NULL) @@ -206,7 +205,7 @@ void native_teardown_msi_irq(unsigned int irq) static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info, msi_alloc_info_t *arg) { - return arg->msi_hwirq; + return arg->hwirq; } int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, @@ -216,7 +215,6 @@ int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, struct msi_desc *desc = first_pci_msi_entry(pdev); init_irq_alloc_info(arg, NULL); - arg->msi_dev = pdev; if (desc->msi_attrib.is_msix) { arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; } else { @@ -230,7 +228,8 @@ EXPORT_SYMBOL_GPL(pci_msi_prepare); void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) { - arg->msi_hwirq = pci_msi_domain_calc_hwirq(desc); + arg->desc = desc; + arg->hwirq = pci_msi_domain_calc_hwirq(desc); } EXPORT_SYMBOL_GPL(pci_msi_set_desc); diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 4c86431e07aa..ec88f362be70 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3536,7 +3536,7 @@ static int get_devid(struct irq_alloc_info *info) return get_hpet_devid(info->devid); case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: - return get_device_id(&info->msi_dev->dev); + return get_device_id(msi_desc_to_dev(info->desc)); default: WARN_ON_ONCE(1); return -1; @@ -3710,7 +3710,8 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX) { bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI); - index = alloc_irq_index(devid, nr_irqs, align, info->msi_dev); + index = alloc_irq_index(devid, nr_irqs, align, + msi_desc_to_pci_dev(info->desc)); } else { index = alloc_irq_index(devid, nr_irqs, false, NULL); } diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index beb72d675aa2..b2dad4b639a6 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1124,7 +1124,7 @@ static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info) return map_hpet_to_ir(info->devid); case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: - return map_dev_to_ir(info->msi_dev); + return 
map_dev_to_ir(msi_desc_to_pci_dev(info->desc)); default: WARN_ON_ONCE(1); return NULL; @@ -1293,7 +1293,7 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, if (info->type == X86_IRQ_ALLOC_TYPE_HPET) set_hpet_sid(irte, info->devid); else - set_msi_sid(irte, info->msi_dev); + set_msi_sid(irte, msi_desc_to_pci_dev(info->desc)); msg->address_hi = MSI_ADDR_BASE_HI; msg->data = sub_handle; diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index f1f300218fab..8b05db522efe 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1513,7 +1513,7 @@ static struct irq_chip hv_msi_irq_chip = { static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info, msi_alloc_info_t *arg) { - return arg->msi_hwirq; + return arg->hwirq; } static struct msi_domain_ops hv_msi_ops = { -- Gitee From 8ac3a9e1afbec734a52c3f1d7d2851d9007094ed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:53 +0200 Subject: [PATCH 0336/2161] PCI/MSI: Provide pci_dev_has_special_msi_domain() helper commit 2fd602669ee6d749a7dc47b84b87cef1a5075999 upstream. Provide a helper function to check whether a PCI device is handled by a non-standard PCI/MSI domain. This will be used to exclude such devices which hang of a special bus, e.g. VMD, to be excluded from the irq domain override in irq remapping. Signed-off-by: Thomas Gleixner Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20200826112333.139387358@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 22 ++++++++++++++++++++++ include/linux/msi.h | 1 + 2 files changed, 23 insertions(+) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 5daca441c84c..c0279e86bf5d 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1576,4 +1576,26 @@ struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) dom = iort_get_device_domain(&pdev->dev, rid); return dom; } + +/** + * pci_dev_has_special_msi_domain - Check whether the device is handled by + * a non-standard PCI-MSI domain + * @pdev: The PCI device to check. + * + * Returns: True if the device irqdomain or the bus irqdomain is + * non-standard PCI/MSI. + */ +bool pci_dev_has_special_msi_domain(struct pci_dev *pdev) +{ + struct irq_domain *dom = dev_get_msi_domain(&pdev->dev); + + if (!dom) + dom = dev_get_msi_domain(&pdev->bus->dev); + + if (!dom) + return true; + + return dom->bus_token != DOMAIN_BUS_PCI_MSI; +} + #endif /* CONFIG_PCI_MSI_IRQ_DOMAIN */ diff --git a/include/linux/msi.h b/include/linux/msi.h index 51da403ad134..9c2e5f65a9ff 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -380,6 +380,7 @@ int pci_msi_domain_check_cap(struct irq_domain *domain, struct msi_domain_info *info, struct device *dev); u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev); struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev); +bool pci_dev_has_special_msi_domain(struct pci_dev *pdev); #else static inline struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) { -- Gitee From e9d1916dc983db1b0950f8d2b37cf48fe019aad5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:59 +0200 Subject: [PATCH 0337/2161] iommm/vt-d: Store irq domain in struct device commit 85a8dfc57a0b96785881735e09a61a0fde911ca4 upstream. As a first step to make X86 utilize the direct MSI irq domain operations store the irq domain pointer in the device struct when a device is probed. 
This is done from dmar_pci_bus_add_dev() because it has to work even when DMA remapping is disabled. It only overrides the irqdomain of devices which are handled by a regular PCI/MSI irq domain which protects PCI devices behind special busses like VMD which have their own irq domain. No functional change. It just avoids the redirection through arch_*_msi_irqs() and allows the PCI/MSI core to directly invoke the irq domain alloc/free functions instead of having to look up the irq domain for every single MSI interupt. Signed-off-by: Thomas Gleixner Acked-by: Joerg Roedel Link: https://lore.kernel.org/r/20200826112333.714566121@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 3 +++ drivers/iommu/intel/irq_remapping.c | 16 ++++++++++++++++ include/linux/intel-iommu.h | 7 +++++++ 3 files changed, 26 insertions(+) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index f0b614122446..0299d2c54a21 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -316,6 +316,9 @@ static int dmar_pci_bus_add_dev(struct dmar_pci_notify_info *info) if (ret < 0 && dmar_dev_scope_status == 0) dmar_dev_scope_status = ret; + if (ret >= 0) + intel_irq_remap_add_device(info); + return ret; } diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index b2dad4b639a6..f8d7e4ae261a 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1092,6 +1092,22 @@ static int reenable_irq_remapping(int eim) return -1; } +/* + * Store the MSI remapping domain pointer in the device if enabled. + * + * This is called from dmar_pci_bus_add_dev() so it works even when DMA + * remapping is disabled. Only update the pointer if the device is not + * already handled by a non default PCI/MSI interrupt domain. This protects + * e.g. VMD devices. + */ +void intel_irq_remap_add_device(struct dmar_pci_notify_info *info) +{ + if (!irq_remapping_enabled || pci_dev_has_special_msi_domain(info->dev)) + return; + + dev_set_msi_domain(&info->dev->dev, map_dev_to_ir(info->dev)); +} + static void prepare_irte(struct irte *irte, int vector, unsigned int dest) { memset(irte, 0, sizeof(*irte)); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index bdf80d7a70fb..287e2005ecad 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -425,6 +425,8 @@ struct q_inval { int free_cnt; }; +struct dmar_pci_notify_info; + #ifdef CONFIG_IRQ_REMAP /* 1MB - maximum possible interrupt remapping table size */ #define INTR_REMAP_PAGE_ORDER 8 @@ -439,6 +441,11 @@ struct ir_table { struct irte *base; unsigned long *bitmap; }; + +void intel_irq_remap_add_device(struct dmar_pci_notify_info *info); +#else +static inline void +intel_irq_remap_add_device(struct dmar_pci_notify_info *info) { } #endif struct iommu_flush { -- Gitee From c23a20a80464debb588f77cd71f3830e16f38f3b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:17:00 +0200 Subject: [PATCH 0338/2161] iommm/amd: Store irq domain in struct device commit 2b2c6aa63824c69c112bbe4f937f034c5e606a6c upstream. As the next step to make X86 utilize the direct MSI irq domain operations store the irq domain pointer in the device struct when a device is probed. It only overrides the irqdomain of devices which are handled by a regular PCI/MSI irq domain which protects PCI devices behind special busses like VMD which have their own irq domain. No functional change. 
It just avoids the redirection through arch_*_msi_irqs() and allows the PCI/MSI core to directly invoke the irq domain alloc/free functions instead of having to look up the irq domain for every single MSI interupt. Signed-off-by: Thomas Gleixner Acked-by: Joerg Roedel Link: https://lore.kernel.org/r/20200826112333.806328762@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index ec88f362be70..ec47efa87740 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -729,7 +729,21 @@ static void iommu_poll_ga_log(struct amd_iommu *iommu) } } } -#endif /* CONFIG_IRQ_REMAP */ + +static void +amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) +{ + if (!irq_remapping_enabled || !dev_is_pci(dev) || + pci_dev_has_special_msi_domain(to_pci_dev(dev))) + return; + + dev_set_msi_domain(dev, iommu->msi_domain); +} + +#else /* CONFIG_IRQ_REMAP */ +static inline void +amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { } +#endif /* !CONFIG_IRQ_REMAP */ #define AMD_IOMMU_INT_MASK \ (MMIO_STATUS_EVT_INT_MASK | \ @@ -2175,6 +2189,7 @@ static struct iommu_device *amd_iommu_probe_device(struct device *dev) iommu_dev = ERR_PTR(ret); iommu_ignore_device(dev); } else { + amd_iommu_set_pci_msi_domain(dev, iommu); iommu_dev = &iommu->iommu; } -- Gitee From d05877c8f3b7a35464c79dc9b637524f361429b3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:17:05 +0200 Subject: [PATCH 0339/2161] iommu/vt-d: Remove domain search for PCI/MSI[X] commit opencloudos. Now that the domain can be retrieved through device::msi_domain the domain search for PCI_MSI[X] is not longer required. Remove it. Signed-off-by: Thomas Gleixner Acked-by: Joerg Roedel Link: https://lore.kernel.org/r/20200826112334.305699301@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/irq_remapping.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index f8d7e4ae261a..aedaae4630bc 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1138,9 +1138,6 @@ static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info) return map_ioapic_to_ir(info->devid); case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT: return map_hpet_to_ir(info->devid); - case X86_IRQ_ALLOC_TYPE_PCI_MSI: - case X86_IRQ_ALLOC_TYPE_PCI_MSIX: - return map_dev_to_ir(msi_desc_to_pci_dev(info->desc)); default: WARN_ON_ONCE(1); return NULL; -- Gitee From d8f4ff668a992c8518adde8c0c66fe9505049c85 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:17:06 +0200 Subject: [PATCH 0340/2161] iommu/amd: Remove domain search for PCI/MSI commit bc95fd0d7c4273034b9486aaf369777eaaa00cb7 upstream. Now that the domain can be retrieved through device::msi_domain the domain search for PCI_MSI[X] is not longer required. Remove it. 
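A standalone sketch of why the search becomes dead code (all names below are invented; the two helpers merely echo the semantics of dev_set_msi_domain()/dev_get_msi_domain()): the remapping driver stores the MSI irqdomain in the device when the device is added, as in dmar_pci_bus_add_dev() above, and the allocation path simply reads it back, so a per-allocation "which domain serves this device?" lookup is no longer needed.

#include <stdio.h>
#include <stddef.h>

struct irq_domain { const char *name; };
struct device { struct irq_domain *msi_domain; };

static struct irq_domain ir_msi_domain = { "IR-PCI-MSI" };

/* done once, at device-add time */
static void remap_add_device(struct device *dev)
{
	dev->msi_domain = &ir_msi_domain;
}

/* hot path: no switch over allocation types, no search */
static struct irq_domain *get_msi_domain(struct device *dev)
{
	return dev->msi_domain;
}

int main(void)
{
	struct device pci_dev = { NULL };

	remap_add_device(&pci_dev);
	printf("allocating MSI in domain %s\n",
	       get_msi_domain(&pci_dev)->name);
	return 0;
}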
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200826112334.400700807@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index ec47efa87740..b8136aa96f5a 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3570,9 +3570,6 @@ static struct irq_domain *get_irq_domain_for_devid(struct irq_alloc_info *info, case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT: case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT: return iommu->ir_domain; - case X86_IRQ_ALLOC_TYPE_PCI_MSI: - case X86_IRQ_ALLOC_TYPE_PCI_MSIX: - return iommu->msi_domain; default: WARN_ON_ONCE(1); return NULL; -- Gitee From b24ab562554ee329b8e46b68232d65635e8674f3 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 23 Nov 2021 19:57:47 +0800 Subject: [PATCH 0341/2161] drm, iommu: Change type of pasid to u32 commit c7b6bac9c72c5fcbd6e9e12545bd3022c7f21860 upstream. PASID is defined as a few different types in iommu including "int", "u32", and "unsigned int". To be consistent and to match with uapi definitions, define PASID and its variations (e.g. max PASID) as "u32". "u32" is also shorter and a little more explicit than "unsigned int". No PASID type change in uapi although it defines PASID as __u64 in some places. Suggested-by: Thomas Gleixner Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: Lu Baolu Acked-by: Felix Kuehling Acked-by: Joerg Roedel Link: https://lkml.kernel.org/r/1600187413-163670-2-git-send-email-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 4 +-- .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c | 2 +- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 2 +- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 2 +- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 2 +- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 2 +- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 6 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h | 4 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 6 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 6 ++-- .../gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h | 2 +- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 7 ++--- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 8 ++--- drivers/gpu/drm/amd/amdkfd/kfd_events.h | 4 +-- drivers/gpu/drm/amd/amdkfd/kfd_iommu.c | 6 ++-- drivers/gpu/drm/amd/amdkfd/kfd_pasid.c | 4 +-- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 20 ++++++------ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 +- .../gpu/drm/amd/include/kgd_kfd_interface.h | 2 +- drivers/iommu/amd_iommu.c | 31 ++++++++++--------- drivers/iommu/amd_iommu_proto.h | 10 +++--- drivers/iommu/amd_iommu_v2.c | 20 ++++++------ drivers/iommu/intel/dmar.c | 7 +++-- drivers/iommu/intel/iommu.c | 4 +-- drivers/iommu/intel/pasid.c | 31 +++++++++---------- drivers/iommu/intel/pasid.h | 24 +++++++------- drivers/iommu/intel/svm.c | 12 +++---- drivers/iommu/iommu.c | 2 +- include/linux/amd-iommu.h | 8 ++--- include/linux/intel-iommu.h | 12 +++---- include/linux/intel-svm.h | 2 +- include/linux/iommu.h | 10 +++--- 36 files changed, 137 insertions(+), 137 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index e519df3fd2b6..7cd72376e9a0 
100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -195,11 +195,11 @@ uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *s }) /* GPUVM API */ -int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, unsigned int pasid, +int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, u32 pasid, void **vm, void **process_info, struct dma_fence **ef); int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct kgd_dev *kgd, - struct file *filp, unsigned int pasid, + struct file *filp, u32 pasid, void **vm, void **process_info, struct dma_fence **ef); void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index ce30d4e8bf25..72703f14e4a6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -234,7 +234,7 @@ static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, unlock_srbm(kgd); } -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, u32 pasid, unsigned int vmid) { struct amdgpu_device *adev = get_amdgpu_device(kgd); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index 5f459bf5f622..0e9e63190269 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -258,7 +258,7 @@ static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, unlock_srbm(kgd); } -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, u32 pasid, unsigned int vmid) { struct amdgpu_device *adev = get_amdgpu_device(kgd); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c index 6d2f61449606..4fc1bddf048a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c @@ -214,7 +214,7 @@ static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, unlock_srbm(kgd); } -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, u32 pasid, unsigned int vmid) { struct amdgpu_device *adev = get_amdgpu_device(kgd); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index 92754cfb9808..75d6c4cef7d5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -145,7 +145,7 @@ void kgd_gfx_v9_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, unlock_srbm(kgd); } -int kgd_gfx_v9_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, +int kgd_gfx_v9_set_pasid_vmid_mapping(struct kgd_dev *kgd, u32 pasid, unsigned int vmid) { struct amdgpu_device *adev = get_amdgpu_device(kgd); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h index 26d8879bff9d..838ff519d56f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h @@ -26,7 +26,7 @@ void kgd_gfx_v9_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, 
uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases); -int kgd_gfx_v9_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, +int kgd_gfx_v9_set_pasid_vmid_mapping(struct kgd_dev *kgd, u32 pasid, unsigned int vmid); int kgd_gfx_v9_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); int kgd_gfx_v9_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index f3fa271e3394..7979f4c29af2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -923,7 +923,7 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info, return ret; } -int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, unsigned int pasid, +int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, u32 pasid, void **vm, void **process_info, struct dma_fence **ef) { @@ -959,7 +959,7 @@ int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, unsigned int pasi } int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct kgd_dev *kgd, - struct file *filp, unsigned int pasid, + struct file *filp, u32 pasid, void **vm, void **process_info, struct dma_fence **ef) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c index 53734da1c2df..470ca5e6120c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c @@ -43,7 +43,7 @@ static DEFINE_IDA(amdgpu_pasid_ida); /* Helper to free pasid from a fence callback */ struct amdgpu_pasid_cb { struct dma_fence_cb cb; - unsigned int pasid; + u32 pasid; }; /** @@ -79,7 +79,7 @@ int amdgpu_pasid_alloc(unsigned int bits) * amdgpu_pasid_free - Free a PASID * @pasid: PASID to free */ -void amdgpu_pasid_free(unsigned int pasid) +void amdgpu_pasid_free(u32 pasid) { trace_amdgpu_pasid_freed(pasid); ida_simple_remove(&amdgpu_pasid_ida, pasid); @@ -105,7 +105,7 @@ static void amdgpu_pasid_free_cb(struct dma_fence *fence, * Free the pasid only after all the fences in resv are signaled. 
*/ void amdgpu_pasid_free_delayed(struct dma_resv *resv, - unsigned int pasid) + u32 pasid) { struct dma_fence *fence, **fences; struct amdgpu_pasid_cb *cb; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h index 8e58325bbca2..0c3b4fa1f936 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h @@ -71,9 +71,9 @@ struct amdgpu_vmid_mgr { }; int amdgpu_pasid_alloc(unsigned int bits); -void amdgpu_pasid_free(unsigned int pasid); +void amdgpu_pasid_free(u32 pasid); void amdgpu_pasid_free_delayed(struct dma_resv *resv, - unsigned int pasid); + u32 pasid); bool amdgpu_vmid_had_gpu_reset(struct amdgpu_device *adev, struct amdgpu_vmid *id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 59fd9ebf3a58..4233e0c80b8e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1052,7 +1052,7 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev, struct amdgpu_fpriv *fpriv = file_priv->driver_priv; struct amdgpu_bo_list *list; struct amdgpu_bo *pd; - unsigned int pasid; + u32 pasid; int handle; if (!fpriv) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index fb47ddc6f7f4..bdc6227a6c8f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2671,7 +2671,7 @@ long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout) * 0 for success, error for failure. */ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, - int vm_context, unsigned int pasid) + int vm_context, u32 pasid) { struct amdgpu_bo_param bp; struct amdgpu_bo *root; @@ -2822,7 +2822,7 @@ static int amdgpu_vm_check_clean_reserved(struct amdgpu_device *adev, * Returns: * 0 for success, -errno for errors. */ -int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm, unsigned int pasid) +int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm, u32 pasid) { bool pte_support_ats = (adev->asic_type == CHIP_RAVEN); int r; @@ -3100,7 +3100,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) * @pasid: PASID identifier for VM * @task_info: task_info to fill. 
*/ -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid, +void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid, struct amdgpu_task_info *task_info) { struct amdgpu_vm *vm; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 4a64825b53cb..f303a44f05da 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -345,8 +345,8 @@ void amdgpu_vm_manager_fini(struct amdgpu_device *adev); long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout); int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, - int vm_context, unsigned int pasid); -int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm, unsigned int pasid); + int vm_context, u32 pasid); +int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm, u32 pasid); void amdgpu_vm_release_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm); void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm); void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm, @@ -402,7 +402,7 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring, struct amdgpu_job *job); void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev); -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid, +void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid, struct amdgpu_task_info *task_info); void amdgpu_vm_set_task_info(struct amdgpu_vm *vm); diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 177d1e5329a5..8e48706c76fb 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -88,7 +88,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, (const struct cik_ih_ring_entry *)ih_ring_entry; uint32_t context_id = ihre->data & 0xfffffff; unsigned int vmid = (ihre->ring_id & 0x0000ff00) >> 8; - unsigned int pasid = (ihre->ring_id & 0xffff0000) >> 16; + u32 pasid = (ihre->ring_id & 0xffff0000) >> 16; if (pasid == 0) return; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c index a3441b0e385b..8dbe4fa575b4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c @@ -45,7 +45,7 @@ static void dbgdev_address_watch_disable_nodiq(struct kfd_dev *dev) } static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, - unsigned int pasid, uint64_t vmid0_address, + u32 pasid, uint64_t vmid0_address, uint32_t *packet_buff, size_t size_in_bytes) { struct pm4__release_mem *rm_packet; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h index a04a1fe1d0d9..f9c6df1fdc5c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h @@ -275,7 +275,7 @@ struct kfd_dbgdev { }; struct kfd_dbgmgr { - unsigned int pasid; + u32 pasid; struct kfd_dev *dev; struct kfd_dbgdev *dbgdev; }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index ab69898c9cb7..12935197c5f9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -40,7 +40,7 @@ #define CIK_HPD_EOP_BYTES (1U << CIK_HPD_EOP_BYTES_LOG2) static int set_pasid_vmid_mapping(struct device_queue_manager *dqm, - unsigned int pasid, unsigned int vmid); + u32 pasid, unsigned int vmid); static int 
execute_queues_cpsch(struct device_queue_manager *dqm, enum kfd_unmap_queues_filter filter, @@ -837,7 +837,7 @@ static int unregister_process(struct device_queue_manager *dqm, } static int -set_pasid_vmid_mapping(struct device_queue_manager *dqm, unsigned int pasid, +set_pasid_vmid_mapping(struct device_queue_manager *dqm, u32 pasid, unsigned int vmid) { return dqm->dev->kfd2kgd->set_pasid_vmid_mapping( @@ -1847,8 +1847,7 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm) kfree(dqm); } -int kfd_process_vm_fault(struct device_queue_manager *dqm, - unsigned int pasid) +int kfd_process_vm_fault(struct device_queue_manager *dqm, u32 pasid) { struct kfd_process_device *pdd; struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index d674d4b3340f..853b600f588b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -461,7 +461,7 @@ static void set_event_from_interrupt(struct kfd_process *p, } } -void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, +void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, uint32_t valid_id_bits) { struct kfd_event *ev = NULL; @@ -873,7 +873,7 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, } #ifdef KFD_SUPPORT_IOMMU_V2 -void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, +void kfd_signal_iommu_event(struct kfd_dev *dev, u32 pasid, unsigned long address, bool is_write_requested, bool is_execute_requested) { @@ -950,7 +950,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, } #endif /* KFD_SUPPORT_IOMMU_V2 */ -void kfd_signal_hw_exception_event(unsigned int pasid) +void kfd_signal_hw_exception_event(u32 pasid) { /* * Because we are called from arbitrary context (workqueue) as opposed @@ -971,7 +971,7 @@ void kfd_signal_hw_exception_event(unsigned int pasid) kfd_unref_process(p); } -void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, +void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid, struct kfd_vm_fault_info *info) { struct kfd_event *ev; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h index c7ac6c73af86..c8fe5dbdad55 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h @@ -79,7 +79,7 @@ struct kfd_event { #define KFD_EVENT_TYPE_DEBUG 5 #define KFD_EVENT_TYPE_MEMORY 8 -extern void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, - uint32_t valid_id_bits); +extern void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, + uint32_t valid_id_bits); #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c index 9266c8e76be7..e4080dc6fb6a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c @@ -146,7 +146,7 @@ void kfd_iommu_unbind_process(struct kfd_process *p) } /* Callback for process shutdown invoked by the IOMMU driver */ -static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid) +static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, u32 pasid) { struct kfd_dev *dev = kfd_device_by_pci_dev(pdev); struct kfd_process *p; @@ -192,8 +192,8 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid) } /* This function called by IOMMU driver on PPR failure */ -static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid, - unsigned long 
address, u16 flags) +static int iommu_invalid_ppr_cb(struct pci_dev *pdev, u32 pasid, + unsigned long address, u16 flags) { struct kfd_dev *dev; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c index 33b08ff00b50..c19a2e6fd7c8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c @@ -51,7 +51,7 @@ unsigned int kfd_get_pasid_limit(void) return 1U << pasid_bits; } -unsigned int kfd_pasid_alloc(void) +u32 kfd_pasid_alloc(void) { int r; @@ -77,7 +77,7 @@ unsigned int kfd_pasid_alloc(void) return r > 0 ? r : 0; } -void kfd_pasid_free(unsigned int pasid) +void kfd_pasid_free(u32 pasid) { if (kfd2kgd) amdgpu_pasid_free(pasid); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index c89326125d71..5645a02f9636 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -687,7 +687,7 @@ struct kfd_process { /* We want to receive a notification when the mm_struct is destroyed */ struct mmu_notifier mmu_notifier; - unsigned int pasid; + u32 pasid; unsigned int doorbell_index; /* @@ -761,7 +761,7 @@ int kfd_process_create_wq(void); void kfd_process_destroy_wq(void); struct kfd_process *kfd_create_process(struct file *filep); struct kfd_process *kfd_get_process(const struct task_struct *); -struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); +struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid); struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); void kfd_unref_process(struct kfd_process *p); int kfd_process_evict_queues(struct kfd_process *p); @@ -802,8 +802,8 @@ int kfd_pasid_init(void); void kfd_pasid_exit(void); bool kfd_set_pasid_limit(unsigned int new_limit); unsigned int kfd_get_pasid_limit(void); -unsigned int kfd_pasid_alloc(void); -void kfd_pasid_free(unsigned int pasid); +u32 kfd_pasid_alloc(void); +void kfd_pasid_free(u32 pasid); /* Doorbells */ size_t kfd_doorbell_process_slice(struct kfd_dev *kfd); @@ -886,7 +886,7 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm); struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, enum kfd_queue_type type); void kernel_queue_uninit(struct kernel_queue *kq); -int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid); +int kfd_process_vm_fault(struct device_queue_manager *dqm, u32 pasid); /* Process Queue Manager */ struct process_queue_node { @@ -1009,12 +1009,12 @@ int kfd_wait_on_events(struct kfd_process *p, uint32_t num_events, void __user *data, bool all, uint32_t user_timeout_ms, uint32_t *wait_result); -void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, +void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, uint32_t valid_id_bits); void kfd_signal_iommu_event(struct kfd_dev *dev, - unsigned int pasid, unsigned long address, - bool is_write_requested, bool is_execute_requested); -void kfd_signal_hw_exception_event(unsigned int pasid); + u32 pasid, unsigned long address, + bool is_write_requested, bool is_execute_requested); +void kfd_signal_hw_exception_event(u32 pasid); int kfd_set_event(struct kfd_process *p, uint32_t event_id); int kfd_reset_event(struct kfd_process *p, uint32_t event_id); int kfd_event_page_set(struct kfd_process *p, void *kernel_address, @@ -1025,7 +1025,7 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, uint64_t *event_page_offset, uint32_t *event_slot_index); int kfd_event_destroy(struct kfd_process *p, uint32_t 
event_id); -void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, +void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid, struct kfd_vm_fault_info *info); void kfd_signal_reset_event(struct kfd_dev *dev); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index aa0a617b8d44..344b4c85e2d1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -903,7 +903,7 @@ void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, } /* This increments the process->ref counter. */ -struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) +struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid) { struct kfd_process *p, *ret_p = NULL; unsigned int temp; diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h index 98b9533e672b..d2a1d05bb070 100644 --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h @@ -246,7 +246,7 @@ struct kfd2kgd_calls { uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases); - int (*set_pasid_vmid_mapping)(struct kgd_dev *kgd, unsigned int pasid, + int (*set_pasid_vmid_mapping)(struct kgd_dev *kgd, u32 pasid, unsigned int vmid); int (*init_interrupts)(struct kgd_dev *kgd, uint32_t pipe_id); diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index b8136aa96f5a..32f6d165775a 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -512,10 +512,11 @@ static void amd_iommu_report_page_fault(u16 devid, u16 domain_id, static void iommu_print_event(struct amd_iommu *iommu, void *__evt) { struct device *dev = iommu->iommu.dev; - int type, devid, pasid, flags, tag; + int type, devid, flags, tag; volatile u32 *event = __evt; int count = 0; u64 address; + u32 pasid; retry: type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; @@ -922,7 +923,7 @@ static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; } -static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, int pasid, +static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, u32 pasid, u64 address, bool size) { memset(cmd, 0, sizeof(*cmd)); @@ -940,7 +941,7 @@ static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, int pasid, CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); } -static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, int pasid, +static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, u32 pasid, int qdep, u64 address, bool size) { memset(cmd, 0, sizeof(*cmd)); @@ -960,7 +961,7 @@ static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, int pasid, CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); } -static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, int pasid, +static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid, int status, int tag, bool gn) { memset(cmd, 0, sizeof(*cmd)); @@ -2817,7 +2818,7 @@ int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids) } EXPORT_SYMBOL(amd_iommu_domain_enable_v2); -static int __flush_pasid(struct protection_domain *domain, int pasid, +static int __flush_pasid(struct protection_domain *domain, u32 pasid, u64 address, bool size) { struct iommu_dev_data *dev_data; @@ -2878,13 +2879,13 @@ static int __flush_pasid(struct protection_domain *domain, int pasid, return ret; } -static int 
__amd_iommu_flush_page(struct protection_domain *domain, int pasid, +static int __amd_iommu_flush_page(struct protection_domain *domain, u32 pasid, u64 address) { return __flush_pasid(domain, pasid, address, false); } -int amd_iommu_flush_page(struct iommu_domain *dom, int pasid, +int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, u64 address) { struct protection_domain *domain = to_pdomain(dom); @@ -2899,13 +2900,13 @@ int amd_iommu_flush_page(struct iommu_domain *dom, int pasid, } EXPORT_SYMBOL(amd_iommu_flush_page); -static int __amd_iommu_flush_tlb(struct protection_domain *domain, int pasid) +static int __amd_iommu_flush_tlb(struct protection_domain *domain, u32 pasid) { return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, true); } -int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid) +int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid) { struct protection_domain *domain = to_pdomain(dom); unsigned long flags; @@ -2919,7 +2920,7 @@ int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid) } EXPORT_SYMBOL(amd_iommu_flush_tlb); -static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc) +static u64 *__get_gcr3_pte(u64 *root, int level, u32 pasid, bool alloc) { int index; u64 *pte; @@ -2951,7 +2952,7 @@ static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc) return pte; } -static int __set_gcr3(struct protection_domain *domain, int pasid, +static int __set_gcr3(struct protection_domain *domain, u32 pasid, unsigned long cr3) { u64 *pte; @@ -2968,7 +2969,7 @@ static int __set_gcr3(struct protection_domain *domain, int pasid, return __amd_iommu_flush_tlb(domain, pasid); } -static int __clear_gcr3(struct protection_domain *domain, int pasid) +static int __clear_gcr3(struct protection_domain *domain, u32 pasid) { u64 *pte; @@ -2984,7 +2985,7 @@ static int __clear_gcr3(struct protection_domain *domain, int pasid) return __amd_iommu_flush_tlb(domain, pasid); } -int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid, +int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid, unsigned long cr3) { struct protection_domain *domain = to_pdomain(dom); @@ -2999,7 +3000,7 @@ int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid, } EXPORT_SYMBOL(amd_iommu_domain_set_gcr3); -int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid) +int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid) { struct protection_domain *domain = to_pdomain(dom); unsigned long flags; @@ -3013,7 +3014,7 @@ int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid) } EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3); -int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid, +int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, int status, int tag) { struct iommu_dev_data *dev_data; diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h index 92c2ba6468a0..17d7957269ee 100644 --- a/drivers/iommu/amd_iommu_proto.h +++ b/drivers/iommu/amd_iommu_proto.h @@ -43,12 +43,12 @@ extern int amd_iommu_register_ppr_notifier(struct notifier_block *nb); extern int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb); extern void amd_iommu_domain_direct_map(struct iommu_domain *dom); extern int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids); -extern int amd_iommu_flush_page(struct iommu_domain *dom, int pasid, +extern int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, u64 address); -extern int amd_iommu_flush_tlb(struct iommu_domain *dom, int 
pasid); -extern int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid, +extern int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid); +extern int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid, unsigned long cr3); -extern int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid); +extern int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid); extern struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev); #ifdef CONFIG_IRQ_REMAP @@ -64,7 +64,7 @@ static inline int amd_iommu_create_irq_domain(struct amd_iommu *iommu) #define PPR_INVALID 0x1 #define PPR_FAILURE 0xf -extern int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid, +extern int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, int status, int tag); static inline bool is_rd890_iommu(struct pci_dev *pdev) diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index 05f3d93cf480..a679fe7ecaca 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -42,7 +42,7 @@ struct pasid_state { struct mmu_notifier mn; /* mmu_notifier handle */ struct pri_queue pri[PRI_QUEUE_SIZE]; /* PRI tag states */ struct device_state *device_state; /* Link to our device_state */ - int pasid; /* PASID index */ + u32 pasid; /* PASID index */ bool invalid; /* Used during setup and teardown of the pasid */ spinlock_t lock; /* Protect pri_queues and @@ -72,7 +72,7 @@ struct fault { struct mm_struct *mm; u64 address; u16 devid; - u16 pasid; + u32 pasid; u16 tag; u16 finish; u16 flags; @@ -152,7 +152,7 @@ static void put_device_state(struct device_state *dev_state) /* Must be called under dev_state->lock */ static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state, - int pasid, bool alloc) + u32 pasid, bool alloc) { struct pasid_state **root, **ptr; int level, index; @@ -186,7 +186,7 @@ static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state static int set_pasid_state(struct device_state *dev_state, struct pasid_state *pasid_state, - int pasid) + u32 pasid) { struct pasid_state **ptr; unsigned long flags; @@ -213,7 +213,7 @@ static int set_pasid_state(struct device_state *dev_state, return ret; } -static void clear_pasid_state(struct device_state *dev_state, int pasid) +static void clear_pasid_state(struct device_state *dev_state, u32 pasid) { struct pasid_state **ptr; unsigned long flags; @@ -231,7 +231,7 @@ static void clear_pasid_state(struct device_state *dev_state, int pasid) } static struct pasid_state *get_pasid_state(struct device_state *dev_state, - int pasid) + u32 pasid) { struct pasid_state **ptr, *ret = NULL; unsigned long flags; @@ -598,7 +598,7 @@ static struct notifier_block ppr_nb = { .notifier_call = ppr_notifier, }; -int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, +int amd_iommu_bind_pasid(struct pci_dev *pdev, u32 pasid, struct task_struct *task) { struct pasid_state *pasid_state; @@ -619,7 +619,7 @@ int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, return -EINVAL; ret = -EINVAL; - if (pasid < 0 || pasid >= dev_state->max_pasids) + if (pasid >= dev_state->max_pasids) goto out; ret = -ENOMEM; @@ -683,7 +683,7 @@ int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, } EXPORT_SYMBOL(amd_iommu_bind_pasid); -void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid) +void amd_iommu_unbind_pasid(struct pci_dev *pdev, u32 pasid) { struct pasid_state *pasid_state; struct device_state *dev_state; @@ -699,7 +699,7 @@ void amd_iommu_unbind_pasid(struct pci_dev 
*pdev, int pasid) if (dev_state == NULL) return; - if (pasid < 0 || pasid >= dev_state->max_pasids) + if (pasid >= dev_state->max_pasids) goto out; pasid_state = get_pasid_state(dev_state, pasid); diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 0299d2c54a21..afacd5f042d7 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1501,7 +1501,7 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, } void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, - u64 granu, int pasid) + u64 granu, u32 pasid) { struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; @@ -1815,7 +1815,7 @@ void dmar_msi_read(int irq, struct msi_msg *msg) } static int dmar_fault_do_one(struct intel_iommu *iommu, int type, - u8 fault_reason, int pasid, u16 source_id, + u8 fault_reason, u32 pasid, u16 source_id, unsigned long long addr) { const char *reason; @@ -1865,7 +1865,8 @@ irqreturn_t dmar_fault(int irq, void *dev_id) u8 fault_reason; u16 source_id; u64 guest_addr; - int type, pasid; + u32 pasid; + int type; u32 data; bool pasid_present; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 3d7952414b01..388e681e59a1 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2527,7 +2527,7 @@ dmar_search_domain_by_dev_info(int segment, int bus, int devfn) static int domain_setup_first_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, - int pasid) + u32 pasid) { int flags = PASID_FLAG_SUPERVISOR_MODE; struct dma_pte *pgd = domain->pgd; @@ -5181,7 +5181,7 @@ static int aux_domain_add_dev(struct dmar_domain *domain, return -ENODEV; if (domain->default_pasid <= 0) { - int pasid; + u32 pasid; /* No private data needed for the default pasid */ pasid = ioasid_alloc(NULL, PASID_MIN, diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index e6faedf42fd4..b92af83b79bd 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -27,7 +27,7 @@ static DEFINE_SPINLOCK(pasid_lock); u32 intel_pasid_max_id = PASID_MAX; -int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid) +int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid) { unsigned long flags; u8 status_code; @@ -58,7 +58,7 @@ int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid) return ret; } -void vcmd_free_pasid(struct intel_iommu *iommu, unsigned int pasid) +void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid) { unsigned long flags; u8 status_code; @@ -146,7 +146,7 @@ int intel_pasid_alloc_table(struct device *dev) struct pasid_table *pasid_table; struct pasid_table_opaque data; struct page *pages; - int max_pasid = 0; + u32 max_pasid = 0; int ret, order; int size; @@ -168,7 +168,7 @@ int intel_pasid_alloc_table(struct device *dev) INIT_LIST_HEAD(&pasid_table->dev); if (info->pasid_supported) - max_pasid = min_t(int, pci_max_pasids(to_pci_dev(dev)), + max_pasid = min_t(u32, pci_max_pasids(to_pci_dev(dev)), intel_pasid_max_id); size = max_pasid >> (PASID_PDE_SHIFT - 3); @@ -242,7 +242,7 @@ int intel_pasid_get_dev_max_id(struct device *dev) return info->pasid_table->max_pasid; } -struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid) +struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid) { struct device_domain_info *info; struct pasid_table *pasid_table; @@ -251,8 +251,7 @@ struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid) int dir_index, index; pasid_table = intel_pasid_get_table(dev); - 
if (WARN_ON(!pasid_table || pasid < 0 || - pasid >= intel_pasid_get_dev_max_id(dev))) + if (WARN_ON(!pasid_table || pasid >= intel_pasid_get_dev_max_id(dev))) return NULL; dir = pasid_table->table; @@ -305,7 +304,7 @@ static inline void pasid_clear_entry_with_fpd(struct pasid_entry *pe) } static void -intel_pasid_clear_entry(struct device *dev, int pasid, bool fault_ignore) +intel_pasid_clear_entry(struct device *dev, u32 pasid, bool fault_ignore) { struct pasid_entry *pe; @@ -444,7 +443,7 @@ pasid_set_eafe(struct pasid_entry *pe) static void pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, - u16 did, int pasid) + u16 did, u32 pasid) { struct qi_desc desc; @@ -473,7 +472,7 @@ iotlb_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid) static void devtlb_invalidation_with_pasid(struct intel_iommu *iommu, - struct device *dev, int pasid) + struct device *dev, u32 pasid) { struct device_domain_info *info; u16 sid, qdep, pfsid; @@ -499,7 +498,7 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu, } void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, - int pasid, bool fault_ignore) + u32 pasid, bool fault_ignore) { struct pasid_entry *pte; u16 did; @@ -524,7 +523,7 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, static void pasid_flush_caches(struct intel_iommu *iommu, struct pasid_entry *pte, - int pasid, u16 did) + u32 pasid, u16 did) { if (!ecap_coherent(iommu->ecap)) clflush_cache_range(pte, sizeof(*pte)); @@ -543,7 +542,7 @@ static void pasid_flush_caches(struct intel_iommu *iommu, */ int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, pgd_t *pgd, - int pasid, u16 did, int flags) + u32 pasid, u16 did, int flags) { struct pasid_entry *pte; @@ -616,7 +615,7 @@ static inline int iommu_skip_agaw(struct dmar_domain *domain, */ int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, - struct device *dev, int pasid) + struct device *dev, u32 pasid) { struct pasid_entry *pte; struct dma_pte *pgd; @@ -674,7 +673,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, */ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct dmar_domain *domain, - struct device *dev, int pasid) + struct device *dev, u32 pasid) { u16 did = FLPT_DEFAULT_DID; struct pasid_entry *pte; @@ -760,7 +759,7 @@ intel_pasid_setup_bind_data(struct intel_iommu *iommu, struct pasid_entry *pte, * @addr_width: Address width of the first level (guest) */ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, - pgd_t *gpgd, int pasid, + pgd_t *gpgd, u32 pasid, struct iommu_gpasid_bind_data_vtd *pasid_data, struct dmar_domain *domain, int addr_width) { diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index c9850766c3a9..97dfcffbf495 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -72,7 +72,7 @@ struct pasid_entry { struct pasid_table { void *table; /* pasid table pointer */ int order; /* page order of pasid table */ - int max_pasid; /* max pasid */ + u32 max_pasid; /* max pasid */ struct list_head dev; /* device list */ }; @@ -98,31 +98,31 @@ static inline bool pasid_pte_is_present(struct pasid_entry *pte) return READ_ONCE(pte->val[0]) & PASID_PTE_PRESENT; } -extern u32 intel_pasid_max_id; +extern unsigned int intel_pasid_max_id; int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp); -void intel_pasid_free_id(int pasid); -void *intel_pasid_lookup_id(int pasid); 
+void intel_pasid_free_id(u32 pasid); +void *intel_pasid_lookup_id(u32 pasid); int intel_pasid_alloc_table(struct device *dev); void intel_pasid_free_table(struct device *dev); struct pasid_table *intel_pasid_get_table(struct device *dev); int intel_pasid_get_dev_max_id(struct device *dev); -struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid); +struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid); int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, pgd_t *pgd, - int pasid, u16 did, int flags); + u32 pasid, u16 did, int flags); int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, - struct device *dev, int pasid); + struct device *dev, u32 pasid); int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct dmar_domain *domain, - struct device *dev, int pasid); + struct device *dev, u32 pasid); int intel_pasid_setup_nested(struct intel_iommu *iommu, - struct device *dev, pgd_t *pgd, int pasid, + struct device *dev, pgd_t *pgd, u32 pasid, struct iommu_gpasid_bind_data_vtd *pasid_data, struct dmar_domain *domain, int addr_width); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, - struct device *dev, int pasid, + struct device *dev, u32 pasid, bool fault_ignore); -int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid); -void vcmd_free_pasid(struct intel_iommu *iommu, unsigned int pasid); +int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid); +void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid); #endif /* __INTEL_PASID_H */ diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 45c661453639..e2041837dded 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -23,7 +23,7 @@ #include "pasid.h" static irqreturn_t prq_event_thread(int irq, void *d); -static void intel_svm_drain_prq(struct device *dev, int pasid); +static void intel_svm_drain_prq(struct device *dev, u32 pasid); #define PRQ_ORDER 0 @@ -417,7 +417,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, return ret; } -int intel_svm_unbind_gpasid(struct device *dev, int pasid) +int intel_svm_unbind_gpasid(struct device *dev, u32 pasid) { struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); struct intel_svm_dev *sdev; @@ -638,7 +638,7 @@ intel_svm_bind_mm(struct device *dev, int flags, struct svm_dev_ops *ops, } /* Caller must hold pasid_mutex */ -static int intel_svm_unbind_mm(struct device *dev, int pasid) +static int intel_svm_unbind_mm(struct device *dev, u32 pasid) { struct intel_svm_dev *sdev; struct intel_iommu *iommu; @@ -757,7 +757,7 @@ static bool is_canonical_address(u64 addr) * described in VT-d spec CH7.10 to drain all page requests and page * responses pending in the hardware. 
*/ -static void intel_svm_drain_prq(struct device *dev, int pasid) +static void intel_svm_drain_prq(struct device *dev, u32 pasid) { struct device_domain_info *info; struct dmar_domain *domain; @@ -1084,10 +1084,10 @@ void intel_svm_unbind(struct iommu_sva *sva) mutex_unlock(&pasid_mutex); } -int intel_svm_get_pasid(struct iommu_sva *sva) +u32 intel_svm_get_pasid(struct iommu_sva *sva) { struct intel_svm_dev *sdev; - int pasid; + u32 pasid; mutex_lock(&pasid_mutex); sdev = to_intel_svm_dev(sva); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 84ac68bcfbba..78d89f479e8e 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2847,7 +2847,7 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) } EXPORT_SYMBOL_GPL(iommu_sva_unbind_device); -int iommu_sva_get_pasid(struct iommu_sva *handle) +u32 iommu_sva_get_pasid(struct iommu_sva *handle) { const struct iommu_ops *ops = handle->dev->bus->iommu_ops; diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 21e950e4ab62..450717299928 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -76,7 +76,7 @@ extern void amd_iommu_free_device(struct pci_dev *pdev); * * The function returns 0 on success or a negative value on error. */ -extern int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, +extern int amd_iommu_bind_pasid(struct pci_dev *pdev, u32 pasid, struct task_struct *task); /** @@ -88,7 +88,7 @@ extern int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, * When this function returns the device is no longer using the PASID * and the PASID is no longer bound to its task. */ -extern void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid); +extern void amd_iommu_unbind_pasid(struct pci_dev *pdev, u32 pasid); /** * amd_iommu_set_invalid_ppr_cb() - Register a call-back for failed @@ -114,7 +114,7 @@ extern void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid); #define AMD_IOMMU_INV_PRI_RSP_FAIL 2 typedef int (*amd_iommu_invalid_ppr_cb)(struct pci_dev *pdev, - int pasid, + u32 pasid, unsigned long address, u16); @@ -166,7 +166,7 @@ extern int amd_iommu_device_info(struct pci_dev *pdev, * @cb: The call-back function */ -typedef void (*amd_iommu_invalidate_ctx)(struct pci_dev *pdev, int pasid); +typedef void (*amd_iommu_invalidate_ctx)(struct pci_dev *pdev, u32 pasid); extern int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev, amd_iommu_invalidate_ctx cb); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 287e2005ecad..6a5490fde36e 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -556,7 +556,7 @@ struct dmar_domain { 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ u64 max_addr; /* maximum mapped address */ - int default_pasid; /* + u32 default_pasid; /* * The default pasid used for non-SVM * traffic on mediated devices. 
*/ @@ -715,7 +715,7 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, u32 pasid, u16 qdep, u64 addr, unsigned int size_order); void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, - int pasid); + u32 pasid); int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, unsigned int count, unsigned long options); @@ -744,11 +744,11 @@ extern int intel_svm_enable_prq(struct intel_iommu *iommu); extern int intel_svm_finish_prq(struct intel_iommu *iommu); int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, struct iommu_gpasid_bind_data *data); -int intel_svm_unbind_gpasid(struct device *dev, int pasid); +int intel_svm_unbind_gpasid(struct device *dev, u32 pasid); struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata); void intel_svm_unbind(struct iommu_sva *handle); -int intel_svm_get_pasid(struct iommu_sva *handle); +u32 intel_svm_get_pasid(struct iommu_sva *handle); int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg); @@ -760,7 +760,7 @@ struct intel_svm_dev { struct device *dev; struct svm_dev_ops *ops; struct iommu_sva sva; - int pasid; + u32 pasid; int users; u16 did; u16 dev_iotlb:1; @@ -773,7 +773,7 @@ struct intel_svm { struct intel_iommu *iommu; int flags; - int pasid; + u32 pasid; int gpasid; /* In case that guest PASID is different from host PASID */ struct list_head devs; struct list_head list; diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index c9e7e601950d..39d368a810b8 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -11,7 +11,7 @@ struct device; struct svm_dev_ops { - void (*fault_cb)(struct device *dev, int pasid, u64 address, + void (*fault_cb)(struct device *dev, u32 pasid, u64 address, void *private, int rwxp, int response); }; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index eb76974d2d18..6c2ea8c8fee4 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -292,7 +292,7 @@ struct iommu_ops { struct iommu_sva *(*sva_bind)(struct device *dev, struct mm_struct *mm, void *drvdata); void (*sva_unbind)(struct iommu_sva *handle); - int (*sva_get_pasid)(struct iommu_sva *handle); + u32 (*sva_get_pasid)(struct iommu_sva *handle); int (*page_response)(struct device *dev, struct iommu_fault_event *evt, @@ -302,7 +302,7 @@ struct iommu_ops { int (*sva_bind_gpasid)(struct iommu_domain *domain, struct device *dev, struct iommu_gpasid_bind_data *data); - int (*sva_unbind_gpasid)(struct device *dev, int pasid); + int (*sva_unbind_gpasid)(struct device *dev, u32 pasid); int (*def_domain_type)(struct device *dev); @@ -637,7 +637,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, void *drvdata); void iommu_sva_unbind_device(struct iommu_sva *handle); -int iommu_sva_get_pasid(struct iommu_sva *handle); +u32 iommu_sva_get_pasid(struct iommu_sva *handle); #else /* CONFIG_IOMMU_API */ @@ -1030,7 +1030,7 @@ static inline void iommu_sva_unbind_device(struct iommu_sva *handle) { } -static inline int iommu_sva_get_pasid(struct iommu_sva *handle) +static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle) { return IOMMU_PASID_INVALID; } @@ -1049,7 +1049,7 @@ static inline int iommu_sva_bind_gpasid(struct iommu_domain *domain, } static inline int iommu_sva_unbind_gpasid(struct iommu_domain *domain, - struct device *dev, int pasid) + struct device *dev, u32 pasid) { return -ENODEV; } -- Gitee From 
a56c405936c121856566db446ae1713f5b075b6c Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 15 Sep 2020 09:30:06 -0700 Subject: [PATCH 0342/2161] iommu/vt-d: Change flags type to unsigned int in binding mm commit 2a5054c6e7b16906984ac36a7363ca46b8d99ade upstream. "flags" passed to intel_svm_bind_mm() is a bit mask and should be defined as "unsigned int" instead of "int". Change its type to "unsigned int". Suggested-by: Thomas Gleixner Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: Lu Baolu Acked-by: Joerg Roedel Link: https://lkml.kernel.org/r/1600187413-163670-3-git-send-email-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 7 ++++--- include/linux/intel-iommu.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index e2041837dded..d21d0793e182 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -464,7 +464,8 @@ int intel_svm_unbind_gpasid(struct device *dev, u32 pasid) /* Caller must hold pasid_mutex, mm reference */ static int -intel_svm_bind_mm(struct device *dev, int flags, struct svm_dev_ops *ops, +intel_svm_bind_mm(struct device *dev, unsigned int flags, + struct svm_dev_ops *ops, struct mm_struct *mm, struct intel_svm_dev **sd) { struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); @@ -1050,7 +1051,7 @@ intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) { struct iommu_sva *sva = ERR_PTR(-EINVAL); struct intel_svm_dev *sdev = NULL; - int flags = 0; + unsigned int flags = 0; int ret; /* @@ -1059,7 +1060,7 @@ intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) * and intel_svm etc. */ if (drvdata) - flags = *(int *)drvdata; + flags = *(unsigned int *)drvdata; mutex_lock(&pasid_mutex); ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev); if (ret) diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 6a5490fde36e..d956987ed032 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -772,7 +772,7 @@ struct intel_svm { struct mm_struct *mm; struct intel_iommu *iommu; - int flags; + unsigned int flags; u32 pasid; int gpasid; /* In case that guest PASID is different from host PASID */ struct list_head devs; -- Gitee From a17d79bea1fbbb0a4787b0868b6da7f9095281a9 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 15 Sep 2020 09:30:13 -0700 Subject: [PATCH 0343/2161] x86/mmu: Allocate/free a PASID (part 2) commit opencloudos. A PASID is allocated for an "mm" the first time any thread binds to an SVA-capable device and is freed from the "mm" when the SVA is unbound by the last thread. It's possible for the "mm" to have different PASID values in different binding/unbinding SVA cycles. The mm's PASID (non-zero for valid PASID or 0 for invalid PASID) is propagated to a per-thread PASID MSR for all threads within the mm through IPI, context switch, or inherited. This is done to ensure that a running thread has the right PASID in the MSR matching the mm's PASID. [ bp: s/SVM/SVA/g; massage. 
] Suggested-by: Andy Lutomirski Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/1600187413-163670-10-git-send-email-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index d21d0793e182..cb14fdc65b7e 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "pasid.h" @@ -462,6 +463,24 @@ int intel_svm_unbind_gpasid(struct device *dev, u32 pasid) return ret; } +static void _load_pasid(void *unused) +{ + update_pasid(); +} + +static void load_pasid(struct mm_struct *mm, u32 pasid) +{ + mutex_lock(&mm->context.lock); + + /* Synchronize with READ_ONCE in update_pasid(). */ + smp_store_release(&mm->pasid, pasid); + + /* Update PASID MSR on all CPUs running the mm's tasks. */ + on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true); + + mutex_unlock(&mm->context.lock); +} + /* Caller must hold pasid_mutex, mm reference */ static int intel_svm_bind_mm(struct device *dev, unsigned int flags, @@ -609,6 +628,10 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, } list_add_tail(&svm->list, &global_svm_list); + if (mm) { + /* The newly allocated pasid is loaded to the mm. */ + load_pasid(mm, svm->pasid); + } } else { /* * Binding a new device with existing PASID, need to setup @@ -672,8 +695,11 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) if (list_empty(&svm->devs)) { ioasid_free(svm->pasid); - if (svm->mm) + if (svm->mm) { mmu_notifier_unregister(&svm->notifier, svm->mm); + /* Clear mm's pasid. */ + load_pasid(svm->mm, PASID_DISABLED); + } list_del(&svm->list); /* We mandate that no page faults may be outstanding * for the PASID when intel_svm_unbind_mm() is called. -- Gitee From d4737f16732f5d67cf5c0878308d65ef64c3e91d Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 22 Sep 2020 14:08:43 +0800 Subject: [PATCH 0344/2161] iommu/vt-d: Use device numa domain if RHSA is missing commit d2ef0962492c3be3563e53a431c285678849b3c1 upstream. If there are multiple NUMA domains but the RHSA is missing in ACPI/DMAR table, we could default to the device NUMA domain as fall back. 
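For illustration, the fallback amounts to walking the domain's device list and taking the first device that reports a NUMA node. This is a minimal sketch using the names from the hunk below; as the hunk's comment notes, devices behind different IOMMUs may disagree, so the policy is simply first come, first served:

    struct device_domain_info *info;
    int nid = NUMA_NO_NODE;

    list_for_each_entry(info, &domain->devices, link) {
            if (!info->dev)
                    continue;
            nid = dev_to_node(info->dev);   /* first known node wins */
            if (nid != NUMA_NO_NODE)
                    break;
    }
    /* domain_update_iommu_cap() only uses this result when
     * domain->nid is still NUMA_NO_NODE. */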
Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Cc: Jacob Pan Cc: Kevin Tian Cc: Ashok Raj Link: https://lore.kernel.org/r/20200904010303.2961-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20200922060843.31546-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 388e681e59a1..56dc20980209 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -698,12 +698,47 @@ static int domain_update_iommu_superpage(struct dmar_domain *domain, return fls(mask); } +static int domain_update_device_node(struct dmar_domain *domain) +{ + struct device_domain_info *info; + int nid = NUMA_NO_NODE; + + assert_spin_locked(&device_domain_lock); + + if (list_empty(&domain->devices)) + return NUMA_NO_NODE; + + list_for_each_entry(info, &domain->devices, link) { + if (!info->dev) + continue; + + /* + * There could possibly be multiple device numa nodes as devices + * within the same domain may sit behind different IOMMUs. There + * isn't perfect answer in such situation, so we select first + * come first served policy. + */ + nid = dev_to_node(info->dev); + if (nid != NUMA_NO_NODE) + break; + } + + return nid; +} + /* Some capabilities may be different across iommus */ static void domain_update_iommu_cap(struct dmar_domain *domain) { domain_update_iommu_coherency(domain); domain->iommu_snooping = domain_update_iommu_snooping(NULL); domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); + + /* + * If RHSA is missing, we should default to the device numa domain + * as fall back. + */ + if (domain->nid == NUMA_NO_NODE) + domain->nid = domain_update_device_node(domain); } struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, @@ -5103,8 +5138,6 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) if (type == IOMMU_DOMAIN_DMA) intel_init_iova_domain(dmar_domain); - domain_update_iommu_cap(dmar_domain); - domain = &dmar_domain->domain; domain->geometry.aperture_start = 0; domain->geometry.aperture_end = -- Gitee From 4f1b2fe4c38e4ec603acb83371eb39818036ee30 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Tue, 18 Aug 2020 22:24:26 +0800 Subject: [PATCH 0345/2161] ACPI: Do not create new NUMA domains from ACPI static tables that are not SRAT commit 01feba590cd610780c463aa3d498200ba4503703 upstream. Several ACPI static tables contain references to proximity domains. ACPI 6.3 has clarified that only entries in SRAT may define a new domain (sec 5.2.16). Those tables described in the ACPI spec have additional clarifying text. NFIT: Table 5-132, "Integer that represents the proximity domain to which the memory belongs. This number must match with corresponding entry in the SRAT table." HMAT: Table 5-145, "... This number must match with the corresponding entry in the SRAT table's processor affinity structure ... if the initiator is a processor, or the Generic Initiator Affinity Structure if the initiator is a generic initiator". IORT and DMAR are defined by external specifications. Intel Virtualization Technology for Directed I/O Rev 3.1 does not make any explicit statements, but the general SRAT statement above will still apply. 
https://software.intel.com/sites/default/files/managed/c5/15/vt-directed-io-spec.pdf IO Remapping Table, Platform Design Document rev D, also makes not explicit statement, but refers to ACPI SRAT table for more information and again the generic SRAT statement above applies. https://developer.arm.com/documentation/den0049/d/ In conclusion, any proximity domain specified in these tables, should be a reference to a proximity domain also found in SRAT, and they should not be able to instantiate a new domain. Hence we switch to pxm_to_node() which will only return existing nodes. Signed-off-by: Jonathan Cameron Reviewed-by: Barry Song Reviewed-by: Hanjun Guo Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/arm64/iort.c | 2 +- drivers/acpi/nfit/core.c | 3 +-- drivers/iommu/intel/dmar.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index bc95a5eebd13..9b63fb62189c 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1261,7 +1261,7 @@ static int __init arm_smmu_v3_set_proximity(struct device *dev, smmu = (struct acpi_iort_smmu_v3 *)node->node_data; if (smmu->flags & ACPI_IORT_SMMU_V3_PXM_VALID) { - int dev_node = acpi_map_pxm_to_node(smmu->pxm); + int dev_node = pxm_to_node(smmu->pxm); if (dev_node != NUMA_NO_NODE && !node_online(dev_node)) return -EINVAL; diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 9d78f29cf996..5a6b8ad10eea 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -2952,8 +2952,7 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) { ndr_desc->numa_node = acpi_map_pxm_to_online_node( spa->proximity_domain); - ndr_desc->target_node = acpi_map_pxm_to_node( - spa->proximity_domain); + ndr_desc->target_node = pxm_to_node(spa->proximity_domain); } else { ndr_desc->numa_node = NUMA_NO_NODE; ndr_desc->target_node = NUMA_NO_NODE; diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index afacd5f042d7..404b40af31cb 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -476,7 +476,7 @@ static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg) rhsa = (struct acpi_dmar_rhsa *)header; for_each_drhd_unit(drhd) { if (drhd->reg_base_addr == rhsa->base_address) { - int node = acpi_map_pxm_to_node(rhsa->proximity_domain); + int node = pxm_to_node(rhsa->proximity_domain); if (!node_online(node)) node = NUMA_NO_NODE; -- Gitee From 980c8541ffa1070e3e65bff3ee4aa251db5f42de Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 25 Sep 2020 09:32:44 -0700 Subject: [PATCH 0346/2161] iommu/uapi: Use named union for user data commit 8d3bb3b8cbf2ffb3ef73720a48b3445518dcdb55 upstream. IOMMU UAPI data size is filled by the user space which must be validated by the kernel. To ensure backward compatibility, user data can only be extended by either re-purpose padding bytes or extend the variable sized union at the end. No size change is allowed before the union. Therefore, the minimum size is the offset of the union. To use offsetof() on the union, we must make it named. 
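For illustration, naming the union is what allows the minimum acceptable user data size to be computed at compile time; offsetof() cannot name an anonymous union. A rough sketch (ILLUSTRATIVE_MINSZ is a made-up name for this sketch only):

    struct iommu_gpasid_bind_data {
            __u32 version;
            __u32 format;
            /* ... fixed, never-shrinking part ... */
            union {
                    struct iommu_gpasid_bind_data_vtd vtd;
            } vendor;               /* named, so it can appear in offsetof() */
    };

    /* Minimum user data size: everything up to the start of the union. */
    #define ILLUSTRATIVE_MINSZ  offsetof(struct iommu_gpasid_bind_data, vendor)

A later patch in this series applies the same idea with offsetofend(struct iommu_gpasid_bind_data, vendor.vtd).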
Signed-off-by: Jacob Pan Reviewed-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/linux-iommu/20200611145518.0c2817d6@x1.home/ Link: https://lore.kernel.org/r/1601051567-54787-4-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 22 +++++++++++----------- drivers/iommu/intel/svm.c | 2 +- include/uapi/linux/iommu.h | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 56dc20980209..0d25d61c36cf 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5475,8 +5475,8 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, /* Size is only valid in address selective invalidation */ if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) - size = to_vtd_size(inv_info->addr_info.granule_size, - inv_info->addr_info.nb_granules); + size = to_vtd_size(inv_info->granu.addr_info.granule_size, + inv_info->granu.addr_info.nb_granules); for_each_set_bit(cache_type, (unsigned long *)&inv_info->cache, @@ -5497,20 +5497,20 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, * granularity. */ if (inv_info->granularity == IOMMU_INV_GRANU_PASID && - (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) - pasid = inv_info->pasid_info.pasid; + (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) + pasid = inv_info->granu.pasid_info.pasid; else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && - (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) - pasid = inv_info->addr_info.pasid; + (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) + pasid = inv_info->granu.addr_info.pasid; switch (BIT(cache_type)) { case IOMMU_CACHE_INV_TYPE_IOTLB: /* HW will ignore LSB bits based on address mask */ if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && size && - (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { + (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n", - inv_info->addr_info.addr, size); + inv_info->granu.addr_info.addr, size); } /* @@ -5518,9 +5518,9 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, * We use npages = -1 to indicate that. */ qi_flush_piotlb(iommu, did, pasid, - mm_to_dma_pfn(inv_info->addr_info.addr), + mm_to_dma_pfn(inv_info->granu.addr_info.addr), (granu == QI_GRAN_NONG_PASID) ? 
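For illustration, the contract is that user space always fills argsz from its own view of the structure, and the kernel bounds-checks it before touching the variable part. A minimal sketch, not the exact call path:

    struct iommu_cache_invalidate_info inv = {
            .argsz       = sizeof(inv),     /* always the user's responsibility */
            .version     = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1,
            .cache       = IOMMU_CACHE_INV_TYPE_IOTLB,
            .granularity = IOMMU_INV_GRANU_PASID,
            /* plus .granu.pasid_info for this granularity */
    };

    /* Kernel side, conceptually: reject anything smaller than the
     * fixed part that precedes the union. */
    if (inv.argsz < offsetof(struct iommu_cache_invalidate_info, granu))
            return -EINVAL;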
-1 : 1 << size, - inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); + inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); if (!info->ats_enabled) break; @@ -5543,7 +5543,7 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, size = 64 - VTD_PAGE_SHIFT; addr = 0; } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) { - addr = inv_info->addr_info.addr; + addr = inv_info->granu.addr_info.addr; } if (info->ats_enabled) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index cb14fdc65b7e..2d20c2eda032 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -389,7 +389,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, spin_lock(&iommu->lock); ret = intel_pasid_setup_nested(iommu, dev, (pgd_t *)(uintptr_t)data->gpgd, - data->hpasid, &data->vtd, dmar_domain, + data->hpasid, &data->vendor.vtd, dmar_domain, data->addr_width); spin_unlock(&iommu->lock); if (ret) { diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index c2b2caf9ed41..827e429ca706 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -263,7 +263,7 @@ struct iommu_cache_invalidate_info { union { struct iommu_inv_pasid_info pasid_info; struct iommu_inv_addr_info addr_info; - }; + } granu; }; /** @@ -327,7 +327,7 @@ struct iommu_gpasid_bind_data { /* Vendor specific data */ union { struct iommu_gpasid_bind_data_vtd vtd; - }; + } vendor; }; #endif /* _UAPI_IOMMU_H */ -- Gitee From 3e464fb9ceaec36ca5bb5e49889627ae58fb0953 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 25 Sep 2020 09:32:43 -0700 Subject: [PATCH 0347/2161] iommu/uapi: Add argsz for user filled data commit 1e6aaae93e9ddb9dc664993eb949b1da94cab3a5 upstream. As IOMMU UAPI gets extended, user data size may increase. To support backward compatibiliy, this patch introduces a size field to each UAPI data structures. It is *always* the responsibility for the user to fill in the correct size. Padding fields are adjusted to ensure 8 byte alignment. Specific scenarios for user data handling are documented in: Documentation/userspace-api/iommu.rst As there is no current users of the API, struct version is not incremented. 
Signed-off-by: Liu Yi L Signed-off-by: Jacob Pan Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/1601051567-54787-3-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/iommu.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 827e429ca706..5946779ac1f9 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -139,6 +139,7 @@ enum iommu_page_response_code { /** * struct iommu_page_response - Generic page response information + * @argsz: User filled size of this data * @version: API version of this structure * @flags: encodes whether the corresponding fields are valid * (IOMMU_FAULT_PAGE_RESPONSE_* values) @@ -147,6 +148,7 @@ enum iommu_page_response_code { * @code: response code from &enum iommu_page_response_code */ struct iommu_page_response { + __u32 argsz; #define IOMMU_PAGE_RESP_VERSION_1 1 __u32 version; #define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) @@ -222,6 +224,7 @@ struct iommu_inv_pasid_info { /** * struct iommu_cache_invalidate_info - First level/stage invalidation * information + * @argsz: User filled size of this data * @version: API version of this structure * @cache: bitfield that allows to select which caches to invalidate * @granularity: defines the lowest granularity used for the invalidation: @@ -250,6 +253,7 @@ struct iommu_inv_pasid_info { * must support the used granularity. */ struct iommu_cache_invalidate_info { + __u32 argsz; #define IOMMU_CACHE_INVALIDATE_INFO_VERSION_1 1 __u32 version; /* IOMMU paging structure cache */ @@ -259,7 +263,7 @@ struct iommu_cache_invalidate_info { #define IOMMU_CACHE_INV_TYPE_NR (3) __u8 cache; __u8 granularity; - __u8 padding[2]; + __u8 padding[6]; union { struct iommu_inv_pasid_info pasid_info; struct iommu_inv_addr_info addr_info; @@ -296,6 +300,7 @@ struct iommu_gpasid_bind_data_vtd { /** * struct iommu_gpasid_bind_data - Information about device and guest PASID binding + * @argsz: User filled size of this data * @version: Version of this data structure * @format: PASID table entry format * @flags: Additional information on guest bind request @@ -313,17 +318,18 @@ struct iommu_gpasid_bind_data_vtd { * PASID to host PASID based on this bind data. */ struct iommu_gpasid_bind_data { + __u32 argsz; #define IOMMU_GPASID_BIND_VERSION_1 1 __u32 version; #define IOMMU_PASID_FORMAT_INTEL_VTD 1 __u32 format; + __u32 addr_width; #define IOMMU_SVA_GPASID_VAL (1 << 0) /* guest PASID valid */ __u64 flags; __u64 gpgd; __u64 hpasid; __u64 gpasid; - __u32 addr_width; - __u8 padding[12]; + __u8 padding[8]; /* Vendor specific data */ union { struct iommu_gpasid_bind_data_vtd vtd; -- Gitee From a4fdcec2cdba28fd925650adaedae000027767b1 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 25 Sep 2020 09:32:47 -0700 Subject: [PATCH 0348/2161] iommu/vt-d: Check UAPI data processed by IOMMU core commit 6278eecba31f3983fe2743fc01b198433aa18247 upstream. IOMMU generic layer already does sanity checks on UAPI data for version match and argsz range based on generic information. 
This patch adjusts the following data checking responsibilities: - removes the redundant version check from VT-d driver - removes the check for vendor specific data size - adds check for the use of reserved/undefined flags Signed-off-by: Jacob Pan Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/1601051567-54787-7-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 3 +-- drivers/iommu/intel/svm.c | 11 +++++++++-- include/uapi/linux/iommu.h | 1 + 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 0d25d61c36cf..3901230bc011 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5449,8 +5449,7 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, int ret = 0; u64 size = 0; - if (!inv_info || !dmar_domain || - inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1) + if (!inv_info || !dmar_domain) return -EINVAL; if (!dev || !dev_is_pci(dev)) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 2d20c2eda032..cc867736b1a2 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -303,8 +303,15 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, if (WARN_ON(!iommu) || !data) return -EINVAL; - if (data->version != IOMMU_GPASID_BIND_VERSION_1 || - data->format != IOMMU_PASID_FORMAT_INTEL_VTD) + if (data->format != IOMMU_PASID_FORMAT_INTEL_VTD) + return -EINVAL; + + /* IOMMU core ensures argsz is more than the start of the union */ + if (data->argsz < offsetofend(struct iommu_gpasid_bind_data, vendor.vtd)) + return -EINVAL; + + /* Make sure no undefined flags are used in vendor data */ + if (data->vendor.vtd.flags & ~(IOMMU_SVA_VTD_GPASID_LAST - 1)) return -EINVAL; if (!dev_is_pci(dev)) diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 5946779ac1f9..7345dd160773 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -288,6 +288,7 @@ struct iommu_gpasid_bind_data_vtd { #define IOMMU_SVA_VTD_GPASID_PWT (1 << 3) /* page-level write through */ #define IOMMU_SVA_VTD_GPASID_EMTE (1 << 4) /* extended mem type enable */ #define IOMMU_SVA_VTD_GPASID_CD (1 << 5) /* PASID-level cache disable */ +#define IOMMU_SVA_VTD_GPASID_LAST (1 << 6) __u64 flags; __u32 pat; __u32 emt; -- Gitee From 346b8ac6918ec1831ab002ccb650cda2392c508f Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:02 +0100 Subject: [PATCH 0349/2161] x86/msi: Only use high bits of MSI address for DMAR unit commit 47bea873cf809f490cfac0c4e88533fd7fed6064 upstream. The Intel IOMMU has an MSI-like configuration for its interrupt, but it isn't really MSI. So it gets to abuse the high 32 bits of the address, and puts the high 24 bits of the extended APIC ID there. This isn't something that can be used in the general case for real MSIs, since external devices using the high bits of the address would be performing writes to actual memory space above 4GiB, not targeted at the APIC. Factor the hack out and allow it only to be used when appropriate, adding a WARN_ON_ONCE() if other MSIs are targeted at an unreachable APIC ID. That should never happen since the compatibility MSI messages are not used when Interrupt Remapping is enabled. The x2apic_enabled() check isn't needed because Linux won't bring up CPUs with higher APIC IDs unless IR and x2apic are enabled anyway. 
Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-3-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/apic/msi.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 6a6cb95e6296..a302c3675e2b 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -23,13 +23,11 @@ static struct irq_domain *msi_default_domain; -static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) +static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, + bool dmar) { msg->address_hi = MSI_ADDR_BASE_HI; - if (x2apic_enabled()) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); - msg->address_lo = MSI_ADDR_BASE_LO | ((apic->irq_dest_mode == 0) ? @@ -43,18 +41,29 @@ static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) MSI_DATA_LEVEL_ASSERT | MSI_DATA_DELIVERY_FIXED | MSI_DATA_VECTOR(cfg->vector); + + /* + * Only the IOMMU itself can use the trick of putting destination + * APIC ID into the high bits of the address. Anything else would + * just be writing to memory if it tried that, and needs IR to + * address higher APIC IDs. + */ + if (dmar) + msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); + else + WARN_ON_ONCE(MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid)); } void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) { - __irq_msi_compose_msg(irqd_cfg(data), msg); + __irq_msi_compose_msg(irqd_cfg(data), msg, false); } static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg) { struct msi_msg msg[2] = { [1] = { }, }; - __irq_msi_compose_msg(cfg, msg); + __irq_msi_compose_msg(cfg, msg, false); irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg); } @@ -306,6 +315,17 @@ struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent, #endif #ifdef CONFIG_DMAR_TABLE +/* + * The Intel IOMMU (ab)uses the high bits of the MSI address to contain the + * high bits of the destination APIC ID. This can't be done in the general + * case for MSIs as it would be targeting real memory above 4GiB not the + * APIC. + */ +static void dmar_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + __irq_msi_compose_msg(irqd_cfg(data), msg, true); +} + static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg) { dmar_msi_write(data->irq, msg); @@ -318,6 +338,7 @@ static struct irq_chip dmar_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = dmar_msi_compose_msg, .irq_write_msi_msg = dmar_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -- Gitee From 459094d290f2c65b5ac433822a5094f5197b98d4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:05 +0100 Subject: [PATCH 0350/2161] x86/apic: Cleanup delivery mode defines commit 721612994f53ed600b39a80d912b10f51960e2e3 upstream. The enum ioapic_irq_destination_types and the enumerated constants starting with 'dest_' are gross misnomers because they describe the delivery mode. Rename then enum and the constants so they actually make sense. 
Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-6-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/apic.h | 3 ++- arch/x86/include/asm/apicdef.h | 16 +++++++--------- arch/x86/kernel/apic/apic_flat_64.c | 4 ++-- arch/x86/kernel/apic/apic_noop.c | 2 +- arch/x86/kernel/apic/apic_numachip.c | 4 ++-- arch/x86/kernel/apic/bigsmp_32.c | 2 +- arch/x86/kernel/apic/io_apic.c | 11 ++++++----- arch/x86/kernel/apic/probe_32.c | 2 +- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- arch/x86/kernel/apic/x2apic_phys.c | 2 +- drivers/iommu/amd_iommu.c | 4 ++-- drivers/iommu/intel/irq_remapping.c | 2 +- drivers/pci/controller/pci-hyperv.c | 6 +++--- 13 files changed, 30 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index c4895b3897eb..d3966daae3ac 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -299,7 +299,8 @@ struct apic { /* dest_logical is used by the IPI functions */ u32 dest_logical; u32 disable_esr; - u32 irq_delivery_mode; + + enum apic_delivery_modes delivery_mode; u32 irq_dest_mode; u32 (*calc_dest_apicid)(unsigned int cpu); diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 05e694ed8386..5716f22f81ac 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -432,15 +432,13 @@ struct local_apic { #define BAD_APICID 0xFFFFu #endif -enum ioapic_irq_destination_types { - dest_Fixed = 0, - dest_LowestPrio = 1, - dest_SMI = 2, - dest__reserved_1 = 3, - dest_NMI = 4, - dest_INIT = 5, - dest__reserved_2 = 6, - dest_ExtINT = 7 +enum apic_delivery_modes { + APIC_DELIVERY_MODE_FIXED = 0, + APIC_DELIVERY_MODE_LOWESTPRIO = 1, + APIC_DELIVERY_MODE_SMI = 2, + APIC_DELIVERY_MODE_NMI = 4, + APIC_DELIVERY_MODE_INIT = 5, + APIC_DELIVERY_MODE_EXTINT = 7, }; #endif /* _ASM_X86_APICDEF_H */ diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 7862b152a052..fdd38a17f835 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -113,7 +113,7 @@ static struct apic apic_flat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, .irq_dest_mode = 1, /* logical */ .disable_esr = 0, @@ -206,7 +206,7 @@ static struct apic apic_physflat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, .irq_dest_mode = 0, /* physical */ .disable_esr = 0, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 98c9bb75d185..0bb64e1d0150 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -94,7 +94,7 @@ struct apic apic_noop __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = noop_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 1, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index cdf45b4700f2..fe1a260c85c0 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -246,7 +246,7 @@ static const struct apic apic_numachip1 __refconst = { 
.apic_id_valid = numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, .irq_dest_mode = 0, /* physical */ .disable_esr = 0, @@ -295,7 +295,7 @@ static const struct apic apic_numachip2 __refconst = { .apic_id_valid = numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, .irq_dest_mode = 0, /* physical */ .disable_esr = 0, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 98d015a4405a..7f6461f5d349 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -127,7 +127,7 @@ static struct apic apic_bigsmp __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = bigsmp_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, /* phys delivery to target CPU: */ .irq_dest_mode = 0, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4672b9455e84..1ee9172ba79d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -548,7 +548,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) /* Check delivery_mode to be sure we're not clearing an SMI pin */ entry = ioapic_read_entry(apic, pin); - if (entry.delivery_mode == dest_SMI) + if (entry.delivery_mode == APIC_DELIVERY_MODE_SMI) return; /* @@ -1391,7 +1391,8 @@ void __init enable_IO_APIC(void) /* If the interrupt line is enabled and in ExtInt mode * I have found the pin where the i8259 is connected. */ - if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + if ((entry.mask == 0) && + (entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT)) { ioapic_i8259.apic = apic; ioapic_i8259.pin = pin; goto found_i8259; @@ -1439,7 +1440,7 @@ void native_restore_boot_irq_mode(void) entry.trigger = IOAPIC_EDGE; entry.polarity = IOAPIC_POL_HIGH; entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; - entry.delivery_mode = dest_ExtINT; + entry.delivery_mode = APIC_DELIVERY_MODE_EXTINT; entry.dest = read_apic_id(); /* @@ -2070,7 +2071,7 @@ static inline void __init unlock_ExtINT_logic(void) entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; entry1.mask = IOAPIC_UNMASKED; entry1.dest = hard_smp_processor_id(); - entry1.delivery_mode = dest_ExtINT; + entry1.delivery_mode = APIC_DELIVERY_MODE_EXTINT; entry1.polarity = entry0.polarity; entry1.trigger = IOAPIC_EDGE; entry1.vector = 0; @@ -2971,7 +2972,7 @@ static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, struct IO_APIC_route_entry *entry) { memset(entry, 0, sizeof(*entry)); - entry->delivery_mode = apic->irq_delivery_mode; + entry->delivery_mode = apic->delivery_mode; entry->dest_mode = apic->irq_dest_mode; entry->dest = cfg->dest_apicid; entry->vector = cfg->vector; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 7bda71def557..def5b12ca475 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -69,7 +69,7 @@ static struct apic apic_default __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 1, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 7eec3c154fa2..2ece30887d02 100644 --- 
a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -186,7 +186,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, .irq_dest_mode = 1, /* logical */ .disable_esr = 0, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 032a00e5d9fa..032b5fc45f0c 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -160,7 +160,7 @@ static struct apic apic_x2apic_phys __ro_after_init = { .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, .irq_dest_mode = 0, /* physical */ .disable_esr = 0, diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 32f6d165775a..4d7cee899f1a 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3614,7 +3614,7 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, data->irq_2_irte.devid = devid; data->irq_2_irte.index = index + sub_handle; - iommu->irte_ops->prepare(data->entry, apic->irq_delivery_mode, + iommu->irte_ops->prepare(data->entry, apic->delivery_mode, apic->irq_dest_mode, irq_cfg->vector, irq_cfg->dest_apicid, devid); @@ -3883,7 +3883,7 @@ int amd_iommu_deactivate_guest_mode(void *data) entry->lo.fields_remap.valid = valid; entry->lo.fields_remap.dm = apic->irq_dest_mode; - entry->lo.fields_remap.int_type = apic->irq_delivery_mode; + entry->lo.fields_remap.int_type = apic->delivery_mode; entry->hi.fields.vector = cfg->vector; entry->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(cfg->dest_apicid); diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index aedaae4630bc..ec18fc7185a5 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1122,7 +1122,7 @@ static void prepare_irte(struct irte *irte, int vector, unsigned int dest) * irq migration in the presence of interrupt-remapping. */ irte->trigger_mode = 0; - irte->dlvry_mode = apic->irq_delivery_mode; + irte->dlvry_mode = apic->delivery_mode; irte->vector = vector; irte->dest_id = IRTE_DEST(dest); irte->redir_hint = 1; diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 8b05db522efe..36235782adab 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1218,7 +1218,7 @@ static void hv_irq_unmask(struct irq_data *data) params->int_target.vector = cfg->vector; /* - * Honoring apic->irq_delivery_mode set to dest_Fixed by + * Honoring apic->delivery_mode set to APIC_DELIVERY_MODE_FIXED by * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a * spurious interrupt storm. Not doing so does not seem to have a * negative effect (yet?). 
@@ -1302,7 +1302,7 @@ static u32 hv_compose_msi_req_v1( int_pkt->wslot.slot = slot; int_pkt->int_desc.vector = vector; int_pkt->int_desc.vector_count = 1; - int_pkt->int_desc.delivery_mode = dest_Fixed; + int_pkt->int_desc.delivery_mode = APIC_DELIVERY_MODE_FIXED; /* * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in @@ -1323,7 +1323,7 @@ static u32 hv_compose_msi_req_v2( int_pkt->wslot.slot = slot; int_pkt->int_desc.vector = vector; int_pkt->int_desc.vector_count = 1; - int_pkt->int_desc.delivery_mode = dest_Fixed; + int_pkt->int_desc.delivery_mode = APIC_DELIVERY_MODE_FIXED; /* * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten -- Gitee From f8183e143f04a3a91ff5cc420c0a053bd541cb3c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:07 +0100 Subject: [PATCH 0351/2161] x86/apic: Get rid of apic:: Dest_logical commit e57d04e5fa00f7649d4c00796f8d12054799be4a upstream. struct apic has two members which store information about the destination mode: dest_logical and irq_dest_mode. dest_logical contains a mask which was historically used to set the destination mode in IPI messages. Over time the usage was reduced and the logical/physical functions were seperated. There are only a few places which still use 'dest_logical' but they can use 'irq_dest_mode' instead. irq_dest_mode is actually a boolean where 0 means physical destination mode and 1 means logical destination mode. Of course the name does not reflect the functionality. This will be cleaned up in a subsequent change. Remove apic::dest_logical and fixup the remaining users. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-8-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/apic.h | 2 -- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/apic_flat_64.c | 8 ++------ arch/x86/kernel/apic/apic_noop.c | 4 +--- arch/x86/kernel/apic/apic_numachip.c | 8 ++------ arch/x86/kernel/apic/bigsmp_32.c | 4 +--- arch/x86/kernel/apic/probe_32.c | 4 +--- arch/x86/kernel/apic/x2apic_cluster.c | 4 +--- arch/x86/kernel/apic/x2apic_phys.c | 4 +--- arch/x86/kernel/smpboot.c | 5 +++-- arch/x86/xen/apic.c | 4 +--- 11 files changed, 14 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index d3966daae3ac..4f8d99dea3a2 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -296,8 +296,6 @@ struct apic { void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); - /* dest_logical is used by the IPI functions */ - u32 dest_logical; u32 disable_esr; enum apic_delivery_modes delivery_mode; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index c9a70e084d52..7e94f35fdee8 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1641,7 +1641,7 @@ static void setup_local_APIC(void) apic->init_apic_ldr(); #ifdef CONFIG_X86_32 - if (apic->dest_logical) { + if (apic->irq_dest_mode == 1) { int logical_apicid, ldr_apicid; /* diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index fdd38a17f835..43033f04441f 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -117,11 +117,9 @@ static struct apic apic_flat __ro_after_init = { .irq_dest_mode = 1, /* logical */ .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = NULL, + .check_apicid_used = NULL, 
.init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, @@ -210,11 +208,9 @@ static struct apic apic_physflat __ro_after_init = { .irq_dest_mode = 0, /* physical */ .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = physflat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 0bb64e1d0150..34d08eed84db 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -99,11 +99,9 @@ struct apic apic_noop __ro_after_init = { .irq_dest_mode = 1, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = default_check_apicid_used, + .check_apicid_used = default_check_apicid_used, .init_apic_ldr = noop_init_apic_ldr, - .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = NULL, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index fe1a260c85c0..66f64f28de33 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -250,11 +250,9 @@ static const struct apic apic_numachip1 __refconst = { .irq_dest_mode = 0, /* physical */ .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, @@ -299,11 +297,9 @@ static const struct apic apic_numachip2 __refconst = { .irq_dest_mode = 0, /* physical */ .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 7f6461f5d349..64c375b8c54e 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -132,11 +132,9 @@ static struct apic apic_bigsmp __ro_after_init = { .irq_dest_mode = 0, .disable_esr = 1, - .dest_logical = 0, - .check_apicid_used = bigsmp_check_apicid_used, + .check_apicid_used = bigsmp_check_apicid_used, .init_apic_ldr = bigsmp_init_apic_ldr, - .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, .setup_apic_routing = bigsmp_setup_apic_routing, .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index def5b12ca475..f642f31830bf 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -74,11 +74,9 @@ static struct apic apic_default __ro_after_init = { .irq_dest_mode = 1, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = default_check_apicid_used, + .check_apicid_used = default_check_apicid_used, .init_apic_ldr = default_init_apic_ldr, - .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = setup_apic_flat_routing, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 2ece30887d02..7a3b0a0527e5 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -190,11 +190,9 @@ static struct apic apic_x2apic_cluster __ro_after_init = { 
.irq_dest_mode = 1, /* logical */ .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = init_x2apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 032b5fc45f0c..5385aaa3fc82 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -164,11 +164,9 @@ static struct apic apic_x2apic_phys __ro_after_init = { .irq_dest_mode = 0, /* physical */ .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = init_x2apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8367bd7a9a81..f94af399d05e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -753,13 +753,14 @@ static void __init smp_quirk_init_udelay(void) int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) { + u32 dm = apic->irq_dest_mode ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; unsigned long send_status, accept_status = 0; int maxlvt; /* Target chip */ /* Boot on the stack */ /* Kick the second */ - apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid); + apic_icr_write(APIC_DM_NMI | dm, apicid); pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -986,7 +987,7 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, if (!boot_error) { enable_start_cpu0 = 1; *cpu0_nmi_registered = 1; - if (apic->dest_logical == APIC_DEST_LOGICAL) + if (apic->irq_dest_mode) id = cpu0_logical_apicid; else id = apicid; diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 2df7d089ad54..bb4471c92e23 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -163,11 +163,9 @@ static struct apic xen_pv_apic = { /* .irq_dest_mode - used in native_compose_msi_msg only */ .disable_esr = 0, - /* .dest_logical - default_send_IPI_ use it but we use our own. */ - .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */ + .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */ .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */ - .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */ .setup_apic_routing = NULL, .cpu_present_to_apicid = xen_cpu_present_to_apicid, -- Gitee From d21ab0b410e90ac42be330d9e113b15f81bdf79e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:03 +0100 Subject: [PATCH 0352/2161] x86/apic/uv: Fix inconsistent destination mode commit 93b7a3d6a1f0f159d390959de7a1b9ad863d6b26 upstream. The UV x2apic is strictly using physical destination mode, but apic::dest_logical is initialized with APIC_DEST_LOGICAL. This does not matter much because UV does not use any of the generic functions which use apic::dest_logical, but is still inconsistent. 
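UV does not call the generic helpers that consult this field, so the stale value is harmless in practice; the sketch below (lifted from the wakeup path touched in the previous patch, purely illustrative) shows what it would mean if it were consulted:

	/* UV: irq_dest_mode == 0 (physical), yet dest_logical said APIC_DEST_LOGICAL */
	if (apic->dest_logical == APIC_DEST_LOGICAL)
		id = cpu0_logical_apicid;	/* logical branch would be taken ... */
	else
		id = apicid;			/* ... although UV is strictly physical */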
Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-4-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/apic/x2apic_uv_x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index e6230af19864..83cf4e4bdf56 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -651,7 +651,7 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .irq_dest_mode = 0, /* Physical */ .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, + .dest_logical = APIC_DEST_PHYSICAL, .check_apicid_used = NULL, .init_apic_ldr = uv_init_apic_ldr, -- Gitee From 0b3ca91a7ac2a9f98b5d782a5c8cd6cbd288de46 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:06 +0100 Subject: [PATCH 0353/2161] x86/apic: Replace pointless apic:: Dest_logical usage commit 22e0db42097b30a49771744e514350b7e9dd26f2 upstream. All these functions are only used for logical destination mode. So reading the destination mode mask from the apic structure is a pointless exercise. Just hand in the proper constant: APIC_DEST_LOGICAL. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-7-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/apic/apic_flat_64.c | 2 +- arch/x86/kernel/apic/ipi.c | 6 +++--- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 43033f04441f..bbb1b89fe711 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -53,7 +53,7 @@ static void _flat_send_IPI_mask(unsigned long mask, int vector) unsigned long flags; local_irq_save(flags); - __default_send_IPI_dest_field(mask, vector, apic->dest_logical); + __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 387154e39e08..d1fb874fbe64 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -260,7 +260,7 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, for_each_cpu(query_cpu, mask) __default_send_IPI_dest_field( early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); + vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } @@ -279,7 +279,7 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, continue; __default_send_IPI_dest_field( early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); + vector, APIC_DEST_LOGICAL); } local_irq_restore(flags); } @@ -297,7 +297,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) local_irq_save(flags); WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); - __default_send_IPI_dest_field(mask, vector, apic->dest_logical); + __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 7a3b0a0527e5..ca81629f55e6 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -63,7 +63,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int 
apic_dest) if (!dest) continue; - __x2apic_send_IPI_dest(dest, vector, apic->dest_logical); + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); /* Remove cluster CPUs from tmpmask */ cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask); } -- Gitee From c83102f73a8f2320494df0865453bba81468f97b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:08 +0100 Subject: [PATCH 0354/2161] x86/apic: Cleanup destination mode commit 8c44963b603db76e3e5f57d90d027657ba43c1fe upstream. apic::irq_dest_mode is actually a boolean, but defined as u32 and named in a way which does not explain what it means. Make it a boolean and rename it to 'dest_mode_logical' Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-9-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/apic.h | 2 +- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/apic_flat_64.c | 4 ++-- arch/x86/kernel/apic/apic_noop.c | 4 +--- arch/x86/kernel/apic/apic_numachip.c | 4 ++-- arch/x86/kernel/apic/bigsmp_32.c | 3 +-- arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/apic/msi.c | 6 +++--- arch/x86/kernel/apic/probe_32.c | 3 +-- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- arch/x86/kernel/apic/x2apic_phys.c | 2 +- arch/x86/kernel/smpboot.c | 7 ++----- arch/x86/xen/apic.c | 3 +-- drivers/iommu/amd_iommu.c | 8 ++++---- drivers/iommu/amd_iommu_types.h | 2 +- drivers/iommu/intel/irq_remapping.c | 2 +- 16 files changed, 24 insertions(+), 32 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 4f8d99dea3a2..b279c2a67fc3 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -299,7 +299,7 @@ struct apic { u32 disable_esr; enum apic_delivery_modes delivery_mode; - u32 irq_dest_mode; + bool dest_mode_logical; u32 (*calc_dest_apicid)(unsigned int cpu); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7e94f35fdee8..2219d2110a59 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1641,7 +1641,7 @@ static void setup_local_APIC(void) apic->init_apic_ldr(); #ifdef CONFIG_X86_32 - if (apic->irq_dest_mode == 1) { + if (apic->dest_mode_logical) { int logical_apicid, ldr_apicid; /* diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index bbb1b89fe711..8f72b4351c9f 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -114,7 +114,7 @@ static struct apic apic_flat __ro_after_init = { .apic_id_registered = flat_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - .irq_dest_mode = 1, /* logical */ + .dest_mode_logical = true, .disable_esr = 0, @@ -205,7 +205,7 @@ static struct apic apic_physflat __ro_after_init = { .apic_id_registered = flat_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - .irq_dest_mode = 0, /* physical */ + .dest_mode_logical = false, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 34d08eed84db..bd982beb53ab 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -95,8 +95,7 @@ struct apic apic_noop __ro_after_init = { .apic_id_registered = noop_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - /* logical delivery broadcast to all CPUs: */ - .irq_dest_mode = 1, + .dest_mode_logical = true, .disable_esr = 0, @@ -104,7 +103,6 @@ struct apic apic_noop __ro_after_init = { 
.init_apic_ldr = noop_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = NULL, - .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 66f64f28de33..8a685864f0cb 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -247,7 +247,7 @@ static const struct apic apic_numachip1 __refconst = { .apic_id_registered = numachip_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - .irq_dest_mode = 0, /* physical */ + .dest_mode_logical = false, .disable_esr = 0, @@ -294,7 +294,7 @@ static const struct apic apic_numachip2 __refconst = { .apic_id_registered = numachip_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - .irq_dest_mode = 0, /* physical */ + .dest_mode_logical = false, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 64c375b8c54e..77555f66c14d 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -128,8 +128,7 @@ static struct apic apic_bigsmp __ro_after_init = { .apic_id_registered = bigsmp_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - /* phys delivery to target CPU: */ - .irq_dest_mode = 0, + .dest_mode_logical = false, .disable_esr = 1, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1ee9172ba79d..915928c53896 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2973,7 +2973,7 @@ static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, { memset(entry, 0, sizeof(*entry)); entry->delivery_mode = apic->delivery_mode; - entry->dest_mode = apic->irq_dest_mode; + entry->dest_mode = apic->dest_mode_logical; entry->dest = cfg->dest_apicid; entry->vector = cfg->vector; entry->trigger = data->trigger; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index a302c3675e2b..330d30d8010c 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -30,9 +30,9 @@ static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, msg->address_lo = MSI_ADDR_BASE_LO | - ((apic->irq_dest_mode == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL : - MSI_ADDR_DEST_MODE_LOGICAL) | + (apic->dest_mode_logical ? 
+ MSI_ADDR_DEST_MODE_LOGICAL : + MSI_ADDR_DEST_MODE_PHYSICAL) | MSI_ADDR_REDIRECTION_CPU | MSI_ADDR_DEST_ID(cfg->dest_apicid); diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index f642f31830bf..a4432ae622de 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -70,8 +70,7 @@ static struct apic apic_default __ro_after_init = { .apic_id_registered = default_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - /* logical delivery broadcast to all CPUs: */ - .irq_dest_mode = 1, + .dest_mode_logical = true, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index ca81629f55e6..f4da9bb69a88 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -187,7 +187,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .apic_id_registered = x2apic_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - .irq_dest_mode = 1, /* logical */ + .dest_mode_logical = true, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 5385aaa3fc82..6bde05a86b4e 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -161,7 +161,7 @@ static struct apic apic_x2apic_phys __ro_after_init = { .apic_id_registered = x2apic_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - .irq_dest_mode = 0, /* physical */ + .dest_mode_logical = false, .disable_esr = 0, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f94af399d05e..4384f323854b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -753,7 +753,7 @@ static void __init smp_quirk_init_udelay(void) int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) { - u32 dm = apic->irq_dest_mode ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; + u32 dm = apic->dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; unsigned long send_status, accept_status = 0; int maxlvt; @@ -987,10 +987,7 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, if (!boot_error) { enable_start_cpu0 = 1; *cpu0_nmi_registered = 1; - if (apic->irq_dest_mode) - id = cpu0_logical_apicid; - else - id = apicid; + id = apic->dest_mode_logical ? 
cpu0_logical_apicid : apicid; boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip); } diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index bb4471c92e23..fe12fab61535 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -159,8 +159,7 @@ static struct apic xen_pv_apic = { .apic_id_valid = xen_id_always_valid, .apic_id_registered = xen_id_always_registered, - /* .irq_delivery_mode - used in native_compose_msi_msg only */ - /* .irq_dest_mode - used in native_compose_msi_msg only */ + /* .delivery_mode and .dest_mode_logical not used by XENPV */ .disable_esr = 0, diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 4d7cee899f1a..4aed9b45553b 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3409,7 +3409,7 @@ static void free_irte(u16 devid, int index) } static void irte_prepare(void *entry, - u32 delivery_mode, u32 dest_mode, + u32 delivery_mode, bool dest_mode, u8 vector, u32 dest_apicid, int devid) { union irte *irte = (union irte *) entry; @@ -3423,7 +3423,7 @@ static void irte_prepare(void *entry, } static void irte_ga_prepare(void *entry, - u32 delivery_mode, u32 dest_mode, + u32 delivery_mode, bool dest_mode, u8 vector, u32 dest_apicid, int devid) { struct irte_ga *irte = (struct irte_ga *) entry; @@ -3615,7 +3615,7 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, data->irq_2_irte.devid = devid; data->irq_2_irte.index = index + sub_handle; iommu->irte_ops->prepare(data->entry, apic->delivery_mode, - apic->irq_dest_mode, irq_cfg->vector, + apic->dest_mode_logical, irq_cfg->vector, irq_cfg->dest_apicid, devid); switch (info->type) { @@ -3882,7 +3882,7 @@ int amd_iommu_deactivate_guest_mode(void *data) entry->hi.val = 0; entry->lo.fields_remap.valid = valid; - entry->lo.fields_remap.dm = apic->irq_dest_mode; + entry->lo.fields_remap.dm = apic->dest_mode_logical; entry->lo.fields_remap.int_type = apic->delivery_mode; entry->hi.fields.vector = cfg->vector; entry->lo.fields_remap.destination = diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 36007799800d..3b72515e6e20 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -890,7 +890,7 @@ struct amd_ir_data { }; struct amd_irte_ops { - void (*prepare)(void *, u32, u32, u8, u32, int); + void (*prepare)(void *, u32, bool, u8, u32, int); void (*activate)(void *, u16, u16); void (*deactivate)(void *, u16, u16); void (*set_affinity)(void *, u16, u16, u8, u32); diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index ec18fc7185a5..4cfb975bc112 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1113,7 +1113,7 @@ static void prepare_irte(struct irte *irte, int vector, unsigned int dest) memset(irte, 0, sizeof(*irte)); irte->present = 1; - irte->dst_mode = apic->irq_dest_mode; + irte->dst_mode = apic->dest_mode_logical; /* * Trigger mode in the IRTE will always be edge, and for IO-APIC, the * actual level or edge trigger will be setup in the IO-APIC -- Gitee From 24184906e1d6a521f571db83f2a7cae1782e3701 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:09 +0100 Subject: [PATCH 0355/2161] x86/apic: Always provide irq_compose_msi_msg() method for vector domain commit f598181acfb36f67e1de138cbe80a7db497f7d8c upstream. This shouldn't be dependent on PCI_MSI. HPET and I/O-APIC can deliver interrupts through MSI without having any PCI in the system at all. 
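With the callback provided by the vector domain itself, any MSI-capable interrupt chip in the hierarchy, PCI or not, can have its message composed by its parent. A rough sketch of that pattern, assuming the generic irq_chip_compose_msi_msg() hierarchy walk (the final write-out is device specific and only hinted at):

	static void sketch_update_msi(struct irq_data *irqd)
	{
		struct msi_msg msg;

		/* Walks up the domain hierarchy; the vector domain fills in msg */
		if (irq_chip_compose_msi_msg(irqd, &msg))
			return;

		/* device specific: program msg.address_lo/hi and msg.data */
	}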
Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-10-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/apic.h | 8 +++----- arch/x86/kernel/apic/apic.c | 32 ++++++++++++++++++++++++++++++ arch/x86/kernel/apic/msi.c | 37 ----------------------------------- arch/x86/kernel/apic/vector.c | 6 ++++++ 4 files changed, 41 insertions(+), 42 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index b279c2a67fc3..91e62f6b18eb 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -510,12 +510,10 @@ static inline void apic_smt_update(void) { } #endif struct msi_msg; +struct irq_cfg; -#ifdef CONFIG_PCI_MSI -void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg); -#else -# define x86_vector_msi_compose_msg NULL -#endif +extern void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, + bool dmar); extern void irq_enter(void); extern void irq_exit(void); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2219d2110a59..18b75ec2318b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -2530,6 +2531,37 @@ int hard_smp_processor_id(void) return read_apic_id(); } +void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, + bool dmar) +{ + msg->address_hi = MSI_ADDR_BASE_HI; + + msg->address_lo = + MSI_ADDR_BASE_LO | + (apic->dest_mode_logical ? + MSI_ADDR_DEST_MODE_LOGICAL : + MSI_ADDR_DEST_MODE_PHYSICAL) | + MSI_ADDR_REDIRECTION_CPU | + MSI_ADDR_DEST_ID(cfg->dest_apicid); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + MSI_DATA_DELIVERY_FIXED | + MSI_DATA_VECTOR(cfg->vector); + + /* + * Only the IOMMU itself can use the trick of putting destination + * APIC ID into the high bits of the address. Anything else would + * just be writing to memory if it tried that, and needs IR to + * address higher APIC IDs. + */ + if (dmar) + msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); + else + WARN_ON_ONCE(MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid)); +} + /* * Override the generic EOI implementation with an optimized version. * Only called during early boot when only one CPU is active and with diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 330d30d8010c..f3bd575a1571 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -23,42 +22,6 @@ static struct irq_domain *msi_default_domain; -static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, - bool dmar) -{ - msg->address_hi = MSI_ADDR_BASE_HI; - - msg->address_lo = - MSI_ADDR_BASE_LO | - (apic->dest_mode_logical ? - MSI_ADDR_DEST_MODE_LOGICAL : - MSI_ADDR_DEST_MODE_PHYSICAL) | - MSI_ADDR_REDIRECTION_CPU | - MSI_ADDR_DEST_ID(cfg->dest_apicid); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - MSI_DATA_DELIVERY_FIXED | - MSI_DATA_VECTOR(cfg->vector); - - /* - * Only the IOMMU itself can use the trick of putting destination - * APIC ID into the high bits of the address. Anything else would - * just be writing to memory if it tried that, and needs IR to - * address higher APIC IDs. 
- */ - if (dmar) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); - else - WARN_ON_ONCE(MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid)); -} - -void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) -{ - __irq_msi_compose_msg(irqd_cfg(data), msg, false); -} - static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg) { struct msi_msg msg[2] = { [1] = { }, }; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 1c2cdcb42880..33291daab1b4 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -817,6 +817,12 @@ void apic_ack_edge(struct irq_data *irqd) apic_ack_irq(irqd); } +static void x86_vector_msi_compose_msg(struct irq_data *data, + struct msi_msg *msg) +{ + __irq_msi_compose_msg(irqd_cfg(data), msg, false); +} + static struct irq_chip lapic_controller = { .name = "APIC", .irq_ack = apic_ack_edge, -- Gitee From 9c94c2392a7b04549dc35d600bbfa0fe8c65224a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:11 +0100 Subject: [PATCH 0356/2161] genirq/msi: Allow shadow declarations of msi_msg:: $member commit 8073c1ac82c12aaf1b475a3ce5328d43b3eaa4ae upstream. Architectures like x86 have their MSI messages in various bits of the data, address_lo and address_hi field. Composing or decomposing these messages with bitmasks and shifts is possible, but unreadable gunk. Allow architectures to provide an architecture specific representation for each member of msi_msg. Provide empty defaults for each and stick them into an union. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-12-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/asm-generic/msi.h | 4 ++++ include/linux/msi.h | 46 +++++++++++++++++++++++++++++++++++---- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/include/asm-generic/msi.h b/include/asm-generic/msi.h index e6795f088bdd..25344de0e8f9 100644 --- a/include/asm-generic/msi.h +++ b/include/asm-generic/msi.h @@ -4,6 +4,8 @@ #include +#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN + #ifndef NUM_MSI_ALLOC_SCRATCHPAD_REGS # define NUM_MSI_ALLOC_SCRATCHPAD_REGS 2 #endif @@ -30,4 +32,6 @@ typedef struct msi_alloc_info { #define GENERIC_MSI_DOMAIN_OPS 1 +#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ + #endif diff --git a/include/linux/msi.h b/include/linux/msi.h index 9c2e5f65a9ff..3de0c1c41862 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -4,11 +4,50 @@ #include #include +#include + +/* Dummy shadow structures if an architecture does not define them */ +#ifndef arch_msi_msg_addr_lo +typedef struct arch_msi_msg_addr_lo { + u32 address_lo; +} __attribute__ ((packed)) arch_msi_msg_addr_lo_t; +#endif + +#ifndef arch_msi_msg_addr_hi +typedef struct arch_msi_msg_addr_hi { + u32 address_hi; +} __attribute__ ((packed)) arch_msi_msg_addr_hi_t; +#endif + +#ifndef arch_msi_msg_data +typedef struct arch_msi_msg_data { + u32 data; +} __attribute__ ((packed)) arch_msi_msg_data_t; +#endif +/** + * msi_msg - Representation of a MSI message + * @address_lo: Low 32 bits of msi message address + * @arch_addrlo: Architecture specific shadow of @address_lo + * @address_hi: High 32 bits of msi message address + * (only used when device supports it) + * @arch_addrhi: Architecture specific shadow of @address_hi + * @data: MSI message data (usually 16 bits) + * @arch_data: Architecture specific shadow of @data + */ struct msi_msg { - u32 address_lo; /* low 
32 bits of msi message address */ - u32 address_hi; /* high 32 bits of msi message address */ - u32 data; /* 16 bits of msi message data */ + union { + u32 address_lo; + arch_msi_msg_addr_lo_t arch_addr_lo; + }; + union { + u32 address_hi; + arch_msi_msg_addr_hi_t arch_addr_hi; + }; + union { + u32 data; + arch_msi_msg_data_t arch_data; + }; }; extern int pci_msi_ignore_mask; @@ -228,7 +267,6 @@ struct msi_controller { #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN #include -#include struct irq_domain; struct irq_domain_ops; -- Gitee From 4b5629a25348e13ce3645088d5b0341581fd7aae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:12 +0100 Subject: [PATCH 0357/2161] x86/msi: Provide msi message shadow structs commit 6285aa507366729c618d5295fb540b24a956088a upstream. Create shadow structs with named bitfields for msi_msg data, address_lo and address_hi and use them in the MSI message composer. Provide a function to retrieve the destination ID. This could be inline, but that'd create a circular header dependency. [dwmw2: fix bitfields not all to be a union] Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-13-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/msi.h | 49 +++++++++++++++++++++++++++++++++++++ arch/x86/kernel/apic/apic.c | 35 ++++++++++++++------------ 2 files changed, 68 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h index 25ddd0916bb2..9db196bba88b 100644 --- a/arch/x86/include/asm/msi.h +++ b/arch/x86/include/asm/msi.h @@ -9,6 +9,55 @@ typedef struct irq_alloc_info msi_alloc_info_t; int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg); +/* Structs and defines for the X86 specific MSI message format */ + +typedef struct x86_msi_data { + u32 vector : 8, + delivery_mode : 3, + dest_mode_logical : 1, + reserved : 2, + active_low : 1, + is_level : 1; + + u32 dmar_subhandle; +} __attribute__ ((packed)) arch_msi_msg_data_t; +#define arch_msi_msg_data x86_msi_data + +typedef struct x86_msi_addr_lo { + union { + struct { + u32 reserved_0 : 2, + dest_mode_logical : 1, + redirect_hint : 1, + reserved_1 : 8, + destid_0_7 : 8, + base_address : 12; + }; + struct { + u32 dmar_reserved_0 : 2, + dmar_index_15 : 1, + dmar_subhandle_valid : 1, + dmar_format : 1, + dmar_index_0_14 : 15, + dmar_base_address : 12; + }; + }; +} __attribute__ ((packed)) arch_msi_msg_addr_lo_t; +#define arch_msi_msg_addr_lo x86_msi_addr_lo + +#define X86_MSI_BASE_ADDRESS_LOW (0xfee00000 >> 20) + +typedef struct x86_msi_addr_hi { + u32 reserved : 8, + destid_8_31 : 24; +} __attribute__ ((packed)) arch_msi_msg_addr_hi_t; +#define arch_msi_msg_addr_hi x86_msi_addr_hi + +#define X86_MSI_BASE_ADDRESS_HIGH (0) + +struct msi_msg; +u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid); + void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc); #endif /* _ASM_X86_MSI_H */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 18b75ec2318b..0fddfb8716f8 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include #include @@ -2534,22 +2533,16 @@ int hard_smp_processor_id(void) void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, bool dmar) { - msg->address_hi = MSI_ADDR_BASE_HI; + memset(msg, 0, sizeof(*msg)); - msg->address_lo = - 
MSI_ADDR_BASE_LO | - (apic->dest_mode_logical ? - MSI_ADDR_DEST_MODE_LOGICAL : - MSI_ADDR_DEST_MODE_PHYSICAL) | - MSI_ADDR_REDIRECTION_CPU | - MSI_ADDR_DEST_ID(cfg->dest_apicid); + msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->arch_addr_lo.dest_mode_logical = apic->dest_mode_logical; + msg->arch_addr_lo.destid_0_7 = cfg->dest_apicid & 0xFF; - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - MSI_DATA_DELIVERY_FIXED | - MSI_DATA_VECTOR(cfg->vector); + msg->arch_data.delivery_mode = APIC_DELIVERY_MODE_FIXED; + msg->arch_data.vector = cfg->vector; + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; /* * Only the IOMMU itself can use the trick of putting destination * APIC ID into the high bits of the address. Anything else would @@ -2557,11 +2550,21 @@ void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, * address higher APIC IDs. */ if (dmar) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); + msg->arch_addr_hi.destid_8_31 = cfg->dest_apicid >> 8; else - WARN_ON_ONCE(MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid)); + WARN_ON_ONCE(cfg->dest_apicid > 0xFF); } +u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid) +{ + u32 dest = msg->arch_addr_lo.destid_0_7; + + if (extid) + dest |= msg->arch_addr_hi.destid_8_31 << 8; + return dest; +} +EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); + /* * Override the generic EOI implementation with an optimized version. * Only called during early boot when only one CPU is active and with -- Gitee From bc61190a725b81b0d2085ddc89479cb84de4eb24 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:13 +0100 Subject: [PATCH 0358/2161] iommu/intel: Use msi_msg shadow structs commit 5c0d0e2cc6e0e7a96c25351fd67c775e7b1f11f0 upstream. Use the bitfields in the x86 shadow struct to compose the MSI message. 
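As a worked example of the packing done by fill_msi_msg() below, take a hypothetical interrupt-remapping table index of 0x9ABC with sub-handle 3; the shadow-struct assignments come out as

	msg->arch_addr_lo.dmar_index_0_14 = 0x1ABC;	/* index & 0x7FFF     */
	msg->arch_addr_lo.dmar_index_15   = 1;		/* !!(index & 0x8000) */
	msg->arch_addr_lo.dmar_format     = true;	/* remappable format  */
	msg->arch_data.dmar_subhandle     = 3;

which carries the same information the old MSI_ADDR_IR_INDEX1/INDEX2 shift-and-mask macros encoded by hand.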
Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-14-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/irq_remapping.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 4cfb975bc112..d61e86c3ddec 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -20,7 +20,6 @@ #include #include #include -#include #include "../irq_remapping.h" @@ -1260,6 +1259,21 @@ static struct irq_chip intel_ir_chip = { .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity, }; +static void fill_msi_msg(struct msi_msg *msg, u32 index, u32 subhandle) +{ + memset(msg, 0, sizeof(*msg)); + + msg->arch_addr_lo.dmar_base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->arch_addr_lo.dmar_subhandle_valid = true; + msg->arch_addr_lo.dmar_format = true; + msg->arch_addr_lo.dmar_index_0_14 = index & 0x7FFF; + msg->arch_addr_lo.dmar_index_15 = !!(index & 0x8000); + + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; + + msg->arch_data.dmar_subhandle = subhandle; +} + static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, struct irq_cfg *irq_cfg, struct irq_alloc_info *info, @@ -1267,7 +1281,6 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, { struct IR_IO_APIC_route_entry *entry; struct irte *irte = &data->irte_entry; - struct msi_msg *msg = &data->msi_entry; prepare_irte(irte, irq_cfg->vector, irq_cfg->dest_apicid); switch (info->type) { @@ -1308,12 +1321,7 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, else set_msi_sid(irte, msi_desc_to_pci_dev(info->desc)); - msg->address_hi = MSI_ADDR_BASE_HI; - msg->data = sub_handle; - msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | - MSI_ADDR_IR_SHV | - MSI_ADDR_IR_INDEX1(index) | - MSI_ADDR_IR_INDEX2(index); + fill_msi_msg(&data->msi_entry, index, sub_handle); break; default: -- Gitee From b459059a44e03a6f2915d9b96fda639fbbfb376b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:14 +0100 Subject: [PATCH 0359/2161] iommu/amd: Use msi_msg shadow structs commit b5c3786ee3704bd8cd5b29ae168526f2b1af4557 upstream. Get rid of the macro mess and use the shadow structs for the x86 specific MSI message format. Convert the intcapxt setup to use named bitfields as well while touching it anyway. 
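For the destination ID handling below, a worked example with a hypothetical x2APIC destination of 0x01234567: x86_msi_msg_get_destid() reassembles it from destid_0_7 and destid_8_31, and the intcapxt bitfields then split it again as

	xt.destid_0_23  = 0x234567;	/* destid & GENMASK(23, 0) */
	xt.destid_24_31 = 0x01;		/* destid >> 24            */

so the full 32-bit ID lands in the 24/8 register layout without the old XT_INT_DEST_LO/HI shift macros.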
Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-15-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 14 +++++++---- drivers/iommu/amd_iommu_init.c | 46 +++++++++++++++++++--------------- 2 files changed, 35 insertions(+), 25 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 4aed9b45553b..05174ad99566 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -3599,13 +3598,20 @@ struct irq_remap_ops amd_iommu_irq_ops = { .get_irq_domain = get_irq_domain, }; +static void fill_msi_msg(struct msi_msg *msg, u32 index) +{ + msg->data = index; + msg->address_lo = 0; + msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; +} + static void irq_remapping_prepare_irte(struct amd_ir_data *data, struct irq_cfg *irq_cfg, struct irq_alloc_info *info, int devid, int index, int sub_handle) { struct irq_2_irte *irte_info = &data->irq_2_irte; - struct msi_msg *msg = &data->msi_entry; struct IO_APIC_route_entry *entry; struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; @@ -3636,9 +3642,7 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, case X86_IRQ_ALLOC_TYPE_HPET: case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = MSI_ADDR_BASE_LO; - msg->data = irte_info->index; + fill_msi_msg(&data->msi_entry, irte_info->index); break; default: diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index be098f15131e..64fc6c152096 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -1983,10 +1982,16 @@ static int iommu_setup_msi(struct amd_iommu *iommu) return 0; } -#define XT_INT_DEST_MODE(x) (((x) & 0x1ULL) << 2) -#define XT_INT_DEST_LO(x) (((x) & 0xFFFFFFULL) << 8) -#define XT_INT_VEC(x) (((x) & 0xFFULL) << 32) -#define XT_INT_DEST_HI(x) ((((x) >> 24) & 0xFFULL) << 56) +union intcapxt { + u64 capxt; + u64 reserved_0 : 2, + dest_mode_logical : 1, + reserved_1 : 5, + destid_0_23 : 24, + vector : 8, + reserved_2 : 16, + destid_24_31 : 8; +} __attribute__ ((packed)); /** * Setup the IntCapXT registers with interrupt routing information @@ -1995,28 +2000,29 @@ static int iommu_setup_msi(struct amd_iommu *iommu) */ static void iommu_update_intcapxt(struct amd_iommu *iommu) { - u64 val; - u32 addr_lo = readl(iommu->mmio_base + MMIO_MSI_ADDR_LO_OFFSET); - u32 addr_hi = readl(iommu->mmio_base + MMIO_MSI_ADDR_HI_OFFSET); - u32 data = readl(iommu->mmio_base + MMIO_MSI_DATA_OFFSET); - bool dm = (addr_lo >> MSI_ADDR_DEST_MODE_SHIFT) & 0x1; - u32 dest = ((addr_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xFF); + struct msi_msg msg; + union intcapxt xt; + u32 destid; - if (x2apic_enabled()) - dest |= MSI_ADDR_EXT_DEST_ID(addr_hi); + msg.address_lo = readl(iommu->mmio_base + MMIO_MSI_ADDR_LO_OFFSET); + msg.address_hi = readl(iommu->mmio_base + MMIO_MSI_ADDR_HI_OFFSET); + msg.data = readl(iommu->mmio_base + MMIO_MSI_DATA_OFFSET); - val = XT_INT_VEC(data & 0xFF) | - XT_INT_DEST_MODE(dm) | - XT_INT_DEST_LO(dest) | - XT_INT_DEST_HI(dest); + destid = x86_msi_msg_get_destid(&msg, x2apic_enabled()); + + xt.capxt = 0ULL; + xt.dest_mode_logical = msg.arch_data.dest_mode_logical; + xt.vector = 
msg.arch_data.vector; + xt.destid_0_23 = destid & GENMASK(23, 0); + xt.destid_24_31 = destid >> 24; /** * Current IOMMU implemtation uses the same IRQ for all * 3 IOMMU interrupts. */ - writeq(val, iommu->mmio_base + MMIO_INTCAPXT_EVT_OFFSET); - writeq(val, iommu->mmio_base + MMIO_INTCAPXT_PPR_OFFSET); - writeq(val, iommu->mmio_base + MMIO_INTCAPXT_GALOG_OFFSET); + writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_EVT_OFFSET); + writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_PPR_OFFSET); + writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_GALOG_OFFSET); } static void _irq_notifier_notify(struct irq_affinity_notify *notify, -- Gitee From 2141b6769d292652809cd220c5619dfe66410c0a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:19 +0100 Subject: [PATCH 0360/2161] x86/io_apic: Cleanup trigger/polarity helpers commit a27dca645d2c0f31abb7858aa0e10b2fa0f2f659 upstream. 'trigger' and 'polarity' are used throughout the I/O-APIC code for handling the trigger type (edge/level) and the active low/high configuration. While there are defines for initializing these variables and struct members, they are not used consequently and the meaning of 'trigger' and 'polarity' is opaque and confusing at best. Rename them to 'is_level' and 'active_low' and make them boolean in various structs so it's entirely clear what the meaning is. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-20-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/hw_irq.h | 6 +- arch/x86/kernel/apic/io_apic.c | 244 +++++++++++++--------------- arch/x86/pci/intel_mid_pci.c | 8 +- drivers/iommu/amd_iommu.c | 10 +- drivers/iommu/intel/irq_remapping.c | 9 +- 5 files changed, 130 insertions(+), 147 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 80df0ba2a45b..3e6908b6baa4 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -69,9 +69,9 @@ enum irq_alloc_type { struct ioapic_alloc_info { int pin; int node; - u32 trigger : 1; - u32 polarity : 1; - u32 valid : 1; + u32 is_level : 1; + u32 active_low : 1; + u32 valid : 1; struct IO_APIC_route_entry *entry; }; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 915928c53896..8436b3d56cf2 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -89,12 +89,12 @@ struct irq_pin_list { }; struct mp_chip_data { - struct list_head irq_2_pin; - struct IO_APIC_route_entry entry; - int trigger; - int polarity; + struct list_head irq_2_pin; + struct IO_APIC_route_entry entry; + bool is_level; + bool active_low; + bool isa_irq; u32 count; - bool isa_irq; }; struct mp_ioapic_gsi { @@ -758,44 +758,7 @@ static int __init find_isa_irq_apic(int irq, int type) return -1; } -#ifdef CONFIG_EISA -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < nr_legacy_irqs()) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, KERN_INFO - "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} - -#endif - -/* ISA interrupts are always active high edge triggered, - * when listed as conforming in the MP table. 
*/ - -#define default_ISA_trigger(idx) (IOAPIC_EDGE) -#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH) - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq)) -#define default_EISA_polarity(idx) default_ISA_polarity(idx) - -/* PCI interrupts are always active low level triggered, - * when listed as conforming in the MP table. */ - -#define default_PCI_trigger(idx) (IOAPIC_LEVEL) -#define default_PCI_polarity(idx) (IOAPIC_POL_LOW) - -static int irq_polarity(int idx) +static bool irq_active_low(int idx) { int bus = mp_irqs[idx].srcbus; @@ -804,90 +767,139 @@ static int irq_polarity(int idx) */ switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) { case MP_IRQPOL_DEFAULT: - /* conforms to spec, ie. bus-type dependent polarity */ - if (test_bit(bus, mp_bus_not_pci)) - return default_ISA_polarity(idx); - else - return default_PCI_polarity(idx); + /* + * Conforms to spec, ie. bus-type dependent polarity. PCI + * defaults to low active. [E]ISA defaults to high active. + */ + return !test_bit(bus, mp_bus_not_pci); case MP_IRQPOL_ACTIVE_HIGH: - return IOAPIC_POL_HIGH; + return false; case MP_IRQPOL_RESERVED: pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); /* fall through */ case MP_IRQPOL_ACTIVE_LOW: default: /* Pointless default required due to do gcc stupidity */ - return IOAPIC_POL_LOW; + return true; } } #ifdef CONFIG_EISA -static int eisa_irq_trigger(int idx, int bus, int trigger) +/* + * EISA Edge/Level control register, ELCR + */ +static bool EISA_ELCR(unsigned int irq) +{ + if (irq < nr_legacy_irqs()) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return false; +} + +/* + * EISA interrupts are always active high and can be edge or level + * triggered depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must be + * read in from the ELCR. + */ +static bool eisa_irq_is_level(int idx, int bus, bool level) { switch (mp_bus_id_to_type[bus]) { case MP_BUS_PCI: case MP_BUS_ISA: - return trigger; + return level; case MP_BUS_EISA: - return default_EISA_trigger(idx); + return EISA_ELCR(mp_irqs[idx].srcbusirq); } pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus); - return IOAPIC_LEVEL; + return true; } #else -static inline int eisa_irq_trigger(int idx, int bus, int trigger) +static inline int eisa_irq_is_level(int idx, int bus, bool level) { - return trigger; + return level; } #endif -static int irq_trigger(int idx) +static bool irq_is_level(int idx) { int bus = mp_irqs[idx].srcbus; - int trigger; + bool level; /* * Determine IRQ trigger mode (edge or level sensitive): */ switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) { case MP_IRQTRIG_DEFAULT: - /* conforms to spec, ie. bus-type dependent trigger mode */ - if (test_bit(bus, mp_bus_not_pci)) - trigger = default_ISA_trigger(idx); - else - trigger = default_PCI_trigger(idx); + /* + * Conforms to spec, ie. bus-type dependent trigger + * mode. PCI defaults to egde, ISA to level. 
+ */ + level = test_bit(bus, mp_bus_not_pci); /* Take EISA into account */ - return eisa_irq_trigger(idx, bus, trigger); + return eisa_irq_is_level(idx, bus, level); case MP_IRQTRIG_EDGE: - return IOAPIC_EDGE; + return false; case MP_IRQTRIG_RESERVED: pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); /* fall through */ case MP_IRQTRIG_LEVEL: default: /* Pointless default required due to do gcc stupidity */ - return IOAPIC_LEVEL; + return true; } } +static int __acpi_get_override_irq(u32 gsi, bool *trigger, bool *polarity) +{ + int ioapic, pin, idx; + + if (skip_ioapic_setup) + return -1; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return -1; + + pin = mp_find_ioapic_pin(ioapic, gsi); + if (pin < 0) + return -1; + + idx = find_irq_entry(ioapic, pin, mp_INT); + if (idx < 0) + return -1; + + *trigger = irq_is_level(idx); + *polarity = irq_active_low(idx); + return 0; +} + +#ifdef CONFIG_ACPI +int acpi_get_override_irq(u32 gsi, int *is_level, int *active_low) +{ + *is_level = *active_low = 0; + return __acpi_get_override_irq(gsi, (bool *)is_level, + (bool *)active_low); +} +#endif + void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, int trigger, int polarity) { init_irq_alloc_info(info, NULL); info->type = X86_IRQ_ALLOC_TYPE_IOAPIC; info->ioapic.node = node; - info->ioapic.trigger = trigger; - info->ioapic.polarity = polarity; + info->ioapic.is_level = trigger; + info->ioapic.active_low = polarity; info->ioapic.valid = 1; } -#ifndef CONFIG_ACPI -int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity); -#endif - static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, struct irq_alloc_info *src, u32 gsi, int ioapic_idx, int pin) { - int trigger, polarity; + bool level, pol_low; copy_irq_alloc_info(dst, src); dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC; @@ -896,20 +908,20 @@ static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, dst->ioapic.valid = 1; if (src && src->ioapic.valid) { dst->ioapic.node = src->ioapic.node; - dst->ioapic.trigger = src->ioapic.trigger; - dst->ioapic.polarity = src->ioapic.polarity; + dst->ioapic.is_level = src->ioapic.is_level; + dst->ioapic.active_low = src->ioapic.active_low; } else { dst->ioapic.node = NUMA_NO_NODE; - if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) { - dst->ioapic.trigger = trigger; - dst->ioapic.polarity = polarity; + if (__acpi_get_override_irq(gsi, &level, &pol_low) >= 0) { + dst->ioapic.is_level = level; + dst->ioapic.active_low = pol_low; } else { /* * PCI interrupts are always active low level * triggered. */ - dst->ioapic.trigger = IOAPIC_LEVEL; - dst->ioapic.polarity = IOAPIC_POL_LOW; + dst->ioapic.is_level = true; + dst->ioapic.active_low = true; } } } @@ -919,12 +931,12 @@ static int ioapic_alloc_attr_node(struct irq_alloc_info *info) return (info && info->ioapic.valid) ? info->ioapic.node : NUMA_NO_NODE; } -static void mp_register_handler(unsigned int irq, unsigned long trigger) +static void mp_register_handler(unsigned int irq, bool level) { irq_flow_handler_t hdl; bool fasteoi; - if (trigger) { + if (level) { irq_set_status_flags(irq, IRQ_LEVEL); fasteoi = true; } else { @@ -946,14 +958,14 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) * pin with real trigger and polarity attributes. 
*/ if (irq < nr_legacy_irqs() && data->count == 1) { - if (info->ioapic.trigger != data->trigger) - mp_register_handler(irq, info->ioapic.trigger); - data->entry.trigger = data->trigger = info->ioapic.trigger; - data->entry.polarity = data->polarity = info->ioapic.polarity; + if (info->ioapic.is_level != data->is_level) + mp_register_handler(irq, info->ioapic.is_level); + data->entry.trigger = data->is_level = info->ioapic.is_level; + data->entry.polarity = data->active_low = info->ioapic.active_low; } - return data->trigger == info->ioapic.trigger && - data->polarity == info->ioapic.polarity; + return data->is_level == info->ioapic.is_level && + data->active_low == info->ioapic.active_low; } static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, @@ -2202,9 +2214,9 @@ static inline void __init check_timer(void) * so only need to unmask if it is level-trigger * do we really have level trigger timer? */ - int idx; - idx = find_irq_entry(apic1, pin1, mp_INT); - if (idx != -1 && irq_trigger(idx)) + int idx = find_irq_entry(apic1, pin1, mp_INT); + + if (idx != -1 && irq_is_level(idx)) unmask_ioapic_irq(irq_get_irq_data(0)); } irq_domain_deactivate_irq(irq_data); @@ -2611,30 +2623,6 @@ static int io_apic_get_version(int ioapic) return reg_01.bits.version; } -int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) -{ - int ioapic, pin, idx; - - if (skip_ioapic_setup) - return -1; - - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return -1; - - pin = mp_find_ioapic_pin(ioapic, gsi); - if (pin < 0) - return -1; - - idx = find_irq_entry(ioapic, pin, mp_INT); - if (idx < 0) - return -1; - - *trigger = irq_trigger(idx); - *polarity = irq_polarity(idx); - return 0; -} - /* * This function updates target affinity of IOAPIC interrupts to include * the CPUs which came online during SMP bringup. @@ -2958,13 +2946,13 @@ static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, struct irq_alloc_info *info) { if (info && info->ioapic.valid) { - data->trigger = info->ioapic.trigger; - data->polarity = info->ioapic.polarity; - } else if (acpi_get_override_irq(gsi, &data->trigger, - &data->polarity) < 0) { + data->is_level = info->ioapic.is_level; + data->active_low = info->ioapic.active_low; + } else if (__acpi_get_override_irq(gsi, &data->is_level, + &data->active_low) < 0) { /* PCI interrupts are always active low level triggered. */ - data->trigger = IOAPIC_LEVEL; - data->polarity = IOAPIC_POL_LOW; + data->is_level = true; + data->active_low = true; } } @@ -2976,16 +2964,13 @@ static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, entry->dest_mode = apic->dest_mode_logical; entry->dest = cfg->dest_apicid; entry->vector = cfg->vector; - entry->trigger = data->trigger; - entry->polarity = data->polarity; + entry->trigger = data->is_level; + entry->polarity = data->active_low; /* * Mask level triggered irqs. Edge triggered irqs are masked * by the irq core code in case they fire. 
*/ - if (data->trigger == IOAPIC_LEVEL) - entry->mask = IOAPIC_MASKED; - else - entry->mask = IOAPIC_UNMASKED; + entry->mask = data->is_level; } int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, @@ -3033,7 +3018,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, local_irq_save(flags); if (info->ioapic.entry) mp_setup_entry(cfg, data, info->ioapic.entry); - mp_register_handler(virq, data->trigger); + mp_register_handler(virq, data->is_level); if (virq < nr_legacy_irqs()) legacy_pic->mask(virq); local_irq_restore(flags); @@ -3041,7 +3026,8 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n", ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector, - virq, data->trigger, data->polarity, cfg->dest_apicid); + virq, data->is_level, data->active_low, + cfg->dest_apicid); return 0; } diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index eea5a0f3b959..d6ac842c9505 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -215,7 +215,7 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, static int intel_mid_pci_irq_enable(struct pci_dev *dev) { struct irq_alloc_info info; - int polarity; + bool polarity_low; int ret; u8 gsi; @@ -230,7 +230,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) switch (intel_mid_identify_cpu()) { case INTEL_MID_CPU_CHIP_TANGIER: - polarity = IOAPIC_POL_HIGH; + polarity_low = false; /* Special treatment for IRQ0 */ if (gsi == 0) { @@ -252,11 +252,11 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) } break; default: - polarity = IOAPIC_POL_LOW; + polarity_low = true; break; } - ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity); + ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity_low); /* * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 05174ad99566..c23d90aed128 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3630,13 +3630,11 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, entry = info->ioapic.entry; info->ioapic.entry = NULL; memset(entry, 0, sizeof(*entry)); - entry->vector = index; - entry->mask = 0; - entry->trigger = info->ioapic.trigger; - entry->polarity = info->ioapic.polarity; + entry->vector = index; + entry->trigger = info->ioapic.is_level; + entry->polarity = info->ioapic.active_low; /* Mask level triggered irqs. */ - if (info->ioapic.trigger) - entry->mask = 1; + entry->mask = info->ioapic.is_level; break; case X86_IRQ_ALLOC_TYPE_HPET: diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index d61e86c3ddec..76b8d73a5b68 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1306,11 +1306,10 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, * irq handler will do the explicit EOI to the io-apic. */ entry->vector = info->ioapic.pin; - entry->mask = 0; /* enable IRQ */ - entry->trigger = info->ioapic.trigger; - entry->polarity = info->ioapic.polarity; - if (info->ioapic.trigger) - entry->mask = 1; /* Mask level triggered irqs. */ + entry->trigger = info->ioapic.is_level; + entry->polarity = info->ioapic.active_low; + /* Mask level triggered irqs. 
*/ + entry->mask = info->ioapic.is_level; break; case X86_IRQ_ALLOC_TYPE_HPET: -- Gitee From 54a1a5ce74b200064d0af20726f431f7248632aa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Jul 2020 13:44:16 +0200 Subject: [PATCH 0361/2161] x86/ioapic: Remove unused "IOAPIC_AUTO" define commit 8cd591aeb1d650f79a49d8704c35a78bf18f5de9 upstream. Last use was removed more than 5 years ago, in: 5ad274d41c1b: ("x86/irq: Remove unused old IOAPIC irqdomain interfaces") Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200724114418.629021-2-mingo@kernel.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/io_apic.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index fd20a2334885..a1a26f6d3aa4 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -99,7 +99,6 @@ struct IR_IO_APIC_route_entry { struct irq_alloc_info; struct ioapic_domain_cfg; -#define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 -- Gitee From 61f6853ca3b9a83f2322f4f6fcac84a6d3472a43 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:20 +0100 Subject: [PATCH 0362/2161] x86/ioapic: Cleanup IO/APIC route entry structs commit 341b4a7211b6ba3a7089e1dc09ac4bd576dfb05f upstream. Having two seperate structs for the I/O-APIC RTE entries (non-remapped and DMAR remapped) requires type casts and makes it hard to map. Combine them in IO_APIC_routing_entry by defining a union of two 64bit bitfields. Use naming which reflects which bits are shared and which bits are actually different for the operating modes. [dwmw2: Fix it up and finish the job, pulling the 32-bit w1,w2 words for register access into the same union and eliminating a few more places where bits were accessed through masks and shifts.] 
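For illustration only, not part of the patch: once both register words and both interpretations share one union, decoding an RTE needs neither casts nor the parallel IR_* struct. A minimal sketch, assuming it lives in io_apic.c next to the existing accessors (decode_rte() is a made-up name and the caller is assumed to hold ioapic_lock); the logic mirrors what io_apic_print_entries() becomes after this change:

static void decode_rte(int apic, int pin)
{
	struct IO_APIC_route_entry e;

	/* Both 32-bit register words fill the same 64-bit union. */
	e.w1 = io_apic_read(apic, 0x10 + 2 * pin);
	e.w2 = io_apic_read(apic, 0x11 + 2 * pin);

	if (e.ir_format) {
		/* Remapped: the entry carries an IRTE index. */
		pr_info("pin %02x: IRTE index 0x%04x\n", pin,
			(unsigned int)((e.ir_index_15 << 15) | e.ir_index_0_14));
	} else {
		/* Direct: classic vector/destination/trigger/mask layout. */
		pr_info("pin %02x: vector 0x%02x dest 0x%02x %s %s\n", pin,
			(unsigned int)e.vector, (unsigned int)e.destid_0_7,
			e.is_level ? "level" : "edge",
			e.masked ? "masked" : "unmasked");
	}
}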
Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-21-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/io_apic.h | 78 ++++++--------- arch/x86/kernel/apic/io_apic.c | 144 +++++++++++++--------------- drivers/iommu/amd_iommu.c | 8 +- drivers/iommu/hyperv-iommu.c | 4 +- drivers/iommu/intel/irq_remapping.c | 19 ++-- 5 files changed, 108 insertions(+), 145 deletions(-) diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index a1a26f6d3aa4..73da644b2f0d 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -13,15 +13,6 @@ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar */ -/* I/O Unit Redirection Table */ -#define IO_APIC_REDIR_VECTOR_MASK 0x000FF -#define IO_APIC_REDIR_DEST_LOGICAL 0x00800 -#define IO_APIC_REDIR_DEST_PHYSICAL 0x00000 -#define IO_APIC_REDIR_SEND_PENDING (1 << 12) -#define IO_APIC_REDIR_REMOTE_IRR (1 << 14) -#define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15) -#define IO_APIC_REDIR_MASKED (1 << 16) - /* * The structure of the IO-APIC: */ @@ -65,52 +56,39 @@ union IO_APIC_reg_03 { }; struct IO_APIC_route_entry { - __u32 vector : 8, - delivery_mode : 3, /* 000: FIXED - * 001: lowest prio - * 111: ExtINT - */ - dest_mode : 1, /* 0: physical, 1: logical */ - delivery_status : 1, - polarity : 1, - irr : 1, - trigger : 1, /* 0: edge, 1: level */ - mask : 1, /* 0: enabled, 1: disabled */ - __reserved_2 : 15; - - __u32 __reserved_3 : 24, - dest : 8; -} __attribute__ ((packed)); - -struct IR_IO_APIC_route_entry { - __u64 vector : 8, - zero : 3, - index2 : 1, - delivery_status : 1, - polarity : 1, - irr : 1, - trigger : 1, - mask : 1, - reserved : 31, - format : 1, - index : 15; + union { + struct { + u64 vector : 8, + delivery_mode : 3, + dest_mode_logical : 1, + delivery_status : 1, + active_low : 1, + irr : 1, + is_level : 1, + masked : 1, + reserved_0 : 15, + reserved_1 : 24, + destid_0_7 : 8; + }; + struct { + u64 ir_shared_0 : 8, + ir_zero : 3, + ir_index_15 : 1, + ir_shared_1 : 5, + ir_reserved_0 : 31, + ir_format : 1, + ir_index_0_14 : 15; + }; + struct { + u64 w1 : 32, + w2 : 32; + }; + }; } __attribute__ ((packed)); struct irq_alloc_info; struct ioapic_domain_cfg; -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - -#define IOAPIC_MASKED 1 -#define IOAPIC_UNMASKED 0 - -#define IOAPIC_POL_HIGH 0 -#define IOAPIC_POL_LOW 1 - -#define IOAPIC_DEST_MODE_PHYSICAL 0 -#define IOAPIC_DEST_MODE_LOGICAL 1 - #define IOAPIC_MAP_ALLOC 0x1 #define IOAPIC_MAP_CHECK 0x2 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8436b3d56cf2..cc58f0f93355 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -299,31 +299,26 @@ static void io_apic_write(unsigned int apic, unsigned int reg, writel(value, &io_apic->data); } -union entry_union { - struct { u32 w1, w2; }; - struct IO_APIC_route_entry entry; -}; - static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) { - union entry_union eu; + struct IO_APIC_route_entry entry; - eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); - eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + entry.w1 = io_apic_read(apic, 0x10 + 2 * pin); + entry.w2 = io_apic_read(apic, 0x11 + 2 * pin); - return eu.entry; + return entry; } static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { - union entry_union eu; + struct IO_APIC_route_entry entry; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, 
flags); - eu.entry = __ioapic_read_entry(apic, pin); + entry = __ioapic_read_entry(apic, pin); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return eu.entry; + return entry; } /* @@ -334,11 +329,8 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) */ static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { - union entry_union eu = {{0, 0}}; - - eu.entry = e; - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, e.w2); + io_apic_write(apic, 0x10 + 2*pin, e.w1); } static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) @@ -357,12 +349,12 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) */ static void ioapic_mask_entry(int apic, int pin) { + struct IO_APIC_route_entry e = { .masked = true }; unsigned long flags; - union entry_union eu = { .entry.mask = IOAPIC_MASKED }; raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); - io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, e.w1); + io_apic_write(apic, 0x11 + 2*pin, e.w2); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -435,20 +427,15 @@ static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, add_pin_to_irq_node(data, node, newapic, newpin); } -static void io_apic_modify_irq(struct mp_chip_data *data, - int mask_and, int mask_or, +static void io_apic_modify_irq(struct mp_chip_data *data, bool masked, void (*final)(struct irq_pin_list *entry)) { - union entry_union eu; struct irq_pin_list *entry; - eu.entry = data->entry; - eu.w1 &= mask_and; - eu.w1 |= mask_or; - data->entry = eu.entry; + data->entry.masked = masked; for_each_irq_pin(entry, data->irq_2_pin) { - io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1); + io_apic_write(entry->apic, 0x10 + 2 * entry->pin, data->entry.w1); if (final) final(entry); } @@ -472,13 +459,13 @@ static void mask_ioapic_irq(struct irq_data *irq_data) unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + io_apic_modify_irq(data, true, &io_apic_sync); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void __unmask_ioapic(struct mp_chip_data *data) { - io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL); + io_apic_modify_irq(data, false, NULL); } static void unmask_ioapic_irq(struct irq_data *irq_data) @@ -519,8 +506,8 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector) /* * Mask the entry and change the trigger mode to edge. */ - entry1.mask = IOAPIC_MASKED; - entry1.trigger = IOAPIC_EDGE; + entry1.masked = true; + entry1.is_level = false; __ioapic_write_entry(apic, pin, entry1); @@ -555,8 +542,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * Make sure the entry is masked and re-read the contents to check * if it is a level triggered pin and if the remote-IRR is set. */ - if (entry.mask == IOAPIC_UNMASKED) { - entry.mask = IOAPIC_MASKED; + if (!entry.masked) { + entry.masked = true; ioapic_write_entry(apic, pin, entry); entry = ioapic_read_entry(apic, pin); } @@ -569,8 +556,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * doesn't clear the remote-IRR if the trigger mode is not * set to level. 
*/ - if (entry.trigger == IOAPIC_EDGE) { - entry.trigger = IOAPIC_LEVEL; + if (!entry.is_level) { + entry.is_level = true; ioapic_write_entry(apic, pin, entry); } raw_spin_lock_irqsave(&ioapic_lock, flags); @@ -672,8 +659,8 @@ void mask_ioapic_entries(void) struct IO_APIC_route_entry entry; entry = ioapics[apic].saved_registers[pin]; - if (entry.mask == IOAPIC_UNMASKED) { - entry.mask = IOAPIC_MASKED; + if (!entry.masked) { + entry.masked = true; ioapic_write_entry(apic, pin, entry); } } @@ -960,8 +947,8 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) if (irq < nr_legacy_irqs() && data->count == 1) { if (info->ioapic.is_level != data->is_level) mp_register_handler(irq, info->ioapic.is_level); - data->entry.trigger = data->is_level = info->ioapic.is_level; - data->entry.polarity = data->active_low = info->ioapic.active_low; + data->entry.is_level = data->is_level = info->ioapic.is_level; + data->entry.active_low = data->active_low = info->ioapic.active_low; } return data->is_level == info->ioapic.is_level && @@ -1254,10 +1241,9 @@ void ioapic_zap_locks(void) static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) { - int i; - char buf[256]; struct IO_APIC_route_entry entry; - struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry; + char buf[256]; + int i; printk(KERN_DEBUG "IOAPIC %d:\n", apic); for (i = 0; i <= nr_entries; i++) { @@ -1265,20 +1251,20 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) snprintf(buf, sizeof(buf), " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", i, - entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ", - entry.trigger == IOAPIC_LEVEL ? "level" : "edge ", - entry.polarity == IOAPIC_POL_LOW ? "low " : "high", + entry.masked ? "disabled" : "enabled ", + entry.is_level ? "level" : "edge ", + entry.active_low ? "low " : "high", entry.vector, entry.irr, entry.delivery_status); - if (ir_entry->format) + if (entry.ir_format) { printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", - buf, (ir_entry->index2 << 15) | ir_entry->index, - ir_entry->zero); - else - printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", buf, - entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ? - "logical " : "physical", - entry.dest, entry.delivery_mode); + (entry.ir_index_15 << 15) | entry.ir_index_0_14, + entry.ir_zero); + } else { + printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", buf, + entry.dest_mode_logical ? "logical " : "physical", + entry.destid_0_7, entry.delivery_mode); + } } } @@ -1403,8 +1389,8 @@ void __init enable_IO_APIC(void) /* If the interrupt line is enabled and in ExtInt mode * I have found the pin where the i8259 is connected. 
*/ - if ((entry.mask == 0) && - (entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT)) { + if (!entry.masked && + entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) { ioapic_i8259.apic = apic; ioapic_i8259.pin = pin; goto found_i8259; @@ -1448,12 +1434,12 @@ void native_restore_boot_irq_mode(void) struct IO_APIC_route_entry entry; memset(&entry, 0, sizeof(entry)); - entry.mask = IOAPIC_UNMASKED; - entry.trigger = IOAPIC_EDGE; - entry.polarity = IOAPIC_POL_HIGH; - entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; + entry.masked = false; + entry.is_level = false; + entry.active_low = false; + entry.dest_mode_logical = false; entry.delivery_mode = APIC_DELIVERY_MODE_EXTINT; - entry.dest = read_apic_id(); + entry.destid_0_7 = read_apic_id(); /* * Add it to the IO-APIC irq-routing table: @@ -1732,13 +1718,13 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data) raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, data->irq_2_pin) { - unsigned int reg; + struct IO_APIC_route_entry e; int pin; pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); + e.w1 = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ - if (reg & IO_APIC_REDIR_REMOTE_IRR) { + if (e.irr) { raw_spin_unlock_irqrestore(&ioapic_lock, flags); return true; } @@ -1897,7 +1883,7 @@ static void ioapic_configure_entry(struct irq_data *irqd) * ioapic chip to verify that. */ if (irqd->chip == &ioapic_chip) { - mpd->entry.dest = cfg->dest_apicid; + mpd->entry.destid_0_7 = cfg->dest_apicid; mpd->entry.vector = cfg->vector; } for_each_irq_pin(entry, mpd->irq_2_pin) @@ -1955,7 +1941,7 @@ static int ioapic_irq_get_chip_state(struct irq_data *irqd, * irrelevant because the IO-APIC treats them as fire and * forget. */ - if (rentry.irr && rentry.trigger) { + if (rentry.irr && rentry.is_level) { *state = true; break; } @@ -2080,12 +2066,12 @@ static inline void __init unlock_ExtINT_logic(void) memset(&entry1, 0, sizeof(entry1)); - entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; - entry1.mask = IOAPIC_UNMASKED; - entry1.dest = hard_smp_processor_id(); - entry1.delivery_mode = APIC_DELIVERY_MODE_EXTINT; - entry1.polarity = entry0.polarity; - entry1.trigger = IOAPIC_EDGE; + entry1.dest_mode_logical = true; + entry1.masked = false; + entry1.destid_0_7 = hard_smp_processor_id(); + entry1.delivery_mode = APIC_DELIVERY_MODE_EXTINT; + entry1.active_low = entry0.active_low; + entry1.is_level = false; entry1.vector = 0; ioapic_write_entry(apic, pin, entry1); @@ -2960,17 +2946,17 @@ static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, struct IO_APIC_route_entry *entry) { memset(entry, 0, sizeof(*entry)); - entry->delivery_mode = apic->delivery_mode; - entry->dest_mode = apic->dest_mode_logical; - entry->dest = cfg->dest_apicid; - entry->vector = cfg->vector; - entry->trigger = data->is_level; - entry->polarity = data->active_low; + entry->delivery_mode = apic->delivery_mode; + entry->dest_mode_logical = apic->dest_mode_logical; + entry->destid_0_7 = cfg->dest_apicid; + entry->vector = cfg->vector; + entry->is_level = data->is_level; + entry->active_low = data->active_low; /* * Mask level triggered irqs. Edge triggered irqs are masked * by the irq core code in case they fire. 
*/ - entry->mask = data->is_level; + entry->masked = data->is_level; } int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index c23d90aed128..d0e52f30e457 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3630,11 +3630,11 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, entry = info->ioapic.entry; info->ioapic.entry = NULL; memset(entry, 0, sizeof(*entry)); - entry->vector = index; - entry->trigger = info->ioapic.is_level; - entry->polarity = info->ioapic.active_low; + entry->vector = index; + entry->is_level = info->ioapic.is_level; + entry->active_low = info->ioapic.active_low; /* Mask level triggered irqs. */ - entry->mask = info->ioapic.is_level; + entry->masked = info->ioapic.is_level; break; case X86_IRQ_ALLOC_TYPE_HPET: diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index 51c83e2cfafc..7a5809656fff 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -52,7 +52,7 @@ static int hyperv_ir_set_affinity(struct irq_data *data, return ret; entry = data->chip_data; - entry->dest = cfg->dest_apicid; + entry->destid_0_7 = cfg->dest_apicid; entry->vector = cfg->vector; send_cleanup_vector(cfg); @@ -125,7 +125,7 @@ static int hyperv_irq_remapping_activate(struct irq_domain *domain, struct irq_cfg *cfg = irqd_cfg(irq_data); struct IO_APIC_route_entry *entry = irq_data->chip_data; - entry->dest = cfg->dest_apicid; + entry->destid_0_7 = cfg->dest_apicid; entry->vector = cfg->vector; return 0; diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 76b8d73a5b68..83bee6fc662f 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1279,8 +1279,8 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, struct irq_alloc_info *info, int index, int sub_handle) { - struct IR_IO_APIC_route_entry *entry; struct irte *irte = &data->irte_entry; + struct IO_APIC_route_entry *entry; prepare_irte(irte, irq_cfg->vector, irq_cfg->dest_apicid); switch (info->type) { @@ -1294,22 +1294,21 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, irte->avail, irte->vector, irte->dest_id, irte->sid, irte->sq, irte->svt); - entry = (struct IR_IO_APIC_route_entry *)info->ioapic.entry; + entry = info->ioapic.entry; info->ioapic.entry = NULL; memset(entry, 0, sizeof(*entry)); - entry->index2 = (index >> 15) & 0x1; - entry->zero = 0; - entry->format = 1; - entry->index = (index & 0x7fff); + entry->ir_index_15 = !!(index & 0x8000); + entry->ir_format = true; + entry->ir_index_0_14 = index & 0x7fff; /* * IO-APIC RTE will be configured with virtual vector. * irq handler will do the explicit EOI to the io-apic. */ - entry->vector = info->ioapic.pin; - entry->trigger = info->ioapic.is_level; - entry->polarity = info->ioapic.active_low; + entry->vector = info->ioapic.pin; + entry->is_level = info->ioapic.is_level; + entry->active_low = info->ioapic.active_low; /* Mask level triggered irqs. */ - entry->mask = info->ioapic.is_level; + entry->masked = info->ioapic.is_level; break; case X86_IRQ_ALLOC_TYPE_HPET: -- Gitee From c762f0dd34d728da44fc6b008b9c2c904e73d340 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:21 +0100 Subject: [PATCH 0363/2161] x86/ioapic: Generate RTE directly from parent irqchip's MSI message commit 5d5a97133887b2dfd8e2ad0347c3a02cc7aaa0cb upstream. 
The I/O-APIC generates an MSI cycle with address/data bits taken from its Redirection Table Entry in some combination which used to make sense, but now is just a bunch of bits which get passed through in some seemingly arbitrary order. Instead of making IRQ remapping drivers directly frob the I/OA-PIC RTE, let them just do their job and generate an MSI message. The bit swizzling to turn that MSI message into the I/O-APIC's RTE is the same in all cases, since it's a function of the I/O-APIC hardware. The IRQ remappers have no real need to get involved with that. The only slight caveat is that the I/OAPIC is interpreting some of those fields too, and it does want the 'vector' field to be unique to make EOI work. The AMD IOMMU happens to put its IRTE index in the bits that the I/O-APIC thinks are the vector field, and accommodates this requirement by reserving the first 32 indices for the I/O-APIC. The Intel IOMMU doesn't actually use the bits that the I/O-APIC thinks are the vector field, so it fills in the 'pin' value there instead. [ tglx: Replaced the unreadably macro maze with the cleaned up RTE/msi_msg bitfields and added commentry to explain the mapping magic ] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-22-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/hw_irq.h | 11 ++- arch/x86/kernel/apic/io_apic.c | 103 +++++++++++++++++++--------- drivers/iommu/amd_iommu.c | 12 ---- drivers/iommu/hyperv-iommu.c | 14 ---- drivers/iommu/intel/irq_remapping.c | 31 ++------- 5 files changed, 83 insertions(+), 88 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 3e6908b6baa4..e2fb493ae93e 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -67,12 +67,11 @@ enum irq_alloc_type { }; struct ioapic_alloc_info { - int pin; - int node; - u32 is_level : 1; - u32 active_low : 1; - u32 valid : 1; - struct IO_APIC_route_entry *entry; + int pin; + int node; + u32 is_level : 1; + u32 active_low : 1; + u32 valid : 1; }; struct uv_alloc_info { diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index cc58f0f93355..133398a06e78 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -48,6 +48,7 @@ #include /* time_after() */ #include #include +#include #include #include @@ -63,7 +64,6 @@ #include #include #include - #include #define for_each_ioapic(idx) \ @@ -1871,21 +1871,58 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data) eoi_ioapic_pin(data->entry.vector, data); } +/* + * The I/OAPIC is just a device for generating MSI messages from legacy + * interrupt pins. Various fields of the RTE translate into bits of the + * resulting MSI which had a historical meaning. + * + * With interrupt remapping, many of those bits have different meanings + * in the underlying MSI, but the way that the I/OAPIC transforms them + * from its RTE to the MSI message is the same. This function allows + * the parent IRQ domain to compose the MSI message, then takes the + * relevant bits to put them in the appropriate places in the RTE in + * order to generate that message when the IRQ happens. + * + * The setup here relies on a preconfigured route entry (is_level, + * active_low, masked) because the parent domain is merely composing the + * generic message routing information which is used for the MSI. 
+ */ +static void ioapic_setup_msg_from_msi(struct irq_data *irq_data, + struct IO_APIC_route_entry *entry) +{ + struct msi_msg msg; + + /* Let the parent domain compose the MSI message */ + irq_chip_compose_msi_msg(irq_data, &msg); + + /* + * - Real vector + * - DMAR/IR: 8bit subhandle (ioapic.pin) + * - AMD/IR: 8bit IRTE index + */ + entry->vector = msg.arch_data.vector; + /* Delivery mode (for DMAR/IR all 0) */ + entry->delivery_mode = msg.arch_data.delivery_mode; + /* Destination mode or DMAR/IR index bit 15 */ + entry->dest_mode_logical = msg.arch_addr_lo.dest_mode_logical; + /* DMAR/IR: 1, 0 for all other modes */ + entry->ir_format = msg.arch_addr_lo.dmar_format; + /* + * DMAR/IR: index bit 0-14. + * + * All other modes have bit 0-6 of dmar_index_0_14 cleared and the + * topmost 8 bits are destination id bit 0-7 (entry::destid_0_7). + */ + entry->ir_index_0_14 = msg.arch_addr_lo.dmar_index_0_14; +} + static void ioapic_configure_entry(struct irq_data *irqd) { struct mp_chip_data *mpd = irqd->chip_data; - struct irq_cfg *cfg = irqd_cfg(irqd); struct irq_pin_list *entry; - /* - * Only update when the parent is the vector domain, don't touch it - * if the parent is the remapping domain. Check the installed - * ioapic chip to verify that. - */ - if (irqd->chip == &ioapic_chip) { - mpd->entry.destid_0_7 = cfg->dest_apicid; - mpd->entry.vector = cfg->vector; - } + ioapic_setup_msg_from_msi(irqd, &mpd->entry); + for_each_irq_pin(entry, mpd->irq_2_pin) __ioapic_write_entry(entry->apic, entry->pin, mpd->entry); } @@ -2942,14 +2979,23 @@ static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, } } -static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, - struct IO_APIC_route_entry *entry) +/* + * Configure the I/O-APIC specific fields in the routing entry. + * + * This is important to setup the I/O-APIC specific bits (is_level, + * active_low, masked) because the underlying parent domain will only + * provide the routing information and is oblivious of the I/O-APIC + * specific bits. + * + * The entry is just preconfigured at this point and not written into the + * RTE. This happens later during activation which will fill in the actual + * routing information. 
+ */ +static void mp_preconfigure_entry(struct mp_chip_data *data) { + struct IO_APIC_route_entry *entry = &data->entry; + memset(entry, 0, sizeof(*entry)); - entry->delivery_mode = apic->delivery_mode; - entry->dest_mode_logical = apic->dest_mode_logical; - entry->destid_0_7 = cfg->dest_apicid; - entry->vector = cfg->vector; entry->is_level = data->is_level; entry->active_low = data->active_low; /* @@ -2962,11 +3008,10 @@ static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { - int ret, ioapic, pin; - struct irq_cfg *cfg; - struct irq_data *irq_data; - struct mp_chip_data *data; struct irq_alloc_info *info = arg; + struct mp_chip_data *data; + struct irq_data *irq_data; + int ret, ioapic, pin; unsigned long flags; if (!info || nr_irqs > 1) @@ -2984,7 +3029,6 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, if (!data) return -ENOMEM; - info->ioapic.entry = &data->entry; ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); if (ret < 0) { kfree(data); @@ -2998,23 +3042,20 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, irq_data->chip_data = data; mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); - cfg = irqd_cfg(irq_data); add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); - local_irq_save(flags); - if (info->ioapic.entry) - mp_setup_entry(cfg, data, info->ioapic.entry); + mp_preconfigure_entry(data); mp_register_handler(virq, data->is_level); + + local_irq_save(flags); if (virq < nr_legacy_irqs()) legacy_pic->mask(virq); local_irq_restore(flags); apic_printk(APIC_VERBOSE, KERN_DEBUG - "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n", - ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector, - virq, data->is_level, data->active_low, - cfg->dest_apicid); - + "IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n", + ioapic, mpc_ioapic_id(ioapic), pin, virq, + data->is_level, data->active_low); return 0; } diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index d0e52f30e457..7b4643b8b983 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3612,7 +3612,6 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, int devid, int index, int sub_handle) { struct irq_2_irte *irte_info = &data->irq_2_irte; - struct IO_APIC_route_entry *entry; struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; if (!iommu) @@ -3626,17 +3625,6 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC: - /* Setup IOAPIC entry */ - entry = info->ioapic.entry; - info->ioapic.entry = NULL; - memset(entry, 0, sizeof(*entry)); - entry->vector = index; - entry->is_level = info->ioapic.is_level; - entry->active_low = info->ioapic.active_low; - /* Mask level triggered irqs. */ - entry->masked = info->ioapic.is_level; - break; - case X86_IRQ_ALLOC_TYPE_HPET: case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index 7a5809656fff..b3299f2a00cb 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -89,20 +89,6 @@ static int hyperv_irq_remapping_alloc(struct irq_domain *domain, irq_data->chip = &hyperv_ir_chip; - /* - * If there is interrupt remapping function of IOMMU, setting irq - * affinity only needs to change IRTE of IOMMU. 
But Hyper-V doesn't - * support interrupt remapping function, setting irq affinity of IO-APIC - * interrupts still needs to change IO-APIC registers. But ioapic_ - * configure_entry() will ignore value of cfg->vector and cfg-> - * dest_apicid when IO-APIC's parent irq domain is not the vector - * domain.(See ioapic_configure_entry()) In order to setting vector - * and dest_apicid to IO-APIC register, IO-APIC entry pointer is saved - * in the chip_data and hyperv_irq_remapping_activate()/hyperv_ir_set_ - * affinity() set vector and dest_apicid directly into IO-APIC entry. - */ - irq_data->chip_data = info->ioapic.entry; - /* * Hypver-V IO APIC irq affinity should be in the scope of * ioapic_max_cpumask because no irq remapping support. diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 83bee6fc662f..891ed5fb7223 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1280,9 +1280,9 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, int index, int sub_handle) { struct irte *irte = &data->irte_entry; - struct IO_APIC_route_entry *entry; prepare_irte(irte, irq_cfg->vector, irq_cfg->dest_apicid); + switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC: /* Set source-id of interrupt request */ @@ -1293,39 +1293,20 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, irte->trigger_mode, irte->dlvry_mode, irte->avail, irte->vector, irte->dest_id, irte->sid, irte->sq, irte->svt); - - entry = info->ioapic.entry; - info->ioapic.entry = NULL; - memset(entry, 0, sizeof(*entry)); - entry->ir_index_15 = !!(index & 0x8000); - entry->ir_format = true; - entry->ir_index_0_14 = index & 0x7fff; - /* - * IO-APIC RTE will be configured with virtual vector. - * irq handler will do the explicit EOI to the io-apic. - */ - entry->vector = info->ioapic.pin; - entry->is_level = info->ioapic.is_level; - entry->active_low = info->ioapic.active_low; - /* Mask level triggered irqs. */ - entry->masked = info->ioapic.is_level; + sub_handle = info->ioapic.pin; break; - case X86_IRQ_ALLOC_TYPE_HPET: + set_hpet_sid(irte, info->devid); + break; case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: - if (info->type == X86_IRQ_ALLOC_TYPE_HPET) - set_hpet_sid(irte, info->devid); - else - set_msi_sid(irte, msi_desc_to_pci_dev(info->desc)); - - fill_msi_msg(&data->msi_entry, index, sub_handle); + set_msi_sid(irte, msi_desc_to_pci_dev(info->desc)); break; - default: BUG_ON(1); break; } + fill_msi_msg(&data->msi_entry, index, sub_handle); } static void intel_free_irq_resources(struct irq_domain *domain, -- Gitee From e0bf167aeeb000064bb63cf736458aaf643e2382 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:24 +0100 Subject: [PATCH 0364/2161] iommu/amd: Implement select() method on remapping irqdomain commit a1a785b572425ab3ca5494a4be02ab59a796df51 upstream. Preparatory change to remove irq_remapping_get_irq_domain(). 
Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-25-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 7b4643b8b983..927c38d3a051 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3825,7 +3825,26 @@ static void irq_remapping_deactivate(struct irq_domain *domain, irte_info->index); } +static int irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token) +{ + struct amd_iommu *iommu; + int devid = -1; + + if (x86_fwspec_is_ioapic(fwspec)) + devid = get_ioapic_devid(fwspec->param[0]); + else if (x86_fwspec_is_hpet(fwspec)) + devid = get_hpet_devid(fwspec->param[0]); + + if (devid < 0) + return 0; + + iommu = amd_iommu_rlookup_table[devid]; + return iommu && iommu->ir_domain == d; +} + static const struct irq_domain_ops amd_ir_domain_ops = { + .select = irq_remapping_select, .alloc = irq_remapping_alloc, .free = irq_remapping_free, .activate = irq_remapping_activate, -- Gitee From 34391b140861059279993f79f749ce9a1da571f4 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:25 +0100 Subject: [PATCH 0365/2161] iommu/vt-d: Implement select() method on remapping irqdomain commit a87fb465ffe8eacd0d69032da33455e4f6fd8b41 upstream. Preparatory for removing irq_remapping_get_irq_domain() Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-26-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/irq_remapping.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 891ed5fb7223..b365ed489139 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1433,7 +1433,20 @@ static void intel_irq_remapping_deactivate(struct irq_domain *domain, modify_irte(&data->irq_2_iommu, &entry); } +static int intel_irq_remapping_select(struct irq_domain *d, + struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token) +{ + if (x86_fwspec_is_ioapic(fwspec)) + return d == map_ioapic_to_ir(fwspec->param[0]); + else if (x86_fwspec_is_hpet(fwspec)) + return d == map_hpet_to_ir(fwspec->param[0]); + + return 0; +} + static const struct irq_domain_ops intel_ir_domain_ops = { + .select = intel_irq_remapping_select, .alloc = intel_irq_remapping_alloc, .free = intel_irq_remapping_free, .activate = intel_irq_remapping_activate, -- Gitee From 5a5d1a2f95814356e1654f9b8115418508f40961 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:29 +0100 Subject: [PATCH 0366/2161] x86: Kill all traces of irq_remapping_get_irq_domain() commit ed381fca47122f0787ee53b97e5f9d562eec7237 upstream. All users are converted to use the fwspec based parent domain lookup. 
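To make the conversion concrete, the I/O-APIC parent lookup changes shape roughly as sketched below. Illustrative only: ioapic_parent_domain() is a made-up wrapper, and the real call-site conversion is the mp_irqdomain_create() hunk in the following x86/ioapic patch.

static struct irq_domain *ioapic_parent_domain(int ioapic,
					       struct fwnode_handle *fn)
{
	/*
	 * Old way (removed here): a type-based query into the remap core,
	 * falling back to the vector domain.
	 *
	 *	struct irq_alloc_info info;
	 *
	 *	init_irq_alloc_info(&info, NULL);
	 *	info.type  = X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT;
	 *	info.devid = mpc_ioapic_id(ioapic);
	 *	return irq_remapping_get_irq_domain(&info) ?: x86_vector_domain;
	 */

	/* New way: describe the I/O-APIC as a fwspec and let each candidate
	 * domain's ->select() decide whether it serves this I/O-APIC.
	 */
	struct irq_fwspec fwspec = {
		.fwnode      = fn,
		.param_count = 1,
		.param       = { ioapic },
	};

	return irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY);
}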
Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-30-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/hw_irq.h | 2 -- arch/x86/include/asm/irq_remapping.h | 9 -------- drivers/iommu/amd_iommu.c | 34 ---------------------------- drivers/iommu/hyperv-iommu.c | 9 -------- drivers/iommu/intel/irq_remapping.c | 17 -------------- drivers/iommu/irq_remapping.c | 14 ------------ drivers/iommu/irq_remapping.h | 3 --- 7 files changed, 88 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index e2fb493ae93e..6e1194387b18 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -62,8 +62,6 @@ enum irq_alloc_type { X86_IRQ_ALLOC_TYPE_PCI_MSIX, X86_IRQ_ALLOC_TYPE_DMAR, X86_IRQ_ALLOC_TYPE_UV, - X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT, - X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT, }; struct ioapic_alloc_info { diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index af4a151d70b3..7cc49432187f 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -44,9 +44,6 @@ extern int irq_remapping_reenable(int); extern int irq_remap_enable_fault_handling(void); extern void panic_if_irq_remap(const char *msg); -extern struct irq_domain * -irq_remapping_get_irq_domain(struct irq_alloc_info *info); - /* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */ extern struct irq_domain * arch_create_remap_msi_irq_domain(struct irq_domain *par, const char *n, int id); @@ -71,11 +68,5 @@ static inline void panic_if_irq_remap(const char *msg) { } -static inline struct irq_domain * -irq_remapping_get_irq_domain(struct irq_alloc_info *info) -{ - return NULL; -} - #endif /* CONFIG_IRQ_REMAP */ #endif /* __X86_IRQ_REMAPPING_H */ diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 927c38d3a051..ba22f4fbeae6 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3544,10 +3544,8 @@ static int get_devid(struct irq_alloc_info *info) { switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC: - case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT: return get_ioapic_devid(info->devid); case X86_IRQ_ALLOC_TYPE_HPET: - case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT: return get_hpet_devid(info->devid); case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: @@ -3558,44 +3556,12 @@ static int get_devid(struct irq_alloc_info *info) } } -static struct irq_domain *get_irq_domain_for_devid(struct irq_alloc_info *info, - int devid) -{ - struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; - - if (!iommu) - return NULL; - - switch (info->type) { - case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT: - case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT: - return iommu->ir_domain; - default: - WARN_ON_ONCE(1); - return NULL; - } -} - -static struct irq_domain *get_irq_domain(struct irq_alloc_info *info) -{ - int devid; - - if (!info) - return NULL; - - devid = get_devid(info); - if (devid < 0) - return NULL; - return get_irq_domain_for_devid(info, devid); -} - struct irq_remap_ops amd_iommu_irq_ops = { .prepare = amd_iommu_prepare, .enable = amd_iommu_enable, .disable = amd_iommu_disable, .reenable = amd_iommu_reenable, .enable_faulting = amd_iommu_enable_faulting, - .get_irq_domain = get_irq_domain, }; static void fill_msi_msg(struct msi_msg *msg, u32 index) diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index b3299f2a00cb..2f0be3f8119f 100644 --- 
a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -168,18 +168,9 @@ static int __init hyperv_enable_irq_remapping(void) return IRQ_REMAP_X2APIC_MODE; } -static struct irq_domain *hyperv_get_irq_domain(struct irq_alloc_info *info) -{ - if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT) - return ioapic_ir_domain; - else - return NULL; -} - struct irq_remap_ops hyperv_irq_remap_ops = { .prepare = hyperv_prepare_irq_remapping, .enable = hyperv_enable_irq_remapping, - .get_irq_domain = hyperv_get_irq_domain, }; #endif diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index b365ed489139..86862f83a3fb 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1127,29 +1127,12 @@ static void prepare_irte(struct irte *irte, int vector, unsigned int dest) irte->redir_hint = 1; } -static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info) -{ - if (!info) - return NULL; - - switch (info->type) { - case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT: - return map_ioapic_to_ir(info->devid); - case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT: - return map_hpet_to_ir(info->devid); - default: - WARN_ON_ONCE(1); - return NULL; - } -} - struct irq_remap_ops intel_irq_remap_ops = { .prepare = intel_prepare_irq_remapping, .enable = intel_enable_irq_remapping, .disable = disable_irq_remapping, .reenable = reenable_irq_remapping, .enable_faulting = enable_drhd_fault_handling, - .get_irq_domain = intel_get_irq_domain, }; static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force) diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 2d84b1ed205e..83314b9d8f38 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -158,17 +158,3 @@ void panic_if_irq_remap(const char *msg) if (irq_remapping_enabled) panic(msg); } - -/** - * irq_remapping_get_irq_domain - Get the irqdomain serving the request @info - * @info: interrupt allocation information, used to identify the IOMMU device - * - * Returns pointer to IRQ domain, or NULL on failure. - */ -struct irq_domain *irq_remapping_get_irq_domain(struct irq_alloc_info *info) -{ - if (!remap_ops || !remap_ops->get_irq_domain) - return NULL; - - return remap_ops->get_irq_domain(info); -} diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h index 1661b3d75920..8c89cb947cdb 100644 --- a/drivers/iommu/irq_remapping.h +++ b/drivers/iommu/irq_remapping.h @@ -42,9 +42,6 @@ struct irq_remap_ops { /* Enable fault handling */ int (*enable_faulting)(void); - - /* Get the irqdomain associated to IOMMU device */ - struct irq_domain *(*get_irq_domain)(struct irq_alloc_info *); }; extern struct irq_remap_ops intel_irq_remap_ops; -- Gitee From 5f712faf9a51b737658ea2da7f89b01a9d1ac299 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:28 +0100 Subject: [PATCH 0367/2161] x86/ioapic: Use irq_find_matching_fwspec() to find remapping irqdomain commit b643128b917ca8f1c8b1e14af64ebdc81147b2d1 upstream. All possible parent domains have a select method now. Make use of it. 
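How the core "makes use of it": irq_find_matching_fwspec() walks the registered domains and returns the first one that claims the fwspec, and ->select() is what lets a remapping domain claim the I/O-APICs behind its IOMMU. Heavily simplified paraphrase (locking and the OF/match fallbacks omitted; not the literal kernel/irq/irqdomain.c code):

static struct irq_domain *find_parent(struct irq_fwspec *fwspec,
				      enum irq_domain_bus_token token)
{
	struct irq_domain *h;

	list_for_each_entry(h, &irq_domain_list, link) {
		int rc;

		if (h->ops->select && fwspec->param_count)
			rc = h->ops->select(h, fwspec, token);
		else
			rc = fwspec->fwnode && h->fwnode == fwspec->fwnode;

		if (rc)
			return h;
	}

	/* No parent: mp_irqdomain_create() below now fails with -ENODEV. */
	return NULL;
}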
Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-29-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/apic/io_apic.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 133398a06e78..e7d6a83037d3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2343,36 +2343,37 @@ static inline void __init check_timer(void) static int mp_irqdomain_create(int ioapic) { - struct irq_alloc_info info; struct irq_domain *parent; int hwirqs = mp_ioapic_pin_count(ioapic); struct ioapic *ip = &ioapics[ioapic]; struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); struct fwnode_handle *fn; - char *name = "IO-APIC"; + struct irq_fwspec fwspec; if (cfg->type == IOAPIC_DOMAIN_INVALID) return 0; - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT; - info.devid = mpc_ioapic_id(ioapic); - parent = irq_remapping_get_irq_domain(&info); - if (!parent) - parent = x86_vector_domain; - else - name = "IO-APIC-IR"; - /* Handle device tree enumerated APICs proper */ if (cfg->dev) { fn = of_node_to_fwnode(cfg->dev); } else { - fn = irq_domain_alloc_named_id_fwnode(name, ioapic); + fn = irq_domain_alloc_named_id_fwnode("IO-APIC", ioapic); if (!fn) return -ENOMEM; } + fwspec.fwnode = fn; + fwspec.param_count = 1; + fwspec.param[0] = ioapic; + + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + if (!parent) { + if (!cfg->dev) + irq_domain_free_fwnode(fn); + return -ENODEV; + } + ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops, (void *)(long)ioapic); -- Gitee From 1e328a44894a6f4eef3f0cf94dac7c034c8997cd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:48 +0200 Subject: [PATCH 0368/2161] genirq: Provide IRQCHIP_AFFINITY_PRE_STARTUP commit 826da771291fc25a428e871f9e7fb465e390f852 upstream. X86 IO/APIC and MSI interrupts (when used without interrupts remapping) require that the affinity setup on startup is done before the interrupt is enabled for the first time as the non-remapped operation mode cannot safely migrate enabled interrupts from arbitrary contexts. Provide a new irq chip flag which allows affected hardware to request this. This has to be opt-in because there have been reports in the past that some interrupt chips cannot handle affinity setting before startup. 
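Opting in is a one-line change in the affected irq_chip. A hypothetical example (the real users are the x86 MSI chips converted in the next patch); with the flag set, irq_startup() runs irq_setup_affinity() before __irq_startup(), otherwise the old ordering is kept:

static struct irq_chip example_msi_chip = {
	.name		= "example-MSI",
	.irq_mask	= irq_chip_mask_parent,
	.irq_unmask	= irq_chip_unmask_parent,
	.irq_ack	= irq_chip_ack_parent,
	/* Request default affinity setup before the first startup. */
	.flags		= IRQCHIP_SKIP_SET_WAKE |
			  IRQCHIP_AFFINITY_PRE_STARTUP,
};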
Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.779791738@linutronix.de Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- include/linux/irq.h | 2 ++ kernel/irq/chip.c | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 3d7b04d5264c..dd6653005ea2 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -555,6 +555,7 @@ struct irq_chip { * IRQCHIP_EOI_THREADED: Chip requires eoi() on unmask in threaded mode * IRQCHIP_SUPPORTS_LEVEL_MSI Chip can provide two doorbells for Level MSIs * IRQCHIP_SUPPORTS_NMI: Chip can deliver NMIs, only for root irqchips + * IRQCHIP_AFFINITY_PRE_STARTUP: Default affinity update before startup */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), @@ -566,6 +567,7 @@ enum { IRQCHIP_EOI_THREADED = (1 << 6), IRQCHIP_SUPPORTS_LEVEL_MSI = (1 << 7), IRQCHIP_SUPPORTS_NMI = (1 << 8), + IRQCHIP_AFFINITY_PRE_STARTUP = (1 << 10), }; #include diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c36a17b08dc8..bcb736017f57 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -265,8 +265,11 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) } else { switch (__irq_startup_managed(desc, aff, force)) { case IRQ_STARTUP_NORMAL: + if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP) + irq_setup_affinity(desc); ret = __irq_startup(desc); - irq_setup_affinity(desc); + if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)) + irq_setup_affinity(desc); break; case IRQ_STARTUP_MANAGED: irq_do_set_affinity(d, aff, false); -- Gitee From 52b7ecce957553b02b4ddae8ff91246ae98458bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:50 +0200 Subject: [PATCH 0369/2161] x86/msi: Force affinity setup before startup commit ff363f480e5997051dd1de949121ffda3b753741 upstream. The X86 MSI mechanism cannot handle interrupt affinity changes safely after startup other than from an interrupt handler, unless interrupt remapping is enabled. The startup sequence in the generic interrupt code violates that assumption. Mark the irq chips with the new IRQCHIP_AFFINITY_PRE_STARTUP flag so that the default interrupt setting happens before the interrupt is started up for the first time. While the interrupt remapping MSI chip does not require this, there is no point in treating it differently as this might spare an interrupt to a CPU which is not in the default affinity mask. For the non-remapping case go to the direct write path when the interrupt is not yet started similar to the not yet activated case. Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.886722080@linutronix.de Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- arch/x86/kernel/apic/msi.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index f3bd575a1571..96882c26aa24 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -58,11 +58,13 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) * The quirk bit is not set in this case. 
* - The new vector is the same as the old vector * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up) + * - The interrupt is not yet started up * - The new destination CPU is the same as the old destination CPU */ if (!irqd_msi_nomask_quirk(irqd) || cfg->vector == old_cfg.vector || old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR || + !irqd_is_started(irqd) || cfg->dest_apicid == old_cfg.dest_apicid) { irq_msi_update_msg(irqd, cfg); return ret; @@ -149,7 +151,8 @@ static struct irq_chip pci_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_set_affinity = msi_set_affinity, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) @@ -249,7 +252,8 @@ static struct irq_chip pci_msi_ir_controller = { .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_set_vcpu_affinity = irq_chip_set_vcpu_affinity_parent, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; static struct msi_domain_info pci_msi_ir_domain_info = { @@ -303,7 +307,8 @@ static struct irq_chip dmar_msi_controller = { .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = dmar_msi_compose_msg, .irq_write_msi_msg = dmar_msi_write_msg, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info, @@ -400,7 +405,7 @@ static struct irq_chip hpet_msi_controller __ro_after_init = { .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_write_msi_msg = hpet_msi_write_msg, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_AFFINITY_PRE_STARTUP, }; static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info, -- Gitee From b8c53afd724fff288fa1b5de1b0b6284cf83042a Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:10 +0100 Subject: [PATCH 0370/2161] x86/hpet: Move MSI support into hpet.c commit 3d7295eb3003aea9f89de35304b3a88ae4d5036b upstream. This isn't really dependent on PCI MSI; it's just generic MSI which is now supported by the generic x86_vector_domain. Move the HPET MSI support back into hpet.c with the rest of the HPET support. 
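The moved code follows the generic, non-PCI MSI pattern: an irq_chip that writes the message, msi_domain_ops/msi_domain_info, and msi_create_irq_domain() on top of whichever parent domain is found, now guarded by CONFIG_GENERIC_MSI_IRQ instead of CONFIG_PCI_MSI. A condensed sketch of that skeleton (example_* names are placeholders; the full HPET version is in the hunks below):

#ifdef CONFIG_GENERIC_MSI_IRQ		/* was CONFIG_PCI_MSI */
static struct irq_domain *
example_create_msi_domain(struct msi_domain_info *info,
			  struct irq_domain *parent)
{
	struct fwnode_handle *fn;
	struct irq_domain *d;

	fn = irq_domain_alloc_named_id_fwnode("example-MSI", 0);
	if (!fn)
		return NULL;

	/* Same call a PCI-less MSI user such as the HPET ends up making. */
	d = msi_create_irq_domain(fn, info, parent);
	if (!d)
		irq_domain_free_fwnode(fn);

	return d;
}
#endif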
Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-11-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/hpet.h | 11 ---- arch/x86/kernel/apic/msi.c | 117 ----------------------------------- arch/x86/kernel/hpet.c | 118 ++++++++++++++++++++++++++++++++++-- 3 files changed, 112 insertions(+), 134 deletions(-) diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 6352dee37cda..ab9f3dd87c80 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -74,17 +74,6 @@ extern void hpet_disable(void); extern unsigned int hpet_readl(unsigned int a); extern void force_hpet_resume(void); -struct irq_data; -struct hpet_channel; -struct irq_domain; - -extern void hpet_msi_unmask(struct irq_data *data); -extern void hpet_msi_mask(struct irq_data *data); -extern void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg); -extern struct irq_domain *hpet_create_irq_domain(int hpet_id); -extern int hpet_assign_irq(struct irq_domain *domain, - struct hpet_channel *hc, int dev_num); - #ifdef CONFIG_HPET_EMULATE_RTC #include diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 96882c26aa24..ba9446de95ef 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -380,120 +380,3 @@ void dmar_free_hwirq(int irq) irq_domain_free_irqs(irq, 1); } #endif - -/* - * MSI message composition - */ -#ifdef CONFIG_HPET_TIMER -static inline int hpet_dev_id(struct irq_domain *domain) -{ - struct msi_domain_info *info = msi_get_domain_info(domain); - - return (int)(long)info->data; -} - -static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) -{ - hpet_msi_write(irq_data_get_irq_handler_data(data), msg); -} - -static struct irq_chip hpet_msi_controller __ro_after_init = { - .name = "HPET-MSI", - .irq_unmask = hpet_msi_unmask, - .irq_mask = hpet_msi_mask, - .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = msi_domain_set_affinity, - .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_write_msi_msg = hpet_msi_write_msg, - .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_AFFINITY_PRE_STARTUP, -}; - -static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) -{ - return arg->hwirq; -} - -static int hpet_msi_init(struct irq_domain *domain, - struct msi_domain_info *info, unsigned int virq, - irq_hw_number_t hwirq, msi_alloc_info_t *arg) -{ - irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); - irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL, - handle_edge_irq, arg->data, "edge"); - - return 0; -} - -static void hpet_msi_free(struct irq_domain *domain, - struct msi_domain_info *info, unsigned int virq) -{ - irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); -} - -static struct msi_domain_ops hpet_msi_domain_ops = { - .get_hwirq = hpet_msi_get_hwirq, - .msi_init = hpet_msi_init, - .msi_free = hpet_msi_free, -}; - -static struct msi_domain_info hpet_msi_domain_info = { - .ops = &hpet_msi_domain_ops, - .chip = &hpet_msi_controller, -}; - -struct irq_domain *hpet_create_irq_domain(int hpet_id) -{ - struct msi_domain_info *domain_info; - struct irq_domain *parent, *d; - struct irq_alloc_info info; - struct fwnode_handle *fn; - - if (x86_vector_domain == NULL) - return NULL; - - domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); - if (!domain_info) - return NULL; - - *domain_info = hpet_msi_domain_info; - domain_info->data = (void *)(long)hpet_id; - - init_irq_alloc_info(&info, 
NULL); - info.type = X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT; - info.devid = hpet_id; - parent = irq_remapping_get_irq_domain(&info); - if (parent == NULL) - parent = x86_vector_domain; - else - hpet_msi_controller.name = "IR-HPET-MSI"; - - fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, - hpet_id); - if (!fn) { - kfree(domain_info); - return NULL; - } - - d = msi_create_irq_domain(fn, domain_info, parent); - if (!d) { - irq_domain_free_fwnode(fn); - kfree(domain_info); - } - return d; -} - -int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, - int dev_num) -{ - struct irq_alloc_info info; - - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_HPET; - info.data = hc; - info.devid = hpet_dev_id(domain); - info.hwirq = dev_num; - - return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); -} -#endif diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index c6f791bc481e..594077e76a1e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -50,7 +51,7 @@ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ bool hpet_msi_disable; -#ifdef CONFIG_PCI_MSI +#ifdef CONFIG_GENERIC_MSI_IRQ static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel); static struct irq_domain *hpet_domain; #endif @@ -467,9 +468,8 @@ static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc) /* * HPET MSI Support */ -#ifdef CONFIG_PCI_MSI - -void hpet_msi_unmask(struct irq_data *data) +#ifdef CONFIG_GENERIC_MSI_IRQ +static void hpet_msi_unmask(struct irq_data *data) { struct hpet_channel *hc = irq_data_get_irq_handler_data(data); unsigned int cfg; @@ -479,7 +479,7 @@ void hpet_msi_unmask(struct irq_data *data) hpet_writel(cfg, HPET_Tn_CFG(hc->num)); } -void hpet_msi_mask(struct irq_data *data) +static void hpet_msi_mask(struct irq_data *data) { struct hpet_channel *hc = irq_data_get_irq_handler_data(data); unsigned int cfg; @@ -489,12 +489,118 @@ void hpet_msi_mask(struct irq_data *data) hpet_writel(cfg, HPET_Tn_CFG(hc->num)); } -void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg) +static void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg) { hpet_writel(msg->data, HPET_Tn_ROUTE(hc->num)); hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hc->num) + 4); } +static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + hpet_msi_write(irq_data_get_irq_handler_data(data), msg); +} + +static struct irq_chip hpet_msi_controller __ro_after_init = { + .name = "HPET-MSI", + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_write_msi_msg = hpet_msi_write_msg, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static int hpet_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) +{ + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL, + handle_edge_irq, arg->data, "edge"); + + return 0; +} + +static void hpet_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq) +{ + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); +} + +static struct msi_domain_ops hpet_msi_domain_ops = { + .msi_init = hpet_msi_init, + .msi_free = hpet_msi_free, +}; + +static struct msi_domain_info hpet_msi_domain_info = { + .ops = &hpet_msi_domain_ops, + 
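The fwspec-based lookup that the hpet.c hunk below open-codes reduces to the following pattern; the helper name here is hypothetical and the fwnode handle is assumed to have been allocated already.

static struct irq_domain *example_find_parent(struct fwnode_handle *fn, u32 id)
{
	struct irq_fwspec fwspec = {
		.fwnode      = fn,
		.param_count = 1,
		.param       = { id },
	};

	/* Each registered parent domain's .select() method may claim it. */
	return irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY);
}

If no parent claims the fwspec, the caller has to bail out, exactly as the new hunk does when irq_find_matching_fwspec() returns NULL.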
.chip = &hpet_msi_controller, + .flags = MSI_FLAG_USE_DEF_DOM_OPS, +}; + +static struct irq_domain *hpet_create_irq_domain(int hpet_id) +{ + struct msi_domain_info *domain_info; + struct irq_domain *parent, *d; + struct irq_alloc_info info; + struct fwnode_handle *fn; + + if (x86_vector_domain == NULL) + return NULL; + + domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); + if (!domain_info) + return NULL; + + *domain_info = hpet_msi_domain_info; + domain_info->data = (void *)(long)hpet_id; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT; + info.devid = hpet_id; + parent = irq_remapping_get_irq_domain(&info); + if (parent == NULL) + parent = x86_vector_domain; + else + hpet_msi_controller.name = "IR-HPET-MSI"; + + fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, + hpet_id); + if (!fn) { + kfree(domain_info); + return NULL; + } + + d = msi_create_irq_domain(fn, domain_info, parent); + if (!d) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + } + return d; +} + +static inline int hpet_dev_id(struct irq_domain *domain) +{ + struct msi_domain_info *info = msi_get_domain_info(domain); + + return (int)(long)info->data; +} + +static int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, + int dev_num) +{ + struct irq_alloc_info info; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.data = hc; + info.devid = hpet_dev_id(domain); + info.hwirq = dev_num; + + return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); +} + static int hpet_clkevt_msi_resume(struct clock_event_device *evt) { struct hpet_channel *hc = clockevent_to_channel(evt); -- Gitee From bf587b9d38499df4576de5a9407632726fefa34e Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:27 +0100 Subject: [PATCH 0371/2161] x86/hpet: Use irq_find_matching_fwspec() to find remapping irqdomain commit c2a5881c28e5bb4cb901029423a1c7068c0afa2d upstream. All possible parent domains have a select method now. Make use of it. 
Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-28-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/hpet.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 594077e76a1e..f6a782c73225 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -543,8 +543,8 @@ static struct irq_domain *hpet_create_irq_domain(int hpet_id) { struct msi_domain_info *domain_info; struct irq_domain *parent, *d; - struct irq_alloc_info info; struct fwnode_handle *fn; + struct irq_fwspec fwspec; if (x86_vector_domain == NULL) return NULL; @@ -556,15 +556,6 @@ static struct irq_domain *hpet_create_irq_domain(int hpet_id) *domain_info = hpet_msi_domain_info; domain_info->data = (void *)(long)hpet_id; - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT; - info.devid = hpet_id; - parent = irq_remapping_get_irq_domain(&info); - if (parent == NULL) - parent = x86_vector_domain; - else - hpet_msi_controller.name = "IR-HPET-MSI"; - fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, hpet_id); if (!fn) { @@ -572,6 +563,19 @@ static struct irq_domain *hpet_create_irq_domain(int hpet_id) return NULL; } + fwspec.fwnode = fn; + fwspec.param_count = 1; + fwspec.param[0] = hpet_id; + + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + if (!parent) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + return NULL; + } + if (parent != x86_vector_domain) + hpet_msi_controller.name = "IR-HPET-MSI"; + d = msi_create_irq_domain(fn, domain_info, parent); if (!d) { irq_domain_free_fwnode(fn); -- Gitee From 6c3d635a0ebb206257c2de112c18b3e058846d20 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 3 Oct 2019 15:32:12 +0300 Subject: [PATCH 0372/2161] device property: Add fwnode_get_name for returning the name of a node commit bc0500c1e43d95cca5352d2345fb0769f314ba22 upstream. The fwnode framework did not have means to obtain the name of a node. Add that now, in form of the fwnode_get_name() function and a corresponding get_name fwnode op. OF and ACPI support is included. Signed-off-by: Sakari Ailus Acked-by: Rob Herring (for OF) Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/property.c | 26 ++++++++++++++++++++++++++ drivers/base/property.c | 11 +++++++++++ drivers/base/swnode.c | 12 ++++++++++++ drivers/of/property.c | 6 ++++++ include/linux/fwnode.h | 2 ++ include/linux/property.h | 1 + 6 files changed, 58 insertions(+) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index a08e3eb2a6f9..d48684090be9 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -1339,6 +1339,31 @@ acpi_fwnode_get_reference_args(const struct fwnode_handle *fwnode, args_count, args); } +static const char *acpi_fwnode_get_name(const struct fwnode_handle *fwnode) +{ + const struct acpi_device *adev; + struct fwnode_handle *parent; + + /* Is this the root node? 
*/ + parent = fwnode_get_parent(fwnode); + if (!parent) + return "\\"; + + fwnode_handle_put(parent); + + if (is_acpi_data_node(fwnode)) { + const struct acpi_data_node *dn = to_acpi_data_node(fwnode); + + return dn->name; + } + + adev = to_acpi_device_node(fwnode); + if (WARN_ON(!adev)) + return NULL; + + return acpi_device_bid(adev); +} + static struct fwnode_handle * acpi_fwnode_get_parent(struct fwnode_handle *fwnode) { @@ -1379,6 +1404,7 @@ acpi_fwnode_device_get_match_data(const struct fwnode_handle *fwnode, .get_parent = acpi_node_get_parent, \ .get_next_child_node = acpi_get_next_subnode, \ .get_named_child_node = acpi_fwnode_get_named_child_node, \ + .get_name = acpi_fwnode_get_name, \ .get_reference_args = acpi_fwnode_get_reference_args, \ .graph_get_next_endpoint = \ acpi_graph_get_next_endpoint, \ diff --git a/drivers/base/property.c b/drivers/base/property.c index 81bd01ed4042..e3636fd6c3bd 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -577,6 +577,17 @@ struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode) } EXPORT_SYMBOL_GPL(fwnode_get_next_parent); +/** + * fwnode_get_name - Return the name of a node + * @fwnode: The firmware node + * + * Returns a pointer to the node name. + */ +const char *fwnode_get_name(const struct fwnode_handle *fwnode) +{ + return fwnode_call_ptr_op(fwnode, get_name); +} + /** * fwnode_get_parent - Return parent firwmare node * @fwnode: Firmware whose parent is retrieved diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index 4c3b9813b284..474fc67d584e 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -515,6 +515,17 @@ static int software_node_read_string_array(const struct fwnode_handle *fwnode, propname, val, nval); } +static const char * +software_node_get_name(const struct fwnode_handle *fwnode) +{ + const struct swnode *swnode = to_swnode(fwnode); + + if (!swnode) + return "(null)"; + + return kobject_name(&swnode->kobj); +} + static struct fwnode_handle * software_node_get_parent(const struct fwnode_handle *fwnode) { @@ -619,6 +630,7 @@ static const struct fwnode_operations software_node_ops = { .property_present = software_node_property_present, .property_read_int_array = software_node_read_int_array, .property_read_string_array = software_node_read_string_array, + .get_name = software_node_get_name, .get_parent = software_node_get_parent, .get_next_child_node = software_node_get_next_child, .get_named_child_node = software_node_get_named_child_node, diff --git a/drivers/of/property.c b/drivers/of/property.c index d7fa75e31f22..5bed634551ea 100644 --- a/drivers/of/property.c +++ b/drivers/of/property.c @@ -872,6 +872,11 @@ of_fwnode_property_read_string_array(const struct fwnode_handle *fwnode, of_property_count_strings(node, propname); } +static const char *of_fwnode_get_name(const struct fwnode_handle *fwnode) +{ + return kbasename(to_of_node(fwnode)->full_name); +} + static struct fwnode_handle * of_fwnode_get_parent(const struct fwnode_handle *fwnode) { @@ -993,6 +998,7 @@ const struct fwnode_operations of_fwnode_ops = { .property_present = of_fwnode_property_present, .property_read_int_array = of_fwnode_property_read_int_array, .property_read_string_array = of_fwnode_property_read_string_array, + .get_name = of_fwnode_get_name, .get_parent = of_fwnode_get_parent, .get_next_child_node = of_fwnode_get_next_child_node, .get_named_child_node = of_fwnode_get_named_child_node, diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index ababd6bc82f3..2bbf55739a57 100644 
--- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -56,6 +56,7 @@ struct fwnode_reference_args { * otherwise. * @property_read_string_array: Read an array of string properties. Return zero * on success, a negative error code otherwise. + * @get_name: Return the name of an fwnode. * @get_parent: Return the parent of an fwnode. * @get_next_child_node: Return the next child node in an iteration. * @get_named_child_node: Return a child node with a given name. @@ -82,6 +83,7 @@ struct fwnode_operations { (*property_read_string_array)(const struct fwnode_handle *fwnode_handle, const char *propname, const char **val, size_t nval); + const char *(*get_name)(const struct fwnode_handle *fwnode); struct fwnode_handle *(*get_parent)(const struct fwnode_handle *fwnode); struct fwnode_handle * (*get_next_child_node)(const struct fwnode_handle *fwnode, diff --git a/include/linux/property.h b/include/linux/property.h index 9b3d4ca3a73a..d26276f91aa6 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -80,6 +80,7 @@ struct fwnode_handle *fwnode_find_reference(const struct fwnode_handle *fwnode, const char *name, unsigned int index); +const char *fwnode_get_name(const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_get_next_parent( struct fwnode_handle *fwnode); -- Gitee From ce65e27c402d9da1da805759848270ac0f6a0ff9 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 3 Oct 2019 15:32:13 +0300 Subject: [PATCH 0373/2161] device property: Add a function to obtain a node's prefix commit e7e242bccb209b5f73455b33928b8680cc6e3319 upstream. The prefix is used for printing purpose before a node, and it also works as a separator between two nodes. Signed-off-by: Sakari Ailus Acked-by: Rob Herring (for OF) Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/property.c | 22 ++++++++++++++++++++++ drivers/base/property.c | 12 ++++++++++++ drivers/base/swnode.c | 22 ++++++++++++++++++++++ drivers/of/property.c | 10 ++++++++++ include/linux/fwnode.h | 2 ++ include/linux/property.h | 1 + 6 files changed, 69 insertions(+) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index d48684090be9..1a8c5716ca18 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -1364,6 +1364,27 @@ static const char *acpi_fwnode_get_name(const struct fwnode_handle *fwnode) return acpi_device_bid(adev); } +static const char * +acpi_fwnode_get_name_prefix(const struct fwnode_handle *fwnode) +{ + struct fwnode_handle *parent; + + /* Is this the root node? */ + parent = fwnode_get_parent(fwnode); + if (!parent) + return ""; + + /* Is this 2nd node from the root? */ + parent = fwnode_get_next_parent(parent); + if (!parent) + return ""; + + fwnode_handle_put(parent); + + /* ACPI device or data node. 
*/ + return "."; +} + static struct fwnode_handle * acpi_fwnode_get_parent(struct fwnode_handle *fwnode) { @@ -1405,6 +1426,7 @@ acpi_fwnode_device_get_match_data(const struct fwnode_handle *fwnode, .get_next_child_node = acpi_get_next_subnode, \ .get_named_child_node = acpi_fwnode_get_named_child_node, \ .get_name = acpi_fwnode_get_name, \ + .get_name_prefix = acpi_fwnode_get_name_prefix, \ .get_reference_args = acpi_fwnode_get_reference_args, \ .graph_get_next_endpoint = \ acpi_graph_get_next_endpoint, \ diff --git a/drivers/base/property.c b/drivers/base/property.c index e3636fd6c3bd..98c6c4ea665c 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -588,6 +588,18 @@ const char *fwnode_get_name(const struct fwnode_handle *fwnode) return fwnode_call_ptr_op(fwnode, get_name); } +/** + * fwnode_get_name_prefix - Return the prefix of node for printing purposes + * @fwnode: The firmware node + * + * Returns the prefix of a node, intended to be printed right before the node. + * The prefix works also as a separator between the nodes. + */ +const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode) +{ + return fwnode_call_ptr_op(fwnode, get_name_prefix); +} + /** * fwnode_get_parent - Return parent firwmare node * @fwnode: Firmware whose parent is retrieved diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index 474fc67d584e..ac051019e50f 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -526,6 +526,27 @@ software_node_get_name(const struct fwnode_handle *fwnode) return kobject_name(&swnode->kobj); } +static const char * +software_node_get_name_prefix(const struct fwnode_handle *fwnode) +{ + struct fwnode_handle *parent; + const char *prefix; + + parent = fwnode_get_parent(fwnode); + if (!parent) + return ""; + + /* Figure out the prefix from the parents. */ + while (is_software_node(parent)) + parent = fwnode_get_next_parent(parent); + + prefix = fwnode_get_name_prefix(parent); + fwnode_handle_put(parent); + + /* Guess something if prefix was NULL. */ + return prefix ?: "/"; +} + static struct fwnode_handle * software_node_get_parent(const struct fwnode_handle *fwnode) { @@ -631,6 +652,7 @@ static const struct fwnode_operations software_node_ops = { .property_read_int_array = software_node_read_int_array, .property_read_string_array = software_node_read_string_array, .get_name = software_node_get_name, + .get_name_prefix = software_node_get_name_prefix, .get_parent = software_node_get_parent, .get_next_child_node = software_node_get_next_child, .get_named_child_node = software_node_get_named_child_node, diff --git a/drivers/of/property.c b/drivers/of/property.c index 5bed634551ea..e8202f61a5d9 100644 --- a/drivers/of/property.c +++ b/drivers/of/property.c @@ -877,6 +877,15 @@ static const char *of_fwnode_get_name(const struct fwnode_handle *fwnode) return kbasename(to_of_node(fwnode)->full_name); } +static const char *of_fwnode_get_name_prefix(const struct fwnode_handle *fwnode) +{ + /* Root needs no prefix here (its name is "/"). 
*/ + if (!to_of_node(fwnode)->parent) + return ""; + + return "/"; +} + static struct fwnode_handle * of_fwnode_get_parent(const struct fwnode_handle *fwnode) { @@ -999,6 +1008,7 @@ const struct fwnode_operations of_fwnode_ops = { .property_read_int_array = of_fwnode_property_read_int_array, .property_read_string_array = of_fwnode_property_read_string_array, .get_name = of_fwnode_get_name, + .get_name_prefix = of_fwnode_get_name_prefix, .get_parent = of_fwnode_get_parent, .get_next_child_node = of_fwnode_get_next_child_node, .get_named_child_node = of_fwnode_get_named_child_node, diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 2bbf55739a57..a5673c4674cf 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -57,6 +57,7 @@ struct fwnode_reference_args { * @property_read_string_array: Read an array of string properties. Return zero * on success, a negative error code otherwise. * @get_name: Return the name of an fwnode. + * @get_name_prefix: Get a prefix for a node (for printing purposes). * @get_parent: Return the parent of an fwnode. * @get_next_child_node: Return the next child node in an iteration. * @get_named_child_node: Return a child node with a given name. @@ -84,6 +85,7 @@ struct fwnode_operations { const char *propname, const char **val, size_t nval); const char *(*get_name)(const struct fwnode_handle *fwnode); + const char *(*get_name_prefix)(const struct fwnode_handle *fwnode); struct fwnode_handle *(*get_parent)(const struct fwnode_handle *fwnode); struct fwnode_handle * (*get_next_child_node)(const struct fwnode_handle *fwnode, diff --git a/include/linux/property.h b/include/linux/property.h index d26276f91aa6..1e4dc477c695 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -81,6 +81,7 @@ struct fwnode_handle *fwnode_find_reference(const struct fwnode_handle *fwnode, unsigned int index); const char *fwnode_get_name(const struct fwnode_handle *fwnode); +const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_get_next_parent( struct fwnode_handle *fwnode); -- Gitee From 50d15df7ca6ae65258c4ce231f32701310a93623 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:23 +0100 Subject: [PATCH 0374/2161] x86/apic: Add select() method on vector irqdomain commit 6452ea2a323b80868ce5e6d3030e4ccbeab9dc30 upstream. This will be used to select the irqdomain for I/O-APIC and HPET. 
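The select() hook itself is just another irq_domain_ops callback that returns non-zero when the domain agrees to parent a given fwspec. A minimal, hypothetical shape is shown below; the real x86 version in the next hunk additionally refuses when interrupt remapping is in use (detected via APIC IDs above 15 bits).

static int example_select(struct irq_domain *d, struct irq_fwspec *fwspec,
			  enum irq_domain_bus_token bus_token)
{
	/* Claim only single-cell fwspecs that point at our own fwnode. */
	return fwspec->param_count == 1 && fwspec->fwnode == d->fwnode;
}

static const struct irq_domain_ops example_domain_ops = {
	.select	= example_select,
	/* .alloc, .free, .activate, ... as usual */
};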
Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-24-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/irqdomain.h | 3 +++ arch/x86/kernel/apic/vector.c | 43 ++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index c066ffae222b..a565908d81bc 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -12,6 +12,9 @@ enum { X86_IRQ_ALLOC_LEGACY = 0x2, }; +extern int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec); +extern int x86_fwspec_is_hpet(struct irq_fwspec *fwspec); + extern struct irq_domain *x86_vector_domain; extern void init_irq_alloc_info(struct irq_alloc_info *info, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 33291daab1b4..a7ffebef580b 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -633,7 +633,50 @@ static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, } #endif +int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec) +{ + if (fwspec->param_count != 1) + return 0; + + if (is_fwnode_irqchip(fwspec->fwnode)) { + const char *fwname = fwnode_get_name(fwspec->fwnode); + return fwname && !strncmp(fwname, "IO-APIC-", 8) && + simple_strtol(fwname+8, NULL, 10) == fwspec->param[0]; + } + return to_of_node(fwspec->fwnode) && + of_device_is_compatible(to_of_node(fwspec->fwnode), + "intel,ce4100-ioapic"); +} + +int x86_fwspec_is_hpet(struct irq_fwspec *fwspec) +{ + if (fwspec->param_count != 1) + return 0; + + if (is_fwnode_irqchip(fwspec->fwnode)) { + const char *fwname = fwnode_get_name(fwspec->fwnode); + return fwname && !strncmp(fwname, "HPET-MSI-", 9) && + simple_strtol(fwname+9, NULL, 10) == fwspec->param[0]; + } + return 0; +} + +static int x86_vector_select(struct irq_domain *d, struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token) +{ + /* + * HPET and I/OAPIC cannot be parented in the vector domain + * if IRQ remapping is enabled. APIC IDs above 15 bits are + * only permitted if IRQ remapping is enabled, so check that. + */ + if (apic->apic_id_valid(32768)) + return 0; + + return x86_fwspec_is_ioapic(fwspec) || x86_fwspec_is_hpet(fwspec); +} + static const struct irq_domain_ops x86_vector_domain_ops = { + .select = x86_vector_select, .alloc = x86_vector_alloc_irqs, .free = x86_vector_free_irqs, .activate = x86_vector_activate, -- Gitee From 5b2ec546a5f7a71de19bcfdcf2fa31b2fa255da7 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:30 +0100 Subject: [PATCH 0375/2161] iommu/vt-d: Simplify intel_irq_remapping_select() commit 79eb3581bcaae9b5677629d945e14da212aa76e2 upstream. Now that the old get_irq_domain() method has gone, consolidate on just the map_XXX_to_iommu() functions. 
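Condensed, the consolidation in the diff below is just: translate the fwspec to an IOMMU, then compare domains. The wrapper name is hypothetical; both map_*_to_iommu() helpers are the VT-d-internal ones the patch touches.

static struct intel_iommu *example_fwspec_to_iommu(struct irq_fwspec *fwspec)
{
	if (x86_fwspec_is_ioapic(fwspec))
		return map_ioapic_to_iommu(fwspec->param[0]);
	if (x86_fwspec_is_hpet(fwspec))
		return map_hpet_to_iommu(fwspec->param[0]);
	return NULL;
}

intel_irq_remapping_select() then only has to check that the result is non-NULL and that d == iommu->ir_domain.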
Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-31-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/irq_remapping.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 86862f83a3fb..685200a5cff0 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -203,13 +203,13 @@ static int modify_irte(struct irq_2_iommu *irq_iommu, return rc; } -static struct irq_domain *map_hpet_to_ir(u8 hpet_id) +static struct intel_iommu *map_hpet_to_iommu(u8 hpet_id) { int i; for (i = 0; i < MAX_HPET_TBS; i++) { if (ir_hpet[i].id == hpet_id && ir_hpet[i].iommu) - return ir_hpet[i].iommu->ir_domain; + return ir_hpet[i].iommu; } return NULL; } @@ -225,13 +225,6 @@ static struct intel_iommu *map_ioapic_to_iommu(int apic) return NULL; } -static struct irq_domain *map_ioapic_to_ir(int apic) -{ - struct intel_iommu *iommu = map_ioapic_to_iommu(apic); - - return iommu ? iommu->ir_domain : NULL; -} - static struct irq_domain *map_dev_to_ir(struct pci_dev *dev) { struct dmar_drhd_unit *drhd = dmar_find_matched_drhd_unit(dev); @@ -1420,12 +1413,14 @@ static int intel_irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token) { + struct intel_iommu *iommu = NULL; + if (x86_fwspec_is_ioapic(fwspec)) - return d == map_ioapic_to_ir(fwspec->param[0]); + iommu = map_ioapic_to_iommu(fwspec->param[0]); else if (x86_fwspec_is_hpet(fwspec)) - return d == map_hpet_to_ir(fwspec->param[0]); + iommu = map_hpet_to_iommu(fwspec->param[0]); - return 0; + return iommu && d == iommu->ir_domain; } static const struct irq_domain_ops intel_ir_domain_ops = { -- Gitee From 35723d3e62ffbeb35c388c825d80af014494d798 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 26 Nov 2021 17:28:50 +0800 Subject: [PATCH 0376/2161] swiotlb: remove the tbl_dma_addr argument to swiotlb_tbl_map_single commit fc0021aa340af65a0a37d77be39e22aa886a6132 upstream. The tbl_dma_addr argument is used to check the DMA boundary for the allocations, and thus needs to be a dma_addr_t. swiotlb-xen instead passed a physical address, which could lead to incorrect results for strange offsets. Fix this by removing the parameter entirely and hard code the DMA address for io_tlb_start instead. Fixes: 91ffe4ad534a ("swiotlb-xen: introduce phys_to_dma/dma_to_phys translations") Signed-off-by: Christoph Hellwig Reviewed-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 5 ++--- include/linux/swiotlb.h | 10 +++------- kernel/dma/swiotlb.c | 15 ++++++--------- 3 files changed, 11 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 3901230bc011..b933d6c3a67e 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3818,9 +3818,8 @@ bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size, * page aligned, we don't need to use a bounce page. 
*/ if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) { - tlb_addr = swiotlb_tbl_map_single(dev, - phys_to_dma_unencrypted(dev, io_tlb_start), - paddr, size, aligned_size, dir, attrs); + tlb_addr = swiotlb_tbl_map_single(dev, paddr, size, + aligned_size, dir, attrs); if (tlb_addr == DMA_MAPPING_ERROR) { goto swiotlb_error; } else { diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 0a8fced6aaec..16361efb7ac9 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -45,13 +45,9 @@ enum dma_sync_target { SYNC_FOR_DEVICE = 1, }; -extern phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t phys, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs); +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs); extern void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index c2874f44f7ba..d53359c3f5dd 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -446,14 +446,11 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, } } -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t orig_addr, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs) +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs) { + dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start); unsigned long flags; phys_addr_t tlb_addr; unsigned int nslots, stride, index, wrap; @@ -675,8 +672,8 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, } /* Oh well, have to allocate and map a bounce buffer. */ - *phys = swiotlb_tbl_map_single(dev, phys_to_dma_unencrypted(dev, io_tlb_start), - *phys, size, size, dir, attrs); + *phys = swiotlb_tbl_map_single(dev, *phys, size, size, dir, + attrs); if (*phys == (phys_addr_t)DMA_MAPPING_ERROR) return false; -- Gitee From 8cebdb95ffcb036c27a502cc17083c164e952412 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 28 Oct 2020 15:07:25 +0800 Subject: [PATCH 0377/2161] iommu/vt-d: Fix kernel NULL pointer dereference in find_domain() commit 6097df457adfb67cb75ca700fd1085ede2e1201d upstream. If calling find_domain() for a device which hasn't been probed by the iommu core, below kernel NULL pointer dereference issue happens. 
[ 362.736947] BUG: kernel NULL pointer dereference, address: 0000000000000038 [ 362.743953] #PF: supervisor read access in kernel mode [ 362.749115] #PF: error_code(0x0000) - not-present page [ 362.754278] PGD 0 P4D 0 [ 362.756843] Oops: 0000 [#1] SMP NOPTI [ 362.760528] CPU: 0 PID: 844 Comm: cat Not tainted 5.9.0-rc4-intel-next+ #1 [ 362.767428] Hardware name: Intel Corporation Ice Lake Client Platform/IceLake U DDR4 SODIMM PD RVP TLC, BIOS ICLSFWR1.R00.3384.A02.1909200816 09/20/2019 [ 362.781109] RIP: 0010:find_domain+0xd/0x40 [ 362.785234] Code: 48 81 fb 60 28 d9 b2 75 de 5b 41 5c 41 5d 5d c3 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 8b 87 e0 02 00 00 55 <48> 8b 40 38 48 89 e5 48 83 f8 fe 0f 94 c1 48 85 ff 0f 94 c2 08 d1 [ 362.804041] RSP: 0018:ffffb09cc1f0bd38 EFLAGS: 00010046 [ 362.809292] RAX: 0000000000000000 RBX: ffff905b98e4fac8 RCX: 0000000000000000 [ 362.816452] RDX: 0000000000000001 RSI: ffff905b98e4fac8 RDI: ffff905b9ccd40d0 [ 362.823617] RBP: ffffb09cc1f0bda0 R08: ffffb09cc1f0bd48 R09: 000000000000000f [ 362.830778] R10: ffffffffb266c080 R11: ffff905b9042602d R12: ffff905b98e4fac8 [ 362.837944] R13: ffffb09cc1f0bd48 R14: ffff905b9ccd40d0 R15: ffff905b98e4fac8 [ 362.845108] FS: 00007f8485460740(0000) GS:ffff905b9fc00000(0000) knlGS:0000000000000000 [ 362.853227] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 362.858996] CR2: 0000000000000038 CR3: 00000004627a6003 CR4: 0000000000770ef0 [ 362.866161] PKRU: fffffffc [ 362.868890] Call Trace: [ 362.871363] ? show_device_domain_translation+0x32/0x100 [ 362.876700] ? bind_store+0x110/0x110 [ 362.880387] ? klist_next+0x91/0x120 [ 362.883987] ? domain_translation_struct_show+0x50/0x50 [ 362.889237] bus_for_each_dev+0x79/0xc0 [ 362.893121] domain_translation_struct_show+0x36/0x50 [ 362.898204] seq_read+0x135/0x410 [ 362.901545] ? handle_mm_fault+0xeb8/0x1750 [ 362.905755] full_proxy_read+0x5c/0x90 [ 362.909526] vfs_read+0xa6/0x190 [ 362.912782] ksys_read+0x61/0xe0 [ 362.916037] __x64_sys_read+0x1a/0x20 [ 362.919725] do_syscall_64+0x37/0x80 [ 362.923329] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 362.928405] RIP: 0033:0x7f84855c5e95 Filter out those devices to avoid such error. Fixes: e2726daea583d ("iommu/vt-d: debugfs: Add support to show page table internals") Reported-and-tested-by: Xu Pengfei Signed-off-by: Lu Baolu Cc: stable@vger.kernel.org#v5.6+ Link: https://lore.kernel.org/r/20201028070725.24979-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index b933d6c3a67e..49e3e4cc5b48 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2525,6 +2525,9 @@ struct dmar_domain *find_domain(struct device *dev) { struct device_domain_info *info; + if (unlikely(!dev || !dev->iommu)) + return NULL; + if (unlikely(attach_deferred(dev))) return NULL; -- Gitee From f10f44903b376e1eccc4ebf8139ac6075e48da0a Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Fri, 30 Oct 2020 10:37:23 +0800 Subject: [PATCH 0378/2161] iommu/vt-d: Fix sid not set issue in intel_svm_bind_gpasid() commit eea4e29ab8bef254b228d6e1e3de188087b2c7d0 upstream. Should get correct sid and set it into sdev. Because we execute 'sdev->sid != req->rid' in the loop of prq_event_thread(). 
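The fix that follows boils down to recording the PCI requester ID at bind time so that the comparison against req->rid in prq_event_thread() can ever succeed. A hypothetical stand-alone helper expressing the same computation, using the driver-internal get_domain_info() and the PCI_DEVID() macro exactly as the hunk does:

static int example_dev_to_sid(struct device *dev, u16 *sid)
{
	struct device_domain_info *info = get_domain_info(dev);

	if (!info)
		return -EINVAL;

	/* Requester ID: bus number in the high byte, devfn in the low byte. */
	*sid = PCI_DEVID(info->bus, info->devfn);
	return 0;
}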
Fixes: eb8d93ea3c1d ("iommu/vt-d: Report page request faults for guest SVA") Signed-off-by: Liu Yi L Signed-off-by: Yi Sun Acked-by: Lu Baolu Link: https://lore.kernel.org/r/1604025444-6954-2-git-send-email-yi.y.sun@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index cc867736b1a2..95e0fd625775 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -297,6 +297,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); struct intel_svm_dev *sdev = NULL; struct dmar_domain *dmar_domain; + struct device_domain_info *info; struct intel_svm *svm = NULL; int ret = 0; @@ -328,6 +329,10 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, if (data->hpasid <= 0 || data->hpasid >= PASID_MAX) return -EINVAL; + info = get_domain_info(dev); + if (!info) + return -EINVAL; + dmar_domain = to_dmar_domain(domain); mutex_lock(&pasid_mutex); @@ -375,6 +380,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, goto out; } sdev->dev = dev; + sdev->sid = PCI_DEVID(info->bus, info->devfn); /* Only count users if device has aux domains */ if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)) -- Gitee From 3cf57fe7c068cf3ff97ec56afa7aa0608dbb8931 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 Nov 2020 20:14:16 +0100 Subject: [PATCH 0379/2161] iommu/vt-d: Cure VF irqdomain hickup commit ff828729be446b86957f7c294068758231cd2183 upstream. The recent changes to store the MSI irqdomain pointer in struct device missed that Intel DMAR does not register virtual function devices. Due to that a VF device gets the plain PCI-MSI domain assigned and then issues compat MSI messages which get caught by the interrupt remapping unit. Cure that by inheriting the irq domain from the physical function device. Ideally the irqdomain would be associated to the bus, but DMAR can have multiple units and therefore irqdomains on a single bus. The VF 'bus' could of course inherit the domain from the PF, but that'd be yet another x86 oddity. Fixes: 85a8dfc57a0b ("iommm/vt-d: Store irq domain in struct device") Reported-by: Jason Gunthorpe Signed-off-by: Thomas Gleixner Acked-by: Lu Baolu Cc: Joerg Roedel Cc: Bjorn Helgaas Cc: Marc Zyngier Cc: David Woodhouse Link: https://lore.kernel.org/r/draft-87eekymlpz.fsf@nanos.tec.linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 404b40af31cb..b2e804473209 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -333,6 +333,11 @@ static void dmar_pci_bus_del_dev(struct dmar_pci_notify_info *info) dmar_iommu_notify_scope_dev(info); } +static inline void vf_inherit_msi_domain(struct pci_dev *pdev) +{ + dev_set_msi_domain(&pdev->dev, dev_get_msi_domain(&pdev->physfn->dev)); +} + static int dmar_pci_bus_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -342,8 +347,20 @@ static int dmar_pci_bus_notifier(struct notifier_block *nb, /* Only care about add/remove events for physical functions. * For VFs we actually do the lookup based on the corresponding * PF in device_to_iommu() anyway. 
*/ - if (pdev->is_virtfn) + if (pdev->is_virtfn) { + /* + * Ensure that the VF device inherits the irq domain of the + * PF device. Ideally the device would inherit the domain + * from the bus, but DMAR can have multiple units per bus + * which makes this impossible. The VF 'bus' could inherit + * from the PF device, but that's yet another x86'sism to + * inflict on everybody else. + */ + if (action == BUS_NOTIFY_ADD_DEVICE) + vf_inherit_msi_domain(pdev); return NOTIFY_DONE; + } + if (action != BUS_NOTIFY_ADD_DEVICE && action != BUS_NOTIFY_REMOVED_DEVICE) return NOTIFY_DONE; -- Gitee From 1901c4b3200a3b880c5dedc787fe7a4ffcbd31aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 Apr 2022 15:50:54 +0200 Subject: [PATCH 0380/2161] x86/pci/xen: Disable PCI/MSI[-X] masking for XEN_HVM guests commit 7e0815b3e09986d2fe651199363e135b9358132a upstream. When a XEN_HVM guest uses the XEN PIRQ/Eventchannel mechanism, then PCI/MSI[-X] masking is solely controlled by the hypervisor, but contrary to XEN_PV guests this does not disable PCI/MSI[-X] masking in the PCI/MSI layer. This can lead to a situation where the PCI/MSI layer masks an MSI[-X] interrupt and the hypervisor grants the write despite the fact that it already requested the interrupt. As a consequence interrupt delivery on the affected device is not happening ever. Set pci_msi_ignore_mask to prevent that like it's done for XEN_PV guests already. Fixes: 809f9267bbab ("xen: map MSIs into pirqs") Reported-by: Jeremi Piotrowski Reported-by: Dusty Mabe Reported-by: Salvatore Bonaccorso Signed-off-by: Thomas Gleixner Tested-by: Noah Meyerhans Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87tuaduxj5.ffs@tglx [nmeyerha@amazon.com: backported to 5.4] Signed-off-by: Noah Meyerhans Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- arch/x86/pci/xen.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 5c11ae66b5d8..9cf8f5417e7f 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -442,6 +442,11 @@ void __init xen_msi_init(void) x86_msi.setup_msi_irqs = xen_hvm_setup_msi_irqs; x86_msi.teardown_msi_irq = xen_teardown_msi_irq; + /* + * With XEN PIRQ/Eventchannels in use PCI/MSI[-X] masking is solely + * controlled by the hypervisor. + */ + pci_msi_ignore_mask = 1; } #endif -- Gitee From 8ebe65574190ba20b492304d499c96646680c13c Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 30 Mar 2022 16:09:41 +0800 Subject: [PATCH 0381/2161] x86/irq: Cleanup the arch_*_msi_irqs() leftovers commit opencloudos. Port from 7ca435cf857dd63d29d5e0b785807f6988788d2f. Get rid of all the gunk and remove the 'select PCI_MSI_ARCH_FALLBACK' from the x86 Kconfig so the weak functions in the PCI core are replaced by stubs which emit a warning, which ensures that any fail to set the irq domain pointer results in a warning when the device is used. 
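From a PCI driver's point of view this cleanup is invisible: vectors are still requested through the regular PCI/MSI API, and the irq-domain plumbing this series reworks does the routing underneath. A minimal, hypothetical usage sketch (device and vector counts are placeholders):

static int example_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int nvec;

	/* Ask for 1..4 vectors, preferring MSI-X, falling back to MSI. */
	nvec = pci_alloc_irq_vectors(pdev, 1, 4, PCI_IRQ_MSIX | PCI_IRQ_MSI);
	if (nvec < 0)
		return nvec;

	/* pci_irq_vector(pdev, i) then yields the Linux IRQ for vector i. */
	return 0;
}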
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200826112334.086003720@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/Kconfig | 1 + arch/x86/include/asm/pci.h | 11 ----------- arch/x86/include/asm/x86_init.h | 1 - arch/x86/kernel/apic/msi.c | 22 ---------------------- arch/x86/kernel/x86_init.c | 18 ------------------ arch/x86/pci/xen.c | 9 --------- 6 files changed, 1 insertion(+), 61 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3ece88cebd68..cd3525c44cc7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -215,6 +215,7 @@ config X86 select NEED_SG_DMA_LENGTH select PCI_DOMAINS if PCI select PCI_LOCKLESS_CONFIG if PCI + select PCI_MSI_ARCH_FALLBACKS select PERF_EVENTS select RTC_LIB select RTC_MC146818_LIB diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 394e0f708986..43e74b7ff65a 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -106,17 +106,6 @@ static inline void early_quirks(void) { } #endif extern void pci_iommu_alloc(void); - -#ifdef CONFIG_PCI_MSI -/* implemented in arch/x86/kernel/apic/io_apic. */ -struct msi_desc; -int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); -void native_teardown_msi_irq(unsigned int irq); -void native_restore_msi_irqs(struct pci_dev *dev); -#else -#define native_setup_msi_irqs NULL -#define native_teardown_msi_irq NULL -#endif #endif /* __KERNEL__ */ #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 96d9cd208610..4faf3d8625ba 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -286,7 +286,6 @@ struct pci_dev; struct x86_msi_ops { int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); - void (*teardown_msi_irq)(unsigned int irq); void (*teardown_msi_irqs)(struct pci_dev *dev); void (*restore_msi_irqs)(struct pci_dev *dev); }; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index ba9446de95ef..188166a0e77c 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -155,28 +155,6 @@ static struct irq_chip pci_msi_controller = { IRQCHIP_AFFINITY_PRE_STARTUP, }; -int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - struct irq_domain *domain; - struct irq_alloc_info info; - - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_PCI_MSI; - - domain = irq_remapping_get_irq_domain(&info); - if (domain == NULL) - domain = msi_default_domain; - if (domain == NULL) - return -ENOSYS; - - return msi_domain_alloc_irqs(domain, &dev->dev, nvec); -} - -void native_teardown_msi_irq(unsigned int irq) -{ - irq_domain_free_irqs(irq, 1); -} - static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info, msi_alloc_info_t *arg) { diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 1838b10a299c..9ef42c293326 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -127,28 +127,10 @@ EXPORT_SYMBOL_GPL(x86_platform); #if defined(CONFIG_PCI_MSI) struct x86_msi_ops x86_msi __ro_after_init = { - .setup_msi_irqs = native_setup_msi_irqs, - .teardown_msi_irq = native_teardown_msi_irq, - .teardown_msi_irqs = default_teardown_msi_irqs, .restore_msi_irqs = default_restore_msi_irqs, }; /* MSI arch specific hooks */ -int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - return x86_msi.setup_msi_irqs(dev, nvec, type); -} - -void arch_teardown_msi_irqs(struct pci_dev *dev) -{ - x86_msi.teardown_msi_irqs(dev); -} - -void 
arch_teardown_msi_irq(unsigned int irq) -{ - x86_msi.teardown_msi_irq(irq); -} - void arch_restore_msi_irqs(struct pci_dev *dev) { x86_msi.restore_msi_irqs(dev); diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 9cf8f5417e7f..0be4b0d99c7b 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -391,12 +391,6 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev) /* Free the IRQ's and the msidesc using the generic code. */ default_teardown_msi_irqs(dev); } - -static void xen_teardown_msi_irq(unsigned int irq) -{ - xen_destroy_irq(irq); -} - #endif int __init pci_xen_init(void) @@ -416,7 +410,6 @@ int __init pci_xen_init(void) #ifdef CONFIG_PCI_MSI x86_msi.setup_msi_irqs = xen_setup_msi_irqs; - x86_msi.teardown_msi_irq = xen_teardown_msi_irq; x86_msi.teardown_msi_irqs = xen_teardown_msi_irqs; pci_msi_ignore_mask = 1; #endif @@ -441,7 +434,6 @@ void __init xen_msi_init(void) } x86_msi.setup_msi_irqs = xen_hvm_setup_msi_irqs; - x86_msi.teardown_msi_irq = xen_teardown_msi_irq; /* * With XEN PIRQ/Eventchannels in use PCI/MSI[-X] masking is solely * controlled by the hypervisor. @@ -481,7 +473,6 @@ int __init pci_xen_initial_domain(void) #ifdef CONFIG_PCI_MSI x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs; - x86_msi.teardown_msi_irq = xen_teardown_msi_irq; x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs; pci_msi_ignore_mask = 1; #endif -- Gitee From 6315cfb8c7d51580a6b45dd997c19d9f05f9c4ae Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Sun, 15 Nov 2020 21:59:51 +0100 Subject: [PATCH 0382/2161] iommu/vt-d: include conditionally on CONFIG_INTEL_IOMMU_SVM commit 68dd9d89eaf56dfab8d46bf25610aa4650247617 upstream. Commit 6ee1b77ba3ac ("iommu/vt-d: Add svm/sva invalidate function") introduced intel_iommu_sva_invalidate() when CONFIG_INTEL_IOMMU_SVM. This function uses the dedicated static variable inv_type_granu_table and functions to_vtd_granularity() and to_vtd_size(). These parts are unused when !CONFIG_INTEL_IOMMU_SVM, and hence, make CC=clang W=1 warns with an -Wunused-function warning. Include these parts conditionally on CONFIG_INTEL_IOMMU_SVM. Fixes: 6ee1b77ba3ac ("iommu/vt-d: Add svm/sva invalidate function") Signed-off-by: Lukas Bulwahn Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20201115205951.20698-1-lukas.bulwahn@gmail.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 49e3e4cc5b48..5f79ce7a099f 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5390,6 +5390,7 @@ static void intel_iommu_aux_detach_device(struct iommu_domain *domain, aux_domain_remove_dev(to_dmar_domain(domain), dev); } +#ifdef CONFIG_INTEL_IOMMU_SVM /* * 2D array for converting and sanitizing IOMMU generic TLB granularity to * VT-d granularity. Invalidation is typically included in the unmap operation @@ -5436,7 +5437,6 @@ static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules) return order_base_2(nr_pages); } -#ifdef CONFIG_INTEL_IOMMU_SVM static int intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, struct iommu_cache_invalidate_info *inv_info) -- Gitee From 14ca704fff50a7fca263c69cb4aeecdffca67b1e Mon Sep 17 00:00:00 2001 From: Vijayanand Jitta Date: Wed, 30 Sep 2020 13:14:23 +0530 Subject: [PATCH 0383/2161] iommu/iova: Retry from last rb tree node if iova search fails commit 4e89dce725213d3d0b0475211b500eda4ef4bf2f upstream. 
When ever a new iova alloc request comes iova is always searched from the cached node and the nodes which are previous to cached node. So, even if there is free iova space available in the nodes which are next to the cached node iova allocation can still fail because of this approach. Consider the following sequence of iova alloc and frees on 1GB of iova space 1) alloc - 500MB 2) alloc - 12MB 3) alloc - 499MB 4) free - 12MB which was allocated in step 2 5) alloc - 13MB After the above sequence we will have 12MB of free iova space and cached node will be pointing to the iova pfn of last alloc of 13MB which will be the lowest iova pfn of that iova space. Now if we get an alloc request of 2MB we just search from cached node and then look for lower iova pfn's for free iova and as they aren't any, iova alloc fails though there is 12MB of free iova space. To avoid such iova search failures do a retry from the last rb tree node when iova search fails, this will search the entire tree and get an iova if its available. Signed-off-by: Vijayanand Jitta Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/1601451864-5956-1-git-send-email-vjitta@codeaurora.org Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iova.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 30d969a4c5fd..c3a1a8e6f2c1 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -184,8 +184,9 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad, struct rb_node *curr, *prev; struct iova *curr_iova; unsigned long flags; - unsigned long new_pfn; + unsigned long new_pfn, retry_pfn; unsigned long align_mask = ~0UL; + unsigned long high_pfn = limit_pfn, low_pfn = iovad->start_pfn; if (size_aligned) align_mask <<= fls_long(size - 1); @@ -198,15 +199,25 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad, curr = __get_cached_rbnode(iovad, limit_pfn); curr_iova = rb_entry(curr, struct iova, node); + retry_pfn = curr_iova->pfn_hi + 1; + +retry: do { - limit_pfn = min(limit_pfn, curr_iova->pfn_lo); - new_pfn = (limit_pfn - size) & align_mask; + high_pfn = min(high_pfn, curr_iova->pfn_lo); + new_pfn = (high_pfn - size) & align_mask; prev = curr; curr = rb_prev(curr); curr_iova = rb_entry(curr, struct iova, node); - } while (curr && new_pfn <= curr_iova->pfn_hi); - - if (limit_pfn < size || new_pfn < iovad->start_pfn) { + } while (curr && new_pfn <= curr_iova->pfn_hi && new_pfn >= low_pfn); + + if (high_pfn < size || new_pfn < low_pfn) { + if (low_pfn == iovad->start_pfn && retry_pfn < limit_pfn) { + high_pfn = limit_pfn; + low_pfn = retry_pfn; + curr = &iovad->anchor.node; + curr_iova = rb_entry(curr, struct iova, node); + goto retry; + } iovad->max32_alloc_size = size; goto iova32_full; } -- Gitee From 85fdccc191dc5992cdcd9db8393976fa9185a8d8 Mon Sep 17 00:00:00 2001 From: Vijayanand Jitta Date: Wed, 30 Sep 2020 13:14:24 +0530 Subject: [PATCH 0384/2161] iommu/iova: Free global iova rcache on iova alloc failure commit 6fa3525b455ae1fde5b424907141b33651f137b0 upstream. When ever an iova alloc request fails we free the iova ranges present in the percpu iova rcaches and then retry but the global iova rcache is not freed as a result we could still see iova alloc failure even after retry as global rcache is holding the iova's which can cause fragmentation. So, free the global iova rcache as well and then go for the retry. 
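Both IOVA patches here extend the failure path of alloc_iova_fast(); a typical (hypothetical) caller in a DMA-mapping layer that benefits from the retry looks roughly like this:

static dma_addr_t example_alloc_iova(struct iova_domain *iovad, size_t size,
				     u64 dma_limit)
{
	unsigned long shift = iova_shift(iovad);
	unsigned long iova_len = iova_align(iovad, size) >> shift;
	unsigned long pfn;

	/* flush_rcache == true enables the cache-flush-and-retry path. */
	pfn = alloc_iova_fast(iovad, iova_len, dma_limit >> shift, true);
	if (!pfn)
		return DMA_MAPPING_ERROR;

	return (dma_addr_t)pfn << shift;
}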
Signed-off-by: Vijayanand Jitta Reviewed-by: Robin Murphy Acked-by: John Garry Link: https://lore.kernel.org/r/1601451864-5956-2-git-send-email-vjitta@codeaurora.org Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iova.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index c3a1a8e6f2c1..ea04a88c673d 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -25,6 +25,7 @@ static void init_iova_rcaches(struct iova_domain *iovad); static void free_iova_rcaches(struct iova_domain *iovad); static void fq_destroy_all_entries(struct iova_domain *iovad); static void fq_flush_timeout(struct timer_list *t); +static void free_global_cached_iovas(struct iova_domain *iovad); void init_iova_domain(struct iova_domain *iovad, unsigned long granule, @@ -442,6 +443,7 @@ alloc_iova_fast(struct iova_domain *iovad, unsigned long size, flush_rcache = false; for_each_online_cpu(cpu) free_cpu_cached_iovas(cpu, iovad); + free_global_cached_iovas(iovad); goto retry; } @@ -1057,5 +1059,25 @@ void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad) } } +/* + * free all the IOVA ranges of global cache + */ +static void free_global_cached_iovas(struct iova_domain *iovad) +{ + struct iova_rcache *rcache; + unsigned long flags; + int i, j; + + for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) { + rcache = &iovad->rcaches[i]; + spin_lock_irqsave(&rcache->lock, flags); + for (j = 0; j < rcache->depot_size; ++j) { + iova_magazine_free_pfns(rcache->depot[j], iovad); + iova_magazine_free(rcache->depot[j]); + } + rcache->depot_size = 0; + spin_unlock_irqrestore(&rcache->lock, flags); + } +} MODULE_AUTHOR("Anil S Keshavamurthy "); MODULE_LICENSE("GPL"); -- Gitee From 5d85cab60a47569cb67bc22e956b8bc4a3aa2791 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 19 Nov 2020 13:51:19 +0800 Subject: [PATCH 0385/2161] iommu/vt-d: Fix compile error with CONFIG_PCI_ATS not set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3645a34f5b962aeedeb02f30cdf048eaae9b5f5c upstream. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the compile error below (CONFIG_PCI_ATS not set): drivers/iommu/intel/dmar.c: In function ‘vf_inherit_msi_domain’: drivers/iommu/intel/dmar.c:338:59: error: ‘struct pci_dev’ has no member named ‘physfn’; did you mean ‘is_physfn’? 
338 | dev_set_msi_domain(&pdev->dev, dev_get_msi_domain(&pdev->physfn->dev)); | ^~~~~~ | is_physfn Fixes: ff828729be44 ("iommu/vt-d: Cure VF irqdomain hickup") Reported-by: Geert Uytterhoeven Signed-off-by: Lu Baolu Cc: Thomas Gleixner Link: https://lore.kernel.org/linux-iommu/CAMuHMdXA7wfJovmfSH2nbAhN0cPyCiFHodTvg4a8Hm9rx5Dj-w@mail.gmail.com/ Link: https://lore.kernel.org/r/20201119055119.2862701-1-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index b2e804473209..11319e4dce4a 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -335,7 +335,9 @@ static void dmar_pci_bus_del_dev(struct dmar_pci_notify_info *info) static inline void vf_inherit_msi_domain(struct pci_dev *pdev) { - dev_set_msi_domain(&pdev->dev, dev_get_msi_domain(&pdev->physfn->dev)); + struct pci_dev *physfn = pci_physfn(pdev); + + dev_set_msi_domain(&pdev->dev, dev_get_msi_domain(&physfn->dev)); } static int dmar_pci_bus_notifier(struct notifier_block *nb, -- Gitee From e63d5ebb52795e7e4d6b0b48adb006c0c5d93e31 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Fri, 6 Nov 2020 16:50:47 +0100 Subject: [PATCH 0386/2161] iommu/ioasid: Add ioasid references commit cb4789b0d19ff231ce9f73376a023341300aed96 upstream. Let IOASID users take references to existing ioasids with ioasid_get(). ioasid_put() drops a reference and only frees the ioasid when its reference number is zero. It returns true if the ioasid was freed. For drivers that don't call ioasid_get(), ioasid_put() is the same as ioasid_free(). Signed-off-by: Jean-Philippe Brucker Reviewed-by: Eric Auger Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20201106155048.997886-2-jean-philippe@linaro.org Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 4 ++-- drivers/iommu/intel/svm.c | 6 +++--- drivers/iommu/ioasid.c | 38 +++++++++++++++++++++++++++++++++---- include/linux/ioasid.h | 10 ++++++++-- 4 files changed, 47 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 5f79ce7a099f..9320f7887d10 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5201,7 +5201,7 @@ static void auxiliary_unlink_device(struct dmar_domain *domain, domain->auxd_refcnt--; if (!domain->auxd_refcnt && domain->default_pasid > 0) - ioasid_free(domain->default_pasid); + ioasid_put(domain->default_pasid); } static int aux_domain_add_dev(struct dmar_domain *domain, @@ -5262,7 +5262,7 @@ static int aux_domain_add_dev(struct dmar_domain *domain, spin_unlock(&iommu->lock); spin_unlock_irqrestore(&device_domain_lock, flags); if (!domain->auxd_refcnt && domain->default_pasid > 0) - ioasid_free(domain->default_pasid); + ioasid_put(domain->default_pasid); return ret; } diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 95e0fd625775..ae85522d3db1 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -616,7 +616,7 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, if (mm) { ret = mmu_notifier_register(&svm->notifier, mm); if (ret) { - ioasid_free(svm->pasid); + ioasid_put(svm->pasid); kfree(svm); kfree(sdev); goto out; @@ -634,7 +634,7 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, if (ret) { if (mm) mmu_notifier_unregister(&svm->notifier, mm); - 
ioasid_free(svm->pasid); + ioasid_put(svm->pasid); kfree(svm); kfree(sdev); goto out; @@ -707,7 +707,7 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { - ioasid_free(svm->pasid); + ioasid_put(svm->pasid); if (svm->mm) { mmu_notifier_unregister(&svm->notifier, svm->mm); /* Clear mm's pasid. */ diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 0f8dd377aada..50ee27bbd04e 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -2,7 +2,7 @@ /* * I/O Address Space ID allocator. There is one global IOASID space, split into * subsets. Users create a subset with DECLARE_IOASID_SET, then allocate and - * free IOASIDs with ioasid_alloc and ioasid_free. + * free IOASIDs with ioasid_alloc and ioasid_put. */ #include #include @@ -15,6 +15,7 @@ struct ioasid_data { struct ioasid_set *set; void *private; struct rcu_head rcu; + refcount_t refs; }; /* @@ -314,6 +315,7 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, data->set = set; data->private = private; + refcount_set(&data->refs, 1); /* * Custom allocator needs allocator data to perform platform specific @@ -346,11 +348,34 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, EXPORT_SYMBOL_GPL(ioasid_alloc); /** - * ioasid_free - Free an IOASID + * ioasid_get - obtain a reference to the IOASID + */ +void ioasid_get(ioasid_t ioasid) +{ + struct ioasid_data *ioasid_data; + + spin_lock(&ioasid_allocator_lock); + ioasid_data = xa_load(&active_allocator->xa, ioasid); + if (ioasid_data) + refcount_inc(&ioasid_data->refs); + else + WARN_ON(1); + spin_unlock(&ioasid_allocator_lock); +} +EXPORT_SYMBOL_GPL(ioasid_get); + +/** + * ioasid_put - Release a reference to an ioasid * @ioasid: the ID to remove + * + * Put a reference to the IOASID, free it when the number of references drops to + * zero. + * + * Return: %true if the IOASID was freed, %false otherwise. 
*/ -void ioasid_free(ioasid_t ioasid) +bool ioasid_put(ioasid_t ioasid) { + bool free = false; struct ioasid_data *ioasid_data; spin_lock(&ioasid_allocator_lock); @@ -360,6 +385,10 @@ void ioasid_free(ioasid_t ioasid) goto exit_unlock; } + free = refcount_dec_and_test(&ioasid_data->refs); + if (!free) + goto exit_unlock; + active_allocator->ops->free(ioasid, active_allocator->ops->pdata); /* Custom allocator needs additional steps to free the xa element */ if (active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) { @@ -369,8 +398,9 @@ void ioasid_free(ioasid_t ioasid) exit_unlock: spin_unlock(&ioasid_allocator_lock); + return free; } -EXPORT_SYMBOL_GPL(ioasid_free); +EXPORT_SYMBOL_GPL(ioasid_put); /** * ioasid_find - Find IOASID data diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index 6f000d7a0ddc..e9dacd4b9f6b 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -34,7 +34,8 @@ struct ioasid_allocator_ops { #if IS_ENABLED(CONFIG_IOASID) ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, void *private); -void ioasid_free(ioasid_t ioasid); +void ioasid_get(ioasid_t ioasid); +bool ioasid_put(ioasid_t ioasid); void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, bool (*getter)(void *)); int ioasid_register_allocator(struct ioasid_allocator_ops *allocator); @@ -48,10 +49,15 @@ static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, return INVALID_IOASID; } -static inline void ioasid_free(ioasid_t ioasid) +static inline void ioasid_get(ioasid_t ioasid) { } +static inline bool ioasid_put(ioasid_t ioasid) +{ + return false; +} + static inline void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, bool (*getter)(void *)) { -- Gitee From feb64998dbea8c79ed002ad63d42d8ccfad34960 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Fri, 6 Nov 2020 16:50:48 +0100 Subject: [PATCH 0387/2161] iommu/sva: Add PASID helpers commit cfc78dfd9b36dcda7c3ca9cdfca343f84c72252f upstream. Let IOMMU drivers allocate a single PASID per mm. Store the mm in the IOASID set to allow refcounting and searching mm by PASID, when handling an I/O page fault. 
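For reference, a minimal usage sketch of the PASID helpers this patch adds, as an IOMMU driver's SVA bind/unbind and I/O page fault paths might call them. The driver-side function names and the PASID range below are illustrative assumptions, not taken from any in-tree driver; only iommu_sva_alloc_pasid(), iommu_sva_free_pasid() and iommu_sva_find() come from the patch itself (they in turn use the ioasid_get()/ioasid_put() references introduced two patches earlier).

#include <linux/err.h>
#include <linux/sched/mm.h>
#include "iommu-sva-lib.h"

/* Illustrative driver-side consumers of the new helpers. */
static int example_sva_bind(struct device *dev, struct mm_struct *mm)
{
        int ret;

        /* Allocate the mm's PASID, or take a reference to an existing one. */
        ret = iommu_sva_alloc_pasid(mm, 1, (1U << 20) - 1);
        if (ret)
                return ret;

        /* ... program the PASID table entry for @dev with mm->pasid ... */
        return 0;
}

static void example_sva_unbind(struct mm_struct *mm)
{
        /* ... tear down the PASID table entry ... */
        iommu_sva_free_pasid(mm);       /* drops the reference taken above */
}

static void example_iopf_handler(ioasid_t pasid)
{
        struct mm_struct *mm = iommu_sva_find(pasid);

        if (IS_ERR_OR_NULL(mm))
                return;
        /* ... resolve the fault against @mm ... */
        mmput(mm);                      /* iommu_sva_find() took a reference */
}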
Signed-off-by: Jean-Philippe Brucker Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20201106155048.997886-3-jean-philippe@linaro.org Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/Kconfig | 5 ++ drivers/iommu/Makefile | 1 + drivers/iommu/iommu-sva-lib.c | 86 +++++++++++++++++++++++++++++++++++ drivers/iommu/iommu-sva-lib.h | 15 ++++++ 4 files changed, 107 insertions(+) create mode 100644 drivers/iommu/iommu-sva-lib.c create mode 100644 drivers/iommu/iommu-sva-lib.h diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index eea7d376162b..5230bfeaa032 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -102,6 +102,11 @@ config IOMMU_DMA select IRQ_MSI_IOMMU select NEED_SG_DMA_LENGTH +# Shared Virtual Addressing library +config IOMMU_SVA_LIB + bool + select IOASID + config FSL_PAMU bool "Freescale IOMMU support" depends on PCI diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 3ba27a3a6bef..5bf5fca35184 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -32,3 +32,4 @@ obj-$(CONFIG_S390_IOMMU) += s390-iommu.o obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o +obj-$(CONFIG_IOMMU_SVA_LIB) += iommu-sva-lib.o diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva-lib.c new file mode 100644 index 000000000000..bd41405d34e9 --- /dev/null +++ b/drivers/iommu/iommu-sva-lib.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helpers for IOMMU drivers implementing SVA + */ +#include +#include + +#include "iommu-sva-lib.h" + +static DEFINE_MUTEX(iommu_sva_lock); +static DECLARE_IOASID_SET(iommu_sva_pasid); + +/** + * iommu_sva_alloc_pasid - Allocate a PASID for the mm + * @mm: the mm + * @min: minimum PASID value (inclusive) + * @max: maximum PASID value (inclusive) + * + * Try to allocate a PASID for this mm, or take a reference to the existing one + * provided it fits within the [@min, @max] range. On success the PASID is + * available in mm->pasid, and must be released with iommu_sva_free_pasid(). + * @min must be greater than 0, because 0 indicates an unused mm->pasid. + * + * Returns 0 on success and < 0 on error. 
+ */ +int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max) +{ + int ret = 0; + ioasid_t pasid; + + if (min == INVALID_IOASID || max == INVALID_IOASID || + min == 0 || max < min) + return -EINVAL; + + mutex_lock(&iommu_sva_lock); + if (mm->pasid) { + if (mm->pasid >= min && mm->pasid <= max) + ioasid_get(mm->pasid); + else + ret = -EOVERFLOW; + } else { + pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm); + if (pasid == INVALID_IOASID) + ret = -ENOMEM; + else + mm->pasid = pasid; + } + mutex_unlock(&iommu_sva_lock); + return ret; +} +EXPORT_SYMBOL_GPL(iommu_sva_alloc_pasid); + +/** + * iommu_sva_free_pasid - Release the mm's PASID + * @mm: the mm + * + * Drop one reference to a PASID allocated with iommu_sva_alloc_pasid() + */ +void iommu_sva_free_pasid(struct mm_struct *mm) +{ + mutex_lock(&iommu_sva_lock); + if (ioasid_put(mm->pasid)) + mm->pasid = 0; + mutex_unlock(&iommu_sva_lock); +} +EXPORT_SYMBOL_GPL(iommu_sva_free_pasid); + +/* ioasid_find getter() requires a void * argument */ +static bool __mmget_not_zero(void *mm) +{ + return mmget_not_zero(mm); +} + +/** + * iommu_sva_find() - Find mm associated to the given PASID + * @pasid: Process Address Space ID assigned to the mm + * + * On success a reference to the mm is taken, and must be released with mmput(). + * + * Returns the mm corresponding to this PASID, or an error if not found. + */ +struct mm_struct *iommu_sva_find(ioasid_t pasid) +{ + return ioasid_find(&iommu_sva_pasid, pasid, __mmget_not_zero); +} +EXPORT_SYMBOL_GPL(iommu_sva_find); diff --git a/drivers/iommu/iommu-sva-lib.h b/drivers/iommu/iommu-sva-lib.h new file mode 100644 index 000000000000..b40990aef3fd --- /dev/null +++ b/drivers/iommu/iommu-sva-lib.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * SVA library for IOMMU drivers + */ +#ifndef _IOMMU_SVA_LIB_H +#define _IOMMU_SVA_LIB_H + +#include +#include + +int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max); +void iommu_sva_free_pasid(struct mm_struct *mm); +struct mm_struct *iommu_sva_find(ioasid_t pasid); + +#endif /* _IOMMU_SVA_LIB_H */ -- Gitee From 09ceadd078bda48ddfbbaf8fb852b7b9d6aa6dce Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Thu, 19 Nov 2020 16:58:46 +0000 Subject: [PATCH 0388/2161] iommu: Check return of __iommu_attach_device() commit 77c38c8cf52ef715bfc5cab3d14222d4f3e776e2 upstream. Currently iommu_create_device_direct_mappings() is called without checking the return of __iommu_attach_device(). This may result in failures in iommu driver if dev attach returns error. 
Fixes: ce574c27ae27 ("iommu: Move iommu_group_create_direct_mappings() out of iommu_group_add_device()") Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20201119165846.34180-1-shameerali.kolothum.thodi@huawei.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 78d89f479e8e..e2afcba94351 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -264,16 +264,18 @@ int iommu_probe_device(struct device *dev) */ iommu_alloc_default_domain(group, dev); - if (group->default_domain) + if (group->default_domain) { ret = __iommu_attach_device(group->default_domain, dev); + if (ret) { + iommu_group_put(group); + goto err_release; + } + } iommu_create_device_direct_mappings(group, dev); iommu_group_put(group); - if (ret) - goto err_release; - if (ops->probe_finalize) ops->probe_finalize(dev); -- Gitee From c18bd9d4499d79a82d692721ec0d063a473d6286 Mon Sep 17 00:00:00 2001 From: Tom Murphy Date: Tue, 24 Nov 2020 16:20:51 +0800 Subject: [PATCH 0389/2161] iommu: Handle freelists when using deferred flushing in iommu drivers commit 2a2b8eaa5b25668a6f717f94b55f4e3aaf87629d upstream. Allow the iommu_unmap_fast to return newly freed page table pages and pass the freelist to queue_iova in the dma-iommu ops path. This is useful for iommu drivers (in this case the intel iommu driver) which need to wait for the ioTLB to be flushed before newly free/unmapped page table pages can be freed. This way we can still batch ioTLB free operations and handle the freelists. Signed-off-by: Tom Murphy Signed-off-by: Lu Baolu Tested-by: Logan Gunthorpe Link: https://lore.kernel.org/r/20201124082057.2614359-2-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 55 ++++++++++++++++++++++++------------- include/linux/iommu.h | 1 + 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 9320f7887d10..e470ae15c306 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1243,17 +1243,17 @@ static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level, pages can only be freed after the IOTLB flush has been done. 
*/ static struct page *domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, - unsigned long last_pfn) + unsigned long last_pfn, + struct page *freelist) { - struct page *freelist; - BUG_ON(!domain_pfn_supported(domain, start_pfn)); BUG_ON(!domain_pfn_supported(domain, last_pfn)); BUG_ON(start_pfn > last_pfn); /* we don't need lock here; nobody else touches the iova range */ freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw), - domain->pgd, 0, start_pfn, last_pfn, NULL); + domain->pgd, 0, start_pfn, last_pfn, + freelist); /* free pgd */ if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { @@ -2011,7 +2011,8 @@ static void domain_exit(struct dmar_domain *domain) if (domain->pgd) { struct page *freelist; - freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); + freelist = domain_unmap(domain, 0, + DOMAIN_MAX_PFN(domain->gaw), NULL); dma_free_pagelist(freelist); } @@ -3575,7 +3576,7 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size) if (dev_is_pci(dev)) pdev = to_pci_dev(dev); - freelist = domain_unmap(domain, start_pfn, last_pfn); + freelist = domain_unmap(domain, start_pfn, last_pfn, NULL); if (intel_iommu_strict || (pdev && pdev->untrusted) || !has_iova_flush_queue(&domain->iovad)) { iommu_flush_iotlb_psi(iommu, domain, start_pfn, @@ -4638,7 +4639,8 @@ static int intel_iommu_memory_notifier(struct notifier_block *nb, struct page *freelist; freelist = domain_unmap(si_domain, - start_vpfn, last_vpfn); + start_vpfn, last_vpfn, + NULL); rcu_read_lock(); for_each_active_iommu(iommu, drhd) @@ -5612,10 +5614,8 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct page *freelist = NULL; unsigned long start_pfn, last_pfn; - unsigned int npages; - int iommu_id, level = 0; + int level = 0; /* Cope with horrid API which requires us to unmap more than the size argument if it happens to be a large-page mapping. 
*/ @@ -5627,22 +5627,38 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain, start_pfn = iova >> VTD_PAGE_SHIFT; last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; - freelist = domain_unmap(dmar_domain, start_pfn, last_pfn); - - npages = last_pfn - start_pfn + 1; - - for_each_domain_iommu(iommu_id, dmar_domain) - iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, - start_pfn, npages, !freelist, 0); - - dma_free_pagelist(freelist); + gather->freelist = domain_unmap(dmar_domain, start_pfn, + last_pfn, gather->freelist); if (dmar_domain->max_addr == iova + size) dmar_domain->max_addr = iova; + iommu_iotlb_gather_add_page(domain, gather, iova, size); + return size; } +static void intel_iommu_tlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + unsigned long iova_pfn = IOVA_PFN(gather->start); + size_t size = gather->end - gather->start; + unsigned long start_pfn, last_pfn; + unsigned long nrpages; + int iommu_id; + + nrpages = aligned_nrpages(gather->start, size); + start_pfn = mm_to_dma_pfn(iova_pfn); + last_pfn = start_pfn + nrpages - 1; + + for_each_domain_iommu(iommu_id, dmar_domain) + iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, + start_pfn, nrpages, !gather->freelist, 0); + + dma_free_pagelist(gather->freelist); +} + static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { @@ -6102,6 +6118,7 @@ const struct iommu_ops intel_iommu_ops = { .aux_get_pasid = intel_iommu_aux_get_pasid, .map = intel_iommu_map, .unmap = intel_iommu_unmap, + .iotlb_sync = intel_iommu_tlb_sync, .iova_to_phys = intel_iommu_iova_to_phys, .probe_device = intel_iommu_probe_device, .probe_finalize = intel_iommu_probe_finalize, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 6c2ea8c8fee4..ada074d73237 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -186,6 +186,7 @@ struct iommu_iotlb_gather { unsigned long start; unsigned long end; size_t pgsize; + struct page *freelist; }; /** -- Gitee From 1beedfdec89fc40c947f1ea7446532eef258b3a2 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 24 Nov 2020 16:20:55 +0800 Subject: [PATCH 0390/2161] iommu/vt-d: Update domain geometry in iommu_ops.at(de)tach_dev commit c062db039f40e868c371c36afe8d0fac64305b5d upstream. The iommu-dma constrains IOVA allocation based on the domain geometry that the driver reports. Update domain geometry everytime a domain is attached to or detached from a device. Signed-off-by: Lu Baolu Tested-by: Logan Gunthorpe Link: https://lore.kernel.org/r/20201124082057.2614359-6-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e470ae15c306..2f52faa86a65 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -67,8 +67,8 @@ #define MAX_AGAW_WIDTH 64 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) -#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1) -#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1) +#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) +#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR to match. That way, we can use 'unsigned long' for PFNs with impunity. 
*/ @@ -739,6 +739,18 @@ static void domain_update_iommu_cap(struct dmar_domain *domain) */ if (domain->nid == NUMA_NO_NODE) domain->nid = domain_update_device_node(domain); + + /* + * First-level translation restricts the input-address to a + * canonical address (i.e., address bits 63:N have the same + * value as address bit [N-1], where N is 48-bits with 4-level + * paging and 57-bits with 5-level paging). Hence, skip bit + * [N-1]. + */ + if (domain_use_first_level(domain)) + domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); + else + domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); } struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, -- Gitee From 81abb8dbc35aeb524f5edbac04523cc93e7785d8 Mon Sep 17 00:00:00 2001 From: Tom Murphy Date: Tue, 24 Nov 2020 16:20:56 +0800 Subject: [PATCH 0391/2161] iommu/vt-d: Convert intel iommu driver to the iommu ops commit c588072bba6b54b4b946485228b0409f23cd68a6 upstream. Convert the intel iommu driver to the dma-iommu api. Remove the iova handling and reserve region code from the intel iommu driver. Signed-off-by: Tom Murphy Signed-off-by: Lu Baolu Tested-by: Logan Gunthorpe Link: https://lore.kernel.org/r/20201124082057.2614359-7-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/Kconfig | 1 + drivers/iommu/intel/iommu.c | 741 ++---------------------------------- 2 files changed, 43 insertions(+), 699 deletions(-) diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 877beec9d987..affb0d53f22a 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -12,6 +12,7 @@ config INTEL_IOMMU select DMAR_TABLE select SWIOTLB select IOASID + select IOMMU_DMA help DMA remapping (DMAR) devices support enables independent address translations for Direct Memory Access (DMA) from devices. diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2f52faa86a65..658803127add 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -41,7 +42,6 @@ #include #include #include -#include #include #include #include @@ -382,9 +382,6 @@ struct device_domain_info *get_domain_info(struct device *dev) DEFINE_SPINLOCK(device_domain_lock); static LIST_HEAD(device_domain_list); -#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \ - to_pci_dev(d)->untrusted) - /* * Iterate over elements in device_domain_list and call the specified * callback @fn against each element. 
@@ -1289,13 +1286,6 @@ static void dma_free_pagelist(struct page *freelist) } } -static void iova_entry_free(unsigned long data) -{ - struct page *freelist = (struct page *)data; - - dma_free_pagelist(freelist); -} - /* iommu handling */ static int iommu_alloc_root_entry(struct intel_iommu *iommu) { @@ -1660,19 +1650,17 @@ static inline void __mapping_notify_one(struct intel_iommu *iommu, iommu_flush_write_buffer(iommu); } -static void iommu_flush_iova(struct iova_domain *iovad) +static void intel_flush_iotlb_all(struct iommu_domain *domain) { - struct dmar_domain *domain; + struct dmar_domain *dmar_domain = to_dmar_domain(domain); int idx; - domain = container_of(iovad, struct dmar_domain, iovad); - - for_each_domain_iommu(idx, domain) { + for_each_domain_iommu(idx, dmar_domain) { struct intel_iommu *iommu = g_iommus[idx]; - u16 did = domain->iommu_did[iommu->seq_id]; + u16 did = dmar_domain->iommu_did[iommu->seq_id]; - if (domain_use_first_level(domain)) - domain_flush_piotlb(iommu, domain, 0, -1, 0); + if (domain_use_first_level(dmar_domain)) + domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0); else iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); @@ -1954,48 +1942,6 @@ static int domain_detach_iommu(struct dmar_domain *domain, return count; } -static struct iova_domain reserved_iova_list; -static struct lock_class_key reserved_rbtree_key; - -static int dmar_init_reserved_ranges(void) -{ - struct pci_dev *pdev = NULL; - struct iova *iova; - int i; - - init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN); - - lockdep_set_class(&reserved_iova_list.iova_rbtree_lock, - &reserved_rbtree_key); - - /* IOAPIC ranges shouldn't be accessed by DMA */ - iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START), - IOVA_PFN(IOAPIC_RANGE_END)); - if (!iova) { - pr_err("Reserve IOAPIC range failed\n"); - return -ENODEV; - } - - /* Reserve all PCI MMIO to avoid peer-to-peer access */ - for_each_pci_dev(pdev) { - struct resource *r; - - for (i = 0; i < PCI_NUM_RESOURCES; i++) { - r = &pdev->resource[i]; - if (!r->flags || !(r->flags & IORESOURCE_MEM)) - continue; - iova = reserve_iova(&reserved_iova_list, - IOVA_PFN(r->start), - IOVA_PFN(r->end)); - if (!iova) { - pci_err(pdev, "Reserve iova for %pR failed\n", r); - return -ENODEV; - } - } - } - return 0; -} - static inline int guestwidth_to_adjustwidth(int gaw) { int agaw; @@ -2018,7 +1964,7 @@ static void domain_exit(struct dmar_domain *domain) /* destroy iovas */ if (domain->domain.type == IOMMU_DOMAIN_DMA) - put_iova_domain(&domain->iovad); + iommu_put_dma_cookie(&domain->domain); if (domain->pgd) { struct page *freelist; @@ -2552,16 +2498,6 @@ struct dmar_domain *find_domain(struct device *dev) return NULL; } -static void do_deferred_attach(struct device *dev) -{ - struct iommu_domain *domain; - - dev_iommu_priv_set(dev, NULL); - domain = iommu_get_domain_for_dev(dev); - if (domain) - intel_iommu_attach_device(domain, dev); -} - static inline struct device_domain_info * dmar_search_domain_by_dev_info(int segment, int bus, int devfn) { @@ -3439,590 +3375,6 @@ static int __init init_dmars(void) return ret; } -/* This takes a number of _MM_ pages, not VTD pages */ -static unsigned long intel_alloc_iova(struct device *dev, - struct dmar_domain *domain, - unsigned long nrpages, uint64_t dma_mask) -{ - unsigned long iova_pfn; - - /* - * Restrict dma_mask to the width that the iommu can handle. 
- * First-level translation restricts the input-address to a - * canonical address (i.e., address bits 63:N have the same - * value as address bit [N-1], where N is 48-bits with 4-level - * paging and 57-bits with 5-level paging). Hence, skip bit - * [N-1]. - */ - if (domain_use_first_level(domain)) - dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1), - dma_mask); - else - dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), - dma_mask); - - /* Ensure we reserve the whole size-aligned region */ - nrpages = __roundup_pow_of_two(nrpages); - - if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) { - /* - * First try to allocate an io virtual address in - * DMA_BIT_MASK(32) and if that fails then try allocating - * from higher range - */ - iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, - IOVA_PFN(DMA_BIT_MASK(32)), false); - if (iova_pfn) - return iova_pfn; - } - iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, - IOVA_PFN(dma_mask), true); - if (unlikely(!iova_pfn)) { - dev_err_once(dev, "Allocating %ld-page iova failed\n", - nrpages); - return 0; - } - - return iova_pfn; -} - -static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, - size_t size, int dir, u64 dma_mask) -{ - struct dmar_domain *domain; - phys_addr_t start_paddr; - unsigned long iova_pfn; - int prot = 0; - int ret; - struct intel_iommu *iommu; - unsigned long paddr_pfn = paddr >> PAGE_SHIFT; - - BUG_ON(dir == DMA_NONE); - - if (unlikely(attach_deferred(dev))) - do_deferred_attach(dev); - - domain = find_domain(dev); - if (!domain) - return DMA_MAPPING_ERROR; - - iommu = domain_get_iommu(domain); - size = aligned_nrpages(paddr, size); - - iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask); - if (!iova_pfn) - goto error; - - /* - * Check if DMAR supports zero-length reads on write only - * mappings.. - */ - if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \ - !cap_zlr(iommu->cap)) - prot |= DMA_PTE_READ; - if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) - prot |= DMA_PTE_WRITE; - /* - * paddr - (paddr + size) might be partial page, we should map the whole - * page. 
Note: if two part of one page are separately mapped, we - * might have two guest_addr mapping to the same host paddr, but this - * is not a big problem - */ - ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn), - mm_to_dma_pfn(paddr_pfn), size, prot); - if (ret) - goto error; - - start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT; - start_paddr += paddr & ~PAGE_MASK; - - trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT); - - return start_paddr; - -error: - if (iova_pfn) - free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size)); - dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n", - size, (unsigned long long)paddr, dir); - return DMA_MAPPING_ERROR; -} - -static dma_addr_t intel_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - return __intel_map_single(dev, page_to_phys(page) + offset, - size, dir, *dev->dma_mask); -} - -static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask); -} - -static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size) -{ - struct dmar_domain *domain; - unsigned long start_pfn, last_pfn; - unsigned long nrpages; - unsigned long iova_pfn; - struct intel_iommu *iommu; - struct page *freelist; - struct pci_dev *pdev = NULL; - - domain = find_domain(dev); - BUG_ON(!domain); - - iommu = domain_get_iommu(domain); - - iova_pfn = IOVA_PFN(dev_addr); - - nrpages = aligned_nrpages(dev_addr, size); - start_pfn = mm_to_dma_pfn(iova_pfn); - last_pfn = start_pfn + nrpages - 1; - - if (dev_is_pci(dev)) - pdev = to_pci_dev(dev); - - freelist = domain_unmap(domain, start_pfn, last_pfn, NULL); - if (intel_iommu_strict || (pdev && pdev->untrusted) || - !has_iova_flush_queue(&domain->iovad)) { - iommu_flush_iotlb_psi(iommu, domain, start_pfn, - nrpages, !freelist, 0); - /* free iova */ - free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages)); - dma_free_pagelist(freelist); - } else { - queue_iova(&domain->iovad, iova_pfn, nrpages, - (unsigned long)freelist); - /* - * queue up the release of the unmap to save the 1/6th of the - * cpu used up by the iotlb flush operation... 
- */ - } - - trace_unmap_single(dev, dev_addr, size); -} - -static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - intel_unmap(dev, dev_addr, size); -} - -static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - intel_unmap(dev, dev_addr, size); -} - -static void *intel_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flags, - unsigned long attrs) -{ - struct page *page = NULL; - int order; - - if (unlikely(attach_deferred(dev))) - do_deferred_attach(dev); - - size = PAGE_ALIGN(size); - order = get_order(size); - - if (gfpflags_allow_blocking(flags)) { - unsigned int count = size >> PAGE_SHIFT; - - page = dma_alloc_from_contiguous(dev, count, order, - flags & __GFP_NOWARN); - } - - if (!page) - page = alloc_pages(flags, order); - if (!page) - return NULL; - memset(page_address(page), 0, size); - - *dma_handle = __intel_map_single(dev, page_to_phys(page), size, - DMA_BIDIRECTIONAL, - dev->coherent_dma_mask); - if (*dma_handle != DMA_MAPPING_ERROR) - return page_address(page); - if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) - __free_pages(page, order); - - return NULL; -} - -static void intel_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, unsigned long attrs) -{ - int order; - struct page *page = virt_to_page(vaddr); - - size = PAGE_ALIGN(size); - order = get_order(size); - - intel_unmap(dev, dma_handle, size); - if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) - __free_pages(page, order); -} - -static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction dir, - unsigned long attrs) -{ - dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK; - unsigned long nrpages = 0; - struct scatterlist *sg; - int i; - - for_each_sg(sglist, sg, nelems, i) { - nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg)); - } - - intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT); - - trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT); -} - -static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, - enum dma_data_direction dir, unsigned long attrs) -{ - int i; - struct dmar_domain *domain; - size_t size = 0; - int prot = 0; - unsigned long iova_pfn; - int ret; - struct scatterlist *sg; - unsigned long start_vpfn; - struct intel_iommu *iommu; - - BUG_ON(dir == DMA_NONE); - - if (unlikely(attach_deferred(dev))) - do_deferred_attach(dev); - - domain = find_domain(dev); - if (!domain) - return 0; - - iommu = domain_get_iommu(domain); - - for_each_sg(sglist, sg, nelems, i) - size += aligned_nrpages(sg->offset, sg->length); - - iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), - *dev->dma_mask); - if (!iova_pfn) { - sglist->dma_length = 0; - return 0; - } - - /* - * Check if DMAR supports zero-length reads on write only - * mappings.. 
- */ - if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \ - !cap_zlr(iommu->cap)) - prot |= DMA_PTE_READ; - if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) - prot |= DMA_PTE_WRITE; - - start_vpfn = mm_to_dma_pfn(iova_pfn); - - ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot); - if (unlikely(ret)) { - dma_pte_free_pagetable(domain, start_vpfn, - start_vpfn + size - 1, - agaw_to_level(domain->agaw) + 1); - free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size)); - return 0; - } - - for_each_sg(sglist, sg, nelems, i) - trace_map_sg(dev, i + 1, nelems, sg); - - return nelems; -} - -static u64 intel_get_required_mask(struct device *dev) -{ - return DMA_BIT_MASK(32); -} - -static const struct dma_map_ops intel_dma_ops = { - .alloc = intel_alloc_coherent, - .free = intel_free_coherent, - .map_sg = intel_map_sg, - .unmap_sg = intel_unmap_sg, - .map_page = intel_map_page, - .unmap_page = intel_unmap_page, - .map_resource = intel_map_resource, - .unmap_resource = intel_unmap_resource, - .dma_supported = dma_direct_supported, - .mmap = dma_common_mmap, - .get_sgtable = dma_common_get_sgtable, - .get_required_mask = intel_get_required_mask, -}; - -static void -bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size, - enum dma_data_direction dir, enum dma_sync_target target) -{ - struct dmar_domain *domain; - phys_addr_t tlb_addr; - - domain = find_domain(dev); - if (WARN_ON(!domain)) - return; - - tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr); - if (is_swiotlb_buffer(tlb_addr)) - swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target); -} - -static dma_addr_t -bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size, - enum dma_data_direction dir, unsigned long attrs, - u64 dma_mask) -{ - size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE); - struct dmar_domain *domain; - struct intel_iommu *iommu; - unsigned long iova_pfn; - unsigned long nrpages; - phys_addr_t tlb_addr; - int prot = 0; - int ret; - - if (unlikely(attach_deferred(dev))) - do_deferred_attach(dev); - - domain = find_domain(dev); - - if (WARN_ON(dir == DMA_NONE || !domain)) - return DMA_MAPPING_ERROR; - - iommu = domain_get_iommu(domain); - if (WARN_ON(!iommu)) - return DMA_MAPPING_ERROR; - - nrpages = aligned_nrpages(0, size); - iova_pfn = intel_alloc_iova(dev, domain, - dma_to_mm_pfn(nrpages), dma_mask); - if (!iova_pfn) - return DMA_MAPPING_ERROR; - - /* - * Check if DMAR supports zero-length reads on write only - * mappings.. - */ - if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || - !cap_zlr(iommu->cap)) - prot |= DMA_PTE_READ; - if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) - prot |= DMA_PTE_WRITE; - - /* - * If both the physical buffer start address and size are - * page aligned, we don't need to use a bounce page. - */ - if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) { - tlb_addr = swiotlb_tbl_map_single(dev, paddr, size, - aligned_size, dir, attrs); - if (tlb_addr == DMA_MAPPING_ERROR) { - goto swiotlb_error; - } else { - /* Cleanup the padding area. 
*/ - void *padding_start = phys_to_virt(tlb_addr); - size_t padding_size = aligned_size; - - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_TO_DEVICE || - dir == DMA_BIDIRECTIONAL)) { - padding_start += size; - padding_size -= size; - } - - memset(padding_start, 0, padding_size); - } - } else { - tlb_addr = paddr; - } - - ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn), - tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot); - if (ret) - goto mapping_error; - - trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size); - - return (phys_addr_t)iova_pfn << PAGE_SHIFT; - -mapping_error: - if (is_swiotlb_buffer(tlb_addr)) - swiotlb_tbl_unmap_single(dev, tlb_addr, size, - aligned_size, dir, attrs); -swiotlb_error: - free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages)); - dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n", - size, (unsigned long long)paddr, dir); - - return DMA_MAPPING_ERROR; -} - -static void -bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE); - struct dmar_domain *domain; - phys_addr_t tlb_addr; - - domain = find_domain(dev); - if (WARN_ON(!domain)) - return; - - tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr); - if (WARN_ON(!tlb_addr)) - return; - - intel_unmap(dev, dev_addr, size); - if (is_swiotlb_buffer(tlb_addr)) - swiotlb_tbl_unmap_single(dev, tlb_addr, size, - aligned_size, dir, attrs); - - trace_bounce_unmap_single(dev, dev_addr, size); -} - -static dma_addr_t -bounce_map_page(struct device *dev, struct page *page, unsigned long offset, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - return bounce_map_single(dev, page_to_phys(page) + offset, - size, dir, attrs, *dev->dma_mask); -} - -static dma_addr_t -bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - return bounce_map_single(dev, phys_addr, size, - dir, attrs, *dev->dma_mask); -} - -static void -bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - bounce_unmap_single(dev, dev_addr, size, dir, attrs); -} - -static void -bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - bounce_unmap_single(dev, dev_addr, size, dir, attrs); -} - -static void -bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, - enum dma_data_direction dir, unsigned long attrs) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sglist, sg, nelems, i) - bounce_unmap_page(dev, sg->dma_address, - sg_dma_len(sg), dir, attrs); -} - -static int -bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, - enum dma_data_direction dir, unsigned long attrs) -{ - int i; - struct scatterlist *sg; - - for_each_sg(sglist, sg, nelems, i) { - sg->dma_address = bounce_map_page(dev, sg_page(sg), - sg->offset, sg->length, - dir, attrs); - if (sg->dma_address == DMA_MAPPING_ERROR) - goto out_unmap; - sg_dma_len(sg) = sg->length; - } - - for_each_sg(sglist, sg, nelems, i) - trace_bounce_map_sg(dev, i + 1, nelems, sg); - - return nelems; - -out_unmap: - bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return 0; -} - -static void -bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir) -{ - bounce_sync_single(dev, addr, 
size, dir, SYNC_FOR_CPU); -} - -static void -bounce_sync_single_for_device(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir) -{ - bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE); -} - -static void -bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction dir) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sglist, sg, nelems, i) - bounce_sync_single(dev, sg_dma_address(sg), - sg_dma_len(sg), dir, SYNC_FOR_CPU); -} - -static void -bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction dir) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sglist, sg, nelems, i) - bounce_sync_single(dev, sg_dma_address(sg), - sg_dma_len(sg), dir, SYNC_FOR_DEVICE); -} - -static const struct dma_map_ops bounce_dma_ops = { - .alloc = intel_alloc_coherent, - .free = intel_free_coherent, - .map_sg = bounce_map_sg, - .unmap_sg = bounce_unmap_sg, - .map_page = bounce_map_page, - .unmap_page = bounce_unmap_page, - .sync_single_for_cpu = bounce_sync_single_for_cpu, - .sync_single_for_device = bounce_sync_single_for_device, - .sync_sg_for_cpu = bounce_sync_sg_for_cpu, - .sync_sg_for_device = bounce_sync_sg_for_device, - .map_resource = bounce_map_resource, - .unmap_resource = bounce_unmap_resource, - .dma_supported = dma_direct_supported, -}; - static inline int iommu_domain_cache_init(void) { int ret = 0; @@ -4041,6 +3393,27 @@ static inline int iommu_domain_cache_init(void) return ret; } +static int +intel_iommu_domain_get_attr(struct iommu_domain *domain, + enum iommu_attr attr, void *data) +{ + switch (domain->type) { + case IOMMU_DOMAIN_UNMANAGED: + return -ENODEV; + case IOMMU_DOMAIN_DMA: + switch (attr) { + case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: + *(int *)data = !intel_iommu_strict; + return 0; + default: + return -ENODEV; + } + break; + default: + return -EINVAL; + } +} + static inline int iommu_devinfo_cache_init(void) { int ret = 0; @@ -4691,7 +4064,7 @@ static void free_all_cpu_cached_iovas(unsigned int cpu) if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA) continue; - free_cpu_cached_iovas(cpu, &domain->iovad); + iommu_dma_free_cpu_cached_iovas(cpu, &domain->domain); } } } @@ -4964,12 +4337,6 @@ int __init intel_iommu_init(void) if (list_empty(&dmar_atsr_units)) pr_info("No ATSR found\n"); - if (dmar_init_reserved_ranges()) { - if (force_on) - panic("tboot: Failed to reserve iommu ranges\n"); - goto out_free_reserved_range; - } - if (dmar_map_gfx) intel_iommu_gfx_mapped = 1; @@ -4980,7 +4347,7 @@ int __init intel_iommu_init(void) if (force_on) panic("tboot: Failed to initialize DMARs\n"); pr_err("Initialization failed\n"); - goto out_free_reserved_range; + goto out_free_dmar; } up_write(&dmar_global_lock); @@ -5021,8 +4388,6 @@ int __init intel_iommu_init(void) return 0; -out_free_reserved_range: - put_iova_domain(&reserved_iova_list); out_free_dmar: intel_iommu_free_dmars(); up_write(&dmar_global_lock); @@ -5120,17 +4485,6 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) return 0; } -static void intel_init_iova_domain(struct dmar_domain *dmar_domain) -{ - init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN); - copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad); - - if (!intel_iommu_strict && - init_iova_flush_queue(&dmar_domain->iovad, - iommu_flush_iova, iova_entry_free)) - pr_info("iova flush queue initialization failed\n"); -} - static struct iommu_domain *intel_iommu_domain_alloc(unsigned 
type) { struct dmar_domain *dmar_domain; @@ -5151,8 +4505,9 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) return NULL; } - if (type == IOMMU_DOMAIN_DMA) - intel_init_iova_domain(dmar_domain); + if (type == IOMMU_DOMAIN_DMA && + iommu_get_dma_cookie(&dmar_domain->domain)) + return NULL; domain = &dmar_domain->domain; domain->geometry.aperture_start = 0; @@ -5781,13 +5136,13 @@ static void intel_iommu_release_device(struct device *dev) static void intel_iommu_probe_finalize(struct device *dev) { - struct iommu_domain *domain; + dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT; + struct iommu_domain *domain = iommu_get_domain_for_dev(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); - domain = iommu_get_domain_for_dev(dev); - if (device_needs_bounce(dev)) - set_dma_ops(dev, &bounce_dma_ops); - else if (domain && domain->type == IOMMU_DOMAIN_DMA) - set_dma_ops(dev, &intel_dma_ops); + if (domain && domain->type == IOMMU_DOMAIN_DMA) + iommu_setup_dma_ops(dev, base, + __DOMAIN_MAX_ADDR(dmar_domain->gaw) - base); else set_dma_ops(dev, NULL); } @@ -5900,19 +5255,6 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) return ret; } -static void intel_iommu_apply_resv_region(struct device *dev, - struct iommu_domain *domain, - struct iommu_resv_region *region) -{ - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - unsigned long start, end; - - start = IOVA_PFN(region->start); - end = IOVA_PFN(region->start + region->length - 1); - - WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end)); -} - static struct iommu_group *intel_iommu_device_group(struct device *dev) { if (dev_is_pci(dev)) @@ -6122,6 +5464,7 @@ const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .domain_alloc = intel_iommu_domain_alloc, .domain_free = intel_iommu_domain_free, + .domain_get_attr = intel_iommu_domain_get_attr, .domain_set_attr = intel_iommu_domain_set_attr, .attach_dev = intel_iommu_attach_device, .detach_dev = intel_iommu_detach_device, @@ -6130,6 +5473,7 @@ const struct iommu_ops intel_iommu_ops = { .aux_get_pasid = intel_iommu_aux_get_pasid, .map = intel_iommu_map, .unmap = intel_iommu_unmap, + .flush_iotlb_all = intel_flush_iotlb_all, .iotlb_sync = intel_iommu_tlb_sync, .iova_to_phys = intel_iommu_iova_to_phys, .probe_device = intel_iommu_probe_device, @@ -6137,7 +5481,6 @@ const struct iommu_ops intel_iommu_ops = { .release_device = intel_iommu_release_device, .get_resv_regions = intel_iommu_get_resv_regions, .put_resv_regions = generic_iommu_put_resv_regions, - .apply_resv_region = intel_iommu_apply_resv_region, .device_group = intel_iommu_device_group, .dev_has_feat = intel_iommu_dev_has_feat, .dev_feat_enabled = intel_iommu_dev_feat_enabled, -- Gitee From 26aab7ccf164f3b576916e157a8e77fada26d70c Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 25 Mar 2021 20:29:59 +0800 Subject: [PATCH 0392/2161] iommu/vt-d: Remove IOVA domain rcache flushing for CPU offlining commit 363f266eeff6e22a09483dc922dccd7cd0b9fe9c upstream. Now that the core code handles flushing per-IOVA domain CPU rcaches, remove the handling here. 
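The core-side counterpart lives in the IOVA code rather than in this driver patch: roughly, the IOVA core now registers its own per-domain CPU-dead hotplug callback, so individual IOMMU drivers no longer need one. The sketch below approximates that mechanism; the hotplug state name, the cpuhp_dead member and the registration call are assumptions, not code quoted from this series.

/* Assumed sketch of the core IOVA CPU-dead hook; identifiers are illustrative. */
static int iova_cpuhp_dead(unsigned int cpu, struct hlist_node *node)
{
        struct iova_domain *iovad;

        iovad = hlist_entry_safe(node, struct iova_domain, cpuhp_dead);
        free_cpu_cached_iovas(cpu, iovad);      /* same per-CPU rcache flush as before */
        return 0;
}

/*
 * Registered once per iova_domain, e.g. from init_iova_domain():
 *      cpuhp_state_add_instance_nocalls(CPUHP_IOMMU_IOVA_DEAD,
 *                                       &iovad->cpuhp_dead);
 */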
Reviewed-by: Lu Baolu Signed-off-by: John Garry Link: https://lore.kernel.org/r/1616675401-151997-3-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 31 ------------------------------- include/linux/cpuhotplug.h | 1 - 2 files changed, 32 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 658803127add..09d1f171af27 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4046,35 +4046,6 @@ static struct notifier_block intel_iommu_memory_nb = { .priority = 0 }; -static void free_all_cpu_cached_iovas(unsigned int cpu) -{ - int i; - - for (i = 0; i < g_num_of_iommus; i++) { - struct intel_iommu *iommu = g_iommus[i]; - struct dmar_domain *domain; - int did; - - if (!iommu) - continue; - - for (did = 0; did < cap_ndoms(iommu->cap); did++) { - domain = get_iommu_domain(iommu, (u16)did); - - if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA) - continue; - - iommu_dma_free_cpu_cached_iovas(cpu, &domain->domain); - } - } -} - -static int intel_iommu_cpu_dead(unsigned int cpu) -{ - free_all_cpu_cached_iovas(cpu); - return 0; -} - static void intel_disable_iommus(void) { struct intel_iommu *iommu = NULL; @@ -4366,8 +4337,6 @@ int __init intel_iommu_init(void) bus_set_iommu(&pci_bus_type, &intel_iommu_ops); if (si_domain && !hw_pass_through) register_memory_notifier(&intel_iommu_memory_nb); - cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL, - intel_iommu_cpu_dead); down_read(&dmar_global_lock); if (probe_acpi_namespace_devices()) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 2d55cee638fc..436d12ef291d 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -56,7 +56,6 @@ enum cpuhp_state { CPUHP_PAGE_ALLOC_DEAD, CPUHP_NET_DEV_DEAD, CPUHP_PCI_XGENE_DEAD, - CPUHP_IOMMU_INTEL_DEAD, CPUHP_LUSTRE_CFS_DEAD, CPUHP_AP_ARM_CACHE_B15_RAC_DEAD, CPUHP_PADATA_DEAD, -- Gitee From e272624927893d545a9de8e9931fdb99372f5270 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 26 Nov 2021 17:43:46 +0800 Subject: [PATCH 0393/2161] iommu/vt-d: Cleanup after converting to dma-iommu ops commit 58a8bb39490db31acbe8f4e24593a88533b4d947 upstream. Some cleanups after converting the driver to use dma-iommu ops. - Remove nobounce option; - Cleanup and simplify the path in domain mapping. Signed-off-by: Lu Baolu Tested-by: Logan Gunthorpe Link: https://lore.kernel.org/r/20201124082057.2614359-8-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../admin-guide/kernel-parameters.txt | 5 -- drivers/iommu/intel/iommu.c | 90 ++++++------------- 2 files changed, 28 insertions(+), 67 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 47eb94d8d011..c781bad594be 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1746,11 +1746,6 @@ Note that using this option lowers the security provided by tboot because it makes the system vulnerable to DMA attacks. - nobounce [Default off] - Disable bounce buffer for unstrusted devices such as - the Thunderbolt devices. This will treat the untrusted - devices as the trusted ones, hence might expose security - risks of DMA attacks. intel_idle.max_cstate= [KNL,HW,ACPI,X86] 0 disables intel_idle and fall back on acpi_idle. 
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 09d1f171af27..3eb50f98466a 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -355,7 +355,6 @@ static int dmar_forcedac; static int intel_iommu_strict; static int intel_iommu_superpage = 1; static int iommu_identity_mapping; -static int intel_no_bounce; static int iommu_skip_te_disable; #define IDENTMAP_GFX 2 @@ -457,9 +456,6 @@ static int __init intel_iommu_setup(char *str) } else if (!strncmp(str, "tboot_noforce", 13)) { pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); intel_iommu_tboot_noforce = 1; - } else if (!strncmp(str, "nobounce", 8)) { - pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n"); - intel_no_bounce = 1; } str += strcspn(str, ","); @@ -2277,15 +2273,14 @@ static inline int hardware_largepage_caps(struct dmar_domain *domain, return level; } -static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, - struct scatterlist *sg, unsigned long phys_pfn, - unsigned long nr_pages, int prot) +static int +__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, + unsigned long phys_pfn, unsigned long nr_pages, int prot) { struct dma_pte *first_pte = NULL, *pte = NULL; - phys_addr_t pteval; - unsigned long sg_res = 0; unsigned int largepage_lvl = 0; unsigned long lvl_pages = 0; + phys_addr_t pteval; u64 attr; BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); @@ -2297,26 +2292,14 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, if (domain_use_first_level(domain)) attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US; - if (!sg) { - sg_res = nr_pages; - pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; - } + pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; while (nr_pages > 0) { uint64_t tmp; - if (!sg_res) { - unsigned int pgoff = sg->offset & ~PAGE_MASK; - - sg_res = aligned_nrpages(sg->offset, sg->length); - sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff; - sg->dma_length = sg->length; - pteval = (sg_phys(sg) - pgoff) | attr; - phys_pfn = pteval >> VTD_PAGE_SHIFT; - } - if (!pte) { - largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res); + largepage_lvl = hardware_largepage_caps(domain, iov_pfn, + phys_pfn, nr_pages); first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); if (!pte) @@ -2328,7 +2311,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, pteval |= DMA_PTE_LARGE_PAGE; lvl_pages = lvl_to_nr_pages(largepage_lvl); - nr_superpages = sg_res / lvl_pages; + nr_superpages = nr_pages / lvl_pages; end_pfn = iov_pfn + nr_superpages * lvl_pages - 1; /* @@ -2362,48 +2345,45 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, lvl_pages = lvl_to_nr_pages(largepage_lvl); BUG_ON(nr_pages < lvl_pages); - BUG_ON(sg_res < lvl_pages); nr_pages -= lvl_pages; iov_pfn += lvl_pages; phys_pfn += lvl_pages; pteval += lvl_pages * VTD_PAGE_SIZE; - sg_res -= lvl_pages; /* If the next PTE would be the first in a new page, then we - need to flush the cache on the entries we've just written. - And then we'll need to recalculate 'pte', so clear it and - let it get set again in the if (!pte) block above. - - If we're done (!nr_pages) we need to flush the cache too. 
- - Also if we've been setting superpages, we may need to - recalculate 'pte' and switch back to smaller pages for the - end of the mapping, if the trailing size is not enough to - use another superpage (i.e. sg_res < lvl_pages). */ + * need to flush the cache on the entries we've just written. + * And then we'll need to recalculate 'pte', so clear it and + * let it get set again in the if (!pte) block above. + * + * If we're done (!nr_pages) we need to flush the cache too. + * + * Also if we've been setting superpages, we may need to + * recalculate 'pte' and switch back to smaller pages for the + * end of the mapping, if the trailing size is not enough to + * use another superpage (i.e. nr_pages < lvl_pages). + */ pte++; if (!nr_pages || first_pte_in_page(pte) || - (largepage_lvl > 1 && sg_res < lvl_pages)) { + (largepage_lvl > 1 && nr_pages < lvl_pages)) { domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte); pte = NULL; } - - if (!sg_res && nr_pages) - sg = sg_next(sg); } + return 0; } -static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, - struct scatterlist *sg, unsigned long phys_pfn, - unsigned long nr_pages, int prot) +static int +domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, + unsigned long phys_pfn, unsigned long nr_pages, int prot) { int iommu_id, ret; struct intel_iommu *iommu; /* Do the real mapping first */ - ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot); + ret = __domain_mapping(domain, iov_pfn, phys_pfn, nr_pages, prot); if (ret) return ret; @@ -2415,20 +2395,6 @@ static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, return 0; } -static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn, - struct scatterlist *sg, unsigned long nr_pages, - int prot) -{ - return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot); -} - -static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn, - unsigned long phys_pfn, unsigned long nr_pages, - int prot) -{ - return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot); -} - static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn) { unsigned long flags; @@ -2688,7 +2654,7 @@ static int iommu_domain_identity_map(struct dmar_domain *domain, */ dma_pte_clear_range(domain, first_vpfn, last_vpfn); - return __domain_mapping(domain, first_vpfn, NULL, + return __domain_mapping(domain, first_vpfn, first_vpfn, last_vpfn - first_vpfn + 1, DMA_PTE_READ|DMA_PTE_WRITE); } @@ -4940,8 +4906,8 @@ static int intel_iommu_map(struct iommu_domain *domain, /* Round up size to next multiple of PAGE_SIZE, if it and the low bits of hpa would take us onto the next page */ size = aligned_nrpages(hpa, size); - ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, - hpa >> VTD_PAGE_SHIFT, size, prot); + ret = domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, + hpa >> VTD_PAGE_SHIFT, size, prot); return ret; } -- Gitee From 3a3ce1319c53c181dd6b38979ee50cebe1ffb264 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 24 Nov 2020 21:06:01 +0800 Subject: [PATCH 0394/2161] iommu: Move def_domain type check for untrusted device into core commit 28b41e2c6aebd3caf99a77a76843c0175876bc72 upstream. So that the vendor iommu drivers are no more required to provide the def_domain_type callback to always isolate the untrusted devices. 
Signed-off-by: Lu Baolu Cc: Shameerali Kolothum Thodi Link: https://lore.kernel.org/linux-iommu/243ce89c33fe4b9da4c56ba35acebf81@huawei.com/ Link: https://lore.kernel.org/r/20201124130604.2912899-2-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 7 ------- drivers/iommu/iommu.c | 16 +++++++--------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 3eb50f98466a..8745f512cf00 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2831,13 +2831,6 @@ static int device_def_domain_type(struct device *dev) if (dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(dev); - /* - * Prevent any device marked as untrusted from getting - * placed into the statically identity mapping domain. - */ - if (pdev->untrusted) - return IOMMU_DOMAIN_DMA; - if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) return IOMMU_DOMAIN_IDENTITY; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index e2afcba94351..baa1c95c6474 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1462,12 +1462,14 @@ EXPORT_SYMBOL_GPL(fsl_mc_device_group); static int iommu_get_def_domain_type(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; - unsigned int type = 0; + + if (dev_is_pci(dev) && to_pci_dev(dev)->untrusted) + return IOMMU_DOMAIN_DMA; if (ops->def_domain_type) - type = ops->def_domain_type(dev); + return ops->def_domain_type(dev); - return (type == 0) ? iommu_def_domain_type : type; + return 0; } static int iommu_group_alloc_default_domain(struct bus_type *bus, @@ -1517,7 +1519,7 @@ static int iommu_alloc_default_domain(struct iommu_group *group, type = iommu_def_domain_type; } #else - type = iommu_get_def_domain_type(dev); + type = iommu_get_def_domain_type(dev) ? : iommu_def_domain_type; #endif return iommu_group_alloc_default_domain(dev->bus, group, type); @@ -1656,12 +1658,8 @@ struct __group_domain_type { static int probe_get_default_domain_type(struct device *dev, void *data) { - const struct iommu_ops *ops = dev->bus->iommu_ops; struct __group_domain_type *gtype = data; - unsigned int type = 0; - - if (ops->def_domain_type) - type = ops->def_domain_type(dev); + unsigned int type = iommu_get_def_domain_type(dev); if (type) { if (gtype->type && gtype->type != type) { -- Gitee From b53c78d114d0758b73d2512c462f507b5fad1a4e Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 26 Nov 2020 11:13:51 +0000 Subject: [PATCH 0395/2161] iommu/vt-d: Don't read VCCAP register unless it exists commit d76b42e92780c3587c1a998a3a943b501c137553 upstream. My virtual IOMMU implementation is whining that the guest is reading a register that doesn't exist. Only read the VCCAP_REG if the corresponding capability is set in ECAP_REG to indicate that it actually exists. 
Fixes: 3375303e8287 ("iommu/vt-d: Add custom allocator for IOASID") Signed-off-by: David Woodhouse Reviewed-by: Liu Yi L Cc: stable@vger.kernel.org # v5.7+ Acked-by: Lu Baolu Link: https://lore.kernel.org/r/de32b150ffaa752e0cff8571b17dfb1213fbe71c.camel@infradead.org Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 3 ++- drivers/iommu/intel/iommu.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 11319e4dce4a..b46dbfa6d0ed 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -986,7 +986,8 @@ static int map_iommu(struct intel_iommu *iommu, u64 phys_addr) warn_invalid_dmar(phys_addr, " returns all ones"); goto unmap; } - iommu->vccap = dmar_readq(iommu->reg + DMAR_VCCAP_REG); + if (ecap_vcs(iommu->ecap)) + iommu->vccap = dmar_readq(iommu->reg + DMAR_VCCAP_REG); /* the registers might be more than one page */ map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap), diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 8745f512cf00..40f50e5eec62 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1829,7 +1829,7 @@ static void free_dmar_iommu(struct intel_iommu *iommu) if (ecap_prs(iommu->ecap)) intel_svm_finish_prq(iommu); } - if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap)) + if (vccap_pasid(iommu->vccap)) ioasid_unregister_allocator(&iommu->pasid_allocator); #endif @@ -3120,7 +3120,7 @@ static void register_pasid_allocator(struct intel_iommu *iommu) * is active. All vIOMMU allocators will eventually be calling the same * host allocator. */ - if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap)) + if (!vccap_pasid(iommu->vccap)) return; pr_info("Register custom PASID allocator\n"); -- Gitee From 100059180a16205b057eab13fba811c832760221 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 27 Nov 2020 09:33:08 +0800 Subject: [PATCH 0396/2161] iommu/vt-d: Remove set but not used variable commit 405a43cc00471d9f06c58704448f1a43e331826a upstream. Fixes gcc '-Wunused-but-set-variable' warning: drivers/iommu/intel/iommu.c:5643:27: warning: variable 'last_pfn' set but not used [-Wunused-but-set-variable] 5643 | unsigned long start_pfn, last_pfn; | ^~~~~~~~ This variable is never used, so remove it. 
Fixes: 2a2b8eaa5b25 ("iommu: Handle freelists when using deferred flushing in iommu drivers") Reported-by: kernel test robot Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20201127013308.1833610-1-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 40f50e5eec62..7d853712966d 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4939,13 +4939,12 @@ static void intel_iommu_tlb_sync(struct iommu_domain *domain, struct dmar_domain *dmar_domain = to_dmar_domain(domain); unsigned long iova_pfn = IOVA_PFN(gather->start); size_t size = gather->end - gather->start; - unsigned long start_pfn, last_pfn; + unsigned long start_pfn; unsigned long nrpages; int iommu_id; nrpages = aligned_nrpages(gather->start, size); start_pfn = mm_to_dma_pfn(iova_pfn); - last_pfn = start_pfn + nrpages - 1; for_each_domain_iommu(iommu_id, dmar_domain) iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, -- Gitee From 8320fec0766b8a92413ae224dd73daa81fbbe4d7 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 1 Dec 2020 09:31:49 +0800 Subject: [PATCH 0397/2161] iommu/vt-d: Avoid GFP_ATOMIC where it is not needed commit 33e07157105e472b746b70b3ed4197c57c43ab68 upstream. There is no reason to use GFP_ATOMIC in a 'suspend' function. Use GFP_KERNEL instead to give more opportunities to allocate the requested memory. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/20201030182630.5154-1-christophe.jaillet@wanadoo.fr Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20201201013149.2466272-2-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 7d853712966d..4c322245a442 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3511,7 +3511,7 @@ static int iommu_suspend(void) for_each_active_iommu(iommu, drhd) { iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), - GFP_ATOMIC); + GFP_KERNEL); if (!iommu->iommu_state) goto nomem; } -- Gitee From 9611ff2bb877e8e3e403978b6d67a45ff8e17f1d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 17 Nov 2020 18:25:34 +0800 Subject: [PATCH 0398/2161] iommu: avoid taking iova_rbtree_lock twice commit 3a651b3a27a1ee35879499ead3942dc854a20968 upstream. Both find_iova() and __free_iova() take iova_rbtree_lock, there is no reason to take and release it twice inside free_iova(). Fold them into one critical section by calling the unlock versions instead. 
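As a sketch, condensed from the iova.c hunk below, the lookup and the release are folded into one critical section by switching to the helpers that expect the caller to already hold the lock:

  void free_iova(struct iova_domain *iovad, unsigned long pfn)
  {
          unsigned long flags;
          struct iova *iova;

          /* One lock/unlock pair instead of two: private_find_iova()
           * and private_free_iova() both run under iova_rbtree_lock
           * held by the caller.
           */
          spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
          iova = private_find_iova(iovad, pfn);
          if (iova)
                  private_free_iova(iovad, iova);
          spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
  }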
Signed-off-by: Cong Wang Reviewed-by: Robin Murphy Signed-off-by: John Garry Link: https://lore.kernel.org/r/1605608734-84416-5-git-send-email-john.garry@huawei.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iova.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index ea04a88c673d..ff59d8aa3041 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -402,10 +402,14 @@ EXPORT_SYMBOL_GPL(__free_iova); void free_iova(struct iova_domain *iovad, unsigned long pfn) { - struct iova *iova = find_iova(iovad, pfn); + unsigned long flags; + struct iova *iova; + spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); + iova = private_find_iova(iovad, pfn); if (iova) - __free_iova(iovad, iova); + private_free_iova(iovad, iova); + spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); } EXPORT_SYMBOL_GPL(free_iova); -- Gitee From 8bbd8cae32444ee65d0148747977dc6480ae8e2f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 31 Dec 2020 08:53:19 +0800 Subject: [PATCH 0399/2161] iommu/vt-d: Fix misuse of ALIGN in qi_flush_piotlb() commit 1efd17e7acb6692bffc6c58718f41f27fdfd62f5 upstream. Use IS_ALIGNED() instead. Otherwise, an unaligned address will be ignored. Fixes: 33cd6e642d6a ("iommu/vt-d: Flush PASID-based iotlb for iova over first level") Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20201231005323.2178523-1-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index b46dbfa6d0ed..004feaed3c72 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1461,8 +1461,8 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, int mask = ilog2(__roundup_pow_of_two(npages)); unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask)); - if (WARN_ON_ONCE(!ALIGN(addr, align))) - addr &= ~(align - 1); + if (WARN_ON_ONCE(!IS_ALIGNED(addr, align))) + addr = ALIGN_DOWN(addr, align); desc.qw0 = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) | -- Gitee From 10a0864c2d21bb162576570964af19f1efe6fb17 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 31 Dec 2020 08:53:23 +0800 Subject: [PATCH 0400/2161] iommu/vt-d: Fix lockdep splat in sva bind()/unbind() commit 420d42f6f9db27d88bc4f83e3e668fcdacbf7e29 upstream. Lock(&iommu->lock) without disabling irq causes lockdep warnings. ======================================================== WARNING: possible irq lock inversion dependency detected 5.11.0-rc1+ #828 Not tainted -------------------------------------------------------- kworker/0:1H/120 just changed the state of lock: ffffffffad9ea1b8 (device_domain_lock){..-.}-{2:2}, at: iommu_flush_dev_iotlb.part.0+0x32/0x120 but this lock took another, SOFTIRQ-unsafe lock in the past: (&iommu->lock){+.+.}-{2:2} and interrupts could create inverse lock ordering between them. 
other info that might help us debug this: Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&iommu->lock); local_irq_disable(); lock(device_domain_lock); lock(&iommu->lock); lock(device_domain_lock); *** DEADLOCK *** Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20201231005323.2178523-5-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index ae85522d3db1..331cc83058a2 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -299,6 +299,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, struct dmar_domain *dmar_domain; struct device_domain_info *info; struct intel_svm *svm = NULL; + unsigned long iflags; int ret = 0; if (WARN_ON(!iommu) || !data) @@ -399,12 +400,12 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, * each bind of a new device even with an existing PASID, we need to * call the nested mode setup function here. */ - spin_lock(&iommu->lock); + spin_lock_irqsave(&iommu->lock, iflags); ret = intel_pasid_setup_nested(iommu, dev, (pgd_t *)(uintptr_t)data->gpgd, data->hpasid, &data->vendor.vtd, dmar_domain, data->addr_width); - spin_unlock(&iommu->lock); + spin_unlock_irqrestore(&iommu->lock, iflags); if (ret) { dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n", data->hpasid, ret); @@ -504,6 +505,7 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, struct device_domain_info *info; struct intel_svm_dev *sdev; struct intel_svm *svm = NULL; + unsigned long iflags; int pasid_max; int ret; @@ -623,14 +625,14 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, } } - spin_lock(&iommu->lock); + spin_lock_irqsave(&iommu->lock, iflags); ret = intel_pasid_setup_first_level(iommu, dev, mm ? mm->pgd : init_mm.pgd, svm->pasid, FLPT_DEFAULT_DID, (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) | (cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0)); - spin_unlock(&iommu->lock); + spin_unlock_irqrestore(&iommu->lock, iflags); if (ret) { if (mm) mmu_notifier_unregister(&svm->notifier, mm); @@ -650,14 +652,14 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, * Binding a new device with existing PASID, need to setup * the PASID entry. */ - spin_lock(&iommu->lock); + spin_lock_irqsave(&iommu->lock, iflags); ret = intel_pasid_setup_first_level(iommu, dev, mm ? mm->pgd : init_mm.pgd, svm->pasid, FLPT_DEFAULT_DID, (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) | (cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0)); - spin_unlock(&iommu->lock); + spin_unlock_irqrestore(&iommu->lock, iflags); if (ret) { kfree(sdev); goto out; -- Gitee From 9dbc781a34fa9bcdd954905802d1d4478b6afa9b Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Thu, 7 Jan 2021 00:03:55 +0800 Subject: [PATCH 0401/2161] iommu/vt-d: Move intel_iommu info from struct intel_svm to struct intel_svm_dev commit 9ad9f45b3b91162b33abfe175ae75ab65718dbf5 upstream. 'struct intel_svm' is shared by all devices bound to a give process, but records only a single pointer to a 'struct intel_iommu'. Consequently, cache invalidations may only be applied to a single DMAR unit, and are erroneously skipped for the other devices. 
In preparation for fixing this, rework the structures so that the iommu pointer resides in 'struct intel_svm_dev', allowing 'struct intel_svm' to track them in its device list. Fixes: 1c4f88b7f1f9 ("iommu/vt-d: Shared virtual address in scalable mode") Cc: Lu Baolu Cc: Jacob Pan Cc: Raj Ashok Cc: David Woodhouse Reported-by: Guo Kaijie Reported-by: Xin Zeng Signed-off-by: Guo Kaijie Signed-off-by: Xin Zeng Signed-off-by: Liu Yi L Tested-by: Guo Kaijie Cc: stable@vger.kernel.org # v5.0+ Acked-by: Lu Baolu Link: https://lore.kernel.org/r/1609949037-25291-2-git-send-email-yi.l.liu@intel.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 9 +++++---- include/linux/intel-iommu.h | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 331cc83058a2..d9379b9d6637 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -117,7 +117,7 @@ static void __flush_svm_range_dev(struct intel_svm *svm, } desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(svm->iommu, &desc, 1, 0); + qi_submit_sync(sdev->iommu, &desc, 1, 0); if (sdev->dev_iotlb) { desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) | @@ -141,7 +141,7 @@ static void __flush_svm_range_dev(struct intel_svm *svm, } desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(svm->iommu, &desc, 1, 0); + qi_submit_sync(sdev->iommu, &desc, 1, 0); } } @@ -229,7 +229,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) */ rcu_read_lock(); list_for_each_entry_rcu(sdev, &svm->devs, list) - intel_pasid_tear_down_entry(svm->iommu, sdev->dev, + intel_pasid_tear_down_entry(sdev->iommu, sdev->dev, svm->pasid, true); rcu_read_unlock(); @@ -382,6 +382,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, } sdev->dev = dev; sdev->sid = PCI_DEVID(info->bus, info->devfn); + sdev->iommu = iommu; /* Only count users if device has aux domains */ if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)) @@ -566,6 +567,7 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, goto out; } sdev->dev = dev; + sdev->iommu = iommu; ret = intel_iommu_enable_pasid(iommu, dev); if (ret) { @@ -595,7 +597,6 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, kfree(sdev); goto out; } - svm->iommu = iommu; if (pasid_max > intel_pasid_max_id) pasid_max = intel_pasid_max_id; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index d956987ed032..94522685a0d9 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -758,6 +758,7 @@ struct intel_svm_dev { struct list_head list; struct rcu_head rcu; struct device *dev; + struct intel_iommu *iommu; struct svm_dev_ops *ops; struct iommu_sva sva; u32 pasid; @@ -771,7 +772,6 @@ struct intel_svm { struct mmu_notifier notifier; struct mm_struct *mm; - struct intel_iommu *iommu; unsigned int flags; u32 pasid; int gpasid; /* In case that guest PASID is different from host PASID */ -- Gitee From 9d2afb24f3766273c0c510fcf41dc828445bc21c Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Thu, 7 Jan 2021 00:03:56 +0800 Subject: [PATCH 0402/2161] iommu/vt-d: Fix general protection fault in aux_detach_device() commit 18abda7a2d555783d28ea1701f3ec95e96237a86 upstream. The aux-domain attach/detach are not tracked, some data structures might be used after free. 
This causes general protection faults when multiple subdevices are created and assigned to a same guest machine: | general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] SMP NOPTI | RIP: 0010:intel_iommu_aux_detach_device+0x12a/0x1f0 | [...] | Call Trace: | iommu_aux_detach_device+0x24/0x70 | vfio_mdev_detach_domain+0x3b/0x60 | ? vfio_mdev_set_domain+0x50/0x50 | iommu_group_for_each_dev+0x4f/0x80 | vfio_iommu_detach_group.isra.0+0x22/0x30 | vfio_iommu_type1_detach_group.cold+0x71/0x211 | ? find_exported_symbol_in_section+0x4a/0xd0 | ? each_symbol_section+0x28/0x50 | __vfio_group_unset_container+0x4d/0x150 | vfio_group_try_dissolve_container+0x25/0x30 | vfio_group_put_external_user+0x13/0x20 | kvm_vfio_group_put_external_user+0x27/0x40 [kvm] | kvm_vfio_destroy+0x45/0xb0 [kvm] | kvm_put_kvm+0x1bb/0x2e0 [kvm] | kvm_vm_release+0x22/0x30 [kvm] | __fput+0xcc/0x260 | ____fput+0xe/0x10 | task_work_run+0x8f/0xb0 | do_exit+0x358/0xaf0 | ? wake_up_state+0x10/0x20 | ? signal_wake_up_state+0x1a/0x30 | do_group_exit+0x47/0xb0 | __x64_sys_exit_group+0x18/0x20 | do_syscall_64+0x57/0x1d0 | entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fix the crash by tracking the subdevices when attaching and detaching aux-domains. Fixes: 67b8e02b5e76 ("iommu/vt-d: Aux-domain specific domain attach/detach") Co-developed-by: Xin Zeng Signed-off-by: Xin Zeng Signed-off-by: Liu Yi L Acked-by: Lu Baolu Link: https://lore.kernel.org/r/1609949037-25291-3-git-send-email-yi.l.liu@intel.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 95 +++++++++++++++++++++++++++---------- include/linux/intel-iommu.h | 16 +++++-- 2 files changed, 82 insertions(+), 29 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 4c322245a442..ee54186b60f5 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1877,6 +1877,7 @@ static struct dmar_domain *alloc_domain(int flags) domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; domain->has_iotlb_device = false; INIT_LIST_HEAD(&domain->devices); + INIT_LIST_HEAD(&domain->subdevices); return domain; } @@ -2547,7 +2548,7 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, info->iommu = iommu; info->pasid_table = NULL; info->auxd_enabled = 0; - INIT_LIST_HEAD(&info->auxiliary_domains); + INIT_LIST_HEAD(&info->subdevices); if (dev && dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(info->dev); @@ -4472,33 +4473,61 @@ is_aux_domain(struct device *dev, struct iommu_domain *domain) domain->type == IOMMU_DOMAIN_UNMANAGED; } -static void auxiliary_link_device(struct dmar_domain *domain, - struct device *dev) +static inline struct subdev_domain_info * +lookup_subdev_info(struct dmar_domain *domain, struct device *dev) +{ + struct subdev_domain_info *sinfo; + + if (!list_empty(&domain->subdevices)) { + list_for_each_entry(sinfo, &domain->subdevices, link_domain) { + if (sinfo->pdev == dev) + return sinfo; + } + } + + return NULL; +} + +static int auxiliary_link_device(struct dmar_domain *domain, + struct device *dev) { struct device_domain_info *info = get_domain_info(dev); + struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev); assert_spin_locked(&device_domain_lock); if (WARN_ON(!info)) - return; + return -EINVAL; + + if (!sinfo) { + sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC); + sinfo->domain = domain; + sinfo->pdev = dev; + list_add(&sinfo->link_phys, &info->subdevices); + list_add(&sinfo->link_domain, 
&domain->subdevices); + } - domain->auxd_refcnt++; - list_add(&domain->auxd, &info->auxiliary_domains); + return ++sinfo->users; } -static void auxiliary_unlink_device(struct dmar_domain *domain, - struct device *dev) +static int auxiliary_unlink_device(struct dmar_domain *domain, + struct device *dev) { struct device_domain_info *info = get_domain_info(dev); + struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev); + int ret; assert_spin_locked(&device_domain_lock); - if (WARN_ON(!info)) - return; + if (WARN_ON(!info || !sinfo || sinfo->users <= 0)) + return -EINVAL; - list_del(&domain->auxd); - domain->auxd_refcnt--; + ret = --sinfo->users; + if (!ret) { + list_del(&sinfo->link_phys); + list_del(&sinfo->link_domain); + kfree(sinfo); + } - if (!domain->auxd_refcnt && domain->default_pasid > 0) - ioasid_put(domain->default_pasid); + return ret; } static int aux_domain_add_dev(struct dmar_domain *domain, @@ -4527,6 +4556,19 @@ static int aux_domain_add_dev(struct dmar_domain *domain, } spin_lock_irqsave(&device_domain_lock, flags); + ret = auxiliary_link_device(domain, dev); + if (ret <= 0) + goto link_failed; + + /* + * Subdevices from the same physical device can be attached to the + * same domain. For such cases, only the first subdevice attachment + * needs to go through the full steps in this function. So if ret > + * 1, just goto out. + */ + if (ret > 1) + goto out; + /* * iommu->lock must be held to attach domain to iommu and setup the * pasid entry for second level translation. @@ -4545,10 +4587,9 @@ static int aux_domain_add_dev(struct dmar_domain *domain, domain->default_pasid); if (ret) goto table_failed; - spin_unlock(&iommu->lock); - - auxiliary_link_device(domain, dev); + spin_unlock(&iommu->lock); +out: spin_unlock_irqrestore(&device_domain_lock, flags); return 0; @@ -4557,8 +4598,10 @@ static int aux_domain_add_dev(struct dmar_domain *domain, domain_detach_iommu(domain, iommu); attach_failed: spin_unlock(&iommu->lock); + auxiliary_unlink_device(domain, dev); +link_failed: spin_unlock_irqrestore(&device_domain_lock, flags); - if (!domain->auxd_refcnt && domain->default_pasid > 0) + if (list_empty(&domain->subdevices) && domain->default_pasid > 0) ioasid_put(domain->default_pasid); return ret; @@ -4578,14 +4621,18 @@ static void aux_domain_remove_dev(struct dmar_domain *domain, info = get_domain_info(dev); iommu = info->iommu; - auxiliary_unlink_device(domain, dev); - - spin_lock(&iommu->lock); - intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false); - domain_detach_iommu(domain, iommu); - spin_unlock(&iommu->lock); + if (!auxiliary_unlink_device(domain, dev)) { + spin_lock(&iommu->lock); + intel_pasid_tear_down_entry(iommu, dev, + domain->default_pasid, false); + domain_detach_iommu(domain, iommu); + spin_unlock(&iommu->lock); + } spin_unlock_irqrestore(&device_domain_lock, flags); + + if (list_empty(&domain->subdevices) && domain->default_pasid > 0) + ioasid_put(domain->default_pasid); } static int prepare_domain_attach_device(struct iommu_domain *domain, diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 94522685a0d9..09c6a0bf3892 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -533,11 +533,10 @@ struct dmar_domain { /* Domain ids per IOMMU. 
Use u16 since * domain ids are 16 bit wide according * to VT-d spec, section 9.3 */ - unsigned int auxd_refcnt; /* Refcount of auxiliary attaching */ bool has_iotlb_device; struct list_head devices; /* all devices' list */ - struct list_head auxd; /* link to device's auxiliary list */ + struct list_head subdevices; /* all subdevices' list */ struct iova_domain iovad; /* iova's that belong to this domain */ struct dma_pte *pgd; /* virtual address */ @@ -610,14 +609,21 @@ struct intel_iommu { struct dmar_drhd_unit *drhd; }; +/* Per subdevice private data */ +struct subdev_domain_info { + struct list_head link_phys; /* link to phys device siblings */ + struct list_head link_domain; /* link to domain siblings */ + struct device *pdev; /* physical device derived from */ + struct dmar_domain *domain; /* aux-domain */ + int users; /* user count */ +}; + /* PCI domain-device relationship */ struct device_domain_info { struct list_head link; /* link to domain siblings */ struct list_head global; /* link to global list */ struct list_head table; /* link to pasid table */ - struct list_head auxiliary_domains; /* auxiliary domains - * attached to this device - */ + struct list_head subdevices; /* subdevices sibling */ u32 segment; /* PCI segment number */ u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ -- Gitee From 62625fa87812273efaabee5c45e4dabb4a7662eb Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Thu, 7 Jan 2021 00:03:57 +0800 Subject: [PATCH 0403/2161] iommu/vt-d: Fix ineffective devTLB invalidation for subdevices commit 7c29ada5e70083805bc3a68daa23441df421fbee upstream. iommu_flush_dev_iotlb() is called to invalidate caches on a device but only loops over the devices which are fully-attached to the domain. For sub-devices, this is ineffective and can result in invalid caching entries left on the device. Fix the missing invalidation by adding a loop over the subdevices and ensuring that 'domain->has_iotlb_device' is updated when attaching to subdevices. 
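Condensed from the iommu.c hunks below, the flush path now covers both attachment lists of the domain (illustrative sketch):

  static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
                                    u64 addr, unsigned mask)
  {
          unsigned long flags;
          struct device_domain_info *info;
          struct subdev_domain_info *sinfo;

          if (!domain->has_iotlb_device)
                  return;

          spin_lock_irqsave(&device_domain_lock, flags);

          /* Devices fully attached to the domain ... */
          list_for_each_entry(info, &domain->devices, link)
                  __iommu_flush_dev_iotlb(info, addr, mask);

          /* ... and subdevices attached through aux-domains. */
          list_for_each_entry(sinfo, &domain->subdevices, link_domain)
                  __iommu_flush_dev_iotlb(get_domain_info(sinfo->pdev),
                                          addr, mask);

          spin_unlock_irqrestore(&device_domain_lock, flags);
  }

domain_update_iotlb() is extended the same way, so has_iotlb_device stays accurate when only subdevices have ATS enabled.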
Fixes: 67b8e02b5e76 ("iommu/vt-d: Aux-domain specific domain attach/detach") Signed-off-by: Liu Yi L Acked-by: Lu Baolu Link: https://lore.kernel.org/r/1609949037-25291-4-git-send-email-yi.l.liu@intel.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 53 ++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index ee54186b60f5..3b2a47e07c06 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -719,6 +719,8 @@ static int domain_update_device_node(struct dmar_domain *domain) return nid; } +static void domain_update_iotlb(struct dmar_domain *domain); + /* Some capabilities may be different across iommus */ static void domain_update_iommu_cap(struct dmar_domain *domain) { @@ -744,6 +746,8 @@ static void domain_update_iommu_cap(struct dmar_domain *domain) domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); else domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); + + domain_update_iotlb(domain); } struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, @@ -1464,17 +1468,22 @@ static void domain_update_iotlb(struct dmar_domain *domain) assert_spin_locked(&device_domain_lock); - list_for_each_entry(info, &domain->devices, link) { - struct pci_dev *pdev; - - if (!info->dev || !dev_is_pci(info->dev)) - continue; - - pdev = to_pci_dev(info->dev); - if (pdev->ats_enabled) { + list_for_each_entry(info, &domain->devices, link) + if (info->ats_enabled) { has_iotlb_device = true; break; } + + if (!has_iotlb_device) { + struct subdev_domain_info *sinfo; + + list_for_each_entry(sinfo, &domain->subdevices, link_domain) { + info = get_domain_info(sinfo->pdev); + if (info && info->ats_enabled) { + has_iotlb_device = true; + break; + } + } } domain->has_iotlb_device = has_iotlb_device; @@ -1555,25 +1564,37 @@ static void iommu_disable_dev_iotlb(struct device_domain_info *info) #endif } +static void __iommu_flush_dev_iotlb(struct device_domain_info *info, + u64 addr, unsigned int mask) +{ + u16 sid, qdep; + + if (!info || !info->ats_enabled) + return; + + sid = info->bus << 8 | info->devfn; + qdep = info->ats_qdep; + qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, + qdep, addr, mask); +} + static void iommu_flush_dev_iotlb(struct dmar_domain *domain, u64 addr, unsigned mask) { - u16 sid, qdep; unsigned long flags; struct device_domain_info *info; + struct subdev_domain_info *sinfo; if (!domain->has_iotlb_device) return; spin_lock_irqsave(&device_domain_lock, flags); - list_for_each_entry(info, &domain->devices, link) { - if (!info->ats_enabled) - continue; + list_for_each_entry(info, &domain->devices, link) + __iommu_flush_dev_iotlb(info, addr, mask); - sid = info->bus << 8 | info->devfn; - qdep = info->ats_qdep; - qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, - qdep, addr, mask); + list_for_each_entry(sinfo, &domain->subdevices, link_domain) { + info = get_domain_info(sinfo->pdev); + __iommu_flush_dev_iotlb(info, addr, mask); } spin_unlock_irqrestore(&device_domain_lock, flags); } -- Gitee From f870a7def14b69282945454eded26727032f7ca9 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 14 Jan 2021 16:50:21 +0800 Subject: [PATCH 0404/2161] iommu/vt-d: Consolidate duplicate cache invaliation code commit 9872f9bd9dbd68f75e8db782717d71e8594f6a02 upstream. The pasid based IOTLB and devTLB invalidation code is duplicate in several places. 
Consolidate them by using the common helpers. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210114085021.717041-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 18 ++---------- drivers/iommu/intel/svm.c | 55 ++++++------------------------------- 2 files changed, 11 insertions(+), 62 deletions(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index b92af83b79bd..f26cb6195b2c 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -456,20 +456,6 @@ pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, qi_submit_sync(iommu, &desc, 1, 0); } -static void -iotlb_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid) -{ - struct qi_desc desc; - - desc.qw0 = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) | - QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | QI_EIOTLB_TYPE; - desc.qw1 = 0; - desc.qw2 = 0; - desc.qw3 = 0; - - qi_submit_sync(iommu, &desc, 1, 0); -} - static void devtlb_invalidation_with_pasid(struct intel_iommu *iommu, struct device *dev, u32 pasid) @@ -514,7 +500,7 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, clflush_cache_range(pte, sizeof(*pte)); pasid_cache_invalidation_with_pasid(iommu, did, pasid); - iotlb_invalidation_with_pasid(iommu, did, pasid); + qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); /* Device IOTLB doesn't need to be flushed in caching mode. */ if (!cap_caching_mode(iommu->cap)) @@ -530,7 +516,7 @@ static void pasid_flush_caches(struct intel_iommu *iommu, if (cap_caching_mode(iommu->cap)) { pasid_cache_invalidation_with_pasid(iommu, did, pasid); - iotlb_invalidation_with_pasid(iommu, did, pasid); + qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); } else { iommu_flush_write_buffer(iommu); } diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index d9379b9d6637..ae9c28bcb800 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -96,53 +96,16 @@ static void __flush_svm_range_dev(struct intel_svm *svm, unsigned long address, unsigned long pages, int ih) { - struct qi_desc desc; + struct device_domain_info *info = get_domain_info(sdev->dev); - if (pages == -1) { - desc.qw0 = QI_EIOTLB_PASID(svm->pasid) | - QI_EIOTLB_DID(sdev->did) | - QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | - QI_EIOTLB_TYPE; - desc.qw1 = 0; - } else { - int mask = ilog2(__roundup_pow_of_two(pages)); - - desc.qw0 = QI_EIOTLB_PASID(svm->pasid) | - QI_EIOTLB_DID(sdev->did) | - QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | - QI_EIOTLB_TYPE; - desc.qw1 = QI_EIOTLB_ADDR(address) | - QI_EIOTLB_IH(ih) | - QI_EIOTLB_AM(mask); - } - desc.qw2 = 0; - desc.qw3 = 0; - qi_submit_sync(sdev->iommu, &desc, 1, 0); - - if (sdev->dev_iotlb) { - desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) | - QI_DEV_EIOTLB_SID(sdev->sid) | - QI_DEV_EIOTLB_QDEP(sdev->qdep) | - QI_DEIOTLB_TYPE; - if (pages == -1) { - desc.qw1 = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) | - QI_DEV_EIOTLB_SIZE; - } else if (pages > 1) { - /* The least significant zero bit indicates the size. So, - * for example, an "address" value of 0x12345f000 will - * flush from 0x123440000 to 0x12347ffff (256KiB). 
*/ - unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT); - unsigned long mask = __rounddown_pow_of_two(address ^ last); - - desc.qw1 = QI_DEV_EIOTLB_ADDR((address & ~mask) | - (mask - 1)) | QI_DEV_EIOTLB_SIZE; - } else { - desc.qw1 = QI_DEV_EIOTLB_ADDR(address); - } - desc.qw2 = 0; - desc.qw3 = 0; - qi_submit_sync(sdev->iommu, &desc, 1, 0); - } + if (WARN_ON(!pages)) + return; + + qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, address, pages, ih); + if (info->ats_enabled) + qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid, + svm->pasid, sdev->qdep, address, + order_base_2(pages)); } static inline bool intel_svm_capable(struct intel_iommu *iommu) -- Gitee From 264134db09b2943f9de335b197dff85f58175ef1 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 14 Jan 2021 17:04:00 +0800 Subject: [PATCH 0405/2161] iommu/vt-d: Add qi_submit trace event commit f2dd871799ba5d80f95f9bdbc0e60d390e1bcd22 upstream. This adds a new trace event to track the submissions of requests to the invalidation queue. This event will provide the information like: - IOMMU name - Invalidation type - Descriptor raw data A sample output like: | qi_submit: iotlb_inv dmar1: 0x100e2 0x0 0x0 0x0 | qi_submit: dev_tlb_inv dmar1: 0x1000000003 0x7ffffffffffff001 0x0 0x0 | qi_submit: iotlb_inv dmar2: 0x800f2 0xf9a00005 0x0 0x0 This will be helpful for queued invalidation related debugging. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210114090400.736104-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 3 +++ include/trace/events/intel_iommu.h | 37 ++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 004feaed3c72..bd51f33642e0 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "../irq_remapping.h" @@ -1307,6 +1308,8 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, offset = ((index + i) % QI_LENGTH) << shift; memcpy(qi->desc + offset, &desc[i], 1 << shift); qi->desc_status[(index + i) % QI_LENGTH] = QI_IN_USE; + trace_qi_submit(iommu, desc[i].qw0, desc[i].qw1, + desc[i].qw2, desc[i].qw3); } qi->desc_status[wait_index] = QI_IN_USE; diff --git a/include/trace/events/intel_iommu.h b/include/trace/events/intel_iommu.h index 112bd06487bf..aad2ff0c1e2e 100644 --- a/include/trace/events/intel_iommu.h +++ b/include/trace/events/intel_iommu.h @@ -135,6 +135,43 @@ DEFINE_EVENT(dma_map_sg, bounce_map_sg, struct scatterlist *sg), TP_ARGS(dev, index, total, sg) ); + +TRACE_EVENT(qi_submit, + TP_PROTO(struct intel_iommu *iommu, u64 qw0, u64 qw1, u64 qw2, u64 qw3), + + TP_ARGS(iommu, qw0, qw1, qw2, qw3), + + TP_STRUCT__entry( + __field(u64, qw0) + __field(u64, qw1) + __field(u64, qw2) + __field(u64, qw3) + __string(iommu, iommu->name) + ), + + TP_fast_assign( + __assign_str(iommu, iommu->name); + __entry->qw0 = qw0; + __entry->qw1 = qw1; + __entry->qw2 = qw2; + __entry->qw3 = qw3; + ), + + TP_printk("%s %s: 0x%llx 0x%llx 0x%llx 0x%llx", + __print_symbolic(__entry->qw0 & 0xf, + { QI_CC_TYPE, "cc_inv" }, + { QI_IOTLB_TYPE, "iotlb_inv" }, + { QI_DIOTLB_TYPE, "dev_tlb_inv" }, + { QI_IEC_TYPE, "iec_inv" }, + { QI_IWD_TYPE, "inv_wait" }, + { QI_EIOTLB_TYPE, "p_iotlb_inv" }, + { QI_PC_TYPE, "pc_inv" }, + { QI_DEIOTLB_TYPE, "p_dev_tlb_inv" }, + { QI_PGRP_RESP_TYPE, "page_grp_resp" }), + __get_str(iommu), + __entry->qw0, 
__entry->qw1, __entry->qw2, __entry->qw3 + ) +); #endif /* _TRACE_INTEL_IOMMU_H */ /* This part must be outside protection */ -- Gitee From 38be1b1cb6301d00b87b5a3409efcc829fd1fee0 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 15 Jan 2021 08:42:02 +0800 Subject: [PATCH 0406/2161] iommu/vt-d: Preset Access/Dirty bits for IOVA over FL commit a8ce9ebbecdfda3322bbcece6b3b25888217f8e3 upstream. The Access/Dirty bits in the first level page table entry will be set whenever a page table entry was used for address translation or write permission was successfully translated. This is always true when using the first-level page table for kernel IOVA. Instead of wasting hardware cycles to update the certain bits, it's better to set them up at the beginning. Suggested-by: Ashok Raj Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210115004202.953965-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 14 ++++++++++++-- include/linux/intel-iommu.h | 2 ++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 3b2a47e07c06..04cdbab96824 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1018,8 +1018,11 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; - if (domain_use_first_level(domain)) + if (domain_use_first_level(domain)) { pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US; + if (domain->domain.type == IOMMU_DOMAIN_DMA) + pteval |= DMA_FL_PTE_ACCESS; + } if (cmpxchg64(&pte->val, 0ULL, pteval)) /* Someone else set it while we were thinking; use theirs. */ free_pgtable_page(tmp_page); @@ -2311,9 +2314,16 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, return -EINVAL; attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); - if (domain_use_first_level(domain)) + if (domain_use_first_level(domain)) { attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US; + if (domain->domain.type == IOMMU_DOMAIN_DMA) { + attr |= DMA_FL_PTE_ACCESS; + if (prot & DMA_PTE_WRITE) + attr |= DMA_FL_PTE_DIRTY; + } + } + pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; while (nr_pages > 0) { diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 09c6a0bf3892..ecb35fdff03e 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -42,6 +42,8 @@ #define DMA_FL_PTE_PRESENT BIT_ULL(0) #define DMA_FL_PTE_US BIT_ULL(2) +#define DMA_FL_PTE_ACCESS BIT_ULL(5) +#define DMA_FL_PTE_DIRTY BIT_ULL(6) #define DMA_FL_PTE_XD BIT_ULL(63) #define ADDR_WIDTH_5LEVEL (57) -- Gitee From 2b8354748a6615dc564a5e9e4a206d26e0fea5eb Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 19 Jan 2021 12:35:00 +0800 Subject: [PATCH 0407/2161] iommu/vt-d: Correctly check addr alignment in qi_flush_dev_iotlb_pasid() commit 494b3688bb11a21af12e92a344a1313486693d47 upstream. An incorrect address mask is being used in the qi_flush_dev_iotlb_pasid() to check the address alignment. 
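A short worked example, using the address and order from the warnings quoted just below and assuming the usual 4 KiB VT-d page (VTD_PAGE_SHIFT == 12), shows where the old mask goes wrong:

  /* Worked example: addr = 0x7f7728800000, size_order = 11.
   *
   *   required alignment: VTD_PAGE_SIZE << 11 = 1 << 23 = 0x800000
   *   addr & (0x800000 - 1) = 0, so the address IS aligned.
   *
   * Old check:
   *   addr & GENMASK_ULL(size_order + VTD_PAGE_SHIFT, 0)
   * masks bits 0..23 inclusive; bit 23 of the address is set, so the
   * test is non-zero and a bogus "non-aligned" warning is emitted.
   *
   * New check:
   *   !IS_ALIGNED(addr, VTD_PAGE_SIZE << size_order)
   * only inspects bits 0..22 and correctly stays silent here.
   */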
This leads to a lot of spurious kernel warnings: [ 485.837093] DMAR: Invalidate non-aligned address 7f76f47f9000, order 0 [ 485.837098] DMAR: Invalidate non-aligned address 7f76f47f9000, order 0 [ 492.494145] qi_flush_dev_iotlb_pasid: 5734 callbacks suppressed [ 492.494147] DMAR: Invalidate non-aligned address 7f7728800000, order 11 [ 492.508965] DMAR: Invalidate non-aligned address 7f7728800000, order 11 Fix it by checking the alignment in right way. Fixes: 288d08e780088 ("iommu/vt-d: Handle non-page aligned address") Reported-and-tested-by: Guo Kaijie Signed-off-by: Lu Baolu Cc: Liu Yi L Link: https://lore.kernel.org/r/20210119043500.1539596-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index bd51f33642e0..980e8ae090af 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1499,7 +1499,7 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, * Max Invs Pending (MIP) is set to 0 for now until we have DIT in * ECAP. */ - if (addr & GENMASK_ULL(size_order + VTD_PAGE_SHIFT, 0)) + if (!IS_ALIGNED(addr, VTD_PAGE_SIZE << size_order)) pr_warn_ratelimited("Invalidate non-aligned address %llx, order %d\n", addr, size_order); -- Gitee From fbf27135d00c8b08f5b435450b99eaf455231555 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 27 Jan 2021 09:53:17 -0800 Subject: [PATCH 0408/2161] iommu/vt-d: Do not use flush-queue when caching-mode is on commit 29b32839725f8c89a41cb6ee054c85f3116ea8b5 upstream. When an Intel IOMMU is virtualized, and a physical device is passed-through to the VM, changes of the virtual IOMMU need to be propagated to the physical IOMMU. The hypervisor therefore needs to monitor PTE mappings in the IOMMU page-tables. Intel specifications provide "caching-mode" capability that a virtual IOMMU uses to report that the IOMMU is virtualized and a TLB flush is needed after mapping to allow the hypervisor to propagate virtual IOMMU mappings to the physical IOMMU. To the best of my knowledge no real physical IOMMU reports "caching-mode" as turned on. Synchronizing the virtual and the physical IOMMU tables is expensive if the hypervisor is unaware which PTEs have changed, as the hypervisor is required to walk all the virtualized tables and look for changes. Consequently, domain flushes are much more expensive than page-specific flushes on virtualized IOMMUs with passthrough devices. The kernel therefore exploited the "caching-mode" indication to avoid domain flushing and use page-specific flushing in virtualized environments. See commit 78d5f0f500e6 ("intel-iommu: Avoid global flushes with caching mode.") This behavior changed after commit 13cf01744608 ("iommu/vt-d: Make use of iova deferred flushing"). Now, when batched TLB flushing is used (the default), full TLB domain flushes are performed frequently, requiring the hypervisor to perform expensive synchronization between the virtual TLB and the physical one. Getting batched TLB flushes to use page-specific invalidations again in such circumstances is not easy, since the TLB invalidation scheme assumes that "full" domain TLB flushes are performed for scalability. 
Disable batched TLB flushes when caching-mode is on, as the performance benefit from using batched TLB invalidations is likely to be much smaller than the overhead of the virtual-to-physical IOMMU page-tables synchronization. Fixes: 13cf01744608 ("iommu/vt-d: Make use of iova deferred flushing") Signed-off-by: Nadav Amit Cc: David Woodhouse Cc: Lu Baolu Cc: Joerg Roedel Cc: Will Deacon Cc: stable@vger.kernel.org Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20210127175317.1600473-1-namit@vmware.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 04cdbab96824..3e3027fb7814 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3384,6 +3384,36 @@ static inline int iommu_domain_cache_init(void) return ret; } +static bool domain_use_flush_queue(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + bool r = true; + + if (intel_iommu_strict) + return false; + + /* + * The flush queue implementation does not perform page-selective + * invalidations that are required for efficient TLB flushes in virtual + * environments. The benefit of batching is likely to be much lower than + * the overhead of synchronizing the virtual and physical IOMMU + * page-tables. + */ + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + if (!cap_caching_mode(iommu->cap)) + continue; + + pr_warn_once("IOMMU batching is disabled due to virtualization"); + r = false; + break; + } + rcu_read_unlock(); + + return r; +} + static int intel_iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr attr, void *data) @@ -3394,7 +3424,7 @@ intel_iommu_domain_get_attr(struct iommu_domain *domain, case IOMMU_DOMAIN_DMA: switch (attr) { case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: - *(int *)data = !intel_iommu_strict; + *(int *)data = domain_use_flush_queue(); return 0; default: return -ENODEV; -- Gitee From 7968deaf48301efa51c55ae0e8ed40a36f6473dd Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 26 Jan 2021 16:07:29 +0800 Subject: [PATCH 0409/2161] iommu/vt-d: Clear PRQ overflow only when PRQ is empty commit 28a77185f1cd0650b664f54614143aaaa3a7a615 upstream. It is incorrect to always clear PRO when it's set w/o first checking whether the overflow condition has been cleared. Current code assumes that if an overflow condition occurs it must have been cleared by earlier loop. However since the code runs in a threaded context, the overflow condition could occur even after setting the head to the tail under some extreme condition. To be sane, we should read both head/tail again when seeing a pending PRO and only clear PRO after all pending PRs have been handled. 
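As a sketch, condensed from the svm.c hunk below, the handler now re-reads the queue head and tail and clears the Page Request Overflow (PRO) bit only when the queue has really drained:

  if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
          head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
          tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
          /* Only clear PRO once head == tail; a request that raced in
           * after the earlier drain loop must not be dropped.
           */
          if (head == tail)
                  writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
  }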
Suggested-by: Kevin Tian Signed-off-by: Lu Baolu Link: https://lore.kernel.org/linux-iommu/MWHPR11MB18862D2EA5BD432BF22D99A48CA09@MWHPR11MB1886.namprd11.prod.outlook.com/ Link: https://lore.kernel.org/r/20210126080730.2232859-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index ae9c28bcb800..bc4acbe64673 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -1041,8 +1041,17 @@ static irqreturn_t prq_event_thread(int irq, void *d) * Clear the page request overflow bit and wake up all threads that * are waiting for the completion of this handling. */ - if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) - writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { + pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n", + iommu->name); + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + if (head == tail) { + writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); + pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared", + iommu->name); + } + } if (!completion_done(&iommu->prq_complete)) complete(&iommu->prq_complete); -- Gitee From 7cdf9c75970f887946b2e3ec107bac7354ac40a4 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 26 Jan 2021 16:07:30 +0800 Subject: [PATCH 0410/2161] iommu/vt-d: Use INVALID response code instead of FAILURE commit 3aa7c62cb7d7986063f352d96e921ee2bf2d9749 upstream. The VT-d IOMMU response RESPONSE_FAILURE for a page request in below cases: - When it gets a Page_Request with no PASID; - When it gets a Page_Request with PASID that is not in use for this device. This is allowed by the spec, but IOMMU driver doesn't support such cases today. When the device receives RESPONSE_FAILURE, it sends the device state machine to HALT state. Now if we try to unload the driver, it hangs since the device doesn't send any outbound transactions to host when the driver is trying to clear things up. The only possible responses would be for invalidation requests. Let's use RESPONSE_INVALID instead for now, so that the device state machine doesn't enter HALT state. Suggested-by: Ashok Raj Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210126080730.2232859-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index bc4acbe64673..b3934b8232db 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -911,10 +911,8 @@ static irqreturn_t prq_event_thread(int irq, void *d) u64 address; handled = 1; - req = &iommu->prq[head / sizeof(*req)]; - - result = QI_RESP_FAILURE; + result = QI_RESP_INVALID; address = (u64)req->addr << VTD_PAGE_SHIFT; if (!req->pasid_present) { pr_err("%s: Page request without PASID: %08llx %08llx\n", @@ -952,7 +950,6 @@ static irqreturn_t prq_event_thread(int irq, void *d) rcu_read_unlock(); } - result = QI_RESP_INVALID; /* Since we're using init_mm.pgd directly, we should never take * any faults on kernel addresses. 
*/ if (!svm->mm) -- Gitee From 97092831da226b170156d0e782511f7b3a0ae78b Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 30 Jan 2021 23:19:07 +0800 Subject: [PATCH 0411/2161] iommu/vt-d: Fix compile error [-Werror=implicit-function-declaration] commit 3645a34f5b962aeedeb02f30cdf048eaae9b5f5c upstream. trace_qi_submit() could be used when interrupt remapping is supported, but DMA remapping is not. In this case, the following compile error occurs. ../drivers/iommu/intel/dmar.c: In function 'qi_submit_sync': ../drivers/iommu/intel/dmar.c:1311:3: error: implicit declaration of function 'trace_qi_submit'; did you mean 'ftrace_nmi_exit'? [-Werror=implicit-function-declaration] trace_qi_submit(iommu, desc[i].qw0, desc[i].qw1, ^~~~~~~~~~~~~~~ ftrace_nmi_exit Fixes: f2dd871799ba5 ("iommu/vt-d: Add qi_submit trace event") Reported-by: kernel test robot Reported-by: Randy Dunlap Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210130151907.3929148-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/Makefile | 2 +- drivers/iommu/intel/iommu.c | 1 - include/trace/events/intel_iommu.h | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index fb8e1e8c8029..ae570810a35e 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DMAR_TABLE) += dmar.o obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o -obj-$(CONFIG_INTEL_IOMMU) += trace.o +obj-$(CONFIG_DMAR_TABLE) += trace.o obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 3e3027fb7814..c40b5ecea874 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -45,7 +45,6 @@ #include #include #include -#include #include "../irq_remapping.h" #include "pasid.h" diff --git a/include/trace/events/intel_iommu.h b/include/trace/events/intel_iommu.h index aad2ff0c1e2e..e801f4910522 100644 --- a/include/trace/events/intel_iommu.h +++ b/include/trace/events/intel_iommu.h @@ -6,7 +6,6 @@ * * Author: Lu Baolu */ -#ifdef CONFIG_INTEL_IOMMU #undef TRACE_SYSTEM #define TRACE_SYSTEM intel_iommu @@ -176,4 +175,3 @@ TRACE_EVENT(qi_submit, /* This part must be outside protection */ #include -#endif /* CONFIG_INTEL_IOMMU */ -- Gitee From 76ca46356e69f6adb61a9e296eb78a772a3aabe4 Mon Sep 17 00:00:00 2001 From: Kyung Min Park Date: Thu, 4 Feb 2021 09:43:56 +0800 Subject: [PATCH 0412/2161] iommu/vt-d: Audit IOMMU Capabilities and add helper functions commit ad3d19029979b19378ece2011fc8ce07be98c905 upstream. Audit IOMMU Capability/Extended Capability and check if the IOMMUs have the consistent value for features. Report out or scale to the lowest supported when IOMMU features have incompatibility among IOMMUs. 
Report out features when below features are mismatched: - First Level 5 Level Paging Support (FL5LP) - First Level 1 GByte Page Support (FL1GP) - Read Draining (DRD) - Write Draining (DWD) - Page Selective Invalidation (PSI) - Zero Length Read (ZLR) - Caching Mode (CM) - Protected High/Low-Memory Region (PHMR/PLMR) - Required Write-Buffer Flushing (RWBF) - Advanced Fault Logging (AFL) - RID-PASID Support (RPS) - Scalable Mode Page Walk Coherency (SMPWC) - First Level Translation Support (FLTS) - Second Level Translation Support (SLTS) - No Write Flag Support (NWFS) - Second Level Accessed/Dirty Support (SLADS) - Virtual Command Support (VCS) - Scalable Mode Translation Support (SMTS) - Device TLB Invalidation Throttle (DIT) - Page Drain Support (PDS) - Process Address Space ID Support (PASID) - Extended Accessed Flag Support (EAFS) - Supervisor Request Support (SRS) - Execute Request Support (ERS) - Page Request Support (PRS) - Nested Translation Support (NEST) - Snoop Control (SC) - Pass Through (PT) - Device TLB Support (DT) - Queued Invalidation (QI) - Page walk Coherency (C) Set capability to the lowest supported when below features are mismatched: - Maximum Address Mask Value (MAMV) - Number of Fault Recording Registers (NFR) - Second Level Large Page Support (SLLPS) - Fault Recording Offset (FRO) - Maximum Guest Address Width (MGAW) - Supported Adjusted Guest Address Width (SAGAW) - Number of Domains supported (NDOMS) - Pasid Size Supported (PSS) - Maximum Handle Mask Value (MHMV) - IOTLB Register Offset (IRO) Signed-off-by: Kyung Min Park Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210130184452.31711-1-kyung.min.park@intel.com Link: https://lore.kernel.org/r/20210204014401.2846425-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/Makefile | 2 +- drivers/iommu/intel/cap_audit.c | 185 ++++++++++++++++++++++++++++ drivers/iommu/intel/cap_audit.h | 110 +++++++++++++++++ drivers/iommu/intel/iommu.c | 9 ++ drivers/iommu/intel/irq_remapping.c | 8 ++ include/linux/intel-iommu.h | 39 +++--- 6 files changed, 334 insertions(+), 19 deletions(-) create mode 100644 drivers/iommu/intel/cap_audit.c create mode 100644 drivers/iommu/intel/cap_audit.h diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index ae570810a35e..ae236ec7d219 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DMAR_TABLE) += dmar.o obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o -obj-$(CONFIG_DMAR_TABLE) += trace.o +obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o diff --git a/drivers/iommu/intel/cap_audit.c b/drivers/iommu/intel/cap_audit.c new file mode 100644 index 000000000000..049bc0c76a00 --- /dev/null +++ b/drivers/iommu/intel/cap_audit.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * cap_audit.c - audit iommu capabilities for boot time and hot plug + * + * Copyright (C) 2021 Intel Corporation + * + * Author: Kyung Min Park + * Lu Baolu + */ + +#define pr_fmt(fmt) "DMAR: " fmt + +#include +#include "cap_audit.h" + +static u64 intel_iommu_cap_sanity; +static u64 intel_iommu_ecap_sanity; + +static inline void check_irq_capabilities(struct intel_iommu *a, + struct intel_iommu *b) +{ + CHECK_FEATURE_MISMATCH(a, b, cap, pi_support, CAP_PI_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, 
eim_support, ECAP_EIM_MASK); +} + +static inline void check_dmar_capabilities(struct intel_iommu *a, + struct intel_iommu *b) +{ + MINIMAL_FEATURE_IOMMU(b, cap, CAP_MAMV_MASK); + MINIMAL_FEATURE_IOMMU(b, cap, CAP_NFR_MASK); + MINIMAL_FEATURE_IOMMU(b, cap, CAP_SLLPS_MASK); + MINIMAL_FEATURE_IOMMU(b, cap, CAP_FRO_MASK); + MINIMAL_FEATURE_IOMMU(b, cap, CAP_MGAW_MASK); + MINIMAL_FEATURE_IOMMU(b, cap, CAP_SAGAW_MASK); + MINIMAL_FEATURE_IOMMU(b, cap, CAP_NDOMS_MASK); + MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_PSS_MASK); + MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_MHMV_MASK); + MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_IRO_MASK); + + CHECK_FEATURE_MISMATCH(a, b, cap, 5lp_support, CAP_FL5LP_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, fl1gp_support, CAP_FL1GP_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, read_drain, CAP_RD_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, write_drain, CAP_WD_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, pgsel_inv, CAP_PSI_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, zlr, CAP_ZLR_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, caching_mode, CAP_CM_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, phmr, CAP_PHMR_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, plmr, CAP_PLMR_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, rwbf, CAP_RWBF_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, afl, CAP_AFL_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, rps, ECAP_RPS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, smpwc, ECAP_SMPWC_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, flts, ECAP_FLTS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, slts, ECAP_SLTS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, nwfs, ECAP_NWFS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, slads, ECAP_SLADS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, vcs, ECAP_VCS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, smts, ECAP_SMTS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, pds, ECAP_PDS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, dit, ECAP_DIT_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, pasid, ECAP_PASID_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, eafs, ECAP_EAFS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, srs, ECAP_SRS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, ers, ECAP_ERS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, prs, ECAP_PRS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, nest, ECAP_NEST_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, mts, ECAP_MTS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, sc_support, ECAP_SC_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, pass_through, ECAP_PT_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, dev_iotlb_support, ECAP_DT_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, qis, ECAP_QI_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, coherent, ECAP_C_MASK); +} + +static int cap_audit_hotplug(struct intel_iommu *iommu, enum cap_audit_type type) +{ + bool mismatch = false; + u64 old_cap = intel_iommu_cap_sanity; + u64 old_ecap = intel_iommu_ecap_sanity; + + if (type == CAP_AUDIT_HOTPLUG_IRQR) { + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, pi_support, CAP_PI_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, eim_support, ECAP_EIM_MASK); + goto out; + } + + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, 5lp_support, CAP_FL5LP_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, fl1gp_support, CAP_FL1GP_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, read_drain, CAP_RD_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, write_drain, CAP_WD_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, pgsel_inv, CAP_PSI_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, zlr, CAP_ZLR_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, caching_mode, CAP_CM_MASK); + 
CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, phmr, CAP_PHMR_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, plmr, CAP_PLMR_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, rwbf, CAP_RWBF_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, afl, CAP_AFL_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, rps, ECAP_RPS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, smpwc, ECAP_SMPWC_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, flts, ECAP_FLTS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, slts, ECAP_SLTS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, nwfs, ECAP_NWFS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, slads, ECAP_SLADS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, vcs, ECAP_VCS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, smts, ECAP_SMTS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pds, ECAP_PDS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, dit, ECAP_DIT_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pasid, ECAP_PASID_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, eafs, ECAP_EAFS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, srs, ECAP_SRS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, ers, ECAP_ERS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, prs, ECAP_PRS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, nest, ECAP_NEST_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, mts, ECAP_MTS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, sc_support, ECAP_SC_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pass_through, ECAP_PT_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, dev_iotlb_support, ECAP_DT_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, qis, ECAP_QI_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, coherent, ECAP_C_MASK); + + /* Abort hot plug if the hot plug iommu feature is smaller than global */ + MINIMAL_FEATURE_HOTPLUG(iommu, cap, max_amask_val, CAP_MAMV_MASK, mismatch); + MINIMAL_FEATURE_HOTPLUG(iommu, cap, num_fault_regs, CAP_NFR_MASK, mismatch); + MINIMAL_FEATURE_HOTPLUG(iommu, cap, super_page_val, CAP_SLLPS_MASK, mismatch); + MINIMAL_FEATURE_HOTPLUG(iommu, cap, fault_reg_offset, CAP_FRO_MASK, mismatch); + MINIMAL_FEATURE_HOTPLUG(iommu, cap, mgaw, CAP_MGAW_MASK, mismatch); + MINIMAL_FEATURE_HOTPLUG(iommu, cap, sagaw, CAP_SAGAW_MASK, mismatch); + MINIMAL_FEATURE_HOTPLUG(iommu, cap, ndoms, CAP_NDOMS_MASK, mismatch); + MINIMAL_FEATURE_HOTPLUG(iommu, ecap, pss, ECAP_PSS_MASK, mismatch); + MINIMAL_FEATURE_HOTPLUG(iommu, ecap, max_handle_mask, ECAP_MHMV_MASK, mismatch); + MINIMAL_FEATURE_HOTPLUG(iommu, ecap, iotlb_offset, ECAP_IRO_MASK, mismatch); + +out: + if (mismatch) { + intel_iommu_cap_sanity = old_cap; + intel_iommu_ecap_sanity = old_ecap; + return -EFAULT; + } + + return 0; +} + +static int cap_audit_static(struct intel_iommu *iommu, enum cap_audit_type type) +{ + struct dmar_drhd_unit *d; + struct intel_iommu *i; + + rcu_read_lock(); + if (list_empty(&dmar_drhd_units)) + goto out; + + for_each_active_iommu(i, d) { + if (!iommu) { + intel_iommu_ecap_sanity = i->ecap; + intel_iommu_cap_sanity = i->cap; + iommu = i; + continue; + } + + if (type == CAP_AUDIT_STATIC_DMAR) + check_dmar_capabilities(iommu, i); + else + check_irq_capabilities(iommu, i); + } + +out: + rcu_read_unlock(); + return 0; +} + +int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu) +{ + switch (type) { + case CAP_AUDIT_STATIC_DMAR: + case CAP_AUDIT_STATIC_IRQR: + return cap_audit_static(iommu, type); + case CAP_AUDIT_HOTPLUG_DMAR: + case 
CAP_AUDIT_HOTPLUG_IRQR: + return cap_audit_hotplug(iommu, type); + default: + break; + } + + return -EFAULT; +} diff --git a/drivers/iommu/intel/cap_audit.h b/drivers/iommu/intel/cap_audit.h new file mode 100644 index 000000000000..a6a1530441b7 --- /dev/null +++ b/drivers/iommu/intel/cap_audit.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * cap_audit.h - audit iommu capabilities header + * + * Copyright (C) 2021 Intel Corporation + * + * Author: Kyung Min Park + */ + +/* + * Capability Register Mask + */ +#define CAP_FL5LP_MASK BIT_ULL(60) +#define CAP_PI_MASK BIT_ULL(59) +#define CAP_FL1GP_MASK BIT_ULL(56) +#define CAP_RD_MASK BIT_ULL(55) +#define CAP_WD_MASK BIT_ULL(54) +#define CAP_MAMV_MASK GENMASK_ULL(53, 48) +#define CAP_NFR_MASK GENMASK_ULL(47, 40) +#define CAP_PSI_MASK BIT_ULL(39) +#define CAP_SLLPS_MASK GENMASK_ULL(37, 34) +#define CAP_FRO_MASK GENMASK_ULL(33, 24) +#define CAP_ZLR_MASK BIT_ULL(22) +#define CAP_MGAW_MASK GENMASK_ULL(21, 16) +#define CAP_SAGAW_MASK GENMASK_ULL(12, 8) +#define CAP_CM_MASK BIT_ULL(7) +#define CAP_PHMR_MASK BIT_ULL(6) +#define CAP_PLMR_MASK BIT_ULL(5) +#define CAP_RWBF_MASK BIT_ULL(4) +#define CAP_AFL_MASK BIT_ULL(3) +#define CAP_NDOMS_MASK GENMASK_ULL(2, 0) + +/* + * Extended Capability Register Mask + */ +#define ECAP_RPS_MASK BIT_ULL(49) +#define ECAP_SMPWC_MASK BIT_ULL(48) +#define ECAP_FLTS_MASK BIT_ULL(47) +#define ECAP_SLTS_MASK BIT_ULL(46) +#define ECAP_SLADS_MASK BIT_ULL(45) +#define ECAP_VCS_MASK BIT_ULL(44) +#define ECAP_SMTS_MASK BIT_ULL(43) +#define ECAP_PDS_MASK BIT_ULL(42) +#define ECAP_DIT_MASK BIT_ULL(41) +#define ECAP_PASID_MASK BIT_ULL(40) +#define ECAP_PSS_MASK GENMASK_ULL(39, 35) +#define ECAP_EAFS_MASK BIT_ULL(34) +#define ECAP_NWFS_MASK BIT_ULL(33) +#define ECAP_SRS_MASK BIT_ULL(31) +#define ECAP_ERS_MASK BIT_ULL(30) +#define ECAP_PRS_MASK BIT_ULL(29) +#define ECAP_NEST_MASK BIT_ULL(26) +#define ECAP_MTS_MASK BIT_ULL(25) +#define ECAP_MHMV_MASK GENMASK_ULL(23, 20) +#define ECAP_IRO_MASK GENMASK_ULL(17, 8) +#define ECAP_SC_MASK BIT_ULL(7) +#define ECAP_PT_MASK BIT_ULL(6) +#define ECAP_EIM_MASK BIT_ULL(4) +#define ECAP_DT_MASK BIT_ULL(2) +#define ECAP_QI_MASK BIT_ULL(1) +#define ECAP_C_MASK BIT_ULL(0) + +/* + * u64 intel_iommu_cap_sanity, intel_iommu_ecap_sanity will be adjusted as each + * IOMMU gets audited. 
+ */ +#define DO_CHECK_FEATURE_MISMATCH(a, b, cap, feature, MASK) \ +do { \ + if (cap##_##feature(a) != cap##_##feature(b)) { \ + intel_iommu_##cap##_sanity &= ~(MASK); \ + pr_info("IOMMU feature %s inconsistent", #feature); \ + } \ +} while (0) + +#define CHECK_FEATURE_MISMATCH(a, b, cap, feature, MASK) \ + DO_CHECK_FEATURE_MISMATCH((a)->cap, (b)->cap, cap, feature, MASK) + +#define CHECK_FEATURE_MISMATCH_HOTPLUG(b, cap, feature, MASK) \ +do { \ + if (cap##_##feature(intel_iommu_##cap##_sanity)) \ + DO_CHECK_FEATURE_MISMATCH(intel_iommu_##cap##_sanity, \ + (b)->cap, cap, feature, MASK); \ +} while (0) + +#define MINIMAL_FEATURE_IOMMU(iommu, cap, MASK) \ +do { \ + u64 min_feature = intel_iommu_##cap##_sanity & (MASK); \ + min_feature = min_t(u64, min_feature, (iommu)->cap & (MASK)); \ + intel_iommu_##cap##_sanity = (intel_iommu_##cap##_sanity & ~(MASK)) | \ + min_feature; \ +} while (0) + +#define MINIMAL_FEATURE_HOTPLUG(iommu, cap, feature, MASK, mismatch) \ +do { \ + if ((intel_iommu_##cap##_sanity & (MASK)) > \ + (cap##_##feature((iommu)->cap))) \ + mismatch = true; \ + else \ + (iommu)->cap = ((iommu)->cap & ~(MASK)) | \ + (intel_iommu_##cap##_sanity & (MASK)); \ +} while (0) + +enum cap_audit_type { + CAP_AUDIT_STATIC_DMAR, + CAP_AUDIT_STATIC_IRQR, + CAP_AUDIT_HOTPLUG_DMAR, + CAP_AUDIT_HOTPLUG_IRQR, +}; + +int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index c40b5ecea874..a72a6800a9bf 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -48,6 +48,7 @@ #include "../irq_remapping.h" #include "pasid.h" +#include "cap_audit.h" #define ROOT_SIZE VTD_PAGE_SIZE #define CONTEXT_SIZE VTD_PAGE_SIZE @@ -3207,6 +3208,10 @@ static int __init init_dmars(void) goto error; } + ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); + if (ret) + goto free_iommu; + for_each_iommu(iommu, drhd) { if (drhd->ignored) { iommu_disable_translation(iommu); @@ -3814,6 +3819,10 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) if (g_iommus[iommu->seq_id]) return 0; + ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); + if (ret) + goto out; + if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { pr_warn("%s: Doesn't support hardware pass through.\n", iommu->name); diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 685200a5cff0..611ef5243cb6 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -22,6 +22,7 @@ #include #include "../irq_remapping.h" +#include "cap_audit.h" enum irq_mode { IRQ_REMAPPING, @@ -734,6 +735,9 @@ static int __init intel_prepare_irq_remapping(void) if (dmar_table_init() < 0) return -ENODEV; + if (intel_cap_audit(CAP_AUDIT_STATIC_IRQR, NULL)) + goto error; + if (!dmar_ir_support()) return -ENODEV; @@ -1439,6 +1443,10 @@ static int dmar_ir_add(struct dmar_drhd_unit *dmaru, struct intel_iommu *iommu) int ret; int eim = x2apic_enabled(); + ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_IRQR, iommu); + if (ret) + return ret; + if (eim && !ecap_eim_support(iommu->ecap)) { pr_info("DRHD %Lx: EIM not supported by DRHD, ecap %Lx\n", iommu->reg_phys, iommu->ecap); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index ecb35fdff03e..832730549c52 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -170,34 +170,37 @@ * Extended Capability Register */ +#define ecap_rps(e) (((e) >> 49) & 0x1) #define ecap_smpwc(e) (((e) >> 48) & 0x1) #define 
ecap_flts(e) (((e) >> 47) & 0x1) #define ecap_slts(e) (((e) >> 46) & 0x1) +#define ecap_slads(e) (((e) >> 45) & 0x1) #define ecap_vcs(e) (((e) >> 44) & 0x1) #define ecap_smts(e) (((e) >> 43) & 0x1) -#define ecap_dit(e) ((e >> 41) & 0x1) -#define ecap_pasid(e) ((e >> 40) & 0x1) -#define ecap_pss(e) ((e >> 35) & 0x1f) -#define ecap_eafs(e) ((e >> 34) & 0x1) -#define ecap_nwfs(e) ((e >> 33) & 0x1) -#define ecap_srs(e) ((e >> 31) & 0x1) -#define ecap_ers(e) ((e >> 30) & 0x1) -#define ecap_prs(e) ((e >> 29) & 0x1) -#define ecap_broken_pasid(e) ((e >> 28) & 0x1) -#define ecap_dis(e) ((e >> 27) & 0x1) -#define ecap_nest(e) ((e >> 26) & 0x1) -#define ecap_mts(e) ((e >> 25) & 0x1) -#define ecap_ecs(e) ((e >> 24) & 0x1) +#define ecap_dit(e) (((e) >> 41) & 0x1) +#define ecap_pds(e) (((e) >> 42) & 0x1) +#define ecap_pasid(e) (((e) >> 40) & 0x1) +#define ecap_pss(e) (((e) >> 35) & 0x1f) +#define ecap_eafs(e) (((e) >> 34) & 0x1) +#define ecap_nwfs(e) (((e) >> 33) & 0x1) +#define ecap_srs(e) (((e) >> 31) & 0x1) +#define ecap_ers(e) (((e) >> 30) & 0x1) +#define ecap_prs(e) (((e) >> 29) & 0x1) +#define ecap_broken_pasid(e) (((e) >> 28) & 0x1) +#define ecap_dis(e) (((e) >> 27) & 0x1) +#define ecap_nest(e) (((e) >> 26) & 0x1) +#define ecap_mts(e) (((e) >> 25) & 0x1) +#define ecap_ecs(e) (((e) >> 24) & 0x1) #define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16) #define ecap_max_iotlb_offset(e) (ecap_iotlb_offset(e) + 16) #define ecap_coherent(e) ((e) & 0x1) #define ecap_qis(e) ((e) & 0x2) -#define ecap_pass_through(e) ((e >> 6) & 0x1) -#define ecap_eim_support(e) ((e >> 4) & 0x1) -#define ecap_ir_support(e) ((e >> 3) & 0x1) +#define ecap_pass_through(e) (((e) >> 6) & 0x1) +#define ecap_eim_support(e) (((e) >> 4) & 0x1) +#define ecap_ir_support(e) (((e) >> 3) & 0x1) #define ecap_dev_iotlb_support(e) (((e) >> 2) & 0x1) -#define ecap_max_handle_mask(e) ((e >> 20) & 0xf) -#define ecap_sc_support(e) ((e >> 7) & 0x1) /* Snooping Control */ +#define ecap_max_handle_mask(e) (((e) >> 20) & 0xf) +#define ecap_sc_support(e) (((e) >> 7) & 0x1) /* Snooping Control */ /* Virtual command interface capability */ #define vccap_pasid(v) (((v) & DMA_VCS_PAS)) /* PASID allocation */ -- Gitee From ded6258698c1d704b7d8ee177a92f79a1eac196e Mon Sep 17 00:00:00 2001 From: Kyung Min Park Date: Thu, 4 Feb 2021 09:43:57 +0800 Subject: [PATCH 0413/2161] iommu/vt-d: Move capability check code to cap_audit files commit 010bf5659e01b0a169e8e6b9e6a8b7e62209470d upstream. Move IOMMU capability check and sanity check code to cap_audit files. Also implement some helper functions for sanity checks. 
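For illustration only, a minimal userspace model of the audit these helpers build on: the consensus value starts from the first unit's register and any disagreement clears the feature bit, so the sanity helpers end up reporting only what every IOMMU supports. The struct iommu_unit type and audit_feature() name are invented for this sketch; the mask values follow the ECAP_*_MASK definitions in cap_audit.h.

/* build with: cc -o cap_audit_model cap_audit_model.c */
#include <stdint.h>
#include <stdio.h>

#define ECAP_SMTS_MASK (1ULL << 43)	/* scalable mode translation */
#define ECAP_FLTS_MASK (1ULL << 47)	/* first level translation */

struct iommu_unit { uint64_t ecap; };	/* stand-in for struct intel_iommu */

static uint64_t ecap_sanity;

static void audit_feature(const struct iommu_unit *u, uint64_t mask)
{
	/* a unit that disagrees with the consensus drops the feature */
	if ((ecap_sanity & mask) != (u->ecap & mask))
		ecap_sanity &= ~mask;
}

int main(void)
{
	struct iommu_unit units[2] = {
		{ .ecap = ECAP_SMTS_MASK | ECAP_FLTS_MASK },
		{ .ecap = ECAP_SMTS_MASK },	/* second unit lacks FLTS */
	};
	int i;

	ecap_sanity = units[0].ecap;
	for (i = 1; i < 2; i++) {
		audit_feature(&units[i], ECAP_SMTS_MASK);
		audit_feature(&units[i], ECAP_FLTS_MASK);
	}

	/* what intel_cap_smts_sanity()/intel_cap_flts_sanity() would report */
	printf("SMTS %d FLTS %d\n",
	       !!(ecap_sanity & ECAP_SMTS_MASK),
	       !!(ecap_sanity & ECAP_FLTS_MASK));
	return 0;
}

Running the model prints "SMTS 1 FLTS 0": a mixed population only advertises the intersection of its capabilities.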
Signed-off-by: Kyung Min Park Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210130184452.31711-1-kyung.min.park@intel.com Link: https://lore.kernel.org/r/20210204014401.2846425-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/cap_audit.c | 20 +++++++++ drivers/iommu/intel/cap_audit.h | 20 +++++++++ drivers/iommu/intel/iommu.c | 76 +-------------------------------- 3 files changed, 42 insertions(+), 74 deletions(-) diff --git a/drivers/iommu/intel/cap_audit.c b/drivers/iommu/intel/cap_audit.c index 049bc0c76a00..b12e421a2f1a 100644 --- a/drivers/iommu/intel/cap_audit.c +++ b/drivers/iommu/intel/cap_audit.c @@ -183,3 +183,23 @@ int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu) return -EFAULT; } + +bool intel_cap_smts_sanity(void) +{ + return ecap_smts(intel_iommu_ecap_sanity); +} + +bool intel_cap_pasid_sanity(void) +{ + return ecap_pasid(intel_iommu_ecap_sanity); +} + +bool intel_cap_nest_sanity(void) +{ + return ecap_nest(intel_iommu_ecap_sanity); +} + +bool intel_cap_flts_sanity(void) +{ + return ecap_flts(intel_iommu_ecap_sanity); +} diff --git a/drivers/iommu/intel/cap_audit.h b/drivers/iommu/intel/cap_audit.h index a6a1530441b7..74cfccae0e81 100644 --- a/drivers/iommu/intel/cap_audit.h +++ b/drivers/iommu/intel/cap_audit.h @@ -107,4 +107,24 @@ enum cap_audit_type { CAP_AUDIT_HOTPLUG_IRQR, }; +bool intel_cap_smts_sanity(void); +bool intel_cap_pasid_sanity(void); +bool intel_cap_nest_sanity(void); +bool intel_cap_flts_sanity(void); + +static inline bool scalable_mode_support(void) +{ + return (intel_iommu_sm && intel_cap_smts_sanity()); +} + +static inline bool pasid_mode_support(void) +{ + return scalable_mode_support() && intel_cap_pasid_sanity(); +} + +static inline bool nested_mode_support(void) +{ + return scalable_mode_support() && intel_cap_nest_sanity(); +} + int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index a72a6800a9bf..f12c428c7719 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1865,25 +1865,7 @@ static void free_dmar_iommu(struct intel_iommu *iommu) */ static bool first_level_by_default(void) { - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - static int first_level_support = -1; - - if (likely(first_level_support != -1)) - return first_level_support; - - first_level_support = 1; - - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) { - first_level_support = 0; - break; - } - } - rcu_read_unlock(); - - return first_level_support; + return scalable_mode_support() && intel_cap_flts_sanity(); } static struct dmar_domain *alloc_domain(int flags) @@ -5086,60 +5068,6 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, return phys; } -static inline bool scalable_mode_support(void) -{ - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - bool ret = true; - - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (!sm_supported(iommu)) { - ret = false; - break; - } - } - rcu_read_unlock(); - - return ret; -} - -static inline bool iommu_pasid_support(void) -{ - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - bool ret = true; - - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (!pasid_supported(iommu)) { - ret = false; - break; - } - } - rcu_read_unlock(); - - return ret; -} - -static inline bool 
nested_mode_support(void) -{ - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - bool ret = true; - - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) { - ret = false; - break; - } - } - rcu_read_unlock(); - - return ret; -} - static bool intel_iommu_capable(enum iommu_cap cap) { if (cap == IOMMU_CAP_CACHE_COHERENCY) @@ -5380,7 +5308,7 @@ intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) int ret; if (!dev_is_pci(dev) || dmar_disabled || - !scalable_mode_support() || !iommu_pasid_support()) + !scalable_mode_support() || !pasid_mode_support()) return false; ret = pci_pasid_features(to_pci_dev(dev)); -- Gitee From eb4736b885b9a06f55d12c6307734b2bf0876a47 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 4 Feb 2021 09:43:58 +0800 Subject: [PATCH 0414/2161] iommu/vt-d: Add iotlb_sync_map callback commit 933fcd01e97e2ba29880dd5f1239365e40094950 upstream. Some Intel VT-d hardware implementations don't support memory coherency for page table walk (presented by the Page-Walk-coherency bit in the ecap register), so that software must flush the corresponding CPU cache lines explicitly after each page table entry update. The iommu_map_sg() code iterates through the given scatter-gather list and invokes iommu_map() for each element in the scatter-gather list, which calls into the vendor IOMMU driver through iommu_ops callback. As the result, a single sg mapping may lead to multiple cache line flushes, which leads to the degradation of I/O performance after the commit ("iommu/vt-d: Convert intel iommu driver to the iommu ops"). Fix this by adding iotlb_sync_map callback and centralizing the clflush operations after all sg mappings. Fixes: c588072bba6b5 ("iommu/vt-d: Convert intel iommu driver to the iommu ops") Reported-by: Chuck Lever Link: https://lore.kernel.org/linux-iommu/D81314ED-5673-44A6-B597-090E3CB83EB0@oracle.com/ Signed-off-by: Lu Baolu Cc: Robin Murphy [ cel: removed @first_pte, which is no longer used ] Signed-off-by: Chuck Lever Link: https://lore.kernel.org/linux-iommu/161177763962.1311.15577661784296014186.stgit@manet.1015granger.net Link: https://lore.kernel.org/r/20210204014401.2846425-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 90 ++++++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 30 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index f12c428c7719..014e3f0cb441 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2284,9 +2284,9 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, unsigned long phys_pfn, unsigned long nr_pages, int prot) { - struct dma_pte *first_pte = NULL, *pte = NULL; unsigned int largepage_lvl = 0; unsigned long lvl_pages = 0; + struct dma_pte *pte = NULL; phys_addr_t pteval; u64 attr; @@ -2315,7 +2315,7 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, nr_pages); - first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); + pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); if (!pte) return -ENOMEM; /* It is large page*/ @@ -2376,34 +2376,14 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, * recalculate 'pte' and switch back to smaller pages for the * end of the mapping, if the trailing size is not enough to * use another superpage (i.e. 
nr_pages < lvl_pages). + * + * We leave clflush for the leaf pte changes to iotlb_sync_map() + * callback. */ pte++; if (!nr_pages || first_pte_in_page(pte) || - (largepage_lvl > 1 && nr_pages < lvl_pages)) { - domain_flush_cache(domain, first_pte, - (void *)pte - (void *)first_pte); + (largepage_lvl > 1 && nr_pages < lvl_pages)) pte = NULL; - } - } - - return 0; -} - -static int -domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, - unsigned long phys_pfn, unsigned long nr_pages, int prot) -{ - int iommu_id, ret; - struct intel_iommu *iommu; - - /* Do the real mapping first */ - ret = __domain_mapping(domain, iov_pfn, phys_pfn, nr_pages, prot); - if (ret) - return ret; - - for_each_domain_iommu(iommu_id, domain) { - iommu = g_iommus[iommu_id]; - __mapping_notify_one(iommu, domain, iov_pfn, nr_pages); } return 0; @@ -4971,7 +4951,6 @@ static int intel_iommu_map(struct iommu_domain *domain, struct dmar_domain *dmar_domain = to_dmar_domain(domain); u64 max_addr; int prot = 0; - int ret; if (iommu_prot & IOMMU_READ) prot |= DMA_PTE_READ; @@ -4997,9 +4976,8 @@ static int intel_iommu_map(struct iommu_domain *domain, /* Round up size to next multiple of PAGE_SIZE, if it and the low bits of hpa would take us onto the next page */ size = aligned_nrpages(hpa, size); - ret = domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, - hpa >> VTD_PAGE_SHIFT, size, prot); - return ret; + return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, + hpa >> VTD_PAGE_SHIFT, size, prot); } static size_t intel_iommu_unmap(struct iommu_domain *domain, @@ -5431,6 +5409,57 @@ intel_iommu_domain_set_attr(struct iommu_domain *domain, return ret; } +static void clflush_sync_map(struct dmar_domain *domain, unsigned long clf_pfn, + unsigned long clf_pages) +{ + struct dma_pte *first_pte = NULL, *pte = NULL; + unsigned long lvl_pages = 0; + int level = 0; + + while (clf_pages > 0) { + if (!pte) { + level = 0; + pte = pfn_to_dma_pte(domain, clf_pfn, &level); + if (WARN_ON(!pte)) + return; + first_pte = pte; + lvl_pages = lvl_to_nr_pages(level); + } + + if (WARN_ON(!lvl_pages || clf_pages < lvl_pages)) + return; + + clf_pages -= lvl_pages; + clf_pfn += lvl_pages; + pte++; + + if (!clf_pages || first_pte_in_page(pte) || + (level > 1 && clf_pages < lvl_pages)) { + domain_flush_cache(domain, first_pte, + (void *)pte - (void *)first_pte); + pte = NULL; + } + } +} + +static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + unsigned long pages = aligned_nrpages(iova, size); + unsigned long pfn = iova >> VTD_PAGE_SHIFT; + struct intel_iommu *iommu; + int iommu_id; + + if (!dmar_domain->iommu_coherency) + clflush_sync_map(dmar_domain, pfn, pages); + + for_each_domain_iommu(iommu_id, dmar_domain) { + iommu = g_iommus[iommu_id]; + __mapping_notify_one(iommu, dmar_domain, pfn, pages); + } +} + const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .domain_alloc = intel_iommu_domain_alloc, @@ -5443,6 +5472,7 @@ const struct iommu_ops intel_iommu_ops = { .aux_detach_dev = intel_iommu_aux_detach_device, .aux_get_pasid = intel_iommu_aux_get_pasid, .map = intel_iommu_map, + .iotlb_sync_map = intel_iommu_iotlb_sync_map, .unmap = intel_iommu_unmap, .flush_iotlb_all = intel_flush_iotlb_all, .iotlb_sync = intel_iommu_tlb_sync, -- Gitee From 608656f0af70d42c920b994c77b6df664b1c6113 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Thu, 7 Jan 2021 20:29:03 +0800 Subject: [PATCH 0415/2161] iommu: Move 
iotlb_sync_map out from __iommu_map commit d8c1df02ac7f2c802a9b2afc0f5c888c4217f1d5 upstream. In the end of __iommu_map, It alway call iotlb_sync_map. This patch moves iotlb_sync_map out from __iommu_map since it is unnecessary to call this for each sg segment especially iotlb_sync_map is flush tlb all currently. Add a little helper _iommu_map for this. Signed-off-by: Yong Wu Reviewed-by: Robin Murphy Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210107122909.16317-2-yong.wu@mediatek.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index baa1c95c6474..77188f0cb9a3 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2235,9 +2235,6 @@ int __iommu_map(struct iommu_domain *domain, unsigned long iova, size -= pgsize; } - if (ops->iotlb_sync_map) - ops->iotlb_sync_map(domain); - /* unroll mapping in case something went wrong */ if (ret) iommu_unmap(domain, orig_iova, orig_size - size); @@ -2247,18 +2244,31 @@ int __iommu_map(struct iommu_domain *domain, unsigned long iova, return ret; } +static int _iommu_map(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +{ + const struct iommu_ops *ops = domain->ops; + int ret; + + ret = __iommu_map(domain, iova, paddr, size, prot, GFP_KERNEL); + if (ret == 0 && ops->iotlb_sync_map) + ops->iotlb_sync_map(domain); + + return ret; +} + int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot) { might_sleep(); - return __iommu_map(domain, iova, paddr, size, prot, GFP_KERNEL); + return _iommu_map(domain, iova, paddr, size, prot, GFP_KERNEL); } EXPORT_SYMBOL_GPL(iommu_map); int iommu_map_atomic(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot) { - return __iommu_map(domain, iova, paddr, size, prot, GFP_ATOMIC); + return _iommu_map(domain, iova, paddr, size, prot, GFP_ATOMIC); } EXPORT_SYMBOL_GPL(iommu_map_atomic); @@ -2342,6 +2352,7 @@ size_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, unsigned int nents, int prot, gfp_t gfp) { + const struct iommu_ops *ops = domain->ops; size_t len = 0, mapped = 0; phys_addr_t start; unsigned int i = 0; @@ -2372,6 +2383,8 @@ size_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova, sg = sg_next(sg); } + if (ops->iotlb_sync_map) + ops->iotlb_sync_map(domain); return mapped; out_err: -- Gitee From 2ba540b3e8ba5881d28dcd96874423009318eada Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Thu, 7 Jan 2021 20:29:04 +0800 Subject: [PATCH 0416/2161] iommu: Add iova and size as parameters in iotlb_sync_map commit 2ebbd25873cef06f739489fd8ff9f707a3dfa2fa upstream. iotlb_sync_map allow IOMMU drivers tlb sync after completing the whole mapping. This patch adds iova and size as the parameters in it. then the IOMMU driver could flush tlb with the whole range once after iova mapping to improve performance. 
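As a hypothetical driver-side sketch of how the extra parameters can be used (only the callback signature comes from this patch; the mydrv_* names and the ranged-flush helper are invented), a driver whose hardware supports ranged invalidation can now flush once per mapping instead of flushing everything per segment:

static void mydrv_iotlb_sync_map(struct iommu_domain *domain,
				 unsigned long iova, size_t size)
{
	struct mydrv_domain *dom = to_mydrv_domain(domain);	/* invented helper */

	/* one ranged invalidation covering the whole just-mapped area */
	mydrv_tlb_flush_range(dom, iova, size);			/* invented helper */
}

The tegra-gart change in the hunk below keeps its existing flush-all behaviour and simply adopts the new signature.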
Signed-off-by: Yong Wu Reviewed-by: Robin Murphy Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210107122909.16317-3-yong.wu@mediatek.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 4 ++-- drivers/iommu/tegra-gart.c | 7 +++++-- include/linux/iommu.h | 3 ++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 77188f0cb9a3..520fdbf11e75 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2252,7 +2252,7 @@ static int _iommu_map(struct iommu_domain *domain, unsigned long iova, ret = __iommu_map(domain, iova, paddr, size, prot, GFP_KERNEL); if (ret == 0 && ops->iotlb_sync_map) - ops->iotlb_sync_map(domain); + ops->iotlb_sync_map(domain, iova, size); return ret; } @@ -2384,7 +2384,7 @@ size_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova, } if (ops->iotlb_sync_map) - ops->iotlb_sync_map(domain); + ops->iotlb_sync_map(domain, iova, mapped); return mapped; out_err: diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c index 3924f7c05544..a446a6aee71d 100644 --- a/drivers/iommu/tegra-gart.c +++ b/drivers/iommu/tegra-gart.c @@ -273,7 +273,8 @@ static int gart_iommu_of_xlate(struct device *dev, return 0; } -static void gart_iommu_sync_map(struct iommu_domain *domain) +static void gart_iommu_sync_map(struct iommu_domain *domain, unsigned long iova, + size_t size) { FLUSH_GART_REGS(gart_handle); } @@ -281,7 +282,9 @@ static void gart_iommu_sync_map(struct iommu_domain *domain) static void gart_iommu_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { - gart_iommu_sync_map(domain); + size_t length = gather->end - gather->start; + + gart_iommu_sync_map(domain, gather->start, length); } static const struct iommu_ops gart_iommu_ops = { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ada074d73237..b26c4e9e4e2c 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -251,7 +251,8 @@ struct iommu_ops { size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); void (*flush_iotlb_all)(struct iommu_domain *domain); - void (*iotlb_sync_map)(struct iommu_domain *domain); + void (*iotlb_sync_map)(struct iommu_domain *domain, unsigned long iova, + size_t size); void (*iotlb_sync)(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather); phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); -- Gitee From 8bb872413bb028d2870107ad3280d9ac37964429 Mon Sep 17 00:00:00 2001 From: Yian Chen Date: Thu, 4 Feb 2021 09:43:59 +0800 Subject: [PATCH 0417/2161] iommu/vt-d: Add new enum value and structure for SATC commit 81d3c75bb3c32ca61712e093b8dce89d73c22150 upstream. Starting from Intel Platform VT-d v3.2, BIOS may provide new remapping structure SATC for SOC integrated devices, according to section 8.8 of Intel VT-d architecture specification v3.2. The SATC structure reports a list of the devices that require ATS for normal device operation. It is a functional requirement that these devices will not work without OS enabling ATS capability. This patch introduces the new enum value and structure to represent the remapping information. Kernel should parse the information from the reporting structure and enable ATC for the devices as needed. 
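For illustration only, a freestanding model of the fixed part of the new sub-table and of how much device-scope data trails it. The local structs re-declare, for the sketch, what struct acpi_dmar_satc in the hunk below describes; the trailing region is what the parser added in the following patch walks.

#include <stdint.h>
#include <stdio.h>

struct dmar_header { uint16_t type; uint16_t length; };	/* common DMAR sub-table header */

struct dmar_satc {			/* mirrors struct acpi_dmar_satc */
	struct dmar_header header;	/* type 5 = ACPI_DMAR_TYPE_SATC */
	uint8_t  flags;			/* bit 0: ATC required for listed devices */
	uint8_t  reserved;
	uint16_t segment;		/* PCI segment of the device scopes */
	/* variable-length device scope entries follow the fixed part */
};

int main(void)
{
	struct dmar_satc satc = {
		.header  = { .type = 5, .length = sizeof(struct dmar_satc) + 8 },
		.flags   = 0x1,
		.segment = 0,
	};
	size_t scope_bytes = satc.header.length - sizeof(struct dmar_satc);

	printf("ATC required: %d, device-scope bytes to walk: %zu\n",
	       satc.flags & 0x1, scope_bytes);
	return 0;
}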
Signed-off-by: Yian Chen Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210203093329.1617808-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210204014401.2846425-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/acpi/actbl1.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h index 22c039ebc6c5..c7e102e2fa41 100644 --- a/include/acpi/actbl1.h +++ b/include/acpi/actbl1.h @@ -514,7 +514,8 @@ enum acpi_dmar_type { ACPI_DMAR_TYPE_ROOT_ATS = 2, ACPI_DMAR_TYPE_HARDWARE_AFFINITY = 3, ACPI_DMAR_TYPE_NAMESPACE = 4, - ACPI_DMAR_TYPE_RESERVED = 5 /* 5 and greater are reserved */ + ACPI_DMAR_TYPE_SATC = 5, + ACPI_DMAR_TYPE_RESERVED = 6 /* 6 and greater are reserved */ }; /* DMAR Device Scope structure */ @@ -607,6 +608,14 @@ struct acpi_dmar_andd { char device_name[1]; }; +/* 5: SOC Integrated Address Translation Cache Reporting Structure */ + +struct acpi_dmar_satc { + struct acpi_dmar_header header; + u8 flags; + u8 reserved; + u16 segment; +}; /******************************************************************************* * * DRTM - Dynamic Root of Trust for Measurement table -- Gitee From bf37ea05a81172a755d9f6c1a7b986f90854a107 Mon Sep 17 00:00:00 2001 From: Yian Chen Date: Thu, 4 Feb 2021 09:44:00 +0800 Subject: [PATCH 0418/2161] iommu/vt-d: Parse SATC reporting structure commit 31a75cbbb9274cf8185f402904bf11386917870b upstream. Software should parse every SATC table and all devices in the tables reported by the BIOS and keep the information in kernel list for further reference. Signed-off-by: Yian Chen Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210203093329.1617808-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210204014401.2846425-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 8 ++++ drivers/iommu/intel/iommu.c | 89 +++++++++++++++++++++++++++++++++++++ include/linux/dmar.h | 2 + 3 files changed, 99 insertions(+) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 980e8ae090af..d5c51b5c20af 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -526,6 +526,7 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header *header) struct acpi_dmar_reserved_memory *rmrr; struct acpi_dmar_atsr *atsr; struct acpi_dmar_rhsa *rhsa; + struct acpi_dmar_satc *satc; switch (header->type) { case ACPI_DMAR_TYPE_HARDWARE_UNIT: @@ -555,6 +556,10 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header *header) /* We don't print this here because we need to sanity-check it first. So print it in dmar_parse_one_andd() instead. 
*/ break; + case ACPI_DMAR_TYPE_SATC: + satc = container_of(header, struct acpi_dmar_satc, header); + pr_info("SATC flags: 0x%x\n", satc->flags); + break; } } @@ -642,6 +647,7 @@ parse_dmar_table(void) .cb[ACPI_DMAR_TYPE_ROOT_ATS] = &dmar_parse_one_atsr, .cb[ACPI_DMAR_TYPE_HARDWARE_AFFINITY] = &dmar_parse_one_rhsa, .cb[ACPI_DMAR_TYPE_NAMESPACE] = &dmar_parse_one_andd, + .cb[ACPI_DMAR_TYPE_SATC] = &dmar_parse_one_satc, }; /* @@ -2077,6 +2083,7 @@ static guid_t dmar_hp_guid = #define DMAR_DSM_FUNC_DRHD 1 #define DMAR_DSM_FUNC_ATSR 2 #define DMAR_DSM_FUNC_RHSA 3 +#define DMAR_DSM_FUNC_SATC 4 static inline bool dmar_detect_dsm(acpi_handle handle, int func) { @@ -2094,6 +2101,7 @@ static int dmar_walk_dsm_resource(acpi_handle handle, int func, [DMAR_DSM_FUNC_DRHD] = ACPI_DMAR_TYPE_HARDWARE_UNIT, [DMAR_DSM_FUNC_ATSR] = ACPI_DMAR_TYPE_ROOT_ATS, [DMAR_DSM_FUNC_RHSA] = ACPI_DMAR_TYPE_HARDWARE_AFFINITY, + [DMAR_DSM_FUNC_SATC] = ACPI_DMAR_TYPE_SATC, }; if (!dmar_detect_dsm(handle, func)) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 014e3f0cb441..e13f5cc2701f 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -317,8 +317,18 @@ struct dmar_atsr_unit { u8 include_all:1; /* include all ports */ }; +struct dmar_satc_unit { + struct list_head list; /* list of SATC units */ + struct acpi_dmar_header *hdr; /* ACPI header */ + struct dmar_dev_scope *devices; /* target devices */ + struct intel_iommu *iommu; /* the corresponding iommu */ + int devices_cnt; /* target device count */ + u8 atc_required:1; /* ATS is required */ +}; + static LIST_HEAD(dmar_atsr_units); static LIST_HEAD(dmar_rmrr_units); +static LIST_HEAD(dmar_satc_units); #define for_each_rmrr_units(rmrr) \ list_for_each_entry(rmrr, &dmar_rmrr_units, list) @@ -3773,6 +3783,57 @@ int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) return 0; } +static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) +{ + struct dmar_satc_unit *satcu; + struct acpi_dmar_satc *tmp; + + list_for_each_entry_rcu(satcu, &dmar_satc_units, list, + dmar_rcu_check()) { + tmp = (struct acpi_dmar_satc *)satcu->hdr; + if (satc->segment != tmp->segment) + continue; + if (satc->header.length != tmp->header.length) + continue; + if (memcmp(satc, tmp, satc->header.length) == 0) + return satcu; + } + + return NULL; +} + +int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) +{ + struct acpi_dmar_satc *satc; + struct dmar_satc_unit *satcu; + + if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) + return 0; + + satc = container_of(hdr, struct acpi_dmar_satc, header); + satcu = dmar_find_satc(satc); + if (satcu) + return 0; + + satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); + if (!satcu) + return -ENOMEM; + + satcu->hdr = (void *)(satcu + 1); + memcpy(satcu->hdr, hdr, hdr->length); + satcu->atc_required = satc->flags & 0x1; + satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), + (void *)satc + satc->header.length, + &satcu->devices_cnt); + if (satcu->devices_cnt && !satcu->devices) { + kfree(satcu); + return -ENOMEM; + } + list_add_rcu(&satcu->list, &dmar_satc_units); + + return 0; +} + static int intel_iommu_add(struct dmar_drhd_unit *dmaru) { int sp, ret; @@ -3880,6 +3941,7 @@ static void intel_iommu_free_dmars(void) { struct dmar_rmrr_unit *rmrru, *rmrr_n; struct dmar_atsr_unit *atsru, *atsr_n; + struct dmar_satc_unit *satcu, *satc_n; list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { list_del(&rmrru->list); @@ -3891,6 +3953,11 @@ static void 
intel_iommu_free_dmars(void) list_del(&atsru->list); intel_iommu_free_atsr(atsru); } + list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { + list_del(&satcu->list); + dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); + kfree(satcu); + } } int dmar_find_matched_atsr_unit(struct pci_dev *dev) @@ -3943,8 +4010,10 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) int ret; struct dmar_rmrr_unit *rmrru; struct dmar_atsr_unit *atsru; + struct dmar_satc_unit *satcu; struct acpi_dmar_atsr *atsr; struct acpi_dmar_reserved_memory *rmrr; + struct acpi_dmar_satc *satc; if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) return 0; @@ -3985,6 +4054,23 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) break; } } + list_for_each_entry(satcu, &dmar_satc_units, list) { + satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); + if (info->event == BUS_NOTIFY_ADD_DEVICE) { + ret = dmar_insert_dev_scope(info, (void *)(satc + 1), + (void *)satc + satc->header.length, + satc->segment, satcu->devices, + satcu->devices_cnt); + if (ret > 0) + break; + else if (ret < 0) + return ret; + } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { + if (dmar_remove_dev_scope(info, satc->segment, + satcu->devices, satcu->devices_cnt)) + break; + } + } return 0; } @@ -4299,6 +4385,9 @@ int __init intel_iommu_init(void) if (list_empty(&dmar_atsr_units)) pr_info("No ATSR found\n"); + if (list_empty(&dmar_satc_units)) + pr_info("No SATC found\n"); + if (dmar_map_gfx) intel_iommu_gfx_mapped = 1; diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 65565820328a..e04436a7ff27 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -138,6 +138,7 @@ extern void intel_iommu_shutdown(void); extern int dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg); extern int dmar_parse_one_atsr(struct acpi_dmar_header *header, void *arg); extern int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg); +extern int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg); extern int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg); extern int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert); extern int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info); @@ -149,6 +150,7 @@ static inline void intel_iommu_shutdown(void) { } #define dmar_parse_one_atsr dmar_res_noop #define dmar_check_one_atsr dmar_res_noop #define dmar_release_one_atsr dmar_res_noop +#define dmar_parse_one_satc dmar_res_noop static inline int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) { -- Gitee From 391923a7ee07f165073bc08790064f9866eeb699 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Sat, 27 Feb 2021 15:39:09 +0800 Subject: [PATCH 0419/2161] iommu/vt-d: Fix status code for Allocate/Free PASID command commit 444d66a23c1f1e4c4d12aed4812681d0ad835d60 upstream. As per Intel vt-d spec, Rev 3.0 (section 10.4.45 "Virtual Command Response Register"), the status code of "No PASID available" error in response to the Allocate PASID command is 2, not 1. The same for "Invalid PASID" error in response to the Free PASID command. We will otherwise see confusing kernel log under the command failure from guest side. Fix it. 
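For illustration only, a small userspace decode of a Virtual Command Response value using the corrected status codes (the macros are copied from pasid.h as changed below; the sample register values are made up):

#include <stdint.h>
#include <stdio.h>

#define VCMD_VRSP_SC(e)			(((e) >> 1) & 0x3)
#define VCMD_VRSP_SC_SUCCESS		0
#define VCMD_VRSP_SC_NO_PASID_AVAIL	2	/* previously misdefined as 1 */
#define VCMD_VRSP_SC_INVALID_PASID	2	/* previously misdefined as 1 */
#define VCMD_VRSP_RESULT_PASID(e)	(((e) >> 8) & 0xfffff)

int main(void)
{
	uint64_t ok   = (42ULL << 8) | (VCMD_VRSP_SC_SUCCESS << 1);	/* alloc succeeded */
	uint64_t fail = (uint64_t)VCMD_VRSP_SC_NO_PASID_AVAIL << 1;	/* pool exhausted */

	if (VCMD_VRSP_SC(ok) == VCMD_VRSP_SC_SUCCESS)
		printf("allocated PASID %llu\n",
		       (unsigned long long)VCMD_VRSP_RESULT_PASID(ok));
	if (VCMD_VRSP_SC(fail) == VCMD_VRSP_SC_NO_PASID_AVAIL)
		printf("no PASID available reported to the guest\n");
	return 0;
}

With the old value of 1, a hardware status of 2 would match neither named error, which is how the confusing log mentioned above comes about.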
Fixes: 24f27d32ab6b ("iommu/vt-d: Enlightened PASID allocation") Signed-off-by: Zenghui Yu Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20210227073909.432-1-yuzenghui@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 97dfcffbf495..444c0bec221a 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -30,8 +30,8 @@ #define VCMD_VRSP_IP 0x1 #define VCMD_VRSP_SC(e) (((e) >> 1) & 0x3) #define VCMD_VRSP_SC_SUCCESS 0 -#define VCMD_VRSP_SC_NO_PASID_AVAIL 1 -#define VCMD_VRSP_SC_INVALID_PASID 1 +#define VCMD_VRSP_SC_NO_PASID_AVAIL 2 +#define VCMD_VRSP_SC_INVALID_PASID 2 #define VCMD_VRSP_RESULT_PASID(e) (((e) >> 8) & 0xfffff) #define VCMD_CMD_OPERAND(e) ((e) << 8) /* -- Gitee From 33a640978e242b02763848d98b67b14708dc7cf4 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 3 Mar 2021 17:36:11 +0000 Subject: [PATCH 0420/2161] iommu: Check dev->iommu in iommu_dev_xxx functions commit b9abb19fa5fd2d8a4be61c6cd4b2a48aa1a17f9c upstream. The device iommu probe/attach might have failed leaving dev->iommu to NULL and device drivers may still invoke these functions resulting in a crash in iommu vendor driver code. Hence make sure we check that. Fixes: a3a195929d40 ("iommu: Add APIs for multiple domains per device") Signed-off-by: Shameer Kolothum Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/20210303173611.520-1-shameerali.kolothum.thodi@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 520fdbf11e75..958b8fc85da9 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2698,10 +2698,12 @@ EXPORT_SYMBOL_GPL(iommu_dev_has_feature); int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat) { - const struct iommu_ops *ops = dev->bus->iommu_ops; + if (dev->iommu && dev->iommu->iommu_dev) { + const struct iommu_ops *ops = dev->iommu->iommu_dev->ops; - if (ops && ops->dev_enable_feat) - return ops->dev_enable_feat(dev, feat); + if (ops->dev_enable_feat) + return ops->dev_enable_feat(dev, feat); + } return -ENODEV; } @@ -2714,10 +2716,12 @@ EXPORT_SYMBOL_GPL(iommu_dev_enable_feature); */ int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) { - const struct iommu_ops *ops = dev->bus->iommu_ops; + if (dev->iommu && dev->iommu->iommu_dev) { + const struct iommu_ops *ops = dev->iommu->iommu_dev->ops; - if (ops && ops->dev_disable_feat) - return ops->dev_disable_feat(dev, feat); + if (ops->dev_disable_feat) + return ops->dev_disable_feat(dev, feat); + } return -EBUSY; } @@ -2725,10 +2729,12 @@ EXPORT_SYMBOL_GPL(iommu_dev_disable_feature); bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features feat) { - const struct iommu_ops *ops = dev->bus->iommu_ops; + if (dev->iommu && dev->iommu->iommu_dev) { + const struct iommu_ops *ops = dev->iommu->iommu_dev->ops; - if (ops && ops->dev_feat_enabled) - return ops->dev_feat_enabled(dev, feat); + if (ops->dev_feat_enabled) + return ops->dev_feat_enabled(dev, feat); + } return false; } -- Gitee From 42a0ec4b02e0bc53ef8a6287670144957b52674d Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 5 Mar 2021 16:32:34 +0000 Subject: [PATCH 0421/2161] 
iommu/dma: Resurrect the "forcedac" option commit 3542dcb15cef66c0b9e6c3b33168eb657e0d9520 upstream. In converting intel-iommu over to the common IOMMU DMA ops, it quietly lost the functionality of its "forcedac" option. Since this is a handy thing both for testing and for performance optimisation on certain platforms, reimplement it under the common IOMMU parameter namespace. For the sake of fixing the inadvertent breakage of the Intel-specific parameter, remove the dmar_forcedac remnants and hook it up as an alias while documenting the transition to the new common parameter. Fixes: c588072bba6b ("iommu/vt-d: Convert intel iommu driver to the iommu ops") Signed-off-by: Robin Murphy Acked-by: Lu Baolu Reviewed-by: John Garry Link: https://lore.kernel.org/r/7eece8e0ea7bfbe2cd0e30789e0d46df573af9b0.1614961776.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/admin-guide/kernel-parameters.txt | 15 ++++++++------- drivers/iommu/dma-iommu.c | 13 ++++++++++++- drivers/iommu/intel/iommu.c | 5 ++--- include/linux/dma-iommu.h | 2 ++ 4 files changed, 24 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index c781bad594be..ea099ff506b8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1717,13 +1717,6 @@ bypassed by not enabling DMAR with this option. In this case, gfx device will use physical address for DMA. - forcedac [x86_64] - With this option iommu will not optimize to look - for io virtual address below 32-bit forcing dual - address cycle on pci bus for cards supporting greater - than 32-bit addressing. The default is to look - for translation below 32-bit and if not available - then look in the higher range. strict [Default Off] With this option on every unmap_single operation will result in a hardware IOTLB flush operation as opposed @@ -1812,6 +1805,14 @@ nobypass [PPC/POWERNV] Disable IOMMU bypass, using IOMMU for PCI devices. + iommu.forcedac= [ARM64, X86] Control IOVA allocation for PCI devices. + Format: { "0" | "1" } + 0 - Try to allocate a 32-bit DMA address first, before + falling back to the full range if needed. + 1 - Allocate directly from the full usable range, + forcing Dual Address Cycle for PCI cards supporting + greater than 32-bit addressing. + iommu.strict= [ARM64] Configure TLB invalidation behaviour Format: { "0" | "1" } 0 - Lazy mode. 
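As a usage illustration (hypothetical command lines, not part of the patch): on a machine whose PCI devices are known to handle 64-bit DMA addresses, either of the following now requests allocation from the full IOVA range, the second form only as a deprecated alias that prints a warning before setting the same flag:

	iommu.forcedac=1
	intel_iommu=forcedac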
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 76bd2309e023..a735515ee4e9 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -49,6 +49,17 @@ struct iommu_dma_cookie { struct iommu_domain *fq_domain; }; +bool iommu_dma_forcedac __read_mostly; + +static int __init iommu_dma_forcedac_setup(char *str) +{ + int ret = kstrtobool(str, &iommu_dma_forcedac); + + if (!ret && iommu_dma_forcedac) + pr_info("Forcing DAC for PCI devices\n"); + return ret; +} +early_param("iommu.forcedac", iommu_dma_forcedac_setup); static inline size_t cookie_msi_granule(struct iommu_dma_cookie *cookie) { if (cookie->type == IOMMU_DMA_IOVA_COOKIE) @@ -411,7 +422,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, dma_limit = min(dma_limit, domain->geometry.aperture_end); /* Try to get PCI devices a SAC address */ - if (dma_limit > DMA_BIT_MASK(32) && dev_is_pci(dev)) + if (dma_limit > DMA_BIT_MASK(32) && !iommu_dma_forcedac && dev_is_pci(dev)) iova = alloc_iova_fast(iovad, iova_len, DMA_BIT_MASK(32) >> shift, false); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e13f5cc2701f..db515046050e 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -361,7 +361,6 @@ int intel_iommu_enabled = 0; EXPORT_SYMBOL_GPL(intel_iommu_enabled); static int dmar_map_gfx = 1; -static int dmar_forcedac; static int intel_iommu_strict; static int intel_iommu_superpage = 1; static int iommu_identity_mapping; @@ -452,8 +451,8 @@ static int __init intel_iommu_setup(char *str) dmar_map_gfx = 0; pr_info("Disable GFX device mapping\n"); } else if (!strncmp(str, "forcedac", 8)) { - pr_info("Forcing DAC for PCI devices\n"); - dmar_forcedac = 1; + pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); + iommu_dma_forcedac = true; } else if (!strncmp(str, "strict", 6)) { pr_info("Disable batched IOTLB flush\n"); intel_iommu_strict = 1; diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 2112f21f73d8..927e22639c34 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -37,6 +37,8 @@ void iommu_dma_compose_msi_msg(struct msi_desc *desc, void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list); +extern bool iommu_dma_forcedac; + #else /* CONFIG_IOMMU_DMA */ struct iommu_domain; -- Gitee From 63d09d8b4e826666bc39b16a409afd2e0b2b0fbc Mon Sep 17 00:00:00 2001 From: Kyung Min Park Date: Sun, 14 Mar 2021 13:15:34 -0700 Subject: [PATCH 0422/2161] iommu/vt-d: Disable SVM when ATS/PRI/PASID are not enabled in the device commit dec991e4722d763130c8ccd92523f2a173f8a7cd upstream. Currently, the Intel VT-d supports Shared Virtual Memory (SVM) only when IO page fault is supported. Otherwise, shared memory pages can not be swapped out and need to be pinned. The device needs the Address Translation Service (ATS), Page Request Interface (PRI) and Process Address Space Identifier (PASID) capabilities to be enabled to support IO page fault. Disable SVM when ATS, PRI and PASID are not enabled in the device. 
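As a hypothetical consumer-side sketch (the mydev_* names and the pinned-DMA fallback are invented; iommu_dev_enable_feature() and IOMMU_DEV_FEAT_SVA are the existing kernel interfaces), a device driver probing for SVA now has to cope with the enable call failing when ATS, PRI or PASID is disabled, which is what the check added in the hunk below enforces:

static int mydev_init_sva(struct mydev *md)
{
	int ret = iommu_dev_enable_feature(&md->pdev->dev, IOMMU_DEV_FEAT_SVA);

	if (ret) {
		/* no ATS/PRI/PASID means no IO page faults: pin the buffers */
		dev_info(&md->pdev->dev,
			 "SVA unavailable (%d), using pinned DMA buffers\n", ret);
		return mydev_setup_pinned_dma(md);	/* invented fallback */
	}
	md->sva_enabled = true;
	return 0;
}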
Signed-off-by: Kyung Min Park Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20210314201534.918-1-kyung.min.park@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index db515046050e..54d65dc221b3 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5407,6 +5407,9 @@ intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) if (!info) return -EINVAL; + if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled) + return -EINVAL; + if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) return 0; } -- Gitee From 268f7d025ce6903c3e4a2a80c87c4a47dc8b0c6f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 18 Mar 2021 08:53:40 +0800 Subject: [PATCH 0423/2161] iommu/vt-d: Report more information about invalidation errors commit 6ca69e5841f01ccbfa45e56577e1b33e14e53504 upstream. When the invalidation queue errors are encountered, dump the information logged by the VT-d hardware together with the pending queue invalidation descriptors. Signed-off-by: Ashok Raj Tested-by: Guo Kaijie Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210318005340.187311-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 68 ++++++++++++++++++++++++++++++++++--- include/linux/intel-iommu.h | 6 ++++ 2 files changed, 70 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index d5c51b5c20af..6971397805f3 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1205,6 +1205,63 @@ static inline void reclaim_free_desc(struct q_inval *qi) } } +static const char *qi_type_string(u8 type) +{ + switch (type) { + case QI_CC_TYPE: + return "Context-cache Invalidation"; + case QI_IOTLB_TYPE: + return "IOTLB Invalidation"; + case QI_DIOTLB_TYPE: + return "Device-TLB Invalidation"; + case QI_IEC_TYPE: + return "Interrupt Entry Cache Invalidation"; + case QI_IWD_TYPE: + return "Invalidation Wait"; + case QI_EIOTLB_TYPE: + return "PASID-based IOTLB Invalidation"; + case QI_PC_TYPE: + return "PASID-cache Invalidation"; + case QI_DEIOTLB_TYPE: + return "PASID-based Device-TLB Invalidation"; + case QI_PGRP_RESP_TYPE: + return "Page Group Response"; + default: + return "UNKNOWN"; + } +} + +static void qi_dump_fault(struct intel_iommu *iommu, u32 fault) +{ + unsigned int head = dmar_readl(iommu->reg + DMAR_IQH_REG); + u64 iqe_err = dmar_readq(iommu->reg + DMAR_IQER_REG); + struct qi_desc *desc = iommu->qi->desc + head; + + if (fault & DMA_FSTS_IQE) + pr_err("VT-d detected Invalidation Queue Error: Reason %llx", + DMAR_IQER_REG_IQEI(iqe_err)); + if (fault & DMA_FSTS_ITE) + pr_err("VT-d detected Invalidation Time-out Error: SID %llx", + DMAR_IQER_REG_ITESID(iqe_err)); + if (fault & DMA_FSTS_ICE) + pr_err("VT-d detected Invalidation Completion Error: SID %llx", + DMAR_IQER_REG_ICESID(iqe_err)); + + pr_err("QI HEAD: %s qw0 = 0x%llx, qw1 = 0x%llx\n", + qi_type_string(desc->qw0 & 0xf), + (unsigned long long)desc->qw0, + (unsigned long long)desc->qw1); + + head = ((head >> qi_shift(iommu)) + QI_LENGTH - 1) % QI_LENGTH; + head <<= qi_shift(iommu); + desc = iommu->qi->desc + head; + + pr_err("QI PRIOR: %s qw0 = 0x%llx, qw1 = 0x%llx\n", + qi_type_string(desc->qw0 & 0xf), + (unsigned long long)desc->qw0, + (unsigned long long)desc->qw1); +} + static int 
qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) { u32 fault; @@ -1216,6 +1273,8 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) return -EAGAIN; fault = readl(iommu->reg + DMAR_FSTS_REG); + if (fault & (DMA_FSTS_IQE | DMA_FSTS_ITE | DMA_FSTS_ICE)) + qi_dump_fault(iommu, fault); /* * If IQE happens, the head points to the descriptor associated @@ -1232,12 +1291,10 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) * used by software as private data. We won't print * out these two qw's for security consideration. */ - pr_err("VT-d detected invalid descriptor: qw0 = %llx, qw1 = %llx\n", - (unsigned long long)desc->qw0, - (unsigned long long)desc->qw1); memcpy(desc, qi->desc + (wait_index << shift), 1 << shift); writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG); + pr_info("Invalidation Queue Error (IQE) cleared\n"); return -EINVAL; } } @@ -1254,6 +1311,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) tail = ((tail >> shift) - 1 + QI_LENGTH) % QI_LENGTH; writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG); + pr_info("Invalidation Time-out Error (ITE) cleared\n"); do { if (qi->desc_status[head] == QI_IN_USE) @@ -1265,8 +1323,10 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) return -EAGAIN; } - if (fault & DMA_FSTS_ICE) + if (fault & DMA_FSTS_ICE) { writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG); + pr_info("Invalidation Completion Error (ICE) cleared\n"); + } return 0; } diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 832730549c52..f64d507252e2 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -80,6 +81,7 @@ #define DMAR_IQ_SHIFT 4 /* Invalidation queue head/tail shift */ #define DMAR_IQA_REG 0x90 /* Invalidation queue addr register */ #define DMAR_ICS_REG 0x9c /* Invalidation complete status register */ +#define DMAR_IQER_REG 0xb0 /* Invalidation queue error record register */ #define DMAR_IRTA_REG 0xb8 /* Interrupt remapping table addr register */ #define DMAR_PQH_REG 0xc0 /* Page request queue head register */ #define DMAR_PQT_REG 0xc8 /* Page request queue tail register */ @@ -126,6 +128,10 @@ #define DMAR_VCMD_REG 0xe10 /* Virtual command register */ #define DMAR_VCRSP_REG 0xe20 /* Virtual command response register */ +#define DMAR_IQER_REG_IQEI(reg) FIELD_GET(GENMASK_ULL(3, 0), reg) +#define DMAR_IQER_REG_ITESID(reg) FIELD_GET(GENMASK_ULL(47, 32), reg) +#define DMAR_IQER_REG_ICESID(reg) FIELD_GET(GENMASK_ULL(63, 48), reg) + #define OFFSET_STRIDE (9) #define dmar_readq(a) readq(a) -- Gitee From a30db5ea5f7a279021a670bdfa46eeb4600ad32c Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 2 Mar 2021 02:13:57 -0800 Subject: [PATCH 0424/2161] iommu/vt-d: Enable write protect for supervisor SVM commit f68c7f539b6e9712e941212ab95a1feb5a0bf3b3 upstream. Write protect bit, when set, inhibits supervisor writes to the read-only pages. In supervisor shared virtual addressing (SVA), where page tables are shared between CPU and DMA, IOMMU PASID entry WPE bit should match CR0.WP bit in the CPU. This patch sets WPE bit for supervisor PASIDs if CR0.WP is set. 
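For illustration only, where the bit set by pasid_set_wpe() lands: bit 132 of the 512-bit scalable-mode PASID entry is bit 4 of the third 64-bit word, val[2]. A trivial userspace check of that arithmetic:

#include <stdint.h>
#include <stdio.h>

struct pasid_entry { uint64_t val[8]; };	/* 512-bit entry, as in pasid.h */

int main(void)
{
	struct pasid_entry pe = { { 0 } };
	unsigned int bit = 132;			/* WPE */

	pe.val[bit / 64] |= 1ULL << (bit % 64);
	printf("val[%u] = %#llx\n", bit / 64,
	       (unsigned long long)pe.val[bit / 64]);	/* prints val[2] = 0x10 */
	return 0;
}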
Signed-off-by: Sanjay Kumar Signed-off-by: Jacob Pan Acked-by: Lu Baolu Link: https://lore.kernel.org/r/1614680040-1989-2-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index f26cb6195b2c..629e3aee44fe 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -393,6 +393,15 @@ static inline void pasid_set_sre(struct pasid_entry *pe) pasid_set_bits(&pe->val[2], 1 << 0, 1); } +/* + * Setup the WPE(Write Protect Enable) field (Bit 132) of a + * scalable mode PASID entry. + */ +static inline void pasid_set_wpe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 4, 1 << 4); +} + /* * Setup the P(Present) field (Bit 0) of a scalable mode PASID * entry. @@ -522,6 +531,20 @@ static void pasid_flush_caches(struct intel_iommu *iommu, } } +static inline int pasid_enable_wpe(struct pasid_entry *pte) +{ + unsigned long cr0 = read_cr0(); + + /* CR0.WP is normally set but just to be sure */ + if (unlikely(!(cr0 & X86_CR0_WP))) { + pr_err_ratelimited("No CPU write protect!\n"); + return -EINVAL; + } + pasid_set_wpe(pte); + + return 0; +}; + /* * Set up the scalable mode pasid table entry for first only * translation type. @@ -553,6 +576,9 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, return -EINVAL; } pasid_set_sre(pte); + if (pasid_enable_wpe(pte)) + return -EINVAL; + } if (flags & PASID_FLAG_FL5LP) { -- Gitee From d7aef6594823c389eac68c2a2e3e567dd7c4c957 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 2 Mar 2021 02:13:58 -0800 Subject: [PATCH 0425/2161] iommu/vt-d: Enable write protect propagation from guest commit bb0f61533dfd6aa815a2719720c77d13f840b683 upstream. Write protect bit, when set, inhibits supervisor writes to the read-only pages. In guest supervisor shared virtual addressing (SVA), write-protect should be honored upon guest bind supervisor PASID request. This patch extends the VT-d portion of the IOMMU UAPI to include WP bit. WPE bit of the supervisor PASID entry will be set to match CPU CR0.WP bit. 
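For illustration only, how a VMM might compose the flags word of struct iommu_gpasid_bind_data_vtd for a supervisor PASID bind that should also honour the guest's CR0.WP. The WPE and _LAST values appear in the uapi hunk below; SRE (bit 0) is the existing supervisor-request flag from the same header. The host side then sets WPE in the PASID entry as the pasid.c hunk below shows.

#include <stdint.h>
#include <stdio.h>

#define IOMMU_SVA_VTD_GPASID_SRE	(1 << 0)	/* supervisor request */
#define IOMMU_SVA_VTD_GPASID_WPE	(1 << 6)	/* write protect enable (new) */
#define IOMMU_SVA_VTD_GPASID_LAST	(1 << 7)	/* first undefined bit */

int main(void)
{
	uint64_t flags = IOMMU_SVA_VTD_GPASID_SRE | IOMMU_SVA_VTD_GPASID_WPE;

	printf("vtd bind flags: %#llx (all bits below _LAST: %d)\n",
	       (unsigned long long)flags,
	       flags < (uint64_t)IOMMU_SVA_VTD_GPASID_LAST);
	return 0;
}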
Signed-off-by: Sanjay Kumar Signed-off-by: Jacob Pan Acked-by: Lu Baolu Link: https://lore.kernel.org/r/1614680040-1989-3-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 3 +++ include/uapi/linux/iommu.h | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 629e3aee44fe..0bf7e0a76890 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -732,6 +732,9 @@ intel_pasid_setup_bind_data(struct intel_iommu *iommu, struct pasid_entry *pte, return -EINVAL; } pasid_set_sre(pte); + /* Enable write protect WP if guest requested */ + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_WPE) + pasid_set_wpe(pte); } if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_EAFE) { diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 7345dd160773..05a3434b3d56 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -288,7 +288,8 @@ struct iommu_gpasid_bind_data_vtd { #define IOMMU_SVA_VTD_GPASID_PWT (1 << 3) /* page-level write through */ #define IOMMU_SVA_VTD_GPASID_EMTE (1 << 4) /* extended mem type enable */ #define IOMMU_SVA_VTD_GPASID_CD (1 << 5) /* PASID-level cache disable */ -#define IOMMU_SVA_VTD_GPASID_LAST (1 << 6) +#define IOMMU_SVA_VTD_GPASID_WPE (1 << 6) /* Write protect enable */ +#define IOMMU_SVA_VTD_GPASID_LAST (1 << 7) __u64 flags; __u32 pat; __u32 emt; -- Gitee From 198e75b6fff5cf4209016998380618c2bdad1232 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 2 Mar 2021 02:13:59 -0800 Subject: [PATCH 0426/2161] iommu/vt-d: Reject unsupported page request modes commit 78a523fe73b81b4447beb2d6c78c9fafae24eebb upstream. When supervisor/privilige mode SVM is used, we bind init_mm.pgd with a supervisor PASID. There should not be any page fault for init_mm. Execution request with DMA read is also not supported. This patch checks PRQ descriptor for both unsupported configurations, reject them both with invalid responses. Fixes: 1c4f88b7f1f92 ("iommu/vt-d: Shared virtual address in scalable mode") Acked-by: Lu Baolu Signed-off-by: Jacob Pan Link: https://lore.kernel.org/r/1614680040-1989-4-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index b3934b8232db..193ac2fa0e72 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -920,7 +920,17 @@ static irqreturn_t prq_event_thread(int irq, void *d) ((unsigned long long *)req)[1]); goto no_pasid; } - + /* We shall not receive page request for supervisor SVM */ + if (req->pm_req && (req->rd_req | req->wr_req)) { + pr_err("Unexpected page request in Privilege Mode"); + /* No need to find the matching sdev as for bad_req */ + goto no_pasid; + } + /* DMA read with exec requeset is not supported. */ + if (req->exe_req && req->rd_req) { + pr_err("Execution request not supported\n"); + goto no_pasid; + } if (!svm || svm->pasid != req->pasid) { rcu_read_lock(); svm = ioasid_find(NULL, req->pasid, NULL); -- Gitee From f6fa9ef9df4ca73a814e4701df74b739bbc08ccd Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 30 Nov 2021 15:39:58 +0800 Subject: [PATCH 0427/2161] iommu/vt-d: Calculate and set flags for handle_mm_fault commit 396bd6f3d9f659d7ce324806bf3cd6677385f8fd upstream. 
Page requests are originated from the user page fault. Therefore, we shall set FAULT_FLAG_USER. FAULT_FLAG_REMOTE indicates that we are walking an mm which is not guaranteed to be the same as the current->mm and should not be subject to protection key enforcement. Therefore, we should set FAULT_FLAG_REMOTE to avoid faults when both SVM and PKEY are used. References: commit 1b2ee1266ea6 ("mm/core: Do not enforce PKEY permissions on remote mm access") Reviewed-by: Raj Ashok Acked-by: Lu Baolu Signed-off-by: Jacob Pan Link: https://lore.kernel.org/r/1614680040-1989-5-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 193ac2fa0e72..ab0cb2fd4d47 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -895,6 +895,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) struct intel_iommu *iommu = d; struct intel_svm *svm = NULL; int head, tail, handled = 0; + unsigned int flags = 0; /* Clear PPR bit before reading head/tail registers, to * ensure that we get a new interrupt if needed. */ @@ -992,8 +993,11 @@ static irqreturn_t prq_event_thread(int irq, void *d) if (access_error(vma, req)) goto invalid; - ret = handle_mm_fault(vma, address, - req->wr_req ? FAULT_FLAG_WRITE : 0); + flags = FAULT_FLAG_USER | FAULT_FLAG_REMOTE; + if (req->wr_req) + flags |= FAULT_FLAG_WRITE; + + ret = handle_mm_fault(vma, address, flags); if (ret & VM_FAULT_ERROR) goto invalid; -- Gitee From 969a73295dc9882adfbe9e7aaeeb0d2ffda214ba Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 10 Feb 2021 09:13:14 -0700 Subject: [PATCH 0428/2161] iommu/vt-d: Use Real PCI DMA device for IRTE commit 9b4a824b889e1cc5e0430b80e40cfe9838c5b5f0 upstream. VMD retransmits child device MSI-X with the VMD endpoint's requester-id. In order to support direct interrupt remapping of VMD child devices, ensure that the IRTE is programmed with the VMD endpoint's requester-id using pci_real_dma_dev(). Link: https://lore.kernel.org/r/20210210161315.316097-2-jonathan.derrick@intel.com Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Acked-by: Lu Baolu Acked-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/irq_remapping.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 611ef5243cb6..75429a5373ec 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1280,7 +1280,8 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, break; case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: - set_msi_sid(irte, msi_desc_to_pci_dev(info->desc)); + set_msi_sid(irte, + pci_real_dma_dev(msi_desc_to_pci_dev(info->desc))); break; default: BUG_ON(1); -- Gitee From 23ec2158f2ca814abbffb2a2a94a45ea10611792 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 20 Mar 2021 10:09:16 +0800 Subject: [PATCH 0429/2161] iommu/vt-d: Fix lockdep splat in intel_pasid_get_entry() commit 1a3f2fd7fc4e8f24510830e265de2ffb8e3300d2 upstream. The pasid_lock is used to synchronize different threads from modifying a same pasid directory entry at the same time. It causes below lockdep splat. 
[ 83.296538] ======================================================== [ 83.296538] WARNING: possible irq lock inversion dependency detected [ 83.296539] 5.12.0-rc3+ #25 Tainted: G W [ 83.296539] -------------------------------------------------------- [ 83.296540] bash/780 just changed the state of lock: [ 83.296540] ffffffff82b29c98 (device_domain_lock){..-.}-{2:2}, at: iommu_flush_dev_iotlb.part.0+0x32/0x110 [ 83.296547] but this lock took another, SOFTIRQ-unsafe lock in the past: [ 83.296547] (pasid_lock){+.+.}-{2:2} [ 83.296548] and interrupts could create inverse lock ordering between them. [ 83.296549] other info that might help us debug this: [ 83.296549] Chain exists of: device_domain_lock --> &iommu->lock --> pasid_lock [ 83.296551] Possible interrupt unsafe locking scenario: [ 83.296551] CPU0 CPU1 [ 83.296552] ---- ---- [ 83.296552] lock(pasid_lock); [ 83.296553] local_irq_disable(); [ 83.296553] lock(device_domain_lock); [ 83.296554] lock(&iommu->lock); [ 83.296554] [ 83.296554] lock(device_domain_lock); [ 83.296555] *** DEADLOCK *** Fix it by replacing the pasid_lock with an atomic exchange operation. Reported-and-tested-by: Dave Jiang Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210320020916.640115-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 0bf7e0a76890..1a4105a5989b 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -24,7 +24,6 @@ /* * Intel IOMMU system wide PASID name space: */ -static DEFINE_SPINLOCK(pasid_lock); u32 intel_pasid_max_id = PASID_MAX; int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid) @@ -259,19 +258,25 @@ struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid) dir_index = pasid >> PASID_PDE_SHIFT; index = pasid & PASID_PTE_MASK; - spin_lock(&pasid_lock); +retry: entries = get_pasid_table_from_pde(&dir[dir_index]); if (!entries) { entries = alloc_pgtable_page(info->iommu->node); - if (!entries) { - spin_unlock(&pasid_lock); + if (!entries) return NULL; - } - WRITE_ONCE(dir[dir_index].val, - (u64)virt_to_phys(entries) | PASID_PTE_PRESENT); + /* + * The pasid directory table entry won't be freed after + * allocation. No worry about the race with free and + * clear. However, this entry might be populated by others + * while we are preparing it. Use theirs with a retry. + */ + if (cmpxchg64(&dir[dir_index].val, 0ULL, + (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) { + free_pgtable_page(entries); + goto retry; + } } - spin_unlock(&pasid_lock); return &entries[index]; } -- Gitee From af99679eba0772254d48c50e28ff3b190dd0774c Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 20 Mar 2021 10:41:56 +0800 Subject: [PATCH 0430/2161] iommu/vt-d: Don't set then clear private data in prq_event_thread() commit 1d421058c815d54113d9afdf6db3f995c788cf0d upstream. The VT-d specification (section 7.6) requires that the value in the Private Data field of a Page Group Response Descriptor must match the value in the Private Data field of the respective Page Request Descriptor. The private data field of a page group response descriptor is set then immediately cleared in prq_event_thread(). This breaks the rule defined by the VT-d specification. Fix it by moving clearing code up. 
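The retry loop that replaces pasid_lock in the hunk above follows a common lock-free install pattern: allocate speculatively, publish the page with a single 64-bit compare-and-exchange, and throw the local copy away if another CPU populated the slot first. A minimal userspace model of the same pattern, with C11 atomics standing in for cmpxchg64() and calloc() standing in for alloc_pgtable_page():

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdlib.h>

    static void *get_or_install(_Atomic uintptr_t *slot)
    {
        for (;;) {
            uintptr_t cur = atomic_load(slot);

            if (cur)                        /* already populated */
                return (void *)cur;

            void *page = calloc(1, 4096);   /* speculative allocation */
            if (!page)
                return NULL;

            uintptr_t expected = 0;
            if (atomic_compare_exchange_strong(slot, &expected,
                                               (uintptr_t)page))
                return page;                /* we won the race */

            free(page);                     /* someone else won: retry */
        }
    }

    int main(void)
    {
        static _Atomic uintptr_t slot;
        return get_or_install(&slot) ? 0 : 1;
    }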
Fixes: 5b438f4ba315d ("iommu/vt-d: Support page request in scalable mode") Cc: Jacob Pan Reviewed-by: Liu Yi L Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210320024156.640798-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index ab0cb2fd4d47..9e9d091633d4 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -1034,12 +1034,12 @@ static irqreturn_t prq_event_thread(int irq, void *d) QI_PGRP_RESP_TYPE; resp.qw1 = QI_PGRP_IDX(req->prg_index) | QI_PGRP_LPIG(req->lpig); + resp.qw2 = 0; + resp.qw3 = 0; if (req->priv_data_present) memcpy(&resp.qw2, req->priv_data, sizeof(req->priv_data)); - resp.qw2 = 0; - resp.qw3 = 0; qi_submit_sync(iommu, &resp, 1, 0); } prq_advance: -- Gitee From 2de788b904bd4a7949d536ff26ca6f08e86457a3 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 23 Mar 2021 09:05:57 +0800 Subject: [PATCH 0431/2161] iommu/vt-d: Remove svm_dev_ops commit 2e1a44c1c4acf209c0dd7bc04421d101b9e80d11 upstream. The svm_dev_ops has never been referenced in the tree, and there's no plan to have anything to use it. Remove it to make the code neat. Signed-off-by: Lu Baolu Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210323010600.678627-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 15 +-------------- include/linux/intel-iommu.h | 3 --- include/linux/intel-svm.h | 7 ------- 3 files changed, 1 insertion(+), 24 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 9e9d091633d4..af099287f9a1 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -462,7 +462,6 @@ static void load_pasid(struct mm_struct *mm, u32 pasid) /* Caller must hold pasid_mutex, mm reference */ static int intel_svm_bind_mm(struct device *dev, unsigned int flags, - struct svm_dev_ops *ops, struct mm_struct *mm, struct intel_svm_dev **sd) { struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); @@ -512,10 +511,6 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, /* Find the matching device in svm list */ for_each_svm_dev(sdev, svm, dev) { - if (sdev->ops != ops) { - ret = -EBUSY; - goto out; - } sdev->users++; goto success; } @@ -550,7 +545,6 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, /* Finish the setup now we know we're keeping it */ sdev->users = 1; - sdev->ops = ops; init_rcu_head(&sdev->rcu); if (!svm) { @@ -1006,13 +1000,6 @@ static irqreturn_t prq_event_thread(int irq, void *d) up_read(&svm->mm->mmap_sem); mmput(svm->mm); bad_req: - WARN_ON(!sdev); - if (sdev && sdev->ops && sdev->ops->fault_cb) { - int rwxp = (req->rd_req << 3) | (req->wr_req << 2) | - (req->exe_req << 1) | (req->pm_req); - sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr, - req->priv_data, rwxp, result); - } /* We get here in the error case where the PASID lookup failed, and these can be NULL. Do not use them below this point! 
*/ sdev = NULL; @@ -1087,7 +1074,7 @@ intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) if (drvdata) flags = *(unsigned int *)drvdata; mutex_lock(&pasid_mutex); - ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev); + ret = intel_svm_bind_mm(dev, flags, mm, &sdev); if (ret) sva = ERR_PTR(ret); else if (sdev) diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index f64d507252e2..da7af2ba5a08 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -769,14 +769,11 @@ u32 intel_svm_get_pasid(struct iommu_sva *handle); int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg); -struct svm_dev_ops; - struct intel_svm_dev { struct list_head list; struct rcu_head rcu; struct device *dev; struct intel_iommu *iommu; - struct svm_dev_ops *ops; struct iommu_sva sva; u32 pasid; int users; diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 39d368a810b8..6c9d10c0fb1e 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -8,13 +8,6 @@ #ifndef __INTEL_SVM_H__ #define __INTEL_SVM_H__ -struct device; - -struct svm_dev_ops { - void (*fault_cb)(struct device *dev, u32 pasid, u64 address, - void *private, int rwxp, int response); -}; - /* Values for rxwp in fault_cb callback */ #define SVM_REQ_READ (1<<3) #define SVM_REQ_WRITE (1<<2) -- Gitee From 4c8e9433f715f224af37b10ae57da97afb569f5c Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 23 Mar 2021 09:05:58 +0800 Subject: [PATCH 0432/2161] iommu/vt-d: Remove SVM_FLAG_PRIVATE_PASID commit 06905ea8319731036695cf1a4c53c12b0f9373cb upstream. The SVM_FLAG_PRIVATE_PASID has never been referenced in the tree, and there's no plan to have anything to use it. So cleanup it. Signed-off-by: Lu Baolu Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210323010600.678627-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 40 ++++++++++++++++++--------------------- include/linux/intel-svm.h | 16 +++------------- 2 files changed, 21 insertions(+), 35 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index af099287f9a1..548d9fb7d9c4 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -465,9 +465,9 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, struct mm_struct *mm, struct intel_svm_dev **sd) { struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); + struct intel_svm *svm = NULL, *t; struct device_domain_info *info; struct intel_svm_dev *sdev; - struct intel_svm *svm = NULL; unsigned long iflags; int pasid_max; int ret; @@ -493,30 +493,26 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, } } - if (!(flags & SVM_FLAG_PRIVATE_PASID)) { - struct intel_svm *t; - - list_for_each_entry(t, &global_svm_list, list) { - if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID)) - continue; - - svm = t; - if (svm->pasid >= pasid_max) { - dev_warn(dev, - "Limited PASID width. Cannot use existing PASID %d\n", - svm->pasid); - ret = -ENOSPC; - goto out; - } + list_for_each_entry(t, &global_svm_list, list) { + if (t->mm != mm) + continue; - /* Find the matching device in svm list */ - for_each_svm_dev(sdev, svm, dev) { - sdev->users++; - goto success; - } + svm = t; + if (svm->pasid >= pasid_max) { + dev_warn(dev, + "Limited PASID width. 
Cannot use existing PASID %d\n", + svm->pasid); + ret = -ENOSPC; + goto out; + } - break; + /* Find the matching device in svm list */ + for_each_svm_dev(sdev, svm, dev) { + sdev->users++; + goto success; } + + break; } sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 6c9d10c0fb1e..10fa80eef13a 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -14,16 +14,6 @@ #define SVM_REQ_EXEC (1<<1) #define SVM_REQ_PRIV (1<<0) -/* - * The SVM_FLAG_PRIVATE_PASID flag requests a PASID which is *not* the "main" - * PASID for the current process. Even if a PASID already exists, a new one - * will be allocated. And the PASID allocated with SVM_FLAG_PRIVATE_PASID - * will not be given to subsequent callers. This facility allows a driver to - * disambiguate between multiple device contexts which access the same MM, - * if there is no other way to do so. It should be used sparingly, if at all. - */ -#define SVM_FLAG_PRIVATE_PASID (1<<0) - /* * The SVM_FLAG_SUPERVISOR_MODE flag requests a PASID which can be used only * for access to kernel addresses. No IOTLB flushes are automatically done @@ -35,18 +25,18 @@ * It is unlikely that we will ever hook into flush_tlb_kernel_range() to * do such IOTLB flushes automatically. */ -#define SVM_FLAG_SUPERVISOR_MODE (1<<1) +#define SVM_FLAG_SUPERVISOR_MODE BIT(0) /* * The SVM_FLAG_GUEST_MODE flag is used when a PASID bind is for guest * processes. Compared to the host bind, the primary differences are: * 1. mm life cycle management * 2. fault reporting */ -#define SVM_FLAG_GUEST_MODE (1<<2) +#define SVM_FLAG_GUEST_MODE BIT(1) /* * The SVM_FLAG_GUEST_PASID flag is used when a guest has its own PASID space, * which requires guest and host PASID translation at both directions. */ -#define SVM_FLAG_GUEST_PASID (1<<3) +#define SVM_FLAG_GUEST_PASID BIT(2) #endif /* __INTEL_SVM_H__ */ -- Gitee From de9e894b52450c936b8662110e1959f3d34575f1 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 23 Mar 2021 09:05:59 +0800 Subject: [PATCH 0433/2161] iommu/vt-d: Remove unused function declarations commit 1b169fdf427f9401bf9c8544cb9942580c06f8ef upstream. Some functions have been removed. Remove the remaining function delarations. Signed-off-by: Lu Baolu Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210323010600.678627-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 444c0bec221a..90a3268d7a77 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -99,9 +99,6 @@ static inline bool pasid_pte_is_present(struct pasid_entry *pte) } extern unsigned int intel_pasid_max_id; -int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp); -void intel_pasid_free_id(u32 pasid); -void *intel_pasid_lookup_id(u32 pasid); int intel_pasid_alloc_table(struct device *dev); void intel_pasid_free_table(struct device *dev); struct pasid_table *intel_pasid_get_table(struct device *dev); -- Gitee From 2d42864eb627a21d6f60c1711d5a40e3bd855817 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 23 Mar 2021 09:06:00 +0800 Subject: [PATCH 0434/2161] iommu/vt-d: Make unnecessarily global functions static commit 442b81836d6fdde1cf7dc5fc437a5f770c84498b upstream. Make some functions static as they are only used inside pasid.c. 
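One side effect of the intel-svm.h hunk above is easy to miss: dropping SVM_FLAG_PRIVATE_PASID also renumbers the remaining flags, since SVM_FLAG_SUPERVISOR_MODE moves from (1 << 1) to BIT(0) and the others shift down with it. BIT(n) is simply 1UL << n; the throwaway demo below just prints the old and new values side by side.

    #include <stdio.h>

    #define BIT(n) (1UL << (n))

    int main(void)
    {
        printf("SUPERVISOR_MODE: old 0x%lx new 0x%lx\n", 1UL << 1, BIT(0));
        printf("GUEST_MODE:      old 0x%lx new 0x%lx\n", 1UL << 2, BIT(1));
        printf("GUEST_PASID:     old 0x%lx new 0x%lx\n", 1UL << 3, BIT(2));
        return 0;
    }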
Signed-off-by: Lu Baolu Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210323010600.678627-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 4 ++-- drivers/iommu/intel/pasid.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 1a4105a5989b..4754830610a6 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -230,7 +230,7 @@ struct pasid_table *intel_pasid_get_table(struct device *dev) return info->pasid_table; } -int intel_pasid_get_dev_max_id(struct device *dev) +static int intel_pasid_get_dev_max_id(struct device *dev) { struct device_domain_info *info; @@ -241,7 +241,7 @@ int intel_pasid_get_dev_max_id(struct device *dev) return info->pasid_table->max_pasid; } -struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid) +static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid) { struct device_domain_info *info; struct pasid_table *pasid_table; diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 90a3268d7a77..079534fcf55d 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -102,8 +102,6 @@ extern unsigned int intel_pasid_max_id; int intel_pasid_alloc_table(struct device *dev); void intel_pasid_free_table(struct device *dev); struct pasid_table *intel_pasid_get_table(struct device *dev); -int intel_pasid_get_dev_max_id(struct device *dev); -struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid); int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, pgd_t *pgd, u32 pasid, u16 did, int flags); -- Gitee From 7fd4d3e34c51fa74ae0602b253a8715f6e5127fe Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 30 Mar 2021 10:11:45 +0800 Subject: [PATCH 0435/2161] iommu/vt-d: Report right snoop capability when using FL for IOVA commit 6c00612d0cba10f7d0917cf1f73c945003ed4cd7 upstream. The Intel VT-d driver checks wrong register to report snoop capablility when using first level page table for GPA to HPA translation. This might lead the IOMMU driver to say that it supports snooping control, but in reality, it does not. Fix this by always setting PASID-table-entry.PGSNP whenever a pasid entry is setting up for GPA to HPA translation so that the IOMMU driver could report snoop capability as long as it runs in the scalable mode. 
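The capability check this refers to lands in domain_update_iommu_snooping() in the hunk that follows: with PGSNP forced for externally managed domains, a scalable-mode IOMMU can report snooping control even when ECAP.SC is clear. A simplified standalone model of that per-IOMMU decision (the struct and field names here are made up; the real code tests sm_supported() and ecap_sc_support()):

    #include <stdbool.h>
    #include <stdio.h>

    struct fake_iommu {
        bool scalable_mode;   /* models sm_supported(iommu)          */
        bool ecap_sc;         /* models ecap_sc_support(iommu->ecap) */
    };

    static bool reports_snoop_ctl(const struct fake_iommu *iommu)
    {
        /* Scalable mode alone is enough; otherwise ECAP.SC decides. */
        return iommu->scalable_mode || iommu->ecap_sc;
    }

    int main(void)
    {
        struct fake_iommu legacy = { .scalable_mode = false, .ecap_sc = false };
        struct fake_iommu sm     = { .scalable_mode = true,  .ecap_sc = false };

        printf("legacy, no SC: %d\n", reports_snoop_ctl(&legacy)); /* 0 */
        printf("scalable mode: %d\n", reports_snoop_ctl(&sm));     /* 1 */
        return 0;
    }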
Fixes: b802d070a52a1 ("iommu/vt-d: Use iova over first level") Suggested-by: Rajesh Sankaran Suggested-by: Kevin Tian Suggested-by: Ashok Raj Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210330021145.13824-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 12 +++++++++++- drivers/iommu/intel/pasid.c | 16 ++++++++++++++++ drivers/iommu/intel/pasid.h | 1 + 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 54d65dc221b3..58d70c4f4255 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -658,7 +658,14 @@ static int domain_update_iommu_snooping(struct intel_iommu *skip) rcu_read_lock(); for_each_active_iommu(iommu, drhd) { if (iommu != skip) { - if (!ecap_sc_support(iommu->ecap)) { + /* + * If the hardware is operating in the scalable mode, + * the snooping control is always supported since we + * always set PASID-table-entry.PGSNP bit if the domain + * is managed outside (UNMANAGED). + */ + if (!sm_supported(iommu) && + !ecap_sc_support(iommu->ecap)) { ret = 0; break; } @@ -2505,6 +2512,9 @@ static int domain_setup_first_level(struct intel_iommu *iommu, flags |= (level == 5) ? PASID_FLAG_FL5LP : 0; + if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED) + flags |= PASID_FLAG_PAGE_SNOOP; + return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, domain->iommu_did[iommu->seq_id], flags); diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 4754830610a6..8f2a702ce429 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -425,6 +425,16 @@ static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value) pasid_set_bits(&pe->val[1], 1 << 23, value << 23); } +/* + * Setup the Page Snoop (PGSNP) field (Bit 88) of a scalable mode + * PASID entry. + */ +static inline void +pasid_set_pgsnp(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[1], 1ULL << 24, 1ULL << 24); +} + /* * Setup the First Level Page table Pointer field (Bit 140~191) * of a scalable mode PASID entry. @@ -596,6 +606,9 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, } } + if (flags & PASID_FLAG_PAGE_SNOOP) + pasid_set_pgsnp(pte); + pasid_set_domain_id(pte, did); pasid_set_address_width(pte, iommu->agaw); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); @@ -674,6 +687,9 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED) + pasid_set_pgsnp(pte); + /* * Since it is a second level only translation setup, we should * set SRE bit as well (addresses are expected to be GPAs). diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 079534fcf55d..5ff61c3d401f 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -48,6 +48,7 @@ */ #define PASID_FLAG_SUPERVISOR_MODE BIT(0) #define PASID_FLAG_NESTED BIT(1) +#define PASID_FLAG_PAGE_SNOOP BIT(2) /* * The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first- -- Gitee From 00245e9535b45ec6d3132492d4c08fd696a2559a Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 1 Apr 2021 17:47:12 +0200 Subject: [PATCH 0436/2161] iommu: Separate IOMMU_DEV_FEAT_IOPF from IOMMU_DEV_FEAT_SVA commit 34b48c704d194738eef0893aa06e412bdc8a972f upstream. 
Some devices manage I/O Page Faults (IOPF) themselves instead of relying on PCIe PRI or Arm SMMU stall. Allow their drivers to enable SVA without mandating IOMMU-managed IOPF. The other device drivers now need to first enable IOMMU_DEV_FEAT_IOPF before enabling IOMMU_DEV_FEAT_SVA. Enabling IOMMU_DEV_FEAT_IOPF on its own doesn't have any effect visible to the device driver, it is used in combination with other features. Reviewed-by: Eric Auger Reviewed-by: Lu Baolu Signed-off-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20210401154718.307519-4-jean-philippe@linaro.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/iommu.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index b26c4e9e4e2c..2d4265f83287 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -161,10 +161,24 @@ struct iommu_resv_region { enum iommu_resv_type type; }; -/* Per device IOMMU features */ +/** + * enum iommu_dev_features - Per device IOMMU features + * @IOMMU_DEV_FEAT_AUX: Auxiliary domain feature + * @IOMMU_DEV_FEAT_SVA: Shared Virtual Addresses + * @IOMMU_DEV_FEAT_IOPF: I/O Page Faults such as PRI or Stall. Generally + * enabling %IOMMU_DEV_FEAT_SVA requires + * %IOMMU_DEV_FEAT_IOPF, but some devices manage I/O Page + * Faults themselves instead of relying on the IOMMU. When + * supported, this feature must be enabled before and + * disabled after %IOMMU_DEV_FEAT_SVA. + * + * Device drivers query whether a feature is supported using + * iommu_dev_has_feature(), and enable it using iommu_dev_enable_feature(). + */ enum iommu_dev_features { - IOMMU_DEV_FEAT_AUX, /* Aux-domain feature */ - IOMMU_DEV_FEAT_SVA, /* Shared Virtual Addresses */ + IOMMU_DEV_FEAT_AUX, + IOMMU_DEV_FEAT_SVA, + IOMMU_DEV_FEAT_IOPF, }; #define IOMMU_PASID_INVALID (-1U) -- Gitee From 564d4ba7e2259551d5aa81db4f82aab28c19bd3e Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 1 Apr 2021 17:47:13 +0200 Subject: [PATCH 0437/2161] iommu/vt-d: Support IOMMU_DEV_FEAT_IOPF commit 9003351cb6bde752de774e6ec874109493413152 upstream. Allow drivers to query and enable IOMMU_DEV_FEAT_IOPF, which amounts to checking whether PRI is enabled. 
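Taken together with the kernel-doc added above, a device driver that wants IOMMU-managed faults is expected to turn the features on in order: IOPF first, then SVA, and tear them down in reverse. A kernel-style sketch of that ordering is below; it is not part of this patch, the example_* function names are made up, and only iommu_dev_enable_feature()/iommu_dev_disable_feature() are real iommu core calls.

    #include <linux/iommu.h>

    /* Hypothetical driver helper: enable IOPF before SVA, as the
     * IOMMU_DEV_FEAT_IOPF documentation above requires. */
    static int example_enable_sva(struct device *dev)
    {
        int ret;

        ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_IOPF);
        if (ret)
            return ret;

        ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA);
        if (ret)
            iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);

        return ret;
    }

    /* Teardown in the reverse order: SVA first, then IOPF. */
    static void example_disable_sva(struct device *dev)
    {
        iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA);
        iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);
    }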
Reviewed-by: Lu Baolu Signed-off-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20210401154718.307519-5-jean-philippe@linaro.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 58d70c4f4255..c7189bee9c63 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5380,6 +5380,8 @@ static int siov_find_pci_dvsec(struct pci_dev *pdev) static bool intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) { + struct device_domain_info *info = get_domain_info(dev); + if (feat == IOMMU_DEV_FEAT_AUX) { int ret; @@ -5394,13 +5396,13 @@ intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) return !!siov_find_pci_dvsec(to_pci_dev(dev)); } - if (feat == IOMMU_DEV_FEAT_SVA) { - struct device_domain_info *info = get_domain_info(dev); + if (feat == IOMMU_DEV_FEAT_IOPF) + return info && info->pri_supported; + if (feat == IOMMU_DEV_FEAT_SVA) return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) && info->pasid_supported && info->pri_supported && info->ats_supported; - } return false; } @@ -5411,6 +5413,9 @@ intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) if (feat == IOMMU_DEV_FEAT_AUX) return intel_iommu_enable_auxd(dev); + if (feat == IOMMU_DEV_FEAT_IOPF) + return intel_iommu_dev_has_feat(dev, feat) ? 0 : -ENODEV; + if (feat == IOMMU_DEV_FEAT_SVA) { struct device_domain_info *info = get_domain_info(dev); -- Gitee From 0095646afc7d37f78774f3db569f07c47984f10c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Apr 2021 17:52:52 +0200 Subject: [PATCH 0438/2161] iommu: remove DOMAIN_ATTR_NESTING commit 7e147547783a9035df816864b6a45ffbb254d700 upstream. Use an explicit enable_nesting method instead. 
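From a caller's point of view the change is mechanical (the vfio_iommu_type1 hunk later in this patch shows the real conversion): the DOMAIN_ATTR_NESTING attribute write becomes a call to the new helper. A kernel-style sketch, with the example_* wrapper name made up:

    #include <linux/iommu.h>

    /* Hypothetical wrapper: request nested translation on an unmanaged
     * domain.  Before this patch the same request was expressed as
     * iommu_domain_set_attr(domain, DOMAIN_ATTR_NESTING, &one). */
    static int example_request_nesting(struct iommu_domain *domain)
    {
        return iommu_enable_nesting(domain);
    }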
Signed-off-by: Christoph Hellwig Acked-by: Will Deacon Acked-by: Li Yang Link: https://lore.kernel.org/r/20210401155256.298656-17-hch@lst.de Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/arm-smmu-v3.c | 43 ++++++++++++--------------------- drivers/iommu/arm-smmu.c | 30 ++++++++++++----------- drivers/iommu/intel/iommu.c | 31 +++++++----------------- drivers/iommu/iommu.c | 10 ++++++++ drivers/vfio/vfio_iommu_type1.c | 5 +--- include/linux/iommu.h | 4 ++- 6 files changed, 55 insertions(+), 68 deletions(-) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 2870790e125c..5e39ec0be750 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -2675,15 +2675,6 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); switch (domain->type) { - case IOMMU_DOMAIN_UNMANAGED: - switch (attr) { - case DOMAIN_ATTR_NESTING: - *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED); - return 0; - default: - return -ENODEV; - } - break; case IOMMU_DOMAIN_DMA: switch (attr) { case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: @@ -2707,23 +2698,6 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, mutex_lock(&smmu_domain->init_mutex); switch (domain->type) { - case IOMMU_DOMAIN_UNMANAGED: - switch (attr) { - case DOMAIN_ATTR_NESTING: - if (smmu_domain->smmu) { - ret = -EPERM; - goto out_unlock; - } - - if (*(int *)data) - smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED; - else - smmu_domain->stage = ARM_SMMU_DOMAIN_S1; - break; - default: - ret = -ENODEV; - } - break; case IOMMU_DOMAIN_DMA: switch(attr) { case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: @@ -2737,11 +2711,25 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, ret = -EINVAL; } -out_unlock: mutex_unlock(&smmu_domain->init_mutex); return ret; } +static int arm_smmu_enable_nesting(struct iommu_domain *domain) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + int ret = 0; + + mutex_lock(&smmu_domain->init_mutex); + if (smmu_domain->smmu) + ret = -EPERM; + else + smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED; + mutex_unlock(&smmu_domain->init_mutex); + + return ret; +} + static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args) { return iommu_fwspec_add_ids(dev, args->args, 1); @@ -2820,6 +2808,7 @@ static struct iommu_ops arm_smmu_ops = { .device_group = arm_smmu_device_group, .domain_get_attr = arm_smmu_domain_get_attr, .domain_set_attr = arm_smmu_domain_set_attr, + .enable_nesting = arm_smmu_enable_nesting, .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, .put_resv_regions = arm_smmu_put_resv_regions, diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index 056a5b1d00c1..c95efa5d2c41 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -1474,9 +1474,6 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, switch(domain->type) { case IOMMU_DOMAIN_UNMANAGED: switch (attr) { - case DOMAIN_ATTR_NESTING: - *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED); - return 0; default: return -ENODEV; } @@ -1495,6 +1492,21 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, } } +static int arm_smmu_enable_nesting(struct iommu_domain *domain) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + int ret = 0; + + mutex_lock(&smmu_domain->init_mutex); + if (smmu_domain->smmu) + ret = -EPERM; + else + smmu_domain->stage = 
ARM_SMMU_DOMAIN_NESTED; + mutex_unlock(&smmu_domain->init_mutex); + + return ret; +} + static int arm_smmu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr attr, void *data) { @@ -1506,17 +1518,6 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, switch(domain->type) { case IOMMU_DOMAIN_UNMANAGED: switch (attr) { - case DOMAIN_ATTR_NESTING: - if (smmu_domain->smmu) { - ret = -EPERM; - goto out_unlock; - } - - if (*(int *)data) - smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED; - else - smmu_domain->stage = ARM_SMMU_DOMAIN_S1; - break; default: ret = -ENODEV; } @@ -1593,6 +1594,7 @@ static struct iommu_ops arm_smmu_ops = { .device_group = arm_smmu_device_group, .domain_get_attr = arm_smmu_domain_get_attr, .domain_set_attr = arm_smmu_domain_set_attr, + .enable_nesting = arm_smmu_enable_nesting, .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, .put_resv_regions = arm_smmu_put_resv_regions, diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index c7189bee9c63..7887b6ae8794 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5485,32 +5485,19 @@ static bool risky_device(struct pci_dev *pdev) } static int -intel_iommu_domain_set_attr(struct iommu_domain *domain, - enum iommu_attr attr, void *data) +intel_iommu_enable_nesting(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); unsigned long flags; - int ret = 0; - - if (domain->type != IOMMU_DOMAIN_UNMANAGED) - return -EINVAL; + int ret = -ENODEV; - switch (attr) { - case DOMAIN_ATTR_NESTING: - spin_lock_irqsave(&device_domain_lock, flags); - if (nested_mode_support() && - list_empty(&dmar_domain->devices)) { - dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; - dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; - } else { - ret = -ENODEV; - } - spin_unlock_irqrestore(&device_domain_lock, flags); - break; - default: - ret = -EINVAL; - break; + spin_lock_irqsave(&device_domain_lock, flags); + if (nested_mode_support() && list_empty(&dmar_domain->devices)) { + dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; + dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; + ret = 0; } + spin_unlock_irqrestore(&device_domain_lock, flags); return ret; } @@ -5571,7 +5558,7 @@ const struct iommu_ops intel_iommu_ops = { .domain_alloc = intel_iommu_domain_alloc, .domain_free = intel_iommu_domain_free, .domain_get_attr = intel_iommu_domain_get_attr, - .domain_set_attr = intel_iommu_domain_set_attr, + .enable_nesting = intel_iommu_enable_nesting, .attach_dev = intel_iommu_attach_device, .detach_dev = intel_iommu_detach_device, .aux_attach_dev = intel_iommu_aux_attach_device, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 958b8fc85da9..24d76dd48dc3 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2529,6 +2529,16 @@ int iommu_domain_set_attr(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_domain_set_attr); +int iommu_enable_nesting(struct iommu_domain *domain) +{ + if (domain->type != IOMMU_DOMAIN_UNMANAGED) + return -EINVAL; + if (!domain->ops->enable_nesting) + return -EINVAL; + return domain->ops->enable_nesting(domain); +} +EXPORT_SYMBOL_GPL(iommu_enable_nesting); + void iommu_get_resv_regions(struct device *dev, struct list_head *list) { const struct iommu_ops *ops = dev->bus->iommu_ops; diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 6b1e8cba1798..f8b3af3ebbdd 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ 
-1830,10 +1830,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, } if (iommu->nesting) { - int attr = 1; - - ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING, - &attr); + ret = iommu_enable_nesting(domain->domain); if (ret) goto out_domain; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2d4265f83287..8ae360365753 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -122,7 +122,6 @@ enum iommu_attr { DOMAIN_ATTR_FSL_PAMU_STASH, DOMAIN_ATTR_FSL_PAMU_ENABLE, DOMAIN_ATTR_FSL_PAMUV1, - DOMAIN_ATTR_NESTING, /* two stages of translation */ DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, DOMAIN_ATTR_MAX, }; @@ -224,6 +223,7 @@ struct iommu_iotlb_gather { * @device_group: find iommu group for a particular device * @domain_get_attr: Query domain attributes * @domain_set_attr: Change domain attributes + * @enable_nesting: Enable nesting * @get_resv_regions: Request list of reserved regions for a device * @put_resv_regions: Free list of reserved regions for a device * @apply_resv_region: Temporary helper call-back for iova reserved ranges @@ -278,6 +278,7 @@ struct iommu_ops { enum iommu_attr attr, void *data); int (*domain_set_attr)(struct iommu_domain *domain, enum iommu_attr attr, void *data); + int (*enable_nesting)(struct iommu_domain *domain); /* Request/Free a list of reserved regions for a device */ void (*get_resv_regions)(struct device *dev, struct list_head *list); @@ -525,6 +526,7 @@ extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr, void *data); extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr, void *data); +int iommu_enable_nesting(struct iommu_domain *domain); /* Window handling function prototypes */ extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, -- Gitee From ff0119a01e0fe7f7aee092f185fcb1709ef96aa2 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 20 Mar 2021 10:54:11 +0800 Subject: [PATCH 0439/2161] iommu/vt-d: Report the right page fault address commit 03d205094af45bca4f8e0498c461a893aa3ec6d9 upstream. The Address field of the Page Request Descriptor only keeps bit [63:12] of the offending address. Convert it to a full address before reporting it to device drivers. Fixes: eb8d93ea3c1d3 ("iommu/vt-d: Report page request faults for guest SVA") Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210320025415.641201-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 548d9fb7d9c4..8e5c19b4abe6 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -852,7 +852,7 @@ intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc) /* Fill in event data for device specific processing */ memset(&event, 0, sizeof(struct iommu_fault_event)); event.fault.type = IOMMU_FAULT_PAGE_REQ; - event.fault.prm.addr = desc->addr; + event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT; event.fault.prm.pasid = desc->pasid; event.fault.prm.grpid = desc->prg_index; event.fault.prm.perm = prq_to_iommu_prot(desc); -- Gitee From d0cf9ba93ed661f32ed9f78d3681559bfdf1d8e1 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 20 Mar 2021 10:54:12 +0800 Subject: [PATCH 0440/2161] iommu/vt-d: Remove WO permissions on second-level paging entries commit eea53c5816889ee8b64544fa2e9311a81184ff9c upstream. 
When the first level page table is used for IOVA translation, it only supports Read-Only and Read-Write permissions. The Write-Only permission is not supported as the PRESENT bit (implying Read permission) should always set. When using second level, we still give separate permissions that allows WriteOnly which seems inconsistent and awkward. We want to have consistent behavior. After moving to 1st level, we don't want things to work sometimes, and break if we use 2nd level for the same mappings. Hence remove this configuration. Suggested-by: Ashok Raj Fixes: b802d070a52a1 ("iommu/vt-d: Use iova over first level") Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210320025415.641201-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 7887b6ae8794..66ce503496f7 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2312,8 +2312,9 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, return -EINVAL; attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); + attr |= DMA_FL_PTE_PRESENT; if (domain_use_first_level(domain)) { - attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US; + attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US; if (domain->domain.type == IOMMU_DOMAIN_DMA) { attr |= DMA_FL_PTE_ACCESS; -- Gitee From 4b328df234901a4eea31a449abfd9dfbe1550c87 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 20 Mar 2021 10:54:13 +0800 Subject: [PATCH 0441/2161] iommu/vt-d: Invalidate PASID cache when root/context entry changed commit c0474a606ecb9326227b4d68059942f9db88a897 upstream. When the Intel IOMMU is operating in the scalable mode, some information from the root and context table may be used to tag entries in the PASID cache. Software should invalidate the PASID-cache when changing root or context table entries. 
Suggested-by: Ashok Raj Fixes: 7373a8cc38197 ("iommu/vt-d: Setup context and enable RID2PASID support") Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210320025415.641201-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 18 +++++++++--------- include/linux/intel-iommu.h | 1 + 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 66ce503496f7..3207a9174add 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1347,6 +1347,11 @@ static void iommu_set_root_entry(struct intel_iommu *iommu) readl, (sts & DMA_GSTS_RTPS), sts); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + + iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); + if (sm_supported(iommu)) + qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); } void iommu_flush_write_buffer(struct intel_iommu *iommu) @@ -2430,6 +2435,10 @@ static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); + + if (sm_supported(iommu)) + qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); + iommu->flush.flush_iotlb(iommu, did_old, 0, @@ -3283,8 +3292,6 @@ static int __init init_dmars(void) register_pasid_allocator(iommu); #endif iommu_set_root_entry(iommu); - iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); } #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA @@ -3525,12 +3532,7 @@ static int init_iommu_hw(void) } iommu_flush_write_buffer(iommu); - iommu_set_root_entry(iommu); - - iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); iommu_enable_translation(iommu); iommu_disable_protect_mem_regions(iommu); } @@ -3913,8 +3915,6 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) goto disable_iommu; iommu_set_root_entry(iommu); - iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); iommu_enable_translation(iommu); iommu_disable_protect_mem_regions(iommu); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index da7af2ba5a08..334df5a91f85 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -378,6 +378,7 @@ enum { /* PASID cache invalidation granu */ #define QI_PC_ALL_PASIDS 0 #define QI_PC_PASID_SEL 1 +#define QI_PC_GLOBAL 3 #define QI_EIOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) #define QI_EIOTLB_IH(ih) (((u64)ih) << 6) -- Gitee From ff4d0e34fa1f6935a4ee88667dae1c8eb761d414 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 20 Mar 2021 10:54:15 +0800 Subject: [PATCH 0442/2161] iommu/vt-d: Avoid unnecessary cache flush in pasid entry teardown commit 8b74b6ab253866450c131e9134642efb40439c91 upstream. When a present pasid entry is disassembled, all kinds of pasid related caches need to be flushed. But when a pasid entry is not being used (PRESENT bit not set), we don't need to do this. Check the PRESENT bit in intel_pasid_tear_down_entry() and avoid flushing caches if it's not set. 
Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210320025415.641201-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 8f2a702ce429..477b2e1d303c 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -517,6 +517,9 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, if (WARN_ON(!pte)) return; + if (!(pte->val[0] & PASID_PTE_PRESENT)) + return; + did = pasid_get_domain_id(pte); intel_pasid_clear_entry(dev, pasid, fault_ignore); -- Gitee From e2584cb4b55a06d2266b661ac8bca0d922b31887 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sun, 11 Apr 2021 14:23:12 +0800 Subject: [PATCH 0443/2161] iommu/vt-d: Fix build error of pasid_enable_wpe() with !X86 commit 906f86c860304e84c36cd4c42a6a73928c4ed74c upstream. Commit f68c7f539b6e9 ("iommu/vt-d: Enable write protect for supervisor SVM") added pasid_enable_wpe() which hit below compile error with !X86. ../drivers/iommu/intel/pasid.c: In function 'pasid_enable_wpe': ../drivers/iommu/intel/pasid.c:554:22: error: implicit declaration of function 'read_cr0' [-Werror=implicit-function-declaration] 554 | unsigned long cr0 = read_cr0(); | ^~~~~~~~ In file included from ../include/linux/build_bug.h:5, from ../include/linux/bits.h:22, from ../include/linux/bitops.h:6, from ../drivers/iommu/intel/pasid.c:12: ../drivers/iommu/intel/pasid.c:557:23: error: 'X86_CR0_WP' undeclared (first use in this function) 557 | if (unlikely(!(cr0 & X86_CR0_WP))) { | ^~~~~~~~~~ ../include/linux/compiler.h:78:42: note: in definition of macro 'unlikely' 78 | # define unlikely(x) __builtin_expect(!!(x), 0) | ^ ../drivers/iommu/intel/pasid.c:557:23: note: each undeclared identifier is reported only once for each function it appears in 557 | if (unlikely(!(cr0 & X86_CR0_WP))) { | ^~~~~~~~~~ ../include/linux/compiler.h:78:42: note: in definition of macro 'unlikely' 78 | # define unlikely(x) __builtin_expect(!!(x), 0) | Add the missing dependency. Cc: Sanjay Kumar Cc: Jacob Pan Cc: Randy Dunlap Reported-by: kernel test robot Reported-by: Randy Dunlap Fixes: f68c7f539b6e9 ("iommu/vt-d: Enable write protect for supervisor SVM") Signed-off-by: Lu Baolu Acked-by: Randy Dunlap # build-tested Link: https://lore.kernel.org/r/20210411062312.3057579-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 477b2e1d303c..72646bafc52f 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -551,6 +551,7 @@ static void pasid_flush_caches(struct intel_iommu *iommu, static inline int pasid_enable_wpe(struct pasid_entry *pte) { +#ifdef CONFIG_X86 unsigned long cr0 = read_cr0(); /* CR0.WP is normally set but just to be sure */ @@ -558,6 +559,7 @@ static inline int pasid_enable_wpe(struct pasid_entry *pte) pr_err_ratelimited("No CPU write protect!\n"); return -EINVAL; } +#endif pasid_set_wpe(pte); return 0; -- Gitee From b0d32323f7aa686c91b644abd80660bad47d702e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 11 Apr 2021 09:08:17 +0200 Subject: [PATCH 0444/2161] iommu/vt-d: Fix an error handling path in 'intel_prepare_irq_remapping()' commit 745610c4a3e3baaebf6d1f8cd5b4d82892432520 upstream. 
If 'intel_cap_audit()' fails, we should return directly, as already done in the surrounding error handling path. Fixes: ad3d19029979 ("iommu/vt-d: Audit IOMMU Capabilities and add helper functions") Signed-off-by: Christophe JAILLET Acked-by: Lu Baolu Link: https://lore.kernel.org/r/98d531caabe66012b4fffc7813fd4b9470afd517.1618124777.git.christophe.jaillet@wanadoo.fr Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/irq_remapping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 75429a5373ec..f912fe45bea2 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -736,7 +736,7 @@ static int __init intel_prepare_irq_remapping(void) return -ENODEV; if (intel_cap_audit(CAP_AUDIT_STATIC_IRQR, NULL)) - goto error; + return -ENODEV; if (!dmar_ir_support()) return -ENODEV; -- Gitee From 0ab0392f323c47c927078416d2cb2c8fc7a581e6 Mon Sep 17 00:00:00 2001 From: "Longpeng(Mike)" Date: Thu, 15 Apr 2021 08:46:28 +0800 Subject: [PATCH 0445/2161] iommu/vt-d: Force to flush iotlb before creating superpage commit 38c527aeb41926c71902dd42f788a8b093b21416 upstream. The translation caches may preserve obsolete data when the mapping size is changed, suppose the following sequence which can reveal the problem with high probability. 1.mmap(4GB,MAP_HUGETLB) 2. while (1) { (a) DMA MAP 0,0xa0000 (b) DMA UNMAP 0,0xa0000 (c) DMA MAP 0,0xc0000000 * DMA read IOVA 0 may failure here (Not present) * if the problem occurs. (d) DMA UNMAP 0,0xc0000000 } The page table(only focus on IOVA 0) after (a) is: PML4: 0x19db5c1003 entry:0xffff899bdcd2f000 PDPE: 0x1a1cacb003 entry:0xffff89b35b5c1000 PDE: 0x1a30a72003 entry:0xffff89b39cacb000 PTE: 0x21d200803 entry:0xffff89b3b0a72000 The page table after (b) is: PML4: 0x19db5c1003 entry:0xffff899bdcd2f000 PDPE: 0x1a1cacb003 entry:0xffff89b35b5c1000 PDE: 0x1a30a72003 entry:0xffff89b39cacb000 PTE: 0x0 entry:0xffff89b3b0a72000 The page table after (c) is: PML4: 0x19db5c1003 entry:0xffff899bdcd2f000 PDPE: 0x1a1cacb003 entry:0xffff89b35b5c1000 PDE: 0x21d200883 entry:0xffff89b39cacb000 (*) Because the PDE entry after (b) is present, it won't be flushed even if the iommu driver flush cache when unmap, so the obsolete data may be preserved in cache, which would cause the wrong translation at end. However, we can see the PDE entry is finally switch to 2M-superpage mapping, but it does not transform to 0x21d200883 directly: 1. PDE: 0x1a30a72003 2. __domain_mapping dma_pte_free_pagetable Set the PDE entry to ZERO Set the PDE entry to 0x21d200883 So we must flush the cache after the entry switch to ZERO to avoid the obsolete info be preserved. 
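For the reproducer above, the numbers work out as follows once __domain_mapping() switches to 2MiB superpages (level 2, i.e. 512 4KiB pfns per PDE): the whole [0, 0xbffff] pfn range must have any old PDEs cleared and the IOTLB flushed before the large mappings are written, which is what the switch_to_super_page() helper added below does. A standalone arithmetic check; the mask math mirrors the pfn-space level_mask()/lvl_to_nr_pages() helpers used by the hunk.

    #include <stdio.h>

    int main(void)
    {
        unsigned long iov_pfn    = 0;
        unsigned long nr_pages   = 0xc0000000UL >> 12;  /* 0xc0000 pfns   */
        unsigned long lvl_pages  = 1UL << 9;            /* pfns per 2M PDE */
        unsigned long level_mask = ~(lvl_pages - 1);

        unsigned long end_pfn = ((iov_pfn + nr_pages) & level_mask) - 1;

        printf("superpages: %lu, end_pfn: 0x%lx\n",
               nr_pages / lvl_pages, end_pfn);          /* 1536, 0xbffff  */
        return 0;
    }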
Cc: David Woodhouse Cc: Lu Baolu Cc: Nadav Amit Cc: Alex Williamson Cc: Joerg Roedel Cc: Kevin Tian Cc: Gonglei (Arei) Fixes: 6491d4d02893 ("intel-iommu: Free old page tables before creating superpage") Cc: # v3.0+ Link: https://lore.kernel.org/linux-iommu/670baaf8-4ff8-4e84-4be3-030b95ab5a5e@huawei.com/ Suggested-by: Lu Baolu Signed-off-by: Longpeng(Mike) Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20210415004628.1779-1-longpeng2@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 52 +++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 3207a9174add..f867c06d7316 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2301,6 +2301,41 @@ static inline int hardware_largepage_caps(struct dmar_domain *domain, return level; } +/* + * Ensure that old small page tables are removed to make room for superpage(s). + * We're going to add new large pages, so make sure we don't remove their parent + * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. + */ +static void switch_to_super_page(struct dmar_domain *domain, + unsigned long start_pfn, + unsigned long end_pfn, int level) +{ + unsigned long lvl_pages = lvl_to_nr_pages(level); + struct dma_pte *pte = NULL; + int i; + + while (start_pfn <= end_pfn) { + if (!pte) + pte = pfn_to_dma_pte(domain, start_pfn, &level); + + if (dma_pte_present(pte)) { + dma_pte_free_pagetable(domain, start_pfn, + start_pfn + lvl_pages - 1, + level + 1); + + for_each_domain_iommu(i, domain) + iommu_flush_iotlb_psi(g_iommus[i], domain, + start_pfn, lvl_pages, + 0, 0); + } + + pte++; + start_pfn += lvl_pages; + if (first_pte_in_page(pte)) + pte = NULL; + } +} + static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, unsigned long phys_pfn, unsigned long nr_pages, int prot) @@ -2342,22 +2377,11 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, return -ENOMEM; /* It is large page*/ if (largepage_lvl > 1) { - unsigned long nr_superpages, end_pfn; + unsigned long end_pfn; pteval |= DMA_PTE_LARGE_PAGE; - lvl_pages = lvl_to_nr_pages(largepage_lvl); - - nr_superpages = nr_pages / lvl_pages; - end_pfn = iov_pfn + nr_superpages * lvl_pages - 1; - - /* - * Ensure that old small page tables are - * removed to make room for superpage(s). - * We're adding new large pages, so make sure - * we don't remove their parent tables. - */ - dma_pte_free_pagetable(domain, iov_pfn, end_pfn, - largepage_lvl + 1); + end_pfn = ((iov_pfn + nr_pages) & level_mask(largepage_lvl)) - 1; + switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); } else { pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; } -- Gitee From 0fac47d23376d88e3040e67017d77f3015da9000 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 19 May 2021 09:50:26 +0800 Subject: [PATCH 0446/2161] iommu/vt-d: Check for allocation failure in aux_detach_device() commit 1a590a1c8bf46bf80ea12b657ca44c345531ac80 upstream. In current kernels small allocations never fail, but checking for allocation failure is the correct thing to do. 
Fixes: 18abda7a2d55 ("iommu/vt-d: Fix general protection fault in aux_detach_device()") Signed-off-by: Dan Carpenter Acked-by: Lu Baolu Link: https://lore.kernel.org/r/YJuobKuSn81dOPLd@mwanda Link: https://lore.kernel.org/r/20210519015027.108468-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index f867c06d7316..3c44af75b096 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4654,6 +4654,8 @@ static int auxiliary_link_device(struct dmar_domain *domain, if (!sinfo) { sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC); + if (!sinfo) + return -ENOMEM; sinfo->domain = domain; sinfo->pdev = dev; list_add(&sinfo->link_phys, &info->subdevices); -- Gitee From dc51139d990390da891a6af5b5b7926d7e0c3523 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 19 May 2021 09:50:27 +0800 Subject: [PATCH 0447/2161] iommu/vt-d: Use user privilege for RID2PASID translation commit 54c80d907400189b09548039be8f3b6e297e8ae3 upstream. When first-level page tables are used for IOVA translation, we use user privilege by setting U/S bit in the page table entry. This is to make it consistent with the second level translation, where the U/S enforcement is not available. Clear the SRE (Supervisor Request Enable) field in the pasid table entry of RID2PASID so that requests requesting the supervisor privilege are blocked and treated as DMA remapping faults. Fixes: b802d070a52a1 ("iommu/vt-d: Use iova over first level") Suggested-by: Jacob Pan Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210512064426.3440915-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210519015027.108468-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 7 +++++-- drivers/iommu/intel/pasid.c | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 3c44af75b096..86e17e93160a 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2526,9 +2526,9 @@ static int domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, u32 pasid) { - int flags = PASID_FLAG_SUPERVISOR_MODE; struct dma_pte *pgd = domain->pgd; int agaw, level; + int flags = 0; /* * Skip top levels of page tables for iommu which has @@ -2544,7 +2544,10 @@ static int domain_setup_first_level(struct intel_iommu *iommu, if (level != 4 && level != 5) return -EINVAL; - flags |= (level == 5) ? PASID_FLAG_FL5LP : 0; + if (pasid != PASID_RID2PASID) + flags |= PASID_FLAG_SUPERVISOR_MODE; + if (level == 5) + flags |= PASID_FLAG_FL5LP; if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED) flags |= PASID_FLAG_PAGE_SNOOP; diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 72646bafc52f..72dc84821dad 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -699,7 +699,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, * Since it is a second level only translation setup, we should * set SRE bit as well (addresses are expected to be GPAs). 
*/ - pasid_set_sre(pte); + if (pasid != PASID_RID2PASID) + pasid_set_sre(pte); pasid_set_present(pte); pasid_flush_caches(iommu, pte, pasid, did); -- Gitee From 822fafaa3f19e7d09415beafec9d539925541db2 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 10 Jun 2021 10:00:53 +0800 Subject: [PATCH 0448/2161] iommu/vt-d: Remove redundant assignment to variable agaw commit 05d2cbf969be465027d645f855419dbe49ded849 upstream. The variable agaw is initialized with a value that is never read and it is being updated later with a new value as a counter in a for-loop. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210416171826.64091-1-colin.king@canonical.com Link: https://lore.kernel.org/r/20210610020115.1637656-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 86e17e93160a..5689792b9696 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -565,7 +565,7 @@ static inline int domain_pfn_supported(struct dmar_domain *domain, static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) { unsigned long sagaw; - int agaw = -1; + int agaw; sagaw = cap_sagaw(iommu->cap); for (agaw = width_to_agaw(max_gaw); -- Gitee From ce8fc04c19c9a64908349bd663711bda66a689df Mon Sep 17 00:00:00 2001 From: Aditya Srivastava Date: Thu, 10 Jun 2021 10:00:54 +0800 Subject: [PATCH 0449/2161] iommu/vt-d: Fix kernel-doc syntax in file header commit 367f82de5a9c8ee77058f40bb470b83bfa521e3d upstream. The opening comment mark '/**' is used for highlighting the beginning of kernel-doc comments. The header for drivers/iommu/intel/pasid.c follows this syntax, but the content inside does not comply with kernel-doc. This line was probably not meant for kernel-doc parsing, but is parsed due to the presence of kernel-doc like comment syntax(i.e, '/**'), which causes unexpected warnings from kernel-doc: warning: Function parameter or member 'fmt' not described in 'pr_fmt' Provide a simple fix by replacing this occurrence with general comment format, i.e. '/*', to prevent kernel-doc from parsing it. Signed-off-by: Aditya Srivastava Acked-by: Randy Dunlap Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210523143245.19040-1-yashsri421@gmail.com Link: https://lore.kernel.org/r/20210610020115.1637656-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 72dc84821dad..c6cf44a6c923 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/** +/* * intel-pasid.c - PASID idr, table and entry manipulation * * Copyright (C) 2018 Intel Corporation -- Gitee From 8b2cdb37483ebfbf309ae99912c9427d14b14078 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 10 Jun 2021 10:00:55 +0800 Subject: [PATCH 0450/2161] iommu/vt-d: Tweak the description of a DMA fault commit 719a19335692083c0f4ebdc6eef65131e7394ac8 upstream. The Intel IOMMU driver reports the DMA fault reason in a decimal number while the VT-d specification uses a hexadecimal one. 
It's inconvenient that users need to covert them everytime before consulting the spec. Let's use hexadecimal number for a DMA fault reason. The fault message uses 0xffffffff as PASID for DMA requests w/o PASID. This is confusing. Tweak this by adding "NO_PASID" explicitly. Reviewed-by: Ashok Raj Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210517065425.4953-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210610020115.1637656-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 6971397805f3..fa98e542ca34 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1913,16 +1913,23 @@ static int dmar_fault_do_one(struct intel_iommu *iommu, int type, reason = dmar_get_fault_reason(fault_reason, &fault_type); if (fault_type == INTR_REMAP) - pr_err("[INTR-REMAP] Request device [%02x:%02x.%d] fault index %llx [fault reason %02d] %s\n", - source_id >> 8, PCI_SLOT(source_id & 0xFF), - PCI_FUNC(source_id & 0xFF), addr >> 48, - fault_reason, reason); - else - pr_err("[%s] Request device [%02x:%02x.%d] PASID %x fault addr %llx [fault reason %02d] %s\n", + pr_err("[INTR-REMAP] Request device [0x%02x:0x%02x.%d] fault index 0x%llx [fault reason 0x%02x] %s\n", + source_id >> 8, PCI_SLOT(source_id & 0xFF), + PCI_FUNC(source_id & 0xFF), addr >> 48, + fault_reason, reason); + else if (pasid == INVALID_IOASID) + pr_err("[%s NO_PASID] Request device [0x%02x:0x%02x.%d] fault addr 0x%llx [fault reason 0x%02x] %s\n", type ? "DMA Read" : "DMA Write", source_id >> 8, PCI_SLOT(source_id & 0xFF), - PCI_FUNC(source_id & 0xFF), pasid, addr, + PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason); + else + pr_err("[%s PASID 0x%x] Request device [0x%02x:0x%02x.%d] fault addr 0x%llx [fault reason 0x%02x] %s\n", + type ? "DMA Read" : "DMA Write", pasid, + source_id >> 8, PCI_SLOT(source_id & 0xFF), + PCI_FUNC(source_id & 0xFF), addr, + fault_reason, reason); + return 0; } @@ -1989,7 +1996,7 @@ irqreturn_t dmar_fault(int irq, void *dev_id) if (!ratelimited) /* Using pasid -1 if pasid is not present */ dmar_fault_do_one(iommu, type, fault_reason, - pasid_present ? pasid : -1, + pasid_present ? pasid : INVALID_IOASID, source_id, guest_addr); fault_index++; -- Gitee From 6d16a10465cd1be70d37b630fc104f6d2ab1f3ec Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 10 Jun 2021 10:00:56 +0800 Subject: [PATCH 0451/2161] iommu/vt-d: Select PCI_ATS explicitly commit 879fcc6bda6956ce9574421c5f68ebbc794bf06b upstream. The Intel VT-d implementation supports device TLB management. Select PCI_ATS explicitly so that the pci_ats helpers are always available. 
Signed-off-by: Lu Baolu Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210512065313.3441309-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210610020115.1637656-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index affb0d53f22a..92af4c93f991 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -13,6 +13,7 @@ config INTEL_IOMMU select SWIOTLB select IOASID select IOMMU_DMA + select PCI_ATS help DMA remapping (DMAR) devices support enables independent address translations for Direct Memory Access (DMA) from devices. -- Gitee From d0f75d2f4d560732ec2a25e0c5f2178ae0b71733 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 10 Jun 2021 10:00:57 +0800 Subject: [PATCH 0452/2161] iommu/vt-d: Support asynchronous IOMMU nested capabilities commit 521f546b4e4cedfbfbb5787f940a592dd20dd1f2 upstream. Current VT-d implementation supports nested translation only if all underlying IOMMUs support the nested capability. This is unnecessary as the upper layer is allowed to create different containers and set them with different type of iommu backend. The IOMMU driver needs to guarantee that devices attached to a nested mode iommu_domain should support nested capabilility. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210517065701.5078-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210610020115.1637656-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 5689792b9696..a90098f0aff6 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4805,6 +4805,13 @@ static int prepare_domain_attach_device(struct iommu_domain *domain, if (!iommu) return -ENODEV; + if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) && + !ecap_nest(iommu->ecap)) { + dev_err(dev, "%s: iommu not support nested translation\n", + iommu->name); + return -EINVAL; + } + /* check if this iommu agaw is sufficient for max mapped address */ addr_width = agaw_to_width(iommu->agaw); if (addr_width > cap_mgaw(iommu->cap)) @@ -5522,7 +5529,7 @@ intel_iommu_enable_nesting(struct iommu_domain *domain) int ret = -ENODEV; spin_lock_irqsave(&device_domain_lock, flags); - if (nested_mode_support() && list_empty(&dmar_domain->devices)) { + if (list_empty(&dmar_domain->devices)) { dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; ret = 0; -- Gitee From a527ac9f6abeb115f7567abdc756d4117b15854f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 10 Jun 2021 10:00:58 +0800 Subject: [PATCH 0453/2161] iommu/vt-d: Add pasid private data helpers commit 100b8a14a37074796a467d6a9147317538ee70cf upstream. We are about to use iommu_sva_alloc/free_pasid() helpers in iommu core. That means the pasid life cycle will be managed by iommu core. Use a local array to save the per pasid private data instead of attaching it the real pasid. 
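The diff that follows implements this with an xarray keyed by PASID. A minimal sketch of that lookup pattern is shown here as an illustration only; the example_ names are hypothetical, while the xa_alloc()/xa_load()/xa_erase() calls mirror the hunk.

#include <linux/types.h>
#include <linux/xarray.h>

static DEFINE_XARRAY_ALLOC(example_pasid_array);

/* Store @priv at exactly index @pasid; XA_LIMIT(pasid, pasid) pins the index. */
static int example_pasid_add(u32 pasid, void *priv)
{
	return xa_alloc(&example_pasid_array, &pasid, priv,
			XA_LIMIT(pasid, pasid), GFP_ATOMIC);
}

static void *example_pasid_find(u32 pasid)
{
	return xa_load(&example_pasid_array, pasid);
}

static void example_pasid_remove(u32 pasid)
{
	xa_erase(&example_pasid_array, pasid);
}
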
Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210520031531.712333-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210610020115.1637656-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 62 ++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 8e5c19b4abe6..0f96465e55e9 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,23 @@ static void intel_svm_drain_prq(struct device *dev, u32 pasid); #define PRQ_ORDER 0 +static DEFINE_XARRAY_ALLOC(pasid_private_array); +static int pasid_private_add(ioasid_t pasid, void *priv) +{ + return xa_alloc(&pasid_private_array, &pasid, priv, + XA_LIMIT(pasid, pasid), GFP_ATOMIC); +} + +static void pasid_private_remove(ioasid_t pasid) +{ + xa_erase(&pasid_private_array, pasid); +} + +static void *pasid_private_find(ioasid_t pasid) +{ + return xa_load(&pasid_private_array, pasid); +} + int intel_svm_enable_prq(struct intel_iommu *iommu) { struct page *pages; @@ -224,7 +242,7 @@ static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid, if (pasid == INVALID_IOASID || pasid >= PASID_MAX) return -EINVAL; - svm = ioasid_find(NULL, pasid, NULL); + svm = pasid_private_find(pasid); if (IS_ERR(svm)) return PTR_ERR(svm); @@ -334,7 +352,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, svm->gpasid = data->gpasid; svm->flags |= SVM_FLAG_GUEST_PASID; } - ioasid_set_data(data->hpasid, svm); + pasid_private_add(data->hpasid, svm); INIT_LIST_HEAD_RCU(&svm->devs); mmput(svm->mm); } @@ -388,7 +406,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, list_add_rcu(&sdev->list, &svm->devs); out: if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) { - ioasid_set_data(data->hpasid, NULL); + pasid_private_remove(data->hpasid); kfree(svm); } @@ -431,7 +449,7 @@ int intel_svm_unbind_gpasid(struct device *dev, u32 pasid) * the unbind, IOMMU driver will get notified * and perform cleanup. 
*/ - ioasid_set_data(pasid, NULL); + pasid_private_remove(pasid); kfree(svm); } } @@ -547,8 +565,7 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, svm = kzalloc(sizeof(*svm), GFP_KERNEL); if (!svm) { ret = -ENOMEM; - kfree(sdev); - goto out; + goto sdev_err; } if (pasid_max > intel_pasid_max_id) @@ -556,13 +573,16 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, /* Do not use PASID 0, reserved for RID to PASID */ svm->pasid = ioasid_alloc(NULL, PASID_MIN, - pasid_max - 1, svm); + pasid_max - 1, NULL); if (svm->pasid == INVALID_IOASID) { - kfree(svm); - kfree(sdev); ret = -ENOSPC; - goto out; + goto svm_err; } + + ret = pasid_private_add(svm->pasid, svm); + if (ret) + goto pasid_err; + svm->notifier.ops = &intel_mmuops; svm->mm = mm; svm->flags = flags; @@ -571,12 +591,8 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, ret = -ENOMEM; if (mm) { ret = mmu_notifier_register(&svm->notifier, mm); - if (ret) { - ioasid_put(svm->pasid); - kfree(svm); - kfree(sdev); - goto out; - } + if (ret) + goto priv_err; } spin_lock_irqsave(&iommu->lock, iflags); @@ -590,8 +606,13 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, if (ret) { if (mm) mmu_notifier_unregister(&svm->notifier, mm); +priv_err: + pasid_private_remove(svm->pasid); +pasid_err: ioasid_put(svm->pasid); +svm_err: kfree(svm); +sdev_err: kfree(sdev); goto out; } @@ -614,10 +635,8 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, (cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0)); spin_unlock_irqrestore(&iommu->lock, iflags); - if (ret) { - kfree(sdev); - goto out; - } + if (ret) + goto sdev_err; } list_add_rcu(&sdev->list, &svm->devs); success: @@ -670,6 +689,7 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) load_pasid(svm->mm, PASID_DISABLED); } list_del(&svm->list); + pasid_private_remove(svm->pasid); /* We mandate that no page faults may be outstanding * for the PASID when intel_svm_unbind_mm() is called. * If that is not obeyed, subtle errors will happen. @@ -924,7 +944,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) } if (!svm || svm->pasid != req->pasid) { rcu_read_lock(); - svm = ioasid_find(NULL, req->pasid, NULL); + svm = pasid_private_find(req->pasid); /* It *can't* go away, because the driver is not permitted * to unbind the mm while any page faults are outstanding. * So we only need RCU to protect the internal idr code. */ -- Gitee From 0cf7574cd0afb2ffc0b493a0c504aee6877cca9a Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 10 Jun 2021 10:00:59 +0800 Subject: [PATCH 0454/2161] iommu/vt-d: Use iommu_sva_alloc(free)_pasid() helpers commit 40483774141625b9685b177fb6e1f36de48d33f8 upstream. Align the pasid alloc/free code with the generic helpers defined in the iommu core. This also refactored the SVA binding code to improve the readability. 
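A condensed sketch of the calling convention the patch moves to, for illustration. The helper signatures are taken from the hunks below; the example_ wrappers and the locally defined minimum PASID are assumptions made only to keep the snippet self-contained.

#include <linux/pci.h>
#include <linux/ioasid.h>
#include "iommu-sva-lib.h"	/* iommu_sva_alloc_pasid(), iommu_sva_free_pasid() */

#define EXAMPLE_PASID_MIN	1	/* PASID 0 stays reserved for RID2PASID */

static int example_alloc_pasid(struct device *dev, struct mm_struct *mm,
			       ioasid_t iommu_max_pasid)
{
	/* PCI devices may support fewer PASID bits than the IOMMU itself. */
	ioasid_t max_pasid = dev_is_pci(dev) ?
			pci_max_pasids(to_pci_dev(dev)) : iommu_max_pasid;

	return iommu_sva_alloc_pasid(mm, EXAMPLE_PASID_MIN, max_pasid - 1);
}

static void example_free_pasid(struct mm_struct *mm)
{
	iommu_sva_free_pasid(mm);
}
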
Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210520031531.712333-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210610020115.1637656-8-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/Kconfig | 1 + drivers/iommu/intel/iommu.c | 3 + drivers/iommu/intel/svm.c | 278 +++++++++++++++--------------------- include/linux/intel-iommu.h | 1 - 4 files changed, 120 insertions(+), 163 deletions(-) diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 92af4c93f991..cc18ae84af2f 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -41,6 +41,7 @@ config INTEL_IOMMU_SVM select PCI_PRI select MMU_NOTIFIER select IOASID + select IOMMU_SVA_LIB help Shared Virtual Memory (SVM) provides a facility for devices to access DMA resources through process address space by diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index a90098f0aff6..4e0a9227b148 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5459,6 +5459,9 @@ intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) if (!info) return -EINVAL; + if (intel_iommu_enable_pasid(info->iommu, dev)) + return -ENODEV; + if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled) return -EINVAL; diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 0f96465e55e9..c2bb7494e516 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -23,9 +23,11 @@ #include #include "pasid.h" +#include "../iommu-sva-lib.h" static irqreturn_t prq_event_thread(int irq, void *d); static void intel_svm_drain_prq(struct device *dev, u32 pasid); +#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva) #define PRQ_ORDER 0 @@ -222,7 +224,6 @@ static const struct mmu_notifier_ops intel_mmuops = { }; static DEFINE_MUTEX(pasid_mutex); -static LIST_HEAD(global_svm_list); #define for_each_svm_dev(sdev, svm, d) \ list_for_each_entry((sdev), &(svm)->devs, list) \ @@ -477,79 +478,80 @@ static void load_pasid(struct mm_struct *mm, u32 pasid) mutex_unlock(&mm->context.lock); } -/* Caller must hold pasid_mutex, mm reference */ -static int -intel_svm_bind_mm(struct device *dev, unsigned int flags, - struct mm_struct *mm, struct intel_svm_dev **sd) +static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm, + unsigned int flags) { - struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); - struct intel_svm *svm = NULL, *t; - struct device_domain_info *info; - struct intel_svm_dev *sdev; - unsigned long iflags; - int pasid_max; - int ret; + ioasid_t max_pasid = dev_is_pci(dev) ? 
+ pci_max_pasids(to_pci_dev(dev)) : intel_pasid_max_id; - if (!iommu || dmar_disabled) - return -EINVAL; + return iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1); +} - if (!intel_svm_capable(iommu)) - return -ENOTSUPP; +static void intel_svm_free_pasid(struct mm_struct *mm) +{ + iommu_sva_free_pasid(mm); +} - if (dev_is_pci(dev)) { - pasid_max = pci_max_pasids(to_pci_dev(dev)); - if (pasid_max < 0) - return -EINVAL; - } else - pasid_max = 1 << 20; +static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, + struct device *dev, + struct mm_struct *mm, + unsigned int flags) +{ + struct device_domain_info *info = get_domain_info(dev); + unsigned long iflags, sflags; + struct intel_svm_dev *sdev; + struct intel_svm *svm; + int ret = 0; - /* Bind supervisor PASID shuld have mm = NULL */ - if (flags & SVM_FLAG_SUPERVISOR_MODE) { - if (!ecap_srs(iommu->ecap) || mm) { - pr_err("Supervisor PASID with user provided mm.\n"); - return -EINVAL; - } - } + svm = pasid_private_find(mm->pasid); + if (!svm) { + svm = kzalloc(sizeof(*svm), GFP_KERNEL); + if (!svm) + return ERR_PTR(-ENOMEM); - list_for_each_entry(t, &global_svm_list, list) { - if (t->mm != mm) - continue; + svm->pasid = mm->pasid; + svm->mm = mm; + svm->flags = flags; + INIT_LIST_HEAD_RCU(&svm->devs); - svm = t; - if (svm->pasid >= pasid_max) { - dev_warn(dev, - "Limited PASID width. Cannot use existing PASID %d\n", - svm->pasid); - ret = -ENOSPC; - goto out; + if (!(flags & SVM_FLAG_SUPERVISOR_MODE)) { + svm->notifier.ops = &intel_mmuops; + ret = mmu_notifier_register(&svm->notifier, mm); + if (ret) { + kfree(svm); + return ERR_PTR(ret); + } } - /* Find the matching device in svm list */ - for_each_svm_dev(sdev, svm, dev) { - sdev->users++; - goto success; + ret = pasid_private_add(svm->pasid, svm); + if (ret) { + if (svm->notifier.ops) + mmu_notifier_unregister(&svm->notifier, mm); + kfree(svm); + return ERR_PTR(ret); } + } - break; + /* Find the matching device in svm list */ + for_each_svm_dev(sdev, svm, dev) { + sdev->users++; + goto success; } sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); if (!sdev) { ret = -ENOMEM; - goto out; + goto free_svm; } + sdev->dev = dev; sdev->iommu = iommu; - - ret = intel_iommu_enable_pasid(iommu, dev); - if (ret) { - kfree(sdev); - goto out; - } - - info = get_domain_info(dev); sdev->did = FLPT_DEFAULT_DID; sdev->sid = PCI_DEVID(info->bus, info->devfn); + sdev->users = 1; + sdev->pasid = svm->pasid; + sdev->sva.dev = dev; + init_rcu_head(&sdev->rcu); if (info->ats_enabled) { sdev->dev_iotlb = 1; sdev->qdep = info->ats_qdep; @@ -557,96 +559,37 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, sdev->qdep = 0; } - /* Finish the setup now we know we're keeping it */ - sdev->users = 1; - init_rcu_head(&sdev->rcu); - - if (!svm) { - svm = kzalloc(sizeof(*svm), GFP_KERNEL); - if (!svm) { - ret = -ENOMEM; - goto sdev_err; - } - - if (pasid_max > intel_pasid_max_id) - pasid_max = intel_pasid_max_id; - - /* Do not use PASID 0, reserved for RID to PASID */ - svm->pasid = ioasid_alloc(NULL, PASID_MIN, - pasid_max - 1, NULL); - if (svm->pasid == INVALID_IOASID) { - ret = -ENOSPC; - goto svm_err; - } - - ret = pasid_private_add(svm->pasid, svm); - if (ret) - goto pasid_err; + /* Setup the pasid table: */ + sflags = (flags & SVM_FLAG_SUPERVISOR_MODE) ? + PASID_FLAG_SUPERVISOR_MODE : 0; + sflags |= cpu_feature_enabled(X86_FEATURE_LA57) ? 
PASID_FLAG_FL5LP : 0; + spin_lock_irqsave(&iommu->lock, iflags); + ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, mm->pasid, + FLPT_DEFAULT_DID, sflags); + spin_unlock_irqrestore(&iommu->lock, iflags); - svm->notifier.ops = &intel_mmuops; - svm->mm = mm; - svm->flags = flags; - INIT_LIST_HEAD_RCU(&svm->devs); - INIT_LIST_HEAD(&svm->list); - ret = -ENOMEM; - if (mm) { - ret = mmu_notifier_register(&svm->notifier, mm); - if (ret) - goto priv_err; - } + if (ret) + goto free_sdev; - spin_lock_irqsave(&iommu->lock, iflags); - ret = intel_pasid_setup_first_level(iommu, dev, - mm ? mm->pgd : init_mm.pgd, - svm->pasid, FLPT_DEFAULT_DID, - (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) | - (cpu_feature_enabled(X86_FEATURE_LA57) ? - PASID_FLAG_FL5LP : 0)); - spin_unlock_irqrestore(&iommu->lock, iflags); - if (ret) { - if (mm) - mmu_notifier_unregister(&svm->notifier, mm); -priv_err: - pasid_private_remove(svm->pasid); -pasid_err: - ioasid_put(svm->pasid); -svm_err: - kfree(svm); -sdev_err: - kfree(sdev); - goto out; - } + /* The newly allocated pasid is loaded to the mm. */ + if (!(flags & SVM_FLAG_SUPERVISOR_MODE) && list_empty(&svm->devs)) + load_pasid(mm, svm->pasid); - list_add_tail(&svm->list, &global_svm_list); - if (mm) { - /* The newly allocated pasid is loaded to the mm. */ - load_pasid(mm, svm->pasid); - } - } else { - /* - * Binding a new device with existing PASID, need to setup - * the PASID entry. - */ - spin_lock_irqsave(&iommu->lock, iflags); - ret = intel_pasid_setup_first_level(iommu, dev, - mm ? mm->pgd : init_mm.pgd, - svm->pasid, FLPT_DEFAULT_DID, - (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) | - (cpu_feature_enabled(X86_FEATURE_LA57) ? - PASID_FLAG_FL5LP : 0)); - spin_unlock_irqrestore(&iommu->lock, iflags); - if (ret) - goto sdev_err; - } list_add_rcu(&sdev->list, &svm->devs); success: - sdev->pasid = svm->pasid; - sdev->sva.dev = dev; - if (sd) - *sd = sdev; - ret = 0; -out: - return ret; + return &sdev->sva; + +free_sdev: + kfree(sdev); +free_svm: + if (list_empty(&svm->devs)) { + if (svm->notifier.ops) + mmu_notifier_unregister(&svm->notifier, mm); + pasid_private_remove(mm->pasid); + kfree(svm); + } + + return ERR_PTR(ret); } /* Caller must hold pasid_mutex */ @@ -655,6 +598,7 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) struct intel_svm_dev *sdev; struct intel_iommu *iommu; struct intel_svm *svm; + struct mm_struct *mm; int ret = -EINVAL; iommu = device_to_iommu(dev, NULL, NULL); @@ -664,6 +608,7 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev); if (ret) goto out; + mm = svm->mm; if (sdev) { sdev->users--; @@ -682,13 +627,12 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { - ioasid_put(svm->pasid); - if (svm->mm) { - mmu_notifier_unregister(&svm->notifier, svm->mm); + intel_svm_free_pasid(mm); + if (svm->notifier.ops) { + mmu_notifier_unregister(&svm->notifier, mm); /* Clear mm's pasid. */ - load_pasid(svm->mm, PASID_DISABLED); + load_pasid(mm, PASID_DISABLED); } - list_del(&svm->list); pasid_private_remove(svm->pasid); /* We mandate that no page faults may be outstanding * for the PASID when intel_svm_unbind_mm() is called. 
@@ -1073,31 +1017,42 @@ static irqreturn_t prq_event_thread(int irq, void *d) return IRQ_RETVAL(handled); } -#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva) -struct iommu_sva * -intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) +struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) { - struct iommu_sva *sva = ERR_PTR(-EINVAL); - struct intel_svm_dev *sdev = NULL; + struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); unsigned int flags = 0; + struct iommu_sva *sva; int ret; - /* - * TODO: Consolidate with generic iommu-sva bind after it is merged. - * It will require shared SVM data structures, i.e. combine io_mm - * and intel_svm etc. - */ if (drvdata) flags = *(unsigned int *)drvdata; + + if (flags & SVM_FLAG_SUPERVISOR_MODE) { + if (!ecap_srs(iommu->ecap)) { + dev_err(dev, "%s: Supervisor PASID not supported\n", + iommu->name); + return ERR_PTR(-EOPNOTSUPP); + } + + if (mm) { + dev_err(dev, "%s: Supervisor PASID with user provided mm\n", + iommu->name); + return ERR_PTR(-EINVAL); + } + + mm = &init_mm; + } + mutex_lock(&pasid_mutex); - ret = intel_svm_bind_mm(dev, flags, mm, &sdev); - if (ret) - sva = ERR_PTR(ret); - else if (sdev) - sva = &sdev->sva; - else - WARN(!sdev, "SVM bind succeeded with no sdev!\n"); + ret = intel_svm_alloc_pasid(dev, mm, flags); + if (ret) { + mutex_unlock(&pasid_mutex); + return ERR_PTR(ret); + } + sva = intel_svm_bind_mm(iommu, dev, mm, flags); + if (IS_ERR_OR_NULL(sva)) + intel_svm_free_pasid(mm); mutex_unlock(&pasid_mutex); return sva; @@ -1105,10 +1060,9 @@ intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) void intel_svm_unbind(struct iommu_sva *sva) { - struct intel_svm_dev *sdev; + struct intel_svm_dev *sdev = to_intel_svm_dev(sva); mutex_lock(&pasid_mutex); - sdev = to_intel_svm_dev(sva); intel_svm_unbind_mm(sdev->dev, sdev->pasid); mutex_unlock(&pasid_mutex); } diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 334df5a91f85..5767f51371e3 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -791,7 +791,6 @@ struct intel_svm { u32 pasid; int gpasid; /* In case that guest PASID is different from host PASID */ struct list_head devs; - struct list_head list; }; #else static inline void intel_svm_check(struct intel_iommu *iommu) {} -- Gitee From 5503b4a8859ab8dfcc9c425cf4fc4637f7348070 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 10 Jun 2021 10:01:00 +0800 Subject: [PATCH 0455/2161] iommu/vt-d: Use common helper to lookup svm devices commit 9e52cc0fedb0fd6187a2bd01b859ea8150b588c2 upstream. It's common to iterate the svm device list and find a matched device. Add common helpers to do this and consolidate the code. 
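The consolidated helpers added below boil down to an RCU-protected list walk. A generic sketch of that idiom follows; the example_ types are simplified stand-ins for the driver's structures.

#include <linux/types.h>
#include <linux/rculist.h>

struct example_dev {
	struct list_head list;
	u16 sid;
};

/* Walk an RCU-protected list and return the first entry matching @sid. */
static struct example_dev *example_lookup_by_sid(struct list_head *devs, u16 sid)
{
	struct example_dev *found = NULL, *t;

	rcu_read_lock();
	list_for_each_entry_rcu(t, devs, list) {
		if (t->sid == sid) {
			found = t;
			break;
		}
	}
	rcu_read_unlock();

	return found;
}
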
Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210520031531.712333-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210610020115.1637656-9-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 68 +++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index c2bb7494e516..29835b702933 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -48,6 +48,40 @@ static void *pasid_private_find(ioasid_t pasid) return xa_load(&pasid_private_array, pasid); } +static struct intel_svm_dev * +svm_lookup_device_by_sid(struct intel_svm *svm, u16 sid) +{ + struct intel_svm_dev *sdev = NULL, *t; + + rcu_read_lock(); + list_for_each_entry_rcu(t, &svm->devs, list) { + if (t->sid == sid) { + sdev = t; + break; + } + } + rcu_read_unlock(); + + return sdev; +} + +static struct intel_svm_dev * +svm_lookup_device_by_dev(struct intel_svm *svm, struct device *dev) +{ + struct intel_svm_dev *sdev = NULL, *t; + + rcu_read_lock(); + list_for_each_entry_rcu(t, &svm->devs, list) { + if (t->dev == dev) { + sdev = t; + break; + } + } + rcu_read_unlock(); + + return sdev; +} + int intel_svm_enable_prq(struct intel_iommu *iommu) { struct page *pages; @@ -225,15 +259,11 @@ static const struct mmu_notifier_ops intel_mmuops = { static DEFINE_MUTEX(pasid_mutex); -#define for_each_svm_dev(sdev, svm, d) \ - list_for_each_entry((sdev), &(svm)->devs, list) \ - if ((d) != (sdev)->dev) {} else - static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid, struct intel_svm **rsvm, struct intel_svm_dev **rsdev) { - struct intel_svm_dev *d, *sdev = NULL; + struct intel_svm_dev *sdev = NULL; struct intel_svm *svm; /* The caller should hold the pasid_mutex lock */ @@ -256,15 +286,7 @@ static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid, */ if (WARN_ON(list_empty(&svm->devs))) return -EINVAL; - - rcu_read_lock(); - list_for_each_entry_rcu(d, &svm->devs, list) { - if (d->dev == dev) { - sdev = d; - break; - } - } - rcu_read_unlock(); + sdev = svm_lookup_device_by_dev(svm, dev); out: *rsvm = svm; @@ -533,7 +555,8 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, } /* Find the matching device in svm list */ - for_each_svm_dev(sdev, svm, dev) { + sdev = svm_lookup_device_by_dev(svm, dev); + if (sdev) { sdev->users++; goto success; } @@ -901,19 +924,8 @@ static irqreturn_t prq_event_thread(int irq, void *d) } } - if (!sdev || sdev->sid != req->rid) { - struct intel_svm_dev *t; - - sdev = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(t, &svm->devs, list) { - if (t->sid == req->rid) { - sdev = t; - break; - } - } - rcu_read_unlock(); - } + if (!sdev || sdev->sid != req->rid) + sdev = svm_lookup_device_by_sid(svm, req->rid); /* Since we're using init_mm.pgd directly, we should never take * any faults on kernel addresses. */ -- Gitee From 6b0cb8a72df3b064e5d8998a8c0c52ae0c64d526 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 10 Jun 2021 10:01:01 +0800 Subject: [PATCH 0456/2161] iommu/vt-d: Refactor prq_event_thread() commit ae7f09b14b4ff18f65790a906d7c2fe2561568b5 upstream. Refactor prq_event_thread() by moving handling single prq event out of the main loop. 
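The shape of the refactor, reduced to a sketch: the IRQ thread only walks the ring, while per-descriptor work lives in a helper. Everything here is illustrative; the descriptor layout and byte-offset head/tail follow the driver's queue handling only loosely.

#include <linux/types.h>

struct example_desc {			/* stand-in for struct page_req_dsc */
	u64 qw[2];
};

static void example_handle_one(const struct example_desc *req)
{
	/* validate the request, find the owning context, service or reject it */
}

/* @head and @tail are byte offsets into @ring, as read from the queue registers. */
static void example_event_loop(struct example_desc *ring, u32 head, u32 tail,
			       u32 ring_mask)
{
	while (head != tail) {
		example_handle_one(&ring[head / sizeof(*ring)]);
		head = (head + sizeof(*ring)) & ring_mask;
	}
}
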
Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210520031531.712333-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210610020115.1637656-10-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 237 ++++++++++++++++++++++---------------- 1 file changed, 135 insertions(+), 102 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 29835b702933..bd30457230e6 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -866,141 +866,174 @@ intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc) return iommu_report_device_fault(dev, &event); } +static void handle_bad_prq_event(struct intel_iommu *iommu, + struct page_req_dsc *req, int result) +{ + struct qi_desc desc; + + pr_err("%s: Invalid page request: %08llx %08llx\n", + iommu->name, ((unsigned long long *)req)[0], + ((unsigned long long *)req)[1]); + + /* + * Per VT-d spec. v3.0 ch7.7, system software must + * respond with page group response if private data + * is present (PDP) or last page in group (LPIG) bit + * is set. This is an additional VT-d feature beyond + * PCI ATS spec. + */ + if (!req->lpig && !req->priv_data_present) + return; + + desc.qw0 = QI_PGRP_PASID(req->pasid) | + QI_PGRP_DID(req->rid) | + QI_PGRP_PASID_P(req->pasid_present) | + QI_PGRP_PDP(req->priv_data_present) | + QI_PGRP_RESP_CODE(result) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(req->prg_index) | + QI_PGRP_LPIG(req->lpig); + desc.qw2 = 0; + desc.qw3 = 0; + + if (req->priv_data_present) + memcpy(&desc.qw2, req->priv_data, sizeof(req->priv_data)); + qi_submit_sync(iommu, &desc, 1, 0); +} + +static void handle_single_prq_event(struct intel_iommu *iommu, + struct mm_struct *mm, + struct page_req_dsc *req) +{ + u64 address = (u64)req->addr << VTD_PAGE_SHIFT; + int result = QI_RESP_INVALID; + struct vm_area_struct *vma; + struct qi_desc desc; + unsigned int flags; + vm_fault_t ret; + + /* If the mm is already defunct, don't handle faults. */ + if (!mmget_not_zero(mm)) + goto response; + + down_read(&svm->mm->mmap_sem); + vma = find_extend_vma(mm, address); + if (!vma || address < vma->vm_start) + goto invalid; + + if (access_error(vma, req)) + goto invalid; + + flags = FAULT_FLAG_USER | FAULT_FLAG_REMOTE; + if (req->wr_req) + flags |= FAULT_FLAG_WRITE; + + ret = handle_mm_fault(vma, address, flags, NULL); + if (!(ret & VM_FAULT_ERROR)) + result = QI_RESP_SUCCESS; +invalid: + up_read(&svm->mm->mmap_sem); + mmput(mm); + +response: + if (!(req->lpig || req->priv_data_present)) + return; + + desc.qw0 = QI_PGRP_PASID(req->pasid) | + QI_PGRP_DID(req->rid) | + QI_PGRP_PASID_P(req->pasid_present) | + QI_PGRP_PDP(req->priv_data_present) | + QI_PGRP_RESP_CODE(result) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(req->prg_index) | + QI_PGRP_LPIG(req->lpig); + desc.qw2 = 0; + desc.qw3 = 0; + + if (req->priv_data_present) + memcpy(&desc.qw2, req->priv_data, sizeof(req->priv_data)); + + qi_submit_sync(iommu, &desc, 1, 0); +} + static irqreturn_t prq_event_thread(int irq, void *d) { struct intel_svm_dev *sdev = NULL; struct intel_iommu *iommu = d; struct intel_svm *svm = NULL; - int head, tail, handled = 0; - unsigned int flags = 0; + struct page_req_dsc *req; + int head, tail, handled; + u64 address; - /* Clear PPR bit before reading head/tail registers, to - * ensure that we get a new interrupt if needed. 
*/ + /* + * Clear PPR bit before reading head/tail registers, to ensure that + * we get a new interrupt if needed. + */ writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG); tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + handled = (head != tail); while (head != tail) { - struct vm_area_struct *vma; - struct page_req_dsc *req; - struct qi_desc resp; - int result; - vm_fault_t ret; - u64 address; - - handled = 1; req = &iommu->prq[head / sizeof(*req)]; - result = QI_RESP_INVALID; address = (u64)req->addr << VTD_PAGE_SHIFT; - if (!req->pasid_present) { - pr_err("%s: Page request without PASID: %08llx %08llx\n", - iommu->name, ((unsigned long long *)req)[0], - ((unsigned long long *)req)[1]); - goto no_pasid; + + if (unlikely(!req->pasid_present)) { + pr_err("IOMMU: %s: Page request without PASID\n", + iommu->name); +bad_req: + svm = NULL; + sdev = NULL; + handle_bad_prq_event(iommu, req, QI_RESP_INVALID); + goto prq_advance; } - /* We shall not receive page request for supervisor SVM */ - if (req->pm_req && (req->rd_req | req->wr_req)) { - pr_err("Unexpected page request in Privilege Mode"); - /* No need to find the matching sdev as for bad_req */ - goto no_pasid; + + if (unlikely(!is_canonical_address(address))) { + pr_err("IOMMU: %s: Address is not canonical\n", + iommu->name); + goto bad_req; } - /* DMA read with exec requeset is not supported. */ - if (req->exe_req && req->rd_req) { - pr_err("Execution request not supported\n"); - goto no_pasid; + + if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) { + pr_err("IOMMU: %s: Page request in Privilege Mode\n", + iommu->name); + goto bad_req; + } + + if (unlikely(req->exe_req && req->rd_req)) { + pr_err("IOMMU: %s: Execution request not supported\n", + iommu->name); + goto bad_req; } if (!svm || svm->pasid != req->pasid) { rcu_read_lock(); svm = pasid_private_find(req->pasid); - /* It *can't* go away, because the driver is not permitted + /* + * It can't go away, because the driver is not permitted * to unbind the mm while any page faults are outstanding. - * So we only need RCU to protect the internal idr code. */ - rcu_read_unlock(); - if (IS_ERR_OR_NULL(svm)) { - pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n", - iommu->name, req->pasid, ((unsigned long long *)req)[0], - ((unsigned long long *)req)[1]); - goto no_pasid; - } + */ + svm = pasid_private_find(req->pasid); + if (IS_ERR_OR_NULL(svm) || (svm->flags & SVM_FLAG_SUPERVISOR_MODE)) + goto bad_req; } - if (!sdev || sdev->sid != req->rid) + if (!sdev || sdev->sid != req->rid) { sdev = svm_lookup_device_by_sid(svm, req->rid); - - /* Since we're using init_mm.pgd directly, we should never take - * any faults on kernel addresses. */ - if (!svm->mm) - goto bad_req; - - /* If address is not canonical, return invalid response */ - if (!is_canonical_address(address)) - goto bad_req; + if (!sdev) + goto bad_req; + } /* * If prq is to be handled outside iommu driver via receiver of * the fault notifiers, we skip the page response here. */ if (svm->flags & SVM_FLAG_GUEST_MODE) { - if (sdev && !intel_svm_prq_report(sdev->dev, req)) + if (!intel_svm_prq_report(sdev->dev, req)) goto prq_advance; else goto bad_req; } - - /* If the mm is already defunct, don't handle faults. 
*/ - if (!mmget_not_zero(svm->mm)) - goto bad_req; - - down_read(&svm->mm->mmap_sem); - vma = find_extend_vma(svm->mm, address); - if (!vma || address < vma->vm_start) - goto invalid; - - if (access_error(vma, req)) - goto invalid; - - flags = FAULT_FLAG_USER | FAULT_FLAG_REMOTE; - if (req->wr_req) - flags |= FAULT_FLAG_WRITE; - - ret = handle_mm_fault(vma, address, flags); - if (ret & VM_FAULT_ERROR) - goto invalid; - - result = QI_RESP_SUCCESS; -invalid: - up_read(&svm->mm->mmap_sem); - mmput(svm->mm); -bad_req: - /* We get here in the error case where the PASID lookup failed, - and these can be NULL. Do not use them below this point! */ - sdev = NULL; - svm = NULL; -no_pasid: - if (req->lpig || req->priv_data_present) { - /* - * Per VT-d spec. v3.0 ch7.7, system software must - * respond with page group response if private data - * is present (PDP) or last page in group (LPIG) bit - * is set. This is an additional VT-d feature beyond - * PCI ATS spec. - */ - resp.qw0 = QI_PGRP_PASID(req->pasid) | - QI_PGRP_DID(req->rid) | - QI_PGRP_PASID_P(req->pasid_present) | - QI_PGRP_PDP(req->priv_data_present) | - QI_PGRP_RESP_CODE(result) | - QI_PGRP_RESP_TYPE; - resp.qw1 = QI_PGRP_IDX(req->prg_index) | - QI_PGRP_LPIG(req->lpig); - resp.qw2 = 0; - resp.qw3 = 0; - - if (req->priv_data_present) - memcpy(&resp.qw2, req->priv_data, - sizeof(req->priv_data)); - qi_submit_sync(iommu, &resp, 1, 0); - } + handle_single_prq_event(iommu, svm->mm, req); prq_advance: head = (head + sizeof(*req)) & PRQ_RING_MASK; } -- Gitee From 3578757f84223db761f614d827e1ce84ed1936b0 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 10 Jun 2021 10:01:02 +0800 Subject: [PATCH 0457/2161] iommu/vt-d: Allocate/register iopf queue for sva devices commit 4c82b88696ac57810ab923b3c5b0734646b9b69f upstream. This allocates and registers the iopf queue infrastructure for devices which want to support IO page fault for SVA. 
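The hunks below wire an IOPF queue into PRQ setup with goto-based unwinding. A compressed sketch of that ordering is given here; the example_ struct, handler and labels are hypothetical, while iopf_queue_alloc()/iopf_queue_free() and request_threaded_irq() are used as in the diff.

#include <linux/interrupt.h>
#include <linux/errno.h>
#include "iommu-sva-lib.h"	/* iopf_queue_alloc(), iopf_queue_free() */

struct example_iommu {		/* hypothetical container, for the sketch only */
	const char *name;
	int pr_irq;
	struct iopf_queue *iopf_queue;
};

static irqreturn_t example_prq_thread(int irq, void *data)
{
	return IRQ_HANDLED;	/* a real driver walks the page request queue here */
}

static int example_enable_prq(struct example_iommu *iommu)
{
	int ret;

	iommu->iopf_queue = iopf_queue_alloc(iommu->name);
	if (!iommu->iopf_queue)
		return -ENOMEM;

	ret = request_threaded_irq(iommu->pr_irq, NULL, example_prq_thread,
				   IRQF_ONESHOT, "example-prq", iommu);
	if (ret)
		goto free_iopfq;	/* unwind in reverse order of setup */

	return 0;

free_iopfq:
	iopf_queue_free(iommu->iopf_queue);
	iommu->iopf_queue = NULL;
	return ret;
}
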
Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210520031531.712333-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210610020115.1637656-11-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 66 ++++++++++++++++++++++++++----------- drivers/iommu/intel/svm.c | 37 +++++++++++++++++---- include/linux/intel-iommu.h | 2 ++ 3 files changed, 79 insertions(+), 26 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 4e0a9227b148..7720c24f77ff 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -47,6 +47,7 @@ #include #include "../irq_remapping.h" +#include "../iommu-sva-lib.h" #include "pasid.h" #include "cap_audit.h" @@ -5386,6 +5387,34 @@ static int intel_iommu_disable_auxd(struct device *dev) return 0; } +static int intel_iommu_enable_sva(struct device *dev) +{ + struct device_domain_info *info = get_domain_info(dev); + struct intel_iommu *iommu = info->iommu; + + if (!info || !iommu || dmar_disabled) + return -EINVAL; + + if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) + return -ENODEV; + + if (intel_iommu_enable_pasid(iommu, dev)) + return -ENODEV; + + if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled) + return -EINVAL; + + return iopf_queue_add_device(iommu->iopf_queue, dev); +} + +static int intel_iommu_disable_sva(struct device *dev) +{ + struct device_domain_info *info = get_domain_info(dev); + struct intel_iommu *iommu = info->iommu; + + return iopf_queue_remove_device(iommu->iopf_queue, dev); +} + /* * A PCI express designated vendor specific extended capability is defined * in the section 3.7 of Intel scalable I/O virtualization technical spec @@ -5447,38 +5476,37 @@ intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) static int intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) { - if (feat == IOMMU_DEV_FEAT_AUX) + switch (feat) { + case IOMMU_DEV_FEAT_AUX: return intel_iommu_enable_auxd(dev); - if (feat == IOMMU_DEV_FEAT_IOPF) + case IOMMU_DEV_FEAT_IOPF: return intel_iommu_dev_has_feat(dev, feat) ? 
0 : -ENODEV; - if (feat == IOMMU_DEV_FEAT_SVA) { - struct device_domain_info *info = get_domain_info(dev); - - if (!info) - return -EINVAL; - - if (intel_iommu_enable_pasid(info->iommu, dev)) - return -ENODEV; + case IOMMU_DEV_FEAT_SVA: + return intel_iommu_enable_sva(dev); - if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled) - return -EINVAL; - - if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) - return 0; + default: + return -ENODEV; } - - return -ENODEV; } static int intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) { - if (feat == IOMMU_DEV_FEAT_AUX) + switch (feat) { + case IOMMU_DEV_FEAT_AUX: return intel_iommu_disable_auxd(dev); - return -ENODEV; + case IOMMU_DEV_FEAT_IOPF: + return 0; + + case IOMMU_DEV_FEAT_SVA: + return intel_iommu_disable_sva(dev); + + default: + return -ENODEV; + } } static bool diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index bd30457230e6..09dacc013f01 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -84,6 +84,7 @@ svm_lookup_device_by_dev(struct intel_svm *svm, struct device *dev) int intel_svm_enable_prq(struct intel_iommu *iommu) { + struct iopf_queue *iopfq; struct page *pages; int irq, ret; @@ -100,13 +101,20 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n", iommu->name); ret = -EINVAL; - err: - free_pages((unsigned long)iommu->prq, PRQ_ORDER); - iommu->prq = NULL; - return ret; + goto free_prq; } iommu->pr_irq = irq; + snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name), + "dmar%d-iopfq", iommu->seq_id); + iopfq = iopf_queue_alloc(iommu->iopfq_name); + if (!iopfq) { + pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name); + ret = -ENOMEM; + goto free_hwirq; + } + iommu->iopf_queue = iopfq; + snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id); ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT, @@ -114,9 +122,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) if (ret) { pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n", iommu->name); - dmar_free_hwirq(irq); - iommu->pr_irq = 0; - goto err; + goto free_iopfq; } dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); @@ -125,6 +131,18 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) init_completion(&iommu->prq_complete); return 0; + +free_iopfq: + iopf_queue_free(iommu->iopf_queue); + iommu->iopf_queue = NULL; +free_hwirq: + dmar_free_hwirq(irq); + iommu->pr_irq = 0; +free_prq: + free_pages((unsigned long)iommu->prq, PRQ_ORDER); + iommu->prq = NULL; + + return ret; } int intel_svm_finish_prq(struct intel_iommu *iommu) @@ -139,6 +157,11 @@ int intel_svm_finish_prq(struct intel_iommu *iommu) iommu->pr_irq = 0; } + if (iommu->iopf_queue) { + iopf_queue_free(iommu->iopf_queue); + iommu->iopf_queue = NULL; + } + free_pages((unsigned long)iommu->prq, PRQ_ORDER); iommu->prq = NULL; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 5767f51371e3..897ae8675e5b 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -606,6 +606,8 @@ struct intel_iommu { struct completion prq_complete; struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */ #endif + struct iopf_queue *iopf_queue; + unsigned char iopfq_name[16]; struct q_inval *qi; /* Queued invalidation info */ u32 *iommu_state; /* Store iommu states between suspend and resume.*/ -- Gitee From 
4bf2da56e5cf6ed769980f840766987bdb7393b1 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 10 Jun 2021 10:01:03 +0800 Subject: [PATCH 0458/2161] iommu/vt-d: Report prq to io-pgfault framework commit d5b9e4bfe0d8848aaf428bb4bbcc270fecadef35 upstream. Let the IO page fault requests get handled through the io-pgfault framework. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210520031531.712333-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210610020115.1637656-12-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 14 ++++++- drivers/iommu/intel/svm.c | 83 +++---------------------------------- 2 files changed, 17 insertions(+), 80 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 7720c24f77ff..5610e4c81090 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5391,6 +5391,7 @@ static int intel_iommu_enable_sva(struct device *dev) { struct device_domain_info *info = get_domain_info(dev); struct intel_iommu *iommu = info->iommu; + int ret; if (!info || !iommu || dmar_disabled) return -EINVAL; @@ -5404,15 +5405,24 @@ static int intel_iommu_enable_sva(struct device *dev) if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled) return -EINVAL; - return iopf_queue_add_device(iommu->iopf_queue, dev); + ret = iopf_queue_add_device(iommu->iopf_queue, dev); + if (!ret) + ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); + + return ret; } static int intel_iommu_disable_sva(struct device *dev) { struct device_domain_info *info = get_domain_info(dev); struct intel_iommu *iommu = info->iommu; + int ret; + + ret = iommu_unregister_device_fault_handler(dev); + if (!ret) + ret = iopf_queue_remove_device(iommu->iopf_queue, dev); - return iopf_queue_remove_device(iommu->iopf_queue, dev); + return ret; } /* diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 09dacc013f01..f564d7ee6624 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -724,22 +724,6 @@ struct page_req_dsc { #define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) -static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req) -{ - unsigned long requested = 0; - - if (req->exe_req) - requested |= VM_EXEC; - - if (req->rd_req) - requested |= VM_READ; - - if (req->wr_req) - requested |= VM_WRITE; - - return (requested & ~vma->vm_flags) != 0; -} - static bool is_canonical_address(u64 addr) { int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); @@ -809,6 +793,8 @@ static void intel_svm_drain_prq(struct device *dev, u32 pasid) goto prq_retry; } + iopf_queue_flush_dev(dev); + /* * Perform steps described in VT-d spec CH7.10 to drain page * requests and responses in hardware. @@ -924,61 +910,6 @@ static void handle_bad_prq_event(struct intel_iommu *iommu, qi_submit_sync(iommu, &desc, 1, 0); } -static void handle_single_prq_event(struct intel_iommu *iommu, - struct mm_struct *mm, - struct page_req_dsc *req) -{ - u64 address = (u64)req->addr << VTD_PAGE_SHIFT; - int result = QI_RESP_INVALID; - struct vm_area_struct *vma; - struct qi_desc desc; - unsigned int flags; - vm_fault_t ret; - - /* If the mm is already defunct, don't handle faults. 
*/ - if (!mmget_not_zero(mm)) - goto response; - - down_read(&svm->mm->mmap_sem); - vma = find_extend_vma(mm, address); - if (!vma || address < vma->vm_start) - goto invalid; - - if (access_error(vma, req)) - goto invalid; - - flags = FAULT_FLAG_USER | FAULT_FLAG_REMOTE; - if (req->wr_req) - flags |= FAULT_FLAG_WRITE; - - ret = handle_mm_fault(vma, address, flags, NULL); - if (!(ret & VM_FAULT_ERROR)) - result = QI_RESP_SUCCESS; -invalid: - up_read(&svm->mm->mmap_sem); - mmput(mm); - -response: - if (!(req->lpig || req->priv_data_present)) - return; - - desc.qw0 = QI_PGRP_PASID(req->pasid) | - QI_PGRP_DID(req->rid) | - QI_PGRP_PASID_P(req->pasid_present) | - QI_PGRP_PDP(req->priv_data_present) | - QI_PGRP_RESP_CODE(result) | - QI_PGRP_RESP_TYPE; - desc.qw1 = QI_PGRP_IDX(req->prg_index) | - QI_PGRP_LPIG(req->lpig); - desc.qw2 = 0; - desc.qw3 = 0; - - if (req->priv_data_present) - memcpy(&desc.qw2, req->priv_data, sizeof(req->priv_data)); - - qi_submit_sync(iommu, &desc, 1, 0); -} - static irqreturn_t prq_event_thread(int irq, void *d) { struct intel_svm_dev *sdev = NULL; @@ -1050,13 +981,8 @@ static irqreturn_t prq_event_thread(int irq, void *d) * If prq is to be handled outside iommu driver via receiver of * the fault notifiers, we skip the page response here. */ - if (svm->flags & SVM_FLAG_GUEST_MODE) { - if (!intel_svm_prq_report(sdev->dev, req)) - goto prq_advance; - else - goto bad_req; - } - handle_single_prq_event(iommu, svm->mm, req); + if (intel_svm_prq_report(sdev->dev, req)) + handle_bad_prq_event(iommu, req, QI_RESP_INVALID); prq_advance: head = (head + sizeof(*req)) & PRQ_RING_MASK; } @@ -1073,6 +999,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; if (head == tail) { + iopf_queue_discard_partial(iommu->iopf_queue); writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared", iommu->name); -- Gitee From f7077cfc889f977872d5b62b485efd0e75275dfc Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 1 Apr 2021 17:47:15 +0200 Subject: [PATCH 0459/2161] iommu: Add a page fault handler commit fc36479db74e957c4696b605a32c4afaa15fa6cb upstream. Some systems allow devices to handle I/O Page Faults in the core mm. For example systems implementing the PCIe PRI extension or Arm SMMU stall model. Infrastructure for reporting these recoverable page faults was added to the IOMMU core by commit 0c830e6b3282 ("iommu: Introduce device fault report API"). Add a page fault handler for host SVA. IOMMU driver can now instantiate several fault workqueues and link them to IOPF-capable devices. Drivers can choose between a single global workqueue, one per IOMMU device, one per low-level fault queue, one per domain, etc. When it receives a fault event, most commonly in an IRQ handler, the IOMMU driver reports the fault using iommu_report_device_fault(), which calls the registered handler. The page fault handler then calls the mm fault handler, and reports either success or failure with iommu_page_response(). After the handler succeeds, the hardware retries the access. The iopf_param pointer could be embedded into iommu_fault_param. But putting iopf_param into the iommu_param structure allows us not to care about ordering between calls to iopf_queue_add_device() and iommu_register_device_fault_handler(). 
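Tying the pieces together, a driver-side sketch of the registration flow this framework expects, modelled on the enable/disable paths earlier in this series. The example_ wrappers are illustrative; iopf_queue_add_device(), iopf_queue_remove_device(), iommu_queue_iopf() and the fault-handler registration calls are the ones introduced or used by these patches.

#include <linux/iommu.h>
#include "iommu-sva-lib.h"	/* iommu_queue_iopf(), iopf_queue_{add,remove}_device() */

/* Enable recoverable IOPF handling for @dev on an already-allocated @queue. */
static int example_enable_iopf(struct iopf_queue *queue, struct device *dev)
{
	int ret;

	ret = iopf_queue_add_device(queue, dev);
	if (ret)
		return ret;

	/* Faults reported for @dev are now routed into iommu_queue_iopf(). */
	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
	if (ret)
		iopf_queue_remove_device(queue, dev);

	return ret;
}

static int example_disable_iopf(struct iopf_queue *queue, struct device *dev)
{
	int ret;

	ret = iommu_unregister_device_fault_handler(dev);
	if (ret)
		return ret;

	return iopf_queue_remove_device(queue, dev);
}
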
Tested-by: Lu Baolu Reviewed-by: Eric Auger Reviewed-by: Jacob Pan Reviewed-by: Jonathan Cameron Reviewed-by: Lu Baolu Signed-off-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20210401154718.307519-7-jean-philippe@linaro.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/Makefile | 1 + drivers/iommu/io-pgfault.c | 461 ++++++++++++++++++++++++++++++++++ drivers/iommu/iommu-sva-lib.h | 53 ++++ include/linux/iommu.h | 2 + 4 files changed, 517 insertions(+) create mode 100644 drivers/iommu/io-pgfault.c diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 5bf5fca35184..20f7625ca45a 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -33,3 +33,4 @@ obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o obj-$(CONFIG_IOMMU_SVA_LIB) += iommu-sva-lib.o +obj-$(CONFIG_IOMMU_SVA_LIB) += io-pgfault.o diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c new file mode 100644 index 000000000000..326c7080ac4b --- /dev/null +++ b/drivers/iommu/io-pgfault.c @@ -0,0 +1,461 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Handle device page faults + * + * Copyright (C) 2020 ARM Ltd. + */ + +#include +#include +#include +#include +#include + +#include "iommu-sva-lib.h" + +/** + * struct iopf_queue - IO Page Fault queue + * @wq: the fault workqueue + * @devices: devices attached to this queue + * @lock: protects the device list + */ +struct iopf_queue { + struct workqueue_struct *wq; + struct list_head devices; + struct mutex lock; +}; + +/** + * struct iopf_device_param - IO Page Fault data attached to a device + * @dev: the device that owns this param + * @queue: IOPF queue + * @queue_list: index into queue->devices + * @partial: faults that are part of a Page Request Group for which the last + * request hasn't been submitted yet. 
+ */ +struct iopf_device_param { + struct device *dev; + struct iopf_queue *queue; + struct list_head queue_list; + struct list_head partial; +}; + +struct iopf_fault { + struct iommu_fault fault; + struct list_head list; +}; + +struct iopf_group { + struct iopf_fault last_fault; + struct list_head faults; + struct work_struct work; + struct device *dev; +}; + +static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf, + enum iommu_page_response_code status) +{ + struct iommu_page_response resp = { + .version = IOMMU_PAGE_RESP_VERSION_1, + .pasid = iopf->fault.prm.pasid, + .grpid = iopf->fault.prm.grpid, + .code = status, + }; + + if ((iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) && + (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID)) + resp.flags = IOMMU_PAGE_RESP_PASID_VALID; + + return iommu_page_response(dev, &resp); +} + +static enum iommu_page_response_code +iopf_handle_single(struct iopf_fault *iopf) +{ + vm_fault_t ret; + struct mm_struct *mm; + struct vm_area_struct *vma; + unsigned int access_flags = 0; + unsigned int fault_flags = FAULT_FLAG_REMOTE; + struct iommu_fault_page_request *prm = &iopf->fault.prm; + enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID; + + if (!(prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)) + return status; + + mm = iommu_sva_find(prm->pasid); + if (IS_ERR_OR_NULL(mm)) + return status; + + mmap_read_lock(mm); + + vma = find_extend_vma(mm, prm->addr); + if (!vma) + /* Unmapped area */ + goto out_put_mm; + + if (prm->perm & IOMMU_FAULT_PERM_READ) + access_flags |= VM_READ; + + if (prm->perm & IOMMU_FAULT_PERM_WRITE) { + access_flags |= VM_WRITE; + fault_flags |= FAULT_FLAG_WRITE; + } + + if (prm->perm & IOMMU_FAULT_PERM_EXEC) { + access_flags |= VM_EXEC; + fault_flags |= FAULT_FLAG_INSTRUCTION; + } + + if (!(prm->perm & IOMMU_FAULT_PERM_PRIV)) + fault_flags |= FAULT_FLAG_USER; + + if (access_flags & ~vma->vm_flags) + /* Access fault */ + goto out_put_mm; + + ret = handle_mm_fault(vma, prm->addr, fault_flags); + status = ret & VM_FAULT_ERROR ? IOMMU_PAGE_RESP_INVALID : + IOMMU_PAGE_RESP_SUCCESS; + +out_put_mm: + mmap_read_unlock(mm); + mmput(mm); + + return status; +} + +static void iopf_handle_group(struct work_struct *work) +{ + struct iopf_group *group; + struct iopf_fault *iopf, *next; + enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS; + + group = container_of(work, struct iopf_group, work); + + list_for_each_entry_safe(iopf, next, &group->faults, list) { + /* + * For the moment, errors are sticky: don't handle subsequent + * faults in the group if there is an error. + */ + if (status == IOMMU_PAGE_RESP_SUCCESS) + status = iopf_handle_single(iopf); + + if (!(iopf->fault.prm.flags & + IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) + kfree(iopf); + } + + iopf_complete_group(group->dev, &group->last_fault, status); + kfree(group); +} + +/** + * iommu_queue_iopf - IO Page Fault handler + * @fault: fault event + * @cookie: struct device, passed to iommu_register_device_fault_handler. + * + * Add a fault to the device workqueue, to be handled by mm. + * + * This module doesn't handle PCI PASID Stop Marker; IOMMU drivers must discard + * them before reporting faults. A PASID Stop Marker (LRW = 0b100) doesn't + * expect a response. It may be generated when disabling a PASID (issuing a + * PASID stop request) by some PCI devices. + * + * The PASID stop request is issued by the device driver before unbind(). 
Once + * it completes, no page request is generated for this PASID anymore and + * outstanding ones have been pushed to the IOMMU (as per PCIe 4.0r1.0 - 6.20.1 + * and 10.4.1.2 - Managing PASID TLP Prefix Usage). Some PCI devices will wait + * for all outstanding page requests to come back with a response before + * completing the PASID stop request. Others do not wait for page responses, and + * instead issue this Stop Marker that tells us when the PASID can be + * reallocated. + * + * It is safe to discard the Stop Marker because it is an optimization. + * a. Page requests, which are posted requests, have been flushed to the IOMMU + * when the stop request completes. + * b. The IOMMU driver flushes all fault queues on unbind() before freeing the + * PASID. + * + * So even though the Stop Marker might be issued by the device *after* the stop + * request completes, outstanding faults will have been dealt with by the time + * the PASID is freed. + * + * Return: 0 on success and <0 on error. + */ +int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) +{ + int ret; + struct iopf_group *group; + struct iopf_fault *iopf, *next; + struct iopf_device_param *iopf_param; + + struct device *dev = cookie; + struct dev_iommu *param = dev->iommu; + + lockdep_assert_held(¶m->lock); + + if (fault->type != IOMMU_FAULT_PAGE_REQ) + /* Not a recoverable page fault */ + return -EOPNOTSUPP; + + /* + * As long as we're holding param->lock, the queue can't be unlinked + * from the device and therefore cannot disappear. + */ + iopf_param = param->iopf_param; + if (!iopf_param) + return -ENODEV; + + if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { + iopf = kzalloc(sizeof(*iopf), GFP_KERNEL); + if (!iopf) + return -ENOMEM; + + iopf->fault = *fault; + + /* Non-last request of a group. Postpone until the last one */ + list_add(&iopf->list, &iopf_param->partial); + + return 0; + } + + group = kzalloc(sizeof(*group), GFP_KERNEL); + if (!group) { + /* + * The caller will send a response to the hardware. But we do + * need to clean up before leaving, otherwise partial faults + * will be stuck. + */ + ret = -ENOMEM; + goto cleanup_partial; + } + + group->dev = dev; + group->last_fault.fault = *fault; + INIT_LIST_HEAD(&group->faults); + list_add(&group->last_fault.list, &group->faults); + INIT_WORK(&group->work, iopf_handle_group); + + /* See if we have partial faults for this group */ + list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { + if (iopf->fault.prm.grpid == fault->prm.grpid) + /* Insert *before* the last fault */ + list_move(&iopf->list, &group->faults); + } + + queue_work(iopf_param->queue->wq, &group->work); + return 0; + +cleanup_partial: + list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { + if (iopf->fault.prm.grpid == fault->prm.grpid) { + list_del(&iopf->list); + kfree(iopf); + } + } + return ret; +} +EXPORT_SYMBOL_GPL(iommu_queue_iopf); + +/** + * iopf_queue_flush_dev - Ensure that all queued faults have been processed + * @dev: the endpoint whose faults need to be flushed. + * + * The IOMMU driver calls this before releasing a PASID, to ensure that all + * pending faults for this PASID have been handled, and won't hit the address + * space of the next process that uses this PASID. The driver must make sure + * that no new fault is added to the queue. In particular it must flush its + * low-level queue before calling this function. + * + * Return: 0 on success and <0 on error. 
+ */ +int iopf_queue_flush_dev(struct device *dev) +{ + int ret = 0; + struct iopf_device_param *iopf_param; + struct dev_iommu *param = dev->iommu; + + if (!param) + return -ENODEV; + + mutex_lock(¶m->lock); + iopf_param = param->iopf_param; + if (iopf_param) + flush_workqueue(iopf_param->queue->wq); + else + ret = -ENODEV; + mutex_unlock(¶m->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(iopf_queue_flush_dev); + +/** + * iopf_queue_discard_partial - Remove all pending partial fault + * @queue: the queue whose partial faults need to be discarded + * + * When the hardware queue overflows, last page faults in a group may have been + * lost and the IOMMU driver calls this to discard all partial faults. The + * driver shouldn't be adding new faults to this queue concurrently. + * + * Return: 0 on success and <0 on error. + */ +int iopf_queue_discard_partial(struct iopf_queue *queue) +{ + struct iopf_fault *iopf, *next; + struct iopf_device_param *iopf_param; + + if (!queue) + return -EINVAL; + + mutex_lock(&queue->lock); + list_for_each_entry(iopf_param, &queue->devices, queue_list) { + list_for_each_entry_safe(iopf, next, &iopf_param->partial, + list) { + list_del(&iopf->list); + kfree(iopf); + } + } + mutex_unlock(&queue->lock); + return 0; +} +EXPORT_SYMBOL_GPL(iopf_queue_discard_partial); + +/** + * iopf_queue_add_device - Add producer to the fault queue + * @queue: IOPF queue + * @dev: device to add + * + * Return: 0 on success and <0 on error. + */ +int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) +{ + int ret = -EBUSY; + struct iopf_device_param *iopf_param; + struct dev_iommu *param = dev->iommu; + + if (!param) + return -ENODEV; + + iopf_param = kzalloc(sizeof(*iopf_param), GFP_KERNEL); + if (!iopf_param) + return -ENOMEM; + + INIT_LIST_HEAD(&iopf_param->partial); + iopf_param->queue = queue; + iopf_param->dev = dev; + + mutex_lock(&queue->lock); + mutex_lock(¶m->lock); + if (!param->iopf_param) { + list_add(&iopf_param->queue_list, &queue->devices); + param->iopf_param = iopf_param; + ret = 0; + } + mutex_unlock(¶m->lock); + mutex_unlock(&queue->lock); + + if (ret) + kfree(iopf_param); + + return ret; +} +EXPORT_SYMBOL_GPL(iopf_queue_add_device); + +/** + * iopf_queue_remove_device - Remove producer from fault queue + * @queue: IOPF queue + * @dev: device to remove + * + * Caller makes sure that no more faults are reported for this device. + * + * Return: 0 on success and <0 on error. + */ +int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) +{ + int ret = -EINVAL; + struct iopf_fault *iopf, *next; + struct iopf_device_param *iopf_param; + struct dev_iommu *param = dev->iommu; + + if (!param || !queue) + return -EINVAL; + + mutex_lock(&queue->lock); + mutex_lock(¶m->lock); + iopf_param = param->iopf_param; + if (iopf_param && iopf_param->queue == queue) { + list_del(&iopf_param->queue_list); + param->iopf_param = NULL; + ret = 0; + } + mutex_unlock(¶m->lock); + mutex_unlock(&queue->lock); + if (ret) + return ret; + + /* Just in case some faults are still stuck */ + list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) + kfree(iopf); + + kfree(iopf_param); + + return 0; +} +EXPORT_SYMBOL_GPL(iopf_queue_remove_device); + +/** + * iopf_queue_alloc - Allocate and initialize a fault queue + * @name: a unique string identifying the queue (for workqueue) + * + * Return: the queue on success and NULL on error. 
+ */ +struct iopf_queue *iopf_queue_alloc(const char *name) +{ + struct iopf_queue *queue; + + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) + return NULL; + + /* + * The WQ is unordered because the low-level handler enqueues faults by + * group. PRI requests within a group have to be ordered, but once + * that's dealt with, the high-level function can handle groups out of + * order. + */ + queue->wq = alloc_workqueue("iopf_queue/%s", WQ_UNBOUND, 0, name); + if (!queue->wq) { + kfree(queue); + return NULL; + } + + INIT_LIST_HEAD(&queue->devices); + mutex_init(&queue->lock); + + return queue; +} +EXPORT_SYMBOL_GPL(iopf_queue_alloc); + +/** + * iopf_queue_free - Free IOPF queue + * @queue: queue to free + * + * Counterpart to iopf_queue_alloc(). The driver must not be queuing faults or + * adding/removing devices on this queue anymore. + */ +void iopf_queue_free(struct iopf_queue *queue) +{ + struct iopf_device_param *iopf_param, *next; + + if (!queue) + return; + + list_for_each_entry_safe(iopf_param, next, &queue->devices, queue_list) + iopf_queue_remove_device(queue, iopf_param->dev); + + destroy_workqueue(queue->wq); + kfree(queue); +} +EXPORT_SYMBOL_GPL(iopf_queue_free); diff --git a/drivers/iommu/iommu-sva-lib.h b/drivers/iommu/iommu-sva-lib.h index b40990aef3fd..031155010ca8 100644 --- a/drivers/iommu/iommu-sva-lib.h +++ b/drivers/iommu/iommu-sva-lib.h @@ -12,4 +12,57 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max); void iommu_sva_free_pasid(struct mm_struct *mm); struct mm_struct *iommu_sva_find(ioasid_t pasid); +/* I/O Page fault */ +struct device; +struct iommu_fault; +struct iopf_queue; + +#ifdef CONFIG_IOMMU_SVA_LIB +int iommu_queue_iopf(struct iommu_fault *fault, void *cookie); + +int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev); +int iopf_queue_remove_device(struct iopf_queue *queue, + struct device *dev); +int iopf_queue_flush_dev(struct device *dev); +struct iopf_queue *iopf_queue_alloc(const char *name); +void iopf_queue_free(struct iopf_queue *queue); +int iopf_queue_discard_partial(struct iopf_queue *queue); + +#else /* CONFIG_IOMMU_SVA_LIB */ +static inline int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) +{ + return -ENODEV; +} + +static inline int iopf_queue_add_device(struct iopf_queue *queue, + struct device *dev) +{ + return -ENODEV; +} + +static inline int iopf_queue_remove_device(struct iopf_queue *queue, + struct device *dev) +{ + return -ENODEV; +} + +static inline int iopf_queue_flush_dev(struct device *dev) +{ + return -ENODEV; +} + +static inline struct iopf_queue *iopf_queue_alloc(const char *name) +{ + return NULL; +} + +static inline void iopf_queue_free(struct iopf_queue *queue) +{ +} + +static inline int iopf_queue_discard_partial(struct iopf_queue *queue) +{ + return -ENODEV; +} +#endif /* CONFIG_IOMMU_SVA_LIB */ #endif /* _IOMMU_SVA_LIB_H */ diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 8ae360365753..609c13de6b10 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -379,6 +379,7 @@ struct iommu_fault_param { * struct dev_iommu - Collection of per-device IOMMU data * * @fault_param: IOMMU detected device fault reporting data + * @iopf_param: I/O Page Fault queue and data * @fwspec: IOMMU fwspec data * @iommu_dev: IOMMU device this device is linked to * @priv: IOMMU Driver private data @@ -389,6 +390,7 @@ struct iommu_fault_param { struct dev_iommu { struct mutex lock; struct iommu_fault_param *fault_param; + struct iopf_device_param 
*iopf_param; struct iommu_fwspec *fwspec; struct iommu_device *iommu_dev; void *priv; -- Gitee From 9e611a0e223b31a6577acbe7c7f0b200cce110c3 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 6 Apr 2022 12:10:08 +0800 Subject: [PATCH 0460/2161] iommu: Fix implicit declaration of function 'mmap_read_lock' commit opencloudos. Fix it in iopf_handle_single(). Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/io-pgfault.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 326c7080ac4b..7e72acf5eaab 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -87,7 +87,7 @@ iopf_handle_single(struct iopf_fault *iopf) if (IS_ERR_OR_NULL(mm)) return status; - mmap_read_lock(mm); + down_read(&mm->mmap_sem); vma = find_extend_vma(mm, prm->addr); if (!vma) @@ -119,7 +119,7 @@ iopf_handle_single(struct iopf_fault *iopf) IOMMU_PAGE_RESP_SUCCESS; out_put_mm: - mmap_read_unlock(mm); + up_read(&mm->mmap_sem); mmput(mm); return status; -- Gitee From 0d5b9daad582baa01e802a0743048c761f65611d Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 10 Jun 2021 10:01:04 +0800 Subject: [PATCH 0461/2161] iommu/vt-d: Add prq_report trace event commit e93a67f5a0eef3e9ab5b4649cac5c3b831c6a9db upstream. This adds a new trace event to track the page fault request report. This event will provide almost all information defined in a page request descriptor. A sample output: | prq_report: dmar0/0000:00:0a.0 seq# 1: rid=0x50 addr=0x559ef6f97 r---- pasid=0x2 index=0x1 | prq_report: dmar0/0000:00:0a.0 seq# 2: rid=0x50 addr=0x559ef6f9c rw--l pasid=0x2 index=0x1 | prq_report: dmar0/0000:00:0a.0 seq# 3: rid=0x50 addr=0x559ef6f98 r---- pasid=0x2 index=0x1 | prq_report: dmar0/0000:00:0a.0 seq# 4: rid=0x50 addr=0x559ef6f9d rw--l pasid=0x2 index=0x1 | prq_report: dmar0/0000:00:0a.0 seq# 5: rid=0x50 addr=0x559ef6f99 r---- pasid=0x2 index=0x1 | prq_report: dmar0/0000:00:0a.0 seq# 6: rid=0x50 addr=0x559ef6f9e rw--l pasid=0x2 index=0x1 | prq_report: dmar0/0000:00:0a.0 seq# 7: rid=0x50 addr=0x559ef6f9a r---- pasid=0x2 index=0x1 | prq_report: dmar0/0000:00:0a.0 seq# 8: rid=0x50 addr=0x559ef6f9f rw--l pasid=0x2 index=0x1 This will be helpful for I/O page fault related debugging. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210520031531.712333-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210610020115.1637656-13-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 7 ++++++ include/linux/intel-iommu.h | 29 +++++++++++++++++++++++ include/trace/events/intel_iommu.h | 37 ++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index f564d7ee6624..639390facfdc 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "pasid.h" #include "../iommu-sva-lib.h" @@ -977,12 +978,18 @@ static irqreturn_t prq_event_thread(int irq, void *d) goto bad_req; } + sdev->prq_seq_number++; + /* * If prq is to be handled outside iommu driver via receiver of * the fault notifiers, we skip the page response here. 
*/ if (intel_svm_prq_report(sdev->dev, req)) handle_bad_prq_event(iommu, req, QI_RESP_INVALID); + + trace_prq_report(iommu, sdev->dev, req->qw_0, req->qw_1, + req->priv_data[0], req->priv_data[1], + sdev->prq_seq_number); prq_advance: head = (head + sizeof(*req)) & PRQ_RING_MASK; } diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 897ae8675e5b..4eeaa995d389 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -778,6 +778,7 @@ struct intel_svm_dev { struct device *dev; struct intel_iommu *iommu; struct iommu_sva sva; + unsigned long prq_seq_number; u32 pasid; int users; u16 did; @@ -828,4 +829,32 @@ static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu) #define intel_iommu_enabled (0) #endif +static inline const char *decode_prq_descriptor(char *str, size_t size, + u64 dw0, u64 dw1, u64 dw2, u64 dw3) +{ + char *buf = str; + int bytes; + + bytes = snprintf(buf, size, + "rid=0x%llx addr=0x%llx %c%c%c%c%c pasid=0x%llx index=0x%llx", + FIELD_GET(GENMASK_ULL(31, 16), dw0), + FIELD_GET(GENMASK_ULL(63, 12), dw1), + dw1 & BIT_ULL(0) ? 'r' : '-', + dw1 & BIT_ULL(1) ? 'w' : '-', + dw0 & BIT_ULL(52) ? 'x' : '-', + dw0 & BIT_ULL(53) ? 'p' : '-', + dw1 & BIT_ULL(2) ? 'l' : '-', + FIELD_GET(GENMASK_ULL(51, 32), dw0), + FIELD_GET(GENMASK_ULL(11, 3), dw1)); + + /* Private Data */ + if (dw0 & BIT_ULL(9)) { + size -= bytes; + buf += bytes; + snprintf(buf, size, " private=0x%llx/0x%llx\n", dw2, dw3); + } + + return str; +} + #endif diff --git a/include/trace/events/intel_iommu.h b/include/trace/events/intel_iommu.h index e801f4910522..758b14d1a179 100644 --- a/include/trace/events/intel_iommu.h +++ b/include/trace/events/intel_iommu.h @@ -15,6 +15,8 @@ #include #include +#define MSG_MAX 256 + DECLARE_EVENT_CLASS(dma_map, TP_PROTO(struct device *dev, dma_addr_t dev_addr, phys_addr_t phys_addr, size_t size), @@ -171,6 +173,41 @@ TRACE_EVENT(qi_submit, __entry->qw0, __entry->qw1, __entry->qw2, __entry->qw3 ) ); + +TRACE_EVENT(prq_report, + TP_PROTO(struct intel_iommu *iommu, struct device *dev, + u64 dw0, u64 dw1, u64 dw2, u64 dw3, + unsigned long seq), + + TP_ARGS(iommu, dev, dw0, dw1, dw2, dw3, seq), + + TP_STRUCT__entry( + __field(u64, dw0) + __field(u64, dw1) + __field(u64, dw2) + __field(u64, dw3) + __field(unsigned long, seq) + __string(iommu, iommu->name) + __string(dev, dev_name(dev)) + __dynamic_array(char, buff, MSG_MAX) + ), + + TP_fast_assign( + __entry->dw0 = dw0; + __entry->dw1 = dw1; + __entry->dw2 = dw2; + __entry->dw3 = dw3; + __entry->seq = seq; + __assign_str(iommu, iommu->name); + __assign_str(dev, dev_name(dev)); + ), + + TP_printk("%s/%s seq# %ld: %s", + __get_str(iommu), __get_str(dev), __entry->seq, + decode_prq_descriptor(__get_str(buff), MSG_MAX, __entry->dw0, + __entry->dw1, __entry->dw2, __entry->dw3) + ) +); #endif /* _TRACE_INTEL_IOMMU_H */ /* This part must be outside protection */ -- Gitee From e6af110943bdf100b93c8cd2fb3b17a29f4669dd Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 10 Jun 2021 10:01:05 +0800 Subject: [PATCH 0462/2161] iommu/vt-d: Add common code for dmar latency performance monitors commit 55ee5e67a59a1b6f388d7a1c7b24022145f47a3e upstream. The execution time of some operations is very performance critical, such as cache invalidation and PRQ processing time. This adds some common code to monitor the execution time range of those operations. The interfaces include enabling/disabling, checking status, updating sampling data and providing a common string format for users. 
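For illustration, the intended call pattern (the one the follow-up sampling
patches in this series use) is roughly the minimal sketch below. It is not
part of the diff; "iommu" is assumed to be a valid struct intel_iommu
pointer and an IOTLB invalidation stands in as the timed operation:

  s64 start;
  char buf[1024];

  if (!dmar_latency_enable(iommu, DMAR_LATENCY_INV_IOTLB)) {
          start = ktime_to_ns(ktime_get());
          /* ... submit the IOTLB invalidation and wait for completion ... */
          dmar_latency_update(iommu, DMAR_LATENCY_INV_IOTLB,
                              ktime_to_ns(ktime_get()) - start);
  }

  /* render the bucketed counters plus min/max/average into a string */
  dmar_latency_snapshot(iommu, buf, sizeof(buf));

dmar_latency_update() takes the elapsed time in nanoseconds, and
dmar_latency_snapshot() is what the debugfs interface added later in this
series uses to print the statistics.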
Signed-off-by: Fenghua Yu Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210520031531.712333-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210610020115.1637656-14-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/Kconfig | 3 + drivers/iommu/intel/Makefile | 1 + drivers/iommu/intel/perf.c | 166 +++++++++++++++++++++++++++++++++++ drivers/iommu/intel/perf.h | 73 +++++++++++++++ include/linux/intel-iommu.h | 1 + 5 files changed, 244 insertions(+) create mode 100644 drivers/iommu/intel/perf.c create mode 100644 drivers/iommu/intel/perf.h diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index cc18ae84af2f..0f40be033154 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -3,6 +3,9 @@ config DMAR_TABLE bool +config DMAR_PERF + bool + config INTEL_IOMMU bool "Support for Intel IOMMU using DMA Remapping Devices" depends on PCI_MSI && ACPI && (X86 || IA64) diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index ae236ec7d219..fa0dae16441c 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -2,6 +2,7 @@ obj-$(CONFIG_DMAR_TABLE) += dmar.o obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o +obj-$(CONFIG_DMAR_PERF) += perf.o obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c new file mode 100644 index 000000000000..faaa96dda437 --- /dev/null +++ b/drivers/iommu/intel/perf.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 +/** + * perf.c - performance monitor + * + * Copyright (C) 2021 Intel Corporation + * + * Author: Lu Baolu + * Fenghua Yu + */ + +#include +#include + +#include "perf.h" + +static DEFINE_SPINLOCK(latency_lock); + +bool dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type) +{ + struct latency_statistic *lstat = iommu->perf_statistic; + + return lstat && lstat[type].enabled; +} + +int dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type) +{ + struct latency_statistic *lstat; + unsigned long flags; + int ret = -EBUSY; + + if (dmar_latency_enabled(iommu, type)) + return 0; + + spin_lock_irqsave(&latency_lock, flags); + if (!iommu->perf_statistic) { + iommu->perf_statistic = kzalloc(sizeof(*lstat) * DMAR_LATENCY_NUM, + GFP_ATOMIC); + if (!iommu->perf_statistic) { + ret = -ENOMEM; + goto unlock_out; + } + } + + lstat = iommu->perf_statistic; + + if (!lstat[type].enabled) { + lstat[type].enabled = true; + lstat[type].counter[COUNTS_MIN] = UINT_MAX; + ret = 0; + } +unlock_out: + spin_unlock_irqrestore(&latency_lock, flags); + + return ret; +} + +void dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type) +{ + struct latency_statistic *lstat = iommu->perf_statistic; + unsigned long flags; + + if (!dmar_latency_enabled(iommu, type)) + return; + + spin_lock_irqsave(&latency_lock, flags); + memset(&lstat[type], 0, sizeof(*lstat) * DMAR_LATENCY_NUM); + spin_unlock_irqrestore(&latency_lock, flags); +} + +void dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 latency) +{ + struct latency_statistic *lstat = iommu->perf_statistic; + unsigned long flags; + u64 min, max; + + if (!dmar_latency_enabled(iommu, type)) + return; + + spin_lock_irqsave(&latency_lock, flags); + if (latency < 100) + lstat[type].counter[COUNTS_10e2]++; + else if 
(latency < 1000) + lstat[type].counter[COUNTS_10e3]++; + else if (latency < 10000) + lstat[type].counter[COUNTS_10e4]++; + else if (latency < 100000) + lstat[type].counter[COUNTS_10e5]++; + else if (latency < 1000000) + lstat[type].counter[COUNTS_10e6]++; + else if (latency < 10000000) + lstat[type].counter[COUNTS_10e7]++; + else + lstat[type].counter[COUNTS_10e8_plus]++; + + min = lstat[type].counter[COUNTS_MIN]; + max = lstat[type].counter[COUNTS_MAX]; + lstat[type].counter[COUNTS_MIN] = min_t(u64, min, latency); + lstat[type].counter[COUNTS_MAX] = max_t(u64, max, latency); + lstat[type].counter[COUNTS_SUM] += latency; + lstat[type].samples++; + spin_unlock_irqrestore(&latency_lock, flags); +} + +static char *latency_counter_names[] = { + " <0.1us", + " 0.1us-1us", " 1us-10us", " 10us-100us", + " 100us-1ms", " 1ms-10ms", " >=10ms", + " min(us)", " max(us)", " average(us)" +}; + +static char *latency_type_names[] = { + " inv_iotlb", " inv_devtlb", " inv_iec", + " svm_prq" +}; + +int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) +{ + struct latency_statistic *lstat = iommu->perf_statistic; + unsigned long flags; + int bytes = 0, i, j; + + memset(str, 0, size); + + for (i = 0; i < COUNTS_NUM; i++) + bytes += snprintf(str + bytes, size - bytes, + "%s", latency_counter_names[i]); + + spin_lock_irqsave(&latency_lock, flags); + for (i = 0; i < DMAR_LATENCY_NUM; i++) { + if (!dmar_latency_enabled(iommu, i)) + continue; + + bytes += snprintf(str + bytes, size - bytes, + "\n%s", latency_type_names[i]); + + for (j = 0; j < COUNTS_NUM; j++) { + u64 val = lstat[i].counter[j]; + + switch (j) { + case COUNTS_MIN: + if (val == UINT_MAX) + val = 0; + else + val /= 1000; + break; + case COUNTS_MAX: + val /= 1000; + break; + case COUNTS_SUM: + if (lstat[i].samples) + val /= (lstat[i].samples * 1000); + else + val = 0; + break; + default: + break; + } + + bytes += snprintf(str + bytes, size - bytes, + "%12lld", val); + } + } + spin_unlock_irqrestore(&latency_lock, flags); + + return bytes; +} diff --git a/drivers/iommu/intel/perf.h b/drivers/iommu/intel/perf.h new file mode 100644 index 000000000000..fd6db8049d1a --- /dev/null +++ b/drivers/iommu/intel/perf.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * perf.h - performance monitor header + * + * Copyright (C) 2021 Intel Corporation + * + * Author: Lu Baolu + */ + +enum latency_type { + DMAR_LATENCY_INV_IOTLB = 0, + DMAR_LATENCY_INV_DEVTLB, + DMAR_LATENCY_INV_IEC, + DMAR_LATENCY_PRQ, + DMAR_LATENCY_NUM +}; + +enum latency_count { + COUNTS_10e2 = 0, /* < 0.1us */ + COUNTS_10e3, /* 0.1us ~ 1us */ + COUNTS_10e4, /* 1us ~ 10us */ + COUNTS_10e5, /* 10us ~ 100us */ + COUNTS_10e6, /* 100us ~ 1ms */ + COUNTS_10e7, /* 1ms ~ 10ms */ + COUNTS_10e8_plus, /* 10ms and plus*/ + COUNTS_MIN, + COUNTS_MAX, + COUNTS_SUM, + COUNTS_NUM +}; + +struct latency_statistic { + bool enabled; + u64 counter[COUNTS_NUM]; + u64 samples; +}; + +#ifdef CONFIG_DMAR_PERF +int dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type); +void dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type); +bool dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type); +void dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, + u64 latency); +int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size); +#else +static inline int +dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type) +{ + return -EINVAL; +} + +static inline void +dmar_latency_disable(struct 
intel_iommu *iommu, enum latency_type type) +{ +} + +static inline bool +dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type) +{ + return false; +} + +static inline void +dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 latency) +{ +} + +static inline int +dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) +{ + return 0; +} +#endif /* CONFIG_DMAR_PERF */ diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 4eeaa995d389..5b0dcabb7b36 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -621,6 +621,7 @@ struct intel_iommu { u32 flags; /* Software defined flags */ struct dmar_drhd_unit *drhd; + void *perf_statistic; }; /* Per subdevice private data */ -- Gitee From 4c6e3c9ad8f248b9f4c11f490e2320ef8e29ffec Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 10 Jun 2021 10:01:06 +0800 Subject: [PATCH 0463/2161] iommu/vt-d: Expose latency monitor data through debugfs commit 456bb0b97f00fe8defba155c0a4c48d951635395 upstream. A debugfs interface /sys/kernel/debug/iommu/intel/dmar_perf_latency is created to control and show counts of execution time ranges for various types per DMAR. The interface may help debug any potential performance issue. By default, the interface is disabled. Possible write value of /sys/kernel/debug/iommu/intel/dmar_perf_latency 0 - disable sampling all latency data 1 - enable sampling IOTLB invalidation latency data 2 - enable sampling devTLB invalidation latency data 3 - enable sampling intr entry cache invalidation latency data 4 - enable sampling prq handling latency data Read /sys/kernel/debug/iommu/intel/dmar_perf_latency gives a snapshot of sampling result of all enabled monitors. Signed-off-by: Fenghua Yu Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210520031531.712333-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210610020115.1637656-15-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/Kconfig | 1 + drivers/iommu/intel/debugfs.c | 111 ++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 0f40be033154..8c4a741db7a4 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -27,6 +27,7 @@ config INTEL_IOMMU config INTEL_IOMMU_DEBUGFS bool "Export Intel IOMMU internals in Debugfs" depends on INTEL_IOMMU && IOMMU_DEBUGFS + select DMAR_PERF help !!!WARNING!!! 
diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index efea7f02abd9..62e23ff3c987 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -16,6 +16,7 @@ #include #include "pasid.h" +#include "perf.h" struct tbl_walk { u16 bus; @@ -31,6 +32,9 @@ struct iommu_regset { const char *regs; }; +#define DEBUG_BUFFER_SIZE 1024 +static char debug_buf[DEBUG_BUFFER_SIZE]; + #define IOMMU_REGSET_ENTRY(_reg_) \ { DMAR_##_reg_##_REG, __stringify(_reg_) } @@ -538,6 +542,111 @@ static int ir_translation_struct_show(struct seq_file *m, void *unused) DEFINE_SHOW_ATTRIBUTE(ir_translation_struct); #endif +static void latency_show_one(struct seq_file *m, struct intel_iommu *iommu, + struct dmar_drhd_unit *drhd) +{ + int ret; + + seq_printf(m, "IOMMU: %s Register Base Address: %llx\n", + iommu->name, drhd->reg_base_addr); + + ret = dmar_latency_snapshot(iommu, debug_buf, DEBUG_BUFFER_SIZE); + if (ret < 0) + seq_puts(m, "Failed to get latency snapshot"); + else + seq_puts(m, debug_buf); + seq_puts(m, "\n"); +} + +static int latency_show(struct seq_file *m, void *v) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + latency_show_one(m, iommu, drhd); + rcu_read_unlock(); + + return 0; +} + +static int dmar_perf_latency_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, latency_show, NULL); +} + +static ssize_t dmar_perf_latency_write(struct file *filp, + const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + int counting; + char buf[64]; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (kstrtoint(buf, 0, &counting)) + return -EINVAL; + + switch (counting) { + case 0: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + dmar_latency_disable(iommu, DMAR_LATENCY_INV_IOTLB); + dmar_latency_disable(iommu, DMAR_LATENCY_INV_DEVTLB); + dmar_latency_disable(iommu, DMAR_LATENCY_INV_IEC); + dmar_latency_disable(iommu, DMAR_LATENCY_PRQ); + } + rcu_read_unlock(); + break; + case 1: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + dmar_latency_enable(iommu, DMAR_LATENCY_INV_IOTLB); + rcu_read_unlock(); + break; + case 2: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + dmar_latency_enable(iommu, DMAR_LATENCY_INV_DEVTLB); + rcu_read_unlock(); + break; + case 3: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + dmar_latency_enable(iommu, DMAR_LATENCY_INV_IEC); + rcu_read_unlock(); + break; + case 4: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + dmar_latency_enable(iommu, DMAR_LATENCY_PRQ); + rcu_read_unlock(); + break; + default: + return -EINVAL; + } + + *ppos += cnt; + return cnt; +} + +static const struct file_operations dmar_perf_latency_fops = { + .open = dmar_perf_latency_open, + .write = dmar_perf_latency_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + void __init intel_iommu_debugfs_init(void) { struct dentry *intel_iommu_debug = debugfs_create_dir("intel", @@ -556,4 +665,6 @@ void __init intel_iommu_debugfs_init(void) debugfs_create_file("ir_translation_struct", 0444, intel_iommu_debug, NULL, &ir_translation_struct_fops); #endif + debugfs_create_file("dmar_perf_latency", 0644, intel_iommu_debug, + NULL, &dmar_perf_latency_fops); } -- Gitee From 1c5066ecd46647729f0e1a042d1b341b52ac513f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 10 Jun 2021 10:01:07 +0800 Subject: 
[PATCH 0464/2161] iommu/vt-d: Add cache invalidation latency sampling commit 74eb87a0f9eb8c81264f2eb1b13d578c951b7533 upstream. Queued invalidation execution time is performance critical and needs to be monitored. This adds code to sample the execution time of IOTLB/ devTLB/ICE cache invalidation. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210520031531.712333-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210610020115.1637656-16-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index fa98e542ca34..78866c56496f 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -34,6 +34,7 @@ #include #include "../irq_remapping.h" +#include "perf.h" typedef int (*dmar_res_handler_t)(struct acpi_dmar_header *, void *); struct dmar_res_callback { @@ -1342,15 +1343,33 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, unsigned int count, unsigned long options) { struct q_inval *qi = iommu->qi; + s64 devtlb_start_ktime = 0; + s64 iotlb_start_ktime = 0; + s64 iec_start_ktime = 0; struct qi_desc wait_desc; int wait_index, index; unsigned long flags; int offset, shift; int rc, i; + u64 type; if (!qi) return 0; + type = desc->qw0 & GENMASK_ULL(3, 0); + + if ((type == QI_IOTLB_TYPE || type == QI_EIOTLB_TYPE) && + dmar_latency_enabled(iommu, DMAR_LATENCY_INV_IOTLB)) + iotlb_start_ktime = ktime_to_ns(ktime_get()); + + if ((type == QI_DIOTLB_TYPE || type == QI_DEIOTLB_TYPE) && + dmar_latency_enabled(iommu, DMAR_LATENCY_INV_DEVTLB)) + devtlb_start_ktime = ktime_to_ns(ktime_get()); + + if (type == QI_IEC_TYPE && + dmar_latency_enabled(iommu, DMAR_LATENCY_INV_IEC)) + iec_start_ktime = ktime_to_ns(ktime_get()); + restart: rc = 0; @@ -1425,6 +1444,18 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, if (rc == -EAGAIN) goto restart; + if (iotlb_start_ktime) + dmar_latency_update(iommu, DMAR_LATENCY_INV_IOTLB, + ktime_to_ns(ktime_get()) - iotlb_start_ktime); + + if (devtlb_start_ktime) + dmar_latency_update(iommu, DMAR_LATENCY_INV_DEVTLB, + ktime_to_ns(ktime_get()) - devtlb_start_ktime); + + if (iec_start_ktime) + dmar_latency_update(iommu, DMAR_LATENCY_INV_IEC, + ktime_to_ns(ktime_get()) - iec_start_ktime); + return rc; } -- Gitee From 9e6efe309e2ca84fd4a85c97b88e7d9f0fce09c3 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 10 Jun 2021 10:01:08 +0800 Subject: [PATCH 0465/2161] iommu/vt-d: Add PRQ handling latency sampling commit 0f4834ab255bf488f20544e120713decfa77843e upstream. The execution time for page fault request handling is performance critical and needs to be monitored. This adds code to sample the execution time of page fault request handling. 
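Condensed into one place, the sampling added here follows this pattern
(illustrative sketch only; in the diff the two halves live in
intel_svm_prq_report() and intel_svm_page_response(), with the start
timestamp carried in the otherwise unused private data of the fault event):

  /* report side: hardware does not use private data, so stash a stamp */
  if (dmar_latency_enabled(iommu, DMAR_LATENCY_PRQ))
          event.fault.prm.private_data[0] = ktime_to_ns(ktime_get());

  /* response side: the stamp comes back in prm->private_data[0] */
  if (prm->private_data[0])
          dmar_latency_update(iommu, DMAR_LATENCY_PRQ,
                              ktime_to_ns(ktime_get()) - prm->private_data[0]);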
Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210520031531.712333-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210610020115.1637656-17-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 639390facfdc..60edb5cb15c9 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -24,6 +24,7 @@ #include #include "pasid.h" +#include "perf.h" #include "../iommu-sva-lib.h" static irqreturn_t prq_event_thread(int irq, void *d); @@ -838,8 +839,8 @@ static int prq_to_iommu_prot(struct page_req_dsc *req) return prot; } -static int -intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc) +static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, + struct page_req_dsc *desc) { struct iommu_fault_event event; @@ -871,6 +872,12 @@ intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc) event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA; memcpy(event.fault.prm.private_data, desc->priv_data, sizeof(desc->priv_data)); + } else if (dmar_latency_enabled(iommu, DMAR_LATENCY_PRQ)) { + /* + * If the private data fields are not used by hardware, use it + * to monitor the prq handle latency. + */ + event.fault.prm.private_data[0] = ktime_to_ns(ktime_get()); } return iommu_report_device_fault(dev, &event); @@ -984,7 +991,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) * If prq is to be handled outside iommu driver via receiver of * the fault notifiers, we skip the page response here. */ - if (intel_svm_prq_report(sdev->dev, req)) + if (intel_svm_prq_report(iommu, sdev->dev, req)) handle_bad_prq_event(iommu, req, QI_RESP_INVALID); trace_prq_report(iommu, sdev->dev, req->qw_0, req->qw_1, @@ -1173,6 +1180,9 @@ int intel_svm_page_response(struct device *dev, if (private_present) memcpy(&desc.qw2, prm->private_data, sizeof(prm->private_data)); + else if (prm->private_data[0]) + dmar_latency_update(iommu, DMAR_LATENCY_PRQ, + ktime_to_ns(ktime_get()) - prm->private_data[0]); qi_submit_sync(iommu, &desc, 1, 0); } -- Gitee From 00dc67a135c214788227f8dc691b07e78dbbbff0 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 10 Jun 2021 10:01:09 +0800 Subject: [PATCH 0466/2161] iommu/vt-d: Fix out-bounds-warning in intel/svm.c commit 606636dcbdbb73b1a4ed61be77c76ea1087f042d upstream. Replace a couple of calls to memcpy() with simple assignments in order to fix the following out-of-bounds warning: drivers/iommu/intel/svm.c:1198:4: warning: 'memcpy' offset [25, 32] from the object at 'desc' is out of the bounds of referenced subobject 'qw2' with type 'long long unsigned int' at offset 16 [-Warray-bounds] The problem is that the original code is trying to copy data into a couple of struct members adjacent to each other in a single call to memcpy(). This causes a legitimate compiler warning because memcpy() overruns the length of &desc.qw2 and &resp.qw2, respectively. This helps with the ongoing efforts to globally enable -Warray-bounds and get us closer to being able to tighten the FORTIFY_SOURCE routines on memcpy(). Link: https://github.com/KSPP/linux/issues/109 Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210414201403.GA392764@embeddedor Link: https://lore.kernel.org/r/20210610020115.1637656-18-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 60edb5cb15c9..096285b7602e 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -870,8 +870,8 @@ static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, */ event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA; - memcpy(event.fault.prm.private_data, desc->priv_data, - sizeof(desc->priv_data)); + event.fault.prm.private_data[0] = desc->priv_data[0]; + event.fault.prm.private_data[1] = desc->priv_data[1]; } else if (dmar_latency_enabled(iommu, DMAR_LATENCY_PRQ)) { /* * If the private data fields are not used by hardware, use it @@ -910,11 +910,15 @@ static void handle_bad_prq_event(struct intel_iommu *iommu, QI_PGRP_RESP_TYPE; desc.qw1 = QI_PGRP_IDX(req->prg_index) | QI_PGRP_LPIG(req->lpig); - desc.qw2 = 0; - desc.qw3 = 0; - if (req->priv_data_present) - memcpy(&desc.qw2, req->priv_data, sizeof(req->priv_data)); + if (req->priv_data_present) { + desc.qw2 = req->priv_data[0]; + desc.qw3 = req->priv_data[1]; + } else { + desc.qw2 = 0; + desc.qw3 = 0; + } + qi_submit_sync(iommu, &desc, 1, 0); } @@ -1177,12 +1181,14 @@ int intel_svm_page_response(struct device *dev, desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); desc.qw2 = 0; desc.qw3 = 0; - if (private_present) - memcpy(&desc.qw2, prm->private_data, - sizeof(prm->private_data)); - else if (prm->private_data[0]) + + if (private_present) { + desc.qw2 = prm->private_data[0]; + desc.qw3 = prm->private_data[1]; + } else if (prm->private_data[0]) { dmar_latency_update(iommu, DMAR_LATENCY_PRQ, ktime_to_ns(ktime_get()) - prm->private_data[0]); + } qi_submit_sync(iommu, &desc, 1, 0); } -- Gitee From 4d6c288f484d0892eea6676163a6ecc30a980e2a Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 10 Jun 2021 10:01:10 +0800 Subject: [PATCH 0467/2161] iommu/vt-d: Use DEVICE_ATTR_RO macro commit 3bc770b0e998a69c7085ec36405eda246b5b10d8 upstream. Use DEVICE_ATTR_RO() helper instead of plain DEVICE_ATTR(), which makes the code a bit shorter and easier to read. 
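The conversion pattern, shown as a short sketch because the macro carries a
naming requirement the diff does not spell out: DEVICE_ATTR_RO(foo) declares
a read-only attribute whose show routine must be named foo_show().

  /* before */
  static ssize_t intel_iommu_show_version(struct device *dev,
                                          struct device_attribute *attr,
                                          char *buf);
  static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);

  /* after: callback renamed to version_show(), 0444 mode implied by _RO */
  static ssize_t version_show(struct device *dev,
                              struct device_attribute *attr, char *buf);
  static DEVICE_ATTR_RO(version);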
Signed-off-by: YueHaibing Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210528130229.22108-1-yuehaibing@huawei.com Link: https://lore.kernel.org/r/20210610020115.1637656-19-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 42 ++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 5610e4c81090..2dc198ee4bad 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4197,62 +4197,56 @@ static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) return container_of(iommu_dev, struct intel_iommu, iommu); } -static ssize_t intel_iommu_show_version(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t version_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); u32 ver = readl(iommu->reg + DMAR_VER_REG); return sprintf(buf, "%d:%d\n", DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); } -static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL); +static DEVICE_ATTR_RO(version); -static ssize_t intel_iommu_show_address(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t address_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sprintf(buf, "%llx\n", iommu->reg_phys); } -static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL); +static DEVICE_ATTR_RO(address); -static ssize_t intel_iommu_show_cap(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t cap_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sprintf(buf, "%llx\n", iommu->cap); } -static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL); +static DEVICE_ATTR_RO(cap); -static ssize_t intel_iommu_show_ecap(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t ecap_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sprintf(buf, "%llx\n", iommu->ecap); } -static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL); +static DEVICE_ATTR_RO(ecap); -static ssize_t intel_iommu_show_ndoms(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t domains_supported_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); } -static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL); +static DEVICE_ATTR_RO(domains_supported); -static ssize_t intel_iommu_show_ndoms_used(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t domains_used_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))); } -static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL); +static DEVICE_ATTR_RO(domains_used); static struct attribute *intel_iommu_attrs[] = { &dev_attr_version.attr, -- Gitee From 1e586cb4ddf85955202baafb391d70b414353b4e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 10 Jun 2021 10:01:11 +0800 Subject: [PATCH 
0468/2161] iommu/vt-d: Use bitfields for DMAR capabilities commit 1f106ff0ea2782a6bc49bb927e4789681a2ec507 upstream. IOTLB device presence, iommu coherency and snooping are boolean capabilities. Use them as bits and keep them adjacent. Structure layout before the reorg. $ pahole -C dmar_domain drivers/iommu/intel/dmar.o struct dmar_domain { int nid; /* 0 4 */ unsigned int iommu_refcnt[128]; /* 4 512 */ /* --- cacheline 8 boundary (512 bytes) was 4 bytes ago --- */ u16 iommu_did[128]; /* 516 256 */ /* --- cacheline 12 boundary (768 bytes) was 4 bytes ago --- */ bool has_iotlb_device; /* 772 1 */ /* XXX 3 bytes hole, try to pack */ struct list_head devices; /* 776 16 */ struct list_head subdevices; /* 792 16 */ struct iova_domain iovad __attribute__((__aligned__(8))); /* 808 2320 */ /* --- cacheline 48 boundary (3072 bytes) was 56 bytes ago --- */ struct dma_pte * pgd; /* 3128 8 */ /* --- cacheline 49 boundary (3136 bytes) --- */ int gaw; /* 3136 4 */ int agaw; /* 3140 4 */ int flags; /* 3144 4 */ int iommu_coherency; /* 3148 4 */ int iommu_snooping; /* 3152 4 */ int iommu_count; /* 3156 4 */ int iommu_superpage; /* 3160 4 */ /* XXX 4 bytes hole, try to pack */ u64 max_addr; /* 3168 8 */ u32 default_pasid; /* 3176 4 */ /* XXX 4 bytes hole, try to pack */ struct iommu_domain domain; /* 3184 72 */ /* size: 3256, cachelines: 51, members: 18 */ /* sum members: 3245, holes: 3, sum holes: 11 */ /* forced alignments: 1 */ /* last cacheline: 56 bytes */ } __attribute__((__aligned__(8))); After arranging it for natural padding and to make flags as u8 bits, it saves 8 bytes for the struct. struct dmar_domain { int nid; /* 0 4 */ unsigned int iommu_refcnt[128]; /* 4 512 */ /* --- cacheline 8 boundary (512 bytes) was 4 bytes ago --- */ u16 iommu_did[128]; /* 516 256 */ /* --- cacheline 12 boundary (768 bytes) was 4 bytes ago --- */ u8 has_iotlb_device:1; /* 772: 0 1 */ u8 iommu_coherency:1; /* 772: 1 1 */ u8 iommu_snooping:1; /* 772: 2 1 */ /* XXX 5 bits hole, try to pack */ /* XXX 3 bytes hole, try to pack */ struct list_head devices; /* 776 16 */ struct list_head subdevices; /* 792 16 */ struct iova_domain iovad __attribute__((__aligned__(8))); /* 808 2320 */ /* --- cacheline 48 boundary (3072 bytes) was 56 bytes ago --- */ struct dma_pte * pgd; /* 3128 8 */ /* --- cacheline 49 boundary (3136 bytes) --- */ int gaw; /* 3136 4 */ int agaw; /* 3140 4 */ int flags; /* 3144 4 */ int iommu_count; /* 3148 4 */ int iommu_superpage; /* 3152 4 */ /* XXX 4 bytes hole, try to pack */ u64 max_addr; /* 3160 8 */ u32 default_pasid; /* 3168 4 */ /* XXX 4 bytes hole, try to pack */ struct iommu_domain domain; /* 3176 72 */ /* size: 3248, cachelines: 51, members: 18 */ /* sum members: 3236, holes: 3, sum holes: 11 */ /* sum bitfield members: 3 bits, bit holes: 1, sum bit holes: 5 bits */ /* forced alignments: 1 */ /* last cacheline: 48 bytes */ } __attribute__((__aligned__(8))); Signed-off-by: Parav Pandit Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210530075053.264218-1-parav@nvidia.com Link: https://lore.kernel.org/r/20210610020115.1637656-20-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 18 +++++++++--------- include/linux/intel-iommu.h | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2dc198ee4bad..1756ac64e682 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -627,12 +627,12 @@ static void 
domain_update_iommu_coherency(struct dmar_domain *domain) bool found = false; int i; - domain->iommu_coherency = 1; + domain->iommu_coherency = true; for_each_domain_iommu(i, domain) { found = true; if (!iommu_paging_structure_coherency(g_iommus[i])) { - domain->iommu_coherency = 0; + domain->iommu_coherency = false; break; } } @@ -643,18 +643,18 @@ static void domain_update_iommu_coherency(struct dmar_domain *domain) rcu_read_lock(); for_each_active_iommu(iommu, drhd) { if (!iommu_paging_structure_coherency(iommu)) { - domain->iommu_coherency = 0; + domain->iommu_coherency = false; break; } } rcu_read_unlock(); } -static int domain_update_iommu_snooping(struct intel_iommu *skip) +static bool domain_update_iommu_snooping(struct intel_iommu *skip) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; - int ret = 1; + bool ret = true; rcu_read_lock(); for_each_active_iommu(iommu, drhd) { @@ -667,7 +667,7 @@ static int domain_update_iommu_snooping(struct intel_iommu *skip) */ if (!sm_supported(iommu) && !ecap_sc_support(iommu->ecap)) { - ret = 0; + ret = false; break; } } @@ -4553,8 +4553,8 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) adjust_width = guestwidth_to_adjustwidth(guest_width); domain->agaw = width_to_agaw(adjust_width); - domain->iommu_coherency = 0; - domain->iommu_snooping = 0; + domain->iommu_coherency = false; + domain->iommu_snooping = false; domain->iommu_superpage = 0; domain->max_addr = 0; @@ -5179,7 +5179,7 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, static bool intel_iommu_capable(enum iommu_cap cap) { if (cap == IOMMU_CAP_CACHE_COHERENCY) - return domain_update_iommu_snooping(NULL) == 1; + return domain_update_iommu_snooping(NULL); if (cap == IOMMU_CAP_INTR_REMAP) return irq_remapping_enabled == 1; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 5b0dcabb7b36..0e8079ff76de 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -546,7 +546,10 @@ struct dmar_domain { * domain ids are 16 bit wide according * to VT-d spec, section 9.3 */ - bool has_iotlb_device; + u8 has_iotlb_device: 1; + u8 iommu_coherency: 1; /* indicate coherency of iommu access */ + u8 iommu_snooping: 1; /* indicate snooping control feature */ + struct list_head devices; /* all devices' list */ struct list_head subdevices; /* all subdevices' list */ struct iova_domain iovad; /* iova's that belong to this domain */ @@ -558,9 +561,6 @@ struct dmar_domain { int agaw; int flags; /* flags to find out type of domain */ - - int iommu_coherency;/* indicate coherency of iommu access */ - int iommu_snooping; /* indicate snooping control feature*/ int iommu_count; /* reference count of iommu */ int iommu_superpage;/* Level of superpages supported: 0 == 4KiB (no superpages), 1 == 2MiB, -- Gitee From de87aa9dd0eebdf4c4ac31e1974d6c79e3a0c83e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 10 Jun 2021 10:01:12 +0800 Subject: [PATCH 0469/2161] iommu/vt-d: Removed unused iommu_count in dmar domain commit 74f6d776ae0b8498cfdb574ab24992bd50a2a2f1 upstream. DMAR domain uses per DMAR refcount. It is indexed by iommu seq_id. Older iommu_count is only incremented and decremented but no decisions are taken based on this refcount. This is not of much use. Hence, remove iommu_count and further simplify domain_detach_iommu() by returning void. 
Signed-off-by: Parav Pandit Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210530075053.264218-1-parav@nvidia.com Link: https://lore.kernel.org/r/20210610020115.1637656-21-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 11 +++-------- include/linux/intel-iommu.h | 1 - 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 1756ac64e682..c48f36b140d6 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1921,7 +1921,6 @@ static int domain_attach_iommu(struct dmar_domain *domain, assert_spin_locked(&iommu->lock); domain->iommu_refcnt[iommu->seq_id] += 1; - domain->iommu_count += 1; if (domain->iommu_refcnt[iommu->seq_id] == 1) { ndomains = cap_ndoms(iommu->cap); num = find_first_zero_bit(iommu->domain_ids, ndomains); @@ -1929,7 +1928,6 @@ static int domain_attach_iommu(struct dmar_domain *domain, if (num >= ndomains) { pr_err("%s: No free domain ids\n", iommu->name); domain->iommu_refcnt[iommu->seq_id] -= 1; - domain->iommu_count -= 1; return -ENOSPC; } @@ -1945,16 +1943,15 @@ static int domain_attach_iommu(struct dmar_domain *domain, return 0; } -static int domain_detach_iommu(struct dmar_domain *domain, - struct intel_iommu *iommu) +static void domain_detach_iommu(struct dmar_domain *domain, + struct intel_iommu *iommu) { - int num, count; + int num; assert_spin_locked(&device_domain_lock); assert_spin_locked(&iommu->lock); domain->iommu_refcnt[iommu->seq_id] -= 1; - count = --domain->iommu_count; if (domain->iommu_refcnt[iommu->seq_id] == 0) { num = domain->iommu_did[iommu->seq_id]; clear_bit(num, iommu->domain_ids); @@ -1963,8 +1960,6 @@ static int domain_detach_iommu(struct dmar_domain *domain, domain_update_iommu_cap(domain); domain->iommu_did[iommu->seq_id] = 0; } - - return count; } static inline int guestwidth_to_adjustwidth(int gaw) diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 0e8079ff76de..e5886fd968ab 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -561,7 +561,6 @@ struct dmar_domain { int agaw; int flags; /* flags to find out type of domain */ - int iommu_count; /* reference count of iommu */ int iommu_superpage;/* Level of superpages supported: 0 == 4KiB (no superpages), 1 == 2MiB, 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ -- Gitee From 5ccfdc8ba95c1ca3acc42d1c2f49d5951ddbad1d Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 10 Jun 2021 10:01:13 +0800 Subject: [PATCH 0470/2161] iommu/vt-d: Remove unnecessary braces commit cee57d4fe74e82e784f6566bad3e3bb1ca51a211 upstream. No need for braces for single line statement under if() block. 
Signed-off-by: Parav Pandit Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210530075053.264218-1-parav@nvidia.com Link: https://lore.kernel.org/r/20210610020115.1637656-22-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index c48f36b140d6..b311c08e12ca 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -684,9 +684,8 @@ static int domain_update_iommu_superpage(struct dmar_domain *domain, struct intel_iommu *iommu; int mask = 0x3; - if (!intel_iommu_superpage) { + if (!intel_iommu_superpage) return 0; - } /* set iommu_superpage to the smallest common denominator */ rcu_read_lock(); -- Gitee From 48939fdd2c1f50f66f26d8dd059c7db21eecc138 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 10 Jun 2021 10:01:15 +0800 Subject: [PATCH 0471/2161] iommu/vt-d: No need to typecast commit 7a0f06c197cb5e2106da8507b5610c1bb9c7daac upstream. Page directory assignment by alloc_pgtable_page() or phys_to_virt() doesn't need typecasting as both routines return void*. Hence, remove typecasting from both the calls. Signed-off-by: Parav Pandit Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210530075053.264218-1-parav@nvidia.com Link: https://lore.kernel.org/r/20210610020115.1637656-24-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index b311c08e12ca..09bce631ee3a 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4553,7 +4553,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) domain->max_addr = 0; /* always allocate the top pgd */ - domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid); + domain->pgd = alloc_pgtable_page(domain->nid); if (!domain->pgd) return -ENOMEM; domain_flush_cache(domain, domain->pgd, PAGE_SIZE); @@ -4822,8 +4822,7 @@ static int prepare_domain_attach_device(struct iommu_domain *domain, pte = dmar_domain->pgd; if (dma_pte_present(pte)) { - dmar_domain->pgd = (struct dma_pte *) - phys_to_virt(dma_pte_addr(pte)); + dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); free_pgtable_page(pte); } dmar_domain->agaw--; -- Gitee From 76f5eefb4593b3d646235177210fe349bb43a9b0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 10 Jun 2021 10:31:20 +0200 Subject: [PATCH 0472/2161] iommu/vt-d: Fix linker error on 32-bit commit d6a9642bd673dd0bb2839274fe83eaa979b9207e upstream. A recent commit broke the build on 32-bit x86. The linker throws these messages: ld: drivers/iommu/intel/perf.o: in function `dmar_latency_snapshot': perf.c:(.text+0x40c): undefined reference to `__udivdi3' ld: perf.c:(.text+0x458): undefined reference to `__udivdi3' The reason are the 64-bit divides in dmar_latency_snapshot(). Use the div_u64() helper function for those. 
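The underlying rule, with a stand-alone sketch (hypothetical values, not
taken from the driver): a plain '/' on u64 operands is compiled into a
libgcc helper such as __udivdi3 on 32-bit x86, which the kernel does not
provide, so 64-bit division must go through the helpers in <linux/math64.h>:

  #include <linux/math64.h>

  u64 total_ns = 1234567890ULL;
  u64 avg_us;

  /* avg_us = total_ns / 1000;  links on 64-bit, fails to link on 32-bit */
  avg_us = div_u64(total_ns, 1000);       /* divisor must fit in a u32 */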
Fixes: 55ee5e67a59a ("iommu/vt-d: Add common code for dmar latency performance monitors") Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20210610083120.29224-1-joro@8bytes.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/perf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c index faaa96dda437..73b7ec705552 100644 --- a/drivers/iommu/intel/perf.c +++ b/drivers/iommu/intel/perf.c @@ -141,14 +141,14 @@ int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) if (val == UINT_MAX) val = 0; else - val /= 1000; + val = div_u64(val, 1000); break; case COUNTS_MAX: - val /= 1000; + val = div_u64(val, 1000); break; case COUNTS_SUM: if (lstat[i].samples) - val /= (lstat[i].samples * 1000); + val = div_u64(val, (lstat[i].samples * 1000)); else val = 0; break; -- Gitee From 1827cc4f42fde0e6fa6126a27f1c6af625007887 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 11 Jun 2021 14:50:24 +0100 Subject: [PATCH 0473/2161] iommu/vt-d: Fix dereference of pointer info before it is null checked commit 934ed4580c0a13a49ab7c4cbf94cd1958c0679ed upstream. The assignment of iommu from info->iommu occurs before info is null checked hence leading to a potential null pointer dereference issue. Fix this by assigning iommu and checking if iommu is null after null checking info. Addresses-Coverity: ("Dereference before null check") Fixes: 4c82b88696ac ("iommu/vt-d: Allocate/register iopf queue for sva devices") Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20210611135024.32781-1-colin.king@canonical.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 09bce631ee3a..01124a1bbc2e 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5377,10 +5377,14 @@ static int intel_iommu_disable_auxd(struct device *dev) static int intel_iommu_enable_sva(struct device *dev) { struct device_domain_info *info = get_domain_info(dev); - struct intel_iommu *iommu = info->iommu; + struct intel_iommu *iommu; int ret; - if (!info || !iommu || dmar_disabled) + if (!info || dmar_disabled) + return -EINVAL; + + iommu = info->iommu; + if (!iommu) return -EINVAL; if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) -- Gitee From df607084a0aa2dbaa39844f6bf87dfedd839cd06 Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Date: Mon, 12 Jul 2021 15:13:15 +0800 Subject: [PATCH 0474/2161] iommu/vt-d: Global devTLB flush when present context entry changed commit 37764b952e1b39053defc7ebe5dcd8c4e3e78de9 upstream. This fixes a bug in context cache clear operation. The code was not following the correct invalidation flow. A global device TLB invalidation should be added after the IOTLB invalidation. At the same time, it uses the domain ID from the context entry. But in scalable mode, the domain ID is in PASID table entry, not context entry. 
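Paraphrased as a sketch (the complete version is in the hunk below), the
corrected flow for clearing one present context entry becomes:

  /* pick the domain ID from the right place before clearing the entry */
  if (sm_supported(iommu))
          did_old = (hw_pass_through && domain_type_is_si(info->domain)) ?
                    FLPT_DEFAULT_DID : info->domain->iommu_did[iommu->seq_id];
  else
          did_old = context_domain_id(context);

  context_clear_entry(context);
  /* ... context-cache flush and IOTLB DSI flush, both using did_old ... */

  /* new: finish with a global device-TLB invalidation */
  __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);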
Fixes: 7373a8cc38197 ("iommu/vt-d: Setup context and enable RID2PASID support") Cc: stable@vger.kernel.org # v5.0+ Signed-off-by: Sanjay Kumar Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210712071315.3416543-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 01124a1bbc2e..237ff7f9a6a3 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2430,10 +2430,11 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, return 0; } -static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn) +static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) { - unsigned long flags; + struct intel_iommu *iommu = info->iommu; struct context_entry *context; + unsigned long flags; u16 did_old; if (!iommu) @@ -2445,7 +2446,16 @@ static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn spin_unlock_irqrestore(&iommu->lock, flags); return; } - did_old = context_domain_id(context); + + if (sm_supported(iommu)) { + if (hw_pass_through && domain_type_is_si(info->domain)) + did_old = FLPT_DEFAULT_DID; + else + did_old = info->domain->iommu_did[iommu->seq_id]; + } else { + did_old = context_domain_id(context); + } + context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock_irqrestore(&iommu->lock, flags); @@ -2463,6 +2473,8 @@ static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn 0, 0, DMA_TLB_DSI_FLUSH); + + __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); } static inline void unlink_domain_info(struct device_domain_info *info) @@ -4472,9 +4484,9 @@ int __init intel_iommu_init(void) static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) { - struct intel_iommu *iommu = opaque; + struct device_domain_info *info = opaque; - domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff); + domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); return 0; } @@ -4484,12 +4496,13 @@ static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *op * devices, unbinding the driver from any one of them will possibly leave * the others unable to operate. */ -static void domain_context_clear(struct intel_iommu *iommu, struct device *dev) +static void domain_context_clear(struct device_domain_info *info) { - if (!iommu || !dev || !dev_is_pci(dev)) + if (!info->iommu || !info->dev || !dev_is_pci(info->dev)) return; - pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu); + pci_for_each_dma_alias(to_pci_dev(info->dev), + &domain_context_clear_one_cb, info); } static void __dmar_remove_one_dev_info(struct device_domain_info *info) @@ -4513,7 +4526,7 @@ static void __dmar_remove_one_dev_info(struct device_domain_info *info) iommu_disable_dev_iotlb(info); if (!dev_is_real_dma_subdevice(info->dev)) - domain_context_clear(iommu, info->dev); + domain_context_clear(info); intel_pasid_free_table(info->dev); } -- Gitee From f22599283ced28cf08de536709e7a4aa1116d1be Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Jul 2021 15:17:12 +0800 Subject: [PATCH 0475/2161] iommu/vt-d: Fix clearing real DMA device's scalable-mode context entries commit 474dd1c6506411752a9b2f2233eec11f1733a099 upstream. 
The commit 2b0140c69637e ("iommu/vt-d: Use pci_real_dma_dev() for mapping") fixes an issue of "sub-device is removed where the context entry is cleared for all aliases". But this commit didn't consider the PASID entry and PASID table in VT-d scalable mode. This fix increases the coverage of scalable mode. Suggested-by: Sanjay Kumar Fixes: 8038bdb855331 ("iommu/vt-d: Only clear real DMA device's context entries") Fixes: 2b0140c69637e ("iommu/vt-d: Use pci_real_dma_dev() for mapping") Cc: stable@vger.kernel.org # v5.6+ Cc: Jon Derrick Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210712071712.3416949-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 237ff7f9a6a3..30aacd9df59c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4519,14 +4519,13 @@ static void __dmar_remove_one_dev_info(struct device_domain_info *info) iommu = info->iommu; domain = info->domain; - if (info->dev) { + if (info->dev && !dev_is_real_dma_subdevice(info->dev)) { if (dev_is_pci(info->dev) && sm_supported(iommu)) intel_pasid_tear_down_entry(iommu, info->dev, PASID_RID2PASID, false); iommu_disable_dev_iotlb(info); - if (!dev_is_real_dma_subdevice(info->dev)) - domain_context_clear(info); + domain_context_clear(info); intel_pasid_free_table(info->dev); } -- Gitee From ee1315fd74e3d57ae53b165894d7e258640f3232 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Jun 2021 06:38:46 -0700 Subject: [PATCH 0476/2161] iommu: Use bitmap to calculate page size in iommu_pgsize() commit e7d6fff6b3d34230bac1b2c2607646d7b8638cb7 upstream. Avoid the potential for shifting values by amounts greater than the width of their type by using a bitmap to compute page size in iommu_pgsize(). Signed-off-by: Will Deacon Signed-off-by: Isaac J. Manjarres Signed-off-by: Georgi Djakov Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1623850736-389584-6-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 24d76dd48dc3..92cb80f5bfd6 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -2159,30 +2160,22 @@ static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long addr_merge, size_t size) { unsigned int pgsize_idx; + unsigned long pgsizes; size_t pgsize; - /* Max page size that still fits into 'size' */ - pgsize_idx = __fls(size); + /* Page sizes supported by the hardware and small enough for @size */ + pgsizes = domain->pgsize_bitmap & GENMASK(__fls(size), 0); - /* need to consider alignment requirements ? 
*/ - if (likely(addr_merge)) { - /* Max page size allowed by address */ - unsigned int align_pgsize_idx = __ffs(addr_merge); - pgsize_idx = min(pgsize_idx, align_pgsize_idx); - } - - /* build a mask of acceptable page sizes */ - pgsize = (1UL << (pgsize_idx + 1)) - 1; - - /* throw away page sizes not supported by the hardware */ - pgsize &= domain->pgsize_bitmap; + /* Constrain the page sizes further based on the maximum alignment */ + if (likely(addr_merge)) + pgsizes &= GENMASK(__ffs(addr_merge), 0); - /* make sure we're still sane */ - BUG_ON(!pgsize); + /* Make sure we have at least one suitable page size */ + BUG_ON(!pgsizes); - /* pick the biggest page */ - pgsize_idx = __fls(pgsize); - pgsize = 1UL << pgsize_idx; + /* Pick the biggest page size remaining */ + pgsize_idx = __fls(pgsizes); + pgsize = BIT(pgsize_idx); return pgsize; } -- Gitee From 28230b7f88a4755f9e079a6f40e954cff5e9ff79 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 20 Jul 2021 10:06:13 +0800 Subject: [PATCH 0477/2161] iommu/vt-d: Report real pgsize bitmap to iommu core commit a886d5a7e67bc403b8e51ab50c10324bcdbb686f upstream. The pgsize bitmap is used to advertise the page sizes our hardware supports to the IOMMU core, which will then use this information to split physically contiguous memory regions it is mapping into page sizes that we support. Traditionally the IOMMU core just handed us the mappings directly, after making sure the size is an order of a 4KiB page and that the mapping has natural alignment. To retain this behavior, we currently advertise that we support all page sizes that are an order of 4KiB. We are about to utilize the new IOMMU map/unmap_pages APIs. We could change this to advertise the real page sizes we support. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210720020615.4144323-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 38 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 30aacd9df59c..51e03c941fa7 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -86,24 +86,6 @@ #define LEVEL_STRIDE (9) #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) -/* - * This bitmap is used to advertise the page sizes our hardware support - * to the IOMMU core, which will then use this information to split - * physically contiguous memory regions it is mapping into page sizes - * that we support. - * - * Traditionally the IOMMU core just handed us the mappings directly, - * after making sure the size is an order of a 4KiB page and that the - * mapping has natural alignment. - * - * To retain this behavior, we currently advertise that we support - * all page sizes that are an order of 4KiB. - * - * If at some point we'd like to utilize the IOMMU core's new behavior, - * we could change this to advertise the real page sizes we support. - */ -#define INTEL_IOMMU_PGSIZES (~0xFFFUL) - static inline int agaw_to_level(int agaw) { return agaw + 2; @@ -737,6 +719,23 @@ static int domain_update_device_node(struct dmar_domain *domain) static void domain_update_iotlb(struct dmar_domain *domain); +/* Return the super pagesize bitmap if supported. */ +static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) +{ + unsigned long bitmap = 0; + + /* + * 1-level super page supports page size of 2MiB, 2-level super page + * supports page size of both 2MiB and 1GiB. 
+ */ + if (domain->iommu_superpage == 1) + bitmap |= SZ_2M; + else if (domain->iommu_superpage == 2) + bitmap |= SZ_2M | SZ_1G; + + return bitmap; +} + /* Some capabilities may be different across iommus */ static void domain_update_iommu_cap(struct dmar_domain *domain) { @@ -763,6 +762,7 @@ static void domain_update_iommu_cap(struct dmar_domain *domain) else domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); + domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); domain_update_iotlb(domain); } @@ -5663,7 +5663,7 @@ const struct iommu_ops intel_iommu_ops = { .dev_disable_feat = intel_iommu_dev_disable_feat, .is_attach_deferred = intel_iommu_is_attach_deferred, .def_domain_type = device_def_domain_type, - .pgsize_bitmap = INTEL_IOMMU_PGSIZES, + .pgsize_bitmap = SZ_4K, #ifdef CONFIG_INTEL_IOMMU_SVM .cache_invalidate = intel_iommu_sva_invalidate, .sva_bind_gpasid = intel_svm_bind_gpasid, -- Gitee From 4a671183ccaab15692aaba17484b82a2824920b5 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 20 Jul 2021 10:06:15 +0800 Subject: [PATCH 0478/2161] iommu/vt-d: Move clflush'es from iotlb_sync_map() to map_pages() commit 75cc1018a9e1e57d4ae43a101fc08a070894d439 upstream. As the Intel VT-d driver has switched to use the iommu_ops.map_pages() callback, multiple pages of the same size will be mapped in a call. There's no need to put the clflush'es in iotlb_sync_map() callback. Move them back into __domain_mapping() to simplify the code. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210720020615.4144323-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 48 ++++++------------------------------- 1 file changed, 7 insertions(+), 41 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 51e03c941fa7..b8756d437df2 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2335,9 +2335,9 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, unsigned long phys_pfn, unsigned long nr_pages, int prot) { + struct dma_pte *first_pte = NULL, *pte = NULL; unsigned int largepage_lvl = 0; unsigned long lvl_pages = 0; - struct dma_pte *pte = NULL; phys_addr_t pteval; u64 attr; @@ -2370,6 +2370,8 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); if (!pte) return -ENOMEM; + first_pte = pte; + /* It is large page*/ if (largepage_lvl > 1) { unsigned long end_pfn; @@ -2417,14 +2419,14 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, * recalculate 'pte' and switch back to smaller pages for the * end of the mapping, if the trailing size is not enough to * use another superpage (i.e. nr_pages < lvl_pages). - * - * We leave clflush for the leaf pte changes to iotlb_sync_map() - * callback. 
*/ pte++; if (!nr_pages || first_pte_in_page(pte) || - (largepage_lvl > 1 && nr_pages < lvl_pages)) + (largepage_lvl > 1 && nr_pages < lvl_pages)) { + domain_flush_cache(domain, first_pte, + (void *)pte - (void *)first_pte); pte = NULL; + } } return 0; @@ -5583,39 +5585,6 @@ intel_iommu_enable_nesting(struct iommu_domain *domain) return ret; } -static void clflush_sync_map(struct dmar_domain *domain, unsigned long clf_pfn, - unsigned long clf_pages) -{ - struct dma_pte *first_pte = NULL, *pte = NULL; - unsigned long lvl_pages = 0; - int level = 0; - - while (clf_pages > 0) { - if (!pte) { - level = 0; - pte = pfn_to_dma_pte(domain, clf_pfn, &level); - if (WARN_ON(!pte)) - return; - first_pte = pte; - lvl_pages = lvl_to_nr_pages(level); - } - - if (WARN_ON(!lvl_pages || clf_pages < lvl_pages)) - return; - - clf_pages -= lvl_pages; - clf_pfn += lvl_pages; - pte++; - - if (!clf_pages || first_pte_in_page(pte) || - (level > 1 && clf_pages < lvl_pages)) { - domain_flush_cache(domain, first_pte, - (void *)pte - (void *)first_pte); - pte = NULL; - } - } -} - static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) { @@ -5625,9 +5594,6 @@ static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, struct intel_iommu *iommu; int iommu_id; - if (!dmar_domain->iommu_coherency) - clflush_sync_map(dmar_domain, pfn, pages); - for_each_domain_iommu(iommu_id, dmar_domain) { iommu = g_iommus[iommu_id]; __mapping_notify_one(iommu, dmar_domain, pfn, pages); -- Gitee From 8dab3852043bf01e6cfe3a07a0f27bb92b1f4a3e Mon Sep 17 00:00:00 2001 From: Ashish Mhetre Date: Tue, 10 Aug 2021 10:14:00 +0530 Subject: [PATCH 0479/2161] iommu: Fix race condition during default domain allocation commit 211ff31b3d33b56aa12937e898c9280d07daf0d9 upstream. When two devices with same SID are getting probed concurrently through iommu_probe_device(), the iommu_domain sometimes is getting allocated more than once as call to iommu_alloc_default_domain() is not protected for concurrency. Furthermore, it leads to each device holding a different iommu_domain pointer, separate IOVA space and only one of the devices' domain is used for translations from IOMMU. This causes accesses from other device to fault or see incorrect translations. Fix this by protecting iommu_alloc_default_domain() call with group->mutex and let all devices with same SID share same iommu_domain. Signed-off-by: Ashish Mhetre Link: https://lore.kernel.org/r/1628570641-9127-2-git-send-email-amhetre@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 92cb80f5bfd6..db2d7151c517 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -263,7 +263,9 @@ int iommu_probe_device(struct device *dev) * support default domains, so the return value is not yet * checked. */ + mutex_lock(&group->mutex); iommu_alloc_default_domain(group, dev); + mutex_unlock(&group->mutex); if (group->default_domain) { ret = __iommu_attach_device(group->default_domain, dev); -- Gitee From 894bc6debde9ceebfc3b90efda3324513e09bf15 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 Aug 2021 20:43:20 +0800 Subject: [PATCH 0480/2161] iommu/vt-d: Fix PASID reference leak commit 62ef907a045e1a81830941c48004d7af71c9d75a upstream. A PASID reference is increased whenever a device is bound to an mm (and its PASID) successfully (i.e. the device's sdev user count is increased). 
But the reference is not dropped every time the device is unbound successfully from the mm (i.e. the device's sdev user count is decreased). The reference is dropped only once by calling intel_svm_free_pasid() when there isn't any device bound to the mm. intel_svm_free_pasid() drops the reference and only frees the PASID on zero reference. Fix the issue by dropping the PASID reference and freeing the PASID when no reference on successful unbinding the device by calling intel_svm_free_pasid() . Fixes: 4048377414162 ("iommu/vt-d: Use iommu_sva_alloc(free)_pasid() helpers") Signed-off-by: Fenghua Yu Link: https://lore.kernel.org/r/20210813181345.1870742-1-fenghua.yu@intel.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210817124321.1517985-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 096285b7602e..0d1c18af2919 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -675,7 +675,6 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { - intel_svm_free_pasid(mm); if (svm->notifier.ops) { mmu_notifier_unregister(&svm->notifier, mm); /* Clear mm's pasid. */ @@ -690,6 +689,8 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) kfree(svm); } } + /* Drop a PASID reference and free it if no reference. */ + intel_svm_free_pasid(mm); } out: return ret; -- Gitee From 0a6d3e08ca460579c24eebbdc920bc61ea468e9e Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Tue, 17 Aug 2021 20:43:21 +0800 Subject: [PATCH 0481/2161] iommu/vt-d: Fix incomplete cache flush in intel_pasid_tear_down_entry() commit 8798d36411196da86e70b994725349c16c1119f6 upstream. This fixes improper iotlb invalidation in intel_pasid_tear_down_entry(). When a PASID was used as nested mode, released and reused, the following error message will appear: [ 180.187556] Unexpected page request in Privilege Mode [ 180.187565] Unexpected page request in Privilege Mode [ 180.279933] Unexpected page request in Privilege Mode [ 180.279937] Unexpected page request in Privilege Mode Per chapter 6.5.3.3 of VT-d spec 3.3, when tear down a pasid entry, the software should use Domain selective IOTLB flush if the PGTT of the pasid entry is SL only or Nested, while for the pasid entries whose PGTT is FL only or PT using PASID-based IOTLB flush is enough. 
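A minimal sketch of that rule as the patch below implements it may help. PGTT (Page Table Type) sits in bits 8:6 of the first quadword of the scalable-mode PASID table entry, and the constant names follow drivers/iommu/intel/pasid.h; this is a sketch only, not the exact upstream function:

    static void teardown_iotlb_flush_sketch(struct intel_iommu *iommu,
                                            struct pasid_entry *pte,
                                            u16 did, u32 pasid)
    {
            /* Read PGTT before the entry is cleared. */
            u16 pgtt = (u16)((READ_ONCE(pte->val[0]) >> 6) & 0x7);

            if (pgtt == PASID_ENTRY_PGTT_PT || pgtt == PASID_ENTRY_PGTT_FL_ONLY)
                    /* First-level only or pass-through: a PASID-selective
                     * IOTLB flush is sufficient. */
                    qi_flush_piotlb(iommu, did, pasid, 0, -1, 0);
            else
                    /* Second-level only or nested: a domain-selective
                     * IOTLB flush is required. */
                    iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
    }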
Fixes: 2cd1311a26673 ("iommu/vt-d: Add set domain DOMAIN_ATTR_NESTING attr") Signed-off-by: Kumar Sanjay K Signed-off-by: Liu Yi L Tested-by: Yi Sun Link: https://lore.kernel.org/r/20210817042425.1784279-1-yi.l.liu@intel.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210817124321.1517985-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 10 ++++++++-- drivers/iommu/intel/pasid.h | 6 ++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index c6cf44a6c923..9ec374e17469 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -511,7 +511,7 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool fault_ignore) { struct pasid_entry *pte; - u16 did; + u16 did, pgtt; pte = intel_pasid_get_entry(dev, pasid); if (WARN_ON(!pte)) @@ -521,13 +521,19 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, return; did = pasid_get_domain_id(pte); + pgtt = pasid_pte_get_pgtt(pte); + intel_pasid_clear_entry(dev, pasid, fault_ignore); if (!ecap_coherent(iommu->ecap)) clflush_cache_range(pte, sizeof(*pte)); pasid_cache_invalidation_with_pasid(iommu, did, pasid); - qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); + + if (pgtt == PASID_ENTRY_PGTT_PT || pgtt == PASID_ENTRY_PGTT_FL_ONLY) + qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); + else + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); /* Device IOTLB doesn't need to be flushed in caching mode. */ if (!cap_caching_mode(iommu->cap)) diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 5ff61c3d401f..c11bc8b833b8 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -99,6 +99,12 @@ static inline bool pasid_pte_is_present(struct pasid_entry *pte) return READ_ONCE(pte->val[0]) & PASID_PTE_PRESENT; } +/* Get PGTT field of a PASID table entry */ +static inline u16 pasid_pte_get_pgtt(struct pasid_entry *pte) +{ + return (u16)((READ_ONCE(pte->val[0]) >> 6) & 0x7); +} + extern unsigned int intel_pasid_max_id; int intel_pasid_alloc_table(struct device *dev); void intel_pasid_free_table(struct device *dev); -- Gitee From c5cb537d8af2192d4bcf751a2460e78cf37e7aa9 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:15 +0100 Subject: [PATCH 0482/2161] iommu: Pull IOVA cookie management into the core commit 46983fcd67ac5a830d41ebe3755314db67a6dd16 upstream. Now that everyone has converged on iommu-dma for IOMMU_DOMAIN_DMA support, we can abandon the notion of drivers being responsible for the cookie type, and consolidate all the management into the core code. 
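The practical effect on drivers shows up in the follow-up patches in this series: a domain_alloc() implementation no longer touches iommu_get_dma_cookie()/iommu_put_dma_cookie() at all. Roughly, a driver's allocation path can now look like the sketch below ('example_domain' is a made-up structure, not a real driver):

    #include <linux/iommu.h>
    #include <linux/mutex.h>
    #include <linux/slab.h>

    struct example_domain {
            struct mutex init_mutex;
            struct iommu_domain domain;
    };

    static struct iommu_domain *example_domain_alloc(unsigned type)
    {
            struct example_domain *dom;

            if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA &&
                type != IOMMU_DOMAIN_IDENTITY)
                    return NULL;

            dom = kzalloc(sizeof(*dom), GFP_KERNEL);
            if (!dom)
                    return NULL;

            /* No cookie handling here: for IOMMU_DOMAIN_DMA the core now
             * attaches the cookie in __iommu_domain_alloc() and releases
             * it in iommu_domain_free(). */
            mutex_init(&dom->init_mutex);
            return &dom->domain;
    }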
CC: Yong Wu CC: Chunyan Zhang CC: Maxime Ripard Tested-by: Heiko Stuebner Tested-by: Marek Szyprowski Tested-by: Yoshihiro Shimoda Reviewed-by: Jean-Philippe Brucker Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/46a2c0e7419c7d1d931762dc7b6a69fa082d199a.1628682048.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 7 +++++++ include/linux/iommu.h | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index db2d7151c517..8a915752fb1f 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -7,6 +7,7 @@ #define pr_fmt(fmt) "iommu: " fmt #include +#include #include #include #include @@ -1915,6 +1916,11 @@ static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, /* Assume all sizes by default; the driver may override this later */ domain->pgsize_bitmap = bus->iommu_ops->pgsize_bitmap; + /* Temporarily avoid -EEXIST while drivers still get their own cookies */ + if (type == IOMMU_DOMAIN_DMA && !domain->iova_cookie && iommu_get_dma_cookie(domain)) { + iommu_domain_free(domain); + domain = NULL; + } return domain; } @@ -1926,6 +1932,7 @@ EXPORT_SYMBOL_GPL(iommu_domain_alloc); void iommu_domain_free(struct iommu_domain *domain) { + iommu_put_dma_cookie(domain); domain->ops->domain_free(domain); } EXPORT_SYMBOL_GPL(iommu_domain_free); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 609c13de6b10..499e22136ecf 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -46,6 +46,7 @@ struct iommu_domain; struct notifier_block; struct iommu_sva; struct iommu_fault_event; +struct iommu_dma_cookie; /* iommu fault flags */ #define IOMMU_FAULT_READ 0x0 @@ -92,7 +93,7 @@ struct iommu_domain { iommu_fault_handler_t handler; void *handler_token; struct iommu_domain_geometry geometry; - void *iova_cookie; + struct iommu_dma_cookie *iova_cookie; }; enum iommu_cap { -- Gitee From 15de0cc45483479827697335ba11f9eb4888bf8a Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:17 +0100 Subject: [PATCH 0483/2161] iommu/arm-smmu: Drop IOVA cookie management commit 229496a0eb088b34849fcc3ab25f0f9e3f90794a upstream. The core code bakes its own cookies now. 
Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/7ae3680dad9735cc69c3618866666896bd11e031.1628682048.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/arm-smmu-v3.c | 7 ------- drivers/iommu/arm-smmu.c | 15 ++++----------- drivers/iommu/qcom_iommu.c | 9 --------- 3 files changed, 4 insertions(+), 27 deletions(-) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 5e39ec0be750..636b8ea54d0c 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -2147,12 +2147,6 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) if (!smmu_domain) return NULL; - if (type == IOMMU_DOMAIN_DMA && - iommu_get_dma_cookie(&smmu_domain->domain)) { - kfree(smmu_domain); - return NULL; - } - mutex_init(&smmu_domain->init_mutex); INIT_LIST_HEAD(&smmu_domain->devices); spin_lock_init(&smmu_domain->devices_lock); @@ -2183,7 +2177,6 @@ static void arm_smmu_domain_free(struct iommu_domain *domain) struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; - iommu_put_dma_cookie(domain); free_io_pgtable_ops(smmu_domain->pgtbl_ops); /* Free the CD and ASID, if we allocated them */ diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index c95efa5d2c41..dcd9a6735ece 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -861,10 +861,10 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) { struct arm_smmu_domain *smmu_domain; - if (type != IOMMU_DOMAIN_UNMANAGED && - type != IOMMU_DOMAIN_DMA && - type != IOMMU_DOMAIN_IDENTITY) - return NULL; + if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_IDENTITY) { + if (using_legacy_binding || type != IOMMU_DOMAIN_DMA) + return NULL; + } /* * Allocate the domain and initialise some of its data structures. * We can't really do anything meaningful until we've added a @@ -874,12 +874,6 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) if (!smmu_domain) return NULL; - if (type == IOMMU_DOMAIN_DMA && (using_legacy_binding || - iommu_get_dma_cookie(&smmu_domain->domain))) { - kfree(smmu_domain); - return NULL; - } - mutex_init(&smmu_domain->init_mutex); spin_lock_init(&smmu_domain->cb_lock); @@ -894,7 +888,6 @@ static void arm_smmu_domain_free(struct iommu_domain *domain) * Free the domain resources. We assume that all devices have * already been detached. 
*/ - iommu_put_dma_cookie(domain); arm_smmu_destroy_domain_context(domain); kfree(smmu_domain); } diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c index 280de92b332e..8bcab8d8ebda 100644 --- a/drivers/iommu/qcom_iommu.c +++ b/drivers/iommu/qcom_iommu.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -329,12 +328,6 @@ static struct iommu_domain *qcom_iommu_domain_alloc(unsigned type) if (!qcom_domain) return NULL; - if (type == IOMMU_DOMAIN_DMA && - iommu_get_dma_cookie(&qcom_domain->domain)) { - kfree(qcom_domain); - return NULL; - } - mutex_init(&qcom_domain->init_mutex); spin_lock_init(&qcom_domain->pgtbl_lock); @@ -345,8 +338,6 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain) { struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); - iommu_put_dma_cookie(domain); - if (qcom_domain->iommu) { /* * NOTE: unmap can be called after client device is powered -- Gitee From cac4888f44dd6460e50c6a9d19a94e7bca40e2bc Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:18 +0100 Subject: [PATCH 0484/2161] iommu/vt-d: Drop IOVA cookie management commit f297e27f831705de50811d933a394969602e8564 upstream. The core code bakes its own cookies now. Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/e9dbe3b6108f8538e17e0c5f59f8feeb714f51a4.1628682048.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index b8756d437df2..57a4e482c45c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1981,10 +1981,6 @@ static void domain_exit(struct dmar_domain *domain) /* Remove associated devices and clear attached or cached domains */ domain_remove_dev_info(domain); - /* destroy iovas */ - if (domain->domain.type == IOMMU_DOMAIN_DMA) - iommu_put_dma_cookie(&domain->domain); - if (domain->pgd) { struct page *freelist; @@ -4594,10 +4590,6 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) return NULL; } - if (type == IOMMU_DOMAIN_DMA && - iommu_get_dma_cookie(&dmar_domain->domain)) - return NULL; - domain = &dmar_domain->domain; domain->geometry.aperture_start = 0; domain->geometry.aperture_end = -- Gitee From 0f059ba39fd40b6301dcaf9aaf23546879f92234 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:25 +0100 Subject: [PATCH 0485/2161] iommu/virtio: Drop IOVA cookie management commit ca84ed7f724cf9f9620a607756482e297c2d8fea upstream. The core code bakes its own cookies now. 
Reviewed-by: Jean-Philippe Brucker Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/f05cd2d0a0f414de3180e2536c7656faf1e52418.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/virtio-iommu.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 60e659a24f90..670231300d17 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -599,12 +599,6 @@ static struct iommu_domain *viommu_domain_alloc(unsigned type) spin_lock_init(&vdomain->mappings_lock); vdomain->mappings = RB_ROOT_CACHED; - if (type == IOMMU_DOMAIN_DMA && - iommu_get_dma_cookie(&vdomain->domain)) { - kfree(vdomain); - return NULL; - } - return &vdomain->domain; } @@ -634,8 +628,6 @@ static void viommu_domain_free(struct iommu_domain *domain) { struct viommu_domain *vdomain = to_viommu_domain(domain); - iommu_put_dma_cookie(domain); - /* Free all remaining mappings (size 2^64) */ viommu_del_mappings(vdomain, 0, 0); -- Gitee From 519871f0e89528b1937f91ac105a06499166bab5 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 18 Aug 2021 21:48:44 +0800 Subject: [PATCH 0486/2161] iommu/vt-d: Update the virtual command related registers commit 4d99efb229e63928c6b03a756a2e38cd4777fbe8 upstream. The VT-d spec Revision 3.3 updated the virtual command registers, virtual command opcode B register, virtual command response register and virtual command capability register (Section 10.4.43, 10.4.44, 10.4.45, 10.4.46). This updates the virtual command interface implementation in the Intel IOMMU driver accordingly. Fixes: 24f27d32ab6b7 ("iommu/vt-d: Enlightened PASID allocation") Signed-off-by: Lu Baolu Cc: Ashok Raj Cc: Sanjay Kumar Cc: Kevin Tian Link: https://lore.kernel.org/r/20210713042649.3547403-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210818134852.1847070-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.h | 10 +++++----- include/linux/intel-iommu.h | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index c11bc8b833b8..d5552e2c160d 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -28,12 +28,12 @@ #define VCMD_CMD_ALLOC 0x1 #define VCMD_CMD_FREE 0x2 #define VCMD_VRSP_IP 0x1 -#define VCMD_VRSP_SC(e) (((e) >> 1) & 0x3) +#define VCMD_VRSP_SC(e) (((e) & 0xff) >> 1) #define VCMD_VRSP_SC_SUCCESS 0 -#define VCMD_VRSP_SC_NO_PASID_AVAIL 2 -#define VCMD_VRSP_SC_INVALID_PASID 2 -#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 8) & 0xfffff) -#define VCMD_CMD_OPERAND(e) ((e) << 8) +#define VCMD_VRSP_SC_NO_PASID_AVAIL 16 +#define VCMD_VRSP_SC_INVALID_PASID 16 +#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 16) & 0xfffff) +#define VCMD_CMD_OPERAND(e) ((e) << 16) /* * Domain ID reserved for pasid entries programmed for first-level * only and pass-through transfer modes. 
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index e5886fd968ab..0eb0cbcafc09 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -124,9 +124,9 @@ #define DMAR_MTRR_PHYSMASK8_REG 0x208 #define DMAR_MTRR_PHYSBASE9_REG 0x210 #define DMAR_MTRR_PHYSMASK9_REG 0x218 -#define DMAR_VCCAP_REG 0xe00 /* Virtual command capability register */ -#define DMAR_VCMD_REG 0xe10 /* Virtual command register */ -#define DMAR_VCRSP_REG 0xe20 /* Virtual command response register */ +#define DMAR_VCCAP_REG 0xe30 /* Virtual command capability register */ +#define DMAR_VCMD_REG 0xe00 /* Virtual command register */ +#define DMAR_VCRSP_REG 0xe10 /* Virtual command response register */ #define DMAR_IQER_REG_IQEI(reg) FIELD_GET(GENMASK_ULL(3, 0), reg) #define DMAR_IQER_REG_ITESID(reg) FIELD_GET(GENMASK_ULL(47, 32), reg) -- Gitee From 2ad6c3d5aaa7560a69d9dbe951d429c9f384d13a Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 18 Aug 2021 21:48:45 +0800 Subject: [PATCH 0487/2161] iommu/vt-d: Remove unnecessary oom message commit 5e41c998949377c80d03610600228022430daf45 upstream. Fixes scripts/checkpatch.pl warning: WARNING: Possible unnecessary 'out of memory' message Remove it can help us save a bit of memory. Signed-off-by: Zhen Lei Link: https://lore.kernel.org/r/20210609124937.14260-1-thunder.leizhen@huawei.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210818134852.1847070-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 2 -- drivers/iommu/intel/iommu.c | 6 +----- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 78866c56496f..2cd3a930d990 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -149,8 +149,6 @@ dmar_alloc_pci_notify_info(struct pci_dev *dev, unsigned long event) } else { info = kzalloc(size, GFP_KERNEL); if (!info) { - pr_warn("Out of memory when allocating notify_info " - "for %s.\n", pci_name(dev)); if (dmar_dev_scope_status == 0) dmar_dev_scope_status = -ENOMEM; return NULL; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 57a4e482c45c..445cbfda4527 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1781,11 +1781,8 @@ static int iommu_init_domains(struct intel_iommu *iommu) spin_lock_init(&iommu->lock); iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL); - if (!iommu->domain_ids) { - pr_err("%s: Allocating domain id array failed\n", - iommu->name); + if (!iommu->domain_ids) return -ENOMEM; - } size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **); iommu->domains = kzalloc(size, GFP_KERNEL); @@ -3226,7 +3223,6 @@ static int __init init_dmars(void) g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *), GFP_KERNEL); if (!g_iommus) { - pr_err("Allocating global iommu array failed\n"); ret = -ENOMEM; goto error; } -- Gitee From 164c79a87864508a853b01b1c61f3af084f74686 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 18 Aug 2021 21:48:46 +0800 Subject: [PATCH 0488/2161] iommu/vt-d: Refactor Kconfig a bit commit 01dac2d9d2364d2a68b37cc1381947ca53413b51 upstream. Put all sub-options inside a "if INTEL_IOMMU" so that they don't need to always depend on INTEL_IOMMU. Use IS_ENABLED() instead of #ifdef as well. 
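For context, IS_ENABLED(CONFIG_FOO) evaluates to 1 when the option is built-in or modular and to 0 otherwise (it comes from <linux/kconfig.h>, which kbuild includes everywhere), so a compile-time default collapses from an #ifdef block into a single initializer. Generic illustration only; CONFIG_FOO is a placeholder:

    /* Old pattern: choose the default with preprocessor conditionals. */
    #ifdef CONFIG_FOO
    static const int foo_default_old = 1;
    #else
    static const int foo_default_old = 0;
    #endif

    /* New pattern: one definition, still resolved at compile time, and
     * IS_ENABLED() can also be used inside ordinary if () conditions. */
    static const int foo_default = IS_ENABLED(CONFIG_FOO);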
Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210720013856.4143880-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210818134852.1847070-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/Kconfig | 18 ++++++++++-------- drivers/iommu/intel/iommu.c | 13 ++----------- 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 8c4a741db7a4..075a18fe2b51 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -24,9 +24,11 @@ config INTEL_IOMMU and include PCI device scope covered by these DMA remapping devices. +if INTEL_IOMMU + config INTEL_IOMMU_DEBUGFS bool "Export Intel IOMMU internals in Debugfs" - depends on INTEL_IOMMU && IOMMU_DEBUGFS + depends on IOMMU_DEBUGFS select DMAR_PERF help !!!WARNING!!! @@ -40,7 +42,7 @@ config INTEL_IOMMU_DEBUGFS config INTEL_IOMMU_SVM bool "Support for Shared Virtual Memory with Intel IOMMU" - depends on INTEL_IOMMU && X86_64 + depends on X86_64 select PCI_PASID select PCI_PRI select MMU_NOTIFIER @@ -52,9 +54,8 @@ config INTEL_IOMMU_SVM means of a Process Address Space ID (PASID). config INTEL_IOMMU_DEFAULT_ON - def_bool y - prompt "Enable Intel DMA Remapping Devices by default" - depends on INTEL_IOMMU + bool "Enable Intel DMA Remapping Devices by default" + default y help Selecting this option will enable a DMAR device at boot time if one is found. If this option is not selected, DMAR support can @@ -62,7 +63,7 @@ config INTEL_IOMMU_DEFAULT_ON config INTEL_IOMMU_BROKEN_GFX_WA bool "Workaround broken graphics drivers (going away soon)" - depends on INTEL_IOMMU && BROKEN && X86 + depends on BROKEN && X86 help Current Graphics drivers tend to use physical address for DMA and avoid using DMA APIs. Setting this config @@ -73,7 +74,7 @@ config INTEL_IOMMU_BROKEN_GFX_WA config INTEL_IOMMU_FLOPPY_WA def_bool y - depends on INTEL_IOMMU && X86 + depends on X86 help Floppy disk drivers are known to bypass DMA API calls thereby failing to work when IOMMU is enabled. This @@ -82,7 +83,6 @@ config INTEL_IOMMU_FLOPPY_WA config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON bool "Enable Intel IOMMU scalable mode by default" - depends on INTEL_IOMMU help Selecting this option will enable by default the scalable mode if hardware presents the capability. The scalable mode is defined in @@ -91,3 +91,5 @@ config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON is not selected, scalable mode support could also be enabled by passing intel_iommu=sm_on to the kernel. If not sure, please use the default value. 
+ +endif # INTEL_IOMMU diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 445cbfda4527..6ddb244ee308 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -328,17 +328,8 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova); -#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON -int dmar_disabled = 0; -#else -int dmar_disabled = 1; -#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */ - -#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON -int intel_iommu_sm = 1; -#else -int intel_iommu_sm; -#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */ +int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); +int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); int intel_iommu_enabled = 0; EXPORT_SYMBOL_GPL(intel_iommu_enabled); -- Gitee From 29254c6d65bd52f1e27d576ff8846a30bd005f32 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 18 Aug 2021 21:48:47 +0800 Subject: [PATCH 0489/2161] iommu/vt-d: Enable Intel IOMMU scalable mode by default commit 792fb43ce2c98878f1539c1d3fdecc1d7a7d78fd upstream. The commit 8950dcd83ae7d ("iommu/vt-d: Leave scalable mode default off") leaves the scalable mode default off and end users could turn it on with "intel_iommu=sm_on". Using the Intel IOMMU scalable mode for kernel DMA, user-level device access and Shared Virtual Address have been enabled. This enables the scalable mode by default if the hardware advertises the support and adds kernel options of "intel_iommu=sm_on/sm_off" for end users to configure it through the kernel parameters. Suggested-by: Ashok Raj Suggested-by: Sanjay Kumar Signed-off-by: Lu Baolu Cc: Kevin Tian Link: https://lore.kernel.org/r/20210720013856.4143880-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210818134852.1847070-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/admin-guide/kernel-parameters.txt | 11 ++++++----- drivers/iommu/intel/Kconfig | 1 + drivers/iommu/intel/iommu.c | 5 ++++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index ea099ff506b8..551e4859b616 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1725,11 +1725,12 @@ By default, super page will be supported if Intel IOMMU has the capability. With this option, super page will not be supported. - sm_on [Default Off] - By default, scalable mode will be disabled even if the - hardware advertises that it has support for the scalable - mode translation. With this option set, scalable mode - will be used on hardware which claims to support it. + sm_on + Enable the Intel IOMMU scalable mode if the hardware + advertises that it has support for the scalable mode + translation. + sm_off + Disallow use of the Intel IOMMU scalable mode. tboot_noforce [Default Off] Do not force the Intel IOMMU enabled under tboot. 
By default, tboot will force Intel IOMMU on, which diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 075a18fe2b51..bb2f780edb4f 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -83,6 +83,7 @@ config INTEL_IOMMU_FLOPPY_WA config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON bool "Enable Intel IOMMU scalable mode by default" + default y help Selecting this option will enable by default the scalable mode if hardware presents the capability. The scalable mode is defined in diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 6ddb244ee308..d9869f711cba 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -434,8 +434,11 @@ static int __init intel_iommu_setup(char *str) pr_info("Disable supported super page\n"); intel_iommu_superpage = 0; } else if (!strncmp(str, "sm_on", 5)) { - pr_info("Intel-IOMMU: scalable mode supported\n"); + pr_info("Enable scalable mode if hardware supports\n"); intel_iommu_sm = 1; + } else if (!strncmp(str, "sm_off", 6)) { + pr_info("Scalable mode is disallowed\n"); + intel_iommu_sm = 0; } else if (!strncmp(str, "tboot_noforce", 13)) { pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); intel_iommu_tboot_noforce = 1; -- Gitee From ac158735ceb9d5ad6979ae884a334b4dcfa058bf Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 18 Aug 2021 21:48:48 +0800 Subject: [PATCH 0490/2161] iommu/vt-d: Preset A/D bits for user space DMA usage commit 289b3b005cb9d9dd6b30297b52c2b4596bc878b2 upstream. We preset the access and dirty bits for IOVA over first level usage only for the kernel DMA (i.e., when domain type is IOMMU_DOMAIN_DMA). We should also preset the FL A/D for user space DMA usage. The idea is that even the user space A/D bit memory write is unnecessary. We should avoid it to minimize the overhead. Suggested-by: Sanjay Kumar Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210720013856.4143880-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210818134852.1847070-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index d9869f711cba..e840427f76ac 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2336,13 +2336,9 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); attr |= DMA_FL_PTE_PRESENT; if (domain_use_first_level(domain)) { - attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US; - - if (domain->domain.type == IOMMU_DOMAIN_DMA) { - attr |= DMA_FL_PTE_ACCESS; - if (prot & DMA_PTE_WRITE) - attr |= DMA_FL_PTE_DIRTY; - } + attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; + if (prot & DMA_PTE_WRITE) + attr |= DMA_FL_PTE_DIRTY; } pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; -- Gitee From 2f589d794b5cf17655da007c36ca808cf0634c31 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 18 Aug 2021 21:48:49 +0800 Subject: [PATCH 0491/2161] iommu/vt-d: Allow devices to have more than 32 outstanding PRs commit 48811c44349ffbb778d3e36b53beb03ad43a979c upstream. The minimum per-IOMMU PRQ queue size is one 4K page, this is more entries than the hardcoded limit of 32 in the current VT-d code. Some devices can support up to 512 outstanding PRQs but underutilized by this limit of 32. 
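For concreteness, a rough worked calculation of the queue capacity involved (descriptor size taken from struct page_req_dsc, which is 32 bytes; the macro values are the ones introduced below):

    /* PRQ_ORDER 0: one 4 KiB page   ->  4096 / 32 = 128 queue slots,
     *              while pci_enable_pri(pdev, 32) capped each device at 32.
     * PRQ_ORDER 2: four 4 KiB pages -> 16384 / 32 = 512 queue slots.
     *
     * PRQ_DEPTH     = (0x1000 << 2) >> 5   = 512    (now passed to pci_enable_pri())
     * PRQ_RING_MASK = (0x1000 << 2) - 0x20 = 0x3fe0 (keeps head/tail register
     *                 offsets on 32-byte descriptor boundaries inside the ring)
     */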
Although, 32 gives some rough fairness when multiple devices share the same IOMMU PRQ queue, but far from optimal for customized use case. This extends the per-IOMMU PRQ queue size to four 4K pages and let the devices have as many outstanding page requests as they can. Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210720013856.4143880-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210818134852.1847070-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 3 ++- drivers/iommu/intel/svm.c | 4 ---- include/linux/intel-svm.h | 5 +++++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e840427f76ac..5367ab008015 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1543,7 +1544,7 @@ static void iommu_enable_dev_iotlb(struct device_domain_info *info) if (info->pri_supported && (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) && - !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) + !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH)) info->pri_enabled = 1; #endif if (info->ats_supported && pci_ats_page_aligned(pdev) && diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 0d1c18af2919..4db589146e40 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -31,8 +31,6 @@ static irqreturn_t prq_event_thread(int irq, void *d); static void intel_svm_drain_prq(struct device *dev, u32 pasid); #define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva) -#define PRQ_ORDER 0 - static DEFINE_XARRAY_ALLOC(pasid_private_array); static int pasid_private_add(ioasid_t pasid, void *priv) { @@ -725,8 +723,6 @@ struct page_req_dsc { u64 priv_data[2]; }; -#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) - static bool is_canonical_address(u64 addr) { int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 10fa80eef13a..57cceecbe37f 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -14,6 +14,11 @@ #define SVM_REQ_EXEC (1<<1) #define SVM_REQ_PRIV (1<<0) +/* Page Request Queue depth */ +#define PRQ_ORDER 2 +#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) +#define PRQ_DEPTH ((0x1000 << PRQ_ORDER) >> 5) + /* * The SVM_FLAG_SUPERVISOR_MODE flag requests a PASID which can be used only * for access to kernel addresses. No IOTLB flushes are automatically done -- Gitee From 32a2823edc51a8024579be28632bcd01faff2d47 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 18 Aug 2021 21:48:50 +0800 Subject: [PATCH 0492/2161] iommu/vt-d: Drop the kernel doc annotation commit 9ddc348214c775ce5d0861e0363182b18a3dd187 upstream. Kernel doc validator is unhappy with the following .../perf.c:16: warning: Function parameter or member 'latency_lock' not described in 'DEFINE_SPINLOCK' .../perf.c:16: warning: expecting prototype for perf.c(). Prototype was for DEFINE_SPINLOCK() instead Drop kernel doc annotation since the top comment is not in the required format. 
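For reference, scripts/kernel-doc only parses comments that open with a double asterisk, and it then expects the "name() - summary" plus @parameter layout; a plain file-header comment should open with a single asterisk so the validator skips it. Illustrative only (example_func is made up):

    /**
     * example_func() - one-line summary in kernel-doc format
     * @arg: what the parameter means
     *
     * Longer description. This block is picked up by scripts/kernel-doc
     * because of the double-asterisk opener; the header comment in perf.c
     * is ordinary prose, so it now opens with a single asterisk instead.
     */
    int example_func(int arg);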
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210729163538.40101-1-andriy.shevchenko@linux.intel.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210818134852.1847070-8-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c index 73b7ec705552..0e8e03252d92 100644 --- a/drivers/iommu/intel/perf.c +++ b/drivers/iommu/intel/perf.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/** +/* * perf.c - performance monitor * * Copyright (C) 2021 Intel Corporation -- Gitee From 1ef3f93296317d681003816f31aaeef82833dde8 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Wed, 18 Aug 2021 21:48:51 +0800 Subject: [PATCH 0493/2161] iommu/vt-d: Use pasid_pte_is_present() helper function commit 8123b0b86855185357e4b12f29e327ba773fc58d upstream. Use the pasid_pte_is_present() helper for present bit check in the intel_pasid_tear_down_entry(). Signed-off-by: Liu Yi L Link: https://lore.kernel.org/r/20210817042425.1784279-1-yi.l.liu@intel.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210818134852.1847070-9-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 9ec374e17469..eda599e70a68 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -517,7 +517,7 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, if (WARN_ON(!pte)) return; - if (!(pte->val[0] & PASID_PTE_PRESENT)) + if (!pasid_pte_is_present(pte)) return; did = pasid_get_domain_id(pte); -- Gitee From 69967df97a9749ca13ef277a9452dda034cbac31 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Wed, 18 Aug 2021 21:48:52 +0800 Subject: [PATCH 0494/2161] iommu/vt-d: Add present bit check in pasid entry setup helpers commit 423d39d8518c1bba12e0889a92beeddbb1502392 upstream. The helper functions should not modify the pasid entries which are still in use. Add a check against present bit. Signed-off-by: Liu Yi L Link: https://lore.kernel.org/r/20210817042425.1784279-1-yi.l.liu@intel.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210818134852.1847070-10-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index eda599e70a68..07c390aed1fe 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -540,6 +540,10 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, devtlb_invalidation_with_pasid(iommu, dev, pasid); } +/* + * This function flushes cache for a newly setup pasid table entry. + * Caller of it should not modify the in-use pasid table entries. + */ static void pasid_flush_caches(struct intel_iommu *iommu, struct pasid_entry *pte, u32 pasid, u16 did) @@ -591,6 +595,10 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, if (WARN_ON(!pte)) return -EINVAL; + /* Caller must ensure PASID entry is not in use. 
*/ + if (pasid_pte_is_present(pte)) + return -EBUSY; + pasid_clear_entry(pte); /* Setup the first level page table pointer: */ @@ -690,6 +698,10 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return -ENODEV; } + /* Caller must ensure PASID entry is not in use. */ + if (pasid_pte_is_present(pte)) + return -EBUSY; + pasid_clear_entry(pte); pasid_set_domain_id(pte, did); pasid_set_slptr(pte, pgd_val); @@ -729,6 +741,10 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return -ENODEV; } + /* Caller must ensure PASID entry is not in use. */ + if (pasid_pte_is_present(pte)) + return -EBUSY; + pasid_clear_entry(pte); pasid_set_domain_id(pte, did); pasid_set_address_width(pte, iommu->agaw); -- Gitee From c353ef6cf7b09d0e81466f44fb63b3e9268e8c9a Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Sat, 28 Aug 2021 15:06:21 +0800 Subject: [PATCH 0495/2161] iommu/vt-d: Fix PASID leak in intel_svm_unbind_mm() commit a21518cb23a3c7d49bafcb59862fd389fd829d4b upstream. The mm->pasid will be used in intel_svm_free_pasid() after load_pasid() during unbinding mm. Clearing it in load_pasid() will cause PASID cannot be freed in intel_svm_free_pasid(). Additionally mm->pasid was updated already before load_pasid() during pasid allocation. No need to update it again in load_pasid() during binding mm. Don't update mm->pasid to avoid the issues in both binding mm and unbinding mm. Fixes: 4048377414162 ("iommu/vt-d: Use iommu_sva_alloc(free)_pasid() helpers") Reported-and-tested-by: Dave Jiang Co-developed-by: Jacob Pan Signed-off-by: Jacob Pan Signed-off-by: Fenghua Yu Link: https://lore.kernel.org/r/20210826215918.4073446-1-fenghua.yu@intel.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210828070622.2437559-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 4db589146e40..21a0fe2c4cad 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -514,9 +514,6 @@ static void load_pasid(struct mm_struct *mm, u32 pasid) { mutex_lock(&mm->context.lock); - /* Synchronize with READ_ONCE in update_pasid(). */ - smp_store_release(&mm->pasid, pasid); - /* Update PASID MSR on all CPUs running the mm's tasks. */ on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true); -- Gitee From 036400d98d0890b1f4c49a8c7f1a8dd01d9f6e95 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Sat, 28 Aug 2021 15:06:22 +0800 Subject: [PATCH 0496/2161] iommu/vt-d: Fix a deadlock in intel_svm_drain_prq() commit 6ef0505158f7ca1b32763a3b038b5d11296b642b upstream. pasid_mutex and dev->iommu->param->lock are held while unbinding mm is flushing IO page fault workqueue and waiting for all page fault works to finish. But an in-flight page fault work also need to hold the two locks while unbinding mm are holding them and waiting for the work to finish. This may cause an ABBA deadlock issue as shown below: idxd 0000:00:0a.0: unbind PASID 2 ====================================================== WARNING: possible circular locking dependency detected 5.14.0-rc7+ #549 Not tainted [ 186.615245] ---------- dsa_test/898 is trying to acquire lock: ffff888100d854e8 (¶m->lock){+.+.}-{3:3}, at: iopf_queue_flush_dev+0x29/0x60 but task is already holding lock: ffffffff82b2f7c8 (pasid_mutex){+.+.}-{3:3}, at: intel_svm_unbind+0x34/0x1e0 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (pasid_mutex){+.+.}-{3:3}: __mutex_lock+0x75/0x730 mutex_lock_nested+0x1b/0x20 intel_svm_page_response+0x8e/0x260 iommu_page_response+0x122/0x200 iopf_handle_group+0x1c2/0x240 process_one_work+0x2a5/0x5a0 worker_thread+0x55/0x400 kthread+0x13b/0x160 ret_from_fork+0x22/0x30 -> #1 (¶m->fault_param->lock){+.+.}-{3:3}: __mutex_lock+0x75/0x730 mutex_lock_nested+0x1b/0x20 iommu_report_device_fault+0xc2/0x170 prq_event_thread+0x28a/0x580 irq_thread_fn+0x28/0x60 irq_thread+0xcf/0x180 kthread+0x13b/0x160 ret_from_fork+0x22/0x30 -> #0 (¶m->lock){+.+.}-{3:3}: __lock_acquire+0x1134/0x1d60 lock_acquire+0xc6/0x2e0 __mutex_lock+0x75/0x730 mutex_lock_nested+0x1b/0x20 iopf_queue_flush_dev+0x29/0x60 intel_svm_drain_prq+0x127/0x210 intel_svm_unbind+0xc5/0x1e0 iommu_sva_unbind_device+0x62/0x80 idxd_cdev_release+0x15a/0x200 [idxd] __fput+0x9c/0x250 ____fput+0xe/0x10 task_work_run+0x64/0xa0 exit_to_user_mode_prepare+0x227/0x230 syscall_exit_to_user_mode+0x2c/0x60 do_syscall_64+0x48/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae other info that might help us debug this: Chain exists of: ¶m->lock --> ¶m->fault_param->lock --> pasid_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(pasid_mutex); lock(¶m->fault_param->lock); lock(pasid_mutex); lock(¶m->lock); *** DEADLOCK *** 2 locks held by dsa_test/898: #0: ffff888100cc1cc0 (&group->mutex){+.+.}-{3:3}, at: iommu_sva_unbind_device+0x53/0x80 #1: ffffffff82b2f7c8 (pasid_mutex){+.+.}-{3:3}, at: intel_svm_unbind+0x34/0x1e0 stack backtrace: CPU: 2 PID: 898 Comm: dsa_test Not tainted 5.14.0-rc7+ #549 Hardware name: Intel Corporation Kabylake Client platform/KBL S DDR4 UD IMM CRB, BIOS KBLSE2R1.R00.X050.P01.1608011715 08/01/2016 Call Trace: dump_stack_lvl+0x5b/0x74 dump_stack+0x10/0x12 print_circular_bug.cold+0x13d/0x142 check_noncircular+0xf1/0x110 __lock_acquire+0x1134/0x1d60 lock_acquire+0xc6/0x2e0 ? iopf_queue_flush_dev+0x29/0x60 ? pci_mmcfg_read+0xde/0x240 __mutex_lock+0x75/0x730 ? iopf_queue_flush_dev+0x29/0x60 ? pci_mmcfg_read+0xfd/0x240 ? iopf_queue_flush_dev+0x29/0x60 mutex_lock_nested+0x1b/0x20 iopf_queue_flush_dev+0x29/0x60 intel_svm_drain_prq+0x127/0x210 ? intel_pasid_tear_down_entry+0x22e/0x240 intel_svm_unbind+0xc5/0x1e0 iommu_sva_unbind_device+0x62/0x80 idxd_cdev_release+0x15a/0x200 pasid_mutex protects pasid and svm data mapping data. It's unnecessary to hold pasid_mutex while flushing the workqueue. To fix the deadlock issue, unlock pasid_pasid during flushing the workqueue to allow the works to be handled. Fixes: d5b9e4bfe0d8 ("iommu/vt-d: Report prq to io-pgfault framework") Reported-and-tested-by: Dave Jiang Signed-off-by: Fenghua Yu Link: https://lore.kernel.org/r/20210826215918.4073446-1-fenghua.yu@intel.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210828070622.2437559-3-baolu.lu@linux.intel.com [joro: Removed timing information from kernel log messages] Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 21a0fe2c4cad..e013198cdba7 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -789,7 +789,19 @@ static void intel_svm_drain_prq(struct device *dev, u32 pasid) goto prq_retry; } + /* + * A work in IO page fault workqueue may try to lock pasid_mutex now. 
+ * Holding pasid_mutex while waiting in iopf_queue_flush_dev() for + * all works in the workqueue to finish may cause deadlock. + * + * It's unnecessary to hold pasid_mutex in iopf_queue_flush_dev(). + * Unlock it to allow the works to be handled while waiting for + * them to finish. + */ + lockdep_assert_held(&pasid_mutex); + mutex_unlock(&pasid_mutex); iopf_queue_flush_dev(dev); + mutex_lock(&pasid_mutex); /* * Perform steps described in VT-d spec CH7.10 to drain page -- Gitee From 54b6c2300c61d33e3049b0aebe6d9ca3b17a96ab Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 22 Sep 2021 13:47:26 +0800 Subject: [PATCH 0497/2161] iommu/vt-d: Drop "0x" prefix from PCI bus & device addresses commit 0b482d0c75bf321b2fd87d215c3d6df095a601d1 upstream. 719a19335692 ("iommu/vt-d: Tweak the description of a DMA fault") changed the DMA fault reason from hex to decimal. It also added "0x" prefixes to the PCI bus/device, e.g., - DMAR: [INTR-REMAP] Request device [00:00.5] + DMAR: [INTR-REMAP] Request device [0x00:0x00.5] These no longer match dev_printk() and other similar messages in dmar_match_pci_path() and dmar_acpi_insert_dev_scope(). Drop the "0x" prefixes from the bus and device addresses. Fixes: 719a19335692 ("iommu/vt-d: Tweak the description of a DMA fault") Signed-off-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20210903193711.483999-1-helgaas@kernel.org Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210922054726.499110-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 2cd3a930d990..f42bf569fabe 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1942,18 +1942,18 @@ static int dmar_fault_do_one(struct intel_iommu *iommu, int type, reason = dmar_get_fault_reason(fault_reason, &fault_type); if (fault_type == INTR_REMAP) - pr_err("[INTR-REMAP] Request device [0x%02x:0x%02x.%d] fault index 0x%llx [fault reason 0x%02x] %s\n", + pr_err("[INTR-REMAP] Request device [%02x:%02x.%d] fault index 0x%llx [fault reason 0x%02x] %s\n", source_id >> 8, PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr >> 48, fault_reason, reason); else if (pasid == INVALID_IOASID) - pr_err("[%s NO_PASID] Request device [0x%02x:0x%02x.%d] fault addr 0x%llx [fault reason 0x%02x] %s\n", + pr_err("[%s NO_PASID] Request device [%02x:%02x.%d] fault addr 0x%llx [fault reason 0x%02x] %s\n", type ? "DMA Read" : "DMA Write", source_id >> 8, PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason); else - pr_err("[%s PASID 0x%x] Request device [0x%02x:0x%02x.%d] fault addr 0x%llx [fault reason 0x%02x] %s\n", + pr_err("[%s PASID 0x%x] Request device [%02x:%02x.%d] fault addr 0x%llx [fault reason 0x%02x] %s\n", type ? "DMA Read" : "DMA Write", pasid, source_id >> 8, PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr, -- Gitee From 42a6c298639f9fc8e23dedc7e85a29434be4587e Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Tue, 15 Sep 2020 09:30:07 -0700 Subject: [PATCH 0498/2161] Documentation/x86: Add documentation for SVA (Shared Virtual Addressing) commit 4e7b11567d946ebe14a3d10b697b078971a9da89 upstream. ENQCMD and Data Streaming Accelerator (DSA) and all of their associated features are a complicated stack with lots of interconnected pieces. This documentation provides a big picture overview for all of the features. 
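As a concrete anchor for the flow the new document describes, here is a minimal driver-side sketch of binding the current process to a device before ENQCMD submissions. Error handling is trimmed, the drvdata argument of iommu_sva_bind_device() is simply passed as NULL, and 'example_bind_current_mm' is not a real kernel function:

    #include <linux/device.h>
    #include <linux/err.h>
    #include <linux/iommu.h>
    #include <linux/sched.h>

    static int example_bind_current_mm(struct device *dev, struct iommu_sva **out)
    {
            struct iommu_sva *handle;

            handle = iommu_sva_bind_device(dev, current->mm, NULL);
            if (IS_ERR(handle))
                    return PTR_ERR(handle);

            /* This PASID is what the kernel programs into the IA32_PASID MSR
             * for the process and what tags its DMA and ENQCMD descriptors. */
            dev_dbg(dev, "bound PASID %u\n", iommu_sva_get_pasid(handle));

            *out = handle;
            return 0;
    }

    /* When the process is done with the device:
     *     iommu_sva_unbind_device(handle);
     * which releases the PASID once the last user goes away. */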
Signed-off-by: Ashok Raj Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/1600187413-163670-4-git-send-email-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/x86/index.rst | 1 + Documentation/x86/sva.rst | 257 ++++++++++++++++++++++++++++++++++++ 2 files changed, 258 insertions(+) create mode 100644 Documentation/x86/sva.rst diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst index 971f30a7d166..849ba0747026 100644 --- a/Documentation/x86/index.rst +++ b/Documentation/x86/index.rst @@ -32,3 +32,4 @@ x86-specific Documentation i386/index x86_64/index sgx + sva diff --git a/Documentation/x86/sva.rst b/Documentation/x86/sva.rst new file mode 100644 index 000000000000..076efd51ef1f --- /dev/null +++ b/Documentation/x86/sva.rst @@ -0,0 +1,257 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================================== +Shared Virtual Addressing (SVA) with ENQCMD +=========================================== + +Background +========== + +Shared Virtual Addressing (SVA) allows the processor and device to use the +same virtual addresses avoiding the need for software to translate virtual +addresses to physical addresses. SVA is what PCIe calls Shared Virtual +Memory (SVM). + +In addition to the convenience of using application virtual addresses +by the device, it also doesn't require pinning pages for DMA. +PCIe Address Translation Services (ATS) along with Page Request Interface +(PRI) allow devices to function much the same way as the CPU handling +application page-faults. For more information please refer to the PCIe +specification Chapter 10: ATS Specification. + +Use of SVA requires IOMMU support in the platform. IOMMU is also +required to support the PCIe features ATS and PRI. ATS allows devices +to cache translations for virtual addresses. The IOMMU driver uses the +mmu_notifier() support to keep the device TLB cache and the CPU cache in +sync. When an ATS lookup fails for a virtual address, the device should +use the PRI in order to request the virtual address to be paged into the +CPU page tables. The device must use ATS again in order to fetch the +translation before use. + +Shared Hardware Workqueues +========================== + +Unlike Single Root I/O Virtualization (SR-IOV), Scalable IOV (SIOV) permits +the use of Shared Work Queues (SWQ) by both applications and Virtual +Machines (VMs). This allows better hardware utilization vs. hard +partitioning resources that could result in under utilization. In order to +allow the hardware to distinguish the context for which work is being +executed in the hardware by the SWQ interface, SIOV uses Process Address Space +ID (PASID), which is a 20-bit number defined by the PCIe SIG. + +PASID value is encoded in all transactions from the device. This allows the +IOMMU to track I/O on a per-PASID granularity in addition to using the PCIe +Resource Identifier (RID) which is the Bus/Device/Function. + + +ENQCMD +====== + +ENQCMD is a new instruction on Intel platforms that atomically submits a +work descriptor to a device. The descriptor includes the operation to be +performed, virtual addresses of all parameters, virtual address of a completion +record, and the PASID (process address space ID) of the current process. + +ENQCMD works with non-posted semantics and carries a status back if the +command was accepted by hardware.
This allows the submitter to know if the +submission needs to be retried or other device specific mechanisms to +implement fairness or ensure forward progress should be provided. + +ENQCMD is the glue that ensures applications can directly submit commands +to the hardware and also permits hardware to be aware of application context +to perform I/O operations via use of PASID. + +Process Address Space Tagging +============================= + +A new thread-scoped MSR (IA32_PASID) provides the connection between +user processes and the rest of the hardware. When an application first +accesses an SVA-capable device, this MSR is initialized with a newly +allocated PASID. The driver for the device calls an IOMMU-specific API +that sets up the routing for DMA and page-requests. + +For example, the Intel Data Streaming Accelerator (DSA) uses +iommu_sva_bind_device(), which will do the following: + +- Allocate the PASID, and program the process page-table (%cr3 register) in the + PASID context entries. +- Register for mmu_notifier() to track any page-table invalidations to keep + the device TLB in sync. For example, when a page-table entry is invalidated, + the IOMMU propagates the invalidation to the device TLB. This will force any + future access by the device to this virtual address to participate in + ATS. If the IOMMU responds with proper response that a page is not + present, the device would request the page to be paged in via the PCIe PRI + protocol before performing I/O. + +This MSR is managed with the XSAVE feature set as "supervisor state" to +ensure the MSR is updated during context switch. + +PASID Management +================ + +The kernel must allocate a PASID on behalf of each process which will use +ENQCMD and program it into the new MSR to communicate the process identity to +platform hardware. ENQCMD uses the PASID stored in this MSR to tag requests +from this process. When a user submits a work descriptor to a device using the +ENQCMD instruction, the PASID field in the descriptor is auto-filled with the +value from MSR_IA32_PASID. Requests for DMA from the device are also tagged +with the same PASID. The platform IOMMU uses the PASID in the transaction to +perform address translation. The IOMMU APIs setup the corresponding PASID +entry in IOMMU with the process address used by the CPU (e.g. %cr3 register in +x86). + +The MSR must be configured on each logical CPU before any application +thread can interact with a device. Threads that belong to the same +process share the same page tables, thus the same MSR value. + +PASID is cleared when a process is created. The PASID allocation and MSR +programming may occur long after a process and its threads have been created. +One thread must call iommu_sva_bind_device() to allocate the PASID for the +process. If a thread uses ENQCMD without the MSR first being populated, a #GP +will be raised. The kernel will update the PASID MSR with the PASID for all +threads in the process. A single process PASID can be used simultaneously +with multiple devices since they all share the same address space. + +One thread can call iommu_sva_unbind_device() to free the allocated PASID. +The kernel will clear the PASID MSR for all threads belonging to the process. + +New threads inherit the MSR value from the parent. + +Relationships +============= + + * Each process has many threads, but only one PASID. + * Devices have a limited number (~10's to 1000's) of hardware workqueues. + The device driver manages allocating hardware workqueues. 
+ * A single mmap() maps a single hardware workqueue as a "portal" and + each portal maps down to a single workqueue. + * For each device with which a process interacts, there must be + one or more mmap()'d portals. + * Many threads within a process can share a single portal to access + a single device. + * Multiple processes can separately mmap() the same portal, in + which case they still share one device hardware workqueue. + * The single process-wide PASID is used by all threads to interact + with all devices. There is not, for instance, a PASID for each + thread or each thread<->device pair. + +FAQ +=== + +* What is SVA/SVM? + +Shared Virtual Addressing (SVA) permits I/O hardware and the processor to +work in the same address space, i.e., to share it. Some call it Shared +Virtual Memory (SVM), but Linux community wanted to avoid confusing it with +POSIX Shared Memory and Secure Virtual Machines which were terms already in +circulation. + +* What is a PASID? + +A Process Address Space ID (PASID) is a PCIe-defined Transaction Layer Packet +(TLP) prefix. A PASID is a 20-bit number allocated and managed by the OS. +PASID is included in all transactions between the platform and the device. + +* How are shared workqueues different? + +Traditionally, in order for userspace applications to interact with hardware, +there is a separate hardware instance required per process. For example, +consider doorbells as a mechanism of informing hardware about work to process. +Each doorbell is required to be spaced 4k (or page-size) apart for process +isolation. This requires hardware to provision that space and reserve it in +MMIO. This doesn't scale as the number of threads becomes quite large. The +hardware also manages the queue depth for Shared Work Queues (SWQ), and +consumers don't need to track queue depth. If there is no space to accept +a command, the device will return an error indicating retry. + +A user should check Deferrable Memory Write (DMWr) capability on the device +and only submits ENQCMD when the device supports it. In the new DMWr PCIe +terminology, devices need to support DMWr completer capability. In addition, +it requires all switch ports to support DMWr routing and must be enabled by +the PCIe subsystem, much like how PCIe atomic operations are managed for +instance. + +SWQ allows hardware to provision just a single address in the device. When +used with ENQCMD to submit work, the device can distinguish the process +submitting the work since it will include the PASID assigned to that +process. This helps the device scale to a large number of processes. + +* Is this the same as a user space device driver? + +Communicating with the device via the shared workqueue is much simpler +than a full blown user space driver. The kernel driver does all the +initialization of the hardware. User space only needs to worry about +submitting work and processing completions. + +* Is this the same as SR-IOV? + +Single Root I/O Virtualization (SR-IOV) focuses on providing independent +hardware interfaces for virtualizing hardware. Hence, it's required to be +almost fully functional interface to software supporting the traditional +BARs, space for interrupts via MSI-X, its own register layout. +Virtual Functions (VFs) are assisted by the Physical Function (PF) +driver. + +Scalable I/O Virtualization builds on the PASID concept to create device +instances for virtualization. 
SIOV requires host software to assist in +creating virtual devices; each virtual device is represented by a PASID +along with the bus/device/function of the device. This allows device +hardware to optimize device resource creation and can grow dynamically on +demand. SR-IOV creation and management is very static in nature. Consult +references below for more details. + +* Why not just create a virtual function for each app? + +Creating PCIe SR-IOV type Virtual Functions (VF) is expensive. VFs require +duplicated hardware for PCI config space and interrupts such as MSI-X. +Resources such as interrupts have to be hard partitioned between VFs at +creation time, and cannot scale dynamically on demand. The VFs are not +completely independent from the Physical Function (PF). Most VFs require +some communication and assistance from the PF driver. SIOV, in contrast, +creates a software-defined device where all the configuration and control +aspects are mediated via the slow path. The work submission and completion +happen without any mediation. + +* Does this support virtualization? + +ENQCMD can be used from within a guest VM. In these cases, the VMM helps +with setting up a translation table to translate from Guest PASID to Host +PASID. Please consult the ENQCMD instruction set reference for more +details. + +* Does memory need to be pinned? + +When devices support SVA along with platform hardware such as IOMMU +supporting such devices, there is no need to pin memory for DMA purposes. +Devices that support SVA also support other PCIe features that remove the +pinning requirement for memory. + +Device TLB support - Device requests the IOMMU to lookup an address before +use via Address Translation Service (ATS) requests. If the mapping exists +but there is no page allocated by the OS, IOMMU hardware returns that no +mapping exists. + +Device requests the virtual address to be mapped via Page Request +Interface (PRI). Once the OS has successfully completed the mapping, it +returns the response back to the device. The device requests again for +a translation and continues. + +IOMMU works with the OS in managing consistency of page-tables with the +device. When removing pages, it interacts with the device to remove any +device TLB entry that might have been cached before removing the mappings from +the OS. + +References +========== + +VT-D: +https://01.org/blogs/ashokraj/2018/recent-enhancements-intel-virtualization-technology-directed-i/o-intel-vt-d + +SIOV: +https://01.org/blogs/2019/assignable-interfaces-intel-scalable-i/o-virtualization-linux + +ENQCMD in ISE: +https://software.intel.com/sites/default/files/managed/c5/15/architecture-instruction-set-extensions-programming-reference.pdf + +DSA spec: +https://software.intel.com/sites/default/files/341204-intel-data-streaming-accelerator-spec.pdf -- Gitee From baf6038e0b79d9d407df588a37650ee051b3985d Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 21 Nov 2019 04:19:08 +0100 Subject: [PATCH 0499/2161] dmaengine: Fix Kconfig indentation commit 67805a4b3c924927d9e064bca235461941f89e4a upstream. 
Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/1574306348-29212-1-git-send-email-krzk@kernel.org Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/Kconfig | 60 ++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 7af874b69ffb..4a34feec93d7 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -15,19 +15,19 @@ menuconfig DMADEVICES be empty in some cases. config DMADEVICES_DEBUG - bool "DMA Engine debugging" - depends on DMADEVICES != n - help - This is an option for use by developers; most people should - say N here. This enables DMA engine core and driver debugging. + bool "DMA Engine debugging" + depends on DMADEVICES != n + help + This is an option for use by developers; most people should + say N here. This enables DMA engine core and driver debugging. config DMADEVICES_VDEBUG - bool "DMA Engine verbose debugging" - depends on DMADEVICES_DEBUG != n - help - This is an option for use by developers; most people should - say N here. This enables deeper (more verbose) debugging of - the DMA engine core and drivers. + bool "DMA Engine verbose debugging" + depends on DMADEVICES_DEBUG != n + help + This is an option for use by developers; most people should + say N here. This enables deeper (more verbose) debugging of + the DMA engine core and drivers. if DMADEVICES @@ -215,28 +215,28 @@ config FSL_EDMA This module can be found on Freescale Vybrid and LS-1 SoCs. config FSL_QDMA - tristate "NXP Layerscape qDMA engine support" - depends on ARM || ARM64 - select DMA_ENGINE - select DMA_VIRTUAL_CHANNELS - select DMA_ENGINE_RAID - select ASYNC_TX_ENABLE_CHANNEL_SWITCH - help - Support the NXP Layerscape qDMA engine with command queue and legacy mode. - Channel virtualization is supported through enqueuing of DMA jobs to, - or dequeuing DMA jobs from, different work queues. - This module can be found on NXP Layerscape SoCs. + tristate "NXP Layerscape qDMA engine support" + depends on ARM || ARM64 + select DMA_ENGINE + select DMA_VIRTUAL_CHANNELS + select DMA_ENGINE_RAID + select ASYNC_TX_ENABLE_CHANNEL_SWITCH + help + Support the NXP Layerscape qDMA engine with command queue and legacy mode. + Channel virtualization is supported through enqueuing of DMA jobs to, + or dequeuing DMA jobs from, different work queues. + This module can be found on NXP Layerscape SoCs. The qdma driver only work on SoCs with a DPAA hardware block. config FSL_RAID - tristate "Freescale RAID engine Support" - depends on FSL_SOC && !ASYNC_TX_ENABLE_CHANNEL_SWITCH - select DMA_ENGINE - select DMA_ENGINE_RAID - ---help--- - Enable support for Freescale RAID Engine. RAID Engine is - available on some QorIQ SoCs (like P5020/P5040). It has - the capability to offload memcpy, xor and pq computation + tristate "Freescale RAID engine Support" + depends on FSL_SOC && !ASYNC_TX_ENABLE_CHANNEL_SWITCH + select DMA_ENGINE + select DMA_ENGINE_RAID + ---help--- + Enable support for Freescale RAID Engine. RAID Engine is + available on some QorIQ SoCs (like P5020/P5040). It has + the capability to offload memcpy, xor and pq computation for raid5/6. 
config IMG_MDC_DMA -- Gitee From 29b87e1cde8c94c6a0aee46c03724729b1203f22 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 16 Dec 2019 12:01:17 -0700 Subject: [PATCH 0500/2161] dmaengine: Call module_put() after device_free_chan_resources() commit 686607106f1fe163f7d017561f3622f39a291de8 upstream. The module reference is taken to ensure the callbacks still exist when they are called. If the channel holds the last reference to the module, the module can disappear before device_free_chan_resources() is called and would cause a call into free'd memory. Signed-off-by: Logan Gunthorpe Link: https://lore.kernel.org/r/20191216190120.21374-3-logang@deltatee.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 4b604086b1b3..776fdf535a3a 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -250,7 +250,6 @@ static void dma_chan_put(struct dma_chan *chan) return; chan->client_count--; - module_put(dma_chan_to_owner(chan)); /* This channel is not in use anymore, free it */ if (!chan->client_count && chan->device->device_free_chan_resources) { @@ -259,6 +258,8 @@ static void dma_chan_put(struct dma_chan *chan) chan->device->device_free_chan_resources(chan); } + module_put(dma_chan_to_owner(chan)); + /* If the channel is used via a DMA request router, free the mapping */ if (chan->router && chan->router->route_free) { chan->router->route_free(chan->router->dev, chan->route_data); -- Gitee From da4932924a40d54a9df5253729ffa689b74e8f6e Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 16 Dec 2019 12:01:18 -0700 Subject: [PATCH 0501/2161] dmaengine: Move dma_channel_rebalance() infrastructure up in code commit 11a0fd2b3baa5e4a97197b9cd990b5d05e69d669 upstream. So it can be called by a release function which is needed higher up in the code. No functional changes intended. 
Signed-off-by: Logan Gunthorpe Link: https://lore.kernel.org/r/20191216190120.21374-4-logang@deltatee.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 288 ++++++++++++++++++++-------------------- 1 file changed, 144 insertions(+), 144 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 776fdf535a3a..1f9a6293f15a 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -164,6 +164,150 @@ static struct class dma_devclass = { /* --- client and device registration --- */ +/** + * dma_cap_mask_all - enable iteration over all operation types + */ +static dma_cap_mask_t dma_cap_mask_all; + +/** + * dma_chan_tbl_ent - tracks channel allocations per core/operation + * @chan - associated channel for this entry + */ +struct dma_chan_tbl_ent { + struct dma_chan *chan; +}; + +/** + * channel_table - percpu lookup table for memory-to-memory offload providers + */ +static struct dma_chan_tbl_ent __percpu *channel_table[DMA_TX_TYPE_END]; + +static int __init dma_channel_table_init(void) +{ + enum dma_transaction_type cap; + int err = 0; + + bitmap_fill(dma_cap_mask_all.bits, DMA_TX_TYPE_END); + + /* 'interrupt', 'private', and 'slave' are channel capabilities, + * but are not associated with an operation so they do not need + * an entry in the channel_table + */ + clear_bit(DMA_INTERRUPT, dma_cap_mask_all.bits); + clear_bit(DMA_PRIVATE, dma_cap_mask_all.bits); + clear_bit(DMA_SLAVE, dma_cap_mask_all.bits); + + for_each_dma_cap_mask(cap, dma_cap_mask_all) { + channel_table[cap] = alloc_percpu(struct dma_chan_tbl_ent); + if (!channel_table[cap]) { + err = -ENOMEM; + break; + } + } + + if (err) { + pr_err("initialization failure\n"); + for_each_dma_cap_mask(cap, dma_cap_mask_all) + free_percpu(channel_table[cap]); + } + + return err; +} +arch_initcall(dma_channel_table_init); + +/** + * dma_chan_is_local - returns true if the channel is in the same numa-node as + * the cpu + */ +static bool dma_chan_is_local(struct dma_chan *chan, int cpu) +{ + int node = dev_to_node(chan->device->dev); + return node == NUMA_NO_NODE || + cpumask_test_cpu(cpu, cpumask_of_node(node)); +} + +/** + * min_chan - returns the channel with min count and in the same numa-node as + * the cpu + * @cap: capability to match + * @cpu: cpu index which the channel should be close to + * + * If some channels are close to the given cpu, the one with the lowest + * reference count is returned. Otherwise, cpu is ignored and only the + * reference count is taken into account. + * Must be called under dma_list_mutex. + */ +static struct dma_chan *min_chan(enum dma_transaction_type cap, int cpu) +{ + struct dma_device *device; + struct dma_chan *chan; + struct dma_chan *min = NULL; + struct dma_chan *localmin = NULL; + + list_for_each_entry(device, &dma_device_list, global_node) { + if (!dma_has_cap(cap, device->cap_mask) || + dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; + list_for_each_entry(chan, &device->channels, device_node) { + if (!chan->client_count) + continue; + if (!min || chan->table_count < min->table_count) + min = chan; + + if (dma_chan_is_local(chan, cpu)) + if (!localmin || + chan->table_count < localmin->table_count) + localmin = chan; + } + } + + chan = localmin ? 
localmin : min; + + if (chan) + chan->table_count++; + + return chan; +} + +/** + * dma_channel_rebalance - redistribute the available channels + * + * Optimize for cpu isolation (each cpu gets a dedicated channel for an + * operation type) in the SMP case, and operation isolation (avoid + * multi-tasking channels) in the non-SMP case. Must be called under + * dma_list_mutex. + */ +static void dma_channel_rebalance(void) +{ + struct dma_chan *chan; + struct dma_device *device; + int cpu; + int cap; + + /* undo the last distribution */ + for_each_dma_cap_mask(cap, dma_cap_mask_all) + for_each_possible_cpu(cpu) + per_cpu_ptr(channel_table[cap], cpu)->chan = NULL; + + list_for_each_entry(device, &dma_device_list, global_node) { + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; + list_for_each_entry(chan, &device->channels, device_node) + chan->table_count = 0; + } + + /* don't populate the channel_table if no clients are available */ + if (!dmaengine_ref_count) + return; + + /* redistribute available channels */ + for_each_dma_cap_mask(cap, dma_cap_mask_all) + for_each_online_cpu(cpu) { + chan = min_chan(cap, cpu); + per_cpu_ptr(channel_table[cap], cpu)->chan = chan; + } +} + #define dma_device_satisfies_mask(device, mask) \ __dma_device_satisfies_mask((device), &(mask)) static int @@ -289,57 +433,6 @@ enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie) } EXPORT_SYMBOL(dma_sync_wait); -/** - * dma_cap_mask_all - enable iteration over all operation types - */ -static dma_cap_mask_t dma_cap_mask_all; - -/** - * dma_chan_tbl_ent - tracks channel allocations per core/operation - * @chan - associated channel for this entry - */ -struct dma_chan_tbl_ent { - struct dma_chan *chan; -}; - -/** - * channel_table - percpu lookup table for memory-to-memory offload providers - */ -static struct dma_chan_tbl_ent __percpu *channel_table[DMA_TX_TYPE_END]; - -static int __init dma_channel_table_init(void) -{ - enum dma_transaction_type cap; - int err = 0; - - bitmap_fill(dma_cap_mask_all.bits, DMA_TX_TYPE_END); - - /* 'interrupt', 'private', and 'slave' are channel capabilities, - * but are not associated with an operation so they do not need - * an entry in the channel_table - */ - clear_bit(DMA_INTERRUPT, dma_cap_mask_all.bits); - clear_bit(DMA_PRIVATE, dma_cap_mask_all.bits); - clear_bit(DMA_SLAVE, dma_cap_mask_all.bits); - - for_each_dma_cap_mask(cap, dma_cap_mask_all) { - channel_table[cap] = alloc_percpu(struct dma_chan_tbl_ent); - if (!channel_table[cap]) { - err = -ENOMEM; - break; - } - } - - if (err) { - pr_err("initialization failure\n"); - for_each_dma_cap_mask(cap, dma_cap_mask_all) - free_percpu(channel_table[cap]); - } - - return err; -} -arch_initcall(dma_channel_table_init); - /** * dma_find_channel - find a channel to carry out the operation * @tx_type: transaction type @@ -370,97 +463,6 @@ void dma_issue_pending_all(void) } EXPORT_SYMBOL(dma_issue_pending_all); -/** - * dma_chan_is_local - returns true if the channel is in the same numa-node as the cpu - */ -static bool dma_chan_is_local(struct dma_chan *chan, int cpu) -{ - int node = dev_to_node(chan->device->dev); - return node == NUMA_NO_NODE || - cpumask_test_cpu(cpu, cpumask_of_node(node)); -} - -/** - * min_chan - returns the channel with min count and in the same numa-node as the cpu - * @cap: capability to match - * @cpu: cpu index which the channel should be close to - * - * If some channels are close to the given cpu, the one with the lowest - * reference count is returned. 
Otherwise, cpu is ignored and only the - * reference count is taken into account. - * Must be called under dma_list_mutex. - */ -static struct dma_chan *min_chan(enum dma_transaction_type cap, int cpu) -{ - struct dma_device *device; - struct dma_chan *chan; - struct dma_chan *min = NULL; - struct dma_chan *localmin = NULL; - - list_for_each_entry(device, &dma_device_list, global_node) { - if (!dma_has_cap(cap, device->cap_mask) || - dma_has_cap(DMA_PRIVATE, device->cap_mask)) - continue; - list_for_each_entry(chan, &device->channels, device_node) { - if (!chan->client_count) - continue; - if (!min || chan->table_count < min->table_count) - min = chan; - - if (dma_chan_is_local(chan, cpu)) - if (!localmin || - chan->table_count < localmin->table_count) - localmin = chan; - } - } - - chan = localmin ? localmin : min; - - if (chan) - chan->table_count++; - - return chan; -} - -/** - * dma_channel_rebalance - redistribute the available channels - * - * Optimize for cpu isolation (each cpu gets a dedicated channel for an - * operation type) in the SMP case, and operation isolation (avoid - * multi-tasking channels) in the non-SMP case. Must be called under - * dma_list_mutex. - */ -static void dma_channel_rebalance(void) -{ - struct dma_chan *chan; - struct dma_device *device; - int cpu; - int cap; - - /* undo the last distribution */ - for_each_dma_cap_mask(cap, dma_cap_mask_all) - for_each_possible_cpu(cpu) - per_cpu_ptr(channel_table[cap], cpu)->chan = NULL; - - list_for_each_entry(device, &dma_device_list, global_node) { - if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) - continue; - list_for_each_entry(chan, &device->channels, device_node) - chan->table_count = 0; - } - - /* don't populate the channel_table if no clients are available */ - if (!dmaengine_ref_count) - return; - - /* redistribute available channels */ - for_each_dma_cap_mask(cap, dma_cap_mask_all) - for_each_online_cpu(cpu) { - chan = min_chan(cap, cpu); - per_cpu_ptr(channel_table[cap], cpu)->chan = chan; - } -} - int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps) { struct dma_device *device; @@ -1376,5 +1378,3 @@ static int __init dma_bus_init(void) return class_register(&dma_devclass); } arch_initcall(dma_bus_init); - - -- Gitee From b16546ec8674e618fad2bfdecf3678cef22e894e Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 16 Dec 2019 12:01:19 -0700 Subject: [PATCH 0502/2161] dmaengine: Add reference counting to dma_device struct commit 8ad342a863590b24ce77681b7e081363fb3333f7 upstream. Adding a reference count helps drivers to properly implement the unbind while in use case. References are taken and put every time a channel is allocated or freed. Once the final reference is put, the device is removed from the dma_device_list and a release callback function is called to signal the driver to free the memory. 
Signed-off-by: Logan Gunthorpe Link: https://lore.kernel.org/r/20191216190120.21374-5-logang@deltatee.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 57 +++++++++++++++++++++++++++++++++------ include/linux/dmaengine.h | 8 +++++- 2 files changed, 56 insertions(+), 9 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 1f9a6293f15a..e316abe3672d 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -342,6 +342,23 @@ static void balance_ref_count(struct dma_chan *chan) } } +static void dma_device_release(struct kref *ref) +{ + struct dma_device *device = container_of(ref, struct dma_device, ref); + + list_del_rcu(&device->global_node); + dma_channel_rebalance(); + + if (device->device_release) + device->device_release(device); +} + +static void dma_device_put(struct dma_device *device) +{ + lockdep_assert_held(&dma_list_mutex); + kref_put(&device->ref, dma_device_release); +} + /** * dma_chan_get - try to grab a dma channel's parent driver module * @chan - channel to grab @@ -362,6 +379,12 @@ static int dma_chan_get(struct dma_chan *chan) if (!try_module_get(owner)) return -ENODEV; + ret = kref_get_unless_zero(&chan->device->ref); + if (!ret) { + ret = -ENODEV; + goto module_put_out; + } + /* allocate upon first client reference */ if (chan->device->device_alloc_chan_resources) { ret = chan->device->device_alloc_chan_resources(chan); @@ -377,6 +400,8 @@ static int dma_chan_get(struct dma_chan *chan) return 0; err_out: + dma_device_put(chan->device); +module_put_out: module_put(owner); return ret; } @@ -402,6 +427,7 @@ static void dma_chan_put(struct dma_chan *chan) chan->device->device_free_chan_resources(chan); } + dma_device_put(chan->device); module_put(dma_chan_to_owner(chan)); /* If the channel is used via a DMA request router, free the mapping */ @@ -837,14 +863,14 @@ EXPORT_SYMBOL(dmaengine_get); */ void dmaengine_put(void) { - struct dma_device *device; + struct dma_device *device, *_d; struct dma_chan *chan; mutex_lock(&dma_list_mutex); dmaengine_ref_count--; BUG_ON(dmaengine_ref_count < 0); /* drop channel references */ - list_for_each_entry(device, &dma_device_list, global_node) { + list_for_each_entry_safe(device, _d, &dma_device_list, global_node) { if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) continue; list_for_each_entry(chan, &device->channels, device_node) @@ -906,6 +932,10 @@ static int get_dma_id(struct dma_device *device) /** * dma_async_device_register - registers DMA devices found * @device: &dma_device + * + * After calling this routine the structure should not be freed except in the + * device_release() callback which will be called after + * dma_async_device_unregister() is called and no further references are taken. 
*/ int dma_async_device_register(struct dma_device *device) { @@ -999,6 +1029,12 @@ int dma_async_device_register(struct dma_device *device) return -EIO; } + if (!device->device_release) + dev_warn(device->dev, + "WARN: Device release is not defined so it is not safe to unbind this driver while in use\n"); + + kref_init(&device->ref); + /* note: this only matters in the * CONFIG_ASYNC_TX_ENABLE_CHANNEL_SWITCH=n case */ @@ -1115,13 +1151,8 @@ void dma_async_device_unregister(struct dma_device *device) { struct dma_chan *chan; - mutex_lock(&dma_list_mutex); - list_del_rcu(&device->global_node); - dma_channel_rebalance(); - mutex_unlock(&dma_list_mutex); - list_for_each_entry(chan, &device->channels, device_node) { - WARN_ONCE(chan->client_count, + WARN_ONCE(!device->device_release && chan->client_count, "%s called while %d clients hold a reference\n", __func__, chan->client_count); mutex_lock(&dma_list_mutex); @@ -1130,6 +1161,16 @@ void dma_async_device_unregister(struct dma_device *device) device_unregister(&chan->dev->device); free_percpu(chan->local); } + + mutex_lock(&dma_list_mutex); + /* + * setting DMA_PRIVATE ensures the device being torn down will not + * be used in the channel_table + */ + dma_cap_set(DMA_PRIVATE, device->cap_mask); + dma_channel_rebalance(); + dma_device_put(device); + mutex_unlock(&dma_list_mutex); } EXPORT_SYMBOL(dma_async_device_unregister); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 8013562751a5..075dba0137be 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -719,9 +719,14 @@ struct dma_filter { * will just return a simple status code * @device_issue_pending: push pending transactions to hardware * @descriptor_reuse: a submitted transfer can be resubmitted after completion + * @device_release: called sometime atfer dma_async_device_unregister() is + * called and there are no further references to this structure. This + * must be implemented to free resources however many existing drivers + * do not and are therefore not safe to unbind while in use. + * */ struct dma_device { - + struct kref ref; unsigned int chancnt; unsigned int privatecnt; struct list_head channels; @@ -802,6 +807,7 @@ struct dma_device { dma_cookie_t cookie, struct dma_tx_state *txstate); void (*device_issue_pending)(struct dma_chan *chan); + void (*device_release)(struct dma_device *dev); }; static inline int dmaengine_slave_config(struct dma_chan *chan, -- Gitee From 4fb219059561da2309dfd6db47f7f21a6e443b42 Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Tue, 24 Dec 2019 10:22:15 +0530 Subject: [PATCH 0503/2161] dmaengine: move module_/dma_device_put() after route free commit 83c77940db12112646a2f74d1d21505788812d7f upstream. We call dma_device_put() and module_put() after invoking .device_free_chan_resources callback, but we should also take care of router devices and invoke this after .route_free callback. 
So move it after .route_free Reviewed-by: Logan Gunthorpe Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index e316abe3672d..0505ea5b002f 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -427,15 +427,15 @@ static void dma_chan_put(struct dma_chan *chan) chan->device->device_free_chan_resources(chan); } - dma_device_put(chan->device); - module_put(dma_chan_to_owner(chan)); - /* If the channel is used via a DMA request router, free the mapping */ if (chan->router && chan->router->route_free) { chan->router->route_free(chan->router->dev, chan->route_data); chan->router = NULL; chan->route_data = NULL; } + + dma_device_put(chan->device); + module_put(dma_chan_to_owner(chan)); } enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie) -- Gitee From 8b09bcc14398b7eb4f1f8d76936fe1d9677480c2 Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Tue, 24 Dec 2019 10:26:14 +0530 Subject: [PATCH 0504/2161] dmaengine: print more meaningful error message commit 08baca4280d8abcf139fa8fec5b3de6f346efbae upstream. error log for dma_channel_table_init() failure pointed a mere "initialization failure", which is not very helpful message, so print additional details like function name and error code. Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 0505ea5b002f..4ac77456e830 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -206,7 +206,7 @@ static int __init dma_channel_table_init(void) } if (err) { - pr_err("initialization failure\n"); + pr_err("dmaengine dma_channel_table_init failure: %d\n", err); for_each_dma_cap_mask(cap, dma_cap_mask_all) free_percpu(channel_table[cap]); } -- Gitee From fc6459c0a249f6458bfdaf824d7d2068a3f05e61 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Mon, 23 Dec 2019 13:04:44 +0200 Subject: [PATCH 0505/2161] dmaengine: Add metadata_ops for dma_async_tx_descriptor commit 4db8fd32ed2be7cc510e51e43ec3349aa64074a9 upstream. The metadata is best described as side band data or parameters traveling alongside the data DMAd by the DMA engine. It is data which is understood by the peripheral and the peripheral driver only, the DMA engine see it only as data block and it is not interpreting it in any way. The metadata can be different per descriptor as it is a parameter for the data being transferred. If the DMA supports per descriptor metadata it can implement the attach, get_ptr/set_len callbacks. Client drivers must only use either attach or get_ptr/set_len to avoid misconfiguration. Client driver can check if a given metadata mode is supported by the channel during probe time with dmaengine_is_metadata_mode_supported(chan, DESC_METADATA_CLIENT); dmaengine_is_metadata_mode_supported(chan, DESC_METADATA_ENGINE); and based on this information can use either mode. Wrappers are also added for the metadata_ops. 
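As a rough client-side illustration of the DESC_METADATA_CLIENT flow (the per-mode wrappers are listed next), here is a hedged sketch; the channel, DMA address and metadata buffer are hypothetical placeholders, while the dmaengine_* calls are the helpers added by this patch plus existing dmaengine APIs:

  #include <linux/dmaengine.h>

  static int queue_tx_with_metadata(struct dma_chan *chan, dma_addr_t buf,
                                    size_t len, void *md, size_t md_len)
  {
          struct dma_async_tx_descriptor *desc;
          int ret;

          if (!dmaengine_is_metadata_mode_supported(chan, DESC_METADATA_CLIENT))
                  return -ENOTSUPP;

          /* 1. prepare the descriptor (metadata already built in 'md') */
          desc = dmaengine_prep_slave_single(chan, buf, len, DMA_MEM_TO_DEV,
                                             DMA_PREP_INTERRUPT);
          if (!desc)
                  return -ENOMEM;

          /* 2. attach the client-owned metadata buffer to the descriptor */
          ret = dmaengine_desc_attach_metadata(desc, md, md_len);
          if (ret)
                  return ret;

          /* 3. submit the transfer */
          dmaengine_submit(desc);
          dma_async_issue_pending(chan);
          return 0;
  }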
To be used in DESC_METADATA_CLIENT mode: dmaengine_desc_attach_metadata() To be used in DESC_METADATA_ENGINE mode: dmaengine_desc_get_metadata_ptr() dmaengine_desc_set_metadata_len() Signed-off-by: Peter Ujfalusi Reviewed-by: Tero Kristo Tested-by: Keerthy Reviewed-by: Grygorii Strashko Link: https://lore.kernel.org/r/20191223110458.30766-5-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 73 +++++++++++++++++++++++++ include/linux/dmaengine.h | 112 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 185 insertions(+) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 4ac77456e830..158aeb1b6a8a 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -1348,6 +1348,79 @@ void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, } EXPORT_SYMBOL(dma_async_tx_descriptor_init); +static inline int desc_check_and_set_metadata_mode( + struct dma_async_tx_descriptor *desc, enum dma_desc_metadata_mode mode) +{ + /* Make sure that the metadata mode is not mixed */ + if (!desc->desc_metadata_mode) { + if (dmaengine_is_metadata_mode_supported(desc->chan, mode)) + desc->desc_metadata_mode = mode; + else + return -ENOTSUPP; + } else if (desc->desc_metadata_mode != mode) { + return -EINVAL; + } + + return 0; +} + +int dmaengine_desc_attach_metadata(struct dma_async_tx_descriptor *desc, + void *data, size_t len) +{ + int ret; + + if (!desc) + return -EINVAL; + + ret = desc_check_and_set_metadata_mode(desc, DESC_METADATA_CLIENT); + if (ret) + return ret; + + if (!desc->metadata_ops || !desc->metadata_ops->attach) + return -ENOTSUPP; + + return desc->metadata_ops->attach(desc, data, len); +} +EXPORT_SYMBOL_GPL(dmaengine_desc_attach_metadata); + +void *dmaengine_desc_get_metadata_ptr(struct dma_async_tx_descriptor *desc, + size_t *payload_len, size_t *max_len) +{ + int ret; + + if (!desc) + return ERR_PTR(-EINVAL); + + ret = desc_check_and_set_metadata_mode(desc, DESC_METADATA_ENGINE); + if (ret) + return ERR_PTR(ret); + + if (!desc->metadata_ops || !desc->metadata_ops->get_ptr) + return ERR_PTR(-ENOTSUPP); + + return desc->metadata_ops->get_ptr(desc, payload_len, max_len); +} +EXPORT_SYMBOL_GPL(dmaengine_desc_get_metadata_ptr); + +int dmaengine_desc_set_metadata_len(struct dma_async_tx_descriptor *desc, + size_t payload_len) +{ + int ret; + + if (!desc) + return -EINVAL; + + ret = desc_check_and_set_metadata_mode(desc, DESC_METADATA_ENGINE); + if (ret) + return ret; + + if (!desc->metadata_ops || !desc->metadata_ops->set_len) + return -ENOTSUPP; + + return desc->metadata_ops->set_len(desc, payload_len); +} +EXPORT_SYMBOL_GPL(dmaengine_desc_set_metadata_len); + /* dma_wait_for_async_tx - spin wait for a transaction to complete * @tx: in-flight transaction to wait on */ diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 075dba0137be..7705b44c3351 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -219,6 +219,62 @@ typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t; * @bytes_transferred: byte counter */ +/** + * enum dma_desc_metadata_mode - per descriptor metadata mode types supported + * @DESC_METADATA_CLIENT - the metadata buffer is allocated/provided by the + * client driver and it is attached (via the dmaengine_desc_attach_metadata() + * helper) to the descriptor. + * + * Client drivers interested to use this mode can follow: + * - DMA_MEM_TO_DEV / DEV_MEM_TO_MEM: + * 1. 
prepare the descriptor (dmaengine_prep_*) + * construct the metadata in the client's buffer + * 2. use dmaengine_desc_attach_metadata() to attach the buffer to the + * descriptor + * 3. submit the transfer + * - DMA_DEV_TO_MEM: + * 1. prepare the descriptor (dmaengine_prep_*) + * 2. use dmaengine_desc_attach_metadata() to attach the buffer to the + * descriptor + * 3. submit the transfer + * 4. when the transfer is completed, the metadata should be available in the + * attached buffer + * + * @DESC_METADATA_ENGINE - the metadata buffer is allocated/managed by the DMA + * driver. The client driver can ask for the pointer, maximum size and the + * currently used size of the metadata and can directly update or read it. + * dmaengine_desc_get_metadata_ptr() and dmaengine_desc_set_metadata_len() is + * provided as helper functions. + * + * Note: the metadata area for the descriptor is no longer valid after the + * transfer has been completed (valid up to the point when the completion + * callback returns if used). + * + * Client drivers interested to use this mode can follow: + * - DMA_MEM_TO_DEV / DEV_MEM_TO_MEM: + * 1. prepare the descriptor (dmaengine_prep_*) + * 2. use dmaengine_desc_get_metadata_ptr() to get the pointer to the engine's + * metadata area + * 3. update the metadata at the pointer + * 4. use dmaengine_desc_set_metadata_len() to tell the DMA engine the amount + * of data the client has placed into the metadata buffer + * 5. submit the transfer + * - DMA_DEV_TO_MEM: + * 1. prepare the descriptor (dmaengine_prep_*) + * 2. submit the transfer + * 3. on transfer completion, use dmaengine_desc_get_metadata_ptr() to get the + * pointer to the engine's metadata area + * 4. Read out the metadata from the pointer + * + * Note: the two mode is not compatible and clients must use one mode for a + * descriptor. + */ +enum dma_desc_metadata_mode { + DESC_METADATA_NONE = 0, + DESC_METADATA_CLIENT = BIT(0), + DESC_METADATA_ENGINE = BIT(1), +}; + struct dma_chan_percpu { /* stats */ unsigned long memcpy_count; @@ -475,6 +531,18 @@ struct dmaengine_unmap_data { dma_addr_t addr[0]; }; +struct dma_async_tx_descriptor; + +struct dma_descriptor_metadata_ops { + int (*attach)(struct dma_async_tx_descriptor *desc, void *data, + size_t len); + + void *(*get_ptr)(struct dma_async_tx_descriptor *desc, + size_t *payload_len, size_t *max_len); + int (*set_len)(struct dma_async_tx_descriptor *desc, + size_t payload_len); +}; + /** * struct dma_async_tx_descriptor - async transaction descriptor * ---dma generic offload fields--- @@ -488,6 +556,11 @@ struct dmaengine_unmap_data { * descriptor pending. To be pushed on .issue_pending() call * @callback: routine to call after this operation is complete * @callback_param: general parameter to pass to the callback routine + * @desc_metadata_mode: core managed metadata mode to protect mixed use of + * DESC_METADATA_CLIENT or DESC_METADATA_ENGINE. 
Otherwise + * DESC_METADATA_NONE + * @metadata_ops: DMA driver provided metadata mode ops, need to be set by the + * DMA driver if metadata mode is supported with the descriptor * ---async_tx api specific fields--- * @next: at completion submit this descriptor * @parent: pointer to the next level up in the dependency chain @@ -504,6 +577,8 @@ struct dma_async_tx_descriptor { dma_async_tx_callback_result callback_result; void *callback_param; struct dmaengine_unmap_data *unmap; + enum dma_desc_metadata_mode desc_metadata_mode; + struct dma_descriptor_metadata_ops *metadata_ops; #ifdef CONFIG_ASYNC_TX_ENABLE_CHANNEL_SWITCH struct dma_async_tx_descriptor *next; struct dma_async_tx_descriptor *parent; @@ -666,6 +741,7 @@ struct dma_filter { * @global_node: list_head for global dma_device_list * @filter: information for device/slave to filter function/param mapping * @cap_mask: one or more dma_capability flags + * @desc_metadata_modes: supported metadata modes by the DMA device * @max_xor: maximum number of xor sources, 0 if no capability * @max_pq: maximum number of PQ sources and PQ-continue capability * @copy_align: alignment shift for memcpy operations @@ -733,6 +809,7 @@ struct dma_device { struct list_head global_node; struct dma_filter filter; dma_cap_mask_t cap_mask; + enum dma_desc_metadata_mode desc_metadata_modes; unsigned short max_xor; unsigned short max_pq; enum dmaengine_alignment copy_align; @@ -910,6 +987,41 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memcpy( len, flags); } +static inline bool dmaengine_is_metadata_mode_supported(struct dma_chan *chan, + enum dma_desc_metadata_mode mode) +{ + if (!chan) + return false; + + return !!(chan->device->desc_metadata_modes & mode); +} + +#ifdef CONFIG_DMA_ENGINE +int dmaengine_desc_attach_metadata(struct dma_async_tx_descriptor *desc, + void *data, size_t len); +void *dmaengine_desc_get_metadata_ptr(struct dma_async_tx_descriptor *desc, + size_t *payload_len, size_t *max_len); +int dmaengine_desc_set_metadata_len(struct dma_async_tx_descriptor *desc, + size_t payload_len); +#else /* CONFIG_DMA_ENGINE */ +static inline int dmaengine_desc_attach_metadata( + struct dma_async_tx_descriptor *desc, void *data, size_t len) +{ + return -EINVAL; +} +static inline void *dmaengine_desc_get_metadata_ptr( + struct dma_async_tx_descriptor *desc, size_t *payload_len, + size_t *max_len) +{ + return NULL; +} +static inline int dmaengine_desc_set_metadata_len( + struct dma_async_tx_descriptor *desc, size_t payload_len) +{ + return -EINVAL; +} +#endif /* CONFIG_DMA_ENGINE */ + /** * dmaengine_terminate_all() - Terminate all active DMA transfers * @chan: The channel for which to terminate the transfers -- Gitee From 5c1c0f96ce64f8778949640595a879c57f69984e Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Mon, 23 Dec 2019 13:04:45 +0200 Subject: [PATCH 0506/2161] dmaengine: Add support for reporting DMA cached data amount commit 6755ec06d1333765d2b935e4e4a5bd011332bac6 upstream. A DMA hardware can have big cache or FIFO and the amount of data sitting in the DMA fabric can be an interest for the clients. 
For example, in audio we want to know the delay in the data flow, and if the DMA has a significantly large FIFO/cache, it can affect the latency/delay. Signed-off-by: Peter Ujfalusi Reviewed-by: Tero Kristo Tested-by: Keerthy Reviewed-by: Grygorii Strashko Link: https://lore.kernel.org/r/20191223110458.30766-6-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.h | 8 ++++++++ include/linux/dmaengine.h | 2 ++ 2 files changed, 10 insertions(+) diff --git a/drivers/dma/dmaengine.h b/drivers/dma/dmaengine.h index 501c0b063f85..b0b97475707a 100644 --- a/drivers/dma/dmaengine.h +++ b/drivers/dma/dmaengine.h @@ -77,6 +77,7 @@ static inline enum dma_status dma_cookie_status(struct dma_chan *chan, state->last = complete; state->used = used; state->residue = 0; + state->in_flight_bytes = 0; } return dma_async_is_complete(cookie, complete, used); } @@ -87,6 +88,13 @@ static inline void dma_set_residue(struct dma_tx_state *state, u32 residue) state->residue = residue; } +static inline void dma_set_in_flight_bytes(struct dma_tx_state *state, + u32 in_flight_bytes) +{ + if (state) + state->in_flight_bytes = in_flight_bytes; +} + struct dmaengine_desc_callback { dma_async_tx_callback callback; dma_async_tx_callback_result callback_result; diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 7705b44c3351..4e759c1850d5 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -686,11 +686,13 @@ static inline struct dma_async_tx_descriptor *txd_next(struct dma_async_tx_descr * @residue: the remaining number of bytes left to transmit * on the selected transfer for states DMA_IN_PROGRESS and * DMA_PAUSED if this is implemented in the driver, else 0 + * @in_flight_bytes: amount of data in bytes cached by the DMA. */ struct dma_tx_state { dma_cookie_t last; dma_cookie_t used; u32 residue; + u32 in_flight_bytes; }; /** -- Gitee From 749bcdf9764f3bbc763b06678641028485fd4080 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 21 Jan 2020 10:33:09 +0100 Subject: [PATCH 0507/2161] dmaengine: Remove dma_device_satisfies_mask() wrapper commit 69b1189ba2cd6643474312004f10685324e38f58 upstream. Commit aa1e6f1a385eb2b0 ("dmaengine: kill struct dma_client and supporting infrastructure") removed the last user of the dma_device_satisfies_mask() wrapper. Remove the wrapper, and rename __dma_device_satisfies_mask() to dma_device_satisfies_mask(), to get rid of one more function starting with a double underscore.
Signed-off-by: Geert Uytterhoeven Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20200121093311.28639-2-geert+renesas@glider.be Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 158aeb1b6a8a..7550dbdf5488 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -308,11 +308,8 @@ static void dma_channel_rebalance(void) } } -#define dma_device_satisfies_mask(device, mask) \ - __dma_device_satisfies_mask((device), &(mask)) -static int -__dma_device_satisfies_mask(struct dma_device *device, - const dma_cap_mask_t *want) +static int dma_device_satisfies_mask(struct dma_device *device, + const dma_cap_mask_t *want) { dma_cap_mask_t has; @@ -531,7 +528,7 @@ static struct dma_chan *private_candidate(const dma_cap_mask_t *mask, { struct dma_chan *chan; - if (mask && !__dma_device_satisfies_mask(dev, mask)) { + if (mask && !dma_device_satisfies_mask(dev, mask)) { dev_dbg(dev->dev, "%s: wrong capabilities\n", __func__); return NULL; } -- Gitee From 701def24eb2bd98c6d48afd56a45259d276fcb78 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 21 Jan 2020 10:33:11 +0100 Subject: [PATCH 0508/2161] dmaengine: Move dma_get_{,any_}slave_channel() to private dmaengine.h commit c3c431de99c068e3f64d01335c1532b22e4b1d1b upstream. The functions dma_get_slave_channel() and dma_get_any_slave_channel() are called from DMA engine drivers only. Hence move their declarations from the public header file to the private header file drivers/dma/dmaengine.h. Signed-off-by: Geert Uytterhoeven Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20200121093311.28639-4-geert+renesas@glider.be Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.h | 3 +++ drivers/dma/of-dma.c | 2 ++ include/linux/dmaengine.h | 2 -- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/dma/dmaengine.h b/drivers/dma/dmaengine.h index b0b97475707a..e8a320c9e57c 100644 --- a/drivers/dma/dmaengine.h +++ b/drivers/dma/dmaengine.h @@ -179,4 +179,7 @@ dmaengine_desc_callback_valid(struct dmaengine_desc_callback *cb) return (cb->callback) ? 
true : false; } +struct dma_chan *dma_get_slave_channel(struct dma_chan *chan); +struct dma_chan *dma_get_any_slave_channel(struct dma_device *device); + #endif diff --git a/drivers/dma/of-dma.c b/drivers/dma/of-dma.c index 4bbf4172b9bf..0db816eb8080 100644 --- a/drivers/dma/of-dma.c +++ b/drivers/dma/of-dma.c @@ -15,6 +15,8 @@ #include #include +#include "dmaengine.h" + static LIST_HEAD(of_dma_list); static DEFINE_MUTEX(of_dma_lock); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 4e759c1850d5..027fe23982e5 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1525,8 +1525,6 @@ int dma_async_device_register(struct dma_device *device); int dmaenginem_async_device_register(struct dma_device *device); void dma_async_device_unregister(struct dma_device *device); void dma_run_dependencies(struct dma_async_tx_descriptor *tx); -struct dma_chan *dma_get_slave_channel(struct dma_chan *chan); -struct dma_chan *dma_get_any_slave_channel(struct dma_device *device); #define dma_request_channel(mask, x, y) \ __dma_request_channel(&(mask), x, y, NULL) #define dma_request_slave_channel_compat(mask, x, y, dev, name) \ -- Gitee From 38c6165b0f71cd01652cd151315abb827d0c3009 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 21 Jan 2020 16:43:41 -0700 Subject: [PATCH 0509/2161] x86/asm: add iosubmit_cmds512() based on MOVDIR64B CPU instruction commit 232bb01bb8ad2f88aefbd88b0d3fe3f9a253502b upstream. With the introduction of MOVDIR64B instruction, there is now an instruction that can write 64 bytes of data atomically. Quoting from Intel SDM: "There is no atomicity guarantee provided for the 64-byte load operation from source address, and processor implementations may use multiple load operations to read the 64-bytes. The 64-byte direct-store issued by MOVDIR64B guarantees 64-byte write-completion atomicity. This means that the data arrives at the destination in a single undivided 64-byte write transaction." We have identified at least 3 different use cases for this instruction in the format of func(dst, src, count): 1) Clear poison / Initialize MKTME memory @dst is normal memory. @src in normal memory. Does not increment. (Copy same line to all targets) @count (to clear/init multiple lines) 2) Submit command(s) to new devices @dst is a special MMIO region for a device. Does not increment. @src is normal memory. Increments. @count usually is 1, but can be multiple. 3) Copy to iomem in big chunks @dst is iomem and increments @src in normal memory and increments @count is number of chunks to copy Add support for case #2 to support device that will accept commands via this instruction. We provide a @count in order to submit a batch of preprogrammed descriptors in virtually contiguous memory. This allows the caller to submit multiple descriptors to a device with a single submission. The special device requires the entire 64bytes descriptor to be written atomically and will accept MOVDIR64B instruction. 
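Since the helper itself does not verify CPU support, the caller is expected to gate submission on the MOVDIR64B feature bit. A minimal usage sketch for use case #2 follows; the descriptor layout, the portal mapping and submit_batch() are hypothetical, while iosubmit_cmds512() and X86_FEATURE_MOVDIR64B are real kernel symbols:

  #include <linux/types.h>
  #include <asm/cpufeature.h>
  #include <asm/io.h>

  struct my_desc {                           /* hypothetical 64-byte descriptor */
          u8 bytes[64];
  } __aligned(64);

  /* 'portal' must be a 512-bit aligned MMIO mapping of the device register. */
  static int submit_batch(void __iomem *portal, const struct my_desc *descs,
                          size_t count)
  {
          if (!boot_cpu_has(X86_FEATURE_MOVDIR64B))
                  return -ENODEV;

          /* Writes 'count' descriptors, each as one atomic 64-byte store. */
          iosubmit_cmds512(portal, descs, count);
          return 0;
  }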
Signed-off-by: Dave Jiang Acked-by: Borislav Petkov Link: https://lore.kernel.org/r/157965022175.73301.10174614665472962675.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/io.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 6bed97ff6db2..29a337d173f0 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -404,4 +404,40 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset, extern bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size); +/** + * iosubmit_cmds512 - copy data to single MMIO location, in 512-bit units + * @__dst: destination, in MMIO space (must be 512-bit aligned) + * @src: source + * @count: number of 512 bits quantities to submit + * + * Submit data from kernel space to MMIO space, in units of 512 bits at a + * time. Order of access is not guaranteed, nor is a memory barrier + * performed afterwards. + * + * Warning: Do not use this helper unless your driver has checked that the CPU + * instruction is supported on the platform. + */ +static inline void iosubmit_cmds512(void __iomem *__dst, const void *src, + size_t count) +{ + /* + * Note that this isn't an "on-stack copy", just definition of "dst" + * as a pointer to 64-bytes of stuff that is going to be overwritten. + * In the MOVDIR64B case that may be needed as you can use the + * MOVDIR64B instruction to copy arbitrary memory around. This trick + * lets the compiler know how much gets clobbered. + */ + volatile struct { char _[64]; } *dst = __dst; + const u8 *from = src; + const u8 *end = from + count * 64; + + while (from < end) { + /* MOVDIR64B [rdx], rax */ + asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02" + : "=m" (dst) + : "d" (from), "a" (dst)); + from += 64; + } +} + #endif /* _ASM_X86_IO_H */ -- Gitee From a46b3257647e2cf3ed33a81b5b6f54527924f031 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 21 Jan 2020 16:43:47 -0700 Subject: [PATCH 0510/2161] dmaengine: break out channel registration commit d2fb0a0438384fee08a418025f743913020033ce upstream. In preparation for dynamic channel registration, the code segment that does the channel registration is broken out to its own function. 
Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/157965022778.73301.8929944324898985438.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 135 ++++++++++++++++++++++++---------------- 1 file changed, 81 insertions(+), 54 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 7550dbdf5488..2daf2ee9bebd 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -926,6 +926,79 @@ static int get_dma_id(struct dma_device *device) return 0; } +static int __dma_async_device_channel_register(struct dma_device *device, + struct dma_chan *chan, + int chan_id) +{ + int rc = 0; + int chancnt = device->chancnt; + atomic_t *idr_ref; + struct dma_chan *tchan; + + tchan = list_first_entry_or_null(&device->channels, + struct dma_chan, device_node); + if (tchan->dev) { + idr_ref = tchan->dev->idr_ref; + } else { + idr_ref = kmalloc(sizeof(*idr_ref), GFP_KERNEL); + if (!idr_ref) + return -ENOMEM; + atomic_set(idr_ref, 0); + } + + chan->local = alloc_percpu(typeof(*chan->local)); + if (!chan->local) + goto err_out; + chan->dev = kzalloc(sizeof(*chan->dev), GFP_KERNEL); + if (!chan->dev) { + free_percpu(chan->local); + chan->local = NULL; + goto err_out; + } + + /* + * When the chan_id is a negative value, we are dynamically adding + * the channel. Otherwise we are static enumerating. + */ + chan->chan_id = chan_id < 0 ? chancnt : chan_id; + chan->dev->device.class = &dma_devclass; + chan->dev->device.parent = device->dev; + chan->dev->chan = chan; + chan->dev->idr_ref = idr_ref; + chan->dev->dev_id = device->dev_id; + atomic_inc(idr_ref); + dev_set_name(&chan->dev->device, "dma%dchan%d", + device->dev_id, chan->chan_id); + + rc = device_register(&chan->dev->device); + if (rc) + goto err_out; + chan->client_count = 0; + device->chancnt = chan->chan_id + 1; + + return 0; + + err_out: + free_percpu(chan->local); + kfree(chan->dev); + if (atomic_dec_return(idr_ref) == 0) + kfree(idr_ref); + return rc; +} + +static void __dma_async_device_channel_unregister(struct dma_device *device, + struct dma_chan *chan) +{ + WARN_ONCE(!device->device_release && chan->client_count, + "%s called while %d clients hold a reference\n", + __func__, chan->client_count); + mutex_lock(&dma_list_mutex); + chan->dev->chan = NULL; + mutex_unlock(&dma_list_mutex); + device_unregister(&chan->dev->device); + free_percpu(chan->local); +} + /** * dma_async_device_register - registers DMA devices found * @device: &dma_device @@ -936,9 +1009,8 @@ static int get_dma_id(struct dma_device *device) */ int dma_async_device_register(struct dma_device *device) { - int chancnt = 0, rc; + int rc, i = 0; struct dma_chan* chan; - atomic_t *idr_ref; if (!device) return -ENODEV; @@ -1038,59 +1110,23 @@ int dma_async_device_register(struct dma_device *device) if (device_has_all_tx_types(device)) dma_cap_set(DMA_ASYNC_TX, device->cap_mask); - idr_ref = kmalloc(sizeof(*idr_ref), GFP_KERNEL); - if (!idr_ref) - return -ENOMEM; rc = get_dma_id(device); - if (rc != 0) { - kfree(idr_ref); + if (rc != 0) return rc; - } - - atomic_set(idr_ref, 0); /* represent channels in sysfs. 
Probably want devs too */ list_for_each_entry(chan, &device->channels, device_node) { - rc = -ENOMEM; - chan->local = alloc_percpu(typeof(*chan->local)); - if (chan->local == NULL) + rc = __dma_async_device_channel_register(device, chan, i++); + if (rc < 0) goto err_out; - chan->dev = kzalloc(sizeof(*chan->dev), GFP_KERNEL); - if (chan->dev == NULL) { - free_percpu(chan->local); - chan->local = NULL; - goto err_out; - } - - chan->chan_id = chancnt++; - chan->dev->device.class = &dma_devclass; - chan->dev->device.parent = device->dev; - chan->dev->chan = chan; - chan->dev->idr_ref = idr_ref; - chan->dev->dev_id = device->dev_id; - atomic_inc(idr_ref); - dev_set_name(&chan->dev->device, "dma%dchan%d", - device->dev_id, chan->chan_id); - - rc = device_register(&chan->dev->device); - if (rc) { - free_percpu(chan->local); - chan->local = NULL; - kfree(chan->dev); - atomic_dec(idr_ref); - goto err_out; - } - chan->client_count = 0; } - if (!chancnt) { + if (!device->chancnt) { dev_err(device->dev, "%s: device has no channels!\n", __func__); rc = -ENODEV; goto err_out; } - device->chancnt = chancnt; - mutex_lock(&dma_list_mutex); /* take references on public channels */ if (dmaengine_ref_count && !dma_has_cap(DMA_PRIVATE, device->cap_mask)) @@ -1118,9 +1154,8 @@ int dma_async_device_register(struct dma_device *device) err_out: /* if we never registered a channel just release the idr */ - if (atomic_read(idr_ref) == 0) { + if (!device->chancnt) { ida_free(&dma_ida, device->dev_id); - kfree(idr_ref); return rc; } @@ -1148,16 +1183,8 @@ void dma_async_device_unregister(struct dma_device *device) { struct dma_chan *chan; - list_for_each_entry(chan, &device->channels, device_node) { - WARN_ONCE(!device->device_release && chan->client_count, - "%s called while %d clients hold a reference\n", - __func__, chan->client_count); - mutex_lock(&dma_list_mutex); - chan->dev->chan = NULL; - mutex_unlock(&dma_list_mutex); - device_unregister(&chan->dev->device); - free_percpu(chan->local); - } + list_for_each_entry(chan, &device->channels, device_node) + __dma_async_device_channel_unregister(device, chan); mutex_lock(&dma_list_mutex); /* -- Gitee From fac7e977219e6f91ec5a5209c4bd0f41a54bc21e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 21 Jan 2020 16:43:53 -0700 Subject: [PATCH 0511/2161] dmaengine: add support to dynamic register/unregister of channels commit e81274cd6b5264809384066e09a5253708822522 upstream. With the channel registration routines broken out, now add support code to allow independent registering and unregistering of channels in a hotplug fashion. 
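A brief usage sketch of the two new exported entry points, assuming a hypothetical driver that discovers and retires queues at runtime; only dma_async_device_channel_register()/dma_async_device_channel_unregister() come from this patch, everything else is illustrative:

        #include <linux/dmaengine.h>

        /* hot-add: @dma must already be registered so a channel sysfs entry can be created */
        static int mydma_add_channel(struct dma_device *dma, struct dma_chan *chan)
        {
                chan->device = dma;
                list_add_tail(&chan->device_node, &dma->channels);
                return dma_async_device_channel_register(dma, chan);
        }

        /* hot-remove: unregister drops the channel from the list and rebalances clients */
        static void mydma_del_channel(struct dma_device *dma, struct dma_chan *chan)
        {
                dma_async_device_channel_unregister(dma, chan);
        }
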
Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/157965023364.73301.7821862091077299040.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 34 ++++++++++++++++++++++++++-------- include/linux/dmaengine.h | 4 ++++ 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 2daf2ee9bebd..51a2f2b1b2de 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -986,6 +986,20 @@ static int __dma_async_device_channel_register(struct dma_device *device, return rc; } +int dma_async_device_channel_register(struct dma_device *device, + struct dma_chan *chan) +{ + int rc; + + rc = __dma_async_device_channel_register(device, chan, -1); + if (rc < 0) + return rc; + + dma_channel_rebalance(); + return 0; +} +EXPORT_SYMBOL_GPL(dma_async_device_channel_register); + static void __dma_async_device_channel_unregister(struct dma_device *device, struct dma_chan *chan) { @@ -993,12 +1007,22 @@ static void __dma_async_device_channel_unregister(struct dma_device *device, "%s called while %d clients hold a reference\n", __func__, chan->client_count); mutex_lock(&dma_list_mutex); + list_del(&chan->device_node); + device->chancnt--; chan->dev->chan = NULL; mutex_unlock(&dma_list_mutex); device_unregister(&chan->dev->device); free_percpu(chan->local); } +void dma_async_device_channel_unregister(struct dma_device *device, + struct dma_chan *chan) +{ + __dma_async_device_channel_unregister(device, chan); + dma_channel_rebalance(); +} +EXPORT_SYMBOL_GPL(dma_async_device_channel_unregister); + /** * dma_async_device_register - registers DMA devices found * @device: &dma_device @@ -1121,12 +1145,6 @@ int dma_async_device_register(struct dma_device *device) goto err_out; } - if (!device->chancnt) { - dev_err(device->dev, "%s: device has no channels!\n", __func__); - rc = -ENODEV; - goto err_out; - } - mutex_lock(&dma_list_mutex); /* take references on public channels */ if (dmaengine_ref_count && !dma_has_cap(DMA_PRIVATE, device->cap_mask)) @@ -1181,9 +1199,9 @@ EXPORT_SYMBOL(dma_async_device_register); */ void dma_async_device_unregister(struct dma_device *device) { - struct dma_chan *chan; + struct dma_chan *chan, *n; - list_for_each_entry(chan, &device->channels, device_node) + list_for_each_entry_safe(chan, n, &device->channels, device_node) __dma_async_device_channel_unregister(device, chan); mutex_lock(&dma_list_mutex); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 027fe23982e5..4ad415905e12 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1524,6 +1524,10 @@ static inline int dmaengine_desc_free(struct dma_async_tx_descriptor *desc) int dma_async_device_register(struct dma_device *device); int dmaenginem_async_device_register(struct dma_device *device); void dma_async_device_unregister(struct dma_device *device); +int dma_async_device_channel_register(struct dma_device *device, + struct dma_chan *chan); +void dma_async_device_channel_unregister(struct dma_device *device, + struct dma_chan *chan); void dma_run_dependencies(struct dma_async_tx_descriptor *tx); #define dma_request_channel(mask, x, y) \ __dma_request_channel(&(mask), x, y, NULL) -- Gitee From e973463ecd737c0b5ac3dc8489b58a7bc98a598e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 21 Jan 2020 16:43:59 -0700 Subject: [PATCH 0512/2161] dmaengine: idxd: Init and probe for Intel data accelerators commit 
bfe1d56091c1a404b3d4ce7e9809d745fc4453bb upstream. The idxd driver introduces the Intel Data Stream Accelerator [1] that will be available on future Intel Xeon CPUs. One of the kernel access point for the driver is through the dmaengine subsystem. It will initially provide the DMA copy service to the kernel. Some of the main functionality introduced with this accelerator are: shared virtual memory (SVM) support, and descriptor submission using Intel CPU instructions movdir64b and enqcmds. There will be additional accelerator devices that share the same driver with variations to capabilities. This commit introduces the probe and initialization component of the driver. [1]: https://software.intel.com/en-us/download/intel-data-streaming-accelerator-preliminary-architecture-specification Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/157965023991.73301.6186843973135311580.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- MAINTAINERS | 8 ++++++++ drivers/dma/Kconfig | 13 +++++++++++++ drivers/dma/Makefile | 1 + 3 files changed, 22 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index a48b00bf7639..668003e83bda 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8283,6 +8283,14 @@ Q: https://patchwork.kernel.org/project/linux-dmaengine/list/ S: Supported F: drivers/dma/ioat* +INTEL IADX DRIVER +M: Dave Jiang +L: dmaengine@vger.kernel.org +S: Supported +F: drivers/dma/idxd/* +F: include/uapi/linux/idxd.h +F: include/linux/idxd.h + INTEL IDLE DRIVER M: Jacob Pan M: Len Brown diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 4a34feec93d7..c2c7e9635aa2 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -273,6 +273,19 @@ config INTEL_IDMA64 Enable DMA support for Intel Low Power Subsystem such as found on Intel Skylake PCH. +config INTEL_IDXD + tristate "Intel Data Accelerators support" + depends on PCI && X86_64 + select DMA_ENGINE + select SBITMAP + help + Enable support for the Intel(R) data accelerators present + in Intel Xeon CPU. + + Say Y if you have such a platform. + + If unsure, say N. + config INTEL_IOATDMA tristate "Intel I/OAT DMA support" depends on PCI && X86_64 diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index f5ce8665e944..ff064b29021e 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_IMX_DMA) += imx-dma.o obj-$(CONFIG_IMX_SDMA) += imx-sdma.o obj-$(CONFIG_INTEL_IDMA64) += idma64.o obj-$(CONFIG_INTEL_IOATDMA) += ioat/ +obj-$(CONFIG_INTEL_IDXD) += idxd/ obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o obj-$(CONFIG_INTEL_MIC_X100_DMA) += mic_x100_dma.o obj-$(CONFIG_K3_DMA) += k3dma.o -- Gitee From ea60880e53aff7ac2bf36646fd56ee68a277244b Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 21 Jan 2020 16:44:05 -0700 Subject: [PATCH 0513/2161] dmaengine: idxd: add configuration component of driver commit c52ca478233c172b2d322b5241d6279a8661cbba upstream. The device is left unconfigured when the driver is loaded. Various components are configured via the driver sysfs attributes. Once configuration is done, the device can be enabled by writing the device name to the bind attribute of the device driver sysfs. Disabling can be done similarly. Also the individual work queues can also be enabled and disabled through the bind/unbind attributes. A constructed hierarchy is created through the struct device framework in order to provide appropriate configuration points and device state and status. This hierarchy is presented off the virtual DSA bus. i.e. 
/sys/bus/dsa/... Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/157965024585.73301.6431413676230150589.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/Makefile | 2 + drivers/dma/idxd/device.c | 688 ++++++++++++++++ drivers/dma/idxd/idxd.h | 249 ++++++ drivers/dma/idxd/init.c | 493 ++++++++++++ drivers/dma/idxd/irq.c | 156 ++++ drivers/dma/idxd/registers.h | 336 ++++++++ drivers/dma/idxd/sysfs.c | 1452 ++++++++++++++++++++++++++++++++++ include/uapi/linux/idxd.h | 228 ++++++ 8 files changed, 3604 insertions(+) create mode 100644 drivers/dma/idxd/Makefile create mode 100644 drivers/dma/idxd/device.c create mode 100644 drivers/dma/idxd/idxd.h create mode 100644 drivers/dma/idxd/init.c create mode 100644 drivers/dma/idxd/irq.c create mode 100644 drivers/dma/idxd/registers.h create mode 100644 drivers/dma/idxd/sysfs.c create mode 100644 include/uapi/linux/idxd.h diff --git a/drivers/dma/idxd/Makefile b/drivers/dma/idxd/Makefile new file mode 100644 index 000000000000..a552560a03dc --- /dev/null +++ b/drivers/dma/idxd/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_INTEL_IDXD) += idxd.o +idxd-y := init.o irq.o device.o sysfs.o diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c new file mode 100644 index 000000000000..d626780caa53 --- /dev/null +++ b/drivers/dma/idxd/device.c @@ -0,0 +1,688 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */ +#include +#include +#include +#include +#include +#include +#include "idxd.h" +#include "registers.h" + +static int idxd_cmd_wait(struct idxd_device *idxd, u32 *status, int timeout); +static int idxd_cmd_send(struct idxd_device *idxd, int cmd_code, u32 operand); + +/* Interrupt control bits */ +int idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id) +{ + struct pci_dev *pdev = idxd->pdev; + int msixcnt = pci_msix_vec_count(pdev); + union msix_perm perm; + u32 offset; + + if (vec_id < 0 || vec_id >= msixcnt) + return -EINVAL; + + offset = idxd->msix_perm_offset + vec_id * 8; + perm.bits = ioread32(idxd->reg_base + offset); + perm.ignore = 1; + iowrite32(perm.bits, idxd->reg_base + offset); + + return 0; +} + +void idxd_mask_msix_vectors(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + int msixcnt = pci_msix_vec_count(pdev); + int i, rc; + + for (i = 0; i < msixcnt; i++) { + rc = idxd_mask_msix_vector(idxd, i); + if (rc < 0) + dev_warn(&pdev->dev, + "Failed disabling msix vec %d\n", i); + } +} + +int idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id) +{ + struct pci_dev *pdev = idxd->pdev; + int msixcnt = pci_msix_vec_count(pdev); + union msix_perm perm; + u32 offset; + + if (vec_id < 0 || vec_id >= msixcnt) + return -EINVAL; + + offset = idxd->msix_perm_offset + vec_id * 8; + perm.bits = ioread32(idxd->reg_base + offset); + perm.ignore = 0; + iowrite32(perm.bits, idxd->reg_base + offset); + + return 0; +} + +void idxd_unmask_error_interrupts(struct idxd_device *idxd) +{ + union genctrl_reg genctrl; + + genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET); + genctrl.softerr_int_en = 1; + iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET); +} + +void idxd_mask_error_interrupts(struct idxd_device *idxd) +{ + union genctrl_reg genctrl; + + genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET); + genctrl.softerr_int_en = 0; + iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET); +} + +static void free_hw_descs(struct idxd_wq *wq) +{ + int i; + 
+ for (i = 0; i < wq->num_descs; i++) + kfree(wq->hw_descs[i]); + + kfree(wq->hw_descs); +} + +static int alloc_hw_descs(struct idxd_wq *wq, int num) +{ + struct device *dev = &wq->idxd->pdev->dev; + int i; + int node = dev_to_node(dev); + + wq->hw_descs = kcalloc_node(num, sizeof(struct dsa_hw_desc *), + GFP_KERNEL, node); + if (!wq->hw_descs) + return -ENOMEM; + + for (i = 0; i < num; i++) { + wq->hw_descs[i] = kzalloc_node(sizeof(*wq->hw_descs[i]), + GFP_KERNEL, node); + if (!wq->hw_descs[i]) { + free_hw_descs(wq); + return -ENOMEM; + } + } + + return 0; +} + +static void free_descs(struct idxd_wq *wq) +{ + int i; + + for (i = 0; i < wq->num_descs; i++) + kfree(wq->descs[i]); + + kfree(wq->descs); +} + +static int alloc_descs(struct idxd_wq *wq, int num) +{ + struct device *dev = &wq->idxd->pdev->dev; + int i; + int node = dev_to_node(dev); + + wq->descs = kcalloc_node(num, sizeof(struct idxd_desc *), + GFP_KERNEL, node); + if (!wq->descs) + return -ENOMEM; + + for (i = 0; i < num; i++) { + wq->descs[i] = kzalloc_node(sizeof(*wq->descs[i]), + GFP_KERNEL, node); + if (!wq->descs[i]) { + free_descs(wq); + return -ENOMEM; + } + } + + return 0; +} + +/* WQ control bits */ +int idxd_wq_alloc_resources(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct idxd_group *group = wq->group; + struct device *dev = &idxd->pdev->dev; + int rc, num_descs, i; + + if (wq->type != IDXD_WQT_KERNEL) + return 0; + + num_descs = wq->size + + idxd->hw.gen_cap.max_descs_per_engine * group->num_engines; + wq->num_descs = num_descs; + + rc = alloc_hw_descs(wq, num_descs); + if (rc < 0) + return rc; + + wq->compls_size = num_descs * sizeof(struct dsa_completion_record); + wq->compls = dma_alloc_coherent(dev, wq->compls_size, + &wq->compls_addr, GFP_KERNEL); + if (!wq->compls) { + rc = -ENOMEM; + goto fail_alloc_compls; + } + + rc = alloc_descs(wq, num_descs); + if (rc < 0) + goto fail_alloc_descs; + + rc = sbitmap_init_node(&wq->sbmap, num_descs, -1, GFP_KERNEL, + dev_to_node(dev)); + if (rc < 0) + goto fail_sbitmap_init; + + for (i = 0; i < num_descs; i++) { + struct idxd_desc *desc = wq->descs[i]; + + desc->hw = wq->hw_descs[i]; + desc->completion = &wq->compls[i]; + desc->compl_dma = wq->compls_addr + + sizeof(struct dsa_completion_record) * i; + desc->id = i; + desc->wq = wq; + } + + return 0; + + fail_sbitmap_init: + free_descs(wq); + fail_alloc_descs: + dma_free_coherent(dev, wq->compls_size, wq->compls, wq->compls_addr); + fail_alloc_compls: + free_hw_descs(wq); + return rc; +} + +void idxd_wq_free_resources(struct idxd_wq *wq) +{ + struct device *dev = &wq->idxd->pdev->dev; + + if (wq->type != IDXD_WQT_KERNEL) + return; + + free_hw_descs(wq); + free_descs(wq); + dma_free_coherent(dev, wq->compls_size, wq->compls, wq->compls_addr); + sbitmap_free(&wq->sbmap); +} + +int idxd_wq_enable(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + u32 status; + int rc; + + lockdep_assert_held(&idxd->dev_lock); + + if (wq->state == IDXD_WQ_ENABLED) { + dev_dbg(dev, "WQ %d already enabled\n", wq->id); + return -ENXIO; + } + + rc = idxd_cmd_send(idxd, IDXD_CMD_ENABLE_WQ, wq->id); + if (rc < 0) + return rc; + rc = idxd_cmd_wait(idxd, &status, IDXD_REG_TIMEOUT); + if (rc < 0) + return rc; + + if (status != IDXD_CMDSTS_SUCCESS && + status != IDXD_CMDSTS_ERR_WQ_ENABLED) { + dev_dbg(dev, "WQ enable failed: %#x\n", status); + return -ENXIO; + } + + wq->state = IDXD_WQ_ENABLED; + dev_dbg(dev, "WQ %d enabled\n", wq->id); + return 0; +} + +int 
idxd_wq_disable(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + u32 status, operand; + int rc; + + lockdep_assert_held(&idxd->dev_lock); + dev_dbg(dev, "Disabling WQ %d\n", wq->id); + + if (wq->state != IDXD_WQ_ENABLED) { + dev_dbg(dev, "WQ %d in wrong state: %d\n", wq->id, wq->state); + return 0; + } + + operand = BIT(wq->id % 16) | ((wq->id / 16) << 16); + rc = idxd_cmd_send(idxd, IDXD_CMD_DISABLE_WQ, operand); + if (rc < 0) + return rc; + rc = idxd_cmd_wait(idxd, &status, IDXD_REG_TIMEOUT); + if (rc < 0) + return rc; + + if (status != IDXD_CMDSTS_SUCCESS) { + dev_dbg(dev, "WQ disable failed: %#x\n", status); + return -ENXIO; + } + + wq->state = IDXD_WQ_DISABLED; + dev_dbg(dev, "WQ %d disabled\n", wq->id); + return 0; +} + +int idxd_wq_map_portal(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct pci_dev *pdev = idxd->pdev; + struct device *dev = &pdev->dev; + resource_size_t start; + + start = pci_resource_start(pdev, IDXD_WQ_BAR); + start = start + wq->id * IDXD_PORTAL_SIZE; + + wq->dportal = devm_ioremap(dev, start, IDXD_PORTAL_SIZE); + if (!wq->dportal) + return -ENOMEM; + dev_dbg(dev, "wq %d portal mapped at %p\n", wq->id, wq->dportal); + + return 0; +} + +void idxd_wq_unmap_portal(struct idxd_wq *wq) +{ + struct device *dev = &wq->idxd->pdev->dev; + + devm_iounmap(dev, wq->dportal); +} + +/* Device control bits */ +static inline bool idxd_is_enabled(struct idxd_device *idxd) +{ + union gensts_reg gensts; + + gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET); + + if (gensts.state == IDXD_DEVICE_STATE_ENABLED) + return true; + return false; +} + +static int idxd_cmd_wait(struct idxd_device *idxd, u32 *status, int timeout) +{ + u32 sts, to = timeout; + + lockdep_assert_held(&idxd->dev_lock); + sts = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); + while (sts & IDXD_CMDSTS_ACTIVE && --to) { + cpu_relax(); + sts = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); + } + + if (to == 0 && sts & IDXD_CMDSTS_ACTIVE) { + dev_warn(&idxd->pdev->dev, "%s timed out!\n", __func__); + *status = 0; + return -EBUSY; + } + + *status = sts; + return 0; +} + +static int idxd_cmd_send(struct idxd_device *idxd, int cmd_code, u32 operand) +{ + union idxd_command_reg cmd; + int rc; + u32 status; + + lockdep_assert_held(&idxd->dev_lock); + rc = idxd_cmd_wait(idxd, &status, IDXD_REG_TIMEOUT); + if (rc < 0) + return rc; + + memset(&cmd, 0, sizeof(cmd)); + cmd.cmd = cmd_code; + cmd.operand = operand; + dev_dbg(&idxd->pdev->dev, "%s: sending cmd: %#x op: %#x\n", + __func__, cmd_code, operand); + iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET); + + return 0; +} + +int idxd_device_enable(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + int rc; + u32 status; + + lockdep_assert_held(&idxd->dev_lock); + if (idxd_is_enabled(idxd)) { + dev_dbg(dev, "Device already enabled\n"); + return -ENXIO; + } + + rc = idxd_cmd_send(idxd, IDXD_CMD_ENABLE_DEVICE, 0); + if (rc < 0) + return rc; + rc = idxd_cmd_wait(idxd, &status, IDXD_REG_TIMEOUT); + if (rc < 0) + return rc; + + /* If the command is successful or if the device was enabled */ + if (status != IDXD_CMDSTS_SUCCESS && + status != IDXD_CMDSTS_ERR_DEV_ENABLED) { + dev_dbg(dev, "%s: err_code: %#x\n", __func__, status); + return -ENXIO; + } + + idxd->state = IDXD_DEV_ENABLED; + return 0; +} + +int idxd_device_disable(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + int rc; + u32 status; + + lockdep_assert_held(&idxd->dev_lock); + if 
(!idxd_is_enabled(idxd)) { + dev_dbg(dev, "Device is not enabled\n"); + return 0; + } + + rc = idxd_cmd_send(idxd, IDXD_CMD_DISABLE_DEVICE, 0); + if (rc < 0) + return rc; + rc = idxd_cmd_wait(idxd, &status, IDXD_REG_TIMEOUT); + if (rc < 0) + return rc; + + /* If the command is successful or if the device was disabled */ + if (status != IDXD_CMDSTS_SUCCESS && + !(status & IDXD_CMDSTS_ERR_DIS_DEV_EN)) { + dev_dbg(dev, "%s: err_code: %#x\n", __func__, status); + rc = -ENXIO; + return rc; + } + + idxd->state = IDXD_DEV_CONF_READY; + return 0; +} + +int __idxd_device_reset(struct idxd_device *idxd) +{ + u32 status; + int rc; + + rc = idxd_cmd_send(idxd, IDXD_CMD_RESET_DEVICE, 0); + if (rc < 0) + return rc; + rc = idxd_cmd_wait(idxd, &status, IDXD_REG_TIMEOUT); + if (rc < 0) + return rc; + + return 0; +} + +int idxd_device_reset(struct idxd_device *idxd) +{ + unsigned long flags; + int rc; + + spin_lock_irqsave(&idxd->dev_lock, flags); + rc = __idxd_device_reset(idxd); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + return rc; +} + +/* Device configuration bits */ +static void idxd_group_config_write(struct idxd_group *group) +{ + struct idxd_device *idxd = group->idxd; + struct device *dev = &idxd->pdev->dev; + int i; + u32 grpcfg_offset; + + dev_dbg(dev, "Writing group %d cfg registers\n", group->id); + + /* setup GRPWQCFG */ + for (i = 0; i < 4; i++) { + grpcfg_offset = idxd->grpcfg_offset + + group->id * 64 + i * sizeof(u64); + iowrite64(group->grpcfg.wqs[i], + idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPCFG wq[%d:%d: %#x]: %#llx\n", + group->id, i, grpcfg_offset, + ioread64(idxd->reg_base + grpcfg_offset)); + } + + /* setup GRPENGCFG */ + grpcfg_offset = idxd->grpcfg_offset + group->id * 64 + 32; + iowrite64(group->grpcfg.engines, idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPCFG engs[%d: %#x]: %#llx\n", group->id, + grpcfg_offset, ioread64(idxd->reg_base + grpcfg_offset)); + + /* setup GRPFLAGS */ + grpcfg_offset = idxd->grpcfg_offset + group->id * 64 + 40; + iowrite32(group->grpcfg.flags.bits, idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPFLAGS flags[%d: %#x]: %#x\n", + group->id, grpcfg_offset, + ioread32(idxd->reg_base + grpcfg_offset)); +} + +static int idxd_groups_config_write(struct idxd_device *idxd) + +{ + union gencfg_reg reg; + int i; + struct device *dev = &idxd->pdev->dev; + + /* Setup bandwidth token limit */ + if (idxd->token_limit) { + reg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET); + reg.token_limit = idxd->token_limit; + iowrite32(reg.bits, idxd->reg_base + IDXD_GENCFG_OFFSET); + } + + dev_dbg(dev, "GENCFG(%#x): %#x\n", IDXD_GENCFG_OFFSET, + ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET)); + + for (i = 0; i < idxd->max_groups; i++) { + struct idxd_group *group = &idxd->groups[i]; + + idxd_group_config_write(group); + } + + return 0; +} + +static int idxd_wq_config_write(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + u32 wq_offset; + int i; + + if (!wq->group) + return 0; + + memset(&wq->wqcfg, 0, sizeof(union wqcfg)); + + /* byte 0-3 */ + wq->wqcfg.wq_size = wq->size; + + if (wq->size == 0) { + dev_warn(dev, "Incorrect work queue size: 0\n"); + return -EINVAL; + } + + /* bytes 4-7 */ + wq->wqcfg.wq_thresh = wq->threshold; + + /* byte 8-11 */ + wq->wqcfg.priv = 1; /* kernel, therefore priv */ + wq->wqcfg.mode = 1; + + wq->wqcfg.priority = wq->priority; + + /* bytes 12-15 */ + wq->wqcfg.max_xfer_shift = idxd->hw.gen_cap.max_xfer_shift; + wq->wqcfg.max_batch_shift = 
idxd->hw.gen_cap.max_batch_shift; + + dev_dbg(dev, "WQ %d CFGs\n", wq->id); + for (i = 0; i < 8; i++) { + wq_offset = idxd->wqcfg_offset + wq->id * 32 + i * sizeof(u32); + iowrite32(wq->wqcfg.bits[i], idxd->reg_base + wq_offset); + dev_dbg(dev, "WQ[%d][%d][%#x]: %#x\n", + wq->id, i, wq_offset, + ioread32(idxd->reg_base + wq_offset)); + } + + return 0; +} + +static int idxd_wqs_config_write(struct idxd_device *idxd) +{ + int i, rc; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = &idxd->wqs[i]; + + rc = idxd_wq_config_write(wq); + if (rc < 0) + return rc; + } + + return 0; +} + +static void idxd_group_flags_setup(struct idxd_device *idxd) +{ + int i; + + /* TC-A 0 and TC-B 1 should be defaults */ + for (i = 0; i < idxd->max_groups; i++) { + struct idxd_group *group = &idxd->groups[i]; + + if (group->tc_a == -1) + group->grpcfg.flags.tc_a = 0; + else + group->grpcfg.flags.tc_a = group->tc_a; + if (group->tc_b == -1) + group->grpcfg.flags.tc_b = 1; + else + group->grpcfg.flags.tc_b = group->tc_b; + group->grpcfg.flags.use_token_limit = group->use_token_limit; + group->grpcfg.flags.tokens_reserved = group->tokens_reserved; + if (group->tokens_allowed) + group->grpcfg.flags.tokens_allowed = + group->tokens_allowed; + else + group->grpcfg.flags.tokens_allowed = idxd->max_tokens; + } +} + +static int idxd_engines_setup(struct idxd_device *idxd) +{ + int i, engines = 0; + struct idxd_engine *eng; + struct idxd_group *group; + + for (i = 0; i < idxd->max_groups; i++) { + group = &idxd->groups[i]; + group->grpcfg.engines = 0; + } + + for (i = 0; i < idxd->max_engines; i++) { + eng = &idxd->engines[i]; + group = eng->group; + + if (!group) + continue; + + group->grpcfg.engines |= BIT(eng->id); + engines++; + } + + if (!engines) + return -EINVAL; + + return 0; +} + +static int idxd_wqs_setup(struct idxd_device *idxd) +{ + struct idxd_wq *wq; + struct idxd_group *group; + int i, j, configured = 0; + struct device *dev = &idxd->pdev->dev; + + for (i = 0; i < idxd->max_groups; i++) { + group = &idxd->groups[i]; + for (j = 0; j < 4; j++) + group->grpcfg.wqs[j] = 0; + } + + for (i = 0; i < idxd->max_wqs; i++) { + wq = &idxd->wqs[i]; + group = wq->group; + + if (!wq->group) + continue; + if (!wq->size) + continue; + + if (!wq_dedicated(wq)) { + dev_warn(dev, "No shared workqueue support.\n"); + return -EINVAL; + } + + group->grpcfg.wqs[wq->id / 64] |= BIT(wq->id % 64); + configured++; + } + + if (configured == 0) + return -EINVAL; + + return 0; +} + +int idxd_device_config(struct idxd_device *idxd) +{ + int rc; + + lockdep_assert_held(&idxd->dev_lock); + rc = idxd_wqs_setup(idxd); + if (rc < 0) + return rc; + + rc = idxd_engines_setup(idxd); + if (rc < 0) + return rc; + + idxd_group_flags_setup(idxd); + + rc = idxd_wqs_config_write(idxd); + if (rc < 0) + return rc; + + rc = idxd_groups_config_write(idxd); + if (rc < 0) + return rc; + + return 0; +} diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h new file mode 100644 index 000000000000..909926aefd3e --- /dev/null +++ b/drivers/dma/idxd/idxd.h @@ -0,0 +1,249 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. 
*/ +#ifndef _IDXD_H_ +#define _IDXD_H_ + +#include +#include +#include +#include "registers.h" + +#define IDXD_DRIVER_VERSION "1.00" + +extern struct kmem_cache *idxd_desc_pool; + +#define IDXD_REG_TIMEOUT 50 +#define IDXD_DRAIN_TIMEOUT 5000 + +enum idxd_type { + IDXD_TYPE_UNKNOWN = -1, + IDXD_TYPE_DSA = 0, + IDXD_TYPE_MAX +}; + +#define IDXD_NAME_SIZE 128 + +struct idxd_device_driver { + struct device_driver drv; +}; + +struct idxd_irq_entry { + struct idxd_device *idxd; + int id; + struct llist_head pending_llist; + struct list_head work_list; +}; + +struct idxd_group { + struct device conf_dev; + struct idxd_device *idxd; + struct grpcfg grpcfg; + int id; + int num_engines; + int num_wqs; + bool use_token_limit; + u8 tokens_allowed; + u8 tokens_reserved; + int tc_a; + int tc_b; +}; + +#define IDXD_MAX_PRIORITY 0xf + +enum idxd_wq_state { + IDXD_WQ_DISABLED = 0, + IDXD_WQ_ENABLED, +}; + +enum idxd_wq_flag { + WQ_FLAG_DEDICATED = 0, +}; + +enum idxd_wq_type { + IDXD_WQT_NONE = 0, + IDXD_WQT_KERNEL, +}; + +#define IDXD_ALLOCATED_BATCH_SIZE 128U +#define WQ_NAME_SIZE 1024 +#define WQ_TYPE_SIZE 10 + +struct idxd_wq { + void __iomem *dportal; + struct device conf_dev; + struct idxd_device *idxd; + int id; + enum idxd_wq_type type; + struct idxd_group *group; + int client_count; + struct mutex wq_lock; /* mutex for workqueue */ + u32 size; + u32 threshold; + u32 priority; + enum idxd_wq_state state; + unsigned long flags; + union wqcfg wqcfg; + atomic_t dq_count; /* dedicated queue flow control */ + u32 vec_ptr; /* interrupt steering */ + struct dsa_hw_desc **hw_descs; + int num_descs; + struct dsa_completion_record *compls; + dma_addr_t compls_addr; + int compls_size; + struct idxd_desc **descs; + struct sbitmap sbmap; + struct percpu_rw_semaphore submit_lock; + wait_queue_head_t submit_waitq; + char name[WQ_NAME_SIZE + 1]; +}; + +struct idxd_engine { + struct device conf_dev; + int id; + struct idxd_group *group; + struct idxd_device *idxd; +}; + +/* shadow registers */ +struct idxd_hw { + u32 version; + union gen_cap_reg gen_cap; + union wq_cap_reg wq_cap; + union group_cap_reg group_cap; + union engine_cap_reg engine_cap; + struct opcap opcap; +}; + +enum idxd_device_state { + IDXD_DEV_HALTED = -1, + IDXD_DEV_DISABLED = 0, + IDXD_DEV_CONF_READY, + IDXD_DEV_ENABLED, +}; + +enum idxd_device_flag { + IDXD_FLAG_CONFIGURABLE = 0, +}; + +struct idxd_device { + enum idxd_type type; + struct device conf_dev; + struct list_head list; + struct idxd_hw hw; + enum idxd_device_state state; + unsigned long flags; + int id; + + struct pci_dev *pdev; + void __iomem *reg_base; + + spinlock_t dev_lock; /* spinlock for device */ + struct idxd_group *groups; + struct idxd_wq *wqs; + struct idxd_engine *engines; + + int num_groups; + + u32 msix_perm_offset; + u32 wqcfg_offset; + u32 grpcfg_offset; + u32 perfmon_offset; + + u64 max_xfer_bytes; + u32 max_batch_size; + int max_groups; + int max_engines; + int max_tokens; + int max_wqs; + int max_wq_size; + int token_limit; + int nr_tokens; /* non-reserved tokens */ + + union sw_err_reg sw_err; + + struct msix_entry *msix_entries; + int num_wq_irqs; + struct idxd_irq_entry *irq_entries; +}; + +/* IDXD software descriptor */ +struct idxd_desc { + struct dsa_hw_desc *hw; + dma_addr_t desc_dma; + struct dsa_completion_record *completion; + dma_addr_t compl_dma; + struct llist_node llnode; + struct list_head list; + int id; + struct idxd_wq *wq; +}; + +#define confdev_to_idxd(dev) container_of(dev, struct idxd_device, conf_dev) +#define confdev_to_wq(dev) 
container_of(dev, struct idxd_wq, conf_dev) + +static inline bool wq_dedicated(struct idxd_wq *wq) +{ + return test_bit(WQ_FLAG_DEDICATED, &wq->flags); +} + +static inline void idxd_set_type(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + + if (pdev->device == PCI_DEVICE_ID_INTEL_DSA_SPR0) + idxd->type = IDXD_TYPE_DSA; + else + idxd->type = IDXD_TYPE_UNKNOWN; +} + +static inline void idxd_wq_get(struct idxd_wq *wq) +{ + wq->client_count++; +} + +static inline void idxd_wq_put(struct idxd_wq *wq) +{ + wq->client_count--; +} + +static inline int idxd_wq_refcount(struct idxd_wq *wq) +{ + return wq->client_count; +}; + +const char *idxd_get_dev_name(struct idxd_device *idxd); +int idxd_register_bus_type(void); +void idxd_unregister_bus_type(void); +int idxd_setup_sysfs(struct idxd_device *idxd); +void idxd_cleanup_sysfs(struct idxd_device *idxd); +int idxd_register_driver(void); +void idxd_unregister_driver(void); + +/* device interrupt control */ +irqreturn_t idxd_irq_handler(int vec, void *data); +irqreturn_t idxd_misc_thread(int vec, void *data); +irqreturn_t idxd_wq_thread(int irq, void *data); +void idxd_mask_error_interrupts(struct idxd_device *idxd); +void idxd_unmask_error_interrupts(struct idxd_device *idxd); +void idxd_mask_msix_vectors(struct idxd_device *idxd); +int idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id); +int idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id); + +/* device control */ +int idxd_device_enable(struct idxd_device *idxd); +int idxd_device_disable(struct idxd_device *idxd); +int idxd_device_reset(struct idxd_device *idxd); +int __idxd_device_reset(struct idxd_device *idxd); +void idxd_device_cleanup(struct idxd_device *idxd); +int idxd_device_config(struct idxd_device *idxd); +void idxd_device_wqs_clear_state(struct idxd_device *idxd); + +/* work queue control */ +int idxd_wq_alloc_resources(struct idxd_wq *wq); +void idxd_wq_free_resources(struct idxd_wq *wq); +int idxd_wq_enable(struct idxd_wq *wq); +int idxd_wq_disable(struct idxd_wq *wq); +int idxd_wq_map_portal(struct idxd_wq *wq); +void idxd_wq_unmap_portal(struct idxd_wq *wq); + +#endif diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c new file mode 100644 index 000000000000..229386464923 --- /dev/null +++ b/drivers/dma/idxd/init.c @@ -0,0 +1,493 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. 
*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "registers.h" +#include "idxd.h" + +MODULE_VERSION(IDXD_DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); + +#define DRV_NAME "idxd" + +static struct idr idxd_idrs[IDXD_TYPE_MAX]; +static struct mutex idxd_idr_lock; + +static struct pci_device_id idxd_pci_tbl[] = { + /* DSA ver 1.0 platforms */ + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_DSA_SPR0) }, + { 0, } +}; +MODULE_DEVICE_TABLE(pci, idxd_pci_tbl); + +static char *idxd_name[] = { + "dsa", +}; + +const char *idxd_get_dev_name(struct idxd_device *idxd) +{ + return idxd_name[idxd->type]; +} + +static int idxd_setup_interrupts(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + struct device *dev = &pdev->dev; + struct msix_entry *msix; + struct idxd_irq_entry *irq_entry; + int i, msixcnt; + int rc = 0; + + msixcnt = pci_msix_vec_count(pdev); + if (msixcnt < 0) { + dev_err(dev, "Not MSI-X interrupt capable.\n"); + goto err_no_irq; + } + + idxd->msix_entries = devm_kzalloc(dev, sizeof(struct msix_entry) * + msixcnt, GFP_KERNEL); + if (!idxd->msix_entries) { + rc = -ENOMEM; + goto err_no_irq; + } + + for (i = 0; i < msixcnt; i++) + idxd->msix_entries[i].entry = i; + + rc = pci_enable_msix_exact(pdev, idxd->msix_entries, msixcnt); + if (rc) { + dev_err(dev, "Failed enabling %d MSIX entries.\n", msixcnt); + goto err_no_irq; + } + dev_dbg(dev, "Enabled %d msix vectors\n", msixcnt); + + /* + * We implement 1 completion list per MSI-X entry except for + * entry 0, which is for errors and others. + */ + idxd->irq_entries = devm_kcalloc(dev, msixcnt, + sizeof(struct idxd_irq_entry), + GFP_KERNEL); + if (!idxd->irq_entries) { + rc = -ENOMEM; + goto err_no_irq; + } + + for (i = 0; i < msixcnt; i++) { + idxd->irq_entries[i].id = i; + idxd->irq_entries[i].idxd = idxd; + } + + msix = &idxd->msix_entries[0]; + irq_entry = &idxd->irq_entries[0]; + rc = devm_request_threaded_irq(dev, msix->vector, idxd_irq_handler, + idxd_misc_thread, 0, "idxd-misc", + irq_entry); + if (rc < 0) { + dev_err(dev, "Failed to allocate misc interrupt.\n"); + goto err_no_irq; + } + + dev_dbg(dev, "Allocated idxd-misc handler on msix vector %d\n", + msix->vector); + + /* first MSI-X entry is not for wq interrupts */ + idxd->num_wq_irqs = msixcnt - 1; + + for (i = 1; i < msixcnt; i++) { + msix = &idxd->msix_entries[i]; + irq_entry = &idxd->irq_entries[i]; + + init_llist_head(&idxd->irq_entries[i].pending_llist); + INIT_LIST_HEAD(&idxd->irq_entries[i].work_list); + rc = devm_request_threaded_irq(dev, msix->vector, + idxd_irq_handler, + idxd_wq_thread, 0, + "idxd-portal", irq_entry); + if (rc < 0) { + dev_err(dev, "Failed to allocate irq %d.\n", + msix->vector); + goto err_no_irq; + } + dev_dbg(dev, "Allocated idxd-msix %d for vector %d\n", + i, msix->vector); + } + + idxd_unmask_error_interrupts(idxd); + + return 0; + + err_no_irq: + /* Disable error interrupt generation */ + idxd_mask_error_interrupts(idxd); + pci_disable_msix(pdev); + dev_err(dev, "No usable interrupts\n"); + return rc; +} + +static void idxd_wqs_free_lock(struct idxd_device *idxd) +{ + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = &idxd->wqs[i]; + + percpu_free_rwsem(&wq->submit_lock); + } +} + +static int idxd_setup_internals(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + int i; + + idxd->groups = devm_kcalloc(dev, idxd->max_groups, + sizeof(struct 
idxd_group), GFP_KERNEL); + if (!idxd->groups) + return -ENOMEM; + + for (i = 0; i < idxd->max_groups; i++) { + idxd->groups[i].idxd = idxd; + idxd->groups[i].id = i; + idxd->groups[i].tc_a = -1; + idxd->groups[i].tc_b = -1; + } + + idxd->wqs = devm_kcalloc(dev, idxd->max_wqs, sizeof(struct idxd_wq), + GFP_KERNEL); + if (!idxd->wqs) + return -ENOMEM; + + idxd->engines = devm_kcalloc(dev, idxd->max_engines, + sizeof(struct idxd_engine), GFP_KERNEL); + if (!idxd->engines) + return -ENOMEM; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = &idxd->wqs[i]; + int rc; + + wq->id = i; + wq->idxd = idxd; + mutex_init(&wq->wq_lock); + atomic_set(&wq->dq_count, 0); + init_waitqueue_head(&wq->submit_waitq); + rc = percpu_init_rwsem(&wq->submit_lock); + if (rc < 0) { + idxd_wqs_free_lock(idxd); + return rc; + } + } + + for (i = 0; i < idxd->max_engines; i++) { + idxd->engines[i].idxd = idxd; + idxd->engines[i].id = i; + } + + return 0; +} + +static void idxd_read_table_offsets(struct idxd_device *idxd) +{ + union offsets_reg offsets; + struct device *dev = &idxd->pdev->dev; + + offsets.bits[0] = ioread64(idxd->reg_base + IDXD_TABLE_OFFSET); + offsets.bits[1] = ioread64(idxd->reg_base + IDXD_TABLE_OFFSET + + sizeof(u64)); + idxd->grpcfg_offset = offsets.grpcfg * 0x100; + dev_dbg(dev, "IDXD Group Config Offset: %#x\n", idxd->grpcfg_offset); + idxd->wqcfg_offset = offsets.wqcfg * 0x100; + dev_dbg(dev, "IDXD Work Queue Config Offset: %#x\n", + idxd->wqcfg_offset); + idxd->msix_perm_offset = offsets.msix_perm * 0x100; + dev_dbg(dev, "IDXD MSIX Permission Offset: %#x\n", + idxd->msix_perm_offset); + idxd->perfmon_offset = offsets.perfmon * 0x100; + dev_dbg(dev, "IDXD Perfmon Offset: %#x\n", idxd->perfmon_offset); +} + +static void idxd_read_caps(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + int i; + + /* reading generic capabilities */ + idxd->hw.gen_cap.bits = ioread64(idxd->reg_base + IDXD_GENCAP_OFFSET); + dev_dbg(dev, "gen_cap: %#llx\n", idxd->hw.gen_cap.bits); + idxd->max_xfer_bytes = 1ULL << idxd->hw.gen_cap.max_xfer_shift; + dev_dbg(dev, "max xfer size: %llu bytes\n", idxd->max_xfer_bytes); + idxd->max_batch_size = 1U << idxd->hw.gen_cap.max_batch_shift; + dev_dbg(dev, "max batch size: %u\n", idxd->max_batch_size); + if (idxd->hw.gen_cap.config_en) + set_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags); + + /* reading group capabilities */ + idxd->hw.group_cap.bits = + ioread64(idxd->reg_base + IDXD_GRPCAP_OFFSET); + dev_dbg(dev, "group_cap: %#llx\n", idxd->hw.group_cap.bits); + idxd->max_groups = idxd->hw.group_cap.num_groups; + dev_dbg(dev, "max groups: %u\n", idxd->max_groups); + idxd->max_tokens = idxd->hw.group_cap.total_tokens; + dev_dbg(dev, "max tokens: %u\n", idxd->max_tokens); + idxd->nr_tokens = idxd->max_tokens; + + /* read engine capabilities */ + idxd->hw.engine_cap.bits = + ioread64(idxd->reg_base + IDXD_ENGCAP_OFFSET); + dev_dbg(dev, "engine_cap: %#llx\n", idxd->hw.engine_cap.bits); + idxd->max_engines = idxd->hw.engine_cap.num_engines; + dev_dbg(dev, "max engines: %u\n", idxd->max_engines); + + /* read workqueue capabilities */ + idxd->hw.wq_cap.bits = ioread64(idxd->reg_base + IDXD_WQCAP_OFFSET); + dev_dbg(dev, "wq_cap: %#llx\n", idxd->hw.wq_cap.bits); + idxd->max_wq_size = idxd->hw.wq_cap.total_wq_size; + dev_dbg(dev, "total workqueue size: %u\n", idxd->max_wq_size); + idxd->max_wqs = idxd->hw.wq_cap.num_wqs; + dev_dbg(dev, "max workqueues: %u\n", idxd->max_wqs); + + /* reading operation capabilities */ + for (i = 0; i < 4; i++) { + 
idxd->hw.opcap.bits[i] = ioread64(idxd->reg_base + + IDXD_OPCAP_OFFSET + i * sizeof(u64)); + dev_dbg(dev, "opcap[%d]: %#llx\n", i, idxd->hw.opcap.bits[i]); + } +} + +static struct idxd_device *idxd_alloc(struct pci_dev *pdev, + void __iomem * const *iomap) +{ + struct device *dev = &pdev->dev; + struct idxd_device *idxd; + + idxd = devm_kzalloc(dev, sizeof(struct idxd_device), GFP_KERNEL); + if (!idxd) + return NULL; + + idxd->pdev = pdev; + idxd->reg_base = iomap[IDXD_MMIO_BAR]; + spin_lock_init(&idxd->dev_lock); + + return idxd; +} + +static int idxd_probe(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + struct device *dev = &pdev->dev; + int rc; + + dev_dbg(dev, "%s entered and resetting device\n", __func__); + rc = idxd_device_reset(idxd); + if (rc < 0) + return rc; + dev_dbg(dev, "IDXD reset complete\n"); + + idxd_read_caps(idxd); + idxd_read_table_offsets(idxd); + + rc = idxd_setup_internals(idxd); + if (rc) + goto err_setup; + + rc = idxd_setup_interrupts(idxd); + if (rc) + goto err_setup; + + dev_dbg(dev, "IDXD interrupt setup complete.\n"); + + mutex_lock(&idxd_idr_lock); + idxd->id = idr_alloc(&idxd_idrs[idxd->type], idxd, 0, 0, GFP_KERNEL); + mutex_unlock(&idxd_idr_lock); + if (idxd->id < 0) { + rc = -ENOMEM; + goto err_idr_fail; + } + + dev_dbg(dev, "IDXD device %d probed successfully\n", idxd->id); + return 0; + + err_idr_fail: + idxd_mask_error_interrupts(idxd); + idxd_mask_msix_vectors(idxd); + err_setup: + return rc; +} + +static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + void __iomem * const *iomap; + struct device *dev = &pdev->dev; + struct idxd_device *idxd; + int rc; + unsigned int mask; + + rc = pcim_enable_device(pdev); + if (rc) + return rc; + + dev_dbg(dev, "Mapping BARs\n"); + mask = (1 << IDXD_MMIO_BAR); + rc = pcim_iomap_regions(pdev, mask, DRV_NAME); + if (rc) + return rc; + + iomap = pcim_iomap_table(pdev); + if (!iomap) + return -ENOMEM; + + dev_dbg(dev, "Set DMA masks\n"); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (rc) + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (rc) + return rc; + + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (rc) + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (rc) + return rc; + + dev_dbg(dev, "Alloc IDXD context\n"); + idxd = idxd_alloc(pdev, iomap); + if (!idxd) + return -ENOMEM; + + idxd_set_type(idxd); + + dev_dbg(dev, "Set PCI master\n"); + pci_set_master(pdev); + pci_set_drvdata(pdev, idxd); + + idxd->hw.version = ioread32(idxd->reg_base + IDXD_VER_OFFSET); + rc = idxd_probe(idxd); + if (rc) { + dev_err(dev, "Intel(R) IDXD DMA Engine init failed\n"); + return -ENODEV; + } + + rc = idxd_setup_sysfs(idxd); + if (rc) { + dev_err(dev, "IDXD sysfs setup failed\n"); + return -ENODEV; + } + + idxd->state = IDXD_DEV_CONF_READY; + + dev_info(&pdev->dev, "Intel(R) Accelerator Device (v%x)\n", + idxd->hw.version); + + return 0; +} + +static void idxd_shutdown(struct pci_dev *pdev) +{ + struct idxd_device *idxd = pci_get_drvdata(pdev); + int rc, i; + struct idxd_irq_entry *irq_entry; + int msixcnt = pci_msix_vec_count(pdev); + unsigned long flags; + + spin_lock_irqsave(&idxd->dev_lock, flags); + rc = idxd_device_disable(idxd); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + if (rc) + dev_err(&pdev->dev, "Disabling device failed\n"); + + dev_dbg(&pdev->dev, "%s called\n", __func__); + idxd_mask_msix_vectors(idxd); + idxd_mask_error_interrupts(idxd); + + for (i = 0; i < msixcnt; i++) { + irq_entry = &idxd->irq_entries[i]; + 
synchronize_irq(idxd->msix_entries[i].vector); + if (i == 0) + continue; + } +} + +static void idxd_remove(struct pci_dev *pdev) +{ + struct idxd_device *idxd = pci_get_drvdata(pdev); + + dev_dbg(&pdev->dev, "%s called\n", __func__); + idxd_cleanup_sysfs(idxd); + idxd_shutdown(pdev); + idxd_wqs_free_lock(idxd); + mutex_lock(&idxd_idr_lock); + idr_remove(&idxd_idrs[idxd->type], idxd->id); + mutex_unlock(&idxd_idr_lock); +} + +static struct pci_driver idxd_pci_driver = { + .name = DRV_NAME, + .id_table = idxd_pci_tbl, + .probe = idxd_pci_probe, + .remove = idxd_remove, + .shutdown = idxd_shutdown, +}; + +static int __init idxd_init_module(void) +{ + int err, i; + + /* + * If the CPU does not support write512, there's no point in + * enumerating the device. We can not utilize it. + */ + if (!boot_cpu_has(X86_FEATURE_MOVDIR64B)) { + pr_warn("idxd driver failed to load without MOVDIR64B.\n"); + return -ENODEV; + } + + pr_info("%s: Intel(R) Accelerator Devices Driver %s\n", + DRV_NAME, IDXD_DRIVER_VERSION); + + mutex_init(&idxd_idr_lock); + for (i = 0; i < IDXD_TYPE_MAX; i++) + idr_init(&idxd_idrs[i]); + + err = idxd_register_bus_type(); + if (err < 0) + return err; + + err = idxd_register_driver(); + if (err < 0) + goto err_idxd_driver_register; + + err = pci_register_driver(&idxd_pci_driver); + if (err) + goto err_pci_register; + + return 0; + +err_pci_register: + idxd_unregister_driver(); +err_idxd_driver_register: + idxd_unregister_bus_type(); + return err; +} +module_init(idxd_init_module); + +static void __exit idxd_exit_module(void) +{ + pci_unregister_driver(&idxd_pci_driver); + idxd_unregister_bus_type(); +} +module_exit(idxd_exit_module); diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c new file mode 100644 index 000000000000..de4b80973c2f --- /dev/null +++ b/drivers/dma/idxd/irq.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. 
*/ +#include +#include +#include +#include +#include +#include +#include "idxd.h" +#include "registers.h" + +void idxd_device_wqs_clear_state(struct idxd_device *idxd) +{ + int i; + + lockdep_assert_held(&idxd->dev_lock); + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = &idxd->wqs[i]; + + wq->state = IDXD_WQ_DISABLED; + } +} + +static int idxd_restart(struct idxd_device *idxd) +{ + int i, rc; + + lockdep_assert_held(&idxd->dev_lock); + + rc = __idxd_device_reset(idxd); + if (rc < 0) + goto out; + + rc = idxd_device_config(idxd); + if (rc < 0) + goto out; + + rc = idxd_device_enable(idxd); + if (rc < 0) + goto out; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = &idxd->wqs[i]; + + if (wq->state == IDXD_WQ_ENABLED) { + rc = idxd_wq_enable(wq); + if (rc < 0) { + dev_warn(&idxd->pdev->dev, + "Unable to re-enable wq %s\n", + dev_name(&wq->conf_dev)); + } + } + } + + return 0; + + out: + idxd_device_wqs_clear_state(idxd); + idxd->state = IDXD_DEV_HALTED; + return rc; +} + +irqreturn_t idxd_irq_handler(int vec, void *data) +{ + struct idxd_irq_entry *irq_entry = data; + struct idxd_device *idxd = irq_entry->idxd; + + idxd_mask_msix_vector(idxd, irq_entry->id); + return IRQ_WAKE_THREAD; +} + +irqreturn_t idxd_misc_thread(int vec, void *data) +{ + struct idxd_irq_entry *irq_entry = data; + struct idxd_device *idxd = irq_entry->idxd; + struct device *dev = &idxd->pdev->dev; + union gensts_reg gensts; + u32 cause, val = 0; + int i, rc; + bool err = false; + + cause = ioread32(idxd->reg_base + IDXD_INTCAUSE_OFFSET); + + if (cause & IDXD_INTC_ERR) { + spin_lock_bh(&idxd->dev_lock); + for (i = 0; i < 4; i++) + idxd->sw_err.bits[i] = ioread64(idxd->reg_base + + IDXD_SWERR_OFFSET + i * sizeof(u64)); + iowrite64(IDXD_SWERR_ACK, idxd->reg_base + IDXD_SWERR_OFFSET); + spin_unlock_bh(&idxd->dev_lock); + val |= IDXD_INTC_ERR; + + for (i = 0; i < 4; i++) + dev_warn(dev, "err[%d]: %#16.16llx\n", + i, idxd->sw_err.bits[i]); + err = true; + } + + if (cause & IDXD_INTC_CMD) { + /* Driver does use command interrupts */ + val |= IDXD_INTC_CMD; + } + + if (cause & IDXD_INTC_OCCUPY) { + /* Driver does not utilize occupancy interrupt */ + val |= IDXD_INTC_OCCUPY; + } + + if (cause & IDXD_INTC_PERFMON_OVFL) { + /* + * Driver does not utilize perfmon counter overflow interrupt + * yet. + */ + val |= IDXD_INTC_PERFMON_OVFL; + } + + val ^= cause; + if (val) + dev_warn_once(dev, "Unexpected interrupt cause bits set: %#x\n", + val); + + iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET); + if (!err) + return IRQ_HANDLED; + + gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET); + if (gensts.state == IDXD_DEVICE_STATE_HALT) { + spin_lock_bh(&idxd->dev_lock); + if (gensts.reset_type == IDXD_DEVICE_RESET_SOFTWARE) { + rc = idxd_restart(idxd); + if (rc < 0) + dev_err(&idxd->pdev->dev, + "idxd restart failed, device halt."); + } else { + idxd_device_wqs_clear_state(idxd); + idxd->state = IDXD_DEV_HALTED; + dev_err(&idxd->pdev->dev, + "idxd halted, need %s.\n", + gensts.reset_type == IDXD_DEVICE_RESET_FLR ? 
+ "FLR" : "system reset"); + } + spin_unlock_bh(&idxd->dev_lock); + } + + idxd_unmask_msix_vector(idxd, irq_entry->id); + return IRQ_HANDLED; +} + +irqreturn_t idxd_wq_thread(int irq, void *data) +{ + struct idxd_irq_entry *irq_entry = data; + + idxd_unmask_msix_vector(irq_entry->idxd, irq_entry->id); + + return IRQ_HANDLED; +} diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h new file mode 100644 index 000000000000..a39e7ae6b3d9 --- /dev/null +++ b/drivers/dma/idxd/registers.h @@ -0,0 +1,336 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */ +#ifndef _IDXD_REGISTERS_H_ +#define _IDXD_REGISTERS_H_ + +/* PCI Config */ +#define PCI_DEVICE_ID_INTEL_DSA_SPR0 0x0b25 + +#define IDXD_MMIO_BAR 0 +#define IDXD_WQ_BAR 2 +#define IDXD_PORTAL_SIZE 0x4000 + +/* MMIO Device BAR0 Registers */ +#define IDXD_VER_OFFSET 0x00 +#define IDXD_VER_MAJOR_MASK 0xf0 +#define IDXD_VER_MINOR_MASK 0x0f +#define GET_IDXD_VER_MAJOR(x) (((x) & IDXD_VER_MAJOR_MASK) >> 4) +#define GET_IDXD_VER_MINOR(x) ((x) & IDXD_VER_MINOR_MASK) + +union gen_cap_reg { + struct { + u64 block_on_fault:1; + u64 overlap_copy:1; + u64 cache_control_mem:1; + u64 cache_control_cache:1; + u64 rsvd:3; + u64 int_handle_req:1; + u64 dest_readback:1; + u64 drain_readback:1; + u64 rsvd2:6; + u64 max_xfer_shift:5; + u64 max_batch_shift:4; + u64 max_ims_mult:6; + u64 config_en:1; + u64 max_descs_per_engine:8; + u64 rsvd3:24; + }; + u64 bits; +} __packed; +#define IDXD_GENCAP_OFFSET 0x10 + +union wq_cap_reg { + struct { + u64 total_wq_size:16; + u64 num_wqs:8; + u64 rsvd:24; + u64 shared_mode:1; + u64 dedicated_mode:1; + u64 rsvd2:1; + u64 priority:1; + u64 occupancy:1; + u64 occupancy_int:1; + u64 rsvd3:10; + }; + u64 bits; +} __packed; +#define IDXD_WQCAP_OFFSET 0x20 + +union group_cap_reg { + struct { + u64 num_groups:8; + u64 total_tokens:8; + u64 token_en:1; + u64 token_limit:1; + u64 rsvd:46; + }; + u64 bits; +} __packed; +#define IDXD_GRPCAP_OFFSET 0x30 + +union engine_cap_reg { + struct { + u64 num_engines:8; + u64 rsvd:56; + }; + u64 bits; +} __packed; + +#define IDXD_ENGCAP_OFFSET 0x38 + +#define IDXD_OPCAP_NOOP 0x0001 +#define IDXD_OPCAP_BATCH 0x0002 +#define IDXD_OPCAP_MEMMOVE 0x0008 +struct opcap { + u64 bits[4]; +}; + +#define IDXD_OPCAP_OFFSET 0x40 + +#define IDXD_TABLE_OFFSET 0x60 +union offsets_reg { + struct { + u64 grpcfg:16; + u64 wqcfg:16; + u64 msix_perm:16; + u64 ims:16; + u64 perfmon:16; + u64 rsvd:48; + }; + u64 bits[2]; +} __packed; + +#define IDXD_GENCFG_OFFSET 0x80 +union gencfg_reg { + struct { + u32 token_limit:8; + u32 rsvd:4; + u32 user_int_en:1; + u32 rsvd2:19; + }; + u32 bits; +} __packed; + +#define IDXD_GENCTRL_OFFSET 0x88 +union genctrl_reg { + struct { + u32 softerr_int_en:1; + u32 rsvd:31; + }; + u32 bits; +} __packed; + +#define IDXD_GENSTATS_OFFSET 0x90 +union gensts_reg { + struct { + u32 state:2; + u32 reset_type:2; + u32 rsvd:28; + }; + u32 bits; +} __packed; + +enum idxd_device_status_state { + IDXD_DEVICE_STATE_DISABLED = 0, + IDXD_DEVICE_STATE_ENABLED, + IDXD_DEVICE_STATE_DRAIN, + IDXD_DEVICE_STATE_HALT, +}; + +enum idxd_device_reset_type { + IDXD_DEVICE_RESET_SOFTWARE = 0, + IDXD_DEVICE_RESET_FLR, + IDXD_DEVICE_RESET_WARM, + IDXD_DEVICE_RESET_COLD, +}; + +#define IDXD_INTCAUSE_OFFSET 0x98 +#define IDXD_INTC_ERR 0x01 +#define IDXD_INTC_CMD 0x02 +#define IDXD_INTC_OCCUPY 0x04 +#define IDXD_INTC_PERFMON_OVFL 0x08 + +#define IDXD_CMD_OFFSET 0xa0 +union idxd_command_reg { + struct { + u32 operand:20; + u32 cmd:5; + u32 rsvd:6; + u32 
int_req:1; + }; + u32 bits; +} __packed; + +enum idxd_cmd { + IDXD_CMD_ENABLE_DEVICE = 1, + IDXD_CMD_DISABLE_DEVICE, + IDXD_CMD_DRAIN_ALL, + IDXD_CMD_ABORT_ALL, + IDXD_CMD_RESET_DEVICE, + IDXD_CMD_ENABLE_WQ, + IDXD_CMD_DISABLE_WQ, + IDXD_CMD_DRAIN_WQ, + IDXD_CMD_ABORT_WQ, + IDXD_CMD_RESET_WQ, + IDXD_CMD_DRAIN_PASID, + IDXD_CMD_ABORT_PASID, + IDXD_CMD_REQUEST_INT_HANDLE, +}; + +#define IDXD_CMDSTS_OFFSET 0xa8 +union cmdsts_reg { + struct { + u8 err; + u16 result; + u8 rsvd:7; + u8 active:1; + }; + u32 bits; +} __packed; +#define IDXD_CMDSTS_ACTIVE 0x80000000 + +enum idxd_cmdsts_err { + IDXD_CMDSTS_SUCCESS = 0, + IDXD_CMDSTS_INVAL_CMD, + IDXD_CMDSTS_INVAL_WQIDX, + IDXD_CMDSTS_HW_ERR, + /* enable device errors */ + IDXD_CMDSTS_ERR_DEV_ENABLED = 0x10, + IDXD_CMDSTS_ERR_CONFIG, + IDXD_CMDSTS_ERR_BUSMASTER_EN, + IDXD_CMDSTS_ERR_PASID_INVAL, + IDXD_CMDSTS_ERR_WQ_SIZE_ERANGE, + IDXD_CMDSTS_ERR_GRP_CONFIG, + IDXD_CMDSTS_ERR_GRP_CONFIG2, + IDXD_CMDSTS_ERR_GRP_CONFIG3, + IDXD_CMDSTS_ERR_GRP_CONFIG4, + /* enable wq errors */ + IDXD_CMDSTS_ERR_DEV_NOTEN = 0x20, + IDXD_CMDSTS_ERR_WQ_ENABLED, + IDXD_CMDSTS_ERR_WQ_SIZE, + IDXD_CMDSTS_ERR_WQ_PRIOR, + IDXD_CMDSTS_ERR_WQ_MODE, + IDXD_CMDSTS_ERR_BOF_EN, + IDXD_CMDSTS_ERR_PASID_EN, + IDXD_CMDSTS_ERR_MAX_BATCH_SIZE, + IDXD_CMDSTS_ERR_MAX_XFER_SIZE, + /* disable device errors */ + IDXD_CMDSTS_ERR_DIS_DEV_EN = 0x31, + /* disable WQ, drain WQ, abort WQ, reset WQ */ + IDXD_CMDSTS_ERR_DEV_NOT_EN, + /* request interrupt handle */ + IDXD_CMDSTS_ERR_INVAL_INT_IDX = 0x41, + IDXD_CMDSTS_ERR_NO_HANDLE, +}; + +#define IDXD_SWERR_OFFSET 0xc0 +#define IDXD_SWERR_VALID 0x00000001 +#define IDXD_SWERR_OVERFLOW 0x00000002 +#define IDXD_SWERR_ACK (IDXD_SWERR_VALID | IDXD_SWERR_OVERFLOW) +union sw_err_reg { + struct { + u64 valid:1; + u64 overflow:1; + u64 desc_valid:1; + u64 wq_idx_valid:1; + u64 batch:1; + u64 fault_rw:1; + u64 priv:1; + u64 rsvd:1; + u64 error:8; + u64 wq_idx:8; + u64 rsvd2:8; + u64 operation:8; + u64 pasid:20; + u64 rsvd3:4; + + u64 batch_idx:16; + u64 rsvd4:16; + u64 invalid_flags:32; + + u64 fault_addr; + + u64 rsvd5; + }; + u64 bits[4]; +} __packed; + +union msix_perm { + struct { + u32 rsvd:2; + u32 ignore:1; + u32 pasid_en:1; + u32 rsvd2:8; + u32 pasid:20; + }; + u32 bits; +} __packed; + +union group_flags { + struct { + u32 tc_a:3; + u32 tc_b:3; + u32 rsvd:1; + u32 use_token_limit:1; + u32 tokens_reserved:8; + u32 rsvd2:4; + u32 tokens_allowed:8; + u32 rsvd3:4; + }; + u32 bits; +} __packed; + +struct grpcfg { + u64 wqs[4]; + u64 engines; + union group_flags flags; +} __packed; + +union wqcfg { + struct { + /* bytes 0-3 */ + u16 wq_size; + u16 rsvd; + + /* bytes 4-7 */ + u16 wq_thresh; + u16 rsvd1; + + /* bytes 8-11 */ + u32 mode:1; /* shared or dedicated */ + u32 bof:1; /* block on fault */ + u32 rsvd2:2; + u32 priority:4; + u32 pasid:20; + u32 pasid_en:1; + u32 priv:1; + u32 rsvd3:2; + + /* bytes 12-15 */ + u32 max_xfer_shift:5; + u32 max_batch_shift:4; + u32 rsvd4:23; + + /* bytes 16-19 */ + u16 occupancy_inth; + u16 occupancy_table_sel:1; + u16 rsvd5:15; + + /* bytes 20-23 */ + u16 occupancy_limit; + u16 occupancy_int_en:1; + u16 rsvd6:15; + + /* bytes 24-27 */ + u16 occupancy; + u16 occupancy_int:1; + u16 rsvd7:12; + u16 mode_support:1; + u16 wq_state:2; + + /* bytes 28-31 */ + u32 rsvd8; + }; + u32 bits[8]; +} __packed; +#endif diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c new file mode 100644 index 000000000000..b6a0a59b500f --- /dev/null +++ b/drivers/dma/idxd/sysfs.c @@ -0,0 +1,1452 @@ +// SPDX-License-Identifier: GPL-2.0 +/* 
Copyright(c) 2019 Intel Corporation. All rights rsvd. */ +#include +#include +#include +#include +#include +#include +#include +#include "registers.h" +#include "idxd.h" + +static char *idxd_wq_type_names[] = { + [IDXD_WQT_NONE] = "none", + [IDXD_WQT_KERNEL] = "kernel", +}; + +static void idxd_conf_device_release(struct device *dev) +{ + dev_dbg(dev, "%s for %s\n", __func__, dev_name(dev)); +} + +static struct device_type idxd_group_device_type = { + .name = "group", + .release = idxd_conf_device_release, +}; + +static struct device_type idxd_wq_device_type = { + .name = "wq", + .release = idxd_conf_device_release, +}; + +static struct device_type idxd_engine_device_type = { + .name = "engine", + .release = idxd_conf_device_release, +}; + +static struct device_type dsa_device_type = { + .name = "dsa", + .release = idxd_conf_device_release, +}; + +static inline bool is_dsa_dev(struct device *dev) +{ + return dev ? dev->type == &dsa_device_type : false; +} + +static inline bool is_idxd_dev(struct device *dev) +{ + return is_dsa_dev(dev); +} + +static inline bool is_idxd_wq_dev(struct device *dev) +{ + return dev ? dev->type == &idxd_wq_device_type : false; +} + +static int idxd_config_bus_match(struct device *dev, + struct device_driver *drv) +{ + int matched = 0; + + if (is_idxd_dev(dev)) { + struct idxd_device *idxd = confdev_to_idxd(dev); + + if (idxd->state != IDXD_DEV_CONF_READY) + return 0; + matched = 1; + } else if (is_idxd_wq_dev(dev)) { + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + + if (idxd->state < IDXD_DEV_CONF_READY) + return 0; + + if (wq->state != IDXD_WQ_DISABLED) { + dev_dbg(dev, "%s not disabled\n", dev_name(dev)); + return 0; + } + matched = 1; + } + + if (matched) + dev_dbg(dev, "%s matched\n", dev_name(dev)); + + return matched; +} + +static int idxd_config_bus_probe(struct device *dev) +{ + int rc; + unsigned long flags; + + dev_dbg(dev, "%s called\n", __func__); + + if (is_idxd_dev(dev)) { + struct idxd_device *idxd = confdev_to_idxd(dev); + + if (idxd->state != IDXD_DEV_CONF_READY) { + dev_warn(dev, "Device not ready for config\n"); + return -EBUSY; + } + + spin_lock_irqsave(&idxd->dev_lock, flags); + + /* Perform IDXD configuration and enabling */ + rc = idxd_device_config(idxd); + if (rc < 0) { + spin_unlock_irqrestore(&idxd->dev_lock, flags); + dev_warn(dev, "Device config failed: %d\n", rc); + return rc; + } + + /* start device */ + rc = idxd_device_enable(idxd); + if (rc < 0) { + spin_unlock_irqrestore(&idxd->dev_lock, flags); + dev_warn(dev, "Device enable failed: %d\n", rc); + return rc; + } + + spin_unlock_irqrestore(&idxd->dev_lock, flags); + dev_info(dev, "Device %s enabled\n", dev_name(dev)); + + return 0; + } else if (is_idxd_wq_dev(dev)) { + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + + mutex_lock(&wq->wq_lock); + + if (idxd->state != IDXD_DEV_ENABLED) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "Enabling while device not enabled.\n"); + return -EPERM; + } + + if (wq->state != IDXD_WQ_DISABLED) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "WQ %d already enabled.\n", wq->id); + return -EBUSY; + } + + if (!wq->group) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "WQ not attached to group.\n"); + return -EINVAL; + } + + if (strlen(wq->name) == 0) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "WQ name not set.\n"); + return -EINVAL; + } + + rc = idxd_wq_alloc_resources(wq); + if (rc < 0) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "WQ resource alloc failed\n"); + 
return rc; + } + + spin_lock_irqsave(&idxd->dev_lock, flags); + rc = idxd_device_config(idxd); + if (rc < 0) { + spin_unlock_irqrestore(&idxd->dev_lock, flags); + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "Writing WQ %d config failed: %d\n", + wq->id, rc); + return rc; + } + + rc = idxd_wq_enable(wq); + if (rc < 0) { + spin_unlock_irqrestore(&idxd->dev_lock, flags); + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "WQ %d enabling failed: %d\n", + wq->id, rc); + return rc; + } + spin_unlock_irqrestore(&idxd->dev_lock, flags); + + rc = idxd_wq_map_portal(wq); + if (rc < 0) { + dev_warn(dev, "wq portal mapping failed: %d\n", rc); + rc = idxd_wq_disable(wq); + if (rc < 0) + dev_warn(dev, "IDXD wq disable failed\n"); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + mutex_unlock(&wq->wq_lock); + return rc; + } + + wq->client_count = 0; + + dev_info(dev, "wq %s enabled\n", dev_name(&wq->conf_dev)); + mutex_unlock(&wq->wq_lock); + return 0; + } + + return -ENODEV; +} + +static void disable_wq(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + unsigned long flags; + int rc; + + mutex_lock(&wq->wq_lock); + dev_dbg(dev, "%s removing WQ %s\n", __func__, dev_name(&wq->conf_dev)); + if (wq->state == IDXD_WQ_DISABLED) { + mutex_unlock(&wq->wq_lock); + return; + } + + if (idxd_wq_refcount(wq)) + dev_warn(dev, "Clients has claim on wq %d: %d\n", + wq->id, idxd_wq_refcount(wq)); + + idxd_wq_unmap_portal(wq); + + spin_lock_irqsave(&idxd->dev_lock, flags); + rc = idxd_wq_disable(wq); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + + idxd_wq_free_resources(wq); + wq->client_count = 0; + mutex_unlock(&wq->wq_lock); + + if (rc < 0) + dev_warn(dev, "Failed to disable %s: %d\n", + dev_name(&wq->conf_dev), rc); + else + dev_info(dev, "wq %s disabled\n", dev_name(&wq->conf_dev)); +} + +static int idxd_config_bus_remove(struct device *dev) +{ + int rc; + unsigned long flags; + + dev_dbg(dev, "%s called for %s\n", __func__, dev_name(dev)); + + /* disable workqueue here */ + if (is_idxd_wq_dev(dev)) { + struct idxd_wq *wq = confdev_to_wq(dev); + + disable_wq(wq); + } else if (is_idxd_dev(dev)) { + struct idxd_device *idxd = confdev_to_idxd(dev); + int i; + + dev_dbg(dev, "%s removing dev %s\n", __func__, + dev_name(&idxd->conf_dev)); + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = &idxd->wqs[i]; + + if (wq->state == IDXD_WQ_DISABLED) + continue; + dev_warn(dev, "Active wq %d on disable %s.\n", i, + dev_name(&idxd->conf_dev)); + device_release_driver(&wq->conf_dev); + } + + spin_lock_irqsave(&idxd->dev_lock, flags); + rc = idxd_device_disable(idxd); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + if (rc < 0) + dev_warn(dev, "Device disable failed\n"); + else + dev_info(dev, "Device %s disabled\n", dev_name(dev)); + } + + return 0; +} + +static void idxd_config_bus_shutdown(struct device *dev) +{ + dev_dbg(dev, "%s called\n", __func__); +} + +static struct bus_type dsa_bus_type = { + .name = "dsa", + .match = idxd_config_bus_match, + .probe = idxd_config_bus_probe, + .remove = idxd_config_bus_remove, + .shutdown = idxd_config_bus_shutdown, +}; + +static struct bus_type *idxd_bus_types[] = { + &dsa_bus_type +}; + +static struct idxd_device_driver dsa_drv = { + .drv = { + .name = "dsa", + .bus = &dsa_bus_type, + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + }, +}; + +static struct idxd_device_driver *idxd_drvs[] = { + &dsa_drv +}; + +static struct bus_type *idxd_get_bus_type(struct idxd_device *idxd) +{ + return 
idxd_bus_types[idxd->type]; +} + +static struct device_type *idxd_get_device_type(struct idxd_device *idxd) +{ + if (idxd->type == IDXD_TYPE_DSA) + return &dsa_device_type; + else + return NULL; +} + +/* IDXD generic driver setup */ +int idxd_register_driver(void) +{ + int i, rc; + + for (i = 0; i < IDXD_TYPE_MAX; i++) { + rc = driver_register(&idxd_drvs[i]->drv); + if (rc < 0) + goto drv_fail; + } + + return 0; + +drv_fail: + for (; i > 0; i--) + driver_unregister(&idxd_drvs[i]->drv); + return rc; +} + +void idxd_unregister_driver(void) +{ + int i; + + for (i = 0; i < IDXD_TYPE_MAX; i++) + driver_unregister(&idxd_drvs[i]->drv); +} + +/* IDXD engine attributes */ +static ssize_t engine_group_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_engine *engine = + container_of(dev, struct idxd_engine, conf_dev); + + if (engine->group) + return sprintf(buf, "%d\n", engine->group->id); + else + return sprintf(buf, "%d\n", -1); +} + +static ssize_t engine_group_id_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_engine *engine = + container_of(dev, struct idxd_engine, conf_dev); + struct idxd_device *idxd = engine->idxd; + long id; + int rc; + struct idxd_group *prevg, *group; + + rc = kstrtol(buf, 10, &id); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (id > idxd->max_groups - 1 || id < -1) + return -EINVAL; + + if (id == -1) { + if (engine->group) { + engine->group->num_engines--; + engine->group = NULL; + } + return count; + } + + group = &idxd->groups[id]; + prevg = engine->group; + + if (prevg) + prevg->num_engines--; + engine->group = &idxd->groups[id]; + engine->group->num_engines++; + + return count; +} + +static struct device_attribute dev_attr_engine_group = + __ATTR(group_id, 0644, engine_group_id_show, + engine_group_id_store); + +static struct attribute *idxd_engine_attributes[] = { + &dev_attr_engine_group.attr, + NULL, +}; + +static const struct attribute_group idxd_engine_attribute_group = { + .attrs = idxd_engine_attributes, +}; + +static const struct attribute_group *idxd_engine_attribute_groups[] = { + &idxd_engine_attribute_group, + NULL, +}; + +/* Group attributes */ + +static void idxd_set_free_tokens(struct idxd_device *idxd) +{ + int i, tokens; + + for (i = 0, tokens = 0; i < idxd->max_groups; i++) { + struct idxd_group *g = &idxd->groups[i]; + + tokens += g->tokens_reserved; + } + + idxd->nr_tokens = idxd->max_tokens - tokens; +} + +static ssize_t group_tokens_reserved_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_group *group = + container_of(dev, struct idxd_group, conf_dev); + + return sprintf(buf, "%u\n", group->tokens_reserved); +} + +static ssize_t group_tokens_reserved_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_group *group = + container_of(dev, struct idxd_group, conf_dev); + struct idxd_device *idxd = group->idxd; + unsigned long val; + int rc; + + rc = kstrtoul(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (idxd->token_limit == 0) + return -EPERM; + + if (val > idxd->max_tokens) + return -EINVAL; + + if (val > idxd->nr_tokens) + return -EINVAL; + + group->tokens_reserved = val; + idxd_set_free_tokens(idxd); + return count; +} + +static struct 
device_attribute dev_attr_group_tokens_reserved = + __ATTR(tokens_reserved, 0644, group_tokens_reserved_show, + group_tokens_reserved_store); + +static ssize_t group_tokens_allowed_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_group *group = + container_of(dev, struct idxd_group, conf_dev); + + return sprintf(buf, "%u\n", group->tokens_allowed); +} + +static ssize_t group_tokens_allowed_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_group *group = + container_of(dev, struct idxd_group, conf_dev); + struct idxd_device *idxd = group->idxd; + unsigned long val; + int rc; + + rc = kstrtoul(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (idxd->token_limit == 0) + return -EPERM; + if (val < 4 * group->num_engines || + val > group->tokens_reserved + idxd->nr_tokens) + return -EINVAL; + + group->tokens_allowed = val; + return count; +} + +static struct device_attribute dev_attr_group_tokens_allowed = + __ATTR(tokens_allowed, 0644, group_tokens_allowed_show, + group_tokens_allowed_store); + +static ssize_t group_use_token_limit_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_group *group = + container_of(dev, struct idxd_group, conf_dev); + + return sprintf(buf, "%u\n", group->use_token_limit); +} + +static ssize_t group_use_token_limit_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_group *group = + container_of(dev, struct idxd_group, conf_dev); + struct idxd_device *idxd = group->idxd; + unsigned long val; + int rc; + + rc = kstrtoul(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (idxd->token_limit == 0) + return -EPERM; + + group->use_token_limit = !!val; + return count; +} + +static struct device_attribute dev_attr_group_use_token_limit = + __ATTR(use_token_limit, 0644, group_use_token_limit_show, + group_use_token_limit_store); + +static ssize_t group_engines_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_group *group = + container_of(dev, struct idxd_group, conf_dev); + int i, rc = 0; + char *tmp = buf; + struct idxd_device *idxd = group->idxd; + + for (i = 0; i < idxd->max_engines; i++) { + struct idxd_engine *engine = &idxd->engines[i]; + + if (!engine->group) + continue; + + if (engine->group->id == group->id) + rc += sprintf(tmp + rc, "engine%d.%d ", + idxd->id, engine->id); + } + + rc--; + rc += sprintf(tmp + rc, "\n"); + + return rc; +} + +static struct device_attribute dev_attr_group_engines = + __ATTR(engines, 0444, group_engines_show, NULL); + +static ssize_t group_work_queues_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_group *group = + container_of(dev, struct idxd_group, conf_dev); + int i, rc = 0; + char *tmp = buf; + struct idxd_device *idxd = group->idxd; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = &idxd->wqs[i]; + + if (!wq->group) + continue; + + if (wq->group->id == group->id) + rc += sprintf(tmp + rc, "wq%d.%d ", + idxd->id, wq->id); + } + + rc--; + rc += sprintf(tmp + rc, "\n"); + + return rc; +} + +static struct device_attribute dev_attr_group_work_queues = + __ATTR(work_queues, 0444, 
group_work_queues_show, NULL); + +static ssize_t group_traffic_class_a_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_group *group = + container_of(dev, struct idxd_group, conf_dev); + + return sprintf(buf, "%d\n", group->tc_a); +} + +static ssize_t group_traffic_class_a_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_group *group = + container_of(dev, struct idxd_group, conf_dev); + struct idxd_device *idxd = group->idxd; + long val; + int rc; + + rc = kstrtol(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (val < 0 || val > 7) + return -EINVAL; + + group->tc_a = val; + return count; +} + +static struct device_attribute dev_attr_group_traffic_class_a = + __ATTR(traffic_class_a, 0644, group_traffic_class_a_show, + group_traffic_class_a_store); + +static ssize_t group_traffic_class_b_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_group *group = + container_of(dev, struct idxd_group, conf_dev); + + return sprintf(buf, "%d\n", group->tc_b); +} + +static ssize_t group_traffic_class_b_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_group *group = + container_of(dev, struct idxd_group, conf_dev); + struct idxd_device *idxd = group->idxd; + long val; + int rc; + + rc = kstrtol(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (val < 0 || val > 7) + return -EINVAL; + + group->tc_b = val; + return count; +} + +static struct device_attribute dev_attr_group_traffic_class_b = + __ATTR(traffic_class_b, 0644, group_traffic_class_b_show, + group_traffic_class_b_store); + +static struct attribute *idxd_group_attributes[] = { + &dev_attr_group_work_queues.attr, + &dev_attr_group_engines.attr, + &dev_attr_group_use_token_limit.attr, + &dev_attr_group_tokens_allowed.attr, + &dev_attr_group_tokens_reserved.attr, + &dev_attr_group_traffic_class_a.attr, + &dev_attr_group_traffic_class_b.attr, + NULL, +}; + +static const struct attribute_group idxd_group_attribute_group = { + .attrs = idxd_group_attributes, +}; + +static const struct attribute_group *idxd_group_attribute_groups[] = { + &idxd_group_attribute_group, + NULL, +}; + +/* IDXD work queue attribs */ +static ssize_t wq_clients_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + return sprintf(buf, "%d\n", wq->client_count); +} + +static struct device_attribute dev_attr_wq_clients = + __ATTR(clients, 0444, wq_clients_show, NULL); + +static ssize_t wq_state_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + switch (wq->state) { + case IDXD_WQ_DISABLED: + return sprintf(buf, "disabled\n"); + case IDXD_WQ_ENABLED: + return sprintf(buf, "enabled\n"); + } + + return sprintf(buf, "unknown\n"); +} + +static struct device_attribute dev_attr_wq_state = + __ATTR(state, 0444, wq_state_show, NULL); + +static ssize_t wq_group_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + if (wq->group) + return sprintf(buf, "%u\n", 
wq->group->id); + else + return sprintf(buf, "-1\n"); +} + +static ssize_t wq_group_id_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_device *idxd = wq->idxd; + long id; + int rc; + struct idxd_group *prevg, *group; + + rc = kstrtol(buf, 10, &id); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (id > idxd->max_groups - 1 || id < -1) + return -EINVAL; + + if (id == -1) { + if (wq->group) { + wq->group->num_wqs--; + wq->group = NULL; + } + return count; + } + + group = &idxd->groups[id]; + prevg = wq->group; + + if (prevg) + prevg->num_wqs--; + wq->group = group; + group->num_wqs++; + return count; +} + +static struct device_attribute dev_attr_wq_group_id = + __ATTR(group_id, 0644, wq_group_id_show, wq_group_id_store); + +static ssize_t wq_mode_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + return sprintf(buf, "%s\n", + wq_dedicated(wq) ? "dedicated" : "shared"); +} + +static ssize_t wq_mode_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_device *idxd = wq->idxd; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (sysfs_streq(buf, "dedicated")) { + set_bit(WQ_FLAG_DEDICATED, &wq->flags); + wq->threshold = 0; + } else { + return -EINVAL; + } + + return count; +} + +static struct device_attribute dev_attr_wq_mode = + __ATTR(mode, 0644, wq_mode_show, wq_mode_store); + +static ssize_t wq_size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + return sprintf(buf, "%u\n", wq->size); +} + +static ssize_t wq_size_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + unsigned long size; + struct idxd_device *idxd = wq->idxd; + int rc; + + rc = kstrtoul(buf, 10, &size); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (size > idxd->max_wq_size) + return -EINVAL; + + wq->size = size; + return count; +} + +static struct device_attribute dev_attr_wq_size = + __ATTR(size, 0644, wq_size_show, wq_size_store); + +static ssize_t wq_priority_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + return sprintf(buf, "%u\n", wq->priority); +} + +static ssize_t wq_priority_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + unsigned long prio; + struct idxd_device *idxd = wq->idxd; + int rc; + + rc = kstrtoul(buf, 10, &prio); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (prio > IDXD_MAX_PRIORITY) + return -EINVAL; + + wq->priority = prio; + return count; +} + +static struct device_attribute dev_attr_wq_priority = + 
__ATTR(priority, 0644, wq_priority_show, wq_priority_store); + +static ssize_t wq_type_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + switch (wq->type) { + case IDXD_WQT_KERNEL: + return sprintf(buf, "%s\n", + idxd_wq_type_names[IDXD_WQT_KERNEL]); + case IDXD_WQT_NONE: + default: + return sprintf(buf, "%s\n", + idxd_wq_type_names[IDXD_WQT_NONE]); + } + + return -EINVAL; +} + +static ssize_t wq_type_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + enum idxd_wq_type old_type; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + old_type = wq->type; + if (sysfs_streq(buf, idxd_wq_type_names[IDXD_WQT_KERNEL])) + wq->type = IDXD_WQT_KERNEL; + else + wq->type = IDXD_WQT_NONE; + + /* If we are changing queue type, clear the name */ + if (wq->type != old_type) + memset(wq->name, 0, WQ_NAME_SIZE + 1); + + return count; +} + +static struct device_attribute dev_attr_wq_type = + __ATTR(type, 0644, wq_type_show, wq_type_store); + +static ssize_t wq_name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + return sprintf(buf, "%s\n", wq->name); +} + +static ssize_t wq_name_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (strlen(buf) > WQ_NAME_SIZE || strlen(buf) == 0) + return -EINVAL; + + memset(wq->name, 0, WQ_NAME_SIZE + 1); + strncpy(wq->name, buf, WQ_NAME_SIZE); + strreplace(wq->name, '\n', '\0'); + return count; +} + +static struct device_attribute dev_attr_wq_name = + __ATTR(name, 0644, wq_name_show, wq_name_store); + +static struct attribute *idxd_wq_attributes[] = { + &dev_attr_wq_clients.attr, + &dev_attr_wq_state.attr, + &dev_attr_wq_group_id.attr, + &dev_attr_wq_mode.attr, + &dev_attr_wq_size.attr, + &dev_attr_wq_priority.attr, + &dev_attr_wq_type.attr, + &dev_attr_wq_name.attr, + NULL, +}; + +static const struct attribute_group idxd_wq_attribute_group = { + .attrs = idxd_wq_attributes, +}; + +static const struct attribute_group *idxd_wq_attribute_groups[] = { + &idxd_wq_attribute_group, + NULL, +}; + +/* IDXD device attribs */ +static ssize_t max_work_queues_size_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + + return sprintf(buf, "%u\n", idxd->max_wq_size); +} +static DEVICE_ATTR_RO(max_work_queues_size); + +static ssize_t max_groups_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + + return sprintf(buf, "%u\n", idxd->max_groups); +} +static DEVICE_ATTR_RO(max_groups); + +static ssize_t max_work_queues_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + + return sprintf(buf, "%u\n", idxd->max_wqs); +} +static DEVICE_ATTR_RO(max_work_queues); + +static ssize_t max_engines_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + + return sprintf(buf, "%u\n", idxd->max_engines); +} +static 
DEVICE_ATTR_RO(max_engines); + +static ssize_t numa_node_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + + return sprintf(buf, "%d\n", dev_to_node(&idxd->pdev->dev)); +} +static DEVICE_ATTR_RO(numa_node); + +static ssize_t max_batch_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + + return sprintf(buf, "%u\n", idxd->max_batch_size); +} +static DEVICE_ATTR_RO(max_batch_size); + +static ssize_t max_transfer_size_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + + return sprintf(buf, "%llu\n", idxd->max_xfer_bytes); +} +static DEVICE_ATTR_RO(max_transfer_size); + +static ssize_t op_cap_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + + return sprintf(buf, "%#llx\n", idxd->hw.opcap.bits[0]); +} +static DEVICE_ATTR_RO(op_cap); + +static ssize_t configurable_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + + return sprintf(buf, "%u\n", + test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)); +} +static DEVICE_ATTR_RO(configurable); + +static ssize_t clients_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + unsigned long flags; + int count = 0, i; + + spin_lock_irqsave(&idxd->dev_lock, flags); + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = &idxd->wqs[i]; + + count += wq->client_count; + } + spin_unlock_irqrestore(&idxd->dev_lock, flags); + + return sprintf(buf, "%d\n", count); +} +static DEVICE_ATTR_RO(clients); + +static ssize_t state_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + + switch (idxd->state) { + case IDXD_DEV_DISABLED: + case IDXD_DEV_CONF_READY: + return sprintf(buf, "disabled\n"); + case IDXD_DEV_ENABLED: + return sprintf(buf, "enabled\n"); + case IDXD_DEV_HALTED: + return sprintf(buf, "halted\n"); + } + + return sprintf(buf, "unknown\n"); +} +static DEVICE_ATTR_RO(state); + +static ssize_t errors_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + int i, out = 0; + unsigned long flags; + + spin_lock_irqsave(&idxd->dev_lock, flags); + for (i = 0; i < 4; i++) + out += sprintf(buf + out, "%#018llx ", idxd->sw_err.bits[i]); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + out--; + out += sprintf(buf + out, "\n"); + return out; +} +static DEVICE_ATTR_RO(errors); + +static ssize_t max_tokens_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + + return sprintf(buf, "%u\n", idxd->max_tokens); +} +static DEVICE_ATTR_RO(max_tokens); + +static ssize_t token_limit_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + + return sprintf(buf, "%u\n", idxd->token_limit); +} + +static ssize_t token_limit_store(struct device *dev, + struct device_attribute 
*attr, + const char *buf, size_t count) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + unsigned long val; + int rc; + + rc = kstrtoul(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (!idxd->hw.group_cap.token_limit) + return -EPERM; + + if (val > idxd->hw.group_cap.total_tokens) + return -EINVAL; + + idxd->token_limit = val; + return count; +} +static DEVICE_ATTR_RW(token_limit); + +static struct attribute *idxd_device_attributes[] = { + &dev_attr_max_groups.attr, + &dev_attr_max_work_queues.attr, + &dev_attr_max_work_queues_size.attr, + &dev_attr_max_engines.attr, + &dev_attr_numa_node.attr, + &dev_attr_max_batch_size.attr, + &dev_attr_max_transfer_size.attr, + &dev_attr_op_cap.attr, + &dev_attr_configurable.attr, + &dev_attr_clients.attr, + &dev_attr_state.attr, + &dev_attr_errors.attr, + &dev_attr_max_tokens.attr, + &dev_attr_token_limit.attr, + NULL, +}; + +static const struct attribute_group idxd_device_attribute_group = { + .attrs = idxd_device_attributes, +}; + +static const struct attribute_group *idxd_attribute_groups[] = { + &idxd_device_attribute_group, + NULL, +}; + +static int idxd_setup_engine_sysfs(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + int i, rc; + + for (i = 0; i < idxd->max_engines; i++) { + struct idxd_engine *engine = &idxd->engines[i]; + + engine->conf_dev.parent = &idxd->conf_dev; + dev_set_name(&engine->conf_dev, "engine%d.%d", + idxd->id, engine->id); + engine->conf_dev.bus = idxd_get_bus_type(idxd); + engine->conf_dev.groups = idxd_engine_attribute_groups; + engine->conf_dev.type = &idxd_engine_device_type; + dev_dbg(dev, "Engine device register: %s\n", + dev_name(&engine->conf_dev)); + rc = device_register(&engine->conf_dev); + if (rc < 0) { + put_device(&engine->conf_dev); + goto cleanup; + } + } + + return 0; + +cleanup: + while (i--) { + struct idxd_engine *engine = &idxd->engines[i]; + + device_unregister(&engine->conf_dev); + } + return rc; +} + +static int idxd_setup_group_sysfs(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + int i, rc; + + for (i = 0; i < idxd->max_groups; i++) { + struct idxd_group *group = &idxd->groups[i]; + + group->conf_dev.parent = &idxd->conf_dev; + dev_set_name(&group->conf_dev, "group%d.%d", + idxd->id, group->id); + group->conf_dev.bus = idxd_get_bus_type(idxd); + group->conf_dev.groups = idxd_group_attribute_groups; + group->conf_dev.type = &idxd_group_device_type; + dev_dbg(dev, "Group device register: %s\n", + dev_name(&group->conf_dev)); + rc = device_register(&group->conf_dev); + if (rc < 0) { + put_device(&group->conf_dev); + goto cleanup; + } + } + + return 0; + +cleanup: + while (i--) { + struct idxd_group *group = &idxd->groups[i]; + + device_unregister(&group->conf_dev); + } + return rc; +} + +static int idxd_setup_wq_sysfs(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + int i, rc; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = &idxd->wqs[i]; + + wq->conf_dev.parent = &idxd->conf_dev; + dev_set_name(&wq->conf_dev, "wq%d.%d", idxd->id, wq->id); + wq->conf_dev.bus = idxd_get_bus_type(idxd); + wq->conf_dev.groups = idxd_wq_attribute_groups; + wq->conf_dev.type = &idxd_wq_device_type; + dev_dbg(dev, "WQ device register: %s\n", + dev_name(&wq->conf_dev)); + rc = device_register(&wq->conf_dev); + if (rc < 0) { + put_device(&wq->conf_dev); + goto cleanup; + 
} + } + + return 0; + +cleanup: + while (i--) { + struct idxd_wq *wq = &idxd->wqs[i]; + + device_unregister(&wq->conf_dev); + } + return rc; +} + +static int idxd_setup_device_sysfs(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + int rc; + char devname[IDXD_NAME_SIZE]; + + sprintf(devname, "%s%d", idxd_get_dev_name(idxd), idxd->id); + idxd->conf_dev.parent = dev; + dev_set_name(&idxd->conf_dev, "%s", devname); + idxd->conf_dev.bus = idxd_get_bus_type(idxd); + idxd->conf_dev.groups = idxd_attribute_groups; + idxd->conf_dev.type = idxd_get_device_type(idxd); + + dev_dbg(dev, "IDXD device register: %s\n", dev_name(&idxd->conf_dev)); + rc = device_register(&idxd->conf_dev); + if (rc < 0) { + put_device(&idxd->conf_dev); + return rc; + } + + return 0; +} + +int idxd_setup_sysfs(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + int rc; + + rc = idxd_setup_device_sysfs(idxd); + if (rc < 0) { + dev_dbg(dev, "Device sysfs registering failed: %d\n", rc); + return rc; + } + + rc = idxd_setup_wq_sysfs(idxd); + if (rc < 0) { + /* unregister conf dev */ + dev_dbg(dev, "Work Queue sysfs registering failed: %d\n", rc); + return rc; + } + + rc = idxd_setup_group_sysfs(idxd); + if (rc < 0) { + /* unregister conf dev */ + dev_dbg(dev, "Group sysfs registering failed: %d\n", rc); + return rc; + } + + rc = idxd_setup_engine_sysfs(idxd); + if (rc < 0) { + /* unregister conf dev */ + dev_dbg(dev, "Engine sysfs registering failed: %d\n", rc); + return rc; + } + + return 0; +} + +void idxd_cleanup_sysfs(struct idxd_device *idxd) +{ + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = &idxd->wqs[i]; + + device_unregister(&wq->conf_dev); + } + + for (i = 0; i < idxd->max_engines; i++) { + struct idxd_engine *engine = &idxd->engines[i]; + + device_unregister(&engine->conf_dev); + } + + for (i = 0; i < idxd->max_groups; i++) { + struct idxd_group *group = &idxd->groups[i]; + + device_unregister(&group->conf_dev); + } + + device_unregister(&idxd->conf_dev); +} + +int idxd_register_bus_type(void) +{ + int i, rc; + + for (i = 0; i < IDXD_TYPE_MAX; i++) { + rc = bus_register(idxd_bus_types[i]); + if (rc < 0) + goto bus_err; + } + + return 0; + +bus_err: + for (; i > 0; i--) + bus_unregister(idxd_bus_types[i]); + return rc; +} + +void idxd_unregister_bus_type(void) +{ + int i; + + for (i = 0; i < IDXD_TYPE_MAX; i++) + bus_unregister(idxd_bus_types[i]); +} diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h new file mode 100644 index 000000000000..849ef1515d04 --- /dev/null +++ b/include/uapi/linux/idxd.h @@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. 
*/ +#ifndef _USR_IDXD_H_ +#define _USR_IDXD_H_ + +#ifdef __KERNEL__ +#include +#else +#include +#endif + +/* Descriptor flags */ +#define IDXD_OP_FLAG_FENCE 0x0001 +#define IDXD_OP_FLAG_BOF 0x0002 +#define IDXD_OP_FLAG_CRAV 0x0004 +#define IDXD_OP_FLAG_RCR 0x0008 +#define IDXD_OP_FLAG_RCI 0x0010 +#define IDXD_OP_FLAG_CRSTS 0x0020 +#define IDXD_OP_FLAG_CR 0x0080 +#define IDXD_OP_FLAG_CC 0x0100 +#define IDXD_OP_FLAG_ADDR1_TCS 0x0200 +#define IDXD_OP_FLAG_ADDR2_TCS 0x0400 +#define IDXD_OP_FLAG_ADDR3_TCS 0x0800 +#define IDXD_OP_FLAG_CR_TCS 0x1000 +#define IDXD_OP_FLAG_STORD 0x2000 +#define IDXD_OP_FLAG_DRDBK 0x4000 +#define IDXD_OP_FLAG_DSTS 0x8000 + +/* Opcode */ +enum dsa_opcode { + DSA_OPCODE_NOOP = 0, + DSA_OPCODE_BATCH, + DSA_OPCODE_DRAIN, + DSA_OPCODE_MEMMOVE, + DSA_OPCODE_MEMFILL, + DSA_OPCODE_COMPARE, + DSA_OPCODE_COMPVAL, + DSA_OPCODE_CR_DELTA, + DSA_OPCODE_AP_DELTA, + DSA_OPCODE_DUALCAST, + DSA_OPCODE_CRCGEN = 0x10, + DSA_OPCODE_COPY_CRC, + DSA_OPCODE_DIF_CHECK, + DSA_OPCODE_DIF_INS, + DSA_OPCODE_DIF_STRP, + DSA_OPCODE_DIF_UPDT, + DSA_OPCODE_CFLUSH = 0x20, +}; + +/* Completion record status */ +enum dsa_completion_status { + DSA_COMP_NONE = 0, + DSA_COMP_SUCCESS, + DSA_COMP_SUCCESS_PRED, + DSA_COMP_PAGE_FAULT_NOBOF, + DSA_COMP_PAGE_FAULT_IR, + DSA_COMP_BATCH_FAIL, + DSA_COMP_BATCH_PAGE_FAULT, + DSA_COMP_DR_OFFSET_NOINC, + DSA_COMP_DR_OFFSET_ERANGE, + DSA_COMP_DIF_ERR, + DSA_COMP_BAD_OPCODE = 0x10, + DSA_COMP_INVALID_FLAGS, + DSA_COMP_NOZERO_RESERVE, + DSA_COMP_XFER_ERANGE, + DSA_COMP_DESC_CNT_ERANGE, + DSA_COMP_DR_ERANGE, + DSA_COMP_OVERLAP_BUFFERS, + DSA_COMP_DCAST_ERR, + DSA_COMP_DESCLIST_ALIGN, + DSA_COMP_INT_HANDLE_INVAL, + DSA_COMP_CRA_XLAT, + DSA_COMP_CRA_ALIGN, + DSA_COMP_ADDR_ALIGN, + DSA_COMP_PRIV_BAD, + DSA_COMP_TRAFFIC_CLASS_CONF, + DSA_COMP_PFAULT_RDBA, + DSA_COMP_HW_ERR1, + DSA_COMP_HW_ERR_DRB, + DSA_COMP_TRANSLATION_FAIL, +}; + +#define DSA_COMP_STATUS_MASK 0x7f +#define DSA_COMP_STATUS_WRITE 0x80 + +struct dsa_batch_desc { + uint32_t pasid:20; + uint32_t rsvd:11; + uint32_t priv:1; + uint32_t flags:24; + uint32_t opcode:8; + uint64_t completion_addr; + uint64_t desc_list_addr; + uint64_t rsvd1; + uint32_t desc_count; + uint16_t interrupt_handle; + uint16_t rsvd2; + uint8_t rsvd3[24]; +} __attribute__((packed)); + +struct dsa_hw_desc { + uint32_t pasid:20; + uint32_t rsvd:11; + uint32_t priv:1; + uint32_t flags:24; + uint32_t opcode:8; + uint64_t completion_addr; + union { + uint64_t src_addr; + uint64_t rdback_addr; + uint64_t pattern; + }; + union { + uint64_t dst_addr; + uint64_t rdback_addr2; + uint64_t src2_addr; + uint64_t comp_pattern; + }; + uint32_t xfer_size; + uint16_t int_handle; + uint16_t rsvd1; + union { + uint8_t expected_res; + struct { + uint64_t delta_addr; + uint32_t max_delta_size; + }; + uint32_t delta_rec_size; + uint64_t dest2; + /* CRC */ + struct { + uint32_t crc_seed; + uint32_t crc_rsvd; + uint64_t seed_addr; + }; + /* DIF check or strip */ + struct { + uint8_t src_dif_flags; + uint8_t dif_chk_res; + uint8_t dif_chk_flags; + uint8_t dif_chk_res2[5]; + uint32_t chk_ref_tag_seed; + uint16_t chk_app_tag_mask; + uint16_t chk_app_tag_seed; + }; + /* DIF insert */ + struct { + uint8_t dif_ins_res; + uint8_t dest_dif_flag; + uint8_t dif_ins_flags; + uint8_t dif_ins_res2[13]; + uint32_t ins_ref_tag_seed; + uint16_t ins_app_tag_mask; + uint16_t ins_app_tag_seed; + }; + /* DIF update */ + struct { + uint8_t src_upd_flags; + uint8_t upd_dest_flags; + uint8_t dif_upd_flags; + uint8_t dif_upd_res[5]; + uint32_t src_ref_tag_seed; + uint16_t 
src_app_tag_mask; + uint16_t src_app_tag_seed; + uint32_t dest_ref_tag_seed; + uint16_t dest_app_tag_mask; + uint16_t dest_app_tag_seed; + }; + + uint8_t op_specific[24]; + }; +} __attribute__((packed)); + +struct dsa_raw_desc { + uint64_t field[8]; +} __attribute__((packed)); + +/* + * The status field will be modified by hardware, therefore it should be + * volatile and prevent the compiler from optimize the read. + */ +struct dsa_completion_record { + volatile uint8_t status; + union { + uint8_t result; + uint8_t dif_status; + }; + uint16_t rsvd; + uint32_t bytes_completed; + uint64_t fault_addr; + union { + uint16_t delta_rec_size; + uint16_t crc_val; + + /* DIF check & strip */ + struct { + uint32_t dif_chk_ref_tag; + uint16_t dif_chk_app_tag_mask; + uint16_t dif_chk_app_tag; + }; + + /* DIF insert */ + struct { + uint64_t dif_ins_res; + uint32_t dif_ins_ref_tag; + uint16_t dif_ins_app_tag_mask; + uint16_t dif_ins_app_tag; + }; + + /* DIF update */ + struct { + uint32_t dif_upd_src_ref_tag; + uint16_t dif_upd_src_app_tag_mask; + uint16_t dif_upd_src_app_tag; + uint32_t dif_upd_dest_ref_tag; + uint16_t dif_upd_dest_app_tag_mask; + uint16_t dif_upd_dest_app_tag; + }; + + uint8_t op_specific[16]; + }; +} __attribute__((packed)); + +struct dsa_raw_completion_record { + uint64_t field[4]; +} __attribute__((packed)); + +#endif -- Gitee From fea89afa11f41a65df7ce478f4abc5c6756b9d16 Mon Sep 17 00:00:00 2001 From: Jing Lin Date: Tue, 21 Jan 2020 16:44:11 -0700 Subject: [PATCH 0514/2161] dmaengine: idxd: add sysfs ABI for idxd driver commit b131ad593884cb3decc4950f807af2faffc05a3c upstream. Add the sysfs ABI information for idxd driver in Documentation/ABI/stable directory. Signed-off-by: Jing Lin Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/157965025170.73301.13428570530450446901.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../ABI/stable/sysfs-driver-dma-idxd | 171 ++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 Documentation/ABI/stable/sysfs-driver-dma-idxd diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd new file mode 100644 index 000000000000..f4be46cc6cb6 --- /dev/null +++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd @@ -0,0 +1,171 @@ +What: sys/bus/dsa/devices/dsa/cdev_major +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The major number that the character device driver assigned to + this device. + +What: sys/bus/dsa/devices/dsa/errors +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The error information for this device. + +What: sys/bus/dsa/devices/dsa/max_batch_size +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The largest number of work descriptors in a batch. + +What: sys/bus/dsa/devices/dsa/max_work_queues_size +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The maximum work queue size supported by this device. + +What: sys/bus/dsa/devices/dsa/max_engines +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The maximum number of engines supported by this device. + +What: sys/bus/dsa/devices/dsa/max_groups +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The maximum number of groups can be created under this device. 
+ +What: sys/bus/dsa/devices/dsa/max_tokens +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The total number of bandwidth tokens supported by this device. + The bandwidth tokens represent resources within the DSA + implementation, and these resources are allocated by engines to + support operations. + +What: sys/bus/dsa/devices/dsa/max_transfer_size +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The number of bytes to be read from the source address to + perform the operation. The maximum transfer size is dependent on + the workqueue the descriptor was submitted to. + +What: sys/bus/dsa/devices/dsa/max_work_queues +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The maximum work queue number that this device supports. + +What: sys/bus/dsa/devices/dsa/numa_node +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The numa node number for this device. + +What: sys/bus/dsa/devices/dsa/op_cap +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The operation capability bit mask specify the operation types + supported by the this device. + +What: sys/bus/dsa/devices/dsa/state +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The state information of this device. It can be either enabled + or disabled. + +What: sys/bus/dsa/devices/dsa/group. +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The assigned group under this device. + +What: sys/bus/dsa/devices/dsa/engine. +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The assigned engine under this device. + +What: sys/bus/dsa/devices/dsa/wq. +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The assigned work queue under this device. + +What: sys/bus/dsa/devices/dsa/configurable +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: To indicate if this device is configurable or not. + +What: sys/bus/dsa/devices/dsa/token_limit +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The maximum number of bandwidth tokens that may be in use at + one time by operations that access low bandwidth memory in the + device. + +What: sys/bus/dsa/devices/wq./group_id +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The group id that this work queue belongs to. + +What: sys/bus/dsa/devices/wq./size +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The work queue size for this work queue. + +What: sys/bus/dsa/devices/wq./type +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The type of this work queue, it can be "kernel" type for work + queue usages in the kernel space or "user" type for work queue + usages by applications in user space. + +What: sys/bus/dsa/devices/wq./cdev_minor +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The minor number assigned to this work queue by the character + device driver. + +What: sys/bus/dsa/devices/wq./mode +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The work queue mode type for this work queue. 
+ +What: sys/bus/dsa/devices/wq./priority +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The priority value of this work queue, it is a vlue relative to + other work queue in the same group to control quality of service + for dispatching work from multiple workqueues in the same group. + +What: sys/bus/dsa/devices/wq./state +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The current state of the work queue. + +What: sys/bus/dsa/devices/wq./threshold +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The number of entries in this work queue that may be filled + via a limited portal. + +What: sys/bus/dsa/devices/engine./group_id +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The group that this engine belongs to. -- Gitee From 1ebc9b1b4f843a45991133129e22c53ab89c3d17 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 21 Jan 2020 16:44:17 -0700 Subject: [PATCH 0515/2161] dmaengine: idxd: add descriptor manipulation routines commit d1dfe5b8ac644a0ffccfe7af22abed7c80b34702 upstream. This commit adds helper functions for DSA descriptor allocation, submission, and free operations. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/157965025757.73301.12692876585357550065.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/Makefile | 2 +- drivers/dma/idxd/idxd.h | 10 +++++ drivers/dma/idxd/submit.c | 91 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 drivers/dma/idxd/submit.c diff --git a/drivers/dma/idxd/Makefile b/drivers/dma/idxd/Makefile index a552560a03dc..50eca12015e2 100644 --- a/drivers/dma/idxd/Makefile +++ b/drivers/dma/idxd/Makefile @@ -1,2 +1,2 @@ obj-$(CONFIG_INTEL_IDXD) += idxd.o -idxd-y := init.o irq.o device.o sysfs.o +idxd-y := init.o irq.o device.o sysfs.o submit.o diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 909926aefd3e..d369b75468e3 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -68,6 +68,11 @@ enum idxd_wq_type { #define WQ_NAME_SIZE 1024 #define WQ_TYPE_SIZE 10 +enum idxd_op_type { + IDXD_OP_BLOCK = 0, + IDXD_OP_NONBLOCK = 1, +}; + struct idxd_wq { void __iomem *dportal; struct device conf_dev; @@ -246,4 +251,9 @@ int idxd_wq_disable(struct idxd_wq *wq); int idxd_wq_map_portal(struct idxd_wq *wq); void idxd_wq_unmap_portal(struct idxd_wq *wq); +/* submission */ +int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc); +struct idxd_desc *idxd_alloc_desc(struct idxd_wq *wq, enum idxd_op_type optype); +void idxd_free_desc(struct idxd_wq *wq, struct idxd_desc *desc); + #endif diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c new file mode 100644 index 000000000000..a405f06990e3 --- /dev/null +++ b/drivers/dma/idxd/submit.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. 
*/ +#include +#include +#include +#include +#include +#include "idxd.h" +#include "registers.h" + +struct idxd_desc *idxd_alloc_desc(struct idxd_wq *wq, enum idxd_op_type optype) +{ + struct idxd_desc *desc; + int idx; + struct idxd_device *idxd = wq->idxd; + + if (idxd->state != IDXD_DEV_ENABLED) + return ERR_PTR(-EIO); + + if (optype == IDXD_OP_BLOCK) + percpu_down_read(&wq->submit_lock); + else if (!percpu_down_read_trylock(&wq->submit_lock)) + return ERR_PTR(-EBUSY); + + if (!atomic_add_unless(&wq->dq_count, 1, wq->size)) { + int rc; + + if (optype == IDXD_OP_NONBLOCK) { + percpu_up_read(&wq->submit_lock); + return ERR_PTR(-EAGAIN); + } + + percpu_up_read(&wq->submit_lock); + percpu_down_write(&wq->submit_lock); + rc = wait_event_interruptible(wq->submit_waitq, + atomic_add_unless(&wq->dq_count, + 1, wq->size) || + idxd->state != IDXD_DEV_ENABLED); + percpu_up_write(&wq->submit_lock); + if (rc < 0) + return ERR_PTR(-EINTR); + if (idxd->state != IDXD_DEV_ENABLED) + return ERR_PTR(-EIO); + } else { + percpu_up_read(&wq->submit_lock); + } + + idx = sbitmap_get(&wq->sbmap, 0, false); + if (idx < 0) { + atomic_dec(&wq->dq_count); + return ERR_PTR(-EAGAIN); + } + + desc = wq->descs[idx]; + memset(desc->hw, 0, sizeof(struct dsa_hw_desc)); + memset(desc->completion, 0, sizeof(struct dsa_completion_record)); + return desc; +} + +void idxd_free_desc(struct idxd_wq *wq, struct idxd_desc *desc) +{ + atomic_dec(&wq->dq_count); + + sbitmap_clear_bit(&wq->sbmap, desc->id); + wake_up(&wq->submit_waitq); +} + +int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) +{ + struct idxd_device *idxd = wq->idxd; + int vec = desc->hw->int_handle; + + if (idxd->state != IDXD_DEV_ENABLED) + return -EIO; + + /* + * The wmb() flushes writes to coherent DMA data before possibly + * triggering a DMA read. The wmb() is necessary even on UP because + * the recipient is a device. + */ + wmb(); + iosubmit_cmds512(wq->dportal, desc->hw, 1); + + /* + * Pending the descriptor to the lockless list for the irq_entry + * that we designated the descriptor to. + */ + llist_add(&desc->llnode, &idxd->irq_entries[vec].pending_llist); + + return 0; +} -- Gitee From a925a432c45a34682a40601a744ae95a0729fd2e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 21 Jan 2020 16:44:23 -0700 Subject: [PATCH 0516/2161] dmaengine: idxd: connect idxd to dmaengine subsystem commit 8f47d1a5e545f903cd049c42da31a3be36178447 upstream. Add plumbing for dmaengine subsystem connection. The driver register a DMA device per DSA device. The channels are dynamically registered when a workqueue is configured to be "kernel:dmanegine" type. 
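
As a rough illustration of the plumbing described above, the sketch below shows how a kernel client might drive a channel exported by this driver purely through the generic dmaengine API (dma_request_chan_by_mask(), dmaengine_prep_dma_memcpy(), dmaengine_submit(), dma_async_issue_pending()). It is not part of the patch itself: the function and callback names are invented for the example, the source and destination addresses are assumed to be DMA-mapped already, and error handling is kept minimal.

/* Illustrative consumer sketch only; not part of this patch series. */
#include <linux/dmaengine.h>
#include <linux/completion.h>
#include <linux/err.h>

/* hypothetical completion callback, invented for this example */
static void example_copy_done(void *arg)
{
	complete((struct completion *)arg);
}

/* dst/src are assumed to be DMA-mapped by the caller */
static int example_offload_memcpy(dma_addr_t dst, dma_addr_t src, size_t len)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct dma_async_tx_descriptor *tx;
	struct dma_chan *chan;
	dma_cap_mask_t mask;
	dma_cookie_t cookie;

	dma_cap_zero(mask);
	dma_cap_set(DMA_MEMCPY, mask);
	/* may resolve to a wq channel exported by this driver, or any memcpy engine */
	chan = dma_request_chan_by_mask(&mask);
	if (IS_ERR(chan))
		return PTR_ERR(chan);

	tx = dmaengine_prep_dma_memcpy(chan, dst, src, len, DMA_PREP_INTERRUPT);
	if (!tx) {
		dma_release_channel(chan);
		return -ENOMEM;
	}
	tx->callback = example_copy_done;
	tx->callback_param = &done;

	cookie = dmaengine_submit(tx);	/* lands in the driver's tx_submit hook */
	if (dma_submit_error(cookie)) {
		dma_release_channel(chan);
		return -EIO;
	}
	dma_async_issue_pending(chan);	/* no extra work for this driver; kept for portability */

	wait_for_completion(&done);
	dma_release_channel(chan);
	return 0;
}

In this flow the callback is eventually invoked from the driver's MSI-X completion path via idxd_dma_complete_txd(), introduced by this patch, once the hardware writes the descriptor's completion record.
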
Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/157965026376.73301.13867988830650740445.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/Makefile | 2 +- drivers/dma/idxd/device.c | 5 + drivers/dma/idxd/dma.c | 217 ++++++++++++++++++++++++++++++++++++++ drivers/dma/idxd/idxd.h | 20 ++++ drivers/dma/idxd/init.c | 30 ++++++ drivers/dma/idxd/irq.c | 87 +++++++++++++++ drivers/dma/idxd/submit.c | 4 +- drivers/dma/idxd/sysfs.c | 28 +++++ 8 files changed, 391 insertions(+), 2 deletions(-) create mode 100644 drivers/dma/idxd/dma.c diff --git a/drivers/dma/idxd/Makefile b/drivers/dma/idxd/Makefile index 50eca12015e2..a036ba0e77d2 100644 --- a/drivers/dma/idxd/Makefile +++ b/drivers/dma/idxd/Makefile @@ -1,2 +1,2 @@ obj-$(CONFIG_INTEL_IDXD) += idxd.o -idxd-y := init.o irq.o device.o sysfs.o submit.o +idxd-y := init.o irq.o device.o sysfs.o submit.o dma.o diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index d626780caa53..b4c4cec489df 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -5,7 +5,9 @@ #include #include #include +#include #include +#include "../dmaengine.h" #include "idxd.h" #include "registers.h" @@ -192,6 +194,9 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) sizeof(struct dsa_completion_record) * i; desc->id = i; desc->wq = wq; + + dma_async_tx_descriptor_init(&desc->txd, &wq->dma_chan); + desc->txd.tx_submit = idxd_dma_tx_submit; } return 0; diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c new file mode 100644 index 000000000000..c64c1429d160 --- /dev/null +++ b/drivers/dma/idxd/dma.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "../dmaengine.h" +#include "registers.h" +#include "idxd.h" + +static inline struct idxd_wq *to_idxd_wq(struct dma_chan *c) +{ + return container_of(c, struct idxd_wq, dma_chan); +} + +void idxd_dma_complete_txd(struct idxd_desc *desc, + enum idxd_complete_type comp_type) +{ + struct dma_async_tx_descriptor *tx; + struct dmaengine_result res; + int complete = 1; + + if (desc->completion->status == DSA_COMP_SUCCESS) + res.result = DMA_TRANS_NOERROR; + else if (desc->completion->status) + res.result = DMA_TRANS_WRITE_FAILED; + else if (comp_type == IDXD_COMPLETE_ABORT) + res.result = DMA_TRANS_ABORTED; + else + complete = 0; + + tx = &desc->txd; + if (complete && tx->cookie) { + dma_cookie_complete(tx); + dma_descriptor_unmap(tx); + dmaengine_desc_get_callback_invoke(tx, &res); + tx->callback = NULL; + tx->callback_result = NULL; + } +} + +static void op_flag_setup(unsigned long flags, u32 *desc_flags) +{ + *desc_flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR; + if (flags & DMA_PREP_INTERRUPT) + *desc_flags |= IDXD_OP_FLAG_RCI; +} + +static inline void set_completion_address(struct idxd_desc *desc, + u64 *compl_addr) +{ + *compl_addr = desc->compl_dma; +} + +static inline void idxd_prep_desc_common(struct idxd_wq *wq, + struct dsa_hw_desc *hw, char opcode, + u64 addr_f1, u64 addr_f2, u64 len, + u64 compl, u32 flags) +{ + struct idxd_device *idxd = wq->idxd; + + hw->flags = flags; + hw->opcode = opcode; + hw->src_addr = addr_f1; + hw->dst_addr = addr_f2; + hw->xfer_size = len; + hw->priv = !!(wq->type == IDXD_WQT_KERNEL); + hw->completion_addr = compl; + + /* + * Descriptor completion vectors are 1-8 for MSIX. We will round + * robin through the 8 vectors. 
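+ * (Illustrative aside, not in the upstream patch: assuming
+ * idxd->num_wq_irqs == 8, the two lines below hand out int_handle
+ * values 1, 2, ..., 8 and then wrap back to 1; vector 0 is skipped
+ * here because it is reserved for the device's misc/error interrupt.)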
+ */ + wq->vec_ptr = (wq->vec_ptr % idxd->num_wq_irqs) + 1; + hw->int_handle = wq->vec_ptr; +} + +static struct dma_async_tx_descriptor * +idxd_dma_submit_memcpy(struct dma_chan *c, dma_addr_t dma_dest, + dma_addr_t dma_src, size_t len, unsigned long flags) +{ + struct idxd_wq *wq = to_idxd_wq(c); + u32 desc_flags; + struct idxd_device *idxd = wq->idxd; + struct idxd_desc *desc; + + if (wq->state != IDXD_WQ_ENABLED) + return NULL; + + if (len > idxd->max_xfer_bytes) + return NULL; + + op_flag_setup(flags, &desc_flags); + desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK); + if (IS_ERR(desc)) + return NULL; + + idxd_prep_desc_common(wq, desc->hw, DSA_OPCODE_MEMMOVE, + dma_src, dma_dest, len, desc->compl_dma, + desc_flags); + + desc->txd.flags = flags; + + return &desc->txd; +} + +static int idxd_dma_alloc_chan_resources(struct dma_chan *chan) +{ + struct idxd_wq *wq = to_idxd_wq(chan); + struct device *dev = &wq->idxd->pdev->dev; + + idxd_wq_get(wq); + dev_dbg(dev, "%s: client_count: %d\n", __func__, + idxd_wq_refcount(wq)); + return 0; +} + +static void idxd_dma_free_chan_resources(struct dma_chan *chan) +{ + struct idxd_wq *wq = to_idxd_wq(chan); + struct device *dev = &wq->idxd->pdev->dev; + + idxd_wq_put(wq); + dev_dbg(dev, "%s: client_count: %d\n", __func__, + idxd_wq_refcount(wq)); +} + +static enum dma_status idxd_dma_tx_status(struct dma_chan *dma_chan, + dma_cookie_t cookie, + struct dma_tx_state *txstate) +{ + return dma_cookie_status(dma_chan, cookie, txstate); +} + +/* + * issue_pending() does not need to do anything since tx_submit() does the job + * already. + */ +static void idxd_dma_issue_pending(struct dma_chan *dma_chan) +{ +} + +dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx) +{ + struct dma_chan *c = tx->chan; + struct idxd_wq *wq = to_idxd_wq(c); + dma_cookie_t cookie; + int rc; + struct idxd_desc *desc = container_of(tx, struct idxd_desc, txd); + + cookie = dma_cookie_assign(tx); + + rc = idxd_submit_desc(wq, desc); + if (rc < 0) { + idxd_free_desc(wq, desc); + return rc; + } + + return cookie; +} + +static void idxd_dma_release(struct dma_device *device) +{ +} + +int idxd_register_dma_device(struct idxd_device *idxd) +{ + struct dma_device *dma = &idxd->dma_dev; + + INIT_LIST_HEAD(&dma->channels); + dma->dev = &idxd->pdev->dev; + + dma->device_release = idxd_dma_release; + + if (idxd->hw.opcap.bits[0] & IDXD_OPCAP_MEMMOVE) { + dma_cap_set(DMA_MEMCPY, dma->cap_mask); + dma->device_prep_dma_memcpy = idxd_dma_submit_memcpy; + } + + dma->device_tx_status = idxd_dma_tx_status; + dma->device_issue_pending = idxd_dma_issue_pending; + dma->device_alloc_chan_resources = idxd_dma_alloc_chan_resources; + dma->device_free_chan_resources = idxd_dma_free_chan_resources; + + return dma_async_device_register(&idxd->dma_dev); +} + +void idxd_unregister_dma_device(struct idxd_device *idxd) +{ + dma_async_device_unregister(&idxd->dma_dev); +} + +int idxd_register_dma_channel(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct dma_device *dma = &idxd->dma_dev; + struct dma_chan *chan = &wq->dma_chan; + int rc; + + memset(&wq->dma_chan, 0, sizeof(struct dma_chan)); + chan->device = dma; + list_add_tail(&chan->device_node, &dma->channels); + rc = dma_async_device_channel_register(dma, chan); + if (rc < 0) + return rc; + + return 0; +} + +void idxd_unregister_dma_channel(struct idxd_wq *wq) +{ + dma_async_device_channel_unregister(&wq->idxd->dma_dev, &wq->dma_chan); +} diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 
d369b75468e3..a36214818d1e 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -4,6 +4,7 @@ #define _IDXD_H_ #include +#include #include #include #include "registers.h" @@ -73,6 +74,11 @@ enum idxd_op_type { IDXD_OP_NONBLOCK = 1, }; +enum idxd_complete_type { + IDXD_COMPLETE_NORMAL = 0, + IDXD_COMPLETE_ABORT, +}; + struct idxd_wq { void __iomem *dportal; struct device conf_dev; @@ -97,6 +103,7 @@ struct idxd_wq { int compls_size; struct idxd_desc **descs; struct sbitmap sbmap; + struct dma_chan dma_chan; struct percpu_rw_semaphore submit_lock; wait_queue_head_t submit_waitq; char name[WQ_NAME_SIZE + 1]; @@ -169,6 +176,8 @@ struct idxd_device { struct msix_entry *msix_entries; int num_wq_irqs; struct idxd_irq_entry *irq_entries; + + struct dma_device dma_dev; }; /* IDXD software descriptor */ @@ -177,6 +186,7 @@ struct idxd_desc { dma_addr_t desc_dma; struct dsa_completion_record *completion; dma_addr_t compl_dma; + struct dma_async_tx_descriptor txd; struct llist_node llnode; struct list_head list; int id; @@ -256,4 +266,14 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc); struct idxd_desc *idxd_alloc_desc(struct idxd_wq *wq, enum idxd_op_type optype); void idxd_free_desc(struct idxd_wq *wq, struct idxd_desc *desc); +/* dmaengine */ +int idxd_register_dma_device(struct idxd_device *idxd); +void idxd_unregister_dma_device(struct idxd_device *idxd); +int idxd_register_dma_channel(struct idxd_wq *wq); +void idxd_unregister_dma_channel(struct idxd_wq *wq); +void idxd_parse_completion_status(u8 status, enum dmaengine_tx_result *res); +void idxd_dma_complete_txd(struct idxd_desc *desc, + enum idxd_complete_type comp_type); +dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx); + #endif diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 229386464923..cf6e1d89dd02 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include "../dmaengine.h" #include "registers.h" #include "idxd.h" @@ -396,6 +398,32 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return 0; } +static void idxd_flush_pending_llist(struct idxd_irq_entry *ie) +{ + struct idxd_desc *desc, *itr; + struct llist_node *head; + + head = llist_del_all(&ie->pending_llist); + if (!head) + return; + + llist_for_each_entry_safe(desc, itr, head, llnode) { + idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT); + idxd_free_desc(desc->wq, desc); + } +} + +static void idxd_flush_work_list(struct idxd_irq_entry *ie) +{ + struct idxd_desc *desc, *iter; + + list_for_each_entry_safe(desc, iter, &ie->work_list, list) { + list_del(&desc->list); + idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT); + idxd_free_desc(desc->wq, desc); + } +} + static void idxd_shutdown(struct pci_dev *pdev) { struct idxd_device *idxd = pci_get_drvdata(pdev); @@ -419,6 +447,8 @@ static void idxd_shutdown(struct pci_dev *pdev) synchronize_irq(idxd->msix_entries[i].vector); if (i == 0) continue; + idxd_flush_pending_llist(irq_entry); + idxd_flush_work_list(irq_entry); } } diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index de4b80973c2f..770d408470db 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -5,7 +5,9 @@ #include #include #include +#include #include +#include "../dmaengine.h" #include "idxd.h" #include "registers.h" @@ -146,11 +148,96 @@ irqreturn_t idxd_misc_thread(int vec, void *data) return IRQ_HANDLED; } +static int irq_process_pending_llist(struct idxd_irq_entry 
*irq_entry, + int *processed) +{ + struct idxd_desc *desc, *t; + struct llist_node *head; + int queued = 0; + + head = llist_del_all(&irq_entry->pending_llist); + if (!head) + return 0; + + llist_for_each_entry_safe(desc, t, head, llnode) { + if (desc->completion->status) { + idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL); + idxd_free_desc(desc->wq, desc); + (*processed)++; + } else { + list_add_tail(&desc->list, &irq_entry->work_list); + queued++; + } + } + + return queued; +} + +static int irq_process_work_list(struct idxd_irq_entry *irq_entry, + int *processed) +{ + struct list_head *node, *next; + int queued = 0; + + if (list_empty(&irq_entry->work_list)) + return 0; + + list_for_each_safe(node, next, &irq_entry->work_list) { + struct idxd_desc *desc = + container_of(node, struct idxd_desc, list); + + if (desc->completion->status) { + list_del(&desc->list); + /* process and callback */ + idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL); + idxd_free_desc(desc->wq, desc); + (*processed)++; + } else { + queued++; + } + } + + return queued; +} + irqreturn_t idxd_wq_thread(int irq, void *data) { struct idxd_irq_entry *irq_entry = data; + int rc, processed = 0, retry = 0; + + /* + * There are two lists we are processing. The pending_llist is where + * submmiter adds all the submitted descriptor after sending it to + * the workqueue. It's a lockless singly linked list. The work_list + * is the common linux double linked list. We are in a scenario of + * multiple producers and a single consumer. The producers are all + * the kernel submitters of descriptors, and the consumer is the + * kernel irq handler thread for the msix vector when using threaded + * irq. To work with the restrictions of llist to remain lockless, + * we are doing the following steps: + * 1. Iterate through the work_list and process any completed + * descriptor. Delete the completed entries during iteration. + * 2. llist_del_all() from the pending list. + * 3. Iterate through the llist that was deleted from the pending list + * and process the completed entries. + * 4. If the entry is still waiting on hardware, list_add_tail() to + * the work_list. + * 5. Repeat until no more descriptors. + */ + do { + rc = irq_process_work_list(irq_entry, &processed); + if (rc != 0) { + retry++; + continue; + } + + rc = irq_process_pending_llist(irq_entry, &processed); + } while (rc != 0 && retry != 10); idxd_unmask_msix_vector(irq_entry->idxd, irq_entry->id); + if (processed == 0) + return IRQ_NONE; + return IRQ_HANDLED; } diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index a405f06990e3..e16cab37dda8 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -85,7 +85,9 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) * Pending the descriptor to the lockless list for the irq_entry * that we designated the descriptor to. */ - llist_add(&desc->llnode, &idxd->irq_entries[vec].pending_llist); + if (desc->hw->flags & IDXD_OP_FLAG_RCI) + llist_add(&desc->llnode, + &idxd->irq_entries[vec].pending_llist); return 0; } diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index b6a0a59b500f..f5e3f962ee6a 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -55,6 +55,14 @@ static inline bool is_idxd_wq_dev(struct device *dev) return dev ? 
dev->type == &idxd_wq_device_type : false; } +static inline bool is_idxd_wq_dmaengine(struct idxd_wq *wq) +{ + if (wq->type == IDXD_WQT_KERNEL && + strcmp(wq->name, "dmaengine") == 0) + return true; + return false; +} + static int idxd_config_bus_match(struct device *dev, struct device_driver *drv) { @@ -122,6 +130,12 @@ static int idxd_config_bus_probe(struct device *dev) spin_unlock_irqrestore(&idxd->dev_lock, flags); dev_info(dev, "Device %s enabled\n", dev_name(dev)); + rc = idxd_register_dma_device(idxd); + if (rc < 0) { + spin_unlock_irqrestore(&idxd->dev_lock, flags); + dev_dbg(dev, "Failed to register dmaengine device\n"); + return rc; + } return 0; } else if (is_idxd_wq_dev(dev)) { struct idxd_wq *wq = confdev_to_wq(dev); @@ -194,6 +208,16 @@ static int idxd_config_bus_probe(struct device *dev) wq->client_count = 0; dev_info(dev, "wq %s enabled\n", dev_name(&wq->conf_dev)); + + if (is_idxd_wq_dmaengine(wq)) { + rc = idxd_register_dma_channel(wq); + if (rc < 0) { + dev_dbg(dev, "DMA channel register failed\n"); + mutex_unlock(&wq->wq_lock); + return rc; + } + } + mutex_unlock(&wq->wq_lock); return 0; } @@ -215,6 +239,9 @@ static void disable_wq(struct idxd_wq *wq) return; } + if (is_idxd_wq_dmaengine(wq)) + idxd_unregister_dma_channel(wq); + if (idxd_wq_refcount(wq)) dev_warn(dev, "Clients has claim on wq %d: %d\n", wq->id, idxd_wq_refcount(wq)); @@ -264,6 +291,7 @@ static int idxd_config_bus_remove(struct device *dev) device_release_driver(&wq->conf_dev); } + idxd_unregister_dma_device(idxd); spin_lock_irqsave(&idxd->dev_lock, flags); rc = idxd_device_disable(idxd); spin_unlock_irqrestore(&idxd->dev_lock, flags); -- Gitee From 33761fe6369b7bcec7cc6b8303df085d68883a81 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 21 Jan 2020 16:44:29 -0700 Subject: [PATCH 0517/2161] dmaengine: idxd: add char driver to expose submission portal to userland commit 42d279f9137ab7d5503836baec2739284b278d8f upstream. Create a char device region that will allow acquisition of user portals in order to allow applications to submit DMA operations. A char device will be created per work queue that gets exposed. The workqueue type "user" is used to mark a work queue for user char device. For example if the workqueue 0 of DSA device 0 is marked for char device, then a device node of /dev/dsa/wq0.0 will be created. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/157965026985.73301.976523230037106742.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/Makefile | 2 +- drivers/dma/idxd/cdev.c | 302 ++++++++++++++++++++++++++++++++++++++ drivers/dma/idxd/device.c | 2 +- drivers/dma/idxd/idxd.h | 37 +++++ drivers/dma/idxd/init.c | 10 ++ drivers/dma/idxd/irq.c | 18 +++ drivers/dma/idxd/submit.c | 4 +- drivers/dma/idxd/sysfs.c | 52 ++++++- 8 files changed, 422 insertions(+), 5 deletions(-) create mode 100644 drivers/dma/idxd/cdev.c diff --git a/drivers/dma/idxd/Makefile b/drivers/dma/idxd/Makefile index a036ba0e77d2..8978b898d777 100644 --- a/drivers/dma/idxd/Makefile +++ b/drivers/dma/idxd/Makefile @@ -1,2 +1,2 @@ obj-$(CONFIG_INTEL_IDXD) += idxd.o -idxd-y := init.o irq.o device.o sysfs.o submit.o dma.o +idxd-y := init.o irq.o device.o sysfs.o submit.o dma.o cdev.o diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c new file mode 100644 index 000000000000..1d7347825b95 --- /dev/null +++ b/drivers/dma/idxd/cdev.c @@ -0,0 +1,302 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. 
All rights rsvd. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "registers.h" +#include "idxd.h" + +struct idxd_cdev_context { + const char *name; + dev_t devt; + struct ida minor_ida; +}; + +/* + * ictx is an array based off of accelerator types. enum idxd_type + * is used as index + */ +static struct idxd_cdev_context ictx[IDXD_TYPE_MAX] = { + { .name = "dsa" }, +}; + +struct idxd_user_context { + struct idxd_wq *wq; + struct task_struct *task; + unsigned int flags; +}; + +enum idxd_cdev_cleanup { + CDEV_NORMAL = 0, + CDEV_FAILED, +}; + +static void idxd_cdev_dev_release(struct device *dev) +{ + dev_dbg(dev, "releasing cdev device\n"); + kfree(dev); +} + +static struct device_type idxd_cdev_device_type = { + .name = "idxd_cdev", + .release = idxd_cdev_dev_release, +}; + +static inline struct idxd_cdev *inode_idxd_cdev(struct inode *inode) +{ + struct cdev *cdev = inode->i_cdev; + + return container_of(cdev, struct idxd_cdev, cdev); +} + +static inline struct idxd_wq *idxd_cdev_wq(struct idxd_cdev *idxd_cdev) +{ + return container_of(idxd_cdev, struct idxd_wq, idxd_cdev); +} + +static inline struct idxd_wq *inode_wq(struct inode *inode) +{ + return idxd_cdev_wq(inode_idxd_cdev(inode)); +} + +static int idxd_cdev_open(struct inode *inode, struct file *filp) +{ + struct idxd_user_context *ctx; + struct idxd_device *idxd; + struct idxd_wq *wq; + struct device *dev; + struct idxd_cdev *idxd_cdev; + + wq = inode_wq(inode); + idxd = wq->idxd; + dev = &idxd->pdev->dev; + idxd_cdev = &wq->idxd_cdev; + + dev_dbg(dev, "%s called\n", __func__); + + if (idxd_wq_refcount(wq) > 1 && wq_dedicated(wq)) + return -EBUSY; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->wq = wq; + filp->private_data = ctx; + idxd_wq_get(wq); + return 0; +} + +static int idxd_cdev_release(struct inode *node, struct file *filep) +{ + struct idxd_user_context *ctx = filep->private_data; + struct idxd_wq *wq = ctx->wq; + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + + dev_dbg(dev, "%s called\n", __func__); + filep->private_data = NULL; + + kfree(ctx); + idxd_wq_put(wq); + return 0; +} + +static int check_vma(struct idxd_wq *wq, struct vm_area_struct *vma, + const char *func) +{ + struct device *dev = &wq->idxd->pdev->dev; + + if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) { + dev_info_ratelimited(dev, + "%s: %s: mapping too large: %lu\n", + current->comm, func, + vma->vm_end - vma->vm_start); + return -EINVAL; + } + + return 0; +} + +static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct idxd_user_context *ctx = filp->private_data; + struct idxd_wq *wq = ctx->wq; + struct idxd_device *idxd = wq->idxd; + struct pci_dev *pdev = idxd->pdev; + phys_addr_t base = pci_resource_start(pdev, IDXD_WQ_BAR); + unsigned long pfn; + int rc; + + dev_dbg(&pdev->dev, "%s called\n", __func__); + rc = check_vma(wq, vma, __func__); + + vma->vm_flags |= VM_DONTCOPY; + pfn = (base + idxd_get_wq_portal_full_offset(wq->id, + IDXD_PORTAL_LIMITED)) >> PAGE_SHIFT; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_private_data = ctx; + + return io_remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, + vma->vm_page_prot); +} + +static __poll_t idxd_cdev_poll(struct file *filp, + struct poll_table_struct *wait) +{ + struct idxd_user_context *ctx = filp->private_data; + struct idxd_wq *wq = ctx->wq; + struct idxd_device *idxd = wq->idxd; + struct idxd_cdev 
*idxd_cdev = &wq->idxd_cdev; + unsigned long flags; + __poll_t out = 0; + + poll_wait(filp, &idxd_cdev->err_queue, wait); + spin_lock_irqsave(&idxd->dev_lock, flags); + if (idxd->sw_err.valid) + out = EPOLLIN | EPOLLRDNORM; + spin_unlock_irqrestore(&idxd->dev_lock, flags); + + return out; +} + +static const struct file_operations idxd_cdev_fops = { + .owner = THIS_MODULE, + .open = idxd_cdev_open, + .release = idxd_cdev_release, + .mmap = idxd_cdev_mmap, + .poll = idxd_cdev_poll, +}; + +int idxd_cdev_get_major(struct idxd_device *idxd) +{ + return MAJOR(ictx[idxd->type].devt); +} + +static int idxd_wq_cdev_dev_setup(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct idxd_cdev *idxd_cdev = &wq->idxd_cdev; + struct idxd_cdev_context *cdev_ctx; + struct device *dev; + int minor, rc; + + idxd_cdev->dev = kzalloc(sizeof(*idxd_cdev->dev), GFP_KERNEL); + if (!idxd_cdev->dev) + return -ENOMEM; + + dev = idxd_cdev->dev; + dev->parent = &idxd->pdev->dev; + dev_set_name(dev, "%s/wq%u.%u", idxd_get_dev_name(idxd), + idxd->id, wq->id); + dev->bus = idxd_get_bus_type(idxd); + + cdev_ctx = &ictx[wq->idxd->type]; + minor = ida_simple_get(&cdev_ctx->minor_ida, 0, MINORMASK, GFP_KERNEL); + if (minor < 0) { + rc = minor; + goto ida_err; + } + + dev->devt = MKDEV(MAJOR(cdev_ctx->devt), minor); + dev->type = &idxd_cdev_device_type; + rc = device_register(dev); + if (rc < 0) { + dev_err(&idxd->pdev->dev, "device register failed\n"); + put_device(dev); + goto dev_reg_err; + } + idxd_cdev->minor = minor; + + return 0; + + dev_reg_err: + ida_simple_remove(&cdev_ctx->minor_ida, MINOR(dev->devt)); + ida_err: + kfree(dev); + idxd_cdev->dev = NULL; + return rc; +} + +static void idxd_wq_cdev_cleanup(struct idxd_wq *wq, + enum idxd_cdev_cleanup cdev_state) +{ + struct idxd_cdev *idxd_cdev = &wq->idxd_cdev; + struct idxd_cdev_context *cdev_ctx; + + cdev_ctx = &ictx[wq->idxd->type]; + if (cdev_state == CDEV_NORMAL) + cdev_del(&idxd_cdev->cdev); + device_unregister(idxd_cdev->dev); + /* + * The device_type->release() will be called on the device and free + * the allocated struct device. We can just forget it. 
+ */ + ida_simple_remove(&cdev_ctx->minor_ida, idxd_cdev->minor); + idxd_cdev->dev = NULL; + idxd_cdev->minor = -1; +} + +int idxd_wq_add_cdev(struct idxd_wq *wq) +{ + struct idxd_cdev *idxd_cdev = &wq->idxd_cdev; + struct cdev *cdev = &idxd_cdev->cdev; + struct device *dev; + int rc; + + rc = idxd_wq_cdev_dev_setup(wq); + if (rc < 0) + return rc; + + dev = idxd_cdev->dev; + cdev_init(cdev, &idxd_cdev_fops); + cdev_set_parent(cdev, &dev->kobj); + rc = cdev_add(cdev, dev->devt, 1); + if (rc) { + dev_dbg(&wq->idxd->pdev->dev, "cdev_add failed: %d\n", rc); + idxd_wq_cdev_cleanup(wq, CDEV_FAILED); + return rc; + } + + init_waitqueue_head(&idxd_cdev->err_queue); + return 0; +} + +void idxd_wq_del_cdev(struct idxd_wq *wq) +{ + idxd_wq_cdev_cleanup(wq, CDEV_NORMAL); +} + +int idxd_cdev_register(void) +{ + int rc, i; + + for (i = 0; i < IDXD_TYPE_MAX; i++) { + ida_init(&ictx[i].minor_ida); + rc = alloc_chrdev_region(&ictx[i].devt, 0, MINORMASK, + ictx[i].name); + if (rc) + return rc; + } + + return 0; +} + +void idxd_cdev_remove(void) +{ + int i; + + for (i = 0; i < IDXD_TYPE_MAX; i++) { + unregister_chrdev_region(ictx[i].devt, MINORMASK); + ida_destroy(&ictx[i].minor_ida); + } +} diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index b4c4cec489df..ada69e722f84 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -539,7 +539,7 @@ static int idxd_wq_config_write(struct idxd_wq *wq) wq->wqcfg.wq_thresh = wq->threshold; /* byte 8-11 */ - wq->wqcfg.priv = 1; /* kernel, therefore priv */ + wq->wqcfg.priv = !!(wq->type == IDXD_WQT_KERNEL); wq->wqcfg.mode = 1; wq->wqcfg.priority = wq->priority; diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index a36214818d1e..b8f8a363b4a7 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "registers.h" #define IDXD_DRIVER_VERSION "1.00" @@ -63,6 +64,14 @@ enum idxd_wq_flag { enum idxd_wq_type { IDXD_WQT_NONE = 0, IDXD_WQT_KERNEL, + IDXD_WQT_USER, +}; + +struct idxd_cdev { + struct cdev cdev; + struct device *dev; + int minor; + struct wait_queue_head err_queue; }; #define IDXD_ALLOCATED_BATCH_SIZE 128U @@ -82,6 +91,7 @@ enum idxd_complete_type { struct idxd_wq { void __iomem *dportal; struct device conf_dev; + struct idxd_cdev idxd_cdev; struct idxd_device *idxd; int id; enum idxd_wq_type type; @@ -145,6 +155,7 @@ struct idxd_device { enum idxd_device_state state; unsigned long flags; int id; + int major; struct pci_dev *pdev; void __iomem *reg_base; @@ -196,11 +207,29 @@ struct idxd_desc { #define confdev_to_idxd(dev) container_of(dev, struct idxd_device, conf_dev) #define confdev_to_wq(dev) container_of(dev, struct idxd_wq, conf_dev) +extern struct bus_type dsa_bus_type; + static inline bool wq_dedicated(struct idxd_wq *wq) { return test_bit(WQ_FLAG_DEDICATED, &wq->flags); } +enum idxd_portal_prot { + IDXD_PORTAL_UNLIMITED = 0, + IDXD_PORTAL_LIMITED, +}; + +static inline int idxd_get_wq_portal_offset(enum idxd_portal_prot prot) +{ + return prot * 0x1000; +} + +static inline int idxd_get_wq_portal_full_offset(int wq_id, + enum idxd_portal_prot prot) +{ + return ((wq_id * 4) << PAGE_SHIFT) + idxd_get_wq_portal_offset(prot); +} + static inline void idxd_set_type(struct idxd_device *idxd) { struct pci_dev *pdev = idxd->pdev; @@ -233,6 +262,7 @@ int idxd_setup_sysfs(struct idxd_device *idxd); void idxd_cleanup_sysfs(struct idxd_device *idxd); int idxd_register_driver(void); void idxd_unregister_driver(void); +struct bus_type 
*idxd_get_bus_type(struct idxd_device *idxd); /* device interrupt control */ irqreturn_t idxd_irq_handler(int vec, void *data); @@ -276,4 +306,11 @@ void idxd_dma_complete_txd(struct idxd_desc *desc, enum idxd_complete_type comp_type); dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx); +/* cdev */ +int idxd_cdev_register(void); +void idxd_cdev_remove(void); +int idxd_cdev_get_major(struct idxd_device *idxd); +int idxd_wq_add_cdev(struct idxd_wq *wq); +void idxd_wq_del_cdev(struct idxd_wq *wq); + #endif diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index cf6e1d89dd02..7778c05deb5d 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -188,6 +188,7 @@ static int idxd_setup_internals(struct idxd_device *idxd) mutex_init(&wq->wq_lock); atomic_set(&wq->dq_count, 0); init_waitqueue_head(&wq->submit_waitq); + wq->idxd_cdev.minor = -1; rc = percpu_init_rwsem(&wq->submit_lock); if (rc < 0) { idxd_wqs_free_lock(idxd); @@ -321,6 +322,8 @@ static int idxd_probe(struct idxd_device *idxd) goto err_idr_fail; } + idxd->major = idxd_cdev_get_major(idxd); + dev_dbg(dev, "IDXD device %d probed successfully\n", idxd->id); return 0; @@ -501,6 +504,10 @@ static int __init idxd_init_module(void) if (err < 0) goto err_idxd_driver_register; + err = idxd_cdev_register(); + if (err) + goto err_cdev_register; + err = pci_register_driver(&idxd_pci_driver); if (err) goto err_pci_register; @@ -508,6 +515,8 @@ static int __init idxd_init_module(void) return 0; err_pci_register: + idxd_cdev_remove(); +err_cdev_register: idxd_unregister_driver(); err_idxd_driver_register: idxd_unregister_bus_type(); @@ -518,6 +527,7 @@ module_init(idxd_init_module); static void __exit idxd_exit_module(void) { pci_unregister_driver(&idxd_pci_driver); + idxd_cdev_remove(); idxd_unregister_bus_type(); } module_exit(idxd_exit_module); diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 770d408470db..d6fcd2e60103 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -89,6 +89,24 @@ irqreturn_t idxd_misc_thread(int vec, void *data) idxd->sw_err.bits[i] = ioread64(idxd->reg_base + IDXD_SWERR_OFFSET + i * sizeof(u64)); iowrite64(IDXD_SWERR_ACK, idxd->reg_base + IDXD_SWERR_OFFSET); + + if (idxd->sw_err.valid && idxd->sw_err.wq_idx_valid) { + int id = idxd->sw_err.wq_idx; + struct idxd_wq *wq = &idxd->wqs[id]; + + if (wq->type == IDXD_WQT_USER) + wake_up_interruptible(&wq->idxd_cdev.err_queue); + } else { + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = &idxd->wqs[i]; + + if (wq->type == IDXD_WQT_USER) + wake_up_interruptible(&wq->idxd_cdev.err_queue); + } + } + spin_unlock_bh(&idxd->dev_lock); val |= IDXD_INTC_ERR; diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index e16cab37dda8..45a0c5869a0a 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -69,17 +69,19 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) { struct idxd_device *idxd = wq->idxd; int vec = desc->hw->int_handle; + void __iomem *portal; if (idxd->state != IDXD_DEV_ENABLED) return -EIO; + portal = wq->dportal + idxd_get_wq_portal_offset(IDXD_PORTAL_UNLIMITED); /* * The wmb() flushes writes to coherent DMA data before possibly * triggering a DMA read. The wmb() is necessary even on UP because * the recipient is a device. 
*/ wmb(); - iosubmit_cmds512(wq->dportal, desc->hw, 1); + iosubmit_cmds512(portal, desc->hw, 1); /* * Pending the descriptor to the lockless list for the irq_entry diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index f5e3f962ee6a..849c50ab939a 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -13,6 +13,7 @@ static char *idxd_wq_type_names[] = { [IDXD_WQT_NONE] = "none", [IDXD_WQT_KERNEL] = "kernel", + [IDXD_WQT_USER] = "user", }; static void idxd_conf_device_release(struct device *dev) @@ -63,6 +64,11 @@ static inline bool is_idxd_wq_dmaengine(struct idxd_wq *wq) return false; } +static inline bool is_idxd_wq_cdev(struct idxd_wq *wq) +{ + return wq->type == IDXD_WQT_USER ? true : false; +} + static int idxd_config_bus_match(struct device *dev, struct device_driver *drv) { @@ -109,6 +115,9 @@ static int idxd_config_bus_probe(struct device *dev) return -EBUSY; } + if (!try_module_get(THIS_MODULE)) + return -ENXIO; + spin_lock_irqsave(&idxd->dev_lock, flags); /* Perform IDXD configuration and enabling */ @@ -216,6 +225,13 @@ static int idxd_config_bus_probe(struct device *dev) mutex_unlock(&wq->wq_lock); return rc; } + } else if (is_idxd_wq_cdev(wq)) { + rc = idxd_wq_add_cdev(wq); + if (rc < 0) { + dev_dbg(dev, "Cdev creation failed\n"); + mutex_unlock(&wq->wq_lock); + return rc; + } } mutex_unlock(&wq->wq_lock); @@ -241,6 +257,8 @@ static void disable_wq(struct idxd_wq *wq) if (is_idxd_wq_dmaengine(wq)) idxd_unregister_dma_channel(wq); + else if (is_idxd_wq_cdev(wq)) + idxd_wq_del_cdev(wq); if (idxd_wq_refcount(wq)) dev_warn(dev, "Clients has claim on wq %d: %d\n", @@ -295,10 +313,12 @@ static int idxd_config_bus_remove(struct device *dev) spin_lock_irqsave(&idxd->dev_lock, flags); rc = idxd_device_disable(idxd); spin_unlock_irqrestore(&idxd->dev_lock, flags); + module_put(THIS_MODULE); if (rc < 0) dev_warn(dev, "Device disable failed\n"); else dev_info(dev, "Device %s disabled\n", dev_name(dev)); + } return 0; @@ -309,7 +329,7 @@ static void idxd_config_bus_shutdown(struct device *dev) dev_dbg(dev, "%s called\n", __func__); } -static struct bus_type dsa_bus_type = { +struct bus_type dsa_bus_type = { .name = "dsa", .match = idxd_config_bus_match, .probe = idxd_config_bus_probe, @@ -334,7 +354,7 @@ static struct idxd_device_driver *idxd_drvs[] = { &dsa_drv }; -static struct bus_type *idxd_get_bus_type(struct idxd_device *idxd) +struct bus_type *idxd_get_bus_type(struct idxd_device *idxd) { return idxd_bus_types[idxd->type]; } @@ -956,6 +976,9 @@ static ssize_t wq_type_show(struct device *dev, case IDXD_WQT_KERNEL: return sprintf(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_KERNEL]); + case IDXD_WQT_USER: + return sprintf(buf, "%s\n", + idxd_wq_type_names[IDXD_WQT_USER]); case IDXD_WQT_NONE: default: return sprintf(buf, "%s\n", @@ -978,6 +1001,8 @@ static ssize_t wq_type_store(struct device *dev, old_type = wq->type; if (sysfs_streq(buf, idxd_wq_type_names[IDXD_WQT_KERNEL])) wq->type = IDXD_WQT_KERNEL; + else if (sysfs_streq(buf, idxd_wq_type_names[IDXD_WQT_USER])) + wq->type = IDXD_WQT_USER; else wq->type = IDXD_WQT_NONE; @@ -1020,6 +1045,17 @@ static ssize_t wq_name_store(struct device *dev, static struct device_attribute dev_attr_wq_name = __ATTR(name, 0644, wq_name_show, wq_name_store); +static ssize_t wq_cdev_minor_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + return sprintf(buf, "%d\n", wq->idxd_cdev.minor); +} + +static struct device_attribute 
dev_attr_wq_cdev_minor = + __ATTR(cdev_minor, 0444, wq_cdev_minor_show, NULL); + static struct attribute *idxd_wq_attributes[] = { &dev_attr_wq_clients.attr, &dev_attr_wq_state.attr, @@ -1029,6 +1065,7 @@ static struct attribute *idxd_wq_attributes[] = { &dev_attr_wq_priority.attr, &dev_attr_wq_type.attr, &dev_attr_wq_name.attr, + &dev_attr_wq_cdev_minor.attr, NULL, }; @@ -1242,6 +1279,16 @@ static ssize_t token_limit_store(struct device *dev, } static DEVICE_ATTR_RW(token_limit); +static ssize_t cdev_major_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + + return sprintf(buf, "%u\n", idxd->major); +} +static DEVICE_ATTR_RO(cdev_major); + static struct attribute *idxd_device_attributes[] = { &dev_attr_max_groups.attr, &dev_attr_max_work_queues.attr, @@ -1257,6 +1304,7 @@ static struct attribute *idxd_device_attributes[] = { &dev_attr_errors.attr, &dev_attr_max_tokens.attr, &dev_attr_token_limit.attr, + &dev_attr_cdev_major.attr, NULL, }; -- Gitee From 36f3bcc158842dd84ae0b7daeaf9b76bfac47e70 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 17 Jan 2020 16:30:56 +0100 Subject: [PATCH 0518/2161] dmaengine: Create symlinks between DMA channels and slaves MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 71723a96b8b1367fefc18f60025dae792477d602 upstream. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently it is not easy to find out which DMA channels are in use, and which slave devices are using which channels. Fix this by creating two symlinks between the DMA channel and the actual slave device when a channel is requested: 1. A "slave" symlink from DMA channel to slave device, 2. A "dma:" symlink slave device to DMA channel. When the channel is released, the symlinks are removed again. The latter requires keeping track of the slave device and the channel name in the dma_chan structure. Note that this is limited to channel request functions for requesting an exclusive slave channel that take a device pointer (dma_request_chan() and dma_request_slave_channel*()). 
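For illustration only, a minimal consumer-side sketch (hypothetical platform driver and channel name, not part of this patch) of the request path that creates the two symlinks described above. With this change, a successful dma_request_chan() leaves something like /sys/class/dma/dma1chan0/slave pointing at the consumer device and a "dma:tx" link in the consumer's sysfs directory, and both are removed again by dma_release_channel():

#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/platform_device.h>

static int example_probe(struct platform_device *pdev)
{
	struct dma_chan *chan;

	/* Request an exclusive slave channel named "tx" for this device. */
	chan = dma_request_chan(&pdev->dev, "tx");
	if (IS_ERR(chan))
		return PTR_ERR(chan);	/* may legitimately be -EPROBE_DEFER */

	/* ... configure and use the channel ... */

	dma_release_channel(chan);	/* also drops both symlinks */
	return 0;
}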
Signed-off-by: Geert Uytterhoeven Tested-by: Niklas Söderlund Link: https://lore.kernel.org/r/20200117153056.31363-1-geert+renesas@glider.be Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 37 +++++++++++++++++++++++++++++++------ include/linux/dmaengine.h | 4 ++++ 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 51a2f2b1b2de..f3ef4edd4de1 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -60,6 +60,8 @@ static long dmaengine_ref_count; /* --- sysfs implementation --- */ +#define DMA_SLAVE_NAME "slave" + /** * dev_to_dma_chan - convert a device pointer to its sysfs container object * @dev - device node @@ -730,11 +732,11 @@ struct dma_chan *dma_request_chan(struct device *dev, const char *name) if (has_acpi_companion(dev) && !chan) chan = acpi_dma_request_slave_chan_by_name(dev, name); - if (chan) { - /* Valid channel found or requester needs to be deferred */ - if (!IS_ERR(chan) || PTR_ERR(chan) == -EPROBE_DEFER) - return chan; - } + if (PTR_ERR(chan) == -EPROBE_DEFER) + return chan; + + if (!IS_ERR_OR_NULL(chan)) + goto found; /* Try to find the channel via the DMA filter map(s) */ mutex_lock(&dma_list_mutex); @@ -754,7 +756,23 @@ struct dma_chan *dma_request_chan(struct device *dev, const char *name) } mutex_unlock(&dma_list_mutex); - return chan ? chan : ERR_PTR(-EPROBE_DEFER); + if (!IS_ERR_OR_NULL(chan)) + goto found; + + return ERR_PTR(-EPROBE_DEFER); + +found: + chan->slave = dev; + chan->name = kasprintf(GFP_KERNEL, "dma:%s", name); + if (!chan->name) + return ERR_PTR(-ENOMEM); + + if (sysfs_create_link(&chan->dev->device.kobj, &dev->kobj, + DMA_SLAVE_NAME)) + dev_err(dev, "Cannot create DMA %s symlink\n", DMA_SLAVE_NAME); + if (sysfs_create_link(&dev->kobj, &chan->dev->device.kobj, chan->name)) + dev_err(dev, "Cannot create DMA %s symlink\n", chan->name); + return chan; } EXPORT_SYMBOL_GPL(dma_request_chan); @@ -812,6 +830,13 @@ void dma_release_channel(struct dma_chan *chan) /* drop PRIVATE cap enabled by __dma_request_channel() */ if (--chan->device->privatecnt == 0) dma_cap_clear(DMA_PRIVATE, chan->device->cap_mask); + if (chan->slave) { + sysfs_remove_link(&chan->slave->kobj, chan->name); + kfree(chan->name); + chan->name = NULL; + chan->slave = NULL; + } + sysfs_remove_link(&chan->dev->device.kobj, DMA_SLAVE_NAME); mutex_unlock(&dma_list_mutex); } EXPORT_SYMBOL_GPL(dma_release_channel); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 4ad415905e12..880fab836dee 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -294,10 +294,12 @@ struct dma_router { /** * struct dma_chan - devices supply DMA channels, clients use them * @device: ptr to the dma device who supplies this channel, always !%NULL + * @slave: ptr to the device using this channel * @cookie: last cookie value returned to client * @completed_cookie: last completed cookie for this channel * @chan_id: channel ID for sysfs * @dev: class device for sysfs + * @name: backlink name for sysfs * @device_node: used to add this to the device chan list * @local: per-cpu pointer to a struct dma_chan_percpu * @client_count: how many clients are using this channel @@ -308,12 +310,14 @@ struct dma_router { */ struct dma_chan { struct dma_device *device; + struct device *slave; dma_cookie_t cookie; dma_cookie_t completed_cookie; /* sysfs */ int chan_id; struct dma_chan_dev *dev; + const char *name; struct list_head device_node; struct 
dma_chan_percpu __percpu *local; -- Gitee From 154d1e3fba78891f0a6645e59de912fa5de90698 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Thu, 30 Jan 2020 08:08:34 +0100 Subject: [PATCH 0519/2161] dmaengine: Fix return value for dma_request_chan() in case of failure commit 474809a28e7b0671a5090de6e0db91f0e3821360 upstream. Commit 71723a96b8b1 ("dmaengine: Create symlinks between DMA channels and slaves") changed the dma_request_chan() function flow in such a way that it always returns EPROBE_DEFER in case of channels that cannot be found. This break the operation of the devices which have optional DMA channels as it puts their drivers in endless deferred probe loop. Fix this by propagating the proper error value. Fixes: 71723a96b8b1 ("dmaengine: Create symlinks between DMA channels and slaves") Signed-off-by: Marek Szyprowski Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20200130070834.17537-1-m.szyprowski@samsung.com [vkoul: fix typo in patch title] Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index f3ef4edd4de1..7b1cefc3213a 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -759,7 +759,7 @@ struct dma_chan *dma_request_chan(struct device *dev, const char *name) if (!IS_ERR_OR_NULL(chan)) goto found; - return ERR_PTR(-EPROBE_DEFER); + return chan ? chan : ERR_PTR(-EPROBE_DEFER); found: chan->slave = dev; -- Gitee From 4c1df0ba329aa548bb8fec82cb51214ea3501815 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Thu, 30 Jan 2020 15:44:49 +0100 Subject: [PATCH 0520/2161] dmaengine: idxd: fix boolconv.cocci warnings commit a9113a90f5f0ff476a4ac2bf15861fa4358a3643 upstream. Remove unneeded conversion to bool Generated by: scripts/coccinelle/misc/boolconv.cocci CC: Dave Jiang Signed-off-by: kbuild test robot Signed-off-by: Julia Lawall Acked-by: Dave Jiang Link: https://lore.kernel.org/r/alpine.DEB.2.21.2001301543150.7476@hadrien Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 849c50ab939a..6d907fe150aa 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -66,7 +66,7 @@ static inline bool is_idxd_wq_dmaengine(struct idxd_wq *wq) static inline bool is_idxd_wq_cdev(struct idxd_wq *wq) { - return wq->type == IDXD_WQT_USER ? true : false; + return wq->type == IDXD_WQT_USER; } static int idxd_config_bus_match(struct device *dev, -- Gitee From c2da13d2d3d001e23781c78abd2ae99764db3145 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 31 Jan 2020 10:58:39 -0700 Subject: [PATCH 0521/2161] dmaengine: fix null ptr check for __dma_async_device_channel_register() commit 5429b51f606cb82f315f68678b959112766f235e upstream. Add check to pointer after assignment before accessing members. 
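The rule being enforced, shown as a generic sketch (hypothetical names, unrelated to the dmaengine core code): list_first_entry_or_null() returns NULL on an empty list, so the result must be tested before any member is accessed:

#include <linux/errno.h>
#include <linux/list.h>

struct item {
	struct list_head node;
	int id;
};

static int first_item_id(struct list_head *head)
{
	struct item *it = list_first_entry_or_null(head, struct item, node);

	if (!it)	/* empty list: report it instead of dereferencing NULL */
		return -ENODEV;

	return it->id;
}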
Fixes: d2fb0a043838: ("dmaengine: break out channel registration") Reported-by: Dan Carpenter Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/158049351973.45445.3291586905226032744.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 7b1cefc3213a..3432dac695e5 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -962,6 +962,9 @@ static int __dma_async_device_channel_register(struct dma_device *device, tchan = list_first_entry_or_null(&device->channels, struct dma_chan, device_node); + if (!tchan) + return -ENODEV; + if (tchan->dev) { idr_ref = tchan->dev->idr_ref; } else { -- Gitee From c79604d9051816c529ca0d9561cd1e3ce9eee996 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Fri, 31 Jan 2020 11:38:58 +0200 Subject: [PATCH 0522/2161] dmaengine: Cleanups for the slave <-> channel symlink support commit bad83565eafe8a00922ad4eed6920625a10a2126 upstream. No need to use goto to jump over the return chan ? chan : ERR_PTR(-EPROBE_DEFER); We can just revert the check and return right there. Do not fail the channel request if the chan->name allocation fails, but print a warning about it. Change the dev_err to dev_warn if sysfs_create_link() fails as it is not fatal. Only attempt to remove the DMA_SLAVE_NAME symlink if it is created - or it was attempted to be created. Signed-off-by: Peter Ujfalusi Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20200131093859.3311-2-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 3432dac695e5..c3b1283b6d31 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -756,22 +756,21 @@ struct dma_chan *dma_request_chan(struct device *dev, const char *name) } mutex_unlock(&dma_list_mutex); - if (!IS_ERR_OR_NULL(chan)) - goto found; - - return chan ? chan : ERR_PTR(-EPROBE_DEFER); + if (IS_ERR_OR_NULL(chan)) + return chan ? 
chan : ERR_PTR(-EPROBE_DEFER); found: - chan->slave = dev; chan->name = kasprintf(GFP_KERNEL, "dma:%s", name); if (!chan->name) - return ERR_PTR(-ENOMEM); + return chan; + chan->slave = dev; if (sysfs_create_link(&chan->dev->device.kobj, &dev->kobj, DMA_SLAVE_NAME)) - dev_err(dev, "Cannot create DMA %s symlink\n", DMA_SLAVE_NAME); + dev_warn(dev, "Cannot create DMA %s symlink\n", DMA_SLAVE_NAME); if (sysfs_create_link(&dev->kobj, &chan->dev->device.kobj, chan->name)) - dev_err(dev, "Cannot create DMA %s symlink\n", chan->name); + dev_warn(dev, "Cannot create DMA %s symlink\n", chan->name); + return chan; } EXPORT_SYMBOL_GPL(dma_request_chan); @@ -830,13 +829,14 @@ void dma_release_channel(struct dma_chan *chan) /* drop PRIVATE cap enabled by __dma_request_channel() */ if (--chan->device->privatecnt == 0) dma_cap_clear(DMA_PRIVATE, chan->device->cap_mask); + if (chan->slave) { + sysfs_remove_link(&chan->dev->device.kobj, DMA_SLAVE_NAME); sysfs_remove_link(&chan->slave->kobj, chan->name); kfree(chan->name); chan->name = NULL; chan->slave = NULL; } - sysfs_remove_link(&chan->dev->device.kobj, DMA_SLAVE_NAME); mutex_unlock(&dma_list_mutex); } EXPORT_SYMBOL_GPL(dma_release_channel); -- Gitee From 808ed9a97f69d02a45e4945096162740e59b9b23 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 11 Feb 2020 21:53:35 +0800 Subject: [PATCH 0523/2161] dmaengine: idxd: remove set but not used variable 'group' commit f7b280c6388137185fab4408da18d989ba36c721 upstream. drivers/dma/idxd/sysfs.c: In function engine_group_id_store: drivers/dma/idxd/sysfs.c:419:29: warning: variable group set but not used [-Wunused-but-set-variable] It is not used, so remove it. Reported-by: Hulk Robot Signed-off-by: YueHaibing Acked-by: Dave Jiang Link: https://lore.kernel.org/r/20200211135335.55924-1-yuehaibing@huawei.com Signed-Off-By: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 6d907fe150aa..e4f35bdf252e 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -416,7 +416,7 @@ static ssize_t engine_group_id_store(struct device *dev, struct idxd_device *idxd = engine->idxd; long id; int rc; - struct idxd_group *prevg, *group; + struct idxd_group *prevg; rc = kstrtol(buf, 10, &id); if (rc < 0) @@ -436,7 +436,6 @@ static ssize_t engine_group_id_store(struct device *dev, return count; } - group = &idxd->groups[id]; prevg = engine->group; if (prevg) -- Gitee From 4b74b463d9ecf97d45723f67ed54230906a46d40 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 10 Feb 2020 23:18:55 +0800 Subject: [PATCH 0524/2161] dmaengine: idxd: remove set but not used variable 'idxd_cdev' commit f7b280c6388137185fab4408da18d989ba36c721 upstream. drivers/dma/idxd/cdev.c: In function idxd_cdev_open: drivers/dma/idxd/cdev.c:77:20: warning: variable idxd_cdev set but not used [-Wunused-but-set-variable] commit 42d279f9137a ("dmaengine: idxd: add char driver to expose submission portal to userland") involed this. 
Reported-by: Hulk Robot Signed-off-by: YueHaibing Acked-by: Dave Jiang Link: https://lore.kernel.org/r/20200210151855.55044-1-yuehaibing@huawei.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/cdev.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 1d7347825b95..8dfdbe37e381 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -74,12 +74,10 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) struct idxd_device *idxd; struct idxd_wq *wq; struct device *dev; - struct idxd_cdev *idxd_cdev; wq = inode_wq(inode); idxd = wq->idxd; dev = &idxd->pdev->dev; - idxd_cdev = &wq->idxd_cdev; dev_dbg(dev, "%s called\n", __func__); -- Gitee From c41b355b4a9e7d74b7d41269f6a78876b3a51989 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 11 Feb 2020 10:42:47 -0700 Subject: [PATCH 0525/2161] dmaengine: idxd: fix runaway module ref count on device driver bind commit 61b5865d56bbb09dd5887b197dd324d1d4333d47 upstream. idxd_config_bus_probe() calls try_module_get() but never calls module_put() when it fails. Thus with every failed attempt, the ref count goes up. Add module_put() in failure paths. Fixes: c52ca478233c ("dmaengine: idxd: add configuration component of driver") Reported-by: Jerry Chen Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/158144296730.41381.12134210685456322434.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index e4f35bdf252e..eb3516a1ac34 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -124,6 +124,7 @@ static int idxd_config_bus_probe(struct device *dev) rc = idxd_device_config(idxd); if (rc < 0) { spin_unlock_irqrestore(&idxd->dev_lock, flags); + module_put(THIS_MODULE); dev_warn(dev, "Device config failed: %d\n", rc); return rc; } @@ -132,6 +133,7 @@ static int idxd_config_bus_probe(struct device *dev) rc = idxd_device_enable(idxd); if (rc < 0) { spin_unlock_irqrestore(&idxd->dev_lock, flags); + module_put(THIS_MODULE); dev_warn(dev, "Device enable failed: %d\n", rc); return rc; } @@ -142,6 +144,7 @@ static int idxd_config_bus_probe(struct device *dev) rc = idxd_register_dma_device(idxd); if (rc < 0) { spin_unlock_irqrestore(&idxd->dev_lock, flags); + module_put(THIS_MODULE); dev_dbg(dev, "Failed to register dmaengine device\n"); return rc; } -- Gitee From 4c9699ba28b8d7204de1846dd4567486af0a79f5 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 5 Feb 2020 15:32:48 +0300 Subject: [PATCH 0526/2161] dmaengine: idxd: Fix error handling in idxd_wq_cdev_dev_setup() commit 2227ab4216cd4d56619733bcfaee7e6943cdc9aa upstream. We can't call kfree(dev) after calling device_register(dev). The "dev" pointer has to be freed using put_device(). 
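The underlying rule, shown as a generic sketch (hypothetical names, not code from this patch): once device_register() has been called, the device's reference count and ->release() callback own the allocation, so every error path after that point must use put_device(); kfree() is only safe before registration:

#include <linux/device.h>
#include <linux/slab.h>

static void example_release(struct device *dev)
{
	kfree(dev);	/* called when the last reference is dropped */
}

static int example_add_child(struct device *parent)
{
	struct device *dev;
	int rc;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;	/* not yet registered, plain free is fine on this path */

	dev->parent = parent;
	dev->release = example_release;
	dev_set_name(dev, "example0");

	rc = device_register(dev);
	if (rc) {
		/*
		 * Never kfree() here: device_register() took a reference
		 * even on failure, so drop it and let ->release() free dev.
		 */
		put_device(dev);
		return rc;
	}
	return 0;
}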
Fixes: 42d279f9137a ("dmaengine: idxd: add char driver to expose submission portal to userland") Signed-off-by: Dan Carpenter Acked-by: Dave Jiang Link: https://lore.kernel.org/r/20200205123248.hmtog7qa2eiqaagh@kili.mountain Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/cdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 8dfdbe37e381..0ec607397e16 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -202,6 +202,7 @@ static int idxd_wq_cdev_dev_setup(struct idxd_wq *wq) minor = ida_simple_get(&cdev_ctx->minor_ida, 0, MINORMASK, GFP_KERNEL); if (minor < 0) { rc = minor; + kfree(dev); goto ida_err; } @@ -210,7 +211,6 @@ static int idxd_wq_cdev_dev_setup(struct idxd_wq *wq) rc = device_register(dev); if (rc < 0) { dev_err(&idxd->pdev->dev, "device register failed\n"); - put_device(dev); goto dev_reg_err; } idxd_cdev->minor = minor; @@ -219,8 +219,8 @@ static int idxd_wq_cdev_dev_setup(struct idxd_wq *wq) dev_reg_err: ida_simple_remove(&cdev_ctx->minor_ida, MINOR(dev->devt)); + put_device(dev); ida_err: - kfree(dev); idxd_cdev->dev = NULL; return rc; } -- Gitee From 1be924722fe9bd953f1f3948081cc5c80abda82c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 18 Feb 2020 09:51:58 -0700 Subject: [PATCH 0527/2161] dmaengine: idxd: correct reserved token calculation commit 2d0b1919457ad78036f24169968cadc6f55d37ec upstream. The calcuation for limit of reserved token did not take into account the change the user wanted vs the current group reserved token. This causes changing of the reserved token to be possible only after we set the value of the reserved token back to 0. Fix calculation so we can set a value that is non zero for reserved token. Fixes: c52ca478233c ("dmaengine: idxd: add configuration component of driver") Reported-by: Jerry Chen Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/158204471889.37789.7749177228265869168.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index eb3516a1ac34..9d736dbc8d7b 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -518,7 +518,7 @@ static ssize_t group_tokens_reserved_store(struct device *dev, if (val > idxd->max_tokens) return -EINVAL; - if (val > idxd->nr_tokens) + if (val > idxd->nr_tokens + group->tokens_reserved) return -EINVAL; group->tokens_reserved = val; -- Gitee From db95db4cfded7fc71a6fc75a3407de8a354736c3 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 19 Feb 2020 10:24:08 -0700 Subject: [PATCH 0528/2161] dmaengine: idxd: sysfs input of wq incorrect wq type should return error commit 88402c5b1ba7498217027c8a54e8df61d030500c upstream. Currently when inputing an unrecognized wq type, we set the wq type to "none". It really should return error and not change the existing wq type that's in the kernel. 
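A worked example with illustrative numbers, assuming nr_tokens tracks the tokens not currently reserved by any group: with max_tokens = 24, nr_tokens = 4 and this group already holding tokens_reserved = 20, the old test "val > idxd->nr_tokens" rejected every value above 4, so the group could not even restore its current reservation without first writing 0. Adding the group's own reservation back in accepts any value up to 24, because those 20 tokens effectively return to the pool before being re-reserved. A condensed sketch of the corrected bound check, with field names as in the hunk:

	if (val > idxd->max_tokens)
		return -EINVAL;
	if (val > idxd->nr_tokens + group->tokens_reserved)
		return -EINVAL;
	group->tokens_reserved = val;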
Fixes: c52ca478233c ("dmaengine: idxd: add configuration component of driver") Reported-by: Yixin Zhang Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/158213304803.2290.13336343633425868211.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 9d736dbc8d7b..72cc440eaf5b 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1001,12 +1001,14 @@ static ssize_t wq_type_store(struct device *dev, return -EPERM; old_type = wq->type; - if (sysfs_streq(buf, idxd_wq_type_names[IDXD_WQT_KERNEL])) + if (sysfs_streq(buf, idxd_wq_type_names[IDXD_WQT_NONE])) + wq->type = IDXD_WQT_NONE; + else if (sysfs_streq(buf, idxd_wq_type_names[IDXD_WQT_KERNEL])) wq->type = IDXD_WQT_KERNEL; else if (sysfs_streq(buf, idxd_wq_type_names[IDXD_WQT_USER])) wq->type = IDXD_WQT_USER; else - wq->type = IDXD_WQT_NONE; + return -EINVAL; /* If we are changing queue type, clear the name */ if (wq->type != old_type) -- Gitee From 8b2493d1bf7f4e8ef822397adfdb34fd14ab6f18 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 19 Feb 2020 10:24:56 -0700 Subject: [PATCH 0529/2161] dmaengine: idxd: wq size configuration needs to check global max size commit 50e7e7f6f2d040dd16a636f408eab9184abc63f8 upstream. The current size_store() function for idxd sysfs does not check the total wq size. This allows configuration of all wqs with total wq size. Add check to make sure the wq sysfs attribute rejects storing of size over the total wq size. Fixes: c52ca478233c ("dmaengine: idxd: add configuration component of driver") Reported-by: Jerry Chen Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/158213309629.2509.3583411832507185041.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 72cc440eaf5b..ae0b5ec9d8dc 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -903,6 +903,20 @@ static ssize_t wq_size_show(struct device *dev, struct device_attribute *attr, return sprintf(buf, "%u\n", wq->size); } +static int total_claimed_wq_size(struct idxd_device *idxd) +{ + int i; + int wq_size = 0; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = &idxd->wqs[i]; + + wq_size += wq->size; + } + + return wq_size; +} + static ssize_t wq_size_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -922,7 +936,7 @@ static ssize_t wq_size_store(struct device *dev, if (wq->state != IDXD_WQ_DISABLED) return -EPERM; - if (size > idxd->max_wq_size) + if (size + total_claimed_wq_size(idxd) - wq->size > idxd->max_wq_size) return -EINVAL; wq->size = size; -- Gitee From 21d38f5f733f14537ae9cf0984b3a522c94d8153 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 25 Feb 2020 09:47:46 -0700 Subject: [PATCH 0530/2161] dmaengine: idxd: check return result from check_vma() in cdev commit b391554c61cb353c279523a706734b090aaf9000 upstream. The returned result from the check_vma() function in the cdev ->mmap() call needs to be handled. Add the check and returning error. 
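The store-side pattern the fix adopts, as a generic sketch (hypothetical attribute, enum and struct, not the idxd code): match the input against each known keyword with sysfs_streq() and return -EINVAL for anything else, so the existing kernel state is left untouched:

#include <linux/device.h>
#include <linux/errno.h>
#include <linux/string.h>

enum example_mode { EXAMPLE_MODE_NONE, EXAMPLE_MODE_KERNEL, EXAMPLE_MODE_USER };

struct example_wq {
	enum example_mode mode;
};

static ssize_t mode_store(struct device *dev, struct device_attribute *attr,
			  const char *buf, size_t count)
{
	struct example_wq *wq = dev_get_drvdata(dev);

	if (sysfs_streq(buf, "none"))
		wq->mode = EXAMPLE_MODE_NONE;
	else if (sysfs_streq(buf, "kernel"))
		wq->mode = EXAMPLE_MODE_KERNEL;
	else if (sysfs_streq(buf, "user"))
		wq->mode = EXAMPLE_MODE_USER;
	else
		return -EINVAL;	/* unknown keyword: reject, do not default */

	return count;
}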
Fixes: 42d279f9137a ("dmaengine: idxd: add char driver to expose submission portal to userland") Reported-by: Vinod Koul Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/158264926659.9387.14325163515683047959.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/cdev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 0ec607397e16..d58144eca6e8 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -137,6 +137,8 @@ static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma) dev_dbg(&pdev->dev, "%s called\n", __func__); rc = check_vma(wq, vma, __func__); + if (rc < 0) + return rc; vma->vm_flags |= VM_DONTCOPY; pfn = (base + idxd_get_wq_portal_full_offset(wq->id, -- Gitee From 6cb1e2254b9fa9d4f601c9696c18bd53d555f3db Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 24 Feb 2020 11:01:34 -0700 Subject: [PATCH 0531/2161] dmaengine: idxd: expose general capabilities register in sysfs commit 9065958ee6dd752c7e1bbacbb79fe5f5f2218393 upstream. There are some capabilities for the device that are interesting to user apps that are interacting directly with the device. Expose gencap register in sysfs to allow that information. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/158256729399.55526.10842505054968710547.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index ae0b5ec9d8dc..8a8d86f29e56 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1179,6 +1179,16 @@ static ssize_t op_cap_show(struct device *dev, } static DEVICE_ATTR_RO(op_cap); +static ssize_t gen_cap_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + + return sprintf(buf, "%#llx\n", idxd->hw.gen_cap.bits); +} +static DEVICE_ATTR_RO(gen_cap); + static ssize_t configurable_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1316,6 +1326,7 @@ static struct attribute *idxd_device_attributes[] = { &dev_attr_max_batch_size.attr, &dev_attr_max_transfer_size.attr, &dev_attr_op_cap.attr, + &dev_attr_gen_cap.attr, &dev_attr_configurable.attr, &dev_attr_clients.attr, &dev_attr_state.attr, -- Gitee From 364c1342ec6c56d7c5e39e0117e4eab6a325cc3b Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Fri, 6 Mar 2020 19:20:18 +0530 Subject: [PATCH 0532/2161] dmaengine: move .device_release missing log warning to debug level commit f91da3bd21721c05cc7054156fa993edbb16777a upstream. Dmaengine core warns the drivers registering for missing .device_release implementation. The warning is accurate for dmaengine controllers which hotplug but not for rest. So reduce this to a debug log. 
Link: https://lore.kernel.org/r/20200306135018.2286959-1-vkoul@kernel.org Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index c3b1283b6d31..17909fd1820f 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -1151,7 +1151,7 @@ int dma_async_device_register(struct dma_device *device) } if (!device->device_release) - dev_warn(device->dev, + dev_dbg(device->dev, "WARN: Device release is not defined so it is not safe to unbind this driver while in use\n"); kref_init(&device->ref); -- Gitee From e67da39f63538615e3003367faab2ef3e9b9d7aa Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Fri, 6 Mar 2020 16:28:37 +0200 Subject: [PATCH 0533/2161] dmaengine: Add basic debugfs support commit e937cc1dd7966df33a478943817302502a164e25 upstream. Via the /sys/kernel/debug/dmaengine/summary users can get information about the DMA devices and the used channels. Example output on am654-evm with audio using two channels and after running dmatest on 4 channels: dma0 (285c0000.dma-controller): number of channels: 96 dma1 (31150000.dma-controller): number of channels: 267 dma1chan0 | 2b00000.mcasp:tx dma1chan1 | 2b00000.mcasp:rx dma1chan2 | in-use dma1chan3 | in-use dma1chan4 | in-use dma1chan5 | in-use For slave channels we can show the device and the channel name a given channel is requested. For non slave devices the only information we know is that the channel is in use. DMA drivers can implement the optional dbg_summary_show callback to provide controller specific information instead of the generic one. It is easy to extend the generic dmaengine_summary_show() to print additional information about the used channels. I have taken the idea from gpiolib and clk subsystems. 
Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20200306142839.17910-2-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 76 ++++++++++++++++++++++++++++++++++++++- include/linux/dmaengine.h | 13 ++++++- 2 files changed, 87 insertions(+), 2 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 17909fd1820f..5b536ac0dd6e 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -58,6 +58,65 @@ static DEFINE_IDA(dma_ida); static LIST_HEAD(dma_device_list); static long dmaengine_ref_count; +/* --- debugfs implementation --- */ +#ifdef CONFIG_DEBUG_FS +#include + +static void dmaengine_dbg_summary_show(struct seq_file *s, + struct dma_device *dma_dev) +{ + struct dma_chan *chan; + + list_for_each_entry(chan, &dma_dev->channels, device_node) { + if (chan->client_count) { + seq_printf(s, " %-13s| %s", dma_chan_name(chan), + chan->dbg_client_name ?: "in-use"); + + if (chan->router) + seq_printf(s, " (via router: %s)\n", + dev_name(chan->router->dev)); + else + seq_puts(s, "\n"); + } + } +} + +static int dmaengine_summary_show(struct seq_file *s, void *data) +{ + struct dma_device *dma_dev = NULL; + + mutex_lock(&dma_list_mutex); + list_for_each_entry(dma_dev, &dma_device_list, global_node) { + seq_printf(s, "dma%d (%s): number of channels: %u\n", + dma_dev->dev_id, dev_name(dma_dev->dev), + dma_dev->chancnt); + + if (dma_dev->dbg_summary_show) + dma_dev->dbg_summary_show(s, dma_dev); + else + dmaengine_dbg_summary_show(s, dma_dev); + + if (!list_is_last(&dma_dev->global_node, &dma_device_list)) + seq_puts(s, "\n"); + } + mutex_unlock(&dma_list_mutex); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dmaengine_summary); + +static void __init dmaengine_debugfs_init(void) +{ + struct dentry *rootdir = debugfs_create_dir("dmaengine", NULL); + + /* /sys/kernel/debug/dmaengine/summary */ + debugfs_create_file("summary", 0444, rootdir, NULL, + &dmaengine_summary_fops); +} +#else +static inline void dmaengine_debugfs_init(void) { } +#endif /* DEBUG_FS */ + /* --- sysfs implementation --- */ #define DMA_SLAVE_NAME "slave" @@ -760,6 +819,11 @@ struct dma_chan *dma_request_chan(struct device *dev, const char *name) return chan ? 
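A minimal sketch (hypothetical controller driver, not from this patch) of a DMA driver providing the optional dbg_summary_show callback described above instead of relying on the generic per-channel output:

#include <linux/dmaengine.h>
#include <linux/kernel.h>
#include <linux/seq_file.h>

/* Hypothetical per-channel state kept by the controller driver. */
struct example_chan {
	struct dma_chan chan;
	unsigned int busy_descs;
};

static void example_dbg_summary_show(struct seq_file *s, struct dma_device *dd)
{
	struct dma_chan *chan;

	list_for_each_entry(chan, &dd->channels, device_node) {
		struct example_chan *ec =
			container_of(chan, struct example_chan, chan);

		if (!chan->client_count)
			continue;
		seq_printf(s, " %-13s| busy descriptors: %u\n",
			   dma_chan_name(chan), ec->busy_descs);
	}
}

/* Wired up at probe time, e.g.: dd->dbg_summary_show = example_dbg_summary_show; */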
chan : ERR_PTR(-EPROBE_DEFER); found: +#ifdef CONFIG_DEBUG_FS + chan->dbg_client_name = kasprintf(GFP_KERNEL, "%s:%s", dev_name(dev), + name); +#endif + chan->name = kasprintf(GFP_KERNEL, "dma:%s", name); if (!chan->name) return chan; @@ -837,6 +901,11 @@ void dma_release_channel(struct dma_chan *chan) chan->name = NULL; chan->slave = NULL; } + +#ifdef CONFIG_DEBUG_FS + kfree(chan->dbg_client_name); + chan->dbg_client_name = NULL; +#endif mutex_unlock(&dma_list_mutex); } EXPORT_SYMBOL_GPL(dma_release_channel); @@ -1559,6 +1628,11 @@ static int __init dma_bus_init(void) if (err) return err; - return class_register(&dma_devclass); + + err = class_register(&dma_devclass); + if (!err) + dmaengine_debugfs_init(); + + return err; } arch_initcall(dma_bus_init); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 880fab836dee..4f64a4f628af 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -300,6 +300,8 @@ struct dma_router { * @chan_id: channel ID for sysfs * @dev: class device for sysfs * @name: backlink name for sysfs + * @dbg_client_name: slave name for debugfs in format: + * dev_name(requester's dev):channel name, for example: "2b00000.mcasp:tx" * @device_node: used to add this to the device chan list * @local: per-cpu pointer to a struct dma_chan_percpu * @client_count: how many clients are using this channel @@ -318,6 +320,9 @@ struct dma_chan { int chan_id; struct dma_chan_dev *dev; const char *name; +#ifdef CONFIG_DEBUG_FS + char *dbg_client_name; +#endif struct list_head device_node; struct dma_chan_percpu __percpu *local; @@ -805,7 +810,9 @@ struct dma_filter { * called and there are no further references to this structure. This * must be implemented to free resources however many existing drivers * do not and are therefore not safe to unbind while in use. - * + * @dbg_summary_show: optional routine to show contents in debugfs; default code + * will be used when this is omitted, but custom code can show extra, + * controller specific information. */ struct dma_device { struct kref ref; @@ -891,6 +898,10 @@ struct dma_device { struct dma_tx_state *txstate); void (*device_issue_pending)(struct dma_chan *chan); void (*device_release)(struct dma_device *dev); + /* debugfs support */ +#ifdef CONFIG_DEBUG_FS + void (*dbg_summary_show)(struct seq_file *s, struct dma_device *dev); +#endif }; static inline int dmaengine_slave_config(struct dma_chan *chan, -- Gitee From 8d7643758ecc766b18c55ce8bd1d3a51eb652c60 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Fri, 6 Mar 2020 16:28:39 +0200 Subject: [PATCH 0534/2161] dmaengine: Create debug directories for DMA devices commit 26cf132de6f79c06025706ddc61e045d591d404d upstream. Create a placeholder directory for each registered DMA device. DMA drivers can use the dmaengine_get_debugfs_root() call to get their debugfs root and can populate with custom files to aim debugging. 
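As a rough sketch of the intended use (not taken from any in-tree driver): a driver under drivers/dma/ can include the private dmaengine.h header and populate the per-device directory once registration has succeeded; struct foo_dev, nr_issued and foo_regs_fops are made-up names.

static void foo_debugfs_init(struct foo_dev *fdev)
{
	/* valid only after dma_async_device_register() has created the dir */
	struct dentry *root = dmaengine_get_debugfs_root(&fdev->ddev);

	/* with !CONFIG_DEBUG_FS the helper returns NULL and the debugfs
	 * calls compile away to stubs, so no further guarding is needed
	 */
	debugfs_create_u32("issued", 0444, root, &fdev->nr_issued);
	debugfs_create_file("regs", 0444, root, fdev, &foo_regs_fops);
}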
Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20200306142839.17910-4-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 28 +++++++++++++++++++++++++++- drivers/dma/dmaengine.h | 16 ++++++++++++++++ include/linux/dmaengine.h | 1 + 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 5b536ac0dd6e..4830ba658ce1 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -62,6 +62,22 @@ static long dmaengine_ref_count; #ifdef CONFIG_DEBUG_FS #include +static struct dentry *rootdir; + +static void dmaengine_debug_register(struct dma_device *dma_dev) +{ + dma_dev->dbg_dev_root = debugfs_create_dir(dev_name(dma_dev->dev), + rootdir); + if (IS_ERR(dma_dev->dbg_dev_root)) + dma_dev->dbg_dev_root = NULL; +} + +static void dmaengine_debug_unregister(struct dma_device *dma_dev) +{ + debugfs_remove_recursive(dma_dev->dbg_dev_root); + dma_dev->dbg_dev_root = NULL; +} + static void dmaengine_dbg_summary_show(struct seq_file *s, struct dma_device *dma_dev) { @@ -107,7 +123,7 @@ DEFINE_SHOW_ATTRIBUTE(dmaengine_summary); static void __init dmaengine_debugfs_init(void) { - struct dentry *rootdir = debugfs_create_dir("dmaengine", NULL); + rootdir = debugfs_create_dir("dmaengine", NULL); /* /sys/kernel/debug/dmaengine/summary */ debugfs_create_file("summary", 0444, rootdir, NULL, @@ -115,6 +131,12 @@ static void __init dmaengine_debugfs_init(void) } #else static inline void dmaengine_debugfs_init(void) { } +static inline int dmaengine_debug_register(struct dma_device *dma_dev) +{ + return 0; +} + +static inline void dmaengine_debug_unregister(struct dma_device *dma_dev) { } #endif /* DEBUG_FS */ /* --- sysfs implementation --- */ @@ -1265,6 +1287,8 @@ int dma_async_device_register(struct dma_device *device) dma_channel_rebalance(); mutex_unlock(&dma_list_mutex); + dmaengine_debug_register(device); + return 0; err_out: @@ -1298,6 +1322,8 @@ void dma_async_device_unregister(struct dma_device *device) { struct dma_chan *chan, *n; + dmaengine_debug_unregister(device); + list_for_each_entry_safe(chan, n, &device->channels, device_node) __dma_async_device_channel_unregister(device, chan); diff --git a/drivers/dma/dmaengine.h b/drivers/dma/dmaengine.h index e8a320c9e57c..1bfbd64b1371 100644 --- a/drivers/dma/dmaengine.h +++ b/drivers/dma/dmaengine.h @@ -182,4 +182,20 @@ dmaengine_desc_callback_valid(struct dmaengine_desc_callback *cb) struct dma_chan *dma_get_slave_channel(struct dma_chan *chan); struct dma_chan *dma_get_any_slave_channel(struct dma_device *device); +#ifdef CONFIG_DEBUG_FS +#include + +static inline struct dentry * +dmaengine_get_debugfs_root(struct dma_device *dma_dev) { + return dma_dev->dbg_dev_root; +} +#else +struct dentry; +static inline struct dentry * +dmaengine_get_debugfs_root(struct dma_device *dma_dev) +{ + return NULL; +} +#endif /* CONFIG_DEBUG_FS */ + #endif diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 4f64a4f628af..d8d27778dfba 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -901,6 +901,7 @@ struct dma_device { /* debugfs support */ #ifdef CONFIG_DEBUG_FS void (*dbg_summary_show)(struct seq_file *s, struct dma_device *dev); + struct dentry *dbg_dev_root; #endif }; -- Gitee From 32a18bb5e79848e68c218e74fd9a10ea3e20ab3b Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 10 Mar 2020 15:18:02 -0700 Subject: [PATCH 0535/2161] dmaengine: idxd: Merge definition of 
dsa_batch_desc into dsa_hw_desc commit 7c4a4d088283e02cc04252e88dd08b5cdf54e70f upstream. We don't need a special structure just for batch descriptors. The layout matches the general form for other descriptors. Merge the desc_list_addr field into the union of other aliases for the the third quadword in the structure. Create a union to alias "xfer_size" with "desc_count". Signed-off-by: Tony Luck Acked-by: Dave Jiang Link: https://lore.kernel.org/r/158387868208.35922.5895104426944263789.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/idxd.h | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 849ef1515d04..1f412fbf561b 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -83,21 +83,6 @@ enum dsa_completion_status { #define DSA_COMP_STATUS_MASK 0x7f #define DSA_COMP_STATUS_WRITE 0x80 -struct dsa_batch_desc { - uint32_t pasid:20; - uint32_t rsvd:11; - uint32_t priv:1; - uint32_t flags:24; - uint32_t opcode:8; - uint64_t completion_addr; - uint64_t desc_list_addr; - uint64_t rsvd1; - uint32_t desc_count; - uint16_t interrupt_handle; - uint16_t rsvd2; - uint8_t rsvd3[24]; -} __attribute__((packed)); - struct dsa_hw_desc { uint32_t pasid:20; uint32_t rsvd:11; @@ -109,6 +94,7 @@ struct dsa_hw_desc { uint64_t src_addr; uint64_t rdback_addr; uint64_t pattern; + uint64_t desc_list_addr; }; union { uint64_t dst_addr; @@ -116,7 +102,10 @@ struct dsa_hw_desc { uint64_t src2_addr; uint64_t comp_pattern; }; - uint32_t xfer_size; + union { + uint32_t xfer_size; + uint32_t desc_count; + }; uint16_t int_handle; uint16_t rsvd1; union { -- Gitee From 2f0e31c574a1cd3a1b3c1a31f5783a0a1f44f738 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 10 Mar 2020 10:50:30 -0700 Subject: [PATCH 0536/2161] dmaengine: idxd: reflect shadow copy of traffic class programming commit a1fcaf07ec718bb1f11e29e952c9a4cb733d57a5 upstream. The traffic class are set to -1 at initialization until the user programs them. If the user choose not to, the driver will program appropriate defaults. The driver also needs to update the shadowed copies of the values after doing the programming. 
Fixes: c52ca478233c ("dmaengine: idxd: add configuration component of driver") Reported-by: Yixin Zhang Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/158386263076.10898.4586509576813094559.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index ada69e722f84..f6f49f0f6fae 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -584,11 +584,11 @@ static void idxd_group_flags_setup(struct idxd_device *idxd) struct idxd_group *group = &idxd->groups[i]; if (group->tc_a == -1) - group->grpcfg.flags.tc_a = 0; + group->tc_a = group->grpcfg.flags.tc_a = 0; else group->grpcfg.flags.tc_a = group->tc_a; if (group->tc_b == -1) - group->grpcfg.flags.tc_b = 1; + group->tc_b = group->grpcfg.flags.tc_b = 1; else group->grpcfg.flags.tc_b = group->tc_b; group->grpcfg.flags.use_token_limit = group->use_token_limit; -- Gitee From cc7d35f1a5c20ae3b2da9399e206c699dabe0671 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 10 Mar 2020 10:51:09 -0700 Subject: [PATCH 0537/2161] dmaengine: idxd: remove global token limit check commit 91124ac61216ed29405d96a29ec914624b196a79 upstream. The global token_limit is not tied to group tokens_reserved and tokens_allowed parameters. Remove the check in order to allow independent configuration. Fixes: c52ca478233c ("dmaengine: idxd: add configuration component of driver") Reported-by: Yixin Zhang Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/158386266911.11066.7545764533072221536.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 8a8d86f29e56..3999827970ab 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -512,9 +512,6 @@ static ssize_t group_tokens_reserved_store(struct device *dev, if (idxd->state == IDXD_DEV_ENABLED) return -EPERM; - if (idxd->token_limit == 0) - return -EPERM; - if (val > idxd->max_tokens) return -EINVAL; @@ -560,8 +557,6 @@ static ssize_t group_tokens_allowed_store(struct device *dev, if (idxd->state == IDXD_DEV_ENABLED) return -EPERM; - if (idxd->token_limit == 0) - return -EPERM; if (val < 4 * group->num_engines || val > group->tokens_reserved + idxd->nr_tokens) return -EINVAL; -- Gitee From ddcb7834b076da72989cf6ee7bf1b025d69ae22e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 12 Mar 2020 09:23:53 -0700 Subject: [PATCH 0538/2161] dmaengine: idxd: fix off by one on cdev dwq refcount commit 988aad2f111c768c66efc87d073c73ecb32b682c upstream. The refcount check for dedicated workqueue (dwq) is off by one and allows more than 1 user to open the char device. Fix check so only a single user can open the device. 
Fixes: 42d279f9137a ("dmaengine: idxd: add char driver to expose submission portal to userland") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/158403020187.10208.14117394394540710774.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/cdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index d58144eca6e8..ff49847e37a8 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -79,9 +79,9 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) idxd = wq->idxd; dev = &idxd->pdev->dev; - dev_dbg(dev, "%s called\n", __func__); + dev_dbg(dev, "%s called: %d\n", __func__, idxd_wq_refcount(wq)); - if (idxd_wq_refcount(wq) > 1 && wq_dedicated(wq)) + if (idxd_wq_refcount(wq) > 0 && wq_dedicated(wq)) return -EBUSY; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); -- Gitee From c27c8e088884a4cfb3ffaf0ce963fccf2c4522f9 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 15 Apr 2020 09:13:12 -0700 Subject: [PATCH 0539/2161] dmaengine: idxd: export hw version through sysfs commit c2ce6bbcfc9f142b426717bc5aa3f9ba20485a65 upstream. Some user apps would like to know the hardware version in order to determine the variation of the hardware. Export the hardware version number to userspace via sysfs. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/158696714008.39484.13401950732606906479.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/ABI/stable/sysfs-driver-dma-idxd | 6 ++++++ drivers/dma/idxd/sysfs.c | 11 +++++++++++ 2 files changed, 17 insertions(+) diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd index f4be46cc6cb6..b5bebf642db6 100644 --- a/Documentation/ABI/stable/sysfs-driver-dma-idxd +++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd @@ -1,3 +1,9 @@ +What: sys/bus/dsa/devices/dsa/version +Date: Apr 15, 2020 +KernelVersion: 5.8.0 +Contact: dmaengine@vger.kernel.org +Description: The hardware version number. + What: sys/bus/dsa/devices/dsa/cdev_major Date: Oct 25, 2019 KernelVersion: 5.6.0 diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 3999827970ab..052dae5d6ddd 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1092,6 +1092,16 @@ static const struct attribute_group *idxd_wq_attribute_groups[] = { }; /* IDXD device attribs */ +static ssize_t version_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + + return sprintf(buf, "%#x\n", idxd->hw.version); +} +static DEVICE_ATTR_RO(version); + static ssize_t max_work_queues_size_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1313,6 +1323,7 @@ static ssize_t cdev_major_show(struct device *dev, static DEVICE_ATTR_RO(cdev_major); static struct attribute *idxd_device_attributes[] = { + &dev_attr_version.attr, &dev_attr_max_groups.attr, &dev_attr_max_work_queues.attr, &dev_attr_max_work_queues_size.attr, -- Gitee From 0501f5e256308985a4015be315bfa32dcf92a878 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 13 Apr 2020 10:40:12 -0700 Subject: [PATCH 0540/2161] dmaengine: fix channel index enumeration commit 0821009445a8261ac4d32a6df4b83938e007c765 upstream. 
When the channel register code was changed to allow hotplug operations, dynamic indexing wasn't taken into account. When channels are randomly plugged and unplugged out of order, the serial indexing breaks. Convert channel indexing to using IDA tracking in order to allow dynamic assignment. The previous code does not cause any regression bug for existing channel allocation besides idxd driver since the hotplug usage case is only used by idxd at this point. With this change, the chan->idr_ref is also not needed any longer. We can have a device with no channels registered due to hot plug. The channel device release code no longer should attempt to free the dma device id on the last channel release. Fixes: e81274cd6b52 ("dmaengine: add support to dynamic register/unregister of channels") Reported-by: Yixin Zhang Signed-off-by: Dave Jiang Tested-by: Yixin Zhang Link: https://lore.kernel.org/r/158679961260.7674.8485924270472851852.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 60 +++++++++++++++++---------------------- include/linux/dmaengine.h | 4 +-- 2 files changed, 28 insertions(+), 36 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 4830ba658ce1..d31076d9ef25 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -232,10 +232,6 @@ static void chan_dev_release(struct device *dev) struct dma_chan_dev *chan_dev; chan_dev = container_of(dev, typeof(*chan_dev), device); - if (atomic_dec_and_test(chan_dev->idr_ref)) { - ida_free(&dma_ida, chan_dev->dev_id); - kfree(chan_dev->idr_ref); - } kfree(chan_dev); } @@ -1043,27 +1039,9 @@ static int get_dma_id(struct dma_device *device) } static int __dma_async_device_channel_register(struct dma_device *device, - struct dma_chan *chan, - int chan_id) + struct dma_chan *chan) { int rc = 0; - int chancnt = device->chancnt; - atomic_t *idr_ref; - struct dma_chan *tchan; - - tchan = list_first_entry_or_null(&device->channels, - struct dma_chan, device_node); - if (!tchan) - return -ENODEV; - - if (tchan->dev) { - idr_ref = tchan->dev->idr_ref; - } else { - idr_ref = kmalloc(sizeof(*idr_ref), GFP_KERNEL); - if (!idr_ref) - return -ENOMEM; - atomic_set(idr_ref, 0); - } chan->local = alloc_percpu(typeof(*chan->local)); if (!chan->local) @@ -1079,29 +1057,36 @@ static int __dma_async_device_channel_register(struct dma_device *device, * When the chan_id is a negative value, we are dynamically adding * the channel. Otherwise we are static enumerating. */ - chan->chan_id = chan_id < 0 ? 
chancnt : chan_id; + mutex_lock(&device->chan_mutex); + chan->chan_id = ida_alloc(&device->chan_ida, GFP_KERNEL); + mutex_unlock(&device->chan_mutex); + if (chan->chan_id < 0) { + pr_err("%s: unable to alloc ida for chan: %d\n", + __func__, chan->chan_id); + goto err_out; + } + chan->dev->device.class = &dma_devclass; chan->dev->device.parent = device->dev; chan->dev->chan = chan; - chan->dev->idr_ref = idr_ref; chan->dev->dev_id = device->dev_id; - atomic_inc(idr_ref); dev_set_name(&chan->dev->device, "dma%dchan%d", device->dev_id, chan->chan_id); - rc = device_register(&chan->dev->device); if (rc) - goto err_out; + goto err_out_ida; chan->client_count = 0; - device->chancnt = chan->chan_id + 1; + device->chancnt++; return 0; + err_out_ida: + mutex_lock(&device->chan_mutex); + ida_free(&device->chan_ida, chan->chan_id); + mutex_unlock(&device->chan_mutex); err_out: free_percpu(chan->local); kfree(chan->dev); - if (atomic_dec_return(idr_ref) == 0) - kfree(idr_ref); return rc; } @@ -1110,7 +1095,7 @@ int dma_async_device_channel_register(struct dma_device *device, { int rc; - rc = __dma_async_device_channel_register(device, chan, -1); + rc = __dma_async_device_channel_register(device, chan); if (rc < 0) return rc; @@ -1130,6 +1115,9 @@ static void __dma_async_device_channel_unregister(struct dma_device *device, device->chancnt--; chan->dev->chan = NULL; mutex_unlock(&dma_list_mutex); + mutex_lock(&device->chan_mutex); + ida_free(&device->chan_ida, chan->chan_id); + mutex_unlock(&device->chan_mutex); device_unregister(&chan->dev->device); free_percpu(chan->local); } @@ -1152,7 +1140,7 @@ EXPORT_SYMBOL_GPL(dma_async_device_channel_unregister); */ int dma_async_device_register(struct dma_device *device) { - int rc, i = 0; + int rc; struct dma_chan* chan; if (!device) @@ -1257,9 +1245,12 @@ int dma_async_device_register(struct dma_device *device) if (rc != 0) return rc; + mutex_init(&device->chan_mutex); + ida_init(&device->chan_ida); + /* represent channels in sysfs. Probably want devs too */ list_for_each_entry(chan, &device->channels, device_node) { - rc = __dma_async_device_channel_register(device, chan, i++); + rc = __dma_async_device_channel_register(device, chan); if (rc < 0) goto err_out; } @@ -1334,6 +1325,7 @@ void dma_async_device_unregister(struct dma_device *device) */ dma_cap_set(DMA_PRIVATE, device->cap_mask); dma_channel_rebalance(); + ida_free(&dma_ida, device->dev_id); dma_device_put(device); mutex_unlock(&dma_list_mutex); } diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index d8d27778dfba..83d52a7e7b88 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -341,13 +341,11 @@ struct dma_chan { * @chan: driver channel device * @device: sysfs device * @dev_id: parent dma_device dev_id - * @idr_ref: reference count to gate release of dma_device dev_id */ struct dma_chan_dev { struct dma_chan *chan; struct device device; int dev_id; - atomic_t *idr_ref; }; /** @@ -834,6 +832,8 @@ struct dma_device { int dev_id; struct device *dev; struct module *owner; + struct ida chan_ida; + struct mutex chan_mutex; /* to protect chan_ida */ u32 src_addr_widths; u32 dst_addr_widths; -- Gitee From 8c7c683716d00b5d0a10d8d6e27bd46d4c6da245 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 29 Apr 2020 15:21:50 +0300 Subject: [PATCH 0541/2161] dmaengine: Include dmaengine.h into dmaengine.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 833d88f3fd50a54544a7e71edef0877bb7b769c1 upstream. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiler is not happy about non-static functions due to missed inclusion .../dmaengine.c:682:18: warning: no previous prototype for ‘dma_get_slave_channel’ [-Wmissing-prototypes] 682 | struct dma_chan *dma_get_slave_channel(struct dma_chan *chan) | ^~~~~~~~~~~~~~~~~~~~~ .../dmaengine.c:713:18: warning: no previous prototype for ‘dma_get_any_slave_channel’ [-Wmissing-prototypes] 713 | struct dma_chan *dma_get_any_slave_channel(struct dma_device *device) | ^~~~~~~~~~~~~~~~~~~~~~~~~ Include missed header to satisfy compiler. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200429122151.50989-1-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index d31076d9ef25..ee7f51d9da7d 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -53,6 +53,8 @@ #include #include +#include "dmaengine.h" + static DEFINE_MUTEX(dma_list_mutex); static DEFINE_IDA(dma_ida); static LIST_HEAD(dma_device_list); -- Gitee From 8454b190bbd18648a2bdae7473cb1b372ef60997 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 29 Apr 2020 15:21:51 +0300 Subject: [PATCH 0542/2161] dmaengine: Fix doc strings to satisfy validation script commit 9872e23d6879ee04c7fe8372328195d12e977071 upstream. The validation kernel doc script complains about undescribed function parameters .../dmaengine.c:155: warning: Function parameter or member 'dev' not descr ibed in 'dev_to_dma_chan' .../dmaengine.c:251: warning: cannot understand function prototype: 'dma_cap_mask_t dma_cap_mask_all; ' .../dmaengine.c:257: warning: cannot understand function prototype: 'struct dma_chan_tbl_ent ' .../dmaengine.c:264: warning: cannot understand function prototype: 'struct dma_chan_tbl_ent __percpu *channel_table[DMA_TX_TYPE_END]; ' .../dmaengine.c:304: warning: Function parameter or member 'chan' not described in 'dma_chan_is_local' .../dmaengine.c:304: warning: Function parameter or member 'cpu' not described in 'dma_chan_is_local' .../dmaengine.c:414: warning: Function parameter or member 'chan' not described in 'balance_ref_count' .../dmaengine.c:447: warning: Function parameter or member 'chan' not described in 'dma_chan_get' .../dmaengine.c:494: warning: Function parameter or member 'chan' not described in 'dma_chan_put' Add descriptions to the function parameters and in some cases update existing text as well. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200429122151.50989-2-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 96 +++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 46 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index ee7f51d9da7d..2b06a7a8629d 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -147,9 +147,9 @@ static inline void dmaengine_debug_unregister(struct dma_device *dma_dev) { } /** * dev_to_dma_chan - convert a device pointer to its sysfs container object - * @dev - device node + * @dev: device node * - * Must be called under dma_list_mutex + * Must be called under dma_list_mutex. 
*/ static struct dma_chan *dev_to_dma_chan(struct device *dev) { @@ -245,22 +245,18 @@ static struct class dma_devclass = { /* --- client and device registration --- */ -/** - * dma_cap_mask_all - enable iteration over all operation types - */ +/* enable iteration over all operation types */ static dma_cap_mask_t dma_cap_mask_all; /** - * dma_chan_tbl_ent - tracks channel allocations per core/operation - * @chan - associated channel for this entry + * struct dma_chan_tbl_ent - tracks channel allocations per core/operation + * @chan: associated channel for this entry */ struct dma_chan_tbl_ent { struct dma_chan *chan; }; -/** - * channel_table - percpu lookup table for memory-to-memory offload providers - */ +/* percpu lookup table for memory-to-memory offload providers */ static struct dma_chan_tbl_ent __percpu *channel_table[DMA_TX_TYPE_END]; static int __init dma_channel_table_init(void) @@ -297,8 +293,11 @@ static int __init dma_channel_table_init(void) arch_initcall(dma_channel_table_init); /** - * dma_chan_is_local - returns true if the channel is in the same numa-node as - * the cpu + * dma_chan_is_local - checks if the channel is in the same NUMA-node as the CPU + * @chan: DMA channel to test + * @cpu: CPU index which the channel should be close to + * + * Returns true if the channel is in the same NUMA-node as the CPU. */ static bool dma_chan_is_local(struct dma_chan *chan, int cpu) { @@ -308,14 +307,14 @@ static bool dma_chan_is_local(struct dma_chan *chan, int cpu) } /** - * min_chan - returns the channel with min count and in the same numa-node as - * the cpu - * @cap: capability to match - * @cpu: cpu index which the channel should be close to + * min_chan - finds the channel with min count and in the same NUMA-node as the CPU + * @cap: capability to match + * @cpu: CPU index which the channel should be close to * - * If some channels are close to the given cpu, the one with the lowest - * reference count is returned. Otherwise, cpu is ignored and only the + * If some channels are close to the given CPU, the one with the lowest + * reference count is returned. Otherwise, CPU is ignored and only the * reference count is taken into account. + * * Must be called under dma_list_mutex. */ static struct dma_chan *min_chan(enum dma_transaction_type cap, int cpu) @@ -353,10 +352,11 @@ static struct dma_chan *min_chan(enum dma_transaction_type cap, int cpu) /** * dma_channel_rebalance - redistribute the available channels * - * Optimize for cpu isolation (each cpu gets a dedicated channel for an - * operation type) in the SMP case, and operation isolation (avoid - * multi-tasking channels) in the non-SMP case. Must be called under - * dma_list_mutex. + * Optimize for CPU isolation (each CPU gets a dedicated channel for an + * operation type) in the SMP case, and operation isolation (avoid + * multi-tasking channels) in the non-SMP case. + * + * Must be called under dma_list_mutex. */ static void dma_channel_rebalance(void) { @@ -406,9 +406,9 @@ static struct module *dma_chan_to_owner(struct dma_chan *chan) /** * balance_ref_count - catch up the channel reference count - * @chan - channel to balance ->client_count versus dmaengine_ref_count + * @chan: channel to balance ->client_count versus dmaengine_ref_count * - * balance_ref_count must be called under dma_list_mutex + * Must be called under dma_list_mutex. 
*/ static void balance_ref_count(struct dma_chan *chan) { @@ -438,10 +438,10 @@ static void dma_device_put(struct dma_device *device) } /** - * dma_chan_get - try to grab a dma channel's parent driver module - * @chan - channel to grab + * dma_chan_get - try to grab a DMA channel's parent driver module + * @chan: channel to grab * - * Must be called under dma_list_mutex + * Must be called under dma_list_mutex. */ static int dma_chan_get(struct dma_chan *chan) { @@ -485,10 +485,10 @@ static int dma_chan_get(struct dma_chan *chan) } /** - * dma_chan_put - drop a reference to a dma channel's parent driver module - * @chan - channel to release + * dma_chan_put - drop a reference to a DMA channel's parent driver module + * @chan: channel to release * - * Must be called under dma_list_mutex + * Must be called under dma_list_mutex. */ static void dma_chan_put(struct dma_chan *chan) { @@ -539,7 +539,7 @@ EXPORT_SYMBOL(dma_sync_wait); /** * dma_find_channel - find a channel to carry out the operation - * @tx_type: transaction type + * @tx_type: transaction type */ struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type) { @@ -679,7 +679,7 @@ static struct dma_chan *find_candidate(struct dma_device *device, /** * dma_get_slave_channel - try to get specific channel exclusively - * @chan: target channel + * @chan: target channel */ struct dma_chan *dma_get_slave_channel(struct dma_chan *chan) { @@ -733,10 +733,10 @@ EXPORT_SYMBOL_GPL(dma_get_any_slave_channel); /** * __dma_request_channel - try to allocate an exclusive channel - * @mask: capabilities that the channel must satisfy - * @fn: optional callback to disposition available channels - * @fn_param: opaque parameter to pass to dma_filter_fn - * @np: device node to look for DMA channels + * @mask: capabilities that the channel must satisfy + * @fn: optional callback to disposition available channels + * @fn_param: opaque parameter to pass to dma_filter_fn() + * @np: device node to look for DMA channels * * Returns pointer to appropriate DMA channel on success or NULL. */ @@ -879,7 +879,7 @@ EXPORT_SYMBOL_GPL(dma_request_slave_channel); /** * dma_request_chan_by_mask - allocate a channel satisfying certain capabilities - * @mask: capabilities that the channel must satisfy + * @mask: capabilities that the channel must satisfy * * Returns pointer to appropriate DMA channel on success or an error pointer. */ @@ -970,7 +970,7 @@ void dmaengine_get(void) EXPORT_SYMBOL(dmaengine_get); /** - * dmaengine_put - let dma drivers be removed when ref_count == 0 + * dmaengine_put - let DMA drivers be removed when ref_count == 0 */ void dmaengine_put(void) { @@ -1134,7 +1134,7 @@ EXPORT_SYMBOL_GPL(dma_async_device_channel_unregister); /** * dma_async_device_register - registers DMA devices found - * @device: &dma_device + * @device: pointer to &struct dma_device * * After calling this routine the structure should not be freed except in the * device_release() callback which will be called after @@ -1306,7 +1306,7 @@ EXPORT_SYMBOL(dma_async_device_register); /** * dma_async_device_unregister - unregister a DMA device - * @device: &dma_device + * @device: pointer to &struct dma_device * * This routine is called by dma driver exit routines, dmaengine holds module * references to prevent it being called while channels are in use. 
@@ -1343,7 +1343,7 @@ static void dmam_device_release(struct device *dev, void *res) /** * dmaenginem_async_device_register - registers DMA devices found - * @device: &dma_device + * @device: pointer to &struct dma_device * * The operation is managed and will be undone on driver detach. */ @@ -1580,8 +1580,9 @@ int dmaengine_desc_set_metadata_len(struct dma_async_tx_descriptor *desc, } EXPORT_SYMBOL_GPL(dmaengine_desc_set_metadata_len); -/* dma_wait_for_async_tx - spin wait for a transaction to complete - * @tx: in-flight transaction to wait on +/** + * dma_wait_for_async_tx - spin wait for a transaction to complete + * @tx: in-flight transaction to wait on */ enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) @@ -1604,9 +1605,12 @@ dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) } EXPORT_SYMBOL_GPL(dma_wait_for_async_tx); -/* dma_run_dependencies - helper routine for dma drivers to process - * (start) dependent operations on their target channel - * @tx: transaction with dependencies +/** + * dma_run_dependencies - process dependent operations on the target channel + * @tx: transaction with dependencies + * + * Helper routine for DMA drivers to process (start) dependent operations + * on their target channel. */ void dma_run_dependencies(struct dma_async_tx_descriptor *tx) { -- Gitee From 83850c07fbd91e8921b7c792bec38f82df13e12a Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 1 May 2020 08:21:18 -0700 Subject: [PATCH 0543/2161] dmaengine: idxd: fix interrupt completion after unmasking commit 4f302642b70c1348773fe7e3ded9fc315fa92990 upstream. The current implementation may miss completions after we unmask the interrupt. In order to make sure we process all competions, we need to: 1. Do an MMIO read from the device as a barrier to ensure that all PCI writes for completions have arrived. 2. Check for any additional completions that we missed. Fixes: 8f47d1a5e545 ("dmaengine: idxd: connect idxd to dmaengine subsystem") Reported-by: Sanjay Kumar Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/158834641769.35613.1341160109892008587.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 7 +++++++ drivers/dma/idxd/irq.c | 26 +++++++++++++++++++------- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index f6f49f0f6fae..8d79a8787104 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -62,6 +62,13 @@ int idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id) perm.ignore = 0; iowrite32(perm.bits, idxd->reg_base + offset); + /* + * A readback from the device ensures that any previously generated + * completion record writes are visible to software based on PCI + * ordering rules. 
+ */ + perm.bits = ioread32(idxd->reg_base + offset); + return 0; } diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index d6fcd2e60103..6510791b9921 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -173,6 +173,7 @@ static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, struct llist_node *head; int queued = 0; + *processed = 0; head = llist_del_all(&irq_entry->pending_llist); if (!head) return 0; @@ -197,6 +198,7 @@ static int irq_process_work_list(struct idxd_irq_entry *irq_entry, struct list_head *node, *next; int queued = 0; + *processed = 0; if (list_empty(&irq_entry->work_list)) return 0; @@ -218,10 +220,9 @@ static int irq_process_work_list(struct idxd_irq_entry *irq_entry, return queued; } -irqreturn_t idxd_wq_thread(int irq, void *data) +static int idxd_desc_process(struct idxd_irq_entry *irq_entry) { - struct idxd_irq_entry *irq_entry = data; - int rc, processed = 0, retry = 0; + int rc, processed, total = 0; /* * There are two lists we are processing. The pending_llist is where @@ -244,15 +245,26 @@ irqreturn_t idxd_wq_thread(int irq, void *data) */ do { rc = irq_process_work_list(irq_entry, &processed); - if (rc != 0) { - retry++; + total += processed; + if (rc != 0) continue; - } rc = irq_process_pending_llist(irq_entry, &processed); - } while (rc != 0 && retry != 10); + total += processed; + } while (rc != 0); + + return total; +} + +irqreturn_t idxd_wq_thread(int irq, void *data) +{ + struct idxd_irq_entry *irq_entry = data; + int processed; + processed = idxd_desc_process(irq_entry); idxd_unmask_msix_vector(irq_entry->idxd, irq_entry->id); + /* catch anything unprocessed after unmasking */ + processed += idxd_desc_process(irq_entry); if (processed == 0) return IRQ_NONE; -- Gitee From 31ca0edcde9aee30ae4f894fded6d24b2e458218 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 13 May 2020 11:47:49 -0700 Subject: [PATCH 0544/2161] dmaengine: cookie bypass for out of order completion commit 47ec7f09bc107720905c96bc37771e4ed1ff0aed upstream. The cookie tracking in dmaengine expects all submissions completed in order. Some DMA devices like Intel DSA can complete submissions out of order, especially if configured with a work queue sharing multiple DMA engines. Add a status DMA_OUT_OF_ORDER that tx_status can be returned for those DMA devices. The user should use callbacks to track the completion rather than the DMA cookie. This would address the issue of dmatest complaining that descriptors are "busy" when the cookie count goes backwards due to out of order completion. Add DMA_COMPLETION_NO_ORDER DMA capability to allow the driver to flag the device's ability to complete operations out of order. 
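For the client side this roughly means the following (illustrative only; struct foo_req, foo_memcpy_done() and foo_memcpy_error() are placeholders):

static void foo_issue_memcpy(struct dma_chan *chan,
			     struct dma_async_tx_descriptor *tx,
			     struct foo_req *req)
{
	dma_cookie_t cookie;

	if (dma_has_cap(DMA_COMPLETION_NO_ORDER, chan->device->cap_mask)) {
		/* cookie progress is unreliable: track this request by callback */
		tx->callback = foo_memcpy_done;
		tx->callback_param = req;
		dmaengine_submit(tx);
		dma_async_issue_pending(chan);
		return;
	}

	/* in-order devices can still be waited on by cookie */
	cookie = dmaengine_submit(tx);
	dma_async_issue_pending(chan);
	if (dma_sync_wait(chan, cookie) != DMA_COMPLETE)
		foo_memcpy_error(req);
}

Passing the request as callback_param also gives each descriptor the unique identifier that the provider.rst text below recommends for tracking completions.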
Reported-by: Swathi Kovvuri Signed-off-by: Dave Jiang Tested-by: Swathi Kovvuri Link: https://lore.kernel.org/r/158939557151.20335.12404113976045569870.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../driver-api/dmaengine/provider.rst | 19 +++++++++++++++++++ drivers/dma/dmatest.c | 11 ++++++++++- drivers/dma/idxd/dma.c | 3 ++- include/linux/dmaengine.h | 2 ++ 4 files changed, 33 insertions(+), 2 deletions(-) diff --git a/Documentation/driver-api/dmaengine/provider.rst b/Documentation/driver-api/dmaengine/provider.rst index dfc4486b5743..3b6bb2bc5d1f 100644 --- a/Documentation/driver-api/dmaengine/provider.rst +++ b/Documentation/driver-api/dmaengine/provider.rst @@ -239,6 +239,22 @@ Currently, the types available are: want to transfer a portion of uncompressed data directly to the display to print it +- DMA_COMPLETION_NO_ORDER + + - The device does not support in order completion. + + - The driver should return DMA_OUT_OF_ORDER for device_tx_status if + the device is setting this capability. + + - All cookie tracking and checking API should be treated as invalid if + the device exports this capability. + + - At this point, this is incompatible with polling option for dmatest. + + - If this cap is set, the user is recommended to provide an unique + identifier for each descriptor sent to the DMA device in order to + properly track the completion. + These various types will also affect how the source and destination addresses change over time. @@ -343,6 +359,9 @@ supported. - In the case of a cyclic transfer, it should only take into account the current period. + - Should return DMA_OUT_OF_ORDER if the device does not support in order + completion and is completing the operation out of order. + - This function can be called in an interrupt context. - device_config diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index 238936e2dfe2..5a49458602d0 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -821,7 +821,10 @@ static int dmatest_func(void *data) result("test timed out", total_tests, src->off, dst->off, len, 0); goto error_unmap_continue; - } else if (status != DMA_COMPLETE) { + } else if (status != DMA_COMPLETE && + !(dma_has_cap(DMA_COMPLETION_NO_ORDER, + dev->cap_mask) && + status == DMA_OUT_OF_ORDER)) { result(status == DMA_ERROR ? 
"completion error status" : "completion busy status", total_tests, src->off, @@ -999,6 +1002,12 @@ static int dmatest_add_channel(struct dmatest_info *info, dtc->chan = chan; INIT_LIST_HEAD(&dtc->threads); + if (dma_has_cap(DMA_COMPLETION_NO_ORDER, dma_dev->cap_mask) && + info->params.polled) { + info->params.polled = false; + pr_warn("DMA_COMPLETION_NO_ORDER, polled disabled\n"); + } + if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask)) { if (dmatest == 0) { cnt = dmatest_add_threads(info, dtc, DMA_MEMCPY); diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index c64c1429d160..0c892cbd72e0 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -133,7 +133,7 @@ static enum dma_status idxd_dma_tx_status(struct dma_chan *dma_chan, dma_cookie_t cookie, struct dma_tx_state *txstate) { - return dma_cookie_status(dma_chan, cookie, txstate); + return DMA_OUT_OF_ORDER; } /* @@ -174,6 +174,7 @@ int idxd_register_dma_device(struct idxd_device *idxd) INIT_LIST_HEAD(&dma->channels); dma->dev = &idxd->pdev->dev; + dma_cap_set(DMA_COMPLETION_NO_ORDER, dma->cap_mask); dma->device_release = idxd_dma_release; if (idxd->hw.opcap.bits[0] & IDXD_OPCAP_MEMMOVE) { diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 83d52a7e7b88..bbfa6b78b22a 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -39,6 +39,7 @@ enum dma_status { DMA_IN_PROGRESS, DMA_PAUSED, DMA_ERROR, + DMA_OUT_OF_ORDER, }; /** @@ -61,6 +62,7 @@ enum dma_transaction_type { DMA_SLAVE, DMA_CYCLIC, DMA_INTERLEAVE, + DMA_COMPLETION_NO_ORDER, /* last transaction type for creation of the capabilities mask */ DMA_TX_TYPE_END, }; -- Gitee From 283c4e85d865b037b1a54ece39645dc97efeb274 Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Mon, 22 Jun 2020 13:38:34 -0700 Subject: [PATCH 0545/2161] dmaengine: idxd: fix cdev locking for open and release commit 66983bc18fad17d10766650b3685045f6f092d73 upstream. add the wq lock in cdev open and release call. This fixes race conditions observed in the open and close routines. 
Fixes: 42d279f9137a ("dmaengine: idxd: add char driver to expose submission portal to userland") Signed-off-by: Nikhil Rao Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/159285824892.64944.2905413694915141834.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/cdev.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index ff49847e37a8..cb376cf6a2d2 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -74,6 +74,7 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) struct idxd_device *idxd; struct idxd_wq *wq; struct device *dev; + int rc = 0; wq = inode_wq(inode); idxd = wq->idxd; @@ -81,17 +82,27 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) dev_dbg(dev, "%s called: %d\n", __func__, idxd_wq_refcount(wq)); - if (idxd_wq_refcount(wq) > 0 && wq_dedicated(wq)) - return -EBUSY; - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; + mutex_lock(&wq->wq_lock); + + if (idxd_wq_refcount(wq) > 0 && wq_dedicated(wq)) { + rc = -EBUSY; + goto failed; + } + ctx->wq = wq; filp->private_data = ctx; idxd_wq_get(wq); + mutex_unlock(&wq->wq_lock); return 0; + + failed: + mutex_unlock(&wq->wq_lock); + kfree(ctx); + return rc; } static int idxd_cdev_release(struct inode *node, struct file *filep) @@ -105,7 +116,9 @@ static int idxd_cdev_release(struct inode *node, struct file *filep) filep->private_data = NULL; kfree(ctx); + mutex_lock(&wq->wq_lock); idxd_wq_put(wq); + mutex_unlock(&wq->wq_lock); return 0; } -- Gitee From 17c67ac3e776f80f7b24313b344f8679fdc2495b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 22 Jun 2020 21:13:11 +0300 Subject: [PATCH 0546/2161] dmaengine: acpi: Drop double check for ACPI companion device commit 6915ef1cbebb3f66e140ce6b1480bf4a800719da upstream. acpi_dev_get_resources() does perform the NULL pointer check against ACPI companion device which is given as function parameter. Thus, there is no need to duplicate this check in the caller. 
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200622181311.67649-1-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/acpi-dma.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/dma/acpi-dma.c b/drivers/dma/acpi-dma.c index dcbcb712de6e..235f1396f968 100644 --- a/drivers/dma/acpi-dma.c +++ b/drivers/dma/acpi-dma.c @@ -360,19 +360,12 @@ struct dma_chan *acpi_dma_request_slave_chan_by_index(struct device *dev, { struct acpi_dma_parser_data pdata; struct acpi_dma_spec *dma_spec = &pdata.dma_spec; + struct acpi_device *adev = ACPI_COMPANION(dev); struct list_head resource_list; - struct acpi_device *adev; struct acpi_dma *adma; struct dma_chan *chan = NULL; int found; - - /* Check if the device was enumerated by ACPI */ - if (!dev) - return ERR_PTR(-ENODEV); - - adev = ACPI_COMPANION(dev); - if (!adev) - return ERR_PTR(-ENODEV); + int ret; memset(&pdata, 0, sizeof(pdata)); pdata.index = index; @@ -382,9 +375,11 @@ struct dma_chan *acpi_dma_request_slave_chan_by_index(struct device *dev, dma_spec->slave_id = -1; INIT_LIST_HEAD(&resource_list); - acpi_dev_get_resources(adev, &resource_list, - acpi_dma_parse_fixed_dma, &pdata); + ret = acpi_dev_get_resources(adev, &resource_list, + acpi_dma_parse_fixed_dma, &pdata); acpi_dev_free_resource_list(&resource_list); + if (ret < 0) + return ERR_PTR(ret); if (dma_spec->slave_id < 0 || dma_spec->chan_id < 0) return ERR_PTR(-ENODEV); -- Gitee From 03e09061dfb7ed9e9798d3909cfb54122d4a846c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 3 Jun 2020 10:27:48 -0700 Subject: [PATCH 0547/2161] dmaengine: idxd: fix hw descriptor fields for delta record commit 0b8975bdc0cc5310d48d9bdd871cefebe1f94c99 upstream. Fix the hw descriptor fields for delta record in user exported idxd.h header. Missing the "expected result mask" field. Reported-by: Mona Hossain Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/159120526866.65385.536565786678052944.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/idxd.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 1f412fbf561b..e103c1434e4b 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -110,9 +110,12 @@ struct dsa_hw_desc { uint16_t rsvd1; union { uint8_t expected_res; + /* create delta record */ struct { uint64_t delta_addr; uint32_t max_delta_size; + uint32_t delt_rsvd; + uint8_t expected_res_mask; }; uint32_t delta_rec_size; uint64_t dest2; -- Gitee From ad403066b2fe4a877ae58c36faf8987d7803fa63 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 15 Jun 2020 13:54:31 -0700 Subject: [PATCH 0548/2161] dmaengine: idxd: add leading / for sysfspath in ABI documentation commit 77522b2191368ea57720e960fc3c7e6fbfc04533 upstream. Correct to standard convention. All sysfs paths seem to be missing leading /. 
Signed-off-by: Dave Jiang Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/159225447176.68253.2922149693913698177.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../ABI/stable/sysfs-driver-dma-idxd | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd index b5bebf642db6..1af9c4175213 100644 --- a/Documentation/ABI/stable/sysfs-driver-dma-idxd +++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd @@ -1,47 +1,47 @@ -What: sys/bus/dsa/devices/dsa/version +What: /sys/bus/dsa/devices/dsa/version Date: Apr 15, 2020 KernelVersion: 5.8.0 Contact: dmaengine@vger.kernel.org Description: The hardware version number. -What: sys/bus/dsa/devices/dsa/cdev_major +What: /sys/bus/dsa/devices/dsa/cdev_major Date: Oct 25, 2019 -KernelVersion: 5.6.0 +KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The major number that the character device driver assigned to this device. -What: sys/bus/dsa/devices/dsa/errors +What: /sys/bus/dsa/devices/dsa/errors Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The error information for this device. -What: sys/bus/dsa/devices/dsa/max_batch_size +What: /sys/bus/dsa/devices/dsa/max_batch_size Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The largest number of work descriptors in a batch. -What: sys/bus/dsa/devices/dsa/max_work_queues_size +What: /sys/bus/dsa/devices/dsa/max_work_queues_size Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The maximum work queue size supported by this device. -What: sys/bus/dsa/devices/dsa/max_engines +What: /sys/bus/dsa/devices/dsa/max_engines Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The maximum number of engines supported by this device. -What: sys/bus/dsa/devices/dsa/max_groups +What: /sys/bus/dsa/devices/dsa/max_groups Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The maximum number of groups can be created under this device. -What: sys/bus/dsa/devices/dsa/max_tokens +What: /sys/bus/dsa/devices/dsa/max_tokens Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org @@ -50,7 +50,7 @@ Description: The total number of bandwidth tokens supported by this device. implementation, and these resources are allocated by engines to support operations. -What: sys/bus/dsa/devices/dsa/max_transfer_size +What: /sys/bus/dsa/devices/dsa/max_transfer_size Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org @@ -58,57 +58,57 @@ Description: The number of bytes to be read from the source address to perform the operation. The maximum transfer size is dependent on the workqueue the descriptor was submitted to. -What: sys/bus/dsa/devices/dsa/max_work_queues +What: /sys/bus/dsa/devices/dsa/max_work_queues Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The maximum work queue number that this device supports. -What: sys/bus/dsa/devices/dsa/numa_node +What: /sys/bus/dsa/devices/dsa/numa_node Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The numa node number for this device. 
-What: sys/bus/dsa/devices/dsa/op_cap +What: /sys/bus/dsa/devices/dsa/op_cap Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The operation capability bit mask specify the operation types supported by the this device. -What: sys/bus/dsa/devices/dsa/state +What: /sys/bus/dsa/devices/dsa/state Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The state information of this device. It can be either enabled or disabled. -What: sys/bus/dsa/devices/dsa/group. +What: /sys/bus/dsa/devices/dsa/group. Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The assigned group under this device. -What: sys/bus/dsa/devices/dsa/engine. +What: /sys/bus/dsa/devices/dsa/engine. Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The assigned engine under this device. -What: sys/bus/dsa/devices/dsa/wq. +What: /sys/bus/dsa/devices/dsa/wq. Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The assigned work queue under this device. -What: sys/bus/dsa/devices/dsa/configurable +What: /sys/bus/dsa/devices/dsa/configurable Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: To indicate if this device is configurable or not. -What: sys/bus/dsa/devices/dsa/token_limit +What: /sys/bus/dsa/devices/dsa/token_limit Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org @@ -116,19 +116,19 @@ Description: The maximum number of bandwidth tokens that may be in use at one time by operations that access low bandwidth memory in the device. -What: sys/bus/dsa/devices/wq./group_id +What: /sys/bus/dsa/devices/wq./group_id Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The group id that this work queue belongs to. -What: sys/bus/dsa/devices/wq./size +What: /sys/bus/dsa/devices/wq./size Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The work queue size for this work queue. -What: sys/bus/dsa/devices/wq./type +What: /sys/bus/dsa/devices/wq./type Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org @@ -136,20 +136,20 @@ Description: The type of this work queue, it can be "kernel" type for work queue usages in the kernel space or "user" type for work queue usages by applications in user space. -What: sys/bus/dsa/devices/wq./cdev_minor +What: /sys/bus/dsa/devices/wq./cdev_minor Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The minor number assigned to this work queue by the character device driver. -What: sys/bus/dsa/devices/wq./mode +What: /sys/bus/dsa/devices/wq./mode Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The work queue mode type for this work queue. -What: sys/bus/dsa/devices/wq./priority +What: /sys/bus/dsa/devices/wq./priority Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org @@ -157,20 +157,20 @@ Description: The priority value of this work queue, it is a vlue relative to other work queue in the same group to control quality of service for dispatching work from multiple workqueues in the same group. -What: sys/bus/dsa/devices/wq./state +What: /sys/bus/dsa/devices/wq./state Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The current state of the work queue. 
-What: sys/bus/dsa/devices/wq./threshold +What: /sys/bus/dsa/devices/wq./threshold Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The number of entries in this work queue that may be filled via a limited portal. -What: sys/bus/dsa/devices/engine./group_id +What: /sys/bus/dsa/devices/engine./group_id Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org -- Gitee From 2afe195d062a49b3751a543d828860c2b91496c3 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 15 Jun 2020 13:54:26 -0700 Subject: [PATCH 0549/2161] dmaengine: idxd: move submission to sbitmap_queue commit 0705107fcc80711680b169abc2011686dded6c21 upstream. Kill the percpu-rwsem for work submission in favor of an sbitmap_queue. Signed-off-by: Dave Jiang Reviewed-by: Tony Luck Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/159225446631.68253.8860709181621260997.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/Kconfig | 2 +- drivers/dma/idxd/device.c | 14 ++++---- drivers/dma/idxd/idxd.h | 6 ++-- drivers/dma/idxd/init.c | 20 ----------- drivers/dma/idxd/submit.c | 74 +++++++++++++++++++-------------------- 5 files changed, 46 insertions(+), 70 deletions(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index c2c7e9635aa2..0343da284d54 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -276,8 +276,8 @@ config INTEL_IDMA64 config INTEL_IDXD tristate "Intel Data Accelerators support" depends on PCI && X86_64 + depends on SBITMAP select DMA_ENGINE - select SBITMAP help Enable support for the Intel(R) data accelerators present in Intel Xeon CPU. diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 8d79a8787104..8f05b29e7891 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -160,16 +160,14 @@ static int alloc_descs(struct idxd_wq *wq, int num) int idxd_wq_alloc_resources(struct idxd_wq *wq) { struct idxd_device *idxd = wq->idxd; - struct idxd_group *group = wq->group; struct device *dev = &idxd->pdev->dev; int rc, num_descs, i; if (wq->type != IDXD_WQT_KERNEL) return 0; - num_descs = wq->size + - idxd->hw.gen_cap.max_descs_per_engine * group->num_engines; - wq->num_descs = num_descs; + wq->num_descs = wq->size; + num_descs = wq->size; rc = alloc_hw_descs(wq, num_descs); if (rc < 0) @@ -187,8 +185,8 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) if (rc < 0) goto fail_alloc_descs; - rc = sbitmap_init_node(&wq->sbmap, num_descs, -1, GFP_KERNEL, - dev_to_node(dev)); + rc = sbitmap_queue_init_node(&wq->sbq, num_descs, -1, false, GFP_KERNEL, + dev_to_node(dev)); if (rc < 0) goto fail_sbitmap_init; @@ -201,7 +199,7 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) sizeof(struct dsa_completion_record) * i; desc->id = i; desc->wq = wq; - + desc->cpu = -1; dma_async_tx_descriptor_init(&desc->txd, &wq->dma_chan); desc->txd.tx_submit = idxd_dma_tx_submit; } @@ -227,7 +225,7 @@ void idxd_wq_free_resources(struct idxd_wq *wq) free_hw_descs(wq); free_descs(wq); dma_free_coherent(dev, wq->compls_size, wq->compls, wq->compls_addr); - sbitmap_free(&wq->sbmap); + sbitmap_queue_free(&wq->sbq); } int idxd_wq_enable(struct idxd_wq *wq) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index b8f8a363b4a7..b03a754918ef 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -104,7 +104,6 @@ struct idxd_wq { enum idxd_wq_state state; unsigned long flags; union wqcfg wqcfg; - atomic_t dq_count; /* dedicated queue flow 
control */ u32 vec_ptr; /* interrupt steering */ struct dsa_hw_desc **hw_descs; int num_descs; @@ -112,10 +111,8 @@ struct idxd_wq { dma_addr_t compls_addr; int compls_size; struct idxd_desc **descs; - struct sbitmap sbmap; + struct sbitmap_queue sbq; struct dma_chan dma_chan; - struct percpu_rw_semaphore submit_lock; - wait_queue_head_t submit_waitq; char name[WQ_NAME_SIZE + 1]; }; @@ -201,6 +198,7 @@ struct idxd_desc { struct llist_node llnode; struct list_head list; int id; + int cpu; struct idxd_wq *wq; }; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 7778c05deb5d..b69839a8ac2c 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -141,17 +141,6 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) return rc; } -static void idxd_wqs_free_lock(struct idxd_device *idxd) -{ - int i; - - for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; - - percpu_free_rwsem(&wq->submit_lock); - } -} - static int idxd_setup_internals(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; @@ -181,19 +170,11 @@ static int idxd_setup_internals(struct idxd_device *idxd) for (i = 0; i < idxd->max_wqs; i++) { struct idxd_wq *wq = &idxd->wqs[i]; - int rc; wq->id = i; wq->idxd = idxd; mutex_init(&wq->wq_lock); - atomic_set(&wq->dq_count, 0); - init_waitqueue_head(&wq->submit_waitq); wq->idxd_cdev.minor = -1; - rc = percpu_init_rwsem(&wq->submit_lock); - if (rc < 0) { - idxd_wqs_free_lock(idxd); - return rc; - } } for (i = 0; i < idxd->max_engines; i++) { @@ -462,7 +443,6 @@ static void idxd_remove(struct pci_dev *pdev) dev_dbg(&pdev->dev, "%s called\n", __func__); idxd_cleanup_sysfs(idxd); idxd_shutdown(pdev); - idxd_wqs_free_lock(idxd); mutex_lock(&idxd_idr_lock); idr_remove(&idxd_idrs[idxd->type], idxd->id); mutex_unlock(&idxd_idr_lock); diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 45a0c5869a0a..156a1ee233aa 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -8,61 +8,61 @@ #include "idxd.h" #include "registers.h" -struct idxd_desc *idxd_alloc_desc(struct idxd_wq *wq, enum idxd_op_type optype) +static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu) { struct idxd_desc *desc; - int idx; + + desc = wq->descs[idx]; + memset(desc->hw, 0, sizeof(struct dsa_hw_desc)); + memset(desc->completion, 0, sizeof(struct dsa_completion_record)); + desc->cpu = cpu; + return desc; +} + +struct idxd_desc *idxd_alloc_desc(struct idxd_wq *wq, enum idxd_op_type optype) +{ + int cpu, idx; struct idxd_device *idxd = wq->idxd; + DEFINE_SBQ_WAIT(wait); + struct sbq_wait_state *ws; + struct sbitmap_queue *sbq; if (idxd->state != IDXD_DEV_ENABLED) return ERR_PTR(-EIO); - if (optype == IDXD_OP_BLOCK) - percpu_down_read(&wq->submit_lock); - else if (!percpu_down_read_trylock(&wq->submit_lock)) - return ERR_PTR(-EBUSY); - - if (!atomic_add_unless(&wq->dq_count, 1, wq->size)) { - int rc; - - if (optype == IDXD_OP_NONBLOCK) { - percpu_up_read(&wq->submit_lock); + sbq = &wq->sbq; + idx = sbitmap_queue_get(sbq, &cpu); + if (idx < 0) { + if (optype == IDXD_OP_NONBLOCK) return ERR_PTR(-EAGAIN); - } - - percpu_up_read(&wq->submit_lock); - percpu_down_write(&wq->submit_lock); - rc = wait_event_interruptible(wq->submit_waitq, - atomic_add_unless(&wq->dq_count, - 1, wq->size) || - idxd->state != IDXD_DEV_ENABLED); - percpu_up_write(&wq->submit_lock); - if (rc < 0) - return ERR_PTR(-EINTR); - if (idxd->state != IDXD_DEV_ENABLED) - return ERR_PTR(-EIO); } else { - percpu_up_read(&wq->submit_lock); + 
return __get_desc(wq, idx, cpu); } - idx = sbitmap_get(&wq->sbmap, 0, false); - if (idx < 0) { - atomic_dec(&wq->dq_count); - return ERR_PTR(-EAGAIN); + ws = &sbq->ws[0]; + for (;;) { + sbitmap_prepare_to_wait(sbq, ws, &wait, TASK_INTERRUPTIBLE); + if (signal_pending_state(TASK_INTERRUPTIBLE, current)) + break; + idx = sbitmap_queue_get(sbq, &cpu); + if (idx > 0) + break; + schedule(); } - desc = wq->descs[idx]; - memset(desc->hw, 0, sizeof(struct dsa_hw_desc)); - memset(desc->completion, 0, sizeof(struct dsa_completion_record)); - return desc; + sbitmap_finish_wait(sbq, ws, &wait); + if (idx < 0) + return ERR_PTR(-EAGAIN); + + return __get_desc(wq, idx, cpu); } void idxd_free_desc(struct idxd_wq *wq, struct idxd_desc *desc) { - atomic_dec(&wq->dq_count); + int cpu = desc->cpu; - sbitmap_clear_bit(&wq->sbmap, desc->id); - wake_up(&wq->submit_waitq); + desc->cpu = -1; + sbitmap_queue_clear(&wq->sbq, desc->id, cpu); } int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) -- Gitee From acc96dd521fc86ce884612446da1079c7ca9368b Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Thu, 15 Sep 2022 21:14:45 +0800 Subject: [PATCH 0550/2161] add signal.h to fix signal_pending_state compile error commit opencloudos. work around (commit 4d3f90f0a75694e1a68dae8b9e2561efb672b951) "dmaengine: idxd: move submission to sbitmap_queue" in this way Signed-off-by: Xinghui Li --- drivers/dma/idxd/submit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 156a1ee233aa..3e31aacc8595 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -7,6 +7,7 @@ #include #include "idxd.h" #include "registers.h" +#include static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu) { -- Gitee From c2d7f4213a964b307f321b7db1374369679176db Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 25 Jun 2020 12:17:42 -0700 Subject: [PATCH 0551/2161] dmaengine: idxd: cleanup workqueue config after disabling commit da32b28c95a79e399e18c03f8178f41aec9c66e4 upstream. After disabling a device, we should clean up the internal state for the wqs and zero out the configuration registers. Without doing so can cause issues when the user reprogram the wqs. 
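For illustration only (not part of this patch), the register side of that cleanup boils down to zeroing the per-WQ configuration window. A minimal sketch, assuming the eight 32-bit WQCFG words at wqcfg_offset + wq_id * 32 that this driver uses; the names are illustrative:

#include <linux/io.h>
#include <linux/types.h>

/* Zero each 32-bit word of one work queue's configuration window so the
 * queue returns to its power-on state before it is reprogrammed.
 */
static void example_clear_wq_config(void __iomem *reg_base, u32 wqcfg_offset,
                                    int wq_id)
{
        int i;

        for (i = 0; i < 8; i++) {
                u32 off = wqcfg_offset + wq_id * 32 + i * sizeof(u32);

                iowrite32(0, reg_base + off);
        }
}
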
Fixes: c52ca478233c ("dmaengine: idxd: add configuration component of driver") Reported-by: Yixin Zhang Signed-off-by: Dave Jiang Tested-by: Yixin Zhang Link: https://lore.kernel.org/r/159311264246.1198.11955791213681679428.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 25 +++++++++++++++++++++++++ drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/sysfs.c | 5 +++++ 3 files changed, 31 insertions(+) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 8f05b29e7891..01b01e42508f 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -318,6 +318,31 @@ void idxd_wq_unmap_portal(struct idxd_wq *wq) devm_iounmap(dev, wq->dportal); } +void idxd_wq_disable_cleanup(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + int i, wq_offset; + + lockdep_assert_held(&idxd->dev_lock); + memset(&wq->wqcfg, 0, sizeof(wq->wqcfg)); + wq->type = IDXD_WQT_NONE; + wq->size = 0; + wq->group = NULL; + wq->threshold = 0; + wq->priority = 0; + clear_bit(WQ_FLAG_DEDICATED, &wq->flags); + memset(wq->name, 0, WQ_NAME_SIZE); + + for (i = 0; i < 8; i++) { + wq_offset = idxd->wqcfg_offset + wq->id * 32 + i * sizeof(u32); + iowrite32(0, idxd->reg_base + wq_offset); + dev_dbg(dev, "WQ[%d][%d][%#x]: %#x\n", + wq->id, i, wq_offset, + ioread32(idxd->reg_base + wq_offset)); + } +} + /* Device control bits */ static inline bool idxd_is_enabled(struct idxd_device *idxd) { diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index b03a754918ef..7a2bd1b20737 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -288,6 +288,7 @@ int idxd_wq_enable(struct idxd_wq *wq); int idxd_wq_disable(struct idxd_wq *wq); int idxd_wq_map_portal(struct idxd_wq *wq); void idxd_wq_unmap_portal(struct idxd_wq *wq); +void idxd_wq_disable_cleanup(struct idxd_wq *wq); /* submission */ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc); diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 052dae5d6ddd..2e2c5082f322 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -315,6 +315,11 @@ static int idxd_config_bus_remove(struct device *dev) idxd_unregister_dma_device(idxd); spin_lock_irqsave(&idxd->dev_lock, flags); rc = idxd_device_disable(idxd); + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = &idxd->wqs[i]; + + idxd_wq_disable_cleanup(wq); + } spin_unlock_irqrestore(&idxd->dev_lock, flags); module_put(THIS_MODULE); if (rc < 0) -- Gitee From 65af20b7be27c8b6a2aedd369ea7b7b6e9865432 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 25 Jun 2020 12:16:54 -0700 Subject: [PATCH 0552/2161] dmaengine: idxd: fix misc interrupt handler thread unmasking commit e3122822a74033ba8d6d9af855078f9ab741e33f upstream. Fix unmasking of misc interrupt handler when completing normal. It exits early and skips the unmasking with the current implementation. Fix to unmask interrupt when exiting normally. 
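For illustration only (not part of this patch), the shape of the fix is the classic single-exit pattern: every completion path funnels through one label so the vector is always unmasked before returning. A minimal, self-contained sketch with hypothetical helper names:

#include <linux/interrupt.h>
#include <linux/types.h>

/* Hypothetical stand-ins so the sketch compiles on its own. */
static bool example_service_cause(void *data) { return false; }
static void example_handle_halt(void *data) { }
static void example_unmask_vector(void *data) { }

static irqreturn_t example_misc_thread(int irq, void *data)
{
        bool err = example_service_cause(data);

        if (!err)
                goto out;       /* an early return here used to skip the unmask */

        example_handle_halt(data);
out:
        example_unmask_vector(data);    /* now runs on every exit path */
        return IRQ_HANDLED;
}
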
Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/159311256528.855.11527922406329728512.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/irq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 6510791b9921..8a35f58da689 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -141,7 +141,7 @@ irqreturn_t idxd_misc_thread(int vec, void *data) iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET); if (!err) - return IRQ_HANDLED; + goto out; gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET); if (gensts.state == IDXD_DEVICE_STATE_HALT) { @@ -162,6 +162,7 @@ irqreturn_t idxd_misc_thread(int vec, void *data) spin_unlock_bh(&idxd->dev_lock); } + out: idxd_unmask_msix_vector(idxd, irq_entry->id); return IRQ_HANDLED; } -- Gitee From 363654c31fc58d468ad9cc3488463e818704034a Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 8 Dec 2021 16:23:23 +0800 Subject: [PATCH 0553/2161] dmaengine: idxd: add work queue drain support commit 0d5c10b4c84d6ae6255129e5f16a0d2119c74334 upstream. Add wq drain support. When a wq is being released, it needs to wait for all in-flight operation to complete. A device control function idxd_wq_drain() has been added to facilitate this. A wq drain call is added to the char dev on release to make sure all user operations are complete. A wq drain is also added before the wq is being disabled. A drain command can take an unpredictable period of time. Interrupt support for device commands is added to allow waiting on the command to finish. If a previous command is in progress, the new submitter can block until the current command is finished before proceeding. The interrupt based submission will submit the command and then wait until a command completion interrupt happens to complete. All commands are moved to the interrupt based command submission except for the device reset during probe, which will be polled. Fixes: 42d279f9137a ("dmaengine: idxd: add char driver to expose submission portal to userland") Signed-off-by: Dave Jiang Reviewed-by: Tony Luck Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/159319502515.69593.13451647706946040301.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/cdev.c | 3 + drivers/dma/idxd/device.c | 155 +++++++++++++++++--------------------- drivers/dma/idxd/idxd.h | 11 ++- drivers/dma/idxd/init.c | 14 ++-- drivers/dma/idxd/irq.c | 41 +++++----- drivers/dma/idxd/sysfs.c | 20 +---- 6 files changed, 112 insertions(+), 132 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index cb376cf6a2d2..c3976156db2f 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -115,6 +115,9 @@ static int idxd_cdev_release(struct inode *node, struct file *filep) dev_dbg(dev, "%s called\n", __func__); filep->private_data = NULL; + /* Wait for in-flight operations to complete. 
*/ + idxd_wq_drain(wq); + kfree(ctx); mutex_lock(&wq->wq_lock); idxd_wq_put(wq); diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 01b01e42508f..4cd3c53d8f8f 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -11,8 +11,8 @@ #include "idxd.h" #include "registers.h" -static int idxd_cmd_wait(struct idxd_device *idxd, u32 *status, int timeout); -static int idxd_cmd_send(struct idxd_device *idxd, int cmd_code, u32 operand); +static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, + u32 *status); /* Interrupt control bits */ int idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id) @@ -233,21 +233,13 @@ int idxd_wq_enable(struct idxd_wq *wq) struct idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; u32 status; - int rc; - - lockdep_assert_held(&idxd->dev_lock); if (wq->state == IDXD_WQ_ENABLED) { dev_dbg(dev, "WQ %d already enabled\n", wq->id); return -ENXIO; } - rc = idxd_cmd_send(idxd, IDXD_CMD_ENABLE_WQ, wq->id); - if (rc < 0) - return rc; - rc = idxd_cmd_wait(idxd, &status, IDXD_REG_TIMEOUT); - if (rc < 0) - return rc; + idxd_cmd_exec(idxd, IDXD_CMD_ENABLE_WQ, wq->id, &status); if (status != IDXD_CMDSTS_SUCCESS && status != IDXD_CMDSTS_ERR_WQ_ENABLED) { @@ -265,9 +257,7 @@ int idxd_wq_disable(struct idxd_wq *wq) struct idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; u32 status, operand; - int rc; - lockdep_assert_held(&idxd->dev_lock); dev_dbg(dev, "Disabling WQ %d\n", wq->id); if (wq->state != IDXD_WQ_ENABLED) { @@ -276,12 +266,7 @@ int idxd_wq_disable(struct idxd_wq *wq) } operand = BIT(wq->id % 16) | ((wq->id / 16) << 16); - rc = idxd_cmd_send(idxd, IDXD_CMD_DISABLE_WQ, operand); - if (rc < 0) - return rc; - rc = idxd_cmd_wait(idxd, &status, IDXD_REG_TIMEOUT); - if (rc < 0) - return rc; + idxd_cmd_exec(idxd, IDXD_CMD_DISABLE_WQ, operand, &status); if (status != IDXD_CMDSTS_SUCCESS) { dev_dbg(dev, "WQ disable failed: %#x\n", status); @@ -293,6 +278,22 @@ int idxd_wq_disable(struct idxd_wq *wq) return 0; } +void idxd_wq_drain(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + u32 operand; + + if (wq->state != IDXD_WQ_ENABLED) { + dev_dbg(dev, "WQ %d in wrong state: %d\n", wq->id, wq->state); + return; + } + + dev_dbg(dev, "Draining WQ %d\n", wq->id); + operand = BIT(wq->id % 16) | ((wq->id / 16) << 16); + idxd_cmd_exec(idxd, IDXD_CMD_DRAIN_WQ, operand, NULL); +} + int idxd_wq_map_portal(struct idxd_wq *wq) { struct idxd_device *idxd = wq->idxd; @@ -355,66 +356,79 @@ static inline bool idxd_is_enabled(struct idxd_device *idxd) return false; } -static int idxd_cmd_wait(struct idxd_device *idxd, u32 *status, int timeout) +/* + * This is function is only used for reset during probe and will + * poll for completion. Once the device is setup with interrupts, + * all commands will be done via interrupt completion. 
+ */ +void idxd_device_init_reset(struct idxd_device *idxd) { - u32 sts, to = timeout; - - lockdep_assert_held(&idxd->dev_lock); - sts = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); - while (sts & IDXD_CMDSTS_ACTIVE && --to) { - cpu_relax(); - sts = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); - } + struct device *dev = &idxd->pdev->dev; + union idxd_command_reg cmd; + unsigned long flags; - if (to == 0 && sts & IDXD_CMDSTS_ACTIVE) { - dev_warn(&idxd->pdev->dev, "%s timed out!\n", __func__); - *status = 0; - return -EBUSY; - } + memset(&cmd, 0, sizeof(cmd)); + cmd.cmd = IDXD_CMD_RESET_DEVICE; + dev_dbg(dev, "%s: sending reset for init.\n", __func__); + spin_lock_irqsave(&idxd->dev_lock, flags); + iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET); - *status = sts; - return 0; + while (ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET) & + IDXD_CMDSTS_ACTIVE) + cpu_relax(); + spin_unlock_irqrestore(&idxd->dev_lock, flags); } -static int idxd_cmd_send(struct idxd_device *idxd, int cmd_code, u32 operand) +static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, + u32 *status) { union idxd_command_reg cmd; - int rc; - u32 status; - - lockdep_assert_held(&idxd->dev_lock); - rc = idxd_cmd_wait(idxd, &status, IDXD_REG_TIMEOUT); - if (rc < 0) - return rc; + DECLARE_COMPLETION_ONSTACK(done); + unsigned long flags; memset(&cmd, 0, sizeof(cmd)); cmd.cmd = cmd_code; cmd.operand = operand; + cmd.int_req = 1; + + spin_lock_irqsave(&idxd->dev_lock, flags); + wait_event_lock_irq(idxd->cmd_waitq, + !test_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags), + idxd->dev_lock); + dev_dbg(&idxd->pdev->dev, "%s: sending cmd: %#x op: %#x\n", __func__, cmd_code, operand); + + __set_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags); + idxd->cmd_done = &done; iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET); - return 0; + /* + * After command submitted, release lock and go to sleep until + * the command completes via interrupt. 
+ */ + spin_unlock_irqrestore(&idxd->dev_lock, flags); + wait_for_completion(&done); + spin_lock_irqsave(&idxd->dev_lock, flags); + if (status) + *status = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); + __clear_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags); + /* Wake up other pending commands */ + wake_up(&idxd->cmd_waitq); + spin_unlock_irqrestore(&idxd->dev_lock, flags); } int idxd_device_enable(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; - int rc; u32 status; - lockdep_assert_held(&idxd->dev_lock); if (idxd_is_enabled(idxd)) { dev_dbg(dev, "Device already enabled\n"); return -ENXIO; } - rc = idxd_cmd_send(idxd, IDXD_CMD_ENABLE_DEVICE, 0); - if (rc < 0) - return rc; - rc = idxd_cmd_wait(idxd, &status, IDXD_REG_TIMEOUT); - if (rc < 0) - return rc; + idxd_cmd_exec(idxd, IDXD_CMD_ENABLE_DEVICE, 0, &status); /* If the command is successful or if the device was enabled */ if (status != IDXD_CMDSTS_SUCCESS && @@ -430,58 +444,29 @@ int idxd_device_enable(struct idxd_device *idxd) int idxd_device_disable(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; - int rc; u32 status; - lockdep_assert_held(&idxd->dev_lock); if (!idxd_is_enabled(idxd)) { dev_dbg(dev, "Device is not enabled\n"); return 0; } - rc = idxd_cmd_send(idxd, IDXD_CMD_DISABLE_DEVICE, 0); - if (rc < 0) - return rc; - rc = idxd_cmd_wait(idxd, &status, IDXD_REG_TIMEOUT); - if (rc < 0) - return rc; + idxd_cmd_exec(idxd, IDXD_CMD_DISABLE_DEVICE, 0, &status); /* If the command is successful or if the device was disabled */ if (status != IDXD_CMDSTS_SUCCESS && !(status & IDXD_CMDSTS_ERR_DIS_DEV_EN)) { dev_dbg(dev, "%s: err_code: %#x\n", __func__, status); - rc = -ENXIO; - return rc; + return -ENXIO; } idxd->state = IDXD_DEV_CONF_READY; return 0; } -int __idxd_device_reset(struct idxd_device *idxd) -{ - u32 status; - int rc; - - rc = idxd_cmd_send(idxd, IDXD_CMD_RESET_DEVICE, 0); - if (rc < 0) - return rc; - rc = idxd_cmd_wait(idxd, &status, IDXD_REG_TIMEOUT); - if (rc < 0) - return rc; - - return 0; -} - -int idxd_device_reset(struct idxd_device *idxd) +void idxd_device_reset(struct idxd_device *idxd) { - unsigned long flags; - int rc; - - spin_lock_irqsave(&idxd->dev_lock, flags); - rc = __idxd_device_reset(idxd); - spin_unlock_irqrestore(&idxd->dev_lock, flags); - return rc; + idxd_cmd_exec(idxd, IDXD_CMD_RESET_DEVICE, 0, NULL); } /* Device configuration bits */ diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 7a2bd1b20737..0769354df4cf 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -142,6 +142,7 @@ enum idxd_device_state { enum idxd_device_flag { IDXD_FLAG_CONFIGURABLE = 0, + IDXD_FLAG_CMD_RUNNING, }; struct idxd_device { @@ -158,6 +159,7 @@ struct idxd_device { void __iomem *reg_base; spinlock_t dev_lock; /* spinlock for device */ + struct completion *cmd_done; struct idxd_group *groups; struct idxd_wq *wqs; struct idxd_engine *engines; @@ -180,12 +182,14 @@ struct idxd_device { int nr_tokens; /* non-reserved tokens */ union sw_err_reg sw_err; - + wait_queue_head_t cmd_waitq; struct msix_entry *msix_entries; int num_wq_irqs; struct idxd_irq_entry *irq_entries; struct dma_device dma_dev; + struct workqueue_struct *wq; + struct work_struct work; }; /* IDXD software descriptor */ @@ -273,10 +277,10 @@ int idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id); int idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id); /* device control */ +void idxd_device_init_reset(struct idxd_device *idxd); int idxd_device_enable(struct idxd_device *idxd); int 
idxd_device_disable(struct idxd_device *idxd); -int idxd_device_reset(struct idxd_device *idxd); -int __idxd_device_reset(struct idxd_device *idxd); +void idxd_device_reset(struct idxd_device *idxd); void idxd_device_cleanup(struct idxd_device *idxd); int idxd_device_config(struct idxd_device *idxd); void idxd_device_wqs_clear_state(struct idxd_device *idxd); @@ -286,6 +290,7 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq); void idxd_wq_free_resources(struct idxd_wq *wq); int idxd_wq_enable(struct idxd_wq *wq); int idxd_wq_disable(struct idxd_wq *wq); +void idxd_wq_drain(struct idxd_wq *wq); int idxd_wq_map_portal(struct idxd_wq *wq); void idxd_wq_unmap_portal(struct idxd_wq *wq); void idxd_wq_disable_cleanup(struct idxd_wq *wq); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index b69839a8ac2c..c7c61974f20f 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -146,6 +146,7 @@ static int idxd_setup_internals(struct idxd_device *idxd) struct device *dev = &idxd->pdev->dev; int i; + init_waitqueue_head(&idxd->cmd_waitq); idxd->groups = devm_kcalloc(dev, idxd->max_groups, sizeof(struct idxd_group), GFP_KERNEL); if (!idxd->groups) @@ -182,6 +183,10 @@ static int idxd_setup_internals(struct idxd_device *idxd) idxd->engines[i].id = i; } + idxd->wq = create_workqueue(dev_name(dev)); + if (!idxd->wq) + return -ENOMEM; + return 0; } @@ -277,9 +282,7 @@ static int idxd_probe(struct idxd_device *idxd) int rc; dev_dbg(dev, "%s entered and resetting device\n", __func__); - rc = idxd_device_reset(idxd); - if (rc < 0) - return rc; + idxd_device_init_reset(idxd); dev_dbg(dev, "IDXD reset complete\n"); idxd_read_caps(idxd); @@ -414,11 +417,8 @@ static void idxd_shutdown(struct pci_dev *pdev) int rc, i; struct idxd_irq_entry *irq_entry; int msixcnt = pci_msix_vec_count(pdev); - unsigned long flags; - spin_lock_irqsave(&idxd->dev_lock, flags); rc = idxd_device_disable(idxd); - spin_unlock_irqrestore(&idxd->dev_lock, flags); if (rc) dev_err(&pdev->dev, "Disabling device failed\n"); @@ -434,6 +434,8 @@ static void idxd_shutdown(struct pci_dev *pdev) idxd_flush_pending_llist(irq_entry); idxd_flush_work_list(irq_entry); } + + destroy_workqueue(idxd->wq); } static void idxd_remove(struct pci_dev *pdev) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 8a35f58da689..b7db8a0e1773 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -23,16 +23,13 @@ void idxd_device_wqs_clear_state(struct idxd_device *idxd) } } -static int idxd_restart(struct idxd_device *idxd) +static void idxd_device_reinit(struct work_struct *work) { - int i, rc; - - lockdep_assert_held(&idxd->dev_lock); - - rc = __idxd_device_reset(idxd); - if (rc < 0) - goto out; + struct idxd_device *idxd = container_of(work, struct idxd_device, work); + struct device *dev = &idxd->pdev->dev; + int rc, i; + idxd_device_reset(idxd); rc = idxd_device_config(idxd); if (rc < 0) goto out; @@ -47,19 +44,16 @@ static int idxd_restart(struct idxd_device *idxd) if (wq->state == IDXD_WQ_ENABLED) { rc = idxd_wq_enable(wq); if (rc < 0) { - dev_warn(&idxd->pdev->dev, - "Unable to re-enable wq %s\n", + dev_warn(dev, "Unable to re-enable wq %s\n", dev_name(&wq->conf_dev)); } } } - return 0; + return; out: idxd_device_wqs_clear_state(idxd); - idxd->state = IDXD_DEV_HALTED; - return rc; } irqreturn_t idxd_irq_handler(int vec, void *data) @@ -78,7 +72,7 @@ irqreturn_t idxd_misc_thread(int vec, void *data) struct device *dev = &idxd->pdev->dev; union gensts_reg gensts; u32 cause, val = 0; - int i, rc; + int i; 
bool err = false; cause = ioread32(idxd->reg_base + IDXD_INTCAUSE_OFFSET); @@ -117,8 +111,8 @@ irqreturn_t idxd_misc_thread(int vec, void *data) } if (cause & IDXD_INTC_CMD) { - /* Driver does use command interrupts */ val |= IDXD_INTC_CMD; + complete(idxd->cmd_done); } if (cause & IDXD_INTC_OCCUPY) { @@ -145,21 +139,24 @@ irqreturn_t idxd_misc_thread(int vec, void *data) gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET); if (gensts.state == IDXD_DEVICE_STATE_HALT) { - spin_lock_bh(&idxd->dev_lock); + idxd->state = IDXD_DEV_HALTED; if (gensts.reset_type == IDXD_DEVICE_RESET_SOFTWARE) { - rc = idxd_restart(idxd); - if (rc < 0) - dev_err(&idxd->pdev->dev, - "idxd restart failed, device halt."); + /* + * If we need a software reset, we will throw the work + * on a system workqueue in order to allow interrupts + * for the device command completions. + */ + INIT_WORK(&idxd->work, idxd_device_reinit); + queue_work(idxd->wq, &idxd->work); } else { + spin_lock_bh(&idxd->dev_lock); idxd_device_wqs_clear_state(idxd); - idxd->state = IDXD_DEV_HALTED; dev_err(&idxd->pdev->dev, "idxd halted, need %s.\n", gensts.reset_type == IDXD_DEVICE_RESET_FLR ? "FLR" : "system reset"); + spin_unlock_bh(&idxd->dev_lock); } - spin_unlock_bh(&idxd->dev_lock); } out: diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 2e2c5082f322..e66927020125 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -118,12 +118,11 @@ static int idxd_config_bus_probe(struct device *dev) if (!try_module_get(THIS_MODULE)) return -ENXIO; - spin_lock_irqsave(&idxd->dev_lock, flags); - /* Perform IDXD configuration and enabling */ + spin_lock_irqsave(&idxd->dev_lock, flags); rc = idxd_device_config(idxd); + spin_unlock_irqrestore(&idxd->dev_lock, flags); if (rc < 0) { - spin_unlock_irqrestore(&idxd->dev_lock, flags); module_put(THIS_MODULE); dev_warn(dev, "Device config failed: %d\n", rc); return rc; @@ -132,18 +131,15 @@ static int idxd_config_bus_probe(struct device *dev) /* start device */ rc = idxd_device_enable(idxd); if (rc < 0) { - spin_unlock_irqrestore(&idxd->dev_lock, flags); module_put(THIS_MODULE); dev_warn(dev, "Device enable failed: %d\n", rc); return rc; } - spin_unlock_irqrestore(&idxd->dev_lock, flags); dev_info(dev, "Device %s enabled\n", dev_name(dev)); rc = idxd_register_dma_device(idxd); if (rc < 0) { - spin_unlock_irqrestore(&idxd->dev_lock, flags); module_put(THIS_MODULE); dev_dbg(dev, "Failed to register dmaengine device\n"); return rc; @@ -188,8 +184,8 @@ static int idxd_config_bus_probe(struct device *dev) spin_lock_irqsave(&idxd->dev_lock, flags); rc = idxd_device_config(idxd); + spin_unlock_irqrestore(&idxd->dev_lock, flags); if (rc < 0) { - spin_unlock_irqrestore(&idxd->dev_lock, flags); mutex_unlock(&wq->wq_lock); dev_warn(dev, "Writing WQ %d config failed: %d\n", wq->id, rc); @@ -198,13 +194,11 @@ static int idxd_config_bus_probe(struct device *dev) rc = idxd_wq_enable(wq); if (rc < 0) { - spin_unlock_irqrestore(&idxd->dev_lock, flags); mutex_unlock(&wq->wq_lock); dev_warn(dev, "WQ %d enabling failed: %d\n", wq->id, rc); return rc; } - spin_unlock_irqrestore(&idxd->dev_lock, flags); rc = idxd_wq_map_portal(wq); if (rc < 0) { @@ -212,7 +206,6 @@ static int idxd_config_bus_probe(struct device *dev) rc = idxd_wq_disable(wq); if (rc < 0) dev_warn(dev, "IDXD wq disable failed\n"); - spin_unlock_irqrestore(&idxd->dev_lock, flags); mutex_unlock(&wq->wq_lock); return rc; } @@ -248,7 +241,6 @@ static void disable_wq(struct idxd_wq *wq) { struct idxd_device *idxd = 
wq->idxd; struct device *dev = &idxd->pdev->dev; - unsigned long flags; int rc; mutex_lock(&wq->wq_lock); @@ -269,9 +261,8 @@ static void disable_wq(struct idxd_wq *wq) idxd_wq_unmap_portal(wq); - spin_lock_irqsave(&idxd->dev_lock, flags); + idxd_wq_drain(wq); rc = idxd_wq_disable(wq); - spin_unlock_irqrestore(&idxd->dev_lock, flags); idxd_wq_free_resources(wq); wq->client_count = 0; @@ -287,7 +278,6 @@ static void disable_wq(struct idxd_wq *wq) static int idxd_config_bus_remove(struct device *dev) { int rc; - unsigned long flags; dev_dbg(dev, "%s called for %s\n", __func__, dev_name(dev)); @@ -313,14 +303,12 @@ static int idxd_config_bus_remove(struct device *dev) } idxd_unregister_dma_device(idxd); - spin_lock_irqsave(&idxd->dev_lock, flags); rc = idxd_device_disable(idxd); for (i = 0; i < idxd->max_wqs; i++) { struct idxd_wq *wq = &idxd->wqs[i]; idxd_wq_disable_cleanup(wq); } - spin_unlock_irqrestore(&idxd->dev_lock, flags); module_put(THIS_MODULE); if (rc < 0) dev_warn(dev, "Device disable failed\n"); -- Gitee From 910a91a3ad134193afe19994df954ee05504859b Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 26 Jun 2020 11:12:56 -0700 Subject: [PATCH 0554/2161] dmaengine: idxd: move idxd interrupt handling to mask instead of ignore commit 4548a6ad3d50c398aa12fa3ad45dd0611328f13b upstream. Switch driver to use MSIX mask and unmask instead of the ignore bit. When ignore bit is cleared, we must issue an MMIO read to ensure writes have all arrived and check and process any additional completions. The ignore bit does not queue up any pending MSIX interrupts. The mask bit however does. Use API call from interrupt subsystem to mask MSIX interrupt since the hardware does not have convenient mask bit register. Suggested-by: Ashok Raj Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/159319517621.70410.11816465052708900506.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 53 ++++++++------------------------------- drivers/dma/idxd/idxd.h | 4 +-- drivers/dma/idxd/irq.c | 2 -- 3 files changed, 13 insertions(+), 46 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 4cd3c53d8f8f..14b45853aa5f 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include "../dmaengine.h" #include "idxd.h" @@ -15,61 +17,28 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, u32 *status); /* Interrupt control bits */ -int idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id) +void idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id) { - struct pci_dev *pdev = idxd->pdev; - int msixcnt = pci_msix_vec_count(pdev); - union msix_perm perm; - u32 offset; - - if (vec_id < 0 || vec_id >= msixcnt) - return -EINVAL; - - offset = idxd->msix_perm_offset + vec_id * 8; - perm.bits = ioread32(idxd->reg_base + offset); - perm.ignore = 1; - iowrite32(perm.bits, idxd->reg_base + offset); + struct irq_data *data = irq_get_irq_data(idxd->msix_entries[vec_id].vector); - return 0; + pci_msi_mask_irq(data); } void idxd_mask_msix_vectors(struct idxd_device *idxd) { struct pci_dev *pdev = idxd->pdev; int msixcnt = pci_msix_vec_count(pdev); - int i, rc; + int i; - for (i = 0; i < msixcnt; i++) { - rc = idxd_mask_msix_vector(idxd, i); - if (rc < 0) - dev_warn(&pdev->dev, - "Failed disabling msix vec %d\n", i); - } + for (i = 0; i < msixcnt; i++) + idxd_mask_msix_vector(idxd, i); } 
-int idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id) +void idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id) { - struct pci_dev *pdev = idxd->pdev; - int msixcnt = pci_msix_vec_count(pdev); - union msix_perm perm; - u32 offset; - - if (vec_id < 0 || vec_id >= msixcnt) - return -EINVAL; - - offset = idxd->msix_perm_offset + vec_id * 8; - perm.bits = ioread32(idxd->reg_base + offset); - perm.ignore = 0; - iowrite32(perm.bits, idxd->reg_base + offset); + struct irq_data *data = irq_get_irq_data(idxd->msix_entries[vec_id].vector); - /* - * A readback from the device ensures that any previously generated - * completion record writes are visible to software based on PCI - * ordering rules. - */ - perm.bits = ioread32(idxd->reg_base + offset); - - return 0; + pci_msi_unmask_irq(data); } void idxd_unmask_error_interrupts(struct idxd_device *idxd) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 0769354df4cf..e62b4799d189 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -273,8 +273,8 @@ irqreturn_t idxd_wq_thread(int irq, void *data); void idxd_mask_error_interrupts(struct idxd_device *idxd); void idxd_unmask_error_interrupts(struct idxd_device *idxd); void idxd_mask_msix_vectors(struct idxd_device *idxd); -int idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id); -int idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id); +void idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id); +void idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id); /* device control */ void idxd_device_init_reset(struct idxd_device *idxd); diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index b7db8a0e1773..b5142556cc4e 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -261,8 +261,6 @@ irqreturn_t idxd_wq_thread(int irq, void *data) processed = idxd_desc_process(irq_entry); idxd_unmask_msix_vector(irq_entry->idxd, irq_entry->id); - /* catch anything unprocessed after unmasking */ - processed += idxd_desc_process(irq_entry); if (processed == 0) return IRQ_NONE; -- Gitee From b84d873571bcfb0ed6bfa399775ed58fed498a01 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 13 Jul 2020 23:35:39 -0700 Subject: [PATCH 0555/2161] dmaengine: idxd: fix PCI_MSI build errors commit d6a7bb869dd8a516901591136a9a895fd829d6c6 upstream. 
Fix build errors when CONFIG_PCI_MSI is not enabled by making the driver depend on PCI_MSI: ld: drivers/dma/idxd/device.o: in function `idxd_mask_msix_vector': device.c:(.text+0x26f): undefined reference to `pci_msi_mask_irq' ld: drivers/dma/idxd/device.o: in function `idxd_unmask_msix_vector': device.c:(.text+0x2af): undefined reference to `pci_msi_unmask_irq' Signed-off-by: Randy Dunlap Cc: Dave Jiang Cc: dmaengine@vger.kernel.org Cc: Vinod Koul Link: https://lore.kernel.org/r/9dee3f46-70d9-ea75-10cb-5527ab297d1d@infradead.org Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 0343da284d54..316f4dbc5909 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -276,6 +276,7 @@ config INTEL_IDMA64 config INTEL_IDXD tristate "Intel Data Accelerators support" depends on PCI && X86_64 + depends on PCI_MSI depends on SBITMAP select DMA_ENGINE help -- Gitee From 0c0d7080ce51b7ac3001e803a5df4f2a36731ece Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Jul 2020 19:51:18 -0700 Subject: [PATCH 0556/2161] dmaengine: linux/dmaengine.h: drop duplicated word in a comment commit 2be90e914c12bdea70b0bd297d5715ee66eb46d0 upstream. Drop the doubled word "has" in a comment. Signed-off-by: Randy Dunlap Cc: Vinod Koul Cc: dmaengine@vger.kernel.org Link: https://lore.kernel.org/r/06e64046-ebf1-15db-dbaf-73698de3b493@infradead.org Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/dmaengine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index bbfa6b78b22a..dc4cf059e98a 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -164,7 +164,7 @@ struct dma_interleaved_template { * @DMA_PREP_INTERRUPT - trigger an interrupt (callback) upon completion of * this transaction * @DMA_CTRL_ACK - if clear, the descriptor cannot be reused until the client - * acknowledges receipt, i.e. has has a chance to establish any dependency + * acknowledges receipt, i.e. has a chance to establish any dependency * chains * @DMA_PREP_PQ_DISABLE_P - prevent generation of P while generating Q * @DMA_PREP_PQ_DISABLE_Q - prevent generation of Q while generating P -- Gitee From f647c7360562ac8a88438e1e026548f273aa1de8 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Thu, 23 Jul 2020 03:58:41 +0300 Subject: [PATCH 0557/2161] dmaengine: Introduce min burst length capability commit d97758e048e5fe91c7d8ff9ce5f030ee88d92161 upstream. Some hardware aside from default 0/1 may have greater minimum burst transactions length constraints. Here we introduce the DMA device and slave capability, which if required can be initialized by the DMA engine driver with the device-specific value. 
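For illustration only (not part of this patch), a prospective consumer picks the new field up through dma_get_slave_caps() and clamps its requested burst length against it; a minimal sketch:

#include <linux/dmaengine.h>

/* Honour the channel's minimum burst length. A min_burst of 0 means no
 * constraint was reported, so the caller's value is kept.
 */
static u32 example_pick_burst(struct dma_chan *chan, u32 requested)
{
        struct dma_slave_caps caps;

        if (dma_get_slave_caps(chan, &caps) < 0)
                return requested;       /* capabilities unknown, leave as-is */

        if (caps.min_burst > requested)
                requested = caps.min_burst;

        return requested;
}
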
Signed-off-by: Serge Semin Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200723005848.31907-4-Sergey.Semin@baikalelectronics.ru Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 1 + include/linux/dmaengine.h | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 2b06a7a8629d..2f1a7c0c5446 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -592,6 +592,7 @@ int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps) caps->src_addr_widths = device->src_addr_widths; caps->dst_addr_widths = device->dst_addr_widths; caps->directions = device->directions; + caps->min_burst = device->min_burst; caps->max_burst = device->max_burst; caps->residue_granularity = device->residue_granularity; caps->descriptor_reuse = device->descriptor_reuse; diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index dc4cf059e98a..a78312449260 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -467,6 +467,7 @@ enum dma_residue_granularity { * Since the enum dma_transfer_direction is not defined as bit flag for * each type, the dma controller should set BIT() and same * should be checked by controller as well + * @min_burst: min burst capability per-transfer * @max_burst: max burst capability per-transfer * @cmd_pause: true, if pause is supported (i.e. for reading residue or * for resume later) @@ -480,6 +481,7 @@ struct dma_slave_caps { u32 src_addr_widths; u32 dst_addr_widths; u32 directions; + u32 min_burst; u32 max_burst; bool cmd_pause; bool cmd_resume; @@ -770,6 +772,7 @@ struct dma_filter { * Since the enum dma_transfer_direction is not defined as bit flag for * each type, the dma controller should set BIT() and same * should be checked by controller as well + * @min_burst: min burst capability per-transfer * @max_burst: max burst capability per-transfer * @residue_granularity: granularity of the transfer residue reported * by tx_status @@ -840,6 +843,7 @@ struct dma_device { u32 src_addr_widths; u32 dst_addr_widths; u32 directions; + u32 min_burst; u32 max_burst; bool descriptor_reuse; enum dma_residue_granularity residue_granularity; -- Gitee From 965989ab97012d5d854dedcbe5d1efb89f6fe541 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Thu, 23 Jul 2020 03:58:42 +0300 Subject: [PATCH 0558/2161] dmaengine: Introduce max SG burst capability commit b1b40b8fe7e8fb26e33bad1766ce322d2c63a6c7 upstream. Some devices may lack the support of the hardware accelerated SG list entries automatic walking through and execution. In this case a burden of the SG list traversal and DMA engine re-initialization lies on the DMA engine driver (normally implemented by using a DMA transfer completion IRQ to recharge the DMA device with a next SG list entry). But such solution may not be suitable for some DMA consumers. In particular SPI devices need both Tx and Rx DMA channels work synchronously in order to avoid the Rx FIFO overflow. In case if Rx DMA channel is paused for some time while the Tx DMA channel works implicitly pulling data into the Rx FIFO, the later will be eventually overflown, which will cause the data loss. So if SG list entries aren't automatically fetched by the DMA engine, but are one-by-one manually selected for execution in the ISRs/deferred work/etc., such problem will eventually happen due to the non-deterministic latencies of the service execution. 
In order to let the DMA consumer know about the DMA device capabilities regarding the hardware accelerated SG list traversal we introduce the max_sg_burst capability. It is supposed to be initialized by the DMA engine driver with 0 if there is no limitation of the number of SG entries atomically executed and with non-zero value if there is such constraints, so the upper limit is determined by the number set to the property. Suggested-by: Andy Shevchenko Signed-off-by: Serge Semin Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200723005848.31907-5-Sergey.Semin@baikalelectronics.ru Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 1 + include/linux/dmaengine.h | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 2f1a7c0c5446..8177f78faeda 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -594,6 +594,7 @@ int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps) caps->directions = device->directions; caps->min_burst = device->min_burst; caps->max_burst = device->max_burst; + caps->max_sg_burst = device->max_sg_burst; caps->residue_granularity = device->residue_granularity; caps->descriptor_reuse = device->descriptor_reuse; caps->cmd_pause = !!device->device_pause; diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index a78312449260..6cd68fe769d0 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -469,6 +469,9 @@ enum dma_residue_granularity { * should be checked by controller as well * @min_burst: min burst capability per-transfer * @max_burst: max burst capability per-transfer + * @max_sg_burst: max number of SG list entries executed in a single burst + * DMA tansaction with no software intervention for reinitialization. + * Zero value means unlimited number of entries. * @cmd_pause: true, if pause is supported (i.e. for reading residue or * for resume later) * @cmd_resume: true, if resume is supported @@ -483,6 +486,7 @@ struct dma_slave_caps { u32 directions; u32 min_burst; u32 max_burst; + u32 max_sg_burst; bool cmd_pause; bool cmd_resume; bool cmd_terminate; @@ -774,6 +778,9 @@ struct dma_filter { * should be checked by controller as well * @min_burst: min burst capability per-transfer * @max_burst: max burst capability per-transfer + * @max_sg_burst: max number of SG list entries executed in a single burst + * DMA tansaction with no software intervention for reinitialization. + * Zero value means unlimited number of entries. * @residue_granularity: granularity of the transfer residue reported * by tx_status * @device_alloc_chan_resources: allocate resources and return the @@ -845,6 +852,7 @@ struct dma_device { u32 directions; u32 min_burst; u32 max_burst; + u32 max_sg_burst; bool descriptor_reuse; enum dma_residue_granularity residue_granularity; -- Gitee From 40a5a9831e29c758207c0399699ae62195edb962 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Thu, 23 Jul 2020 03:58:43 +0300 Subject: [PATCH 0559/2161] dmaengine: Introduce DMA-device device_caps callback commit 3b6d694eb3eebd86ec44a119e730943ac8e03a6b upstream. There are DMA devices (like ours version of Synopsys DW DMAC) which have DMA capabilities non-uniformly redistributed between the device channels. In order to provide a way of exposing the channel-specific parameters to the DMA engine consumers, we introduce a new DMA-device callback. 
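As a sketch only (hypothetical driver types, not taken from this patch), a provider with non-uniform channels could wire the callback up roughly as follows, overriding just the fields that really differ per channel:

#include <linux/dmaengine.h>
#include <linux/kernel.h>

/* Hypothetical per-channel wrapper carrying channel-specific limits. */
struct example_chan {
        struct dma_chan chan;
        u32 max_burst;
        u32 max_sg_burst;
};

static void example_device_caps(struct dma_chan *chan,
                                struct dma_slave_caps *caps)
{
        struct example_chan *ec = container_of(chan, struct example_chan, chan);

        caps->max_burst = ec->max_burst;
        caps->max_sg_burst = ec->max_sg_burst;
}

/* At probe time, after the generic dma_device fields are filled in:
 *      dma_dev->device_caps = example_device_caps;
 */
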
In case if provided it gets called from the dma_get_slave_caps() method and is able to override the generic DMA-device capabilities. Signed-off-by: Serge Semin Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200723005848.31907-6-Sergey.Semin@baikalelectronics.ru Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 10 ++++++++++ include/linux/dmaengine.h | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 8177f78faeda..a53e71d2bbd4 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -601,6 +601,16 @@ int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps) caps->cmd_resume = !!device->device_resume; caps->cmd_terminate = !!device->device_terminate_all; + /* + * DMA engine device might be configured with non-uniformly + * distributed slave capabilities per device channels. In this + * case the corresponding driver may provide the device_caps + * callback to override the generic capabilities with + * channel-specific ones. + */ + if (device->device_caps) + device->device_caps(chan, caps); + return 0; } EXPORT_SYMBOL_GPL(dma_get_slave_caps); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 6cd68fe769d0..95ee81bd3867 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -800,6 +800,8 @@ struct dma_filter { * be called after period_len bytes have been transferred. * @device_prep_interleaved_dma: Transfer expression in a generic way. * @device_prep_dma_imm_data: DMA's 8 byte immediate data to the dst address + * @device_caps: May be used to override the generic DMA slave capabilities + * with per-channel specific ones * @device_config: Pushes a new configuration to a channel, return 0 or an error * code * @device_pause: Pauses any transfer happening on a channel. Returns @@ -900,6 +902,8 @@ struct dma_device { struct dma_chan *chan, dma_addr_t dst, u64 data, unsigned long flags); + void (*device_caps)(struct dma_chan *chan, + struct dma_slave_caps *caps); int (*device_config)(struct dma_chan *chan, struct dma_slave_config *config); int (*device_pause)(struct dma_chan *chan); -- Gitee From 0abe4862fa37dcb1fd9e28ac873ef47e718208c7 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 20 Jul 2020 08:50:58 -0700 Subject: [PATCH 0560/2161] dmaengine: idxd: add missing invalid flags field to completion commit 1a0c02ba643ed05c07ddf14d87c3bec640666836 upstream. Add missing "invalid flags" field to completion record struct. Reported-by: Nikhil Rao Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/159526025819.49266.13176787210106133664.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/idxd.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index e103c1434e4b..fdcdfe414223 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -181,6 +181,12 @@ struct dsa_completion_record { uint32_t bytes_completed; uint64_t fault_addr; union { + /* common record */ + struct { + uint32_t invalid_flags:24; + uint32_t rsvd2:8; + }; + uint16_t delta_rec_size; uint16_t crc_val; -- Gitee From 02f937f850c48a8395fcf1fa0a3decbed4fab694 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 27 Jul 2020 09:37:11 -0700 Subject: [PATCH 0561/2161] dmaengine: idxd: reset states after device disable or reset commit df841b17e809f48f740cd2dd8b63543073c91a02 upstream. 
The state for WQs should be reset to disabled when a device is reset or disabled. Fixes: da32b28c95a7 ("dmaengine: idxd: cleanup workqueue config after disabling") Reported-by: Mona Hossain Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/159586777684.27150.17589406415773568534.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 26 ++++++++++++++++++++++++++ drivers/dma/idxd/irq.c | 12 ------------ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 14b45853aa5f..b75d699160bf 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -410,10 +410,27 @@ int idxd_device_enable(struct idxd_device *idxd) return 0; } +void idxd_device_wqs_clear_state(struct idxd_device *idxd) +{ + int i; + + lockdep_assert_held(&idxd->dev_lock); + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = &idxd->wqs[i]; + + if (wq->state == IDXD_WQ_ENABLED) { + idxd_wq_disable_cleanup(wq); + wq->state = IDXD_WQ_DISABLED; + } + } +} + int idxd_device_disable(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; u32 status; + unsigned long flags; if (!idxd_is_enabled(idxd)) { dev_dbg(dev, "Device is not enabled\n"); @@ -429,13 +446,22 @@ int idxd_device_disable(struct idxd_device *idxd) return -ENXIO; } + spin_lock_irqsave(&idxd->dev_lock, flags); + idxd_device_wqs_clear_state(idxd); idxd->state = IDXD_DEV_CONF_READY; + spin_unlock_irqrestore(&idxd->dev_lock, flags); return 0; } void idxd_device_reset(struct idxd_device *idxd) { + unsigned long flags; + idxd_cmd_exec(idxd, IDXD_CMD_RESET_DEVICE, 0, NULL); + spin_lock_irqsave(&idxd->dev_lock, flags); + idxd_device_wqs_clear_state(idxd); + idxd->state = IDXD_DEV_CONF_READY; + spin_unlock_irqrestore(&idxd->dev_lock, flags); } /* Device configuration bits */ diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index b5142556cc4e..1e9e6991f543 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -11,18 +11,6 @@ #include "idxd.h" #include "registers.h" -void idxd_device_wqs_clear_state(struct idxd_device *idxd) -{ - int i; - - lockdep_assert_held(&idxd->dev_lock); - for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; - - wq->state = IDXD_WQ_DISABLED; - } -} - static void idxd_device_reinit(struct work_struct *work) { struct idxd_device *idxd = container_of(work, struct idxd_device, work); -- Gitee From cd0f1a8acfbb5ca0494cc08a224a4e700bc8c5e5 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 29 Jul 2020 08:57:26 -0700 Subject: [PATCH 0562/2161] dmaengine: idxd: clear misc interrupt cause after read commit 0ec083e50ca816953e65e3209c7199cd1a203ddc upstream. Move the clearing of misc interrupt cause to immediately after reading the register in order to allow the next interrupt to be asserted. 
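For illustration only (not part of this patch), the resulting read-then-acknowledge ordering looks like the sketch below; the register offset is a placeholder, not the device's real one:

#include <linux/io.h>
#include <linux/types.h>

#define EXAMPLE_INTCAUSE_OFFSET 0x30    /* placeholder offset for this sketch */

/* Read the cause bits and write them straight back to acknowledge them, so
 * a new event can assert a fresh interrupt while this one is still being
 * serviced.
 */
static u32 example_read_and_ack_cause(void __iomem *reg_base)
{
        u32 cause = ioread32(reg_base + EXAMPLE_INTCAUSE_OFFSET);

        iowrite32(cause, reg_base + EXAMPLE_INTCAUSE_OFFSET);
        return cause;
}
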
Suggested-by: Nikhil Rao Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/159603824665.28647.5344356370364397996.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 1e9e6991f543..17a65a13fb64 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -64,6 +64,7 @@ irqreturn_t idxd_misc_thread(int vec, void *data) bool err = false; cause = ioread32(idxd->reg_base + IDXD_INTCAUSE_OFFSET); + iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET); if (cause & IDXD_INTC_ERR) { spin_lock_bh(&idxd->dev_lock); @@ -121,7 +122,6 @@ irqreturn_t idxd_misc_thread(int vec, void *data) dev_warn_once(dev, "Unexpected interrupt cause bits set: %#x\n", val); - iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET); if (!err) goto out; -- Gitee From 55842a516dc2c7fb89a1c4a4ac1987abeb5f1c4c Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Fri, 28 Aug 2020 14:05:07 +0300 Subject: [PATCH 0563/2161] dmaengine: Mark dma_request_slave_channel() deprecated commit 7547dbd3b198f309aaff54e3528898a8a196faff upstream. New drivers should use dma_request_chan() instead dma_request_slave_channel() dma_request_slave_channel() is a simple wrapper for dma_request_chan() eating up the error code for channel request failure and makes deferred probing impossible. Move the dma_request_slave_channel() into the header as inline function, mark it as deprecated. Signed-off-by: Peter Ujfalusi Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200828110507.22407-1-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 18 ------------------ include/linux/dmaengine.h | 15 +++++++++------ 2 files changed, 9 insertions(+), 24 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index a53e71d2bbd4..ac8ef6cf7626 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -871,24 +871,6 @@ struct dma_chan *dma_request_chan(struct device *dev, const char *name) } EXPORT_SYMBOL_GPL(dma_request_chan); -/** - * dma_request_slave_channel - try to allocate an exclusive slave channel - * @dev: pointer to client device structure - * @name: slave channel name - * - * Returns pointer to appropriate DMA channel on success or NULL. 
- */ -struct dma_chan *dma_request_slave_channel(struct device *dev, - const char *name) -{ - struct dma_chan *ch = dma_request_chan(dev, name); - if (IS_ERR(ch)) - return NULL; - - return ch; -} -EXPORT_SYMBOL_GPL(dma_request_slave_channel); - /** * dma_request_chan_by_mask - allocate a channel satisfying certain capabilities * @mask: capabilities that the channel must satisfy diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 95ee81bd3867..b99e78ddc930 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1460,7 +1460,6 @@ void dma_issue_pending_all(void); struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param, struct device_node *np); -struct dma_chan *dma_request_slave_channel(struct device *dev, const char *name); struct dma_chan *dma_request_chan(struct device *dev, const char *name); struct dma_chan *dma_request_chan_by_mask(const dma_cap_mask_t *mask); @@ -1490,11 +1489,6 @@ static inline struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask, { return NULL; } -static inline struct dma_chan *dma_request_slave_channel(struct device *dev, - const char *name) -{ - return NULL; -} static inline struct dma_chan *dma_request_chan(struct device *dev, const char *name) { @@ -1568,6 +1562,15 @@ void dma_run_dependencies(struct dma_async_tx_descriptor *tx); #define dma_request_slave_channel_compat(mask, x, y, dev, name) \ __dma_request_slave_channel_compat(&(mask), x, y, dev, name) +/* Deprecated, please use dma_request_chan() directly */ +static inline struct dma_chan * __deprecated +dma_request_slave_channel(struct device *dev, const char *name) +{ + struct dma_chan *ch = dma_request_chan(dev, name); + + return IS_ERR(ch) ? NULL : ch; +} + static inline struct dma_chan *__dma_request_slave_channel_compat(const dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param, -- Gitee From 7e1d8e55f86656777a14d6860c058b1abb43760e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 28 Aug 2020 17:45:19 +0300 Subject: [PATCH 0564/2161] dmaengine: Save few bytes and increase readability of dma_request_chan() commit 5d7e816e444569c9fdd901ecf0e3933a6ed9a501 upstream. Split IS_ERR_OR_NULL() check followed by additional conditional to two simple conditionals. This increases readability and saves memory: Function old new delta dma_request_chan 700 697 -3 Total: Before=10224, After=10221, chg -0.03% Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200828144519.14483-1-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index ac8ef6cf7626..7974fa0400d8 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -847,8 +847,10 @@ struct dma_chan *dma_request_chan(struct device *dev, const char *name) } mutex_unlock(&dma_list_mutex); - if (IS_ERR_OR_NULL(chan)) - return chan ? chan : ERR_PTR(-EPROBE_DEFER); + if (IS_ERR(chan)) + return chan; + if (!chan) + return ERR_PTR(-EPROBE_DEFER); found: #ifdef CONFIG_DEBUG_FS -- Gitee From f12cfdd6692478d97d572dec44b0970516ae6a85 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 28 Aug 2020 15:12:10 -0700 Subject: [PATCH 0565/2161] dmaengine: idxd: add support for configurable max wq xfer size commit d7aad5550eca50370e3a1471b46281d03af0699e upstream. 
Add sysfs attribute max_xfer_size to wq in order to allow the max xfer size configured on a per wq basis. Add support code to configure the valid user input on wq enable. This is a performance tuning parameter. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/159865265404.29141.3049399618578194052.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../ABI/stable/sysfs-driver-dma-idxd | 7 ++++ drivers/dma/idxd/device.c | 2 +- drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/init.c | 1 + drivers/dma/idxd/sysfs.c | 40 +++++++++++++++++++ 5 files changed, 50 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd index 1af9c4175213..452f353b4748 100644 --- a/Documentation/ABI/stable/sysfs-driver-dma-idxd +++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd @@ -170,6 +170,13 @@ Contact: dmaengine@vger.kernel.org Description: The number of entries in this work queue that may be filled via a limited portal. +What: /sys/bus/dsa/devices/wq./max_transfer_size +Date: Aug 28, 2020 +KernelVersion: 5.10.0 +Contact: dmaengine@vger.kernel.org +Description: The max transfer sized for this workqueue. Cannot exceed device + max transfer size. Configurable parameter. + What: /sys/bus/dsa/devices/engine./group_id Date: Oct 25, 2019 KernelVersion: 5.6.0 diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index b75d699160bf..5a8509bb9ad4 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -555,7 +555,7 @@ static int idxd_wq_config_write(struct idxd_wq *wq) wq->wqcfg.priority = wq->priority; /* bytes 12-15 */ - wq->wqcfg.max_xfer_shift = idxd->hw.gen_cap.max_xfer_shift; + wq->wqcfg.max_xfer_shift = ilog2(wq->max_xfer_bytes); wq->wqcfg.max_batch_shift = idxd->hw.gen_cap.max_batch_shift; dev_dbg(dev, "WQ %d CFGs\n", wq->id); diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index e62b4799d189..81db2a472822 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -114,6 +114,7 @@ struct idxd_wq { struct sbitmap_queue sbq; struct dma_chan dma_chan; char name[WQ_NAME_SIZE + 1]; + u64 max_xfer_bytes; }; struct idxd_engine { diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index c7c61974f20f..e5ed5750a6d0 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -176,6 +176,7 @@ static int idxd_setup_internals(struct idxd_device *idxd) wq->idxd = idxd; mutex_init(&wq->wq_lock); wq->idxd_cdev.minor = -1; + wq->max_xfer_bytes = idxd->max_xfer_bytes; } for (i = 0; i < idxd->max_engines; i++) { diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index e66927020125..e84691f42db7 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1062,6 +1062,45 @@ static ssize_t wq_cdev_minor_show(struct device *dev, static struct device_attribute dev_attr_wq_cdev_minor = __ATTR(cdev_minor, 0444, wq_cdev_minor_show, NULL); +static ssize_t wq_max_transfer_size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + return sprintf(buf, "%llu\n", wq->max_xfer_bytes); +} + +static ssize_t wq_max_transfer_size_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_device *idxd = wq->idxd; + u64 xfer_size; + int rc; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + 
+ rc = kstrtou64(buf, 0, &xfer_size); + if (rc < 0) + return -EINVAL; + + if (xfer_size == 0) + return -EINVAL; + + xfer_size = roundup_pow_of_two(xfer_size); + if (xfer_size > idxd->max_xfer_bytes) + return -EINVAL; + + wq->max_xfer_bytes = xfer_size; + + return count; +} + +static struct device_attribute dev_attr_wq_max_transfer_size = + __ATTR(max_transfer_size, 0644, + wq_max_transfer_size_show, wq_max_transfer_size_store); + static struct attribute *idxd_wq_attributes[] = { &dev_attr_wq_clients.attr, &dev_attr_wq_state.attr, @@ -1072,6 +1111,7 @@ static struct attribute *idxd_wq_attributes[] = { &dev_attr_wq_type.attr, &dev_attr_wq_name.attr, &dev_attr_wq_cdev_minor.attr, + &dev_attr_wq_max_transfer_size.attr, NULL, }; -- Gitee From 7a7f2309aae48d594dec06ab9bd16e85b91bf23c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 28 Aug 2020 15:12:50 -0700 Subject: [PATCH 0566/2161] dmaengine: idxd: add support for configurable max wq batch size commit d7aad5550eca50370e3a1471b46281d03af0699e upstream. Add sysfs attribute max_batch_size to wq in order to allow the max batch size configured on a per wq basis. Add support code to configure the valid user input on wq enable. This is a performance tuning parameter. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/159865273617.29141.4383066301730821749.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../ABI/stable/sysfs-driver-dma-idxd | 7 +++ drivers/dma/idxd/device.c | 2 +- drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/init.c | 1 + drivers/dma/idxd/sysfs.c | 57 +++++++++++++++++-- 5 files changed, 61 insertions(+), 7 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd index 452f353b4748..a2bc4afbe308 100644 --- a/Documentation/ABI/stable/sysfs-driver-dma-idxd +++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd @@ -177,6 +177,13 @@ Contact: dmaengine@vger.kernel.org Description: The max transfer sized for this workqueue. Cannot exceed device max transfer size. Configurable parameter. +What: /sys/bus/dsa/devices/wq./max_batch_size +Date: Aug 28, 2020 +KernelVersion: 5.10.0 +Contact: dmaengine@vger.kernel.org +Description: The max batch size for this workqueue. Cannot exceed device + max batch size. Configurable parameter. 
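Both writable limits follow the same validation pattern, visible in the store handlers above and below: the write is only accepted while the wq is disabled, a zero value is rejected, the requested value is rounded up to the next power of two, and anything above the matching device capability is refused. A rough standalone sketch of that policy in plain C (the helper name and error values here are illustrative, not taken from the driver):

static int wq_limit_validate(unsigned long long req, unsigned long long dev_max,
			     unsigned long long *out)
{
	unsigned long long v = 1;

	if (req == 0)
		return -1;
	while (v && v < req)		/* round up to the next power of two */
		v <<= 1;
	if (!v || v > dev_max)		/* overflow, or larger than the device limit */
		return -1;
	*out = v;
	return 0;
}

For example, writing 3000000 to max_transfer_size stores 4194304, provided the device's own max_transfer_size is at least that large.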
+ What: /sys/bus/dsa/devices/engine./group_id Date: Oct 25, 2019 KernelVersion: 5.6.0 diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 5a8509bb9ad4..f6dde3c6e002 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -556,7 +556,7 @@ static int idxd_wq_config_write(struct idxd_wq *wq) /* bytes 12-15 */ wq->wqcfg.max_xfer_shift = ilog2(wq->max_xfer_bytes); - wq->wqcfg.max_batch_shift = idxd->hw.gen_cap.max_batch_shift; + wq->wqcfg.max_batch_shift = ilog2(wq->max_batch_size); dev_dbg(dev, "WQ %d CFGs\n", wq->id); for (i = 0; i < 8; i++) { diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 81db2a472822..e8bec6eb9f7e 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -115,6 +115,7 @@ struct idxd_wq { struct dma_chan dma_chan; char name[WQ_NAME_SIZE + 1]; u64 max_xfer_bytes; + u32 max_batch_size; }; struct idxd_engine { diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index e5ed5750a6d0..11e5ce168177 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -177,6 +177,7 @@ static int idxd_setup_internals(struct idxd_device *idxd) mutex_init(&wq->wq_lock); wq->idxd_cdev.minor = -1; wq->max_xfer_bytes = idxd->max_xfer_bytes; + wq->max_batch_size = idxd->max_batch_size; } for (i = 0; i < idxd->max_engines; i++) { diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index e84691f42db7..53c9186dd153 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1062,6 +1062,21 @@ static ssize_t wq_cdev_minor_show(struct device *dev, static struct device_attribute dev_attr_wq_cdev_minor = __ATTR(cdev_minor, 0444, wq_cdev_minor_show, NULL); +static int __get_sysfs_u64(const char *buf, u64 *val) +{ + int rc; + + rc = kstrtou64(buf, 0, val); + if (rc < 0) + return -EINVAL; + + if (*val == 0) + return -EINVAL; + + *val = roundup_pow_of_two(*val); + return 0; +} + static ssize_t wq_max_transfer_size_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1081,14 +1096,10 @@ static ssize_t wq_max_transfer_size_store(struct device *dev, struct device_attr if (wq->state != IDXD_WQ_DISABLED) return -EPERM; - rc = kstrtou64(buf, 0, &xfer_size); + rc = __get_sysfs_u64(buf, &xfer_size); if (rc < 0) - return -EINVAL; - - if (xfer_size == 0) - return -EINVAL; + return rc; - xfer_size = roundup_pow_of_two(xfer_size); if (xfer_size > idxd->max_xfer_bytes) return -EINVAL; @@ -1101,6 +1112,39 @@ static struct device_attribute dev_attr_wq_max_transfer_size = __ATTR(max_transfer_size, 0644, wq_max_transfer_size_show, wq_max_transfer_size_store); +static ssize_t wq_max_batch_size_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + return sprintf(buf, "%u\n", wq->max_batch_size); +} + +static ssize_t wq_max_batch_size_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_device *idxd = wq->idxd; + u64 batch_size; + int rc; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + rc = __get_sysfs_u64(buf, &batch_size); + if (rc < 0) + return rc; + + if (batch_size > idxd->max_batch_size) + return -EINVAL; + + wq->max_batch_size = (u32)batch_size; + + return count; +} + +static struct device_attribute dev_attr_wq_max_batch_size = + __ATTR(max_batch_size, 0644, wq_max_batch_size_show, wq_max_batch_size_store); + static struct attribute *idxd_wq_attributes[] = { 
&dev_attr_wq_clients.attr, &dev_attr_wq_state.attr, @@ -1112,6 +1156,7 @@ static struct attribute *idxd_wq_attributes[] = { &dev_attr_wq_name.attr, &dev_attr_wq_cdev_minor.attr, &dev_attr_wq_max_transfer_size.attr, + &dev_attr_wq_max_batch_size.attr, NULL, }; -- Gitee From 60327f95fbb8646bf965be8026f201c4eb534845 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 28 Aug 2020 15:13:55 -0700 Subject: [PATCH 0567/2161] dmaengine: idxd: add command status to idxd sysfs attribute commit ff18de55a62f0e8f0dcf11bfa7f69b23e6e951b0 upstream. Export admin command status to sysfs attribute in order to allow user to retrieve configuration error. Allows user tooling to retrieve the command error and provide more user friendly error messages. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/159865278770.29455.8026892329182750127.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/ABI/stable/sysfs-driver-dma-idxd | 6 ++++++ drivers/dma/idxd/device.c | 6 +++++- drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/sysfs.c | 10 ++++++++++ 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd index a2bc4afbe308..b44183880935 100644 --- a/Documentation/ABI/stable/sysfs-driver-dma-idxd +++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd @@ -116,6 +116,12 @@ Description: The maximum number of bandwidth tokens that may be in use at one time by operations that access low bandwidth memory in the device. +What: /sys/bus/dsa/devices/dsa/cmd_status +Date: Aug 28, 2020 +KernelVersion: 5.10.0 +Contact: dmaengine@vger.kernel.org +Description: The last executed device administrative command's status/error. 
+ What: /sys/bus/dsa/devices/wq./group_id Date: Oct 25, 2019 KernelVersion: 5.6.0 diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index f6dde3c6e002..200b9109cacf 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -368,6 +368,7 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, dev_dbg(&idxd->pdev->dev, "%s: sending cmd: %#x op: %#x\n", __func__, cmd_code, operand); + idxd->cmd_status = 0; __set_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags); idxd->cmd_done = &done; iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET); @@ -379,8 +380,11 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, spin_unlock_irqrestore(&idxd->dev_lock, flags); wait_for_completion(&done); spin_lock_irqsave(&idxd->dev_lock, flags); - if (status) + if (status) { *status = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); + idxd->cmd_status = *status & GENMASK(7, 0); + } + __clear_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags); /* Wake up other pending commands */ wake_up(&idxd->cmd_waitq); diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index e8bec6eb9f7e..c64df197e724 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -156,6 +156,7 @@ struct idxd_device { unsigned long flags; int id; int major; + u8 cmd_status; struct pci_dev *pdev; void __iomem *reg_base; diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 53c9186dd153..fc31040716e4 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1400,6 +1400,15 @@ static ssize_t cdev_major_show(struct device *dev, } static DEVICE_ATTR_RO(cdev_major); +static ssize_t cmd_status_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); + + return sprintf(buf, "%#x\n", idxd->cmd_status); +} +static DEVICE_ATTR_RO(cmd_status); + static struct attribute *idxd_device_attributes[] = { &dev_attr_version.attr, &dev_attr_max_groups.attr, @@ -1418,6 +1427,7 @@ static struct attribute *idxd_device_attributes[] = { &dev_attr_max_tokens.attr, &dev_attr_token_limit.attr, &dev_attr_cdev_major.attr, + &dev_attr_cmd_status.attr, NULL, }; -- Gitee From 3f3ca226a178cd7e689ba20a6269802ee21a473c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 5 Oct 2020 08:11:22 -0700 Subject: [PATCH 0568/2161] x86/asm: Carve out a generic movdir64b() helper for general usage commit 0888e1030d3e3e5ce9dfd8e030cf13a2e9a1519a upstream. Carve out the MOVDIR64B inline asm primitive into a generic helper so that it can be used by other functions. Move it to special_insns.h and have iosubmit_cmds512() call it. [ bp: Massage commit message. 
] Suggested-by: Michael Matz Signed-off-by: Dave Jiang Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20201005151126.657029-2-dave.jiang@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/io.h | 17 +++-------------- arch/x86/include/asm/special_insns.h | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 29a337d173f0..67412283c0cd 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -406,7 +406,7 @@ extern bool phys_mem_access_encrypted(unsigned long phys_addr, /** * iosubmit_cmds512 - copy data to single MMIO location, in 512-bit units - * @__dst: destination, in MMIO space (must be 512-bit aligned) + * @dst: destination, in MMIO space (must be 512-bit aligned) * @src: source * @count: number of 512 bits quantities to submit * @@ -417,25 +417,14 @@ extern bool phys_mem_access_encrypted(unsigned long phys_addr, * Warning: Do not use this helper unless your driver has checked that the CPU * instruction is supported on the platform. */ -static inline void iosubmit_cmds512(void __iomem *__dst, const void *src, +static inline void iosubmit_cmds512(void __iomem *dst, const void *src, size_t count) { - /* - * Note that this isn't an "on-stack copy", just definition of "dst" - * as a pointer to 64-bytes of stuff that is going to be overwritten. - * In the MOVDIR64B case that may be needed as you can use the - * MOVDIR64B instruction to copy arbitrary memory around. This trick - * lets the compiler know how much gets clobbered. - */ - volatile struct { char _[64]; } *dst = __dst; const u8 *from = src; const u8 *end = from + count * 64; while (from < end) { - /* MOVDIR64B [rdx], rax */ - asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02" - : "=m" (dst) - : "d" (from), "a" (dst)); + movdir64b(dst, from); from += 64; } } diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 0ae84a5898a0..2fdf254847d9 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -214,6 +214,28 @@ static inline void clwb(volatile void *__p) #define nop() asm volatile ("nop") +/* The dst parameter must be 64-bytes aligned */ +static inline void movdir64b(void *dst, const void *src) +{ + const struct { char _[64]; } *__src = src; + struct { char _[64]; } *__dst = dst; + + /* + * MOVDIR64B %(rdx), rax. + * + * Both __src and __dst must be memory constraints in order to tell the + * compiler that no other memory accesses should be reordered around + * this one. + * + * Also, both must be supplied as lvalues because this tells + * the compiler what the object is (its size) the instruction accesses. + * I.e., not the pointers but what they point to, thus the deref'ing '*'. + */ + asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02" + : "+m" (*__dst) + : "m" (*__src), "a" (__dst), "d" (__src)); +} + #endif /* __KERNEL__ */ -- Gitee From 2a9681cb05368bb968a83eb82901abbaaf90ca59 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 5 Oct 2020 08:11:23 -0700 Subject: [PATCH 0569/2161] x86/asm: Add an enqcmds() wrapper for the ENQCMDS instruction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7f5933f81bd85a0bf6a87d65c7327ea048a75e54 upstream. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the MOVDIR64B instruction is used to atomically submit 64-byte work descriptors to devices. Although it can encounter errors like device queue full, command not accepted, device not ready, etc when writing to a device MMIO, MOVDIR64B can not report back on errors from the device itself. This means that MOVDIR64B users need to separately interact with a device to see if a descriptor was successfully queued, which slows down device interactions. ENQCMD and ENQCMDS also atomically submit 64-byte work descriptors to devices. But, they *can* report back errors directly from the device, such as if the device was busy, or device not enabled or does not support the command. This immediate feedback from the submission instruction itself reduces the number of interactions with the device and can greatly increase efficiency. ENQCMD can be used at any privilege level, but can effectively only submit work on behalf of the current process. ENQCMDS is a ring0-only instruction and can explicitly specify a process context instead of being tied to the current process or needing to reprogram the IA32_PASID MSR. Use ENQCMDS for work submission within the kernel because a Process Address ID (PASID) is setup to translate the kernel virtual address space. This PASID is provided to ENQCMDS from the descriptor structure submitted to the device and not retrieved from IA32_PASID MSR, which is setup for the current user address space. See Intel Software Developer’s Manual for more information on the instructions. [ bp: - Make operand constraints like movdir64b() because both insns are basically doing the same thing, more or less. - Fixup comments and cleanup. ] Link: https://lkml.kernel.org/r/20200924180041.34056-3-dave.jiang@intel.com Signed-off-by: Dave Jiang Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20201005151126.657029-3-dave.jiang@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/special_insns.h | 42 ++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 2fdf254847d9..9f0f2d71f670 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -236,6 +236,48 @@ static inline void movdir64b(void *dst, const void *src) : "m" (*__src), "a" (__dst), "d" (__src)); } +/** + * enqcmds - Enqueue a command in supervisor (CPL0) mode + * @dst: destination, in MMIO space (must be 512-bit aligned) + * @src: 512 bits memory operand + * + * The ENQCMDS instruction allows software to write a 512-bit command to + * a 512-bit-aligned special MMIO region that supports the instruction. + * A return status is loaded into the ZF flag in the RFLAGS register. + * ZF = 0 equates to success, and ZF = 1 indicates retry or error. + * + * This function issues the ENQCMDS instruction to submit data from + * kernel space to MMIO space, in a unit of 512 bits. Order of data access + * is not guaranteed, nor is a memory barrier performed afterwards. It + * returns 0 on success and -EAGAIN on failure. + * + * Warning: Do not use this helper unless your driver has checked that the + * ENQCMDS instruction is supported on the platform and the device accepts + * ENQCMDS. 
+ */ +static inline int enqcmds(void __iomem *dst, const void *src) +{ + const struct { char _[64]; } *__src = src; + struct { char _[64]; } *__dst = dst; + int zf; + + /* + * ENQCMDS %(rdx), rax + * + * See movdir64b()'s comment on operand specification. + */ + asm volatile(".byte 0xf3, 0x0f, 0x38, 0xf8, 0x02, 0x66, 0x90" + CC_SET(z) + : CC_OUT(z) (zf), "+m" (*__dst) + : "m" (*__src), "a" (__dst), "d" (__src)); + + /* Submission failure is indicated via EFLAGS.ZF=1 */ + if (zf) + return -EAGAIN; + + return 0; +} + #endif /* __KERNEL__ */ -- Gitee From c764ed86d10f37d1ae6ba8c14a43e2329210893e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 27 Oct 2020 14:34:09 -0700 Subject: [PATCH 0570/2161] dmaengine: idxd: fix wq config registers offset programming commit 484f910e93b48c1d8890d8330a87e34ae61f4782 upstream. DSA spec v1.1 [1] updated to include a stride size register for WQ configuration that will specify how much space is reserved for the WQ configuration register set. This change is expected to be in the final gen1 DSA hardware. Fix the driver to use WQCFG_OFFSET() for all WQ offset calculation and fixup WQCFG_OFFSET() to use the new calculated wq size. [1]: https://software.intel.com/content/www/us/en/develop/download/intel-data-streaming-accelerator-preliminary-architecture-specification.html Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/160383444959.48058.14249265538404901781.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 29 ++++++++++++++--------------- drivers/dma/idxd/idxd.h | 3 ++- drivers/dma/idxd/init.c | 5 +++++ drivers/dma/idxd/registers.h | 23 ++++++++++++++++++++++- 4 files changed, 43 insertions(+), 17 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 200b9109cacf..506ac85c4a7f 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -295,7 +295,7 @@ void idxd_wq_disable_cleanup(struct idxd_wq *wq) int i, wq_offset; lockdep_assert_held(&idxd->dev_lock); - memset(&wq->wqcfg, 0, sizeof(wq->wqcfg)); + memset(wq->wqcfg, 0, idxd->wqcfg_size); wq->type = IDXD_WQT_NONE; wq->size = 0; wq->group = NULL; @@ -304,8 +304,8 @@ void idxd_wq_disable_cleanup(struct idxd_wq *wq) clear_bit(WQ_FLAG_DEDICATED, &wq->flags); memset(wq->name, 0, WQ_NAME_SIZE); - for (i = 0; i < 8; i++) { - wq_offset = idxd->wqcfg_offset + wq->id * 32 + i * sizeof(u32); + for (i = 0; i < WQCFG_STRIDES(idxd); i++) { + wq_offset = WQCFG_OFFSET(idxd, wq->id, i); iowrite32(0, idxd->reg_base + wq_offset); dev_dbg(dev, "WQ[%d][%d][%#x]: %#x\n", wq->id, i, wq_offset, @@ -539,10 +539,10 @@ static int idxd_wq_config_write(struct idxd_wq *wq) if (!wq->group) return 0; - memset(&wq->wqcfg, 0, sizeof(union wqcfg)); + memset(wq->wqcfg, 0, idxd->wqcfg_size); /* byte 0-3 */ - wq->wqcfg.wq_size = wq->size; + wq->wqcfg->wq_size = wq->size; if (wq->size == 0) { dev_warn(dev, "Incorrect work queue size: 0\n"); @@ -550,22 +550,21 @@ static int idxd_wq_config_write(struct idxd_wq *wq) } /* bytes 4-7 */ - wq->wqcfg.wq_thresh = wq->threshold; + wq->wqcfg->wq_thresh = wq->threshold; /* byte 8-11 */ - wq->wqcfg.priv = !!(wq->type == IDXD_WQT_KERNEL); - wq->wqcfg.mode = 1; - - wq->wqcfg.priority = wq->priority; + wq->wqcfg->priv = !!(wq->type == IDXD_WQT_KERNEL); + wq->wqcfg->mode = 1; + wq->wqcfg->priority = wq->priority; /* bytes 12-15 */ - wq->wqcfg.max_xfer_shift = ilog2(wq->max_xfer_bytes); 
- wq->wqcfg.max_batch_shift = ilog2(wq->max_batch_size); + wq->wqcfg->max_xfer_shift = ilog2(wq->max_xfer_bytes); + wq->wqcfg->max_batch_shift = ilog2(wq->max_batch_size); dev_dbg(dev, "WQ %d CFGs\n", wq->id); - for (i = 0; i < 8; i++) { - wq_offset = idxd->wqcfg_offset + wq->id * 32 + i * sizeof(u32); - iowrite32(wq->wqcfg.bits[i], idxd->reg_base + wq_offset); + for (i = 0; i < WQCFG_STRIDES(idxd); i++) { + wq_offset = WQCFG_OFFSET(idxd, wq->id, i); + iowrite32(wq->wqcfg->bits[i], idxd->reg_base + wq_offset); dev_dbg(dev, "WQ[%d][%d][%#x]: %#x\n", wq->id, i, wq_offset, ioread32(idxd->reg_base + wq_offset)); diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index c64df197e724..d48f193daacc 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -103,7 +103,7 @@ struct idxd_wq { u32 priority; enum idxd_wq_state state; unsigned long flags; - union wqcfg wqcfg; + union wqcfg *wqcfg; u32 vec_ptr; /* interrupt steering */ struct dsa_hw_desc **hw_descs; int num_descs; @@ -183,6 +183,7 @@ struct idxd_device { int max_wq_size; int token_limit; int nr_tokens; /* non-reserved tokens */ + unsigned int wqcfg_size; union sw_err_reg sw_err; wait_queue_head_t cmd_waitq; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 11e5ce168177..0a4432b063b5 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -178,6 +178,9 @@ static int idxd_setup_internals(struct idxd_device *idxd) wq->idxd_cdev.minor = -1; wq->max_xfer_bytes = idxd->max_xfer_bytes; wq->max_batch_size = idxd->max_batch_size; + wq->wqcfg = devm_kzalloc(dev, idxd->wqcfg_size, GFP_KERNEL); + if (!wq->wqcfg) + return -ENOMEM; } for (i = 0; i < idxd->max_engines; i++) { @@ -251,6 +254,8 @@ static void idxd_read_caps(struct idxd_device *idxd) dev_dbg(dev, "total workqueue size: %u\n", idxd->max_wq_size); idxd->max_wqs = idxd->hw.wq_cap.num_wqs; dev_dbg(dev, "max workqueues: %u\n", idxd->max_wqs); + idxd->wqcfg_size = 1 << (idxd->hw.wq_cap.wqcfg_size + IDXD_WQCFG_MIN); + dev_dbg(dev, "wqcfg size: %u\n", idxd->wqcfg_size); /* reading operation capabilities */ for (i = 0; i < 4; i++) { diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index a39e7ae6b3d9..aef5a902829e 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -43,7 +43,8 @@ union wq_cap_reg { struct { u64 total_wq_size:16; u64 num_wqs:8; - u64 rsvd:24; + u64 wqcfg_size:4; + u64 rsvd:20; u64 shared_mode:1; u64 dedicated_mode:1; u64 rsvd2:1; @@ -55,6 +56,7 @@ union wq_cap_reg { u64 bits; } __packed; #define IDXD_WQCAP_OFFSET 0x20 +#define IDXD_WQCFG_MIN 5 union group_cap_reg { struct { @@ -333,4 +335,23 @@ union wqcfg { }; u32 bits[8]; } __packed; + +/* + * This macro calculates the offset into the WQCFG register + * idxd - struct idxd * + * n - wq id + * ofs - the index of the 32b dword for the config register + * + * The WQCFG register block is divided into groups per each wq. The n index + * allows us to move to the register group that's for that particular wq. + * Each register is 32bits. The ofs gives us the number of register to access. 
+ */ +#define WQCFG_OFFSET(_idxd_dev, n, ofs) \ +({\ + typeof(_idxd_dev) __idxd_dev = (_idxd_dev); \ + (__idxd_dev)->wqcfg_offset + (n) * (__idxd_dev)->wqcfg_size + sizeof(u32) * (ofs); \ +}) + +#define WQCFG_STRIDES(_idxd_dev) ((_idxd_dev)->wqcfg_size / sizeof(u32)) + #endif -- Gitee From 03c80113e4f58d36bc8482c22db07a584caf574a Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 27 Oct 2020 10:34:35 -0700 Subject: [PATCH 0571/2161] dmaengine: idxd: Add shared workqueue support commit 8e50d392652f20616a136165dff516b86baf5e49 upstream. Add shared workqueue support that includes the support of Shared Virtual memory (SVM) or in similar terms On Demand Paging (ODP). The shared workqueue uses the enqcmds command in kernel and will respond with retry if the workqueue is full. Shared workqueue only works when there is PASID support from the IOMMU. Signed-off-by: Dave Jiang Reviewed-by: Tony Luck Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/160382007499.3911367.26043087963708134.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/Kconfig | 10 +++ drivers/dma/idxd/cdev.c | 49 +++++++++++++- drivers/dma/idxd/device.c | 90 +++++++++++++++++++++++-- drivers/dma/idxd/dma.c | 9 --- drivers/dma/idxd/idxd.h | 28 +++++++- drivers/dma/idxd/init.c | 91 +++++++++++++++++++------ drivers/dma/idxd/registers.h | 2 + drivers/dma/idxd/submit.c | 35 ++++++++-- drivers/dma/idxd/sysfs.c | 127 +++++++++++++++++++++++++++++++++++ 9 files changed, 398 insertions(+), 43 deletions(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 316f4dbc5909..9b1ce7b2d810 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -287,6 +287,16 @@ config INTEL_IDXD If unsure, say N. +# Config symbol that collects all the dependencies that's necessary to +# support shared virtual memory for the devices supported by idxd. 
+config INTEL_IDXD_SVM + bool "Accelerator Shared Virtual Memory Support" + depends on INTEL_IDXD + depends on INTEL_IOMMU_SVM + depends on PCI_PRI + depends on PCI_PASID + depends on PCI_IOV + config INTEL_IOATDMA tristate "Intel I/OAT DMA support" depends on PCI && X86_64 diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index c3976156db2f..010b820d8f74 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "registers.h" #include "idxd.h" @@ -32,7 +33,9 @@ static struct idxd_cdev_context ictx[IDXD_TYPE_MAX] = { struct idxd_user_context { struct idxd_wq *wq; struct task_struct *task; + unsigned int pasid; unsigned int flags; + struct iommu_sva *sva; }; enum idxd_cdev_cleanup { @@ -75,6 +78,8 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) struct idxd_wq *wq; struct device *dev; int rc = 0; + struct iommu_sva *sva; + unsigned int pasid; wq = inode_wq(inode); idxd = wq->idxd; @@ -95,6 +100,34 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) ctx->wq = wq; filp->private_data = ctx; + + if (device_pasid_enabled(idxd)) { + sva = iommu_sva_bind_device(dev, current->mm, NULL); + if (IS_ERR(sva)) { + rc = PTR_ERR(sva); + dev_err(dev, "pasid allocation failed: %d\n", rc); + goto failed; + } + + pasid = iommu_sva_get_pasid(sva); + if (pasid == IOMMU_PASID_INVALID) { + iommu_sva_unbind_device(sva); + goto failed; + } + + ctx->sva = sva; + ctx->pasid = pasid; + + if (wq_dedicated(wq)) { + rc = idxd_wq_set_pasid(wq, pasid); + if (rc < 0) { + iommu_sva_unbind_device(sva); + dev_err(dev, "wq set pasid failed: %d\n", rc); + goto failed; + } + } + } + idxd_wq_get(wq); mutex_unlock(&wq->wq_lock); return 0; @@ -111,13 +144,27 @@ static int idxd_cdev_release(struct inode *node, struct file *filep) struct idxd_wq *wq = ctx->wq; struct idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; + int rc; dev_dbg(dev, "%s called\n", __func__); filep->private_data = NULL; /* Wait for in-flight operations to complete. 
*/ - idxd_wq_drain(wq); + if (wq_shared(wq)) { + idxd_device_drain_pasid(idxd, ctx->pasid); + } else { + if (device_pasid_enabled(idxd)) { + /* The wq disable in the disable pasid function will drain the wq */ + rc = idxd_wq_disable_pasid(wq); + if (rc < 0) + dev_err(dev, "wq disable pasid failed.\n"); + } else { + idxd_wq_drain(wq); + } + } + if (ctx->sva) + iommu_sva_unbind_device(ctx->sva); kfree(ctx); mutex_lock(&wq->wq_lock); idxd_wq_put(wq); diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 506ac85c4a7f..2f09eb89a906 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -273,10 +273,9 @@ int idxd_wq_map_portal(struct idxd_wq *wq) start = pci_resource_start(pdev, IDXD_WQ_BAR); start = start + wq->id * IDXD_PORTAL_SIZE; - wq->dportal = devm_ioremap(dev, start, IDXD_PORTAL_SIZE); - if (!wq->dportal) + wq->portal = devm_ioremap(dev, start, IDXD_PORTAL_SIZE); + if (!wq->portal) return -ENOMEM; - dev_dbg(dev, "wq %d portal mapped at %p\n", wq->id, wq->dportal); return 0; } @@ -285,7 +284,61 @@ void idxd_wq_unmap_portal(struct idxd_wq *wq) { struct device *dev = &wq->idxd->pdev->dev; - devm_iounmap(dev, wq->dportal); + devm_iounmap(dev, wq->portal); +} + +int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid) +{ + struct idxd_device *idxd = wq->idxd; + int rc; + union wqcfg wqcfg; + unsigned int offset; + unsigned long flags; + + rc = idxd_wq_disable(wq); + if (rc < 0) + return rc; + + offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PASID_IDX); + spin_lock_irqsave(&idxd->dev_lock, flags); + wqcfg.bits[WQCFG_PASID_IDX] = ioread32(idxd->reg_base + offset); + wqcfg.pasid_en = 1; + wqcfg.pasid = pasid; + iowrite32(wqcfg.bits[WQCFG_PASID_IDX], idxd->reg_base + offset); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + + rc = idxd_wq_enable(wq); + if (rc < 0) + return rc; + + return 0; +} + +int idxd_wq_disable_pasid(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + int rc; + union wqcfg wqcfg; + unsigned int offset; + unsigned long flags; + + rc = idxd_wq_disable(wq); + if (rc < 0) + return rc; + + offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PASID_IDX); + spin_lock_irqsave(&idxd->dev_lock, flags); + wqcfg.bits[WQCFG_PASID_IDX] = ioread32(idxd->reg_base + offset); + wqcfg.pasid_en = 0; + wqcfg.pasid = 0; + iowrite32(wqcfg.bits[WQCFG_PASID_IDX], idxd->reg_base + offset); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + + rc = idxd_wq_enable(wq); + if (rc < 0) + return rc; + + return 0; } void idxd_wq_disable_cleanup(struct idxd_wq *wq) @@ -468,6 +521,17 @@ void idxd_device_reset(struct idxd_device *idxd) spin_unlock_irqrestore(&idxd->dev_lock, flags); } +void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid) +{ + struct device *dev = &idxd->pdev->dev; + u32 operand; + + operand = pasid; + dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_DRAIN_PASID, operand); + idxd_cmd_exec(idxd, IDXD_CMD_DRAIN_PASID, operand, NULL); + dev_dbg(dev, "pasid %d drained\n", pasid); +} + /* Device configuration bits */ static void idxd_group_config_write(struct idxd_group *group) { @@ -554,9 +618,21 @@ static int idxd_wq_config_write(struct idxd_wq *wq) /* byte 8-11 */ wq->wqcfg->priv = !!(wq->type == IDXD_WQT_KERNEL); - wq->wqcfg->mode = 1; + if (wq_dedicated(wq)) + wq->wqcfg->mode = 1; + + if (device_pasid_enabled(idxd)) { + wq->wqcfg->pasid_en = 1; + if (wq->type == IDXD_WQT_KERNEL && wq_dedicated(wq)) + wq->wqcfg->pasid = idxd->pasid; + } + wq->wqcfg->priority = wq->priority; + if (idxd->hw.gen_cap.block_on_fault && + test_bit(WQ_FLAG_BLOCK_ON_FAULT, 
&wq->flags)) + wq->wqcfg->bof = 1; + /* bytes 12-15 */ wq->wqcfg->max_xfer_shift = ilog2(wq->max_xfer_bytes); wq->wqcfg->max_batch_shift = ilog2(wq->max_batch_size); @@ -664,8 +740,8 @@ static int idxd_wqs_setup(struct idxd_device *idxd) if (!wq->size) continue; - if (!wq_dedicated(wq)) { - dev_warn(dev, "No shared workqueue support.\n"); + if (wq_shared(wq) && !device_swq_supported(idxd)) { + dev_warn(dev, "No shared wq support but configured.\n"); return -EINVAL; } diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index 0c892cbd72e0..8ed2773d8285 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -61,8 +61,6 @@ static inline void idxd_prep_desc_common(struct idxd_wq *wq, u64 addr_f1, u64 addr_f2, u64 len, u64 compl, u32 flags) { - struct idxd_device *idxd = wq->idxd; - hw->flags = flags; hw->opcode = opcode; hw->src_addr = addr_f1; @@ -70,13 +68,6 @@ static inline void idxd_prep_desc_common(struct idxd_wq *wq, hw->xfer_size = len; hw->priv = !!(wq->type == IDXD_WQT_KERNEL); hw->completion_addr = compl; - - /* - * Descriptor completion vectors are 1-8 for MSIX. We will round - * robin through the 8 vectors. - */ - wq->vec_ptr = (wq->vec_ptr % idxd->num_wq_irqs) + 1; - hw->int_handle = wq->vec_ptr; } static struct dma_async_tx_descriptor * diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index d48f193daacc..dcac3bb5a0d0 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -59,6 +59,7 @@ enum idxd_wq_state { enum idxd_wq_flag { WQ_FLAG_DEDICATED = 0, + WQ_FLAG_BLOCK_ON_FAULT, }; enum idxd_wq_type { @@ -86,10 +87,11 @@ enum idxd_op_type { enum idxd_complete_type { IDXD_COMPLETE_NORMAL = 0, IDXD_COMPLETE_ABORT, + IDXD_COMPLETE_DEV_FAIL, }; struct idxd_wq { - void __iomem *dportal; + void __iomem *portal; struct device conf_dev; struct idxd_cdev idxd_cdev; struct idxd_device *idxd; @@ -145,6 +147,7 @@ enum idxd_device_state { enum idxd_device_flag { IDXD_FLAG_CONFIGURABLE = 0, IDXD_FLAG_CMD_RUNNING, + IDXD_FLAG_PASID_ENABLED, }; struct idxd_device { @@ -167,6 +170,9 @@ struct idxd_device { struct idxd_wq *wqs; struct idxd_engine *engines; + struct iommu_sva *sva; + unsigned int pasid; + int num_groups; u32 msix_perm_offset; @@ -215,11 +221,28 @@ struct idxd_desc { extern struct bus_type dsa_bus_type; +extern bool support_enqcmd; + static inline bool wq_dedicated(struct idxd_wq *wq) { return test_bit(WQ_FLAG_DEDICATED, &wq->flags); } +static inline bool wq_shared(struct idxd_wq *wq) +{ + return !test_bit(WQ_FLAG_DEDICATED, &wq->flags); +} + +static inline bool device_pasid_enabled(struct idxd_device *idxd) +{ + return test_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags); +} + +static inline bool device_swq_supported(struct idxd_device *idxd) +{ + return (support_enqcmd && device_pasid_enabled(idxd)); +} + enum idxd_portal_prot { IDXD_PORTAL_UNLIMITED = 0, IDXD_PORTAL_LIMITED, @@ -288,6 +311,7 @@ void idxd_device_reset(struct idxd_device *idxd); void idxd_device_cleanup(struct idxd_device *idxd); int idxd_device_config(struct idxd_device *idxd); void idxd_device_wqs_clear_state(struct idxd_device *idxd); +void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid); /* work queue control */ int idxd_wq_alloc_resources(struct idxd_wq *wq); @@ -298,6 +322,8 @@ void idxd_wq_drain(struct idxd_wq *wq); int idxd_wq_map_portal(struct idxd_wq *wq); void idxd_wq_unmap_portal(struct idxd_wq *wq); void idxd_wq_disable_cleanup(struct idxd_wq *wq); +int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid); +int idxd_wq_disable_pasid(struct idxd_wq *wq); 
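The dedicated/shared distinction captured by the wq_dedicated()/wq_shared() helpers above is what later selects the submission instruction: a dedicated wq is written with MOVDIR64B (posted, no status comes back), while a shared wq must be written with ENQCMDS, which reports whether the device actually accepted the descriptor. A condensed sketch of that decision, simplified from the submission path added later in this patch (the real path also issues a wmb() before touching the portal):

static int idxd_submit_sketch(struct idxd_wq *wq, void __iomem *portal,
			      struct dsa_hw_desc *hw)
{
	if (wq_dedicated(wq)) {
		/* MOVDIR64B: fire and forget, completion is reported later */
		iosubmit_cmds512(portal, hw, 1);
		return 0;
	}

	/* ENQCMDS: 0 if the device accepted the descriptor, -EAGAIN if not */
	return enqcmds(portal, hw);
}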
/* submission */ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 0a4432b063b5..1639f3b2aa58 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include #include "../dmaengine.h" @@ -26,6 +28,8 @@ MODULE_AUTHOR("Intel Corporation"); #define DRV_NAME "idxd" +bool support_enqcmd; + static struct idr idxd_idrs[IDXD_TYPE_MAX]; static struct mutex idxd_idr_lock; @@ -53,6 +57,7 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) struct idxd_irq_entry *irq_entry; int i, msixcnt; int rc = 0; + union msix_perm mperm; msixcnt = pci_msix_vec_count(pdev); if (msixcnt < 0) { @@ -131,6 +136,13 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) idxd_unmask_error_interrupts(idxd); + /* Setup MSIX permission table */ + mperm.bits = 0; + mperm.pasid = idxd->pasid; + mperm.pasid_en = device_pasid_enabled(idxd); + for (i = 1; i < msixcnt; i++) + iowrite32(mperm.bits, idxd->reg_base + idxd->msix_perm_offset + i * 8); + return 0; err_no_irq: @@ -265,8 +277,7 @@ static void idxd_read_caps(struct idxd_device *idxd) } } -static struct idxd_device *idxd_alloc(struct pci_dev *pdev, - void __iomem * const *iomap) +static struct idxd_device *idxd_alloc(struct pci_dev *pdev) { struct device *dev = &pdev->dev; struct idxd_device *idxd; @@ -276,12 +287,45 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev, return NULL; idxd->pdev = pdev; - idxd->reg_base = iomap[IDXD_MMIO_BAR]; spin_lock_init(&idxd->dev_lock); return idxd; } +static int idxd_enable_system_pasid(struct idxd_device *idxd) +{ + int flags; + unsigned int pasid; + struct iommu_sva *sva; + + flags = SVM_FLAG_SUPERVISOR_MODE; + + sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, &flags); + if (IS_ERR(sva)) { + dev_warn(&idxd->pdev->dev, + "iommu sva bind failed: %ld\n", PTR_ERR(sva)); + return PTR_ERR(sva); + } + + pasid = iommu_sva_get_pasid(sva); + if (pasid == IOMMU_PASID_INVALID) { + iommu_sva_unbind_device(sva); + return -ENODEV; + } + + idxd->sva = sva; + idxd->pasid = pasid; + dev_dbg(&idxd->pdev->dev, "system pasid: %u\n", pasid); + return 0; +} + +static void idxd_disable_system_pasid(struct idxd_device *idxd) +{ + + iommu_sva_unbind_device(idxd->sva); + idxd->sva = NULL; +} + static int idxd_probe(struct idxd_device *idxd) { struct pci_dev *pdev = idxd->pdev; @@ -292,6 +336,14 @@ static int idxd_probe(struct idxd_device *idxd) idxd_device_init_reset(idxd); dev_dbg(dev, "IDXD reset complete\n"); + if (IS_ENABLED(CONFIG_INTEL_IDXD_SVM)) { + rc = idxd_enable_system_pasid(idxd); + if (rc < 0) + dev_warn(dev, "Failed to enable PASID. 
No SVA support: %d\n", rc); + else + set_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags); + } + idxd_read_caps(idxd); idxd_read_table_offsets(idxd); @@ -322,29 +374,29 @@ static int idxd_probe(struct idxd_device *idxd) idxd_mask_error_interrupts(idxd); idxd_mask_msix_vectors(idxd); err_setup: + if (device_pasid_enabled(idxd)) + idxd_disable_system_pasid(idxd); return rc; } static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - void __iomem * const *iomap; struct device *dev = &pdev->dev; struct idxd_device *idxd; int rc; - unsigned int mask; rc = pcim_enable_device(pdev); if (rc) return rc; - dev_dbg(dev, "Mapping BARs\n"); - mask = (1 << IDXD_MMIO_BAR); - rc = pcim_iomap_regions(pdev, mask, DRV_NAME); - if (rc) - return rc; + dev_dbg(dev, "Alloc IDXD context\n"); + idxd = idxd_alloc(pdev); + if (!idxd) + return -ENOMEM; - iomap = pcim_iomap_table(pdev); - if (!iomap) + dev_dbg(dev, "Mapping BARs\n"); + idxd->reg_base = pcim_iomap(pdev, IDXD_MMIO_BAR, 0); + if (!idxd->reg_base) return -ENOMEM; dev_dbg(dev, "Set DMA masks\n"); @@ -360,11 +412,6 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) return rc; - dev_dbg(dev, "Alloc IDXD context\n"); - idxd = idxd_alloc(pdev, iomap); - if (!idxd) - return -ENOMEM; - idxd_set_type(idxd); dev_dbg(dev, "Set PCI master\n"); @@ -452,6 +499,8 @@ static void idxd_remove(struct pci_dev *pdev) dev_dbg(&pdev->dev, "%s called\n", __func__); idxd_cleanup_sysfs(idxd); idxd_shutdown(pdev); + if (device_pasid_enabled(idxd)) + idxd_disable_system_pasid(idxd); mutex_lock(&idxd_idr_lock); idr_remove(&idxd_idrs[idxd->type], idxd->id); mutex_unlock(&idxd_idr_lock); @@ -470,7 +519,7 @@ static int __init idxd_init_module(void) int err, i; /* - * If the CPU does not support write512, there's no point in + * If the CPU does not support MOVDIR64B or ENQCMDS, there's no point in * enumerating the device. We can not utilize it. */ if (!boot_cpu_has(X86_FEATURE_MOVDIR64B)) { @@ -478,8 +527,10 @@ static int __init idxd_init_module(void) return -ENODEV; } - pr_info("%s: Intel(R) Accelerator Devices Driver %s\n", - DRV_NAME, IDXD_DRIVER_VERSION); + if (!boot_cpu_has(X86_FEATURE_ENQCMD)) + pr_warn("Platform does not have ENQCMD(S) support.\n"); + else + support_enqcmd = true; mutex_init(&idxd_idr_lock); for (i = 0; i < IDXD_TYPE_MAX; i++) diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index aef5a902829e..1041c2779f9b 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -336,6 +336,8 @@ union wqcfg { u32 bits[8]; } __packed; +#define WQCFG_PASID_IDX 2 + /* * This macro calculates the offset into the WQCFG register * idxd - struct idxd * diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 3e31aacc8595..0a6520a4fabb 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -12,11 +12,22 @@ static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu) { struct idxd_desc *desc; + struct idxd_device *idxd = wq->idxd; desc = wq->descs[idx]; memset(desc->hw, 0, sizeof(struct dsa_hw_desc)); memset(desc->completion, 0, sizeof(struct dsa_completion_record)); desc->cpu = cpu; + + if (device_pasid_enabled(idxd)) + desc->hw->pasid = idxd->pasid; + + /* + * Descriptor completion vectors are 1-8 for MSIX. We will round + * robin through the 8 vectors. 
+ */ + wq->vec_ptr = (wq->vec_ptr % idxd->num_wq_irqs) + 1; + desc->hw->int_handle = wq->vec_ptr; return desc; } @@ -71,18 +82,32 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) struct idxd_device *idxd = wq->idxd; int vec = desc->hw->int_handle; void __iomem *portal; + int rc; if (idxd->state != IDXD_DEV_ENABLED) return -EIO; - portal = wq->dportal + idxd_get_wq_portal_offset(IDXD_PORTAL_UNLIMITED); + portal = wq->portal + idxd_get_wq_portal_offset(IDXD_PORTAL_LIMITED); + /* - * The wmb() flushes writes to coherent DMA data before possibly - * triggering a DMA read. The wmb() is necessary even on UP because - * the recipient is a device. + * The wmb() flushes writes to coherent DMA data before + * possibly triggering a DMA read. The wmb() is necessary + * even on UP because the recipient is a device. */ wmb(); - iosubmit_cmds512(portal, desc->hw, 1); + if (wq_dedicated(wq)) { + iosubmit_cmds512(portal, desc->hw, 1); + } else { + /* + * It's not likely that we would receive queue full rejection + * since the descriptor allocation gates at wq size. If we + * receive a -EAGAIN, that means something went wrong such as the + * device is not accepting descriptor at all. + */ + rc = enqcmds(portal, desc->hw); + if (rc < 0) + return rc; + } /* * Pending the descriptor to the lockless list for the irq_entry diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index fc31040716e4..04154b94d3a1 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -175,6 +175,30 @@ static int idxd_config_bus_probe(struct device *dev) return -EINVAL; } + /* Shared WQ checks */ + if (wq_shared(wq)) { + if (!device_swq_supported(idxd)) { + dev_warn(dev, + "PASID not enabled and shared WQ.\n"); + mutex_unlock(&wq->wq_lock); + return -ENXIO; + } + /* + * Shared wq with the threshold set to 0 means the user + * did not set the threshold or transitioned from a + * dedicated wq but did not set threshold. A value + * of 0 would effectively disable the shared wq. The + * driver does not allow a value of 0 to be set for + * threshold via sysfs. 
+ */ + if (wq->threshold == 0) { + dev_warn(dev, + "Shared WQ and threshold 0.\n"); + mutex_unlock(&wq->wq_lock); + return -EINVAL; + } + } + rc = idxd_wq_alloc_resources(wq); if (rc < 0) { mutex_unlock(&wq->wq_lock); @@ -873,6 +897,8 @@ static ssize_t wq_mode_store(struct device *dev, if (sysfs_streq(buf, "dedicated")) { set_bit(WQ_FLAG_DEDICATED, &wq->flags); wq->threshold = 0; + } else if (sysfs_streq(buf, "shared") && device_swq_supported(idxd)) { + clear_bit(WQ_FLAG_DEDICATED, &wq->flags); } else { return -EINVAL; } @@ -971,6 +997,87 @@ static ssize_t wq_priority_store(struct device *dev, static struct device_attribute dev_attr_wq_priority = __ATTR(priority, 0644, wq_priority_show, wq_priority_store); +static ssize_t wq_block_on_fault_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + return sprintf(buf, "%u\n", + test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags)); +} + +static ssize_t wq_block_on_fault_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_device *idxd = wq->idxd; + bool bof; + int rc; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -ENXIO; + + rc = kstrtobool(buf, &bof); + if (rc < 0) + return rc; + + if (bof) + set_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags); + else + clear_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags); + + return count; +} + +static struct device_attribute dev_attr_wq_block_on_fault = + __ATTR(block_on_fault, 0644, wq_block_on_fault_show, + wq_block_on_fault_store); + +static ssize_t wq_threshold_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + return sprintf(buf, "%u\n", wq->threshold); +} + +static ssize_t wq_threshold_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_device *idxd = wq->idxd; + unsigned int val; + int rc; + + rc = kstrtouint(buf, 0, &val); + if (rc < 0) + return -EINVAL; + + if (val > wq->size || val <= 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -ENXIO; + + if (test_bit(WQ_FLAG_DEDICATED, &wq->flags)) + return -EINVAL; + + wq->threshold = val; + + return count; +} + +static struct device_attribute dev_attr_wq_threshold = + __ATTR(threshold, 0644, wq_threshold_show, wq_threshold_store); + static ssize_t wq_type_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1042,6 +1149,13 @@ static ssize_t wq_name_store(struct device *dev, if (strlen(buf) > WQ_NAME_SIZE || strlen(buf) == 0) return -EINVAL; + /* + * This is temporarily placed here until we have SVM support for + * dmaengine. 
+ */ + if (wq->type == IDXD_WQT_KERNEL && device_pasid_enabled(wq->idxd)) + return -EOPNOTSUPP; + memset(wq->name, 0, WQ_NAME_SIZE + 1); strncpy(wq->name, buf, WQ_NAME_SIZE); strreplace(wq->name, '\n', '\0'); @@ -1152,6 +1266,8 @@ static struct attribute *idxd_wq_attributes[] = { &dev_attr_wq_mode.attr, &dev_attr_wq_size.attr, &dev_attr_wq_priority.attr, + &dev_attr_wq_block_on_fault.attr, + &dev_attr_wq_threshold.attr, &dev_attr_wq_type.attr, &dev_attr_wq_name.attr, &dev_attr_wq_cdev_minor.attr, @@ -1303,6 +1419,16 @@ static ssize_t clients_show(struct device *dev, } static DEVICE_ATTR_RO(clients); +static ssize_t pasid_enabled_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + + return sprintf(buf, "%u\n", device_pasid_enabled(idxd)); +} +static DEVICE_ATTR_RO(pasid_enabled); + static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1422,6 +1548,7 @@ static struct attribute *idxd_device_attributes[] = { &dev_attr_gen_cap.attr, &dev_attr_configurable.attr, &dev_attr_clients.attr, + &dev_attr_pasid_enabled.attr, &dev_attr_state.attr, &dev_attr_errors.attr, &dev_attr_max_tokens.attr, -- Gitee From 67964abd133536de678c0b2db2546f2bf90110d1 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 27 Oct 2020 10:34:40 -0700 Subject: [PATCH 0572/2161] dmaengine: idxd: Clean up descriptors with fault error commit e4f4d8cdeb9a2fe746411c0b9a7538b5c0232c1e upstream. Add code to "complete" a descriptor when the descriptor or its completion address hit a fault error when SVA mode is being used. This error can be triggered due to bad programming by the user. A lock is introduced in order to protect the descriptor completion lists since the fault handler will run from the system work queue after being scheduled in the interrupt handler. Signed-off-by: Dave Jiang Reviewed-by: Tony Luck Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/160382008092.3911367.12766483427643278985.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/idxd.h | 5 ++ drivers/dma/idxd/init.c | 1 + drivers/dma/idxd/irq.c | 146 ++++++++++++++++++++++++++++++++++++---- 3 files changed, 140 insertions(+), 12 deletions(-) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index dcac3bb5a0d0..7e54209c433a 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -34,6 +34,11 @@ struct idxd_irq_entry { int id; struct llist_head pending_llist; struct list_head work_list; + /* + * Lock to protect access between irq thread process descriptor + * and irq thread processing error descriptor. 
+ */ + spinlock_t list_lock; }; struct idxd_group { diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 1639f3b2aa58..c24106efc16e 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -97,6 +97,7 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) for (i = 0; i < msixcnt; i++) { idxd->irq_entries[i].id = i; idxd->irq_entries[i].idxd = idxd; + spin_lock_init(&idxd->irq_entries[i].list_lock); } msix = &idxd->msix_entries[0]; diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 17a65a13fb64..593a2f6ed16c 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -11,6 +11,24 @@ #include "idxd.h" #include "registers.h" +enum irq_work_type { + IRQ_WORK_NORMAL = 0, + IRQ_WORK_PROCESS_FAULT, +}; + +struct idxd_fault { + struct work_struct work; + u64 addr; + struct idxd_device *idxd; +}; + +static int irq_process_work_list(struct idxd_irq_entry *irq_entry, + enum irq_work_type wtype, + int *processed, u64 data); +static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, + enum irq_work_type wtype, + int *processed, u64 data); + static void idxd_device_reinit(struct work_struct *work) { struct idxd_device *idxd = container_of(work, struct idxd_device, work); @@ -44,6 +62,46 @@ static void idxd_device_reinit(struct work_struct *work) idxd_device_wqs_clear_state(idxd); } +static void idxd_device_fault_work(struct work_struct *work) +{ + struct idxd_fault *fault = container_of(work, struct idxd_fault, work); + struct idxd_irq_entry *ie; + int i; + int processed; + int irqcnt = fault->idxd->num_wq_irqs + 1; + + for (i = 1; i < irqcnt; i++) { + ie = &fault->idxd->irq_entries[i]; + irq_process_work_list(ie, IRQ_WORK_PROCESS_FAULT, + &processed, fault->addr); + if (processed) + break; + + irq_process_pending_llist(ie, IRQ_WORK_PROCESS_FAULT, + &processed, fault->addr); + if (processed) + break; + } + + kfree(fault); +} + +static int idxd_device_schedule_fault_process(struct idxd_device *idxd, + u64 fault_addr) +{ + struct idxd_fault *fault; + + fault = kmalloc(sizeof(*fault), GFP_ATOMIC); + if (!fault) + return -ENOMEM; + + fault->addr = fault_addr; + fault->idxd = idxd; + INIT_WORK(&fault->work, idxd_device_fault_work); + queue_work(idxd->wq, &fault->work); + return 0; +} + irqreturn_t idxd_irq_handler(int vec, void *data) { struct idxd_irq_entry *irq_entry = data; @@ -125,6 +183,15 @@ irqreturn_t idxd_misc_thread(int vec, void *data) if (!err) goto out; + /* + * This case should rarely happen and typically is due to software + * programming error by the driver. + */ + if (idxd->sw_err.valid && + idxd->sw_err.desc_valid && + idxd->sw_err.fault_addr) + idxd_device_schedule_fault_process(idxd, idxd->sw_err.fault_addr); + gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET); if (gensts.state == IDXD_DEVICE_STATE_HALT) { idxd->state = IDXD_DEV_HALTED; @@ -152,57 +219,110 @@ irqreturn_t idxd_misc_thread(int vec, void *data) return IRQ_HANDLED; } +static bool process_fault(struct idxd_desc *desc, u64 fault_addr) +{ + /* + * Completion address can be bad as well. Check fault address match for descriptor + * and completion address. 
+ */ + if ((u64)desc->hw == fault_addr || + (u64)desc->completion == fault_addr) { + idxd_dma_complete_txd(desc, IDXD_COMPLETE_DEV_FAIL); + return true; + } + + return false; +} + +static bool complete_desc(struct idxd_desc *desc) +{ + if (desc->completion->status) { + idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL); + return true; + } + + return false; +} + static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, - int *processed) + enum irq_work_type wtype, + int *processed, u64 data) { struct idxd_desc *desc, *t; struct llist_node *head; int queued = 0; + bool completed = false; + unsigned long flags; *processed = 0; head = llist_del_all(&irq_entry->pending_llist); if (!head) - return 0; + goto out; llist_for_each_entry_safe(desc, t, head, llnode) { - if (desc->completion->status) { - idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL); + if (wtype == IRQ_WORK_NORMAL) + completed = complete_desc(desc); + else if (wtype == IRQ_WORK_PROCESS_FAULT) + completed = process_fault(desc, data); + + if (completed) { idxd_free_desc(desc->wq, desc); (*processed)++; + if (wtype == IRQ_WORK_PROCESS_FAULT) + break; } else { - list_add_tail(&desc->list, &irq_entry->work_list); + spin_lock_irqsave(&irq_entry->list_lock, flags); + list_add_tail(&desc->list, + &irq_entry->work_list); + spin_unlock_irqrestore(&irq_entry->list_lock, flags); queued++; } } + out: return queued; } static int irq_process_work_list(struct idxd_irq_entry *irq_entry, - int *processed) + enum irq_work_type wtype, + int *processed, u64 data) { struct list_head *node, *next; int queued = 0; + bool completed = false; + unsigned long flags; *processed = 0; + spin_lock_irqsave(&irq_entry->list_lock, flags); if (list_empty(&irq_entry->work_list)) - return 0; + goto out; list_for_each_safe(node, next, &irq_entry->work_list) { struct idxd_desc *desc = container_of(node, struct idxd_desc, list); - if (desc->completion->status) { + spin_unlock_irqrestore(&irq_entry->list_lock, flags); + if (wtype == IRQ_WORK_NORMAL) + completed = complete_desc(desc); + else if (wtype == IRQ_WORK_PROCESS_FAULT) + completed = process_fault(desc, data); + + if (completed) { + spin_lock_irqsave(&irq_entry->list_lock, flags); list_del(&desc->list); - /* process and callback */ - idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL); + spin_unlock_irqrestore(&irq_entry->list_lock, flags); idxd_free_desc(desc->wq, desc); (*processed)++; + if (wtype == IRQ_WORK_PROCESS_FAULT) + return queued; } else { queued++; } + spin_lock_irqsave(&irq_entry->list_lock, flags); } + out: + spin_unlock_irqrestore(&irq_entry->list_lock, flags); return queued; } @@ -230,12 +350,14 @@ static int idxd_desc_process(struct idxd_irq_entry *irq_entry) * 5. Repeat until no more descriptors. */ do { - rc = irq_process_work_list(irq_entry, &processed); + rc = irq_process_work_list(irq_entry, IRQ_WORK_NORMAL, + &processed, 0); total += processed; if (rc != 0) continue; - rc = irq_process_pending_llist(irq_entry, &processed); + rc = irq_process_pending_llist(irq_entry, IRQ_WORK_NORMAL, + &processed, 0); total += processed; } while (rc != 0); -- Gitee From 99400f6c5706804565eedd2665da5de111c4bc13 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 27 Oct 2020 10:34:46 -0700 Subject: [PATCH 0573/2161] dmaengine: idxd: Add ABI documentation for shared wq commit 4749f51ddd8aee5c7aa11e6593a54f96374a77ad upstream. Add the sysfs attribute bits in ABI/stable for shared wq support. 
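The two attribute bits documented by this patch are the device-level pasid_enabled flag and the per-wq block_on_fault flag added earlier in the series. Before configuring a wq as "shared", user tooling is expected to confirm that PASID is enabled on the device (the driver additionally requires ENQCMD support in the CPU), since a shared wq is otherwise rejected at probe time. A minimal user-space sketch of that check; the dsa0 instance name is an example, not taken from this patch:

#include <stdio.h>

/* Returns 1 if the device reports PASID support, 0 if not, -1 on error. */
static int dsa_pasid_enabled(const char *dev)	/* e.g. "dsa0" */
{
	char path[256];
	int val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/bus/dsa/devices/%s/pasid_enabled", dev);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%d", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

A result of 1 (together with ENQCMD support in the CPU) means a wq's mode attribute will accept "shared"; with 0, only dedicated wqs can be configured.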
Signed-off-by: Jing Lin Signed-off-by: Dave Jiang Reviewed-by: Tony Luck Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/160382008649.3911367.10851752182908509837.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/ABI/stable/sysfs-driver-dma-idxd | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd index b44183880935..5ea81ffd3c1a 100644 --- a/Documentation/ABI/stable/sysfs-driver-dma-idxd +++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd @@ -77,6 +77,13 @@ Contact: dmaengine@vger.kernel.org Description: The operation capability bit mask specify the operation types supported by the this device. +What: /sys/bus/dsa/devices/dsa/pasid_enabled +Date: Oct 27, 2020 +KernelVersion: 5.11.0 +Contact: dmaengine@vger.kernel.org +Description: To indicate if PASID (process address space identifier) is + enabled or not for this device. + What: /sys/bus/dsa/devices/dsa/state Date: Oct 25, 2019 KernelVersion: 5.6.0 @@ -122,6 +129,13 @@ KernelVersion: 5.10.0 Contact: dmaengine@vger.kernel.org Description: The last executed device administrative command's status/error. +What: /sys/bus/dsa/devices/wq./block_on_fault +Date: Oct 27, 2020 +KernelVersion: 5.11.0 +Contact: dmaengine@vger.kernel.org +Description: To indicate block on fault is allowed or not for the work queue + to support on demand paging. + What: /sys/bus/dsa/devices/wq./group_id Date: Oct 25, 2019 KernelVersion: 5.6.0 -- Gitee From c061ee928734494409bbc0f21a7933ecd66d0605 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 30 Oct 2020 08:49:06 -0700 Subject: [PATCH 0574/2161] dmaengine: idxd: Update calculation of group offset to be more readable commit 5a71270197f39ff3e526cf130bc5d6e84db8f2d7 upstream. Create helper macros to make group offset calculation more readable. 
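The layout these helpers encode: each group owns a 64-byte block starting at grpcfg_offset + group_id * 64, with four 64-bit GRPWQCFG words at offset 0, the GRPENGCFG word at offset 32 and the GRPFLGCFG register at offset 40. A standalone restatement of the same arithmetic (illustrative only; the real macros appear in the registers.h hunk below):

static unsigned int grpwqcfg_off(unsigned int grpcfg_offset, int id, int word)
{
	return grpcfg_offset + id * 64 + word * 8;	/* word is 0..3 */
}

static unsigned int grpengcfg_off(unsigned int grpcfg_offset, int id)
{
	return grpcfg_offset + id * 64 + 32;
}

static unsigned int grpflgcfg_off(unsigned int grpcfg_offset, int id)
{
	return grpcfg_offset + id * 64 + 40;
}

So with a (made-up) grpcfg_offset of 0x400, group 2's flags register sits at 0x400 + 2 * 64 + 40 = 0x4a8.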
Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/160407294683.839093.10740868559754142070.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 12 +++++------- drivers/dma/idxd/registers.h | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 2f09eb89a906..d6f551dcbcb6 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -543,24 +543,22 @@ static void idxd_group_config_write(struct idxd_group *group) dev_dbg(dev, "Writing group %d cfg registers\n", group->id); /* setup GRPWQCFG */ - for (i = 0; i < 4; i++) { - grpcfg_offset = idxd->grpcfg_offset + - group->id * 64 + i * sizeof(u64); - iowrite64(group->grpcfg.wqs[i], - idxd->reg_base + grpcfg_offset); + for (i = 0; i < GRPWQCFG_STRIDES; i++) { + grpcfg_offset = GRPWQCFG_OFFSET(idxd, group->id, i); + iowrite64(group->grpcfg.wqs[i], idxd->reg_base + grpcfg_offset); dev_dbg(dev, "GRPCFG wq[%d:%d: %#x]: %#llx\n", group->id, i, grpcfg_offset, ioread64(idxd->reg_base + grpcfg_offset)); } /* setup GRPENGCFG */ - grpcfg_offset = idxd->grpcfg_offset + group->id * 64 + 32; + grpcfg_offset = GRPENGCFG_OFFSET(idxd, group->id); iowrite64(group->grpcfg.engines, idxd->reg_base + grpcfg_offset); dev_dbg(dev, "GRPCFG engs[%d: %#x]: %#llx\n", group->id, grpcfg_offset, ioread64(idxd->reg_base + grpcfg_offset)); /* setup GRPFLAGS */ - grpcfg_offset = idxd->grpcfg_offset + group->id * 64 + 40; + grpcfg_offset = GRPFLGCFG_OFFSET(idxd, group->id); iowrite32(group->grpcfg.flags.bits, idxd->reg_base + grpcfg_offset); dev_dbg(dev, "GRPFLAGS flags[%d: %#x]: %#x\n", group->id, grpcfg_offset, diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 1041c2779f9b..6f2f736097e5 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -356,4 +356,22 @@ union wqcfg { #define WQCFG_STRIDES(_idxd_dev) ((_idxd_dev)->wqcfg_size / sizeof(u32)) +#define GRPCFG_SIZE 64 +#define GRPWQCFG_STRIDES 4 + +/* + * This macro calculates the offset into the GRPCFG register + * idxd - struct idxd * + * n - wq id + * ofs - the index of the 32b dword for the config register + * + * The WQCFG register block is divided into groups per each wq. The n index + * allows us to move to the register group that's for that particular wq. + * Each register is 32bits. The ofs gives us the number of register to access. + */ +#define GRPWQCFG_OFFSET(idxd_dev, n, ofs) ((idxd_dev)->grpcfg_offset +\ + (n) * GRPCFG_SIZE + sizeof(u64) * (ofs)) +#define GRPENGCFG_OFFSET(idxd_dev, n) ((idxd_dev)->grpcfg_offset + (n) * GRPCFG_SIZE + 32) +#define GRPFLGCFG_OFFSET(idxd_dev, n) ((idxd_dev)->grpcfg_offset + (n) * GRPCFG_SIZE + 40) + #endif -- Gitee From a85773de729d05a1f8175ce843350497ebfd8a64 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 30 Oct 2020 08:51:56 -0700 Subject: [PATCH 0575/2161] dmaengine: idxd: define table offset multiplier commit 2f8417a967d571bf8fb81cba95d7acf508ed334f upstream. Convert table offset multiplier magic number to a define. 
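For illustration, the unit conversion the new define names, shown with an invented raw field value:

  #include <stdio.h>

  #define IDXD_TABLE_MULT 0x100

  int main(void)
  {
          unsigned int raw = 0x4;   /* example offsets.grpcfg field value */

          /* table offset fields are in 0x100-byte units */
          printf("grpcfg byte offset: %#x\n", raw * IDXD_TABLE_MULT);
          return 0;
  }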
Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/160407311690.839435.6941865731867828234.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 17 +++++++---------- drivers/dma/idxd/registers.h | 2 ++ 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index c24106efc16e..45b0eac640c3 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -214,17 +214,14 @@ static void idxd_read_table_offsets(struct idxd_device *idxd) struct device *dev = &idxd->pdev->dev; offsets.bits[0] = ioread64(idxd->reg_base + IDXD_TABLE_OFFSET); - offsets.bits[1] = ioread64(idxd->reg_base + IDXD_TABLE_OFFSET - + sizeof(u64)); - idxd->grpcfg_offset = offsets.grpcfg * 0x100; + offsets.bits[1] = ioread64(idxd->reg_base + IDXD_TABLE_OFFSET + sizeof(u64)); + idxd->grpcfg_offset = offsets.grpcfg * IDXD_TABLE_MULT; dev_dbg(dev, "IDXD Group Config Offset: %#x\n", idxd->grpcfg_offset); - idxd->wqcfg_offset = offsets.wqcfg * 0x100; - dev_dbg(dev, "IDXD Work Queue Config Offset: %#x\n", - idxd->wqcfg_offset); - idxd->msix_perm_offset = offsets.msix_perm * 0x100; - dev_dbg(dev, "IDXD MSIX Permission Offset: %#x\n", - idxd->msix_perm_offset); - idxd->perfmon_offset = offsets.perfmon * 0x100; + idxd->wqcfg_offset = offsets.wqcfg * IDXD_TABLE_MULT; + dev_dbg(dev, "IDXD Work Queue Config Offset: %#x\n", idxd->wqcfg_offset); + idxd->msix_perm_offset = offsets.msix_perm * IDXD_TABLE_MULT; + dev_dbg(dev, "IDXD MSIX Permission Offset: %#x\n", idxd->msix_perm_offset); + idxd->perfmon_offset = offsets.perfmon * IDXD_TABLE_MULT; dev_dbg(dev, "IDXD Perfmon Offset: %#x\n", idxd->perfmon_offset); } diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 6f2f736097e5..d29a58ee2651 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -102,6 +102,8 @@ union offsets_reg { u64 bits[2]; } __packed; +#define IDXD_TABLE_MULT 0x100 + #define IDXD_GENCFG_OFFSET 0x80 union gencfg_reg { struct { -- Gitee From 4c90b32820ac66f4fc38cb0b17ca690fb35a750c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 11 Nov 2020 15:23:46 -0700 Subject: [PATCH 0576/2161] dmaengine: idxd: fix mapping of portal size commit 8326be9f1c0bb498baf134878a8deb8a952e0135 upstream. Portal size is 4k. Current code is mapping all 4 portals in a single chunk. Restrict the mapped portal size to a single portal to ensure that submission only goes to the intended portal address. 
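The helper idxd_get_wq_portal_full_offset() used below is not part of this hunk, so as a rough sketch only, the BAR2 layout it implies (four 4 KB portals per WQ, of which only one is now mapped) can be pictured as:

  #include <stdio.h>

  #define PORTAL_SIZE     0x1000          /* one 4 KB portal */
  #define PORTALS_PER_WQ  4

  /* byte offset of a given portal of a given WQ within BAR2 (illustrative) */
  static unsigned long wq_portal_offset(int wq_id, int portal)
  {
          return (unsigned long)wq_id * PORTALS_PER_WQ * PORTAL_SIZE +
                 (unsigned long)portal * PORTAL_SIZE;
  }

  int main(void)
  {
          /* WQ 2, second portal: map only PORTAL_SIZE bytes starting here */
          printf("offset into BAR2: %#lx\n", wq_portal_offset(2, 1));
          return 0;
  }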
Fixes: c52ca478233c ("dmaengine: idxd: add configuration component of driver") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/160513342642.510187.16450549281618747065.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 2 +- drivers/dma/idxd/registers.h | 2 +- drivers/dma/idxd/submit.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index d6f551dcbcb6..3362db78bd91 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -271,7 +271,7 @@ int idxd_wq_map_portal(struct idxd_wq *wq) resource_size_t start; start = pci_resource_start(pdev, IDXD_WQ_BAR); - start = start + wq->id * IDXD_PORTAL_SIZE; + start += idxd_get_wq_portal_full_offset(wq->id, IDXD_PORTAL_LIMITED); wq->portal = devm_ioremap(dev, start, IDXD_PORTAL_SIZE); if (!wq->portal) diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index d29a58ee2651..cb9ce3ee397e 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -8,7 +8,7 @@ #define IDXD_MMIO_BAR 0 #define IDXD_WQ_BAR 2 -#define IDXD_PORTAL_SIZE 0x4000 +#define IDXD_PORTAL_SIZE PAGE_SIZE /* MMIO Device BAR0 Registers */ #define IDXD_VER_OFFSET 0x00 diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 0a6520a4fabb..935f67467e24 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -87,7 +87,7 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) if (idxd->state != IDXD_DEV_ENABLED) return -EIO; - portal = wq->portal + idxd_get_wq_portal_offset(IDXD_PORTAL_LIMITED); + portal = wq->portal; /* * The wmb() flushes writes to coherent DMA data before -- Gitee From ed19a7ea55ee1b2b4e46294e83b5b9dd9bbb668d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 13 Nov 2020 13:16:31 +0300 Subject: [PATCH 0577/2161] dmaengine: fix error codes in channel_register() commit 7e4be1290a38b3dd4a77cdf4565c9ffe7e620013 upstream. The error codes were not set on some of these error paths. Also the error handling was more confusing than it needed to be so I cleaned it up and shuffled it around a bit. 
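In generic form (invented names, plain malloc/free rather than dmaengine objects), the unwind style the cleanup moves to looks like this: every failure path assigns an error code before jumping, and the labels undo earlier allocations in reverse order.

  #include <stdlib.h>

  struct two_bufs { void *a, *b; };

  static int two_bufs_init(struct two_bufs *t)
  {
          int rc;

          t->a = malloc(64);
          if (!t->a)
                  return -1;              /* nothing to unwind yet */

          t->b = malloc(64);
          if (!t->b) {
                  rc = -1;                /* set the code before jumping */
                  goto err_free_a;
          }
          return 0;

  err_free_a:
          free(t->a);
          t->a = NULL;
          return rc;
  }

  int main(void)
  {
          struct two_bufs t;

          if (two_bufs_init(&t))
                  return 1;
          free(t.b);
          free(t.a);
          return 0;
  }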
Fixes: d2fb0a043838 ("dmaengine: break out channel registration") Signed-off-by: Dan Carpenter Reviewed-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20201113101631.GE168908@mwanda Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 7974fa0400d8..962cbb5e5f7f 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -1039,16 +1039,15 @@ static int get_dma_id(struct dma_device *device) static int __dma_async_device_channel_register(struct dma_device *device, struct dma_chan *chan) { - int rc = 0; + int rc; chan->local = alloc_percpu(typeof(*chan->local)); if (!chan->local) - goto err_out; + return -ENOMEM; chan->dev = kzalloc(sizeof(*chan->dev), GFP_KERNEL); if (!chan->dev) { - free_percpu(chan->local); - chan->local = NULL; - goto err_out; + rc = -ENOMEM; + goto err_free_local; } /* @@ -1061,7 +1060,8 @@ static int __dma_async_device_channel_register(struct dma_device *device, if (chan->chan_id < 0) { pr_err("%s: unable to alloc ida for chan: %d\n", __func__, chan->chan_id); - goto err_out; + rc = chan->chan_id; + goto err_free_dev; } chan->dev->device.class = &dma_devclass; @@ -1082,9 +1082,10 @@ static int __dma_async_device_channel_register(struct dma_device *device, mutex_lock(&device->chan_mutex); ida_free(&device->chan_ida, chan->chan_id); mutex_unlock(&device->chan_mutex); - err_out: - free_percpu(chan->local); + err_free_dev: kfree(chan->dev); + err_free_local: + free_percpu(chan->local); return rc; } -- Gitee From 201fef47e4c946eb7af9a90c0e1b7dbf31a6fd2d Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 13 Nov 2020 15:55:05 -0700 Subject: [PATCH 0578/2161] dmaengine: idxd: add ATS disable knob for work queues commit 92de5fa2dc39c3fba0704f7ac914e7f02eb732f2 upstream. With the DSA spec 1.1 update, a knob to disable ATS for individually is introduced. Add enabling code to allow a system admin to make the configuration through sysfs. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/160530810593.1288392.2561048329116529566.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../ABI/stable/sysfs-driver-dma-idxd | 7 ++++ drivers/dma/idxd/device.c | 4 +++ drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/registers.h | 5 +-- drivers/dma/idxd/sysfs.c | 34 +++++++++++++++++++ 5 files changed, 49 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd index 5ea81ffd3c1a..55285c136cf0 100644 --- a/Documentation/ABI/stable/sysfs-driver-dma-idxd +++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd @@ -204,6 +204,13 @@ Contact: dmaengine@vger.kernel.org Description: The max batch size for this workqueue. Cannot exceed device max batch size. Configurable parameter. +What: /sys/bus/dsa/devices/wq./ats_disable +Date: Nov 13, 2020 +KernelVersion: 5.11.0 +Contact: dmaengine@vger.kernel.org +Description: Indicate whether ATS disable is turned on for the workqueue. + 0 indicates ATS is on, and 1 indicates ATS is off for the workqueue. 
+ What: /sys/bus/dsa/devices/engine./group_id Date: Oct 25, 2019 KernelVersion: 5.6.0 diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 3362db78bd91..19ca8a6308f1 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -354,6 +354,7 @@ void idxd_wq_disable_cleanup(struct idxd_wq *wq) wq->group = NULL; wq->threshold = 0; wq->priority = 0; + wq->ats_dis = 0; clear_bit(WQ_FLAG_DEDICATED, &wq->flags); memset(wq->name, 0, WQ_NAME_SIZE); @@ -631,6 +632,9 @@ static int idxd_wq_config_write(struct idxd_wq *wq) test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags)) wq->wqcfg->bof = 1; + if (idxd->hw.wq_cap.wq_ats_support) + wq->wqcfg->wq_ats_disable = wq->ats_dis; + /* bytes 12-15 */ wq->wqcfg->max_xfer_shift = ilog2(wq->max_xfer_bytes); wq->wqcfg->max_batch_shift = ilog2(wq->max_batch_size); diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 7e54209c433a..149934f8d097 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -123,6 +123,7 @@ struct idxd_wq { char name[WQ_NAME_SIZE + 1]; u64 max_xfer_bytes; u32 max_batch_size; + bool ats_dis; }; struct idxd_engine { diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index cb9ce3ee397e..5f52584b4510 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -47,7 +47,7 @@ union wq_cap_reg { u64 rsvd:20; u64 shared_mode:1; u64 dedicated_mode:1; - u64 rsvd2:1; + u64 wq_ats_support:1; u64 priority:1; u64 occupancy:1; u64 occupancy_int:1; @@ -303,7 +303,8 @@ union wqcfg { /* bytes 8-11 */ u32 mode:1; /* shared or dedicated */ u32 bof:1; /* block on fault */ - u32 rsvd2:2; + u32 wq_ats_disable:1; + u32 rsvd2:1; u32 priority:4; u32 pasid:20; u32 pasid_en:1; diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 04154b94d3a1..014fa6c6d467 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1259,6 +1259,39 @@ static ssize_t wq_max_batch_size_store(struct device *dev, struct device_attribu static struct device_attribute dev_attr_wq_max_batch_size = __ATTR(max_batch_size, 0644, wq_max_batch_size_show, wq_max_batch_size_store); +static ssize_t wq_ats_disable_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + return sprintf(buf, "%u\n", wq->ats_dis); +} + +static ssize_t wq_ats_disable_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_device *idxd = wq->idxd; + bool ats_dis; + int rc; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (!idxd->hw.wq_cap.wq_ats_support) + return -EOPNOTSUPP; + + rc = kstrtobool(buf, &ats_dis); + if (rc < 0) + return rc; + + wq->ats_dis = ats_dis; + + return count; +} + +static struct device_attribute dev_attr_wq_ats_disable = + __ATTR(ats_disable, 0644, wq_ats_disable_show, wq_ats_disable_store); + static struct attribute *idxd_wq_attributes[] = { &dev_attr_wq_clients.attr, &dev_attr_wq_state.attr, @@ -1273,6 +1306,7 @@ static struct attribute *idxd_wq_attributes[] = { &dev_attr_wq_cdev_minor.attr, &dev_attr_wq_max_transfer_size.attr, &dev_attr_wq_max_batch_size.attr, + &dev_attr_wq_ats_disable.attr, NULL, }; -- Gitee From 195836f00409e27401bb2ae873ea9d6101cac9d4 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 17 Nov 2020 13:39:14 -0700 Subject: [PATCH 0579/2161] dmaengine: idxd: add IAX configuration support in the IDXD driver commit 
f25b463883a8a2d1b7303a63339c0d589fc94f1e upstream. Add support to allow configuration of Intel Analytics Accelerator (IAX) in addition to the Intel Data Streaming Accelerator (DSA). The IAX hardware has the same configuration interface as DSA. The main difference is the type of operations it performs. We can support the DSA and IAX devices on the same driver with some tweaks. IAX has a 64B completion record that needs to be 64B aligned, as opposed to a 32B completion record that is 32B aligned for DSA. IAX also does not support token management. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/160564555488.1834439.4261958859935360473.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/cdev.c | 1 + drivers/dma/idxd/device.c | 37 +++++++++++++---- drivers/dma/idxd/idxd.h | 24 +++++++++-- drivers/dma/idxd/init.c | 14 +++++++ drivers/dma/idxd/registers.h | 1 + drivers/dma/idxd/submit.c | 2 +- drivers/dma/idxd/sysfs.c | 46 +++++++++++++++++++-- include/uapi/linux/idxd.h | 79 ++++++++++++++++++++++++++++++++++++ 8 files changed, 187 insertions(+), 17 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 010b820d8f74..0db9b82ed8cf 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -28,6 +28,7 @@ struct idxd_cdev_context { */ static struct idxd_cdev_context ictx[IDXD_TYPE_MAX] = { { .name = "dsa" }, + { .name = "iax" } }; struct idxd_user_context { diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 19ca8a6308f1..95f94a3ed6be 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -131,6 +131,8 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) struct idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; int rc, num_descs, i; + int align; + u64 tmp; if (wq->type != IDXD_WQT_KERNEL) return 0; @@ -142,14 +144,27 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) if (rc < 0) return rc; - wq->compls_size = num_descs * sizeof(struct dsa_completion_record); - wq->compls = dma_alloc_coherent(dev, wq->compls_size, - &wq->compls_addr, GFP_KERNEL); - if (!wq->compls) { + if (idxd->type == IDXD_TYPE_DSA) + align = 32; + else if (idxd->type == IDXD_TYPE_IAX) + align = 64; + else + return -ENODEV; + + wq->compls_size = num_descs * idxd->compl_size + align; + wq->compls_raw = dma_alloc_coherent(dev, wq->compls_size, + &wq->compls_addr_raw, GFP_KERNEL); + if (!wq->compls_raw) { rc = -ENOMEM; goto fail_alloc_compls; } + /* Adjust alignment */ + wq->compls_addr = (wq->compls_addr_raw + (align - 1)) & ~(align - 1); + tmp = (u64)wq->compls_raw; + tmp = (tmp + (align - 1)) & ~(align - 1); + wq->compls = (struct dsa_completion_record *)tmp; + rc = alloc_descs(wq, num_descs); if (rc < 0) goto fail_alloc_descs; @@ -163,9 +178,11 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) struct idxd_desc *desc = wq->descs[i]; desc->hw = wq->hw_descs[i]; - desc->completion = &wq->compls[i]; - desc->compl_dma = wq->compls_addr + - sizeof(struct dsa_completion_record) * i; + if (idxd->type == IDXD_TYPE_DSA) + desc->completion = &wq->compls[i]; + else if (idxd->type == IDXD_TYPE_IAX) + desc->iax_completion = &wq->iax_compls[i]; + desc->compl_dma = wq->compls_addr + idxd->compl_size * i; desc->id = i; desc->wq = wq; desc->cpu = -1; @@ -178,7 +195,8 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) fail_sbitmap_init: free_descs(wq); fail_alloc_descs: - dma_free_coherent(dev, wq->compls_size, wq->compls, wq->compls_addr); + dma_free_coherent(dev, 
wq->compls_size, wq->compls_raw, + wq->compls_addr_raw); fail_alloc_compls: free_hw_descs(wq); return rc; @@ -193,7 +211,8 @@ void idxd_wq_free_resources(struct idxd_wq *wq) free_hw_descs(wq); free_descs(wq); - dma_free_coherent(dev, wq->compls_size, wq->compls, wq->compls_addr); + dma_free_coherent(dev, wq->compls_size, wq->compls_raw, + wq->compls_addr_raw); sbitmap_queue_free(&wq->sbq); } diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 149934f8d097..5a50e91c71bf 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -20,7 +20,8 @@ extern struct kmem_cache *idxd_desc_pool; enum idxd_type { IDXD_TYPE_UNKNOWN = -1, IDXD_TYPE_DSA = 0, - IDXD_TYPE_MAX + IDXD_TYPE_IAX, + IDXD_TYPE_MAX, }; #define IDXD_NAME_SIZE 128 @@ -114,8 +115,13 @@ struct idxd_wq { u32 vec_ptr; /* interrupt steering */ struct dsa_hw_desc **hw_descs; int num_descs; - struct dsa_completion_record *compls; + union { + struct dsa_completion_record *compls; + struct iax_completion_record *iax_compls; + }; + void *compls_raw; dma_addr_t compls_addr; + dma_addr_t compls_addr_raw; int compls_size; struct idxd_desc **descs; struct sbitmap_queue sbq; @@ -196,6 +202,7 @@ struct idxd_device { int token_limit; int nr_tokens; /* non-reserved tokens */ unsigned int wqcfg_size; + int compl_size; union sw_err_reg sw_err; wait_queue_head_t cmd_waitq; @@ -210,9 +217,15 @@ struct idxd_device { /* IDXD software descriptor */ struct idxd_desc { - struct dsa_hw_desc *hw; + union { + struct dsa_hw_desc *hw; + struct iax_hw_desc *iax_hw; + }; dma_addr_t desc_dma; - struct dsa_completion_record *completion; + union { + struct dsa_completion_record *completion; + struct iax_completion_record *iax_completion; + }; dma_addr_t compl_dma; struct dma_async_tx_descriptor txd; struct llist_node llnode; @@ -226,6 +239,7 @@ struct idxd_desc { #define confdev_to_wq(dev) container_of(dev, struct idxd_wq, conf_dev) extern struct bus_type dsa_bus_type; +extern struct bus_type iax_bus_type; extern bool support_enqcmd; @@ -271,6 +285,8 @@ static inline void idxd_set_type(struct idxd_device *idxd) if (pdev->device == PCI_DEVICE_ID_INTEL_DSA_SPR0) idxd->type = IDXD_TYPE_DSA; + else if (pdev->device == PCI_DEVICE_ID_INTEL_IAX_SPR0) + idxd->type = IDXD_TYPE_IAX; else idxd->type = IDXD_TYPE_UNKNOWN; } diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 45b0eac640c3..2c051e07c34c 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -36,12 +36,16 @@ static struct mutex idxd_idr_lock; static struct pci_device_id idxd_pci_tbl[] = { /* DSA ver 1.0 platforms */ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_DSA_SPR0) }, + + /* IAX ver 1.0 platforms */ + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IAX_SPR0) }, { 0, } }; MODULE_DEVICE_TABLE(pci, idxd_pci_tbl); static char *idxd_name[] = { "dsa", + "iax" }; const char *idxd_get_dev_name(struct idxd_device *idxd) @@ -377,6 +381,14 @@ static int idxd_probe(struct idxd_device *idxd) return rc; } +static void idxd_type_init(struct idxd_device *idxd) +{ + if (idxd->type == IDXD_TYPE_DSA) + idxd->compl_size = sizeof(struct dsa_completion_record); + else if (idxd->type == IDXD_TYPE_IAX) + idxd->compl_size = sizeof(struct iax_completion_record); +} + static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct device *dev = &pdev->dev; @@ -412,6 +424,8 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) idxd_set_type(idxd); + idxd_type_init(idxd); + dev_dbg(dev, "Set PCI master\n"); pci_set_master(pdev); 
pci_set_drvdata(pdev, idxd); diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 5f52584b4510..751ecb4f9f81 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -5,6 +5,7 @@ /* PCI Config */ #define PCI_DEVICE_ID_INTEL_DSA_SPR0 0x0b25 +#define PCI_DEVICE_ID_INTEL_IAX_SPR0 0x0cfe #define IDXD_MMIO_BAR 0 #define IDXD_WQ_BAR 2 diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 935f67467e24..bef5c5c2f1dd 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -16,7 +16,7 @@ static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu) desc = wq->descs[idx]; memset(desc->hw, 0, sizeof(struct dsa_hw_desc)); - memset(desc->completion, 0, sizeof(struct dsa_completion_record)); + memset(desc->completion, 0, idxd->compl_size); desc->cpu = cpu; if (device_pasid_enabled(idxd)) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 014fa6c6d467..0a9bcfc0a28b 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -41,14 +41,24 @@ static struct device_type dsa_device_type = { .release = idxd_conf_device_release, }; +static struct device_type iax_device_type = { + .name = "iax", + .release = idxd_conf_device_release, +}; + static inline bool is_dsa_dev(struct device *dev) { return dev ? dev->type == &dsa_device_type : false; } +static inline bool is_iax_dev(struct device *dev) +{ + return dev ? dev->type == &iax_device_type : false; +} + static inline bool is_idxd_dev(struct device *dev) { - return is_dsa_dev(dev); + return is_dsa_dev(dev) || is_iax_dev(dev); } static inline bool is_idxd_wq_dev(struct device *dev) @@ -357,8 +367,17 @@ struct bus_type dsa_bus_type = { .shutdown = idxd_config_bus_shutdown, }; +struct bus_type iax_bus_type = { + .name = "iax", + .match = idxd_config_bus_match, + .probe = idxd_config_bus_probe, + .remove = idxd_config_bus_remove, + .shutdown = idxd_config_bus_shutdown, +}; + static struct bus_type *idxd_bus_types[] = { - &dsa_bus_type + &dsa_bus_type, + &iax_bus_type }; static struct idxd_device_driver dsa_drv = { @@ -370,8 +389,18 @@ static struct idxd_device_driver dsa_drv = { }, }; +static struct idxd_device_driver iax_drv = { + .drv = { + .name = "iax", + .bus = &iax_bus_type, + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + }, +}; + static struct idxd_device_driver *idxd_drvs[] = { - &dsa_drv + &dsa_drv, + &iax_drv }; struct bus_type *idxd_get_bus_type(struct idxd_device *idxd) @@ -383,6 +412,8 @@ static struct device_type *idxd_get_device_type(struct idxd_device *idxd) { if (idxd->type == IDXD_TYPE_DSA) return &dsa_device_type; + else if (idxd->type == IDXD_TYPE_IAX) + return &iax_device_type; else return NULL; } @@ -523,6 +554,9 @@ static ssize_t group_tokens_reserved_store(struct device *dev, if (rc < 0) return -EINVAL; + if (idxd->type == IDXD_TYPE_IAX) + return -EOPNOTSUPP; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) return -EPERM; @@ -568,6 +602,9 @@ static ssize_t group_tokens_allowed_store(struct device *dev, if (rc < 0) return -EINVAL; + if (idxd->type == IDXD_TYPE_IAX) + return -EOPNOTSUPP; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) return -EPERM; @@ -610,6 +647,9 @@ static ssize_t group_use_token_limit_store(struct device *dev, if (rc < 0) return -EINVAL; + if (idxd->type == IDXD_TYPE_IAX) + return -EOPNOTSUPP; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) return -EPERM; diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index fdcdfe414223..236d437947bc 100644 --- 
a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -26,6 +26,9 @@ #define IDXD_OP_FLAG_DRDBK 0x4000 #define IDXD_OP_FLAG_DSTS 0x8000 +/* IAX */ +#define IDXD_OP_FLAG_RD_SRC2_AECS 0x010000 + /* Opcode */ enum dsa_opcode { DSA_OPCODE_NOOP = 0, @@ -47,6 +50,14 @@ enum dsa_opcode { DSA_OPCODE_CFLUSH = 0x20, }; +enum iax_opcode { + IAX_OPCODE_NOOP = 0, + IAX_OPCODE_DRAIN = 2, + IAX_OPCODE_MEMMOVE, + IAX_OPCODE_DECOMPRESS = 0x42, + IAX_OPCODE_COMPRESS, +}; + /* Completion record status */ enum dsa_completion_status { DSA_COMP_NONE = 0, @@ -80,6 +91,33 @@ enum dsa_completion_status { DSA_COMP_TRANSLATION_FAIL, }; +enum iax_completion_status { + IAX_COMP_NONE = 0, + IAX_COMP_SUCCESS, + IAX_COMP_PAGE_FAULT_IR = 0x04, + IAX_COMP_OUTBUF_OVERFLOW, + IAX_COMP_BAD_OPCODE = 0x10, + IAX_COMP_INVALID_FLAGS, + IAX_COMP_NOZERO_RESERVE, + IAX_COMP_INVALID_SIZE, + IAX_COMP_OVERLAP_BUFFERS = 0x16, + IAX_COMP_INT_HANDLE_INVAL = 0x19, + IAX_COMP_CRA_XLAT, + IAX_COMP_CRA_ALIGN, + IAX_COMP_ADDR_ALIGN, + IAX_COMP_PRIV_BAD, + IAX_COMP_TRAFFIC_CLASS_CONF, + IAX_COMP_PFAULT_RDBA, + IAX_COMP_HW_ERR1, + IAX_COMP_HW_ERR_DRB, + IAX_COMP_TRANSLATION_FAIL, + IAX_COMP_PRS_TIMEOUT, + IAX_COMP_WATCHDOG, + IAX_COMP_INVALID_COMP_FLAG = 0x30, + IAX_COMP_INVALID_FILTER_FLAG, + IAX_COMP_INVALID_NUM_ELEMS = 0x33, +}; + #define DSA_COMP_STATUS_MASK 0x7f #define DSA_COMP_STATUS_WRITE 0x80 @@ -163,6 +201,28 @@ struct dsa_hw_desc { }; } __attribute__((packed)); +struct iax_hw_desc { + uint32_t pasid:20; + uint32_t rsvd:11; + uint32_t priv:1; + uint32_t flags:24; + uint32_t opcode:8; + uint64_t completion_addr; + uint64_t src1_addr; + uint64_t dst_addr; + uint32_t src1_size; + uint16_t int_handle; + union { + uint16_t compr_flags; + uint16_t decompr_flags; + }; + uint64_t src2_addr; + uint32_t max_dst_size; + uint32_t src2_size; + uint32_t filter_flags; + uint32_t num_inputs; +} __attribute__((packed)); + struct dsa_raw_desc { uint64_t field[8]; } __attribute__((packed)); @@ -223,4 +283,23 @@ struct dsa_raw_completion_record { uint64_t field[4]; } __attribute__((packed)); +struct iax_completion_record { + volatile uint8_t status; + uint8_t error_code; + uint16_t rsvd; + uint32_t bytes_completed; + uint64_t fault_addr; + uint32_t invalid_flags; + uint32_t rsvd2; + uint32_t output_size; + uint8_t output_bits; + uint8_t rsvd3; + uint16_t rsvd4; + uint64_t rsvd5[4]; +} __attribute__((packed)); + +struct iax_raw_completion_record { + uint64_t field[8]; +} __attribute__((packed)); + #endif -- Gitee From b234868f511d759c9e3c69a35df2b1ccf9397f6e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 16 Dec 2020 11:29:46 +0300 Subject: [PATCH 0580/2161] dmaengine: idxd: off by one in cleanup code commit ff58f7dd0c1352a01de3a40327895bd51e03de3a upstream. The clean up is off by one so this will start at "i" and it should start with "i - 1" and then it doesn't unregister the zeroeth elements in the array. 
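A tiny standalone demonstration of the difference, using indices only: when registering element i fails, the unwind must cover i - 1 down to 0, not i down to 1.

  #include <stdio.h>

  int main(void)
  {
          int i = 3;                      /* element 3 just failed to register */
          int j;

          printf("buggy unwind touches:");
          for (j = i; j > 0; j--)         /* hits 3 (never registered), misses 0 */
                  printf(" %d", j);

          printf("\nfixed unwind touches:");
          while (--i >= 0)                /* hits 2, 1, 0 */
                  printf(" %d", i);
          printf("\n");
          return 0;
  }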
Fixes: c52ca478233c ("dmaengine: idxd: add configuration component of driver") Signed-off-by: Dan Carpenter Acked-by: Dave Jiang Link: https://lore.kernel.org/r/X9nFeojulsNqUSnG@mwanda Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 0a9bcfc0a28b..816c36c13f53 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -432,7 +432,7 @@ int idxd_register_driver(void) return 0; drv_fail: - for (; i > 0; i--) + while (--i >= 0) driver_unregister(&idxd_drvs[i]->drv); return rc; } @@ -1838,7 +1838,7 @@ int idxd_register_bus_type(void) return 0; bus_err: - for (; i > 0; i--) + while (--i >= 0) bus_unregister(idxd_bus_types[i]); return rc; } -- Gitee From ba25c69fa36d80211d5833708c4f3be1ab18425c Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Thu, 24 Dec 2020 21:22:54 +0800 Subject: [PATCH 0581/2161] dma: idxd: use DEFINE_MUTEX() for mutex lock commit e2fcd6e427c2fae92b366b24759f95d77b6f7bc7 upstream. mutex lock can be initialized automatically with DEFINE_MUTEX() rather than explicitly calling mutex_init(). Signed-off-by: Zheng Yongjun Acked-by: Dave Jiang Link: https://lore.kernel.org/r/20201224132254.30961-1-zhengyongjun3@huawei.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 2c051e07c34c..25cc947c6179 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -31,7 +31,7 @@ MODULE_AUTHOR("Intel Corporation"); bool support_enqcmd; static struct idr idxd_idrs[IDXD_TYPE_MAX]; -static struct mutex idxd_idr_lock; +static DEFINE_MUTEX(idxd_idr_lock); static struct pci_device_id idxd_pci_tbl[] = { /* DSA ver 1.0 platforms */ @@ -544,7 +544,6 @@ static int __init idxd_init_module(void) else support_enqcmd = true; - mutex_init(&idxd_idr_lock); for (i = 0; i < IDXD_TYPE_MAX; i++) idr_init(&idxd_idrs[i]); -- Gitee From 7e80f4aa80adcb7c3bc42da262af7b6f83222fbb Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 15 Jan 2021 14:52:52 -0700 Subject: [PATCH 0582/2161] dmaengine: idxd: Fix list corruption in description completion commit 16e19e11228ba660d9e322035635e7dcf160d5c2 upstream. Sanjay reported the following kernel splat after running dmatest for stress testing. The current code is giving up the spinlock in the middle of a completion list walk, and that opens up opportunity for list corruption if another thread touches the list at the same time. In order to make sure the list is always protected, the hardware completed descriptors will be put on a local list to be completed with callbacks from the outside of the list lock. 
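In rough, generic form the pattern is: detach completed entries onto a local list while holding the lock, then run the callbacks with the lock dropped. The sketch below uses illustrative types, not the idxd structures; the splat that motivated the change follows.

  #include <linux/list.h>
  #include <linux/spinlock.h>
  #include <linux/types.h>

  struct item {
          struct list_head node;
          bool done;
  };

  static void reap_done_items(struct list_head *work_list, spinlock_t *lock)
  {
          struct item *it, *n;
          unsigned long flags;
          LIST_HEAD(done);                /* local list, no locking needed */

          spin_lock_irqsave(lock, flags);
          list_for_each_entry_safe(it, n, work_list, node)
                  if (it->done)
                          list_move_tail(&it->node, &done);
          spin_unlock_irqrestore(lock, flags);

          /* callbacks run outside the lock; nobody else can see "done" */
          list_for_each_entry_safe(it, n, &done, node) {
                  list_del(&it->node);
                  /* complete and free the item here */
          }
  }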
kernel: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] SMP NOPTI kernel: CPU: 62 PID: 1814 Comm: irq/89-idxd-por Tainted: G W 5.10.0-intel-next_10_16+ #1 kernel: Hardware name: Intel Corporation ArcherCity/ArcherCity, BIOS EGSDCRB1.SBT.4915.D02.2012070418 12/07/2020 kernel: RIP: 0010:irq_process_work_list+0xcd/0x170 [idxd] kernel: Code: e8 18 65 5c d3 8b 45 d4 85 c0 75 b3 4c 89 f7 e8 b9 fe ff ff 84 c0 74 bf 4c 89 e7 e8 dd 6b 5c d3 49 8b 3f 49 8b 4f 08 48 89 c6 <48> 89 4f 08 48 89 39 4c 89 e7 48 b9 00 01 00 00 00 00 ad de 49 89 kernel: RSP: 0018:ff256768c4353df8 EFLAGS: 00010046 kernel: RAX: 0000000000000202 RBX: dead000000000100 RCX: dead000000000122 kernel: RDX: 0000000000000001 RSI: 0000000000000202 RDI: dead000000000100 kernel: RBP: ff256768c4353e40 R08: 0000000000000001 R09: 0000000000000000 kernel: R10: 0000000000000000 R11: 00000000ffffffff R12: ff1fdf9fd06b3e48 kernel: R13: 0000000000000005 R14: ff1fdf9fc4275980 R15: ff1fdf9fc4275a00 kernel: FS: 0000000000000000(0000) GS:ff1fdfa32f880000(0000) knlGS:0000000000000000 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel: CR2: 00007f87f24012a0 CR3: 000000010f630004 CR4: 0000000003771ee0 kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 kernel: DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 kernel: PKRU: 55555554 kernel: Call Trace: kernel: ? irq_thread+0xa9/0x1b0 kernel: idxd_wq_thread+0x34/0x90 [idxd] kernel: irq_thread_fn+0x24/0x60 kernel: irq_thread+0x10f/0x1b0 kernel: ? irq_forced_thread_fn+0x80/0x80 kernel: ? wake_threads_waitq+0x30/0x30 kernel: ? irq_thread_check_affinity+0xe0/0xe0 kernel: kthread+0x142/0x160 kernel: ? kthread_park+0x90/0x90 kernel: ret_from_fork+0x1f/0x30 kernel: Modules linked in: idxd_ktest dmatest intel_rapl_msr idxd_mdev iTCO_wdt vfio_pci vfio_virqfd iTCO_vendor_support intel_rapl_common i10nm_edac nfit x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel crypto_simd cryptd glue_helper rapl msr pcspkr snd_hda_codec_realtek snd_hda_codec_generic ledtrig_audio snd_hda_intel snd_intel_dspcfg ofpart snd_hda_codec snd_hda_core snd_hwdep cmdlinepart snd_seq snd_seq_device idxd snd_pcm intel_spi_pci intel_spi snd_timer spi_nor input_leds joydev snd i2c_i801 mtd soundcore i2c_smbus mei_me mei i2c_ismt ipmi_ssif acpi_ipmi ipmi_si ipmi_devintf ipmi_msghandler mac_hid sunrpc nls_iso8859_1 sch_fq_codel ip_tables x_tables xfs libcrc32c ast drm_vram_helper drm_ttm_helper ttm igc wmi pinctrl_sunrisepoint hid_generic usbmouse usbkbd usbhid hid kernel: ---[ end trace cd5ca950ef0db25f ]--- Reported-by: Sanjay Kumar Signed-off-by: Dave Jiang Tested-by: Sanjay Kumar Fixes: e4f4d8cdeb9a ("dmaengine: idxd: Clean up descriptors with fault error") Link: https://lore.kernel.org/r/161074757267.2183951.17912830858060607266.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/irq.c | 86 +++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 42 deletions(-) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 593a2f6ed16c..fe996cec7c74 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -219,29 +219,27 @@ irqreturn_t idxd_misc_thread(int vec, void *data) return IRQ_HANDLED; } -static bool process_fault(struct idxd_desc *desc, u64 fault_addr) +static inline bool match_fault(struct idxd_desc *desc, u64 fault_addr) { /* * Completion 
address can be bad as well. Check fault address match for descriptor * and completion address. */ - if ((u64)desc->hw == fault_addr || - (u64)desc->completion == fault_addr) { - idxd_dma_complete_txd(desc, IDXD_COMPLETE_DEV_FAIL); + if ((u64)desc->hw == fault_addr || (u64)desc->completion == fault_addr) { + struct idxd_device *idxd = desc->wq->idxd; + struct device *dev = &idxd->pdev->dev; + + dev_warn(dev, "desc with fault address: %#llx\n", fault_addr); return true; } return false; } -static bool complete_desc(struct idxd_desc *desc) +static inline void complete_desc(struct idxd_desc *desc, enum idxd_complete_type reason) { - if (desc->completion->status) { - idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL); - return true; - } - - return false; + idxd_dma_complete_txd(desc, reason); + idxd_free_desc(desc->wq, desc); } static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, @@ -251,25 +249,25 @@ static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, struct idxd_desc *desc, *t; struct llist_node *head; int queued = 0; - bool completed = false; unsigned long flags; + enum idxd_complete_type reason; *processed = 0; head = llist_del_all(&irq_entry->pending_llist); if (!head) goto out; - llist_for_each_entry_safe(desc, t, head, llnode) { - if (wtype == IRQ_WORK_NORMAL) - completed = complete_desc(desc); - else if (wtype == IRQ_WORK_PROCESS_FAULT) - completed = process_fault(desc, data); + if (wtype == IRQ_WORK_NORMAL) + reason = IDXD_COMPLETE_NORMAL; + else + reason = IDXD_COMPLETE_DEV_FAIL; - if (completed) { - idxd_free_desc(desc->wq, desc); + llist_for_each_entry_safe(desc, t, head, llnode) { + if (desc->completion->status) { + if ((desc->completion->status & DSA_COMP_STATUS_MASK) != DSA_COMP_SUCCESS) + match_fault(desc, data); + complete_desc(desc, reason); (*processed)++; - if (wtype == IRQ_WORK_PROCESS_FAULT) - break; } else { spin_lock_irqsave(&irq_entry->list_lock, flags); list_add_tail(&desc->list, @@ -287,42 +285,46 @@ static int irq_process_work_list(struct idxd_irq_entry *irq_entry, enum irq_work_type wtype, int *processed, u64 data) { - struct list_head *node, *next; int queued = 0; - bool completed = false; unsigned long flags; + LIST_HEAD(flist); + struct idxd_desc *desc, *n; + enum idxd_complete_type reason; *processed = 0; - spin_lock_irqsave(&irq_entry->list_lock, flags); - if (list_empty(&irq_entry->work_list)) - goto out; - - list_for_each_safe(node, next, &irq_entry->work_list) { - struct idxd_desc *desc = - container_of(node, struct idxd_desc, list); + if (wtype == IRQ_WORK_NORMAL) + reason = IDXD_COMPLETE_NORMAL; + else + reason = IDXD_COMPLETE_DEV_FAIL; + /* + * This lock protects list corruption from access of list outside of the irq handler + * thread. 
+ */ + spin_lock_irqsave(&irq_entry->list_lock, flags); + if (list_empty(&irq_entry->work_list)) { spin_unlock_irqrestore(&irq_entry->list_lock, flags); - if (wtype == IRQ_WORK_NORMAL) - completed = complete_desc(desc); - else if (wtype == IRQ_WORK_PROCESS_FAULT) - completed = process_fault(desc, data); + return 0; + } - if (completed) { - spin_lock_irqsave(&irq_entry->list_lock, flags); + list_for_each_entry_safe(desc, n, &irq_entry->work_list, list) { + if (desc->completion->status) { list_del(&desc->list); - spin_unlock_irqrestore(&irq_entry->list_lock, flags); - idxd_free_desc(desc->wq, desc); (*processed)++; - if (wtype == IRQ_WORK_PROCESS_FAULT) - return queued; + list_add_tail(&desc->list, &flist); } else { queued++; } - spin_lock_irqsave(&irq_entry->list_lock, flags); } - out: spin_unlock_irqrestore(&irq_entry->list_lock, flags); + + list_for_each_entry(desc, &flist, list) { + if ((desc->completion->status & DSA_COMP_STATUS_MASK) != DSA_COMP_SUCCESS) + match_fault(desc, data); + complete_desc(desc, reason); + } + return queued; } -- Gitee From eca6c4499811e8f3873000a7bcbf331279da9e54 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 15 Jan 2021 14:52:33 -0700 Subject: [PATCH 0583/2161] dmaengine: idxd: fix misc interrupt completion commit f5cc9ace24fbdf41b4814effbb2f9bad7046e988 upstream. Nikhil reported the misc interrupt handler can sometimes miss handling the command interrupt when an error interrupt happens near the same time. Have the irq handling thread continue to process the misc interrupts until all interrupts are processed. This is a low usage interrupt and is not expected to handle high volume traffic. Therefore there is no concern of this thread running for a long time. Fixes: 0d5c10b4c84d ("dmaengine: idxd: add work queue drain support") Reported-by: Nikhil Rao Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161074755329.2183844.13295528344116907983.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/irq.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index fe996cec7c74..a60ca11a5784 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -111,19 +111,14 @@ irqreturn_t idxd_irq_handler(int vec, void *data) return IRQ_WAKE_THREAD; } -irqreturn_t idxd_misc_thread(int vec, void *data) +static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) { - struct idxd_irq_entry *irq_entry = data; - struct idxd_device *idxd = irq_entry->idxd; struct device *dev = &idxd->pdev->dev; union gensts_reg gensts; - u32 cause, val = 0; + u32 val = 0; int i; bool err = false; - cause = ioread32(idxd->reg_base + IDXD_INTCAUSE_OFFSET); - iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET); - if (cause & IDXD_INTC_ERR) { spin_lock_bh(&idxd->dev_lock); for (i = 0; i < 4; i++) @@ -181,7 +176,7 @@ irqreturn_t idxd_misc_thread(int vec, void *data) val); if (!err) - goto out; + return 0; /* * This case should rarely happen and typically is due to software @@ -211,10 +206,33 @@ irqreturn_t idxd_misc_thread(int vec, void *data) gensts.reset_type == IDXD_DEVICE_RESET_FLR ? 
"FLR" : "system reset"); spin_unlock_bh(&idxd->dev_lock); + return -ENXIO; } } - out: + return 0; +} + +irqreturn_t idxd_misc_thread(int vec, void *data) +{ + struct idxd_irq_entry *irq_entry = data; + struct idxd_device *idxd = irq_entry->idxd; + int rc; + u32 cause; + + cause = ioread32(idxd->reg_base + IDXD_INTCAUSE_OFFSET); + if (cause) + iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET); + + while (cause) { + rc = process_misc_interrupts(idxd, cause); + if (rc < 0) + break; + cause = ioread32(idxd->reg_base + IDXD_INTCAUSE_OFFSET); + if (cause) + iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET); + } + idxd_unmask_msix_vector(idxd, irq_entry->id); return IRQ_HANDLED; } -- Gitee From 759bedc9a4df57391fe936496f29c7223773e042 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 1 Feb 2021 08:26:14 -0700 Subject: [PATCH 0584/2161] dmaengine: idxd: check device state before issue command commit 89e3becd8f821e507052e012d2559dcda59f538e upstream. Add device state check before executing command. Without the check the command can be issued while device is in halt state and causes the driver to block while waiting for the completion of the command. Reported-by: Sanjay Kumar Signed-off-by: Dave Jiang Tested-by: Sanjay Kumar Fixes: 0d5c10b4c84d ("dmaengine: idxd: add work queue drain support") Link: https://lore.kernel.org/r/161219313921.2976211.12222625226450097465.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 23 ++++++++++++++++++++++- drivers/dma/idxd/idxd.h | 2 +- drivers/dma/idxd/init.c | 5 ++++- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 95f94a3ed6be..84a6ea60ecf0 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -398,17 +398,31 @@ static inline bool idxd_is_enabled(struct idxd_device *idxd) return false; } +static inline bool idxd_device_is_halted(struct idxd_device *idxd) +{ + union gensts_reg gensts; + + gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET); + + return (gensts.state == IDXD_DEVICE_STATE_HALT); +} + /* * This is function is only used for reset during probe and will * poll for completion. Once the device is setup with interrupts, * all commands will be done via interrupt completion. 
*/ -void idxd_device_init_reset(struct idxd_device *idxd) +int idxd_device_init_reset(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; union idxd_command_reg cmd; unsigned long flags; + if (idxd_device_is_halted(idxd)) { + dev_warn(&idxd->pdev->dev, "Device is HALTED!\n"); + return -ENXIO; + } + memset(&cmd, 0, sizeof(cmd)); cmd.cmd = IDXD_CMD_RESET_DEVICE; dev_dbg(dev, "%s: sending reset for init.\n", __func__); @@ -419,6 +433,7 @@ void idxd_device_init_reset(struct idxd_device *idxd) IDXD_CMDSTS_ACTIVE) cpu_relax(); spin_unlock_irqrestore(&idxd->dev_lock, flags); + return 0; } static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, @@ -428,6 +443,12 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, DECLARE_COMPLETION_ONSTACK(done); unsigned long flags; + if (idxd_device_is_halted(idxd)) { + dev_warn(&idxd->pdev->dev, "Device is HALTED!\n"); + *status = IDXD_CMDSTS_HW_ERR; + return; + } + memset(&cmd, 0, sizeof(cmd)); cmd.cmd = cmd_code; cmd.operand = operand; diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 5a50e91c71bf..81a0e65fd316 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -326,7 +326,7 @@ void idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id); void idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id); /* device control */ -void idxd_device_init_reset(struct idxd_device *idxd); +int idxd_device_init_reset(struct idxd_device *idxd); int idxd_device_enable(struct idxd_device *idxd); int idxd_device_disable(struct idxd_device *idxd); void idxd_device_reset(struct idxd_device *idxd); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 25cc947c6179..957c7ee91519 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -335,7 +335,10 @@ static int idxd_probe(struct idxd_device *idxd) int rc; dev_dbg(dev, "%s entered and resetting device\n", __func__); - idxd_device_init_reset(idxd); + rc = idxd_device_init_reset(idxd); + if (rc < 0) + return rc; + dev_dbg(dev, "IDXD reset complete\n"); if (IS_ENABLED(CONFIG_INTEL_IDXD_SVM)) { -- Gitee From 153ceb9a2a61129286e2a75df385c21d6ef58766 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 7 Jan 2021 09:44:51 -0700 Subject: [PATCH 0585/2161] x86/asm: Annotate movdir64b()'s dst argument with __iomem commit 6ae58d871319dc22ef780baaacd393f8543a1e74 upstream. Add a missing __iomem annotation to address a sparse warning. The caller is expected to pass an __iomem annotated pointer to this function. The current usages send a 64-bytes command descriptor to an MMIO location (portal) on a device for consumption. When future usages for the MOVDIR64B instruction warrant a separate variant of a memory to memory operation, the argument annotation can be revisited. Also, from the comment in movdir64b() @__dst must be supplied as an lvalue because this tells the compiler what the object is (its size) the instruction accesses. I.e., not the pointers but what they point to, thus the deref'ing '*'." The actual sparse warning is: sparse warnings: (new ones prefixed by >>) drivers/dma/idxd/submit.c: note: in included file (through include/linux/io.h, include/linux/pci.h): >> arch/x86/include/asm/io.h:422:27: sparse: sparse: incorrect type in \ argument 1 (different address spaces) @@ expected void *dst @@ got void [noderef] __iomem *dst @@ arch/x86/include/asm/io.h:422:27: sparse: expected void *dst arch/x86/include/asm/io.h:422:27: sparse: got void [noderef] __iomem *dst [ bp: Massage commit message. 
] Fixes: 0888e1030d3e ("x86/asm: Carve out a generic movdir64b() helper for general usage") Reported-by: kernel test robot Signed-off-by: Dave Jiang Signed-off-by: Borislav Petkov Reviewed-by: Ben Widawsky Reviewed-by: Dan Williams Link: https://lkml.kernel.org/r/161003787823.4062451.6564503265464317197.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/special_insns.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 9f0f2d71f670..36797a9113a7 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -215,10 +215,10 @@ static inline void clwb(volatile void *__p) #define nop() asm volatile ("nop") /* The dst parameter must be 64-bytes aligned */ -static inline void movdir64b(void *dst, const void *src) +static inline void movdir64b(void __iomem *dst, const void *src) { const struct { char _[64]; } *__src = src; - struct { char _[64]; } *__dst = dst; + struct { char _[64]; } __iomem *__dst = dst; /* * MOVDIR64B %(rdx), rax. -- Gitee From accc8cf6bae3421e0c2b9a2956e41c55aff5cb57 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 7 Jan 2021 09:45:21 -0700 Subject: [PATCH 0586/2161] x86/asm: Add a missing __iomem annotation in enqcmds() commit 5c99720b28381bb400d4f546734c34ddaf608761 upstream. Add a missing __iomem annotation to address a sparse warning. The caller is expected to pass an __iomem annotated pointer to this function. The current usages send a 64-bytes command descriptor to an MMIO location (portal) on a device for consumption. Also, from the comment in movdir64b(), which also applies to enqcmds(), @__dst must be supplied as an lvalue because this tells the compiler what the object is (its size) the instruction accesses. I.e., not the pointers but what they point to, thus the deref'ing '*'." The actual sparse warning is: drivers/dma/idxd/submit.c: note: in included file (through arch/x86/include/asm/processor.h, \ arch/x86/include/asm/timex.h, include/linux/timex.h, include/linux/time32.h, \ include/linux/time.h, include/linux/stat.h, ...): ./arch/x86/include/asm/special_insns.h:289:41: warning: incorrect type in initializer (different address spaces) ./arch/x86/include/asm/special_insns.h:289:41: expected struct *__dst ./arch/x86/include/asm/special_insns.h:289:41: got void [noderef] __iomem *dst [ bp: Massage commit message. 
] Fixes: 7f5933f81bd8 ("x86/asm: Add an enqcmds() wrapper for the ENQCMDS instruction") Reported-by: kernel test robot Signed-off-by: Dave Jiang Signed-off-by: Borislav Petkov Reviewed-by: Ben Widawsky Reviewed-by: Dan Williams Link: https://lkml.kernel.org/r/161003789741.4062451.14362269365703761223.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/special_insns.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 36797a9113a7..a1f1672b3c2e 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -258,7 +258,7 @@ static inline void movdir64b(void __iomem *dst, const void *src) static inline int enqcmds(void __iomem *dst, const void *src) { const struct { char _[64]; } *__src = src; - struct { char _[64]; } *__dst = dst; + struct { char _[64]; } __iomem *__dst = dst; int zf; /* -- Gitee From 54f72fed4b98d3247c0d17ee0e326af504335d49 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 15 Jan 2021 14:53:07 -0700 Subject: [PATCH 0587/2161] dmaengine: idxd: set DMA channel to be private commit c06e424be5f5184468c5f761c0d2cf1ed0a4e0fc upstream. Add DMA_PRIVATE attribute flag to idxd DMA channels. The dedicated WQs are expected to be used by a single client and not shared. While doing NTB testing this mistake was discovered, which prevented ntb_transport from requesting DSA wqs as DMA channels via dma_request_channel(). Reported-by: Srinijia Kambham Signed-off-by: Dave Jiang Tested-by: Srinijia Kambham Fixes: 8f47d1a5e545 ("dmaengine: idxd: connect idxd to dmaengine subsystem") Link: https://lore.kernel.org/r/161074758743.2184057.3388557138816350980.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/dma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index 8ed2773d8285..90d19d06783a 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -165,6 +165,7 @@ int idxd_register_dma_device(struct idxd_device *idxd) INIT_LIST_HEAD(&dma->channels); dma->dev = &idxd->pdev->dev; + dma_cap_set(DMA_PRIVATE, dma->cap_mask); dma_cap_set(DMA_COMPLETION_NO_ORDER, dma->cap_mask); dma->device_release = idxd_dma_release; -- Gitee From 8c6bb40061f52f4bbb914f3f9d204bc694ed6b97 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 18 Jan 2021 10:28:44 -0700 Subject: [PATCH 0588/2161] dmaengine: move channel device_node deletion to driver commit e594443196d6e0ef3d3b30320c49b3a4d4f9a547 upstream. Channel device_node deletion is managed by the device driver rather than the dmaengine core. The deletion was accidentally introduced when making channel unregister dynamic. It causes xilinx_dma module to crash on unload as reported by Radhey. Remove chan->device_node delete in dmaengine and also fix up idxd driver. 
[ 42.142705] Internal error: Oops: 96000044 [#1] SMP [ 42.147566] Modules linked in: xilinx_dma(-) clk_xlnx_clock_wizard uio_pdrv_genirq [ 42.155139] CPU: 1 PID: 2075 Comm: rmmod Not tainted 5.10.1-00026-g3a2e6dd7a05-dirty #192 [ 42.163302] Hardware name: Enclustra XU5 SOM (DT) [ 42.167992] pstate: 40000005 (nZcv daif -PAN -UAO -TCO BTYPE=--) [ 42.173996] pc : xilinx_dma_chan_remove+0x74/0xa0 [xilinx_dma] [ 42.179815] lr : xilinx_dma_chan_remove+0x70/0xa0 [xilinx_dma] [ 42.185636] sp : ffffffc01112bca0 [ 42.188935] x29: ffffffc01112bca0 x28: ffffff80402ea640 xilinx_dma_chan_remove+0x74/0xa0: __list_del at ./include/linux/list.h:112 (inlined by) __list_del_entry at./include/linux/list.h:135 (inlined by) list_del at ./include/linux/list.h:146 (inlined by) xilinx_dma_chan_remove at drivers/dma/xilinx/xilinx_dma.c:2546 Fixes: e81274cd6b52 ("dmaengine: add support to dynamic register/unregister of channels") Reported-by: Radhey Shyam Pandey Signed-off-by: Dave Jiang Tested-by: Radhey Shyam Pandey Link: https://lore.kernel.org/r/161099092469.2495902.5064826526660062342.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Cc: stable@vger.kernel.org # 5.9+ Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 1 - drivers/dma/idxd/dma.c | 5 ++++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 962cbb5e5f7f..fe6a460c4373 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -1110,7 +1110,6 @@ static void __dma_async_device_channel_unregister(struct dma_device *device, "%s called while %d clients hold a reference\n", __func__, chan->client_count); mutex_lock(&dma_list_mutex); - list_del(&chan->device_node); device->chancnt--; chan->dev->chan = NULL; mutex_unlock(&dma_list_mutex); diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index 90d19d06783a..a15e50126434 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -206,5 +206,8 @@ int idxd_register_dma_channel(struct idxd_wq *wq) void idxd_unregister_dma_channel(struct idxd_wq *wq) { - dma_async_device_channel_unregister(&wq->idxd->dma_dev, &wq->dma_chan); + struct dma_chan *chan = &wq->dma_chan; + + dma_async_device_channel_unregister(&wq->idxd->dma_dev, chan); + list_del(&chan->device_node); } -- Gitee From d80891282ec15a809df2e2f76a3fc0f27a6e9504 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 22 Jan 2021 11:46:00 -0700 Subject: [PATCH 0589/2161] dmaengine: idxd: add module parameter to force disable of SVA commit 03d939c7e3d8800a9feb54808929c5776ac510eb upstream. Add a module parameter that overrides the SVA feature enabling. This keeps the driver in legacy mode even when intel_iommu=sm_on is set. In this mode, the descriptor fields must be programmed with dma_addr_t from the Linux DMA API for source, destination, and completion descriptors. 
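As a sketch of what legacy-mode programming means in practice, every address field of the hardware descriptor carries a dma_addr_t obtained from the DMA API. The helper below is illustrative only; error handling, unmapping and flag selection are omitted or simplified.

  #include <linux/dma-mapping.h>
  #include <uapi/linux/idxd.h>

  static void fill_memmove_desc(struct device *dev, struct dsa_hw_desc *hw,
                                void *src, void *dst, u32 len,
                                dma_addr_t compl_dma)
  {
          dma_addr_t src_dma = dma_map_single(dev, src, len, DMA_TO_DEVICE);
          dma_addr_t dst_dma = dma_map_single(dev, dst, len, DMA_FROM_DEVICE);

          hw->opcode = DSA_OPCODE_MEMMOVE;
          hw->flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_RCI;
          hw->src_addr = src_dma;         /* bus addresses, not pointers */
          hw->dst_addr = dst_dma;
          hw->xfer_size = len;
          hw->completion_addr = compl_dma;
  }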
Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161134110457.4005461.13171197785259115852.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ drivers/dma/idxd/init.c | 8 +++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 551e4859b616..ee1c67a3ecf7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1543,6 +1543,12 @@ In such case C2/C3 won't be used again. idle=nomwait: Disable mwait for CPU C-states + idxd.sva= [HW] + Format: + Allow force disabling of Shared Virtual Memory (SVA) + support for the idxd driver. By default it is set to + true (1). + ieee754= [MIPS] Select IEEE Std 754 conformance mode Format: { strict | legacy | 2008 | relaxed } Default: strict diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 957c7ee91519..085a0c3b62c6 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -26,6 +26,10 @@ MODULE_VERSION(IDXD_DRIVER_VERSION); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Intel Corporation"); +static bool sva = true; +module_param(sva, bool, 0644); +MODULE_PARM_DESC(sva, "Toggle SVA support on/off"); + #define DRV_NAME "idxd" bool support_enqcmd; @@ -341,12 +345,14 @@ static int idxd_probe(struct idxd_device *idxd) dev_dbg(dev, "IDXD reset complete\n"); - if (IS_ENABLED(CONFIG_INTEL_IDXD_SVM)) { + if (IS_ENABLED(CONFIG_INTEL_IDXD_SVM) && sva) { rc = idxd_enable_system_pasid(idxd); if (rc < 0) dev_warn(dev, "Failed to enable PASID. No SVA support: %d\n", rc); else set_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags); + } else if (!sva) { + dev_warn(dev, "User forced SVA off via module param.\n"); } idxd_read_caps(idxd); -- Gitee From efd22242f7c8fa8a277a120efa69e8678da4262c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 16 Feb 2021 17:13:42 -0700 Subject: [PATCH 0590/2161] dmaengine: idxd: Fix clobbering of SWERR overflow bit on writeback commit ea941ac294d75d0ace50797aebf0056f6f8f7a7f upstream. Current code blindly writes over the SWERR and the OVERFLOW bits. Write back the bits actually read instead so the driver avoids clobbering the OVERFLOW bit that comes after the register is read. 
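The same idea in isolation, as a sketch with placeholder names: acknowledge only the write-1-to-clear bits that were actually observed, so a bit that latches after the read (such as the overflow indicator) is not cleared by accident.

  #include <linux/io.h>
  #include <linux/io-64-nonatomic-lo-hi.h>

  static u64 ack_observed_errors(void __iomem *err_reg, u64 ack_mask)
  {
          u64 seen = ioread64(err_reg);

          /* write back only what was read, masked to acknowledgeable bits */
          iowrite64(seen & ack_mask, err_reg);
          return seen;
  }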
Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Reported-by: Sanjay Kumar Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161352082229.3511254.1002151220537623503.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/irq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index a60ca11a5784..f1463fc58112 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -124,7 +124,9 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) for (i = 0; i < 4; i++) idxd->sw_err.bits[i] = ioread64(idxd->reg_base + IDXD_SWERR_OFFSET + i * sizeof(u64)); - iowrite64(IDXD_SWERR_ACK, idxd->reg_base + IDXD_SWERR_OFFSET); + + iowrite64(idxd->sw_err.bits[0] & IDXD_SWERR_ACK, + idxd->reg_base + IDXD_SWERR_OFFSET); if (idxd->sw_err.valid && idxd->sw_err.wq_idx_valid) { int id = idxd->sw_err.wq_idx; -- Gitee From 2d70824ee2f50c52e9d69426bbc1b753776a2793 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 22 Mar 2021 16:36:25 -0700 Subject: [PATCH 0591/2161] dmaengine: idxd: fix delta_rec and crc size field for completion record commit 4ac823e9cd85f66da274c951d21bf9f6b714b729 upstream. The delta_rec_size and crc_val in the completion record should be 32bits and not 16bits. Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Reported-by: Nikhil Rao Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161645618572.2003490.14466173451736323035.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/idxd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 236d437947bc..e33997b4d750 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -247,8 +247,8 @@ struct dsa_completion_record { uint32_t rsvd2:8; }; - uint16_t delta_rec_size; - uint16_t crc_val; + uint32_t delta_rec_size; + uint32_t crc_val; /* DIF check & strip */ struct { -- Gitee From 5f10ba09a8d09fae50a50564b44e29f1a8cb0b91 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 22 Mar 2021 16:37:29 -0700 Subject: [PATCH 0592/2161] dmaengine: idxd: fix opcap sysfs attribute output commit ea6a5735d2a61b938a302eb3629272342a9e7c46 upstream. The operation capability register is 256bits. The current output only prints out the first 64bits. Fix to output the entire 256bits. The current code omits operation caps from IAX devices. 
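The capability register is stored as four 64-bit words; a sketch of emitting all of them from the sysfs show() callback with sysfs_emit_at(), mirroring the hunk below (the function name is illustrative, not a drop-in replacement):

/* Sketch: print a 256-bit capability register (four u64 words) from a
 * sysfs "show" callback. sysfs_emit_at() appends at the given offset
 * and returns the number of bytes written.
 */
static ssize_t op_cap_show_sketch(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev);
	int i, rc = 0;

	for (i = 0; i < 4; i++)
		rc += sysfs_emit_at(buf, rc, "%#llx ", idxd->hw.opcap.bits[i]);

	rc--;				/* drop the trailing space */
	rc += sysfs_emit_at(buf, rc, "\n");
	return rc;
}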
Fixes: c52ca478233c ("dmaengine: idxd: add configuration component of driver") Reported-by: Lucas Van Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161645624963.2003736.829798666998490151.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 816c36c13f53..15342777aa79 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1447,8 +1447,14 @@ static ssize_t op_cap_show(struct device *dev, { struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); + int i, rc = 0; + + for (i = 0; i < 4; i++) + rc += sysfs_emit_at(buf, rc, "%#llx ", idxd->hw.opcap.bits[i]); - return sprintf(buf, "%#llx\n", idxd->hw.opcap.bits[0]); + rc--; + rc += sysfs_emit_at(buf, rc, "\n"); + return rc; } static DEVICE_ATTR_RO(op_cap); -- Gitee From 019568b12b674957a13be7477cd1b221457f8e87 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 7 Apr 2021 12:59:47 -0700 Subject: [PATCH 0593/2161] dmaengine: idxd: fix wq size store permission state commit 0fff71c5a311e1264988179f7dcc217fda15fadd upstream. WQ size can only be changed when the device is disabled. Current code allows change when device is enabled but wq is disabled. Change the check to detect device state. Fixes: c52ca478233c ("dmaengine: idxd: add configuration component of driver") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161782558755.107710.18138252584838406025.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 15342777aa79..93a3ae9e2057 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -987,7 +987,7 @@ static ssize_t wq_size_store(struct device *dev, if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) return -EPERM; - if (wq->state != IDXD_WQ_DISABLED) + if (idxd->state == IDXD_DEV_ENABLED) return -EPERM; if (size + total_claimed_wq_size(idxd) - wq->size > idxd->max_wq_size) -- Gitee From 17b39f76dfe3832d91283f0c99e1fc2b3c838775 Mon Sep 17 00:00:00 2001 From: Lv Yunlong Date: Tue, 30 Mar 2021 18:44:58 -0700 Subject: [PATCH 0594/2161] dmaengine: Fix a double free in dma_async_device_register commit ea45b6008f8095db0cc09ad6e03c7785c2986197 upstream. In the first list_for_each_entry() macro of dma_async_device_register, it gets the chan from list and calls __dma_async_device_channel_register (..,chan). We can see that chan->local is allocated by alloc_percpu() and it is freed chan->local by free_percpu(chan->local) when __dma_async_device_channel_register() failed. But after __dma_async_device_channel_register() failed, the caller will goto err_out and freed the chan->local in the second time by free_percpu(). The cause of this problem is forget to set chan->local to NULL when chan->local was freed in __dma_async_device_channel_register(). My patch sets chan->local to NULL when the callee failed to avoid double free. 
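A short sketch of the free-then-NULL idiom the one-line fix applies (the wrapper name is illustrative); free_percpu() treats a NULL pointer as a no-op, so clearing the field makes a later cleanup pass over the same channel harmless.

/* Illustrative only: free the percpu area and clear the pointer so a
 * second free_percpu(chan->local) in the caller's error path is a no-op
 * instead of a double free.
 */
static void channel_local_cleanup_sketch(struct dma_chan *chan)
{
	free_percpu(chan->local);
	chan->local = NULL;
}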
Fixes: d2fb0a0438384 ("dmaengine: break out channel registration") Signed-off-by: Lv Yunlong Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20210331014458.3944-1-lyl2019@mail.ustc.edu.cn Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index fe6a460c4373..af3ee288bc11 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -1086,6 +1086,7 @@ static int __dma_async_device_channel_register(struct dma_device *device, kfree(chan->dev); err_free_local: free_percpu(chan->local); + chan->local = NULL; return rc; } -- Gitee From d147030b3a2ab77947924e3f2f229c0ccbe994fb Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 12 Apr 2021 09:23:27 -0700 Subject: [PATCH 0595/2161] dmaengine: idxd: clear MSIX permission entry on shutdown commit 6df0e6c57dfc064af330071f372f11aa8c584997 upstream. Add disabling/clearing of MSIX permission entries on device shutdown to mirror the enabling of the MSIX entries on probe. Current code left the MSIX enabled and the pasid entries still programmed at device shutdown. Fixes: 8e50d392652f ("dmaengine: idxd: Add shared workqueue support") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161824457969.882533.6020239898682672311.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 30 ++++++++++++++++++++++++++++++ drivers/dma/idxd/idxd.h | 2 ++ drivers/dma/idxd/init.c | 11 ++--------- 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 84a6ea60ecf0..c09687013d29 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -574,6 +574,36 @@ void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid) } /* Device configuration bits */ +void idxd_msix_perm_setup(struct idxd_device *idxd) +{ + union msix_perm mperm; + int i, msixcnt; + + msixcnt = pci_msix_vec_count(idxd->pdev); + if (msixcnt < 0) + return; + + mperm.bits = 0; + mperm.pasid = idxd->pasid; + mperm.pasid_en = device_pasid_enabled(idxd); + for (i = 1; i < msixcnt; i++) + iowrite32(mperm.bits, idxd->reg_base + idxd->msix_perm_offset + i * 8); +} + +void idxd_msix_perm_clear(struct idxd_device *idxd) +{ + union msix_perm mperm; + int i, msixcnt; + + msixcnt = pci_msix_vec_count(idxd->pdev); + if (msixcnt < 0) + return; + + mperm.bits = 0; + for (i = 1; i < msixcnt; i++) + iowrite32(mperm.bits, idxd->reg_base + idxd->msix_perm_offset + i * 8); +} + static void idxd_group_config_write(struct idxd_group *group) { struct idxd_device *idxd = group->idxd; diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 81a0e65fd316..eda2ee10501f 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -316,6 +316,8 @@ void idxd_unregister_driver(void); struct bus_type *idxd_get_bus_type(struct idxd_device *idxd); /* device interrupt control */ +void idxd_msix_perm_setup(struct idxd_device *idxd); +void idxd_msix_perm_clear(struct idxd_device *idxd); irqreturn_t idxd_irq_handler(int vec, void *data); irqreturn_t idxd_misc_thread(int vec, void *data); irqreturn_t idxd_wq_thread(int irq, void *data); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 085a0c3b62c6..6584b0ec07d5 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -65,7 +65,6 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) struct 
idxd_irq_entry *irq_entry; int i, msixcnt; int rc = 0; - union msix_perm mperm; msixcnt = pci_msix_vec_count(pdev); if (msixcnt < 0) { @@ -144,14 +143,7 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) } idxd_unmask_error_interrupts(idxd); - - /* Setup MSIX permission table */ - mperm.bits = 0; - mperm.pasid = idxd->pasid; - mperm.pasid_en = device_pasid_enabled(idxd); - for (i = 1; i < msixcnt; i++) - iowrite32(mperm.bits, idxd->reg_base + idxd->msix_perm_offset + i * 8); - + idxd_msix_perm_setup(idxd); return 0; err_no_irq: @@ -510,6 +502,7 @@ static void idxd_shutdown(struct pci_dev *pdev) idxd_flush_work_list(irq_entry); } + idxd_msix_perm_clear(idxd); destroy_workqueue(idxd->wq); } -- Gitee From 79bdbc558cc55ed62900669768e30b6ce2d8dc74 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 12 Apr 2021 09:02:36 -0700 Subject: [PATCH 0596/2161] dmaengine: idxd: fix wq cleanup of WQCFG registers commit ea9aadc06a9f10ad20a90edc0a484f1147d88a7a upstream. A pre-release silicon erratum workaround where wq reset does not clear WQCFG registers was leaked into upstream code. Use wq reset command instead of blasting the MMIO region. This also address an issue where we clobber registers in future devices. Fixes: da32b28c95a7 ("dmaengine: idxd: cleanup workqueue config after disabling") Reported-by: Shreenivaas Devarajan Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161824330020.881560.16375921906426627033.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 35 ++++++++++++++++++++++++----------- drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/sysfs.c | 9 ++------- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index c09687013d29..31c819544a22 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -282,6 +282,22 @@ void idxd_wq_drain(struct idxd_wq *wq) idxd_cmd_exec(idxd, IDXD_CMD_DRAIN_WQ, operand, NULL); } +void idxd_wq_reset(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + u32 operand; + + if (wq->state != IDXD_WQ_ENABLED) { + dev_dbg(dev, "WQ %d in wrong state: %d\n", wq->id, wq->state); + return; + } + + operand = BIT(wq->id % 16) | ((wq->id / 16) << 16); + idxd_cmd_exec(idxd, IDXD_CMD_RESET_WQ, operand, NULL); + wq->state = IDXD_WQ_DISABLED; +} + int idxd_wq_map_portal(struct idxd_wq *wq) { struct idxd_device *idxd = wq->idxd; @@ -363,8 +379,6 @@ int idxd_wq_disable_pasid(struct idxd_wq *wq) void idxd_wq_disable_cleanup(struct idxd_wq *wq) { struct idxd_device *idxd = wq->idxd; - struct device *dev = &idxd->pdev->dev; - int i, wq_offset; lockdep_assert_held(&idxd->dev_lock); memset(wq->wqcfg, 0, idxd->wqcfg_size); @@ -376,14 +390,6 @@ void idxd_wq_disable_cleanup(struct idxd_wq *wq) wq->ats_dis = 0; clear_bit(WQ_FLAG_DEDICATED, &wq->flags); memset(wq->name, 0, WQ_NAME_SIZE); - - for (i = 0; i < WQCFG_STRIDES(idxd); i++) { - wq_offset = WQCFG_OFFSET(idxd, wq->id, i); - iowrite32(0, idxd->reg_base + wq_offset); - dev_dbg(dev, "WQ[%d][%d][%#x]: %#x\n", - wq->id, i, wq_offset, - ioread32(idxd->reg_base + wq_offset)); - } } /* Device control bits */ @@ -672,7 +678,14 @@ static int idxd_wq_config_write(struct idxd_wq *wq) if (!wq->group) return 0; - memset(wq->wqcfg, 0, idxd->wqcfg_size); + /* + * Instead of memset the entire shadow copy of WQCFG, copy from the hardware after + * wq reset. 
This will copy back the sticky values that are present on some devices. + */ + for (i = 0; i < WQCFG_STRIDES(idxd); i++) { + wq_offset = WQCFG_OFFSET(idxd, wq->id, i); + wq->wqcfg->bits[i] = ioread32(idxd->reg_base + wq_offset); + } /* byte 0-3 */ wq->wqcfg->wq_size = wq->size; diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index eda2ee10501f..76014c14f473 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -343,6 +343,7 @@ void idxd_wq_free_resources(struct idxd_wq *wq); int idxd_wq_enable(struct idxd_wq *wq); int idxd_wq_disable(struct idxd_wq *wq); void idxd_wq_drain(struct idxd_wq *wq); +void idxd_wq_reset(struct idxd_wq *wq); int idxd_wq_map_portal(struct idxd_wq *wq); void idxd_wq_unmap_portal(struct idxd_wq *wq); void idxd_wq_disable_cleanup(struct idxd_wq *wq); diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 93a3ae9e2057..39444af44dd7 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -275,7 +275,6 @@ static void disable_wq(struct idxd_wq *wq) { struct idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; - int rc; mutex_lock(&wq->wq_lock); dev_dbg(dev, "%s removing WQ %s\n", __func__, dev_name(&wq->conf_dev)); @@ -296,17 +295,13 @@ static void disable_wq(struct idxd_wq *wq) idxd_wq_unmap_portal(wq); idxd_wq_drain(wq); - rc = idxd_wq_disable(wq); + idxd_wq_reset(wq); idxd_wq_free_resources(wq); wq->client_count = 0; mutex_unlock(&wq->wq_lock); - if (rc < 0) - dev_warn(dev, "Failed to disable %s: %d\n", - dev_name(&wq->conf_dev), rc); - else - dev_info(dev, "wq %s disabled\n", dev_name(&wq->conf_dev)); + dev_info(dev, "wq %s disabled\n", dev_name(&wq->conf_dev)); } static int idxd_config_bus_remove(struct device *dev) -- Gitee From 51d7e5dfe7bb6c90453c88e10a824e3855416b1a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 15 Apr 2021 12:06:54 +0100 Subject: [PATCH 0597/2161] dmaengine: idxd: Fix potential null dereference on pointer status commit 28ac8e03c43dfc6a703aa420d18222540b801120 upstream. There are calls to idxd_cmd_exec that pass a null status pointer however a recent commit has added an assignment to *status that can end up with a null pointer dereference. The function expects a null status pointer sometimes as there is a later assignment to *status where status is first null checked. Fix the issue by null checking status before making the assignment. Addresses-Coverity: ("Explicit null dereferenced") Fixes: 89e3becd8f82 ("dmaengine: idxd: check device state before issue command") Signed-off-by: Colin Ian King Acked-by: Dave Jiang Link: https://lore.kernel.org/r/20210415110654.1941580-1-colin.king@canonical.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 31c819544a22..78d2dc5e9bd8 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -451,7 +451,8 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, if (idxd_device_is_halted(idxd)) { dev_warn(&idxd->pdev->dev, "Device is HALTED!\n"); - *status = IDXD_CMDSTS_HW_ERR; + if (status) + *status = IDXD_CMDSTS_HW_ERR; return; } -- Gitee From eb1863351a2965f026b9f0c3e53045d62df66e91 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:37:10 -0700 Subject: [PATCH 0598/2161] dmaengine: idxd: fix dma device lifetime commit 397862855619271296e46d10f7dfa7bafe71eb81 upstream. 
The devm managed lifetime is incompatible with 'struct device' objects that resides in idxd context. This is one of the series that clean up the idxd driver 'struct device' lifetime. Remove embedding of dma_device and dma_chan in idxd since it's not the only interface that idxd will use. The freeing of the dma_device will be managed by the ->release() function. Reported-by: Jason Gunthorpe Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Signed-off-by: Dave Jiang Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/161852983001.2203940.14817017492384561719.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 2 - drivers/dma/idxd/dma.c | 77 ++++++++++++++++++++++++++++++++------- drivers/dma/idxd/idxd.h | 18 +++++++-- 3 files changed, 79 insertions(+), 18 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 78d2dc5e9bd8..d255bb016c4d 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -186,8 +186,6 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) desc->id = i; desc->wq = wq; desc->cpu = -1; - dma_async_tx_descriptor_init(&desc->txd, &wq->dma_chan); - desc->txd.tx_submit = idxd_dma_tx_submit; } return 0; diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index a15e50126434..77439b645044 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -14,7 +14,10 @@ static inline struct idxd_wq *to_idxd_wq(struct dma_chan *c) { - return container_of(c, struct idxd_wq, dma_chan); + struct idxd_dma_chan *idxd_chan; + + idxd_chan = container_of(c, struct idxd_dma_chan, chan); + return idxd_chan->wq; } void idxd_dma_complete_txd(struct idxd_desc *desc, @@ -135,7 +138,7 @@ static void idxd_dma_issue_pending(struct dma_chan *dma_chan) { } -dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx) +static dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx) { struct dma_chan *c = tx->chan; struct idxd_wq *wq = to_idxd_wq(c); @@ -156,14 +159,25 @@ dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx) static void idxd_dma_release(struct dma_device *device) { + struct idxd_dma_dev *idxd_dma = container_of(device, struct idxd_dma_dev, dma); + + kfree(idxd_dma); } int idxd_register_dma_device(struct idxd_device *idxd) { - struct dma_device *dma = &idxd->dma_dev; + struct idxd_dma_dev *idxd_dma; + struct dma_device *dma; + struct device *dev = &idxd->pdev->dev; + int rc; + idxd_dma = kzalloc_node(sizeof(*idxd_dma), GFP_KERNEL, dev_to_node(dev)); + if (!idxd_dma) + return -ENOMEM; + + dma = &idxd_dma->dma; INIT_LIST_HEAD(&dma->channels); - dma->dev = &idxd->pdev->dev; + dma->dev = dev; dma_cap_set(DMA_PRIVATE, dma->cap_mask); dma_cap_set(DMA_COMPLETION_NO_ORDER, dma->cap_mask); @@ -179,35 +193,72 @@ int idxd_register_dma_device(struct idxd_device *idxd) dma->device_alloc_chan_resources = idxd_dma_alloc_chan_resources; dma->device_free_chan_resources = idxd_dma_free_chan_resources; - return dma_async_device_register(&idxd->dma_dev); + rc = dma_async_device_register(dma); + if (rc < 0) { + kfree(idxd_dma); + return rc; + } + + idxd_dma->idxd = idxd; + /* + * This pointer is protected by the refs taken by the dma_chan. It will remain valid + * as long as there are outstanding channels. 
+ */ + idxd->idxd_dma = idxd_dma; + return 0; } void idxd_unregister_dma_device(struct idxd_device *idxd) { - dma_async_device_unregister(&idxd->dma_dev); + dma_async_device_unregister(&idxd->idxd_dma->dma); } int idxd_register_dma_channel(struct idxd_wq *wq) { struct idxd_device *idxd = wq->idxd; - struct dma_device *dma = &idxd->dma_dev; - struct dma_chan *chan = &wq->dma_chan; - int rc; + struct dma_device *dma = &idxd->idxd_dma->dma; + struct device *dev = &idxd->pdev->dev; + struct idxd_dma_chan *idxd_chan; + struct dma_chan *chan; + int rc, i; + + idxd_chan = kzalloc_node(sizeof(*idxd_chan), GFP_KERNEL, dev_to_node(dev)); + if (!idxd_chan) + return -ENOMEM; - memset(&wq->dma_chan, 0, sizeof(struct dma_chan)); + chan = &idxd_chan->chan; chan->device = dma; list_add_tail(&chan->device_node, &dma->channels); + + for (i = 0; i < wq->num_descs; i++) { + struct idxd_desc *desc = wq->descs[i]; + + dma_async_tx_descriptor_init(&desc->txd, chan); + desc->txd.tx_submit = idxd_dma_tx_submit; + } + rc = dma_async_device_channel_register(dma, chan); - if (rc < 0) + if (rc < 0) { + kfree(idxd_chan); return rc; + } + + wq->idxd_chan = idxd_chan; + idxd_chan->wq = wq; + get_device(&wq->conf_dev); return 0; } void idxd_unregister_dma_channel(struct idxd_wq *wq) { - struct dma_chan *chan = &wq->dma_chan; + struct idxd_dma_chan *idxd_chan = wq->idxd_chan; + struct dma_chan *chan = &idxd_chan->chan; + struct idxd_dma_dev *idxd_dma = wq->idxd->idxd_dma; - dma_async_device_channel_unregister(&wq->idxd->dma_dev, chan); + dma_async_device_channel_unregister(&idxd_dma->dma, chan); list_del(&chan->device_node); + kfree(wq->idxd_chan); + wq->idxd_chan = NULL; + put_device(&wq->conf_dev); } diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 76014c14f473..80e534680c9a 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -14,6 +14,9 @@ extern struct kmem_cache *idxd_desc_pool; +struct idxd_device; +struct idxd_wq; + #define IDXD_REG_TIMEOUT 50 #define IDXD_DRAIN_TIMEOUT 5000 @@ -96,6 +99,11 @@ enum idxd_complete_type { IDXD_COMPLETE_DEV_FAIL, }; +struct idxd_dma_chan { + struct dma_chan chan; + struct idxd_wq *wq; +}; + struct idxd_wq { void __iomem *portal; struct device conf_dev; @@ -125,7 +133,7 @@ struct idxd_wq { int compls_size; struct idxd_desc **descs; struct sbitmap_queue sbq; - struct dma_chan dma_chan; + struct idxd_dma_chan *idxd_chan; char name[WQ_NAME_SIZE + 1]; u64 max_xfer_bytes; u32 max_batch_size; @@ -162,6 +170,11 @@ enum idxd_device_flag { IDXD_FLAG_PASID_ENABLED, }; +struct idxd_dma_dev { + struct idxd_device *idxd; + struct dma_device dma; +}; + struct idxd_device { enum idxd_type type; struct device conf_dev; @@ -210,7 +223,7 @@ struct idxd_device { int num_wq_irqs; struct idxd_irq_entry *irq_entries; - struct dma_device dma_dev; + struct idxd_dma_dev *idxd_dma; struct workqueue_struct *wq; struct work_struct work; }; @@ -363,7 +376,6 @@ void idxd_unregister_dma_channel(struct idxd_wq *wq); void idxd_parse_completion_status(u8 status, enum dmaengine_tx_result *res); void idxd_dma_complete_txd(struct idxd_desc *desc, enum idxd_complete_type comp_type); -dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx); /* cdev */ int idxd_cdev_register(void); -- Gitee From 8348584419a1fa7353200d9b4e15be03e8a66a78 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:37:15 -0700 Subject: [PATCH 0599/2161] dmaengine: idxd: cleanup pci interrupt vector allocation management commit 5fc8e85ff12ce0530ac658686902a0ee64600f56 upstream. 
The devm managed lifetime is incompatible with 'struct device' objects that resides in idxd context. This is one of the series that clean up the idxd driver 'struct device' lifetime. Remove devm managed pci interrupt vectors and replace with unmanged allocators. Reported-by: Jason Gunthorpe Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Signed-off-by: Dave Jiang Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/161852983563.2203940.8116028229124776669.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 4 +-- drivers/dma/idxd/idxd.h | 2 +- drivers/dma/idxd/init.c | 64 +++++++++++++++++---------------------- 3 files changed, 30 insertions(+), 40 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index d255bb016c4d..3f696abd74ac 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -19,7 +19,7 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, /* Interrupt control bits */ void idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id) { - struct irq_data *data = irq_get_irq_data(idxd->msix_entries[vec_id].vector); + struct irq_data *data = irq_get_irq_data(idxd->irq_entries[vec_id].vector); pci_msi_mask_irq(data); } @@ -36,7 +36,7 @@ void idxd_mask_msix_vectors(struct idxd_device *idxd) void idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id) { - struct irq_data *data = irq_get_irq_data(idxd->msix_entries[vec_id].vector); + struct irq_data *data = irq_get_irq_data(idxd->irq_entries[vec_id].vector); pci_msi_unmask_irq(data); } diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 80e534680c9a..401b035e42b1 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -36,6 +36,7 @@ struct idxd_device_driver { struct idxd_irq_entry { struct idxd_device *idxd; int id; + int vector; struct llist_head pending_llist; struct list_head work_list; /* @@ -219,7 +220,6 @@ struct idxd_device { union sw_err_reg sw_err; wait_queue_head_t cmd_waitq; - struct msix_entry *msix_entries; int num_wq_irqs; struct idxd_irq_entry *irq_entries; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 6584b0ec07d5..5cf1bf095ae1 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -61,7 +61,6 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) { struct pci_dev *pdev = idxd->pdev; struct device *dev = &pdev->dev; - struct msix_entry *msix; struct idxd_irq_entry *irq_entry; int i, msixcnt; int rc = 0; @@ -69,23 +68,13 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) msixcnt = pci_msix_vec_count(pdev); if (msixcnt < 0) { dev_err(dev, "Not MSI-X interrupt capable.\n"); - goto err_no_irq; + return -ENOSPC; } - idxd->msix_entries = devm_kzalloc(dev, sizeof(struct msix_entry) * - msixcnt, GFP_KERNEL); - if (!idxd->msix_entries) { - rc = -ENOMEM; - goto err_no_irq; - } - - for (i = 0; i < msixcnt; i++) - idxd->msix_entries[i].entry = i; - - rc = pci_enable_msix_exact(pdev, idxd->msix_entries, msixcnt); - if (rc) { - dev_err(dev, "Failed enabling %d MSIX entries.\n", msixcnt); - goto err_no_irq; + rc = pci_alloc_irq_vectors(pdev, msixcnt, msixcnt, PCI_IRQ_MSIX); + if (rc != msixcnt) { + dev_err(dev, "Failed enabling %d MSIX entries: %d\n", msixcnt, rc); + return -ENOSPC; } dev_dbg(dev, "Enabled %d msix vectors\n", msixcnt); @@ -98,58 +87,57 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) GFP_KERNEL); if 
(!idxd->irq_entries) { rc = -ENOMEM; - goto err_no_irq; + goto err_irq_entries; } for (i = 0; i < msixcnt; i++) { idxd->irq_entries[i].id = i; idxd->irq_entries[i].idxd = idxd; + idxd->irq_entries[i].vector = pci_irq_vector(pdev, i); spin_lock_init(&idxd->irq_entries[i].list_lock); } - msix = &idxd->msix_entries[0]; irq_entry = &idxd->irq_entries[0]; - rc = devm_request_threaded_irq(dev, msix->vector, idxd_irq_handler, - idxd_misc_thread, 0, "idxd-misc", - irq_entry); + rc = request_threaded_irq(irq_entry->vector, idxd_irq_handler, idxd_misc_thread, + 0, "idxd-misc", irq_entry); if (rc < 0) { dev_err(dev, "Failed to allocate misc interrupt.\n"); - goto err_no_irq; + goto err_misc_irq; } - dev_dbg(dev, "Allocated idxd-misc handler on msix vector %d\n", - msix->vector); + dev_dbg(dev, "Allocated idxd-misc handler on msix vector %d\n", irq_entry->vector); /* first MSI-X entry is not for wq interrupts */ idxd->num_wq_irqs = msixcnt - 1; for (i = 1; i < msixcnt; i++) { - msix = &idxd->msix_entries[i]; irq_entry = &idxd->irq_entries[i]; init_llist_head(&idxd->irq_entries[i].pending_llist); INIT_LIST_HEAD(&idxd->irq_entries[i].work_list); - rc = devm_request_threaded_irq(dev, msix->vector, - idxd_irq_handler, - idxd_wq_thread, 0, - "idxd-portal", irq_entry); + rc = request_threaded_irq(irq_entry->vector, idxd_irq_handler, + idxd_wq_thread, 0, "idxd-portal", irq_entry); if (rc < 0) { - dev_err(dev, "Failed to allocate irq %d.\n", - msix->vector); - goto err_no_irq; + dev_err(dev, "Failed to allocate irq %d.\n", irq_entry->vector); + goto err_wq_irqs; } - dev_dbg(dev, "Allocated idxd-msix %d for vector %d\n", - i, msix->vector); + dev_dbg(dev, "Allocated idxd-msix %d for vector %d\n", i, irq_entry->vector); } idxd_unmask_error_interrupts(idxd); idxd_msix_perm_setup(idxd); return 0; - err_no_irq: + err_wq_irqs: + while (--i >= 0) { + irq_entry = &idxd->irq_entries[i]; + free_irq(irq_entry->vector, irq_entry); + } + err_misc_irq: /* Disable error interrupt generation */ idxd_mask_error_interrupts(idxd); - pci_disable_msix(pdev); + err_irq_entries: + pci_free_irq_vectors(pdev); dev_err(dev, "No usable interrupts\n"); return rc; } @@ -495,7 +483,8 @@ static void idxd_shutdown(struct pci_dev *pdev) for (i = 0; i < msixcnt; i++) { irq_entry = &idxd->irq_entries[i]; - synchronize_irq(idxd->msix_entries[i].vector); + synchronize_irq(irq_entry->vector); + free_irq(irq_entry->vector, irq_entry); if (i == 0) continue; idxd_flush_pending_llist(irq_entry); @@ -503,6 +492,7 @@ static void idxd_shutdown(struct pci_dev *pdev) } idxd_msix_perm_clear(idxd); + pci_free_irq_vectors(pdev); destroy_workqueue(idxd->wq); } -- Gitee From 4482326bcef5c4052b0ec8e2030c10040b800715 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:37:21 -0700 Subject: [PATCH 0600/2161] dmaengine: idxd: removal of pcim managed mmio mapping commit a39c7cd0438ee2f0b859ee1eb86cdc52217d2223 upstream. The devm managed lifetime is incompatible with 'struct device' objects that resides in idxd context. This is one of the series that clean up the idxd driver 'struct device' lifetime. Remove pcim_* management of the PCI device and the ioremap of MMIO BAR and replace with unmanaged versions. This is for consistency of removing all the pcim/devm based calls. 
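With the pcim_*() helpers gone, device enable and BAR mapping need explicit teardown; a hedged sketch of the unmanaged pattern with goto-based unwinding (helper name and labels are illustrative, BAR number assumed 0):

/* Sketch: unmanaged PCI enable + BAR mapping, unwound by hand.
 * On remove, the reverse order applies: pci_iounmap() then
 * pci_disable_device().
 */
static int probe_map_sketch(struct pci_dev *pdev, void __iomem **reg_base)
{
	int rc;

	rc = pci_enable_device(pdev);
	if (rc)
		return rc;

	*reg_base = pci_iomap(pdev, 0, 0);	/* BAR 0, map whole length */
	if (!*reg_base) {
		rc = -ENOMEM;
		goto err_iomap;
	}
	return 0;

err_iomap:
	pci_disable_device(pdev);
	return rc;
}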
Reported-by: Jason Gunthorpe Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Signed-off-by: Dave Jiang Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/161852984150.2203940.8043988289748519056.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 5cf1bf095ae1..11a2e14b5b80 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -384,32 +384,36 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) struct idxd_device *idxd; int rc; - rc = pcim_enable_device(pdev); + rc = pci_enable_device(pdev); if (rc) return rc; dev_dbg(dev, "Alloc IDXD context\n"); idxd = idxd_alloc(pdev); - if (!idxd) - return -ENOMEM; + if (!idxd) { + rc = -ENOMEM; + goto err_idxd_alloc; + } dev_dbg(dev, "Mapping BARs\n"); - idxd->reg_base = pcim_iomap(pdev, IDXD_MMIO_BAR, 0); - if (!idxd->reg_base) - return -ENOMEM; + idxd->reg_base = pci_iomap(pdev, IDXD_MMIO_BAR, 0); + if (!idxd->reg_base) { + rc = -ENOMEM; + goto err_iomap; + } dev_dbg(dev, "Set DMA masks\n"); rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); if (rc) rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) - return rc; + goto err; rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); if (rc) rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) - return rc; + goto err; idxd_set_type(idxd); @@ -423,13 +427,13 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) rc = idxd_probe(idxd); if (rc) { dev_err(dev, "Intel(R) IDXD DMA Engine init failed\n"); - return -ENODEV; + goto err; } rc = idxd_setup_sysfs(idxd); if (rc) { dev_err(dev, "IDXD sysfs setup failed\n"); - return -ENODEV; + goto err; } idxd->state = IDXD_DEV_CONF_READY; @@ -438,6 +442,13 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) idxd->hw.version); return 0; + + err: + pci_iounmap(pdev, idxd->reg_base); + err_iomap: + err_idxd_alloc: + pci_disable_device(pdev); + return rc; } static void idxd_flush_pending_llist(struct idxd_irq_entry *ie) @@ -493,6 +504,8 @@ static void idxd_shutdown(struct pci_dev *pdev) idxd_msix_perm_clear(idxd); pci_free_irq_vectors(pdev); + pci_iounmap(pdev, idxd->reg_base); + pci_disable_device(pdev); destroy_workqueue(idxd->wq); } -- Gitee From 169affe5c43b4b0acd6f5d7ca327bb47e4d1d765 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:37:27 -0700 Subject: [PATCH 0601/2161] dmaengine: idxd: use ida for device instance enumeration commit f7f7739847bd68b3c3103fd1b50d943038bd14c7 upstream. The idr is only used for an device id, never to lookup context from that id. Switch to plain ida. 
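ida_alloc()/ida_free() hand out plain integer IDs with internal locking and no id-to-pointer association, which is all the driver needs here; a sketch of the replacement pattern (the ida instance and helper names are illustrative):

/* Sketch: simple instance-id allocation with an IDA instead of an IDR.
 * ida_alloc() returns a non-negative id or a negative errno; ida_free()
 * releases it. No external mutex is needed for the allocation itself.
 */
static DEFINE_IDA(example_ida);

static int example_assign_id(void)
{
	return ida_alloc(&example_ida, GFP_KERNEL);	/* >= 0 on success */
}

static void example_release_id(int id)
{
	ida_free(&example_ida, id);
}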
Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Reported-by: Jason Gunthorpe Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161852984730.2203940.15032482460902003819.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 11a2e14b5b80..a4f0489515b4 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -34,8 +34,7 @@ MODULE_PARM_DESC(sva, "Toggle SVA support on/off"); bool support_enqcmd; -static struct idr idxd_idrs[IDXD_TYPE_MAX]; -static DEFINE_MUTEX(idxd_idr_lock); +static struct ida idxd_idas[IDXD_TYPE_MAX]; static struct pci_device_id idxd_pci_tbl[] = { /* DSA ver 1.0 platforms */ @@ -348,12 +347,10 @@ static int idxd_probe(struct idxd_device *idxd) dev_dbg(dev, "IDXD interrupt setup complete.\n"); - mutex_lock(&idxd_idr_lock); - idxd->id = idr_alloc(&idxd_idrs[idxd->type], idxd, 0, 0, GFP_KERNEL); - mutex_unlock(&idxd_idr_lock); + idxd->id = ida_alloc(&idxd_idas[idxd->type], GFP_KERNEL); if (idxd->id < 0) { rc = -ENOMEM; - goto err_idr_fail; + goto err_ida_fail; } idxd->major = idxd_cdev_get_major(idxd); @@ -361,7 +358,7 @@ static int idxd_probe(struct idxd_device *idxd) dev_dbg(dev, "IDXD device %d probed successfully\n", idxd->id); return 0; - err_idr_fail: + err_ida_fail: idxd_mask_error_interrupts(idxd); idxd_mask_msix_vectors(idxd); err_setup: @@ -518,9 +515,7 @@ static void idxd_remove(struct pci_dev *pdev) idxd_shutdown(pdev); if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); - mutex_lock(&idxd_idr_lock); - idr_remove(&idxd_idrs[idxd->type], idxd->id); - mutex_unlock(&idxd_idr_lock); + ida_free(&idxd_idas[idxd->type], idxd->id); } static struct pci_driver idxd_pci_driver = { @@ -550,7 +545,7 @@ static int __init idxd_init_module(void) support_enqcmd = true; for (i = 0; i < IDXD_TYPE_MAX; i++) - idr_init(&idxd_idrs[i]); + ida_init(&idxd_idas[i]); err = idxd_register_bus_type(); if (err < 0) -- Gitee From 5939fe71675a52682850db7a54ebe7749f260aa3 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:37:33 -0700 Subject: [PATCH 0602/2161] dmaengine: idxd: fix idxd conf_dev 'struct device' lifetime commit 47c16ac27d4cb664cee53ee0b9b7e2f907923fb3 upstream. The devm managed lifetime is incompatible with 'struct device' objects that resides in idxd context. This is one of the series that clean up the idxd driver 'struct device' lifetime. Fix idxd->conf_dev 'struct device' lifetime. Address issues flagged by CONFIG_DEBUG_KOBJECT_RELEASE. Add release functions in order to free the allocated memory at the appropriate time. 
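The pattern adopted in this and the following lifetime patches: device_register() is split into device_initialize() + device_add(), and the containing object is freed only from the device type's ->release() callback, so put_device() is safe on every error path. A hedged sketch with a hypothetical container struct (not idxd code):

/* Illustrative container with an embedded struct device; freeing
 * happens only in the release() callback reached by the final
 * put_device().
 */
struct foo {
	struct device conf_dev;
	int id;
};

static void foo_release(struct device *dev)
{
	struct foo *f = container_of(dev, struct foo, conf_dev);

	kfree(f);			/* last put_device() lands here */
}

static const struct device_type foo_type = {
	.name = "foo",
	.release = foo_release,
};

static struct foo *foo_alloc(struct device *parent)
{
	struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

	if (!f)
		return NULL;

	device_initialize(&f->conf_dev);
	f->conf_dev.parent = parent;
	f->conf_dev.type = &foo_type;
	if (dev_set_name(&f->conf_dev, "foo%d", f->id)) {
		put_device(&f->conf_dev);	/* frees f via foo_release() */
		return NULL;
	}
	return f;	/* caller later does device_add(), then device_unregister() */
}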
Reported-by: Jason Gunthorpe Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161852985319.2203940.4650791514462735368.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/idxd.h | 36 ++++++++++------ drivers/dma/idxd/init.c | 56 ++++++++++++++++-------- drivers/dma/idxd/sysfs.c | 92 ++++++++++++++-------------------------- 3 files changed, 94 insertions(+), 90 deletions(-) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 401b035e42b1..bb3a580732af 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -8,6 +8,7 @@ #include #include #include +#include #include "registers.h" #define IDXD_DRIVER_VERSION "1.00" @@ -255,6 +256,23 @@ extern struct bus_type dsa_bus_type; extern struct bus_type iax_bus_type; extern bool support_enqcmd; +extern struct device_type dsa_device_type; +extern struct device_type iax_device_type; + +static inline bool is_dsa_dev(struct device *dev) +{ + return dev->type == &dsa_device_type; +} + +static inline bool is_iax_dev(struct device *dev) +{ + return dev->type == &iax_device_type; +} + +static inline bool is_idxd_dev(struct device *dev) +{ + return is_dsa_dev(dev) || is_iax_dev(dev); +} static inline bool wq_dedicated(struct idxd_wq *wq) { @@ -292,18 +310,6 @@ static inline int idxd_get_wq_portal_full_offset(int wq_id, return ((wq_id * 4) << PAGE_SHIFT) + idxd_get_wq_portal_offset(prot); } -static inline void idxd_set_type(struct idxd_device *idxd) -{ - struct pci_dev *pdev = idxd->pdev; - - if (pdev->device == PCI_DEVICE_ID_INTEL_DSA_SPR0) - idxd->type = IDXD_TYPE_DSA; - else if (pdev->device == PCI_DEVICE_ID_INTEL_IAX_SPR0) - idxd->type = IDXD_TYPE_IAX; - else - idxd->type = IDXD_TYPE_UNKNOWN; -} - static inline void idxd_wq_get(struct idxd_wq *wq) { wq->client_count++; @@ -319,14 +325,16 @@ static inline int idxd_wq_refcount(struct idxd_wq *wq) return wq->client_count; }; +struct ida *idxd_ida(struct idxd_device *idxd); const char *idxd_get_dev_name(struct idxd_device *idxd); int idxd_register_bus_type(void); void idxd_unregister_bus_type(void); -int idxd_setup_sysfs(struct idxd_device *idxd); -void idxd_cleanup_sysfs(struct idxd_device *idxd); +int idxd_register_devices(struct idxd_device *idxd); +void idxd_unregister_devices(struct idxd_device *idxd); int idxd_register_driver(void); void idxd_unregister_driver(void); struct bus_type *idxd_get_bus_type(struct idxd_device *idxd); +struct device_type *idxd_get_device_type(struct idxd_device *idxd); /* device interrupt control */ void idxd_msix_perm_setup(struct idxd_device *idxd); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index a4f0489515b4..17d3b36610a9 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -51,6 +51,11 @@ static char *idxd_name[] = { "iax" }; +struct ida *idxd_ida(struct idxd_device *idxd) +{ + return &idxd_idas[idxd->type]; +} + const char *idxd_get_dev_name(struct idxd_device *idxd) { return idxd_name[idxd->type]; @@ -81,9 +86,8 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) * We implement 1 completion list per MSI-X entry except for * entry 0, which is for errors and others. 
*/ - idxd->irq_entries = devm_kcalloc(dev, msixcnt, - sizeof(struct idxd_irq_entry), - GFP_KERNEL); + idxd->irq_entries = kcalloc_node(msixcnt, sizeof(struct idxd_irq_entry), + GFP_KERNEL, dev_to_node(dev)); if (!idxd->irq_entries) { rc = -ENOMEM; goto err_irq_entries; @@ -262,16 +266,44 @@ static void idxd_read_caps(struct idxd_device *idxd) } } +static inline void idxd_set_type(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + + if (pdev->device == PCI_DEVICE_ID_INTEL_DSA_SPR0) + idxd->type = IDXD_TYPE_DSA; + else if (pdev->device == PCI_DEVICE_ID_INTEL_IAX_SPR0) + idxd->type = IDXD_TYPE_IAX; + else + idxd->type = IDXD_TYPE_UNKNOWN; +} + static struct idxd_device *idxd_alloc(struct pci_dev *pdev) { struct device *dev = &pdev->dev; struct idxd_device *idxd; + int rc; - idxd = devm_kzalloc(dev, sizeof(struct idxd_device), GFP_KERNEL); + idxd = kzalloc_node(sizeof(*idxd), GFP_KERNEL, dev_to_node(dev)); if (!idxd) return NULL; idxd->pdev = pdev; + idxd_set_type(idxd); + idxd->id = ida_alloc(idxd_ida(idxd), GFP_KERNEL); + if (idxd->id < 0) + return NULL; + + device_initialize(&idxd->conf_dev); + idxd->conf_dev.parent = dev; + idxd->conf_dev.bus = idxd_get_bus_type(idxd); + idxd->conf_dev.type = idxd_get_device_type(idxd); + rc = dev_set_name(&idxd->conf_dev, "%s%d", idxd_get_dev_name(idxd), idxd->id); + if (rc < 0) { + put_device(&idxd->conf_dev); + return NULL; + } + spin_lock_init(&idxd->dev_lock); return idxd; @@ -347,20 +379,11 @@ static int idxd_probe(struct idxd_device *idxd) dev_dbg(dev, "IDXD interrupt setup complete.\n"); - idxd->id = ida_alloc(&idxd_idas[idxd->type], GFP_KERNEL); - if (idxd->id < 0) { - rc = -ENOMEM; - goto err_ida_fail; - } - idxd->major = idxd_cdev_get_major(idxd); dev_dbg(dev, "IDXD device %d probed successfully\n", idxd->id); return 0; - err_ida_fail: - idxd_mask_error_interrupts(idxd); - idxd_mask_msix_vectors(idxd); err_setup: if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); @@ -412,7 +435,6 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) goto err; - idxd_set_type(idxd); idxd_type_init(idxd); @@ -427,7 +449,7 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err; } - rc = idxd_setup_sysfs(idxd); + rc = idxd_register_devices(idxd); if (rc) { dev_err(dev, "IDXD sysfs setup failed\n"); goto err; @@ -443,6 +465,7 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) err: pci_iounmap(pdev, idxd->reg_base); err_iomap: + put_device(&idxd->conf_dev); err_idxd_alloc: pci_disable_device(pdev); return rc; @@ -511,11 +534,10 @@ static void idxd_remove(struct pci_dev *pdev) struct idxd_device *idxd = pci_get_drvdata(pdev); dev_dbg(&pdev->dev, "%s called\n", __func__); - idxd_cleanup_sysfs(idxd); idxd_shutdown(pdev); if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); - ida_free(&idxd_idas[idxd->type], idxd->id); + idxd_unregister_devices(idxd); } static struct pci_driver idxd_pci_driver = { diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 39444af44dd7..87543e70a206 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -16,51 +16,26 @@ static char *idxd_wq_type_names[] = { [IDXD_WQT_USER] = "user", }; -static void idxd_conf_device_release(struct device *dev) +static void idxd_conf_sub_device_release(struct device *dev) { dev_dbg(dev, "%s for %s\n", __func__, dev_name(dev)); } static struct device_type idxd_group_device_type = { .name = "group", - .release = 
idxd_conf_device_release, + .release = idxd_conf_sub_device_release, }; static struct device_type idxd_wq_device_type = { .name = "wq", - .release = idxd_conf_device_release, + .release = idxd_conf_sub_device_release, }; static struct device_type idxd_engine_device_type = { .name = "engine", - .release = idxd_conf_device_release, + .release = idxd_conf_sub_device_release, }; -static struct device_type dsa_device_type = { - .name = "dsa", - .release = idxd_conf_device_release, -}; - -static struct device_type iax_device_type = { - .name = "iax", - .release = idxd_conf_device_release, -}; - -static inline bool is_dsa_dev(struct device *dev) -{ - return dev ? dev->type == &dsa_device_type : false; -} - -static inline bool is_iax_dev(struct device *dev) -{ - return dev ? dev->type == &iax_device_type : false; -} - -static inline bool is_idxd_dev(struct device *dev) -{ - return is_dsa_dev(dev) || is_iax_dev(dev); -} - static inline bool is_idxd_wq_dev(struct device *dev) { return dev ? dev->type == &idxd_wq_device_type : false; @@ -403,7 +378,7 @@ struct bus_type *idxd_get_bus_type(struct idxd_device *idxd) return idxd_bus_types[idxd->type]; } -static struct device_type *idxd_get_device_type(struct idxd_device *idxd) +struct device_type *idxd_get_device_type(struct idxd_device *idxd) { if (idxd->type == IDXD_TYPE_DSA) return &dsa_device_type; @@ -1642,6 +1617,30 @@ static const struct attribute_group *idxd_attribute_groups[] = { NULL, }; +static void idxd_conf_device_release(struct device *dev) +{ + struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); + + kfree(idxd->groups); + kfree(idxd->wqs); + kfree(idxd->engines); + kfree(idxd->irq_entries); + ida_free(idxd_ida(idxd), idxd->id); + kfree(idxd); +} + +struct device_type dsa_device_type = { + .name = "dsa", + .release = idxd_conf_device_release, + .groups = idxd_attribute_groups, +}; + +struct device_type iax_device_type = { + .name = "iax", + .release = idxd_conf_device_release, + .groups = idxd_attribute_groups, +}; + static int idxd_setup_engine_sysfs(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; @@ -1743,39 +1742,14 @@ static int idxd_setup_wq_sysfs(struct idxd_device *idxd) return rc; } -static int idxd_setup_device_sysfs(struct idxd_device *idxd) +int idxd_register_devices(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; int rc; - char devname[IDXD_NAME_SIZE]; - sprintf(devname, "%s%d", idxd_get_dev_name(idxd), idxd->id); - idxd->conf_dev.parent = dev; - dev_set_name(&idxd->conf_dev, "%s", devname); - idxd->conf_dev.bus = idxd_get_bus_type(idxd); - idxd->conf_dev.groups = idxd_attribute_groups; - idxd->conf_dev.type = idxd_get_device_type(idxd); - - dev_dbg(dev, "IDXD device register: %s\n", dev_name(&idxd->conf_dev)); - rc = device_register(&idxd->conf_dev); - if (rc < 0) { - put_device(&idxd->conf_dev); - return rc; - } - - return 0; -} - -int idxd_setup_sysfs(struct idxd_device *idxd) -{ - struct device *dev = &idxd->pdev->dev; - int rc; - - rc = idxd_setup_device_sysfs(idxd); - if (rc < 0) { - dev_dbg(dev, "Device sysfs registering failed: %d\n", rc); + rc = device_add(&idxd->conf_dev); + if (rc < 0) return rc; - } rc = idxd_setup_wq_sysfs(idxd); if (rc < 0) { @@ -1801,7 +1775,7 @@ int idxd_setup_sysfs(struct idxd_device *idxd) return 0; } -void idxd_cleanup_sysfs(struct idxd_device *idxd) +void idxd_unregister_devices(struct idxd_device *idxd) { int i; -- Gitee From f023fd85106ac55fdeb20afe40be59c8dddc5dbf Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 9 Dec 2021 
15:10:53 +0800 Subject: [PATCH 0603/2161] Fix Conflicts for drivers/dma/idxd/sysfs.c commit opencloudos. Port from 0b5ad7b9522e6172342511fac6114fd8b7eb622a Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 87543e70a206..36193e555e36 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -311,7 +311,9 @@ static int idxd_config_bus_remove(struct device *dev) for (i = 0; i < idxd->max_wqs; i++) { struct idxd_wq *wq = &idxd->wqs[i]; + mutex_lock(&wq->wq_lock); idxd_wq_disable_cleanup(wq); + mutex_unlock(&wq->wq_lock); } module_put(THIS_MODULE); if (rc < 0) -- Gitee From d30ae5f8a5eb629d372ca092bc3a3ec99b30d5ce Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:37:39 -0700 Subject: [PATCH 0604/2161] dmaengine: idxd: fix wq conf_dev 'struct device' lifetime commit 7c5dd23e57c14cf7177b8a5e0fd08916e0c60005 upstream. Remove devm_* allocation and fix wq->conf_dev 'struct device' lifetime. Address issues flagged by CONFIG_DEBUG_KOBJECT_RELEASE. Add release functions in order to free the allocated memory for the wq context at device destruction time. Reported-by: Jason Gunthorpe Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161852985907.2203940.6840120734115043753.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 6 +-- drivers/dma/idxd/idxd.h | 20 +++++++- drivers/dma/idxd/init.c | 105 ++++++++++++++++++++++++++++---------- drivers/dma/idxd/irq.c | 6 +-- drivers/dma/idxd/sysfs.c | 100 ++++++++++++++++-------------------- 5 files changed, 146 insertions(+), 91 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 3f696abd74ac..c4183294a704 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -520,7 +520,7 @@ void idxd_device_wqs_clear_state(struct idxd_device *idxd) lockdep_assert_held(&idxd->dev_lock); for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; if (wq->state == IDXD_WQ_ENABLED) { idxd_wq_disable_cleanup(wq); @@ -738,7 +738,7 @@ static int idxd_wqs_config_write(struct idxd_device *idxd) int i, rc; for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; rc = idxd_wq_config_write(wq); if (rc < 0) @@ -816,7 +816,7 @@ static int idxd_wqs_setup(struct idxd_device *idxd) } for (i = 0; i < idxd->max_wqs; i++) { - wq = &idxd->wqs[i]; + wq = idxd->wqs[i]; group = wq->group; if (!wq->group) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index bb3a580732af..6cade6a05314 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -194,7 +194,7 @@ struct idxd_device { spinlock_t dev_lock; /* spinlock for device */ struct completion *cmd_done; struct idxd_group *groups; - struct idxd_wq *wqs; + struct idxd_wq **wqs; struct idxd_engine *engines; struct iommu_sva *sva; @@ -258,6 +258,7 @@ extern struct bus_type iax_bus_type; extern bool support_enqcmd; extern struct device_type dsa_device_type; extern struct device_type iax_device_type; +extern struct device_type idxd_wq_device_type; static inline bool is_dsa_dev(struct device *dev) { @@ -274,6 +275,23 @@ static inline bool is_idxd_dev(struct device *dev) return is_dsa_dev(dev) || is_iax_dev(dev); } +static 
inline bool is_idxd_wq_dev(struct device *dev) +{ + return dev->type == &idxd_wq_device_type; +} + +static inline bool is_idxd_wq_dmaengine(struct idxd_wq *wq) +{ + if (wq->type == IDXD_WQT_KERNEL && strcmp(wq->name, "dmaengine") == 0) + return true; + return false; +} + +static inline bool is_idxd_wq_cdev(struct idxd_wq *wq) +{ + return wq->type == IDXD_WQT_USER; +} + static inline bool wq_dedicated(struct idxd_wq *wq) { return test_bit(WQ_FLAG_DEDICATED, &wq->flags); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 17d3b36610a9..a2dca27aebc3 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -145,16 +145,74 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) return rc; } +static int idxd_setup_wqs(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + struct idxd_wq *wq; + int i, rc; + + idxd->wqs = kcalloc_node(idxd->max_wqs, sizeof(struct idxd_wq *), + GFP_KERNEL, dev_to_node(dev)); + if (!idxd->wqs) + return -ENOMEM; + + for (i = 0; i < idxd->max_wqs; i++) { + wq = kzalloc_node(sizeof(*wq), GFP_KERNEL, dev_to_node(dev)); + if (!wq) { + rc = -ENOMEM; + goto err; + } + + wq->id = i; + wq->idxd = idxd; + device_initialize(&wq->conf_dev); + wq->conf_dev.parent = &idxd->conf_dev; + wq->conf_dev.bus = idxd_get_bus_type(idxd); + wq->conf_dev.type = &idxd_wq_device_type; + rc = dev_set_name(&wq->conf_dev, "wq%d.%d", idxd->id, wq->id); + if (rc < 0) { + put_device(&wq->conf_dev); + goto err; + } + + mutex_init(&wq->wq_lock); + wq->idxd_cdev.minor = -1; + wq->max_xfer_bytes = idxd->max_xfer_bytes; + wq->max_batch_size = idxd->max_batch_size; + wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev)); + if (!wq->wqcfg) { + put_device(&wq->conf_dev); + rc = -ENOMEM; + goto err; + } + idxd->wqs[i] = wq; + } + + return 0; + + err: + while (--i >= 0) + put_device(&idxd->wqs[i]->conf_dev); + return rc; +} + static int idxd_setup_internals(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; - int i; + int i, rc; init_waitqueue_head(&idxd->cmd_waitq); + + rc = idxd_setup_wqs(idxd); + if (rc < 0) + return rc; + idxd->groups = devm_kcalloc(dev, idxd->max_groups, sizeof(struct idxd_group), GFP_KERNEL); - if (!idxd->groups) - return -ENOMEM; + if (!idxd->groups) { + rc = -ENOMEM; + goto err; + } for (i = 0; i < idxd->max_groups; i++) { idxd->groups[i].idxd = idxd; @@ -163,40 +221,31 @@ static int idxd_setup_internals(struct idxd_device *idxd) idxd->groups[i].tc_b = -1; } - idxd->wqs = devm_kcalloc(dev, idxd->max_wqs, sizeof(struct idxd_wq), - GFP_KERNEL); - if (!idxd->wqs) - return -ENOMEM; - idxd->engines = devm_kcalloc(dev, idxd->max_engines, sizeof(struct idxd_engine), GFP_KERNEL); - if (!idxd->engines) - return -ENOMEM; - - for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; - - wq->id = i; - wq->idxd = idxd; - mutex_init(&wq->wq_lock); - wq->idxd_cdev.minor = -1; - wq->max_xfer_bytes = idxd->max_xfer_bytes; - wq->max_batch_size = idxd->max_batch_size; - wq->wqcfg = devm_kzalloc(dev, idxd->wqcfg_size, GFP_KERNEL); - if (!wq->wqcfg) - return -ENOMEM; + if (!idxd->engines) { + rc = -ENOMEM; + goto err; } + for (i = 0; i < idxd->max_engines; i++) { idxd->engines[i].idxd = idxd; idxd->engines[i].id = i; } idxd->wq = create_workqueue(dev_name(dev)); - if (!idxd->wq) - return -ENOMEM; + if (!idxd->wq) { + rc = -ENOMEM; + goto err; + } return 0; + + err: + for (i = 0; i < idxd->max_wqs; i++) + put_device(&idxd->wqs[i]->conf_dev); + return rc; } static void idxd_read_table_offsets(struct 
idxd_device *idxd) @@ -371,11 +420,11 @@ static int idxd_probe(struct idxd_device *idxd) rc = idxd_setup_internals(idxd); if (rc) - goto err_setup; + goto err; rc = idxd_setup_interrupts(idxd); if (rc) - goto err_setup; + goto err; dev_dbg(dev, "IDXD interrupt setup complete.\n"); @@ -384,7 +433,7 @@ static int idxd_probe(struct idxd_device *idxd) dev_dbg(dev, "IDXD device %d probed successfully\n", idxd->id); return 0; - err_setup: + err: if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); return rc; diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index f1463fc58112..7b0181532f77 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -45,7 +45,7 @@ static void idxd_device_reinit(struct work_struct *work) goto out; for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; if (wq->state == IDXD_WQ_ENABLED) { rc = idxd_wq_enable(wq); @@ -130,7 +130,7 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) if (idxd->sw_err.valid && idxd->sw_err.wq_idx_valid) { int id = idxd->sw_err.wq_idx; - struct idxd_wq *wq = &idxd->wqs[id]; + struct idxd_wq *wq = idxd->wqs[id]; if (wq->type == IDXD_WQT_USER) wake_up_interruptible(&wq->idxd_cdev.err_queue); @@ -138,7 +138,7 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) int i; for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; if (wq->type == IDXD_WQT_USER) wake_up_interruptible(&wq->idxd_cdev.err_queue); diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 36193e555e36..409b3ce52f07 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -26,34 +26,11 @@ static struct device_type idxd_group_device_type = { .release = idxd_conf_sub_device_release, }; -static struct device_type idxd_wq_device_type = { - .name = "wq", - .release = idxd_conf_sub_device_release, -}; - static struct device_type idxd_engine_device_type = { .name = "engine", .release = idxd_conf_sub_device_release, }; -static inline bool is_idxd_wq_dev(struct device *dev) -{ - return dev ? 
dev->type == &idxd_wq_device_type : false; -} - -static inline bool is_idxd_wq_dmaengine(struct idxd_wq *wq) -{ - if (wq->type == IDXD_WQT_KERNEL && - strcmp(wq->name, "dmaengine") == 0) - return true; - return false; -} - -static inline bool is_idxd_wq_cdev(struct idxd_wq *wq) -{ - return wq->type == IDXD_WQT_USER; -} - static int idxd_config_bus_match(struct device *dev, struct device_driver *drv) { @@ -297,7 +274,7 @@ static int idxd_config_bus_remove(struct device *dev) dev_dbg(dev, "%s removing dev %s\n", __func__, dev_name(&idxd->conf_dev)); for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; if (wq->state == IDXD_WQ_DISABLED) continue; @@ -309,7 +286,7 @@ static int idxd_config_bus_remove(struct device *dev) idxd_unregister_dma_device(idxd); rc = idxd_device_disable(idxd); for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; mutex_lock(&wq->wq_lock); idxd_wq_disable_cleanup(wq); @@ -678,7 +655,7 @@ static ssize_t group_work_queues_show(struct device *dev, struct idxd_device *idxd = group->idxd; for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; if (!wq->group) continue; @@ -935,7 +912,7 @@ static int total_claimed_wq_size(struct idxd_device *idxd) int wq_size = 0; for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; wq_size += wq->size; } @@ -1331,6 +1308,20 @@ static const struct attribute_group *idxd_wq_attribute_groups[] = { NULL, }; +static void idxd_conf_wq_release(struct device *dev) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + kfree(wq->wqcfg); + kfree(wq); +} + +struct device_type idxd_wq_device_type = { + .name = "wq", + .release = idxd_conf_wq_release, + .groups = idxd_wq_attribute_groups, +}; + /* IDXD device attribs */ static ssize_t version_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1461,7 +1452,7 @@ static ssize_t clients_show(struct device *dev, spin_lock_irqsave(&idxd->dev_lock, flags); for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; count += wq->client_count; } @@ -1711,70 +1702,67 @@ static int idxd_setup_group_sysfs(struct idxd_device *idxd) return rc; } -static int idxd_setup_wq_sysfs(struct idxd_device *idxd) +static int idxd_register_wq_devices(struct idxd_device *idxd) { - struct device *dev = &idxd->pdev->dev; - int i, rc; + int i, rc, j; for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; - - wq->conf_dev.parent = &idxd->conf_dev; - dev_set_name(&wq->conf_dev, "wq%d.%d", idxd->id, wq->id); - wq->conf_dev.bus = idxd_get_bus_type(idxd); - wq->conf_dev.groups = idxd_wq_attribute_groups; - wq->conf_dev.type = &idxd_wq_device_type; - dev_dbg(dev, "WQ device register: %s\n", - dev_name(&wq->conf_dev)); - rc = device_register(&wq->conf_dev); - if (rc < 0) { - put_device(&wq->conf_dev); + struct idxd_wq *wq = idxd->wqs[i]; + + rc = device_add(&wq->conf_dev); + if (rc < 0) goto cleanup; - } } return 0; cleanup: - while (i--) { - struct idxd_wq *wq = &idxd->wqs[i]; + j = i - 1; + for (; i < idxd->max_wqs; i++) + put_device(&idxd->wqs[i]->conf_dev); - device_unregister(&wq->conf_dev); - } + while (j--) + device_unregister(&idxd->wqs[j]->conf_dev); return rc; } int idxd_register_devices(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; - int rc; + int rc, i; rc = 
device_add(&idxd->conf_dev); if (rc < 0) return rc; - rc = idxd_setup_wq_sysfs(idxd); + rc = idxd_register_wq_devices(idxd); if (rc < 0) { - /* unregister conf dev */ - dev_dbg(dev, "Work Queue sysfs registering failed: %d\n", rc); - return rc; + dev_dbg(dev, "WQ devices registering failed: %d\n", rc); + goto err_wq; } rc = idxd_setup_group_sysfs(idxd); if (rc < 0) { /* unregister conf dev */ dev_dbg(dev, "Group sysfs registering failed: %d\n", rc); - return rc; + goto err; } rc = idxd_setup_engine_sysfs(idxd); if (rc < 0) { /* unregister conf dev */ dev_dbg(dev, "Engine sysfs registering failed: %d\n", rc); - return rc; + goto err; } return 0; + + err: + for (i = 0; i < idxd->max_wqs; i++) + device_unregister(&idxd->wqs[i]->conf_dev); + err_wq: + device_del(&idxd->conf_dev); + return rc; } void idxd_unregister_devices(struct idxd_device *idxd) @@ -1782,7 +1770,7 @@ void idxd_unregister_devices(struct idxd_device *idxd) int i; for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; device_unregister(&wq->conf_dev); } -- Gitee From 32b165bc09337a5772bec4792e5a25c819ee26c0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:37:44 -0700 Subject: [PATCH 0605/2161] dmaengine: idxd: fix engine conf_dev lifetime commit 75b911309060f42ba94bbbf46f5f497d35d5cd02 upstream. Remove devm_* allocation and fix engine->conf_dev 'struct device' lifetime. Address issues flagged by CONFIG_DEBUG_KOBJECT_RELEASE. Add release functions in order to free the allocated memory at the engine conf_dev destruction time. Reported-by: Jason Gunthorpe Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161852986460.2203940.16603218225412118431.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 2 +- drivers/dma/idxd/idxd.h | 3 +- drivers/dma/idxd/init.c | 60 +++++++++++++++++++++++++------- drivers/dma/idxd/sysfs.c | 72 +++++++++++++++++++-------------------- 4 files changed, 86 insertions(+), 51 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index c4183294a704..be1dcddfe3c4 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -786,7 +786,7 @@ static int idxd_engines_setup(struct idxd_device *idxd) } for (i = 0; i < idxd->max_engines; i++) { - eng = &idxd->engines[i]; + eng = idxd->engines[i]; group = eng->group; if (!group) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 6cade6a05314..b9b7e8e8c384 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -195,7 +195,7 @@ struct idxd_device { struct completion *cmd_done; struct idxd_group *groups; struct idxd_wq **wqs; - struct idxd_engine *engines; + struct idxd_engine **engines; struct iommu_sva *sva; unsigned int pasid; @@ -259,6 +259,7 @@ extern bool support_enqcmd; extern struct device_type dsa_device_type; extern struct device_type iax_device_type; extern struct device_type idxd_wq_device_type; +extern struct device_type idxd_engine_device_type; static inline bool is_dsa_dev(struct device *dev) { diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index a2dca27aebc3..b90ef2f519eb 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -196,6 +196,46 @@ static int idxd_setup_wqs(struct idxd_device *idxd) return rc; } +static int idxd_setup_engines(struct idxd_device *idxd) +{ + struct idxd_engine *engine; + 
struct device *dev = &idxd->pdev->dev; + int i, rc; + + idxd->engines = kcalloc_node(idxd->max_engines, sizeof(struct idxd_engine *), + GFP_KERNEL, dev_to_node(dev)); + if (!idxd->engines) + return -ENOMEM; + + for (i = 0; i < idxd->max_engines; i++) { + engine = kzalloc_node(sizeof(*engine), GFP_KERNEL, dev_to_node(dev)); + if (!engine) { + rc = -ENOMEM; + goto err; + } + + engine->id = i; + engine->idxd = idxd; + device_initialize(&engine->conf_dev); + engine->conf_dev.parent = &idxd->conf_dev; + engine->conf_dev.type = &idxd_engine_device_type; + rc = dev_set_name(&engine->conf_dev, "engine%d.%d", idxd->id, engine->id); + if (rc < 0) { + put_device(&engine->conf_dev); + goto err; + } + + idxd->engines[i] = engine; + } + + return 0; + + err: + while (--i >= 0) + put_device(&idxd->engines[i]->conf_dev); + return rc; +} + static int idxd_setup_internals(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; @@ -207,6 +247,10 @@ static int idxd_setup_internals(struct idxd_device *idxd) if (rc < 0) return rc; + rc = idxd_setup_engines(idxd); + if (rc < 0) + goto err_engine; + idxd->groups = devm_kcalloc(dev, idxd->max_groups, sizeof(struct idxd_group), GFP_KERNEL); if (!idxd->groups) { @@ -221,19 +265,6 @@ static int idxd_setup_internals(struct idxd_device *idxd) idxd->groups[i].tc_b = -1; } - idxd->engines = devm_kcalloc(dev, idxd->max_engines, - sizeof(struct idxd_engine), GFP_KERNEL); - if (!idxd->engines) { - rc = -ENOMEM; - goto err; - } - - - for (i = 0; i < idxd->max_engines; i++) { - idxd->engines[i].idxd = idxd; - idxd->engines[i].id = i; - } - idxd->wq = create_workqueue(dev_name(dev)); if (!idxd->wq) { rc = -ENOMEM; @@ -243,6 +274,9 @@ static int idxd_setup_internals(struct idxd_device *idxd) return 0; err: + for (i = 0; i < idxd->max_engines; i++) + put_device(&idxd->engines[i]->conf_dev); + err_engine: for (i = 0; i < idxd->max_wqs; i++) put_device(&idxd->wqs[i]->conf_dev); return rc; diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 409b3ce52f07..ab02e3b4d75d 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -26,11 +26,6 @@ static struct device_type idxd_group_device_type = { .release = idxd_conf_sub_device_release, }; -static struct device_type idxd_engine_device_type = { - .name = "engine", - .release = idxd_conf_sub_device_release, -}; - static int idxd_config_bus_match(struct device *dev, struct device_driver *drv) { @@ -464,6 +459,19 @@ static const struct attribute_group *idxd_engine_attribute_groups[] = { NULL, }; +static void idxd_conf_engine_release(struct device *dev) +{ + struct idxd_engine *engine = container_of(dev, struct idxd_engine, conf_dev); + + kfree(engine); +} + +struct device_type idxd_engine_device_type = { + .name = "engine", + .release = idxd_conf_engine_release, + .groups = idxd_engine_attribute_groups, +}; + /* Group attributes */ static void idxd_set_free_tokens(struct idxd_device *idxd) @@ -626,7 +634,7 @@ static ssize_t group_engines_show(struct device *dev, struct idxd_device *idxd = group->idxd; for (i = 0; i < idxd->max_engines; i++) { - struct idxd_engine *engine = &idxd->engines[i]; + struct idxd_engine *engine = idxd->engines[i]; if (!engine->group) continue; @@ -1634,37 +1642,27 @@ struct device_type iax_device_type = { .groups = idxd_attribute_groups, }; -static int idxd_setup_engine_sysfs(struct idxd_device *idxd) +static int idxd_register_engine_devices(struct idxd_device *idxd) { - struct device *dev = &idxd->pdev->dev; - int i, rc; + int i, j, rc; for (i = 0; i < idxd->max_engines; 
i++) { - struct idxd_engine *engine = &idxd->engines[i]; - - engine->conf_dev.parent = &idxd->conf_dev; - dev_set_name(&engine->conf_dev, "engine%d.%d", - idxd->id, engine->id); - engine->conf_dev.bus = idxd_get_bus_type(idxd); - engine->conf_dev.groups = idxd_engine_attribute_groups; - engine->conf_dev.type = &idxd_engine_device_type; - dev_dbg(dev, "Engine device register: %s\n", - dev_name(&engine->conf_dev)); - rc = device_register(&engine->conf_dev); - if (rc < 0) { - put_device(&engine->conf_dev); + struct idxd_engine *engine = idxd->engines[i]; + + rc = device_add(&engine->conf_dev); + if (rc < 0) goto cleanup; - } } return 0; cleanup: - while (i--) { - struct idxd_engine *engine = &idxd->engines[i]; + j = i - 1; + for (; i < idxd->max_engines; i++) + put_device(&idxd->engines[i]->conf_dev); - device_unregister(&engine->conf_dev); - } + while (j--) + device_unregister(&idxd->engines[j]->conf_dev); return rc; } @@ -1741,23 +1739,25 @@ int idxd_register_devices(struct idxd_device *idxd) goto err_wq; } - rc = idxd_setup_group_sysfs(idxd); + rc = idxd_register_engine_devices(idxd); if (rc < 0) { - /* unregister conf dev */ - dev_dbg(dev, "Group sysfs registering failed: %d\n", rc); - goto err; + dev_dbg(dev, "Engine devices registering failed: %d\n", rc); + goto err_engine; } - rc = idxd_setup_engine_sysfs(idxd); + rc = idxd_setup_group_sysfs(idxd); if (rc < 0) { /* unregister conf dev */ - dev_dbg(dev, "Engine sysfs registering failed: %d\n", rc); - goto err; + dev_dbg(dev, "Group sysfs registering failed: %d\n", rc); + goto err_group; } return 0; - err: + err_group: + for (i = 0; i < idxd->max_engines; i++) + device_unregister(&idxd->engines[i]->conf_dev); + err_engine: for (i = 0; i < idxd->max_wqs; i++) device_unregister(&idxd->wqs[i]->conf_dev); err_wq: @@ -1776,7 +1776,7 @@ void idxd_unregister_devices(struct idxd_device *idxd) } for (i = 0; i < idxd->max_engines; i++) { - struct idxd_engine *engine = &idxd->engines[i]; + struct idxd_engine *engine = idxd->engines[i]; device_unregister(&engine->conf_dev); } -- Gitee From c570f253c534469e56afd9d09a23449c184b58b1 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:37:51 -0700 Subject: [PATCH 0606/2161] dmaengine: idxd: fix group conf_dev lifetime commit defe49f96012ca91e8e673cb95b5c30b4a3735e8 upstream. Remove devm_* allocation and fix group->conf_dev 'struct device' lifetime. Address issues flagged by CONFIG_DEBUG_KOBJECT_RELEASE. Add release functions in order to free the allocated memory at the group->conf_dev destruction time. 
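For reference, the lifetime pattern this patch applies to groups (and that the neighboring patches apply to wqs and engines) boils down to the sketch below. It is only an illustration: "foo" and foo_device_type are placeholder names, not structures in the driver; the real hunks follow.

  #include <linux/device.h>
  #include <linux/slab.h>

  struct foo {
          struct device conf_dev;         /* embedded, refcounted device */
          /* ... driver-specific fields ... */
  };

  static void foo_conf_release(struct device *dev)
  {
          struct foo *foo = container_of(dev, struct foo, conf_dev);

          kfree(foo);                     /* runs only at the final put_device() */
  }

  static struct device_type foo_device_type = {
          .name    = "foo",
          .release = foo_conf_release,    /* replaces the old devm_* cleanup */
  };

  static struct foo *foo_alloc(struct device *parent, int id)
  {
          struct foo *foo = kzalloc(sizeof(*foo), GFP_KERNEL);

          if (!foo)
                  return NULL;

          device_initialize(&foo->conf_dev);      /* refcount is live from here */
          foo->conf_dev.parent = parent;
          foo->conf_dev.type = &foo_device_type;  /* ->release set before naming */
          if (dev_set_name(&foo->conf_dev, "foo%d", id)) {
                  put_device(&foo->conf_dev);     /* ->release kfree()s foo */
                  return NULL;
          }
          return foo;
  }

device_add() happens later at sysfs registration time; after a successful device_add() the object is torn down with device_unregister() (device_del() plus the final put_device()), while before that point a bare put_device() is enough.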
Reported-by: Jason Gunthorpe Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161852987144.2203940.8830315575880047.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 8 ++--- drivers/dma/idxd/idxd.h | 3 +- drivers/dma/idxd/init.c | 68 ++++++++++++++++++++++++++++++--------- drivers/dma/idxd/sysfs.c | 68 +++++++++++++++++---------------------- 4 files changed, 88 insertions(+), 59 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index be1dcddfe3c4..4fef57717049 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -659,7 +659,7 @@ static int idxd_groups_config_write(struct idxd_device *idxd) ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET)); for (i = 0; i < idxd->max_groups; i++) { - struct idxd_group *group = &idxd->groups[i]; + struct idxd_group *group = idxd->groups[i]; idxd_group_config_write(group); } @@ -754,7 +754,7 @@ static void idxd_group_flags_setup(struct idxd_device *idxd) /* TC-A 0 and TC-B 1 should be defaults */ for (i = 0; i < idxd->max_groups; i++) { - struct idxd_group *group = &idxd->groups[i]; + struct idxd_group *group = idxd->groups[i]; if (group->tc_a == -1) group->tc_a = group->grpcfg.flags.tc_a = 0; @@ -781,7 +781,7 @@ static int idxd_engines_setup(struct idxd_device *idxd) struct idxd_group *group; for (i = 0; i < idxd->max_groups; i++) { - group = &idxd->groups[i]; + group = idxd->groups[i]; group->grpcfg.engines = 0; } @@ -810,7 +810,7 @@ static int idxd_wqs_setup(struct idxd_device *idxd) struct device *dev = &idxd->pdev->dev; for (i = 0; i < idxd->max_groups; i++) { - group = &idxd->groups[i]; + group = idxd->groups[i]; for (j = 0; j < 4; j++) group->grpcfg.wqs[j] = 0; } diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index b9b7e8e8c384..3c4ce7997c88 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -193,7 +193,7 @@ struct idxd_device { spinlock_t dev_lock; /* spinlock for device */ struct completion *cmd_done; - struct idxd_group *groups; + struct idxd_group **groups; struct idxd_wq **wqs; struct idxd_engine **engines; @@ -260,6 +260,7 @@ extern struct device_type dsa_device_type; extern struct device_type iax_device_type; extern struct device_type idxd_wq_device_type; extern struct device_type idxd_engine_device_type; +extern struct device_type idxd_group_device_type; static inline bool is_dsa_dev(struct device *dev) { diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index b90ef2f519eb..c20ea6bf09bf 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -236,11 +236,54 @@ static int idxd_setup_engines(struct idxd_device *idxd) return rc; } -static int idxd_setup_internals(struct idxd_device *idxd) +static int idxd_setup_groups(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; + struct idxd_group *group; int i, rc; + idxd->groups = kcalloc_node(idxd->max_groups, sizeof(struct idxd_group *), + GFP_KERNEL, dev_to_node(dev)); + if (!idxd->groups) + return -ENOMEM; + + for (i = 0; i < idxd->max_groups; i++) { + group = kzalloc_node(sizeof(*group), GFP_KERNEL, dev_to_node(dev)); + if (!group) { + rc = -ENOMEM; + goto err; + } + + group->id = i; + group->idxd = idxd; + device_initialize(&group->conf_dev); + group->conf_dev.parent = &idxd->conf_dev; + group->conf_dev.bus = idxd_get_bus_type(idxd); + group->conf_dev.type = &idxd_group_device_type; + rc = 
dev_set_name(&group->conf_dev, "group%d.%d", idxd->id, group->id); + if (rc < 0) { + put_device(&group->conf_dev); + goto err; + } + + idxd->groups[i] = group; + group->tc_a = -1; + group->tc_b = -1; + } + + return 0; + + err: + while (--i >= 0) + put_device(&idxd->groups[i]->conf_dev); + return rc; +} + +static int idxd_setup_internals(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + int rc, i; + init_waitqueue_head(&idxd->cmd_waitq); rc = idxd_setup_wqs(idxd); @@ -251,29 +294,22 @@ static int idxd_setup_internals(struct idxd_device *idxd) if (rc < 0) goto err_engine; - idxd->groups = devm_kcalloc(dev, idxd->max_groups, - sizeof(struct idxd_group), GFP_KERNEL); - if (!idxd->groups) { - rc = -ENOMEM; - goto err; - } - - for (i = 0; i < idxd->max_groups; i++) { - idxd->groups[i].idxd = idxd; - idxd->groups[i].id = i; - idxd->groups[i].tc_a = -1; - idxd->groups[i].tc_b = -1; - } + rc = idxd_setup_groups(idxd); + if (rc < 0) + goto err_group; idxd->wq = create_workqueue(dev_name(dev)); if (!idxd->wq) { rc = -ENOMEM; - goto err; + goto err_wkq_create; } return 0; - err: + err_wkq_create: + for (i = 0; i < idxd->max_groups; i++) + put_device(&idxd->groups[i]->conf_dev); + err_group: for (i = 0; i < idxd->max_engines; i++) put_device(&idxd->engines[i]->conf_dev); err_engine: diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index ab02e3b4d75d..f793688039c9 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -16,16 +16,6 @@ static char *idxd_wq_type_names[] = { [IDXD_WQT_USER] = "user", }; -static void idxd_conf_sub_device_release(struct device *dev) -{ - dev_dbg(dev, "%s for %s\n", __func__, dev_name(dev)); -} - -static struct device_type idxd_group_device_type = { - .name = "group", - .release = idxd_conf_sub_device_release, -}; - static int idxd_config_bus_match(struct device *dev, struct device_driver *drv) { @@ -435,7 +425,7 @@ static ssize_t engine_group_id_store(struct device *dev, if (prevg) prevg->num_engines--; - engine->group = &idxd->groups[id]; + engine->group = idxd->groups[id]; engine->group->num_engines++; return count; @@ -479,7 +469,7 @@ static void idxd_set_free_tokens(struct idxd_device *idxd) int i, tokens; for (i = 0, tokens = 0; i < idxd->max_groups; i++) { - struct idxd_group *g = &idxd->groups[i]; + struct idxd_group *g = idxd->groups[i]; tokens += g->tokens_reserved; } @@ -784,6 +774,19 @@ static const struct attribute_group *idxd_group_attribute_groups[] = { NULL, }; +static void idxd_conf_group_release(struct device *dev) +{ + struct idxd_group *group = container_of(dev, struct idxd_group, conf_dev); + + kfree(group); +} + +struct device_type idxd_group_device_type = { + .name = "group", + .release = idxd_conf_group_release, + .groups = idxd_group_attribute_groups, +}; + /* IDXD work queue attribs */ static ssize_t wq_clients_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -856,7 +859,7 @@ static ssize_t wq_group_id_store(struct device *dev, return count; } - group = &idxd->groups[id]; + group = idxd->groups[id]; prevg = wq->group; if (prevg) @@ -1666,37 +1669,27 @@ static int idxd_register_engine_devices(struct idxd_device *idxd) return rc; } -static int idxd_setup_group_sysfs(struct idxd_device *idxd) +static int idxd_register_group_devices(struct idxd_device *idxd) { - struct device *dev = &idxd->pdev->dev; - int i, rc; + int i, j, rc; for (i = 0; i < idxd->max_groups; i++) { - struct idxd_group *group = &idxd->groups[i]; - - group->conf_dev.parent = &idxd->conf_dev; - 
dev_set_name(&group->conf_dev, "group%d.%d", - idxd->id, group->id); - group->conf_dev.bus = idxd_get_bus_type(idxd); - group->conf_dev.groups = idxd_group_attribute_groups; - group->conf_dev.type = &idxd_group_device_type; - dev_dbg(dev, "Group device register: %s\n", - dev_name(&group->conf_dev)); - rc = device_register(&group->conf_dev); - if (rc < 0) { - put_device(&group->conf_dev); + struct idxd_group *group = idxd->groups[i]; + + rc = device_add(&group->conf_dev); + if (rc < 0) goto cleanup; - } } return 0; cleanup: - while (i--) { - struct idxd_group *group = &idxd->groups[i]; + j = i - 1; + for (; i < idxd->max_groups; i++) + put_device(&idxd->groups[i]->conf_dev); - device_unregister(&group->conf_dev); - } + while (j--) + device_unregister(&idxd->groups[j]->conf_dev); return rc; } @@ -1745,10 +1738,9 @@ int idxd_register_devices(struct idxd_device *idxd) goto err_engine; } - rc = idxd_setup_group_sysfs(idxd); + rc = idxd_register_group_devices(idxd); if (rc < 0) { - /* unregister conf dev */ - dev_dbg(dev, "Group sysfs registering failed: %d\n", rc); + dev_dbg(dev, "Group device registering failed: %d\n", rc); goto err_group; } @@ -1782,7 +1774,7 @@ void idxd_unregister_devices(struct idxd_device *idxd) } for (i = 0; i < idxd->max_groups; i++) { - struct idxd_group *group = &idxd->groups[i]; + struct idxd_group *group = idxd->groups[i]; device_unregister(&group->conf_dev); } -- Gitee From b68a296414d819b418e028b72e0c3f10b560d895 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:37:57 -0700 Subject: [PATCH 0607/2161] dmaengine: idxd: fix cdev setup and free device lifetime issues commit 04922b7445a1950b86f130a1fe8c52cc27b3e30b upstream. The char device setup and cleanup has device lifetime issues regarding when parts are initialized and cleaned up. The initialization of struct device is done incorrectly. device_initialize() needs to be called on the 'struct device' and then additional changes can be added. The ->release() function needs to be setup via device_type before dev_set_name() to allow proper cleanup. The change re-parents the cdev under the wq->conf_dev to get natural reference inheritance. No known dependency on the old device path exists. 
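The ordering that matters here is easiest to see in a condensed sketch; my_cdev, my_cdev_device_type and the name format are placeholders rather than the driver's actual identifiers, and the real hunks follow.

  #include <linux/cdev.h>
  #include <linux/device.h>
  #include <linux/slab.h>

  struct my_cdev {
          struct cdev cdev;
          struct device dev;              /* embedded device owns the lifetime */
  };

  static void my_cdev_release(struct device *dev)
  {
          kfree(container_of(dev, struct my_cdev, dev));
  }

  static struct device_type my_cdev_device_type = {
          .name    = "my_cdev",
          .release = my_cdev_release,
  };

  static int my_cdev_add(struct my_cdev *mc, struct device *parent, dev_t devt,
                         const struct file_operations *fops)
  {
          int rc;

          device_initialize(&mc->dev);            /* 1. init the refcount first */
          mc->dev.parent = parent;                /* 2. re-parent under the wq dev */
          mc->dev.type = &my_cdev_device_type;    /* 3. ->release before naming */
          mc->dev.devt = devt;
          rc = dev_set_name(&mc->dev, "my_cdev%d", MINOR(devt));
          if (rc)
                  goto err;

          cdev_init(&mc->cdev, fops);
          rc = cdev_device_add(&mc->cdev, &mc->dev); /* pairs with cdev_device_del() */
          if (rc)
                  goto err;
          return 0;
  err:
          put_device(&mc->dev);                   /* ->release frees mc */
          return rc;
  }

Because dev_set_name() and cdev_device_add() can fail, every error path has to end in put_device(), which is why the ->release() callback must already be in place before the name is set.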
Reported-by: Jason Gunthorpe Fixes: 42d279f9137a ("dmaengine: idxd: add char driver to expose submission portal to userland") Signed-off-by: Dave Jiang Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/161852987721.2203940.1478218825576630810.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/cdev.c | 129 ++++++++++++++------------------------- drivers/dma/idxd/idxd.h | 7 ++- drivers/dma/idxd/init.c | 2 +- drivers/dma/idxd/irq.c | 4 +- drivers/dma/idxd/sysfs.c | 10 ++- 5 files changed, 63 insertions(+), 89 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 0db9b82ed8cf..1d8a3876b745 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -39,15 +39,15 @@ struct idxd_user_context { struct iommu_sva *sva; }; -enum idxd_cdev_cleanup { - CDEV_NORMAL = 0, - CDEV_FAILED, -}; - static void idxd_cdev_dev_release(struct device *dev) { - dev_dbg(dev, "releasing cdev device\n"); - kfree(dev); + struct idxd_cdev *idxd_cdev = container_of(dev, struct idxd_cdev, dev); + struct idxd_cdev_context *cdev_ctx; + struct idxd_wq *wq = idxd_cdev->wq; + + cdev_ctx = &ictx[wq->idxd->type]; + ida_simple_remove(&cdev_ctx->minor_ida, idxd_cdev->minor); + kfree(idxd_cdev); } static struct device_type idxd_cdev_device_type = { @@ -62,14 +62,11 @@ static inline struct idxd_cdev *inode_idxd_cdev(struct inode *inode) return container_of(cdev, struct idxd_cdev, cdev); } -static inline struct idxd_wq *idxd_cdev_wq(struct idxd_cdev *idxd_cdev) -{ - return container_of(idxd_cdev, struct idxd_wq, idxd_cdev); -} - static inline struct idxd_wq *inode_wq(struct inode *inode) { - return idxd_cdev_wq(inode_idxd_cdev(inode)); + struct idxd_cdev *idxd_cdev = inode_idxd_cdev(inode); + + return idxd_cdev->wq; } static int idxd_cdev_open(struct inode *inode, struct file *filp) @@ -220,11 +217,10 @@ static __poll_t idxd_cdev_poll(struct file *filp, struct idxd_user_context *ctx = filp->private_data; struct idxd_wq *wq = ctx->wq; struct idxd_device *idxd = wq->idxd; - struct idxd_cdev *idxd_cdev = &wq->idxd_cdev; unsigned long flags; __poll_t out = 0; - poll_wait(filp, &idxd_cdev->err_queue, wait); + poll_wait(filp, &wq->err_queue, wait); spin_lock_irqsave(&idxd->dev_lock, flags); if (idxd->sw_err.valid) out = EPOLLIN | EPOLLRDNORM; @@ -246,98 +242,67 @@ int idxd_cdev_get_major(struct idxd_device *idxd) return MAJOR(ictx[idxd->type].devt); } -static int idxd_wq_cdev_dev_setup(struct idxd_wq *wq) +int idxd_wq_add_cdev(struct idxd_wq *wq) { struct idxd_device *idxd = wq->idxd; - struct idxd_cdev *idxd_cdev = &wq->idxd_cdev; - struct idxd_cdev_context *cdev_ctx; + struct idxd_cdev *idxd_cdev; + struct cdev *cdev; struct device *dev; - int minor, rc; + struct idxd_cdev_context *cdev_ctx; + int rc, minor; - idxd_cdev->dev = kzalloc(sizeof(*idxd_cdev->dev), GFP_KERNEL); - if (!idxd_cdev->dev) + idxd_cdev = kzalloc(sizeof(*idxd_cdev), GFP_KERNEL); + if (!idxd_cdev) return -ENOMEM; - dev = idxd_cdev->dev; - dev->parent = &idxd->pdev->dev; - dev_set_name(dev, "%s/wq%u.%u", idxd_get_dev_name(idxd), - idxd->id, wq->id); - dev->bus = idxd_get_bus_type(idxd); - + idxd_cdev->wq = wq; + cdev = &idxd_cdev->cdev; + dev = &idxd_cdev->dev; cdev_ctx = &ictx[wq->idxd->type]; minor = ida_simple_get(&cdev_ctx->minor_ida, 0, MINORMASK, GFP_KERNEL); if (minor < 0) { - rc = minor; - kfree(dev); - goto ida_err; - } - - dev->devt = MKDEV(MAJOR(cdev_ctx->devt), minor); - dev->type = &idxd_cdev_device_type; - rc = device_register(dev); 
- if (rc < 0) { - dev_err(&idxd->pdev->dev, "device register failed\n"); - goto dev_reg_err; + kfree(idxd_cdev); + return minor; } idxd_cdev->minor = minor; - return 0; - - dev_reg_err: - ida_simple_remove(&cdev_ctx->minor_ida, MINOR(dev->devt)); - put_device(dev); - ida_err: - idxd_cdev->dev = NULL; - return rc; -} - -static void idxd_wq_cdev_cleanup(struct idxd_wq *wq, - enum idxd_cdev_cleanup cdev_state) -{ - struct idxd_cdev *idxd_cdev = &wq->idxd_cdev; - struct idxd_cdev_context *cdev_ctx; - - cdev_ctx = &ictx[wq->idxd->type]; - if (cdev_state == CDEV_NORMAL) - cdev_del(&idxd_cdev->cdev); - device_unregister(idxd_cdev->dev); - /* - * The device_type->release() will be called on the device and free - * the allocated struct device. We can just forget it. - */ - ida_simple_remove(&cdev_ctx->minor_ida, idxd_cdev->minor); - idxd_cdev->dev = NULL; - idxd_cdev->minor = -1; -} - -int idxd_wq_add_cdev(struct idxd_wq *wq) -{ - struct idxd_cdev *idxd_cdev = &wq->idxd_cdev; - struct cdev *cdev = &idxd_cdev->cdev; - struct device *dev; - int rc; + device_initialize(dev); + dev->parent = &wq->conf_dev; + dev->bus = idxd_get_bus_type(idxd); + dev->type = &idxd_cdev_device_type; + dev->devt = MKDEV(MAJOR(cdev_ctx->devt), minor); - rc = idxd_wq_cdev_dev_setup(wq); + rc = dev_set_name(dev, "%s/wq%u.%u", idxd_get_dev_name(idxd), + idxd->id, wq->id); if (rc < 0) - return rc; + goto err; - dev = idxd_cdev->dev; + wq->idxd_cdev = idxd_cdev; cdev_init(cdev, &idxd_cdev_fops); - cdev_set_parent(cdev, &dev->kobj); - rc = cdev_add(cdev, dev->devt, 1); + rc = cdev_device_add(cdev, dev); if (rc) { dev_dbg(&wq->idxd->pdev->dev, "cdev_add failed: %d\n", rc); - idxd_wq_cdev_cleanup(wq, CDEV_FAILED); - return rc; + goto err; } - init_waitqueue_head(&idxd_cdev->err_queue); return 0; + + err: + put_device(dev); + wq->idxd_cdev = NULL; + return rc; } void idxd_wq_del_cdev(struct idxd_wq *wq) { - idxd_wq_cdev_cleanup(wq, CDEV_NORMAL); + struct idxd_cdev *idxd_cdev; + struct idxd_cdev_context *cdev_ctx; + + cdev_ctx = &ictx[wq->idxd->type]; + idxd_cdev = wq->idxd_cdev; + wq->idxd_cdev = NULL; + cdev_device_del(&idxd_cdev->cdev, &idxd_cdev->dev); + put_device(&idxd_cdev->dev); } int idxd_cdev_register(void) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 3c4ce7997c88..89daf746d121 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -80,10 +80,10 @@ enum idxd_wq_type { }; struct idxd_cdev { + struct idxd_wq *wq; struct cdev cdev; - struct device *dev; + struct device dev; int minor; - struct wait_queue_head err_queue; }; #define IDXD_ALLOCATED_BATCH_SIZE 128U @@ -109,7 +109,8 @@ struct idxd_dma_chan { struct idxd_wq { void __iomem *portal; struct device conf_dev; - struct idxd_cdev idxd_cdev; + struct idxd_cdev *idxd_cdev; + struct wait_queue_head err_queue; struct idxd_device *idxd; int id; enum idxd_wq_type type; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index c20ea6bf09bf..07cf7977a045 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -176,7 +176,7 @@ static int idxd_setup_wqs(struct idxd_device *idxd) } mutex_init(&wq->wq_lock); - wq->idxd_cdev.minor = -1; + init_waitqueue_head(&wq->err_queue); wq->max_xfer_bytes = idxd->max_xfer_bytes; wq->max_batch_size = idxd->max_batch_size; wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev)); diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 7b0181532f77..fc0781e3f36d 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -133,7 +133,7 @@ static int 
process_misc_interrupts(struct idxd_device *idxd, u32 cause) struct idxd_wq *wq = idxd->wqs[id]; if (wq->type == IDXD_WQT_USER) - wake_up_interruptible(&wq->idxd_cdev.err_queue); + wake_up_interruptible(&wq->err_queue); } else { int i; @@ -141,7 +141,7 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) struct idxd_wq *wq = idxd->wqs[i]; if (wq->type == IDXD_WQT_USER) - wake_up_interruptible(&wq->idxd_cdev.err_queue); + wake_up_interruptible(&wq->err_queue); } } diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index f793688039c9..9586b55abce5 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1169,8 +1169,16 @@ static ssize_t wq_cdev_minor_show(struct device *dev, struct device_attribute *attr, char *buf) { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + int minor = -1; - return sprintf(buf, "%d\n", wq->idxd_cdev.minor); + mutex_lock(&wq->wq_lock); + if (wq->idxd_cdev) + minor = wq->idxd_cdev->minor; + mutex_unlock(&wq->wq_lock); + + if (minor == -1) + return -ENXIO; + return sysfs_emit(buf, "%d\n", minor); } static struct device_attribute dev_attr_wq_cdev_minor = -- Gitee From 639884c9077f1726fccbe908b7edaa40778535ae Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:38:03 -0700 Subject: [PATCH 0608/2161] dmaengine: idxd: iax bus removal commit 4b73e4ebd43ce48101a4c09bf13d439a954d61c5 upstream. There is no need to have an additional bus for the IAX device. The removal of IAX will change user ABI as /sys/bus/iax will no longer exist. The iax device will be moved to the dsa bus. The device id for dsa and iax will now be combined rather than unique for each device type in order to accommodate the iax devices. This is in preparation for fixing the sub-driver code for idxd. There's no hardware deployment for Sapphire Rapids platform yet, which means that users have no reason to have developed scripts against this ABI. There is some exposure to released versions of accel-config, but those are being fixed up and an accel-config upgrade is reasonable to get IAX support. As far as accel-config is concerned IAX support starts when these devices appear under /sys/bus/dsa, and old accel-config just assumes that an empty / missing /sys/bus/iax just means a lack of platform support. 
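In code terms the end state is roughly the following; this is a condensation of the hunks below, not literal driver code:

  /* One bus and one ID space for both device types; only the device_type
   * and the name prefix still differ per type.
   */
  idxd->id = ida_alloc(&idxd_ida, GFP_KERNEL);    /* shared dsa/iax ID space */
  idxd->conf_dev.bus = &dsa_bus_type;             /* everything under /sys/bus/dsa */
  idxd->conf_dev.type = idxd->type == IDXD_TYPE_DSA ?
                        &dsa_device_type : &iax_device_type;
  dev_set_name(&idxd->conf_dev, "%s%d",
               idxd->type == IDXD_TYPE_DSA ? "dsa" : "iax", idxd->id);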
Fixes: f25b463883a8 ("dmaengine: idxd: add IAX configuration support in the IDXD driver") Suggested-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161852988298.2203940.4529909758034944428.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/cdev.c | 2 +- drivers/dma/idxd/idxd.h | 3 +- drivers/dma/idxd/init.c | 21 ++++-------- drivers/dma/idxd/sysfs.c | 74 +++------------------------------------- 4 files changed, 13 insertions(+), 87 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 1d8a3876b745..2d976905b2a3 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -268,7 +268,7 @@ int idxd_wq_add_cdev(struct idxd_wq *wq) device_initialize(dev); dev->parent = &wq->conf_dev; - dev->bus = idxd_get_bus_type(idxd); + dev->bus = &dsa_bus_type; dev->type = &idxd_cdev_device_type; dev->devt = MKDEV(MAJOR(cdev_ctx->devt), minor); diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 89daf746d121..b17415aa42bd 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -257,6 +257,7 @@ extern struct bus_type dsa_bus_type; extern struct bus_type iax_bus_type; extern bool support_enqcmd; +extern struct ida idxd_ida; extern struct device_type dsa_device_type; extern struct device_type iax_device_type; extern struct device_type idxd_wq_device_type; @@ -346,7 +347,6 @@ static inline int idxd_wq_refcount(struct idxd_wq *wq) return wq->client_count; }; -struct ida *idxd_ida(struct idxd_device *idxd); const char *idxd_get_dev_name(struct idxd_device *idxd); int idxd_register_bus_type(void); void idxd_unregister_bus_type(void); @@ -354,7 +354,6 @@ int idxd_register_devices(struct idxd_device *idxd); void idxd_unregister_devices(struct idxd_device *idxd); int idxd_register_driver(void); void idxd_unregister_driver(void); -struct bus_type *idxd_get_bus_type(struct idxd_device *idxd); struct device_type *idxd_get_device_type(struct idxd_device *idxd); /* device interrupt control */ diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 07cf7977a045..be922a8c2784 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -33,8 +33,7 @@ MODULE_PARM_DESC(sva, "Toggle SVA support on/off"); #define DRV_NAME "idxd" bool support_enqcmd; - -static struct ida idxd_idas[IDXD_TYPE_MAX]; +DEFINE_IDA(idxd_ida); static struct pci_device_id idxd_pci_tbl[] = { /* DSA ver 1.0 platforms */ @@ -51,11 +50,6 @@ static char *idxd_name[] = { "iax" }; -struct ida *idxd_ida(struct idxd_device *idxd) -{ - return &idxd_idas[idxd->type]; -} - const char *idxd_get_dev_name(struct idxd_device *idxd) { return idxd_name[idxd->type]; @@ -167,7 +161,7 @@ static int idxd_setup_wqs(struct idxd_device *idxd) wq->idxd = idxd; device_initialize(&wq->conf_dev); wq->conf_dev.parent = &idxd->conf_dev; - wq->conf_dev.bus = idxd_get_bus_type(idxd); + wq->conf_dev.bus = &dsa_bus_type; wq->conf_dev.type = &idxd_wq_device_type; rc = dev_set_name(&wq->conf_dev, "wq%d.%d", idxd->id, wq->id); if (rc < 0) { @@ -258,7 +252,7 @@ static int idxd_setup_groups(struct idxd_device *idxd) group->idxd = idxd; device_initialize(&group->conf_dev); group->conf_dev.parent = &idxd->conf_dev; - group->conf_dev.bus = idxd_get_bus_type(idxd); + group->conf_dev.bus = &dsa_bus_type; group->conf_dev.type = &idxd_group_device_type; rc = dev_set_name(&group->conf_dev, "group%d.%d", idxd->id, group->id); if (rc < 0) { @@ -409,13 +403,13 @@ static struct idxd_device *idxd_alloc(struct 
pci_dev *pdev) idxd->pdev = pdev; idxd_set_type(idxd); - idxd->id = ida_alloc(idxd_ida(idxd), GFP_KERNEL); + idxd->id = ida_alloc(&idxd_ida, GFP_KERNEL); if (idxd->id < 0) return NULL; device_initialize(&idxd->conf_dev); idxd->conf_dev.parent = dev; - idxd->conf_dev.bus = idxd_get_bus_type(idxd); + idxd->conf_dev.bus = &dsa_bus_type; idxd->conf_dev.type = idxd_get_device_type(idxd); rc = dev_set_name(&idxd->conf_dev, "%s%d", idxd_get_dev_name(idxd), idxd->id); if (rc < 0) { @@ -669,7 +663,7 @@ static struct pci_driver idxd_pci_driver = { static int __init idxd_init_module(void) { - int err, i; + int err; /* * If the CPU does not support MOVDIR64B or ENQCMDS, there's no point in @@ -685,9 +679,6 @@ static int __init idxd_init_module(void) else support_enqcmd = true; - for (i = 0; i < IDXD_TYPE_MAX; i++) - ida_init(&idxd_idas[i]); - err = idxd_register_bus_type(); if (err < 0) return err; diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 9586b55abce5..b97a0a817dfb 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -301,19 +301,6 @@ struct bus_type dsa_bus_type = { .shutdown = idxd_config_bus_shutdown, }; -struct bus_type iax_bus_type = { - .name = "iax", - .match = idxd_config_bus_match, - .probe = idxd_config_bus_probe, - .remove = idxd_config_bus_remove, - .shutdown = idxd_config_bus_shutdown, -}; - -static struct bus_type *idxd_bus_types[] = { - &dsa_bus_type, - &iax_bus_type -}; - static struct idxd_device_driver dsa_drv = { .drv = { .name = "dsa", @@ -323,25 +310,6 @@ static struct idxd_device_driver dsa_drv = { }, }; -static struct idxd_device_driver iax_drv = { - .drv = { - .name = "iax", - .bus = &iax_bus_type, - .owner = THIS_MODULE, - .mod_name = KBUILD_MODNAME, - }, -}; - -static struct idxd_device_driver *idxd_drvs[] = { - &dsa_drv, - &iax_drv -}; - -struct bus_type *idxd_get_bus_type(struct idxd_device *idxd) -{ - return idxd_bus_types[idxd->type]; -} - struct device_type *idxd_get_device_type(struct idxd_device *idxd) { if (idxd->type == IDXD_TYPE_DSA) @@ -355,28 +323,12 @@ struct device_type *idxd_get_device_type(struct idxd_device *idxd) /* IDXD generic driver setup */ int idxd_register_driver(void) { - int i, rc; - - for (i = 0; i < IDXD_TYPE_MAX; i++) { - rc = driver_register(&idxd_drvs[i]->drv); - if (rc < 0) - goto drv_fail; - } - - return 0; - -drv_fail: - while (--i >= 0) - driver_unregister(&idxd_drvs[i]->drv); - return rc; + return driver_register(&dsa_drv.drv); } void idxd_unregister_driver(void) { - int i; - - for (i = 0; i < IDXD_TYPE_MAX; i++) - driver_unregister(&idxd_drvs[i]->drv); + driver_unregister(&dsa_drv.drv); } /* IDXD engine attributes */ @@ -1637,7 +1589,7 @@ static void idxd_conf_device_release(struct device *dev) kfree(idxd->wqs); kfree(idxd->engines); kfree(idxd->irq_entries); - ida_free(idxd_ida(idxd), idxd->id); + ida_free(&idxd_ida, idxd->id); kfree(idxd); } @@ -1792,26 +1744,10 @@ void idxd_unregister_devices(struct idxd_device *idxd) int idxd_register_bus_type(void) { - int i, rc; - - for (i = 0; i < IDXD_TYPE_MAX; i++) { - rc = bus_register(idxd_bus_types[i]); - if (rc < 0) - goto bus_err; - } - - return 0; - -bus_err: - while (--i >= 0) - bus_unregister(idxd_bus_types[i]); - return rc; + return bus_register(&dsa_bus_type); } void idxd_unregister_bus_type(void) { - int i; - - for (i = 0; i < IDXD_TYPE_MAX; i++) - bus_unregister(idxd_bus_types[i]); + bus_unregister(&dsa_bus_type); } -- Gitee From ea92db0dafbfd5b09b65a708082bee9eee69c14b Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 
16:38:09 -0700 Subject: [PATCH 0609/2161] dmaengine: idxd: remove detection of device type commit 435b512dbc0dac42b34348393049b386bb1a19bd upstream. Move all static data type for per device type to an idxd_driver_data data structure. The data can be attached to the pci_device_id and provided by the pci probe function. This removes a lot of unnecessary type detection and setup code. Suggested-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161852988924.2203940.2787590808682466398.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/cdev.c | 11 +++---- drivers/dma/idxd/device.c | 16 +++------- drivers/dma/idxd/idxd.h | 13 +++++--- drivers/dma/idxd/init.c | 65 +++++++++++++++------------------------ drivers/dma/idxd/submit.c | 2 +- drivers/dma/idxd/sysfs.c | 16 ++-------- 6 files changed, 48 insertions(+), 75 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 2d976905b2a3..302cba5ff779 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -45,7 +45,7 @@ static void idxd_cdev_dev_release(struct device *dev) struct idxd_cdev_context *cdev_ctx; struct idxd_wq *wq = idxd_cdev->wq; - cdev_ctx = &ictx[wq->idxd->type]; + cdev_ctx = &ictx[wq->idxd->data->type]; ida_simple_remove(&cdev_ctx->minor_ida, idxd_cdev->minor); kfree(idxd_cdev); } @@ -239,7 +239,7 @@ static const struct file_operations idxd_cdev_fops = { int idxd_cdev_get_major(struct idxd_device *idxd) { - return MAJOR(ictx[idxd->type].devt); + return MAJOR(ictx[idxd->data->type].devt); } int idxd_wq_add_cdev(struct idxd_wq *wq) @@ -258,7 +258,7 @@ int idxd_wq_add_cdev(struct idxd_wq *wq) idxd_cdev->wq = wq; cdev = &idxd_cdev->cdev; dev = &idxd_cdev->dev; - cdev_ctx = &ictx[wq->idxd->type]; + cdev_ctx = &ictx[wq->idxd->data->type]; minor = ida_simple_get(&cdev_ctx->minor_ida, 0, MINORMASK, GFP_KERNEL); if (minor < 0) { kfree(idxd_cdev); @@ -272,8 +272,7 @@ int idxd_wq_add_cdev(struct idxd_wq *wq) dev->type = &idxd_cdev_device_type; dev->devt = MKDEV(MAJOR(cdev_ctx->devt), minor); - rc = dev_set_name(dev, "%s/wq%u.%u", idxd_get_dev_name(idxd), - idxd->id, wq->id); + rc = dev_set_name(dev, "%s/wq%u.%u", idxd->data->name_prefix, idxd->id, wq->id); if (rc < 0) goto err; @@ -298,7 +297,7 @@ void idxd_wq_del_cdev(struct idxd_wq *wq) struct idxd_cdev *idxd_cdev; struct idxd_cdev_context *cdev_ctx; - cdev_ctx = &ictx[wq->idxd->type]; + cdev_ctx = &ictx[wq->idxd->data->type]; idxd_cdev = wq->idxd_cdev; wq->idxd_cdev = NULL; cdev_device_del(&idxd_cdev->cdev, &idxd_cdev->dev); diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 4fef57717049..016df87cf5c5 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -144,14 +144,8 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) if (rc < 0) return rc; - if (idxd->type == IDXD_TYPE_DSA) - align = 32; - else if (idxd->type == IDXD_TYPE_IAX) - align = 64; - else - return -ENODEV; - - wq->compls_size = num_descs * idxd->compl_size + align; + align = idxd->data->align; + wq->compls_size = num_descs * idxd->data->compl_size + align; wq->compls_raw = dma_alloc_coherent(dev, wq->compls_size, &wq->compls_addr_raw, GFP_KERNEL); if (!wq->compls_raw) { @@ -178,11 +172,11 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) struct idxd_desc *desc = wq->descs[i]; desc->hw = wq->hw_descs[i]; - if (idxd->type == IDXD_TYPE_DSA) + if (idxd->data->type == IDXD_TYPE_DSA) desc->completion = &wq->compls[i]; - else if (idxd->type == 
IDXD_TYPE_IAX) + else if (idxd->data->type == IDXD_TYPE_IAX) desc->iax_completion = &wq->iax_compls[i]; - desc->compl_dma = wq->compls_addr + idxd->compl_size * i; + desc->compl_dma = wq->compls_addr + idxd->data->compl_size * i; desc->id = i; desc->wq = wq; desc->cpu = -1; diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index b17415aa42bd..8055e872953c 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -178,9 +178,17 @@ struct idxd_dma_dev { struct dma_device dma; }; -struct idxd_device { +struct idxd_driver_data { + const char *name_prefix; enum idxd_type type; + struct device_type *dev_type; + int compl_size; + int align; +}; + +struct idxd_device { struct device conf_dev; + struct idxd_driver_data *data; struct list_head list; struct idxd_hw hw; enum idxd_device_state state; @@ -218,7 +226,6 @@ struct idxd_device { int token_limit; int nr_tokens; /* non-reserved tokens */ unsigned int wqcfg_size; - int compl_size; union sw_err_reg sw_err; wait_queue_head_t cmd_waitq; @@ -347,14 +354,12 @@ static inline int idxd_wq_refcount(struct idxd_wq *wq) return wq->client_count; }; -const char *idxd_get_dev_name(struct idxd_device *idxd); int idxd_register_bus_type(void); void idxd_unregister_bus_type(void); int idxd_register_devices(struct idxd_device *idxd); void idxd_unregister_devices(struct idxd_device *idxd); int idxd_register_driver(void); void idxd_unregister_driver(void); -struct device_type *idxd_get_device_type(struct idxd_device *idxd); /* device interrupt control */ void idxd_msix_perm_setup(struct idxd_device *idxd); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index be922a8c2784..e8f64324bb3a 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -35,26 +35,33 @@ MODULE_PARM_DESC(sva, "Toggle SVA support on/off"); bool support_enqcmd; DEFINE_IDA(idxd_ida); +static struct idxd_driver_data idxd_driver_data[] = { + [IDXD_TYPE_DSA] = { + .name_prefix = "dsa", + .type = IDXD_TYPE_DSA, + .compl_size = sizeof(struct dsa_completion_record), + .align = 32, + .dev_type = &dsa_device_type, + }, + [IDXD_TYPE_IAX] = { + .name_prefix = "iax", + .type = IDXD_TYPE_IAX, + .compl_size = sizeof(struct iax_completion_record), + .align = 64, + .dev_type = &iax_device_type, + }, +}; + static struct pci_device_id idxd_pci_tbl[] = { /* DSA ver 1.0 platforms */ - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_DSA_SPR0) }, + { PCI_DEVICE_DATA(INTEL, DSA_SPR0, &idxd_driver_data[IDXD_TYPE_DSA]) }, /* IAX ver 1.0 platforms */ - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IAX_SPR0) }, + { PCI_DEVICE_DATA(INTEL, IAX_SPR0, &idxd_driver_data[IDXD_TYPE_IAX]) }, { 0, } }; MODULE_DEVICE_TABLE(pci, idxd_pci_tbl); -static char *idxd_name[] = { - "dsa", - "iax" -}; - -const char *idxd_get_dev_name(struct idxd_device *idxd) -{ - return idxd_name[idxd->type]; -} - static int idxd_setup_interrupts(struct idxd_device *idxd) { struct pci_dev *pdev = idxd->pdev; @@ -379,19 +386,7 @@ static void idxd_read_caps(struct idxd_device *idxd) } } -static inline void idxd_set_type(struct idxd_device *idxd) -{ - struct pci_dev *pdev = idxd->pdev; - - if (pdev->device == PCI_DEVICE_ID_INTEL_DSA_SPR0) - idxd->type = IDXD_TYPE_DSA; - else if (pdev->device == PCI_DEVICE_ID_INTEL_IAX_SPR0) - idxd->type = IDXD_TYPE_IAX; - else - idxd->type = IDXD_TYPE_UNKNOWN; -} - -static struct idxd_device *idxd_alloc(struct pci_dev *pdev) +static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_data *data) { struct device *dev = &pdev->dev; struct idxd_device *idxd; @@ -402,7 
+397,7 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev) return NULL; idxd->pdev = pdev; - idxd_set_type(idxd); + idxd->data = data; idxd->id = ida_alloc(&idxd_ida, GFP_KERNEL); if (idxd->id < 0) return NULL; @@ -410,8 +405,8 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev) device_initialize(&idxd->conf_dev); idxd->conf_dev.parent = dev; idxd->conf_dev.bus = &dsa_bus_type; - idxd->conf_dev.type = idxd_get_device_type(idxd); - rc = dev_set_name(&idxd->conf_dev, "%s%d", idxd_get_dev_name(idxd), idxd->id); + idxd->conf_dev.type = idxd->data->dev_type; + rc = dev_set_name(&idxd->conf_dev, "%s%d", idxd->data->name_prefix, idxd->id); if (rc < 0) { put_device(&idxd->conf_dev); return NULL; @@ -503,18 +498,11 @@ static int idxd_probe(struct idxd_device *idxd) return rc; } -static void idxd_type_init(struct idxd_device *idxd) -{ - if (idxd->type == IDXD_TYPE_DSA) - idxd->compl_size = sizeof(struct dsa_completion_record); - else if (idxd->type == IDXD_TYPE_IAX) - idxd->compl_size = sizeof(struct iax_completion_record); -} - static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct device *dev = &pdev->dev; struct idxd_device *idxd; + struct idxd_driver_data *data = (struct idxd_driver_data *)id->driver_data; int rc; rc = pci_enable_device(pdev); @@ -522,7 +510,7 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return rc; dev_dbg(dev, "Alloc IDXD context\n"); - idxd = idxd_alloc(pdev); + idxd = idxd_alloc(pdev, data); if (!idxd) { rc = -ENOMEM; goto err_idxd_alloc; @@ -548,9 +536,6 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) goto err; - - idxd_type_init(idxd); - dev_dbg(dev, "Set PCI master\n"); pci_set_master(pdev); pci_set_drvdata(pdev, idxd); diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index bef5c5c2f1dd..f9f979498533 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -16,7 +16,7 @@ static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu) desc = wq->descs[idx]; memset(desc->hw, 0, sizeof(struct dsa_hw_desc)); - memset(desc->completion, 0, idxd->compl_size); + memset(desc->completion, 0, idxd->data->compl_size); desc->cpu = cpu; if (device_pasid_enabled(idxd)) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index b97a0a817dfb..581ce56ae4f5 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -310,16 +310,6 @@ static struct idxd_device_driver dsa_drv = { }, }; -struct device_type *idxd_get_device_type(struct idxd_device *idxd) -{ - if (idxd->type == IDXD_TYPE_DSA) - return &dsa_device_type; - else if (idxd->type == IDXD_TYPE_IAX) - return &iax_device_type; - else - return NULL; -} - /* IDXD generic driver setup */ int idxd_register_driver(void) { @@ -453,7 +443,7 @@ static ssize_t group_tokens_reserved_store(struct device *dev, if (rc < 0) return -EINVAL; - if (idxd->type == IDXD_TYPE_IAX) + if (idxd->data->type == IDXD_TYPE_IAX) return -EOPNOTSUPP; if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) @@ -501,7 +491,7 @@ static ssize_t group_tokens_allowed_store(struct device *dev, if (rc < 0) return -EINVAL; - if (idxd->type == IDXD_TYPE_IAX) + if (idxd->data->type == IDXD_TYPE_IAX) return -EOPNOTSUPP; if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) @@ -546,7 +536,7 @@ static ssize_t group_use_token_limit_store(struct device *dev, if (rc < 0) return -EINVAL; - if (idxd->type == IDXD_TYPE_IAX) + if (idxd->data->type == IDXD_TYPE_IAX) return -EOPNOTSUPP; if 
(!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) -- Gitee From 32483d79634aa6fd8a1016ede8ee98d4314f1529 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Apr 2021 11:46:22 -0700 Subject: [PATCH 0610/2161] dmaengine: idxd: add percpu_ref to descriptor submission path commit 93a40a6d7428921897bb7fed5ffb4ce83df05432 upstream. Current submission path has no way to restrict the submitter from stop submiting on shutdown path or wq disable path. This provides a way to quiesce the submission path. Modeling after 'struct reqeust_queue' usage of percpu_ref. One of the abilities of per_cpu reference counting is the ability to stop new references from being taken while awaiting outstanding references to be dropped. On wq shutdown, we want to block any new submissions to the kernel workqueue and quiesce before disabling. The percpu_ref allows us to block any new submissions and wait for any current submission calls to finish submitting to the workqueue. A percpu_ref is embedded in each idxd_wq context to allow control for individual wq. The wq->wq_active counter is elevated before calling movdir64b() or enqcmds() to submit a descriptor to the wq and dropped once the submission call completes. The function is gated by percpu_ref_tryget_live(). On shutdown with percpu_ref_kill() called, any new submission would be blocked from acquiring a ref and failed. Once all references are dropped for the wq, shutdown can continue. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161894438293.3202472.14894701611500822232.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 26 +++++ drivers/dma/idxd/idxd.h | 4 + drivers/dma/idxd/init.c | 1 + drivers/dma/idxd/submit.c | 5 + drivers/dma/idxd/sysfs.c | 233 ++++++++++++++++++++------------------ 5 files changed, 161 insertions(+), 108 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 016df87cf5c5..6d674fdedb4d 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -384,6 +384,32 @@ void idxd_wq_disable_cleanup(struct idxd_wq *wq) memset(wq->name, 0, WQ_NAME_SIZE); } +static void idxd_wq_ref_release(struct percpu_ref *ref) +{ + struct idxd_wq *wq = container_of(ref, struct idxd_wq, wq_active); + + complete(&wq->wq_dead); +} + +int idxd_wq_init_percpu_ref(struct idxd_wq *wq) +{ + int rc; + + memset(&wq->wq_active, 0, sizeof(wq->wq_active)); + rc = percpu_ref_init(&wq->wq_active, idxd_wq_ref_release, 0, GFP_KERNEL); + if (rc < 0) + return rc; + reinit_completion(&wq->wq_dead); + return 0; +} + +void idxd_wq_quiesce(struct idxd_wq *wq) +{ + percpu_ref_kill(&wq->wq_active); + wait_for_completion(&wq->wq_dead); + percpu_ref_exit(&wq->wq_active); +} + /* Device control bits */ static inline bool idxd_is_enabled(struct idxd_device *idxd) { diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 8055e872953c..1b539cbf3c14 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -108,6 +108,8 @@ struct idxd_dma_chan { struct idxd_wq { void __iomem *portal; + struct percpu_ref wq_active; + struct completion wq_dead; struct device conf_dev; struct idxd_cdev *idxd_cdev; struct wait_queue_head err_queue; @@ -395,6 +397,8 @@ void idxd_wq_unmap_portal(struct idxd_wq *wq); void idxd_wq_disable_cleanup(struct idxd_wq *wq); int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid); int idxd_wq_disable_pasid(struct idxd_wq *wq); +void idxd_wq_quiesce(struct idxd_wq *wq); +int idxd_wq_init_percpu_ref(struct 
idxd_wq *wq); /* submission */ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index e8f64324bb3a..eda5ffd307c6 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -178,6 +178,7 @@ static int idxd_setup_wqs(struct idxd_device *idxd) mutex_init(&wq->wq_lock); init_waitqueue_head(&wq->err_queue); + init_completion(&wq->wq_dead); wq->max_xfer_bytes = idxd->max_xfer_bytes; wq->max_batch_size = idxd->max_batch_size; wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev)); diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index f9f979498533..5e75e214ac88 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -87,6 +87,9 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) if (idxd->state != IDXD_DEV_ENABLED) return -EIO; + if (!percpu_ref_tryget_live(&wq->wq_active)) + return -ENXIO; + portal = wq->portal; /* @@ -109,6 +112,8 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) return rc; } + percpu_ref_put(&wq->wq_active); + /* * Pending the descriptor to the lockless list for the irq_entry * that we designated the descriptor to. diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 581ce56ae4f5..dad5c0be9ae8 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -47,6 +47,127 @@ static int idxd_config_bus_match(struct device *dev, return matched; } +static int enable_wq(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + unsigned long flags; + int rc; + + mutex_lock(&wq->wq_lock); + + if (idxd->state != IDXD_DEV_ENABLED) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "Enabling while device not enabled.\n"); + return -EPERM; + } + + if (wq->state != IDXD_WQ_DISABLED) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "WQ %d already enabled.\n", wq->id); + return -EBUSY; + } + + if (!wq->group) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "WQ not attached to group.\n"); + return -EINVAL; + } + + if (strlen(wq->name) == 0) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "WQ name not set.\n"); + return -EINVAL; + } + + /* Shared WQ checks */ + if (wq_shared(wq)) { + if (!device_swq_supported(idxd)) { + dev_warn(dev, "PASID not enabled and shared WQ.\n"); + mutex_unlock(&wq->wq_lock); + return -ENXIO; + } + /* + * Shared wq with the threshold set to 0 means the user + * did not set the threshold or transitioned from a + * dedicated wq but did not set threshold. A value + * of 0 would effectively disable the shared wq. The + * driver does not allow a value of 0 to be set for + * threshold via sysfs. 
+ */ + if (wq->threshold == 0) { + dev_warn(dev, "Shared WQ and threshold 0.\n"); + mutex_unlock(&wq->wq_lock); + return -EINVAL; + } + } + + rc = idxd_wq_alloc_resources(wq); + if (rc < 0) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "WQ resource alloc failed\n"); + return rc; + } + + spin_lock_irqsave(&idxd->dev_lock, flags); + rc = idxd_device_config(idxd); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + if (rc < 0) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "Writing WQ %d config failed: %d\n", wq->id, rc); + return rc; + } + + rc = idxd_wq_enable(wq); + if (rc < 0) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "WQ %d enabling failed: %d\n", wq->id, rc); + return rc; + } + + rc = idxd_wq_map_portal(wq); + if (rc < 0) { + dev_warn(dev, "wq portal mapping failed: %d\n", rc); + rc = idxd_wq_disable(wq); + if (rc < 0) + dev_warn(dev, "IDXD wq disable failed\n"); + mutex_unlock(&wq->wq_lock); + return rc; + } + + wq->client_count = 0; + + if (wq->type == IDXD_WQT_KERNEL) { + rc = idxd_wq_init_percpu_ref(wq); + if (rc < 0) { + dev_dbg(dev, "percpu_ref setup failed\n"); + mutex_unlock(&wq->wq_lock); + return rc; + } + } + + if (is_idxd_wq_dmaengine(wq)) { + rc = idxd_register_dma_channel(wq); + if (rc < 0) { + dev_dbg(dev, "DMA channel register failed\n"); + mutex_unlock(&wq->wq_lock); + return rc; + } + } else if (is_idxd_wq_cdev(wq)) { + rc = idxd_wq_add_cdev(wq); + if (rc < 0) { + dev_dbg(dev, "Cdev creation failed\n"); + mutex_unlock(&wq->wq_lock); + return rc; + } + } + + mutex_unlock(&wq->wq_lock); + dev_info(dev, "wq %s enabled\n", dev_name(&wq->conf_dev)); + + return 0; +} + static int idxd_config_bus_probe(struct device *dev) { int rc; @@ -94,115 +215,8 @@ static int idxd_config_bus_probe(struct device *dev) return 0; } else if (is_idxd_wq_dev(dev)) { struct idxd_wq *wq = confdev_to_wq(dev); - struct idxd_device *idxd = wq->idxd; - - mutex_lock(&wq->wq_lock); - - if (idxd->state != IDXD_DEV_ENABLED) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "Enabling while device not enabled.\n"); - return -EPERM; - } - - if (wq->state != IDXD_WQ_DISABLED) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "WQ %d already enabled.\n", wq->id); - return -EBUSY; - } - - if (!wq->group) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "WQ not attached to group.\n"); - return -EINVAL; - } - - if (strlen(wq->name) == 0) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "WQ name not set.\n"); - return -EINVAL; - } - - /* Shared WQ checks */ - if (wq_shared(wq)) { - if (!device_swq_supported(idxd)) { - dev_warn(dev, - "PASID not enabled and shared WQ.\n"); - mutex_unlock(&wq->wq_lock); - return -ENXIO; - } - /* - * Shared wq with the threshold set to 0 means the user - * did not set the threshold or transitioned from a - * dedicated wq but did not set threshold. A value - * of 0 would effectively disable the shared wq. The - * driver does not allow a value of 0 to be set for - * threshold via sysfs. 
- */ - if (wq->threshold == 0) { - dev_warn(dev, - "Shared WQ and threshold 0.\n"); - mutex_unlock(&wq->wq_lock); - return -EINVAL; - } - } - - rc = idxd_wq_alloc_resources(wq); - if (rc < 0) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "WQ resource alloc failed\n"); - return rc; - } - - spin_lock_irqsave(&idxd->dev_lock, flags); - rc = idxd_device_config(idxd); - spin_unlock_irqrestore(&idxd->dev_lock, flags); - if (rc < 0) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "Writing WQ %d config failed: %d\n", - wq->id, rc); - return rc; - } - rc = idxd_wq_enable(wq); - if (rc < 0) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "WQ %d enabling failed: %d\n", - wq->id, rc); - return rc; - } - - rc = idxd_wq_map_portal(wq); - if (rc < 0) { - dev_warn(dev, "wq portal mapping failed: %d\n", rc); - rc = idxd_wq_disable(wq); - if (rc < 0) - dev_warn(dev, "IDXD wq disable failed\n"); - mutex_unlock(&wq->wq_lock); - return rc; - } - - wq->client_count = 0; - - dev_info(dev, "wq %s enabled\n", dev_name(&wq->conf_dev)); - - if (is_idxd_wq_dmaengine(wq)) { - rc = idxd_register_dma_channel(wq); - if (rc < 0) { - dev_dbg(dev, "DMA channel register failed\n"); - mutex_unlock(&wq->wq_lock); - return rc; - } - } else if (is_idxd_wq_cdev(wq)) { - rc = idxd_wq_add_cdev(wq); - if (rc < 0) { - dev_dbg(dev, "Cdev creation failed\n"); - mutex_unlock(&wq->wq_lock); - return rc; - } - } - - mutex_unlock(&wq->wq_lock); - return 0; + return enable_wq(wq); } return -ENODEV; @@ -220,6 +234,9 @@ static void disable_wq(struct idxd_wq *wq) return; } + if (wq->type == IDXD_WQT_KERNEL) + idxd_wq_quiesce(wq); + if (is_idxd_wq_dmaengine(wq)) idxd_unregister_dma_channel(wq); else if (is_idxd_wq_cdev(wq)) -- Gitee From a1a7c72827fb957ed918a4fe483159d1c349b8f3 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Apr 2021 11:46:28 -0700 Subject: [PATCH 0611/2161] dmaengine: idxd: add support for readonly config mode commit 8c66bbdc4fbf3c297ebc8edf71f359e4a132c9db upstream. The read-only configuration mode is defined by the DSA spec as a mode of the device WQ configuration. When GENCAP register bit 31 is set to 0, the device is in RO mode and group configuration and some fields of the workqueue configuration registers are read-only and reflect the fixed configuration of the device. Add support for RO mode. The driver will load the values from the registers directly setup all the internally cached data structures based on the device configuration. 
Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161894438847.3202472.6317563824045432727.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 116 ++++++++++++++++++++++++++++++++++++++ drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/init.c | 8 +++ drivers/dma/idxd/sysfs.c | 22 +++++--- 4 files changed, 138 insertions(+), 9 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 6d674fdedb4d..3ddb1c731080 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -884,3 +884,119 @@ int idxd_device_config(struct idxd_device *idxd) return 0; } + +static int idxd_wq_load_config(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + int wqcfg_offset; + int i; + + wqcfg_offset = WQCFG_OFFSET(idxd, wq->id, 0); + memcpy_fromio(wq->wqcfg, idxd->reg_base + wqcfg_offset, idxd->wqcfg_size); + + wq->size = wq->wqcfg->wq_size; + wq->threshold = wq->wqcfg->wq_thresh; + if (wq->wqcfg->priv) + wq->type = IDXD_WQT_KERNEL; + + /* The driver does not support shared WQ mode in read-only config yet */ + if (wq->wqcfg->mode == 0 || wq->wqcfg->pasid_en) + return -EOPNOTSUPP; + + set_bit(WQ_FLAG_DEDICATED, &wq->flags); + + wq->priority = wq->wqcfg->priority; + + for (i = 0; i < WQCFG_STRIDES(idxd); i++) { + wqcfg_offset = WQCFG_OFFSET(idxd, wq->id, i); + dev_dbg(dev, "WQ[%d][%d][%#x]: %#x\n", wq->id, i, wqcfg_offset, wq->wqcfg->bits[i]); + } + + return 0; +} + +static void idxd_group_load_config(struct idxd_group *group) +{ + struct idxd_device *idxd = group->idxd; + struct device *dev = &idxd->pdev->dev; + int i, j, grpcfg_offset; + + /* + * Load WQS bit fields + * Iterate through all 256 bits 64 bits at a time + */ + for (i = 0; i < GRPWQCFG_STRIDES; i++) { + struct idxd_wq *wq; + + grpcfg_offset = GRPWQCFG_OFFSET(idxd, group->id, i); + group->grpcfg.wqs[i] = ioread64(idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPCFG wq[%d:%d: %#x]: %#llx\n", + group->id, i, grpcfg_offset, group->grpcfg.wqs[i]); + + if (i * 64 >= idxd->max_wqs) + break; + + /* Iterate through all 64 bits and check for wq set */ + for (j = 0; j < 64; j++) { + int id = i * 64 + j; + + /* No need to check beyond max wqs */ + if (id >= idxd->max_wqs) + break; + + /* Set group assignment for wq if wq bit is set */ + if (group->grpcfg.wqs[i] & BIT(j)) { + wq = idxd->wqs[id]; + wq->group = group; + } + } + } + + grpcfg_offset = GRPENGCFG_OFFSET(idxd, group->id); + group->grpcfg.engines = ioread64(idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPCFG engs[%d: %#x]: %#llx\n", group->id, + grpcfg_offset, group->grpcfg.engines); + + /* Iterate through all 64 bits to check engines set */ + for (i = 0; i < 64; i++) { + if (i >= idxd->max_engines) + break; + + if (group->grpcfg.engines & BIT(i)) { + struct idxd_engine *engine = idxd->engines[i]; + + engine->group = group; + } + } + + grpcfg_offset = GRPFLGCFG_OFFSET(idxd, group->id); + group->grpcfg.flags.bits = ioread32(idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPFLAGS flags[%d: %#x]: %#x\n", + group->id, grpcfg_offset, group->grpcfg.flags.bits); +} + +int idxd_device_load_config(struct idxd_device *idxd) +{ + union gencfg_reg reg; + int i, rc; + + reg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET); + idxd->token_limit = reg.token_limit; + + for (i = 0; i < idxd->max_groups; i++) { + struct idxd_group *group = idxd->groups[i]; + + idxd_group_load_config(group); + } + + for (i = 0; i < 
idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + rc = idxd_wq_load_config(wq); + if (rc < 0) + return rc; + } + + return 0; +} diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 1b539cbf3c14..940a2e1ddf12 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -384,6 +384,7 @@ void idxd_device_cleanup(struct idxd_device *idxd); int idxd_device_config(struct idxd_device *idxd); void idxd_device_wqs_clear_state(struct idxd_device *idxd); void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid); +int idxd_device_load_config(struct idxd_device *idxd); /* work queue control */ int idxd_wq_alloc_resources(struct idxd_wq *wq); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index eda5ffd307c6..a07e6d8eec00 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -482,6 +482,14 @@ static int idxd_probe(struct idxd_device *idxd) if (rc) goto err; + /* If the configs are readonly, then load them from device */ + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) { + dev_dbg(dev, "Loading RO device config\n"); + rc = idxd_device_load_config(idxd); + if (rc < 0) + goto err; + } + rc = idxd_setup_interrupts(idxd); if (rc) goto err; diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index dad5c0be9ae8..d45cb61f300b 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -110,7 +110,8 @@ static int enable_wq(struct idxd_wq *wq) } spin_lock_irqsave(&idxd->dev_lock, flags); - rc = idxd_device_config(idxd); + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + rc = idxd_device_config(idxd); spin_unlock_irqrestore(&idxd->dev_lock, flags); if (rc < 0) { mutex_unlock(&wq->wq_lock); @@ -170,7 +171,7 @@ static int enable_wq(struct idxd_wq *wq) static int idxd_config_bus_probe(struct device *dev) { - int rc; + int rc = 0; unsigned long flags; dev_dbg(dev, "%s called\n", __func__); @@ -188,7 +189,8 @@ static int idxd_config_bus_probe(struct device *dev) /* Perform IDXD configuration and enabling */ spin_lock_irqsave(&idxd->dev_lock, flags); - rc = idxd_device_config(idxd); + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + rc = idxd_device_config(idxd); spin_unlock_irqrestore(&idxd->dev_lock, flags); if (rc < 0) { module_put(THIS_MODULE); @@ -287,12 +289,14 @@ static int idxd_config_bus_remove(struct device *dev) idxd_unregister_dma_device(idxd); rc = idxd_device_disable(idxd); - for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = idxd->wqs[i]; - - mutex_lock(&wq->wq_lock); - idxd_wq_disable_cleanup(wq); - mutex_unlock(&wq->wq_lock); + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) { + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + mutex_lock(&wq->wq_lock); + idxd_wq_disable_cleanup(wq); + mutex_unlock(&wq->wq_lock); + } } module_put(THIS_MODULE); if (rc < 0) -- Gitee From 48abe46d9f07de847f9066842e3244a7a8dbee6d Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Apr 2021 11:46:34 -0700 Subject: [PATCH 0612/2161] dmaengine: idxd: add interrupt handle request and release support commit eb15e7154fbfa3e61c777704b2ff28eb3a0d4796 upstream. DSA spec states that when Request Interrupt Handle and Release Interrupt Handle command bits are set in the CMDCAP register, these device commands must be supported by the driver. The interrupt handle is programmed in a descriptor. When Request Interrupt Handle is not supported, the interrupt handle is the index of the desired entry in the MSI-X table. 
When the command is supported, driver must use the command to obtain a handle to be programmed in the submitted descriptor. A requested handle may be revoked. After the handle is revoked, any use of the handle will result in Invalid Interrupt Handle error. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161894439422.3202472.17579543737810265471.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 71 ++++++++++++++++++++++++++++++++++++ drivers/dma/idxd/idxd.h | 13 +++++++ drivers/dma/idxd/init.c | 56 +++++++++++++++++++++++++++- drivers/dma/idxd/registers.h | 9 ++++- drivers/dma/idxd/submit.c | 35 ++++++++++++++---- drivers/dma/idxd/sysfs.c | 1 + 6 files changed, 176 insertions(+), 9 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 3ddb1c731080..54d5afec81cf 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -598,6 +598,77 @@ void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid) dev_dbg(dev, "pasid %d drained\n", pasid); } +int idxd_device_request_int_handle(struct idxd_device *idxd, int idx, int *handle, + enum idxd_interrupt_type irq_type) +{ + struct device *dev = &idxd->pdev->dev; + u32 operand, status; + + if (!(idxd->hw.cmd_cap & BIT(IDXD_CMD_REQUEST_INT_HANDLE))) + return -EOPNOTSUPP; + + dev_dbg(dev, "get int handle, idx %d\n", idx); + + operand = idx & GENMASK(15, 0); + if (irq_type == IDXD_IRQ_IMS) + operand |= CMD_INT_HANDLE_IMS; + + dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_REQUEST_INT_HANDLE, operand); + + idxd_cmd_exec(idxd, IDXD_CMD_REQUEST_INT_HANDLE, operand, &status); + + if ((status & IDXD_CMDSTS_ERR_MASK) != IDXD_CMDSTS_SUCCESS) { + dev_dbg(dev, "request int handle failed: %#x\n", status); + return -ENXIO; + } + + *handle = (status >> IDXD_CMDSTS_RES_SHIFT) & GENMASK(15, 0); + + dev_dbg(dev, "int handle acquired: %u\n", *handle); + return 0; +} + +int idxd_device_release_int_handle(struct idxd_device *idxd, int handle, + enum idxd_interrupt_type irq_type) +{ + struct device *dev = &idxd->pdev->dev; + u32 operand, status; + union idxd_command_reg cmd; + unsigned long flags; + + if (!(idxd->hw.cmd_cap & BIT(IDXD_CMD_RELEASE_INT_HANDLE))) + return -EOPNOTSUPP; + + dev_dbg(dev, "release int handle, handle %d\n", handle); + + memset(&cmd, 0, sizeof(cmd)); + operand = handle & GENMASK(15, 0); + + if (irq_type == IDXD_IRQ_IMS) + operand |= CMD_INT_HANDLE_IMS; + + cmd.cmd = IDXD_CMD_RELEASE_INT_HANDLE; + cmd.operand = operand; + + dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_RELEASE_INT_HANDLE, operand); + + spin_lock_irqsave(&idxd->dev_lock, flags); + iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET); + + while (ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET) & IDXD_CMDSTS_ACTIVE) + cpu_relax(); + status = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + + if ((status & IDXD_CMDSTS_ERR_MASK) != IDXD_CMDSTS_SUCCESS) { + dev_dbg(dev, "release int handle failed: %#x\n", status); + return -ENXIO; + } + + dev_dbg(dev, "int handle released.\n"); + return 0; +} + /* Device configuration bits */ void idxd_msix_perm_setup(struct idxd_device *idxd) { diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 940a2e1ddf12..c1d4a1976206 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -160,6 +160,7 @@ struct idxd_hw { union group_cap_reg group_cap; union engine_cap_reg engine_cap; struct opcap opcap; + u32 cmd_cap; }; enum 
idxd_device_state { @@ -237,6 +238,8 @@ struct idxd_device { struct idxd_dma_dev *idxd_dma; struct workqueue_struct *wq; struct work_struct work; + + int *int_handles; }; /* IDXD software descriptor */ @@ -256,6 +259,7 @@ struct idxd_desc { struct list_head list; int id; int cpu; + unsigned int vector; struct idxd_wq *wq; }; @@ -330,6 +334,11 @@ enum idxd_portal_prot { IDXD_PORTAL_LIMITED, }; +enum idxd_interrupt_type { + IDXD_IRQ_MSIX = 0, + IDXD_IRQ_IMS, +}; + static inline int idxd_get_wq_portal_offset(enum idxd_portal_prot prot) { return prot * 0x1000; @@ -385,6 +394,10 @@ int idxd_device_config(struct idxd_device *idxd); void idxd_device_wqs_clear_state(struct idxd_device *idxd); void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid); int idxd_device_load_config(struct idxd_device *idxd); +int idxd_device_request_int_handle(struct idxd_device *idxd, int idx, int *handle, + enum idxd_interrupt_type irq_type); +int idxd_device_release_int_handle(struct idxd_device *idxd, int handle, + enum idxd_interrupt_type irq_type); /* work queue control */ int idxd_wq_alloc_resources(struct idxd_wq *wq); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index a07e6d8eec00..ef58750c24cc 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -125,7 +125,25 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) dev_err(dev, "Failed to allocate irq %d.\n", irq_entry->vector); goto err_wq_irqs; } + dev_dbg(dev, "Allocated idxd-msix %d for vector %d\n", i, irq_entry->vector); + if (idxd->hw.cmd_cap & BIT(IDXD_CMD_REQUEST_INT_HANDLE)) { + /* + * The MSIX vector enumeration starts at 1 with vector 0 being the + * misc interrupt that handles non I/O completion events. The + * interrupt handles are for IMS enumeration on guest. The misc + * interrupt vector does not require a handle and therefore we start + * the int_handles at index 0. Since 'i' starts at 1, the first + * int_handles index will be 0. 
+ */ + rc = idxd_device_request_int_handle(idxd, i, &idxd->int_handles[i - 1], + IDXD_IRQ_MSIX); + if (rc < 0) { + free_irq(irq_entry->vector, irq_entry); + goto err_wq_irqs; + } + dev_dbg(dev, "int handle requested: %u\n", idxd->int_handles[i - 1]); + } } idxd_unmask_error_interrupts(idxd); @@ -136,6 +154,9 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) while (--i >= 0) { irq_entry = &idxd->irq_entries[i]; free_irq(irq_entry->vector, irq_entry); + if (i != 0) + idxd_device_release_int_handle(idxd, + idxd->int_handles[i], IDXD_IRQ_MSIX); } err_misc_irq: /* Disable error interrupt generation */ @@ -288,9 +309,15 @@ static int idxd_setup_internals(struct idxd_device *idxd) init_waitqueue_head(&idxd->cmd_waitq); + if (idxd->hw.cmd_cap & BIT(IDXD_CMD_REQUEST_INT_HANDLE)) { + idxd->int_handles = devm_kcalloc(dev, idxd->max_wqs, sizeof(int), GFP_KERNEL); + if (!idxd->int_handles) + return -ENOMEM; + } + rc = idxd_setup_wqs(idxd); if (rc < 0) - return rc; + goto err_wqs; rc = idxd_setup_engines(idxd); if (rc < 0) @@ -317,6 +344,8 @@ static int idxd_setup_internals(struct idxd_device *idxd) err_engine: for (i = 0; i < idxd->max_wqs; i++) put_device(&idxd->wqs[i]->conf_dev); + err_wqs: + kfree(idxd->int_handles); return rc; } @@ -345,6 +374,12 @@ static void idxd_read_caps(struct idxd_device *idxd) /* reading generic capabilities */ idxd->hw.gen_cap.bits = ioread64(idxd->reg_base + IDXD_GENCAP_OFFSET); dev_dbg(dev, "gen_cap: %#llx\n", idxd->hw.gen_cap.bits); + + if (idxd->hw.gen_cap.cmd_cap) { + idxd->hw.cmd_cap = ioread32(idxd->reg_base + IDXD_CMDCAP_OFFSET); + dev_dbg(dev, "cmd_cap: %#x\n", idxd->hw.cmd_cap); + } + idxd->max_xfer_bytes = 1ULL << idxd->hw.gen_cap.max_xfer_shift; dev_dbg(dev, "max xfer size: %llu bytes\n", idxd->max_xfer_bytes); idxd->max_batch_size = 1U << idxd->hw.gen_cap.max_batch_shift; @@ -604,6 +639,24 @@ static void idxd_flush_work_list(struct idxd_irq_entry *ie) } } +static void idxd_release_int_handles(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + int i, rc; + + for (i = 0; i < idxd->num_wq_irqs; i++) { + if (idxd->hw.cmd_cap & BIT(IDXD_CMD_RELEASE_INT_HANDLE)) { + rc = idxd_device_release_int_handle(idxd, idxd->int_handles[i], + IDXD_IRQ_MSIX); + if (rc < 0) + dev_warn(dev, "irq handle %d release failed\n", + idxd->int_handles[i]); + else + dev_dbg(dev, "int handle requested: %u\n", idxd->int_handles[i]); + } + } +} + static void idxd_shutdown(struct pci_dev *pdev) { struct idxd_device *idxd = pci_get_drvdata(pdev); @@ -630,6 +683,7 @@ static void idxd_shutdown(struct pci_dev *pdev) } idxd_msix_perm_clear(idxd); + idxd_release_int_handles(idxd); pci_free_irq_vectors(pdev); pci_iounmap(pdev, idxd->reg_base); pci_disable_device(pdev); diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 751ecb4f9f81..5cbf368c7367 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -24,8 +24,8 @@ union gen_cap_reg { u64 overlap_copy:1; u64 cache_control_mem:1; u64 cache_control_cache:1; + u64 cmd_cap:1; u64 rsvd:3; - u64 int_handle_req:1; u64 dest_readback:1; u64 drain_readback:1; u64 rsvd2:6; @@ -180,8 +180,11 @@ enum idxd_cmd { IDXD_CMD_DRAIN_PASID, IDXD_CMD_ABORT_PASID, IDXD_CMD_REQUEST_INT_HANDLE, + IDXD_CMD_RELEASE_INT_HANDLE, }; +#define CMD_INT_HANDLE_IMS 0x10000 + #define IDXD_CMDSTS_OFFSET 0xa8 union cmdsts_reg { struct { @@ -193,6 +196,8 @@ union cmdsts_reg { u32 bits; } __packed; #define IDXD_CMDSTS_ACTIVE 0x80000000 +#define IDXD_CMDSTS_ERR_MASK 0xff +#define IDXD_CMDSTS_RES_SHIFT 8 
enum idxd_cmdsts_err { IDXD_CMDSTS_SUCCESS = 0, @@ -228,6 +233,8 @@ enum idxd_cmdsts_err { IDXD_CMDSTS_ERR_NO_HANDLE, }; +#define IDXD_CMDCAP_OFFSET 0xb0 + #define IDXD_SWERR_OFFSET 0xc0 #define IDXD_SWERR_VALID 0x00000001 #define IDXD_SWERR_OVERFLOW 0x00000002 diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 5e75e214ac88..47b8db965100 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -23,11 +23,23 @@ static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu) desc->hw->pasid = idxd->pasid; /* - * Descriptor completion vectors are 1-8 for MSIX. We will round - * robin through the 8 vectors. + * Descriptor completion vectors are 1...N for MSIX. We will round + * robin through the N vectors. */ wq->vec_ptr = (wq->vec_ptr % idxd->num_wq_irqs) + 1; - desc->hw->int_handle = wq->vec_ptr; + if (!idxd->int_handles) { + desc->hw->int_handle = wq->vec_ptr; + } else { + desc->vector = wq->vec_ptr; + /* + * int_handles are only for descriptor completion. However for device + * MSIX enumeration, vec 0 is used for misc interrupts. Therefore even + * though we are rotating through 1...N for descriptor interrupts, we + * need to acqurie the int_handles from 0..N-1. + */ + desc->hw->int_handle = idxd->int_handles[desc->vector - 1]; + } + return desc; } @@ -80,7 +92,6 @@ void idxd_free_desc(struct idxd_wq *wq, struct idxd_desc *desc) int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) { struct idxd_device *idxd = wq->idxd; - int vec = desc->hw->int_handle; void __iomem *portal; int rc; @@ -118,9 +129,19 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) * Pending the descriptor to the lockless list for the irq_entry * that we designated the descriptor to. */ - if (desc->hw->flags & IDXD_OP_FLAG_RCI) - llist_add(&desc->llnode, - &idxd->irq_entries[vec].pending_llist); + if (desc->hw->flags & IDXD_OP_FLAG_RCI) { + int vec; + + /* + * If the driver is on host kernel, it would be the value + * assigned to interrupt handle, which is index for MSIX + * vector. If it's guest then can't use the int_handle since + * that is the index to IMS for the entire device. The guest + * device local index will be used. + */ + vec = !idxd->int_handles ? desc->hw->int_handle : desc->vector; + llist_add(&desc->llnode, &idxd->irq_entries[vec].pending_llist); + } return 0; } diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index d45cb61f300b..3f4ea4d0fae7 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1600,6 +1600,7 @@ static void idxd_conf_device_release(struct device *dev) kfree(idxd->wqs); kfree(idxd->engines); kfree(idxd->irq_entries); + kfree(idxd->int_handles); ida_free(&idxd_ida, idxd->id); kfree(idxd); } -- Gitee From f99fade2409d2f8219e8de7032ddb3e062361d1a Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Apr 2021 11:46:40 -0700 Subject: [PATCH 0613/2161] dmaengine: idxd: convert sprintf() to sysfs_emit() for all usages commit 8241571fac9eeb7f3424ad343369eaa411919da3 upstream. Convert sprintf() to sysfs_emit() in order to check buffer overrun on sysfs outputs. 
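For readers less familiar with the helper, the two conversion patterns used throughout the diff look roughly like this (illustrative sketch only; value, items, foo and bar are placeholders, not symbols from this driver):

        static unsigned int value = 42;            /* placeholder datum */
        static int items[3] = { 1, 2, 3 };         /* placeholder array */

        /* Single value: sysfs_emit() insists on a page-aligned sysfs buffer
         * and never writes past PAGE_SIZE, unlike a bare sprintf().
         */
        static ssize_t foo_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
        {
                return sysfs_emit(buf, "%u\n", value);
        }

        /* Multi-item output: sysfs_emit_at() appends at an offset and is
         * bounded the same way, unlike sprintf(buf + off, ...).
         */
        static ssize_t bar_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
        {
                int i, off = 0;

                for (i = 0; i < ARRAY_SIZE(items); i++)
                        off += sysfs_emit_at(buf, off, "%d ", items[i]);
                off += sysfs_emit_at(buf, off, "\n");
                return off;
        }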
Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161894440044.3202472.13926639619695319753.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 116 +++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 61 deletions(-) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 3f4ea4d0fae7..0460d58e3941 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -350,9 +350,9 @@ static ssize_t engine_group_id_show(struct device *dev, container_of(dev, struct idxd_engine, conf_dev); if (engine->group) - return sprintf(buf, "%d\n", engine->group->id); + return sysfs_emit(buf, "%d\n", engine->group->id); else - return sprintf(buf, "%d\n", -1); + return sysfs_emit(buf, "%d\n", -1); } static ssize_t engine_group_id_store(struct device *dev, @@ -447,7 +447,7 @@ static ssize_t group_tokens_reserved_show(struct device *dev, struct idxd_group *group = container_of(dev, struct idxd_group, conf_dev); - return sprintf(buf, "%u\n", group->tokens_reserved); + return sysfs_emit(buf, "%u\n", group->tokens_reserved); } static ssize_t group_tokens_reserved_store(struct device *dev, @@ -495,7 +495,7 @@ static ssize_t group_tokens_allowed_show(struct device *dev, struct idxd_group *group = container_of(dev, struct idxd_group, conf_dev); - return sprintf(buf, "%u\n", group->tokens_allowed); + return sysfs_emit(buf, "%u\n", group->tokens_allowed); } static ssize_t group_tokens_allowed_store(struct device *dev, @@ -540,7 +540,7 @@ static ssize_t group_use_token_limit_show(struct device *dev, struct idxd_group *group = container_of(dev, struct idxd_group, conf_dev); - return sprintf(buf, "%u\n", group->use_token_limit); + return sysfs_emit(buf, "%u\n", group->use_token_limit); } static ssize_t group_use_token_limit_store(struct device *dev, @@ -583,7 +583,6 @@ static ssize_t group_engines_show(struct device *dev, struct idxd_group *group = container_of(dev, struct idxd_group, conf_dev); int i, rc = 0; - char *tmp = buf; struct idxd_device *idxd = group->idxd; for (i = 0; i < idxd->max_engines; i++) { @@ -593,12 +592,13 @@ static ssize_t group_engines_show(struct device *dev, continue; if (engine->group->id == group->id) - rc += sprintf(tmp + rc, "engine%d.%d ", - idxd->id, engine->id); + rc += sysfs_emit_at(buf, rc, "engine%d.%d ", idxd->id, engine->id); } + if (!rc) + return 0; rc--; - rc += sprintf(tmp + rc, "\n"); + rc += sysfs_emit_at(buf, rc, "\n"); return rc; } @@ -612,7 +612,6 @@ static ssize_t group_work_queues_show(struct device *dev, struct idxd_group *group = container_of(dev, struct idxd_group, conf_dev); int i, rc = 0; - char *tmp = buf; struct idxd_device *idxd = group->idxd; for (i = 0; i < idxd->max_wqs; i++) { @@ -622,12 +621,13 @@ static ssize_t group_work_queues_show(struct device *dev, continue; if (wq->group->id == group->id) - rc += sprintf(tmp + rc, "wq%d.%d ", - idxd->id, wq->id); + rc += sysfs_emit_at(buf, rc, "wq%d.%d ", idxd->id, wq->id); } + if (!rc) + return 0; rc--; - rc += sprintf(tmp + rc, "\n"); + rc += sysfs_emit_at(buf, rc, "\n"); return rc; } @@ -642,7 +642,7 @@ static ssize_t group_traffic_class_a_show(struct device *dev, struct idxd_group *group = container_of(dev, struct idxd_group, conf_dev); - return sprintf(buf, "%d\n", group->tc_a); + return sysfs_emit(buf, "%d\n", group->tc_a); } static ssize_t group_traffic_class_a_store(struct device *dev, @@ -683,7 +683,7 @@ static ssize_t group_traffic_class_b_show(struct device *dev, struct 
idxd_group *group = container_of(dev, struct idxd_group, conf_dev); - return sprintf(buf, "%d\n", group->tc_b); + return sysfs_emit(buf, "%d\n", group->tc_b); } static ssize_t group_traffic_class_b_store(struct device *dev, @@ -756,7 +756,7 @@ static ssize_t wq_clients_show(struct device *dev, { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%d\n", wq->client_count); + return sysfs_emit(buf, "%d\n", wq->client_count); } static struct device_attribute dev_attr_wq_clients = @@ -769,12 +769,12 @@ static ssize_t wq_state_show(struct device *dev, switch (wq->state) { case IDXD_WQ_DISABLED: - return sprintf(buf, "disabled\n"); + return sysfs_emit(buf, "disabled\n"); case IDXD_WQ_ENABLED: - return sprintf(buf, "enabled\n"); + return sysfs_emit(buf, "enabled\n"); } - return sprintf(buf, "unknown\n"); + return sysfs_emit(buf, "unknown\n"); } static struct device_attribute dev_attr_wq_state = @@ -786,9 +786,9 @@ static ssize_t wq_group_id_show(struct device *dev, struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); if (wq->group) - return sprintf(buf, "%u\n", wq->group->id); + return sysfs_emit(buf, "%u\n", wq->group->id); else - return sprintf(buf, "-1\n"); + return sysfs_emit(buf, "-1\n"); } static ssize_t wq_group_id_store(struct device *dev, @@ -840,8 +840,7 @@ static ssize_t wq_mode_show(struct device *dev, struct device_attribute *attr, { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%s\n", - wq_dedicated(wq) ? "dedicated" : "shared"); + return sysfs_emit(buf, "%s\n", wq_dedicated(wq) ? "dedicated" : "shared"); } static ssize_t wq_mode_store(struct device *dev, @@ -877,7 +876,7 @@ static ssize_t wq_size_show(struct device *dev, struct device_attribute *attr, { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%u\n", wq->size); + return sysfs_emit(buf, "%u\n", wq->size); } static int total_claimed_wq_size(struct idxd_device *idxd) @@ -928,7 +927,7 @@ static ssize_t wq_priority_show(struct device *dev, { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%u\n", wq->priority); + return sysfs_emit(buf, "%u\n", wq->priority); } static ssize_t wq_priority_store(struct device *dev, @@ -965,8 +964,7 @@ static ssize_t wq_block_on_fault_show(struct device *dev, { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%u\n", - test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags)); + return sysfs_emit(buf, "%u\n", test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags)); } static ssize_t wq_block_on_fault_store(struct device *dev, @@ -1005,7 +1003,7 @@ static ssize_t wq_threshold_show(struct device *dev, { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%u\n", wq->threshold); + return sysfs_emit(buf, "%u\n", wq->threshold); } static ssize_t wq_threshold_store(struct device *dev, @@ -1048,15 +1046,12 @@ static ssize_t wq_type_show(struct device *dev, switch (wq->type) { case IDXD_WQT_KERNEL: - return sprintf(buf, "%s\n", - idxd_wq_type_names[IDXD_WQT_KERNEL]); + return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_KERNEL]); case IDXD_WQT_USER: - return sprintf(buf, "%s\n", - idxd_wq_type_names[IDXD_WQT_USER]); + return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_USER]); case IDXD_WQT_NONE: default: - return sprintf(buf, "%s\n", - idxd_wq_type_names[IDXD_WQT_NONE]); + return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_NONE]); } return -EINVAL; @@ -1097,7 
+1092,7 @@ static ssize_t wq_name_show(struct device *dev, { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%s\n", wq->name); + return sysfs_emit(buf, "%s\n", wq->name); } static ssize_t wq_name_store(struct device *dev, @@ -1167,7 +1162,7 @@ static ssize_t wq_max_transfer_size_show(struct device *dev, struct device_attri { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%llu\n", wq->max_xfer_bytes); + return sysfs_emit(buf, "%llu\n", wq->max_xfer_bytes); } static ssize_t wq_max_transfer_size_store(struct device *dev, struct device_attribute *attr, @@ -1201,7 +1196,7 @@ static ssize_t wq_max_batch_size_show(struct device *dev, struct device_attribut { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%u\n", wq->max_batch_size); + return sysfs_emit(buf, "%u\n", wq->max_batch_size); } static ssize_t wq_max_batch_size_store(struct device *dev, struct device_attribute *attr, @@ -1234,7 +1229,7 @@ static ssize_t wq_ats_disable_show(struct device *dev, struct device_attribute * { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%u\n", wq->ats_dis); + return sysfs_emit(buf, "%u\n", wq->ats_dis); } static ssize_t wq_ats_disable_store(struct device *dev, struct device_attribute *attr, @@ -1311,7 +1306,7 @@ static ssize_t version_show(struct device *dev, struct device_attribute *attr, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%#x\n", idxd->hw.version); + return sysfs_emit(buf, "%#x\n", idxd->hw.version); } static DEVICE_ATTR_RO(version); @@ -1322,7 +1317,7 @@ static ssize_t max_work_queues_size_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", idxd->max_wq_size); + return sysfs_emit(buf, "%u\n", idxd->max_wq_size); } static DEVICE_ATTR_RO(max_work_queues_size); @@ -1332,7 +1327,7 @@ static ssize_t max_groups_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", idxd->max_groups); + return sysfs_emit(buf, "%u\n", idxd->max_groups); } static DEVICE_ATTR_RO(max_groups); @@ -1342,7 +1337,7 @@ static ssize_t max_work_queues_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", idxd->max_wqs); + return sysfs_emit(buf, "%u\n", idxd->max_wqs); } static DEVICE_ATTR_RO(max_work_queues); @@ -1352,7 +1347,7 @@ static ssize_t max_engines_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", idxd->max_engines); + return sysfs_emit(buf, "%u\n", idxd->max_engines); } static DEVICE_ATTR_RO(max_engines); @@ -1362,7 +1357,7 @@ static ssize_t numa_node_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%d\n", dev_to_node(&idxd->pdev->dev)); + return sysfs_emit(buf, "%d\n", dev_to_node(&idxd->pdev->dev)); } static DEVICE_ATTR_RO(numa_node); @@ -1372,7 +1367,7 @@ static ssize_t max_batch_size_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", idxd->max_batch_size); + return sysfs_emit(buf, "%u\n", idxd->max_batch_size); } static DEVICE_ATTR_RO(max_batch_size); @@ -1383,7 +1378,7 @@ static ssize_t max_transfer_size_show(struct device *dev, struct 
idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%llu\n", idxd->max_xfer_bytes); + return sysfs_emit(buf, "%llu\n", idxd->max_xfer_bytes); } static DEVICE_ATTR_RO(max_transfer_size); @@ -1409,7 +1404,7 @@ static ssize_t gen_cap_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%#llx\n", idxd->hw.gen_cap.bits); + return sysfs_emit(buf, "%#llx\n", idxd->hw.gen_cap.bits); } static DEVICE_ATTR_RO(gen_cap); @@ -1419,8 +1414,7 @@ static ssize_t configurable_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", - test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)); + return sysfs_emit(buf, "%u\n", test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)); } static DEVICE_ATTR_RO(configurable); @@ -1440,7 +1434,7 @@ static ssize_t clients_show(struct device *dev, } spin_unlock_irqrestore(&idxd->dev_lock, flags); - return sprintf(buf, "%d\n", count); + return sysfs_emit(buf, "%d\n", count); } static DEVICE_ATTR_RO(clients); @@ -1450,7 +1444,7 @@ static ssize_t pasid_enabled_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", device_pasid_enabled(idxd)); + return sysfs_emit(buf, "%u\n", device_pasid_enabled(idxd)); } static DEVICE_ATTR_RO(pasid_enabled); @@ -1463,14 +1457,14 @@ static ssize_t state_show(struct device *dev, switch (idxd->state) { case IDXD_DEV_DISABLED: case IDXD_DEV_CONF_READY: - return sprintf(buf, "disabled\n"); + return sysfs_emit(buf, "disabled\n"); case IDXD_DEV_ENABLED: - return sprintf(buf, "enabled\n"); + return sysfs_emit(buf, "enabled\n"); case IDXD_DEV_HALTED: - return sprintf(buf, "halted\n"); + return sysfs_emit(buf, "halted\n"); } - return sprintf(buf, "unknown\n"); + return sysfs_emit(buf, "unknown\n"); } static DEVICE_ATTR_RO(state); @@ -1484,10 +1478,10 @@ static ssize_t errors_show(struct device *dev, spin_lock_irqsave(&idxd->dev_lock, flags); for (i = 0; i < 4; i++) - out += sprintf(buf + out, "%#018llx ", idxd->sw_err.bits[i]); + out += sysfs_emit_at(buf, out, "%#018llx ", idxd->sw_err.bits[i]); spin_unlock_irqrestore(&idxd->dev_lock, flags); out--; - out += sprintf(buf + out, "\n"); + out += sysfs_emit_at(buf, out, "\n"); return out; } static DEVICE_ATTR_RO(errors); @@ -1498,7 +1492,7 @@ static ssize_t max_tokens_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", idxd->max_tokens); + return sysfs_emit(buf, "%u\n", idxd->max_tokens); } static DEVICE_ATTR_RO(max_tokens); @@ -1508,7 +1502,7 @@ static ssize_t token_limit_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", idxd->token_limit); + return sysfs_emit(buf, "%u\n", idxd->token_limit); } static ssize_t token_limit_store(struct device *dev, @@ -1547,7 +1541,7 @@ static ssize_t cdev_major_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", idxd->major); + return sysfs_emit(buf, "%u\n", idxd->major); } static DEVICE_ATTR_RO(cdev_major); @@ -1556,7 +1550,7 @@ static ssize_t cmd_status_show(struct device *dev, { struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%#x\n", idxd->cmd_status); + return sysfs_emit(buf, "%#x\n", idxd->cmd_status); } static DEVICE_ATTR_RO(cmd_status); 
-- Gitee From 4b245f48b44206e4ae9adec0819748f622b5056d Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Apr 2021 11:46:46 -0700 Subject: [PATCH 0614/2161] dmaengine: idxd: enable SVA feature for IOMMU commit cf5f86a7d47df149857ba2fb72f9c6c9da46af2e upstream. Enable IOMMU_DEV_FEAT_SVA before attempt to bind pasid. This is needed according to iommu_sva_bind_device() comment. Currently Intel IOMMU code does this before bind call. It really needs to be controlled by the driver. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161894440621.3202472.17644507396206848134.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index ef58750c24cc..eb0b3a00a2d7 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -501,11 +501,18 @@ static int idxd_probe(struct idxd_device *idxd) dev_dbg(dev, "IDXD reset complete\n"); if (IS_ENABLED(CONFIG_INTEL_IDXD_SVM) && sva) { - rc = idxd_enable_system_pasid(idxd); - if (rc < 0) - dev_warn(dev, "Failed to enable PASID. No SVA support: %d\n", rc); - else - set_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags); + rc = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA); + if (rc == 0) { + rc = idxd_enable_system_pasid(idxd); + if (rc < 0) { + iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA); + dev_warn(dev, "Failed to enable PASID. No SVA support: %d\n", rc); + } else { + set_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags); + } + } else { + dev_warn(dev, "Unable to turn on SVA feature.\n"); + } } else if (!sva) { dev_warn(dev, "User forced SVA off via module param.\n"); } @@ -539,6 +546,7 @@ static int idxd_probe(struct idxd_device *idxd) err: if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); + iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA); return rc; } @@ -699,6 +707,7 @@ static void idxd_remove(struct pci_dev *pdev) if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); idxd_unregister_devices(idxd); + iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA); } static struct pci_driver idxd_pci_driver = { -- Gitee From 3a79dddcc33963b05f1beda24c5f1e6c905e5a3b Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Apr 2021 11:46:51 -0700 Subject: [PATCH 0615/2161] dmaengine: idxd: support reporting of halt interrupt commit 5b0c68c473a131c2acb21abad44b0047b200e185 upstream. Unmask the halt error interrupt so it gets reported to the interrupt handler. When halt state interrupt is received, quiesce the kernel WQs and unmap the portals to stop submission. 
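Condensed, the halt branch this patch extends in the misc interrupt thread amounts to the following (sketch of the irq.c hunk below; the surrounding cause decoding and locking are unchanged and omitted here):

        /* Halted and not recoverable by software reset: stop kernel-WQ
         * submitters, drop the portal mappings so no new descriptors can
         * be issued, then clear the cached WQ state.
         */
        idxd_wqs_quiesce(idxd);
        idxd_wqs_unmap_portal(idxd);
        idxd_device_wqs_clear_state(idxd);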
Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161894441167.3202472.9485946398140619501.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 15 +++++++++++++++ drivers/dma/idxd/idxd.h | 2 ++ drivers/dma/idxd/init.c | 12 ++++++++++++ drivers/dma/idxd/irq.c | 2 ++ drivers/dma/idxd/registers.h | 3 ++- 5 files changed, 33 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 54d5afec81cf..3934e660d951 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -47,6 +47,7 @@ void idxd_unmask_error_interrupts(struct idxd_device *idxd) genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET); genctrl.softerr_int_en = 1; + genctrl.halt_int_en = 1; iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET); } @@ -56,6 +57,7 @@ void idxd_mask_error_interrupts(struct idxd_device *idxd) genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET); genctrl.softerr_int_en = 0; + genctrl.halt_int_en = 0; iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET); } @@ -312,6 +314,19 @@ void idxd_wq_unmap_portal(struct idxd_wq *wq) struct device *dev = &wq->idxd->pdev->dev; devm_iounmap(dev, wq->portal); + wq->portal = NULL; +} + +void idxd_wqs_unmap_portal(struct idxd_device *idxd) +{ + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + if (wq->portal) + idxd_wq_unmap_portal(wq); + } } int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index c1d4a1976206..d7185c6bfade 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -371,6 +371,7 @@ int idxd_register_devices(struct idxd_device *idxd); void idxd_unregister_devices(struct idxd_device *idxd); int idxd_register_driver(void); void idxd_unregister_driver(void); +void idxd_wqs_quiesce(struct idxd_device *idxd); /* device interrupt control */ void idxd_msix_perm_setup(struct idxd_device *idxd); @@ -400,6 +401,7 @@ int idxd_device_release_int_handle(struct idxd_device *idxd, int handle, enum idxd_interrupt_type irq_type); /* work queue control */ +void idxd_wqs_unmap_portal(struct idxd_device *idxd); int idxd_wq_alloc_resources(struct idxd_wq *wq); void idxd_wq_free_resources(struct idxd_wq *wq); int idxd_wq_enable(struct idxd_wq *wq); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index eb0b3a00a2d7..e6bfd55e421b 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -647,6 +647,18 @@ static void idxd_flush_work_list(struct idxd_irq_entry *ie) } } +void idxd_wqs_quiesce(struct idxd_device *idxd) +{ + struct idxd_wq *wq; + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + wq = idxd->wqs[i]; + if (wq->state == IDXD_WQ_ENABLED && wq->type == IDXD_WQT_KERNEL) + idxd_wq_quiesce(wq); + } +} + static void idxd_release_int_handles(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index fc0781e3f36d..43eea5c9cbd4 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -202,6 +202,8 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) queue_work(idxd->wq, &idxd->work); } else { spin_lock_bh(&idxd->dev_lock); + idxd_wqs_quiesce(idxd); + idxd_wqs_unmap_portal(idxd); idxd_device_wqs_clear_state(idxd); dev_err(&idxd->pdev->dev, "idxd halted, need %s.\n", diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 
5cbf368c7367..6c11375cc56a 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -120,7 +120,8 @@ union gencfg_reg { union genctrl_reg { struct { u32 softerr_int_en:1; - u32 rsvd:31; + u32 halt_int_en:1; + u32 rsvd:30; }; u32 bits; } __packed; -- Gitee From 52f1eaf4e7a4216194690ebca1a0ac63d9e3496c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Apr 2021 12:00:56 -0700 Subject: [PATCH 0616/2161] dmaengine: idxd: device cmd should use dedicated lock commit 53b2ee7f637c4f1fa2f50dbdb210088e30c11d2b upstream. Create a dedicated lock for device command operations. Put the device command operation under finer grained locking instead of using the idxd->dev_lock. Suggested-by: Sanjay Kumar Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161894525685.3210132.16160045731436382560.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 18 +++++++++--------- drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/init.c | 1 + 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 3934e660d951..420b93fe5feb 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -465,13 +465,13 @@ int idxd_device_init_reset(struct idxd_device *idxd) memset(&cmd, 0, sizeof(cmd)); cmd.cmd = IDXD_CMD_RESET_DEVICE; dev_dbg(dev, "%s: sending reset for init.\n", __func__); - spin_lock_irqsave(&idxd->dev_lock, flags); + spin_lock_irqsave(&idxd->cmd_lock, flags); iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET); while (ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET) & IDXD_CMDSTS_ACTIVE) cpu_relax(); - spin_unlock_irqrestore(&idxd->dev_lock, flags); + spin_unlock_irqrestore(&idxd->cmd_lock, flags); return 0; } @@ -494,10 +494,10 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, cmd.operand = operand; cmd.int_req = 1; - spin_lock_irqsave(&idxd->dev_lock, flags); + spin_lock_irqsave(&idxd->cmd_lock, flags); wait_event_lock_irq(idxd->cmd_waitq, !test_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags), - idxd->dev_lock); + idxd->cmd_lock); dev_dbg(&idxd->pdev->dev, "%s: sending cmd: %#x op: %#x\n", __func__, cmd_code, operand); @@ -511,9 +511,9 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, * After command submitted, release lock and go to sleep until * the command completes via interrupt. 
*/ - spin_unlock_irqrestore(&idxd->dev_lock, flags); + spin_unlock_irqrestore(&idxd->cmd_lock, flags); wait_for_completion(&done); - spin_lock_irqsave(&idxd->dev_lock, flags); + spin_lock_irqsave(&idxd->cmd_lock, flags); if (status) { *status = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); idxd->cmd_status = *status & GENMASK(7, 0); @@ -522,7 +522,7 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, __clear_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags); /* Wake up other pending commands */ wake_up(&idxd->cmd_waitq); - spin_unlock_irqrestore(&idxd->dev_lock, flags); + spin_unlock_irqrestore(&idxd->cmd_lock, flags); } int idxd_device_enable(struct idxd_device *idxd) @@ -667,13 +667,13 @@ int idxd_device_release_int_handle(struct idxd_device *idxd, int handle, dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_RELEASE_INT_HANDLE, operand); - spin_lock_irqsave(&idxd->dev_lock, flags); + spin_lock_irqsave(&idxd->cmd_lock, flags); iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET); while (ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET) & IDXD_CMDSTS_ACTIVE) cpu_relax(); status = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); - spin_unlock_irqrestore(&idxd->dev_lock, flags); + spin_unlock_irqrestore(&idxd->cmd_lock, flags); if ((status & IDXD_CMDSTS_ERR_MASK) != IDXD_CMDSTS_SUCCESS) { dev_dbg(dev, "release int handle failed: %#x\n", status); diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index d7185c6bfade..87330b6940fa 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -204,6 +204,7 @@ struct idxd_device { void __iomem *reg_base; spinlock_t dev_lock; /* spinlock for device */ + spinlock_t cmd_lock; /* spinlock for device commands */ struct completion *cmd_done; struct idxd_group **groups; struct idxd_wq **wqs; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index e6bfd55e421b..64dab4de217e 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -449,6 +449,7 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_d } spin_lock_init(&idxd->dev_lock); + spin_lock_init(&idxd->cmd_lock); return idxd; } -- Gitee From 300e02a3c26b78669edf9ddc5d1804510fabc443 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Apr 2021 12:00:34 -0700 Subject: [PATCH 0617/2161] dmaengine: idxd: remove MSIX masking for interrupt handlers commit a16104617d212d4b482568847b25172972b87e60 upstream. Remove interrupt masking and just let the hard irq handler keep firing for new events. This is less of a performance impact vs the MMIO readback inside the pci_msi_{mask,unmas}_irq(). Especially with a loaded system those flushes can be stuck behind large amounts of MMIO writes to flush. When guest kernel is running on top of VFIO mdev, mask/unmask causes a vmexit each time and is not desirable. 
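The mechanical change is small; for the per-WQ vectors it is essentially the following (sketch of the init.c hunks below):

        /* Before: a custom primary handler masked the vector and returned
         * IRQ_WAKE_THREAD; the threaded handler unmasked it when done.
         */
        rc = request_threaded_irq(irq_entry->vector, idxd_irq_handler,
                                  idxd_wq_thread, 0, "idxd-portal", irq_entry);

        /* After: no primary handler, so the default hard-IRQ handler just
         * wakes the thread and the MSI-X vector is never masked/unmasked
         * (no MMIO readback on the hot path, no vmexit under VFIO mdev).
         */
        rc = request_threaded_irq(irq_entry->vector, NULL,
                                  idxd_wq_thread, 0, "idxd-portal", irq_entry);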
Suggested-by: Dan Williams Signed-off-by: Dave Jiang Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/161894523436.3210025.1834640110556139277.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/idxd.h | 1 - drivers/dma/idxd/init.c | 4 ++-- drivers/dma/idxd/irq.c | 12 ------------ 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 87330b6940fa..97c96ca6ab70 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -377,7 +377,6 @@ void idxd_wqs_quiesce(struct idxd_device *idxd); /* device interrupt control */ void idxd_msix_perm_setup(struct idxd_device *idxd); void idxd_msix_perm_clear(struct idxd_device *idxd); -irqreturn_t idxd_irq_handler(int vec, void *data); irqreturn_t idxd_misc_thread(int vec, void *data); irqreturn_t idxd_wq_thread(int irq, void *data); void idxd_mask_error_interrupts(struct idxd_device *idxd); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 64dab4de217e..8003f8a25fff 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -102,7 +102,7 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) } irq_entry = &idxd->irq_entries[0]; - rc = request_threaded_irq(irq_entry->vector, idxd_irq_handler, idxd_misc_thread, + rc = request_threaded_irq(irq_entry->vector, NULL, idxd_misc_thread, 0, "idxd-misc", irq_entry); if (rc < 0) { dev_err(dev, "Failed to allocate misc interrupt.\n"); @@ -119,7 +119,7 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) init_llist_head(&idxd->irq_entries[i].pending_llist); INIT_LIST_HEAD(&idxd->irq_entries[i].work_list); - rc = request_threaded_irq(irq_entry->vector, idxd_irq_handler, + rc = request_threaded_irq(irq_entry->vector, NULL, idxd_wq_thread, 0, "idxd-portal", irq_entry); if (rc < 0) { dev_err(dev, "Failed to allocate irq %d.\n", irq_entry->vector); diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 43eea5c9cbd4..afee571e0194 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -102,15 +102,6 @@ static int idxd_device_schedule_fault_process(struct idxd_device *idxd, return 0; } -irqreturn_t idxd_irq_handler(int vec, void *data) -{ - struct idxd_irq_entry *irq_entry = data; - struct idxd_device *idxd = irq_entry->idxd; - - idxd_mask_msix_vector(idxd, irq_entry->id); - return IRQ_WAKE_THREAD; -} - static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) { struct device *dev = &idxd->pdev->dev; @@ -237,7 +228,6 @@ irqreturn_t idxd_misc_thread(int vec, void *data) iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET); } - idxd_unmask_msix_vector(idxd, irq_entry->id); return IRQ_HANDLED; } @@ -394,8 +384,6 @@ irqreturn_t idxd_wq_thread(int irq, void *data) int processed; processed = idxd_desc_process(irq_entry); - idxd_unmask_msix_vector(irq_entry->idxd, irq_entry->id); - if (processed == 0) return IRQ_NONE; -- Gitee From 8aeafd99a0a112c5ec61de9fc834f11e07f9851a Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 24 Apr 2021 10:04:15 -0500 Subject: [PATCH 0618/2161] dmaengine: idxd: Add IDXD performance monitor support commit 81dd4d4d6178306ab31db91bdc7353d485bdafce upstream. Implement the IDXD performance monitor capability (named 'perfmon' in the DSA (Data Streaming Accelerator) spec [1]), which supports the collection of information about key events occurring during DSA and IAX (Intel Analytics Accelerator) device execution, to assist in performance tuning and debugging. 
The idxd perfmon support is implemented as part of the IDXD driver and interfaces with the Linux perf framework. It has several features in common with the existing uncore pmu support:

  - it does not support sampling
  - does not support per-thread counting

However it also has some unique features not present in the core and uncore support:

  - all general-purpose counters are identical, thus no event constraints
  - operation is always system-wide

While the core perf subsystem assumes that all counters are by default per-cpu, the uncore pmus are socket-scoped and use a cpu mask to restrict counting to one cpu from each socket. IDXD counters use a similar strategy but expand the scope even further; since IDXD counters are system-wide and can be read from any cpu, the IDXD perf driver picks a single cpu to do the work (with cpu hotplug notifiers to choose a different cpu if the chosen one is taken off-line).

More specifically, the perf userspace tool by default opens a counter for each cpu for an event. However, if it finds a cpumask file associated with the pmu under sysfs, as is the case with the uncore pmus, it will open counters only on the cpus specified by the cpumask. Since perfmon only needs to open a single counter per event for a given IDXD device, the perfmon driver will create a sysfs cpumask file for the device and insert the first cpu of the system into it. When a user uses perf to open an event, perf will open a single counter on the cpu specified by the cpu mask. This amounts to the default system-wide rather than per-cpu counting mentioned previously for perfmon pmu events.

In order to keep the cpu mask up-to-date, the driver implements cpu hotplug support for multiple devices, as IDXD usually enumerates and registers more than one idxd device.

The perfmon driver implements basic perfmon hardware capability discovery and configuration, and is initialized by the IDXD driver's probe function. During initialization, the driver retrieves the total number of supported performance counters, the pmu ID, and the device type from idxd device, and registers itself under the Linux perf framework.

The perf userspace tool can be used to monitor single or multiple events depending on the given configuration, as well as event groups, which are also supported by the perfmon driver. The user configures events using the perf tool command-line interface by specifying the event and corresponding event category, along with an optional set of filters that can be used to restrict counting to specific work queues, traffic classes, page and transfer sizes, and engines (See [1] for specifics).

With the configuration specified by the user, the perf tool issues a system call passing that information to the kernel, which uses it to initialize the specified event(s). The event(s) are opened and started, and following termination of the perf command, they're stopped. At that point, the perfmon driver will read the latest count for the event(s), calculate the difference between the latest counter values and previously tracked counter values, and display the final incremental count as the event count for the cycle. An overflow handler registered on the IDXD irq path is used to account for counter overflows, which are signaled by an overflow interrupt.

Below are a couple of examples of perf usage for monitoring DSA events.

The following monitors all events in the 'engine' category.
Because no filters are specified, this captures all engine events for the workload, which in this case is 19 iterations of the work generated by the kernel dmatest module.

Details describing the events can be found in Appendix D of [1], Performance Monitoring Events, but briefly they are:

  event 0x1:  total input data processed, in 32-byte units
  event 0x2:  total data written, in 32-byte units
  event 0x4:  number of work descriptors that read the source
  event 0x8:  number of work descriptors that write the destination
  event 0x10: number of work descriptors dispatched from batch descriptors
  event 0x20: number of work descriptors dispatched from work queues

  # perf stat -e dsa0/event=0x1,event_category=0x1/,
    dsa0/event=0x2,event_category=0x1/,
    dsa0/event=0x4,event_category=0x1/,
    dsa0/event=0x8,event_category=0x1/,
    dsa0/event=0x10,event_category=0x1/,
    dsa0/event=0x20,event_category=0x1/
    modprobe dmatest channel=dma0chan0 timeout=2000 iterations=19 run=1 wait=1

  Performance counter stats for 'system wide':

              5,332      dsa0/event=0x1,event_category=0x1/
              5,327      dsa0/event=0x2,event_category=0x1/
                 19      dsa0/event=0x4,event_category=0x1/
                 19      dsa0/event=0x8,event_category=0x1/
                  0      dsa0/event=0x10,event_category=0x1/
                 19      dsa0/event=0x20,event_category=0x1/

       21.977436186 seconds time elapsed

The command below illustrates filter usage with a simple example. It specifies that MEM_MOVE operations should be counted for the DSA device dsa0 (event 0x8 corresponds to the EV_MEM_MOVE event - Number of Memory Move Descriptors, which is part of event category 0x3 - Operations. The detailed category and event IDs are available in Appendix D, Performance Monitoring Events, of [1]).

In addition to the event and event category, a number of filters are also specified (the detailed filter values are available in Chapter 6.4 (Filter Support) of [1]), which will restrict counting to only those events that meet all of the filter criteria. In this case, the filters specify that only MEM_MOVE operations that are serviced by work queue wq0 and specifically engine number engine0 and traffic class tc0 having sizes between 0 and 4k and page size of between 0 and 1G result in a counter hit; anything else will be filtered out and not appear in the final count. Note that filters are optional - any filter not specified is assumed to be all ones and will pass anything.

  # perf stat -e dsa0/filter_wq=0x1,filter_tc=0x1,filter_sz=0x7,
    filter_eng=0x1,event=0x8,event_category=0x3/
    modprobe dmatest channel=dma0chan0 timeout=2000 iterations=19 run=1 wait=1

  Performance counter stats for 'system wide':

                 19      dsa0/filter_wq=0x1,filter_tc=0x1,filter_sz=0x7,
                         filter_eng=0x1,event=0x8,event_category=0x3/

       21.865914091 seconds time elapsed

The output above reflects that the unspecified workload resulted in the counting of 19 MEM_MOVE operation events that met the filter criteria.

[1]: https://software.intel.com/content/www/us/en/develop/download/intel-data-streaming-accelerator-preliminary-architecture-specification.html

[ Based on work originally by Jing Lin.
] Reviewed-by: Dave Jiang Reviewed-by: Kan Liang Signed-off-by: Tom Zanussi Link: https://lore.kernel.org/r/0c5080a7d541904c4ad42b848c76a1ce056ddac7.1619276133.git.zanussi@kernel.org Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../sysfs-bus-event_source-devices-dsa | 30 + drivers/dma/Kconfig | 12 + drivers/dma/idxd/Makefile | 2 + drivers/dma/idxd/idxd.h | 45 ++ drivers/dma/idxd/perfmon.c | 662 ++++++++++++++++++ drivers/dma/idxd/perfmon.h | 119 ++++ drivers/dma/idxd/registers.h | 108 +++ include/linux/cpuhotplug.h | 1 + 8 files changed, 979 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-bus-event_source-devices-dsa create mode 100644 drivers/dma/idxd/perfmon.c create mode 100644 drivers/dma/idxd/perfmon.h diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-dsa b/Documentation/ABI/testing/sysfs-bus-event_source-devices-dsa new file mode 100644 index 000000000000..3c7d132281b0 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-dsa @@ -0,0 +1,30 @@ +What: /sys/bus/event_source/devices/dsa*/format +Date: April 2021 +KernelVersion: 5.13 +Contact: Tom Zanussi +Description: Read-only. Attribute group to describe the magic bits + that go into perf_event_attr.config or + perf_event_attr.config1 for the IDXD DSA pmu. (See also + ABI/testing/sysfs-bus-event_source-devices-format). + + Each attribute in this group defines a bit range in + perf_event_attr.config or perf_event_attr.config1. + All supported attributes are listed below (See the + IDXD DSA Spec for possible attribute values):: + + event_category = "config:0-3" - event category + event = "config:4-31" - event ID + + filter_wq = "config1:0-31" - workqueue filter + filter_tc = "config1:32-39" - traffic class filter + filter_pgsz = "config1:40-43" - page size filter + filter_sz = "config1:44-51" - transfer size filter + filter_eng = "config1:52-59" - engine filter + +What: /sys/bus/event_source/devices/dsa*/cpumask +Date: April 2021 +KernelVersion: 5.13 +Contact: Tom Zanussi +Description: Read-only. This file always returns the cpu to which the + IDXD DSA pmu is bound for access to all dsa pmu + performance monitoring events. diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 9b1ce7b2d810..a8e1003dba8a 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -297,6 +297,18 @@ config INTEL_IDXD_SVM depends on PCI_PASID depends on PCI_IOV +config INTEL_IDXD_PERFMON + bool "Intel Data Accelerators performance monitor support" + depends on INTEL_IDXD + help + Enable performance monitor (pmu) support for the Intel(R) + data accelerators present in Intel Xeon CPU. With this + enabled, perf can be used to monitor the DSA (Intel Data + Streaming Accelerator) events described in the Intel DSA + spec. + + If unsure, say N. 
+ config INTEL_IOATDMA tristate "Intel I/OAT DMA support" depends on PCI && X86_64 diff --git a/drivers/dma/idxd/Makefile b/drivers/dma/idxd/Makefile index 8978b898d777..6d11558756f8 100644 --- a/drivers/dma/idxd/Makefile +++ b/drivers/dma/idxd/Makefile @@ -1,2 +1,4 @@ obj-$(CONFIG_INTEL_IDXD) += idxd.o idxd-y := init.o irq.o device.o sysfs.o submit.o dma.o cdev.o + +idxd-$(CONFIG_INTEL_IDXD_PERFMON) += perfmon.o diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 97c96ca6ab70..26482c7d4c3a 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include "registers.h" #define IDXD_DRIVER_VERSION "1.00" @@ -29,6 +31,7 @@ enum idxd_type { }; #define IDXD_NAME_SIZE 128 +#define IDXD_PMU_EVENT_MAX 64 struct idxd_device_driver { struct device_driver drv; @@ -61,6 +64,31 @@ struct idxd_group { int tc_b; }; +struct idxd_pmu { + struct idxd_device *idxd; + + struct perf_event *event_list[IDXD_PMU_EVENT_MAX]; + int n_events; + + DECLARE_BITMAP(used_mask, IDXD_PMU_EVENT_MAX); + + struct pmu pmu; + char name[IDXD_NAME_SIZE]; + int cpu; + + int n_counters; + int counter_width; + int n_event_categories; + + bool per_counter_caps_supported; + unsigned long supported_event_categories; + + unsigned long supported_filters; + int n_filters; + + struct hlist_node cpuhp_node; +}; + #define IDXD_MAX_PRIORITY 0xf enum idxd_wq_state { @@ -241,6 +269,8 @@ struct idxd_device { struct work_struct work; int *int_handles; + + struct idxd_pmu *idxd_pmu; }; /* IDXD software descriptor */ @@ -437,4 +467,19 @@ int idxd_cdev_get_major(struct idxd_device *idxd); int idxd_wq_add_cdev(struct idxd_wq *wq); void idxd_wq_del_cdev(struct idxd_wq *wq); +/* perfmon */ +#if IS_ENABLED(CONFIG_INTEL_IDXD_PERFMON) +int perfmon_pmu_init(struct idxd_device *idxd); +void perfmon_pmu_remove(struct idxd_device *idxd); +void perfmon_counter_overflow(struct idxd_device *idxd); +void perfmon_init(void); +void perfmon_exit(void); +#else +static inline int perfmon_pmu_init(struct idxd_device *idxd) { return 0; } +static inline void perfmon_pmu_remove(struct idxd_device *idxd) {} +static inline void perfmon_counter_overflow(struct idxd_device *idxd) {} +static inline void perfmon_init(void) {} +static inline void perfmon_exit(void) {} +#endif + #endif diff --git a/drivers/dma/idxd/perfmon.c b/drivers/dma/idxd/perfmon.c new file mode 100644 index 000000000000..d73004f47cf4 --- /dev/null +++ b/drivers/dma/idxd/perfmon.c @@ -0,0 +1,662 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2020 Intel Corporation. All rights rsvd. */ + +#include +#include +#include "idxd.h" +#include "perfmon.h" + +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, + char *buf); + +static cpumask_t perfmon_dsa_cpu_mask; +static bool cpuhp_set_up; +static enum cpuhp_state cpuhp_slot; + +/* + * perf userspace reads this attribute to determine which cpus to open + * counters on. It's connected to perfmon_dsa_cpu_mask, which is + * maintained by the cpu hotplug handlers. + */ +static DEVICE_ATTR_RO(cpumask); + +static struct attribute *perfmon_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group cpumask_attr_group = { + .attrs = perfmon_cpumask_attrs, +}; + +/* + * These attributes specify the bits in the config word that the perf + * syscall uses to pass the event ids and categories to perfmon. 
+ */ +DEFINE_PERFMON_FORMAT_ATTR(event_category, "config:0-3"); +DEFINE_PERFMON_FORMAT_ATTR(event, "config:4-31"); + +/* + * These attributes specify the bits in the config1 word that the perf + * syscall uses to pass filter data to perfmon. + */ +DEFINE_PERFMON_FORMAT_ATTR(filter_wq, "config1:0-31"); +DEFINE_PERFMON_FORMAT_ATTR(filter_tc, "config1:32-39"); +DEFINE_PERFMON_FORMAT_ATTR(filter_pgsz, "config1:40-43"); +DEFINE_PERFMON_FORMAT_ATTR(filter_sz, "config1:44-51"); +DEFINE_PERFMON_FORMAT_ATTR(filter_eng, "config1:52-59"); + +#define PERFMON_FILTERS_START 2 +#define PERFMON_FILTERS_MAX 5 + +static struct attribute *perfmon_format_attrs[] = { + &format_attr_idxd_event_category.attr, + &format_attr_idxd_event.attr, + &format_attr_idxd_filter_wq.attr, + &format_attr_idxd_filter_tc.attr, + &format_attr_idxd_filter_pgsz.attr, + &format_attr_idxd_filter_sz.attr, + &format_attr_idxd_filter_eng.attr, + NULL, +}; + +static struct attribute_group perfmon_format_attr_group = { + .name = "format", + .attrs = perfmon_format_attrs, +}; + +static const struct attribute_group *perfmon_attr_groups[] = { + &perfmon_format_attr_group, + &cpumask_attr_group, + NULL, +}; + +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &perfmon_dsa_cpu_mask); +} + +static bool is_idxd_event(struct idxd_pmu *idxd_pmu, struct perf_event *event) +{ + return &idxd_pmu->pmu == event->pmu; +} + +static int perfmon_collect_events(struct idxd_pmu *idxd_pmu, + struct perf_event *leader, + bool do_grp) +{ + struct perf_event *event; + int n, max_count; + + max_count = idxd_pmu->n_counters; + n = idxd_pmu->n_events; + + if (n >= max_count) + return -EINVAL; + + if (is_idxd_event(idxd_pmu, leader)) { + idxd_pmu->event_list[n] = leader; + idxd_pmu->event_list[n]->hw.idx = n; + n++; + } + + if (!do_grp) + return n; + + for_each_sibling_event(event, leader) { + if (!is_idxd_event(idxd_pmu, event) || + event->state <= PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -EINVAL; + + idxd_pmu->event_list[n] = event; + idxd_pmu->event_list[n]->hw.idx = n; + n++; + } + + return n; +} + +static void perfmon_assign_hw_event(struct idxd_pmu *idxd_pmu, + struct perf_event *event, int idx) +{ + struct idxd_device *idxd = idxd_pmu->idxd; + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = idx; + hwc->config_base = ioread64(CNTRCFG_REG(idxd, idx)); + hwc->event_base = ioread64(CNTRCFG_REG(idxd, idx)); +} + +static int perfmon_assign_event(struct idxd_pmu *idxd_pmu, + struct perf_event *event) +{ + int i; + + for (i = 0; i < IDXD_PMU_EVENT_MAX; i++) + if (!test_and_set_bit(i, idxd_pmu->used_mask)) + return i; + + return -EINVAL; +} + +/* + * Check whether there are enough counters to satisfy that all the + * events in the group can actually be scheduled at the same time. + * + * To do this, create a fake idxd_pmu object so the event collection + * and assignment functions can be used without affecting the internal + * state of the real idxd_pmu object. 
+ */ +static int perfmon_validate_group(struct idxd_pmu *pmu, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct idxd_pmu *fake_pmu; + int i, ret = 0, n, idx; + + fake_pmu = kzalloc(sizeof(*fake_pmu), GFP_KERNEL); + if (!fake_pmu) + return -ENOMEM; + + fake_pmu->pmu.name = pmu->pmu.name; + fake_pmu->n_counters = pmu->n_counters; + + n = perfmon_collect_events(fake_pmu, leader, true); + if (n < 0) { + ret = n; + goto out; + } + + fake_pmu->n_events = n; + n = perfmon_collect_events(fake_pmu, event, false); + if (n < 0) { + ret = n; + goto out; + } + + fake_pmu->n_events = n; + + for (i = 0; i < n; i++) { + event = fake_pmu->event_list[i]; + + idx = perfmon_assign_event(fake_pmu, event); + if (idx < 0) { + ret = idx; + goto out; + } + } +out: + kfree(fake_pmu); + + return ret; +} + +static int perfmon_pmu_event_init(struct perf_event *event) +{ + struct idxd_device *idxd; + int ret = 0; + + idxd = event_to_idxd(event); + event->hw.idx = -1; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* sampling not supported */ + if (event->attr.sample_period) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + if (event->pmu != &idxd->idxd_pmu->pmu) + return -EINVAL; + + event->hw.event_base = ioread64(PERFMON_TABLE_OFFSET(idxd)); + event->cpu = idxd->idxd_pmu->cpu; + event->hw.config = event->attr.config; + + if (event->group_leader != event) + /* non-group events have themselves as leader */ + ret = perfmon_validate_group(idxd->idxd_pmu, event); + + return ret; +} + +static inline u64 perfmon_pmu_read_counter(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct idxd_device *idxd; + int cntr = hwc->idx; + + idxd = event_to_idxd(event); + + return ioread64(CNTRDATA_REG(idxd, cntr)); +} + +static void perfmon_pmu_event_update(struct perf_event *event) +{ + struct idxd_device *idxd = event_to_idxd(event); + u64 prev_raw_count, new_raw_count, delta, p, n; + int shift = 64 - idxd->idxd_pmu->counter_width; + struct hw_perf_event *hwc = &event->hw; + + do { + prev_raw_count = local64_read(&hwc->prev_count); + new_raw_count = perfmon_pmu_read_counter(event); + } while (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count); + + n = (new_raw_count << shift); + p = (prev_raw_count << shift); + + delta = ((n - p) >> shift); + + local64_add(delta, &event->count); +} + +void perfmon_counter_overflow(struct idxd_device *idxd) +{ + int i, n_counters, max_loop = OVERFLOW_SIZE; + struct perf_event *event; + unsigned long ovfstatus; + + n_counters = min(idxd->idxd_pmu->n_counters, OVERFLOW_SIZE); + + ovfstatus = ioread32(OVFSTATUS_REG(idxd)); + + /* + * While updating overflowed counters, other counters behind + * them could overflow and be missed in a given pass. + * Normally this could happen at most n_counters times, but in + * theory a tiny counter width could result in continual + * overflows and endless looping. max_loop provides a + * failsafe in that highly unlikely case. + */ + while (ovfstatus && max_loop--) { + /* Figure out which counter(s) overflowed */ + for_each_set_bit(i, &ovfstatus, n_counters) { + unsigned long ovfstatus_clear = 0; + + /* Update event->count for overflowed counter */ + event = idxd->idxd_pmu->event_list[i]; + perfmon_pmu_event_update(event); + /* Writing 1 to OVFSTATUS bit clears it */ + set_bit(i, &ovfstatus_clear); + iowrite32(ovfstatus_clear, OVFSTATUS_REG(idxd)); + } + + ovfstatus = ioread32(OVFSTATUS_REG(idxd)); + } + + /* + * Should never happen. 
If so, it means a counter(s) looped + * around twice while this handler was running. + */ + WARN_ON_ONCE(ovfstatus); +} + +static inline void perfmon_reset_config(struct idxd_device *idxd) +{ + iowrite32(CONFIG_RESET, PERFRST_REG(idxd)); + iowrite32(0, OVFSTATUS_REG(idxd)); + iowrite32(0, PERFFRZ_REG(idxd)); +} + +static inline void perfmon_reset_counters(struct idxd_device *idxd) +{ + iowrite32(CNTR_RESET, PERFRST_REG(idxd)); +} + +static inline void perfmon_reset(struct idxd_device *idxd) +{ + perfmon_reset_config(idxd); + perfmon_reset_counters(idxd); +} + +static void perfmon_pmu_event_start(struct perf_event *event, int mode) +{ + u32 flt_wq, flt_tc, flt_pg_sz, flt_xfer_sz, flt_eng = 0; + u64 cntr_cfg, cntrdata, event_enc, event_cat = 0; + struct hw_perf_event *hwc = &event->hw; + union filter_cfg flt_cfg; + union event_cfg event_cfg; + struct idxd_device *idxd; + int cntr; + + idxd = event_to_idxd(event); + + event->hw.idx = hwc->idx; + cntr = hwc->idx; + + /* Obtain event category and event value from user space */ + event_cfg.val = event->attr.config; + flt_cfg.val = event->attr.config1; + event_cat = event_cfg.event_cat; + event_enc = event_cfg.event_enc; + + /* Obtain filter configuration from user space */ + flt_wq = flt_cfg.wq; + flt_tc = flt_cfg.tc; + flt_pg_sz = flt_cfg.pg_sz; + flt_xfer_sz = flt_cfg.xfer_sz; + flt_eng = flt_cfg.eng; + + if (flt_wq && test_bit(FLT_WQ, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_wq, FLTCFG_REG(idxd, cntr, FLT_WQ)); + if (flt_tc && test_bit(FLT_TC, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_tc, FLTCFG_REG(idxd, cntr, FLT_TC)); + if (flt_pg_sz && test_bit(FLT_PG_SZ, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_pg_sz, FLTCFG_REG(idxd, cntr, FLT_PG_SZ)); + if (flt_xfer_sz && test_bit(FLT_XFER_SZ, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_xfer_sz, FLTCFG_REG(idxd, cntr, FLT_XFER_SZ)); + if (flt_eng && test_bit(FLT_ENG, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_eng, FLTCFG_REG(idxd, cntr, FLT_ENG)); + + /* Read the start value */ + cntrdata = ioread64(CNTRDATA_REG(idxd, cntr)); + local64_set(&event->hw.prev_count, cntrdata); + + /* Set counter to event/category */ + cntr_cfg = event_cat << CNTRCFG_CATEGORY_SHIFT; + cntr_cfg |= event_enc << CNTRCFG_EVENT_SHIFT; + /* Set interrupt on overflow and counter enable bits */ + cntr_cfg |= (CNTRCFG_IRQ_OVERFLOW | CNTRCFG_ENABLE); + + iowrite64(cntr_cfg, CNTRCFG_REG(idxd, cntr)); +} + +static void perfmon_pmu_event_stop(struct perf_event *event, int mode) +{ + struct hw_perf_event *hwc = &event->hw; + struct idxd_device *idxd; + int i, cntr = hwc->idx; + u64 cntr_cfg; + + idxd = event_to_idxd(event); + + /* remove this event from event list */ + for (i = 0; i < idxd->idxd_pmu->n_events; i++) { + if (event != idxd->idxd_pmu->event_list[i]) + continue; + + for (++i; i < idxd->idxd_pmu->n_events; i++) + idxd->idxd_pmu->event_list[i - 1] = idxd->idxd_pmu->event_list[i]; + --idxd->idxd_pmu->n_events; + break; + } + + cntr_cfg = ioread64(CNTRCFG_REG(idxd, cntr)); + cntr_cfg &= ~CNTRCFG_ENABLE; + iowrite64(cntr_cfg, CNTRCFG_REG(idxd, cntr)); + + if (mode == PERF_EF_UPDATE) + perfmon_pmu_event_update(event); + + event->hw.idx = -1; + clear_bit(cntr, idxd->idxd_pmu->used_mask); +} + +static void perfmon_pmu_event_del(struct perf_event *event, int mode) +{ + perfmon_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int perfmon_pmu_event_add(struct perf_event *event, int flags) +{ + struct idxd_device *idxd = event_to_idxd(event); + struct idxd_pmu *idxd_pmu = 
idxd->idxd_pmu; + struct hw_perf_event *hwc = &event->hw; + int idx, n; + + n = perfmon_collect_events(idxd_pmu, event, false); + if (n < 0) + return n; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + idx = perfmon_assign_event(idxd_pmu, event); + if (idx < 0) + return idx; + + perfmon_assign_hw_event(idxd_pmu, event, idx); + + if (flags & PERF_EF_START) + perfmon_pmu_event_start(event, 0); + + idxd_pmu->n_events = n; + + return 0; +} + +static void enable_perfmon_pmu(struct idxd_device *idxd) +{ + iowrite32(COUNTER_UNFREEZE, PERFFRZ_REG(idxd)); +} + +static void disable_perfmon_pmu(struct idxd_device *idxd) +{ + iowrite32(COUNTER_FREEZE, PERFFRZ_REG(idxd)); +} + +static void perfmon_pmu_enable(struct pmu *pmu) +{ + struct idxd_device *idxd = pmu_to_idxd(pmu); + + enable_perfmon_pmu(idxd); +} + +static void perfmon_pmu_disable(struct pmu *pmu) +{ + struct idxd_device *idxd = pmu_to_idxd(pmu); + + disable_perfmon_pmu(idxd); +} + +static void skip_filter(int i) +{ + int j; + + for (j = i; j < PERFMON_FILTERS_MAX; j++) + perfmon_format_attrs[PERFMON_FILTERS_START + j] = + perfmon_format_attrs[PERFMON_FILTERS_START + j + 1]; +} + +static void idxd_pmu_init(struct idxd_pmu *idxd_pmu) +{ + int i; + + for (i = 0 ; i < PERFMON_FILTERS_MAX; i++) { + if (!test_bit(i, &idxd_pmu->supported_filters)) + skip_filter(i); + } + + idxd_pmu->pmu.name = idxd_pmu->name; + idxd_pmu->pmu.attr_groups = perfmon_attr_groups; + idxd_pmu->pmu.task_ctx_nr = perf_invalid_context; + idxd_pmu->pmu.event_init = perfmon_pmu_event_init; + idxd_pmu->pmu.pmu_enable = perfmon_pmu_enable, + idxd_pmu->pmu.pmu_disable = perfmon_pmu_disable, + idxd_pmu->pmu.add = perfmon_pmu_event_add; + idxd_pmu->pmu.del = perfmon_pmu_event_del; + idxd_pmu->pmu.start = perfmon_pmu_event_start; + idxd_pmu->pmu.stop = perfmon_pmu_event_stop; + idxd_pmu->pmu.read = perfmon_pmu_event_update; + idxd_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; + idxd_pmu->pmu.module = THIS_MODULE; +} + +void perfmon_pmu_remove(struct idxd_device *idxd) +{ + if (!idxd->idxd_pmu) + return; + + cpuhp_state_remove_instance(cpuhp_slot, &idxd->idxd_pmu->cpuhp_node); + perf_pmu_unregister(&idxd->idxd_pmu->pmu); + kfree(idxd->idxd_pmu); + idxd->idxd_pmu = NULL; +} + +static int perf_event_cpu_online(unsigned int cpu, struct hlist_node *node) +{ + struct idxd_pmu *idxd_pmu; + + idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node); + + /* select the first online CPU as the designated reader */ + if (cpumask_empty(&perfmon_dsa_cpu_mask)) { + cpumask_set_cpu(cpu, &perfmon_dsa_cpu_mask); + idxd_pmu->cpu = cpu; + } + + return 0; +} + +static int perf_event_cpu_offline(unsigned int cpu, struct hlist_node *node) +{ + struct idxd_pmu *idxd_pmu; + unsigned int target; + + idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node); + + if (!cpumask_test_and_clear_cpu(cpu, &perfmon_dsa_cpu_mask)) + return 0; + + target = cpumask_any_but(cpu_online_mask, cpu); + + /* migrate events if there is a valid target */ + if (target < nr_cpu_ids) + cpumask_set_cpu(target, &perfmon_dsa_cpu_mask); + else + target = -1; + + perf_pmu_migrate_context(&idxd_pmu->pmu, cpu, target); + + return 0; +} + +int perfmon_pmu_init(struct idxd_device *idxd) +{ + union idxd_perfcap perfcap; + struct idxd_pmu *idxd_pmu; + int rc = -ENODEV; + + /* + * perfmon module initialization failed, nothing to do + */ + if (!cpuhp_set_up) + return -ENODEV; + + /* + * If perfmon_offset or num_counters is 0, it means perfmon is + * not 
supported on this hardware. + */ + if (idxd->perfmon_offset == 0) + return -ENODEV; + + idxd_pmu = kzalloc(sizeof(*idxd_pmu), GFP_KERNEL); + if (!idxd_pmu) + return -ENOMEM; + + idxd_pmu->idxd = idxd; + idxd->idxd_pmu = idxd_pmu; + + if (idxd->data->type == IDXD_TYPE_DSA) { + rc = sprintf(idxd_pmu->name, "dsa%d", idxd->id); + if (rc < 0) + goto free; + } else if (idxd->data->type == IDXD_TYPE_IAX) { + rc = sprintf(idxd_pmu->name, "iax%d", idxd->id); + if (rc < 0) + goto free; + } else { + goto free; + } + + perfmon_reset(idxd); + + perfcap.bits = ioread64(PERFCAP_REG(idxd)); + + /* + * If total perf counter is 0, stop further registration. + * This is necessary in order to support driver running on + * guest which does not have pmon support. + */ + if (perfcap.num_perf_counter == 0) + goto free; + + /* A counter width of 0 means it can't count */ + if (perfcap.counter_width == 0) + goto free; + + /* Overflow interrupt and counter freeze support must be available */ + if (!perfcap.overflow_interrupt || !perfcap.counter_freeze) + goto free; + + /* Number of event categories cannot be 0 */ + if (perfcap.num_event_category == 0) + goto free; + + /* + * We don't support per-counter capabilities for now. + */ + if (perfcap.cap_per_counter) + goto free; + + idxd_pmu->n_event_categories = perfcap.num_event_category; + idxd_pmu->supported_event_categories = perfcap.global_event_category; + idxd_pmu->per_counter_caps_supported = perfcap.cap_per_counter; + + /* check filter capability. If 0, then filters are not supported */ + idxd_pmu->supported_filters = perfcap.filter; + if (perfcap.filter) + idxd_pmu->n_filters = hweight8(perfcap.filter); + + /* Store the total number of counters categories, and counter width */ + idxd_pmu->n_counters = perfcap.num_perf_counter; + idxd_pmu->counter_width = perfcap.counter_width; + + idxd_pmu_init(idxd_pmu); + + rc = perf_pmu_register(&idxd_pmu->pmu, idxd_pmu->name, -1); + if (rc) + goto free; + + rc = cpuhp_state_add_instance(cpuhp_slot, &idxd_pmu->cpuhp_node); + if (rc) { + perf_pmu_unregister(&idxd->idxd_pmu->pmu); + goto free; + } +out: + return rc; +free: + kfree(idxd_pmu); + idxd->idxd_pmu = NULL; + + goto out; +} + +void __init perfmon_init(void) +{ + int rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "driver/dma/idxd/perf:online", + perf_event_cpu_online, + perf_event_cpu_offline); + if (WARN_ON(rc < 0)) + return; + + cpuhp_slot = rc; + cpuhp_set_up = true; +} + +void __exit perfmon_exit(void) +{ + if (cpuhp_set_up) + cpuhp_remove_multi_state(cpuhp_slot); +} diff --git a/drivers/dma/idxd/perfmon.h b/drivers/dma/idxd/perfmon.h new file mode 100644 index 000000000000..9a081a1bc605 --- /dev/null +++ b/drivers/dma/idxd/perfmon.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2020 Intel Corporation. All rights rsvd. 
*/ + +#ifndef _PERFMON_H_ +#define _PERFMON_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "registers.h" + +static inline struct idxd_pmu *event_to_pmu(struct perf_event *event) +{ + struct idxd_pmu *idxd_pmu; + struct pmu *pmu; + + pmu = event->pmu; + idxd_pmu = container_of(pmu, struct idxd_pmu, pmu); + + return idxd_pmu; +} + +static inline struct idxd_device *event_to_idxd(struct perf_event *event) +{ + struct idxd_pmu *idxd_pmu; + struct pmu *pmu; + + pmu = event->pmu; + idxd_pmu = container_of(pmu, struct idxd_pmu, pmu); + + return idxd_pmu->idxd; +} + +static inline struct idxd_device *pmu_to_idxd(struct pmu *pmu) +{ + struct idxd_pmu *idxd_pmu; + + idxd_pmu = container_of(pmu, struct idxd_pmu, pmu); + + return idxd_pmu->idxd; +} + +enum dsa_perf_events { + DSA_PERF_EVENT_WQ = 0, + DSA_PERF_EVENT_ENGINE, + DSA_PERF_EVENT_ADDR_TRANS, + DSA_PERF_EVENT_OP, + DSA_PERF_EVENT_COMPL, + DSA_PERF_EVENT_MAX, +}; + +enum filter_enc { + FLT_WQ = 0, + FLT_TC, + FLT_PG_SZ, + FLT_XFER_SZ, + FLT_ENG, + FLT_MAX, +}; + +#define CONFIG_RESET 0x0000000000000001 +#define CNTR_RESET 0x0000000000000002 +#define CNTR_ENABLE 0x0000000000000001 +#define INTR_OVFL 0x0000000000000002 + +#define COUNTER_FREEZE 0x00000000FFFFFFFF +#define COUNTER_UNFREEZE 0x0000000000000000 +#define OVERFLOW_SIZE 32 + +#define CNTRCFG_ENABLE BIT(0) +#define CNTRCFG_IRQ_OVERFLOW BIT(1) +#define CNTRCFG_CATEGORY_SHIFT 8 +#define CNTRCFG_EVENT_SHIFT 32 + +#define PERFMON_TABLE_OFFSET(_idxd) \ +({ \ + typeof(_idxd) __idxd = (_idxd); \ + ((__idxd)->reg_base + (__idxd)->perfmon_offset); \ +}) +#define PERFMON_REG_OFFSET(idxd, offset) \ + (PERFMON_TABLE_OFFSET(idxd) + (offset)) + +#define PERFCAP_REG(idxd) (PERFMON_REG_OFFSET(idxd, IDXD_PERFCAP_OFFSET)) +#define PERFRST_REG(idxd) (PERFMON_REG_OFFSET(idxd, IDXD_PERFRST_OFFSET)) +#define OVFSTATUS_REG(idxd) (PERFMON_REG_OFFSET(idxd, IDXD_OVFSTATUS_OFFSET)) +#define PERFFRZ_REG(idxd) (PERFMON_REG_OFFSET(idxd, IDXD_PERFFRZ_OFFSET)) + +#define FLTCFG_REG(idxd, cntr, flt) \ + (PERFMON_REG_OFFSET(idxd, IDXD_FLTCFG_OFFSET) + ((cntr) * 32) + ((flt) * 4)) + +#define CNTRCFG_REG(idxd, cntr) \ + (PERFMON_REG_OFFSET(idxd, IDXD_CNTRCFG_OFFSET) + ((cntr) * 8)) +#define CNTRDATA_REG(idxd, cntr) \ + (PERFMON_REG_OFFSET(idxd, IDXD_CNTRDATA_OFFSET) + ((cntr) * 8)) +#define CNTRCAP_REG(idxd, cntr) \ + (PERFMON_REG_OFFSET(idxd, IDXD_CNTRCAP_OFFSET) + ((cntr) * 8)) + +#define EVNTCAP_REG(idxd, category) \ + (PERFMON_REG_OFFSET(idxd, IDXD_EVNTCAP_OFFSET) + ((category) * 8)) + +#define DEFINE_PERFMON_FORMAT_ATTR(_name, _format) \ +static ssize_t __perfmon_idxd_##_name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_idxd_##_name = \ + __ATTR(_name, 0444, __perfmon_idxd_##_name##_show, NULL) + +#endif diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 6c11375cc56a..c970c3f025f0 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -386,4 +386,112 @@ union wqcfg { #define GRPENGCFG_OFFSET(idxd_dev, n) ((idxd_dev)->grpcfg_offset + (n) * GRPCFG_SIZE + 32) #define GRPFLGCFG_OFFSET(idxd_dev, n) ((idxd_dev)->grpcfg_offset + (n) * GRPCFG_SIZE + 40) +/* Following is performance monitor registers */ +#define IDXD_PERFCAP_OFFSET 0x0 +union idxd_perfcap { + struct { + u64 num_perf_counter:6; + u64 rsvd1:2; + u64 counter_width:8; + 
u64 num_event_category:4; + u64 global_event_category:16; + u64 filter:8; + u64 rsvd2:8; + u64 cap_per_counter:1; + u64 writeable_counter:1; + u64 counter_freeze:1; + u64 overflow_interrupt:1; + u64 rsvd3:8; + }; + u64 bits; +} __packed; + +#define IDXD_EVNTCAP_OFFSET 0x80 +union idxd_evntcap { + struct { + u64 events:28; + u64 rsvd:36; + }; + u64 bits; +} __packed; + +struct idxd_event { + union { + struct { + u32 event_category:4; + u32 events:28; + }; + u32 val; + }; +} __packed; + +#define IDXD_CNTRCAP_OFFSET 0x800 +struct idxd_cntrcap { + union { + struct { + u32 counter_width:8; + u32 rsvd:20; + u32 num_events:4; + }; + u32 val; + }; + struct idxd_event events[]; +} __packed; + +#define IDXD_PERFRST_OFFSET 0x10 +union idxd_perfrst { + struct { + u32 perfrst_config:1; + u32 perfrst_counter:1; + u32 rsvd:30; + }; + u32 val; +} __packed; + +#define IDXD_OVFSTATUS_OFFSET 0x30 +#define IDXD_PERFFRZ_OFFSET 0x20 +#define IDXD_CNTRCFG_OFFSET 0x100 +union idxd_cntrcfg { + struct { + u64 enable:1; + u64 interrupt_ovf:1; + u64 global_freeze_ovf:1; + u64 rsvd1:5; + u64 event_category:4; + u64 rsvd2:20; + u64 events:28; + u64 rsvd3:4; + }; + u64 val; +} __packed; + +#define IDXD_FLTCFG_OFFSET 0x300 + +#define IDXD_CNTRDATA_OFFSET 0x200 +union idxd_cntrdata { + struct { + u64 event_count_value; + }; + u64 val; +} __packed; + +union event_cfg { + struct { + u64 event_cat:4; + u64 event_enc:28; + }; + u64 val; +} __packed; + +union filter_cfg { + struct { + u64 wq:32; + u64 tc:8; + u64 pg_sz:4; + u64 xfer_sz:8; + u64 eng:8; + }; + u64 val; +} __packed; + #endif diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 436d12ef291d..e45b04ba04af 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -157,6 +157,7 @@ enum cpuhp_state { CPUHP_AP_PERF_X86_RAPL_ONLINE, CPUHP_AP_PERF_X86_CQM_ONLINE, CPUHP_AP_PERF_X86_CSTATE_ONLINE, + CPUHP_AP_PERF_X86_IDXD_ONLINE, CPUHP_AP_PERF_S390_CF_ONLINE, CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE, -- Gitee From 0ed8213172b652a25c5cd430929eff7c273d47b4 Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Fri, 16 Sep 2022 10:14:05 +0800 Subject: [PATCH 0619/2161] Revert "add signal.h to fix signal_pending_state compile error" commit opencloudos. This reverts commit 3f01edd36a81c7dabe72e96e9735c9b3b0b2b324. Signed-off-by: Xinghui Li --- drivers/dma/idxd/submit.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 47b8db965100..19afb62abaff 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -7,7 +7,6 @@ #include #include "idxd.h" #include "registers.h" -#include static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu) { -- Gitee From 0325da507e4c1818102c3291ad5ebe3a25a1304b Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 24 Apr 2021 10:04:16 -0500 Subject: [PATCH 0620/2161] dmaengine: idxd: Enable IDXD performance monitor support commit 0bde4444ec44b8e64bbd4af72fcaef58bcdbd4ce upstream. Add the code needed in the main IDXD driver to interface with the IDXD perfmon implementation. [ Based on work originally by Jing Lin. 
] Reviewed-by: Dave Jiang Reviewed-by: Kan Liang Signed-off-by: Tom Zanussi Link: https://lore.kernel.org/r/a5564a5583911565d31c2af9234218c5166c4b2c.1619276133.git.zanussi@kernel.org Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 9 +++++++++ drivers/dma/idxd/irq.c | 5 +---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 8003f8a25fff..2a926bef87f2 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -21,6 +21,7 @@ #include "../dmaengine.h" #include "registers.h" #include "idxd.h" +#include "perfmon.h" MODULE_VERSION(IDXD_DRIVER_VERSION); MODULE_LICENSE("GPL v2"); @@ -541,6 +542,10 @@ static int idxd_probe(struct idxd_device *idxd) idxd->major = idxd_cdev_get_major(idxd); + rc = perfmon_pmu_init(idxd); + if (rc < 0) + dev_warn(dev, "Failed to initialize perfmon. No PMU support: %d\n", rc); + dev_dbg(dev, "IDXD device %d probed successfully\n", idxd->id); return 0; @@ -720,6 +725,7 @@ static void idxd_remove(struct pci_dev *pdev) if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); idxd_unregister_devices(idxd); + perfmon_pmu_remove(idxd); iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA); } @@ -749,6 +755,8 @@ static int __init idxd_init_module(void) else support_enqcmd = true; + perfmon_init(); + err = idxd_register_bus_type(); if (err < 0) return err; @@ -782,5 +790,6 @@ static void __exit idxd_exit_module(void) pci_unregister_driver(&idxd_pci_driver); idxd_cdev_remove(); idxd_unregister_bus_type(); + perfmon_exit(); } module_exit(idxd_exit_module); diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index afee571e0194..ae68e1e5487a 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -156,11 +156,8 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) } if (cause & IDXD_INTC_PERFMON_OVFL) { - /* - * Driver does not utilize perfmon counter overflow interrupt - * yet. - */ val |= IDXD_INTC_PERFMON_OVFL; + perfmon_counter_overflow(idxd); } val ^= cause; -- Gitee From 61b6dacbd7d592e34981d226bb6b780ab36766ec Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 26 Apr 2021 16:09:19 -0700 Subject: [PATCH 0621/2161] dmaengine: idxd: add engine 'struct device' missing bus type assignment commit 1c4841ccbd2b185587010d6178aac11953f61d4c upstream. engine 'struct device' setup is missing assigning the bus type. Add it to dsa_bus_type. 
Fixes: 75b911309060 ("dmaengine: idxd: fix engine conf_dev lifetime") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161947841562.984844.17505646725993659651.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 2a926bef87f2..ec7305f86bf7 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -242,6 +242,7 @@ static int idxd_setup_engines(struct idxd_device *idxd) engine->idxd = idxd; device_initialize(&engine->conf_dev); engine->conf_dev.parent = &idxd->conf_dev; + engine->conf_dev.bus = &dsa_bus_type; engine->conf_dev.type = &idxd_engine_device_type; rc = dev_set_name(&engine->conf_dev, "engine%d.%d", idxd->id, engine->id); if (rc < 0) { -- Gitee From 20395b95d35697d881ed480db40d1730a56ee5d3 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 26 Apr 2021 16:32:24 -0700 Subject: [PATCH 0622/2161] dmaengine: idxd: add missing dsa driver unregister commit 077cdb355b3d8ee0f258856962e6dac06e744401 upstream. The idxd_unregister_driver() has never been called for the idxd driver upon removal. Add fix to call unregister driver on module removal. Fixes: c52ca478233c ("dmaengine: idxd: add configuration component of driver") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161947994449.1053102.13189942817915448216.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index ec7305f86bf7..6201f52f13f5 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -788,6 +788,7 @@ module_init(idxd_init_module); static void __exit idxd_exit_module(void) { + idxd_unregister_driver(); pci_unregister_driver(&idxd_pci_driver); idxd_cdev_remove(); idxd_unregister_bus_type(); -- Gitee From 31217a53cac24fb7d94ffcf93ba8c74ddfeb03e0 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 6 May 2021 19:00:47 +0800 Subject: [PATCH 0623/2161] dmaengine: idxd: Remove redundant variable cdev_ctx commit 58cb138e20296dffe3b2be2beb64e5aa22846b84 upstream. Variable cdev_ctx is set to '&ictx[wq->idxd->data->type]' but this value is not used, hence it is a redundant assignment and can be removed. Clean up the following clang-analyzer warning: drivers/dma/idxd/cdev.c:300:2: warning: Value stored to 'cdev_ctx' is never read [clang-analyzer-deadcode.DeadStores]. 
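[ Note: the class of warning fixed here can be reduced to a short,
  purely illustrative C sketch - the function and variable names are
  hypothetical and are not part of the idxd driver:

      static int example_len(int type)
      {
              int ctx;

              ctx = type * 2;   /* value stored to 'ctx' is never read */
              return type + 1;  /* 'ctx' is dead, so the store above can go */
      }

  Deleting such a dead store, as the patch below does for 'cdev_ctx',
  changes no behavior. ]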
Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Acked-by: Dave Jiang Link: https://lore.kernel.org/r/1620298847-33127-1-git-send-email-jiapeng.chong@linux.alibaba.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/cdev.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 302cba5ff779..6c72089ca31a 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -295,9 +295,7 @@ int idxd_wq_add_cdev(struct idxd_wq *wq) void idxd_wq_del_cdev(struct idxd_wq *wq) { struct idxd_cdev *idxd_cdev; - struct idxd_cdev_context *cdev_ctx; - cdev_ctx = &ictx[wq->idxd->data->type]; idxd_cdev = wq->idxd_cdev; wq->idxd_cdev = NULL; cdev_device_del(&idxd_cdev->cdev, &idxd_cdev->dev); -- Gitee From f46937d607d39a497825ba2d73a3a25850ac65b6 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Sun, 9 May 2021 17:38:25 -0700 Subject: [PATCH 0624/2161] dmaengine: idxd: remove devm allocation for idxd->int_handles commit 33f9f3c33e9336e5f49501c9632584c3d1f4f3a5 upstream. Allocation of idxd->int_handles was merged incorrectly for the 5.13 merge window. The devm_kcalloc should've been regular kcalloc due to devm_* removal series for the driver. Fixes: eb15e7154fbf ("dmaengine: idxd: add interrupt handle request and release support") Reported-by: Dan Carpenter Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162060710518.130816.11349798049329202863.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 6201f52f13f5..c9df4b6ef78b 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -312,7 +312,8 @@ static int idxd_setup_internals(struct idxd_device *idxd) init_waitqueue_head(&idxd->cmd_waitq); if (idxd->hw.cmd_cap & BIT(IDXD_CMD_REQUEST_INT_HANDLE)) { - idxd->int_handles = devm_kcalloc(dev, idxd->max_wqs, sizeof(int), GFP_KERNEL); + idxd->int_handles = kcalloc_node(idxd->max_wqs, sizeof(int), GFP_KERNEL, + dev_to_node(dev)); if (!idxd->int_handles) return -ENOMEM; } -- Gitee From c3ea31fce6e4031e5eea0b70f1d3c6a7f722ea63 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 25 May 2021 12:23:37 -0700 Subject: [PATCH 0625/2161] dmaengine: idxd: Add missing cleanup for early error out in probe call commit ddf742d4f3f12a6ba1b8e6ecbbf3ae736942f970 upstream. The probe call stack is missing some cleanup when things fail in the middle. Add the appropriate cleanup routines to make sure we exit gracefully. 
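[ Note: a minimal sketch of the goto-based unwind convention this fix
  restores; every helper name below is a hypothetical placeholder, not
  an idxd function:

      static int setup_a(void) { return 0; }   /* e.g. internals setup */
      static void undo_a(void) { }
      static int setup_b(void) { return 0; }   /* e.g. interrupt setup */

      static int example_probe(void)
      {
              int rc;

              rc = setup_a();
              if (rc)
                      return rc;      /* nothing to unwind yet */

              rc = setup_b();
              if (rc)
                      goto err_b;     /* unwind everything done so far */

              return 0;

      err_b:
              undo_a();
              return rc;
      }

  Each later failure point jumps to a label that unwinds all earlier
  steps, which is what the idxd_cleanup_*() helpers added below provide
  for the real probe path. ]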
Fixes: a39c7cd0438e ("dmaengine: idxd: removal of pcim managed mmio mapping") Reported-by: Nikhil Rao Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162197061707.392656.15760573520817310791.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 61 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 3 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index c9df4b6ef78b..0064c357513e 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -168,6 +168,32 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) return rc; } +static void idxd_cleanup_interrupts(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + struct idxd_irq_entry *irq_entry; + int i, msixcnt; + + msixcnt = pci_msix_vec_count(pdev); + if (msixcnt <= 0) + return; + + irq_entry = &idxd->irq_entries[0]; + free_irq(irq_entry->vector, irq_entry); + + for (i = 1; i < msixcnt; i++) { + + irq_entry = &idxd->irq_entries[i]; + if (idxd->hw.cmd_cap & BIT(IDXD_CMD_RELEASE_INT_HANDLE)) + idxd_device_release_int_handle(idxd, idxd->int_handles[i], + IDXD_IRQ_MSIX); + free_irq(irq_entry->vector, irq_entry); + } + + idxd_mask_error_interrupts(idxd); + pci_free_irq_vectors(pdev); +} + static int idxd_setup_wqs(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; @@ -304,6 +330,19 @@ static int idxd_setup_groups(struct idxd_device *idxd) return rc; } +static void idxd_cleanup_internals(struct idxd_device *idxd) +{ + int i; + + for (i = 0; i < idxd->max_groups; i++) + put_device(&idxd->groups[i]->conf_dev); + for (i = 0; i < idxd->max_engines; i++) + put_device(&idxd->engines[i]->conf_dev); + for (i = 0; i < idxd->max_wqs; i++) + put_device(&idxd->wqs[i]->conf_dev); + destroy_workqueue(idxd->wq); +} + static int idxd_setup_internals(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; @@ -533,12 +572,12 @@ static int idxd_probe(struct idxd_device *idxd) dev_dbg(dev, "Loading RO device config\n"); rc = idxd_device_load_config(idxd); if (rc < 0) - goto err; + goto err_config; } rc = idxd_setup_interrupts(idxd); if (rc) - goto err; + goto err_config; dev_dbg(dev, "IDXD interrupt setup complete.\n"); @@ -551,6 +590,8 @@ static int idxd_probe(struct idxd_device *idxd) dev_dbg(dev, "IDXD device %d probed successfully\n", idxd->id); return 0; + err_config: + idxd_cleanup_internals(idxd); err: if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); @@ -558,6 +599,18 @@ static int idxd_probe(struct idxd_device *idxd) return rc; } +static void idxd_cleanup(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + + perfmon_pmu_remove(idxd); + idxd_cleanup_interrupts(idxd); + idxd_cleanup_internals(idxd); + if (device_pasid_enabled(idxd)) + idxd_disable_system_pasid(idxd); + iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA); +} + static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct device *dev = &pdev->dev; @@ -610,7 +663,7 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) rc = idxd_register_devices(idxd); if (rc) { dev_err(dev, "IDXD sysfs setup failed\n"); - goto err; + goto err_dev_register; } idxd->state = IDXD_DEV_CONF_READY; @@ -620,6 +673,8 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return 0; + err_dev_register: + idxd_cleanup(idxd); err: pci_iounmap(pdev, idxd->reg_base); err_iomap: -- Gitee From 
6a1f45eb4fd4fcce4682f3817102c6fd8f78f9f3 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 2 Jun 2021 18:07:26 +0800 Subject: [PATCH 0626/2161] dmaengine: idxd: Fix missing error code in idxd_cdev_open() commit 99b18e88a1cf737ae924123d63b46d9a3d17b1af upstream. The error code is missing in this code scenario, add the error code '-EINVAL' to the return value 'rc'. Eliminate the follow smatch warning: drivers/dma/idxd/cdev.c:113 idxd_cdev_open() warn: missing error code 'rc'. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Acked-by: Dave Jiang Link: https://lore.kernel.org/r/1622628446-87909-1-git-send-email-jiapeng.chong@linux.alibaba.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/cdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 6c72089ca31a..e9def577c697 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -110,6 +110,7 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) pasid = iommu_sva_get_pasid(sva); if (pasid == IOMMU_PASID_INVALID) { iommu_sva_unbind_device(sva); + rc = -EINVAL; goto failed; } -- Gitee From 75f1b25c158564f7798183adcc306b72cd2caf86 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 2 Jun 2021 12:07:52 +0200 Subject: [PATCH 0627/2161] dmaengine: idxd: Use cpu_feature_enabled() commit 74b2fc882d380d8fafc2a26f01d401c2a7beeadb upstream. When testing x86 feature bits, use cpu_feature_enabled() so that build-disabled features can remain off, regardless of what CPUID says. Fixes: 8e50d392652f ("dmaengine: idxd: Add shared workqueue support") Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Acked-By: Vinod Koul Cc: Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 0064c357513e..c8ae41d36040 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -802,12 +802,12 @@ static int __init idxd_init_module(void) * If the CPU does not support MOVDIR64B or ENQCMDS, there's no point in * enumerating the device. We can not utilize it. */ - if (!boot_cpu_has(X86_FEATURE_MOVDIR64B)) { + if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) { pr_warn("idxd driver failed to load without MOVDIR64B.\n"); return -ENODEV; } - if (!boot_cpu_has(X86_FEATURE_ENQCMD)) + if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) pr_warn("Platform does not have ENQCMD(S) support.\n"); else support_enqcmd = true; -- Gitee From c83ebcb8b8af012a903244e9efaf09f72f5e96d9 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 8 Jul 2021 07:08:26 +0200 Subject: [PATCH 0628/2161] dmaengine: idxd: Simplify code and axe the use of a deprecated API commit 53b50458110d829c8fc45e7803a335a515698fd8 upstream. The wrappers in include/linux/pci-dma-compat.h should go away. Replace 'pci_set_dma_mask/pci_set_consistent_dma_mask' by an equivalent and less verbose 'dma_set_mask_and_coherent()' call. Even if the code may look different, it should have exactly the same run-time behavior. If pci_set_dma_mask(64) fails and pci_set_dma_mask(32) succeeds, then pci_set_consistent_dma_mask(64) will also fail. 
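[ Note: a short, self-contained sketch of the consolidated fallback
  pattern described above, assuming a generic PCI probe path; the
  function name is illustrative, while dma_set_mask_and_coherent()
  and DMA_BIT_MASK() are the real APIs from <linux/dma-mapping.h>:

      #include <linux/dma-mapping.h>
      #include <linux/pci.h>

      static int example_set_dma_masks(struct pci_dev *pdev)
      {
              int rc;

              /* Prefer full 64-bit streaming and coherent masks. */
              rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
              if (rc)
                      /* Fall back to 32 bits if 64-bit DMA is unavailable. */
                      rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));

              return rc;
      }

  This is the two-call sequence the diff below substitutes for the four
  deprecated pci_set_dma_mask()/pci_set_consistent_dma_mask() calls. ]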
Signed-off-by: Christophe JAILLET Acked-by: Dave Jiang Link: https://lore.kernel.org/r/70c8a3bc67e41c5fefb526ecd64c5174c1e2dc76.1625720835.git.christophe.jaillet@wanadoo.fr Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index c8ae41d36040..de300ba38b14 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -637,15 +637,9 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) } dev_dbg(dev, "Set DMA masks\n"); - rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); if (rc) - rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); - if (rc) - goto err; - - rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - if (rc) - rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); if (rc) goto err; -- Gitee From 2fc4c463e5d3fd550a5980fcdee9e1d5720a701c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 4 Jun 2021 17:06:21 -0700 Subject: [PATCH 0629/2161] dmanegine: idxd: cleanup all device related bits after disabling device commit 0dcfe41e9a4ca759ccc87a48e3bb9cc3b08ff1e8 upstream. The previous state cleanup patch only performed wq state cleanups. This does not go far enough as when device is disabled or reset, the state for groups and engines must also be cleaned up. Add additional state cleanup beyond wq cleanup. Tie those cleanups directly to device disable and reset, and wq disable and reset. Fixes: da32b28c95a7 ("dmaengine: idxd: cleanup workqueue config after disabling") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162285154108.2096632.5572805472362321307.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 88 +++++++++++++++++++++++++++++---------- drivers/dma/idxd/idxd.h | 6 +-- drivers/dma/idxd/irq.c | 4 +- drivers/dma/idxd/sysfs.c | 22 +++------- 4 files changed, 74 insertions(+), 46 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 420b93fe5feb..928c2c8817f0 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -15,6 +15,8 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, u32 *status); +static void idxd_device_wqs_clear_state(struct idxd_device *idxd); +static void idxd_wq_disable_cleanup(struct idxd_wq *wq); /* Interrupt control bits */ void idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id) @@ -234,7 +236,7 @@ int idxd_wq_enable(struct idxd_wq *wq) return 0; } -int idxd_wq_disable(struct idxd_wq *wq) +int idxd_wq_disable(struct idxd_wq *wq, bool reset_config) { struct idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; @@ -255,6 +257,8 @@ int idxd_wq_disable(struct idxd_wq *wq) return -ENXIO; } + if (reset_config) + idxd_wq_disable_cleanup(wq); wq->state = IDXD_WQ_DISABLED; dev_dbg(dev, "WQ %d disabled\n", wq->id); return 0; @@ -289,6 +293,7 @@ void idxd_wq_reset(struct idxd_wq *wq) operand = BIT(wq->id % 16) | ((wq->id / 16) << 16); idxd_cmd_exec(idxd, IDXD_CMD_RESET_WQ, operand, NULL); + idxd_wq_disable_cleanup(wq); wq->state = IDXD_WQ_DISABLED; } @@ -337,7 +342,7 @@ int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid) unsigned int offset; unsigned long flags; - rc = idxd_wq_disable(wq); + rc = idxd_wq_disable(wq, false); if (rc 
< 0) return rc; @@ -364,7 +369,7 @@ int idxd_wq_disable_pasid(struct idxd_wq *wq) unsigned int offset; unsigned long flags; - rc = idxd_wq_disable(wq); + rc = idxd_wq_disable(wq, false); if (rc < 0) return rc; @@ -383,11 +388,11 @@ int idxd_wq_disable_pasid(struct idxd_wq *wq) return 0; } -void idxd_wq_disable_cleanup(struct idxd_wq *wq) +static void idxd_wq_disable_cleanup(struct idxd_wq *wq) { struct idxd_device *idxd = wq->idxd; - lockdep_assert_held(&idxd->dev_lock); + lockdep_assert_held(&wq->wq_lock); memset(wq->wqcfg, 0, idxd->wqcfg_size); wq->type = IDXD_WQT_NONE; wq->size = 0; @@ -548,22 +553,6 @@ int idxd_device_enable(struct idxd_device *idxd) return 0; } -void idxd_device_wqs_clear_state(struct idxd_device *idxd) -{ - int i; - - lockdep_assert_held(&idxd->dev_lock); - - for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = idxd->wqs[i]; - - if (wq->state == IDXD_WQ_ENABLED) { - idxd_wq_disable_cleanup(wq); - wq->state = IDXD_WQ_DISABLED; - } - } -} - int idxd_device_disable(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; @@ -585,7 +574,7 @@ int idxd_device_disable(struct idxd_device *idxd) } spin_lock_irqsave(&idxd->dev_lock, flags); - idxd_device_wqs_clear_state(idxd); + idxd_device_clear_state(idxd); idxd->state = IDXD_DEV_CONF_READY; spin_unlock_irqrestore(&idxd->dev_lock, flags); return 0; @@ -597,7 +586,7 @@ void idxd_device_reset(struct idxd_device *idxd) idxd_cmd_exec(idxd, IDXD_CMD_RESET_DEVICE, 0, NULL); spin_lock_irqsave(&idxd->dev_lock, flags); - idxd_device_wqs_clear_state(idxd); + idxd_device_clear_state(idxd); idxd->state = IDXD_DEV_CONF_READY; spin_unlock_irqrestore(&idxd->dev_lock, flags); } @@ -685,6 +674,59 @@ int idxd_device_release_int_handle(struct idxd_device *idxd, int handle, } /* Device configuration bits */ +static void idxd_engines_clear_state(struct idxd_device *idxd) +{ + struct idxd_engine *engine; + int i; + + lockdep_assert_held(&idxd->dev_lock); + for (i = 0; i < idxd->max_engines; i++) { + engine = idxd->engines[i]; + engine->group = NULL; + } +} + +static void idxd_groups_clear_state(struct idxd_device *idxd) +{ + struct idxd_group *group; + int i; + + lockdep_assert_held(&idxd->dev_lock); + for (i = 0; i < idxd->max_groups; i++) { + group = idxd->groups[i]; + memset(&group->grpcfg, 0, sizeof(group->grpcfg)); + group->num_engines = 0; + group->num_wqs = 0; + group->use_token_limit = false; + group->tokens_allowed = 0; + group->tokens_reserved = 0; + group->tc_a = -1; + group->tc_b = -1; + } +} + +static void idxd_device_wqs_clear_state(struct idxd_device *idxd) +{ + int i; + + lockdep_assert_held(&idxd->dev_lock); + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + if (wq->state == IDXD_WQ_ENABLED) { + idxd_wq_disable_cleanup(wq); + wq->state = IDXD_WQ_DISABLED; + } + } +} + +void idxd_device_clear_state(struct idxd_device *idxd) +{ + idxd_groups_clear_state(idxd); + idxd_engines_clear_state(idxd); + idxd_device_wqs_clear_state(idxd); +} + void idxd_msix_perm_setup(struct idxd_device *idxd) { union msix_perm mperm; diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 26482c7d4c3a..1f0991dec679 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -420,9 +420,8 @@ int idxd_device_init_reset(struct idxd_device *idxd); int idxd_device_enable(struct idxd_device *idxd); int idxd_device_disable(struct idxd_device *idxd); void idxd_device_reset(struct idxd_device *idxd); -void idxd_device_cleanup(struct idxd_device *idxd); +void idxd_device_clear_state(struct 
idxd_device *idxd); int idxd_device_config(struct idxd_device *idxd); -void idxd_device_wqs_clear_state(struct idxd_device *idxd); void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid); int idxd_device_load_config(struct idxd_device *idxd); int idxd_device_request_int_handle(struct idxd_device *idxd, int idx, int *handle, @@ -435,12 +434,11 @@ void idxd_wqs_unmap_portal(struct idxd_device *idxd); int idxd_wq_alloc_resources(struct idxd_wq *wq); void idxd_wq_free_resources(struct idxd_wq *wq); int idxd_wq_enable(struct idxd_wq *wq); -int idxd_wq_disable(struct idxd_wq *wq); +int idxd_wq_disable(struct idxd_wq *wq, bool reset_config); void idxd_wq_drain(struct idxd_wq *wq); void idxd_wq_reset(struct idxd_wq *wq); int idxd_wq_map_portal(struct idxd_wq *wq); void idxd_wq_unmap_portal(struct idxd_wq *wq); -void idxd_wq_disable_cleanup(struct idxd_wq *wq); int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid); int idxd_wq_disable_pasid(struct idxd_wq *wq); void idxd_wq_quiesce(struct idxd_wq *wq); diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index ae68e1e5487a..7a2cf0512501 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -59,7 +59,7 @@ static void idxd_device_reinit(struct work_struct *work) return; out: - idxd_device_wqs_clear_state(idxd); + idxd_device_clear_state(idxd); } static void idxd_device_fault_work(struct work_struct *work) @@ -192,7 +192,7 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) spin_lock_bh(&idxd->dev_lock); idxd_wqs_quiesce(idxd); idxd_wqs_unmap_portal(idxd); - idxd_device_wqs_clear_state(idxd); + idxd_device_clear_state(idxd); dev_err(&idxd->pdev->dev, "idxd halted, need %s.\n", gensts.reset_type == IDXD_DEVICE_RESET_FLR ? diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 0460d58e3941..71cd73fefec6 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -129,7 +129,7 @@ static int enable_wq(struct idxd_wq *wq) rc = idxd_wq_map_portal(wq); if (rc < 0) { dev_warn(dev, "wq portal mapping failed: %d\n", rc); - rc = idxd_wq_disable(wq); + rc = idxd_wq_disable(wq, false); if (rc < 0) dev_warn(dev, "IDXD wq disable failed\n"); mutex_unlock(&wq->wq_lock); @@ -262,8 +262,6 @@ static void disable_wq(struct idxd_wq *wq) static int idxd_config_bus_remove(struct device *dev) { - int rc; - dev_dbg(dev, "%s called for %s\n", __func__, dev_name(dev)); /* disable workqueue here */ @@ -288,22 +286,12 @@ static int idxd_config_bus_remove(struct device *dev) } idxd_unregister_dma_device(idxd); - rc = idxd_device_disable(idxd); - if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) { - for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = idxd->wqs[i]; - - mutex_lock(&wq->wq_lock); - idxd_wq_disable_cleanup(wq); - mutex_unlock(&wq->wq_lock); - } - } + idxd_device_disable(idxd); + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + idxd_device_reset(idxd); module_put(THIS_MODULE); - if (rc < 0) - dev_warn(dev, "Device disable failed\n"); - else - dev_info(dev, "Device %s disabled\n", dev_name(dev)); + dev_info(dev, "Device %s disabled\n", dev_name(dev)); } return 0; -- Gitee From 0b23d55282f651f6d54f9c7e6a626f6ed1b255a0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 3 Jun 2021 14:57:35 -0700 Subject: [PATCH 0630/2161] dmaengine: idxd: Add wq occupancy information to sysfs attribute commit e753a64bee753136087dfd70b37fdd199e942ea9 upstream. Add occupancy information to wq sysfs attribute. Attribute will show wq occupancy data if "WQ Occupancy Support" field in WQCAP is 1. 
It displays the number of entries currently in this WQ. This is provided as an estimate and should not be relied on to determine whether there is space in the WQ. The data is to provide information to user apps for flow control. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162275745546.1857062.8765615879420582018.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../ABI/stable/sysfs-driver-dma-idxd | 7 +++++++ drivers/dma/idxd/registers.h | 3 +++ drivers/dma/idxd/sysfs.c | 19 +++++++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd index 55285c136cf0..ebd521314803 100644 --- a/Documentation/ABI/stable/sysfs-driver-dma-idxd +++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd @@ -211,6 +211,13 @@ Contact: dmaengine@vger.kernel.org Description: Indicate whether ATS disable is turned on for the workqueue. 0 indicates ATS is on, and 1 indicates ATS is off for the workqueue. +What: /sys/bus/dsa/devices/wq./occupancy +Date May 25, 2021 +KernelVersion: 5.14.0 +Contact: dmaengine@vger.kernel.org +Description: Show the current number of entries in this WQ if WQ Occupancy + Support bit WQ capabilities is 1. + What: /sys/bus/dsa/devices/engine./group_id Date: Oct 25, 2019 KernelVersion: 5.6.0 diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index c970c3f025f0..7343a8f48819 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -349,6 +349,9 @@ union wqcfg { } __packed; #define WQCFG_PASID_IDX 2 +#define WQCFG_OCCUP_IDX 6 + +#define WQCFG_OCCUP_MASK 0xffff /* * This macro calculates the offset into the WQCFG register diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 71cd73fefec6..a193de32536d 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1246,6 +1246,24 @@ static ssize_t wq_ats_disable_store(struct device *dev, struct device_attribute static struct device_attribute dev_attr_wq_ats_disable = __ATTR(ats_disable, 0644, wq_ats_disable_show, wq_ats_disable_store); +static ssize_t wq_occupancy_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + u32 occup, offset; + + if (!idxd->hw.wq_cap.occupancy) + return -EOPNOTSUPP; + + offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_OCCUP_IDX); + occup = ioread32(idxd->reg_base + offset) & WQCFG_OCCUP_MASK; + + return sysfs_emit(buf, "%u\n", occup); +} + +static struct device_attribute dev_attr_wq_occupancy = + __ATTR(occupancy, 0444, wq_occupancy_show, NULL); + static struct attribute *idxd_wq_attributes[] = { &dev_attr_wq_clients.attr, &dev_attr_wq_state.attr, @@ -1261,6 +1279,7 @@ static struct attribute *idxd_wq_attributes[] = { &dev_attr_wq_max_transfer_size.attr, &dev_attr_wq_max_batch_size.attr, &dev_attr_wq_ats_disable.attr, + &dev_attr_wq_occupancy.attr, NULL, }; -- Gitee From 3f063881cad203e377c3c1b833ab959b408dd45e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 3 Jun 2021 11:01:37 -0700 Subject: [PATCH 0631/2161] dmaengine: idxd: have command status always set commit 53499d1fc11267e078557fc68ce943c1eb3b7a37 upstream. The cached command status is only set when the write back status is is passed in. Move the variable set outside of the check so it is always set. 
Fixes: 0d5c10b4c84d ("dmaengine: idxd: add work queue drain support") Reported-by: Ramesh Thomas Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162274329740.1822314.3443875665504707588.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 928c2c8817f0..c8cf1de72176 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -486,6 +486,7 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, union idxd_command_reg cmd; DECLARE_COMPLETION_ONSTACK(done); unsigned long flags; + u32 stat; if (idxd_device_is_halted(idxd)) { dev_warn(&idxd->pdev->dev, "Device is HALTED!\n"); @@ -518,11 +519,11 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, */ spin_unlock_irqrestore(&idxd->cmd_lock, flags); wait_for_completion(&done); + stat = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); spin_lock_irqsave(&idxd->cmd_lock, flags); - if (status) { - *status = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); - idxd->cmd_status = *status & GENMASK(7, 0); - } + if (status) + *status = stat; + idxd->cmd_status = stat & GENMASK(7, 0); __clear_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags); /* Wake up other pending commands */ -- Gitee From f93473624f07d1bcd3dba9d45e76c43ef2f4919d Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 24 Jun 2021 12:09:29 -0700 Subject: [PATCH 0632/2161] dmaengine: idxd: fix array index when int_handles are being used commit da435aedb00a4ef61019ff11ae0c08ffb9b1fb18 upstream. The index to the irq vector should be local and has no relation to the assigned interrupt handle. Assign the MSIX interrupt index that is programmed for the descriptor. The interrupt handle only matters when it comes to hardware descriptor programming. Fixes: eb15e7154fbf ("dmaengine: idxd: add interrupt handle request and release support") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162456176939.1121476.3366256009925001897.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/submit.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 19afb62abaff..e29887528077 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -128,19 +128,8 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) * Pending the descriptor to the lockless list for the irq_entry * that we designated the descriptor to. */ - if (desc->hw->flags & IDXD_OP_FLAG_RCI) { - int vec; - - /* - * If the driver is on host kernel, it would be the value - * assigned to interrupt handle, which is index for MSIX - * vector. If it's guest then can't use the int_handle since - * that is the index to IMS for the entire device. The guest - * device local index will be used. - */ - vec = !idxd->int_handles ? 
desc->hw->int_handle : desc->vector; - llist_add(&desc->llnode, &idxd->irq_entries[vec].pending_llist); - } + if (desc->hw->flags & IDXD_OP_FLAG_RCI) + llist_add(&desc->llnode, &idxd->irq_entries[desc->vector].pending_llist); return 0; } -- Gitee From b96d4e2c3ca169420ba9a5536e3d5009fe452e2e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 24 Jun 2021 13:43:32 -0700 Subject: [PATCH 0633/2161] dmaengine: idxd: fix setup sequence for MSIXPERM table commit d5c10e0fc8645342fe5c9796b00c84ab078cd713 upstream. The MSIX permission table should be programmed BEFORE request_irq() happens. This prevents any possibility of an interrupt happening before the MSIX perm table is setup, however slight. Fixes: 6df0e6c57dfc ("dmaengine: idxd: clear MSIX permission entry on shutdown") Sign-off-by: Dave Jiang Link: https://lore.kernel.org/r/162456741222.1138073.1298447364671237896.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index de300ba38b14..7eac0d167bde 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -102,6 +102,8 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) spin_lock_init(&idxd->irq_entries[i].list_lock); } + idxd_msix_perm_setup(idxd); + irq_entry = &idxd->irq_entries[0]; rc = request_threaded_irq(irq_entry->vector, NULL, idxd_misc_thread, 0, "idxd-misc", irq_entry); @@ -148,7 +150,6 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) } idxd_unmask_error_interrupts(idxd); - idxd_msix_perm_setup(idxd); return 0; err_wq_irqs: @@ -162,6 +163,7 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) err_misc_irq: /* Disable error interrupt generation */ idxd_mask_error_interrupts(idxd); + idxd_msix_perm_clear(idxd); err_irq_entries: pci_free_irq_vectors(pdev); dev_err(dev, "No usable interrupts\n"); -- Gitee From 48e5c75dfb7e2a5a796944ced16284aaa5a4c868 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 24 Jun 2021 12:08:21 -0700 Subject: [PATCH 0634/2161] dmaengine: idxd: add missing percpu ref put on failure commit ac24a2dc06cd773895d2fba0378c2538b8176565 upstream. When enqcmds() fails, exit path is missing a percpu_ref_put(). This can cause failure on shutdown path when the driver is attempting to quiesce the wq. Add missing percpu_ref_put() call on the error exit path. Fixes: 93a40a6d7428 ("dmaengine: idxd: add percpu_ref to descriptor submission path") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162456170168.1121236.7240941044089212312.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/submit.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index e29887528077..736def129fa8 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -118,8 +118,10 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) * device is not accepting descriptor at all. 
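The resulting probe-time ordering, sketched with the function names used in this driver (the exact hunks follow below):

    int rc;

    idxd_msix_perm_setup(idxd);     /* program the MSIX permission table first */
    rc = request_threaded_irq(irq_entry->vector, NULL, idxd_misc_thread,
                              0, "idxd-misc", irq_entry);   /* interrupts may fire from here on */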
*/ rc = enqcmds(portal, desc->hw); - if (rc < 0) + if (rc < 0) { + percpu_ref_put(&wq->wq_active); return rc; + } } percpu_ref_put(&wq->wq_active); -- Gitee From 1cd5fc36466cd6bce96846c69f387d5992875177 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 24 Jun 2021 13:39:33 -0700 Subject: [PATCH 0635/2161] dmaengine: idxd: assign MSIX vectors to each WQ rather than roundrobin commit 6cfd9e62e3297993f9f9d2d15f3acb14a0e8abbf upstream. IOPS increased when changing MSIX vector to per WQ from roundrobin. Allows descriptor to be completed by the submitter improves caching locality. Suggested-by: Konstantin Ananyev Signed-off-by: Dave Jiang Acked-by: Konstantin Ananyev Link: https://lore.kernel.org/r/162456717326.1130457.15258077196523268356.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/idxd.h | 2 -- drivers/dma/idxd/submit.c | 21 ++++++--------------- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 1f0991dec679..edfa81f0fe18 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -153,7 +153,6 @@ struct idxd_wq { enum idxd_wq_state state; unsigned long flags; union wqcfg *wqcfg; - u32 vec_ptr; /* interrupt steering */ struct dsa_hw_desc **hw_descs; int num_descs; union { @@ -290,7 +289,6 @@ struct idxd_desc { struct list_head list; int id; int cpu; - unsigned int vector; struct idxd_wq *wq; }; diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 736def129fa8..d842487bd46a 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -22,22 +22,13 @@ static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu) desc->hw->pasid = idxd->pasid; /* - * Descriptor completion vectors are 1...N for MSIX. We will round - * robin through the N vectors. + * On host, MSIX vecotr 0 is used for misc interrupt. Therefore when we match + * vector 1:1 to the WQ id, we need to add 1 */ - wq->vec_ptr = (wq->vec_ptr % idxd->num_wq_irqs) + 1; - if (!idxd->int_handles) { - desc->hw->int_handle = wq->vec_ptr; - } else { - desc->vector = wq->vec_ptr; - /* - * int_handles are only for descriptor completion. However for device - * MSIX enumeration, vec 0 is used for misc interrupts. Therefore even - * though we are rotating through 1...N for descriptor interrupts, we - * need to acqurie the int_handles from 0..N-1. - */ - desc->hw->int_handle = idxd->int_handles[desc->vector - 1]; - } + if (!idxd->int_handles) + desc->hw->int_handle = wq->id + 1; + else + desc->hw->int_handle = idxd->int_handles[wq->id]; return desc; } -- Gitee From e54faee65eb76f446a7a6d23e8fef2328bb71035 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 25 Jun 2021 10:38:10 +0200 Subject: [PATCH 0636/2161] dmaengine: idxd: depends on !UML commit b2296eeac91555bd13f774efa7ab7d4b12fb71ef upstream. Now that UML has PCI support, this driver must depend also on !UML since it pokes at X86_64 architecture internals that don't exist on ARCH=um. 
Reported-by: kernel test robot Signed-off-by: Johannes Berg Acked-by: Dave Jiang Acked-By: Anton Ivanov Link: https://lore.kernel.org/r/20210625103810.fe877ae0aef4.If240438e3f50ae226f3f755fc46ea498c6858393@changeid Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index a8e1003dba8a..4e71190688d2 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -275,7 +275,7 @@ config INTEL_IDMA64 config INTEL_IDXD tristate "Intel Data Accelerators support" - depends on PCI && X86_64 + depends on PCI && X86_64 && !UML depends on PCI_MSI depends on SBITMAP select DMA_ENGINE -- Gitee From 4c7316687e968363fcfbeed26eff9413a9205ea1 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 14 Jul 2021 14:57:19 -0700 Subject: [PATCH 0637/2161] dmaengine: idxd: fix sequence for pci driver remove() and shutdown() commit 7eb25da161befbc9a80e94e1bd90d6c06aa645cf upstream. ->shutdown() call should only be responsible for quiescing the device. Currently it is doing PCI device tear down. This causes issue when things like MMIO mapping is removed while idxd_unregister_devices() will trigger removal of idxd device sub-driver and still initiates MMIO writes to the device. Another issue is with the unregistering of idxd 'struct device', the memory context gets freed. So the teardown calls are accessing freed memory and can cause kernel oops. Move all the teardown bits that doesn't belong in shutdown to ->remove() call. Move unregistering of the idxd conf_dev 'struct device' to after doing all the teardown to free all the memory that's no longer needed. Fixes: 47c16ac27d4c ("dmaengine: idxd: fix idxd conf_dev 'struct device' lifetime") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162629983901.395844.17964803190905549615.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 26 +++++++++++++++++--------- drivers/dma/idxd/sysfs.c | 2 -- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 7eac0d167bde..75ac6a4bc9d1 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -754,32 +754,40 @@ static void idxd_shutdown(struct pci_dev *pdev) for (i = 0; i < msixcnt; i++) { irq_entry = &idxd->irq_entries[i]; synchronize_irq(irq_entry->vector); - free_irq(irq_entry->vector, irq_entry); if (i == 0) continue; idxd_flush_pending_llist(irq_entry); idxd_flush_work_list(irq_entry); } - - idxd_msix_perm_clear(idxd); - idxd_release_int_handles(idxd); - pci_free_irq_vectors(pdev); - pci_iounmap(pdev, idxd->reg_base); - pci_disable_device(pdev); - destroy_workqueue(idxd->wq); + flush_workqueue(idxd->wq); } static void idxd_remove(struct pci_dev *pdev) { struct idxd_device *idxd = pci_get_drvdata(pdev); + struct idxd_irq_entry *irq_entry; + int msixcnt = pci_msix_vec_count(pdev); + int i; dev_dbg(&pdev->dev, "%s called\n", __func__); idxd_shutdown(pdev); if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); idxd_unregister_devices(idxd); - perfmon_pmu_remove(idxd); + + for (i = 0; i < msixcnt; i++) { + irq_entry = &idxd->irq_entries[i]; + free_irq(irq_entry->vector, irq_entry); + } + idxd_msix_perm_clear(idxd); + idxd_release_int_handles(idxd); + pci_free_irq_vectors(pdev); + pci_iounmap(pdev, idxd->reg_base); iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA); + pci_disable_device(pdev); 
+ destroy_workqueue(idxd->wq); + perfmon_pmu_remove(idxd); + device_unregister(&idxd->conf_dev); } static struct pci_driver idxd_pci_driver = { diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index a193de32536d..33c27df40f1e 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1751,8 +1751,6 @@ void idxd_unregister_devices(struct idxd_device *idxd) device_unregister(&group->conf_dev); } - - device_unregister(&idxd->conf_dev); } int idxd_register_bus_type(void) -- Gitee From 7f408a38f9b0785647ca9eb9933dada243f807ec Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 9 Dec 2021 19:16:42 +0800 Subject: [PATCH 0638/2161] dmaengine: idxd: fix submission race window commit 6b4b87f2c31ac1af4f244990a7cbfb50d3f3e33f upstream. Konstantin observed that when descriptors are submitted, the descriptor is added to the pending list after the submission. This creates a race window with the slight possibility that the descriptor can complete before it gets added to the pending list and this window would cause the completion handler to miss processing the descriptor. To address the issue, the addition of the descriptor to the pending list must be done before it gets submitted to the hardware. However, submitting to swq with ENQCMDS instruction can cause a failure with the condition of either wq is full or wq is not "active". With the descriptor allocation being the gate to the wq capacity, it is not possible to hit a retry with ENQCMDS submission to the swq. The only possible failure can happen is when wq is no longer "active" due to hw error and therefore we are moving towards taking down the portal. Given this is a rare condition and there's no longer concern over I/O performance, the driver can walk the completion lists in order to retrieve and abort the descriptor. The error path will set the descriptor to aborted status. It will take the work list lock to prevent further processing of worklist. It will do a delete_all on the pending llist to retrieve all descriptors on the pending llist. The delete_all action does not require a lock. It will walk through the acquired llist to find the aborted descriptor while add all remaining descriptors to the work list since it holds the lock. If it does not find the aborted descriptor on the llist, it will walk through the work list. And if it still does not find the descriptor, then it means the interrupt handler has removed the desc from the llist but is pending on the work list lock and will process it once the error path releases the lock. Fixes: eb15e7154fbf ("dmaengine: idxd: add interrupt handle request and release support") Reported-by: Konstantin Ananyev Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162628855747.360485.10101925573082466530.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/idxd.h | 14 ++++++++ drivers/dma/idxd/irq.c | 27 +++++++++----- drivers/dma/idxd/submit.c | 75 ++++++++++++++++++++++++++++++++++----- 3 files changed, 99 insertions(+), 17 deletions(-) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index edfa81f0fe18..d875b3d41ed2 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -292,6 +292,14 @@ struct idxd_desc { struct idxd_wq *wq; }; +/* + * This is software defined error for the completion status. We overload the error code + * that will never appear in completion status and only SWERR register. 
+ */ +enum idxd_completion_status { + IDXD_COMP_DESC_ABORT = 0xff, +}; + #define confdev_to_idxd(dev) container_of(dev, struct idxd_device, conf_dev) #define confdev_to_wq(dev) container_of(dev, struct idxd_wq, conf_dev) @@ -478,4 +486,10 @@ static inline void perfmon_init(void) {} static inline void perfmon_exit(void) {} #endif +static inline void complete_desc(struct idxd_desc *desc, enum idxd_complete_type reason) +{ + idxd_dma_complete_txd(desc, reason); + idxd_free_desc(desc->wq, desc); +} + #endif diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 7a2cf0512501..2924819ca8f3 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -245,12 +245,6 @@ static inline bool match_fault(struct idxd_desc *desc, u64 fault_addr) return false; } -static inline void complete_desc(struct idxd_desc *desc, enum idxd_complete_type reason) -{ - idxd_dma_complete_txd(desc, reason); - idxd_free_desc(desc->wq, desc); -} - static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, enum irq_work_type wtype, int *processed, u64 data) @@ -272,8 +266,16 @@ static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, reason = IDXD_COMPLETE_DEV_FAIL; llist_for_each_entry_safe(desc, t, head, llnode) { - if (desc->completion->status) { - if ((desc->completion->status & DSA_COMP_STATUS_MASK) != DSA_COMP_SUCCESS) + u8 status = desc->completion->status & DSA_COMP_STATUS_MASK; + + if (status) { + if (unlikely(status == IDXD_COMP_DESC_ABORT)) { + complete_desc(desc, IDXD_COMPLETE_ABORT); + (*processed)++; + continue; + } + + if (unlikely(status != DSA_COMP_SUCCESS)) match_fault(desc, data); complete_desc(desc, reason); (*processed)++; @@ -329,7 +331,14 @@ static int irq_process_work_list(struct idxd_irq_entry *irq_entry, spin_unlock_irqrestore(&irq_entry->list_lock, flags); list_for_each_entry(desc, &flist, list) { - if ((desc->completion->status & DSA_COMP_STATUS_MASK) != DSA_COMP_SUCCESS) + u8 status = desc->completion->status & DSA_COMP_STATUS_MASK; + + if (unlikely(status == IDXD_COMP_DESC_ABORT)) { + complete_desc(desc, IDXD_COMPLETE_ABORT); + continue; + } + + if (unlikely(status != DSA_COMP_SUCCESS)) match_fault(desc, data); complete_desc(desc, reason); } diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index d842487bd46a..62548599447f 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -79,9 +79,64 @@ void idxd_free_desc(struct idxd_wq *wq, struct idxd_desc *desc) sbitmap_queue_clear(&wq->sbq, desc->id, cpu); } +static struct idxd_desc *list_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie, + struct idxd_desc *desc) +{ + struct idxd_desc *d, *n; + + lockdep_assert_held(&ie->list_lock); + list_for_each_entry_safe(d, n, &ie->work_list, list) { + if (d == desc) { + list_del(&d->list); + return d; + } + } + + /* + * At this point, the desc needs to be aborted is held by the completion + * handler where it has taken it off the pending list but has not added to the + * work list. It will be cleaned up by the interrupt handler when it sees the + * IDXD_COMP_DESC_ABORT for completion status. + */ + return NULL; +} + +static void llist_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie, + struct idxd_desc *desc) +{ + struct idxd_desc *d, *t, *found = NULL; + struct llist_node *head; + unsigned long flags; + + desc->completion->status = IDXD_COMP_DESC_ABORT; + /* + * Grab the list lock so it will block the irq thread handler. This allows the + * abort code to locate the descriptor need to be aborted. 
+ */ + spin_lock_irqsave(&ie->list_lock, flags); + head = llist_del_all(&ie->pending_llist); + if (head) { + llist_for_each_entry_safe(d, t, head, llnode) { + if (d == desc) { + found = desc; + continue; + } + list_add_tail(&desc->list, &ie->work_list); + } + } + + if (!found) + found = list_abort_desc(wq, ie, desc); + spin_unlock_irqrestore(&ie->list_lock, flags); + + if (found) + complete_desc(found, IDXD_COMPLETE_ABORT); +} + int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) { struct idxd_device *idxd = wq->idxd; + struct idxd_irq_entry *ie = NULL; void __iomem *portal; int rc; @@ -99,6 +154,16 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) * even on UP because the recipient is a device. */ wmb(); + + /* + * Pending the descriptor to the lockless list for the irq_entry + * that we designated the descriptor to. + */ + if (desc->hw->flags & IDXD_OP_FLAG_RCI) { + ie = &idxd->irq_entries[desc->vector]; + llist_add(&desc->llnode, &ie->pending_llist); + } + if (wq_dedicated(wq)) { iosubmit_cmds512(portal, desc->hw, 1); } else { @@ -111,18 +176,12 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) rc = enqcmds(portal, desc->hw); if (rc < 0) { percpu_ref_put(&wq->wq_active); + if (ie) + llist_abort_desc(wq, ie, desc); return rc; } } percpu_ref_put(&wq->wq_active); - - /* - * Pending the descriptor to the lockless list for the irq_entry - * that we designated the descriptor to. - */ - if (desc->hw->flags & IDXD_OP_FLAG_RCI) - llist_add(&desc->llnode, &idxd->irq_entries[desc->vector].pending_llist); - return 0; } -- Gitee From 81d0c086c23f735a6938862a1774f9639ac95c37 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 9 Dec 2021 19:50:29 +0800 Subject: [PATCH 0639/2161] Fix Conflicts for drivers/dma/idxd/submit.c commit opencloudos. Port from 88c5d0a2b9b043d0e000a4f64a28313e57d96a9c Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/submit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 62548599447f..6ef704dd4d0b 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -160,7 +160,7 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) * that we designated the descriptor to. */ if (desc->hw->flags & IDXD_OP_FLAG_RCI) { - ie = &idxd->irq_entries[desc->vector]; + ie = &idxd->irq_entries[wq->id + 1]; llist_add(&desc->llnode, &ie->pending_llist); } -- Gitee From aa6603d8a7c2df516a77d1e779ea4cc1d6d1bdf5 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Jul 2021 11:43:09 -0700 Subject: [PATCH 0640/2161] dmaengine: idxd: add driver register helper commit 3ecfc9135e6c82183d121c5578ed5d6f07a53ec8 upstream. Add helper functions for dsa-driver registration similar to other bus-types. In particular, do not require dsa-drivers to open-code the bus, owner, and mod_name fields. Let registration and unregistration operate on the 'struct idxd_device_driver' instead of the raw / embedded 'struct device_driver'. 
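A before/after sketch of what the helpers buy a sub-driver declaration (illustrative only; the actual conversion of the 'dsa' driver is in the diff below):

    /* before: each driver open-codes the bus fields on the embedded device_driver */
    static struct idxd_device_driver dsa_drv = {
        .drv = {
            .name = "dsa",
            .bus = &dsa_bus_type,
            .owner = THIS_MODULE,
            .mod_name = KBUILD_MODNAME,
        },
    };
    /* registered with driver_register(&dsa_drv.drv) */

    /* after: bus, owner and mod_name are filled in by the helper */
    static struct idxd_device_driver dsa_drv = {
        .drv = { .name = "dsa" },
    };
    /* registered with idxd_driver_register(&dsa_drv) and torn down with
     * idxd_driver_unregister(&dsa_drv)
     */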
Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162637458949.744545.14996726325385482050.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/idxd.h | 7 +++++++ drivers/dma/idxd/init.c | 17 +++++++++++++++++ drivers/dma/idxd/sysfs.c | 7 ++----- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index d875b3d41ed2..8db19b899709 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -402,6 +402,13 @@ static inline int idxd_wq_refcount(struct idxd_wq *wq) return wq->client_count; }; +int __must_check __idxd_driver_register(struct idxd_device_driver *idxd_drv, + struct module *module, const char *mod_name); +#define idxd_driver_register(driver) \ + __idxd_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) + +void idxd_driver_unregister(struct idxd_device_driver *idxd_drv); + int idxd_register_bus_type(void); void idxd_unregister_bus_type(void); int idxd_register_devices(struct idxd_device *idxd); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 75ac6a4bc9d1..b15817751d5f 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -855,3 +855,20 @@ static void __exit idxd_exit_module(void) perfmon_exit(); } module_exit(idxd_exit_module); + +int __idxd_driver_register(struct idxd_device_driver *idxd_drv, struct module *owner, + const char *mod_name) +{ + struct device_driver *drv = &idxd_drv->drv; + + drv->bus = &dsa_bus_type; + drv->owner = owner; + drv->mod_name = mod_name; + + return driver_register(drv); +} + +void idxd_driver_unregister(struct idxd_device_driver *idxd_drv) +{ + driver_unregister(&idxd_drv->drv); +} diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 33c27df40f1e..bf229b12d527 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -313,21 +313,18 @@ struct bus_type dsa_bus_type = { static struct idxd_device_driver dsa_drv = { .drv = { .name = "dsa", - .bus = &dsa_bus_type, - .owner = THIS_MODULE, - .mod_name = KBUILD_MODNAME, }, }; /* IDXD generic driver setup */ int idxd_register_driver(void) { - return driver_register(&dsa_drv.drv); + return idxd_driver_register(&dsa_drv); } void idxd_unregister_driver(void) { - driver_unregister(&dsa_drv.drv); + idxd_driver_unregister(&dsa_drv); } /* IDXD engine attributes */ -- Gitee From 5451ad048b33afdf07ef7d2d530e4a689319b7d3 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Jul 2021 11:43:15 -0700 Subject: [PATCH 0641/2161] dmaengine: idxd: add driver name commit da5a11d75d6837c9c5ef40810f66ce9d2db6ca5e upstream. Add name field in idxd_device_driver so we don't have to touch the 'struct device_driver' during declaration. 
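With the name carried by idxd_device_driver itself, the declaration shrinks further (sketch; __idxd_driver_register() copies it into drv->name, as the diff below shows):

    static struct idxd_device_driver dsa_drv = {
        .name = "dsa",
    };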
Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162637459517.744545.7572915135318813722.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/init.c | 1 + drivers/dma/idxd/sysfs.c | 4 +--- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 8db19b899709..e8721ff028c2 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -34,6 +34,7 @@ enum idxd_type { #define IDXD_PMU_EVENT_MAX 64 struct idxd_device_driver { + const char *name; struct device_driver drv; }; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index b15817751d5f..6403d55c7ff7 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -861,6 +861,7 @@ int __idxd_driver_register(struct idxd_device_driver *idxd_drv, struct module *o { struct device_driver *drv = &idxd_drv->drv; + drv->name = idxd_drv->name; drv->bus = &dsa_bus_type; drv->owner = owner; drv->mod_name = mod_name; diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index bf229b12d527..60779f57c118 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -311,9 +311,7 @@ struct bus_type dsa_bus_type = { }; static struct idxd_device_driver dsa_drv = { - .drv = { - .name = "dsa", - }, + .name = "dsa", }; /* IDXD generic driver setup */ -- Gitee From 1ff2be4cfa86fa01012f2ee60a4c4554defbc632 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Jul 2021 11:43:20 -0700 Subject: [PATCH 0642/2161] dmaengine: idxd: add 'struct idxd_dev' as wrapper for conf_dev commit 700af3a0a26cbac87e4a0ae1dfa79597d0056d5f upstream. Add a 'struct idxd_dev' that wraps the 'struct device' for idxd conf_dev that registers with the dsa bus. This is introduced in order to deal with multiple different types of 'devices' that are registered on the dsa_bus when the compat driver needs to route them to the correct driver to attach. The bind() call now can determine the type of device and then do the appropriate driver matching. 
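The wrapper and its accessors follow the usual container_of() pattern; a condensed sketch (full definitions are in the idxd.h hunks below):

    struct idxd_dev {
        struct device conf_dev;
        enum idxd_dev_type type;    /* IDXD_DEV_DSA, IDXD_DEV_WQ, IDXD_DEV_GROUP, ... */
    };

    #define confdev_to_idxd_dev(dev) container_of(dev, struct idxd_dev, conf_dev)

    static inline struct idxd_wq *confdev_to_wq(struct device *dev)
    {
        return container_of(confdev_to_idxd_dev(dev), struct idxd_wq, idxd_dev);
    }

    /* illustrative only: a compat bind path can then route on the wrapped type,
     * e.g. treat the device as a wq when confdev_to_idxd_dev(dev)->type == IDXD_DEV_WQ
     */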
Reviewed-by Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162637460065.744545.584492831446090984.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/cdev.c | 11 +- drivers/dma/idxd/dma.c | 4 +- drivers/dma/idxd/idxd.h | 82 ++++++++++++-- drivers/dma/idxd/init.c | 98 +++++++++------- drivers/dma/idxd/irq.c | 2 +- drivers/dma/idxd/sysfs.c | 239 ++++++++++++++++++--------------------- 6 files changed, 251 insertions(+), 185 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index e9def577c697..18a003b93812 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -41,7 +41,7 @@ struct idxd_user_context { static void idxd_cdev_dev_release(struct device *dev) { - struct idxd_cdev *idxd_cdev = container_of(dev, struct idxd_cdev, dev); + struct idxd_cdev *idxd_cdev = dev_to_cdev(dev); struct idxd_cdev_context *cdev_ctx; struct idxd_wq *wq = idxd_cdev->wq; @@ -256,9 +256,10 @@ int idxd_wq_add_cdev(struct idxd_wq *wq) if (!idxd_cdev) return -ENOMEM; + idxd_cdev->idxd_dev.type = IDXD_DEV_CDEV; idxd_cdev->wq = wq; cdev = &idxd_cdev->cdev; - dev = &idxd_cdev->dev; + dev = cdev_dev(idxd_cdev); cdev_ctx = &ictx[wq->idxd->data->type]; minor = ida_simple_get(&cdev_ctx->minor_ida, 0, MINORMASK, GFP_KERNEL); if (minor < 0) { @@ -268,7 +269,7 @@ int idxd_wq_add_cdev(struct idxd_wq *wq) idxd_cdev->minor = minor; device_initialize(dev); - dev->parent = &wq->conf_dev; + dev->parent = wq_confdev(wq); dev->bus = &dsa_bus_type; dev->type = &idxd_cdev_device_type; dev->devt = MKDEV(MAJOR(cdev_ctx->devt), minor); @@ -299,8 +300,8 @@ void idxd_wq_del_cdev(struct idxd_wq *wq) idxd_cdev = wq->idxd_cdev; wq->idxd_cdev = NULL; - cdev_device_del(&idxd_cdev->cdev, &idxd_cdev->dev); - put_device(&idxd_cdev->dev); + cdev_device_del(&idxd_cdev->cdev, cdev_dev(idxd_cdev)); + put_device(cdev_dev(idxd_cdev)); } int idxd_cdev_register(void) diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index 77439b645044..2e52f9a50519 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -245,7 +245,7 @@ int idxd_register_dma_channel(struct idxd_wq *wq) wq->idxd_chan = idxd_chan; idxd_chan->wq = wq; - get_device(&wq->conf_dev); + get_device(wq_confdev(wq)); return 0; } @@ -260,5 +260,5 @@ void idxd_unregister_dma_channel(struct idxd_wq *wq) list_del(&chan->device_node); kfree(wq->idxd_chan); wq->idxd_chan = NULL; - put_device(&wq->conf_dev); + put_device(wq_confdev(wq)); } diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index e8721ff028c2..ae60fcc7b625 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -17,8 +17,24 @@ extern struct kmem_cache *idxd_desc_pool; -struct idxd_device; struct idxd_wq; +struct idxd_dev; + +enum idxd_dev_type { + IDXD_DEV_NONE = -1, + IDXD_DEV_DSA = 0, + IDXD_DEV_IAX, + IDXD_DEV_WQ, + IDXD_DEV_GROUP, + IDXD_DEV_ENGINE, + IDXD_DEV_CDEV, + IDXD_DEV_MAX_TYPE, +}; + +struct idxd_dev { + struct device conf_dev; + enum idxd_dev_type type; +}; #define IDXD_REG_TIMEOUT 50 #define IDXD_DRAIN_TIMEOUT 5000 @@ -52,7 +68,7 @@ struct idxd_irq_entry { }; struct idxd_group { - struct device conf_dev; + struct idxd_dev idxd_dev; struct idxd_device *idxd; struct grpcfg grpcfg; int id; @@ -111,7 +127,7 @@ enum idxd_wq_type { struct idxd_cdev { struct idxd_wq *wq; struct cdev cdev; - struct device dev; + struct idxd_dev idxd_dev; int minor; }; @@ -139,7 +155,7 @@ struct idxd_wq { void __iomem *portal; struct percpu_ref wq_active; struct 
completion wq_dead; - struct device conf_dev; + struct idxd_dev idxd_dev; struct idxd_cdev *idxd_cdev; struct wait_queue_head err_queue; struct idxd_device *idxd; @@ -174,7 +190,7 @@ struct idxd_wq { }; struct idxd_engine { - struct device conf_dev; + struct idxd_dev idxd_dev; int id; struct idxd_group *group; struct idxd_device *idxd; @@ -218,7 +234,7 @@ struct idxd_driver_data { }; struct idxd_device { - struct device conf_dev; + struct idxd_dev idxd_dev; struct idxd_driver_data *data; struct list_head list; struct idxd_hw hw; @@ -301,8 +317,58 @@ enum idxd_completion_status { IDXD_COMP_DESC_ABORT = 0xff, }; -#define confdev_to_idxd(dev) container_of(dev, struct idxd_device, conf_dev) -#define confdev_to_wq(dev) container_of(dev, struct idxd_wq, conf_dev) +#define idxd_confdev(idxd) &idxd->idxd_dev.conf_dev +#define wq_confdev(wq) &wq->idxd_dev.conf_dev +#define engine_confdev(engine) &engine->idxd_dev.conf_dev +#define group_confdev(group) &group->idxd_dev.conf_dev +#define cdev_dev(cdev) &cdev->idxd_dev.conf_dev + +#define confdev_to_idxd_dev(dev) container_of(dev, struct idxd_dev, conf_dev) + +static inline struct idxd_device *confdev_to_idxd(struct device *dev) +{ + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return container_of(idxd_dev, struct idxd_device, idxd_dev); +} + +static inline struct idxd_wq *confdev_to_wq(struct device *dev) +{ + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return container_of(idxd_dev, struct idxd_wq, idxd_dev); +} + +static inline struct idxd_engine *confdev_to_engine(struct device *dev) +{ + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return container_of(idxd_dev, struct idxd_engine, idxd_dev); +} + +static inline struct idxd_group *confdev_to_group(struct device *dev) +{ + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return container_of(idxd_dev, struct idxd_group, idxd_dev); +} + +static inline struct idxd_cdev *dev_to_cdev(struct device *dev) +{ + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return container_of(idxd_dev, struct idxd_cdev, idxd_dev); +} + +static inline void idxd_dev_set_type(struct idxd_dev *idev, int type) +{ + if (type >= IDXD_DEV_MAX_TYPE) { + idev->type = IDXD_DEV_NONE; + return; + } + + idev->type = type; +} extern struct bus_type dsa_bus_type; extern struct bus_type iax_bus_type; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 6403d55c7ff7..f500076882d2 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -200,6 +200,7 @@ static int idxd_setup_wqs(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; struct idxd_wq *wq; + struct device *conf_dev; int i, rc; idxd->wqs = kcalloc_node(idxd->max_wqs, sizeof(struct idxd_wq *), @@ -214,15 +215,17 @@ static int idxd_setup_wqs(struct idxd_device *idxd) goto err; } + idxd_dev_set_type(&wq->idxd_dev, IDXD_DEV_WQ); + conf_dev = wq_confdev(wq); wq->id = i; wq->idxd = idxd; - device_initialize(&wq->conf_dev); - wq->conf_dev.parent = &idxd->conf_dev; - wq->conf_dev.bus = &dsa_bus_type; - wq->conf_dev.type = &idxd_wq_device_type; - rc = dev_set_name(&wq->conf_dev, "wq%d.%d", idxd->id, wq->id); + device_initialize(wq_confdev(wq)); + conf_dev->parent = idxd_confdev(idxd); + conf_dev->bus = &dsa_bus_type; + conf_dev->type = &idxd_wq_device_type; + rc = dev_set_name(conf_dev, "wq%d.%d", idxd->id, wq->id); if (rc < 0) { - put_device(&wq->conf_dev); + put_device(conf_dev); goto err; } @@ -233,7 +236,7 @@ static int idxd_setup_wqs(struct idxd_device *idxd) wq->max_batch_size = 
idxd->max_batch_size; wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev)); if (!wq->wqcfg) { - put_device(&wq->conf_dev); + put_device(conf_dev); rc = -ENOMEM; goto err; } @@ -243,8 +246,11 @@ static int idxd_setup_wqs(struct idxd_device *idxd) return 0; err: - while (--i >= 0) - put_device(&idxd->wqs[i]->conf_dev); + while (--i >= 0) { + wq = idxd->wqs[i]; + conf_dev = wq_confdev(wq); + put_device(conf_dev); + } return rc; } @@ -252,6 +258,7 @@ static int idxd_setup_engines(struct idxd_device *idxd) { struct idxd_engine *engine; struct device *dev = &idxd->pdev->dev; + struct device *conf_dev; int i, rc; idxd->engines = kcalloc_node(idxd->max_engines, sizeof(struct idxd_engine *), @@ -266,15 +273,17 @@ static int idxd_setup_engines(struct idxd_device *idxd) goto err; } + idxd_dev_set_type(&engine->idxd_dev, IDXD_DEV_ENGINE); + conf_dev = engine_confdev(engine); engine->id = i; engine->idxd = idxd; - device_initialize(&engine->conf_dev); - engine->conf_dev.parent = &idxd->conf_dev; - engine->conf_dev.bus = &dsa_bus_type; - engine->conf_dev.type = &idxd_engine_device_type; - rc = dev_set_name(&engine->conf_dev, "engine%d.%d", idxd->id, engine->id); + device_initialize(conf_dev); + conf_dev->parent = idxd_confdev(idxd); + conf_dev->bus = &dsa_bus_type; + conf_dev->type = &idxd_engine_device_type; + rc = dev_set_name(conf_dev, "engine%d.%d", idxd->id, engine->id); if (rc < 0) { - put_device(&engine->conf_dev); + put_device(conf_dev); goto err; } @@ -284,14 +293,18 @@ static int idxd_setup_engines(struct idxd_device *idxd) return 0; err: - while (--i >= 0) - put_device(&idxd->engines[i]->conf_dev); + while (--i >= 0) { + engine = idxd->engines[i]; + conf_dev = engine_confdev(engine); + put_device(conf_dev); + } return rc; } static int idxd_setup_groups(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; + struct device *conf_dev; struct idxd_group *group; int i, rc; @@ -307,15 +320,17 @@ static int idxd_setup_groups(struct idxd_device *idxd) goto err; } + idxd_dev_set_type(&group->idxd_dev, IDXD_DEV_GROUP); + conf_dev = group_confdev(group); group->id = i; group->idxd = idxd; - device_initialize(&group->conf_dev); - group->conf_dev.parent = &idxd->conf_dev; - group->conf_dev.bus = &dsa_bus_type; - group->conf_dev.type = &idxd_group_device_type; - rc = dev_set_name(&group->conf_dev, "group%d.%d", idxd->id, group->id); + device_initialize(conf_dev); + conf_dev->parent = idxd_confdev(idxd); + conf_dev->bus = &dsa_bus_type; + conf_dev->type = &idxd_group_device_type; + rc = dev_set_name(conf_dev, "group%d.%d", idxd->id, group->id); if (rc < 0) { - put_device(&group->conf_dev); + put_device(conf_dev); goto err; } @@ -327,8 +342,10 @@ static int idxd_setup_groups(struct idxd_device *idxd) return 0; err: - while (--i >= 0) - put_device(&idxd->groups[i]->conf_dev); + while (--i >= 0) { + group = idxd->groups[i]; + put_device(group_confdev(group)); + } return rc; } @@ -337,11 +354,11 @@ static void idxd_cleanup_internals(struct idxd_device *idxd) int i; for (i = 0; i < idxd->max_groups; i++) - put_device(&idxd->groups[i]->conf_dev); + put_device(group_confdev(idxd->groups[i])); for (i = 0; i < idxd->max_engines; i++) - put_device(&idxd->engines[i]->conf_dev); + put_device(engine_confdev(idxd->engines[i])); for (i = 0; i < idxd->max_wqs; i++) - put_device(&idxd->wqs[i]->conf_dev); + put_device(wq_confdev(idxd->wqs[i])); destroy_workqueue(idxd->wq); } @@ -381,13 +398,13 @@ static int idxd_setup_internals(struct idxd_device *idxd) err_wkq_create: for (i = 0; i < 
idxd->max_groups; i++) - put_device(&idxd->groups[i]->conf_dev); + put_device(group_confdev(idxd->groups[i])); err_group: for (i = 0; i < idxd->max_engines; i++) - put_device(&idxd->engines[i]->conf_dev); + put_device(engine_confdev(idxd->engines[i])); err_engine: for (i = 0; i < idxd->max_wqs; i++) - put_device(&idxd->wqs[i]->conf_dev); + put_device(wq_confdev(idxd->wqs[i])); err_wqs: kfree(idxd->int_handles); return rc; @@ -469,6 +486,7 @@ static void idxd_read_caps(struct idxd_device *idxd) static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_data *data) { struct device *dev = &pdev->dev; + struct device *conf_dev; struct idxd_device *idxd; int rc; @@ -476,19 +494,21 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_d if (!idxd) return NULL; + conf_dev = idxd_confdev(idxd); idxd->pdev = pdev; idxd->data = data; + idxd_dev_set_type(&idxd->idxd_dev, idxd->data->type); idxd->id = ida_alloc(&idxd_ida, GFP_KERNEL); if (idxd->id < 0) return NULL; - device_initialize(&idxd->conf_dev); - idxd->conf_dev.parent = dev; - idxd->conf_dev.bus = &dsa_bus_type; - idxd->conf_dev.type = idxd->data->dev_type; - rc = dev_set_name(&idxd->conf_dev, "%s%d", idxd->data->name_prefix, idxd->id); + device_initialize(conf_dev); + conf_dev->parent = dev; + conf_dev->bus = &dsa_bus_type; + conf_dev->type = idxd->data->dev_type; + rc = dev_set_name(conf_dev, "%s%d", idxd->data->name_prefix, idxd->id); if (rc < 0) { - put_device(&idxd->conf_dev); + put_device(conf_dev); return NULL; } @@ -674,7 +694,7 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) err: pci_iounmap(pdev, idxd->reg_base); err_iomap: - put_device(&idxd->conf_dev); + put_device(idxd_confdev(idxd)); err_idxd_alloc: pci_disable_device(pdev); return rc; @@ -787,7 +807,7 @@ static void idxd_remove(struct pci_dev *pdev) pci_disable_device(pdev); destroy_workqueue(idxd->wq); perfmon_pmu_remove(idxd); - device_unregister(&idxd->conf_dev); + device_unregister(idxd_confdev(idxd)); } static struct pci_driver idxd_pci_driver = { diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 2924819ca8f3..be65d55e1fc4 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -51,7 +51,7 @@ static void idxd_device_reinit(struct work_struct *work) rc = idxd_wq_enable(wq); if (rc < 0) { dev_warn(dev, "Unable to re-enable wq %s\n", - dev_name(&wq->conf_dev)); + dev_name(wq_confdev(wq))); } } } diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 60779f57c118..f603b11141c4 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -164,7 +164,7 @@ static int enable_wq(struct idxd_wq *wq) } mutex_unlock(&wq->wq_lock); - dev_info(dev, "wq %s enabled\n", dev_name(&wq->conf_dev)); + dev_info(dev, "wq %s enabled\n", dev_name(wq_confdev(wq))); return 0; } @@ -230,7 +230,7 @@ static void disable_wq(struct idxd_wq *wq) struct device *dev = &idxd->pdev->dev; mutex_lock(&wq->wq_lock); - dev_dbg(dev, "%s removing WQ %s\n", __func__, dev_name(&wq->conf_dev)); + dev_dbg(dev, "%s removing WQ %s\n", __func__, dev_name(wq_confdev(wq))); if (wq->state == IDXD_WQ_DISABLED) { mutex_unlock(&wq->wq_lock); return; @@ -257,7 +257,7 @@ static void disable_wq(struct idxd_wq *wq) wq->client_count = 0; mutex_unlock(&wq->wq_lock); - dev_info(dev, "wq %s disabled\n", dev_name(&wq->conf_dev)); + dev_info(dev, "wq %s disabled\n", dev_name(wq_confdev(wq))); } static int idxd_config_bus_remove(struct device *dev) @@ -274,15 +274,15 @@ static int 
idxd_config_bus_remove(struct device *dev) int i; dev_dbg(dev, "%s removing dev %s\n", __func__, - dev_name(&idxd->conf_dev)); + dev_name(idxd_confdev(idxd))); for (i = 0; i < idxd->max_wqs; i++) { struct idxd_wq *wq = idxd->wqs[i]; if (wq->state == IDXD_WQ_DISABLED) continue; dev_warn(dev, "Active wq %d on disable %s.\n", i, - dev_name(&idxd->conf_dev)); - device_release_driver(&wq->conf_dev); + dev_name(wq_confdev(wq))); + device_release_driver(wq_confdev(wq)); } idxd_unregister_dma_device(idxd); @@ -329,8 +329,7 @@ void idxd_unregister_driver(void) static ssize_t engine_group_id_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_engine *engine = - container_of(dev, struct idxd_engine, conf_dev); + struct idxd_engine *engine = confdev_to_engine(dev); if (engine->group) return sysfs_emit(buf, "%d\n", engine->group->id); @@ -342,8 +341,7 @@ static ssize_t engine_group_id_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct idxd_engine *engine = - container_of(dev, struct idxd_engine, conf_dev); + struct idxd_engine *engine = confdev_to_engine(dev); struct idxd_device *idxd = engine->idxd; long id; int rc; @@ -397,7 +395,7 @@ static const struct attribute_group *idxd_engine_attribute_groups[] = { static void idxd_conf_engine_release(struct device *dev) { - struct idxd_engine *engine = container_of(dev, struct idxd_engine, conf_dev); + struct idxd_engine *engine = confdev_to_engine(dev); kfree(engine); } @@ -427,8 +425,7 @@ static ssize_t group_tokens_reserved_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_group *group = - container_of(dev, struct idxd_group, conf_dev); + struct idxd_group *group = confdev_to_group(dev); return sysfs_emit(buf, "%u\n", group->tokens_reserved); } @@ -437,8 +434,7 @@ static ssize_t group_tokens_reserved_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct idxd_group *group = - container_of(dev, struct idxd_group, conf_dev); + struct idxd_group *group = confdev_to_group(dev); struct idxd_device *idxd = group->idxd; unsigned long val; int rc; @@ -475,8 +471,7 @@ static ssize_t group_tokens_allowed_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_group *group = - container_of(dev, struct idxd_group, conf_dev); + struct idxd_group *group = confdev_to_group(dev); return sysfs_emit(buf, "%u\n", group->tokens_allowed); } @@ -485,8 +480,7 @@ static ssize_t group_tokens_allowed_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct idxd_group *group = - container_of(dev, struct idxd_group, conf_dev); + struct idxd_group *group = confdev_to_group(dev); struct idxd_device *idxd = group->idxd; unsigned long val; int rc; @@ -520,8 +514,7 @@ static ssize_t group_use_token_limit_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_group *group = - container_of(dev, struct idxd_group, conf_dev); + struct idxd_group *group = confdev_to_group(dev); return sysfs_emit(buf, "%u\n", group->use_token_limit); } @@ -530,8 +523,7 @@ static ssize_t group_use_token_limit_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct idxd_group *group = - container_of(dev, struct idxd_group, conf_dev); + struct idxd_group *group = confdev_to_group(dev); struct idxd_device *idxd = group->idxd; unsigned long val; int rc; @@ -563,8 +555,7 @@ static struct device_attribute 
dev_attr_group_use_token_limit = static ssize_t group_engines_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_group *group = - container_of(dev, struct idxd_group, conf_dev); + struct idxd_group *group = confdev_to_group(dev); int i, rc = 0; struct idxd_device *idxd = group->idxd; @@ -592,8 +583,7 @@ static struct device_attribute dev_attr_group_engines = static ssize_t group_work_queues_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_group *group = - container_of(dev, struct idxd_group, conf_dev); + struct idxd_group *group = confdev_to_group(dev); int i, rc = 0; struct idxd_device *idxd = group->idxd; @@ -622,8 +612,7 @@ static ssize_t group_traffic_class_a_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_group *group = - container_of(dev, struct idxd_group, conf_dev); + struct idxd_group *group = confdev_to_group(dev); return sysfs_emit(buf, "%d\n", group->tc_a); } @@ -632,8 +621,7 @@ static ssize_t group_traffic_class_a_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct idxd_group *group = - container_of(dev, struct idxd_group, conf_dev); + struct idxd_group *group = confdev_to_group(dev); struct idxd_device *idxd = group->idxd; long val; int rc; @@ -663,8 +651,7 @@ static ssize_t group_traffic_class_b_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_group *group = - container_of(dev, struct idxd_group, conf_dev); + struct idxd_group *group = confdev_to_group(dev); return sysfs_emit(buf, "%d\n", group->tc_b); } @@ -673,8 +660,7 @@ static ssize_t group_traffic_class_b_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct idxd_group *group = - container_of(dev, struct idxd_group, conf_dev); + struct idxd_group *group = confdev_to_group(dev); struct idxd_device *idxd = group->idxd; long val; int rc; @@ -722,7 +708,7 @@ static const struct attribute_group *idxd_group_attribute_groups[] = { static void idxd_conf_group_release(struct device *dev) { - struct idxd_group *group = container_of(dev, struct idxd_group, conf_dev); + struct idxd_group *group = confdev_to_group(dev); kfree(group); } @@ -737,7 +723,7 @@ struct device_type idxd_group_device_type = { static ssize_t wq_clients_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); return sysfs_emit(buf, "%d\n", wq->client_count); } @@ -748,7 +734,7 @@ static struct device_attribute dev_attr_wq_clients = static ssize_t wq_state_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); switch (wq->state) { case IDXD_WQ_DISABLED: @@ -766,7 +752,7 @@ static struct device_attribute dev_attr_wq_state = static ssize_t wq_group_id_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); if (wq->group) return sysfs_emit(buf, "%u\n", wq->group->id); @@ -778,7 +764,7 @@ static ssize_t wq_group_id_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); struct idxd_device *idxd = wq->idxd; long id; int rc; @@ -821,7 
+807,7 @@ static struct device_attribute dev_attr_wq_group_id = static ssize_t wq_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); return sysfs_emit(buf, "%s\n", wq_dedicated(wq) ? "dedicated" : "shared"); } @@ -830,7 +816,7 @@ static ssize_t wq_mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); struct idxd_device *idxd = wq->idxd; if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) @@ -857,7 +843,7 @@ static struct device_attribute dev_attr_wq_mode = static ssize_t wq_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); return sysfs_emit(buf, "%u\n", wq->size); } @@ -880,7 +866,7 @@ static ssize_t wq_size_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); unsigned long size; struct idxd_device *idxd = wq->idxd; int rc; @@ -908,7 +894,7 @@ static struct device_attribute dev_attr_wq_size = static ssize_t wq_priority_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); return sysfs_emit(buf, "%u\n", wq->priority); } @@ -917,7 +903,7 @@ static ssize_t wq_priority_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); unsigned long prio; struct idxd_device *idxd = wq->idxd; int rc; @@ -945,7 +931,7 @@ static struct device_attribute dev_attr_wq_priority = static ssize_t wq_block_on_fault_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); return sysfs_emit(buf, "%u\n", test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags)); } @@ -954,7 +940,7 @@ static ssize_t wq_block_on_fault_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); struct idxd_device *idxd = wq->idxd; bool bof; int rc; @@ -984,7 +970,7 @@ static struct device_attribute dev_attr_wq_block_on_fault = static ssize_t wq_threshold_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); return sysfs_emit(buf, "%u\n", wq->threshold); } @@ -993,7 +979,7 @@ static ssize_t wq_threshold_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); struct idxd_device *idxd = wq->idxd; unsigned int val; int rc; @@ -1025,7 +1011,7 @@ static struct device_attribute dev_attr_wq_threshold = static ssize_t wq_type_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); switch 
(wq->type) { case IDXD_WQT_KERNEL: @@ -1044,7 +1030,7 @@ static ssize_t wq_type_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); enum idxd_wq_type old_type; if (wq->state != IDXD_WQ_DISABLED) @@ -1073,7 +1059,7 @@ static struct device_attribute dev_attr_wq_type = static ssize_t wq_name_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); return sysfs_emit(buf, "%s\n", wq->name); } @@ -1082,7 +1068,7 @@ static ssize_t wq_name_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); if (wq->state != IDXD_WQ_DISABLED) return -EPERM; @@ -1109,7 +1095,7 @@ static struct device_attribute dev_attr_wq_name = static ssize_t wq_cdev_minor_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); int minor = -1; mutex_lock(&wq->wq_lock); @@ -1143,7 +1129,7 @@ static int __get_sysfs_u64(const char *buf, u64 *val) static ssize_t wq_max_transfer_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); return sysfs_emit(buf, "%llu\n", wq->max_xfer_bytes); } @@ -1151,7 +1137,7 @@ static ssize_t wq_max_transfer_size_show(struct device *dev, struct device_attri static ssize_t wq_max_transfer_size_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); struct idxd_device *idxd = wq->idxd; u64 xfer_size; int rc; @@ -1177,7 +1163,7 @@ static struct device_attribute dev_attr_wq_max_transfer_size = static ssize_t wq_max_batch_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); return sysfs_emit(buf, "%u\n", wq->max_batch_size); } @@ -1185,7 +1171,7 @@ static ssize_t wq_max_batch_size_show(struct device *dev, struct device_attribut static ssize_t wq_max_batch_size_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); struct idxd_device *idxd = wq->idxd; u64 batch_size; int rc; @@ -1210,7 +1196,7 @@ static struct device_attribute dev_attr_wq_max_batch_size = static ssize_t wq_ats_disable_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); return sysfs_emit(buf, "%u\n", wq->ats_dis); } @@ -1218,7 +1204,7 @@ static ssize_t wq_ats_disable_show(struct device *dev, struct device_attribute * static ssize_t wq_ats_disable_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); struct idxd_device *idxd = wq->idxd; bool ats_dis; int rc; @@ -1289,7 +1275,7 @@ static const struct 
attribute_group *idxd_wq_attribute_groups[] = { static void idxd_conf_wq_release(struct device *dev) { - struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_wq *wq = confdev_to_wq(dev); kfree(wq->wqcfg); kfree(wq); @@ -1305,8 +1291,7 @@ struct device_type idxd_wq_device_type = { static ssize_t version_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); return sysfs_emit(buf, "%#x\n", idxd->hw.version); } @@ -1316,8 +1301,7 @@ static ssize_t max_work_queues_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); return sysfs_emit(buf, "%u\n", idxd->max_wq_size); } @@ -1326,8 +1310,7 @@ static DEVICE_ATTR_RO(max_work_queues_size); static ssize_t max_groups_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); return sysfs_emit(buf, "%u\n", idxd->max_groups); } @@ -1336,8 +1319,7 @@ static DEVICE_ATTR_RO(max_groups); static ssize_t max_work_queues_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); return sysfs_emit(buf, "%u\n", idxd->max_wqs); } @@ -1346,8 +1328,7 @@ static DEVICE_ATTR_RO(max_work_queues); static ssize_t max_engines_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); return sysfs_emit(buf, "%u\n", idxd->max_engines); } @@ -1356,8 +1337,7 @@ static DEVICE_ATTR_RO(max_engines); static ssize_t numa_node_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); return sysfs_emit(buf, "%d\n", dev_to_node(&idxd->pdev->dev)); } @@ -1366,8 +1346,7 @@ static DEVICE_ATTR_RO(numa_node); static ssize_t max_batch_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); return sysfs_emit(buf, "%u\n", idxd->max_batch_size); } @@ -1377,8 +1356,7 @@ static ssize_t max_transfer_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); return sysfs_emit(buf, "%llu\n", idxd->max_xfer_bytes); } @@ -1387,8 +1365,7 @@ static DEVICE_ATTR_RO(max_transfer_size); static ssize_t op_cap_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); int i, rc = 0; for (i = 0; i < 4; i++) @@ -1403,8 +1380,7 @@ static DEVICE_ATTR_RO(op_cap); static ssize_t gen_cap_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); return sysfs_emit(buf, 
"%#llx\n", idxd->hw.gen_cap.bits); } @@ -1413,8 +1389,7 @@ static DEVICE_ATTR_RO(gen_cap); static ssize_t configurable_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); return sysfs_emit(buf, "%u\n", test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)); } @@ -1423,8 +1398,7 @@ static DEVICE_ATTR_RO(configurable); static ssize_t clients_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); unsigned long flags; int count = 0, i; @@ -1443,8 +1417,7 @@ static DEVICE_ATTR_RO(clients); static ssize_t pasid_enabled_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); return sysfs_emit(buf, "%u\n", device_pasid_enabled(idxd)); } @@ -1453,8 +1426,7 @@ static DEVICE_ATTR_RO(pasid_enabled); static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); switch (idxd->state) { case IDXD_DEV_DISABLED: @@ -1473,8 +1445,7 @@ static DEVICE_ATTR_RO(state); static ssize_t errors_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); int i, out = 0; unsigned long flags; @@ -1491,8 +1462,7 @@ static DEVICE_ATTR_RO(errors); static ssize_t max_tokens_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); return sysfs_emit(buf, "%u\n", idxd->max_tokens); } @@ -1501,8 +1471,7 @@ static DEVICE_ATTR_RO(max_tokens); static ssize_t token_limit_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); return sysfs_emit(buf, "%u\n", idxd->token_limit); } @@ -1511,8 +1480,7 @@ static ssize_t token_limit_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); unsigned long val; int rc; @@ -1540,8 +1508,7 @@ static DEVICE_ATTR_RW(token_limit); static ssize_t cdev_major_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = - container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); return sysfs_emit(buf, "%u\n", idxd->major); } @@ -1550,7 +1517,7 @@ static DEVICE_ATTR_RO(cdev_major); static ssize_t cmd_status_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); return sysfs_emit(buf, "%#x\n", idxd->cmd_status); } @@ -1590,7 +1557,7 @@ static const struct attribute_group *idxd_attribute_groups[] = { static void idxd_conf_device_release(struct device *dev) { - struct idxd_device *idxd = container_of(dev, struct idxd_device, 
conf_dev); + struct idxd_device *idxd = confdev_to_idxd(dev); kfree(idxd->groups); kfree(idxd->wqs); @@ -1615,12 +1582,12 @@ struct device_type iax_device_type = { static int idxd_register_engine_devices(struct idxd_device *idxd) { + struct idxd_engine *engine; int i, j, rc; for (i = 0; i < idxd->max_engines; i++) { - struct idxd_engine *engine = idxd->engines[i]; - - rc = device_add(&engine->conf_dev); + engine = idxd->engines[i]; + rc = device_add(engine_confdev(engine)); if (rc < 0) goto cleanup; } @@ -1629,22 +1596,26 @@ static int idxd_register_engine_devices(struct idxd_device *idxd) cleanup: j = i - 1; - for (; i < idxd->max_engines; i++) - put_device(&idxd->engines[i]->conf_dev); + for (; i < idxd->max_engines; i++) { + engine = idxd->engines[i]; + put_device(engine_confdev(engine)); + } - while (j--) - device_unregister(&idxd->engines[j]->conf_dev); + while (j--) { + engine = idxd->engines[j]; + device_unregister(engine_confdev(engine)); + } return rc; } static int idxd_register_group_devices(struct idxd_device *idxd) { + struct idxd_group *group; int i, j, rc; for (i = 0; i < idxd->max_groups; i++) { - struct idxd_group *group = idxd->groups[i]; - - rc = device_add(&group->conf_dev); + group = idxd->groups[i]; + rc = device_add(group_confdev(group)); if (rc < 0) goto cleanup; } @@ -1653,22 +1624,26 @@ static int idxd_register_group_devices(struct idxd_device *idxd) cleanup: j = i - 1; - for (; i < idxd->max_groups; i++) - put_device(&idxd->groups[i]->conf_dev); + for (; i < idxd->max_groups; i++) { + group = idxd->groups[i]; + put_device(group_confdev(group)); + } - while (j--) - device_unregister(&idxd->groups[j]->conf_dev); + while (j--) { + group = idxd->groups[j]; + device_unregister(group_confdev(group)); + } return rc; } static int idxd_register_wq_devices(struct idxd_device *idxd) { + struct idxd_wq *wq; int i, rc, j; for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = idxd->wqs[i]; - - rc = device_add(&wq->conf_dev); + wq = idxd->wqs[i]; + rc = device_add(wq_confdev(wq)); if (rc < 0) goto cleanup; } @@ -1677,11 +1652,15 @@ static int idxd_register_wq_devices(struct idxd_device *idxd) cleanup: j = i - 1; - for (; i < idxd->max_wqs; i++) - put_device(&idxd->wqs[i]->conf_dev); + for (; i < idxd->max_wqs; i++) { + wq = idxd->wqs[i]; + put_device(wq_confdev(wq)); + } - while (j--) - device_unregister(&idxd->wqs[j]->conf_dev); + while (j--) { + wq = idxd->wqs[j]; + device_unregister(wq_confdev(wq)); + } return rc; } @@ -1690,7 +1669,7 @@ int idxd_register_devices(struct idxd_device *idxd) struct device *dev = &idxd->pdev->dev; int rc, i; - rc = device_add(&idxd->conf_dev); + rc = device_add(idxd_confdev(idxd)); if (rc < 0) return rc; @@ -1716,12 +1695,12 @@ int idxd_register_devices(struct idxd_device *idxd) err_group: for (i = 0; i < idxd->max_engines; i++) - device_unregister(&idxd->engines[i]->conf_dev); + device_unregister(engine_confdev(idxd->engines[i])); err_engine: for (i = 0; i < idxd->max_wqs; i++) - device_unregister(&idxd->wqs[i]->conf_dev); + device_unregister(wq_confdev(idxd->wqs[i])); err_wq: - device_del(&idxd->conf_dev); + device_del(idxd_confdev(idxd)); return rc; } @@ -1732,19 +1711,19 @@ void idxd_unregister_devices(struct idxd_device *idxd) for (i = 0; i < idxd->max_wqs; i++) { struct idxd_wq *wq = idxd->wqs[i]; - device_unregister(&wq->conf_dev); + device_unregister(wq_confdev(wq)); } for (i = 0; i < idxd->max_engines; i++) { struct idxd_engine *engine = idxd->engines[i]; - device_unregister(&engine->conf_dev); + 
device_unregister(engine_confdev(engine)); } for (i = 0; i < idxd->max_groups; i++) { struct idxd_group *group = idxd->groups[i]; - device_unregister(&group->conf_dev); + device_unregister(group_confdev(group)); } } -- Gitee From 38840a2ef42308631ba44d85e407e87c438aa5b2 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Jul 2021 11:43:26 -0700 Subject: [PATCH 0643/2161] dmaengine: idxd: remove IDXD_DEV_CONF_READY commit f52058ae11523304a337de249c4c07ba5076f288 upstream. The IDXD_DEV_CONF_READY state flag is no longer needed. The current implementation uses this flag to stop the device from doing configuration until the pci driver probe has completed. With the driver architecture going towards multiple sub-driver attached to the dsa_bus, this is no longer feasible. The sub-drivers will be allowed to probe and return with failure when they are not ready to complete the probe rather than using a state flag to gate the probing. There is no expectation that the devices auto-attach to a driver. Userspace configuration is expected to setup the device before enabling. Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162637460633.744545.8902095097471365420.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 4 ++-- drivers/dma/idxd/idxd.h | 1 - drivers/dma/idxd/init.c | 2 -- drivers/dma/idxd/sysfs.c | 14 -------------- 4 files changed, 2 insertions(+), 19 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index c8cf1de72176..4a2af9799239 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -576,7 +576,7 @@ int idxd_device_disable(struct idxd_device *idxd) spin_lock_irqsave(&idxd->dev_lock, flags); idxd_device_clear_state(idxd); - idxd->state = IDXD_DEV_CONF_READY; + idxd->state = IDXD_DEV_DISABLED; spin_unlock_irqrestore(&idxd->dev_lock, flags); return 0; } @@ -588,7 +588,7 @@ void idxd_device_reset(struct idxd_device *idxd) idxd_cmd_exec(idxd, IDXD_CMD_RESET_DEVICE, 0, NULL); spin_lock_irqsave(&idxd->dev_lock, flags); idxd_device_clear_state(idxd); - idxd->state = IDXD_DEV_CONF_READY; + idxd->state = IDXD_DEV_DISABLED; spin_unlock_irqrestore(&idxd->dev_lock, flags); } diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index ae60fcc7b625..9fc1a88f336d 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -210,7 +210,6 @@ struct idxd_hw { enum idxd_device_state { IDXD_DEV_HALTED = -1, IDXD_DEV_DISABLED = 0, - IDXD_DEV_CONF_READY, IDXD_DEV_ENABLED, }; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index f500076882d2..c22225b14c5d 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -682,8 +682,6 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err_dev_register; } - idxd->state = IDXD_DEV_CONF_READY; - dev_info(&pdev->dev, "Intel(R) Accelerator Device (v%x)\n", idxd->hw.version); diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index f603b11141c4..2a978055e22b 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -22,17 +22,9 @@ static int idxd_config_bus_match(struct device *dev, int matched = 0; if (is_idxd_dev(dev)) { - struct idxd_device *idxd = confdev_to_idxd(dev); - - if (idxd->state != IDXD_DEV_CONF_READY) - return 0; matched = 1; } else if (is_idxd_wq_dev(dev)) { struct idxd_wq *wq = confdev_to_wq(dev); - struct idxd_device *idxd = wq->idxd; - - if (idxd->state < IDXD_DEV_CONF_READY) - 
return 0; if (wq->state != IDXD_WQ_DISABLED) { dev_dbg(dev, "%s not disabled\n", dev_name(dev)); @@ -179,11 +171,6 @@ static int idxd_config_bus_probe(struct device *dev) if (is_idxd_dev(dev)) { struct idxd_device *idxd = confdev_to_idxd(dev); - if (idxd->state != IDXD_DEV_CONF_READY) { - dev_warn(dev, "Device not ready for config\n"); - return -EBUSY; - } - if (!try_module_get(THIS_MODULE)) return -ENXIO; @@ -1430,7 +1417,6 @@ static ssize_t state_show(struct device *dev, switch (idxd->state) { case IDXD_DEV_DISABLED: - case IDXD_DEV_CONF_READY: return sysfs_emit(buf, "disabled\n"); case IDXD_DEV_ENABLED: return sysfs_emit(buf, "enabled\n"); -- Gitee From 1e9908fe512a6e0fbb99f861d95e3d450b310b46 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Jul 2021 11:43:31 -0700 Subject: [PATCH 0644/2161] dmaengine: idxd: move wq_enable() to device.c commit 1f2bb40337f0df1d9af80793e9fdacff7706e654 upstream. Move the wq_enable() function to device.c in preparation of setting up the idxd internal sub-driver framework. No logic changes. Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162637461176.744545.3806109011554118998.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 124 ++++++++++++++++++++++++++++++++++++++ drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/sysfs.c | 124 +------------------------------------- 3 files changed, 126 insertions(+), 123 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 4a2af9799239..b1c509bcfa31 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -1129,3 +1129,127 @@ int idxd_device_load_config(struct idxd_device *idxd) return 0; } + +static int __drv_enable_wq(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + unsigned long flags; + int rc = -ENXIO; + + lockdep_assert_held(&wq->wq_lock); + + if (idxd->state != IDXD_DEV_ENABLED) + goto err; + + if (wq->state != IDXD_WQ_DISABLED) { + dev_dbg(dev, "wq %d already enabled.\n", wq->id); + rc = -EBUSY; + goto err; + } + + if (!wq->group) { + dev_dbg(dev, "wq %d not attached to group.\n", wq->id); + goto err; + } + + if (strlen(wq->name) == 0) { + dev_dbg(dev, "wq %d name not set.\n", wq->id); + goto err; + } + + /* Shared WQ checks */ + if (wq_shared(wq)) { + if (!device_swq_supported(idxd)) { + dev_dbg(dev, "PASID not enabled and shared wq.\n"); + goto err; + } + /* + * Shared wq with the threshold set to 0 means the user + * did not set the threshold or transitioned from a + * dedicated wq but did not set threshold. A value + * of 0 would effectively disable the shared wq. The + * driver does not allow a value of 0 to be set for + * threshold via sysfs. 
+ */ + if (wq->threshold == 0) { + dev_dbg(dev, "Shared wq and threshold 0.\n"); + goto err; + } + } + + rc = idxd_wq_alloc_resources(wq); + if (rc < 0) { + dev_dbg(dev, "wq resource alloc failed\n"); + goto err; + } + + spin_lock_irqsave(&idxd->dev_lock, flags); + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + rc = idxd_device_config(idxd); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + if (rc < 0) { + dev_dbg(dev, "Writing wq %d config failed: %d\n", wq->id, rc); + goto err; + } + + rc = idxd_wq_enable(wq); + if (rc < 0) { + dev_dbg(dev, "wq %d enabling failed: %d\n", wq->id, rc); + goto err; + } + + rc = idxd_wq_map_portal(wq); + if (rc < 0) { + dev_dbg(dev, "wq %d portal mapping failed: %d\n", wq->id, rc); + goto err_map_portal; + } + + wq->client_count = 0; + + if (wq->type == IDXD_WQT_KERNEL) { + rc = idxd_wq_init_percpu_ref(wq); + if (rc < 0) { + dev_dbg(dev, "wq %d percpu_ref setup failed\n", wq->id); + goto err_cpu_ref; + } + } + + if (is_idxd_wq_dmaengine(wq)) { + rc = idxd_register_dma_channel(wq); + if (rc < 0) { + dev_dbg(dev, "wq %d DMA channel register failed\n", wq->id); + goto err_client; + } + } else if (is_idxd_wq_cdev(wq)) { + rc = idxd_wq_add_cdev(wq); + if (rc < 0) { + dev_dbg(dev, "wq %d cdev creation failed\n", wq->id); + goto err_client; + } + } + + dev_info(dev, "wq %s enabled\n", dev_name(wq_confdev(wq))); + return 0; + +err_client: + idxd_wq_quiesce(wq); +err_cpu_ref: + idxd_wq_unmap_portal(wq); +err_map_portal: + rc = idxd_wq_disable(wq, false); + if (rc < 0) + dev_dbg(dev, "wq %s disable failed\n", dev_name(wq_confdev(wq))); +err: + return rc; +} + +int drv_enable_wq(struct idxd_wq *wq) +{ + int rc; + + mutex_lock(&wq->wq_lock); + rc = __drv_enable_wq(wq); + mutex_unlock(&wq->wq_lock); + return rc; +} diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 9fc1a88f336d..551a61fa1aff 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -495,6 +495,7 @@ void idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id); void idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id); /* device control */ +int drv_enable_wq(struct idxd_wq *wq); int idxd_device_init_reset(struct idxd_device *idxd); int idxd_device_enable(struct idxd_device *idxd); int idxd_device_disable(struct idxd_device *idxd); diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 2a978055e22b..3e8cc07ebcdc 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -39,128 +39,6 @@ static int idxd_config_bus_match(struct device *dev, return matched; } -static int enable_wq(struct idxd_wq *wq) -{ - struct idxd_device *idxd = wq->idxd; - struct device *dev = &idxd->pdev->dev; - unsigned long flags; - int rc; - - mutex_lock(&wq->wq_lock); - - if (idxd->state != IDXD_DEV_ENABLED) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "Enabling while device not enabled.\n"); - return -EPERM; - } - - if (wq->state != IDXD_WQ_DISABLED) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "WQ %d already enabled.\n", wq->id); - return -EBUSY; - } - - if (!wq->group) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "WQ not attached to group.\n"); - return -EINVAL; - } - - if (strlen(wq->name) == 0) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "WQ name not set.\n"); - return -EINVAL; - } - - /* Shared WQ checks */ - if (wq_shared(wq)) { - if (!device_swq_supported(idxd)) { - dev_warn(dev, "PASID not enabled and shared WQ.\n"); - mutex_unlock(&wq->wq_lock); - return -ENXIO; - } - /* - * Shared wq with the threshold set to 0 means the user - * 
did not set the threshold or transitioned from a - * dedicated wq but did not set threshold. A value - * of 0 would effectively disable the shared wq. The - * driver does not allow a value of 0 to be set for - * threshold via sysfs. - */ - if (wq->threshold == 0) { - dev_warn(dev, "Shared WQ and threshold 0.\n"); - mutex_unlock(&wq->wq_lock); - return -EINVAL; - } - } - - rc = idxd_wq_alloc_resources(wq); - if (rc < 0) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "WQ resource alloc failed\n"); - return rc; - } - - spin_lock_irqsave(&idxd->dev_lock, flags); - if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) - rc = idxd_device_config(idxd); - spin_unlock_irqrestore(&idxd->dev_lock, flags); - if (rc < 0) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "Writing WQ %d config failed: %d\n", wq->id, rc); - return rc; - } - - rc = idxd_wq_enable(wq); - if (rc < 0) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "WQ %d enabling failed: %d\n", wq->id, rc); - return rc; - } - - rc = idxd_wq_map_portal(wq); - if (rc < 0) { - dev_warn(dev, "wq portal mapping failed: %d\n", rc); - rc = idxd_wq_disable(wq, false); - if (rc < 0) - dev_warn(dev, "IDXD wq disable failed\n"); - mutex_unlock(&wq->wq_lock); - return rc; - } - - wq->client_count = 0; - - if (wq->type == IDXD_WQT_KERNEL) { - rc = idxd_wq_init_percpu_ref(wq); - if (rc < 0) { - dev_dbg(dev, "percpu_ref setup failed\n"); - mutex_unlock(&wq->wq_lock); - return rc; - } - } - - if (is_idxd_wq_dmaengine(wq)) { - rc = idxd_register_dma_channel(wq); - if (rc < 0) { - dev_dbg(dev, "DMA channel register failed\n"); - mutex_unlock(&wq->wq_lock); - return rc; - } - } else if (is_idxd_wq_cdev(wq)) { - rc = idxd_wq_add_cdev(wq); - if (rc < 0) { - dev_dbg(dev, "Cdev creation failed\n"); - mutex_unlock(&wq->wq_lock); - return rc; - } - } - - mutex_unlock(&wq->wq_lock); - dev_info(dev, "wq %s enabled\n", dev_name(wq_confdev(wq))); - - return 0; -} - static int idxd_config_bus_probe(struct device *dev) { int rc = 0; @@ -205,7 +83,7 @@ static int idxd_config_bus_probe(struct device *dev) } else if (is_idxd_wq_dev(dev)) { struct idxd_wq *wq = confdev_to_wq(dev); - return enable_wq(wq); + return drv_enable_wq(wq); } return -ENODEV; -- Gitee From 7f366b6b26133393bcc9617e8188da8d1cb8afad Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Jul 2021 11:43:37 -0700 Subject: [PATCH 0645/2161] dmaengine: idxd: move wq_disable() to device.c commit 69e4f8be596d897679e44e86a323629537c02975 upstream. Move the wq_disable() function to device.c in preparation of setting up the idxd internal sub-driver framework. No logic changes. 
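Both this move and the previous one settle on the same locking convention for the device.c helpers: a double-underscore variant that assumes wq->wq_lock is already held (and documents that with lockdep), wrapped by a plain variant that takes the lock for callers that do not. A minimal sketch of that convention, using hypothetical example_op() names rather than the real helpers:

static void __example_op(struct idxd_wq *wq)
{
	lockdep_assert_held(&wq->wq_lock);
	/* work that needs the wq state to stay stable */
}

static void example_op(struct idxd_wq *wq)
{
	mutex_lock(&wq->wq_lock);
	__example_op(wq);
	mutex_unlock(&wq->wq_lock);
}

The sub-drivers added later in this series take wq->wq_lock in their own probe paths and call the locked __drv_enable_wq()/__drv_disable_wq() variants directly.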
Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162637461775.744545.9644048686618957886.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 37 +++++++++++++++++++++++++++++++++++++ drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/sysfs.c | 38 +------------------------------------- 3 files changed, 39 insertions(+), 37 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index b1c509bcfa31..8d8e249931a9 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -1253,3 +1253,40 @@ int drv_enable_wq(struct idxd_wq *wq) mutex_unlock(&wq->wq_lock); return rc; } + +static void __drv_disable_wq(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + + lockdep_assert_held(&wq->wq_lock); + + if (wq->type == IDXD_WQT_KERNEL) + idxd_wq_quiesce(wq); + + if (is_idxd_wq_dmaengine(wq)) + idxd_unregister_dma_channel(wq); + else if (is_idxd_wq_cdev(wq)) + idxd_wq_del_cdev(wq); + + if (idxd_wq_refcount(wq)) + dev_warn(dev, "Clients has claim on wq %d: %d\n", + wq->id, idxd_wq_refcount(wq)); + + idxd_wq_unmap_portal(wq); + + idxd_wq_drain(wq); + idxd_wq_reset(wq); + + idxd_wq_free_resources(wq); + wq->client_count = 0; + + dev_info(dev, "wq %s disabled\n", dev_name(wq_confdev(wq))); +} + +void drv_disable_wq(struct idxd_wq *wq) +{ + mutex_lock(&wq->wq_lock); + __drv_disable_wq(wq); + mutex_unlock(&wq->wq_lock); +} diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 551a61fa1aff..e96ddbfc4569 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -496,6 +496,7 @@ void idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id); /* device control */ int drv_enable_wq(struct idxd_wq *wq); +void drv_disable_wq(struct idxd_wq *wq); int idxd_device_init_reset(struct idxd_device *idxd); int idxd_device_enable(struct idxd_device *idxd); int idxd_device_disable(struct idxd_device *idxd); diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 3e8cc07ebcdc..9967fad58a01 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -89,42 +89,6 @@ static int idxd_config_bus_probe(struct device *dev) return -ENODEV; } -static void disable_wq(struct idxd_wq *wq) -{ - struct idxd_device *idxd = wq->idxd; - struct device *dev = &idxd->pdev->dev; - - mutex_lock(&wq->wq_lock); - dev_dbg(dev, "%s removing WQ %s\n", __func__, dev_name(wq_confdev(wq))); - if (wq->state == IDXD_WQ_DISABLED) { - mutex_unlock(&wq->wq_lock); - return; - } - - if (wq->type == IDXD_WQT_KERNEL) - idxd_wq_quiesce(wq); - - if (is_idxd_wq_dmaengine(wq)) - idxd_unregister_dma_channel(wq); - else if (is_idxd_wq_cdev(wq)) - idxd_wq_del_cdev(wq); - - if (idxd_wq_refcount(wq)) - dev_warn(dev, "Clients has claim on wq %d: %d\n", - wq->id, idxd_wq_refcount(wq)); - - idxd_wq_unmap_portal(wq); - - idxd_wq_drain(wq); - idxd_wq_reset(wq); - - idxd_wq_free_resources(wq); - wq->client_count = 0; - mutex_unlock(&wq->wq_lock); - - dev_info(dev, "wq %s disabled\n", dev_name(wq_confdev(wq))); -} - static int idxd_config_bus_remove(struct device *dev) { dev_dbg(dev, "%s called for %s\n", __func__, dev_name(dev)); @@ -133,7 +97,7 @@ static int idxd_config_bus_remove(struct device *dev) if (is_idxd_wq_dev(dev)) { struct idxd_wq *wq = confdev_to_wq(dev); - disable_wq(wq); + drv_disable_wq(wq); } else if (is_idxd_dev(dev)) { struct idxd_device *idxd = confdev_to_idxd(dev); int i; -- Gitee From 
73483b57003ad840877b85d266c1f4a6975bc612 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Jul 2021 11:43:43 -0700 Subject: [PATCH 0646/2161] dmaengine: idxd: remove bus shutdown commit 3a5cc01647f07431b342e9703cda0542457ec467 upstream. Remove ->shutdown() function for the dsa bus as it does not do anything and is not necessary. Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162637462319.744545.10383189484257042066.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 9967fad58a01..c3c869d8119a 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -126,17 +126,11 @@ static int idxd_config_bus_remove(struct device *dev) return 0; } -static void idxd_config_bus_shutdown(struct device *dev) -{ - dev_dbg(dev, "%s called\n", __func__); -} - struct bus_type dsa_bus_type = { .name = "dsa", .match = idxd_config_bus_match, .probe = idxd_config_bus_probe, .remove = idxd_config_bus_remove, - .shutdown = idxd_config_bus_shutdown, }; static struct idxd_device_driver dsa_drv = { -- Gitee From b8785f8816d087ce53c2b4bb92e2a6283c5a1efb Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Jul 2021 11:43:49 -0700 Subject: [PATCH 0647/2161] dmaengine: idxd: remove iax_bus_type prototype commit 1c264299431e9a105f3974ad49b6bccc3f03540f upstream. Remove unused iax_bus_type prototype declaration. Should have been removed when iax_bus_type was removed. Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162637462909.744545.7106049898386277608.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/idxd.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index e96ddbfc4569..4c3d3eb94450 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -370,7 +370,6 @@ static inline void idxd_dev_set_type(struct idxd_dev *idev, int type) } extern struct bus_type dsa_bus_type; -extern struct bus_type iax_bus_type; extern bool support_enqcmd; extern struct ida idxd_ida; -- Gitee From 13bef8d7a2bb663b92b8fa3c3a2da23bf04c027e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Jul 2021 11:43:55 -0700 Subject: [PATCH 0648/2161] dmaengine: idxd: fix bus_probe() and bus_remove() for dsa_bus commit fcc2281b142bf14e3534d6b1150991194f8d1d44 upstream. Current implementation have put all the code that should be in a driver probe/remove in the bus probe/remove function. Add ->probe() and ->remove() support for the dsa_drv and move all those code out of bus probe/remove. The change does not split out the distinction between device sub-driver and wq sub-driver. It only cleans up the bus calls. The split out will be addressed in follow on patches. 
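With probe and remove now members of struct idxd_device_driver, a sub-driver on the dsa bus reduces to a small structure plus two callbacks. The sketch below is illustrative only (example_drv and its callbacks are hypothetical, not part of this patch); it shows the shape the later sub-driver patches follow, registered and unregistered with the existing idxd_driver_register()/idxd_driver_unregister() helpers:

static int example_drv_probe(struct idxd_dev *idxd_dev)
{
	if (!is_idxd_wq_dev(idxd_dev))
		return -ENODEV;

	/* claim and configure the device here */
	return 0;
}

static void example_drv_remove(struct idxd_dev *idxd_dev)
{
	/* undo whatever probe set up */
}

static struct idxd_device_driver example_drv = {
	.name   = "example",
	.probe  = example_drv_probe,
	.remove = example_drv_remove,
};

The bus layer no longer inspects device state at all: idxd_config_bus_probe() and idxd_config_bus_remove() resolve the idxd_device_driver from dev->driver and forward the struct idxd_dev to these callbacks.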
Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162637463586.744545.5806250155539938643.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/idxd.h | 24 +++++---- drivers/dma/idxd/sysfs.c | 108 ++++++++++++++++++++------------------- 2 files changed, 69 insertions(+), 63 deletions(-) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 4c3d3eb94450..493958ecc208 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -51,6 +51,8 @@ enum idxd_type { struct idxd_device_driver { const char *name; + int (*probe)(struct idxd_dev *idxd_dev); + void (*remove)(struct idxd_dev *idxd_dev); struct device_driver drv; }; @@ -323,19 +325,21 @@ enum idxd_completion_status { #define cdev_dev(cdev) &cdev->idxd_dev.conf_dev #define confdev_to_idxd_dev(dev) container_of(dev, struct idxd_dev, conf_dev) +#define idxd_dev_to_idxd(idxd_dev) container_of(idxd_dev, struct idxd_device, idxd_dev) +#define idxd_dev_to_wq(idxd_dev) container_of(idxd_dev, struct idxd_wq, idxd_dev) static inline struct idxd_device *confdev_to_idxd(struct device *dev) { struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); - return container_of(idxd_dev, struct idxd_device, idxd_dev); + return idxd_dev_to_idxd(idxd_dev); } static inline struct idxd_wq *confdev_to_wq(struct device *dev) { struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); - return container_of(idxd_dev, struct idxd_wq, idxd_dev); + return idxd_dev_to_wq(idxd_dev); } static inline struct idxd_engine *confdev_to_engine(struct device *dev) @@ -379,24 +383,24 @@ extern struct device_type idxd_wq_device_type; extern struct device_type idxd_engine_device_type; extern struct device_type idxd_group_device_type; -static inline bool is_dsa_dev(struct device *dev) +static inline bool is_dsa_dev(struct idxd_dev *idxd_dev) { - return dev->type == &dsa_device_type; + return idxd_dev->type == IDXD_DEV_DSA; } -static inline bool is_iax_dev(struct device *dev) +static inline bool is_iax_dev(struct idxd_dev *idxd_dev) { - return dev->type == &iax_device_type; + return idxd_dev->type == IDXD_DEV_IAX; } -static inline bool is_idxd_dev(struct device *dev) +static inline bool is_idxd_dev(struct idxd_dev *idxd_dev) { - return is_dsa_dev(dev) || is_iax_dev(dev); + return is_dsa_dev(idxd_dev) || is_iax_dev(idxd_dev); } -static inline bool is_idxd_wq_dev(struct device *dev) +static inline bool is_idxd_wq_dev(struct idxd_dev *idxd_dev) { - return dev->type == &idxd_wq_device_type; + return idxd_dev->type == IDXD_DEV_WQ; } static inline bool is_idxd_wq_dmaengine(struct idxd_wq *wq) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index c3c869d8119a..f82416eec926 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -19,69 +19,80 @@ static char *idxd_wq_type_names[] = { static int idxd_config_bus_match(struct device *dev, struct device_driver *drv) { - int matched = 0; + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); - if (is_idxd_dev(dev)) { - matched = 1; - } else if (is_idxd_wq_dev(dev)) { - struct idxd_wq *wq = confdev_to_wq(dev); + return (is_idxd_dev(idxd_dev) || is_idxd_wq_dev(idxd_dev)); +} - if (wq->state != IDXD_WQ_DISABLED) { - dev_dbg(dev, "%s not disabled\n", dev_name(dev)); - return 0; - } - matched = 1; - } +static int idxd_config_bus_probe(struct device *dev) +{ + struct idxd_device_driver *idxd_drv = + container_of(dev->driver, struct idxd_device_driver, drv); + struct idxd_dev *idxd_dev = 
confdev_to_idxd_dev(dev); - if (matched) - dev_dbg(dev, "%s matched\n", dev_name(dev)); + return idxd_drv->probe(idxd_dev); +} - return matched; +static int idxd_config_bus_remove(struct device *dev) +{ + struct idxd_device_driver *idxd_drv = + container_of(dev->driver, struct idxd_device_driver, drv); + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + idxd_drv->remove(idxd_dev); + return 0; } -static int idxd_config_bus_probe(struct device *dev) +struct bus_type dsa_bus_type = { + .name = "dsa", + .match = idxd_config_bus_match, + .probe = idxd_config_bus_probe, + .remove = idxd_config_bus_remove, +}; + +static int idxd_dsa_drv_probe(struct idxd_dev *idxd_dev) { - int rc = 0; + struct device *dev = &idxd_dev->conf_dev; unsigned long flags; + int rc; - dev_dbg(dev, "%s called\n", __func__); - - if (is_idxd_dev(dev)) { - struct idxd_device *idxd = confdev_to_idxd(dev); + if (is_idxd_dev(idxd_dev)) { + struct idxd_device *idxd = idxd_dev_to_idxd(idxd_dev); - if (!try_module_get(THIS_MODULE)) + if (idxd->state != IDXD_DEV_DISABLED) return -ENXIO; - /* Perform IDXD configuration and enabling */ + /* Device configuration */ spin_lock_irqsave(&idxd->dev_lock, flags); if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) rc = idxd_device_config(idxd); spin_unlock_irqrestore(&idxd->dev_lock, flags); if (rc < 0) { - module_put(THIS_MODULE); - dev_warn(dev, "Device config failed: %d\n", rc); + dev_dbg(dev, "Device config failed: %d\n", rc); return rc; } - /* start device */ + /* Start device */ rc = idxd_device_enable(idxd); if (rc < 0) { - module_put(THIS_MODULE); dev_warn(dev, "Device enable failed: %d\n", rc); return rc; } - dev_info(dev, "Device %s enabled\n", dev_name(dev)); - + /* Setup DMA device without channels */ rc = idxd_register_dma_device(idxd); if (rc < 0) { - module_put(THIS_MODULE); dev_dbg(dev, "Failed to register dmaengine device\n"); + idxd_device_disable(idxd); return rc; } + + dev_info(dev, "Device %s enabled\n", dev_name(dev)); return 0; - } else if (is_idxd_wq_dev(dev)) { - struct idxd_wq *wq = confdev_to_wq(dev); + } + + if (is_idxd_wq_dev(idxd_dev)) { + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); return drv_enable_wq(wq); } @@ -89,21 +100,14 @@ static int idxd_config_bus_probe(struct device *dev) return -ENODEV; } -static int idxd_config_bus_remove(struct device *dev) +static void idxd_dsa_drv_remove(struct idxd_dev *idxd_dev) { - dev_dbg(dev, "%s called for %s\n", __func__, dev_name(dev)); - - /* disable workqueue here */ - if (is_idxd_wq_dev(dev)) { - struct idxd_wq *wq = confdev_to_wq(dev); + struct device *dev = &idxd_dev->conf_dev; - drv_disable_wq(wq); - } else if (is_idxd_dev(dev)) { - struct idxd_device *idxd = confdev_to_idxd(dev); + if (is_idxd_dev(idxd_dev)) { + struct idxd_device *idxd = idxd_dev_to_idxd(idxd_dev); int i; - dev_dbg(dev, "%s removing dev %s\n", __func__, - dev_name(idxd_confdev(idxd))); for (i = 0; i < idxd->max_wqs; i++) { struct idxd_wq *wq = idxd->wqs[i]; @@ -118,23 +122,21 @@ static int idxd_config_bus_remove(struct device *dev) idxd_device_disable(idxd); if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) idxd_device_reset(idxd); - module_put(THIS_MODULE); - - dev_info(dev, "Device %s disabled\n", dev_name(dev)); + return; } - return 0; -} + if (is_idxd_wq_dev(idxd_dev)) { + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); -struct bus_type dsa_bus_type = { - .name = "dsa", - .match = idxd_config_bus_match, - .probe = idxd_config_bus_probe, - .remove = idxd_config_bus_remove, -}; + drv_disable_wq(wq); + return; + } +} static struct 
idxd_device_driver dsa_drv = { .name = "dsa", + .probe = idxd_dsa_drv_probe, + .remove = idxd_dsa_drv_remove, }; /* IDXD generic driver setup */ -- Gitee From dc9b3fec04d7ef885df62833608e606445b7ed2a Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Jul 2021 11:44:01 -0700 Subject: [PATCH 0649/2161] dmaengine: idxd: move probe() bits for idxd 'struct device' to device.c commit bd42805b5da33b9c75f3ce0ae9d6ff0ec3f2cd6b upstream. Move the code related to a ->probe() function for the idxd 'struct device' to device.c to prep for the idxd device sub-driver in device.c. Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162637464189.744545.17423830646786162194.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 37 ++++++++++++++++++++++++++++++++++++ drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/sysfs.c | 40 ++------------------------------------- 3 files changed, 40 insertions(+), 38 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 8d8e249931a9..b9aa209efee4 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -1290,3 +1290,40 @@ void drv_disable_wq(struct idxd_wq *wq) __drv_disable_wq(wq); mutex_unlock(&wq->wq_lock); } + +int idxd_device_drv_probe(struct idxd_dev *idxd_dev) +{ + struct idxd_device *idxd = idxd_dev_to_idxd(idxd_dev); + unsigned long flags; + int rc = 0; + + /* + * Device should be in disabled state for the idxd_drv to load. If it's in + * enabled state, then the device was altered outside of driver's control. + * If the state is in halted state, then we don't want to proceed. + */ + if (idxd->state != IDXD_DEV_DISABLED) + return -ENXIO; + + /* Device configuration */ + spin_lock_irqsave(&idxd->dev_lock, flags); + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + rc = idxd_device_config(idxd); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + if (rc < 0) + return -ENXIO; + + /* Start device */ + rc = idxd_device_enable(idxd); + if (rc < 0) + return rc; + + /* Setup DMA device without channels */ + rc = idxd_register_dma_device(idxd); + if (rc < 0) { + idxd_device_disable(idxd); + return rc; + } + + return 0; +} diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 493958ecc208..dbbd36feb462 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -498,6 +498,7 @@ void idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id); void idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id); /* device control */ +int idxd_device_drv_probe(struct idxd_dev *idxd_dev); int drv_enable_wq(struct idxd_wq *wq); void drv_disable_wq(struct idxd_wq *wq); int idxd_device_init_reset(struct idxd_device *idxd); diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index f82416eec926..221a61e3bb9c 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -52,44 +52,8 @@ struct bus_type dsa_bus_type = { static int idxd_dsa_drv_probe(struct idxd_dev *idxd_dev) { - struct device *dev = &idxd_dev->conf_dev; - unsigned long flags; - int rc; - - if (is_idxd_dev(idxd_dev)) { - struct idxd_device *idxd = idxd_dev_to_idxd(idxd_dev); - - if (idxd->state != IDXD_DEV_DISABLED) - return -ENXIO; - - /* Device configuration */ - spin_lock_irqsave(&idxd->dev_lock, flags); - if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) - rc = idxd_device_config(idxd); - spin_unlock_irqrestore(&idxd->dev_lock, flags); - if (rc < 0) { - dev_dbg(dev, "Device config failed: 
%d\n", rc); - return rc; - } - - /* Start device */ - rc = idxd_device_enable(idxd); - if (rc < 0) { - dev_warn(dev, "Device enable failed: %d\n", rc); - return rc; - } - - /* Setup DMA device without channels */ - rc = idxd_register_dma_device(idxd); - if (rc < 0) { - dev_dbg(dev, "Failed to register dmaengine device\n"); - idxd_device_disable(idxd); - return rc; - } - - dev_info(dev, "Device %s enabled\n", dev_name(dev)); - return 0; - } + if (is_idxd_dev(idxd_dev)) + return idxd_device_drv_probe(idxd_dev); if (is_idxd_wq_dev(idxd_dev)) { struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); -- Gitee From e786a9d95902cc5f772468fa739a75aa7bf09985 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Jul 2021 11:44:07 -0700 Subject: [PATCH 0650/2161] dmaengine: idxd: idxd: move remove() bits for idxd 'struct device' to device.c commit 745e92a6d816277fcbd231bda5ad2d882b22fe52 upstream. Move the code related to a ->remove() function for the idxd 'struct device' to device.c to prep for the idxd device sub-driver in device.c. Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162637464768.744545.15797285510999151668.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 22 ++++++++++++++++++++++ drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/sysfs.c | 20 +------------------- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index b9aa209efee4..d5a0b6fff3b9 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -1327,3 +1327,25 @@ int idxd_device_drv_probe(struct idxd_dev *idxd_dev) return 0; } + +void idxd_device_drv_remove(struct idxd_dev *idxd_dev) +{ + struct device *dev = &idxd_dev->conf_dev; + struct idxd_device *idxd = idxd_dev_to_idxd(idxd_dev); + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + struct device *wq_dev = wq_confdev(wq); + + if (wq->state == IDXD_WQ_DISABLED) + continue; + dev_warn(dev, "Active wq %d on disable %s.\n", i, dev_name(wq_dev)); + device_release_driver(wq_dev); + } + + idxd_unregister_dma_device(idxd); + idxd_device_disable(idxd); + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + idxd_device_reset(idxd); +} diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index dbbd36feb462..1c8abba13470 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -499,6 +499,7 @@ void idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id); /* device control */ int idxd_device_drv_probe(struct idxd_dev *idxd_dev); +void idxd_device_drv_remove(struct idxd_dev *idxd_dev); int drv_enable_wq(struct idxd_wq *wq); void drv_disable_wq(struct idxd_wq *wq); int idxd_device_init_reset(struct idxd_device *idxd); diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 221a61e3bb9c..abea8aca6799 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -66,26 +66,8 @@ static int idxd_dsa_drv_probe(struct idxd_dev *idxd_dev) static void idxd_dsa_drv_remove(struct idxd_dev *idxd_dev) { - struct device *dev = &idxd_dev->conf_dev; - if (is_idxd_dev(idxd_dev)) { - struct idxd_device *idxd = idxd_dev_to_idxd(idxd_dev); - int i; - - for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = idxd->wqs[i]; - - if (wq->state == IDXD_WQ_DISABLED) - continue; - dev_warn(dev, "Active wq %d on disable %s.\n", i, - dev_name(wq_confdev(wq))); - device_release_driver(wq_confdev(wq)); - } - - 
idxd_unregister_dma_device(idxd); - idxd_device_disable(idxd); - if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) - idxd_device_reset(idxd); + idxd_device_drv_remove(idxd_dev); return; } -- Gitee From 6ad63b987565f3c942ce566d278e99e535633afd Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Jul 2021 11:44:13 -0700 Subject: [PATCH 0651/2161] dmanegine: idxd: open code the dsa_drv registration commit c05257b5600bb35a580ecdb25695efff26326d59 upstream. Don't need a wrapper to register the driver. Just do it directly. Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162637465319.744545.16325178432532362906.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/idxd.h | 2 ++ drivers/dma/idxd/init.c | 10 +++++----- drivers/dma/idxd/sysfs.c | 13 +------------ 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 1c8abba13470..7fc26b7727c0 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -56,6 +56,8 @@ struct idxd_device_driver { struct device_driver drv; }; +extern struct idxd_device_driver dsa_drv; + struct idxd_irq_entry { struct idxd_device *idxd; int id; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index c22225b14c5d..5b628e6c04bf 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -840,9 +840,9 @@ static int __init idxd_init_module(void) if (err < 0) return err; - err = idxd_register_driver(); + err = idxd_driver_register(&dsa_drv); if (err < 0) - goto err_idxd_driver_register; + goto err_dsa_driver_register; err = idxd_cdev_register(); if (err) @@ -857,8 +857,8 @@ static int __init idxd_init_module(void) err_pci_register: idxd_cdev_remove(); err_cdev_register: - idxd_unregister_driver(); -err_idxd_driver_register: + idxd_driver_unregister(&dsa_drv); +err_dsa_driver_register: idxd_unregister_bus_type(); return err; } @@ -866,7 +866,7 @@ module_init(idxd_init_module); static void __exit idxd_exit_module(void) { - idxd_unregister_driver(); + idxd_driver_unregister(&dsa_drv); pci_unregister_driver(&idxd_pci_driver); idxd_cdev_remove(); idxd_unregister_bus_type(); diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index abea8aca6799..9f2d06c2aa98 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -79,23 +79,12 @@ static void idxd_dsa_drv_remove(struct idxd_dev *idxd_dev) } } -static struct idxd_device_driver dsa_drv = { +struct idxd_device_driver dsa_drv = { .name = "dsa", .probe = idxd_dsa_drv_probe, .remove = idxd_dsa_drv_remove, }; -/* IDXD generic driver setup */ -int idxd_register_driver(void) -{ - return idxd_driver_register(&dsa_drv); -} - -void idxd_unregister_driver(void) -{ - idxd_driver_unregister(&dsa_drv); -} - /* IDXD engine attributes */ static ssize_t engine_group_id_show(struct device *dev, struct device_attribute *attr, char *buf) -- Gitee From 27e6efbb0522e6593bf92bc714dece9123b3a818 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Jul 2021 11:44:18 -0700 Subject: [PATCH 0652/2161] dmaengine: idxd: add type to driver in order to allow device matching commit 5fee6567ec387088ec965ee60c63051bbe102cac upstream. Add an array of support device types to the idxd_device_driver definition in order to enable simple matching of device type to a given driver. 
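As a hypothetical illustration of the new matching rule (example_wq_drv and its stub callbacks are not part of the patch), a driver that should bind only work queue devices advertises that with an IDXD_DEV_NONE-terminated array:

static int example_wq_probe(struct idxd_dev *idxd_dev)
{
	return 0;
}

static void example_wq_remove(struct idxd_dev *idxd_dev)
{
}

static enum idxd_dev_type example_dev_types[] = {
	IDXD_DEV_WQ,	/* bind work queue devices only */
	IDXD_DEV_NONE,	/* terminator walked by idxd_config_bus_match() */
};

static struct idxd_device_driver example_wq_drv = {
	.name   = "example",
	.type   = example_dev_types,	/* required; __idxd_driver_register() rejects a NULL type */
	.probe  = example_wq_probe,
	.remove = example_wq_remove,
};

idxd_config_bus_match() walks the array and reports a match as soon as the device's type appears in it, so no driver-specific checks remain in the bus code.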
The deprecated / omnibus dsa_drv driver specifies IDXD_DEV_NONE as its only role is to service legacy userspace (old accel-config) directed bind requests and route them to them the proper driver. It need not attach to a device when the bus is autoprobed. The accel-config tooling is being updated to drop its dependency on this deprecated bind scheme. Reviewed-by: Dan Willliams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162637465882.744545.17456174666211577867.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/init.c | 5 +++++ drivers/dma/idxd/sysfs.c | 16 +++++++++++++++- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 7fc26b7727c0..4bb5a65ec237 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -51,6 +51,7 @@ enum idxd_type { struct idxd_device_driver { const char *name; + enum idxd_dev_type *type; int (*probe)(struct idxd_dev *idxd_dev); void (*remove)(struct idxd_dev *idxd_dev); struct device_driver drv; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 5b628e6c04bf..544ff7137292 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -879,6 +879,11 @@ int __idxd_driver_register(struct idxd_device_driver *idxd_drv, struct module *o { struct device_driver *drv = &idxd_drv->drv; + if (!idxd_drv->type) { + pr_debug("driver type not set (%ps)\n", __builtin_return_address(0)); + return -EINVAL; + } + drv->name = idxd_drv->name; drv->bus = &dsa_bus_type; drv->owner = owner; diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 9f2d06c2aa98..8d48903df131 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -19,9 +19,18 @@ static char *idxd_wq_type_names[] = { static int idxd_config_bus_match(struct device *dev, struct device_driver *drv) { + struct idxd_device_driver *idxd_drv = + container_of(drv, struct idxd_device_driver, drv); struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + int i = 0; + + while (idxd_drv->type[i] != IDXD_DEV_NONE) { + if (idxd_dev->type == idxd_drv->type[i]) + return 1; + i++; + } - return (is_idxd_dev(idxd_dev) || is_idxd_wq_dev(idxd_dev)); + return 0; } static int idxd_config_bus_probe(struct device *dev) @@ -79,10 +88,15 @@ static void idxd_dsa_drv_remove(struct idxd_dev *idxd_dev) } } +static enum idxd_dev_type dev_types[] = { + IDXD_DEV_NONE, +}; + struct idxd_device_driver dsa_drv = { .name = "dsa", .probe = idxd_dsa_drv_probe, .remove = idxd_dsa_drv_remove, + .type = dev_types, }; /* IDXD engine attributes */ -- Gitee From 7fbd0a758dd9750106f0f15a2a13f2eee244cdde Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Jul 2021 11:44:24 -0700 Subject: [PATCH 0653/2161] dmaengine: idxd: create idxd_device sub-driver commit 034b3290ba257f1a3c8730f3fba72e11645e7b50 upstream. The original architecture of /sys/bus/dsa invented a scheme whereby a single entry in the list of bus drivers, /sys/bus/drivers/dsa, handled all device types and internally routed them to different drivers. Those internal drivers were invisible to userspace. Now, as /sys/bus/dsa wants to grow support for alternate drivers for a given device, for example vfio-mdev instead of kernel-internal-dmaengine, a proper bus device-driver model is needed. The first step in that process is separating the existing omnibus/implicit "dsa" driver into proper individual drivers registered on /sys/bus/dsa. 
Establish the idxd_drv driver that control the enabling and disabling of the accelerator device. Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162637466439.744545.15210886092627144577.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 13 +++++++++++++ drivers/dma/idxd/idxd.h | 3 +++ drivers/dma/idxd/init.c | 7 +++++++ 3 files changed, 23 insertions(+) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index d5a0b6fff3b9..12ae3f1639f1 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -1349,3 +1349,16 @@ void idxd_device_drv_remove(struct idxd_dev *idxd_dev) if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) idxd_device_reset(idxd); } + +static enum idxd_dev_type dev_types[] = { + IDXD_DEV_DSA, + IDXD_DEV_IAX, + IDXD_DEV_NONE, +}; + +struct idxd_device_driver idxd_drv = { + .type = dev_types, + .probe = idxd_device_drv_probe, + .remove = idxd_device_drv_remove, + .name = "idxd", +}; diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 4bb5a65ec237..a356d227f755 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -58,6 +58,7 @@ struct idxd_device_driver { }; extern struct idxd_device_driver dsa_drv; +extern struct idxd_device_driver idxd_drv; struct idxd_irq_entry { struct idxd_device *idxd; @@ -501,6 +502,8 @@ void idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id); void idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id); /* device control */ +int idxd_register_idxd_drv(void); +void idxd_unregister_idxd_drv(void); int idxd_device_drv_probe(struct idxd_dev *idxd_dev); void idxd_device_drv_remove(struct idxd_dev *idxd_dev); int drv_enable_wq(struct idxd_wq *wq); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 544ff7137292..c19b03c17ab9 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -840,6 +840,10 @@ static int __init idxd_init_module(void) if (err < 0) return err; + err = idxd_driver_register(&idxd_drv); + if (err < 0) + goto err_idxd_driver_register; + err = idxd_driver_register(&dsa_drv); if (err < 0) goto err_dsa_driver_register; @@ -859,6 +863,8 @@ static int __init idxd_init_module(void) err_cdev_register: idxd_driver_unregister(&dsa_drv); err_dsa_driver_register: + idxd_driver_unregister(&idxd_drv); +err_idxd_driver_register: idxd_unregister_bus_type(); return err; } @@ -866,6 +872,7 @@ module_init(idxd_init_module); static void __exit idxd_exit_module(void) { + idxd_driver_unregister(&idxd_drv); idxd_driver_unregister(&dsa_drv); pci_unregister_driver(&idxd_pci_driver); idxd_cdev_remove(); -- Gitee From 2ba1f66b0015d2e470646933f4f4b6f4cad32e31 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Jul 2021 11:44:30 -0700 Subject: [PATCH 0654/2161] dmaengine: idxd: create dmaengine driver for wq 'device' commit 0cda4f6986a3824cac500f66326ff267bf37110f upstream. The original architecture of /sys/bus/dsa invented a scheme whereby a single entry in the list of bus drivers, /sys/bus/drivers/dsa, handled all device types and internally routed them to different drivers. Those internal drivers were invisible to userspace. Now, as /sys/bus/dsa wants to grow support for alternate drivers for a given device, for example vfio-mdev instead of kernel-internal-dmaengine, a proper bus device-driver model is needed. 
The first step in that process is separating the existing omnibus/implicit "dsa" driver into proper individual drivers registered on /sys/bus/dsa. Establish the idxd_dmaengine_drv driver that controls the enabling and disabling of the wq and also register and unregister the dma channel. idxd_wq_alloc_resources() and idxd_wq_free_resources() also get moved to the dmaengine driver. The resources (dma descriptors allocation and setup) are only used by the dmaengine driver and should only happen when it loads. The char dev driver (cdev) related bits are left in the __drv_enable_wq() and __drv_disable_wq() calls to be moved when we split out the char dev driver just like how the dmaengine driver is split out. WQ autoload support is not expected currently. With the amount of configuration needed for the device, the wq is always expected to be enabled by a tool (or via sysfs) rather than auto enabled at driver load. Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162637467033.744545.12330636655625405394.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 40 +++----------------- drivers/dma/idxd/dma.c | 77 +++++++++++++++++++++++++++++++++++++++ drivers/dma/idxd/idxd.h | 3 ++ drivers/dma/idxd/init.c | 7 ++++ 4 files changed, 92 insertions(+), 35 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 12ae3f1639f1..4dcc9431ae3d 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -1130,7 +1130,7 @@ int idxd_device_load_config(struct idxd_device *idxd) return 0; } -static int __drv_enable_wq(struct idxd_wq *wq) +int __drv_enable_wq(struct idxd_wq *wq) { struct idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; @@ -1178,12 +1178,7 @@ static int __drv_enable_wq(struct idxd_wq *wq) } } - rc = idxd_wq_alloc_resources(wq); - if (rc < 0) { - dev_dbg(dev, "wq resource alloc failed\n"); - goto err; - } - + rc = 0; spin_lock_irqsave(&idxd->dev_lock, flags); if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) rc = idxd_device_config(idxd); @@ -1207,21 +1202,7 @@ static int __drv_enable_wq(struct idxd_wq *wq) wq->client_count = 0; - if (wq->type == IDXD_WQT_KERNEL) { - rc = idxd_wq_init_percpu_ref(wq); - if (rc < 0) { - dev_dbg(dev, "wq %d percpu_ref setup failed\n", wq->id); - goto err_cpu_ref; - } - } - - if (is_idxd_wq_dmaengine(wq)) { - rc = idxd_register_dma_channel(wq); - if (rc < 0) { - dev_dbg(dev, "wq %d DMA channel register failed\n", wq->id); - goto err_client; - } - } else if (is_idxd_wq_cdev(wq)) { + if (is_idxd_wq_cdev(wq)) { rc = idxd_wq_add_cdev(wq); if (rc < 0) { dev_dbg(dev, "wq %d cdev creation failed\n", wq->id); @@ -1229,12 +1210,9 @@ static int __drv_enable_wq(struct idxd_wq *wq) } } - dev_info(dev, "wq %s enabled\n", dev_name(wq_confdev(wq))); return 0; err_client: - idxd_wq_quiesce(wq); -err_cpu_ref: idxd_wq_unmap_portal(wq); err_map_portal: rc = idxd_wq_disable(wq, false); @@ -1254,19 +1232,14 @@ int drv_enable_wq(struct idxd_wq *wq) return rc; } -static void __drv_disable_wq(struct idxd_wq *wq) +void __drv_disable_wq(struct idxd_wq *wq) { struct idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; lockdep_assert_held(&wq->wq_lock); - if (wq->type == IDXD_WQT_KERNEL) - idxd_wq_quiesce(wq); - - if (is_idxd_wq_dmaengine(wq)) - idxd_unregister_dma_channel(wq); - else if (is_idxd_wq_cdev(wq)) + if (is_idxd_wq_cdev(wq)) idxd_wq_del_cdev(wq); if (idxd_wq_refcount(wq)) @@ -1278,10 
+1251,7 @@ static void __drv_disable_wq(struct idxd_wq *wq) idxd_wq_drain(wq); idxd_wq_reset(wq); - idxd_wq_free_resources(wq); wq->client_count = 0; - - dev_info(dev, "wq %s disabled\n", dev_name(wq_confdev(wq))); } void drv_disable_wq(struct idxd_wq *wq) diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index 2e52f9a50519..7e3281700183 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -262,3 +262,80 @@ void idxd_unregister_dma_channel(struct idxd_wq *wq) wq->idxd_chan = NULL; put_device(wq_confdev(wq)); } + +static int idxd_dmaengine_drv_probe(struct idxd_dev *idxd_dev) +{ + struct device *dev = &idxd_dev->conf_dev; + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + struct idxd_device *idxd = wq->idxd; + int rc; + + if (idxd->state != IDXD_DEV_ENABLED) + return -ENXIO; + + mutex_lock(&wq->wq_lock); + wq->type = IDXD_WQT_KERNEL; + rc = __drv_enable_wq(wq); + if (rc < 0) { + dev_dbg(dev, "Enable wq %d failed: %d\n", wq->id, rc); + rc = -ENXIO; + goto err; + } + + rc = idxd_wq_alloc_resources(wq); + if (rc < 0) { + dev_dbg(dev, "WQ resource alloc failed\n"); + goto err_res_alloc; + } + + rc = idxd_wq_init_percpu_ref(wq); + if (rc < 0) { + dev_dbg(dev, "percpu_ref setup failed\n"); + goto err_ref; + } + + rc = idxd_register_dma_channel(wq); + if (rc < 0) { + dev_dbg(dev, "Failed to register dma channel\n"); + goto err_dma; + } + + mutex_unlock(&wq->wq_lock); + return 0; + +err_dma: + idxd_wq_quiesce(wq); +err_ref: + idxd_wq_free_resources(wq); +err_res_alloc: + __drv_disable_wq(wq); +err: + wq->type = IDXD_WQT_NONE; + mutex_unlock(&wq->wq_lock); + return rc; +} + +static void idxd_dmaengine_drv_remove(struct idxd_dev *idxd_dev) +{ + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + + mutex_lock(&wq->wq_lock); + idxd_wq_quiesce(wq); + idxd_unregister_dma_channel(wq); + __drv_disable_wq(wq); + idxd_wq_free_resources(wq); + wq->type = IDXD_WQT_NONE; + mutex_unlock(&wq->wq_lock); +} + +static enum idxd_dev_type dev_types[] = { + IDXD_DEV_WQ, + IDXD_DEV_NONE, +}; + +struct idxd_device_driver idxd_dmaengine_drv = { + .probe = idxd_dmaengine_drv_probe, + .remove = idxd_dmaengine_drv_remove, + .name = "dmaengine", + .type = dev_types, +}; diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index a356d227f755..a840c328bec9 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -59,6 +59,7 @@ struct idxd_device_driver { extern struct idxd_device_driver dsa_drv; extern struct idxd_device_driver idxd_drv; +extern struct idxd_device_driver idxd_dmaengine_drv; struct idxd_irq_entry { struct idxd_device *idxd; @@ -507,7 +508,9 @@ void idxd_unregister_idxd_drv(void); int idxd_device_drv_probe(struct idxd_dev *idxd_dev); void idxd_device_drv_remove(struct idxd_dev *idxd_dev); int drv_enable_wq(struct idxd_wq *wq); +int __drv_enable_wq(struct idxd_wq *wq); void drv_disable_wq(struct idxd_wq *wq); +void __drv_disable_wq(struct idxd_wq *wq); int idxd_device_init_reset(struct idxd_device *idxd); int idxd_device_enable(struct idxd_device *idxd); int idxd_device_disable(struct idxd_device *idxd); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index c19b03c17ab9..6f38128ce400 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -844,6 +844,10 @@ static int __init idxd_init_module(void) if (err < 0) goto err_idxd_driver_register; + err = idxd_driver_register(&idxd_dmaengine_drv); + if (err < 0) + goto err_idxd_dmaengine_driver_register; + err = idxd_driver_register(&dsa_drv); if (err < 0) goto err_dsa_driver_register; @@ -863,6 
+867,8 @@ static int __init idxd_init_module(void) err_cdev_register: idxd_driver_unregister(&dsa_drv); err_dsa_driver_register: + idxd_driver_unregister(&idxd_dmaengine_drv); +err_idxd_dmaengine_driver_register: idxd_driver_unregister(&idxd_drv); err_idxd_driver_register: idxd_unregister_bus_type(); @@ -872,6 +878,7 @@ module_init(idxd_init_module); static void __exit idxd_exit_module(void) { + idxd_driver_unregister(&idxd_dmaengine_drv); idxd_driver_unregister(&idxd_drv); idxd_driver_unregister(&dsa_drv); pci_unregister_driver(&idxd_pci_driver); -- Gitee From e7ff59f8f3b2fc3018c7d41a837ce90e7d24f580 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Jul 2021 11:44:35 -0700 Subject: [PATCH 0655/2161] dmaengine: idxd: create user driver for wq 'device' commit 448c3de8ac8353fc4447738ae3c56c4eb6c2131d upstream. The original architecture of /sys/bus/dsa invented a scheme whereby a single entry in the list of bus drivers, /sys/bus/drivers/dsa, handled all device types and internally routed them to different drivers. Those internal drivers were invisible to userspace. Now, as /sys/bus/dsa wants to grow support for alternate drivers for a given device, for example vfio-mdev instead of kernel-internal-dmaengine, a proper bus device-driver model is needed. The first step in that process is separating the existing omnibus/implicit "dsa" driver into proper individual drivers registered on /sys/bus/dsa. Establish the idxd_user_drv driver that controls the enabling and disabling of the wq and also register and unregister a char device to allow user space to mmap the descriptor submission portal. The cdev related bits are moved to the cdev driver probe/remove and out of the drv_enabe/disable_wq() calls. These bits are exclusive to the cdev operation and not part of the generic enable/disable of the wq device. 
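The resulting probe path mirrors the dmaengine wq driver from the previous patch: take wq->wq_lock, stamp the wq type, run the generic __drv_enable_wq() bring-up, and layer the driver-specific front end (here the cdev, there the DMA channel) on top. A condensed, illustrative sketch of that shared pattern follows; the function name is hypothetical, and the device-enabled check and debug messages of the real idxd_user_drv_probe() in the diff below are trimmed:

static int example_user_wq_probe(struct idxd_dev *idxd_dev)
{
	struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev);
	int rc;

	mutex_lock(&wq->wq_lock);
	wq->type = IDXD_WQT_USER;	/* the dmaengine driver uses IDXD_WQT_KERNEL here */
	rc = __drv_enable_wq(wq);	/* generic wq bring-up */
	if (rc < 0)
		goto err;

	rc = idxd_wq_add_cdev(wq);	/* driver-specific front end */
	if (rc < 0)
		goto err_cdev;

	mutex_unlock(&wq->wq_lock);
	return 0;

err_cdev:
	__drv_disable_wq(wq);
err:
	wq->type = IDXD_WQT_NONE;
	mutex_unlock(&wq->wq_lock);
	return rc;
}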
Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162637467578.744545.10203997610072341376.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/cdev.c | 53 +++++++++++++++++++++++++++++++++++++++ drivers/dma/idxd/device.c | 14 ----------- drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/init.c | 7 ++++++ 4 files changed, 61 insertions(+), 14 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 18a003b93812..b67bbf24242a 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -304,6 +304,59 @@ void idxd_wq_del_cdev(struct idxd_wq *wq) put_device(cdev_dev(idxd_cdev)); } +static int idxd_user_drv_probe(struct idxd_dev *idxd_dev) +{ + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + struct idxd_device *idxd = wq->idxd; + int rc; + + if (idxd->state != IDXD_DEV_ENABLED) + return -ENXIO; + + mutex_lock(&wq->wq_lock); + wq->type = IDXD_WQT_USER; + rc = __drv_enable_wq(wq); + if (rc < 0) + goto err; + + rc = idxd_wq_add_cdev(wq); + if (rc < 0) + goto err_cdev; + + mutex_unlock(&wq->wq_lock); + return 0; + +err_cdev: + __drv_disable_wq(wq); +err: + wq->type = IDXD_WQT_NONE; + mutex_unlock(&wq->wq_lock); + return rc; +} + +static void idxd_user_drv_remove(struct idxd_dev *idxd_dev) +{ + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + + mutex_lock(&wq->wq_lock); + idxd_wq_del_cdev(wq); + __drv_disable_wq(wq); + wq->type = IDXD_WQT_NONE; + mutex_unlock(&wq->wq_lock); +} + +static enum idxd_dev_type dev_types[] = { + IDXD_DEV_WQ, + IDXD_DEV_NONE, +}; + +struct idxd_device_driver idxd_user_drv = { + .probe = idxd_user_drv_probe, + .remove = idxd_user_drv_remove, + .name = "user", + .type = dev_types, +}; + int idxd_cdev_register(void) { int rc, i; diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 4dcc9431ae3d..9bbc28d9a9eb 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -1201,19 +1201,8 @@ int __drv_enable_wq(struct idxd_wq *wq) } wq->client_count = 0; - - if (is_idxd_wq_cdev(wq)) { - rc = idxd_wq_add_cdev(wq); - if (rc < 0) { - dev_dbg(dev, "wq %d cdev creation failed\n", wq->id); - goto err_client; - } - } - return 0; -err_client: - idxd_wq_unmap_portal(wq); err_map_portal: rc = idxd_wq_disable(wq, false); if (rc < 0) @@ -1239,9 +1228,6 @@ void __drv_disable_wq(struct idxd_wq *wq) lockdep_assert_held(&wq->wq_lock); - if (is_idxd_wq_cdev(wq)) - idxd_wq_del_cdev(wq); - if (idxd_wq_refcount(wq)) dev_warn(dev, "Clients has claim on wq %d: %d\n", wq->id, idxd_wq_refcount(wq)); diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index a840c328bec9..bacec9b93a7e 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -60,6 +60,7 @@ struct idxd_device_driver { extern struct idxd_device_driver dsa_drv; extern struct idxd_device_driver idxd_drv; extern struct idxd_device_driver idxd_dmaengine_drv; +extern struct idxd_device_driver idxd_user_drv; struct idxd_irq_entry { struct idxd_device *idxd; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 6f38128ce400..33a80f700ff8 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -848,6 +848,10 @@ static int __init idxd_init_module(void) if (err < 0) goto err_idxd_dmaengine_driver_register; + err = idxd_driver_register(&idxd_user_drv); + if (err < 0) + goto err_idxd_user_driver_register; + err = idxd_driver_register(&dsa_drv); if (err < 0) goto err_dsa_driver_register; @@ -867,6 +871,8 @@ static int __init 
idxd_init_module(void) err_cdev_register: idxd_driver_unregister(&dsa_drv); err_dsa_driver_register: + idxd_driver_unregister(&idxd_user_drv); +err_idxd_user_driver_register: idxd_driver_unregister(&idxd_dmaengine_drv); err_idxd_dmaengine_driver_register: idxd_driver_unregister(&idxd_drv); @@ -878,6 +884,7 @@ module_init(idxd_init_module); static void __exit idxd_exit_module(void) { + idxd_driver_unregister(&idxd_user_drv); idxd_driver_unregister(&idxd_dmaengine_drv); idxd_driver_unregister(&idxd_drv); idxd_driver_unregister(&dsa_drv); -- Gitee From 1a3964f5e7a57ef6f50cc7ba1b4eada67f8af064 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Jul 2021 11:44:41 -0700 Subject: [PATCH 0656/2161] dmaengine: dsa: move dsa_bus_type out of idxd driver to standalone commit d9e5481fca74f870cf2fc2f90a0e77e85c0b5b86 upstream. In preparation for dsa_drv compat support to be built-in, move the bus code to its own compilation unit. A follow-on patch adds the compat implementation. Recall that the compat implementation allows for the deprecated / omnibus dsa_drv binding scheme rather than the idiomatic organization of a full fledged bus driver per driver type. Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162637468142.744545.2811632736881720857.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/Kconfig | 4 ++ drivers/dma/Makefile | 2 +- drivers/dma/idxd/Makefile | 5 +++ drivers/dma/idxd/bus.c | 92 +++++++++++++++++++++++++++++++++++++++ drivers/dma/idxd/init.c | 30 +------------ drivers/dma/idxd/sysfs.c | 43 ------------------ 6 files changed, 103 insertions(+), 73 deletions(-) create mode 100644 drivers/dma/idxd/bus.c diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 4e71190688d2..4e25d4b04c42 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -273,6 +273,10 @@ config INTEL_IDMA64 Enable DMA support for Intel Low Power Subsystem such as found on Intel Skylake PCH. +config INTEL_IDXD_BUS + tristate + default INTEL_IDXD + config INTEL_IDXD tristate "Intel Data Accelerators support" depends on PCI && X86_64 && !UML diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index ff064b29021e..9fc038ff6c17 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -41,7 +41,7 @@ obj-$(CONFIG_IMX_DMA) += imx-dma.o obj-$(CONFIG_IMX_SDMA) += imx-sdma.o obj-$(CONFIG_INTEL_IDMA64) += idma64.o obj-$(CONFIG_INTEL_IOATDMA) += ioat/ -obj-$(CONFIG_INTEL_IDXD) += idxd/ +obj-y += idxd/ obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o obj-$(CONFIG_INTEL_MIC_X100_DMA) += mic_x100_dma.o obj-$(CONFIG_K3_DMA) += k3dma.o diff --git a/drivers/dma/idxd/Makefile b/drivers/dma/idxd/Makefile index 6d11558756f8..8c29ed4d48c3 100644 --- a/drivers/dma/idxd/Makefile +++ b/drivers/dma/idxd/Makefile @@ -1,4 +1,9 @@ +ccflags-y += -DDEFAULT_SYMBOL_NAMESPACE=IDXD + obj-$(CONFIG_INTEL_IDXD) += idxd.o idxd-y := init.o irq.o device.o sysfs.o submit.o dma.o cdev.o idxd-$(CONFIG_INTEL_IDXD_PERFMON) += perfmon.o + +obj-$(CONFIG_INTEL_IDXD_BUS) += idxd_bus.o +idxd_bus-y := bus.o diff --git a/drivers/dma/idxd/bus.c b/drivers/dma/idxd/bus.c new file mode 100644 index 000000000000..02837f0fb3e4 --- /dev/null +++ b/drivers/dma/idxd/bus.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2021 Intel Corporation. All rights rsvd. 
*/ +#include +#include +#include +#include +#include "idxd.h" + + +int __idxd_driver_register(struct idxd_device_driver *idxd_drv, struct module *owner, + const char *mod_name) +{ + struct device_driver *drv = &idxd_drv->drv; + + if (!idxd_drv->type) { + pr_debug("driver type not set (%ps)\n", __builtin_return_address(0)); + return -EINVAL; + } + + drv->name = idxd_drv->name; + drv->bus = &dsa_bus_type; + drv->owner = owner; + drv->mod_name = mod_name; + + return driver_register(drv); +} +EXPORT_SYMBOL_GPL(__idxd_driver_register); + +void idxd_driver_unregister(struct idxd_device_driver *idxd_drv) +{ + driver_unregister(&idxd_drv->drv); +} +EXPORT_SYMBOL_GPL(idxd_driver_unregister); + +static int idxd_config_bus_match(struct device *dev, + struct device_driver *drv) +{ + struct idxd_device_driver *idxd_drv = + container_of(drv, struct idxd_device_driver, drv); + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + int i = 0; + + while (idxd_drv->type[i] != IDXD_DEV_NONE) { + if (idxd_dev->type == idxd_drv->type[i]) + return 1; + i++; + } + + return 0; +} + +static int idxd_config_bus_probe(struct device *dev) +{ + struct idxd_device_driver *idxd_drv = + container_of(dev->driver, struct idxd_device_driver, drv); + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return idxd_drv->probe(idxd_dev); +} + +static int idxd_config_bus_remove(struct device *dev) +{ + struct idxd_device_driver *idxd_drv = + container_of(dev->driver, struct idxd_device_driver, drv); + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + idxd_drv->remove(idxd_dev); + return 0; +} + +struct bus_type dsa_bus_type = { + .name = "dsa", + .match = idxd_config_bus_match, + .probe = idxd_config_bus_probe, + .remove = idxd_config_bus_remove, +}; +EXPORT_SYMBOL_GPL(dsa_bus_type); + +static int __init dsa_bus_init(void) +{ + return bus_register(&dsa_bus_type); +} +module_init(dsa_bus_init); + +static void __exit dsa_bus_exit(void) +{ + bus_unregister(&dsa_bus_type); +} +module_exit(dsa_bus_exit); + +MODULE_DESCRIPTION("IDXD driver dsa_bus_type driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 33a80f700ff8..9b797fcdfd7b 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -26,6 +26,7 @@ MODULE_VERSION(IDXD_DRIVER_VERSION); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Intel Corporation"); +MODULE_IMPORT_NS(IDXD); static bool sva = true; module_param(sva, bool, 0644); @@ -836,10 +837,6 @@ static int __init idxd_init_module(void) perfmon_init(); - err = idxd_register_bus_type(); - if (err < 0) - return err; - err = idxd_driver_register(&idxd_drv); if (err < 0) goto err_idxd_driver_register; @@ -877,7 +874,6 @@ static int __init idxd_init_module(void) err_idxd_dmaengine_driver_register: idxd_driver_unregister(&idxd_drv); err_idxd_driver_register: - idxd_unregister_bus_type(); return err; } module_init(idxd_init_module); @@ -890,30 +886,6 @@ static void __exit idxd_exit_module(void) idxd_driver_unregister(&dsa_drv); pci_unregister_driver(&idxd_pci_driver); idxd_cdev_remove(); - idxd_unregister_bus_type(); perfmon_exit(); } module_exit(idxd_exit_module); - -int __idxd_driver_register(struct idxd_device_driver *idxd_drv, struct module *owner, - const char *mod_name) -{ - struct device_driver *drv = &idxd_drv->drv; - - if (!idxd_drv->type) { - pr_debug("driver type not set (%ps)\n", __builtin_return_address(0)); - return -EINVAL; - } - - drv->name = idxd_drv->name; - drv->bus = &dsa_bus_type; - drv->owner = owner; - drv->mod_name = mod_name; - - return 
driver_register(drv); -} - -void idxd_driver_unregister(struct idxd_device_driver *idxd_drv) -{ - driver_unregister(&idxd_drv->drv); -} diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 8d48903df131..633f4947ed32 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -16,49 +16,6 @@ static char *idxd_wq_type_names[] = { [IDXD_WQT_USER] = "user", }; -static int idxd_config_bus_match(struct device *dev, - struct device_driver *drv) -{ - struct idxd_device_driver *idxd_drv = - container_of(drv, struct idxd_device_driver, drv); - struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); - int i = 0; - - while (idxd_drv->type[i] != IDXD_DEV_NONE) { - if (idxd_dev->type == idxd_drv->type[i]) - return 1; - i++; - } - - return 0; -} - -static int idxd_config_bus_probe(struct device *dev) -{ - struct idxd_device_driver *idxd_drv = - container_of(dev->driver, struct idxd_device_driver, drv); - struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); - - return idxd_drv->probe(idxd_dev); -} - -static int idxd_config_bus_remove(struct device *dev) -{ - struct idxd_device_driver *idxd_drv = - container_of(dev->driver, struct idxd_device_driver, drv); - struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); - - idxd_drv->remove(idxd_dev); - return 0; -} - -struct bus_type dsa_bus_type = { - .name = "dsa", - .match = idxd_config_bus_match, - .probe = idxd_config_bus_probe, - .remove = idxd_config_bus_remove, -}; - static int idxd_dsa_drv_probe(struct idxd_dev *idxd_dev) { if (is_idxd_dev(idxd_dev)) -- Gitee From 56ee7ced0551d6cef69e26fafdf992272471002b Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Jul 2021 11:44:47 -0700 Subject: [PATCH 0657/2161] dmaengine: idxd: move dsa_drv support to compatible mode commit 6e7f3ee97bbe2c7d7a53b7dbd7a08a579e03c8c9 upstream. The original architecture of /sys/bus/dsa invented a scheme whereby a single entry in the list of bus drivers, /sys/bus/drivers/dsa, handled all device types and internally routed them to different different drivers. Those internal drivers were invisible to userspace. With the idxd driver transitioned to a proper bus device-driver model, the legacy behavior needs to be preserved due to it being exposed to user space via sysfs. Create a compat driver to provide the legacy behavior for /sys/bus/dsa/drivers/dsa. This should satisfy user tool accel-config v3.2 or ealier where this behavior is expected. If the distro has a newer accel-config then the legacy mode does not need to be enabled. When the compat driver binds the device (i.e. dsa0) to the dsa driver, it will be bound to the new idxd_drv. The wq device (i.e. wq0.0) will be bound to either the dmaengine_drv or the user_drv. The dsa_drv becomes a routing mechansim for the new drivers. It will not support additional external drivers that are implemented later. 
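To make the routing concrete, a small user-space sketch of the two bind flows, assuming the /sys/bus/dsa layout described above; the wq name wq0.0 is a placeholder.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write a device name into a dsa bus driver's sysfs bind attribute. */
static int dsa_bind(const char *driver, const char *dev)
{
	char path[128];
	ssize_t n;
	int fd;

	snprintf(path, sizeof(path), "/sys/bus/dsa/drivers/%s/bind", driver);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	n = write(fd, dev, strlen(dev));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/*
	 * Legacy flow preserved by the compat driver (accel-config <= v3.2):
	 * bind through the "dsa" entry, which forwards to the real driver.
	 */
	if (dsa_bind("dsa", "wq0.0") == 0)
		return 0;

	/* Idiomatic flow with the split drivers: bind the wq type directly. */
	return dsa_bind("user", "wq0.0");
}

Either path ends with the same per-type driver bound to the wq; the compat "dsa" entry only routes the request.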
Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162637468705.744545.4399080971745974435.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/Kconfig | 17 ++++++ drivers/dma/idxd/Makefile | 3 + drivers/dma/idxd/cdev.c | 1 + drivers/dma/idxd/compat.c | 114 ++++++++++++++++++++++++++++++++++++++ drivers/dma/idxd/device.c | 1 + drivers/dma/idxd/dma.c | 1 + drivers/dma/idxd/idxd.h | 10 +++- drivers/dma/idxd/init.c | 7 --- drivers/dma/idxd/sysfs.c | 40 ------------- 9 files changed, 146 insertions(+), 48 deletions(-) create mode 100644 drivers/dma/idxd/compat.c diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 4e25d4b04c42..63f7e2fc8760 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -291,6 +291,23 @@ config INTEL_IDXD If unsure, say N. +config INTEL_IDXD_COMPAT + bool "Legacy behavior for idxd driver" + depends on PCI && X86_64 + select INTEL_IDXD_BUS + help + Compatible driver to support old /sys/bus/dsa/drivers/dsa behavior. + The old behavior performed driver bind/unbind for device and wq + devices all under the dsa driver. The compat driver will emulate + the legacy behavior in order to allow existing support apps (i.e. + accel-config) to continue function. It is expected that accel-config + v3.2 and earlier will need the compat mode. A distro with later + accel-config version can disable this compat config. + + Say Y if you have old applications that require such behavior. + + If unsure, say N. + # Config symbol that collects all the dependencies that's necessary to # support shared virtual memory for the devices supported by idxd. config INTEL_IDXD_SVM diff --git a/drivers/dma/idxd/Makefile b/drivers/dma/idxd/Makefile index 8c29ed4d48c3..a1e9f2b3a37c 100644 --- a/drivers/dma/idxd/Makefile +++ b/drivers/dma/idxd/Makefile @@ -7,3 +7,6 @@ idxd-$(CONFIG_INTEL_IDXD_PERFMON) += perfmon.o obj-$(CONFIG_INTEL_IDXD_BUS) += idxd_bus.o idxd_bus-y := bus.o + +obj-$(CONFIG_INTEL_IDXD_COMPAT) += idxd_compat.o +idxd_compat-y := compat.o diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index b67bbf24242a..f6a4603517ba 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -356,6 +356,7 @@ struct idxd_device_driver idxd_user_drv = { .name = "user", .type = dev_types, }; +EXPORT_SYMBOL_GPL(idxd_user_drv); int idxd_cdev_register(void) { diff --git a/drivers/dma/idxd/compat.c b/drivers/dma/idxd/compat.c new file mode 100644 index 000000000000..d67746ee0c1a --- /dev/null +++ b/drivers/dma/idxd/compat.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2021 Intel Corporation. All rights rsvd. 
*/ +#include +#include +#include +#include +#include +#include "idxd.h" + +extern int device_driver_attach(struct device_driver *drv, struct device *dev); +extern void device_driver_detach(struct device *dev); + +#define DRIVER_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \ + struct driver_attribute driver_attr_##_name = \ + __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) + +static ssize_t unbind_store(struct device_driver *drv, const char *buf, size_t count) +{ + struct bus_type *bus = drv->bus; + struct device *dev; + int rc = -ENODEV; + + dev = bus_find_device_by_name(bus, NULL, buf); + if (dev && dev->driver) { + device_driver_detach(dev); + rc = count; + } + + return rc; +} +static DRIVER_ATTR_IGNORE_LOCKDEP(unbind, 0200, NULL, unbind_store); + +static ssize_t bind_store(struct device_driver *drv, const char *buf, size_t count) +{ + struct bus_type *bus = drv->bus; + struct device *dev; + struct device_driver *alt_drv; + int rc = -ENODEV; + struct idxd_dev *idxd_dev; + + dev = bus_find_device_by_name(bus, NULL, buf); + if (!dev || dev->driver || drv != &dsa_drv.drv) + return -ENODEV; + + idxd_dev = confdev_to_idxd_dev(dev); + if (is_idxd_dev(idxd_dev)) { + alt_drv = driver_find("idxd", bus); + if (!alt_drv) + return -ENODEV; + } else if (is_idxd_wq_dev(idxd_dev)) { + struct idxd_wq *wq = confdev_to_wq(dev); + + if (is_idxd_wq_kernel(wq)) { + alt_drv = driver_find("dmaengine", bus); + if (!alt_drv) + return -ENODEV; + } else if (is_idxd_wq_user(wq)) { + alt_drv = driver_find("user", bus); + if (!alt_drv) + return -ENODEV; + } else { + return -ENODEV; + } + } + + rc = device_driver_attach(alt_drv, dev); + if (rc < 0) + return rc; + + return count; +} +static DRIVER_ATTR_IGNORE_LOCKDEP(bind, 0200, NULL, bind_store); + +static struct attribute *dsa_drv_compat_attrs[] = { + &driver_attr_bind.attr, + &driver_attr_unbind.attr, + NULL, +}; + +static const struct attribute_group dsa_drv_compat_attr_group = { + .attrs = dsa_drv_compat_attrs, +}; + +static const struct attribute_group *dsa_drv_compat_groups[] = { + &dsa_drv_compat_attr_group, + NULL, +}; + +static int idxd_dsa_drv_probe(struct idxd_dev *idxd_dev) +{ + return -ENODEV; +} + +static void idxd_dsa_drv_remove(struct idxd_dev *idxd_dev) +{ +} + +static enum idxd_dev_type dev_types[] = { + IDXD_DEV_NONE, +}; + +struct idxd_device_driver dsa_drv = { + .name = "dsa", + .probe = idxd_dsa_drv_probe, + .remove = idxd_dsa_drv_remove, + .type = dev_types, + .drv = { + .suppress_bind_attrs = true, + .groups = dsa_drv_compat_groups, + }, +}; + +module_idxd_driver(dsa_drv); +MODULE_IMPORT_NS(IDXD); diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 9bbc28d9a9eb..99350ac9a292 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -1318,3 +1318,4 @@ struct idxd_device_driver idxd_drv = { .remove = idxd_device_drv_remove, .name = "idxd", }; +EXPORT_SYMBOL_GPL(idxd_drv); diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index 7e3281700183..2fd7ec29a08f 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -339,3 +339,4 @@ struct idxd_device_driver idxd_dmaengine_drv = { .name = "dmaengine", .type = dev_types, }; +EXPORT_SYMBOL_GPL(idxd_dmaengine_drv); diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index bacec9b93a7e..d0874d8877d9 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -416,11 +416,16 @@ static inline bool is_idxd_wq_dmaengine(struct idxd_wq *wq) return false; } -static inline bool is_idxd_wq_cdev(struct idxd_wq *wq) +static inline 
bool is_idxd_wq_user(struct idxd_wq *wq) { return wq->type == IDXD_WQT_USER; } +static inline bool is_idxd_wq_kernel(struct idxd_wq *wq) +{ + return wq->type == IDXD_WQT_KERNEL; +} + static inline bool wq_dedicated(struct idxd_wq *wq) { return test_bit(WQ_FLAG_DEDICATED, &wq->flags); @@ -484,6 +489,9 @@ int __must_check __idxd_driver_register(struct idxd_device_driver *idxd_drv, void idxd_driver_unregister(struct idxd_device_driver *idxd_drv); +#define module_idxd_driver(__idxd_driver) \ + module_driver(__idxd_driver, idxd_driver_register, idxd_driver_unregister) + int idxd_register_bus_type(void); void idxd_unregister_bus_type(void); int idxd_register_devices(struct idxd_device *idxd); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 9b797fcdfd7b..8db56f98059f 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -849,10 +849,6 @@ static int __init idxd_init_module(void) if (err < 0) goto err_idxd_user_driver_register; - err = idxd_driver_register(&dsa_drv); - if (err < 0) - goto err_dsa_driver_register; - err = idxd_cdev_register(); if (err) goto err_cdev_register; @@ -866,8 +862,6 @@ static int __init idxd_init_module(void) err_pci_register: idxd_cdev_remove(); err_cdev_register: - idxd_driver_unregister(&dsa_drv); -err_dsa_driver_register: idxd_driver_unregister(&idxd_user_drv); err_idxd_user_driver_register: idxd_driver_unregister(&idxd_dmaengine_drv); @@ -883,7 +877,6 @@ static void __exit idxd_exit_module(void) idxd_driver_unregister(&idxd_user_drv); idxd_driver_unregister(&idxd_dmaengine_drv); idxd_driver_unregister(&idxd_drv); - idxd_driver_unregister(&dsa_drv); pci_unregister_driver(&idxd_pci_driver); idxd_cdev_remove(); perfmon_exit(); diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 633f4947ed32..b883e9f16e7f 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -16,46 +16,6 @@ static char *idxd_wq_type_names[] = { [IDXD_WQT_USER] = "user", }; -static int idxd_dsa_drv_probe(struct idxd_dev *idxd_dev) -{ - if (is_idxd_dev(idxd_dev)) - return idxd_device_drv_probe(idxd_dev); - - if (is_idxd_wq_dev(idxd_dev)) { - struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); - - return drv_enable_wq(wq); - } - - return -ENODEV; -} - -static void idxd_dsa_drv_remove(struct idxd_dev *idxd_dev) -{ - if (is_idxd_dev(idxd_dev)) { - idxd_device_drv_remove(idxd_dev); - return; - } - - if (is_idxd_wq_dev(idxd_dev)) { - struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); - - drv_disable_wq(wq); - return; - } -} - -static enum idxd_dev_type dev_types[] = { - IDXD_DEV_NONE, -}; - -struct idxd_device_driver dsa_drv = { - .name = "dsa", - .probe = idxd_dsa_drv_probe, - .remove = idxd_dsa_drv_remove, - .type = dev_types, -}; - /* IDXD engine attributes */ static ssize_t engine_group_id_show(struct device *dev, struct device_attribute *attr, char *buf) -- Gitee From 6796c47232f869cfe737a53ccd36fb23fcf8b56c Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 24 Mar 2022 21:03:01 +0800 Subject: [PATCH 0658/2161] dmaengine: idxd: Fix include commit opencloudos. Relevant function is included in . 
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/compat.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/dma/idxd/compat.c b/drivers/dma/idxd/compat.c index d67746ee0c1a..3ccbc2daf041 100644 --- a/drivers/dma/idxd/compat.c +++ b/drivers/dma/idxd/compat.c @@ -4,7 +4,6 @@ #include #include #include -#include #include "idxd.h" extern int device_driver_attach(struct device_driver *drv, struct device *dev); -- Gitee From 1d6f57c5711196f98f3cdc4e8ebac938c323716c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 14 Jul 2021 16:25:00 -0700 Subject: [PATCH 0659/2161] dmaengine: idxd: remove fault processing code commit 0e96454ca26cc5c594ec792f7e5168cce726f7cf upstream. Kernel memory are pinned and will not cause faults. Since the driver does not support interrupts for user descriptors, no fault errors are expected to come through the misc interrupt. Remove dead code. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162630502789.631986.10591230961790023856.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/irq.c | 95 ++---------------------------------------- 1 file changed, 4 insertions(+), 91 deletions(-) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index be65d55e1fc4..e018459b534f 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -23,10 +23,8 @@ struct idxd_fault { }; static int irq_process_work_list(struct idxd_irq_entry *irq_entry, - enum irq_work_type wtype, int *processed, u64 data); static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, - enum irq_work_type wtype, int *processed, u64 data); static void idxd_device_reinit(struct work_struct *work) @@ -62,46 +60,6 @@ static void idxd_device_reinit(struct work_struct *work) idxd_device_clear_state(idxd); } -static void idxd_device_fault_work(struct work_struct *work) -{ - struct idxd_fault *fault = container_of(work, struct idxd_fault, work); - struct idxd_irq_entry *ie; - int i; - int processed; - int irqcnt = fault->idxd->num_wq_irqs + 1; - - for (i = 1; i < irqcnt; i++) { - ie = &fault->idxd->irq_entries[i]; - irq_process_work_list(ie, IRQ_WORK_PROCESS_FAULT, - &processed, fault->addr); - if (processed) - break; - - irq_process_pending_llist(ie, IRQ_WORK_PROCESS_FAULT, - &processed, fault->addr); - if (processed) - break; - } - - kfree(fault); -} - -static int idxd_device_schedule_fault_process(struct idxd_device *idxd, - u64 fault_addr) -{ - struct idxd_fault *fault; - - fault = kmalloc(sizeof(*fault), GFP_ATOMIC); - if (!fault) - return -ENOMEM; - - fault->addr = fault_addr; - fault->idxd = idxd; - INIT_WORK(&fault->work, idxd_device_fault_work); - queue_work(idxd->wq, &fault->work); - return 0; -} - static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) { struct device *dev = &idxd->pdev->dev; @@ -168,15 +126,6 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) if (!err) return 0; - /* - * This case should rarely happen and typically is due to software - * programming error by the driver. 
- */ - if (idxd->sw_err.valid && - idxd->sw_err.desc_valid && - idxd->sw_err.fault_addr) - idxd_device_schedule_fault_process(idxd, idxd->sw_err.fault_addr); - gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET); if (gensts.state == IDXD_DEVICE_STATE_HALT) { idxd->state = IDXD_DEV_HALTED; @@ -228,43 +177,19 @@ irqreturn_t idxd_misc_thread(int vec, void *data) return IRQ_HANDLED; } -static inline bool match_fault(struct idxd_desc *desc, u64 fault_addr) -{ - /* - * Completion address can be bad as well. Check fault address match for descriptor - * and completion address. - */ - if ((u64)desc->hw == fault_addr || (u64)desc->completion == fault_addr) { - struct idxd_device *idxd = desc->wq->idxd; - struct device *dev = &idxd->pdev->dev; - - dev_warn(dev, "desc with fault address: %#llx\n", fault_addr); - return true; - } - - return false; -} - static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, - enum irq_work_type wtype, int *processed, u64 data) { struct idxd_desc *desc, *t; struct llist_node *head; int queued = 0; unsigned long flags; - enum idxd_complete_type reason; *processed = 0; head = llist_del_all(&irq_entry->pending_llist); if (!head) goto out; - if (wtype == IRQ_WORK_NORMAL) - reason = IDXD_COMPLETE_NORMAL; - else - reason = IDXD_COMPLETE_DEV_FAIL; - llist_for_each_entry_safe(desc, t, head, llnode) { u8 status = desc->completion->status & DSA_COMP_STATUS_MASK; @@ -275,9 +200,7 @@ static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, continue; } - if (unlikely(status != DSA_COMP_SUCCESS)) - match_fault(desc, data); - complete_desc(desc, reason); + complete_desc(desc, IDXD_COMPLETE_NORMAL); (*processed)++; } else { spin_lock_irqsave(&irq_entry->list_lock, flags); @@ -293,20 +216,14 @@ static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, } static int irq_process_work_list(struct idxd_irq_entry *irq_entry, - enum irq_work_type wtype, int *processed, u64 data) { int queued = 0; unsigned long flags; LIST_HEAD(flist); struct idxd_desc *desc, *n; - enum idxd_complete_type reason; *processed = 0; - if (wtype == IRQ_WORK_NORMAL) - reason = IDXD_COMPLETE_NORMAL; - else - reason = IDXD_COMPLETE_DEV_FAIL; /* * This lock protects list corruption from access of list outside of the irq handler @@ -338,9 +255,7 @@ static int irq_process_work_list(struct idxd_irq_entry *irq_entry, continue; } - if (unlikely(status != DSA_COMP_SUCCESS)) - match_fault(desc, data); - complete_desc(desc, reason); + complete_desc(desc, IDXD_COMPLETE_NORMAL); } return queued; @@ -370,14 +285,12 @@ static int idxd_desc_process(struct idxd_irq_entry *irq_entry) * 5. Repeat until no more descriptors. */ do { - rc = irq_process_work_list(irq_entry, IRQ_WORK_NORMAL, - &processed, 0); + rc = irq_process_work_list(irq_entry, &processed, 0); total += processed; if (rc != 0) continue; - rc = irq_process_pending_llist(irq_entry, IRQ_WORK_NORMAL, - &processed, 0); + rc = irq_process_pending_llist(irq_entry, &processed, 0); total += processed; } while (rc != 0); -- Gitee From a595cfa76b624befe226b15bdcd4cc91cb85308f Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Jul 2021 13:42:10 -0700 Subject: [PATCH 0660/2161] dmaengine: idxd: Set defaults for GRPCFG traffic class commit ade8a86b512cf8db0d0e975a971ce356953cfcb3 upstream. Set GRPCFG traffic class to value of 1 for best performance on current generation of accelerators. Also add override option to allow experimentation. Sysfs knobs are disabled for DSA/IAX gen1 devices. 
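For completeness, a user-space sketch of enabling the override after module load, equivalent to booting with idxd.tc_override=1; the /sys/module path follows the standard layout for a module_param created with 0644 permissions and is an assumption for the example.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/module/idxd/parameters/tc_override", O_WRONLY);

	if (fd < 0) {
		perror("open tc_override");
		return 1;
	}
	/* "Y" (or "1") re-enables the traffic class sysfs knobs on gen1 devices. */
	if (write(fd, "Y", 1) < 0)
		perror("write tc_override");
	close(fd);
	return 0;
}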
Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162681373005.1968485.3761065664382799202.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/admin-guide/kernel-parameters.txt | 5 +++++ drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/init.c | 13 +++++++++++-- drivers/dma/idxd/registers.h | 3 +++ drivers/dma/idxd/sysfs.c | 6 ++++++ 5 files changed, 26 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index ee1c67a3ecf7..a31ade8726c9 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1549,6 +1549,11 @@ support for the idxd driver. By default it is set to true (1). + idxd.tc_override= [HW] + Format: + Allow override of default traffic class configuration + for the device. By default it is set to false (0). + ieee754= [MIPS] Select IEEE Std 754 conformance mode Format: { strict | legacy | 2008 | relaxed } Default: strict diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index d0874d8877d9..4e4dc0110e77 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -16,6 +16,7 @@ #define IDXD_DRIVER_VERSION "1.00" extern struct kmem_cache *idxd_desc_pool; +extern bool tc_override; struct idxd_wq; struct idxd_dev; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 8db56f98059f..eb09bc591c31 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -32,6 +32,10 @@ static bool sva = true; module_param(sva, bool, 0644); MODULE_PARM_DESC(sva, "Toggle SVA support on/off"); +bool tc_override; +module_param(tc_override, bool, 0644); +MODULE_PARM_DESC(tc_override, "Override traffic class defaults"); + #define DRV_NAME "idxd" bool support_enqcmd; @@ -336,8 +340,13 @@ static int idxd_setup_groups(struct idxd_device *idxd) } idxd->groups[i] = group; - group->tc_a = -1; - group->tc_b = -1; + if (idxd->hw.version < DEVICE_VERSION_2 && !tc_override) { + group->tc_a = 1; + group->tc_b = 1; + } else { + group->tc_a = -1; + group->tc_b = -1; + } } return 0; diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 7343a8f48819..ffc7550a77ee 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -7,6 +7,9 @@ #define PCI_DEVICE_ID_INTEL_DSA_SPR0 0x0b25 #define PCI_DEVICE_ID_INTEL_IAX_SPR0 0x0cfe +#define DEVICE_VERSION_1 0x100 +#define DEVICE_VERSION_2 0x200 + #define IDXD_MMIO_BAR 0 #define IDXD_WQ_BAR 2 #define IDXD_PORTAL_SIZE PAGE_SIZE diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index b883e9f16e7f..881a12596d4b 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -327,6 +327,9 @@ static ssize_t group_traffic_class_a_store(struct device *dev, if (idxd->state == IDXD_DEV_ENABLED) return -EPERM; + if (idxd->hw.version < DEVICE_VERSION_2 && !tc_override) + return -EPERM; + if (val < 0 || val > 7) return -EINVAL; @@ -366,6 +369,9 @@ static ssize_t group_traffic_class_b_store(struct device *dev, if (idxd->state == IDXD_DEV_ENABLED) return -EPERM; + if (idxd->hw.version < DEVICE_VERSION_2 && !tc_override) + return -EPERM; + if (val < 0 || val > 7) return -EINVAL; -- Gitee From 34753796c6f8cc80dd427b2ff235138754fb44c3 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 21 Jul 2021 11:35:03 -0700 Subject: [PATCH 0661/2161] dmaengine: idxd: fix uninit var for alt_drv commit 568b2126466f926a10be0b53b40c2d6ae056d8d6 upstream. 
0-day detected uninitialized alt_drv variable in the bind_store() function. The branch can be taken when device is not idxd device or wq 'struct device'. Init alt_drv to NULL. Fixes: 6e7f3ee97bbe ("dmaengine: idxd: move dsa_drv support to compatible mode") Reported-by: kernel test robot Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162689250332.2114335.636367120454420852.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/idxd/compat.c b/drivers/dma/idxd/compat.c index 3ccbc2daf041..0edb2cd2e5ba 100644 --- a/drivers/dma/idxd/compat.c +++ b/drivers/dma/idxd/compat.c @@ -33,7 +33,7 @@ static ssize_t bind_store(struct device_driver *drv, const char *buf, size_t cou { struct bus_type *bus = drv->bus; struct device *dev; - struct device_driver *alt_drv; + struct device_driver *alt_drv = NULL; int rc = -ENODEV; struct idxd_dev *idxd_dev; -- Gitee From 5bf5aab92b4fd7766fa2782c0e8c6c64d9dff65c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 22 Jul 2021 10:54:10 -0700 Subject: [PATCH 0662/2161] dmaengine: idxd: fix wq slot allocation index check commit 673d812d30be67942762bb9e8548abb26a3ba4a7 upstream. The sbitmap wait and allocate routine checks the index that is returned from sbitmap_queue_get(). It should be idxd >= 0 as 0 is also a valid index. This fixes issue where submission path hangs when WQ size is 1. Fixes: 0705107fcc80 ("dmaengine: idxd: move submission to sbitmap_queue") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162697645067.3478714.506720687816951762.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/submit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 6ef704dd4d0b..65b0130ab2db 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -59,7 +59,7 @@ struct idxd_desc *idxd_alloc_desc(struct idxd_wq *wq, enum idxd_op_type optype) if (signal_pending_state(TASK_INTERRUPTIBLE, current)) break; idx = sbitmap_queue_get(sbq, &cpu); - if (idx > 0) + if (idx >= 0) break; schedule(); } -- Gitee From 911ac2b04eaa7963250e4f4812de1b69945d95bb Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Jul 2021 13:42:04 -0700 Subject: [PATCH 0663/2161] dmaengine: idxd: rotate portal address for better performance commit a9c171527a3403cae6c1907744b1bc9ca301f912 upstream. The device submission portal is on a 4k page and any of those 64bit aligned address on the page can be used for descriptor submission. By rotating the offset through the 4k range and prevent successive writes to the same MMIO address, performance improvement is observed through testing. 
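A standalone sketch of the rotation described above; the 64-byte descriptor stride (assumed sizeof(struct dsa_raw_desc)) and 4KB portal page are used to make the arithmetic concrete, and the in-driver form appears in the hunk below.

#include <stdio.h>

#define PORTAL_SIZE	4096u			/* one 4KB submission portal page */
#define PORTAL_MASK	(PORTAL_SIZE - 1)
#define DESC_SIZE	64u			/* assumed descriptor size */

int main(void)
{
	unsigned int ofs = 0;
	int i;

	for (i = 0; i < 66; i++) {
		/* Each submission targets a different 64B-aligned slot... */
		printf("submission %2d -> portal + %#5x\n", i, ofs);
		/* ...then the offset advances and wraps within the page. */
		ofs = (ofs + DESC_SIZE) & PORTAL_MASK;
	}
	return 0;
}

Running this shows 64 distinct offsets before the offset wraps back to 0, so successive writes never hit the same MMIO address back to back.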
Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162681372446.1968485.10634280461681015569.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 1 + drivers/dma/idxd/idxd.h | 20 ++++++++++++++++++++ drivers/dma/idxd/submit.c | 2 +- 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 99350ac9a292..41f67a195eb6 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -320,6 +320,7 @@ void idxd_wq_unmap_portal(struct idxd_wq *wq) devm_iounmap(dev, wq->portal); wq->portal = NULL; + wq->portal_offset = 0; } void idxd_wqs_unmap_portal(struct idxd_device *idxd) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 4e4dc0110e77..94983bced189 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -11,6 +11,7 @@ #include #include #include +#include #include "registers.h" #define IDXD_DRIVER_VERSION "1.00" @@ -162,6 +163,7 @@ struct idxd_dma_chan { struct idxd_wq { void __iomem *portal; + u32 portal_offset; struct percpu_ref wq_active; struct completion wq_dead; struct idxd_dev idxd_dev; @@ -468,6 +470,24 @@ static inline int idxd_get_wq_portal_full_offset(int wq_id, return ((wq_id * 4) << PAGE_SHIFT) + idxd_get_wq_portal_offset(prot); } +#define IDXD_PORTAL_MASK (PAGE_SIZE - 1) + +/* + * Even though this function can be accessed by multiple threads, it is safe to use. + * At worst the address gets used more than once before it gets incremented. We don't + * hit a threshold until iops becomes many million times a second. So the occasional + * reuse of the same address is tolerable compare to using an atomic variable. This is + * safe on a system that has atomic load/store for 32bit integers. Given that this is an + * Intel iEP device, that should not be a problem. + */ +static inline void __iomem *idxd_wq_portal_addr(struct idxd_wq *wq) +{ + int ofs = wq->portal_offset; + + wq->portal_offset = (ofs + sizeof(struct dsa_raw_desc)) & IDXD_PORTAL_MASK; + return wq->portal + ofs; +} + static inline void idxd_wq_get(struct idxd_wq *wq) { wq->client_count++; diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 65b0130ab2db..92ae9a157cc9 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -146,7 +146,7 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) if (!percpu_ref_tryget_live(&wq->wq_active)) return -ENXIO; - portal = wq->portal; + portal = idxd_wq_portal_addr(wq); /* * The wmb() flushes writes to coherent DMA data before -- Gitee From 88a888c9195c3ee31945943a10459e6a09d6d667 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Jul 2021 13:42:15 -0700 Subject: [PATCH 0664/2161] dmanegine: idxd: add software command status commit 125d10373ad991888c9e94d2da49bcc5ccba2127 upstream. Enabling device and wq returns standard errno and that does not provide enough details to indicate what exactly failed. The hardware command status is only 8bits. Expand the command status to 32bits and use the upper 16 bits to define software errors to provide more details on the exact failure. Bit 31 will be used to indicate the error is software set as the driver is using some of the spec defined hardware error as well. 
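A sketch of how a user tool could decode the widened value, following the layout just described: bit 31 marks a software-set error, the upper 16 bits carry the software code, and the low byte may still hold the hardware command status. The example constants and values mirror the enum added below.

#include <stdint.h>
#include <stdio.h>

#define IDXD_SCMD_SOFTERR_MASK	0x80000000u
#define IDXD_SCMD_SOFTERR_SHIFT	16

static void decode_cmd_status(uint32_t status)
{
	if (status & IDXD_SCMD_SOFTERR_MASK)
		printf("software error code %#x, hw cmd status %#x\n",
		       status >> IDXD_SCMD_SOFTERR_SHIFT, status & 0xff);
	else
		printf("hw cmd status %#x\n", status & 0xff);
}

int main(void)
{
	decode_cmd_status(0x800e0000);	/* IDXD_SCMD_WQ_NO_SIZE */
	decode_cmd_status(0x13);	/* plain hardware status */
	return 0;
}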
Cc: Ramesh Thomas Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162681373579.1968485.5891788397526827892.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../ABI/stable/sysfs-driver-dma-idxd | 2 ++ drivers/dma/idxd/cdev.c | 5 +++- drivers/dma/idxd/device.c | 22 +++++++++++++++--- drivers/dma/idxd/dma.c | 4 ++++ drivers/dma/idxd/idxd.h | 2 +- drivers/dma/idxd/sysfs.c | 11 ++++++++- include/uapi/linux/idxd.h | 23 +++++++++++++++++++ 7 files changed, 63 insertions(+), 6 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd index ebd521314803..c67c63bacd3c 100644 --- a/Documentation/ABI/stable/sysfs-driver-dma-idxd +++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd @@ -128,6 +128,8 @@ Date: Aug 28, 2020 KernelVersion: 5.10.0 Contact: dmaengine@vger.kernel.org Description: The last executed device administrative command's status/error. + Also last configuration error overloaded. + Writing to it will clear the status. What: /sys/bus/dsa/devices/wq./block_on_fault Date: Oct 27, 2020 diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index f6a4603517ba..4d2ecdb130e7 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -320,9 +320,12 @@ static int idxd_user_drv_probe(struct idxd_dev *idxd_dev) goto err; rc = idxd_wq_add_cdev(wq); - if (rc < 0) + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_CDEV_ERR; goto err_cdev; + } + idxd->cmd_status = 0; mutex_unlock(&wq->wq_lock); return 0; diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 41f67a195eb6..86fa4b4590f9 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -840,6 +840,7 @@ static int idxd_wq_config_write(struct idxd_wq *wq) wq->wqcfg->wq_size = wq->size; if (wq->size == 0) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_SIZE; dev_warn(dev, "Incorrect work queue size: 0\n"); return -EINVAL; } @@ -975,6 +976,7 @@ static int idxd_wqs_setup(struct idxd_device *idxd) continue; if (wq_shared(wq) && !device_swq_supported(idxd)) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_SWQ_SUPPORT; dev_warn(dev, "No shared wq support but configured.\n"); return -EINVAL; } @@ -983,8 +985,10 @@ static int idxd_wqs_setup(struct idxd_device *idxd) configured++; } - if (configured == 0) + if (configured == 0) { + idxd->cmd_status = IDXD_SCMD_WQ_NONE_CONFIGURED; return -EINVAL; + } return 0; } @@ -1140,21 +1144,26 @@ int __drv_enable_wq(struct idxd_wq *wq) lockdep_assert_held(&wq->wq_lock); - if (idxd->state != IDXD_DEV_ENABLED) + if (idxd->state != IDXD_DEV_ENABLED) { + idxd->cmd_status = IDXD_SCMD_DEV_NOT_ENABLED; goto err; + } if (wq->state != IDXD_WQ_DISABLED) { dev_dbg(dev, "wq %d already enabled.\n", wq->id); + idxd->cmd_status = IDXD_SCMD_WQ_ENABLED; rc = -EBUSY; goto err; } if (!wq->group) { dev_dbg(dev, "wq %d not attached to group.\n", wq->id); + idxd->cmd_status = IDXD_SCMD_WQ_NO_GRP; goto err; } if (strlen(wq->name) == 0) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_NAME; dev_dbg(dev, "wq %d name not set.\n", wq->id); goto err; } @@ -1162,6 +1171,7 @@ int __drv_enable_wq(struct idxd_wq *wq) /* Shared WQ checks */ if (wq_shared(wq)) { if (!device_swq_supported(idxd)) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_SVM; dev_dbg(dev, "PASID not enabled and shared wq.\n"); goto err; } @@ -1174,6 +1184,7 @@ int __drv_enable_wq(struct idxd_wq *wq) * threshold via sysfs. 
*/ if (wq->threshold == 0) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_THRESH; dev_dbg(dev, "Shared wq and threshold 0.\n"); goto err; } @@ -1197,6 +1208,7 @@ int __drv_enable_wq(struct idxd_wq *wq) rc = idxd_wq_map_portal(wq); if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_WQ_PORTAL_ERR; dev_dbg(dev, "wq %d portal mapping failed: %d\n", wq->id, rc); goto err_map_portal; } @@ -1259,8 +1271,10 @@ int idxd_device_drv_probe(struct idxd_dev *idxd_dev) * enabled state, then the device was altered outside of driver's control. * If the state is in halted state, then we don't want to proceed. */ - if (idxd->state != IDXD_DEV_DISABLED) + if (idxd->state != IDXD_DEV_DISABLED) { + idxd->cmd_status = IDXD_SCMD_DEV_ENABLED; return -ENXIO; + } /* Device configuration */ spin_lock_irqsave(&idxd->dev_lock, flags); @@ -1279,9 +1293,11 @@ int idxd_device_drv_probe(struct idxd_dev *idxd_dev) rc = idxd_register_dma_device(idxd); if (rc < 0) { idxd_device_disable(idxd); + idxd->cmd_status = IDXD_SCMD_DEV_DMA_ERR; return rc; } + idxd->cmd_status = 0; return 0; } diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index 2fd7ec29a08f..a195225687bb 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -284,22 +284,26 @@ static int idxd_dmaengine_drv_probe(struct idxd_dev *idxd_dev) rc = idxd_wq_alloc_resources(wq); if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_WQ_RES_ALLOC_ERR; dev_dbg(dev, "WQ resource alloc failed\n"); goto err_res_alloc; } rc = idxd_wq_init_percpu_ref(wq); if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_PERCPU_ERR; dev_dbg(dev, "percpu_ref setup failed\n"); goto err_ref; } rc = idxd_register_dma_channel(wq); if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_DMA_CHAN_ERR; dev_dbg(dev, "Failed to register dma channel\n"); goto err_dma; } + idxd->cmd_status = 0; mutex_unlock(&wq->wq_lock); return 0; diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 94983bced189..bfcb03329f77 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -252,7 +252,7 @@ struct idxd_device { unsigned long flags; int id; int major; - u8 cmd_status; + u32 cmd_status; struct pci_dev *pdev; void __iomem *reg_base; diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 881a12596d4b..4c01587c9d4a 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1217,7 +1217,16 @@ static ssize_t cmd_status_show(struct device *dev, return sysfs_emit(buf, "%#x\n", idxd->cmd_status); } -static DEVICE_ATTR_RO(cmd_status); + +static ssize_t cmd_status_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + idxd->cmd_status = 0; + return count; +} +static DEVICE_ATTR_RW(cmd_status); static struct attribute *idxd_device_attributes[] = { &dev_attr_version.attr, diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index e33997b4d750..1c0175aa0e42 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -9,6 +9,29 @@ #include #endif +/* Driver command error status */ +enum idxd_scmd_stat { + IDXD_SCMD_DEV_ENABLED = 0x80000010, + IDXD_SCMD_DEV_NOT_ENABLED = 0x80000020, + IDXD_SCMD_WQ_ENABLED = 0x80000021, + IDXD_SCMD_DEV_DMA_ERR = 0x80020000, + IDXD_SCMD_WQ_NO_GRP = 0x80030000, + IDXD_SCMD_WQ_NO_NAME = 0x80040000, + IDXD_SCMD_WQ_NO_SVM = 0x80050000, + IDXD_SCMD_WQ_NO_THRESH = 0x80060000, + IDXD_SCMD_WQ_PORTAL_ERR = 0x80070000, + IDXD_SCMD_WQ_RES_ALLOC_ERR = 0x80080000, + IDXD_SCMD_PERCPU_ERR = 0x80090000, + IDXD_SCMD_DMA_CHAN_ERR = 0x800a0000, + 
IDXD_SCMD_CDEV_ERR = 0x800b0000, + IDXD_SCMD_WQ_NO_SWQ_SUPPORT = 0x800c0000, + IDXD_SCMD_WQ_NONE_CONFIGURED = 0x800d0000, + IDXD_SCMD_WQ_NO_SIZE = 0x800e0000, +}; + +#define IDXD_SCMD_SOFTERR_MASK 0x80000000 +#define IDXD_SCMD_SOFTERR_SHIFT 16 + /* Descriptor flags */ #define IDXD_OP_FLAG_FENCE 0x0001 #define IDXD_OP_FLAG_BOF 0x0002 -- Gitee From 2742f38e671708b5164c5b7c6532e540b12ba97c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 22 Jul 2021 13:10:51 -0700 Subject: [PATCH 0665/2161] dmaengine: idxd: fix abort status check commit b60bb6e2bfc192091b8f792781b83b5e0f9324f6 upstream. Coverity static analysis of linux-next found issue. The check (status == IDXD_COMP_DESC_ABORT) is always false since status was previously masked with 0x7f and IDXD_COMP_DESC_ABORT is 0xff. Fixes: 6b4b87f2c31a ("dmaengine: idxd: fix submission race window") Reported-by: Colin Ian King Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162698465160.3560828.18173186265683415384.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/irq.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index e018459b534f..65dc7bbb0a13 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -194,7 +194,11 @@ static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, u8 status = desc->completion->status & DSA_COMP_STATUS_MASK; if (status) { - if (unlikely(status == IDXD_COMP_DESC_ABORT)) { + /* + * Check against the original status as ABORT is software defined + * and 0xff, which DSA_COMP_STATUS_MASK can mask out. + */ + if (unlikely(desc->completion->status == IDXD_COMP_DESC_ABORT)) { complete_desc(desc, IDXD_COMPLETE_ABORT); (*processed)++; continue; @@ -250,7 +254,11 @@ static int irq_process_work_list(struct idxd_irq_entry *irq_entry, list_for_each_entry(desc, &flist, list) { u8 status = desc->completion->status & DSA_COMP_STATUS_MASK; - if (unlikely(status == IDXD_COMP_DESC_ABORT)) { + /* + * Check against the original status as ABORT is software defined + * and 0xff, which DSA_COMP_STATUS_MASK can mask out. + */ + if (unlikely(desc->completion->status == IDXD_COMP_DESC_ABORT)) { complete_desc(desc, IDXD_COMPLETE_ABORT); continue; } -- Gitee From cfc65774bde1a4bcf6de227af52350be65602fbe Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 29 Jul 2021 14:04:01 +0200 Subject: [PATCH 0666/2161] dmaengine: idxd: Fix a possible NULL pointer dereference commit e9c5b0b53ccca81dd0a35c62309e243a57c7959d upstream. 'device_driver_attach()' dereferences its first argument (i.e. 'alt_drv') so it must not be NULL. Simplify the error handling logic about NULL 'alt_drv' in order to be more robust and future-proof. 
Fixes: 568b2126466f ("dmaengine: idxd: fix uninit var for alt_drv") Fixes: 6e7f3ee97bbe ("dmaengine: idxd: move dsa_drv support to compatible mode") Signed-off-by: Christophe JAILLET Acked-by: Dave Jiang Link: https://lore.kernel.org/r/77f0dc4f3966591d1f0cffb614a94085f8895a85.1627560174.git.christophe.jaillet@wanadoo.fr Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/compat.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/dma/idxd/compat.c b/drivers/dma/idxd/compat.c index 0edb2cd2e5ba..41c4975cd623 100644 --- a/drivers/dma/idxd/compat.c +++ b/drivers/dma/idxd/compat.c @@ -44,23 +44,16 @@ static ssize_t bind_store(struct device_driver *drv, const char *buf, size_t cou idxd_dev = confdev_to_idxd_dev(dev); if (is_idxd_dev(idxd_dev)) { alt_drv = driver_find("idxd", bus); - if (!alt_drv) - return -ENODEV; } else if (is_idxd_wq_dev(idxd_dev)) { struct idxd_wq *wq = confdev_to_wq(dev); - if (is_idxd_wq_kernel(wq)) { + if (is_idxd_wq_kernel(wq)) alt_drv = driver_find("dmaengine", bus); - if (!alt_drv) - return -ENODEV; - } else if (is_idxd_wq_user(wq)) { + else if (is_idxd_wq_user(wq)) alt_drv = driver_find("user", bus); - if (!alt_drv) - return -ENODEV; - } else { - return -ENODEV; - } } + if (!alt_drv) + return -ENODEV; rc = device_driver_attach(alt_drv, dev); if (rc < 0) -- Gitee From 0c0a36e86ace42d7e025b522ca1827e00a2f5985 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 2 Aug 2021 10:58:20 -0700 Subject: [PATCH 0667/2161] dmaengine: idxd: Remove unused status variable in irq_process_work_list() commit 53cbf462f6b5c9de364efdf443ffb74ed082463a upstream. status is no longer used within this block: drivers/dma/idxd/irq.c:255:6: warning: unused variable 'status' [-Wunused-variable] u8 status = desc->completion->status & DSA_COMP_STATUS_MASK; ^ 1 warning generated. Fixes: b60bb6e2bfc1 ("dmaengine: idxd: fix abort status check") Signed-off-by: Nathan Chancellor Acked-by: Dave Jiang Link: https://lore.kernel.org/r/20210802175820.3153920-1-nathan@kernel.org Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/irq.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 65dc7bbb0a13..91e46ca3a0ad 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -252,8 +252,6 @@ static int irq_process_work_list(struct idxd_irq_entry *irq_entry, spin_unlock_irqrestore(&irq_entry->list_lock, flags); list_for_each_entry(desc, &flist, list) { - u8 status = desc->completion->status & DSA_COMP_STATUS_MASK; - /* * Check against the original status as ABORT is software defined * and 0xff, which DSA_COMP_STATUS_MASK can mask out. -- Gitee From 71c03d72d5e0e9e4332e48cd71b7bcd9951b55dc Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 3 Aug 2021 15:32:06 -0700 Subject: [PATCH 0668/2161] dmaengine: idxd: add capability check for 'block on fault' attribute commit 81c2f79c2104c5b48f01da674bc2f7d4bc600db4 upstream. The device general capability has a bit that indicate whether 'block on fault' is supported. Add check to wq sysfs knob to check if cap exists before allowing user to toggle. 
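From user space the new check surfaces as EOPNOTSUPP when writing the knob; a minimal sketch, assuming the usual wq sysfs path (wq0.0 is a placeholder):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/bus/dsa/devices/wq0.0/block_on_fault", O_WRONLY);

	if (fd < 0) {
		perror("open block_on_fault");
		return 1;
	}
	if (write(fd, "1", 1) < 0) {
		if (errno == EOPNOTSUPP)
			fprintf(stderr, "device does not advertise block on fault\n");
		else
			perror("write block_on_fault");
	}
	close(fd);
	return 0;
}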
Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162802992615.3084999.12539468940404102898.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 4c01587c9d4a..a88886d0f27b 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -642,6 +642,9 @@ static ssize_t wq_block_on_fault_store(struct device *dev, bool bof; int rc; + if (!idxd->hw.gen_cap.block_on_fault) + return -EOPNOTSUPP; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) return -EPERM; -- Gitee From c5693ae10c0a3ce6094ffe9eb38dd532f37e6318 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 3 Aug 2021 15:37:15 -0700 Subject: [PATCH 0669/2161] dmaengine: idxd: clear block on fault flag when clear wq commit bd2f4ae5e019efcfadd6b491204fd60adf14f4a3 upstream. The block on fault flag is not cleared when we disable or reset wq. This causes it to remain set if the user does not clear it on the next configuration load. Add clear of flag in dxd_wq_disable_cleanup() routine. Fixes: da32b28c95a7 ("dmaengine: idxd: cleanup workqueue config after disabling") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162803023553.3086015.8158952172068868803.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 86fa4b4590f9..21f0d732b76e 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -402,6 +402,7 @@ static void idxd_wq_disable_cleanup(struct idxd_wq *wq) wq->priority = 0; wq->ats_dis = 0; clear_bit(WQ_FLAG_DEDICATED, &wq->flags); + clear_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags); memset(wq->name, 0, WQ_NAME_SIZE); } -- Gitee From 65396ea8db2d587431b86ed8e2e41911ff720d14 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 3 Aug 2021 15:29:30 -0700 Subject: [PATCH 0670/2161] dmaengine: idxd: make I/O interrupt handler one shot commit d803c8b9f3f2b8e5c047f2d0a27a9ea3ef91510f upstream. The interrupt thread handler currently loops forever to process outstanding completions. This causes either an "irq X: nobody cared" kernel splat or the NMI watchdog kicks in due to running too long in the function. The irq thread handler is expected to run again after exiting if there are interrupts fired while the thread handler is running. So the handler code can process all the completed I/O in a single pass and exit without losing the follow on completed I/O. 
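The resulting handler follows the standard one-shot threaded-IRQ pattern. The generic sketch below illustrates why a single pass is enough; the names my_dev and my_process_completions and the IRQF_ONESHOT request are illustrative only, not the idxd request path itself.

#include <linux/interrupt.h>

struct my_dev {
	/* per-device completion bookkeeping would live here */
	void *pending;
};

static void my_process_completions(struct my_dev *dev)
{
	/* walk the pending work once and complete finished descriptors */
}

static irqreturn_t my_irq_thread(int irq, void *data)
{
	struct my_dev *dev = data;

	/*
	 * One pass only: if the interrupt fires again while this runs,
	 * the irq core re-invokes the thread function after it returns,
	 * so later completions are picked up without looping here.
	 */
	my_process_completions(dev);
	return IRQ_HANDLED;
}

static int my_setup_irq(struct my_dev *dev, int irq)
{
	/* NULL hard handler + IRQF_ONESHOT is the canonical threaded form. */
	return request_threaded_irq(irq, NULL, my_irq_thread,
				    IRQF_ONESHOT, "my_dev", dev);
}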
Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162802977005.3084234.11836261157026497585.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/irq.c | 59 ++++++------------------------------------ 1 file changed, 8 insertions(+), 51 deletions(-) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 91e46ca3a0ad..11addb394793 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -22,11 +22,6 @@ struct idxd_fault { struct idxd_device *idxd; }; -static int irq_process_work_list(struct idxd_irq_entry *irq_entry, - int *processed, u64 data); -static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, - int *processed, u64 data); - static void idxd_device_reinit(struct work_struct *work) { struct idxd_device *idxd = container_of(work, struct idxd_device, work); @@ -177,18 +172,15 @@ irqreturn_t idxd_misc_thread(int vec, void *data) return IRQ_HANDLED; } -static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, - int *processed, u64 data) +static void irq_process_pending_llist(struct idxd_irq_entry *irq_entry) { struct idxd_desc *desc, *t; struct llist_node *head; - int queued = 0; unsigned long flags; - *processed = 0; head = llist_del_all(&irq_entry->pending_llist); if (!head) - goto out; + return; llist_for_each_entry_safe(desc, t, head, llnode) { u8 status = desc->completion->status & DSA_COMP_STATUS_MASK; @@ -200,35 +192,25 @@ static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, */ if (unlikely(desc->completion->status == IDXD_COMP_DESC_ABORT)) { complete_desc(desc, IDXD_COMPLETE_ABORT); - (*processed)++; continue; } complete_desc(desc, IDXD_COMPLETE_NORMAL); - (*processed)++; } else { spin_lock_irqsave(&irq_entry->list_lock, flags); list_add_tail(&desc->list, &irq_entry->work_list); spin_unlock_irqrestore(&irq_entry->list_lock, flags); - queued++; } } - - out: - return queued; } -static int irq_process_work_list(struct idxd_irq_entry *irq_entry, - int *processed, u64 data) +static void irq_process_work_list(struct idxd_irq_entry *irq_entry) { - int queued = 0; unsigned long flags; LIST_HEAD(flist); struct idxd_desc *desc, *n; - *processed = 0; - /* * This lock protects list corruption from access of list outside of the irq handler * thread. @@ -236,16 +218,13 @@ static int irq_process_work_list(struct idxd_irq_entry *irq_entry, spin_lock_irqsave(&irq_entry->list_lock, flags); if (list_empty(&irq_entry->work_list)) { spin_unlock_irqrestore(&irq_entry->list_lock, flags); - return 0; + return; } list_for_each_entry_safe(desc, n, &irq_entry->work_list, list) { if (desc->completion->status) { list_del(&desc->list); - (*processed)++; list_add_tail(&desc->list, &flist); - } else { - queued++; } } @@ -263,13 +242,11 @@ static int irq_process_work_list(struct idxd_irq_entry *irq_entry, complete_desc(desc, IDXD_COMPLETE_NORMAL); } - - return queued; } -static int idxd_desc_process(struct idxd_irq_entry *irq_entry) +irqreturn_t idxd_wq_thread(int irq, void *data) { - int rc, processed, total = 0; + struct idxd_irq_entry *irq_entry = data; /* * There are two lists we are processing. The pending_llist is where @@ -288,29 +265,9 @@ static int idxd_desc_process(struct idxd_irq_entry *irq_entry) * and process the completed entries. * 4. If the entry is still waiting on hardware, list_add_tail() to * the work_list. - * 5. Repeat until no more descriptors. 
*/ - do { - rc = irq_process_work_list(irq_entry, &processed, 0); - total += processed; - if (rc != 0) - continue; - - rc = irq_process_pending_llist(irq_entry, &processed, 0); - total += processed; - } while (rc != 0); - - return total; -} - -irqreturn_t idxd_wq_thread(int irq, void *data) -{ - struct idxd_irq_entry *irq_entry = data; - int processed; - - processed = idxd_desc_process(irq_entry); - if (processed == 0) - return IRQ_NONE; + irq_process_work_list(irq_entry); + irq_process_pending_llist(irq_entry); return IRQ_HANDLED; } -- Gitee From b5df7c54cb518f84c7602721d80495d9a58cc5bf Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 6 Aug 2021 08:36:43 -0700 Subject: [PATCH 0671/2161] dmaengine: idxd: remove interrupt flag for completion list spinlock commit 9fce3b3a0ab4cad407a27b5e36603c23f1b5b278 upstream. The list lock is never acquired in interrupt context. Therefore there is no need to disable interrupts. Remove interrupt flags for lock operations. Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162826417450.3454650.3733188117742416238.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/irq.c | 12 +++++------- drivers/dma/idxd/submit.c | 5 ++--- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 11addb394793..d221c2e37460 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -176,7 +176,6 @@ static void irq_process_pending_llist(struct idxd_irq_entry *irq_entry) { struct idxd_desc *desc, *t; struct llist_node *head; - unsigned long flags; head = llist_del_all(&irq_entry->pending_llist); if (!head) @@ -197,17 +196,16 @@ static void irq_process_pending_llist(struct idxd_irq_entry *irq_entry) complete_desc(desc, IDXD_COMPLETE_NORMAL); } else { - spin_lock_irqsave(&irq_entry->list_lock, flags); + spin_lock(&irq_entry->list_lock); list_add_tail(&desc->list, &irq_entry->work_list); - spin_unlock_irqrestore(&irq_entry->list_lock, flags); + spin_unlock(&irq_entry->list_lock); } } } static void irq_process_work_list(struct idxd_irq_entry *irq_entry) { - unsigned long flags; LIST_HEAD(flist); struct idxd_desc *desc, *n; @@ -215,9 +213,9 @@ static void irq_process_work_list(struct idxd_irq_entry *irq_entry) * This lock protects list corruption from access of list outside of the irq handler * thread. */ - spin_lock_irqsave(&irq_entry->list_lock, flags); + spin_lock(&irq_entry->list_lock); if (list_empty(&irq_entry->work_list)) { - spin_unlock_irqrestore(&irq_entry->list_lock, flags); + spin_unlock(&irq_entry->list_lock); return; } @@ -228,7 +226,7 @@ static void irq_process_work_list(struct idxd_irq_entry *irq_entry) } } - spin_unlock_irqrestore(&irq_entry->list_lock, flags); + spin_unlock(&irq_entry->list_lock); list_for_each_entry(desc, &flist, list) { /* diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 92ae9a157cc9..4b514c63af15 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -106,14 +106,13 @@ static void llist_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie, { struct idxd_desc *d, *t, *found = NULL; struct llist_node *head; - unsigned long flags; desc->completion->status = IDXD_COMP_DESC_ABORT; /* * Grab the list lock so it will block the irq thread handler. This allows the * abort code to locate the descriptor need to be aborted. 
*/ - spin_lock_irqsave(&ie->list_lock, flags); + spin_lock(&ie->list_lock); head = llist_del_all(&ie->pending_llist); if (head) { llist_for_each_entry_safe(d, t, head, llnode) { @@ -127,7 +126,7 @@ static void llist_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie, if (!found) found = list_abort_desc(wq, ie, desc); - spin_unlock_irqrestore(&ie->list_lock, flags); + spin_unlock(&ie->list_lock); if (found) complete_desc(found, IDXD_COMPLETE_ABORT); -- Gitee From 92845a0a70780b99912a3fab57de7ea64702d20c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 6 Aug 2021 10:37:40 -0700 Subject: [PATCH 0672/2161] dmaengine: idxd: make submit failure path consistent on desc freeing commit 0b030f54f094fcd42f4a607a675c1851129a58c8 upstream. The submission path for dmaengine API does not do descriptor freeing on failure. Also, with the abort mechanism, the freeing of descriptor happens when the abort callback is completed. Therefore free descriptor on all error paths for submission call to make things consistent. Also remove the double free that would happen on abort in idxd_dma_tx_submit() call. Fixes: 6b4b87f2c31a ("dmaengine: idxd: fix submission race window") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162827146072.3459011.10255348500504659810.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/dma.c | 4 +--- drivers/dma/idxd/submit.c | 11 +++++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index a195225687bb..5c0a4d8a31f5 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -149,10 +149,8 @@ static dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx) cookie = dma_cookie_assign(tx); rc = idxd_submit_desc(wq, desc); - if (rc < 0) { - idxd_free_desc(wq, desc); + if (rc < 0) return rc; - } return cookie; } diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 4b514c63af15..de76fb4abac2 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -139,11 +139,15 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) void __iomem *portal; int rc; - if (idxd->state != IDXD_DEV_ENABLED) + if (idxd->state != IDXD_DEV_ENABLED) { + idxd_free_desc(wq, desc); return -EIO; + } - if (!percpu_ref_tryget_live(&wq->wq_active)) + if (!percpu_ref_tryget_live(&wq->wq_active)) { + idxd_free_desc(wq, desc); return -ENXIO; + } portal = idxd_wq_portal_addr(wq); @@ -175,8 +179,11 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) rc = enqcmds(portal, desc->hw); if (rc < 0) { percpu_ref_put(&wq->wq_active); + /* abort operation frees the descriptor */ if (ie) llist_abort_desc(wq, ie, desc); + else + idxd_free_desc(wq, desc); return rc; } } -- Gitee From 056cad725895e5d544874b671d0021f56ae931dd Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 6 Aug 2021 10:38:37 -0700 Subject: [PATCH 0673/2161] dmaengine: idxd: set descriptor allocation size to threshold for swq commit 9806eb5c79579e200f88660e043706eb490547e6 upstream. Since submission is sent to limited portal, the actual wq size for shared wq is set by the threshold rather than the wq size. When the wq type is shared, set the allocated descriptors to the threshold. 
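As a rough illustration of the sizing rule above (a simplified sketch with made-up names, not the driver code): a dedicated WQ can use its full queue depth, while a shared WQ submitting through the limited portal is only guaranteed its threshold, so allocating more descriptors than that is wasted.

    #include <stdbool.h>

    /* Illustrative only: choose how many kernel descriptors to allocate
     * based on the WQ mode; names here are hypothetical. */
    static int usable_descs(bool dedicated, int wq_size, int wq_threshold)
    {
            return dedicated ? wq_size : wq_threshold;
    }
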
Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162827151733.3459223.3829837172226042408.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 21f0d732b76e..e093cf225a5c 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -141,8 +141,8 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) if (wq->type != IDXD_WQT_KERNEL) return 0; - wq->num_descs = wq->size; - num_descs = wq->size; + num_descs = wq_dedicated(wq) ? wq->size : wq->threshold; + wq->num_descs = num_descs; rc = alloc_hw_descs(wq, num_descs); if (rc < 0) -- Gitee From 681e5e833cfce3ed529d6afb50a6e584f6e4d156 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 19 Aug 2021 09:34:06 -0700 Subject: [PATCH 0674/2161] dmaengine: idxd: fix setting up priv mode for dwq commit d8071323c5632bdf0a8ef9b9e5662fac43649f9d upstream. DSA spec says WQ priv bit is 0 if the Privileged Mode Enable field of the PCI Express PASID capability is 0 and pasid is enabled. Make sure that the WQCFG priv field is set correctly according to usage type. Reject config if setting up kernel WQ type and no support. Also add the correct priv setup for a descriptor. Fixes: 484f910e93b4 ("dmaengine: idxd: fix wq config registers offset programming") Cc: Ramesh Thomas Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162939084657.903168.14160019185148244596.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/Kconfig | 1 + drivers/dma/idxd/device.c | 29 ++++++++++++++++++++++++++++- drivers/dma/idxd/dma.c | 6 +++++- include/uapi/linux/idxd.h | 1 + 4 files changed, 35 insertions(+), 2 deletions(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 63f7e2fc8760..fb75d6af5726 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -281,6 +281,7 @@ config INTEL_IDXD tristate "Intel Data Accelerators support" depends on PCI && X86_64 && !UML depends on PCI_MSI + depends on PCI_PASID depends on SBITMAP select DMA_ENGINE help diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index e093cf225a5c..241df74fc047 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -818,6 +818,15 @@ static int idxd_groups_config_write(struct idxd_device *idxd) return 0; } +static bool idxd_device_pasid_priv_enabled(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + + if (pdev->pasid_enabled && (pdev->pasid_features & PCI_PASID_CAP_PRIV)) + return true; + return false; +} + static int idxd_wq_config_write(struct idxd_wq *wq) { struct idxd_device *idxd = wq->idxd; @@ -850,7 +859,6 @@ static int idxd_wq_config_write(struct idxd_wq *wq) wq->wqcfg->wq_thresh = wq->threshold; /* byte 8-11 */ - wq->wqcfg->priv = !!(wq->type == IDXD_WQT_KERNEL); if (wq_dedicated(wq)) wq->wqcfg->mode = 1; @@ -860,6 +868,25 @@ static int idxd_wq_config_write(struct idxd_wq *wq) wq->wqcfg->pasid = idxd->pasid; } + /* + * Here the priv bit is set depending on the WQ type. priv = 1 if the + * WQ type is kernel to indicate privileged access. This setting only + * matters for dedicated WQ. According to the DSA spec: + * If the WQ is in dedicated mode, WQ PASID Enable is 1, and the + * Privileged Mode Enable field of the PCI Express PASID capability + * is 0, this field must be 0. 
+ * + * In the case of a dedicated kernel WQ that is not able to support + * the PASID cap, then the configuration will be rejected. + */ + wq->wqcfg->priv = !!(wq->type == IDXD_WQT_KERNEL); + if (wq_dedicated(wq) && wq->wqcfg->pasid_en && + !idxd_device_pasid_priv_enabled(idxd) && + wq->type == IDXD_WQT_KERNEL) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_PRIV; + return -EOPNOTSUPP; + } + wq->wqcfg->priority = wq->priority; if (idxd->hw.gen_cap.block_on_fault && diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index 5c0a4d8a31f5..e0f056c1d1f5 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -69,7 +69,11 @@ static inline void idxd_prep_desc_common(struct idxd_wq *wq, hw->src_addr = addr_f1; hw->dst_addr = addr_f2; hw->xfer_size = len; - hw->priv = !!(wq->type == IDXD_WQT_KERNEL); + /* + * For dedicated WQ, this field is ignored and HW will use the WQCFG.priv + * field instead. This field should be set to 1 for kernel descriptors. + */ + hw->priv = 1; hw->completion_addr = compl; } diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 1c0175aa0e42..3a6c2fe812d4 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -27,6 +27,7 @@ enum idxd_scmd_stat { IDXD_SCMD_WQ_NO_SWQ_SUPPORT = 0x800c0000, IDXD_SCMD_WQ_NONE_CONFIGURED = 0x800d0000, IDXD_SCMD_WQ_NO_SIZE = 0x800e0000, + IDXD_SCMD_WQ_NO_PRIV = 0x800f0000, }; #define IDXD_SCMD_SOFTERR_MASK 0x80000000 -- Gitee From 77570e39cf5bbc6740f1be95620193ab3778a491 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 24 Aug 2021 14:24:39 -0700 Subject: [PATCH 0675/2161] dmaengine: idxd: remove interrupt disable for cmd_lock commit f9f4082dbc56c40093bcb5c1f62c04a916eca9a2 upstream. The cmd_lock spinlock is not being used in hard interrupt context. There is no need to disable irq when acquiring the lock. Convert all cmd_lock acquisition to plain spin_lock() calls. 
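In general terms, the conversion looks like the sketch below (a generic example, not the idxd code itself): when a lock is only ever acquired from process context and never from a hard interrupt handler, disabling interrupts around it buys nothing, so the plain lock/unlock pair suffices.

    #include <linux/spinlock.h>

    /* Generic example, assuming this lock is never taken in hard-IRQ
     * context (the precondition the patch relies on). */
    static DEFINE_SPINLOCK(example_lock);
    static int example_state;

    static void update_example_state(void)
    {
            /* was: spin_lock_irqsave(&example_lock, flags); */
            spin_lock(&example_lock);
            example_state++;
            spin_unlock(&example_lock);
            /* was: spin_unlock_irqrestore(&example_lock, flags); */
    }
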
Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162984027930.1939209.15758413737332339204.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 241df74fc047..4f6516d7555f 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -462,7 +462,6 @@ int idxd_device_init_reset(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; union idxd_command_reg cmd; - unsigned long flags; if (idxd_device_is_halted(idxd)) { dev_warn(&idxd->pdev->dev, "Device is HALTED!\n"); @@ -472,13 +471,13 @@ int idxd_device_init_reset(struct idxd_device *idxd) memset(&cmd, 0, sizeof(cmd)); cmd.cmd = IDXD_CMD_RESET_DEVICE; dev_dbg(dev, "%s: sending reset for init.\n", __func__); - spin_lock_irqsave(&idxd->cmd_lock, flags); + spin_lock(&idxd->cmd_lock); iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET); while (ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET) & IDXD_CMDSTS_ACTIVE) cpu_relax(); - spin_unlock_irqrestore(&idxd->cmd_lock, flags); + spin_unlock(&idxd->cmd_lock); return 0; } @@ -487,7 +486,6 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, { union idxd_command_reg cmd; DECLARE_COMPLETION_ONSTACK(done); - unsigned long flags; u32 stat; if (idxd_device_is_halted(idxd)) { @@ -502,7 +500,7 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, cmd.operand = operand; cmd.int_req = 1; - spin_lock_irqsave(&idxd->cmd_lock, flags); + spin_lock(&idxd->cmd_lock); wait_event_lock_irq(idxd->cmd_waitq, !test_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags), idxd->cmd_lock); @@ -519,10 +517,10 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, * After command submitted, release lock and go to sleep until * the command completes via interrupt. 
*/ - spin_unlock_irqrestore(&idxd->cmd_lock, flags); + spin_unlock(&idxd->cmd_lock); wait_for_completion(&done); stat = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); - spin_lock_irqsave(&idxd->cmd_lock, flags); + spin_lock(&idxd->cmd_lock); if (status) *status = stat; idxd->cmd_status = stat & GENMASK(7, 0); @@ -530,7 +528,7 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, __clear_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags); /* Wake up other pending commands */ wake_up(&idxd->cmd_waitq); - spin_unlock_irqrestore(&idxd->cmd_lock, flags); + spin_unlock(&idxd->cmd_lock); } int idxd_device_enable(struct idxd_device *idxd) @@ -641,7 +639,6 @@ int idxd_device_release_int_handle(struct idxd_device *idxd, int handle, struct device *dev = &idxd->pdev->dev; u32 operand, status; union idxd_command_reg cmd; - unsigned long flags; if (!(idxd->hw.cmd_cap & BIT(IDXD_CMD_RELEASE_INT_HANDLE))) return -EOPNOTSUPP; @@ -659,13 +656,13 @@ int idxd_device_release_int_handle(struct idxd_device *idxd, int handle, dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_RELEASE_INT_HANDLE, operand); - spin_lock_irqsave(&idxd->cmd_lock, flags); + spin_lock(&idxd->cmd_lock); iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET); while (ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET) & IDXD_CMDSTS_ACTIVE) cpu_relax(); status = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); - spin_unlock_irqrestore(&idxd->cmd_lock, flags); + spin_unlock(&idxd->cmd_lock); if ((status & IDXD_CMDSTS_ERR_MASK) != IDXD_CMDSTS_SUCCESS) { dev_dbg(dev, "release int handle failed: %#x\n", status); -- Gitee From 0bd8a6966b0e511bff2235d5e309b1e5ee70d207 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 24 Aug 2021 14:24:27 -0700 Subject: [PATCH 0676/2161] dmaengine: idxd: remove interrupt disable for dev_lock commit f9f4082dbc56c40093bcb5c1f62c04a916eca9a2 upstream. The spinlock is not being used in hard interrupt context. There is no need to disable irq when acquiring the lock. The interrupt thread handler also is not in bottom half context, therefore we can also remove disabling of the bh. Convert all dev_lock acquisition to plain spin_lock() calls. 
Reviewed-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162984026772.1939166.11504067782824765879.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/cdev.c | 5 ++--- drivers/dma/idxd/device.c | 31 ++++++++++++------------------- drivers/dma/idxd/irq.c | 8 ++++---- drivers/dma/idxd/sysfs.c | 10 ++++------ 4 files changed, 22 insertions(+), 32 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 4d2ecdb130e7..b9b2b4a4124e 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -218,14 +218,13 @@ static __poll_t idxd_cdev_poll(struct file *filp, struct idxd_user_context *ctx = filp->private_data; struct idxd_wq *wq = ctx->wq; struct idxd_device *idxd = wq->idxd; - unsigned long flags; __poll_t out = 0; poll_wait(filp, &wq->err_queue, wait); - spin_lock_irqsave(&idxd->dev_lock, flags); + spin_lock(&idxd->dev_lock); if (idxd->sw_err.valid) out = EPOLLIN | EPOLLRDNORM; - spin_unlock_irqrestore(&idxd->dev_lock, flags); + spin_unlock(&idxd->dev_lock); return out; } diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 4f6516d7555f..83a5ff2ecf2a 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -341,19 +341,18 @@ int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid) int rc; union wqcfg wqcfg; unsigned int offset; - unsigned long flags; rc = idxd_wq_disable(wq, false); if (rc < 0) return rc; offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PASID_IDX); - spin_lock_irqsave(&idxd->dev_lock, flags); + spin_lock(&idxd->dev_lock); wqcfg.bits[WQCFG_PASID_IDX] = ioread32(idxd->reg_base + offset); wqcfg.pasid_en = 1; wqcfg.pasid = pasid; iowrite32(wqcfg.bits[WQCFG_PASID_IDX], idxd->reg_base + offset); - spin_unlock_irqrestore(&idxd->dev_lock, flags); + spin_unlock(&idxd->dev_lock); rc = idxd_wq_enable(wq); if (rc < 0) @@ -368,19 +367,18 @@ int idxd_wq_disable_pasid(struct idxd_wq *wq) int rc; union wqcfg wqcfg; unsigned int offset; - unsigned long flags; rc = idxd_wq_disable(wq, false); if (rc < 0) return rc; offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PASID_IDX); - spin_lock_irqsave(&idxd->dev_lock, flags); + spin_lock(&idxd->dev_lock); wqcfg.bits[WQCFG_PASID_IDX] = ioread32(idxd->reg_base + offset); wqcfg.pasid_en = 0; wqcfg.pasid = 0; iowrite32(wqcfg.bits[WQCFG_PASID_IDX], idxd->reg_base + offset); - spin_unlock_irqrestore(&idxd->dev_lock, flags); + spin_unlock(&idxd->dev_lock); rc = idxd_wq_enable(wq); if (rc < 0) @@ -558,7 +556,6 @@ int idxd_device_disable(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; u32 status; - unsigned long flags; if (!idxd_is_enabled(idxd)) { dev_dbg(dev, "Device is not enabled\n"); @@ -574,22 +571,20 @@ int idxd_device_disable(struct idxd_device *idxd) return -ENXIO; } - spin_lock_irqsave(&idxd->dev_lock, flags); + spin_lock(&idxd->dev_lock); idxd_device_clear_state(idxd); idxd->state = IDXD_DEV_DISABLED; - spin_unlock_irqrestore(&idxd->dev_lock, flags); + spin_unlock(&idxd->dev_lock); return 0; } void idxd_device_reset(struct idxd_device *idxd) { - unsigned long flags; - idxd_cmd_exec(idxd, IDXD_CMD_RESET_DEVICE, 0, NULL); - spin_lock_irqsave(&idxd->dev_lock, flags); + spin_lock(&idxd->dev_lock); idxd_device_clear_state(idxd); idxd->state = IDXD_DEV_DISABLED; - spin_unlock_irqrestore(&idxd->dev_lock, flags); + spin_unlock(&idxd->dev_lock); } void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid) @@ -1164,7 +1159,6 @@ int __drv_enable_wq(struct idxd_wq *wq) { struct 
idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; - unsigned long flags; int rc = -ENXIO; lockdep_assert_held(&wq->wq_lock); @@ -1216,10 +1210,10 @@ int __drv_enable_wq(struct idxd_wq *wq) } rc = 0; - spin_lock_irqsave(&idxd->dev_lock, flags); + spin_lock(&idxd->dev_lock); if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) rc = idxd_device_config(idxd); - spin_unlock_irqrestore(&idxd->dev_lock, flags); + spin_unlock(&idxd->dev_lock); if (rc < 0) { dev_dbg(dev, "Writing wq %d config failed: %d\n", wq->id, rc); goto err; @@ -1288,7 +1282,6 @@ void drv_disable_wq(struct idxd_wq *wq) int idxd_device_drv_probe(struct idxd_dev *idxd_dev) { struct idxd_device *idxd = idxd_dev_to_idxd(idxd_dev); - unsigned long flags; int rc = 0; /* @@ -1302,10 +1295,10 @@ int idxd_device_drv_probe(struct idxd_dev *idxd_dev) } /* Device configuration */ - spin_lock_irqsave(&idxd->dev_lock, flags); + spin_lock(&idxd->dev_lock); if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) rc = idxd_device_config(idxd); - spin_unlock_irqrestore(&idxd->dev_lock, flags); + spin_unlock(&idxd->dev_lock); if (rc < 0) return -ENXIO; diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index d221c2e37460..ca88fa7a328e 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -64,7 +64,7 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) bool err = false; if (cause & IDXD_INTC_ERR) { - spin_lock_bh(&idxd->dev_lock); + spin_lock(&idxd->dev_lock); for (i = 0; i < 4; i++) idxd->sw_err.bits[i] = ioread64(idxd->reg_base + IDXD_SWERR_OFFSET + i * sizeof(u64)); @@ -89,7 +89,7 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) } } - spin_unlock_bh(&idxd->dev_lock); + spin_unlock(&idxd->dev_lock); val |= IDXD_INTC_ERR; for (i = 0; i < 4; i++) @@ -133,7 +133,7 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) INIT_WORK(&idxd->work, idxd_device_reinit); queue_work(idxd->wq, &idxd->work); } else { - spin_lock_bh(&idxd->dev_lock); + spin_lock(&idxd->dev_lock); idxd_wqs_quiesce(idxd); idxd_wqs_unmap_portal(idxd); idxd_device_clear_state(idxd); @@ -141,7 +141,7 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) "idxd halted, need %s.\n", gensts.reset_type == IDXD_DEVICE_RESET_FLR ? 
"FLR" : "system reset"); - spin_unlock_bh(&idxd->dev_lock); + spin_unlock(&idxd->dev_lock); return -ENXIO; } } diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index a88886d0f27b..a9025be940db 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1099,16 +1099,15 @@ static ssize_t clients_show(struct device *dev, struct device_attribute *attr, char *buf) { struct idxd_device *idxd = confdev_to_idxd(dev); - unsigned long flags; int count = 0, i; - spin_lock_irqsave(&idxd->dev_lock, flags); + spin_lock(&idxd->dev_lock); for (i = 0; i < idxd->max_wqs; i++) { struct idxd_wq *wq = idxd->wqs[i]; count += wq->client_count; } - spin_unlock_irqrestore(&idxd->dev_lock, flags); + spin_unlock(&idxd->dev_lock); return sysfs_emit(buf, "%d\n", count); } @@ -1146,12 +1145,11 @@ static ssize_t errors_show(struct device *dev, { struct idxd_device *idxd = confdev_to_idxd(dev); int i, out = 0; - unsigned long flags; - spin_lock_irqsave(&idxd->dev_lock, flags); + spin_lock(&idxd->dev_lock); for (i = 0; i < 4; i++) out += sysfs_emit_at(buf, out, "%#018llx ", idxd->sw_err.bits[i]); - spin_unlock_irqrestore(&idxd->dev_lock, flags); + spin_unlock(&idxd->dev_lock); out--; out += sysfs_emit_at(buf, out, "\n"); return out; -- Gitee From ca67794fc4b989144a8e1a4e081458de618636ef Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 29 Sep 2021 12:15:38 -0700 Subject: [PATCH 0677/2161] dmaengine: idxd: move out percpu_ref_exit() to ensure it's outside submission commit 85f604af9c83a4656b1d07bec73298c3ba7d7c1e upstream. percpu_ref_tryget_live() is safe to call as long as ref is between init and exit according to the function comment. Move percpu_ref_exit() so it is called after the dma channel is no longer valid to ensure this holds true. Fixes: 93a40a6d7428 ("dmaengine: idxd: add percpu_ref to descriptor submission path") Suggested-by: Kevin Tian Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163294293832.914350.10326422026738506152.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 1 - drivers/dma/idxd/dma.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 83a5ff2ecf2a..cbbfa17d8d11 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -427,7 +427,6 @@ void idxd_wq_quiesce(struct idxd_wq *wq) { percpu_ref_kill(&wq->wq_active); wait_for_completion(&wq->wq_dead); - percpu_ref_exit(&wq->wq_active); } /* Device control bits */ diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index e0f056c1d1f5..b90b085d18cf 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -311,6 +311,7 @@ static int idxd_dmaengine_drv_probe(struct idxd_dev *idxd_dev) err_dma: idxd_wq_quiesce(wq); + percpu_ref_exit(&wq->wq_active); err_ref: idxd_wq_free_resources(wq); err_res_alloc: @@ -329,6 +330,7 @@ static void idxd_dmaengine_drv_remove(struct idxd_dev *idxd_dev) idxd_wq_quiesce(wq); idxd_unregister_dma_channel(wq); __drv_disable_wq(wq); + percpu_ref_exit(&wq->wq_active); idxd_wq_free_resources(wq); wq->type = IDXD_WQT_NONE; mutex_unlock(&wq->wq_lock); -- Gitee From 717756334fadb9f47f86f180ed39b47e9ac8e046 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 15 Oct 2021 13:34:47 +0100 Subject: [PATCH 0678/2161] dmaengine: Remove redundant initialization of variable err commit 1f6a89efbf9981a637bf472a5662e4d3746af530 upstream. 
The variable err is being initialized with a value that is never read, it is being updated later on. The assignment is redundant and can be removed and move the declaration into the local scope. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20211015123447.27560-1-colin.king@canonical.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index af3ee288bc11..d9f7c097cfd6 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -695,13 +695,12 @@ static struct dma_chan *find_candidate(struct dma_device *device, */ struct dma_chan *dma_get_slave_channel(struct dma_chan *chan) { - int err = -EBUSY; - /* lock against __dma_request_channel */ mutex_lock(&dma_list_mutex); if (chan->client_count == 0) { struct dma_device *device = chan->device; + int err; dma_cap_set(DMA_PRIVATE, device->cap_mask); device->privatecnt++; -- Gitee From cc93a84fa661ed7384ce9b4c4bbc82bc446e2e32 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 12 Oct 2021 11:01:59 -0700 Subject: [PATCH 0679/2161] dmaengine: idxd: check GENCAP config support for gencfg register commit 79c4c3db7d86b9bec94562275efc82e58f3d0132 upstream. DSA spec 1.2 has moved the GENCFG register under the GENCAP configuration support with respect to writability. Add check in driver before writing to GENCFG register. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163406171896.1303830.11217958011385656998.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index cbbfa17d8d11..27612329f510 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -791,7 +791,7 @@ static int idxd_groups_config_write(struct idxd_device *idxd) struct device *dev = &idxd->pdev->dev; /* Setup bandwidth token limit */ - if (idxd->token_limit) { + if (idxd->hw.gen_cap.config_en && idxd->token_limit) { reg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET); reg.token_limit = idxd->token_limit; iowrite32(reg.bits, idxd->reg_base + IDXD_GENCFG_OFFSET); -- Gitee From 09b0c65042d95c4378b132ed89bf48757a1ad43f Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 12 Oct 2021 11:01:19 -0700 Subject: [PATCH 0680/2161] dmaengine: idxd: remove gen cap field per spec 1.2 update commit c5b64b6826e020041ef4f68a281dee2f33c827d6 upstream. Remove max_descs_per_engine field. The recently released DSA spec 1.2 [1] has removed this field and made it reserved. 
[1]: https://software.intel.com/content/dam/develop/external/us/en/documents-tps/341204-intel-data-streaming-accelerator-spec.pdf Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163406167978.1303649.1798682437841822837.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/registers.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index ffc7550a77ee..eeb11e6eb25b 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -36,8 +36,7 @@ union gen_cap_reg { u64 max_batch_shift:4; u64 max_ims_mult:6; u64 config_en:1; - u64 max_descs_per_engine:8; - u64 rsvd3:24; + u64 rsvd3:32; }; u64 bits; } __packed; -- Gitee From 4cd505c1297c6be8d7eb33b38d39ca41c8cfa8ea Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 20 Oct 2021 09:27:25 -0700 Subject: [PATCH 0681/2161] dmaengine: idxd: remove kernel wq type set when load configuration commit 15af840831f69baa9efb0d50007459d2015397a5 upstream. Remove setting of wq type on guest kernel during configuration load on RO device config. The user will set the kernel wq type and this setting based on config is not necessary. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163474724511.2607444.1876715711451990426.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 27612329f510..ef3281fc3f8b 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -1050,8 +1050,6 @@ static int idxd_wq_load_config(struct idxd_wq *wq) wq->size = wq->wqcfg->wq_size; wq->threshold = wq->wqcfg->wq_thresh; - if (wq->wqcfg->priv) - wq->type = IDXD_WQT_KERNEL; /* The driver does not support shared WQ mode in read-only config yet */ if (wq->wqcfg->mode == 0 || wq->wqcfg->pasid_en) -- Gitee From 7a5a2e3242a92b0b457afa375a8cafce86e12200 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 21 Sep 2021 13:15:58 -0700 Subject: [PATCH 0682/2161] dmanegine: idxd: fix resource free ordering on driver removal commit 98da0106aac0d3c5d4a3c95d238f1ff88957bbfc upstream. Fault triggers on ioread32() when pci driver unbind is envoked. The placement of idxd sub-driver removal causes the probing of the device mmio region after the mmio mapping being torn down. The driver needs the sub-drivers to be unbound but not release the idxd context until all shutdown activities has been done. Move the sub-driver unregistering up before the remove() calls shutdown(). But take a device ref on the idxd->conf_dev so that the memory does not get freed in ->release(). When all cleanup activities has been done, release the ref to allow the idxd memory to be freed. 
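The refcounting pattern relied on here can be sketched generically (hypothetical names, not the actual remove path): take an extra reference before unregistering so ->release() cannot run while later cleanup still dereferences the device's private data, and drop it once teardown is complete.

    #include <linux/device.h>

    /* Generic sketch of the teardown ordering described above; the
     * function is a hypothetical stand-in. */
    static void example_remove(struct device *conf_dev)
    {
            get_device(conf_dev);           /* hold the memory past unregister */
            device_unregister(conf_dev);    /* unbind; ->release() is deferred */

            /* ... shutdown work that still touches the device state ... */

            put_device(conf_dev);           /* drop the ref; ->release() may now free */
    }
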
[57159.542766] RIP: 0010:ioread32+0x27/0x60 [57159.547097] Code: 00 66 90 48 81 ff ff ff 03 00 77 1e 48 81 ff 00 00 01 00 76 05 0f b7 d7 ed c3 8b 15 03 50 41 01 b8 ff ff ff ff 85 d2 75 04 c3 <8b> 07 c3 55 83 ea 01 48 89 fe 48 c7 c7 00 70 5f 82 48 89 e5 48 83 [57159.566647] RSP: 0018:ffffc900011abb60 EFLAGS: 00010292 [57159.572295] RAX: ffffc900011e0000 RBX: ffff888107d39800 RCX: 0000000000000000 [57159.579842] RDX: 0000000000000000 RSI: ffffffff82b1e448 RDI: ffffc900011e0090 [57159.587421] RBP: ffffc900011abb88 R08: 0000000000000000 R09: 0000000000000001 [57159.594972] R10: 0000000000000001 R11: 0000000000000000 R12: ffff8881019840d0 [57159.602533] R13: ffff8881097e9000 R14: ffffffffa08542a0 R15: 00000000000003a8 [57159.610093] FS: 00007f991e0a8740(0000) GS:ffff888459900000(0000) knlGS:00000000000 00000 [57159.618614] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [57159.624814] CR2: ffffc900011e0090 CR3: 000000010862a002 CR4: 00000000003706e0 [57159.632397] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [57159.639973] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [57159.647601] Call Trace: [57159.650502] ? idxd_device_disable+0x41/0x110 [idxd] [57159.655948] idxd_device_drv_remove+0x2b/0x80 [idxd] [57159.661374] idxd_config_bus_remove+0x16/0x20 [57159.666191] __device_release_driver+0x163/0x240 [57159.671320] device_release_driver+0x2b/0x40 [57159.676052] bus_remove_device+0xf5/0x160 [57159.680524] device_del+0x19c/0x400 [57159.684440] device_unregister+0x18/0x60 [57159.688792] idxd_remove+0x140/0x1c0 [idxd] [57159.693406] pci_device_remove+0x3e/0xb0 [57159.697758] __device_release_driver+0x163/0x240 [57159.702788] device_driver_detach+0x43/0xb0 [57159.707424] unbind_store+0x11e/0x130 [57159.711537] drv_attr_store+0x24/0x30 [57159.715646] sysfs_kf_write+0x4b/0x60 [57159.719710] kernfs_fop_write_iter+0x153/0x1e0 [57159.724563] new_sync_write+0x120/0x1b0 [57159.728812] vfs_write+0x23e/0x350 [57159.732624] ksys_write+0x70/0xf0 [57159.736335] __x64_sys_write+0x1a/0x20 [57159.740492] do_syscall_64+0x3b/0x90 [57159.744465] entry_SYSCALL_64_after_hwframe+0x44/0xae [57159.749908] RIP: 0033:0x7f991e19c387 [57159.753898] Code: 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 [57159.773564] RSP: 002b:00007ffc2ce2d6a8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [57159.781550] RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007f991e19c387 [57159.789133] RDX: 000000000000000c RSI: 000055ee2630e140 RDI: 0000000000000001 [57159.796695] RBP: 000055ee2630e140 R08: 0000000000000000 R09: 00007f991e2324e0 [57159.804246] R10: 00007f991e2323e0 R11: 0000000000000246 R12: 000000000000000c [57159.811800] R13: 00007f991e26f520 R14: 000000000000000c R15: 00007f991e26f700 [57159.819373] Modules linked in: idxd bridge stp llc bnep sunrpc nls_iso8859_1 intel_ rapl_msr intel_rapl_common x86_pkg_temp_thermal intel_powerclamp coretemp snd_hda_code c_realtek iTCO_wdt 8250_dw snd_hda_codec_generic kvm_intel ledtrig_audio iTCO_vendor_s upport snd_hda_intel snd_intel_dspcfg ppdev kvm snd_hda_codec intel_wmi_thunderbolt sn d_hwdep irqbypass iwlwifi btusb snd_hda_core rapl btrtl intel_cstate snd_seq btbcm snd _seq_device btintel snd_pcm cfg80211 bluetooth pcspkr psmouse input_leds snd_timer int el_lpss_pci mei_me intel_lpss snd ecdh_generic ecc mei ucsi_acpi i2c_i801 idma64 i2c_s mbus virt_dma soundcore typec_ucsi typec wmi parport_pc parport 
video mac_hid acpi_pad sch_fq_codel drm ip_tables x_tables crct10dif_pclmul crc32_pclmul ghash_clmulni_intel usbkbd hid_generic usbmouse aesni_intel usbhid crypto_simd cryptd e1000e hid serio_ra w ahci libahci pinctrl_sunrisepoint fuse msr autofs4 [last unloaded: idxd] [57159.904082] CR2: ffffc900011e0090 [57159.907877] ---[ end trace b4e32f49ce9176a4 ]--- Fixes: 49c4959f04b5 ("dmaengine: idxd: fix sequence for pci driver remove() and shutdown()") Reported-by: Ziye Yang Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163225535868.4152687.9318737776682088722.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index eb09bc591c31..7bf03f371ce1 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -797,11 +797,19 @@ static void idxd_remove(struct pci_dev *pdev) int msixcnt = pci_msix_vec_count(pdev); int i; - dev_dbg(&pdev->dev, "%s called\n", __func__); + idxd_unregister_devices(idxd); + /* + * When ->release() is called for the idxd->conf_dev, it frees all the memory related + * to the idxd context. The driver still needs those bits in order to do the rest of + * the cleanup. However, we do need to unbound the idxd sub-driver. So take a ref + * on the device here to hold off the freeing while allowing the idxd sub-driver + * to unbind. + */ + get_device(idxd_confdev(idxd)); + device_unregister(idxd_confdev(idxd)); idxd_shutdown(pdev); if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); - idxd_unregister_devices(idxd); for (i = 0; i < msixcnt; i++) { irq_entry = &idxd->irq_entries[i]; @@ -815,7 +823,7 @@ static void idxd_remove(struct pci_dev *pdev) pci_disable_device(pdev); destroy_workqueue(idxd->wq); perfmon_pmu_remove(idxd); - device_unregister(idxd_confdev(idxd)); + put_device(idxd_confdev(idxd)); } static struct pci_driver idxd_pci_driver = { -- Gitee From 39329ecdceb7ad5a7b6aaa9fc5b7b75c724b201c Mon Sep 17 00:00:00 2001 From: Bixuan Cui Date: Wed, 8 Sep 2021 17:28:26 +0800 Subject: [PATCH 0683/2161] dmaengine: idxd: Use list_move_tail instead of list_del/list_add_tail commit ee5c6f0ca219b65f5085043d481d9b6f045693d5 upstream. Using list_move_tail() instead of list_del() + list_add_tail() Reported-by: Hulk Robot Signed-off-by: Bixuan Cui Acked-by: Dave Jiang Link: https://lore.kernel.org/r/20210908092826.67765-1-cuibixuan@huawei.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/irq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index ca88fa7a328e..79fcfc4883e4 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -221,8 +221,7 @@ static void irq_process_work_list(struct idxd_irq_entry *irq_entry) list_for_each_entry_safe(desc, n, &irq_entry->work_list, list) { if (desc->completion->status) { - list_del(&desc->list); - list_add_tail(&desc->list, &flist); + list_move_tail(&desc->list, &flist); } } -- Gitee From 34110077f88ec6706b6a87b42597b4d5b2f2ff69 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 8 Sep 2021 16:04:03 -0700 Subject: [PATCH 0684/2161] dmaengine: idxd: add halt interrupt support commit 88d97ea82cbe352851a8654ee952d3a694c8c2c6 upstream. Add halt interrupt support. 
Given that the misc interrupt handler already check halt state, the driver just need to run the halt handling code when receiving the halt interrupt. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163114224352.846654.14334468363464318828.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/irq.c | 5 +++++ drivers/dma/idxd/registers.h | 1 + 2 files changed, 6 insertions(+) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 79fcfc4883e4..17f2f8a31b63 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -63,6 +63,9 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) int i; bool err = false; + if (cause & IDXD_INTC_HALT_STATE) + goto halt; + if (cause & IDXD_INTC_ERR) { spin_lock(&idxd->dev_lock); for (i = 0; i < 4; i++) @@ -121,6 +124,7 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) if (!err) return 0; +halt: gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET); if (gensts.state == IDXD_DEVICE_STATE_HALT) { idxd->state = IDXD_DEV_HALTED; @@ -134,6 +138,7 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) queue_work(idxd->wq, &idxd->work); } else { spin_lock(&idxd->dev_lock); + idxd->state = IDXD_DEV_HALTED; idxd_wqs_quiesce(idxd); idxd_wqs_unmap_portal(idxd); idxd_device_clear_state(idxd); diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index eeb11e6eb25b..262c8220adbd 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -157,6 +157,7 @@ enum idxd_device_reset_type { #define IDXD_INTC_CMD 0x02 #define IDXD_INTC_OCCUPY 0x04 #define IDXD_INTC_PERFMON_OVFL 0x08 +#define IDXD_INTC_HALT_STATE 0x10 #define IDXD_CMD_OFFSET 0xa0 union idxd_command_reg { -- Gitee From 761ee5aed111dfdc859a53255971396a8f2979f0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 1 Sep 2021 17:18:05 -0700 Subject: [PATCH 0685/2161] dmaengine: idxd: reconfig device after device reset command commit e530a9f3db4188d1f4e3704b0948ef69c04d5ca6 upstream. Device reset clears the MSIXPERM table and the device registers. Re-program the MSIXPERM table and re-enable the error interrupts post reset. Fixes: 745e92a6d816 ("dmaengine: idxd: idxd: move remove() bits for idxd 'struct device' to device.c") Reported-by: Sanjay Kumar Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163054188513.2853562.12077053294595278181.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index ef3281fc3f8b..b1407465d5c4 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -583,6 +583,8 @@ void idxd_device_reset(struct idxd_device *idxd) spin_lock(&idxd->dev_lock); idxd_device_clear_state(idxd); idxd->state = IDXD_DEV_DISABLED; + idxd_unmask_error_interrupts(idxd); + idxd_msix_perm_setup(idxd); spin_unlock(&idxd->dev_lock); } -- Gitee From 933d607864ff81127c098865d8d4536fe86ec856 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 25 Oct 2021 07:59:49 -0700 Subject: [PATCH 0686/2161] dmaengine: idxd: cleanup completion record allocation commit 2efe58cfaad4ad94eab888d287c174de5209a0c2 upstream. According to core-api/dma-api-howto.rst, the address from dma_alloc_coherent is gauranteed to align to the smallest PAGE_SIZE order. 
That supercedes the 64B/32B alignment requirement of the completion record. Remove alignment adjustment code. Tested-by: Jacob Pan Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163517396063.3484297.7494385225280705372.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 22 +++++----------------- drivers/dma/idxd/idxd.h | 2 -- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index b1407465d5c4..fab412349f7f 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -135,8 +135,6 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) struct idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; int rc, num_descs, i; - int align; - u64 tmp; if (wq->type != IDXD_WQT_KERNEL) return 0; @@ -148,21 +146,13 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) if (rc < 0) return rc; - align = idxd->data->align; - wq->compls_size = num_descs * idxd->data->compl_size + align; - wq->compls_raw = dma_alloc_coherent(dev, wq->compls_size, - &wq->compls_addr_raw, GFP_KERNEL); - if (!wq->compls_raw) { + wq->compls_size = num_descs * idxd->data->compl_size; + wq->compls = dma_alloc_coherent(dev, wq->compls_size, &wq->compls_addr, GFP_KERNEL); + if (!wq->compls) { rc = -ENOMEM; goto fail_alloc_compls; } - /* Adjust alignment */ - wq->compls_addr = (wq->compls_addr_raw + (align - 1)) & ~(align - 1); - tmp = (u64)wq->compls_raw; - tmp = (tmp + (align - 1)) & ~(align - 1); - wq->compls = (struct dsa_completion_record *)tmp; - rc = alloc_descs(wq, num_descs); if (rc < 0) goto fail_alloc_descs; @@ -191,8 +181,7 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) fail_sbitmap_init: free_descs(wq); fail_alloc_descs: - dma_free_coherent(dev, wq->compls_size, wq->compls_raw, - wq->compls_addr_raw); + dma_free_coherent(dev, wq->compls_size, wq->compls, wq->compls_addr); fail_alloc_compls: free_hw_descs(wq); return rc; @@ -207,8 +196,7 @@ void idxd_wq_free_resources(struct idxd_wq *wq) free_hw_descs(wq); free_descs(wq); - dma_free_coherent(dev, wq->compls_size, wq->compls_raw, - wq->compls_addr_raw); + dma_free_coherent(dev, wq->compls_size, wq->compls, wq->compls_addr); sbitmap_queue_free(&wq->sbq); } diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index bfcb03329f77..0cf8d3145870 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -187,9 +187,7 @@ struct idxd_wq { struct dsa_completion_record *compls; struct iax_completion_record *iax_compls; }; - void *compls_raw; dma_addr_t compls_addr; - dma_addr_t compls_addr_raw; int compls_size; struct idxd_desc **descs; struct sbitmap_queue sbq; -- Gitee From f98f97fff564178da056c4a8dc0f3edc1b9af011 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 25 Oct 2021 08:01:04 -0700 Subject: [PATCH 0687/2161] dmaengine: idxd: fix resource leak on dmaengine driver disable commit a3e340c1574b6679f5b333221284d0959095da52 upstream. The wq resources needs to be released before the kernel type is reset by __drv_disable_wq(). With dma channels unregistered and wq quiesced, all the wq resources for dmaengine can be freed. There is no need to wait until wq is disabled. With the wq->type being reset to "unknown", the driver is skipping the freeing of the resources. 
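The ordering issue can be shown with a generic sketch (all names illustrative): if the free routine keys off a type field, that field must still be valid when the free runs, so the free has to happen before the state is reset.

    /* Generic sketch of the ordering being fixed: free_resources() only
     * acts on KERNEL-type queues, so it must run before the type is
     * cleared; otherwise it silently skips the free. */
    enum wq_type { WQ_TYPE_NONE, WQ_TYPE_KERNEL };

    struct example_wq {
            enum wq_type type;
            void *descs;
    };

    static void free_resources(struct example_wq *wq)
    {
            if (wq->type != WQ_TYPE_KERNEL)
                    return;
            /* free wq->descs, completion records, ... */
    }

    static void example_remove(struct example_wq *wq)
    {
            free_resources(wq);             /* while wq->type is still KERNEL */
            wq->type = WQ_TYPE_NONE;        /* reset state only afterwards */
    }
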
Fixes: 0cda4f6986a3 ("dmaengine: idxd: create dmaengine driver for wq 'device'") Reported-by: Jacob Pan Tested-by: Jacob Pan Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163517405099.3484556.12521975053711345244.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/dma.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index b90b085d18cf..c39e9483206a 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -329,10 +329,9 @@ static void idxd_dmaengine_drv_remove(struct idxd_dev *idxd_dev) mutex_lock(&wq->wq_lock); idxd_wq_quiesce(wq); idxd_unregister_dma_channel(wq); + idxd_wq_free_resources(wq); __drv_disable_wq(wq); percpu_ref_exit(&wq->wq_active); - idxd_wq_free_resources(wq); - wq->type = IDXD_WQT_NONE; mutex_unlock(&wq->wq_lock); } -- Gitee From cdcbb454d9ced18512d97474809bb49880788927 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 18 May 2021 13:03:15 -0700 Subject: [PATCH 0688/2161] uapi/auxvec: Define the aux vector AT_MINSIGSTKSZ commit 7cd60e43a6def40ecb75deb8decc677995970d0b upstream. Define AT_MINSIGSTKSZ in the generic uapi header. It is already used as generic ABI in glibc's generic elf.h, and this define will prevent future namespace conflicts. In particular, x86 is also using this generic definition. Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Reviewed-by: Len Brown Acked-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20210518200320.17239-2-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/auxvec.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/uapi/linux/auxvec.h b/include/uapi/linux/auxvec.h index abe5f2b6581b..c7e502bf5a6f 100644 --- a/include/uapi/linux/auxvec.h +++ b/include/uapi/linux/auxvec.h @@ -33,5 +33,8 @@ #define AT_EXECFN 31 /* filename of program */ +#ifndef AT_MINSIGSTKSZ +#define AT_MINSIGSTKSZ 51 /* minimal stack size for signal delivery */ +#endif #endif /* _UAPI_LINUX_AUXVEC_H */ -- Gitee From 46f46cbab51eb431b48e2abaf6dd4c74153ae677 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 18 May 2021 13:03:17 -0700 Subject: [PATCH 0689/2161] x86/elf: Support a new ELF aux vector AT_MINSIGSTKSZ commit 1c33bb0507508af24fd754dd7123bd8e997fab2f upstream. Historically, signal.h defines MINSIGSTKSZ (2KB) and SIGSTKSZ (8KB), for use by all architectures with sigaltstack(2). Over time, the hardware state size grew, but these constants did not evolve. Today, literal use of these constants on several architectures may result in signal stack overflow, and thus user data corruption. A few years ago, the ARM team addressed this issue by establishing getauxval(AT_MINSIGSTKSZ). This enables the kernel to supply a value at runtime that is an appropriate replacement on current and future hardware. Add getauxval(AT_MINSIGSTKSZ) support to x86, analogous to the support added for ARM in 94b07c1f8c39 ("arm64: signal: Report signal frame size to userspace via auxv"). Also, include a documentation to describe x86-specific auxiliary vectors. Signed-off-by: Chang S. 
Bae Signed-off-by: Borislav Petkov Reviewed-by: Len Brown Acked-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20210518200320.17239-4-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/x86/elf_auxvec.rst | 53 ++++++++++++++++++++++++++++++ Documentation/x86/index.rst | 1 + arch/x86/include/asm/elf.h | 4 +++ arch/x86/include/uapi/asm/auxvec.h | 4 +-- arch/x86/kernel/signal.c | 5 +++ 5 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 Documentation/x86/elf_auxvec.rst diff --git a/Documentation/x86/elf_auxvec.rst b/Documentation/x86/elf_auxvec.rst new file mode 100644 index 000000000000..18e4744717f9 --- /dev/null +++ b/Documentation/x86/elf_auxvec.rst @@ -0,0 +1,53 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================== +x86-specific ELF Auxiliary Vectors +================================== + +This document describes the semantics of the x86 auxiliary vectors. + +Introduction +============ + +ELF Auxiliary vectors enable the kernel to efficiently provide +configuration-specific parameters to userspace. In this example, a program +allocates an alternate stack based on the kernel-provided size:: + + #include + #include + #include + #include + #include + #include + + #ifndef AT_MINSIGSTKSZ + #define AT_MINSIGSTKSZ 51 + #endif + + .... + stack_t ss; + + ss.ss_sp = malloc(ss.ss_size); + assert(ss.ss_sp); + + ss.ss_size = getauxval(AT_MINSIGSTKSZ) + SIGSTKSZ; + ss.ss_flags = 0; + + if (sigaltstack(&ss, NULL)) + err(1, "sigaltstack"); + + +The exposed auxiliary vectors +============================= + +AT_SYSINFO is used for locating the vsyscall entry point. It is not +exported on 64-bit mode. + +AT_SYSINFO_EHDR is the start address of the page containing the vDSO. + +AT_MINSIGSTKSZ denotes the minimum stack size required by the kernel to +deliver a signal to user-space. AT_MINSIGSTKSZ comprehends the space +consumed by the kernel to accommodate the user context for the current +hardware configuration. It does not comprehend subsequent user-space stack +consumption, which must be added by the user. (e.g. Above, user-space adds +SIGSTKSZ to AT_MINSIGSTKSZ.) diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst index 849ba0747026..0b626ec2cb15 100644 --- a/Documentation/x86/index.rst +++ b/Documentation/x86/index.rst @@ -33,3 +33,4 @@ x86-specific Documentation x86_64/index sgx sva + elf_auxvec diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index ae7f38bdce05..ca00132fe122 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -313,6 +313,7 @@ do { \ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ } \ + NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \ } while (0) /* @@ -329,6 +330,7 @@ extern unsigned long task_size_32bit(void); extern unsigned long task_size_64bit(int full_addr_space); extern unsigned long get_mmap_base(int is_legacy); extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); +extern unsigned long get_sigframe_size(void); #ifdef CONFIG_X86_32 @@ -350,6 +352,7 @@ do { \ if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long __force)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \ } while (0) /* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. 
*/ @@ -358,6 +361,7 @@ do { \ if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long __force)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \ } while (0) #define AT_SYSINFO 32 diff --git a/arch/x86/include/uapi/asm/auxvec.h b/arch/x86/include/uapi/asm/auxvec.h index 580e3c567046..6beb55bbefa4 100644 --- a/arch/x86/include/uapi/asm/auxvec.h +++ b/arch/x86/include/uapi/asm/auxvec.h @@ -12,9 +12,9 @@ /* entries in ARCH_DLINFO: */ #if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64) -# define AT_VECTOR_SIZE_ARCH 2 +# define AT_VECTOR_SIZE_ARCH 3 #else /* else it's non-compat x86-64 */ -# define AT_VECTOR_SIZE_ARCH 1 +# define AT_VECTOR_SIZE_ARCH 2 #endif #endif /* _ASM_X86_AUXVEC_H */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index c747468c183a..22ef679f17c6 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -722,6 +722,11 @@ void __init init_sigframe_size(void) pr_info("max sigframe size: %lu\n", max_frame_size); } +unsigned long get_sigframe_size(void) +{ + return max_frame_size; +} + static inline int is_ia32_compat_frame(struct ksignal *ksig) { return IS_ENABLED(CONFIG_IA32_EMULATION) && -- Gitee From 396f67c80e9cba98132cc7e64c1c3770d753920a Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 18 May 2021 13:03:18 -0700 Subject: [PATCH 0690/2161] selftest/sigaltstack: Use the AT_MINSIGSTKSZ aux vector if available commit bdf6c8b84a4fa726c382ef6d3518f3ae123a7ebd upstream. The SIGSTKSZ constant may not represent enough stack size in some architectures as the hardware state size grows. Use getauxval(AT_MINSIGSTKSZ) to increase the stack size. Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Reviewed-by: Len Brown Acked-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20210518200320.17239-5-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/sigaltstack/sas.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/sigaltstack/sas.c b/tools/testing/selftests/sigaltstack/sas.c index ad0f8df2ca0a..cf2f99218d94 100644 --- a/tools/testing/selftests/sigaltstack/sas.c +++ b/tools/testing/selftests/sigaltstack/sas.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "../kselftest.h" @@ -24,6 +25,11 @@ #define SS_AUTODISARM (1U << 31) #endif +#ifndef AT_MINSIGSTKSZ +#define AT_MINSIGSTKSZ 51 +#endif + +static unsigned int stack_size; static void *sstack, *ustack; static ucontext_t uc, sc; static const char *msg = "[OK]\tStack preserved"; @@ -47,7 +53,7 @@ void my_usr1(int sig, siginfo_t *si, void *u) #endif if (sp < (unsigned long)sstack || - sp >= (unsigned long)sstack + SIGSTKSZ) { + sp >= (unsigned long)sstack + stack_size) { ksft_exit_fail_msg("SP is not on sigaltstack\n"); } /* put some data on stack. other sighandler will try to overwrite it */ @@ -108,6 +114,10 @@ int main(void) stack_t stk; int err; + /* Make sure more than the required minimum. 
*/ + stack_size = getauxval(AT_MINSIGSTKSZ) + SIGSTKSZ; + ksft_print_msg("[NOTE]\tthe stack size is %lu\n", stack_size); + ksft_print_header(); ksft_set_plan(3); @@ -117,7 +127,7 @@ int main(void) sigaction(SIGUSR1, &act, NULL); act.sa_sigaction = my_usr2; sigaction(SIGUSR2, &act, NULL); - sstack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE, + sstack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); if (sstack == MAP_FAILED) { ksft_exit_fail_msg("mmap() - %s\n", strerror(errno)); @@ -139,7 +149,7 @@ int main(void) } stk.ss_sp = sstack; - stk.ss_size = SIGSTKSZ; + stk.ss_size = stack_size; stk.ss_flags = SS_ONSTACK | SS_AUTODISARM; err = sigaltstack(&stk, NULL); if (err) { @@ -161,7 +171,7 @@ int main(void) } } - ustack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE, + ustack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); if (ustack == MAP_FAILED) { ksft_exit_fail_msg("mmap() - %s\n", strerror(errno)); @@ -170,7 +180,7 @@ int main(void) getcontext(&uc); uc.uc_link = NULL; uc.uc_stack.ss_sp = ustack; - uc.uc_stack.ss_size = SIGSTKSZ; + uc.uc_stack.ss_size = stack_size; makecontext(&uc, switch_fn, 0); raise(SIGUSR1); -- Gitee From 6361f54c7267e6586886955df78cf1f872df7e58 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 18 May 2021 13:03:20 -0700 Subject: [PATCH 0691/2161] selftest/x86/signal: Include test cases for validating sigaltstack commit 8919f07276991c7bf0d0802f0356331c5c62f7a2 upstream. The test measures the kernel's signal delivery with different (enough vs. insufficient) stack sizes. Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Reviewed-by: Len Brown Acked-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20210518200320.17239-7-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/x86/Makefile | 2 +- tools/testing/selftests/x86/sigaltstack.c | 128 ++++++++++++++++++++++ 2 files changed, 129 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/x86/sigaltstack.c diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index ac44a33b4c39..f21400d1582b 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -13,7 +13,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie) TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \ check_initial_reg_state sigreturn iopl ioperm \ protection_keys test_vdso test_vsyscall mov_ss_trap \ - syscall_arg_fault fsgsbase_restore + syscall_arg_fault sigaltstack fsgsbase_restore TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer diff --git a/tools/testing/selftests/x86/sigaltstack.c b/tools/testing/selftests/x86/sigaltstack.c new file mode 100644 index 000000000000..f689af75e979 --- /dev/null +++ b/tools/testing/selftests/x86/sigaltstack.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* sigaltstack()-enforced minimum stack */ +#define ENFORCED_MINSIGSTKSZ 2048 + +#ifndef AT_MINSIGSTKSZ +# define AT_MINSIGSTKSZ 51 +#endif + +static int nerrs; + +static bool sigalrm_expected; + +static unsigned long at_minstack_size; + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + 
struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static void clearhandler(int sig) +{ + struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static int setup_altstack(void *start, unsigned long size) +{ + stack_t ss; + + memset(&ss, 0, sizeof(ss)); + ss.ss_size = size; + ss.ss_sp = start; + + return sigaltstack(&ss, NULL); +} + +static jmp_buf jmpbuf; + +static void sigsegv(int sig, siginfo_t *info, void *ctx_void) +{ + if (sigalrm_expected) { + printf("[FAIL]\tWrong signal delivered: SIGSEGV (expected SIGALRM)."); + nerrs++; + } else { + printf("[OK]\tSIGSEGV signal delivered.\n"); + } + + siglongjmp(jmpbuf, 1); +} + +static void sigalrm(int sig, siginfo_t *info, void *ctx_void) +{ + if (!sigalrm_expected) { + printf("[FAIL]\tWrong signal delivered: SIGALRM (expected SIGSEGV)."); + nerrs++; + } else { + printf("[OK]\tSIGALRM signal delivered.\n"); + } +} + +static void test_sigaltstack(void *altstack, unsigned long size) +{ + if (setup_altstack(altstack, size)) + err(1, "sigaltstack()"); + + sigalrm_expected = (size > at_minstack_size) ? true : false; + + sethandler(SIGSEGV, sigsegv, 0); + sethandler(SIGALRM, sigalrm, SA_ONSTACK); + + if (!sigsetjmp(jmpbuf, 1)) { + printf("[RUN]\tTest an alternate signal stack of %ssufficient size.\n", + sigalrm_expected ? "" : "in"); + printf("\tRaise SIGALRM. %s is expected to be delivered.\n", + sigalrm_expected ? "It" : "SIGSEGV"); + raise(SIGALRM); + } + + clearhandler(SIGALRM); + clearhandler(SIGSEGV); +} + +int main(void) +{ + void *altstack; + + at_minstack_size = getauxval(AT_MINSIGSTKSZ); + + altstack = mmap(NULL, at_minstack_size + SIGSTKSZ, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); + if (altstack == MAP_FAILED) + err(1, "mmap()"); + + if ((ENFORCED_MINSIGSTKSZ + 1) < at_minstack_size) + test_sigaltstack(altstack, ENFORCED_MINSIGSTKSZ + 1); + + test_sigaltstack(altstack, at_minstack_size + SIGSTKSZ); + + return nerrs == 0 ? 0 : 1; +} -- Gitee From e4d3dcafee5d3f177ab6de59c7a4e8987809a891 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 14 Dec 2021 11:37:22 +0800 Subject: [PATCH 0692/2161] x86/mce: Do not log spurious corrected mce errors commit 2976908e4198aa02fc3f76802358f69396267189 upstream. A user has reported that they are seeing spurious corrected errors on their hardware. Intel Errata HSD131, HSM142, HSW131, and BDM48 report that "spurious corrected errors may be logged in the IA32_MC0_STATUS register with the valid field (bit 63) set, the uncorrected error field (bit 61) not set, a Model Specific Error Code (bits [31:16]) of 0x000F, and an MCA Error Code (bits [15:0]) of 0x0005." The Errata PDFs are linked in the bugzilla below. Block these spurious errors from the console and logs. [ bp: Move the intel_filter_mce() header declarations into the already existing CONFIG_X86_MCE_INTEL ifdeffery. 
] Co-developed-by: Alexander Krupp Signed-off-by: Alexander Krupp Signed-off-by: Prarit Bhargava Signed-off-by: Borislav Petkov Link: https://bugzilla.kernel.org/show_bug.cgi?id=206587 Link: https://lkml.kernel.org/r/20200219131611.36816-1-prarit@redhat.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/mce/core.c | 2 ++ arch/x86/kernel/cpu/mce/intel.c | 17 +++++++++++++++++ arch/x86/kernel/cpu/mce/internal.h | 2 ++ 3 files changed, 21 insertions(+) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 2e6371a2f123..ad7e0e1441cc 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1893,6 +1893,8 @@ bool filter_mce(struct mce *m) { if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return amd_filter_mce(m); + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return intel_filter_mce(m); return false; } diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index f2350967a898..04081400df5c 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -517,3 +517,20 @@ void mce_intel_feature_clear(struct cpuinfo_x86 *c) { intel_clear_lmce(); } + +bool intel_filter_mce(struct mce *m) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + /* MCE errata HSD131, HSM142, HSW131, BDM48, and HSM142 */ + if ((c->x86 == 6) && + ((c->x86_model == INTEL_FAM6_HASWELL) || + (c->x86_model == INTEL_FAM6_HASWELL_L) || + (c->x86_model == INTEL_FAM6_BROADWELL) || + (c->x86_model == INTEL_FAM6_HASWELL_G)) && + (m->bank == 0) && + ((m->status & 0xa0000000ffffffff) == 0x80000000000f0005)) + return true; + + return false; +} diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 4f90f720b281..e017ddbcfb12 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -46,11 +46,13 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval); bool mce_intel_cmci_poll(void); void mce_intel_hcpu_update(unsigned long cpu); void cmci_disable_bank(int bank); +bool intel_filter_mce(struct mce *m); #else # define cmci_intel_adjust_timer mce_adjust_timer_default static inline bool mce_intel_cmci_poll(void) { return false; } static inline void mce_intel_hcpu_update(unsigned long cpu) { } static inline void cmci_disable_bank(int bank) { } +static inline bool intel_filter_mce(struct mce *m) { return false; }; #endif void mce_timer_kick(unsigned long interval); -- Gitee From 3ac734e1769d47822c80e9dccbfc355f2a10f725 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 14 Sep 2020 19:21:28 +0200 Subject: [PATCH 0693/2161] x86/mce: Annotate mce_rd/wrmsrl() with noinstr commit e100777016fdf6ec3a9d7c1773b15a2b5eca6c55 upstream. They do get called from the #MC handler which is already marked "noinstr". 
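For readers unfamiliar with the annotation: noinstr places a function in the .noinstr.text section, and any call it then makes into ordinary, instrumentable code has to be bracketed with instrumentation_begin()/instrumentation_end(), otherwise objtool flags the call as leaving the noinstr section. A condensed sketch of the mce_rdmsrl() change in the hunk below — only the error-injection branch is shown, the real RDMSR path is unchanged:

	static noinstr u64 mce_rdmsrl(u32 msr)
	{
		if (__this_cpu_read(injectm.finished)) {
			int offset;
			u64 ret;

			/* the injection lookup is ordinary, instrumentable code */
			instrumentation_begin();

			offset = msr_to_offset(msr);
			if (offset < 0)
				ret = 0;
			else
				ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);

			instrumentation_end();

			return ret;
		}

		/* the real RDMSR needs no bracketing and stays as in the hunk below */
	}
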
Commit e2def7d49d08 ("x86/mce: Make mce_rdmsrl() panic on an inaccessible MSR") already got rid of the instrumentation in the MSR accessors, fix the annotation now too, in order to get rid of: vmlinux.o: warning: objtool: do_machine_check()+0x4a: call to mce_rdmsrl() leaves .noinstr.text section Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200915194020.28807-1-bp@alien8.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/mce/core.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index ad7e0e1441cc..24c849550465 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -398,16 +398,25 @@ __visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, } /* MSR access wrappers used for error injection */ -static u64 mce_rdmsrl(u32 msr) +static noinstr u64 mce_rdmsrl(u32 msr) { DECLARE_ARGS(val, low, high); if (__this_cpu_read(injectm.finished)) { - int offset = msr_to_offset(msr); + int offset; + u64 ret; + instrumentation_begin(); + + offset = msr_to_offset(msr); if (offset < 0) - return 0; - return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); + ret = 0; + else + ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); + + instrumentation_end(); + + return ret; } /* @@ -443,15 +452,21 @@ __visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, return true; } -static void mce_wrmsrl(u32 msr, u64 v) +static noinstr void mce_wrmsrl(u32 msr, u64 v) { u32 low, high; if (__this_cpu_read(injectm.finished)) { - int offset = msr_to_offset(msr); + int offset; + instrumentation_begin(); + + offset = msr_to_offset(msr); if (offset >= 0) *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v; + + instrumentation_end(); + return; } -- Gitee From 55c3e52476cbac6cb7a75990bb66d86e69e29cc9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 May 2021 13:41:41 +0200 Subject: [PATCH 0694/2161] x86/asm: Make valid on cross-builds as well commit 41f45fb045bcc20e71eb705b361356e715682162 upstream. Stephen Rothwell reported that the objtool cross-build breaks on non-x86 hosts: > tools/arch/x86/include/asm/asm.h:185:24: error: invalid register name for 'current_stack_pointer' > 185 | register unsigned long current_stack_pointer asm(_ASM_SP); > | ^~~~~~~~~~~~~~~~~~~~~ The PowerPC host obviously doesn't know much about x86 register names. Protect the kernel-specific bits of , so that it can be included by tooling and cross-built. Reported-by: Stephen Rothwell Reviewed-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/asm.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 12933d406207..948bd8171348 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -117,6 +117,8 @@ # define CC_OUT(c) [_cc_ ## c] "=qm" #endif +#ifdef __KERNEL__ + /* Exception table entry */ #ifdef __ASSEMBLY__ # define _ASM_EXTABLE_HANDLE(from, to, handler) \ @@ -193,4 +195,6 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #endif +#endif /* __KERNEL__ */ + #endif /* _ASM_X86_ASM_H */ -- Gitee From d3bd42c191dd73864858ffc3584d101f5605f83e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:12 +0200 Subject: [PATCH 0695/2161] x86/extable: Tidy up redundant handler functions commit 326b567f82df0c4c8f50092b9af9a3014616fb3c upstream. No need to have the same code all over the place. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132524.963232825@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/mm/extable.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index e91d45fc78fa..bf799337efb2 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -38,9 +38,8 @@ __visible bool ex_handler_fault(const struct exception_table_entry *fixup, unsigned long error_code, unsigned long fault_addr) { - regs->ip = ex_fixup_addr(fixup); regs->ax = trapnr; - return true; + return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); } EXPORT_SYMBOL_GPL(ex_handler_fault); @@ -124,8 +123,7 @@ __visible bool ex_handler_uaccess(const struct exception_table_entry *fixup, unsigned long fault_addr) { WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); - regs->ip = ex_fixup_addr(fixup); - return true; + return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); } EXPORT_SYMBOL(ex_handler_uaccess); @@ -147,9 +145,7 @@ __visible bool ex_handler_copy(const struct exception_table_entry *fixup, unsigned long fault_addr) { WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); - regs->ip = ex_fixup_addr(fixup); - regs->ax = trapnr; - return true; + return ex_handler_fault(fixup, regs, trapnr, error_code, fault_addr); } EXPORT_SYMBOL(ex_handler_copy); @@ -163,10 +159,9 @@ __visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup show_stack_regs(regs); /* Pretend that the read succeeded and returned 0. */ - regs->ip = ex_fixup_addr(fixup); regs->ax = 0; regs->dx = 0; - return true; + return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); } EXPORT_SYMBOL(ex_handler_rdmsr_unsafe); @@ -181,8 +176,7 @@ __visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup show_stack_regs(regs); /* Pretend that the write succeeded. 
*/ - regs->ip = ex_fixup_addr(fixup); - return true; + return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); } EXPORT_SYMBOL(ex_handler_wrmsr_unsafe); -- Gitee From 9cea77e27ca801d704f9ae9bfbf5daa308172354 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:13 +0200 Subject: [PATCH 0696/2161] x86/extable: Get rid of redundant macros commit 32fd8b59f91fcd3bf9459aa72d90345735cc2588 upstream. No point in defining the identical macros twice depending on C or assembly mode. They are still identical. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.023659534@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/asm.h | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 948bd8171348..e1166550f783 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -129,18 +129,6 @@ .long (handler) - . ; \ .popsection -# define _ASM_EXTABLE(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) - -# define _ASM_EXTABLE_UA(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) - -# define _ASM_EXTABLE_CPY(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) - -# define _ASM_EXTABLE_FAULT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) - # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) @@ -163,18 +151,6 @@ " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \ " .popsection\n" -# define _ASM_EXTABLE(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) - -# define _ASM_EXTABLE_UA(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) - -# define _ASM_EXTABLE_CPY(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) - -# define _ASM_EXTABLE_FAULT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) - # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) @@ -195,6 +171,18 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #endif +#define _ASM_EXTABLE(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) + +#define _ASM_EXTABLE_UA(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) + +#define _ASM_EXTABLE_CPY(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) + +#define _ASM_EXTABLE_FAULT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) + #endif /* __KERNEL__ */ #endif /* _ASM_X86_ASM_H */ -- Gitee From 851bd23d5ddb3f215bd5a1152c5b57bf3d9dfe9e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:15 +0200 Subject: [PATCH 0697/2161] x86/mce: Deduplicate exception handling commit e42404afc4ca856c48f1e05752541faa3587c472 upstream. Prepare code for further simplification. No functional change. 
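Concretely, the two near-identical report-and-panic paths for a faulting RDMSR/WRMSR inside the #MC code collapse into one helper parameterized on the access direction; the existing fixup handlers become thin wrappers around it. Shape only — the full hunk follows:

	static void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr);

	/* both existing fixup handlers reduce to one call each */
	ex_handler_msr_mce(regs, false);	/* RDMSR fault */
	ex_handler_msr_mce(regs, true);		/* WRMSR fault */
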
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.096452100@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/mce/core.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 24c849550465..dd4490b91c1b 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -379,13 +379,16 @@ static int msr_to_offset(u32 msr) return -1; } -__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) { - pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", - (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + if (wrmsr) { + pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, + regs->ip, (void *)regs->ip); + } else { + pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + } show_stack_regs(regs); @@ -393,7 +396,14 @@ __visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, while (true) cpu_relax(); +} +__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) +{ + ex_handler_msr_mce(regs, false); return true; } @@ -438,17 +448,7 @@ __visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, unsigned long error_code, unsigned long fault_addr) { - pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", - (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, - regs->ip, (void *)regs->ip); - - show_stack_regs(regs); - - panic("MCA architectural violation!\n"); - - while (true) - cpu_relax(); - + ex_handler_msr_mce(regs, true); return true; } -- Gitee From a71a21caea71dbe16d114bf27cbdbc6767e54ec6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:16 +0200 Subject: [PATCH 0698/2161] x86/mce: Get rid of stray semicolons commit 083b32d6f4fa26abaf585721abeee73c92ea5376 upstream. and the random number of tabs. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.154428878@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/mce/internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index e017ddbcfb12..12eaf3456064 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -52,7 +52,7 @@ bool intel_filter_mce(struct mce *m); static inline bool mce_intel_cmci_poll(void) { return false; } static inline void mce_intel_hcpu_update(unsigned long cpu) { } static inline void cmci_disable_bank(int bank) { } -static inline bool intel_filter_mce(struct mce *m) { return false; }; +static inline bool intel_filter_mce(struct mce *m) { return false; } #endif void mce_timer_kick(unsigned long interval); @@ -173,7 +173,7 @@ extern bool filter_mce(struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); #else -static inline bool amd_filter_mce(struct mce *m) { return false; }; +static inline bool amd_filter_mce(struct mce *m) { return false; } #endif __visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, -- Gitee From 00828f4ad905020ae2149057e58126fee0c35f15 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 15 Dec 2021 16:08:35 +0800 Subject: [PATCH 0699/2161] x86/extable: Rework the exception table mechanics commit 46d28947d9876fc0f8f93d3c69813ef6e9852595 upstream. The exception table entries contain the instruction address, the fixup address and the handler address. All addresses are relative. Storing the handler address has a few downsides: 1) Most handlers need to be exported 2) Handlers can be defined everywhere and there is no overview about the handler types 3) MCE needs to check the handler type to decide whether an in kernel #MC can be recovered. The functionality of the handler itself is not in any way special, but for these checks there need to be separate functions which in the worst case have to be exported. Some of these 'recoverable' exception fixups are pretty obscure and just reuse some other handler to spare code. That obfuscates e.g. the #MC safe copy functions. Cleaning that up would require more handlers and exports Rework the exception fixup mechanics by storing a fixup type number instead of the handler address and invoke the proper handler for each fixup type. Also teach the extable sort to leave the type field alone. This makes most handlers static except for special cases like the MCE MSR fixup and the BPF fixup. This allows to add more types for cleaning up the obscure places without adding more handler code and exports. There is a marginal code size reduction for a production config and it removes _eight_ exported symbols. 
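The resulting shape is easier to see condensed than in the full diff: the third field of an extable entry becomes a plain type number (which the extable sort must leave alone, unlike the old relative handler offset), and fixup_exception() dispatches on it. Sketch condensed from the changes below:

	struct exception_table_entry {
		int insn, fixup, type;	/* insn/fixup are relative, type is a plain constant */
	};

	int fixup_exception(struct pt_regs *regs, int trapnr,
			    unsigned long error_code, unsigned long fault_addr)
	{
		const struct exception_table_entry *e = search_exception_tables(regs->ip);

		if (!e)
			return 0;

		switch (e->type) {
		case EX_TYPE_DEFAULT:
			return ex_handler_default(e, regs);
		case EX_TYPE_FAULT:
			return ex_handler_fault(e, regs, trapnr);
		case EX_TYPE_BPF:
			return ex_handler_bpf(e, regs);
		/* ... one case per EX_TYPE_*, see the full switch below ... */
		}
		BUG();
	}
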
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Alexei Starovoitov Link: https://lkml.kernel.org/r/20210908132525.211958725@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/asm.h | 22 ++-- arch/x86/include/asm/extable.h | 44 +++++--- arch/x86/include/asm/extable_fixup_types.h | 19 ++++ arch/x86/include/asm/fpu/internal.h | 4 +- arch/x86/include/asm/msr.h | 4 +- arch/x86/include/asm/segment.h | 2 +- arch/x86/kernel/cpu/mce/core.c | 24 +--- arch/x86/kernel/cpu/mce/internal.h | 10 -- arch/x86/kernel/cpu/mce/severity.c | 21 ++-- arch/x86/mm/extable.c | 123 +++++++++------------ arch/x86/net/bpf_jit_comp.c | 11 +- scripts/sortextable.c | 4 +- 12 files changed, 133 insertions(+), 155 deletions(-) create mode 100644 arch/x86/include/asm/extable_fixup_types.h diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index e1166550f783..254f8c8c5429 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -119,14 +119,17 @@ #ifdef __KERNEL__ +# include + /* Exception table entry */ #ifdef __ASSEMBLY__ -# define _ASM_EXTABLE_HANDLE(from, to, handler) \ + +# define _ASM_EXTABLE_TYPE(from, to, type) \ .pushsection "__ex_table","a" ; \ .balign 4 ; \ .long (from) - . ; \ .long (to) - . ; \ - .long (handler) - . ; \ + .long type ; \ .popsection # define _ASM_EXTABLE_EX(from, to) \ @@ -142,13 +145,13 @@ .popsection #else -# define _EXPAND_EXTABLE_HANDLE(x) #x -# define _ASM_EXTABLE_HANDLE(from, to, handler) \ + +# define _ASM_EXTABLE_TYPE(from, to, type) \ " .pushsection \"__ex_table\",\"a\"\n" \ " .balign 4\n" \ " .long (" #from ") - .\n" \ " .long (" #to ") - .\n" \ - " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \ + " .long " __stringify(type) " \n" \ " .popsection\n" # define _ASM_EXTABLE_EX(from, to) \ @@ -172,17 +175,16 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #endif #define _ASM_EXTABLE(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_DEFAULT) #define _ASM_EXTABLE_UA(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_UACCESS) #define _ASM_EXTABLE_CPY(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_COPY) #define _ASM_EXTABLE_FAULT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_FAULT) #endif /* __KERNEL__ */ - #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h index 1f0cbc52937c..93f400eb728f 100644 --- a/arch/x86/include/asm/extable.h +++ b/arch/x86/include/asm/extable.h @@ -1,12 +1,18 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_EXTABLE_H #define _ASM_X86_EXTABLE_H + +#include + /* - * The exception table consists of triples of addresses relative to the - * exception table entry itself. The first address is of an instruction - * that is allowed to fault, the second is the target at which the program - * should continue. The third is a handler function to deal with the fault - * caused by the instruction in the first field. + * The exception table consists of two addresses relative to the + * exception table entry itself and a type selector field. + * + * The first address is of an instruction that is allowed to fault, the + * second is the target at which the program should continue. 
+ * + * The type entry is used by fixup_exception() to select the handler to + * deal with the fault caused by the instruction in the first field. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. This means when everything is well, @@ -15,7 +21,7 @@ */ struct exception_table_entry { - int insn, fixup, handler; + int insn, fixup, type; }; struct pt_regs; @@ -25,21 +31,27 @@ struct pt_regs; do { \ (a)->fixup = (b)->fixup + (delta); \ (b)->fixup = (tmp).fixup - (delta); \ - (a)->handler = (b)->handler + (delta); \ - (b)->handler = (tmp).handler - (delta); \ + (a)->type = (b)->type; \ + (b)->type = (tmp).type; \ } while (0) -enum handler_type { - EX_HANDLER_NONE, - EX_HANDLER_FAULT, - EX_HANDLER_UACCESS, - EX_HANDLER_OTHER -}; - extern int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr); extern int fixup_bug(struct pt_regs *regs, int trapnr); -extern enum handler_type ex_get_fault_handler_type(unsigned long ip); +extern int ex_get_fixup_type(unsigned long ip); extern void early_fixup_exception(struct pt_regs *regs, int trapnr); +#ifdef CONFIG_X86_MCE +extern void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr); +#else +static inline void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) { } +#endif + +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_X86_64) +bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs); +#else +static inline bool ex_handler_bpf(const struct exception_table_entry *x, + struct pt_regs *regs) { return false; } +#endif + #endif diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h new file mode 100644 index 000000000000..0adc117618e6 --- /dev/null +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_EXTABLE_FIXUP_TYPES_H +#define _ASM_X86_EXTABLE_FIXUP_TYPES_H + +#define EX_TYPE_NONE 0 +#define EX_TYPE_DEFAULT 1 +#define EX_TYPE_FAULT 2 +#define EX_TYPE_UACCESS 3 +#define EX_TYPE_COPY 4 +#define EX_TYPE_CLEAR_FS 5 +#define EX_TYPE_FPU_RESTORE 6 +#define EX_TYPE_WRMSR 7 +#define EX_TYPE_RDMSR 8 +#define EX_TYPE_BPF 9 + +#define EX_TYPE_WRMSR_IN_MCE 10 +#define EX_TYPE_RDMSR_IN_MCE 11 + +#endif diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index b73c5cec7ff5..c3baf5331432 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -125,7 +125,7 @@ extern void save_fpregs_to_fpstate(struct fpu *fpu); #define kernel_insn(insn, output, input...) 
\ asm volatile("1:" #insn "\n\t" \ "2:\n" \ - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \ : output : input) static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) @@ -252,7 +252,7 @@ static inline void fxsave(struct fxregs_state *fx) XRSTORS, X86_FEATURE_XSAVES) \ "\n" \ "3:\n" \ - _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\ + _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \ : \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index b40d0295d812..dd4207d7fddb 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -94,7 +94,7 @@ static __always_inline unsigned long long __rdmsr(unsigned int msr) asm volatile("1: rdmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR) : EAX_EDX_RET(val, low, high) : "c" (msr)); return EAX_EDX_VAL(val, low, high); @@ -104,7 +104,7 @@ static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high) { asm volatile("1: wrmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) : : "c" (msr), "a"(low), "d" (high) : "memory"); } diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 6669164abadc..c8610da2d23a 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -350,7 +350,7 @@ static inline void __loadsegment_fs(unsigned short value) "1: movw %0, %%fs \n" "2: \n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_clear_fs) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_CLEAR_FS) : : "rm" (value) : "memory"); } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index dd4490b91c1b..3d724b5029fd 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -379,7 +379,7 @@ static int msr_to_offset(u32 msr) return -1; } -static void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) +void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) { if (wrmsr) { pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", @@ -398,15 +398,6 @@ static void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) cpu_relax(); } -__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) -{ - ex_handler_msr_mce(regs, false); - return true; -} - /* MSR access wrappers used for error injection */ static noinstr u64 mce_rdmsrl(u32 msr) { @@ -436,22 +427,13 @@ static noinstr u64 mce_rdmsrl(u32 msr) */ asm volatile("1: rdmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_fault) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE) : EAX_EDX_RET(val, low, high) : "c" (msr)); return EAX_EDX_VAL(val, low, high); } -__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) -{ - ex_handler_msr_mce(regs, true); - return true; -} - static noinstr void mce_wrmsrl(u32 msr, u64 v) { u32 low, high; @@ -476,7 +458,7 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) /* See comment in mce_rdmsrl() */ asm volatile("1: wrmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_fault) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE) : : "c" (msr), "a"(low), "d" (high) : "memory"); } diff --git a/arch/x86/kernel/cpu/mce/internal.h 
b/arch/x86/kernel/cpu/mce/internal.h index 12eaf3456064..9e2237f68c92 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -176,14 +176,4 @@ extern bool amd_filter_mce(struct mce *m); static inline bool amd_filter_mce(struct mce *m) { return false; } #endif -__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr); - -__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr); - #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 2c5ea37a2e9b..605d90a865bb 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -273,25 +273,24 @@ static bool is_copy_from_user(struct pt_regs *regs) */ static int error_context(struct mce *m, struct pt_regs *regs) { - enum handler_type t; - if ((m->cs & 3) == 3) return IN_USER; if (!mc_recoverable(m->mcgstatus)) return IN_KERNEL; - t = ex_get_fault_handler_type(m->ip); - if (t == EX_HANDLER_FAULT) { - m->kflags |= MCE_IN_KERNEL_RECOV; - return IN_KERNEL_RECOV; - } - if (t == EX_HANDLER_UACCESS && regs && is_copy_from_user(regs)) { - m->kflags |= MCE_IN_KERNEL_RECOV; + switch (ex_get_fixup_type(m->ip)) { + case EX_TYPE_UACCESS: + case EX_TYPE_COPY: + if (!regs || !is_copy_from_user(regs)) + return IN_KERNEL; m->kflags |= MCE_IN_KERNEL_COPYIN; + fallthrough; + case EX_TYPE_FAULT: + m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; + default: + return IN_KERNEL; } - - return IN_KERNEL; } static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index bf799337efb2..f48a1ea4e1b5 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -8,40 +8,25 @@ #include #include -typedef bool (*ex_handler_t)(const struct exception_table_entry *, - struct pt_regs *, int, unsigned long, - unsigned long); - static inline unsigned long ex_fixup_addr(const struct exception_table_entry *x) { return (unsigned long)&x->fixup + x->fixup; } -static inline ex_handler_t -ex_fixup_handler(const struct exception_table_entry *x) -{ - return (ex_handler_t)((unsigned long)&x->handler + x->handler); -} -__visible bool ex_handler_default(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_default(const struct exception_table_entry *fixup, + struct pt_regs *regs) { regs->ip = ex_fixup_addr(fixup); return true; } -EXPORT_SYMBOL(ex_handler_default); -__visible bool ex_handler_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { regs->ax = trapnr; - return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL_GPL(ex_handler_fault); /* * Handler for UD0 exception following a failed test against the @@ -102,10 +87,8 @@ EXPORT_SYMBOL(ex_handler_refcount); * of vulnerability by restoring from the initial state (essentially, zeroing * out all the FPU registers) if we can't restore from the task's FPU state. 
*/ -__visible bool ex_handler_fprestore(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_fprestore(const struct exception_table_entry *fixup, + struct pt_regs *regs) { regs->ip = ex_fixup_addr(fixup); @@ -115,17 +98,13 @@ __visible bool ex_handler_fprestore(const struct exception_table_entry *fixup, __restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); return true; } -EXPORT_SYMBOL_GPL(ex_handler_fprestore); -__visible bool ex_handler_uaccess(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_uaccess(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); - return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_uaccess); __visible bool ex_handler_ext(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr, @@ -139,20 +118,15 @@ __visible bool ex_handler_ext(const struct exception_table_entry *fixup, } EXPORT_SYMBOL(ex_handler_ext); -__visible bool ex_handler_copy(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_copy(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); - return ex_handler_fault(fixup, regs, trapnr, error_code, fault_addr); + return ex_handler_fault(fixup, regs, trapnr); } -EXPORT_SYMBOL(ex_handler_copy); -__visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs) { if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, regs->ip, (void *)regs->ip)) @@ -161,14 +135,11 @@ __visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup /* Pretend that the read succeeded and returned 0. */ regs->ax = 0; regs->dx = 0; - return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_rdmsr_unsafe); -__visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs) { if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, (unsigned int)regs->dx, @@ -176,44 +147,29 @@ __visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup show_stack_regs(regs); /* Pretend that the write succeeded. 
*/ - return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_wrmsr_unsafe); -__visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_clear_fs(const struct exception_table_entry *fixup, + struct pt_regs *regs) { if (static_cpu_has(X86_BUG_NULL_SEG)) asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS)); asm volatile ("mov %0, %%fs" : : "rm" (0)); - return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_clear_fs); -enum handler_type ex_get_fault_handler_type(unsigned long ip) +int ex_get_fixup_type(unsigned long ip) { - const struct exception_table_entry *e; - ex_handler_t handler; + const struct exception_table_entry *e = search_exception_tables(ip); - e = search_exception_tables(ip); - if (!e) - return EX_HANDLER_NONE; - handler = ex_fixup_handler(e); - if (handler == ex_handler_fault) - return EX_HANDLER_FAULT; - else if (handler == ex_handler_uaccess || handler == ex_handler_copy) - return EX_HANDLER_UACCESS; - else - return EX_HANDLER_OTHER; + return e ? e->type : EX_TYPE_NONE; } int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) { const struct exception_table_entry *e; - ex_handler_t handler; #ifdef CONFIG_PNPBIOS if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { @@ -233,8 +189,33 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, if (!e) return 0; - handler = ex_fixup_handler(e); - return handler(e, regs, trapnr, error_code, fault_addr); + switch (e->type) { + case EX_TYPE_DEFAULT: + return ex_handler_default(e, regs); + case EX_TYPE_FAULT: + return ex_handler_fault(e, regs, trapnr); + case EX_TYPE_UACCESS: + return ex_handler_uaccess(e, regs, trapnr); + case EX_TYPE_COPY: + return ex_handler_copy(e, regs, trapnr); + case EX_TYPE_CLEAR_FS: + return ex_handler_clear_fs(e, regs); + case EX_TYPE_FPU_RESTORE: + return ex_handler_fprestore(e, regs); + case EX_TYPE_RDMSR: + return ex_handler_rdmsr_unsafe(e, regs); + case EX_TYPE_WRMSR: + return ex_handler_wrmsr_unsafe(e, regs); + case EX_TYPE_BPF: + return ex_handler_bpf(e, regs); + case EX_TYPE_RDMSR_IN_MCE: + ex_handler_msr_mce(regs, false); + break; + case EX_TYPE_WRMSR_IN_MCE: + ex_handler_msr_mce(regs, true); + break; + } + BUG(); } extern unsigned int early_recursion_flag; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f85b7365a770..ef97b0a8429d 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -641,9 +641,7 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) *pprog = prog; } -static bool ex_handler_bpf(const struct exception_table_entry *x, - struct pt_regs *regs, int trapnr, - unsigned long error_code, unsigned long fault_addr) +bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) { u32 reg = x->fixup >> 8; @@ -1067,12 +1065,7 @@ st: if (is_imm8(insn->off)) } ex->insn = delta; - delta = (u8 *)ex_handler_bpf - (u8 *)&ex->handler; - if (!is_simm32(delta)) { - pr_err("extable->handler doesn't fit into 32-bit\n"); - return -EFAULT; - } - ex->handler = delta; + ex->type = EX_TYPE_BPF; if (dst_reg > BPF_REG_9) { pr_err("verifier error\n"); diff --git a/scripts/sortextable.c b/scripts/sortextable.c index 55768654e3c6..4ebe7941f352 100644 --- 
a/scripts/sortextable.c +++ b/scripts/sortextable.c @@ -219,7 +219,7 @@ static void x86_sort_relative_table(char *extab_image, int image_size) w(r(loc) + i, loc); w(r(loc + 1) + i + 4, loc + 1); - w(r(loc + 2) + i + 8, loc + 2); + /* Don't touch the fixup type */ i += sizeof(uint32_t) * 3; } @@ -232,7 +232,7 @@ static void x86_sort_relative_table(char *extab_image, int image_size) w(r(loc) - i, loc); w(r(loc + 1) - (i + 4), loc + 1); - w(r(loc + 2) - (i + 8), loc + 2); + /* Don't touch the fixup type */ i += sizeof(uint32_t) * 3; } -- Gitee From 0724fde1c945d50790eec45387699b0529b2d165 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 8 Mar 2022 18:57:47 +0800 Subject: [PATCH 0700/2161] x86/extable: Fix _ASM_EXTABLE_HANDLE() was removed commit opencloudos. _ASM_EXTABLE_HANDLE() was removed by 46d28947d9876fc0f8f93d3c69813ef6e9852595, so that complied failed. Fix it. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/asm.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 254f8c8c5429..2d4eb66100c8 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -123,6 +123,13 @@ /* Exception table entry */ #ifdef __ASSEMBLY__ +# define _ASM_EXTABLE_HANDLE(from, to, handler) \ + .pushsection "__ex_table","a" ; \ + .balign 4 ; \ + .long (from) - . ; \ + .long (to) - . ; \ + .long (handler) - . ; \ + .popsection # define _ASM_EXTABLE_TYPE(from, to, type) \ .pushsection "__ex_table","a" ; \ @@ -145,6 +152,14 @@ .popsection #else +# define _EXPAND_EXTABLE_HANDLE(x) #x +# define _ASM_EXTABLE_HANDLE(from, to, handler) \ + " .pushsection \"__ex_table\",\"a\"\n" \ + " .balign 4\n" \ + " .long (" #from ") - .\n" \ + " .long (" #to ") - .\n" \ + " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \ + " .popsection\n" # define _ASM_EXTABLE_TYPE(from, to, type) \ " .pushsection \"__ex_table\",\"a\"\n" \ -- Gitee From 4937c0a09e850836c49d6b9bd5872407b0a62a4e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:19 +0200 Subject: [PATCH 0701/2161] x86/extable: Provide EX_TYPE_DEFAULT_MCE_SAFE and EX_TYPE_FAULT_MCE_SAFE commit 2cadf5248b9316d3c8af876e795d61c55476f6e9 upstream. Provide exception fixup types which can be used to identify fixups which allow in kernel #MC recovery and make them invoke the existing handlers. These will be used at places where #MC recovery is handled correctly by the caller. 
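At the annotation sites the new types are a drop-in replacement; the only difference is what the machine-check severity grading concludes from them. Roughly — the concrete conversions are the next two patches:

	/* ordinary fault fixup: recovery semantics unchanged */
	_ASM_EXTABLE_FAULT(1b, 3b)

	/*
	 * Same fixup, but additionally tells mce/severity.c that this
	 * call site copes with an aborted access, so the #MC can be
	 * graded IN_KERNEL_RECOV instead of being treated as fatal.
	 */
	_ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FAULT_MCE_SAFE)
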
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.269689153@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/extable_fixup_types.h | 3 +++ arch/x86/kernel/cpu/mce/severity.c | 2 ++ arch/x86/mm/extable.c | 2 ++ 3 files changed, 7 insertions(+) diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h index 0adc117618e6..409524d5d2eb 100644 --- a/arch/x86/include/asm/extable_fixup_types.h +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -16,4 +16,7 @@ #define EX_TYPE_WRMSR_IN_MCE 10 #define EX_TYPE_RDMSR_IN_MCE 11 +#define EX_TYPE_DEFAULT_MCE_SAFE 12 +#define EX_TYPE_FAULT_MCE_SAFE 13 + #endif diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 605d90a865bb..e8a6b4ea577d 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -286,6 +286,8 @@ static int error_context(struct mce *m, struct pt_regs *regs) m->kflags |= MCE_IN_KERNEL_COPYIN; fallthrough; case EX_TYPE_FAULT: + case EX_TYPE_FAULT_MCE_SAFE: + case EX_TYPE_DEFAULT_MCE_SAFE: m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; default: diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index f48a1ea4e1b5..1ec148e7f394 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -191,8 +191,10 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, switch (e->type) { case EX_TYPE_DEFAULT: + case EX_TYPE_DEFAULT_MCE_SAFE: return ex_handler_default(e, regs); case EX_TYPE_FAULT: + case EX_TYPE_FAULT_MCE_SAFE: return ex_handler_fault(e, regs, trapnr); case EX_TYPE_UACCESS: return ex_handler_uaccess(e, regs, trapnr); -- Gitee From 54f32953b8f32767241e2a7b3d4823bcb64c9831 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:21 +0200 Subject: [PATCH 0702/2161] x86/copy_mc: Use EX_TYPE_DEFAULT_MCE_SAFE for exception fixups commit c1c97d175493ab32325df81133611ce8e4e05088 upstream. Nothing in that code uses the trap number which was stored by the exception fixup which is instantiated via _ASM_EXTABLE_FAULT(). Use _ASM_EXTABLE(... EX_TYPE_DEFAULT_MCE_SAFE) instead which just handles the IP fixup and the type indicates to the #MC handler that the call site can handle the abort caused by #MC correctly. 
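What "the call site can handle the abort" means in practice: the MC-safe copy reports how much was left uncopied rather than relying on a recoverable trap number, and its callers check that result. A minimal illustration, assuming the usual memcpy_mcsafe() wrapper around the __memcpy_mcsafe() implementation patched below; the error handling shown is the caller's choice, not mandated by the patch:

	unsigned long rem;

	rem = memcpy_mcsafe(dst, src, len);	/* 0 on success, bytes not copied on #MC */
	if (rem)
		return -EIO;			/* only len - rem bytes made it across */
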
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.328706042@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/lib/memcpy_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index dc2fb886db2b..2cb18ef24427 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -288,9 +288,9 @@ EXPORT_SYMBOL_GPL(__memcpy_mcsafe) .previous - _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes) - _ASM_EXTABLE_FAULT(.L_read_words, .E_read_words) - _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes) + _ASM_EXTABLE_TYPE(.L_read_leading_bytes, .E_leading_bytes, EX_TYPE_DEFAULT_MCE_SAFE) + _ASM_EXTABLE_TYPE(.L_read_words, .E_read_words, EX_TYPE_DEFAULT_MCE_SAFE) + _ASM_EXTABLE_TYPE(.L_read_trailing_bytes, .E_trailing_bytes, EX_TYPE_DEFAULT_MCE_SAFE) _ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes) _ASM_EXTABLE(.L_write_words, .E_write_words) _ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes) -- Gitee From f9394d6300347f84b51566b10142726c6ae046ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:23 +0200 Subject: [PATCH 0703/2161] x86/fpu: Use EX_TYPE_FAULT_MCE_SAFE for exception fixups commit c6304556f3ae98c943bbb4042a30205c98e4f921 upstream. The macros used for restoring FPU state from a user space buffer can handle all exceptions including #MC. They need to return the trap number in the error case as the code which invokes them needs to distinguish the cause of the failure. It aborts the operation for anything except #PF. Use the new EX_TYPE_FAULT_MCE_SAFE exception table fixup type to document the nature of the fixup. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.387464538@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index c3baf5331432..a58a76d112cc 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -101,7 +101,7 @@ extern void save_fpregs_to_fpstate(struct fpu *fpu); "3: negl %%eax\n" \ " jmp 2b\n" \ ".previous\n" \ - _ASM_EXTABLE_FAULT(1b, 3b) \ + _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FAULT_MCE_SAFE) \ : [err] "=a" (err), output \ : "0"(0), input); \ err; \ @@ -208,7 +208,7 @@ static inline void fxsave(struct fxregs_state *fx) "3: negl %%eax\n\t" \ "jmp 2b\n\t" \ ".popsection\n\t" \ - _ASM_EXTABLE_FAULT(1b, 3b) \ + _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FAULT_MCE_SAFE) \ : [err] "=a" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") -- Gitee From 095c042653cb1c998cea1b35f85182451ede2c70 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:24 +0200 Subject: [PATCH 0704/2161] x86/extable: Remove EX_TYPE_FAULT from MCE safe fixups commit 0c2e62ba04cd0b7194b380bae4fc35c45bb2e46e upstream. Now that the MC safe copy and FPU have been converted to use the MCE safe fixup types remove EX_TYPE_FAULT from the list of types which MCE considers to be safe to be recovered in kernel. 
This removes the SGX exception handling of ENCLS from the #MC safe handling, but according to the SGX wizards the current SGX implementations cannot survive #MC on ENCLS: https://lore.kernel.org/r/YS+upEmTfpZub3s9@google.com The code relies on the trap number being stored if ENCLS raised an exception. That's still working, but it does no longer trick the MCE code into assuming that #MC is handled correctly for ENCLS. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.445255957@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/mce/severity.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index e8a6b4ea577d..6fa66c2b94b2 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -285,7 +285,6 @@ static int error_context(struct mce *m, struct pt_regs *regs) return IN_KERNEL; m->kflags |= MCE_IN_KERNEL_COPYIN; fallthrough; - case EX_TYPE_FAULT: case EX_TYPE_FAULT_MCE_SAFE: case EX_TYPE_DEFAULT_MCE_SAFE: m->kflags |= MCE_IN_KERNEL_RECOV; -- Gitee From 89ef5d7f283f2a3edf65a26397072872c5e6911a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:26 +0200 Subject: [PATCH 0705/2161] x86/fpu/signal: Clarify exception handling in restore_fpregs_from_user() commit 4339d0c63c2d5bea1fe6de4091ee2fe9eeea09a7 upstream. FPU restore from a signal frame can trigger various exceptions. The exceptions are caught with an exception table entry. The handler of this entry stores the trap number in EAX. The FPU specific fixup negates that trap number to convert it into an negative error code. Any other exception than #PF is fatal and recovery is not possible. This relies on the fact that the #PF exception number is the same as EFAULT, but that's not really obvious. Remove the negation from the exception fixup as it really has no value and check for X86_TRAP_PF at the call site. There is still confusion due to the return code conversion for the error case which will be cleaned up separately. Suggested-by: Al Viro Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.506192488@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 21 ++++++++------------- arch/x86/kernel/fpu/signal.c | 5 +++-- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index a58a76d112cc..43be25c8cf19 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -87,7 +87,10 @@ static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif extern void save_fpregs_to_fpstate(struct fpu *fpu); -/* Returns 0 or the negated trap number, which results in -EFAULT for #PF */ +/* + * Returns 0 on success or the trap number when the operation raises an + * exception. + */ #define user_insn(insn, output, input...) 
\ ({ \ int err; \ @@ -97,11 +100,7 @@ extern void save_fpregs_to_fpstate(struct fpu *fpu); asm volatile(ASM_STAC "\n" \ "1: " #insn "\n" \ "2: " ASM_CLAC "\n" \ - ".section .fixup,\"ax\"\n" \ - "3: negl %%eax\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FAULT_MCE_SAFE) \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ : [err] "=a" (err), output \ : "0"(0), input); \ err; \ @@ -197,18 +196,14 @@ static inline void fxsave(struct fxregs_state *fx) #define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" /* - * After this @err contains 0 on success or the negated trap number when - * the operation raises an exception. For faults this results in -EFAULT. + * After this @err contains 0 on success or the trap number when the + * operation raises an exception. */ #define XSTATE_OP(op, st, lmask, hmask, err) \ asm volatile("1:" op "\n\t" \ "xor %[err], %[err]\n" \ "2:\n\t" \ - ".pushsection .fixup,\"ax\"\n\t" \ - "3: negl %%eax\n\t" \ - "jmp 2b\n\t" \ - ".popsection\n\t" \ - _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FAULT_MCE_SAFE) \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ : [err] "=a" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index ce26ce1a9f3e..916695f3dcca 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -13,6 +13,7 @@ #include #include +#include #include static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init; @@ -275,7 +276,7 @@ static int restore_fpregs_from_user(void __user *buf, u64 xrestore, fpregs_unlock(); /* Try to handle #PF, but anything else is fatal. */ - if (ret != -EFAULT) + if (ret != X86_TRAP_PF) return -EINVAL; ret = fault_in_pages_readable(buf, size); @@ -405,7 +406,7 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, u64 mask = user_xfeatures | xfeatures_mask_supervisor(); fpu->state.xsave.header.xfeatures &= mask; - ret = os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all); + ret = os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all) ? -EINVAL : 0; } else { ret = fxrstor_safe(&fpu->state.fxsave); } -- Gitee From f895fd01d5357f258f262b02210ce986f6c4aaa0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:29 +0200 Subject: [PATCH 0706/2161] x86/fpu/signal: Move header zeroing out of xsave_to_user_sigframe() commit 4164a482a5d92c29eaf53d01755103f6bbce38f2 upstream. There is no reason to have the header zeroing in the pagefault disabled region. Do it upfront once. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.621674721@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 17 ++++++----------- arch/x86/kernel/fpu/signal.c | 12 ++++++++++++ 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 43be25c8cf19..f0506cf2de67 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -317,9 +317,12 @@ static inline void os_xrstor(struct xregs_state *xstate, u64 mask) * We don't use modified optimization because xrstor/xrstors might track * a different application. * - * We don't use compacted format xsave area for - * backward compatibility for old applications which don't understand - * compacted format of xsave area. 
+ * We don't use compacted format xsave area for backward compatibility for + * old applications which don't understand the compacted format of the + * xsave area. + * + * The caller has to zero buf::header before calling this because XSAVE* + * does not touch the reserved fields in the header. */ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) { @@ -333,14 +336,6 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) u32 hmask = mask >> 32; int err; - /* - * Clear the xsave header first, so that reserved fields are - * initialized to zero. - */ - err = __clear_user(&buf->header, sizeof(buf->header)); - if (unlikely(err)) - return -EFAULT; - stac(); XSTATE_OP(XSAVE, buf, lmask, hmask, err); clac(); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 916695f3dcca..fff8da5905fe 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -189,6 +189,18 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) if (!access_ok(buf, size)) return -EACCES; + + if (use_xsave()) { + struct xregs_state __user *xbuf = buf_fx; + + /* + * Clear the xsave header first, so that reserved fields are + * initialized to zero. + */ + ret = __clear_user(&xbuf->header, sizeof(xbuf->header)); + if (unlikely(ret)) + return ret; + } retry: /* * Load the FPU registers if they are not valid for the current task. -- Gitee From fbd33dcea8752b01876a6b699e7c3af157d2295f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:30 +0200 Subject: [PATCH 0707/2161] x86/fpu/signal: Move xstate clearing out of copy_fpregs_to_sigframe() commit fcfb7163329ce832aafef31f26345ef5e8642a17 upstream. When the direct saving of the FPU registers to the user space sigframe fails, copy_fpregs_to_sigframe() attempts to clear the user buffer. The most likely reason for such a fail is a page fault. As copy_fpregs_to_sigframe() is invoked with pagefaults disabled the chance that __clear_user() succeeds is minuscule. Move the clearing out into the caller which replaces the fault_in_pages_writeable() in that error handling path. The return value confusion will be cleaned up separately. 
Suggested-by: Al Viro Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.679356300@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index fff8da5905fe..61a97f7ab885 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -136,18 +136,12 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) { - int err; - if (use_xsave()) - err = xsave_to_user_sigframe(buf); - else if (use_fxsr()) - err = fxsave_to_user_sigframe((struct fxregs_state __user *) buf); + return xsave_to_user_sigframe(buf); + if (use_fxsr()) + return fxsave_to_user_sigframe((struct fxregs_state __user *) buf); else - err = fnsave_to_user_sigframe((struct fregs_state __user *) buf); - - if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size)) - err = -EFAULT; - return err; + return fnsave_to_user_sigframe((struct fregs_state __user *) buf); } /* @@ -218,9 +212,9 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) fpregs_unlock(); if (ret) { - if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size)) + if (!__clear_user(buf_fx, fpu_user_xstate_size)) goto retry; - return -EFAULT; + return -1; } /* Save the fsave header for the 32-bit frames. */ -- Gitee From 4dc93373f919317b269135520adca134e7bdcf13 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 18 May 2021 13:03:19 -0700 Subject: [PATCH 0708/2161] x86/signal: Detect and prevent an alternate signal stack overflow commit 2beb4a53fc3f1081cedc1c1a198c7f56cc4fc60c upstream. The kernel pushes context on to the userspace stack to prepare for the user's signal handler. When the user has supplied an alternate signal stack, via sigaltstack(2), it is easy for the kernel to verify that the stack size is sufficient for the current hardware context. Check if writing the hardware context to the alternate stack will exceed it's size. If yes, then instead of corrupting user-data and proceeding with the original signal handler, an immediate SIGSEGV signal is delivered. Refactor the stack pointer check code from on_sig_stack() and use the new helper. While the kernel allows new source code to discover and use a sufficient alternate signal stack size, this check is still necessary to protect binaries with insufficient alternate signal stack size from data corruption. Fixes: c2bc11f10a39 ("x86, AVX-512: Enable AVX-512 States Context Switch") Reported-by: Florian Weimer Suggested-by: Jann Horn Suggested-by: Andy Lutomirski Signed-off-by: Chang S. 
Bae Signed-off-by: Borislav Petkov Reviewed-by: Len Brown Acked-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20210518200320.17239-6-chang.seok.bae@intel.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=153531 Signed-off-by: Sasha Levin Signed-off-by: Xinghui Li --- arch/x86/kernel/signal.c | 24 ++++++++++++++++++++---- include/linux/sched/signal.h | 19 ++++++++++++------- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 22ef679f17c6..11ee3fb5b9d0 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -246,10 +246,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { /* Default to using normal stack */ + bool nested_altstack = on_sig_stack(regs->sp); + bool entering_altstack = false; unsigned long math_size = 0; unsigned long sp = regs->sp; unsigned long buf_fx = 0; - int onsigstack = on_sig_stack(sp); int ret; /* redzone */ @@ -258,15 +259,23 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(sp) == 0) + /* + * This checks nested_altstack via sas_ss_flags(). Sensible + * programs use SS_AUTODISARM, which disables that check, and + * programs that don't use SS_AUTODISARM get compatible. + */ + if (sas_ss_flags(sp) == 0) { sp = current->sas_ss_sp + current->sas_ss_size; + entering_altstack = true; + } } else if (IS_ENABLED(CONFIG_X86_32) && - !onsigstack && + !nested_altstack && regs->ss != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { /* This is the legacy signal stack switching. */ sp = (unsigned long) ka->sa.sa_restorer; + entering_altstack = true; } sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), @@ -279,8 +288,15 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, * If we are on the alternate signal stack and would overflow it, don't. * Return an always-bogus address instead so we will die with SIGSEGV. */ - if (onsigstack && !likely(on_sig_stack(sp))) + if (unlikely((nested_altstack || entering_altstack) && + !__on_sig_stack(sp))) { + + if (show_unhandled_signals && printk_ratelimit()) + pr_info("%s[%d] overflowed sigaltstack\n", + current->comm, task_pid_nr(current)); + return (void __user *)-1L; + } /* save i387 and extended state */ ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size); diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 23351d6ace55..f652db012726 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -514,6 +514,17 @@ static inline int kill_cad_pid(int sig, int priv) #define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0) #define SEND_SIG_PRIV ((struct kernel_siginfo *) 1) +static inline int __on_sig_stack(unsigned long sp) +{ +#ifdef CONFIG_STACK_GROWSUP + return sp >= current->sas_ss_sp && + sp - current->sas_ss_sp < current->sas_ss_size; +#else + return sp > current->sas_ss_sp && + sp - current->sas_ss_sp <= current->sas_ss_size; +#endif +} + /* * True if we are on the alternate signal stack. 
*/ @@ -531,13 +542,7 @@ static inline int on_sig_stack(unsigned long sp) if (current->sas_ss_flags & SS_AUTODISARM) return 0; -#ifdef CONFIG_STACK_GROWSUP - return sp >= current->sas_ss_sp && - sp - current->sas_ss_sp < current->sas_ss_size; -#else - return sp > current->sas_ss_sp && - sp - current->sas_ss_sp <= current->sas_ss_size; -#endif + return __on_sig_stack(sp); } static inline int sas_ss_flags(unsigned long sp) -- Gitee From ca8c34ec51821e3946a2d43cf01e91b82078e4fa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:32 +0200 Subject: [PATCH 0709/2161] x86/fpu/signal: Change return type of copy_fpstate_to_sigframe() to boolean commit 052adee668284b67105375c0a524f16a423f1424 upstream. None of the call sites cares about the actual return code. Change the return type to boolean and return 'true' on success. Suggested-by: Al Viro Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.736773588@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/ia32/ia32_signal.c | 4 ++-- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/fpu/signal.c | 20 ++++++++++---------- arch/x86/kernel/signal.c | 4 +--- 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 30416d7f19d4..842efecc2762 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -238,8 +238,8 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); *fpstate = (struct _fpstate_32 __user *) sp; - if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, - math_size) < 0) + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, + math_size)) return (void __user *) -1L; sp -= frame_size; diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index f0506cf2de67..28b1dff3b4e2 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -385,7 +385,7 @@ static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate) __restore_fpregs_from_fpstate(fpstate, xfeatures_mask_fpstate()); } -extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); +extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); /* * FPU context switch related helper methods: diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 61a97f7ab885..76355b34aa01 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -165,7 +165,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * For [f]xsave state, update the SW reserved fields in the [f]xsave frame * indicating the absence/presence of the extended state to the user. */ -int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) +bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); @@ -176,13 +176,14 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) if (!static_cpu_has(X86_FEATURE_FPU)) { struct user_i387_ia32_struct fp; + fpregs_soft_get(current, NULL, (struct membuf){.p = &fp, .left = sizeof(fp)}); - return copy_to_user(buf, &fp, sizeof(fp)) ? 
-EFAULT : 0; + return !copy_to_user(buf, &fp, sizeof(fp)); } if (!access_ok(buf, size)) - return -EACCES; + return false; if (use_xsave()) { struct xregs_state __user *xbuf = buf_fx; @@ -191,9 +192,8 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) * Clear the xsave header first, so that reserved fields are * initialized to zero. */ - ret = __clear_user(&xbuf->header, sizeof(xbuf->header)); - if (unlikely(ret)) - return ret; + if (__clear_user(&xbuf->header, sizeof(xbuf->header))) + return false; } retry: /* @@ -214,17 +214,17 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) if (ret) { if (!__clear_user(buf_fx, fpu_user_xstate_size)) goto retry; - return -1; + return false; } /* Save the fsave header for the 32-bit frames. */ if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) - return -1; + return false; if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) - return -1; + return false; - return 0; + return true; } static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 11ee3fb5b9d0..5558fd40b16c 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -251,7 +251,6 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, unsigned long math_size = 0; unsigned long sp = regs->sp; unsigned long buf_fx = 0; - int ret; /* redzone */ if (IS_ENABLED(CONFIG_X86_64)) @@ -299,8 +298,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, } /* save i387 and extended state */ - ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size); - if (ret < 0) + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size)) return (void __user *)-1L; return (void __user *)sp; -- Gitee From e64bb0bb14adb940b080a30c799880b521908cdc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:34 +0200 Subject: [PATCH 0710/2161] x86/fpu/signal: Change return type of copy_fpregs_to_sigframe() helpers to boolean commit 052adee668284b67105375c0a524f16a423f1424 upstream. Now that copy_fpregs_to_sigframe() returns boolean the individual return codes in the related helper functions do not make sense anymore. Change them to return boolean success/fail. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.794334915@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 76355b34aa01..97d49dfa1334 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -65,7 +65,7 @@ static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, /* * Signal frame handlers. 
*/ -static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) +static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) { if (use_fxsr()) { struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; @@ -82,18 +82,19 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) if (__copy_to_user(buf, &env, sizeof(env)) || __put_user(xsave->i387.swd, &fp->status) || __put_user(X86_FXSR_MAGIC, &fp->magic)) - return -1; + return false; } else { struct fregs_state __user *fp = buf; u32 swd; + if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) - return -1; + return false; } - return 0; + return true; } -static inline int save_xstate_epilog(void __user *buf, int ia32_frame) +static inline bool save_xstate_epilog(void __user *buf, int ia32_frame) { struct xregs_state __user *x = buf; struct _fpx_sw_bytes *sw_bytes; @@ -131,7 +132,7 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); - return err; + return !err; } static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) @@ -218,10 +219,10 @@ bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) } /* Save the fsave header for the 32-bit frames. */ - if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) + if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf)) return false; - if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) + if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate)) return false; return true; -- Gitee From 8953839b922c1d6f0eaddbc2e8023a7bbe5768b2 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 20 Dec 2021 11:39:05 +0800 Subject: [PATCH 0711/2161] x86: get rid of get_user_ex() in restore_sigcontext() commit 978727ca331ebd8b479f6a7afd27bb2e6504b2e3 upstream. Just do copyin into a local struct and be done with that - we are on a shallow stack here. [reworked by tglx, removing the macro horrors while we are touching that] Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/signal.c | 86 +++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 50 deletions(-) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 5558fd40b16c..c0de0bb621c4 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -47,24 +47,6 @@ #include #include -#define COPY(x) do { \ - get_user_ex(regs->x, &sc->x); \ -} while (0) - -#define GET_SEG(seg) ({ \ - unsigned short tmp; \ - get_user_ex(tmp, &sc->seg); \ - tmp; \ -}) - -#define COPY_SEG(seg) do { \ - regs->seg = GET_SEG(seg); \ -} while (0) - -#define COPY_SEG_CPL3(seg) do { \ - regs->seg = GET_SEG(seg) | 3; \ -} while (0) - #ifdef CONFIG_X86_64 /* * If regs->ss will cause an IRET fault, change it. 
Otherwise leave it @@ -92,53 +74,58 @@ static void force_valid_ss(struct pt_regs *regs) ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) regs->ss = __USER_DS; } +# define CONTEXT_COPY_SIZE offsetof(struct sigcontext, reserved1) +#else +# define CONTEXT_COPY_SIZE sizeof(struct sigcontext) #endif static int restore_sigcontext(struct pt_regs *regs, - struct sigcontext __user *sc, + struct sigcontext __user *usc, unsigned long uc_flags) { - unsigned long buf_val; - void __user *buf; - unsigned int tmpflags; - unsigned int err = 0; + struct sigcontext sc; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; - get_user_try { + if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE)) + return -EFAULT; #ifdef CONFIG_X86_32 - set_user_gs(regs, GET_SEG(gs)); - COPY_SEG(fs); - COPY_SEG(es); - COPY_SEG(ds); + set_user_gs(regs, sc.gs); + regs->fs = sc.fs; + regs->es = sc.es; + regs->ds = sc.ds; #endif /* CONFIG_X86_32 */ - COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); - COPY(dx); COPY(cx); COPY(ip); COPY(ax); + regs->bx = sc.bx; + regs->cx = sc.cx; + regs->dx = sc.dx; + regs->si = sc.si; + regs->di = sc.di; + regs->bp = sc.bp; + regs->ax = sc.ax; + regs->sp = sc.sp; + regs->ip = sc.ip; #ifdef CONFIG_X86_64 - COPY(r8); - COPY(r9); - COPY(r10); - COPY(r11); - COPY(r12); - COPY(r13); - COPY(r14); - COPY(r15); + regs->r8 = sc.r8; + regs->r9 = sc.r9; + regs->r10 = sc.r10; + regs->r11 = sc.r11; + regs->r12 = sc.r12; + regs->r13 = sc.r13; + regs->r14 = sc.r14; + regs->r15 = sc.r15; #endif /* CONFIG_X86_64 */ - COPY_SEG_CPL3(cs); - COPY_SEG_CPL3(ss); - - get_user_ex(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ + /* Get CS/SS and force CPL3 */ + regs->cs = sc.cs | 0x03; + regs->ss = sc.ss | 0x03; - get_user_ex(buf_val, &sc->fpstate); - buf = (void __user *)buf_val; - } get_user_catch(err); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); + /* disable syscall checks */ + regs->orig_ax = -1; #ifdef CONFIG_X86_64 /* @@ -149,11 +136,10 @@ static int restore_sigcontext(struct pt_regs *regs, force_valid_ss(regs); #endif - err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32)); - force_iret(); - return err; + return fpu__restore_sig((void __user *)sc.fpstate, + IS_ENABLED(CONFIG_X86_32)); } int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, -- Gitee From 3b14b4604de0c841d9f65acd9f56439caf951861 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:35 +0200 Subject: [PATCH 0712/2161] x86/signal: Change return type of restore_sigcontext() to boolean commit ee4ecdfbd28954086a09740dc931c10c93e39370 upstream. None of the call sites cares about the return code. All they are interested in is success or fail. 
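[ illustration, not kernel code: the int-errno to bool-success conversion applied throughout this part of the series, with hypothetical function names. Callers that used to compare against 0 or a negative errno collapse to a plain truth test. ]

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for a restore helper converted in this series. */
static bool restore_frame(int simulate_fault)
{
	if (simulate_fault)
		return false;	/* was: return -EFAULT; */
	return true;		/* was: return 0; */
}

int main(void)
{
	/* The call site only branches on success vs. failure. */
	if (!restore_frame(1))
		printf("badframe path taken\n");
	if (restore_frame(0))
		printf("frame restored\n");
	return 0;
}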
Suggested-by: Al Viro Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.851280949@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/signal.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index c0de0bb621c4..1dec15d93591 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -79,9 +79,9 @@ static void force_valid_ss(struct pt_regs *regs) # define CONTEXT_COPY_SIZE sizeof(struct sigcontext) #endif -static int restore_sigcontext(struct pt_regs *regs, - struct sigcontext __user *usc, - unsigned long uc_flags) +static bool restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *usc, + unsigned long uc_flags) { struct sigcontext sc; @@ -89,7 +89,7 @@ static int restore_sigcontext(struct pt_regs *regs, current->restart_block.fn = do_no_restart_syscall; if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE)) - return -EFAULT; + return false; #ifdef CONFIG_X86_32 set_user_gs(regs, sc.gs); @@ -138,8 +138,8 @@ static int restore_sigcontext(struct pt_regs *regs, force_iret(); - return fpu__restore_sig((void __user *)sc.fpstate, - IS_ENABLED(CONFIG_X86_32)); + return !fpu__restore_sig((void __user *)sc.fpstate, + IS_ENABLED(CONFIG_X86_32)); } int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, @@ -631,7 +631,7 @@ SYSCALL_DEFINE0(sigreturn) * x86_32 has no uc_flags bits relevant to restore_sigcontext. * Save a few cycles by skipping the __get_user. */ - if (restore_sigcontext(regs, &frame->sc, 0)) + if (!restore_sigcontext(regs, &frame->sc, 0)) goto badframe; return regs->ax; @@ -659,7 +659,7 @@ SYSCALL_DEFINE0(rt_sigreturn) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) @@ -917,7 +917,7 @@ asmlinkage long sys32_x32_rt_sigreturn(void) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) -- Gitee From 2deb0dbf5b1571c9b81bb34738066fd761368eba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:37 +0200 Subject: [PATCH 0713/2161] x86/fpu/signal: Change return type of fpu__restore_sig() to boolean commit 052adee668284b67105375c0a524f16a423f1424 upstream. None of the call sites cares about the error code. All they need to know is whether the function succeeded or not. 
Suggested-by: Al Viro Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.909065931@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/fpu/signal.c | 22 ++++++++++------------ arch/x86/kernel/signal.c | 4 ++-- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 28b1dff3b4e2..7a812a772155 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -25,7 +25,7 @@ /* * High level FPU state handling functions: */ -extern int fpu__restore_sig(void __user *buf, int ia32_frame); +extern bool fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); extern void fpu__clear_user_states(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 97d49dfa1334..3cbe0278fb73 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -433,17 +433,17 @@ static inline int xstate_sigframe_size(void) /* * Restore FPU state from a sigframe: */ -int fpu__restore_sig(void __user *buf, int ia32_frame) +bool fpu__restore_sig(void __user *buf, int ia32_frame) { unsigned int size = xstate_sigframe_size(); struct fpu *fpu = ¤t->thread.fpu; void __user *buf_fx = buf; bool ia32_fxstate = false; - int ret; + bool success = false; if (unlikely(!buf)) { fpu__clear_user_states(fpu); - return 0; + return true; } ia32_frame &= (IS_ENABLED(CONFIG_X86_32) || @@ -459,23 +459,21 @@ int fpu__restore_sig(void __user *buf, int ia32_frame) ia32_fxstate = true; } - if (!access_ok(buf, size)) { - ret = -EACCES; + if (!access_ok(buf, size)) goto out; - } if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) { - ret = fpregs_soft_set(current, NULL, 0, - sizeof(struct user_i387_ia32_struct), - NULL, buf); + success = !fpregs_soft_set(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), + NULL, buf); } else { - ret = __fpu_restore_sig(buf, buf_fx, ia32_fxstate); + success = !__fpu_restore_sig(buf, buf_fx, ia32_fxstate); } out: - if (unlikely(ret)) + if (unlikely(!success)) fpu__clear_user_states(fpu); - return ret; + return success; } unsigned long diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 1dec15d93591..37f7fa8f8e4d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -138,8 +138,8 @@ static bool restore_sigcontext(struct pt_regs *regs, force_iret(); - return !fpu__restore_sig((void __user *)sc.fpstate, - IS_ENABLED(CONFIG_X86_32)); + return fpu__restore_sig((void __user *)sc.fpstate, + IS_ENABLED(CONFIG_X86_32)); } int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, -- Gitee From e62e43f04941d62b36f915d8764a560a52a14636 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:38 +0200 Subject: [PATCH 0714/2161] x86/fpu/signal: Change return type of __fpu_restore_sig() to boolean commit 052adee668284b67105375c0a524f16a423f1424 upstream. Now that fpu__restore_sig() returns a boolean get rid of the individual error codes in __fpu_restore_sig() as well. 
Suggested-by: Al Viro Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.966197097@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 41 ++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 3cbe0278fb73..5b0be1c59885 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -309,8 +309,8 @@ static int restore_fpregs_from_user(void __user *buf, u64 xrestore, return 0; } -static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, - bool ia32_fxstate) +static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, + bool ia32_fxstate) { int state_size = fpu_kernel_xstate_size; struct task_struct *tsk = current; @@ -318,14 +318,14 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, struct user_i387_ia32_struct env; u64 user_xfeatures = 0; bool fx_only = false; - int ret; + bool success; + if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; - ret = check_xstate_in_sigframe(buf_fx, &fx_sw_user); - if (unlikely(ret)) - return ret; + if (check_xstate_in_sigframe(buf_fx, &fx_sw_user)) + return false; fx_only = !fx_sw_user.magic1; state_size = fx_sw_user.xstate_size; @@ -341,8 +341,8 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, * faults. If it does, fall back to the slow path below, going * through the kernel buffer with the enabled pagefault handler. */ - return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, - state_size); + return !restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, + state_size); } /* @@ -350,9 +350,8 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, * to be ignored for histerical raisins. The legacy state is folded * in once the larger state has been copied. */ - ret = __copy_from_user(&env, buf, sizeof(env)); - if (ret) - return ret; + if (__copy_from_user(&env, buf, sizeof(env))) + return false; /* * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is @@ -379,17 +378,16 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, fpregs_unlock(); if (use_xsave() && !fx_only) { - ret = copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx); - if (ret) - return ret; + if (copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx)) + return false; } else { if (__copy_from_user(&fpu->state.fxsave, buf_fx, sizeof(fpu->state.fxsave))) - return -EFAULT; + return false; /* Reject invalid MXCSR values. */ if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask) - return -EINVAL; + return false; /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */ if (use_xsave()) @@ -413,17 +411,18 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, u64 mask = user_xfeatures | xfeatures_mask_supervisor(); fpu->state.xsave.header.xfeatures &= mask; - ret = os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all) ? -EINVAL : 0; + success = !os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all); } else { - ret = fxrstor_safe(&fpu->state.fxsave); + success = !fxrstor_safe(&fpu->state.fxsave); } - if (likely(!ret)) + if (likely(success)) fpregs_mark_activate(); fpregs_unlock(); - return ret; + return success; } + static inline int xstate_sigframe_size(void) { return use_xsave() ? 
fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE : @@ -467,7 +466,7 @@ bool fpu__restore_sig(void __user *buf, int ia32_frame) sizeof(struct user_i387_ia32_struct), NULL, buf); } else { - success = !__fpu_restore_sig(buf, buf_fx, ia32_fxstate); + success = __fpu_restore_sig(buf, buf_fx, ia32_fxstate); } out: -- Gitee From ec56ce9aba6f49dd68f75d211ab9fe0cd1a0c20c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:40 +0200 Subject: [PATCH 0715/2161] x86/fpu/signal: Change return code of check_xstate_in_sigframe() to boolean commit be0040144152ed834c369a7830487e5ee4f27080 upstream. __fpu_sig_restore() only needs success/fail information and no detailed error code. Suggested-by: Al Viro Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132526.024024598@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 5b0be1c59885..5fa497bea5fd 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -23,8 +23,8 @@ static struct _fpx_sw_bytes fx_sw_reserved_ia32 __ro_after_init; * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. */ -static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, - struct _fpx_sw_bytes *fx_sw) +static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, + struct _fpx_sw_bytes *fx_sw) { int min_xstate_size = sizeof(struct fxregs_state) + sizeof(struct xstate_header); @@ -32,7 +32,7 @@ static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, unsigned int magic2; if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw))) - return -EFAULT; + return false; /* Check for the first magic field and other error scenarios. */ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || @@ -48,10 +48,10 @@ static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, * in the memory layout. */ if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))) - return -EFAULT; + return false; if (likely(magic2 == FP_XSTATE_MAGIC2)) - return 0; + return true; setfx: trace_x86_fpu_xstate_check_failed(¤t->thread.fpu); @@ -59,7 +59,7 @@ static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, fx_sw->magic1 = 0; fx_sw->xstate_size = sizeof(struct fxregs_state); fx_sw->xfeatures = XFEATURE_MASK_FPSSE; - return 0; + return true; } /* @@ -324,7 +324,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; - if (check_xstate_in_sigframe(buf_fx, &fx_sw_user)) + if (!check_xstate_in_sigframe(buf_fx, &fx_sw_user)) return false; fx_only = !fx_sw_user.magic1; -- Gitee From aa1f9fe8630bfeab02f017e812408c6e368be7c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:41 +0200 Subject: [PATCH 0716/2161] x86/fpu/signal: Change return code of restore_fpregs_from_user() to boolean commit be0040144152ed834c369a7830487e5ee4f27080 upstream. __fpu_sig_restore() only needs information about success or fail and no real error code. This cleans up the confusing conversion of the trap number, which is returned by the *RSTOR() exception fixups, to an error code. 
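[ rough userspace sketch of the retry policy described above, with made-up helper names: the low-level restore reports a trap number, only a page fault (vector 14) leads to faulting the buffer in and retrying, anything else is fatal. The fake restore routine simply fails once and then succeeds so the sketch terminates. ]

#include <stdbool.h>
#include <stdio.h>

#define X86_TRAP_PF 14	/* page-fault vector number */

/* Pretend low-level XRSTOR: returns 0 on success or a trap number. */
static int fake_rstor(void)
{
	static int first_try = 1;

	if (first_try) {
		first_try = 0;
		return X86_TRAP_PF;
	}
	return 0;
}

static bool restore_from_user_sketch(void)
{
retry:
	switch (fake_rstor()) {
	case 0:
		return true;		/* registers restored */
	case X86_TRAP_PF:
		/* the kernel faults the user buffer in here and, if that
		 * works, retries the hardware restore */
		goto retry;
	default:
		return false;		/* #GP etc.: fatal */
	}
}

int main(void)
{
	printf("%s\n", restore_from_user_sketch() ? "restored" : "fatal");
	return 0;
}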
Suggested-by: Al Viro Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132526.084109938@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 5fa497bea5fd..e3760ce60910 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -254,8 +254,8 @@ static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, * Attempt to restore the FPU registers directly from user memory. * Pagefaults are handled and any errors returned are fatal. */ -static int restore_fpregs_from_user(void __user *buf, u64 xrestore, - bool fx_only, unsigned int size) +static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, + bool fx_only, unsigned int size) { struct fpu *fpu = ¤t->thread.fpu; int ret; @@ -284,12 +284,11 @@ static int restore_fpregs_from_user(void __user *buf, u64 xrestore, /* Try to handle #PF, but anything else is fatal. */ if (ret != X86_TRAP_PF) - return -EINVAL; + return false; - ret = fault_in_pages_readable(buf, size); - if (!ret) + if (!fault_in_pages_readable(buf, size)) goto retry; - return ret; + return false; } /* @@ -306,7 +305,7 @@ static int restore_fpregs_from_user(void __user *buf, u64 xrestore, fpregs_mark_activate(); fpregs_unlock(); - return 0; + return true; } static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, @@ -341,8 +340,8 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, * faults. If it does, fall back to the slow path below, going * through the kernel buffer with the enabled pagefault handler. */ - return !restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, - state_size); + return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, + state_size); } /* -- Gitee From 698daae409ff203741e2ac380116186afb11bd07 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 10 Sep 2021 15:33:32 -0700 Subject: [PATCH 0717/2161] x86/asm: Fix SETZ size enqcmds() build failure commit d81ff5fe14a950f53e2833cfa196e7bb3fd5d4e3 upstream. When building under GCC 4.9 and 5.5: arch/x86/include/asm/special_insns.h: Assembler messages: arch/x86/include/asm/special_insns.h:286: Error: operand size mismatch for `setz' Change the type to "bool" for condition code arguments, as documented. 
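[ stand-alone illustration (x86-only GCC/Clang inline asm, not the kernel's enqcmds() wrapper): SETcc stores a single byte, so the condition-code output must be a byte-sized bool; declaring it int is what provokes the "operand size mismatch for `setz'" assembler error quoted above on older toolchains. ]

#include <stdbool.h>
#include <stdio.h>

static bool is_zero(unsigned long x)
{
	bool zf;	/* one byte, matching what SETZ writes */

	asm("test %1, %1\n\tsetz %0" : "=q" (zf) : "r" (x) : "cc");
	return zf;
}

int main(void)
{
	printf("%d %d\n", is_zero(0), is_zero(42));
	return 0;
}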
Fixes: 7f5933f81bd8 ("x86/asm: Add an enqcmds() wrapper for the ENQCMDS instruction") Co-developed-by: Arnd Bergmann Signed-off-by: Arnd Bergmann Signed-off-by: Kees Cook Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210910223332.3224851-1-keescook@chromium.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/special_insns.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index a1f1672b3c2e..44c7f80611ed 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -259,7 +259,7 @@ static inline int enqcmds(void __iomem *dst, const void *src) { const struct { char _[64]; } *__src = src; struct { char _[64]; } __iomem *__dst = dst; - int zf; + bool zf; /* * ENQCMDS %(rdx), rax -- Gitee From a07c96f2ca8609b77271f546fa9f0f57fd122fd0 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Wed, 22 Sep 2021 22:09:01 +0200 Subject: [PATCH 0718/2161] x86/fpu/signal: Fix missed conversion to correct boolean retval in save_xstate_epilog() commit 724fc0248d450224b19ef5b5ee41e392348f6704 upstream. Fix the missing return code polarity in save_xstate_epilog(). [ bp: Massage, use the right commit in the Fixes: tag ] Fixes: 2af07f3a6e9f ("x86/fpu/signal: Change return type of copy_fpregs_to_sigframe() helpers to boolean") Reported-by: Remi Duraffort Signed-off-by: Anders Roxell Signed-off-by: Borislav Petkov Tested-by: Nick Desaulniers Link: https://github.com/ClangBuiltLinux/linux/issues/1461 Link: https://lkml.kernel.org/r/20210922200901.1823741-1-anders.roxell@linaro.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index e3760ce60910..d67ea654f6cc 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -106,7 +106,7 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame) err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); if (!use_xsave()) - return err; + return !err; err |= __put_user(FP_XSTATE_MAGIC2, (__u32 __user *)(buf + fpu_user_xstate_size)); -- Gitee From 1d5097ed2153b6e62047607e8e53a10e6a9ce035 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 6 Oct 2021 18:33:52 +0200 Subject: [PATCH 0719/2161] x86/fpu: Restore the masking out of reserved MXCSR bits commit d298b03506d3e161f7492c440babb0bfae35e650 upstream. Ser Olmy reported a boot failure: init[1] bad frame in sigreturn frame:(ptrval) ip:b7c9fbe6 sp:bf933310 orax:ffffffff \ in libc-2.33.so[b7bed000+156000] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b CPU: 0 PID: 1 Comm: init Tainted: G W 5.14.9 #1 Hardware name: Hewlett-Packard HP PC/HP Board, BIOS JD.00.06 12/06/2001 Call Trace: dump_stack_lvl dump_stack panic do_exit.cold do_group_exit get_signal arch_do_signal_or_restart ? force_sig_info_to_task ? force_sig exit_to_user_mode_prepare syscall_exit_to_user_mode do_int80_syscall_32 entry_INT80_32 on an old 32-bit Intel CPU: vendor_id : GenuineIntel cpu family : 6 model : 6 model name : Celeron (Mendocino) stepping : 5 microcode : 0x3 Ser bisected the problem to the commit in Fixes. tglx suggested reverting the rejection of invalid MXCSR values which this commit introduced and replacing it with what the old code did - simply masking them out to zero. 
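[ userspace sketch of the two policies being weighed here - reject versus mask - using the MXCSR value and feature mask from the debug output quoted below. Masking keeps only the supported bits, which is where the follow-up fix two patches down also lands; this is not the kernel code itself. ]

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MXCSR_FEATURE_MASK 0xffbfu	/* value taken from the report below */

/* strict == 64-bit policy (reject), !strict == 32-bit policy (mask). */
static bool sanitize_mxcsr(uint32_t *mxcsr, bool strict)
{
	if (*mxcsr & ~MXCSR_FEATURE_MASK) {
		if (strict)
			return false;
		*mxcsr &= MXCSR_FEATURE_MASK;
	}
	return true;
}

int main(void)
{
	uint32_t mxcsr = 0xb7be13b4;	/* bogus value from the report below */

	printf("64-bit policy accepts: %d\n", sanitize_mxcsr(&mxcsr, true));
	mxcsr = 0xb7be13b4;
	sanitize_mxcsr(&mxcsr, false);
	printf("32-bit policy masks to: %#x\n", (unsigned int)mxcsr);
	return 0;
}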
Further debugging confirmed his suggestion: fpu->state.fxsave.mxcsr: 0xb7be13b4, mxcsr_feature_mask: 0xffbf WARNING: CPU: 0 PID: 1 at arch/x86/kernel/fpu/signal.c:384 __fpu_restore_sig+0x51f/0x540 so restore the original behavior only for 32-bit kernels where you have ancient machines with buggy hardware. For 32-bit programs on 64-bit kernels, user space which supplies wrong MXCSR values is considered malicious so fail the sigframe restoration there. Fixes: 6f9866a166cd ("x86/fpu/signal: Let xrstor handle the features to init") Reported-by: Ser Olmy Signed-off-by: Borislav Petkov Tested-by: Ser Olmy Cc: Link: https://lkml.kernel.org/r/YVtA67jImg3KlBTw@zn.tnic Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index d67ea654f6cc..1af306b35021 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -384,9 +384,14 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, sizeof(fpu->state.fxsave))) return false; - /* Reject invalid MXCSR values. */ - if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask) - return false; + if (IS_ENABLED(CONFIG_X86_64)) { + /* Reject invalid MXCSR values. */ + if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask) + return false; + } else { + /* Mask invalid bits out for historical reasons (broken hardware). */ + fpu->state.fxsave.mxcsr &= ~mxcsr_feature_mask; + } /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */ if (use_xsave()) -- Gitee From f957d8b84b03f7f048ac6d41d606a6698c630502 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 15 Oct 2021 12:46:25 +0200 Subject: [PATCH 0720/2161] x86/fpu: Mask out the invalid MXCSR bits properly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit b2381acd3fd9bacd2c63f53b2c610c89959b31cc upstream. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a fix for the fix (yeah, /facepalm). The correct mask to use is not the negation of the MXCSR_MASK but the actual mask which contains the supported bits in the MXCSR register. Reported and debugged by Ville Syrjälä Fixes: d298b03506d3 ("x86/fpu: Restore the masking out of reserved MXCSR bits") Signed-off-by: Borislav Petkov Tested-by: Ville Syrjälä Tested-by: Ser Olmy Cc: Link: https://lore.kernel.org/r/YWgYIYXLriayyezv@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 1af306b35021..40f6e8e3f545 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -390,7 +390,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, return false; } else { /* Mask invalid bits out for historical reasons (broken hardware). */ - fpu->state.fxsave.mxcsr &= ~mxcsr_feature_mask; + fpu->state.fxsave.mxcsr &= mxcsr_feature_mask; } /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */ -- Gitee From 43de1310440324efeca7a91aebe7c105a8cbc15f Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Thu, 22 Sep 2022 20:30:16 +0800 Subject: [PATCH 0721/2161] Revert "x86/fpu: Correct pkru/xstate inconsistency" commit opencloudos. This reverts commit 2d10af269c7af9508e6a9ce0f3fa90a1dfde1027. 
Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 8 +++----- arch/x86/kernel/process_32.c | 6 ++++-- arch/x86/kernel/process_64.c | 6 ++++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 7a812a772155..981026d35413 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -494,11 +494,9 @@ static inline void fpregs_restore_userregs(void) * The FPU context is only stored/restored for a user task and * PF_KTHREAD is used to distinguish between kernel and user threads. */ -static inline void switch_fpu_prepare(struct task_struct *prev, int cpu) +static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - struct fpu *old_fpu = &prev->thread.fpu; - - if (static_cpu_has(X86_FEATURE_FPU) && !(prev->flags & PF_KTHREAD)) { + if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) { save_fpregs_to_fpstate(old_fpu); /* * The save operation preserved register state, so the @@ -522,7 +520,7 @@ static inline void switch_fpu_prepare(struct task_struct *prev, int cpu) * Delay loading of the complete FPU state until the return to userland. * PKRU is handled separately. */ -static inline void switch_fpu_finish(struct task_struct *next) +static inline void switch_fpu_finish(struct fpu *new_fpu) { if (cpu_feature_enabled(X86_FEATURE_FPU)) set_thread_flag(TIF_NEED_FPU_LOAD); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4e1972cc7cae..6c7d90527156 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -161,12 +161,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; + struct fpu *prev_fpu = &prev->fpu; + struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_prepare(prev_p, cpu); + switch_fpu_prepare(prev_fpu, cpu); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -222,7 +224,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); - switch_fpu_finish(next_p); + switch_fpu_finish(next_fpu); /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 8855d69bab35..2f104408a2f8 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -561,13 +561,15 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; + struct fpu *prev_fpu = &prev->fpu; + struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(irq_count) != -1); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_prepare(prev_p, cpu); + switch_fpu_prepare(prev_fpu, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). @@ -621,7 +623,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); - switch_fpu_finish(next_p); + switch_fpu_finish(next_fpu); /* Reload sp0. 
*/ update_task_stack(next_p); -- Gitee From b3701216a193643fc9ca1bda5021364d3bf05b64 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:15:54 +0200 Subject: [PATCH 0722/2161] x86/fpu: Remove pointless argument from switch_fpu_finish() commit 9568bfb4f04bd9a280c592879ccd7a26a77c1390 upstream. Unused since the FPU switching rework. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.433135710@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 981026d35413..e1dacba2189b 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -520,7 +520,7 @@ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) * Delay loading of the complete FPU state until the return to userland. * PKRU is handled separately. */ -static inline void switch_fpu_finish(struct fpu *new_fpu) +static inline void switch_fpu_finish(void) { if (cpu_feature_enabled(X86_FEATURE_FPU)) set_thread_flag(TIF_NEED_FPU_LOAD); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 6c7d90527156..db44c898f22e 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -224,7 +224,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); - switch_fpu_finish(next_fpu); + switch_fpu_finish(); /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 2f104408a2f8..852656ce851a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -623,7 +623,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); - switch_fpu_finish(next_fpu); + switch_fpu_finish(); /* Reload sp0. */ update_task_stack(next_p); -- Gitee From dad17e956d4b0acf8375587ef3964aef5bef3a6b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:15:56 +0200 Subject: [PATCH 0723/2161] x86/fpu: Update stale comments commit d2d926482cdfbd5517826eca4e39dcd8757f04d3 upstream. copy_fpstate_to_sigframe() does not have a slow path anymore. Neither does the !ia32 restore in __fpu_restore_sig(). Update the comments accordingly. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.493570236@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 40f6e8e3f545..a40bb54c2d46 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -155,10 +155,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * buf == buf_fx for 64-bit frames and 32-bit fsave frame. * buf != buf_fx for 32-bit frames with fxstate. * - * Try to save it directly to the user frame with disabled page fault handler. - * If this fails then do the slow path where the FPU state is first saved to - * task's fpu->state and then copy it to the user frame pointed to by the - * aligned pointer 'buf_fx'. 
+ * Save it directly to the user frame with disabled page fault handler. If + * that faults, try to clear the frame which handles the page fault. * * If this is a 32-bit frame with fxstate, put a fsave header before * the aligned state at 'buf_fx'. @@ -334,12 +332,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, } if (likely(!ia32_fxstate)) { - /* - * Attempt to restore the FPU registers directly from user - * memory. For that to succeed, the user access cannot cause page - * faults. If it does, fall back to the slow path below, going - * through the kernel buffer with the enabled pagefault handler. - */ + /* Restore the FPU registers directly from user memory. */ return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, state_size); } -- Gitee From a675f20b6c8f495b19c4eb4307544dc265a17af3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:15:57 +0200 Subject: [PATCH 0724/2161] x86/pkru: Remove useless include commit b50854eca0e014c2d3738073b387ab8ec85118ab upstream. PKRU code does not need anything from FPU headers. Include cpufeature.h instead and fixup the resulting fallout in perf. This is a preparation for FPU changes in order to prevent recursive include hell. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.551522694@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/perf_event.h | 1 + arch/x86/include/asm/pkru.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 5648023e3a65..bac456f04c4b 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -14,6 +14,7 @@ #include +#include #include /* To enable MSR tracing please use the generic trace points. */ diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h index 4bb2a34ef498..f6c28480dba6 100644 --- a/arch/x86/include/asm/pkru.h +++ b/arch/x86/include/asm/pkru.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_PKRU_H #define _ASM_X86_PKRU_H -#include +#include #define PKRU_AD_BIT 0x1u #define PKRU_WD_BIT 0x2u -- Gitee From 6b596e985f24f6f81d618387e7e0ba22340efc1f Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 25 Mar 2022 11:38:51 +0800 Subject: [PATCH 0725/2161] stacktrace: Fix get_fs() implicit declaration commit opencloudos. Manually include . Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/stacktrace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index c9ea7eb2cb1a..7b3c0e801d29 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -13,6 +13,7 @@ #include #include #include +#include /** * stack_trace_print - Print the entries in the stack trace -- Gitee From b6757d7c01a74902a9db2dfaf1a9ddec7685f5a2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:15:59 +0200 Subject: [PATCH 0726/2161] x86/fpu: Restrict xsaves()/xrstors() to independent states commit f5daf836f292f795f9cf8f36e036bf47adcbc3a3 upstream. These interfaces are really only valid for features which are independently managed and not part of the task context state for various reasons. Tighten the checks and adjust the misleading comments. 
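[ minimal sketch of the tightened mask check described above, with illustrative bit positions: the requested feature mask must be non-empty and a pure subset of the independent-features mask, never mixed with task-state features. Not the kernel function itself. ]

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool validate_independent_components(uint64_t mask, uint64_t independent)
{
	/* non-empty and no bits outside the independent set */
	return mask && !(mask & ~independent);
}

int main(void)
{
	const uint64_t independent = 1ull << 15;	/* e.g. an LBR-style component */
	const uint64_t task_state  = 1ull << 2;		/* e.g. an AVX-style component */

	printf("independent only: %d\n",
	       validate_independent_components(independent, independent));
	printf("mixed with task state: %d\n",
	       validate_independent_components(independent | task_state, independent));
	return 0;
}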
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.608492174@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 33006fbb0351..8d19f2291df1 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1202,20 +1202,14 @@ int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, return copy_uabi_to_xstate(xsave, NULL, ubuf); } -static bool validate_xsaves_xrstors(u64 mask) +static bool validate_independent_components(u64 mask) { u64 xchk; if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) return false; - /* - * Validate that this is either a task->fpstate related component - * subset or an independent one. - */ - if (mask & xfeatures_mask_independent()) - xchk = ~xfeatures_mask_independent(); - else - xchk = ~xfeatures_mask_all; + + xchk = ~xfeatures_mask_independent(); if (WARN_ON_ONCE(!mask || mask & xchk)) return false; @@ -1233,14 +1227,13 @@ static bool validate_xsaves_xrstors(u64 mask) * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer * can #GP. * - * The feature mask must either be a subset of the independent features or - * a subset of the task->fpstate related features. + * The feature mask must be a subset of the independent features. */ void xsaves(struct xregs_state *xstate, u64 mask) { int err; - if (!validate_xsaves_xrstors(mask)) + if (!validate_independent_components(mask)) return; XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); @@ -1258,14 +1251,13 @@ void xsaves(struct xregs_state *xstate, u64 mask) * Proper usage is to restore the state which was saved with * xsaves() into @xstate. * - * The feature mask must either be a subset of the independent features or - * a subset of the task->fpstate related features. + * The feature mask must be a subset of the independent features. */ void xrstors(struct xregs_state *xstate, u64 mask) { int err; - if (!validate_xsaves_xrstors(mask)) + if (!validate_independent_components(mask)) return; XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); -- Gitee From 40626a4d692814d885bf7ca5dbe0c885d823e653 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Tue, 22 Sep 2020 06:56:38 +0900 Subject: [PATCH 0727/2161] x86/fpu: Handle FPU-related and clearcpuid command line arguments earlier commit 1ef5423a55c2ac6f1361811efe75b6e46d1023ed upstream. FPU initialization handles them currently. However, in the case of clearcpuid=, some other early initialization code may check for features before the FPU initialization code is called. Handling the argument earlier allows the command line to influence those early initializations. 
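[ hosted illustration of the clearcpuid= option-list walk that moves in this patch, with a made-up bit limit; it shows only the parsing loop, not the real setup_clear_cpu_cap() machinery. ]

#include <stdio.h>
#include <stdlib.h>

#define NCAPINTS 20	/* placeholder; the real count is kernel-version specific */

int main(void)
{
	char arg[] = "5,40,99999";	/* example clearcpuid= payload */
	char *p = arg;

	for (;;) {
		char *end;
		long bit = strtol(p, &end, 10);

		if (end == p)
			break;
		if (bit >= 0 && bit < NCAPINTS * 32)
			printf("clearing CPUID bit %ld\n", bit);
		else
			printf("ignoring out-of-range bit %ld\n", bit);
		if (*end != ',')
			break;
		p = end + 1;
	}
	return 0;
}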
Signed-off-by: Mike Hommey Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200921215638.37980-1-mh@glandium.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/common.c | 55 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/fpu/init.c | 55 ------------------------------------ 2 files changed, 55 insertions(+), 55 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4898078deede..3570f72622e6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -1239,6 +1240,59 @@ static void detect_nopl(void) #endif } +/* + * We parse cpu parameters early because fpu__init_system() is executed + * before parse_early_param(). + */ +static void __init cpu_parse_early_param(void) +{ + char arg[128]; + char *argptr = arg; + int arglen, res, bit; + +#ifdef CONFIG_X86_32 + if (cmdline_find_option_bool(boot_command_line, "no387")) +#ifdef CONFIG_MATH_EMULATION + setup_clear_cpu_cap(X86_FEATURE_FPU); +#else + pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); +#endif + + if (cmdline_find_option_bool(boot_command_line, "nofxsr")) + setup_clear_cpu_cap(X86_FEATURE_FXSR); +#endif + + if (cmdline_find_option_bool(boot_command_line, "noxsave")) + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + + if (cmdline_find_option_bool(boot_command_line, "noxsaves")) + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); + if (arglen <= 0) + return; + + pr_info("Clearing CPUID bits:"); + do { + res = get_option(&argptr, &bit); + if (res == 0 || res == 3) + break; + + /* If the argument was too long, the last bit may be cut off */ + if (res == 1 && arglen >= sizeof(arg)) + break; + + if (bit >= 0 && bit < NCAPINTS * 32) { + pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + setup_clear_cpu_cap(bit); + } + } while (res == 2); + pr_cont("\n"); +} + /* * Do minimum CPU detection early. * Fields really needed: vendor, cpuid_level, family, model, mask, @@ -1274,6 +1328,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) get_cpu_cap(c); get_cpu_address_sizes(c); setup_force_cpu_cap(X86_FEATURE_CPUID); + cpu_parse_early_param(); if (this_cpu->c_early_init) this_cpu->c_early_init(c); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index d708b6a1c077..64e29927cc32 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include @@ -226,66 +225,12 @@ static void __init fpu__init_system_ctx_switch(void) on_boot_cpu = 0; } -/* - * We parse fpu parameters early because fpu__init_system() is executed - * before parse_early_param(). 
- */ -static void __init fpu__init_parse_early_param(void) -{ - char arg[128]; - char *argptr = arg; - int arglen, res, bit; - -#ifdef CONFIG_X86_32 - if (cmdline_find_option_bool(boot_command_line, "no387")) -#ifdef CONFIG_MATH_EMULATION - setup_clear_cpu_cap(X86_FEATURE_FPU); -#else - pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); -#endif - - if (cmdline_find_option_bool(boot_command_line, "nofxsr")) - setup_clear_cpu_cap(X86_FEATURE_FXSR); -#endif - - if (cmdline_find_option_bool(boot_command_line, "noxsave")) - setup_clear_cpu_cap(X86_FEATURE_XSAVE); - - if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - - if (cmdline_find_option_bool(boot_command_line, "noxsaves")) - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - - arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); - if (arglen <= 0) - return; - - pr_info("Clearing CPUID bits:"); - do { - res = get_option(&argptr, &bit); - if (res == 0 || res == 3) - break; - - /* If the argument was too long, the last bit may be cut off */ - if (res == 1 && arglen >= sizeof(arg)) - break; - - if (bit >= 0 && bit < NCAPINTS * 32) { - pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); - setup_clear_cpu_cap(bit); - } - } while (res == 2); - pr_cont("\n"); -} - /* * Called on the boot CPU once per system bootup, to set up the initial * FPU state that is later cloned into all processes: */ void __init fpu__init_system(struct cpuinfo_x86 *c) { - fpu__init_parse_early_param(); fpu__init_system_early_generic(c); /* -- Gitee From e96c2c313e02a6fa081e8ea9c0e3e2a1c6dc4867 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:01 +0200 Subject: [PATCH 0728/2161] x86/fpu: Cleanup the on_boot_cpu clutter commit dc2f39fd1bf23eee644d409b84e8e435606997bf upstream. Defensive programming is useful, but this on_boot_cpu debug is really silly. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.665080855@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/init.c | 16 ---------------- arch/x86/kernel/fpu/xstate.c | 9 --------- 2 files changed, 25 deletions(-) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 64e29927cc32..86bc9759fc8b 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -192,11 +192,6 @@ static void __init fpu__init_task_struct_size(void) */ static void __init fpu__init_system_xstate_size_legacy(void) { - static int on_boot_cpu __initdata = 1; - - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; - /* * Note that xstate sizes might be overwritten later during * fpu__init_system_xstate(). @@ -216,15 +211,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) fpu_user_xstate_size = fpu_kernel_xstate_size; } -/* Legacy code to initialize eager fpu mode. 
*/ -static void __init fpu__init_system_ctx_switch(void) -{ - static bool on_boot_cpu __initdata = 1; - - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; -} - /* * Called on the boot CPU once per system bootup, to set up the initial * FPU state that is later cloned into all processes: @@ -243,6 +229,4 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate(); fpu__init_task_struct_size(); - - fpu__init_system_ctx_switch(); } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 8d19f2291df1..b2b320a04857 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -378,13 +378,8 @@ static void __init print_xstate_offset_size(void) */ static void __init setup_init_fpu_buf(void) { - static int on_boot_cpu __initdata = 1; - BUILD_BUG_ON(XFEATURE_MASK_USER_SUPPORTED != XFEATURES_INIT_FPSTATE_HANDLED); - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; - if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; @@ -718,14 +713,10 @@ static void fpu__init_disable_system_xstate(void) void __init fpu__init_system_xstate(void) { unsigned int eax, ebx, ecx, edx; - static int on_boot_cpu __initdata = 1; u64 xfeatures; int err; int i; - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; - if (!boot_cpu_has(X86_FEATURE_FPU)) { pr_info("x86/fpu: No FPU detected\n"); return; -- Gitee From 17ab2b6aae566f1b235685133e6932ff1f970045 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:02 +0200 Subject: [PATCH 0729/2161] x86/fpu: Remove pointless memset in fpu_clone() commit 01f9f62d3ae75077a54a11d2777082f1e58e2d9f upstream. Zeroing the forked task's FPU registers buffer to avoid leaking init optimized stale data into the clone is a pointless exercise for the case where the current task has TIF_NEED_FPU_LOAD set. In that case, the FPU registers state is copied from current's FPU register buffer which can contain stale init optimized data as well. The alledged information leak is non-existant because this stale init optimized data is used nowhere and cannot leak anywhere. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.722854569@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/core.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 7ada7bd03a32..191269edac97 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -259,12 +259,6 @@ int fpu_clone(struct task_struct *dst) if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; - /* - * Don't let 'init optimized' areas of the XSAVE area - * leak into the child task: - */ - memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); - /* * If the FPU registers are not owned by current just memcpy() the * state. Otherwise save the FPU registers directly into the -- Gitee From c975c20387514548d4a5e6d3024ca90503e0ac57 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 21 Dec 2021 17:40:11 +0800 Subject: [PATCH 0730/2161] x86/process: Clone FPU in copy_thread() commit 2d16a1876f20218f8970ea4b7f679cead1cdb510 upstream. There is no reason to clone FPU in arch_dup_task_struct(). Quite the contrary - it prevents optimizations. Move it to copy_thread(). 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.780714235@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/alpha/kernel/process.c | 2 ++ arch/x86/kernel/process.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 8b1524b7af77..5b514913a655 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -251,6 +251,8 @@ copy_thread(unsigned long clone_flags, unsigned long usp, childti->pcb.ksp = (unsigned long) childstack; childti->pcb.flags = 1; /* set FEN, clear everything else */ + fpu_clone(p); + if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ p->thread.pkru = pkru_get_init_value(); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 90d8ef86e2c3..be9adbedfc8d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -100,7 +100,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif - return fpu_clone(dst); + return 0; } /* @@ -212,6 +212,8 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, frame->flags = X86_EFLAGS_FIXED; #endif + fpu_clone(p); + /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { memset(childregs, 0, sizeof(struct pt_regs)); -- Gitee From d94a01bf36da78ebebf412283066605b2a668744 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:06 +0200 Subject: [PATCH 0731/2161] x86/fpu: Do not inherit FPU context for kernel and IO worker threads commit 509e7a30cd0a9f38abac4114832d9f69ff0d73b4 upstream. There is no reason why kernel and IO worker threads need a full clone of the parent's FPU state. Both are kernel threads which are not supposed to use FPU. So copying a large state or doing XSAVE() is pointless. Just clean out the minimally required state for those tasks. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.839822981@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/core.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 191269edac97..edc253b57adc 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -212,6 +212,15 @@ static inline void fpstate_init_xstate(struct xregs_state *xsave) xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all; } +static inline unsigned int init_fpstate_copy_size(void) +{ + if (!use_xsave()) + return fpu_kernel_xstate_size; + + /* XSAVE(S) just needs the legacy and the xstate header part */ + return sizeof(init_fpstate.xsave); +} + static inline void fpstate_init_fxstate(struct fxregs_state *fx) { fx->cwd = 0x37f; @@ -259,6 +268,23 @@ int fpu_clone(struct task_struct *dst) if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; + /* + * Enforce reload for user space tasks and prevent kernel threads + * from trying to save the FPU registers on context switch. + */ + set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); + + /* + * No FPU state inheritance for kernel threads and IO + * worker threads. + */ + if (dst->flags & (PF_KTHREAD)) { + /* Clear out the minimal state */ + memcpy(&dst_fpu->state, &init_fpstate, + init_fpstate_copy_size()); + return 0; + } + /* * If the FPU registers are not owned by current just memcpy() the * state. 
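[ userspace sketch of the clone decision described above, with a simplified state buffer: kernel/IO worker threads only get the constant init state copied, user tasks inherit the parent's register image. PF_KTHREAD is the real flag value; every other name here is made up. ]

#include <stdio.h>
#include <string.h>

#define PF_KTHREAD 0x00200000	/* kernel thread flag, as in the kernel */

struct fpu_sketch { unsigned char regs[64]; };	/* stand-in for the xstate buffer */

static const struct fpu_sketch init_fpstate_sketch;	/* zeroed "init" state */

static void fpu_clone_sketch(struct fpu_sketch *dst, const struct fpu_sketch *src,
			     unsigned int child_flags)
{
	if (child_flags & PF_KTHREAD) {
		/* kernel/IO worker: minimal init state, no save of the parent */
		*dst = init_fpstate_sketch;
		return;
	}
	/* user task: inherit the parent's registers */
	*dst = *src;
}

int main(void)
{
	struct fpu_sketch parent = { .regs = { 0xAA } }, child;

	fpu_clone_sketch(&child, &parent, PF_KTHREAD);
	printf("kthread child: %#x\n", (unsigned int)child.regs[0]);
	fpu_clone_sketch(&child, &parent, 0);
	printf("user child:    %#x\n", (unsigned int)child.regs[0]);
	return 0;
}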
Otherwise save the FPU registers directly into the @@ -272,8 +298,6 @@ int fpu_clone(struct task_struct *dst) save_fpregs_to_fpstate(dst_fpu); fpregs_unlock(); - set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); - trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); @@ -322,15 +346,6 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) pkru_write_default(); } -static inline unsigned int init_fpstate_copy_size(void) -{ - if (!use_xsave()) - return fpu_kernel_xstate_size; - - /* XSAVE(S) just needs the legacy and the xstate header part */ - return sizeof(init_fpstate.xsave); -} - /* * Reset current->fpu memory state to the init values. */ -- Gitee From c90cebae0d57400c1979534eccf749040222b098 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 22 Dec 2021 19:51:35 +0800 Subject: [PATCH 0732/2161] KVM: SVM: Guest FPU state save/restore not needed for SEV-ES guest commit ed02b213098a90c2a415a0da18f05841f8cf0a81 upstream. The guest FPU state is automatically restored on VMRUN and saved on VMEXIT by the hardware, so there is no reason to do this in KVM. Eliminate the allocation of the guest_fpu save area and key off that to skip operations related to the guest FPU state. Signed-off-by: Tom Lendacky Message-Id: <173e429b4d0d962c6a443c4553ffdaf31b7665a4.1607620209.git.thomas.lendacky@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/vmx/vmx.c | 4 +-- arch/x86/kvm/x86.c | 52 ++++++++++++++++++++++++++++----- 3 files changed, 48 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b90f9047541d..c42af654b8cb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1382,6 +1382,8 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); +void kvm_free_guest_fpu(struct kvm_vcpu *vcpu); + int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6565a26f7214..053cd25ccf69 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6682,7 +6682,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); + kvm_free_guest_fpu(&vmx->vcpu); kmem_cache_free(kvm_vcpu_cache, vmx); } @@ -6814,7 +6814,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: free_vpid(vmx->vpid); - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); + kvm_free_guest_fpu(&vmx->vcpu); free_user_fpu: kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); free_partial_vcpu: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 68d58541307b..466e850c6ad8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4108,6 +4108,9 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { + if (!vcpu->arch.guest_fpu) + return; + if (boot_cpu_has(X86_FEATURE_XSAVE)) { memset(guest_xsave, 0, sizeof(struct kvm_xsave)); fill_xsave((u8 *) guest_xsave->region, vcpu); @@ -4125,9 
+4128,14 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - u64 xstate_bv = - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; + u64 xstate_bv; + u32 mxcsr; + + if (!vcpu->arch.guest_fpu) + return 0; + + xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; + mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; if (boot_cpu_has(X86_FEATURE_XSAVE)) { /* @@ -8622,9 +8630,14 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) kvm_save_current_fpu(vcpu->arch.user_fpu); - /* PKRU is separately restored in kvm_x86_ops->run. */ - __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state, - ~XFEATURE_MASK_PKRU); + /* + * Guests with protected state can't have it set by the hypervisor, + * so skip trying to set it. + */ + if (vcpu->arch.guest_fpu) + /* PKRU is separately restored in kvm_x86_ops->run. */ + __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state, + ~XFEATURE_MASK_PKRU); fpregs_mark_activate(); fpregs_unlock(); @@ -8637,7 +8650,12 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); - kvm_save_current_fpu(vcpu->arch.guest_fpu); + /* + * Guests with protected state can't have it read by the hypervisor, + * so skip trying to save it. + */ + if (vcpu->arch.guest_fpu) + kvm_save_current_fpu(vcpu->arch.guest_fpu); restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state); @@ -9137,6 +9155,9 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; + if (!vcpu->arch.guest_fpu) + return 0; + vcpu_load(vcpu); fxsave = &vcpu->arch.guest_fpu->state.fxsave; @@ -9157,6 +9178,9 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; + if (!vcpu->arch.guest_fpu) + return 0; + vcpu_load(vcpu); fxsave = &vcpu->arch.guest_fpu->state.fxsave; @@ -9215,6 +9239,9 @@ static int sync_regs(struct kvm_vcpu *vcpu) static void fx_init(struct kvm_vcpu *vcpu) { + if (!vcpu->arch.guest_fpu) + return; + fpstate_init(&vcpu->arch.guest_fpu->state); if (boot_cpu_has(X86_FEATURE_XSAVES)) vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv = @@ -9228,6 +9255,15 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } +void kvm_free_guest_fpu(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.guest_fpu) { + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); + vcpu->arch.guest_fpu = NULL; + } +} +EXPORT_SYMBOL_GPL(kvm_free_guest_fpu); + void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; @@ -9335,7 +9371,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - if (kvm_mpx_supported()) { + if (vcpu->arch.guest_fpu && kvm_mpx_supported()) { void *mpx_state_buffer; /* -- Gitee From bdcc74e6d7e2789277023e95de9fc71115b76248 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 22 Dec 2021 20:14:52 +0800 Subject: [PATCH 0733/2161] x86/fpu: Cleanup xstate xcomp_bv initialization commit 126fe0401883598b45b34dbbd5e0d7d8a0aefa21 upstream. No point in having this duplicated all over the place with needlessly different defines. Provide a proper initialization function which initializes user buffers properly and make KVM use it. 
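With the helper in place, a caller such as KVM's vCPU init (see the x86.c hunk below) sets up its user-visible FPU buffers without open-coding XCOMP_BV. In sketch form:

  /* Sketch: buffer setup is delegated to the FPU core; the caller passes
   * an already zeroed buffer (the memset is gone from the helper). */
  fpu_init_fpstate_user(vcpu->arch.user_fpu);
  fpu_init_fpstate_user(vcpu->arch.guest_fpu);

  /* Internally this initializes the legacy FP/SSE defaults and, on
   * XSAVES-capable systems, sets
   *   xcomp_bv = xfeatures_mask_uabi() | XCOMP_BV_COMPACTED_FORMAT; */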
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.897664678@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 4 +++- arch/x86/kernel/fpu/core.c | 35 ++++++++++++++++------------- arch/x86/kernel/fpu/init.c | 6 ++--- arch/x86/kernel/fpu/xstate.c | 8 +++---- arch/x86/kernel/fpu/xstate.h | 18 +++++++++++++++ arch/x86/kvm/x86.c | 10 ++------- 6 files changed, 48 insertions(+), 33 deletions(-) create mode 100644 arch/x86/kernel/fpu/xstate.h diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index e1dacba2189b..df1ad0117aff 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -79,7 +79,9 @@ static __always_inline __pure bool use_fxsr(void) extern union fpregs_state init_fpstate; -extern void fpstate_init(union fpregs_state *state); +extern void fpstate_init_user(union fpregs_state *state); +extern void fpu_init_fpstate_user(struct fpu *fpu); + #ifdef CONFIG_MATH_EMULATION extern void fpstate_init_soft(struct swregs_state *soft); #else diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index edc253b57adc..5d6c4a5bbfaf 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -16,6 +16,8 @@ #include #include +#include "xstate.h" + #define CREATE_TRACE_POINTS #include @@ -203,15 +205,6 @@ void fpu_sync_fpstate(struct fpu *fpu) fpregs_unlock(); } -static inline void fpstate_init_xstate(struct xregs_state *xsave) -{ - /* - * XRSTORS requires these bits set in xcomp_bv, or it will - * trigger #GP: - */ - xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all; -} - static inline unsigned int init_fpstate_copy_size(void) { if (!use_xsave()) @@ -238,23 +231,33 @@ static inline void fpstate_init_fstate(struct fregs_state *fp) fp->fos = 0xffff0000u; } -void fpstate_init(union fpregs_state *state) +/* + * Used in two places: + * 1) Early boot to setup init_fpstate for non XSAVE systems + * 2) fpu_init_fpstate_user() which is invoked from KVM + */ +void fpstate_init_user(union fpregs_state *state) { - if (!static_cpu_has(X86_FEATURE_FPU)) { + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { fpstate_init_soft(&state->soft); return; } - memset(state, 0, fpu_kernel_xstate_size); + xstate_init_xcomp_bv(&state->xsave, xfeatures_mask_uabi()); - if (static_cpu_has(X86_FEATURE_XSAVES)) - fpstate_init_xstate(&state->xsave); - if (static_cpu_has(X86_FEATURE_FXSR)) + if (cpu_feature_enabled(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); else fpstate_init_fstate(&state->fsave); } -EXPORT_SYMBOL_GPL(fpstate_init); + +#if IS_ENABLED(CONFIG_KVM) +void fpu_init_fpstate_user(struct fpu *fpu) +{ + fpstate_init_user(&fpu->state); +} +EXPORT_SYMBOL_GPL(fpu_init_fpstate_user); +#endif /* Clone current's FPU state on fork */ int fpu_clone(struct task_struct *dst) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 86bc9759fc8b..37f872630a0e 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -121,10 +121,10 @@ static void __init fpu__init_system_mxcsr(void) static void __init fpu__init_system_generic(void) { /* - * Set up the legacy init FPU context. (xstate init might overwrite this - * with a more modern format, if the CPU supports it.) + * Set up the legacy init FPU context. Will be updated when the + * CPU supports XSAVE[S]. 
*/ - fpstate_init(&init_fpstate); + fpstate_init_user(&init_fpstate); fpu__init_system_mxcsr(); } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b2b320a04857..0a2497ad9d9d 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -15,10 +15,10 @@ #include #include #include -#include #include -#include + +#include "xstate.h" /* * Although we spell it out in here, the Processor Trace @@ -386,9 +386,7 @@ static void __init setup_init_fpu_buf(void) setup_xstate_features(); print_xstate_features(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | - xfeatures_mask_all; + xstate_init_xcomp_bv(&init_fpstate.xsave, xfeatures_mask_all); /* * Init all the features state with header.xfeatures being 0x0 diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h new file mode 100644 index 000000000000..0789a04ee705 --- /dev/null +++ b/arch/x86/kernel/fpu/xstate.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_XSTATE_H +#define __X86_KERNEL_FPU_XSTATE_H + +#include +#include + +static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) +{ + /* + * XRSTORS requires these bits set in xcomp_bv, or it will + * trigger #GP: + */ + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; +} + +#endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 466e850c6ad8..b31b00f394d0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9239,14 +9239,6 @@ static int sync_regs(struct kvm_vcpu *vcpu) static void fx_init(struct kvm_vcpu *vcpu) { - if (!vcpu->arch.guest_fpu) - return; - - fpstate_init(&vcpu->arch.guest_fpu->state); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv = - host_xcr0 | XSTATE_COMPACTION_ENABLED; - /* * Ensure guest xcr0 is valid for loading */ @@ -9616,6 +9608,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_mce_banks; } + fpu_init_fpstate_user(vcpu->arch.user_fpu); + fpu_init_fpstate_user(vcpu->arch.guest_fpu); fx_init(vcpu); vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; -- Gitee From 81dc9d625d4eee3d1d3aabb55ccfc97ffbc2f339 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:09 +0200 Subject: [PATCH 0734/2161] x86/fpu/xstate: Provide and use for_each_xfeature() commit ffd3e504c9e0de8b85755f3c7eabbbdd984cfeed upstream. These loops evaluating xfeature bits are really hard to read. Create an iterator and use for_each_set_bit_from() inside which already does the right thing. No functional changes. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.958107505@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 56 +++++++++++++++--------------------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0a2497ad9d9d..c438f048a53a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -4,6 +4,7 @@ * * Author: Suresh Siddha */ +#include #include #include #include @@ -20,6 +21,10 @@ #include "xstate.h" +#define for_each_extended_xfeature(bit, mask) \ + (bit) = FIRST_EXTENDED_XFEATURE; \ + for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) + /* * Although we spell it out in here, the Processor Trace * xfeature is completely unused. 
We use other mechanisms @@ -184,10 +189,7 @@ static void __init setup_xstate_features(void) xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - + for_each_extended_xfeature(i, xfeatures_mask_all) { cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); xstate_sizes[i] = eax; @@ -291,20 +293,15 @@ static void __init setup_xstate_comp_offsets(void) xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state, xmm_space); - if (!boot_cpu_has(X86_FEATURE_XSAVES)) { - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (xfeature_enabled(i)) - xstate_comp_offsets[i] = xstate_offsets[i]; - } + if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) { + for_each_extended_xfeature(i, xfeatures_mask_all) + xstate_comp_offsets[i] = xstate_offsets[i]; return; } next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - + for_each_extended_xfeature(i, xfeatures_mask_all) { if (xfeature_is_aligned(i)) next_offset = ALIGN(next_offset, 64); @@ -328,8 +325,8 @@ static void __init setup_supervisor_only_offsets(void) next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i) || !xfeature_is_supervisor(i)) + for_each_extended_xfeature(i, xfeatures_mask_all) { + if (!xfeature_is_supervisor(i)) continue; if (xfeature_is_aligned(i)) @@ -347,9 +344,7 @@ static void __init print_xstate_offset_size(void) { int i; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; + for_each_extended_xfeature(i, xfeatures_mask_all) { pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, xstate_comp_offsets[i], i, xstate_sizes[i]); } @@ -551,10 +546,7 @@ static void do_extra_xstate_size_checks(void) int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - + for_each_extended_xfeature(i, xfeatures_mask_all) { check_xstate_against_struct(i); /* * Supervisor state components can be managed only by @@ -583,7 +575,6 @@ static void do_extra_xstate_size_checks(void) XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size); } - /* * Get total size of enabled xstates in XCR0 | IA32_XSS. * @@ -996,6 +987,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, struct xregs_state *xinit = &init_fpstate.xsave; struct xstate_header header; unsigned int zerofrom; + u64 mask; int i; memset(&header, 0, sizeof(header)); @@ -1049,17 +1041,15 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, zerofrom = offsetof(struct xregs_state, extended_state_area); - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - /* - * The ptrace buffer is in non-compacted XSAVE format. - * In non-compacted format disabled features still occupy - * state space, but there is no state to copy from in the - * compacted init_fpstate. The gap tracking will zero this - * later. - */ - if (!(xfeatures_mask_uabi() & BIT_ULL(i))) - continue; + /* + * The ptrace buffer is in non-compacted XSAVE format. In + * non-compacted format disabled features still occupy state space, + * but there is no state to copy from in the compacted + * init_fpstate. The gap tracking will zero these states. 
+ */ + mask = xfeatures_mask_uabi(); + for_each_extended_xfeature(i, mask) { /* * If there was a feature or alignment gap, zero the space * in the destination buffer. -- Gitee From 2a4c5b313bfa1bc145ab00d1a0be961c62fdcd1e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:10 +0200 Subject: [PATCH 0735/2161] x86/fpu/xstate: Mark all init only functions __init commit 63cf05a19a5d3fb6e66b5f7ceb76e77dfc2695f2 upstream. No point to keep them around after boot. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.017919252@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c438f048a53a..14085b2d2295 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -459,7 +459,7 @@ static int validate_user_xstate_header(const struct xstate_header *hdr) return 0; } -static void __xstate_dump_leaves(void) +static void __init __xstate_dump_leaves(void) { int i; u32 eax, ebx, ecx, edx; @@ -499,7 +499,7 @@ static void __xstate_dump_leaves(void) * that our software representation matches what the CPU * tells us about the state's size. */ -static void check_xstate_against_struct(int nr) +static void __init check_xstate_against_struct(int nr) { /* * Ask the CPU for the size of the state. @@ -541,7 +541,7 @@ static void check_xstate_against_struct(int nr) * covered by these checks. Only the size of the buffer for task->fpu * is checked here. */ -static void do_extra_xstate_size_checks(void) +static void __init do_extra_xstate_size_checks(void) { int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; @@ -643,7 +643,7 @@ static unsigned int __init get_xsave_size(void) * Will the runtime-enumerated 'xstate_size' fit in the init * task's statically-allocated buffer? */ -static bool is_supported_xstate_size(unsigned int test_xstate_size) +static bool __init is_supported_xstate_size(unsigned int test_xstate_size) { if (test_xstate_size <= sizeof(union fpregs_state)) return true; @@ -688,7 +688,7 @@ static int __init init_xstate_size(void) * We enabled the XSAVE hardware, but something went wrong and * we can not use it. Disable it. */ -static void fpu__init_disable_system_xstate(void) +static void __init fpu__init_disable_system_xstate(void) { xfeatures_mask_all = 0; cr4_clear_bits(X86_CR4_OSXSAVE); -- Gitee From 11e796b91ded71499328b00dc83120b140ffd322 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:12 +0200 Subject: [PATCH 0736/2161] x86/fpu: Move KVMs FPU swapping to FPU core commit a0ff0611c2fbde94f6c9db8351939b08f2cb6797 upstream. Swapping the host/guest FPU is directly fiddling with FPU internals which requires 5 exports. The upcoming support of dynamically enabled states would even need more. Implement a swap function in the FPU core code and export that instead. 
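Usage then collapses to one call per direction, as the KVM hunks below show; in sketch form:

  /* Enter the guest: save host (user) FPU state, load guest state.
   * PKRU is excluded because it is restored separately around vcpu run. */
  fpu_swap_kvm_fpu(vcpu->arch.user_fpu, vcpu->arch.guest_fpu,
                   ~XFEATURE_MASK_PKRU);

  /* Leave the guest: save guest state, restore host (user) state. */
  fpu_swap_kvm_fpu(vcpu->arch.guest_fpu, vcpu->arch.user_fpu, ~0ULL);

  /* A NULL save or restore pointer (SEV-ES protected guest state)
   * simply skips that side of the swap. */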
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Paolo Bonzini Cc: kvm@vger.kernel.org Link: https://lkml.kernel.org/r/20211015011539.076072399@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 8 +++++ arch/x86/include/asm/fpu/internal.h | 15 ++------- arch/x86/kernel/fpu/core.c | 30 ++++++++++++++--- arch/x86/kernel/fpu/init.c | 1 - arch/x86/kernel/fpu/xstate.c | 1 - arch/x86/kvm/x86.c | 51 +++++++---------------------- arch/x86/mm/extable.c | 2 +- 7 files changed, 48 insertions(+), 60 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 8b9bfaad6e66..2928de7c5dc2 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -12,6 +12,8 @@ #define _ASM_X86_FPU_API_H #include +#include + /* * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It * disables preemption so be careful if you intend to use it for long periods @@ -81,4 +83,10 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); static inline void update_pasid(void) { } +/* fpstate-related functions which are exported to KVM */ +extern void fpu_init_fpstate_user(struct fpu *fpu); + +/* KVM specific functions */ +extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask); + #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index df1ad0117aff..ddea882f05f7 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -73,14 +73,8 @@ static __always_inline __pure bool use_fxsr(void) return static_cpu_has(X86_FEATURE_FXSR); } -/* - * fpstate handling functions: - */ - extern union fpregs_state init_fpstate; - extern void fpstate_init_user(union fpregs_state *state); -extern void fpu_init_fpstate_user(struct fpu *fpu); #ifdef CONFIG_MATH_EMULATION extern void fpstate_init_soft(struct swregs_state *soft); @@ -380,12 +374,7 @@ static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) return err; } -extern void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); - -static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate) -{ - __restore_fpregs_from_fpstate(fpstate, xfeatures_mask_fpstate()); -} +extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); @@ -466,7 +455,7 @@ static inline void fpregs_restore_userregs(void) */ mask = xfeatures_mask_restore_user() | xfeatures_mask_supervisor(); - __restore_fpregs_from_fpstate(&fpu->state, mask); + restore_fpregs_from_fpstate(&fpu->state, mask); fpregs_activate(fpu); fpu->last_cpu = cpu; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 5d6c4a5bbfaf..ebaa9bedeacb 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -124,9 +124,8 @@ void save_fpregs_to_fpstate(struct fpu *fpu) asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); frstor(&fpu->state.fsave); } -EXPORT_SYMBOL(save_fpregs_to_fpstate); -void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) +void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) { /* * AMD K7/K8 and later CPUs up to Zen don't save/restore @@ -151,7 +150,31 @@ void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) frstor(&fpstate->fsave); } } 
-EXPORT_SYMBOL_GPL(__restore_fpregs_from_fpstate); + +#if IS_ENABLED(CONFIG_KVM) +void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) +{ + fpregs_lock(); + + if (save) { + if (test_thread_flag(TIF_NEED_FPU_LOAD)) { + memcpy(&save->state, ¤t->thread.fpu.state, + fpu_kernel_xstate_size); + } else { + save_fpregs_to_fpstate(save); + } + } + + if (rstor) { + restore_mask &= xfeatures_mask_fpstate(); + restore_fpregs_from_fpstate(&rstor->state, restore_mask); + } + + fpregs_mark_activate(); + fpregs_unlock(); +} +EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpu); +#endif void kernel_fpu_begin_mask(unsigned int kfpu_mask) { @@ -457,7 +480,6 @@ void fpregs_mark_activate(void) fpu->last_cpu = smp_processor_id(); clear_thread_flag(TIF_NEED_FPU_LOAD); } -EXPORT_SYMBOL_GPL(fpregs_mark_activate); /* * x87 math exception handling: diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 37f872630a0e..545c91c723b8 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -136,7 +136,6 @@ static void __init fpu__init_system_generic(void) * components into a single, continuous memory block: */ unsigned int fpu_kernel_xstate_size __ro_after_init; -EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size); /* Get alignment of the TYPE. */ #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 14085b2d2295..7c2b42f6dddf 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -65,7 +65,6 @@ static short xsave_cpuid_features[] __initdata = { * XSAVE buffer, both supervisor and user xstates. */ u64 xfeatures_mask_all __ro_after_init; -EXPORT_SYMBOL_GPL(xfeatures_mask_all); static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b31b00f394d0..52dd554f41b6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -62,7 +62,9 @@ #include #include #include -#include /* Ugh! */ +#include +#include +#include #include #include #include @@ -8610,58 +8612,27 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return 0; } -static void kvm_save_current_fpu(struct fpu *fpu) -{ - /* - * If the target FPU state is not resident in the CPU registers, just - * memcpy() from current, else save CPU state directly to the target. - */ - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - memcpy(&fpu->state, ¤t->thread.fpu.state, - fpu_kernel_xstate_size); - else - save_fpregs_to_fpstate(fpu); -} - /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - fpregs_lock(); - - kvm_save_current_fpu(vcpu->arch.user_fpu); - /* - * Guests with protected state can't have it set by the hypervisor, - * so skip trying to set it. + * Guests with protected state have guest_fpu == NULL which makes + * the swap only save the host state. Exclude PKRU from restore as + * it is restored separately in kvm_x86_ops->run(). */ - if (vcpu->arch.guest_fpu) - /* PKRU is separately restored in kvm_x86_ops->run. */ - __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state, - ~XFEATURE_MASK_PKRU); - - fpregs_mark_activate(); - fpregs_unlock(); - + fpu_swap_kvm_fpu(vcpu->arch.user_fpu, vcpu->arch.guest_fpu, + ~XFEATURE_MASK_PKRU); trace_kvm_fpu(1); } /* When vcpu_run ends, restore user space FPU context. 
*/ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - fpregs_lock(); - /* - * Guests with protected state can't have it read by the hypervisor, - * so skip trying to save it. + * Guests with protected state have guest_fpu == NULL which makes + * swap only restore the host state. */ - if (vcpu->arch.guest_fpu) - kvm_save_current_fpu(vcpu->arch.guest_fpu); - - restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state); - - fpregs_mark_activate(); - fpregs_unlock(); - + fpu_swap_kvm_fpu(vcpu->arch.guest_fpu, vcpu->arch.user_fpu, ~0ULL); ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); } diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 1ec148e7f394..23ba9df477bd 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -95,7 +95,7 @@ static bool ex_handler_fprestore(const struct exception_table_entry *fixup, WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", (void *)instruction_pointer(regs)); - __restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); + restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); return true; } -- Gitee From 7e10db6e26691e397870036b49c223288c06bd4e Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 22 Dec 2021 21:13:00 +0800 Subject: [PATCH 0737/2161] x86/fpu: Replace KVMs home brewed FPU copy from user commit ea4d6938d4c0761672ff6237964a20db3cb95cc1 upstream. Copying a user space buffer to the memory buffer is already available in the FPU core. The copy mechanism in KVM lacks sanity checks and needs to use cpuid() to lookup the offset of each component, while the FPU core has this information cached. Make the FPU core variant accessible for KVM and replace the home brewed mechanism. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: kvm@vger.kernel.org Link: https://lkml.kernel.org/r/20211015011539.134065207@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 2 + arch/x86/kernel/fpu/core.c | 38 ++++++++++++++++- arch/x86/kernel/fpu/xstate.c | 3 +- arch/x86/kvm/x86.c | 75 ++-------------------------------- 4 files changed, 43 insertions(+), 75 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 2928de7c5dc2..79073383a9bc 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -89,4 +89,6 @@ extern void fpu_init_fpstate_user(struct fpu *fpu); /* KVM specific functions */ extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask); +extern int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, u32 *pkru); + #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index ebaa9bedeacb..9e83ee3a2ae9 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -174,7 +174,43 @@ void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) fpregs_unlock(); } EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpu); -#endif + +int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, + u32 *vpkru) +{ + union fpregs_state *kstate = &fpu->state; + const union fpregs_state *ustate = buf; + struct pkru_state *xpkru; + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { + if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE) + return -EINVAL; + if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask) + return -EINVAL; + memcpy(&kstate->fxsave, &ustate->fxsave, sizeof(ustate->fxsave)); + return 0; + } + + if (ustate->xsave.header.xfeatures & ~xcr0) 
+ return -EINVAL; + + ret = copy_uabi_from_kernel_to_xstate(&kstate->xsave, ustate); + if (ret) + return ret; + + /* Retrieve PKRU if not in init state */ + if (kstate->xsave.header.xfeatures & XFEATURE_MASK_PKRU) { + xpkru = get_xsave_addr(&kstate->xsave, XFEATURE_PKRU); + *vpkru = xpkru->pkru; + } + + /* Ensure that XCOMP_BV is set up for XSAVES */ + xstate_init_xcomp_bv(&kstate->xsave, xfeatures_mask_uabi()); + return 0; +} +EXPORT_SYMBOL_GPL(fpu_copy_kvm_uabi_to_fpstate); +#endif /* CONFIG_KVM */ void kernel_fpu_begin_mask(unsigned int kfpu_mask) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 7c2b42f6dddf..5b4419739e2e 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1161,8 +1161,7 @@ static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, /* * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] - * format and copy to the target thread. This is called from - * xstateregs_set(). + * format and copy to the target thread. Used by ptrace and KVM. */ int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 52dd554f41b6..1272911f4141 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4018,8 +4018,6 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } -#define XSTATE_COMPACTION_ENABLED (1ULL << 63) - static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) { struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; @@ -4063,50 +4061,6 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) } } -static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) -{ - struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; - u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); - u64 valid; - - /* - * Copy legacy XSAVE area, to avoid complications with CPUID - * leaves 0 and 1 in the loop below. - */ - memcpy(xsave, src, XSAVE_HDR_OFFSET); - - /* Set XSTATE_BV and possibly XCOMP_BV. */ - xsave->header.xfeatures = xstate_bv; - if (boot_cpu_has(X86_FEATURE_XSAVES)) - xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; - - /* - * Copy each region from the non-compacted offset to the - * possibly compacted offset. - */ - valid = xstate_bv & ~XFEATURE_MASK_FPSSE; - while (valid) { - u32 size, offset, ecx, edx; - u64 xfeature_mask = valid & -valid; - int xfeature_nr = fls64(xfeature_mask) - 1; - - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - - if (xfeature_nr == XFEATURE_PKRU) { - memcpy(&vcpu->arch.pkru, src + offset, - sizeof(vcpu->arch.pkru)); - } else { - void *dest = get_xsave_addr(xsave, xfeature_nr); - - if (dest) - memcpy(dest, src + offset, size); - } - - valid -= xfeature_mask; - } -} - static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { @@ -4125,38 +4079,15 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, } } -#define XSAVE_MXCSR_OFFSET 24 - static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - u64 xstate_bv; - u32 mxcsr; - if (!vcpu->arch.guest_fpu) return 0; - xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; - - if (boot_cpu_has(X86_FEATURE_XSAVE)) { - /* - * Here we allow setting states that are not present in - * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility - * with old userspace. 
- */ - if (xstate_bv & ~kvm_supported_xcr0() || - mxcsr & ~mxcsr_feature_mask) - return -EINVAL; - load_xsave(vcpu, (u8 *)guest_xsave->region); - } else { - if (xstate_bv & ~XFEATURE_MASK_FPSSE || - mxcsr & ~mxcsr_feature_mask) - return -EINVAL; - memcpy(&vcpu->arch.guest_fpu->state.fxsave, - guest_xsave->region, sizeof(struct fxregs_state)); - } - return 0; + return fpu_copy_kvm_uabi_to_fpstate(vcpu->arch.guest_fpu, + guest_xsave->region, + supported_xcr0, &vcpu->arch.pkru); } static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, -- Gitee From 172dc4339394f3663a788e34b3e6342966deec08 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:15 +0200 Subject: [PATCH 0738/2161] x86/fpu: Rework copy_xstate_to_uabi_buf() commit ca834defd33bae9cf9542ff92b15635a84e91946 upstream. Prepare for replacing the KVM copy xstate to user function by extending copy_xstate_to_uabi_buf() with a pkru argument which allows the caller to hand in the pkru value, which is required for KVM because the guest PKRU is not accessible via current. Fixup all callsites accordingly. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.191902137@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 34 ++++++++++++++++++++++++++-------- arch/x86/kernel/fpu/xstate.h | 3 +++ 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 5b4419739e2e..f0ff047700f8 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -967,9 +967,10 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, } /** - * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer + * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor - * @tsk: The task from which to copy the saved xstate + * @xsave: The xsave from which to copy + * @pkru_val: The PKRU value to store in the PKRU component * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming @@ -978,11 +979,10 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, * * It supports partial copy but @to.pos always starts from zero. */ -void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, - enum xstate_copy_mode copy_mode) +void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, + u32 pkru_val, enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; struct xregs_state *xinit = &init_fpstate.xsave; struct xstate_header header; unsigned int zerofrom; @@ -1060,10 +1060,9 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, struct pkru_state pkru = {0}; /* * PKRU is not necessarily up to date in the - * thread's XSAVE buffer. Fill this part from the - * per-thread storage. + * XSAVE buffer. Use the provided value. 
*/ - pkru.pkru = tsk->thread.pkru; + pkru.pkru = pkru_val; membuf_write(&to, &pkru, sizeof(pkru)); } else { copy_feature(header.xfeatures & BIT_ULL(i), &to, @@ -1083,6 +1082,25 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, membuf_zero(&to, to.left); } +/** + * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer + * @to: membuf descriptor + * @tsk: The task from which to copy the saved xstate + * @copy_mode: The requested copy mode + * + * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming + * format, i.e. from the kernel internal hardware dependent storage format + * to the requested @mode. UABI XSTATE is always uncompacted! + * + * It supports partial copy but @to.pos always starts from zero. + */ +void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, + enum xstate_copy_mode copy_mode) +{ + __copy_xstate_to_uabi_buf(to, &tsk->thread.fpu.state.xsave, + tsk->thread.pkru, copy_mode); +} + static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, const void *kbuf, const void __user *ubuf) { diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 0789a04ee705..81f4202781ac 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -15,4 +15,7 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; } +extern void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, + u32 pkru_val, enum xstate_copy_mode copy_mode); + #endif -- Gitee From baf450a64bd48459817ad2527c18cd698b875c9f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:18 +0200 Subject: [PATCH 0739/2161] x86/fpu: Mark fpu__init_prepare_fx_sw_frame() as __init commit 9603445549dacd7688532a4076c377e43a3ecfce upstream. No need to keep it around. 
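The reason it can go away after boot: the definition now carries __init, so its text lives in .init.text and is discarded once initialization finishes. A minimal sketch of what that implies:

  /* Boot-only definition; sets up fx_sw_reserved for signal frames. */
  void __init fpu__init_prepare_fx_sw_frame(void)
  {
          /* ... fill in fx_sw_reserved / fx_sw_reserved_ia32 ... */
  }

  /* After free_initmem() this text is gone, so only other __init code
   * (the xstate init path, hence the new #include "internal.h" in
   * xstate.c) may call it. */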
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.296435736@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/signal.h | 2 -- arch/x86/kernel/fpu/internal.h | 8 ++++++++ arch/x86/kernel/fpu/signal.c | 4 +++- arch/x86/kernel/fpu/xstate.c | 1 + 4 files changed, 12 insertions(+), 3 deletions(-) create mode 100644 arch/x86/kernel/fpu/internal.h diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 8b6631dffefd..04868a76239a 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -31,6 +31,4 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long fpu__get_fpstate_size(void); -extern void fpu__init_prepare_fx_sw_frame(void); - #endif /* _ASM_X86_FPU_SIGNAL_H */ diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h new file mode 100644 index 000000000000..036f84c236dd --- /dev/null +++ b/arch/x86/kernel/fpu/internal.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_INTERNAL_H +#define __X86_KERNEL_FPU_INTERNAL_H + +/* Init functions */ +extern void fpu__init_prepare_fx_sw_frame(void); + +#endif diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index a40bb54c2d46..19358bd25dd3 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -16,6 +16,8 @@ #include #include +#include "internal.h" + static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init; static struct _fpx_sw_bytes fx_sw_reserved_ia32 __ro_after_init; @@ -514,7 +516,7 @@ unsigned long fpu__get_fpstate_size(void) * This will be saved when ever the FP and extended state context is * saved on the user stack during the signal handler delivery to the user. */ -void fpu__init_prepare_fx_sw_frame(void) +void __init fpu__init_prepare_fx_sw_frame(void) { int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index f0ff047700f8..d47c5c62a5f6 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -19,6 +19,7 @@ #include +#include "internal.h" #include "xstate.h" #define for_each_extended_xfeature(bit, mask) \ -- Gitee From 5f9c11b82c56031499f88fe81e230afcee082f5b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:20 +0200 Subject: [PATCH 0740/2161] x86/fpu: Move context switch and exit to user inlines into sched.h commit 63e81807c1f94e91b9d71c536112a40cd74bab85 upstream. internal.h is a kitchen sink which needs to get out of the way to prepare for the upcoming changes. Move the context switch and exit to user inlines into a separate header, which is all that code needs. 
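The only consumers are the context-switch paths in process*.c, which this patch touches only to swap the include. A condensed, approximate sketch of how __switch_to() uses the pair (attributes and unrelated state switching omitted):

  struct task_struct *__switch_to(struct task_struct *prev_p,
                                  struct task_struct *next_p)
  {
          struct fpu *prev_fpu = &prev_p->thread.fpu;
          int cpu = smp_processor_id();

          if (!test_thread_flag(TIF_NEED_FPU_LOAD))
                  switch_fpu_prepare(prev_fpu, cpu);  /* save outgoing FPU */

          /* ... stack, segment and per-CPU state switching ... */

          switch_fpu_finish();  /* actual restore deferred to exit-to-user */
          return prev_p;
  }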
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.349132461@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 60 ------------------------- arch/x86/include/asm/fpu/sched.h | 68 +++++++++++++++++++++++++++++ arch/x86/kernel/fpu/core.c | 1 + arch/x86/kernel/process.c | 2 +- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- 6 files changed, 72 insertions(+), 63 deletions(-) create mode 100644 arch/x86/include/asm/fpu/sched.h diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index ddea882f05f7..a789d2774bf6 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -26,16 +26,11 @@ * High level FPU state handling functions: */ extern bool fpu__restore_sig(void __user *buf, int ia32_frame); -extern void fpu__drop(struct fpu *fpu); extern void fpu__clear_user_states(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern void fpu_sync_fpstate(struct fpu *fpu); -/* Clone and exit operations */ -extern int fpu_clone(struct task_struct *dst); -extern void fpu_flush_thread(void); - /* * Boot time FPU initialization functions: */ @@ -81,7 +76,6 @@ extern void fpstate_init_soft(struct swregs_state *soft); #else static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif -extern void save_fpregs_to_fpstate(struct fpu *fpu); /* * Returns 0 on success or the trap number when the operation raises an @@ -463,60 +457,6 @@ static inline void fpregs_restore_userregs(void) clear_thread_flag(TIF_NEED_FPU_LOAD); } -/* - * FPU state switching for scheduling. - * - * This is a two-stage process: - * - * - switch_fpu_prepare() saves the old state. - * This is done within the context of the old process. - * - * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state - * will get loaded on return to userspace, or when the kernel needs it. - * - * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers - * are saved in the current thread's FPU register state. - * - * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not - * hold current()'s FPU registers. It is required to load the - * registers before returning to userland or using the content - * otherwise. - * - * The FPU context is only stored/restored for a user task and - * PF_KTHREAD is used to distinguish between kernel and user threads. - */ -static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) -{ - if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) { - save_fpregs_to_fpstate(old_fpu); - /* - * The save operation preserved register state, so the - * fpu_fpregs_owner_ctx is still @old_fpu. Store the - * current CPU number in @old_fpu, so the next return - * to user space can avoid the FPU register restore - * when is returns on the same CPU and still owns the - * context. - */ - old_fpu->last_cpu = cpu; - - trace_x86_fpu_regs_deactivated(old_fpu); - } -} - -/* - * Misc helper functions: - */ - -/* - * Delay loading of the complete FPU state until the return to userland. - * PKRU is handled separately. 
- */ -static inline void switch_fpu_finish(void) -{ - if (cpu_feature_enabled(X86_FEATURE_FPU)) - set_thread_flag(TIF_NEED_FPU_LOAD); -} - /* * MXCSR and XCR definitions: */ diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h new file mode 100644 index 000000000000..cdb78d590c86 --- /dev/null +++ b/arch/x86/include/asm/fpu/sched.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_FPU_SCHED_H +#define _ASM_X86_FPU_SCHED_H + +#include + +#include +#include + +#include + +extern void save_fpregs_to_fpstate(struct fpu *fpu); +extern void fpu__drop(struct fpu *fpu); +extern int fpu_clone(struct task_struct *dst); +extern void fpu_flush_thread(void); + +/* + * FPU state switching for scheduling. + * + * This is a two-stage process: + * + * - switch_fpu_prepare() saves the old state. + * This is done within the context of the old process. + * + * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state + * will get loaded on return to userspace, or when the kernel needs it. + * + * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers + * are saved in the current thread's FPU register state. + * + * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not + * hold current()'s FPU registers. It is required to load the + * registers before returning to userland or using the content + * otherwise. + * + * The FPU context is only stored/restored for a user task and + * PF_KTHREAD is used to distinguish between kernel and user threads. + */ +static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) +{ + if (cpu_feature_enabled(X86_FEATURE_FPU) && + !(current->flags & PF_KTHREAD)) { + save_fpregs_to_fpstate(old_fpu); + /* + * The save operation preserved register state, so the + * fpu_fpregs_owner_ctx is still @old_fpu. Store the + * current CPU number in @old_fpu, so the next return + * to user space can avoid the FPU register restore + * when is returns on the same CPU and still owns the + * context. + */ + old_fpu->last_cpu = cpu; + + trace_x86_fpu_regs_deactivated(old_fpu); + } +} + +/* + * Delay loading of the complete FPU state until the return to userland. + * PKRU is handled separately. 
+ */ +static inline void switch_fpu_finish(void) +{ + if (cpu_feature_enabled(X86_FEATURE_FPU)) + set_thread_flag(TIF_NEED_FPU_LOAD); +} + +#endif /* _ASM_X86_FPU_SCHED_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 9e83ee3a2ae9..1f307ba8741f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index be9adbedfc8d..b871b35632c9 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index db44c898f22e..7b06156abbbd 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 852656ce851a..0b85ea864049 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #include #include -- Gitee From 4b8c10671e65f3a18f5bf93d2b3c5fc62857dbe3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:21 +0200 Subject: [PATCH 0741/2161] x86/fpu: Clean up CPU feature tests commit d06241f52cfe4a0580856ef2cfac90dc7f752cae upstream. Further disintegration of internal.h: Move the CPU feature tests to a core header and remove the unused one. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.401510559@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 18 ------------------ arch/x86/kernel/fpu/core.c | 1 + arch/x86/kernel/fpu/internal.h | 11 +++++++++++ arch/x86/kernel/fpu/regset.c | 2 ++ 4 files changed, 14 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index a789d2774bf6..fe8b5b168223 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -50,24 +50,6 @@ extern void fpu__resume_cpu(void); # define WARN_ON_FPU(x) ({ (void)(x); 0; }) #endif -/* - * FPU related CPU feature flag helper routines: - */ -static __always_inline __pure bool use_xsaveopt(void) -{ - return static_cpu_has(X86_FEATURE_XSAVEOPT); -} - -static __always_inline __pure bool use_xsave(void) -{ - return static_cpu_has(X86_FEATURE_XSAVE); -} - -static __always_inline __pure bool use_fxsr(void) -{ - return static_cpu_has(X86_FEATURE_FXSR); -} - extern union fpregs_state init_fpstate; extern void fpstate_init_user(union fpregs_state *state); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1f307ba8741f..24428e69982b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -17,6 +17,7 @@ #include #include +#include "internal.h" #include "xstate.h" #define CREATE_TRACE_POINTS diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index 036f84c236dd..a8aac21ba364 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -2,6 +2,17 @@ #ifndef __X86_KERNEL_FPU_INTERNAL_H #define __X86_KERNEL_FPU_INTERNAL_H +/* CPU feature check wrappers */ +static __always_inline __pure bool use_xsave(void) +{ + return cpu_feature_enabled(X86_FEATURE_XSAVE); +} + +static __always_inline __pure bool 
use_fxsr(void) +{ + return cpu_feature_enabled(X86_FEATURE_FXSR); +} + /* Init functions */ extern void fpu__init_prepare_fx_sw_frame(void); diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index d71fd3adfee0..5dce55ec9394 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -10,6 +10,8 @@ #include #include +#include "internal.h" + /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, * as the "regset->n" for the xstate regset will be updated based on the feature -- Gitee From f6e2163227821f428c84831014e254c652d146bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:23 +0200 Subject: [PATCH 0742/2161] x86/fpu: Make os_xrstor_booting() private commit b579d0c3750eedc0dee433edaba88206a8e4348a upstream. It's only required in the xstate init code. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.455836597@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 25 ------------------------- arch/x86/kernel/fpu/xstate.c | 23 +++++++++++++++++++++++ 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index fe8b5b168223..65a7662117e1 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -224,31 +224,6 @@ static inline void fxsave(struct fxregs_state *fx) : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline void os_xrstor_booting(struct xregs_state *xstate) -{ - u64 mask = xfeatures_mask_fpstate(); - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); - else - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - - /* - * We should never fault when copying from a kernel buffer, and the FPU - * state we set at boot time should be valid. - */ - WARN_ON_FPU(err); -} - /* * Save processor xstate to xsave area. * diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d47c5c62a5f6..cb6b93b25d89 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -350,6 +350,29 @@ static void __init print_xstate_offset_size(void) } } +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static __init void os_xrstor_booting(struct xregs_state *xstate) +{ + u64 mask = xfeatures_mask_fpstate(); + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + + /* + * We should never fault when copying from a kernel buffer, and the FPU + * state we set at boot time should be valid. + */ + WARN_ON_FPU(err); +} + /* * All supported features have either init state all zeros or are * handled in setup_init_fpu() individually. This is an explicit -- Gitee From 0f712f98c4042d14bed86b88411d2611e3866538 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:24 +0200 Subject: [PATCH 0743/2161] x86/fpu: Move os_xsave() and os_xrstor() to core commit df95b0f1aa56dfa71a0ef657e3e62294ee6d9034 upstream. 
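The wrappers being moved hide two details from their callers: the EDX:EAX split of the 64-bit requested-feature mask and the XSAVES/XSAVEOPT/XSAVE alternative selection. A caller-side sketch (the helper name is hypothetical, for illustration only):

  static void restore_task_xstate(struct fpu *fpu)
  {
          u64 mask = xfeatures_mask_fpstate();  /* components in the buffer */

          /* The mask's low/high 32 bits become EAX/EDX of XRSTOR(S). */
          os_xrstor(&fpu->state.xsave, mask);
  }

  /* os_xsave() always saves the full enabled set (xfeatures_mask_all);
   * the same EAX/EDX split applies to XSAVE/XSAVEOPT/XSAVES. */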
Nothing outside the core code needs these. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.513368075@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 165 -------------------------- arch/x86/include/asm/fpu/xstate.h | 6 - arch/x86/kernel/fpu/signal.c | 1 + arch/x86/kernel/fpu/xstate.h | 174 ++++++++++++++++++++++++++++ 4 files changed, 175 insertions(+), 171 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 65a7662117e1..1a6c62ed1295 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -160,171 +160,6 @@ static inline void fxsave(struct fxregs_state *fx) asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); } -/* These macros all use (%edi)/(%rdi) as the single memory argument. */ -#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" -#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" -#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" -#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" -#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" - -/* - * After this @err contains 0 on success or the trap number when the - * operation raises an exception. - */ -#define XSTATE_OP(op, st, lmask, hmask, err) \ - asm volatile("1:" op "\n\t" \ - "xor %[err], %[err]\n" \ - "2:\n\t" \ - _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ - : [err] "=a" (err) \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ - : "memory") - -/* - * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact - * format and supervisor states in addition to modified optimization in - * XSAVEOPT. - * - * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT - * supports modified optimization which is not supported by XSAVE. - * - * We use XSAVE as a fallback. - * - * The 661 label is defined in the ALTERNATIVE* macros as the address of the - * original instruction which gets replaced. We need to use it here as the - * address of the instruction where we might get an exception at. - */ -#define XSTATE_XSAVE(st, lmask, hmask, err) \ - asm volatile(ALTERNATIVE_2(XSAVE, \ - XSAVEOPT, X86_FEATURE_XSAVEOPT, \ - XSAVES, X86_FEATURE_XSAVES) \ - "\n" \ - "xor %[err], %[err]\n" \ - "3:\n" \ - ".pushsection .fixup,\"ax\"\n" \ - "4: movl $-2, %[err]\n" \ - "jmp 3b\n" \ - ".popsection\n" \ - _ASM_EXTABLE(661b, 4b) \ - : [err] "=r" (err) \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ - : "memory") - -/* - * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact - * XSAVE area format. - */ -#define XSTATE_XRESTORE(st, lmask, hmask) \ - asm volatile(ALTERNATIVE(XRSTOR, \ - XRSTORS, X86_FEATURE_XSAVES) \ - "\n" \ - "3:\n" \ - _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \ - : \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ - : "memory") - -/* - * Save processor xstate to xsave area. - * - * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features - * and command line options. The choice is permanent until the next reboot. - */ -static inline void os_xsave(struct xregs_state *xstate) -{ - u64 mask = xfeatures_mask_all; - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - WARN_ON_FPU(!alternatives_patched); - - XSTATE_XSAVE(xstate, lmask, hmask, err); - - /* We should never fault when copying to a kernel buffer: */ - WARN_ON_FPU(err); -} - -/* - * Restore processor xstate from xsave area. 
- * - * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. - */ -static inline void os_xrstor(struct xregs_state *xstate, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - - XSTATE_XRESTORE(xstate, lmask, hmask); -} - -/* - * Save xstate to user space xsave area. - * - * We don't use modified optimization because xrstor/xrstors might track - * a different application. - * - * We don't use compacted format xsave area for backward compatibility for - * old applications which don't understand the compacted format of the - * xsave area. - * - * The caller has to zero buf::header before calling this because XSAVE* - * does not touch the reserved fields in the header. - */ -static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) -{ - /* - * Include the features which are not xsaved/rstored by the kernel - * internally, e.g. PKRU. That's user space ABI and also required - * to allow the signal handler to modify PKRU. - */ - u64 mask = xfeatures_mask_uabi(); - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - stac(); - XSTATE_OP(XSAVE, buf, lmask, hmask, err); - clac(); - - return err; -} - -/* - * Restore xstate from user space xsave area. - */ -static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask) -{ - struct xregs_state *xstate = ((__force struct xregs_state *)buf); - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - stac(); - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - clac(); - - return err; -} - -/* - * Restore xstate from kernel space xsave area, return an error code instead of - * an exception. - */ -static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - if (cpu_feature_enabled(X86_FEATURE_XSAVES)) - XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); - else - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - - return err; -} - extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 2df5bd667a43..df3396a72dbc 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -78,12 +78,6 @@ XFEATURE_MASK_INDEPENDENT | \ XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) -#ifdef CONFIG_X86_64 -#define REX_PREFIX "0x48, " -#else -#define REX_PREFIX -#endif - extern u64 xfeatures_mask_all; static inline u64 xfeatures_mask_supervisor(void) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 19358bd25dd3..c723514ceefd 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -17,6 +17,7 @@ #include #include "internal.h" +#include "xstate.h" static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init; static struct _fpx_sw_bytes fx_sw_reserved_ia32 __ro_after_init; diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 81f4202781ac..ae61baa97682 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -18,4 +18,178 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) extern void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, u32 pkru_val, enum xstate_copy_mode copy_mode); +/* XSAVE/XRSTOR wrapper functions */ + +#ifdef CONFIG_X86_64 +#define REX_PREFIX "0x48, " +#else +#define REX_PREFIX +#endif + +/* These macros all use (%edi)/(%rdi) as the single memory argument. 
*/ +#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" +#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" +#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" +#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" + +/* + * After this @err contains 0 on success or the trap number when the + * operation raises an exception. + */ +#define XSTATE_OP(op, st, lmask, hmask, err) \ + asm volatile("1:" op "\n\t" \ + "xor %[err], %[err]\n" \ + "2:\n\t" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ + : [err] "=a" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact + * format and supervisor states in addition to modified optimization in + * XSAVEOPT. + * + * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT + * supports modified optimization which is not supported by XSAVE. + * + * We use XSAVE as a fallback. + * + * The 661 label is defined in the ALTERNATIVE* macros as the address of the + * original instruction which gets replaced. We need to use it here as the + * address of the instruction where we might get an exception at. + */ +#define XSTATE_XSAVE(st, lmask, hmask, err) \ + asm volatile(ALTERNATIVE_2(XSAVE, \ + XSAVEOPT, X86_FEATURE_XSAVEOPT, \ + XSAVES, X86_FEATURE_XSAVES) \ + "\n" \ + "xor %[err], %[err]\n" \ + "3:\n" \ + ".pushsection .fixup,\"ax\"\n" \ + "4: movl $-2, %[err]\n" \ + "jmp 3b\n" \ + ".popsection\n" \ + _ASM_EXTABLE(661b, 4b) \ + : [err] "=r" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact + * XSAVE area format. + */ +#define XSTATE_XRESTORE(st, lmask, hmask) \ + asm volatile(ALTERNATIVE(XRSTOR, \ + XRSTORS, X86_FEATURE_XSAVES) \ + "\n" \ + "3:\n" \ + _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \ + : \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * Save processor xstate to xsave area. + * + * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features + * and command line options. The choice is permanent until the next reboot. + */ +static inline void os_xsave(struct xregs_state *xstate) +{ + u64 mask = xfeatures_mask_all; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + WARN_ON_FPU(!alternatives_patched); + + XSTATE_XSAVE(xstate, lmask, hmask, err); + + /* We should never fault when copying to a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * Restore processor xstate from xsave area. + * + * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. + */ +static inline void os_xrstor(struct xregs_state *xstate, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + + XSTATE_XRESTORE(xstate, lmask, hmask); +} + +/* + * Save xstate to user space xsave area. + * + * We don't use modified optimization because xrstor/xrstors might track + * a different application. + * + * We don't use compacted format xsave area for backward compatibility for + * old applications which don't understand the compacted format of the + * xsave area. + * + * The caller has to zero buf::header before calling this because XSAVE* + * does not touch the reserved fields in the header. + */ +static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) +{ + /* + * Include the features which are not xsaved/rstored by the kernel + * internally, e.g. PKRU. 
That's user space ABI and also required + * to allow the signal handler to modify PKRU. + */ + u64 mask = xfeatures_mask_uabi(); + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + stac(); + XSTATE_OP(XSAVE, buf, lmask, hmask, err); + clac(); + + return err; +} + +/* + * Restore xstate from user space xsave area. + */ +static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask) +{ + struct xregs_state *xstate = ((__force struct xregs_state *)buf); + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + stac(); + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + clac(); + + return err; +} + +/* + * Restore xstate from kernel space xsave area, return an error code instead of + * an exception. + */ +static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + + return err; +} + + #endif -- Gitee From 759e5a8af0f90fa2620f663422d846081a426599 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:26 +0200 Subject: [PATCH 0744/2161] x86/fpu: Move legacy ASM wrappers to core commit 34002571cb4199a446f7582704424d20a01c276e upstream. Nothing outside the core code requires them. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.572439164@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 101 -------------------------- arch/x86/kernel/fpu/core.c | 1 + arch/x86/kernel/fpu/legacy.h | 108 ++++++++++++++++++++++++++++ arch/x86/kernel/fpu/signal.c | 1 + arch/x86/kernel/fpu/xstate.c | 1 + 5 files changed, 111 insertions(+), 101 deletions(-) create mode 100644 arch/x86/kernel/fpu/legacy.h diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 1a6c62ed1295..b0cb1a9c5af6 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -59,107 +59,6 @@ extern void fpstate_init_soft(struct swregs_state *soft); static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif -/* - * Returns 0 on success or the trap number when the operation raises an - * exception. - */ -#define user_insn(insn, output, input...) \ -({ \ - int err; \ - \ - might_fault(); \ - \ - asm volatile(ASM_STAC "\n" \ - "1: " #insn "\n" \ - "2: " ASM_CLAC "\n" \ - _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ - : [err] "=a" (err), output \ - : "0"(0), input); \ - err; \ -}) - -#define kernel_insn_err(insn, output, input...) \ -({ \ - int err; \ - asm volatile("1:" #insn "\n\t" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ - : "0"(0), input); \ - err; \ -}) - -#define kernel_insn(insn, output, input...) 
\ - asm volatile("1:" #insn "\n\t" \ - "2:\n" \ - _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \ - : output : input) - -static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) -{ - return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); -} - -static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); - else - return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); - -} - -static inline void fxrstor(struct fxregs_state *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else - kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int fxrstor_safe(struct fxregs_state *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else - return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else - return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline void frstor(struct fregs_state *fx) -{ - kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int frstor_safe(struct fregs_state *fx) -{ - return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int frstor_from_user_sigframe(struct fregs_state __user *fx) -{ - return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline void fxsave(struct fxregs_state *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); - else - asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); -} - extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 24428e69982b..c72830cda6dd 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -18,6 +18,7 @@ #include #include "internal.h" +#include "legacy.h" #include "xstate.h" #define CREATE_TRACE_POINTS diff --git a/arch/x86/kernel/fpu/legacy.h b/arch/x86/kernel/fpu/legacy.h new file mode 100644 index 000000000000..2ff36b0f79e9 --- /dev/null +++ b/arch/x86/kernel/fpu/legacy.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_LEGACY_H +#define __X86_KERNEL_FPU_LEGACY_H + +#include + +/* + * Returns 0 on success or the trap number when the operation raises an + * exception. + */ +#define user_insn(insn, output, input...) \ +({ \ + int err; \ + \ + might_fault(); \ + \ + asm volatile(ASM_STAC "\n" \ + "1: " #insn "\n" \ + "2: " ASM_CLAC "\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ + : [err] "=a" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define kernel_insn_err(insn, output, input...) \ +({ \ + int err; \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define kernel_insn(insn, output, input...) 
\ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \ + : output : input) + +static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) +{ + return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); +} + +static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); + else + return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); + +} + +static inline void fxrstor(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int fxrstor_safe(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void frstor(struct fregs_state *fx) +{ + kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int frstor_safe(struct fregs_state *fx) +{ + return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int frstor_from_user_sigframe(struct fregs_state __user *fx) +{ + return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void fxsave(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); + else + asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); +} + +#endif diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index c723514ceefd..90eecef2fc68 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -17,6 +17,7 @@ #include #include "internal.h" +#include "legacy.h" #include "xstate.h" static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index cb6b93b25d89..fd9fc4a55886 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -20,6 +20,7 @@ #include #include "internal.h" +#include "legacy.h" #include "xstate.h" #define for_each_extended_xfeature(bit, mask) \ -- Gitee From b20965f5b8aeca7c78efddc9e833f8003e339bd7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:28 +0200 Subject: [PATCH 0745/2161] x86/fpu: Make WARN_ON_FPU() private commit cdcb6fa14e1499ff2b2a3f3e0938c7b3b7ef2cd6 upstream. No point in being in global headers. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.628516182@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 9 --------- arch/x86/kernel/fpu/init.c | 2 ++ arch/x86/kernel/fpu/internal.h | 6 ++++++ 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index b0cb1a9c5af6..30ea3472c463 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -41,15 +41,6 @@ extern void fpu__init_system(struct cpuinfo_x86 *c); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); -/* - * Debugging facility: - */ -#ifdef CONFIG_X86_DEBUG_FPU -# define WARN_ON_FPU(x) WARN_ON_ONCE(x) -#else -# define WARN_ON_FPU(x) ({ (void)(x); 0; }) -#endif - extern union fpregs_state init_fpstate; extern void fpstate_init_user(union fpregs_state *state); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 545c91c723b8..24873dfe2dba 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -10,6 +10,8 @@ #include #include +#include "internal.h" + /* * Initialize the registers found in all CPUs, CR0 and CR4: */ diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index a8aac21ba364..5ddc09e03c2a 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -13,6 +13,12 @@ static __always_inline __pure bool use_fxsr(void) return cpu_feature_enabled(X86_FEATURE_FXSR); } +#ifdef CONFIG_X86_DEBUG_FPU +# define WARN_ON_FPU(x) WARN_ON_ONCE(x) +#else +# define WARN_ON_FPU(x) ({ (void)(x); 0; }) +#endif + /* Init functions */ extern void fpu__init_prepare_fx_sw_frame(void); -- Gitee From 00016dff5ac159c6fbbc0b2fef1497b9493612ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:30 +0200 Subject: [PATCH 0746/2161] x86/fpu: Move fpregs_restore_userregs() to core commit 9848fb96839bfd6ad4c00748842ccfd5bd3b0346 upstream. Only used internally in the FPU core code. While at it, convert to the percpu accessors which verify preemption is disabled. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.686806639@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 83 ---------------------------- arch/x86/kernel/fpu/context.h | 85 +++++++++++++++++++++++++++++ arch/x86/kernel/fpu/core.c | 1 + arch/x86/kernel/fpu/regset.c | 1 + arch/x86/kernel/fpu/signal.c | 1 + 5 files changed, 88 insertions(+), 83 deletions(-) create mode 100644 arch/x86/kernel/fpu/context.h diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 30ea3472c463..3712f299f41e 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -54,91 +54,8 @@ extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); -/* - * FPU context switch related helper methods: - */ - DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); -/* - * The in-register FPU state for an FPU context on a CPU is assumed to be - * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx - * matches the FPU. - * - * If the FPU register state is valid, the kernel can skip restoring the - * FPU state from memory. 
- * - * Any code that clobbers the FPU registers or updates the in-memory - * FPU state for a task MUST let the rest of the kernel know that the - * FPU registers are no longer valid for this task. - * - * Either one of these invalidation functions is enough. Invalidate - * a resource you control: CPU if using the CPU for something else - * (with preemption disabled), FPU for the current task, or a task that - * is prevented from running by the current task. - */ -static inline void __cpu_invalidate_fpregs_state(void) -{ - __this_cpu_write(fpu_fpregs_owner_ctx, NULL); -} - -static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) -{ - fpu->last_cpu = -1; -} - -static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) -{ - return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; -} - -/* - * These generally need preemption protection to work, - * do try to avoid using these on their own: - */ -static inline void fpregs_deactivate(struct fpu *fpu) -{ - this_cpu_write(fpu_fpregs_owner_ctx, NULL); - trace_x86_fpu_regs_deactivated(fpu); -} - -static inline void fpregs_activate(struct fpu *fpu) -{ - this_cpu_write(fpu_fpregs_owner_ctx, fpu); - trace_x86_fpu_regs_activated(fpu); -} - -/* Internal helper for switch_fpu_return() and signal frame setup */ -static inline void fpregs_restore_userregs(void) -{ - struct fpu *fpu = ¤t->thread.fpu; - int cpu = smp_processor_id(); - - if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) - return; - - if (!fpregs_state_valid(fpu, cpu)) { - u64 mask; - - /* - * This restores _all_ xstate which has not been - * established yet. - * - * If PKRU is enabled, then the PKRU value is already - * correct because it was either set in switch_to() or in - * flush_thread(). So it is excluded because it might be - * not up to date in current->thread.fpu.xsave state. - */ - mask = xfeatures_mask_restore_user() | - xfeatures_mask_supervisor(); - restore_fpregs_from_fpstate(&fpu->state, mask); - - fpregs_activate(fpu); - fpu->last_cpu = cpu; - } - clear_thread_flag(TIF_NEED_FPU_LOAD); -} - /* * MXCSR and XCR definitions: */ diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h new file mode 100644 index 000000000000..e652282842c8 --- /dev/null +++ b/arch/x86/kernel/fpu/context.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_CONTEXT_H +#define __X86_KERNEL_FPU_CONTEXT_H + +#include +#include + +/* Functions related to FPU context tracking */ + +/* + * The in-register FPU state for an FPU context on a CPU is assumed to be + * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx + * matches the FPU. + * + * If the FPU register state is valid, the kernel can skip restoring the + * FPU state from memory. + * + * Any code that clobbers the FPU registers or updates the in-memory + * FPU state for a task MUST let the rest of the kernel know that the + * FPU registers are no longer valid for this task. + * + * Either one of these invalidation functions is enough. Invalidate + * a resource you control: CPU if using the CPU for something else + * (with preemption disabled), FPU for the current task, or a task that + * is prevented from running by the current task. 
+ */ +static inline void __cpu_invalidate_fpregs_state(void) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); +} + +static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) +{ + fpu->last_cpu = -1; +} + +static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) +{ + return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; +} + +static inline void fpregs_deactivate(struct fpu *fpu) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); + trace_x86_fpu_regs_deactivated(fpu); +} + +static inline void fpregs_activate(struct fpu *fpu) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, fpu); + trace_x86_fpu_regs_activated(fpu); +} + +/* Internal helper for switch_fpu_return() and signal frame setup */ +static inline void fpregs_restore_userregs(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + int cpu = smp_processor_id(); + + if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) + return; + + if (!fpregs_state_valid(fpu, cpu)) { + u64 mask; + + /* + * This restores _all_ xstate which has not been + * established yet. + * + * If PKRU is enabled, then the PKRU value is already + * correct because it was either set in switch_to() or in + * flush_thread(). So it is excluded because it might be + * not up to date in current->thread.fpu.xsave state. + */ + mask = xfeatures_mask_restore_user() | + xfeatures_mask_supervisor(); + restore_fpregs_from_fpstate(&fpu->state, mask); + + fpregs_activate(fpu); + fpu->last_cpu = cpu; + } + clear_thread_flag(TIF_NEED_FPU_LOAD); +} + +#endif diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c72830cda6dd..1d899f2ae7d8 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -17,6 +17,7 @@ #include #include +#include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 5dce55ec9394..444ac96540f2 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -10,6 +10,7 @@ #include #include +#include "context.h" #include "internal.h" /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 90eecef2fc68..af286b855901 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -16,6 +16,7 @@ #include #include +#include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" -- Gitee From 65aa325baa54932325b57f3270d5d91e7d79da26 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 7 Jul 2020 19:47:22 +0200 Subject: [PATCH 0747/2161] x86/cpu: Use XGETBV and XSETBV mnemonics in fpu/internal.h commit 86109813990b5d6d6cfb8072382ee69d11ea9460 upstream. Current minimum required version of binutils is 2.23, which supports XGETBV and XSETBV instruction mnemonics. Replace the byte-wise specification of XGETBV and XSETBV with these proper mnemonics. 
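For illustration only, not part of the change: both mnemonics assemble to exactly the byte sequences being removed (0f 01 d0 for XGETBV, 0f 01 d1 for XSETBV), so the generated code is identical. A minimal usage sketch with the existing helpers (index 0 is XCR_XFEATURE_ENABLED_MASK, i.e. XCR0):

	/* Read the mask of xfeatures currently enabled in XCR0 */
	u64 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);

	/* Write it back unchanged - a no-op round trip */
	xsetbv(XCR_XFEATURE_ENABLED_MASK, xcr0);
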
Signed-off-by: Uros Bizjak Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200707174722.58651-1-ubizjak@gmail.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 3712f299f41e..1df0a979d88f 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -73,9 +73,7 @@ static inline u64 xgetbv(u32 index) { u32 eax, edx; - asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */ - : "=a" (eax), "=d" (edx) - : "c" (index)); + asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index)); return eax + ((u64)edx << 32); } @@ -84,8 +82,7 @@ static inline void xsetbv(u32 index, u64 value) u32 eax = value; u32 edx = value >> 32; - asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */ - : : "a" (eax), "d" (edx), "c" (index)); + asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); } #endif /* _ASM_X86_FPU_INTERNAL_H */ -- Gitee From 2614ae251427a99f1b0ea3b25f4036b713de6a9c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 7 Sep 2020 15:15:27 +0200 Subject: [PATCH 0748/2161] x86/fpu: Move xgetbv()/xsetbv() into a separate header commit 1b4fb8545f2b00f2844c4b7619d64d98440a477c upstream. The xgetbv() function is needed in the pre-decompression boot code, but asm/fpu/internal.h can't be included there directly. Doing so opens the door to include-hell due to various include-magic in boot/compressed/misc.h. Avoid that by moving xgetbv()/xsetbv() to a separate header file and include it instead. Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200907131613.12703-27-joro@8bytes.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 30 +------------------------ arch/x86/include/asm/fpu/xcr.h | 34 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 29 deletions(-) create mode 100644 arch/x86/include/asm/fpu/xcr.h diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 1df0a979d88f..74b7cc3d2e77 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -56,33 +57,4 @@ extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); -/* - * MXCSR and XCR definitions: - */ - -static inline void ldmxcsr(u32 mxcsr) -{ - asm volatile("ldmxcsr %0" :: "m" (mxcsr)); -} - -extern unsigned int mxcsr_feature_mask; - -#define XCR_XFEATURE_ENABLED_MASK 0x00000000 - -static inline u64 xgetbv(u32 index) -{ - u32 eax, edx; - - asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index)); - return eax + ((u64)edx << 32); -} - -static inline void xsetbv(u32 index, u64 value) -{ - u32 eax = value; - u32 edx = value >> 32; - - asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); -} - #endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h new file mode 100644 index 000000000000..1c7ab8d95da5 --- /dev/null +++ b/arch/x86/include/asm/fpu/xcr.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_FPU_XCR_H +#define _ASM_X86_FPU_XCR_H + +/* + * MXCSR and XCR definitions: + */ + +static inline void ldmxcsr(u32 mxcsr) +{ + asm volatile("ldmxcsr %0" :: "m" (mxcsr)); +} + +extern unsigned int 
mxcsr_feature_mask; + +#define XCR_XFEATURE_ENABLED_MASK 0x00000000 + +static inline u64 xgetbv(u32 index) +{ + u32 eax, edx; + + asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index)); + return eax + ((u64)edx << 32); +} + +static inline void xsetbv(u32 index, u64 value) +{ + u32 eax = value; + u32 edx = value >> 32; + + asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); +} + +#endif /* _ASM_X86_FPU_XCR_H */ -- Gitee From ba90f0a6215af837956cd0edc937557465c7df6a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:31 +0200 Subject: [PATCH 0749/2161] x86/fpu: Move mxcsr related code to core commit d9d005f32aac7362a1998f4b7fdf8874e91546bd upstream. No need to expose that to code which only needs the XCR0 accessors. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.740012411@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xcr.h | 11 ----------- arch/x86/kernel/fpu/init.c | 1 + arch/x86/kernel/fpu/legacy.h | 7 +++++++ arch/x86/kernel/fpu/regset.c | 1 + arch/x86/kernel/fpu/xstate.c | 3 ++- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h index 1c7ab8d95da5..79f95d3787e2 100644 --- a/arch/x86/include/asm/fpu/xcr.h +++ b/arch/x86/include/asm/fpu/xcr.h @@ -2,17 +2,6 @@ #ifndef _ASM_X86_FPU_XCR_H #define _ASM_X86_FPU_XCR_H -/* - * MXCSR and XCR definitions: - */ - -static inline void ldmxcsr(u32 mxcsr) -{ - asm volatile("ldmxcsr %0" :: "m" (mxcsr)); -} - -extern unsigned int mxcsr_feature_mask; - #define XCR_XFEATURE_ENABLED_MASK 0x00000000 static inline u64 xgetbv(u32 index) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 24873dfe2dba..e77084a6ae7c 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -11,6 +11,7 @@ #include #include "internal.h" +#include "legacy.h" /* * Initialize the registers found in all CPUs, CR0 and CR4: diff --git a/arch/x86/kernel/fpu/legacy.h b/arch/x86/kernel/fpu/legacy.h index 2ff36b0f79e9..17c26b164c63 100644 --- a/arch/x86/kernel/fpu/legacy.h +++ b/arch/x86/kernel/fpu/legacy.h @@ -4,6 +4,13 @@ #include +extern unsigned int mxcsr_feature_mask; + +static inline void ldmxcsr(u32 mxcsr) +{ + asm volatile("ldmxcsr %0" :: "m" (mxcsr)); +} + /* * Returns 0 on success or the trap number when the operation raises an * exception. diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 444ac96540f2..dd939fd7af66 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -12,6 +12,7 @@ #include "context.h" #include "internal.h" +#include "legacy.h" /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index fd9fc4a55886..9f6f2b244589 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -14,8 +14,9 @@ #include #include -#include #include +#include +#include #include -- Gitee From 9076713b2abba344dcf2a198cf6701b4783d925e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:33 +0200 Subject: [PATCH 0750/2161] x86/fpu: Move fpstate functions to api.h commit 90489f1dee8b703a3301857917c0aba0b22b5d83 upstream. Move function declarations which need to be globally available to api.h where they belong. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.792363754@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 9 +++++++++ arch/x86/include/asm/fpu/internal.h | 9 --------- arch/x86/kernel/fpu/internal.h | 3 +++ arch/x86/math-emu/fpu_entry.c | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 79073383a9bc..d2a063377afa 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -83,6 +83,15 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); static inline void update_pasid(void) { } +#ifdef CONFIG_MATH_EMULATION +extern void fpstate_init_soft(struct swregs_state *soft); +#else +static inline void fpstate_init_soft(struct swregs_state *soft) {} +#endif + +/* fpstate */ +extern union fpregs_state init_fpstate; + /* fpstate-related functions which are exported to KVM */ extern void fpu_init_fpstate_user(struct fpu *fpu); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 74b7cc3d2e77..d8bb49134ebb 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -42,15 +42,6 @@ extern void fpu__init_system(struct cpuinfo_x86 *c); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); -extern union fpregs_state init_fpstate; -extern void fpstate_init_user(union fpregs_state *state); - -#ifdef CONFIG_MATH_EMULATION -extern void fpstate_init_soft(struct swregs_state *soft); -#else -static inline void fpstate_init_soft(struct swregs_state *soft) {} -#endif - extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index 5ddc09e03c2a..bd7f813242dd 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -22,4 +22,7 @@ static __always_inline __pure bool use_fxsr(void) /* Init functions */ extern void fpu__init_prepare_fx_sw_frame(void); +/* Used in init.c */ +extern void fpstate_init_user(union fpregs_state *state); + #endif diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 8679a9d6c47f..50195e249753 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include "fpu_system.h" #include "fpu_emu.h" -- Gitee From 4d03d64c5d1592e94e66c00ac5e554bb27d9c348 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:35 +0200 Subject: [PATCH 0751/2161] x86/fpu: Remove internal.h dependency from fpu/signal.h commit 0ae67cc34f765078a63137120e4567ad2f050b75 upstream. In order to remove internal.h make signal.h independent of it. 
Include asm/fpu/xstate.h to fix a missing update_regset_xstate_info() prototype, which is Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.844565975@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/ia32/ia32_signal.c | 1 - arch/x86/include/asm/fpu/api.h | 3 +++ arch/x86/include/asm/fpu/internal.h | 7 ------- arch/x86/include/asm/fpu/signal.h | 13 +++++++++++++ arch/x86/kernel/fpu/signal.c | 1 - arch/x86/kernel/ptrace.c | 2 +- arch/x86/kernel/signal.c | 1 - arch/x86/mm/extable.c | 3 ++- 8 files changed, 19 insertions(+), 12 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 842efecc2762..97c4a29ed1f4 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index d2a063377afa..2e395fa5fc86 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -89,6 +89,9 @@ extern void fpstate_init_soft(struct swregs_state *soft); static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif +/* State tracking */ +DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); + /* fpstate */ extern union fpregs_state init_fpstate; diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index d8bb49134ebb..8f97d3e375de 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -26,7 +26,6 @@ /* * High level FPU state handling functions: */ -extern bool fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__clear_user_states(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); @@ -42,10 +41,4 @@ extern void fpu__init_system(struct cpuinfo_x86 *c); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); -extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); - -extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); - -DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); - #endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 04868a76239a..9a63a21c219d 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -5,6 +5,11 @@ #ifndef _ASM_X86_FPU_SIGNAL_H #define _ASM_X86_FPU_SIGNAL_H +#include +#include + +#include + #ifdef CONFIG_X86_64 # include # include @@ -31,4 +36,12 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long fpu__get_fpstate_size(void); +extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); +extern void fpu__clear_user_states(struct fpu *fpu); +extern bool fpu__restore_sig(void __user *buf, int ia32_frame); + +extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); + +extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); + #endif /* _ASM_X86_FPU_SIGNAL_H */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index af286b855901..b829b7a9f995 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -7,7 +7,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 1f9d856feb24..ace1834c9b0f 100644 --- a/arch/x86/kernel/ptrace.c +++ 
b/arch/x86/kernel/ptrace.c @@ -30,9 +30,9 @@ #include #include #include -#include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 37f7fa8f8e4d..d02d8ba89078 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -29,7 +29,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 23ba9df477bd..7c71d57c806a 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -4,7 +4,8 @@ #include #include -#include +#include +#include #include #include -- Gitee From ce08d11840537e935320c13236488099715e696e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:38 +0200 Subject: [PATCH 0752/2161] x86/fpu: Mop up the internal.h leftovers commit 6415bb80926379310afd74800415f6ebf4bb5c31 upstream. Move the global interfaces to api.h and the rest into the core. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.948837194@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 10 ++++++++++ arch/x86/include/asm/fpu/internal.h | 18 ------------------ arch/x86/kernel/fpu/init.c | 1 + arch/x86/kernel/fpu/xstate.h | 3 +++ 4 files changed, 14 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 2e395fa5fc86..ced6a3f8bac0 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -83,6 +83,16 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); static inline void update_pasid(void) { } +/* Trap handling */ +extern int fpu__exception_code(struct fpu *fpu, int trap_nr); +extern void fpu_sync_fpstate(struct fpu *fpu); + +/* Boot, hotplug and resume */ +extern void fpu__init_cpu(void); +extern void fpu__init_system(struct cpuinfo_x86 *c); +extern void fpu__init_check_bugs(void); +extern void fpu__resume_cpu(void); + #ifdef CONFIG_MATH_EMULATION extern void fpstate_init_soft(struct swregs_state *soft); #else diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 8f97d3e375de..8df83e887ff6 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -23,22 +23,4 @@ #include #include -/* - * High level FPU state handling functions: - */ -extern void fpu__clear_user_states(struct fpu *fpu); -extern int fpu__exception_code(struct fpu *fpu, int trap_nr); - -extern void fpu_sync_fpstate(struct fpu *fpu); - -/* - * Boot time FPU initialization functions: - */ -extern void fpu__init_cpu(void); -extern void fpu__init_system_xstate(void); -extern void fpu__init_cpu_xstate(void); -extern void fpu__init_system(struct cpuinfo_x86 *c); -extern void fpu__init_check_bugs(void); -extern void fpu__resume_cpu(void); - #endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index e77084a6ae7c..d420d29e58be 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -12,6 +12,7 @@ #include "internal.h" #include "legacy.h" +#include "xstate.h" /* * Initialize the registers found in all CPUs, CR0 and CR4: diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index ae61baa97682..bb6d7d298d2a 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -18,6 +18,9 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) extern void 
__copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, u32 pkru_val, enum xstate_copy_mode copy_mode); +extern void fpu__init_cpu_xstate(void); +extern void fpu__init_system_xstate(void); + /* XSAVE/XRSTOR wrapper functions */ #ifdef CONFIG_X86_64 -- Gitee From c53815e5cc83e56454044d6341dddf019d945bb2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:39 +0200 Subject: [PATCH 0753/2161] x86/fpu: Replace the includes of fpu/internal.h commit b56d2795b29792c465cc8ef036abad5127a003fb upstream. Now that the file is empty, fixup all references with the proper includes and delete the former kitchen sink. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011540.001197214@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/internal.h | 26 -------------------------- arch/x86/kernel/cpu/bugs.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/fpu/bugs.c | 2 +- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/init.c | 2 +- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/xstate.c | 1 - arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/traps.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/mm/mpx.c | 2 +- arch/x86/power/cpu.c | 2 +- 13 files changed, 11 insertions(+), 38 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 8df83e887ff6..e69de29bb2d1 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -1,26 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes , May 2000 - * x86-64 work by Andi Kleen 2002 - */ - -#ifndef _ASM_X86_FPU_INTERNAL_H -#define _ASM_X86_FPU_INTERNAL_H - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index b290aa9bfb1c..dcb1d2452550 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3570f72622e6..3b05135ec124 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index 2954fab15e51..794e70151203 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -2,7 +2,7 @@ /* * x86 FPU bug checks: */ -#include +#include /* * Boot time CPU/FPU FDIV bug detection code: diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1d899f2ae7d8..b602284f26b0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -6,7 +6,7 @@ * General FPU state handling cleanups * Gareth Hughes , May 2000 */ -#include +#include #include #include #include diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index d420d29e58be..23791355ca67 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -2,7 +2,7 @@ /* * x86 FPU boot time init code: */ -#include +#include #include #include diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index dd939fd7af66..06c0918fe8db 100644 --- 
a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9f6f2b244589..e525803610e0 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -13,7 +13,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 4384f323854b..1c48a520754a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -69,7 +69,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index f1fe995309a4..36efe7268a94 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -51,7 +51,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 053cd25ccf69..66f4e50235c0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 895fb7a9294d..7621c8ad2089 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #define CREATE_TRACE_POINTS #include diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 915bb1639763..8d7569bb704a 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include -- Gitee From eec6c1274f28f5e7dd96ddcf80dd973241ec71c4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:41 +0200 Subject: [PATCH 0754/2161] x86/fpu: Provide a proper function for ex_handler_fprestore() commit 079ec41b22b952cdf3126527d735e373c9125f6d upstream. To make upcoming changes for support of dynamically enabled features simpler, provide a proper function for the exception handler which removes exposure of FPU internals. 
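Illustrative sketch only (the handler name here is made up; the real change to ex_handler_fprestore() is in the diff below): with the wrapper in place the fixup path no longer touches init_fpstate or xfeatures_mask_fpstate() directly:

	static bool fprestore_fixup_sketch(struct pt_regs *regs)
	{
		/* Reinitialize the FPU registers from the kernel init state */
		fpu_reset_from_exception_fixup();
		return true;
	}
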
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011540.053515012@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 4 +--- arch/x86/kernel/fpu/core.c | 5 +++++ arch/x86/kernel/fpu/internal.h | 2 ++ arch/x86/mm/extable.c | 5 ++--- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index ced6a3f8bac0..a1f197ff401b 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -86,6 +86,7 @@ static inline void update_pasid(void) { } /* Trap handling */ extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern void fpu_sync_fpstate(struct fpu *fpu); +extern void fpu_reset_from_exception_fixup(void); /* Boot, hotplug and resume */ extern void fpu__init_cpu(void); @@ -102,9 +103,6 @@ static inline void fpstate_init_soft(struct swregs_state *soft) {} /* State tracking */ DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); -/* fpstate */ -extern union fpregs_state init_fpstate; - /* fpstate-related functions which are exported to KVM */ extern void fpu_init_fpstate_user(struct fpu *fpu); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b602284f26b0..a8996e78ad71 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -155,6 +155,11 @@ void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) } } +void fpu_reset_from_exception_fixup(void) +{ + restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); +} + #if IS_ENABLED(CONFIG_KVM) void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) { diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index bd7f813242dd..479f2db6e160 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -2,6 +2,8 @@ #ifndef __X86_KERNEL_FPU_INTERNAL_H #define __X86_KERNEL_FPU_INTERNAL_H +extern union fpregs_state init_fpstate; + /* CPU feature check wrappers */ static __always_inline __pure bool use_xsave(void) { diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 7c71d57c806a..11a7d67e6725 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -4,8 +4,7 @@ #include #include -#include -#include +#include #include #include @@ -96,7 +95,7 @@ static bool ex_handler_fprestore(const struct exception_table_entry *fixup, WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", (void *)instruction_pointer(regs)); - restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); + fpu_reset_from_exception_fixup(); return true; } -- Gitee From bad53011d3b91ab443331a776115e5449faa8ef2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:17 +0200 Subject: [PATCH 0755/2161] x86/fpu: Replace KVMs home brewed FPU copy to user commit bf5d00470787067ff27593c6a097b5eb6e01168e upstream. Similar to the copy from user function the FPU core has this already implemented with all bells and whistles. Get rid of the duplicated code and use the core functionality. 
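A simplified sketch of the resulting KVM_GET_XSAVE path (function name invented for illustration; the real handler is in the diff below): the core helper emits the non-compacted UABI format with a sanitized header, so the ioctl side reduces to one call:

	static void get_xsave_sketch(struct kvm_vcpu *vcpu, struct kvm_xsave *ubuf)
	{
		fpu_copy_fpstate_to_kvm_uabi(vcpu->arch.guest_fpu, ubuf->region,
					     sizeof(ubuf->region), vcpu->arch.pkru);
	}
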
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: kvm@vger.kernel.org Link: https://lkml.kernel.org/r/20211015011539.244101845@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 1 + arch/x86/kernel/fpu/core.c | 18 +++++++++++ arch/x86/kvm/x86.c | 58 +++------------------------------- 3 files changed, 23 insertions(+), 54 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index a1f197ff401b..360cef8367a7 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -110,5 +110,6 @@ extern void fpu_init_fpstate_user(struct fpu *fpu); extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask); extern int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, u32 *pkru); +extern void fpu_copy_fpstate_to_kvm_uabi(struct fpu *fpu, void *buf, unsigned int size, u32 pkru); #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index a8996e78ad71..2a61ca6d267e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -184,6 +184,24 @@ void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) } EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpu); +void fpu_copy_fpstate_to_kvm_uabi(struct fpu *fpu, void *buf, + unsigned int size, u32 pkru) +{ + union fpregs_state *kstate = &fpu->state; + union fpregs_state *ustate = buf; + struct membuf mb = { .p = buf, .left = size }; + + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { + __copy_xstate_to_uabi_buf(mb, &kstate->xsave, pkru, + XSTATE_COPY_XSAVE); + } else { + memcpy(&ustate->fxsave, &kstate->fxsave, sizeof(ustate->fxsave)); + /* Make it restorable on a XSAVE enabled host */ + ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; + } +} +EXPORT_SYMBOL_GPL(fpu_copy_fpstate_to_kvm_uabi); + int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, u32 *vpkru) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1272911f4141..22bd393eff40 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4018,65 +4018,15 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } -static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) -{ - struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; - u64 xstate_bv = xsave->header.xfeatures; - u64 valid; - - /* - * Copy legacy XSAVE area, to avoid complications with CPUID - * leaves 0 and 1 in the loop below. - */ - memcpy(dest, xsave, XSAVE_HDR_OFFSET); - - /* Set XSTATE_BV */ - xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE; - *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv; - - /* - * Copy each region from the possibly compacted offset to the - * non-compacted offset. 
- */ - valid = xstate_bv & ~XFEATURE_MASK_FPSSE; - while (valid) { - u32 size, offset, ecx, edx; - u64 xfeature_mask = valid & -valid; - int xfeature_nr = fls64(xfeature_mask) - 1; - void *src; - - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - - if (xfeature_nr == XFEATURE_PKRU) { - memcpy(dest + offset, &vcpu->arch.pkru, - sizeof(vcpu->arch.pkru)); - } else { - src = get_xsave_addr(xsave, xfeature_nr); - if (src) - memcpy(dest + offset, src, size); - } - - valid -= xfeature_mask; - } -} - static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { if (!vcpu->arch.guest_fpu) return; - if (boot_cpu_has(X86_FEATURE_XSAVE)) { - memset(guest_xsave, 0, sizeof(struct kvm_xsave)); - fill_xsave((u8 *) guest_xsave->region, vcpu); - } else { - memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu->state.fxsave, - sizeof(struct fxregs_state)); - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = - XFEATURE_MASK_FPSSE; - } + fpu_copy_fpstate_to_kvm_uabi(vcpu->arch.guest_fpu, guest_xsave->region, + sizeof(guest_xsave->region), + vcpu->arch.pkru); } static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, @@ -4087,7 +4037,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, return fpu_copy_kvm_uabi_to_fpstate(vcpu->arch.guest_fpu, guest_xsave->region, - supported_xcr0, &vcpu->arch.pkru); + kvm_supported_xcr0(), &vcpu->arch.pkru); } static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, -- Gitee From 37215fe1e5a15306958a556ee4d95ac8ab123768 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:27 +0200 Subject: [PATCH 0756/2161] x86/fpu: Provide struct fpstate commit 87d0e5be0fac322f4415128def9f16a71a267a40 upstream. New xfeatures will not longer be automatically stored in the regular XSAVE buffer in thread_struct::fpu. The kernel will provide the default sized buffer for storing the regular features up to AVX512 in thread_struct::fpu and if a task requests to use one of the new features then the register storage has to be extended. The state will be accessed via a pointer in thread_struct::fpu which defaults to the builtin storage and can be switched when extended storage is required. To avoid conditionals all over the code, create a new container for the register storage which will gain other information, e.g. size, feature masks etc., later. For now it just contains the register storage, which gives it exactly the same layout as the exiting fpu::state. Stick fpu::state and the new fpu::__fpstate into an anonymous union and initialize the pointer. Add build time checks to validate that both are at the same place and have the same size. This allows step by step conversion of all users. 
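A minimal sketch of the aliasing this relies on (illustrative only): while fpu->fpstate points at the embedded __fpstate and struct fpstate contains nothing but the register union, both access paths name the same storage, which is what makes the step by step conversion safe:

	static void fpstate_alias_sketch(struct fpu *fpu)
	{
		struct xregs_state *old_path = &fpu->state.xsave;
		struct xregs_state *new_path = &fpu->fpstate->regs.xsave;

		/* Identical until all fpu::state users are converted */
		WARN_ON_ONCE(old_path != new_path);
	}
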
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.234458659@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/types.h | 20 +++++++++++++++++++- arch/x86/include/asm/processor.h | 4 ++-- arch/x86/kernel/fpu/core.c | 11 ++++++++++- arch/x86/kernel/fpu/init.c | 9 +++++++-- arch/x86/kernel/fpu/internal.h | 1 + 5 files changed, 39 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 1c7231ce9856..8162d355f542 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -289,6 +289,13 @@ union fpregs_state { u8 __padding[PAGE_SIZE]; }; +struct fpstate { + /* @regs: The register state union for all supported formats */ + union fpregs_state regs; + + /* @regs is dynamically sized! Don't add anything after @regs! */ +} __aligned(64); + /* * Highest level per task FPU state data structure that * contains the FPU register state plus various FPU @@ -316,6 +323,14 @@ struct fpu { */ unsigned long avx512_timestamp; + /* + * @fpstate: + * + * Pointer to the active struct fpstate. Initialized to + * point at @__fpstate below. + */ + struct fpstate *fpstate; + /* * @state: * @@ -325,7 +340,10 @@ struct fpu { * copy. If the task context-switches away then they get * saved here and represent the FPU state. */ - union fpregs_state state; + union { + struct fpstate __fpstate; + union fpregs_state state; + }; /* * WARNING: 'state' is dynamically-sized. Do not put * anything after it here. diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 798079909a73..b80b4c24a9aa 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -505,11 +505,11 @@ struct thread_struct { */ }; -/* Whitelist the FPU state from the task_struct for hardened usercopy. */ +/* Whitelist the FPU register state from the task_struct for hardened usercopy. */ static inline void arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { - *offset = offsetof(struct thread_struct, fpu.state); + *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); *size = fpu_kernel_xstate_size; } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 2a61ca6d267e..9ff23408c187 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -337,10 +337,17 @@ void fpstate_init_user(union fpregs_state *state) fpstate_init_fstate(&state->fsave); } +void fpstate_reset(struct fpu *fpu) +{ + /* Set the fpstate pointer to the default fpstate */ + fpu->fpstate = &fpu->__fpstate; +} + #if IS_ENABLED(CONFIG_KVM) void fpu_init_fpstate_user(struct fpu *fpu) { - fpstate_init_user(&fpu->state); + fpstate_reset(fpu); + fpstate_init_user(&fpu->fpstate->regs); } EXPORT_SYMBOL_GPL(fpu_init_fpstate_user); #endif @@ -354,6 +361,8 @@ int fpu_clone(struct task_struct *dst) /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; + fpstate_reset(dst_fpu); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 23791355ca67..31ecbfba9ff7 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -165,7 +165,7 @@ static void __init fpu__init_task_struct_size(void) * Subtract off the static size of the register state. * It potentially has a bunch of padding. 
*/ - task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state); + task_size -= sizeof(current->thread.fpu.__fpstate.regs); /* * Add back the dynamically-calculated register state @@ -180,10 +180,14 @@ static void __init fpu__init_task_struct_size(void) * you hit a compile error here, check the structure to * see if something got added to the end. */ - CHECK_MEMBER_AT_END_OF(struct fpu, state); + CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate); CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); CHECK_MEMBER_AT_END_OF(struct task_struct, thread); + BUILD_BUG_ON(sizeof(struct fpstate) != sizeof(union fpregs_state)); + BUILD_BUG_ON(offsetof(struct thread_struct, fpu.state) != + offsetof(struct thread_struct, fpu.__fpstate)); + arch_task_struct_size = task_size; } @@ -220,6 +224,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) */ void __init fpu__init_system(struct cpuinfo_x86 *c) { + fpstate_reset(¤t->thread.fpu); fpu__init_system_early_generic(c); /* diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index 479f2db6e160..63bd75fe95a8 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -26,5 +26,6 @@ extern void fpu__init_prepare_fx_sw_frame(void); /* Used in init.c */ extern void fpstate_init_user(union fpregs_state *state); +extern void fpstate_reset(struct fpu *fpu); #endif -- Gitee From 36ffba7927d1e9010d022483af716b693b4bbbb2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:28 +0200 Subject: [PATCH 0757/2161] x86/fpu: Convert fpstate_init() to struct fpstate commit f83ac56acdad0815366bb541b6cc9d24f6cea2b2 upstream. Convert fpstate_init() and related code to the new register storage mechanism in preparation for dynamically sized buffers. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.292157401@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/core.c | 44 +++++++++++++++++----------------- arch/x86/kernel/fpu/internal.h | 4 ++-- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/fpu/xstate.c | 12 +++++----- 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 9ff23408c187..e8595422dc49 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -29,7 +29,7 @@ * Represents the initial FPU state. 
It's mostly (but not completely) zeroes, * depending on the FPU hardware format: */ -union fpregs_state init_fpstate __ro_after_init; +struct fpstate init_fpstate __ro_after_init; /* * Track whether the kernel is using the FPU state @@ -157,7 +157,7 @@ void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) void fpu_reset_from_exception_fixup(void) { - restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); + restore_fpregs_from_fpstate(&init_fpstate.regs, xfeatures_mask_fpstate()); } #if IS_ENABLED(CONFIG_KVM) @@ -297,24 +297,24 @@ static inline unsigned int init_fpstate_copy_size(void) return fpu_kernel_xstate_size; /* XSAVE(S) just needs the legacy and the xstate header part */ - return sizeof(init_fpstate.xsave); + return sizeof(init_fpstate.regs.xsave); } -static inline void fpstate_init_fxstate(struct fxregs_state *fx) +static inline void fpstate_init_fxstate(struct fpstate *fpstate) { - fx->cwd = 0x37f; - fx->mxcsr = MXCSR_DEFAULT; + fpstate->regs.fxsave.cwd = 0x37f; + fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT; } /* * Legacy x87 fpstate state init: */ -static inline void fpstate_init_fstate(struct fregs_state *fp) +static inline void fpstate_init_fstate(struct fpstate *fpstate) { - fp->cwd = 0xffff037fu; - fp->swd = 0xffff0000u; - fp->twd = 0xffffffffu; - fp->fos = 0xffff0000u; + fpstate->regs.fsave.cwd = 0xffff037fu; + fpstate->regs.fsave.swd = 0xffff0000u; + fpstate->regs.fsave.twd = 0xffffffffu; + fpstate->regs.fsave.fos = 0xffff0000u; } /* @@ -322,19 +322,19 @@ static inline void fpstate_init_fstate(struct fregs_state *fp) * 1) Early boot to setup init_fpstate for non XSAVE systems * 2) fpu_init_fpstate_user() which is invoked from KVM */ -void fpstate_init_user(union fpregs_state *state) +void fpstate_init_user(struct fpstate *fpstate) { if (!cpu_feature_enabled(X86_FEATURE_FPU)) { - fpstate_init_soft(&state->soft); + fpstate_init_soft(&fpstate->regs.soft); return; } - xstate_init_xcomp_bv(&state->xsave, xfeatures_mask_uabi()); + xstate_init_xcomp_bv(&fpstate->regs.xsave, xfeatures_mask_uabi()); if (cpu_feature_enabled(X86_FEATURE_FXSR)) - fpstate_init_fxstate(&state->fxsave); + fpstate_init_fxstate(fpstate); else - fpstate_init_fstate(&state->fsave); + fpstate_init_fstate(fpstate); } void fpstate_reset(struct fpu *fpu) @@ -347,7 +347,7 @@ void fpstate_reset(struct fpu *fpu) void fpu_init_fpstate_user(struct fpu *fpu) { fpstate_reset(fpu); - fpstate_init_user(&fpu->fpstate->regs); + fpstate_init_user(fpu->fpstate); } EXPORT_SYMBOL_GPL(fpu_init_fpstate_user); #endif @@ -378,7 +378,7 @@ int fpu_clone(struct task_struct *dst) */ if (dst->flags & (PF_KTHREAD)) { /* Clear out the minimal state */ - memcpy(&dst_fpu->state, &init_fpstate, + memcpy(&dst_fpu->state, &init_fpstate.regs, init_fpstate_copy_size()); return 0; } @@ -435,11 +435,11 @@ void fpu__drop(struct fpu *fpu) static inline void restore_fpregs_from_init_fpstate(u64 features_mask) { if (use_xsave()) - os_xrstor(&init_fpstate.xsave, features_mask); + os_xrstor(&init_fpstate.regs.xsave, features_mask); else if (use_fxsr()) - fxrstor(&init_fpstate.fxsave); + fxrstor(&init_fpstate.regs.fxsave); else - frstor(&init_fpstate.fsave); + frstor(&init_fpstate.regs.fsave); pkru_write_default(); } @@ -466,7 +466,7 @@ static void fpu_reset_fpstate(void) * user space as PKRU is eagerly written in switch_to() and * flush_thread(). 
*/ - memcpy(&fpu->state, &init_fpstate, init_fpstate_copy_size()); + memcpy(&fpu->state, &init_fpstate.regs, init_fpstate_copy_size()); set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); } diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index 63bd75fe95a8..e1d8a352f12d 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -2,7 +2,7 @@ #ifndef __X86_KERNEL_FPU_INTERNAL_H #define __X86_KERNEL_FPU_INTERNAL_H -extern union fpregs_state init_fpstate; +extern struct fpstate init_fpstate; /* CPU feature check wrappers */ static __always_inline __pure bool use_xsave(void) @@ -25,7 +25,7 @@ static __always_inline __pure bool use_fxsr(void) extern void fpu__init_prepare_fx_sw_frame(void); /* Used in init.c */ -extern void fpstate_init_user(union fpregs_state *state); +extern void fpstate_init_user(struct fpstate *fpstate); extern void fpstate_reset(struct fpu *fpu); #endif diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index b829b7a9f995..6f2d6d637600 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -243,7 +243,7 @@ static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, ret = fxrstor_from_user_sigframe(buf); if (!ret && unlikely(init_bv)) - os_xrstor(&init_fpstate.xsave, init_bv); + os_xrstor(&init_fpstate.regs.xsave, init_bv); return ret; } else if (use_fxsr()) { return fxrstor_from_user_sigframe(buf); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index e525803610e0..c79000aa16fc 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -405,12 +405,12 @@ static void __init setup_init_fpu_buf(void) setup_xstate_features(); print_xstate_features(); - xstate_init_xcomp_bv(&init_fpstate.xsave, xfeatures_mask_all); + xstate_init_xcomp_bv(&init_fpstate.regs.xsave, xfeatures_mask_all); /* * Init all the features state with header.xfeatures being 0x0 */ - os_xrstor_booting(&init_fpstate.xsave); + os_xrstor_booting(&init_fpstate.regs.xsave); /* * All components are now in init state. Read the state back so @@ -428,7 +428,7 @@ static void __init setup_init_fpu_buf(void) * state is all zeroes or if not to add the necessary handling * here. */ - fxsave(&init_fpstate.fxsave); + fxsave(&init_fpstate.regs.fxsave); } static int xfeature_uncompacted_offset(int xfeature_nr) @@ -669,11 +669,11 @@ static unsigned int __init get_xsave_size(void) */ static bool __init is_supported_xstate_size(unsigned int test_xstate_size) { - if (test_xstate_size <= sizeof(union fpregs_state)) + if (test_xstate_size <= sizeof(init_fpstate.regs)) return true; pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n", - sizeof(union fpregs_state), test_xstate_size); + sizeof(init_fpstate.regs), test_xstate_size); return false; } @@ -1008,7 +1008,7 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, u32 pkru_val, enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); - struct xregs_state *xinit = &init_fpstate.xsave; + struct xregs_state *xinit = &init_fpstate.regs.xsave; struct xstate_header header; unsigned int zerofrom; u64 mask; -- Gitee From 0beb1932884227fdea752e6fd23acd09831d2490 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:30 +0200 Subject: [PATCH 0758/2161] x86/fpu: Convert restore_fpregs_from_fpstate() to struct fpstate commit 18b3fa1ad15fa8d777ac32f117553cce1a968460 upstream. 
Convert restore_fpregs_from_fpstate() and related code to the new register storage mechanism in preparation for dynamically sized buffers. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.347395546@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/signal.h | 2 +- arch/x86/kernel/fpu/context.h | 2 +- arch/x86/kernel/fpu/core.c | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 9a63a21c219d..22b0273a8bf1 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -40,7 +40,7 @@ extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size extern void fpu__clear_user_states(struct fpu *fpu); extern bool fpu__restore_sig(void __user *buf, int ia32_frame); -extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); +extern void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask); extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h index e652282842c8..f8f510519688 100644 --- a/arch/x86/kernel/fpu/context.h +++ b/arch/x86/kernel/fpu/context.h @@ -74,7 +74,7 @@ static inline void fpregs_restore_userregs(void) */ mask = xfeatures_mask_restore_user() | xfeatures_mask_supervisor(); - restore_fpregs_from_fpstate(&fpu->state, mask); + restore_fpregs_from_fpstate(fpu->fpstate, mask); fpregs_activate(fpu); fpu->last_cpu = cpu; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e8595422dc49..3a308cc48c97 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -129,7 +129,7 @@ void save_fpregs_to_fpstate(struct fpu *fpu) frstor(&fpu->state.fsave); } -void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) +void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) { /* * AMD K7/K8 and later CPUs up to Zen don't save/restore @@ -146,18 +146,18 @@ void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) } if (use_xsave()) { - os_xrstor(&fpstate->xsave, mask); + os_xrstor(&fpstate->regs.xsave, mask); } else { if (use_fxsr()) - fxrstor(&fpstate->fxsave); + fxrstor(&fpstate->regs.fxsave); else - frstor(&fpstate->fsave); + frstor(&fpstate->regs.fsave); } } void fpu_reset_from_exception_fixup(void) { - restore_fpregs_from_fpstate(&init_fpstate.regs, xfeatures_mask_fpstate()); + restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); } #if IS_ENABLED(CONFIG_KVM) @@ -176,7 +176,7 @@ void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) if (rstor) { restore_mask &= xfeatures_mask_fpstate(); - restore_fpregs_from_fpstate(&rstor->state, restore_mask); + restore_fpregs_from_fpstate(rstor->fpstate, restore_mask); } fpregs_mark_activate(); -- Gitee From cc4bdf82014bac4055a97f320c7ce4a6f6ea75f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:31 +0200 Subject: [PATCH 0759/2161] x86/fpu: Replace KVMs xstate component clearing commit 087df48c298c1cb829f4cd468d90f93234b1bc44 upstream. In order to prepare for the support of dynamically enabled FPU features, move the clearing of xstate components to the FPU core code. No functional change. 
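As a rough user-space illustration of what the new core helper does (the component offsets below are toy values, nothing like the real XSAVE layout): locate one state component inside the register image and clear just that component, leaving everything else untouched.

/* Illustrative sketch only; offsets, sizes and names are made up. */
#include <stdio.h>
#include <string.h>

#define DEMO_NR_COMPONENTS 3

static const struct { unsigned int offset, size; } demo_layout[DEMO_NR_COMPONENTS] = {
	{ .offset =   0, .size = 160 },	/* "legacy FP/SSE"-like */
	{ .offset = 160, .size =  64 },	/* "BNDREGS"-like       */
	{ .offset = 224, .size =  32 },	/* "BNDCSR"-like        */
};

struct demo_fpstate {
	unsigned char regs[256];
};

/* Shape of the idea behind fpstate_clear_xstate_component(): clear one component. */
static void demo_clear_component(struct demo_fpstate *fps, unsigned int nr)
{
	if (nr >= DEMO_NR_COMPONENTS)
		return;
	memset(fps->regs + demo_layout[nr].offset, 0, demo_layout[nr].size);
}

int main(void)
{
	struct demo_fpstate fps;

	memset(fps.regs, 0xff, sizeof(fps.regs));
	demo_clear_component(&fps, 1);		/* clear only component 1 */
	printf("%02x %02x %02x\n", fps.regs[0], fps.regs[160], fps.regs[224]);
	/* prints: ff 00 ff */
	return 0;
}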
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: kvm@vger.kernel.org Link: https://lkml.kernel.org/r/20211013145322.399567049@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 1 + arch/x86/include/asm/fpu/xstate.h | 1 - arch/x86/kernel/fpu/xstate.c | 12 +++++++++++- arch/x86/kernel/fpu/xstate.h | 2 ++ arch/x86/kvm/x86.c | 14 +++++--------- 5 files changed, 19 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 360cef8367a7..e78b03c477fd 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -105,6 +105,7 @@ DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* fpstate-related functions which are exported to KVM */ extern void fpu_init_fpstate_user(struct fpu *fpu); +extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature); /* KVM specific functions */ extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask); diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index df3396a72dbc..ee5481ea5400 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -128,7 +128,6 @@ extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask); -void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); const void *get_xsave_field_ptr(int xfeature_nr); int xfeature_size(int xfeature_nr); int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c79000aa16fc..f04da95f73e5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -905,7 +905,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) return __raw_xsave_addr(xsave, xfeature_nr); } -EXPORT_SYMBOL_GPL(get_xsave_addr); /* * This wraps up the common operations that need to occur when retrieving @@ -1284,6 +1283,17 @@ void xrstors(struct xregs_state *xstate, u64 mask) WARN_ON_ONCE(err); } +#if IS_ENABLED(CONFIG_KVM) +void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) +{ + void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); + + if (addr) + memset(addr, 0, xstate_sizes[xfeature]); +} +EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); +#endif + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index bb6d7d298d2a..99f8cfec719d 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -21,6 +21,8 @@ extern void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsav extern void fpu__init_cpu_xstate(void); extern void fpu__init_system_xstate(void); +extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); + /* XSAVE/XRSTOR wrapper functions */ #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 22bd393eff40..25cbefdc34e6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9216,7 +9216,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.apf.halted = false; if (vcpu->arch.guest_fpu && kvm_mpx_supported()) { - void *mpx_state_buffer; + struct fpstate *fpstate = vcpu->arch.guest_fpu->fpstate; /* * To avoid have the INIT path from kvm_apic_has_events() that be @@ -9224,14 +9224,10 @@ void 
kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) */ if (init_event) kvm_put_guest_fpu(vcpu); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_BNDREGS); - if (mpx_state_buffer) - memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_BNDCSR); - if (mpx_state_buffer) - memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); + + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS); + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR); + if (init_event) kvm_load_guest_fpu(vcpu); } -- Gitee From 8d5a4cf51b7ddcd86b7c56734d696a8f4d867b21 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:33 +0200 Subject: [PATCH 0760/2161] x86/KVM: Convert to fpstate commit 1c57572d754fc54e0b8ac0df5350969ce6292d12 upstream. Convert KVM code to the new register storage mechanism in preparation for dynamically sized buffers. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Paolo Bonzini Cc: kvm@vger.kernel.org Link: https://lkml.kernel.org/r/20211013145322.451439983@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 25cbefdc34e6..260172cd7f70 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9012,7 +9012,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->state.fxsave; + fxsave = &vcpu->arch.guest_fpu->fpstate->regs.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; fpu->fsw = fxsave->swd; @@ -9035,7 +9035,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->state.fxsave; + fxsave = &vcpu->arch.guest_fpu->fpstate->regs.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; -- Gitee From bb069ef23be0deaeddd93f28449875fe8c680b62 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:34 +0200 Subject: [PATCH 0761/2161] x86/fpu: Convert tracing to fpstate commit cceb496420fa11a6e11989abc68b8e7564dc40f9 upstream. Convert FPU tracing code to the new register storage mechanism in preparation for dynamically sized buffers. No functional change. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.503327333@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/trace/fpu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index fd86d4bd23df..39c7ebe9a3e7 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -22,8 +22,8 @@ DECLARE_EVENT_CLASS(x86_fpu, __entry->fpu = fpu; __entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD); if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { - __entry->xfeatures = fpu->state.xsave.header.xfeatures; - __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; + __entry->xfeatures = fpu->fpstate->regs.xsave.header.xfeatures; + __entry->xcomp_bv = fpu->fpstate->regs.xsave.header.xcomp_bv; } ), TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx", -- Gitee From eef3f82a31d81aff8a0f57acdab841978b9cd940 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:36 +0200 Subject: [PATCH 0762/2161] x86/fpu/regset: Convert to fpstate commit caee31a36c33ed7788d0b3d93a663860157f6c55 upstream. Convert regset related code to the new register storage mechanism in preparation for dynamically sized buffers. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.555239736@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/regset.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 06c0918fe8db..0f068ba7b93d 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -78,8 +78,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, sync_fpstate(fpu); if (!use_xsave()) { - return membuf_write(&to, &fpu->state.fxsave, - sizeof(fpu->state.fxsave)); + return membuf_write(&to, &fpu->fpstate->regs.fxsave, + sizeof(fpu->fpstate->regs.fxsave)); } copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_FX); @@ -114,15 +114,15 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, fpu_force_restore(fpu); /* Copy the state */ - memcpy(&fpu->state.fxsave, &newstate, sizeof(newstate)); + memcpy(&fpu->fpstate->regs.fxsave, &newstate, sizeof(newstate)); /* Clear xmm8..15 */ - BUILD_BUG_ON(sizeof(fpu->state.fxsave.xmm_space) != 16 * 16); - memset(&fpu->state.fxsave.xmm_space[8], 0, 8 * 16); + BUILD_BUG_ON(sizeof(fpu->__fpstate.regs.fxsave.xmm_space) != 16 * 16); + memset(&fpu->fpstate->regs.fxsave.xmm_space[8], 0, 8 * 16); /* Mark FP and SSE as in use when XSAVE is enabled */ if (use_xsave()) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; return 0; } @@ -168,7 +168,8 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, } fpu_force_restore(fpu); - ret = copy_uabi_from_kernel_to_xstate(&fpu->state.xsave, kbuf ?: tmpbuf); + ret = copy_uabi_from_kernel_to_xstate(&fpu->fpstate->regs.xsave, + kbuf ?: tmpbuf); out: vfree(tmpbuf); @@ -287,7 +288,7 @@ static void __convert_from_fxsr(struct user_i387_ia32_struct *env, void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) { - __convert_from_fxsr(env, tsk, &tsk->thread.fpu.state.fxsave); + __convert_from_fxsr(env, tsk, 
&tsk->thread.fpu.fpstate->regs.fxsave); } void convert_to_fxsr(struct fxregs_state *fxsave, @@ -330,7 +331,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, return fpregs_soft_get(target, regset, to); if (!cpu_feature_enabled(X86_FEATURE_FXSR)) { - return membuf_write(&to, &fpu->state.fsave, + return membuf_write(&to, &fpu->fpstate->regs.fsave, sizeof(struct fregs_state)); } @@ -341,7 +342,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, copy_xstate_to_uabi_buf(mb, target, XSTATE_COPY_FP); fx = &fxsave; } else { - fx = &fpu->state.fxsave; + fx = &fpu->fpstate->regs.fxsave; } __convert_from_fxsr(&env, target, fx); @@ -370,16 +371,16 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, fpu_force_restore(fpu); if (cpu_feature_enabled(X86_FEATURE_FXSR)) - convert_to_fxsr(&fpu->state.fxsave, &env); + convert_to_fxsr(&fpu->fpstate->regs.fxsave, &env); else - memcpy(&fpu->state.fsave, &env, sizeof(env)); + memcpy(&fpu->fpstate->regs.fsave, &env, sizeof(env)); /* * Update the header bit in the xsave header, indicating the * presence of FP. */ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP; + fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FP; return 0; } -- Gitee From 64dfdfb2664aaca0b20a7682d0d01e3a38f68144 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:37 +0200 Subject: [PATCH 0763/2161] x86/fpu/signal: Convert to fpstate commit 7e049e8b74591038c831e765585ae9038b7880a1 upstream. Convert signal related code to the new register storage mechanism in preparation for dynamically sized buffers. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.607370221@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 6f2d6d637600..e5e72125f9d5 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -72,13 +72,13 @@ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) { if (use_fxsr()) { - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; + struct xregs_state *xsave = &tsk->thread.fpu.fpstate->regs.xsave; struct user_i387_ia32_struct env; struct _fpstate_32 __user *fp = buf; fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - fxsave(&tsk->thread.fpu.state.fxsave); + fxsave(&tsk->thread.fpu.fpstate->regs.fxsave); fpregs_unlock(); convert_from_fxsr(&env, tsk); @@ -303,7 +303,7 @@ static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, * been restored from a user buffer directly. 
*/ if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor()) - os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); + os_xrstor(&fpu->fpstate->regs.xsave, xfeatures_mask_supervisor()); fpregs_mark_activate(); fpregs_unlock(); @@ -317,6 +317,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; + union fpregs_state *fpregs; u64 user_xfeatures = 0; bool fx_only = false; bool success; @@ -349,6 +350,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, if (__copy_from_user(&env, buf, sizeof(env))) return false; + fpregs = &fpu->fpstate->regs; /* * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is * not modified on context switch and that the xstate is considered @@ -366,7 +368,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, * the right place in memory. It's ia32 mode. Shrug. */ if (xfeatures_mask_supervisor()) - os_xsave(&fpu->state.xsave); + os_xsave(&fpregs->xsave); set_thread_flag(TIF_NEED_FPU_LOAD); } __fpu_invalidate_fpregs_state(fpu); @@ -374,29 +376,29 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, fpregs_unlock(); if (use_xsave() && !fx_only) { - if (copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx)) + if (copy_sigframe_from_user_to_xstate(&fpregs->xsave, buf_fx)) return false; } else { - if (__copy_from_user(&fpu->state.fxsave, buf_fx, - sizeof(fpu->state.fxsave))) + if (__copy_from_user(&fpregs->fxsave, buf_fx, + sizeof(fpregs->fxsave))) return false; if (IS_ENABLED(CONFIG_X86_64)) { /* Reject invalid MXCSR values. */ - if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask) + if (fpregs->fxsave.mxcsr & ~mxcsr_feature_mask) return false; } else { /* Mask invalid bits out for historical reasons (broken hardware). */ - fpu->state.fxsave.mxcsr &= mxcsr_feature_mask; + fpregs->fxsave.mxcsr &= mxcsr_feature_mask; } /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */ if (use_xsave()) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + fpregs->xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; } /* Fold the legacy FP storage */ - convert_to_fxsr(&fpu->state.fxsave, &env); + convert_to_fxsr(&fpregs->fxsave, &env); fpregs_lock(); if (use_xsave()) { @@ -411,10 +413,10 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, */ u64 mask = user_xfeatures | xfeatures_mask_supervisor(); - fpu->state.xsave.header.xfeatures &= mask; - success = !os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all); + fpregs->xsave.header.xfeatures &= mask; + success = !os_xrstor_safe(&fpregs->xsave, xfeatures_mask_all); } else { - success = !fxrstor_safe(&fpu->state.fxsave); + success = !fxrstor_safe(&fpregs->fxsave); } if (likely(success)) -- Gitee From 7fe729aaabbf3499abfe3e37a3d2f6864c8bc2bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 27 Oct 2020 11:09:50 +0100 Subject: [PATCH 0764/2161] x86/fpu: Simplify fpregs_[un]lock() commit opencloudos. There is no point in disabling preemption and then disabling bottom halfs. Just disabling bottom halfs is sufficient as it implicitly disables preemption on !RT kernels. 
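A runnable toy sketch of the resulting locking shape (the kernel primitives are stubbed with prints here, and the RT branch anticipates the follow-up RT-friendly change below): on !RT kernels disabling bottom halves alone is enough, since it implicitly disables preemption.

/* Illustrative sketch only; the *_disable()/_enable() calls are user-space stubs. */
#include <stdbool.h>
#include <stdio.h>

static const bool demo_preempt_rt = false;	/* stands in for CONFIG_PREEMPT_RT */

static void local_bh_disable(void) { puts("local_bh_disable()  (implies no preemption on !RT)"); }
static void local_bh_enable(void)  { puts("local_bh_enable()"); }
static void preempt_disable(void)  { puts("preempt_disable()   (RT: BHs run in threads anyway)"); }
static void preempt_enable(void)   { puts("preempt_enable()"); }

static void fpregs_lock(void)
{
	if (!demo_preempt_rt)
		local_bh_disable();
	else
		preempt_disable();
}

static void fpregs_unlock(void)
{
	if (!demo_preempt_rt)
		local_bh_enable();
	else
		preempt_enable();
}

int main(void)
{
	fpregs_lock();
	puts("... touch the FPU register state here ...");
	fpregs_unlock();
	return 0;
}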
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201027101349.455380473@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index e78b03c477fd..814533e5506c 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -42,17 +42,18 @@ static inline void kernel_fpu_begin(void) * A context switch will (and softirq might) save CPU's FPU registers to * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in * a random state. + * + * local_bh_disable() protects against both preemption and soft interrupts + * on !RT kernels. */ static inline void fpregs_lock(void) { - preempt_disable(); local_bh_disable(); } static inline void fpregs_unlock(void) { local_bh_enable(); - preempt_enable(); } #ifdef CONFIG_X86_DEBUG_FPU -- Gitee From 5c8ae7bfd052b9c00f20fefcbc269d5f511393ab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 27 Oct 2020 11:09:51 +0100 Subject: [PATCH 0765/2161] x86/fpu: Make kernel FPU protection RT friendly commit cba08c5dc6dc1a906a0b5ddac9a9ac6c9a64f2e8 upstream. Non RT kernels need to protect FPU against preemption and bottom half processing. This is achieved by disabling bottom halfs via local_bh_disable() which implictly disables preemption. On RT kernels this protection mechanism is not sufficient because local_bh_disable() does not disable preemption. It serializes bottom half related processing via a CPU local lock. As bottom halfs are running always in thread context on RT kernels disabling preemption is the proper choice as it implicitly prevents bottom half processing. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201027101349.588965083@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 814533e5506c..d5696457ae55 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -45,15 +45,29 @@ static inline void kernel_fpu_begin(void) * * local_bh_disable() protects against both preemption and soft interrupts * on !RT kernels. + * + * On RT kernels local_bh_disable() is not sufficient because it only + * serializes soft interrupt related sections via a local lock, but stays + * preemptible. Disabling preemption is the right choice here as bottom + * half processing is always in thread context on RT kernels so it + * implicitly prevents bottom half processing as well. + * + * Disabling preemption also serializes against kernel_fpu_begin(). */ static inline void fpregs_lock(void) { - local_bh_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_disable(); + else + preempt_disable(); } static inline void fpregs_unlock(void) { - local_bh_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_enable(); + else + preempt_enable(); } #ifdef CONFIG_X86_DEBUG_FPU -- Gitee From 5946bf0fed719fa172899efde302a484f149c61f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:39 +0200 Subject: [PATCH 0766/2161] x86/fpu/core: Convert to fpstate commit c20942ce5128ef92e2c451f943ba33462ad2fbc4 upstream. Convert the rest of the core code to the new register storage mechanism in preparation for dynamically sized buffers. No functional change. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.659456185@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 4 ++-- arch/x86/kernel/fpu/core.c | 44 ++++++++++++++++++---------------- arch/x86/kernel/fpu/init.c | 2 +- arch/x86/kernel/fpu/xstate.c | 2 +- 4 files changed, 27 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index d5696457ae55..462f408f2e92 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -38,9 +38,9 @@ static inline void kernel_fpu_begin(void) } /* - * Use fpregs_lock() while editing CPU's FPU registers or fpu->state. + * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate. * A context switch will (and softirq might) save CPU's FPU registers to - * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in + * fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in * a random state. * * local_bh_disable() protects against both preemption and soft interrupts diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 3a308cc48c97..3a89709abc8a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -89,7 +89,7 @@ bool irq_fpu_usable(void) EXPORT_SYMBOL(irq_fpu_usable); /* - * Save the FPU register state in fpu->state. The register state is + * Save the FPU register state in fpu->fpstate->regs. The register state is * preserved. * * Must be called with fpregs_lock() held. @@ -105,19 +105,19 @@ EXPORT_SYMBOL(irq_fpu_usable); void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { - os_xsave(&fpu->state.xsave); + os_xsave(&fpu->fpstate->regs.xsave); /* * AVX512 state is tracked here because its use is * known to slow the max clock speed of the core. */ - if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) + if (fpu->fpstate->regs.xsave.header.xfeatures & XFEATURE_MASK_AVX512) fpu->avx512_timestamp = jiffies; return; } if (likely(use_fxsr())) { - fxsave(&fpu->state.fxsave); + fxsave(&fpu->fpstate->regs.fxsave); return; } @@ -125,8 +125,8 @@ void save_fpregs_to_fpstate(struct fpu *fpu) * Legacy FPU register saving, FNSAVE always clears FPU registers, * so we have to reload them from the memory state. 
*/ - asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); - frstor(&fpu->state.fsave); + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave)); + frstor(&fpu->fpstate->regs.fsave); } void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) @@ -167,7 +167,8 @@ void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) if (save) { if (test_thread_flag(TIF_NEED_FPU_LOAD)) { - memcpy(&save->state, ¤t->thread.fpu.state, + memcpy(&save->fpstate->regs, + ¤t->thread.fpu.fpstate->regs, fpu_kernel_xstate_size); } else { save_fpregs_to_fpstate(save); @@ -187,7 +188,7 @@ EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpu); void fpu_copy_fpstate_to_kvm_uabi(struct fpu *fpu, void *buf, unsigned int size, u32 pkru) { - union fpregs_state *kstate = &fpu->state; + union fpregs_state *kstate = &fpu->fpstate->regs; union fpregs_state *ustate = buf; struct membuf mb = { .p = buf, .left = size }; @@ -205,7 +206,7 @@ EXPORT_SYMBOL_GPL(fpu_copy_fpstate_to_kvm_uabi); int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, u32 *vpkru) { - union fpregs_state *kstate = &fpu->state; + union fpregs_state *kstate = &fpu->fpstate->regs; const union fpregs_state *ustate = buf; struct pkru_state *xpkru; int ret; @@ -378,7 +379,7 @@ int fpu_clone(struct task_struct *dst) */ if (dst->flags & (PF_KTHREAD)) { /* Clear out the minimal state */ - memcpy(&dst_fpu->state, &init_fpstate.regs, + memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); return 0; } @@ -389,11 +390,12 @@ int fpu_clone(struct task_struct *dst) * child's FPU context, without any memory-to-memory copying. */ fpregs_lock(); - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size); - - else + if (test_thread_flag(TIF_NEED_FPU_LOAD)) { + memcpy(&dst_fpu->fpstate->regs, &src_fpu->fpstate->regs, + fpu_kernel_xstate_size); + } else { save_fpregs_to_fpstate(dst_fpu); + } fpregs_unlock(); trace_x86_fpu_copy_src(src_fpu); @@ -466,7 +468,7 @@ static void fpu_reset_fpstate(void) * user space as PKRU is eagerly written in switch_to() and * flush_thread(). */ - memcpy(&fpu->state, &init_fpstate.regs, init_fpstate_copy_size()); + memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); } @@ -493,7 +495,7 @@ void fpu__clear_user_states(struct fpu *fpu) */ if (xfeatures_mask_supervisor() && !fpregs_state_valid(fpu, smp_processor_id())) { - os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); + os_xrstor(&fpu->fpstate->regs.xsave, xfeatures_mask_supervisor()); } /* Reset user states in registers. */ @@ -574,11 +576,11 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) * fully reproduce the context of the exception. 
*/ if (boot_cpu_has(X86_FEATURE_FXSR)) { - cwd = fpu->state.fxsave.cwd; - swd = fpu->state.fxsave.swd; + cwd = fpu->fpstate->regs.fxsave.cwd; + swd = fpu->fpstate->regs.fxsave.swd; } else { - cwd = (unsigned short)fpu->state.fsave.cwd; - swd = (unsigned short)fpu->state.fsave.swd; + cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd; + swd = (unsigned short)fpu->fpstate->regs.fsave.swd; } err = swd & ~cwd; @@ -592,7 +594,7 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) unsigned short mxcsr = MXCSR_DEFAULT; if (boot_cpu_has(X86_FEATURE_XMM)) - mxcsr = fpu->state.fxsave.mxcsr; + mxcsr = fpu->fpstate->regs.fxsave.mxcsr; err = ~(mxcsr >> 7) & mxcsr; } diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 31ecbfba9ff7..b524cd053114 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -38,7 +38,7 @@ static void fpu__init_cpu_generic(void) /* Flush out any pending x87 state: */ #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU)) - fpstate_init_soft(¤t->thread.fpu.state.soft); + fpstate_init_soft(¤t->thread.fpu.fpstate->regs.soft); else #endif asm volatile ("fninit"); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index f04da95f73e5..b4b05a46b519 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1121,7 +1121,7 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { - __copy_xstate_to_uabi_buf(to, &tsk->thread.fpu.state.xsave, + __copy_xstate_to_uabi_buf(to, &tsk->thread.fpu.fpstate->regs.xsave, tsk->thread.pkru, copy_mode); } -- Gitee From 3b07e80fd9c2546bbb55d46df23b0b8e36d5f12a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:40 +0200 Subject: [PATCH 0767/2161] x86/math-emu: Convert to fpstate commit 63d6bdf36ce1541e656966604c12ac4d9fc5d1f0 upstream. Convert math emulation code to the new register storage mechanism in preparation for dynamically sized buffers. No functional change. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.711347464@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/math-emu/fpu_aux.c | 2 +- arch/x86/math-emu/fpu_entry.c | 4 ++-- arch/x86/math-emu/fpu_system.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index 034748459482..d62662bdd460 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -53,7 +53,7 @@ void fpstate_init_soft(struct swregs_state *soft) void finit(void) { - fpstate_init_soft(¤t->thread.fpu.state.soft); + fpstate_init_soft(¤t->thread.fpu.fpstate->regs.soft); } /* diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 50195e249753..7fe56c594aa6 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -640,7 +640,7 @@ int fpregs_soft_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct swregs_state *s387 = &target->thread.fpu.state.soft; + struct swregs_state *s387 = &target->thread.fpu.fpstate->regs.soft; void *space = s387->st_space; int ret; int offset, other, i, tags, regnr, tag, newtop; @@ -691,7 +691,7 @@ int fpregs_soft_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct swregs_state *s387 = &target->thread.fpu.state.soft; + struct swregs_state *s387 = &target->thread.fpu.fpstate->regs.soft; const void *space = s387->st_space; int offset = (S387->ftop & 7) * 10, other = 80 - offset; diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index 9b41391867dc..eec3e4805c75 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -73,7 +73,7 @@ static inline bool seg_writable(struct desc_struct *d) return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE; } -#define I387 (¤t->thread.fpu.state) +#define I387 (¤t->thread.fpu.fpstate->regs) #define FPU_info (I387->soft.info) #define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs)) -- Gitee From 2664ae490313ff397e1fb3a7a1e5e326f0b29e77 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:42 +0200 Subject: [PATCH 0768/2161] x86/fpu: Remove fpu::state commit 2f27b5034244c4ebd70c90066defa771a99a5320 upstream. All users converted. Remove it along with the sanity checks. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.765063318@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/types.h | 18 +++++++----------- arch/x86/kernel/fpu/init.c | 4 ---- arch/x86/kernel/fpu/xstate.c | 2 +- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 8162d355f542..27529e3240d4 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -332,20 +332,16 @@ struct fpu { struct fpstate *fpstate; /* - * @state: + * @__fpstate: * - * In-memory copy of all FPU registers that we save/restore - * over context switches. If the task is using the FPU then - * the registers in the FPU are more recent than this state - * copy. If the task context-switches away then they get - * saved here and represent the FPU state. + * Initial in-memory storage for FPU registers which are saved in + * context switch and when the kernel uses the FPU. 
The registers + * are restored from this storage on return to user space if they + * are not longer containing the tasks FPU register state. */ - union { - struct fpstate __fpstate; - union fpregs_state state; - }; + struct fpstate __fpstate; /* - * WARNING: 'state' is dynamically-sized. Do not put + * WARNING: '__fpstate' is dynamically-sized. Do not put * anything after it here. */ }; diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index b524cd053114..cffbaf491886 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -184,10 +184,6 @@ static void __init fpu__init_task_struct_size(void) CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); CHECK_MEMBER_AT_END_OF(struct task_struct, thread); - BUILD_BUG_ON(sizeof(struct fpstate) != sizeof(union fpregs_state)); - BUILD_BUG_ON(offsetof(struct thread_struct, fpu.state) != - offsetof(struct thread_struct, fpu.__fpstate)); - arch_task_struct_size = task_size; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b4b05a46b519..11d28bd03c20 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -933,7 +933,7 @@ const void *get_xsave_field_ptr(int xfeature_nr) */ fpu_sync_fpstate(fpu); - return get_xsave_addr(&fpu->state.xsave, xfeature_nr); + return get_xsave_addr(&fpu->fpstate->regs.xsave, xfeature_nr); } #ifdef CONFIG_ARCH_HAS_PKEYS -- Gitee From 0ecd4d78a1b56ef8187f80b2e6162569c3bd69bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:43 +0200 Subject: [PATCH 0769/2161] x86/fpu: Do not leak fpstate pointer on fork commit f0cbc8b3cdf7d1c724155cd9cecffe329bb96119 upstream. If fork fails early then the copied task struct would carry the fpstate pointer of the parent task. Not a problem right now, but later when dynamically allocated buffers are available, keeping the pointer might result in freeing the parent's buffer. Set it to NULL which prevents that. If fork reaches clone_thread(), the pointer will be correctly set to the new task context. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.817101108@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/process.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b871b35632c9..e239aa4f10ee 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -100,6 +100,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif + /* Drop the copied pointer to current's fpstate */ + dst->thread.fpu.fpstate = NULL; return 0; } -- Gitee From 848577011cdbdda408e6ad6e41b8ba06121ecfac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:45 +0200 Subject: [PATCH 0770/2161] x86/process: Move arch_thread_struct_whitelist() out of line commit 2dd8eedc80b184bb16aad697ae60367c5bf07299 upstream. In preparation for dynamically enabled FPU features move the function out of line as the goal is to expose less and not more information. 
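For readers unfamiliar with the hardened-usercopy whitelist, a hedged stand-alone sketch of what the out-of-line helper reports (all type names here are simplified stand-ins): the byte offset and size of the FPU register image inside the enclosing thread struct.

/* Illustrative sketch only; simplified stand-ins for the kernel structures. */
#include <stddef.h>
#include <stdio.h>

struct demo_fpstate { unsigned char regs[512]; };
struct demo_fpu { struct demo_fpstate *fpstate; struct demo_fpstate __fpstate; };
struct demo_thread { int flags; struct demo_fpu fpu; };

/* Report where the user-copyable register image lives and how large it is. */
static void demo_whitelist(unsigned long *offset, unsigned long *size)
{
	*offset = offsetof(struct demo_thread, fpu.__fpstate.regs);
	*size = sizeof(((struct demo_thread *)0)->fpu.__fpstate.regs);
}

int main(void)
{
	unsigned long off, sz;

	demo_whitelist(&off, &sz);
	printf("offset=%lu size=%lu\n", off, sz);
	return 0;
}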
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.869001791@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/processor.h | 9 +++------ arch/x86/kernel/fpu/core.c | 10 ++++++++++ arch/x86/kernel/fpu/internal.h | 2 ++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b80b4c24a9aa..7f3db4d6077b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -426,9 +426,6 @@ DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); #endif /* X86_64 */ -extern unsigned int fpu_kernel_xstate_size; -extern unsigned int fpu_user_xstate_size; - struct perf_event; typedef struct { @@ -505,12 +502,12 @@ struct thread_struct { */ }; -/* Whitelist the FPU register state from the task_struct for hardened usercopy. */ +extern void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size); + static inline void arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { - *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); - *size = fpu_kernel_xstate_size; + fpu_thread_struct_whitelist(offset, size); } /* diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 3a89709abc8a..dfb16a077368 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -404,6 +404,16 @@ int fpu_clone(struct task_struct *dst) return 0; } +/* + * Whitelist the FPU register state embedded into task_struct for hardened + * usercopy. + */ +void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) +{ + *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); + *size = fpu_kernel_xstate_size; +} + /* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index e1d8a352f12d..5c4f71ff6ae9 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -2,6 +2,8 @@ #ifndef __X86_KERNEL_FPU_INTERNAL_H #define __X86_KERNEL_FPU_INTERNAL_H +extern unsigned int fpu_kernel_xstate_size; +extern unsigned int fpu_user_xstate_size; extern struct fpstate init_fpstate; /* CPU feature check wrappers */ -- Gitee From bd32d647af94dd9fb81295f6ee6a11c84ea374ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:46 +0200 Subject: [PATCH 0771/2161] x86/fpu: Add size and mask information to fpstate commit 248452ce21aeb08da2d2af23d88f890886bd379f upstream. Add state size and feature mask information to the fpstate container. This will be used for runtime checks with the upcoming support for dynamically enabled features and dynamically sized buffers. That avoids conditionals all over the place as the required information is accessible for both default and extended buffers. 
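A simplified sketch of why carrying the sizes and feature masks inside the container pays off (names and numbers are illustrative, not the kernel API): callers can bound copies and validate feature bits against the per-task buffer instead of consulting globals or sprinkling conditionals.

/* Illustrative sketch only; field names mirror the patch, values are made up. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_fpstate {
	unsigned int size;		/* kernel buffer size for this task  */
	unsigned int user_size;		/* non-compacted UABI size           */
	uint64_t xfeatures;		/* features the storage is sized for */
	uint64_t user_xfeatures;	/* features valid in UABI buffers    */
	unsigned char regs[832];	/* default-sized register image      */
};

/* Copy a register image into the task buffer, bounded by the per-task size. */
static int demo_copy_regs(struct demo_fpstate *dst, const void *src,
			  unsigned int src_size, uint64_t src_features)
{
	if (src_size > dst->size)
		return -1;			/* buffer too small    */
	if (src_features & ~dst->xfeatures)
		return -1;			/* unsupported feature */
	memcpy(dst->regs, src, src_size);
	return 0;
}

int main(void)
{
	struct demo_fpstate fps = {
		.size = sizeof(fps.regs), .user_size = sizeof(fps.regs),
		.xfeatures = 0x7, .user_xfeatures = 0x7,
	};
	unsigned char image[832] = { 0 };

	printf("ok=%d\n", demo_copy_regs(&fps, image, sizeof(image), 0x3));
	printf("rejected=%d\n", demo_copy_regs(&fps, image, sizeof(image), 0x8));
	return 0;
}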
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.921388806@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/types.h | 12 ++++++++++++ arch/x86/kernel/fpu/core.c | 6 ++++++ arch/x86/kernel/fpu/init.c | 9 +++++++++ arch/x86/kernel/fpu/xstate.c | 3 +++ 4 files changed, 30 insertions(+) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 27529e3240d4..899a27022772 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -290,6 +290,18 @@ union fpregs_state { }; struct fpstate { + /* @kernel_size: The size of the kernel register image */ + unsigned int size; + + /* @user_size: The size in non-compacted UABI format */ + unsigned int user_size; + + /* @xfeatures: xfeatures for which the storage is sized */ + u64 xfeatures; + + /* @user_xfeatures: xfeatures valid in UABI buffers */ + u64 user_xfeatures; + /* @regs: The register state union for all supported formats */ union fpregs_state regs; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index dfb16a077368..25c5fe984eaa 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -342,6 +342,12 @@ void fpstate_reset(struct fpu *fpu) { /* Set the fpstate pointer to the default fpstate */ fpu->fpstate = &fpu->__fpstate; + + /* Initialize sizes and feature masks */ + fpu->fpstate->size = fpu_kernel_xstate_size; + fpu->fpstate->user_size = fpu_user_xstate_size; + fpu->fpstate->xfeatures = xfeatures_mask_all; + fpu->fpstate->user_xfeatures = xfeatures_mask_uabi(); } #if IS_ENABLED(CONFIG_KVM) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index cffbaf491886..65d763faace9 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -212,6 +212,14 @@ static void __init fpu__init_system_xstate_size_legacy(void) } fpu_user_xstate_size = fpu_kernel_xstate_size; + fpstate_reset(¤t->thread.fpu); +} + +static void __init fpu__init_init_fpstate(void) +{ + /* Bring init_fpstate size and features up to date */ + init_fpstate.size = fpu_kernel_xstate_size; + init_fpstate.xfeatures = xfeatures_mask_all; } /* @@ -233,4 +241,5 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate(); fpu__init_task_struct_size(); + fpu__init_init_fpstate(); } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 11d28bd03c20..4f1076a78ad1 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -717,6 +717,7 @@ static void __init fpu__init_disable_system_xstate(void) xfeatures_mask_all = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); + fpstate_reset(¤t->thread.fpu); } /* @@ -789,6 +790,8 @@ void __init fpu__init_system_xstate(void) if (err) goto out_disable; + fpstate_reset(¤t->thread.fpu); + /* * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: -- Gitee From 33fdf396680b8487069fc929d02d1221dbdece48 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:48 +0200 Subject: [PATCH 0772/2161] x86/fpu: Use fpstate::size commit be31dfdfd75b172af3ddcfa7511cdc3bb7adb25e upstream. Make use of fpstate::size in various places which require the buffer size information for sanity checks or memcpy() sizing. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.973518954@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/core.c | 13 ++++++------- arch/x86/kernel/fpu/signal.c | 7 +++---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 25c5fe984eaa..2eb60025ae8c 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -166,13 +166,12 @@ void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) fpregs_lock(); if (save) { - if (test_thread_flag(TIF_NEED_FPU_LOAD)) { - memcpy(&save->fpstate->regs, - ¤t->thread.fpu.fpstate->regs, - fpu_kernel_xstate_size); - } else { + struct fpstate *fpcur = current->thread.fpu.fpstate; + + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + memcpy(&save->fpstate->regs, &fpcur->regs, fpcur->size); + else save_fpregs_to_fpstate(save); - } } if (rstor) { @@ -398,7 +397,7 @@ int fpu_clone(struct task_struct *dst) fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) { memcpy(&dst_fpu->fpstate->regs, &src_fpu->fpstate->regs, - fpu_kernel_xstate_size); + dst_fpu->fpstate->size); } else { save_fpregs_to_fpstate(dst_fpu); } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index e5e72125f9d5..2768ee7df761 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -313,15 +313,13 @@ static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, bool ia32_fxstate) { - int state_size = fpu_kernel_xstate_size; struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; + bool success, fx_only = false; union fpregs_state *fpregs; + unsigned int state_size; u64 user_xfeatures = 0; - bool fx_only = false; - bool success; - if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; @@ -334,6 +332,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, user_xfeatures = fx_sw_user.xfeatures; } else { user_xfeatures = XFEATURE_MASK_FPSSE; + state_size = fpu->fpstate->size; } if (likely(!ia32_fxstate)) { -- Gitee From ecbd92e08076af2011db632cdfd600dfa94a5e55 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:49 +0200 Subject: [PATCH 0773/2161] x86/fpu/xstate: Use fpstate for os_xsave() commit 073e627a4537e682c43a1e8df659ce24cbced40c upstream. With variable feature sets XSAVE[S] requires to know the feature set for which the buffer is valid. Retrieve it from fpstate. 
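For context on why the mask source matters: the XSAVE family takes the requested-feature bitmap in EDX:EAX, so the 64-bit mask, now read from fpstate, is split into low and high halves before the instruction is issued. A small sketch of that split (no XSAVE is actually executed, and the example mask value is made up):

/* Illustrative sketch only; shows the lmask/hmask split, not real XSAVE use. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t xfeatures = 0x2e7;			/* example feature mask */
	uint32_t lmask = (uint32_t)xfeatures;		/* would go into EAX */
	uint32_t hmask = (uint32_t)(xfeatures >> 32);	/* would go into EDX */

	printf("lmask=%#" PRIx32 " hmask=%#" PRIx32 "\n", lmask, hmask);
	return 0;
}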
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145323.025695590@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/signal.c | 4 ++-- arch/x86/kernel/fpu/xstate.h | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 2eb60025ae8c..bacef650a169 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -105,7 +105,7 @@ EXPORT_SYMBOL(irq_fpu_usable); void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { - os_xsave(&fpu->fpstate->regs.xsave); + os_xsave(fpu->fpstate); /* * AVX512 state is tracked here because its use is diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 2768ee7df761..16a4906712db 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -349,7 +349,6 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, if (__copy_from_user(&env, buf, sizeof(env))) return false; - fpregs = &fpu->fpstate->regs; /* * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is * not modified on context switch and that the xstate is considered @@ -367,13 +366,14 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, * the right place in memory. It's ia32 mode. Shrug. */ if (xfeatures_mask_supervisor()) - os_xsave(&fpregs->xsave); + os_xsave(fpu->fpstate); set_thread_flag(TIF_NEED_FPU_LOAD); } __fpu_invalidate_fpregs_state(fpu); __cpu_invalidate_fpregs_state(); fpregs_unlock(); + fpregs = &fpu->fpstate->regs; if (use_xsave() && !fx_only) { if (copy_sigframe_from_user_to_xstate(&fpregs->xsave, buf_fx)) return false; diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 99f8cfec719d..24a1479caea2 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -101,16 +101,16 @@ extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features * and command line options. The choice is permanent until the next reboot. */ -static inline void os_xsave(struct xregs_state *xstate) +static inline void os_xsave(struct fpstate *fpstate) { - u64 mask = xfeatures_mask_all; + u64 mask = fpstate->xfeatures; u32 lmask = mask; u32 hmask = mask >> 32; int err; WARN_ON_FPU(!alternatives_patched); - XSTATE_XSAVE(xstate, lmask, hmask, err); + XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err); /* We should never fault when copying to a kernel buffer: */ WARN_ON_FPU(err); -- Gitee From 9d8d8f7e006f921005017efc965f697a3f67999f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:51 +0200 Subject: [PATCH 0774/2161] x86/fpu/xstate: Use fpstate for xsave_to_user_sigframe() commit 073e627a4537e682c43a1e8df659ce24cbced40c upstream. With dynamically enabled features the sigframe code must know the features which are enabled for the task. Get them from fpstate. 
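To illustrate the direction of travel, the mask written to a signal frame becomes a per-task property read from that task's fpstate instead of a system-wide constant. The sketch below is a user-space model only; the demo_* types and the mask values are invented and far simpler than the kernel's structures.

#include <stdint.h>
#include <stdio.h>

/* Minimal stand-ins: each task carries its own user-visible feature mask. */
struct demo_fpstate {
	uint64_t user_xfeatures;
};

struct demo_task {
	const char *name;
	struct demo_fpstate fpstate;
};

/* The sigframe path asks the task at hand, not a global. */
static uint64_t sigframe_mask(const struct demo_task *tsk)
{
	return tsk->fpstate.user_xfeatures;
}

int main(void)
{
	/* Example bit values only; real xfeature numbers differ. */
	struct demo_task plain = { "plain", { 0x3 } };
	struct demo_task wide  = { "wide",  { 0x7 } };

	printf("%s task: sigframe mask %#llx\n", plain.name,
	       (unsigned long long)sigframe_mask(&plain));
	printf("%s task: sigframe mask %#llx\n", wide.name,
	       (unsigned long long)sigframe_mask(&wide));
	return 0;
}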
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145323.077781448@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 24a1479caea2..3e9eaf9f7cf3 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -149,7 +149,7 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) * internally, e.g. PKRU. That's user space ABI and also required * to allow the signal handler to modify PKRU. */ - u64 mask = xfeatures_mask_uabi(); + u64 mask = current->thread.fpu.fpstate->user_xfeatures; u32 lmask = mask; u32 hmask = mask >> 32; int err; -- Gitee From 20b1e7f984952e5def9c63d1f2e50f7df5b75d54 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:52 +0200 Subject: [PATCH 0775/2161] x86/fpu: Use fpstate in fpu_copy_kvm_uabi_to_fpstate() commit ad6ede407aae01d9617e172b27e179ce1046cbfc upstream. Straight forward conversion. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145323.129699950@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index bacef650a169..d4072cc2e17b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -205,7 +205,7 @@ EXPORT_SYMBOL_GPL(fpu_copy_fpstate_to_kvm_uabi); int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, u32 *vpkru) { - union fpregs_state *kstate = &fpu->fpstate->regs; + struct fpstate *kstate = fpu->fpstate; const union fpregs_state *ustate = buf; struct pkru_state *xpkru; int ret; @@ -215,25 +215,25 @@ int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, return -EINVAL; if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask) return -EINVAL; - memcpy(&kstate->fxsave, &ustate->fxsave, sizeof(ustate->fxsave)); + memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave)); return 0; } if (ustate->xsave.header.xfeatures & ~xcr0) return -EINVAL; - ret = copy_uabi_from_kernel_to_xstate(&kstate->xsave, ustate); + ret = copy_uabi_from_kernel_to_xstate(&kstate->regs.xsave, ustate); if (ret) return ret; /* Retrieve PKRU if not in init state */ - if (kstate->xsave.header.xfeatures & XFEATURE_MASK_PKRU) { - xpkru = get_xsave_addr(&kstate->xsave, XFEATURE_PKRU); + if (kstate->regs.xsave.header.xfeatures & XFEATURE_MASK_PKRU) { + xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU); *vpkru = xpkru->pkru; } /* Ensure that XCOMP_BV is set up for XSAVES */ - xstate_init_xcomp_bv(&kstate->xsave, xfeatures_mask_uabi()); + xstate_init_xcomp_bv(&kstate->regs.xsave, xfeatures_mask_uabi()); return 0; } EXPORT_SYMBOL_GPL(fpu_copy_kvm_uabi_to_fpstate); -- Gitee From 4020a5fa569d320b794285d96ba2a5431e7f4a0f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:54 +0200 Subject: [PATCH 0776/2161] x86/fpu: Use fpstate in __copy_xstate_to_uabi_buf() commit 3ac8d75778fc8c1c22daad9bc674166b862f6f6e upstream. With dynamically enabled features the copy function must know the features and the size which is valid for the task. Retrieve them from fpstate. 
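A rough stand-alone model of the copy scheme, for orientation only: the writer walks a component layout, copies a component when its bit is set in the buffer's feature bitmap and zero-fills the slot otherwise, which is the gap handling the real __copy_xstate_to_uabi_buf() performs with far more detail. The demo_membuf type, the component layout and the masks below are invented for the example.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified output descriptor, loosely modelled on the membuf in the diff. */
struct demo_membuf {
	unsigned char *p;
	size_t left;
};

static void demo_write(struct demo_membuf *to, const void *data, size_t len)
{
	if (len > to->left)
		len = to->left;		/* clamp instead of overrunning */
	if (data)
		memcpy(to->p, data, len);
	else
		memset(to->p, 0, len);	/* component not present: zero-fill */
	to->p += len;
	to->left -= len;
}

int main(void)
{
	/* Example layout: three 8-byte components, components 0 and 2 present. */
	unsigned char comp[8], out[24];
	uint64_t present = 0x5;
	struct demo_membuf to = { out, sizeof(out) };
	int i;

	memset(comp, 0xab, sizeof(comp));
	for (i = 0; i < 3; i++)
		demo_write(&to, (present & (1ULL << i)) ? comp : NULL, sizeof(comp));

	printf("first byte of each slot: %#x %#x %#x\n", out[0], out[8], out[16]);
	return 0;
}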
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145323.181495492@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/core.c | 8 ++++---- arch/x86/kernel/fpu/xstate.c | 11 ++++++----- arch/x86/kernel/fpu/xstate.h | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index d4072cc2e17b..15497d72b9a0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -187,15 +187,15 @@ EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpu); void fpu_copy_fpstate_to_kvm_uabi(struct fpu *fpu, void *buf, unsigned int size, u32 pkru) { - union fpregs_state *kstate = &fpu->fpstate->regs; + struct fpstate *kstate = fpu->fpstate; union fpregs_state *ustate = buf; struct membuf mb = { .p = buf, .left = size }; if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { - __copy_xstate_to_uabi_buf(mb, &kstate->xsave, pkru, - XSTATE_COPY_XSAVE); + __copy_xstate_to_uabi_buf(mb, kstate, pkru, XSTATE_COPY_XSAVE); } else { - memcpy(&ustate->fxsave, &kstate->fxsave, sizeof(ustate->fxsave)); + memcpy(&ustate->fxsave, &kstate->regs.fxsave, + sizeof(ustate->fxsave)); /* Make it restorable on a XSAVE enabled host */ ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 4f1076a78ad1..c3b60b1da14f 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -996,7 +996,7 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, /** * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor - * @xsave: The xsave from which to copy + * @fpstate: The fpstate buffer from which to copy * @pkru_val: The PKRU value to store in the PKRU component * @copy_mode: The requested copy mode * @@ -1006,11 +1006,12 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, * * It supports partial copy but @to.pos always starts from zero. */ -void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, +void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, u32 pkru_val, enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); struct xregs_state *xinit = &init_fpstate.regs.xsave; + struct xregs_state *xsave = &fpstate->regs.xsave; struct xstate_header header; unsigned int zerofrom; u64 mask; @@ -1030,7 +1031,7 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, break; case XSTATE_COPY_XSAVE: - header.xfeatures &= xfeatures_mask_uabi(); + header.xfeatures &= fpstate->user_xfeatures; break; } @@ -1073,7 +1074,7 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, * but there is no state to copy from in the compacted * init_fpstate. The gap tracking will zero these states. 
*/ - mask = xfeatures_mask_uabi(); + mask = fpstate->user_xfeatures; for_each_extended_xfeature(i, mask) { /* @@ -1124,7 +1125,7 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { - __copy_xstate_to_uabi_buf(to, &tsk->thread.fpu.fpstate->regs.xsave, + __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, tsk->thread.pkru, copy_mode); } diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 3e9eaf9f7cf3..b74c5953558c 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -15,7 +15,7 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; } -extern void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, +extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, u32 pkru_val, enum xstate_copy_mode copy_mode); extern void fpu__init_cpu_xstate(void); -- Gitee From e822bb0aa8811398d7751b771ad13817da14821f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:55 +0200 Subject: [PATCH 0777/2161] x86/fpu/xstate: Use fpstate for copy_uabi_to_xstate() commit 073e627a4537e682c43a1e8df659ce24cbced40c upstream. Prepare for dynamically enabled states per task. The function needs to retrieve the features and sizes which are valid in a fpstate context. Retrieve them from fpstate. Move the function declarations to the core header as they are not required anywhere else. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145323.233529986@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 12 ------------ arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/regset.c | 5 ++--- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/fpu/xstate.c | 18 ++++++++++-------- arch/x86/kernel/fpu/xstate.h | 12 ++++++++++++ 6 files changed, 26 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index ee5481ea5400..489a1dcde16f 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -130,20 +130,8 @@ extern void __init update_regset_xstate_info(unsigned int size, const void *get_xsave_field_ptr(int xfeature_nr); int xfeature_size(int xfeature_nr); -int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); -int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); void xsaves(struct xregs_state *xsave, u64 mask); void xrstors(struct xregs_state *xsave, u64 mask); -enum xstate_copy_mode { - XSTATE_COPY_FP, - XSTATE_COPY_FX, - XSTATE_COPY_XSAVE, -}; - -struct membuf; -void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, - enum xstate_copy_mode mode); - #endif diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 15497d72b9a0..128e1d9b16dc 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -222,7 +222,7 @@ int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, if (ustate->xsave.header.xfeatures & ~xcr0) return -EINVAL; - ret = copy_uabi_from_kernel_to_xstate(&kstate->regs.xsave, ustate); + ret = copy_uabi_from_kernel_to_xstate(kstate, ustate); if (ret) return ret; diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 
0f068ba7b93d..5bdf78c527e9 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -8,11 +8,11 @@ #include #include #include -#include #include "context.h" #include "internal.h" #include "legacy.h" +#include "xstate.h" /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, @@ -168,8 +168,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, } fpu_force_restore(fpu); - ret = copy_uabi_from_kernel_to_xstate(&fpu->fpstate->regs.xsave, - kbuf ?: tmpbuf); + ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf); out: vfree(tmpbuf); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 16a4906712db..b815b95b3f64 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -375,7 +375,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, fpregs = &fpu->fpstate->regs; if (use_xsave() && !fx_only) { - if (copy_sigframe_from_user_to_xstate(&fpregs->xsave, buf_fx)) + if (copy_sigframe_from_user_to_xstate(fpu->fpstate, buf_fx)) return false; } else { if (__copy_from_user(&fpregs->fxsave, buf_fx, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c3b60b1da14f..6b73d98e027b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -460,10 +460,11 @@ int xfeature_size(int xfeature_nr) } /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -static int validate_user_xstate_header(const struct xstate_header *hdr) +static int validate_user_xstate_header(const struct xstate_header *hdr, + struct fpstate *fpstate) { /* No unknown or supervisor features may be set */ - if (hdr->xfeatures & ~xfeatures_mask_uabi()) + if (hdr->xfeatures & ~fpstate->user_xfeatures) return -EINVAL; /* Userspace must use the uncompacted format */ @@ -1142,9 +1143,10 @@ static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, } -static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, +static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, const void __user *ubuf) { + struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int offset, size; struct xstate_header hdr; u64 mask; @@ -1154,7 +1156,7 @@ static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) return -EFAULT; - if (validate_user_xstate_header(&hdr)) + if (validate_user_xstate_header(&hdr, fpstate)) return -EINVAL; /* Validate MXCSR when any of the related features is in use */ @@ -1209,9 +1211,9 @@ static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] * format and copy to the target thread. Used by ptrace and KVM. */ -int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) +int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf) { - return copy_uabi_to_xstate(xsave, kbuf, NULL); + return copy_uabi_to_xstate(fpstate, kbuf, NULL); } /* @@ -1219,10 +1221,10 @@ int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) * XSAVE[S] format and copy to the target thread. This is called from the * sigreturn() and rt_sigreturn() system calls. 
*/ -int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, +int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf) { - return copy_uabi_to_xstate(xsave, NULL, ubuf); + return copy_uabi_to_xstate(fpstate, NULL, ubuf); } static bool validate_independent_components(u64 mask) diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index b74c5953558c..379dbfa4f526 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -15,8 +15,20 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; } +enum xstate_copy_mode { + XSTATE_COPY_FP, + XSTATE_COPY_FX, + XSTATE_COPY_XSAVE, +}; + +struct membuf; extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, u32 pkru_val, enum xstate_copy_mode copy_mode); +extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, + enum xstate_copy_mode mode); +extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf); +extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf); + extern void fpu__init_cpu_xstate(void); extern void fpu__init_system_xstate(void); -- Gitee From 103915cdfcc6d70abe36aca1aa0c6c53ac9e147c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 00:51:51 +0200 Subject: [PATCH 0778/2161] x86/fpu/signal: Use fpstate for size and features commit 5509cc78080d29b23706dbf076d51691b69f3c79 upstream. For dynamically enabled features it's required to get the features which are enabled for that context when restoring from sigframe. The same applies for all signal frame size calculations. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/87ilxz5iew.ffs@tglx Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 44 ++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index b815b95b3f64..00332da8fdf0 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -41,7 +41,7 @@ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, /* Check for the first magic field and other error scenarios. 
*/ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > fpu_user_xstate_size || + fx_sw->xstate_size > current->thread.fpu.fpstate->user_size || fx_sw->xstate_size > fx_sw->extended_size) goto setfx; @@ -98,7 +98,8 @@ static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) return true; } -static inline bool save_xstate_epilog(void __user *buf, int ia32_frame) +static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, + unsigned int usize) { struct xregs_state __user *x = buf; struct _fpx_sw_bytes *sw_bytes; @@ -113,7 +114,7 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame) return !err; err |= __put_user(FP_XSTATE_MAGIC2, - (__u32 __user *)(buf + fpu_user_xstate_size)); + (__u32 __user *)(buf + usize)); /* * Read the xfeatures which we copied (directly from the cpu or @@ -171,6 +172,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { struct task_struct *tsk = current; + struct fpstate *fpstate = tsk->thread.fpu.fpstate; int ia32_fxstate = (buf != buf_fx); int ret; @@ -215,7 +217,7 @@ bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) fpregs_unlock(); if (ret) { - if (!__clear_user(buf_fx, fpu_user_xstate_size)) + if (!__clear_user(buf_fx, fpstate->user_size)) goto retry; return false; } @@ -224,17 +226,18 @@ bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf)) return false; - if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate)) + if (use_fxsr() && + !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate->user_size)) return false; return true; } -static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, - bool fx_only) +static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, + u64 xrestore, bool fx_only) { if (use_xsave()) { - u64 init_bv = xfeatures_mask_uabi() & ~xrestore; + u64 init_bv = ufeatures & ~xrestore; int ret; if (likely(!fx_only)) @@ -265,7 +268,8 @@ static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, retry: fpregs_lock(); pagefault_disable(); - ret = __restore_fpregs_from_user(buf, xrestore, fx_only); + ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures, + xrestore, fx_only); pagefault_enable(); if (unlikely(ret)) { @@ -332,7 +336,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, user_xfeatures = fx_sw_user.xfeatures; } else { user_xfeatures = XFEATURE_MASK_FPSSE; - state_size = fpu->fpstate->size; + state_size = fpu->fpstate->user_size; } if (likely(!ia32_fxstate)) { @@ -425,10 +429,11 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, return success; } -static inline int xstate_sigframe_size(void) +static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate) { - return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE : - fpu_user_xstate_size; + unsigned int size = fpstate->user_size; + + return use_xsave() ? 
size + FP_XSTATE_MAGIC2_SIZE : size; } /* @@ -436,17 +441,19 @@ static inline int xstate_sigframe_size(void) */ bool fpu__restore_sig(void __user *buf, int ia32_frame) { - unsigned int size = xstate_sigframe_size(); struct fpu *fpu = ¤t->thread.fpu; void __user *buf_fx = buf; bool ia32_fxstate = false; bool success = false; + unsigned int size; if (unlikely(!buf)) { fpu__clear_user_states(fpu); return true; } + size = xstate_sigframe_size(fpu->fpstate); + ia32_frame &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); @@ -481,7 +488,7 @@ unsigned long fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size) { - unsigned long frame_size = xstate_sigframe_size(); + unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate); *buf_fx = sp = round_down(sp - frame_size, 64); if (ia32_frame && use_fxsr()) { @@ -494,9 +501,12 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, return sp; } -unsigned long fpu__get_fpstate_size(void) +unsigned long __init fpu__get_fpstate_size(void) { - unsigned long ret = xstate_sigframe_size(); + unsigned long ret = fpu_user_xstate_size; + + if (use_xsave()) + ret += FP_XSTATE_MAGIC2_SIZE; /* * This space is needed on (most) 32-bit kernels, or when a 32-bit -- Gitee From c3896b26c4eaf01f2211f9cd9c91e7bbc5facaf5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 01:09:29 +0200 Subject: [PATCH 0779/2161] x86/fpu: Provide struct fpu_config commit 578971f4e228f386ad4d7ce16e979f2ed922de54 upstream. Provide a struct to store information about the maximum supported and the default feature set and buffer sizes for both user and kernel space. This allows quick retrieval of this information for the upcoming support for dynamically enabled features. [ bp: Add vertical spacing between the struct members. ] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211014230739.126107370@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/types.h | 42 ++++++++++++++++++++++++++++++++ arch/x86/kernel/fpu/core.c | 4 +++ 2 files changed, 46 insertions(+) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 899a27022772..7d8b173f7105 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -358,4 +358,46 @@ struct fpu { */ }; +/* + * FPU state configuration data. Initialized at boot time. Read only after init. + */ +struct fpu_state_config { + /* + * @max_size: + * + * The maximum size of the register state buffer. Includes all + * supported features except independent managed features. + */ + unsigned int max_size; + + /* + * @default_size: + * + * The default size of the register state buffer. Includes all + * supported features except independent managed features and + * features which have to be requested by user space before usage. + */ + unsigned int default_size; + + /* + * @max_features: + * + * The maximum supported features bitmap. Does not include + * independent managed features. + */ + u64 max_features; + + /* + * @default_features: + * + * The default supported features bitmap. Does not include + * independent managed features and features which have to + * be requested by user space before usage. 
+ */ + u64 default_features; +}; + +/* FPU state configuration information */ +extern struct fpu_state_config fpu_kernel_cfg, fpu_user_cfg; + #endif /* _ASM_X86_FPU_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 128e1d9b16dc..706a28ba644a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -25,6 +25,10 @@ #define CREATE_TRACE_POINTS #include +/* The FPU state configuration data for kernel and user space */ +struct fpu_state_config fpu_kernel_cfg __ro_after_init; +struct fpu_state_config fpu_user_cfg __ro_after_init; + /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: -- Gitee From 8763c8444a066cc4a050662bacd598ecec8ae001 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 01:09:31 +0200 Subject: [PATCH 0780/2161] x86/fpu: Cleanup fpu__init_system_xstate_size_legacy() commit 9fe8a6f5eed8fff6b2d7dbc99b911334e311732d upstream. Clean the function up before making changes. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211014230739.184014242@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/init.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 65d763faace9..c9293ade321d 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -199,17 +199,12 @@ static void __init fpu__init_system_xstate_size_legacy(void) * Note that xstate sizes might be overwritten later during * fpu__init_system_xstate(). */ - - if (!boot_cpu_has(X86_FEATURE_FPU)) { + if (!cpu_feature_enabled(X86_FEATURE_FPU)) fpu_kernel_xstate_size = sizeof(struct swregs_state); - } else { - if (boot_cpu_has(X86_FEATURE_FXSR)) - fpu_kernel_xstate_size = - sizeof(struct fxregs_state); - else - fpu_kernel_xstate_size = - sizeof(struct fregs_state); - } + else if (cpu_feature_enabled(X86_FEATURE_FXSR)) + fpu_kernel_xstate_size = sizeof(struct fxregs_state); + else + fpu_kernel_xstate_size = sizeof(struct fregs_state); fpu_user_xstate_size = fpu_kernel_xstate_size; fpstate_reset(¤t->thread.fpu); -- Gitee From 691e71cc708b678523a757918842fc44ed38feaa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 01:09:32 +0200 Subject: [PATCH 0781/2161] x86/fpu/xstate: Cleanup size calculations commit cd9ae761744912a96d7fd968b9c0173594e3f6be upstream. The size calculations are partially unreadable gunk. Clean them up. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211014230739.241223689@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 82 ++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 36 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 6b73d98e027b..913c2ab0be48 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -524,7 +524,7 @@ static void __init __xstate_dump_leaves(void) * that our software representation matches what the CPU * tells us about the state's size. */ -static void __init check_xstate_against_struct(int nr) +static bool __init check_xstate_against_struct(int nr) { /* * Ask the CPU for the size of the state. 
@@ -554,7 +554,9 @@ static void __init check_xstate_against_struct(int nr) ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_LBR))) { WARN_ONCE(1, "no structure for xstate: %d\n", nr); XSTATE_WARN_ON(1); + return false; } + return true; } /* @@ -566,38 +568,44 @@ static void __init check_xstate_against_struct(int nr) * covered by these checks. Only the size of the buffer for task->fpu * is checked here. */ -static void __init do_extra_xstate_size_checks(void) +static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) { - int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; for_each_extended_xfeature(i, xfeatures_mask_all) { - check_xstate_against_struct(i); + if (!check_xstate_against_struct(i)) + return false; /* * Supervisor state components can be managed only by * XSAVES. */ - if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) - XSTATE_WARN_ON(xfeature_is_supervisor(i)); + if (!compacted && xfeature_is_supervisor(i)) { + XSTATE_WARN_ON(1); + return false; + } /* Align from the end of the previous feature */ if (xfeature_is_aligned(i)) - paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64); + size = ALIGN(size, 64); /* - * The offset of a given state in the non-compacted - * format is given to us in a CPUID leaf. We check - * them for being ordered (increasing offsets) in - * setup_xstate_features(). XSAVES uses compacted format. + * In compacted format the enabled features are packed, + * i.e. disabled features do not occupy space. + * + * In non-compacted format the offsets are fixed and + * disabled states still occupy space in the memory buffer. */ - if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) - paranoid_xstate_size = xfeature_uncompacted_offset(i); + if (!compacted) + size = xfeature_uncompacted_offset(i); /* - * The compacted-format offset always depends on where - * the previous state ended. + * Add the feature size even for non-compacted format + * to make the end result correct */ - paranoid_xstate_size += xfeature_size(i); + size += xfeature_size(i); } - XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size); + XSTATE_WARN_ON(size != kernel_size); + return size == kernel_size; } /* @@ -650,7 +658,7 @@ static unsigned int __init get_xsaves_size_no_independent(void) return size; } -static unsigned int __init get_xsave_size(void) +static unsigned int __init get_xsave_size_user(void) { unsigned int eax, ebx, ecx, edx; /* @@ -681,31 +689,33 @@ static bool __init is_supported_xstate_size(unsigned int test_xstate_size) static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ - unsigned int possible_xstate_size; - unsigned int xsave_size; + unsigned int user_size, kernel_size; - xsave_size = get_xsave_size(); + /* Uncompacted user space size */ + user_size = get_xsave_size_user(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - possible_xstate_size = get_xsaves_size_no_independent(); + /* + * XSAVES kernel size includes supervisor states and + * uses compacted format. + * + * XSAVE does not support supervisor states so + * kernel and user size is identical. + */ + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + kernel_size = get_xsaves_size_no_independent(); else - possible_xstate_size = xsave_size; + kernel_size = user_size; - /* Ensure we have the space to store all enabled: */ - if (!is_supported_xstate_size(possible_xstate_size)) + /* Ensure we have the space to store all enabled features. 
*/ + if (!is_supported_xstate_size(kernel_size)) return -EINVAL; - /* - * The size is OK, we are definitely going to use xsave, - * make it known to the world that we need more space. - */ - fpu_kernel_xstate_size = possible_xstate_size; - do_extra_xstate_size_checks(); + if (!paranoid_xstate_size_valid(kernel_size)) + return -EINVAL; + + fpu_kernel_xstate_size = kernel_size; + fpu_user_xstate_size = user_size; - /* - * User space is always in standard format. - */ - fpu_user_xstate_size = xsave_size; return 0; } -- Gitee From ea6d4bd20965a35313053d25ade2fa8a32b054b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:35 +0200 Subject: [PATCH 0782/2161] x86/fpu: Limit xstate copy size in xstateregs_set() commit 07d6688b22e09be465652cf2da0da6bf86154df6 upstream. If the count argument is larger than the xstate size, this will happily copy beyond the end of xstate. Fixes: 91c3dba7dbc1 ("x86/fpu/xstate: Fix PTRACE frames for XSAVES") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Andy Lutomirski Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.120741557@linutronix.de Signed-off-by: Sasha Levin Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/regset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 5bdf78c527e9..f8c485ab73f5 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -153,7 +153,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, /* * A whole standard-format XSAVE buffer is needed: */ - if ((pos != 0) || (count < fpu_user_xstate_size)) + if (pos != 0 || count != fpu_user_xstate_size) return -EFAULT; if (!kbuf) { -- Gitee From 809528cff403a2a896b248e1da92ab25687bb305 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 01:09:34 +0200 Subject: [PATCH 0783/2161] x86/fpu: Move xstate size to fpu_*_cfg commit opencloudos. Use the new kernel and user space config storage to store and retrieve the XSTATE buffer sizes. The default and the maximum size are the same for now, but will change when support for dynamically enabled features is added. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211014230739.296830097@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/core.c | 8 ++++---- arch/x86/kernel/fpu/init.c | 31 ++++++++++++++----------------- arch/x86/kernel/fpu/internal.h | 2 -- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/signal.c | 6 +++--- arch/x86/kernel/fpu/xstate.c | 32 ++++++++++++++++++-------------- arch/x86/kernel/fpu/xstate.h | 2 +- 7 files changed, 41 insertions(+), 42 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 706a28ba644a..b6459a3f995e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -298,7 +298,7 @@ void fpu_sync_fpstate(struct fpu *fpu) static inline unsigned int init_fpstate_copy_size(void) { if (!use_xsave()) - return fpu_kernel_xstate_size; + return fpu_kernel_cfg.default_size; /* XSAVE(S) just needs the legacy and the xstate header part */ return sizeof(init_fpstate.regs.xsave); @@ -347,8 +347,8 @@ void fpstate_reset(struct fpu *fpu) fpu->fpstate = &fpu->__fpstate; /* Initialize sizes and feature masks */ - fpu->fpstate->size = fpu_kernel_xstate_size; - fpu->fpstate->user_size = fpu_user_xstate_size; + fpu->fpstate->size = fpu_kernel_cfg.default_size; + fpu->fpstate->user_size = fpu_user_cfg.default_size; fpu->fpstate->xfeatures = xfeatures_mask_all; fpu->fpstate->user_xfeatures = xfeatures_mask_uabi(); } @@ -420,7 +420,7 @@ int fpu_clone(struct task_struct *dst) void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); - *size = fpu_kernel_xstate_size; + *size = fpu_kernel_cfg.default_size; } /* diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index c9293ade321d..58043ed08662 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -133,14 +133,6 @@ static void __init fpu__init_system_generic(void) fpu__init_system_mxcsr(); } -/* - * Size of the FPU context state. All tasks in the system use the - * same context size, regardless of what portion they use. - * This is inherent to the XSAVE architecture which puts all state - * components into a single, continuous memory block: - */ -unsigned int fpu_kernel_xstate_size __ro_after_init; - /* Get alignment of the TYPE. */ #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) @@ -171,7 +163,7 @@ static void __init fpu__init_task_struct_size(void) * Add back the dynamically-calculated register state * size. */ - task_size += fpu_kernel_xstate_size; + task_size += fpu_kernel_cfg.default_size; /* * We dynamically size 'struct fpu', so we require that @@ -195,25 +187,30 @@ static void __init fpu__init_task_struct_size(void) */ static void __init fpu__init_system_xstate_size_legacy(void) { + unsigned int size; + /* - * Note that xstate sizes might be overwritten later during - * fpu__init_system_xstate(). + * Note that the size configuration might be overwritten later + * during fpu__init_system_xstate(). 
*/ if (!cpu_feature_enabled(X86_FEATURE_FPU)) - fpu_kernel_xstate_size = sizeof(struct swregs_state); + size = sizeof(struct swregs_state); else if (cpu_feature_enabled(X86_FEATURE_FXSR)) - fpu_kernel_xstate_size = sizeof(struct fxregs_state); + size = sizeof(struct fxregs_state); else - fpu_kernel_xstate_size = sizeof(struct fregs_state); + size = sizeof(struct fregs_state); - fpu_user_xstate_size = fpu_kernel_xstate_size; + fpu_kernel_cfg.max_size = size; + fpu_kernel_cfg.default_size = size; + fpu_user_cfg.max_size = size; + fpu_user_cfg.default_size = size; fpstate_reset(¤t->thread.fpu); } static void __init fpu__init_init_fpstate(void) { /* Bring init_fpstate size and features up to date */ - init_fpstate.size = fpu_kernel_xstate_size; + init_fpstate.size = fpu_kernel_cfg.max_size; init_fpstate.xfeatures = xfeatures_mask_all; } @@ -234,7 +231,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_generic(); fpu__init_system_xstate_size_legacy(); - fpu__init_system_xstate(); + fpu__init_system_xstate(fpu_kernel_cfg.max_size); fpu__init_task_struct_size(); fpu__init_init_fpstate(); } diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index 5c4f71ff6ae9..e1d8a352f12d 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -2,8 +2,6 @@ #ifndef __X86_KERNEL_FPU_INTERNAL_H #define __X86_KERNEL_FPU_INTERNAL_H -extern unsigned int fpu_kernel_xstate_size; -extern unsigned int fpu_user_xstate_size; extern struct fpstate init_fpstate; /* CPU feature check wrappers */ diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index f8c485ab73f5..437d7c930c0b 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -153,7 +153,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, /* * A whole standard-format XSAVE buffer is needed: */ - if (pos != 0 || count != fpu_user_xstate_size) + if (pos != 0 || count != fpu_user_cfg.max_size) return -EFAULT; if (!kbuf) { diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 00332da8fdf0..d5eb91f8433c 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -503,7 +503,7 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long __init fpu__get_fpstate_size(void) { - unsigned long ret = fpu_user_xstate_size; + unsigned long ret = fpu_user_cfg.max_size; if (use_xsave()) ret += FP_XSTATE_MAGIC2_SIZE; @@ -531,12 +531,12 @@ unsigned long __init fpu__get_fpstate_size(void) */ void __init fpu__init_prepare_fx_sw_frame(void) { - int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE; + int size = fpu_user_cfg.default_size + FP_XSTATE_MAGIC2_SIZE; fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; fx_sw_reserved.xfeatures = xfeatures_mask_uabi(); - fx_sw_reserved.xstate_size = fpu_user_xstate_size; + fx_sw_reserved.xstate_size = fpu_user_cfg.default_size; if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 913c2ab0be48..a7c3cc42b017 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -77,13 +77,6 @@ static unsigned int xstate_comp_offsets[XFEATURE_MAX] __ro_after_init = static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; -/* - * The XSAVE area of kernel can be in standard or compacted format; - * it is always in standard format for user mode. 
This is the user - * mode standard format size used for signal and ptrace frames. - */ -unsigned int fpu_user_xstate_size __ro_after_init; - /* * Return whether the system supports a given xfeature. * @@ -713,8 +706,11 @@ static int __init init_xstate_size(void) if (!paranoid_xstate_size_valid(kernel_size)) return -EINVAL; - fpu_kernel_xstate_size = kernel_size; - fpu_user_xstate_size = user_size; + /* Keep it the same for now */ + fpu_kernel_cfg.max_size = kernel_size; + fpu_kernel_cfg.default_size = kernel_size; + fpu_user_cfg.max_size = user_size; + fpu_user_cfg.default_size = user_size; return 0; } @@ -723,11 +719,18 @@ static int __init init_xstate_size(void) * We enabled the XSAVE hardware, but something went wrong and * we can not use it. Disable it. */ -static void __init fpu__init_disable_system_xstate(void) +static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { xfeatures_mask_all = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + /* Restore the legacy size.*/ + fpu_kernel_cfg.max_size = legacy_size; + fpu_kernel_cfg.default_size = legacy_size; + fpu_user_cfg.max_size = legacy_size; + fpu_user_cfg.default_size = legacy_size; + fpstate_reset(¤t->thread.fpu); } @@ -735,7 +738,7 @@ static void __init fpu__init_disable_system_xstate(void) * Enable and initialize the xsave feature. * Called once per system bootup. */ -void __init fpu__init_system_xstate(void) +void __init fpu__init_system_xstate(unsigned int legacy_size) { unsigned int eax, ebx, ecx, edx; u64 xfeatures; @@ -807,7 +810,8 @@ void __init fpu__init_system_xstate(void) * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ - update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_uabi()); + update_regset_xstate_info(fpu_user_cfg.max_size, + xfeatures_mask_uabi()); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); @@ -827,13 +831,13 @@ void __init fpu__init_system_xstate(void) print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", xfeatures_mask_all, - fpu_kernel_xstate_size, + fpu_kernel_cfg.max_size, boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); return; out_disable: /* something went wrong, try to boot without any XSAVE support */ - fpu__init_disable_system_xstate(); + fpu__init_disable_system_xstate(legacy_size); } /* diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 379dbfa4f526..3d45eb04471b 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -31,7 +31,7 @@ extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void extern void fpu__init_cpu_xstate(void); -extern void fpu__init_system_xstate(void); +extern void fpu__init_system_xstate(unsigned int legacy_size); extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); -- Gitee From b87916d8fe79cc2244a805e4950939c37d3b0f84 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 01:09:35 +0200 Subject: [PATCH 0784/2161] x86/fpu: Move xstate feature masks to fpu_*_cfg commit opencloudos. Move the feature mask storage to the kernel and user config structs. Default and maximum feature set are the same for now. 
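To make the relationship between the four configuration fields concrete, a small stand-alone sketch follows; demo_state_config only mirrors the shape of fpu_state_config, and every size and mask in it is an example number rather than a real value.

#include <stdint.h>
#include <stdio.h>

/* Shape of the configuration data: maximum and default, for sizes and features. */
struct demo_state_config {
	unsigned int max_size;
	unsigned int default_size;
	uint64_t max_features;
	uint64_t default_features;
};

int main(void)
{
	uint64_t hw_features = 0xf;		/* everything the "CPU" reports */
	uint64_t user_supported = 0x7;		/* subset exposed to user space */
	struct demo_state_config kernel_cfg, user_cfg;

	kernel_cfg.max_features = hw_features;
	user_cfg.max_features = kernel_cfg.max_features & user_supported;

	/* "Default and maximum feature set are the same for now." */
	kernel_cfg.default_features = kernel_cfg.max_features;
	user_cfg.default_features = user_cfg.max_features;

	kernel_cfg.max_size = kernel_cfg.default_size = 2048;	/* example sizes */
	user_cfg.max_size = user_cfg.default_size = 1024;

	printf("kernel: features %#llx size %u; user: features %#llx size %u\n",
	       (unsigned long long)kernel_cfg.default_features, kernel_cfg.default_size,
	       (unsigned long long)user_cfg.default_features, user_cfg.default_size);
	return 0;
}

Once dynamically enabled features exist, default and maximum can diverge, which is the point of keeping both per configuration.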
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211014230739.352041752@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 10 +++--- arch/x86/kernel/fpu/core.c | 4 +-- arch/x86/kernel/fpu/init.c | 2 +- arch/x86/kernel/fpu/signal.c | 3 +- arch/x86/kernel/fpu/xstate.c | 57 ++++++++++++++++--------------- 5 files changed, 38 insertions(+), 38 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 489a1dcde16f..0edd3c3a5e4f 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -78,11 +78,9 @@ XFEATURE_MASK_INDEPENDENT | \ XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) -extern u64 xfeatures_mask_all; - static inline u64 xfeatures_mask_supervisor(void) { - return xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_SUPPORTED; + return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; } /* @@ -91,7 +89,7 @@ static inline u64 xfeatures_mask_supervisor(void) */ static inline u64 xfeatures_mask_uabi(void) { - return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED; + return fpu_kernel_cfg.max_features & XFEATURE_MASK_USER_SUPPORTED; } /* @@ -102,7 +100,7 @@ static inline u64 xfeatures_mask_uabi(void) */ static inline u64 xfeatures_mask_restore_user(void) { - return xfeatures_mask_all & XFEATURE_MASK_USER_RESTORE; + return fpu_kernel_cfg.max_features & XFEATURE_MASK_USER_RESTORE; } /* @@ -111,7 +109,7 @@ static inline u64 xfeatures_mask_restore_user(void) */ static inline u64 xfeatures_mask_fpstate(void) { - return xfeatures_mask_all & \ + return fpu_kernel_cfg.max_features & \ (XFEATURE_MASK_USER_RESTORE | XFEATURE_MASK_SUPERVISOR_SUPPORTED); } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b6459a3f995e..ec982743a47b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -349,8 +349,8 @@ void fpstate_reset(struct fpu *fpu) /* Initialize sizes and feature masks */ fpu->fpstate->size = fpu_kernel_cfg.default_size; fpu->fpstate->user_size = fpu_user_cfg.default_size; - fpu->fpstate->xfeatures = xfeatures_mask_all; - fpu->fpstate->user_xfeatures = xfeatures_mask_uabi(); + fpu->fpstate->xfeatures = fpu_kernel_cfg.default_features; + fpu->fpstate->user_xfeatures = fpu_user_cfg.default_features; } #if IS_ENABLED(CONFIG_KVM) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 58043ed08662..7074154131e6 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -211,7 +211,7 @@ static void __init fpu__init_init_fpstate(void) { /* Bring init_fpstate size and features up to date */ init_fpstate.size = fpu_kernel_cfg.max_size; - init_fpstate.xfeatures = xfeatures_mask_all; + init_fpstate.xfeatures = fpu_kernel_cfg.max_features; } /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index d5eb91f8433c..59f1f5ec7b8d 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -417,7 +417,8 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, u64 mask = user_xfeatures | xfeatures_mask_supervisor(); fpregs->xsave.header.xfeatures &= mask; - success = !os_xrstor_safe(&fpregs->xsave, xfeatures_mask_all); + success = !os_xrstor_safe(&fpregs->xsave, + fpu_kernel_cfg.max_features); } else { success = !fxrstor_safe(&fpregs->fxsave); } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a7c3cc42b017..7c0dd51effd9 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ 
b/arch/x86/kernel/fpu/xstate.c @@ -62,12 +62,6 @@ static short xsave_cpuid_features[] __initdata = { X86_FEATURE_ENQCMD, }; -/* - * This represents the full set of bits that should ever be set in a kernel - * XSAVE buffer, both supervisor and user xstates. - */ -u64 xfeatures_mask_all __ro_after_init; - static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = @@ -84,7 +78,7 @@ static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] __ro_after_init */ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) { - u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_all; + u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features; if (unlikely(feature_name)) { long xfeature_idx, max_idx; @@ -134,7 +128,7 @@ static bool xfeature_is_supervisor(int xfeature_nr) */ void fpu__init_cpu_xstate(void) { - if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all) + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features) return; cr4_set_bits(X86_CR4_OSXSAVE); @@ -144,7 +138,7 @@ void fpu__init_cpu_xstate(void) * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user * states can be set here. */ - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi()); + xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * MSR_IA32_XSS sets supervisor states managed by XSAVES. @@ -157,7 +151,7 @@ void fpu__init_cpu_xstate(void) static bool xfeature_enabled(enum xfeature xfeature) { - return xfeatures_mask_all & BIT_ULL(xfeature); + return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); } /* @@ -183,7 +177,7 @@ static void __init setup_xstate_features(void) xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); - for_each_extended_xfeature(i, xfeatures_mask_all) { + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); xstate_sizes[i] = eax; @@ -288,14 +282,14 @@ static void __init setup_xstate_comp_offsets(void) xmm_space); if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) { - for_each_extended_xfeature(i, xfeatures_mask_all) + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) xstate_comp_offsets[i] = xstate_offsets[i]; return; } next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for_each_extended_xfeature(i, xfeatures_mask_all) { + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { if (xfeature_is_aligned(i)) next_offset = ALIGN(next_offset, 64); @@ -319,7 +313,7 @@ static void __init setup_supervisor_only_offsets(void) next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for_each_extended_xfeature(i, xfeatures_mask_all) { + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { if (!xfeature_is_supervisor(i)) continue; @@ -338,7 +332,7 @@ static void __init print_xstate_offset_size(void) { int i; - for_each_extended_xfeature(i, xfeatures_mask_all) { + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, xstate_comp_offsets[i], i, xstate_sizes[i]); } @@ -398,7 +392,7 @@ static void __init setup_init_fpu_buf(void) setup_xstate_features(); print_xstate_features(); - xstate_init_xcomp_bv(&init_fpstate.regs.xsave, xfeatures_mask_all); + xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features); /* * Init all the features state with header.xfeatures being 0x0 @@ -567,7 +561,7 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) 
unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; - for_each_extended_xfeature(i, xfeatures_mask_all) { + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { if (!check_xstate_against_struct(i)) return false; /* @@ -721,7 +715,7 @@ static int __init init_xstate_size(void) */ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { - xfeatures_mask_all = 0; + fpu_kernel_cfg.max_features = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); @@ -765,13 +759,13 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) * Find user xstates supported by the processor. */ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xfeatures_mask_all = eax + ((u64)edx << 32); + fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); /* * Find supervisor xstates supported by the processor. */ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); - xfeatures_mask_all |= ecx + ((u64)edx << 32); + fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); if ((xfeatures_mask_uabi() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* @@ -780,7 +774,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) * booting without it. This is too early to BUG(). */ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", - xfeatures_mask_all); + fpu_kernel_cfg.max_features); goto out_disable; } @@ -789,14 +783,21 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { if (!boot_cpu_has(xsave_cpuid_features[i])) - xfeatures_mask_all &= ~BIT_ULL(i); + fpu_kernel_cfg.max_features &= ~BIT_ULL(i); } - xfeatures_mask_all &= XFEATURE_MASK_USER_SUPPORTED | + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED; + fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; + fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; + + /* Identical for now */ + fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; + fpu_user_cfg.default_features = fpu_user_cfg.max_features; + /* Store it for paranoia check at the end */ - xfeatures = xfeatures_mask_all; + xfeatures = fpu_kernel_cfg.max_features; /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); @@ -822,15 +823,15 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) * Paranoia check whether something in the setup modified the * xfeatures mask. */ - if (xfeatures != xfeatures_mask_all) { + if (xfeatures != fpu_kernel_cfg.max_features) { pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", - xfeatures, xfeatures_mask_all); + xfeatures, fpu_kernel_cfg.max_features); goto out_disable; } print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", - xfeatures_mask_all, + fpu_kernel_cfg.max_features, fpu_kernel_cfg.max_size, boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); return; @@ -905,7 +906,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) * We should not ever be requesting features that we * have not enabled. 
*/ - WARN_ONCE(!(xfeatures_mask_all & BIT_ULL(xfeature_nr)), + WARN_ONCE(!(fpu_kernel_cfg.max_features & BIT_ULL(xfeature_nr)), "get of unsupported state"); /* * This assumes the last 'xsave*' instruction to -- Gitee From 2216fbd6cd2257b3305cb25e6f33e83d7471bd7f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 01:09:37 +0200 Subject: [PATCH 0785/2161] x86/fpu: Mop up xfeatures_mask_uabi() commit daddee24731938781b7876d20335ea3754d23484 upstream. Use the new fpu_user_cfg to retrieve the information instead of xfeatures_mask_uabi() which will be no longer correct when dynamically enabled features become available. Using fpu_user_cfg is appropriate when setting XCOMP_BV in the init_fpstate since it has space allocated for "max_features". But, normal fpstates might only have space for default xfeatures. Since XRSTOR* derives the format of the XSAVE buffer from XCOMP_BV, this can lead to XRSTOR reading out of bounds. So when copying actively used fpstate, simply read the XCOMP_BV features bits directly out of the fpstate instead. This correction courtesy of Dave Hansen Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211014230739.408879849@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 9 --------- arch/x86/kernel/fpu/core.c | 4 ++-- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/fpu/xstate.c | 6 +++--- 4 files changed, 6 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 0edd3c3a5e4f..9181b11e9ec5 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -83,15 +83,6 @@ static inline u64 xfeatures_mask_supervisor(void) return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; } -/* - * The xfeatures which are enabled in XCR0 and expected to be in ptrace - * buffers and signal frames. - */ -static inline u64 xfeatures_mask_uabi(void) -{ - return fpu_kernel_cfg.max_features & XFEATURE_MASK_USER_SUPPORTED; -} - /* * The xfeatures which are restored by the kernel when returning to user * mode. 
This is not necessarily the same as xfeatures_mask_uabi() as the diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index ec982743a47b..b10b6c820fe6 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -237,7 +237,7 @@ int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, } /* Ensure that XCOMP_BV is set up for XSAVES */ - xstate_init_xcomp_bv(&kstate->regs.xsave, xfeatures_mask_uabi()); + xstate_init_xcomp_bv(&kstate->regs.xsave, kstate->xfeatures); return 0; } EXPORT_SYMBOL_GPL(fpu_copy_kvm_uabi_to_fpstate); @@ -333,7 +333,7 @@ void fpstate_init_user(struct fpstate *fpstate) return; } - xstate_init_xcomp_bv(&fpstate->regs.xsave, xfeatures_mask_uabi()); + xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures); if (cpu_feature_enabled(X86_FEATURE_FXSR)) fpstate_init_fxstate(fpstate); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 59f1f5ec7b8d..fcdd25fe9262 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -536,7 +536,7 @@ void __init fpu__init_prepare_fx_sw_frame(void) fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; - fx_sw_reserved.xfeatures = xfeatures_mask_uabi(); + fx_sw_reserved.xfeatures = fpu_user_cfg.default_features; fx_sw_reserved.xstate_size = fpu_user_cfg.default_size; if (IS_ENABLED(CONFIG_IA32_EMULATION) || diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 7c0dd51effd9..89cfdf19fdd9 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -767,7 +767,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); - if ((xfeatures_mask_uabi() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue @@ -812,7 +812,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) * supervisor xstates: */ update_regset_xstate_info(fpu_user_cfg.max_size, - xfeatures_mask_uabi()); + fpu_user_cfg.max_features); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); @@ -850,7 +850,7 @@ void fpu__resume_cpu(void) * Restore XCR0 on xsave capable CPUs: */ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi()); + xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * Restore IA32_XSS. The same CPUID bit enumerates support -- Gitee From e6a7a4abb102261020e9d56056e754f1be57b425 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 01:09:38 +0200 Subject: [PATCH 0786/2161] x86/fpu: Rework restore_regs_from_fpstate() commit eda32f4f93b452c5fe3c352523e7f7cc085c8205 upstream. xfeatures_mask_fpstate() is no longer valid when dynamically enabled features come into play. Rework restore_regs_from_fpstate() so it takes a constant mask which will then be applied against the maximum feature set so that the restore operation brings all features which are not in the xsave buffer xfeature bitmap into init state. This ensures that if the previous task used a dynamically enabled feature that the task which restores has all unused components properly initialized. Cleanup the last user of xfeatures_mask_fpstate() as well and remove it. 
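The masking logic can be sketched in isolation: the constant request is first clamped to the maximum feature set, and every component that is requested but absent from the incoming buffer's xfeature bitmap is taken back to init state by XRSTOR, so leftovers from a previous task cannot survive. The bit values below are examples only, not real xfeature assignments.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t max_features = 0x1f;		/* all features the kernel manages */
	uint64_t requested = 0x0f;		/* constant restore request */
	uint64_t buffer_xfeatures = 0x05;	/* what the incoming buffer holds */

	/* Clamp the request to what can actually be enabled. */
	uint64_t effective = max_features & requested;

	/*
	 * Components requested but not present in the buffer's bitmap are
	 * initialized by the restore rather than left with stale contents.
	 */
	uint64_t reinit = effective & ~buffer_xfeatures;

	printf("effective restore mask %#llx, re-initialized components %#llx\n",
	       (unsigned long long)effective, (unsigned long long)reinit);
	return 0;
}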
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211014230739.461348278@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 27 +++++++-------------------- arch/x86/kernel/fpu/context.h | 6 +----- arch/x86/kernel/fpu/core.c | 17 ++++++++++++++--- arch/x86/kernel/fpu/xstate.c | 2 +- 4 files changed, 23 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 9181b11e9ec5..bf7962b7fe78 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -78,30 +78,17 @@ XFEATURE_MASK_INDEPENDENT | \ XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) -static inline u64 xfeatures_mask_supervisor(void) -{ - return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; -} - /* - * The xfeatures which are restored by the kernel when returning to user - * mode. This is not necessarily the same as xfeatures_mask_uabi() as the - * kernel does not manage all XCR0 enabled features via xsave/xrstor as - * some of them have to be switched eagerly on context switch and exec(). + * The feature mask required to restore FPU state: + * - All user states which are not eagerly switched in switch_to()/exec() + * - The suporvisor states */ -static inline u64 xfeatures_mask_restore_user(void) -{ - return fpu_kernel_cfg.max_features & XFEATURE_MASK_USER_RESTORE; -} +#define XFEATURE_MASK_FPSTATE (XFEATURE_MASK_USER_RESTORE | \ + XFEATURE_MASK_SUPERVISOR_SUPPORTED) -/* - * Like xfeatures_mask_restore_user() but additionally restors the - * supported supervisor states. - */ -static inline u64 xfeatures_mask_fpstate(void) +static inline u64 xfeatures_mask_supervisor(void) { - return fpu_kernel_cfg.max_features & \ - (XFEATURE_MASK_USER_RESTORE | XFEATURE_MASK_SUPERVISOR_SUPPORTED); + return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; } static inline u64 xfeatures_mask_independent(void) diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h index f8f510519688..a06ebf315d83 100644 --- a/arch/x86/kernel/fpu/context.h +++ b/arch/x86/kernel/fpu/context.h @@ -61,8 +61,6 @@ static inline void fpregs_restore_userregs(void) return; if (!fpregs_state_valid(fpu, cpu)) { - u64 mask; - /* * This restores _all_ xstate which has not been * established yet. @@ -72,9 +70,7 @@ static inline void fpregs_restore_userregs(void) * flush_thread(). So it is excluded because it might be * not up to date in current->thread.fpu.xsave state. */ - mask = xfeatures_mask_restore_user() | - xfeatures_mask_supervisor(); - restore_fpregs_from_fpstate(fpu->fpstate, mask); + restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE); fpregs_activate(fpu); fpu->last_cpu = cpu; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b10b6c820fe6..82e21e59c88b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -150,6 +150,17 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) } if (use_xsave()) { + /* + * Restoring state always needs to modify all features + * which are in @mask even if the current task cannot use + * extended features. + * + * So fpstate->xfeatures cannot be used here, because then + * a feature for which the task has no permission but was + * used by the previous task would not go into init state. 
+ */ + mask = fpu_kernel_cfg.max_features & mask; + os_xrstor(&fpstate->regs.xsave, mask); } else { if (use_fxsr()) @@ -161,7 +172,7 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) void fpu_reset_from_exception_fixup(void) { - restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); + restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE); } #if IS_ENABLED(CONFIG_KVM) @@ -179,7 +190,7 @@ void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) } if (rstor) { - restore_mask &= xfeatures_mask_fpstate(); + restore_mask &= XFEATURE_MASK_FPSTATE; restore_fpregs_from_fpstate(rstor->fpstate, restore_mask); } @@ -518,7 +529,7 @@ void fpu__clear_user_states(struct fpu *fpu) } /* Reset user states in registers. */ - restore_fpregs_from_init_fpstate(xfeatures_mask_restore_user()); + restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE); /* * Now all FPU registers have their desired values. Inform the FPU diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 89cfdf19fdd9..064769de856c 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -344,7 +344,7 @@ static void __init print_xstate_offset_size(void) */ static __init void os_xrstor_booting(struct xregs_state *xstate) { - u64 mask = xfeatures_mask_fpstate(); + u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE; u32 lmask = mask; u32 hmask = mask >> 32; int err; -- Gitee From 1790fa36477992e0a8112de554f63bdf7d0a2758 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 01:09:40 +0200 Subject: [PATCH 0787/2161] x86/fpu/xstate: Move remaining xfeature helpers to core commit d72c87018d00782c3ac0a844c372158087debc0a upstream. Now that everything is mopped up, move all the helpers and prototypes into the core header. They are not required by the outside. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211014230739.514095101@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 13 ------------- arch/x86/kernel/fpu/xstate.h | 13 +++++++++++++ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index bf7962b7fe78..ea2bb7f99e57 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -86,19 +86,6 @@ #define XFEATURE_MASK_FPSTATE (XFEATURE_MASK_USER_RESTORE | \ XFEATURE_MASK_SUPERVISOR_SUPPORTED) -static inline u64 xfeatures_mask_supervisor(void) -{ - return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; -} - -static inline u64 xfeatures_mask_independent(void) -{ - if (!boot_cpu_has(X86_FEATURE_ARCH_LBR)) - return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; - - return XFEATURE_MASK_INDEPENDENT; -} - extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 3d45eb04471b..a1aa0bad2c9c 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -35,6 +35,19 @@ extern void fpu__init_system_xstate(unsigned int legacy_size); extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); +static inline u64 xfeatures_mask_supervisor(void) +{ + return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; +} + +static inline u64 xfeatures_mask_independent(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) + return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; + + return XFEATURE_MASK_INDEPENDENT; +} + /* XSAVE/XRSTOR wrapper functions */ #ifdef CONFIG_X86_64 -- Gitee From edf69121e4d044df7ef5feed3ed2182246dc50d7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Oct 2021 20:55:49 +0200 Subject: [PATCH 0788/2161] x86/fpu: Prepare for sanitizing KVM FPU code commit 75c52dad5e327605f1025f399dafdf4aaf5dae9c upstream. For the upcoming AMX support it's necessary to do a proper integration with KVM. To avoid more nasty hackery in KVM which violate encapsulation extend struct fpu and fpstate so the fpstate switching can be consolidated and simplified. Currently KVM allocates two FPU structs which are used for saving the user state of the vCPU thread and restoring the guest state when entering vcpu_run() and doing the reverse operation before leaving vcpu_run(). With the new fpstate mechanism this can be reduced to one extra buffer by swapping the fpstate pointer in current::thread::fpu. This makes the upcoming support for AMX and XFD simpler because then fpstate information (features, sizes, xfd) are always consistent and it does not require any nasty workarounds. Add fpu::__task_fpstate to save the regular fpstate pointer while the task is inside vcpu_run(). Add some state fields to fpstate to indicate the nature of the state. 
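[ Illustration only, not part of the patch: a compressed, compilable
  model of the new indicator bits, to keep them apart at a glance. The
  field names follow the patch; the surrounding types and the helper
  are made up for the example. ]

    #include <stdbool.h>

    struct fpstate_model {
            unsigned int is_valloc       : 1; /* buffer was dynamically allocated  */
            unsigned int is_guest        : 1; /* guest (KVM) state                 */
            unsigned int is_confidential : 1; /* firmware-managed, nothing to save */
            unsigned int in_use          : 1; /* currently installed in a task     */
    };

    struct fpu_guest_model {
            struct fpstate_model *fpstate;    /* pointer to the allocated guest state */
    };

    /* Confidential guest state is never saved or restored by the kernel */
    static bool needs_fpregs_swap(const struct fpstate_model *fps)
    {
            return !fps->is_confidential;
    }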
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211022185312.896403942@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/types.h | 44 +++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 7d8b173f7105..63e9ca544334 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -302,8 +302,32 @@ struct fpstate { /* @user_xfeatures: xfeatures valid in UABI buffers */ u64 user_xfeatures; + /* @is_valloc: Indicator for dynamically allocated state */ + unsigned int is_valloc : 1; + + /* @is_guest: Indicator for guest state (KVM) */ + unsigned int is_guest : 1; + + /* + * @is_confidential: Indicator for KVM confidential mode. + * The FPU registers are restored by the + * vmentry firmware from encrypted guest + * memory. On vmexit the FPU registers are + * saved by firmware to encrypted guest memory + * and the registers are scrubbed before + * returning to the host. So there is no + * content which is worth saving and restoring. + * The fpstate has to be there so that + * preemption and softirq FPU usage works + * without special casing. + */ + unsigned int is_confidential : 1; + + /* @in_use: State is in use */ + unsigned int in_use : 1; + /* @regs: The register state union for all supported formats */ - union fpregs_state regs; + union fpregs_state regs; /* @regs is dynamically sized! Don't add anything after @regs! */ } __aligned(64); @@ -343,6 +367,14 @@ struct fpu { */ struct fpstate *fpstate; + /* + * @__task_fpstate: + * + * Pointer to an inactive struct fpstate. Initialized to NULL. Is + * used only for KVM support to swap out the regular task fpstate. + */ + struct fpstate *__task_fpstate; + /* * @__fpstate: * @@ -358,6 +390,16 @@ struct fpu { */ }; +/* + * Guest pseudo FPU container + */ +struct fpu_guest { + /* + * @fpstate: Pointer to the allocated guest fpstate + */ + struct fpstate *fpstate; +}; + /* * FPU state configuration data. Initialized at boot time. Read only after init. */ -- Gitee From caf0e6de0f4ef6f36931b140083d6b1d3f2245be Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Oct 2021 20:55:51 +0200 Subject: [PATCH 0789/2161] x86/fpu: Provide infrastructure for KVM FPU cleanup commit 69f6ed1d14c6bcf712f4bb22a231c15eeab401e7 upstream. For the upcoming AMX support it's necessary to do a proper integration with KVM. Currently KVM allocates two FPU structs which are used for saving the user state of the vCPU thread and restoring the guest state when entering vcpu_run() and doing the reverse operation before leaving vcpu_run(). With the new fpstate mechanism this can be reduced to one extra buffer by swapping the fpstate pointer in current::thread::fpu. This makes the upcoming support for AMX and XFD simpler because then fpstate information (features, sizes, xfd) are always consistent and it does not require any nasty workarounds. Provide: - An allocator which initializes the state properly - A replacement for the existing FPU swap mechanim Aside of the reduced memory footprint, this also makes state switching more efficient when TIF_FPU_NEED_LOAD is set. It does not require a memcpy as the state is already correct in the to be swapped out fpstate. The existing interfaces will be removed once KVM is converted over. 
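[ Usage sketch, not part of the patch: the calling convention the new
  interface is aimed at, modelled on the KVM conversion later in this
  series. enter_guest() is a stand-in for the real run loop and all
  error handling is trimmed. ]

    /* vCPU creation: one guest fpstate buffer per vCPU */
    if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu))
            return -ENOMEM;

    /* Entering vcpu_run(): install the guest buffer in current->thread.fpu */
    fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);

    enter_guest(vcpu);              /* stand-in for the actual run loop */

    /* Leaving vcpu_run(): put the task's own fpstate back */
    fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);

    /* vCPU destruction */
    fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);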
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211022185312.954684740@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 13 ++++++ arch/x86/kernel/fpu/core.c | 85 +++++++++++++++++++++++++++++++--- 2 files changed, 92 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 462f408f2e92..67453bf798ae 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -123,9 +123,22 @@ extern void fpu_init_fpstate_user(struct fpu *fpu); extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature); /* KVM specific functions */ +extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu); +extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu); +extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest); extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask); extern int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, u32 *pkru); extern void fpu_copy_fpstate_to_kvm_uabi(struct fpu *fpu, void *buf, unsigned int size, u32 pkru); +static inline void fpstate_set_confidential(struct fpu_guest *gfpu) +{ + gfpu->fpstate->is_confidential = true; +} + +static inline bool fpstate_is_confidential(struct fpu_guest *gfpu) +{ + return gfpu->fpstate->is_confidential; +} + #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 82e21e59c88b..25649202adfc 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -176,6 +176,75 @@ void fpu_reset_from_exception_fixup(void) } #if IS_ENABLED(CONFIG_KVM) +static void __fpstate_reset(struct fpstate *fpstate); + +bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) +{ + struct fpstate *fpstate; + unsigned int size; + + size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); + fpstate = vzalloc(size); + if (!fpstate) + return false; + + __fpstate_reset(fpstate); + fpstate_init_user(fpstate); + fpstate->is_valloc = true; + fpstate->is_guest = true; + + gfpu->fpstate = fpstate; + return true; +} +EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); + +void fpu_free_guest_fpstate(struct fpu_guest *gfpu) +{ + struct fpstate *fps = gfpu->fpstate; + + if (!fps) + return; + + if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use)) + return; + + gfpu->fpstate = NULL; + vfree(fps); +} +EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); + +int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) +{ + struct fpstate *guest_fps = guest_fpu->fpstate; + struct fpu *fpu = ¤t->thread.fpu; + struct fpstate *cur_fps = fpu->fpstate; + + fpregs_lock(); + if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) + save_fpregs_to_fpstate(fpu); + + /* Swap fpstate */ + if (enter_guest) { + fpu->__task_fpstate = cur_fps; + fpu->fpstate = guest_fps; + guest_fps->in_use = true; + } else { + guest_fps->in_use = false; + fpu->fpstate = fpu->__task_fpstate; + fpu->__task_fpstate = NULL; + } + + cur_fps = fpu->fpstate; + + if (!cur_fps->is_confidential) + restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); + + fpregs_mark_activate(); + fpregs_unlock(); + return 0; +} +EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); + void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) { fpregs_lock(); @@ -352,16 +421,20 @@ void fpstate_init_user(struct fpstate *fpstate) fpstate_init_fstate(fpstate); } +static 
void __fpstate_reset(struct fpstate *fpstate) +{ + /* Initialize sizes and feature masks */ + fpstate->size = fpu_kernel_cfg.default_size; + fpstate->user_size = fpu_user_cfg.default_size; + fpstate->xfeatures = fpu_kernel_cfg.default_features; + fpstate->user_xfeatures = fpu_user_cfg.default_features; +} + void fpstate_reset(struct fpu *fpu) { /* Set the fpstate pointer to the default fpstate */ fpu->fpstate = &fpu->__fpstate; - - /* Initialize sizes and feature masks */ - fpu->fpstate->size = fpu_kernel_cfg.default_size; - fpu->fpstate->user_size = fpu_user_cfg.default_size; - fpu->fpstate->xfeatures = fpu_kernel_cfg.default_features; - fpu->fpstate->user_xfeatures = fpu_user_cfg.default_features; + __fpstate_reset(fpu->fpstate); } #if IS_ENABLED(CONFIG_KVM) -- Gitee From ad129ec19433e8063eff655ca5d1f3cf38249fe5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 8 Jun 2020 11:02:18 -0700 Subject: [PATCH 0790/2161] KVM: x86: Unexport x86_fpu_cache and make it static commit 80fbd280beea1bcc298660f9024cbc4bf392c7f7 upstream. Make x86_fpu_cache static now that FPU allocation and destruction is handled entirely by common x86 code. Signed-off-by: Sean Christopherson Message-Id: <20200608180218.20946-1-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/x86.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c42af654b8cb..41058cf669b3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1231,7 +1231,6 @@ struct kvm_arch_async_pf { }; extern struct kvm_x86_ops *kvm_x86_ops; -extern struct kmem_cache *x86_fpu_cache; #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 260172cd7f70..74141f49d3d3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -227,8 +227,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { u64 __read_mostly host_xcr0; -struct kmem_cache *x86_fpu_cache; -EXPORT_SYMBOL_GPL(x86_fpu_cache); +static struct kmem_cache *x86_fpu_cache; static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); -- Gitee From 887081e86b4f4ef6bf3073ffaa1e1e96fe42c880 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 24 Dec 2021 17:15:26 +0800 Subject: [PATCH 0791/2161] x86/kvm: Convert FPU handling to a single swap buffer commit d69c1382e1b73a0496a70872a035ca2b22d074e5 upstream. For the upcoming AMX support it's necessary to do a proper integration with KVM. Currently KVM allocates two FPU structs which are used for saving the user state of the vCPU thread and restoring the guest state when entering vcpu_run() and doing the reverse operation before leaving vcpu_run(). With the new fpstate mechanism this can be reduced to one extra buffer by swapping the fpstate pointer in current::thread::fpu. This makes the upcoming support for AMX and XFD simpler because then fpstate information (features, sizes, xfd) are always consistent and it does not require any nasty workarounds. Convert the KVM FPU code over to this new scheme. 
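[ Illustration only, not part of the patch: with this conversion,
  "protected guest state" is expressed through the confidential flag
  rather than a NULL guest_fpu pointer. The helper below and its call
  site are hypothetical; only fpstate_set_confidential() is real. ]

    /* Hypothetical setup-path helper for an encrypted guest */
    static void mark_vcpu_fpu_protected(struct kvm_vcpu *vcpu)
    {
            /*
             * From here on, kvm_load/put_guest_fpu() and the
             * KVM_{GET,SET}_XSAVE paths skip the register content,
             * as the hunks below show.
             */
            fpstate_set_confidential(&vcpu->arch.guest_fpu);
    }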
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211022185313.019454292@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 4 +- arch/x86/include/asm/kvm_host.h | 7 +--- arch/x86/kernel/fpu/core.c | 16 ++++---- arch/x86/kvm/svm.c | 26 +++---------- arch/x86/kvm/vmx/vmx.c | 21 ++-------- arch/x86/kvm/x86.c | 68 ++++++++++----------------------- 6 files changed, 42 insertions(+), 100 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 67453bf798ae..280f2c25df82 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -128,8 +128,8 @@ extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu); extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest); extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask); -extern int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, u32 *pkru); -extern void fpu_copy_fpstate_to_kvm_uabi(struct fpu *fpu, void *buf, unsigned int size, u32 pkru); +extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru); +extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru); static inline void fpstate_set_confidential(struct fpu_guest *gfpu) { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 41058cf669b3..d77b82f016cc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -618,11 +618,10 @@ struct kvm_vcpu_arch { * * Note that while the PKRU state lives inside the fpu registers, * it is switched out separately at VMENTER and VMEXIT time. The - * "guest_fpu" state here contains the guest FPU context, with the + * "guest_fpstate" state here contains the guest FPU context, with the * host PRKU bits. 
*/ - struct fpu *user_fpu; - struct fpu *guest_fpu; + struct fpu_guest guest_fpu; u64 xcr0; u64 guest_supported_xcr0; @@ -1381,8 +1380,6 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); -void kvm_free_guest_fpu(struct kvm_vcpu *vcpu); - int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 25649202adfc..fbe8de14aa6a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -268,10 +268,10 @@ void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) } EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpu); -void fpu_copy_fpstate_to_kvm_uabi(struct fpu *fpu, void *buf, - unsigned int size, u32 pkru) +void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, + unsigned int size, u32 pkru) { - struct fpstate *kstate = fpu->fpstate; + struct fpstate *kstate = gfpu->fpstate; union fpregs_state *ustate = buf; struct membuf mb = { .p = buf, .left = size }; @@ -284,12 +284,12 @@ void fpu_copy_fpstate_to_kvm_uabi(struct fpu *fpu, void *buf, ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; } } -EXPORT_SYMBOL_GPL(fpu_copy_fpstate_to_kvm_uabi); +EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi); -int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, - u32 *vpkru) +int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, + u64 xcr0, u32 *vpkru) { - struct fpstate *kstate = fpu->fpstate; + struct fpstate *kstate = gfpu->fpstate; const union fpregs_state *ustate = buf; struct pkru_state *xpkru; int ret; @@ -320,7 +320,7 @@ int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, xstate_init_xcomp_bv(&kstate->regs.xsave, kstate->xfeatures); return 0; } -EXPORT_SYMBOL_GPL(fpu_copy_kvm_uabi_to_fpstate); +EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); #endif /* CONFIG_KVM */ void kernel_fpu_begin_mask(unsigned int kfpu_mask) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 291fcca1fee4..f5a234a19801 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2219,25 +2219,14 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto out; } - svm->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!svm->vcpu.arch.user_fpu) { - printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); - err = -ENOMEM; + if (!fpu_alloc_guest_fpstate(&svm->vcpu.arch.guest_fpu)) { + pr_err("kvm: failed to allocate vcpu's fpu\n"); goto free_partial_svm; } - svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!svm->vcpu.arch.guest_fpu) { - printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); - err = -ENOMEM; - goto free_user_fpu; - } - err = kvm_vcpu_init(&svm->vcpu, kvm, id); if (err) - goto free_svm; + goto free_guest_fpu; err = -ENOMEM; page = alloc_page(GFP_KERNEL_ACCOUNT); @@ -2293,10 +2282,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) __free_page(page); uninit: kvm_vcpu_uninit(&svm->vcpu); -free_svm: - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); -free_user_fpu: - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu); +free_guest_fpu: + fpu_free_guest_fpstate(&svm->vcpu.arch.guest_fpu); free_partial_svm: 
kmem_cache_free(kvm_vcpu_cache, svm); out: @@ -2327,8 +2314,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu); - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); + fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); kmem_cache_free(kvm_vcpu_cache, svm); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 66f4e50235c0..3e77ace23118 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6681,8 +6681,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); - kvm_free_guest_fpu(&vmx->vcpu); + fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); kmem_cache_free(kvm_vcpu_cache, vmx); } @@ -6700,20 +6699,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx) return ERR_PTR(-ENOMEM); - vmx->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vmx->vcpu.arch.user_fpu) { - printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); - err = -ENOMEM; - goto free_partial_vcpu; - } - - vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vmx->vcpu.arch.guest_fpu) { + if (!fpu_alloc_guest_fpstate(&vmx->vcpu.arch.guest_fpu)) { printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); err = -ENOMEM; - goto free_user_fpu; + goto free_partial_vcpu; } vmx->vpid = allocate_vpid(); @@ -6814,9 +6803,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: free_vpid(vmx->vpid); - kvm_free_guest_fpu(&vmx->vcpu); -free_user_fpu: - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); + fpu_free_guest_fpstate(&vmx->vcpu.arch.guest_fpu); free_partial_vcpu: kmem_cache_free(kvm_vcpu_cache, vmx); return ERR_PTR(err); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 74141f49d3d3..0fbb74276e51 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -227,8 +227,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { u64 __read_mostly host_xcr0; -static struct kmem_cache *x86_fpu_cache; - static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) @@ -4020,23 +4018,24 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return; - fpu_copy_fpstate_to_kvm_uabi(vcpu->arch.guest_fpu, guest_xsave->region, - sizeof(guest_xsave->region), - vcpu->arch.pkru); + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, + guest_xsave->region, + sizeof(guest_xsave->region), + vcpu->arch.pkru); } static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return 0; - return fpu_copy_kvm_uabi_to_fpstate(vcpu->arch.guest_fpu, - guest_xsave->region, - kvm_supported_xcr0(), &vcpu->arch.pkru); + return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, + guest_xsave->region, + kvm_supported_xcr0(), &vcpu->arch.pkru); } static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, @@ -7253,18 +7252,11 @@ int 
kvm_arch_init(void *opaque) } r = -ENOMEM; - x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), - __alignof__(struct fpu), SLAB_ACCOUNT, - NULL); - if (!x86_fpu_cache) { - printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n"); - goto out; - } shared_msrs = alloc_percpu(struct kvm_shared_msrs); if (!shared_msrs) { printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); - goto out_free_x86_fpu_cache; + goto out; } r = kvm_mmu_module_init(); @@ -7297,8 +7289,6 @@ int kvm_arch_init(void *opaque) out_free_percpu: free_percpu(shared_msrs); -out_free_x86_fpu_cache: - kmem_cache_destroy(x86_fpu_cache); out: return r; } @@ -7322,7 +7312,6 @@ void kvm_arch_exit(void) kvm_x86_ops = NULL; kvm_mmu_module_exit(); free_percpu(shared_msrs); - kmem_cache_destroy(x86_fpu_cache); } int kvm_vcpu_halt(struct kvm_vcpu *vcpu) @@ -8496,23 +8485,17 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { /* - * Guests with protected state have guest_fpu == NULL which makes - * the swap only save the host state. Exclude PKRU from restore as - * it is restored separately in kvm_x86_ops->run(). + * Exclude PKRU from restore as restored separately in + * kvm_x86_ops.run(). */ - fpu_swap_kvm_fpu(vcpu->arch.user_fpu, vcpu->arch.guest_fpu, - ~XFEATURE_MASK_PKRU); + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); trace_kvm_fpu(1); } /* When vcpu_run ends, restore user space FPU context. */ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - /* - * Guests with protected state have guest_fpu == NULL which makes - * swap only restore the host state. - */ - fpu_swap_kvm_fpu(vcpu->arch.guest_fpu, vcpu->arch.user_fpu, ~0ULL); + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false); ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); } @@ -9006,12 +8989,12 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return 0; vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->fpstate->regs.fxsave; + fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; fpu->fsw = fxsave->swd; @@ -9029,12 +9012,12 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return 0; vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->fpstate->regs.fxsave; + fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -9098,15 +9081,6 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } -void kvm_free_guest_fpu(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.guest_fpu) { - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); - vcpu->arch.guest_fpu = NULL; - } -} -EXPORT_SYMBOL_GPL(kvm_free_guest_fpu); - void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; @@ -9214,8 +9188,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - if (vcpu->arch.guest_fpu && kvm_mpx_supported()) { - struct fpstate *fpstate = vcpu->arch.guest_fpu->fpstate; + if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) { + struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; /* * To avoid have the INIT path from kvm_apic_has_events() that be @@ -9455,8 
+9429,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_mce_banks; } - fpu_init_fpstate_user(vcpu->arch.user_fpu); - fpu_init_fpstate_user(vcpu->arch.guest_fpu); fx_init(vcpu); vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; -- Gitee From d9af9a7144a8b52bc23d43548705087d64777cda Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Oct 2021 20:55:54 +0200 Subject: [PATCH 0792/2161] x86/fpu: Remove old KVM FPU interface commit 582b01b6ab2714a0a4d554cea7f0d4efeaa2154d upstream. No more users. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211022185313.074853631@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 2 -- arch/x86/kernel/fpu/core.c | 32 -------------------------------- 2 files changed, 34 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 280f2c25df82..e6b418515c71 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -119,14 +119,12 @@ static inline void fpstate_init_soft(struct swregs_state *soft) {} DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* fpstate-related functions which are exported to KVM */ -extern void fpu_init_fpstate_user(struct fpu *fpu); extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature); /* KVM specific functions */ extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu); extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu); extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest); -extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask); extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru); extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index fbe8de14aa6a..622c2ad0a510 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -245,29 +245,6 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) } EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); -void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) -{ - fpregs_lock(); - - if (save) { - struct fpstate *fpcur = current->thread.fpu.fpstate; - - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - memcpy(&save->fpstate->regs, &fpcur->regs, fpcur->size); - else - save_fpregs_to_fpstate(save); - } - - if (rstor) { - restore_mask &= XFEATURE_MASK_FPSTATE; - restore_fpregs_from_fpstate(rstor->fpstate, restore_mask); - } - - fpregs_mark_activate(); - fpregs_unlock(); -} -EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpu); - void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru) { @@ -437,15 +414,6 @@ void fpstate_reset(struct fpu *fpu) __fpstate_reset(fpu->fpstate); } -#if IS_ENABLED(CONFIG_KVM) -void fpu_init_fpstate_user(struct fpu *fpu) -{ - fpstate_reset(fpu); - fpstate_init_user(fpu->fpstate); -} -EXPORT_SYMBOL_GPL(fpu_init_fpstate_user); -#endif - /* Clone current's FPU state on fork */ int fpu_clone(struct task_struct *dst) { -- Gitee From 9441bad32e14b77e0f90cb36021002dab71406db Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Oct 2021 15:55:05 -0700 Subject: [PATCH 0793/2161] signal: Add an optional check for altstack size commit 1bdda24c4af64cd2d65dec5192ab624c5fee7ca0 upstream. 
New x86 FPU features will be very large, requiring ~10k of stack in signal handlers. These new features require a new approach called "dynamic features". The kernel currently tries to ensure that altstacks are reasonably sized. Right now, on x86, sys_sigaltstack() requires a size of >=2k. However, that 2k is a constant. Simply raising that 2k requirement to >10k for the new features would break existing apps which have a compiled-in size of 2k. Instead of universally enforcing a larger stack, prohibit a process from using dynamic features without properly-sized altstacks. This must be enforced in two places: * A dynamic feature can not be enabled without an large-enough altstack for each process thread. * Once a dynamic feature is enabled, any request to install a too-small altstack will be rejected The dynamic feature enabling code must examine each thread in a process to ensure that the altstacks are large enough. Add a new lock (sigaltstack_lock()) to ensure that threads can not race and change their altstack after being examined. Add the infrastructure in form of a config option and provide empty stubs for architectures which do not need dynamic altstack size checks. This implementation will be fleshed out for x86 in a future patch called x86/arch_prctl: Add controls for dynamic XSTATE components [dhansen: commit message. ] Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-2-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/Kconfig | 3 +++ include/linux/signal.h | 6 ++++++ kernel/signal.c | 35 +++++++++++++++++++++++++++++------ 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index a8df66e64544..b4c9475c053a 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -983,6 +983,9 @@ config RELR config ARCH_HAS_MEM_ENCRYPT bool +config DYNAMIC_SIGFRAME + bool + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/include/linux/signal.h b/include/linux/signal.h index 1a5f88316b08..2123902d60ff 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -454,6 +454,12 @@ int __save_altstack(stack_t __user *, unsigned long); sas_ss_reset(t); \ } while (0); +#ifdef CONFIG_DYNAMIC_SIGFRAME +bool sigaltstack_size_valid(size_t ss_size); +#else +static inline bool sigaltstack_size_valid(size_t size) { return true; } +#endif /* !CONFIG_DYNAMIC_SIGFRAME */ + #ifdef CONFIG_PROC_FS struct seq_file; extern void render_sigset_t(struct seq_file *, const char *, sigset_t *); diff --git a/kernel/signal.c b/kernel/signal.c index 8c97fc72d78b..0227c3a62345 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4001,11 +4001,29 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) return 0; } +#ifdef CONFIG_DYNAMIC_SIGFRAME +static inline void sigaltstack_lock(void) + __acquires(¤t->sighand->siglock) +{ + spin_lock_irq(¤t->sighand->siglock); +} + +static inline void sigaltstack_unlock(void) + __releases(¤t->sighand->siglock) +{ + spin_unlock_irq(¤t->sighand->siglock); +} +#else +static inline void sigaltstack_lock(void) { } +static inline void sigaltstack_unlock(void) { } +#endif + static int do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp, size_t min_ss_size) { struct task_struct *t = current; + int ret = 0; if (oss) { memset(oss, 0, sizeof(stack_t)); @@ -4029,19 +4047,24 @@ do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp, ss_mode != 0)) return -EINVAL; 
+ sigaltstack_lock(); if (ss_mode == SS_DISABLE) { ss_size = 0; ss_sp = NULL; } else { if (unlikely(ss_size < min_ss_size)) - return -ENOMEM; + ret = -ENOMEM; + if (!sigaltstack_size_valid(ss_size)) + ret = -ENOMEM; } - - t->sas_ss_sp = (unsigned long) ss_sp; - t->sas_ss_size = ss_size; - t->sas_ss_flags = ss_flags; + if (!ret) { + t->sas_ss_sp = (unsigned long) ss_sp; + t->sas_ss_size = ss_size; + t->sas_ss_flags = ss_flags; + } + sigaltstack_unlock(); } - return 0; + return ret; } SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) -- Gitee From 0e45dab8ad5b9be912551995af253bf139245a11 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Oct 2021 15:55:06 -0700 Subject: [PATCH 0794/2161] x86/signal: Implement sigaltstack size validation commit 3aac3ebea08f2d342364f827c8979ab0e1dd591e upstream. For historical reasons MINSIGSTKSZ is a constant which became already too small with AVX512 support. Add a mechanism to enforce strict checking of the sigaltstack size against the real size of the FPU frame. The strict check can be enabled via a config option and can also be controlled via the kernel command line option 'strict_sas_size' independent of the config switch. Enabling it might break existing applications which allocate a too small sigaltstack but 'work' because they never get a signal delivered. Though it can be handy to filter out binaries which are not yet aware of AT_MINSIGSTKSZ. Also the upcoming support for dynamically enabled FPU features requires a strict sanity check to ensure that: - Enabling of a dynamic feature, which changes the sigframe size fits into an enabled sigaltstack - Installing a too small sigaltstack after a dynamic feature has been added is not possible. Implement the base check which is controlled by config and command line options. Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-3-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../admin-guide/kernel-parameters.txt | 9 +++++ arch/x86/Kconfig | 17 ++++++++++ arch/x86/kernel/signal.c | 34 +++++++++++++++++++ 3 files changed, 60 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a31ade8726c9..66045c063f51 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4693,6 +4693,15 @@ qspinlock.numa_spinlock_threshold_ns= [NUMA, PV_OPS] stifb= [HW] Format: bpp:[:[:...]] + strict_sas_size= + [X86] + Format: + Enable or disable strict sigaltstack size checks + against the required signal frame size which + depends on the supported FPU features. This can + be used to filter out binaries which have + not yet been made aware of AT_MINSIGSTKSZ. + sunrpc.min_resvport= sunrpc.max_resvport= [NFS,SUNRPC] diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cd3525c44cc7..3a9dccb9c3d0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -102,6 +102,7 @@ config X86 select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CLOCKSOURCE_WATCHDOG select DCACHE_WORD_ACCESS + select DYNAMIC_SIGFRAME select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT select GENERIC_CLOCKEVENTS @@ -2516,6 +2517,22 @@ config MODIFY_LDT_SYSCALL Saying 'N' here may make sense for embedded or server kernels. 
+config STRICT_SIGALTSTACK_SIZE + bool "Enforce strict size checking for sigaltstack" + depends on DYNAMIC_SIGFRAME + help + For historical reasons MINSIGSTKSZ is a constant which became + already too small with AVX512 support. Add a mechanism to + enforce strict checking of the sigaltstack size against the + real size of the FPU frame. This option enables the check + by default. It can also be controlled via the kernel command + line option 'strict_sas_size' independent of this config + switch. Enabling it might break existing applications which + allocate a too small sigaltstack but 'work' because they + never get a signal delivered. + + Say 'N' unless you want to really enforce this check. + source "kernel/livepatch/Kconfig" endmenu diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d02d8ba89078..d64599f12518 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -38,6 +38,7 @@ #ifdef CONFIG_X86_64 #include #include +#include #endif /* CONFIG_X86_64 */ #include @@ -897,6 +898,39 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) force_sig(SIGSEGV); } +#ifdef CONFIG_DYNAMIC_SIGFRAME +#ifdef CONFIG_STRICT_SIGALTSTACK_SIZE +static bool strict_sigaltstack_size __ro_after_init = true; +#else +static bool strict_sigaltstack_size __ro_after_init = false; +#endif + +static int __init strict_sas_size(char *arg) +{ + return kstrtobool(arg, &strict_sigaltstack_size); +} +__setup("strict_sas_size", strict_sas_size); + +/* + * MINSIGSTKSZ is 2048 and can't be changed despite the fact that AVX512 + * exceeds that size already. As such programs might never use the + * sigaltstack they just continued to work. While always checking against + * the real size would be correct, this might be considered a regression. + * + * Therefore avoid the sanity check, unless enforced by kernel config or + * command line option. + */ +bool sigaltstack_size_valid(size_t ss_size) +{ + lockdep_assert_held(¤t->sighand->siglock); + + if (strict_sigaltstack_size) + return ss_size > get_sigframe_size(); + + return true; +} +#endif /* CONFIG_DYNAMIC_SIGFRAME */ + #ifdef CONFIG_X86_X32_ABI asmlinkage long sys32_x32_rt_sigreturn(void) { -- Gitee From 4c6aff995f773d7662fdfef7aa689f4389e73cb0 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:07 -0700 Subject: [PATCH 0795/2161] x86/fpu/xstate: Provide xstate_calculate_size() commit 84e4dccc8fce20b497388d756e12de5c9006eb48 upstream. Split out the size calculation from the paranoia check so it can be used for recalculating buffer sizes when dynamically enabled features are supported. Signed-off-by: Chang S. Bae [ tglx: Adopted to changed base code ] Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. 
Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-4-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 46 ++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 064769de856c..b3053f0fb173 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -546,6 +546,33 @@ static bool __init check_xstate_against_struct(int nr) return true; } +static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) +{ + unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + int i; + + for_each_extended_xfeature(i, xfeatures) { + /* Align from the end of the previous feature */ + if (xfeature_is_aligned(i)) + size = ALIGN(size, 64); + /* + * In compacted format the enabled features are packed, + * i.e. disabled features do not occupy space. + * + * In non-compacted format the offsets are fixed and + * disabled states still occupy space in the memory buffer. + */ + if (!compacted) + size = xfeature_uncompacted_offset(i); + /* + * Add the feature size even for non-compacted format + * to make the end result correct + */ + size += xfeature_size(i); + } + return size; +} + /* * This essentially double-checks what the cpu told us about * how large the XSAVE buffer needs to be. We are recalculating @@ -572,25 +599,8 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) XSTATE_WARN_ON(1); return false; } - - /* Align from the end of the previous feature */ - if (xfeature_is_aligned(i)) - size = ALIGN(size, 64); - /* - * In compacted format the enabled features are packed, - * i.e. disabled features do not occupy space. - * - * In non-compacted format the offsets are fixed and - * disabled states still occupy space in the memory buffer. - */ - if (!compacted) - size = xfeature_uncompacted_offset(i); - /* - * Add the feature size even for non-compacted format - * to make the end result correct - */ - size += xfeature_size(i); } + size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); XSTATE_WARN_ON(size != kernel_size); return size == kernel_size; } -- Gitee From 597c962ff0f46cad62e654afae60f9be02a70d41 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Oct 2021 15:55:08 -0700 Subject: [PATCH 0796/2161] x86/fpu: Add members to struct fpu to cache permission information commit 6f6a7c09c4065a5b140194dfcfe4cf7104fec4d2 upstream. Dynamically enabled features can be requested by any thread of a running process at any time. The request does neither enable the feature nor allocate larger buffers. It just stores the permission to use the feature by adding the features to the permission bitmap and by calculating the required sizes for kernel and user space. The reallocation of the kernel buffer happens when the feature is used for the first time which is caught by an exception. The permission bitmap is then checked and if the feature is permitted, then it becomes fully enabled. If not, the task dies similarly to a task which uses an undefined instruction. The size information is precomputed to allow proper sigaltstack size checks once the feature is permitted, but not yet in use because otherwise this would open race windows where too small stacks could be installed causing a later fail on signal delivery. Initialize them to the default feature set and sizes. Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. 
Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-5-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/types.h | 46 ++++++++++++++++++++++++++++++++ arch/x86/kernel/fpu/core.c | 5 ++++ 2 files changed, 51 insertions(+) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 63e9ca544334..e722788e87d9 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -332,6 +332,45 @@ struct fpstate { /* @regs is dynamically sized! Don't add anything after @regs! */ } __aligned(64); +struct fpu_state_perm { + /* + * @__state_perm: + * + * This bitmap indicates the permission for state components, which + * are available to a thread group. The permission prctl() sets the + * enabled state bits in thread_group_leader()->thread.fpu. + * + * All run time operations use the per thread information in the + * currently active fpu.fpstate which contains the xfeature masks + * and sizes for kernel and user space. + * + * This master permission field is only to be used when + * task.fpu.fpstate based checks fail to validate whether the task + * is allowed to expand it's xfeatures set which requires to + * allocate a larger sized fpstate buffer. + * + * Do not access this field directly. Use the provided helper + * function. Unlocked access is possible for quick checks. + */ + u64 __state_perm; + + /* + * @__state_size: + * + * The size required for @__state_perm. Only valid to access + * with sighand locked. + */ + unsigned int __state_size; + + /* + * @__user_state_size: + * + * The size required for @__state_perm user part. Only valid to + * access with sighand locked. + */ + unsigned int __user_state_size; +}; + /* * Highest level per task FPU state data structure that * contains the FPU register state plus various FPU @@ -375,6 +414,13 @@ struct fpu { */ struct fpstate *__task_fpstate; + /* + * @perm: + * + * Permission related information + */ + struct fpu_state_perm perm; + /* * @__fpstate: * diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 622c2ad0a510..97821db508cc 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -412,6 +412,11 @@ void fpstate_reset(struct fpu *fpu) /* Set the fpstate pointer to the default fpstate */ fpu->fpstate = &fpu->__fpstate; __fpstate_reset(fpu->fpstate); + + /* Initialize the permission related info in fpu */ + fpu->perm.__state_perm = fpu_kernel_cfg.default_features; + fpu->perm.__state_size = fpu_kernel_cfg.default_size; + fpu->perm.__user_state_size = fpu_user_cfg.default_size; } /* Clone current's FPU state on fork */ -- Gitee From dcca98608a1be7b65b8169bca00a6fca9e6d56eb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Oct 2021 15:55:09 -0700 Subject: [PATCH 0797/2161] x86/fpu: Add fpu_state_config::legacy_features commit c33f0a81a2cf3920465309ce683534751bb86485 upstream. The upcoming prctl() which is required to request the permission for a dynamically enabled feature will also provide an option to retrieve the supported features. If the CPU does not support XSAVE, the supported features would be 0 even when the CPU supports FP and SSE. Provide separate storage for the legacy feature set to avoid that and fill in the bits in the legacy init function. Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. 
Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-6-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/types.h | 7 +++++++ arch/x86/kernel/fpu/init.c | 9 ++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index e722788e87d9..785d3063fe11 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -483,6 +483,13 @@ struct fpu_state_config { * be requested by user space before usage. */ u64 default_features; + /* + * @legacy_features: + * + * Features which can be reported back to user space + * even without XSAVE support, i.e. legacy features FP + SSE + */ + u64 legacy_features; }; /* FPU state configuration information */ diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 7074154131e6..621f4b6cac4a 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -193,12 +193,15 @@ static void __init fpu__init_system_xstate_size_legacy(void) * Note that the size configuration might be overwritten later * during fpu__init_system_xstate(). */ - if (!cpu_feature_enabled(X86_FEATURE_FPU)) + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { size = sizeof(struct swregs_state); - else if (cpu_feature_enabled(X86_FEATURE_FXSR)) + } else if (cpu_feature_enabled(X86_FEATURE_FXSR)) { size = sizeof(struct fxregs_state); - else + fpu_user_cfg.legacy_features = XFEATURE_MASK_FPSSE; + } else { size = sizeof(struct fregs_state); + fpu_user_cfg.legacy_features = XFEATURE_MASK_FP; + } fpu_kernel_cfg.max_size = size; fpu_kernel_cfg.default_size = size; -- Gitee From e26c9e0e0320e6b1567ab4e9b8e99b8a23f0202f Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:10 -0700 Subject: [PATCH 0798/2161] x86/arch_prctl: Add controls for dynamic XSTATE components commit 1bdda24c4af64cd2d65dec5192ab624c5fee7ca0 upstream. Dynamically enabled XSTATE features are by default disabled for all processes. A process has to request permission to use such a feature. To support this implement a architecture specific prctl() with the options: - ARCH_GET_XCOMP_SUPP Copies the supported feature bitmap into the user space provided u64 storage. The pointer is handed in via arg2 - ARCH_GET_XCOMP_PERM Copies the process wide permitted feature bitmap into the user space provided u64 storage. The pointer is handed in via arg2 - ARCH_REQ_XCOMP_PERM Request permission for a feature set. A feature set can be mapped to a facility, e.g. AMX, and can require one or more XSTATE components to be enabled. The feature argument is the number of the highest XSTATE component which is required for a facility to work. The request argument is not a user supplied bitmap because that makes filtering harder (think seccomp) and even impossible because to support 32bit tasks the argument would have to be a pointer. The permission mechanism works this way: Task asks for permission for a facility and kernel checks whether that's supported. If supported it does: 1) Check whether permission has already been granted 2) Compute the size of the required kernel and user space buffer (sigframe) size. 3) Validate that no task has a sigaltstack installed which is smaller than the resulting sigframe size 4) Add the requested feature bit(s) to the permission bitmap of current->group_leader->fpu and store the sizes in the group leaders fpu struct as well. 
If that is successful then the feature is still not enabled for any of the tasks. The first usage of a related instruction will result in a #NM trap. The trap handler validates the permission bit of the tasks group leader and if permitted it installs a larger kernel buffer and transfers the permission and size info to the new fpstate container which makes all the FPU functions which require per task information aware of the extended feature set. [ tglx: Adopted to new base code, added missing serialization, massaged namings, comments and changelog ] Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-7-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 4 + arch/x86/include/asm/proto.h | 2 +- arch/x86/include/uapi/asm/prctl.h | 4 + arch/x86/kernel/fpu/xstate.c | 157 ++++++++++++++++++++++++++++++ arch/x86/kernel/fpu/xstate.h | 6 ++ arch/x86/kernel/process.c | 9 +- 6 files changed, 179 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index e6b418515c71..97d609920bba 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -139,4 +139,8 @@ static inline bool fpstate_is_confidential(struct fpu_guest *gfpu) return gfpu->fpstate->is_confidential; } +/* prctl */ +struct task_struct; +extern long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2); + #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 6e81788a30c1..7950b4ba72f4 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -36,6 +36,6 @@ void x86_report_nx(void); extern int reboot_force; long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long cpuid_enabled); + unsigned long arg2); #endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index 5a6aac9fa41f..754a07856817 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -10,6 +10,10 @@ #define ARCH_GET_CPUID 0x1011 #define ARCH_SET_CPUID 0x1012 +#define ARCH_GET_XCOMP_SUPP 0x1021 +#define ARCH_GET_XCOMP_PERM 0x1022 +#define ARCH_REQ_XCOMP_PERM 0x1023 + #define ARCH_MAP_VDSO_X32 0x2001 #define ARCH_MAP_VDSO_32 0x2002 #define ARCH_MAP_VDSO_64 0x2003 diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b3053f0fb173..9c24ca89ce5f 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -8,9 +8,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -18,6 +20,8 @@ #include #include +#include +#include #include "internal.h" #include "legacy.h" @@ -1325,6 +1329,159 @@ void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); #endif +#ifdef CONFIG_X86_64 +static int validate_sigaltstack(unsigned int usize) +{ + struct task_struct *thread, *leader = current->group_leader; + unsigned long framesize = get_sigframe_size(); + + lockdep_assert_held(¤t->sighand->siglock); + + /* get_sigframe_size() is based on fpu_user_cfg.max_size */ + framesize -= fpu_user_cfg.max_size; + framesize += usize; + for_each_thread(leader, thread) { + if (thread->sas_ss_size && thread->sas_ss_size < framesize) + return -ENOSPC; + } + return 0; +} + +static int 
__xstate_request_perm(u64 permitted, u64 requested) +{ + /* + * This deliberately does not exclude !XSAVES as we still might + * decide to optionally context switch XCR0 or talk the silicon + * vendors into extending XFD for the pre AMX states. + */ + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + struct fpu *fpu = ¤t->group_leader->thread.fpu; + unsigned int ksize, usize; + u64 mask; + int ret; + + /* Check whether fully enabled */ + if ((permitted & requested) == requested) + return 0; + + /* Calculate the resulting kernel state size */ + mask = permitted | requested; + ksize = xstate_calculate_size(mask, compacted); + + /* Calculate the resulting user state size */ + mask &= XFEATURE_MASK_USER_SUPPORTED; + usize = xstate_calculate_size(mask, false); + + ret = validate_sigaltstack(usize); + if (ret) + return ret; + + /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ + WRITE_ONCE(fpu->perm.__state_perm, requested); + /* Protected by sighand lock */ + fpu->perm.__state_size = ksize; + fpu->perm.__user_state_size = usize; + return ret; +} + +/* + * Permissions array to map facilities with more than one component + */ +static const u64 xstate_prctl_req[XFEATURE_MAX] = { + /* [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE, */ +}; + +static int xstate_request_perm(unsigned long idx) +{ + u64 permitted, requested; + int ret; + + if (idx >= XFEATURE_MAX) + return -EINVAL; + + /* + * Look up the facility mask which can require more than + * one xstate component. + */ + idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); + requested = xstate_prctl_req[idx]; + if (!requested) + return -EOPNOTSUPP; + + if ((fpu_user_cfg.max_features & requested) != requested) + return -EOPNOTSUPP; + + /* Lockless quick check */ + permitted = xstate_get_host_group_perm(); + if ((permitted & requested) == requested) + return 0; + + /* Protect against concurrent modifications */ + spin_lock_irq(¤t->sighand->siglock); + permitted = xstate_get_host_group_perm(); + ret = __xstate_request_perm(permitted, requested); + spin_unlock_irq(¤t->sighand->siglock); + return ret; +} +#else /* CONFIG_X86_64 */ +static inline int xstate_request_perm(unsigned long idx) +{ + return -EPERM; +} +#endif /* !CONFIG_X86_64 */ + +/** + * fpu_xstate_prctl - xstate permission operations + * @tsk: Redundant pointer to current + * @option: A subfunction of arch_prctl() + * @arg2: option argument + * Return: 0 if successful; otherwise, an error code + * + * Option arguments: + * + * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info + * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info + * ARCH_REQ_XCOMP_PERM: Facility number requested + * + * For facilities which require more than one XSTATE component, the request + * must be the highest state component number related to that facility, + * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and + * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). + */ +long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2) +{ + u64 __user *uptr = (u64 __user *)arg2; + u64 permitted, supported; + unsigned long idx = arg2; + + if (tsk != current) + return -EPERM; + + switch (option) { + case ARCH_GET_XCOMP_SUPP: + supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; + return put_user(supported, uptr); + + case ARCH_GET_XCOMP_PERM: + /* + * Lockless snapshot as it can also change right after the + * dropping the lock. 
+ */ + permitted = xstate_get_host_group_perm(); + permitted &= XFEATURE_MASK_USER_SUPPORTED; + return put_user(permitted, uptr); + + case ARCH_REQ_XCOMP_PERM: + if (!IS_ENABLED(CONFIG_X86_64)) + return -EOPNOTSUPP; + + return xstate_request_perm(idx); + + default: + return -EINVAL; + } +} + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index a1aa0bad2c9c..4ce1dc030f38 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -15,6 +15,12 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; } +static inline u64 xstate_get_host_group_perm(void) +{ + /* Pairs with WRITE_ONCE() in xstate_request_perm() */ + return READ_ONCE(current->group_leader->thread.fpu.perm.__state_perm); +} + enum xstate_copy_mode { XSTATE_COPY_FP, XSTATE_COPY_FX, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e239aa4f10ee..8b028f4a1cab 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -971,13 +972,17 @@ unsigned long get_wchan(struct task_struct *p) } long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long cpuid_enabled) + unsigned long arg2) { switch (option) { case ARCH_GET_CPUID: return get_cpuid_mode(); case ARCH_SET_CPUID: - return set_cpuid_mode(task, cpuid_enabled); + return set_cpuid_mode(task, arg2); + case ARCH_GET_XCOMP_SUPP: + case ARCH_GET_XCOMP_PERM: + case ARCH_REQ_XCOMP_PERM: + return fpu_xstate_prctl(task, option, arg2); } return -EINVAL; -- Gitee From 54d664b8b15a1f92a82551262a43157777417d5f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Oct 2021 15:55:11 -0700 Subject: [PATCH 0799/2161] x86/fpu: Add basic helpers for dynamically enabled features commit 23686ef25d4ae81bc12fe3994d1905191fcf71f8 upstream. To allow building up the infrastructure required to support dynamically enabled FPU features, add: - XFEATURES_MASK_DYNAMIC This constant will hold xfeatures which can be dynamically enabled. - fpu_state_size_dynamic() A static branch for 64-bit and a simple 'return false' for 32-bit. This helper allows to add dynamic-feature-specific changes to common code which is shared between 32-bit and 64-bit without #ifdeffery. Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. 
Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-8-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 21 +++++++++++++++++++++ arch/x86/kernel/fpu/core.c | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index ea2bb7f99e57..7cbbeb0e526c 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -43,6 +43,9 @@ #define XFEATURE_MASK_USER_RESTORE \ (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU) +/* Features which are dynamically enabled for a process on request */ +#define XFEATURE_MASK_USER_DYNAMIC 0ULL + /* All currently supported supervisor features */ #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID) @@ -97,4 +100,22 @@ int xfeature_size(int xfeature_nr); void xsaves(struct xregs_state *xsave, u64 mask); void xrstors(struct xregs_state *xsave, u64 mask); +#ifdef CONFIG_X86_64 +DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); +#endif + +#ifdef CONFIG_X86_64 +DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); + +static __always_inline __pure bool fpu_state_size_dynamic(void) +{ + return static_branch_unlikely(&__fpu_state_size_dynamic); +} +#else +static __always_inline __pure bool fpu_state_size_dynamic(void) +{ + return false; +} +#endif + #endif diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 97821db508cc..2e5a0b86e593 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -25,6 +25,10 @@ #define CREATE_TRACE_POINTS #include +#ifdef CONFIG_X86_64 +DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); +#endif + /* The FPU state configuration data for kernel and user space */ struct fpu_state_config fpu_kernel_cfg __ro_after_init; struct fpu_state_config fpu_user_cfg __ro_after_init; -- Gitee From f0a5162c4854512fffa06a0031f582c8346c52f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Oct 2021 15:55:12 -0700 Subject: [PATCH 0800/2161] x86/signal: Use fpu::__state_user_size for sigalt stack validation commit 4b7ca609a33dd8696bcbd2f1ad949e26a591592f upstream. Use the current->group_leader->fpu to check for pending permissions to use extended features and validate against the resulting user space size which is stored in the group leaders fpu struct as well. This prevents a task from installing a too small sized sigaltstack after permissions to use dynamically enabled features have been granted, but the task has not (yet) used a related instruction. Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-9-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/signal.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d64599f12518..dbf28eeea3cc 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -709,12 +710,15 @@ SYSCALL_DEFINE0(rt_sigreturn) /* max_frame_size tells userspace the worst case signal stack size. 
*/ static unsigned long __ro_after_init max_frame_size; +static unsigned int __ro_after_init fpu_default_state_size; void __init init_sigframe_size(void) { + fpu_default_state_size = fpu__get_fpstate_size(); + max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING; - max_frame_size += fpu__get_fpstate_size() + MAX_XSAVE_PADDING; + max_frame_size += fpu_default_state_size + MAX_XSAVE_PADDING; /* Userspace expects an aligned size. */ max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT); @@ -917,15 +921,38 @@ __setup("strict_sas_size", strict_sas_size); * sigaltstack they just continued to work. While always checking against * the real size would be correct, this might be considered a regression. * - * Therefore avoid the sanity check, unless enforced by kernel config or - * command line option. + * Therefore avoid the sanity check, unless enforced by kernel + * configuration or command line option. + * + * When dynamic FPU features are supported, the check is also enforced when + * the task has permissions to use dynamic features. Tasks which have no + * permission are checked against the size of the non-dynamic feature set + * if strict checking is enabled. This avoids forcing all tasks on the + * system to allocate large sigaltstacks even if they are never going + * to use a dynamic feature. As this is serialized via sighand::siglock + * any permission request for a dynamic feature either happened already + * or will see the newly install sigaltstack size in the permission checks. */ bool sigaltstack_size_valid(size_t ss_size) { + unsigned long fsize = max_frame_size - fpu_default_state_size; + u64 mask; + lockdep_assert_held(¤t->sighand->siglock); + if (!fpu_state_size_dynamic() && !strict_sigaltstack_size) + return true; + + fsize += current->group_leader->thread.fpu.perm.__user_state_size; + if (likely(ss_size > fsize)) + return true; + if (strict_sigaltstack_size) - return ss_size > get_sigframe_size(); + return ss_size > fsize; + + mask = current->group_leader->thread.fpu.perm.__state_perm; + if (mask & XFEATURE_MASK_USER_DYNAMIC) + return ss_size > fsize; return true; } -- Gitee From 29989959dc712665eb9d026584e158a6873103fc Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:13 -0700 Subject: [PATCH 0801/2161] x86/fpu/signal: Prepare for variable sigframe length commit 53599b4d54b9b8dda1d537a558946869d2acbddc upstream. The software reserved portion of the fxsave frame in the signal frame is copied from structures which have been set up at boot time. With dynamically enabled features the content of these structures is no longer correct because the xfeatures and size can be different per task. Calculate the software reserved portion at runtime and fill in the xfeatures and size values from the tasks active fpstate. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. 
Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-10-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/internal.h | 3 -- arch/x86/kernel/fpu/signal.c | 62 ++++++++++++++-------------------- arch/x86/kernel/fpu/xstate.c | 1 - 3 files changed, 26 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index e1d8a352f12d..dbdb31f55fc7 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -21,9 +21,6 @@ static __always_inline __pure bool use_fxsr(void) # define WARN_ON_FPU(x) ({ (void)(x); 0; }) #endif -/* Init functions */ -extern void fpu__init_prepare_fx_sw_frame(void); - /* Used in init.c */ extern void fpstate_init_user(struct fpstate *fpstate); extern void fpstate_reset(struct fpu *fpu); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index fcdd25fe9262..da0a6842399f 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -20,9 +20,6 @@ #include "legacy.h" #include "xstate.h" -static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init; -static struct _fpx_sw_bytes fx_sw_reserved_ia32 __ro_after_init; - /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. @@ -98,23 +95,42 @@ static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) return true; } +/* + * Prepare the SW reserved portion of the fxsave memory layout, indicating + * the presence of the extended state information in the memory layout + * pointed to by the fpstate pointer in the sigcontext. + * This is saved when ever the FP and extended state context is + * saved on the user stack during the signal handler delivery to the user. + */ +static inline void save_sw_bytes(struct _fpx_sw_bytes *sw_bytes, bool ia32_frame, + struct fpstate *fpstate) +{ + sw_bytes->magic1 = FP_XSTATE_MAGIC1; + sw_bytes->extended_size = fpstate->user_size + FP_XSTATE_MAGIC2_SIZE; + sw_bytes->xfeatures = fpstate->user_xfeatures; + sw_bytes->xstate_size = fpstate->user_size; + + if (ia32_frame) + sw_bytes->extended_size += sizeof(struct fregs_state); +} + static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, - unsigned int usize) + struct fpstate *fpstate) { struct xregs_state __user *x = buf; - struct _fpx_sw_bytes *sw_bytes; + struct _fpx_sw_bytes sw_bytes; u32 xfeatures; int err; /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ - sw_bytes = ia32_frame ? 
&fx_sw_reserved_ia32 : &fx_sw_reserved; - err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); + save_sw_bytes(&sw_bytes, ia32_frame, fpstate); + err = __copy_to_user(&x->i387.sw_reserved, &sw_bytes, sizeof(sw_bytes)); if (!use_xsave()) return !err; err |= __put_user(FP_XSTATE_MAGIC2, - (__u32 __user *)(buf + usize)); + (__u32 __user *)(buf + fpstate->user_size)); /* * Read the xfeatures which we copied (directly from the cpu or @@ -173,7 +189,7 @@ bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { struct task_struct *tsk = current; struct fpstate *fpstate = tsk->thread.fpu.fpstate; - int ia32_fxstate = (buf != buf_fx); + bool ia32_fxstate = (buf != buf_fx); int ret; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || @@ -226,8 +242,7 @@ bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf)) return false; - if (use_fxsr() && - !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate->user_size)) + if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate)) return false; return true; @@ -523,28 +538,3 @@ unsigned long __init fpu__get_fpstate_size(void) return ret; } -/* - * Prepare the SW reserved portion of the fxsave memory layout, indicating - * the presence of the extended state information in the memory layout - * pointed by the fpstate pointer in the sigcontext. - * This will be saved when ever the FP and extended state context is - * saved on the user stack during the signal handler delivery to the user. - */ -void __init fpu__init_prepare_fx_sw_frame(void) -{ - int size = fpu_user_cfg.default_size + FP_XSTATE_MAGIC2_SIZE; - - fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; - fx_sw_reserved.extended_size = size; - fx_sw_reserved.xfeatures = fpu_user_cfg.default_features; - fx_sw_reserved.xstate_size = fpu_user_cfg.default_size; - - if (IS_ENABLED(CONFIG_IA32_EMULATION) || - IS_ENABLED(CONFIG_X86_32)) { - int fsave_header_size = sizeof(struct fregs_state); - - fx_sw_reserved_ia32 = fx_sw_reserved; - fx_sw_reserved_ia32.extended_size = size + fsave_header_size; - } -} - diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9c24ca89ce5f..d9155925d7ae 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -828,7 +828,6 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) update_regset_xstate_info(fpu_user_cfg.max_size, fpu_user_cfg.max_features); - fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp_offsets(); setup_supervisor_only_offsets(); -- Gitee From 1f8b8eb435e61e2a3c5e1bf1d3693ac7f96bed8b Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 27 Dec 2021 14:49:56 +0800 Subject: [PATCH 0802/2161] x86/fpu: Prepare fpu_clone() for dynamically enabled features commit 9e798e9aa14c45fb94e47b30bf6347b369ce9df7 upstream. The default portion of the parent's FPU state is saved in a child task. With dynamic features enabled, the non-default portion is not saved in a child's fpstate because these register states are defined to be caller-saved. The new task's fpstate is therefore the default buffer. Fork inherits the permission of the parent. Also, do not use memcpy() when TIF_NEED_FPU_LOAD is set because it is invalid when the parent has dynamic features. Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. 
Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-11-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/sched.h | 2 +- arch/x86/kernel/fpu/core.c | 35 +++++++++++++++++++++++--------- arch/x86/kernel/process.c | 2 +- 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index cdb78d590c86..99a8820e8cc4 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -11,7 +11,7 @@ extern void save_fpregs_to_fpstate(struct fpu *fpu); extern void fpu__drop(struct fpu *fpu); -extern int fpu_clone(struct task_struct *dst); +extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags); extern void fpu_flush_thread(void); /* diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 2e5a0b86e593..9038a5b1d8dd 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -423,8 +423,20 @@ void fpstate_reset(struct fpu *fpu) fpu->perm.__user_state_size = fpu_user_cfg.default_size; } +static inline void fpu_inherit_perms(struct fpu *dst_fpu) +{ + if (fpu_state_size_dynamic()) { + struct fpu *src_fpu = ¤t->group_leader->thread.fpu; + + spin_lock_irq(¤t->sighand->siglock); + /* Fork also inherits the permissions of the parent */ + dst_fpu->perm = src_fpu->perm; + spin_unlock_irq(¤t->sighand->siglock); + } +} + /* Clone current's FPU state on fork */ -int fpu_clone(struct task_struct *dst) +int fpu_clone(struct task_struct *dst, unsigned long clone_flags) { struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; @@ -455,17 +467,20 @@ int fpu_clone(struct task_struct *dst) } /* - * If the FPU registers are not owned by current just memcpy() the - * state. Otherwise save the FPU registers directly into the - * child's FPU context, without any memory-to-memory copying. + * Save the default portion of the current FPU state into the + * clone. Assume all dynamic features to be defined as caller- + * saved, which enables skipping both the expansion of fpstate + * and the copying of any dynamic state. + * + * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because + * copying is not valid when current uses non-default states. */ fpregs_lock(); - if (test_thread_flag(TIF_NEED_FPU_LOAD)) { - memcpy(&dst_fpu->fpstate->regs, &src_fpu->fpstate->regs, - dst_fpu->fpstate->size); - } else { - save_fpregs_to_fpstate(dst_fpu); - } + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + fpregs_restore_userregs(); + save_fpregs_to_fpstate(dst_fpu); + if (!(clone_flags & CLONE_THREAD)) + fpu_inherit_perms(dst_fpu); fpregs_unlock(); trace_x86_fpu_copy_src(src_fpu); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 8b028f4a1cab..f36cf2c463ff 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -215,7 +215,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, frame->flags = X86_EFLAGS_FIXED; #endif - fpu_clone(p); + fpu_clone(p, clone_flags); /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { -- Gitee From 226da26ad8853b621df6dae16e97982992c39f32 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:15 -0700 Subject: [PATCH 0803/2161] x86/fpu: Reset permission and fpstate on exec() commit e61d6310a0f80cb986fd2076d432760b3619fb6d upstream. On exec(), extended register states saved in the buffer is cleared. With dynamic features, each task carries variables besides the register states. 
The struct fpu has permission information and struct fpstate contains buffer size and feature masks. They are all dynamically updated with dynamic features. Reset the current task's entire FPU data before an exec() so that the new task starts with default permission and fpstate. Rename the register state reset function because the old naming confuses as it does not reset struct fpstate. Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-12-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 9038a5b1d8dd..782bb4bd9117 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -544,7 +544,7 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) /* * Reset current->fpu memory state to the init values. */ -static void fpu_reset_fpstate(void) +static void fpu_reset_fpregs(void) { struct fpu *fpu = ¤t->thread.fpu; @@ -579,7 +579,7 @@ void fpu__clear_user_states(struct fpu *fpu) fpregs_lock(); if (!cpu_feature_enabled(X86_FEATURE_FPU)) { - fpu_reset_fpstate(); + fpu_reset_fpregs(); fpregs_unlock(); return; } @@ -609,7 +609,8 @@ void fpu__clear_user_states(struct fpu *fpu) void fpu_flush_thread(void) { - fpu_reset_fpstate(); + fpstate_reset(¤t->thread.fpu); + fpu_reset_fpregs(); } /* * Load FPU context before returning to userspace. -- Gitee From b4af8dd989fbe8b3a28eaa3f97e0d88437f7d576 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:16 -0700 Subject: [PATCH 0804/2161] x86/cpufeatures: Add eXtended Feature Disabling (XFD) feature bit commit c351101678ce54492b6e09810ec02efc0df036a9 upstream. Intel's eXtended Feature Disable (XFD) feature is an extension of the XSAVE architecture. XFD allows the kernel to enable a feature state in XCR0 and to receive a #NM trap when a task uses instructions accessing that state. This is going to be used to postpone the allocation of a larger XSTATE buffer for a task to the point where it is actually using a related instruction after the permission to use that facility has been granted. XFD is not used by the kernel, but only applied to userspace. This is a matter of policy as the kernel knows how a fpstate is reallocated and the XFD state. The compacted XSAVE format is adjustable for dynamic features. Make XFD depend on XSAVES. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. 
Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-13-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/cpuid-deps.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0ec98ca7e8d8..47d22c3ca6c9 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -272,6 +272,7 @@ #define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ #define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ +#define X86_FEATURE_XFD (10*32+ 4) /* "" eXtended Feature Disabling */ /* * Extended auxiliary flags: Linux defined - for features scattered in various diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 3a02707c1f4d..cb761b31e14a 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -70,6 +70,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, + { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, {} }; -- Gitee From 213e43aef0ae3516d0d91a64ca366d921342a3d3 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:17 -0700 Subject: [PATCH 0805/2161] x86/msr-index: Add MSRs for XFD commit dae1bd58389615d401a84aedc38fa075ef8f7de6 upstream. XFD introduces two MSRs: - IA32_XFD to enable/disable a feature controlled by XFD - IA32_XFD_ERR to expose to the #NM trap handler which feature was tried to be used for the first time. Both use the same xstate-component bitmap format, used by XCR0. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-14-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/msr-index.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index aba7a12c522b..cfd0cd50d07d 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -581,6 +581,8 @@ #define MSR_IA32_BNDCFGS_RSVD 0x00000ffc +#define MSR_IA32_XFD 0x000001c4 +#define MSR_IA32_XFD_ERR 0x000001c5 #define MSR_IA32_XSS 0x00000da0 #define FEATURE_CONTROL_LOCKED (1<<0) -- Gitee From 41c00b730b57f286697ed62bf0ab47072236837b Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:18 -0700 Subject: [PATCH 0806/2161] x86/fpu: Add XFD state to fpstate commit 8bf26758ca9659866b844dd51037314b4c0fa6bd upstream. Add storage for XFD register state to struct fpstate. This will be used to store the XFD MSR state. This will be used for switching the XFD MSR when FPU content is restored. Add a per-CPU variable to cache the current MSR value so the MSR has only to be written when the values are different. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. 
Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-15-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/types.h | 3 +++ arch/x86/kernel/fpu/core.c | 2 ++ arch/x86/kernel/fpu/xstate.h | 4 ++++ 3 files changed, 9 insertions(+) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 785d3063fe11..e77597865b49 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -302,6 +302,9 @@ struct fpstate { /* @user_xfeatures: xfeatures valid in UABI buffers */ u64 user_xfeatures; + /* @xfd: xfeatures disabled to trap userspace use. */ + u64 xfd; + /* @is_valloc: Indicator for dynamically allocated state */ unsigned int is_valloc : 1; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 782bb4bd9117..ca701bb5165f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -27,6 +27,7 @@ #ifdef CONFIG_X86_64 DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); +DEFINE_PER_CPU(u64, xfd_state); #endif /* The FPU state configuration data for kernel and user space */ @@ -409,6 +410,7 @@ static void __fpstate_reset(struct fpstate *fpstate) fpstate->user_size = fpu_user_cfg.default_size; fpstate->xfeatures = fpu_kernel_cfg.default_features; fpstate->user_xfeatures = fpu_user_cfg.default_features; + fpstate->xfd = init_fpstate.xfd; } void fpstate_reset(struct fpu *fpu) diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 4ce1dc030f38..32a4dee4de3b 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -5,6 +5,10 @@ #include #include +#ifdef CONFIG_X86_64 +DECLARE_PER_CPU(u64, xfd_state); +#endif + static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) { /* -- Gitee From 2c40bfa4215687db2ddbb38dd3c25f9e59026f03 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Oct 2021 15:55:19 -0700 Subject: [PATCH 0807/2161] x86/fpu: Add sanity checks for XFD commit 5529acf47ec31ece0815f69d43f5e6a1e485a0f3 upstream. Add debug functionality to ensure that the XFD MSR is up to date for XSAVE* and XRSTOR* operations. [ tglx: Improve comment. ] Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-16-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/core.c | 9 +++--- arch/x86/kernel/fpu/signal.c | 6 ++-- arch/x86/kernel/fpu/xstate.c | 58 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/fpu/xstate.h | 34 ++++++++++++++++++--- 4 files changed, 95 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index ca701bb5165f..f069557187f4 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -166,7 +166,7 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) */ mask = fpu_kernel_cfg.max_features & mask; - os_xrstor(&fpstate->regs.xsave, mask); + os_xrstor(fpstate, mask); } else { if (use_fxsr()) fxrstor(&fpstate->regs.fxsave); @@ -534,7 +534,7 @@ void fpu__drop(struct fpu *fpu) static inline void restore_fpregs_from_init_fpstate(u64 features_mask) { if (use_xsave()) - os_xrstor(&init_fpstate.regs.xsave, features_mask); + os_xrstor(&init_fpstate, features_mask); else if (use_fxsr()) fxrstor(&init_fpstate.regs.fxsave); else @@ -591,9 +591,8 @@ void fpu__clear_user_states(struct fpu *fpu) * corresponding registers. 
*/ if (xfeatures_mask_supervisor() && - !fpregs_state_valid(fpu, smp_processor_id())) { - os_xrstor(&fpu->fpstate->regs.xsave, xfeatures_mask_supervisor()); - } + !fpregs_state_valid(fpu, smp_processor_id())) + os_xrstor_supervisor(fpu->fpstate); /* Reset user states in registers. */ restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index da0a6842399f..7d7dfa36b1fc 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -261,7 +261,7 @@ static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, ret = fxrstor_from_user_sigframe(buf); if (!ret && unlikely(init_bv)) - os_xrstor(&init_fpstate.regs.xsave, init_bv); + os_xrstor(&init_fpstate, init_bv); return ret; } else if (use_fxsr()) { return fxrstor_from_user_sigframe(buf); @@ -322,7 +322,7 @@ static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, * been restored from a user buffer directly. */ if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor()) - os_xrstor(&fpu->fpstate->regs.xsave, xfeatures_mask_supervisor()); + os_xrstor_supervisor(fpu->fpstate); fpregs_mark_activate(); fpregs_unlock(); @@ -432,7 +432,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, u64 mask = user_xfeatures | xfeatures_mask_supervisor(); fpregs->xsave.header.xfeatures &= mask; - success = !os_xrstor_safe(&fpregs->xsave, + success = !os_xrstor_safe(fpu->fpstate, fpu_kernel_cfg.max_features); } else { success = !fxrstor_safe(&fpregs->fxsave); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d9155925d7ae..754609f6f670 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1329,6 +1329,64 @@ EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); #endif #ifdef CONFIG_X86_64 + +#ifdef CONFIG_X86_DEBUG_FPU +/* + * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask + * can safely operate on the @fpstate buffer. + */ +static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) +{ + u64 xfd = __this_cpu_read(xfd_state); + + if (fpstate->xfd == xfd) + return true; + + /* + * The XFD MSR does not match fpstate->xfd. That's invalid when + * the passed in fpstate is current's fpstate. + */ + if (fpstate->xfd == current->thread.fpu.fpstate->xfd) + return false; + + /* + * XRSTOR(S) from init_fpstate are always correct as it will just + * bring all components into init state and not read from the + * buffer. XSAVE(S) raises #PF after init. + */ + if (fpstate == &init_fpstate) + return rstor; + + /* + * XSAVE(S): clone(), fpu_swap_kvm_fpu() + * XRSTORS(S): fpu_swap_kvm_fpu() + */ + + /* + * No XSAVE/XRSTOR instructions (except XSAVE itself) touch + * the buffer area for XFD-disabled state components. + */ + mask &= ~xfd; + + /* + * Remove features which are valid in fpstate. They + * have space allocated in fpstate. + */ + mask &= ~fpstate->xfeatures; + + /* + * Any remaining state components in 'mask' might be written + * by XSAVE/XRSTOR. Fail validation it found. 
+ */ + return !mask; +} + +void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) +{ + WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor)); +} +#endif /* CONFIG_X86_DEBUG_FPU */ + static int validate_sigaltstack(unsigned int usize) { struct task_struct *thread, *leader = current->group_leader; diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 32a4dee4de3b..29024244965b 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -130,6 +130,12 @@ static inline u64 xfeatures_mask_independent(void) : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU) +extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor); +#else +static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { } +#endif + /* * Save processor xstate to xsave area. * @@ -144,6 +150,7 @@ static inline void os_xsave(struct fpstate *fpstate) int err; WARN_ON_FPU(!alternatives_patched); + xfd_validate_state(fpstate, mask, false); XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err); @@ -156,12 +163,23 @@ static inline void os_xsave(struct fpstate *fpstate) * * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. */ -static inline void os_xrstor(struct xregs_state *xstate, u64 mask) +static inline void os_xrstor(struct fpstate *fpstate, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + + xfd_validate_state(fpstate, mask, true); + XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); +} + +/* Restore of supervisor state. Does not require XFD */ +static inline void os_xrstor_supervisor(struct fpstate *fpstate) { + u64 mask = xfeatures_mask_supervisor(); u32 lmask = mask; u32 hmask = mask >> 32; - XSTATE_XRESTORE(xstate, lmask, hmask); + XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); } /* @@ -184,11 +202,14 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) * internally, e.g. PKRU. That's user space ABI and also required * to allow the signal handler to modify PKRU. */ - u64 mask = current->thread.fpu.fpstate->user_xfeatures; + struct fpstate *fpstate = current->thread.fpu.fpstate; + u64 mask = fpstate->user_xfeatures; u32 lmask = mask; u32 hmask = mask >> 32; int err; + xfd_validate_state(fpstate, mask, false); + stac(); XSTATE_OP(XSAVE, buf, lmask, hmask, err); clac(); @@ -206,6 +227,8 @@ static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 u32 hmask = mask >> 32; int err; + xfd_validate_state(current->thread.fpu.fpstate, mask, true); + stac(); XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); clac(); @@ -217,12 +240,15 @@ static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 * Restore xstate from kernel space xsave area, return an error code instead of * an exception. */ -static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) +static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask) { + struct xregs_state *xstate = &fpstate->regs.xsave; u32 lmask = mask; u32 hmask = mask >> 32; int err; + /* Must enforce XFD update here */ + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else -- Gitee From f1b1524c0ab278ee305d67570ad578949d523361 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:20 -0700 Subject: [PATCH 0808/2161] x86/fpu: Update XFD state where required commit 672365477ae8afca5a1cca98c1deb733235e4525 upstream. 
The IA32_XFD_MSR allows to arm #NM traps for XSTATE components which are enabled in XCR0. The register has to be restored before the tasks XSTATE is restored. The life time rules are the same as for FPU state. XFD is updated on return to userspace only when the FPU state of the task is not up to date in the registers. It's updated before the XRSTORS so that eventually enabled dynamic features are restored as well and not brought into init state. Also in signal handling for restoring FPU state from user space the correctness of the XFD state has to be ensured. Add it to CPU initialization and resume as well. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211021225527.10184-17-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/context.h | 2 ++ arch/x86/kernel/fpu/core.c | 28 +++++++++++++++++++++++++++- arch/x86/kernel/fpu/signal.c | 2 ++ arch/x86/kernel/fpu/xstate.c | 12 ++++++++++++ arch/x86/kernel/fpu/xstate.h | 19 ++++++++++++++++++- 5 files changed, 61 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h index a06ebf315d83..958accf2ccf0 100644 --- a/arch/x86/kernel/fpu/context.h +++ b/arch/x86/kernel/fpu/context.h @@ -69,6 +69,8 @@ static inline void fpregs_restore_userregs(void) * correct because it was either set in switch_to() or in * flush_thread(). So it is excluded because it might be * not up to date in current->thread.fpu.xsave state. + * + * XFD state is handled in restore_fpregs_from_fpstate(). */ restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index f069557187f4..fc28937cdc72 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -155,6 +155,23 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) } if (use_xsave()) { + /* + * Dynamically enabled features are enabled in XCR0, but + * usage requires also that the corresponding bits in XFD + * are cleared. If the bits are set then using a related + * instruction will raise #NM. This allows to do the + * allocation of the larger FPU buffer lazy from #NM or if + * the task has no permission to kill it which would happen + * via #UD if the feature is disabled in XCR0. + * + * XFD state is following the same life time rules as + * XSTATE and to restore state correctly XFD has to be + * updated before XRSTORS otherwise the component would + * stay in or go into init state even if the bits are set + * in fpstate::regs::xsave::xfeatures. + */ + xfd_update_state(fpstate); + /* * Restoring state always needs to modify all features * which are in @mask even if the current task cannot use @@ -241,8 +258,17 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) cur_fps = fpu->fpstate; - if (!cur_fps->is_confidential) + if (!cur_fps->is_confidential) { + /* Includes XFD update */ restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); + } else { + /* + * XSTATE is restored by firmware from encrypted + * memory. 
Make sure XFD state is correct while + * running with guest fpstate + */ + xfd_update_state(cur_fps); + } fpregs_mark_activate(); fpregs_unlock(); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 7d7dfa36b1fc..b0b06ad87f1a 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -282,6 +282,8 @@ static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, retry: fpregs_lock(); + /* Ensure that XFD is up to date */ + xfd_update_state(fpu->fpstate); pagefault_disable(); ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures, xrestore, fx_only); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 754609f6f670..73c8f0f6fc82 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -137,6 +137,15 @@ void fpu__init_cpu_xstate(void) cr4_set_bits(X86_CR4_OSXSAVE); + /* + * Must happen after CR4 setup and before xsetbv() to allow KVM + * lazy passthrough. Write independent of the dynamic state static + * key as that does not work on the boot CPU. This also ensures + * that any stale state is wiped out from XFD. + */ + if (cpu_feature_enabled(X86_FEATURE_XFD)) + wrmsrl(MSR_IA32_XFD, init_fpstate.xfd); + /* * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user @@ -873,6 +882,9 @@ void fpu__resume_cpu(void) wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } + + if (fpu_state_size_dynamic()) + wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd); } /* diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 29024244965b..e18210dff88c 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -136,6 +136,22 @@ extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor); static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { } #endif +#ifdef CONFIG_X86_64 +static inline void xfd_update_state(struct fpstate *fpstate) +{ + if (fpu_state_size_dynamic()) { + u64 xfd = fpstate->xfd; + + if (__this_cpu_read(xfd_state) != xfd) { + wrmsrl(MSR_IA32_XFD, xfd); + __this_cpu_write(xfd_state, xfd); + } + } +} +#else +static inline void xfd_update_state(struct fpstate *fpstate) { } +#endif + /* * Save processor xstate to xsave area. * @@ -247,7 +263,8 @@ static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask) u32 hmask = mask >> 32; int err; - /* Must enforce XFD update here */ + /* Ensure that XFD is up to date */ + xfd_update_state(fpstate); if (cpu_feature_enabled(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); -- Gitee From 82e61beebcab1f10359210194905a00f5ce0b34b Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 27 Dec 2021 16:21:18 +0800 Subject: [PATCH 0809/2161] x86/fpu/xstate: Add XFD #NM handler commit 783e87b404956f8958657aed8a6a72aa98d5b7e1 upstream. If the XFD MSR has feature bits set then #NM will be raised when user space attempts to use an instruction related to one of these features. When the task has no permissions to use that feature, raise SIGILL, which is the same behavior as #UD. If the task has permissions, calculate the new buffer size for the extended feature set and allocate a larger fpstate. In the unlikely case that vzalloc() fails, SIGSEGV is raised. The allocation function will be added in the next step. Provide a stub which fails for now. [ tglx: Updated serialization ] Signed-off-by: Chang S. 
Bae Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211021225527.10184-18-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 2 ++ arch/x86/kernel/fpu/xstate.c | 47 +++++++++++++++++++++++++++ arch/x86/kernel/traps.c | 53 +++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 7cbbeb0e526c..b9fedf7c2637 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -100,6 +100,8 @@ int xfeature_size(int xfeature_nr); void xsaves(struct xregs_state *xsave, u64 mask); void xrstors(struct xregs_state *xsave, u64 mask); +int xfd_enable_feature(u64 xfd_err); + #ifdef CONFIG_X86_64 DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); #endif diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 73c8f0f6fc82..51b670f9a969 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1492,6 +1492,53 @@ static int xstate_request_perm(unsigned long idx) spin_unlock_irq(¤t->sighand->siglock); return ret; } + +/* Place holder for now */ +static int fpstate_realloc(u64 xfeatures, unsigned int ksize, + unsigned int usize) +{ + return -ENOMEM; +} + +int xfd_enable_feature(u64 xfd_err) +{ + u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; + unsigned int ksize, usize; + struct fpu *fpu; + + if (!xfd_event) { + pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); + return 0; + } + + /* Protect against concurrent modifications */ + spin_lock_irq(¤t->sighand->siglock); + + /* If not permitted let it die */ + if ((xstate_get_host_group_perm() & xfd_event) != xfd_event) { + spin_unlock_irq(¤t->sighand->siglock); + return -EPERM; + } + + fpu = ¤t->group_leader->thread.fpu; + ksize = fpu->perm.__state_size; + usize = fpu->perm.__user_state_size; + /* + * The feature is permitted. State size is sufficient. Dropping + * the lock is safe here even if more features are added from + * another task, the retrieved buffer sizes are valid for the + * currently requested feature(s). + */ + spin_unlock_irq(¤t->sighand->siglock); + + /* + * Try to allocate a new fpstate. If that fails there is no way + * out. + */ + if (fpstate_realloc(xfd_event, ksize, usize)) + return -EFAULT; + return 0; +} #else /* CONFIG_X86_64 */ static inline int xstate_request_perm(unsigned long idx) { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 36efe7268a94..bd1065cef1e6 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -212,6 +212,21 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, } } +/* + * Posix requires to provide the address of the faulting instruction for + * SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t. + * + * This address is usually regs->ip, but when an uprobe moved the code out + * of line then regs->ip points to the XOL code which would confuse + * anything which analyzes the fault address vs. the unmodified binary. If + * a trap happened in XOL code then uprobe maps regs->ip back to the + * original instruction address. 
+ */ +static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs) +{ + return (void __user *)uprobe_get_trap_addr(regs); +} + #define IP ((void __user *)uprobe_get_trap_addr(regs)) #define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ @@ -806,11 +821,49 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) cond_local_irq_enable(regs); } +static bool handle_xfd_event(struct pt_regs *regs) +{ + u64 xfd_err; + int err; + + if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) + return false; + + rdmsrl(MSR_IA32_XFD_ERR, xfd_err); + if (!xfd_err) + return false; + + wrmsrl(MSR_IA32_XFD_ERR, 0); + + /* Die if that happens in kernel space */ + if (WARN_ON(!user_mode(regs))) + return false; + + local_irq_enable(); + + err = xfd_enable_feature(xfd_err); + + switch (err) { + case -EPERM: + force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs)); + break; + case -EFAULT: + force_sig(SIGSEGV); + break; + } + + local_irq_disable(); + return true; +} + dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { unsigned long cr0 = read_cr0(); + if (handle_xfd_event(regs)) + return; + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); #ifdef CONFIG_MATH_EMULATION -- Gitee From 0b1479c0b65d7852e1e93bc130e5fe09eed01065 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:22 -0700 Subject: [PATCH 0810/2161] x86/fpu/xstate: Add fpstate_realloc()/free() commit 500afbf645a040a39e1af0dba2fdf6ebf224bd47 upstream. The fpstate embedded in struct fpu is the default state for storing the FPU registers. It's sized so that the default supported features can be stored. For dynamically enabled features the register buffer is too small. The #NM handler detects first use of a feature which is disabled in the XFD MSR. After handling permission checks it recalculates the size for kernel space and user space state and invokes fpstate_realloc() which tries to reallocate fpstate and install it. Provide the allocator function which checks whether the current buffer size is sufficient and if not allocates one. If allocation is successful the new fpstate is initialized with the new features and sizes and the now enabled features is removed from the task's XFD mask. realloc_fpstate() uses vzalloc(). If use of this mechanism grows to re-allocate buffers larger than 64KB, a more sophisticated allocation scheme that includes purpose-built reclaim capability might be justified. Signed-off-by: Chang S. 
Bae Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211021225527.10184-19-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 7 +++ arch/x86/kernel/fpu/xstate.c | 97 +++++++++++++++++++++++++++++++--- arch/x86/kernel/process.c | 10 ++++ 3 files changed, 106 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 97d609920bba..4e71fd37ffd0 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -118,6 +118,13 @@ static inline void fpstate_init_soft(struct swregs_state *soft) {} /* State tracking */ DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); +/* Process cleanup */ +#ifdef CONFIG_X86_64 +extern void fpstate_free(struct fpu *fpu); +#else +static inline void fpstate_free(struct fpu *fpu) { } +#endif + /* fpstate-related functions which are exported to KVM */ extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 51b670f9a969..886f1dfeeca5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -23,6 +24,7 @@ #include #include +#include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" @@ -1399,6 +1401,91 @@ void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) } #endif /* CONFIG_X86_DEBUG_FPU */ +void fpstate_free(struct fpu *fpu) +{ + if (fpu->fpstate || fpu->fpstate != &fpu->__fpstate) + vfree(fpu->fpstate); +} + +/** + * fpu_install_fpstate - Update the active fpstate in the FPU + * + * @fpu: A struct fpu * pointer + * @newfps: A struct fpstate * pointer + * + * Returns: A null pointer if the last active fpstate is the embedded + * one or the new fpstate is already installed; + * otherwise, a pointer to the old fpstate which has to + * be freed by the caller. + */ +static struct fpstate *fpu_install_fpstate(struct fpu *fpu, + struct fpstate *newfps) +{ + struct fpstate *oldfps = fpu->fpstate; + + if (fpu->fpstate == newfps) + return NULL; + + fpu->fpstate = newfps; + return oldfps != &fpu->__fpstate ? oldfps : NULL; +} + +/** + * fpstate_realloc - Reallocate struct fpstate for the requested new features + * + * @xfeatures: A bitmap of xstate features which extend the enabled features + * of that task + * @ksize: The required size for the kernel buffer + * @usize: The required size for user space buffers + * + * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer + * terminates quickly, vfree()-induced IPIs may be a concern, but tasks + * with large states are likely to live longer. + * + * Returns: 0 on success, -ENOMEM on allocation error. + */ +static int fpstate_realloc(u64 xfeatures, unsigned int ksize, + unsigned int usize) +{ + struct fpu *fpu = ¤t->thread.fpu; + struct fpstate *curfps, *newfps = NULL; + unsigned int fpsize; + + curfps = fpu->fpstate; + fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); + + newfps = vzalloc(fpsize); + if (!newfps) + return -ENOMEM; + newfps->size = ksize; + newfps->user_size = usize; + newfps->is_valloc = true; + + fpregs_lock(); + /* + * Ensure that the current state is in the registers before + * swapping fpstate as that might invalidate it due to layout + * changes. 
+ */ + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + fpregs_restore_userregs(); + + newfps->xfeatures = curfps->xfeatures | xfeatures; + newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; + newfps->xfd = curfps->xfd & ~xfeatures; + + curfps = fpu_install_fpstate(fpu, newfps); + + /* Do the final updates within the locked region */ + xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); + xfd_update_state(newfps); + + fpregs_unlock(); + + vfree(curfps); + return 0; +} + static int validate_sigaltstack(unsigned int usize) { struct task_struct *thread, *leader = current->group_leader; @@ -1421,7 +1508,8 @@ static int __xstate_request_perm(u64 permitted, u64 requested) /* * This deliberately does not exclude !XSAVES as we still might * decide to optionally context switch XCR0 or talk the silicon - * vendors into extending XFD for the pre AMX states. + * vendors into extending XFD for the pre AMX states, especially + * AVX512. */ bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); struct fpu *fpu = ¤t->group_leader->thread.fpu; @@ -1493,13 +1581,6 @@ static int xstate_request_perm(unsigned long idx) return ret; } -/* Place holder for now */ -static int fpstate_realloc(u64 xfeatures, unsigned int ksize, - unsigned int usize) -{ - return -ENOMEM; -} - int xfd_enable_feature(u64 xfd_err) { u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f36cf2c463ff..4003f0188f8f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -103,9 +104,18 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) #endif /* Drop the copied pointer to current's fpstate */ dst->thread.fpu.fpstate = NULL; + return 0; } +#ifdef CONFIG_X86_64 +void arch_release_task_struct(struct task_struct *tsk) +{ + if (fpu_state_size_dynamic()) + fpstate_free(&tsk->thread.fpu); +} +#endif + /* * Free current thread data structures etc.. */ -- Gitee From 89ac37afc7fd553e326d4cfc49345815dc2904df Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:23 -0700 Subject: [PATCH 0811/2161] x86/fpu/xstate: Prepare XSAVE feature table for gaps in state component numbers commit 70c3f1671b0cbc386b387f1de33b7837e276a195 upstream. The kernel checks at boot time which features are available by walking a XSAVE feature table which contains the CPUID feature bit numbers which need to be checked whether a feature is available on a CPU or not. So far the feature numbers have been linear, but AMX will create a gap which the current code cannot handle. Make the table entries explicitly indexed and adjust the loop code accordingly to prepare for that. No functional change. Signed-off-by: Chang S. 
Bae Signed-off-by: Thomas Gleixner Reviewed-by: Len Brown Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211021225527.10184-20-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 886f1dfeeca5..70aadb7a5cea 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -54,18 +54,18 @@ static const char *xfeature_names[] = "unknown xstate feature" , }; -static short xsave_cpuid_features[] __initdata = { - X86_FEATURE_FPU, - X86_FEATURE_XMM, - X86_FEATURE_AVX, - X86_FEATURE_MPX, - X86_FEATURE_MPX, - X86_FEATURE_AVX512F, - X86_FEATURE_AVX512F, - X86_FEATURE_AVX512F, - X86_FEATURE_INTEL_PT, - X86_FEATURE_PKU, - X86_FEATURE_ENQCMD, +static unsigned short xsave_cpuid_features[] __initdata = { + [XFEATURE_FP] = X86_FEATURE_FPU, + [XFEATURE_SSE] = X86_FEATURE_XMM, + [XFEATURE_YMM] = X86_FEATURE_AVX, + [XFEATURE_BNDREGS] = X86_FEATURE_MPX, + [XFEATURE_BNDCSR] = X86_FEATURE_MPX, + [XFEATURE_OPMASK] = X86_FEATURE_AVX512F, + [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, + [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, + [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, + [XFEATURE_PKRU] = X86_FEATURE_PKU, + [XFEATURE_PASID] = X86_FEATURE_ENQCMD, }; static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = @@ -807,7 +807,10 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) * Clear XSAVE features that are disabled in the normal CPUID. */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { - if (!boot_cpu_has(xsave_cpuid_features[i])) + unsigned short cid = xsave_cpuid_features[i]; + + /* Careful: X86_FEATURE_FPU is 0! */ + if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid)) fpu_kernel_cfg.max_features &= ~BIT_ULL(i); } -- Gitee From 8a98b98a19c9d0a312627e421aa29b0ac6bbb1b9 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 27 Dec 2021 16:54:45 +0800 Subject: [PATCH 0812/2161] x86/fpu/amx: Define AMX state components and have it used for boot-time checks commit eec2113eabd92b7bfbaf1033fa82dc8eb4951203 upstream. The XSTATE initialization uses check_xstate_against_struct() to sanity check the size of XSTATE-enabled features. AMX is a XSAVE-enabled feature, and its size is not hard-coded but discoverable at run-time via CPUID. The AMX state is composed of state components 17 and 18, which are all user state components. The first component is the XTILECFG state of a 64-byte tile-related control register. The state component 18, called XTILEDATA, contains the actual tile data, and the state size varies on implementations. The architectural maximum, as defined in the CPUID(0x1d, 1): EAX[15:0], is a byte less than 64KB. The first implementation supports 8KB. Check the XTILEDATA state size dynamically. The feature introduces the new tile register, TMM. Define one register struct only and read the number of registers from CPUID. Cross-check the overall size with CPUID again. Signed-off-by: Chang S. 
Bae Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211021225527.10184-21-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/fpu/types.h | 32 ++++++++++++ arch/x86/include/asm/fpu/xstate.h | 2 + arch/x86/kernel/fpu/xstate.c | 80 +++++++++++++++++++++++++++++- 4 files changed, 114 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 47d22c3ca6c9..2b59e15ebfde 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -289,6 +289,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index e77597865b49..7e1954d5dd50 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -120,6 +120,9 @@ enum xfeature { XFEATURE_RSRVD_COMP_13, XFEATURE_RSRVD_COMP_14, XFEATURE_LBR, + XFEATURE_RSRVD_COMP_16, + XFEATURE_XTILE_CFG, + XFEATURE_XTILE_DATA, XFEATURE_MAX, }; @@ -136,12 +139,21 @@ enum xfeature { #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) #define XFEATURE_MASK_PASID (1 << XFEATURE_PASID) #define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) +#define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG) +#define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA) #define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) #define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \ | XFEATURE_MASK_ZMM_Hi256 \ | XFEATURE_MASK_Hi16_ZMM) +#ifdef CONFIG_X86_64 +# define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILE_DATA \ + | XFEATURE_MASK_XTILE_CFG) +#else +# define XFEATURE_MASK_XTILE (0) +#endif + #define FIRST_EXTENDED_XFEATURE XFEATURE_YMM struct reg_128_bit { @@ -153,6 +165,9 @@ struct reg_256_bit { struct reg_512_bit { u8 regbytes[512/8]; }; +struct reg_1024_byte { + u8 regbytes[1024]; +}; /* * State component 2: @@ -237,6 +252,23 @@ struct pkru_state { u32 pad; } __packed; +/* + * State component 17: 64-byte tile configuration register. + */ +struct xtile_cfg { + u64 tcfg[8]; +} __packed; + +/* + * State component 18: 1KB tile data register. + * Each register represents 16 64-byte rows of the matrix + * data. But the number of registers depends on the actual + * implementation. + */ +struct xtile_data { + struct reg_1024_byte tmm; +} __packed; + /* * State component 10 is supervisor state used for context-switching the * PASID state. 
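[Editor's note] The commit message above relies on the XTILEDATA geometry being enumerated by CPUID leaf 0x1d rather than hard-coded. As a purely illustrative aside (not part of the patch series), the user-space sketch below probes that leaf with the GCC/clang <cpuid.h> helpers; the TILE_CPUID_LEAF constant and the program itself are assumptions of this example, not kernel code.

/* amx_tile_probe.c - hedged sketch of CPUID(0x1d) tile enumeration, userspace only */
#include <stdio.h>
#include <cpuid.h>

#define TILE_CPUID_LEAF 0x1d    /* assumed: same leaf the patch names TILE_CPUID */

int main(void)
{
        unsigned int eax, ebx, ecx, edx;
        unsigned int max_palette, palid;

        /* Sub-leaf 0: EAX holds the highest numbered palette sub-leaf. */
        if (!__get_cpuid_count(TILE_CPUID_LEAF, 0, &max_palette, &ebx, &ecx, &edx))
                return 1;       /* leaf not supported on this CPU */

        for (palid = 1; palid <= max_palette; palid++) {
                __get_cpuid_count(TILE_CPUID_LEAF, palid, &eax, &ebx, &ecx, &edx);
                /* EAX[31:16]: bytes per tile, EBX[31:16]: number of tile registers */
                printf("palette %u: %u bytes per tile, %u tiles\n",
                       palid, eax >> 16, ebx >> 16);
        }
        return 0;
}

The kernel-side check added later in this patch, check_xtile_data_against_struct(), applies the same arithmetic (bytes per tile times the number of tiles) to cross-check the XTILEDATA state size reported by CPUID(0xd).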
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index b9fedf7c2637..b50fe07a5b1d 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -14,6 +14,8 @@ #define XSTATE_CPUID 0x0000000d +#define TILE_CPUID 0x0000001d + #define FXSAVE_SIZE 512 #define XSAVE_HDR_SIZE 64 diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 70aadb7a5cea..d666dee0550d 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -52,6 +52,14 @@ static const char *xfeature_names[] = "Protection Keys User registers", "PASID state", "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "AMX Tile config" , + "AMX Tile data" , + "unknown xstate feature" , }; static unsigned short xsave_cpuid_features[] __initdata = { @@ -66,6 +74,8 @@ static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, [XFEATURE_PKRU] = X86_FEATURE_PKU, [XFEATURE_PASID] = X86_FEATURE_ENQCMD, + [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, + [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, }; static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = @@ -241,6 +251,8 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); print_xstate_feature(XFEATURE_MASK_PKRU); print_xstate_feature(XFEATURE_MASK_PASID); + print_xstate_feature(XFEATURE_MASK_XTILE_CFG); + print_xstate_feature(XFEATURE_MASK_XTILE_DATA); } /* @@ -521,6 +533,67 @@ static void __init __xstate_dump_leaves(void) } \ } while (0) +/** + * check_xtile_data_against_struct - Check tile data state size. + * + * Calculate the state size by multiplying the single tile size which is + * recorded in a C struct, and the number of tiles that the CPU informs. + * Compare the provided size with the calculation. + * + * @size: The tile data state size + * + * Returns: 0 on success, -EINVAL on mismatch. + */ +static int __init check_xtile_data_against_struct(int size) +{ + u32 max_palid, palid, state_size; + u32 eax, ebx, ecx, edx; + u16 max_tile; + + /* + * Check the maximum palette id: + * eax: the highest numbered palette subleaf. + */ + cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx); + + /* + * Cross-check each tile size and find the maximum number of + * supported tiles. + */ + for (palid = 1, max_tile = 0; palid <= max_palid; palid++) { + u16 tile_size, max; + + /* + * Check the tile size info: + * eax[31:16]: bytes per title + * ebx[31:16]: the max names (or max number of tiles) + */ + cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx); + tile_size = eax >> 16; + max = ebx >> 16; + + if (tile_size != sizeof(struct xtile_data)) { + pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n", + __stringify(XFEATURE_XTILE_DATA), + sizeof(struct xtile_data), tile_size); + __xstate_dump_leaves(); + return -EINVAL; + } + + if (max > max_tile) + max_tile = max; + } + + state_size = sizeof(struct xtile_data) * max_tile; + if (size != state_size) { + pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n", + __stringify(XFEATURE_XTILE_DATA), state_size, size); + __xstate_dump_leaves(); + return -EINVAL; + } + return 0; +} + /* * We have a C struct for each 'xstate'. 
We need to ensure * that our software representation matches what the CPU @@ -544,6 +617,11 @@ static bool __init check_xstate_against_struct(int nr) XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state); + XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg); + + /* The tile data size varies between implementations. */ + if (nr == XFEATURE_XTILE_DATA) + check_xtile_data_against_struct(sz); /* * Make *SURE* to add any feature numbers in below if @@ -553,7 +631,7 @@ static bool __init check_xstate_against_struct(int nr) if ((nr < XFEATURE_YMM) || (nr >= XFEATURE_MAX) || (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || - ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_LBR))) { + ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) { WARN_ONCE(1, "no structure for xstate: %d\n", nr); XSTATE_WARN_ON(1); return false; -- Gitee From 2267b187cbdaf4dc08210b1c9233c76f7213eadc Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:25 -0700 Subject: [PATCH 0813/2161] x86/fpu: Calculate the default sizes independently commit 2ae996e0c1a38ca57a52438ab9deec6761dcba62 upstream. When dynamically enabled states are supported the maximum and default sizes for the kernel buffers and user space interfaces are not longer identical. Put the necessary calculations in place which only take the default enabled features into account. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211021225527.10184-22-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d666dee0550d..ddf4ed27c3ff 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -779,35 +779,40 @@ static bool __init is_supported_xstate_size(unsigned int test_xstate_size) static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ - unsigned int user_size, kernel_size; + unsigned int user_size, kernel_size, kernel_default_size; + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); /* Uncompacted user space size */ user_size = get_xsave_size_user(); /* * XSAVES kernel size includes supervisor states and - * uses compacted format. + * uses compacted format when available. * * XSAVE does not support supervisor states so * kernel and user size is identical. */ - if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + if (compacted) kernel_size = get_xsaves_size_no_independent(); else kernel_size = user_size; - /* Ensure we have the space to store all enabled features. */ - if (!is_supported_xstate_size(kernel_size)) + kernel_default_size = + xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); + + /* Ensure we have the space to store all default enabled features. 
*/ + if (!is_supported_xstate_size(kernel_default_size)) return -EINVAL; if (!paranoid_xstate_size_valid(kernel_size)) return -EINVAL; - /* Keep it the same for now */ fpu_kernel_cfg.max_size = kernel_size; - fpu_kernel_cfg.default_size = kernel_size; fpu_user_cfg.max_size = user_size; - fpu_user_cfg.default_size = user_size; + + fpu_kernel_cfg.default_size = kernel_default_size; + fpu_user_cfg.default_size = + xstate_calculate_size(fpu_user_cfg.default_features, false); return 0; } @@ -892,15 +897,21 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) fpu_kernel_cfg.max_features &= ~BIT_ULL(i); } + if (!cpu_feature_enabled(X86_FEATURE_XFD)) + fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED; fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; - /* Identical for now */ + /* Clean out dynamic features from default */ fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; + fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; + fpu_user_cfg.default_features = fpu_user_cfg.max_features; + fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; /* Store it for paranoia check at the end */ xfeatures = fpu_kernel_cfg.max_features; @@ -911,6 +922,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) if (err) goto out_disable; + /* Reset the state for the current task */ fpstate_reset(¤t->thread.fpu); /* -- Gitee From d4179450d73203f31594e1c1143a4eff375de3f6 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:26 -0700 Subject: [PATCH 0814/2161] x86/fpu: Add XFD handling for dynamic states commit db3e7321b4b84b1cb39598ff79b90d1252481378 upstream. To handle the dynamic sizing of buffers on first use the XFD MSR has to be armed. Store the delta between the maximum available and the default feature bits in init_fpstate where it can be retrieved for task creation. If the delta is non zero then dynamic features are enabled. This needs also to enable the static key which guards the XFD updates. This is delayed to an initcall because the FPU setup runs before jump labels are initialized. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211021225527.10184-23-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index ddf4ed27c3ff..4569528eff5b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -833,6 +833,12 @@ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) fpu_user_cfg.max_size = legacy_size; fpu_user_cfg.default_size = legacy_size; + /* + * Prevent enabling the static branch which enables writes to the + * XFD MSR. + */ + init_fpstate.xfd = 0; + fpstate_reset(¤t->thread.fpu); } @@ -916,6 +922,14 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) /* Store it for paranoia check at the end */ xfeatures = fpu_kernel_cfg.max_features; + /* + * Initialize the default XFD state in initfp_state and enable the + * dynamic sizing mechanism if dynamic states are available. The + * static key cannot be enabled here because this runs before + * jump_label_init(). This is delayed to an initcall. 
+ */ + init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; + /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); err = init_xstate_size(); @@ -1494,9 +1508,21 @@ void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) } #endif /* CONFIG_X86_DEBUG_FPU */ +static int __init xfd_update_static_branch(void) +{ + /* + * If init_fpstate.xfd has bits set then dynamic features are + * available and the dynamic sizing must be enabled. + */ + if (init_fpstate.xfd) + static_branch_enable(&__fpu_state_size_dynamic); + return 0; +} +arch_initcall(xfd_update_static_branch) + void fpstate_free(struct fpu *fpu) { - if (fpu->fpstate || fpu->fpstate != &fpu->__fpstate) + if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate) vfree(fpu->fpstate); } -- Gitee From cda6d90ddfa2ba5b326ecf31af6eb5dc7cae6fb9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Jun 2021 16:18:25 +0200 Subject: [PATCH 0815/2161] x86/fpu: Make init_fpstate correct with optimized XSAVE commit f9dfb5e390fab2df9f7944bb91e7705aba14cd26 upstream. The XSAVE init code initializes all enabled and supported components with XRSTOR(S) to init state. Then it XSAVEs the state of the components back into init_fpstate which is used in several places to fill in the init state of components. This works correctly with XSAVE, but not with XSAVEOPT and XSAVES because those use the init optimization and skip writing state of components which are in init state. So init_fpstate.xsave still contains all zeroes after this operation. There are two ways to solve that: 1) Use XSAVE unconditionally, but that requires to reshuffle the buffer when XSAVES is enabled because XSAVES uses compacted format. 2) Save the components which are known to have a non-zero init state by other means. Looking deeper, #2 is the right thing to do because all components the kernel supports have all-zeroes init state except the legacy features (FP, SSE). Those cannot be hard coded because the states are not identical on all CPUs, but they can be saved with FXSAVE which avoids all conditionals. Use FXSAVE to save the legacy FP/SSE components in init_fpstate along with a BUILD_BUG_ON() which reminds developers to validate that a newly added component has all zeroes init state. As a bonus remove the now unused copy_xregs_to_kernel_booting() crutch. The XSAVE and reshuffle method can still be implemented in the unlikely case that components are added which have a non-zero init state and no other means to save them. For now, FXSAVE is just simple and good enough. [ bp: Fix a typo or two in the text. 
] Fixes: 6bad06b76892 ("x86, xsave: Use xsaveopt in context-switch path when supported") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210618143444.587311343@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 4569528eff5b..551e76e1815f 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -404,14 +404,17 @@ static __init void os_xrstor_booting(struct xregs_state *xstate) XFEATURE_MASK_Hi16_ZMM | \ XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ - XFEATURE_MASK_BNDCSR) + XFEATURE_MASK_BNDCSR | \ + XFEATURE_MASK_PASID) /* * setup the xstate image representing the init state */ static void __init setup_init_fpu_buf(void) { - BUILD_BUG_ON(XFEATURE_MASK_USER_SUPPORTED != XFEATURES_INIT_FPSTATE_HANDLED); + BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED) != + XFEATURES_INIT_FPSTATE_HANDLED); if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; -- Gitee From a4085993de18a440bc62b06641ed597c5b1dfa93 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:27 -0700 Subject: [PATCH 0816/2161] x86/fpu/amx: Enable the AMX feature in 64-bit mode commit 2308ee57d93d896618dd65c996429c9d3e469fe0 upstream. Add the AMX state components in XFEATURE_MASK_USER_SUPPORTED and the TILE_DATA component to the dynamic states and update the permission check table accordingly. This is only effective on 64 bit kernels as for 32bit kernels XFEATURE_MASK_TILE is defined as 0. TILE_DATA is caller-saved state and the only dynamic state. Add build time sanity check to ensure the assumption that every dynamic feature is caller- saved. Make AMX state depend on XFD as it is dynamic feature. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211021225527.10184-24-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/xstate.h | 5 +++-- arch/x86/kernel/cpu/cpuid-deps.c | 1 + arch/x86/kernel/fpu/core.c | 6 ++++++ arch/x86/kernel/fpu/xstate.c | 5 +++-- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index b50fe07a5b1d..7b2e3a7905fa 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -35,7 +35,8 @@ XFEATURE_MASK_Hi16_ZMM | \ XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ - XFEATURE_MASK_BNDCSR) + XFEATURE_MASK_BNDCSR | \ + XFEATURE_MASK_XTILE) /* * Features which are restored when returning to user space. 
@@ -46,7 +47,7 @@ (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU) /* Features which are dynamically enabled for a process on request */ -#define XFEATURE_MASK_USER_DYNAMIC 0ULL +#define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA /* All currently supported supervisor features */ #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID) diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index cb761b31e14a..255b372d4b42 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -71,6 +71,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, + { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, {} }; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index fc28937cdc72..914c6e987029 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -494,6 +494,12 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags) return 0; } + /* + * If a new feature is added, ensure all dynamic features are + * caller-saved from here! + */ + BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA); + /* * Save the default portion of the current FPU state into the * clone. Assume all dynamic features to be defined as caller- diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 551e76e1815f..30a6704933b2 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -405,7 +405,8 @@ static __init void os_xrstor_booting(struct xregs_state *xstate) XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ - XFEATURE_MASK_PASID) + XFEATURE_MASK_PASID | \ + XFEATURE_MASK_XTILE) /* * setup the xstate image representing the init state @@ -1667,7 +1668,7 @@ static int __xstate_request_perm(u64 permitted, u64 requested) * Permissions array to map facilities with more than one component */ static const u64 xstate_prctl_req[XFEATURE_MAX] = { - /* [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE, */ + [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, }; static int xstate_request_perm(unsigned long idx) -- Gitee From dc3c459bf5616b5629b9974a59fbe7b5a5d46c8f Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 27 Dec 2021 17:31:03 +0800 Subject: [PATCH 0817/2161] selftests/x86/amx: Add test cases for AMX state management commit 6a3e0651b4a00daa314c59d6e4228dfa7a986983 upstream. AMX TILEDATA is a very large XSAVE feature. It could have caused nasty XSAVE buffer space waste in two places: * Signal stacks * Kernel task_struct->fpu buffers To avoid this waste, neither of these buffers have AMX state by default. The non-default features are called "dynamic" features. There is an arch_prctl(ARCH_REQ_XCOMP_PERM) which allows a task to declare that it wants to use AMX or other "dynamic" XSAVE features. This arch_prctl() ensures that sufficient sigaltstack space is available before it will succeed. It also expands the task_struct buffer. Functions of this test: * Test arch_prctl(ARCH_REQ_XCOMP_PERM). Ensure that it checks for proper sigaltstack sizing and that the sizing is enforced for future sigaltstack calls. * Ensure that ARCH_REQ_XCOMP_PERM is inherited across fork() * Ensure that TILEDATA use before the prctl() is fatal * Ensure that TILEDATA is cleared across fork() Note: Generally, compiler support is needed to do something with AMX. Instead, directly load AMX state from userspace with a plain XSAVE. 
Do not depend on the compiler. [ dhansen: bunches of cleanups ] Signed-off-by: Chang S. Bae Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211026122524.7BEDAA95@davehans-spike.ostc.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/x86/Makefile | 3 +- tools/testing/selftests/x86/amx.c | 697 +++++++++++++++++++++++++++ 2 files changed, 699 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/x86/amx.c diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index f21400d1582b..0e2e8f6ce2fb 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -17,7 +17,8 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer -TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering +TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering \ + amx # Some selftests require 32bit support enabled also on 64bit systems TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall diff --git a/tools/testing/selftests/x86/amx.c b/tools/testing/selftests/x86/amx.c new file mode 100644 index 000000000000..ce012ad15fa5 --- /dev/null +++ b/tools/testing/selftests/x86/amx.c @@ -0,0 +1,697 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#ifndef __x86_64__ +# error This test is 64-bit only +#endif + +#define XSAVE_HDR_OFFSET 512 +#define XSAVE_HDR_SIZE 64 + +struct xsave_buffer { + union { + struct { + char legacy[XSAVE_HDR_OFFSET]; + char header[XSAVE_HDR_SIZE]; + char extended[0]; + }; + char bytes[0]; + }; +}; + +static inline uint64_t xgetbv(uint32_t index) +{ + uint32_t eax, edx; + + asm volatile("xgetbv;" + : "=a" (eax), "=d" (edx) + : "c" (index)); + return eax + ((uint64_t)edx << 32); +} + +static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +{ + asm volatile("cpuid;" + : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +static inline void xsave(struct xsave_buffer *xbuf, uint64_t rfbm) +{ + uint32_t rfbm_lo = rfbm; + uint32_t rfbm_hi = rfbm >> 32; + + asm volatile("xsave (%%rdi)" + : : "D" (xbuf), "a" (rfbm_lo), "d" (rfbm_hi) + : "memory"); +} + +static inline void xrstor(struct xsave_buffer *xbuf, uint64_t rfbm) +{ + uint32_t rfbm_lo = rfbm; + uint32_t rfbm_hi = rfbm >> 32; + + asm volatile("xrstor (%%rdi)" + : : "D" (xbuf), "a" (rfbm_lo), "d" (rfbm_hi)); +} + +/* err() exits and will not return */ +#define fatal_error(msg, ...) 
err(1, "[FAIL]\t" msg, ##__VA_ARGS__) + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + fatal_error("sigaction"); +} + +static void clearhandler(int sig) +{ + struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + fatal_error("sigaction"); +} + +#define XFEATURE_XTILECFG 17 +#define XFEATURE_XTILEDATA 18 +#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG) +#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA) +#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA) + +#define CPUID_LEAF1_ECX_XSAVE_MASK (1 << 26) +#define CPUID_LEAF1_ECX_OSXSAVE_MASK (1 << 27) +static inline void check_cpuid_xsave(void) +{ + uint32_t eax, ebx, ecx, edx; + + /* + * CPUID.1:ECX.XSAVE[bit 26] enumerates general + * support for the XSAVE feature set, including + * XGETBV. + */ + eax = 1; + ecx = 0; + cpuid(&eax, &ebx, &ecx, &edx); + if (!(ecx & CPUID_LEAF1_ECX_XSAVE_MASK)) + fatal_error("cpuid: no CPU xsave support"); + if (!(ecx & CPUID_LEAF1_ECX_OSXSAVE_MASK)) + fatal_error("cpuid: no OS xsave support"); +} + +static uint32_t xbuf_size; + +static struct { + uint32_t xbuf_offset; + uint32_t size; +} xtiledata; + +#define CPUID_LEAF_XSTATE 0xd +#define CPUID_SUBLEAF_XSTATE_USER 0x0 +#define TILE_CPUID 0x1d +#define TILE_PALETTE_ID 0x1 + +static void check_cpuid_xtiledata(void) +{ + uint32_t eax, ebx, ecx, edx; + + eax = CPUID_LEAF_XSTATE; + ecx = CPUID_SUBLEAF_XSTATE_USER; + cpuid(&eax, &ebx, &ecx, &edx); + + /* + * EBX enumerates the size (in bytes) required by the XSAVE + * instruction for an XSAVE area containing all the user state + * components corresponding to bits currently set in XCR0. + * + * Stash that off so it can be used to allocate buffers later. + */ + xbuf_size = ebx; + + eax = CPUID_LEAF_XSTATE; + ecx = XFEATURE_XTILEDATA; + + cpuid(&eax, &ebx, &ecx, &edx); + /* + * eax: XTILEDATA state component size + * ebx: XTILEDATA state component offset in user buffer + */ + if (!eax || !ebx) + fatal_error("xstate cpuid: invalid tile data size/offset: %d/%d", + eax, ebx); + + xtiledata.size = eax; + xtiledata.xbuf_offset = ebx; +} + +/* The helpers for managing XSAVE buffer and tile states: */ + +struct xsave_buffer *alloc_xbuf(void) +{ + struct xsave_buffer *xbuf; + + /* XSAVE buffer should be 64B-aligned. */ + xbuf = aligned_alloc(64, xbuf_size); + if (!xbuf) + fatal_error("aligned_alloc()"); + return xbuf; +} + +static inline void clear_xstate_header(struct xsave_buffer *buffer) +{ + memset(&buffer->header, 0, sizeof(buffer->header)); +} + +static inline uint64_t get_xstatebv(struct xsave_buffer *buffer) +{ + /* XSTATE_BV is at the beginning of the header: */ + return *(uint64_t *)&buffer->header; +} + +static inline void set_xstatebv(struct xsave_buffer *buffer, uint64_t bv) +{ + /* XSTATE_BV is at the beginning of the header: */ + *(uint64_t *)(&buffer->header) = bv; +} + +static void set_rand_tiledata(struct xsave_buffer *xbuf) +{ + int *ptr = (int *)&xbuf->bytes[xtiledata.xbuf_offset]; + int data; + int i; + + /* + * Ensure that 'data' is never 0. This ensures that + * the registers are never in their initial configuration + * and thus never tracked as being in the init state. 
+ */ + data = rand() | 1; + + for (i = 0; i < xtiledata.size / sizeof(int); i++, ptr++) + *ptr = data; +} + +struct xsave_buffer *stashed_xsave; + +static void init_stashed_xsave(void) +{ + stashed_xsave = alloc_xbuf(); + if (!stashed_xsave) + fatal_error("failed to allocate stashed_xsave\n"); + clear_xstate_header(stashed_xsave); +} + +static void free_stashed_xsave(void) +{ + free(stashed_xsave); +} + +/* See 'struct _fpx_sw_bytes' at sigcontext.h */ +#define SW_BYTES_OFFSET 464 +/* N.B. The struct's field name varies so read from the offset. */ +#define SW_BYTES_BV_OFFSET (SW_BYTES_OFFSET + 8) + +static inline struct _fpx_sw_bytes *get_fpx_sw_bytes(void *buffer) +{ + return (struct _fpx_sw_bytes *)(buffer + SW_BYTES_OFFSET); +} + +static inline uint64_t get_fpx_sw_bytes_features(void *buffer) +{ + return *(uint64_t *)(buffer + SW_BYTES_BV_OFFSET); +} + +/* Work around printf() being unsafe in signals: */ +#define SIGNAL_BUF_LEN 1000 +char signal_message_buffer[SIGNAL_BUF_LEN]; +void sig_print(char *msg) +{ + int left = SIGNAL_BUF_LEN - strlen(signal_message_buffer) - 1; + + strncat(signal_message_buffer, msg, left); +} + +static volatile bool noperm_signaled; +static int noperm_errs; + +/* + * Signal handler for when AMX is used but + * permission has not been obtained. + */ +static void handle_noperm(int sig, siginfo_t *si, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t *)ctx_void; + void *xbuf = ctx->uc_mcontext.fpregs; + struct _fpx_sw_bytes *sw_bytes; + uint64_t features; + + /* Reset the signal message buffer: */ + signal_message_buffer[0] = '\0'; + sig_print("\tAt SIGILL handler,\n"); + + if (si->si_code != ILL_ILLOPC) { + noperm_errs++; + sig_print("[FAIL]\tInvalid signal code.\n"); + } else { + sig_print("[OK]\tValid signal code (ILL_ILLOPC).\n"); + } + + sw_bytes = get_fpx_sw_bytes(xbuf); + /* + * Without permission, the signal XSAVE buffer should not + * have room for AMX register state (aka. xtiledata). + * Check that the size does not overlap with where xtiledata + * will reside. + * + * This also implies that no state components *PAST* + * XTILEDATA (features >=19) can be present in the buffer. + */ + if (sw_bytes->xstate_size <= xtiledata.xbuf_offset) { + sig_print("[OK]\tValid xstate size\n"); + } else { + noperm_errs++; + sig_print("[FAIL]\tInvalid xstate size\n"); + } + + features = get_fpx_sw_bytes_features(xbuf); + /* + * Without permission, the XTILEDATA feature + * bit should not be set. + */ + if ((features & XFEATURE_MASK_XTILEDATA) == 0) { + sig_print("[OK]\tValid xstate mask\n"); + } else { + noperm_errs++; + sig_print("[FAIL]\tInvalid xstate mask\n"); + } + + noperm_signaled = true; + ctx->uc_mcontext.gregs[REG_RIP] += 3; /* Skip the faulting XRSTOR */ +} + +/* Return true if XRSTOR is successful; otherwise, false. */ +static inline bool xrstor_safe(struct xsave_buffer *xbuf, uint64_t mask) +{ + noperm_signaled = false; + xrstor(xbuf, mask); + + /* Print any messages produced by the signal code: */ + printf("%s", signal_message_buffer); + /* + * Reset the buffer to make sure any future printing + * only outputs new messages: + */ + signal_message_buffer[0] = '\0'; + + if (noperm_errs) + fatal_error("saw %d errors in noperm signal handler\n", noperm_errs); + + return !noperm_signaled; +} + +/* + * Use XRSTOR to populate the XTILEDATA registers with + * random data. + * + * Return true if successful; otherwise, false. 
+ */ +static inline bool load_rand_tiledata(struct xsave_buffer *xbuf) +{ + clear_xstate_header(xbuf); + set_xstatebv(xbuf, XFEATURE_MASK_XTILEDATA); + set_rand_tiledata(xbuf); + return xrstor_safe(xbuf, XFEATURE_MASK_XTILEDATA); +} + +/* Return XTILEDATA to its initial configuration. */ +static inline void init_xtiledata(void) +{ + clear_xstate_header(stashed_xsave); + xrstor_safe(stashed_xsave, XFEATURE_MASK_XTILEDATA); +} + +enum expected_result { FAIL_EXPECTED, SUCCESS_EXPECTED }; + +/* arch_prctl() and sigaltstack() test */ + +#define ARCH_GET_XCOMP_PERM 0x1022 +#define ARCH_REQ_XCOMP_PERM 0x1023 + +static void req_xtiledata_perm(void) +{ + syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA); +} + +static void validate_req_xcomp_perm(enum expected_result exp) +{ + unsigned long bitmask; + long rc; + + rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA); + if (exp == FAIL_EXPECTED) { + if (rc) { + printf("[OK]\tARCH_REQ_XCOMP_PERM saw expected failure..\n"); + return; + } + + fatal_error("ARCH_REQ_XCOMP_PERM saw unexpected success.\n"); + } else if (rc) { + fatal_error("ARCH_REQ_XCOMP_PERM saw unexpected failure.\n"); + } + + rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask); + if (rc) { + fatal_error("prctl(ARCH_GET_XCOMP_PERM) error: %ld", rc); + } else if (bitmask & XFEATURE_MASK_XTILE) { + printf("\tARCH_REQ_XCOMP_PERM is successful.\n"); + } +} + +static void validate_xcomp_perm(enum expected_result exp) +{ + bool load_success = load_rand_tiledata(stashed_xsave); + + if (exp == FAIL_EXPECTED) { + if (load_success) { + noperm_errs++; + printf("[FAIL]\tLoad tiledata succeeded.\n"); + } else { + printf("[OK]\tLoad tiledata failed.\n"); + } + } else if (exp == SUCCESS_EXPECTED) { + if (load_success) { + printf("[OK]\tLoad tiledata succeeded.\n"); + } else { + noperm_errs++; + printf("[FAIL]\tLoad tiledata failed.\n"); + } + } +} + +#ifndef AT_MINSIGSTKSZ +# define AT_MINSIGSTKSZ 51 +#endif + +static void *alloc_altstack(unsigned int size) +{ + void *altstack; + + altstack = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); + + if (altstack == MAP_FAILED) + fatal_error("mmap() for altstack"); + + return altstack; +} + +static void setup_altstack(void *addr, unsigned long size, enum expected_result exp) +{ + stack_t ss; + int rc; + + memset(&ss, 0, sizeof(ss)); + ss.ss_size = size; + ss.ss_sp = addr; + + rc = sigaltstack(&ss, NULL); + + if (exp == FAIL_EXPECTED) { + if (rc) { + printf("[OK]\tsigaltstack() failed.\n"); + } else { + fatal_error("sigaltstack() succeeded unexpectedly.\n"); + } + } else if (rc) { + fatal_error("sigaltstack()"); + } +} + +static void test_dynamic_sigaltstack(void) +{ + unsigned int small_size, enough_size; + unsigned long minsigstksz; + void *altstack; + + minsigstksz = getauxval(AT_MINSIGSTKSZ); + printf("\tAT_MINSIGSTKSZ = %lu\n", minsigstksz); + /* + * getauxval() itself can return 0 for failure or + * success. But, in this case, AT_MINSIGSTKSZ + * will always return a >=0 value if implemented. + * Just check for 0. + */ + if (minsigstksz == 0) { + printf("no support for AT_MINSIGSTKSZ, skipping sigaltstack tests\n"); + return; + } + + enough_size = minsigstksz * 2; + + altstack = alloc_altstack(enough_size); + printf("\tAllocate memory for altstack (%u bytes).\n", enough_size); + + /* + * Try setup_altstack() with a size which can not fit + * XTILEDATA. ARCH_REQ_XCOMP_PERM should fail. 
+ */ + small_size = minsigstksz - xtiledata.size; + printf("\tAfter sigaltstack() with small size (%u bytes).\n", small_size); + setup_altstack(altstack, small_size, SUCCESS_EXPECTED); + validate_req_xcomp_perm(FAIL_EXPECTED); + + /* + * Try setup_altstack() with a size derived from + * AT_MINSIGSTKSZ. It should be more than large enough + * and thus ARCH_REQ_XCOMP_PERM should succeed. + */ + printf("\tAfter sigaltstack() with enough size (%u bytes).\n", enough_size); + setup_altstack(altstack, enough_size, SUCCESS_EXPECTED); + validate_req_xcomp_perm(SUCCESS_EXPECTED); + + /* + * Try to coerce setup_altstack() to again accept a + * too-small altstack. This ensures that big-enough + * sigaltstacks can not shrink to a too-small value + * once XTILEDATA permission is established. + */ + printf("\tThen, sigaltstack() with small size (%u bytes).\n", small_size); + setup_altstack(altstack, small_size, FAIL_EXPECTED); +} + +static void test_dynamic_state(void) +{ + pid_t parent, child, grandchild; + + parent = fork(); + if (parent < 0) { + /* fork() failed */ + fatal_error("fork"); + } else if (parent > 0) { + int status; + /* fork() succeeded. Now in the parent. */ + + wait(&status); + if (!WIFEXITED(status) || WEXITSTATUS(status)) + fatal_error("arch_prctl test parent exit"); + return; + } + /* fork() succeeded. Now in the child . */ + + printf("[RUN]\tCheck ARCH_REQ_XCOMP_PERM around process fork() and sigaltack() test.\n"); + + printf("\tFork a child.\n"); + child = fork(); + if (child < 0) { + fatal_error("fork"); + } else if (child > 0) { + int status; + + wait(&status); + if (!WIFEXITED(status) || WEXITSTATUS(status)) + fatal_error("arch_prctl test child exit"); + _exit(0); + } + + /* + * The permission request should fail without an + * XTILEDATA-compatible signal stack + */ + printf("\tTest XCOMP_PERM at child.\n"); + validate_xcomp_perm(FAIL_EXPECTED); + + /* + * Set up an XTILEDATA-compatible signal stack and + * also obtain permission to populate XTILEDATA. + */ + printf("\tTest dynamic sigaltstack at child:\n"); + test_dynamic_sigaltstack(); + + /* Ensure that XTILEDATA can be populated. */ + printf("\tTest XCOMP_PERM again at child.\n"); + validate_xcomp_perm(SUCCESS_EXPECTED); + + printf("\tFork a grandchild.\n"); + grandchild = fork(); + if (grandchild < 0) { + /* fork() failed */ + fatal_error("fork"); + } else if (!grandchild) { + /* fork() succeeded. Now in the (grand)child. */ + printf("\tTest XCOMP_PERM at grandchild.\n"); + + /* + * Ensure that the grandchild inherited + * permission and a compatible sigaltstack: + */ + validate_xcomp_perm(SUCCESS_EXPECTED); + } else { + int status; + /* fork() succeeded. Now in the parent. */ + + wait(&status); + if (!WIFEXITED(status) || WEXITSTATUS(status)) + fatal_error("fork test grandchild"); + } + + _exit(0); +} + +/* + * Save current register state and compare it to @xbuf1.' + * + * Returns false if @xbuf1 matches the registers. + * Returns true if @xbuf1 differs from the registers. 
+ */ +static inline bool __validate_tiledata_regs(struct xsave_buffer *xbuf1) +{ + struct xsave_buffer *xbuf2; + int ret; + + xbuf2 = alloc_xbuf(); + if (!xbuf2) + fatal_error("failed to allocate XSAVE buffer\n"); + + xsave(xbuf2, XFEATURE_MASK_XTILEDATA); + ret = memcmp(&xbuf1->bytes[xtiledata.xbuf_offset], + &xbuf2->bytes[xtiledata.xbuf_offset], + xtiledata.size); + + free(xbuf2); + + if (ret == 0) + return false; + return true; +} + +static inline void validate_tiledata_regs_same(struct xsave_buffer *xbuf) +{ + int ret = __validate_tiledata_regs(xbuf); + + if (ret != 0) + fatal_error("TILEDATA registers changed"); +} + +static inline void validate_tiledata_regs_changed(struct xsave_buffer *xbuf) +{ + int ret = __validate_tiledata_regs(xbuf); + + if (ret == 0) + fatal_error("TILEDATA registers did not change"); +} + +/* tiledata inheritance test */ + +static void test_fork(void) +{ + pid_t child, grandchild; + + child = fork(); + if (child < 0) { + /* fork() failed */ + fatal_error("fork"); + } else if (child > 0) { + /* fork() succeeded. Now in the parent. */ + int status; + + wait(&status); + if (!WIFEXITED(status) || WEXITSTATUS(status)) + fatal_error("fork test child"); + return; + } + /* fork() succeeded. Now in the child. */ + printf("[RUN]\tCheck tile data inheritance.\n\tBefore fork(), load tiledata\n"); + + load_rand_tiledata(stashed_xsave); + + grandchild = fork(); + if (grandchild < 0) { + /* fork() failed */ + fatal_error("fork"); + } else if (grandchild > 0) { + /* fork() succeeded. Still in the first child. */ + int status; + + wait(&status); + if (!WIFEXITED(status) || WEXITSTATUS(status)) + fatal_error("fork test grand child"); + _exit(0); + } + /* fork() succeeded. Now in the (grand)child. */ + + /* + * TILEDATA registers are not preserved across fork(). + * Ensure that their value has changed: + */ + validate_tiledata_regs_changed(stashed_xsave); + + _exit(0); +} + +int main(void) +{ + /* Check hardware availability at first */ + check_cpuid_xsave(); + check_cpuid_xtiledata(); + + init_stashed_xsave(); + sethandler(SIGILL, handle_noperm, 0); + + test_dynamic_state(); + + /* Request permission for the following tests */ + req_xtiledata_perm(); + + test_fork(); + + clearhandler(SIGILL); + free_stashed_xsave(); + + return 0; +} -- Gitee From 05ccb6205cf1bc9fc20c4d380762383cc3a9b913 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 26 Oct 2021 05:25:25 -0700 Subject: [PATCH 0818/2161] selftests/x86/amx: Add context switch test commit 101c669d165d341b8c35424eb3878138044394ef upstream. XSAVE state is thread-local. The kernel switches between thread state at context switch time. Generally, running a selftest for a while will naturally expose it to some context switching and and will test the XSAVE code. Instead of just hoping that the tests get context-switched at random times, force context-switches on purpose. Spawn off a few userspace threads and force context-switches between them. Ensure that the kernel correctly context switches each thread's unique AMX state. [ dhansen: bunches of cleanups ] Signed-off-by: Chang S. 
Bae Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211026122525.6EFD5758@davehans-spike.ostc.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/x86/amx.c | 160 +++++++++++++++++++++++++++++- 1 file changed, 157 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/x86/amx.c b/tools/testing/selftests/x86/amx.c index ce012ad15fa5..3615ef4a48bb 100644 --- a/tools/testing/selftests/x86/amx.c +++ b/tools/testing/selftests/x86/amx.c @@ -3,6 +3,7 @@ #define _GNU_SOURCE #include #include +#include #include #include #include @@ -10,8 +11,6 @@ #include #include -#include - #include #include #include @@ -259,7 +258,6 @@ void sig_print(char *msg) static volatile bool noperm_signaled; static int noperm_errs; - /* * Signal handler for when AMX is used but * permission has not been obtained. @@ -674,6 +672,158 @@ static void test_fork(void) _exit(0); } +/* Context switching test */ + +static struct _ctxtswtest_cfg { + unsigned int iterations; + unsigned int num_threads; +} ctxtswtest_config; + +struct futex_info { + pthread_t thread; + int nr; + pthread_mutex_t mutex; + struct futex_info *next; +}; + +static void *check_tiledata(void *info) +{ + struct futex_info *finfo = (struct futex_info *)info; + struct xsave_buffer *xbuf; + int i; + + xbuf = alloc_xbuf(); + if (!xbuf) + fatal_error("unable to allocate XSAVE buffer"); + + /* + * Load random data into 'xbuf' and then restore + * it to the tile registers themselves. + */ + load_rand_tiledata(xbuf); + for (i = 0; i < ctxtswtest_config.iterations; i++) { + pthread_mutex_lock(&finfo->mutex); + + /* + * Ensure the register values have not + * diverged from those recorded in 'xbuf'. + */ + validate_tiledata_regs_same(xbuf); + + /* Load new, random values into xbuf and registers */ + load_rand_tiledata(xbuf); + + /* + * The last thread's last unlock will be for + * thread 0's mutex. However, thread 0 will + * have already exited the loop and the mutex + * will already be unlocked. + * + * Because this is not an ERRORCHECK mutex, + * that inconsistency will be silently ignored. + */ + pthread_mutex_unlock(&finfo->next->mutex); + } + + free(xbuf); + /* + * Return this thread's finfo, which is + * a unique value for this thread. + */ + return finfo; +} + +static int create_threads(int num, struct futex_info *finfo) +{ + int i; + + for (i = 0; i < num; i++) { + int next_nr; + + finfo[i].nr = i; + /* + * Thread 'i' will wait on this mutex to + * be unlocked. 
Lock it immediately after + * initialization: + */ + pthread_mutex_init(&finfo[i].mutex, NULL); + pthread_mutex_lock(&finfo[i].mutex); + + next_nr = (i + 1) % num; + finfo[i].next = &finfo[next_nr]; + + if (pthread_create(&finfo[i].thread, NULL, check_tiledata, &finfo[i])) + fatal_error("pthread_create()"); + } + return 0; +} + +static void affinitize_cpu0(void) +{ + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) + fatal_error("sched_setaffinity to CPU 0"); +} + +static void test_context_switch(void) +{ + struct futex_info *finfo; + int i; + + /* Affinitize to one CPU to force context switches */ + affinitize_cpu0(); + + req_xtiledata_perm(); + + printf("[RUN]\tCheck tiledata context switches, %d iterations, %d threads.\n", + ctxtswtest_config.iterations, + ctxtswtest_config.num_threads); + + + finfo = malloc(sizeof(*finfo) * ctxtswtest_config.num_threads); + if (!finfo) + fatal_error("malloc()"); + + create_threads(ctxtswtest_config.num_threads, finfo); + + /* + * This thread wakes up thread 0 + * Thread 0 will wake up 1 + * Thread 1 will wake up 2 + * ... + * the last thread will wake up 0 + * + * ... this will repeat for the configured + * number of iterations. + */ + pthread_mutex_unlock(&finfo[0].mutex); + + /* Wait for all the threads to finish: */ + for (i = 0; i < ctxtswtest_config.num_threads; i++) { + void *thread_retval; + int rc; + + rc = pthread_join(finfo[i].thread, &thread_retval); + + if (rc) + fatal_error("pthread_join() failed for thread %d err: %d\n", + i, rc); + + if (thread_retval != &finfo[i]) + fatal_error("unexpected thread retval for thread %d: %p\n", + i, thread_retval); + + } + + printf("[OK]\tNo incorrect case was found.\n"); + + free(finfo); +} + int main(void) { /* Check hardware availability at first */ @@ -690,6 +840,10 @@ int main(void) test_fork(); + ctxtswtest_config.iterations = 10; + ctxtswtest_config.num_threads = 5; + test_context_switch(); + clearhandler(SIGILL); free_stashed_xsave(); -- Gitee From edcedc09c2f6a1f73ab8676b11dd81f6d42f231a Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 25 Oct 2021 15:04:13 +1100 Subject: [PATCH 0819/2161] x86/fpu: Include vmalloc.h for vzalloc() commit 868c250bb4639531ff33b2d879fbef39c1d9ed39 upstream. Explicitly include that header to avoid build errors when vzalloc() becomes "invisible" to the compiler due to header reorganizations. This is not a problem in the tip tree but occurred when integrating linux-next. [ bp: Commit message. ] Link: https://lore.kernel.org/r/20211025151144.552c60ca@canb.auug.org.au Fixes: 69f6ed1d14c6 ("x86/fpu: Provide infrastructure for KVM FPU cleanup") Signed-off-by: Stephen Rothwell Signed-off-by: Borislav Petkov Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 914c6e987029..036facdadbc4 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -16,6 +16,7 @@ #include #include +#include #include "context.h" #include "internal.h" -- Gitee From c18746c6c6e7947880acafd32290eb219ee3e3b6 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 26 Oct 2021 02:11:57 -0700 Subject: [PATCH 0820/2161] Documentation/x86: Add documentation for using dynamic XSTATE features commit d7a9590f608dbedd917eb0857a074accdf0d3919 upstream. 
Explain how dynamic XSTATE features can be enabled via the architecture-specific prctl() along with dynamic sigframe size and first use trap handling. Fix: Documentation/x86/xstate.rst:15: WARNING: Title underline too short. as reported by Stephen Rothwell Originally-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211026091157.16711-1-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/x86/index.rst | 1 + Documentation/x86/xstate.rst | 65 ++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 Documentation/x86/xstate.rst diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst index 0b626ec2cb15..d86094a3c14e 100644 --- a/Documentation/x86/index.rst +++ b/Documentation/x86/index.rst @@ -34,3 +34,4 @@ x86-specific Documentation sgx sva elf_auxvec + xstate diff --git a/Documentation/x86/xstate.rst b/Documentation/x86/xstate.rst new file mode 100644 index 000000000000..65de3f054ba5 --- /dev/null +++ b/Documentation/x86/xstate.rst @@ -0,0 +1,65 @@ +Using XSTATE features in user space applications +================================================ + +The x86 architecture supports floating-point extensions which are +enumerated via CPUID. Applications consult CPUID and use XGETBV to +evaluate which features have been enabled by the kernel XCR0. + +Up to AVX-512 and PKRU states, these features are automatically enabled by +the kernel if available. Features like AMX TILE_DATA (XSTATE component 18) +are enabled by XCR0 as well, but the first use of related instruction is +trapped by the kernel because by default the required large XSTATE buffers +are not allocated automatically. + +Using dynamically enabled XSTATE features in user space applications +-------------------------------------------------------------------- + +The kernel provides an arch_prctl(2) based mechanism for applications to +request the usage of such features. The arch_prctl(2) options related to +this are: + +-ARCH_GET_XCOMP_SUPP + + arch_prctl(ARCH_GET_XCOMP_SUPP, &features); + + ARCH_GET_XCOMP_SUPP stores the supported features in userspace storage of + type uint64_t. The second argument is a pointer to that storage. + +-ARCH_GET_XCOMP_PERM + + arch_prctl(ARCH_GET_XCOMP_PERM, &features); + + ARCH_GET_XCOMP_PERM stores the features for which the userspace process + has permission in userspace storage of type uint64_t. The second argument + is a pointer to that storage. + +-ARCH_REQ_XCOMP_PERM + + arch_prctl(ARCH_REQ_XCOMP_PERM, feature_nr); + + ARCH_REQ_XCOMP_PERM allows to request permission for a dynamically enabled + feature or a feature set. A feature set can be mapped to a facility, e.g. + AMX, and can require one or more XSTATE components to be enabled. + + The feature argument is the number of the highest XSTATE component which + is required for a facility to work. + +When requesting permission for a feature, the kernel checks the +availability. The kernel ensures that sigaltstacks in the process's tasks +are large enough to accommodate the resulting large signal frame. It +enforces this both during ARCH_REQ_XCOMP_SUPP and during any subsequent +sigaltstack(2) calls. If an installed sigaltstack is smaller than the +resulting sigframe size, ARCH_REQ_XCOMP_SUPP results in -ENOSUPP. Also, +sigaltstack(2) results in -ENOMEM if the requested altstack is too small +for the permitted features. + +Permission, when granted, is valid per process. 
Permissions are inherited +on fork(2) and cleared on exec(3). + +The first use of an instruction related to a dynamically enabled feature is +trapped by the kernel. The trap handler checks whether the process has +permission to use the feature. If the process has no permission then the +kernel sends SIGILL to the application. If the process has permission then +the handler allocates a larger xstate buffer for the task so the large +state can be context switched. In the unlikely cases that the allocation +fails, the kernel sends SIGSEGV. -- Gitee From b6d5abf6d80bf19844a9d9cdf9f13bab1139698b Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 2 Nov 2021 15:47:50 -0700 Subject: [PATCH 0821/2161] x86/fpu: Optimize out sigframe xfeatures when in init state commit 30d02551ba4f681cfa605cedacf231b8641169f0 upstream. tl;dr: AMX state is ~8k. Signal frames can have space for this ~8k and each signal entry writes out all 8k even if it is zeros. Skip writing zeros for AMX to speed up signal delivery by about 4% overall when AMX is in its init state. This is a user-visible change to the sigframe ABI. == Hardware XSAVE Background == XSAVE state components may be tracked by the processor as being in their initial configuration. Software can detect which features are in this configuration by looking at the XSTATE_BV field in an XSAVE buffer or with the XGETBV(1) instruction. Both the XSAVE and XSAVEOPT instructions enumerate features s being in the initial configuration via the XSTATE_BV field in the XSAVE header, However, XSAVEOPT declines to actually write features in their initial configuration to the buffer. XSAVE writes the feature unconditionally, regardless of whether it is in the initial configuration or not. Basically, XSAVE users never need to inspect XSTATE_BV to determine if the feature has been written to the buffer. XSAVEOPT users *do* need to inspect XSTATE_BV. They might also need to clear out the buffer if they want to make an isolated change to the state, like modifying one register. == Software Signal / XSAVE Background == Signal frames have historically been written with XSAVE itself. Each state is written in its entirety, regardless of being in its initial configuration. In other words, the signal frame ABI uses the XSAVE behavior, not the XSAVEOPT behavior. == Problem == This means that any application which has acquired permission to use AMX via ARCH_REQ_XCOMP_PERM will write 8k of state to the signal frame. This 8k write will occur even when AMX was in its initial configuration and software *knows* this because of XSTATE_BV. This problem also exists to a lesser degree with AVX-512 and its 2k of state. However, AVX-512 use does not require ARCH_REQ_XCOMP_PERM and is more likely to have existing users which would be impacted by any change in behavior. == Solution == Stop writing out AMX xfeatures which are in their initial state to the signal frame. This effectively makes the signal frame XSAVE buffer look as if it were written with a combination of XSAVEOPT and XSAVE behavior. Userspace which handles XSAVEOPT- style buffers should be able to handle this naturally. For now, include only the AMX xfeatures: XTILE and XTILEDATA in this new behavior. These require new ABI to use anyway, which makes their users very unlikely to be broken. This XSAVEOPT-like behavior should be expected for all future dynamic xfeatures. It may also be extended to legacy features like AVX-512 in the future. Only attempt this optimization on systems with dynamic features. 
Disable dynamic feature support (XFD) if XGETBV1 is unavailable by adding a CPUID dependency. This has been measured to reduce the *overall* cycle cost of signal delivery by about 4%. Fixes: 2308ee57d93d ("x86/fpu/amx: Enable the AMX feature in 64-bit mode") Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Tested-by: "Chang S. Bae" Link: https://lore.kernel.org/r/20211102224750.FA412E26@davehans-spike.ostc.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/x86/xstate.rst | 9 ++++++++ arch/x86/include/asm/fpu/xcr.h | 12 ++++++++++ arch/x86/include/asm/fpu/xstate.h | 7 ++++++ arch/x86/kernel/cpu/cpuid-deps.c | 1 + arch/x86/kernel/fpu/xstate.h | 37 +++++++++++++++++++++++++++++-- 5 files changed, 64 insertions(+), 2 deletions(-) diff --git a/Documentation/x86/xstate.rst b/Documentation/x86/xstate.rst index 65de3f054ba5..5cec7fb558d6 100644 --- a/Documentation/x86/xstate.rst +++ b/Documentation/x86/xstate.rst @@ -63,3 +63,12 @@ kernel sends SIGILL to the application. If the process has permission then the handler allocates a larger xstate buffer for the task so the large state can be context switched. In the unlikely cases that the allocation fails, the kernel sends SIGSEGV. + +Dynamic features in signal frames +--------------------------------- + +Dynamcally enabled features are not written to the signal frame upon signal +entry if the feature is in its initial configuration. This differs from +non-dynamic features which are always written regardless of their +configuration. Signal handlers can examine the XSAVE buffer's XSTATE_BV +field to determine if a features was written. diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h index 79f95d3787e2..9656a5bc6fea 100644 --- a/arch/x86/include/asm/fpu/xcr.h +++ b/arch/x86/include/asm/fpu/xcr.h @@ -3,6 +3,7 @@ #define _ASM_X86_FPU_XCR_H #define XCR_XFEATURE_ENABLED_MASK 0x00000000 +#define XCR_XFEATURE_IN_USE_MASK 0x00000001 static inline u64 xgetbv(u32 index) { @@ -20,4 +21,15 @@ static inline void xsetbv(u32 index, u64 value) asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); } +/* + * Return a mask of xfeatures which are currently being tracked + * by the processor as being in the initial configuration. + * + * Callers should check X86_FEATURE_XGETBV1. + */ +static inline u64 xfeatures_in_use(void) +{ + return xgetbv(XCR_XFEATURE_IN_USE_MASK); +} + #endif /* _ASM_X86_FPU_XCR_H */ diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 7b2e3a7905fa..6c56b3eedd60 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -92,6 +92,13 @@ #define XFEATURE_MASK_FPSTATE (XFEATURE_MASK_USER_RESTORE | \ XFEATURE_MASK_SUPERVISOR_SUPPORTED) +/* + * Features in this mask have space allocated in the signal frame, but may not + * have that space initialized when the feature is in its init state. 
+ */ +#define XFEATURE_MASK_SIGFRAME_INITOPT (XFEATURE_MASK_XTILE | \ + XFEATURE_MASK_USER_DYNAMIC) + extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 255b372d4b42..0a160870bc4b 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -72,6 +72,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, + { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, {} }; diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index e18210dff88c..86ea7c0fa2f6 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -4,6 +4,7 @@ #include #include +#include #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u64, xfd_state); @@ -198,6 +199,32 @@ static inline void os_xrstor_supervisor(struct fpstate *fpstate) XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); } +/* + * XSAVE itself always writes all requested xfeatures. Removing features + * from the request bitmap reduces the features which are written. + * Generate a mask of features which must be written to a sigframe. The + * unset features can be optimized away and not written. + * + * This optimization is user-visible. Only use for states where + * uninitialized sigframe contents are tolerable, like dynamic features. + * + * Users of buffers produced with this optimization must check XSTATE_BV + * to determine which features have been optimized out. + */ +static inline u64 xfeatures_need_sigframe_write(void) +{ + u64 xfeaures_to_write; + + /* In-use features must be written: */ + xfeaures_to_write = xfeatures_in_use(); + + /* Also write all non-optimizable sigframe features: */ + xfeaures_to_write |= XFEATURE_MASK_USER_SUPPORTED & + ~XFEATURE_MASK_SIGFRAME_INITOPT; + + return xfeaures_to_write; +} + /* * Save xstate to user space xsave area. * @@ -220,10 +247,16 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) */ struct fpstate *fpstate = current->thread.fpu.fpstate; u64 mask = fpstate->user_xfeatures; - u32 lmask = mask; - u32 hmask = mask >> 32; + u32 lmask; + u32 hmask; int err; + /* Optimize away writing unnecessary xfeatures: */ + if (fpu_state_size_dynamic()) + mask &= xfeatures_need_sigframe_write(); + + lmask = mask; + hmask = mask >> 32; xfd_validate_state(fpstate, mask, false); stac(); -- Gitee From f3ffa52c25a54a6e8bd9756419ab7ba181b07b05 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 26 Nov 2021 13:47:46 +0100 Subject: [PATCH 0822/2161] x86/fpu/signal: Initialize sw_bytes in save_xstate_epilog() commit 52d0b8b18776f184c53632c5e0068201491cdb61 upstream. save_sw_bytes() did not fully initialize sw_bytes, which caused KMSAN to report an infoleak (see below). Initialize sw_bytes explicitly to avoid this.
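The pattern being fixed here generalizes beyond this one call site, so a standalone sketch may help; the structure and function names below are invented for illustration and are not the signal.c code. Copying a stack structure whose trailing fields were never written hands stale stack bytes to userspace, while an empty-brace initializer zeroes the whole object before the partial writes:

/* Simplified illustration of the infoleak pattern and the fix. */
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/uaccess.h>

struct demo_sw_bytes {
    u32 magic;          /* written below */
    u32 extended_size;  /* written below */
    u32 reserved[5];    /* never written */
};

static int demo_leaky(void __user *ubuf)
{
    struct demo_sw_bytes sw;        /* reserved[] keeps old stack data */

    sw.magic = 1;
    sw.extended_size = sizeof(sw);
    /* copies 20 bytes of uninitialized stack to userspace */
    return copy_to_user(ubuf, &sw, sizeof(sw)) ? -EFAULT : 0;
}

static int demo_fixed(void __user *ubuf)
{
    struct demo_sw_bytes sw = {};   /* every byte zeroed up front */

    sw.magic = 1;
    sw.extended_size = sizeof(sw);
    return copy_to_user(ubuf, &sw, sizeof(sw)) ? -EFAULT : 0;
}

A memset() or designated initializers would work just as well; "= {}" is simply the most compact way to guarantee that every byte handed to copy_to_user() is defined, which is what the one-line change below does for sw_bytes.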
KMSAN report follows: ===================================================== BUG: KMSAN: kernel-infoleak in instrument_copy_to_user ./include/linux/instrumented.h:121 BUG: KMSAN: kernel-infoleak in __copy_to_user ./include/linux/uaccess.h:154 BUG: KMSAN: kernel-infoleak in save_xstate_epilog+0x2df/0x510 arch/x86/kernel/fpu/signal.c:127 instrument_copy_to_user ./include/linux/instrumented.h:121 __copy_to_user ./include/linux/uaccess.h:154 save_xstate_epilog+0x2df/0x510 arch/x86/kernel/fpu/signal.c:127 copy_fpstate_to_sigframe+0x861/0xb60 arch/x86/kernel/fpu/signal.c:245 get_sigframe+0x656/0x7e0 arch/x86/kernel/signal.c:296 __setup_rt_frame+0x14d/0x2a60 arch/x86/kernel/signal.c:471 setup_rt_frame arch/x86/kernel/signal.c:781 handle_signal arch/x86/kernel/signal.c:825 arch_do_signal_or_restart+0x417/0xdd0 arch/x86/kernel/signal.c:870 handle_signal_work kernel/entry/common.c:149 exit_to_user_mode_loop+0x1f6/0x490 kernel/entry/common.c:173 exit_to_user_mode_prepare kernel/entry/common.c:208 __syscall_exit_to_user_mode_work kernel/entry/common.c:290 syscall_exit_to_user_mode+0x7e/0xc0 kernel/entry/common.c:302 do_syscall_64+0x60/0xd0 arch/x86/entry/common.c:88 entry_SYSCALL_64_after_hwframe+0x44/0xae ??:? Local variable sw_bytes created at: save_xstate_epilog+0x80/0x510 arch/x86/kernel/fpu/signal.c:121 copy_fpstate_to_sigframe+0x861/0xb60 arch/x86/kernel/fpu/signal.c:245 Bytes 20-47 of 48 are uninitialized Memory access of size 48 starts at ffff8880801d3a18 Data copied to user address 00007ffd90e2ef50 ===================================================== Link: https://lore.kernel.org/all/CAG_fn=V9T6OKPonSjsi9PmWB0hMHFC=yawozdft8i1-MSxrv=w@mail.gmail.com/ Fixes: 53599b4d54b9b8dd ("x86/fpu/signal: Prepare for variable sigframe length") Reported-by: Alexander Potapenko Signed-off-by: Marco Elver Signed-off-by: Alexander Potapenko Signed-off-by: Dave Hansen Tested-by: Alexander Potapenko Link: https://lkml.kernel.org/r/20211126124746.761278-1-glider@google.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index b0b06ad87f1a..8fd5f6a94828 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -118,7 +118,7 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, struct fpstate *fpstate) { struct xregs_state __user *x = buf; - struct _fpx_sw_bytes sw_bytes; + struct _fpx_sw_bytes sw_bytes = {}; u32 xfeatures; int err; -- Gitee From 48812455f5ba585d2dfd07f7f3aeb860cfaeadd0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 17 Apr 2020 10:37:23 -0700 Subject: [PATCH 0823/2161] ntb: intel: Add Icelake (gen4) support for Intel NTB commit 26bfe3d0b227ab6d38692640b44ce48f2d857602 upstream. Adding 4th generation Intel NTB support bits. There are a lot of common parts that the gen4 NTB has with gen3 NTB on Skylake. The commonalities are reused in gen4 Icelake NTB. 
Signed-off-by: Dave Jiang Signed-off-by: Jon Mason Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/ntb/hw/intel/Makefile | 2 +- drivers/ntb/hw/intel/ntb_hw_gen1.c | 45 +-- drivers/ntb/hw/intel/ntb_hw_gen3.c | 13 +- drivers/ntb/hw/intel/ntb_hw_gen3.h | 8 + drivers/ntb/hw/intel/ntb_hw_gen4.c | 500 ++++++++++++++++++++++++++++ drivers/ntb/hw/intel/ntb_hw_gen4.h | 87 +++++ drivers/ntb/hw/intel/ntb_hw_intel.h | 12 + 7 files changed, 640 insertions(+), 27 deletions(-) create mode 100644 drivers/ntb/hw/intel/ntb_hw_gen4.c create mode 100644 drivers/ntb/hw/intel/ntb_hw_gen4.h diff --git a/drivers/ntb/hw/intel/Makefile b/drivers/ntb/hw/intel/Makefile index 60ec8a773eea..f80da0ba15b2 100644 --- a/drivers/ntb/hw/intel/Makefile +++ b/drivers/ntb/hw/intel/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_NTB_INTEL) += ntb_hw_intel.o -ntb_hw_intel-y := ntb_hw_gen1.o ntb_hw_gen3.o +ntb_hw_intel-y := ntb_hw_gen1.o ntb_hw_gen3.o ntb_hw_gen4.o diff --git a/drivers/ntb/hw/intel/ntb_hw_gen1.c b/drivers/ntb/hw/intel/ntb_hw_gen1.c index bb57ec239029..cae70e7fe5bd 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen1.c +++ b/drivers/ntb/hw/intel/ntb_hw_gen1.c @@ -60,6 +60,7 @@ #include "ntb_hw_intel.h" #include "ntb_hw_gen1.h" #include "ntb_hw_gen3.h" +#include "ntb_hw_gen4.h" #define NTB_NAME "ntb_hw_intel" #define NTB_DESC "Intel(R) PCI-E Non-Transparent Bridge Driver" @@ -762,6 +763,8 @@ static ssize_t ndev_debugfs_read(struct file *filp, char __user *ubuf, return ndev_ntb_debugfs_read(filp, ubuf, count, offp); else if (pdev_is_gen3(ndev->ntb.pdev)) return ndev_ntb3_debugfs_read(filp, ubuf, count, offp); + else if (pdev_is_gen4(ndev->ntb.pdev)) + return ndev_ntb4_debugfs_read(filp, ubuf, count, offp); return -ENXIO; } @@ -1858,16 +1861,15 @@ static int intel_ntb_pci_probe(struct pci_dev *pdev, int rc, node; node = dev_to_node(&pdev->dev); + ndev = kzalloc_node(sizeof(*ndev), GFP_KERNEL, node); + if (!ndev) { + rc = -ENOMEM; + goto err_ndev; + } - if (pdev_is_gen1(pdev)) { - ndev = kzalloc_node(sizeof(*ndev), GFP_KERNEL, node); - if (!ndev) { - rc = -ENOMEM; - goto err_ndev; - } - - ndev_init_struct(ndev, pdev); + ndev_init_struct(ndev, pdev); + if (pdev_is_gen1(pdev)) { rc = intel_ntb_init_pci(ndev, pdev); if (rc) goto err_init_pci; @@ -1875,17 +1877,8 @@ static int intel_ntb_pci_probe(struct pci_dev *pdev, rc = xeon_init_dev(ndev); if (rc) goto err_init_dev; - } else if (pdev_is_gen3(pdev)) { - ndev = kzalloc_node(sizeof(*ndev), GFP_KERNEL, node); - if (!ndev) { - rc = -ENOMEM; - goto err_ndev; - } - - ndev_init_struct(ndev, pdev); ndev->ntb.ops = &intel_ntb3_ops; - rc = intel_ntb_init_pci(ndev, pdev); if (rc) goto err_init_pci; @@ -1893,7 +1886,15 @@ static int intel_ntb_pci_probe(struct pci_dev *pdev, rc = gen3_init_dev(ndev); if (rc) goto err_init_dev; + } else if (pdev_is_gen4(pdev)) { + ndev->ntb.ops = &intel_ntb4_ops; + rc = intel_ntb_init_pci(ndev, pdev); + if (rc) + goto err_init_pci; + rc = gen4_init_dev(ndev); + if (rc) + goto err_init_dev; } else { rc = -EINVAL; goto err_ndev; @@ -1915,7 +1916,7 @@ static int intel_ntb_pci_probe(struct pci_dev *pdev, err_register: ndev_deinit_debugfs(ndev); - if (pdev_is_gen1(pdev) || pdev_is_gen3(pdev)) + if (pdev_is_gen1(pdev) || pdev_is_gen3(pdev) || pdev_is_gen4(pdev)) xeon_deinit_dev(ndev); err_init_dev: intel_ntb_deinit_pci(ndev); @@ -1931,7 +1932,7 @@ static void intel_ntb_pci_remove(struct pci_dev *pdev) ntb_unregister_device(&ndev->ntb); ndev_deinit_debugfs(ndev); - if (pdev_is_gen1(pdev) || pdev_is_gen3(pdev)) + if 
(pdev_is_gen1(pdev) || pdev_is_gen3(pdev) || pdev_is_gen4(pdev)) xeon_deinit_dev(ndev); intel_ntb_deinit_pci(ndev); kfree(ndev); @@ -2036,6 +2037,7 @@ static const struct file_operations intel_ntb_debugfs_info = { }; static const struct pci_device_id intel_ntb_pci_tbl[] = { + /* GEN1 */ {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_JSF)}, {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_SNB)}, {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_IVT)}, @@ -2051,7 +2053,12 @@ static const struct pci_device_id intel_ntb_pci_tbl[] = { {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_IVT)}, {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_HSX)}, {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_BDX)}, + + /* GEN3 */ {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_SKX)}, + + /* GEN4 */ + {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_ICX)}, {0} }; MODULE_DEVICE_TABLE(pci, intel_ntb_pci_tbl); diff --git a/drivers/ntb/hw/intel/ntb_hw_gen3.c b/drivers/ntb/hw/intel/ntb_hw_gen3.c index c3397160db7f..ffcfc3e02c35 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen3.c +++ b/drivers/ntb/hw/intel/ntb_hw_gen3.c @@ -415,9 +415,8 @@ ssize_t ndev_ntb3_debugfs_read(struct file *filp, char __user *ubuf, return ret; } -static int intel_ntb3_link_enable(struct ntb_dev *ntb, - enum ntb_speed max_speed, - enum ntb_width max_width) +int intel_ntb3_link_enable(struct ntb_dev *ntb, enum ntb_speed max_speed, + enum ntb_width max_width) { struct intel_ntb_dev *ndev; u32 ntb_ctl; @@ -532,7 +531,7 @@ static int intel_ntb3_mw_set_trans(struct ntb_dev *ntb, int pidx, int idx, return 0; } -static int intel_ntb3_peer_db_addr(struct ntb_dev *ntb, phys_addr_t *db_addr, +int intel_ntb3_peer_db_addr(struct ntb_dev *ntb, phys_addr_t *db_addr, resource_size_t *db_size, u64 *db_data, int db_bit) { @@ -563,7 +562,7 @@ static int intel_ntb3_peer_db_addr(struct ntb_dev *ntb, phys_addr_t *db_addr, return 0; } -static int intel_ntb3_peer_db_set(struct ntb_dev *ntb, u64 db_bits) +int intel_ntb3_peer_db_set(struct ntb_dev *ntb, u64 db_bits) { struct intel_ntb_dev *ndev = ntb_ndev(ntb); int bit; @@ -581,7 +580,7 @@ static int intel_ntb3_peer_db_set(struct ntb_dev *ntb, u64 db_bits) return 0; } -static u64 intel_ntb3_db_read(struct ntb_dev *ntb) +u64 intel_ntb3_db_read(struct ntb_dev *ntb) { struct intel_ntb_dev *ndev = ntb_ndev(ntb); @@ -590,7 +589,7 @@ static u64 intel_ntb3_db_read(struct ntb_dev *ntb) ndev->self_reg->db_clear); } -static int intel_ntb3_db_clear(struct ntb_dev *ntb, u64 db_bits) +int intel_ntb3_db_clear(struct ntb_dev *ntb, u64 db_bits) { struct intel_ntb_dev *ndev = ntb_ndev(ntb); diff --git a/drivers/ntb/hw/intel/ntb_hw_gen3.h b/drivers/ntb/hw/intel/ntb_hw_gen3.h index 75fb86ca27bb..2bc5d8356045 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen3.h +++ b/drivers/ntb/hw/intel/ntb_hw_gen3.h @@ -104,6 +104,14 @@ static inline void gen3_db_iowrite(u64 bits, void __iomem *mmio) ssize_t ndev_ntb3_debugfs_read(struct file *filp, char __user *ubuf, size_t count, loff_t *offp); int gen3_init_dev(struct intel_ntb_dev *ndev); +int intel_ntb3_link_enable(struct ntb_dev *ntb, enum ntb_speed max_speed, + enum ntb_width max_width); +u64 intel_ntb3_db_read(struct ntb_dev *ntb); +int intel_ntb3_db_clear(struct ntb_dev *ntb, u64 db_bits); +int intel_ntb3_peer_db_set(struct ntb_dev *ntb, u64 db_bits); +int intel_ntb3_peer_db_addr(struct ntb_dev *ntb, phys_addr_t *db_addr, + resource_size_t *db_size, + u64 *db_data, int db_bit); extern const struct ntb_dev_ops intel_ntb3_ops; diff --git a/drivers/ntb/hw/intel/ntb_hw_gen4.c b/drivers/ntb/hw/intel/ntb_hw_gen4.c 
new file mode 100644 index 000000000000..aab614099d34 --- /dev/null +++ b/drivers/ntb/hw/intel/ntb_hw_gen4.c @@ -0,0 +1,500 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* Copyright(c) 2020 Intel Corporation. All rights reserved. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ntb_hw_intel.h" +#include "ntb_hw_gen1.h" +#include "ntb_hw_gen3.h" +#include "ntb_hw_gen4.h" + +static int gen4_poll_link(struct intel_ntb_dev *ndev); +static int gen4_link_is_up(struct intel_ntb_dev *ndev); + +static const struct intel_ntb_reg gen4_reg = { + .poll_link = gen4_poll_link, + .link_is_up = gen4_link_is_up, + .db_ioread = gen3_db_ioread, + .db_iowrite = gen3_db_iowrite, + .db_size = sizeof(u32), + .ntb_ctl = GEN4_NTBCNTL_OFFSET, + .mw_bar = {2, 4}, +}; + +static const struct intel_ntb_alt_reg gen4_pri_reg = { + .db_clear = GEN4_IM_INT_STATUS_OFFSET, + .db_mask = GEN4_IM_INT_DISABLE_OFFSET, + .spad = GEN4_IM_SPAD_OFFSET, +}; + +static const struct intel_ntb_xlat_reg gen4_sec_xlat = { + .bar2_limit = GEN4_IM23XLMT_OFFSET, + .bar2_xlat = GEN4_IM23XBASE_OFFSET, + .bar2_idx = GEN4_IM23XBASEIDX_OFFSET, +}; + +static const struct intel_ntb_alt_reg gen4_b2b_reg = { + .db_bell = GEN4_IM_DOORBELL_OFFSET, + .spad = GEN4_EM_SPAD_OFFSET, +}; + +static int gen4_poll_link(struct intel_ntb_dev *ndev) +{ + u16 reg_val; + + /* + * We need to write to DLLSCS bit in the SLOTSTS before we + * can clear the hardware link interrupt on ICX NTB. + */ + iowrite16(GEN4_SLOTSTS_DLLSCS, ndev->self_mmio + GEN4_SLOTSTS); + ndev->reg->db_iowrite(ndev->db_link_mask, + ndev->self_mmio + + ndev->self_reg->db_clear); + + reg_val = ioread16(ndev->self_mmio + GEN4_LINK_STATUS_OFFSET); + if (reg_val == ndev->lnk_sta) + return 0; + + ndev->lnk_sta = reg_val; + + return 1; +} + +static int gen4_link_is_up(struct intel_ntb_dev *ndev) +{ + return NTB_LNK_STA_ACTIVE(ndev->lnk_sta); +} + +static int gen4_init_isr(struct intel_ntb_dev *ndev) +{ + int i; + + /* + * The MSIX vectors and the interrupt status bits are not lined up + * on Gen3 (Skylake) and Gen4. By default the link status bit is bit + * 32, however it is by default MSIX vector0. We need to fixup to + * line them up. The vectors at reset is 1-32,0. We need to reprogram + * to 0-32. 
+ */ + for (i = 0; i < GEN4_DB_MSIX_VECTOR_COUNT; i++) + iowrite8(i, ndev->self_mmio + GEN4_INTVEC_OFFSET + i); + + return ndev_init_isr(ndev, GEN4_DB_MSIX_VECTOR_COUNT, + GEN4_DB_MSIX_VECTOR_COUNT, + GEN4_DB_MSIX_VECTOR_SHIFT, + GEN4_DB_TOTAL_SHIFT); +} + +static int gen4_setup_b2b_mw(struct intel_ntb_dev *ndev, + const struct intel_b2b_addr *addr, + const struct intel_b2b_addr *peer_addr) +{ + struct pci_dev *pdev; + void __iomem *mmio; + phys_addr_t bar_addr; + + pdev = ndev->ntb.pdev; + mmio = ndev->self_mmio; + + /* setup incoming bar limits == base addrs (zero length windows) */ + bar_addr = addr->bar2_addr64; + iowrite64(bar_addr, mmio + GEN4_IM23XLMT_OFFSET); + bar_addr = ioread64(mmio + GEN4_IM23XLMT_OFFSET); + dev_dbg(&pdev->dev, "IM23XLMT %#018llx\n", bar_addr); + + bar_addr = addr->bar4_addr64; + iowrite64(bar_addr, mmio + GEN4_IM45XLMT_OFFSET); + bar_addr = ioread64(mmio + GEN4_IM45XLMT_OFFSET); + dev_dbg(&pdev->dev, "IM45XLMT %#018llx\n", bar_addr); + + /* zero incoming translation addrs */ + iowrite64(0, mmio + GEN4_IM23XBASE_OFFSET); + iowrite64(0, mmio + GEN4_IM45XBASE_OFFSET); + + ndev->peer_mmio = ndev->self_mmio; + + return 0; +} + +static int gen4_init_ntb(struct intel_ntb_dev *ndev) +{ + int rc; + + + ndev->mw_count = XEON_MW_COUNT; + ndev->spad_count = GEN4_SPAD_COUNT; + ndev->db_count = GEN4_DB_COUNT; + ndev->db_link_mask = GEN4_DB_LINK_BIT; + + ndev->self_reg = &gen4_pri_reg; + ndev->xlat_reg = &gen4_sec_xlat; + ndev->peer_reg = &gen4_b2b_reg; + + if (ndev->ntb.topo == NTB_TOPO_B2B_USD) + rc = gen4_setup_b2b_mw(ndev, &xeon_b2b_dsd_addr, + &xeon_b2b_usd_addr); + else + rc = gen4_setup_b2b_mw(ndev, &xeon_b2b_usd_addr, + &xeon_b2b_dsd_addr); + if (rc) + return rc; + + ndev->db_valid_mask = BIT_ULL(ndev->db_count) - 1; + + ndev->reg->db_iowrite(ndev->db_valid_mask, + ndev->self_mmio + + ndev->self_reg->db_mask); + + return 0; +} + +static enum ntb_topo gen4_ppd_topo(struct intel_ntb_dev *ndev, u32 ppd) +{ + switch (ppd & GEN4_PPD_TOPO_MASK) { + case GEN4_PPD_TOPO_B2B_USD: + return NTB_TOPO_B2B_USD; + case GEN4_PPD_TOPO_B2B_DSD: + return NTB_TOPO_B2B_DSD; + } + + return NTB_TOPO_NONE; +} + +int gen4_init_dev(struct intel_ntb_dev *ndev) +{ + struct pci_dev *pdev = ndev->ntb.pdev; + u32 ppd1/*, ppd0*/; + u16 lnkctl; + int rc; + + ndev->reg = &gen4_reg; + + ppd1 = ioread32(ndev->self_mmio + GEN4_PPD1_OFFSET); + ndev->ntb.topo = gen4_ppd_topo(ndev, ppd1); + dev_dbg(&pdev->dev, "ppd %#x topo %s\n", ppd1, + ntb_topo_string(ndev->ntb.topo)); + if (ndev->ntb.topo == NTB_TOPO_NONE) + return -EINVAL; + + rc = gen4_init_ntb(ndev); + if (rc) + return rc; + + /* init link setup */ + lnkctl = ioread16(ndev->self_mmio + GEN4_LINK_CTRL_OFFSET); + lnkctl |= GEN4_LINK_CTRL_LINK_DISABLE; + iowrite16(lnkctl, ndev->self_mmio + GEN4_LINK_CTRL_OFFSET); + + return gen4_init_isr(ndev); +} + +ssize_t ndev_ntb4_debugfs_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *offp) +{ + struct intel_ntb_dev *ndev; + void __iomem *mmio; + char *buf; + size_t buf_size; + ssize_t ret, off; + union { u64 v64; u32 v32; u16 v16; } u; + + ndev = filp->private_data; + mmio = ndev->self_mmio; + + buf_size = min(count, 0x800ul); + + buf = kmalloc(buf_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + off = 0; + + off += scnprintf(buf + off, buf_size - off, + "NTB Device Information:\n"); + + off += scnprintf(buf + off, buf_size - off, + "Connection Topology -\t%s\n", + ntb_topo_string(ndev->ntb.topo)); + + off += scnprintf(buf + off, buf_size - off, + "NTB CTL -\t\t%#06x\n", ndev->ntb_ctl); + 
off += scnprintf(buf + off, buf_size - off, + "LNK STA (cached) -\t\t%#06x\n", ndev->lnk_sta); + + if (!ndev->reg->link_is_up(ndev)) + off += scnprintf(buf + off, buf_size - off, + "Link Status -\t\tDown\n"); + else { + off += scnprintf(buf + off, buf_size - off, + "Link Status -\t\tUp\n"); + off += scnprintf(buf + off, buf_size - off, + "Link Speed -\t\tPCI-E Gen %u\n", + NTB_LNK_STA_SPEED(ndev->lnk_sta)); + off += scnprintf(buf + off, buf_size - off, + "Link Width -\t\tx%u\n", + NTB_LNK_STA_WIDTH(ndev->lnk_sta)); + } + + off += scnprintf(buf + off, buf_size - off, + "Memory Window Count -\t%u\n", ndev->mw_count); + off += scnprintf(buf + off, buf_size - off, + "Scratchpad Count -\t%u\n", ndev->spad_count); + off += scnprintf(buf + off, buf_size - off, + "Doorbell Count -\t%u\n", ndev->db_count); + off += scnprintf(buf + off, buf_size - off, + "Doorbell Vector Count -\t%u\n", ndev->db_vec_count); + off += scnprintf(buf + off, buf_size - off, + "Doorbell Vector Shift -\t%u\n", ndev->db_vec_shift); + + off += scnprintf(buf + off, buf_size - off, + "Doorbell Valid Mask -\t%#llx\n", ndev->db_valid_mask); + off += scnprintf(buf + off, buf_size - off, + "Doorbell Link Mask -\t%#llx\n", ndev->db_link_mask); + off += scnprintf(buf + off, buf_size - off, + "Doorbell Mask Cached -\t%#llx\n", ndev->db_mask); + + u.v64 = ndev_db_read(ndev, mmio + ndev->self_reg->db_mask); + off += scnprintf(buf + off, buf_size - off, + "Doorbell Mask -\t\t%#llx\n", u.v64); + + off += scnprintf(buf + off, buf_size - off, + "\nNTB Incoming XLAT:\n"); + + u.v64 = ioread64(mmio + GEN4_IM23XBASE_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "IM23XBASE -\t\t%#018llx\n", u.v64); + + u.v64 = ioread64(mmio + GEN4_IM45XBASE_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "IM45XBASE -\t\t%#018llx\n", u.v64); + + u.v64 = ioread64(mmio + GEN4_IM23XLMT_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "IM23XLMT -\t\t\t%#018llx\n", u.v64); + + u.v64 = ioread64(mmio + GEN4_IM45XLMT_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "IM45XLMT -\t\t\t%#018llx\n", u.v64); + + off += scnprintf(buf + off, buf_size - off, + "\nNTB Statistics:\n"); + + off += scnprintf(buf + off, buf_size - off, + "\nNTB Hardware Errors:\n"); + + if (!pci_read_config_word(ndev->ntb.pdev, + GEN4_DEVSTS_OFFSET, &u.v16)) + off += scnprintf(buf + off, buf_size - off, + "DEVSTS -\t\t%#06x\n", u.v16); + + u.v16 = ioread16(mmio + GEN4_LINK_STATUS_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "LNKSTS -\t\t%#06x\n", u.v16); + + if (!pci_read_config_dword(ndev->ntb.pdev, + GEN4_UNCERRSTS_OFFSET, &u.v32)) + off += scnprintf(buf + off, buf_size - off, + "UNCERRSTS -\t\t%#06x\n", u.v32); + + if (!pci_read_config_dword(ndev->ntb.pdev, + GEN4_CORERRSTS_OFFSET, &u.v32)) + off += scnprintf(buf + off, buf_size - off, + "CORERRSTS -\t\t%#06x\n", u.v32); + + ret = simple_read_from_buffer(ubuf, count, offp, buf, off); + kfree(buf); + return ret; +} + +static int intel_ntb4_mw_set_trans(struct ntb_dev *ntb, int pidx, int idx, + dma_addr_t addr, resource_size_t size) +{ + struct intel_ntb_dev *ndev = ntb_ndev(ntb); + unsigned long xlat_reg, limit_reg, idx_reg; + unsigned short base_idx, reg_val16; + resource_size_t bar_size, mw_size; + void __iomem *mmio; + u64 base, limit, reg_val; + int bar; + + if (pidx != NTB_DEF_PEER_IDX) + return -EINVAL; + + if (idx >= ndev->b2b_idx && !ndev->b2b_off) + idx += 1; + + bar = ndev_mw_to_bar(ndev, idx); + if (bar < 0) + return bar; + + bar_size = pci_resource_len(ndev->ntb.pdev, bar); + + if (idx == 
ndev->b2b_idx) + mw_size = bar_size - ndev->b2b_off; + else + mw_size = bar_size; + + /* hardware requires that addr is aligned to bar size */ + if (addr & (bar_size - 1)) + return -EINVAL; + + /* make sure the range fits in the usable mw size */ + if (size > mw_size) + return -EINVAL; + + mmio = ndev->self_mmio; + xlat_reg = ndev->xlat_reg->bar2_xlat + (idx * 0x10); + limit_reg = ndev->xlat_reg->bar2_limit + (idx * 0x10); + idx_reg = ndev->xlat_reg->bar2_idx + (idx * 0x2); + base = pci_resource_start(ndev->ntb.pdev, bar); + + /* Set the limit if supported, if size is not mw_size */ + if (limit_reg && size != mw_size) { + limit = base + size; + base_idx = __ilog2_u64(size); + } else { + limit = base + mw_size; + base_idx = __ilog2_u64(mw_size); + } + + + /* set and verify setting the translation address */ + iowrite64(addr, mmio + xlat_reg); + reg_val = ioread64(mmio + xlat_reg); + if (reg_val != addr) { + iowrite64(0, mmio + xlat_reg); + return -EIO; + } + + dev_dbg(&ntb->pdev->dev, "BAR %d IMXBASE: %#Lx\n", bar, reg_val); + + /* set and verify setting the limit */ + iowrite64(limit, mmio + limit_reg); + reg_val = ioread64(mmio + limit_reg); + if (reg_val != limit) { + iowrite64(base, mmio + limit_reg); + iowrite64(0, mmio + xlat_reg); + return -EIO; + } + + dev_dbg(&ntb->pdev->dev, "BAR %d IMXLMT: %#Lx\n", bar, reg_val); + + iowrite16(base_idx, mmio + idx_reg); + reg_val16 = ioread16(mmio + idx_reg); + if (reg_val16 != base_idx) { + iowrite64(base, mmio + limit_reg); + iowrite64(0, mmio + xlat_reg); + iowrite16(0, mmio + idx_reg); + return -EIO; + } + + dev_dbg(&ntb->pdev->dev, "BAR %d IMBASEIDX: %#x\n", bar, reg_val16); + + return 0; +} + +static int intel_ntb4_link_enable(struct ntb_dev *ntb, + enum ntb_speed max_speed, enum ntb_width max_width) +{ + struct intel_ntb_dev *ndev; + u32 ntb_ctl, ppd0; + u16 lnkctl; + + ndev = container_of(ntb, struct intel_ntb_dev, ntb); + + dev_dbg(&ntb->pdev->dev, + "Enabling link with max_speed %d max_width %d\n", + max_speed, max_width); + + if (max_speed != NTB_SPEED_AUTO) + dev_dbg(&ntb->pdev->dev, + "ignoring max_speed %d\n", max_speed); + if (max_width != NTB_WIDTH_AUTO) + dev_dbg(&ntb->pdev->dev, + "ignoring max_width %d\n", max_width); + + ntb_ctl = NTB_CTL_E2I_BAR23_SNOOP | NTB_CTL_I2E_BAR23_SNOOP; + ntb_ctl |= NTB_CTL_E2I_BAR45_SNOOP | NTB_CTL_I2E_BAR45_SNOOP; + iowrite32(ntb_ctl, ndev->self_mmio + ndev->reg->ntb_ctl); + + lnkctl = ioread16(ndev->self_mmio + GEN4_LINK_CTRL_OFFSET); + lnkctl &= ~GEN4_LINK_CTRL_LINK_DISABLE; + iowrite16(lnkctl, ndev->self_mmio + GEN4_LINK_CTRL_OFFSET); + + /* start link training in PPD0 */ + ppd0 = ioread32(ndev->self_mmio + GEN4_PPD0_OFFSET); + ppd0 |= GEN4_PPD_LINKTRN; + iowrite32(ppd0, ndev->self_mmio + GEN4_PPD0_OFFSET); + + /* make sure link training has started */ + ppd0 = ioread32(ndev->self_mmio + GEN4_PPD0_OFFSET); + if (!(ppd0 & GEN4_PPD_LINKTRN)) { + dev_warn(&ntb->pdev->dev, "Link is not training\n"); + return -ENXIO; + } + + ndev->dev_up = 1; + + return 0; +} + +int intel_ntb4_link_disable(struct ntb_dev *ntb) +{ + struct intel_ntb_dev *ndev; + u32 ntb_cntl; + u16 lnkctl; + + ndev = container_of(ntb, struct intel_ntb_dev, ntb); + + dev_dbg(&ntb->pdev->dev, "Disabling link\n"); + + /* clear the snoop bits */ + ntb_cntl = ioread32(ndev->self_mmio + ndev->reg->ntb_ctl); + ntb_cntl &= ~(NTB_CTL_E2I_BAR23_SNOOP | NTB_CTL_I2E_BAR23_SNOOP); + ntb_cntl &= ~(NTB_CTL_E2I_BAR45_SNOOP | NTB_CTL_I2E_BAR45_SNOOP); + iowrite32(ntb_cntl, ndev->self_mmio + ndev->reg->ntb_ctl); + + lnkctl = ioread16(ndev->self_mmio + 
GEN4_LINK_CTRL_OFFSET); + lnkctl |= GEN4_LINK_CTRL_LINK_DISABLE; + iowrite16(lnkctl, ndev->self_mmio + GEN4_LINK_CTRL_OFFSET); + + ndev->dev_up = 0; + + return 0; +} + +const struct ntb_dev_ops intel_ntb4_ops = { + .mw_count = intel_ntb_mw_count, + .mw_get_align = intel_ntb_mw_get_align, + .mw_set_trans = intel_ntb4_mw_set_trans, + .peer_mw_count = intel_ntb_peer_mw_count, + .peer_mw_get_addr = intel_ntb_peer_mw_get_addr, + .link_is_up = intel_ntb_link_is_up, + .link_enable = intel_ntb4_link_enable, + .link_disable = intel_ntb4_link_disable, + .db_valid_mask = intel_ntb_db_valid_mask, + .db_vector_count = intel_ntb_db_vector_count, + .db_vector_mask = intel_ntb_db_vector_mask, + .db_read = intel_ntb3_db_read, + .db_clear = intel_ntb3_db_clear, + .db_set_mask = intel_ntb_db_set_mask, + .db_clear_mask = intel_ntb_db_clear_mask, + .peer_db_addr = intel_ntb3_peer_db_addr, + .peer_db_set = intel_ntb3_peer_db_set, + .spad_is_unsafe = intel_ntb_spad_is_unsafe, + .spad_count = intel_ntb_spad_count, + .spad_read = intel_ntb_spad_read, + .spad_write = intel_ntb_spad_write, + .peer_spad_addr = intel_ntb_peer_spad_addr, + .peer_spad_read = intel_ntb_peer_spad_read, + .peer_spad_write = intel_ntb_peer_spad_write, +}; + diff --git a/drivers/ntb/hw/intel/ntb_hw_gen4.h b/drivers/ntb/hw/intel/ntb_hw_gen4.h new file mode 100644 index 000000000000..7d7a82e8518d --- /dev/null +++ b/drivers/ntb/hw/intel/ntb_hw_gen4.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* Copyright(c) 2020 Intel Corporation. All rights reserved. */ +#ifndef _NTB_INTEL_GEN4_H_ +#define _NTB_INTEL_GEN4_H_ + +#include "ntb_hw_intel.h" + +/* Intel Gen4 NTB hardware */ +/* PCIe config space */ +#define GEN4_IMBAR23SZ_OFFSET 0x00c4 +#define GEN4_IMBAR45SZ_OFFSET 0x00c5 +#define GEN4_EMBAR23SZ_OFFSET 0x00c6 +#define GEN4_EMBAR45SZ_OFFSET 0x00c7 +#define GEN4_DEVCTRL_OFFSET 0x0048 +#define GEN4_DEVSTS_OFFSET 0x004a +#define GEN4_UNCERRSTS_OFFSET 0x0104 +#define GEN4_CORERRSTS_OFFSET 0x0110 + +/* BAR0 MMIO */ +#define GEN4_NTBCNTL_OFFSET 0x0000 +#define GEN4_IM23XBASE_OFFSET 0x0010 /* IMBAR1XBASE */ +#define GEN4_IM23XLMT_OFFSET 0x0018 /* IMBAR1XLMT */ +#define GEN4_IM45XBASE_OFFSET 0x0020 /* IMBAR2XBASE */ +#define GEN4_IM45XLMT_OFFSET 0x0028 /* IMBAR2XLMT */ +#define GEN4_IM_INT_STATUS_OFFSET 0x0040 +#define GEN4_IM_INT_DISABLE_OFFSET 0x0048 +#define GEN4_INTVEC_OFFSET 0x0050 /* 0-32 vecs */ +#define GEN4_IM23XBASEIDX_OFFSET 0x0074 +#define GEN4_IM45XBASEIDX_OFFSET 0x0076 +#define GEN4_IM_SPAD_OFFSET 0x0080 /* 0-15 SPADs */ +#define GEN4_IM_SPAD_SEM_OFFSET 0x00c0 /* SPAD hw semaphore */ +#define GEN4_IM_SPAD_STICKY_OFFSET 0x00c4 /* sticky SPAD */ +#define GEN4_IM_DOORBELL_OFFSET 0x0100 /* 0-31 doorbells */ +#define GEN4_EM_SPAD_OFFSET 0x8080 +/* note, link status is now in MMIO and not config space for NTB */ +#define GEN4_LINK_CTRL_OFFSET 0xb050 +#define GEN4_LINK_STATUS_OFFSET 0xb052 +#define GEN4_PPD0_OFFSET 0xb0d4 +#define GEN4_PPD1_OFFSET 0xb4c0 +#define GEN4_LTSSMSTATEJMP 0xf040 + +#define GEN4_PPD_CLEAR_TRN 0x0001 +#define GEN4_PPD_LINKTRN 0x0008 +#define GEN4_PPD_CONN_MASK 0x0300 +#define GEN4_PPD_CONN_B2B 0x0200 +#define GEN4_PPD_DEV_MASK 0x1000 +#define GEN4_PPD_DEV_DSD 0x1000 +#define GEN4_PPD_DEV_USD 0x0000 +#define GEN4_LINK_CTRL_LINK_DISABLE 0x0010 + +#define GEN4_SLOTSTS 0xb05a +#define GEN4_SLOTSTS_DLLSCS 0x100 + +#define GEN4_PPD_TOPO_MASK (GEN4_PPD_CONN_MASK | GEN4_PPD_DEV_MASK) +#define GEN4_PPD_TOPO_B2B_USD (GEN4_PPD_CONN_B2B | GEN4_PPD_DEV_USD) +#define GEN4_PPD_TOPO_B2B_DSD 
(GEN4_PPD_CONN_B2B | GEN4_PPD_DEV_DSD) + +#define GEN4_DB_COUNT 32 +#define GEN4_DB_LINK 32 +#define GEN4_DB_LINK_BIT BIT_ULL(GEN4_DB_LINK) +#define GEN4_DB_MSIX_VECTOR_COUNT 33 +#define GEN4_DB_MSIX_VECTOR_SHIFT 1 +#define GEN4_DB_TOTAL_SHIFT 33 +#define GEN4_SPAD_COUNT 16 + +#define NTB_CTL_E2I_BAR23_SNOOP 0x000004 +#define NTB_CTL_E2I_BAR23_NOSNOOP 0x000008 +#define NTB_CTL_I2E_BAR23_SNOOP 0x000010 +#define NTB_CTL_I2E_BAR23_NOSNOOP 0x000020 +#define NTB_CTL_E2I_BAR45_SNOOP 0x000040 +#define NTB_CTL_E2I_BAR45_NOSNOO 0x000080 +#define NTB_CTL_I2E_BAR45_SNOOP 0x000100 +#define NTB_CTL_I2E_BAR45_NOSNOOP 0x000200 +#define NTB_CTL_BUSNO_DIS_INC 0x000400 +#define NTB_CTL_LINK_DOWN 0x010000 + +#define NTB_SJC_FORCEDETECT 0x000004 + +ssize_t ndev_ntb4_debugfs_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *offp); +int gen4_init_dev(struct intel_ntb_dev *ndev); +ssize_t ndev_ntb4_debugfs_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *offp); + +extern const struct ntb_dev_ops intel_ntb4_ops; + +#endif diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.h b/drivers/ntb/hw/intel/ntb_hw_intel.h index e071e28bca3f..d61fcd91714b 100644 --- a/drivers/ntb/hw/intel/ntb_hw_intel.h +++ b/drivers/ntb/hw/intel/ntb_hw_intel.h @@ -72,6 +72,7 @@ #define PCI_DEVICE_ID_INTEL_NTB_PS_BDX 0x6F0E #define PCI_DEVICE_ID_INTEL_NTB_SS_BDX 0x6F0F #define PCI_DEVICE_ID_INTEL_NTB_B2B_SKX 0x201C +#define PCI_DEVICE_ID_INTEL_NTB_B2B_ICX 0x347e /* Ntb control and link status */ #define NTB_CTL_CFG_LOCK BIT(0) @@ -120,6 +121,7 @@ struct intel_ntb_xlat_reg { unsigned long bar0_base; unsigned long bar2_xlat; unsigned long bar2_limit; + unsigned short bar2_idx; }; struct intel_b2b_addr { @@ -182,6 +184,9 @@ struct intel_ntb_dev { struct dentry *debugfs_dir; struct dentry *debugfs_info; + + /* gen4 entries */ + int dev_up; }; #define ntb_ndev(__ntb) container_of(__ntb, struct intel_ntb_dev, ntb) @@ -219,4 +224,11 @@ static inline int pdev_is_gen3(struct pci_dev *pdev) return 0; } +static inline int pdev_is_gen4(struct pci_dev *pdev) +{ + if (pdev->device == PCI_DEVICE_ID_INTEL_NTB_B2B_ICX) + return 1; + + return 0; +} #endif -- Gitee From 109bc9239b3269e8933cf58b17d021649103bd03 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 5 Jun 2020 08:13:34 -0700 Subject: [PATCH 0824/2161] ntb: intel: add hw workaround for NTB BAR alignment commit 134a86545c6072c049a2c6e77d6843e650df511d upstream. Add NTB_HWERR_BAR_ALIGN hw errata flag to work around issue where the aligment for the XLAT base must be BAR size aligned rather than 4k page aligned. On ICX platform, the XLAT base can be 4k page size aligned rather than BAR size aligned unlike the previous gen Intel NTB. However, a silicon errata prevented this from working as expected and a workaround is introduced to resolve the issue. 
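The two alignment rules described above reduce to a single mask test; the helper below is an illustrative sketch with a made-up name, not code from this patch, and relies on PCI BAR sizes being powers of two:

/* Sketch: XLAT base alignment rule with and without the BAR-align erratum. */
#include <linux/types.h>
#include <asm/page.h>

static bool xlat_base_aligned(dma_addr_t addr, resource_size_t bar_size,
                              bool bar_align_errata)
{
    if (bar_align_errata)
        return !(addr & (bar_size - 1));  /* must be aligned to the BAR size */

    return !(addr & (PAGE_SIZE - 1));     /* 4k page alignment is enough */
}

In the actual change below, the same decision is made inline in intel_ntb4_mw_set_trans() based on the NTB_HWERR_BAR_ALIGN flag, and the new mw_get_align() callback reports the matching address alignment to NTB clients.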
Signed-off-by: Dave Jiang Signed-off-by: Jon Mason Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/ntb/hw/intel/ntb_hw_gen1.h | 1 + drivers/ntb/hw/intel/ntb_hw_gen4.c | 78 +++++++++++++++++++++++++----- drivers/ntb/hw/intel/ntb_hw_gen4.h | 13 +++++ 3 files changed, 79 insertions(+), 13 deletions(-) diff --git a/drivers/ntb/hw/intel/ntb_hw_gen1.h b/drivers/ntb/hw/intel/ntb_hw_gen1.h index 544cf5c06f4d..1b759942d8af 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen1.h +++ b/drivers/ntb/hw/intel/ntb_hw_gen1.h @@ -140,6 +140,7 @@ #define NTB_HWERR_SB01BASE_LOCKUP BIT_ULL(1) #define NTB_HWERR_B2BDOORBELL_BIT14 BIT_ULL(2) #define NTB_HWERR_MSIX_VECTOR32_BAD BIT_ULL(3) +#define NTB_HWERR_BAR_ALIGN BIT_ULL(4) extern struct intel_b2b_addr xeon_b2b_usd_addr; extern struct intel_b2b_addr xeon_b2b_dsd_addr; diff --git a/drivers/ntb/hw/intel/ntb_hw_gen4.c b/drivers/ntb/hw/intel/ntb_hw_gen4.c index aab614099d34..28fa0ab7b0fb 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen4.c +++ b/drivers/ntb/hw/intel/ntb_hw_gen4.c @@ -177,6 +177,9 @@ int gen4_init_dev(struct intel_ntb_dev *ndev) ndev->reg = &gen4_reg; + if (pdev_is_ICX(pdev)) + ndev->hwerr_flags |= NTB_HWERR_BAR_ALIGN; + ppd1 = ioread32(ndev->self_mmio + GEN4_PPD1_OFFSET); ndev->ntb.topo = gen4_ppd_topo(ndev, ppd1); dev_dbg(&pdev->dev, "ppd %#x topo %s\n", ppd1, @@ -342,9 +345,14 @@ static int intel_ntb4_mw_set_trans(struct ntb_dev *ntb, int pidx, int idx, else mw_size = bar_size; - /* hardware requires that addr is aligned to bar size */ - if (addr & (bar_size - 1)) - return -EINVAL; + if (ndev->hwerr_flags & NTB_HWERR_BAR_ALIGN) { + /* hardware requires that addr is aligned to bar size */ + if (addr & (bar_size - 1)) + return -EINVAL; + } else { + if (addr & (PAGE_SIZE - 1)) + return -EINVAL; + } /* make sure the range fits in the usable mw size */ if (size > mw_size) @@ -353,7 +361,6 @@ static int intel_ntb4_mw_set_trans(struct ntb_dev *ntb, int pidx, int idx, mmio = ndev->self_mmio; xlat_reg = ndev->xlat_reg->bar2_xlat + (idx * 0x10); limit_reg = ndev->xlat_reg->bar2_limit + (idx * 0x10); - idx_reg = ndev->xlat_reg->bar2_idx + (idx * 0x2); base = pci_resource_start(ndev->ntb.pdev, bar); /* Set the limit if supported, if size is not mw_size */ @@ -387,16 +394,19 @@ static int intel_ntb4_mw_set_trans(struct ntb_dev *ntb, int pidx, int idx, dev_dbg(&ntb->pdev->dev, "BAR %d IMXLMT: %#Lx\n", bar, reg_val); - iowrite16(base_idx, mmio + idx_reg); - reg_val16 = ioread16(mmio + idx_reg); - if (reg_val16 != base_idx) { - iowrite64(base, mmio + limit_reg); - iowrite64(0, mmio + xlat_reg); - iowrite16(0, mmio + idx_reg); - return -EIO; + if (ndev->hwerr_flags & NTB_HWERR_BAR_ALIGN) { + idx_reg = ndev->xlat_reg->bar2_idx + (idx * 0x2); + iowrite16(base_idx, mmio + idx_reg); + reg_val16 = ioread16(mmio + idx_reg); + if (reg_val16 != base_idx) { + iowrite64(base, mmio + limit_reg); + iowrite64(0, mmio + xlat_reg); + iowrite16(0, mmio + idx_reg); + return -EIO; + } + dev_dbg(&ntb->pdev->dev, "BAR %d IMBASEIDX: %#x\n", bar, reg_val16); } - dev_dbg(&ntb->pdev->dev, "BAR %d IMBASEIDX: %#x\n", bar, reg_val16); return 0; } @@ -471,9 +481,51 @@ int intel_ntb4_link_disable(struct ntb_dev *ntb) return 0; } +static int intel_ntb4_mw_get_align(struct ntb_dev *ntb, int pidx, int idx, + resource_size_t *addr_align, + resource_size_t *size_align, + resource_size_t *size_max) +{ + struct intel_ntb_dev *ndev = ntb_ndev(ntb); + resource_size_t bar_size, mw_size; + int bar; + + if (pidx != NTB_DEF_PEER_IDX) + return -EINVAL; + + if (idx >= ndev->b2b_idx && !ndev->b2b_off) + 
idx += 1; + + bar = ndev_mw_to_bar(ndev, idx); + if (bar < 0) + return bar; + + bar_size = pci_resource_len(ndev->ntb.pdev, bar); + + if (idx == ndev->b2b_idx) + mw_size = bar_size - ndev->b2b_off; + else + mw_size = bar_size; + + if (addr_align) { + if (ndev->hwerr_flags & NTB_HWERR_BAR_ALIGN) + *addr_align = pci_resource_len(ndev->ntb.pdev, bar); + else + *addr_align = PAGE_SIZE; + } + + if (size_align) + *size_align = 1; + + if (size_max) + *size_max = mw_size; + + return 0; +} + const struct ntb_dev_ops intel_ntb4_ops = { .mw_count = intel_ntb_mw_count, - .mw_get_align = intel_ntb_mw_get_align, + .mw_get_align = intel_ntb4_mw_get_align, .mw_set_trans = intel_ntb4_mw_set_trans, .peer_mw_count = intel_ntb_peer_mw_count, .peer_mw_get_addr = intel_ntb_peer_mw_get_addr, diff --git a/drivers/ntb/hw/intel/ntb_hw_gen4.h b/drivers/ntb/hw/intel/ntb_hw_gen4.h index 7d7a82e8518d..a868c788de02 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen4.h +++ b/drivers/ntb/hw/intel/ntb_hw_gen4.h @@ -5,6 +5,10 @@ #include "ntb_hw_intel.h" +/* Supported PCI device revision range for ICX */ +#define PCI_DEVICE_REVISION_ICX_MIN 0x2 +#define PCI_DEVICE_REVISION_ICX_MAX 0xF + /* Intel Gen4 NTB hardware */ /* PCIe config space */ #define GEN4_IMBAR23SZ_OFFSET 0x00c4 @@ -84,4 +88,13 @@ ssize_t ndev_ntb4_debugfs_read(struct file *filp, char __user *ubuf, extern const struct ntb_dev_ops intel_ntb4_ops; +static inline int pdev_is_ICX(struct pci_dev *pdev) +{ + if (pdev_is_gen4(pdev) && + pdev->revision >= PCI_DEVICE_REVISION_ICX_MIN && + pdev->revision <= PCI_DEVICE_REVISION_ICX_MAX) + return 1; + return 0; +} + #endif -- Gitee From fe8ca5a3a0ef4a858648fcd44db6463bf3d6e375 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 26 May 2020 10:28:53 -0700 Subject: [PATCH 0825/2161] ntb: intel: fix static declaration commit 893733c58d59f43e4c5219bf9ab9e9d39bb14289 upstream. intel_ntb4_link_disable() missing static declaration. Reported-by: kbuild test robot Signed-off-by: Dave Jiang Signed-off-by: Jon Mason Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/ntb/hw/intel/ntb_hw_gen4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ntb/hw/intel/ntb_hw_gen4.c b/drivers/ntb/hw/intel/ntb_hw_gen4.c index 28fa0ab7b0fb..bc4541cbf8c6 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen4.c +++ b/drivers/ntb/hw/intel/ntb_hw_gen4.c @@ -456,7 +456,7 @@ static int intel_ntb4_link_enable(struct ntb_dev *ntb, return 0; } -int intel_ntb4_link_disable(struct ntb_dev *ntb) +static int intel_ntb4_link_disable(struct ntb_dev *ntb) { struct intel_ntb_dev *ndev; u32 ntb_cntl; -- Gitee From 995691d7c78bf2b3621022615227eea517e813a0 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 29 Jun 2020 13:34:50 +0800 Subject: [PATCH 0826/2161] powercap: intel_rapl: add support for Sapphire Rapids commit 2d798d9f5967a3f134b1c55acbbe026c032e3429 upstream. RAPL on SPR behaves similar to Haswell server, except that SPR uses a fixed energy unit (1 Joule) for the PSYS RAPL domain. Signed-off-by: Zhang Rui Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/powercap/intel_rapl_common.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 925b0004a0ed..a242c3480296 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -96,6 +96,7 @@ struct rapl_defaults { u64 (*compute_time_window)(struct rapl_package *rp, u64 val, bool to_raw); unsigned int dram_domain_energy_unit; + unsigned int psys_domain_energy_unit; }; static struct rapl_defaults *rapl_defaults; @@ -536,12 +537,23 @@ static void rapl_init_domains(struct rapl_package *rp) for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++) rd->regs[j] = rp->priv->regs[i][j]; - if (i == RAPL_DOMAIN_DRAM) { + switch (i) { + case RAPL_DOMAIN_DRAM: rd->domain_energy_unit = rapl_defaults->dram_domain_energy_unit; if (rd->domain_energy_unit) pr_info("DRAM domain energy unit %dpj\n", rd->domain_energy_unit); + break; + case RAPL_DOMAIN_PLATFORM: + rd->domain_energy_unit = + rapl_defaults->psys_domain_energy_unit; + if (rd->domain_energy_unit) + pr_info("Platform domain energy unit %dpj\n", + rd->domain_energy_unit); + break; + default: + break; } rd++; } @@ -922,6 +934,14 @@ static const struct rapl_defaults rapl_defaults_hsw_server = { .dram_domain_energy_unit = 15300, }; +static const struct rapl_defaults rapl_defaults_spr_server = { + .check_unit = rapl_check_unit_core, + .set_floor_freq = set_floor_freq_default, + .compute_time_window = rapl_compute_time_window_core, + .dram_domain_energy_unit = 15300, + .psys_domain_energy_unit = 1000000000, +}; + static const struct rapl_defaults rapl_defaults_byt = { .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT, .check_unit = rapl_check_unit_atom, @@ -978,6 +998,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { INTEL_CPU_FAM6(ICELAKE_NNPI, rapl_defaults_core), INTEL_CPU_FAM6(ICELAKE_X, rapl_defaults_hsw_server), INTEL_CPU_FAM6(ICELAKE_D, rapl_defaults_hsw_server), + INTEL_CPU_FAM6(SAPPHIRERAPIDS_X, rapl_defaults_spr_server), INTEL_CPU_FAM6(ATOM_SILVERMONT, rapl_defaults_byt), INTEL_CPU_FAM6(ATOM_AIRMONT, rapl_defaults_cht), -- Gitee From 44f69bd9fbc2665cace77066c4b4fe28050412a6 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:08 -0700 Subject: [PATCH 0827/2161] perf/x86/intel/lbr: Add a function pointer for LBR reset commit 9f354a726cb1d4eb00a0784a27eaa0a3283cff71 upstream. The method to reset Architectural LBRs is different from previous model-specific LBR. Perf has to implement a different function. A function pointer is introduced for LBR reset. The enum of LBR_FORMAT_* is also moved to perf_event.h. Perf should initialize the corresponding functions at boot time, and avoid checking lbr_format at run time. The current 64-bit LBR reset function is set as default. 
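The shape of the change is a boot-time dispatch: select the implementation once, then call through a pointer on the hot path with no per-call format check. A minimal standalone sketch of the pattern (hypothetical names, not the kernel's structures):

/* Toy model of picking an LBR reset implementation once at init time. */
#include <stdio.h>

enum lbr_format { LBR_FORMAT_32, LBR_FORMAT_64 };

struct pmu_ops {
    void (*lbr_reset)(void);
};

static void lbr_reset_32(void) { puts("reset 32-bit LBR MSRs"); }
static void lbr_reset_64(void) { puts("reset 64-bit LBR from/to MSR pairs"); }

/* Default covers the common case; overridden once during init. */
static struct pmu_ops pmu = { .lbr_reset = lbr_reset_64 };

static void pmu_init(enum lbr_format format)
{
    if (format == LBR_FORMAT_32)
        pmu.lbr_reset = lbr_reset_32;   /* chosen once, at boot */
}

static void lbr_reset(void)
{
    pmu.lbr_reset();                    /* hot path: no format check */
}

int main(void)
{
    pmu_init(LBR_FORMAT_64);
    lbr_reset();
    return 0;
}

The following patches in this series repeat the same pattern for the LBR read, save and restore paths, so the Architectural LBR code added later only has to install its own function pointers.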
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-3-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/core.c | 7 +++++++ arch/x86/events/intel/lbr.c | 20 +++----------------- arch/x86/events/perf_event.h | 17 +++++++++++++++++ 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index f0ac975f3d22..c6acc095dad4 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4249,6 +4249,8 @@ static __initconst const struct x86_pmu core_pmu = { .cpu_dead = intel_pmu_cpu_dead, .check_period = intel_pmu_check_period, + + .lbr_reset = intel_pmu_lbr_reset_64, }; static __initconst const struct x86_pmu intel_pmu = { @@ -4293,6 +4295,8 @@ static __initconst const struct x86_pmu intel_pmu = { .check_period = intel_pmu_check_period, .aux_output_match = intel_pmu_aux_output_match, + + .lbr_reset = intel_pmu_lbr_reset_64, }; static __init void intel_clovertown_quirk(void) @@ -4931,6 +4935,9 @@ __init int intel_pmu_init(void) x86_pmu.intel_cap.capabilities = capabilities; } + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) + x86_pmu.lbr_reset = intel_pmu_lbr_reset_32; + intel_ds_init(); x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 73dbd30f82d3..0a45ee3b578a 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -8,17 +8,6 @@ #include "../perf_event.h" -enum { - LBR_FORMAT_32 = 0x00, - LBR_FORMAT_LIP = 0x01, - LBR_FORMAT_EIP = 0x02, - LBR_FORMAT_EIP_FLAGS = 0x03, - LBR_FORMAT_EIP_FLAGS2 = 0x04, - LBR_FORMAT_INFO = 0x05, - LBR_FORMAT_TIME = 0x06, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, -}; - static const enum { LBR_EIP_FLAGS = 1, LBR_TSX = 2, @@ -194,7 +183,7 @@ static void __intel_pmu_lbr_disable(void) wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); } -static void intel_pmu_lbr_reset_32(void) +void intel_pmu_lbr_reset_32(void) { int i; @@ -202,7 +191,7 @@ static void intel_pmu_lbr_reset_32(void) wrmsrl(x86_pmu.lbr_from + i, 0); } -static void intel_pmu_lbr_reset_64(void) +void intel_pmu_lbr_reset_64(void) { int i; @@ -221,10 +210,7 @@ void intel_pmu_lbr_reset(void) if (!x86_pmu.lbr_nr) return; - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_reset_32(); - else - intel_pmu_lbr_reset_64(); + x86_pmu.lbr_reset(); cpuc->last_task_ctx = NULL; cpuc->last_log_id = 0; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index bac456f04c4b..9824261affd5 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -205,6 +205,17 @@ struct intel_excl_cntrs { struct x86_perf_task_context; #define MAX_LBR_ENTRIES 32 +enum { + LBR_FORMAT_32 = 0x00, + LBR_FORMAT_LIP = 0x01, + LBR_FORMAT_EIP = 0x02, + LBR_FORMAT_EIP_FLAGS = 0x03, + LBR_FORMAT_EIP_FLAGS2 = 0x04, + LBR_FORMAT_INFO = 0x05, + LBR_FORMAT_TIME = 0x06, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, +}; + enum { X86_PERF_KFREE_SHARED = 0, X86_PERF_KFREE_EXCL = 1, @@ -727,6 +738,8 @@ struct x86_pmu { bool lbr_double_abort; /* duplicated lbr aborts */ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ + void (*lbr_reset)(void); + /* * Intel PT/LBR/BTS are exclusive */ @@ -1098,6 +1111,10 @@ u64 lbr_from_signext_quirk_wr(u64 val); void intel_pmu_lbr_reset(void); +void intel_pmu_lbr_reset_32(void); + +void intel_pmu_lbr_reset_64(void); + void 
intel_pmu_lbr_add(struct perf_event *event); void intel_pmu_lbr_del(struct perf_event *event); -- Gitee From f22140041d7bed0689149ae543bca2ceeb20860f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:09 -0700 Subject: [PATCH 0828/2161] perf/x86/intel/lbr: Add a function pointer for LBR read commit 9f354a726cb1d4eb00a0784a27eaa0a3283cff71 upstream. The method to read Architectural LBRs is different from previous model-specific LBR. Perf has to implement a different function. A function pointer for LBR read is introduced. Perf should initialize the corresponding function at boot time, and avoid checking lbr_format at run time. The current 64-bit LBR read function is set as default. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-4-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/core.c | 6 +++++- arch/x86/events/intel/lbr.c | 9 +++------ arch/x86/events/perf_event.h | 5 +++++ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index c6acc095dad4..9fad33e74e04 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4251,6 +4251,7 @@ static __initconst const struct x86_pmu core_pmu = { .check_period = intel_pmu_check_period, .lbr_reset = intel_pmu_lbr_reset_64, + .lbr_read = intel_pmu_lbr_read_64, }; static __initconst const struct x86_pmu intel_pmu = { @@ -4297,6 +4298,7 @@ static __initconst const struct x86_pmu intel_pmu = { .aux_output_match = intel_pmu_aux_output_match, .lbr_reset = intel_pmu_lbr_reset_64, + .lbr_read = intel_pmu_lbr_read_64, }; static __init void intel_clovertown_quirk(void) @@ -4935,8 +4937,10 @@ __init int intel_pmu_init(void) x86_pmu.intel_cap.capabilities = capabilities; } - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) { x86_pmu.lbr_reset = intel_pmu_lbr_reset_32; + x86_pmu.lbr_read = intel_pmu_lbr_read_32; + } intel_ds_init(); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 0a45ee3b578a..ecebcdee9bca 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -539,7 +539,7 @@ void intel_pmu_lbr_disable_all(void) __intel_pmu_lbr_disable(); } -static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) +void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) { unsigned long mask = x86_pmu.lbr_nr - 1; u64 tos = intel_pmu_lbr_tos(); @@ -575,7 +575,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) * is the same as the linear address, allowing us to merge the LIP and EIP * LBR formats. 
*/ -static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) +void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; @@ -679,10 +679,7 @@ void intel_pmu_lbr_read(void) cpuc->lbr_users == cpuc->lbr_pebs_users) return; - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_read_32(cpuc); - else - intel_pmu_lbr_read_64(cpuc); + x86_pmu.lbr_read(cpuc); intel_pmu_lbr_filter(cpuc); } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 9824261affd5..7895ed60eced 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -739,6 +739,7 @@ struct x86_pmu { bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ void (*lbr_reset)(void); + void (*lbr_read)(struct cpu_hw_events *cpuc); /* * Intel PT/LBR/BTS are exclusive @@ -1125,6 +1126,10 @@ void intel_pmu_lbr_disable_all(void); void intel_pmu_lbr_read(void); +void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc); + +void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc); + void intel_pmu_lbr_init_core(void); void intel_pmu_lbr_init_nhm(void); -- Gitee From 5352e19ba2b9f04290a798e8764f3f33c85be34c Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Wed, 23 Oct 2019 10:12:54 +0300 Subject: [PATCH 0829/2161] perf/x86/intel: Implement LBR callstack context synchronization commit 421ca868ea3b7c1ca1a541ed6dff3c101a563b95 upstream. Implement intel_pmu_lbr_swap_task_ctx() method updating counters of the events that requested LBR callstack data on a sample. The counter can be zero for the case when task context belongs to a thread that has just come from a block on a futex and the context contains saved (lbr_stack_state == LBR_VALID) LBR register values. For the values to be restored at LBR registers on the next thread's switch-in event it swaps the counter value with the one that is expected to be non zero at the previous equivalent task perf event context. Swap operation type ensures the previous task perf event context stays consistent with the amount of events that requested LBR callstack data on a sample. Signed-off-by: Alexey Budankov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lkml.kernel.org/r/261ac742-9022-c3f4-5885-1eae7415b091@linux.intel.com Signed-off-by: Ingo Molnar Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/lbr.c | 23 +++++++++++++++++++++++ arch/x86/events/perf_event.h | 3 +++ 2 files changed, 26 insertions(+) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index ecebcdee9bca..6faefb59d0e6 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -409,6 +409,29 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } +void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, + struct perf_event_context *next) +{ + struct x86_perf_task_context *prev_ctx_data, *next_ctx_data; + + swap(prev->task_ctx_data, next->task_ctx_data); + + /* + * Architecture specific synchronization makes sense in + * case both prev->task_ctx_data and next->task_ctx_data + * pointers are allocated. 
+ */ + + prev_ctx_data = next->task_ctx_data; + next_ctx_data = prev->task_ctx_data; + + if (!prev_ctx_data || !next_ctx_data) + return; + + swap(prev_ctx_data->lbr_callstack_users, + next_ctx_data->lbr_callstack_users); +} + void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 7895ed60eced..b369369b6595 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1106,6 +1106,9 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr); void intel_ds_init(void); +void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, + struct perf_event_context *next); + void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); u64 lbr_from_signext_quirk_wr(u64 val); -- Gitee From 18434cc3f1dae03cd512c9c25b26d55fbacb2613 Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Wed, 23 Oct 2019 10:11:04 +0300 Subject: [PATCH 0830/2161] perf/core, perf/x86: Introduce swap_task_ctx() method at 'struct pmu' commit fc1adfe306b71e094df636012f8c0fed971cad45 upstream. Declare swap_task_ctx() methods at the generic and x86 specific pmu types to bridge calls to platform specific PMU code on optimized context switch path between equivalent task perf event contexts. Signed-off-by: Alexey Budankov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lkml.kernel.org/r/9a0aa84a-f062-9b64-3133-373658550c4b@linux.intel.com Signed-off-by: Ingo Molnar Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/perf_event.h | 8 ++++++++ include/linux/perf_event.h | 9 +++++++++ 2 files changed, 17 insertions(+) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index b369369b6595..3b70a3bba477 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -752,6 +752,14 @@ struct x86_pmu { u64 (*update_topdown_event)(struct perf_event *event); int (*set_topdown_event_period)(struct perf_event *event); + /* + * perf task context (i.e. struct perf_event_context::task_ctx_data) + * switch helper to bridge calls from perf/core to perf/x86. + * See struct pmu::swap_task_ctx() usage for examples; + */ + void (*swap_task_ctx)(struct perf_event_context *prev, + struct perf_event_context *next); + /* * AMD bits */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5862ff3f031a..2ac3350a547e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -418,6 +418,15 @@ struct pmu { */ size_t task_ctx_size; + /* + * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data) + * can be synchronized using this function. See Intel LBR callstack support + * implementation and Perf core context switch handling callbacks for usage + * examples. 
+ */ + void (*swap_task_ctx) (struct perf_event_context *prev, + struct perf_event_context *next); + /* optional */ /* * Set up pmu-private data structures for an AUX area -- Gitee From d3d0362694c03f4fa1a95ad6b40b85f6e30722ea Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Wed, 23 Oct 2019 10:13:56 +0300 Subject: [PATCH 0831/2161] perf/x86: Synchronize PMU task contexts on optimized context switches commit c2b98a8661514f29a44ebd0925cf4b1503beb48c upstream. Install Intel specific PMU task context synchronization adapter and extend optimized context switch path with PMU specific task context synchronization to fix LBR callstack virtualization on context switches. Signed-off-by: Alexey Budankov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lkml.kernel.org/r/9c6445a9-bdba-ef03-3859-f1f91198f27a@linux.intel.com Signed-off-by: Ingo Molnar Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/core.c | 7 +++++++ kernel/events/core.c | 13 ++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 9fad33e74e04..217005472c29 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4154,6 +4154,12 @@ static void intel_pmu_sched_task(struct perf_event_context *ctx, intel_pmu_lbr_sched_task(ctx, sched_in); } +static void intel_pmu_swap_task_ctx(struct perf_event_context *prev, + struct perf_event_context *next) +{ + intel_pmu_lbr_swap_task_ctx(prev, next); +} + static int intel_pmu_check_period(struct perf_event *event, u64 value) { return intel_pmu_has_bts_period(event, value) ? -EINVAL : 0; @@ -4292,6 +4298,7 @@ static __initconst const struct x86_pmu intel_pmu = { .guest_get_msrs = intel_guest_get_msrs, .sched_task = intel_pmu_sched_task, + .swap_task_ctx = intel_pmu_swap_task_ctx, .check_period = intel_pmu_check_period, diff --git a/kernel/events/core.c b/kernel/events/core.c index 8e04df75cb34..50f5c9492dbd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3240,10 +3240,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { + struct pmu *pmu = ctx->pmu; + WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); - swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + /* + * PMU specific parts of task perf context can require + * additional synchronization. As an example of such + * synchronization see implementation details of Intel + * LBR call stack data profiling; + */ + if (pmu->swap_task_ctx) + pmu->swap_task_ctx(ctx, next_ctx); + else + swap(ctx->task_ctx_data, next_ctx->task_ctx_data); /* * RCU_INIT_POINTER here is safe because we've not -- Gitee From a4a913c9c56fdb9274d04d2b0b2269728dae79f5 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:10 -0700 Subject: [PATCH 0832/2161] perf/x86/intel/lbr: Add the function pointers for LBR save and restore commit 799571bf38fc2b4b744fa448184b5915739b10fd upstream. The MSRs of Architectural LBR are different from previous model-specific LBR. Perf has to implement different functions to save and restore them. 
The function pointers for LBR save and restore are introduced. Perf should initialize the corresponding functions at boot time. The generic optimizations, e.g. avoiding restore LBR if no one else touched them, still apply for Architectural LBRs. The related codes are not moved to model-specific functions. Current model-specific LBR functions are set as default. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-5-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/core.c | 4 ++ arch/x86/events/intel/lbr.c | 79 ++++++++++++++++++++++-------------- arch/x86/events/perf_event.h | 6 +++ 3 files changed, 59 insertions(+), 30 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 217005472c29..fc61f01c8c84 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4258,6 +4258,8 @@ static __initconst const struct x86_pmu core_pmu = { .lbr_reset = intel_pmu_lbr_reset_64, .lbr_read = intel_pmu_lbr_read_64, + .lbr_save = intel_pmu_lbr_save, + .lbr_restore = intel_pmu_lbr_restore, }; static __initconst const struct x86_pmu intel_pmu = { @@ -4306,6 +4308,8 @@ static __initconst const struct x86_pmu intel_pmu = { .lbr_reset = intel_pmu_lbr_reset_64, .lbr_read = intel_pmu_lbr_read_64, + .lbr_save = intel_pmu_lbr_save, + .lbr_restore = intel_pmu_lbr_restore, }; static __init void intel_clovertown_quirk(void) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 6faefb59d0e6..3c7c3ff11407 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -323,31 +323,13 @@ static inline u64 rdlbr_to(unsigned int idx) return val; } -static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) +void intel_pmu_lbr_restore(void *ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx = ctx; int i; unsigned lbr_idx, mask; - u64 tos; - - if (task_ctx->lbr_callstack_users == 0 || - task_ctx->lbr_stack_state == LBR_NONE) { - intel_pmu_lbr_reset(); - return; - } - - tos = task_ctx->tos; - /* - * Does not restore the LBR registers, if - * - No one else touched them, and - * - Did not enter C6 - */ - if ((task_ctx == cpuc->last_task_ctx) && - (task_ctx->log_id == cpuc->last_log_id) && - rdlbr_from(tos)) { - task_ctx->lbr_stack_state = LBR_NONE; - return; - } + u64 tos = task_ctx->tos; mask = x86_pmu.lbr_nr - 1; for (i = 0; i < task_ctx->valid_lbrs; i++) { @@ -368,24 +350,48 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) } wrmsrl(x86_pmu.lbr_tos, tos); - task_ctx->lbr_stack_state = LBR_NONE; if (cpuc->lbr_select) wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } -static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - unsigned lbr_idx, mask; - u64 tos, from; - int i; + u64 tos; - if (task_ctx->lbr_callstack_users == 0) { + if (task_ctx->lbr_callstack_users == 0 || + task_ctx->lbr_stack_state == LBR_NONE) { + intel_pmu_lbr_reset(); + return; + } + + tos = task_ctx->tos; + /* + * Does not restore the LBR registers, if + * - No one else touched them, and + * - Did not enter C6 + */ + if ((task_ctx == cpuc->last_task_ctx) && + (task_ctx->log_id == cpuc->last_log_id) && + rdlbr_from(tos)) { task_ctx->lbr_stack_state = LBR_NONE; return; } + 
x86_pmu.lbr_restore(task_ctx); + + task_ctx->lbr_stack_state = LBR_NONE; +} + +void intel_pmu_lbr_save(void *ctx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx = ctx; + unsigned lbr_idx, mask; + u64 tos, from; + int i; + mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); for (i = 0; i < x86_pmu.lbr_nr; i++) { @@ -400,13 +406,26 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) } task_ctx->valid_lbrs = i; task_ctx->tos = tos; + + if (cpuc->lbr_select) + rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); +} + +static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (task_ctx->lbr_callstack_users == 0) { + task_ctx->lbr_stack_state = LBR_NONE; + return; + } + + x86_pmu.lbr_save(task_ctx); + task_ctx->lbr_stack_state = LBR_VALID; cpuc->last_task_ctx = task_ctx; cpuc->last_log_id = ++task_ctx->log_id; - - if (cpuc->lbr_select) - rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 3b70a3bba477..5dd8d1e95ede 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -740,6 +740,8 @@ struct x86_pmu { void (*lbr_reset)(void); void (*lbr_read)(struct cpu_hw_events *cpuc); + void (*lbr_save)(void *ctx); + void (*lbr_restore)(void *ctx); /* * Intel PT/LBR/BTS are exclusive @@ -1141,6 +1143,10 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc); void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc); +void intel_pmu_lbr_save(void *ctx); + +void intel_pmu_lbr_restore(void *ctx); + void intel_pmu_lbr_init_core(void); void intel_pmu_lbr_init_nhm(void); -- Gitee From 27b9d76ce7f8d647070bfbb2f6fc1ae6ed97b057 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:11 -0700 Subject: [PATCH 0833/2161] perf/x86/intel/lbr: Factor out a new struct for generic optimization commit 530bfff6480307d210734222a54d56af7f908957 upstream. To reduce the overhead of a context switch with LBR enabled, some generic optimizations were introduced, e.g. avoiding restore LBR if no one else touched them. The generic optimizations can also be used by Architecture LBR later. Currently, the fields for the generic optimizations are part of structure x86_perf_task_context, which will be deprecated by Architecture LBR. A new structure should be introduced for the common fields of generic optimization, which can be shared between Architecture LBR and model-specific LBR. Both 'valid_lbrs' and 'tos' are also used by the generic optimizations, but they are not moved into the new structure, because Architecture LBR is stack-like. The 'valid_lbrs' which records the index of the valid LBR is not required anymore. The TOS MSR will be removed. LBR registers may be cleared in the deep Cstate. If so, the generic optimizations should not be applied. Perf has to unconditionally restore the LBR registers. A generic function is required to detect the reset due to the deep Cstate. lbr_is_reset_in_cstate() is introduced. Currently, for the model-specific LBR, the TOS MSR is used to detect the reset. There will be another method introduced for Architecture LBR later. 
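To make the skip condition concrete, a small user-space model of the check sketched in this message follows. The field names echo the patch (last_task_ctx, log_id, lbr_stack_state), but the program is only an illustration under those assumptions, not kernel code:

    #include <stdbool.h>
    #include <stdio.h>

    enum { LBR_NONE, LBR_VALID };

    struct toy_task_ctx_opt {
        int lbr_callstack_users;
        int lbr_stack_state;
        int log_id;
    };

    struct toy_cpu {
        struct toy_task_ctx_opt *last_task_ctx;
        int last_log_id;
    };

    /* Stand-in for lbr_is_reset_in_cstate(); pretend the registers survived. */
    static bool reset_in_cstate(void)
    {
        return false;
    }

    static bool skip_restore(struct toy_cpu *cpu, struct toy_task_ctx_opt *ctx)
    {
        return cpu->last_task_ctx == ctx &&
               cpu->last_log_id == ctx->log_id &&
               !reset_in_cstate();
    }

    int main(void)
    {
        struct toy_task_ctx_opt t = {
            .lbr_callstack_users = 1,
            .lbr_stack_state = LBR_VALID,
            .log_id = 7,
        };
        struct toy_cpu cpu = { .last_task_ctx = &t, .last_log_id = 7 };

        printf("skip restore: %s\n", skip_restore(&cpu, &t) ? "yes" : "no");

        cpu.last_log_id = 8; /* someone else saved LBRs in between */
        printf("skip restore: %s\n", skip_restore(&cpu, &t) ? "yes" : "no");
        return 0;
    }

The restore is only skipped when the same context was the last one saved on this CPU, its log_id still matches, and the registers were not wiped by a deep C-state.
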
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-6-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/lbr.c | 38 ++++++++++++++++++++---------------- arch/x86/events/perf_event.h | 10 +++++++--- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 3c7c3ff11407..92a96ad55a6b 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -355,33 +355,37 @@ void intel_pmu_lbr_restore(void *ctx) wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } +static __always_inline bool +lbr_is_reset_in_cstate(struct x86_perf_task_context *task_ctx) +{ + return !rdlbr_from(task_ctx->tos); +} + static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - u64 tos; - if (task_ctx->lbr_callstack_users == 0 || - task_ctx->lbr_stack_state == LBR_NONE) { + if (task_ctx->opt.lbr_callstack_users == 0 || + task_ctx->opt.lbr_stack_state == LBR_NONE) { intel_pmu_lbr_reset(); return; } - tos = task_ctx->tos; /* * Does not restore the LBR registers, if * - No one else touched them, and - * - Did not enter C6 + * - Was not cleared in Cstate */ if ((task_ctx == cpuc->last_task_ctx) && - (task_ctx->log_id == cpuc->last_log_id) && - rdlbr_from(tos)) { - task_ctx->lbr_stack_state = LBR_NONE; + (task_ctx->opt.log_id == cpuc->last_log_id) && + !lbr_is_reset_in_cstate(task_ctx)) { + task_ctx->opt.lbr_stack_state = LBR_NONE; return; } x86_pmu.lbr_restore(task_ctx); - task_ctx->lbr_stack_state = LBR_NONE; + task_ctx->opt.lbr_stack_state = LBR_NONE; } void intel_pmu_lbr_save(void *ctx) @@ -415,17 +419,17 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (task_ctx->lbr_callstack_users == 0) { - task_ctx->lbr_stack_state = LBR_NONE; + if (task_ctx->opt.lbr_callstack_users == 0) { + task_ctx->opt.lbr_stack_state = LBR_NONE; return; } x86_pmu.lbr_save(task_ctx); - task_ctx->lbr_stack_state = LBR_VALID; + task_ctx->opt.lbr_stack_state = LBR_VALID; cpuc->last_task_ctx = task_ctx; - cpuc->last_log_id = ++task_ctx->log_id; + cpuc->last_log_id = ++task_ctx->opt.log_id; } void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, @@ -447,8 +451,8 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, if (!prev_ctx_data || !next_ctx_data) return; - swap(prev_ctx_data->lbr_callstack_users, - next_ctx_data->lbr_callstack_users); + swap(prev_ctx_data->opt.lbr_callstack_users, + next_ctx_data->opt.lbr_callstack_users); } void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) @@ -503,7 +507,7 @@ void intel_pmu_lbr_add(struct perf_event *event) if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) { task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users++; + task_ctx->opt.lbr_callstack_users++; } /* @@ -543,7 +547,7 @@ void intel_pmu_lbr_del(struct perf_event *event) if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) { task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users--; + task_ctx->opt.lbr_callstack_users--; } if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 5dd8d1e95ede..e0bdd832877b 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -787,6 +787,12 @@ 
struct x86_pmu { int (*aux_output_match) (struct perf_event *event); }; +struct x86_perf_task_context_opt { + int lbr_callstack_users; + int lbr_stack_state; + int log_id; +}; + struct x86_perf_task_context { u64 lbr_from[MAX_LBR_ENTRIES]; u64 lbr_to[MAX_LBR_ENTRIES]; @@ -794,9 +800,7 @@ struct x86_perf_task_context { u64 lbr_sel; int tos; int valid_lbrs; - int lbr_callstack_users; - int lbr_stack_state; - int log_id; + struct x86_perf_task_context_opt opt; }; #define x86_add_quirk(func_) \ -- Gitee From d4c4936c1cfc8158c8515886cddd2d529a344a25 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:12 -0700 Subject: [PATCH 0834/2161] perf/x86/intel/lbr: Use dynamic data structure for task_ctx commit f42be8651a7a9d5cb165e5d176fc0b09621b4f4d upstream. The type of task_ctx is hardcoded as struct x86_perf_task_context, which doesn't apply for Architecture LBR. For example, Architecture LBR doesn't have the TOS MSR. The number of LBR entries is variable. A new struct will be introduced for Architecture LBR. Perf has to determine the type of task_ctx at run time. The type of task_ctx pointer is changed to 'void *', which will be determined at run time. The generic LBR optimization can be shared between Architecture LBR and model-specific LBR. Both need to access the structure for the generic LBR optimization. A helper task_context_opt() is introduced to retrieve the pointer of the structure at run time. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-7-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/lbr.c | 59 ++++++++++++++++-------------------- arch/x86/events/perf_event.h | 7 ++++- 2 files changed, 32 insertions(+), 34 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 92a96ad55a6b..07ea60545254 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -355,18 +355,17 @@ void intel_pmu_lbr_restore(void *ctx) wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } -static __always_inline bool -lbr_is_reset_in_cstate(struct x86_perf_task_context *task_ctx) +static __always_inline bool lbr_is_reset_in_cstate(void *ctx) { - return !rdlbr_from(task_ctx->tos); + return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos); } -static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) +static void __intel_pmu_lbr_restore(void *ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (task_ctx->opt.lbr_callstack_users == 0 || - task_ctx->opt.lbr_stack_state == LBR_NONE) { + if (task_context_opt(ctx)->lbr_callstack_users == 0 || + task_context_opt(ctx)->lbr_stack_state == LBR_NONE) { intel_pmu_lbr_reset(); return; } @@ -376,16 +375,16 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) * - No one else touched them, and * - Was not cleared in Cstate */ - if ((task_ctx == cpuc->last_task_ctx) && - (task_ctx->opt.log_id == cpuc->last_log_id) && - !lbr_is_reset_in_cstate(task_ctx)) { - task_ctx->opt.lbr_stack_state = LBR_NONE; + if ((ctx == cpuc->last_task_ctx) && + (task_context_opt(ctx)->log_id == cpuc->last_log_id) && + !lbr_is_reset_in_cstate(ctx)) { + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; return; } - x86_pmu.lbr_restore(task_ctx); + x86_pmu.lbr_restore(ctx); - task_ctx->opt.lbr_stack_state = LBR_NONE; + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; } void intel_pmu_lbr_save(void *ctx) @@ -415,27 +414,27 @@ void 
intel_pmu_lbr_save(void *ctx) rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } -static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +static void __intel_pmu_lbr_save(void *ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (task_ctx->opt.lbr_callstack_users == 0) { - task_ctx->opt.lbr_stack_state = LBR_NONE; + if (task_context_opt(ctx)->lbr_callstack_users == 0) { + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; return; } - x86_pmu.lbr_save(task_ctx); + x86_pmu.lbr_save(ctx); - task_ctx->opt.lbr_stack_state = LBR_VALID; + task_context_opt(ctx)->lbr_stack_state = LBR_VALID; - cpuc->last_task_ctx = task_ctx; - cpuc->last_log_id = ++task_ctx->opt.log_id; + cpuc->last_task_ctx = ctx; + cpuc->last_log_id = ++task_context_opt(ctx)->log_id; } void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, struct perf_event_context *next) { - struct x86_perf_task_context *prev_ctx_data, *next_ctx_data; + void *prev_ctx_data, *next_ctx_data; swap(prev->task_ctx_data, next->task_ctx_data); @@ -451,14 +450,14 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, if (!prev_ctx_data || !next_ctx_data) return; - swap(prev_ctx_data->opt.lbr_callstack_users, - next_ctx_data->opt.lbr_callstack_users); + swap(task_context_opt(prev_ctx_data)->lbr_callstack_users, + task_context_opt(next_ctx_data)->lbr_callstack_users); } void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; + void *task_ctx; if (!cpuc->lbr_users) return; @@ -495,7 +494,6 @@ static inline bool branch_user_callstack(unsigned br_sel) void intel_pmu_lbr_add(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; @@ -505,10 +503,8 @@ void intel_pmu_lbr_add(struct perf_event *event) cpuc->br_sel = event->hw.branch_reg.reg; - if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->opt.lbr_callstack_users++; - } + if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) + task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++; /* * Request pmu::sched_task() callback, which will fire inside the @@ -539,16 +535,13 @@ void intel_pmu_lbr_add(struct perf_event *event) void intel_pmu_lbr_del(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; if (branch_user_callstack(cpuc->br_sel) && - event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->opt.lbr_callstack_users--; - } + event->ctx->task_ctx_data) + task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--; if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) cpuc->lbr_select = 0; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index e0bdd832877b..aed7e654b5c5 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -274,7 +274,7 @@ struct cpu_hw_events { struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; struct er_account *lbr_sel; u64 br_sel; - struct x86_perf_task_context *last_task_ctx; + void *last_task_ctx; int last_log_id; int lbr_select; @@ -851,6 +851,11 @@ static struct perf_pmu_events_ht_attr event_attr_##v = { \ struct pmu *x86_get_pmu(void); extern struct x86_pmu x86_pmu __read_mostly; +static __always_inline struct 
x86_perf_task_context_opt *task_context_opt(void *ctx) +{ + return &((struct x86_perf_task_context *)ctx)->opt; +} + static inline bool x86_pmu_has_lbr_callstack(void) { return x86_pmu.lbr_sel_map && -- Gitee From 3068204835adf819ff5ee295c018c565f6cf8614 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:13 -0700 Subject: [PATCH 0835/2161] x86/msr-index: Add bunch of MSRs for Arch LBR commit d6a162a41bfd2ff9ea4cbb338d3df6a3f9b7e89f upstream. Add Arch LBR related MSRs and the new LBR INFO bits in MSR-index. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-8-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/msr-index.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index cfd0cd50d07d..2ee1ea7e3284 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -149,7 +149,23 @@ #define LBR_INFO_MISPRED BIT_ULL(63) #define LBR_INFO_IN_TX BIT_ULL(62) #define LBR_INFO_ABORT BIT_ULL(61) +#define LBR_INFO_CYC_CNT_VALID BIT_ULL(60) #define LBR_INFO_CYCLES 0xffff +#define LBR_INFO_BR_TYPE_OFFSET 56 +#define LBR_INFO_BR_TYPE (0xfull << LBR_INFO_BR_TYPE_OFFSET) + +#define MSR_ARCH_LBR_CTL 0x000014ce +#define ARCH_LBR_CTL_LBREN BIT(0) +#define ARCH_LBR_CTL_CPL_OFFSET 1 +#define ARCH_LBR_CTL_CPL (0x3ull << ARCH_LBR_CTL_CPL_OFFSET) +#define ARCH_LBR_CTL_STACK_OFFSET 3 +#define ARCH_LBR_CTL_STACK (0x1ull << ARCH_LBR_CTL_STACK_OFFSET) +#define ARCH_LBR_CTL_FILTER_OFFSET 16 +#define ARCH_LBR_CTL_FILTER (0x7full << ARCH_LBR_CTL_FILTER_OFFSET) +#define MSR_ARCH_LBR_DEPTH 0x000014cf +#define MSR_ARCH_LBR_FROM_0 0x00001500 +#define MSR_ARCH_LBR_TO_0 0x00001600 +#define MSR_ARCH_LBR_INFO_0 0x00001200 #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_PEBS_DATA_CFG 0x000003f2 -- Gitee From 3fd3b8adcd5e9c2da30148fbbca7000c8993f886 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:14 -0700 Subject: [PATCH 0836/2161] perf/x86: Expose CPUID enumeration bits for arch LBR commit af6cf129706b2f79e12f97e62d977e7f653cdfd1 upstream. The LBR capabilities of Architecture LBR are retrieved from the CPUID enumeration once at boot time. The capabilities have to be saved for future usage. Several new fields are added into structure x86_pmu to indicate the capabilities. The fields will be used in the following patches. 
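For reference, the same enumeration can be read from user space. The sketch below assumes the Architectural LBR feature bit is CPUID.(EAX=07H,ECX=0):EDX[19] and that leaf 0x1C carries the fields laid out in the unions above; treat it as an illustration and check the SDM before relying on the exact bit positions:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* CPUID.(7,0):EDX[19] advertises Architectural LBR. */
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) ||
            !(edx & (1u << 19))) {
            puts("Architectural LBR not enumerated");
            return 0;
        }

        if (!__get_cpuid_count(0x1c, 0, &eax, &ebx, &ecx, &edx))
            return 1;

        printf("depth mask    : 0x%02x\n", eax & 0xff);
        printf("deep C reset  : %u\n", (eax >> 30) & 1);
        printf("LIP           : %u\n", (eax >> 31) & 1);
        printf("CPL filter    : %u\n", ebx & 1);
        printf("branch filter : %u\n", (ebx >> 1) & 1);
        printf("call stack    : %u\n", (ebx >> 2) & 1);
        printf("mispred bit   : %u\n", ecx & 1);
        printf("timed LBR     : %u\n", (ecx >> 1) & 1);
        printf("branch type   : %u\n", (ecx >> 2) & 1);
        return 0;
    }
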
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-9-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/perf_event.h | 13 ++++++++++ arch/x86/include/asm/perf_event.h | 40 +++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index aed7e654b5c5..d9c17e988427 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -738,6 +738,19 @@ struct x86_pmu { bool lbr_double_abort; /* duplicated lbr aborts */ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ + /* + * Intel Architectural LBR CPUID Enumeration + */ + unsigned int lbr_depth_mask:8; + unsigned int lbr_deep_c_reset:1; + unsigned int lbr_lip:1; + unsigned int lbr_cpl:1; + unsigned int lbr_filter:1; + unsigned int lbr_call_stack:1; + unsigned int lbr_mispred:1; + unsigned int lbr_timed_lbr:1; + unsigned int lbr_br_type:1; + void (*lbr_reset)(void); void (*lbr_read)(struct cpu_hw_events *cpuc); void (*lbr_save)(void *ctx); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 6a174de55b04..9f8c040177e6 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -142,6 +142,46 @@ union cpuid10_edx { unsigned int full; }; +/* + * Intel Architectural LBR CPUID detection/enumeration details: + */ +union cpuid28_eax { + struct { + /* Supported LBR depth values */ + unsigned int lbr_depth_mask:8; + unsigned int reserved:22; + /* Deep C-state Reset */ + unsigned int lbr_deep_c_reset:1; + /* IP values contain LIP */ + unsigned int lbr_lip:1; + } split; + unsigned int full; +}; + +union cpuid28_ebx { + struct { + /* CPL Filtering Supported */ + unsigned int lbr_cpl:1; + /* Branch Filtering Supported */ + unsigned int lbr_filter:1; + /* Call-stack Mode Supported */ + unsigned int lbr_call_stack:1; + } split; + unsigned int full; +}; + +union cpuid28_ecx { + struct { + /* Mispredict Bit Supported */ + unsigned int lbr_mispred:1; + /* Timed LBRs Supported */ + unsigned int lbr_timed_lbr:1; + /* Branch Type Field Supported */ + unsigned int lbr_br_type:1; + } split; + unsigned int full; +}; + struct x86_pmu_capability { int version; int num_counters_gp; -- Gitee From 4d5dc9878424c40b38e139cdcfaf57100e1c2023 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:15 -0700 Subject: [PATCH 0837/2161] perf/x86/intel/lbr: Support LBR_CTL commit 49d8184f2036ff5b8d1eea3d61bac8b23420eca7 upstream. An IA32_LBR_CTL is introduced for Architecture LBR to enable and config LBR registers to replace the previous LBR_SELECT. All the related members in struct cpu_hw_events and struct x86_pmu have to be renamed. Some new macros are added to reflect the layout of LBR_CTL. The mapping from PERF_SAMPLE_BRANCH_* to the corresponding bits in LBR_CTL MSR is saved in lbr_ctl_map now, which is not a const value. The value relies on the CPUID enumeration. For the previous model-specific LBR, most of the bits in LBR_SELECT operate in the suppressed mode. For the bits in LBR_CTL, the polarity is inverted. For the previous model-specific LBR format 5 (LBR_FORMAT_INFO), if the NO_CYCLES and NO_FLAGS type are set, the flag LBR_NO_INFO will be set to avoid the unnecessary LBR_INFO MSR read. Although Architecture LBR also has a dedicated LBR_INFO MSR, perf doesn't need to check and set the flag LBR_NO_INFO. 
For Architecture LBR, XSAVES instruction will be used as the default way to read the LBR MSRs all together. The overhead which the flag tries to avoid doesn't exist anymore. Dropping the flag can save the extra check for the flag in the lbr_read() later, and make the code cleaner. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-10-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/lbr.c | 43 ++++++++++++++++++++++++++++++++++++ arch/x86/events/perf_event.h | 15 ++++++++++--- 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 07ea60545254..e82b1d364dab 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -132,6 +132,44 @@ enum { X86_BR_IRQ |\ X86_BR_INT) +/* + * Intel LBR_CTL bits + * + * Hardware branch filter for Arch LBR + */ +#define ARCH_LBR_KERNEL_BIT 1 /* capture at ring0 */ +#define ARCH_LBR_USER_BIT 2 /* capture at ring > 0 */ +#define ARCH_LBR_CALL_STACK_BIT 3 /* enable call stack */ +#define ARCH_LBR_JCC_BIT 16 /* capture conditional branches */ +#define ARCH_LBR_REL_JMP_BIT 17 /* capture relative jumps */ +#define ARCH_LBR_IND_JMP_BIT 18 /* capture indirect jumps */ +#define ARCH_LBR_REL_CALL_BIT 19 /* capture relative calls */ +#define ARCH_LBR_IND_CALL_BIT 20 /* capture indirect calls */ +#define ARCH_LBR_RETURN_BIT 21 /* capture near returns */ +#define ARCH_LBR_OTHER_BRANCH_BIT 22 /* capture other branches */ + +#define ARCH_LBR_KERNEL (1ULL << ARCH_LBR_KERNEL_BIT) +#define ARCH_LBR_USER (1ULL << ARCH_LBR_USER_BIT) +#define ARCH_LBR_CALL_STACK (1ULL << ARCH_LBR_CALL_STACK_BIT) +#define ARCH_LBR_JCC (1ULL << ARCH_LBR_JCC_BIT) +#define ARCH_LBR_REL_JMP (1ULL << ARCH_LBR_REL_JMP_BIT) +#define ARCH_LBR_IND_JMP (1ULL << ARCH_LBR_IND_JMP_BIT) +#define ARCH_LBR_REL_CALL (1ULL << ARCH_LBR_REL_CALL_BIT) +#define ARCH_LBR_IND_CALL (1ULL << ARCH_LBR_IND_CALL_BIT) +#define ARCH_LBR_RETURN (1ULL << ARCH_LBR_RETURN_BIT) +#define ARCH_LBR_OTHER_BRANCH (1ULL << ARCH_LBR_OTHER_BRANCH_BIT) + +#define ARCH_LBR_ANY \ + (ARCH_LBR_JCC |\ + ARCH_LBR_REL_JMP |\ + ARCH_LBR_IND_JMP |\ + ARCH_LBR_REL_CALL |\ + ARCH_LBR_IND_CALL |\ + ARCH_LBR_RETURN |\ + ARCH_LBR_OTHER_BRANCH) + +#define ARCH_LBR_CTL_MASK 0x7f000e + static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); /* @@ -818,6 +856,11 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) reg = &event->hw.branch_reg; reg->idx = EXTRA_REG_LBR; + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { + reg->config = mask; + return 0; + } + /* * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate * in suppress mode. 
So LBR_SELECT should be set to diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index d9c17e988427..f67b37d353ec 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -272,7 +272,10 @@ struct cpu_hw_events { int lbr_pebs_users; struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; - struct er_account *lbr_sel; + union { + struct er_account *lbr_sel; + struct er_account *lbr_ctl; + }; u64 br_sel; void *last_task_ctx; int last_log_id; @@ -733,8 +736,14 @@ struct x86_pmu { */ unsigned int lbr_tos, lbr_from, lbr_to, lbr_nr; /* LBR base regs and size */ - u64 lbr_sel_mask; /* LBR_SELECT valid bits */ - const int *lbr_sel_map; /* lbr_select mappings */ + union { + u64 lbr_sel_mask; /* LBR_SELECT valid bits */ + u64 lbr_ctl_mask; /* LBR_CTL valid bits */ + }; + union { + const int *lbr_sel_map; /* lbr_select mappings */ + int *lbr_ctl_map; /* LBR_CTL mappings */ + }; bool lbr_double_abort; /* duplicated lbr aborts */ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ -- Gitee From d92326f14bb48364304543014fc8a03951623c97 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 27 Jan 2020 08:53:54 -0800 Subject: [PATCH 0838/2161] perf/core: Add new branch sample type for HW index of raw branch records commit bbfd5e4fab63703375eafaf241a0c696024a59e1 upstream. The low level index is the index in the underlying hardware buffer of the most recently captured taken branch which is always saved in branch_entries[0]. It is very useful for reconstructing the call stack. For example, in Intel LBR call stack mode, the depth of reconstructed LBR call stack limits to the number of LBR registers. With the low level index information, perf tool may stitch the stacks of two samples. The reconstructed LBR call stack can break the HW limitation. Add a new branch sample type to retrieve low level index of raw branch records. The low level index is between -1 (unknown) and max depth which can be retrieved in /sys/devices/cpu/caps/branches. Only when the new branch sample type is set, the low level index information is dumped into the PERF_SAMPLE_BRANCH_STACK output. Perf tool should check the attr.branch_sample_type, and apply the corresponding format for PERF_SAMPLE_BRANCH_STACK samples. Otherwise, some user case may be broken. For example, users may parse a perf.data, which include the new branch sample type, with an old version perf tool (without the check). Users probably get incorrect information without any warning. 
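A hypothetical consumer-side parser for the new layout might look like the sketch below. It fabricates a sample buffer instead of reading a real ring buffer, and assumes a uapi perf_event.h that already defines PERF_SAMPLE_BRANCH_HW_INDEX:

    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void parse_branch_stack(const uint64_t *p, uint64_t branch_sample_type)
    {
        uint64_t nr = *p++;
        uint64_t hw_idx = (uint64_t)-1;

        /* hw_idx is present only when the new bit was requested. */
        if (branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX)
            hw_idx = *p++;

        printf("nr=%llu hw_idx=%lld\n",
               (unsigned long long)nr, (long long)hw_idx);

        for (uint64_t i = 0; i < nr; i++) {
            struct perf_branch_entry e;

            memcpy(&e, p, sizeof(e));
            p += sizeof(e) / sizeof(uint64_t);
            printf("  %#llx -> %#llx\n",
                   (unsigned long long)e.from,
                   (unsigned long long)e.to);
        }
    }

    int main(void)
    {
        /* Fabricated sample: nr=1, hw_idx=0, then one entry (from, to, flags). */
        uint64_t buf[5] = { 1, 0, 0x400000, 0x401000, 0 };

        parse_branch_stack(buf, PERF_SAMPLE_BRANCH_HW_INDEX);
        return 0;
    }
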
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200127165355.27495-2-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/powerpc/perf/core-book3s.c | 1 + arch/x86/events/intel/lbr.c | 3 +++ include/linux/perf_event.h | 12 ++++++++++++ include/uapi/linux/perf_event.h | 8 +++++++- kernel/events/core.c | 10 ++++++++++ 5 files changed, 33 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 6f013e418834..58de6cec907b 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -529,6 +529,7 @@ static void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) } } cpuhw->bhrb_stack.nr = u_index; + cpuhw->bhrb_stack.hw_idx = -1ULL; return; } diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index e82b1d364dab..2337c0cd9f27 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -645,6 +645,7 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; + cpuc->lbr_stack.hw_idx = -1ULL; } /* @@ -740,6 +741,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) out++; } cpuc->lbr_stack.nr = out; + cpuc->lbr_stack.hw_idx = -1ULL; } void intel_pmu_lbr_read(void) @@ -1183,6 +1185,7 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) int i; cpuc->lbr_stack.nr = x86_pmu.lbr_nr; + cpuc->lbr_stack.hw_idx = -1ULL; for (i = 0; i < x86_pmu.lbr_nr; i++) { u64 info = lbr->lbr[i].info; struct perf_branch_entry *e = &cpuc->lbr_entries[i]; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2ac3350a547e..62ee6d9a0db5 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -92,14 +92,26 @@ struct perf_raw_record { /* * branch stack layout: * nr: number of taken branches stored in entries[] + * hw_idx: The low level index of raw branch records + * for the most recent branch. + * -1ULL means invalid/unknown. * * Note that nr can vary from sample to sample * branches (to, from) are stored from most recent * to least recent, i.e., entries[0] contains the most * recent branch. + * The entries[] is an abstraction of raw branch records, + * which may not be stored in age order in HW, e.g. Intel LBR. + * The hw_idx is to expose the low level index of raw + * branch record for the most recent branch aka entries[0]. + * The hw_idx index is between -1 (unknown) and max depth, + * which can be retrieved in /sys/devices/cpu/caps/branches. + * For the architectures whose raw branch records are + * already stored in age order, the hw_idx should be 0. 
*/ struct perf_branch_stack { __u64 nr; + __u64 hw_idx; struct perf_branch_entry entries[0]; }; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index ceccd980ffcf..507eddef8715 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -180,6 +180,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT = 16, /* save branch type */ + PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -207,6 +209,8 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_TYPE_SAVE = 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; @@ -849,7 +853,9 @@ enum perf_event_type { * char data[size];}&& PERF_SAMPLE_RAW * * { u64 nr; - * { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK + * { u64 hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX + * { u64 from, to, flags } lbr[nr]; + * } && PERF_SAMPLE_BRANCH_STACK * * { u64 abi; # enum perf_sample_regs_abi * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER diff --git a/kernel/events/core.c b/kernel/events/core.c index 50f5c9492dbd..387cbca76d98 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6405,6 +6405,11 @@ static void perf_output_read(struct perf_output_handle *handle, perf_output_read_one(handle, event, enabled, running); } +static inline bool perf_sample_save_hw_index(struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; +} + void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, @@ -6493,6 +6498,8 @@ void perf_output_sample(struct perf_output_handle *handle, * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); + if (perf_sample_save_hw_index(event)) + perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); } else { /* @@ -6682,6 +6689,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ if (data->br_stack) { + if (perf_sample_save_hw_index(event)) + size += sizeof(u64); + size += data->br_stack->nr * sizeof(struct perf_branch_entry); } -- Gitee From fb6c77d5fcd836314604e0c2637144bf275065c3 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 27 Jan 2020 08:53:55 -0800 Subject: [PATCH 0839/2161] perf/x86/intel: Output LBR TOS information correctly commit db278b90c326ce5895be09b6171f5ff3df1e3cca upstream. For Intel LBR, the LBR Top-of-Stack (TOS) information is the HW index of raw branch record for the most recent branch. For non-adaptive PEBS and non-PEBS, the TOS information can be directly retrieved from TOS MSR read in intel_pmu_lbr_read(). For adaptive PEBS, the LBR information stored in PEBS record doesn't include the TOS information. For single PEBS, TOS can be directly read from MSR, because the PMI is triggered immediately after PEBS is written. TOS MSR is still unchanged. For large PEBS, TOS MSR has stale value. Set -1ULL to indicate that the TOS information is not available. 
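On the tool side this means hw_idx is only usable when it is not -1ULL; for example, stitching LBR call stacks across samples has to be skipped otherwise. A toy check, invented purely for illustration:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* -1ULL in hw_idx means the kernel could not provide the TOS. */
    static bool can_stitch_lbr(uint64_t prev_hw_idx, uint64_t cur_hw_idx)
    {
        const uint64_t unknown = (uint64_t)-1;

        return prev_hw_idx != unknown && cur_hw_idx != unknown;
    }

    int main(void)
    {
        printf("%d\n", can_stitch_lbr(3, (uint64_t)-1)); /* 0: e.g. large PEBS */
        printf("%d\n", can_stitch_lbr(3, 2));            /* 1: TOS known */
        return 0;
    }
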
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200127165355.27495-3-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/lbr.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 2337c0cd9f27..77425624752c 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -645,7 +645,7 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; - cpuc->lbr_stack.hw_idx = -1ULL; + cpuc->lbr_stack.hw_idx = tos; } /* @@ -741,7 +741,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) out++; } cpuc->lbr_stack.nr = out; - cpuc->lbr_stack.hw_idx = -1ULL; + cpuc->lbr_stack.hw_idx = tos; } void intel_pmu_lbr_read(void) @@ -1185,7 +1185,13 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) int i; cpuc->lbr_stack.nr = x86_pmu.lbr_nr; - cpuc->lbr_stack.hw_idx = -1ULL; + + /* Cannot get TOS for large PEBS */ + if (cpuc->n_pebs == cpuc->n_large_pebs) + cpuc->lbr_stack.hw_idx = -1ULL; + else + cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos(); + for (i = 0; i < x86_pmu.lbr_nr; i++) { u64 info = lbr->lbr[i].info; struct perf_branch_entry *e = &cpuc->lbr_entries[i]; -- Gitee From a71a44200f5ebf38ee8da9bdaf7bbb654e22ae4f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:16 -0700 Subject: [PATCH 0840/2161] perf/x86/intel/lbr: Unify the stored format of LBR information commit 5624986dc61b81a77fb6136bc232593483d1c254 upstream. Current LBR information in the structure x86_perf_task_context is stored in a different format from the PEBS LBR record and Architecture LBR, which prevents the sharing of the common codes. Use the format of the PEBS LBR record as a unified format. Use a generic name lbr_entry to replace pebs_lbr_entry. 
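The unified record is just three 64-bit words per branch. A standalone mock of decoding one such record, reusing the LBR_INFO_MISPRED/LBR_INFO_CYCLES bit positions from the msr-index.h hunk earlier in this series (illustration only, not kernel code):

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors the unified layout: one record per branch, as in the
     * PEBS LBR group and the Arch LBR MSR triplets. */
    struct lbr_entry {
        uint64_t from, to, info;
    };

    #define INFO_MISPRED (1ULL << 63)
    #define INFO_CYCLES  0xffffULL

    int main(void)
    {
        struct lbr_entry e = {
            .from = 0x401000,
            .to   = 0x402000,
            .info = INFO_MISPRED | 42, /* mispredicted, 42 cycles */
        };

        printf("%#llx -> %#llx mispred=%d cycles=%llu\n",
               (unsigned long long)e.from,
               (unsigned long long)e.to,
               !!(e.info & INFO_MISPRED),
               (unsigned long long)(e.info & INFO_CYCLES));
        return 0;
    }
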
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-11-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/ds.c | 6 +++--- arch/x86/events/intel/lbr.c | 20 ++++++++++---------- arch/x86/events/perf_event.h | 6 ++---- arch/x86/include/asm/perf_event.h | 6 +----- 4 files changed, 16 insertions(+), 22 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 5965d341350c..384ccdb7e572 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -951,7 +951,7 @@ static void adaptive_pebs_record_size_update(void) if (pebs_data_cfg & PEBS_DATACFG_XMMS) sz += sizeof(struct pebs_xmm); if (pebs_data_cfg & PEBS_DATACFG_LBRS) - sz += x86_pmu.lbr_nr * sizeof(struct pebs_lbr_entry); + sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry); cpuc->pebs_record_size = sz; } @@ -1592,10 +1592,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } if (format_size & PEBS_DATACFG_LBRS) { - struct pebs_lbr *lbr = next_record; + struct lbr_entry *lbr = next_record; int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT) & 0xff) + 1; - next_record = next_record + num_lbr*sizeof(struct pebs_lbr_entry); + next_record = next_record + num_lbr * sizeof(struct lbr_entry); if (has_branch_stack(event)) { intel_pmu_store_pebs_lbrs(lbr); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 77425624752c..b8baaf15c5f4 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -372,11 +372,11 @@ void intel_pmu_lbr_restore(void *ctx) mask = x86_pmu.lbr_nr - 1; for (i = 0; i < task_ctx->valid_lbrs; i++) { lbr_idx = (tos - i) & mask; - wrlbr_from(lbr_idx, task_ctx->lbr_from[i]); - wrlbr_to (lbr_idx, task_ctx->lbr_to[i]); + wrlbr_from(lbr_idx, task_ctx->lbr[i].from); + wrlbr_to(lbr_idx, task_ctx->lbr[i].to); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); + wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info); } for (; i < x86_pmu.lbr_nr; i++) { @@ -440,10 +440,10 @@ void intel_pmu_lbr_save(void *ctx) from = rdlbr_from(lbr_idx); if (!from) break; - task_ctx->lbr_from[i] = from; - task_ctx->lbr_to[i] = rdlbr_to(lbr_idx); + task_ctx->lbr[i].from = from; + task_ctx->lbr[i].to = rdlbr_to(lbr_idx); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); + rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info); } task_ctx->valid_lbrs = i; task_ctx->tos = tos; @@ -1179,7 +1179,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) } } -void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) +void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int i; @@ -1193,11 +1193,11 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos(); for (i = 0; i < x86_pmu.lbr_nr; i++) { - u64 info = lbr->lbr[i].info; + u64 info = lbr[i].info; struct perf_branch_entry *e = &cpuc->lbr_entries[i]; - e->from = lbr->lbr[i].from; - e->to = lbr->lbr[i].to; + e->from = lbr[i].from; + e->to = lbr[i].to; e->mispred = !!(info & LBR_INFO_MISPRED); e->predicted = !(info & LBR_INFO_MISPRED); e->in_tx = !!(info & LBR_INFO_IN_TX); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index f67b37d353ec..40279e0e2ec7 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -816,13 
+816,11 @@ struct x86_perf_task_context_opt { }; struct x86_perf_task_context { - u64 lbr_from[MAX_LBR_ENTRIES]; - u64 lbr_to[MAX_LBR_ENTRIES]; - u64 lbr_info[MAX_LBR_ENTRIES]; u64 lbr_sel; int tos; int valid_lbrs; struct x86_perf_task_context_opt opt; + struct lbr_entry lbr[MAX_LBR_ENTRIES]; }; #define x86_add_quirk(func_) \ @@ -1143,7 +1141,7 @@ void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); void intel_pmu_auto_reload_read(struct perf_event *event); -void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr); +void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); void intel_ds_init(void); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 9f8c040177e6..eda69992bd82 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -353,14 +353,10 @@ struct pebs_xmm { u64 xmm[16*2]; /* two entries for each register */ }; -struct pebs_lbr_entry { +struct lbr_entry { u64 from, to, info; }; -struct pebs_lbr { - struct pebs_lbr_entry lbr[0]; /* Variable length */ -}; - /* * IBS cpuid feature detection */ -- Gitee From ef159bbc3b5a1b095d5b0f52971921ebe3c6b8f8 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:17 -0700 Subject: [PATCH 0841/2161] perf/x86/intel/lbr: Mark the {rd,wr}lbr_{to,from} wrappers __always_inline commit 020d91e5f32da4f4b929b3a6e680135fd526107c upstream. The {rd,wr}lbr_{to,from} wrappers are invoked in hot paths, e.g. context switch and NMI handler. They should be always inline to achieve better performance. However, the CONFIG_OPTIMIZE_INLINING allows the compiler to uninline functions marked 'inline'. Mark the {rd,wr}lbr_{to,from} wrappers as __always_inline to force inline the wrappers. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-12-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/lbr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index b8baaf15c5f4..21f4f071f2c0 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -332,18 +332,18 @@ static u64 lbr_from_signext_quirk_rd(u64 val) return val; } -static inline void wrlbr_from(unsigned int idx, u64 val) +static __always_inline void wrlbr_from(unsigned int idx, u64 val) { val = lbr_from_signext_quirk_wr(val); wrmsrl(x86_pmu.lbr_from + idx, val); } -static inline void wrlbr_to(unsigned int idx, u64 val) +static __always_inline void wrlbr_to(unsigned int idx, u64 val) { wrmsrl(x86_pmu.lbr_to + idx, val); } -static inline u64 rdlbr_from(unsigned int idx) +static __always_inline u64 rdlbr_from(unsigned int idx) { u64 val; @@ -352,7 +352,7 @@ static inline u64 rdlbr_from(unsigned int idx) return lbr_from_signext_quirk_rd(val); } -static inline u64 rdlbr_to(unsigned int idx) +static __always_inline u64 rdlbr_to(unsigned int idx) { u64 val; -- Gitee From cbdf10527d2baa420c8baddaa6fcdfb572410747 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:18 -0700 Subject: [PATCH 0842/2161] perf/x86/intel/lbr: Factor out rdlbr_all() and wrlbr_all() commit fda1f99f34a8f0975086bcfef34da865009995c1 upstream. The previous model-specific LBR and Architecture LBR (legacy way) use a similar method to save/restore the LBR information, which directly accesses the LBR registers. 
The codes which read/write a set of LBR registers can be shared between them. Factor out two functions which are used to read/write a set of LBR registers. Add lbr_info into structure x86_pmu, and use it to replace the hardcoded LBR INFO MSR, because the LBR INFO MSR address of the previous model-specific LBR is different from Architecture LBR. The MSR address should be assigned at boot time. For now, only Sky Lake and later platforms have the LBR INFO MSR. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-13-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/lbr.c | 66 +++++++++++++++++++++++++++--------- arch/x86/events/perf_event.h | 2 +- 2 files changed, 51 insertions(+), 17 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 21f4f071f2c0..d3d129c7d7ef 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -237,7 +237,7 @@ void intel_pmu_lbr_reset_64(void) wrmsrl(x86_pmu.lbr_from + i, 0); wrmsrl(x86_pmu.lbr_to + i, 0); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + i, 0); + wrmsrl(x86_pmu.lbr_info + i, 0); } } @@ -343,6 +343,11 @@ static __always_inline void wrlbr_to(unsigned int idx, u64 val) wrmsrl(x86_pmu.lbr_to + idx, val); } +static __always_inline void wrlbr_info(unsigned int idx, u64 val) +{ + wrmsrl(x86_pmu.lbr_info + idx, val); +} + static __always_inline u64 rdlbr_from(unsigned int idx) { u64 val; @@ -361,8 +366,44 @@ static __always_inline u64 rdlbr_to(unsigned int idx) return val; } +static __always_inline u64 rdlbr_info(unsigned int idx) +{ + u64 val; + + rdmsrl(x86_pmu.lbr_info + idx, val); + + return val; +} + +static inline void +wrlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) +{ + wrlbr_from(idx, lbr->from); + wrlbr_to(idx, lbr->to); + if (need_info) + wrlbr_info(idx, lbr->info); +} + +static inline bool +rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) +{ + u64 from = rdlbr_from(idx); + + /* Don't read invalid entry */ + if (!from) + return false; + + lbr->from = from; + lbr->to = rdlbr_to(idx); + if (need_info) + lbr->info = rdlbr_info(idx); + + return true; +} + void intel_pmu_lbr_restore(void *ctx) { + bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx = ctx; int i; @@ -372,11 +413,7 @@ void intel_pmu_lbr_restore(void *ctx) mask = x86_pmu.lbr_nr - 1; for (i = 0; i < task_ctx->valid_lbrs; i++) { lbr_idx = (tos - i) & mask; - wrlbr_from(lbr_idx, task_ctx->lbr[i].from); - wrlbr_to(lbr_idx, task_ctx->lbr[i].to); - - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info); + wrlbr_all(&task_ctx->lbr[i], lbr_idx, need_info); } for (; i < x86_pmu.lbr_nr; i++) { @@ -384,7 +421,7 @@ void intel_pmu_lbr_restore(void *ctx) wrlbr_from(lbr_idx, 0); wrlbr_to(lbr_idx, 0); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0); + wrlbr_info(lbr_idx, 0); } wrmsrl(x86_pmu.lbr_tos, tos); @@ -427,23 +464,19 @@ static void __intel_pmu_lbr_restore(void *ctx) void intel_pmu_lbr_save(void *ctx) { + bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx = ctx; unsigned lbr_idx, mask; - u64 tos, from; + u64 tos; int i; mask = 
x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); for (i = 0; i < x86_pmu.lbr_nr; i++) { lbr_idx = (tos - i) & mask; - from = rdlbr_from(lbr_idx); - if (!from) + if (!rdlbr_all(&task_ctx->lbr[i], lbr_idx, need_info)) break; - task_ctx->lbr[i].from = from; - task_ctx->lbr[i].to = rdlbr_to(lbr_idx); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info); } task_ctx->valid_lbrs = i; task_ctx->tos = tos; @@ -689,7 +722,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (lbr_format == LBR_FORMAT_INFO && need_info) { u64 info; - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); + info = rdlbr_info(lbr_idx); mis = !!(info & LBR_INFO_MISPRED); pred = !mis; in_tx = !!(info & LBR_INFO_IN_TX); @@ -1336,6 +1369,7 @@ __init void intel_pmu_lbr_init_skl(void) x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; x86_pmu.lbr_to = MSR_LBR_NHM_TO; + x86_pmu.lbr_info = MSR_LBR_INFO_0; x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; @@ -1421,7 +1455,7 @@ int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) lbr->nr = x86_pmu.lbr_nr; lbr->from = x86_pmu.lbr_from; lbr->to = x86_pmu.lbr_to; - lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? MSR_LBR_INFO_0 : 0; + lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? x86_pmu.lbr_info : 0; return 0; } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 40279e0e2ec7..9d17dc8374ca 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -735,7 +735,7 @@ struct x86_pmu { * Intel LBR */ unsigned int lbr_tos, lbr_from, lbr_to, - lbr_nr; /* LBR base regs and size */ + lbr_info, lbr_nr; /* LBR base regs and size */ union { u64 lbr_sel_mask; /* LBR_SELECT valid bits */ u64 lbr_ctl_mask; /* LBR_CTL valid bits */ -- Gitee From fcc0588523a9007bb8e6fd668a736921d1fcda12 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:19 -0700 Subject: [PATCH 0843/2161] perf/x86/intel/lbr: Factor out intel_pmu_store_lbr commit 631618a0dca31dc23dcce38cf345c6139bd8a1e9 upstream. The way to store the LBR information from a PEBS LBR record can be reused in Architecture LBR, because - The LBR information is stored like a stack. Entry 0 is always the youngest branch. - The layout of the LBR INFO MSR is similar. The LBR information may be retrieved from either the LBR registers (non-PEBS event) or a buffer (PEBS event). Extend rdlbr_*() to support both methods. Explicitly check the invalid entry (0s), which can avoid unnecessary MSR access if using a non-PEBS event. For a PEBS event, the check should slightly improve the performance as well. The invalid entries are cut. The intel_pmu_lbr_filter() doesn't need to check and filter them out. Cannot share the function with current model-specific LBR read, because the direction of the LBR growth is opposite. 
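The shape of that shared helper can be modeled in a few lines of user-space C: a NULL buffer stands in for "read the MSRs", a non-NULL buffer for the PEBS record, and copying stops at the first all-zero entry. Everything here (fake_msrs, store_lbr) is invented for illustration:

    #include <stdint.h>
    #include <stdio.h>

    struct lbr_entry { uint64_t from, to, info; };

    /* Pretend bank of LBR MSRs; remaining entries are 0s == invalid. */
    static const struct lbr_entry fake_msrs[8] = {
        { 0x1000, 0x2000, 0 },
        { 0x3000, 0x4000, 0 },
    };

    static uint64_t rd_from(int idx, const struct lbr_entry *buf)
    {
        return buf ? buf[idx].from : fake_msrs[idx].from;
    }

    static uint64_t rd_to(int idx, const struct lbr_entry *buf)
    {
        return buf ? buf[idx].to : fake_msrs[idx].to;
    }

    /* Copy entries until the first invalid (zero 'from') record. */
    static int store_lbr(struct lbr_entry *out, int nr,
                         const struct lbr_entry *buf)
    {
        int i;

        for (i = 0; i < nr; i++) {
            uint64_t from = rd_from(i, buf);

            if (!from)
                break;
            out[i].from = from;
            out[i].to   = rd_to(i, buf);
            out[i].info = 0;
        }
        return i;
    }

    int main(void)
    {
        struct lbr_entry out[8];

        printf("stored %d entries\n", store_lbr(out, 8, NULL));
        return 0;
    }
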
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-14-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/lbr.c | 82 +++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 26 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index d3d129c7d7ef..0d7a85903964 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -348,28 +348,37 @@ static __always_inline void wrlbr_info(unsigned int idx, u64 val) wrmsrl(x86_pmu.lbr_info + idx, val); } -static __always_inline u64 rdlbr_from(unsigned int idx) +static __always_inline u64 rdlbr_from(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->from; + rdmsrl(x86_pmu.lbr_from + idx, val); return lbr_from_signext_quirk_rd(val); } -static __always_inline u64 rdlbr_to(unsigned int idx) +static __always_inline u64 rdlbr_to(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->to; + rdmsrl(x86_pmu.lbr_to + idx, val); return val; } -static __always_inline u64 rdlbr_info(unsigned int idx) +static __always_inline u64 rdlbr_info(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->info; + rdmsrl(x86_pmu.lbr_info + idx, val); return val; @@ -387,16 +396,16 @@ wrlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) static inline bool rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) { - u64 from = rdlbr_from(idx); + u64 from = rdlbr_from(idx, NULL); /* Don't read invalid entry */ if (!from) return false; lbr->from = from; - lbr->to = rdlbr_to(idx); + lbr->to = rdlbr_to(idx, NULL); if (need_info) - lbr->info = rdlbr_info(idx); + lbr->info = rdlbr_info(idx, NULL); return true; } @@ -432,7 +441,7 @@ void intel_pmu_lbr_restore(void *ctx) static __always_inline bool lbr_is_reset_in_cstate(void *ctx) { - return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos); + return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos, NULL); } static void __intel_pmu_lbr_restore(void *ctx) @@ -709,8 +718,8 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) u16 cycles = 0; int lbr_flags = lbr_desc[lbr_format]; - from = rdlbr_from(lbr_idx); - to = rdlbr_to(lbr_idx); + from = rdlbr_from(lbr_idx, NULL); + to = rdlbr_to(lbr_idx, NULL); /* * Read LBR call stack entries @@ -722,7 +731,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (lbr_format == LBR_FORMAT_INFO && need_info) { u64 info; - info = rdlbr_info(lbr_idx); + info = rdlbr_info(lbr_idx, NULL); mis = !!(info & LBR_INFO_MISPRED); pred = !mis; in_tx = !!(info & LBR_INFO_IN_TX); @@ -777,6 +786,42 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_stack.hw_idx = tos; } +static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, + struct lbr_entry *entries) +{ + struct perf_branch_entry *e; + struct lbr_entry *lbr; + u64 from, to, info; + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + lbr = entries ? &entries[i] : NULL; + e = &cpuc->lbr_entries[i]; + + from = rdlbr_from(i, lbr); + /* + * Read LBR entries until invalid entry (0s) is detected. 
+ */ + if (!from) + break; + + to = rdlbr_to(i, lbr); + info = rdlbr_info(i, lbr); + + e->from = from; + e->to = to; + e->mispred = !!(info & LBR_INFO_MISPRED); + e->predicted = !(info & LBR_INFO_MISPRED); + e->in_tx = !!(info & LBR_INFO_IN_TX); + e->abort = !!(info & LBR_INFO_ABORT); + e->cycles = info & LBR_INFO_CYCLES; + e->type = 0; + e->reserved = 0; + } + + cpuc->lbr_stack.nr = i; +} + void intel_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1215,9 +1260,6 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int i; - - cpuc->lbr_stack.nr = x86_pmu.lbr_nr; /* Cannot get TOS for large PEBS */ if (cpuc->n_pebs == cpuc->n_large_pebs) @@ -1225,19 +1267,7 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr) else cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos(); - for (i = 0; i < x86_pmu.lbr_nr; i++) { - u64 info = lbr[i].info; - struct perf_branch_entry *e = &cpuc->lbr_entries[i]; - - e->from = lbr[i].from; - e->to = lbr[i].to; - e->mispred = !!(info & LBR_INFO_MISPRED); - e->predicted = !(info & LBR_INFO_MISPRED); - e->in_tx = !!(info & LBR_INFO_IN_TX); - e->abort = !!(info & LBR_INFO_ABORT); - e->cycles = info & LBR_INFO_CYCLES; - e->reserved = 0; - } + intel_pmu_store_lbr(cpuc, lbr); intel_pmu_lbr_filter(cpuc); } -- Gitee From 57bee862cc500beb33a0c549636777d26909aeeb Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:20 -0700 Subject: [PATCH 0844/2161] perf/x86/intel/lbr: Support Architectural LBR commit 47125db27e47e9d44c878bf8925aa057824bb0d5 upstream. Last Branch Records (LBR) enables recording of software path history by logging taken branches and other control flows within architectural registers now. Intel CPUs have had model-specific LBR for quite some time, but this evolves them into an architectural feature now. The main improvements of Architectural LBR implemented includes: - Linux kernel can support the LBR features without knowing the model number of the current CPU. - Architectural LBR capabilities can be enumerated by CPUID. The lbr_ctl_map is based on the CPUID Enumeration. - The possible LBR depth can be retrieved from CPUID enumeration. The max value is written to the new MSR_ARCH_LBR_DEPTH as the number of LBR entries. - A new IA32_LBR_CTL MSR is introduced to enable and configure LBRs, which replaces the IA32_DEBUGCTL[bit 0] and the LBR_SELECT MSR. - Each LBR record or entry is still comprised of three MSRs, IA32_LBR_x_FROM_IP, IA32_LBR_x_TO_IP and IA32_LBR_x_TO_IP. But they become the architectural MSRs. - Architectural LBR is stack-like now. Entry 0 is always the youngest branch, entry 1 the next youngest... The TOS MSR has been removed. The way to enable/disable Architectural LBR is similar to the previous model-specific LBR. __intel_pmu_lbr_enable/disable() can be reused, but some modifications are required, which include: - MSR_ARCH_LBR_CTL is used to enable and configure the Architectural LBR. - When checking the value of the IA32_DEBUGCTL MSR, ignoring the DEBUGCTLMSR_LBR (bit 0) for Architectural LBR, which has no meaning and always return 0. - The FREEZE_LBRS_ON_PMI has to be explicitly set/clear, because MSR_IA32_DEBUGCTLMSR is not touched in __intel_pmu_lbr_disable() for Architectural LBR. - Only MSR_ARCH_LBR_CTL is cleared in __intel_pmu_lbr_disable() for Architectural LBR. Some Architectural LBR dedicated functions are implemented to reset/read/save/restore LBR. 
- For reset, writing to the ARCH_LBR_DEPTH MSR clears all Arch LBR entries, which is a lot faster and can improve the context switch latency. - For read, the branch type information can be retrieved from the MSR_ARCH_LBR_INFO_*. But it's not fully compatible due to OTHER_BRANCH type. The software decoding is still required for the OTHER_BRANCH case. LBR records are stored in the age order as well. Reuse intel_pmu_store_lbr(). Check the CPUID enumeration before accessing the corresponding bits in LBR_INFO. - For save/restore, applying the fast reset (writing ARCH_LBR_DEPTH). Reading 'lbr_from' of entry 0 instead of the TOS MSR to check if the LBR registers are reset in the deep C-state. If 'the deep C-state reset' bit is not set in CPUID enumeration, ignoring the check. XSAVE support for Architectural LBR will be implemented later. The number of LBR entries cannot be hardcoded anymore, which should be retrieved from CPUID enumeration. A new structure x86_perf_task_context_arch_lbr is introduced for Architectural LBR. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-15-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/core.c | 3 + arch/x86/events/intel/lbr.c | 251 +++++++++++++++++++++++++++++++++-- arch/x86/events/perf_event.h | 10 ++ 3 files changed, 253 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index fc61f01c8c84..10183b167dfc 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4953,6 +4953,9 @@ __init int intel_pmu_init(void) x86_pmu.lbr_read = intel_pmu_lbr_read_32; } + if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) + intel_pmu_arch_lbr_init(); + intel_ds_init(); x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 0d7a85903964..e4e249a78451 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -172,6 +172,14 @@ enum { static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); +static __always_inline bool is_lbr_call_stack_bit_set(u64 config) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return !!(config & ARCH_LBR_CALL_STACK); + + return !!(config & LBR_CALL_STACK); +} + /* * We only support LBR implementations that have FREEZE_LBRS_ON_PMI * otherwise it becomes near impossible to get a reliable stack. @@ -195,27 +203,40 @@ static void __intel_pmu_lbr_enable(bool pmi) */ if (cpuc->lbr_sel) lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask; - if (!pmi && cpuc->lbr_sel) + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && !pmi && cpuc->lbr_sel) wrmsrl(MSR_LBR_SELECT, lbr_select); rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); orig_debugctl = debugctl; - debugctl |= DEBUGCTLMSR_LBR; + + if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) + debugctl |= DEBUGCTLMSR_LBR; /* * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions * may cause superfluous increase/decrease of LBR_TOS. 
*/ - if (!(lbr_select & LBR_CALL_STACK)) + if (is_lbr_call_stack_bit_set(lbr_select)) + debugctl &= ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + else debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + if (orig_debugctl != debugctl) wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + wrmsrl(MSR_ARCH_LBR_CTL, lbr_select | ARCH_LBR_CTL_LBREN); } static void __intel_pmu_lbr_disable(void) { u64 debugctl; + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { + wrmsrl(MSR_ARCH_LBR_CTL, 0); + return; + } + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); @@ -241,6 +262,12 @@ void intel_pmu_lbr_reset_64(void) } } +static void intel_pmu_arch_lbr_reset(void) +{ + /* Write to ARCH_LBR_DEPTH MSR, all LBR entries are reset to 0 */ + wrmsrl(MSR_ARCH_LBR_DEPTH, x86_pmu.lbr_nr); +} + void intel_pmu_lbr_reset(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -439,8 +466,28 @@ void intel_pmu_lbr_restore(void *ctx) wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } +static void intel_pmu_arch_lbr_restore(void *ctx) +{ + struct x86_perf_task_context_arch_lbr *task_ctx = ctx; + struct lbr_entry *entries = task_ctx->entries; + int i; + + /* Fast reset the LBRs before restore if the call stack is not full. */ + if (!entries[x86_pmu.lbr_nr - 1].from) + intel_pmu_arch_lbr_reset(); + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!entries[i].from) + break; + wrlbr_all(&entries[i], i, true); + } +} + static __always_inline bool lbr_is_reset_in_cstate(void *ctx) { + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return x86_pmu.lbr_deep_c_reset && !rdlbr_from(0, NULL); + return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos, NULL); } @@ -494,6 +541,22 @@ void intel_pmu_lbr_save(void *ctx) rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } +static void intel_pmu_arch_lbr_save(void *ctx) +{ + struct x86_perf_task_context_arch_lbr *task_ctx = ctx; + struct lbr_entry *entries = task_ctx->entries; + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!rdlbr_all(&entries[i], i, true)) + break; + } + + /* LBR call stack is not full. Reset is required in restore. 
*/ + if (i < x86_pmu.lbr_nr) + entries[x86_pmu.lbr_nr - 1].from = 0; +} + static void __intel_pmu_lbr_save(void *ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -786,6 +849,39 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_stack.hw_idx = tos; } +static __always_inline int get_lbr_br_type(u64 info) +{ + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) || !x86_pmu.lbr_br_type) + return 0; + + return (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET; +} + +static __always_inline bool get_lbr_mispred(u64 info) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred) + return 0; + + return !!(info & LBR_INFO_MISPRED); +} + +static __always_inline bool get_lbr_predicted(u64 info) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred) + return 0; + + return !(info & LBR_INFO_MISPRED); +} + +static __always_inline bool get_lbr_cycles(u64 info) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + !(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID)) + return 0; + + return info & LBR_INFO_CYCLES; +} + static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, struct lbr_entry *entries) { @@ -810,18 +906,23 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, e->from = from; e->to = to; - e->mispred = !!(info & LBR_INFO_MISPRED); - e->predicted = !(info & LBR_INFO_MISPRED); + e->mispred = get_lbr_mispred(info); + e->predicted = get_lbr_predicted(info); e->in_tx = !!(info & LBR_INFO_IN_TX); e->abort = !!(info & LBR_INFO_ABORT); - e->cycles = info & LBR_INFO_CYCLES; - e->type = 0; + e->cycles = get_lbr_cycles(info); + e->type = get_lbr_br_type(info); e->reserved = 0; } cpuc->lbr_stack.nr = i; } +static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc) +{ + intel_pmu_store_lbr(cpuc, NULL); +} + void intel_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1197,6 +1298,27 @@ common_branch_type(int type) return PERF_BR_UNKNOWN; } +enum { + ARCH_LBR_BR_TYPE_JCC = 0, + ARCH_LBR_BR_TYPE_NEAR_IND_JMP = 1, + ARCH_LBR_BR_TYPE_NEAR_REL_JMP = 2, + ARCH_LBR_BR_TYPE_NEAR_IND_CALL = 3, + ARCH_LBR_BR_TYPE_NEAR_REL_CALL = 4, + ARCH_LBR_BR_TYPE_NEAR_RET = 5, + ARCH_LBR_BR_TYPE_KNOWN_MAX = ARCH_LBR_BR_TYPE_NEAR_RET, + + ARCH_LBR_BR_TYPE_MAP_MAX = 16, +}; + +static const int arch_lbr_br_type_map[ARCH_LBR_BR_TYPE_MAP_MAX] = { + [ARCH_LBR_BR_TYPE_JCC] = X86_BR_JCC, + [ARCH_LBR_BR_TYPE_NEAR_IND_JMP] = X86_BR_IND_JMP, + [ARCH_LBR_BR_TYPE_NEAR_REL_JMP] = X86_BR_JMP, + [ARCH_LBR_BR_TYPE_NEAR_IND_CALL] = X86_BR_IND_CALL, + [ARCH_LBR_BR_TYPE_NEAR_REL_CALL] = X86_BR_CALL, + [ARCH_LBR_BR_TYPE_NEAR_RET] = X86_BR_RET, +}; + /* * implement actual branch filter based on user demand. * Hardware may not exactly satisfy that request, thus @@ -1209,7 +1331,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) { u64 from, to; int br_sel = cpuc->br_sel; - int i, j, type; + int i, j, type, to_plm; bool compress = false; /* if sampling all branches, then nothing to filter */ @@ -1221,8 +1343,19 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) from = cpuc->lbr_entries[i].from; to = cpuc->lbr_entries[i].to; + type = cpuc->lbr_entries[i].type; - type = branch_type(from, to, cpuc->lbr_entries[i].abort); + /* + * Parse the branch type recorded in LBR_x_INFO MSR. + * Doesn't support OTHER_BRANCH decoding for now. + * OTHER_BRANCH branch type still rely on software decoding. + */ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + type <= ARCH_LBR_BR_TYPE_KNOWN_MAX) { + to_plm = kernel_ip(to) ? 
X86_BR_KERNEL : X86_BR_USER; + type = arch_lbr_br_type_map[type] | to_plm; + } else + type = branch_type(from, to, cpuc->lbr_entries[i].abort); if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { if (cpuc->lbr_entries[i].in_tx) type |= X86_BR_IN_TX; @@ -1261,8 +1394,9 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - /* Cannot get TOS for large PEBS */ - if (cpuc->n_pebs == cpuc->n_large_pebs) + /* Cannot get TOS for large PEBS and Arch LBR */ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) || + (cpuc->n_pebs == cpuc->n_large_pebs)) cpuc->lbr_stack.hw_idx = -1ULL; else cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos(); @@ -1324,6 +1458,26 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, }; +static int arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = ARCH_LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = ARCH_LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = ARCH_LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = ARCH_LBR_RETURN | + ARCH_LBR_OTHER_BRANCH, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = ARCH_LBR_REL_CALL | + ARCH_LBR_IND_CALL | + ARCH_LBR_OTHER_BRANCH, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = ARCH_LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = ARCH_LBR_JCC, + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = ARCH_LBR_REL_CALL | + ARCH_LBR_IND_CALL | + ARCH_LBR_RETURN | + ARCH_LBR_CALL_STACK, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = ARCH_LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = ARCH_LBR_REL_CALL, +}; + /* core */ void __init intel_pmu_lbr_init_core(void) { @@ -1471,6 +1625,81 @@ void intel_pmu_lbr_init_knl(void) x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS; } +void __init intel_pmu_arch_lbr_init(void) +{ + union cpuid28_eax eax; + union cpuid28_ebx ebx; + union cpuid28_ecx ecx; + unsigned int unused_edx; + u64 lbr_nr; + + /* Arch LBR Capabilities */ + cpuid(28, &eax.full, &ebx.full, &ecx.full, &unused_edx); + + lbr_nr = fls(eax.split.lbr_depth_mask) * 8; + if (!lbr_nr) + goto clear_arch_lbr; + + /* Apply the max depth of Arch LBR */ + if (wrmsrl_safe(MSR_ARCH_LBR_DEPTH, lbr_nr)) + goto clear_arch_lbr; + + x86_pmu.lbr_depth_mask = eax.split.lbr_depth_mask; + x86_pmu.lbr_deep_c_reset = eax.split.lbr_deep_c_reset; + x86_pmu.lbr_lip = eax.split.lbr_lip; + x86_pmu.lbr_cpl = ebx.split.lbr_cpl; + x86_pmu.lbr_filter = ebx.split.lbr_filter; + x86_pmu.lbr_call_stack = ebx.split.lbr_call_stack; + x86_pmu.lbr_mispred = ecx.split.lbr_mispred; + x86_pmu.lbr_timed_lbr = ecx.split.lbr_timed_lbr; + x86_pmu.lbr_br_type = ecx.split.lbr_br_type; + x86_pmu.lbr_nr = lbr_nr; + + x86_get_pmu()->task_ctx_size = sizeof(struct x86_perf_task_context_arch_lbr) + + lbr_nr * sizeof(struct lbr_entry); + + x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0; + x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0; + x86_pmu.lbr_info = MSR_ARCH_LBR_INFO_0; + + /* LBR callstack requires both CPL and Branch Filtering support */ + if (!x86_pmu.lbr_cpl || + !x86_pmu.lbr_filter || + !x86_pmu.lbr_call_stack) + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP; + + if (!x86_pmu.lbr_cpl) { + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_NOT_SUPP; + } else if (!x86_pmu.lbr_filter) { + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_NOT_SUPP; + 
arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_NOT_SUPP; + } + + x86_pmu.lbr_ctl_mask = ARCH_LBR_CTL_MASK; + x86_pmu.lbr_ctl_map = arch_lbr_ctl_map; + + if (!x86_pmu.lbr_cpl && !x86_pmu.lbr_filter) + x86_pmu.lbr_ctl_map = NULL; + + x86_pmu.lbr_reset = intel_pmu_arch_lbr_reset; + x86_pmu.lbr_read = intel_pmu_arch_lbr_read; + x86_pmu.lbr_save = intel_pmu_arch_lbr_save; + x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore; + + pr_cont("Architectural LBR, "); + + return; + +clear_arch_lbr: + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_ARCH_LBR); +} + /** * x86_perf_get_lbr - get the LBR records information * diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 9d17dc8374ca..eeeba8ee71ea 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -823,6 +823,11 @@ struct x86_perf_task_context { struct lbr_entry lbr[MAX_LBR_ENTRIES]; }; +struct x86_perf_task_context_arch_lbr { + struct x86_perf_task_context_opt opt; + struct lbr_entry entries[]; +}; + #define x86_add_quirk(func_) \ do { \ static struct x86_pmu_quirk __quirk __initdata = { \ @@ -873,6 +878,9 @@ extern struct x86_pmu x86_pmu __read_mostly; static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) { + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return &((struct x86_perf_task_context_arch_lbr *)ctx)->opt; + return &((struct x86_perf_task_context *)ctx)->opt; } @@ -1192,6 +1200,8 @@ void intel_pmu_lbr_init_skl(void); void intel_pmu_lbr_init_knl(void); +void intel_pmu_arch_lbr_init(void); + void intel_pmu_pebs_data_source_nhm(void); void intel_pmu_pebs_data_source_skl(bool pmem); -- Gitee From 87269607dbe8c2a575451550d63ae79469a1aa9b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:21 -0700 Subject: [PATCH 0845/2161] perf/core: Factor out functions to allocate/free the task_ctx_data commit ff9ff926889dd8026b4ba55266a010c27f68604f upstream. The method to allocate/free the task_ctx_data is going to be changed in the following patch. Currently, the task_ctx_data is allocated/freed in several different places. To avoid repeatedly modifying the same codes in several different places, alloc_task_ctx_data() and free_task_ctx_data() are factored out to allocate/free the task_ctx_data. The modification only needs to be applied once. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-16-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/events/core.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 387cbca76d98..fd05e15b9e82 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1181,12 +1181,22 @@ static void get_ctx(struct perf_event_context *ctx) refcount_inc(&ctx->refcount); } +static void *alloc_task_ctx_data(struct pmu *pmu) +{ + return kzalloc(pmu->task_ctx_size, GFP_KERNEL); +} + +static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) +{ + kfree(task_ctx_data); +} + static void free_ctx(struct rcu_head *head) { struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); - kfree(ctx->task_ctx_data); + free_task_ctx_data(ctx->pmu, ctx->task_ctx_data); kfree(ctx); } @@ -4299,7 +4309,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, goto errout; if (event->attach_state & PERF_ATTACH_TASK_DATA) { - task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); + task_ctx_data = alloc_task_ctx_data(pmu); if (!task_ctx_data) { err = -ENOMEM; goto errout; @@ -4357,11 +4367,11 @@ find_get_context(struct pmu *pmu, struct task_struct *task, } } - kfree(task_ctx_data); + free_task_ctx_data(pmu, task_ctx_data); return ctx; errout: - kfree(task_ctx_data); + free_task_ctx_data(pmu, task_ctx_data); return ERR_PTR(err); } @@ -11875,8 +11885,7 @@ inherit_event(struct perf_event *parent_event, !child_ctx->task_ctx_data) { struct pmu *pmu = child_event->pmu; - child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size, - GFP_KERNEL); + child_ctx->task_ctx_data = alloc_task_ctx_data(pmu); if (!child_ctx->task_ctx_data) { free_event(child_event); return ERR_PTR(-ENOMEM); -- Gitee From 2b27edaec3183ad25a5e171f7bed4b57a0d5a14b Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Wed, 23 Oct 2019 10:11:54 +0300 Subject: [PATCH 0846/2161] perf/x86: Install platform specific ->swap_task_ctx() adapter commit a44399703b4893de4eadb970867fd5efd4461514 upstream. Bridge perf core and x86 swap_task_ctx() method calls. 
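For reference, the generic side is expected to invoke this hook on the context-switch optimization path, where two equivalent task contexts can simply exchange their PMU specific data instead of saving and restoring it. A minimal sketch of that caller (assuming the upstream perf_event_context_sched_out() flow; the core-side code is not part of this patch):

    if (context_equiv(ctx, next_ctx)) {
            ...
            if (pmu->swap_task_ctx)
                    pmu->swap_task_ctx(ctx, next_ctx);
            else
                    swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
            ...
    }

The x86 adapter added below simply forwards that call to x86_pmu.swap_task_ctx when the PMU implements it.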
Signed-off-by: Alexey Budankov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lkml.kernel.org/r/b157e97d-32c3-aeaf-13ba-47350c677906@linux.intel.com Signed-off-by: Ingo Molnar Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 94794c41b285..9921d0995071 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2354,6 +2354,13 @@ static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) x86_pmu.sched_task(ctx, sched_in); } +static void x86_pmu_swap_task_ctx(struct perf_event_context *prev, + struct perf_event_context *next) +{ + if (x86_pmu.swap_task_ctx) + x86_pmu.swap_task_ctx(prev, next); +} + void perf_check_microcode(void) { if (x86_pmu.check_microcode) @@ -2408,6 +2415,7 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, .sched_task = x86_pmu_sched_task, .task_ctx_size = sizeof(struct x86_perf_task_context), + .swap_task_ctx = x86_pmu_swap_task_ctx, .check_period = x86_pmu_check_period, .aux_output_match = x86_pmu_aux_output_match, -- Gitee From 14e1694d2e6dbce70ee2f8e06e559a842f36b82e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:22 -0700 Subject: [PATCH 0847/2161] perf/core: Use kmem_cache to allocate the PMU specific data commit 217c2a633ebb36f1cc6d249f4ef2e4a809d46818 upstream. Currently, the PMU specific data task_ctx_data is allocated by the function kzalloc() in the perf generic code. When there is no specific alignment requirement for the task_ctx_data, the method works well for now. However, there will be a problem once a specific alignment requirement is introduced in future features, e.g., the Architecture LBR XSAVE feature requires 64-byte alignment. If the specific alignment requirement is not fulfilled, the XSAVE family of instructions will fail to save/restore the xstate to/from the task_ctx_data. The function kzalloc() itself only guarantees a natural alignment. A new method to allocate the task_ctx_data has to be introduced, which has to meet the requirements as below: - must be a generic method can be used by different architectures, because the allocation of the task_ctx_data is implemented in the perf generic code; - must be an alignment-guarantee method (The alignment requirement is not changed after the boot); - must be able to allocate/free a buffer (smaller than a page size) dynamically; - should not cause extra CPU overhead or space overhead. Several options were considered as below: - One option is to allocate a larger buffer for task_ctx_data. E.g., ptr = kmalloc(size + alignment, GFP_KERNEL); ptr &= ~(alignment - 1); This option causes space overhead. - Another option is to allocate the task_ctx_data in the PMU specific code. To do so, several function pointers have to be added. As a result, both the generic structure and the PMU specific structure will become bigger. Besides, extra function calls are added when allocating/freeing the buffer. This option will increase both the space overhead and CPU overhead. - The third option is to use a kmem_cache to allocate a buffer for the task_ctx_data. The kmem_cache can be created with a specific alignment requirement by the PMU at boot time. 
A new pointer for kmem_cache has to be added in the generic struct pmu, which would be used to dynamically allocate a buffer for the task_ctx_data at run time. Although the new pointer is added to the struct pmu, the existing variable task_ctx_size is not required anymore. The size of the generic structure is kept the same. The third option which meets all the aforementioned requirements is used to replace kzalloc() for the PMU specific data allocation. A later patch will remove the kzalloc() method and the related variables. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-17-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/perf_event.h | 5 +++++ kernel/events/core.c | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 62ee6d9a0db5..95374af89a00 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -430,6 +430,11 @@ struct pmu { */ size_t task_ctx_size; + /* + * Kmem cache of PMU specific data + */ + struct kmem_cache *task_ctx_cache; + /* * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data) * can be synchronized using this function. See Intel LBR callstack support diff --git a/kernel/events/core.c b/kernel/events/core.c index fd05e15b9e82..67f98de8ebd4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1183,12 +1183,18 @@ static void get_ctx(struct perf_event_context *ctx) static void *alloc_task_ctx_data(struct pmu *pmu) { + if (pmu->task_ctx_cache) + return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); + return kzalloc(pmu->task_ctx_size, GFP_KERNEL); } static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) { - kfree(task_ctx_data); + if (pmu->task_ctx_cache && task_ctx_data) + kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); + else + kfree(task_ctx_data); } static void free_ctx(struct rcu_head *head) -- Gitee From a5766a2f28e992cfd4d1c83e459f63a75bed3ab8 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:23 -0700 Subject: [PATCH 0848/2161] perf/x86/intel/lbr: Create kmem_cache for the LBR context data commit 33cad284497cf40f55ad6029c06011de3538ebed upstream. A new kmem_cache method is introduced to allocate the PMU specific data task_ctx_data, which requires the PMU specific code to create a kmem_cache. Currently, the task_ctx_data is only used by the Intel LBR call stack feature, which is introduced since Haswell. The kmem_cache should be only created for Haswell and later platforms. There is no alignment requirement for the existing platforms. 
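The point of routing the allocation through a named cache rather than kzalloc() is that kmem_cache_create() takes an explicit alignment argument, which matters once 64-byte aligned buffers are needed for XSAVES. A minimal usage sketch (the 64-byte alignment is illustrative; this patch still passes 0):

    struct kmem_cache *cache;
    void *buf;

    /* name, object size, alignment, flags, constructor */
    cache = kmem_cache_create("x86_lbr", size, 64, 0, NULL);
    if (cache) {
            buf = kmem_cache_zalloc(cache, GFP_KERNEL); /* zeroed, 64-byte aligned */
            ...
            kmem_cache_free(cache, buf);
    }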
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-18-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/lbr.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index e4e249a78451..e784c1d485ca 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1531,9 +1531,17 @@ void __init intel_pmu_lbr_init_snb(void) */ } +static inline struct kmem_cache * +create_lbr_kmem_cache(size_t size, size_t align) +{ + return kmem_cache_create("x86_lbr", size, align, 0, NULL); +} + /* haswell */ void intel_pmu_lbr_init_hsw(void) { + size_t size = sizeof(struct x86_perf_task_context); + x86_pmu.lbr_nr = 16; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; @@ -1542,6 +1550,8 @@ void intel_pmu_lbr_init_hsw(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); + if (lbr_from_signext_quirk_needed()) static_branch_enable(&lbr_from_quirk_key); } @@ -1549,6 +1559,8 @@ void intel_pmu_lbr_init_hsw(void) /* skylake */ __init void intel_pmu_lbr_init_skl(void) { + size_t size = sizeof(struct x86_perf_task_context); + x86_pmu.lbr_nr = 32; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; @@ -1558,6 +1570,8 @@ __init void intel_pmu_lbr_init_skl(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); + /* * SW branch filter usage: * - support syscall, sysret capture. @@ -1631,6 +1645,7 @@ void __init intel_pmu_arch_lbr_init(void) union cpuid28_ebx ebx; union cpuid28_ecx ecx; unsigned int unused_edx; + size_t size; u64 lbr_nr; /* Arch LBR Capabilities */ @@ -1655,8 +1670,10 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_br_type = ecx.split.lbr_br_type; x86_pmu.lbr_nr = lbr_nr; - x86_get_pmu()->task_ctx_size = sizeof(struct x86_perf_task_context_arch_lbr) + - lbr_nr * sizeof(struct lbr_entry); + size = sizeof(struct x86_perf_task_context_arch_lbr) + + lbr_nr * sizeof(struct lbr_entry); + x86_get_pmu()->task_ctx_size = size; + x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0; x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0; -- Gitee From ae6b38c09cb3475774030dca48dfd1b3bf0f931c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:24 -0700 Subject: [PATCH 0849/2161] perf/x86: Remove task_ctx_size commit 5a09928d339f3cf0973991ddc3a2798825c84c99 upstream. A new kmem_cache method has replaced the kzalloc() to allocate the PMU specific data. The task_ctx_size is not required anymore. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-19-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/core.c | 1 - arch/x86/events/intel/lbr.c | 1 - include/linux/perf_event.h | 4 ---- kernel/events/core.c | 4 +--- 4 files changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 9921d0995071..10b90528ab06 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2414,7 +2414,6 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, .sched_task = x86_pmu_sched_task, - .task_ctx_size = sizeof(struct x86_perf_task_context), .swap_task_ctx = x86_pmu_swap_task_ctx, .check_period = x86_pmu_check_period, diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index e784c1d485ca..3ad528996d1c 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1672,7 +1672,6 @@ void __init intel_pmu_arch_lbr_init(void) size = sizeof(struct x86_perf_task_context_arch_lbr) + lbr_nr * sizeof(struct lbr_entry); - x86_get_pmu()->task_ctx_size = size; x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 95374af89a00..2f73da9936cf 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -425,10 +425,6 @@ struct pmu { */ void (*sched_task) (struct perf_event_context *ctx, bool sched_in); - /* - * PMU specific data size - */ - size_t task_ctx_size; /* * Kmem cache of PMU specific data diff --git a/kernel/events/core.c b/kernel/events/core.c index 67f98de8ebd4..c07e264d170e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1186,15 +1186,13 @@ static void *alloc_task_ctx_data(struct pmu *pmu) if (pmu->task_ctx_cache) return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); - return kzalloc(pmu->task_ctx_size, GFP_KERNEL); + return NULL; } static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) { if (pmu->task_ctx_cache && task_ctx_data) kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); - else - kfree(task_ctx_data); } static void free_ctx(struct rcu_head *head) -- Gitee From 6f900eedb0fd75d330a244bd6c7d06adb7b493a3 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:28 -0700 Subject: [PATCH 0850/2161] perf/x86/intel/lbr: Support XSAVES/XRSTORS for LBR context switch (part2) commit ce711ea3cab9ad325d849792d442848e553095b8 upstream. In the LBR call stack mode, LBR information is used to reconstruct a call stack. To get the complete call stack, perf has to save/restore all LBR registers during a context switch. Due to a large number of the LBR registers, this process causes a high CPU overhead. To reduce the CPU overhead during a context switch, use the XSAVES/XRSTORS instructions. Every XSAVE area must follow a canonical format: the legacy region, an XSAVE header and the extended region. Although the LBR information is only kept in the extended region, a space for the legacy region and XSAVE header is still required. Add a new dedicated structure for LBR XSAVES support. Before enabling XSAVES support, the size of the LBR state has to be sanity checked, because: - the size of the software structure is calculated from the max number of the LBR depth, which is enumerated by the CPUID leaf for Arch LBR. The size of the LBR state is enumerated by the CPUID leaf for XSAVE support of Arch LBR. 
If the values from the two CPUID leaves are not consistent, it may trigger a buffer overflow. For example, a hypervisor may unconsciously set inconsistent values for the two emulated CPUID. - unlike other state components, the size of an LBR state depends on the max number of LBRs, which may vary from generation to generation. Expose the function xfeature_size() for the sanity check. The LBR XSAVES support will be disabled if the size of the LBR state enumerated by CPUID doesn't match with the size of the software structure. The XSAVE instruction requires 64-byte alignment for state buffers. A new macro is added to reflect the alignment requirement. A 64-byte aligned kmem_cache is created for architecture LBR. Currently, the structure for each state component is maintained in fpu/types.h. The structure for the new LBR state component should be maintained in the same place. Move structure lbr_entry to fpu/types.h as well for broader sharing. Add dedicated lbr_save/lbr_restore functions for LBR XSAVES support, which invokes the corresponding xstate helpers to XSAVES/XRSTORS LBR information at the context switch when the call stack mode is enabled. Since the XSAVES/XRSTORS instructions will be eventually invoked, the dedicated functions is named with '_xsaves'/'_xrstors' postfix. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-23-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/lbr.c | 79 +++++++++++++++++++++++++++++-- arch/x86/events/perf_event.h | 21 ++++++++ arch/x86/include/asm/fpu/types.h | 20 ++++++++ arch/x86/include/asm/perf_event.h | 4 -- 4 files changed, 115 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 3ad528996d1c..1d2be4692650 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -483,6 +483,17 @@ static void intel_pmu_arch_lbr_restore(void *ctx) } } +/* + * Restore the Architecture LBR state from the xsave area in the perf + * context data for the task via the XRSTORS instruction. + */ +static void intel_pmu_arch_lbr_xrstors(void *ctx) +{ + struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx; + + xrstors(&task_ctx->xsave, XFEATURE_MASK_LBR); +} + static __always_inline bool lbr_is_reset_in_cstate(void *ctx) { if (static_cpu_has(X86_FEATURE_ARCH_LBR)) @@ -557,6 +568,17 @@ static void intel_pmu_arch_lbr_save(void *ctx) entries[x86_pmu.lbr_nr - 1].from = 0; } +/* + * Save the Architecture LBR state to the xsave area in the perf + * context data for the task via the XSAVES instruction. + */ +static void intel_pmu_arch_lbr_xsaves(void *ctx) +{ + struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx; + + xsaves(&task_ctx->xsave, XFEATURE_MASK_LBR); +} + static void __intel_pmu_lbr_save(void *ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1639,12 +1661,40 @@ void intel_pmu_lbr_init_knl(void) x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS; } +/* + * LBR state size is variable based on the max number of registers. + * This calculates the expected state size, which should match + * what the hardware enumerates for the size of XFEATURE_LBR. 
+ */ +static inline unsigned int get_lbr_state_size(void) +{ + return sizeof(struct arch_lbr_state) + + x86_pmu.lbr_nr * sizeof(struct lbr_entry); +} + +static bool is_arch_lbr_xsave_available(void) +{ + if (!boot_cpu_has(X86_FEATURE_XSAVES)) + return false; + + /* + * Check the LBR state with the corresponding software structure. + * Disable LBR XSAVES support if the size doesn't match. + */ + if (WARN_ON(xfeature_size(XFEATURE_LBR) != get_lbr_state_size())) + return false; + + return true; +} + void __init intel_pmu_arch_lbr_init(void) { + struct pmu *pmu = x86_get_pmu(); union cpuid28_eax eax; union cpuid28_ebx ebx; union cpuid28_ecx ecx; unsigned int unused_edx; + bool arch_lbr_xsave; size_t size; u64 lbr_nr; @@ -1670,9 +1720,22 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_br_type = ecx.split.lbr_br_type; x86_pmu.lbr_nr = lbr_nr; - size = sizeof(struct x86_perf_task_context_arch_lbr) + - lbr_nr * sizeof(struct lbr_entry); - x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); + + arch_lbr_xsave = is_arch_lbr_xsave_available(); + if (arch_lbr_xsave) { + size = sizeof(struct x86_perf_task_context_arch_lbr_xsave) + + get_lbr_state_size(); + pmu->task_ctx_cache = create_lbr_kmem_cache(size, + XSAVE_ALIGNMENT); + } + + if (!pmu->task_ctx_cache) { + arch_lbr_xsave = false; + + size = sizeof(struct x86_perf_task_context_arch_lbr) + + lbr_nr * sizeof(struct lbr_entry); + pmu->task_ctx_cache = create_lbr_kmem_cache(size, 0); + } x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0; x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0; @@ -1705,8 +1768,14 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_reset = intel_pmu_arch_lbr_reset; x86_pmu.lbr_read = intel_pmu_arch_lbr_read; - x86_pmu.lbr_save = intel_pmu_arch_lbr_save; - x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore; + if (arch_lbr_xsave) { + x86_pmu.lbr_save = intel_pmu_arch_lbr_xsaves; + x86_pmu.lbr_restore = intel_pmu_arch_lbr_xrstors; + pr_cont("XSAVE "); + } else { + x86_pmu.lbr_save = intel_pmu_arch_lbr_save; + x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore; + } pr_cont("Architectural LBR, "); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index eeeba8ee71ea..9131c46b9374 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -828,6 +828,27 @@ struct x86_perf_task_context_arch_lbr { struct lbr_entry entries[]; }; +/* + * Add padding to guarantee the 64-byte alignment of the state buffer. + * + * The structure is dynamically allocated. The size of the LBR state may vary + * based on the number of LBR registers. + * + * Do not put anything after the LBR state. + */ +struct x86_perf_task_context_arch_lbr_xsave { + struct x86_perf_task_context_opt opt; + + union { + struct xregs_state xsave; + struct { + struct fxregs_state i387; + struct xstate_header header; + struct arch_lbr_state lbr; + } __attribute__ ((packed, aligned (XSAVE_ALIGNMENT))); + }; +}; + #define x86_add_quirk(func_) \ do { \ static struct x86_pmu_quirk __quirk __initdata = { \ diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 7e1954d5dd50..edbe0b5704ac 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -252,6 +252,26 @@ struct pkru_state { u32 pad; } __packed; +/* + * State component 15: Architectural LBR configuration state. + * The size of Arch LBR state depends on the number of LBRs (lbr_depth). 
+ */ + +struct lbr_entry { + u64 from; + u64 to; + u64 info; +}; + +struct arch_lbr_state { + u64 lbr_ctl; + u64 lbr_depth; + u64 ler_from; + u64 ler_to; + u64 ler_info; + struct lbr_entry entries[]; +} __packed; + /* * State component 17: 64-byte tile configuration register. */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index eda69992bd82..65c338ef434c 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -353,10 +353,6 @@ struct pebs_xmm { u64 xmm[16*2]; /* two entries for each register */ }; -struct lbr_entry { - u64 from, to, info; -}; - /* * IBS cpuid feature detection */ -- Gitee From e0a0b37ea996f88904728755bd97a083594ba5fe Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:29 -0700 Subject: [PATCH 0851/2161] perf/x86/intel/lbr: Support XSAVES for arch LBR read commit c085fb8774671e83f6199a8e838fbc0e57094029 upstream. Reading LBR registers in a perf NMI handler for a non-PEBS event causes a high overhead because the number of LBR registers is huge. To reduce the overhead, the XSAVES instruction should be used to replace the LBR registers' reading method. The XSAVES buffer used for LBR read has to be per-CPU because the NMI handler invoked the lbr_read(). The existing task_ctx_data buffer cannot be used which is per-task and only be allocated for the LBR call stack mode. A new lbr_xsave pointer is introduced in the cpu_hw_events as an XSAVES buffer for LBR read. The XSAVES buffer should be allocated only when LBR is used by a non-PEBS event on the CPU because the total size of the lbr_xsave is not small (~1.4KB). The XSAVES buffer is allocated when a non-PEBS event is added, but it is lazily released in x86_release_hardware() when perf releases the entire PMU hardware resource, because perf may frequently schedule the event, e.g. high context switch. The lazy release method reduces the overhead of frequently allocate/free the buffer. If the lbr_xsave fails to be allocated, roll back to normal Arch LBR lbr_read(). 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-24-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/core.c | 1 + arch/x86/events/intel/lbr.c | 40 +++++++++++++++++++++++++++++++++++- arch/x86/events/perf_event.h | 7 +++++++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 10b90528ab06..3f5bf76bf711 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -360,6 +360,7 @@ void x86_release_hardware(void) if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { release_pmc_hardware(); release_ds_buffers(); + release_lbr_buffers(); mutex_unlock(&pmc_reserve_mutex); } } diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 1d2be4692650..faa798839971 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -658,6 +658,7 @@ static inline bool branch_user_callstack(unsigned br_sel) void intel_pmu_lbr_add(struct perf_event *event) { + struct kmem_cache *kmem_cache = event->pmu->task_ctx_cache; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (!x86_pmu.lbr_nr) @@ -695,6 +696,29 @@ void intel_pmu_lbr_add(struct perf_event *event) perf_sched_cb_inc(event->ctx->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) intel_pmu_lbr_reset(); + + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + kmem_cache && !cpuc->lbr_xsave && + (cpuc->lbr_users != cpuc->lbr_pebs_users)) + cpuc->lbr_xsave = kmem_cache_alloc(kmem_cache, GFP_KERNEL); +} + +void release_lbr_buffers(void) +{ + struct kmem_cache *kmem_cache = x86_get_pmu()->task_ctx_cache; + struct cpu_hw_events *cpuc; + int cpu; + + if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) + return; + + for_each_possible_cpu(cpu) { + cpuc = per_cpu_ptr(&cpu_hw_events, cpu); + if (kmem_cache && cpuc->lbr_xsave) { + kmem_cache_free(kmem_cache, cpuc->lbr_xsave); + cpuc->lbr_xsave = NULL; + } + } } void intel_pmu_lbr_del(struct perf_event *event) @@ -945,6 +969,19 @@ static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc) intel_pmu_store_lbr(cpuc, NULL); } +static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc) +{ + struct x86_perf_task_context_arch_lbr_xsave *xsave = cpuc->lbr_xsave; + + if (!xsave) { + intel_pmu_store_lbr(cpuc, NULL); + return; + } + xsaves(&xsave->xsave, XFEATURE_MASK_LBR); + + intel_pmu_store_lbr(cpuc, xsave->lbr.entries); +} + void intel_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1767,14 +1804,15 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_ctl_map = NULL; x86_pmu.lbr_reset = intel_pmu_arch_lbr_reset; - x86_pmu.lbr_read = intel_pmu_arch_lbr_read; if (arch_lbr_xsave) { x86_pmu.lbr_save = intel_pmu_arch_lbr_xsaves; x86_pmu.lbr_restore = intel_pmu_arch_lbr_xrstors; + x86_pmu.lbr_read = intel_pmu_arch_lbr_read_xsave; pr_cont("XSAVE "); } else { x86_pmu.lbr_save = intel_pmu_arch_lbr_save; x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore; + x86_pmu.lbr_read = intel_pmu_arch_lbr_read; } pr_cont("Architectural LBR, "); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 9131c46b9374..fd391a7da62c 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -280,6 +280,7 @@ struct cpu_hw_events { void *last_task_ctx; int last_log_id; int lbr_select; + void *lbr_xsave; /* * Intel host/guest exclude bits @@ -1117,6 +1118,8 @@ void 
release_ds_buffers(void); void reserve_ds_buffers(void); +void release_lbr_buffers(void); + extern struct event_constraint bts_constraint; extern struct event_constraint vlbr_constraint; @@ -1258,6 +1261,10 @@ static inline void release_ds_buffers(void) { } +static inline void release_lbr_buffers(void) +{ +} + static inline int intel_pmu_init(void) { return 0; -- Gitee From 7b29c22c531e5614e47ef6a5e1ac075834b9207d Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Sun, 26 Jul 2020 21:31:29 -0700 Subject: [PATCH 0852/2161] x86/cpufeatures: Add enumeration for SERIALIZE instruction commit 85b23fbc7d88f8c6e3951721802d7845bc39663d upstream. The Intel architecture defines a set of Serializing Instructions (a detailed definition can be found in Vol.3 Section 8.3 of the Intel "main" manual, SDM). However, these instructions do more than what is required, have side effects and/or may be rather invasive. Furthermore, some of these instructions are only available in kernel mode or may cause VMExits. Thus, software using these instructions only to serialize execution (as defined in the manual) must handle the undesired side effects. As indicated in the name, SERIALIZE is a new Intel architecture Serializing Instruction. Crucially, it does not have any of the mentioned side effects. Also, it does not cause VMExit and can be used in user mode. This new instruction is currently documented in the latest "extensions" manual (ISE). It will appear in the "main" manual in the future. Signed-off-by: Ricardo Neri Signed-off-by: Ingo Molnar Reviewed-by: Tony Luck Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20200727043132.15082-2-ricardo.neri-calderon@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2b59e15ebfde..c1daa97a3426 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -366,6 +366,7 @@ #define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ +#define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ -- Gitee From cbb60ee511380ef1ad4afd2a9f4bdabea19a8df9 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 31 Dec 2021 10:56:38 +0800 Subject: [PATCH 0853/2161] x86/cpu: Relocate sync_core() to sync_core.h commit 9998a9832c4027e907353e5e05fde730cf624b77 upstream. Having sync_core() in processor.h is problematic since it is not possible to check for hardware capabilities via the *cpu_has() family of macros. The latter needs the definitions in processor.h. It also looks more intuitive to relocate the function to sync_core.h. This changeset does not make changes in functionality. 
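As a concrete example of case (b) in the function's own comment, the text-poking code serializes every CPU with an IPI whose handler does nothing but call sync_core(); a sketch of that pattern, mirroring what arch/x86/kernel/alternative.c already does:

    static void do_sync_core(void *info)
    {
            sync_core();    /* force the CPU to drop stale prefetched text */
    }

    /* after patching kernel text: */
    on_each_cpu(do_sync_core, NULL, 1);

With the helper living in sync_core.h, it can start using the *cpu_has() feature checks that were unavailable from processor.h, which later changes can rely on.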
Signed-off-by: Ricardo Neri Signed-off-by: Ingo Molnar Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/20200727043132.15082-3-ricardo.neri-calderon@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/processor.h | 66 --------------------------- arch/x86/include/asm/sync_core.h | 66 +++++++++++++++++++++++++++ arch/x86/kernel/alternative.c | 1 + arch/x86/kernel/cpu/mce/core.c | 1 + drivers/misc/sgi-gru/grufault.c | 1 + drivers/misc/sgi-gru/gruhandles.c | 1 + drivers/misc/sgi-gru/grukservices.c | 1 + kernel/tkernel/ttools/ttools_module.c | 1 + 8 files changed, 72 insertions(+), 66 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7f3db4d6077b..111cfb4c1deb 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -650,72 +650,6 @@ static __always_inline void cpu_relax(void) rep_nop(); } -/* - * This function forces the icache and prefetched instruction stream to - * catch up with reality in two very specific cases: - * - * a) Text was modified using one virtual address and is about to be executed - * from the same physical page at a different virtual address. - * - * b) Text was modified on a different CPU, may subsequently be - * executed on this CPU, and you want to make sure the new version - * gets executed. This generally means you're calling this in a IPI. - * - * If you're calling this for a different reason, you're probably doing - * it wrong. - */ -static inline void sync_core(void) -{ - /* - * There are quite a few ways to do this. IRET-to-self is nice - * because it works on every CPU, at any CPL (so it's compatible - * with paravirtualization), and it never exits to a hypervisor. - * The only down sides are that it's a bit slow (it seems to be - * a bit more than 2x slower than the fastest options) and that - * it unmasks NMIs. The "push %cs" is needed because, in - * paravirtual environments, __KERNEL_CS may not be a valid CS - * value when we do IRET directly. - * - * In case NMI unmasking or performance ever becomes a problem, - * the next best option appears to be MOV-to-CR2 and an - * unconditional jump. That sequence also works on all CPUs, - * but it will fault at CPL3 (i.e. Xen PV). - * - * CPUID is the conventional way, but it's nasty: it doesn't - * exist on some 486-like CPUs, and it usually exits to a - * hypervisor. - * - * Like all of Linux's memory ordering operations, this is a - * compiler barrier as well. 
- */ -#ifdef CONFIG_X86_32 - asm volatile ( - "pushfl\n\t" - "pushl %%cs\n\t" - "pushl $1f\n\t" - "iret\n\t" - "1:" - : ASM_CALL_CONSTRAINT : : "memory"); -#else - unsigned int tmp; - - asm volatile ( - UNWIND_HINT_SAVE - "mov %%ss, %0\n\t" - "pushq %q0\n\t" - "pushq %%rsp\n\t" - "addq $8, (%%rsp)\n\t" - "pushfq\n\t" - "mov %%cs, %0\n\t" - "pushq %q0\n\t" - "pushq $1f\n\t" - "iretq\n\t" - UNWIND_HINT_RESTORE - "1:" - : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory"); -#endif -} - extern void select_idle_routine(const struct cpuinfo_x86 *c); extern void amd_e400_c1e_apic_setup(void); diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h index 43b5e02a7b4b..6466eb0bcae4 100644 --- a/arch/x86/include/asm/sync_core.h +++ b/arch/x86/include/asm/sync_core.h @@ -6,6 +6,72 @@ #include #include +/* + * This function forces the icache and prefetched instruction stream to + * catch up with reality in two very specific cases: + * + * a) Text was modified using one virtual address and is about to be executed + * from the same physical page at a different virtual address. + * + * b) Text was modified on a different CPU, may subsequently be + * executed on this CPU, and you want to make sure the new version + * gets executed. This generally means you're calling this in a IPI. + * + * If you're calling this for a different reason, you're probably doing + * it wrong. + */ +static inline void sync_core(void) +{ + /* + * There are quite a few ways to do this. IRET-to-self is nice + * because it works on every CPU, at any CPL (so it's compatible + * with paravirtualization), and it never exits to a hypervisor. + * The only down sides are that it's a bit slow (it seems to be + * a bit more than 2x slower than the fastest options) and that + * it unmasks NMIs. The "push %cs" is needed because, in + * paravirtual environments, __KERNEL_CS may not be a valid CS + * value when we do IRET directly. + * + * In case NMI unmasking or performance ever becomes a problem, + * the next best option appears to be MOV-to-CR2 and an + * unconditional jump. That sequence also works on all CPUs, + * but it will fault at CPL3 (i.e. Xen PV). + * + * CPUID is the conventional way, but it's nasty: it doesn't + * exist on some 486-like CPUs, and it usually exits to a + * hypervisor. + * + * Like all of Linux's memory ordering operations, this is a + * compiler barrier as well. + */ +#ifdef CONFIG_X86_32 + asm volatile ( + "pushfl\n\t" + "pushl %%cs\n\t" + "pushl $1f\n\t" + "iret\n\t" + "1:" + : ASM_CALL_CONSTRAINT : : "memory"); +#else + unsigned int tmp; + + asm volatile ( + UNWIND_HINT_SAVE + "mov %%ss, %0\n\t" + "pushq %q0\n\t" + "pushq %%rsp\n\t" + "addq $8, (%%rsp)\n\t" + "pushfq\n\t" + "mov %%cs, %0\n\t" + "pushq %q0\n\t" + "pushq $1f\n\t" + "iretq\n\t" + UNWIND_HINT_RESTORE + "1:" + : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory"); +#endif +} + /* * Ensure that a core serializing instruction is issued before returning * to user-mode. 
x86 implements return to user-space through sysexit, diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index e0b2510aa0b8..a7e013bff99f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 3d724b5029fd..e202cc618cd1 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c index 4b713a80b572..f253bb6f2ae4 100644 --- a/drivers/misc/sgi-gru/grufault.c +++ b/drivers/misc/sgi-gru/grufault.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include "gru.h" diff --git a/drivers/misc/sgi-gru/gruhandles.c b/drivers/misc/sgi-gru/gruhandles.c index f7224f90f413..1d75d5e540bc 100644 --- a/drivers/misc/sgi-gru/gruhandles.c +++ b/drivers/misc/sgi-gru/gruhandles.c @@ -16,6 +16,7 @@ #define GRU_OPERATION_TIMEOUT (((cycles_t) local_cpu_data->itc_freq)*10) #define CLKS2NSEC(c) ((c) *1000000000 / local_cpu_data->itc_freq) #else +#include #include #define GRU_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) #define CLKS2NSEC(c) ((c) * 1000000 / tsc_khz) diff --git a/drivers/misc/sgi-gru/grukservices.c b/drivers/misc/sgi-gru/grukservices.c index 0197441a1eae..f6e600bfac5d 100644 --- a/drivers/misc/sgi-gru/grukservices.c +++ b/drivers/misc/sgi-gru/grukservices.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/tkernel/ttools/ttools_module.c b/kernel/tkernel/ttools/ttools_module.c index a4a7b7c6bdee..0ea5efd7e350 100644 --- a/kernel/tkernel/ttools/ttools_module.c +++ b/kernel/tkernel/ttools/ttools_module.c @@ -15,6 +15,7 @@ #include #include #include "ttools.h" +#include #define TTOOLS_MINOR 254 #define TTOOLS_VER "2.0" -- Gitee From 2dada37ff8a6b44864e6ef4569ce6ccb3ec52b33 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 31 Dec 2021 11:16:23 +0800 Subject: [PATCH 0854/2161] x86/cpu: Refactor sync_core() for readability commit f69ca629d89d65737537e05308ac531f7bb07d5c upstream. Instead of having #ifdef/#endif blocks inside sync_core() for X86_64 and X86_32, implement the new function iret_to_self() with two versions. In this manner, avoid having to use even more more #ifdef/#endif blocks when adding support for SERIALIZE in sync_core(). 
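To make the payoff concrete, here is a sketch of where this refactor is heading (taken in spirit from the upstream follow-up; the serialize() wrapper and its opcode encoding are shown only for illustration and are not part of this patch):

    /* SERIALIZE is 0f 01 e8; emitted as raw bytes for old binutils */
    static inline void serialize(void)
    {
            asm volatile(".byte 0xf, 0x1, 0xe8" ::: "memory");
    }

    static inline void sync_core(void)
    {
            if (static_cpu_has(X86_FEATURE_SERIALIZE)) {
                    serialize();
                    return;
            }
            /* otherwise fall back to the IRET-to-self trick */
            iret_to_self();
    }

With iret_to_self() factored out, adding the SERIALIZE fast path becomes a small, ifdef-free change.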
Co-developed-by: Tony Luck Signed-off-by: Tony Luck Signed-off-by: Ricardo Neri Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200727043132.15082-4-ricardo.neri-calderon@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/special_insns.h | 1 - arch/x86/include/asm/sync_core.h | 60 ++++++++++++++++------------ arch/x86/kernel/ftrace.c | 1 + 3 files changed, 35 insertions(+), 27 deletions(-) diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 44c7f80611ed..70578c8def0d 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -278,7 +278,6 @@ static inline int enqcmds(void __iomem *dst, const void *src) return 0; } - #endif /* __KERNEL__ */ #endif /* _ASM_X86_SPECIAL_INSNS_H */ diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h index 6466eb0bcae4..ee40c11cd2af 100644 --- a/arch/x86/include/asm/sync_core.h +++ b/arch/x86/include/asm/sync_core.h @@ -6,6 +6,39 @@ #include #include +#ifdef CONFIG_X86_32 +static inline void iret_to_self(void) +{ + asm volatile ( + "pushfl\n\t" + "pushl %%cs\n\t" + "pushl $1f\n\t" + "iret\n\t" + "1:" + : ASM_CALL_CONSTRAINT : : "memory"); +} +#else +static inline void iret_to_self(void) +{ + unsigned int tmp; + + asm volatile ( + UNWIND_HINT_SAVE + "mov %%ss, %0\n\t" + "pushq %q0\n\t" + "pushq %%rsp\n\t" + "addq $8, (%%rsp)\n\t" + "pushfq\n\t" + "mov %%cs, %0\n\t" + "pushq %q0\n\t" + "pushq $1f\n\t" + "iretq\n\t" + UNWIND_HINT_RESTORE + "1:" + : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory"); +} +#endif /* CONFIG_X86_32 */ + /* * This function forces the icache and prefetched instruction stream to * catch up with reality in two very specific cases: @@ -44,32 +77,7 @@ static inline void sync_core(void) * Like all of Linux's memory ordering operations, this is a * compiler barrier as well. */ -#ifdef CONFIG_X86_32 - asm volatile ( - "pushfl\n\t" - "pushl %%cs\n\t" - "pushl $1f\n\t" - "iret\n\t" - "1:" - : ASM_CALL_CONSTRAINT : : "memory"); -#else - unsigned int tmp; - - asm volatile ( - UNWIND_HINT_SAVE - "mov %%ss, %0\n\t" - "pushq %q0\n\t" - "pushq %%rsp\n\t" - "addq $8, (%%rsp)\n\t" - "pushfq\n\t" - "mov %%cs, %0\n\t" - "pushq %q0\n\t" - "pushq $1f\n\t" - "iretq\n\t" - UNWIND_HINT_RESTORE - "1:" - : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory"); -#endif + iret_to_self(); } /* diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 024c3053dbba..1b72aa2a7d5f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -31,6 +31,7 @@ #include #include #include +#include #ifdef CONFIG_DYNAMIC_FTRACE -- Gitee From 9bf19ed043cab1b1679d7ea00e48ac845a370952 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 27 May 2020 15:46:56 -0700 Subject: [PATCH 0855/2161] perf/x86/rapl: Refactor to share the RAPL code between Intel and AMD CPUs commit 5c95c68949880035b68e5c48fdf4899ec0989631 upstream. This patch modifies the rapl_model struct to include architecture specific knowledge in this previously Intel specific structure, and in particular it adds the MSR for POWER_UNIT and the rapl_msrs array. No functional changes. 
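The intended payoff is that an AMD platform can now be described with the same structure; roughly (assuming MSR_AMD_RAPL_POWER_UNIT / MSR_AMD_PKG_ENERGY_STATUS definitions and an amd_rapl_msrs[] probe table, none of which are added by this patch):

    static struct perf_msr amd_rapl_msrs[] = {
            [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr },
    };

    static struct rapl_model model_amd_fam17h = {
            .events         = BIT(PERF_RAPL_PKG),
            .apply_quirk    = false,
            .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
            .rapl_msrs      = amd_rapl_msrs,
    };

rapl_check_hw_unit() and perf_msr_probe() then work unmodified, because the model entry tells them which MSRs to touch.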
Signed-off-by: Stephane Eranian Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200527224659.206129-3-eranian@google.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/rapl.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 9050d7b8abc5..4bf07273fc1c 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -119,7 +119,9 @@ struct rapl_pmus { }; struct rapl_model { + struct perf_msr *rapl_msrs; unsigned long events; + unsigned int msr_power_unit; bool apply_quirk; }; @@ -129,7 +131,7 @@ static struct rapl_pmus *rapl_pmus; static cpumask_t rapl_cpu_mask; static unsigned int rapl_cntr_mask; static u64 rapl_timer_ms; -static struct perf_msr rapl_msrs[]; +static struct perf_msr *rapl_msrs; static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) { @@ -504,7 +506,7 @@ static bool test_msr(int idx, void *data) return test_bit(idx, (unsigned long *) data); } -static struct perf_msr rapl_msrs[] = { +static struct perf_msr intel_rapl_msrs[] = { [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr }, [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr }, @@ -566,13 +568,13 @@ static int rapl_cpu_online(unsigned int cpu) return 0; } -static int rapl_check_hw_unit(bool apply_quirk) +static int rapl_check_hw_unit(struct rapl_model *rm) { u64 msr_rapl_power_unit_bits; int i; /* protect rdmsrl() to handle virtualization */ - if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits)) return -1; for (i = 0; i < NR_RAPL_DOMAINS; i++) rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; @@ -583,7 +585,7 @@ static int rapl_check_hw_unit(bool apply_quirk) * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 * of 2. 
Datasheet, September 2014, Reference Number: 330784-001 " */ - if (apply_quirk) + if (rm->apply_quirk) rapl_hw_unit[PERF_RAPL_RAM] = 16; /* @@ -667,6 +669,8 @@ static struct rapl_model model_snb = { BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_PP1), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_snbep = { @@ -674,6 +678,8 @@ static struct rapl_model model_snbep = { BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsw = { @@ -682,6 +688,8 @@ static struct rapl_model model_hsw = { BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PP1), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsx = { @@ -689,12 +697,16 @@ static struct rapl_model model_hsx = { BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .apply_quirk = true, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_knl = { .events = BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .apply_quirk = true, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_skl = { @@ -704,6 +716,8 @@ static struct rapl_model model_skl = { BIT(PERF_RAPL_PP1) | BIT(PERF_RAPL_PSYS), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static const struct x86_cpu_id rapl_model_match[] __initconst = { @@ -748,10 +762,13 @@ static int __init rapl_pmu_init(void) return -ENODEV; rm = (struct rapl_model *) id->driver_data; + + rapl_msrs = rm->rapl_msrs; + rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, false, (void *) &rm->events); - ret = rapl_check_hw_unit(rm->apply_quirk); + ret = rapl_check_hw_unit(rm); if (ret) return ret; -- Gitee From dfa805d5480123fd2116e018b52616f3f150ed71 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 27 May 2020 15:46:57 -0700 Subject: [PATCH 0856/2161] perf/x86/rapl: Flip logic on default events visibility commit 2a3e3f73a23b4ff2c0065d3a42edc18ad94b7851 upstream. This patch modifies the default visibility of the attribute_group for each RAPL event. By default if the grp.is_visible field is NULL, sysfs considers that it must display the attribute group. If the field is not NULL (callback function), then the return value of the callback determines the visibility (0 = not visible). The RAPL attribute groups had the field set to NULL, meaning that unless they failed the probing from perf_msr_probe(), they would be visible. We want to avoid having to specify attribute groups that are not supported by the HW in the rapl_msrs[] array, they don't have an MSR address to begin with. Therefore, we intialize the visible field of all RAPL attribute groups to a callback that returns 0. If the RAPL msr goes through probing and succeeds the is_visible field will be set back to NULL (visible). If the probing fails the field is set to a callback that return 0 (not visible). 
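The sysfs rule being leaned on here is that a NULL ->is_visible means "show the group", while a callback returning 0 hides the attribute; paraphrasing the logic in fs/sysfs/group.c:

    if (grp->is_visible) {
            umode_t mode = grp->is_visible(kobj, *attr, i);

            if (!mode)
                    continue;       /* attribute is not created */
    }
    /* no callback, or non-zero mode: attribute is created */

So initializing every RAPL group to rapl_not_visible() and clearing the callback only for MSRs that probe successfully inverts the default from "visible unless probing failed" to "hidden unless probing succeeded".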
Signed-off-by: Stephane Eranian Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200527224659.206129-4-eranian@google.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/rapl.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 4bf07273fc1c..3b8fb41e99de 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -448,9 +448,16 @@ static struct attribute *rapl_events_cores[] = { NULL, }; +static umode_t +rapl_not_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return 0; +} + static struct attribute_group rapl_events_cores_group = { .name = "events", .attrs = rapl_events_cores, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_pkg[] = { @@ -463,6 +470,7 @@ static struct attribute *rapl_events_pkg[] = { static struct attribute_group rapl_events_pkg_group = { .name = "events", .attrs = rapl_events_pkg, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_ram[] = { @@ -475,6 +483,7 @@ static struct attribute *rapl_events_ram[] = { static struct attribute_group rapl_events_ram_group = { .name = "events", .attrs = rapl_events_ram, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_gpu[] = { @@ -487,6 +496,7 @@ static struct attribute *rapl_events_gpu[] = { static struct attribute_group rapl_events_gpu_group = { .name = "events", .attrs = rapl_events_gpu, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_psys[] = { @@ -499,6 +509,7 @@ static struct attribute *rapl_events_psys[] = { static struct attribute_group rapl_events_psys_group = { .name = "events", .attrs = rapl_events_psys, + .is_visible = rapl_not_visible, }; static bool test_msr(int idx, void *data) -- Gitee From 589bc1e7e6077dabbfd99aff6b9b859c8aa71d16 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 27 May 2020 15:46:58 -0700 Subject: [PATCH 0857/2161] perf/x86/rapl: Make perf_probe_msr() more robust and flexible commit 4c953f879460bf65ea3c119354026b126fe8ee57 upstream. This patch modifies perf_probe_msr() by allowing passing of struct perf_msr array where some entries are not populated, i.e., they have either an msr address of 0 or no attribute_group pointer. This helps with certain call paths, e.g., RAPL. In case the grp is NULL, the default sysfs visibility rule applies which is to make the group visible. Without the patch, you would get a kernel crash with a NULL group. Signed-off-by: Stephane Eranian Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200527224659.206129-5-eranian@google.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/probe.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/events/probe.c b/arch/x86/events/probe.c index c2ede2f3b277..136a1e847254 100644 --- a/arch/x86/events/probe.c +++ b/arch/x86/events/probe.c @@ -10,6 +10,11 @@ not_visible(struct kobject *kobj, struct attribute *attr, int i) return 0; } +/* + * Accepts msr[] array with non populated entries as long as either + * msr[i].msr is 0 or msr[i].grp is NULL. Note that the default sysfs + * visibility is visible when group->is_visible callback is set. 
+ */ unsigned long perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) { @@ -24,8 +29,16 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) if (!msr[bit].no_check) { struct attribute_group *grp = msr[bit].grp; + /* skip entry with no group */ + if (!grp) + continue; + grp->is_visible = not_visible; + /* skip unpopulated entry */ + if (!msr[bit].msr) + continue; + if (msr[bit].test && !msr[bit].test(bit, data)) continue; /* Virt sucks; you cannot tell if a R/O MSR is present :/ */ -- Gitee From 08387515a5d9a1cd164014e3c2d1c106066e61d8 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 31 Dec 2021 16:07:38 +0800 Subject: [PATCH 0858/2161] perf/x86/rapl: Support multiple RAPL unit quirks commit 74f41adab0f4a61857833e1b6fa8e9ad12c251b6 upstream. There will be more platforms with different fixed energy units. Enhance the code to support different RAPL unit quirks for different platforms. Signed-off-by: Zhang Rui Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Reviewed-by: Len Brown Link: https://lore.kernel.org/r/20200811153149.12242-3-rui.zhang@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/rapl.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 3b8fb41e99de..0a62c191541e 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -118,11 +118,16 @@ struct rapl_pmus { struct rapl_pmu *pmus[]; }; +enum rapl_unit_quirk { + RAPL_UNIT_QUIRK_NONE, + RAPL_UNIT_QUIRK_INTEL_HSW, +}; + struct rapl_model { struct perf_msr *rapl_msrs; unsigned long events; unsigned int msr_power_unit; - bool apply_quirk; + enum rapl_unit_quirk unit_quirk; }; /* 1/2^hw_unit Joule */ @@ -590,15 +595,20 @@ static int rapl_check_hw_unit(struct rapl_model *rm) for (i = 0; i < NR_RAPL_DOMAINS; i++) rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + switch (rm->unit_quirk) { /* * DRAM domain on HSW server and KNL has fixed energy unit which can be * different than the unit from power unit MSR. See * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 * of 2. 
Datasheet, September 2014, Reference Number: 330784-001 " */ - if (rm->apply_quirk) + case RAPL_UNIT_QUIRK_INTEL_HSW: rapl_hw_unit[PERF_RAPL_RAM] = 16; - + break; + default: + break; + } + /* * Calculate the timer rate: * Use reference of 200W for scaling the timeout to avoid counter @@ -679,7 +689,6 @@ static struct rapl_model model_snb = { .events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_PP1), - .apply_quirk = false, .msr_power_unit = MSR_RAPL_POWER_UNIT, .rapl_msrs = intel_rapl_msrs, }; @@ -688,7 +697,6 @@ static struct rapl_model model_snbep = { .events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), - .apply_quirk = false, .msr_power_unit = MSR_RAPL_POWER_UNIT, .rapl_msrs = intel_rapl_msrs, }; @@ -698,7 +706,6 @@ static struct rapl_model model_hsw = { BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PP1), - .apply_quirk = false, .msr_power_unit = MSR_RAPL_POWER_UNIT, .rapl_msrs = intel_rapl_msrs, }; @@ -707,7 +714,7 @@ static struct rapl_model model_hsx = { .events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), - .apply_quirk = true, + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, .msr_power_unit = MSR_RAPL_POWER_UNIT, .rapl_msrs = intel_rapl_msrs, }; @@ -715,7 +722,7 @@ static struct rapl_model model_hsx = { static struct rapl_model model_knl = { .events = BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), - .apply_quirk = true, + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, .msr_power_unit = MSR_RAPL_POWER_UNIT, .rapl_msrs = intel_rapl_msrs, }; @@ -726,7 +733,6 @@ static struct rapl_model model_skl = { BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PP1) | BIT(PERF_RAPL_PSYS), - .apply_quirk = false, .msr_power_unit = MSR_RAPL_POWER_UNIT, .rapl_msrs = intel_rapl_msrs, }; -- Gitee From 0b624689b38c6777fc192d2129d4c3306f223693 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 20 Jul 2020 21:37:49 -0700 Subject: [PATCH 0859/2161] x86/cpu: Add Lakefield, Alder Lake and Rocket Lake models to the to Intel CPU family commit e00b62f0b06d0ae2b844049f216807617aff0cdb upstream. Add three new Intel CPU models. Signed-off-by: Tony Luck Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200721043749.31567-1-tony.luck@intel.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- arch/x86/include/asm/intel-family.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index c606c0b70738..4b5c185fa025 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -86,6 +86,13 @@ #define INTEL_FAM6_COMETLAKE 0xA5 #define INTEL_FAM6_COMETLAKE_L 0xA6 +#define INTEL_FAM6_ROCKETLAKE 0xA7 + +/* Hybrid Core/Atom Processors */ + +#define INTEL_FAM6_LAKEFIELD 0x8A +#define INTEL_FAM6_ALDERLAKE 0x97 + /* "Small Core" Processors (Atom) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ -- Gitee From 9fd961e185faebc702c838197c7e522aa79ce7f6 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 3 Jun 2020 10:33:52 -0700 Subject: [PATCH 0860/2161] x86/cpu: Add Sapphire Rapids CPU model number commit be25d1b5ea6a3a3ecbb5474e2ae8e32d2ba055ea upstream. Latest edition (039) of "Intel Architecture Instruction Set Extensions and Future Features Programming Reference" includes three new CPU model numbers. Linux already has the two Ice Lake server ones. Add the new model number for Sapphire Rapids. 
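For context, model defines like this are consumed through x86_cpu_id match tables. A minimal, hypothetical sketch follows (the table and helper names are invented for illustration; the RAPL patch later in this series does the same thing through its X86_RAPL_MODEL_MATCH() wrapper):

#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include <asm/processor.h>

/* hypothetical match table keyed on the new model number */
static const struct x86_cpu_id spr_ids[] = {
	{ .vendor = X86_VENDOR_INTEL, .family = 6,
	  .model = INTEL_FAM6_SAPPHIRERAPIDS_X, .feature = X86_FEATURE_ANY },
	{}
};

static bool is_sapphire_rapids(void)
{
	return x86_match_cpu(spr_ids) != NULL;
}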
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200603173352.15506-1-tony.luck@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/intel-family.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 4b5c185fa025..1433791d9e6d 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -93,6 +93,8 @@ #define INTEL_FAM6_LAKEFIELD 0x8A #define INTEL_FAM6_ALDERLAKE 0x97 +#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F + /* "Small Core" Processors (Atom) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ -- Gitee From a915725d509fab2a127b632a0d8c032c9ad64abc Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 11 Aug 2020 23:31:49 +0800 Subject: [PATCH 0861/2161] perf/x86/rapl: Add support for Intel SPR platform commit bcfd218b66790243ef303c1b35ce59f786ded225 upstream. Intel SPR platform uses fixed 16 bit energy unit for DRAM RAPL domain, and fixed 0 bit energy unit for Psys RAPL domain. After this, on SPR platform the energy counters appear in perf list. Signed-off-by: Zhang Rui Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Acked-by: Len Brown Link: https://lore.kernel.org/r/20200811153149.12242-4-rui.zhang@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/rapl.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 0a62c191541e..b4952869355a 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -121,6 +121,7 @@ struct rapl_pmus { enum rapl_unit_quirk { RAPL_UNIT_QUIRK_NONE, RAPL_UNIT_QUIRK_INTEL_HSW, + RAPL_UNIT_QUIRK_INTEL_SPR, }; struct rapl_model { @@ -605,6 +606,14 @@ static int rapl_check_hw_unit(struct rapl_model *rm) case RAPL_UNIT_QUIRK_INTEL_HSW: rapl_hw_unit[PERF_RAPL_RAM] = 16; break; + /* + * SPR shares the same DRAM domain energy unit as HSW, plus it + * also has a fixed energy unit for Psys domain. + */ + case RAPL_UNIT_QUIRK_INTEL_SPR: + rapl_hw_unit[PERF_RAPL_RAM] = 16; + rapl_hw_unit[PERF_RAPL_PSYS] = 0; + break; default: break; } @@ -737,6 +746,16 @@ static struct rapl_model model_skl = { .rapl_msrs = intel_rapl_msrs, }; +static struct rapl_model model_spr = { + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PSYS), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, +}; + static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, model_snb), X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, model_snbep), @@ -763,6 +782,7 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, model_hsw), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_L, model_skl), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SAPPHIRERAPIDS_X, model_spr), {}, }; -- Gitee From 274533f8a3b7dbdab95c05b35f7d3fca3d229b4d Mon Sep 17 00:00:00 2001 From: Kyung Min Park Date: Tue, 25 Aug 2020 08:47:57 +0800 Subject: [PATCH 0862/2161] x86/cpufeatures: Enumerate TSX suspend load address tracking instructions commit 18ec63faefb3fd311822556cd9b949f66b1eecee upstream. Intel TSX suspend load tracking instructions aim to give a way to choose which memory accesses do not need to be tracked in the TSX read set. 
Add TSX suspend load tracking CPUID feature flag TSXLDTRK for enumeration. A processor supports Intel TSX suspend load address tracking if CPUID.0x07.0x0:EDX[16] is present. Two instructions XSUSLDTRK, XRESLDTRK are available when this feature is present. The CPU feature flag is shown as "tsxldtrk" in /proc/cpuinfo. Signed-off-by: Kyung Min Park Signed-off-by: Cathy Zhang Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/1598316478-23337-2-git-send-email-cathy.zhang@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index c1daa97a3426..63bd1c3ef774 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -367,6 +367,7 @@ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ +#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ -- Gitee From ffb5bbc0db8dbcd689373f9a405718628040cee8 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Tue, 14 Jan 2020 12:04:55 +0800 Subject: [PATCH 0863/2161] tools/power turbostat: Support Tiger Lake commit 4bf7132a0ace8888398af8cec6485ee4c6db5ea8 upstream. From a turbostat point of view, Tiger Lake looks like Ice Lake. Signed-off-by: Chen Yu Signed-off-by: Len Brown Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/power/x86/turbostat/turbostat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 988326b67a91..4eae186b881f 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4627,6 +4627,8 @@ unsigned int intel_model_duplicates(unsigned int model) case INTEL_FAM6_ICELAKE_L: case INTEL_FAM6_ICELAKE_NNPI: + case INTEL_FAM6_TIGERLAKE_L: + case INTEL_FAM6_TIGERLAKE: return INTEL_FAM6_CANNONLAKE_L; case INTEL_FAM6_ATOM_TREMONT_D: -- Gitee From d2cc593552674de2f45c163517fe938b6d0913e8 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Tue, 14 Jan 2020 12:06:49 +0800 Subject: [PATCH 0864/2161] tools/power turbostat: Support Ice Lake server commit 23274faf96500700da83c4f0ff12d78ae03d5604 upstream. From a turbostat point of view, Ice Lake server looks like Sky Lake server. 
Signed-off-by: Chen Yu Signed-off-by: Len Brown Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/power/x86/turbostat/turbostat.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4eae186b881f..0e1749ecd5c3 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4633,6 +4633,9 @@ unsigned int intel_model_duplicates(unsigned int model) case INTEL_FAM6_ATOM_TREMONT_D: return INTEL_FAM6_ATOM_GOLDMONT_D; + + case INTEL_FAM6_ICELAKE_X: + return INTEL_FAM6_SKYLAKE_X; } return model; } -- Gitee From 0274247d47fe838e28c9dd308b3a4c44340875ee Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 13 Aug 2020 19:06:03 -0400 Subject: [PATCH 0865/2161] tools/power turbostat: Support additional CPU model numbers commit e7af1ed3fa4756e8df8270a8635d852a94266061 upstream. Initial support for models recently added to intel-family.h. Signed-off-by: Len Brown Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/power/x86/turbostat/turbostat.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 0e1749ecd5c3..b35baedd0b90 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4629,12 +4629,16 @@ unsigned int intel_model_duplicates(unsigned int model) case INTEL_FAM6_ICELAKE_NNPI: case INTEL_FAM6_TIGERLAKE_L: case INTEL_FAM6_TIGERLAKE: + case INTEL_FAM6_ROCKETLAKE: + case INTEL_FAM6_LAKEFIELD: + case INTEL_FAM6_ALDERLAKE: return INTEL_FAM6_CANNONLAKE_L; case INTEL_FAM6_ATOM_TREMONT_D: return INTEL_FAM6_ATOM_GOLDMONT_D; case INTEL_FAM6_ICELAKE_X: + case INTEL_FAM6_SAPPHIRERAPIDS_X: return INTEL_FAM6_SKYLAKE_X; } return model; -- Gitee From 00ce638123361704fc161a8fa8b77dcb8f1e8ef2 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 28 Oct 2020 18:44:45 -0700 Subject: [PATCH 0866/2161] PCI: Add defines for Designated Vendor-Specific Extended Capability commit 1dc2da5cd51f648de6d1df87e2bc6ea13f72f19c upstream. Add PCIe Designated Vendor-Specific Extended Capability (DVSEC) and defines for the header offsets. Defined in PCIe r5.0, sec 7.9.6. Signed-off-by: David E. 
Box Acked-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/pci_regs.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 29d6e93fd15e..340cbf95065f 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -714,6 +714,7 @@ #define PCI_EXT_CAP_ID_DPC 0x1D /* Downstream Port Containment */ #define PCI_EXT_CAP_ID_L1SS 0x1E /* L1 PM Substates */ #define PCI_EXT_CAP_ID_PTM 0x1F /* Precision Time Measurement */ +#define PCI_EXT_CAP_ID_DVSEC 0x23 /* Designated Vendor-Specific */ #define PCI_EXT_CAP_ID_DLF 0x25 /* Data Link Feature */ #define PCI_EXT_CAP_ID_PL_16GT 0x26 /* Physical Layer 16.0 GT/s */ #define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PL_16GT @@ -1056,6 +1057,10 @@ #define PCI_L1SS_CTL1_LTR_L12_TH_SCALE 0xe0000000 /* LTR_L1.2_THRESHOLD_Scale */ #define PCI_L1SS_CTL2 0x0c /* Control 2 Register */ +/* Designated Vendor-Specific (DVSEC, PCI_EXT_CAP_ID_DVSEC) */ +#define PCI_DVSEC_HEADER1 0x4 /* Designated Vendor-Specific Header1 */ +#define PCI_DVSEC_HEADER2 0x8 /* Designated Vendor-Specific Header2 */ + /* Data Link Feature */ #define PCI_DLF_CAP 0x04 /* Capabilities Register */ #define PCI_DLF_EXCHANGE_ENABLE 0x80000000 /* Data Link Feature Exchange Enable */ -- Gitee From 144aa2a53603dd9a5d917b2bd79d40c32c43df83 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 28 Oct 2020 18:55:33 -0700 Subject: [PATCH 0867/2161] mfd: Intel Platform Monitoring Technology support commit 4f8217d5b0ca8ace78a27dc371b87697eedc421d upstream. Intel Platform Monitoring Technology (PMT) is an architecture for enumerating and accessing hardware monitoring facilities. PMT supports multiple types of monitoring capabilities. This driver creates platform devices for each type so that they may be managed by capability specific drivers (to be introduced). Capabilities are discovered using PCIe DVSEC ids. Support is included for the 3 current capability types, Telemetry, Watcher, and Crashlog. The features are available on new Intel platforms starting from Tiger Lake for which support is added. This patch adds support for Tiger Lake (TGL), Alder Lake (ADL), and Out-of-Band Management Services Module (OOBMSM). Also add a quirk mechanism for several early hardware differences and bugs. For Tiger Lake and Alder Lake, do not support Watcher and Crashlog capabilities since they will not be compatible with future product. Also, fix use a quirk to fix the discovery table offset. Co-developed-by: Alexander Duyck Signed-off-by: Alexander Duyck Signed-off-by: David E. Box Reviewed-by: Andy Shevchenko Reviewed-by: Hans de Goede Signed-off-by: Lee Jones Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- MAINTAINERS | 5 + drivers/mfd/Kconfig | 10 ++ drivers/mfd/Makefile | 1 + drivers/mfd/intel_pmt.c | 223 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 239 insertions(+) create mode 100644 drivers/mfd/intel_pmt.c diff --git a/MAINTAINERS b/MAINTAINERS index 668003e83bda..f6ee5e8f4c1b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8420,6 +8420,11 @@ F: drivers/mfd/intel_soc_pmic* F: include/linux/mfd/intel_msic.h F: include/linux/mfd/intel_soc_pmic* +INTEL PMT DRIVER +M: "David E. 
Box" +S: Maintained +F: drivers/mfd/intel_pmt.c + INTEL PRO/WIRELESS 2100, 2200BG, 2915ABG NETWORK CONNECTION SUPPORT M: Stanislav Yakovlev L: linux-wireless@vger.kernel.org diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 43169f25da1f..f30f2d16504f 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -632,6 +632,16 @@ config MFD_INTEL_MSIC Passage) chip. This chip embeds audio, battery, GPIO, etc. devices used in Intel Medfield platforms. +config MFD_INTEL_PMT + tristate "Intel Platform Monitoring Technology (PMT) support" + depends on PCI + select MFD_CORE + help + The Intel Platform Monitoring Technology (PMT) is an interface that + provides access to hardware monitor registers. This driver supports + Telemetry, Watcher, and Crashlog PMT capabilities/devices for + platforms starting from Tiger Lake. + config MFD_IPAQ_MICRO bool "Atmel Micro ASIC (iPAQ h3100/h3600/h3700) Support" depends on SA1100_H3100 || SA1100_H3600 diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index c1067ea46204..597cb4fc96e1 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -212,6 +212,7 @@ obj-$(CONFIG_MFD_INTEL_LPSS) += intel-lpss.o obj-$(CONFIG_MFD_INTEL_LPSS_PCI) += intel-lpss-pci.o obj-$(CONFIG_MFD_INTEL_LPSS_ACPI) += intel-lpss-acpi.o obj-$(CONFIG_MFD_INTEL_MSIC) += intel_msic.o +obj-$(CONFIG_MFD_INTEL_PMT) += intel_pmt.o obj-$(CONFIG_MFD_PALMAS) += palmas.o obj-$(CONFIG_MFD_VIPERBOARD) += viperboard.o obj-$(CONFIG_MFD_RC5T583) += rc5t583.o rc5t583-irq.o diff --git a/drivers/mfd/intel_pmt.c b/drivers/mfd/intel_pmt.c new file mode 100644 index 000000000000..744b230cdcca --- /dev/null +++ b/drivers/mfd/intel_pmt.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Platform Monitoring Technology PMT driver + * + * Copyright (c) 2020, Intel Corporation. + * All Rights Reserved. + * + * Author: David E. 
Box + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Intel DVSEC capability vendor space offsets */ +#define INTEL_DVSEC_ENTRIES 0xA +#define INTEL_DVSEC_SIZE 0xB +#define INTEL_DVSEC_TABLE 0xC +#define INTEL_DVSEC_TABLE_BAR(x) ((x) & GENMASK(2, 0)) +#define INTEL_DVSEC_TABLE_OFFSET(x) ((x) & GENMASK(31, 3)) +#define INTEL_DVSEC_ENTRY_SIZE 4 + +/* PMT capabilities */ +#define DVSEC_INTEL_ID_TELEMETRY 2 +#define DVSEC_INTEL_ID_WATCHER 3 +#define DVSEC_INTEL_ID_CRASHLOG 4 + +struct intel_dvsec_header { + u16 length; + u16 id; + u8 num_entries; + u8 entry_size; + u8 tbir; + u32 offset; +}; + +enum pmt_quirks { + /* Watcher capability not supported */ + PMT_QUIRK_NO_WATCHER = BIT(0), + + /* Crashlog capability not supported */ + PMT_QUIRK_NO_CRASHLOG = BIT(1), + + /* Use shift instead of mask to read discovery table offset */ + PMT_QUIRK_TABLE_SHIFT = BIT(2), +}; + +struct pmt_platform_info { + unsigned long quirks; +}; + +static const struct pmt_platform_info tgl_info = { + .quirks = PMT_QUIRK_NO_WATCHER | PMT_QUIRK_NO_CRASHLOG | + PMT_QUIRK_TABLE_SHIFT, +}; + +static int pmt_add_dev(struct pci_dev *pdev, struct intel_dvsec_header *header, + unsigned long quirks) +{ + struct device *dev = &pdev->dev; + struct resource *res, *tmp; + struct mfd_cell *cell; + const char *name; + int count = header->num_entries; + int size = header->entry_size; + int id = header->id; + int i; + + switch (id) { + case DVSEC_INTEL_ID_TELEMETRY: + name = "pmt_telemetry"; + break; + case DVSEC_INTEL_ID_WATCHER: + if (quirks & PMT_QUIRK_NO_WATCHER) { + dev_info(dev, "Watcher not supported\n"); + return 0; + } + name = "pmt_watcher"; + break; + case DVSEC_INTEL_ID_CRASHLOG: + if (quirks & PMT_QUIRK_NO_CRASHLOG) { + dev_info(dev, "Crashlog not supported\n"); + return 0; + } + name = "pmt_crashlog"; + break; + default: + dev_err(dev, "Unrecognized PMT capability: %d\n", id); + return -EINVAL; + } + + if (!header->num_entries || !header->entry_size) { + dev_err(dev, "Invalid count or size for %s header\n", name); + return -EINVAL; + } + + cell = devm_kzalloc(dev, sizeof(*cell), GFP_KERNEL); + if (!cell) + return -ENOMEM; + + res = devm_kcalloc(dev, count, sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; + + if (quirks & PMT_QUIRK_TABLE_SHIFT) + header->offset >>= 3; + + /* + * The PMT DVSEC contains the starting offset and count for a block of + * discovery tables, each providing access to monitoring facilities for + * a section of the device. Create a resource list of these tables to + * provide to the driver. 
+ */ + for (i = 0, tmp = res; i < count; i++, tmp++) { + tmp->start = pdev->resource[header->tbir].start + + header->offset + i * (size << 2); + tmp->end = tmp->start + (size << 2) - 1; + tmp->flags = IORESOURCE_MEM; + } + + cell->resources = res; + cell->num_resources = count; + cell->name = name; + + return devm_mfd_add_devices(dev, PLATFORM_DEVID_AUTO, cell, 1, NULL, 0, + NULL); +} + +static int pmt_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct pmt_platform_info *info; + unsigned long quirks = 0; + bool found_devices = false; + int ret, pos = 0; + + ret = pcim_enable_device(pdev); + if (ret) + return ret; + + info = (struct pmt_platform_info *)id->driver_data; + + if (info) + quirks = info->quirks; + + do { + struct intel_dvsec_header header; + u32 table; + u16 vid; + + pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DVSEC); + if (!pos) + break; + + pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vid); + if (vid != PCI_VENDOR_ID_INTEL) + continue; + + pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2, + &header.id); + pci_read_config_byte(pdev, pos + INTEL_DVSEC_ENTRIES, + &header.num_entries); + pci_read_config_byte(pdev, pos + INTEL_DVSEC_SIZE, + &header.entry_size); + pci_read_config_dword(pdev, pos + INTEL_DVSEC_TABLE, + &table); + + header.tbir = INTEL_DVSEC_TABLE_BAR(table); + header.offset = INTEL_DVSEC_TABLE_OFFSET(table); + + ret = pmt_add_dev(pdev, &header, quirks); + if (ret) { + dev_warn(&pdev->dev, + "Failed to add device for DVSEC id %d\n", + header.id); + continue; + } + + found_devices = true; + } while (true); + + if (!found_devices) + return -ENODEV; + + pm_runtime_put(&pdev->dev); + pm_runtime_allow(&pdev->dev); + + return 0; +} + +static void pmt_pci_remove(struct pci_dev *pdev) +{ + pm_runtime_forbid(&pdev->dev); + pm_runtime_get_sync(&pdev->dev); +} + +#define PCI_DEVICE_ID_INTEL_PMT_ADL 0x467d +#define PCI_DEVICE_ID_INTEL_PMT_OOBMSM 0x09a7 +#define PCI_DEVICE_ID_INTEL_PMT_TGL 0x9a0d +static const struct pci_device_id pmt_pci_ids[] = { + { PCI_DEVICE_DATA(INTEL, PMT_ADL, &tgl_info) }, + { PCI_DEVICE_DATA(INTEL, PMT_OOBMSM, NULL) }, + { PCI_DEVICE_DATA(INTEL, PMT_TGL, &tgl_info) }, + { } +}; +MODULE_DEVICE_TABLE(pci, pmt_pci_ids); + +static struct pci_driver pmt_pci_driver = { + .name = "intel-pmt", + .id_table = pmt_pci_ids, + .probe = pmt_pci_probe, + .remove = pmt_pci_remove, +}; +module_pci_driver(pmt_pci_driver); + +MODULE_AUTHOR("David E. Box "); +MODULE_DESCRIPTION("Intel Platform Monitoring Technology PMT driver"); +MODULE_LICENSE("GPL v2"); -- Gitee From 89b8384a63de2846164b7cf649856e1d14428ae9 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 4 Jan 2022 17:01:42 +0800 Subject: [PATCH 0868/2161] platform/x86: Intel PMT class driver commit e2729113ce66d8d21f729b41bc3ed3feaf1acf69 upstream. Intel Platform Monitoring Technology is meant to provide a common way to access telemetry and system metrics. Register mappings are not provided by the driver. Instead, a GUID is read from a header for each endpoint. The GUID identifies the device and is to be used with an XML, provided by the vendor, to discover the available set of metrics and their register mapping. This allows firmware updates to modify the register space without needing to update the driver every time with new mappings. Firmware writes a new GUID in this case to specify the new mapping. Software tools with access to the associated XML file can then interpret the changes. 
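As a rough illustration of that flow, the hypothetical user-space sketch below (not part of the patch) reads an endpoint's GUID and maps its telemetry region through the class sysfs files described in the following paragraphs; it assumes the first telemetry endpoint appears as telem0.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	unsigned int guid = 0;
	unsigned long size = 0;
	FILE *f;

	f = fopen("/sys/class/intel_pmt/telem0/guid", "r");
	if (!f || fscanf(f, "%x", &guid) != 1)
		return 1;
	fclose(f);

	f = fopen("/sys/class/intel_pmt/telem0/size", "r");
	if (!f || fscanf(f, "%lu", &size) != 1)
		return 1;
	fclose(f);

	/* the GUID selects the vendor XML that describes the register layout */
	printf("guid 0x%x, %lu bytes of telemetry\n", guid, size);

	int fd = open("/sys/class/intel_pmt/telem0/telem", O_RDONLY);
	if (fd < 0)
		return 1;

	void *base = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
	if (base == MAP_FAILED)
		return 1;

	/* ... decode counters at the offsets given by the XML ... */
	munmap(base, size);
	close(fd);
	return 0;
}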
The module manages access to all Intel PMT endpoints on a system, independent of the device exporting them. It creates an intel_pmt class to manage the devices. For each telemetry endpoint, sysfs files provide GUID and size information as well as a pointer to the parent device the telemetry came from. Software may discover the association between endpoints and devices by iterating through the list in sysfs, or by looking for the existence of the class folder under the device of interest. A binary sysfs attribute of the same name allows software to then read or map the telemetry space for direct access. Signed-off-by: Alexander Duyck Signed-off-by: David E. Box Reviewed-by: Hans de Goede Signed-off-by: Lee Jones Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../ABI/testing/sysfs-class-intel_pmt | 54 ++++ MAINTAINERS | 1 + drivers/platform/x86/Kconfig | 12 + drivers/platform/x86/Makefile | 1 + drivers/platform/x86/intel_pmt_class.c | 297 ++++++++++++++++++ drivers/platform/x86/intel_pmt_class.h | 52 +++ 6 files changed, 417 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-class-intel_pmt create mode 100644 drivers/platform/x86/intel_pmt_class.c create mode 100644 drivers/platform/x86/intel_pmt_class.h diff --git a/Documentation/ABI/testing/sysfs-class-intel_pmt b/Documentation/ABI/testing/sysfs-class-intel_pmt new file mode 100644 index 000000000000..926b5cf95fd1 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-intel_pmt @@ -0,0 +1,54 @@ +What: /sys/class/intel_pmt/ +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + The intel_pmt/ class directory contains information for + devices that expose hardware telemetry using Intel Platform + Monitoring Technology (PMT) + +What: /sys/class/intel_pmt/telem +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + The telem directory contains files describing an instance of + a PMT telemetry device that exposes hardware telemetry. Each + telem directory has an associated telem file. This file + may be opened and mapped or read to access the telemetry space + of the device. The register layout of the telemetry space is + determined from an XML file that matches the PCI device id and + GUID for the device. + +What: /sys/class/intel_pmt/telem/telem +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + (RO) The telemetry data for this telemetry device. This file + may be mapped or read to obtain the data. + +What: /sys/class/intel_pmt/telem/guid +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + (RO) The GUID for this telemetry device. The GUID identifies + the version of the XML file for the parent device that is to + be used to get the register layout. + +What: /sys/class/intel_pmt/telem/size +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + (RO) The size of telemetry region in bytes that corresponds to + the mapping size for the telem file. + +What: /sys/class/intel_pmt/telem/offset +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + (RO) The offset of telemetry region in bytes that corresponds to + the mapping for the telem file. diff --git a/MAINTAINERS b/MAINTAINERS index f6ee5e8f4c1b..b362ea22b5c5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8424,6 +8424,7 @@ INTEL PMT DRIVER M: "David E. 
Box" S: Maintained F: drivers/mfd/intel_pmt.c +F: drivers/platform/x86/intel_pmt_* INTEL PRO/WIRELESS 2100, 2200BG, 2915ABG NETWORK CONNECTION SUPPORT M: Stanislav Yakovlev diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 000d5693fae7..a043242f9261 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1210,6 +1210,18 @@ config SURFACE_3_BUTTON ---help--- This driver handles the power/home/volume buttons on the Microsoft Surface 3 tablet. +config INTEL_PMT_CLASS + tristate "Intel Platform Monitoring Technology (PMT) Class driver" + help + The Intel Platform Monitoring Technology (PMT) class driver provides + the basic sysfs interface and file hierarchy uses by PMT devices. + + For more information, see: + + + To compile this driver as a module, choose M here: the module + will be called intel_pmt_class. + config INTEL_PUNIT_IPC tristate "Intel P-Unit IPC Driver" ---help--- diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index 415104033060..5cc77a5ed1ff 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -91,6 +91,7 @@ obj-$(CONFIG_INTEL_TELEMETRY) += intel_telemetry_core.o \ intel_telemetry_pltdrv.o \ intel_telemetry_debugfs.o obj-$(CONFIG_INTEL_PMC_CORE) += intel_pmc_core.o intel_pmc_core_pltdrv.o +obj-$(CONFIG_INTEL_PMT_CLASS) += intel_pmt_class.o obj-$(CONFIG_PMC_ATOM) += pmc_atom.o obj-$(CONFIG_MLX_PLATFORM) += mlx-platform.o obj-$(CONFIG_INTEL_TURBO_MAX_3) += intel_turbo_max_3.o diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c new file mode 100644 index 000000000000..aa88dc23bbde --- /dev/null +++ b/drivers/platform/x86/intel_pmt_class.c @@ -0,0 +1,297 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Platform Monitory Technology Telemetry driver + * + * Copyright (c) 2020, Intel Corporation. + * All Rights Reserved. 
+ * + * Author: "Alexander Duyck" + */ + +#include +#include +#include +#include + +#include "intel_pmt_class.h" + +#define PMT_XA_START 0 +#define PMT_XA_MAX INT_MAX +#define PMT_XA_LIMIT XA_LIMIT(PMT_XA_START, PMT_XA_MAX) + +/* + * sysfs + */ +static ssize_t +intel_pmt_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t off, + size_t count) +{ + struct intel_pmt_entry *entry = container_of(attr, + struct intel_pmt_entry, + pmt_bin_attr); + + if (off < 0) + return -EINVAL; + + if (off >= entry->size) + return 0; + + if (count > entry->size - off) + count = entry->size - off; + + memcpy_fromio(buf, entry->base + off, count); + + return count; +} + +static int +intel_pmt_mmap(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, struct vm_area_struct *vma) +{ + struct intel_pmt_entry *entry = container_of(attr, + struct intel_pmt_entry, + pmt_bin_attr); + unsigned long vsize = vma->vm_end - vma->vm_start; + struct device *dev = kobj_to_dev(kobj); + unsigned long phys = entry->base_addr; + unsigned long pfn = PFN_DOWN(phys); + unsigned long psize; + + if (vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) + return -EROFS; + + psize = (PFN_UP(entry->base_addr + entry->size) - pfn) * PAGE_SIZE; + if (vsize > psize) { + dev_err(dev, "Requested mmap size is too large\n"); + return -EINVAL; + } + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + if (io_remap_pfn_range(vma, vma->vm_start, pfn, + vsize, vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +static ssize_t +guid_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct intel_pmt_entry *entry = dev_get_drvdata(dev); + + return sprintf(buf, "0x%x\n", entry->guid); +} +static DEVICE_ATTR_RO(guid); + +static ssize_t size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct intel_pmt_entry *entry = dev_get_drvdata(dev); + + return sprintf(buf, "%zu\n", entry->size); +} +static DEVICE_ATTR_RO(size); + +static ssize_t +offset_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct intel_pmt_entry *entry = dev_get_drvdata(dev); + + return sprintf(buf, "%lu\n", offset_in_page(entry->base_addr)); +} +static DEVICE_ATTR_RO(offset); + +static struct attribute *intel_pmt_attrs[] = { + &dev_attr_guid.attr, + &dev_attr_size.attr, + &dev_attr_offset.attr, + NULL +}; +ATTRIBUTE_GROUPS(intel_pmt); + +static struct class intel_pmt_class = { + .name = "intel_pmt", + .owner = THIS_MODULE, + .dev_groups = intel_pmt_groups, +}; + +static int intel_pmt_populate_entry(struct intel_pmt_entry *entry, + struct intel_pmt_header *header, + struct device *dev, + struct resource *disc_res) +{ + struct pci_dev *pci_dev = to_pci_dev(dev->parent); + u8 bir; + + /* + * The base offset should always be 8 byte aligned. + * + * For non-local access types the lower 3 bits of base offset + * contains the index of the base address register where the + * telemetry can be found. 
+ */ + bir = GET_BIR(header->base_offset); + + /* Local access and BARID only for now */ + switch (header->access_type) { + case ACCESS_LOCAL: + if (bir) { + dev_err(dev, + "Unsupported BAR index %d for access type %d\n", + bir, header->access_type); + return -EINVAL; + } + /* + * For access_type LOCAL, the base address is as follows: + * base address = end of discovery region + base offset + */ + entry->base_addr = disc_res->end + 1 + header->base_offset; + break; + case ACCESS_BARID: + /* + * If another BAR was specified then the base offset + * represents the offset within that BAR. SO retrieve the + * address from the parent PCI device and add offset. + */ + entry->base_addr = pci_resource_start(pci_dev, bir) + + GET_ADDRESS(header->base_offset); + break; + default: + dev_err(dev, "Unsupported access type %d\n", + header->access_type); + return -EINVAL; + } + + entry->guid = header->guid; + entry->size = header->size; + + return 0; +} + +static int intel_pmt_dev_register(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns, + struct device *parent) +{ + struct resource res; + struct device *dev; + int ret; + + ret = xa_alloc(ns->xa, &entry->devid, entry, PMT_XA_LIMIT, GFP_KERNEL); + if (ret) + return ret; + + dev = device_create(&intel_pmt_class, parent, MKDEV(0, 0), entry, + "%s%d", ns->name, entry->devid); + + if (IS_ERR(dev)) { + dev_err(parent, "Could not create %s%d device node\n", + ns->name, entry->devid); + ret = PTR_ERR(dev); + goto fail_dev_create; + } + + entry->kobj = &dev->kobj; + + if (ns->attr_grp) { + ret = sysfs_create_group(entry->kobj, ns->attr_grp); + if (ret) + goto fail_sysfs; + } + + /* if size is 0 assume no data buffer, so no file needed */ + if (!entry->size) + return 0; + + res.start = entry->base_addr; + res.end = res.start + entry->size - 1; + res.flags = IORESOURCE_MEM; + + entry->base = devm_ioremap_resource(dev, &res); + if (IS_ERR(entry->base)) { + ret = PTR_ERR(entry->base); + goto fail_ioremap; + } + + sysfs_bin_attr_init(&entry->pmt_bin_attr); + entry->pmt_bin_attr.attr.name = ns->name; + entry->pmt_bin_attr.attr.mode = 0440; + entry->pmt_bin_attr.mmap = intel_pmt_mmap; + entry->pmt_bin_attr.read = intel_pmt_read; + entry->pmt_bin_attr.size = entry->size; + + ret = sysfs_create_bin_file(&dev->kobj, &entry->pmt_bin_attr); + if (!ret) + return 0; + +fail_ioremap: + sysfs_remove_group(entry->kobj, ns->attr_grp); +fail_sysfs: + device_unregister(dev); +fail_dev_create: + xa_erase(ns->xa, entry->devid); + + return ret; +} + +int intel_pmt_dev_create(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns, + struct platform_device *pdev, int idx) +{ + struct intel_pmt_header header; + struct resource *disc_res; + int ret = -ENODEV; + + disc_res = platform_get_resource(pdev, IORESOURCE_MEM, idx); + if (!disc_res) + return ret; + + entry->disc_table = devm_platform_ioremap_resource(pdev, idx); + if (IS_ERR(entry->disc_table)) + return PTR_ERR(entry->disc_table); + + ret = ns->pmt_header_decode(entry, &header, &pdev->dev); + if (ret) + return ret; + + ret = intel_pmt_populate_entry(entry, &header, &pdev->dev, disc_res); + if (ret) + return ret; + + return intel_pmt_dev_register(entry, ns, &pdev->dev); + +} +EXPORT_SYMBOL_GPL(intel_pmt_dev_create); + +void intel_pmt_dev_destroy(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns) +{ + struct device *dev = kobj_to_dev(entry->kobj); + + if (entry->size) + sysfs_remove_bin_file(entry->kobj, &entry->pmt_bin_attr); + + if (ns->attr_grp) + sysfs_remove_group(entry->kobj, ns->attr_grp); 
+ + device_unregister(dev); + xa_erase(ns->xa, entry->devid); +} +EXPORT_SYMBOL_GPL(intel_pmt_dev_destroy); + +static int __init pmt_class_init(void) +{ + return class_register(&intel_pmt_class); +} + +static void __exit pmt_class_exit(void) +{ + class_unregister(&intel_pmt_class); +} + +module_init(pmt_class_init); +module_exit(pmt_class_exit); + +MODULE_AUTHOR("Alexander Duyck "); +MODULE_DESCRIPTION("Intel PMT Class driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/platform/x86/intel_pmt_class.h b/drivers/platform/x86/intel_pmt_class.h new file mode 100644 index 000000000000..de8f8139ba31 --- /dev/null +++ b/drivers/platform/x86/intel_pmt_class.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _INTEL_PMT_CLASS_H +#define _INTEL_PMT_CLASS_H + +#include +#include +#include +#include +#include +#include + +/* PMT access types */ +#define ACCESS_BARID 2 +#define ACCESS_LOCAL 3 + +/* PMT discovery base address/offset register layout */ +#define GET_BIR(v) ((v) & GENMASK(2, 0)) +#define GET_ADDRESS(v) ((v) & GENMASK(31, 3)) + +struct intel_pmt_entry { + struct bin_attribute pmt_bin_attr; + struct kobject *kobj; + void __iomem *disc_table; + void __iomem *base; + unsigned long base_addr; + size_t size; + u32 guid; + int devid; +}; + +struct intel_pmt_header { + u32 base_offset; + u32 size; + u32 guid; + u8 access_type; +}; + +struct intel_pmt_namespace { + const char *name; + struct xarray *xa; + const struct attribute_group *attr_grp; + int (*pmt_header_decode)(struct intel_pmt_entry *entry, + struct intel_pmt_header *header, + struct device *dev); +}; + +int intel_pmt_dev_create(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns, + struct platform_device *pdev, int idx); +void intel_pmt_dev_destroy(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns); +#endif -- Gitee From 974dba2ff917cd8af61d548334ff381910ad4676 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 4 Jan 2022 17:28:37 +0800 Subject: [PATCH 0869/2161] platform/x86: Intel PMT Telemetry capability driver commit 68fe8e6e2c4b04e2733d77834f55a4a0e172b770 upstream. PMT Telemetry is a capability of the Intel Platform Monitoring Technology. The Telemetry capability provides access to device telemetry metrics that provide hardware performance data to users from read-only register spaces. With this driver present the intel_pmt directory can be populated with telem devices. These devices will contain the standard intel_pmt sysfs data and a "telem" binary sysfs attribute which can be used to access the telemetry data. Also create a PCI device id list for early telemetry hardware that require workarounds for known issues. Signed-off-by: Alexander Duyck Co-developed-by: David E. Box Signed-off-by: David E. Box Reviewed-by: Andy Shevchenko Reviewed-by: Hans de Goede Signed-off-by: Lee Jones Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/platform/x86/Kconfig | 11 ++ drivers/platform/x86/Makefile | 1 + drivers/platform/x86/intel_pmt_telemetry.c | 160 +++++++++++++++++++++ 3 files changed, 172 insertions(+) create mode 100644 drivers/platform/x86/intel_pmt_telemetry.c diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index a043242f9261..efc7c7f2e09f 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1222,6 +1222,17 @@ config INTEL_PMT_CLASS To compile this driver as a module, choose M here: the module will be called intel_pmt_class. 
+config INTEL_PMT_TELEMETRY + tristate "Intel Platform Monitoring Technology (PMT) Telemetry driver" + select INTEL_PMT_CLASS + help + The Intel Platform Monitory Technology (PMT) Telemetry driver provides + access to hardware telemetry metrics on devices that support the + feature. + + To compile this driver as a module, choose M here: the module + will be called intel_pmt_telemetry. + config INTEL_PUNIT_IPC tristate "Intel P-Unit IPC Driver" ---help--- diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index 5cc77a5ed1ff..8835ede470d1 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -92,6 +92,7 @@ obj-$(CONFIG_INTEL_TELEMETRY) += intel_telemetry_core.o \ intel_telemetry_debugfs.o obj-$(CONFIG_INTEL_PMC_CORE) += intel_pmc_core.o intel_pmc_core_pltdrv.o obj-$(CONFIG_INTEL_PMT_CLASS) += intel_pmt_class.o +obj-$(CONFIG_INTEL_PMT_TELEMETRY) += intel_pmt_telemetry.o obj-$(CONFIG_PMC_ATOM) += pmc_atom.o obj-$(CONFIG_MLX_PLATFORM) += mlx-platform.o obj-$(CONFIG_INTEL_TURBO_MAX_3) += intel_turbo_max_3.o diff --git a/drivers/platform/x86/intel_pmt_telemetry.c b/drivers/platform/x86/intel_pmt_telemetry.c new file mode 100644 index 000000000000..f8a87614efa4 --- /dev/null +++ b/drivers/platform/x86/intel_pmt_telemetry.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Platform Monitory Technology Telemetry driver + * + * Copyright (c) 2020, Intel Corporation. + * All Rights Reserved. + * + * Author: "David E. Box" + */ + +#include +#include +#include +#include +#include +#include + +#include "intel_pmt_class.h" + +#define TELEM_DEV_NAME "pmt_telemetry" + +#define TELEM_SIZE_OFFSET 0x0 +#define TELEM_GUID_OFFSET 0x4 +#define TELEM_BASE_OFFSET 0x8 +#define TELEM_ACCESS(v) ((v) & GENMASK(3, 0)) +/* size is in bytes */ +#define TELEM_SIZE(v) (((v) & GENMASK(27, 12)) >> 10) + +/* Used by client hardware to identify a fixed telemetry entry*/ +#define TELEM_CLIENT_FIXED_BLOCK_GUID 0x10000000 + +struct pmt_telem_priv { + int num_entries; + struct intel_pmt_entry entry[]; +}; + +/* + * Early implementations of PMT on client platforms have some + * differences from the server platforms (which use the Out Of Band + * Management Services Module OOBMSM). This list tracks those + * platforms as needed to handle those differences. Newer client + * platforms are expected to be fully compatible with server. 
+ */ +static const struct pci_device_id pmt_telem_early_client_pci_ids[] = { + { PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */ + { PCI_VDEVICE(INTEL, 0x467d) }, /* ADL */ + { } +}; + +static bool intel_pmt_is_early_client_hw(struct device *dev) +{ + struct pci_dev *parent = to_pci_dev(dev->parent); + + return !!pci_match_id(pmt_telem_early_client_pci_ids, parent); +} + +static bool pmt_telem_region_overlaps(struct intel_pmt_entry *entry, + struct device *dev) +{ + u32 guid = readl(entry->disc_table + TELEM_GUID_OFFSET); + + if (guid != TELEM_CLIENT_FIXED_BLOCK_GUID) + return false; + + return intel_pmt_is_early_client_hw(dev); +} + +static int pmt_telem_header_decode(struct intel_pmt_entry *entry, + struct intel_pmt_header *header, + struct device *dev) +{ + void __iomem *disc_table = entry->disc_table; + + if (pmt_telem_region_overlaps(entry, dev)) + return 1; + + header->access_type = TELEM_ACCESS(readl(disc_table)); + header->guid = readl(disc_table + TELEM_GUID_OFFSET); + header->base_offset = readl(disc_table + TELEM_BASE_OFFSET); + + /* Size is measured in DWORDS, but accessor returns bytes */ + header->size = TELEM_SIZE(readl(disc_table)); + + return 0; +} + +static DEFINE_XARRAY_ALLOC(telem_array); +static struct intel_pmt_namespace pmt_telem_ns = { + .name = "telem", + .xa = &telem_array, + .pmt_header_decode = pmt_telem_header_decode, +}; + +static int pmt_telem_remove(struct platform_device *pdev) +{ + struct pmt_telem_priv *priv = platform_get_drvdata(pdev); + int i; + + for (i = 0; i < priv->num_entries; i++) + intel_pmt_dev_destroy(&priv->entry[i], &pmt_telem_ns); + + return 0; +} + +static int pmt_telem_probe(struct platform_device *pdev) +{ + struct pmt_telem_priv *priv; + size_t size; + int i, ret; + + size = struct_size(priv, entry, pdev->num_resources); + priv = devm_kzalloc(&pdev->dev, size, GFP_KERNEL); + if (!priv) + return -ENOMEM; + + platform_set_drvdata(pdev, priv); + + for (i = 0; i < pdev->num_resources; i++) { + struct intel_pmt_entry *entry = &priv->entry[i]; + + ret = intel_pmt_dev_create(entry, &pmt_telem_ns, pdev, i); + if (ret < 0) + goto abort_probe; + if (ret) + continue; + + priv->num_entries++; + } + + return 0; +abort_probe: + pmt_telem_remove(pdev); + return ret; +} + +static struct platform_driver pmt_telem_driver = { + .driver = { + .name = TELEM_DEV_NAME, + }, + .remove = pmt_telem_remove, + .probe = pmt_telem_probe, +}; + +static int __init pmt_telem_init(void) +{ + return platform_driver_register(&pmt_telem_driver); +} +module_init(pmt_telem_init); + +static void __exit pmt_telem_exit(void) +{ + platform_driver_unregister(&pmt_telem_driver); + xa_destroy(&telem_array); +} +module_exit(pmt_telem_exit); + +MODULE_AUTHOR("David E. Box "); +MODULE_DESCRIPTION("Intel PMT Telemetry driver"); +MODULE_ALIAS("platform:" TELEM_DEV_NAME); +MODULE_LICENSE("GPL v2"); -- Gitee From 6db532a7ee3ecca4ec04626e87ff016ead776781 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 4 Jan 2022 17:39:48 +0800 Subject: [PATCH 0870/2161] platform/x86: Intel PMT Crashlog capability driver commit 5ef9998c96b0c99c49c202054586967e609286d2 upstream. Add support for the Intel Platform Monitoring Technology crashlog interface. This interface provides a few sysfs values to allow for controlling the crashlog telemetry interface as well as a character driver to allow for mapping the crashlog memory region so that it can be accessed after a crashlog has been recorded. 
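To make the control flow concrete, here is a hypothetical user-space sketch (not part of the patch) that arms the first crashlog endpoint and requests a log through the sysfs files documented below; the crashlog0 name and the unconditional polling loop are illustrative assumptions.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	char done = '0';

	/* make sure crash data recording is enabled */
	if (write_str("/sys/class/intel_pmt/crashlog0/enable", "1"))
		return 1;

	/* clear any stale log, then request a new one */
	write_str("/sys/class/intel_pmt/crashlog0/trigger", "0");
	if (write_str("/sys/class/intel_pmt/crashlog0/trigger", "1"))
		return 1;

	/* trigger reads back 1 once the log is complete */
	while (done != '1') {
		FILE *f = fopen("/sys/class/intel_pmt/crashlog0/trigger", "r");

		if (!f || fread(&done, 1, 1, f) != 1)
			return 1;
		fclose(f);
		sleep(1);
	}

	/* the buffer can now be read or mmap()ed from
	 * /sys/class/intel_pmt/crashlog0/crashlog
	 */
	return 0;
}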
This driver is meant to only support the server version of the crashlog which is identified as crash_type 1 with a version of zero. Currently no other types are supported. Signed-off-by: Alexander Duyck Signed-off-by: David E. Box Reviewed-by: Hans de Goede Signed-off-by: Lee Jones Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../ABI/testing/sysfs-class-intel_pmt | 65 ++++ drivers/platform/x86/Kconfig | 11 + drivers/platform/x86/Makefile | 1 + drivers/platform/x86/intel_pmt_crashlog.c | 328 ++++++++++++++++++ 4 files changed, 405 insertions(+) create mode 100644 drivers/platform/x86/intel_pmt_crashlog.c diff --git a/Documentation/ABI/testing/sysfs-class-intel_pmt b/Documentation/ABI/testing/sysfs-class-intel_pmt index 926b5cf95fd1..ed4c886a21b1 100644 --- a/Documentation/ABI/testing/sysfs-class-intel_pmt +++ b/Documentation/ABI/testing/sysfs-class-intel_pmt @@ -52,3 +52,68 @@ Contact: David Box Description: (RO) The offset of telemetry region in bytes that corresponds to the mapping for the telem file. + +What: /sys/class/intel_pmt/crashlog +Date: October 2020 +KernelVersion: 5.10 +Contact: Alexander Duyck +Description: + The crashlog directory contains files for configuring an + instance of a PMT crashlog device that can perform crash data + recording. Each crashlog device has an associated crashlog + file. This file can be opened and mapped or read to access the + resulting crashlog buffer. The register layout for the buffer + can be determined from an XML file of specified GUID for the + parent device. + +What: /sys/class/intel_pmt/crashlog/crashlog +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + (RO) The crashlog buffer for this crashlog device. This file + may be mapped or read to obtain the data. + +What: /sys/class/intel_pmt/crashlog/guid +Date: October 2020 +KernelVersion: 5.10 +Contact: Alexander Duyck +Description: + (RO) The GUID for this crashlog device. The GUID identifies the + version of the XML file for the parent device that should be + used to determine the register layout. + +What: /sys/class/intel_pmt/crashlog/size +Date: October 2020 +KernelVersion: 5.10 +Contact: Alexander Duyck +Description: + (RO) The length of the result buffer in bytes that corresponds + to the size for the crashlog buffer. + +What: /sys/class/intel_pmt/crashlog/offset +Date: October 2020 +KernelVersion: 5.10 +Contact: Alexander Duyck +Description: + (RO) The offset of the buffer in bytes that corresponds + to the mapping for the crashlog device. + +What: /sys/class/intel_pmt/crashlog/enable +Date: October 2020 +KernelVersion: 5.10 +Contact: Alexander Duyck +Description: + (RW) Boolean value controlling if the crashlog functionality + is enabled for the crashlog device. + +What: /sys/class/intel_pmt/crashlog/trigger +Date: October 2020 +KernelVersion: 5.10 +Contact: Alexander Duyck +Description: + (RW) Boolean value controlling the triggering of the crashlog + device node. When read it provides data on if the crashlog has + been triggered. When written to it can be used to either clear + the current trigger by writing false, or to trigger a new + event if the trigger is not currently set. diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index efc7c7f2e09f..924794c0464e 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1233,6 +1233,17 @@ config INTEL_PMT_TELEMETRY To compile this driver as a module, choose M here: the module will be called intel_pmt_telemetry. 
+config INTEL_PMT_CRASHLOG + tristate "Intel Platform Monitoring Technology (PMT) Crashlog driver" + select INTEL_PMT_CLASS + help + The Intel Platform Monitoring Technology (PMT) crashlog driver provides + access to hardware crashlog capabilities on devices that support the + feature. + + To compile this driver as a module, choose M here: the module + will be called intel_pmt_crashlog. + config INTEL_PUNIT_IPC tristate "Intel P-Unit IPC Driver" ---help--- diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index 8835ede470d1..da88c2bb3fff 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -93,6 +93,7 @@ obj-$(CONFIG_INTEL_TELEMETRY) += intel_telemetry_core.o \ obj-$(CONFIG_INTEL_PMC_CORE) += intel_pmc_core.o intel_pmc_core_pltdrv.o obj-$(CONFIG_INTEL_PMT_CLASS) += intel_pmt_class.o obj-$(CONFIG_INTEL_PMT_TELEMETRY) += intel_pmt_telemetry.o +obj-$(CONFIG_INTEL_PMT_CRASHLOG) += intel_pmt_crashlog.o obj-$(CONFIG_PMC_ATOM) += pmc_atom.o obj-$(CONFIG_MLX_PLATFORM) += mlx-platform.o obj-$(CONFIG_INTEL_TURBO_MAX_3) += intel_turbo_max_3.o diff --git a/drivers/platform/x86/intel_pmt_crashlog.c b/drivers/platform/x86/intel_pmt_crashlog.c new file mode 100644 index 000000000000..97dd749c8290 --- /dev/null +++ b/drivers/platform/x86/intel_pmt_crashlog.c @@ -0,0 +1,328 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Platform Monitoring Technology Crashlog driver + * + * Copyright (c) 2020, Intel Corporation. + * All Rights Reserved. + * + * Author: "Alexander Duyck" + */ + +#include +#include +#include +#include +#include +#include + +#include "intel_pmt_class.h" + +#define DRV_NAME "pmt_crashlog" + +/* Crashlog discovery header types */ +#define CRASH_TYPE_OOBMSM 1 + +/* Control Flags */ +#define CRASHLOG_FLAG_DISABLE BIT(27) + +/* + * Bits 28 and 29 control the state of bit 31. + * + * Bit 28 will clear bit 31, if set, allowing a new crashlog to be captured. + * Bit 29 will immediately trigger a crashlog to be generated, setting bit 31. + * Bit 30 is read-only and reserved as 0. + * Bit 31 is the read-only status with a 1 indicating log is complete. 
+ */ +#define CRASHLOG_FLAG_TRIGGER_CLEAR BIT(28) +#define CRASHLOG_FLAG_TRIGGER_EXECUTE BIT(29) +#define CRASHLOG_FLAG_TRIGGER_COMPLETE BIT(31) +#define CRASHLOG_FLAG_TRIGGER_MASK GENMASK(31, 28) + +/* Crashlog Discovery Header */ +#define CONTROL_OFFSET 0x0 +#define GUID_OFFSET 0x4 +#define BASE_OFFSET 0x8 +#define SIZE_OFFSET 0xC +#define GET_ACCESS(v) ((v) & GENMASK(3, 0)) +#define GET_TYPE(v) (((v) & GENMASK(7, 4)) >> 4) +#define GET_VERSION(v) (((v) & GENMASK(19, 16)) >> 16) +/* size is in bytes */ +#define GET_SIZE(v) ((v) * sizeof(u32)) + +struct crashlog_entry { + /* entry must be first member of struct */ + struct intel_pmt_entry entry; + struct mutex control_mutex; +}; + +struct pmt_crashlog_priv { + int num_entries; + struct crashlog_entry entry[]; +}; + +/* + * I/O + */ +static bool pmt_crashlog_complete(struct intel_pmt_entry *entry) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + /* return current value of the crashlog complete flag */ + return !!(control & CRASHLOG_FLAG_TRIGGER_COMPLETE); +} + +static bool pmt_crashlog_disabled(struct intel_pmt_entry *entry) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + /* return current value of the crashlog disabled flag */ + return !!(control & CRASHLOG_FLAG_DISABLE); +} + +static bool pmt_crashlog_supported(struct intel_pmt_entry *entry) +{ + u32 discovery_header = readl(entry->disc_table + CONTROL_OFFSET); + u32 crash_type, version; + + crash_type = GET_TYPE(discovery_header); + version = GET_VERSION(discovery_header); + + /* + * Currently we only recognize OOBMSM version 0 devices. + * We can ignore all other crashlog devices in the system. + */ + return crash_type == CRASH_TYPE_OOBMSM && version == 0; +} + +static void pmt_crashlog_set_disable(struct intel_pmt_entry *entry, + bool disable) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + /* clear trigger bits so we are only modifying disable flag */ + control &= ~CRASHLOG_FLAG_TRIGGER_MASK; + + if (disable) + control |= CRASHLOG_FLAG_DISABLE; + else + control &= ~CRASHLOG_FLAG_DISABLE; + + writel(control, entry->disc_table + CONTROL_OFFSET); +} + +static void pmt_crashlog_set_clear(struct intel_pmt_entry *entry) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + control &= ~CRASHLOG_FLAG_TRIGGER_MASK; + control |= CRASHLOG_FLAG_TRIGGER_CLEAR; + + writel(control, entry->disc_table + CONTROL_OFFSET); +} + +static void pmt_crashlog_set_execute(struct intel_pmt_entry *entry) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + control &= ~CRASHLOG_FLAG_TRIGGER_MASK; + control |= CRASHLOG_FLAG_TRIGGER_EXECUTE; + + writel(control, entry->disc_table + CONTROL_OFFSET); +} + +/* + * sysfs + */ +static ssize_t +enable_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct intel_pmt_entry *entry = dev_get_drvdata(dev); + int enabled = !pmt_crashlog_disabled(entry); + + return sprintf(buf, "%d\n", enabled); +} + +static ssize_t +enable_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct crashlog_entry *entry; + bool enabled; + int result; + + entry = dev_get_drvdata(dev); + + result = kstrtobool(buf, &enabled); + if (result) + return result; + + mutex_lock(&entry->control_mutex); + pmt_crashlog_set_disable(&entry->entry, !enabled); + mutex_unlock(&entry->control_mutex); + + return count; +} +static DEVICE_ATTR_RW(enable); + +static ssize_t +trigger_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct intel_pmt_entry 
*entry; + int trigger; + + entry = dev_get_drvdata(dev); + trigger = pmt_crashlog_complete(entry); + + return sprintf(buf, "%d\n", trigger); +} + +static ssize_t +trigger_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct crashlog_entry *entry; + bool trigger; + int result; + + entry = dev_get_drvdata(dev); + + result = kstrtobool(buf, &trigger); + if (result) + return result; + + mutex_lock(&entry->control_mutex); + + if (!trigger) { + pmt_crashlog_set_clear(&entry->entry); + } else if (pmt_crashlog_complete(&entry->entry)) { + /* we cannot trigger a new crash if one is still pending */ + result = -EEXIST; + goto err; + } else if (pmt_crashlog_disabled(&entry->entry)) { + /* if device is currently disabled, return busy */ + result = -EBUSY; + goto err; + } else { + pmt_crashlog_set_execute(&entry->entry); + } + + result = count; +err: + mutex_unlock(&entry->control_mutex); + return result; +} +static DEVICE_ATTR_RW(trigger); + +static struct attribute *pmt_crashlog_attrs[] = { + &dev_attr_enable.attr, + &dev_attr_trigger.attr, + NULL +}; + +static struct attribute_group pmt_crashlog_group = { + .attrs = pmt_crashlog_attrs, +}; + +static int pmt_crashlog_header_decode(struct intel_pmt_entry *entry, + struct intel_pmt_header *header, + struct device *dev) +{ + void __iomem *disc_table = entry->disc_table; + struct crashlog_entry *crashlog; + + if (!pmt_crashlog_supported(entry)) + return 1; + + /* initialize control mutex */ + crashlog = container_of(entry, struct crashlog_entry, entry); + mutex_init(&crashlog->control_mutex); + + header->access_type = GET_ACCESS(readl(disc_table)); + header->guid = readl(disc_table + GUID_OFFSET); + header->base_offset = readl(disc_table + BASE_OFFSET); + + /* Size is measured in DWORDS, but accessor returns bytes */ + header->size = GET_SIZE(readl(disc_table + SIZE_OFFSET)); + + return 0; +} + +static DEFINE_XARRAY_ALLOC(crashlog_array); +static struct intel_pmt_namespace pmt_crashlog_ns = { + .name = "crashlog", + .xa = &crashlog_array, + .attr_grp = &pmt_crashlog_group, + .pmt_header_decode = pmt_crashlog_header_decode, +}; + +/* + * initialization + */ +static int pmt_crashlog_remove(struct platform_device *pdev) +{ + struct pmt_crashlog_priv *priv = platform_get_drvdata(pdev); + int i; + + for (i = 0; i < priv->num_entries; i++) + intel_pmt_dev_destroy(&priv->entry[i].entry, &pmt_crashlog_ns); + + return 0; +} + +static int pmt_crashlog_probe(struct platform_device *pdev) +{ + struct pmt_crashlog_priv *priv; + size_t size; + int i, ret; + + size = struct_size(priv, entry, pdev->num_resources); + priv = devm_kzalloc(&pdev->dev, size, GFP_KERNEL); + if (!priv) + return -ENOMEM; + + platform_set_drvdata(pdev, priv); + + for (i = 0; i < pdev->num_resources; i++) { + struct intel_pmt_entry *entry = &priv->entry[i].entry; + + ret = intel_pmt_dev_create(entry, &pmt_crashlog_ns, pdev, i); + if (ret < 0) + goto abort_probe; + if (ret) + continue; + + priv->num_entries++; + } + + return 0; +abort_probe: + pmt_crashlog_remove(pdev); + return ret; +} + +static struct platform_driver pmt_crashlog_driver = { + .driver = { + .name = DRV_NAME, + }, + .remove = pmt_crashlog_remove, + .probe = pmt_crashlog_probe, +}; + +static int __init pmt_crashlog_init(void) +{ + return platform_driver_register(&pmt_crashlog_driver); +} + +static void __exit pmt_crashlog_exit(void) +{ + platform_driver_unregister(&pmt_crashlog_driver); + xa_destroy(&crashlog_array); +} + +module_init(pmt_crashlog_init); 
+module_exit(pmt_crashlog_exit); + +MODULE_AUTHOR("Alexander Duyck "); +MODULE_DESCRIPTION("Intel PMT Crashlog driver"); +MODULE_ALIAS("platform:" DRV_NAME); +MODULE_LICENSE("GPL v2"); -- Gitee From de166ae81235654847f69586b7a9ece1e1cb8a0c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 17 Nov 2020 10:22:51 +0300 Subject: [PATCH 0871/2161] platform/x86: pmt: Fix a potential Oops on error in probe commit d3d73d25e0d9bc43fd2a6f4b4e58ff182e55b217 upstream. The "ns->attr_grp" pointer can be NULL so this error handling code needs to check for that to avoid an Oops. Fixes: e2729113ce66 ("platform/x86: Intel PMT class driver") Signed-off-by: Dan Carpenter Reviewed-by: David E. Box Link: https://lore.kernel.org/r/20201117072251.GC1111239@mwanda Signed-off-by: Hans de Goede Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/platform/x86/intel_pmt_class.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c index aa88dc23bbde..c8939fba4509 100644 --- a/drivers/platform/x86/intel_pmt_class.c +++ b/drivers/platform/x86/intel_pmt_class.c @@ -225,7 +225,8 @@ static int intel_pmt_dev_register(struct intel_pmt_entry *entry, return 0; fail_ioremap: - sysfs_remove_group(entry->kobj, ns->attr_grp); + if (ns->attr_grp) + sysfs_remove_group(entry->kobj, ns->attr_grp); fail_sysfs: device_unregister(dev); fail_dev_create: -- Gitee From 0f33ae817b7d7eead9f9c3b8d4e33951d90c5971 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 23 Nov 2020 08:36:12 -0700 Subject: [PATCH 0872/2161] ntb: intel: add Intel NTB LTR vendor support for gen4 NTB commit 75b6f6487cedd0e4c8e07d68b68b8f85cd352bfe upstream. Intel NTB device has custom LTR management that is not compliant with the PCIe standard. Add support to set LTR status triggered by link status change. 
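A note on the LTR programming this patch adds: the active and idle latency-tolerance values pack the same 16-bit snoop encoding into both halves of one 32-bit register, with the non-snoop copy shifted into the upper 16 bits. The stand-alone sketch below only illustrates that composition using the constants the patch defines; the pack_ltr() helper and the main() harness are not part of the driver.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Field values as defined by the patch below (copied here for illustration). */
#define NTB_LTR_NS_SHIFT        16
#define NTB_LTR_ACTIVE_VAL      0x0000  /* 0 us */
#define NTB_LTR_ACTIVE_LATSCALE 0x0800  /* 1 us scale */
#define NTB_LTR_ACTIVE_REQMNT   0x8000  /* snoop requirement enable */
#define NTB_LTR_IDLE_VAL        0x0258  /* 600 us */
#define NTB_LTR_IDLE_LATSCALE   0x0800  /* 1 us scale */
#define NTB_LTR_IDLE_REQMNT     0x8000  /* snoop requirement enable */

/* Duplicate the 16-bit snoop encoding into the non-snoop upper half. */
static uint32_t pack_ltr(uint16_t snoop)
{
	return ((uint32_t)snoop << NTB_LTR_NS_SHIFT) | snoop;
}

int main(void)
{
	uint32_t active = pack_ltr(NTB_LTR_ACTIVE_REQMNT |
				   NTB_LTR_ACTIVE_LATSCALE |
				   NTB_LTR_ACTIVE_VAL);
	uint32_t idle = pack_ltr(NTB_LTR_IDLE_REQMNT |
				 NTB_LTR_IDLE_LATSCALE |
				 NTB_LTR_IDLE_VAL);

	/* The dwords the driver writes to the active/idle LTR registers. */
	assert(active == 0x88008800);
	assert(idle == 0x8a588a58);
	printf("active=0x%08x idle=0x%08x\n", active, idle);
	return 0;
}

Built as ordinary user-space C, this should print the exact values written to GEN4_LTR_ACTIVE_OFFSET and GEN4_LTR_IDLE_OFFSET when LTR is supported.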
Signed-off-by: Dave Jiang Signed-off-by: Jon Mason Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/ntb/hw/intel/ntb_hw_gen1.h | 1 + drivers/ntb/hw/intel/ntb_hw_gen4.c | 27 ++++++++++++++++++++++++++- drivers/ntb/hw/intel/ntb_hw_gen4.h | 15 +++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/drivers/ntb/hw/intel/ntb_hw_gen1.h b/drivers/ntb/hw/intel/ntb_hw_gen1.h index 1b759942d8af..344249fc18d1 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen1.h +++ b/drivers/ntb/hw/intel/ntb_hw_gen1.h @@ -141,6 +141,7 @@ #define NTB_HWERR_B2BDOORBELL_BIT14 BIT_ULL(2) #define NTB_HWERR_MSIX_VECTOR32_BAD BIT_ULL(3) #define NTB_HWERR_BAR_ALIGN BIT_ULL(4) +#define NTB_HWERR_LTR_BAD BIT_ULL(5) extern struct intel_b2b_addr xeon_b2b_usd_addr; extern struct intel_b2b_addr xeon_b2b_dsd_addr; diff --git a/drivers/ntb/hw/intel/ntb_hw_gen4.c b/drivers/ntb/hw/intel/ntb_hw_gen4.c index bc4541cbf8c6..fede05151f69 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen4.c +++ b/drivers/ntb/hw/intel/ntb_hw_gen4.c @@ -177,8 +177,10 @@ int gen4_init_dev(struct intel_ntb_dev *ndev) ndev->reg = &gen4_reg; - if (pdev_is_ICX(pdev)) + if (pdev_is_ICX(pdev)) { ndev->hwerr_flags |= NTB_HWERR_BAR_ALIGN; + ndev->hwerr_flags |= NTB_HWERR_LTR_BAD; + } ppd1 = ioread32(ndev->self_mmio + GEN4_PPD1_OFFSET); ndev->ntb.topo = gen4_ppd_topo(ndev, ppd1); @@ -431,6 +433,25 @@ static int intel_ntb4_link_enable(struct ntb_dev *ntb, dev_dbg(&ntb->pdev->dev, "ignoring max_width %d\n", max_width); + if (!(ndev->hwerr_flags & NTB_HWERR_LTR_BAD)) { + u32 ltr; + + /* Setup active snoop LTR values */ + ltr = NTB_LTR_ACTIVE_REQMNT | NTB_LTR_ACTIVE_VAL | NTB_LTR_ACTIVE_LATSCALE; + /* Setup active non-snoop values */ + ltr = (ltr << NTB_LTR_NS_SHIFT) | ltr; + iowrite32(ltr, ndev->self_mmio + GEN4_LTR_ACTIVE_OFFSET); + + /* Setup idle snoop LTR values */ + ltr = NTB_LTR_IDLE_VAL | NTB_LTR_IDLE_LATSCALE | NTB_LTR_IDLE_REQMNT; + /* Setup idle non-snoop values */ + ltr = (ltr << NTB_LTR_NS_SHIFT) | ltr; + iowrite32(ltr, ndev->self_mmio + GEN4_LTR_IDLE_OFFSET); + + /* setup PCIe LTR to active */ + iowrite8(NTB_LTR_SWSEL_ACTIVE, ndev->self_mmio + GEN4_LTR_SWSEL_OFFSET); + } + ntb_ctl = NTB_CTL_E2I_BAR23_SNOOP | NTB_CTL_I2E_BAR23_SNOOP; ntb_ctl |= NTB_CTL_E2I_BAR45_SNOOP | NTB_CTL_I2E_BAR45_SNOOP; iowrite32(ntb_ctl, ndev->self_mmio + ndev->reg->ntb_ctl); @@ -476,6 +497,10 @@ static int intel_ntb4_link_disable(struct ntb_dev *ntb) lnkctl |= GEN4_LINK_CTRL_LINK_DISABLE; iowrite16(lnkctl, ndev->self_mmio + GEN4_LINK_CTRL_OFFSET); + /* set LTR to idle */ + if (!(ndev->hwerr_flags & NTB_HWERR_LTR_BAD)) + iowrite8(NTB_LTR_SWSEL_IDLE, ndev->self_mmio + GEN4_LTR_SWSEL_OFFSET); + ndev->dev_up = 0; return 0; diff --git a/drivers/ntb/hw/intel/ntb_hw_gen4.h b/drivers/ntb/hw/intel/ntb_hw_gen4.h index a868c788de02..3fcd3fdce9ed 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen4.h +++ b/drivers/ntb/hw/intel/ntb_hw_gen4.h @@ -35,6 +35,9 @@ #define GEN4_IM_SPAD_SEM_OFFSET 0x00c0 /* SPAD hw semaphore */ #define GEN4_IM_SPAD_STICKY_OFFSET 0x00c4 /* sticky SPAD */ #define GEN4_IM_DOORBELL_OFFSET 0x0100 /* 0-31 doorbells */ +#define GEN4_LTR_SWSEL_OFFSET 0x30ec +#define GEN4_LTR_ACTIVE_OFFSET 0x30f0 +#define GEN4_LTR_IDLE_OFFSET 0x30f4 #define GEN4_EM_SPAD_OFFSET 0x8080 /* note, link status is now in MMIO and not config space for NTB */ #define GEN4_LINK_CTRL_OFFSET 0xb050 @@ -80,6 +83,18 @@ #define NTB_SJC_FORCEDETECT 0x000004 +#define NTB_LTR_SWSEL_ACTIVE 0x0 +#define NTB_LTR_SWSEL_IDLE 0x1 + +#define NTB_LTR_NS_SHIFT 16 +#define NTB_LTR_ACTIVE_VAL 0x0000 /* 0 
us */ +#define NTB_LTR_ACTIVE_LATSCALE 0x0800 /* 1us scale */ +#define NTB_LTR_ACTIVE_REQMNT 0x8000 /* snoop req enable */ + +#define NTB_LTR_IDLE_VAL 0x0258 /* 600 us */ +#define NTB_LTR_IDLE_LATSCALE 0x0800 /* 1us scale */ +#define NTB_LTR_IDLE_REQMNT 0x8000 /* snoop req enable */ + ssize_t ndev_ntb4_debugfs_read(struct file *filp, char __user *ubuf, size_t count, loff_t *offp); int gen4_init_dev(struct intel_ntb_dev *ndev); -- Gitee From 6e3b9cb0a87171f0b73ce85975bf1a553217fbbd Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 4 Jan 2022 18:02:38 +0800 Subject: [PATCH 0873/2161] x86: Enumerate AVX512 FP16 CPUID feature flag commit e1b35da5e624f8b09d2e98845c2e4c84b179d9a4 upstream. Enumerate AVX512 Half-precision floating point (FP16) CPUID feature flag. Compared with using FP32, using FP16 cut the number of bits required for storage in half, reducing the exponent from 8 bits to 5, and the mantissa from 23 bits to 10. Using FP16 also enables developers to train and run inference on deep learning models fast when all precision or magnitude (FP32) is not needed. A processor supports AVX512 FP16 if CPUID.(EAX=7,ECX=0):EDX[bit 23] is present. The AVX512 FP16 requires AVX512BW feature be implemented since the instructions for manipulating 32bit masks are associated with AVX512BW. The only in-kernel usage of this is kvm passthrough. The CPU feature flag is shown as "avx512_fp16" in /proc/cpuinfo. Signed-off-by: Kyung Min Park Acked-by: Dave Hansen Reviewed-by: Tony Luck Message-Id: <20201208033441.28207-2-kyung.min.park@intel.com> Acked-by: Borislav Petkov Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/cpuid-deps.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 63bd1c3ef774..1bf62f7fd480 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -370,6 +370,7 @@ #define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ +#define X86_FEATURE_AVX512_FP16 (18*32+23) /* AVX512 FP16 */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 0a160870bc4b..4e2d4809e7b0 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -69,6 +69,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, -- Gitee From 0fddba5342adafc73e6d16b20c46affdcf8b6a49 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Tue, 26 Jan 2021 12:55:06 -0800 Subject: [PATCH 0874/2161] platform/x86: intel_pmt: Make INTEL_PMT_CLASS non-user-selectable commit 35d8a973fe4d38afee944db636c3d2b1df3741a7 upstream. Fix error in Kconfig that exposed INTEL_PMT_CLASS as a user selectable option. 
It is already selected by INTEL_PMT_TELEMETRY and INTEL_PMT_CRASHLOG which are user selectable. Fixes: e2729113ce66 ("platform/x86: Intel PMT class driver") Signed-off-by: David E. Box Link: https://lore.kernel.org/r/20210126205508.30907-1-david.e.box@linux.intel.com Signed-off-by: Hans de Goede Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/platform/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 924794c0464e..d57bab6e5ddf 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1211,7 +1211,7 @@ config SURFACE_3_BUTTON This driver handles the power/home/volume buttons on the Microsoft Surface 3 tablet. config INTEL_PMT_CLASS - tristate "Intel Platform Monitoring Technology (PMT) Class driver" + tristate help The Intel Platform Monitoring Technology (PMT) class driver provides the basic sysfs interface and file hierarchy uses by PMT devices. -- Gitee From 9fdb4535bdd35a20852e0dc92c4f2f009f0b4dbf Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Tue, 26 Jan 2021 12:55:07 -0800 Subject: [PATCH 0875/2161] platform/x86: intel_pmt_telemetry: Add dependency on MFD_INTEL_PMT commit f3f6da5014dea3cc005b36948abe3664b5d1f7d3 upstream. All devices that expose Intel Platform Monitoring Technology (PMT) telemetry are currently owned by the intel_pmt MFD driver. Therefore make the telemetry driver depend on the MFD driver for build. Fixes: 68fe8e6e2c4b ("platform/x86: Intel PMT Telemetry capability driver") Signed-off-by: David E. Box Link: https://lore.kernel.org/r/20210126205508.30907-2-david.e.box@linux.intel.com Signed-off-by: Hans de Goede Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/platform/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index d57bab6e5ddf..dd126142531c 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1224,6 +1224,7 @@ config INTEL_PMT_CLASS config INTEL_PMT_TELEMETRY tristate "Intel Platform Monitoring Technology (PMT) Telemetry driver" + depends on MFD_INTEL_PMT select INTEL_PMT_CLASS help The Intel Platform Monitory Technology (PMT) Telemetry driver provides -- Gitee From 8727d9220bd8721b71d5a15e94e1d4b5c23be420 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Tue, 26 Jan 2021 12:55:08 -0800 Subject: [PATCH 0876/2161] platform/x86: intel_pmt_crashlog: Add dependency on MFD_INTEL_PMT commit fdd3feb37e36bec2ad75d76f8ac4d0273c5c0a91 upstream. All devices that expose Intel Platform Monitoring Technology (PMT) crashlog are currently owned by the intel_pmt MFD driver. Therefore make the crashlog driver depend on the MFD driver for build. Fixes: 5ef9998c96b0 ("platform/x86: Intel PMT Crashlog capability driver") Signed-off-by: David E. 
Box Link: https://lore.kernel.org/r/20210126205508.30907-3-david.e.box@linux.intel.com Signed-off-by: Hans de Goede Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/platform/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index dd126142531c..55aba46d24d9 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1236,6 +1236,7 @@ config INTEL_PMT_TELEMETRY config INTEL_PMT_CRASHLOG tristate "Intel Platform Monitoring Technology (PMT) Crashlog driver" + depends on MFD_INTEL_PMT select INTEL_PMT_CLASS help The Intel Platform Monitoring Technology (PMT) crashlog driver provides -- Gitee From e4867ef325812ea0ccb4c896790e43a2c7e1f89a Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 4 Jan 2022 20:43:29 +0800 Subject: [PATCH 0877/2161] Enumerate AVX Vector Neural Network instructions commit b85a0425d8056f3bd8d0a94ecdddf2a39d32a801 upstream. Add AVX version of the Vector Neural Network (VNNI) Instructions. A processor supports AVX VNNI instructions if CPUID.0x07.0x1:EAX[4] is present. The following instructions are available when this feature is present. 1. VPDPBUS: Multiply and Add Unsigned and Signed Bytes 2. VPDPBUSDS: Multiply and Add Unsigned and Signed Bytes with Saturation 3. VPDPWSSD: Multiply and Add Signed Word Integers 4. VPDPWSSDS: Multiply and Add Signed Integers with Saturation The only in-kernel usage of this is kvm passthrough. The CPU feature flag is shown as "avx_vnni" in /proc/cpuinfo. This instruction is currently documented in the latest "extensions" manual (ISE). It will appear in the "main" manual (SDM) in the future. Signed-off-by: Kyung Min Park Signed-off-by: Yang Zhong Reviewed-by: Tony Luck Message-Id: <20210105004909.42000-2-yang.zhong@intel.com> Acked-by: Borislav Petkov Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 1bf62f7fd480..ae1dbfcbd5cb 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -288,6 +288,7 @@ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ +#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ #define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */ -- Gitee From ca20dee28b39a60bfa54aadb6224b7ac66003360 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 24 Feb 2021 12:10:04 -0800 Subject: [PATCH 0878/2161] mfd: intel_pmt: Fix nuisance messages and handling of disabled capabilities commit a1a5c1c3df282dc122508a17500317266ef19e46 upstream. Some products will be available that have PMT capabilities that are not supported. Remove the warnings in this instance to avoid nuisance messages and confusion. Also return an error code for capabilities that are disabled by quirk to prevent them from keeping the driver loaded if only disabled capabilities are found. Fixes: 4f8217d5b0ca ("mfd: Intel Platform Monitoring Technology support") Signed-off-by: David E. 
Box Reviewed-by: Hans de Goede Signed-off-by: Lee Jones Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/mfd/intel_pmt.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/mfd/intel_pmt.c b/drivers/mfd/intel_pmt.c index 744b230cdcca..65da2b17a204 100644 --- a/drivers/mfd/intel_pmt.c +++ b/drivers/mfd/intel_pmt.c @@ -79,19 +79,18 @@ static int pmt_add_dev(struct pci_dev *pdev, struct intel_dvsec_header *header, case DVSEC_INTEL_ID_WATCHER: if (quirks & PMT_QUIRK_NO_WATCHER) { dev_info(dev, "Watcher not supported\n"); - return 0; + return -EINVAL; } name = "pmt_watcher"; break; case DVSEC_INTEL_ID_CRASHLOG: if (quirks & PMT_QUIRK_NO_CRASHLOG) { dev_info(dev, "Crashlog not supported\n"); - return 0; + return -EINVAL; } name = "pmt_crashlog"; break; default: - dev_err(dev, "Unrecognized PMT capability: %d\n", id); return -EINVAL; } @@ -174,12 +173,8 @@ static int pmt_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) header.offset = INTEL_DVSEC_TABLE_OFFSET(table); ret = pmt_add_dev(pdev, &header, quirks); - if (ret) { - dev_warn(&pdev->dev, - "Failed to add device for DVSEC id %d\n", - header.id); + if (ret) continue; - } found_devices = true; } while (true); -- Gitee From c82b2ec593a0036a402bb632b10efa14a843c0d6 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 24 Feb 2021 12:10:05 -0800 Subject: [PATCH 0879/2161] mfd: intel_pmt: Add support for DG1 commit aa47ad3f853ae72c32b7e46dfc8bc2c8dc2dbad7 upstream. Adds PMT Telemetry aggregator support for the DG1 graphics PCIe card. The device does not have the DVSEC region in its PCI config space so hard code the discovery table data in the driver. Also requires a fix for DG1 in the Telemetry driver for how the ACCESS_TYPE field is used. Signed-off-by: David E. 
Box Reviewed-by: Hans de Goede Signed-off-by: Lee Jones Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/mfd/intel_pmt.c | 101 +++++++++++++++------ drivers/platform/x86/intel_pmt_class.c | 46 ++++++++++ drivers/platform/x86/intel_pmt_class.h | 1 + drivers/platform/x86/intel_pmt_telemetry.c | 20 ---- 4 files changed, 119 insertions(+), 49 deletions(-) diff --git a/drivers/mfd/intel_pmt.c b/drivers/mfd/intel_pmt.c index 65da2b17a204..dd7eb614c28e 100644 --- a/drivers/mfd/intel_pmt.c +++ b/drivers/mfd/intel_pmt.c @@ -49,10 +49,14 @@ enum pmt_quirks { /* Use shift instead of mask to read discovery table offset */ PMT_QUIRK_TABLE_SHIFT = BIT(2), + + /* DVSEC not present (provided in driver data) */ + PMT_QUIRK_NO_DVSEC = BIT(3), }; struct pmt_platform_info { unsigned long quirks; + struct intel_dvsec_header **capabilities; }; static const struct pmt_platform_info tgl_info = { @@ -60,6 +64,26 @@ static const struct pmt_platform_info tgl_info = { PMT_QUIRK_TABLE_SHIFT, }; +/* DG1 Platform with DVSEC quirk*/ +static struct intel_dvsec_header dg1_telemetry = { + .length = 0x10, + .id = 2, + .num_entries = 1, + .entry_size = 3, + .tbir = 0, + .offset = 0x466000, +}; + +static struct intel_dvsec_header *dg1_capabilities[] = { + &dg1_telemetry, + NULL +}; + +static const struct pmt_platform_info dg1_info = { + .quirks = PMT_QUIRK_NO_DVSEC, + .capabilities = dg1_capabilities, +}; + static int pmt_add_dev(struct pci_dev *pdev, struct intel_dvsec_header *header, unsigned long quirks) { @@ -147,37 +171,54 @@ static int pmt_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (info) quirks = info->quirks; - do { - struct intel_dvsec_header header; - u32 table; - u16 vid; + if (info && (info->quirks & PMT_QUIRK_NO_DVSEC)) { + struct intel_dvsec_header **header; - pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DVSEC); - if (!pos) - break; + header = info->capabilities; + while (*header) { + ret = pmt_add_dev(pdev, *header, quirks); + if (ret) + dev_warn(&pdev->dev, + "Failed to add device for DVSEC id %d\n", + (*header)->id); + else + found_devices = true; - pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vid); - if (vid != PCI_VENDOR_ID_INTEL) - continue; - - pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2, - &header.id); - pci_read_config_byte(pdev, pos + INTEL_DVSEC_ENTRIES, - &header.num_entries); - pci_read_config_byte(pdev, pos + INTEL_DVSEC_SIZE, - &header.entry_size); - pci_read_config_dword(pdev, pos + INTEL_DVSEC_TABLE, - &table); - - header.tbir = INTEL_DVSEC_TABLE_BAR(table); - header.offset = INTEL_DVSEC_TABLE_OFFSET(table); - - ret = pmt_add_dev(pdev, &header, quirks); - if (ret) - continue; - - found_devices = true; - } while (true); + ++header; + } + } else { + do { + struct intel_dvsec_header header; + u32 table; + u16 vid; + + pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DVSEC); + if (!pos) + break; + + pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vid); + if (vid != PCI_VENDOR_ID_INTEL) + continue; + + pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2, + &header.id); + pci_read_config_byte(pdev, pos + INTEL_DVSEC_ENTRIES, + &header.num_entries); + pci_read_config_byte(pdev, pos + INTEL_DVSEC_SIZE, + &header.entry_size); + pci_read_config_dword(pdev, pos + INTEL_DVSEC_TABLE, + &table); + + header.tbir = INTEL_DVSEC_TABLE_BAR(table); + header.offset = INTEL_DVSEC_TABLE_OFFSET(table); + + ret = pmt_add_dev(pdev, &header, quirks); + if (ret) + continue; + + found_devices = true; + } while (true); + } if (!found_devices) 
return -ENODEV; @@ -195,10 +236,12 @@ static void pmt_pci_remove(struct pci_dev *pdev) } #define PCI_DEVICE_ID_INTEL_PMT_ADL 0x467d +#define PCI_DEVICE_ID_INTEL_PMT_DG1 0x490e #define PCI_DEVICE_ID_INTEL_PMT_OOBMSM 0x09a7 #define PCI_DEVICE_ID_INTEL_PMT_TGL 0x9a0d static const struct pci_device_id pmt_pci_ids[] = { { PCI_DEVICE_DATA(INTEL, PMT_ADL, &tgl_info) }, + { PCI_DEVICE_DATA(INTEL, PMT_DG1, &dg1_info) }, { PCI_DEVICE_DATA(INTEL, PMT_OOBMSM, NULL) }, { PCI_DEVICE_DATA(INTEL, PMT_TGL, &tgl_info) }, { } diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c index c8939fba4509..228e21f1ce5c 100644 --- a/drivers/platform/x86/intel_pmt_class.c +++ b/drivers/platform/x86/intel_pmt_class.c @@ -19,6 +19,28 @@ #define PMT_XA_MAX INT_MAX #define PMT_XA_LIMIT XA_LIMIT(PMT_XA_START, PMT_XA_MAX) +/* + * Early implementations of PMT on client platforms have some + * differences from the server platforms (which use the Out Of Band + * Management Services Module OOBMSM). This list tracks those + * platforms as needed to handle those differences. Newer client + * platforms are expected to be fully compatible with server. + */ +static const struct pci_device_id pmt_telem_early_client_pci_ids[] = { + { PCI_VDEVICE(INTEL, 0x467d) }, /* ADL */ + { PCI_VDEVICE(INTEL, 0x490e) }, /* DG1 */ + { PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */ + { } +}; + +bool intel_pmt_is_early_client_hw(struct device *dev) +{ + struct pci_dev *parent = to_pci_dev(dev->parent); + + return !!pci_match_id(pmt_telem_early_client_pci_ids, parent); +} +EXPORT_SYMBOL_GPL(intel_pmt_is_early_client_hw); + /* * sysfs */ @@ -147,6 +169,30 @@ static int intel_pmt_populate_entry(struct intel_pmt_entry *entry, * base address = end of discovery region + base offset */ entry->base_addr = disc_res->end + 1 + header->base_offset; + + /* + * Some hardware use a different calculation for the base address + * when access_type == ACCESS_LOCAL. On the these systems + * ACCCESS_LOCAL refers to an address in the same BAR as the + * header but at a fixed offset. But as the header address was + * supplied to the driver, we don't know which BAR it was in. + * So search for the bar whose range includes the header address. 
+ */ + if (intel_pmt_is_early_client_hw(dev)) { + int i; + + entry->base_addr = 0; + for (i = 0; i < 6; i++) + if (disc_res->start >= pci_resource_start(pci_dev, i) && + (disc_res->start <= pci_resource_end(pci_dev, i))) { + entry->base_addr = pci_resource_start(pci_dev, i) + + header->base_offset; + break; + } + if (!entry->base_addr) + return -EINVAL; + } + break; case ACCESS_BARID: /* diff --git a/drivers/platform/x86/intel_pmt_class.h b/drivers/platform/x86/intel_pmt_class.h index de8f8139ba31..1337019c2873 100644 --- a/drivers/platform/x86/intel_pmt_class.h +++ b/drivers/platform/x86/intel_pmt_class.h @@ -44,6 +44,7 @@ struct intel_pmt_namespace { struct device *dev); }; +bool intel_pmt_is_early_client_hw(struct device *dev); int intel_pmt_dev_create(struct intel_pmt_entry *entry, struct intel_pmt_namespace *ns, struct platform_device *pdev, int idx); diff --git a/drivers/platform/x86/intel_pmt_telemetry.c b/drivers/platform/x86/intel_pmt_telemetry.c index f8a87614efa4..9b95ef050457 100644 --- a/drivers/platform/x86/intel_pmt_telemetry.c +++ b/drivers/platform/x86/intel_pmt_telemetry.c @@ -34,26 +34,6 @@ struct pmt_telem_priv { struct intel_pmt_entry entry[]; }; -/* - * Early implementations of PMT on client platforms have some - * differences from the server platforms (which use the Out Of Band - * Management Services Module OOBMSM). This list tracks those - * platforms as needed to handle those differences. Newer client - * platforms are expected to be fully compatible with server. - */ -static const struct pci_device_id pmt_telem_early_client_pci_ids[] = { - { PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */ - { PCI_VDEVICE(INTEL, 0x467d) }, /* ADL */ - { } -}; - -static bool intel_pmt_is_early_client_hw(struct device *dev) -{ - struct pci_dev *parent = to_pci_dev(dev->parent); - - return !!pci_match_id(pmt_telem_early_client_pci_ids, parent); -} - static bool pmt_telem_region_overlaps(struct intel_pmt_entry *entry, struct device *dev) { -- Gitee From 4c017dc549d0c51afe58d5f260b9867ad1978ac1 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Tue, 16 Mar 2021 19:44:54 -0700 Subject: [PATCH 0880/2161] platform/x86: intel_pmt_class: Initial resource to 0 commit 501bb68a66cfc0bc2a2458483400cb49daca974f upstream. Initialize the struct resource in intel_pmt_dev_register to zero to avoid a fault should the char *name field be non-zero. Signed-off-by: David E. Box Link: https://lore.kernel.org/r/20210317024455.3071477-1-david.e.box@linux.intel.com Signed-off-by: Hans de Goede Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/platform/x86/intel_pmt_class.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c index 228e21f1ce5c..c86ff15b1ed5 100644 --- a/drivers/platform/x86/intel_pmt_class.c +++ b/drivers/platform/x86/intel_pmt_class.c @@ -219,7 +219,7 @@ static int intel_pmt_dev_register(struct intel_pmt_entry *entry, struct intel_pmt_namespace *ns, struct device *parent) { - struct resource res; + struct resource res = {0}; struct device *dev; int ret; -- Gitee From 32da008ef79da7125c3a782749d4cbf5d1f1fbc8 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Tue, 16 Mar 2021 19:44:55 -0700 Subject: [PATCH 0881/2161] platform/x86: intel_pmt_crashlog: Fix incorrect macros commit d4dc4bf28263f25e0907072ce163dd454c6aa51a upstream. Fixes off-by-one bugs in the macro assignments for the crashlog control bits. Was initially tested on emulation but bug revealed after testing on silicon. 
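For reference, with the corrected layout bit 28 disables capture, bit 29 clears a completed log, bit 30 manually triggers one, and bit 31 is the read-only completion status. The user-space sketch below only decodes a control word under that layout; the macro and function names are illustrative, not the driver's.

#include <stdint.h>
#include <stdio.h>

#define BIT(n)			(1u << (n))
#define CRASHLOG_DISABLE	BIT(28)	/* crashlog capture disabled */
#define CRASHLOG_CLEAR		BIT(29)	/* clear the complete status */
#define CRASHLOG_EXECUTE	BIT(30)	/* manually trigger a crashlog */
#define CRASHLOG_COMPLETE	BIT(31)	/* read-only: log is complete */

static void decode_control(uint32_t control)
{
	printf("disabled=%d clear=%d execute=%d complete=%d\n",
	       !!(control & CRASHLOG_DISABLE),
	       !!(control & CRASHLOG_CLEAR),
	       !!(control & CRASHLOG_EXECUTE),
	       !!(control & CRASHLOG_COMPLETE));
}

int main(void)
{
	/* A completed crashlog with capture enabled: only bit 31 set. */
	decode_control(BIT(31));
	/*
	 * What the old (off-by-one) EXECUTE macro would have set: bit 29,
	 * which in the corrected layout is actually the CLEAR trigger.
	 */
	decode_control(BIT(29));
	return 0;
}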
Fixes: 5ef9998c96b0 ("platform/x86: Intel PMT Crashlog capability driver") Signed-off-by: David E. Box Link: https://lore.kernel.org/r/20210317024455.3071477-2-david.e.box@linux.intel.com Signed-off-by: Hans de Goede Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/platform/x86/intel_pmt_crashlog.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/platform/x86/intel_pmt_crashlog.c b/drivers/platform/x86/intel_pmt_crashlog.c index 97dd749c8290..92d315a16cfd 100644 --- a/drivers/platform/x86/intel_pmt_crashlog.c +++ b/drivers/platform/x86/intel_pmt_crashlog.c @@ -23,18 +23,17 @@ #define CRASH_TYPE_OOBMSM 1 /* Control Flags */ -#define CRASHLOG_FLAG_DISABLE BIT(27) +#define CRASHLOG_FLAG_DISABLE BIT(28) /* - * Bits 28 and 29 control the state of bit 31. + * Bits 29 and 30 control the state of bit 31. * - * Bit 28 will clear bit 31, if set, allowing a new crashlog to be captured. - * Bit 29 will immediately trigger a crashlog to be generated, setting bit 31. - * Bit 30 is read-only and reserved as 0. + * Bit 29 will clear bit 31, if set, allowing a new crashlog to be captured. + * Bit 30 will immediately trigger a crashlog to be generated, setting bit 31. * Bit 31 is the read-only status with a 1 indicating log is complete. */ -#define CRASHLOG_FLAG_TRIGGER_CLEAR BIT(28) -#define CRASHLOG_FLAG_TRIGGER_EXECUTE BIT(29) +#define CRASHLOG_FLAG_TRIGGER_CLEAR BIT(29) +#define CRASHLOG_FLAG_TRIGGER_EXECUTE BIT(30) #define CRASHLOG_FLAG_TRIGGER_COMPLETE BIT(31) #define CRASHLOG_FLAG_TRIGGER_MASK GENMASK(31, 28) -- Gitee From e812798166942d314c388da34c3b518be3e8b6d5 Mon Sep 17 00:00:00 2001 From: zgpeng Date: Thu, 19 May 2022 06:35:02 +0800 Subject: [PATCH 0882/2161] rbtree: Add generic add and find helpers commit 2d24dd5798d0474d9bf705bfca8725e7d20f9d54 upstream. I've always been bothered by the endless (fragile) boilerplate for rbtree, and I recently wrote some rbtree helpers for objtool and figured I should lift them into the kernel and use them more widely. Provide: partial-order; less() based: - rb_add(): add a new entry to the rbtree - rb_add_cached(): like rb_add(), but for a rb_root_cached total-order; cmp() based: - rb_find(): find an entry in an rbtree - rb_find_add(): find an entry, and add if not found - rb_find_first(): find the first (leftmost) matching entry - rb_next_match(): continue from rb_find_first() - rb_for_each(): iterate a sub-tree using the previous two Inlining and constant propagation should see the compiler inline the whole thing, including the various compare functions. Signed-off-by: zgpeng Signed-off-by: Xinghui Li --- include/linux/rbtree.h | 194 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 1fd61a9af45c..2f0a976b2eea 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -158,4 +158,198 @@ static inline void rb_replace_node_cached(struct rb_node *victim, rb_replace_node(victim, new, &root->rb_root); } +/* + * The below helper functions use 2 operators with 3 different + * calling conventions. The operators are related like: + * + * comp(a->key,b) < 0 := less(a,b) + * comp(a->key,b) > 0 := less(b,a) + * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) + * + * If these operators define a partial order on the elements we make no + * guarantee on which of the elements matching the key is found. See + * rb_find(). 
+ * + * The reason for this is to allow the find() interface without requiring an + * on-stack dummy object, which might not be feasible due to object size. + */ + +/** + * rb_add_cached() - insert @node into the leftmost cached tree @tree + * @node: node to insert + * @tree: leftmost cached tree to insert @node into + * @less: operator defining the (partial) node order + * + * Returns @node when it is the new leftmost, or NULL. + */ +static __always_inline struct rb_node * +rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_root.rb_node; + struct rb_node *parent = NULL; + bool leftmost = true; + + while (*link) { + parent = *link; + if (less(node, parent)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = false; + } + } + + rb_link_node(node, parent, link); + rb_insert_color_cached(node, tree, leftmost); + + return leftmost ? node : NULL; +} + +/** + * rb_add() - insert @node into @tree + * @node: node to insert + * @tree: tree to insert @node into + * @less: operator defining the (partial) node order + */ +static __always_inline void +rb_add(struct rb_node *node, struct rb_root *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + + while (*link) { + parent = *link; + if (less(node, parent)) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + + rb_link_node(node, parent, link); + rb_insert_color(node, tree); +} + +/** + * rb_find_add() - find equivalent @node in @tree, or add @node + * @node: node to look-for / insert + * @tree: tree to search / modify + * @cmp: operator defining the node order + * + * Returns the rb_node matching @node, or NULL when no match is found and @node + * is inserted. + */ +static __always_inline struct rb_node * +rb_find_add(struct rb_node *node, struct rb_root *tree, + int (*cmp)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + int c; + + while (*link) { + parent = *link; + c = cmp(node, parent); + + if (c < 0) + link = &parent->rb_left; + else if (c > 0) + link = &parent->rb_right; + else + return parent; + } + + rb_link_node(node, parent, link); + rb_insert_color(node, tree); + return NULL; +} + +/** + * rb_find() - find @key in tree @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining the node order + * + * Returns the rb_node matching @key or NULL. + */ +static __always_inline struct rb_node * +rb_find(const void *key, const struct rb_root *tree, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + + while (node) { + int c = cmp(key, node); + + if (c < 0) + node = node->rb_left; + else if (c > 0) + node = node->rb_right; + else + return node; + } + + return NULL; +} + +/** + * rb_find_first() - find the first @key in @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + * + * Returns the leftmost node matching @key, or NULL. 
+ */ +static __always_inline struct rb_node * +rb_find_first(const void *key, const struct rb_root *tree, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + struct rb_node *match = NULL; + + while (node) { + int c = cmp(key, node); + + if (c <= 0) { + if (!c) + match = node; + node = node->rb_left; + } else if (c > 0) { + node = node->rb_right; + } + } + + return match; +} + +/** + * rb_next_match() - find the next @key in @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + * + * Returns the next node matching @key, or NULL. + */ +static __always_inline struct rb_node * +rb_next_match(const void *key, struct rb_node *node, + int (*cmp)(const void *key, const struct rb_node *)) +{ + node = rb_next(node); + if (node && cmp(key, node)) + node = NULL; + return node; +} + +/** + * rb_for_each() - iterates a subtree matching @key + * @node: iterator + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + */ +#define rb_for_each(node, key, tree, cmp) \ + for ((node) = rb_find_first((key), (tree), (cmp)); \ + (node); (node) = rb_next_match((key), (node), (cmp))) + #endif /* _LINUX_RBTREE_H */ -- Gitee From e8ecc594f9c146dc169cae262540368a00b7801b Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 6 Jan 2022 15:55:11 +0800 Subject: [PATCH 0883/2161] perf/x86/intel/uncore: Parse uncore discovery tables commit edae1f06c2cda41edffc93de6aedc8ba8dc883c3 upstream. A self-describing mechanism for the uncore PerfMon hardware has been introduced with the latest Intel platforms. By reading through an MMIO page worth of information, perf can 'discover' all the standard uncore PerfMon registers in a machine. The discovery mechanism relies on BIOS's support. With a proper BIOS, a PCI device with the unique capability ID 0x23 can be found on each die. Perf can retrieve the information of all available uncore PerfMons from the device via MMIO. The information is composed of one global discovery table and several unit discovery tables. - The global discovery table includes global uncore information of the die, e.g., the address of the global control register, the offset of the global status register, the number of uncore units, the offset of unit discovery tables, etc. - The unit discovery table includes generic uncore unit information, e.g., the access type, the counter width, the address of counters, the address of the counter control, the unit ID, the unit type, etc. The unit is also called "box" in the code. Perf can provide basic uncore support based on this information with the following patches. To locate the PCI device with the discovery tables, check the generic PCI ID first. If it doesn't match, go through the entire PCI device tree and locate the device with the unique capability ID. The uncore information is similar among dies. To save parsing time and space, only completely parse and store the discovery tables on the first die and the first box of each die. The parsed information is stored in an RB tree structure, intel_uncore_discovery_type. The size of the stored discovery tables varies among platforms. It's around 4KB for a Sapphire Rapids server. If a BIOS doesn't support the 'discovery' mechanism, the uncore driver will exit with -ENODEV. There is nothing changed. Add a module parameter to disable the discovery feature. If a BIOS gets the discovery tables wrong, users can have an option to disable the feature. 
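(As an aside, the parsed discovery tables are kept in an rbtree keyed by unit type, using the rb_add()/rb_find() helpers introduced in the previous patch. The sketch below only illustrates that calling convention; struct demo_type and the demo_* functions are invented for illustration and are not the driver's code.)

#include <linux/rbtree.h>
#include <linux/slab.h>

/* Illustrative node keyed by a 16-bit unit type. */
struct demo_type {
	struct rb_node node;
	u16 type;
};

static struct rb_root demo_root = RB_ROOT;

/* Keyed total-order comparator, as required by rb_find(). */
static int demo_cmp(const void *key, const struct rb_node *b)
{
	u16 type = *(const u16 *)key;
	const struct demo_type *t = rb_entry(b, struct demo_type, node);

	return (type < t->type) ? -1 : (type > t->type) ? 1 : 0;
}

/* Partial-order "less" operator, as required by rb_add(). */
static bool demo_less(struct rb_node *a, const struct rb_node *b)
{
	return rb_entry(a, struct demo_type, node)->type <
	       rb_entry(b, struct demo_type, node)->type;
}

static struct demo_type *demo_lookup(u16 type)
{
	struct rb_node *node = rb_find(&type, &demo_root, demo_cmp);

	return node ? rb_entry(node, struct demo_type, node) : NULL;
}

static struct demo_type *demo_insert(u16 type)
{
	struct demo_type *t = kzalloc(sizeof(*t), GFP_KERNEL);

	if (!t)
		return NULL;
	t->type = type;
	rb_add(&t->node, &demo_root, demo_less);
	return t;
}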
For the current patchset, the uncore driver will exit with -ENODEV. In the future, it may fall back to the hardcode uncore driver on a known platform. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1616003977-90612-2-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/Makefile | 2 +- arch/x86/events/intel/uncore.c | 31 ++- arch/x86/events/intel/uncore_discovery.c | 318 +++++++++++++++++++++++ arch/x86/events/intel/uncore_discovery.h | 105 ++++++++ 4 files changed, 448 insertions(+), 8 deletions(-) create mode 100644 arch/x86/events/intel/uncore_discovery.c create mode 100644 arch/x86/events/intel/uncore_discovery.h diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index e67a5886336c..10bde6c5abb2 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -3,6 +3,6 @@ obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o -intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o +intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o uncore_discovery.o obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o intel-cstate-objs := cstate.o diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 7fb1f904ab63..a6163ab3faa2 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -4,7 +4,12 @@ #include #include #include "uncore.h" +#include "uncore_discovery.h" +static bool uncore_no_discover; +module_param(uncore_no_discover, bool, 0); +MODULE_PARM_DESC(uncore_no_discover, "Don't enable the Intel uncore PerfMon discovery mechanism " + "(default: enable the discovery mechanism)."); static struct intel_uncore_type *empty_uncore[] = { NULL, }; struct intel_uncore_type **uncore_msr_uncores = empty_uncore; struct intel_uncore_type **uncore_pci_uncores = empty_uncore; @@ -1482,6 +1487,9 @@ static const struct intel_uncore_init_fun snr_uncore_init __initconst = { .mmio_init = snr_uncore_mmio_init, }; +static const struct intel_uncore_init_fun generic_uncore_init __initconst = { +}; + static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP, nhm_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM, nhm_uncore_init), @@ -1525,16 +1533,20 @@ static int __init intel_uncore_init(void) struct intel_uncore_init_fun *uncore_init; int pret = 0, cret = 0, mret = 0, ret; - id = x86_match_cpu(intel_uncore_match); - if (!id) - return -ENODEV; - if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return -ENODEV; max_dies = topology_max_packages() * topology_max_die_per_package(); - uncore_init = (struct intel_uncore_init_fun *)id->driver_data; + id = x86_match_cpu(intel_uncore_match); + if (!id) { + if (!uncore_no_discover && intel_uncore_has_discovery_tables()) + uncore_init = (struct intel_uncore_init_fun *)&generic_uncore_init; + else + return -ENODEV; + } else + uncore_init = (struct intel_uncore_init_fun *)id->driver_data; + if (uncore_init->pci_init) { pret = uncore_init->pci_init(); if (!pret) @@ -1551,8 +1563,10 @@ static int __init intel_uncore_init(void) mret = uncore_mmio_init(); } - if (cret && pret && mret) - return -ENODEV; + if (cret && pret && mret) { + ret = -ENODEV; + goto free_discovery; + } /* Install hotplug callbacks to setup the targets for each package */ 
ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE, @@ -1567,6 +1581,8 @@ static int __init intel_uncore_init(void) uncore_types_exit(uncore_msr_uncores); uncore_types_exit(uncore_mmio_uncores); uncore_pci_exit(); +free_discovery: + intel_uncore_clear_discovery_tables(); return ret; } module_init(intel_uncore_init); @@ -1577,5 +1593,6 @@ static void __exit intel_uncore_exit(void) uncore_types_exit(uncore_msr_uncores); uncore_types_exit(uncore_mmio_uncores); uncore_pci_exit(); + intel_uncore_clear_discovery_tables(); } module_exit(intel_uncore_exit); diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c new file mode 100644 index 000000000000..7519ce346b57 --- /dev/null +++ b/arch/x86/events/intel/uncore_discovery.c @@ -0,0 +1,318 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Support Intel uncore PerfMon discovery mechanism. + * Copyright(c) 2021 Intel Corporation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "uncore.h" +#include "uncore_discovery.h" + +static struct rb_root discovery_tables = RB_ROOT; +static int num_discovered_types[UNCORE_ACCESS_MAX]; + +static bool has_generic_discovery_table(void) +{ + struct pci_dev *dev; + int dvsec; + + dev = pci_get_device(PCI_VENDOR_ID_INTEL, UNCORE_DISCOVERY_TABLE_DEVICE, NULL); + if (!dev) + return false; + + /* A discovery table device has the unique capability ID. */ + dvsec = pci_find_next_ext_capability(dev, 0, UNCORE_EXT_CAP_ID_DISCOVERY); + pci_dev_put(dev); + if (dvsec) + return true; + + return false; +} + +static int logical_die_id; + +static int get_device_die_id(struct pci_dev *dev) +{ + int cpu, node = pcibus_to_node(dev->bus); + + /* + * If the NUMA info is not available, assume that the logical die id is + * continuous in the order in which the discovery table devices are + * detected. + */ + if (node < 0) + return logical_die_id++; + + for_each_cpu(cpu, cpumask_of_node(node)) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->initialized && cpu_to_node(cpu) == node) + return c->logical_die_id; + } + + /* + * All CPUs of a node may be offlined. For this case, + * the PCI and MMIO type of uncore blocks which are + * enumerated by the device will be unavailable. + */ + return -1; +} + +#define __node_2_type(cur) \ + rb_entry((cur), struct intel_uncore_discovery_type, node) + +static inline int __type_cmp(const void *key, const struct rb_node *b) +{ + struct intel_uncore_discovery_type *type_b = __node_2_type(b); + const u16 *type_id = key; + + if (type_b->type > *type_id) + return -1; + else if (type_b->type < *type_id) + return 1; + + return 0; +} + +static inline struct intel_uncore_discovery_type * +search_uncore_discovery_type(u16 type_id) +{ + struct rb_node *node = rb_find(&type_id, &discovery_tables, __type_cmp); + + return (node) ? 
__node_2_type(node) : NULL; +} + +static inline bool __type_less(struct rb_node *a, const struct rb_node *b) +{ + return (__node_2_type(a)->type < __node_2_type(b)->type); +} + +static struct intel_uncore_discovery_type * +add_uncore_discovery_type(struct uncore_unit_discovery *unit) +{ + struct intel_uncore_discovery_type *type; + + if (unit->access_type >= UNCORE_ACCESS_MAX) { + pr_warn("Unsupported access type %d\n", unit->access_type); + return NULL; + } + + type = kzalloc(sizeof(struct intel_uncore_discovery_type), GFP_KERNEL); + if (!type) + return NULL; + + type->box_ctrl_die = kcalloc(__uncore_max_dies, sizeof(u64), GFP_KERNEL); + if (!type->box_ctrl_die) + goto free_type; + + type->access_type = unit->access_type; + num_discovered_types[type->access_type]++; + type->type = unit->box_type; + + rb_add(&type->node, &discovery_tables, __type_less); + + return type; + +free_type: + kfree(type); + + return NULL; + +} + +static struct intel_uncore_discovery_type * +get_uncore_discovery_type(struct uncore_unit_discovery *unit) +{ + struct intel_uncore_discovery_type *type; + + type = search_uncore_discovery_type(unit->box_type); + if (type) + return type; + + return add_uncore_discovery_type(unit); +} + +static void +uncore_insert_box_info(struct uncore_unit_discovery *unit, + int die, bool parsed) +{ + struct intel_uncore_discovery_type *type; + unsigned int *box_offset, *ids; + int i; + + if (WARN_ON_ONCE(!unit->ctl || !unit->ctl_offset || !unit->ctr_offset)) + return; + + if (parsed) { + type = search_uncore_discovery_type(unit->box_type); + if (WARN_ON_ONCE(!type)) + return; + /* Store the first box of each die */ + if (!type->box_ctrl_die[die]) + type->box_ctrl_die[die] = unit->ctl; + return; + } + + type = get_uncore_discovery_type(unit); + if (!type) + return; + + box_offset = kcalloc(type->num_boxes + 1, sizeof(unsigned int), GFP_KERNEL); + if (!box_offset) + return; + + ids = kcalloc(type->num_boxes + 1, sizeof(unsigned int), GFP_KERNEL); + if (!ids) + goto free_box_offset; + + /* Store generic information for the first box */ + if (!type->num_boxes) { + type->box_ctrl = unit->ctl; + type->box_ctrl_die[die] = unit->ctl; + type->num_counters = unit->num_regs; + type->counter_width = unit->bit_width; + type->ctl_offset = unit->ctl_offset; + type->ctr_offset = unit->ctr_offset; + *ids = unit->box_id; + goto end; + } + + for (i = 0; i < type->num_boxes; i++) { + ids[i] = type->ids[i]; + box_offset[i] = type->box_offset[i]; + + if (WARN_ON_ONCE(unit->box_id == ids[i])) + goto free_ids; + } + ids[i] = unit->box_id; + box_offset[i] = unit->ctl - type->box_ctrl; + kfree(type->ids); + kfree(type->box_offset); +end: + type->ids = ids; + type->box_offset = box_offset; + type->num_boxes++; + return; + +free_ids: + kfree(ids); + +free_box_offset: + kfree(box_offset); + +} + +static int parse_discovery_table(struct pci_dev *dev, int die, + u32 bar_offset, bool *parsed) +{ + struct uncore_global_discovery global; + struct uncore_unit_discovery unit; + void __iomem *io_addr; + resource_size_t addr; + unsigned long size; + u32 val; + int i; + + pci_read_config_dword(dev, bar_offset, &val); + + if (val & UNCORE_DISCOVERY_MASK) + return -EINVAL; + + addr = (resource_size_t)(val & ~UNCORE_DISCOVERY_MASK); + size = UNCORE_DISCOVERY_GLOBAL_MAP_SIZE; + io_addr = ioremap(addr, size); + if (!io_addr) + return -ENOMEM; + + /* Read Global Discovery State */ + memcpy_fromio(&global, io_addr, sizeof(struct uncore_global_discovery)); + if (uncore_discovery_invalid_unit(global)) { + pr_info("Invalid Global 
Discovery State: 0x%llx 0x%llx 0x%llx\n", + global.table1, global.ctl, global.table3); + iounmap(io_addr); + return -EINVAL; + } + iounmap(io_addr); + + size = (1 + global.max_units) * global.stride * 8; + io_addr = ioremap(addr, size); + if (!io_addr) + return -ENOMEM; + + /* Parsing Unit Discovery State */ + for (i = 0; i < global.max_units; i++) { + memcpy_fromio(&unit, io_addr + (i + 1) * (global.stride * 8), + sizeof(struct uncore_unit_discovery)); + + if (uncore_discovery_invalid_unit(unit)) + continue; + + if (unit.access_type >= UNCORE_ACCESS_MAX) + continue; + + uncore_insert_box_info(&unit, die, *parsed); + } + + *parsed = true; + iounmap(io_addr); + return 0; +} + +bool intel_uncore_has_discovery_tables(void) +{ + u32 device, val, entry_id, bar_offset; + int die, dvsec = 0, ret = true; + struct pci_dev *dev = NULL; + bool parsed = false; + + if (has_generic_discovery_table()) + device = UNCORE_DISCOVERY_TABLE_DEVICE; + else + device = PCI_ANY_ID; + + /* + * Start a new search and iterates through the list of + * the discovery table devices. + */ + while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, dev)) != NULL) { + while ((dvsec = pci_find_next_ext_capability(dev, dvsec, UNCORE_EXT_CAP_ID_DISCOVERY))) { + pci_read_config_dword(dev, dvsec + UNCORE_DISCOVERY_DVSEC_OFFSET, &val); + entry_id = val & UNCORE_DISCOVERY_DVSEC_ID_MASK; + if (entry_id != UNCORE_DISCOVERY_DVSEC_ID_PMON) + continue; + + pci_read_config_dword(dev, dvsec + UNCORE_DISCOVERY_DVSEC2_OFFSET, &val); + + if (val & ~UNCORE_DISCOVERY_DVSEC2_BIR_MASK) { + ret = false; + goto err; + } + bar_offset = UNCORE_DISCOVERY_BIR_BASE + + (val & UNCORE_DISCOVERY_DVSEC2_BIR_MASK) * UNCORE_DISCOVERY_BIR_STEP; + + die = get_device_die_id(dev); + if (die < 0) + continue; + + parse_discovery_table(dev, die, bar_offset, &parsed); + } + } + + /* None of the discovery tables are available */ + if (!parsed) + ret = false; +err: + pci_dev_put(dev); + + return ret; +} + +void intel_uncore_clear_discovery_tables(void) +{ + struct intel_uncore_discovery_type *type, *next; + + rbtree_postorder_for_each_entry_safe(type, next, &discovery_tables, node) { + kfree(type->box_ctrl_die); + kfree(type); + } +} diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h new file mode 100644 index 000000000000..95afa393145a --- /dev/null +++ b/arch/x86/events/intel/uncore_discovery.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +/* Generic device ID of a discovery table device */ +#define UNCORE_DISCOVERY_TABLE_DEVICE 0x09a7 +/* Capability ID for a discovery table device */ +#define UNCORE_EXT_CAP_ID_DISCOVERY 0x23 +/* First DVSEC offset */ +#define UNCORE_DISCOVERY_DVSEC_OFFSET 0x8 +/* Mask of the supported discovery entry type */ +#define UNCORE_DISCOVERY_DVSEC_ID_MASK 0xffff +/* PMON discovery entry type ID */ +#define UNCORE_DISCOVERY_DVSEC_ID_PMON 0x1 +/* Second DVSEC offset */ +#define UNCORE_DISCOVERY_DVSEC2_OFFSET 0xc +/* Mask of the discovery table BAR offset */ +#define UNCORE_DISCOVERY_DVSEC2_BIR_MASK 0x7 +/* Discovery table BAR base offset */ +#define UNCORE_DISCOVERY_BIR_BASE 0x10 +/* Discovery table BAR step */ +#define UNCORE_DISCOVERY_BIR_STEP 0x4 +/* Mask of the discovery table offset */ +#define UNCORE_DISCOVERY_MASK 0xf +/* Global discovery table size */ +#define UNCORE_DISCOVERY_GLOBAL_MAP_SIZE 0x20 + +#define uncore_discovery_invalid_unit(unit) \ + (!unit.table1 || !unit.ctl || !unit.table3 || \ + unit.table1 == -1ULL || unit.ctl == -1ULL || \ + unit.table3 == -1ULL) + 
+enum uncore_access_type { + UNCORE_ACCESS_MSR = 0, + UNCORE_ACCESS_MMIO, + UNCORE_ACCESS_PCI, + + UNCORE_ACCESS_MAX, +}; + +struct uncore_global_discovery { + union { + u64 table1; + struct { + u64 type : 8, + stride : 8, + max_units : 10, + __reserved_1 : 36, + access_type : 2; + }; + }; + + u64 ctl; /* Global Control Address */ + + union { + u64 table3; + struct { + u64 status_offset : 8, + num_status : 16, + __reserved_2 : 40; + }; + }; +}; + +struct uncore_unit_discovery { + union { + u64 table1; + struct { + u64 num_regs : 8, + ctl_offset : 8, + bit_width : 8, + ctr_offset : 8, + status_offset : 8, + __reserved_1 : 22, + access_type : 2; + }; + }; + + u64 ctl; /* Unit Control Address */ + + union { + u64 table3; + struct { + u64 box_type : 16, + box_id : 16, + __reserved_2 : 32; + }; + }; +}; + +struct intel_uncore_discovery_type { + struct rb_node node; + enum uncore_access_type access_type; + u64 box_ctrl; /* Unit ctrl addr of the first box */ + u64 *box_ctrl_die; /* Unit ctrl addr of the first box of each die */ + u16 type; /* Type ID of the uncore block */ + u8 num_counters; + u8 counter_width; + u8 ctl_offset; /* Counter Control 0 offset */ + u8 ctr_offset; /* Counter 0 offset */ + u16 num_boxes; /* number of boxes for the uncore block */ + unsigned int *ids; /* Box IDs */ + unsigned int *box_offset; /* Box offset */ +}; + +bool intel_uncore_has_discovery_tables(void); +void intel_uncore_clear_discovery_tables(void); -- Gitee From 71422c73a7c394dcfbf5e45d7795cb67171321ea Mon Sep 17 00:00:00 2001 From: Roman Sudarikov Date: Mon, 1 Jun 2020 11:35:42 +0300 Subject: [PATCH 0884/2161] perf/x86/intel/uncore: Wrap the max dies calculation into an accessor commit 36b533bc5e3ed1039406f3b27e746b4d18f2cac1 upstream. The accessor to return number of dies on the platform. Signed-off-by: Alexander Antonov Signed-off-by: Roman Sudarikov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Reviewed-by: Alexander Shishkin Link: https://lkml.kernel.org/r/20200601083543.30011-3-alexander.antonov@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore.c | 13 +++++++------ arch/x86/events/intel/uncore.h | 3 +++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index a6163ab3faa2..df516aab58cf 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -21,7 +21,7 @@ struct pci_driver *uncore_pci_driver; DEFINE_RAW_SPINLOCK(pci2phy_map_lock); struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); struct pci_extra_dev *uncore_extra_pci_dev; -static int max_dies; +int __uncore_max_dies; /* mask of cpus that collect uncore events */ static cpumask_t uncore_cpu_mask; @@ -113,7 +113,7 @@ struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu * The unsigned check also catches the '-1' return value for non * existent mappings in the topology map. */ - return dieid < max_dies ? pmu->boxes[dieid] : NULL; + return dieid < uncore_max_dies() ? 
pmu->boxes[dieid] : NULL; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -882,7 +882,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu) { int die; - for (die = 0; die < max_dies; die++) + for (die = 0; die < uncore_max_dies(); die++) kfree(pmu->boxes[die]); kfree(pmu->boxes); } @@ -920,7 +920,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) if (!pmus) return -ENOMEM; - size = max_dies * sizeof(struct intel_uncore_box *); + size = uncore_max_dies() * sizeof(struct intel_uncore_box *); for (i = 0; i < type->num_boxes; i++) { pmus[i].func_id = setid ? i : -1; @@ -1117,7 +1117,7 @@ static int __init uncore_pci_init(void) size_t size; int ret; - size = max_dies * sizeof(struct pci_extra_dev); + size = uncore_max_dies() * sizeof(struct pci_extra_dev); uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); if (!uncore_extra_pci_dev) { ret = -ENOMEM; @@ -1536,7 +1536,8 @@ static int __init intel_uncore_init(void) if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return -ENODEV; - max_dies = topology_max_packages() * topology_max_die_per_package(); + __uncore_max_dies = + topology_max_packages() * topology_max_die_per_package(); id = x86_match_cpu(intel_uncore_match); if (!id) { diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 97bb76605f7a..3928481888ff 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -169,6 +169,9 @@ int uncore_pcibus_to_physid(struct pci_bus *bus); ssize_t uncore_event_show(struct device *dev, struct device_attribute *attr, char *buf); +extern int __uncore_max_dies; +#define uncore_max_dies() (__uncore_max_dies) + #define INTEL_UNCORE_EVENT_DESC(_name, _config) \ { \ .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ -- Gitee From 3e7cc4d0c37c47ac7e1a97ad3781171c0aa28a42 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 17 Mar 2021 10:59:34 -0700 Subject: [PATCH 0885/2161] perf/x86/intel/uncore: Generic support for the MSR type of uncore blocks commit d6c754130435ab786711bed75d04a2388a6b4da8 upstream. The discovery table provides the generic uncore block information for the MSR type of uncore blocks, e.g., the counter width, the number of counters, the location of control/counter registers, which is good enough to provide basic uncore support. It can be used as a fallback solution when the kernel doesn't support a platform. The name of the uncore box cannot be retrieved from the discovery table. uncore_type_&typeID_&boxID will be used as its name. Save the type ID and the box ID information in the struct intel_uncore_type. Factor out uncore_get_pmu_name() to handle different naming methods. Implement generic support for the MSR type of uncore block. Some advanced features, such as filters and constraints, cannot be retrieved from discovery tables. Features that rely on that information are not be supported here. 
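One consequence of the naming fallback described above: a block of type 6 with two boxes would be registered as perf PMUs named uncore_type_6_0 and uncore_type_6_1. The sketch below just reproduces that format string in user space; the type and box IDs are made up for illustration.

#include <stdio.h>

int main(void)
{
	char name[32];
	unsigned int type_id = 6;	/* hypothetical uncore block type */
	int box_ids[] = { 0, 1 };	/* hypothetical box IDs on one die */

	for (unsigned int i = 0; i < 2; i++) {
		/* Same format the driver falls back to when no name is known. */
		snprintf(name, sizeof(name), "uncore_type_%u_%d",
			 type_id, box_ids[i]);
		printf("%s\n", name);	/* uncore_type_6_0, uncore_type_6_1 */
	}
	return 0;
}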
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1616003977-90612-3-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore.c | 45 ++++++-- arch/x86/events/intel/uncore.h | 3 + arch/x86/events/intel/uncore_discovery.c | 126 +++++++++++++++++++++++ arch/x86/events/intel/uncore_discovery.h | 18 ++++ 4 files changed, 182 insertions(+), 10 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index df516aab58cf..daf60c277ffb 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -10,7 +10,7 @@ static bool uncore_no_discover; module_param(uncore_no_discover, bool, 0); MODULE_PARM_DESC(uncore_no_discover, "Don't enable the Intel uncore PerfMon discovery mechanism " "(default: enable the discovery mechanism)."); -static struct intel_uncore_type *empty_uncore[] = { NULL, }; +struct intel_uncore_type *empty_uncore[] = { NULL, }; struct intel_uncore_type **uncore_msr_uncores = empty_uncore; struct intel_uncore_type **uncore_pci_uncores = empty_uncore; struct intel_uncore_type **uncore_mmio_uncores = empty_uncore; @@ -830,6 +830,34 @@ static const struct attribute_group uncore_pmu_attr_group = { .attrs = uncore_pmu_attrs, }; +static void uncore_get_pmu_name(struct intel_uncore_pmu *pmu) +{ + struct intel_uncore_type *type = pmu->type; + + /* + * No uncore block name in discovery table. + * Use uncore_type_&typeid_&boxid as name. + */ + if (!type->name) { + if (type->num_boxes == 1) + sprintf(pmu->name, "uncore_type_%u", type->type_id); + else { + sprintf(pmu->name, "uncore_type_%u_%d", + type->type_id, type->box_ids[pmu->pmu_idx]); + } + return; + } + + if (type->num_boxes == 1) { + if (strlen(type->name) > 0) + sprintf(pmu->name, "uncore_%s", type->name); + else + sprintf(pmu->name, "uncore"); + } else + sprintf(pmu->name, "uncore_%s_%d", type->name, pmu->pmu_idx); + +} + static int uncore_pmu_register(struct intel_uncore_pmu *pmu) { int ret; @@ -854,15 +882,7 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) pmu->pmu.attr_groups = pmu->type->attr_groups; } - if (pmu->type->num_boxes == 1) { - if (strlen(pmu->type->name) > 0) - sprintf(pmu->name, "uncore_%s", pmu->type->name); - else - sprintf(pmu->name, "uncore"); - } else { - sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, - pmu->pmu_idx); - } + uncore_get_pmu_name(pmu); ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); if (!ret) @@ -900,6 +920,10 @@ static void uncore_type_exit(struct intel_uncore_type *type) kfree(type->pmus); type->pmus = NULL; } + if (type->box_ids) { + kfree(type->box_ids); + type->box_ids = NULL; + } kfree(type->events_group); type->events_group = NULL; } @@ -1488,6 +1512,7 @@ static const struct intel_uncore_init_fun snr_uncore_init __initconst = { }; static const struct intel_uncore_init_fun generic_uncore_init __initconst = { + .cpu_init = intel_uncore_generic_uncore_cpu_init, }; static const struct x86_cpu_id intel_uncore_match[] __initconst = { diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 3928481888ff..ae33f8754723 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -50,6 +50,7 @@ struct intel_uncore_type { int perf_ctr_bits; int fixed_ctr_bits; int num_freerunning_types; + int type_id; unsigned perf_ctr; unsigned event_ctl; unsigned event_mask; @@ -65,6 +66,7 @@ struct intel_uncore_type { unsigned single_fixed:1; unsigned pair_ctr_ctl:1; unsigned 
*msr_offsets; + unsigned *box_ids; struct event_constraint unconstrainted; struct event_constraint *constraints; struct intel_uncore_pmu *pmus; @@ -514,6 +516,7 @@ uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event); void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event); u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx); +extern struct intel_uncore_type *empty_uncore[]; extern struct intel_uncore_type **uncore_msr_uncores; extern struct intel_uncore_type **uncore_pci_uncores; extern struct intel_uncore_type **uncore_mmio_uncores; diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 7519ce346b57..fefb3e231d7a 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -316,3 +316,129 @@ void intel_uncore_clear_discovery_tables(void) kfree(type); } } + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(thresh, thresh, "config:24-31"); + +static struct attribute *generic_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh.attr, + NULL, +}; + +static const struct attribute_group generic_uncore_format_group = { + .name = "format", + .attrs = generic_uncore_formats_attr, +}; + +static void intel_generic_uncore_msr_init_box(struct intel_uncore_box *box) +{ + wrmsrl(uncore_msr_box_ctl(box), GENERIC_PMON_BOX_CTL_INT); +} + +static void intel_generic_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + wrmsrl(uncore_msr_box_ctl(box), GENERIC_PMON_BOX_CTL_FRZ); +} + +static void intel_generic_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(uncore_msr_box_ctl(box), 0); +} + +static void intel_generic_uncore_msr_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +static void intel_generic_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, 0); +} + +static struct intel_uncore_ops generic_uncore_msr_ops = { + .init_box = intel_generic_uncore_msr_init_box, + .disable_box = intel_generic_uncore_msr_disable_box, + .enable_box = intel_generic_uncore_msr_enable_box, + .disable_event = intel_generic_uncore_msr_disable_event, + .enable_event = intel_generic_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static bool uncore_update_uncore_type(enum uncore_access_type type_id, + struct intel_uncore_type *uncore, + struct intel_uncore_discovery_type *type) +{ + uncore->type_id = type->type; + uncore->num_boxes = type->num_boxes; + uncore->num_counters = type->num_counters; + uncore->perf_ctr_bits = type->counter_width; + uncore->box_ids = type->ids; + + switch (type_id) { + case UNCORE_ACCESS_MSR: + uncore->ops = &generic_uncore_msr_ops; + uncore->perf_ctr = (unsigned int)type->box_ctrl + type->ctr_offset; + uncore->event_ctl = (unsigned int)type->box_ctrl + type->ctl_offset; + uncore->box_ctl = (unsigned int)type->box_ctrl; + uncore->msr_offsets = type->box_offset; + break; + default: + return false; + } + + return true; +} + +static struct intel_uncore_type ** +intel_uncore_generic_init_uncores(enum 
uncore_access_type type_id) +{ + struct intel_uncore_discovery_type *type; + struct intel_uncore_type **uncores; + struct intel_uncore_type *uncore; + struct rb_node *node; + int i = 0; + + uncores = kcalloc(num_discovered_types[type_id] + 1, + sizeof(struct intel_uncore_type *), GFP_KERNEL); + if (!uncores) + return empty_uncore; + + for (node = rb_first(&discovery_tables); node; node = rb_next(node)) { + type = rb_entry(node, struct intel_uncore_discovery_type, node); + if (type->access_type != type_id) + continue; + + uncore = kzalloc(sizeof(struct intel_uncore_type), GFP_KERNEL); + if (!uncore) + break; + + uncore->event_mask = GENERIC_PMON_RAW_EVENT_MASK; + uncore->format_group = &generic_uncore_format_group; + + if (!uncore_update_uncore_type(type_id, uncore, type)) { + kfree(uncore); + continue; + } + uncores[i++] = uncore; + } + + return uncores; +} + +void intel_uncore_generic_uncore_cpu_init(void) +{ + uncore_msr_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MSR); +} diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index 95afa393145a..87078ba932fc 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -28,6 +28,23 @@ unit.table1 == -1ULL || unit.ctl == -1ULL || \ unit.table3 == -1ULL) +#define GENERIC_PMON_CTL_EV_SEL_MASK 0x000000ff +#define GENERIC_PMON_CTL_UMASK_MASK 0x0000ff00 +#define GENERIC_PMON_CTL_EDGE_DET (1 << 18) +#define GENERIC_PMON_CTL_INVERT (1 << 23) +#define GENERIC_PMON_CTL_TRESH_MASK 0xff000000 +#define GENERIC_PMON_RAW_EVENT_MASK (GENERIC_PMON_CTL_EV_SEL_MASK | \ + GENERIC_PMON_CTL_UMASK_MASK | \ + GENERIC_PMON_CTL_EDGE_DET | \ + GENERIC_PMON_CTL_INVERT | \ + GENERIC_PMON_CTL_TRESH_MASK) + +#define GENERIC_PMON_BOX_CTL_FRZ (1 << 0) +#define GENERIC_PMON_BOX_CTL_RST_CTRL (1 << 8) +#define GENERIC_PMON_BOX_CTL_RST_CTRS (1 << 9) +#define GENERIC_PMON_BOX_CTL_INT (GENERIC_PMON_BOX_CTL_RST_CTRL | \ + GENERIC_PMON_BOX_CTL_RST_CTRS) + enum uncore_access_type { UNCORE_ACCESS_MSR = 0, UNCORE_ACCESS_MMIO, @@ -103,3 +120,4 @@ struct intel_uncore_discovery_type { bool intel_uncore_has_discovery_tables(void); void intel_uncore_clear_discovery_tables(void); +void intel_uncore_generic_uncore_cpu_init(void); -- Gitee From 652058902fa8f73f58303768c634008de5064257 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 14 Sep 2020 07:34:15 -0700 Subject: [PATCH 0886/2161] perf/x86/intel/uncore: Factor out uncore_pci_get_dev_die_info() commit c8872d90e0a3651a096860d3241625ccfa1647e0 upstream. The socket and die information is required to register/unregister a PMU in the uncore PCI sub driver. The codes, which get the socket and die information from a BUS number, can be shared. Factor out uncore_pci_get_dev_die_info(), which will be used later. There is no functional change. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1600094060-82746-2-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index daf60c277ffb..05a2adca645c 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1006,6 +1006,26 @@ uncore_types_init(struct intel_uncore_type **types, bool setid) return 0; } +/* + * Get the die information of a PCI device. + * @pdev: The PCI device. 
+ * @phys_id: The physical socket id which the device maps to. + * @die: The die id which the device maps to. + */ +static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, + int *phys_id, int *die) +{ + *phys_id = uncore_pcibus_to_physid(pdev->bus); + if (*phys_id < 0) + return -ENODEV; + + *die = (topology_max_die_per_package() > 1) ? *phys_id : + topology_phys_to_logical_pkg(*phys_id); + if (*die < 0) + return -EINVAL; + + return 0; +} /* * add a pci uncore device */ @@ -1016,14 +1036,9 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id struct intel_uncore_box *box; int phys_id, die, ret; - phys_id = uncore_pcibus_to_physid(pdev->bus); - if (phys_id < 0) - return -ENODEV; - - die = (topology_max_die_per_package() > 1) ? phys_id : - topology_phys_to_logical_pkg(phys_id); - if (die < 0) - return -EINVAL; + ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die); + if (ret) + return ret; if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { int idx = UNCORE_PCI_DEV_IDX(id->driver_data); -- Gitee From 4c536fb820532b6ecd4a7329c103c8ff186c28f0 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 14 Sep 2020 07:34:16 -0700 Subject: [PATCH 0887/2161] perf/x86/intel/uncore: Factor out uncore_pci_find_dev_pmu() commit c8872d90e0a3651a096860d3241625ccfa1647e0 upstream. When an uncore PCI sub driver gets a remove notification, the corresponding PMU has to be retrieved and unregistered. The codes, which find the corresponding PMU by comparing the pci_device_id table, can be shared. Factor out uncore_pci_find_dev_pmu(), which will be used later. There is no functional change. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1600094060-82746-3-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore.c | 48 +++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 05a2adca645c..cc816c4fb596 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1026,6 +1026,37 @@ static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, return 0; } + +/* + * Find the PMU of a PCI device. + * @pdev: The PCI device. + * @ids: The ID table of the available PCI devices with a PMU. 
+ */ +static struct intel_uncore_pmu * +uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids) +{ + struct intel_uncore_pmu *pmu = NULL; + struct intel_uncore_type *type; + kernel_ulong_t data; + unsigned int devfn; + + while (ids && ids->vendor) { + if ((ids->vendor == pdev->vendor) && + (ids->device == pdev->device)) { + data = ids->driver_data; + devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(data), + UNCORE_PCI_DEV_FUNC(data)); + if (devfn == pdev->devfn) { + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(data)]; + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(data)]; + break; + } + } + ids++; + } + return pmu; +} + /* * add a pci uncore device */ @@ -1057,21 +1088,8 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id */ if (id->driver_data & ~0xffff) { struct pci_driver *pci_drv = pdev->driver; - const struct pci_device_id *ids = pci_drv->id_table; - unsigned int devfn; - - while (ids && ids->vendor) { - if ((ids->vendor == pdev->vendor) && - (ids->device == pdev->device)) { - devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data), - UNCORE_PCI_DEV_FUNC(ids->driver_data)); - if (devfn == pdev->devfn) { - pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)]; - break; - } - } - ids++; - } + + pmu = uncore_pci_find_dev_pmu(pdev, pci_drv->id_table); if (pmu == NULL) return -ENODEV; } else { -- Gitee From 39df4dfb4dbc6aeb0dab64b7cbf077a899714888 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 14 Sep 2020 07:34:19 -0700 Subject: [PATCH 0888/2161] perf/x86/intel/uncore: Generic support for the PCI sub driver commit 95a7fc77443328ac8b68378df8e137a044ece5e8 upstream. Some uncore counters may be located in the configuration space of a PCI device, which already has a bonded driver. Currently, the uncore driver cannot register a PCI uncore PMU for these counters, because, to register a PCI uncore PMU, the uncore driver must be bond to the device. However, one device can only have one bonded driver. Add an uncore PCI sub driver to support such kind of devices. The sub driver doesn't own the device. In initialization, the sub driver searches the device via pci_get_device(), and register the corresponding PMU for the device. In the meantime, the sub driver registers a PCI bus notifier, which is used to notify the sub driver once the device is removed. The sub driver can unregister the PMU accordingly. The sub driver only searches the devices defined in its id table. The id table varies on different platforms, which will be implemented in the following platform-specific patch. Suggested-by: Bjorn Helgaas Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1600094060-82746-6-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore.c | 81 ++++++++++++++++++++++++++++++++++ arch/x86/events/intel/uncore.h | 1 + 2 files changed, 82 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index cc816c4fb596..e1e9e09a1614 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -17,6 +17,8 @@ struct intel_uncore_type **uncore_mmio_uncores = empty_uncore; static bool pcidrv_registered; struct pci_driver *uncore_pci_driver; +/* The PCI driver for the device which the uncore doesn't own. 
*/ +struct pci_driver *uncore_pci_sub_driver; /* pci bus to socket mapping */ DEFINE_RAW_SPINLOCK(pci2phy_map_lock); struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); @@ -1169,6 +1171,80 @@ static void uncore_pci_remove(struct pci_dev *pdev) kfree(box); } +static int uncore_bus_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + struct pci_dev *pdev = to_pci_dev(dev); + struct intel_uncore_pmu *pmu; + int phys_id, die; + + /* Unregister the PMU when the device is going to be deleted. */ + if (action != BUS_NOTIFY_DEL_DEVICE) + return NOTIFY_DONE; + + pmu = uncore_pci_find_dev_pmu(pdev, uncore_pci_sub_driver->id_table); + if (!pmu) + return NOTIFY_DONE; + + if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die)) + return NOTIFY_DONE; + + uncore_pci_pmu_unregister(pmu, phys_id, die); + + return NOTIFY_OK; +} + +static struct notifier_block uncore_notifier = { + .notifier_call = uncore_bus_notify, +}; + +static void uncore_pci_sub_driver_init(void) +{ + const struct pci_device_id *ids = uncore_pci_sub_driver->id_table; + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct pci_dev *pci_sub_dev; + bool notify = false; + unsigned int devfn; + int phys_id, die; + + while (ids && ids->vendor) { + pci_sub_dev = NULL; + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(ids->driver_data)]; + /* + * Search the available device, and register the + * corresponding PMU. + */ + while ((pci_sub_dev = pci_get_device(PCI_VENDOR_ID_INTEL, + ids->device, pci_sub_dev))) { + devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data), + UNCORE_PCI_DEV_FUNC(ids->driver_data)); + if (devfn != pci_sub_dev->devfn) + continue; + + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)]; + if (!pmu) + continue; + + if (uncore_pci_get_dev_die_info(pci_sub_dev, + &phys_id, &die)) + continue; + + if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu, + phys_id, die)) + notify = true; + } + ids++; + } + + if (notify && bus_register_notifier(&pci_bus_type, &uncore_notifier)) + notify = false; + + if (!notify) + uncore_pci_sub_driver = NULL; +} + static int __init uncore_pci_init(void) { size_t size; @@ -1192,6 +1268,9 @@ static int __init uncore_pci_init(void) if (ret) goto errtype; + if (uncore_pci_sub_driver) + uncore_pci_sub_driver_init(); + pcidrv_registered = true; return 0; @@ -1209,6 +1288,8 @@ static void uncore_pci_exit(void) { if (pcidrv_registered) { pcidrv_registered = false; + if (uncore_pci_sub_driver) + bus_unregister_notifier(&pci_bus_type, &uncore_notifier); pci_unregister_driver(uncore_pci_driver); uncore_types_exit(uncore_pci_uncores); kfree(uncore_extra_pci_dev); diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index ae33f8754723..40aa586bdb6b 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -521,6 +521,7 @@ extern struct intel_uncore_type **uncore_msr_uncores; extern struct intel_uncore_type **uncore_pci_uncores; extern struct intel_uncore_type **uncore_mmio_uncores; extern struct pci_driver *uncore_pci_driver; +extern struct pci_driver *uncore_pci_sub_driver; extern raw_spinlock_t pci2phy_map_lock; extern struct list_head pci2phy_map_head; extern struct pci_extra_dev *uncore_extra_pci_dev; -- Gitee From 7b7a5c36377a083160f3cc5be209aab7dada7dfc Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 14 Sep 2020 07:34:17 -0700 Subject: [PATCH 0889/2161] perf/x86/intel/uncore: Factor out uncore_pci_pmu_register() commit c8872d90e0a3651a096860d3241625ccfa1647e0 
upstream. The PMU registration in the uncore PCI sub driver is similar as the normal PMU registration for a PCI device. The codes to register a PCI PMU can be shared. Factor out uncore_pci_pmu_register(), which will be used later. The pci_set_drvdata() is not included in uncore_pci_pmu_register(). The uncore PCI sub driver doesn't own the PCI device. It will not touch the private driver data pointer for the device. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1600094060-82746-4-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore.c | 82 +++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 31 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index e1e9e09a1614..e5d915d81fa6 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1059,6 +1059,55 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids) return pmu; } +/* + * Register the PMU for a PCI device + * @pdev: The PCI device. + * @type: The corresponding PMU type of the device. + * @pmu: The corresponding PMU of the device. + * @phys_id: The physical socket id which the device maps to. + * @die: The die id which the device maps to. + */ +static int uncore_pci_pmu_register(struct pci_dev *pdev, + struct intel_uncore_type *type, + struct intel_uncore_pmu *pmu, + int phys_id, int die) +{ + struct intel_uncore_box *box; + int ret; + + if (WARN_ON_ONCE(pmu->boxes[die] != NULL)) + return -EINVAL; + + box = uncore_alloc_box(type, NUMA_NO_NODE); + if (!box) + return -ENOMEM; + + if (pmu->func_id < 0) + pmu->func_id = pdev->devfn; + else + WARN_ON_ONCE(pmu->func_id != pdev->devfn); + + atomic_inc(&box->refcnt); + box->pci_phys_id = phys_id; + box->dieid = die; + box->pci_dev = pdev; + box->pmu = pmu; + uncore_box_init(box); + + pmu->boxes[die] = box; + if (atomic_inc_return(&pmu->activeboxes) > 1) + return 0; + + /* First active box registers the pmu */ + ret = uncore_pmu_register(pmu); + if (ret) { + pmu->boxes[die] = NULL; + uncore_box_exit(box); + kfree(box); + } + return ret; +} + /* * add a pci uncore device */ @@ -1066,7 +1115,6 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id { struct intel_uncore_type *type; struct intel_uncore_pmu *pmu = NULL; - struct intel_uncore_box *box; int phys_id, die, ret; ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die); @@ -1102,38 +1150,10 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; } - if (WARN_ON_ONCE(pmu->boxes[die] != NULL)) - return -EINVAL; - - box = uncore_alloc_box(type, NUMA_NO_NODE); - if (!box) - return -ENOMEM; - - if (pmu->func_id < 0) - pmu->func_id = pdev->devfn; - else - WARN_ON_ONCE(pmu->func_id != pdev->devfn); - - atomic_inc(&box->refcnt); - box->pci_phys_id = phys_id; - box->dieid = die; - box->pci_dev = pdev; - box->pmu = pmu; - uncore_box_init(box); - pci_set_drvdata(pdev, box); + ret = uncore_pci_pmu_register(pdev, type, pmu, phys_id, die); - pmu->boxes[die] = box; - if (atomic_inc_return(&pmu->activeboxes) > 1) - return 0; + pci_set_drvdata(pdev, pmu->boxes[die]); - /* First active box registers the pmu */ - ret = uncore_pmu_register(pmu); - if (ret) { - pci_set_drvdata(pdev, NULL); - pmu->boxes[die] = NULL; - uncore_box_exit(box); - kfree(box); - } return ret; } -- Gitee From adcb65f64a7047b5f543fe3782457106496e685d Mon 
Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 14 Sep 2020 07:34:18 -0700 Subject: [PATCH 0890/2161] perf/x86/intel/uncore: Factor out uncore_pci_pmu_unregister() commit c8872d90e0a3651a096860d3241625ccfa1647e0 upstream. The PMU unregistration in the uncore PCI sub driver is similar as the normal PMU unregistration for a PCI device. The codes to unregister a PCI PMU can be shared. Factor out uncore_pci_pmu_unregister(), which will be used later. Use uncore_pci_get_dev_die_info() to replace the codes which retrieve the socket and die information. The pci_set_drvdata() is not included in uncore_pci_pmu_unregister() as well, because the uncore PCI sub driver will not touch the private driver data pointer of the device. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1600094060-82746-5-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore.c | 35 ++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index e5d915d81fa6..ba62fd6379ad 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1157,18 +1157,38 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id return ret; } +/* + * Unregister the PMU of a PCI device + * @pmu: The corresponding PMU is unregistered. + * @phys_id: The physical socket id which the device maps to. + * @die: The die id which the device maps to. + */ +static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, + int phys_id, int die) +{ + struct intel_uncore_box *box = pmu->boxes[die]; + + if (WARN_ON_ONCE(phys_id != box->pci_phys_id)) + return; + + pmu->boxes[die] = NULL; + if (atomic_dec_return(&pmu->activeboxes) == 0) + uncore_pmu_unregister(pmu); + uncore_box_exit(box); + kfree(box); +} + static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box; struct intel_uncore_pmu *pmu; int i, phys_id, die; - phys_id = uncore_pcibus_to_physid(pdev->bus); + if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die)) + return; box = pci_get_drvdata(pdev); if (!box) { - die = (topology_max_die_per_package() > 1) ? phys_id : - topology_phys_to_logical_pkg(phys_id); for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { if (uncore_extra_pci_dev[die].dev[i] == pdev) { uncore_extra_pci_dev[die].dev[i] = NULL; @@ -1180,15 +1200,10 @@ static void uncore_pci_remove(struct pci_dev *pdev) } pmu = box->pmu; - if (WARN_ON_ONCE(phys_id != box->pci_phys_id)) - return; pci_set_drvdata(pdev, NULL); - pmu->boxes[box->dieid] = NULL; - if (atomic_dec_return(&pmu->activeboxes) == 0) - uncore_pmu_unregister(pmu); - uncore_box_exit(box); - kfree(box); + + uncore_pci_pmu_unregister(pmu, phys_id, die); } static int uncore_bus_notify(struct notifier_block *nb, -- Gitee From 15db160ff89a9fd2bfc151daedd3094d05f3f4c4 Mon Sep 17 00:00:00 2001 From: Steve Wahl Date: Fri, 8 Jan 2021 09:35:48 -0600 Subject: [PATCH 0891/2161] perf/x86/intel/uncore: Store the logical die id instead of the physical die id. commit ba9506be4e402ee597b8f41204008b97989b5eef upstream. The phys_id isn't really used other than to map to a logical die id. Calculate the logical die id earlier, and store that instead of the phys_id.
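For the common one-die-per-package case, the effect of the change can be seen in a small detached example (the topology below is hypothetical, not taken from the patch): physical package ids may be sparse, while logical die ids are dense, which is what lets them index per-die arrays such as pmu->boxes[] directly.

#include <stdio.h>

#define NR_DIES 2

/* stand-in for topology_phys_to_logical_pkg(): only physical ids 0 and 3 exist */
static int phys_to_logical(int phys_id)
{
	static const int map[4] = { [0] = 0, [3] = 1 };

	return (phys_id == 0 || phys_id == 3) ? map[phys_id] : -1;
}

int main(void)
{
	const char *boxes[NR_DIES] = { NULL, NULL };
	int phys_ids[] = { 0, 3 };	/* sparse physical package ids */
	int i;

	for (i = 0; i < 2; i++) {
		int die = phys_to_logical(phys_ids[i]);	/* converted once, when the map is built */

		if (die < 0)
			continue;
		boxes[die] = "box registered";		/* later code indexes by die only */
		printf("phys_id %d -> die %d: %s\n", phys_ids[i], die, boxes[die]);
	}
	return 0;
}

Storing the logical id in pbus_to_dieid when the map is built means the probe/remove and lookup paths below no longer repeat the physical-to-logical conversion on every access.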
Signed-off-by: Steve Wahl Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lkml.kernel.org/r/20210108153549.108989-2-steve.wahl@hpe.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore.c | 58 ++++++++++------------------ arch/x86/events/intel/uncore.h | 5 +-- arch/x86/events/intel/uncore_snb.c | 2 +- arch/x86/events/intel/uncore_snbep.c | 31 +++++++-------- 4 files changed, 39 insertions(+), 57 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index ba62fd6379ad..e761abf59fe7 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -36,21 +36,21 @@ struct event_constraint uncore_constraint_empty = MODULE_LICENSE("GPL"); -int uncore_pcibus_to_physid(struct pci_bus *bus) +int uncore_pcibus_to_dieid(struct pci_bus *bus) { struct pci2phy_map *map; - int phys_id = -1; + int die_id = -1; raw_spin_lock(&pci2phy_map_lock); list_for_each_entry(map, &pci2phy_map_head, list) { if (map->segment == pci_domain_nr(bus)) { - phys_id = map->pbus_to_physid[bus->number]; + die_id = map->pbus_to_dieid[bus->number]; break; } } raw_spin_unlock(&pci2phy_map_lock); - return phys_id; + return die_id; } static void uncore_free_pcibus_map(void) @@ -91,7 +91,7 @@ struct pci2phy_map *__find_pci2phy_map(int segment) alloc = NULL; map->segment = segment; for (i = 0; i < 256; i++) - map->pbus_to_physid[i] = -1; + map->pbus_to_dieid[i] = -1; list_add_tail(&map->list, &pci2phy_map_head); end: @@ -334,7 +334,6 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, uncore_pmu_init_hrtimer(box); box->cpu = -1; - box->pci_phys_id = -1; box->dieid = -1; /* set default hrtimer timeout */ @@ -1011,18 +1010,11 @@ uncore_types_init(struct intel_uncore_type **types, bool setid) /* * Get the die information of a PCI device. * @pdev: The PCI device. - * @phys_id: The physical socket id which the device maps to. * @die: The die id which the device maps to. */ -static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, - int *phys_id, int *die) +static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die) { - *phys_id = uncore_pcibus_to_physid(pdev->bus); - if (*phys_id < 0) - return -ENODEV; - - *die = (topology_max_die_per_package() > 1) ? *phys_id : - topology_phys_to_logical_pkg(*phys_id); + *die = uncore_pcibus_to_dieid(pdev->bus); if (*die < 0) return -EINVAL; @@ -1064,13 +1056,12 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids) * @pdev: The PCI device. * @type: The corresponding PMU type of the device. * @pmu: The corresponding PMU of the device. - * @phys_id: The physical socket id which the device maps to. * @die: The die id which the device maps to. 
*/ static int uncore_pci_pmu_register(struct pci_dev *pdev, struct intel_uncore_type *type, struct intel_uncore_pmu *pmu, - int phys_id, int die) + int die) { struct intel_uncore_box *box; int ret; @@ -1088,7 +1079,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev, WARN_ON_ONCE(pmu->func_id != pdev->devfn); atomic_inc(&box->refcnt); - box->pci_phys_id = phys_id; box->dieid = die; box->pci_dev = pdev; box->pmu = pmu; @@ -1115,9 +1105,9 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id { struct intel_uncore_type *type; struct intel_uncore_pmu *pmu = NULL; - int phys_id, die, ret; + int die, ret; - ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die); + ret = uncore_pci_get_dev_die_info(pdev, &die); if (ret) return ret; @@ -1150,7 +1140,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; } - ret = uncore_pci_pmu_register(pdev, type, pmu, phys_id, die); + ret = uncore_pci_pmu_register(pdev, type, pmu, die); pci_set_drvdata(pdev, pmu->boxes[die]); @@ -1160,17 +1150,12 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id /* * Unregister the PMU of a PCI device * @pmu: The corresponding PMU is unregistered. - * @phys_id: The physical socket id which the device maps to. * @die: The die id which the device maps to. */ -static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, - int phys_id, int die) +static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, int die) { struct intel_uncore_box *box = pmu->boxes[die]; - if (WARN_ON_ONCE(phys_id != box->pci_phys_id)) - return; - pmu->boxes[die] = NULL; if (atomic_dec_return(&pmu->activeboxes) == 0) uncore_pmu_unregister(pmu); @@ -1182,9 +1167,9 @@ static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box; struct intel_uncore_pmu *pmu; - int i, phys_id, die; + int i, die; - if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die)) + if (uncore_pci_get_dev_die_info(pdev, &die)) return; box = pci_get_drvdata(pdev); @@ -1203,7 +1188,7 @@ static void uncore_pci_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); - uncore_pci_pmu_unregister(pmu, phys_id, die); + uncore_pci_pmu_unregister(pmu, die); } static int uncore_bus_notify(struct notifier_block *nb, @@ -1212,7 +1197,7 @@ static int uncore_bus_notify(struct notifier_block *nb, struct device *dev = data; struct pci_dev *pdev = to_pci_dev(dev); struct intel_uncore_pmu *pmu; - int phys_id, die; + int die; /* Unregister the PMU when the device is going to be deleted. 
*/ if (action != BUS_NOTIFY_DEL_DEVICE) @@ -1222,10 +1207,10 @@ static int uncore_bus_notify(struct notifier_block *nb, if (!pmu) return NOTIFY_DONE; - if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die)) + if (uncore_pci_get_dev_die_info(pdev, &die)) return NOTIFY_DONE; - uncore_pci_pmu_unregister(pmu, phys_id, die); + uncore_pci_pmu_unregister(pmu, die); return NOTIFY_OK; } @@ -1242,7 +1227,7 @@ static void uncore_pci_sub_driver_init(void) struct pci_dev *pci_sub_dev; bool notify = false; unsigned int devfn; - int phys_id, die; + int die; while (ids && ids->vendor) { pci_sub_dev = NULL; @@ -1262,12 +1247,11 @@ static void uncore_pci_sub_driver_init(void) if (!pmu) continue; - if (uncore_pci_get_dev_die_info(pci_sub_dev, - &phys_id, &die)) + if (uncore_pci_get_dev_die_info(pci_sub_dev, &die)) continue; if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu, - phys_id, die)) + die)) notify = true; } ids++; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 40aa586bdb6b..0998130b3bfc 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -113,7 +113,6 @@ struct intel_uncore_extra_reg { }; struct intel_uncore_box { - int pci_phys_id; int dieid; /* Logical die ID */ int n_active; /* number of active events */ int n_events; @@ -162,11 +161,11 @@ struct freerunning_counters { struct pci2phy_map { struct list_head list; int segment; - int pbus_to_physid[256]; + int pbus_to_dieid[256]; }; struct pci2phy_map *__find_pci2phy_map(int segment); -int uncore_pcibus_to_physid(struct pci_bus *bus); +int uncore_pcibus_to_dieid(struct pci_bus *bus); ssize_t uncore_event_show(struct device *dev, struct device_attribute *attr, char *buf); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index aec6e63c6a04..c7adcfa2c923 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -567,7 +567,7 @@ int snb_pci2phy_map_init(int devid) pci_dev_put(dev); return -ENOMEM; } - map->pbus_to_physid[bus] = 0; + map->pbus_to_dieid[bus] = 0; raw_spin_unlock(&pci2phy_map_lock); pci_dev_put(dev); diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 0aa40798252b..47d0caf6bb85 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1329,7 +1329,7 @@ static struct pci_driver snbep_uncore_pci_driver = { static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse) { struct pci_dev *ubox_dev = NULL; - int i, bus, nodeid, segment; + int i, bus, nodeid, segment, die_id; struct pci2phy_map *map; int err = 0; u32 config = 0; @@ -1365,7 +1365,11 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool */ for (i = 0; i < 8; i++) { if (nodeid == ((config >> (3 * i)) & 0x7)) { - map->pbus_to_physid[bus] = i; + if (topology_max_die_per_package() > 1) + die_id = i; + else + die_id = topology_phys_to_logical_pkg(i); + map->pbus_to_dieid[bus] = die_id; break; } } @@ -1382,17 +1386,17 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool i = -1; if (reverse) { for (bus = 255; bus >= 0; bus--) { - if (map->pbus_to_physid[bus] >= 0) - i = map->pbus_to_physid[bus]; + if (map->pbus_to_dieid[bus] >= 0) + i = map->pbus_to_dieid[bus]; else - map->pbus_to_physid[bus] = i; + map->pbus_to_dieid[bus] = i; } } else { for (bus = 0; bus <= 255; bus++) { - if (map->pbus_to_physid[bus] >= 0) - i = map->pbus_to_physid[bus]; + if (map->pbus_to_dieid[bus] >= 0) + i = 
map->pbus_to_dieid[bus]; else - map->pbus_to_physid[bus] = i; + map->pbus_to_dieid[bus] = i; } } } @@ -4390,19 +4394,14 @@ int snr_uncore_pci_init(void) static struct pci_dev *snr_uncore_get_mc_dev(int id) { struct pci_dev *mc_dev = NULL; - int phys_id, pkg; + int pkg; while (1) { mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3451, mc_dev); if (!mc_dev) break; - phys_id = uncore_pcibus_to_physid(mc_dev->bus); - if (phys_id < 0) - continue; - pkg = topology_phys_to_logical_pkg(phys_id); - if (pkg < 0) - continue; - else if (pkg == id) + pkg = uncore_pcibus_to_dieid(mc_dev->bus); + if (pkg == id) break; } return mc_dev; -- Gitee From 7b7637d9bc084f183054bbb1d0a5f2258fd84acf Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 17 Mar 2021 10:59:35 -0700 Subject: [PATCH 0892/2161] perf/x86/intel/uncore: Rename uncore_notifier to uncore_pci_sub_notifier commit 6477dc3934775f82a571fac469fd8c348e611095 upstream. Perf will use a similar method to the PCI sub driver to register the PMUs for the PCI type of uncore blocks. The method requires a BUS notifier to support hotplug. The current BUS notifier cannot be reused, because it searches a const id_table for the corresponding registered PMU. The PCI type of uncore blocks in the discovery tables doesn't provide an id_table. Factor out uncore_bus_notify() and add the pointer of an id_table as a parameter. The uncore_bus_notify() will be reused in the following patch. The current BUS notifier is only used by the PCI sub driver. Its name is too generic. Rename it to uncore_pci_sub_notifier, which is specific for the PCI sub driver. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1616003977-90612-4-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index e761abf59fe7..b231cbd8e01c 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1192,7 +1192,8 @@ static void uncore_pci_remove(struct pci_dev *pdev) } static int uncore_bus_notify(struct notifier_block *nb, - unsigned long action, void *data) + unsigned long action, void *data, + const struct pci_device_id *ids) { struct device *dev = data; struct pci_dev *pdev = to_pci_dev(dev); @@ -1203,7 +1204,7 @@ static int uncore_bus_notify(struct notifier_block *nb, if (action != BUS_NOTIFY_DEL_DEVICE) return NOTIFY_DONE; - pmu = uncore_pci_find_dev_pmu(pdev, uncore_pci_sub_driver->id_table); + pmu = uncore_pci_find_dev_pmu(pdev, ids); if (!pmu) return NOTIFY_DONE; @@ -1215,8 +1216,15 @@ static int uncore_bus_notify(struct notifier_block *nb, return NOTIFY_OK; } -static struct notifier_block uncore_notifier = { - .notifier_call = uncore_bus_notify, +static int uncore_pci_sub_bus_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + return uncore_bus_notify(nb, action, data, + uncore_pci_sub_driver->id_table); +} + +static struct notifier_block uncore_pci_sub_notifier = { + .notifier_call = uncore_pci_sub_bus_notify, }; static void uncore_pci_sub_driver_init(void) @@ -1257,7 +1265,7 @@ static void uncore_pci_sub_driver_init(void) ids++; } - if (notify && bus_register_notifier(&pci_bus_type, &uncore_notifier)) + if (notify && bus_register_notifier(&pci_bus_type, &uncore_pci_sub_notifier)) notify = false; if (!notify) @@ -1308,7 +1316,7 @@ static void uncore_pci_exit(void) if 
(pcidrv_registered) { pcidrv_registered = false; if (uncore_pci_sub_driver) - bus_unregister_notifier(&pci_bus_type, &uncore_notifier); + bus_unregister_notifier(&pci_bus_type, &uncore_pci_sub_notifier); pci_unregister_driver(uncore_pci_driver); uncore_types_exit(uncore_pci_uncores); kfree(uncore_extra_pci_dev); -- Gitee From 1f7ed08fabdb998e7f9a603d38522724de5328fb Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 17 Mar 2021 10:59:36 -0700 Subject: [PATCH 0893/2161] perf/x86/intel/uncore: Generic support for the PCI type of uncore blocks commit 95a7fc77443328ac8b68378df8e137a044ece5e8 upstream. The discovery table provides the generic uncore block information for the PCI type of uncore blocks, which is good enough to provide basic uncore support. The PCI BUS and DEVFN information can be retrieved from the box control field. Introduce the uncore_pci_pmus_register() to register all the PCICFG type of uncore blocks. The old PCI probe/remove way is dropped. The PCI BUS and DEVFN information are different among dies. Add box_ctls to store the box control field of each die. Add a new BUS notifier for the PCI type of uncore block to support the hotplug. If the device is "hot remove", the corresponding registered PMU has to be unregistered. Perf cannot locate the PMU by searching a const pci_device_id table, because the discovery tables don't provide such information. Introduce uncore_pci_find_dev_pmu_from_types() to search the whole uncore_pci_uncores for the PMU. Implement generic support for the PCI type of uncore block. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1616003977-90612-5-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore.c | 91 ++++++++++++++++++++++-- arch/x86/events/intel/uncore.h | 6 +- arch/x86/events/intel/uncore_discovery.c | 80 +++++++++++++++++++++ arch/x86/events/intel/uncore_discovery.h | 7 ++ 4 files changed, 177 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index b231cbd8e01c..f3136c7ecef7 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1021,10 +1021,37 @@ static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die) return 0; } +static struct intel_uncore_pmu * +uncore_pci_find_dev_pmu_from_types(struct pci_dev *pdev) +{ + struct intel_uncore_type **types = uncore_pci_uncores; + struct intel_uncore_type *type; + u64 box_ctl; + int i, die; + + for (; *types; types++) { + type = *types; + for (die = 0; die < __uncore_max_dies; die++) { + for (i = 0; i < type->num_boxes; i++) { + if (!type->box_ctls[die]) + continue; + box_ctl = type->box_ctls[die] + type->pci_offsets[i]; + if (pdev->devfn == UNCORE_DISCOVERY_PCI_DEVFN(box_ctl) && + pdev->bus->number == UNCORE_DISCOVERY_PCI_BUS(box_ctl) && + pci_domain_nr(pdev->bus) == UNCORE_DISCOVERY_PCI_DOMAIN(box_ctl)) + return &type->pmus[i]; + } + } + } + + return NULL; +} + /* * Find the PMU of a PCI device. * @pdev: The PCI device. * @ids: The ID table of the available PCI devices with a PMU. + * If NULL, search the whole uncore_pci_uncores. 
*/ static struct intel_uncore_pmu * uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids) @@ -1034,6 +1061,9 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids) kernel_ulong_t data; unsigned int devfn; + if (!ids) + return uncore_pci_find_dev_pmu_from_types(pdev); + while (ids && ids->vendor) { if ((ids->vendor == pdev->vendor) && (ids->device == pdev->device)) { @@ -1272,6 +1302,48 @@ static void uncore_pci_sub_driver_init(void) uncore_pci_sub_driver = NULL; } +static int uncore_pci_bus_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + return uncore_bus_notify(nb, action, data, NULL); +} + +static struct notifier_block uncore_pci_notifier = { + .notifier_call = uncore_pci_bus_notify, +}; + + +static void uncore_pci_pmus_register(void) +{ + struct intel_uncore_type **types = uncore_pci_uncores; + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct pci_dev *pdev; + u64 box_ctl; + int i, die; + + for (; *types; types++) { + type = *types; + for (die = 0; die < __uncore_max_dies; die++) { + for (i = 0; i < type->num_boxes; i++) { + if (!type->box_ctls[die]) + continue; + box_ctl = type->box_ctls[die] + type->pci_offsets[i]; + pdev = pci_get_domain_bus_and_slot(UNCORE_DISCOVERY_PCI_DOMAIN(box_ctl), + UNCORE_DISCOVERY_PCI_BUS(box_ctl), + UNCORE_DISCOVERY_PCI_DEVFN(box_ctl)); + if (!pdev) + continue; + pmu = &type->pmus[i]; + + uncore_pci_pmu_register(pdev, type, pmu, die); + } + } + } + + bus_register_notifier(&pci_bus_type, &uncore_pci_notifier); +} + static int __init uncore_pci_init(void) { size_t size; @@ -1288,12 +1360,15 @@ static int __init uncore_pci_init(void) if (ret) goto errtype; - uncore_pci_driver->probe = uncore_pci_probe; - uncore_pci_driver->remove = uncore_pci_remove; + if (uncore_pci_driver) { + uncore_pci_driver->probe = uncore_pci_probe; + uncore_pci_driver->remove = uncore_pci_remove; - ret = pci_register_driver(uncore_pci_driver); - if (ret) - goto errtype; + ret = pci_register_driver(uncore_pci_driver); + if (ret) + goto errtype; + } else + uncore_pci_pmus_register(); if (uncore_pci_sub_driver) uncore_pci_sub_driver_init(); @@ -1317,7 +1392,10 @@ static void uncore_pci_exit(void) pcidrv_registered = false; if (uncore_pci_sub_driver) bus_unregister_notifier(&pci_bus_type, &uncore_pci_sub_notifier); - pci_unregister_driver(uncore_pci_driver); + if (uncore_pci_driver) + pci_unregister_driver(uncore_pci_driver); + else + bus_unregister_notifier(&pci_bus_type, &uncore_pci_notifier); uncore_types_exit(uncore_pci_uncores); kfree(uncore_extra_pci_dev); uncore_free_pcibus_map(); @@ -1654,6 +1732,7 @@ static const struct intel_uncore_init_fun snr_uncore_init __initconst = { static const struct intel_uncore_init_fun generic_uncore_init __initconst = { .cpu_init = intel_uncore_generic_uncore_cpu_init, + .pci_init = intel_uncore_generic_uncore_pci_init, }; static const struct x86_cpu_id intel_uncore_match[] __initconst = { diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 0998130b3bfc..90f667dc5d89 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -58,6 +58,7 @@ struct intel_uncore_type { unsigned fixed_ctr; unsigned fixed_ctl; unsigned box_ctl; + u64 *box_ctls; /* Unit ctrl addr of the first box of each die */ union { unsigned msr_offset; unsigned mmio_offset; @@ -65,7 +66,10 @@ struct intel_uncore_type { unsigned num_shared_regs:8; unsigned single_fixed:1; unsigned pair_ctr_ctl:1; - unsigned *msr_offsets; + union { + 
unsigned *msr_offsets; + unsigned *pci_offsets; + }; unsigned *box_ids; struct event_constraint unconstrainted; struct event_constraint *constraints; diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index fefb3e231d7a..784d7b45fada 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -377,6 +377,71 @@ static struct intel_uncore_ops generic_uncore_msr_ops = { .read_counter = uncore_msr_read_counter, }; +static void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags); + pci_write_config_dword(pdev, box_ctl, GENERIC_PMON_BOX_CTL_INT); +} + +static void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + pci_write_config_dword(pdev, box_ctl, GENERIC_PMON_BOX_CTL_FRZ); +} + +static void intel_generic_uncore_pci_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + pci_write_config_dword(pdev, box_ctl, 0); +} + +static void intel_generic_uncore_pci_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config); +} + +static void intel_generic_uncore_pci_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, 0); +} + +static u64 intel_generic_uncore_pci_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); + pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); + + return count; +} + +static struct intel_uncore_ops generic_uncore_pci_ops = { + .init_box = intel_generic_uncore_pci_init_box, + .disable_box = intel_generic_uncore_pci_disable_box, + .enable_box = intel_generic_uncore_pci_enable_box, + .disable_event = intel_generic_uncore_pci_disable_event, + .enable_event = intel_generic_uncore_pci_enable_event, + .read_counter = intel_generic_uncore_pci_read_counter, +}; + static bool uncore_update_uncore_type(enum uncore_access_type type_id, struct intel_uncore_type *uncore, struct intel_uncore_discovery_type *type) @@ -395,6 +460,14 @@ static bool uncore_update_uncore_type(enum uncore_access_type type_id, uncore->box_ctl = (unsigned int)type->box_ctrl; uncore->msr_offsets = type->box_offset; break; + case UNCORE_ACCESS_PCI: + uncore->ops = &generic_uncore_pci_ops; + uncore->perf_ctr = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl) + type->ctr_offset; + uncore->event_ctl = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl) + type->ctl_offset; + uncore->box_ctl = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl); + uncore->box_ctls = type->box_ctrl_die; + uncore->pci_offsets = type->box_offset; + break; default: return false; } @@ -442,3 +515,10 @@ void intel_uncore_generic_uncore_cpu_init(void) { uncore_msr_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MSR); } + +int intel_uncore_generic_uncore_pci_init(void) +{ + uncore_pci_uncores = 
intel_uncore_generic_init_uncores(UNCORE_ACCESS_PCI); + + return 0; +} diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index 87078ba932fc..1639ff7baed8 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -23,6 +23,12 @@ /* Global discovery table size */ #define UNCORE_DISCOVERY_GLOBAL_MAP_SIZE 0x20 +#define UNCORE_DISCOVERY_PCI_DOMAIN(data) ((data >> 28) & 0x7) +#define UNCORE_DISCOVERY_PCI_BUS(data) ((data >> 20) & 0xff) +#define UNCORE_DISCOVERY_PCI_DEVFN(data) ((data >> 12) & 0xff) +#define UNCORE_DISCOVERY_PCI_BOX_CTRL(data) (data & 0xfff) + + #define uncore_discovery_invalid_unit(unit) \ (!unit.table1 || !unit.ctl || !unit.table3 || \ unit.table1 == -1ULL || unit.ctl == -1ULL || \ @@ -121,3 +127,4 @@ struct intel_uncore_discovery_type { bool intel_uncore_has_discovery_tables(void); void intel_uncore_clear_discovery_tables(void); void intel_uncore_generic_uncore_cpu_init(void); +int intel_uncore_generic_uncore_pci_init(void); -- Gitee From 43c8a723078096d48b62af1474e4daf39324dbfe Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 28 May 2020 08:19:28 -0700 Subject: [PATCH 0894/2161] perf/x86/intel/uncore: Record the size of mapped area commit 1b94d31de422399421422af0e63c9685e7485901 upstream. Perf cannot validate an address before the actual access to MMIO space of some uncore units, e.g. IMC on TGL. Accessing an invalid address, which exceeds mapped area, can trigger oops. Perf never records the size of mapped area. Generic functions, e.g. uncore_mmio_read_counter(), cannot get the correct size for address validation. Add mmio_map_size in intel_uncore_type to record the size of mapped area. Print warning message if ioremap fails. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1590679169-61823-2-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore.h | 1 + arch/x86/events/intel/uncore_snb.c | 7 ++++++- arch/x86/events/intel/uncore_snbep.c | 11 +++++++++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 90f667dc5d89..81b4c0be1a97 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -63,6 +63,7 @@ struct intel_uncore_type { unsigned msr_offset; unsigned mmio_offset; }; + unsigned mmio_map_size; unsigned num_shared_regs:8; unsigned single_fixed:1; unsigned pair_ctr_ctl:1; diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index c7adcfa2c923..c57d5b0d169c 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -431,6 +431,7 @@ static const struct attribute_group snb_uncore_imc_format_group = { static void snb_uncore_imc_init_box(struct intel_uncore_box *box) { + struct intel_uncore_type *type = box->pmu->type; struct pci_dev *pdev = box->pci_dev; int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; resource_size_t addr; @@ -446,7 +447,10 @@ static void snb_uncore_imc_init_box(struct intel_uncore_box *box) addr &= ~(PAGE_SIZE - 1); - box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); + box->io_addr = ioremap(addr, type->mmio_map_size); + if (!box->io_addr) + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); + box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; } @@ -602,6 +606,7 @@ static struct intel_uncore_type snb_uncore_imc = { .num_counters = 2, .num_boxes = 1, 
.num_freerunning_types = SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNB_UNCORE_PCI_IMC_MAP_SIZE, .freerunning = snb_uncore_imc_freerunning, .event_descs = snb_uncore_imc_events, .format_group = &snb_uncore_imc_format_group, diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 47d0caf6bb85..7bb0e6f9f32c 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -4411,6 +4411,7 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, unsigned int box_ctl, int mem_offset) { struct pci_dev *pdev = snr_uncore_get_mc_dev(box->dieid); + struct intel_uncore_type *type = box->pmu->type; resource_size_t addr; u32 pci_dword; @@ -4425,9 +4426,11 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, addr += box_ctl; - box->io_addr = ioremap(addr, SNR_IMC_MMIO_SIZE); - if (!box->io_addr) + box->io_addr = ioremap(addr, type->mmio_map_size); + if (!box->io_addr) { + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); return; + } writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr); } @@ -4520,6 +4523,7 @@ static struct intel_uncore_type snr_uncore_imc = { .event_mask = SNBEP_PMON_RAW_EVENT_MASK, .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL, .mmio_offset = SNR_IMC_MMIO_OFFSET, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .ops = &snr_uncore_mmio_ops, .format_group = &skx_uncore_format_group, }; @@ -4560,6 +4564,7 @@ static struct intel_uncore_type snr_uncore_imc_free_running = { .num_counters = 3, .num_boxes = 1, .num_freerunning_types = SNR_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .freerunning = snr_imc_freerunning, .ops = &snr_uncore_imc_freerunning_ops, .event_descs = snr_uncore_imc_freerunning_events, @@ -4977,6 +4982,7 @@ static struct intel_uncore_type icx_uncore_imc = { .event_mask = SNBEP_PMON_RAW_EVENT_MASK, .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL, .mmio_offset = SNR_IMC_MMIO_OFFSET, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .ops = &icx_uncore_mmio_ops, .format_group = &skx_uncore_format_group, }; @@ -5034,6 +5040,7 @@ static struct intel_uncore_type icx_uncore_imc_free_running = { .num_counters = 5, .num_boxes = 4, .num_freerunning_types = ICX_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .freerunning = icx_imc_freerunning, .ops = &icx_uncore_imc_freerunning_ops, .event_descs = icx_uncore_imc_freerunning_events, -- Gitee From adfde156b6ccb96009281cc6a317d48120b73c4c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 17 Mar 2021 10:59:37 -0700 Subject: [PATCH 0895/2161] perf/x86/intel/uncore: Generic support for the MMIO type of uncore blocks commit c4c55e362a521d763356b9e02bc9a4348c71a471 upstream. The discovery table provides the generic uncore block information for the MMIO type of uncore blocks, which is good enough to provide basic uncore support. The box control field is composed of the BAR address and box control offset. When initializing the uncore blocks, perf should ioremap the address from the box control field. Implement the generic support for the MMIO type of uncore block. 
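As a rough, detached illustration of that composition (the addresses are invented; the real logic is in generic_uncore_mmio_box_ctl() and intel_generic_uncore_mmio_init_box() in the hunks that follow), the per-box MMIO control address is the per-die box control base plus a per-box offset, and it is that address which gets ioremap'd and written with GENERIC_PMON_BOX_CTL_INT:

#include <stdio.h>

int main(void)
{
	/* hypothetical per-die box control bases taken from the discovery table */
	unsigned long long box_ctls[2] = { 0xfe000000ULL, 0xfe800000ULL };
	/* hypothetical per-box offsets within that BAR region */
	unsigned int mmio_offsets[2] = { 0x0, 0x800 };
	int die, box;

	for (die = 0; die < 2; die++) {
		for (box = 0; box < 2; box++) {
			unsigned long long addr = box_ctls[die] + mmio_offsets[box];

			/* in the driver this would be followed by
			 *   io = ioremap(addr, UNCORE_GENERIC_MMIO_SIZE);
			 *   writel(GENERIC_PMON_BOX_CTL_INT, io);
			 */
			printf("die %d box %d: control registers at 0x%llx\n", die, box, addr);
		}
	}
	return 0;
}

Because the discovery table does not describe how large the register block is, the generic MMIO type maps a fixed UNCORE_GENERIC_MMIO_SIZE window per box and records it in the mmio_map_size field introduced by the previous patch.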
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1616003977-90612-6-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore.c | 1 + arch/x86/events/intel/uncore.h | 1 + arch/x86/events/intel/uncore_discovery.c | 98 ++++++++++++++++++++++++ arch/x86/events/intel/uncore_discovery.h | 1 + 4 files changed, 101 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index f3136c7ecef7..9337724ea6f4 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1733,6 +1733,7 @@ static const struct intel_uncore_init_fun snr_uncore_init __initconst = { static const struct intel_uncore_init_fun generic_uncore_init __initconst = { .cpu_init = intel_uncore_generic_uncore_cpu_init, .pci_init = intel_uncore_generic_uncore_pci_init, + .mmio_init = intel_uncore_generic_uncore_mmio_init, }; static const struct x86_cpu_id intel_uncore_match[] __initconst = { diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 81b4c0be1a97..3dfc0599fa5f 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -70,6 +70,7 @@ struct intel_uncore_type { union { unsigned *msr_offsets; unsigned *pci_offsets; + unsigned *mmio_offsets; }; unsigned *box_ids; struct event_constraint unconstrainted; diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 784d7b45fada..aba9bff95413 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -442,6 +442,90 @@ static struct intel_uncore_ops generic_uncore_pci_ops = { .read_counter = intel_generic_uncore_pci_read_counter, }; +#define UNCORE_GENERIC_MMIO_SIZE 0x4000 + +static unsigned int generic_uncore_mmio_box_ctl(struct intel_uncore_box *box) +{ + struct intel_uncore_type *type = box->pmu->type; + + if (!type->box_ctls || !type->box_ctls[box->dieid] || !type->mmio_offsets) + return 0; + + return type->box_ctls[box->dieid] + type->mmio_offsets[box->pmu->pmu_idx]; +} + +static void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box) +{ + unsigned int box_ctl = generic_uncore_mmio_box_ctl(box); + struct intel_uncore_type *type = box->pmu->type; + resource_size_t addr; + + if (!box_ctl) { + pr_warn("Uncore type %d box %d: Invalid box control address.\n", + type->type_id, type->box_ids[box->pmu->pmu_idx]); + return; + } + + addr = box_ctl; + box->io_addr = ioremap(addr, UNCORE_GENERIC_MMIO_SIZE); + if (!box->io_addr) { + pr_warn("Uncore type %d box %d: ioremap error for 0x%llx.\n", + type->type_id, type->box_ids[box->pmu->pmu_idx], + (unsigned long long)addr); + return; + } + + writel(GENERIC_PMON_BOX_CTL_INT, box->io_addr); +} + +static void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box) +{ + if (!box->io_addr) + return; + + writel(GENERIC_PMON_BOX_CTL_FRZ, box->io_addr); +} + +static void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box) +{ + if (!box->io_addr) + return; + + writel(0, box->io_addr); +} + +static void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!box->io_addr) + return; + + writel(hwc->config, box->io_addr + hwc->config_base); +} + +static void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if 
(!box->io_addr) + return; + + writel(0, box->io_addr + hwc->config_base); +} + +static struct intel_uncore_ops generic_uncore_mmio_ops = { + .init_box = intel_generic_uncore_mmio_init_box, + .exit_box = uncore_mmio_exit_box, + .disable_box = intel_generic_uncore_mmio_disable_box, + .enable_box = intel_generic_uncore_mmio_enable_box, + .disable_event = intel_generic_uncore_mmio_disable_event, + .enable_event = intel_generic_uncore_mmio_enable_event, + .read_counter = uncore_mmio_read_counter, +}; + static bool uncore_update_uncore_type(enum uncore_access_type type_id, struct intel_uncore_type *uncore, struct intel_uncore_discovery_type *type) @@ -468,6 +552,15 @@ static bool uncore_update_uncore_type(enum uncore_access_type type_id, uncore->box_ctls = type->box_ctrl_die; uncore->pci_offsets = type->box_offset; break; + case UNCORE_ACCESS_MMIO: + uncore->ops = &generic_uncore_mmio_ops; + uncore->perf_ctr = (unsigned int)type->ctr_offset; + uncore->event_ctl = (unsigned int)type->ctl_offset; + uncore->box_ctl = (unsigned int)type->box_ctrl; + uncore->box_ctls = type->box_ctrl_die; + uncore->mmio_offsets = type->box_offset; + uncore->mmio_map_size = UNCORE_GENERIC_MMIO_SIZE; + break; default: return false; } @@ -522,3 +615,8 @@ int intel_uncore_generic_uncore_pci_init(void) return 0; } + +void intel_uncore_generic_uncore_mmio_init(void) +{ + uncore_mmio_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MMIO); +} diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index 1639ff7baed8..1d652939a01c 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -128,3 +128,4 @@ bool intel_uncore_has_discovery_tables(void); void intel_uncore_clear_discovery_tables(void); void intel_uncore_generic_uncore_cpu_init(void); int intel_uncore_generic_uncore_pci_init(void); +void intel_uncore_generic_uncore_mmio_init(void); -- Gitee From 382a937a31247e21fae45988f48d68f8daa7707b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 23 Mar 2020 17:26:01 -0700 Subject: [PATCH 0896/2161] PCI/DPC: Move DPC data into struct pci_dev commit be06c1b42eea749547d2f0248dc0a7c1153f67b9 upstream. We only need 25 bits of data for DPC, so I don't think it's worth the complexity of allocating and keeping track of the struct dpc_dev separately from the pci_dev. Move that data into the struct pci_dev. 
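As a quick sanity check of the "25 bits" figure, here is a stand-alone C sketch with the same three fields this patch adds to struct pci_dev: 16 bits for the capability offset, 1 bit for the RP Extensions flag and 8 bits for the RP PIO log size, i.e. 25 bits of payload (the sizeof printed is of course padding- and compiler-dependent).

  #include <stdio.h>
  #include <stdint.h>

  /* Mirrors the DPC state added to struct pci_dev in the hunk below. */
  struct dpc_state {
          uint16_t     dpc_cap;              /* DPC capability offset   (16 bits) */
          unsigned int dpc_rp_extensions:1;  /* RP Extensions supported ( 1 bit ) */
          uint8_t      dpc_rp_log_size;      /* RP PIO log size         ( 8 bits) */
  };

  int main(void)
  {
          printf("payload bits: %d\n", 16 + 1 + 8);          /* 25 */
          printf("sizeof(struct dpc_state) = %zu bytes\n",   /* padding-dependent */
                 sizeof(struct dpc_state));
          return 0;
  }
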
Link: https://lore.kernel.org/r/98323eaa18080adbe5bb30846862f09f8722d4b3.1585000084.git.sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pcie/dpc.c | 103 +++++++++++++---------------------------- include/linux/pci.h | 5 ++ 2 files changed, 36 insertions(+), 72 deletions(-) diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index e06f42f58d3d..6b116d7fdb89 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -17,13 +17,6 @@ #include "portdrv.h" #include "../pci.h" -struct dpc_dev { - struct pcie_device *dev; - u16 cap_pos; - bool rp_extensions; - u8 rp_log_size; -}; - static const char * const rp_pio_error_string[] = { "Configuration Request received UR Completion", /* Bit Position 0 */ "Configuration Request received CA Completion", /* Bit Position 1 */ @@ -46,63 +39,42 @@ static const char * const rp_pio_error_string[] = { "Memory Request Completion Timeout", /* Bit Position 18 */ }; -static struct dpc_dev *to_dpc_dev(struct pci_dev *dev) -{ - struct device *device; - - device = pcie_port_find_device(dev, PCIE_PORT_SERVICE_DPC); - if (!device) - return NULL; - return get_service_data(to_pcie_device(device)); -} - void pci_save_dpc_state(struct pci_dev *dev) { - struct dpc_dev *dpc; struct pci_cap_saved_state *save_state; u16 *cap; if (!pci_is_pcie(dev)) return; - dpc = to_dpc_dev(dev); - if (!dpc) - return; - save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_DPC); if (!save_state) return; cap = (u16 *)&save_state->cap.data[0]; - pci_read_config_word(dev, dpc->cap_pos + PCI_EXP_DPC_CTL, cap); + pci_read_config_word(dev, dev->dpc_cap + PCI_EXP_DPC_CTL, cap); } void pci_restore_dpc_state(struct pci_dev *dev) { - struct dpc_dev *dpc; struct pci_cap_saved_state *save_state; u16 *cap; if (!pci_is_pcie(dev)) return; - dpc = to_dpc_dev(dev); - if (!dpc) - return; - save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_DPC); if (!save_state) return; cap = (u16 *)&save_state->cap.data[0]; - pci_write_config_word(dev, dpc->cap_pos + PCI_EXP_DPC_CTL, *cap); + pci_write_config_word(dev, dev->dpc_cap + PCI_EXP_DPC_CTL, *cap); } -static int dpc_wait_rp_inactive(struct dpc_dev *dpc) +static int dpc_wait_rp_inactive(struct pci_dev *pdev) { unsigned long timeout = jiffies + HZ; - struct pci_dev *pdev = dpc->dev->port; - u16 cap = dpc->cap_pos, status; + u16 cap = pdev->dpc_cap, status; pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status); while (status & PCI_EXP_DPC_RP_BUSY && @@ -119,15 +91,13 @@ static int dpc_wait_rp_inactive(struct dpc_dev *dpc) static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev) { - struct dpc_dev *dpc; u16 cap; /* * DPC disables the Link automatically in hardware, so it has * already been reset by the time we get here. 
*/ - dpc = to_dpc_dev(pdev); - cap = dpc->cap_pos; + cap = pdev->dpc_cap; /* * Wait until the Link is inactive, then clear DPC Trigger Status @@ -135,7 +105,7 @@ static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev) */ pcie_wait_for_link(pdev, false); - if (dpc->rp_extensions && dpc_wait_rp_inactive(dpc)) + if (pdev->dpc_rp_extensions && dpc_wait_rp_inactive(pdev)) return PCI_ERS_RESULT_DISCONNECT; pci_write_config_word(pdev, cap + PCI_EXP_DPC_STATUS, @@ -147,10 +117,9 @@ static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev) return PCI_ERS_RESULT_RECOVERED; } -static void dpc_process_rp_pio_error(struct dpc_dev *dpc) +static void dpc_process_rp_pio_error(struct pci_dev *pdev) { - struct pci_dev *pdev = dpc->dev->port; - u16 cap = dpc->cap_pos, dpc_status, first_error; + u16 cap = pdev->dpc_cap, dpc_status, first_error; u32 status, mask, sev, syserr, exc, dw0, dw1, dw2, dw3, log, prefix; int i; @@ -175,7 +144,7 @@ static void dpc_process_rp_pio_error(struct dpc_dev *dpc) first_error == i ? " (First)" : ""); } - if (dpc->rp_log_size < 4) + if (pdev->dpc_rp_log_size < 4) goto clear_status; pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_HEADER_LOG, &dw0); @@ -188,12 +157,12 @@ static void dpc_process_rp_pio_error(struct dpc_dev *dpc) pci_err(pdev, "TLP Header: %#010x %#010x %#010x %#010x\n", dw0, dw1, dw2, dw3); - if (dpc->rp_log_size < 5) + if (pdev->dpc_rp_log_size < 5) goto clear_status; pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_IMPSPEC_LOG, &log); pci_err(pdev, "RP PIO ImpSpec Log %#010x\n", log); - for (i = 0; i < dpc->rp_log_size - 5; i++) { + for (i = 0; i < pdev->dpc_rp_log_size - 5; i++) { pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_TLPPREFIX_LOG, &prefix); pci_err(pdev, "TLP Prefix Header: dw%d, %#010x\n", i, prefix); @@ -226,10 +195,9 @@ static int dpc_get_aer_uncorrect_severity(struct pci_dev *dev, static irqreturn_t dpc_handler(int irq, void *context) { + struct pci_dev *pdev = context; + u16 cap = pdev->dpc_cap, status, source, reason, ext_reason; struct aer_err_info info; - struct dpc_dev *dpc = context; - struct pci_dev *pdev = dpc->dev->port; - u16 cap = dpc->cap_pos, status, source, reason, ext_reason; pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status); pci_read_config_word(pdev, cap + PCI_EXP_DPC_SOURCE_ID, &source); @@ -248,8 +216,8 @@ static irqreturn_t dpc_handler(int irq, void *context) "reserved error"); /* show RP PIO error detail information */ - if (dpc->rp_extensions && reason == 3 && ext_reason == 0) - dpc_process_rp_pio_error(dpc); + if (pdev->dpc_rp_extensions && reason == 3 && ext_reason == 0) + dpc_process_rp_pio_error(pdev); else if (reason == 0 && dpc_get_aer_uncorrect_severity(pdev, &info) && aer_get_device_error_info(pdev, &info)) { @@ -266,9 +234,8 @@ static irqreturn_t dpc_handler(int irq, void *context) static irqreturn_t dpc_irq(int irq, void *context) { - struct dpc_dev *dpc = (struct dpc_dev *)context; - struct pci_dev *pdev = dpc->dev->port; - u16 cap = dpc->cap_pos, status; + struct pci_dev *pdev = context; + u16 cap = pdev->dpc_cap, status; pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status); @@ -285,7 +252,6 @@ static irqreturn_t dpc_irq(int irq, void *context) #define FLAG(x, y) (((x) & (y)) ? 
'+' : '-') static int dpc_probe(struct pcie_device *dev) { - struct dpc_dev *dpc; struct pci_dev *pdev = dev->port; struct device *device = &dev->device; int status; @@ -294,43 +260,37 @@ static int dpc_probe(struct pcie_device *dev) if (pcie_aer_get_firmware_first(pdev) && !pcie_ports_dpc_native) return -ENOTSUPP; - dpc = devm_kzalloc(device, sizeof(*dpc), GFP_KERNEL); - if (!dpc) - return -ENOMEM; - - dpc->cap_pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DPC); - dpc->dev = dev; - set_service_data(dev, dpc); + pdev->dpc_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DPC); status = devm_request_threaded_irq(device, dev->irq, dpc_irq, dpc_handler, IRQF_SHARED, - "pcie-dpc", dpc); + "pcie-dpc", pdev); if (status) { pci_warn(pdev, "request IRQ%d failed: %d\n", dev->irq, status); return status; } - pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CAP, &cap); - pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, &ctl); + pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CAP, &cap); + pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, &ctl); - dpc->rp_extensions = (cap & PCI_EXP_DPC_CAP_RP_EXT); - if (dpc->rp_extensions) { - dpc->rp_log_size = (cap & PCI_EXP_DPC_RP_PIO_LOG_SIZE) >> 8; - if (dpc->rp_log_size < 4 || dpc->rp_log_size > 9) { + pdev->dpc_rp_extensions = (cap & PCI_EXP_DPC_CAP_RP_EXT) ? 1 : 0; + if (pdev->dpc_rp_extensions) { + pdev->dpc_rp_log_size = (cap & PCI_EXP_DPC_RP_PIO_LOG_SIZE) >> 8; + if (pdev->dpc_rp_log_size < 4 || pdev->dpc_rp_log_size > 9) { pci_err(pdev, "RP PIO log size %u is invalid\n", - dpc->rp_log_size); - dpc->rp_log_size = 0; + pdev->dpc_rp_log_size); + pdev->dpc_rp_log_size = 0; } } ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN; - pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl); + pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl); pci_info(pdev, "error containment capabilities: Int Msg #%d, RPExt%c PoisonedTLP%c SwTrigger%c RP PIO Log %d, DL_ActiveErr%c\n", cap & PCI_EXP_DPC_IRQ, FLAG(cap, PCI_EXP_DPC_CAP_RP_EXT), FLAG(cap, PCI_EXP_DPC_CAP_POISONED_TLP), - FLAG(cap, PCI_EXP_DPC_CAP_SW_TRIGGER), dpc->rp_log_size, + FLAG(cap, PCI_EXP_DPC_CAP_SW_TRIGGER), pdev->dpc_rp_log_size, FLAG(cap, PCI_EXP_DPC_CAP_DL_ACTIVE)); pci_add_ext_cap_save_buffer(pdev, PCI_EXT_CAP_ID_DPC, sizeof(u16)); @@ -339,13 +299,12 @@ static int dpc_probe(struct pcie_device *dev) static void dpc_remove(struct pcie_device *dev) { - struct dpc_dev *dpc = get_service_data(dev); struct pci_dev *pdev = dev->port; u16 ctl; - pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, &ctl); + pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, &ctl); ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN); - pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl); + pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl); } static struct pcie_port_service_driver dpcdriver = { diff --git a/include/linux/pci.h b/include/linux/pci.h index 136dcfac14cd..34fb420796be 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -453,6 +453,11 @@ struct pci_dev { const struct attribute_group **msi_irq_groups; #endif struct pci_vpd *vpd; +#ifdef CONFIG_PCIE_DPC + u16 dpc_cap; + unsigned int dpc_rp_extensions:1; + u8 dpc_rp_log_size; +#endif #ifdef CONFIG_PCI_ATS union { struct pci_sriov *sriov; /* PF: SR-IOV info */ -- Gitee From 6c5ed4dd03d8686a1c087b55f5be54fafbfa27c6 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Mon, 23 Mar 2020 17:26:02 -0700 Subject: [PATCH 
0897/2161] PCI/ERR: Remove service dependency in pcie_do_recovery() commit b6cf1a42f916af0b056079c37fc5fa7bf8e4b2e2 upstream. Previously we passed the PCIe service type parameter to pcie_do_recovery(), where reset_link() looked up the underlying pci_port_service_driver and its .reset_link() function pointer. Instead of using this roundabout way, we can just pass the driver-specific .reset_link() callback function when calling pcie_do_recovery() function. This allows us to call pcie_do_recovery() from code that is not a PCIe port service driver, e.g., Error Disconnect Recover (EDR) support. Remove pcie_port_find_service() and pcie_port_service_driver.reset_link since they are now unused. Link: https://lore.kernel.org/r/60e02b87b526cdf2930400059d98704bf0a147d1.1585000084.git.sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/PCI/pcieaer-howto.rst | 19 +++------- drivers/pci/pci.h | 2 +- drivers/pci/pcie/aer.c | 12 +++---- drivers/pci/pcie/dpc.c | 3 +- drivers/pci/pcie/err.c | 54 +++++------------------------ drivers/pci/pcie/portdrv.h | 5 --- drivers/pci/pcie/portdrv_core.c | 21 ----------- 7 files changed, 19 insertions(+), 97 deletions(-) diff --git a/Documentation/PCI/pcieaer-howto.rst b/Documentation/PCI/pcieaer-howto.rst index 18bdefaafd1a..afbd8c1c321d 100644 --- a/Documentation/PCI/pcieaer-howto.rst +++ b/Documentation/PCI/pcieaer-howto.rst @@ -156,12 +156,6 @@ default reset_link function, but different upstream ports might have different specifications to reset pci express link, so all upstream ports should provide their own reset_link functions. -In struct pcie_port_service_driver, a new pointer, reset_link, is -added. -:: - - pci_ers_result_t (*reset_link) (struct pci_dev *dev); - Section 3.2.2.2 provides more detailed info on when to call reset_link. @@ -212,15 +206,10 @@ error_detected(dev, pci_channel_io_frozen) to all drivers within a hierarchy in question. Then, performing link reset at upstream is necessary. As different kinds of devices might use different approaches to reset link, AER port service driver is required to provide the -function to reset link. Firstly, kernel looks for if the upstream -component has an aer driver. If it has, kernel uses the reset_link -callback of the aer driver. If the upstream component has no aer driver -and the port is downstream port, we will perform a hot reset as the -default by setting the Secondary Bus Reset bit of the Bridge Control -register associated with the downstream port. As for upstream ports, -they should provide their own aer service drivers with reset_link -function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and -reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes +function to reset link via callback parameter of pcie_do_recovery() +function. If reset_link is not NULL, recovery function will use it +to reset the link. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER +and reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes to mmio_enabled. 
helper functions diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 638e83f8085c..d411701a44e4 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -542,7 +542,7 @@ static inline int pci_dev_specific_disable_acs_redir(struct pci_dev *dev) /* PCI error reporting and recovery */ void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state, - u32 service); + pci_ers_result_t (*reset_link)(struct pci_dev *pdev)); bool pcie_wait_for_link(struct pci_dev *pdev, bool active); #ifdef CONFIG_PCIEASPM diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 4a818b07a1af..c0540c3761dc 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -102,6 +102,7 @@ struct aer_stats { #define ERR_UNCOR_ID(d) (d >> 16) static int pcie_aer_disable; +static pci_ers_result_t aer_root_reset(struct pci_dev *dev); void pci_no_aer(void) { @@ -1053,11 +1054,9 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info) info->status); pci_aer_clear_device_status(dev); } else if (info->severity == AER_NONFATAL) - pcie_do_recovery(dev, pci_channel_io_normal, - PCIE_PORT_SERVICE_AER); + pcie_do_recovery(dev, pci_channel_io_normal, aer_root_reset); else if (info->severity == AER_FATAL) - pcie_do_recovery(dev, pci_channel_io_frozen, - PCIE_PORT_SERVICE_AER); + pcie_do_recovery(dev, pci_channel_io_frozen, aer_root_reset); pci_dev_put(dev); } @@ -1094,10 +1093,10 @@ static void aer_recover_work_func(struct work_struct *work) cper_print_aer(pdev, entry.severity, entry.regs); if (entry.severity == AER_NONFATAL) pcie_do_recovery(pdev, pci_channel_io_normal, - PCIE_PORT_SERVICE_AER); + aer_root_reset); else if (entry.severity == AER_FATAL) pcie_do_recovery(pdev, pci_channel_io_frozen, - PCIE_PORT_SERVICE_AER); + aer_root_reset); pci_dev_put(pdev); } } @@ -1501,7 +1500,6 @@ static struct pcie_port_service_driver aerdriver = { .probe = aer_probe, .remove = aer_remove, - .reset_link = aer_root_reset, }; /** diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index 6b116d7fdb89..1ae5d94944eb 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -227,7 +227,7 @@ static irqreturn_t dpc_handler(int irq, void *context) } /* We configure DPC so it only triggers on ERR_FATAL */ - pcie_do_recovery(pdev, pci_channel_io_frozen, PCIE_PORT_SERVICE_DPC); + pcie_do_recovery(pdev, pci_channel_io_frozen, dpc_reset_link); return IRQ_HANDLED; } @@ -313,7 +313,6 @@ static struct pcie_port_service_driver dpcdriver = { .service = PCIE_PORT_SERVICE_DPC, .probe = dpc_probe, .remove = dpc_remove, - .reset_link = dpc_reset_link, }; int __init pcie_dpc_init(void) diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index c37c8daa83c6..5956ef74805a 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -142,49 +142,9 @@ static int report_resume(struct pci_dev *dev, void *data) return 0; } -/** - * default_reset_link - default reset function - * @dev: pointer to pci_dev data structure - * - * Invoked when performing link reset on a Downstream Port or a - * Root Port with no aer driver. - */ -static pci_ers_result_t default_reset_link(struct pci_dev *dev) -{ - int rc; - - rc = pci_bus_error_reset(dev); - pci_printk(KERN_DEBUG, dev, "downstream link has been reset\n"); - return rc ? 
PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; -} - -static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service) -{ - pci_ers_result_t status; - struct pcie_port_service_driver *driver = NULL; - - driver = pcie_port_find_service(dev, service); - if (driver && driver->reset_link) { - status = driver->reset_link(dev); - } else if (pcie_downstream_port(dev)) { - status = default_reset_link(dev); - } else { - pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n", - pci_name(dev)); - return PCI_ERS_RESULT_DISCONNECT; - } - - if (status != PCI_ERS_RESULT_RECOVERED) { - pci_printk(KERN_DEBUG, dev, "link reset at upstream device %s failed\n", - pci_name(dev)); - return PCI_ERS_RESULT_DISCONNECT; - } - - return status; -} - -void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state, - u32 service) +void pcie_do_recovery(struct pci_dev *dev, + enum pci_channel_state state, + pci_ers_result_t (*reset_link)(struct pci_dev *pdev)) { pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER; struct pci_bus *bus; @@ -201,14 +161,16 @@ void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state, pci_dbg(dev, "broadcast error_detected message\n"); if (state == pci_channel_io_frozen) { pci_walk_bus(bus, report_frozen_detected, &status); - status = reset_link(dev, service); - if (status != PCI_ERS_RESULT_RECOVERED) + status = reset_link(dev); + if (status != PCI_ERS_RESULT_RECOVERED) { + pci_warn(dev, "link reset failed\n"); goto failed; + } } else { pci_walk_bus(bus, report_normal_detected, &status); } if (state == pci_channel_io_frozen && - reset_link(dev, service) != PCI_ERS_RESULT_RECOVERED) + reset_link(dev) != PCI_ERS_RESULT_RECOVERED) goto failed; if (status == PCI_ERS_RESULT_CAN_RECOVER) { diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 1e673619b101..64b5e081cdb2 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -92,9 +92,6 @@ struct pcie_port_service_driver { /* Device driver may resume normal operations */ void (*error_resume)(struct pci_dev *dev); - /* Link Reset Capability - AER service driver specific */ - pci_ers_result_t (*reset_link)(struct pci_dev *dev); - int port_type; /* Type of the port this driver can handle */ u32 service; /* Port service this device represents */ @@ -161,7 +158,5 @@ static inline int pcie_aer_get_firmware_first(struct pci_dev *pci_dev) } #endif -struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev, - u32 service); struct device *pcie_port_find_device(struct pci_dev *dev, u32 service); #endif /* _PORTDRV_H_ */ diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 5075cb9e850c..50a9522ab07d 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -458,27 +458,6 @@ static int find_service_iter(struct device *device, void *data) return 0; } -/** - * pcie_port_find_service - find the service driver - * @dev: PCI Express port the service is associated with - * @service: Service to find - * - * Find PCI Express port service driver associated with given service - */ -struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev, - u32 service) -{ - struct pcie_port_service_driver *drv; - struct portdrv_service_data pdrvs; - - pdrvs.drv = NULL; - pdrvs.service = service; - device_for_each_child(&dev->dev, &pdrvs, find_service_iter); - - drv = pdrvs.drv; - return drv; -} - /** * pcie_port_find_device - find the struct device * @dev: PCI Express port the service is 
associated with -- Gitee From d79f2989535d912b1da35fcb387c45d063ca7f51 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Mon, 23 Mar 2020 17:26:04 -0700 Subject: [PATCH 0898/2161] PCI/DPC: Cache DPC capabilities in pci_init_capabilities() commit 27005618178ef9e9bf9c42fd91101771c92e9308 upstream. Since Error Disconnect Recover needs to use DPC error handling routines even if the OS doesn't have control of DPC, move the initalization and caching of DPC capabilities from the DPC driver to pci_init_capabilities(). Link: https://lore.kernel.org/r/5888380657c8b9551675b5dbd48e370e4fd2703d.1585000084.git.sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pci.h | 2 ++ drivers/pci/pcie/dpc.c | 33 +++++++++++++++++++++------------ drivers/pci/probe.c | 1 + 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index d411701a44e4..96e179b7fafa 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -452,9 +452,11 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info); #ifdef CONFIG_PCIE_DPC void pci_save_dpc_state(struct pci_dev *dev); void pci_restore_dpc_state(struct pci_dev *dev); +void pci_dpc_init(struct pci_dev *pdev); #else static inline void pci_save_dpc_state(struct pci_dev *dev) {} static inline void pci_restore_dpc_state(struct pci_dev *dev) {} +static inline void pci_dpc_init(struct pci_dev *pdev) {} #endif #ifdef CONFIG_PCI_ATS diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index 1ae5d94944eb..a1c9d45876bd 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -249,6 +249,27 @@ static irqreturn_t dpc_irq(int irq, void *context) return IRQ_HANDLED; } +void pci_dpc_init(struct pci_dev *pdev) +{ + u16 cap; + + pdev->dpc_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DPC); + if (!pdev->dpc_cap) + return; + + pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CAP, &cap); + if (!(cap & PCI_EXP_DPC_CAP_RP_EXT)) + return; + + pdev->dpc_rp_extensions = true; + pdev->dpc_rp_log_size = (cap & PCI_EXP_DPC_RP_PIO_LOG_SIZE) >> 8; + if (pdev->dpc_rp_log_size < 4 || pdev->dpc_rp_log_size > 9) { + pci_err(pdev, "RP PIO log size %u is invalid\n", + pdev->dpc_rp_log_size); + pdev->dpc_rp_log_size = 0; + } +} + #define FLAG(x, y) (((x) & (y)) ? '+' : '-') static int dpc_probe(struct pcie_device *dev) { @@ -260,8 +281,6 @@ static int dpc_probe(struct pcie_device *dev) if (pcie_aer_get_firmware_first(pdev) && !pcie_ports_dpc_native) return -ENOTSUPP; - pdev->dpc_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DPC); - status = devm_request_threaded_irq(device, dev->irq, dpc_irq, dpc_handler, IRQF_SHARED, "pcie-dpc", pdev); @@ -274,16 +293,6 @@ static int dpc_probe(struct pcie_device *dev) pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CAP, &cap); pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, &ctl); - pdev->dpc_rp_extensions = (cap & PCI_EXP_DPC_CAP_RP_EXT) ? 
1 : 0; - if (pdev->dpc_rp_extensions) { - pdev->dpc_rp_log_size = (cap & PCI_EXP_DPC_RP_PIO_LOG_SIZE) >> 8; - if (pdev->dpc_rp_log_size < 4 || pdev->dpc_rp_log_size > 9) { - pci_err(pdev, "RP PIO log size %u is invalid\n", - pdev->dpc_rp_log_size); - pdev->dpc_rp_log_size = 0; - } - } - ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN; pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 58bf772157ec..8e4ba3cdadd0 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2382,6 +2382,7 @@ static void pci_init_capabilities(struct pci_dev *dev) /* Advanced Error Reporting */ pci_aer_init(dev); + pci_dpc_init(dev); /* Downstream Port Containment */ pcie_report_downtraining(dev); -- Gitee From c47ee7c409ef68b8a949fdb2d8866fa77bd69ddf Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Mon, 23 Mar 2020 17:26:06 -0700 Subject: [PATCH 0899/2161] PCI/DPC: Expose dpc_process_error(), dpc_reset_link() for use by EDR commit aea47413e7ceec6024f5a2b743cb1a4b2176bf3f upstream. If firmware controls DPC, it is generally responsible for managing the DPC capability and events, and the OS should not access the DPC capability. However, if firmware controls DPC and both the OS and the platform support Error Disconnect Recover (EDR) notifications, the OS EDR notify handler is responsible for recovery, and the notify handler may read/write the DPC capability until it clears the DPC Trigger Status bit. See [1], sec 4.5.1, table 4-6. Expose some DPC error handling functions so they can be used by the EDR notify handler. [1] Downstream Port Containment Related Enhancements ECN, Jan 28, 2019, affecting PCI Firmware Specification, Rev. 3.2 https://members.pcisig.com/wg/PCI-SIG/document/12888 Link: https://lore.kernel.org/r/e9000bb15b3a4293e81d98bb29ead7c84a6393c9.1585000084.git.sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pci.h | 2 ++ drivers/pci/pcie/dpc.c | 12 +++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 96e179b7fafa..287fc4d6033a 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -453,6 +453,8 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info); void pci_save_dpc_state(struct pci_dev *dev); void pci_restore_dpc_state(struct pci_dev *dev); void pci_dpc_init(struct pci_dev *pdev); +void dpc_process_error(struct pci_dev *pdev); +pci_ers_result_t dpc_reset_link(struct pci_dev *pdev); #else static inline void pci_save_dpc_state(struct pci_dev *dev) {} static inline void pci_restore_dpc_state(struct pci_dev *dev) {} diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index a1c9d45876bd..22998ee2f7ea 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -89,7 +89,7 @@ static int dpc_wait_rp_inactive(struct pci_dev *pdev) return 0; } -static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev) +pci_ers_result_t dpc_reset_link(struct pci_dev *pdev) { u16 cap; @@ -193,9 +193,8 @@ static int dpc_get_aer_uncorrect_severity(struct pci_dev *dev, return 1; } -static irqreturn_t dpc_handler(int irq, void *context) +void dpc_process_error(struct pci_dev *pdev) { - struct pci_dev *pdev = context; u16 cap = pdev->dpc_cap, status, source, reason, ext_reason; struct aer_err_info info; @@ -225,6 +224,13 @@ static irqreturn_t dpc_handler(int irq, void *context) 
pci_cleanup_aer_uncorrect_error_status(pdev); pci_aer_clear_fatal_status(pdev); } +} + +static irqreturn_t dpc_handler(int irq, void *context) +{ + struct pci_dev *pdev = context; + + dpc_process_error(pdev); /* We configure DPC so it only triggers on ERR_FATAL */ pcie_do_recovery(pdev, pci_channel_io_frozen, dpc_reset_link); -- Gitee From e0e0a3f751e790dd80f98ee5a0a139ad7a6d817b Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 23 Mar 2020 17:26:08 -0700 Subject: [PATCH 0900/2161] PCI/AER: Rationalize error status register clearing commit 894020fdd88c1e9a74c60b67c0f19f1c7696ba2f upstream. Conflicts: - Minor refractor due to code differences, Thanks to Katrin - Introduce an compact helper in include/linux/aer Too many 3rd part modules depend on this symbol, it can break easily later if we drop in some other modules, so use a function as alias. We may make is a GPL_SYMBOL later. The AER interfaces to clear error status registers were a confusing mess: - pci_cleanup_aer_uncorrect_error_status() cleared non-fatal errors from the Uncorrectable Error Status register. - pci_aer_clear_fatal_status() cleared fatal errors from the Uncorrectable Error Status register. - pci_cleanup_aer_error_status_regs() cleared the Root Error Status register (for Root Ports), the Uncorrectable Error Status register, and the Correctable Error Status register. Rename them to make them consistent: From To ---------------------------------------- ------------------------------- pci_cleanup_aer_uncorrect_error_status() pci_aer_clear_nonfatal_status() pci_aer_clear_fatal_status() pci_aer_clear_fatal_status() pci_cleanup_aer_error_status_regs() pci_aer_clear_status() Since pci_cleanup_aer_error_status_regs() (renamed to pci_aer_clear_status()) is only used within drivers/pci/, move the declaration from to drivers/pci/pci.h. [bhelgaas: commit log, add renames] Link: https://lore.kernel.org/r/d1310a75dc3d28f7e8da4e99c45fbd3e60fe238e.1585000084.git.sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Signed-off-by: Kairui Song Signed-off-by: Xinghui Li --- Documentation/PCI/pcieaer-howto.rst | 4 ++-- drivers/net/ethernet/intel/ice/ice_main.c | 2 +- drivers/ntb/hw/idt/ntb_hw_idt.c | 4 ++-- drivers/pci/pci.c | 2 +- drivers/pci/pci.h | 2 ++ drivers/pci/pcie/aer.c | 8 ++++---- drivers/pci/pcie/dpc.c | 2 +- drivers/pci/pcie/err.c | 2 +- drivers/scsi/lpfc/lpfc_attr.c | 6 +----- include/linux/aer.h | 21 ++++++++++++++------- 10 files changed, 29 insertions(+), 24 deletions(-) diff --git a/Documentation/PCI/pcieaer-howto.rst b/Documentation/PCI/pcieaer-howto.rst index afbd8c1c321d..0b36b9ebfa4b 100644 --- a/Documentation/PCI/pcieaer-howto.rst +++ b/Documentation/PCI/pcieaer-howto.rst @@ -232,9 +232,9 @@ messages to root port when an error is detected. :: - int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);` + int pci_aer_clear_nonfatal_status(struct pci_dev *dev);` -pci_cleanup_aer_uncorrect_error_status cleanups the uncorrectable +pci_aer_clear_nonfatal_status clears non-fatal errors in the uncorrectable error status register. 
Frequent Asked Questions diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 7151053ca28c..0461100e5c9d 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -6601,7 +6601,7 @@ static pci_ers_result_t ice_pci_err_slot_reset(struct pci_dev *pdev) err = pci_aer_clear_nonfatal_status(pdev); if (err) - dev_dbg(&pdev->dev, "pci_aer_clear_nonfatal_status failed, error %d\n", + dev_dbg(&pdev->dev, "pci_aer_clear_nonfatal_status() failed, error %d\n", err); /* non-fatal, continue */ diff --git a/drivers/ntb/hw/idt/ntb_hw_idt.c b/drivers/ntb/hw/idt/ntb_hw_idt.c index dcf234680535..edae52384b8a 100644 --- a/drivers/ntb/hw/idt/ntb_hw_idt.c +++ b/drivers/ntb/hw/idt/ntb_hw_idt.c @@ -2674,8 +2674,8 @@ static int idt_init_pci(struct idt_ntb_dev *ndev) ret = pci_enable_pcie_error_reporting(pdev); if (ret != 0) dev_warn(&pdev->dev, "PCIe AER capability disabled\n"); - else /* Cleanup uncorrectable error status before getting to init */ - pci_cleanup_aer_uncorrect_error_status(pdev); + else /* Cleanup nonfatal error status before getting to init */ + pci_aer_clear_nonfatal_status(pdev); /* First enable the PCI device */ ret = pcim_enable_device(pdev); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index a73e7cabca94..b90c7319dbc5 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1472,7 +1472,7 @@ void pci_restore_state(struct pci_dev *dev) pci_restore_rebar_state(dev); pci_restore_dpc_state(dev); - pci_cleanup_aer_error_status_regs(dev); + pci_aer_clear_status(dev); pci_restore_aer_state(dev); pci_restore_config_space(dev); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 287fc4d6033a..1a995d611437 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -670,12 +670,14 @@ void pci_aer_exit(struct pci_dev *dev); extern const struct attribute_group aer_stats_attr_group; void pci_aer_clear_fatal_status(struct pci_dev *dev); void pci_aer_clear_device_status(struct pci_dev *dev); +int pci_aer_clear_status(struct pci_dev *dev); #else static inline void pci_no_aer(void) { } static inline void pci_aer_init(struct pci_dev *d) { } static inline void pci_aer_exit(struct pci_dev *d) { } static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } static inline void pci_aer_clear_device_status(struct pci_dev *dev) { } +static inline int pci_aer_clear_status(struct pci_dev *dev) { return -EINVAL; } #endif #ifdef CONFIG_ACPI diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index c0540c3761dc..b37ca1f96b5c 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -377,7 +377,7 @@ void pci_aer_clear_device_status(struct pci_dev *dev) pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta); } -int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev) +int pci_aer_clear_nonfatal_status(struct pci_dev *dev) { int pos; u32 status, sev; @@ -398,7 +398,7 @@ int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev) return 0; } -EXPORT_SYMBOL_GPL(pci_cleanup_aer_uncorrect_error_status); +EXPORT_SYMBOL_GPL(pci_aer_clear_nonfatal_status); void pci_aer_clear_fatal_status(struct pci_dev *dev) { @@ -420,7 +420,7 @@ void pci_aer_clear_fatal_status(struct pci_dev *dev) pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status); } -int pci_cleanup_aer_error_status_regs(struct pci_dev *dev) +int pci_aer_clear_status(struct pci_dev *dev) { int pos; u32 status; @@ -516,7 +516,7 @@ void pci_aer_init(struct pci_dev *dev) n = pcie_cap_has_rtctl(dev) ? 
5 : 4; pci_add_ext_cap_save_buffer(dev, PCI_EXT_CAP_ID_ERR, sizeof(u32) * n); - pci_cleanup_aer_error_status_regs(dev); + pci_aer_clear_status(dev); } void pci_aer_exit(struct pci_dev *dev) diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index 22998ee2f7ea..762170423fdd 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -221,7 +221,7 @@ void dpc_process_error(struct pci_dev *pdev) dpc_get_aer_uncorrect_severity(pdev, &info) && aer_get_device_error_info(pdev, &info)) { aer_print_error(pdev, &info); - pci_cleanup_aer_uncorrect_error_status(pdev); + pci_aer_clear_nonfatal_status(pdev); pci_aer_clear_fatal_status(pdev); } } diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 5956ef74805a..82750614fc91 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -197,7 +197,7 @@ void pcie_do_recovery(struct pci_dev *dev, pci_walk_bus(bus, report_resume, &status); pci_aer_clear_device_status(dev); - pci_cleanup_aer_uncorrect_error_status(dev); + pci_aer_clear_nonfatal_status(dev); pci_info(dev, "AER: Device recovery successful\n"); return; diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 94def7d5be78..6aeb3f6dfd8f 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -5028,7 +5028,7 @@ static DEVICE_ATTR(lpfc_aer_support, S_IRUGO | S_IWUSR, * Description: * If the @buf contains 1 and the device currently has the AER support * enabled, then invokes the kernel AER helper routine - * pci_aer_clear_nonfatal_status to clean up the uncorrectable error + * pci_aer_clear_nonfatal_status() to clean up the uncorrectable * status register. * * Notes: @@ -5054,11 +5054,7 @@ lpfc_aer_cleanup_state(struct device *dev, struct device_attribute *attr, return -EINVAL; if (phba->hba_flag & HBA_AER_ENABLED) -#if defined(BUILD_SLES15SP2) || defined(BUILD_RHEL83) rc = pci_aer_clear_nonfatal_status(phba->pcidev); -#else - rc = pci_cleanup_aer_uncorrect_error_status(phba->pcidev); -#endif if (rc == 0) return strlen(buf); diff --git a/include/linux/aer.h b/include/linux/aer.h index fa19e01f418a..4d8299f152d7 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -44,8 +44,7 @@ struct aer_capability_regs { /* PCIe port driver needs this function to enable AER */ int pci_enable_pcie_error_reporting(struct pci_dev *dev); int pci_disable_pcie_error_reporting(struct pci_dev *dev); -int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev); -int pci_cleanup_aer_error_status_regs(struct pci_dev *dev); +int pci_aer_clear_nonfatal_status(struct pci_dev *dev); void pci_save_aer_state(struct pci_dev *dev); void pci_restore_aer_state(struct pci_dev *dev); #else @@ -57,11 +56,7 @@ static inline int pci_disable_pcie_error_reporting(struct pci_dev *dev) { return -EINVAL; } -static inline int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev) -{ - return -EINVAL; -} -static inline int pci_cleanup_aer_error_status_regs(struct pci_dev *dev) +static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev) { return -EINVAL; } @@ -74,5 +69,17 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity, int cper_severity_to_aer(int cper_severity); void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn, int severity, struct aer_capability_regs *aer_regs); + +/* + * TK4: For compact reason, an alias for pci_cleanup_aer_uncorrect_error_status + * to pci_aer_clear_nonfatal_status is added here. 
Since many modules really + * expects an 5.4 kernel to have pci_cleanup_aer_uncorrect_error_status, this + * is useful as a compact layer. + */ +static inline int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev) +{ + return pci_aer_clear_nonfatal_status(dev); +} + #endif //_AER_H_ -- Gitee From b12f1c6525775162ef166c43a450fc4a038b59d6 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 13 May 2020 17:38:59 -0500 Subject: [PATCH 0901/2161] PCI: Fix pci_host_bridge struct device release/free handling commit 9885440b16b8fc1dd7275800fd28f56a92f60896 upstream. The PCI code has several paths where the struct pci_host_bridge is freed directly. This is wrong because it contains a struct device which is refcounted and should be freed using put_device(). This can result in use-after-free errors. I think this problem has existed since 2012 with commit 7b5436635800 ("PCI: add generic device into pci_host_bridge struct"). It generally hasn't mattered as most host bridge drivers are still built-in and can't unbind. The problem is a struct device should never be freed directly once device_initialize() is called and a ref is held, but that doesn't happen until pci_register_host_bridge(). There's then a window between allocating the host bridge and pci_register_host_bridge() where kfree should be used. This is fragile and requires callers to do the right thing. To fix this, we need to split device_register() into device_initialize() and device_add() calls, so that the host bridge struct is always freed by using a put_device(). devm_pci_alloc_host_bridge() is using devm_kzalloc() to allocate struct pci_host_bridge which will be freed directly. Instead, we can use a custom devres action to call put_device(). Link: https://lore.kernel.org/r/20200513223859.11295-2-robh@kernel.org Reported-by: Anders Roxell Tested-by: Anders Roxell Signed-off-by: Rob Herring Signed-off-by: Bjorn Helgaas Reviewed-by: Lorenzo Pieralisi Acked-by: Arnd Bergmann [tyhicks: Minor contextual change in pci_init_host_bridge() due to the lack of a native_dpc member in the pci_host_bridge struct. 
It was added in v5.7 with commit ac1c8e35a326 ("PCI/DPC: Add Error Disconnect Recover (EDR) support")] Signed-off-by: Tyler Hicks Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- drivers/pci/probe.c | 36 +++++++++++++++++++----------------- drivers/pci/remove.c | 2 +- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 8e4ba3cdadd0..46d06549d9ed 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -568,7 +568,7 @@ static struct pci_bus *pci_alloc_bus(struct pci_bus *parent) return b; } -static void devm_pci_release_host_bridge_dev(struct device *dev) +static void pci_release_host_bridge_dev(struct device *dev) { struct pci_host_bridge *bridge = to_pci_host_bridge(dev); @@ -577,12 +577,7 @@ static void devm_pci_release_host_bridge_dev(struct device *dev) pci_free_resource_list(&bridge->windows); pci_free_resource_list(&bridge->dma_ranges); -} - -static void pci_release_host_bridge_dev(struct device *dev) -{ - devm_pci_release_host_bridge_dev(dev); - kfree(to_pci_host_bridge(dev)); + kfree(bridge); } static void pci_init_host_bridge(struct pci_host_bridge *bridge) @@ -601,6 +596,8 @@ static void pci_init_host_bridge(struct pci_host_bridge *bridge) bridge->native_shpc_hotplug = 1; bridge->native_pme = 1; bridge->native_ltr = 1; + + device_initialize(&bridge->dev); } struct pci_host_bridge *pci_alloc_host_bridge(size_t priv) @@ -618,17 +615,25 @@ struct pci_host_bridge *pci_alloc_host_bridge(size_t priv) } EXPORT_SYMBOL(pci_alloc_host_bridge); +static void devm_pci_alloc_host_bridge_release(void *data) +{ + pci_free_host_bridge(data); +} + struct pci_host_bridge *devm_pci_alloc_host_bridge(struct device *dev, size_t priv) { + int ret; struct pci_host_bridge *bridge; - bridge = devm_kzalloc(dev, sizeof(*bridge) + priv, GFP_KERNEL); + bridge = pci_alloc_host_bridge(priv); if (!bridge) return NULL; - pci_init_host_bridge(bridge); - bridge->dev.release = devm_pci_release_host_bridge_dev; + ret = devm_add_action_or_reset(dev, devm_pci_alloc_host_bridge_release, + bridge); + if (ret) + return NULL; return bridge; } @@ -636,10 +641,7 @@ EXPORT_SYMBOL(devm_pci_alloc_host_bridge); void pci_free_host_bridge(struct pci_host_bridge *bridge) { - pci_free_resource_list(&bridge->windows); - pci_free_resource_list(&bridge->dma_ranges); - - kfree(bridge); + put_device(&bridge->dev); } EXPORT_SYMBOL(pci_free_host_bridge); @@ -870,7 +872,7 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) if (err) goto free; - err = device_register(&bridge->dev); + err = device_add(&bridge->dev); if (err) { put_device(&bridge->dev); goto free; @@ -937,7 +939,7 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) unregister: put_device(&bridge->dev); - device_unregister(&bridge->dev); + device_del(&bridge->dev); free: kfree(bus); @@ -2967,7 +2969,7 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus, return bridge->bus; err_out: - kfree(bridge); + put_device(&bridge->dev); return NULL; } EXPORT_SYMBOL_GPL(pci_create_root_bus); diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index e9c6b120cf45..95dec03d9f2a 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -160,6 +160,6 @@ void pci_remove_root_bus(struct pci_bus *bus) host_bridge->bus = NULL; /* remove the host bridge */ - device_unregister(&host_bridge->dev); + device_del(&host_bridge->dev); } EXPORT_SYMBOL_GPL(pci_remove_root_bus); -- Gitee From 1506d1c08588e2ff1855b6551808508f95784e79 Mon Sep 17 00:00:00 2001 From: 
Kuppuswamy Sathyanarayanan Date: Mon, 23 Mar 2020 17:26:07 -0700 Subject: [PATCH 0902/2161] PCI/DPC: Add Error Disconnect Recover (EDR) support commit ac1c8e35a3262d04cc81b07fac6480a3539e3b0f upstream. Error Disconnect Recover (EDR) is a feature that allows ACPI firmware to notify OSPM that a device has been disconnected due to an error condition (ACPI v6.3, sec 5.6.6). OSPM advertises its support for EDR on PCI devices via _OSC (see [1], sec 4.5.1, table 4-4). The OSPM EDR notify handler should invalidate software state associated with disconnected devices and may attempt to recover them. OSPM communicates the status of recovery to the firmware via _OST (sec 6.3.5.2). For PCIe, firmware may use Downstream Port Containment (DPC) to support EDR. Per [1], sec 4.5.1, table 4-6, even if firmware has retained control of DPC, OSPM may read/write DPC control and status registers during the EDR notification processing window, i.e., from the time it receives an EDR notification until it clears the DPC Trigger Status. Note that per [1], sec 4.5.1 and 4.5.2.4, 1. If the OS supports EDR, it should advertise that to firmware by setting OSC_PCI_EDR_SUPPORT in _OSC Support. 2. If the OS sets OSC_PCI_EXPRESS_DPC_CONTROL in _OSC Control to request control of the DPC capability, it must also set OSC_PCI_EDR_SUPPORT in _OSC Support. Add an EDR notify handler to attempt recovery. [1] Downstream Port Containment Related Enhancements ECN, Jan 28, 2019, affecting PCI Firmware Specification, Rev. 3.2 https://members.pcisig.com/wg/PCI-SIG/document/12888 [bhelgaas: squash add/enable patches into one] Link: https://lore.kernel.org/r/90f91fe6d25c13f9d2255d2ce97ca15be307e1bb.1585000084.git.sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Cc: "Rafael J. 
Wysocki" Cc: Len Brown Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/pci_root.c | 15 +++ drivers/pci/pci-acpi.c | 2 + drivers/pci/pcie/Kconfig | 10 ++ drivers/pci/pcie/Makefile | 1 + drivers/pci/pcie/edr.c | 239 ++++++++++++++++++++++++++++++++++++++ drivers/pci/probe.c | 1 + include/linux/acpi.h | 6 +- include/linux/pci-acpi.h | 8 ++ include/linux/pci.h | 1 + 9 files changed, 281 insertions(+), 2 deletions(-) create mode 100644 drivers/pci/pcie/edr.c diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index d1e666ef3fcc..0cb9df5462c3 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -131,6 +131,7 @@ static struct pci_osc_bit_struct pci_osc_support_bit[] = { { OSC_PCI_CLOCK_PM_SUPPORT, "ClockPM" }, { OSC_PCI_SEGMENT_GROUPS_SUPPORT, "Segments" }, { OSC_PCI_MSI_SUPPORT, "MSI" }, + { OSC_PCI_EDR_SUPPORT, "EDR" }, { OSC_PCI_HPX_TYPE_3_SUPPORT, "HPX-Type3" }, }; @@ -141,6 +142,7 @@ static struct pci_osc_bit_struct pci_osc_control_bit[] = { { OSC_PCI_EXPRESS_AER_CONTROL, "AER" }, { OSC_PCI_EXPRESS_CAPABILITY_CONTROL, "PCIeCapability" }, { OSC_PCI_EXPRESS_LTR_CONTROL, "LTR" }, + { OSC_PCI_EXPRESS_DPC_CONTROL, "DPC" }, }; static void decode_osc_bits(struct acpi_pci_root *root, char *msg, u32 word, @@ -440,6 +442,8 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, support |= OSC_PCI_ASPM_SUPPORT | OSC_PCI_CLOCK_PM_SUPPORT; if (pci_msi_enabled()) support |= OSC_PCI_MSI_SUPPORT; + if (IS_ENABLED(CONFIG_PCIE_EDR)) + support |= OSC_PCI_EDR_SUPPORT; decode_osc_support(root, "OS supports", support); status = acpi_pci_osc_support(root, support); @@ -487,6 +491,15 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, control |= OSC_PCI_EXPRESS_AER_CONTROL; } + /* + * Per the Downstream Port Containment Related Enhancements ECN to + * the PCI Firmware Spec, r3.2, sec 4.5.1, table 4-5, + * OSC_PCI_EXPRESS_DPC_CONTROL indicates the OS supports both DPC + * and EDR. + */ + if (IS_ENABLED(CONFIG_PCIE_DPC) && IS_ENABLED(CONFIG_PCIE_EDR)) + control |= OSC_PCI_EXPRESS_DPC_CONTROL; + requested = control; status = acpi_pci_osc_control_set(handle, &control, OSC_PCI_EXPRESS_CAPABILITY_CONTROL); @@ -916,6 +929,8 @@ struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root, host_bridge->native_pme = 0; if (!(root->osc_control_set & OSC_PCI_EXPRESS_LTR_CONTROL)) host_bridge->native_ltr = 0; + if (!(root->osc_control_set & OSC_PCI_EXPRESS_DPC_CONTROL)) + host_bridge->native_dpc = 0; /* * Evaluate the "PCI Boot Configuration" _DSM Function. If it diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 3a87b5b7f9b0..6c6045939fdf 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -1250,6 +1250,7 @@ static void pci_acpi_setup(struct device *dev) pci_acpi_optimize_delay(pci_dev, adev->handle); pci_acpi_set_external_facing(pci_dev); + pci_acpi_add_edr_notifier(pci_dev); pci_acpi_add_pm_notifier(adev, pci_dev); if (!adev->wakeup.flags.valid) @@ -1277,6 +1278,7 @@ static void pci_acpi_cleanup(struct device *dev) if (!adev) return; + pci_acpi_remove_edr_notifier(pci_dev); pci_acpi_remove_pm_notifier(adev); if (adev->wakeup.flags.valid) { acpi_device_power_remove_dependent(adev, dev); diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig index a5a174d70e05..2dc4ef1141d7 100644 --- a/drivers/pci/pcie/Kconfig +++ b/drivers/pci/pcie/Kconfig @@ -151,3 +151,13 @@ config PCIE_BW This enables PCI Express Bandwidth Change Notification. 
If you know link width or rate changes occur only to correct unreliable links, you may answer Y. + +config PCIE_EDR + bool "PCI Express Error Disconnect Recover support" + depends on PCIE_DPC && ACPI + help + This option adds Error Disconnect Recover support as specified + in the Downstream Port Containment Related Enhancements ECN to + the PCI Firmware Specification r3.2. Enable this if you want to + support hybrid DPC model which uses both firmware and OS to + implement DPC. diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile index efb9d2e71e9e..68da9280ff11 100644 --- a/drivers/pci/pcie/Makefile +++ b/drivers/pci/pcie/Makefile @@ -13,3 +13,4 @@ obj-$(CONFIG_PCIE_PME) += pme.o obj-$(CONFIG_PCIE_DPC) += dpc.o obj-$(CONFIG_PCIE_PTM) += ptm.o obj-$(CONFIG_PCIE_BW) += bw_notification.o +obj-$(CONFIG_PCIE_EDR) += edr.o diff --git a/drivers/pci/pcie/edr.c b/drivers/pci/pcie/edr.c new file mode 100644 index 000000000000..594622a6cb16 --- /dev/null +++ b/drivers/pci/pcie/edr.c @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PCI Error Disconnect Recover support + * Author: Kuppuswamy Sathyanarayanan + * + * Copyright (C) 2020 Intel Corp. + */ + +#define dev_fmt(fmt) "EDR: " fmt + +#include +#include + +#include "portdrv.h" +#include "../pci.h" + +#define EDR_PORT_DPC_ENABLE_DSM 0x0C +#define EDR_PORT_LOCATE_DSM 0x0D +#define EDR_OST_SUCCESS 0x80 +#define EDR_OST_FAILED 0x81 + +/* + * _DSM wrapper function to enable/disable DPC + * @pdev : PCI device structure + * + * returns 0 on success or errno on failure. + */ +static int acpi_enable_dpc(struct pci_dev *pdev) +{ + struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + union acpi_object *obj, argv4, req; + int status = 0; + + /* + * Behavior when calling unsupported _DSM functions is undefined, + * so check whether EDR_PORT_DPC_ENABLE_DSM is supported. + */ + if (!acpi_check_dsm(adev->handle, &pci_acpi_dsm_guid, 5, + 1ULL << EDR_PORT_DPC_ENABLE_DSM)) + return 0; + + req.type = ACPI_TYPE_INTEGER; + req.integer.value = 1; + + argv4.type = ACPI_TYPE_PACKAGE; + argv4.package.count = 1; + argv4.package.elements = &req; + + /* + * Per Downstream Port Containment Related Enhancements ECN to PCI + * Firmware Specification r3.2, sec 4.6.12, EDR_PORT_DPC_ENABLE_DSM is + * optional. Return success if it's not implemented. + */ + obj = acpi_evaluate_dsm(adev->handle, &pci_acpi_dsm_guid, 5, + EDR_PORT_DPC_ENABLE_DSM, &argv4); + if (!obj) + return 0; + + if (obj->type != ACPI_TYPE_INTEGER) { + pci_err(pdev, FW_BUG "Enable DPC _DSM returned non integer\n"); + status = -EIO; + } + + if (obj->integer.value != 1) { + pci_err(pdev, "Enable DPC _DSM failed to enable DPC\n"); + status = -EIO; + } + + ACPI_FREE(obj); + + return status; +} + +/* + * _DSM wrapper function to locate DPC port + * @pdev : Device which received EDR event + * + * Returns pci_dev or NULL. Caller is responsible for dropping a reference + * on the returned pci_dev with pci_dev_put(). + */ +static struct pci_dev *acpi_dpc_port_get(struct pci_dev *pdev) +{ + struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + union acpi_object *obj; + u16 port; + + /* + * Behavior when calling unsupported _DSM functions is undefined, + * so check whether EDR_PORT_DPC_ENABLE_DSM is supported. 
+ */ + if (!acpi_check_dsm(adev->handle, &pci_acpi_dsm_guid, 5, + 1ULL << EDR_PORT_LOCATE_DSM)) + return pci_dev_get(pdev); + + obj = acpi_evaluate_dsm(adev->handle, &pci_acpi_dsm_guid, 5, + EDR_PORT_LOCATE_DSM, NULL); + if (!obj) + return pci_dev_get(pdev); + + if (obj->type != ACPI_TYPE_INTEGER) { + ACPI_FREE(obj); + pci_err(pdev, FW_BUG "Locate Port _DSM returned non integer\n"); + return NULL; + } + + /* + * Firmware returns DPC port BDF details in following format: + * 15:8 = bus + * 7:3 = device + * 2:0 = function + */ + port = obj->integer.value; + + ACPI_FREE(obj); + + return pci_get_domain_bus_and_slot(pci_domain_nr(pdev->bus), + PCI_BUS_NUM(port), port & 0xff); +} + +/* + * _OST wrapper function to let firmware know the status of EDR event + * @pdev : Device used to send _OST + * @edev : Device which experienced EDR event + * @status : Status of EDR event + */ +static int acpi_send_edr_status(struct pci_dev *pdev, struct pci_dev *edev, + u16 status) +{ + struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + u32 ost_status; + + pci_dbg(pdev, "Status for %s: %#x\n", pci_name(edev), status); + + ost_status = PCI_DEVID(edev->bus->number, edev->devfn) << 16; + ost_status |= status; + + status = acpi_evaluate_ost(adev->handle, ACPI_NOTIFY_DISCONNECT_RECOVER, + ost_status, NULL); + if (ACPI_FAILURE(status)) + return -EINVAL; + + return 0; +} + +static void edr_handle_event(acpi_handle handle, u32 event, void *data) +{ + struct pci_dev *pdev = data, *edev; + pci_ers_result_t estate = PCI_ERS_RESULT_DISCONNECT; + u16 status; + + pci_info(pdev, "ACPI event %#x received\n", event); + + if (event != ACPI_NOTIFY_DISCONNECT_RECOVER) + return; + + /* Locate the port which issued EDR event */ + edev = acpi_dpc_port_get(pdev); + if (!edev) { + pci_err(pdev, "Firmware failed to locate DPC port\n"); + return; + } + + pci_dbg(pdev, "Reported EDR dev: %s\n", pci_name(edev)); + + /* If port does not support DPC, just send the OST */ + if (!edev->dpc_cap) { + pci_err(edev, FW_BUG "This device doesn't support DPC\n"); + goto send_ost; + } + + /* Check if there is a valid DPC trigger */ + pci_read_config_word(edev, edev->dpc_cap + PCI_EXP_DPC_STATUS, &status); + if (!(status & PCI_EXP_DPC_STATUS_TRIGGER)) { + pci_err(edev, "Invalid DPC trigger %#010x\n", status); + goto send_ost; + } + + dpc_process_error(edev); + pci_aer_raw_clear_status(edev); + + /* + * Irrespective of whether the DPC event is triggered by ERR_FATAL + * or ERR_NONFATAL, since the link is already down, use the FATAL + * error recovery path for both cases. + */ + estate = pcie_do_recovery(edev, pci_channel_io_frozen, dpc_reset_link); + +send_ost: + + /* + * If recovery is successful, send _OST(0xF, BDF << 16 | 0x80) + * to firmware. If not successful, send _OST(0xF, BDF << 16 | 0x81). 
+ */ + if (estate == PCI_ERS_RESULT_RECOVERED) { + pci_dbg(edev, "DPC port successfully recovered\n"); + acpi_send_edr_status(pdev, edev, EDR_OST_SUCCESS); + } else { + pci_dbg(edev, "DPC port recovery failed\n"); + acpi_send_edr_status(pdev, edev, EDR_OST_FAILED); + } + + pci_dev_put(edev); +} + +void pci_acpi_add_edr_notifier(struct pci_dev *pdev) +{ + struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + acpi_status status; + + if (!adev) { + pci_dbg(pdev, "No valid ACPI node, skipping EDR init\n"); + return; + } + + status = acpi_install_notify_handler(adev->handle, ACPI_SYSTEM_NOTIFY, + edr_handle_event, pdev); + if (ACPI_FAILURE(status)) { + pci_err(pdev, "Failed to install notify handler\n"); + return; + } + + if (acpi_enable_dpc(pdev)) + acpi_remove_notify_handler(adev->handle, ACPI_SYSTEM_NOTIFY, + edr_handle_event); + else + pci_dbg(pdev, "Notify handler installed\n"); +} + +void pci_acpi_remove_edr_notifier(struct pci_dev *pdev) +{ + struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + + if (!adev) + return; + + acpi_remove_notify_handler(adev->handle, ACPI_SYSTEM_NOTIFY, + edr_handle_event); + pci_dbg(pdev, "Notify handler removed\n"); +} diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 46d06549d9ed..431512fbff65 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -596,6 +596,7 @@ static void pci_init_host_bridge(struct pci_host_bridge *bridge) bridge->native_shpc_hotplug = 1; bridge->native_pme = 1; bridge->native_ltr = 1; + bridge->native_dpc = 1; device_initialize(&bridge->dev); } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index ed68f45c09f7..fb43a7940a68 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -539,8 +539,9 @@ extern bool osc_pc_lpi_support_confirmed; #define OSC_PCI_CLOCK_PM_SUPPORT 0x00000004 #define OSC_PCI_SEGMENT_GROUPS_SUPPORT 0x00000008 #define OSC_PCI_MSI_SUPPORT 0x00000010 +#define OSC_PCI_EDR_SUPPORT 0x00000080 #define OSC_PCI_HPX_TYPE_3_SUPPORT 0x00000100 -#define OSC_PCI_SUPPORT_MASKS 0x0000011f +#define OSC_PCI_SUPPORT_MASKS 0x0000019f /* PCI Host Bridge _OSC: Capabilities DWORD 3: Control Field */ #define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL 0x00000001 @@ -549,7 +550,8 @@ extern bool osc_pc_lpi_support_confirmed; #define OSC_PCI_EXPRESS_AER_CONTROL 0x00000008 #define OSC_PCI_EXPRESS_CAPABILITY_CONTROL 0x00000010 #define OSC_PCI_EXPRESS_LTR_CONTROL 0x00000020 -#define OSC_PCI_CONTROL_MASKS 0x0000003f +#define OSC_PCI_EXPRESS_DPC_CONTROL 0x00000080 +#define OSC_PCI_CONTROL_MASKS 0x000000bf #define ACPI_GSB_ACCESS_ATTRIB_QUICK 0x00000002 #define ACPI_GSB_ACCESS_ATTRIB_SEND_RCV 0x00000004 diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 62b7fdcc661c..2d155bfb8fbf 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -112,6 +112,14 @@ extern const guid_t pci_acpi_dsm_guid; #define RESET_DELAY_DSM 0x08 #define FUNCTION_DELAY_DSM 0x09 +#ifdef CONFIG_PCIE_EDR +void pci_acpi_add_edr_notifier(struct pci_dev *pdev); +void pci_acpi_remove_edr_notifier(struct pci_dev *pdev); +#else +static inline void pci_acpi_add_edr_notifier(struct pci_dev *pdev) { } +static inline void pci_acpi_remove_edr_notifier(struct pci_dev *pdev) { } +#endif /* CONFIG_PCIE_EDR */ + #else /* CONFIG_ACPI */ static inline void acpi_pci_add_bus(struct pci_bus *bus) { } static inline void acpi_pci_remove_bus(struct pci_bus *bus) { } diff --git a/include/linux/pci.h b/include/linux/pci.h index 34fb420796be..f6792203017b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -528,6 +528,7 @@ 
struct pci_host_bridge { unsigned int native_shpc_hotplug:1; /* OS may use SHPC hotplug */ unsigned int native_pme:1; /* OS may use PCIe PME */ unsigned int native_ltr:1; /* OS may use PCIe LTR */ + unsigned int native_dpc:1; /* OS may use PCIe DPC */ unsigned int preserve_config:1; /* Preserve FW resource setup */ /* Resource alignment requirements */ -- Gitee From 78e67fbfacea9774fd94a5d01f4176b4a4d1c0f9 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 7 Jan 2022 16:04:17 +0800 Subject: [PATCH 0903/2161] PCI: pciehp: Ignore Link Down/Up caused by DPC commit a97396c6eb13f65bea894dbe7739b2e883d40a3e upstream. Downstream Port Containment (PCIe r5.0, sec. 6.2.10) disables the link upon an error and attempts to re-enable it when instructed by the DPC driver. A slot which is both DPC- and hotplug-capable is currently powered off by pciehp once DPC is triggered (due to the link change) and powered back up on successful recovery. That's undesirable, the slot should remain powered so the hotplugged device remains bound to its driver. DPC notifies the driver of the error and of successful recovery in pcie_do_recovery() and the driver may then restore the device to working state. Moreover, Sinan points out that turning off slot power by pciehp may foil recovery by DPC: Power off/on is a cold reset concurrently to DPC's warm reset. Sathyanarayanan reports extended delays or failure in link retraining by DPC if pciehp brings down the slot. Fix by detecting whether a Link Down event is caused by DPC and awaiting recovery if so. On successful recovery, ignore both the Link Down and the subsequent Link Up event. Afterwards, check whether the link is down to detect surprise-removal or another DPC event immediately after DPC recovery. Ensure that the corresponding DLLSC event is not ignored by synthesizing it and invoking irq_wake_thread() to trigger a re-run of pciehp_ist(). The IRQ threads of the hotplug and DPC drivers, pciehp_ist() and dpc_handler(), race against each other. If pciehp is faster than DPC, it will wait until DPC recovery completes. Recovery consists of two steps: The first step (waiting for link disablement) is recognizable by pciehp through a set DPC Trigger Status bit. The second step (waiting for link retraining) is recognizable through a newly introduced PCI_DPC_RECOVERING flag. If DPC is faster than pciehp, neither of the two flags will be set and pciehp may glean the recovery status from the new PCI_DPC_RECOVERED flag. The flag is zero if DPC didn't occur at all, hence DLLSC events are not ignored by default. pciehp waits up to 4 seconds before assuming that DPC recovery failed and bringing down the slot. This timeout is not taken from the spec (it doesn't mandate one) but based on a report from Yicong Yang that DPC may take a bit more than 3 seconds on HiSilicon's Kunpeng platform. The timeout is necessary because the DPC Trigger Status bit may never clear: On Root Ports which support RP Extensions for DPC, the DPC driver polls the DPC RP Busy bit for up to 1 second before giving up on DPC recovery. Without the timeout, pciehp would then wait indefinitely for DPC to complete. 
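The synchronization described above reduces to a two-flag handshake between the DPC and hotplug IRQ threads. The condensed C sketch below is not part of the patch: the flag, waitqueue and helper names match the diff that follows, the function bodies are abbreviated, and the _sketch suffix marks it as illustrative only.

/* DPC thread: publish the outcome, then wake any waiting pciehp thread */
static pci_ers_result_t dpc_reset_link_sketch(struct pci_dev *pdev)
{
	bool recovered;

	set_bit(PCI_DPC_RECOVERING, &pdev->priv_flags);
	/* ... clear the DPC trigger and wait for link retraining ... */
	recovered = pcie_wait_for_link(pdev, true);
	if (recovered)
		set_bit(PCI_DPC_RECOVERED, &pdev->priv_flags);
	else
		clear_bit(PCI_DPC_RECOVERED, &pdev->priv_flags);
	clear_bit(PCI_DPC_RECOVERING, &pdev->priv_flags);
	wake_up_all(&dpc_completed_waitqueue);
	return recovered ? PCI_ERS_RESULT_RECOVERED : PCI_ERS_RESULT_DISCONNECT;
}

/* pciehp thread: bounded wait, then consume the result exactly once */
static bool pci_dpc_recovered_sketch(struct pci_dev *pdev)
{
	wait_event_timeout(dpc_completed_waitqueue, dpc_completed(pdev),
			   msecs_to_jiffies(4000));
	return test_and_clear_bit(PCI_DPC_RECOVERED, &pdev->priv_flags);
}

The test_and_clear_bit() is what keeps a stale PCI_DPC_RECOVERED from masking a later, unrelated Link Down: once pciehp has consumed one recovery result, the next DLLSC event is handled normally.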
This commit draws inspiration from previous attempts to synchronize DPC with pciehp: By Sinan Kaya, August 2018: https://lore.kernel.org/linux-pci/20180818065126.77912-1-okaya@kernel.org/ By Ethan Zhao, October 2020: https://lore.kernel.org/linux-pci/20201007113158.48933-1-haifeng.zhao@intel.com/ By Kuppuswamy Sathyanarayanan, March 2021: https://lore.kernel.org/linux-pci/59cb30f5e5ac6d65427ceaadf1012b2ba8dbf66c.1615606143.git.sathyanarayanan.kuppuswamy@linux.intel.com/ Link: https://lore.kernel.org/r/0be565d97438fe2a6d57354b3aa4e8626952a00b.1619857124.git.lukas@wunner.de Reported-by: Sinan Kaya Reported-by: Ethan Zhao Reported-by: Kuppuswamy Sathyanarayanan Tested-by: Kuppuswamy Sathyanarayanan Tested-by: Yicong Yang Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Cc: Dan Williams Cc: Ashok Raj Cc: Keith Busch Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/hotplug/pciehp_hpc.c | 36 +++++++++++++++ drivers/pci/pci.h | 4 ++ drivers/pci/pcie/dpc.c | 77 +++++++++++++++++++++++++++++--- 3 files changed, 111 insertions(+), 6 deletions(-) diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 88b996764ff9..6ebb25e312b5 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -524,6 +524,32 @@ void pciehp_power_off_slot(struct controller *ctrl) PCI_EXP_SLTCTL_PWR_OFF); } +static void pciehp_ignore_dpc_link_change(struct controller *ctrl, + struct pci_dev *pdev, int irq) +{ + /* + * Ignore link changes which occurred while waiting for DPC recovery. + * Could be several if DPC triggered multiple times consecutively. + */ + synchronize_hardirq(irq); + atomic_and(~PCI_EXP_SLTSTA_DLLSC, &ctrl->pending_events); + if (pciehp_poll_mode) + pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, + PCI_EXP_SLTSTA_DLLSC); + ctrl_info(ctrl, "Slot(%s): Link Down/Up ignored (recovered by DPC)\n", + slot_name(ctrl)); + + /* + * If the link is unexpectedly down after successful recovery, + * the corresponding link change may have been ignored above. + * Synthesize it to ensure that it is acted on. + */ + down_read(&ctrl->reset_lock); + if (!pciehp_check_link_active(ctrl)) + pciehp_request(ctrl, PCI_EXP_SLTSTA_DLLSC); + up_read(&ctrl->reset_lock); +} + static irqreturn_t pciehp_isr(int irq, void *dev_id) { struct controller *ctrl = (struct controller *)dev_id; @@ -667,6 +693,16 @@ static irqreturn_t pciehp_ist(int irq, void *dev_id) PCI_EXP_SLTCTL_ATTN_IND_ON); } + /* + * Ignore Link Down/Up events caused by Downstream Port Containment + * if recovery from the error succeeded. + */ + if ((events & PCI_EXP_SLTSTA_DLLSC) && pci_dpc_recovered(pdev) && + ctrl->state == ON_STATE) { + events &= ~PCI_EXP_SLTSTA_DLLSC; + pciehp_ignore_dpc_link_change(ctrl, pdev, irq); + } + /* * Disable requests have higher priority than Presence Detect Changed * or Data Link Layer State Changed events. 
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 1a995d611437..8f73ac94f9bd 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -410,6 +410,8 @@ static inline bool pci_dev_is_disconnected(const struct pci_dev *dev) /* pci_dev priv_flags */ #define PCI_DEV_ADDED 0 +#define PCI_DPC_RECOVERED 1 +#define PCI_DPC_RECOVERING 2 static inline void pci_dev_assign_added(struct pci_dev *dev, bool added) { @@ -455,10 +457,12 @@ void pci_restore_dpc_state(struct pci_dev *dev); void pci_dpc_init(struct pci_dev *pdev); void dpc_process_error(struct pci_dev *pdev); pci_ers_result_t dpc_reset_link(struct pci_dev *pdev); +bool pci_dpc_recovered(struct pci_dev *pdev); #else static inline void pci_save_dpc_state(struct pci_dev *dev) {} static inline void pci_restore_dpc_state(struct pci_dev *dev) {} static inline void pci_dpc_init(struct pci_dev *pdev) {} +static inline bool pci_dpc_recovered(struct pci_dev *pdev) { return false; } #endif #ifdef CONFIG_PCI_ATS diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index 762170423fdd..5081e4c78795 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -71,6 +71,58 @@ void pci_restore_dpc_state(struct pci_dev *dev) pci_write_config_word(dev, dev->dpc_cap + PCI_EXP_DPC_CTL, *cap); } +static DECLARE_WAIT_QUEUE_HEAD(dpc_completed_waitqueue); + +#ifdef CONFIG_HOTPLUG_PCI_PCIE +static bool dpc_completed(struct pci_dev *pdev) +{ + u16 status; + + pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_STATUS, &status); + if ((status != 0xffff) && (status & PCI_EXP_DPC_STATUS_TRIGGER)) + return false; + + if (test_bit(PCI_DPC_RECOVERING, &pdev->priv_flags)) + return false; + + return true; +} + +/** + * pci_dpc_recovered - whether DPC triggered and has recovered successfully + * @pdev: PCI device + * + * Return true if DPC was triggered for @pdev and has recovered successfully. + * Wait for recovery if it hasn't completed yet. Called from the PCIe hotplug + * driver to recognize and ignore Link Down/Up events caused by DPC. + */ +bool pci_dpc_recovered(struct pci_dev *pdev) +{ + struct pci_host_bridge *host; + + if (!pdev->dpc_cap) + return false; + + /* + * Synchronization between hotplug and DPC is not supported + * if DPC is owned by firmware and EDR is not enabled. + */ + host = pci_find_host_bridge(pdev->bus); + if (!host->native_dpc && !IS_ENABLED(CONFIG_PCIE_EDR)) + return false; + + /* + * Need a timeout in case DPC never completes due to failure of + * dpc_wait_rp_inactive(). The spec doesn't mandate a time limit, + * but reports indicate that DPC completes within 4 seconds. + */ + wait_event_timeout(dpc_completed_waitqueue, dpc_completed(pdev), + msecs_to_jiffies(4000)); + + return test_and_clear_bit(PCI_DPC_RECOVERED, &pdev->priv_flags); +} +#endif /* CONFIG_HOTPLUG_PCI_PCIE */ + static int dpc_wait_rp_inactive(struct pci_dev *pdev) { unsigned long timeout = jiffies + HZ; @@ -91,8 +143,11 @@ static int dpc_wait_rp_inactive(struct pci_dev *pdev) pci_ers_result_t dpc_reset_link(struct pci_dev *pdev) { + pci_ers_result_t ret; u16 cap; + set_bit(PCI_DPC_RECOVERING, &pdev->priv_flags); + /* * DPC disables the Link automatically in hardware, so it has * already been reset by the time we get here. 
@@ -105,16 +160,26 @@ pci_ers_result_t dpc_reset_link(struct pci_dev *pdev) */ pcie_wait_for_link(pdev, false); - if (pdev->dpc_rp_extensions && dpc_wait_rp_inactive(pdev)) - return PCI_ERS_RESULT_DISCONNECT; + if (pdev->dpc_rp_extensions && dpc_wait_rp_inactive(pdev)) { + clear_bit(PCI_DPC_RECOVERED, &pdev->priv_flags); + ret = PCI_ERS_RESULT_DISCONNECT; + goto out; + } pci_write_config_word(pdev, cap + PCI_EXP_DPC_STATUS, PCI_EXP_DPC_STATUS_TRIGGER); - if (!pcie_wait_for_link(pdev, true)) - return PCI_ERS_RESULT_DISCONNECT; - - return PCI_ERS_RESULT_RECOVERED; + if (!pcie_wait_for_link(pdev, true)) { + clear_bit(PCI_DPC_RECOVERED, &pdev->priv_flags); + ret = PCI_ERS_RESULT_DISCONNECT; + } else { + set_bit(PCI_DPC_RECOVERED, &pdev->priv_flags); + ret = PCI_ERS_RESULT_RECOVERED; + } +out: + clear_bit(PCI_DPC_RECOVERING, &pdev->priv_flags); + wake_up_all(&dpc_completed_waitqueue); + return ret; } static void dpc_process_rp_pio_error(struct pci_dev *pdev) -- Gitee From 12c6952f12e47a627c901e83b1ddd709e6840f8f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 30 Jun 2021 14:08:25 -0700 Subject: [PATCH 0904/2161] perf/x86/intel/uncore: Add Sapphire Rapids server framework commit c54c53d9921adef2c239cb43d5a936b63c57ebf0 upstream. Intel Sapphire Rapids supports a discovery mechanism, that allows an uncore driver to discover the different components ("boxes") of the chip. All the generic information of the uncore boxes should be retrieved from the discovery tables. This has been enabled with the commit edae1f06c2cd ("perf/x86/intel/uncore: Parse uncore discovery tables"). Add use_discovery to indicate the case. The uncore driver doesn't need to hard code the generic information for each uncore box. But we still need to enable various functionality that cannot be directly discovered. To support these functionalities, the Sapphire Rapids server framework is introduced here. Each specific uncore unit will be added into the framework in the following patches. Add use_discovery to indicate that the discovery mechanism is required for the platform. Currently, Intel Sapphire Rapids is one of the platforms. The box ID from the discovery table is the accurate index. Use it if applicable. All the undiscovered platform-specific features will be hard code in the spr_uncores[]. Add uncore_type_customized_copy(), instead of the memcpy, to only overwrite these features. The specific uncore unit hasn't been added here. From user's perspective, there is nothing changed for now. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lore.kernel.org/r/1625087320-194204-2-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore.c | 26 +++++-- arch/x86/events/intel/uncore.h | 3 + arch/x86/events/intel/uncore_discovery.c | 2 +- arch/x86/events/intel/uncore_discovery.h | 3 + arch/x86/events/intel/uncore_snbep.c | 87 ++++++++++++++++++++++++ 5 files changed, 116 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 9337724ea6f4..840301864961 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -854,9 +854,13 @@ static void uncore_get_pmu_name(struct intel_uncore_pmu *pmu) sprintf(pmu->name, "uncore_%s", type->name); else sprintf(pmu->name, "uncore"); - } else - sprintf(pmu->name, "uncore_%s_%d", type->name, pmu->pmu_idx); - + } else { + /* + * Use the box ID from the discovery table if applicable. 
+ */ + sprintf(pmu->name, "uncore_%s_%d", type->name, + type->box_ids ? type->box_ids[pmu->pmu_idx] : pmu->pmu_idx); + } } static int uncore_pmu_register(struct intel_uncore_pmu *pmu) @@ -1648,6 +1652,7 @@ struct intel_uncore_init_fun { void (*cpu_init)(void); int (*pci_init)(void); void (*mmio_init)(void); + bool use_discovery; }; static const struct intel_uncore_init_fun nhm_uncore_init __initconst = { @@ -1730,6 +1735,13 @@ static const struct intel_uncore_init_fun snr_uncore_init __initconst = { .mmio_init = snr_uncore_mmio_init, }; +static const struct intel_uncore_init_fun spr_uncore_init __initconst = { + .cpu_init = spr_uncore_cpu_init, + .pci_init = spr_uncore_pci_init, + .mmio_init = spr_uncore_mmio_init, + .use_discovery = true, +}; + static const struct intel_uncore_init_fun generic_uncore_init __initconst = { .cpu_init = intel_uncore_generic_uncore_cpu_init, .pci_init = intel_uncore_generic_uncore_pci_init, @@ -1767,6 +1779,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE, icl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_D, icx_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_X, icx_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SAPPHIRERAPIDS_X, spr_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ATOM_TREMONT_D, snr_uncore_init), {}, }; @@ -1791,8 +1804,13 @@ static int __init intel_uncore_init(void) uncore_init = (struct intel_uncore_init_fun *)&generic_uncore_init; else return -ENODEV; - } else + } else { uncore_init = (struct intel_uncore_init_fun *)id->driver_data; + if (uncore_no_discover && uncore_init->use_discovery) + return -ENODEV; + if (uncore_init->use_discovery && !intel_uncore_has_discovery_tables()) + return -ENODEV; + } if (uncore_init->pci_init) { pret = uncore_init->pci_init(); diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 3dfc0599fa5f..a682acc31e9a 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -563,6 +563,9 @@ void snr_uncore_mmio_init(void); int icx_uncore_pci_init(void); void icx_uncore_cpu_init(void); void icx_uncore_mmio_init(void); +int spr_uncore_pci_init(void); +void spr_uncore_cpu_init(void); +void spr_uncore_mmio_init(void); /* uncore_nhmex.c */ void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index aba9bff95413..93148e215bbc 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -568,7 +568,7 @@ static bool uncore_update_uncore_type(enum uncore_access_type type_id, return true; } -static struct intel_uncore_type ** +struct intel_uncore_type ** intel_uncore_generic_init_uncores(enum uncore_access_type type_id) { struct intel_uncore_discovery_type *type; diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index 1d652939a01c..d7ccc8af6d18 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -129,3 +129,6 @@ void intel_uncore_clear_discovery_tables(void); void intel_uncore_generic_uncore_cpu_init(void); int intel_uncore_generic_uncore_pci_init(void); void intel_uncore_generic_uncore_mmio_init(void); + +struct intel_uncore_type ** +intel_uncore_generic_init_uncores(enum uncore_access_type type_id); diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 7bb0e6f9f32c..605e973a3ed0 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ 
b/arch/x86/events/intel/uncore_snbep.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* SandyBridge-EP/IvyTown uncore support */ #include "uncore.h" +#include "uncore_discovery.h" /* SNB-EP pci bus to socket mapping */ #define SNBEP_CPUNODEID 0x40 @@ -5059,3 +5060,89 @@ void icx_uncore_mmio_init(void) } /* end of ICX uncore support */ + +/* SPR uncore support */ + +#define UNCORE_SPR_NUM_UNCORE_TYPES 12 + +static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, +}; + +static void uncore_type_customized_copy(struct intel_uncore_type *to_type, + struct intel_uncore_type *from_type) +{ + if (!to_type || !from_type) + return; + + if (from_type->name) + to_type->name = from_type->name; + if (from_type->fixed_ctr_bits) + to_type->fixed_ctr_bits = from_type->fixed_ctr_bits; + if (from_type->event_mask) + to_type->event_mask = from_type->event_mask; + if (from_type->event_mask_ext) + to_type->event_mask_ext = from_type->event_mask_ext; + if (from_type->fixed_ctr) + to_type->fixed_ctr = from_type->fixed_ctr; + if (from_type->fixed_ctl) + to_type->fixed_ctl = from_type->fixed_ctl; + if (from_type->fixed_ctr_bits) + to_type->fixed_ctr_bits = from_type->fixed_ctr_bits; + if (from_type->num_shared_regs) + to_type->num_shared_regs = from_type->num_shared_regs; + if (from_type->constraints) + to_type->constraints = from_type->constraints; + if (from_type->ops) + to_type->ops = from_type->ops; + if (from_type->event_descs) + to_type->event_descs = from_type->event_descs; + if (from_type->format_group) + to_type->format_group = from_type->format_group; +} + +static struct intel_uncore_type ** +uncore_get_uncores(enum uncore_access_type type_id) +{ + struct intel_uncore_type **types, **start_types; + + start_types = types = intel_uncore_generic_init_uncores(type_id); + + /* Only copy the customized features */ + for (; *types; types++) { + if ((*types)->type_id >= UNCORE_SPR_NUM_UNCORE_TYPES) + continue; + uncore_type_customized_copy(*types, spr_uncores[(*types)->type_id]); + } + + return start_types; +} + +void spr_uncore_cpu_init(void) +{ + uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR); +} + +int spr_uncore_pci_init(void) +{ + uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI); + return 0; +} + +void spr_uncore_mmio_init(void) +{ + uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO); +} + +/* end of SPR uncore support */ -- Gitee From 7ca7898a22085b99015afd651f3b1ca0f38a0b00 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 30 Jun 2021 14:08:26 -0700 Subject: [PATCH 0905/2161] perf/x86/intel/uncore: Add Sapphire Rapids server CHA support commit c54c53d9921adef2c239cb43d5a936b63c57ebf0 upstream. CHA merges the caching agent and Home Agent (HA) responsibilities of the chip into a single block. It's one of the Sapphire Rapids server uncore units. The layout of the control registers for a CHA uncore unit is a little bit different from the generic one. The CHA uncore unit also supports a filter register for TID. So a specific format and ops are required. Expose the common MSR ops which can be reused. 
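A minimal sketch of the TID filtering mentioned above, condensed from spr_cha_hw_config() in the diff below; the constants are the ones this patch introduces, and box_id stands in for type->box_ids[pmu->pmu_idx]:

static void spr_cha_program_tid_filter_sketch(unsigned int box_id,
					      u64 config, u64 config1)
{
	/* tid_en is bit 16 of the event encoding on SPR CHA */
	if (config & SPR_CHA_PMON_CTL_TID_EN) {
		u32 msr = SPR_C0_MSR_PMON_BOX_FILTER0 +
			  HSWEP_CBO_MSR_OFFSET * box_id;

		/* only the low 10 bits of config1 carry the TID filter */
		wrmsrl(msr, config1 & SPR_CHA_PMON_BOX_FILTER_TID);
	}
}

From user space the same two knobs surface through the tid_en entry and the filter_tid5 attribute of the CHA format group added below, so the filter value travels in config1 of the perf event string.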
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lore.kernel.org/r/1625087320-194204-3-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore_discovery.c | 6 +- arch/x86/events/intel/uncore_discovery.h | 4 ++ arch/x86/events/intel/uncore_snbep.c | 90 +++++++++++++++++++++++- 3 files changed, 96 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 93148e215bbc..25f1c0157078 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -337,17 +337,17 @@ static const struct attribute_group generic_uncore_format_group = { .attrs = generic_uncore_formats_attr, }; -static void intel_generic_uncore_msr_init_box(struct intel_uncore_box *box) +void intel_generic_uncore_msr_init_box(struct intel_uncore_box *box) { wrmsrl(uncore_msr_box_ctl(box), GENERIC_PMON_BOX_CTL_INT); } -static void intel_generic_uncore_msr_disable_box(struct intel_uncore_box *box) +void intel_generic_uncore_msr_disable_box(struct intel_uncore_box *box) { wrmsrl(uncore_msr_box_ctl(box), GENERIC_PMON_BOX_CTL_FRZ); } -static void intel_generic_uncore_msr_enable_box(struct intel_uncore_box *box) +void intel_generic_uncore_msr_enable_box(struct intel_uncore_box *box) { wrmsrl(uncore_msr_box_ctl(box), 0); } diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index d7ccc8af6d18..e836a68a8025 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -130,5 +130,9 @@ void intel_uncore_generic_uncore_cpu_init(void); int intel_uncore_generic_uncore_pci_init(void); void intel_uncore_generic_uncore_mmio_init(void); +void intel_generic_uncore_msr_init_box(struct intel_uncore_box *box); +void intel_generic_uncore_msr_disable_box(struct intel_uncore_box *box); +void intel_generic_uncore_msr_enable_box(struct intel_uncore_box *box); + struct intel_uncore_type ** intel_uncore_generic_init_uncores(enum uncore_access_type type_id); diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 605e973a3ed0..face34468d29 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -419,6 +419,17 @@ #define ICX_NUMBER_IMC_CHN 2 #define ICX_IMC_MEM_STRIDE 0x4 +/* SPR */ +#define SPR_RAW_EVENT_MASK_EXT 0xffffff + +/* SPR CHA */ +#define SPR_CHA_PMON_CTL_TID_EN (1 << 16) +#define SPR_CHA_PMON_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + SPR_CHA_PMON_CTL_TID_EN) +#define SPR_CHA_PMON_BOX_FILTER_TID 0x3ff + +#define SPR_C0_MSR_PMON_BOX_FILTER0 0x200e + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6"); DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); @@ -431,6 +442,7 @@ DEFINE_UNCORE_FORMAT_ATTR(umask_ext4, umask, "config:8-15,32-55"); DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); +DEFINE_UNCORE_FORMAT_ATTR(tid_en2, tid_en, "config:16"); DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); DEFINE_UNCORE_FORMAT_ATTR(thresh9, thresh, "config:24-35"); DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); @@ -5063,10 +5075,86 @@ void icx_uncore_mmio_init(void) /* SPR uncore support */ +static void spr_uncore_msr_enable_event(struct intel_uncore_box *box, + struct perf_event 
*event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, reg1->config); + + wrmsrl(hwc->config_base, hwc->config); +} + +static void spr_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, 0); + + wrmsrl(hwc->config_base, 0); +} + +static int spr_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + bool tie_en = !!(event->hw.config & SPR_CHA_PMON_CTL_TID_EN); + struct intel_uncore_type *type = box->pmu->type; + + if (tie_en) { + reg1->reg = SPR_C0_MSR_PMON_BOX_FILTER0 + + HSWEP_CBO_MSR_OFFSET * type->box_ids[box->pmu->pmu_idx]; + reg1->config = event->attr.config1 & SPR_CHA_PMON_BOX_FILTER_TID; + reg1->idx = 0; + } + + return 0; +} + +static struct intel_uncore_ops spr_uncore_chabox_ops = { + .init_box = intel_generic_uncore_msr_init_box, + .disable_box = intel_generic_uncore_msr_disable_box, + .enable_box = intel_generic_uncore_msr_enable_box, + .disable_event = spr_uncore_msr_disable_event, + .enable_event = spr_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = spr_cha_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct attribute *spr_uncore_cha_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask_ext4.attr, + &format_attr_tid_en2.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid5.attr, + NULL, +}; +static const struct attribute_group spr_uncore_chabox_format_group = { + .name = "format", + .attrs = spr_uncore_cha_formats_attr, +}; + +static struct intel_uncore_type spr_uncore_chabox = { + .name = "cha", + .event_mask = SPR_CHA_PMON_EVENT_MASK, + .event_mask_ext = SPR_RAW_EVENT_MASK_EXT, + .num_shared_regs = 1, + .ops = &spr_uncore_chabox_ops, + .format_group = &spr_uncore_chabox_format_group, +}; + #define UNCORE_SPR_NUM_UNCORE_TYPES 12 static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { - NULL, + &spr_uncore_chabox, NULL, NULL, NULL, -- Gitee From c7ecfdee234faa4ca5b11e9f2f3397646bb71953 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 30 Jun 2021 14:08:27 -0700 Subject: [PATCH 0906/2161] perf/x86/intel/uncore: Add Sapphire Rapids server IIO support commit c54c53d9921adef2c239cb43d5a936b63c57ebf0 upstream. The IIO stacks are responsible for managing the traffic between the PCI Express* (PCIe*) domain and the mesh domain. The IIO PMON block is situated near the IIO stacks traffic controller capturing the traffic controller as well as the PCIe* root port information. The layout of the control registers for a IIO uncore unit is a little bit different from the generic one. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lore.kernel.org/r/1625087320-194204-4-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore_snbep.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index face34468d29..171da318ebf7 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5151,11 +5151,18 @@ static struct intel_uncore_type spr_uncore_chabox = { .format_group = &spr_uncore_chabox_format_group, }; +static struct intel_uncore_type spr_uncore_iio = { + .name = "iio", + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT, + .format_group = &snr_uncore_iio_format_group, +}; + #define UNCORE_SPR_NUM_UNCORE_TYPES 12 static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { &spr_uncore_chabox, - NULL, + &spr_uncore_iio, NULL, NULL, NULL, -- Gitee From 55880a14d384680046eb6e295b84b248ee9e9c93 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 30 Jun 2021 14:08:28 -0700 Subject: [PATCH 0907/2161] perf/x86/intel/uncore: Add Sapphire Rapids server IRP support commit c54c53d9921adef2c239cb43d5a936b63c57ebf0 upstream. The IRP is responsible for maintaining coherency for the IIO traffic targeting coherent memory. The layout of the control registers for a IRP uncore unit is a little bit different from the generic one. Factor out SPR_UNCORE_COMMON_FORMAT, which can be reused by the following uncore units. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lore.kernel.org/r/1625087320-194204-5-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore_snbep.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 171da318ebf7..168b09a4b9b3 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5158,12 +5158,37 @@ static struct intel_uncore_type spr_uncore_iio = { .format_group = &snr_uncore_iio_format_group, }; +static struct attribute *spr_uncore_raw_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask_ext4.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static const struct attribute_group spr_uncore_raw_format_group = { + .name = "format", + .attrs = spr_uncore_raw_formats_attr, +}; + +#define SPR_UNCORE_COMMON_FORMAT() \ + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \ + .event_mask_ext = SPR_RAW_EVENT_MASK_EXT, \ + .format_group = &spr_uncore_raw_format_group + +static struct intel_uncore_type spr_uncore_irp = { + SPR_UNCORE_COMMON_FORMAT(), + .name = "irp", + +}; + #define UNCORE_SPR_NUM_UNCORE_TYPES 12 static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { &spr_uncore_chabox, &spr_uncore_iio, - NULL, + &spr_uncore_irp, NULL, NULL, NULL, -- Gitee From 4a2a1ee4f8dcc88168f9dca44eb63fba843c65e2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 30 Jun 2021 14:08:29 -0700 Subject: [PATCH 0908/2161] perf/x86/intel/uncore: Add Sapphire Rapids server M2PCIe support commit c54c53d9921adef2c239cb43d5a936b63c57ebf0 upstream. M2PCIe* blocks manage the interface between the mesh and each IIO stack. 
The layout of the control registers for a M2PCIe uncore unit is similar to a IRP uncore unit. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lore.kernel.org/r/1625087320-194204-6-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore_snbep.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 168b09a4b9b3..ff0deb780115 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5183,13 +5183,18 @@ static struct intel_uncore_type spr_uncore_irp = { }; +static struct intel_uncore_type spr_uncore_m2pcie = { + SPR_UNCORE_COMMON_FORMAT(), + .name = "m2pcie", +}; + #define UNCORE_SPR_NUM_UNCORE_TYPES 12 static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { &spr_uncore_chabox, &spr_uncore_iio, &spr_uncore_irp, - NULL, + &spr_uncore_m2pcie, NULL, NULL, NULL, -- Gitee From 4ef3ceca58d1c0253b192772aba2ed909581500a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 30 Jun 2021 14:08:30 -0700 Subject: [PATCH 0909/2161] perf/x86/intel/uncore: Add Sapphire Rapids server PCU support commit c54c53d9921adef2c239cb43d5a936b63c57ebf0 upstream. The PCU is the primary power controller for the Sapphire Rapids. Except the name, all the information can be retrieved from the discovery tables. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lore.kernel.org/r/1625087320-194204-7-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore_snbep.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index ff0deb780115..bf69882cb6fc 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5188,6 +5188,10 @@ static struct intel_uncore_type spr_uncore_m2pcie = { .name = "m2pcie", }; +static struct intel_uncore_type spr_uncore_pcu = { + .name = "pcu", +}; + #define UNCORE_SPR_NUM_UNCORE_TYPES 12 static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { @@ -5195,7 +5199,7 @@ static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { &spr_uncore_iio, &spr_uncore_irp, &spr_uncore_m2pcie, - NULL, + &spr_uncore_pcu, NULL, NULL, NULL, -- Gitee From 932b800e1183fb4391a47a75da5834adb1f13796 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 30 Jun 2021 14:08:31 -0700 Subject: [PATCH 0910/2161] perf/x86/intel/uncore: Add Sapphire Rapids server IMC support commit c54c53d9921adef2c239cb43d5a936b63c57ebf0 upstream. The Sapphire Rapids IMC provides the interface to the DRAM and communicates to the rest of the uncore through the M2M block. The layout of the control registers for a IMC uncore unit is a little bit different from the generic one. There is a fixed counter for IMC. So a specific format and ops are required. Expose the common MMIO ops which can be reused. 
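The special handling needed for the fixed counter mentioned above is small. This sketch condenses spr_uncore_mmio_enable_event() from the diff below (checks abbreviated, _sketch suffix marking it as illustrative): the fixed counter's control register is written with just the enable bit, while general counters get the full event encoding.

static void spr_imc_enable_event_sketch(struct intel_uncore_box *box,
					struct hw_perf_event *hwc)
{
	if (!box->io_addr)
		return;

	if (uncore_pmc_fixed(hwc->idx))
		/* fixed counter: config_base points at FIXED_CTL, enable only */
		writel(SNBEP_PMON_CTL_EN, box->io_addr + hwc->config_base);
	else
		/* general counter: program the complete event encoding */
		writel(hwc->config, box->io_addr + hwc->config_base);
}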
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lore.kernel.org/r/1625087320-194204-8-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore_discovery.c | 10 +++---- arch/x86/events/intel/uncore_discovery.h | 6 ++++ arch/x86/events/intel/uncore_snbep.c | 35 +++++++++++++++++++++++- 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 25f1c0157078..cc44311fe2d8 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -454,7 +454,7 @@ static unsigned int generic_uncore_mmio_box_ctl(struct intel_uncore_box *box) return type->box_ctls[box->dieid] + type->mmio_offsets[box->pmu->pmu_idx]; } -static void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box) +void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box) { unsigned int box_ctl = generic_uncore_mmio_box_ctl(box); struct intel_uncore_type *type = box->pmu->type; @@ -478,7 +478,7 @@ static void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box) writel(GENERIC_PMON_BOX_CTL_INT, box->io_addr); } -static void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box) +void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box) { if (!box->io_addr) return; @@ -486,7 +486,7 @@ static void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box) writel(GENERIC_PMON_BOX_CTL_FRZ, box->io_addr); } -static void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box) +void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box) { if (!box->io_addr) return; @@ -505,8 +505,8 @@ static void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box, writel(hwc->config, box->io_addr + hwc->config_base); } -static void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box, - struct perf_event *event) +void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box, + struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index e836a68a8025..97232437dfb9 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -134,5 +134,11 @@ void intel_generic_uncore_msr_init_box(struct intel_uncore_box *box); void intel_generic_uncore_msr_disable_box(struct intel_uncore_box *box); void intel_generic_uncore_msr_enable_box(struct intel_uncore_box *box); +void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box); +void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box); +void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box); +void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box, + struct perf_event *event); + struct intel_uncore_type ** intel_uncore_generic_init_uncores(enum uncore_access_type type_id); diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index bf69882cb6fc..b0a4fb564cfe 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5192,6 +5192,39 @@ static struct intel_uncore_type spr_uncore_pcu = { .name = "pcu", }; +static void spr_uncore_mmio_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if 
(!box->io_addr) + return; + + if (uncore_pmc_fixed(hwc->idx)) + writel(SNBEP_PMON_CTL_EN, box->io_addr + hwc->config_base); + else + writel(hwc->config, box->io_addr + hwc->config_base); +} + +static struct intel_uncore_ops spr_uncore_mmio_ops = { + .init_box = intel_generic_uncore_mmio_init_box, + .exit_box = uncore_mmio_exit_box, + .disable_box = intel_generic_uncore_mmio_disable_box, + .enable_box = intel_generic_uncore_mmio_enable_box, + .disable_event = intel_generic_uncore_mmio_disable_event, + .enable_event = spr_uncore_mmio_enable_event, + .read_counter = uncore_mmio_read_counter, +}; + +static struct intel_uncore_type spr_uncore_imc = { + SPR_UNCORE_COMMON_FORMAT(), + .name = "imc", + .fixed_ctr_bits = 48, + .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, + .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL, + .ops = &spr_uncore_mmio_ops, +}; + #define UNCORE_SPR_NUM_UNCORE_TYPES 12 static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { @@ -5201,7 +5234,7 @@ static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { &spr_uncore_m2pcie, &spr_uncore_pcu, NULL, - NULL, + &spr_uncore_imc, NULL, NULL, NULL, -- Gitee From 9803f5cd3278591235507210e4a4b04ce1095f04 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 30 Jun 2021 14:08:32 -0700 Subject: [PATCH 0911/2161] perf/x86/intel/uncore: Add Sapphire Rapids server M2M support commit c54c53d9921adef2c239cb43d5a936b63c57ebf0 upstream. The M2M blocks manage the interface between the mesh (operating on both the mesh and the SMI3 protocol) and the memory controllers. The layout of the control registers for a M2M uncore unit is a little bit different from the generic one. So a specific format and ops are required. Expose the common PCI ops which can be reused. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lore.kernel.org/r/1625087320-194204-9-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore_discovery.c | 14 +++++------ arch/x86/events/intel/uncore_discovery.h | 8 +++++++ arch/x86/events/intel/uncore_snbep.c | 30 +++++++++++++++++++++++- 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index cc44311fe2d8..6322df1dd8e0 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -377,7 +377,7 @@ static struct intel_uncore_ops generic_uncore_msr_ops = { .read_counter = uncore_msr_read_counter, }; -static void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box) +void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box) { struct pci_dev *pdev = box->pci_dev; int box_ctl = uncore_pci_box_ctl(box); @@ -386,7 +386,7 @@ static void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box) pci_write_config_dword(pdev, box_ctl, GENERIC_PMON_BOX_CTL_INT); } -static void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box) +void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box) { struct pci_dev *pdev = box->pci_dev; int box_ctl = uncore_pci_box_ctl(box); @@ -394,7 +394,7 @@ static void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box) pci_write_config_dword(pdev, box_ctl, GENERIC_PMON_BOX_CTL_FRZ); } -static void intel_generic_uncore_pci_enable_box(struct intel_uncore_box *box) +void intel_generic_uncore_pci_enable_box(struct intel_uncore_box *box) { struct pci_dev *pdev = box->pci_dev; 
int box_ctl = uncore_pci_box_ctl(box); @@ -411,8 +411,8 @@ static void intel_generic_uncore_pci_enable_event(struct intel_uncore_box *box, pci_write_config_dword(pdev, hwc->config_base, hwc->config); } -static void intel_generic_uncore_pci_disable_event(struct intel_uncore_box *box, - struct perf_event *event) +void intel_generic_uncore_pci_disable_event(struct intel_uncore_box *box, + struct perf_event *event) { struct pci_dev *pdev = box->pci_dev; struct hw_perf_event *hwc = &event->hw; @@ -420,8 +420,8 @@ static void intel_generic_uncore_pci_disable_event(struct intel_uncore_box *box, pci_write_config_dword(pdev, hwc->config_base, 0); } -static u64 intel_generic_uncore_pci_read_counter(struct intel_uncore_box *box, - struct perf_event *event) +u64 intel_generic_uncore_pci_read_counter(struct intel_uncore_box *box, + struct perf_event *event) { struct pci_dev *pdev = box->pci_dev; struct hw_perf_event *hwc = &event->hw; diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index 97232437dfb9..b85655ba835b 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -140,5 +140,13 @@ void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box); void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box, struct perf_event *event); +void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box); +void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box); +void intel_generic_uncore_pci_enable_box(struct intel_uncore_box *box); +void intel_generic_uncore_pci_disable_event(struct intel_uncore_box *box, + struct perf_event *event); +u64 intel_generic_uncore_pci_read_counter(struct intel_uncore_box *box, + struct perf_event *event); + struct intel_uncore_type ** intel_uncore_generic_init_uncores(enum uncore_access_type type_id); diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index b0a4fb564cfe..afc9fad3b5e0 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5225,6 +5225,34 @@ static struct intel_uncore_type spr_uncore_imc = { .ops = &spr_uncore_mmio_ops, }; +static void spr_uncore_pci_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base + 4, (u32)(hwc->config >> 32)); + pci_write_config_dword(pdev, hwc->config_base, (u32)hwc->config); +} + +static struct intel_uncore_ops spr_uncore_pci_ops = { + .init_box = intel_generic_uncore_pci_init_box, + .disable_box = intel_generic_uncore_pci_disable_box, + .enable_box = intel_generic_uncore_pci_enable_box, + .disable_event = intel_generic_uncore_pci_disable_event, + .enable_event = spr_uncore_pci_enable_event, + .read_counter = intel_generic_uncore_pci_read_counter, +}; + +#define SPR_UNCORE_PCI_COMMON_FORMAT() \ + SPR_UNCORE_COMMON_FORMAT(), \ + .ops = &spr_uncore_pci_ops + +static struct intel_uncore_type spr_uncore_m2m = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "m2m", +}; + #define UNCORE_SPR_NUM_UNCORE_TYPES 12 static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { @@ -5235,7 +5263,7 @@ static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { &spr_uncore_pcu, NULL, &spr_uncore_imc, - NULL, + &spr_uncore_m2m, NULL, NULL, NULL, -- Gitee From 47a92c8a0cc7c712d24f82e1afd8dfd8ce6c6503 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 30 Jun 
2021 14:08:33 -0700 Subject: [PATCH 0912/2161] perf/x86/intel/uncore: Add Sapphire Rapids server UPI support commit c54c53d9921adef2c239cb43d5a936b63c57ebf0 upstream. Sapphire Rapids uses a coherent interconnect for scaling to multiple sockets known as Intel UPI. Intel UPI technology provides a cache coherent socket to socket external communication interface between processors. The layout of the control registers for a UPI uncore unit is similar to a M2M uncore unit. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lore.kernel.org/r/1625087320-194204-10-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore_snbep.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index afc9fad3b5e0..448639741959 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5253,6 +5253,11 @@ static struct intel_uncore_type spr_uncore_m2m = { .name = "m2m", }; +static struct intel_uncore_type spr_uncore_upi = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "upi", +}; + #define UNCORE_SPR_NUM_UNCORE_TYPES 12 static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { @@ -5264,7 +5269,7 @@ static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { NULL, &spr_uncore_imc, &spr_uncore_m2m, - NULL, + &spr_uncore_upi, NULL, NULL, NULL, -- Gitee From 6adcc041e1a1c2d9c9bb7732a8f24ba9e4d7905c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 30 Jun 2021 14:08:34 -0700 Subject: [PATCH 0913/2161] perf/x86/intel/uncore: Add Sapphire Rapids server M3UPI support commit c54c53d9921adef2c239cb43d5a936b63c57ebf0 upstream. M3 Intel UPI is the interface between the mesh and the Intel UPI link layer. It is responsible for translating between the mesh protocol packets and the flits that are used for transmitting data across the Intel UPI interface. The layout of the control registers for a M3UPI uncore unit is similar to a UPI uncore unit. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lore.kernel.org/r/1625087320-194204-11-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore_snbep.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 448639741959..9e44a698e8da 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5258,6 +5258,11 @@ static struct intel_uncore_type spr_uncore_upi = { .name = "upi", }; +static struct intel_uncore_type spr_uncore_m3upi = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "m3upi", +}; + #define UNCORE_SPR_NUM_UNCORE_TYPES 12 static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { @@ -5270,7 +5275,7 @@ static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { &spr_uncore_imc, &spr_uncore_m2m, &spr_uncore_upi, - NULL, + &spr_uncore_m3upi, NULL, NULL, }; -- Gitee From 0f531649a86c310b88212a9f06b146ca882bf1a2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 30 Jun 2021 14:08:35 -0700 Subject: [PATCH 0914/2161] perf/x86/intel/uncore: Add Sapphire Rapids server MDF support commit c54c53d9921adef2c239cb43d5a936b63c57ebf0 upstream. 
The MDF subsystem is a new IP built to support the new Intel Xeon architecture that bridges multiple dies with a embedded bridge system. The layout of the control registers for a MDF uncore unit is similar to a IRP uncore unit. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lore.kernel.org/r/1625087320-194204-12-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore_snbep.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 9e44a698e8da..2696c34cc73c 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5263,6 +5263,11 @@ static struct intel_uncore_type spr_uncore_m3upi = { .name = "m3upi", }; +static struct intel_uncore_type spr_uncore_mdf = { + SPR_UNCORE_COMMON_FORMAT(), + .name = "mdf", +}; + #define UNCORE_SPR_NUM_UNCORE_TYPES 12 static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { @@ -5277,7 +5282,7 @@ static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { &spr_uncore_upi, &spr_uncore_m3upi, NULL, - NULL, + &spr_uncore_mdf, }; static void uncore_type_customized_copy(struct intel_uncore_type *to_type, -- Gitee From 56b9a041b33c9ef449d53217a15302be27c834d1 Mon Sep 17 00:00:00 2001 From: Roman Sudarikov Date: Mon, 1 Jun 2020 11:35:41 +0300 Subject: [PATCH 0915/2161] perf/x86/intel/uncore: Expose an Uncore unit to PMON mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 19a39819818dee57e363bd44bd096e2e940a456b upstream. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each Uncore unit type, by its nature, can be mapped to its own context - which platform component each PMON block of that type is supposed to monitor. Intel® Xeon® Scalable processor family (code name Skylake-SP) makes significant changes in the integrated I/O (IIO) architecture. The new solution introduces IIO stacks which are responsible for managing traffic between the PCIe domain and the Mesh domain. Each IIO stack has its own PMON block and can handle either DMI port, x16 PCIe root port, MCP-Link or various built-in accelerators. IIO PMON blocks allow concurrent monitoring of I/O flows up to 4 x4 bifurcation within each IIO stack. Software is supposed to program required perf counters within each IIO stack and gather performance data. The tricky thing here is that IIO PMON reports data per IIO stack but users have no idea what IIO stacks are - they only know devices which are connected to the platform. Understanding IIO stack concept to find which IIO stack that particular IO device is connected to, or to identify an IIO PMON block to program for monitoring specific IIO stack assumes a lot of implicit knowledge about given Intel server platform architecture. 
Usage example: ls /sys/devices/uncore__/die* Signed-off-by: Alexander Antonov Signed-off-by: Roman Sudarikov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Reviewed-by: Alexander Shishkin Link: https://lkml.kernel.org/r/20200601083543.30011-2-alexander.antonov@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore.c | 8 ++++++++ arch/x86/events/intel/uncore.h | 12 ++++++++++++ 2 files changed, 20 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 840301864961..cd95d724fb5d 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -881,10 +881,12 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) .read = uncore_pmu_event_read, .module = THIS_MODULE, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .attr_update = pmu->type->attr_update, }; } else { pmu->pmu = *pmu->type->pmu; pmu->pmu.attr_groups = pmu->type->attr_groups; + pmu->pmu.attr_update = pmu->type->attr_update; } uncore_get_pmu_name(pmu); @@ -917,6 +919,9 @@ static void uncore_type_exit(struct intel_uncore_type *type) struct intel_uncore_pmu *pmu = type->pmus; int i; + if (type->cleanup_mapping) + type->cleanup_mapping(type); + if (pmu) { for (i = 0; i < type->num_boxes; i++, pmu++) { uncore_pmu_unregister(pmu); @@ -988,6 +993,9 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) type->pmu_group = &uncore_pmu_attr_group; + if (type->set_mapping) + type->set_mapping(type); + return 0; err: diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index a682acc31e9a..d0f9988a466e 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -80,7 +80,19 @@ struct intel_uncore_type { struct uncore_event_desc *event_descs; struct freerunning_counters *freerunning; const struct attribute_group *attr_groups[4]; + const struct attribute_group **attr_update; struct pmu *pmu; /* for custom pmu ops */ + /* + * Uncore PMU would store relevant platform topology configuration here + * to identify which platform component each PMON block of that type is + * supposed to monitor. + */ + u64 *topology; + /* + * Optional callbacks for managing mapping of Uncore units to PMONs + */ + int (*set_mapping)(struct intel_uncore_type *type); + void (*cleanup_mapping)(struct intel_uncore_type *type); }; #define pmu_group attr_groups[0] -- Gitee From b01a905cd1afaea826719491abf1ea7378e21ccb Mon Sep 17 00:00:00 2001 From: Roman Sudarikov Date: Mon, 1 Jun 2020 11:35:43 +0300 Subject: [PATCH 0916/2161] perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bb42b3d39781d7fcd3be7f9f9bf11b6661b5fdf1 upstream. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current version supports a server line starting Intel® Xeon® Processor Scalable Family and introduces mapping for IIO Uncore units only. Other units can be added on demand. IIO stack to PMON mapping is exposed through: /sys/devices/uncore_iio_/dieX where dieX is file which holds "Segment:Root Bus" for PCIe root port, which can be monitored by that IIO PMON block. 
Details are explained in Documentation/ABI/testing/sysfs-devices-mapping Reported-by: kbuild test robot Signed-off-by: Alexander Antonov Signed-off-by: Roman Sudarikov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Reviewed-by: Alexander Shishkin Link: https://lkml.kernel.org/r/20200601083543.30011-4-alexander.antonov@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../ABI/testing/sysfs-devices-mapping | 33 +++ arch/x86/events/intel/uncore.h | 9 + arch/x86/events/intel/uncore_snbep.c | 191 ++++++++++++++++++ 3 files changed, 233 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-devices-mapping diff --git a/Documentation/ABI/testing/sysfs-devices-mapping b/Documentation/ABI/testing/sysfs-devices-mapping new file mode 100644 index 000000000000..490ccfd67f12 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-mapping @@ -0,0 +1,33 @@ +What: /sys/devices/uncore_iio_x/dieX +Date: February 2020 +Contact: Roman Sudarikov +Description: + Each IIO stack (PCIe root port) has its own IIO PMON block, so + each dieX file (where X is die number) holds "Segment:Root Bus" + for PCIe root port, which can be monitored by that IIO PMON + block. + For example, on 4-die Xeon platform with up to 6 IIO stacks per + die and, therefore, 6 IIO PMON blocks per die, the mapping of + IIO PMON block 0 exposes as the following: + + $ ls /sys/devices/uncore_iio_0/die* + -r--r--r-- /sys/devices/uncore_iio_0/die0 + -r--r--r-- /sys/devices/uncore_iio_0/die1 + -r--r--r-- /sys/devices/uncore_iio_0/die2 + -r--r--r-- /sys/devices/uncore_iio_0/die3 + + $ tail /sys/devices/uncore_iio_0/die* + ==> /sys/devices/uncore_iio_0/die0 <== + 0000:00 + ==> /sys/devices/uncore_iio_0/die1 <== + 0000:40 + ==> /sys/devices/uncore_iio_0/die2 <== + 0000:80 + ==> /sys/devices/uncore_iio_0/die3 <== + 0000:c0 + + Which means: + IIO PMU 0 on die 0 belongs to PCI RP on bus 0x00, domain 0x0000 + IIO PMU 0 on die 1 belongs to PCI RP on bus 0x40, domain 0x0000 + IIO PMU 0 on die 2 belongs to PCI RP on bus 0x80, domain 0x0000 + IIO PMU 0 on die 3 belongs to PCI RP on bus 0xc0, domain 0x0000 diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index d0f9988a466e..92f6a6d163c9 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -188,6 +188,15 @@ int uncore_pcibus_to_dieid(struct pci_bus *bus); ssize_t uncore_event_show(struct device *dev, struct device_attribute *attr, char *buf); +static inline struct intel_uncore_pmu *dev_to_uncore_pmu(struct device *dev) +{ + return container_of(dev_get_drvdata(dev), struct intel_uncore_pmu, pmu); +} + +#define to_device_attribute(n) container_of(n, struct device_attribute, attr) +#define to_dev_ext_attribute(n) container_of(n, struct dev_ext_attribute, attr) +#define attr_to_ext_attr(n) to_dev_ext_attribute(to_device_attribute(n)) + extern int __uncore_max_dies; #define uncore_max_dies() (__uncore_max_dies) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 2696c34cc73c..6c59f80cb6c7 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -274,6 +274,30 @@ #define SKX_CPUNODEID 0xc0 #define SKX_GIDNIDMAP 0xd4 +/* + * The CPU_BUS_NUMBER MSR returns the values of the respective CPUBUSNO CSR + * that BIOS programmed. MSR has package scope. + * | Bit | Default | Description + * | [63] | 00h | VALID - When set, indicates the CPU bus + * numbers have been initialized. 
(RO) + * |[62:48]| --- | Reserved + * |[47:40]| 00h | BUS_NUM_5 — Return the bus number BIOS assigned + * CPUBUSNO(5). (RO) + * |[39:32]| 00h | BUS_NUM_4 — Return the bus number BIOS assigned + * CPUBUSNO(4). (RO) + * |[31:24]| 00h | BUS_NUM_3 — Return the bus number BIOS assigned + * CPUBUSNO(3). (RO) + * |[23:16]| 00h | BUS_NUM_2 — Return the bus number BIOS assigned + * CPUBUSNO(2). (RO) + * |[15:8] | 00h | BUS_NUM_1 — Return the bus number BIOS assigned + * CPUBUSNO(1). (RO) + * | [7:0] | 00h | BUS_NUM_0 — Return the bus number BIOS assigned + * CPUBUSNO(0). (RO) + */ +#define SKX_MSR_CPU_BUS_NUMBER 0x300 +#define SKX_MSR_CPU_BUS_VALID_BIT (1ULL << 63) +#define BUS_NUM_STRIDE 8 + /* SKX CHA */ #define SKX_CHA_MSR_PMON_BOX_FILTER_TID (0x1ffULL << 0) #define SKX_CHA_MSR_PMON_BOX_FILTER_LINK (0xfULL << 9) @@ -3620,6 +3644,170 @@ static struct intel_uncore_ops skx_uncore_iio_ops = { .read_counter = uncore_msr_read_counter, }; +static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die) +{ + return pmu->type->topology[die] >> (pmu->pmu_idx * BUS_NUM_STRIDE); +} + +static umode_t +skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) +{ + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj)); + + /* Root bus 0x00 is valid only for die 0 AND pmu_idx = 0. */ + return (!skx_iio_stack(pmu, die) && pmu->pmu_idx) ? 0 : attr->mode; +} + +static ssize_t skx_iio_mapping_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pci_bus *bus = pci_find_next_bus(NULL); + struct intel_uncore_pmu *uncore_pmu = dev_to_uncore_pmu(dev); + struct dev_ext_attribute *ea = to_dev_ext_attribute(attr); + long die = (long)ea->var; + + /* + * Current implementation is for single segment configuration hence it's + * safe to take the segment value from the first available root bus. + */ + return sprintf(buf, "%04x:%02x\n", pci_domain_nr(bus), + skx_iio_stack(uncore_pmu, die)); +} + +static int skx_msr_cpu_bus_read(int cpu, u64 *topology) +{ + u64 msr_value; + + if (rdmsrl_on_cpu(cpu, SKX_MSR_CPU_BUS_NUMBER, &msr_value) || + !(msr_value & SKX_MSR_CPU_BUS_VALID_BIT)) + return -ENXIO; + + *topology = msr_value; + + return 0; +} + +static int die_to_cpu(int die) +{ + int res = 0, cpu, current_die; + /* + * Using cpus_read_lock() to ensure cpu is not going down between + * looking at cpu_online_mask. + */ + cpus_read_lock(); + for_each_online_cpu(cpu) { + current_die = topology_logical_die_id(cpu); + if (current_die == die) { + res = cpu; + break; + } + } + cpus_read_unlock(); + return res; +} + +static int skx_iio_get_topology(struct intel_uncore_type *type) +{ + int i, ret; + struct pci_bus *bus = NULL; + + /* + * Verified single-segment environments only; disabled for multiple + * segment topologies for now except VMD domains. + * VMD domains start at 0x10000 to not clash with ACPI _SEG domains. 
+ */ + while ((bus = pci_find_next_bus(bus)) + && (!pci_domain_nr(bus) || pci_domain_nr(bus) > 0xffff)) + ; + if (bus) + return -EPERM; + + type->topology = kcalloc(uncore_max_dies(), sizeof(u64), GFP_KERNEL); + if (!type->topology) + return -ENOMEM; + + for (i = 0; i < uncore_max_dies(); i++) { + ret = skx_msr_cpu_bus_read(die_to_cpu(i), &type->topology[i]); + if (ret) { + kfree(type->topology); + type->topology = NULL; + return ret; + } + } + + return 0; +} + +static struct attribute_group skx_iio_mapping_group = { + .is_visible = skx_iio_mapping_visible, +}; + +static const struct attribute_group *skx_iio_attr_update[] = { + &skx_iio_mapping_group, + NULL, +}; + +static int skx_iio_set_mapping(struct intel_uncore_type *type) +{ + char buf[64]; + int ret; + long die = -1; + struct attribute **attrs = NULL; + struct dev_ext_attribute *eas = NULL; + + ret = skx_iio_get_topology(type); + if (ret) + return ret; + + /* One more for NULL. */ + attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL); + if (!attrs) + goto err; + + eas = kcalloc(uncore_max_dies(), sizeof(*eas), GFP_KERNEL); + if (!eas) + goto err; + + for (die = 0; die < uncore_max_dies(); die++) { + sprintf(buf, "die%ld", die); + sysfs_attr_init(&eas[die].attr.attr); + eas[die].attr.attr.name = kstrdup(buf, GFP_KERNEL); + if (!eas[die].attr.attr.name) + goto err; + eas[die].attr.attr.mode = 0444; + eas[die].attr.show = skx_iio_mapping_show; + eas[die].attr.store = NULL; + eas[die].var = (void *)die; + attrs[die] = &eas[die].attr.attr; + } + skx_iio_mapping_group.attrs = attrs; + + return 0; +err: + for (; die >= 0; die--) + kfree(eas[die].attr.attr.name); + kfree(eas); + kfree(attrs); + kfree(type->topology); + type->attr_update = NULL; + return -ENOMEM; +} + +static void skx_iio_cleanup_mapping(struct intel_uncore_type *type) +{ + struct attribute **attr = skx_iio_mapping_group.attrs; + + if (!attr) + return; + + for (; *attr; attr++) + kfree((*attr)->name); + kfree(attr_to_ext_attr(*skx_iio_mapping_group.attrs)); + kfree(skx_iio_mapping_group.attrs); + skx_iio_mapping_group.attrs = NULL; + kfree(type->topology); +} + static struct intel_uncore_type skx_uncore_iio = { .name = "iio", .num_counters = 4, @@ -3634,6 +3822,9 @@ static struct intel_uncore_type skx_uncore_iio = { .constraints = skx_uncore_iio_constraints, .ops = &skx_uncore_iio_ops, .format_group = &skx_uncore_iio_format_group, + .attr_update = skx_iio_attr_update, + .set_mapping = skx_iio_set_mapping, + .cleanup_mapping = skx_iio_cleanup_mapping, }; enum perf_uncore_iio_freerunning_type_id { -- Gitee From f8bae6b6105c17b1bf9b78aba04ffeb8d4e56e29 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 30 Jun 2021 14:08:36 -0700 Subject: [PATCH 0917/2161] perf/x86/intel/uncore: Add alias PMU name commit 8053f2d752e2936f494ede62766a6c9e9fb674f2 upstream. A perf PMU may have two PMU names. For example, Intel Sapphire Rapids server supports the discovery mechanism. Without the platform-specific support, an uncore PMU is named by a type ID plus a box ID, e.g., uncore_type_0_0, because the real name of the uncore PMU cannot be retrieved from the discovery table. With the platform-specific support later, perf has the mapping information from a type ID to a specific uncore unit. Just like the previous platforms, the uncore PMU is named by the real PMU name, e.g., uncore_cha_0. The user scripts which work well with the old numeric name may not work anymore. Add a new attribute "alias" to indicate the old numeric name. 
The following userspace perf tool patch will handle both names. The user scripts should work properly with the updated perf tool. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lore.kernel.org/r/1625087320-194204-13-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../sysfs-bus-event_source-devices-uncore | 13 +++++++++ arch/x86/events/intel/uncore.c | 19 +++++++++---- arch/x86/events/intel/uncore.h | 1 + arch/x86/events/intel/uncore_snbep.c | 28 ++++++++++++++++++- 4 files changed, 54 insertions(+), 7 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-bus-event_source-devices-uncore diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-uncore b/Documentation/ABI/testing/sysfs-bus-event_source-devices-uncore new file mode 100644 index 000000000000..b56e8f019fd4 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-uncore @@ -0,0 +1,13 @@ +What: /sys/bus/event_source/devices/uncore_*/alias +Date: June 2021 +KernelVersion: 5.15 +Contact: Linux kernel mailing list +Description: Read-only. An attribute to describe the alias name of + the uncore PMU if an alias exists on some platforms. + The 'perf(1)' tool should treat both names the same. + They both can be used to access the uncore PMU. + + Example: + + $ cat /sys/devices/uncore_cha_2/alias + uncore_type_0_2 diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index cd95d724fb5d..3d6b5d87b3cc 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -831,6 +831,18 @@ static const struct attribute_group uncore_pmu_attr_group = { .attrs = uncore_pmu_attrs, }; +void uncore_get_alias_name(char *pmu_name, struct intel_uncore_pmu *pmu) +{ + struct intel_uncore_type *type = pmu->type; + + if (type->num_boxes == 1) + sprintf(pmu_name, "uncore_type_%u", type->type_id); + else { + sprintf(pmu_name, "uncore_type_%u_%d", + type->type_id, type->box_ids[pmu->pmu_idx]); + } +} + static void uncore_get_pmu_name(struct intel_uncore_pmu *pmu) { struct intel_uncore_type *type = pmu->type; @@ -840,12 +852,7 @@ static void uncore_get_pmu_name(struct intel_uncore_pmu *pmu) * Use uncore_type_&typeid_&boxid as name. 
*/ if (!type->name) { - if (type->num_boxes == 1) - sprintf(pmu->name, "uncore_type_%u", type->type_id); - else { - sprintf(pmu->name, "uncore_type_%u_%d", - type->type_id, type->box_ids[pmu->pmu_idx]); - } + uncore_get_alias_name(pmu->name, pmu); return; } diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 92f6a6d163c9..c8f9e4118813 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -541,6 +541,7 @@ struct event_constraint * uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event); void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event); u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx); +void uncore_get_alias_name(char *pmu_name, struct intel_uncore_pmu *pmu); extern struct intel_uncore_type *empty_uncore[]; extern struct intel_uncore_type **uncore_msr_uncores; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 6c59f80cb6c7..65cc047db71e 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5333,6 +5333,26 @@ static const struct attribute_group spr_uncore_chabox_format_group = { .attrs = spr_uncore_cha_formats_attr, }; +static ssize_t alias_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev); + char pmu_name[UNCORE_PMU_NAME_LEN]; + + uncore_get_alias_name(pmu_name, pmu); + return sysfs_emit(buf, "%s\n", pmu_name); +} + +static DEVICE_ATTR_RO(alias); + +static struct attribute *uncore_alias_attrs[] = { + &dev_attr_alias.attr, + NULL +}; + +ATTRIBUTE_GROUPS(uncore_alias); + static struct intel_uncore_type spr_uncore_chabox = { .name = "cha", .event_mask = SPR_CHA_PMON_EVENT_MASK, @@ -5340,6 +5360,7 @@ static struct intel_uncore_type spr_uncore_chabox = { .num_shared_regs = 1, .ops = &spr_uncore_chabox_ops, .format_group = &spr_uncore_chabox_format_group, + .attr_update = uncore_alias_groups, }; static struct intel_uncore_type spr_uncore_iio = { @@ -5347,6 +5368,7 @@ static struct intel_uncore_type spr_uncore_iio = { .event_mask = SNBEP_PMON_RAW_EVENT_MASK, .event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT, .format_group = &snr_uncore_iio_format_group, + .attr_update = uncore_alias_groups, }; static struct attribute *spr_uncore_raw_formats_attr[] = { @@ -5366,7 +5388,8 @@ static const struct attribute_group spr_uncore_raw_format_group = { #define SPR_UNCORE_COMMON_FORMAT() \ .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \ .event_mask_ext = SPR_RAW_EVENT_MASK_EXT, \ - .format_group = &spr_uncore_raw_format_group + .format_group = &spr_uncore_raw_format_group, \ + .attr_update = uncore_alias_groups static struct intel_uncore_type spr_uncore_irp = { SPR_UNCORE_COMMON_FORMAT(), @@ -5381,6 +5404,7 @@ static struct intel_uncore_type spr_uncore_m2pcie = { static struct intel_uncore_type spr_uncore_pcu = { .name = "pcu", + .attr_update = uncore_alias_groups, }; static void spr_uncore_mmio_enable_event(struct intel_uncore_box *box, @@ -5506,6 +5530,8 @@ static void uncore_type_customized_copy(struct intel_uncore_type *to_type, to_type->event_descs = from_type->event_descs; if (from_type->format_group) to_type->format_group = from_type->format_group; + if (from_type->attr_update) + to_type->attr_update = from_type->attr_update; } static struct intel_uncore_type ** -- Gitee From edcb2b5e34f4a88fdd68045ddadbfafe6ccbd008 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 30 Jun 2021 14:08:37 -0700 Subject: [PATCH 
0918/2161] perf/x86/intel/uncore: Factor out snr_uncore_mmio_map() commit c8872d90e0a3651a096860d3241625ccfa1647e0 upstream. The IMC free-running counters on Sapphire Rapids server are also accessed by MMIO, which is similar to the previous platforms, SNR and ICX. The only difference is the device ID of the device which contains BAR address. Factor out snr_uncore_mmio_map() which ioremap the MMIO space. It can be reused in the following patch for SPR. Use the snr_uncore_mmio_map() in the icx_uncore_imc_freerunning_init_box(). There is no box_ctl for the free-running counters. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lore.kernel.org/r/1625087320-194204-14-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore_snbep.c | 36 +++++++++++++++++++--------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 65cc047db71e..901c634df417 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -4595,13 +4595,15 @@ int snr_uncore_pci_init(void) return 0; } -static struct pci_dev *snr_uncore_get_mc_dev(int id) +#define SNR_MC_DEVICE_ID 0x3451 + +static struct pci_dev *snr_uncore_get_mc_dev(unsigned int device, int id) { struct pci_dev *mc_dev = NULL; int pkg; while (1) { - mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3451, mc_dev); + mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, mc_dev); if (!mc_dev) break; pkg = uncore_pcibus_to_dieid(mc_dev->bus); @@ -4611,16 +4613,17 @@ static struct pci_dev *snr_uncore_get_mc_dev(int id) return mc_dev; } -static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, - unsigned int box_ctl, int mem_offset) +static int snr_uncore_mmio_map(struct intel_uncore_box *box, + unsigned int box_ctl, int mem_offset, + unsigned int device) { - struct pci_dev *pdev = snr_uncore_get_mc_dev(box->dieid); + struct pci_dev *pdev = snr_uncore_get_mc_dev(device, box->dieid); struct intel_uncore_type *type = box->pmu->type; resource_size_t addr; u32 pci_dword; if (!pdev) - return; + return -ENODEV; pci_read_config_dword(pdev, SNR_IMC_MMIO_BASE_OFFSET, &pci_dword); addr = (pci_dword & SNR_IMC_MMIO_BASE_MASK) << 23; @@ -4633,16 +4636,25 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, box->io_addr = ioremap(addr, type->mmio_map_size); if (!box->io_addr) { pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); - return; + return -EINVAL; } - writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr); + return 0; +} + +static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, + unsigned int box_ctl, int mem_offset, + unsigned int device) +{ + if (!snr_uncore_mmio_map(box, box_ctl, mem_offset, device)) + writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr); } static void snr_uncore_mmio_init_box(struct intel_uncore_box *box) { __snr_uncore_mmio_init_box(box, uncore_mmio_box_ctl(box), - SNR_IMC_MMIO_MEM0_OFFSET); + SNR_IMC_MMIO_MEM0_OFFSET, + SNR_MC_DEVICE_ID); } static void snr_uncore_mmio_disable_box(struct intel_uncore_box *box) @@ -5159,7 +5171,8 @@ static void icx_uncore_imc_init_box(struct intel_uncore_box *box) int mem_offset = (box->pmu->pmu_idx / ICX_NUMBER_IMC_CHN) * ICX_IMC_MEM_STRIDE + SNR_IMC_MMIO_MEM0_OFFSET; - __snr_uncore_mmio_init_box(box, box_ctl, mem_offset); + __snr_uncore_mmio_init_box(box, box_ctl, mem_offset, + SNR_MC_DEVICE_ID); } static struct intel_uncore_ops 
icx_uncore_mmio_ops = { @@ -5229,7 +5242,8 @@ static void icx_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) int mem_offset = box->pmu->pmu_idx * ICX_IMC_MEM_STRIDE + SNR_IMC_MMIO_MEM0_OFFSET; - __snr_uncore_mmio_init_box(box, uncore_mmio_box_ctl(box), mem_offset); + snr_uncore_mmio_map(box, uncore_mmio_box_ctl(box), + mem_offset, SNR_MC_DEVICE_ID); } static struct intel_uncore_ops icx_uncore_imc_freerunning_ops = { -- Gitee From 50ddd210219335bbc46aa87912e7edf1360ba4b2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 30 Jun 2021 14:08:38 -0700 Subject: [PATCH 0919/2161] perf/x86/intel/uncore: Support IIO free-running counters on Sapphire Rapids server commit 543ac280b3576c0009e8c0fcd4d6bfc9978d7bd0 upstream. Several free-running counters for IIO uncore blocks are supported on Sapphire Rapids server. They are not enumerated in the discovery tables. Extend generic_init_uncores() to support extra uncore types. The uncore types for the free-running counters is inserted right after the uncore types retrieved from the discovery table. The number of the free-running counter boxes is calculated from the max number of the corresponding standard boxes. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lore.kernel.org/r/1625087320-194204-15-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore_discovery.c | 10 +- arch/x86/events/intel/uncore_discovery.h | 2 +- arch/x86/events/intel/uncore_snbep.c | 135 ++++++++++++++++++++++- 3 files changed, 136 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 6322df1dd8e0..3049c646fa20 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -569,7 +569,7 @@ static bool uncore_update_uncore_type(enum uncore_access_type type_id, } struct intel_uncore_type ** -intel_uncore_generic_init_uncores(enum uncore_access_type type_id) +intel_uncore_generic_init_uncores(enum uncore_access_type type_id, int num_extra) { struct intel_uncore_discovery_type *type; struct intel_uncore_type **uncores; @@ -577,7 +577,7 @@ intel_uncore_generic_init_uncores(enum uncore_access_type type_id) struct rb_node *node; int i = 0; - uncores = kcalloc(num_discovered_types[type_id] + 1, + uncores = kcalloc(num_discovered_types[type_id] + num_extra + 1, sizeof(struct intel_uncore_type *), GFP_KERNEL); if (!uncores) return empty_uncore; @@ -606,17 +606,17 @@ intel_uncore_generic_init_uncores(enum uncore_access_type type_id) void intel_uncore_generic_uncore_cpu_init(void) { - uncore_msr_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MSR); + uncore_msr_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MSR, 0); } int intel_uncore_generic_uncore_pci_init(void) { - uncore_pci_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_PCI); + uncore_pci_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_PCI, 0); return 0; } void intel_uncore_generic_uncore_mmio_init(void) { - uncore_mmio_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MMIO); + uncore_mmio_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MMIO, 0); } diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index b85655ba835b..7280c8a3c831 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -149,4 +149,4 @@ u64 
intel_generic_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event); struct intel_uncore_type ** -intel_uncore_generic_init_uncores(enum uncore_access_type type_id); +intel_uncore_generic_init_uncores(enum uncore_access_type type_id, int num_extra); diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 901c634df417..1a17fcf48508 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5498,6 +5498,7 @@ static struct intel_uncore_type spr_uncore_mdf = { }; #define UNCORE_SPR_NUM_UNCORE_TYPES 12 +#define UNCORE_SPR_IIO 1 static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { &spr_uncore_chabox, @@ -5514,6 +5515,92 @@ static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { &spr_uncore_mdf, }; +enum perf_uncore_spr_iio_freerunning_type_id { + SPR_IIO_MSR_IOCLK, + SPR_IIO_MSR_BW_IN, + SPR_IIO_MSR_BW_OUT, + + SPR_IIO_FREERUNNING_TYPE_MAX, +}; + +static struct freerunning_counters spr_iio_freerunning[] = { + [SPR_IIO_MSR_IOCLK] = { 0x340e, 0x1, 0x10, 1, 48 }, + [SPR_IIO_MSR_BW_IN] = { 0x3800, 0x1, 0x10, 8, 48 }, + [SPR_IIO_MSR_BW_OUT] = { 0x3808, 0x1, 0x10, 8, 48 }, +}; + +static struct uncore_event_desc spr_uncore_iio_freerunning_events[] = { + /* Free-Running IIO CLOCKS Counter */ + INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"), + /* Free-Running IIO BANDWIDTH IN Counters */ + INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"), + INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4, "event=0xff,umask=0x24"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5, "event=0xff,umask=0x25"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6, "event=0xff,umask=0x26"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port7, "event=0xff,umask=0x27"), + INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port7.unit, "MiB"), + /* Free-Running IIO BANDWIDTH OUT Counters */ + INTEL_UNCORE_EVENT_DESC(bw_out_port0, "event=0xff,umask=0x30"), + INTEL_UNCORE_EVENT_DESC(bw_out_port0.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port0.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port1, "event=0xff,umask=0x31"), + INTEL_UNCORE_EVENT_DESC(bw_out_port1.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port1.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port2, "event=0xff,umask=0x32"), + INTEL_UNCORE_EVENT_DESC(bw_out_port2.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port2.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port3, 
"event=0xff,umask=0x33"), + INTEL_UNCORE_EVENT_DESC(bw_out_port3.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port3.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port4, "event=0xff,umask=0x34"), + INTEL_UNCORE_EVENT_DESC(bw_out_port4.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port4.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port5, "event=0xff,umask=0x35"), + INTEL_UNCORE_EVENT_DESC(bw_out_port5.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port5.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port6, "event=0xff,umask=0x36"), + INTEL_UNCORE_EVENT_DESC(bw_out_port6.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port6.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port7, "event=0xff,umask=0x37"), + INTEL_UNCORE_EVENT_DESC(bw_out_port7.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port7.unit, "MiB"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type spr_uncore_iio_free_running = { + .name = "iio_free_running", + .num_counters = 17, + .num_freerunning_types = SPR_IIO_FREERUNNING_TYPE_MAX, + .freerunning = spr_iio_freerunning, + .ops = &skx_uncore_iio_freerunning_ops, + .event_descs = spr_uncore_iio_freerunning_events, + .format_group = &skx_uncore_iio_freerunning_format_group, +}; + +#define UNCORE_SPR_MSR_EXTRA_UNCORES 1 + +static struct intel_uncore_type *spr_msr_uncores[UNCORE_SPR_MSR_EXTRA_UNCORES] = { + &spr_uncore_iio_free_running, +}; + static void uncore_type_customized_copy(struct intel_uncore_type *to_type, struct intel_uncore_type *from_type) { @@ -5549,11 +5636,13 @@ static void uncore_type_customized_copy(struct intel_uncore_type *to_type, } static struct intel_uncore_type ** -uncore_get_uncores(enum uncore_access_type type_id) +uncore_get_uncores(enum uncore_access_type type_id, int num_extra, + struct intel_uncore_type **extra) { struct intel_uncore_type **types, **start_types; + int i; - start_types = types = intel_uncore_generic_init_uncores(type_id); + start_types = types = intel_uncore_generic_init_uncores(type_id, num_extra); /* Only copy the customized features */ for (; *types; types++) { @@ -5562,23 +5651,59 @@ uncore_get_uncores(enum uncore_access_type type_id) uncore_type_customized_copy(*types, spr_uncores[(*types)->type_id]); } + for (i = 0; i < num_extra; i++, types++) + *types = extra[i]; + return start_types; } +static struct intel_uncore_type * +uncore_find_type_by_id(struct intel_uncore_type **types, int type_id) +{ + for (; *types; types++) { + if (type_id == (*types)->type_id) + return *types; + } + + return NULL; +} + +static int uncore_type_max_boxes(struct intel_uncore_type **types, + int type_id) +{ + struct intel_uncore_type *type; + int i, max = 0; + + type = uncore_find_type_by_id(types, type_id); + if (!type) + return 0; + + for (i = 0; i < type->num_boxes; i++) { + if (type->box_ids[i] > max) + max = type->box_ids[i]; + } + + return max + 1; +} + void spr_uncore_cpu_init(void) { - uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR); + uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR, + UNCORE_SPR_MSR_EXTRA_UNCORES, + spr_msr_uncores); + + spr_uncore_iio_free_running.num_boxes = uncore_type_max_boxes(uncore_msr_uncores, UNCORE_SPR_IIO); } int spr_uncore_pci_init(void) { - uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI); + uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI, 0, NULL); return 0; } void spr_uncore_mmio_init(void) { - uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO); + uncore_mmio_uncores = 
uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL); } /* end of SPR uncore support */ -- Gitee From 6b167100090753d8d0944d6eaf3f49f337b89d3c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 30 Jun 2021 14:08:39 -0700 Subject: [PATCH 0920/2161] perf/x86/intel/uncore: Support IMC free-running counters on Sapphire Rapids server commit c76826a65f50038f050424365dbf3f97203f8710 upstream. Several free-running counters for IMC uncore blocks are supported on Sapphire Rapids server. They are not enumerated in the discovery tables. The number of the free-running counter boxes is calculated from the number of corresponding standard boxes. The snbep_pci2phy_map_init() is invoked to setup the mapping from a PCI BUS to a Die ID, which is used to locate the corresponding MC device of a IMC uncore unit in the spr_uncore_imc_freerunning_init_box(). Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lore.kernel.org/r/1625087320-194204-16-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore_snbep.c | 66 +++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 1a17fcf48508..6e5360f2b6d7 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5499,6 +5499,7 @@ static struct intel_uncore_type spr_uncore_mdf = { #define UNCORE_SPR_NUM_UNCORE_TYPES 12 #define UNCORE_SPR_IIO 1 +#define UNCORE_SPR_IMC 6 static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { &spr_uncore_chabox, @@ -5595,12 +5596,65 @@ static struct intel_uncore_type spr_uncore_iio_free_running = { .format_group = &skx_uncore_iio_freerunning_format_group, }; +enum perf_uncore_spr_imc_freerunning_type_id { + SPR_IMC_DCLK, + SPR_IMC_PQ_CYCLES, + + SPR_IMC_FREERUNNING_TYPE_MAX, +}; + +static struct freerunning_counters spr_imc_freerunning[] = { + [SPR_IMC_DCLK] = { 0x22b0, 0x0, 0, 1, 48 }, + [SPR_IMC_PQ_CYCLES] = { 0x2318, 0x8, 0, 2, 48 }, +}; + +static struct uncore_event_desc spr_uncore_imc_freerunning_events[] = { + INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"), + + INTEL_UNCORE_EVENT_DESC(rpq_cycles, "event=0xff,umask=0x20"), + INTEL_UNCORE_EVENT_DESC(wpq_cycles, "event=0xff,umask=0x21"), + { /* end: all zeroes */ }, +}; + +#define SPR_MC_DEVICE_ID 0x3251 + +static void spr_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) +{ + int mem_offset = box->pmu->pmu_idx * ICX_IMC_MEM_STRIDE + SNR_IMC_MMIO_MEM0_OFFSET; + + snr_uncore_mmio_map(box, uncore_mmio_box_ctl(box), + mem_offset, SPR_MC_DEVICE_ID); +} + +static struct intel_uncore_ops spr_uncore_imc_freerunning_ops = { + .init_box = spr_uncore_imc_freerunning_init_box, + .exit_box = uncore_mmio_exit_box, + .read_counter = uncore_mmio_read_counter, + .hw_config = uncore_freerunning_hw_config, +}; + +static struct intel_uncore_type spr_uncore_imc_free_running = { + .name = "imc_free_running", + .num_counters = 3, + .mmio_map_size = SNR_IMC_MMIO_SIZE, + .num_freerunning_types = SPR_IMC_FREERUNNING_TYPE_MAX, + .freerunning = spr_imc_freerunning, + .ops = &spr_uncore_imc_freerunning_ops, + .event_descs = spr_uncore_imc_freerunning_events, + .format_group = &skx_uncore_iio_freerunning_format_group, +}; + #define UNCORE_SPR_MSR_EXTRA_UNCORES 1 +#define UNCORE_SPR_MMIO_EXTRA_UNCORES 1 static struct intel_uncore_type *spr_msr_uncores[UNCORE_SPR_MSR_EXTRA_UNCORES] = { 
&spr_uncore_iio_free_running, }; +static struct intel_uncore_type *spr_mmio_uncores[UNCORE_SPR_MMIO_EXTRA_UNCORES] = { + &spr_uncore_imc_free_running, +}; + static void uncore_type_customized_copy(struct intel_uncore_type *to_type, struct intel_uncore_type *from_type) { @@ -5703,7 +5757,17 @@ int spr_uncore_pci_init(void) void spr_uncore_mmio_init(void) { - uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL); + int ret = snbep_pci2phy_map_init(0x3250, SKX_CPUNODEID, SKX_GIDNIDMAP, true); + + if (ret) + uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL); + else { + uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, + UNCORE_SPR_MMIO_EXTRA_UNCORES, + spr_mmio_uncores); + + spr_uncore_imc_free_running.num_boxes = uncore_type_max_boxes(uncore_mmio_uncores, UNCORE_SPR_IMC) / 2; + } } /* end of SPR uncore support */ -- Gitee From 2a17e31b9b0ae305a3cc7f82853f2e30b702a468 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 5 Jan 2022 04:35:12 -0800 Subject: [PATCH 0921/2161] x86/fpu: Extend fpu_xstate_prctl() with guest permissions commit 980fe2fddcff21937c93532b4597c8ea450346c1 upstream. KVM requires a clear separation of host user space and guest permissions for dynamic XSTATE components. Add a guest permissions member to struct fpu and a separate set of prctl() arguments: ARCH_GET_XCOMP_GUEST_PERM and ARCH_REQ_XCOMP_GUEST_PERM. The semantics are equivalent to the host user space permission control except for the following constraints: 1) Permissions have to be requested before the first vCPU is created 2) Permissions are frozen when the first vCPU is created to ensure consistency. Any attempt to expand permissions via the prctl() after that point is rejected. Signed-off-by: Thomas Gleixner Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-2-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 2 ++ arch/x86/include/asm/fpu/types.h | 9 ++++++ arch/x86/include/uapi/asm/prctl.h | 26 ++++++++------- arch/x86/kernel/fpu/core.c | 3 ++ arch/x86/kernel/fpu/xstate.c | 53 +++++++++++++++++++++++-------- arch/x86/kernel/fpu/xstate.h | 13 ++++++-- arch/x86/kernel/process.c | 2 ++ 7 files changed, 80 insertions(+), 28 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 4e71fd37ffd0..bfd5d1cd04d3 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -128,6 +128,8 @@ static inline void fpstate_free(struct fpu *fpu) { } /* fpstate-related functions which are exported to KVM */ extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature); +extern inline u64 xstate_get_guest_group_perm(void); + /* KVM specific functions */ extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu); extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu); diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index edbe0b5704ac..89e30c763ef0 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -387,6 +387,8 @@ struct fpstate { /* @regs is dynamically sized! Don't add anything after @regs! 
*/ } __aligned(64); +#define FPU_GUEST_PERM_LOCKED BIT_ULL(63) + struct fpu_state_perm { /* * @__state_perm: @@ -476,6 +478,13 @@ struct fpu { */ struct fpu_state_perm perm; + /* + * @guest_perm: + * + * Permission related information for guest pseudo FPUs + */ + struct fpu_state_perm guest_perm; + /* * @__fpstate: * diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index 754a07856817..500b96e71f18 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -2,20 +2,22 @@ #ifndef _ASM_X86_PRCTL_H #define _ASM_X86_PRCTL_H -#define ARCH_SET_GS 0x1001 -#define ARCH_SET_FS 0x1002 -#define ARCH_GET_FS 0x1003 -#define ARCH_GET_GS 0x1004 +#define ARCH_SET_GS 0x1001 +#define ARCH_SET_FS 0x1002 +#define ARCH_GET_FS 0x1003 +#define ARCH_GET_GS 0x1004 -#define ARCH_GET_CPUID 0x1011 -#define ARCH_SET_CPUID 0x1012 +#define ARCH_GET_CPUID 0x1011 +#define ARCH_SET_CPUID 0x1012 -#define ARCH_GET_XCOMP_SUPP 0x1021 -#define ARCH_GET_XCOMP_PERM 0x1022 -#define ARCH_REQ_XCOMP_PERM 0x1023 +#define ARCH_GET_XCOMP_SUPP 0x1021 +#define ARCH_GET_XCOMP_PERM 0x1022 +#define ARCH_REQ_XCOMP_PERM 0x1023 +#define ARCH_GET_XCOMP_GUEST_PERM 0x1024 +#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025 -#define ARCH_MAP_VDSO_X32 0x2001 -#define ARCH_MAP_VDSO_32 0x2002 -#define ARCH_MAP_VDSO_64 0x2003 +#define ARCH_MAP_VDSO_X32 0x2001 +#define ARCH_MAP_VDSO_32 0x2002 +#define ARCH_MAP_VDSO_64 0x2003 #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 036facdadbc4..f6c1ddd56d1c 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -450,6 +450,8 @@ void fpstate_reset(struct fpu *fpu) fpu->perm.__state_perm = fpu_kernel_cfg.default_features; fpu->perm.__state_size = fpu_kernel_cfg.default_size; fpu->perm.__user_state_size = fpu_user_cfg.default_size; + /* Same defaults for guests */ + fpu->guest_perm = fpu->perm; } static inline void fpu_inherit_perms(struct fpu *dst_fpu) @@ -460,6 +462,7 @@ static inline void fpu_inherit_perms(struct fpu *dst_fpu) spin_lock_irq(¤t->sighand->siglock); /* Fork also inherits the permissions of the parent */ dst_fpu->perm = src_fpu->perm; + dst_fpu->guest_perm = src_fpu->guest_perm; spin_unlock_irq(¤t->sighand->siglock); } } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 30a6704933b2..337bea0c9fba 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1626,7 +1626,7 @@ static int validate_sigaltstack(unsigned int usize) return 0; } -static int __xstate_request_perm(u64 permitted, u64 requested) +static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) { /* * This deliberately does not exclude !XSAVES as we still might @@ -1636,9 +1636,10 @@ static int __xstate_request_perm(u64 permitted, u64 requested) */ bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); struct fpu *fpu = ¤t->group_leader->thread.fpu; + struct fpu_state_perm *perm; unsigned int ksize, usize; u64 mask; - int ret; + int ret = 0; /* Check whether fully enabled */ if ((permitted & requested) == requested) @@ -1652,15 +1653,18 @@ static int __xstate_request_perm(u64 permitted, u64 requested) mask &= XFEATURE_MASK_USER_SUPPORTED; usize = xstate_calculate_size(mask, false); - ret = validate_sigaltstack(usize); - if (ret) - return ret; + if (!guest) { + ret = validate_sigaltstack(usize); + if (ret) + return ret; + } + perm = guest ? 
&fpu->guest_perm : &fpu->perm; /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ - WRITE_ONCE(fpu->perm.__state_perm, requested); + WRITE_ONCE(perm->__state_perm, requested); /* Protected by sighand lock */ - fpu->perm.__state_size = ksize; - fpu->perm.__user_state_size = usize; + perm->__state_size = ksize; + perm->__user_state_size = usize; return ret; } @@ -1671,7 +1675,7 @@ static const u64 xstate_prctl_req[XFEATURE_MAX] = { [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, }; -static int xstate_request_perm(unsigned long idx) +static int xstate_request_perm(unsigned long idx, bool guest) { u64 permitted, requested; int ret; @@ -1692,14 +1696,19 @@ static int xstate_request_perm(unsigned long idx) return -EOPNOTSUPP; /* Lockless quick check */ - permitted = xstate_get_host_group_perm(); + permitted = xstate_get_group_perm(guest); if ((permitted & requested) == requested) return 0; /* Protect against concurrent modifications */ spin_lock_irq(¤t->sighand->siglock); - permitted = xstate_get_host_group_perm(); - ret = __xstate_request_perm(permitted, requested); + permitted = xstate_get_group_perm(guest); + + /* First vCPU allocation locks the permissions. */ + if (guest && (permitted & FPU_GUEST_PERM_LOCKED)) + ret = -EBUSY; + else + ret = __xstate_request_perm(permitted, requested, guest); spin_unlock_irq(¤t->sighand->siglock); return ret; } @@ -1744,12 +1753,18 @@ int xfd_enable_feature(u64 xfd_err) return 0; } #else /* CONFIG_X86_64 */ -static inline int xstate_request_perm(unsigned long idx) +static inline int xstate_request_perm(unsigned long idx, bool guest) { return -EPERM; } #endif /* !CONFIG_X86_64 */ +inline u64 xstate_get_guest_group_perm(void) +{ + return xstate_get_group_perm(true); +} +EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); + /** * fpu_xstate_prctl - xstate permission operations * @tsk: Redundant pointer to current @@ -1773,6 +1788,7 @@ long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2) u64 __user *uptr = (u64 __user *)arg2; u64 permitted, supported; unsigned long idx = arg2; + bool guest = false; if (tsk != current) return -EPERM; @@ -1791,11 +1807,20 @@ long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2) permitted &= XFEATURE_MASK_USER_SUPPORTED; return put_user(permitted, uptr); + case ARCH_GET_XCOMP_GUEST_PERM: + permitted = xstate_get_guest_group_perm(); + permitted &= XFEATURE_MASK_USER_SUPPORTED; + return put_user(permitted, uptr); + + case ARCH_REQ_XCOMP_GUEST_PERM: + guest = true; + fallthrough; + case ARCH_REQ_XCOMP_PERM: if (!IS_ENABLED(CONFIG_X86_64)) return -EOPNOTSUPP; - return xstate_request_perm(idx); + return xstate_request_perm(idx, guest); default: return -EINVAL; diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 86ea7c0fa2f6..98a472775c97 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -20,10 +20,19 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; } -static inline u64 xstate_get_host_group_perm(void) +static inline u64 xstate_get_group_perm(bool guest) { + struct fpu *fpu = ¤t->group_leader->thread.fpu; + struct fpu_state_perm *perm; + /* Pairs with WRITE_ONCE() in xstate_request_perm() */ - return READ_ONCE(current->group_leader->thread.fpu.perm.__state_perm); + perm = guest ? 
&fpu->guest_perm : &fpu->perm; + return READ_ONCE(perm->__state_perm); +} + +static inline u64 xstate_get_host_group_perm(void) +{ + return xstate_get_group_perm(false); } enum xstate_copy_mode { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 4003f0188f8f..976d24f03b65 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -992,6 +992,8 @@ long do_arch_prctl_common(struct task_struct *task, int option, case ARCH_GET_XCOMP_SUPP: case ARCH_GET_XCOMP_PERM: case ARCH_REQ_XCOMP_PERM: + case ARCH_GET_XCOMP_GUEST_PERM: + case ARCH_REQ_XCOMP_GUEST_PERM: return fpu_xstate_prctl(task, option, arg2); } -- Gitee From 117914451f5f015a2d771ce79d393eb8f46566c2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 5 Jan 2022 04:35:13 -0800 Subject: [PATCH 0922/2161] x86/fpu: Prepare guest FPU for dynamically enabled FPU features commit 36487e6228c4cb04257c92266a04078a384bc4ec upstream. To support dynamically enabled FPU features for guests prepare the guest pseudo FPU container to keep track of the currently enabled xfeatures and the guest permissions. Signed-off-by: Thomas Gleixner Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-3-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/types.h | 13 +++++++++++++ arch/x86/kernel/fpu/core.c | 26 +++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 89e30c763ef0..71de7a7724fe 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -504,6 +504,19 @@ struct fpu { * Guest pseudo FPU container */ struct fpu_guest { + /* + * @xfeatures: xfeature bitmap of features which are + * currently enabled for the guest vCPU. + */ + u64 xfeatures; + + /* + * @perm: xfeature bitmap of features which are + * permitted to be enabled for the guest + * vCPU. + */ + u64 perm; + /* * @fpstate: Pointer to the allocated guest fpstate */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index f6c1ddd56d1c..603cc727ead9 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -201,6 +201,26 @@ void fpu_reset_from_exception_fixup(void) #if IS_ENABLED(CONFIG_KVM) static void __fpstate_reset(struct fpstate *fpstate); +static void fpu_init_guest_permissions(struct fpu_guest *gfpu) +{ + struct fpu_state_perm *fpuperm; + u64 perm; + + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + spin_lock_irq(¤t->sighand->siglock); + fpuperm = ¤t->group_leader->thread.fpu.guest_perm; + perm = fpuperm->__state_perm; + + /* First fpstate allocation locks down permissions. 
*/ + WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED); + + spin_unlock_irq(¤t->sighand->siglock); + + gfpu->perm = perm & ~FPU_GUEST_PERM_LOCKED; +} + bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) { struct fpstate *fpstate; @@ -216,7 +236,11 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) fpstate->is_valloc = true; fpstate->is_guest = true; - gfpu->fpstate = fpstate; + gfpu->fpstate = fpstate; + gfpu->xfeatures = fpu_user_cfg.default_features; + gfpu->perm = fpu_user_cfg.default_features; + fpu_init_guest_permissions(gfpu); + return true; } EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); -- Gitee From 69f311a4f3c74f7b8539ae1333f9882a2f566551 Mon Sep 17 00:00:00 2001 From: Jing Liu Date: Wed, 5 Jan 2022 04:35:14 -0800 Subject: [PATCH 0923/2161] kvm: x86: Fix xstate_required_size() to follow XSTATE alignment rule commit cc04b6a21d431359eceeec0d812b492088b04af5 upstream. CPUID.0xD.1.EBX enumerates the size of the XSAVE area (in compacted format) required by XSAVES. If CPUID.0xD.i.ECX[1] is set for a state component (i), this state component should be located on the next 64-bytes boundary following the preceding state component in the compacted layout. Fix xstate_required_size() to follow the alignment rule. AMX is the first state component with 64-bytes alignment to catch this bug. Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-4-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 30d903596e37..051860a96c47 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -41,7 +41,11 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) if (xstate_bv & 0x1) { u32 eax, ebx, ecx, edx, offset; cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx); - offset = compacted ? ret : ebx; + /* ECX[1]: 64B alignment in compacted form */ + if (compacted) + offset = (ecx & 0x2) ? ALIGN(ret, 64) : ret; + else + offset = ebx; ret = max(ret, offset + eax); } -- Gitee From 8ae63fa7d344598a2deb6e4bbe8ea845790d87cb Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 3 Mar 2022 11:53:55 +0800 Subject: [PATCH 0924/2161] kvm: x86: Exclude unpermitted xfeatures at KVM_GET_SUPPORTED_CPUID commit 445ecdf79be0c71ca248f7611aeefceaea3ec59f upstream. KVM_GET_SUPPORTED_CPUID should not include any dynamic xstates in CPUID[0xD] if they have not been requested with prctl. Otherwise a process which directly passes KVM_GET_SUPPORTED_CPUID to KVM_SET_CPUID2 would now fail even if it doesn't intend to use a dynamically enabled feature. Userspace must know that prctl is required and allocate >4K xstate buffer before setting any dynamic bit. Suggested-by: Paolo Bonzini Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-5-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/virt/kvm/api.txt | 4 ++++ arch/x86/kvm/cpuid.c | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt index fd22224853e5..f4be0a07d456 100644 --- a/Documentation/virt/kvm/api.txt +++ b/Documentation/virt/kvm/api.txt @@ -1435,6 +1435,10 @@ userspace capabilities, and with user requirements (for example, the user may wish to constrain cpuid to emulate older hardware, or for feature consistency across a cluster). 
+Dynamically-enabled feature bits need to be requested with +``arch_prctl()`` before calling this ioctl. Feature bits that have not +been requested are excluded from the result. + Note that certain capabilities, such as KVM_CAP_X86_DISABLE_EXITS, may expose cpuid features (e.g. MONITOR) which are not supported by kvm in its default configuration. If userspace enables such capabilities, it diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 051860a96c47..630fa33f9e13 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -653,11 +653,12 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) case 0xd: { int idx; u64 supported = kvm_supported_xcr0(); + u64 guest_perm = xstate_get_guest_group_perm(); - entry->eax &= supported; + entry->eax &= supported & guest_perm; entry->ebx = xstate_required_size(supported, false); entry->ecx = entry->ebx; - entry->edx &= supported >> 32; + entry->edx &= (supported & guest_perm) >> 32; if (!supported) break; -- Gitee From 694d706c21c217a046224a065dc6ab6aa595a579 Mon Sep 17 00:00:00 2001 From: Jing Liu Date: Wed, 5 Jan 2022 04:35:16 -0800 Subject: [PATCH 0925/2161] x86/fpu: Make XFD initialization in __fpstate_reset() a function argument commit b0237dad2d7f8820b5b415291431d8259e787470 upstream. vCPU threads are different from native tasks regarding to the initial XFD value. While all native tasks follow a fixed value (init_fpstate::xfd) established by the FPU core at boot, vCPU threads need to obey the reset value (i.e. ZERO) defined by the specification, to meet the expectation of the guest. Let the caller supply an argument and adjust the host and guest related invocations accordingly. Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Signed-off-by: Thomas Gleixner Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-6-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 603cc727ead9..9c593a32bd1f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -199,7 +199,7 @@ void fpu_reset_from_exception_fixup(void) } #if IS_ENABLED(CONFIG_KVM) -static void __fpstate_reset(struct fpstate *fpstate); +static void __fpstate_reset(struct fpstate *fpstate, u64 xfd); static void fpu_init_guest_permissions(struct fpu_guest *gfpu) { @@ -231,7 +231,8 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) if (!fpstate) return false; - __fpstate_reset(fpstate); + /* Leave xfd to 0 (the reset value defined by spec) */ + __fpstate_reset(fpstate, 0); fpstate_init_user(fpstate); fpstate->is_valloc = true; fpstate->is_guest = true; @@ -454,21 +455,21 @@ void fpstate_init_user(struct fpstate *fpstate) fpstate_init_fstate(fpstate); } -static void __fpstate_reset(struct fpstate *fpstate) +static void __fpstate_reset(struct fpstate *fpstate, u64 xfd) { /* Initialize sizes and feature masks */ fpstate->size = fpu_kernel_cfg.default_size; fpstate->user_size = fpu_user_cfg.default_size; fpstate->xfeatures = fpu_kernel_cfg.default_features; fpstate->user_xfeatures = fpu_user_cfg.default_features; - fpstate->xfd = init_fpstate.xfd; + fpstate->xfd = xfd; } void fpstate_reset(struct fpu *fpu) { /* Set the fpstate pointer to the default fpstate */ fpu->fpstate = &fpu->__fpstate; - __fpstate_reset(fpu->fpstate); + __fpstate_reset(fpu->fpstate, init_fpstate.xfd); /* Initialize 
the permission related info in fpu */ fpu->perm.__state_perm = fpu_kernel_cfg.default_features; -- Gitee From f1580a167af76ee680fb20b656d09cddc009e81e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 5 Jan 2022 04:35:17 -0800 Subject: [PATCH 0926/2161] x86/fpu: Add guest support to xfd_enable_feature() commit c270ce393dfd700e7510a4579568deeefba954fd upstream. Guest support for dynamically enabled FPU features requires a few modifications to the enablement function which is currently invoked from the #NM handler: 1) Use guest permissions and sizes for the update 2) Update fpu_guest state accordingly 3) Take into account that the enabling can be triggered either from a running guest via XSETBV and MSR_IA32_XFD write emulation or from a guest restore. In the latter case the guests fpstate is not the current tasks active fpstate. Split the function and implement the guest mechanics throughout the callchain. Signed-off-by: Thomas Gleixner Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-7-yang.zhong@intel.com> [Add 32-bit stub for __xfd_enable_feature. - Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 93 +++++++++++++++++++++--------------- arch/x86/kernel/fpu/xstate.h | 6 +++ 2 files changed, 60 insertions(+), 39 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 337bea0c9fba..9b4bd1251799 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1530,29 +1530,6 @@ void fpstate_free(struct fpu *fpu) vfree(fpu->fpstate); } -/** - * fpu_install_fpstate - Update the active fpstate in the FPU - * - * @fpu: A struct fpu * pointer - * @newfps: A struct fpstate * pointer - * - * Returns: A null pointer if the last active fpstate is the embedded - * one or the new fpstate is already installed; - * otherwise, a pointer to the old fpstate which has to - * be freed by the caller. - */ -static struct fpstate *fpu_install_fpstate(struct fpu *fpu, - struct fpstate *newfps) -{ - struct fpstate *oldfps = fpu->fpstate; - - if (fpu->fpstate == newfps) - return NULL; - - fpu->fpstate = newfps; - return oldfps != &fpu->__fpstate ? oldfps : NULL; -} - /** * fpstate_realloc - Reallocate struct fpstate for the requested new features * @@ -1560,6 +1537,7 @@ static struct fpstate *fpu_install_fpstate(struct fpu *fpu, * of that task * @ksize: The required size for the kernel buffer * @usize: The required size for user space buffers + * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations * * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer * terminates quickly, vfree()-induced IPIs may be a concern, but tasks @@ -1568,13 +1546,13 @@ static struct fpstate *fpu_install_fpstate(struct fpu *fpu, * Returns: 0 on success, -ENOMEM on allocation error. */ static int fpstate_realloc(u64 xfeatures, unsigned int ksize, - unsigned int usize) + unsigned int usize, struct fpu_guest *guest_fpu) { struct fpu *fpu = ¤t->thread.fpu; struct fpstate *curfps, *newfps = NULL; unsigned int fpsize; + bool in_use; - curfps = fpu->fpstate; fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); newfps = vzalloc(fpsize); @@ -1584,28 +1562,55 @@ static int fpstate_realloc(u64 xfeatures, unsigned int ksize, newfps->user_size = usize; newfps->is_valloc = true; + /* + * When a guest FPU is supplied, use @guest_fpu->fpstate + * as reference independent whether it is in use or not. + */ + curfps = guest_fpu ? 
guest_fpu->fpstate : fpu->fpstate; + + /* Determine whether @curfps is the active fpstate */ + in_use = fpu->fpstate == curfps; + + if (guest_fpu) { + newfps->is_guest = true; + newfps->is_confidential = curfps->is_confidential; + newfps->in_use = curfps->in_use; + guest_fpu->xfeatures |= xfeatures; + } + fpregs_lock(); /* - * Ensure that the current state is in the registers before - * swapping fpstate as that might invalidate it due to layout - * changes. + * If @curfps is in use, ensure that the current state is in the + * registers before swapping fpstate as that might invalidate it + * due to layout changes. */ - if (test_thread_flag(TIF_NEED_FPU_LOAD)) + if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); newfps->xfeatures = curfps->xfeatures | xfeatures; newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; newfps->xfd = curfps->xfd & ~xfeatures; - curfps = fpu_install_fpstate(fpu, newfps); - /* Do the final updates within the locked region */ xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); - xfd_update_state(newfps); + if (guest_fpu) { + guest_fpu->fpstate = newfps; + /* If curfps is active, update the FPU fpstate pointer */ + if (in_use) + fpu->fpstate = newfps; + } else { + fpu->fpstate = newfps; + } + + if (in_use) + xfd_update_state(fpu->fpstate); fpregs_unlock(); - vfree(curfps); + /* Only free valloc'ed state */ + if (curfps && curfps->is_valloc) + vfree(curfps); + return 0; } @@ -1713,14 +1718,16 @@ static int xstate_request_perm(unsigned long idx, bool guest) return ret; } -int xfd_enable_feature(u64 xfd_err) +int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) { u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; + struct fpu_state_perm *perm; unsigned int ksize, usize; struct fpu *fpu; if (!xfd_event) { - pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); + if (!guest_fpu) + pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); return 0; } @@ -1728,14 +1735,16 @@ int xfd_enable_feature(u64 xfd_err) spin_lock_irq(¤t->sighand->siglock); /* If not permitted let it die */ - if ((xstate_get_host_group_perm() & xfd_event) != xfd_event) { + if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) { spin_unlock_irq(¤t->sighand->siglock); return -EPERM; } fpu = ¤t->group_leader->thread.fpu; - ksize = fpu->perm.__state_size; - usize = fpu->perm.__user_state_size; + perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; + ksize = perm->__state_size; + usize = perm->__user_state_size; + /* * The feature is permitted. State size is sufficient. Dropping * the lock is safe here even if more features are added from @@ -1748,10 +1757,16 @@ int xfd_enable_feature(u64 xfd_err) * Try to allocate a new fpstate. If that fails there is no way * out. 
*/ - if (fpstate_realloc(xfd_event, ksize, usize)) + if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu)) return -EFAULT; return 0; } + +int xfd_enable_feature(u64 xfd_err) +{ + return __xfd_enable_feature(xfd_err, NULL); +} + #else /* CONFIG_X86_64 */ static inline int xstate_request_perm(unsigned long idx, bool guest) { diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 98a472775c97..67ed6bbc19b8 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -158,8 +158,14 @@ static inline void xfd_update_state(struct fpstate *fpstate) } } } + +extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu); #else static inline void xfd_update_state(struct fpstate *fpstate) { } + +static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) { + return -EPERM; +} #endif /* -- Gitee From db7170c2d5337987223e052fbbb6fb5248e51eda Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Mon, 20 Jul 2020 17:55:13 +0200 Subject: [PATCH 0927/2161] lockdep: Add preemption enabled/disabled assertion APIs commit 8fd8ad5c5dfcb09cf62abadd4043eaf1afbbd0ce upstream. Asserting that preemption is enabled or disabled is a critical sanity check. Developers are usually reluctant to add such a check in a fastpath as reading the preemption count can be costly. Extend the lockdep API with macros asserting that preemption is disabled or enabled. If lockdep is disabled, or if the underlying architecture does not support kernel preemption, this assert has no runtime overhead. References: f54bb2ec02c8 ("locking/lockdep: Add IRQs disabled/enabled assertion APIs: ...") Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200720155530.1173732-8-a.darwish@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/lockdep.h | 19 +++++++++++++++++++ lib/Kconfig.debug | 1 + 2 files changed, 20 insertions(+) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index b8a835fd611b..42c7255d030f 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -647,12 +647,31 @@ do { \ "Not in hardirq as expected\n"); \ } while (0) +#define lockdep_assert_preemption_enabled() \ +do { \ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ + debug_locks && \ + (preempt_count() != 0 || \ + !this_cpu_read(hardirqs_enabled))); \ +} while (0) + +#define lockdep_assert_preemption_disabled() \ +do { \ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ + debug_locks && \ + (preempt_count() == 0 && \ + this_cpu_read(hardirqs_enabled))); \ +} while (0) + #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) # define lockdep_assert_irqs_enabled() do { } while (0) # define lockdep_assert_irqs_disabled() do { } while (0) # define lockdep_assert_in_irq() do { } while (0) + +# define lockdep_assert_preemption_enabled() do { } while (0) +# define lockdep_assert_preemption_disabled() do { } while (0) #endif #ifdef CONFIG_LOCKDEP diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ee00c6c8a373..af036ce4dcfe 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1069,6 +1069,7 @@ config PROVE_LOCKING select DEBUG_RWSEMS select DEBUG_WW_MUTEX_SLOWPATH select DEBUG_LOCK_ALLOC + select PREEMPT_COUNT if !ARCH_NO_PREEMPT select TRACE_IRQFLAGS default n help -- Gitee From c18f0ec853db102d5e16a30ba7a674471755ef52 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 5 Jan 2022 04:35:18 -0800 Subject: [PATCH 0928/2161] x86/fpu: 
Provide fpu_enable_guest_xfd_features() for KVM commit 0781d60f658e25fbad3b6e4261f54eb1cd3dc302 upstream. Provide a wrapper for expanding the guest fpstate buffer according to requested xfeatures. KVM wants to call this wrapper to manage any dynamic xstate used by the guest. Suggested-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Kevin Tian Signed-off-by: Yang Zhong Reviewed-by: Paolo Bonzini Message-Id: <20220105123532.12586-8-yang.zhong@intel.com> [Remove unnecessary 32-bit check. - Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 1 + arch/x86/kernel/fpu/core.c | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index bfd5d1cd04d3..73ec5efcd6d1 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -134,6 +134,7 @@ extern inline u64 xstate_get_guest_group_perm(void); extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu); extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu); extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest); +extern int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures); extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru); extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 9c593a32bd1f..b541a2148a0a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -261,6 +261,28 @@ void fpu_free_guest_fpstate(struct fpu_guest *gfpu) } EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); +/* + * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable + * @guest_fpu: Pointer to the guest FPU container + * @xfeatures: Features requested by guest CPUID + * + * Enable all dynamic xfeatures according to guest perm and requested CPUID. + * + * Return: 0 on success, error code otherwise + */ +int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures) +{ + lockdep_assert_preemption_enabled(); + + /* Nothing to do if all requested features are already enabled. */ + xfeatures &= ~guest_fpu->xfeatures; + if (!xfeatures) + return 0; + + return __xfd_enable_feature(xfeatures, guest_fpu); +} +EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features); + int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) { struct fpstate *guest_fps = guest_fpu->fpstate; -- Gitee From f767dcb4a2e6d61b918617af9fa64751a0eed297 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 3 Mar 2022 17:14:24 +0800 Subject: [PATCH 0929/2161] kvm: x86: Enable dynamic xfeatures at KVM_SET_CPUID2 commit 5ab2f45bba4894a0db4af8567da3efd6228dd010 upstream. KVM can request fpstate expansion in two approaches: 1) When intercepting guest updates to XCR0 and XFD MSR; 2) Before vcpu runs (e.g. at KVM_SET_CPUID2); The first option doesn't waste memory for legacy guest if it doesn't support XFD. However doing so introduces more complexity and also imposes an order requirement in the restoring path, i.e. XCR0/XFD must be restored before XSTATE. Given that the agreement is to do the static approach. This is considered a better tradeoff though it does waste 8K memory for legacy guest if its CPUID includes dynamically-enabled xfeatures. 
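As a hedged illustration only (the helper name, feature number, and error handling below are assumptions, not code from this series), the expected userspace ordering under the static approach is roughly: request the guest permission first, then create the VM and vCPU, then set CPUID.

/*
 * Illustrative sketch: a VMM requests dynamic AMX state for its guest
 * before any vCPU exists, then lets KVM_SET_CPUID2 perform the fpstate
 * expansion added by this patch.  XFEATURE_XTILE_DATA (18) and the
 * caller-supplied cpuid2 table are assumptions for the sketch.
 */
#include <err.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/kvm.h>

#ifndef ARCH_REQ_XCOMP_GUEST_PERM
#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025 /* value from this series' uapi <asm/prctl.h> */
#endif
#define XFEATURE_XTILE_DATA 18

static int illustrative_vmm_setup(int kvm_fd, struct kvm_cpuid2 *cpuid2)
{
	int vm_fd, vcpu_fd;

	/* 1) Request the dynamic feature before the first vCPU is created. */
	if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, XFEATURE_XTILE_DATA))
		err(1, "ARCH_REQ_XCOMP_GUEST_PERM");

	/* 2) Create the VM and vCPU as usual. */
	vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
	if (vm_fd < 0)
		err(1, "KVM_CREATE_VM");
	vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
	if (vcpu_fd < 0)
		err(1, "KVM_CREATE_VCPU");

	/* 3) A CPUID table carrying the dynamic bits now triggers the expansion below. */
	if (ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid2))
		err(1, "KVM_SET_CPUID2");

	return vcpu_fd;
}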
Successful fpstate expansion requires userspace VMM to acquire guest xstate permissions before calling KVM_SET_CPUID2. Also take the chance to adjust the indent in kvm_set_cpuid(). Signed-off-by: Jing Liu Signed-off-by: Sean Christopherson Signed-off-by: Kevin Tian Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-9-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 630fa33f9e13..abb7db5086fb 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -134,6 +134,25 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) return -EINVAL; } + /* + * Exposing dynamic xfeatures to the guest requires additional + * enabling in the FPU, e.g. to expand the guest XSAVE state size. + */ + best = kvm_find_cpuid_entry(vcpu, 0xd, 0); + if (best) { + u64 xfeatures; + int r; + + xfeatures = best->eax | ((u64)best->edx << 32); + xfeatures &= XFEATURE_MASK_USER_DYNAMIC; + if (xfeatures) { + r = fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, + xfeatures); + if (r) + return r; + } + } + best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0); if (kvm_hlt_in_guest(vcpu->kvm) && best && (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) -- Gitee From 5f02835ca99e40eb29543c2c537f95178c63280b Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Wed, 5 Jan 2022 04:35:20 -0800 Subject: [PATCH 0930/2161] x86/fpu: Provide fpu_update_guest_xfd() for IA32_XFD emulation commit 8eb9a48ac1e86a8a59f7123b529d6e498fb1f163 upstream. Guest XFD can be updated either in the emulation path or in the restore path. Provide a wrapper to update guest_fpu::fpstate::xfd. If the guest fpstate is currently in-use, also update the per-cpu xfd cache and the actual MSR. 
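For reviewers, the "in-use" case boils down to what the existing xfd_update_state() helper already does for host tasks: compare against the per-CPU XFD cache and only touch the MSR on a mismatch. A simplified sketch follows; the function name is shorthand for this changelog and the kernel-internal symbols (fpu_state_size_dynamic(), the per-CPU xfd_state cache) come from the FPU core, not from this patch.

    static inline void xfd_sync_hw_state(struct fpstate *fpstate)
    {
            if (fpu_state_size_dynamic()) {
                    u64 xfd = fpstate->xfd;

                    /* Skip the costly WRMSR when the cached value already matches. */
                    if (__this_cpu_read(xfd_state) != xfd) {
                            wrmsrl(MSR_IA32_XFD, xfd);
                            __this_cpu_write(xfd_state, xfd);
                    }
            }
    }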
Signed-off-by: Kevin Tian Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-10-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 6 ++++++ arch/x86/kernel/fpu/core.c | 12 ++++++++++++ 2 files changed, 18 insertions(+) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 73ec5efcd6d1..9decd94198eb 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -136,6 +136,12 @@ extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu); extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest); extern int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures); +#ifdef CONFIG_X86_64 +extern void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd); +#else +static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { } +#endif + extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru); extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b541a2148a0a..4de39c980288 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -283,6 +283,18 @@ int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures) } EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features); +#ifdef CONFIG_X86_64 +void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) +{ + fpregs_lock(); + guest_fpu->fpstate->xfd = xfd; + if (guest_fpu->fpstate->in_use) + xfd_update_state(guest_fpu->fpstate); + fpregs_unlock(); +} +EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); +#endif /* CONFIG_X86_64 */ + int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) { struct fpstate *guest_fps = guest_fpu->fpstate; -- Gitee From 60f9e44b38ede8cbbba249518ea6929903075390 Mon Sep 17 00:00:00 2001 From: Fares Mehanna Date: Wed, 15 Sep 2021 13:39:50 +0000 Subject: [PATCH 0931/2161] kvm: x86: Add AMD PMU MSRs to msrs_to_save_all[] commit e1fc1553cd78292ab3521c94c9dd6e3e70e606a1 upstream. Intel PMU MSRs is in msrs_to_save_all[], so add AMD PMU MSRs to have a consistent behavior between Intel and AMD when using KVM_GET_MSRS, KVM_SET_MSRS or KVM_GET_MSR_INDEX_LIST. We have to add legacy and new MSRs to handle guests running without X86_FEATURE_PERFCTR_CORE. 
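As background (illustration only, not part of the patch), the list extended here is what userspace walks via KVM_GET_MSR_INDEX_LIST on the /dev/kvm fd, using the usual probe-then-fill pattern; error handling is trimmed.

    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static struct kvm_msr_list *get_msr_index_list(int kvm_fd)
    {
            struct kvm_msr_list probe = { .nmsrs = 0 };
            struct kvm_msr_list *list;

            /* First call fails with E2BIG but fills in the required count. */
            ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);

            list = malloc(sizeof(*list) + probe.nmsrs * sizeof(__u32));
            list->nmsrs = probe.nmsrs;
            ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
            return list;
    }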
Signed-off-by: Fares Mehanna Message-Id: <20210915133951.22389-1-faresx@amazon.de> Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0fbb74276e51..74edc663dbea 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1251,6 +1251,13 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + + MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, + MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, + MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, + MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, + MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, + MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; -- Gitee From 1b5007e32b4cf0a4fc2d18dbcdfea63a2c6eca4a Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 3 Mar 2022 19:16:55 +0800 Subject: [PATCH 0932/2161] kvm: x86: Add emulation for IA32_XFD commit 820a6ee944e74e57255ac2e90916ecdaade57b95 upstream. Intel's eXtended Feature Disable (XFD) feature allows the software to dynamically adjust fpstate buffer size for XSAVE features which have large state. Because guest fpstate has been expanded for all possible dynamic xstates at KVM_SET_CPUID2, emulation of the IA32_XFD MSR is straightforward. For write just call fpu_update_guest_xfd() to update the guest fpu container once all the sanity checks are passed. For read simply return the cached value in the container. 
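Guest-side context, as a sketch only: the WRMSR this emulation handles is typically the guest re-enabling a dynamic state component by clearing its XFD bit. The MSR accessors and XFEATURE_MASK_XTILE_DATA are assumed from the kernel's existing MSR/xstate headers.

    static void guest_reenable_xtiledata(void)
    {
            u64 xfd;

            rdmsrl(MSR_IA32_XFD, xfd);
            /* Clearing bit 18 re-arms AMX tile data state for the current task. */
            wrmsrl(MSR_IA32_XFD, xfd & ~XFEATURE_MASK_XTILE_DATA);
    }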
Signed-off-by: Jing Liu Signed-off-by: Zeng Guang Signed-off-by: Wei Wang Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-11-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 74edc663dbea..43aee8875375 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1251,6 +1251,7 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + MSR_IA32_XFD, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, @@ -2999,6 +3000,19 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.msr_misc_features_enables = data; break; +#ifdef CONFIG_X86_64 + case MSR_IA32_XFD: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + if (data & ~(XFEATURE_MASK_USER_DYNAMIC & + vcpu->arch.guest_supported_xcr0)) + return 1; + + fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data); + break; +#endif default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -3243,6 +3257,15 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_HWCR: msr_info->data = vcpu->arch.msr_hwcr; break; +#ifdef CONFIG_X86_64 + case MSR_IA32_XFD: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd; + break; +#endif default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); @@ -5238,7 +5261,12 @@ static void kvm_init_msr_list(void) if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) continue; + break; } + case MSR_IA32_XFD: + if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) + continue; + break; default: break; } -- Gitee From e5cce62bee157894fa46f54928fdc31bcdd84a12 Mon Sep 17 00:00:00 2001 From: Jing Liu Date: Wed, 5 Jan 2022 04:35:22 -0800 Subject: [PATCH 0933/2161] x86/fpu: Prepare xfd_err in struct fpu_guest commit 1df4fd834e8e2c00973ac2003ad0e6feb8750b31 upstream. When XFD causes an instruction to generate #NM, IA32_XFD_ERR contains information about which disabled state components are being accessed. The #NM handler is expected to check this information and then enable the state components by clearing IA32_XFD for the faulting task (if having permission). If the XFD_ERR value generated in guest is consumed/clobbered by the host before the guest itself doing so, it may lead to non-XFD-related #NM treated as XFD #NM in host (due to non-zero value in XFD_ERR), or XFD-related #NM treated as non-XFD #NM in guest (XFD_ERR cleared by the host #NM handler). Introduce a new field in fpu_guest to save the guest xfd_err value. KVM is expected to save guest xfd_err before interrupt is enabled and restore it right before entering the guest (with interrupt disabled). 
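To make the race concrete (sketch only): the consumer being protected is the guest's own #NM handler, which expects IA32_XFD_ERR to still hold the value the CPU wrote at fault time. enable_xstate_feature() below is a placeholder name, not a real kernel symbol.

    static void guest_exc_nm_handler(void)
    {
            u64 xfd_err;

            rdmsrl(MSR_IA32_XFD_ERR, xfd_err);
            if (!xfd_err)
                    return;                          /* not an XFD-induced #NM */

            enable_xstate_feature(xfd_err);          /* placeholder: clear the bit in IA32_XFD */
            wrmsrl(MSR_IA32_XFD_ERR, 0);
    }

If the host consumes or clobbers the MSR between VM-exit and this handler running, the guest misclassifies the fault, hence the save/restore rules described above.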
Signed-off-by: Jing Liu Signed-off-by: Kevin Tian Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-12-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/types.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 71de7a7724fe..a852c0df934c 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -517,6 +517,11 @@ struct fpu_guest { */ u64 perm; + /* + * @xfd_err: Save the guest value. + */ + u64 xfd_err; + /* * @fpstate: Pointer to the allocated guest fpstate */ -- Gitee From ebdcd47f287f9e088bf123b4b81c47ce98cb83dd Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 17 Feb 2020 23:02:30 +0800 Subject: [PATCH 0934/2161] KVM: VMX: Add 'else' to split mutually exclusive case commit d71f5e03257c6c62568e76ee204a678396a55722 upstream. Each if branch in handle_external_interrupt_irqoff() is mutually exclusive. Add 'else' to make it clear and also avoid some unnecessary check. Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmx.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3e77ace23118..f3a6a092072a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6222,15 +6222,13 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); /* if exit due to PF check for async PF */ - if (is_page_fault(vmx->exit_intr_info)) + if (is_page_fault(vmx->exit_intr_info)) { vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); - /* Handle machine checks before interrupts are enabled */ - if (is_machine_check(vmx->exit_intr_info)) + } else if (is_machine_check(vmx->exit_intr_info)) { kvm_machine_check(); - /* We need to handle NMIs before interrupts are enabled */ - if (is_nmi(vmx->exit_intr_info)) { + } else if (is_nmi(vmx->exit_intr_info)) { kvm_before_interrupt(&vmx->vcpu); asm("int $2"); kvm_after_interrupt(&vmx->vcpu); -- Gitee From 08924e83345313073364ad9dbb802eabb97dd43c Mon Sep 17 00:00:00 2001 From: Jing Liu Date: Wed, 5 Jan 2022 04:35:23 -0800 Subject: [PATCH 0935/2161] kvm: x86: Intercept #NM for saving IA32_XFD_ERR commit ec5be88ab29fd9145c7ced20b58fb96f7c6b6890 upstream. Guest IA32_XFD_ERR is generally modified in two places: - Set by CPU when #NM is triggered; - Cleared by guest in its #NM handler; Intercept #NM for the first case when a nonzero value is written to IA32_XFD. Nonzero indicates that the guest is willing to do dynamic fpstate expansion for certain xfeatures, thus KVM needs to manage and virtualize guest XFD_ERR properly. The vcpu exception bitmap is updated in XFD write emulation according to guest_fpu::xfd. Save the current XFD_ERR value to the guest_fpu container in the #NM VM-exit handler. This must be done with interrupt disabled, otherwise the unsaved MSR value may be clobbered by host activity. The saving operation is conducted conditionally only when guest_fpu:xfd includes a non-zero value. Doing so also avoids misread on a platform which doesn't support XFD but #NM is triggered due to L1 interception. Queueing #NM to the guest is postponed to handle_exception_nmi(). 
This goes through the nested_vmx check so a virtual vmexit is queued instead when #NM is triggered in L2 but L1 wants to intercept it. Restore the host value (always ZERO outside of the host #NM handler) before enabling interrupt. Restore the guest value from the guest_fpu container right before entering the guest (with interrupt disabled). Suggested-by: Thomas Gleixner Signed-off-by: Jing Liu Signed-off-by: Kevin Tian Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-13-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmcs.h | 5 +++++ arch/x86/kvm/vmx/vmx.c | 48 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 6 ++++++ 3 files changed, 59 insertions(+) diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 481ad879197b..052828914440 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -111,6 +111,11 @@ static inline bool is_machine_check(u32 intr_info) (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); } +static inline bool is_nm_fault(u32 intr_info) +{ + return is_exception_n(intr_info, NM_VECTOR); +} + /* Undocumented: icebp/int1 */ static inline bool is_icebp(u32 intr_info) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f3a6a092072a..d917cc8a2cdb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -779,6 +780,13 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) eb |= get_vmcs12(vcpu)->exception_bitmap; + /* + * Trap #NM if guest xfd contains a non-zero value so guest XFD_ERR + * can be saved timely. + */ + if (vcpu->arch.guest_fpu.fpstate->xfd) + eb |= (1u << NM_VECTOR); + vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -1925,6 +1933,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KERNEL_GS_BASE: vmx_write_guest_kernel_gs_base(vmx, data); break; + case MSR_IA32_XFD: + ret = kvm_set_msr_common(vcpu, msr_info); + /* Update #NM interception according to guest xfd */ + if (!ret) + update_exception_bitmap(vcpu); + break; #endif case MSR_IA32_SYSENTER_CS: if (is_guest_mode(vcpu)) @@ -4619,6 +4633,17 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) if (is_machine_check(intr_info) || is_nmi(intr_info)) return 1; /* handled by handle_exception_nmi_irqoff() */ + /* + * Queue the exception here instead of in handle_nm_fault_irqoff(). + * This ensures the nested_vmx check is not skipped so vmexit can + * be reflected to L1 (when it intercepts #NM) before reaching this + * point. + */ + if (is_nm_fault(intr_info)) { + kvm_queue_exception(vcpu, NM_VECTOR); + return 1; + } + if (is_invalid_opcode(intr_info)) return handle_ud(vcpu); @@ -6217,6 +6242,26 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); } +static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) +{ + /* + * Save xfd_err to guest_fpu before interrupt is enabled, so the + * MSR value is not clobbered by the host activity before the guest + * has chance to consume it. + * + * Do not blindly read xfd_err here, since this exception might + * be caused by L1 interception on a platform which doesn't + * support xfd at all. + * + * Do it conditionally upon guest_fpu::xfd. xfd_err matters + * only when xfd contains a non-zero value. + * + * Queuing exception is done in vmx_handle_exit. See comment there. 
+ */ + if (vcpu->arch.guest_fpu.fpstate->xfd) + rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); +} + static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); @@ -6224,6 +6269,9 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) /* if exit due to PF check for async PF */ if (is_page_fault(vmx->exit_intr_info)) { vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); + /* if exit due to NM, handle before interrupts are enabled */ + } else if (is_nm_fault(vmx->exit_intr_info)) { + handle_nm_fault_irqoff(&vmx->vcpu); /* Handle machine checks before interrupts are enabled */ } else if (is_machine_check(vmx->exit_intr_info)) { kvm_machine_check(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 43aee8875375..f206e299c207 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8246,6 +8246,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_return(); + if (vcpu->arch.guest_fpu.xfd_err) + wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); set_debugreg(vcpu->arch.eff_db[0], 0); @@ -8290,6 +8293,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->handle_exit_irqoff(vcpu); + if (vcpu->arch.guest_fpu.xfd_err) + wrmsrl(MSR_IA32_XFD_ERR, 0); + /* * Consume any pending interrupts, including the possible source of * VM-Exit on SVM and any ticks that occur between VM-Exit and now. -- Gitee From 4714fdfabed1ba8907f1904d739ea56ce120da31 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 5 Jan 2022 04:35:28 -0800 Subject: [PATCH 0936/2161] x86/fpu: Add uabi_size to guest_fpu commit c60427dd50ba9b20063ccaed0e98d62e886d7a3b upstream. Userspace needs to inquire KVM about the buffer size to work with the new KVM_SET_XSAVE and KVM_GET_XSAVE2. Add the size info to guest_fpu for KVM to access. 
Signed-off-by: Thomas Gleixner Signed-off-by: Wei Wang Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-18-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/types.h | 5 +++++ arch/x86/kernel/fpu/core.c | 1 + arch/x86/kernel/fpu/xstate.c | 1 + 3 files changed, 7 insertions(+) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index a852c0df934c..9a1cc1761817 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -522,6 +522,11 @@ struct fpu_guest { */ u64 xfd_err; + /* + * @uabi_size: Size required for save/restore + */ + unsigned int uabi_size; + /* * @fpstate: Pointer to the allocated guest fpstate */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 4de39c980288..99eda6b0e471 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -240,6 +240,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) gfpu->fpstate = fpstate; gfpu->xfeatures = fpu_user_cfg.default_features; gfpu->perm = fpu_user_cfg.default_features; + gfpu->uabi_size = fpu_user_cfg.default_size; fpu_init_guest_permissions(gfpu); return true; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9b4bd1251799..07e6c7ed4803 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1576,6 +1576,7 @@ static int fpstate_realloc(u64 xfeatures, unsigned int ksize, newfps->is_confidential = curfps->is_confidential; newfps->in_use = curfps->in_use; guest_fpu->xfeatures |= xfeatures; + guest_fpu->uabi_size = usize; } fpregs_lock(); -- Gitee From 2a6cf8d671daebc7ec8e1d6923d0f3e7e8e848d4 Mon Sep 17 00:00:00 2001 From: Jing Liu Date: Wed, 5 Jan 2022 04:35:24 -0800 Subject: [PATCH 0937/2161] kvm: x86: Emulate IA32_XFD_ERR for guest commit 548e83650a51dce0d188b9e41b1e2ca5d63597cf upstream. Emulate read/write to IA32_XFD_ERR MSR. Only the saved value in the guest_fpu container is touched in the emulation handler. 
Actual MSR update is handled right before entering the guest (with preemption disabled) Signed-off-by: Jing Liu Signed-off-by: Zeng Guang Signed-off-by: Wei Wang Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-14-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f206e299c207..905caa53d9bb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1251,7 +1251,7 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, - MSR_IA32_XFD, + MSR_IA32_XFD, MSR_IA32_XFD_ERR, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, @@ -3012,6 +3012,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data); break; + case MSR_IA32_XFD_ERR: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + if (data & ~(XFEATURE_MASK_USER_DYNAMIC & + vcpu->arch.guest_supported_xcr0)) + return 1; + + vcpu->arch.guest_fpu.xfd_err = data; + break; #endif default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) @@ -3265,6 +3276,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd; break; + case MSR_IA32_XFD_ERR: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + msr_info->data = vcpu->arch.guest_fpu.xfd_err; + break; #endif default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) @@ -5264,6 +5282,7 @@ static void kvm_init_msr_list(void) break; } case MSR_IA32_XFD: + case MSR_IA32_XFD_ERR: if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) continue; break; -- Gitee From 494f08bc3c491b209797cf331850374c5c5b38ce Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 4 Mar 2022 15:43:18 +0800 Subject: [PATCH 0938/2161] kvm: x86: Disable RDMSR interception of IA32_XFD_ERR commit 61f208134a871047f1d642ed3b813f4f71e30b0e upstream. This saves one unnecessary VM-exit in guest #NM handler, given that the MSR is already restored with the guest value before the guest is resumed. Suggested-by: Paolo Bonzini Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-15-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d917cc8a2cdb..5979c22cb2cd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7132,6 +7132,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (boot_cpu_has(X86_FEATURE_INTEL_PT) && guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) update_intel_pt_cfg(vcpu); + + if (kvm_cpu_cap_has(X86_FEATURE_XFD)) + vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)); } /* -- Gitee From 11994c1edaa88ae2471ec638270847d8c75dcd4c Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 4 Mar 2022 16:03:35 +0800 Subject: [PATCH 0939/2161] kvm: x86: Add XCR0 support for Intel AMX commit 86aff7a4799286635efd94dab17b513544703cad upstream. 
Two XCR0 bits are defined for AMX to support XSAVE mechanism. Bit 17 is for tilecfg and bit 18 is for tiledata. The value of XCR0[17:18] is always either 00b or 11b. Also, SDM recommends that only 64-bit operating systems enable Intel AMX by setting XCR0[18:17]. 32-bit host kernel never sets the tile bits in vcpu->arch.guest_supported_xcr0. Signed-off-by: Jing Liu Signed-off-by: Kevin Tian Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-16-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 5 +++++ arch/x86/kvm/x86.h | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 905caa53d9bb..1d69f36634ed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -902,6 +902,11 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512) return 1; } + + if ((xcr0 & XFEATURE_MASK_XTILE) && + ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE)) + return 1; + vcpu->arch.xcr0 = xcr0; if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c1854a8ac6a4..7e782c49636a 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -289,7 +289,7 @@ enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vc #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ - | XFEATURE_MASK_PKRU) + | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) extern u64 host_xcr0; extern u64 kvm_supported_xcr0(void); -- Gitee From 1ea62835bcb72efe157521887790301458f3f40f Mon Sep 17 00:00:00 2001 From: Jing Liu Date: Wed, 5 Jan 2022 04:35:27 -0800 Subject: [PATCH 0940/2161] kvm: x86: Add CPUID support for Intel AMX commit 690a757d610e50c2c3acd2e4bc3992cfc63feff2 upstream. Extend CPUID emulation to support XFD, AMX_TILE, AMX_INT8 and AMX_BF16. Adding those bits into kvm_cpu_caps finally activates all previous logics in this series. Hide XFD on 32bit host kernels. Otherwise it leads to a weird situation where KVM tells userspace to migrate MSR_IA32_XFD and then rejects attempts to read/write the MSR. 
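For reference (not added by this patch), once leaf 0x1D is exposed a guest can enumerate the tile palettes with plain CPUID. The field layout in this sketch follows the SDM's AMX palette description and should be treated as an assumption here.

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx, max_palette, p;

            __cpuid_count(0x1d, 0, eax, ebx, ecx, edx);
            max_palette = eax;

            for (p = 1; p <= max_palette; p++) {
                    __cpuid_count(0x1d, p, eax, ebx, ecx, edx);
                    printf("palette %u: total_tile_bytes=%u bytes_per_tile=%u "
                           "bytes_per_row=%u max_names=%u max_rows=%u\n",
                           p, eax & 0xffff, eax >> 16,
                           ebx & 0xffff, ebx >> 16, ecx & 0xffff);
            }
            return 0;
    }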
Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-17-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/cpufeatures.h | 2 ++ arch/x86/kvm/cpuid.c | 27 +++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ae1dbfcbd5cb..0d29142d0c0c 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -290,7 +290,9 @@ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_AMX_BF16 (18*32+22) /* AMX bf16 Support */ #define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */ +#define X86_FEATURE_AMX_INT8 (18*32+25) /* AMX int8 Support */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index abb7db5086fb..b4a0138f2f68 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -343,9 +343,11 @@ void kvm_set_cpu_caps(void) #ifdef CONFIG_X86_64 unsigned int f_gbpages = F(GBPAGES); unsigned int f_lm = F(LM); + unsigned int f_xfd = F(XFD); #else unsigned int f_gbpages = 0; unsigned int f_lm = 0; + unsigned int f_xfd = 0; #endif memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps)); @@ -402,7 +404,8 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_mask(CPUID_7_EDX, F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | - F(MD_CLEAR) + F(MD_CLEAR) | + F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) ); kvm_cpu_cap_mask(CPUID_7_1_EAX, @@ -410,7 +413,7 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_mask(CPUID_D_1_EAX, - F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) + F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd ); kvm_cpu_cap_mask(CPUID_8000_0001_ECX, @@ -492,6 +495,8 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, case 0x14: case 0x17: case 0x18: + case 0x1d: + case 0x1e: case 0x1f: case 0x8000001d: entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; @@ -735,6 +740,24 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) goto out; } break; + /* Intel AMX TILE */ + case 0x1d: + if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + + for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) { + if (!do_host_cpuid(array, function, i)) + goto out; + } + break; + case 0x1e: /* TMUL information */ + if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + break; case KVM_CPUID_SIGNATURE: { static const char signature[12] = "KVMKVMKVM\0\0"; const u32 *sigptr = (const u32 *)signature; -- Gitee From ac105eb4477de0c4139a88dd99be826e2ee47517 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 4 Mar 2022 18:38:57 +0800 Subject: [PATCH 0941/2161] kvm: x86: Add support for getting/setting expanded xstate buffer commit be50b2065dfa3d88428fdfdc340d154d96bf6848 upstream. With KVM_CAP_XSAVE, userspace uses a hardcoded 4KB buffer to get/set xstate data from/to KVM. This doesn't work when dynamic xfeatures (e.g. AMX) are exposed to the guest as they require a larger buffer size. Introduce a new capability (KVM_CAP_XSAVE2). 
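In userspace terms (sketch only; it mirrors the selftest helper added later in this series, assumes headers that already carry the KVM_CAP_XSAVE2/KVM_GET_XSAVE2 definitions, and trims error handling), the new capability is consumed like this:

    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static struct kvm_xsave *vmm_get_xstate(int vm_fd, int vcpu_fd)
    {
            int size = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XSAVE2);
            struct kvm_xsave *xsave;

            if (size < (int)sizeof(*xsave))
                    size = sizeof(*xsave);           /* cap absent: legacy 4K layout */

            xsave = calloc(1, size);
            if (!xsave)
                    return NULL;

            if (ioctl(vcpu_fd, size > (int)sizeof(*xsave) ?
                               KVM_GET_XSAVE2 : KVM_GET_XSAVE, xsave)) {
                    free(xsave);
                    return NULL;
            }
            /* The same buffer and size are fed back through KVM_SET_XSAVE. */
            return xsave;
    }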
Userspace VMM gets the required xstate buffer size via KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2). KVM_SET_XSAVE is extended to work with both legacy and new capabilities by doing properly-sized memdup_user() based on the guest fpu container. KVM_GET_XSAVE is kept for backward-compatible reason. Instead, KVM_GET_XSAVE2 is introduced under KVM_CAP_XSAVE2 as the preferred interface for getting xstate buffer (4KB or larger size) from KVM (Link: https://lkml.org/lkml/2021/12/15/510) Also, update the api doc with the new KVM_GET_XSAVE2 ioctl. Signed-off-by: Guang Zeng Signed-off-by: Wei Wang Signed-off-by: Jing Liu Signed-off-by: Kevin Tian Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-19-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/virt/kvm/api.txt | 41 ++++++++++++++++++++++++++++-- arch/x86/include/uapi/asm/kvm.h | 16 +++++++++++- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/cpuid.h | 2 ++ arch/x86/kvm/x86.c | 45 ++++++++++++++++++++++++++++++++- include/uapi/linux/kvm.h | 4 +++ 6 files changed, 105 insertions(+), 5 deletions(-) diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt index f4be0a07d456..a2cf8c388fd7 100644 --- a/Documentation/virt/kvm/api.txt +++ b/Documentation/virt/kvm/api.txt @@ -1330,6 +1330,7 @@ Returns: 0 on success, -1 on error struct kvm_xsave { __u32 region[1024]; + __u32 extra[0]; }; This ioctl would copy current vcpu's xsave struct to the userspace. @@ -1337,7 +1338,7 @@ This ioctl would copy current vcpu's xsave struct to the userspace. 4.43 KVM_SET_XSAVE -Capability: KVM_CAP_XSAVE +Capability: KVM_CAP_XSAVE and KVM_CAP_XSAVE2 Architectures: x86 Type: vcpu ioctl Parameters: struct kvm_xsave (in) @@ -1345,9 +1346,18 @@ Returns: 0 on success, -1 on error struct kvm_xsave { __u32 region[1024]; + __u32 extra[0]; }; -This ioctl would copy userspace's xsave struct to the kernel. +This ioctl would copy userspace's xsave struct to the kernel. It copies +as many bytes as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2), +when invoked on the vm file descriptor. The size value returned by +KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) will always be at least 4096. +Currently, it is only greater than 4096 if a dynamic feature has been +enabled with ``arch_prctl()``, but this may change in the future. + +The offsets of the state save areas in struct kvm_xsave follow the +contents of CPUID leaf 0xD on the host. 4.44 KVM_GET_XCRS @@ -4136,6 +4146,33 @@ Valid values for 'action': #define KVM_PMU_EVENT_ALLOW 0 #define KVM_PMU_EVENT_DENY 1 +4.42 KVM_GET_XSAVE2 +------------------ + +:Capability: KVM_CAP_XSAVE2 +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_xsave (out) +:Returns: 0 on success, -1 on error + + +:: + + struct kvm_xsave { + __u32 region[1024]; + __u32 extra[0]; + }; + +This ioctl would copy current vcpu's xsave struct to the userspace. It +copies as many bytes as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) +when invoked on the vm file descriptor. The size value returned by +KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) will always be at least 4096. +Currently, it is only greater than 4096 if a dynamic feature has been +enabled with ``arch_prctl()``, but this may change in the future. + +The offsets of the state save areas in struct kvm_xsave follow the contents +of CPUID leaf 0xD on the host. + 5. 
The kvm_run structure ------------------------ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 503d3f42da16..e2823aedadc3 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -337,9 +337,23 @@ struct kvm_debugregs { __u64 reserved[9]; }; -/* for KVM_CAP_XSAVE */ +/* for KVM_CAP_XSAVE and KVM_CAP_XSAVE2 */ struct kvm_xsave { + /* + * KVM_GET_XSAVE2 and KVM_SET_XSAVE write and read as many bytes + * as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) + * respectively, when invoked on the vm file descriptor. + * + * The size value returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) + * will always be at least 4096. Currently, it is only greater + * than 4096 if a dynamic feature has been enabled with + * ``arch_prctl()``, but this may change in the future. + * + * The offsets of the state save areas in struct kvm_xsave follow + * the contents of CPUID leaf 0xD on the host. + */ __u32 region[1024]; + __u32 extra[0]; }; #define KVM_MAX_XCRS 16 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b4a0138f2f68..079df2910971 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -31,7 +31,7 @@ u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_GPL(kvm_cpu_caps); -static u32 xstate_required_size(u64 xstate_bv, bool compacted) +u32 xstate_required_size(u64 xstate_bv, bool compacted) { int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 2485944c6888..a6f51510a30b 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -41,6 +41,8 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit); +u32 xstate_required_size(u64 xstate_bv, bool compacted); + int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1d69f36634ed..d0e1b7fc64d3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3497,6 +3497,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: r = kvm_x86_ops->nested_enable_evmcs != NULL; break; + case KVM_CAP_XSAVE2: { + u64 guest_perm = xstate_get_guest_group_perm(); + + r = xstate_required_size(kvm_supported_xcr0() & guest_perm, false); + if (r < sizeof(struct kvm_xsave)) + r = sizeof(struct kvm_xsave); + break; + } default: break; } @@ -4080,6 +4088,16 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, vcpu->arch.pkru); } +static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, + u8 *state, unsigned int size) +{ + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return; + + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, + state, size, vcpu->arch.pkru); +} + static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { @@ -4395,6 +4413,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XSAVE: { + r = -EINVAL; + if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave)) + break; + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.xsave) @@ -4409,7 +4431,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_XSAVE: { - u.xsave = memdup_user(argp, sizeof(*u.xsave)); + int size = vcpu->arch.guest_fpu.uabi_size; + + u.xsave = memdup_user(argp, size); if (IS_ERR(u.xsave)) { r = PTR_ERR(u.xsave); goto out_nofree; @@ 
-4418,6 +4442,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; } + + case KVM_GET_XSAVE2: { + int size = vcpu->arch.guest_fpu.uabi_size; + + u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT); + r = -ENOMEM; + if (!u.xsave) + break; + + kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size); + + r = -EFAULT; + if (copy_to_user(argp, u.xsave, size)) + break; + + r = 0; + break; + } + case KVM_GET_XCRS: { u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); r = -ENOMEM; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 1b6b8e05868d..1babccf3f772 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1003,6 +1003,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PMU_EVENT_FILTER 173 #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 #define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175 +#define KVM_CAP_XSAVE2 208 #ifdef KVM_CAP_IRQ_ROUTING @@ -1464,6 +1465,9 @@ struct kvm_enc_region { /* Available with KVM_CAP_ARM_SVE */ #define KVM_ARM_VCPU_FINALIZE _IOW(KVMIO, 0xc2, int) +/* Available with KVM_CAP_XSAVE2 */ +#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ -- Gitee From 614380d085036042cd26affdc79ebe496bcfeccb Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 4 Mar 2022 23:28:59 +0800 Subject: [PATCH 0942/2161] kvm: selftests: Add support for KVM_CAP_XSAVE2 commit 415a3c33e847349c0f76575b3ebfdfae2f5a681a upstream. When KVM_CAP_XSAVE2 is supported, userspace is expected to allocate buffer for KVM_GET_XSAVE2 and KVM_SET_XSAVE using the size returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2). Signed-off-by: Wei Wang Signed-off-by: Guang Zeng Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-20-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/arch/x86/include/uapi/asm/kvm.h | 16 ++++- tools/include/uapi/linux/kvm.h | 3 + .../testing/selftests/kvm/include/kvm_util.h | 2 + .../selftests/kvm/include/x86_64/processor.h | 12 ++++ tools/testing/selftests/kvm/lib/kvm_util.c | 32 +++++++++ .../selftests/kvm/lib/x86_64/processor.c | 68 ++++++++++++++++++- .../testing/selftests/kvm/x86_64/evmcs_test.c | 2 +- tools/testing/selftests/kvm/x86_64/smm_test.c | 2 +- .../testing/selftests/kvm/x86_64/state_test.c | 2 +- .../kvm/x86_64/vmx_set_nested_state_test.c | 2 +- 10 files changed, 133 insertions(+), 8 deletions(-) diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 503d3f42da16..e2823aedadc3 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -337,9 +337,23 @@ struct kvm_debugregs { __u64 reserved[9]; }; -/* for KVM_CAP_XSAVE */ +/* for KVM_CAP_XSAVE and KVM_CAP_XSAVE2 */ struct kvm_xsave { + /* + * KVM_GET_XSAVE2 and KVM_SET_XSAVE write and read as many bytes + * as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) + * respectively, when invoked on the vm file descriptor. + * + * The size value returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) + * will always be at least 4096. Currently, it is only greater + * than 4096 if a dynamic feature has been enabled with + * ``arch_prctl()``, but this may change in the future. + * + * The offsets of the state save areas in struct kvm_xsave follow + * the contents of CPUID leaf 0xD on the host. 
+ */ __u32 region[1024]; + __u32 extra[0]; }; #define KVM_MAX_XCRS 16 diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 52641d8ca9e8..e7db8eef2e29 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1000,6 +1000,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PMU_EVENT_FILTER 173 #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 #define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175 +#define KVM_CAP_XSAVE2 207 #ifdef KVM_CAP_IRQ_ROUTING @@ -1406,6 +1407,8 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_XSAVE */ #define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave) #define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave) +/* Available with KVM_CAP_XSAVE2 */ +#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) /* Available with KVM_CAP_XCRS */ #define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) #define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 29cccaf96baf..84fbf8463619 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -66,6 +66,7 @@ enum vm_mem_backing_src_type { }; int kvm_check_cap(long cap); +int vm_check_cap(struct kvm_vm *vm, long cap); int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); @@ -151,6 +152,7 @@ vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_size, void *guest_code); void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code); +void vm_xsave_req_perm(void); bool vm_is_unrestricted_guest(struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index aead07c24afc..69160b7912bb 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -10,6 +10,9 @@ #include #include +#include + +#include #define X86_EFLAGS_FIXED (1u << 1) @@ -307,6 +310,7 @@ struct kvm_x86_state; struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state); +void kvm_x86_state_cleanup(struct kvm_x86_state *state); struct kvm_cpuid2 *kvm_get_supported_cpuid(void); void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, @@ -1086,6 +1090,14 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits); /* VMX_EPT_VPID_CAP bits */ #define VMX_EPT_VPID_CAP_AD_BITS (1ULL << 21) +#define XSTATE_XTILE_CFG_BIT 17 +#define XSTATE_XTILE_DATA_BIT 18 + +#define XSTATE_XTILE_CFG_MASK (1ULL << XSTATE_XTILE_CFG_BIT) +#define XSTATE_XTILE_DATA_MASK (1ULL << XSTATE_XTILE_DATA_BIT) +#define XFEATURE_XTILE_MASK (XSTATE_XTILE_CFG_MASK | \ + XSTATE_XTILE_DATA_MASK) + /* MSR_IA32_VMX_MISC bits */ #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 41cf45416060..5e59f15e60d7 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -62,6 +62,33 @@ int kvm_check_cap(long cap) return ret; } +/* VM Check Capability + * + * Input Args: + * vm - Virtual Machine + * cap - Capability + * + * 
Output Args: None + * + * Return: + * On success, the Value corresponding to the capability (KVM_CAP_*) + * specified by the value of cap. On failure a TEST_ASSERT failure + * is produced. + * + * Looks up and returns the value corresponding to the capability + * (KVM_CAP_*) given by cap. + */ +int vm_check_cap(struct kvm_vm *vm, long cap) +{ + int ret; + + ret = ioctl(vm->fd, KVM_CHECK_EXTENSION, cap); + TEST_ASSERT(ret >= 0, "KVM_CHECK_EXTENSION VM IOCTL failed,\n" + " rc: %i errno: %i", ret, errno); + + return ret; +} + /* VM Enable Capability * * Input Args: @@ -138,6 +165,11 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + /* + * Permission needs to be requested before KVM_SET_CPUID2. + */ + vm_xsave_req_perm(); + vm = calloc(1, sizeof(*vm)); TEST_ASSERT(vm != NULL, "Insufficient Memory"); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 7d8f7fc73646..87509fb2baae 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -641,6 +641,46 @@ static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_m sregs.cr3 = vm->pgd; vcpu_sregs_set(vm, vcpuid, &sregs); } + +#define CPUID_XFD_BIT (1 << 4) +static bool is_xfd_supported(void) +{ + int eax, ebx, ecx, edx; + const int leaf = 0xd, subleaf = 0x1; + + __asm__ __volatile__( + "cpuid" + : /* output */ "=a"(eax), "=b"(ebx), + "=c"(ecx), "=d"(edx) + : /* input */ "0"(leaf), "2"(subleaf)); + + return !!(eax & CPUID_XFD_BIT); +} + +void vm_xsave_req_perm(void) +{ + unsigned long bitmask; + long rc; + + if (!is_xfd_supported()) + return; + + rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, + XSTATE_XTILE_DATA_BIT); + /* + * The older kernel version(<5.15) can't support + * ARCH_REQ_XCOMP_GUEST_PERM and directly return. 
+ */ + if (rc) + return; + + rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask); + TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc); + TEST_ASSERT(bitmask & XFEATURE_XTILE_MASK, + "prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure bitmask=0x%lx", + bitmask); +} + /* Adds a vCPU with reasonable defaults (i.e., a stack) * * Input Args: @@ -987,10 +1027,10 @@ void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) } struct kvm_x86_state { + struct kvm_xsave *xsave; struct kvm_vcpu_events events; struct kvm_mp_state mp_state; struct kvm_regs regs; - struct kvm_xsave xsave; struct kvm_xcrs xcrs; struct kvm_sregs sregs; struct kvm_debugregs debugregs; @@ -1014,6 +1054,22 @@ static int kvm_get_num_msrs(struct kvm_vm *vm) return nmsrs.nmsrs; } +static int vcpu_save_xsave_state(struct kvm_vm *vm, struct vcpu *vcpu, + struct kvm_x86_state *state) +{ + int size; + + size = vm_check_cap(vm, KVM_CAP_XSAVE2); + if (!size) + size = sizeof(struct kvm_xsave); + + state->xsave = malloc(size); + if (size == sizeof(struct kvm_xsave)) + return ioctl(vcpu->fd, KVM_GET_XSAVE, state->xsave); + else + return ioctl(vcpu->fd, KVM_GET_XSAVE2, state->xsave); +} + struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) { struct vcpu *vcpu = vcpu_find(vm, vcpuid); @@ -1057,7 +1113,7 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i", r); - r = ioctl(vcpu->fd, KVM_GET_XSAVE, &state->xsave); + r = vcpu_save_xsave_state(vm, vcpu, state); TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", r); @@ -1102,7 +1158,7 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s struct vcpu *vcpu = vcpu_find(vm, vcpuid); int r; - r = ioctl(vcpu->fd, KVM_SET_XSAVE, &state->xsave); + r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave); TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", r); @@ -1143,6 +1199,12 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s } } +void kvm_x86_state_cleanup(struct kvm_x86_state *state) +{ + free(state->xsave); + free(state); +} + bool is_intel_cpu(void) { int eax, ebx, ecx, edx; diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index 92915e6408e7..92eeedbf0f48 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -138,7 +138,7 @@ int main(int argc, char *argv[]) vcpu_enable_evmcs(vm, VCPU_ID); vcpu_load_state(vm, VCPU_ID, state); run = vcpu_state(vm, VCPU_ID); - free(state); + kvm_x86_state_cleanup(state); memset(®s2, 0, sizeof(regs2)); vcpu_regs_get(vm, VCPU_ID, ®s2); diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index 8c063646f2a0..9cd269ab056d 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -148,7 +148,7 @@ int main(int argc, char *argv[]) vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); vcpu_load_state(vm, VCPU_ID, state); run = vcpu_state(vm, VCPU_ID); - free(state); + kvm_x86_state_cleanup(state); } done: diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index 3ab5ec3da9f4..1f07b57b6591 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -180,7 +180,7 @@ int main(int argc, 
char *argv[]) vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); vcpu_load_state(vm, VCPU_ID, state); run = vcpu_state(vm, VCPU_ID); - free(state); + kvm_x86_state_cleanup(state); memset(®s2, 0, sizeof(regs2)); vcpu_regs_get(vm, VCPU_ID, ®s2); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c index 9ef7fab39d48..7d4d28009dc7 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c @@ -217,7 +217,7 @@ void test_vmx_nested_state(struct kvm_vm *vm) TEST_ASSERT(state->hdr.vmx.vmxon_pa == -1ull, "vmxon_pa must be -1ull."); TEST_ASSERT(state->hdr.vmx.vmcs12_pa == -1ull, "vmcs_pa must be -1ull."); - free(state); + kvm_x86_state_cleanup(state); } int main(int argc, char *argv[]) -- Gitee From c63c82c3ba7c3632cca3c9a5db48fcaa5da43239 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 5 Jan 2022 04:35:31 -0800 Subject: [PATCH 0943/2161] x86/fpu: Provide fpu_sync_guest_vmexit_xfd_state() commit 5429cead01192ff4019ea0b13316268d14fd1ec2 upstream. KVM can disable the write emulation for the XFD MSR when the vCPU's fpstate is already correctly sized to reduce the overhead. When write emulation is disabled the XFD MSR state after a VMEXIT is unknown and therefore not in sync with the software states in fpstate and the per CPU XFD cache. Provide fpu_sync_guest_vmexit_xfd_state() which has to be invoked after a VMEXIT before enabling interrupts when write emulation is disabled for the XFD MSR. It could be invoked unconditionally even when write emulation is enabled for the price of a pointless MSR read. Signed-off-by: Thomas Gleixner Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-21-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 2 ++ arch/x86/kernel/fpu/core.c | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 9decd94198eb..1a5df329f592 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -138,8 +138,10 @@ extern int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatu #ifdef CONFIG_X86_64 extern void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd); +extern void fpu_sync_guest_vmexit_xfd_state(void); #else static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { } +static inline void fpu_sync_guest_vmexit_xfd_state(void) { } #endif extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 99eda6b0e471..31d7dedb386e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -294,6 +294,30 @@ void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) fpregs_unlock(); } EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); + +/** + * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state + * + * Must be invoked from KVM after a VMEXIT before enabling interrupts when + * XFD write emulation is disabled. This is required because the guest can + * freely modify XFD and the state at VMEXIT is not guaranteed to be the + * same as the state on VMENTER. So software state has to be udpated before + * any operation which depends on it can take place. 
+ * + * Note: It can be invoked unconditionally even when write emulation is + * enabled for the price of a then pointless MSR read. + */ +void fpu_sync_guest_vmexit_xfd_state(void) +{ + struct fpstate *fps = current->thread.fpu.fpstate; + + lockdep_assert_irqs_disabled(); + if (fpu_state_size_dynamic()) { + rdmsrl(MSR_IA32_XFD, fps->xfd); + __this_cpu_write(xfd_state, fps->xfd); + } +} +EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); #endif /* CONFIG_X86_64 */ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) -- Gitee From a60a17b4896f68ef82412a309ee770a6f3149081 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Sat, 5 Mar 2022 13:00:07 +0800 Subject: [PATCH 0944/2161] kvm: x86: Disable interception for IA32_XFD on demand commit b5274b1b7ba89fe8ed38cc470041cd6ba0dfb79b upstream. Always intercepting IA32_XFD causes non-negligible overhead when this register is updated frequently in the guest. Disable r/w emulation after intercepting the first WRMSR(IA32_XFD) with a non-zero value. Disable WRMSR emulation implies that IA32_XFD becomes out-of-sync with the software states in fpstate and the per-cpu xfd cache. This leads to two additional changes accordingly: - Call fpu_sync_guest_vmexit_xfd_state() after vm-exit to bring software states back in-sync with the MSR, before handle_exit_irqoff() is called. - Always trap #NM once write interception is disabled for IA32_XFD. The #NM exception is rare if the guest doesn't use dynamic features. Otherwise, there is at most one exception per guest task given a dynamic feature. p.s. We have confirmed that SDM is being revised to say that when setting IA32_XFD[18] the AMX register state is not guaranteed to be preserved. This clarification avoids adding mess for a creative guest which sets IA32_XFD[18]=1 before saving active AMX state to its own storage. Signed-off-by: Kevin Tian Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-22-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/vmx.c | 23 ++++++++++++++++++----- arch/x86/kvm/x86.c | 8 ++++++++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d77b82f016cc..8ea16df9633b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -571,6 +571,7 @@ struct kvm_vcpu_arch { u64 smbase; u64 smi_count; bool tpr_access_reporting; + bool xfd_no_write_intercept; u64 ia32_xss; u64 microcode_version; u64 arch_capabilities; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5979c22cb2cd..aee9804a6351 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -781,10 +781,11 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu) eb |= get_vmcs12(vcpu)->exception_bitmap; /* - * Trap #NM if guest xfd contains a non-zero value so guest XFD_ERR - * can be saved timely. + * Disabling xfd interception indicates that dynamic xfeatures + * might be used in the guest. Always trap #NM in this case + * to save guest xfd_err timely. 
*/ - if (vcpu->arch.guest_fpu.fpstate->xfd) + if (vcpu->arch.xfd_no_write_intercept) eb |= (1u << NM_VECTOR); vmcs_write32(EXCEPTION_BITMAP, eb); @@ -1935,9 +1936,21 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_XFD: ret = kvm_set_msr_common(vcpu, msr_info); - /* Update #NM interception according to guest xfd */ - if (!ret) + /* + * Always intercepting WRMSR could incur non-negligible + * overhead given xfd might be changed frequently in + * guest context switch. Disable write interception + * upon the first write with a non-zero value (indicating + * potential usage on dynamic xfeatures). Also update + * exception bitmap to trap #NM for proper virtualization + * of guest xfd_err. + */ + if (!ret && data) { + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, + MSR_TYPE_RW); + vcpu->arch.xfd_no_write_intercept = true; update_exception_bitmap(vcpu); + } break; #endif case MSR_IA32_SYSENTER_CS: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d0e1b7fc64d3..18da412d05e6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8358,6 +8358,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); + /* + * Sync xfd before calling handle_exit_irqoff() which may + * rely on the fact that guest_fpu::xfd is up-to-date (e.g. + * in #NM irqoff handler). + */ + if (vcpu->arch.xfd_no_write_intercept) + fpu_sync_guest_vmexit_xfd_state(); + kvm_x86_ops->handle_exit_irqoff(vcpu); if (vcpu->arch.guest_fpu.xfd_err) -- Gitee From 95110d77c7cba93a758dded003fad6d370d21f3d Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 27 Aug 2020 20:11:40 +0300 Subject: [PATCH 0945/2161] KVM: SVM: refactor msr permission bitmap allocation commit f4c847a95654b898834ed513870a4fb01c27b29e upstream. Replace svm_vcpu_init_msrpm with svm_vcpu_alloc_msrpm, that also allocates the msr bitmap and add svm_vcpu_free_msrpm to free it. 
This will be used later to move the nested msr permission bitmap allocation to nested.c Signed-off-by: Maxim Levitsky Message-Id: <20200827171145.374620-4-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/svm.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f5a234a19801..7b4bead512e4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1085,18 +1085,29 @@ static void set_msr_interception(u32 *msrpm, unsigned msr, msrpm[offset] = tmp; } -static void svm_vcpu_init_msrpm(u32 *msrpm) +static u32 *svm_vcpu_init_msrpm(void) { int i; + u32 *msrpm; + struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); + if (!pages) + return NULL; + + msrpm = page_address(pages); memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { if (!direct_access_msrs[i].always) continue; - set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1); } + return msrpm; +} + +static void svm_vcpu_free_msrpm(u32 *msrpm) +{ + __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER); } static void add_msr_offset(u32 offset) @@ -2205,9 +2216,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) { struct vcpu_svm *svm; struct page *page; - struct page *msrpm_pages; struct page *hsave_page; - struct page *nested_msrpm_pages; int err; BUILD_BUG_ON_MSG(offsetof(struct vcpu_svm, vcpu) != 0, @@ -2233,21 +2242,13 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (!page) goto uninit; - msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); - if (!msrpm_pages) - goto free_page1; - - nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); - if (!nested_msrpm_pages) - goto free_page2; - hsave_page = alloc_page(GFP_KERNEL_ACCOUNT); if (!hsave_page) - goto free_page3; + goto free_page1; err = avic_init_vcpu(svm); if (err) - goto free_page4; + goto free_page2; /* We initialize this flag to true to make sure that the is_running * bit would be set the first time the vcpu is loaded. @@ -2256,11 +2257,13 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->nested.hsave = page_address(hsave_page); - svm->msrpm = page_address(msrpm_pages); - svm_vcpu_init_msrpm(svm->msrpm); + svm->msrpm = svm_vcpu_init_msrpm(); + if (!svm->msrpm) + goto free_page2; - svm->nested.msrpm = page_address(nested_msrpm_pages); - svm_vcpu_init_msrpm(svm->nested.msrpm); + svm->nested.msrpm = svm_vcpu_init_msrpm(); + if (!svm->nested.msrpm) + goto free_page3; svm->vmcb = page_address(page); clear_page(svm->vmcb); @@ -2272,12 +2275,10 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) return &svm->vcpu; -free_page4: - __free_page(hsave_page); free_page3: - __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); + svm_vcpu_free_msrpm(svm->msrpm); free_page2: - __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); + __free_page(hsave_page); free_page1: __free_page(page); uninit: -- Gitee From 553f0e0fdce7c850e0c90a762be1aed52253a9c9 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Fri, 25 Sep 2020 16:34:18 +0200 Subject: [PATCH 0946/2161] KVM: x86: Prepare MSR bitmaps for userspace tracked MSRs commit 476c9bd8e997b495524500cd82471e59b3aac20e upstream. 
Prepare vmx and svm for a subsequent change that ensures the MSR permission bitmap is set to allow an MSR that userspace is tracking to force a vmx_vmexit in the guest. Signed-off-by: Aaron Lewis Reviewed-by: Oliver Upton [agraf: rebase, adapt SVM scheme to nested changes that came in between] Signed-off-by: Alexander Graf Message-Id: <20200925143422.21718-5-graf@amazon.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/svm.c | 60 ++++++++++++++++------------ arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 83 +++++++++++++++++++-------------------- arch/x86/kvm/vmx/vmx.h | 2 +- 4 files changed, 77 insertions(+), 70 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7b4bead512e4..f6b9f7a16e15 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1040,7 +1040,7 @@ static bool valid_msr_intercept(u32 index) return false; } -static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr) +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) { u8 bit_write; unsigned long tmp; @@ -1059,7 +1059,7 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr) return !!test_bit(bit_write, &tmp); } -static void set_msr_interception(u32 *msrpm, unsigned msr, +static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write) { u8 bit_read, bit_write; @@ -1085,11 +1085,10 @@ static void set_msr_interception(u32 *msrpm, unsigned msr, msrpm[offset] = tmp; } -static u32 *svm_vcpu_init_msrpm(void) +static u32 *svm_vcpu_alloc_msrpm(void) { - int i; - u32 *msrpm; struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); + u32 *msrpm; if (!pages) return NULL; @@ -1097,12 +1096,18 @@ static u32 *svm_vcpu_init_msrpm(void) msrpm = page_address(pages); memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); + return msrpm; +} + +static void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) +{ + int i; + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { if (!direct_access_msrs[i].always) continue; - set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1); + set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1); } - return msrpm; } static void svm_vcpu_free_msrpm(u32 *msrpm) @@ -1153,26 +1158,26 @@ static void init_msrpm_offsets(void) } } -static void svm_enable_lbrv(struct vcpu_svm *svm) +static void svm_enable_lbrv(struct kvm_vcpu *vcpu) { - u32 *msrpm = svm->msrpm; + struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); } -static void svm_disable_lbrv(struct vcpu_svm *svm) +static void svm_disable_lbrv(struct kvm_vcpu *vcpu) { - u32 *msrpm = svm->msrpm; + struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); - 
set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } static void disable_nmi_singlestep(struct vcpu_svm *svm) @@ -2257,14 +2262,19 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->nested.hsave = page_address(hsave_page); - svm->msrpm = svm_vcpu_init_msrpm(); + svm->msrpm = svm_vcpu_alloc_msrpm(); if (!svm->msrpm) goto free_page2; - svm->nested.msrpm = svm_vcpu_init_msrpm(); + svm_vcpu_init_msrpm(vcpu, svm->msrpm); + + svm->nested.msrpm = svm_vcpu_alloc_msrpm(); if (!svm->nested.msrpm) goto free_page3; + /* We only need the L1 pass-through MSR state, so leave vcpu as NULL */ + svm_vcpu_init_msrpm(vcpu, svm->nested.msrpm); + svm->vmcb = page_address(page); clear_page(svm->vmcb); svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT); @@ -4351,7 +4361,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * We update the L1 MSR bit as well since it will end up * touching the MSR anyway now. */ - set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); break; case MSR_IA32_PRED_CMD: if (!msr->host_initiated && @@ -4366,7 +4376,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); - set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr->host_initiated && @@ -4430,9 +4440,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->vmcb->save.dbgctl = data; mark_dirty(svm->vmcb, VMCB_LBR); if (data & (1ULL<<0)) - svm_enable_lbrv(svm); + svm_enable_lbrv(vcpu); else - svm_disable_lbrv(svm); + svm_disable_lbrv(vcpu); break; case MSR_VM_HSAVE_PA: svm->nested.hsave_msr = data; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 7a853b13131c..e9ca618c8ed4 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4410,7 +4410,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (pt_mode == PT_MODE_HOST_GUEST) { vmx->pt_desc.guest.ctl = 0; - pt_update_intercept_for_msr(vmx); + pt_update_intercept_for_msr(vcpu); } return 0; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index aee9804a6351..9d549ce4cb4e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -344,7 +344,7 @@ module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); static bool guest_state_valid(struct kvm_vcpu *vcpu); static u32 vmx_segment_access_rights(struct kvm_segment *var); -static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); void vmx_vmexit(void); @@ -2020,7 +2020,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * in the merging. We update the vmcs01 here for L1 as well * since it will end up touching the MSR anyway now. 
*/ - vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); break; @@ -2049,8 +2049,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * vmcs02.msr_bitmap here since it gets completely overwritten * in the merging. */ - vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, - MSR_TYPE_W); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W); break; case MSR_IA32_CR_PAT: if (!kvm_pat_valid(data)) @@ -2119,7 +2118,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); vmx->pt_desc.guest.ctl = data; - pt_update_intercept_for_msr(vmx); + pt_update_intercept_for_msr(vcpu); break; case MSR_IA32_RTIT_STATUS: if ((pt_mode != PT_MODE_HOST_GUEST) || @@ -3608,9 +3607,11 @@ void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } -static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; int f = sizeof(unsigned long); if (!cpu_has_vmx_msr_bitmap()) @@ -3646,9 +3647,11 @@ static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bit } } -static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; int f = sizeof(unsigned long); if (!cpu_has_vmx_msr_bitmap()) @@ -3684,13 +3687,13 @@ static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitm } } -static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type, bool value) +static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, + u32 msr, int type, bool value) { if (value) - vmx_enable_intercept_for_msr(msr_bitmap, msr, type); + vmx_enable_intercept_for_msr(vcpu, msr, type); else - vmx_disable_intercept_for_msr(msr_bitmap, msr, type); + vmx_disable_intercept_for_msr(vcpu, msr, type); } static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) @@ -3708,8 +3711,8 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) return mode; } -static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, - u8 mode) +static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, + unsigned long *msr_bitmap, u8 mode) { int msr; @@ -3724,11 +3727,11 @@ static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, * TPR reads and writes can be virtualized even if virtual interrupt * delivery is not in use. 
*/ - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { - vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); + vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); } } } @@ -3744,30 +3747,24 @@ void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) return; if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) - vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); + vmx_update_msr_bitmap_x2apic(vcpu, msr_bitmap, mode); vmx->msr_bitmap_mode = mode; } -void pt_update_intercept_for_msr(struct vcpu_vmx *vmx) +void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) { - unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + struct vcpu_vmx *vmx = to_vmx(vcpu); bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); u32 i; - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS, - MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE, - MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK, - MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH, - MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); for (i = 0; i < vmx->pt_desc.addr_range; i++) { - vmx_set_intercept_for_msr(msr_bitmap, - MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, - MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); } } @@ -6796,18 +6793,18 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_msrs; msr_bitmap = vmx->vmcs01.msr_bitmap; - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); if (kvm_cstate_in_guest(kvm)) { - 
vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); } vmx->msr_bitmap_mode = 0; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a80e25495f0f..8bf935e122d3 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -354,7 +354,7 @@ bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); -void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); +void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); #define POSTED_INTR_ON 0 -- Gitee From b06b58175e24aff7bf6e4f96f5bec8ce0e01121b Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 28 Mar 2022 23:07:50 +0800 Subject: [PATCH 0947/2161] KVM: VMX: Fix 'struct kvm_vcpu' reference commit opencloudos. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmx.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9d549ce4cb4e..a359b8d6e542 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6793,18 +6793,18 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_msrs; msr_bitmap = vmx->vmcs01.msr_bitmap; - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); - vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_IA32_TSC, MSR_TYPE_R); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_FS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); if (kvm_cstate_in_guest(kvm)) { - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); + 
vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); } vmx->msr_bitmap_mode = 0; -- Gitee From b264121d87750c0d66090feba87fcb39557673d8 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 29 Mar 2022 15:36:10 +0800 Subject: [PATCH 0948/2161] KVM: SVM: Fix 'vcpu' undeclared in svm_create_vcpu() commit opencloudos. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f6b9f7a16e15..0eaed742c8da 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2266,14 +2266,14 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (!svm->msrpm) goto free_page2; - svm_vcpu_init_msrpm(vcpu, svm->msrpm); + svm_vcpu_init_msrpm(&svm->vcpu, svm->msrpm); svm->nested.msrpm = svm_vcpu_alloc_msrpm(); if (!svm->nested.msrpm) goto free_page3; /* We only need the L1 pass-through MSR state, so leave vcpu as NULL */ - svm_vcpu_init_msrpm(vcpu, svm->nested.msrpm); + svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm); svm->vmcb = page_address(page); clear_page(svm->vmcb); -- Gitee From e47d9703478c012fa551831135a16e269cea94a9 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 23 Dec 2021 09:53:20 -0500 Subject: [PATCH 0949/2161] selftest: kvm: Reorder vcpu_load_state steps for AMX commit 551447cfa5dc208b7fba7aa98391d5cc8149fa5a upstream. For AMX support it is recommended to load XCR0 after XFD, so that KVM does not see XFD=0, XCR=1 for a save state that will eventually be disabled (which would lead to premature allocation of the space required for that save state). It is also required to load XSAVE data after XCR0 and XFD, so that KVM can trigger allocation of the extra space required to store AMX state. Adjust vcpu_load_state to obey these new requirements. Signed-off-by: Paolo Bonzini Signed-off-by: Yang Zhong Message-Id: <20211223145322.2914028-2-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../selftests/kvm/lib/x86_64/processor.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 87509fb2baae..4aa8e92fd829 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1158,24 +1158,25 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s struct vcpu *vcpu = vcpu_find(vm, vcpuid); int r; - r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", + r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i", r); + r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs); + TEST_ASSERT(r == state->msrs.nmsrs, + "Unexpected result from KVM_SET_MSRS, r: %i (failed at %x)", + r, r == state->msrs.nmsrs ? 
-1 : state->msrs.entries[r].index); + if (kvm_check_cap(KVM_CAP_XCRS)) { r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs); TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i", r); } - r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i", + r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", r); - r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs); - TEST_ASSERT(r == state->msrs.nmsrs, "Unexpected result from KVM_SET_MSRS, r: %i (failed at %x)", - r, r == state->msrs.nmsrs ? -1 : state->msrs.entries[r].index); - r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events); TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i", r); -- Gitee From 8429046f075ff92d08badb603c2f1c7129faad9b Mon Sep 17 00:00:00 2001 From: Yang Zhong Date: Thu, 23 Dec 2021 09:53:21 -0500 Subject: [PATCH 0950/2161] selftest: kvm: Move struct kvm_x86_state to header commit 6559b4a523cd65f6005b4592833b16ba970abdf5 upstream. Those changes can avoid dereferencing pointer compile issue when amx_test.c reference state->xsave. Move struct kvm_x86_state definition to processor.h. Signed-off-by: Yang Zhong Message-Id: <20211223145322.2914028-3-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../selftests/kvm/include/x86_64/processor.h | 16 +++++++++++++++- .../testing/selftests/kvm/lib/x86_64/processor.c | 15 --------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 69160b7912bb..22b305e0f59b 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -71,6 +71,21 @@ struct desc_ptr { uint64_t address; } __attribute__((packed)); +struct kvm_x86_state { + struct kvm_xsave *xsave; + struct kvm_vcpu_events events; + struct kvm_mp_state mp_state; + struct kvm_regs regs; + struct kvm_xcrs xcrs; + struct kvm_sregs sregs; + struct kvm_debugregs debugregs; + union { + struct kvm_nested_state nested; + char nested_[16384]; + }; + struct kvm_msrs msrs; +}; + static inline uint64_t get_desc64_base(const struct desc64 *desc) { return ((uint64_t)desc->base3 << 32) | @@ -306,7 +321,6 @@ static inline unsigned long get_xmm(int n) bool is_intel_cpu(void); -struct kvm_x86_state; struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 4aa8e92fd829..ae8576f27f66 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1026,21 +1026,6 @@ void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) sregs_dump(stream, &sregs, indent + 4); } -struct kvm_x86_state { - struct kvm_xsave *xsave; - struct kvm_vcpu_events events; - struct kvm_mp_state mp_state; - struct kvm_regs regs; - struct kvm_xcrs xcrs; - struct kvm_sregs sregs; - struct kvm_debugregs debugregs; - union { - struct kvm_nested_state nested; - char nested_[16384]; - }; - struct kvm_msrs msrs; -}; - static int kvm_get_num_msrs(struct kvm_vm *vm) { struct kvm_msr_list nmsrs; -- Gitee From 6750ae2212bdc7179a9ecb195262e931cbb7057d Mon Sep 17 00:00:00 2001 
From: Yang Zhong Date: Thu, 23 Dec 2021 09:53:22 -0500 Subject: [PATCH 0951/2161] selftest: kvm: Add amx selftest commit bf70636d9443c9e0718fd98765ba634e631ed079 upstream. This selftest covers two aspects of AMX. The first is triggering #NM exception and checking the MSR XFD_ERR value. The second case is loading tile config and tile data into guest registers and trapping to the host side for a complete save/load of the guest state. TMM0 is also checked against memory data after save/restore. Signed-off-by: Yang Zhong Message-Id: <20211223145322.2914028-4-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/kvm/Makefile | 1 + tools/testing/selftests/kvm/x86_64/amx_test.c | 448 ++++++++++++++++++ 2 files changed, 449 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/amx_test.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index c5ec868fa1e5..df8494d5a468 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -25,6 +25,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test +TEST_GEN_PROGS_x86_64 += x86_64/amx_test TEST_GEN_PROGS_x86_64 += clear_dirty_log_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c new file mode 100644 index 000000000000..523c1e99ed64 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -0,0 +1,448 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * amx tests + * + * Copyright (C) 2021, Intel, Inc. + * + * Tests for amx #NM exception and save/restore. 
+ */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#ifndef __x86_64__ +# error This test is 64-bit only +#endif + +#define VCPU_ID 0 +#define X86_FEATURE_XSAVE (1 << 26) +#define X86_FEATURE_OSXSAVE (1 << 27) + +#define PAGE_SIZE (1 << 12) +#define NUM_TILES 8 +#define TILE_SIZE 1024 +#define XSAVE_SIZE ((NUM_TILES * TILE_SIZE) + PAGE_SIZE) + +/* Tile configuration associated: */ +#define MAX_TILES 16 +#define RESERVED_BYTES 14 + +#define XFEATURE_XTILECFG 17 +#define XFEATURE_XTILEDATA 18 +#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG) +#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA) +#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA) + +#define TILE_CPUID 0x1d +#define XSTATE_CPUID 0xd +#define TILE_PALETTE_CPUID_SUBLEAVE 0x1 +#define XSTATE_USER_STATE_SUBLEAVE 0x0 + +#define XSAVE_HDR_OFFSET 512 + +struct xsave_data { + u8 area[XSAVE_SIZE]; +} __aligned(64); + +struct tile_config { + u8 palette_id; + u8 start_row; + u8 reserved[RESERVED_BYTES]; + u16 colsb[MAX_TILES]; + u8 rows[MAX_TILES]; +}; + +struct tile_data { + u8 data[NUM_TILES * TILE_SIZE]; +}; + +struct xtile_info { + u16 bytes_per_tile; + u16 bytes_per_row; + u16 max_names; + u16 max_rows; + u32 xsave_offset; + u32 xsave_size; +}; + +static struct xtile_info xtile; + +static inline u64 __xgetbv(u32 index) +{ + u32 eax, edx; + + asm volatile("xgetbv;" + : "=a" (eax), "=d" (edx) + : "c" (index)); + return eax + ((u64)edx << 32); +} + +static inline void __xsetbv(u32 index, u64 value) +{ + u32 eax = value; + u32 edx = value >> 32; + + asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); +} + +static inline void __ldtilecfg(void *cfg) +{ + asm volatile(".byte 0xc4,0xe2,0x78,0x49,0x00" + : : "a"(cfg)); +} + +static inline void __tileloadd(void *tile) +{ + asm volatile(".byte 0xc4,0xe2,0x7b,0x4b,0x04,0x10" + : : "a"(tile), "d"(0)); +} + +static inline void __tilerelease(void) +{ + asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0" ::); +} + +static inline void __xsavec(struct xsave_data *data, uint64_t rfbm) +{ + uint32_t rfbm_lo = rfbm; + uint32_t rfbm_hi = rfbm >> 32; + + asm volatile("xsavec (%%rdi)" + : : "D" (data), "a" (rfbm_lo), "d" (rfbm_hi) + : "memory"); +} + +static inline void check_cpuid_xsave(void) +{ + uint32_t eax, ebx, ecx, edx; + + eax = 1; + ecx = 0; + cpuid(&eax, &ebx, &ecx, &edx); + if (!(ecx & X86_FEATURE_XSAVE)) + GUEST_ASSERT(!"cpuid: no CPU xsave support!"); + if (!(ecx & X86_FEATURE_OSXSAVE)) + GUEST_ASSERT(!"cpuid: no OS xsave support!"); +} + +static bool check_xsave_supports_xtile(void) +{ + return __xgetbv(0) & XFEATURE_MASK_XTILE; +} + +static bool enum_xtile_config(void) +{ + u32 eax, ebx, ecx, edx; + + eax = TILE_CPUID; + ecx = TILE_PALETTE_CPUID_SUBLEAVE; + + cpuid(&eax, &ebx, &ecx, &edx); + if (!eax || !ebx || !ecx) + return false; + + xtile.max_names = ebx >> 16; + if (xtile.max_names < NUM_TILES) + return false; + + xtile.bytes_per_tile = eax >> 16; + if (xtile.bytes_per_tile < TILE_SIZE) + return false; + + xtile.bytes_per_row = ebx; + xtile.max_rows = ecx; + + return true; +} + +static bool enum_xsave_tile(void) +{ + u32 eax, ebx, ecx, edx; + + eax = XSTATE_CPUID; + ecx = XFEATURE_XTILEDATA; + + cpuid(&eax, &ebx, &ecx, &edx); + if (!eax || !ebx) + return false; + + xtile.xsave_offset = ebx; + xtile.xsave_size = eax; + + return true; +} + +static bool 
check_xsave_size(void) +{ + u32 eax, ebx, ecx, edx; + bool valid = false; + + eax = XSTATE_CPUID; + ecx = XSTATE_USER_STATE_SUBLEAVE; + + cpuid(&eax, &ebx, &ecx, &edx); + if (ebx && ebx <= XSAVE_SIZE) + valid = true; + + return valid; +} + +static bool check_xtile_info(void) +{ + bool ret = false; + + if (!check_xsave_size()) + return ret; + + if (!enum_xsave_tile()) + return ret; + + if (!enum_xtile_config()) + return ret; + + if (sizeof(struct tile_data) >= xtile.xsave_size) + ret = true; + + return ret; +} + +static void set_tilecfg(struct tile_config *cfg) +{ + int i; + + /* Only palette id 1 */ + cfg->palette_id = 1; + for (i = 0; i < xtile.max_names; i++) { + cfg->colsb[i] = xtile.bytes_per_row; + cfg->rows[i] = xtile.max_rows; + } +} + +static void set_xstatebv(void *data, uint64_t bv) +{ + *(uint64_t *)(data + XSAVE_HDR_OFFSET) = bv; +} + +static u64 get_xstatebv(void *data) +{ + return *(u64 *)(data + XSAVE_HDR_OFFSET); +} + +static void init_regs(void) +{ + uint64_t cr4, xcr0; + + /* turn on CR4.OSXSAVE */ + cr4 = get_cr4(); + cr4 |= X86_CR4_OSXSAVE; + set_cr4(cr4); + + xcr0 = __xgetbv(0); + xcr0 |= XFEATURE_MASK_XTILE; + __xsetbv(0x0, xcr0); +} + +static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, + struct tile_data *tiledata, + struct xsave_data *xsave_data) +{ + init_regs(); + check_cpuid_xsave(); + GUEST_ASSERT(check_xsave_supports_xtile()); + GUEST_ASSERT(check_xtile_info()); + + /* check xtile configs */ + GUEST_ASSERT(xtile.xsave_offset == 2816); + GUEST_ASSERT(xtile.xsave_size == 8192); + GUEST_ASSERT(xtile.max_names == 8); + GUEST_ASSERT(xtile.bytes_per_tile == 1024); + GUEST_ASSERT(xtile.bytes_per_row == 64); + GUEST_ASSERT(xtile.max_rows == 16); + GUEST_SYNC(1); + + /* xfd=0, enable amx */ + wrmsr(MSR_IA32_XFD, 0); + GUEST_SYNC(2); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == 0); + set_tilecfg(amx_cfg); + __ldtilecfg(amx_cfg); + GUEST_SYNC(3); + /* Check save/restore when trap to userspace */ + __tileloadd(tiledata); + GUEST_SYNC(4); + __tilerelease(); + GUEST_SYNC(5); + /* bit 18 not in the XCOMP_BV after xsavec() */ + set_xstatebv(xsave_data, XFEATURE_MASK_XTILEDATA); + __xsavec(xsave_data, XFEATURE_MASK_XTILEDATA); + GUEST_ASSERT((get_xstatebv(xsave_data) & XFEATURE_MASK_XTILEDATA) == 0); + + /* xfd=0x40000, disable amx tiledata */ + wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILEDATA); + GUEST_SYNC(6); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILEDATA); + set_tilecfg(amx_cfg); + __ldtilecfg(amx_cfg); + /* Trigger #NM exception */ + __tileloadd(tiledata); + GUEST_SYNC(10); + + GUEST_DONE(); +} + +void guest_nm_handler(struct ex_regs *regs) +{ + /* Check if #NM is triggered by XFEATURE_MASK_XTILEDATA */ + GUEST_SYNC(7); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILEDATA); + GUEST_SYNC(8); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILEDATA); + /* Clear xfd_err */ + wrmsr(MSR_IA32_XFD_ERR, 0); + /* xfd=0, enable amx */ + wrmsr(MSR_IA32_XFD, 0); + GUEST_SYNC(9); +} + +int main(int argc, char *argv[]) +{ + struct kvm_cpuid_entry2 *entry; + struct kvm_regs regs1, regs2; + bool amx_supported = false; + struct kvm_vm *vm; + struct kvm_run *run; + struct kvm_x86_state *state; + int xsave_restore_size = 0; + vm_vaddr_t amx_cfg, tiledata, xsavedata; + struct ucall uc; + u32 amx_offset; + int stage, ret; + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + + entry = kvm_get_supported_cpuid_entry(1); + if (!(entry->ecx & X86_FEATURE_XSAVE)) { + print_skip("XSAVE feature not supported"); + 
exit(KSFT_SKIP); + } + + if (kvm_get_cpuid_max_basic() >= 0xd) { + entry = kvm_get_supported_cpuid_index(0xd, 0); + amx_supported = entry && !!(entry->eax & XFEATURE_MASK_XTILE); + if (!amx_supported) { + print_skip("AMX is not supported by the vCPU (eax=0x%x)", entry->eax); + exit(KSFT_SKIP); + } + /* Get xsave/restore max size */ + xsave_restore_size = entry->ecx; + } + + run = vcpu_state(vm, VCPU_ID); + vcpu_regs_get(vm, VCPU_ID, ®s1); + + /* Register #NM handler */ + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler); + + /* amx cfg for guest_code */ + amx_cfg = vm_vaddr_alloc_page(vm); + memset(addr_gva2hva(vm, amx_cfg), 0x0, getpagesize()); + + /* amx tiledata for guest_code */ + tiledata = vm_vaddr_alloc_pages(vm, 2); + memset(addr_gva2hva(vm, tiledata), rand() | 1, 2 * getpagesize()); + + /* xsave data for guest_code */ + xsavedata = vm_vaddr_alloc_pages(vm, 3); + memset(addr_gva2hva(vm, xsavedata), 0, 3 * getpagesize()); + vcpu_args_set(vm, VCPU_ID, 3, amx_cfg, tiledata, xsavedata); + + for (stage = 1; ; stage++) { + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Stage %d: unexpected exit reason: %u (%s),\n", + stage, run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + /* NOT REACHED */ + case UCALL_SYNC: + switch (uc.args[1]) { + case 1: + case 2: + case 3: + case 5: + case 6: + case 7: + case 8: + fprintf(stderr, "GUEST_SYNC(%ld)\n", uc.args[1]); + break; + case 4: + case 10: + fprintf(stderr, + "GUEST_SYNC(%ld), check save/restore status\n", uc.args[1]); + + /* Compacted mode, get amx offset by xsave area + * size subtract 8K amx size. + */ + amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE; + state = vcpu_save_state(vm, VCPU_ID); + void *amx_start = (void *)state->xsave + amx_offset; + void *tiles_data = (void *)addr_gva2hva(vm, tiledata); + /* Only check TMM0 register, 1 tile */ + ret = memcmp(amx_start, tiles_data, TILE_SIZE); + TEST_ASSERT(ret == 0, "memcmp failed, ret=%d\n", ret); + kvm_x86_state_cleanup(state); + break; + case 9: + fprintf(stderr, + "GUEST_SYNC(%ld), #NM exception and enable amx\n", uc.args[1]); + break; + } + break; + case UCALL_DONE: + fprintf(stderr, "UCALL_DONE\n"); + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + state = vcpu_save_state(vm, VCPU_ID); + memset(®s1, 0, sizeof(regs1)); + vcpu_regs_get(vm, VCPU_ID, ®s1); + + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + kvm_vm_restart(vm, O_RDWR); + vm_vcpu_add(vm, VCPU_ID); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vcpu_load_state(vm, VCPU_ID, state); + run = vcpu_state(vm, VCPU_ID); + kvm_x86_state_cleanup(state); + + memset(®s2, 0, sizeof(regs2)); + vcpu_regs_get(vm, VCPU_ID, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); + } +done: + kvm_vm_free(vm); +} -- Gitee From 9a61d5fcebddf057ed6e868432f7e0526f504731 Mon Sep 17 00:00:00 2001 From: Yang Zhong Date: Thu, 13 Jan 2022 13:08:25 -0500 Subject: [PATCH 0952/2161] x86/fpu: Fix inline prefix warnings commit c862dcd199759d4a45e65dab47b03e3e8a144e3a upstream. Fix sparse warnings in xstate and remove inline prefix. 
Fixes: 980fe2fddcff ("x86/fpu: Extend fpu_xstate_prctl() with guest permissions") Signed-off-by: Yang Zhong Reported-by: kernel test robot Message-Id: <20220113180825.322333-1-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 2 +- arch/x86/kernel/fpu/xstate.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 1a5df329f592..febe35e181c2 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -128,7 +128,7 @@ static inline void fpstate_free(struct fpu *fpu) { } /* fpstate-related functions which are exported to KVM */ extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature); -extern inline u64 xstate_get_guest_group_perm(void); +extern u64 xstate_get_guest_group_perm(void); /* KVM specific functions */ extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 07e6c7ed4803..f25e4605b6a5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1775,7 +1775,7 @@ static inline int xstate_request_perm(unsigned long idx, bool guest) } #endif /* !CONFIG_X86_64 */ -inline u64 xstate_get_guest_group_perm(void) +u64 xstate_get_guest_group_perm(void) { return xstate_get_group_perm(true); } -- Gitee From 4903d57b8100bfcd93c76130961ce11f75b82fbd Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 17 Jan 2022 20:48:17 -0500 Subject: [PATCH 0953/2161] kvm: selftests: conditionally build vm_xsave_req_perm() commit 1a1d1dbce6d5477e2bb08ce1ef0d77caa838cc8e upstream. vm_xsave_req_perm() is currently defined and used by x86_64 only. Make it compiled into vm_create_with_vcpus() only when on x86_64 machines. Otherwise, it would cause linkage errors, e.g. on s390x. Fixes: 415a3c33e8 ("kvm: selftests: Add support for KVM_CAP_XSAVE2") Reported-by: Janis Schoetterl-Glausch Signed-off-by: Wei Wang Tested-by: Janis Schoetterl-Glausch Message-Id: <20220118014817.30910-1-wei.w.wang@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/kvm/lib/kvm_util.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 5e59f15e60d7..a802ca9e75da 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -165,10 +165,12 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode)); +#ifdef __x86_64__ /* * Permission needs to be requested before KVM_SET_CPUID2. */ vm_xsave_req_perm(); +#endif vm = calloc(1, sizeof(*vm)); TEST_ASSERT(vm != NULL, "Insufficient Memory"); -- Gitee From 72f04d3a348a7c09ac94801d021e13676c9a48d9 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 26 Jan 2022 07:44:34 -0500 Subject: [PATCH 0954/2161] selftests: kvm: move vm_xsave_req_perm call to amx_test commit dd4516aee365fc9c944c9d6036b6b87363398680 upstream. There is no need for tests other than amx_test to enable dynamic xsave states. Remove the call to vm_xsave_req_perm from generic code, and move it inside the test. While at it, allow customizing the bit that is requested, so that future tests can use it differently. 
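For illustration only (not part of the applied diff), the per-test permission request boils down to the arch_prctl sequence below; the helper name req_guest_xstate and the minimal error handling are assumptions, and the sketch presumes kernel headers that already carry the guest-permission arch_prctls from this series.

#include <unistd.h>
#include <sys/syscall.h>
#include <asm/prctl.h>  /* ARCH_REQ_XCOMP_GUEST_PERM, ARCH_GET_XCOMP_GUEST_PERM */

/* Illustrative sketch: request one dynamic xstate bit (e.g. 18 for AMX tile
 * data) for guests and verify the kernel granted it.  Must run before the
 * vCPU CPUID is set, i.e. before KVM_SET_CPUID2.
 */
static int req_guest_xstate(int bit)
{
        unsigned long long mask = 0;

        if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit))
                return -1;      /* older kernel: no dynamic guest xstates */
        if (syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask))
                return -1;
        return (mask & (1ULL << bit)) ? 0 : -1;
}
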
Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/kvm/include/kvm_util.h | 1 - .../testing/selftests/kvm/include/x86_64/processor.h | 1 + tools/testing/selftests/kvm/lib/kvm_util.c | 7 ------- tools/testing/selftests/kvm/lib/x86_64/processor.c | 12 ++++++------ tools/testing/selftests/kvm/x86_64/amx_test.c | 2 ++ 5 files changed, 9 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 84fbf8463619..c34bc4e9e731 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -152,7 +152,6 @@ vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_size, void *guest_code); void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code); -void vm_xsave_req_perm(void); bool vm_is_unrestricted_guest(struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 22b305e0f59b..555d96d6ffe9 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -345,6 +345,7 @@ void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, uint32_t kvm_get_cpuid_max(void); void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits); +void vm_xsave_req_perm(int bit); /* * Basic CPU control in CR0 diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index a802ca9e75da..87b1971b8e22 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -165,13 +165,6 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode)); -#ifdef __x86_64__ - /* - * Permission needs to be requested before KVM_SET_CPUID2. - */ - vm_xsave_req_perm(); -#endif - vm = calloc(1, sizeof(*vm)); TEST_ASSERT(vm != NULL, "Insufficient Memory"); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index ae8576f27f66..f6e33c91934c 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -657,16 +657,16 @@ static bool is_xfd_supported(void) return !!(eax & CPUID_XFD_BIT); } -void vm_xsave_req_perm(void) +void vm_xsave_req_perm(int bit) { - unsigned long bitmask; + u64 bitmask; long rc; if (!is_xfd_supported()) - return; + exit(KSFT_SKIP); + + rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit); - rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, - XSTATE_XTILE_DATA_BIT); /* * The older kernel version(<5.15) can't support * ARCH_REQ_XCOMP_GUEST_PERM and directly return. 
@@ -676,7 +676,7 @@ void vm_xsave_req_perm(void) rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask); TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc); - TEST_ASSERT(bitmask & XFEATURE_XTILE_MASK, + TEST_ASSERT(bitmask & (1ULL << bit), "prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure bitmask=0x%lx", bitmask); } diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index 523c1e99ed64..52a3ef6629e8 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -329,6 +329,8 @@ int main(int argc, char *argv[]) u32 amx_offset; int stage, ret; + vm_xsave_req_perm(XSTATE_XTILE_DATA_BIT); + /* Create VM */ vm = vm_create_default(VCPU_ID, 0, guest_code); -- Gitee From 38aaae994d5f5b6546d8664edd762c698a74584c Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 7 Mar 2022 11:33:35 +0800 Subject: [PATCH 0955/2161] KVM: x86: add system attribute to retrieve full set of supported xsave states commit dd6e631220181162478984d2d46dd979e04d8e75 upstream. Because KVM_GET_SUPPORTED_CPUID is meant to be passed (by simple-minded VMMs) to KVM_SET_CPUID2, it cannot include any dynamic xsave states that have not been enabled. Probing those, for example so that they can be passed to ARCH_REQ_XCOMP_GUEST_PERM, requires a new ioctl or arch_prctl. The latter is in fact worse, even though that is what the rest of the API uses, because it would require supported_xcr0 to be moved from the KVM module to the kernel just for this use. In addition, the value would be nonsensical (or an error would have to be returned) until the KVM module is loaded in. Therefore, to limit the growth of system ioctls, add a /dev/kvm variant of KVM_{GET,HAS}_DEVICE_ATTR, and implement it in x86 with just one group (0) and attribute (KVM_X86_XCOMP_GUEST_SUPP). 
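As a hedged usage sketch (mirroring the selftest change later in this series, not a prescribed VMM implementation), userspace can probe the full guest-supported xstate mask from /dev/kvm roughly as follows; probe_guest_xcomp is an illustrative name and error handling is reduced to the minimum.

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>  /* KVM_GET_DEVICE_ATTR, KVM_X86_XCOMP_GUEST_SUPP */

/* Illustrative sketch: returns the supported guest xcomp mask, or 0 if the
 * system attribute (or the system-scope ioctl itself) is not available.
 */
static uint64_t probe_guest_xcomp(void)
{
        uint64_t supp = 0;
        struct kvm_device_attr attr = {
                .group = 0,
                .attr  = KVM_X86_XCOMP_GUEST_SUPP,
                .addr  = (uint64_t)(unsigned long)&supp,
        };
        int kvm = open("/dev/kvm", O_RDWR);

        /* ENXIO/EINVAL here means the attribute is not known to this kernel. */
        if (kvm < 0 || ioctl(kvm, KVM_GET_DEVICE_ATTR, &attr))
                supp = 0;
        if (kvm >= 0)
                close(kvm);
        return supp;
}
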
Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/virt/kvm/api.txt | 2 ++ arch/x86/include/uapi/asm/kvm.h | 3 ++ arch/x86/kvm/x86.c | 51 +++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 4 files changed, 57 insertions(+) diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt index a2cf8c388fd7..57bafd45137b 100644 --- a/Documentation/virt/kvm/api.txt +++ b/Documentation/virt/kvm/api.txt @@ -2731,6 +2731,7 @@ struct kvm_create_device { Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, KVM_CAP_VCPU_ATTRIBUTES for vcpu device + KVM_CAP_SYS_ATTRIBUTES for system (/dev/kvm) device (no set) Type: device ioctl, vm ioctl, vcpu ioctl Parameters: struct kvm_device_attr Returns: 0 on success, -1 on error @@ -2759,6 +2760,7 @@ struct kvm_device_attr { Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, KVM_CAP_VCPU_ATTRIBUTES for vcpu device + KVM_CAP_SYS_ATTRIBUTES for system (/dev/kvm) device Type: device ioctl, vm ioctl, vcpu ioctl Parameters: struct kvm_device_attr Returns: 0 on success, -1 on error diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index e2823aedadc3..58204fa6dd75 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -410,6 +410,9 @@ struct kvm_sync_regs { #define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000 +/* attributes for system fd (group 0) */ +#define KVM_X86_XCOMP_GUEST_SUPP 0 + struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 18da412d05e6..1f5dbf07f957 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3435,6 +3435,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: + case KVM_CAP_SYS_ATTRIBUTES: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -3512,6 +3513,40 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) } +static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) +{ + u64 __user *uaddr = kvm_get_attr_addr(attr); + + if (attr->group) + return -ENXIO; + + if (IS_ERR(uaddr)) + return PTR_ERR(uaddr); + + switch (attr->attr) { + case KVM_X86_XCOMP_GUEST_SUPP: + if (put_user(kvm_supported_xcr0(), uaddr)) + return -EFAULT; + return 0; + default: + return -ENXIO; + break; + } +} + +static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr) +{ + if (attr->group) + return -ENXIO; + + switch (attr->attr) { + case KVM_X86_XCOMP_GUEST_SUPP: + return 0; + default: + return -ENXIO; + } +} + long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -3598,6 +3633,22 @@ long kvm_arch_dev_ioctl(struct file *filp, r = msr_io(NULL, argp, do_get_msr_feature, 1); break; } + case KVM_GET_DEVICE_ATTR: { + struct kvm_device_attr attr; + r = -EFAULT; + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) + break; + r = kvm_x86_dev_get_attr(&attr); + break; + } + case KVM_HAS_DEVICE_ATTR: { + struct kvm_device_attr attr; + r = -EFAULT; + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) + break; + r = kvm_x86_dev_has_attr(&attr); + break; + } default: r = -EINVAL; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 1babccf3f772..81caac385bce 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1004,6 +1004,7 @@ struct kvm_ppc_resize_hpt { #define 
KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 #define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175 #define KVM_CAP_XSAVE2 208 +#define KVM_CAP_SYS_ATTRIBUTES 209 #ifdef KVM_CAP_IRQ_ROUTING -- Gitee From 898661b23b02e7bad9cc3075be91743637bb3c77 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 7 Mar 2022 12:11:16 +0800 Subject: [PATCH 0956/2161] kvm: selftests: sync uapi/linux/kvm.h with Linux header commit fa68118144c63e292628945c5b8feb16b84fea7d upstream. KVM_CAP_XSAVE2 is out of sync due to a conflict. Copy the whole file while at it. Reported-by: Yang Zhong Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/include/uapi/linux/kvm.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index e7db8eef2e29..c84595087d3d 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1000,7 +1000,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PMU_EVENT_FILTER 173 #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 #define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175 -#define KVM_CAP_XSAVE2 207 +#define KVM_CAP_VM_GPA_BITS 207 +#define KVM_CAP_XSAVE2 208 #ifdef KVM_CAP_IRQ_ROUTING @@ -1032,11 +1033,20 @@ struct kvm_irq_routing_hv_sint { __u32 sint; }; +struct kvm_irq_routing_xen_evtchn { + __u32 port; + __u32 vcpu; + __u32 priority; +}; + +#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1)) + /* gsi routing entry types */ #define KVM_IRQ_ROUTING_IRQCHIP 1 #define KVM_IRQ_ROUTING_MSI 2 #define KVM_IRQ_ROUTING_S390_ADAPTER 3 #define KVM_IRQ_ROUTING_HV_SINT 4 +#define KVM_IRQ_ROUTING_XEN_EVTCHN 5 struct kvm_irq_routing_entry { __u32 gsi; @@ -1048,6 +1058,7 @@ struct kvm_irq_routing_entry { struct kvm_irq_routing_msi msi; struct kvm_irq_routing_s390_adapter adapter; struct kvm_irq_routing_hv_sint hv_sint; + struct kvm_irq_routing_xen_evtchn xen_evtchn; __u32 pad[8]; } u; }; @@ -1074,6 +1085,8 @@ struct kvm_x86_mce { #endif #ifdef KVM_CAP_XEN_HVM +#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) + struct kvm_xen_hvm_config { __u32 flags; __u32 msr; @@ -1407,8 +1420,6 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_XSAVE */ #define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave) #define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave) -/* Available with KVM_CAP_XSAVE2 */ -#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) /* Available with KVM_CAP_XCRS */ #define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) #define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) @@ -1464,6 +1475,9 @@ struct kvm_enc_region { /* Available with KVM_CAP_ARM_SVE */ #define KVM_ARM_VCPU_FINALIZE _IOW(KVMIO, 0xc2, int) +/* Available with KVM_CAP_XSAVE2 */ +#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ -- Gitee From edccaf3d8a09feeb9dfea9011f4a6efd6bbd27c9 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 26 Jan 2022 07:51:00 -0500 Subject: [PATCH 0957/2161] selftests: kvm: check dynamic bits against KVM_X86_XCOMP_GUEST_SUPP commit b19c99b9f4486f23e3b7248dd4ce3d83e19b9032 upstream. Provide coverage for the new API. 
Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/arch/x86/include/uapi/asm/kvm.h | 3 +++ tools/include/uapi/linux/kvm.h | 1 + .../testing/selftests/kvm/lib/x86_64/processor.c | 15 +++++++++++++++ 3 files changed, 19 insertions(+) diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index e2823aedadc3..58204fa6dd75 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -410,6 +410,9 @@ struct kvm_sync_regs { #define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000 +/* attributes for system fd (group 0) */ +#define KVM_X86_XCOMP_GUEST_SUPP 0 + struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index c84595087d3d..1e33f90c3d9a 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1002,6 +1002,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175 #define KVM_CAP_VM_GPA_BITS 207 #define KVM_CAP_XSAVE2 208 +#define KVM_CAP_SYS_ATTRIBUTES 209 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index f6e33c91934c..88afa1998adb 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -659,8 +659,23 @@ static bool is_xfd_supported(void) void vm_xsave_req_perm(int bit) { + int kvm_fd; u64 bitmask; long rc; + struct kvm_device_attr attr = { + .group = 0, + .attr = KVM_X86_XCOMP_GUEST_SUPP, + .addr = (unsigned long) &bitmask + }; + + kvm_fd = open_kvm_dev_path_or_exit(); + rc = ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr); + close(kvm_fd); + if (rc == -1 && (errno == ENXIO || errno == EINVAL)) + exit(KSFT_SKIP); + TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc); + if (!(bitmask & (1ULL << bit))) + exit(KSFT_SKIP); if (!is_xfd_supported()) exit(KSFT_SKIP); -- Gitee From fb651dc140c13c0ffe497aa4b87dbd2507b47774 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 7 Mar 2022 16:12:28 +0800 Subject: [PATCH 0958/2161] KVM: x86: Add a helper to retrieve userspace address from kvm_device_attr commit 56f289a8d23addfa4408a08f07f42fcfe2a7bd69 upstream. Add a helper to handle converting the u64 userspace address embedded in struct kvm_device_attr into a userspace pointer, it's all too easy to forget the intermediate "unsigned long" cast as well as the truncation check. No functional change intended. 
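A hedged caller-side sketch of the pattern the helper enforces, in the same shape as kvm_x86_dev_get_attr earlier in this series; the handler name and the reported value are illustrative only, and the snippet assumes the kernel context of the patched file.

/* Illustrative attribute handler, not part of the applied patch. */
static int example_dev_get_attr(struct kvm_device_attr *attr)
{
        u64 __user *uaddr = kvm_get_attr_addr(attr);

        if (IS_ERR(uaddr))
                return PTR_ERR(uaddr);  /* addr did not survive the cast */

        /* Report a 64-bit value (0 here as a placeholder) to userspace. */
        return put_user((u64)0, uaddr) ? -EFAULT : 0;
}
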
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1f5dbf07f957..7ff9a5f6903e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3547,6 +3547,15 @@ static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr) } } +static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr) +{ + void __user *uaddr = (void __user*)(unsigned long)attr->addr; + + if ((u64)(unsigned long)uaddr != attr->addr) + return ERR_PTR(-EFAULT); + return uaddr; +} + long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { -- Gitee From 195839c577dcf129c766cdda835e4f687579a822 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 7 Mar 2022 16:55:12 +0800 Subject: [PATCH 0959/2161] KVM: x86: Use ERR_PTR_USR() to return -EFAULT as a __user pointer commit 6e37ec8825a113bc2dd1b280be10e5ac6eb4f6b1 upstream. Use ERR_PTR_USR() when returning -EFAULT from kvm_get_attr_addr(), sparse complains about implicitly casting the kernel pointer from ERR_PTR() into a __user pointer. >> arch/x86/kvm/x86.c:4342:31: sparse: sparse: incorrect type in return expression (different address spaces) @@ expected void [noderef] __user * @@ got void * @@ arch/x86/kvm/x86.c:4342:31: sparse: expected void [noderef] __user * arch/x86/kvm/x86.c:4342:31: sparse: got void * >> arch/x86/kvm/x86.c:4342:31: sparse: sparse: incorrect type in return expression (different address spaces) @@ expected void [noderef] __user * @@ got void * @@ arch/x86/kvm/x86.c:4342:31: sparse: expected void [noderef] __user * arch/x86/kvm/x86.c:4342:31: sparse: got void * No functional change intended. 
Fixes: 56f289a8d23a ("KVM: x86: Add a helper to retrieve userspace address from kvm_device_attr") Reported-by: kernel test robot Signed-off-by: Sean Christopherson Message-Id: <20220202005157.2545816-1-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7ff9a5f6903e..24d1dab0a54a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -81,6 +81,8 @@ u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); +#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) + #define emul_to_vcpu(ctxt) \ container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) @@ -3513,6 +3515,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) } +static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr) +{ + void __user *uaddr = (void __user*)(unsigned long)attr->addr; + + if ((u64)(unsigned long)uaddr != attr->addr) + return ERR_PTR_USR(-EFAULT); + return uaddr; +} + static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) { u64 __user *uaddr = kvm_get_attr_addr(attr); @@ -3547,15 +3558,6 @@ static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr) } } -static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr) -{ - void __user *uaddr = (void __user*)(unsigned long)attr->addr; - - if ((u64)(unsigned long)uaddr != attr->addr) - return ERR_PTR(-EFAULT); - return uaddr; -} - long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { -- Gitee From c9bab4a08a594c92d3a5bc9a1bf246ee7ea20e76 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 09:57:09 +0100 Subject: [PATCH 0960/2161] dma-direct: remove __dma_direct_free_pages commit acaade1af3587132e7ea585f470a95261e14f60c upstream. We can just call dma_free_contiguous directly instead of wrapping it. Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/dma-direct.h | 1 - kernel/dma/direct.c | 11 +++-------- kernel/dma/remap.c | 4 ++-- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index a56ea8cd399b..9e63d7240da8 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -80,6 +80,5 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs); struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); -void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page); int dma_direct_supported(struct device *dev, u64 mask); #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 86baa9a12ef4..8b9e1f031618 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -154,7 +154,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, * so log an error and fail. 
*/ dev_info(dev, "Rejecting highmem page from CMA.\n"); - __dma_direct_free_pages(dev, size, page); + dma_free_contiguous(dev, page, size); return NULL; } @@ -176,11 +176,6 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, return ret; } -void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page) -{ - dma_free_contiguous(dev, page, size); -} - void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { @@ -189,7 +184,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ - __dma_direct_free_pages(dev, size, cpu_addr); + dma_free_contiguous(dev, cpu_addr, size); return; } @@ -199,7 +194,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && dma_alloc_need_uncached(dev, attrs)) cpu_addr = cached_kernel_address(cpu_addr); - __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr)); + dma_free_contiguous(dev, virt_to_page(cpu_addr), size); } void *dma_direct_alloc(struct device *dev, size_t size, diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index c00b9258fa6a..fb1e50c2d48a 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -238,7 +238,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); if (!ret) { - __dma_direct_free_pages(dev, size, page); + dma_free_contiguous(dev, page, size); return ret; } @@ -256,7 +256,7 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, struct page *page = pfn_to_page(__phys_to_pfn(phys)); vunmap(vaddr); - __dma_direct_free_pages(dev, size, page); + dma_free_contiguous(dev, page, size); } } -- Gitee From 937956d9726526aafd4a45bded8bd8b8eadfdc28 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 09:57:32 +0100 Subject: [PATCH 0961/2161] dma-direct: remove the dma_handle argument to __dma_direct_alloc_pages commit 4e1003aa56a7d60ddb048e43a7a51368fcfe36af upstream. The argument isn't used anywhere, so stop passing it. 
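The resulting division of labour, in sketch form (matching the hunks below): __dma_direct_alloc_pages() only hands back pages, and the caller derives the DMA handle itself.

	page = __dma_direct_alloc_pages(dev, size, gfp, attrs);
	if (!page)
		return NULL;
	/* the caller, not the allocator, now produces the DMA handle */
	*dma_handle = phys_to_dma(dev, page_to_phys(page));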
Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/dma-direct.h | 2 +- kernel/dma/direct.c | 4 ++-- kernel/dma/remap.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 9e63d7240da8..9f690b175d6a 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -79,6 +79,6 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs); struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); + gfp_t gfp, unsigned long attrs); int dma_direct_supported(struct device *dev, u64 mask); #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8b9e1f031618..d2b92e8fcc58 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -84,7 +84,7 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) } struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) + gfp_t gfp, unsigned long attrs) { size_t alloc_size = PAGE_ALIGN(size); int node = dev_to_node(dev); @@ -132,7 +132,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; - page = __dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); + page = __dma_direct_alloc_pages(dev, size, gfp, attrs); if (!page) return NULL; diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index fb1e50c2d48a..90d5ce77c189 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -226,7 +226,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, goto done; } - page = __dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs); + page = __dma_direct_alloc_pages(dev, size, flags, attrs); if (!page) return NULL; -- Gitee From f4f06be897ea957686628e156a7c0e90f9d9c904 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 11:01:37 +0100 Subject: [PATCH 0962/2161] dma-direct: provide mmap and get_sgtable method overrides commit 34dc0ea6bc960f1f57b2148f01a3f4da23f87013 upstream. For dma-direct we know that the DMA address is an encoding of the physical address that we can trivially decode. Use that fact to provide implementations that do not need the arch_dma_coherent_to_pfn architecture hook. Note that we still can only support mmap of non-coherent memory only if the architecture provides a way to set an uncached bit in the page tables. This must be true for architectures that use the generic remap helpers, but other architectures can also manually select it. 
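The "trivial decode" mentioned above is just walking the DMA address back to a struct page, which is what the new dma_direct_to_page() helper in the hunk below does; in sketch form:

	/* DMA address -> physical address -> PFN -> struct page */
	phys_addr_t phys = dma_to_phys(dev, dma_addr);
	struct page *page = pfn_to_page(PHYS_PFN(phys));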
Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/arc/Kconfig | 1 - arch/arm/Kconfig | 1 - arch/arm/mm/dma-mapping.c | 6 --- arch/arm64/Kconfig | 1 - arch/ia64/Kconfig | 2 +- arch/ia64/kernel/dma-mapping.c | 6 --- arch/microblaze/Kconfig | 1 - arch/mips/Kconfig | 4 +- arch/mips/mm/dma-noncoherent.c | 6 --- arch/powerpc/platforms/Kconfig.cputype | 1 - include/linux/dma-direct.h | 7 +++ include/linux/dma-noncoherent.h | 2 - kernel/dma/Kconfig | 12 ++++-- kernel/dma/direct.c | 59 ++++++++++++++++++++++++++ kernel/dma/mapping.c | 45 +++----------------- kernel/dma/remap.c | 6 --- 16 files changed, 85 insertions(+), 75 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 8383155c8c82..4d7b671c8ff4 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -6,7 +6,6 @@ config ARC def_bool y select ARC_TIMERS - select ARCH_HAS_DMA_COHERENT_TO_PFN select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SETUP_DMA_OPS diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 9aa88715f196..63cafb433eba 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -7,7 +7,6 @@ config ARM select ARCH_HAS_BINFMT_FLAT select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DEVMEM_IS_ALLOWED - select ARCH_HAS_DMA_COHERENT_TO_PFN if SWIOTLB select ARCH_HAS_DMA_WRITE_COMBINE if !ARM_DMA_MEM_BUFFERABLE select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 27576c7b836e..58d5765fb129 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -2346,12 +2346,6 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, size, dir); } -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr) -{ - return dma_to_pfn(dev, dma_addr); -} - void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 09d370f8c989..8a6cd40cc966 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -12,7 +12,6 @@ config ARM64 select ARCH_CLOCKSOURCE_DATA select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED - select ARCH_HAS_DMA_COHERENT_TO_PFN select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_FAST_MULTIPLIER diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 16714477eef4..bab7cd878464 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -33,7 +33,7 @@ config IA64 select HAVE_ARCH_TRACEHOOK select HAVE_MEMBLOCK_NODE_MAP select HAVE_VIRT_CPU_ACCOUNTING - select ARCH_HAS_DMA_COHERENT_TO_PFN + select DMA_NONCOHERENT_MMAP select ARCH_HAS_SYNC_DMA_FOR_CPU select VIRT_TO_BUS select GENERIC_IRQ_PROBE diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c index 4a3262795890..09ef9ce9988d 100644 --- a/arch/ia64/kernel/dma-mapping.c +++ b/arch/ia64/kernel/dma-mapping.c @@ -19,9 +19,3 @@ void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, { dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); } - -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr) -{ - return page_to_pfn(virt_to_page(cpu_addr)); -} diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index c9c4be822456..261c26df1c9f 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -4,7 +4,6 @@ config MICROBLAZE select ARCH_32BIT_OFF_T select ARCH_NO_SWAP select ARCH_HAS_BINFMT_FLAT if 
!MMU - select ARCH_HAS_DMA_COHERENT_TO_PFN if MMU select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SYNC_DMA_FOR_CPU diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 6ecdc690f733..3a8acc0c127b 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -1135,9 +1135,9 @@ config DMA_NONCOHERENT select ARCH_HAS_DMA_WRITE_COMBINE select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_HAS_UNCACHED_SEGMENT - select NEED_DMA_MAP_STATE - select ARCH_HAS_DMA_COHERENT_TO_PFN + select DMA_NONCOHERENT_MMAP select DMA_NONCOHERENT_CACHE_SYNC + select NEED_DMA_MAP_STATE config SYS_HAS_EARLY_PRINTK bool diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c index 1d4d57dd9acf..fcf6d3eaac66 100644 --- a/arch/mips/mm/dma-noncoherent.c +++ b/arch/mips/mm/dma-noncoherent.c @@ -59,12 +59,6 @@ void *cached_kernel_address(void *addr) return __va(addr) - UNCAC_BASE; } -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr) -{ - return page_to_pfn(virt_to_page(cached_kernel_address(cpu_addr))); -} - static inline void dma_sync_virt(void *addr, size_t size, enum dma_data_direction dir) { diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index f0330ce498d1..97af19141aed 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -459,7 +459,6 @@ config NOT_COHERENT_CACHE bool depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || \ GAMECUBE_COMMON || AMIGAONE - select ARCH_HAS_DMA_COHERENT_TO_PFN select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_HAS_SYNC_DMA_FOR_CPU diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 9f690b175d6a..cc34a436c882 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -80,5 +80,12 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs); struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, unsigned long attrs); +int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); +bool dma_direct_can_mmap(struct device *dev); +int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); int dma_direct_supported(struct device *dev, u64 mask); #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h index dd3de6d88fc0..e30fca1f1b12 100644 --- a/include/linux/dma-noncoherent.h +++ b/include/linux/dma-noncoherent.h @@ -41,8 +41,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs); -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr); #ifdef CONFIG_MMU /* diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 73c5c2b8e824..4c103a24e380 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -51,9 +51,6 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL config ARCH_HAS_DMA_PREP_COHERENT bool -config ARCH_HAS_DMA_COHERENT_TO_PFN - bool - config ARCH_HAS_FORCE_DMA_UNENCRYPTED bool @@ -68,9 +65,18 @@ config SWIOTLB bool select NEED_DMA_MAP_STATE +# +# Should be selected if we can mmap non-coherent mappings to userspace. 
+# The only thing that is really required is a way to set an uncached bit +# in the pagetables +# +config DMA_NONCOHERENT_MMAP + bool + config DMA_REMAP depends on MMU select GENERIC_ALLOCATOR + select DMA_NONCOHERENT_MMAP bool config DMA_DIRECT_REMAP diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index d2b92e8fcc58..3cbbca2d5208 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -43,6 +43,12 @@ static inline dma_addr_t phys_to_dma_direct(struct device *dev, return phys_to_dma(dev, phys); } +static inline struct page *dma_direct_to_page(struct device *dev, + dma_addr_t dma_addr) +{ + return pfn_to_page(PHYS_PFN(dma_to_phys(dev, dma_addr))); +} + u64 dma_direct_get_required_mask(struct device *dev) { phys_addr_t phys = (phys_addr_t)(max_pfn - 1) << PAGE_SHIFT; @@ -380,6 +386,59 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, } EXPORT_SYMBOL(dma_direct_map_resource); +int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + struct page *page = dma_direct_to_page(dev, dma_addr); + int ret; + + ret = sg_alloc_table(sgt, 1, GFP_KERNEL); + if (!ret) + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); + return ret; +} + +#ifdef CONFIG_MMU +bool dma_direct_can_mmap(struct device *dev) +{ + return dev_is_dma_coherent(dev) || + IS_ENABLED(CONFIG_DMA_NONCOHERENT_MMAP); +} + +int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + unsigned long user_count = vma_pages(vma); + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long pfn = PHYS_PFN(dma_to_phys(dev, dma_addr)); + int ret = -ENXIO; + + vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); + + if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) + return ret; + + if (vma->vm_pgoff >= count || user_count > count - vma->vm_pgoff) + return -ENXIO; + return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, + user_count << PAGE_SHIFT, vma->vm_page_prot); +} +#else /* CONFIG_MMU */ +bool dma_direct_can_mmap(struct device *dev) +{ + return false; +} + +int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + return -ENXIO; +} +#endif /* CONFIG_MMU */ + /* * Because 32-bit DMA masks are so common we expect every architecture to be * able to satisfy them - either by not supporting more physical memory, or by diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 8682a5305cb3..98e3d873792e 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -112,24 +112,9 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { - struct page *page; + struct page *page = virt_to_page(cpu_addr); int ret; - if (!dev_is_dma_coherent(dev)) { - unsigned long pfn; - - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) - return -ENXIO; - - /* If the PFN is not valid, we do not have a struct page */ - pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); - if (!pfn_valid(pfn)) - return -ENXIO; - page = pfn_to_page(pfn); - } else { - page = virt_to_page(cpu_addr); - } - ret = sg_alloc_table(sgt, 1, GFP_KERNEL); if (!ret) sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); @@ -154,7 +139,7 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, const struct dma_map_ops *ops = get_dma_ops(dev); if 
(dma_is_direct(ops)) - return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, + return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); if (!ops->get_sgtable) return -ENXIO; @@ -194,7 +179,6 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long user_count = vma_pages(vma); unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long off = vma->vm_pgoff; - unsigned long pfn; int ret = -ENXIO; vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); @@ -205,19 +189,8 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, if (off >= count || user_count > count - off) return -ENXIO; - if (!dev_is_dma_coherent(dev)) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) - return -ENXIO; - - /* If the PFN is not valid, we do not have a struct page */ - pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); - if (!pfn_valid(pfn)) - return -ENXIO; - } else { - pfn = page_to_pfn(virt_to_page(cpu_addr)); - } - - return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, + return remap_pfn_range(vma, vma->vm_start, + page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff, user_count << PAGE_SHIFT, vma->vm_page_prot); #else return -ENXIO; @@ -235,12 +208,8 @@ bool dma_can_mmap(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) { - return IS_ENABLED(CONFIG_MMU) && - (dev_is_dma_coherent(dev) || - IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)); - } - + if (dma_is_direct(ops)) + return dma_direct_can_mmap(dev); return ops->mmap != NULL; } EXPORT_SYMBOL_GPL(dma_can_mmap); @@ -265,7 +234,7 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_is_direct(ops)) - return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, + return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); if (!ops->mmap) return -ENXIO; diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 90d5ce77c189..3c49499ee6b0 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -259,10 +259,4 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, dma_free_contiguous(dev, page, size); } } - -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr) -{ - return __phys_to_pfn(dma_to_phys(dev, dma_addr)); -} #endif /* CONFIG_DMA_DIRECT_REMAP */ -- Gitee From 2314a42bcc0d308718532af63d37addc2e60021a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 11:06:32 +0100 Subject: [PATCH 0963/2161] dma-mapping: merge the generic remapping helpers into dma-direct commit 3acac065508f6cc60ac9d3e4b7c6cc37fd91d531 upstream. Integrate the generic dma remapping implementation into the main flow. This prepares for architectures like xtensa that use an uncached segment for pages in the kernel mapping, but can also remap highmem from CMA. To simplify that implementation we now always deduct the page from the physical address via the DMA address instead of the virtual address. 
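Concretely, the free path stops trusting cpu_addr (which may now be a vmalloc()-remapped alias) and recovers the page from the DMA handle instead; in sketch form, matching the kernel/dma/direct.c hunk below:

	if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr))
		vunmap(cpu_addr);	/* tear down only the kernel alias */

	/* the page itself comes from the DMA address, not from cpu_addr */
	dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size);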
Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/direct.c | 57 +++++++++++++++++++++++++++++++++++---------- kernel/dma/remap.c | 49 -------------------------------------- 2 files changed, 45 insertions(+), 61 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 3cbbca2d5208..e5eb887f1ae4 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -138,6 +139,15 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; + if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_alloc_need_uncached(dev, attrs) && + !gfpflags_allow_blocking(gfp)) { + ret = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp); + if (!ret) + return NULL; + goto done; + } + page = __dma_direct_alloc_pages(dev, size, gfp, attrs); if (!page) return NULL; @@ -147,9 +157,28 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, /* remove any dirty cache lines on the kernel alias */ if (!PageHighMem(page)) arch_dma_prep_coherent(page, size); - *dma_handle = phys_to_dma(dev, page_to_phys(page)); /* return the page pointer as the opaque cookie */ - return page; + ret = page; + goto done; + } + + if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_alloc_need_uncached(dev, attrs)) || + (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) { + /* remove any dirty cache lines on the kernel alias */ + arch_dma_prep_coherent(page, PAGE_ALIGN(size)); + + /* create a coherent mapping */ + ret = dma_common_contiguous_remap(page, PAGE_ALIGN(size), + dma_pgprot(dev, PAGE_KERNEL, attrs), + __builtin_return_address(0)); + if (!ret) { + dma_free_contiguous(dev, page, size); + return ret; + } + + memset(ret, 0, size); + goto done; } if (PageHighMem(page)) { @@ -165,12 +194,9 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, } ret = page_address(page); - if (force_dma_unencrypted(dev)) { + if (force_dma_unencrypted(dev)) set_memory_decrypted((unsigned long)ret, 1 << get_order(size)); - *dma_handle = phys_to_dma_unencrypted(dev, page_to_phys(page)); - } else { - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - } + memset(ret, 0, size); if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && @@ -178,7 +204,8 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, arch_dma_prep_coherent(page, size); ret = uncached_kernel_address(ret); } - +done: + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return ret; } @@ -194,19 +221,24 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, return; } + if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_free_from_pool(cpu_addr, PAGE_ALIGN(size))) + return; + if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); - if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && - dma_alloc_need_uncached(dev, attrs)) - cpu_addr = cached_kernel_address(cpu_addr); - dma_free_contiguous(dev, virt_to_page(cpu_addr), size); + if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) + vunmap(cpu_addr); + + dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size); } void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); return dma_direct_alloc_pages(dev, size, 
dma_handle, gfp, attrs); @@ -216,6 +248,7 @@ void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs)) arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); else diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 3c49499ee6b0..d47bd40fc0f5 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -210,53 +210,4 @@ bool dma_free_from_pool(void *start, size_t size) gen_pool_free(atomic_pool, (unsigned long)start, size); return true; } - -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t flags, unsigned long attrs) -{ - struct page *page = NULL; - void *ret; - - size = PAGE_ALIGN(size); - - if (!gfpflags_allow_blocking(flags)) { - ret = dma_alloc_from_pool(size, &page, flags); - if (!ret) - return NULL; - goto done; - } - - page = __dma_direct_alloc_pages(dev, size, flags, attrs); - if (!page) - return NULL; - - /* remove any dirty cache lines on the kernel alias */ - arch_dma_prep_coherent(page, size); - - /* create a coherent mapping */ - ret = dma_common_contiguous_remap(page, size, - dma_pgprot(dev, PAGE_KERNEL, attrs), - __builtin_return_address(0)); - if (!ret) { - dma_free_contiguous(dev, page, size); - return ret; - } - - memset(ret, 0, size); -done: - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - return ret; -} - -void arch_dma_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, unsigned long attrs) -{ - if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { - phys_addr_t phys = dma_to_phys(dev, dma_handle); - struct page *page = pfn_to_page(__phys_to_pfn(phys)); - - vunmap(vaddr); - dma_free_contiguous(dev, page, size); - } -} #endif /* CONFIG_DMA_DIRECT_REMAP */ -- Gitee From c3fd7ea8b7a5d9927453daf2b909923691b777cd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Nov 2019 18:03:11 +0100 Subject: [PATCH 0964/2161] dma-mapping: drop the dev argument to arch_sync_dma_for_* commit 56e35f9c5b87ec1ae93e483284e189c84388de16 upstream. These are pure cache maintainance routines, so drop the unused struct device argument. 
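For quick reference, the prototypes after this change (as updated in include/linux/dma-noncoherent.h below); cache maintenance needs only the physical range and the transfer direction:

void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
		enum dma_data_direction dir);
void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
		enum dma_data_direction dir);
void arch_sync_dma_for_cpu_all(void);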
Signed-off-by: Christoph Hellwig Suggested-by: Daniel Vetter Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/arc/mm/dma.c | 8 ++++---- arch/arm/mm/dma-mapping.c | 8 ++++---- arch/arm/xen/mm.c | 12 ++++++------ arch/arm64/mm/dma-mapping.c | 8 ++++---- arch/c6x/mm/dma-coherent.c | 14 +++++++------- arch/csky/mm/dma-mapping.c | 8 ++++---- arch/hexagon/kernel/dma.c | 4 ++-- arch/ia64/mm/init.c | 4 ++-- arch/m68k/kernel/dma.c | 4 ++-- arch/microblaze/kernel/dma.c | 14 +++++++------- arch/mips/bmips/dma.c | 2 +- arch/mips/jazz/jazzdma.c | 17 ++++++++--------- arch/mips/mm/dma-noncoherent.c | 12 ++++++------ arch/nds32/kernel/dma.c | 8 ++++---- arch/nios2/mm/dma-mapping.c | 8 ++++---- arch/openrisc/kernel/dma.c | 2 +- arch/parisc/kernel/pci-dma.c | 8 ++++---- arch/powerpc/mm/dma-noncoherent.c | 8 ++++---- arch/sh/kernel/dma-coherent.c | 6 +++--- arch/sparc/kernel/ioport.c | 4 ++-- arch/xtensa/kernel/pci-dma.c | 8 ++++---- drivers/iommu/dma-iommu.c | 10 +++++----- drivers/xen/swiotlb-xen.c | 8 ++++---- include/linux/dma-noncoherent.h | 20 ++++++++++---------- include/xen/swiotlb-xen.h | 8 ++++---- kernel/dma/direct.c | 14 +++++++------- 26 files changed, 113 insertions(+), 114 deletions(-) diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index 73a7e88a1e92..e947572a521e 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -48,8 +48,8 @@ void arch_dma_prep_coherent(struct page *page, size_t size) * upper layer functions (in include/linux/dma-mapping.h) */ -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_TO_DEVICE: @@ -69,8 +69,8 @@ void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, } } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_TO_DEVICE: diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 58d5765fb129..297639e24b31 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -2332,15 +2332,15 @@ void arch_teardown_dma_ops(struct device *dev) } #ifdef CONFIG_SWIOTLB -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_page_cpu_to_dev(phys_to_page(paddr), paddr & (PAGE_SIZE - 1), size, dir); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_page_dev_to_cpu(phys_to_page(paddr), paddr & (PAGE_SIZE - 1), size, dir); diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index 38fa917c8585..a6a2514e5fe8 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -70,20 +70,20 @@ static void dma_cache_maint(dma_addr_t handle, size_t size, u32 op) * pfn_valid returns true the pages is local and we can use the native * dma-direct functions, otherwise we call the Xen specific version. 
*/ -void xen_dma_sync_for_cpu(struct device *dev, dma_addr_t handle, - phys_addr_t paddr, size_t size, enum dma_data_direction dir) +void xen_dma_sync_for_cpu(dma_addr_t handle, phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { if (pfn_valid(PFN_DOWN(handle))) - arch_sync_dma_for_cpu(dev, paddr, size, dir); + arch_sync_dma_for_cpu(paddr, size, dir); else if (dir != DMA_TO_DEVICE) dma_cache_maint(handle, size, GNTTAB_CACHE_INVAL); } -void xen_dma_sync_for_device(struct device *dev, dma_addr_t handle, - phys_addr_t paddr, size_t size, enum dma_data_direction dir) +void xen_dma_sync_for_device(dma_addr_t handle, phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { if (pfn_valid(PFN_DOWN(handle))) - arch_sync_dma_for_device(dev, paddr, size, dir); + arch_sync_dma_for_device(paddr, size, dir); else if (dir == DMA_FROM_DEVICE) dma_cache_maint(handle, size, GNTTAB_CACHE_INVAL); else diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 9239416e93d4..6c45350e33aa 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -13,14 +13,14 @@ #include -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_map_area(phys_to_virt(paddr), size, dir); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_unmap_area(phys_to_virt(paddr), size, dir); } diff --git a/arch/c6x/mm/dma-coherent.c b/arch/c6x/mm/dma-coherent.c index b319808e8f6b..a5909091cb14 100644 --- a/arch/c6x/mm/dma-coherent.c +++ b/arch/c6x/mm/dma-coherent.c @@ -140,7 +140,7 @@ void __init coherent_mem_init(phys_addr_t start, u32 size) sizeof(long)); } -static void c6x_dma_sync(struct device *dev, phys_addr_t paddr, size_t size, +static void c6x_dma_sync(phys_addr_t paddr, size_t size, enum dma_data_direction dir) { BUG_ON(!valid_dma_direction(dir)); @@ -160,14 +160,14 @@ static void c6x_dma_sync(struct device *dev, phys_addr_t paddr, size_t size, } } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { - return c6x_dma_sync(dev, paddr, size, dir); + return c6x_dma_sync(paddr, size, dir); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { - return c6x_dma_sync(dev, paddr, size, dir); + return c6x_dma_sync(paddr, size, dir); } diff --git a/arch/csky/mm/dma-mapping.c b/arch/csky/mm/dma-mapping.c index 06e85b565454..8f6571ae27c8 100644 --- a/arch/csky/mm/dma-mapping.c +++ b/arch/csky/mm/dma-mapping.c @@ -58,8 +58,8 @@ void arch_dma_prep_coherent(struct page *page, size_t size) cache_op(page_to_phys(page), size, dma_wbinv_set_zero_range); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_TO_DEVICE: @@ -74,8 +74,8 @@ void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, } } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum 
dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_TO_DEVICE: diff --git a/arch/hexagon/kernel/dma.c b/arch/hexagon/kernel/dma.c index f561b127c4b4..25f388d9cfcc 100644 --- a/arch/hexagon/kernel/dma.c +++ b/arch/hexagon/kernel/dma.c @@ -55,8 +55,8 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, gen_pool_free(coherent_pool, (unsigned long) vaddr, size); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { void *addr = phys_to_virt(paddr); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index ee50506d86f4..df6d3dfa9d82 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -73,8 +73,8 @@ __ia64_sync_icache_dcache (pte_t pte) * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to * flush them when they get mapped into an executable vm-area. */ -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { unsigned long pfn = PHYS_PFN(paddr); diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c index 3fab684cc0db..871a0e11da34 100644 --- a/arch/m68k/kernel/dma.c +++ b/arch/m68k/kernel/dma.c @@ -61,8 +61,8 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, #endif /* CONFIG_MMU && !CONFIG_COLDFIRE */ -void arch_sync_dma_for_device(struct device *dev, phys_addr_t handle, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t handle, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_BIDIRECTIONAL: diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c index a89c2d4ed5ff..d7bebd04247b 100644 --- a/arch/microblaze/kernel/dma.c +++ b/arch/microblaze/kernel/dma.c @@ -15,7 +15,7 @@ #include #include -static void __dma_sync(struct device *dev, phys_addr_t paddr, size_t size, +static void __dma_sync(phys_addr_t paddr, size_t size, enum dma_data_direction direction) { switch (direction) { @@ -31,14 +31,14 @@ static void __dma_sync(struct device *dev, phys_addr_t paddr, size_t size, } } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { - __dma_sync(dev, paddr, size, dir); + __dma_sync(paddr, size, dir); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { - __dma_sync(dev, paddr, size, dir); + __dma_sync(paddr, size, dir); } diff --git a/arch/mips/bmips/dma.c b/arch/mips/bmips/dma.c index 3d13c77c125f..df56bf4179e3 100644 --- a/arch/mips/bmips/dma.c +++ b/arch/mips/bmips/dma.c @@ -64,7 +64,7 @@ phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dma_addr) return dma_addr; } -void arch_sync_dma_for_cpu_all(struct device *dev) +void arch_sync_dma_for_cpu_all(void) { void __iomem *cbr = BMIPS_GET_CBR(); u32 cfg; diff --git a/arch/mips/jazz/jazzdma.c b/arch/mips/jazz/jazzdma.c index a01e14955187..c64a297e82b3 100644 --- a/arch/mips/jazz/jazzdma.c +++ b/arch/mips/jazz/jazzdma.c @@ -592,7 +592,7 @@ static dma_addr_t jazz_dma_map_page(struct device *dev, struct page 
*page, phys_addr_t phys = page_to_phys(page) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(dev, phys, size, dir); + arch_sync_dma_for_device(phys, size, dir); return vdma_alloc(phys, size); } @@ -600,7 +600,7 @@ static void jazz_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_cpu(dev, vdma_log2phys(dma_addr), size, dir); + arch_sync_dma_for_cpu(vdma_log2phys(dma_addr), size, dir); vdma_free(dma_addr); } @@ -612,7 +612,7 @@ static int jazz_dma_map_sg(struct device *dev, struct scatterlist *sglist, for_each_sg(sglist, sg, nents, i) { if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, + arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); sg->dma_address = vdma_alloc(sg_phys(sg), sg->length); if (sg->dma_address == DMA_MAPPING_ERROR) @@ -631,8 +631,7 @@ static void jazz_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, for_each_sg(sglist, sg, nents, i) { if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, - dir); + arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); vdma_free(sg->dma_address); } } @@ -640,13 +639,13 @@ static void jazz_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, static void jazz_dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { - arch_sync_dma_for_device(dev, vdma_log2phys(addr), size, dir); + arch_sync_dma_for_device(vdma_log2phys(addr), size, dir); } static void jazz_dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { - arch_sync_dma_for_cpu(dev, vdma_log2phys(addr), size, dir); + arch_sync_dma_for_cpu(vdma_log2phys(addr), size, dir); } static void jazz_dma_sync_sg_for_device(struct device *dev, @@ -656,7 +655,7 @@ static void jazz_dma_sync_sg_for_device(struct device *dev, int i; for_each_sg(sgl, sg, nents, i) - arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); + arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); } static void jazz_dma_sync_sg_for_cpu(struct device *dev, @@ -666,7 +665,7 @@ static void jazz_dma_sync_sg_for_cpu(struct device *dev, int i; for_each_sg(sgl, sg, nents, i) - arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); + arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); } const struct dma_map_ops jazz_dma_ops = { diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c index fcf6d3eaac66..dc42ffc83825 100644 --- a/arch/mips/mm/dma-noncoherent.c +++ b/arch/mips/mm/dma-noncoherent.c @@ -27,7 +27,7 @@ * R10000 and R12000 are used in such systems, the SGI IP28 Indigo² rsp. * SGI IP32 aka O2. 
*/ -static inline bool cpu_needs_post_dma_flush(struct device *dev) +static inline bool cpu_needs_post_dma_flush(void) { switch (boot_cpu_type()) { case CPU_R10000: @@ -112,17 +112,17 @@ static inline void dma_sync_phys(phys_addr_t paddr, size_t size, } while (left); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { dma_sync_phys(paddr, size, dir); } #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { - if (cpu_needs_post_dma_flush(dev)) + if (cpu_needs_post_dma_flush()) dma_sync_phys(paddr, size, dir); } #endif diff --git a/arch/nds32/kernel/dma.c b/arch/nds32/kernel/dma.c index 4206d4b6c8ce..69d762182d49 100644 --- a/arch/nds32/kernel/dma.c +++ b/arch/nds32/kernel/dma.c @@ -46,8 +46,8 @@ static inline void cache_op(phys_addr_t paddr, size_t size, } while (left); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_FROM_DEVICE: @@ -61,8 +61,8 @@ void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, } } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_TO_DEVICE: diff --git a/arch/nios2/mm/dma-mapping.c b/arch/nios2/mm/dma-mapping.c index 9cb238664584..0ed711e37902 100644 --- a/arch/nios2/mm/dma-mapping.c +++ b/arch/nios2/mm/dma-mapping.c @@ -18,8 +18,8 @@ #include #include -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { void *vaddr = phys_to_virt(paddr); @@ -42,8 +42,8 @@ void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, } } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { void *vaddr = phys_to_virt(paddr); diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c index 4d5b8bd1d795..adec711ad39d 100644 --- a/arch/openrisc/kernel/dma.c +++ b/arch/openrisc/kernel/dma.c @@ -125,7 +125,7 @@ arch_dma_free(struct device *dev, size_t size, void *vaddr, free_pages_exact(vaddr, size); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t addr, size_t size, +void arch_sync_dma_for_device(phys_addr_t addr, size_t size, enum dma_data_direction dir) { unsigned long cl; diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index ca35d9a76e50..a60d47fd4d55 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -439,14 +439,14 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)__va(dma_handle), order); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { flush_kernel_dcache_range((unsigned 
long)phys_to_virt(paddr), size); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { flush_kernel_dcache_range((unsigned long)phys_to_virt(paddr), size); } diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index 2a82984356f8..5ab4f868e919 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -104,14 +104,14 @@ static void __dma_sync_page(phys_addr_t paddr, size_t size, int dir) #endif } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_sync_page(paddr, size, dir); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_sync_page(paddr, size, dir); } diff --git a/arch/sh/kernel/dma-coherent.c b/arch/sh/kernel/dma-coherent.c index b17514619b7e..eeb25a4fa55f 100644 --- a/arch/sh/kernel/dma-coherent.c +++ b/arch/sh/kernel/dma-coherent.c @@ -25,7 +25,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, * Pages from the page allocator may have data present in * cache. So flush the cache before using uncached memory. */ - arch_sync_dma_for_device(dev, virt_to_phys(ret), size, + arch_sync_dma_for_device(virt_to_phys(ret), size, DMA_BIDIRECTIONAL); ret_nocache = (void __force *)ioremap_nocache(virt_to_phys(ret), size); @@ -59,8 +59,8 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, iounmap(vaddr); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { void *addr = sh_cacheop_vaddr(phys_to_virt(paddr)); diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c index f89603855f1e..e59461d03b9a 100644 --- a/arch/sparc/kernel/ioport.c +++ b/arch/sparc/kernel/ioport.c @@ -366,8 +366,8 @@ void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, /* IIep is write-through, not flushing on cpu to device transfer. 
*/ -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { if (dir != PCI_DMA_TODEVICE) dma_make_coherent(paddr, PAGE_ALIGN(size)); diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c index 154979d62b73..2b86a2a04236 100644 --- a/arch/xtensa/kernel/pci-dma.c +++ b/arch/xtensa/kernel/pci-dma.c @@ -44,8 +44,8 @@ static void do_cache_op(phys_addr_t paddr, size_t size, } } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_BIDIRECTIONAL: @@ -62,8 +62,8 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, } } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_BIDIRECTIONAL: diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index a735515ee4e9..7137dc4ceb6d 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -669,7 +669,7 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev, return; phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); - arch_sync_dma_for_cpu(dev, phys, size, dir); + arch_sync_dma_for_cpu(phys, size, dir); } static void iommu_dma_sync_single_for_device(struct device *dev, @@ -681,7 +681,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev, return; phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); - arch_sync_dma_for_device(dev, phys, size, dir); + arch_sync_dma_for_device(phys, size, dir); } static void iommu_dma_sync_sg_for_cpu(struct device *dev, @@ -695,7 +695,7 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev, return; for_each_sg(sgl, sg, nelems, i) - arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); + arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); } static void iommu_dma_sync_sg_for_device(struct device *dev, @@ -709,7 +709,7 @@ static void iommu_dma_sync_sg_for_device(struct device *dev, return; for_each_sg(sgl, sg, nelems, i) - arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); + arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); } static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, @@ -724,7 +724,7 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, dma_handle =__iommu_dma_map(dev, phys, size, prot); if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && dma_handle != DMA_MAPPING_ERROR) - arch_sync_dma_for_device(dev, phys, size, dir); + arch_sync_dma_for_device(phys, size, dir); return dma_handle; } diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 06346422f743..486d7978ea97 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -411,7 +411,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, done: if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - xen_dma_sync_for_device(dev, dev_addr, phys, size, dir); + xen_dma_sync_for_device(dev_addr, phys, size, dir); return dev_addr; } @@ -431,7 +431,7 @@ static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - 
xen_dma_sync_for_cpu(hwdev, dev_addr, paddr, size, dir); + xen_dma_sync_for_cpu(dev_addr, paddr, size, dir); /* NOTE: We use dev_addr here, not paddr! */ if (is_xen_swiotlb_buffer(dev_addr)) @@ -445,7 +445,7 @@ xen_swiotlb_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr, phys_addr_t paddr = xen_bus_to_phys(dma_addr); if (!dev_is_dma_coherent(dev)) - xen_dma_sync_for_cpu(dev, dma_addr, paddr, size, dir); + xen_dma_sync_for_cpu(dma_addr, paddr, size, dir); if (is_xen_swiotlb_buffer(dma_addr)) swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); @@ -461,7 +461,7 @@ xen_swiotlb_sync_single_for_device(struct device *dev, dma_addr_t dma_addr, swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); if (!dev_is_dma_coherent(dev)) - xen_dma_sync_for_device(dev, dma_addr, paddr, size, dir); + xen_dma_sync_for_device(dma_addr, paddr, size, dir); } /* diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h index e30fca1f1b12..ca9b5770caee 100644 --- a/include/linux/dma-noncoherent.h +++ b/include/linux/dma-noncoherent.h @@ -73,29 +73,29 @@ static inline void arch_dma_cache_sync(struct device *dev, void *vaddr, #endif /* CONFIG_DMA_NONCOHERENT_CACHE_SYNC */ #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir); +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir); #else -static inline void arch_sync_dma_for_device(struct device *dev, - phys_addr_t paddr, size_t size, enum dma_data_direction dir) +static inline void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { } #endif /* ARCH_HAS_SYNC_DMA_FOR_DEVICE */ #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir); +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir); #else -static inline void arch_sync_dma_for_cpu(struct device *dev, - phys_addr_t paddr, size_t size, enum dma_data_direction dir) +static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { } #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */ #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL -void arch_sync_dma_for_cpu_all(struct device *dev); +void arch_sync_dma_for_cpu_all(void); #else -static inline void arch_sync_dma_for_cpu_all(struct device *dev) +static inline void arch_sync_dma_for_cpu_all(void) { } #endif /* CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL */ diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h index d71380f6ed0b..ffc0d3902b71 100644 --- a/include/xen/swiotlb-xen.h +++ b/include/xen/swiotlb-xen.h @@ -4,10 +4,10 @@ #include -void xen_dma_sync_for_cpu(struct device *dev, dma_addr_t handle, - phys_addr_t paddr, size_t size, enum dma_data_direction dir); -void xen_dma_sync_for_device(struct device *dev, dma_addr_t handle, - phys_addr_t paddr, size_t size, enum dma_data_direction dir); +void xen_dma_sync_for_cpu(dma_addr_t handle, phys_addr_t paddr, size_t size, + enum dma_data_direction dir); +void xen_dma_sync_for_device(dma_addr_t handle, phys_addr_t paddr, size_t size, + enum dma_data_direction dir); extern int xen_swiotlb_init(int verbose, bool early); extern const struct dma_map_ops xen_swiotlb_dma_ops; diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index e5eb887f1ae4..4cd095a4eb41 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -266,7 +266,7 @@ 
void dma_direct_sync_single_for_device(struct device *dev, swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_device(dev, paddr, size, dir); + arch_sync_dma_for_device(paddr, size, dir); } EXPORT_SYMBOL(dma_direct_sync_single_for_device); @@ -284,7 +284,7 @@ void dma_direct_sync_sg_for_device(struct device *dev, dir, SYNC_FOR_DEVICE); if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_device(dev, paddr, sg->length, + arch_sync_dma_for_device(paddr, sg->length, dir); } } @@ -300,8 +300,8 @@ void dma_direct_sync_single_for_cpu(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, addr); if (!dev_is_dma_coherent(dev)) { - arch_sync_dma_for_cpu(dev, paddr, size, dir); - arch_sync_dma_for_cpu_all(dev); + arch_sync_dma_for_cpu(paddr, size, dir); + arch_sync_dma_for_cpu_all(); } if (unlikely(is_swiotlb_buffer(paddr))) @@ -319,7 +319,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_cpu(dev, paddr, sg->length, dir); + arch_sync_dma_for_cpu(paddr, sg->length, dir); if (unlikely(is_swiotlb_buffer(paddr))) swiotlb_tbl_sync_single(dev, paddr, sg->length, dir, @@ -327,7 +327,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, } if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_cpu_all(dev); + arch_sync_dma_for_cpu_all(); } EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu); @@ -378,7 +378,7 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, } if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(dev, phys, size, dir); + arch_sync_dma_for_device(phys, size, dir); return dma_addr; } EXPORT_SYMBOL(dma_direct_map_page); -- Gitee From c625366c63aedad5760b164a185e9d0a1ea18504 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Nov 2019 17:06:04 +0100 Subject: [PATCH 0965/2161] dma-direct: unify the dma_capable definitions commit 130c1ccbf55330b55e82612a6e54eebb82c9d746 upstream. Currently each architectures that wants to override dma_to_phys and phys_to_dma also has to provide dma_capable. But there isn't really any good reason for that. powerpc and mips just have copies of the generic one minus the latests fix, and the arm one was the inspiration for said fix, but misses the bus_dma_mask handling. Make all architectures use the generic version instead. 
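The shared generic check now looks roughly like this (as it stands in include/linux/dma-direct.h at this point in the series; the arm copy notably lacked the bus_dma_mask clamp):

static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
{
	dma_addr_t end = addr + size - 1;

	if (!dev->dma_mask)
		return false;

	/* reject ranges below the lowest DMA-able address on 32-bit setups */
	if (!IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) &&
	    min(addr, end) < phys_to_dma(dev, PFN_PHYS(min_low_pfn)))
		return false;

	/* honour both the device mask and any bus limitation */
	return end <= min_not_zero(*dev->dma_mask, dev->bus_dma_mask);
}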
Signed-off-by: Christoph Hellwig Acked-by: Michael Ellerman (powerpc) Reviewed-by: Nicolas Saenz Julienne Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/arm/include/asm/dma-direct.h | 19 ------------------- arch/mips/include/asm/dma-direct.h | 8 -------- arch/powerpc/include/asm/dma-direct.h | 9 --------- include/linux/dma-direct.h | 2 +- 4 files changed, 1 insertion(+), 37 deletions(-) diff --git a/arch/arm/include/asm/dma-direct.h b/arch/arm/include/asm/dma-direct.h index c9f4e70a4fe2..b939a31f7a14 100644 --- a/arch/arm/include/asm/dma-direct.h +++ b/arch/arm/include/asm/dma-direct.h @@ -14,23 +14,4 @@ static inline phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dev_addr) return __pfn_to_phys(dma_to_pfn(dev, dev_addr)) + offset; } -static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) -{ - u64 limit, mask; - - if (!dev->dma_mask) - return 0; - - mask = *dev->dma_mask; - - limit = (mask + 1) & ~mask; - if (limit && size > limit) - return 0; - - if ((addr | (addr + size - 1)) & ~mask) - return 0; - - return 1; -} - #endif /* ASM_ARM_DMA_DIRECT_H */ diff --git a/arch/mips/include/asm/dma-direct.h b/arch/mips/include/asm/dma-direct.h index b5c240806e1b..14e352651ce9 100644 --- a/arch/mips/include/asm/dma-direct.h +++ b/arch/mips/include/asm/dma-direct.h @@ -2,14 +2,6 @@ #ifndef _MIPS_DMA_DIRECT_H #define _MIPS_DMA_DIRECT_H 1 -static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) -{ - if (!dev->dma_mask) - return false; - - return addr + size - 1 <= *dev->dma_mask; -} - dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr); phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr); diff --git a/arch/powerpc/include/asm/dma-direct.h b/arch/powerpc/include/asm/dma-direct.h index a2912b47102c..e29e8a236b8d 100644 --- a/arch/powerpc/include/asm/dma-direct.h +++ b/arch/powerpc/include/asm/dma-direct.h @@ -2,15 +2,6 @@ #ifndef ASM_POWERPC_DMA_DIRECT_H #define ASM_POWERPC_DMA_DIRECT_H 1 -static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) -{ - if (!dev->dma_mask) - return false; - - return addr + size - 1 <= - min_not_zero(*dev->dma_mask, dev->bus_dma_mask); -} - static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) { if (!dev) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index cc34a436c882..d025fd39cd80 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -39,6 +39,7 @@ static inline phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dev_addr) return paddr + ((phys_addr_t)dev->dma_pfn_offset << PAGE_SHIFT); } +#endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { @@ -53,7 +54,6 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) return end <= min_not_zero(*dev->dma_mask, dev->bus_dma_mask); } -#endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */ #ifdef CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED bool force_dma_unencrypted(struct device *dev); -- Gitee From 1e1bc1742b2714f097af8cfd5b471ebd5e79ac8c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Nov 2019 17:07:43 +0100 Subject: [PATCH 0966/2161] dma-direct: avoid a forward declaration for phys_to_dma commit c7345159f7db6fb69ec1c3b3f8f28cd05c731be2 upstream. Move dma_capable down a bit so that we don't need a forward declaration for phys_to_dma. 
Signed-off-by: Christoph Hellwig Reviewed-by: Nicolas Saenz Julienne Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/dma-direct.h | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index d025fd39cd80..bcd731febe82 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -6,8 +6,6 @@ #include /* for min_low_pfn */ #include -static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr); - #ifdef CONFIG_ARCH_HAS_PHYS_TO_DMA #include #ifndef phys_to_dma_unencrypted @@ -41,20 +39,6 @@ static inline phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dev_addr) } #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */ -static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) -{ - dma_addr_t end = addr + size - 1; - - if (!dev->dma_mask) - return false; - - if (!IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) && - min(addr, end) < phys_to_dma(dev, PFN_PHYS(min_low_pfn))) - return false; - - return end <= min_not_zero(*dev->dma_mask, dev->bus_dma_mask); -} - #ifdef CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED bool force_dma_unencrypted(struct device *dev); #else @@ -69,6 +53,20 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) return __sme_clr(__dma_to_phys(dev, daddr)); } +static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) +{ + dma_addr_t end = addr + size - 1; + + if (!dev->dma_mask) + return false; + + if (!IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) && + min(addr, end) < phys_to_dma(dev, PFN_PHYS(min_low_pfn))) + return false; + + return end <= min_not_zero(*dev->dma_mask, dev->bus_dma_mask); +} + u64 dma_direct_get_required_mask(struct device *dev); void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); -- Gitee From e52d4b5480cb99762307d3909fb039cade7d4064 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2019 17:38:58 +0100 Subject: [PATCH 0967/2161] dma-direct: exclude dma_direct_map_resource from the min_low_pfn check commit 68a33b1794665ba8a1d1ef1d3bfcc7c587d380a6 upstream. The valid memory address check in dma_capable only makes sense when mapping normal memory, not when using dma_map_resource to map a device resource. Add a new boolean argument to dma_capable to exclude that check for the dma_map_resource case. 
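The new argument lets callers state whether the target is system RAM (normal page mappings) or a device resource mapped through dma_map_resource; only RAM addresses make sense to compare against the lowest RAM page, and an MMIO region sitting below the start of RAM must not be rejected. Below is a deliberately simplified standalone model of that split, using hypothetical names; the real check compares against phys_to_dma(dev, PFN_PHYS(min_low_pfn)) and only on 32-bit dma_addr_t builds.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in: the low-address sanity check applies to RAM only. */
static bool addr_capable(uint64_t addr, uint64_t size, uint64_t mask,
                         uint64_t lowest_ram_addr, bool is_ram)
{
        uint64_t end = addr + size - 1;

        if (is_ram && addr < lowest_ram_addr)
                return false;   /* a "RAM" address this low indicates an overflow */
        return end <= mask;
}

int main(void)
{
        const uint64_t mask = (1ULL << 32) - 1;
        const uint64_t ram_start = 0x80000000;  /* e.g. RAM begins at 2 GiB */

        /* Claimed RAM below the lowest RAM page: rejected as an overflow. */
        printf("%d\n", addr_capable(0x1000, 0x1000, mask, ram_start, true));  /* 0 */
        /* The same address as an MMIO resource is legitimate: check skipped. */
        printf("%d\n", addr_capable(0x1000, 0x1000, mask, ram_start, false)); /* 1 */
        return 0;
}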
Fixes: b12d66278dd6 ("dma-direct: check for overflows on 32 bit DMA addresses") Reported-by: Marek Szyprowski Signed-off-by: Christoph Hellwig Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/amd_gart_64.c | 4 ++-- drivers/xen/swiotlb-xen.c | 4 ++-- include/linux/dma-direct.h | 5 +++-- kernel/dma/direct.c | 4 ++-- kernel/dma/swiotlb.c | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index a6ac3712db8b..5cfab41e8509 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -185,13 +185,13 @@ static void iommu_full(struct device *dev, size_t size, int dir) static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) { - return force_iommu || !dma_capable(dev, addr, size); + return force_iommu || !dma_capable(dev, addr, size, true); } static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) { - return !dma_capable(dev, addr, size); + return !dma_capable(dev, addr, size, true); } /* Map a single continuous physical area into the IOMMU. diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 486d7978ea97..5fbadd07819b 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -381,7 +381,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, * we can safely return the device addr and not worry about bounce * buffering it. */ - if (dma_capable(dev, dev_addr, size) && + if (dma_capable(dev, dev_addr, size, true) && !range_straddles_page_boundary(phys, size) && !xen_arch_need_swiotlb(dev, phys, dev_addr) && swiotlb_force != SWIOTLB_FORCE) @@ -403,7 +403,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, /* * Ensure that the address returned is DMA'ble */ - if (unlikely(!dma_capable(dev, dev_addr, size))) { + if (unlikely(!dma_capable(dev, dev_addr, size, true))) { swiotlb_tbl_unmap_single(dev, map, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return DMA_MAPPING_ERROR; diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index bcd731febe82..98d1251078eb 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -53,14 +53,15 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) return __sme_clr(__dma_to_phys(dev, daddr)); } -static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) +static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size, + bool is_ram) { dma_addr_t end = addr + size - 1; if (!dev->dma_mask) return false; - if (!IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) && + if (is_ram && !IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) && min(addr, end) < phys_to_dma(dev, PFN_PHYS(min_low_pfn))) return false; diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 4cd095a4eb41..bd9dca0872f4 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -361,7 +361,7 @@ static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, size_t size) { return swiotlb_force != SWIOTLB_FORCE && - dma_capable(dev, dma_addr, size); + dma_capable(dev, dma_addr, size, true); } dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, @@ -410,7 +410,7 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, { dma_addr_t dma_addr = paddr; - if (unlikely(!dma_capable(dev, dma_addr, size))) { + if (unlikely(!dma_capable(dev, dma_addr, size, 
false))) { report_addr(dev, dma_addr, size); return DMA_MAPPING_ERROR; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d53359c3f5dd..859c624f0f41 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -679,7 +679,7 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, /* Ensure that the address returned is DMA'ble */ *dma_addr = phys_to_dma_unencrypted(dev, *phys); - if (unlikely(!dma_capable(dev, *dma_addr, size))) { + if (unlikely(!dma_capable(dev, *dma_addr, size, true))) { swiotlb_tbl_unmap_single(dev, *phys, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return false; -- Gitee From e5c472a40557d4dc7232bd901b0a45bcd11b61d2 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Thu, 21 Nov 2019 10:26:44 +0100 Subject: [PATCH 0968/2161] dma-mapping: treat dev->bus_dma_mask as a DMA limit commit a7ba70f1787f977f970cd116076c6fce4b9e01cc upstream. Using a mask to represent bus DMA constraints has a set of limitations. The biggest one being it can only hold a power of two (minus one). The DMA mapping code is already aware of this and treats dev->bus_dma_mask as a limit. This quirk is already used by some architectures although still rare. With the introduction of the Raspberry Pi 4 we've found a new contender for the use of bus DMA limits, as its PCIe bus can only address the lower 3GB of memory (of a total of 4GB). This is impossible to represent with a mask. To make things worse the device-tree code rounds non power of two bus DMA limits to the next power of two, which is unacceptable in this case. In the light of this, rename dev->bus_dma_mask to dev->bus_dma_limit all over the tree and treat it as such. Note that dev->bus_dma_limit should contain the higher accessible DMA address. Signed-off-by: Nicolas Saenz Julienne Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/mips/pci/fixup-sb1250.c | 16 ++++++++-------- arch/powerpc/sysdev/fsl_pci.c | 6 +++--- arch/x86/kernel/pci-dma.c | 2 +- arch/x86/mm/mem_encrypt.c | 2 +- drivers/acpi/arm64/iort.c | 20 +++++++------------- drivers/ata/ahci.c | 2 +- drivers/iommu/dma-iommu.c | 3 +-- drivers/of/device.c | 9 +++++---- include/linux/device.h | 6 +++--- include/linux/dma-direct.h | 2 +- include/linux/dma-mapping.h | 2 +- kernel/dma/direct.c | 27 +++++++++++++-------------- 12 files changed, 45 insertions(+), 52 deletions(-) diff --git a/arch/mips/pci/fixup-sb1250.c b/arch/mips/pci/fixup-sb1250.c index 8a41b359cf90..40efc990cdce 100644 --- a/arch/mips/pci/fixup-sb1250.c +++ b/arch/mips/pci/fixup-sb1250.c @@ -21,22 +21,22 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SIBYTE, PCI_DEVICE_ID_BCM1250_PCI, /* * The BCM1250, etc. PCI host bridge does not support DAC on its 32-bit - * bus, so we set the bus's DMA mask accordingly. However the HT link + * bus, so we set the bus's DMA limit accordingly. However the HT link * down the artificial PCI-HT bridge supports 40-bit addressing and the * SP1011 HT-PCI bridge downstream supports both DAC and a 64-bit bus * width, so we record the PCI-HT bridge's secondary and subordinate bus - * numbers and do not set the mask for devices present in the inclusive + * numbers and do not set the limit for devices present in the inclusive * range of those. 
*/ -struct sb1250_bus_dma_mask_exclude { +struct sb1250_bus_dma_limit_exclude { bool set; unsigned char start; unsigned char end; }; -static int sb1250_bus_dma_mask(struct pci_dev *dev, void *data) +static int sb1250_bus_dma_limit(struct pci_dev *dev, void *data) { - struct sb1250_bus_dma_mask_exclude *exclude = data; + struct sb1250_bus_dma_limit_exclude *exclude = data; bool exclude_this; bool ht_bridge; @@ -55,7 +55,7 @@ static int sb1250_bus_dma_mask(struct pci_dev *dev, void *data) exclude->start, exclude->end); } else { dev_dbg(&dev->dev, "disabling DAC for device"); - dev->dev.bus_dma_mask = DMA_BIT_MASK(32); + dev->dev.bus_dma_limit = DMA_BIT_MASK(32); } return 0; @@ -63,9 +63,9 @@ static int sb1250_bus_dma_mask(struct pci_dev *dev, void *data) static void quirk_sb1250_pci_dac(struct pci_dev *dev) { - struct sb1250_bus_dma_mask_exclude exclude = { .set = false }; + struct sb1250_bus_dma_limit_exclude exclude = { .set = false }; - pci_walk_bus(dev->bus, sb1250_bus_dma_mask, &exclude); + pci_walk_bus(dev->bus, sb1250_bus_dma_limit, &exclude); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_SIBYTE, PCI_DEVICE_ID_BCM1250_PCI, quirk_sb1250_pci_dac); diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index ff0e2b156cb5..617a443d673d 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -115,8 +115,8 @@ static void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); - pdev->dev.bus_dma_mask = - hose->dma_window_base_cur + hose->dma_window_size; + pdev->dev.bus_dma_limit = + hose->dma_window_base_cur + hose->dma_window_size - 1; } static void setup_swiotlb_ops(struct pci_controller *hose) @@ -135,7 +135,7 @@ static void fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask) * mapping that allows addressing any RAM address from across PCI. 
*/ if (dev_is_pci(dev) && dma_mask >= pci64_dma_offset * 2 - 1) { - dev->bus_dma_mask = 0; + dev->bus_dma_limit = 0; dev->archdata.dma_offset = pci64_dma_offset; } } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index fa4352dce491..3a75d665d43c 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -146,7 +146,7 @@ rootfs_initcall(pci_iommu_init); static int via_no_dac_cb(struct pci_dev *pdev, void *data) { - pdev->dev.bus_dma_mask = DMA_BIT_MASK(32); + pdev->dev.bus_dma_limit = DMA_BIT_MASK(32); return 0; } diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 7b558939b89c..ebf70fb3f821 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -367,7 +367,7 @@ bool force_dma_unencrypted(struct device *dev) if (sme_active()) { u64 dma_enc_mask = DMA_BIT_MASK(__ffs64(sme_me_mask)); u64 dma_dev_mask = min_not_zero(dev->coherent_dma_mask, - dev->bus_dma_mask); + dev->bus_dma_limit); if (dma_dev_mask <= dma_enc_mask) return true; diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 9b63fb62189c..0e1e50f3a4d2 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1062,8 +1062,8 @@ static int rc_dma_get_range(struct device *dev, u64 *size) */ void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size) { - u64 mask, dmaaddr = 0, size = 0, offset = 0; - int ret, msb; + u64 end, mask, dmaaddr = 0, size = 0, offset = 0; + int ret; /* * If @dev is expected to be DMA-capable then the bus code that created @@ -1090,19 +1090,13 @@ void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size) } if (!ret) { - msb = fls64(dmaaddr + size - 1); /* - * Round-up to the power-of-two mask or set - * the mask to the whole 64-bit address space - * in case the DMA region covers the full - * memory window. + * Limit coherent and dma mask based on size retrieved from + * firmware. */ - mask = msb == 64 ? U64_MAX : (1ULL << msb) - 1; - /* - * Limit coherent and dma mask based on size - * retrieved from firmware. - */ - dev->bus_dma_mask = mask; + end = dmaaddr + size - 1; + mask = DMA_BIT_MASK(ilog2(end) + 1); + dev->bus_dma_limit = end; dev->coherent_dma_mask = mask; *dev->dma_mask = mask; } diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 8beb418ce167..6b657cbec5cd 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -900,7 +900,7 @@ static int ahci_configure_dma_masks(struct pci_dev *pdev, int using_dac) * value, don't extend it here. This happens on STA2X11, for example. * * XXX: manipulating the DMA mask from platform code is completely - * bogus, platform code should use dev->bus_dma_mask instead.. + * bogus, platform code should use dev->bus_dma_limit instead.. 
*/ if (pdev->dma_mask && pdev->dma_mask < DMA_BIT_MASK(32)) return 0; diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 7137dc4ceb6d..806c338c5062 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -415,8 +415,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1))) iova_len = roundup_pow_of_two(iova_len); - if (dev->bus_dma_mask) - dma_limit &= dev->bus_dma_mask; + dma_limit = min_not_zero(dma_limit, dev->bus_dma_limit); if (domain->geometry.force_aperture) dma_limit = min(dma_limit, domain->geometry.aperture_end); diff --git a/drivers/of/device.c b/drivers/of/device.c index da8158392010..e9127db7b067 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -93,7 +93,7 @@ int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma) bool coherent; unsigned long offset; const struct iommu_ops *iommu; - u64 mask; + u64 mask, end; ret = of_dma_get_range(np, &dma_addr, &paddr, &size); if (ret < 0) { @@ -148,12 +148,13 @@ int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma) * Limit coherent and dma mask based on size and default mask * set by the driver. */ - mask = DMA_BIT_MASK(ilog2(dma_addr + size - 1) + 1); + end = dma_addr + size - 1; + mask = DMA_BIT_MASK(ilog2(end) + 1); dev->coherent_dma_mask &= mask; *dev->dma_mask &= mask; - /* ...but only set bus mask if we found valid dma-ranges earlier */ + /* ...but only set bus limit if we found valid dma-ranges earlier */ if (!ret) - dev->bus_dma_mask = mask; + dev->bus_dma_limit = end; coherent = of_dma_is_coherent(np); dev_dbg(dev, "device is%sdma coherent\n", diff --git a/include/linux/device.h b/include/linux/device.h index e9850ad0cdf2..23cac07ac1c1 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1188,8 +1188,8 @@ struct dev_links_info { * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all * hardware supports 64-bit addresses for consistent allocations * such descriptors. - * @bus_dma_mask: Mask of an upstream bridge or bus which imposes a smaller DMA - * limit than the device itself supports. + * @bus_dma_limit: Limit of an upstream bridge or bus which imposes a smaller + * DMA limit than the device itself supports. * @dma_pfn_offset: offset of DMA memory range relatively of RAM * @dma_parms: A low level driver may set these to teach IOMMU code about * segment limitations. @@ -1271,7 +1271,7 @@ struct device { not all hardware supports 64 bit addresses for consistent allocations such descriptors. 
*/ - u64 bus_dma_mask; /* upstream dma_mask constraint */ + u64 bus_dma_limit; /* upstream dma constraint */ unsigned long dma_pfn_offset; struct device_dma_parameters *dma_parms; diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 98d1251078eb..a4a8acb6073e 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -65,7 +65,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size, min(addr, end) < phys_to_dma(dev, PFN_PHYS(min_low_pfn))) return false; - return end <= min_not_zero(*dev->dma_mask, dev->bus_dma_mask); + return end <= min_not_zero(*dev->dma_mask, dev->bus_dma_limit); } u64 dma_direct_get_required_mask(struct device *dev); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index d0d8651ee149..bb17ef73242a 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -703,7 +703,7 @@ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask) */ static inline bool dma_addressing_limited(struct device *dev) { - return min_not_zero(dma_get_mask(dev), dev->bus_dma_mask) < + return min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < dma_get_required_mask(dev); } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index bd9dca0872f4..847def538d10 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -28,10 +28,10 @@ static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) { if (!dev->dma_mask) { dev_err_once(dev, "DMA map on device without dma_mask\n"); - } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) { + } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_limit) { dev_err_once(dev, - "overflow %pad+%zu of DMA mask %llx bus mask %llx\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_mask); + "overflow %pad+%zu of DMA mask %llx bus limit %llx\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); } WARN_ON_ONCE(1); } @@ -59,15 +59,14 @@ u64 dma_direct_get_required_mask(struct device *dev) } static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, - u64 *phys_mask) + u64 *phys_limit) { - if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask) - dma_mask = dev->bus_dma_mask; + u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); if (force_dma_unencrypted(dev)) - *phys_mask = __dma_to_phys(dev, dma_mask); + *phys_limit = __dma_to_phys(dev, dma_limit); else - *phys_mask = dma_to_phys(dev, dma_mask); + *phys_limit = dma_to_phys(dev, dma_limit); /* * Optimistically try the zone that the physical address mask falls @@ -77,9 +76,9 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding * zones. 
*/ - if (*phys_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) + if (*phys_limit <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) return GFP_DMA; - if (*phys_mask <= DMA_BIT_MASK(32)) + if (*phys_limit <= DMA_BIT_MASK(32)) return GFP_DMA32; return 0; } @@ -87,7 +86,7 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { return phys_to_dma_direct(dev, phys) + size - 1 <= - min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask); + min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, @@ -96,7 +95,7 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, size_t alloc_size = PAGE_ALIGN(size); int node = dev_to_node(dev); struct page *page = NULL; - u64 phys_mask; + u64 phys_limit; if (attrs & DMA_ATTR_NO_WARN) gfp |= __GFP_NOWARN; @@ -104,7 +103,7 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, /* we always manually zero the memory once we are done: */ gfp &= ~__GFP_ZERO; gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_mask); + &phys_limit); page = dma_alloc_contiguous(dev, alloc_size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, alloc_size); @@ -118,7 +117,7 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, page = NULL; if (IS_ENABLED(CONFIG_ZONE_DMA32) && - phys_mask < DMA_BIT_MASK(64) && + phys_limit < DMA_BIT_MASK(64) && !(gfp & (GFP_DMA32 | GFP_DMA))) { gfp |= GFP_DMA32; goto again; -- Gitee From fdc5d99edf8b4865966da7053b85f31948a5c73a Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Wed, 4 Dec 2019 16:52:00 -0800 Subject: [PATCH 0969/2161] lib/genalloc.c: export symbol addr_in_gen_pool commit fd7eb2513f8549cf2f6b0c323377816268424bed upstream. We use addr_in_gen_pool() in a driver module. So export it. Link: http://lkml.kernel.org/r/20181224070622.22197-2-sjhuang@iluvatar.ai Signed-off-by: Huang Shijie Reviewed-by: Andrew Morton Cc: Alexey Skidanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- lib/genalloc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/genalloc.c b/lib/genalloc.c index 80d10d02cf38..1ca92e74bc63 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -568,6 +568,7 @@ bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, rcu_read_unlock(); return found; } +EXPORT_SYMBOL(addr_in_gen_pool); /** * gen_pool_avail - get available free space of the pool -- Gitee From 0a340e71cb00e05cb01af776b761c0b0ccb04d65 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Wed, 4 Dec 2019 16:52:03 -0800 Subject: [PATCH 0970/2161] lib/genalloc.c: rename addr_in_gen_pool to gen_pool_has_addr commit 964975ac6677c97ae61ec9d6969dd5d03f18d1c3 upstream. Follow the kernel conventions, rename addr_in_gen_pool to gen_pool_has_addr. 
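The renamed helper answers whether an entire [start, start + size) range lies inside a given gen_pool, which is the property the DMA atomic-pool free path checks before handing memory back to the pool (see dma_free_from_pool later in this series). A minimal sketch of that calling pattern, with a hypothetical wrapper name:

#include <linux/genalloc.h>
#include <linux/types.h>

/* Sketch only: free 'vaddr' back to 'pool' only if it actually came from
 * that pool, otherwise tell the caller to free it some other way. */
static bool maybe_free_from_pool(struct gen_pool *pool, void *vaddr, size_t size)
{
        if (!pool || !gen_pool_has_addr(pool, (unsigned long)vaddr, size))
                return false;
        gen_pool_free(pool, (unsigned long)vaddr, size);
        return true;
}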
[sjhuang@iluvatar.ai: fix Documentation/ too] Link: http://lkml.kernel.org/r/20181229015914.5573-1-sjhuang@iluvatar.ai Link: http://lkml.kernel.org/r/20181228083950.20398-1-sjhuang@iluvatar.ai Signed-off-by: Huang Shijie Reviewed-by: Andrew Morton Cc: Russell King Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/core-api/genalloc.rst | 2 +- arch/arm/mm/dma-mapping.c | 2 +- drivers/misc/sram-exec.c | 2 +- include/linux/genalloc.h | 2 +- kernel/dma/remap.c | 2 +- lib/genalloc.c | 6 +++--- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Documentation/core-api/genalloc.rst b/Documentation/core-api/genalloc.rst index 6b38a39fab24..a534cc7ebd05 100644 --- a/Documentation/core-api/genalloc.rst +++ b/Documentation/core-api/genalloc.rst @@ -129,7 +129,7 @@ writing of special-purpose memory allocators in the future. :functions: gen_pool_for_each_chunk .. kernel-doc:: lib/genalloc.c - :functions: addr_in_gen_pool + :functions: gen_pool_has_addr .. kernel-doc:: lib/genalloc.c :functions: gen_pool_avail diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 297639e24b31..e7d039804e3e 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -529,7 +529,7 @@ static void *__alloc_from_pool(size_t size, struct page **ret_page) static bool __in_atomic_pool(void *start, size_t size) { - return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); + return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); } static int __free_from_pool(void *start, size_t size) diff --git a/drivers/misc/sram-exec.c b/drivers/misc/sram-exec.c index 426ad912b441..d054e2842a5f 100644 --- a/drivers/misc/sram-exec.c +++ b/drivers/misc/sram-exec.c @@ -96,7 +96,7 @@ void *sram_exec_copy(struct gen_pool *pool, void *dst, void *src, if (!part) return NULL; - if (!addr_in_gen_pool(pool, (unsigned long)dst, size)) + if (!gen_pool_has_addr(pool, (unsigned long)dst, size)) return NULL; base = (unsigned long)part->base; diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 4bd583bd6934..5b14a0f38124 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -206,7 +206,7 @@ extern struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, int nid, const char *name); extern struct gen_pool *gen_pool_get(struct device *dev, const char *name); -bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, +extern bool gen_pool_has_addr(struct gen_pool *pool, unsigned long start, size_t size); #ifdef CONFIG_OF diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index d47bd40fc0f5..d14cbc83986a 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -178,7 +178,7 @@ bool dma_in_atomic_pool(void *start, size_t size) if (unlikely(!atomic_pool)) return false; - return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); + return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); } void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) diff --git a/lib/genalloc.c b/lib/genalloc.c index 1ca92e74bc63..cc5f615bc9f2 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -541,7 +541,7 @@ void gen_pool_for_each_chunk(struct gen_pool *pool, EXPORT_SYMBOL(gen_pool_for_each_chunk); /** - * addr_in_gen_pool - checks if an address falls within the range of a pool + * gen_pool_has_addr - checks if an address 
falls within the range of a pool * @pool: the generic memory pool * @start: start address * @size: size of the region @@ -549,7 +549,7 @@ EXPORT_SYMBOL(gen_pool_for_each_chunk); * Check if the range of addresses falls within the specified pool. Returns * true if the entire range is contained in the pool and false otherwise. */ -bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, +bool gen_pool_has_addr(struct gen_pool *pool, unsigned long start, size_t size) { bool found = false; @@ -568,7 +568,7 @@ bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, rcu_read_unlock(); return found; } -EXPORT_SYMBOL(addr_in_gen_pool); +EXPORT_SYMBOL(gen_pool_has_addr); /** * gen_pool_avail - get available free space of the pool -- Gitee From f7ed14d49d8c7b613ce4a0e6f40f40a4b082ced7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 14:44:38 +0100 Subject: [PATCH 0971/2161] dma-direct: improve swiotlb error reporting commit 4a47cbae04844f0c5e2365aa6c217b61850bb832 upstream. Untangle the way how dma_direct_map_page calls into swiotlb to be able to properly report errors where the swiotlb DMA address overflows the mask separately from overflows in the !swiotlb case. This means that siotlb_map now has to do a little more work that duplicates dma_direct_map_page, but doing so greatly simplifies the calling convention. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/xen/swiotlb-xen.c | 3 +-- include/linux/swiotlb.h | 11 +++-------- kernel/dma/direct.c | 16 +++++++--------- kernel/dma/swiotlb.c | 39 +++++++++++++++++++++------------------ 4 files changed, 32 insertions(+), 37 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 5fbadd07819b..43dce90059e5 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -392,8 +392,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, */ trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); - map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, - size, size, dir, attrs); + map = swiotlb_tbl_map_single(dev, phys, size, size, dir, attrs); if (map == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 16361efb7ac9..d9a911efbb88 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -61,6 +61,9 @@ extern void swiotlb_tbl_sync_single(struct device *hwdev, size_t size, enum dma_data_direction dir, enum dma_sync_target target); +dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, unsigned long attrs); + #ifdef CONFIG_SWIOTLB extern enum swiotlb_force swiotlb_force; extern phys_addr_t io_tlb_start, io_tlb_end; @@ -70,8 +73,6 @@ static inline bool is_swiotlb_buffer(phys_addr_t paddr) return paddr >= io_tlb_start && paddr < io_tlb_end; } -bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, - size_t size, enum dma_data_direction dir, unsigned long attrs); void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); @@ -82,12 +83,6 @@ static inline bool is_swiotlb_buffer(phys_addr_t paddr) { return false; } -static inline bool swiotlb_map(struct device *dev, phys_addr_t *phys, - dma_addr_t *dma_addr, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - return false; -} static inline void swiotlb_exit(void) { } diff --git 
a/kernel/dma/direct.c b/kernel/dma/direct.c index 847def538d10..6edc909e15f6 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -356,13 +356,6 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, EXPORT_SYMBOL(dma_direct_unmap_sg); #endif -static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, - size_t size) -{ - return swiotlb_force != SWIOTLB_FORCE && - dma_capable(dev, dma_addr, size, true); -} - dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -370,8 +363,13 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dma_addr = phys_to_dma(dev, phys); - if (unlikely(!dma_direct_possible(dev, dma_addr, size)) && - !swiotlb_map(dev, &phys, &dma_addr, size, dir, attrs)) { + if (unlikely(swiotlb_force == SWIOTLB_FORCE)) + return swiotlb_map(dev, phys, size, dir, attrs); + + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (swiotlb_force != SWIOTLB_NO_FORCE) + return swiotlb_map(dev, phys, size, dir, attrs); + report_addr(dev, dma_addr, size); return DMA_MAPPING_ERROR; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 859c624f0f41..bd86c09c9f1e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -657,35 +658,37 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, } /* - * Create a swiotlb mapping for the buffer at @phys, and in case of DMAing + * Create a swiotlb mapping for the buffer at @paddr, and in case of DMAing * to the device copy the data into it as well. */ -bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) +dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, + enum dma_data_direction dir, unsigned long attrs) { - trace_swiotlb_bounced(dev, *dma_addr, size, swiotlb_force); + phys_addr_t swiotlb_addr; + dma_addr_t dma_addr; - if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) { - dev_warn_ratelimited(dev, - "Cannot do DMA to address %pa\n", phys); - return false; - } + trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, + swiotlb_force); - /* Oh well, have to allocate and map a bounce buffer. 
*/ - *phys = swiotlb_tbl_map_single(dev, *phys, size, size, dir, + swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, dir, attrs); - if (*phys == (phys_addr_t)DMA_MAPPING_ERROR) - return false; + if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) + return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ - *dma_addr = phys_to_dma_unencrypted(dev, *phys); - if (unlikely(!dma_capable(dev, *dma_addr, size, true))) { - swiotlb_tbl_unmap_single(dev, *phys, size, size, dir, + dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr); + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return false; + dev_WARN_ONCE(dev, 1, + "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; } - return true; + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(swiotlb_addr, size, dir); + return dma_addr; } size_t swiotlb_max_mapping_size(struct device *dev) -- Gitee From 8274f4c1da409bece331e8464e6d9beafff497ea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 14:54:50 +0100 Subject: [PATCH 0972/2161] dma-direct: improve DMA mask overflow reporting commit 75467ee48a5e04cf3ae3cb39aea6adee73aeff91 upstream. Remove the unset dma_mask case as that won't get into mapping calls anymore, and also report the other errors unconditonally and with a slightly improved message. Remove the now pointless report_addr helper. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/direct.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 6edc909e15f6..b311cd5829a9 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -24,18 +24,6 @@ #define ARCH_ZONE_DMA_BITS 24 #endif -static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) -{ - if (!dev->dma_mask) { - dev_err_once(dev, "DMA map on device without dma_mask\n"); - } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_limit) { - dev_err_once(dev, - "overflow %pad+%zu of DMA mask %llx bus limit %llx\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - } - WARN_ON_ONCE(1); -} - static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) { @@ -370,7 +358,9 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, if (swiotlb_force != SWIOTLB_NO_FORCE) return swiotlb_map(dev, phys, size, dir, attrs); - report_addr(dev, dma_addr, size); + dev_WARN_ONCE(dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); return DMA_MAPPING_ERROR; } @@ -408,7 +398,10 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, dma_addr_t dma_addr = paddr; if (unlikely(!dma_capable(dev, dma_addr, size, false))) { - report_addr(dev, dma_addr, size); + dev_err_once(dev, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + WARN_ON_ONCE(1); return DMA_MAPPING_ERROR; } -- Gitee From 091c79211b09650b9b213ac397c9b77d9eb24f32 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Feb 2020 12:24:02 -0800 Subject: [PATCH 0973/2161] dma-direct: remove the cached_kernel_address hook commit 
4f8232bbf887123f78bcdca3dfd2b3dfa52a0112 upstream. dma-direct now finds the kernel address for coherent allocations based on the dma address, so the cached_kernel_address hooks is unused and can be removed entirely. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/Kconfig | 2 +- arch/microblaze/mm/consistent.c | 7 ------- arch/mips/mm/dma-noncoherent.c | 5 ----- arch/nios2/mm/dma-mapping.c | 10 ---------- include/linux/dma-noncoherent.h | 1 - 5 files changed, 1 insertion(+), 24 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index b4c9475c053a..a1424bd07d25 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -265,7 +265,7 @@ config ARCH_HAS_SET_DIRECT_MAP # # Select if arch has an uncached kernel segment and provides the -# uncached_kernel_address / cached_kernel_address symbols to use it +# uncached_kernel_address symbol to use it # config ARCH_HAS_UNCACHED_SEGMENT select ARCH_HAS_DMA_PREP_COHERENT diff --git a/arch/microblaze/mm/consistent.c b/arch/microblaze/mm/consistent.c index 8c5f0c332d8b..cede7c5e8135 100644 --- a/arch/microblaze/mm/consistent.c +++ b/arch/microblaze/mm/consistent.c @@ -49,11 +49,4 @@ void *uncached_kernel_address(void *ptr) pr_warn("ERROR: Your cache coherent area is CACHED!!!\n"); return (void *)addr; } - -void *cached_kernel_address(void *ptr) -{ - unsigned long addr = (unsigned long)ptr; - - return (void *)(addr & ~UNCACHED_SHADOW_MASK); -} #endif /* CONFIG_MMU */ diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c index dc42ffc83825..77dce28ad0a0 100644 --- a/arch/mips/mm/dma-noncoherent.c +++ b/arch/mips/mm/dma-noncoherent.c @@ -54,11 +54,6 @@ void *uncached_kernel_address(void *addr) return (void *)(__pa(addr) + UNCAC_BASE); } -void *cached_kernel_address(void *addr) -{ - return __va(addr) - UNCAC_BASE; -} - static inline void dma_sync_virt(void *addr, size_t size, enum dma_data_direction dir) { diff --git a/arch/nios2/mm/dma-mapping.c b/arch/nios2/mm/dma-mapping.c index 0ed711e37902..f30f2749257c 100644 --- a/arch/nios2/mm/dma-mapping.c +++ b/arch/nios2/mm/dma-mapping.c @@ -75,13 +75,3 @@ void *uncached_kernel_address(void *ptr) return (void *)ptr; } - -void *cached_kernel_address(void *ptr) -{ - unsigned long addr = (unsigned long)ptr; - - addr &= ~CONFIG_NIOS2_IO_REGION_BASE; - addr |= CONFIG_NIOS2_KERNEL_REGION_BASE; - - return (void *)ptr; -} diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h index ca9b5770caee..b6b72e19b0cd 100644 --- a/include/linux/dma-noncoherent.h +++ b/include/linux/dma-noncoherent.h @@ -109,6 +109,5 @@ static inline void arch_dma_prep_coherent(struct page *page, size_t size) #endif /* CONFIG_ARCH_HAS_DMA_PREP_COHERENT */ void *uncached_kernel_address(void *addr); -void *cached_kernel_address(void *addr); #endif /* _LINUX_DMA_NONCOHERENT_H */ -- Gitee From ada5e75d878d6e5d59a2691706ed7eed3c844dd2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Feb 2020 12:26:00 -0800 Subject: [PATCH 0974/2161] dma-direct: consolidate the error handling in dma_direct_alloc_pages commit 3d0fc341c4bb66b2c41c0d1ec954a6d300e100b7 upstream. Use a goto label to merge two error return cases. 
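The change applies the common kernel error-unwind idiom: every failure path jumps to a single cleanup label instead of duplicating the undo steps inline. A standalone sketch of the idiom with hypothetical resource names (not the dma-direct code itself):

#include <stdio.h>
#include <stdlib.h>

/* Two allocations, one shared cleanup path behind goto labels. */
static int setup_two_buffers(size_t n, char **a_out, char **b_out)
{
        char *a, *b;

        a = malloc(n);
        if (!a)
                goto out_err;           /* nothing to undo yet */
        b = malloc(n);
        if (!b)
                goto out_free_a;        /* undo only what succeeded */

        *a_out = a;
        *b_out = b;
        return 0;

out_free_a:
        free(a);
out_err:
        return -1;
}

int main(void)
{
        char *a, *b;

        if (setup_two_buffers(64, &a, &b) == 0) {
                puts("ok");
                free(a);
                free(b);
        }
        return 0;
}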
Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/direct.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index b311cd5829a9..6717ad3d0892 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -159,11 +159,8 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, ret = dma_common_contiguous_remap(page, PAGE_ALIGN(size), dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); - if (!ret) { - dma_free_contiguous(dev, page, size); - return ret; - } - + if (!ret) + goto out_free_pages; memset(ret, 0, size); goto done; } @@ -176,8 +173,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, * so log an error and fail. */ dev_info(dev, "Rejecting highmem page from CMA.\n"); - dma_free_contiguous(dev, page, size); - return NULL; + goto out_free_pages; } ret = page_address(page); @@ -194,6 +190,9 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, done: *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return ret; +out_free_pages: + dma_free_contiguous(dev, page, size); + return NULL; } void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, -- Gitee From 3bbbdd10c4539c72abcdc53e92940d322890194d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Feb 2020 15:55:43 -0800 Subject: [PATCH 0975/2161] dma-direct: make uncached_kernel_address more general commit fa7e2247c5729f990c7456fe09f3af99c8f2571b upstream. Rename the symbol to arch_dma_set_uncached, and pass a size to it as well as allow an error return. That will allow reusing this hook for in-place pagetable remapping. As the in-place remap doesn't always require an explicit cache flush, also detangle ARCH_HAS_DMA_PREP_COHERENT from ARCH_HAS_DMA_SET_UNCACHED. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/Kconfig | 8 ++++---- arch/microblaze/mm/consistent.c | 2 +- arch/mips/Kconfig | 3 ++- arch/mips/mm/dma-noncoherent.c | 2 +- arch/nios2/Kconfig | 3 ++- arch/nios2/mm/dma-mapping.c | 2 +- include/linux/dma-noncoherent.h | 2 +- kernel/dma/direct.c | 10 ++++++---- 8 files changed, 18 insertions(+), 14 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index a1424bd07d25..3f9931ba331f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -264,11 +264,11 @@ config ARCH_HAS_SET_DIRECT_MAP bool # -# Select if arch has an uncached kernel segment and provides the -# uncached_kernel_address symbol to use it +# Select if the architecture provides the arch_dma_set_uncached symbol to +# either provide an uncached segement alias for a DMA allocation, or +# to remap the page tables in place. 
# -config ARCH_HAS_UNCACHED_SEGMENT - select ARCH_HAS_DMA_PREP_COHERENT +config ARCH_HAS_DMA_SET_UNCACHED bool # Select if arch init_task must go in the __init_task_data section diff --git a/arch/microblaze/mm/consistent.c b/arch/microblaze/mm/consistent.c index cede7c5e8135..e09b66e43cb6 100644 --- a/arch/microblaze/mm/consistent.c +++ b/arch/microblaze/mm/consistent.c @@ -40,7 +40,7 @@ void arch_dma_prep_coherent(struct page *page, size_t size) #define UNCACHED_SHADOW_MASK 0 #endif /* CONFIG_XILINX_UNCACHED_SHADOW */ -void *uncached_kernel_address(void *ptr) +void *arch_dma_set_uncached(void *ptr, size_t size) { unsigned long addr = (unsigned long)ptr; diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 3a8acc0c127b..f586c22eab92 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -1133,8 +1133,9 @@ config DMA_NONCOHERENT # significant advantages. # select ARCH_HAS_DMA_WRITE_COMBINE + select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select ARCH_HAS_UNCACHED_SEGMENT + select ARCH_HAS_DMA_SET_UNCACHED select DMA_NONCOHERENT_MMAP select DMA_NONCOHERENT_CACHE_SYNC select NEED_DMA_MAP_STATE diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c index 77dce28ad0a0..fcea92d95d86 100644 --- a/arch/mips/mm/dma-noncoherent.c +++ b/arch/mips/mm/dma-noncoherent.c @@ -49,7 +49,7 @@ void arch_dma_prep_coherent(struct page *page, size_t size) dma_cache_wback_inv((unsigned long)page_address(page), size); } -void *uncached_kernel_address(void *addr) +void *arch_dma_set_uncached(void *addr, size_t size) { return (void *)(__pa(addr) + UNCAC_BASE); } diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 44b5da37e8bd..2fc4ed210b5f 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -2,9 +2,10 @@ config NIOS2 def_bool y select ARCH_32BIT_OFF_T + select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select ARCH_HAS_UNCACHED_SEGMENT + select ARCH_HAS_DMA_SET_UNCACHED select ARCH_NO_SWAP select TIMER_OF select GENERIC_ATOMIC64 diff --git a/arch/nios2/mm/dma-mapping.c b/arch/nios2/mm/dma-mapping.c index f30f2749257c..fd887d5f3f9a 100644 --- a/arch/nios2/mm/dma-mapping.c +++ b/arch/nios2/mm/dma-mapping.c @@ -67,7 +67,7 @@ void arch_dma_prep_coherent(struct page *page, size_t size) flush_dcache_range(start, start + size); } -void *uncached_kernel_address(void *ptr) +void *arch_dma_set_uncached(void *ptr, size_t size) { unsigned long addr = (unsigned long)ptr; diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h index b6b72e19b0cd..1a4039506673 100644 --- a/include/linux/dma-noncoherent.h +++ b/include/linux/dma-noncoherent.h @@ -108,6 +108,6 @@ static inline void arch_dma_prep_coherent(struct page *page, size_t size) } #endif /* CONFIG_ARCH_HAS_DMA_PREP_COHERENT */ -void *uncached_kernel_address(void *addr); +void *arch_dma_set_uncached(void *addr, size_t size); #endif /* _LINUX_DMA_NONCOHERENT_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 6717ad3d0892..9ba00903a3c5 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -182,10 +182,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, memset(ret, 0, size); - if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && dma_alloc_need_uncached(dev, attrs)) { arch_dma_prep_coherent(page, size); - ret = uncached_kernel_address(ret); + ret = arch_dma_set_uncached(ret, size); + if (IS_ERR(ret)) + goto out_free_pages; } done: *dma_handle = 
phys_to_dma_direct(dev, page_to_phys(page)); @@ -223,7 +225,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); @@ -233,7 +235,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs)) arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); -- Gitee From 087574b4a239ef86e5f233e2b4f37d4176b90d84 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Feb 2020 12:35:05 -0800 Subject: [PATCH 0976/2161] dma-direct: provide a arch_dma_clear_uncached hook commit 999a5d1203baa7cff00586361feae263ee3f23a5 upstream. This allows the arch code to reset the page tables to cached access when freeing a dma coherent allocation that was set to uncached using arch_dma_set_uncached. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/Kconfig | 7 +++++++ include/linux/dma-noncoherent.h | 1 + kernel/dma/direct.c | 2 ++ 3 files changed, 10 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 3f9931ba331f..eaa0738e0539 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -271,6 +271,13 @@ config ARCH_HAS_SET_DIRECT_MAP config ARCH_HAS_DMA_SET_UNCACHED bool +# +# Select if the architectures provides the arch_dma_clear_uncached symbol +# to undo an in-place page table remap for uncached access. +# +config ARCH_HAS_DMA_CLEAR_UNCACHED + bool + # Select if arch init_task must go in the __init_task_data section config ARCH_TASK_STRUCT_ON_STACK bool diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h index 1a4039506673..b59f1b6be3e9 100644 --- a/include/linux/dma-noncoherent.h +++ b/include/linux/dma-noncoherent.h @@ -109,5 +109,6 @@ static inline void arch_dma_prep_coherent(struct page *page, size_t size) #endif /* CONFIG_ARCH_HAS_DMA_PREP_COHERENT */ void *arch_dma_set_uncached(void *addr, size_t size); +void arch_dma_clear_uncached(void *addr, size_t size); #endif /* _LINUX_DMA_NONCOHERENT_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 9ba00903a3c5..e342dc52a39f 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -218,6 +218,8 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) vunmap(cpu_addr); + else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) + arch_dma_clear_uncached(cpu_addr, size); dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size); } -- Gitee From 29333fcb389db26dbaaadfff65ec045f68e683d9 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:04:52 -0700 Subject: [PATCH 0977/2161] dma-remap: separate DMA atomic pools from direct remap code commit e860c299ac0d738b44ff91693f11e63080a29698 upstream. DMA atomic pools will be needed beyond only CONFIG_DMA_DIRECT_REMAP so separate them out into their own file. 
This also adds a new Kconfig option that can be subsequently used for options, such as CONFIG_AMD_MEM_ENCRYPT, that will utilize the coherent pools but do not have a dependency on direct remapping. For this patch alone, there is no functional change introduced. Reviewed-by: Christoph Hellwig Signed-off-by: David Rientjes [hch: fixup copyrights and remove unused includes] Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/Kconfig | 6 ++- kernel/dma/Makefile | 1 + kernel/dma/pool.c | 123 ++++++++++++++++++++++++++++++++++++++++++++ kernel/dma/remap.c | 121 +------------------------------------------ 4 files changed, 130 insertions(+), 121 deletions(-) create mode 100644 kernel/dma/pool.c diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 4c103a24e380..d006668c0027 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -79,10 +79,14 @@ config DMA_REMAP select DMA_NONCOHERENT_MMAP bool -config DMA_DIRECT_REMAP +config DMA_COHERENT_POOL bool select DMA_REMAP +config DMA_DIRECT_REMAP + bool + select DMA_COHERENT_POOL + config DMA_CMA bool "DMA Contiguous Memory Allocator" depends on HAVE_DMA_CONTIGUOUS && CMA diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index d237cf3dc181..370f63344e9c 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -6,4 +6,5 @@ obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o +obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o obj-$(CONFIG_DMA_REMAP) += remap.o diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c new file mode 100644 index 000000000000..3df5d9d39922 --- /dev/null +++ b/kernel/dma/pool.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2012 ARM Ltd. 
+ * Copyright (C) 2020 Google LLC + */ +#include +#include +#include +#include +#include +#include + +static struct gen_pool *atomic_pool __ro_after_init; + +#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K +static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; + +static int __init early_coherent_pool(char *p) +{ + atomic_pool_size = memparse(p, &p); + return 0; +} +early_param("coherent_pool", early_coherent_pool); + +static gfp_t dma_atomic_pool_gfp(void) +{ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + return GFP_DMA; + if (IS_ENABLED(CONFIG_ZONE_DMA32)) + return GFP_DMA32; + return GFP_KERNEL; +} + +static int __init dma_atomic_pool_init(void) +{ + unsigned int pool_size_order = get_order(atomic_pool_size); + unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; + struct page *page; + void *addr; + int ret; + + if (dev_get_cma_area(NULL)) + page = dma_alloc_from_contiguous(NULL, nr_pages, + pool_size_order, false); + else + page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order); + if (!page) + goto out; + + arch_dma_prep_coherent(page, atomic_pool_size); + + atomic_pool = gen_pool_create(PAGE_SHIFT, -1); + if (!atomic_pool) + goto free_page; + + addr = dma_common_contiguous_remap(page, atomic_pool_size, + pgprot_dmacoherent(PAGE_KERNEL), + __builtin_return_address(0)); + if (!addr) + goto destroy_genpool; + + ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, + page_to_phys(page), atomic_pool_size, -1); + if (ret) + goto remove_mapping; + gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL); + + pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", + atomic_pool_size / 1024); + return 0; + +remove_mapping: + dma_common_free_remap(addr, atomic_pool_size); +destroy_genpool: + gen_pool_destroy(atomic_pool); + atomic_pool = NULL; +free_page: + if (!dma_release_from_contiguous(NULL, page, nr_pages)) + __free_pages(page, pool_size_order); +out: + pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n", + atomic_pool_size / 1024); + return -ENOMEM; +} +postcore_initcall(dma_atomic_pool_init); + +bool dma_in_atomic_pool(void *start, size_t size) +{ + if (unlikely(!atomic_pool)) + return false; + + return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); +} + +void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) +{ + unsigned long val; + void *ptr = NULL; + + if (!atomic_pool) { + WARN(1, "coherent pool not initialised!\n"); + return NULL; + } + + val = gen_pool_alloc(atomic_pool, size); + if (val) { + phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); + + *ret_page = pfn_to_page(__phys_to_pfn(phys)); + ptr = (void *)val; + memset(ptr, 0, size); + } + + return ptr; +} + +bool dma_free_from_pool(void *start, size_t size) +{ + if (!dma_in_atomic_pool(start, size)) + return false; + gen_pool_free(atomic_pool, (unsigned long)start, size); + return true; +} diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index d14cbc83986a..f7b402849891 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -1,13 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (C) 2012 ARM Ltd. 
* Copyright (c) 2014 The Linux Foundation */ -#include -#include -#include -#include -#include +#include #include #include @@ -97,117 +92,3 @@ void dma_common_free_remap(void *cpu_addr, size_t size) unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size)); vunmap(cpu_addr); } - -#ifdef CONFIG_DMA_DIRECT_REMAP -static struct gen_pool *atomic_pool __ro_after_init; - -#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; - -static int __init early_coherent_pool(char *p) -{ - atomic_pool_size = memparse(p, &p); - return 0; -} -early_param("coherent_pool", early_coherent_pool); - -static gfp_t dma_atomic_pool_gfp(void) -{ - if (IS_ENABLED(CONFIG_ZONE_DMA)) - return GFP_DMA; - if (IS_ENABLED(CONFIG_ZONE_DMA32)) - return GFP_DMA32; - return GFP_KERNEL; -} - -static int __init dma_atomic_pool_init(void) -{ - unsigned int pool_size_order = get_order(atomic_pool_size); - unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; - struct page *page; - void *addr; - int ret; - - if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, nr_pages, - pool_size_order, false); - else - page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order); - if (!page) - goto out; - - arch_dma_prep_coherent(page, atomic_pool_size); - - atomic_pool = gen_pool_create(PAGE_SHIFT, -1); - if (!atomic_pool) - goto free_page; - - addr = dma_common_contiguous_remap(page, atomic_pool_size, - pgprot_dmacoherent(PAGE_KERNEL), - __builtin_return_address(0)); - if (!addr) - goto destroy_genpool; - - ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, - page_to_phys(page), atomic_pool_size, -1); - if (ret) - goto remove_mapping; - gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL); - - pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", - atomic_pool_size / 1024); - return 0; - -remove_mapping: - dma_common_free_remap(addr, atomic_pool_size); -destroy_genpool: - gen_pool_destroy(atomic_pool); - atomic_pool = NULL; -free_page: - if (!dma_release_from_contiguous(NULL, page, nr_pages)) - __free_pages(page, pool_size_order); -out: - pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n", - atomic_pool_size / 1024); - return -ENOMEM; -} -postcore_initcall(dma_atomic_pool_init); - -bool dma_in_atomic_pool(void *start, size_t size) -{ - if (unlikely(!atomic_pool)) - return false; - - return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); -} - -void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) -{ - unsigned long val; - void *ptr = NULL; - - if (!atomic_pool) { - WARN(1, "coherent pool not initialised!\n"); - return NULL; - } - - val = gen_pool_alloc(atomic_pool, size); - if (val) { - phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); - - *ret_page = pfn_to_page(__phys_to_pfn(phys)); - ptr = (void *)val; - memset(ptr, 0, size); - } - - return ptr; -} - -bool dma_free_from_pool(void *start, size_t size) -{ - if (!dma_in_atomic_pool(start, size)) - return false; - gen_pool_free(atomic_pool, (unsigned long)start, size); - return true; -} -#endif /* CONFIG_DMA_DIRECT_REMAP */ -- Gitee From 4eec1c9a63cce3161096b13fc3d066a4d282c40e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:04:55 -0700 Subject: [PATCH 0978/2161] dma-pool: add additional coherent pools to map to gfp mask commit c84dc6e68a1d2464e050d9694be4e4ff49e32bfd upstream. 
The single atomic pool is allocated from the lowest zone possible since it is guaranteed to be applicable for any DMA allocation. Devices may allocate through the DMA API but not have a strict reliance on GFP_DMA memory. Since the atomic pool will be used for all non-blockable allocations, returning all memory from ZONE_DMA may unnecessarily deplete the zone. Provision for multiple atomic pools that will map to the optimal gfp mask of the device. When allocating non-blockable memory, determine the optimal gfp mask of the device and use the appropriate atomic pool. The coherent DMA mask will remain the same between allocation and free and, thus, memory will be freed to the same atomic pool it was allocated from. __dma_atomic_pool_init() will be changed to return struct gen_pool * later once dynamic expansion is added. Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 5 +- include/linux/dma-direct.h | 2 + include/linux/dma-mapping.h | 6 +- kernel/dma/direct.c | 12 ++-- kernel/dma/pool.c | 120 +++++++++++++++++++++++------------- 5 files changed, 91 insertions(+), 54 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 806c338c5062..3c3a395ecbc4 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -937,7 +937,7 @@ static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr) /* Non-coherent atomic allocation? Easy */ if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_free_from_pool(cpu_addr, alloc_size)) + dma_free_from_pool(dev, cpu_addr, alloc_size)) return; if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) { @@ -1020,7 +1020,8 @@ static void *iommu_dma_alloc(struct device *dev, size_t size, if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !gfpflags_allow_blocking(gfp) && !coherent) - cpu_addr = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp); + cpu_addr = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, + gfp); else cpu_addr = iommu_dma_alloc_pages(dev, size, &page, gfp, attrs); if (!cpu_addr) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index a4a8acb6073e..1a729c4c5699 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -69,6 +69,8 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size, } u64 dma_direct_get_required_mask(struct device *dev); +gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, + u64 *phys_mask); void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index bb17ef73242a..d6586cf1ecf6 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -639,9 +639,9 @@ void *dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot, const void *caller); void dma_common_free_remap(void *cpu_addr, size_t size); -bool dma_in_atomic_pool(void *start, size_t size); -void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags); -bool dma_free_from_pool(void *start, size_t size); +void *dma_alloc_from_pool(struct device *dev, size_t size, + struct page **ret_page, gfp_t flags); +bool dma_free_from_pool(struct device *dev, void *start, size_t size); int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c 
index e342dc52a39f..1a396c657536 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -46,8 +46,8 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } -static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, - u64 *phys_limit) +gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, + u64 *phys_limit) { u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); @@ -90,8 +90,8 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, /* we always manually zero the memory once we are done: */ gfp &= ~__GFP_ZERO; - gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_limit); + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_limit); page = dma_alloc_contiguous(dev, alloc_size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, alloc_size); @@ -129,7 +129,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs) && !gfpflags_allow_blocking(gfp)) { - ret = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp); + ret = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, gfp); if (!ret) return NULL; goto done; @@ -210,7 +210,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, } if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_free_from_pool(cpu_addr, PAGE_ALIGN(size))) + dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) return; if (force_dma_unencrypted(dev)) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 3df5d9d39922..db4f89ac5f5f 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -10,7 +10,9 @@ #include #include -static struct gen_pool *atomic_pool __ro_after_init; +static struct gen_pool *atomic_pool_dma __ro_after_init; +static struct gen_pool *atomic_pool_dma32 __ro_after_init; +static struct gen_pool *atomic_pool_kernel __ro_after_init; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; @@ -22,89 +24,119 @@ static int __init early_coherent_pool(char *p) } early_param("coherent_pool", early_coherent_pool); -static gfp_t dma_atomic_pool_gfp(void) +static int __init __dma_atomic_pool_init(struct gen_pool **pool, + size_t pool_size, gfp_t gfp) { - if (IS_ENABLED(CONFIG_ZONE_DMA)) - return GFP_DMA; - if (IS_ENABLED(CONFIG_ZONE_DMA32)) - return GFP_DMA32; - return GFP_KERNEL; -} - -static int __init dma_atomic_pool_init(void) -{ - unsigned int pool_size_order = get_order(atomic_pool_size); - unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; + const unsigned int order = get_order(pool_size); + const unsigned long nr_pages = pool_size >> PAGE_SHIFT; struct page *page; void *addr; int ret; if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, nr_pages, - pool_size_order, false); + page = dma_alloc_from_contiguous(NULL, nr_pages, order, false); else - page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order); + page = alloc_pages(gfp, order); if (!page) goto out; - arch_dma_prep_coherent(page, atomic_pool_size); + arch_dma_prep_coherent(page, pool_size); - atomic_pool = gen_pool_create(PAGE_SHIFT, -1); - if (!atomic_pool) + *pool = gen_pool_create(PAGE_SHIFT, -1); + if (!*pool) goto free_page; - addr = dma_common_contiguous_remap(page, atomic_pool_size, + addr = dma_common_contiguous_remap(page, pool_size, pgprot_dmacoherent(PAGE_KERNEL), __builtin_return_address(0)); if (!addr) 
goto destroy_genpool; - ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, - page_to_phys(page), atomic_pool_size, -1); + ret = gen_pool_add_virt(*pool, (unsigned long)addr, page_to_phys(page), + pool_size, -1); if (ret) goto remove_mapping; - gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL); + gen_pool_set_algo(*pool, gen_pool_first_fit_order_align, NULL); - pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", - atomic_pool_size / 1024); + pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n", + pool_size >> 10, &gfp); return 0; remove_mapping: - dma_common_free_remap(addr, atomic_pool_size); + dma_common_free_remap(addr, pool_size); destroy_genpool: - gen_pool_destroy(atomic_pool); - atomic_pool = NULL; + gen_pool_destroy(*pool); + *pool = NULL; free_page: if (!dma_release_from_contiguous(NULL, page, nr_pages)) - __free_pages(page, pool_size_order); + __free_pages(page, order); out: - pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n", - atomic_pool_size / 1024); + pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n", + pool_size >> 10, &gfp); return -ENOMEM; } + +static int __init dma_atomic_pool_init(void) +{ + int ret = 0; + int err; + + ret = __dma_atomic_pool_init(&atomic_pool_kernel, atomic_pool_size, + GFP_KERNEL); + if (IS_ENABLED(CONFIG_ZONE_DMA)) { + err = __dma_atomic_pool_init(&atomic_pool_dma, + atomic_pool_size, GFP_DMA); + if (!ret && err) + ret = err; + } + if (IS_ENABLED(CONFIG_ZONE_DMA32)) { + err = __dma_atomic_pool_init(&atomic_pool_dma32, + atomic_pool_size, GFP_DMA32); + if (!ret && err) + ret = err; + } + return ret; +} postcore_initcall(dma_atomic_pool_init); -bool dma_in_atomic_pool(void *start, size_t size) +static inline struct gen_pool *dev_to_pool(struct device *dev) { - if (unlikely(!atomic_pool)) - return false; + u64 phys_mask; + gfp_t gfp; + + gfp = dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); + if (IS_ENABLED(CONFIG_ZONE_DMA) && gfp == GFP_DMA) + return atomic_pool_dma; + if (IS_ENABLED(CONFIG_ZONE_DMA32) && gfp == GFP_DMA32) + return atomic_pool_dma32; + return atomic_pool_kernel; +} - return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); +static bool dma_in_atomic_pool(struct device *dev, void *start, size_t size) +{ + struct gen_pool *pool = dev_to_pool(dev); + + if (unlikely(!pool)) + return false; + return gen_pool_has_addr(pool, (unsigned long)start, size); } -void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) +void *dma_alloc_from_pool(struct device *dev, size_t size, + struct page **ret_page, gfp_t flags) { + struct gen_pool *pool = dev_to_pool(dev); unsigned long val; void *ptr = NULL; - if (!atomic_pool) { - WARN(1, "coherent pool not initialised!\n"); + if (!pool) { + WARN(1, "%pGg atomic pool not initialised!\n", &flags); return NULL; } - val = gen_pool_alloc(atomic_pool, size); + val = gen_pool_alloc(pool, size); if (val) { - phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); + phys_addr_t phys = gen_pool_virt_to_phys(pool, val); *ret_page = pfn_to_page(__phys_to_pfn(phys)); ptr = (void *)val; @@ -114,10 +146,12 @@ void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) return ptr; } -bool dma_free_from_pool(void *start, size_t size) +bool dma_free_from_pool(struct device *dev, void *start, size_t size) { - if (!dma_in_atomic_pool(start, size)) + struct gen_pool *pool = dev_to_pool(dev); + + if (!dma_in_atomic_pool(dev, start, size)) return 
false; - gen_pool_free(atomic_pool, (unsigned long)start, size); + gen_pool_free(pool, (unsigned long)start, size); return true; } -- Gitee From 325e4715847f48758b02d8b3e0cbb92de2a9668c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 20 Apr 2020 12:09:58 +0200 Subject: [PATCH 0979/2161] dma-pool: dynamically expanding atomic pools commit 54adadf9b08571fb8b11dc9d0d3a2ddd39825efd upstream. When an atomic pool becomes fully depleted because it is now relied upon for all non-blocking allocations through the DMA API, allow background expansion of each pool by a kworker. When an atomic pool has less than the default size of memory left, kick off a kworker to dynamically expand the pool in the background. The pool is doubled in size, up to MAX_ORDER-1. If memory cannot be allocated at the requested order, smaller allocation(s) are attempted. This allows the default size to be kept quite low when one or more of the atomic pools is not used. Allocations for lowmem should also use GFP_KERNEL for the benefits of reclaim, so use GFP_KERNEL | GFP_DMA and GFP_KERNEL | GFP_DMA32 for lowmem allocations. This also allows __dma_atomic_pool_init() to return a pointer to the pool to make initialization cleaner. Also switch over some node ids to the more appropriate NUMA_NO_NODE. Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/pool.c | 122 +++++++++++++++++++++++++++++++--------------- 1 file changed, 84 insertions(+), 38 deletions(-) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index db4f89ac5f5f..ffe866c2c034 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -9,13 +9,17 @@ #include #include #include +#include static struct gen_pool *atomic_pool_dma __ro_after_init; static struct gen_pool *atomic_pool_dma32 __ro_after_init; static struct gen_pool *atomic_pool_kernel __ro_after_init; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; +static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; + +/* Dynamic background expansion when the atomic pool is near capacity */ +static struct work_struct atomic_pool_work; static int __init early_coherent_pool(char *p) { @@ -24,76 +28,116 @@ static int __init early_coherent_pool(char *p) } early_param("coherent_pool", early_coherent_pool); -static int __init __dma_atomic_pool_init(struct gen_pool **pool, - size_t pool_size, gfp_t gfp) +static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, + gfp_t gfp) { - const unsigned int order = get_order(pool_size); - const unsigned long nr_pages = pool_size >> PAGE_SHIFT; + unsigned int order; struct page *page; void *addr; - int ret; + int ret = -ENOMEM; + + /* Cannot allocate larger than MAX_ORDER-1 */ + order = min(get_order(pool_size), MAX_ORDER-1); + + do { + pool_size = 1 << (PAGE_SHIFT + order); - if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, nr_pages, order, false); - else - page = alloc_pages(gfp, order); + if (dev_get_cma_area(NULL)) + page = dma_alloc_from_contiguous(NULL, 1 << order, + order, false); + else + page = alloc_pages(gfp, order); + } while (!page && order-- > 0); if (!page) goto out; arch_dma_prep_coherent(page, pool_size); - *pool = gen_pool_create(PAGE_SHIFT, -1); - if (!*pool) - goto free_page; - addr = dma_common_contiguous_remap(page, pool_size, pgprot_dmacoherent(PAGE_KERNEL), __builtin_return_address(0)); if (!addr) - goto destroy_genpool; + goto free_page; - ret = 
gen_pool_add_virt(*pool, (unsigned long)addr, page_to_phys(page), - pool_size, -1); + ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page), + pool_size, NUMA_NO_NODE); if (ret) goto remove_mapping; - gen_pool_set_algo(*pool, gen_pool_first_fit_order_align, NULL); - pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n", - pool_size >> 10, &gfp); return 0; remove_mapping: dma_common_free_remap(addr, pool_size); -destroy_genpool: - gen_pool_destroy(*pool); - *pool = NULL; free_page: - if (!dma_release_from_contiguous(NULL, page, nr_pages)) + if (!dma_release_from_contiguous(NULL, page, 1 << order)) __free_pages(page, order); out: - pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n", - pool_size >> 10, &gfp); - return -ENOMEM; + return ret; +} + +static void atomic_pool_resize(struct gen_pool *pool, gfp_t gfp) +{ + if (pool && gen_pool_avail(pool) < atomic_pool_size) + atomic_pool_expand(pool, gen_pool_size(pool), gfp); +} + +static void atomic_pool_work_fn(struct work_struct *work) +{ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + atomic_pool_resize(atomic_pool_dma, + GFP_KERNEL | GFP_DMA); + if (IS_ENABLED(CONFIG_ZONE_DMA32)) + atomic_pool_resize(atomic_pool_dma32, + GFP_KERNEL | GFP_DMA32); + atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL); +} + +static __init struct gen_pool *__dma_atomic_pool_init(size_t pool_size, + gfp_t gfp) +{ + struct gen_pool *pool; + int ret; + + pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE); + if (!pool) + return NULL; + + gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL); + + ret = atomic_pool_expand(pool, pool_size, gfp); + if (ret) { + gen_pool_destroy(pool); + pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n", + pool_size >> 10, &gfp); + return NULL; + } + + pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n", + gen_pool_size(pool) >> 10, &gfp); + return pool; } static int __init dma_atomic_pool_init(void) { int ret = 0; - int err; - ret = __dma_atomic_pool_init(&atomic_pool_kernel, atomic_pool_size, - GFP_KERNEL); + INIT_WORK(&atomic_pool_work, atomic_pool_work_fn); + + atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL); + if (!atomic_pool_kernel) + ret = -ENOMEM; if (IS_ENABLED(CONFIG_ZONE_DMA)) { - err = __dma_atomic_pool_init(&atomic_pool_dma, - atomic_pool_size, GFP_DMA); - if (!ret && err) - ret = err; + atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL | GFP_DMA); + if (!atomic_pool_dma) + ret = -ENOMEM; } if (IS_ENABLED(CONFIG_ZONE_DMA32)) { - err = __dma_atomic_pool_init(&atomic_pool_dma32, - atomic_pool_size, GFP_DMA32); - if (!ret && err) - ret = err; + atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL | GFP_DMA32); + if (!atomic_pool_dma32) + ret = -ENOMEM; } return ret; } @@ -142,6 +186,8 @@ void *dma_alloc_from_pool(struct device *dev, size_t size, ptr = (void *)val; memset(ptr, 0, size); } + if (gen_pool_avail(pool) < atomic_pool_size) + schedule_work(&atomic_pool_work); return ptr; } -- Gitee From b774ecdaa2e36a8a255f7b8187f41273ea3db8b8 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:04:58 -0700 Subject: [PATCH 0980/2161] dma-direct: atomic allocations must come from atomic coherent pools commit 76a19940bd62a81148c303f3df6d0cee9ae4b509 upstream. When a device requires unencrypted memory and the context does not allow blocking, memory must be returned from the atomic coherent pools. 
This avoids the remap when CONFIG_DMA_DIRECT_REMAP is not enabled and the config only requires CONFIG_DMA_COHERENT_POOL. This will be used for CONFIG_AMD_MEM_ENCRYPT in a subsequent patch. Keep all memory in these pools unencrypted. When set_memory_decrypted() fails, this prohibits the memory from being added. If adding memory to the genpool fails, and set_memory_encrypted() subsequently fails, there is no alternative other than leaking the memory. Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/direct.c | 46 ++++++++++++++++++++++++++++++++++++++------- kernel/dma/pool.c | 27 +++++++++++++++++++++++--- 2 files changed, 63 insertions(+), 10 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 1a396c657536..70f0298aa9d0 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -77,6 +77,39 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } +/* + * Decrypting memory is allowed to block, so if this device requires + * unencrypted memory it must come from atomic pools. + */ +static inline bool dma_should_alloc_from_pool(struct device *dev, gfp_t gfp, + unsigned long attrs) +{ + if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) + return false; + if (gfpflags_allow_blocking(gfp)) + return false; + if (force_dma_unencrypted(dev)) + return true; + if (!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) + return false; + if (dma_alloc_need_uncached(dev, attrs)) + return true; + return false; +} + +static inline bool dma_should_free_from_pool(struct device *dev, + unsigned long attrs) +{ + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) + return true; + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && + !force_dma_unencrypted(dev)) + return false; + if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) + return true; + return false; +} + struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, unsigned long attrs) { @@ -126,9 +159,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; - if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs) && - !gfpflags_allow_blocking(gfp)) { + if (dma_should_alloc_from_pool(dev, gfp, attrs)) { ret = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, gfp); if (!ret) return NULL; @@ -202,6 +233,11 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, { unsigned int page_order = get_order(size); + /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ + if (dma_should_free_from_pool(dev, attrs) && + dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) + return; + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ @@ -209,10 +245,6 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, return; } - if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) - return; - if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index ffe866c2c034..c8d61b3a7bd6 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -53,22 +54,42 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, arch_dma_prep_coherent(page, pool_size); +#ifdef CONFIG_DMA_DIRECT_REMAP addr = 
dma_common_contiguous_remap(page, pool_size, pgprot_dmacoherent(PAGE_KERNEL), __builtin_return_address(0)); if (!addr) goto free_page; - +#else + addr = page_to_virt(page); +#endif + /* + * Memory in the atomic DMA pools must be unencrypted, the pools do not + * shrink so no re-encryption occurs in dma_direct_free_pages(). + */ + ret = set_memory_decrypted((unsigned long)page_to_virt(page), + 1 << order); + if (ret) + goto remove_mapping; ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page), pool_size, NUMA_NO_NODE); if (ret) - goto remove_mapping; + goto encrypt_mapping; return 0; +encrypt_mapping: + ret = set_memory_encrypted((unsigned long)page_to_virt(page), + 1 << order); + if (WARN_ON_ONCE(ret)) { + /* Decrypt succeeded but encrypt failed, purposely leak */ + goto out; + } remove_mapping: +#ifdef CONFIG_DMA_DIRECT_REMAP dma_common_free_remap(addr, pool_size); -free_page: +#endif +free_page: __maybe_unused if (!dma_release_from_contiguous(NULL, page, 1 << order)) __free_pages(page, order); out: -- Gitee From aaa96b25005ba920c1e244a3b1df49a4f4dcf34d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:04:59 -0700 Subject: [PATCH 0981/2161] dma-pool: add pool sizes to debugfs commit 2edc5bb3c5cc42131438460a50b7b16905c81c2a upstream. The atomic DMA pools can dynamically expand based on non-blocking allocations that need to use it. Export the sizes of each of these pools, in bytes, through debugfs for measurement. Suggested-by: Christoph Hellwig Signed-off-by: David Rientjes [hch: remove the !CONFIG_DEBUG_FS stubs] Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/pool.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index c8d61b3a7bd6..dde6de7f8e83 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -3,6 +3,7 @@ * Copyright (C) 2012 ARM Ltd. 
* Copyright (C) 2020 Google LLC */ +#include #include #include #include @@ -13,8 +14,11 @@ #include static struct gen_pool *atomic_pool_dma __ro_after_init; +static unsigned long pool_size_dma; static struct gen_pool *atomic_pool_dma32 __ro_after_init; +static unsigned long pool_size_dma32; static struct gen_pool *atomic_pool_kernel __ro_after_init; +static unsigned long pool_size_kernel; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; @@ -29,6 +33,29 @@ static int __init early_coherent_pool(char *p) } early_param("coherent_pool", early_coherent_pool); +static void __init dma_atomic_pool_debugfs_init(void) +{ + struct dentry *root; + + root = debugfs_create_dir("dma_pools", NULL); + if (IS_ERR_OR_NULL(root)) + return; + + debugfs_create_ulong("pool_size_dma", 0400, root, &pool_size_dma); + debugfs_create_ulong("pool_size_dma32", 0400, root, &pool_size_dma32); + debugfs_create_ulong("pool_size_kernel", 0400, root, &pool_size_kernel); +} + +static void dma_atomic_pool_size_add(gfp_t gfp, size_t size) +{ + if (gfp & __GFP_DMA) + pool_size_dma += size; + else if (gfp & __GFP_DMA32) + pool_size_dma32 += size; + else + pool_size_kernel += size; +} + static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, gfp_t gfp) { @@ -76,6 +103,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, if (ret) goto encrypt_mapping; + dma_atomic_pool_size_add(gfp, pool_size); return 0; encrypt_mapping: @@ -160,6 +188,8 @@ static int __init dma_atomic_pool_init(void) if (!atomic_pool_dma32) ret = -ENOMEM; } + + dma_atomic_pool_debugfs_init(); return ret; } postcore_initcall(dma_atomic_pool_init); -- Gitee From 7d1741872e6fb0b1dd2f862d05d44e1eba0bf9bc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:05:02 -0700 Subject: [PATCH 0982/2161] dma-pool: scale the default DMA coherent pool size with memory capacity commit 1d659236fb43c4d2b37af7a4309681e834e9ec9a upstream. When AMD memory encryption is enabled, some devices may use more than 256KB/sec from the atomic pools. It would be more appropriate to scale the default size based on memory capacity unless the coherent_pool option is used on the kernel command line. This provides a slight optimization on initial expansion and is deemed appropriate due to the increased reliance on the atomic pools. Note that the default size of 128KB per pool will normally be larger than the single coherent pool implementation since there are now up to three coherent pools (DMA, DMA32, and kernel). Note that even prior to this patch, coherent_pool= for sizes larger than 1 << (PAGE_SHIFT + MAX_ORDER-1) can fail. With new dynamic expansion support, this would be trivially extensible to allow even larger initial sizes. 
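For a feel of the numbers (illustrative only, not part of the patch): the policy described above works out to 128 KiB of atomic pool per 1 GiB of RAM, clamped between 128 KiB and the largest single page-allocator allocation. The standalone sketch below assumes 4 KiB pages, MAX_ORDER == 11 and a hypothetical 16 GiB machine.

    #include <stdio.h>

    int main(void)
    {
            /* assumptions: 16 GiB of RAM, 4 KiB pages, MAX_ORDER == 11 */
            unsigned long long ram = 16ULL << 30;
            unsigned long long page_size = 4096;
            unsigned int max_order = 11;

            /* 128 KiB per 1 GiB of memory ... */
            unsigned long long pool = (ram >> 30) * (128ULL << 10);
            /* ... clamped to [128 KiB, one MAX_ORDER-1 allocation] */
            unsigned long long cap = page_size << (max_order - 1);

            if (pool < (128ULL << 10))
                    pool = 128ULL << 10;
            if (pool > cap)
                    pool = cap;

            /* prints: per-pool size: 2048 KiB (cap 4096 KiB) */
            printf("per-pool size: %llu KiB (cap %llu KiB)\n",
                   pool >> 10, cap >> 10);
            return 0;
    }

With three pools (DMA, DMA32, kernel) such a machine would start with roughly 6 MiB of atomic pool memory in total, each pool still able to expand in the background as before.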
Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/pool.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index dde6de7f8e83..35bb51c31fff 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -20,8 +20,8 @@ static unsigned long pool_size_dma32; static struct gen_pool *atomic_pool_kernel __ro_after_init; static unsigned long pool_size_kernel; -#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; +/* Size can be defined by the coherent_pool command line */ +static size_t atomic_pool_size; /* Dynamic background expansion when the atomic pool is near capacity */ static struct work_struct atomic_pool_work; @@ -170,6 +170,16 @@ static int __init dma_atomic_pool_init(void) { int ret = 0; + /* + * If coherent_pool was not used on the command line, default the pool + * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1. + */ + if (!atomic_pool_size) { + atomic_pool_size = max(totalram_pages() >> PAGE_SHIFT, 1UL) * + SZ_128K; + atomic_pool_size = min_t(size_t, atomic_pool_size, + 1 << (PAGE_SHIFT + MAX_ORDER-1)); + } INIT_WORK(&atomic_pool_work, atomic_pool_work_fn); atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size, -- Gitee From 2af78e1bb24c2c91e6034af4a9d8633d78eecaa9 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 23 Apr 2020 09:31:31 -0700 Subject: [PATCH 0983/2161] dma-contiguous: fix comment for dma_release_from_contiguous commit 298f3db6ee690259927b105d5ad1079563361323 upstream. Commit 90ae409f9eb3 ("dma-direct: fix zone selection after an unaddressable CMA allocation") changed the logic in dma_release_from_contiguous to remove the normal pages fallback path, but did not update the comment. Fix that. Signed-off-by: Peter Collingbourne Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/contiguous.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 69cfb4345388..5b2920d94f83 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -221,8 +221,8 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, * @gfp: Allocation flags. * * This function allocates contiguous memory buffer for specified device. It - * first tries to use device specific contiguous memory area if available or - * the default global one, then tries a fallback allocation of normal pages. + * tries to use device specific contiguous memory area if available, or the + * default global one. * * Note that it byapss one-page size of allocations from the global area as * the addresses within one page are always contiguous, so there is no need -- Gitee From 8c3fe62a5bef52ee00cce43f8c265388058362a4 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 13 May 2020 15:32:08 +0200 Subject: [PATCH 0984/2161] dma-mapping: add generic helpers for mapping sgtable objects commit d9d200bcebc1f6e56f0178cbb8db9953e8cc9a11 upstream. struct sg_table is a common structure used for describing a memory buffer. It consists of a scatterlist with memory pages and DMA addresses (sgl entry), as well as the number of scatterlist entries: CPU pages (orig_nents entry) and DMA mapped pages (nents entry). 
It turned out that it was a common mistake to misuse nents and orig_nents entries, calling DMA-mapping functions with a wrong number of entries or ignoring the number of mapped entries returned by the dma_map_sg function. To avoid such issues, let's introduce a common wrappers operating directly on the struct sg_table objects, which take care of the proper use of the nents and orig_nents entries. Signed-off-by: Marek Szyprowski Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/dma-mapping.h | 80 +++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index d6586cf1ecf6..3f57b35e3f8d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -618,6 +618,86 @@ static inline void dma_sync_single_range_for_device(struct device *dev, return dma_sync_single_for_device(dev, addr + offset, size, dir); } +/** + * dma_map_sgtable - Map the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * @attrs: Optional DMA attributes for the map operation + * + * Maps a buffer described by a scatterlist stored in the given sg_table + * object for the @dir DMA operation by the @dev device. After success the + * ownership for the buffer is transferred to the DMA domain. One has to + * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the + * ownership of the buffer back to the CPU domain before touching the + * buffer by the CPU. + * + * Returns 0 on success or -EINVAL on error during mapping the buffer. + */ +static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs) +{ + int nents; + + nents = dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); + if (nents <= 0) + return -EINVAL; + sgt->nents = nents; + return 0; +} + +/** + * dma_unmap_sgtable - Unmap the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * @attrs: Optional DMA attributes for the unmap operation + * + * Unmaps a buffer described by a scatterlist stored in the given sg_table + * object for the @dir DMA operation by the @dev device. After this function + * the ownership of the buffer is transferred back to the CPU domain. + */ +static inline void dma_unmap_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs) +{ + dma_unmap_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); +} + +/** + * dma_sync_sgtable_for_cpu - Synchronize the given buffer for CPU access + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * + * Performs the needed cache synchronization and moves the ownership of the + * buffer back to the CPU domain, so it is safe to perform any access to it + * by the CPU. Before doing any further DMA operations, one has to transfer + * the ownership of the buffer back to the DMA domain by calling the + * dma_sync_sgtable_for_device(). 
+ */ +static inline void dma_sync_sgtable_for_cpu(struct device *dev, + struct sg_table *sgt, enum dma_data_direction dir) +{ + dma_sync_sg_for_cpu(dev, sgt->sgl, sgt->orig_nents, dir); +} + +/** + * dma_sync_sgtable_for_device - Synchronize the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * + * Performs the needed cache synchronization and moves the ownership of the + * buffer back to the DMA domain, so it is safe to perform the DMA operation. + * Once finished, one has to call dma_sync_sgtable_for_cpu() or + * dma_unmap_sgtable(). + */ +static inline void dma_sync_sgtable_for_device(struct device *dev, + struct sg_table *sgt, enum dma_data_direction dir) +{ + dma_sync_sg_for_device(dev, sgt->sgl, sgt->orig_nents, dir); +} + #define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, 0) #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) -- Gitee From 91ac1c3762d9dc68c6a25e1da4787e65f2b59c77 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 13 May 2020 15:32:09 +0200 Subject: [PATCH 0985/2161] scatterlist: add generic wrappers for iterating over sgtable objects commit 709d6d73c756107fb8a292a9f957d630097425fa upstream. struct sg_table is a common structure used for describing a memory buffer. It consists of a scatterlist with memory pages and DMA addresses (sgl entry), as well as the number of scatterlist entries: CPU pages (orig_nents entry) and DMA mapped pages (nents entry). It turned out that it was a common mistake to misuse nents and orig_nents entries, calling the scatterlist iterating functions with a wrong number of the entries. To avoid such issues, lets introduce a common wrappers operating directly on the struct sg_table objects, which take care of the proper use of the nents and orig_nents entries. While touching this, lets clarify some ambiguities in the comments for the existing for_each helpers. Signed-off-by: Marek Szyprowski Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/scatterlist.h | 50 ++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 6eec50fb36c8..4f922afb607a 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -151,6 +151,20 @@ static inline void sg_set_buf(struct scatterlist *sg, const void *buf, #define for_each_sg(sglist, sg, nr, __i) \ for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg)) +/* + * Loop over each sg element in the given sg_table object. + */ +#define for_each_sgtable_sg(sgt, sg, i) \ + for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) + +/* + * Loop over each sg element in the given *DMA mapped* sg_table object. + * Please use sg_dma_address(sg) and sg_dma_len(sg) to extract DMA addresses + * of the each element. 
+ */ +#define for_each_sgtable_dma_sg(sgt, sg, i) \ + for_each_sg(sgt->sgl, sg, sgt->nents, i) + /** * sg_chain - Chain two sglists together * @prv: First scatterlist @@ -401,9 +415,10 @@ sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) * @sglist: sglist to iterate over * @piter: page iterator to hold current page, sg, sg_pgoffset * @nents: maximum number of sg entries to iterate over - * @pgoffset: starting page offset + * @pgoffset: starting page offset (in pages) * * Callers may use sg_page_iter_page() to get each page pointer. + * In each loop it operates on PAGE_SIZE unit. */ #define for_each_sg_page(sglist, piter, nents, pgoffset) \ for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \ @@ -412,18 +427,47 @@ sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) /** * for_each_sg_dma_page - iterate over the pages of the given sg list * @sglist: sglist to iterate over - * @dma_iter: page iterator to hold current page + * @dma_iter: DMA page iterator to hold current page * @dma_nents: maximum number of sg entries to iterate over, this is the value * returned from dma_map_sg - * @pgoffset: starting page offset + * @pgoffset: starting page offset (in pages) * * Callers may use sg_page_iter_dma_address() to get each page's DMA address. + * In each loop it operates on PAGE_SIZE unit. */ #define for_each_sg_dma_page(sglist, dma_iter, dma_nents, pgoffset) \ for (__sg_page_iter_start(&(dma_iter)->base, sglist, dma_nents, \ pgoffset); \ __sg_page_iter_dma_next(dma_iter);) +/** + * for_each_sgtable_page - iterate over all pages in the sg_table object + * @sgt: sg_table object to iterate over + * @piter: page iterator to hold current page + * @pgoffset: starting page offset (in pages) + * + * Iterates over the all memory pages in the buffer described by + * a scatterlist stored in the given sg_table object. + * See also for_each_sg_page(). In each loop it operates on PAGE_SIZE unit. + */ +#define for_each_sgtable_page(sgt, piter, pgoffset) \ + for_each_sg_page(sgt->sgl, piter, sgt->orig_nents, pgoffset) + +/** + * for_each_sgtable_dma_page - iterate over the DMA mapped sg_table object + * @sgt: sg_table object to iterate over + * @dma_iter: DMA page iterator to hold current page + * @pgoffset: starting page offset (in pages) + * + * Iterates over the all DMA mapped pages in the buffer described by + * a scatterlist stored in the given sg_table object. + * See also for_each_sg_dma_page(). In each loop it operates on PAGE_SIZE + * unit. + */ +#define for_each_sgtable_dma_page(sgt, dma_iter, pgoffset) \ + for_each_sg_dma_page(sgt->sgl, dma_iter, sgt->nents, pgoffset) + + /* * Mapping sg iterator * -- Gitee From 02cf8b242f92c9d7a2fbd87c9b24a126d34e4848 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 13 May 2020 15:32:10 +0200 Subject: [PATCH 0986/2161] iommu: add generic helper for mapping sgtable objects commit 48530d9fab0d3bf08827f9167be54acf66d4d457 upstream. struct sg_table is a common structure used for describing a memory buffer. It consists of a scatterlist with memory pages and DMA addresses (sgl entry), as well as the number of scatterlist entries: CPU pages (orig_nents entry) and DMA mapped pages (nents entry). It turned out that it was a common mistake to misuse nents and orig_nents entries, calling mapping functions with a wrong number of entries. To avoid such issues, lets introduce a common wrapper operating directly on the struct sg_table objects, which take care of the proper use of the nents and orig_nents entries. 
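Illustrative usage only (not part of the patch): a driver that has filled an sg_table with the pages of a buffer can map the whole buffer into an IOMMU domain with a single call, instead of passing sgl and orig_nents by hand. The domain, IOVA and sg_table below are assumed to be set up elsewhere; the helper name is hypothetical.

    #include <linux/iommu.h>
    #include <linux/scatterlist.h>

    /* hypothetical helper, error handling trimmed */
    static int example_map_buffer(struct iommu_domain *domain,
                                  unsigned long iova, struct sg_table *sgt)
    {
            size_t mapped;

            /* covers all orig_nents (CPU page) entries of the table */
            mapped = iommu_map_sgtable(domain, iova, sgt,
                                       IOMMU_READ | IOMMU_WRITE);
            if (!mapped)
                    return -ENOMEM;
            return 0;
    }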
Signed-off-by: Marek Szyprowski Acked-by: Joerg Roedel Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/iommu.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 499e22136ecf..513ec3dc0170 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -476,6 +476,22 @@ extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t io extern void iommu_set_fault_handler(struct iommu_domain *domain, iommu_fault_handler_t handler, void *token); +/** + * iommu_map_sgtable - Map the given buffer to the IOMMU domain + * @domain: The IOMMU domain to perform the mapping + * @iova: The start address to map the buffer + * @sgt: The sg_table object describing the buffer + * @prot: IOMMU protection bits + * + * Creates a mapping at @iova for the buffer described by a scatterlist + * stored in the given sg_table object in the provided IOMMU domain. + */ +static inline size_t iommu_map_sgtable(struct iommu_domain *domain, + unsigned long iova, struct sg_table *sgt, int prot) +{ + return iommu_map_sg(domain, iova, sgt->sgl, sgt->orig_nents, prot); +} + extern void iommu_get_resv_regions(struct device *dev, struct list_head *list); extern void iommu_put_resv_regions(struct device *dev, struct list_head *list); extern void generic_iommu_put_resv_regions(struct device *dev, -- Gitee From c510081bc1e00fdd7337a52504c77603d0629f2a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Jun 2020 10:29:49 +0200 Subject: [PATCH 0987/2161] dma-direct: re-enable mmap for !CONFIG_MMU commit 1fbf57d0530217b9f264eedc4867bf91479cdf3c upstream. nommu configfs can trivially map the coherent allocations to user space, as no actual page table setup is required and the kernel and the user space programs share the same address space. 
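For context (illustrative only): the driver-facing path that ends up in dma_direct_mmap() is dma_mmap_coherent(). A hypothetical character-device mmap handler for a buffer obtained earlier from dma_alloc_coherent() might look like the sketch below; struct foo and its fields are assumptions for the example.

    #include <linux/dma-mapping.h>
    #include <linux/fs.h>

    struct foo {
            struct device *dev;
            void *cpu_addr;         /* from dma_alloc_coherent() */
            dma_addr_t dma_addr;
            size_t size;
    };

    static int foo_mmap(struct file *file, struct vm_area_struct *vma)
    {
            struct foo *foo = file->private_data;

            /* with this change the direct-mapping backend can service
             * this on !MMU configurations as well
             */
            return dma_mmap_coherent(foo->dev, vma, foo->cpu_addr,
                                     foo->dma_addr, foo->size);
    }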
Fixes: 62fcee9a3bd7 ("dma-mapping: remove CONFIG_ARCH_NO_COHERENT_DMA_MMAP") Signed-off-by: Christoph Hellwig Reported-by: dillon min Reviewed-by: Vladimir Murzin Tested-by: dillon min Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/Kconfig | 1 + kernel/dma/direct.c | 14 -------------- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index d006668c0027..e0dae570a515 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -71,6 +71,7 @@ config SWIOTLB # in the pagetables # config DMA_NONCOHERENT_MMAP + default y if !MMU bool config DMA_REMAP diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 70f0298aa9d0..8cf29c74c29e 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -457,7 +457,6 @@ int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, return ret; } -#ifdef CONFIG_MMU bool dma_direct_can_mmap(struct device *dev) { return dev_is_dma_coherent(dev) || @@ -483,19 +482,6 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, user_count << PAGE_SHIFT, vma->vm_page_prot); } -#else /* CONFIG_MMU */ -bool dma_direct_can_mmap(struct device *dev) -{ - return false; -} - -int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs) -{ - return -ENXIO; -} -#endif /* CONFIG_MMU */ /* * Because 32-bit DMA masks are so common we expect every architecture to be -- Gitee From de0e14e8d7a8696f79328ba2405c471b5ff5aa20 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Jun 2020 08:52:31 +0200 Subject: [PATCH 0988/2161] dma-direct: mark __dma_direct_alloc_pages static commit 26749b3201ab05e288fbf78fbc8585dfa2da3218 upstream. Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/dma-direct.h | 2 -- kernel/dma/direct.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 1a729c4c5699..5edf1eefed50 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -79,8 +79,6 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs); -struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, - gfp_t gfp, unsigned long attrs); int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8cf29c74c29e..ca0db4f9eabf 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -110,7 +110,7 @@ static inline bool dma_should_free_from_pool(struct device *dev, return false; } -struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, +static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, unsigned long attrs) { size_t alloc_size = PAGE_ALIGN(size); -- Gitee From a68705c1359dd75aa2efba10d9c16a22f3c98dc5 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 11 Jun 2020 12:20:28 -0700 Subject: [PATCH 0989/2161] dma-direct: always align allocation size in dma_direct_alloc_pages() commit 633d5fce78a61e8727674467944939f55b0bcfab upstream. 
dma_alloc_contiguous() does size >> PAGE_SHIFT and set_memory_decrypted() works at page granularity. It's necessary to page align the allocation size in dma_direct_alloc_pages() for consistent behavior. This also fixes an issue when arch_dma_prep_coherent() is called on an unaligned allocation size for dma_alloc_need_uncached() when CONFIG_DMA_DIRECT_REMAP is disabled but CONFIG_ARCH_HAS_DMA_SET_UNCACHED is enabled. Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/direct.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index ca0db4f9eabf..6cdf5b37ae10 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -113,11 +113,12 @@ static inline bool dma_should_free_from_pool(struct device *dev, static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, unsigned long attrs) { - size_t alloc_size = PAGE_ALIGN(size); int node = dev_to_node(dev); struct page *page = NULL; u64 phys_limit; + WARN_ON_ONCE(!PAGE_ALIGNED(size)); + if (attrs & DMA_ATTR_NO_WARN) gfp |= __GFP_NOWARN; @@ -125,14 +126,14 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp &= ~__GFP_ZERO; gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_limit); - page = dma_alloc_contiguous(dev, alloc_size, gfp); + page = dma_alloc_contiguous(dev, size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - dma_free_contiguous(dev, page, alloc_size); + dma_free_contiguous(dev, page, size); page = NULL; } again: if (!page) - page = alloc_pages_node(node, gfp, get_order(alloc_size)); + page = alloc_pages_node(node, gfp, get_order(size)); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, size); page = NULL; @@ -159,8 +160,10 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; + size = PAGE_ALIGN(size); + if (dma_should_alloc_from_pool(dev, gfp, attrs)) { - ret = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, gfp); + ret = dma_alloc_from_pool(dev, size, &page, gfp); if (!ret) return NULL; goto done; @@ -184,10 +187,10 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, dma_alloc_need_uncached(dev, attrs)) || (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) { /* remove any dirty cache lines on the kernel alias */ - arch_dma_prep_coherent(page, PAGE_ALIGN(size)); + arch_dma_prep_coherent(page, size); /* create a coherent mapping */ - ret = dma_common_contiguous_remap(page, PAGE_ALIGN(size), + ret = dma_common_contiguous_remap(page, size, dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); if (!ret) -- Gitee From 25d02426b319c68e1c968e8ce7e1cc98fa5a7cdc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 11 Jun 2020 12:20:29 -0700 Subject: [PATCH 0990/2161] dma-direct: re-encrypt memory if dma_direct_alloc_pages() fails commit 96a539fa3bb71f443ae08e57b9f63d6e5bb2207c upstream. If arch_dma_set_uncached() fails after memory has been decrypted, it needs to be re-encrypted before freeing. 
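Condensed from the hunk below for readability (illustrative only): the point of the fix is the unwind ordering, where any failure after the pages have been decrypted must re-encrypt them before they go back to the allocator.

    /* simplified shape of the error path after this change */
    out_encrypt_pages:
            if (force_dma_unencrypted(dev))
                    set_memory_encrypted((unsigned long)page_address(page),
                                         1 << get_order(size));
    out_free_pages:
            dma_free_contiguous(dev, page, size);
            return NULL;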
Fixes: fa7e2247c572 ("dma-direct: make uncached_kernel_address more general") Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/direct.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 6cdf5b37ae10..4d21465ab266 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -194,7 +194,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); if (!ret) - goto out_free_pages; + goto out_encrypt_pages; memset(ret, 0, size); goto done; } @@ -226,6 +226,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, done: *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return ret; + +out_encrypt_pages: + if (force_dma_unencrypted(dev)) + set_memory_encrypted((unsigned long)page_address(page), + + 1 << get_order(size)); out_free_pages: dma_free_contiguous(dev, page, size); return NULL; -- Gitee From 58defbee559e9497fdcaf0ce8ef9d11f366d5992 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 11 Jun 2020 12:20:30 -0700 Subject: [PATCH 0991/2161] dma-direct: check return value when encrypting or decrypting memory commit 56fccf21d1961a06e2a0c96ce446ebf036651062 upstream. __change_page_attr() can fail which will cause set_memory_encrypted() and set_memory_decrypted() to return non-zero. If the device requires unencrypted DMA memory and decryption fails, simply free the memory and fail. If attempting to re-encrypt in the failure path and that encryption fails, there is no alternative other than to leak the memory. Fixes: c10f07aa27da ("dma/direct: Handle force decryption for DMA coherent buffers in common code") Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/direct.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 4d21465ab266..cd599817b47c 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -159,6 +159,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, { struct page *page; void *ret; + int err; size = PAGE_ALIGN(size); @@ -211,8 +212,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, } ret = page_address(page); - if (force_dma_unencrypted(dev)) - set_memory_decrypted((unsigned long)ret, 1 << get_order(size)); + if (force_dma_unencrypted(dev)) { + err = set_memory_decrypted((unsigned long)ret, + 1 << get_order(size)); + if (err) + goto out_free_pages; + } memset(ret, 0, size); @@ -228,10 +233,13 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, return ret; out_encrypt_pages: - if (force_dma_unencrypted(dev)) - set_memory_encrypted((unsigned long)page_address(page), - - 1 << get_order(size)); + if (force_dma_unencrypted(dev)) { + err = set_memory_encrypted((unsigned long)page_address(page), + 1 << get_order(size)); + /* If memory cannot be re-encrypted, it must be leaked */ + if (err) + return NULL; + } out_free_pages: dma_free_contiguous(dev, page, size); return NULL; -- Gitee From 973112e7734f085f92640ad16ea0240ddb1f3392 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 11 Jun 2020 12:20:32 -0700 Subject: [PATCH 0992/2161] dma-direct: add missing set_memory_decrypted() for coherent mapping commit 1a2b3357e860d890f8045367b179c7e7e802cd71 upstream. 
When a coherent mapping is created in dma_direct_alloc_pages(), it needs to be decrypted if the device requires unencrypted DMA before returning. Fixes: 3acac065508f ("dma-mapping: merge the generic remapping helpers into dma-direct") Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/direct.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index cd599817b47c..af6780fbdf34 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -196,6 +196,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, __builtin_return_address(0)); if (!ret) goto out_encrypt_pages; + if (force_dma_unencrypted(dev)) { + err = set_memory_decrypted((unsigned long)ret, + 1 << get_order(size)); + if (err) + goto out_free_pages; + } memset(ret, 0, size); goto done; } -- Gitee From 3eb6763b0842e40c1652677d1d71186df586a33d Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 18 Mar 2022 20:21:42 +0800 Subject: [PATCH 0993/2161] dma-mapping: DMA_COHERENT_POOL should select GENERIC_ALLOCATOR commit d07ae4c486908615ab336b987c7c367d132fd844 upstream. The dma coherent pool code needs genalloc. Move the select over from DMA_REMAP, which doesn't actually need it. Fixes: dbed452a078d ("dma-pool: decouple DMA_REMAP from DMA_COHERENT_POOL") Reported-by: kernel test robot Signed-off-by: Christoph Hellwig Acked-by: David Rientjes Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index e0dae570a515..03f37179c495 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -76,13 +76,13 @@ config DMA_NONCOHERENT_MMAP config DMA_REMAP depends on MMU - select GENERIC_ALLOCATOR select DMA_NONCOHERENT_MMAP bool config DMA_COHERENT_POOL bool select DMA_REMAP + select GENERIC_ALLOCATOR config DMA_DIRECT_REMAP bool -- Gitee From 45a7e2cb04da6aec652d7a0299a5d2034c6dc85d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Jun 2020 15:03:56 +0200 Subject: [PATCH 0994/2161] dma-mapping: Add a new dma_need_sync API commit 3aa91625007807bfca4155df1867a5c924a08662 upstream. Add a new API to check if calls to dma_sync_single_for_{device,cpu} are required for a given DMA streaming mapping. Signed-off-by: Christoph Hellwig Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200629130359.2690853-2-hch@lst.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/DMA-API.txt | 8 ++++++++ include/linux/dma-direct.h | 1 + include/linux/dma-mapping.h | 5 +++++ kernel/dma/direct.c | 6 ++++++ kernel/dma/mapping.c | 10 ++++++++++ 5 files changed, 30 insertions(+) diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index 2d8d2fed7317..f41620439ef3 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -204,6 +204,14 @@ Returns the maximum size of a mapping for the device. The size parameter of the mapping functions like dma_map_single(), dma_map_page() and others should not be larger than the returned value. +:: + + bool + dma_need_sync(struct device *dev, dma_addr_t dma_addr); + +Returns %true if dma_sync_single_for_{device,cpu} calls are required to +transfer memory ownership. Returns %false if those calls can be skipped. 
+ :: unsigned long diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 5edf1eefed50..99c9e1a17f7b 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -87,4 +87,5 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); int dma_direct_supported(struct device *dev, u64 mask); +bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr); #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 3f57b35e3f8d..13dc9680dd30 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -470,6 +470,7 @@ int dma_set_mask(struct device *dev, u64 mask); int dma_set_coherent_mask(struct device *dev, u64 mask); u64 dma_get_required_mask(struct device *dev); size_t dma_max_mapping_size(struct device *dev); +bool dma_need_sync(struct device *dev, dma_addr_t dma_addr); unsigned long dma_get_merge_boundary(struct device *dev); #else /* CONFIG_HAS_DMA */ static inline dma_addr_t dma_map_page_attrs(struct device *dev, @@ -580,6 +581,10 @@ static inline size_t dma_max_mapping_size(struct device *dev) { return 0; } +static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + return false; +} static inline unsigned long dma_get_merge_boundary(struct device *dev) { return 0; diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index af6780fbdf34..615eca482fc1 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -539,3 +539,9 @@ size_t dma_direct_max_mapping_size(struct device *dev) return swiotlb_max_mapping_size(dev); return SIZE_MAX; } + +bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + return !dev_is_dma_coherent(dev) || + is_swiotlb_buffer(dma_to_phys(dev, dma_addr)); +} diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 98e3d873792e..a8c18c9a796f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -397,6 +397,16 @@ size_t dma_max_mapping_size(struct device *dev) } EXPORT_SYMBOL_GPL(dma_max_mapping_size); +bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_is_direct(ops)) + return dma_direct_need_sync(dev, dma_addr); + return ops->sync_single_for_cpu || ops->sync_single_for_device; +} +EXPORT_SYMBOL_GPL(dma_need_sync); + unsigned long dma_get_merge_boundary(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); -- Gitee From 438e98181ed81ebfac3a07235fa6d1a361247279 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Jul 2020 09:45:11 +0200 Subject: [PATCH 0995/2161] dma-mapping: move the remaining DMA API calls out of line commit d3fa60d7bfdc9a0ff1a524bdef96b3db1fd62022 upstream. For a long time the DMA API has been implemented inline in dma-mapping.h, but the function bodies can be quite large. Move them all out of line. This also removes all the dma_direct_* exports as those are just implementation details and should never be used by drivers directly. 
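For context (illustrative only, not part of the patch): drivers keep calling the generic wrappers, which after this change dispatch out of line; combined with dma_need_sync() from the previous patch, a hot path can cache whether syncs are needed for a streaming mapping and skip them when they are not. dev, addr, buf and len below are assumed to describe an existing DMA_FROM_DEVICE mapping.

    #include <linux/dma-mapping.h>

    /* hypothetical receive-path helper; need_sync was cached at map time
     * as dma_need_sync(dev, addr)
     */
    static void example_rx(struct device *dev, dma_addr_t addr, void *buf,
                           size_t len, bool need_sync)
    {
            if (need_sync)
                    dma_sync_single_for_cpu(dev, addr, len, DMA_FROM_DEVICE);

            /* ... CPU inspects the data the device wrote into buf ... */

            if (need_sync)
                    dma_sync_single_for_device(dev, addr, len,
                                               DMA_FROM_DEVICE);
    }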
Signed-off-by: Christoph Hellwig Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/dma-direct.h | 58 +++++++++ include/linux/dma-mapping.h | 247 ++++-------------------------------- kernel/dma/direct.c | 9 -- kernel/dma/mapping.c | 164 ++++++++++++++++++++++++ 4 files changed, 244 insertions(+), 234 deletions(-) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 99c9e1a17f7b..65ffb4bddac8 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -88,4 +88,62 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long attrs); int dma_direct_supported(struct device *dev, u64 mask); bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr); +dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + unsigned long attrs); +int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, + enum dma_data_direction dir, unsigned long attrs); +dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, + size_t size, enum dma_data_direction dir, unsigned long attrs); + +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ + defined(CONFIG_SWIOTLB) +void dma_direct_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir); +void dma_direct_sync_sg_for_device(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir); +#else +static inline void dma_direct_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ +} +static inline void dma_direct_sync_sg_for_device(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ +} +#endif + +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ + defined(CONFIG_SWIOTLB) +void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs); +void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs); +void dma_direct_sync_single_for_cpu(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir); +void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir); +#else +static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ +} +static inline void dma_direct_unmap_sg(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir, + unsigned long attrs) +{ +} +static inline void dma_direct_sync_single_for_cpu(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ +} +static inline void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ +} +#endif + +size_t dma_direct_max_mapping_size(struct device *dev); + #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 13dc9680dd30..823e8832600e 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -197,73 +197,6 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma, } #endif /* CONFIG_DMA_DECLARE_COHERENT */ -static inline bool 
dma_is_direct(const struct dma_map_ops *ops) -{ - return likely(!ops); -} - -/* - * All the dma_direct_* declarations are here just for the indirect call bypass, - * and must not be used directly drivers! - */ -dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs); -int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, - enum dma_data_direction dir, unsigned long attrs); -dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir, unsigned long attrs); - -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ - defined(CONFIG_SWIOTLB) -void dma_direct_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir); -void dma_direct_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir); -#else -static inline void dma_direct_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ -} -static inline void dma_direct_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir) -{ -} -#endif - -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ - defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ - defined(CONFIG_SWIOTLB) -void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs); -void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, unsigned long attrs); -void dma_direct_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir); -void dma_direct_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir); -#else -static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ -} -static inline void dma_direct_unmap_sg(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir, - unsigned long attrs) -{ -} -static inline void dma_direct_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ -} -static inline void dma_direct_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir) -{ -} -#endif - -size_t dma_direct_max_mapping_size(struct device *dev); - #ifdef CONFIG_HAS_DMA #include @@ -280,164 +213,6 @@ static inline void set_dma_ops(struct device *dev, dev->dma_ops = dma_ops; } -static inline dma_addr_t dma_map_page_attrs(struct device *dev, - struct page *page, size_t offset, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr; - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); - else - addr = ops->map_page(dev, page, offset, size, dir, attrs); - debug_dma_map_page(dev, page, offset, size, dir, addr); - - return addr; -} - -static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - dma_direct_unmap_page(dev, addr, size, dir, attrs); - else if 
(ops->unmap_page) - ops->unmap_page(dev, addr, size, dir, attrs); - debug_dma_unmap_page(dev, addr, size, dir); -} - -/* - * dma_maps_sg_attrs returns 0 on error and > 0 on success. - * It should never return a value < 0. - */ -static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, - unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - int ents; - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); - else - ents = ops->map_sg(dev, sg, nents, dir, attrs); - BUG_ON(ents < 0); - debug_dma_map_sg(dev, sg, nents, ents, dir); - - return ents; -} - -static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, - unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - debug_dma_unmap_sg(dev, sg, nents, dir); - if (dma_is_direct(ops)) - dma_direct_unmap_sg(dev, sg, nents, dir, attrs); - else if (ops->unmap_sg) - ops->unmap_sg(dev, sg, nents, dir, attrs); -} - -static inline dma_addr_t dma_map_resource(struct device *dev, - phys_addr_t phys_addr, - size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr = DMA_MAPPING_ERROR; - - BUG_ON(!valid_dma_direction(dir)); - - /* Don't allow RAM to be mapped */ - if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) - return DMA_MAPPING_ERROR; - - if (dma_is_direct(ops)) - addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs); - else if (ops->map_resource) - addr = ops->map_resource(dev, phys_addr, size, dir, attrs); - - debug_dma_map_resource(dev, phys_addr, size, dir, addr); - return addr; -} - -static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (!dma_is_direct(ops) && ops->unmap_resource) - ops->unmap_resource(dev, addr, size, dir, attrs); - debug_dma_unmap_resource(dev, addr, size, dir); -} - -static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, - size_t size, - enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - dma_direct_sync_single_for_cpu(dev, addr, size, dir); - else if (ops->sync_single_for_cpu) - ops->sync_single_for_cpu(dev, addr, size, dir); - debug_dma_sync_single_for_cpu(dev, addr, size, dir); -} - -static inline void dma_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, - enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - dma_direct_sync_single_for_device(dev, addr, size, dir); - else if (ops->sync_single_for_device) - ops->sync_single_for_device(dev, addr, size, dir); - debug_dma_sync_single_for_device(dev, addr, size, dir); -} - -static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir); - else if (ops->sync_sg_for_cpu) - ops->sync_sg_for_cpu(dev, sg, nelems, dir); - debug_dma_sync_sg_for_cpu(dev, sg, nelems, 
dir); -} - -static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - dma_direct_sync_sg_for_device(dev, sg, nelems, dir); - else if (ops->sync_sg_for_device) - ops->sync_sg_for_device(dev, sg, nelems, dir); - debug_dma_sync_sg_for_device(dev, sg, nelems, dir); - -} static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { @@ -448,6 +223,28 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) return 0; } +dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs); +void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs); +int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, unsigned long attrs); +void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + unsigned long attrs); +dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, + size_t size, enum dma_data_direction dir, unsigned long attrs); +void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs); +void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir); +void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir); +void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir); +void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir); void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs); void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 615eca482fc1..8c8d17b0db99 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -313,7 +313,6 @@ void dma_direct_sync_single_for_device(struct device *dev, if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(paddr, size, dir); } -EXPORT_SYMBOL(dma_direct_sync_single_for_device); void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) @@ -333,7 +332,6 @@ void dma_direct_sync_sg_for_device(struct device *dev, dir); } } -EXPORT_SYMBOL(dma_direct_sync_sg_for_device); #endif #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ @@ -352,7 +350,6 @@ void dma_direct_sync_single_for_cpu(struct device *dev, if (unlikely(is_swiotlb_buffer(paddr))) swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); } -EXPORT_SYMBOL(dma_direct_sync_single_for_cpu); void dma_direct_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) @@ -374,7 +371,6 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu_all(); } -EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu); void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -387,7 +383,6 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, if (unlikely(is_swiotlb_buffer(phys))) 
swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); } -EXPORT_SYMBOL(dma_direct_unmap_page); void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) @@ -399,7 +394,6 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir, attrs); } -EXPORT_SYMBOL(dma_direct_unmap_sg); #endif dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, @@ -426,7 +420,6 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, arch_sync_dma_for_device(phys, size, dir); return dma_addr; } -EXPORT_SYMBOL(dma_direct_map_page); int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) @@ -448,7 +441,6 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return 0; } -EXPORT_SYMBOL(dma_direct_map_sg); dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -465,7 +457,6 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, return dma_addr; } -EXPORT_SYMBOL(dma_direct_map_resource); int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index a8c18c9a796f..b53953024512 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -105,6 +105,170 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, } EXPORT_SYMBOL(dmam_alloc_attrs); +static inline bool dma_is_direct(const struct dma_map_ops *ops) +{ + return likely(!ops); +} + +dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr; + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); + else + addr = ops->map_page(dev, page, offset, size, dir, attrs); + debug_dma_map_page(dev, page, offset, size, dir, addr); + + return addr; +} +EXPORT_SYMBOL(dma_map_page_attrs); + +void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + dma_direct_unmap_page(dev, addr, size, dir, attrs); + else if (ops->unmap_page) + ops->unmap_page(dev, addr, size, dir, attrs); + debug_dma_unmap_page(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_unmap_page_attrs); + +/* + * dma_maps_sg_attrs returns 0 on error and > 0 on success. + * It should never return a value < 0. 
+ */ +int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + int ents; + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); + else + ents = ops->map_sg(dev, sg, nents, dir, attrs); + BUG_ON(ents < 0); + debug_dma_map_sg(dev, sg, nents, ents, dir); + + return ents; +} +EXPORT_SYMBOL(dma_map_sg_attrs); + +void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + debug_dma_unmap_sg(dev, sg, nents, dir); + if (dma_is_direct(ops)) + dma_direct_unmap_sg(dev, sg, nents, dir, attrs); + else if (ops->unmap_sg) + ops->unmap_sg(dev, sg, nents, dir, attrs); +} +EXPORT_SYMBOL(dma_unmap_sg_attrs); + +dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr = DMA_MAPPING_ERROR; + + BUG_ON(!valid_dma_direction(dir)); + + /* Don't allow RAM to be mapped */ + if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) + return DMA_MAPPING_ERROR; + + if (dma_is_direct(ops)) + addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs); + else if (ops->map_resource) + addr = ops->map_resource(dev, phys_addr, size, dir, attrs); + + debug_dma_map_resource(dev, phys_addr, size, dir, addr); + return addr; +} +EXPORT_SYMBOL(dma_map_resource); + +void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (!dma_is_direct(ops) && ops->unmap_resource) + ops->unmap_resource(dev, addr, size, dir, attrs); + debug_dma_unmap_resource(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_unmap_resource); + +void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + dma_direct_sync_single_for_cpu(dev, addr, size, dir); + else if (ops->sync_single_for_cpu) + ops->sync_single_for_cpu(dev, addr, size, dir); + debug_dma_sync_single_for_cpu(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_sync_single_for_cpu); + +void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + dma_direct_sync_single_for_device(dev, addr, size, dir); + else if (ops->sync_single_for_device) + ops->sync_single_for_device(dev, addr, size, dir); + debug_dma_sync_single_for_device(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_sync_single_for_device); + +void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir); + else if (ops->sync_sg_for_cpu) + ops->sync_sg_for_cpu(dev, sg, nelems, dir); + debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); +} +EXPORT_SYMBOL(dma_sync_sg_for_cpu); + +void dma_sync_sg_for_device(struct device *dev, struct 
scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + dma_direct_sync_sg_for_device(dev, sg, nelems, dir); + else if (ops->sync_sg_for_device) + ops->sync_sg_for_device(dev, sg, nelems, dir); + debug_dma_sync_sg_for_device(dev, sg, nelems, dir); +} +EXPORT_SYMBOL(dma_sync_sg_for_device); + /* * Create scatter-list for the already allocated DMA buffer. */ -- Gitee From d3f21688951d0a307e89a02e310ea82a120c67eb Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Wed, 13 Apr 2022 08:32:22 +0200 Subject: [PATCH 0996/2161] dma-direct: avoid redundant memory sync for swiotlb commit 9e02977bfad006af328add9434c8bffa40e053bb upstream. When we looked into FIO performance with swiotlb enabled in VM, we found swiotlb_bounce() is always called one more time than expected for each DMA read request. It turns out that the bounce buffer is copied to original DMA buffer twice after the completion of a DMA request (one is done by in dma_direct_sync_single_for_cpu(), the other by swiotlb_tbl_unmap_single()). But the content in bounce buffer actually doesn't change between the two rounds of copy. So, one round of copy is redundant. Pass DMA_ATTR_SKIP_CPU_SYNC flag to swiotlb_tbl_unmap_single() to skip the memory copy in it. This fix increases FIO 64KB sequential read throughput in a guest with swiotlb=force by 5.6%. Fixes: 55897af63091 ("dma-direct: merge swiotlb_dma_ops into the dma_direct code") Reported-by: Wang Zhaoyang1 Reported-by: Gao Liang Signed-off-by: Chao Gao Reviewed-by: Kevin Tian Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- kernel/dma/direct.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8c8d17b0db99..6752992db594 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -381,7 +381,8 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, dma_direct_sync_single_for_cpu(dev, addr, size, dir); if (unlikely(is_swiotlb_buffer(phys))) - swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); + swiotlb_tbl_unmap_single(dev, phys, size, size, dir, + attrs | DMA_ATTR_SKIP_CPU_SYNC); } void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, -- Gitee From 20fcf021c11e501cde6f5d863cef510a57cb9467 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Jul 2020 09:47:08 +0200 Subject: [PATCH 0997/2161] dma-mapping: inline the fast path dma-direct calls commit b4174173005972f8f6497883d08d87e0aba1b604 upstream. Inline the single page map/unmap/sync dma-direct calls into the now out of line generic wrappers. This restores the behavior of a single function call that we had before moving the generic calls out of line. Besides the dma-mapping callers there are just a few callers in IOMMU drivers that have a bypass mode, and more of those are going to be switched to the generic bypass soon. 
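To make the resulting call structure concrete, this is roughly the shape of an out-of-line wrapper once the dma-direct fast path is inlined back into it (simplified sketch; the real code is the dma_map_page_attrs() in kernel/dma/mapping.c shown above, which additionally does direction checks and DMA debug accounting):

/* Sketch: one exported, out-of-line entry point; the direct-mapping body
 * is a static inline pulled in from <linux/dma-direct.h>, so the common
 * case remains a single function call from the driver's point of view.
 */
dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
		size_t offset, size_t size, enum dma_data_direction dir,
		unsigned long attrs)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);

	if (likely(!ops))	/* direct mapping: inlined fast path */
		return dma_direct_map_page(dev, page, offset, size, dir, attrs);
	return ops->map_page(dev, page, offset, size, dir, attrs);
}
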
Signed-off-by: Christoph Hellwig Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/dma-direct.h | 92 ++++++++++++++++++++++++++++---------- kernel/dma/direct.c | 66 --------------------------- 2 files changed, 69 insertions(+), 89 deletions(-) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 65ffb4bddac8..19965fa55b95 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -1,10 +1,16 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Internals of the DMA direct mapping implementation. Only for use by the + * DMA mapping code and IOMMU drivers. + */ #ifndef _LINUX_DMA_DIRECT_H #define _LINUX_DMA_DIRECT_H 1 #include +#include #include /* for min_low_pfn */ #include +#include #ifdef CONFIG_ARCH_HAS_PHYS_TO_DMA #include @@ -88,25 +94,17 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long attrs); int dma_direct_supported(struct device *dev, u64 mask); bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr); -dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs); int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs); dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, size_t size, enum dma_data_direction dir, unsigned long attrs); +size_t dma_direct_max_mapping_size(struct device *dev); #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_SWIOTLB) -void dma_direct_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir); -void dma_direct_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir); +void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir); #else -static inline void dma_direct_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ -} static inline void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { @@ -116,34 +114,82 @@ static inline void dma_direct_sync_sg_for_device(struct device *dev, #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ defined(CONFIG_SWIOTLB) -void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs); -void dma_direct_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir); void dma_direct_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir); #else -static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ -} static inline void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { } +static inline void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ +} +#endif + +static inline void dma_direct_sync_single_for_device(struct device *dev, + 
dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + phys_addr_t paddr = dma_to_phys(dev, addr); + + if (unlikely(is_swiotlb_buffer(paddr))) + swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); + + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_for_device(paddr, size, dir); +} + static inline void dma_direct_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { + phys_addr_t paddr = dma_to_phys(dev, addr); + + if (!dev_is_dma_coherent(dev)) { + arch_sync_dma_for_cpu(paddr, size, dir); + arch_sync_dma_for_cpu_all(); + } + + if (unlikely(is_swiotlb_buffer(paddr))) + swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); } -static inline void dma_direct_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir) + +static inline dma_addr_t dma_direct_map_page(struct device *dev, + struct page *page, unsigned long offset, size_t size, + enum dma_data_direction dir, unsigned long attrs) { + phys_addr_t phys = page_to_phys(page) + offset; + dma_addr_t dma_addr = phys_to_dma(dev, phys); + + if (unlikely(swiotlb_force == SWIOTLB_FORCE)) + return swiotlb_map(dev, phys, size, dir, attrs); + + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (swiotlb_force != SWIOTLB_NO_FORCE) + return swiotlb_map(dev, phys, size, dir, attrs); + + dev_WARN_ONCE(dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; + } + + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(phys, size, dir); + return dma_addr; } -#endif -size_t dma_direct_max_mapping_size(struct device *dev); +static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + phys_addr_t phys = dma_to_phys(dev, addr); + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_direct_sync_single_for_cpu(dev, addr, size, dir); + + if (unlikely(is_swiotlb_buffer(phys))) + swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); +} #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 6752992db594..7f08bd9e0d0f 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -10,11 +10,9 @@ #include #include #include -#include #include #include #include -#include /* * Most architectures use ZONE_DMA for the first 16 Megabytes, but @@ -302,18 +300,6 @@ void dma_direct_free(struct device *dev, size_t size, #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_SWIOTLB) -void dma_direct_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - phys_addr_t paddr = dma_to_phys(dev, addr); - - if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); - - if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_device(paddr, size, dir); -} - void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { @@ -337,20 +323,6 @@ void dma_direct_sync_sg_for_device(struct device *dev, #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ defined(CONFIG_SWIOTLB) -void dma_direct_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - phys_addr_t paddr = dma_to_phys(dev, addr); - - if (!dev_is_dma_coherent(dev)) { - 
arch_sync_dma_for_cpu(paddr, size, dir); - arch_sync_dma_for_cpu_all(); - } - - if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); -} - void dma_direct_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { @@ -372,19 +344,6 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, arch_sync_dma_for_cpu_all(); } -void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - phys_addr_t phys = dma_to_phys(dev, addr); - - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_direct_sync_single_for_cpu(dev, addr, size, dir); - - if (unlikely(is_swiotlb_buffer(phys))) - swiotlb_tbl_unmap_single(dev, phys, size, size, dir, - attrs | DMA_ATTR_SKIP_CPU_SYNC); -} - void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { @@ -397,31 +356,6 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, } #endif -dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - phys_addr_t phys = page_to_phys(page) + offset; - dma_addr_t dma_addr = phys_to_dma(dev, phys); - - if (unlikely(swiotlb_force == SWIOTLB_FORCE)) - return swiotlb_map(dev, phys, size, dir, attrs); - - if (unlikely(!dma_capable(dev, dma_addr, size, true))) { - if (swiotlb_force != SWIOTLB_NO_FORCE) - return swiotlb_map(dev, phys, size, dir, attrs); - - dev_WARN_ONCE(dev, 1, - "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - return DMA_MAPPING_ERROR; - } - - if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(phys, size, dir); - return dma_addr; -} - int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { -- Gitee From b8f84e33652e06d9cdc940a05493331e4c1eee53 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Mar 2020 18:19:30 +0100 Subject: [PATCH 0998/2161] dma-mapping: add a dma_ops_bypass flag to struct device commit d35834c64820c7ef397f8a244061d4450720540e upstream. Several IOMMU drivers have a bypass mode where they can use a direct mapping if the devices DMA mask is large enough. Add generic support to the core dma-mapping code to do that to switch those drivers to a common solution. Signed-off-by: Christoph Hellwig Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/device.h | 8 +++++ kernel/dma/Kconfig | 8 +++++ kernel/dma/mapping.c | 74 +++++++++++++++++++++++++++++------------- 3 files changed, 68 insertions(+), 22 deletions(-) diff --git a/include/linux/device.h b/include/linux/device.h index 23cac07ac1c1..9264517f5095 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1218,6 +1218,11 @@ struct dev_links_info { * device. * @dma_coherent: this particular device is dma coherent, even if the * architecture supports non-coherent devices. + * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the + * streaming DMA operations (->map_* / ->unmap_* / ->sync_*), + * and optionall (if the coherent mask is large enough) also + * for dma allocations. This flag is managed by the dma ops + * instance from ->dma_supported. 
* * At the lowest level, every device in a Linux system is represented by an * instance of struct device. The device structure contains the information @@ -1316,6 +1321,9 @@ struct device { defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) bool dma_coherent:1; #endif +#ifdef CONFIG_DMA_OPS_BYPASS + bool dma_ops_bypass : 1; +#endif }; static inline struct device *kobj_to_dev(struct kobject *kobj) diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 03f37179c495..4cfd623129fc 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -5,6 +5,14 @@ config HAS_DMA depends on !NO_DMA default y +# +# IOMMU drivers that can bypass the IOMMU code and optionally use the direct +# mapping fast path should select this option and set the dma_ops_bypass +# flag in struct device where applicable +# +config DMA_OPS_BYPASS + bool + config NEED_SG_DMA_LENGTH bool diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index b53953024512..0d129421e75f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -105,9 +105,35 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, } EXPORT_SYMBOL(dmam_alloc_attrs); -static inline bool dma_is_direct(const struct dma_map_ops *ops) +static bool dma_go_direct(struct device *dev, dma_addr_t mask, + const struct dma_map_ops *ops) { - return likely(!ops); + if (likely(!ops)) + return true; +#ifdef CONFIG_DMA_OPS_BYPASS + if (dev->dma_ops_bypass) + return min_not_zero(mask, dev->bus_dma_limit) >= + dma_direct_get_required_mask(dev); +#endif + return false; +} + + +/* + * Check if the devices uses a direct mapping for streaming DMA operations. + * This allows IOMMU drivers to set a bypass mode if the DMA mask is large + * enough. + */ +static inline bool dma_alloc_direct(struct device *dev, + const struct dma_map_ops *ops) +{ + return dma_go_direct(dev, dev->coherent_dma_mask, ops); +} + +static inline bool dma_map_direct(struct device *dev, + const struct dma_map_ops *ops) +{ + return dma_go_direct(dev, *dev->dma_mask, ops); } dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, @@ -118,7 +144,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else addr = ops->map_page(dev, page, offset, size, dir, attrs); @@ -134,7 +160,7 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) dma_direct_unmap_page(dev, addr, size, dir, attrs); else if (ops->unmap_page) ops->unmap_page(dev, addr, size, dir, attrs); @@ -153,7 +179,7 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, int ents; BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); else ents = ops->map_sg(dev, sg, nents, dir, attrs); @@ -172,7 +198,7 @@ void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, BUG_ON(!valid_dma_direction(dir)); debug_dma_unmap_sg(dev, sg, nents, dir); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) dma_direct_unmap_sg(dev, sg, nents, dir, attrs); else if (ops->unmap_sg) ops->unmap_sg(dev, sg, nents, dir, attrs); @@ -191,7 +217,7 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, if 
(WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) return DMA_MAPPING_ERROR; - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs); else if (ops->map_resource) addr = ops->map_resource(dev, phys_addr, size, dir, attrs); @@ -207,7 +233,7 @@ void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (!dma_is_direct(ops) && ops->unmap_resource) + if (!dma_map_direct(dev, ops) && ops->unmap_resource) ops->unmap_resource(dev, addr, size, dir, attrs); debug_dma_unmap_resource(dev, addr, size, dir); } @@ -219,7 +245,7 @@ void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) dma_direct_sync_single_for_cpu(dev, addr, size, dir); else if (ops->sync_single_for_cpu) ops->sync_single_for_cpu(dev, addr, size, dir); @@ -233,7 +259,7 @@ void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) dma_direct_sync_single_for_device(dev, addr, size, dir); else if (ops->sync_single_for_device) ops->sync_single_for_device(dev, addr, size, dir); @@ -247,7 +273,7 @@ void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir); else if (ops->sync_sg_for_cpu) ops->sync_sg_for_cpu(dev, sg, nelems, dir); @@ -261,7 +287,7 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) dma_direct_sync_sg_for_device(dev, sg, nelems, dir); else if (ops->sync_sg_for_device) ops->sync_sg_for_device(dev, sg, nelems, dir); @@ -302,7 +328,7 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); if (!ops->get_sgtable) @@ -372,7 +398,7 @@ bool dma_can_mmap(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_can_mmap(dev); return ops->mmap != NULL; } @@ -397,7 +423,7 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); if (!ops->mmap) @@ -410,7 +436,7 @@ u64 dma_get_required_mask(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_get_required_mask(dev); if (ops->get_required_mask) return ops->get_required_mask(dev); @@ -441,7 +467,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) cpu_addr = 
dma_direct_alloc(dev, size, dma_handle, flag, attrs); else if (ops->alloc) cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); @@ -473,7 +499,7 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, return; debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) dma_direct_free(dev, size, cpu_addr, dma_handle, attrs); else if (ops->free) ops->free(dev, size, cpu_addr, dma_handle, attrs); @@ -484,7 +510,11 @@ int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + /* + * ->dma_supported sets the bypass flag, so we must always call + * into the method here unless the device is truly direct mapped. + */ + if (!ops) return dma_direct_supported(dev, mask); if (!ops->dma_supported) return 1; @@ -540,7 +570,7 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size, BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) arch_dma_cache_sync(dev, vaddr, size, dir); else if (ops->cache_sync) ops->cache_sync(dev, vaddr, size, dir); @@ -552,7 +582,7 @@ size_t dma_max_mapping_size(struct device *dev) const struct dma_map_ops *ops = get_dma_ops(dev); size_t size = SIZE_MAX; - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) size = dma_direct_max_mapping_size(dev); else if (ops && ops->max_mapping_size) size = ops->max_mapping_size(dev); @@ -565,7 +595,7 @@ bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) return dma_direct_need_sync(dev, dma_addr); return ops->sync_single_for_cpu || ops->sync_single_for_device; } -- Gitee From 6314d41c9c647f4018d106b7b43322e816daa3ce Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 8 Jun 2020 15:22:17 +0200 Subject: [PATCH 0999/2161] dma-pool: fix too large DMA pools on medium memory size systems commit 3ee06a6d532f75f20528ff4d2c473cda36c484fe upstream. On systems with at least 32 MiB, but less than 32 GiB of RAM, the DMA memory pools are much larger than intended (e.g. 2 MiB instead of 128 KiB on a 256 MiB system). Fix this by correcting the calculation of the number of GiBs of RAM in the system. Invert the order of the min/max operations, to keep on calculating in pages until the last step, which aids readability. Fixes: 1d659236fb43c4d2 ("dma-pool: scale the default DMA coherent pool size with memory capacity") Signed-off-by: Geert Uytterhoeven Acked-by: David Rientjes Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/pool.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 35bb51c31fff..8cfa01243ed2 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -175,10 +175,9 @@ static int __init dma_atomic_pool_init(void) * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1. 
*/ if (!atomic_pool_size) { - atomic_pool_size = max(totalram_pages() >> PAGE_SHIFT, 1UL) * - SZ_128K; - atomic_pool_size = min_t(size_t, atomic_pool_size, - 1 << (PAGE_SHIFT + MAX_ORDER-1)); + unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K); + pages = min_t(unsigned long, pages, MAX_ORDER_NR_PAGES); + atomic_pool_size = max_t(size_t, pages << PAGE_SHIFT, SZ_128K); } INIT_WORK(&atomic_pool_work, atomic_pool_work_fn); -- Gitee From 2237d772926deb25a8d56c05007ba492daac768d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Sun, 21 Jun 2020 13:43:02 -0700 Subject: [PATCH 1000/2161] dma-mapping: warn when coherent pool is depleted commit 71cdec4fab76667dabdbb2ca232b039004ebd40f upstream. When a DMA coherent pool is depleted, allocation failures may or may not get reported in the kernel log depending on the allocator. The admin does have a workaround, however, by using coherent_pool= on the kernel command line. Provide some guidance on the failure and a recommended minimum size for the pools (double the size). Signed-off-by: David Rientjes Tested-by: Guenter Roeck Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/pool.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 8cfa01243ed2..39ca26fa41b5 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -239,12 +239,16 @@ void *dma_alloc_from_pool(struct device *dev, size_t size, } val = gen_pool_alloc(pool, size); - if (val) { + if (likely(val)) { phys_addr_t phys = gen_pool_virt_to_phys(pool, val); *ret_page = pfn_to_page(__phys_to_pfn(phys)); ptr = (void *)val; memset(ptr, 0, size); + } else { + WARN_ONCE(1, "DMA coherent pool depleted, increase size " + "(recommended min coherent_pool=%zuK)\n", + gen_pool_size(pool) >> 9); } if (gen_pool_avail(pool) < atomic_pool_size) schedule_work(&atomic_pool_work); -- Gitee From 1393d5969925e35812ed85e94869c4f44926cc9b Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 14 Jul 2020 14:39:26 +0200 Subject: [PATCH 1001/2161] dma-pool: get rid of dma_in_atomic_pool() commit 23e469be6239d9cf3d921fc3e38545491df56534 upstream. The function is only used once and can be simplified to a one-liner. 
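Referring back to the pool-sizing fix above, a quick worked example shows why the old formula misbehaved on a 256 MiB system with 4 KiB pages (stand-alone userspace sketch with hard-coded constants, for illustration only; the MAX_ORDER clamp is omitted):

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12;			/* 4 KiB pages */
	unsigned long sz_128k = 128UL << 10;
	unsigned long sz_1g = 1UL << 30;
	unsigned long totalram_pages = (256UL << 20) >> page_shift; /* 65536 */

	/* Old: shifts a page count by PAGE_SHIFT, which is not "GiB of RAM". */
	unsigned long old_size = (totalram_pages >> page_shift) * sz_128k;

	/* New: 128 KiB per GiB, computed in pages, with a 128 KiB floor. */
	unsigned long pages = totalram_pages / (sz_1g / sz_128k);
	unsigned long new_size = pages << page_shift;

	if (new_size < sz_128k)
		new_size = sz_128k;

	printf("old = %lu KiB, new = %lu KiB\n", old_size >> 10, new_size >> 10);
	/* Prints: old = 2048 KiB, new = 128 KiB */
	return 0;
}
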
Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/pool.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 39ca26fa41b5..318035e093fb 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -217,15 +217,6 @@ static inline struct gen_pool *dev_to_pool(struct device *dev) return atomic_pool_kernel; } -static bool dma_in_atomic_pool(struct device *dev, void *start, size_t size) -{ - struct gen_pool *pool = dev_to_pool(dev); - - if (unlikely(!pool)) - return false; - return gen_pool_has_addr(pool, (unsigned long)start, size); -} - void *dma_alloc_from_pool(struct device *dev, size_t size, struct page **ret_page, gfp_t flags) { @@ -260,7 +251,7 @@ bool dma_free_from_pool(struct device *dev, void *start, size_t size) { struct gen_pool *pool = dev_to_pool(dev); - if (!dma_in_atomic_pool(dev, start, size)) + if (!pool || !gen_pool_has_addr(pool, (unsigned long)start, size)) return false; gen_pool_free(pool, (unsigned long)start, size); return true; -- Gitee From 2de9e0a2ec02c83bb501488b63d7234d36ef837a Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 14 Jul 2020 14:39:27 +0200 Subject: [PATCH 1002/2161] dma-pool: introduce dma_guess_pool() commit 48b6703858dd5526c82d8ff2dbac59acab3a9dda upstream. dma-pool's dev_to_pool() creates the false impression that there is a way to grantee a mapping between a device's DMA constraints and an atomic pool. It tuns out it's just a guess, and the device might need to use an atomic pool containing memory from a 'safer' (or lower) memory zone. To help mitigate this, introduce dma_guess_pool() which can be fed a device's DMA constraints and atomic pools already known to be faulty, in order for it to provide an better guess on which pool to use. Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/pool.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 318035e093fb..5b9eaa2b498d 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -203,7 +203,7 @@ static int __init dma_atomic_pool_init(void) } postcore_initcall(dma_atomic_pool_init); -static inline struct gen_pool *dev_to_pool(struct device *dev) +static inline struct gen_pool *dma_guess_pool_from_device(struct device *dev) { u64 phys_mask; gfp_t gfp; @@ -217,10 +217,30 @@ static inline struct gen_pool *dev_to_pool(struct device *dev) return atomic_pool_kernel; } +static inline struct gen_pool *dma_get_safer_pool(struct gen_pool *bad_pool) +{ + if (bad_pool == atomic_pool_kernel) + return atomic_pool_dma32 ? 
: atomic_pool_dma; + + if (bad_pool == atomic_pool_dma32) + return atomic_pool_dma; + + return NULL; +} + +static inline struct gen_pool *dma_guess_pool(struct device *dev, + struct gen_pool *bad_pool) +{ + if (bad_pool) + return dma_get_safer_pool(bad_pool); + + return dma_guess_pool_from_device(dev); +} + void *dma_alloc_from_pool(struct device *dev, size_t size, struct page **ret_page, gfp_t flags) { - struct gen_pool *pool = dev_to_pool(dev); + struct gen_pool *pool = dma_guess_pool(dev, NULL); unsigned long val; void *ptr = NULL; @@ -249,7 +269,7 @@ void *dma_alloc_from_pool(struct device *dev, size_t size, bool dma_free_from_pool(struct device *dev, void *start, size_t size) { - struct gen_pool *pool = dev_to_pool(dev); + struct gen_pool *pool = dma_guess_pool(dev, NULL); if (!pool || !gen_pool_has_addr(pool, (unsigned long)start, size)) return false; -- Gitee From 120a094b36a55289ced3b3d108a9152e1bc87819 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 14 Jul 2020 14:39:28 +0200 Subject: [PATCH 1003/2161] dma-pool: make sure atomic pool suits device commit 81e9d894e03f9a279102c7aac62ea7cbf9949f4b upstream. When allocating DMA memory from a pool, the core can only guess which atomic pool will fit a device's constraints. If it doesn't, get a safer atomic pool and try again. Fixes: c84dc6e68a1d ("dma-pool: add additional coherent pools to map to gfp mask") Reported-by: Jeremy Linton Suggested-by: Robin Murphy Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/pool.c | 57 ++++++++++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 5b9eaa2b498d..d48d9acb585f 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -240,39 +240,56 @@ static inline struct gen_pool *dma_guess_pool(struct device *dev, void *dma_alloc_from_pool(struct device *dev, size_t size, struct page **ret_page, gfp_t flags) { - struct gen_pool *pool = dma_guess_pool(dev, NULL); - unsigned long val; + struct gen_pool *pool = NULL; + unsigned long val = 0; void *ptr = NULL; - - if (!pool) { - WARN(1, "%pGg atomic pool not initialised!\n", &flags); - return NULL; + phys_addr_t phys; + + while (1) { + pool = dma_guess_pool(dev, pool); + if (!pool) { + WARN(1, "Failed to get suitable pool for %s\n", + dev_name(dev)); + break; + } + + val = gen_pool_alloc(pool, size); + if (!val) + continue; + + phys = gen_pool_virt_to_phys(pool, val); + if (dma_coherent_ok(dev, phys, size)) + break; + + gen_pool_free(pool, val, size); + val = 0; } - val = gen_pool_alloc(pool, size); - if (likely(val)) { - phys_addr_t phys = gen_pool_virt_to_phys(pool, val); + if (val) { *ret_page = pfn_to_page(__phys_to_pfn(phys)); ptr = (void *)val; memset(ptr, 0, size); - } else { - WARN_ONCE(1, "DMA coherent pool depleted, increase size " - "(recommended min coherent_pool=%zuK)\n", - gen_pool_size(pool) >> 9); + + if (gen_pool_avail(pool) < atomic_pool_size) + schedule_work(&atomic_pool_work); } - if (gen_pool_avail(pool) < atomic_pool_size) - schedule_work(&atomic_pool_work); return ptr; } bool dma_free_from_pool(struct device *dev, void *start, size_t size) { - struct gen_pool *pool = dma_guess_pool(dev, NULL); + struct gen_pool *pool = NULL; - if (!pool || !gen_pool_has_addr(pool, (unsigned long)start, size)) - return false; - gen_pool_free(pool, (unsigned long)start, size); - return true; + while (1) { + pool = dma_guess_pool(dev, pool); + if (!pool) + 
return false; + + if (gen_pool_has_addr(pool, (unsigned long)start, size)) { + gen_pool_free(pool, (unsigned long)start, size); + return true; + } + } } -- Gitee From 5ef674c748353de7c7be7afe73c2715dfd2f6e57 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Wed, 8 Jul 2020 18:49:39 +0200 Subject: [PATCH 1004/2161] dma-pool: do not allocate pool memory from CMA commit d9765e41d8e9ea2251bf73735a2895c8bad546fc upstream. There is no guarantee to CMA's placement, so allocating a zone specific atomic pool from CMA might return memory from a completely different memory zone. So stop using it. Fixes: c84dc6e68a1d ("dma-pool: add additional coherent pools to map to gfp mask") Reported-by: Jeremy Linton Signed-off-by: Nicolas Saenz Julienne Tested-by: Jeremy Linton Acked-by: David Rientjes Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/pool.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index d48d9acb585f..6bc74a2d5127 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -69,12 +68,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, do { pool_size = 1 << (PAGE_SHIFT + order); - - if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, 1 << order, - order, false); - else - page = alloc_pages(gfp, order); + page = alloc_pages(gfp, order); } while (!page && order-- > 0); if (!page) goto out; @@ -118,8 +112,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, dma_common_free_remap(addr, pool_size); #endif free_page: __maybe_unused - if (!dma_release_from_contiguous(NULL, page, 1 << order)) - __free_pages(page, order); + __free_pages(page, order); out: return ret; } -- Gitee From 44bd8c392e86eb1f8ba933469306a6f8a542aefb Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Sat, 19 Mar 2022 12:18:33 +0800 Subject: [PATCH 1005/2161] dma-pool: fix coherent pool allocations for IOMMU mappings commit 9420139f516d7fbc248ce17f35275cb005ed98ea upstream. When allocating coherent pool memory for an IOMMU mapping we don't care about the DMA mask. Move the guess for the initial GFP mask into the dma_direct_alloc_pages and pass dma_coherent_ok as a function pointer argument so that it doesn't get applied to the IOMMU case. 
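As a rough illustration of the intended calling convention (a hedged sketch only; the exact prototypes are the ones this patch adds to include/linux/dma-mapping.h, assumed here to take a size, a dma_addr_t handle, a DMA direction and a gfp mask), a driver could replace a DMA_ATTR_NON_CONSISTENT allocation with something like:

#include <linux/dma-mapping.h>

/* Hypothetical consumer; names and error handling are illustrative only. */
static void *alloc_desc_ring(struct device *dev, size_t size,
			     struct page **pagep, dma_addr_t *dma)
{
	struct page *page;

	page = dma_alloc_pages(dev, size, dma, DMA_BIDIRECTIONAL, GFP_KERNEL);
	if (!page)
		return NULL;
	*pagep = page;
	/*
	 * The memory is DMA addressable but not necessarily coherent, so the
	 * caller brackets device accesses with dma_sync_single_for_device() /
	 * dma_sync_single_for_cpu(), much as with the old non-consistent attr.
	 */
	return page_address(page);
}

static void free_desc_ring(struct device *dev, size_t size, struct page *page,
			   dma_addr_t dma)
{
	dma_free_pages(dev, size, page, dma, DMA_BIDIRECTIONAL);
}
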
Signed-off-by: Christoph Hellwig Tested-by: Amit Pundir Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 4 +- include/linux/dma-direct.h | 2 - include/linux/dma-mapping.h | 5 +- kernel/dma/direct.c | 11 +++- kernel/dma/pool.c | 114 +++++++++++++++--------------------- 5 files changed, 61 insertions(+), 75 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 3c3a395ecbc4..aee2eccdc171 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1020,8 +1020,8 @@ static void *iommu_dma_alloc(struct device *dev, size_t size, if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !gfpflags_allow_blocking(gfp) && !coherent) - cpu_addr = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, - gfp); + page = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &cpu_addr, + gfp, NULL); else cpu_addr = iommu_dma_alloc_pages(dev, size, &page, gfp, attrs); if (!cpu_addr) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 19965fa55b95..3571fbc2b6af 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -75,8 +75,6 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size, } u64 dma_direct_get_required_mask(struct device *dev); -gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, - u64 *phys_mask); void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 823e8832600e..c2249246a095 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -521,8 +521,9 @@ void *dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot, const void *caller); void dma_common_free_remap(void *cpu_addr, size_t size); -void *dma_alloc_from_pool(struct device *dev, size_t size, - struct page **ret_page, gfp_t flags); +struct page *dma_alloc_from_pool(struct device *dev, size_t size, + void **cpu_addr, gfp_t flags, + bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t)); bool dma_free_from_pool(struct device *dev, void *start, size_t size); int diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 7f08bd9e0d0f..bd4062daed1b 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -44,7 +44,7 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } -gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, +static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, u64 *phys_limit) { u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); @@ -162,8 +162,13 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, size = PAGE_ALIGN(size); if (dma_should_alloc_from_pool(dev, gfp, attrs)) { - ret = dma_alloc_from_pool(dev, size, &page, gfp); - if (!ret) + u64 phys_mask; + + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); + page = dma_alloc_from_pool(dev, size, &ret, gfp, + dma_coherent_ok); + if (!page) return NULL; goto done; } diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 6bc74a2d5127..5d071d4a3cba 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -196,93 +196,75 @@ static int __init dma_atomic_pool_init(void) } postcore_initcall(dma_atomic_pool_init); -static inline struct gen_pool *dma_guess_pool_from_device(struct device *dev) +static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp) 
{ - u64 phys_mask; - gfp_t gfp; - - gfp = dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_mask); - if (IS_ENABLED(CONFIG_ZONE_DMA) && gfp == GFP_DMA) + if (prev == NULL) { + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) + return atomic_pool_dma32; + if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) + return atomic_pool_dma; + return atomic_pool_kernel; + } + if (prev == atomic_pool_kernel) + return atomic_pool_dma32 ? atomic_pool_dma32 : atomic_pool_dma; + if (prev == atomic_pool_dma32) return atomic_pool_dma; - if (IS_ENABLED(CONFIG_ZONE_DMA32) && gfp == GFP_DMA32) - return atomic_pool_dma32; - return atomic_pool_kernel; + return NULL; } -static inline struct gen_pool *dma_get_safer_pool(struct gen_pool *bad_pool) +static struct page *__dma_alloc_from_pool(struct device *dev, size_t size, + struct gen_pool *pool, void **cpu_addr, + bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t)) { - if (bad_pool == atomic_pool_kernel) - return atomic_pool_dma32 ? : atomic_pool_dma; + unsigned long addr; + phys_addr_t phys; - if (bad_pool == atomic_pool_dma32) - return atomic_pool_dma; + addr = gen_pool_alloc(pool, size); + if (!addr) + return NULL; - return NULL; -} + phys = gen_pool_virt_to_phys(pool, addr); + if (phys_addr_ok && !phys_addr_ok(dev, phys, size)) { + gen_pool_free(pool, addr, size); + return NULL; + } -static inline struct gen_pool *dma_guess_pool(struct device *dev, - struct gen_pool *bad_pool) -{ - if (bad_pool) - return dma_get_safer_pool(bad_pool); + if (gen_pool_avail(pool) < atomic_pool_size) + schedule_work(&atomic_pool_work); - return dma_guess_pool_from_device(dev); + *cpu_addr = (void *)addr; + memset(*cpu_addr, 0, size); + return pfn_to_page(__phys_to_pfn(phys)); } -void *dma_alloc_from_pool(struct device *dev, size_t size, - struct page **ret_page, gfp_t flags) +struct page *dma_alloc_from_pool(struct device *dev, size_t size, + void **cpu_addr, gfp_t gfp, + bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t)) { struct gen_pool *pool = NULL; - unsigned long val = 0; - void *ptr = NULL; - phys_addr_t phys; - - while (1) { - pool = dma_guess_pool(dev, pool); - if (!pool) { - WARN(1, "Failed to get suitable pool for %s\n", - dev_name(dev)); - break; - } - - val = gen_pool_alloc(pool, size); - if (!val) - continue; - - phys = gen_pool_virt_to_phys(pool, val); - if (dma_coherent_ok(dev, phys, size)) - break; - - gen_pool_free(pool, val, size); - val = 0; - } - - - if (val) { - *ret_page = pfn_to_page(__phys_to_pfn(phys)); - ptr = (void *)val; - memset(ptr, 0, size); + struct page *page; - if (gen_pool_avail(pool) < atomic_pool_size) - schedule_work(&atomic_pool_work); + while ((pool = dma_guess_pool(pool, gfp))) { + page = __dma_alloc_from_pool(dev, size, pool, cpu_addr, + phys_addr_ok); + if (page) + return page; } - return ptr; + WARN(1, "Failed to get suitable pool for %s\n", dev_name(dev)); + return NULL; } bool dma_free_from_pool(struct device *dev, void *start, size_t size) { struct gen_pool *pool = NULL; - while (1) { - pool = dma_guess_pool(dev, pool); - if (!pool) - return false; - - if (gen_pool_has_addr(pool, (unsigned long)start, size)) { - gen_pool_free(pool, (unsigned long)start, size); - return true; - } + while ((pool = dma_guess_pool(pool, 0))) { + if (!gen_pool_has_addr(pool, (unsigned long)start, size)) + continue; + gen_pool_free(pool, (unsigned long)start, size); + return true; } + + return false; } -- Gitee From 29ee28b1189ab1eb5768b90442888c734cb11033 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: 
Fri, 14 Aug 2020 12:26:23 +0200 Subject: [PATCH 1006/2161] dma-pool: Only allocate from CMA when in same memory zone commit d7e673ec2c8e0ea39c4c70fc490d67d7fbda869d upstream. There is no guarantee to CMA's placement, so allocating a zone specific atomic pool from CMA might return memory from a completely different memory zone. To get around this double check CMA's placement before allocating from it. Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/pool.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 5d071d4a3cba..06582b488e31 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -3,7 +3,9 @@ * Copyright (C) 2012 ARM Ltd. * Copyright (C) 2020 Google LLC */ +#include #include +#include #include #include #include @@ -55,6 +57,29 @@ static void dma_atomic_pool_size_add(gfp_t gfp, size_t size) pool_size_kernel += size; } +static bool cma_in_zone(gfp_t gfp) +{ + unsigned long size; + phys_addr_t end; + struct cma *cma; + + cma = dev_get_cma_area(NULL); + if (!cma) + return false; + + size = cma_get_size(cma); + if (!size) + return false; + + /* CMA can't cross zone boundaries, see cma_activate_area() */ + end = cma_get_base(cma) + size - 1; + if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) + return end <= DMA_BIT_MASK(zone_dma_bits); + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) + return end <= DMA_BIT_MASK(32); + return true; +} + static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, gfp_t gfp) { @@ -68,7 +93,11 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, do { pool_size = 1 << (PAGE_SHIFT + order); - page = alloc_pages(gfp, order); + if (cma_in_zone(gfp)) + page = dma_alloc_from_contiguous(NULL, 1 << order, + order, false); + if (!page) + page = alloc_pages(gfp, order); } while (!page && order-- > 0); if (!page) goto out; -- Gitee From f3f34f4dc6752bd8e85ea50e8facde3e8c6c7e2e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 26 Aug 2020 14:33:30 +0300 Subject: [PATCH 1007/2161] dma-pool: Fix an uninitialized variable bug in atomic_pool_expand() commit 892fc9f6835ecf075efac20789b012c5c9997fcc upstream. The "page" pointer can be used with out being initialized. Fixes: d7e673ec2c8e ("dma-pool: Only allocate from CMA when in same memory zone") Signed-off-by: Dan Carpenter Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/pool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 06582b488e31..1281c0f0442b 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -84,7 +84,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, gfp_t gfp) { unsigned int order; - struct page *page; + struct page *page = NULL; void *addr; int ret = -ENOMEM; -- Gitee From 7bf38ec8bc7226fa384ae9f39ed8e1e2465b7b1a Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 24 Nov 2021 17:33:33 +0800 Subject: [PATCH 1008/2161] dma-mapping: add a new dma_alloc_pages API commit efa70f2fdc842e63a0a13223e0e83cedcc2117f1 upstream. This API is the equivalent of alloc_pages, except that the returned memory is guaranteed to be DMA addressable by the passed in device. The implementation will also be used to provide a more sensible replacement for DMA_ATTR_NON_CONSISTENT flag. 
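As an illustration only (not part of the patch), a minimal caller of the new interface might look as follows; example_alloc(), "dev" and "len" are placeholders, and only the dma_alloc_pages()/dma_free_pages() prototypes are taken from this patch:

	static int example_alloc(struct device *dev, size_t len)
	{
		struct page *page;
		dma_addr_t dma_handle;

		/* returned memory is guaranteed to be addressable by "dev" */
		page = dma_alloc_pages(dev, len, &dma_handle, DMA_TO_DEVICE,
				       GFP_KERNEL);
		if (!page)
			return -ENOMEM;

		/* CPU side uses page_address(page), the device uses dma_handle */

		dma_free_pages(dev, len, page, dma_handle, DMA_TO_DEVICE);
		return 0;
	}
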
Additionally dma_alloc_noncoherent is switched over to use dma_alloc_pages as its backend. Signed-off-by: Christoph Hellwig Acked-by: Thomas Bogendoerfer (MIPS part) Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/DMA-attributes.txt | 8 ---- arch/arm/mm/dma-mapping-nommu.c | 2 + arch/arm/mm/dma-mapping.c | 4 ++ arch/x86/kernel/amd_gart_64.c | 2 + drivers/iommu/dma-iommu.c | 2 + drivers/parisc/ccio-dma.c | 2 + drivers/parisc/sba_iommu.c | 2 + include/linux/dma-direct.h | 5 ++ include/linux/dma-mapping.h | 20 +++++--- include/linux/dma-noncoherent.h | 3 -- kernel/dma/direct.c | 52 ++++++++++++++++++++- kernel/dma/mapping.c | 79 ++++++++++++++++++++++++++++++-- kernel/dma/virt.c | 2 + 13 files changed, 162 insertions(+), 21 deletions(-) diff --git a/Documentation/DMA-attributes.txt b/Documentation/DMA-attributes.txt index 8f8d97f65d73..6ec8de3fde83 100644 --- a/Documentation/DMA-attributes.txt +++ b/Documentation/DMA-attributes.txt @@ -43,14 +43,6 @@ Since it is optional for platforms to implement DMA_ATTR_WRITE_COMBINE, those that do not will simply ignore the attribute and exhibit default behavior. -DMA_ATTR_NON_CONSISTENT ------------------------ - -DMA_ATTR_NON_CONSISTENT lets the platform to choose to return either -consistent or non-consistent memory as it sees fit. By using this API, -you are guaranteeing to the platform that you have all the correct and -necessary sync points for this memory in the driver. - DMA_ATTR_NO_KERNEL_MAPPING -------------------------- diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c index 287ef898a55e..43c6d66b6e73 100644 --- a/arch/arm/mm/dma-mapping-nommu.c +++ b/arch/arm/mm/dma-mapping-nommu.c @@ -176,6 +176,8 @@ static void arm_nommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist const struct dma_map_ops arm_nommu_dma_ops = { .alloc = arm_nommu_dma_alloc, .free = arm_nommu_dma_free, + .alloc_pages = dma_direct_alloc_pages, + .free_pages = dma_direct_free_pages, .mmap = arm_nommu_dma_mmap, .map_page = arm_nommu_dma_map_page, .unmap_page = arm_nommu_dma_unmap_page, diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index e7d039804e3e..b182a59a3a4e 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -182,6 +182,8 @@ static void arm_dma_sync_single_for_device(struct device *dev, const struct dma_map_ops arm_dma_ops = { .alloc = arm_dma_alloc, .free = arm_dma_free, + .alloc_pages = dma_direct_alloc_pages, + .free_pages = dma_direct_free_pages, .mmap = arm_dma_mmap, .get_sgtable = arm_dma_get_sgtable, .map_page = arm_dma_map_page, @@ -209,6 +211,8 @@ static int arm_coherent_dma_mmap(struct device *dev, struct vm_area_struct *vma, const struct dma_map_ops arm_coherent_dma_ops = { .alloc = arm_coherent_dma_alloc, .free = arm_coherent_dma_free, + .alloc_pages = dma_direct_alloc_pages, + .free_pages = dma_direct_free_pages, .mmap = arm_coherent_dma_mmap, .get_sgtable = arm_dma_get_sgtable, .map_page = arm_coherent_dma_map_page, diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 5cfab41e8509..0ab8b47df055 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -681,6 +681,8 @@ static const struct dma_map_ops gart_dma_ops = { .get_sgtable = dma_common_get_sgtable, .dma_supported = dma_direct_supported, .get_required_mask = dma_direct_get_required_mask, + .alloc_pages = dma_direct_alloc_pages, + .free_pages = dma_direct_free_pages, }; static void gart_iommu_shutdown(void) diff --git a/drivers/iommu/dma-iommu.c 
b/drivers/iommu/dma-iommu.c index aee2eccdc171..d3a91e469e7c 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1104,6 +1104,8 @@ static unsigned long iommu_dma_get_merge_boundary(struct device *dev) static const struct dma_map_ops iommu_dma_ops = { .alloc = iommu_dma_alloc, .free = iommu_dma_free, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, .mmap = iommu_dma_mmap, .get_sgtable = iommu_dma_get_sgtable, .map_page = iommu_dma_map_page, diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index ad290f79983b..d1eaa0dc89a3 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -1025,6 +1025,8 @@ static const struct dma_map_ops ccio_ops = { .map_sg = ccio_map_sg, .unmap_sg = ccio_unmap_sg, .get_sgtable = dma_common_get_sgtable, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, }; #ifdef CONFIG_PROC_FS diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index e410033b6df0..9a7c0b81f998 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -1077,6 +1077,8 @@ static const struct dma_map_ops sba_ops = { .map_sg = sba_map_sg, .unmap_sg = sba_unmap_sg, .get_sgtable = dma_common_get_sgtable, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, }; diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 3571fbc2b6af..3e8f832d5f78 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -79,6 +79,11 @@ void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs); +struct page *dma_direct_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); +void dma_direct_free_pages(struct device *dev, size_t size, + struct page *page, dma_addr_t dma_addr, + enum dma_data_direction dir); void *dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index c2249246a095..b231ea32648f 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -31,11 +31,6 @@ * buffered to improve performance. */ #define DMA_ATTR_WRITE_COMBINE (1UL << 2) -/* - * DMA_ATTR_NON_CONSISTENT: Lets the platform to choose to return either - * consistent or non-consistent memory as it sees fit. - */ -#define DMA_ATTR_NON_CONSISTENT (1UL << 3) /* * DMA_ATTR_NO_KERNEL_MAPPING: Lets the platform to avoid creating a kernel * virtual mapping for the allocated buffer. 
@@ -84,6 +79,11 @@ struct dma_map_ops { void (*free)(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs); + struct page *(*alloc_pages)(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, + gfp_t gfp); + void (*free_pages)(struct device *dev, size_t size, struct page *vaddr, + dma_addr_t dma_handle, enum dma_data_direction dir); int (*mmap)(struct device *, struct vm_area_struct *, void *, dma_addr_t, size_t, unsigned long attrs); @@ -388,6 +388,11 @@ static inline unsigned long dma_get_merge_boundary(struct device *dev) } #endif /* CONFIG_HAS_DMA */ +struct page *dma_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); +void dma_free_pages(struct device *dev, size_t size, struct page *page, + dma_addr_t dma_handle, enum dma_data_direction dir); + static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs) { @@ -512,7 +517,10 @@ static inline void dma_sync_sgtable_for_device(struct device *dev, extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); - +struct page *dma_common_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); +void dma_common_free_pages(struct device *dev, size_t size, struct page *vaddr, + dma_addr_t dma_handle, enum dma_data_direction dir); struct page **dma_common_find_pages(void *cpu_addr); void *dma_common_contiguous_remap(struct page *page, size_t size, pgprot_t prot, const void *caller); diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h index b59f1b6be3e9..431cbba6cf35 100644 --- a/include/linux/dma-noncoherent.h +++ b/include/linux/dma-noncoherent.h @@ -31,9 +31,6 @@ static __always_inline bool dma_alloc_need_uncached(struct device *dev, return false; if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) return false; - if (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) && - (attrs & DMA_ATTR_NON_CONSISTENT)) - return false; return true; } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index bd4062daed1b..31bdca5b2aa7 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (C) 2018 Christoph Hellwig. + * Copyright (C) 2018-2020 Christoph Hellwig. * * DMA operations that map physical memory directly without using an IOMMU. 
*/ @@ -303,6 +303,56 @@ void dma_direct_free(struct device *dev, size_t size, dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); } +struct page *dma_direct_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + struct page *page; + void *ret; + + if (dma_should_alloc_from_pool(dev, gfp, 0)) { + page = dma_alloc_from_pool(dev, size, &ret, gfp, + dma_coherent_ok); + if (!page) + return NULL; + goto done; + } + + page = __dma_direct_alloc_pages(dev, size, gfp); + if (!page) + return NULL; + ret = page_address(page); + if (force_dma_unencrypted(dev)) { + if (set_memory_decrypted((unsigned long)ret, + 1 << get_order(size))) + goto out_free_pages; + } + memset(ret, 0, size); +done: + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); + return page; +out_free_pages: + dma_free_contiguous(dev, page, size); + return NULL; +} + +void dma_direct_free_pages(struct device *dev, size_t size, + struct page *page, dma_addr_t dma_addr, + enum dma_data_direction dir) +{ + unsigned int page_order = get_order(size); + void *vaddr = page_address(page); + + /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ + if (dma_should_free_from_pool(dev, 0) && + dma_free_from_pool(dev, vaddr, size)) + return; + + if (force_dma_unencrypted(dev)) + set_memory_encrypted((unsigned long)vaddr, 1 << page_order); + + dma_free_contiguous(dev, page, size); +} + #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_SWIOTLB) void dma_direct_sync_sg_for_device(struct device *dev, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 0d129421e75f..fa63e1da86f5 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -8,6 +8,7 @@ #include /* for max_pfn */ #include #include +#include #include #include #include @@ -346,9 +347,7 @@ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) { if (force_dma_unencrypted(dev)) prot = pgprot_decrypted(prot); - if (dev_is_dma_coherent(dev) || - (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) && - (attrs & DMA_ATTR_NON_CONSISTENT))) + if (dev_is_dma_coherent(dev)) return prot; #ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE if (attrs & DMA_ATTR_WRITE_COMBINE) @@ -387,6 +386,40 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, #endif /* CONFIG_MMU */ } +struct page *dma_common_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + struct page *page; + + page = dma_alloc_contiguous(dev, size, gfp); + if (!page) + page = alloc_pages_node(dev_to_node(dev), gfp, get_order(size)); + if (!page) + return NULL; + + *dma_handle = ops->map_page(dev, page, 0, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + if (*dma_handle == DMA_MAPPING_ERROR) { + dma_free_contiguous(dev, page, size); + return NULL; + } + + memset(page_address(page), 0, size); + return page; +} + +void dma_common_free_pages(struct device *dev, size_t size, struct page *page, + dma_addr_t dma_handle, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops->unmap_page) + ops->unmap_page(dev, dma_handle, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + dma_free_contiguous(dev, page, size); +} + /** * dma_can_mmap - check if a given device supports dma_mmap_* * @dev: device to check @@ -506,6 +539,46 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, } EXPORT_SYMBOL(dma_free_attrs); +struct page *dma_alloc_pages(struct device 
*dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + struct page *page; + + if (WARN_ON_ONCE(!dev->coherent_dma_mask)) + return NULL; + if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM))) + return NULL; + + size = PAGE_ALIGN(size); + if (dma_alloc_direct(dev, ops)) + page = dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); + else if (ops->alloc_pages) + page = ops->alloc_pages(dev, size, dma_handle, dir, gfp); + else + return NULL; + + debug_dma_map_page(dev, page, 0, size, dir, *dma_handle); + + return page; +} +EXPORT_SYMBOL_GPL(dma_alloc_pages); + +void dma_free_pages(struct device *dev, size_t size, struct page *page, + dma_addr_t dma_handle, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + size = PAGE_ALIGN(size); + debug_dma_unmap_page(dev, dma_handle, size, dir); + + if (dma_alloc_direct(dev, ops)) + dma_direct_free_pages(dev, size, page, dma_handle, dir); + else if (ops->free_pages) + ops->free_pages(dev, size, page, dma_handle, dir); +} +EXPORT_SYMBOL_GPL(dma_free_pages); + int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c index ebe128833af7..6986bf1fd668 100644 --- a/kernel/dma/virt.c +++ b/kernel/dma/virt.c @@ -55,5 +55,7 @@ const struct dma_map_ops dma_virt_ops = { .free = dma_virt_free, .map_page = dma_virt_map_page, .map_sg = dma_virt_map_sg, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, }; EXPORT_SYMBOL(dma_virt_ops); -- Gitee From f0aac9fa939a2a57ae101d5b03abea00a276077b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Aug 2020 17:06:40 +0200 Subject: [PATCH 1009/2161] dma-direct: remove dma_direct_{alloc,free}_pages commit 2f5388a29be82a62529d146499e70987e856f6f7 upstream. Just merge these helpers into the main dma_direct_{alloc,free} routines, as the additional checks are always false for the two callers. 
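 As a rough call-path sketch (not literal code): besides dma_direct_alloc()/dma_direct_free() themselves, the only other caller of the old helpers is the x86 GART driver, for which the arch_dma_alloc() fallback condition evaluates to false, so folding the helpers in changes nothing in practice.

	/* before */
	gart_alloc_coherent() -> dma_direct_alloc_pages()
	dma_direct_alloc()    -> arch fallback check -> dma_direct_alloc_pages()

	/* after */
	gart_alloc_coherent() -> dma_direct_alloc()
	dma_direct_alloc()    -> arch fallback check folded into its body
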
Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/amd_gart_64.c | 6 +++--- include/linux/dma-direct.h | 4 ---- kernel/dma/direct.c | 39 ++++++++++++++--------------------- kernel/dma/pool.c | 2 +- 4 files changed, 19 insertions(+), 32 deletions(-) diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 0ab8b47df055..631af911dea6 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -469,7 +469,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, { void *vaddr; - vaddr = dma_direct_alloc_pages(dev, size, dma_addr, flag, attrs); + vaddr = dma_direct_alloc(dev, size, dma_addr, flag, attrs); if (!vaddr || !force_iommu || dev->coherent_dma_mask <= DMA_BIT_MASK(24)) return vaddr; @@ -481,7 +481,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, goto out_free; return vaddr; out_free: - dma_direct_free_pages(dev, size, vaddr, *dma_addr, attrs); + dma_direct_free(dev, size, vaddr, *dma_addr, attrs); return NULL; } @@ -491,7 +491,7 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr, unsigned long attrs) { gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0); - dma_direct_free_pages(dev, size, vaddr, dma_addr, attrs); + dma_direct_free(dev, size, vaddr, dma_addr, attrs); } static int no_agp; diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 3e8f832d5f78..a6fac4aac61f 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -84,10 +84,6 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, void dma_direct_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_addr, enum dma_data_direction dir); -void *dma_direct_alloc_pages(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); -void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_addr, unsigned long attrs); int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 31bdca5b2aa7..3da1d646cf91 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -152,13 +152,18 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, return page; } -void *dma_direct_alloc_pages(struct device *dev, size_t size, +void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { struct page *page; void *ret; int err; + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_alloc_need_uncached(dev, attrs)) + return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); + size = PAGE_ALIGN(size); if (dma_should_alloc_from_pool(dev, gfp, attrs)) { @@ -254,11 +259,18 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, return NULL; } -void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_addr, unsigned long attrs) +void dma_direct_free(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { unsigned int page_order = get_order(size); + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_alloc_need_uncached(dev, attrs)) { + arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); + return; + } + /* 
If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ if (dma_should_free_from_pool(dev, attrs) && dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) @@ -282,27 +294,6 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size); } -void *dma_direct_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) -{ - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs)) - return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); - return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); -} - -void dma_direct_free(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) -{ - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs)) - arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); - else - dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); -} - struct page *dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 1281c0f0442b..fe11643ff9cc 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -115,7 +115,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, #endif /* * Memory in the atomic DMA pools must be unencrypted, the pools do not - * shrink so no re-encryption occurs in dma_direct_free_pages(). + * shrink so no re-encryption occurs in dma_direct_free(). */ ret = set_memory_decrypted((unsigned long)page_to_virt(page), 1 << order); -- Gitee From 45bef2328a12defd509891a35da266b6a4b33017 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Aug 2020 17:14:28 +0200 Subject: [PATCH 1010/2161] dma-direct: lift gfp_t manipulation out of__dma_direct_alloc_pages commit 3773dfe6ea4d220dcccb03827fca94ec3d39bc17 upstream. Move the detailed gfp_t setup from __dma_direct_alloc_pages into the caller to clean things up a little. 
Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/direct.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 3da1d646cf91..d1ba3023cc42 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -109,7 +109,7 @@ static inline bool dma_should_free_from_pool(struct device *dev, } static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, - gfp_t gfp, unsigned long attrs) + gfp_t gfp) { int node = dev_to_node(dev); struct page *page = NULL; @@ -117,11 +117,6 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, WARN_ON_ONCE(!PAGE_ALIGNED(size)); - if (attrs & DMA_ATTR_NO_WARN) - gfp |= __GFP_NOWARN; - - /* we always manually zero the memory once we are done: */ - gfp &= ~__GFP_ZERO; gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_limit); page = dma_alloc_contiguous(dev, size, gfp); @@ -165,6 +160,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); size = PAGE_ALIGN(size); + if (attrs & DMA_ATTR_NO_WARN) + gfp |= __GFP_NOWARN; if (dma_should_alloc_from_pool(dev, gfp, attrs)) { u64 phys_mask; @@ -178,7 +175,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, goto done; } - page = __dma_direct_alloc_pages(dev, size, gfp, attrs); + /* we always manually zero the memory once we are done */ + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); if (!page) return NULL; -- Gitee From b8b4646cbf19c0f3dba1ac5459821ac4a4a97340 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 2 Sep 2020 10:31:03 -0700 Subject: [PATCH 1011/2161] swiotlb: Use %pa to print phys_addr_t variables commit 4db7b6aacc573b89227a1b5960af9d2a17f17762 upstream. There is an extension to a %p to print phys_addr_t type of variables. Use it here. Signed-off-by: Andy Shevchenko Reviewed-by: Fabio Estevam Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/swiotlb.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index bd86c09c9f1e..760b21f86e16 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -172,9 +172,7 @@ void swiotlb_print_info(void) return; } - pr_info("mapped [mem %#010llx-%#010llx] (%luMB)\n", - (unsigned long long)io_tlb_start, - (unsigned long long)io_tlb_end, + pr_info("mapped [mem %pa-%pa] (%luMB)\n", &io_tlb_start, &io_tlb_end, bytes >> 20); } -- Gitee From 489902e336178ecb0f1ea1e9ecf4035559b694c2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 2 Sep 2020 20:31:05 +0300 Subject: [PATCH 1012/2161] swiotlb: Mark max_segment with static keyword commit b51e627158cb0f67569aa46b5a08b568d20a81a4 upstream. Sparse is not happy about max_segment declaration: CHECK kernel/dma/swiotlb.c kernel/dma/swiotlb.c:96:14: warning: symbol 'max_segment' was not declared. Should it be static? Mark it static as suggested. 
Signed-off-by: Andy Shevchenko Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/swiotlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 760b21f86e16..b2f9527fada5 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -93,7 +93,7 @@ static unsigned int io_tlb_index; * Max segment that we can provide which (if pages are contingous) will * not be bounced (unless SWIOTLB_FORCE is set). */ -unsigned int max_segment; +static unsigned int max_segment; /* * We need to save away the original address corresponding to a mapped entry -- Gitee From 0bb422fe4f46b59f6c5fadc37d218a98c53be2ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 18:11:10 +0100 Subject: [PATCH 1013/2161] dma-direct: relax addressability checks in dma_direct_supported commit 91ef26f914171cf753330f13724fd9142b5b1640 upstream. dma_direct_supported tries to find the minimum addressable bitmask based on the end pfn and optional magic that architectures can use to communicate the size of the magic ZONE_DMA that can be used for bounce buffering. But between the DMA offsets that can change per device (or sometimes even region), the fact the ZONE_DMA isn't even guaranteed to be the lowest addresses and failure of having proper interfaces to the MM code this fails at least for one arm subarchitecture. As all the legacy DMA implementations have supported 32-bit DMA masks, and 32-bit masks are guranteed to always work by the API contract (using bounce buffers if needed), we can short cut the complicated check and always return true without breaking existing assumptions. Hopefully we can properly clean up the interaction with the arch defined zones and the bootmem allocator eventually. Fixes: ad3c7b18c5b3 ("arm: use swiotlb for bounce buffering on LPAE configs") Reported-by: Peter Ujfalusi Signed-off-by: Christoph Hellwig Tested-by: Peter Ujfalusi Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/direct.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index d1ba3023cc42..1c7d9720fdcb 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -476,28 +476,26 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, user_count << PAGE_SHIFT, vma->vm_page_prot); } -/* - * Because 32-bit DMA masks are so common we expect every architecture to be - * able to satisfy them - either by not supporting more physical memory, or by - * providing a ZONE_DMA32. If neither is the case, the architecture needs to - * use an IOMMU instead of the direct mapping. - */ int dma_direct_supported(struct device *dev, u64 mask) { - u64 min_mask; - - if (IS_ENABLED(CONFIG_ZONE_DMA)) - min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS); - else - min_mask = DMA_BIT_MASK(32); + u64 min_mask = (max_pfn - 1) << PAGE_SHIFT; - min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT); + /* + * Because 32-bit DMA masks are so common we expect every architecture + * to be able to satisfy them - either by not supporting more physical + * memory, or by providing a ZONE_DMA32. If neither is the case, the + * architecture needs to use an IOMMU instead of the direct mapping. + */ + if (mask >= DMA_BIT_MASK(32)) + return 1; /* * This check needs to be against the actual bit mask value, so use * phys_to_dma_unencrypted() here so that the SME encryption mask isn't * part of the check. 
*/ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + min_mask = min_t(u64, min_mask, DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)); return mask >= phys_to_dma_unencrypted(dev, min_mask); } -- Gitee From 121389766a8aa8d50edb097c53aede24089352a8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Sep 2020 17:56:22 +0200 Subject: [PATCH 1014/2161] dma-direct: remove __dma_to_phys commit 7bc5c428a660d4d1bc95ba54bf4cb6bccf8c3029 upstream. There is no harm in just always clearing the SME encryption bit, while significantly simplifying the interface. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/arm/include/asm/dma-direct.h | 2 +- arch/mips/bmips/dma.c | 2 +- arch/mips/cavium-octeon/dma-octeon.c | 2 +- arch/mips/include/asm/dma-direct.h | 2 +- arch/mips/pci/pci-ar2315.c | 2 +- arch/mips/sgi-ip32/ip32-dma.c | 2 +- include/linux/dma-direct.h | 12 ++++-------- kernel/dma/direct.c | 6 +----- 8 files changed, 11 insertions(+), 19 deletions(-) diff --git a/arch/arm/include/asm/dma-direct.h b/arch/arm/include/asm/dma-direct.h index b939a31f7a14..bca0de567534 100644 --- a/arch/arm/include/asm/dma-direct.h +++ b/arch/arm/include/asm/dma-direct.h @@ -8,7 +8,7 @@ static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) return pfn_to_dma(dev, __phys_to_pfn(paddr)) + offset; } -static inline phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dev_addr) +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dev_addr) { unsigned int offset = dev_addr & ~PAGE_MASK; return __pfn_to_phys(dma_to_pfn(dev, dev_addr)) + offset; diff --git a/arch/mips/bmips/dma.c b/arch/mips/bmips/dma.c index df56bf4179e3..ba2a5d33dfd3 100644 --- a/arch/mips/bmips/dma.c +++ b/arch/mips/bmips/dma.c @@ -52,7 +52,7 @@ dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t pa) return pa; } -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dma_addr) +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { struct bmips_dma_range *r; diff --git a/arch/mips/cavium-octeon/dma-octeon.c b/arch/mips/cavium-octeon/dma-octeon.c index 14ea680d180e..388b13ba2558 100644 --- a/arch/mips/cavium-octeon/dma-octeon.c +++ b/arch/mips/cavium-octeon/dma-octeon.c @@ -177,7 +177,7 @@ dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) return paddr; } -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr) +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) { #ifdef CONFIG_PCI if (dev && dev_is_pci(dev)) diff --git a/arch/mips/include/asm/dma-direct.h b/arch/mips/include/asm/dma-direct.h index 14e352651ce9..8e178651c638 100644 --- a/arch/mips/include/asm/dma-direct.h +++ b/arch/mips/include/asm/dma-direct.h @@ -3,6 +3,6 @@ #define _MIPS_DMA_DIRECT_H 1 dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr); -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr); +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr); #endif /* _MIPS_DMA_DIRECT_H */ diff --git a/arch/mips/pci/pci-ar2315.c b/arch/mips/pci/pci-ar2315.c index 0fed6fc17fe4..89b908ff4991 100644 --- a/arch/mips/pci/pci-ar2315.c +++ b/arch/mips/pci/pci-ar2315.c @@ -175,7 +175,7 @@ dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) return paddr + ar2315_dev_offset(dev); } -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dma_addr) +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { return dma_addr - ar2315_dev_offset(dev); } diff --git a/arch/mips/sgi-ip32/ip32-dma.c 
b/arch/mips/sgi-ip32/ip32-dma.c index fa7b17cb5385..160317294d97 100644 --- a/arch/mips/sgi-ip32/ip32-dma.c +++ b/arch/mips/sgi-ip32/ip32-dma.c @@ -27,7 +27,7 @@ dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) return dma_addr; } -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dma_addr) +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { phys_addr_t paddr = dma_addr & RAM_OFFSET_MASK; diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index a6fac4aac61f..77b264d20ba2 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -37,11 +37,12 @@ static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) return __sme_set(phys_to_dma_unencrypted(dev, paddr)); } -static inline phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dev_addr) +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dev_addr) { - phys_addr_t paddr = (phys_addr_t)dev_addr; + phys_addr_t paddr = (phys_addr_t)dev_addr + + ((phys_addr_t)dev->dma_pfn_offset << PAGE_SHIFT); - return paddr + ((phys_addr_t)dev->dma_pfn_offset << PAGE_SHIFT); + return __sme_clr(paddr); } #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */ @@ -54,11 +55,6 @@ static inline bool force_dma_unencrypted(struct device *dev) } #endif /* CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED */ -static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) -{ - return __sme_clr(__dma_to_phys(dev, daddr)); -} - static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size, bool is_ram) { diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 1c7d9720fdcb..c1198dd1b949 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -49,11 +49,6 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, { u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); - if (force_dma_unencrypted(dev)) - *phys_limit = __dma_to_phys(dev, dma_limit); - else - *phys_limit = dma_to_phys(dev, dma_limit); - /* * Optimistically try the zone that the physical address mask falls * into first. If that returns memory that isn't actually addressable @@ -62,6 +57,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding * zones. */ + *phys_limit = dma_to_phys(dev, dma_limit); if (*phys_limit <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) return GFP_DMA; if (*phys_limit <= DMA_BIT_MASK(32)) -- Gitee From 86abf88005dd16418a112a5760b882b81bf6ef4a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:24:56 -0700 Subject: [PATCH 1015/2161] bpf: Add typecast to bpf helpers to help BTF generation commit 7c6a469e3416fa23568c2395a3faa7dd6e376dcb upstream. When pahole converts dwarf to btf it emits only used types. Wrap existing bpf helper functions into typedef and use it in typecast to make gcc emits this type into dwarf. Then pahole will convert it to btf. The "btf_#name_of_helper" types will be used to figure out types of arguments of bpf helpers. The generated code before and after is the same. Only dwarf and btf sections are different. 
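 A simplified, illustrative expansion for a hypothetical two-argument helper declared as BPF_CALL_2(bpf_foo, u32, a, u64, b); "bpf_foo" is not a helper touched by this patch and the register/cast plumbing is elided:

	static __always_inline u64 ____bpf_foo(u32 a, u64 b);
	typedef u64 (*btf_bpf_foo)(u32 a, u64 b);	/* new: emitted into dwarf */
	u64 bpf_foo(u64 a, u64 b)
	{
		return ((btf_bpf_foo)____bpf_foo)((u32)a, (u64)b);
	}

The call through the btf_bpf_foo typedef compiles to the same code as before, but it forces gcc to emit the typedef - and with it the helper's real argument types - into dwarf, which pahole can then convert to BTF.
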
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: John Fastabend Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-3-ast@kernel.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/filter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 7c75019718e4..ee401930f397 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -482,10 +482,11 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) #define BPF_CALL_x(x, name, ...) \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ + typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ { \ - return ____##name(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ + return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ } \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) -- Gitee From 1e5ee4cbb3740605eac199845c9b68f4aa54e18c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 28 May 2020 08:19:29 -0700 Subject: [PATCH 1016/2161] perf/x86/intel/uncore: Validate MMIO address before accessing commit f01719730bbe04b90ae60c7e9d2b6d3533308502 upstream. An oops will be triggered, if perf tries to access an invalid address which exceeds the mapped area. Check the address before the actual access to MMIO sapce of an uncore unit. Suggested-by: David Laight Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1590679169-61823-3-git-send-email-kan.liang@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/uncore.c | 3 +++ arch/x86/events/intel/uncore.h | 12 ++++++++++++ arch/x86/events/intel/uncore_snbep.c | 6 ++++++ 3 files changed, 21 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 3d6b5d87b3cc..b02a900deb65 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -139,6 +139,9 @@ u64 uncore_mmio_read_counter(struct intel_uncore_box *box, if (!box->io_addr) return 0; + if (!uncore_mmio_is_valid_offset(box, event->hw.event_base)) + return 0; + return readq(box->io_addr + event->hw.event_base); } diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index c8f9e4118813..9e66271a8d70 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -227,6 +227,18 @@ static inline bool uncore_pmc_freerunning(int idx) return idx == UNCORE_PMC_IDX_FREERUNNING; } +static inline bool uncore_mmio_is_valid_offset(struct intel_uncore_box *box, + unsigned long offset) +{ + if (offset < box->pmu->type->mmio_map_size) + return true; + + pr_warn_once("perf uncore: Invalid offset 0x%lx exceeds mapped area of %s.\n", + offset, box->pmu->type->name); + + return false; +} + static inline unsigned int uncore_mmio_box_ctl(struct intel_uncore_box *box) { diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 6e5360f2b6d7..3fe2d6970f90 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -4689,6 +4689,9 @@ static void snr_uncore_mmio_enable_event(struct intel_uncore_box *box, if (!box->io_addr) return; + if (!uncore_mmio_is_valid_offset(box, 
hwc->config_base)) + return; + writel(hwc->config | SNBEP_PMON_CTL_EN, box->io_addr + hwc->config_base); } @@ -4701,6 +4704,9 @@ static void snr_uncore_mmio_disable_event(struct intel_uncore_box *box, if (!box->io_addr) return; + if (!uncore_mmio_is_valid_offset(box, hwc->config_base)) + return; + writel(hwc->config, box->io_addr + hwc->config_base); } -- Gitee From 2c71d58950e68e6fc811f267b7be26b12e11372e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:47 +0200 Subject: [PATCH 1017/2161] x86/msi: Use generic MSI domain ops commit 9006c133a422f474d7d8e10a8baae179f70c22f5 upstream. pci_msi_get_hwirq() and pci_msi_set_desc are not longer special. Enable the generic MSI domain ops in the core and PCI MSI code unconditionally and get rid of the x86 specific implementations in the X86 MSI code and in the hyperv PCI driver. Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20200826112332.564274859@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/msi.h | 2 -- arch/x86/kernel/apic/msi.c | 23 +---------------------- drivers/pci/controller/pci-hyperv.c | 8 -------- drivers/pci/msi.c | 6 +----- include/linux/msi.h | 1 - kernel/irq/msi.c | 6 ------ 6 files changed, 2 insertions(+), 44 deletions(-) diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h index 9db196bba88b..322fd905da9c 100644 --- a/arch/x86/include/asm/msi.h +++ b/arch/x86/include/asm/msi.h @@ -58,6 +58,4 @@ typedef struct x86_msi_addr_hi { struct msi_msg; u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid); -void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc); - #endif /* _ASM_X86_MSI_H */ diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 188166a0e77c..c3455f2be799 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -155,12 +155,6 @@ static struct irq_chip pci_msi_controller = { IRQCHIP_AFFINITY_PRE_STARTUP, }; -static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) -{ - return arg->hwirq; -} - int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg) { @@ -179,17 +173,8 @@ int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, } EXPORT_SYMBOL_GPL(pci_msi_prepare); -void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) -{ - arg->desc = desc; - arg->hwirq = pci_msi_domain_calc_hwirq(desc); -} -EXPORT_SYMBOL_GPL(pci_msi_set_desc); - static struct msi_domain_ops pci_msi_domain_ops = { - .get_hwirq = pci_msi_get_hwirq, .msi_prepare = pci_msi_prepare, - .set_desc = pci_msi_set_desc, }; static struct msi_domain_info pci_msi_domain_info = { @@ -289,12 +274,6 @@ static struct irq_chip dmar_msi_controller = { IRQCHIP_AFFINITY_PRE_STARTUP, }; -static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) -{ - return arg->hwirq; -} - static int dmar_msi_init(struct irq_domain *domain, struct msi_domain_info *info, unsigned int virq, irq_hw_number_t hwirq, msi_alloc_info_t *arg) @@ -306,7 +285,6 @@ static int dmar_msi_init(struct irq_domain *domain, } static struct msi_domain_ops dmar_msi_domain_ops = { - .get_hwirq = dmar_msi_get_hwirq, .msi_init = dmar_msi_init, }; @@ -348,6 +326,7 @@ int dmar_alloc_hwirq(int id, int node, void *arg) init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_DMAR; info.devid = id; + info.hwirq = id; info.data = arg; return irq_domain_alloc_irqs(domain, 1, node, 
&info); diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 36235782adab..cb799e458e93 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1510,16 +1510,8 @@ static struct irq_chip hv_msi_irq_chip = { .irq_unmask = hv_irq_unmask, }; -static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) -{ - return arg->hwirq; -} - static struct msi_domain_ops hv_msi_ops = { - .get_hwirq = hv_msi_domain_ops_get_hwirq, .msi_prepare = pci_msi_prepare, - .set_desc = pci_msi_set_desc, .msi_free = hv_msi_free, }; diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index c0279e86bf5d..dec89aa8bccb 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1370,7 +1370,7 @@ void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg *msg) * * The ID number is only used within the irqdomain. */ -irq_hw_number_t pci_msi_domain_calc_hwirq(struct msi_desc *desc) +static irq_hw_number_t pci_msi_domain_calc_hwirq(struct msi_desc *desc) { struct pci_dev *dev = msi_desc_to_pci_dev(desc); @@ -1421,16 +1421,12 @@ static int pci_msi_domain_handle_error(struct irq_domain *domain, return error; } -#ifdef GENERIC_MSI_DOMAIN_OPS static void pci_msi_domain_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) { arg->desc = desc; arg->hwirq = pci_msi_domain_calc_hwirq(desc); } -#else -#define pci_msi_domain_set_desc NULL -#endif static struct msi_domain_ops pci_msi_domain_ops_default = { .set_desc = pci_msi_domain_set_desc, diff --git a/include/linux/msi.h b/include/linux/msi.h index 3de0c1c41862..b211b1533fe0 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -413,7 +413,6 @@ void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg *msg); struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent); -irq_hw_number_t pci_msi_domain_calc_hwirq(struct msi_desc *desc); int pci_msi_domain_check_cap(struct irq_domain *domain, struct msi_domain_info *info, struct device *dev); u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 5d3da0db092f..0fef031d34d1 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -187,7 +187,6 @@ static const struct irq_domain_ops msi_domain_ops = { .deactivate = msi_domain_deactivate, }; -#ifdef GENERIC_MSI_DOMAIN_OPS static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info, msi_alloc_info_t *arg) { @@ -206,11 +205,6 @@ static void msi_domain_ops_set_desc(msi_alloc_info_t *arg, { arg->desc = desc; } -#else -#define msi_domain_ops_get_hwirq NULL -#define msi_domain_ops_prepare NULL -#define msi_domain_ops_set_desc NULL -#endif /* !GENERIC_MSI_DOMAIN_OPS */ static int msi_domain_ops_init(struct irq_domain *domain, struct msi_domain_info *info, -- Gitee From a8a0262c6028e6fe56ade8141e0fb9287a97ee1d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:17:02 +0200 Subject: [PATCH 1018/2161] PCI/MSI: Make arch_.*_msi_irq[s] fallbacks selectable commit opencloudos. The arch_.*_msi_irq[s] fallbacks are compiled in whether an architecture requires them or not. Architectures which are fully utilizing hierarchical irq domains should never call into that code. 
It's not only architectures which depend on that by implementing one or more of the weak functions, there is also a bunch of drivers which relies on the weak functions which invoke msi_controller::setup_irq[s] and msi_controller::teardown_irq. Make the architectures and drivers which rely on them select them in Kconfig and if not selected replace them by stub functions which emit a warning and fail the PCI/MSI interrupt allocation. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200826112333.992429909@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/ia64/Kconfig | 1 + arch/mips/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sparc/Kconfig | 1 + drivers/pci/Kconfig | 3 +++ drivers/pci/controller/Kconfig | 2 ++ drivers/pci/msi.c | 3 ++- include/linux/msi.h | 31 ++++++++++++++++++++++++++----- 9 files changed, 38 insertions(+), 6 deletions(-) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index bab7cd878464..69ef5c990dd1 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -57,6 +57,7 @@ config IA64 select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH select NUMA if !FLATMEM + select PCI_MSI_ARCH_FALLBACKS default y help The Itanium Processor Family is Intel's 64-bit successor to diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index f586c22eab92..57f957694dc1 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -82,6 +82,7 @@ config MIPS select MODULES_USE_ELF_RELA if MODULES && 64BIT select MODULES_USE_ELF_REL if MODULES select PERF_USE_VMALLOC + select PCI_MSI_ARCH_FALLBACKS select RTC_LIB select SYSCTL_EXCEPTION_TRACE select VIRT_TO_BUS diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 757175ccf53c..c0c7d51fc3ee 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -242,6 +242,7 @@ config PPC select OLD_SIGACTION if PPC32 select OLD_SIGSUSPEND select PCI_DOMAINS if PCI + select PCI_MSI_ARCH_FALLBACKS select PCI_SYSCALL if PCI select PPC_DAWR if PPC64 select RTC_LIB diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 43a81d0ad507..6bda8c9b519d 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -182,6 +182,7 @@ config S390 select OLD_SIGSUSPEND3 select PCI_DOMAINS if PCI select PCI_MSI if PCI + select PCI_MSI_ARCH_FALLBACKS select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 349e27771cea..2717701a0bfc 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -42,6 +42,7 @@ config SPARC select GENERIC_STRNLEN_USER select MODULES_USE_ELF_RELA select PCI_SYSCALL if PCI + select PCI_MSI_ARCH_FALLBACKS select ODD_RT_SIGACTION select OLD_SIGSUSPEND select CPU_NO_EFFICIENT_FFS diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index a304f5ea11b9..52a3d54f74c3 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -56,6 +56,9 @@ config PCI_MSI_IRQ_DOMAIN depends on PCI_MSI select GENERIC_MSI_IRQ_DOMAIN +config PCI_MSI_ARCH_FALLBACKS + bool + config PCI_QUIRKS default y bool "Enable PCI quirk workarounds" if EXPERT diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index 70e078238899..addfbf1e6122 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -69,6 +69,7 @@ config PCI_TEGRA bool "NVIDIA Tegra PCIe controller" depends on ARCH_TEGRA || COMPILE_TEST depends on PCI_MSI_IRQ_DOMAIN + select PCI_MSI_ARCH_FALLBACKS help Say Y here if you want support for the PCIe host controller found on NVIDIA Tegra SoCs. 
@@ -105,6 +106,7 @@ config PCI_HOST_GENERIC config PCIE_XILINX bool "Xilinx AXI PCIe host bridge support" depends on OF || COMPILE_TEST + select PCI_MSI_ARCH_FALLBACKS help Say 'Y' here if you want kernel to support the Xilinx AXI PCIe Host Bridge driver. diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index dec89aa8bccb..111941199341 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -58,8 +58,8 @@ static void pci_msi_teardown_msi_irqs(struct pci_dev *dev) #define pci_msi_teardown_msi_irqs arch_teardown_msi_irqs #endif +#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS /* Arch hooks */ - int __weak arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) { struct msi_controller *chip = dev->bus->msi; @@ -132,6 +132,7 @@ void __weak arch_teardown_msi_irqs(struct pci_dev *dev) { return default_teardown_msi_irqs(dev); } +#endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */ static void default_restore_msi_irq(struct pci_dev *dev, int irq) { diff --git a/include/linux/msi.h b/include/linux/msi.h index b211b1533fe0..fe1ab1ba2551 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -238,17 +238,38 @@ void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data); /* - * The arch hooks to setup up msi irqs. Those functions are - * implemented as weak symbols so that they /can/ be overriden by - * architecture specific code if needed. + * The arch hooks to setup up msi irqs. Default functions are implemented + * as weak symbols so that they /can/ be overriden by architecture specific + * code if needed. These hooks must be enabled by the architecture or by + * drivers which depend on them via msi_controller based MSI handling. + * + * If CONFIG_PCI_MSI_ARCH_FALLBACKS is not selected they are replaced by + * stubs with warnings. */ +#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc); void arch_teardown_msi_irq(unsigned int irq); int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void arch_teardown_msi_irqs(struct pci_dev *dev); -void arch_restore_msi_irqs(struct pci_dev *dev); - void default_teardown_msi_irqs(struct pci_dev *dev); +#else +static inline int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + WARN_ON_ONCE(1); + return -ENODEV; +} + +static inline void arch_teardown_msi_irqs(struct pci_dev *dev) +{ + WARN_ON_ONCE(1); +} +#endif + +/* + * The restore hooks are still available as they are useful even + * for fully irq domain based setups. Courtesy to XEN/X86. + */ +void arch_restore_msi_irqs(struct pci_dev *dev); void default_restore_msi_irqs(struct pci_dev *dev); struct msi_controller { -- Gitee From 52ae19c74e2d50415b9c96d5eaf761b4bb81382e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 27 Sep 2020 10:46:44 +0200 Subject: [PATCH 1019/2161] x86/apic/msi: Unbreak DMAR and HPET MSI commit d27e623ace6af259075b6e0437380ee8d6268c5d upstream. Switching the DMAR and HPET MSI code to use the generic MSI domain ops missed to add the flag which tells the core code to update the domain operations with the defaults. As a consequence the core code crashes when an interrupt in one of those domains is allocated. Add the missing flags. 
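 Why the flag matters, roughly as the generic code behaves at this point in the series (hedged sketch of kernel/irq/msi.c, not part of this patch):

	/* msi_create_irq_domain(), roughly: */
	if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
		msi_domain_update_dom_ops(info);
	/*
	 * msi_domain_update_dom_ops() fills NULL callbacks such as
	 * get_hwirq() and set_desc() with the generic defaults.
	 */

Without the flag, the DMAR/HPET msi_domain_ops keep the NULL callbacks left behind once the x86-specific get_hwirq()/set_desc() implementations were removed, and the first interrupt allocation in those domains dereferences them.
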
Fixes: 9006c133a422 ("x86/msi: Use generic MSI domain ops") Reported-by: Qian Cai Reported-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/87wo0fli8b.fsf@nanos.tec.linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/apic/msi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index c3455f2be799..17a71d0580d3 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -291,6 +291,7 @@ static struct msi_domain_ops dmar_msi_domain_ops = { static struct msi_domain_info dmar_msi_domain_info = { .ops = &dmar_msi_domain_ops, .chip = &dmar_msi_controller, + .flags = MSI_FLAG_USE_DEF_DOM_OPS, }; static struct irq_domain *dmar_get_irq_domain(void) -- Gitee From b371fdf4faf01a970724f5f84d1a3b347f58f50f Mon Sep 17 00:00:00 2001 From: Tom Murphy Date: Mon, 17 Aug 2020 22:00:49 +0100 Subject: [PATCH 1020/2161] iommu: Rename iommu_tlb_* functions to iommu_iotlb_* commit opencloudos. To keep naming consistent we should stick with *iotlb*. This patch renames a few remaining functions. Signed-off-by: Tom Murphy Link: https://lore.kernel.org/r/20200817210051.13546-1-murphyt7@tcd.ie Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 2 +- drivers/iommu/iommu.c | 4 ++-- drivers/vfio/vfio_iommu_type1.c | 2 +- include/linux/io-pgtable.h | 2 +- include/linux/iommu.h | 10 +++++----- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index d3a91e469e7c..94729fd7c697 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -466,7 +466,7 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, WARN_ON(unmapped != size); if (!cookie->fq_domain) - iommu_tlb_sync(domain, &iotlb_gather); + iommu_iotlb_sync(domain, &iotlb_gather); iommu_dma_free_iova(cookie, dma_addr, size); } diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 8a915752fb1f..e9dc8d362bd3 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -768,7 +768,7 @@ static int iommu_create_device_direct_mappings(struct iommu_group *group, } - iommu_flush_tlb_all(domain); + iommu_flush_iotlb_all(domain); out: iommu_put_resv_regions(dev, &mappings); @@ -2336,7 +2336,7 @@ size_t iommu_unmap(struct iommu_domain *domain, iommu_iotlb_gather_init(&iotlb_gather); ret = __iommu_unmap(domain, iova, size, &iotlb_gather); - iommu_tlb_sync(domain, &iotlb_gather); + iommu_iotlb_sync(domain, &iotlb_gather); return ret; } diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index f8b3af3ebbdd..15a780f4d75e 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -707,7 +707,7 @@ static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain, long unlocked = 0; struct vfio_regions *entry, *next; - iommu_tlb_sync(domain->domain, iotlb_gather); + iommu_iotlb_sync(domain->domain, iotlb_gather); list_for_each_entry_safe(entry, next, regions, list) { unlocked += vfio_unpin_pages_remote(dma, diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index ec7a13405f10..a2ebcbff7d1a 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -31,7 +31,7 @@ enum io_pgtable_fmt { * single page. 
IOMMUs that cannot batch TLB invalidation * operations efficiently will typically issue them here, but * others may decide to update the iommu_iotlb_gather structure - * and defer the invalidation until iommu_tlb_sync() instead. + * and defer the invalidation until iommu_iotlb_sync() instead. * * Note that these can all be called in atomic context and must therefore * not block. diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 513ec3dc0170..3d2b46daa4c3 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -556,13 +556,13 @@ extern void iommu_domain_window_disable(struct iommu_domain *domain, u32 wnd_nr) extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags); -static inline void iommu_flush_tlb_all(struct iommu_domain *domain) +static inline void iommu_flush_iotlb_all(struct iommu_domain *domain) { if (domain->ops->flush_iotlb_all) domain->ops->flush_iotlb_all(domain); } -static inline void iommu_tlb_sync(struct iommu_domain *domain, +static inline void iommu_iotlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather) { if (domain->ops->iotlb_sync) @@ -585,7 +585,7 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, if (gather->pgsize != size || end < gather->start || start > gather->end) { if (gather->pgsize) - iommu_tlb_sync(domain, gather); + iommu_iotlb_sync(domain, gather); gather->pgsize = size; } @@ -765,11 +765,11 @@ static inline size_t iommu_map_sg_atomic(struct iommu_domain *domain, return 0; } -static inline void iommu_flush_tlb_all(struct iommu_domain *domain) +static inline void iommu_flush_iotlb_all(struct iommu_domain *domain) { } -static inline void iommu_tlb_sync(struct iommu_domain *domain, +static inline void iommu_iotlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather) { } -- Gitee From 2410f63d5099a72c9b618d9dff13092fab4db218 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 3 Sep 2020 12:34:04 +0100 Subject: [PATCH 1021/2161] iommu/dma: Remove broken huge page handling commit 4604393ca0c6e4a73c04798c3f1d9878e70b7251 upstream. The attempt to handle huge page allocations was originally added since the comments around stripping __GFP_COMP in other implementations were nonsensical, and we naively assumed that split_huge_page() could simply be called equivalently to split_page(). It turns out that this doesn't actually work correctly, so just get rid of it - there's little point going to the effort of allocating huge pages if we're only going to split them anyway. 
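Condensed for readability, the allocation loop after this patch behaves as in the sketch below; this is only an illustrative paraphrase of the hunk that follows, not extra code.

	/* Never request compound pages, and split any higher-order allocation
	 * into order-0 pages immediately, since the caller only ever consumes
	 * a plain array of struct page pointers. */
	gfp &= ~__GFP_COMP;

	page = alloc_pages_node(nid, alloc_flags, order);
	if (page && order)
		split_page(page, order);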
Reported-by: Roman Gushchin Signed-off-by: Robin Murphy Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/e287dbe69aa0933abafd97c80631940fd188ddd1.1599132844.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 94729fd7c697..b696b5704b8e 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -516,6 +516,9 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev, /* IOMMU can map any pages, so himem can also be used here */ gfp |= __GFP_NOWARN | __GFP_HIGHMEM; + /* It makes no sense to muck about with huge pages */ + gfp &= ~__GFP_COMP; + while (count) { struct page *page = NULL; unsigned int order_size; @@ -536,15 +539,9 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev, page = alloc_pages_node(nid, alloc_flags, order); if (!page) continue; - if (!order) - break; - if (!PageCompound(page)) { + if (order) split_page(page, order); - break; - } else if (!split_huge_page(page)) { - break; - } - __free_pages(page, order); + break; } if (!page) { __iommu_dma_free_pages(pages, i); -- Gitee From 79bd6c6e603005bf272466684c073839a90d8f07 Mon Sep 17 00:00:00 2001 From: Tom Murphy Date: Thu, 10 Sep 2020 13:25:38 +0100 Subject: [PATCH 1022/2161] iommu/dma: Handle init_iova_flush_queue() failure in dma-iommu path commit b34e9b0de3c4117e70f09ae354d763cc4dd85bdf upstream. The init_iova_flush_queue() function can fail if we run out of memory. Fall back to noflush queue if it fails. Signed-off-by: Tom Murphy Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/20200910122539.3662-1-murphyt7@tcd.ie Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index b696b5704b8e..02f18221f412 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -353,8 +353,11 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, if (!cookie->fq_domain && !iommu_domain_get_attr(domain, DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, &attr) && attr) { - cookie->fq_domain = domain; - init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all, NULL); + if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all, + NULL)) + pr_warn("iova flush queue initialization failed\n"); + else + cookie->fq_domain = domain; } if (!dev) -- Gitee From 0a420f2beef99753e5b027431780bbb3e04576b7 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 25 Sep 2020 09:32:42 -0700 Subject: [PATCH 1023/2161] docs: IOMMU user API commit d0023e3ee28d12b2b0af4856c405f3d13fb59d1c upstream. IOMMU UAPI is newly introduced to support communications between guest virtual IOMMU and host IOMMU. There has been lots of discussions on how it should work with VFIO UAPI and userspace in general. This document is intended to clarify the UAPI design and usage. The mechanics of how future extensions should be achieved are also covered in this documentation. 
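As a companion to the document added below, a hedged userspace-side sketch of the argsz convention it describes could look like this; the transport that actually carries the structure (for example a VFIO ioctl) is deliberately left unnamed because it is outside the scope of this patch.

	struct iommu_cache_invalidate_info info = { 0 };

	/* argsz always reflects the size this caller was compiled against;
	 * the kernel decides how much of it it understands. */
	info.argsz = sizeof(info);
	info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
	info.cache = IOMMU_CACHE_INV_TYPE_IOTLB;
	info.granularity = IOMMU_INV_GRANU_DOMAIN;

	/* hand &info to the user/kernel transport as opaque data */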
Signed-off-by: Liu Yi L Signed-off-by: Jacob Pan Reviewed-by: Eric Auger Cc: linux-api@vger.kernel.org Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Link: https://lore.kernel.org/r/1601051567-54787-2-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/userspace-api/iommu.rst | 209 ++++++++++++++++++++++++++ MAINTAINERS | 1 + 2 files changed, 210 insertions(+) create mode 100644 Documentation/userspace-api/iommu.rst diff --git a/Documentation/userspace-api/iommu.rst b/Documentation/userspace-api/iommu.rst new file mode 100644 index 000000000000..d3108c1519d5 --- /dev/null +++ b/Documentation/userspace-api/iommu.rst @@ -0,0 +1,209 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. iommu: + +===================================== +IOMMU Userspace API +===================================== + +IOMMU UAPI is used for virtualization cases where communications are +needed between physical and virtual IOMMU drivers. For baremetal +usage, the IOMMU is a system device which does not need to communicate +with userspace directly. + +The primary use cases are guest Shared Virtual Address (SVA) and +guest IO virtual address (IOVA), wherein the vIOMMU implementation +relies on the physical IOMMU and for this reason requires interactions +with the host driver. + +.. contents:: :local: + +Functionalities +=============== +Communications of user and kernel involve both directions. The +supported user-kernel APIs are as follows: + +1. Bind/Unbind guest PASID (e.g. Intel VT-d) +2. Bind/Unbind guest PASID table (e.g. ARM SMMU) +3. Invalidate IOMMU caches upon guest requests +4. Report errors to the guest and serve page requests + +Requirements +============ +The IOMMU UAPIs are generic and extensible to meet the following +requirements: + +1. Emulated and para-virtualised vIOMMUs +2. Multiple vendors (Intel VT-d, ARM SMMU, etc.) +3. Extensions to the UAPI shall not break existing userspace + +Interfaces +========== +Although the data structures defined in IOMMU UAPI are self-contained, +there are no user API functions introduced. Instead, IOMMU UAPI is +designed to work with existing user driver frameworks such as VFIO. + +Extension Rules & Precautions +----------------------------- +When IOMMU UAPI gets extended, the data structures can *only* be +modified in two ways: + +1. Adding new fields by re-purposing the padding[] field. No size change. +2. Adding new union members at the end. May increase the structure sizes. + +No new fields can be added *after* the variable sized union in that it +will break backward compatibility when offset moves. A new flag must +be introduced whenever a change affects the structure using either +method. The IOMMU driver processes the data based on flags which +ensures backward compatibility. + +Version field is only reserved for the unlikely event of UAPI upgrade +at its entirety. + +It's *always* the caller's responsibility to indicate the size of the +structure passed by setting argsz appropriately. +Though at the same time, argsz is user provided data which is not +trusted. The argsz field allows the user app to indicate how much data +it is providing; it's still the kernel's responsibility to validate +whether it's correct and sufficient for the requested operation. + +Compatibility Checking +---------------------- +When IOMMU UAPI extension results in some structure size increase, +IOMMU UAPI code shall handle the following cases: + +1. User and kernel has exact size match +2. 
An older user with older kernel header (smaller UAPI size) running on a + newer kernel (larger UAPI size) +3. A newer user with newer kernel header (larger UAPI size) running + on an older kernel. +4. A malicious/misbehaving user passing illegal/invalid size but within + range. The data may contain garbage. + +Feature Checking +---------------- +While launching a guest with vIOMMU, it is strongly advised to check +the compatibility upfront, as some subsequent errors happening during +vIOMMU operation, such as cache invalidation failures cannot be nicely +escalated to the guest due to IOMMU specifications. This can lead to +catastrophic failures for the users. + +User applications such as QEMU are expected to import kernel UAPI +headers. Backward compatibility is supported per feature flags. +For example, an older QEMU (with older kernel header) can run on newer +kernel. Newer QEMU (with new kernel header) may refuse to initialize +on an older kernel if new feature flags are not supported by older +kernel. Simply recompiling existing code with newer kernel header should +not be an issue in that only existing flags are used. + +IOMMU vendor driver should report the below features to IOMMU UAPI +consumers (e.g. via VFIO). + +1. IOMMU_NESTING_FEAT_SYSWIDE_PASID +2. IOMMU_NESTING_FEAT_BIND_PGTBL +3. IOMMU_NESTING_FEAT_BIND_PASID_TABLE +4. IOMMU_NESTING_FEAT_CACHE_INVLD +5. IOMMU_NESTING_FEAT_PAGE_REQUEST + +Take VFIO as example, upon request from VFIO userspace (e.g. QEMU), +VFIO kernel code shall query IOMMU vendor driver for the support of +the above features. Query result can then be reported back to the +userspace caller. Details can be found in +Documentation/driver-api/vfio.rst. + + +Data Passing Example with VFIO +------------------------------ +As the ubiquitous userspace driver framework, VFIO is already IOMMU +aware and shares many key concepts such as device model, group, and +protection domain. Other user driver frameworks can also be extended +to support IOMMU UAPI but it is outside the scope of this document. + +In this tight-knit VFIO-IOMMU interface, the ultimate consumer of the +IOMMU UAPI data is the host IOMMU driver. VFIO facilitates user-kernel +transport, capability checking, security, and life cycle management of +process address space ID (PASID). + +VFIO layer conveys the data structures down to the IOMMU driver. It +follows the pattern below:: + + struct { + __u32 argsz; + __u32 flags; + __u8 data[]; + }; + +Here data[] contains the IOMMU UAPI data structures. VFIO has the +freedom to bundle the data as well as parse data size based on its own flags. + +In order to determine the size and feature set of the user data, argsz +and flags (or the equivalent) are also embedded in the IOMMU UAPI data +structures. + +A "__u32 argsz" field is *always* at the beginning of each structure. + +For example: +:: + + struct iommu_cache_invalidate_info { + __u32 argsz; + #define IOMMU_CACHE_INVALIDATE_INFO_VERSION_1 1 + __u32 version; + /* IOMMU paging structure cache */ + #define IOMMU_CACHE_INV_TYPE_IOTLB (1 << 0) /* IOMMU IOTLB */ + #define IOMMU_CACHE_INV_TYPE_DEV_IOTLB (1 << 1) /* Device IOTLB */ + #define IOMMU_CACHE_INV_TYPE_PASID (1 << 2) /* PASID cache */ + #define IOMMU_CACHE_INV_TYPE_NR (3) + __u8 cache; + __u8 granularity; + __u8 padding[6]; + union { + struct iommu_inv_pasid_info pasid_info; + struct iommu_inv_addr_info addr_info; + } granu; + }; + +VFIO is responsible for checking its own argsz and flags. It then +invokes appropriate IOMMU UAPI functions. 
The user pointers are passed +to the IOMMU layer for further processing. The responsibilities are +divided as follows: + +- Generic IOMMU layer checks argsz range based on UAPI data in the + current kernel version. + +- Generic IOMMU layer checks content of the UAPI data for non-zero + reserved bits in flags, padding fields, and unsupported version. + This is to ensure not breaking userspace in the future when these + fields or flags are used. + +- Vendor IOMMU driver checks argsz based on vendor flags. UAPI data + is consumed based on flags. Vendor driver has access to + unadulterated argsz value in case of vendor specific future + extensions. Currently, it does not perform the copy_from_user() + itself. A __user pointer can be provided in some future scenarios + where there's vendor data outside of the structure definition. + +IOMMU code treats UAPI data in two categories: + +- structure contains vendor data + (Example: iommu_uapi_cache_invalidate()) + +- structure contains only generic data + (Example: iommu_uapi_sva_bind_gpasid()) + + + +Sharing UAPI with in-kernel users +--------------------------------- +For UAPIs that are shared with in-kernel users, a wrapper function is +provided to distinguish the callers. For example, + +Userspace caller :: + + int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, + void __user *udata) + +In-kernel caller :: + + int iommu_sva_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, ioasid_t ioasid); diff --git a/MAINTAINERS b/MAINTAINERS index b362ea22b5c5..4bb2cfd5800e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8586,6 +8586,7 @@ L: iommu@lists.linux-foundation.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git S: Maintained F: Documentation/devicetree/bindings/iommu/ +F: Documentation/userspace-api/iommu.rst F: drivers/iommu/ F: include/linux/iommu.h F: include/linux/of_iommu.h -- Gitee From 7a589586958d13b1ff8dd19b7b5a38b3f74c4f9d Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 25 Sep 2020 09:32:45 -0700 Subject: [PATCH 1024/2161] iommu/uapi: Rename uapi functions commit 23cc3493b5e107b8deb697cf3157a07276b5eff7 upstream. User APIs such as iommu_sva_unbind_gpasid() may also be used by the kernel. Since we introduced user pointer to the UAPI functions, in-kernel callers cannot share the same APIs. In-kernel callers are also trusted, there is no need to validate the data. We plan to have two flavors of the same API functions, one called through ioctls, carrying a user pointer and one called directly with valid IOMMU UAPI structs. To differentiate both, let's rename existing functions with an iommu_uapi_ prefix. 
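The end state this rename works towards (completed by the following patch in this series) is roughly the split below; the prototypes are lifted from the later hunks and are shown here only to make the two flavours explicit.

	/* ioctl flavour: takes an untrusted user pointer and copies/validates
	 * argsz and flags before calling into the vendor driver. */
	int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain,
					 struct device *dev,
					 void __user *udata);

	/* in-kernel flavour: the caller already holds trusted, validated data,
	 * so no copy_from_user() or sanity pass is needed. */
	int iommu_sva_unbind_gpasid(struct iommu_domain *domain,
				    struct device *dev, ioasid_t pasid);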
Suggested-by: Alex Williamson Signed-off-by: Jacob Pan Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/1601051567-54787-5-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 18 +++++++++--------- include/linux/iommu.h | 31 ++++++++++++++++--------------- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index e9dc8d362bd3..f383b7e5c8b7 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1979,35 +1979,35 @@ int iommu_attach_device(struct iommu_domain *domain, struct device *dev) } EXPORT_SYMBOL_GPL(iommu_attach_device); -int iommu_cache_invalidate(struct iommu_domain *domain, struct device *dev, - struct iommu_cache_invalidate_info *inv_info) +int iommu_uapi_cache_invalidate(struct iommu_domain *domain, struct device *dev, + struct iommu_cache_invalidate_info *inv_info) { if (unlikely(!domain->ops->cache_invalidate)) return -ENODEV; return domain->ops->cache_invalidate(domain, dev, inv_info); } -EXPORT_SYMBOL_GPL(iommu_cache_invalidate); +EXPORT_SYMBOL_GPL(iommu_uapi_cache_invalidate); -int iommu_sva_bind_gpasid(struct iommu_domain *domain, - struct device *dev, struct iommu_gpasid_bind_data *data) +int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, + struct device *dev, struct iommu_gpasid_bind_data *data) { if (unlikely(!domain->ops->sva_bind_gpasid)) return -ENODEV; return domain->ops->sva_bind_gpasid(domain, dev, data); } -EXPORT_SYMBOL_GPL(iommu_sva_bind_gpasid); +EXPORT_SYMBOL_GPL(iommu_uapi_sva_bind_gpasid); -int iommu_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, - ioasid_t pasid) +int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid) { if (unlikely(!domain->ops->sva_unbind_gpasid)) return -ENODEV; return domain->ops->sva_unbind_gpasid(dev, pasid); } -EXPORT_SYMBOL_GPL(iommu_sva_unbind_gpasid); +EXPORT_SYMBOL_GPL(iommu_uapi_sva_unbind_gpasid); static void __iommu_detach_device(struct iommu_domain *domain, struct device *dev) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3d2b46daa4c3..7a44a225f5ec 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -449,13 +449,13 @@ extern int iommu_attach_device(struct iommu_domain *domain, struct device *dev); extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); -extern int iommu_cache_invalidate(struct iommu_domain *domain, - struct device *dev, - struct iommu_cache_invalidate_info *inv_info); -extern int iommu_sva_bind_gpasid(struct iommu_domain *domain, - struct device *dev, struct iommu_gpasid_bind_data *data); -extern int iommu_sva_unbind_gpasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid); +extern int iommu_uapi_cache_invalidate(struct iommu_domain *domain, + struct device *dev, + struct iommu_cache_invalidate_info *inv_info); +extern int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, + struct device *dev, struct iommu_gpasid_bind_data *data); +extern int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, @@ -1072,21 +1072,22 @@ static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle) return IOMMU_PASID_INVALID; } -static inline int 
-iommu_cache_invalidate(struct iommu_domain *domain, - struct device *dev, - struct iommu_cache_invalidate_info *inv_info) +static inline int iommu_uapi_cache_invalidate(struct iommu_domain *domain, + struct device *dev, + struct iommu_cache_invalidate_info *inv_info) { return -ENODEV; } -static inline int iommu_sva_bind_gpasid(struct iommu_domain *domain, - struct device *dev, struct iommu_gpasid_bind_data *data) + +static inline int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, + struct device *dev, + struct iommu_gpasid_bind_data *data) { return -ENODEV; } -static inline int iommu_sva_unbind_gpasid(struct iommu_domain *domain, - struct device *dev, u32 pasid) +static inline int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, int pasid) { return -ENODEV; } -- Gitee From 9e8cdc807629af49f86594fce33e4a4c9fb7040e Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 25 Sep 2020 09:32:46 -0700 Subject: [PATCH 1025/2161] iommu/uapi: Handle data and argsz filled by users commit d90573812eea63c6bc8ab8a38f661b4c27c3cdc0 upstream. IOMMU user APIs are responsible for processing user data. This patch changes the interface such that user pointers can be passed into IOMMU code directly. Separate kernel APIs without user pointers are introduced for in-kernel users of the UAPI functionality. IOMMU UAPI data has a user filled argsz field which indicates the data length of the structure. User data is not trusted, argsz must be validated based on the current kernel data size, mandatory data size, and feature flags. User data may also be extended, resulting in possible argsz increase. Backward compatibility is ensured based on size and flags (or the functional equivalent fields) checking. This patch adds sanity checks in the IOMMU layer. In addition to argsz, reserved/unused fields in padding, flags, and version are also checked. Details are documented in Documentation/userspace-api/iommu.rst Signed-off-by: Liu Yi L Signed-off-by: Jacob Pan Reviewed-by: Jean-Philippe Brucker Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/1601051567-54787-6-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 194 +++++++++++++++++++++++++++++++++++-- include/linux/iommu.h | 28 ++++-- include/uapi/linux/iommu.h | 1 + 3 files changed, 207 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index f383b7e5c8b7..a7ed323f97f3 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1979,34 +1979,214 @@ int iommu_attach_device(struct iommu_domain *domain, struct device *dev) } EXPORT_SYMBOL_GPL(iommu_attach_device); +/* + * Check flags and other user provided data for valid combinations. We also + * make sure no reserved fields or unused flags are set. This is to ensure + * not breaking userspace in the future when these fields or flags are used. 
+ */ +static int iommu_check_cache_invl_data(struct iommu_cache_invalidate_info *info) +{ + u32 mask; + int i; + + if (info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1) + return -EINVAL; + + mask = (1 << IOMMU_CACHE_INV_TYPE_NR) - 1; + if (info->cache & ~mask) + return -EINVAL; + + if (info->granularity >= IOMMU_INV_GRANU_NR) + return -EINVAL; + + switch (info->granularity) { + case IOMMU_INV_GRANU_ADDR: + if (info->cache & IOMMU_CACHE_INV_TYPE_PASID) + return -EINVAL; + + mask = IOMMU_INV_ADDR_FLAGS_PASID | + IOMMU_INV_ADDR_FLAGS_ARCHID | + IOMMU_INV_ADDR_FLAGS_LEAF; + + if (info->granu.addr_info.flags & ~mask) + return -EINVAL; + break; + case IOMMU_INV_GRANU_PASID: + mask = IOMMU_INV_PASID_FLAGS_PASID | + IOMMU_INV_PASID_FLAGS_ARCHID; + if (info->granu.pasid_info.flags & ~mask) + return -EINVAL; + + break; + case IOMMU_INV_GRANU_DOMAIN: + if (info->cache & IOMMU_CACHE_INV_TYPE_DEV_IOTLB) + return -EINVAL; + break; + default: + return -EINVAL; + } + + /* Check reserved padding fields */ + for (i = 0; i < sizeof(info->padding); i++) { + if (info->padding[i]) + return -EINVAL; + } + + return 0; +} + int iommu_uapi_cache_invalidate(struct iommu_domain *domain, struct device *dev, - struct iommu_cache_invalidate_info *inv_info) + void __user *uinfo) { + struct iommu_cache_invalidate_info inv_info = { 0 }; + u32 minsz; + int ret; + if (unlikely(!domain->ops->cache_invalidate)) return -ENODEV; - return domain->ops->cache_invalidate(domain, dev, inv_info); + /* + * No new spaces can be added before the variable sized union, the + * minimum size is the offset to the union. + */ + minsz = offsetof(struct iommu_cache_invalidate_info, granu); + + /* Copy minsz from user to get flags and argsz */ + if (copy_from_user(&inv_info, uinfo, minsz)) + return -EFAULT; + + /* Fields before the variable size union are mandatory */ + if (inv_info.argsz < minsz) + return -EINVAL; + + /* PASID and address granu require additional info beyond minsz */ + if (inv_info.granularity == IOMMU_INV_GRANU_PASID && + inv_info.argsz < offsetofend(struct iommu_cache_invalidate_info, granu.pasid_info)) + return -EINVAL; + + if (inv_info.granularity == IOMMU_INV_GRANU_ADDR && + inv_info.argsz < offsetofend(struct iommu_cache_invalidate_info, granu.addr_info)) + return -EINVAL; + + /* + * User might be using a newer UAPI header which has a larger data + * size, we shall support the existing flags within the current + * size. Copy the remaining user data _after_ minsz but not more + * than the current kernel supported size. 
+ */ + if (copy_from_user((void *)&inv_info + minsz, uinfo + minsz, + min_t(u32, inv_info.argsz, sizeof(inv_info)) - minsz)) + return -EFAULT; + + /* Now the argsz is validated, check the content */ + ret = iommu_check_cache_invl_data(&inv_info); + if (ret) + return ret; + + return domain->ops->cache_invalidate(domain, dev, &inv_info); } EXPORT_SYMBOL_GPL(iommu_uapi_cache_invalidate); -int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, - struct device *dev, struct iommu_gpasid_bind_data *data) +static int iommu_check_bind_data(struct iommu_gpasid_bind_data *data) +{ + u32 mask; + int i; + + if (data->version != IOMMU_GPASID_BIND_VERSION_1) + return -EINVAL; + + /* Check the range of supported formats */ + if (data->format >= IOMMU_PASID_FORMAT_LAST) + return -EINVAL; + + /* Check all flags */ + mask = IOMMU_SVA_GPASID_VAL; + if (data->flags & ~mask) + return -EINVAL; + + /* Check reserved padding fields */ + for (i = 0; i < sizeof(data->padding); i++) { + if (data->padding[i]) + return -EINVAL; + } + + return 0; +} + +static int iommu_sva_prepare_bind_data(void __user *udata, + struct iommu_gpasid_bind_data *data) +{ + u32 minsz; + + /* + * No new spaces can be added before the variable sized union, the + * minimum size is the offset to the union. + */ + minsz = offsetof(struct iommu_gpasid_bind_data, vendor); + + /* Copy minsz from user to get flags and argsz */ + if (copy_from_user(data, udata, minsz)) + return -EFAULT; + + /* Fields before the variable size union are mandatory */ + if (data->argsz < minsz) + return -EINVAL; + /* + * User might be using a newer UAPI header, we shall let IOMMU vendor + * driver decide on what size it needs. Since the guest PASID bind data + * can be vendor specific, larger argsz could be the result of extension + * for one vendor but it should not affect another vendor. 
+ * Copy the remaining user data _after_ minsz + */ + if (copy_from_user((void *)data + minsz, udata + minsz, + min_t(u32, data->argsz, sizeof(*data)) - minsz)) + return -EFAULT; + + return iommu_check_bind_data(data); +} + +int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, struct device *dev, + void __user *udata) { + struct iommu_gpasid_bind_data data = { 0 }; + int ret; + if (unlikely(!domain->ops->sva_bind_gpasid)) return -ENODEV; - return domain->ops->sva_bind_gpasid(domain, dev, data); + ret = iommu_sva_prepare_bind_data(udata, &data); + if (ret) + return ret; + + return domain->ops->sva_bind_gpasid(domain, dev, &data); } EXPORT_SYMBOL_GPL(iommu_uapi_sva_bind_gpasid); -int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, - ioasid_t pasid) +int iommu_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid) { if (unlikely(!domain->ops->sva_unbind_gpasid)) return -ENODEV; return domain->ops->sva_unbind_gpasid(dev, pasid); } +EXPORT_SYMBOL_GPL(iommu_sva_unbind_gpasid); + +int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, + void __user *udata) +{ + struct iommu_gpasid_bind_data data = { 0 }; + int ret; + + if (unlikely(!domain->ops->sva_bind_gpasid)) + return -ENODEV; + + ret = iommu_sva_prepare_bind_data(udata, &data); + if (ret) + return ret; + + return iommu_sva_unbind_gpasid(domain, dev, data.hpasid); +} EXPORT_SYMBOL_GPL(iommu_uapi_sva_unbind_gpasid); static void __iommu_detach_device(struct iommu_domain *domain, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7a44a225f5ec..4972c1e20848 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -451,11 +451,14 @@ extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); extern int iommu_uapi_cache_invalidate(struct iommu_domain *domain, struct device *dev, - struct iommu_cache_invalidate_info *inv_info); + void __user *uinfo); + extern int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, - struct device *dev, struct iommu_gpasid_bind_data *data); + struct device *dev, void __user *udata); extern int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid); + struct device *dev, void __user *udata); +extern int iommu_sva_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, @@ -1072,22 +1075,29 @@ static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle) return IOMMU_PASID_INVALID; } -static inline int iommu_uapi_cache_invalidate(struct iommu_domain *domain, - struct device *dev, - struct iommu_cache_invalidate_info *inv_info) +static inline int +iommu_uapi_cache_invalidate(struct iommu_domain *domain, + struct device *dev, + struct iommu_cache_invalidate_info *inv_info) { return -ENODEV; } static inline int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, - struct device *dev, - struct iommu_gpasid_bind_data *data) + struct device *dev, void __user *udata) { return -ENODEV; } static inline int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, - struct device *dev, int pasid) + struct device *dev, void __user *udata) +{ + return -ENODEV; +} + +static inline int iommu_sva_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, + ioasid_t pasid) { return -ENODEV; } diff --git 
a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 05a3434b3d56..59178fc229ca 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -324,6 +324,7 @@ struct iommu_gpasid_bind_data { #define IOMMU_GPASID_BIND_VERSION_1 1 __u32 version; #define IOMMU_PASID_FORMAT_INTEL_VTD 1 +#define IOMMU_PASID_FORMAT_LAST 2 __u32 format; __u32 addr_width; #define IOMMU_SVA_GPASID_VAL (1 << 0) /* guest PASID valid */ -- Gitee From 4943035cbad8208e033ec307ec1114c3177ff461 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 1 Apr 2022 15:49:06 +0800 Subject: [PATCH 1026/2161] iommu: Fix implicit declaration of function 'copy_from_user' commit opencloudos. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index a7ed323f97f3..24c77529812c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -25,6 +25,7 @@ #include #include #include +#include #include static struct kset *iommu_group_kset; -- Gitee From 5eec930a2eed2fc200f7e92c028beb996dc0540e Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 19 Jun 2020 09:20:03 +0100 Subject: [PATCH 1027/2161] ACPI/IORT: Make iort_get_device_domain IRQ domain agnostic commit d1718a1b7a86743b9c517bf9521695ba909c734f upstream. iort_get_device_domain() is PCI specific but it need not be, since it can be used to retrieve IRQ domain nexus of any kind by adding an irq_domain_bus_token input to it. Make it PCI agnostic by also renaming the requestor ID input to a more generic ID name. Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas # pci/msi.c Cc: Will Deacon Cc: Hanjun Guo Cc: Bjorn Helgaas Cc: Sudeep Holla Cc: Robin Murphy Cc: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/20200619082013.13661-3-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/arm64/iort.c | 14 +++++++------- drivers/pci/msi.c | 3 ++- include/linux/acpi_iort.h | 7 ++++--- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 0e1e50f3a4d2..e2943cb5c292 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -503,7 +503,6 @@ static struct acpi_iort_node *iort_find_dev_node(struct device *dev) node = iort_get_iort_node(dev->fwnode); if (node) return node; - /* * if not, then it should be a platform device defined in * DSDT/SSDT (with Named Component node in IORT) @@ -594,13 +593,13 @@ static int __maybe_unused iort_find_its_base(u32 its_id, phys_addr_t *base) /** * iort_dev_find_its_id() - Find the ITS identifier for a device * @dev: The device. - * @req_id: Device's requester ID + * @id: Device's ID * @idx: Index of the ITS identifier list. * @its_id: ITS identifier. 
* * Returns: 0 on success, appropriate error value otherwise */ -static int iort_dev_find_its_id(struct device *dev, u32 req_id, +static int iort_dev_find_its_id(struct device *dev, u32 id, unsigned int idx, int *its_id) { struct acpi_iort_its_group *its; @@ -610,7 +609,7 @@ static int iort_dev_find_its_id(struct device *dev, u32 req_id, if (!node) return -ENXIO; - node = iort_node_map_id(node, req_id, NULL, IORT_MSI_TYPE); + node = iort_node_map_id(node, id, NULL, IORT_MSI_TYPE); if (!node) return -ENXIO; @@ -633,19 +632,20 @@ static int iort_dev_find_its_id(struct device *dev, u32 req_id, * * Returns: the MSI domain for this device, NULL otherwise */ -struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id) +struct irq_domain *iort_get_device_domain(struct device *dev, u32 id, + enum irq_domain_bus_token bus_token) { struct fwnode_handle *handle; int its_id; - if (iort_dev_find_its_id(dev, req_id, 0, &its_id)) + if (iort_dev_find_its_id(dev, id, 0, &its_id)) return NULL; handle = iort_find_domain_token(its_id); if (!handle) return NULL; - return irq_find_matching_fwnode(handle, DOMAIN_BUS_PCI_MSI); + return irq_find_matching_fwnode(handle, bus_token); } static void iort_set_device_domain(struct device *dev, diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 111941199341..c094a7e8c2bc 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1570,7 +1570,8 @@ struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid); dom = of_msi_map_get_device_domain(&pdev->dev, rid); if (!dom) - dom = iort_get_device_domain(&pdev->dev, rid); + dom = iort_get_device_domain(&pdev->dev, rid, + DOMAIN_BUS_PCI_MSI); return dom; } diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 8e7e2ec37f1b..08ec6bd2297f 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -29,7 +29,8 @@ struct fwnode_handle *iort_find_domain_token(int trans_id); #ifdef CONFIG_ACPI_IORT void acpi_iort_init(void); u32 iort_msi_map_rid(struct device *dev, u32 req_id); -struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id); +struct irq_domain *iort_get_device_domain(struct device *dev, u32 id, + enum irq_domain_bus_token bus_token); void acpi_configure_pmsi_domain(struct device *dev); int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id); /* IOMMU interface */ @@ -40,8 +41,8 @@ int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head); static inline void acpi_iort_init(void) { } static inline u32 iort_msi_map_rid(struct device *dev, u32 req_id) { return req_id; } -static inline struct irq_domain *iort_get_device_domain(struct device *dev, - u32 req_id) +static inline struct irq_domain *iort_get_device_domain( + struct device *dev, u32 id, enum irq_domain_bus_token bus_token) { return NULL; } static inline void acpi_configure_pmsi_domain(struct device *dev) { } /* IOMMU interface */ -- Gitee From 4b0a2e06da3d4671a4317349a8b991c3f4ae43b7 Mon Sep 17 00:00:00 2001 From: Diana Craciun Date: Fri, 19 Jun 2020 09:20:10 +0100 Subject: [PATCH 1028/2161] of/irq: make of_msi_map_get_device_domain() bus agnostic commit 6f881aba01109a01a43e4f135673c19190f61133 upstream. of_msi_map_get_device_domain() is PCI specific but it need not be and can be easily changed to be bus agnostic in order to be used by other busses by adding an IRQ domain bus token as an input parameter. 
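A hedged usage sketch of the new parameter: PCI keeps its existing behaviour by passing DOMAIN_BUS_PCI_MSI (exactly as in the pci/msi.c hunk below), while another bus type can now reuse the same msi-map walk with its own token. The second caller is a hypothetical example and not part of this patch.

	/* PCI: unchanged behaviour, the bus token is just passed explicitly */
	dom = of_msi_map_get_device_domain(&pdev->dev, rid, DOMAIN_BUS_PCI_MSI);

	/* Hypothetical non-PCI caller: same msi-map lookup, different token
	 * (whatever DOMAIN_BUS_* value its MSI irqdomain was registered with) */
	dom = of_msi_map_get_device_domain(dev, device_id, DOMAIN_BUS_PLATFORM_MSI);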
Signed-off-by: Diana Craciun Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Bjorn Helgaas # pci/msi.c Cc: Bjorn Helgaas Cc: Rob Herring Cc: Marc Zyngier Link: https://lore.kernel.org/r/20200619082013.13661-10-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/of/irq.c | 8 +++++--- drivers/pci/msi.c | 2 +- include/linux/of_irq.h | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/of/irq.c b/drivers/of/irq.c index a296eaf52a5b..48a40326984f 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -613,18 +613,20 @@ u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in) * of_msi_map_get_device_domain - Use msi-map to find the relevant MSI domain * @dev: device for which the mapping is to be done. * @rid: Requester ID for the device. + * @bus_token: Bus token * * Walk up the device hierarchy looking for devices with a "msi-map" * property. * * Returns: the MSI domain for this device (or NULL on failure) */ -struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 rid) +struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 id, + u32 bus_token) { struct device_node *np = NULL; - __of_msi_map_rid(dev, &np, rid); - return irq_find_matching_host(np, DOMAIN_BUS_PCI_MSI); + __of_msi_map_rid(dev, &np, id); + return irq_find_matching_host(np, bus_token); } /** diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index c094a7e8c2bc..4fa71f1dcc0e 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1568,7 +1568,7 @@ struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) u32 rid = pci_dev_id(pdev); pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid); - dom = of_msi_map_get_device_domain(&pdev->dev, rid); + dom = of_msi_map_get_device_domain(&pdev->dev, rid, DOMAIN_BUS_PCI_MSI); if (!dom) dom = iort_get_device_domain(&pdev->dev, rid, DOMAIN_BUS_PCI_MSI); diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 1214cabb2247..7142a3722758 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -52,7 +52,8 @@ extern struct irq_domain *of_msi_get_domain(struct device *dev, struct device_node *np, enum irq_domain_bus_token token); extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev, - u32 rid); + u32 id, + u32 bus_token); extern void of_msi_configure(struct device *dev, struct device_node *np); u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in); #else @@ -85,7 +86,7 @@ static inline struct irq_domain *of_msi_get_domain(struct device *dev, return NULL; } static inline struct irq_domain *of_msi_map_get_device_domain(struct device *dev, - u32 rid) + u32 id, u32 bus_token) { return NULL; } -- Gitee From 671cd6bf4a78c12ef0c87f768abd43129d280310 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 3 Oct 2019 15:32:09 +0300 Subject: [PATCH 1029/2161] software node: Make argument to to_software_node const commit 56c9aa07942434490890ac35bba99026e66cb949 upstream. to_software_node() does not need to modify the fwnode_handle it operates on; therefore make it const. This allows passing a const fwnode_handle to to_software_node(). Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Reviewed-by: Heikki Krogerus Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/base/swnode.c | 4 ++-- include/linux/property.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index ac051019e50f..0b890f75e59b 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -71,9 +71,9 @@ software_node_to_swnode(const struct software_node *node) return swnode; } -const struct software_node *to_software_node(struct fwnode_handle *fwnode) +const struct software_node *to_software_node(const struct fwnode_handle *fwnode) { - struct swnode *swnode = to_swnode(fwnode); + const struct swnode *swnode = to_swnode(fwnode); return swnode ? swnode->node : NULL; } diff --git a/include/linux/property.h b/include/linux/property.h index 1e4dc477c695..aa9651c7743f 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -420,7 +420,8 @@ struct software_node { }; bool is_software_node(const struct fwnode_handle *fwnode); -const struct software_node *to_software_node(struct fwnode_handle *fwnode); +const struct software_node * +to_software_node(const struct fwnode_handle *fwnode); struct fwnode_handle *software_node_fwnode(const struct software_node *node); const struct software_node * -- Gitee From e11eae2c7faeeaff72f13fede67936e13df6757a Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 3 Oct 2019 15:32:10 +0300 Subject: [PATCH 1030/2161] device property: Move fwnode_get_parent() up commit a57b7fb783eb352d91b0bf8391682b30bacae667 upstream. Move fwnode_get_parent() above fwnode_get_next_parent(), making the order the same as in the header file. Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/base/property.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/base/property.c b/drivers/base/property.c index 98c6c4ea665c..1a5c1ba0cd7c 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -556,6 +556,19 @@ int device_add_properties(struct device *dev, } EXPORT_SYMBOL_GPL(device_add_properties); +/** + * fwnode_get_parent - Return parent firwmare node + * @fwnode: Firmware whose parent is retrieved + * + * Return parent firmware node of the given node if possible or %NULL if no + * parent was available. + */ +struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode) +{ + return fwnode_call_ptr_op(fwnode, get_parent); +} +EXPORT_SYMBOL_GPL(fwnode_get_parent); + /** * fwnode_get_next_parent - Iterate to the node's parent * @fwnode: Firmware whose parent is retrieved @@ -600,19 +613,6 @@ const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode) return fwnode_call_ptr_op(fwnode, get_name_prefix); } -/** - * fwnode_get_parent - Return parent firwmare node - * @fwnode: Firmware whose parent is retrieved - * - * Return parent firmware node of the given node if possible or %NULL if no - * parent was available. - */ -struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode) -{ - return fwnode_call_ptr_op(fwnode, get_parent); -} -EXPORT_SYMBOL_GPL(fwnode_get_parent); - /** * fwnode_get_next_child_node - Return the next child node handle for a node * @fwnode: Firmware node to find the next child node for. 
-- Gitee From 490edeed9f804e098208962e4e06f2de7c3defe5 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 3 Oct 2019 15:32:11 +0300 Subject: [PATCH 1031/2161] device property: Add functions for accessing node's parents commit 87e5e95db31a27d117fbb4a5d464f44adb4c2ee2 upstream. Add two convenience functions for accessing node's parents: fwnode_count_parents() returns the number of parent nodes a given node has. fwnode_get_nth_parent() returns node's parent at a given distance from the node itself. Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/base/property.c | 46 ++++++++++++++++++++++++++++++++++++++++ include/linux/property.h | 3 +++ 2 files changed, 49 insertions(+) diff --git a/drivers/base/property.c b/drivers/base/property.c index 1a5c1ba0cd7c..63903cf28197 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -979,6 +979,52 @@ fwnode_graph_get_remote_port(const struct fwnode_handle *fwnode) } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port); +/** + * fwnode_count_parents - Return the number of parents a node has + * @fwnode: The node the parents of which are to be counted + * + * Returns the number of parents a node has. + */ +unsigned int fwnode_count_parents(const struct fwnode_handle *fwnode) +{ + struct fwnode_handle *__fwnode; + unsigned int count; + + __fwnode = fwnode_get_parent(fwnode); + + for (count = 0; __fwnode; count++) + __fwnode = fwnode_get_next_parent(__fwnode); + + return count; +} +EXPORT_SYMBOL_GPL(fwnode_count_parents); + +/** + * fwnode_get_nth_parent - Return an nth parent of a node + * @fwnode: The node the parent of which is requested + * @depth: Distance of the parent from the node + * + * Returns the nth parent of a node. If there is no parent at the requested + * @depth, %NULL is returned. If @depth is 0, the functionality is equivalent to + * fwnode_handle_get(). For @depth == 1, it is fwnode_get_parent() and so on. + * + * The caller is responsible for calling fwnode_handle_put() for the returned + * node. 
+ */ +struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwnode, + unsigned int depth) +{ + unsigned int i; + + fwnode_handle_get(fwnode); + + for (i = 0; i < depth && fwnode; i++) + fwnode = fwnode_get_next_parent(fwnode); + + return fwnode; +} +EXPORT_SYMBOL_GPL(fwnode_get_nth_parent); + /** * fwnode_graph_get_remote_endpoint - Return fwnode of a remote endpoint * @fwnode: Endpoint firmware node pointing to the remote endpoint diff --git a/include/linux/property.h b/include/linux/property.h index aa9651c7743f..054661109661 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -85,6 +85,9 @@ const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_get_next_parent( struct fwnode_handle *fwnode); +unsigned int fwnode_count_parents(const struct fwnode_handle *fwn); +struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwn, + unsigned int depth); struct fwnode_handle *fwnode_get_next_child_node( const struct fwnode_handle *fwnode, struct fwnode_handle *child); struct fwnode_handle *fwnode_get_next_available_child_node( -- Gitee From d71f0042a6b32a6a7a86ce08f454c0ef56e3e89c Mon Sep 17 00:00:00 2001 From: Tom Murphy Date: Sun, 8 Sep 2019 09:56:37 -0700 Subject: [PATCH 1032/2161] iommu/amd: Remove unnecessary locking from AMD iommu driver commit 37ec8eb851c1876580a963f283fe7496592b9f72 upstream. With or without locking it doesn't make sense for two writers to be writing to the same IOVA range at the same time. Even with locking we still have a race condition, whoever gets the lock first, so we still can't be sure what the result will be. With locking the result will be more sane, it will be correct for the last writer, but still useless because we can't be sure which writer will get the lock last. It's a fundamentally broken design to have two writers writing to the same IOVA range at the same time. So we can remove the locking and work on the assumption that no two writers will be writing to the same IOVA range at the same time. The only exception is when we have to allocate a middle page in the page tables, the middle page can cover more than just the IOVA range a writer has been allocated. However this isn't an issue in the AMD driver because it can atomically allocate middle pages using "cmpxchg64()". 
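The lock-free middle-page allocation referred to above follows the usual compare-and-swap publish pattern; the sketch below paraphrases it with placeholder names (slot, pte_of(), new_table) instead of the AMD driver's real helpers.

	/* Build the new table page first, then publish it only if the slot is
	 * still empty.  Exactly one of any racing writers wins cmpxchg64();
	 * the losers free their page and reuse the winner's table. */
	u64 *new_table = (u64 *)get_zeroed_page(gfp);
	u64 expected = 0ULL;

	if (!new_table)
		return NULL;

	if (cmpxchg64(slot, expected, pte_of(new_table)) != expected)
		free_page((unsigned long)new_table);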
Signed-off-by: Tom Murphy Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 10 +--------- drivers/iommu/amd_iommu_types.h | 1 - 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index ba22f4fbeae6..bd8a93efadf9 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2358,7 +2358,6 @@ static void protection_domain_free(struct protection_domain *domain) static int protection_domain_init(struct protection_domain *domain) { spin_lock_init(&domain->lock); - mutex_init(&domain->api_lock); domain->id = domain_id_alloc(); if (!domain->id) return -ENOMEM; @@ -2543,9 +2542,7 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, if (iommu_prot & IOMMU_WRITE) prot |= IOMMU_PROT_IW; - mutex_lock(&domain->api_lock); ret = iommu_map_page(domain, iova, paddr, page_size, prot, GFP_KERNEL); - mutex_unlock(&domain->api_lock); domain_flush_np_cache(domain, iova, page_size); @@ -2557,16 +2554,11 @@ static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, struct iommu_iotlb_gather *gather) { struct protection_domain *domain = to_pdomain(dom); - size_t unmap_size; if (domain->mode == PAGE_MODE_NONE) return 0; - mutex_lock(&domain->api_lock); - unmap_size = iommu_unmap_page(domain, iova, page_size); - mutex_unlock(&domain->api_lock); - - return unmap_size; + return iommu_unmap_page(domain, iova, page_size); } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 3b72515e6e20..34847774f1bb 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -472,7 +472,6 @@ struct protection_domain { struct iommu_domain domain; /* generic domain handle used by iommu core code */ spinlock_t lock; /* mostly used to lock the page table*/ - struct mutex api_lock; /* protect page tables in the iommu-api path */ u16 id; /* the domain id written to the device table */ int mode; /* paging mode (0-6 levels) */ u64 *pt_root; /* page table root pointer */ -- Gitee From 72d40d2eb9f756ab1b513fc45a53e2086469d777 Mon Sep 17 00:00:00 2001 From: Tom Murphy Date: Sun, 8 Sep 2019 09:56:39 -0700 Subject: [PATCH 1033/2161] iommu/dma-iommu: Handle deferred devices commit 795bbbb9b6f80306be3d45c79527324036a68509 upstream. 
Handle devices which defer their attach to the iommu in the dma-iommu api Signed-off-by: Tom Murphy Reviewed-by: Robin Murphy Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 02f18221f412..10f125fafe8c 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -23,6 +23,7 @@ #include #include #include +#include struct iommu_dma_msi_page { struct list_head list; @@ -366,6 +367,21 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, return iova_reserve_iommu_regions(dev, domain); } +static int iommu_dma_deferred_attach(struct device *dev, + struct iommu_domain *domain) +{ + const struct iommu_ops *ops = domain->ops; + + if (!is_kdump_kernel()) + return 0; + + if (unlikely(ops->is_attach_deferred && + ops->is_attach_deferred(domain, dev))) + return iommu_attach_device(domain, dev); + + return 0; +} + /** * dma_info_to_prot - Translate DMA API directions and attributes to IOMMU API * page flags. @@ -482,6 +498,9 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, size_t iova_off = iova_offset(iovad, phys); dma_addr_t iova; + if (unlikely(iommu_dma_deferred_attach(dev, domain))) + return DMA_MAPPING_ERROR; + size = iova_align(iovad, size + iova_off); iova = iommu_dma_alloc_iova(domain, size, dma_get_mask(dev), dev); @@ -588,6 +607,9 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size, *dma_handle = DMA_MAPPING_ERROR; + if (unlikely(iommu_dma_deferred_attach(dev, domain))) + return NULL; + min_size = alloc_sizes & -alloc_sizes; if (min_size < PAGE_SIZE) { min_size = PAGE_SIZE; @@ -720,7 +742,7 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, int prot = dma_info_to_prot(dir, coherent, attrs); dma_addr_t dma_handle; - dma_handle =__iommu_dma_map(dev, phys, size, prot); + dma_handle = __iommu_dma_map(dev, phys, size, prot); if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && dma_handle != DMA_MAPPING_ERROR) arch_sync_dma_for_device(phys, size, dir); @@ -830,6 +852,9 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, unsigned long mask = dma_get_seg_boundary(dev); int i; + if (unlikely(iommu_dma_deferred_attach(dev, domain))) + return 0; + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) iommu_dma_sync_sg_for_device(dev, sg, nents, dir); -- Gitee From 47abe4dcc0215f7aeda67d7e1f54c66f512cba05 Mon Sep 17 00:00:00 2001 From: Tom Murphy Date: Sun, 8 Sep 2019 09:56:40 -0700 Subject: [PATCH 1034/2161] iommu/dma-iommu: Use the dev->coherent_dma_mask commit 6e2350207f40e24884da262976f7fd4fba387e8a upstream. Use the dev->coherent_dma_mask when allocating in the dma-iommu ops api. 
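The distinction being restored is easiest to see from the two call sites in the hunks below, reproduced here side by side: streaming mappings are sized against the device's streaming mask, while dma_alloc_coherent() buffers must honour the (possibly narrower) coherent mask.

	/* streaming DMA: bounded by the streaming mask */
	dma_handle = __iommu_dma_map(dev, phys, size, prot, dma_get_mask(dev));

	/* coherent allocations: bounded by the coherent mask instead */
	*handle = __iommu_dma_map(dev, page_to_phys(page), size, ioprot,
				  dev->coherent_dma_mask);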
Signed-off-by: Tom Murphy Reviewed-by: Robin Murphy Reviewed-by: Christoph Hellwig Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 10f125fafe8c..9389aeb7660d 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -490,7 +490,7 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, } static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, - size_t size, int prot) + size_t size, int prot, dma_addr_t dma_mask) { struct iommu_domain *domain = iommu_get_dma_domain(dev); struct iommu_dma_cookie *cookie = domain->iova_cookie; @@ -503,7 +503,7 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, size = iova_align(iovad, size + iova_off); - iova = iommu_dma_alloc_iova(domain, size, dma_get_mask(dev), dev); + iova = iommu_dma_alloc_iova(domain, size, dma_mask, dev); if (!iova) return DMA_MAPPING_ERROR; @@ -742,7 +742,7 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, int prot = dma_info_to_prot(dir, coherent, attrs); dma_addr_t dma_handle; - dma_handle = __iommu_dma_map(dev, phys, size, prot); + dma_handle = __iommu_dma_map(dev, phys, size, prot, dma_get_mask(dev)); if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && dma_handle != DMA_MAPPING_ERROR) arch_sync_dma_for_device(phys, size, dir); @@ -945,7 +945,8 @@ static dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys, size_t size, enum dma_data_direction dir, unsigned long attrs) { return __iommu_dma_map(dev, phys, size, - dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO); + dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO, + dma_get_mask(dev)); } static void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle, @@ -1052,7 +1053,8 @@ static void *iommu_dma_alloc(struct device *dev, size_t size, if (!cpu_addr) return NULL; - *handle = __iommu_dma_map(dev, page_to_phys(page), size, ioprot); + *handle = __iommu_dma_map(dev, page_to_phys(page), size, ioprot, + dev->coherent_dma_mask); if (*handle == DMA_MAPPING_ERROR) { __iommu_dma_free(dev, size, cpu_addr); return NULL; -- Gitee From c0528f2bbd49c54a652a20f1fedec21db57a3759 Mon Sep 17 00:00:00 2001 From: "Suthikulpanit, Suravee" Date: Mon, 14 Oct 2019 20:06:19 +0000 Subject: [PATCH 1035/2161] iommu/amd: Simpify decoding logic for INVALID_PPR_REQUEST event commit 470eb3b31134ada545dcb41e94a0c97b42c95803 upstream. Reuse existing macro to simplify the code and improve readability. 
Cc: Joerg Roedel Cc: Gary R Hook Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index bd8a93efadf9..9f6a33c0557f 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -577,8 +577,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) pasid, address, flags); break; case EVENT_TYPE_INV_PPR_REQ: - pasid = ((event[0] >> 16) & 0xFFFF) - | ((event[1] << 6) & 0xF0000); + pasid = PPR_PASID(*((u64 *)__evt)); tag = event[1] & 0x03FF; dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x tag=0x%03x]\n", PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), -- Gitee From cceb7c1beb61760ab7de23a9219288fae217705e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 18 Oct 2019 11:00:33 +0200 Subject: [PATCH 1036/2161] iommu/amd: Pass gfp flags to iommu_map_page() in amd_iommu_map() commit 3057fb9377eb5e73386dd0d8804bf72bdd23e391 upstream. A recent commit added a gfp parameter to amd_iommu_map() to make it callable from atomic context, but forgot to pass it down to iommu_map_page() and left GFP_KERNEL there. This caused sleep-while-atomic warnings and needs to be fixed. Reported-by: Qian Cai Reported-by: Dan Carpenter Fixes: 781ca2de89ba ("iommu: Add gfp parameter to iommu_ops::map") Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 9f6a33c0557f..0ab1e87ff554 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2541,7 +2541,7 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, if (iommu_prot & IOMMU_WRITE) prot |= IOMMU_PROT_IW; - ret = iommu_map_page(domain, iova, paddr, page_size, prot, GFP_KERNEL); + ret = iommu_map_page(domain, iova, paddr, page_size, prot, gfp); domain_flush_np_cache(domain, iova, page_size); -- Gitee From db9167e2f96f62c923e77ff6563c07e9f64b6b43 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 24 Oct 2019 14:54:10 +0200 Subject: [PATCH 1037/2161] iommu/amd: Do not re-fetch iommu->cmd_buf_tail commit a5bbbf37c6f8522a1afd46c37b5a0d1ce63232b7 upstream. 
The compiler is not smart enough to realize that iommu->cmd_buf_tail can't be modified across memcpy: 41 8b 45 74 mov 0x74(%r13),%eax # iommu->cmd_buf_tail 44 8d 78 10 lea 0x10(%rax),%r15d # += sizeof(*cmd) 41 81 e7 ff 1f 00 00 and $0x1fff,%r15d # %= CMD_BUFFER_SIZE 49 03 45 68 add 0x68(%r13),%rax # target = iommu->cmd_buf + iommu->cmd_buf_tail 45 89 7d 74 mov %r15d,0x74(%r13) # store to iommu->cmd_buf_tail 49 8b 34 24 mov (%r12),%rsi # memcpy 49 8b 7c 24 08 mov 0x8(%r12),%rdi # memcpy 48 89 30 mov %rsi,(%rax) # memcpy 48 89 78 08 mov %rdi,0x8(%rax) # memcpy 49 8b 55 38 mov 0x38(%r13),%rdx # iommu->mmio_base 41 8b 45 74 mov 0x74(%r13),%eax # redundant load of iommu->cmd_buf_tail ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 89 82 08 20 00 00 mov %eax,0x2008(%rdx) # writel CC: Tom Lendacky CC: Joerg Roedel CC: linux-kernel@vger.kernel.org Signed-off-by: Denys Vlasenko Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 0ab1e87ff554..90c706fa911b 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -826,17 +826,18 @@ static void copy_cmd_to_buffer(struct amd_iommu *iommu, struct iommu_cmd *cmd) { u8 *target; - - target = iommu->cmd_buf + iommu->cmd_buf_tail; - - iommu->cmd_buf_tail += sizeof(*cmd); - iommu->cmd_buf_tail %= CMD_BUFFER_SIZE; + u32 tail; /* Copy command to buffer */ + tail = iommu->cmd_buf_tail; + target = iommu->cmd_buf + tail; memcpy(target, cmd, sizeof(*cmd)); + tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE; + iommu->cmd_buf_tail = tail; + /* Tell the IOMMU about it */ - writel(iommu->cmd_buf_tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); } static void build_completion_wait(struct iommu_cmd *cmd, u64 address) -- Gitee From 911b62d7bd66c4e3584f12c71521b56bf521bbfc Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 6 Apr 2022 15:44:48 +0800 Subject: [PATCH 1038/2161] PCI/AER: Fix implicit declaration of function 'pci_cleanup_aer_uncorrect_error_status' commit opencloudos. 'pci_cleanup_aer_uncorrect_error_status' was renamed to 'pci_aer_clear_nonfatal_status', in commit '894020fdd88c1e9a74c60b67c0f19f1c7696ba2f'. 
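This rename is exactly the sort of thing the kcompat layers in this tree exist to smooth over; a hedged sketch of such a shim for code that must build against both APIs (the v5.7 cutover below is an assumption for illustration, not taken from this patch):

    #include <linux/version.h>
    #include <linux/aer.h>

    /* Assumed cutover at v5.7, where the helper was renamed upstream;
     * verify against the target tree before relying on it. */
    #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0)
    #define pci_aer_clear_nonfatal_status(pdev) \
            pci_cleanup_aer_uncorrect_error_status(pdev)
    #endif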
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/net/ethernet/intel/i40e/i40e_main.c | 4 ++-- drivers/net/ethernet/intel/i40e/kcompat.h | 2 +- drivers/net/ethernet/intel/iavf/kcompat.h | 2 +- drivers/scsi/mpt3sas/mpt3sas_scsih.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) mode change 100755 => 100644 drivers/scsi/mpt3sas/mpt3sas_scsih.c diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index b67de06e5f1b..c8a4038fc254 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -16224,10 +16224,10 @@ static pci_ers_result_t i40e_pci_error_slot_reset(struct pci_dev *pdev) result = PCI_ERS_RESULT_DISCONNECT; } - err = pci_cleanup_aer_uncorrect_error_status(pdev); + err = pci_aer_clear_nonfatal_status(pdev); if (err) { dev_info(&pdev->dev, - "pci_cleanup_aer_uncorrect_error_status failed 0x%0x\n", + "pci_aer_clear_nonfatal_status failed 0x%0x\n", err); /* non-fatal, continue */ } diff --git a/drivers/net/ethernet/intel/i40e/kcompat.h b/drivers/net/ethernet/intel/i40e/kcompat.h index c3ee144db4e7..0be8e017d1b8 100644 --- a/drivers/net/ethernet/intel/i40e/kcompat.h +++ b/drivers/net/ethernet/intel/i40e/kcompat.h @@ -2309,7 +2309,7 @@ static inline int pci_enable_pcie_error_reporting(struct pci_dev __always_unused return 0; } #define pci_disable_pcie_error_reporting(dev) do {} while (0) -#define pci_cleanup_aer_uncorrect_error_status(dev) do {} while (0) +#define pci_aer_clear_nonfatal_status(dev) do {} while (0) void *_kc_kmemdup(const void *src, size_t len, unsigned gfp); #define kmemdup(src, len, gfp) _kc_kmemdup(src, len, gfp) diff --git a/drivers/net/ethernet/intel/iavf/kcompat.h b/drivers/net/ethernet/intel/iavf/kcompat.h index b37a3f3ac77a..8b988e9617cb 100644 --- a/drivers/net/ethernet/intel/iavf/kcompat.h +++ b/drivers/net/ethernet/intel/iavf/kcompat.h @@ -2294,7 +2294,7 @@ static inline int pci_enable_pcie_error_reporting(struct pci_dev __always_unused return 0; } #define pci_disable_pcie_error_reporting(dev) do {} while (0) -#define pci_cleanup_aer_uncorrect_error_status(dev) do {} while (0) +#define pci_aer_clear_nonfatal_status(dev) do {} while (0) void *_kc_kmemdup(const void *src, size_t len, unsigned gfp); #define kmemdup(src, len, gfp) _kc_kmemdup(src, len, gfp) diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c old mode 100755 new mode 100644 index d38617f7f6f7..4bada4334c9f --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -16187,7 +16187,7 @@ scsih_pci_resume(struct pci_dev *pdev) && (CONFIG_SUSE_PATCHLEVEL >= 2)))) pci_aer_clear_nonfatal_status(pdev); #elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 19) - pci_cleanup_aer_uncorrect_error_status(pdev); + pci_aer_clear_nonfatal_status(pdev); #endif mpt3sas_base_start_watchdog(ioc); mpt3sas_base_start_hba_unplug_watchdog(ioc); -- Gitee From 98e90b5ff3a60ff6aade3a87f2976a4e31fdde67 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 9 Oct 2019 14:54:01 -0500 Subject: [PATCH 1039/2161] PCI/ATS: Move pci_prg_resp_pasid_required() to CONFIG_PCI_PRI commit 8cbb8a9374a271099bacdc890fb16d374261332b upstream. pci_prg_resp_pasid_required() returns the value of the "PRG Response PASID Required" bit from the PRI capability, but the interface was previously defined under #ifdef CONFIG_PCI_PASID. Move it from CONFIG_PCI_PASID to CONFIG_PCI_PRI so it's with the other PRI-related things. 
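For orientation, a minimal, hypothetical caller showing where this helper is typically consulted once PRI and PASID are brought up (the function and flow below are illustrative, not part of this patch):

    #include <linux/pci.h>
    #include <linux/pci-ats.h>

    /* Hypothetical setup path tying the PRI/PASID interfaces together. */
    static int setup_pri_pasid(struct pci_dev *pdev, u32 reqs, int features)
    {
            int ret;

            ret = pci_enable_pri(pdev, reqs);
            if (ret)
                    return ret;

            ret = pci_enable_pasid(pdev, features);
            if (ret) {
                    pci_disable_pri(pdev);
                    return ret;
            }

            if (pci_prg_resp_pasid_required(pdev))
                    pci_info(pdev, "PRG responses must carry a PASID\n");

            return 0;
    }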
Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/ats.c | 55 +++++++++++++++++++---------------------- include/linux/pci-ats.h | 11 ++++----- 2 files changed, 30 insertions(+), 36 deletions(-) diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 40ad10716b2a..bf8be40811f6 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -298,6 +298,31 @@ int pci_reset_pri(struct pci_dev *pdev) } EXPORT_SYMBOL_GPL(pci_reset_pri); +/** + * pci_prg_resp_pasid_required - Return PRG Response PASID Required bit + * status. + * @pdev: PCI device structure + * + * Returns 1 if PASID is required in PRG Response Message, 0 otherwise. + */ +int pci_prg_resp_pasid_required(struct pci_dev *pdev) +{ + u16 status; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); + if (!pos) + return 0; + + pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status); + + if (status & PCI_PRI_STATUS_PASID) + return 1; + + return 0; +} +EXPORT_SYMBOL_GPL(pci_prg_resp_pasid_required); + /** * pci_pri_supported - Check if PRI is supported. * @pdev: PCI device structure @@ -427,36 +452,6 @@ int pci_pasid_features(struct pci_dev *pdev) } EXPORT_SYMBOL_GPL(pci_pasid_features); -/** - * pci_prg_resp_pasid_required - Return PRG Response PASID Required bit - * status. - * @pdev: PCI device structure - * - * Returns 1 if PASID is required in PRG Response Message, 0 otherwise. - * - * Even though the PRG response PASID status is read from PRI Status - * Register, since this API will mainly be used by PASID users, this - * function is defined within #ifdef CONFIG_PCI_PASID instead of - * CONFIG_PCI_PRI. - */ -int pci_prg_resp_pasid_required(struct pci_dev *pdev) -{ - u16 status; - int pri; - - pri = pdev->pri_cap; - if (!pri) - return 0; - - pci_read_config_word(pdev, pri + PCI_PRI_STATUS, &status); - - if (status & PCI_PRI_STATUS_PASID) - return 1; - - return 0; -} -EXPORT_SYMBOL_GPL(pci_prg_resp_pasid_required); - #define PASID_NUMBER_SHIFT 8 #define PASID_NUMBER_MASK (0x1f << PASID_NUMBER_SHIFT) /** diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h index 8be85e1372eb..24e2a5c3741c 100644 --- a/include/linux/pci-ats.h +++ b/include/linux/pci-ats.h @@ -10,6 +10,7 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs); void pci_disable_pri(struct pci_dev *pdev); void pci_restore_pri_state(struct pci_dev *pdev); int pci_reset_pri(struct pci_dev *pdev); +int pci_prg_resp_pasid_required(struct pci_dev *pdev); bool pci_pri_supported(struct pci_dev *pdev); #else /* CONFIG_PCI_PRI */ static inline bool pci_pri_supported(struct pci_dev *pdev) @@ -32,6 +33,10 @@ static inline int pci_reset_pri(struct pci_dev *pdev) return -ENODEV; } +static inline int pci_prg_resp_pasid_required(struct pci_dev *pdev) +{ + return 0; +} #endif /* CONFIG_PCI_PRI */ #ifdef CONFIG_PCI_PASID @@ -41,7 +46,6 @@ void pci_disable_pasid(struct pci_dev *pdev); void pci_restore_pasid_state(struct pci_dev *pdev); int pci_pasid_features(struct pci_dev *pdev); int pci_max_pasids(struct pci_dev *pdev); -int pci_prg_resp_pasid_required(struct pci_dev *pdev); #else /* CONFIG_PCI_PASID */ @@ -67,11 +71,6 @@ static inline int pci_max_pasids(struct pci_dev *pdev) { return -EINVAL; } - -static inline int pci_prg_resp_pasid_required(struct pci_dev *pdev) -{ - return 0; -} #endif /* CONFIG_PCI_PASID */ -- Gitee From dc5ea6775d925afe5cd026e8afa614259b97beea Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 5 Sep 2019 14:31:42 
-0500 Subject: [PATCH 1040/2161] PCI/ATS: Handle sharing of PF PRI Capability with all VFs commit 9bf49e36d7183a170a9906d19acc5254818fc574 upstream. Per PCIe r5.0, sec 9.3.7.11, VFs must not implement the PRI Capability. If the PF implements PRI, it is shared by the VFs. Since VFs don't have a PRI Capability, pci_enable_pri() always failed, which caused IOMMU setup to fail. Update the PRI interfaces so for VFs they reflect the state of the PF PRI. [bhelgaas: rebase without pri_cap caching, commit log] Suggested-by: Ashok Raj Link: https://lore.kernel.org/r/b971e31f8695980da8e4a7f93e3b6a3edba3edaa.1567029860.git.sathyanarayanan.kuppuswamy@linux.intel.com Link: https://lore.kernel.org/r/20190905193146.90250-2-helgaas@kernel.org Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Cc: Ashok Raj Cc: Keith Busch Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/ats.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index bf8be40811f6..c210b4a0c652 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -203,6 +203,17 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs) u32 max_requests; int pri = pdev->pri_cap; + /* + * VFs must not implement the PRI Capability. If their PF + * implements PRI, it is shared by the VFs, so if the PF PRI is + * enabled, it is also enabled for the VF. + */ + if (pdev->is_virtfn) { + if (pci_physfn(pdev)->pri_enabled) + return 0; + return -EINVAL; + } + if (WARN_ON(pdev->pri_enabled)) return -EBUSY; @@ -238,6 +249,10 @@ void pci_disable_pri(struct pci_dev *pdev) u16 control; int pri = pdev->pri_cap; + /* VFs share the PF PRI */ + if (pdev->is_virtfn) + return; + if (WARN_ON(!pdev->pri_enabled)) return; @@ -262,6 +277,9 @@ void pci_restore_pri_state(struct pci_dev *pdev) u32 reqs = pdev->pri_reqs_alloc; int pri = pdev->pri_cap; + if (pdev->is_virtfn) + return; + if (!pdev->pri_enabled) return; @@ -285,6 +303,9 @@ int pci_reset_pri(struct pci_dev *pdev) u16 control; int pri = pdev->pri_cap; + if (pdev->is_virtfn) + return 0; + if (WARN_ON(pdev->pri_enabled)) return -EBUSY; @@ -310,6 +331,9 @@ int pci_prg_resp_pasid_required(struct pci_dev *pdev) u16 status; int pos; + if (pdev->is_virtfn) + pdev = pci_physfn(pdev); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); if (!pos) return 0; -- Gitee From effc7695776feab871c09b701dc89f1c664e775c Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 5 Sep 2019 14:31:43 -0500 Subject: [PATCH 1041/2161] PCI/ATS: Handle sharing of PF PASID Capability with all VFs commit 2b0ae7cc3bfc3fae124c25870f41291c670b4549 upstream. Per PCIe r5.0, sec 9.3.7.14, if a PF implements the PASID Capability, the PF PASID configuration is shared by its VFs. VFs must not implement their own PASID Capability. Since VFs don't have a PASID Capability, pci_enable_pasid() always failed, which caused IOMMU setup to fail. Update the PASID interfaces so for VFs they reflect the state of the PF PASID. 
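The VF-to-PF redirection used for the shared PRI and PASID state in this and the previous patch reduces to a small pattern; an illustrative sketch (the helper names are made up for this note):

    /* Illustrative: queries on a VF are answered from the owning PF. */
    static struct pci_dev *pasid_owner(struct pci_dev *pdev)
    {
            return pdev->is_virtfn ? pci_physfn(pdev) : pdev;
    }

    static int read_shared_pasid_cap(struct pci_dev *pdev, u16 *val)
    {
            struct pci_dev *owner = pasid_owner(pdev);
            int pos = pci_find_ext_capability(owner, PCI_EXT_CAP_ID_PASID);

            if (!pos)
                    return -EINVAL;

            return pci_read_config_word(owner, pos + PCI_PASID_CAP, val);
    }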
[bhelgaas: rebase without pasid_cap caching, commit log] Suggested-by: Ashok Raj Link: https://lore.kernel.org/r/8ba1ac192e4ac737508b6ac15002158e176bab91.1567029860.git.sathyanarayanan.kuppuswamy@linux.intel.com Link: https://lore.kernel.org/r/20190905193146.90250-3-helgaas@kernel.org Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Cc: Ashok Raj Cc: Keith Busch Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/ats.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index c210b4a0c652..78fb31a80775 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -378,6 +378,16 @@ int pci_enable_pasid(struct pci_dev *pdev, int features) u16 control, supported; int pos; + /* + * VFs must not implement the PASID Capability, but if a PF + * supports PASID, its VFs share the PF PASID configuration. + */ + if (pdev->is_virtfn) { + if (pci_physfn(pdev)->pasid_enabled) + return 0; + return -EINVAL; + } + if (WARN_ON(pdev->pasid_enabled)) return -EBUSY; @@ -415,6 +425,10 @@ void pci_disable_pasid(struct pci_dev *pdev) u16 control = 0; int pos; + /* VFs share the PF PASID configuration */ + if (pdev->is_virtfn) + return; + if (WARN_ON(!pdev->pasid_enabled)) return; @@ -437,6 +451,9 @@ void pci_restore_pasid_state(struct pci_dev *pdev) u16 control; int pos; + if (pdev->is_virtfn) + return; + if (!pdev->pasid_enabled) return; @@ -464,6 +481,9 @@ int pci_pasid_features(struct pci_dev *pdev) u16 supported; int pos; + if (pdev->is_virtfn) + pdev = pci_physfn(pdev); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); if (!pos) return -EINVAL; @@ -490,6 +510,9 @@ int pci_max_pasids(struct pci_dev *pdev) u16 supported; int pos; + if (pdev->is_virtfn) + pdev = pci_physfn(pdev); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); if (!pos) return -EINVAL; -- Gitee From 0cc57b6ed0badecdfc4a7afd5cf31b2aa9b40890 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 5 Sep 2019 14:31:44 -0500 Subject: [PATCH 1042/2161] PCI/ATS: Disable PF/VF ATS service independently commit 3ad62192097443e8c3a8e244475bacaecb894d4e upstream. Previously we didn't disable the PF ATS until all associated VFs had disabled it. But per PCIe spec r5.0, sec 9.3.7.8, the ATS Capability in VFs and associated PFs may be enabled independently. Leaving ATS enabled in the PF unnecessarily may have power and performance impacts. Remove this dependency logic in the ATS enable/disable code. 
[bhelgaas: commit log] Suggested-by: Ashok Raj Link: https://lore.kernel.org/r/8163ab8fa66afd2cba514ae95d29ab12104781aa.1567029860.git.sathyanarayanan.kuppuswamy@linux.intel.com Link: https://lore.kernel.org/r/20190905193146.90250-4-helgaas@kernel.org Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Cc: Ashok Raj Cc: Keith Busch Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/ats.c | 11 ----------- include/linux/pci.h | 1 - 2 files changed, 12 deletions(-) diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 78fb31a80775..5b7d566ecc9d 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -76,8 +76,6 @@ int pci_enable_ats(struct pci_dev *dev, int ps) pdev = pci_physfn(dev); if (pdev->ats_stu != ps) return -EINVAL; - - atomic_inc(&pdev->ats_ref_cnt); /* count enabled VFs */ } else { dev->ats_stu = ps; ctrl |= PCI_ATS_CTRL_STU(dev->ats_stu - PCI_ATS_MIN_STU); @@ -95,20 +93,11 @@ EXPORT_SYMBOL_GPL(pci_enable_ats); */ void pci_disable_ats(struct pci_dev *dev) { - struct pci_dev *pdev; u16 ctrl; if (WARN_ON(!dev->ats_enabled)) return; - if (atomic_read(&dev->ats_ref_cnt)) - return; /* VFs still enabled */ - - if (dev->is_virtfn) { - pdev = pci_physfn(dev); - atomic_dec(&pdev->ats_ref_cnt); - } - pci_read_config_word(dev, dev->ats_cap + PCI_ATS_CTRL, &ctrl); ctrl &= ~PCI_ATS_CTRL_ENABLE; pci_write_config_word(dev, dev->ats_cap + PCI_ATS_CTRL, ctrl); diff --git a/include/linux/pci.h b/include/linux/pci.h index f6792203017b..b665cb3aaac2 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -465,7 +465,6 @@ struct pci_dev { }; u16 ats_cap; /* ATS Capability offset */ u8 ats_stu; /* ATS Smallest Translation Unit */ - atomic_t ats_ref_cnt; /* Number of VFs with ATS enabled */ #endif #ifdef CONFIG_PCI_PRI u16 pri_cap; /* PRI Capability offset */ -- Gitee From 4ecfa2d000a698141a8b88a3b7a7049db623a201 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 5 Sep 2019 14:31:46 -0500 Subject: [PATCH 1043/2161] PCI/ATS: Cache PASID Capability offset commit 751035b8dc061ae434c3311bac9cd6d0e5e00f94 upstream. Previously each PASID interface searched for the PASID Capability. Cache the capability offset the first time we use it instead of searching each time. 
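Reduced to its essentials, the idiom is: resolve the extended capability once at enumeration time, then have every runtime path test the cached offset. A sketch that mirrors the change (not the exact hunks):

    /* Enumeration time: walk the extended capability list once. */
    static void cache_pasid_cap(struct pci_dev *pdev)
    {
            pdev->pasid_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID);
    }

    /* Runtime paths then test the cached offset instead of re-walking. */
    static int read_pasid_cap_word(struct pci_dev *pdev, u16 *val)
    {
            if (!pdev->pasid_cap)
                    return -EINVAL;

            return pci_read_config_word(pdev, pdev->pasid_cap + PCI_PASID_CAP, val);
    }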
[bhelgaas: commit log, reorder patch to later, call pci_pasid_init() from pci_init_capabilities()] Link: https://lore.kernel.org/r/4957778959fa34eab3e8b3065d1951989c61cb0f.1567029860.git.sathyanarayanan.kuppuswamy@linux.intel.com Link: https://lore.kernel.org/r/20190905193146.90250-6-helgaas@kernel.org Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/ats.c | 42 +++++++++++++++++++++--------------------- drivers/pci/pci.h | 6 ++++++ drivers/pci/probe.c | 3 +++ include/linux/pci.h | 1 + 4 files changed, 31 insertions(+), 21 deletions(-) diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 5b7d566ecc9d..f549a7715a66 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -353,6 +353,11 @@ EXPORT_SYMBOL_GPL(pci_pri_supported); #endif /* CONFIG_PCI_PRI */ #ifdef CONFIG_PCI_PASID +void pci_pasid_init(struct pci_dev *pdev) +{ + pdev->pasid_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); +} + /** * pci_enable_pasid - Enable the PASID capability * @pdev: PCI device structure @@ -365,7 +370,7 @@ EXPORT_SYMBOL_GPL(pci_pri_supported); int pci_enable_pasid(struct pci_dev *pdev, int features) { u16 control, supported; - int pos; + int pasid = pdev->pasid_cap; /* * VFs must not implement the PASID Capability, but if a PF @@ -383,11 +388,10 @@ int pci_enable_pasid(struct pci_dev *pdev, int features) if (!pdev->eetlp_prefix_path) return -EINVAL; - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); - if (!pos) + if (!pasid) return -EINVAL; - pci_read_config_word(pdev, pos + PCI_PASID_CAP, &supported); + pci_read_config_word(pdev, pasid + PCI_PASID_CAP, &supported); supported &= PCI_PASID_CAP_EXEC | PCI_PASID_CAP_PRIV; /* User wants to enable anything unsupported? 
*/ @@ -397,7 +401,7 @@ int pci_enable_pasid(struct pci_dev *pdev, int features) control = PCI_PASID_CTRL_ENABLE | features; pdev->pasid_features = features; - pci_write_config_word(pdev, pos + PCI_PASID_CTRL, control); + pci_write_config_word(pdev, pasid + PCI_PASID_CTRL, control); pdev->pasid_enabled = 1; @@ -412,7 +416,7 @@ EXPORT_SYMBOL_GPL(pci_enable_pasid); void pci_disable_pasid(struct pci_dev *pdev) { u16 control = 0; - int pos; + int pasid = pdev->pasid_cap; /* VFs share the PF PASID configuration */ if (pdev->is_virtfn) @@ -421,11 +425,10 @@ void pci_disable_pasid(struct pci_dev *pdev) if (WARN_ON(!pdev->pasid_enabled)) return; - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); - if (!pos) + if (!pasid) return; - pci_write_config_word(pdev, pos + PCI_PASID_CTRL, control); + pci_write_config_word(pdev, pasid + PCI_PASID_CTRL, control); pdev->pasid_enabled = 0; } @@ -438,7 +441,7 @@ EXPORT_SYMBOL_GPL(pci_disable_pasid); void pci_restore_pasid_state(struct pci_dev *pdev) { u16 control; - int pos; + int pasid = pdev->pasid_cap; if (pdev->is_virtfn) return; @@ -446,12 +449,11 @@ void pci_restore_pasid_state(struct pci_dev *pdev) if (!pdev->pasid_enabled) return; - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); - if (!pos) + if (!pasid) return; control = PCI_PASID_CTRL_ENABLE | pdev->pasid_features; - pci_write_config_word(pdev, pos + PCI_PASID_CTRL, control); + pci_write_config_word(pdev, pasid + PCI_PASID_CTRL, control); } EXPORT_SYMBOL_GPL(pci_restore_pasid_state); @@ -468,16 +470,15 @@ EXPORT_SYMBOL_GPL(pci_restore_pasid_state); int pci_pasid_features(struct pci_dev *pdev) { u16 supported; - int pos; + int pasid = pdev->pasid_cap; if (pdev->is_virtfn) pdev = pci_physfn(pdev); - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); - if (!pos) + if (!pasid) return -EINVAL; - pci_read_config_word(pdev, pos + PCI_PASID_CAP, &supported); + pci_read_config_word(pdev, pasid + PCI_PASID_CAP, &supported); supported &= PCI_PASID_CAP_EXEC | PCI_PASID_CAP_PRIV; @@ -497,16 +498,15 @@ EXPORT_SYMBOL_GPL(pci_pasid_features); int pci_max_pasids(struct pci_dev *pdev) { u16 supported; - int pos; + int pasid = pdev->pasid_cap; if (pdev->is_virtfn) pdev = pci_physfn(pdev); - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); - if (!pos) + if (!pasid) return -EINVAL; - pci_read_config_word(pdev, pos + PCI_PASID_CAP, &supported); + pci_read_config_word(pdev, pasid + PCI_PASID_CAP, &supported); supported = (supported & PASID_NUMBER_MASK) >> PASID_NUMBER_SHIFT; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 8f73ac94f9bd..b5176c7579e5 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -480,6 +480,12 @@ void pci_pri_init(struct pci_dev *dev); static inline void pci_pri_init(struct pci_dev *dev) { } #endif +#ifdef CONFIG_PCI_PASID +void pci_pasid_init(struct pci_dev *dev); +#else +static inline void pci_pasid_init(struct pci_dev *dev) { } +#endif + #ifdef CONFIG_PCI_IOV int pci_iov_init(struct pci_dev *dev); void pci_iov_release(struct pci_dev *dev); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 431512fbff65..8d33e4556486 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2377,6 +2377,9 @@ static void pci_init_capabilities(struct pci_dev *dev) /* Page Request Interface */ pci_pri_init(dev); + /* Process Address Space ID */ + pci_pasid_init(dev); + /* Enable ACS P2P upstream forwarding */ pci_enable_acs(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index b665cb3aaac2..3bd17bb7f23c 100644 --- a/include/linux/pci.h +++ 
b/include/linux/pci.h @@ -471,6 +471,7 @@ struct pci_dev { u32 pri_reqs_alloc; /* Number of PRI requests allocated */ #endif #ifdef CONFIG_PCI_PASID + u16 pasid_cap; /* PASID Capability offset */ u16 pasid_features; #endif #ifdef CONFIG_PCI_P2PDMA -- Gitee From 31485ab97ba5f60f7c78a6c8ff52346886efe712 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 9 Oct 2019 16:07:51 -0500 Subject: [PATCH 1044/2161] PCI/ATS: Cache PRI PRG Response PASID Required bit commit e5adf79a1d8086aefa56f48eeb08f8fe4e054a3d upstream. The PRG Response PASID Required bit in the PRI Capability is read-only. Read it once when we enumerate the device and cache the value so we don't need to read it again. Based-on-patch-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/ats.c | 22 ++++++++++------------ include/linux/pci.h | 1 + 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index f549a7715a66..b5955577fe82 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -177,7 +177,16 @@ EXPORT_SYMBOL_GPL(pci_ats_page_aligned); #ifdef CONFIG_PCI_PRI void pci_pri_init(struct pci_dev *pdev) { + u16 status; + pdev->pri_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); + + if (!pdev->pri_cap) + return; + + pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_STATUS, &status); + if (status & PCI_PRI_STATUS_PASID) + pdev->pasid_required = 1; } /** @@ -317,22 +326,11 @@ EXPORT_SYMBOL_GPL(pci_reset_pri); */ int pci_prg_resp_pasid_required(struct pci_dev *pdev) { - u16 status; - int pos; if (pdev->is_virtfn) pdev = pci_physfn(pdev); - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); - if (!pos) - return 0; - - pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status); - - if (status & PCI_PRI_STATUS_PASID) - return 1; - - return 0; + return pdev->pasid_required; } EXPORT_SYMBOL_GPL(pci_prg_resp_pasid_required); diff --git a/include/linux/pci.h b/include/linux/pci.h index 3bd17bb7f23c..4a8c82dc0e70 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -469,6 +469,7 @@ struct pci_dev { #ifdef CONFIG_PCI_PRI u16 pri_cap; /* PRI Capability offset */ u32 pri_reqs_alloc; /* Number of PRI requests allocated */ + unsigned int pasid_required:1; /* PRG Response PASID Required */ #endif #ifdef CONFIG_PCI_PASID u16 pasid_cap; /* PASID Capability offset */ -- Gitee From bad6309d3ba38797a7245e52101e0cf85320f3db Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Thu, 11 Aug 2022 16:57:38 +0800 Subject: [PATCH 1045/2161] iommu/vt-d: Fix RID2PASID setup/teardown failure commit 4140d77a022101376bbfa3ec3e3da5063455c60e upstream. Backport Reason: fix SPR device can't boot with pcie aspeed device in sm_on mode. Conflict: conflict with "struct list_head global" The IOMMU driver shares the pasid table for PCI alias devices. When the RID2PASID entry of the shared pasid table has been filled by the first device, the subsequent device will encounter the "DMAR: Setup RID2PASID failed" failure as the pasid entry has already been marked as present. As the result, the IOMMU probing process will be aborted. On the contrary, when any alias device is hot-removed from the system, for example, by writing to /sys/bus/pci/devices/.../remove, the shared RID2PASID will be cleared without any notifications to other devices. As the result, any DMAs from those rest devices are blocked. Sharing pasid table among PCI alias devices could save two memory pages for devices underneath the PCIe-to-PCI bridges. 
Anyway, considering that those devices are rare on modern platforms that support VT-d in scalable mode and the saved memory is negligible, it's reasonable to remove this part of immature code to make the driver feasible and stable. Fixes: ef848b7e5a6a0 ("iommu/vt-d: Setup pasid entry for RID2PASID support") Reported-by: Chenyi Qiang Reported-by: Ethan Zhao Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Ethan Zhao Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220623065720.727849-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20220625133430.2200315-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 24 ------------- drivers/iommu/intel/pasid.c | 69 ++----------------------------------- drivers/iommu/intel/pasid.h | 1 - include/linux/intel-iommu.h | 3 -- 4 files changed, 3 insertions(+), 94 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 5367ab008015..2859acd610b2 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -365,30 +365,6 @@ struct device_domain_info *get_domain_info(struct device *dev) DEFINE_SPINLOCK(device_domain_lock); static LIST_HEAD(device_domain_list); -/* - * Iterate over elements in device_domain_list and call the specified - * callback @fn against each element. - */ -int for_each_device_domain(int (*fn)(struct device_domain_info *info, - void *data), void *data) -{ - int ret = 0; - unsigned long flags; - struct device_domain_info *info; - - spin_lock_irqsave(&device_domain_lock, flags); - list_for_each_entry(info, &device_domain_list, global) { - ret = fn(info, data); - if (ret) { - spin_unlock_irqrestore(&device_domain_lock, flags); - return ret; - } - } - spin_unlock_irqrestore(&device_domain_lock, flags); - - return 0; -} - const struct iommu_ops intel_iommu_ops; static bool translation_pre_enabled(struct intel_iommu *iommu) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 07c390aed1fe..980503957972 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -86,54 +86,6 @@ void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid) /* * Per device pasid table management: */ -static inline void -device_attach_pasid_table(struct device_domain_info *info, - struct pasid_table *pasid_table) -{ - info->pasid_table = pasid_table; - list_add(&info->table, &pasid_table->dev); -} - -static inline void -device_detach_pasid_table(struct device_domain_info *info, - struct pasid_table *pasid_table) -{ - info->pasid_table = NULL; - list_del(&info->table); -} - -struct pasid_table_opaque { - struct pasid_table **pasid_table; - int segment; - int bus; - int devfn; -}; - -static int search_pasid_table(struct device_domain_info *info, void *opaque) -{ - struct pasid_table_opaque *data = opaque; - - if (info->iommu->segment == data->segment && - info->bus == data->bus && - info->devfn == data->devfn && - info->pasid_table) { - *data->pasid_table = info->pasid_table; - return 1; - } - - return 0; -} - -static int get_alias_pasid_table(struct pci_dev *pdev, u16 alias, void *opaque) -{ - struct pasid_table_opaque *data = opaque; - - data->segment = pci_domain_nr(pdev->bus); - data->bus = PCI_BUS_NUM(alias); - data->devfn = alias & 0xff; - - return for_each_device_domain(&search_pasid_table, data); -} /* * Allocate a pasid table for @dev. 
It should be called in a @@ -143,28 +95,18 @@ int intel_pasid_alloc_table(struct device *dev) { struct device_domain_info *info; struct pasid_table *pasid_table; - struct pasid_table_opaque data; struct page *pages; u32 max_pasid = 0; - int ret, order; - int size; + int order, size; might_sleep(); info = get_domain_info(dev); if (WARN_ON(!info || !dev_is_pci(dev) || info->pasid_table)) return -EINVAL; - /* DMA alias device already has a pasid table, use it: */ - data.pasid_table = &pasid_table; - ret = pci_for_each_dma_alias(to_pci_dev(dev), - &get_alias_pasid_table, &data); - if (ret) - goto attach_out; - pasid_table = kzalloc(sizeof(*pasid_table), GFP_KERNEL); if (!pasid_table) return -ENOMEM; - INIT_LIST_HEAD(&pasid_table->dev); if (info->pasid_supported) max_pasid = min_t(u32, pci_max_pasids(to_pci_dev(dev)), @@ -182,9 +124,7 @@ int intel_pasid_alloc_table(struct device *dev) pasid_table->table = page_address(pages); pasid_table->order = order; pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3); - -attach_out: - device_attach_pasid_table(info, pasid_table); + info->pasid_table = pasid_table; return 0; } @@ -202,10 +142,7 @@ void intel_pasid_free_table(struct device *dev) return; pasid_table = info->pasid_table; - device_detach_pasid_table(info, pasid_table); - - if (!list_empty(&pasid_table->dev)) - return; + info->pasid_table = NULL; /* Free scalable mode PASID directory tables: */ dir = pasid_table->table; diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index d5552e2c160d..cec210b6348b 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -74,7 +74,6 @@ struct pasid_table { void *table; /* pasid table pointer */ int order; /* page order of pasid table */ u32 max_pasid; /* max pasid */ - struct list_head dev; /* device list */ }; /* Get PRESENT bit of a PASID directory entry. */ diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 0eb0cbcafc09..eb85c4ce1616 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -636,7 +636,6 @@ struct subdev_domain_info { struct device_domain_info { struct list_head link; /* link to domain siblings */ struct list_head global; /* link to global list */ - struct list_head table; /* link to pasid table */ struct list_head subdevices; /* subdevices sibling */ u32 segment; /* PCI segment number */ u8 bus; /* PCI bus number */ @@ -750,8 +749,6 @@ extern int dmar_ir_support(void); void *alloc_pgtable_page(int node); void free_pgtable_page(void *vaddr); struct intel_iommu *domain_get_iommu(struct dmar_domain *domain); -int for_each_device_domain(int (*fn)(struct device_domain_info *info, - void *data), void *data); void iommu_flush_write_buffer(struct intel_iommu *iommu); int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev); struct dmar_domain *find_domain(struct device *dev); -- Gitee From 7f6dc463f46c02a529cb5a08ad78894ba2639c6c Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 6 Apr 2022 17:46:30 +0800 Subject: [PATCH 1046/2161] iommu: Fix unused variable 'ops' commit opencloudos. Use CONFIG_SMMU_BYPASS_DEV to enbrace it. 
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 24c77529812c..68d5e6007e4f 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1511,7 +1511,9 @@ static int iommu_group_alloc_default_domain(struct bus_type *bus, static int iommu_alloc_default_domain(struct iommu_group *group, struct device *dev) { +#ifdef CONFIG_SMMU_BYPASS_DEV const struct iommu_ops *ops = dev->bus->iommu_ops; +#endif unsigned int type = iommu_def_domain_type; if (group->default_domain) -- Gitee From 9d427b8d08bf44d519e73a480eca2d2b53122203 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:04 +0100 Subject: [PATCH 1047/2161] x86/devicetree: Fix the ioapic interrupt type table commit 2e730cb56b2cd1626fecaf23ef1537fb24721ef2 upstream. The ioapic interrupt type table is wrong as it assumes that polarity in IO/APIC context means active high when set. But the IO/APIC polarity is working the other way round. This works because the ordering of the entries is consistent with the device tree and the type information is not used by the IO/APIC interrupt chip. The whole trigger and polarity business of IO/APIC is misleading and the corresponding constants which are defined as 0/1 are not used consistently and are going to be removed. Rename the type table members to 'is_level' and 'active_low' and adjust the type information for consistency sake. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-5-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/devicetree.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index ddffd80f5c52..6a4cb71c2498 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -184,31 +184,31 @@ static unsigned int ioapic_id; struct of_ioapic_type { u32 out_type; - u32 trigger; - u32 polarity; + u32 is_level; + u32 active_low; }; static struct of_ioapic_type of_ioapic_type[] = { { - .out_type = IRQ_TYPE_EDGE_RISING, - .trigger = IOAPIC_EDGE, - .polarity = 1, + .out_type = IRQ_TYPE_EDGE_FALLING, + .is_level = 0, + .active_low = 1, }, { - .out_type = IRQ_TYPE_LEVEL_LOW, - .trigger = IOAPIC_LEVEL, - .polarity = 0, + .out_type = IRQ_TYPE_LEVEL_HIGH, + .is_level = 1, + .active_low = 0, }, { - .out_type = IRQ_TYPE_LEVEL_HIGH, - .trigger = IOAPIC_LEVEL, - .polarity = 1, + .out_type = IRQ_TYPE_LEVEL_LOW, + .is_level = 1, + .active_low = 1, }, { - .out_type = IRQ_TYPE_EDGE_FALLING, - .trigger = IOAPIC_EDGE, - .polarity = 0, + .out_type = IRQ_TYPE_EDGE_RISING, + .is_level = 0, + .active_low = 0, }, }; @@ -228,7 +228,7 @@ static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return -EINVAL; it = &of_ioapic_type[type_index]; - ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity); + ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->is_level, it->active_low); tmp.devid = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); tmp.ioapic.pin = fwspec->param[0]; -- Gitee From ed16023fdbc36503f1869cdc5baf4dba90d47e8f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:15 +0100 Subject: [PATCH 1048/2161] PCI: vmd: Use msi_msg shadow structs commit e16c8058a10ba8e38d0d1ad0b64e444b245ffdbd upstream. 
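The renamed members carry the same two properties a devicetree trigger type encodes; for orientation, the mapping in the table can also be written as a decode (illustrative only, not the kernel's implementation):

    #include <linux/irq.h>

    /* Illustrative decode of an IRQ_TYPE_* trigger into I/O-APIC terms,
     * matching the of_ioapic_type table in the hunk below. */
    static void irq_type_to_ioapic(u32 type, bool *is_level, bool *active_low)
    {
            *is_level   = type & (IRQ_TYPE_LEVEL_HIGH | IRQ_TYPE_LEVEL_LOW);
            *active_low = type & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_EDGE_FALLING);
    }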
Use the x86 shadow structs in msi_msg instead of the macros. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-16-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/controller/vmd.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 5a1827342b97..7702216a6919 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -18,7 +18,6 @@ #include #include #include -#include #define VMD_CFGBAR 0 #define VMD_MEMBAR1 2 @@ -128,10 +127,10 @@ static void vmd_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) struct vmd_irq_list *irq = vmdirq->irq; struct vmd_dev *vmd = irq_data_get_irq_handler_data(data); - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = MSI_ADDR_BASE_LO | - MSI_ADDR_DEST_ID(index_from_irqs(vmd, irq)); - msg->data = 0; + memset(msg, 0, sizeof(*msg)); + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; + msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->arch_addr_lo.destid_0_7 = index_from_irqs(vmd, irq); } /* -- Gitee From 0851f46c705331c44ed489fd8155736a85184184 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:22 +0100 Subject: [PATCH 1049/2161] genirq/irqdomain: Implement get_name() method on irqchip fwnodes commit 2cbd5a45e5296b28d64224ffbbd33d427704ba1b upstream. Prerequisite to make x86 more irqdomain compliant. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20201024213535.443185-23-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/irq/irqdomain.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5e03cbee70d6..f5b33728fcec 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -42,7 +42,16 @@ static inline void debugfs_add_domain_dir(struct irq_domain *d) { } static inline void debugfs_remove_domain_dir(struct irq_domain *d) { } #endif -const struct fwnode_operations irqchip_fwnode_ops; +static const char *irqchip_fwnode_get_name(const struct fwnode_handle *fwnode) +{ + struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode); + + return fwid->name; +} + +const struct fwnode_operations irqchip_fwnode_ops = { + .get_name = irqchip_fwnode_get_name, +}; EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); /** -- Gitee From 64fa9e5e74615c2eaff52ac2d2bd375483a90fd9 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 3 Nov 2020 16:36:22 +0000 Subject: [PATCH 1050/2161] x86/ioapic: Use I/O-APIC ID for finding irqdomain, not index commit f36a74b9345aebaf5d325380df87a54720229d18 upstream. In commit b643128b917 ("x86/ioapic: Use irq_find_matching_fwspec() to find remapping irqdomain") the I/O-APIC code was changed to find its parent irqdomain using irq_find_matching_fwspec(), but the key used for the lookup was wrong. It shouldn't use 'ioapic' which is the index into its own ioapics[] array. It should use the actual arbitration ID of the I/O-APIC in question, which is mpc_ioapic_id(ioapic). 
Fixes: b643128b917 ("x86/ioapic: Use irq_find_matching_fwspec() to find remapping irqdomain") Reported-by: lkp Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/57adf2c305cd0c5e9d860b2f3007a7e676fd0f9f.camel@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e7d6a83037d3..655e8a816098 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2358,14 +2358,14 @@ static int mp_irqdomain_create(int ioapic) if (cfg->dev) { fn = of_node_to_fwnode(cfg->dev); } else { - fn = irq_domain_alloc_named_id_fwnode("IO-APIC", ioapic); + fn = irq_domain_alloc_named_id_fwnode("IO-APIC", mpc_ioapic_id(ioapic)); if (!fn) return -ENOMEM; } fwspec.fwnode = fn; fwspec.param_count = 1; - fwspec.param[0] = ioapic; + fwspec.param[0] = mpc_ioapic_id(ioapic); parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); if (!parent) { -- Gitee From 943427cd6d0c0db0f0d34d211bb823f7d61b997f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Nov 2020 15:34:32 +0100 Subject: [PATCH 1051/2161] x86/ioapic: Correct the PCI/ISA trigger type selection commit aec8da04e4d71afdd4ab3025ea34a6517435f363 upstream. PCI's default trigger type is level and ISA's is edge. The recent refactoring made it the other way round, which went unnoticed as it seems only to cause havoc on some AMD systems. Make the comment and code do the right thing again. Fixes: a27dca645d2c ("x86/io_apic: Cleanup trigger/polarity helpers") Reported-by: Tom Lendacky Reported-by: Borislav Petkov Reported-by: Qian Cai Signed-off-by: Thomas Gleixner Tested-by: Tom Lendacky Cc: David Woodhouse Link: https://lore.kernel.org/r/87d00lgu13.fsf@nanos.tec.linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 655e8a816098..41055e93d902 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -822,9 +822,9 @@ static bool irq_is_level(int idx) case MP_IRQTRIG_DEFAULT: /* * Conforms to spec, ie. bus-type dependent trigger - * mode. PCI defaults to egde, ISA to level. + * mode. PCI defaults to level, ISA to edge. */ - level = test_bit(bus, mp_bus_not_pci); + level = !test_bit(bus, mp_bus_not_pci); /* Take EISA into account */ return eisa_irq_is_level(idx, bus, level); case MP_IRQTRIG_EDGE: -- Gitee From f3f317833540bde5dd1ba8ffd014b9077424df05 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 11 Nov 2020 14:43:21 +0000 Subject: [PATCH 1052/2161] iommu/amd: Fix union of bitfields in intcapxt support commit 2fb6acf3edfeb904505f9ba3fd01166866062591 upstream. All the bitfields in here are overlaid on top of each other since they're a union. Change the second u64 to be in a struct so it does the intended thing. 
Fixes: b5c3786ee370 ("iommu/amd: Use msi_msg shadow structs") Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201111144322.1659970-2-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu_init.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 64fc6c152096..84d6109bfcc5 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -1984,13 +1984,15 @@ static int iommu_setup_msi(struct amd_iommu *iommu) union intcapxt { u64 capxt; - u64 reserved_0 : 2, - dest_mode_logical : 1, - reserved_1 : 5, - destid_0_23 : 24, - vector : 8, - reserved_2 : 16, - destid_24_31 : 8; + struct { + u64 reserved_0 : 2, + dest_mode_logical : 1, + reserved_1 : 5, + destid_0_23 : 24, + vector : 8, + reserved_2 : 16, + destid_24_31 : 8; + }; } __attribute__ ((packed)); /** -- Gitee From 86a8a7cad91701fce7d51858980c324f5bbf7b49 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 11 Nov 2020 14:43:20 +0000 Subject: [PATCH 1053/2161] iommu/amd: Don't register interrupt remapping irqdomain when IR is disabled commit 2df985f5e44c43f5d29d8cc3aaa8e8ac697e9de6 upstream. Registering the remapping irq domain unconditionally is potentially allowing I/O-APIC and MSI interrupts to be parented in the IOMMU IR domain even when IR is disabled. Don't do that. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201111144322.1659970-1-dwmw2@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu_init.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 84d6109bfcc5..7498a82e5d98 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -1593,9 +1593,11 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) if (ret) return ret; - ret = amd_iommu_create_irq_domain(iommu); - if (ret) - return ret; + if (amd_iommu_irq_remap) { + ret = amd_iommu_create_irq_domain(iommu); + if (ret) + return ret; + } /* * Make sure IOMMU is not considered to translate itself. The IVRS -- Gitee From 556826a46f9c0316c85805c1695b956dfbd18763 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:29 +0200 Subject: [PATCH 1054/2161] iommu/amd: Prevent NULL pointer dereference commit 23357b61f8062a8a8c9c84c0252056cd6d849ec8 upstream. Dereferencing irq_data before checking it for NULL is suboptimal. Signed-off-by: Thomas Gleixner Reviewed-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 90c706fa911b..f67b24486288 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3685,8 +3685,8 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, for (i = 0; i < nr_irqs; i++) { irq_data = irq_domain_get_irq_data(domain, virq + i); - cfg = irqd_cfg(irq_data); - if (!irq_data || !cfg) { + cfg = irq_data ? 
irqd_cfg(irq_data) : NULL; + if (!cfg) { ret = -EINVAL; goto out_free_data; } -- Gitee From f8c1a74c552bb1e95c2c3b5c54ab45e662300d37 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:31 +0200 Subject: [PATCH 1055/2161] PCI: vmd: Dont abuse vector irqomain as parent commit 585dfe8abc4460810c07114b64aca10a74fa2718 upstream. VMD has it's own PCI/MSI interrupt domain which is not in any way depending on the x86 vector domain. PCI devices behind VMD share the VMD MSIX vector entries via a VMD specific message translation to the actual VMD MSIX vector. The VMD device interrupt handler for the VMD MSIX vectors invokes all interrupt handlers of the devices which share a vector. Making the x86 vector domain the actual parent of the VMD irq domain is pointless and actually counterproductive. When a device interrupt is requested then it will activate the interrupt which traverses down the hierarchy and consumes an interrupt vector in the vector domain which is never used. The domain is self contained and has no parent dependencies, so just hand in NULL for the parent and be done with it. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200826112330.928952181@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/controller/vmd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 7702216a6919..15dd6ed3ba40 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -678,7 +678,8 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) return -ENODEV; vmd->irq_domain = pci_msi_create_irq_domain(fn, &vmd_msi_domain_info, - x86_vector_domain); + NULL); + if (!vmd->irq_domain) { irq_domain_free_fwnode(fn); return -ENODEV; -- Gitee From 373fe077e9b9c365a9fe77383e6ec970d977f630 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:48 +0200 Subject: [PATCH 1056/2161] x86/irq: Move apic_post_init() invocation to one place commit bb733e4336988e40072c759fb27057b5fe82c7d4 upstream. No point to call it from both 32bit and 64bit implementations of default_setup_apic_routing(). Move it to the caller. 
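For context, the dispatch model described above looks roughly like this: one VMD MSI-X vector fans out to every child interrupt sharing it, so no per-child vector from the x86 vector domain is ever consumed (a loose sketch of the driver's handler; field names are approximate and locking is omitted):

    /* Loose sketch only -- not the driver's exact code. */
    static irqreturn_t vmd_irq_sketch(int irq, void *data)
    {
            struct vmd_irq_list *irqs = data;
            struct vmd_irq *vmdirq;

            /* Fan the shared VMD vector out to every child handler. */
            list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node)
                    generic_handle_irq(vmdirq->virq);

            return IRQ_HANDLED;
    }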
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200826112332.658496557@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/apic/apic.c | 3 +++ arch/x86/kernel/apic/probe_32.c | 3 --- arch/x86/kernel/apic/probe_64.c | 3 --- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0fddfb8716f8..dfff7f6440ca 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1482,6 +1482,9 @@ void __init apic_intr_mode_init(void) break; } + if (x86_platform.apic_post_init) + x86_platform.apic_post_init(); + apic_bsp_setup(upmode); } diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index a4432ae622de..3a179e238a9c 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -167,9 +167,6 @@ void __init default_setup_apic_routing(void) if (apic->setup_apic_routing) apic->setup_apic_routing(); - - if (x86_platform.apic_post_init) - x86_platform.apic_post_init(); } void __init generic_apic_probe(void) diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 29f0e0984557..fdf09158bc45 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -31,9 +31,6 @@ void __init default_setup_apic_routing(void) break; } } - - if (x86_platform.apic_post_init) - x86_platform.apic_post_init(); } int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) -- Gitee From 1393302937d9c8bded16b73d5780e807005a4b24 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:49 +0200 Subject: [PATCH 1057/2161] x86/pci: Reducde #ifdeffery in PCI init code commit 445d3595ab290ba16ca5f202c7a67d71460cb39f upstream. Adding a function call before the first #ifdef in arch_pci_init() triggers a 'mixed declarations and code' warning if PCI_DIRECT is enabled. Use stub functions and move the #ifdeffery to the header file where it is not in the way. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200826112332.767707340@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/pci_x86.h | 11 +++++++++++ arch/x86/pci/init.c | 10 +++------- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 73bb404f4d2a..490411dba438 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -114,9 +114,20 @@ extern const struct pci_raw_ops pci_direct_conf1; extern bool port_cf9_safe; /* arch_initcall level */ +#ifdef CONFIG_PCI_DIRECT extern int pci_direct_probe(void); extern void pci_direct_init(int type); +#else +static inline int pci_direct_probe(void) { return -1; } +static inline void pci_direct_init(int type) { } +#endif + +#ifdef CONFIG_PCI_BIOS extern void pci_pcbios_init(void); +#else +static inline void pci_pcbios_init(void) { } +#endif + extern void __init dmi_check_pciprobe(void); extern void __init dmi_check_skip_isa_align(void); diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index 5fc617edf108..bf6690935943 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -8,11 +8,9 @@ in the right sequence from here. 
*/ static __init int pci_arch_init(void) { -#ifdef CONFIG_PCI_DIRECT - int type = 0; + int type; type = pci_direct_probe(); -#endif if (!(pci_probe & PCI_PROBE_NOEARLY)) pci_mmcfg_early_init(); @@ -20,18 +18,16 @@ static __init int pci_arch_init(void) if (x86_init.pci.arch_init && !x86_init.pci.arch_init()) return 0; -#ifdef CONFIG_PCI_BIOS pci_pcbios_init(); -#endif + /* * don't check for raw_pci_ops here because we want pcbios as last * fallback, yet it's needed to run first to set pcibios_last_bus * in case legacy PCI probing is used. otherwise detecting peer busses * fails. */ -#ifdef CONFIG_PCI_DIRECT pci_direct_init(type); -#endif + if (!raw_pci_ops && !raw_pci_ext_ops) printk(KERN_ERR "PCI: Fatal: No config space access function found\n"); -- Gitee From 95503422f49675f87a627d799e1c2967f541b249 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 14 Apr 2022 21:10:29 +0800 Subject: [PATCH 1058/2161] x86/irq: Initialize PCI/MSI domain at PCI init time commit 6b15ffa07dc325f4e4dd98c877bfa970202c378b upstream. No point in initializing the default PCI/MSI interrupt domain early and no point to create it when XEN PV/HVM/DOM0 are active. Move the initialization to pci_arch_init() and convert it to init ops so that XEN can override it as XEN has it's own PCI/MSI management. The XEN override comes in a later step. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200826112332.859209894@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/irqdomain.h | 6 ++++-- arch/x86/include/asm/x86_init.h | 3 +++ arch/x86/kernel/apic/msi.c | 29 ++++++++++++++++++----------- arch/x86/kernel/apic/vector.c | 2 -- arch/x86/kernel/x86_init.c | 4 +++- arch/x86/pci/init.c | 3 +++ 6 files changed, 31 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index a565908d81bc..20cf28a133e3 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -54,9 +54,11 @@ extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); #endif /* CONFIG_X86_IO_APIC */ #ifdef CONFIG_PCI_MSI -extern void arch_init_msi_domain(struct irq_domain *domain); +void x86_create_pci_msi_domain(void); +struct irq_domain *native_create_pci_msi_domain(void); #else -static inline void arch_init_msi_domain(struct irq_domain *domain) { } +static inline void x86_create_pci_msi_domain(void) { } +#define native_create_pci_msi_domain NULL #endif #endif diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 4faf3d8625ba..92341731f203 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -8,6 +8,7 @@ struct mpc_bus; struct mpc_cpu; struct mpc_table; struct cpuinfo_x86; +struct irq_domain; /** * struct x86_init_mpparse - platform specific mpparse ops @@ -53,6 +54,7 @@ struct x86_init_resources { * @trap_init: platform specific trap setup * @intr_mode_select: interrupt delivery mode selection * @intr_mode_init: interrupt delivery mode setup + * @create_pci_msi_domain: Create the PCI/MSI interrupt domain */ struct x86_init_irqs { void (*pre_vector_init)(void); @@ -60,6 +62,7 @@ struct x86_init_irqs { void (*trap_init)(void); void (*intr_mode_select)(void); void (*intr_mode_init)(void); + struct irq_domain *(*create_pci_msi_domain)(void); }; /** diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 17a71d0580d3..90b4f972c78b 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -20,7 +20,7 @@ #include #include -static 
struct irq_domain *msi_default_domain; +static struct irq_domain *x86_pci_msi_default_domain __ro_after_init; static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg) { @@ -186,25 +186,32 @@ static struct msi_domain_info pci_msi_domain_info = { .handler_name = "edge", }; -void __init arch_init_msi_domain(struct irq_domain *parent) +struct irq_domain * __init native_create_pci_msi_domain(void) { struct fwnode_handle *fn; + struct irq_domain *d; if (disable_apic) - return; + return NULL; fn = irq_domain_alloc_named_fwnode("PCI-MSI"); - if (fn) { - msi_default_domain = - pci_msi_create_irq_domain(fn, &pci_msi_domain_info, - parent); - } - if (!msi_default_domain) { + if (!fn) + return NULL; + + d = pci_msi_create_irq_domain(fn, &pci_msi_domain_info, + x86_vector_domain); + if (!d) { irq_domain_free_fwnode(fn); - pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); + pr_warn("Failed to initialize PCI-MSI irqdomain.\n"); } else { - msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; + d->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; } + return d; +} + +void __init x86_create_pci_msi_domain(void) +{ + x86_pci_msi_default_domain = x86_init.irqs.create_pci_msi_domain(); } #ifdef CONFIG_IRQ_REMAP diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index a7ffebef580b..0e1e33f4734b 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -754,8 +754,6 @@ int __init arch_early_irq_init(void) BUG_ON(x86_vector_domain == NULL); irq_set_default_host(x86_vector_domain); - arch_init_msi_domain(x86_vector_domain); - BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL)); /* diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 9ef42c293326..32ea56412e71 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -24,6 +24,7 @@ #include #include #include +#include void x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } @@ -59,7 +60,8 @@ struct x86_init_ops x86_init __initdata = { .intr_init = native_init_IRQ, .trap_init = x86_init_noop, .intr_mode_select = apic_intr_mode_select, - .intr_mode_init = apic_intr_mode_init + .intr_mode_init = apic_intr_mode_init, + .create_pci_msi_domain = native_create_pci_msi_domain, }, .oem = { diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index bf6690935943..00bfa1ebad6c 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -3,6 +3,7 @@ #include #include #include +#include /* arch_initcall has too random ordering, so call the initializers in the right sequence from here. */ @@ -10,6 +11,8 @@ static __init int pci_arch_init(void) { int type; + x86_create_pci_msi_domain(); + type = pci_direct_probe(); if (!(pci_probe & PCI_PROBE_NOEARLY)) -- Gitee From 4076d6f5b5a332efd089352d5abe60ef39ce8a6c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:51 +0200 Subject: [PATCH 1059/2161] irqdomain/msi: Provide DOMAIN_BUS_VMD_MSI commit c6c9e2838c5f0b94773511586123bcb125757f2a upstream. PCI devices behind a VMD bus are not subject to interrupt remapping, but the irq domain for VMD MSI cannot be distinguished from a regular PCI/MSI irq domain. Add a new domain bus token and allow it in the bus token check in msi_check_reservation_mode() to keep the functionality the same once VMD uses this token. 
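A minimal example of how the new token is meant to be consumed once a domain carries it (the helper is hypothetical):

    /* Hypothetical helper: tell VMD's MSI domain apart by bus token. */
    static bool is_vmd_msi_domain(struct irq_domain *domain)
    {
            return domain->bus_token == DOMAIN_BUS_VMD_MSI;
    }

The next patch in the series attaches the token to VMD's irq domain via irq_domain_update_bus_token().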
Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Acked-by: Jon Derrick Link: https://lore.kernel.org/r/20200826112332.954409970@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/irqdomain.h | 1 + kernel/irq/msi.c | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 6fc9f0ac5f2b..a7619bb43c33 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -84,6 +84,7 @@ enum irq_domain_bus_token { DOMAIN_BUS_IPI, DOMAIN_BUS_FSL_MC_MSI, DOMAIN_BUS_TI_SCI_INTA_MSI, + DOMAIN_BUS_VMD_MSI, }; /** diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 0fef031d34d1..5396232895b0 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -364,8 +364,13 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, { struct msi_desc *desc; - if (domain->bus_token != DOMAIN_BUS_PCI_MSI) + switch(domain->bus_token) { + case DOMAIN_BUS_PCI_MSI: + case DOMAIN_BUS_VMD_MSI: + break; + default: return false; + } if (!(info->flags & MSI_FLAG_MUST_REACTIVATE)) return false; -- Gitee From 0afb043e8c5ebc2c9a95cd6c8e05b01d671d7370 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:52 +0200 Subject: [PATCH 1060/2161] PCI_vmd_Mark_VMD_irqdomain_with_DOMAIN_BUS_VMD_MSI commit d7f954e54079b4bf6088956d59f43768ec71269a upstream. Devices on the VMD bus use their own MSI irq domain, but it is not distinguishable from regular PCI/MSI irq domains. This is required to exclude VMD devices from getting the irq domain pointer set by interrupt remapping. Override the default bus token. Signed-off-by: Thomas Gleixner Acked-by: Bjorn Helgaas Acked-by: Joerg Roedel Link: https://lore.kernel.org/r/20200826112333.047315047@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/controller/vmd.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 15dd6ed3ba40..f9b4c961e48b 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -685,6 +685,12 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) return -ENODEV; } + /* + * Override the irq domain bus token so the domain can be distinguished + * from a regular PCI/MSI domain. + */ + irq_domain_update_bus_token(vmd->irq_domain, DOMAIN_BUS_VMD_MSI); + pci_add_resource(&resources, &vmd->resources[0]); pci_add_resource_offset(&resources, &vmd->resources[1], offset[0]); pci_add_resource_offset(&resources, &vmd->resources[2], offset[1]); -- Gitee From 601bd97d619d48f6da9622988247a267e37fd68f Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Tue, 21 Jan 2020 06:37:50 -0700 Subject: [PATCH 1061/2161] PCI: vmd: Remove dma_map_ops overrides commit 962e329d888cfcb923d39af4e14fd616467de167 upstream. Devices on the VMD domain use the VMD endpoint's requester ID and have been relying on the VMD endpoint's DMA operations. The problem with this was that VMD domain devices would use the VMD endpoint's attributes when doing DMA and IOMMU mapping. We can be smarter about this by only using the VMD endpoint when mapping and providing the correct child device's attributes during DMA operations. Remove the dma_map_ops redirect. 
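For illustration only (not part of the patch): a minimal model of the redirect being removed. Every dma_map_ops callback on a child device behind VMD substituted the VMD endpoint's device, so the endpoint's DMA attributes were used for the mapping. The struct below is a stand-in, not the kernel's struct device.

/*
 * Illustrative sketch only: pre-patch behaviour of the to_vmd_dev()
 * redirect, modelled with a stand-in device type.
 */
struct fake_device {
	const char *name;
	struct fake_device *vmd_endpoint;	/* set for devices behind VMD */
};

static struct fake_device *dma_target(struct fake_device *dev)
{
	/* Old behaviour: always map against the VMD endpoint when present. */
	return dev->vmd_endpoint ? dev->vmd_endpoint : dev;
}

/*
 * After this patch the redirect is gone: DMA is mapped against the child
 * device itself, and only the requester ID seen on the bus still belongs
 * to the VMD endpoint.
 */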
Link: https://lore.kernel.org/r/1579613871-301529-7-git-send-email-jonathan.derrick@intel.com Signed-off-by: Jon Derrick Signed-off-by: Bjorn Helgaas Reviewed-by: Keith Busch Acked-by: Lorenzo Pieralisi Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/controller/Kconfig | 1 - drivers/pci/controller/vmd.c | 150 --------------------------------- 2 files changed, 151 deletions(-) diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index addfbf1e6122..fe377ee56d04 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -269,7 +269,6 @@ config PCIE_TANGO_SMP8759 config VMD depends on PCI_MSI && X86_64 && SRCU - select X86_DEV_DMA_OPS tristate "Intel Volume Management Device Driver" ---help--- Adds support for the Intel Volume Management Device (VMD). VMD is a diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index f9b4c961e48b..dc19c1d57a5b 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -97,9 +97,6 @@ struct vmd_dev { struct irq_domain *irq_domain; struct pci_bus *bus; u8 busn_start; - - struct dma_map_ops dma_ops; - struct dma_domain dma_domain; }; static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus) @@ -294,151 +291,6 @@ static struct msi_domain_info vmd_msi_domain_info = { .chip = &vmd_msi_controller, }; -/* - * VMD replaces the requester ID with its own. DMA mappings for devices in a - * VMD domain need to be mapped for the VMD, not the device requiring - * the mapping. - */ -static struct device *to_vmd_dev(struct device *dev) -{ - struct pci_dev *pdev = to_pci_dev(dev); - struct vmd_dev *vmd = vmd_from_bus(pdev->bus); - - return &vmd->dev->dev; -} - -static void *vmd_alloc(struct device *dev, size_t size, dma_addr_t *addr, - gfp_t flag, unsigned long attrs) -{ - return dma_alloc_attrs(to_vmd_dev(dev), size, addr, flag, attrs); -} - -static void vmd_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t addr, unsigned long attrs) -{ - return dma_free_attrs(to_vmd_dev(dev), size, vaddr, addr, attrs); -} - -static int vmd_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t addr, size_t size, - unsigned long attrs) -{ - return dma_mmap_attrs(to_vmd_dev(dev), vma, cpu_addr, addr, size, - attrs); -} - -static int vmd_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t addr, size_t size, - unsigned long attrs) -{ - return dma_get_sgtable_attrs(to_vmd_dev(dev), sgt, cpu_addr, addr, size, - attrs); -} - -static dma_addr_t vmd_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - return dma_map_page_attrs(to_vmd_dev(dev), page, offset, size, dir, - attrs); -} - -static void vmd_unmap_page(struct device *dev, dma_addr_t addr, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - dma_unmap_page_attrs(to_vmd_dev(dev), addr, size, dir, attrs); -} - -static int vmd_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, unsigned long attrs) -{ - return dma_map_sg_attrs(to_vmd_dev(dev), sg, nents, dir, attrs); -} - -static void vmd_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, unsigned long attrs) -{ - dma_unmap_sg_attrs(to_vmd_dev(dev), sg, nents, dir, attrs); -} - -static void vmd_sync_single_for_cpu(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir) -{ - 
dma_sync_single_for_cpu(to_vmd_dev(dev), addr, size, dir); -} - -static void vmd_sync_single_for_device(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir) -{ - dma_sync_single_for_device(to_vmd_dev(dev), addr, size, dir); -} - -static void vmd_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir) -{ - dma_sync_sg_for_cpu(to_vmd_dev(dev), sg, nents, dir); -} - -static void vmd_sync_sg_for_device(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir) -{ - dma_sync_sg_for_device(to_vmd_dev(dev), sg, nents, dir); -} - -static int vmd_dma_supported(struct device *dev, u64 mask) -{ - return dma_supported(to_vmd_dev(dev), mask); -} - -static u64 vmd_get_required_mask(struct device *dev) -{ - return dma_get_required_mask(to_vmd_dev(dev)); -} - -static void vmd_teardown_dma_ops(struct vmd_dev *vmd) -{ - struct dma_domain *domain = &vmd->dma_domain; - - if (get_dma_ops(&vmd->dev->dev)) - del_dma_domain(domain); -} - -#define ASSIGN_VMD_DMA_OPS(source, dest, fn) \ - do { \ - if (source->fn) \ - dest->fn = vmd_##fn; \ - } while (0) - -static void vmd_setup_dma_ops(struct vmd_dev *vmd) -{ - const struct dma_map_ops *source = get_dma_ops(&vmd->dev->dev); - struct dma_map_ops *dest = &vmd->dma_ops; - struct dma_domain *domain = &vmd->dma_domain; - - domain->domain_nr = vmd->sysdata.domain; - domain->dma_ops = dest; - - if (!source) - return; - ASSIGN_VMD_DMA_OPS(source, dest, alloc); - ASSIGN_VMD_DMA_OPS(source, dest, free); - ASSIGN_VMD_DMA_OPS(source, dest, mmap); - ASSIGN_VMD_DMA_OPS(source, dest, get_sgtable); - ASSIGN_VMD_DMA_OPS(source, dest, map_page); - ASSIGN_VMD_DMA_OPS(source, dest, unmap_page); - ASSIGN_VMD_DMA_OPS(source, dest, map_sg); - ASSIGN_VMD_DMA_OPS(source, dest, unmap_sg); - ASSIGN_VMD_DMA_OPS(source, dest, sync_single_for_cpu); - ASSIGN_VMD_DMA_OPS(source, dest, sync_single_for_device); - ASSIGN_VMD_DMA_OPS(source, dest, sync_sg_for_cpu); - ASSIGN_VMD_DMA_OPS(source, dest, sync_sg_for_device); - ASSIGN_VMD_DMA_OPS(source, dest, dma_supported); - ASSIGN_VMD_DMA_OPS(source, dest, get_required_mask); - add_dma_domain(domain); -} -#undef ASSIGN_VMD_DMA_OPS - static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus, unsigned int devfn, int reg, int len) { @@ -705,7 +557,6 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) } vmd_attach_resources(vmd); - vmd_setup_dma_ops(vmd); dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain); pci_scan_child_bus(vmd->bus); @@ -821,7 +672,6 @@ static void vmd_remove(struct pci_dev *dev) pci_stop_root_bus(vmd->bus); pci_remove_root_bus(vmd->bus); vmd_cleanup_srcu(vmd); - vmd_teardown_dma_ops(vmd); vmd_detach_resources(vmd); irq_domain_remove(vmd->irq_domain); irq_domain_free_fwnode(fn); -- Gitee From a6ad44729193dbf1cc8c6b652381b7c487bdba67 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jan 2020 06:37:51 -0700 Subject: [PATCH 1062/2161] x86/PCI: Remove X86_DEV_DMA_OPS commit dab0198413d227f13be7da8abf0d5bc8620427f0 upstream. There are no users of X86_DEV_DMA_OPS left, so remove the code. 
Link: https://lore.kernel.org/r/1579613871-301529-8-git-send-email-jonathan.derrick@intel.com Signed-off-by: Christoph Hellwig Signed-off-by: Bjorn Helgaas Reviewed-by: Jon Derrick Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/Kconfig | 3 --- arch/x86/include/asm/device.h | 10 --------- arch/x86/pci/common.c | 38 ----------------------------------- 3 files changed, 51 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3a9dccb9c3d0..a69c3289f325 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -3023,9 +3023,6 @@ config HAVE_ATOMIC_IOMAP def_bool y depends on X86_32 -config X86_DEV_DMA_OPS - bool - source "drivers/firmware/Kconfig" source "arch/x86/kvm/Kconfig" diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index a8f6c809d9b1..3e6c75a6d070 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -11,16 +11,6 @@ struct dev_archdata { #endif }; -#if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS) -struct dma_domain { - struct list_head node; - const struct dma_map_ops *dma_ops; - int domain_nr; -}; -void add_dma_domain(struct dma_domain *domain); -void del_dma_domain(struct dma_domain *domain); -#endif - struct pdev_archdata { }; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index a0b44fe0d2a4..f67855adee5e 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -627,43 +627,6 @@ unsigned int pcibios_assign_all_busses(void) return (pci_probe & PCI_ASSIGN_ALL_BUSSES) ? 1 : 0; } -#if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS) -static LIST_HEAD(dma_domain_list); -static DEFINE_SPINLOCK(dma_domain_list_lock); - -void add_dma_domain(struct dma_domain *domain) -{ - spin_lock(&dma_domain_list_lock); - list_add(&domain->node, &dma_domain_list); - spin_unlock(&dma_domain_list_lock); -} -EXPORT_SYMBOL_GPL(add_dma_domain); - -void del_dma_domain(struct dma_domain *domain) -{ - spin_lock(&dma_domain_list_lock); - list_del(&domain->node); - spin_unlock(&dma_domain_list_lock); -} -EXPORT_SYMBOL_GPL(del_dma_domain); - -static void set_dma_domain_ops(struct pci_dev *pdev) -{ - struct dma_domain *domain; - - spin_lock(&dma_domain_list_lock); - list_for_each_entry(domain, &dma_domain_list, node) { - if (pci_domain_nr(pdev->bus) == domain->domain_nr) { - pdev->dev.dma_ops = domain->dma_ops; - break; - } - } - spin_unlock(&dma_domain_list_lock); -} -#else -static void set_dma_domain_ops(struct pci_dev *pdev) {} -#endif - static void set_dev_domain_options(struct pci_dev *pdev) { if (is_vmd(pdev->bus)) @@ -699,7 +662,6 @@ int pcibios_add_device(struct pci_dev *dev) pa_data = data->next; memunmap(data); } - set_dma_domain_ops(dev); set_dev_domain_options(dev); return 0; } -- Gitee From bd2e7f93ca5f514816a951c52da12ede8dd3a208 Mon Sep 17 00:00:00 2001 From: Sushma Kalakota Date: Wed, 8 Jan 2020 15:05:10 -0700 Subject: [PATCH 1063/2161] PCI: vmd: Add two VMD Device IDs commit 449a01d2659c207b012e6d3bb6edfff8c94a4e55 upstream. Add new VMD device IDs that require the bus restriction mode. 
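For illustration only (not part of the patch): a sketch of how a flag carried in pci_device_id.driver_data, here VMD_FEAT_HAS_BUS_RESTRICTIONS for the new 0x467f and 0x4c3d IDs, is typically consumed at probe time. The structure and probe function are simplified stand-ins for the kernel definitions.

/*
 * Illustrative sketch only: reading a feature flag passed through the
 * device ID table's driver_data field.
 */
#define VMD_FEAT_HAS_BUS_RESTRICTIONS	(1 << 0)

struct fake_pci_device_id {
	unsigned int vendor, device;
	unsigned long driver_data;
};

static int fake_vmd_probe(const struct fake_pci_device_id *id)
{
	unsigned long features = id->driver_data;

	if (features & VMD_FEAT_HAS_BUS_RESTRICTIONS) {
		/*
		 * Restrict the VMD root bus range, as the real driver does
		 * for device IDs that advertise bus restriction support.
		 */
	}
	return 0;
}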
Signed-off-by: Sushma Kalakota Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/controller/vmd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index dc19c1d57a5b..fc2fcf63948e 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -718,6 +718,10 @@ static const struct pci_device_id vmd_ids[] = { VMD_FEAT_HAS_BUS_RESTRICTIONS,}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_9A0B), .driver_data = VMD_FEAT_HAS_BUS_RESTRICTIONS,}, + {PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x467f), + .driver_data = VMD_FEAT_HAS_BUS_RESTRICTIONS,}, + {PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x4c3d), + .driver_data = VMD_FEAT_HAS_BUS_RESTRICTIONS,}, {0,} }; MODULE_DEVICE_TABLE(pci, vmd_ids); -- Gitee From aa2f3a06c4e77c7c1d93bac6e40f90d2bbdf05a4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:17:01 +0200 Subject: [PATCH 1064/2161] x86/pci: Set default irq domain in pcibios_add_device() commit 2c681e6b37674dc3941869cb262e26c8a6b34047 upstream. Now that interrupt remapping sets the irqdomain pointer when a PCI device is added it's possible to store the default irq domain in the device struct in pcibios_add_device(). If the bus to which a device is connected has an irq domain associated then this domain is used otherwise the default domain (PCI/MSI native or XEN PCI/MSI) is used. Using the bus domain ensures that special MSI bus domains like VMD work. This makes XEN and the non-remapped native case work solely based on the irq domain pointer in struct device for PCI/MSI and allows to remove the arch fallback and make most of the x86_msi ops private to XEN in the next steps. 
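For illustration only (not part of the patch): a minimal sketch of the selection logic described above and implemented in pcibios_add_device() below. An MSI domain already attached to the device's bus is preferred, which preserves special bus domains such as VMD; otherwise the default PCI/MSI domain is used. Types are simplified stand-ins.

/*
 * Illustrative sketch only: prefer the bus's MSI irq domain, fall back to
 * the default PCI/MSI domain (native or XEN) when the bus has none.
 */
#include <stddef.h>

struct fake_irq_domain { int id; };

struct fake_dev {
	struct fake_irq_domain *msi_domain;
	struct fake_dev *bus_dev;	/* bridge/bus device, may be NULL */
};

static struct fake_irq_domain *fake_default_msi_domain;

static void fake_set_initial_msi_domain(struct fake_dev *dev)
{
	struct fake_irq_domain *msidom = NULL;

	if (dev->bus_dev)
		msidom = dev->bus_dev->msi_domain;	/* e.g. a VMD domain */
	if (!msidom)
		msidom = fake_default_msi_domain;	/* arch default */
	dev->msi_domain = msidom;
}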
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200826112333.900423047@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/irqdomain.h | 2 ++ arch/x86/kernel/apic/msi.c | 2 +- arch/x86/pci/common.c | 18 +++++++++++++++++- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index 20cf28a133e3..125c23b7bad3 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -56,9 +56,11 @@ extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); #ifdef CONFIG_PCI_MSI void x86_create_pci_msi_domain(void); struct irq_domain *native_create_pci_msi_domain(void); +extern struct irq_domain *x86_pci_msi_default_domain; #else static inline void x86_create_pci_msi_domain(void) { } #define native_create_pci_msi_domain NULL +#define x86_pci_msi_default_domain NULL #endif #endif diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 90b4f972c78b..7ced252da3d2 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -20,7 +20,7 @@ #include #include -static struct irq_domain *x86_pci_msi_default_domain __ro_after_init; +struct irq_domain *x86_pci_msi_default_domain __ro_after_init; static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg) { diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index f67855adee5e..f1063d5b2462 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -19,6 +19,7 @@ #include #include #include +#include unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | PCI_PROBE_MMCONF; @@ -635,8 +636,9 @@ static void set_dev_domain_options(struct pci_dev *pdev) int pcibios_add_device(struct pci_dev *dev) { - struct setup_data *data; struct pci_setup_rom *rom; + struct irq_domain *msidom; + struct setup_data *data; u64 pa_data; pa_data = boot_params.hdr.setup_data; @@ -663,6 +665,20 @@ int pcibios_add_device(struct pci_dev *dev) memunmap(data); } set_dev_domain_options(dev); + + /* + * Setup the initial MSI domain of the device. If the underlying + * bus has a PCI/MSI irqdomain associated use the bus domain, + * otherwise set the default domain. This ensures that special irq + * domains e.g. VMD are preserved. The default ensures initial + * operation if irq remapping is not active. If irq remapping is + * active it will overwrite the domain pointer when the device is + * associated to a remapping domain. + */ + msidom = dev_get_msi_domain(&dev->bus->dev); + if (!msidom) + msidom = x86_pci_msi_default_domain; + dev_set_msi_domain(&dev->dev, msidom); return 0; } -- Gitee From e3a537a4e9c1e8dd2747992526bc46749c3e5a0e Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Sun, 24 Apr 2022 11:44:00 +0800 Subject: [PATCH 1065/2161] Revert "Intel: Documentation: admin-guide: PM: Add intel_idle document" commit opencloudos. This reverts commit 5498dbde067552e98df9584bda6beb2b33c3e84e. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/admin-guide/pm/intel_idle.rst | 246 ------------------ .../admin-guide/pm/working-state.rst | 1 - 2 files changed, 247 deletions(-) delete mode 100644 Documentation/admin-guide/pm/intel_idle.rst diff --git a/Documentation/admin-guide/pm/intel_idle.rst b/Documentation/admin-guide/pm/intel_idle.rst deleted file mode 100644 index afbf778035f8..000000000000 --- a/Documentation/admin-guide/pm/intel_idle.rst +++ /dev/null @@ -1,246 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 -.. 
include:: - -============================================== -``intel_idle`` CPU Idle Time Management Driver -============================================== - -:Copyright: |copy| 2020 Intel Corporation - -:Author: Rafael J. Wysocki - - -General Information -=================== - -``intel_idle`` is a part of the -:doc:`CPU idle time management subsystem ` in the Linux kernel -(``CPUIdle``). It is the default CPU idle time management driver for the -Nehalem and later generations of Intel processors, but the level of support for -a particular processor model in it depends on whether or not it recognizes that -processor model and may also depend on information coming from the platform -firmware. [To understand ``intel_idle`` it is necessary to know how ``CPUIdle`` -works in general, so this is the time to get familiar with :doc:`cpuidle` if you -have not done that yet.] - -``intel_idle`` uses the ``MWAIT`` instruction to inform the processor that the -logical CPU executing it is idle and so it may be possible to put some of the -processor's functional blocks into low-power states. That instruction takes two -arguments (passed in the ``EAX`` and ``ECX`` registers of the target CPU), the -first of which, referred to as a *hint*, can be used by the processor to -determine what can be done (for details refer to Intel Software Developer’s -Manual [1]_). Accordingly, ``intel_idle`` refuses to work with processors in -which the support for the ``MWAIT`` instruction has been disabled (for example, -via the platform firmware configuration menu) or which do not support that -instruction at all. - -``intel_idle`` is not modular, so it cannot be unloaded, which means that the -only way to pass early-configuration-time parameters to it is via the kernel -command line. - - -.. _intel-idle-enumeration-of-states: - -Enumeration of Idle States -========================== - -Each ``MWAIT`` hint value is interpreted by the processor as a license to -reconfigure itself in a certain way in order to save energy. The processor -configurations (with reduced power draw) resulting from that are referred to -as C-states (in the ACPI terminology) or idle states. The list of meaningful -``MWAIT`` hint values and idle states (i.e. low-power configurations of the -processor) corresponding to them depends on the processor model and it may also -depend on the configuration of the platform. - -In order to create a list of available idle states required by the ``CPUIdle`` -subsystem (see :ref:`idle-states-representation` in :doc:`cpuidle`), -``intel_idle`` can use two sources of information: static tables of idle states -for different processor models included in the driver itself and the ACPI tables -of the system. The former are always used if the processor model at hand is -recognized by ``intel_idle`` and the latter are used if that is required for -the given processor model (which is the case for all server processor models -recognized by ``intel_idle``) or if the processor model is not recognized. - -If the ACPI tables are going to be used for building the list of available idle -states, ``intel_idle`` first looks for a ``_CST`` object under one of the ACPI -objects corresponding to the CPUs in the system (refer to the ACPI specification -[2]_ for the description of ``_CST`` and its output package). 
Because the -``CPUIdle`` subsystem expects that the list of idle states supplied by the -driver will be suitable for all of the CPUs handled by it and ``intel_idle`` is -registered as the ``CPUIdle`` driver for all of the CPUs in the system, the -driver looks for the first ``_CST`` object returning at least one valid idle -state description and such that all of the idle states included in its return -package are of the FFH (Functional Fixed Hardware) type, which means that the -``MWAIT`` instruction is expected to be used to tell the processor that it can -enter one of them. The return package of that ``_CST`` is then assumed to be -applicable to all of the other CPUs in the system and the idle state -descriptions extracted from it are stored in a preliminary list of idle states -coming from the ACPI tables. [This step is skipped if ``intel_idle`` is -configured to ignore the ACPI tables; see `below `_.] - -Next, the first (index 0) entry in the list of available idle states is -initialized to represent a "polling idle state" (a pseudo-idle state in which -the target CPU continuously fetches and executes instructions), and the -subsequent (real) idle state entries are populated as follows. - -If the processor model at hand is recognized by ``intel_idle``, there is a -(static) table of idle state descriptions for it in the driver. In that case, -the "internal" table is the primary source of information on idle states and the -information from it is copied to the final list of available idle states. If -using the ACPI tables for the enumeration of idle states is not required -(depending on the processor model), all of the listed idle state are enabled by -default (so all of them will be taken into consideration by ``CPUIdle`` -governors during CPU idle state selection). Otherwise, some of the listed idle -states may not be enabled by default if there are no matching entries in the -preliminary list of idle states coming from the ACPI tables. In that case user -space still can enable them later (on a per-CPU basis) with the help of -the ``disable`` idle state attribute in ``sysfs`` (see -:ref:`idle-states-representation` in :doc:`cpuidle`). This basically means that -the idle states "known" to the driver may not be enabled by default if they have -not been exposed by the platform firmware (through the ACPI tables). - -If the given processor model is not recognized by ``intel_idle``, but it -supports ``MWAIT``, the preliminary list of idle states coming from the ACPI -tables is used for building the final list that will be supplied to the -``CPUIdle`` core during driver registration. For each idle state in that list, -the description, ``MWAIT`` hint and exit latency are copied to the corresponding -entry in the final list of idle states. The name of the idle state represented -by it (to be returned by the ``name`` idle state attribute in ``sysfs``) is -"CX_ACPI", where X is the index of that idle state in the final list (note that -the minimum value of X is 1, because 0 is reserved for the "polling" state), and -its target residency is based on the exit latency value. 
Specifically, for -C1-type idle states the exit latency value is also used as the target residency -(for compatibility with the majority of the "internal" tables of idle states for -various processor models recognized by ``intel_idle``) and for the other idle -state types (C2 and C3) the target residency value is 3 times the exit latency -(again, that is because it reflects the target residency to exit latency ratio -in the majority of cases for the processor models recognized by ``intel_idle``). -All of the idle states in the final list are enabled by default in this case. - - -.. _intel-idle-initialization: - -Initialization -============== - -The initialization of ``intel_idle`` starts with checking if the kernel command -line options forbid the use of the ``MWAIT`` instruction. If that is the case, -an error code is returned right away. - -The next step is to check whether or not the processor model is known to the -driver, which determines the idle states enumeration method (see -`above `_), and whether or not the processor -supports ``MWAIT`` (the initialization fails if that is not the case). Then, -the ``MWAIT`` support in the processor is enumerated through ``CPUID`` and the -driver initialization fails if the level of support is not as expected (for -example, if the total number of ``MWAIT`` substates returned is 0). - -Next, if the driver is not configured to ignore the ACPI tables (see -`below `_), the idle states information provided by the -platform firmware is extracted from them. - -Then, ``CPUIdle`` device objects are allocated for all CPUs and the list of -available idle states is created as explained -`above `_. - -Finally, ``intel_idle`` is registered with the help of cpuidle_register_driver() -as the ``CPUIdle`` driver for all CPUs in the system and a CPU online callback -for configuring individual CPUs is registered via cpuhp_setup_state(), which -(among other things) causes the callback routine to be invoked for all of the -CPUs present in the system at that time (each CPU executes its own instance of -the callback routine). That routine registers a ``CPUIdle`` device for the CPU -running it (which enables the ``CPUIdle`` subsystem to operate that CPU) and -optionally performs some CPU-specific initialization actions that may be -required for the given processor model. - - -.. _intel-idle-parameters: - -Kernel Command Line Options and Module Parameters -================================================= - -The *x86* architecture support code recognizes three kernel command line -options related to CPU idle time management: ``idle=poll``, ``idle=halt``, -and ``idle=nomwait``. If any of them is present in the kernel command line, the -``MWAIT`` instruction is not allowed to be used, so the initialization of -``intel_idle`` will fail. - -Apart from that there are two module parameters recognized by ``intel_idle`` -itself that can be set via the kernel command line (they cannot be updated via -sysfs, so that is the only way to change their values). - -The ``max_cstate`` parameter value is the maximum idle state index in the list -of idle states supplied to the ``CPUIdle`` core during the registration of the -driver. It is also the maximum number of regular (non-polling) idle states that -can be used by ``intel_idle``, so the enumeration of idle states is terminated -after finding that number of usable idle states (the other idle states that -potentially might have been used if ``max_cstate`` had been greater are not -taken into consideration at all). 
Setting ``max_cstate`` can prevent -``intel_idle`` from exposing idle states that are regarded as "too deep" for -some reason to the ``CPUIdle`` core, but it does so by making them effectively -invisible until the system is shut down and started again which may not always -be desirable. In practice, it is only really necessary to do that if the idle -states in question cannot be enabled during system startup, because in the -working state of the system the CPU power management quality of service (PM -QoS) feature can be used to prevent ``CPUIdle`` from touching those idle states -even if they have been enumerated (see :ref:`cpu-pm-qos` in :doc:`cpuidle`). -Setting ``max_cstate`` to 0 causes the ``intel_idle`` initialization to fail. - -The ``noacpi`` module parameter (which is recognized by ``intel_idle`` if the -kernel has been configured with ACPI support), can be set to make the driver -ignore the system's ACPI tables entirely (it is unset by default). - - -.. _intel-idle-core-and-package-idle-states: - -Core and Package Levels of Idle States -====================================== - -Typically, in a processor supporting the ``MWAIT`` instruction there are (at -least) two levels of idle states (or C-states). One level, referred to as -"core C-states", covers individual cores in the processor, whereas the other -level, referred to as "package C-states", covers the entire processor package -and it may also involve other components of the system (GPUs, memory -controllers, I/O hubs etc.). - -Some of the ``MWAIT`` hint values allow the processor to use core C-states only -(most importantly, that is the case for the ``MWAIT`` hint value corresponding -to the ``C1`` idle state), but the majority of them give it a license to put -the target core (i.e. the core containing the logical CPU executing ``MWAIT`` -with the given hint value) into a specific core C-state and then (if possible) -to enter a specific package C-state at the deeper level. For example, the -``MWAIT`` hint value representing the ``C3`` idle state allows the processor to -put the target core into the low-power state referred to as "core ``C3``" (or -``CC3``), which happens if all of the logical CPUs (SMT siblings) in that core -have executed ``MWAIT`` with the ``C3`` hint value (or with a hint value -representing a deeper idle state), and in addition to that (in the majority of -cases) it gives the processor a license to put the entire package (possibly -including some non-CPU components such as a GPU or a memory controller) into the -low-power state referred to as "package ``C3``" (or ``PC3``), which happens if -all of the cores have gone into the ``CC3`` state and (possibly) some additional -conditions are satisfied (for instance, if the GPU is covered by ``PC3``, it may -be required to be in a certain GPU-specific low-power state for ``PC3`` to be -reachable). - -As a rule, there is no simple way to make the processor use core C-states only -if the conditions for entering the corresponding package C-states are met, so -the logical CPU executing ``MWAIT`` with a hint value that is not core-level -only (like for ``C1``) must always assume that this may cause the processor to -enter a package C-state. [That is why the exit latency and target residency -values corresponding to the majority of ``MWAIT`` hint values in the "internal" -tables of idle states in ``intel_idle`` reflect the properties of package -C-states.] 
If using package C-states is not desirable at all, either -:ref:`PM QoS ` or the ``max_cstate`` module parameter of -``intel_idle`` described `above `_ must be used to -restrict the range of permissible idle states to the ones with core-level only -``MWAIT`` hint values (like ``C1``). - - -References -========== - -.. [1] *Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 2B*, - https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-2b-manual.html - -.. [2] *Advanced Configuration and Power Interface (ACPI) Specification*, - https://uefi.org/specifications diff --git a/Documentation/admin-guide/pm/working-state.rst b/Documentation/admin-guide/pm/working-state.rst index 88f717e59a42..fc298eb1234b 100644 --- a/Documentation/admin-guide/pm/working-state.rst +++ b/Documentation/admin-guide/pm/working-state.rst @@ -8,7 +8,6 @@ Working-State Power Management :maxdepth: 2 cpuidle - intel_idle cpufreq intel_pstate intel_epb -- Gitee From 684c06562a2bd1a8960d76026a424e76c5057ff5 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Sun, 24 Apr 2022 11:44:18 +0800 Subject: [PATCH 1066/2161] Revert "Intel: ACPI: processor: Make ACPI_PROCESSOR_CSTATE depend on ACPI_PROCESSOR" commit opencloudos. This reverts commit 3e500689ff15b410e114ec8f8f38666db4704cc0. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 5559ef264060..ebe1e9e5fd81 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -241,7 +241,6 @@ config ACPI_CPU_FREQ_PSS config ACPI_PROCESSOR_CSTATE def_bool y - depends on ACPI_PROCESSOR depends on IA64 || X86 config ACPI_PROCESSOR_IDLE -- Gitee From b75b5ed3a7ebb3e53c496ba47b4e6345fe7d8905 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Sun, 24 Apr 2022 11:44:32 +0800 Subject: [PATCH 1067/2161] Revert "Intel: intel_idle: Use ACPI _CST on server systems" commit opencloudos. This reverts commit 477f166f1100bbec7984d7352703e830a8b9defd. 
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 70 +++++++++++---------------------------- 1 file changed, 20 insertions(+), 50 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 4b0ad1864d76..244633b2bc18 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -131,7 +131,7 @@ static struct cpuidle_state nehalem_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .flags = MWAIT2flg(0x01), .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, @@ -168,7 +168,7 @@ static struct cpuidle_state snb_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .flags = MWAIT2flg(0x01), .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, @@ -303,7 +303,7 @@ static struct cpuidle_state ivb_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .flags = MWAIT2flg(0x01), .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, @@ -348,7 +348,7 @@ static struct cpuidle_state ivt_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .flags = MWAIT2flg(0x01), .exit_latency = 10, .target_residency = 80, .enter = &intel_idle, @@ -385,7 +385,7 @@ static struct cpuidle_state ivt_cstates_4s[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .flags = MWAIT2flg(0x01), .exit_latency = 10, .target_residency = 250, .enter = &intel_idle, @@ -422,7 +422,7 @@ static struct cpuidle_state ivt_cstates_8s[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .flags = MWAIT2flg(0x01), .exit_latency = 10, .target_residency = 500, .enter = &intel_idle, @@ -459,7 +459,7 @@ static struct cpuidle_state hsw_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .flags = MWAIT2flg(0x01), .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, @@ -527,7 +527,7 @@ static struct cpuidle_state bdw_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .flags = MWAIT2flg(0x01), .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, @@ -596,7 +596,7 @@ static struct cpuidle_state skl_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .flags = MWAIT2flg(0x01), .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, @@ -665,7 +665,7 @@ static struct cpuidle_state skx_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .flags = MWAIT2flg(0x01), .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, @@ -844,7 +844,7 @@ static struct cpuidle_state bxt_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .flags = MWAIT2flg(0x01), .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, @@ -905,7 +905,7 @@ static struct cpuidle_state dnv_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .flags = MWAIT2flg(0x01), .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, @@ -1027,13 +1027,6 @@ static const struct idle_cpu idle_cpu_nehalem = { .disable_promotion_to_c1e = true, }; -static const 
struct idle_cpu idle_cpu_nhx = { - .state_table = nehalem_cstates, - .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, - .disable_promotion_to_c1e = true, - .use_acpi = true, -}; - static const struct idle_cpu idle_cpu_atom = { .state_table = atom_cstates, }; @@ -1052,12 +1045,6 @@ static const struct idle_cpu idle_cpu_snb = { .disable_promotion_to_c1e = true, }; -static const struct idle_cpu idle_cpu_snx = { - .state_table = snb_cstates, - .disable_promotion_to_c1e = true, - .use_acpi = true, -}; - static const struct idle_cpu idle_cpu_byt = { .state_table = byt_cstates, .disable_promotion_to_c1e = true, @@ -1078,7 +1065,6 @@ static const struct idle_cpu idle_cpu_ivb = { static const struct idle_cpu idle_cpu_ivt = { .state_table = ivt_cstates, .disable_promotion_to_c1e = true, - .use_acpi = true, }; static const struct idle_cpu idle_cpu_hsw = { @@ -1086,23 +1072,11 @@ static const struct idle_cpu idle_cpu_hsw = { .disable_promotion_to_c1e = true, }; -static const struct idle_cpu idle_cpu_hsx = { - .state_table = hsw_cstates, - .disable_promotion_to_c1e = true, - .use_acpi = true, -}; - static const struct idle_cpu idle_cpu_bdw = { .state_table = bdw_cstates, .disable_promotion_to_c1e = true, }; -static const struct idle_cpu idle_cpu_bdx = { - .state_table = bdw_cstates, - .disable_promotion_to_c1e = true, - .use_acpi = true, -}; - static const struct idle_cpu idle_cpu_skl = { .state_table = skl_cstates, .disable_promotion_to_c1e = true, @@ -1111,7 +1085,6 @@ static const struct idle_cpu idle_cpu_skl = { static const struct idle_cpu idle_cpu_skx = { .state_table = skx_cstates, .disable_promotion_to_c1e = true, - .use_acpi = true, }; static const struct idle_cpu idle_cpu_icx = { @@ -1123,12 +1096,10 @@ static const struct idle_cpu idle_cpu_icx = { static const struct idle_cpu idle_cpu_avn = { .state_table = avn_cstates, .disable_promotion_to_c1e = true, - .use_acpi = true, }; static const struct idle_cpu idle_cpu_knl = { .state_table = knl_cstates, - .use_acpi = true, }; static const struct idle_cpu idle_cpu_bxt = { @@ -1139,21 +1110,20 @@ static const struct idle_cpu idle_cpu_bxt = { static const struct idle_cpu idle_cpu_dnv = { .state_table = dnv_cstates, .disable_promotion_to_c1e = true, - .use_acpi = true, }; static const struct x86_cpu_id intel_idle_ids[] __initconst = { - INTEL_CPU_FAM6(NEHALEM_EP, idle_cpu_nhx), + INTEL_CPU_FAM6(NEHALEM_EP, idle_cpu_nehalem), INTEL_CPU_FAM6(NEHALEM, idle_cpu_nehalem), INTEL_CPU_FAM6(NEHALEM_G, idle_cpu_nehalem), INTEL_CPU_FAM6(WESTMERE, idle_cpu_nehalem), - INTEL_CPU_FAM6(WESTMERE_EP, idle_cpu_nhx), - INTEL_CPU_FAM6(NEHALEM_EX, idle_cpu_nhx), + INTEL_CPU_FAM6(WESTMERE_EP, idle_cpu_nehalem), + INTEL_CPU_FAM6(NEHALEM_EX, idle_cpu_nehalem), INTEL_CPU_FAM6(ATOM_BONNELL, idle_cpu_atom), INTEL_CPU_FAM6(ATOM_BONNELL_MID, idle_cpu_lincroft), - INTEL_CPU_FAM6(WESTMERE_EX, idle_cpu_nhx), + INTEL_CPU_FAM6(WESTMERE_EX, idle_cpu_nehalem), INTEL_CPU_FAM6(SANDYBRIDGE, idle_cpu_snb), - INTEL_CPU_FAM6(SANDYBRIDGE_X, idle_cpu_snx), + INTEL_CPU_FAM6(SANDYBRIDGE_X, idle_cpu_snb), INTEL_CPU_FAM6(ATOM_SALTWELL, idle_cpu_atom), INTEL_CPU_FAM6(ATOM_SILVERMONT, idle_cpu_byt), INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, idle_cpu_tangier), @@ -1161,14 +1131,14 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { INTEL_CPU_FAM6(IVYBRIDGE, idle_cpu_ivb), INTEL_CPU_FAM6(IVYBRIDGE_X, idle_cpu_ivt), INTEL_CPU_FAM6(HASWELL, idle_cpu_hsw), - INTEL_CPU_FAM6(HASWELL_X, idle_cpu_hsx), + INTEL_CPU_FAM6(HASWELL_X, idle_cpu_hsw), INTEL_CPU_FAM6(HASWELL_L, 
idle_cpu_hsw), INTEL_CPU_FAM6(HASWELL_G, idle_cpu_hsw), INTEL_CPU_FAM6(ATOM_SILVERMONT_D, idle_cpu_avn), INTEL_CPU_FAM6(BROADWELL, idle_cpu_bdw), INTEL_CPU_FAM6(BROADWELL_G, idle_cpu_bdw), - INTEL_CPU_FAM6(BROADWELL_X, idle_cpu_bdx), - INTEL_CPU_FAM6(BROADWELL_D, idle_cpu_bdx), + INTEL_CPU_FAM6(BROADWELL_X, idle_cpu_bdw), + INTEL_CPU_FAM6(BROADWELL_D, idle_cpu_bdw), INTEL_CPU_FAM6(SKYLAKE_L, idle_cpu_skl), INTEL_CPU_FAM6(SKYLAKE, idle_cpu_skl), INTEL_CPU_FAM6(KABYLAKE_L, idle_cpu_skl), -- Gitee From f1a84298b9ddd4f7e3243fd056ef84b167fcc3fb Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Sun, 24 Apr 2022 11:44:45 +0800 Subject: [PATCH 1068/2161] Revert "Intel: intel_idle: Add module parameter to prevent ACPI _CST from being used" commit opencloudos. This reverts commit a5346c01280391d1848a0e27fc4245163bacc785. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 244633b2bc18..5413197bca02 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1174,10 +1174,6 @@ static bool intel_idle_max_cstate_reached(int cstate) #ifdef CONFIG_ACPI_PROCESSOR_CSTATE #include -static bool no_acpi __read_mostly; -module_param(no_acpi, bool, 0444); -MODULE_PARM_DESC(no_acpi, "Do not use ACPI _CST for building the idle states list"); - static struct acpi_processor_power acpi_state_table; /** @@ -1207,11 +1203,6 @@ static bool intel_idle_acpi_cst_extract(void) { unsigned int cpu; - if (no_acpi) { - pr_debug("Not allowed to use ACPI _CST\n"); - return false; - } - for_each_possible_cpu(cpu) { struct acpi_processor *pr = per_cpu(processors, cpu); -- Gitee From 66618490e4649d57de502868faea101ad4699d6f Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Sun, 24 Apr 2022 11:44:57 +0800 Subject: [PATCH 1069/2161] Revert "Intel: intel_idle: Allow ACPI _CST to be used for selected known processors" commit opencloudos. This reverts commit 7afae7d3ecb7d04b8ec5a5afd12b6937a3557c23. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 45 +++------------------------------------ 1 file changed, 3 insertions(+), 42 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 5413197bca02..bad573e168d5 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -80,7 +80,6 @@ struct idle_cpu { unsigned long auto_demotion_disable_flags; bool byt_auto_demotion_disable_flag; bool disable_promotion_to_c1e; - bool use_acpi; }; static const struct idle_cpu *icpu; @@ -91,11 +90,6 @@ static void intel_idle_s2idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); static struct cpuidle_state *cpuidle_state_table; -/* - * Enable this state by default even if the ACPI _CST does not list it. - */ -#define CPUIDLE_FLAG_ALWAYS_ENABLE BIT(15) - /* * Set this flag for states where the HW flushes the TLB for us * and so we don't need cross-calls to keep it consistent. @@ -1272,33 +1266,9 @@ static void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) state->enter_s2idle = intel_idle_s2idle; } } - -static bool intel_idle_off_by_default(u32 mwait_hint) -{ - int cstate, limit; - - /* - * If there are no _CST C-states, do not disable any C-states by - * default. 
- */ - if (!acpi_state_table.count) - return false; - - limit = min_t(int, CPUIDLE_STATE_MAX, acpi_state_table.count); - /* - * If limit > 0, intel_idle_cst_usable() has returned 'true', so all of - * the interesting states are ACPI_CSTATE_FFH. - */ - for (cstate = 1; cstate < limit; cstate++) { - if (acpi_state_table.states[cstate].address == mwait_hint) - return false; - } - return true; -} #else /* !CONFIG_ACPI_PROCESSOR_CSTATE */ static inline bool intel_idle_acpi_cst_extract(void) { return false; } static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { } -static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; } #endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */ /* @@ -1339,13 +1309,10 @@ static int __init intel_idle_probe(void) pr_debug("MWAIT substates: 0x%x\n", mwait_substates); icpu = (const struct idle_cpu *)id->driver_data; - if (icpu) { + if (icpu) cpuidle_state_table = icpu->state_table; - if (icpu->use_acpi) - intel_idle_acpi_cst_extract(); - } else if (!intel_idle_acpi_cst_extract()) { + else if (!intel_idle_acpi_cst_extract()) return -ENODEV; - } pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n", boot_cpu_data.x86_model); @@ -1553,13 +1520,7 @@ static void intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) continue; /* Structure copy. */ - drv->states[drv->state_count] = cpuidle_state_table[cstate]; - - if (icpu->use_acpi && intel_idle_off_by_default(mwait_hint) && - !(cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_ALWAYS_ENABLE)) - drv->states[drv->state_count].flags |= CPUIDLE_FLAG_OFF; - - drv->state_count++; + drv->states[drv->state_count++] = cpuidle_state_table[cstate]; } if (icpu->byt_auto_demotion_disable_flag) { -- Gitee From 267911a37147b6398d557d53199741fc55660f25 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Sun, 24 Apr 2022 11:45:10 +0800 Subject: [PATCH 1070/2161] Revert "Intel: cpuidle: Allow idle states to be disabled by default" commit opencloudos. This reverts commit 489377c4fa5adb54aff670498597fb8d6babdbc0. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/ABI/testing/sysfs-devices-system-cpu | 6 ------ Documentation/admin-guide/pm/cpuidle.rst | 3 --- drivers/cpuidle/cpuidle.c | 7 +------ drivers/cpuidle/sysfs.c | 10 ---------- include/linux/cpuidle.h | 3 --- 5 files changed, 1 insertion(+), 28 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index b39531a3c5bc..c24afa60a30e 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -196,12 +196,6 @@ Description: does not reflect it. Likewise, if one enables a deep state but a lighter state still is disabled, then this has no effect. -What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/default_status -Date: December 2019 -KernelVersion: v5.6 -Contact: Linux power management list -Description: - (RO) The default status of this state, "enabled" or "disabled". What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/residency Date: March 2014 diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst index 311cd7cc2b75..e70b365dbc60 100644 --- a/Documentation/admin-guide/pm/cpuidle.rst +++ b/Documentation/admin-guide/pm/cpuidle.rst @@ -506,9 +506,6 @@ object corresponding to it, as follows: ``disable`` Whether or not this idle state is disabled. -``default_status`` - The default status of this state, "enabled" or "disabled". 
- ``latency`` Exit latency of the idle state in microseconds. diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 4c4ce65f399b..73f08cda21e0 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -569,17 +569,12 @@ static void __cpuidle_device_init(struct cpuidle_device *dev) */ static int __cpuidle_register_device(struct cpuidle_device *dev) { + int ret; struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); - int i, ret; if (!try_module_get(drv->owner)) return -EINVAL; - for (i = 0; i < drv->state_count; i++) { - if (drv->states[i].flags & CPUIDLE_FLAG_OFF) - dev->states_usage[i].disable |= CPUIDLE_STATE_DISABLED_BY_USER; - } - per_cpu(cpuidle_devices, dev->cpu) = dev; list_add(&dev->device_list, &cpuidle_detected_devices); diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index 306a6213df1c..f8747322b3c7 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -292,14 +292,6 @@ static ssize_t show_state_##_name(struct cpuidle_state *state, \ return sprintf(buf, "%s\n", state->_name);\ } -static ssize_t show_state_default_status(struct cpuidle_state *state, - struct cpuidle_state_usage *state_usage, - char *buf) -{ - return sprintf(buf, "%s\n", - state->flags & CPUIDLE_FLAG_OFF ? "disabled" : "enabled"); -} - define_show_state_function(exit_latency) define_show_state_function(target_residency) define_show_state_function(power_usage) @@ -322,7 +314,6 @@ define_one_state_ro(time, show_state_time); define_one_state_rw(disable, show_state_disable, store_state_disable); define_one_state_ro(above, show_state_above); define_one_state_ro(below, show_state_below); -define_one_state_ro(default_status, show_state_default_status); static struct attribute *cpuidle_state_default_attrs[] = { &attr_name.attr, @@ -335,7 +326,6 @@ static struct attribute *cpuidle_state_default_attrs[] = { &attr_disable.attr, &attr_above.attr, &attr_below.attr, - &attr_default_status.attr, NULL }; diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 4447967e7a9f..4b6b5bea8f79 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -29,8 +29,6 @@ struct cpuidle_driver; * CPUIDLE DEVICE INTERFACE * ****************************/ -#define CPUIDLE_STATE_DISABLED_BY_USER BIT(0) - struct cpuidle_state_usage { unsigned long long disable; unsigned long long usage; @@ -74,7 +72,6 @@ struct cpuidle_state { #define CPUIDLE_FLAG_POLLING BIT(0) /* polling state */ #define CPUIDLE_FLAG_COUPLED BIT(1) /* state applies to multiple cpus */ #define CPUIDLE_FLAG_TIMER_STOP BIT(2) /* timer is stopped on this state */ -#define CPUIDLE_FLAG_OFF BIT(4) /* disable this state by default */ struct cpuidle_device_kobj; struct cpuidle_state_kobj; -- Gitee From 8ea392630ea1306ae4332ce1d46ea6911d476486 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Sun, 24 Apr 2022 11:45:23 +0800 Subject: [PATCH 1071/2161] Revert "Intel: intel_idle: Use ACPI _CST for processor models without C-state tables" commit opencloudos. This reverts commit 489253b0ba5e3f9aa2ef073a07007b3eee316102. 
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 191 ++++++-------------------------------- 1 file changed, 29 insertions(+), 162 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index bad573e168d5..eafc441ebab7 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -41,7 +41,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include #include #include #include @@ -1148,129 +1147,6 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { {} }; -#define INTEL_CPU_FAM6_MWAIT \ - { X86_VENDOR_INTEL, 6, X86_MODEL_ANY, X86_FEATURE_MWAIT, 0 } - -static const struct x86_cpu_id intel_mwait_ids[] __initconst = { - INTEL_CPU_FAM6_MWAIT, - {} -}; - -static bool intel_idle_max_cstate_reached(int cstate) -{ - if (cstate + 1 > max_cstate) { - pr_info("max_cstate %d reached\n", max_cstate); - return true; - } - return false; -} - -#ifdef CONFIG_ACPI_PROCESSOR_CSTATE -#include - -static struct acpi_processor_power acpi_state_table; - -/** - * intel_idle_cst_usable - Check if the _CST information can be used. - * - * Check if all of the C-states listed by _CST in the max_cstate range are - * ACPI_CSTATE_FFH, which means that they should be entered via MWAIT. - */ -static bool intel_idle_cst_usable(void) -{ - int cstate, limit; - - limit = min_t(int, min_t(int, CPUIDLE_STATE_MAX, max_cstate + 1), - acpi_state_table.count); - - for (cstate = 1; cstate < limit; cstate++) { - struct acpi_processor_cx *cx = &acpi_state_table.states[cstate]; - - if (cx->entry_method != ACPI_CSTATE_FFH) - return false; - } - - return true; -} - -static bool intel_idle_acpi_cst_extract(void) -{ - unsigned int cpu; - - for_each_possible_cpu(cpu) { - struct acpi_processor *pr = per_cpu(processors, cpu); - - if (!pr) - continue; - - if (acpi_processor_evaluate_cst(pr->handle, cpu, &acpi_state_table)) - continue; - - acpi_state_table.count++; - - if (!intel_idle_cst_usable()) - continue; - - if (!acpi_processor_claim_cst_control()) { - acpi_state_table.count = 0; - return false; - } - - return true; - } - - pr_debug("ACPI _CST not found or not usable\n"); - return false; -} - -static void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) -{ - int cstate, limit = min_t(int, CPUIDLE_STATE_MAX, acpi_state_table.count); - - /* - * If limit > 0, intel_idle_cst_usable() has returned 'true', so all of - * the interesting states are ACPI_CSTATE_FFH. - */ - for (cstate = 1; cstate < limit; cstate++) { - struct acpi_processor_cx *cx; - struct cpuidle_state *state; - - if (intel_idle_max_cstate_reached(cstate)) - break; - - cx = &acpi_state_table.states[cstate]; - - state = &drv->states[drv->state_count++]; - - snprintf(state->name, CPUIDLE_NAME_LEN, "C%d_ACPI", cstate); - strlcpy(state->desc, cx->desc, CPUIDLE_DESC_LEN); - state->exit_latency = cx->latency; - /* - * For C1-type C-states use the same number for both the exit - * latency and target residency, because that is the case for - * C1 in the majority of the static C-states tables above. - * For the other types of C-states, however, set the target - * residency to 3 times the exit latency which should lead to - * a reasonable balance between energy-efficiency and - * performance in the majority of interesting cases. 
- */ - state->target_residency = cx->latency; - if (cx->type > ACPI_STATE_C1) - state->target_residency *= 3; - - state->flags = MWAIT2flg(cx->address); - if (cx->type > ACPI_STATE_C2) - state->flags |= CPUIDLE_FLAG_TLB_FLUSHED; - - state->enter = intel_idle; - state->enter_s2idle = intel_idle_s2idle; - } -} -#else /* !CONFIG_ACPI_PROCESSOR_CSTATE */ -static inline bool intel_idle_acpi_cst_extract(void) { return false; } -static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { } -#endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */ - /* * intel_idle_probe() */ @@ -1285,15 +1161,17 @@ static int __init intel_idle_probe(void) } id = x86_match_cpu(intel_idle_ids); - if (id) { - if (!boot_cpu_has(X86_FEATURE_MWAIT)) { - pr_debug("Please enable MWAIT in BIOS SETUP\n"); - return -ENODEV; - } - } else { - id = x86_match_cpu(intel_mwait_ids); - if (!id) - return -ENODEV; + if (!id) { + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6) + pr_debug("does not run on family %d model %d\n", + boot_cpu_data.x86, boot_cpu_data.x86_model); + return -ENODEV; + } + + if (!boot_cpu_has(X86_FEATURE_MWAIT)) { + pr_debug("Please enable MWAIT in BIOS SETUP\n"); + return -ENODEV; } if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) @@ -1309,10 +1187,7 @@ static int __init intel_idle_probe(void) pr_debug("MWAIT substates: 0x%x\n", mwait_substates); icpu = (const struct idle_cpu *)id->driver_data; - if (icpu) - cpuidle_state_table = icpu->state_table; - else if (!intel_idle_acpi_cst_extract()) - return -ENODEV; + cpuidle_state_table = icpu->state_table; pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n", boot_cpu_data.x86_model); @@ -1494,20 +1369,33 @@ static void intel_idle_state_table_update(void) } } -static void intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) +/* + * intel_idle_cpuidle_driver_init() + * allocate, initialize cpuidle_states + */ +static void __init intel_idle_cpuidle_driver_init(void) { int cstate; + struct cpuidle_driver *drv = &intel_idle_driver; + + intel_idle_state_table_update(); + + cpuidle_poll_state_init(drv); + drv->state_count = 1; for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { unsigned int mwait_hint; - if (intel_idle_max_cstate_reached(cstate)) - break; - if (!cpuidle_state_table[cstate].enter && !cpuidle_state_table[cstate].enter_s2idle) break; + if (cstate + 1 > max_cstate) { + pr_info("max_cstate %d reached\n", max_cstate); + break; + } + + /* If marked as unusable, skip this state. */ if (cpuidle_state_table[cstate].disabled != 0) { pr_debug("state %s is disabled\n", @@ -1529,24 +1417,6 @@ static void intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) } } -/* - * intel_idle_cpuidle_driver_init() - * allocate, initialize cpuidle_states - */ -static void __init intel_idle_cpuidle_driver_init(void) -{ - struct cpuidle_driver *drv = &intel_idle_driver; - - intel_idle_state_table_update(); - - cpuidle_poll_state_init(drv); - drv->state_count = 1; - - if (icpu) - intel_idle_init_cstates_icpu(drv); - else - intel_idle_init_cstates_acpi(drv); -} /* * intel_idle_cpu_init() @@ -1565,9 +1435,6 @@ static int intel_idle_cpu_init(unsigned int cpu) return -EIO; } - if (!icpu) - return 0; - if (icpu->auto_demotion_disable_flags) auto_demotion_disable(); -- Gitee From 5d31a1ea8a0021c18726cea9ae1e37812000bc63 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Sun, 24 Apr 2022 11:45:37 +0800 Subject: [PATCH 1072/2161] Revert "Intel: intel_idle: Refactor intel_idle_cpuidle_driver_init()" commit opencloudos. 
This reverts commit 222c40a22036aba1de454a6236f0f51fd178ba89. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 48 +++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index eafc441ebab7..3fe44d4cf6f7 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -973,22 +973,6 @@ static void intel_idle_s2idle(struct cpuidle_device *dev, mwait_idle_with_hints(eax, ecx); } -static bool intel_idle_verify_cstate(unsigned int mwait_hint) -{ - unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1; - unsigned int num_substates = (mwait_substates >> mwait_cstate * 4) & - MWAIT_SUBSTATE_MASK; - - /* Ignore the C-state if there are NO sub-states in CPUID for it. */ - if (num_substates == 0) - return false; - - if (mwait_cstate > 2 && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) - mark_tsc_unstable("TSC halts in idle states deeper than C2"); - - return true; -} - static void __setup_broadcast_timer(bool on) { if (on) @@ -1384,10 +1368,10 @@ static void __init intel_idle_cpuidle_driver_init(void) drv->state_count = 1; for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { - unsigned int mwait_hint; + int num_substates, mwait_hint, mwait_cstate; - if (!cpuidle_state_table[cstate].enter && - !cpuidle_state_table[cstate].enter_s2idle) + if ((cpuidle_state_table[cstate].enter == NULL) && + (cpuidle_state_table[cstate].enter_s2idle == NULL)) break; if (cstate + 1 > max_cstate) { @@ -1395,20 +1379,34 @@ static void __init intel_idle_cpuidle_driver_init(void) break; } + mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags); + mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint); + + /* number of sub-states for this state in CPUID.MWAIT */ + num_substates = (mwait_substates >> ((mwait_cstate + 1) * 4)) + & MWAIT_SUBSTATE_MASK; + + /* if NO sub-states for this state in CPUID, skip it */ + if (num_substates == 0) + continue; - /* If marked as unusable, skip this state. */ + /* if state marked as disabled, skip it */ if (cpuidle_state_table[cstate].disabled != 0) { pr_debug("state %s is disabled\n", cpuidle_state_table[cstate].name); continue; } - mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags); - if (!intel_idle_verify_cstate(mwait_hint)) - continue; - /* Structure copy. */ - drv->states[drv->state_count++] = cpuidle_state_table[cstate]; + if (((mwait_cstate + 1) > 2) && + !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + mark_tsc_unstable("TSC halts in idle" + " states deeper than C2"); + + drv->states[drv->state_count] = /* structure copy */ + cpuidle_state_table[cstate]; + + drv->state_count += 1; } if (icpu->byt_auto_demotion_disable_flag) { -- Gitee From 122d988b4dde84bd9e457c71a7552bb466fee2c9 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Sun, 24 Apr 2022 11:45:48 +0800 Subject: [PATCH 1073/2161] Revert "Intel: ACPI: processor: Export acpi_processor_evaluate_cst()" commit opencloudos. This reverts commit 253644e7d2554be0e4737720ab0e5ef4b1fc83ce. 
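As a reading aid for the _CST parsing code moved around in this and the following patches, the evaluated object has the shape below (an illustrative sketch following the ACPI specification, not code from this series):

	/*
	 * _CST return package, as consumed by the parse loop:
	 *   Package {
	 *       Count,                  // number of C-state sub-packages
	 *       Package {               // one sub-package per C-state
	 *           Register,           // Buffer: FFixedHW (MWAIT) or SystemIO
	 *           Type,               // Integer: 1 = C1, 2 = C2, 3 = C3
	 *           Latency,            // Integer: worst-case exit latency, us
	 *           Power               // Integer: average power, mW (unused here)
	 *       },
	 *       ...
	 *   }
	 * Hence elements[0] is validated as a buffer and cast to
	 * struct acpi_power_register, while elements[1..3] are read as integers.
	 */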
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/acpi_processor.c | 157 ---------------------------------- drivers/acpi/processor_idle.c | 142 ++++++++++++++++++++++++++++++ include/linux/acpi.h | 9 -- 3 files changed, 142 insertions(+), 166 deletions(-) diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index 5379bc3f275d..8a53f3c5b70e 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -729,161 +729,4 @@ bool acpi_processor_claim_cst_control(void) return true; } EXPORT_SYMBOL_GPL(acpi_processor_claim_cst_control); - -/** - * acpi_processor_evaluate_cst - Evaluate the processor _CST control method. - * @handle: ACPI handle of the processor object containing the _CST. - * @cpu: The numeric ID of the target CPU. - * @info: Object write the C-states information into. - * - * Extract the C-state information for the given CPU from the output of the _CST - * control method under the corresponding ACPI processor object (or processor - * device object) and populate @info with it. - * - * If any ACPI_ADR_SPACE_FIXED_HARDWARE C-states are found, invoke - * acpi_processor_ffh_cstate_probe() to verify them and update the - * cpu_cstate_entry data for @cpu. - */ -int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, - struct acpi_processor_power *info) -{ - struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; - union acpi_object *cst; - acpi_status status; - u64 count; - int last_index = 0; - int i, ret = 0; - - status = acpi_evaluate_object(handle, "_CST", NULL, &buffer); - if (ACPI_FAILURE(status)) { - acpi_handle_debug(handle, "No _CST\n"); - return -ENODEV; - } - - cst = buffer.pointer; - - /* There must be at least 2 elements. */ - if (!cst || cst->type != ACPI_TYPE_PACKAGE || cst->package.count < 2) { - acpi_handle_warn(handle, "Invalid _CST output\n"); - ret = -EFAULT; - goto end; - } - - count = cst->package.elements[0].integer.value; - - /* Validate the number of C-states. */ - if (count < 1 || count != cst->package.count - 1) { - acpi_handle_warn(handle, "Inconsistent _CST data\n"); - ret = -EFAULT; - goto end; - } - - for (i = 1; i <= count; i++) { - union acpi_object *element; - union acpi_object *obj; - struct acpi_power_register *reg; - struct acpi_processor_cx cx; - - /* - * If there is not enough space for all C-states, skip the - * excess ones and log a warning. - */ - if (last_index >= ACPI_PROCESSOR_MAX_POWER - 1) { - acpi_handle_warn(handle, - "No room for more idle states (limit: %d)\n", - ACPI_PROCESSOR_MAX_POWER - 1); - break; - } - - memset(&cx, 0, sizeof(cx)); - - element = &cst->package.elements[i]; - if (element->type != ACPI_TYPE_PACKAGE) - continue; - - if (element->package.count != 4) - continue; - - obj = &element->package.elements[0]; - - if (obj->type != ACPI_TYPE_BUFFER) - continue; - - reg = (struct acpi_power_register *)obj->buffer.pointer; - - obj = &element->package.elements[1]; - if (obj->type != ACPI_TYPE_INTEGER) - continue; - - cx.type = obj->integer.value; - /* - * There are known cases in which the _CST output does not - * contain C1, so if the type of the first state found is not - * C1, leave an empty slot for C1 to be filled in later. 
- */ - if (i == 1 && cx.type != ACPI_STATE_C1) - last_index = 1; - - cx.address = reg->address; - cx.index = last_index + 1; - - if (reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) { - if (!acpi_processor_ffh_cstate_probe(cpu, &cx, reg)) { - /* - * In the majority of cases _CST describes C1 as - * a FIXED_HARDWARE C-state, but if the command - * line forbids using MWAIT, use CSTATE_HALT for - * C1 regardless. - */ - if (cx.type == ACPI_STATE_C1 && - boot_option_idle_override == IDLE_NOMWAIT) { - cx.entry_method = ACPI_CSTATE_HALT; - snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); - } else { - cx.entry_method = ACPI_CSTATE_FFH; - } - } else if (cx.type == ACPI_STATE_C1) { - /* - * In the special case of C1, FIXED_HARDWARE can - * be handled by executing the HLT instruction. - */ - cx.entry_method = ACPI_CSTATE_HALT; - snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); - } else { - continue; - } - } else if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_IO) { - cx.entry_method = ACPI_CSTATE_SYSTEMIO; - snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI IOPORT 0x%x", - cx.address); - } else { - continue; - } - - if (cx.type == ACPI_STATE_C1) - cx.valid = 1; - - obj = &element->package.elements[2]; - if (obj->type != ACPI_TYPE_INTEGER) - continue; - - cx.latency = obj->integer.value; - - obj = &element->package.elements[3]; - if (obj->type != ACPI_TYPE_INTEGER) - continue; - - memcpy(&info->states[++last_index], &cx, sizeof(cx)); - } - - acpi_handle_info(handle, "Found %d idle states\n", last_index); - - info->count = last_index; - - end: - kfree(buffer.pointer); - - return ret; -} -EXPORT_SYMBOL_GPL(acpi_processor_evaluate_cst); #endif /* CONFIG_ACPI_PROCESSOR_CSTATE */ diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index a1b2f3ec49e7..306aeff7ed50 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -297,6 +297,148 @@ static int acpi_processor_get_power_info_default(struct acpi_processor *pr) return 0; } +static int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, + struct acpi_processor_power *info) +{ + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *cst; + acpi_status status; + u64 count; + int last_index = 0; + int i, ret = 0; + + status = acpi_evaluate_object(handle, "_CST", NULL, &buffer); + if (ACPI_FAILURE(status)) { + acpi_handle_debug(handle, "No _CST\n"); + return -ENODEV; + } + + cst = buffer.pointer; + + /* There must be at least 2 elements. */ + if (!cst || cst->type != ACPI_TYPE_PACKAGE || cst->package.count < 2) { + acpi_handle_warn(handle, "Invalid _CST output\n"); + ret = -EFAULT; + goto end; + } + + count = cst->package.elements[0].integer.value; + + /* Validate the number of C-states. */ + if (count < 1 || count != cst->package.count - 1) { + acpi_handle_warn(handle, "Inconsistent _CST data\n"); + ret = -EFAULT; + goto end; + } + + for (i = 1; i <= count; i++) { + union acpi_object *element; + union acpi_object *obj; + struct acpi_power_register *reg; + struct acpi_processor_cx cx; + + /* + * If there is not enough space for all C-states, skip the + * excess ones and log a warning. 
+ */ + if (last_index >= ACPI_PROCESSOR_MAX_POWER - 1) { + acpi_handle_warn(handle, + "No room for more idle states (limit: %d)\n", + ACPI_PROCESSOR_MAX_POWER - 1); + break; + } + + memset(&cx, 0, sizeof(cx)); + + element = &cst->package.elements[i]; + if (element->type != ACPI_TYPE_PACKAGE) + continue; + + if (element->package.count != 4) + continue; + + obj = &element->package.elements[0]; + + if (obj->type != ACPI_TYPE_BUFFER) + continue; + + reg = (struct acpi_power_register *)obj->buffer.pointer; + + obj = &element->package.elements[1]; + if (obj->type != ACPI_TYPE_INTEGER) + continue; + + cx.type = obj->integer.value; + /* + * There are known cases in which the _CST output does not + * contain C1, so if the type of the first state found is not + * C1, leave an empty slot for C1 to be filled in later. + */ + if (i == 1 && cx.type != ACPI_STATE_C1) + last_index = 1; + + cx.address = reg->address; + cx.index = last_index + 1; + + if (reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) { + if (!acpi_processor_ffh_cstate_probe(cpu, &cx, reg)) { + /* + * In the majority of cases _CST describes C1 as + * a FIXED_HARDWARE C-state, but if the command + * line forbids using MWAIT, use CSTATE_HALT for + * C1 regardless. + */ + if (cx.type == ACPI_STATE_C1 && + boot_option_idle_override == IDLE_NOMWAIT) { + cx.entry_method = ACPI_CSTATE_HALT; + snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); + } else { + cx.entry_method = ACPI_CSTATE_FFH; + } + } else if (cx.type == ACPI_STATE_C1) { + /* + * In the special case of C1, FIXED_HARDWARE can + * be handled by executing the HLT instruction. + */ + cx.entry_method = ACPI_CSTATE_HALT; + snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); + } else { + continue; + } + } else if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_IO) { + cx.entry_method = ACPI_CSTATE_SYSTEMIO; + snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI IOPORT 0x%x", + cx.address); + } else { + continue; + } + + if (cx.type == ACPI_STATE_C1) + cx.valid = 1; + + obj = &element->package.elements[2]; + if (obj->type != ACPI_TYPE_INTEGER) + continue; + + cx.latency = obj->integer.value; + + obj = &element->package.elements[3]; + if (obj->type != ACPI_TYPE_INTEGER) + continue; + + memcpy(&info->states[++last_index], &cx, sizeof(cx)); + } + + acpi_handle_info(handle, "Found %d idle states\n", last_index); + + info->count = last_index; + + end: + kfree(buffer.pointer); + + return ret; +} + static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) { int ret; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index fb43a7940a68..14aaa40323e9 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -284,19 +284,10 @@ static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id) /* Validate the processor object's proc_id */ bool acpi_duplicate_processor_id(int proc_id); /* Processor _CTS control */ -struct acpi_processor_power; - #ifdef CONFIG_ACPI_PROCESSOR_CSTATE bool acpi_processor_claim_cst_control(void); -int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, - struct acpi_processor_power *info); #else static inline bool acpi_processor_claim_cst_control(void) { return false; } -static inline int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, - struct acpi_processor_power *info) -{ - return -ENODEV; -} #endif #ifdef CONFIG_ACPI_HOTPLUG_CPU -- Gitee From 981cc88ac3698ef8eccc72d83417d16d39e1f5cc Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Sun, 24 Apr 2022 11:46:00 +0800 Subject: [PATCH 1074/2161] Revert "Intel: ACPI: processor: Clean up acpi_processor_evaluate_cst()" 
commit opencloudos. This reverts commit fcd3700ad6430c6c6f8054ade4a310e2525ebeae. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/processor_idle.c | 114 ++++++++++++++++++---------------- 1 file changed, 62 insertions(+), 52 deletions(-) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 306aeff7ed50..97c9ccf5a7e0 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -304,29 +304,29 @@ static int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, union acpi_object *cst; acpi_status status; u64 count; - int last_index = 0; + int current_count = 0; int i, ret = 0; status = acpi_evaluate_object(handle, "_CST", NULL, &buffer); if (ACPI_FAILURE(status)) { - acpi_handle_debug(handle, "No _CST\n"); + ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No _CST, giving up\n")); return -ENODEV; } cst = buffer.pointer; - /* There must be at least 2 elements. */ - if (!cst || cst->type != ACPI_TYPE_PACKAGE || cst->package.count < 2) { - acpi_handle_warn(handle, "Invalid _CST output\n"); + /* There must be at least 2 elements */ + if (!cst || (cst->type != ACPI_TYPE_PACKAGE) || cst->package.count < 2) { + pr_err("not enough elements in _CST\n"); ret = -EFAULT; goto end; } count = cst->package.elements[0].integer.value; - /* Validate the number of C-states. */ + /* Validate number of power states. */ if (count < 1 || count != cst->package.count - 1) { - acpi_handle_warn(handle, "Inconsistent _CST data\n"); + pr_err("count given by _CST is not valid\n"); ret = -EFAULT; goto end; } @@ -337,101 +337,111 @@ static int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, struct acpi_power_register *reg; struct acpi_processor_cx cx; - /* - * If there is not enough space for all C-states, skip the - * excess ones and log a warning. - */ - if (last_index >= ACPI_PROCESSOR_MAX_POWER - 1) { - acpi_handle_warn(handle, - "No room for more idle states (limit: %d)\n", - ACPI_PROCESSOR_MAX_POWER - 1); - break; - } - memset(&cx, 0, sizeof(cx)); - element = &cst->package.elements[i]; + element = &(cst->package.elements[i]); if (element->type != ACPI_TYPE_PACKAGE) continue; if (element->package.count != 4) continue; - obj = &element->package.elements[0]; + obj = &(element->package.elements[0]); if (obj->type != ACPI_TYPE_BUFFER) continue; reg = (struct acpi_power_register *)obj->buffer.pointer; - obj = &element->package.elements[1]; + if (reg->space_id != ACPI_ADR_SPACE_SYSTEM_IO && + (reg->space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) + continue; + + /* There should be an easy way to extract an integer... */ + obj = &(element->package.elements[1]); if (obj->type != ACPI_TYPE_INTEGER) continue; cx.type = obj->integer.value; /* - * There are known cases in which the _CST output does not - * contain C1, so if the type of the first state found is not - * C1, leave an empty slot for C1 to be filled in later. + * Some buggy BIOSes won't list C1 in _CST - + * Let acpi_processor_get_power_info_default() handle them later */ if (i == 1 && cx.type != ACPI_STATE_C1) - last_index = 1; + current_count++; cx.address = reg->address; - cx.index = last_index + 1; + cx.index = current_count + 1; + cx.entry_method = ACPI_CSTATE_SYSTEMIO; if (reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) { - if (!acpi_processor_ffh_cstate_probe(cpu, &cx, reg)) { - /* - * In the majority of cases _CST describes C1 as - * a FIXED_HARDWARE C-state, but if the command - * line forbids using MWAIT, use CSTATE_HALT for - * C1 regardless. 
- */ - if (cx.type == ACPI_STATE_C1 && - boot_option_idle_override == IDLE_NOMWAIT) { - cx.entry_method = ACPI_CSTATE_HALT; - snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); - } else { - cx.entry_method = ACPI_CSTATE_FFH; - } + if (acpi_processor_ffh_cstate_probe + (cpu, &cx, reg) == 0) { + cx.entry_method = ACPI_CSTATE_FFH; } else if (cx.type == ACPI_STATE_C1) { /* - * In the special case of C1, FIXED_HARDWARE can - * be handled by executing the HLT instruction. + * C1 is a special case where FIXED_HARDWARE + * can be handled in non-MWAIT way as well. + * In that case, save this _CST entry info. + * Otherwise, ignore this info and continue. */ cx.entry_method = ACPI_CSTATE_HALT; snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); } else { continue; } - } else if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_IO) { - cx.entry_method = ACPI_CSTATE_SYSTEMIO; + if (cx.type == ACPI_STATE_C1 && + (boot_option_idle_override == IDLE_NOMWAIT)) { + /* + * In most cases the C1 space_id obtained from + * _CST object is FIXED_HARDWARE access mode. + * But when the option of idle=halt is added, + * the entry_method type should be changed from + * CSTATE_FFH to CSTATE_HALT. + * When the option of idle=nomwait is added, + * the C1 entry_method type should be + * CSTATE_HALT. + */ + cx.entry_method = ACPI_CSTATE_HALT; + snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); + } + } else { snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI IOPORT 0x%x", cx.address); - } else { - continue; } - if (cx.type == ACPI_STATE_C1) + if (cx.type == ACPI_STATE_C1) { cx.valid = 1; + } - obj = &element->package.elements[2]; + obj = &(element->package.elements[2]); if (obj->type != ACPI_TYPE_INTEGER) continue; cx.latency = obj->integer.value; - obj = &element->package.elements[3]; + obj = &(element->package.elements[3]); if (obj->type != ACPI_TYPE_INTEGER) continue; - memcpy(&info->states[++last_index], &cx, sizeof(cx)); + current_count++; + memcpy(&info->states[current_count], &cx, sizeof(cx)); + + /* + * We support total ACPI_PROCESSOR_MAX_POWER - 1 + * (From 1 through ACPI_PROCESSOR_MAX_POWER - 1) + */ + if (current_count >= (ACPI_PROCESSOR_MAX_POWER - 1)) { + pr_warn("Limiting number of power states to max (%d)\n", + ACPI_PROCESSOR_MAX_POWER); + pr_warn("Please increase ACPI_PROCESSOR_MAX_POWER if needed.\n"); + break; + } } - acpi_handle_info(handle, "Found %d idle states\n", last_index); + acpi_handle_info(handle, "Found %d idle states\n", current_count); - info->count = last_index; + info->count = current_count; end: kfree(buffer.pointer); -- Gitee From ba141cfc01558f7c543fc52d773df310103714bd Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Sun, 24 Apr 2022 11:46:12 +0800 Subject: [PATCH 1075/2161] Revert "Intel: ACPI: processor: Introduce acpi_processor_evaluate_cst()" commit opencloudos. This reverts commit e008baa991effe6a2af8b6f546bfbab0b0897622. 
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/processor_idle.c | 52 ++++++++++++++--------------------- 1 file changed, 20 insertions(+), 32 deletions(-) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 97c9ccf5a7e0..f2e019b39661 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -297,17 +297,21 @@ static int acpi_processor_get_power_info_default(struct acpi_processor *pr) return 0; } -static int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, - struct acpi_processor_power *info) +static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) { - struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; - union acpi_object *cst; acpi_status status; u64 count; - int current_count = 0; + int current_count; int i, ret = 0; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *cst; + + if (nocst) + return -ENODEV; - status = acpi_evaluate_object(handle, "_CST", NULL, &buffer); + current_count = 0; + + status = acpi_evaluate_object(pr->handle, "_CST", NULL, &buffer); if (ACPI_FAILURE(status)) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No _CST, giving up\n")); return -ENODEV; @@ -331,6 +335,9 @@ static int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, goto end; } + /* Tell driver that at least _CST is supported. */ + pr->flags.has_cst = 1; + for (i = 1; i <= count; i++) { union acpi_object *element; union acpi_object *obj; @@ -376,7 +383,7 @@ static int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, cx.entry_method = ACPI_CSTATE_SYSTEMIO; if (reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) { if (acpi_processor_ffh_cstate_probe - (cpu, &cx, reg) == 0) { + (pr->id, &cx, reg) == 0) { cx.entry_method = ACPI_CSTATE_FFH; } else if (cx.type == ACPI_STATE_C1) { /* @@ -425,7 +432,7 @@ static int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, continue; current_count++; - memcpy(&info->states[current_count], &cx, sizeof(cx)); + memcpy(&(pr->power.states[current_count]), &cx, sizeof(cx)); /* * We support total ACPI_PROCESSOR_MAX_POWER - 1 @@ -439,9 +446,12 @@ static int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, } } - acpi_handle_info(handle, "Found %d idle states\n", current_count); + ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %d power states\n", + current_count)); - info->count = current_count; + /* Validate number of power states discovered */ + if (current_count < 2) + ret = -EFAULT; end: kfree(buffer.pointer); @@ -449,28 +459,6 @@ static int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, return ret; } -static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) -{ - int ret; - - if (nocst) - return -ENODEV; - - ret = acpi_processor_evaluate_cst(pr->handle, pr->id, &pr->power); - if (ret) - return ret; - - /* - * It is expected that there will be at least 2 states, C1 and - * something else (C2 or C3), so fail if that is not the case. - */ - if (pr->power.count < 2) - return -EFAULT; - - pr->flags.has_cst = 1; - return 0; -} - static void acpi_processor_power_verify_c3(struct acpi_processor *pr, struct acpi_processor_cx *cx) { -- Gitee From cccc47389d73749650485122a29b49ca71799495 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Sun, 24 Apr 2022 11:46:24 +0800 Subject: [PATCH 1076/2161] Revert "Intel: ACPI: processor: Export function to claim _CST control" commit opencloudos. This reverts commit d5d524770e0288e41c6df299ea82da99ca9d1af4. 
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/acpi_processor.c | 25 ------------------------- drivers/acpi/processor_idle.c | 12 ++++++++---- include/linux/acpi.h | 6 ------ 3 files changed, 8 insertions(+), 35 deletions(-) diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index 8a53f3c5b70e..2c4dda0787e8 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -705,28 +705,3 @@ void __init acpi_processor_init(void) acpi_scan_add_handler_with_hotplug(&processor_handler, "processor"); acpi_scan_add_handler(&processor_container_handler); } - -#ifdef CONFIG_ACPI_PROCESSOR_CSTATE -/** - * acpi_processor_claim_cst_control - Request _CST control from the platform. - */ -bool acpi_processor_claim_cst_control(void) -{ - static bool cst_control_claimed; - acpi_status status; - - if (!acpi_gbl_FADT.cst_control || cst_control_claimed) - return true; - - status = acpi_os_write_port(acpi_gbl_FADT.smi_command, - acpi_gbl_FADT.cst_control, 8); - if (ACPI_FAILURE(status)) { - pr_warn("ACPI: Failed to claim processor _CST control\n"); - return false; - } - - cst_control_claimed = true; - return true; -} -EXPORT_SYMBOL_GPL(acpi_processor_claim_cst_control); -#endif /* CONFIG_ACPI_PROCESSOR_CSTATE */ diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index f2e019b39661..ed56c6d20b08 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -900,6 +900,7 @@ static int acpi_processor_setup_cstates(struct acpi_processor *pr) static inline void acpi_processor_cstate_first_run_checks(void) { + acpi_status status; static int first_run; if (first_run) @@ -911,10 +912,13 @@ static inline void acpi_processor_cstate_first_run_checks(void) max_cstate); first_run++; - if (nocst) - return; - - acpi_processor_claim_cst_control(); + if (acpi_gbl_FADT.cst_control && !nocst) { + status = acpi_os_write_port(acpi_gbl_FADT.smi_command, + acpi_gbl_FADT.cst_control, 8); + if (ACPI_FAILURE(status)) + ACPI_EXCEPTION((AE_INFO, status, + "Notifying BIOS of _CST ability failed")); + } } #else diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 14aaa40323e9..f56ee0edd625 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -283,12 +283,6 @@ static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id) /* Validate the processor object's proc_id */ bool acpi_duplicate_processor_id(int proc_id); -/* Processor _CTS control */ -#ifdef CONFIG_ACPI_PROCESSOR_CSTATE -bool acpi_processor_claim_cst_control(void); -#else -static inline bool acpi_processor_claim_cst_control(void) { return false; } -#endif #ifdef CONFIG_ACPI_HOTPLUG_CPU /* Arch dependent functions for cpu hotplug support */ -- Gitee From 53d43a8cb33908f8daaa28d718fc7f7abc8dbbb4 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Sun, 24 Apr 2022 12:12:12 +0800 Subject: [PATCH 1077/2161] Revert "Intel: intel_idle: Customize IceLake server support" commit opencloudos. This reverts commit 1089fe3845313b55fe1bab78b7bc1d8e41016491. 
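For orientation when reading the removal below: every intel_idle state table is an array of struct cpuidle_state entries terminated by a sentinel whose .enter is NULL. The ICX C6 entry being dropped is representative of the format (shown here only as an illustration of the table layout; latencies are in microseconds and values are per-SKU):

	{
		.name = "C6",
		.desc = "MWAIT 0x20",
		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 128,		/* us */
		.target_residency = 384,	/* us */
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },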
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 36 ------------------------------------ 1 file changed, 36 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 3fe44d4cf6f7..347b08b56042 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -675,35 +675,6 @@ static struct cpuidle_state skx_cstates[] = { .enter = NULL } }; -static struct cpuidle_state icx_cstates[] = { - { - .name = "C1", - .desc = "MWAIT 0x00", - .flags = MWAIT2flg(0x00), - .exit_latency = 1, - .target_residency = 1, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { - .name = "C1E", - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 4, - .target_residency = 4, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { - .name = "C6", - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 128, - .target_residency = 384, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { - .enter = NULL } -}; - static struct cpuidle_state atom_cstates[] = { { .name = "C1E", @@ -1064,12 +1035,6 @@ static const struct idle_cpu idle_cpu_skx = { .disable_promotion_to_c1e = true, }; -static const struct idle_cpu idle_cpu_icx = { - .state_table = icx_cstates, - .disable_promotion_to_c1e = true, - .use_acpi = true, -}; - static const struct idle_cpu idle_cpu_avn = { .state_table = avn_cstates, .disable_promotion_to_c1e = true, @@ -1121,7 +1086,6 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { INTEL_CPU_FAM6(KABYLAKE_L, idle_cpu_skl), INTEL_CPU_FAM6(KABYLAKE, idle_cpu_skl), INTEL_CPU_FAM6(SKYLAKE_X, idle_cpu_skx), - INTEL_CPU_FAM6(ICELAKE_X, idle_cpu_icx), INTEL_CPU_FAM6(XEON_PHI_KNL, idle_cpu_knl), INTEL_CPU_FAM6(XEON_PHI_KNM, idle_cpu_knl), INTEL_CPU_FAM6(ATOM_GOLDMONT, idle_cpu_bxt), -- Gitee From 2e81badbb022c6daee1daff497354199fcfae3d8 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Thu, 24 Oct 2019 15:04:20 +0800 Subject: [PATCH 1078/2161] ACPI: processor_idle: Skip dummy wait if kernel is in guest commit fa583f71a99c85e52781ed877c82c8757437b680 upstream. In function acpi_idle_do_entry(), an ioport access is used for dummy wait to guarantee hardware behavior. But it could trigger unnecessary VMexit if kernel is running as guest in virtualization environment. If it's in virtualization environment, the deeper C state enter operation (inb()) will trap to hypervisor. It's not needed to do dummy wait after the inb() call. So we could just remove the dummy io port access to avoid unnecessary VMexit. And keep dummy io port access to maintain timing for native environment. Signed-off-by: Yin Fengwei Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/processor_idle.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index ed56c6d20b08..2ae95df2e74f 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -642,6 +642,19 @@ static int acpi_idle_bm_check(void) return bm_status; } +static void wait_for_freeze(void) +{ +#ifdef CONFIG_X86 + /* No delay is needed if we are in guest */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; +#endif + /* Dummy wait op - must do something useless after P_LVL2 read + because chipsets cannot guarantee that STPCLK# signal + gets asserted in time to freeze execution properly. 
*/ + inl(acpi_gbl_FADT.xpm_timer_block.address); +} + /** * acpi_idle_do_entry - enter idle state using the appropriate method * @cx: cstate data @@ -658,10 +671,7 @@ static void __cpuidle acpi_idle_do_entry(struct acpi_processor_cx *cx) } else { /* IO port based C-state */ inb(cx->address); - /* Dummy wait op - must do something useless after P_LVL2 read - because chipsets cannot guarantee that STPCLK# signal - gets asserted in time to freeze execution properly. */ - inl(acpi_gbl_FADT.xpm_timer_block.address); + wait_for_freeze(); } } @@ -682,8 +692,7 @@ static int acpi_idle_play_dead(struct cpuidle_device *dev, int index) safe_halt(); else if (cx->entry_method == ACPI_CSTATE_SYSTEMIO) { inb(cx->address); - /* See comment in acpi_idle_do_entry() */ - inl(acpi_gbl_FADT.xpm_timer_block.address); + wait_for_freeze(); } else return -ENODEV; } -- Gitee From daed295241a54aa5f1051953c1342d07ccf25d65 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 4 Nov 2019 12:16:17 +0100 Subject: [PATCH 1079/2161] cpuidle: Consolidate disabled state checks commit 99e98d3fb1008ef7416e16a1fd355cb73a253502 upstream. There are two reasons why CPU idle states may be disabled: either because the driver has disabled them or because they have been disabled by user space via sysfs. In the former case, the state's "disabled" flag is set once during the initialization of the driver and it is never cleared later (it is read-only effectively). In the latter case, the "disable" field of the given state's cpuidle_state_usage struct is set and it may be changed via sysfs. Thus checking whether or not an idle state has been disabled involves reading these two flags every time. In order to avoid the additional check of the state's "disabled" flag (which is effectively read-only anyway), use the value of it at the init time to set a (new) flag in the "disable" field of that state's cpuidle_state_usage structure and use the sysfs interface to manipulate another (new) flag in it. This way the state is disabled whenever the "disable" field of its cpuidle_state_usage structure is nonzero, whatever the reason, and it is the only place to look into to check whether or not the state has been disabled. Signed-off-by: Rafael J. 
Wysocki Acked-by: Daniel Lezcano Acked-by: Peter Zijlstra (Intel) Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/cpuidle-powernv.c | 7 ++-- drivers/cpuidle/cpuidle.c | 24 +++++++------- drivers/cpuidle/governors/ladder.c | 4 +-- drivers/cpuidle/governors/menu.c | 8 ++--- drivers/cpuidle/governors/teo.c | 5 ++- drivers/cpuidle/sysfs.c | 51 ++++++++++++++++++------------ include/linux/cpuidle.h | 3 ++ 7 files changed, 54 insertions(+), 48 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe212b3..1b299e801f74 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -56,13 +56,10 @@ static u64 get_snooze_timeout(struct cpuidle_device *dev, return default_snooze_timeout; for (i = index + 1; i < drv->state_count; i++) { - struct cpuidle_state *s = &drv->states[i]; - struct cpuidle_state_usage *su = &dev->states_usage[i]; - - if (s->disabled || su->disable) + if (dev->states_usage[i].disable) continue; - return s->target_residency * tb_ticks_per_usec; + return drv->states[i].target_residency * tb_ticks_per_usec; } return default_snooze_timeout; diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 73f08cda21e0..11e08ed7f4b3 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -84,12 +84,12 @@ static int find_deepest_state(struct cpuidle_driver *drv, for (i = 1; i < drv->state_count; i++) { struct cpuidle_state *s = &drv->states[i]; - struct cpuidle_state_usage *su = &dev->states_usage[i]; - if (s->disabled || su->disable || s->exit_latency <= latency_req - || s->exit_latency > max_latency - || (s->flags & forbidden_flags) - || (s2idle && !s->enter_s2idle)) + if (dev->states_usage[i].disable || + s->exit_latency <= latency_req || + s->exit_latency > max_latency || + (s->flags & forbidden_flags) || + (s2idle && !s->enter_s2idle)) continue; latency_req = s->exit_latency; @@ -266,8 +266,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, if (diff < drv->states[entered_state].target_residency) { for (i = entered_state - 1; i >= 0; i--) { - if (drv->states[i].disabled || - dev->states_usage[i].disable) + if (dev->states_usage[i].disable) continue; /* Shallower states are enabled, so update. 
*/ @@ -276,8 +275,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, } } else if (diff > delay) { for (i = entered_state + 1; i < drv->state_count; i++) { - if (drv->states[i].disabled || - dev->states_usage[i].disable) + if (dev->states_usage[i].disable) continue; /* @@ -381,7 +379,7 @@ u64 cpuidle_poll_time(struct cpuidle_driver *drv, limit_ns = TICK_NSEC; for (i = 1; i < drv->state_count; i++) { - if (drv->states[i].disabled || dev->states_usage[i].disable) + if (dev->states_usage[i].disable) continue; limit_ns = (u64)drv->states[i].target_residency * NSEC_PER_USEC; @@ -569,12 +567,16 @@ static void __cpuidle_device_init(struct cpuidle_device *dev) */ static int __cpuidle_register_device(struct cpuidle_device *dev) { - int ret; struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + int i, ret; if (!try_module_get(drv->owner)) return -EINVAL; + for (i = 0; i < drv->state_count; i++) + if (drv->states[i].disabled) + dev->states_usage[i].disable |= CPUIDLE_STATE_DISABLED_BY_DRIVER; + per_cpu(cpuidle_devices, dev->cpu) = dev; list_add(&dev->device_list, &cpuidle_detected_devices); diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index 428eeb832fe7..b0126b8c32fe 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -84,7 +84,6 @@ static int ladder_select_state(struct cpuidle_driver *drv, /* consider promotion */ if (last_idx < drv->state_count - 1 && - !drv->states[last_idx + 1].disabled && !dev->states_usage[last_idx + 1].disable && last_residency > last_state->threshold.promotion_time && drv->states[last_idx + 1].exit_latency <= latency_req) { @@ -98,8 +97,7 @@ static int ladder_select_state(struct cpuidle_driver *drv, /* consider demotion */ if (last_idx > first_idx && - (drv->states[last_idx].disabled || - dev->states_usage[last_idx].disable || + (dev->states_usage[last_idx].disable || drv->states[last_idx].exit_latency > latency_req)) { int i; diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index e5a5d0c8d66b..38b2b72102a8 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -298,7 +298,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (unlikely(drv->state_count <= 1 || latency_req == 0) || ((data->next_timer_us < drv->states[1].target_residency || latency_req < drv->states[1].exit_latency) && - !drv->states[0].disabled && !dev->states_usage[0].disable)) { + !dev->states_usage[0].disable)) { /* * In this case state[0] will be used no matter what, so return * it right away and keep the tick running if state[0] is a @@ -349,9 +349,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, idx = -1; for (i = 0; i < drv->state_count; i++) { struct cpuidle_state *s = &drv->states[i]; - struct cpuidle_state_usage *su = &dev->states_usage[i]; - if (s->disabled || su->disable) + if (dev->states_usage[i].disable) continue; if (idx == -1) @@ -422,8 +421,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * tick, so try to correct that. 
*/ for (i = idx - 1; i >= 0; i--) { - if (drv->states[i].disabled || - dev->states_usage[i].disable) + if (dev->states_usage[i].disable) continue; idx = i; diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index c71773c88890..e726bb1c6ba6 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -212,7 +212,7 @@ static int teo_find_shallower_state(struct cpuidle_driver *drv, int i; for (i = state_idx - 1; i >= 0; i--) { - if (drv->states[i].disabled || dev->states_usage[i].disable) + if (dev->states_usage[i].disable) continue; state_idx = i; @@ -257,9 +257,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, for (i = 0; i < drv->state_count; i++) { struct cpuidle_state *s = &drv->states[i]; - struct cpuidle_state_usage *su = &dev->states_usage[i]; - if (s->disabled || su->disable) { + if (dev->states_usage[i].disable) { /* * Ignore disabled states with target residencies beyond * the anticipated idle duration. diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index f8747322b3c7..b9d2bae2edb0 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -255,25 +255,6 @@ static ssize_t show_state_##_name(struct cpuidle_state *state, \ return sprintf(buf, "%u\n", state->_name);\ } -#define define_store_state_ull_function(_name) \ -static ssize_t store_state_##_name(struct cpuidle_state *state, \ - struct cpuidle_state_usage *state_usage, \ - const char *buf, size_t size) \ -{ \ - unsigned long long value; \ - int err; \ - if (!capable(CAP_SYS_ADMIN)) \ - return -EPERM; \ - err = kstrtoull(buf, 0, &value); \ - if (err) \ - return err; \ - if (value) \ - state_usage->_name = 1; \ - else \ - state_usage->_name = 0; \ - return size; \ -} - #define define_show_state_ull_function(_name) \ static ssize_t show_state_##_name(struct cpuidle_state *state, \ struct cpuidle_state_usage *state_usage, \ @@ -299,11 +280,39 @@ define_show_state_ull_function(usage) define_show_state_ull_function(time) define_show_state_str_function(name) define_show_state_str_function(desc) -define_show_state_ull_function(disable) -define_store_state_ull_function(disable) define_show_state_ull_function(above) define_show_state_ull_function(below) +static ssize_t show_state_disable(struct cpuidle_state *state, + struct cpuidle_state_usage *state_usage, + char *buf) +{ + return sprintf(buf, "%llu\n", + state_usage->disable & CPUIDLE_STATE_DISABLED_BY_USER); +} + +static ssize_t store_state_disable(struct cpuidle_state *state, + struct cpuidle_state_usage *state_usage, + const char *buf, size_t size) +{ + unsigned int value; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + err = kstrtouint(buf, 0, &value); + if (err) + return err; + + if (value) + state_usage->disable |= CPUIDLE_STATE_DISABLED_BY_USER; + else + state_usage->disable &= ~CPUIDLE_STATE_DISABLED_BY_USER; + + return size; +} + define_one_state_ro(name, show_state_name); define_one_state_ro(desc, show_state_desc); define_one_state_ro(latency, show_state_exit_latency); diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 4b6b5bea8f79..d23a3b1ddcf6 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -29,6 +29,9 @@ struct cpuidle_driver; * CPUIDLE DEVICE INTERFACE * ****************************/ +#define CPUIDLE_STATE_DISABLED_BY_USER BIT(0) +#define CPUIDLE_STATE_DISABLED_BY_DRIVER BIT(1) + struct cpuidle_state_usage { unsigned long long disable; unsigned long long usage; -- Gitee From 
374fca7377eb29b78e3082040fc09109e8ba8819 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 7 Nov 2019 15:25:12 +0100 Subject: [PATCH 1080/2161] cpuidle: Use nanoseconds as the unit of time commit c1d51f684c72b5eb2aecbbd47be3a2977a2dc903 upstream. Currently, the cpuidle subsystem uses microseconds as the unit of time which (among other things) causes the idle loop to incur some integer division overhead for no clear benefit. In order to allow cpuidle to measure time in nanoseconds, add two new fields, exit_latency_ns and target_residency_ns, to represent the exit latency and target residency of an idle state in nanoseconds, respectively, to struct cpuidle_state and initialize them with the help of the corresponding values in microseconds provided by drivers. Additionally, change cpuidle_governor_latency_req() to return the idle state exit latency constraint in nanoseconds. Also meeasure idle state residency (last_residency_ns in struct cpuidle_device and time_ns in struct cpuidle_driver) in nanoseconds and update the cpuidle core and governors accordingly. However, the menu governor still computes typical intervals in microseconds to avoid integer overflows. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Doug Smythies Tested-by: Doug Smythies Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/cpuidle.c | 36 ++++---- drivers/cpuidle/driver.c | 29 +++++-- drivers/cpuidle/governor.c | 7 +- drivers/cpuidle/governors/haltpoll.c | 7 +- drivers/cpuidle/governors/ladder.c | 25 +++--- drivers/cpuidle/governors/menu.c | 123 ++++++++++++--------------- drivers/cpuidle/governors/teo.c | 76 ++++++++--------- drivers/cpuidle/poll_state.c | 2 + drivers/cpuidle/sysfs.c | 20 ++++- include/linux/cpuidle.h | 8 +- kernel/sched/idle.c | 2 +- 11 files changed, 174 insertions(+), 161 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 11e08ed7f4b3..936cc580a4cd 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -75,24 +75,24 @@ int cpuidle_play_dead(void) static int find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, - unsigned int max_latency, + u64 max_latency_ns, unsigned int forbidden_flags, bool s2idle) { - unsigned int latency_req = 0; + u64 latency_req = 0; int i, ret = 0; for (i = 1; i < drv->state_count; i++) { struct cpuidle_state *s = &drv->states[i]; if (dev->states_usage[i].disable || - s->exit_latency <= latency_req || - s->exit_latency > max_latency || + s->exit_latency_ns <= latency_req || + s->exit_latency_ns > max_latency_ns || (s->flags & forbidden_flags) || (s2idle && !s->enter_s2idle)) continue; - latency_req = s->exit_latency; + latency_req = s->exit_latency_ns; ret = i; } return ret; @@ -124,7 +124,7 @@ void cpuidle_use_deepest_state(bool enable) int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev) { - return find_deepest_state(drv, dev, UINT_MAX, 0, false); + return find_deepest_state(drv, dev, U64_MAX, 0, false); } #ifdef CONFIG_SUSPEND @@ -181,7 +181,7 @@ int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) * that interrupts won't be enabled when it exits and allows the tick to * be frozen safely. 
*/ - index = find_deepest_state(drv, dev, UINT_MAX, 0, true); + index = find_deepest_state(drv, dev, U64_MAX, 0, true); if (index > 0) enter_s2idle_proper(drv, dev, index); @@ -210,7 +210,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, * CPU as a broadcast timer, this call may fail if it is not available. */ if (broadcast && tick_broadcast_enter()) { - index = find_deepest_state(drv, dev, target_state->exit_latency, + index = find_deepest_state(drv, dev, target_state->exit_latency_ns, CPUIDLE_FLAG_TIMER_STOP, false); if (index < 0) { default_idle_call(); @@ -248,7 +248,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, local_irq_enable(); if (entered_state >= 0) { - s64 diff, delay = drv->states[entered_state].exit_latency; + s64 diff, delay = drv->states[entered_state].exit_latency_ns; int i; /* @@ -256,15 +256,13 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, * This can be moved to within driver enter routine, * but that results in multiple copies of same code. */ - diff = ktime_us_delta(time_end, time_start); - if (diff > INT_MAX) - diff = INT_MAX; + diff = ktime_sub(time_end, time_start); - dev->last_residency = (int)diff; - dev->states_usage[entered_state].time += dev->last_residency; + dev->last_residency_ns = diff; + dev->states_usage[entered_state].time_ns += diff; dev->states_usage[entered_state].usage++; - if (diff < drv->states[entered_state].target_residency) { + if (diff < drv->states[entered_state].target_residency_ns) { for (i = entered_state - 1; i >= 0; i--) { if (dev->states_usage[i].disable) continue; @@ -282,14 +280,14 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, * Update if a deeper state would have been a * better match for the observed idle duration. */ - if (diff - delay >= drv->states[i].target_residency) + if (diff - delay >= drv->states[i].target_residency_ns) dev->states_usage[entered_state].below++; break; } } } else { - dev->last_residency = 0; + dev->last_residency_ns = 0; } return entered_state; @@ -382,7 +380,7 @@ u64 cpuidle_poll_time(struct cpuidle_driver *drv, if (dev->states_usage[i].disable) continue; - limit_ns = (u64)drv->states[i].target_residency * NSEC_PER_USEC; + limit_ns = (u64)drv->states[i].target_residency_ns; break; } @@ -554,7 +552,7 @@ static void __cpuidle_unregister_device(struct cpuidle_device *dev) static void __cpuidle_device_init(struct cpuidle_device *dev) { memset(dev->states_usage, 0, sizeof(dev->states_usage)); - dev->last_residency = 0; + dev->last_residency_ns = 0; dev->next_hrtimer = 0; } diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index 9db154224999..fcaf8b2bab96 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -165,16 +165,27 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv) if (!drv->cpumask) drv->cpumask = (struct cpumask *)cpu_possible_mask; - /* - * Look for the timer stop flag in the different states, so that we know - * if the broadcast timer has to be set up. The loop is in the reverse - * order, because usually one of the deeper states have this flag set. - */ - for (i = drv->state_count - 1; i >= 0 ; i--) { - if (drv->states[i].flags & CPUIDLE_FLAG_TIMER_STOP) { + for (i = 0; i < drv->state_count; i++) { + struct cpuidle_state *s = &drv->states[i]; + + /* + * Look for the timer stop flag in the different states and if + * it is found, indicate that the broadcast timer has to be set + * up. 
+ */ + if (s->flags & CPUIDLE_FLAG_TIMER_STOP) drv->bctimer = 1; - break; - } + + /* + * The core will use the target residency and exit latency + * values in nanoseconds, but allow drivers to provide them in + * microseconds too. + */ + if (s->target_residency > 0) + s->target_residency_ns = s->target_residency * NSEC_PER_USEC; + + if (s->exit_latency > 0) + s->exit_latency_ns = s->exit_latency * NSEC_PER_USEC; } } diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c index e9801f26c732..e48271e117a3 100644 --- a/drivers/cpuidle/governor.c +++ b/drivers/cpuidle/governor.c @@ -107,11 +107,14 @@ int cpuidle_register_governor(struct cpuidle_governor *gov) * cpuidle_governor_latency_req - Compute a latency constraint for CPU * @cpu: Target CPU */ -int cpuidle_governor_latency_req(unsigned int cpu) +s64 cpuidle_governor_latency_req(unsigned int cpu) { int global_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); struct device *device = get_cpu_device(cpu); int device_req = dev_pm_qos_raw_resume_latency(device); - return device_req < global_req ? device_req : global_req; + if (device_req > global_req) + device_req = global_req; + + return (s64)device_req * NSEC_PER_USEC; } diff --git a/drivers/cpuidle/governors/haltpoll.c b/drivers/cpuidle/governors/haltpoll.c index 7a703d2e0064..cb2a96eafc02 100644 --- a/drivers/cpuidle/governors/haltpoll.c +++ b/drivers/cpuidle/governors/haltpoll.c @@ -49,7 +49,7 @@ static int haltpoll_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, bool *stop_tick) { - int latency_req = cpuidle_governor_latency_req(dev->cpu); + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); if (!drv->state_count || latency_req == 0) { *stop_tick = false; @@ -75,10 +75,9 @@ static int haltpoll_select(struct cpuidle_driver *drv, return 0; } -static void adjust_poll_limit(struct cpuidle_device *dev, unsigned int block_us) +static void adjust_poll_limit(struct cpuidle_device *dev, u64 block_ns) { unsigned int val; - u64 block_ns = block_us*NSEC_PER_USEC; /* Grow cpu_halt_poll_us if * cpu_halt_poll_us < block_ns < guest_halt_poll_us @@ -115,7 +114,7 @@ static void haltpoll_reflect(struct cpuidle_device *dev, int index) dev->last_state_idx = index; if (index != 0) - adjust_poll_limit(dev, dev->last_residency); + adjust_poll_limit(dev, dev->last_residency_ns); } /** diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index b0126b8c32fe..8e9058c4ea63 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -27,8 +27,8 @@ struct ladder_device_state { struct { u32 promotion_count; u32 demotion_count; - u32 promotion_time; - u32 demotion_time; + u64 promotion_time_ns; + u64 demotion_time_ns; } threshold; struct { int promotion_count; @@ -68,9 +68,10 @@ static int ladder_select_state(struct cpuidle_driver *drv, { struct ladder_device *ldev = this_cpu_ptr(&ladder_devices); struct ladder_device_state *last_state; - int last_residency, last_idx = dev->last_state_idx; + int last_idx = dev->last_state_idx; int first_idx = drv->states[0].flags & CPUIDLE_FLAG_POLLING ? 
1 : 0; - int latency_req = cpuidle_governor_latency_req(dev->cpu); + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); + s64 last_residency; /* Special case when user has set very strict latency requirement */ if (unlikely(latency_req == 0)) { @@ -80,13 +81,13 @@ static int ladder_select_state(struct cpuidle_driver *drv, last_state = &ldev->states[last_idx]; - last_residency = dev->last_residency - drv->states[last_idx].exit_latency; + last_residency = dev->last_residency_ns - drv->states[last_idx].exit_latency_ns; /* consider promotion */ if (last_idx < drv->state_count - 1 && !dev->states_usage[last_idx + 1].disable && - last_residency > last_state->threshold.promotion_time && - drv->states[last_idx + 1].exit_latency <= latency_req) { + last_residency > last_state->threshold.promotion_time_ns && + drv->states[last_idx + 1].exit_latency_ns <= latency_req) { last_state->stats.promotion_count++; last_state->stats.demotion_count = 0; if (last_state->stats.promotion_count >= last_state->threshold.promotion_count) { @@ -98,11 +99,11 @@ static int ladder_select_state(struct cpuidle_driver *drv, /* consider demotion */ if (last_idx > first_idx && (dev->states_usage[last_idx].disable || - drv->states[last_idx].exit_latency > latency_req)) { + drv->states[last_idx].exit_latency_ns > latency_req)) { int i; for (i = last_idx - 1; i > first_idx; i--) { - if (drv->states[i].exit_latency <= latency_req) + if (drv->states[i].exit_latency_ns <= latency_req) break; } ladder_do_selection(dev, ldev, last_idx, i); @@ -110,7 +111,7 @@ static int ladder_select_state(struct cpuidle_driver *drv, } if (last_idx > first_idx && - last_residency < last_state->threshold.demotion_time) { + last_residency < last_state->threshold.demotion_time_ns) { last_state->stats.demotion_count++; last_state->stats.promotion_count = 0; if (last_state->stats.demotion_count >= last_state->threshold.demotion_count) { @@ -150,9 +151,9 @@ static int ladder_enable_device(struct cpuidle_driver *drv, lstate->threshold.demotion_count = DEMOTION_COUNT; if (i < drv->state_count - 1) - lstate->threshold.promotion_time = state->exit_latency; + lstate->threshold.promotion_time_ns = state->exit_latency_ns; if (i > first_idx) - lstate->threshold.demotion_time = state->exit_latency; + lstate->threshold.demotion_time_ns = state->exit_latency_ns; } return 0; diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 38b2b72102a8..b0a7ad566081 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -19,22 +19,12 @@ #include #include -/* - * Please note when changing the tuning values: - * If (MAX_INTERESTING-1) * RESOLUTION > UINT_MAX, the result of - * a scaling operation multiplication may overflow on 32 bit platforms. - * In that case, #define RESOLUTION as ULL to get 64 bit result: - * #define RESOLUTION 1024ULL - * - * The default values do not overflow. 
- */ #define BUCKETS 12 #define INTERVAL_SHIFT 3 #define INTERVALS (1UL << INTERVAL_SHIFT) #define RESOLUTION 1024 #define DECAY 8 -#define MAX_INTERESTING 50000 - +#define MAX_INTERESTING (50000 * NSEC_PER_USEC) /* * Concepts and ideas behind the menu governor @@ -120,14 +110,14 @@ struct menu_device { int needs_update; int tick_wakeup; - unsigned int next_timer_us; + u64 next_timer_ns; unsigned int bucket; unsigned int correction_factor[BUCKETS]; unsigned int intervals[INTERVALS]; int interval_ptr; }; -static inline int which_bucket(unsigned int duration, unsigned long nr_iowaiters) +static inline int which_bucket(u64 duration_ns, unsigned long nr_iowaiters) { int bucket = 0; @@ -140,15 +130,15 @@ static inline int which_bucket(unsigned int duration, unsigned long nr_iowaiters if (nr_iowaiters) bucket = BUCKETS/2; - if (duration < 10) + if (duration_ns < 10ULL * NSEC_PER_USEC) return bucket; - if (duration < 100) + if (duration_ns < 100ULL * NSEC_PER_USEC) return bucket + 1; - if (duration < 1000) + if (duration_ns < 1000ULL * NSEC_PER_USEC) return bucket + 2; - if (duration < 10000) + if (duration_ns < 10000ULL * NSEC_PER_USEC) return bucket + 3; - if (duration < 100000) + if (duration_ns < 100000ULL * NSEC_PER_USEC) return bucket + 4; return bucket + 5; } @@ -276,13 +266,13 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, bool *stop_tick) { struct menu_device *data = this_cpu_ptr(&menu_devices); - int latency_req = cpuidle_governor_latency_req(dev->cpu); - int i; - int idx; - unsigned int interactivity_req; + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); unsigned int predicted_us; + u64 predicted_ns; + u64 interactivity_req; unsigned long nr_iowaiters; ktime_t delta_next; + int i, idx; if (data->needs_update) { menu_update(drv, dev); @@ -290,14 +280,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } /* determine the expected residency time, round up */ - data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length(&delta_next)); + data->next_timer_ns = tick_nohz_get_sleep_length(&delta_next); nr_iowaiters = nr_iowait_cpu(dev->cpu); - data->bucket = which_bucket(data->next_timer_us, nr_iowaiters); + data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters); if (unlikely(drv->state_count <= 1 || latency_req == 0) || - ((data->next_timer_us < drv->states[1].target_residency || - latency_req < drv->states[1].exit_latency) && + ((data->next_timer_ns < drv->states[1].target_residency_ns || + latency_req < drv->states[1].exit_latency_ns) && !dev->states_usage[0].disable)) { /* * In this case state[0] will be used no matter what, so return @@ -308,18 +298,15 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, return 0; } - /* - * Force the result of multiplication to be 64 bits even if both - * operands are 32 bits. - * Make sure to round up for half microseconds. - */ - predicted_us = DIV_ROUND_CLOSEST_ULL((uint64_t)data->next_timer_us * - data->correction_factor[data->bucket], - RESOLUTION * DECAY); - /* - * Use the lowest expected idle interval to pick the idle state. - */ - predicted_us = min(predicted_us, get_typical_interval(data, predicted_us)); + /* Round up the result for half microseconds. */ + predicted_us = div_u64(data->next_timer_ns * + data->correction_factor[data->bucket] + + (RESOLUTION * DECAY * NSEC_PER_USEC) / 2, + RESOLUTION * DECAY * NSEC_PER_USEC); + /* Use the lowest expected idle interval to pick the idle state. 
*/ + predicted_ns = (u64)min(predicted_us, + get_typical_interval(data, predicted_us)) * + NSEC_PER_USEC; if (tick_nohz_tick_stopped()) { /* @@ -330,14 +317,15 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * the known time till the closest timer event for the idle * state selection. */ - if (predicted_us < TICK_USEC) - predicted_us = ktime_to_us(delta_next); + if (predicted_ns < TICK_NSEC) + predicted_ns = delta_next; } else { /* * Use the performance multiplier and the user-configurable * latency_req to determine the maximum exit latency. */ - interactivity_req = predicted_us / performance_multiplier(nr_iowaiters); + interactivity_req = div64_u64(predicted_ns, + performance_multiplier(nr_iowaiters)); if (latency_req > interactivity_req) latency_req = interactivity_req; } @@ -356,19 +344,19 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (idx == -1) idx = i; /* first enabled state */ - if (s->target_residency > predicted_us) { + if (s->target_residency_ns > predicted_ns) { /* * Use a physical idle state, not busy polling, unless * a timer is going to trigger soon enough. */ if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) && - s->exit_latency <= latency_req && - s->target_residency <= data->next_timer_us) { - predicted_us = s->target_residency; + s->exit_latency_ns <= latency_req && + s->target_residency_ns <= data->next_timer_ns) { + predicted_ns = s->target_residency_ns; idx = i; break; } - if (predicted_us < TICK_USEC) + if (predicted_ns < TICK_NSEC) break; if (!tick_nohz_tick_stopped()) { @@ -378,7 +366,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * tick in that case and let the governor run * again in the next iteration of the loop. */ - predicted_us = drv->states[idx].target_residency; + predicted_ns = drv->states[idx].target_residency_ns; break; } @@ -388,13 +376,13 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * closest timer event, select this one to avoid getting * stuck in the shallow one for too long. */ - if (drv->states[idx].target_residency < TICK_USEC && - s->target_residency <= ktime_to_us(delta_next)) + if (drv->states[idx].target_residency_ns < TICK_NSEC && + s->target_residency_ns <= delta_next) idx = i; return idx; } - if (s->exit_latency > latency_req) + if (s->exit_latency_ns > latency_req) break; idx = i; @@ -408,12 +396,10 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * expected idle duration is shorter than the tick period length. 
*/ if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || - predicted_us < TICK_USEC) && !tick_nohz_tick_stopped()) { - unsigned int delta_next_us = ktime_to_us(delta_next); - + predicted_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) { *stop_tick = false; - if (idx > 0 && drv->states[idx].target_residency > delta_next_us) { + if (idx > 0 && drv->states[idx].target_residency_ns > delta_next) { /* * The tick is not going to be stopped and the target * residency of the state to be returned is not within @@ -425,7 +411,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, continue; idx = i; - if (drv->states[i].target_residency <= delta_next_us) + if (drv->states[i].target_residency_ns <= delta_next) break; } } @@ -461,7 +447,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) struct menu_device *data = this_cpu_ptr(&menu_devices); int last_idx = dev->last_state_idx; struct cpuidle_state *target = &drv->states[last_idx]; - unsigned int measured_us; + u64 measured_ns; unsigned int new_factor; /* @@ -479,7 +465,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * assume the state was never reached and the exit latency is 0. */ - if (data->tick_wakeup && data->next_timer_us > TICK_USEC) { + if (data->tick_wakeup && data->next_timer_ns > TICK_NSEC) { /* * The nohz code said that there wouldn't be any events within * the tick boundary (if the tick was stopped), but the idle @@ -489,7 +475,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * have been idle long (but not forever) to help the idle * duration predictor do a better job next time. */ - measured_us = 9 * MAX_INTERESTING / 10; + measured_ns = 9 * MAX_INTERESTING / 10; } else if ((drv->states[last_idx].flags & CPUIDLE_FLAG_POLLING) && dev->poll_time_limit) { /* @@ -499,28 +485,29 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * the CPU might have been woken up from idle by the next timer. * Assume that to be the case. 
*/ - measured_us = data->next_timer_us; + measured_ns = data->next_timer_ns; } else { /* measured value */ - measured_us = dev->last_residency; + measured_ns = dev->last_residency_ns; /* Deduct exit latency */ - if (measured_us > 2 * target->exit_latency) - measured_us -= target->exit_latency; + if (measured_ns > 2 * target->exit_latency_ns) + measured_ns -= target->exit_latency_ns; else - measured_us /= 2; + measured_ns /= 2; } /* Make sure our coefficients do not exceed unity */ - if (measured_us > data->next_timer_us) - measured_us = data->next_timer_us; + if (measured_ns > data->next_timer_ns) + measured_ns = data->next_timer_ns; /* Update our correction ratio */ new_factor = data->correction_factor[data->bucket]; new_factor -= new_factor / DECAY; - if (data->next_timer_us > 0 && measured_us < MAX_INTERESTING) - new_factor += RESOLUTION * measured_us / data->next_timer_us; + if (data->next_timer_ns > 0 && measured_ns < MAX_INTERESTING) + new_factor += div64_u64(RESOLUTION * measured_ns, + data->next_timer_ns); else /* * we were idle so long that we count it as a perfect @@ -540,7 +527,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) data->correction_factor[data->bucket] = new_factor; /* update the repeating-pattern data */ - data->intervals[data->interval_ptr++] = measured_us; + data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns); if (data->interval_ptr >= INTERVALS) data->interval_ptr = 0; } diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index e726bb1c6ba6..f5090363de72 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -104,7 +104,7 @@ struct teo_cpu { u64 sleep_length_ns; struct teo_idle_state states[CPUIDLE_STATE_MAX]; int interval_idx; - unsigned int intervals[INTERVALS]; + u64 intervals[INTERVALS]; }; static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); @@ -117,9 +117,8 @@ static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); - unsigned int sleep_length_us = ktime_to_us(cpu_data->sleep_length_ns); int i, idx_hit = -1, idx_timer = -1; - unsigned int measured_us; + u64 measured_ns; if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { /* @@ -127,23 +126,21 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * enough to the closest timer event expected at the idle state * selection time to be discarded. */ - measured_us = UINT_MAX; + measured_ns = U64_MAX; } else { - unsigned int lat; + u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; - lat = drv->states[dev->last_state_idx].exit_latency; - - measured_us = ktime_to_us(cpu_data->time_span_ns); + measured_ns = cpu_data->time_span_ns; /* * The delay between the wakeup and the first instruction * executed by the CPU is not likely to be worst-case every * time, so take 1/2 of the exit latency as a very rough * approximation of the average of it. 
*/ - if (measured_us >= lat) - measured_us -= lat / 2; + if (measured_ns >= lat_ns) + measured_ns -= lat_ns / 2; else - measured_us /= 2; + measured_ns /= 2; } /* @@ -155,9 +152,9 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->states[i].early_hits -= early_hits >> DECAY_SHIFT; - if (drv->states[i].target_residency <= sleep_length_us) { + if (drv->states[i].target_residency_ns <= cpu_data->sleep_length_ns) { idx_timer = i; - if (drv->states[i].target_residency <= measured_us) + if (drv->states[i].target_residency_ns <= measured_ns) idx_hit = i; } } @@ -193,7 +190,7 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * Save idle duration values corresponding to non-timer wakeups for * pattern detection. */ - cpu_data->intervals[cpu_data->interval_idx++] = measured_us; + cpu_data->intervals[cpu_data->interval_idx++] = measured_ns; if (cpu_data->interval_idx >= INTERVALS) cpu_data->interval_idx = 0; } @@ -203,11 +200,11 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * @drv: cpuidle driver containing state data. * @dev: Target CPU. * @state_idx: Index of the capping idle state. - * @duration_us: Idle duration value to match. + * @duration_ns: Idle duration value to match. */ static int teo_find_shallower_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, int state_idx, - unsigned int duration_us) + u64 duration_ns) { int i; @@ -216,7 +213,7 @@ static int teo_find_shallower_state(struct cpuidle_driver *drv, continue; state_idx = i; - if (drv->states[i].target_residency <= duration_us) + if (drv->states[i].target_residency_ns <= duration_ns) break; } return state_idx; @@ -232,8 +229,9 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, bool *stop_tick) { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); - int latency_req = cpuidle_governor_latency_req(dev->cpu); - unsigned int duration_us, hits, misses, early_hits; + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); + u64 duration_ns; + unsigned int hits, misses, early_hits; int max_early_idx, prev_max_early_idx, constraint_idx, idx, i; ktime_t delta_tick; @@ -244,8 +242,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, cpu_data->time_span_ns = local_clock(); - cpu_data->sleep_length_ns = tick_nohz_get_sleep_length(&delta_tick); - duration_us = ktime_to_us(cpu_data->sleep_length_ns); + duration_ns = tick_nohz_get_sleep_length(&delta_tick); + cpu_data->sleep_length_ns = duration_ns; hits = 0; misses = 0; @@ -263,7 +261,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * Ignore disabled states with target residencies beyond * the anticipated idle duration. */ - if (s->target_residency > duration_us) + if (s->target_residency_ns > duration_ns) continue; /* @@ -302,7 +300,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * shallow for that role. 
*/ if (!(tick_nohz_tick_stopped() && - drv->states[idx].target_residency < TICK_USEC)) { + drv->states[idx].target_residency_ns < TICK_NSEC)) { prev_max_early_idx = max_early_idx; early_hits = cpu_data->states[i].early_hits; max_early_idx = idx; @@ -317,10 +315,10 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, misses = cpu_data->states[i].misses; } - if (s->target_residency > duration_us) + if (s->target_residency_ns > duration_ns) break; - if (s->exit_latency > latency_req && constraint_idx > i) + if (s->exit_latency_ns > latency_req && constraint_idx > i) constraint_idx = i; idx = i; @@ -329,7 +327,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (early_hits < cpu_data->states[i].early_hits && !(tick_nohz_tick_stopped() && - drv->states[i].target_residency < TICK_USEC)) { + drv->states[i].target_residency_ns < TICK_NSEC)) { prev_max_early_idx = max_early_idx; early_hits = cpu_data->states[i].early_hits; max_early_idx = i; @@ -355,7 +353,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (max_early_idx >= 0) { idx = max_early_idx; - duration_us = drv->states[idx].target_residency; + duration_ns = drv->states[idx].target_residency_ns; } } @@ -377,9 +375,9 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * the current expected idle duration value. */ for (i = 0; i < INTERVALS; i++) { - unsigned int val = cpu_data->intervals[i]; + u64 val = cpu_data->intervals[i]; - if (val >= duration_us) + if (val >= duration_ns) continue; count++; @@ -391,17 +389,17 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * values are in the interesting range. */ if (count > INTERVALS / 2) { - unsigned int avg_us = div64_u64(sum, count); + u64 avg_ns = div64_u64(sum, count); /* * Avoid spending too much time in an idle state that * would be too shallow. */ - if (!(tick_nohz_tick_stopped() && avg_us < TICK_USEC)) { - duration_us = avg_us; - if (drv->states[idx].target_residency > avg_us) + if (!(tick_nohz_tick_stopped() && avg_ns < TICK_NSEC)) { + duration_ns = avg_ns; + if (drv->states[idx].target_residency_ns > avg_ns) idx = teo_find_shallower_state(drv, dev, - idx, avg_us); + idx, avg_ns); } } } @@ -411,9 +409,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * expected idle duration is shorter than the tick period length. */ if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || - duration_us < TICK_USEC) && !tick_nohz_tick_stopped()) { - unsigned int delta_tick_us = ktime_to_us(delta_tick); - + duration_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) { *stop_tick = false; /* @@ -422,8 +418,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * till the closest timer including the tick, try to correct * that. 
*/ - if (idx > 0 && drv->states[idx].target_residency > delta_tick_us) - idx = teo_find_shallower_state(drv, dev, idx, delta_tick_us); + if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) + idx = teo_find_shallower_state(drv, dev, idx, delta_tick); } return idx; @@ -467,7 +463,7 @@ static int teo_enable_device(struct cpuidle_driver *drv, memset(cpu_data, 0, sizeof(*cpu_data)); for (i = 0; i < INTERVALS; i++) - cpu_data->intervals[i] = UINT_MAX; + cpu_data->intervals[i] = U64_MAX; return 0; } diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c index c8fa5f41dfc4..9f1ace9c53da 100644 --- a/drivers/cpuidle/poll_state.c +++ b/drivers/cpuidle/poll_state.c @@ -49,6 +49,8 @@ void cpuidle_poll_state_init(struct cpuidle_driver *drv) snprintf(state->desc, CPUIDLE_DESC_LEN, "CPUIDLE CORE POLL IDLE"); state->exit_latency = 0; state->target_residency = 0; + state->exit_latency_ns = 0; + state->target_residency_ns = 0; state->power_usage = -1; state->enter = poll_idle; state->disabled = false; diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index b9d2bae2edb0..cd5e2bbf1a29 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -273,16 +273,30 @@ static ssize_t show_state_##_name(struct cpuidle_state *state, \ return sprintf(buf, "%s\n", state->_name);\ } -define_show_state_function(exit_latency) -define_show_state_function(target_residency) +#define define_show_state_time_function(_name) \ +static ssize_t show_state_##_name(struct cpuidle_state *state, \ + struct cpuidle_state_usage *state_usage, \ + char *buf) \ +{ \ + return sprintf(buf, "%llu\n", ktime_to_us(state->_name##_ns)); \ +} + +define_show_state_time_function(exit_latency) +define_show_state_time_function(target_residency) define_show_state_function(power_usage) define_show_state_ull_function(usage) -define_show_state_ull_function(time) define_show_state_str_function(name) define_show_state_str_function(desc) define_show_state_ull_function(above) define_show_state_ull_function(below) +static ssize_t show_state_time(struct cpuidle_state *state, + struct cpuidle_state_usage *state_usage, + char *buf) +{ + return sprintf(buf, "%llu\n", ktime_to_us(state_usage->time_ns)); +} + static ssize_t show_state_disable(struct cpuidle_state *state, struct cpuidle_state_usage *state_usage, char *buf) diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index d23a3b1ddcf6..22602747f468 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -35,7 +35,7 @@ struct cpuidle_driver; struct cpuidle_state_usage { unsigned long long disable; unsigned long long usage; - unsigned long long time; /* in US */ + u64 time_ns; unsigned long long above; /* Number of times it's been too deep */ unsigned long long below; /* Number of times it's been too shallow */ #ifdef CONFIG_SUSPEND @@ -48,6 +48,8 @@ struct cpuidle_state { char name[CPUIDLE_NAME_LEN]; char desc[CPUIDLE_DESC_LEN]; + u64 exit_latency_ns; + u64 target_residency_ns; unsigned int flags; unsigned int exit_latency; /* in US */ int power_usage; /* in mW */ @@ -89,7 +91,7 @@ struct cpuidle_device { ktime_t next_hrtimer; int last_state_idx; - int last_residency; + u64 last_residency_ns; u64 poll_limit_ns; struct cpuidle_state_usage states_usage[CPUIDLE_STATE_MAX]; struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX]; @@ -263,7 +265,7 @@ struct cpuidle_governor { #ifdef CONFIG_CPU_IDLE extern int cpuidle_register_governor(struct cpuidle_governor *gov); -extern int cpuidle_governor_latency_req(unsigned int cpu); +extern s64 
cpuidle_governor_latency_req(unsigned int cpu); #else static inline int cpuidle_register_governor(struct cpuidle_governor *gov) {return 0;} diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 3f8c7867c14c..2eb06682f89e 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -104,7 +104,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, * update no idle residency and return. */ if (current_clr_polling_and_test()) { - dev->last_residency = 0; + dev->last_residency_ns = 0; local_irq_enable(); return -EBUSY; } -- Gitee From ef2f07cd874f3af745f1fe7cec2f576cbe71a22d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 12 Nov 2019 10:51:16 +0100 Subject: [PATCH 1081/2161] cpuidle: teo: Exclude cpuidle overhead from computations commit b6495b7f004d01b9ecf9ed5fd31368241d3c5589 upstream. One purpose of the computations in teo_update() is to determine whether or not the (saved) time till the next timer event and the measured idle duration fall into the same "bin", so avoid using values that include the cpuidle overhead to obtain the latter. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/governors/teo.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index f5090363de72..480fcce5f531 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -130,7 +130,14 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) } else { u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; - measured_ns = cpu_data->time_span_ns; + /* + * The computations below are to determine whether or not the + * (saved) time till the next timer event and the measured idle + * duration fall into the same "bin", so use last_residency_ns + * for that instead of time_span_ns which includes the cpuidle + * overhead. + */ + measured_ns = dev->last_residency_ns; /* * The delay between the wakeup and the first instruction * executed by the CPU is not likely to be worst-case every -- Gitee From 240cb11fc8b80a5aa1b590dee5fe2902f53977a2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 13 Nov 2019 01:10:13 +0100 Subject: [PATCH 1082/2161] cpuidle: teo: Avoid code duplication in conditionals commit 85f6a17f24f9f7faa4aaecf98e12acdd312aa4c9 upstream. There are three places in teo_select() where a given amount of time is compared with TICK_NSEC if tick_nohz_tick_stopped() returns true, which is a bit of duplicated code. Avoid that code duplication by defining a helper function to do the check and using it in all of the places in question. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/governors/teo.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 480fcce5f531..6deaaf5f05b5 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -202,6 +202,11 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->interval_idx = 0; } +static bool teo_time_ok(u64 interval_ns) +{ + return !tick_nohz_tick_stopped() || interval_ns >= TICK_NSEC; +} + /** * teo_find_shallower_state - Find shallower idle state matching given duration. * @drv: cpuidle driver containing state data. 
@@ -306,8 +311,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * check if the current candidate state is not too * shallow for that role. */ - if (!(tick_nohz_tick_stopped() && - drv->states[idx].target_residency_ns < TICK_NSEC)) { + if (teo_time_ok(drv->states[idx].target_residency_ns)) { prev_max_early_idx = max_early_idx; early_hits = cpu_data->states[i].early_hits; max_early_idx = idx; @@ -333,8 +337,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, misses = cpu_data->states[i].misses; if (early_hits < cpu_data->states[i].early_hits && - !(tick_nohz_tick_stopped() && - drv->states[i].target_residency_ns < TICK_NSEC)) { + teo_time_ok(drv->states[i].target_residency_ns)) { prev_max_early_idx = max_early_idx; early_hits = cpu_data->states[i].early_hits; max_early_idx = i; @@ -402,7 +405,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * Avoid spending too much time in an idle state that * would be too shallow. */ - if (!(tick_nohz_tick_stopped() && avg_ns < TICK_NSEC)) { + if (teo_time_ok(avg_ns)) { duration_ns = avg_ns; if (drv->states[idx].target_residency_ns > avg_ns) idx = teo_find_shallower_state(drv, dev, -- Gitee From efa7e08305454ae89450bae650c59e9e1193bc38 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 18 Nov 2019 12:11:24 +0100 Subject: [PATCH 1083/2161] cpuidle: Introduce cpuidle_driver_state_disabled() for driver quirks commit cbda56d5fefcebc01448982a55836c88a825b34c upstream. Commit 99e98d3fb100 ("cpuidle: Consolidate disabled state checks") overlooked the fact that the imx6q and tegra20 cpuidle drivers use the "disabled" field in struct cpuidle_state for quirks which trigger after the initialization of cpuidle, so reading the initial value of that field is not sufficient for those drivers. In order to allow them to implement the quirks without using the "disabled" field in struct cpuidle_state, introduce a new helper function and modify them to use it. Fixes: 99e98d3fb100 ("cpuidle: Consolidate disabled state checks") Reported-by: Len Brown Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/arm/mach-imx/cpuidle-imx6q.c | 4 ++-- arch/arm/mach-tegra/cpuidle-tegra20.c | 2 +- drivers/cpuidle/driver.c | 28 +++++++++++++++++++++++++++ include/linux/cpuidle.h | 4 ++++ 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/arch/arm/mach-imx/cpuidle-imx6q.c b/arch/arm/mach-imx/cpuidle-imx6q.c index 39a7d9393641..24dd5bbe60e4 100644 --- a/arch/arm/mach-imx/cpuidle-imx6q.c +++ b/arch/arm/mach-imx/cpuidle-imx6q.c @@ -62,13 +62,13 @@ static struct cpuidle_driver imx6q_cpuidle_driver = { */ void imx6q_cpuidle_fec_irqs_used(void) { - imx6q_cpuidle_driver.states[1].disabled = true; + cpuidle_driver_state_disabled(&imx6q_cpuidle_driver, 1, true); } EXPORT_SYMBOL_GPL(imx6q_cpuidle_fec_irqs_used); void imx6q_cpuidle_fec_irqs_unused(void) { - imx6q_cpuidle_driver.states[1].disabled = false; + cpuidle_driver_state_disabled(&imx6q_cpuidle_driver, 1, false); } EXPORT_SYMBOL_GPL(imx6q_cpuidle_fec_irqs_unused); diff --git a/arch/arm/mach-tegra/cpuidle-tegra20.c b/arch/arm/mach-tegra/cpuidle-tegra20.c index 2447427cb4a8..69f3fa270fbe 100644 --- a/arch/arm/mach-tegra/cpuidle-tegra20.c +++ b/arch/arm/mach-tegra/cpuidle-tegra20.c @@ -203,7 +203,7 @@ void tegra20_cpuidle_pcie_irqs_in_use(void) { pr_info_once( "Disabling cpuidle LP2 state, since PCIe IRQs are in use\n"); - tegra_idle_driver.states[1].disabled = true; + cpuidle_driver_state_disabled(&tegra_idle_driver, 1, true); } int __init tegra20_cpuidle_init(void) diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index fcaf8b2bab96..c76423aaef4d 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -389,3 +389,31 @@ void cpuidle_driver_unref(void) spin_unlock(&cpuidle_driver_lock); } + +/** + * cpuidle_driver_state_disabled - Disable or enable an idle state + * @drv: cpuidle driver owning the state + * @idx: State index + * @disable: Whether or not to disable the state + */ +void cpuidle_driver_state_disabled(struct cpuidle_driver *drv, int idx, + bool disable) +{ + unsigned int cpu; + + mutex_lock(&cpuidle_lock); + + for_each_cpu(cpu, drv->cpumask) { + struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu); + + if (!dev) + continue; + + if (disable) + dev->states_usage[idx].disable |= CPUIDLE_STATE_DISABLED_BY_DRIVER; + else + dev->states_usage[idx].disable &= ~CPUIDLE_STATE_DISABLED_BY_DRIVER; + } + + mutex_unlock(&cpuidle_lock); +} diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 22602747f468..afb6a573b46d 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -149,6 +149,8 @@ extern int cpuidle_register_driver(struct cpuidle_driver *drv); extern struct cpuidle_driver *cpuidle_get_driver(void); extern struct cpuidle_driver *cpuidle_driver_ref(void); extern void cpuidle_driver_unref(void); +extern void cpuidle_driver_state_disabled(struct cpuidle_driver *drv, int idx, + bool disable); extern void cpuidle_unregister_driver(struct cpuidle_driver *drv); extern int cpuidle_register_device(struct cpuidle_device *dev); extern void cpuidle_unregister_device(struct cpuidle_device *dev); @@ -186,6 +188,8 @@ static inline int cpuidle_register_driver(struct cpuidle_driver *drv) static inline struct cpuidle_driver *cpuidle_get_driver(void) {return NULL; } static inline struct cpuidle_driver *cpuidle_driver_ref(void) {return NULL; } static inline void cpuidle_driver_unref(void) {} +static inline void cpuidle_driver_state_disabled(struct cpuidle_driver *drv, + int idx, bool disable) { } static inline void 
cpuidle_unregister_driver(struct cpuidle_driver *drv) { } static inline int cpuidle_register_device(struct cpuidle_device *dev) {return -ENODEV; } -- Gitee From 0bc05ada95b6a7c53775d2f269f674f65ed64613 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sat, 16 Nov 2019 14:16:12 +0100 Subject: [PATCH 1084/2161] cpuidle: Allow idle injection to apply exit latency limit commit c55b51a06b01d67a99457bb82a8c31081c7faa23 upstream. In some cases it may be useful to specify an exit latency limit for the idle state to be used during CPU idle time injection. Instead of duplicating the information in struct cpuidle_device or propagating the latency limit in the call stack, replace the use_deepest_state field with forced_latency_limit_ns to represent that limit, so that the deepest idle state with exit latency within that limit is forced (i.e. no governors) when it is set. A zero exit latency limit for forced idle means to use governors in the usual way (analogous to use_deepest_state equal to "false" before this change). Additionally, add play_idle_precise() taking two arguments, the duration of forced idle and the idle state exit latency limit, both in nanoseconds, and redefine play_idle() as a wrapper around that new function. This change is preparatory, no functional impact is expected. Suggested-by: Rafael J. Wysocki Signed-off-by: Daniel Lezcano [ rjw: Subject, changelog, cpuidle_use_deepest_state() kerneldoc, whitespace ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/cpuidle.c | 13 +++++++------ include/linux/cpu.h | 7 ++++++- include/linux/cpuidle.h | 6 +++--- kernel/sched/idle.c | 14 +++++++------- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 936cc580a4cd..ec7045d48200 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -99,20 +99,21 @@ static int find_deepest_state(struct cpuidle_driver *drv, } /** - * cpuidle_use_deepest_state - Set/clear governor override flag. - * @enable: New value of the flag. + * cpuidle_use_deepest_state - Set/unset governor override mode. + * @latency_limit_ns: Idle state exit latency limit (or no override if 0). * - * Set/unset the current CPU to use the deepest idle state (override governors - * going forward if set). + * If @latency_limit_ns is nonzero, set the current CPU to use the deepest idle + * state with exit latency within @latency_limit_ns (override governors going + * forward), or do not override governors if it is zero. 
*/ -void cpuidle_use_deepest_state(bool enable) +void cpuidle_use_deepest_state(u64 latency_limit_ns) { struct cpuidle_device *dev; preempt_disable(); dev = cpuidle_get_device(); if (dev) - dev->use_deepest_state = enable; + dev->forced_idle_latency_limit_ns = latency_limit_ns; preempt_enable(); } diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 4e9822cb11f3..94cda8c3b5d1 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -190,7 +190,12 @@ void arch_cpu_idle_dead(void); int cpu_report_state(int cpu); int cpu_check_up_prepare(int cpu); void cpu_set_state_online(int cpu); -void play_idle(unsigned long duration_us); +void play_idle_precise(u64 duration_ns, u64 latency_ns); + +static inline void play_idle(unsigned long duration_us) +{ + play_idle_precise(duration_us * NSEC_PER_USEC, U64_MAX); +} #ifdef CONFIG_HOTPLUG_CPU bool cpu_wait_death(unsigned int cpu, int seconds); diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index afb6a573b46d..72b26ff1de4b 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -85,7 +85,6 @@ struct cpuidle_driver_kobj; struct cpuidle_device { unsigned int registered:1; unsigned int enabled:1; - unsigned int use_deepest_state:1; unsigned int poll_time_limit:1; unsigned int cpu; ktime_t next_hrtimer; @@ -93,6 +92,7 @@ struct cpuidle_device { int last_state_idx; u64 last_residency_ns; u64 poll_limit_ns; + u64 forced_idle_latency_limit_ns; struct cpuidle_state_usage states_usage[CPUIDLE_STATE_MAX]; struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX]; struct cpuidle_driver_kobj *kobj_driver; @@ -216,7 +216,7 @@ extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev); extern int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev); -extern void cpuidle_use_deepest_state(bool enable); +extern void cpuidle_use_deepest_state(u64 latency_limit_ns); #else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev) @@ -224,7 +224,7 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, static inline int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } -static inline void cpuidle_use_deepest_state(bool enable) +static inline void cpuidle_use_deepest_state(u64 latency_limit_ns) { } #endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2eb06682f89e..bb86b00ebb6f 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -165,7 +165,7 @@ static void cpuidle_idle_call(void) * until a proper wakeup interrupt happens. 
*/ - if (idle_should_enter_s2idle() || dev->use_deepest_state) { + if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) { if (idle_should_enter_s2idle()) { rcu_idle_enter(); @@ -312,7 +312,7 @@ static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -void play_idle(unsigned long duration_us) +void play_idle_precise(u64 duration_ns, u64 latency_ns) { struct idle_timer it; @@ -324,29 +324,29 @@ void play_idle(unsigned long duration_us) WARN_ON_ONCE(current->nr_cpus_allowed != 1); WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); - WARN_ON_ONCE(!duration_us); + WARN_ON_ONCE(!duration_ns); rcu_sleep_check(); preempt_disable(); current->flags |= PF_IDLE; - cpuidle_use_deepest_state(true); + cpuidle_use_deepest_state(latency_ns); it.done = 0; hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); it.timer.function = idle_inject_timer_fn; - hrtimer_start(&it.timer, ns_to_ktime(duration_us * NSEC_PER_USEC), + hrtimer_start(&it.timer, ns_to_ktime(duration_ns), HRTIMER_MODE_REL_PINNED); while (!READ_ONCE(it.done)) do_idle(); - cpuidle_use_deepest_state(false); + cpuidle_use_deepest_state(0); current->flags &= ~PF_IDLE; preempt_fold_need_resched(); preempt_enable(); } -EXPORT_SYMBOL_GPL(play_idle); +EXPORT_SYMBOL_GPL(play_idle_precise); void cpu_startup_entry(enum cpuhp_state state) { -- Gitee From 2c9690eddf2c9d08f2d77da6febaa27c523bbcec Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sat, 16 Nov 2019 14:16:13 +0100 Subject: [PATCH 1085/2161] cpuidle: Pass exit latency limit to cpuidle_use_deepest_state() commit 5aa9ba6312e36c18626e73506b92d1513d815435 upstream. Modify cpuidle_use_deepest_state() to take an additional exit latency limit argument to be passed to find_deepest_idle_state() and make cpuidle_idle_call() pass dev->forced_idle_latency_limit_ns to it for forced idle. Suggested-by: Rafael J. Wysocki Signed-off-by: Daniel Lezcano [ rjw: Rebase and rearrange code, subject & changelog ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/cpuidle.c | 5 +++-- include/linux/cpuidle.h | 6 ++++-- kernel/sched/idle.c | 8 +++++++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index ec7045d48200..ab9d81d0623d 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -123,9 +123,10 @@ void cpuidle_use_deepest_state(u64 latency_limit_ns) * @dev: cpuidle device for the given CPU. 
*/ int cpuidle_find_deepest_state(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, + u64 latency_limit_ns) { - return find_deepest_state(drv, dev, U64_MAX, 0, false); + return find_deepest_state(drv, dev, latency_limit_ns, 0, false); } #ifdef CONFIG_SUSPEND diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 72b26ff1de4b..2dbe46b7c213 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -213,13 +213,15 @@ static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; } #ifdef CONFIG_CPU_IDLE extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, - struct cpuidle_device *dev); + struct cpuidle_device *dev, + u64 latency_limit_ns); extern int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev); extern void cpuidle_use_deepest_state(u64 latency_limit_ns); #else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, + u64 latency_limit_ns) {return -ENODEV; } static inline int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index bb86b00ebb6f..75a2dbbedcdc 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -166,6 +166,8 @@ static void cpuidle_idle_call(void) */ if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) { + u64 max_latency_ns; + if (idle_should_enter_s2idle()) { rcu_idle_enter(); @@ -176,12 +178,16 @@ static void cpuidle_idle_call(void) } rcu_idle_exit(); + + max_latency_ns = U64_MAX; + } else { + max_latency_ns = dev->forced_idle_latency_limit_ns; } tick_nohz_idle_stop_tick(); rcu_idle_enter(); - next_state = cpuidle_find_deepest_state(drv, dev); + next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns); call_cpuidle(drv, dev, next_state); } else { bool stop_tick = true; -- Gitee From 0b351f33f6305fd09e6a43cde0c09642c25212d5 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 21 Nov 2019 04:19:12 +0100 Subject: [PATCH 1086/2161] cpuidle: Fix Kconfig indentation commit 656b4e639831a6110dab8cd7e557c41f87514869 upstream. Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Signed-off-by: Krzysztof Kozlowski Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/Kconfig | 16 ++++++++-------- drivers/cpuidle/Kconfig.arm | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index 88727b7c0d59..c0aeedd66f02 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -16,7 +16,7 @@ config CPU_IDLE if CPU_IDLE config CPU_IDLE_MULTIPLE_DRIVERS - bool + bool config CPU_IDLE_GOV_LADDER bool "Ladder governor (for periodic timer tick)" @@ -63,13 +63,13 @@ source "drivers/cpuidle/Kconfig.powerpc" endmenu config HALTPOLL_CPUIDLE - tristate "Halt poll cpuidle driver" - depends on X86 && KVM_GUEST - default y - help - This option enables halt poll cpuidle driver, which allows to poll - before halting in the guest (more efficient than polling in the - host via halt_poll_ns for some scenarios). + tristate "Halt poll cpuidle driver" + depends on X86 && KVM_GUEST + default y + help + This option enables halt poll cpuidle driver, which allows to poll + before halting in the guest (more efficient than polling in the + host via halt_poll_ns for some scenarios). 
endif diff --git a/drivers/cpuidle/Kconfig.arm b/drivers/cpuidle/Kconfig.arm index d8530475493c..e91ab792d14d 100644 --- a/drivers/cpuidle/Kconfig.arm +++ b/drivers/cpuidle/Kconfig.arm @@ -3,15 +3,15 @@ # ARM CPU Idle drivers # config ARM_CPUIDLE - bool "Generic ARM/ARM64 CPU idle Driver" - select DT_IDLE_STATES + bool "Generic ARM/ARM64 CPU idle Driver" + select DT_IDLE_STATES select CPU_IDLE_MULTIPLE_DRIVERS - help - Select this to enable generic cpuidle driver for ARM. - It provides a generic idle driver whose idle states are configured - at run-time through DT nodes. The CPUidle suspend backend is - initialized by calling the CPU operations init idle hook - provided by architecture code. + help + Select this to enable generic cpuidle driver for ARM. + It provides a generic idle driver whose idle states are configured + at run-time through DT nodes. The CPUidle suspend backend is + initialized by calling the CPU operations init idle hook + provided by architecture code. config ARM_PSCI_CPUIDLE bool "PSCI CPU idle Driver" -- Gitee From 3d5bf8f66e349ab65618236efd0782d8d23809b3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 21 Nov 2019 19:41:51 +0100 Subject: [PATCH 1087/2161] cpuidle: Drop disabled field from struct cpuidle_state commit ba1e78a1dc0ca3e92f0be82279e6ba24177af7d6 upstream. After recent cpuidle updates the "disabled" field in struct cpuidle_state is only used by two drivers (intel_idle and shmobile cpuidle) for marking unusable idle states, but that may as well be achieved with the help of a state flag, so define an "unusable" idle state flag, CPUIDLE_FLAG_UNUSABLE, make the drivers in question use it instead of the "disabled" field and make the core set CPUIDLE_STATE_DISABLED_BY_DRIVER for the idle states with that flag set. After the above changes, the "disabled" field in struct cpuidle_state is not used any more, so drop it. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/sh/kernel/cpu/shmobile/cpuidle.c | 8 ++++---- drivers/cpuidle/cpuidle.c | 2 +- drivers/cpuidle/poll_state.c | 1 - drivers/idle/intel_idle.c | 6 +++--- include/linux/cpuidle.h | 2 +- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/sh/kernel/cpu/shmobile/cpuidle.c b/arch/sh/kernel/cpu/shmobile/cpuidle.c index dbd2cdec2ddb..b0f9c8f8fd14 100644 --- a/arch/sh/kernel/cpu/shmobile/cpuidle.c +++ b/arch/sh/kernel/cpu/shmobile/cpuidle.c @@ -67,7 +67,7 @@ static struct cpuidle_driver cpuidle_driver = { .enter = cpuidle_sleep_enter, .name = "C2", .desc = "SuperH Sleep Mode [SF]", - .disabled = true, + .flags = CPUIDLE_FLAG_UNUSABLE, }, { .exit_latency = 2300, @@ -76,7 +76,7 @@ static struct cpuidle_driver cpuidle_driver = { .enter = cpuidle_sleep_enter, .name = "C3", .desc = "SuperH Mobile Standby Mode [SF]", - .disabled = true, + .flags = CPUIDLE_FLAG_UNUSABLE, }, }, .safe_state_index = 0, @@ -86,10 +86,10 @@ static struct cpuidle_driver cpuidle_driver = { int __init sh_mobile_setup_cpuidle(void) { if (sh_mobile_sleep_supported & SUSP_SH_SF) - cpuidle_driver.states[1].disabled = false; + cpuidle_driver.states[1].flags = CPUIDLE_FLAG_NONE; if (sh_mobile_sleep_supported & SUSP_SH_STANDBY) - cpuidle_driver.states[2].disabled = false; + cpuidle_driver.states[2].flags = CPUIDLE_FLAG_NONE; return cpuidle_register(&cpuidle_driver, NULL); } diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index ab9d81d0623d..ac71cc1292a3 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -574,7 +574,7 @@ static int __cpuidle_register_device(struct cpuidle_device *dev) return -EINVAL; for (i = 0; i < drv->state_count; i++) - if (drv->states[i].disabled) + if (drv->states[i].flags & CPUIDLE_FLAG_UNUSABLE) dev->states_usage[i].disable |= CPUIDLE_STATE_DISABLED_BY_DRIVER; per_cpu(cpuidle_devices, dev->cpu) = dev; diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c index 9f1ace9c53da..f7e83613ae94 100644 --- a/drivers/cpuidle/poll_state.c +++ b/drivers/cpuidle/poll_state.c @@ -53,7 +53,6 @@ void cpuidle_poll_state_init(struct cpuidle_driver *drv) state->target_residency_ns = 0; state->power_usage = -1; state->enter = poll_idle; - state->disabled = false; state->flags = CPUIDLE_FLAG_POLLING; } EXPORT_SYMBOL_GPL(cpuidle_poll_state_init); diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 347b08b56042..75fd2a7b0842 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1291,8 +1291,8 @@ static void sklh_idle_state_table_update(void) return; } - skl_cstates[5].disabled = 1; /* C8-SKL */ - skl_cstates[6].disabled = 1; /* C9-SKL */ + skl_cstates[5].flags |= CPUIDLE_FLAG_UNUSABLE; /* C8-SKL */ + skl_cstates[6].flags |= CPUIDLE_FLAG_UNUSABLE; /* C9-SKL */ } /* * intel_idle_state_table_update() @@ -1355,7 +1355,7 @@ static void __init intel_idle_cpuidle_driver_init(void) continue; /* if state marked as disabled, skip it */ - if (cpuidle_state_table[cstate].disabled != 0) { + if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_UNUSABLE) { pr_debug("state %s is disabled\n", cpuidle_state_table[cstate].name); continue; diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 2dbe46b7c213..1dabe36bd011 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -54,7 +54,6 @@ struct cpuidle_state { unsigned int exit_latency; /* in US */ int power_usage; /* in mW */ unsigned int target_residency; /* in US */ - bool disabled; /* 
disabled on all CPUs */ int (*enter) (struct cpuidle_device *dev, struct cpuidle_driver *drv, @@ -77,6 +76,7 @@ struct cpuidle_state { #define CPUIDLE_FLAG_POLLING BIT(0) /* polling state */ #define CPUIDLE_FLAG_COUPLED BIT(1) /* state applies to multiple cpus */ #define CPUIDLE_FLAG_TIMER_STOP BIT(2) /* timer is stopped on this state */ +#define CPUIDLE_FLAG_UNUSABLE BIT(3) /* avoid using this state */ struct cpuidle_device_kobj; struct cpuidle_state_kobj; -- Gitee From 55f0239b51ec0ea2c978f5ed2ec2f02caa287ca8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 28 Nov 2019 14:38:03 -0800 Subject: [PATCH 1088/2161] cpuidle: minor Kconfig help text fixes commit 4d30d4a0441dac4452fcc188cd21536bedaec5df upstream. End sentences in help text with a period (aka full stop). Signed-off-by: Randy Dunlap Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/Kconfig.arm | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/cpuidle/Kconfig.arm b/drivers/cpuidle/Kconfig.arm index e91ab792d14d..a224d33dda7f 100644 --- a/drivers/cpuidle/Kconfig.arm +++ b/drivers/cpuidle/Kconfig.arm @@ -65,21 +65,21 @@ config ARM_U8500_CPUIDLE bool "Cpu Idle Driver for the ST-E u8500 processors" depends on ARCH_U8500 && !ARM64 help - Select this to enable cpuidle for ST-E u8500 processors + Select this to enable cpuidle for ST-E u8500 processors. config ARM_AT91_CPUIDLE bool "Cpu Idle Driver for the AT91 processors" default y depends on ARCH_AT91 && !ARM64 help - Select this to enable cpuidle for AT91 processors + Select this to enable cpuidle for AT91 processors. config ARM_EXYNOS_CPUIDLE bool "Cpu Idle Driver for the Exynos processors" depends on ARCH_EXYNOS && !ARM64 select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP help - Select this to enable cpuidle for Exynos processors + Select this to enable cpuidle for Exynos processors. config ARM_MVEBU_V7_CPUIDLE bool "CPU Idle Driver for mvebu v7 family processors" -- Gitee From 03030b9c5a931575dbb0b1b3561ef7684424fab5 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 21 Nov 2019 04:19:15 +0100 Subject: [PATCH 1089/2161] cpufreq: Fix Kconfig indentation commit cde10f856a7de191a773d880bb524ea15084e1cc upstream. Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Signed-off-by: Krzysztof Kozlowski Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpufreq/Kconfig.powerpc | 8 ++++---- drivers/cpufreq/Kconfig.x86 | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/cpufreq/Kconfig.powerpc b/drivers/cpufreq/Kconfig.powerpc index 35b4f700f054..58151ca56695 100644 --- a/drivers/cpufreq/Kconfig.powerpc +++ b/drivers/cpufreq/Kconfig.powerpc @@ -48,9 +48,9 @@ config PPC_PASEMI_CPUFREQ PWRficient processors. 
config POWERNV_CPUFREQ - tristate "CPU frequency scaling for IBM POWERNV platform" - depends on PPC_POWERNV - default y - help + tristate "CPU frequency scaling for IBM POWERNV platform" + depends on PPC_POWERNV + default y + help This adds support for CPU frequency switching on IBM POWERNV platform diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 index dfa6457deaf6..a6528388952e 100644 --- a/drivers/cpufreq/Kconfig.x86 +++ b/drivers/cpufreq/Kconfig.x86 @@ -4,17 +4,17 @@ # config X86_INTEL_PSTATE - bool "Intel P state control" - depends on X86 - select ACPI_PROCESSOR if ACPI - select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_MC_PRIO - help - This driver provides a P state for Intel core processors. + bool "Intel P state control" + depends on X86 + select ACPI_PROCESSOR if ACPI + select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_MC_PRIO + help + This driver provides a P state for Intel core processors. The driver implements an internal governor and will become - the scaling driver and governor for Sandy bridge processors. + the scaling driver and governor for Sandy bridge processors. When this driver is enabled it will become the preferred - scaling driver for Sandy bridge processors. + scaling driver for Sandy bridge processors. If in doubt, say N. -- Gitee From 7a48ac5cb771fde09a3739b3633c314c520cedf3 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 20 Nov 2019 21:40:04 +0800 Subject: [PATCH 1090/2161] power: avs: Fix Kconfig indentation commit 2a0efc77735b48c42aedbe99df1f8f91aa402f57 upstream. Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Signed-off-by: Krzysztof Kozlowski Acked-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/power/avs/Kconfig | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/power/avs/Kconfig b/drivers/power/avs/Kconfig index b5a217b828dc..089b6244b716 100644 --- a/drivers/power/avs/Kconfig +++ b/drivers/power/avs/Kconfig @@ -13,9 +13,9 @@ menuconfig POWER_AVS Say Y here to enable Adaptive Voltage Scaling class support. config ROCKCHIP_IODOMAIN - tristate "Rockchip IO domain support" - depends on POWER_AVS && ARCH_ROCKCHIP && OF - help - Say y here to enable support io domains on Rockchip SoCs. It is - necessary for the io domain setting of the SoC to match the - voltage supplied by the regulators. + tristate "Rockchip IO domain support" + depends on POWER_AVS && ARCH_ROCKCHIP && OF + help + Say y here to enable support io domains on Rockchip SoCs. It is + necessary for the io domain setting of the SoC to match the + voltage supplied by the regulators. -- Gitee From 3e2b9e17ab572cf8f1f9634fe5a90f2706a5ac44 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Dec 2019 12:48:35 +0100 Subject: [PATCH 1091/2161] cpuidle: Fix cpuidle_driver_state_disabled() commit b0142d66f4edb8578b7772d6d7ad731836b82ddb upstream. It turns out that cpuidle_driver_state_disabled() can be called before registering the cpufreq driver on some platforms, which was not expected when it was introduced and which leads to a NULL pointer dereference when trying to walk the CPUs associated with the given cpuidle driver. 
Fix the problem by making cpuidle_driver_state_disabled() check if the driver's mask of CPUs associated with it is present and to set CPUIDLE_FLAG_UNUSABLE for the given idle state in the driver's states list if that is not the case to cause __cpuidle_register_device() to set CPUIDLE_STATE_DISABLED_BY_DRIVER for that state for all cpuidle devices registered by it later. Fixes: cbda56d5fefc ("cpuidle: Introduce cpuidle_driver_state_disabled() for driver quirks") Reported-by: Daniel Lezcano Tested-by: Daniel Lezcano Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/driver.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index c76423aaef4d..ce6a5f80fb83 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -403,6 +403,13 @@ void cpuidle_driver_state_disabled(struct cpuidle_driver *drv, int idx, mutex_lock(&cpuidle_lock); + spin_lock(&cpuidle_driver_lock); + + if (!drv->cpumask) { + drv->states[idx].flags |= CPUIDLE_FLAG_UNUSABLE; + goto unlock; + } + for_each_cpu(cpu, drv->cpumask) { struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu); @@ -415,5 +422,8 @@ void cpuidle_driver_state_disabled(struct cpuidle_driver *drv, int idx, dev->states_usage[idx].disable &= ~CPUIDLE_STATE_DISABLED_BY_DRIVER; } +unlock: + spin_unlock(&cpuidle_driver_lock); + mutex_unlock(&cpuidle_lock); } -- Gitee From df3e84217a4001248a288c190ca3c7bbe09c7966 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 11 Dec 2019 11:30:50 +0100 Subject: [PATCH 1092/2161] cpuidle: Drop unnecessary type cast in cpuidle_poll_time() commit d4d8140176972fdb3f00bffc88e63af781de8d67 upstream. The data type of the target_residency_ns field in struct cpuidle_state is u64, so it does not need to be cast into u64. Get rid of the unnecessary type cast. Signed-off-by: Rafael J. Wysocki Acked-by: Daniel Lezcano Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/cpuidle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index ac71cc1292a3..2b1eee61787a 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -382,7 +382,7 @@ u64 cpuidle_poll_time(struct cpuidle_driver *drv, if (dev->states_usage[i].disable) continue; - limit_ns = (u64)drv->states[i].target_residency_ns; + limit_ns = drv->states[i].target_residency_ns; break; } -- Gitee From dd5f939fffc4d87784ab7db300b0f6418d430179 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sun, 15 Dec 2019 13:02:05 +0000 Subject: [PATCH 1093/2161] cpuidle: clps711x: convert to devm_platform_ioremap_resource() commit 22c48a439d6ac1dbe2db1092b64052592a47b017 upstream. Use devm_platform_ioremap_resource() to simplify code. Signed-off-by: Yangtao Li Acked-by: Daniel Lezcano Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/cpuidle-clps711x.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/cpuidle/cpuidle-clps711x.c b/drivers/cpuidle/cpuidle-clps711x.c index 6e36740f5719..fc22c59b6c73 100644 --- a/drivers/cpuidle/cpuidle-clps711x.c +++ b/drivers/cpuidle/cpuidle-clps711x.c @@ -37,10 +37,7 @@ static struct cpuidle_driver clps711x_idle_driver = { static int __init clps711x_cpuidle_probe(struct platform_device *pdev) { - struct resource *res; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - clps711x_halt = devm_ioremap_resource(&pdev->dev, res); + clps711x_halt = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(clps711x_halt)) return PTR_ERR(clps711x_halt); -- Gitee From 43d08ff300ef32850abe1b4eedb181ceadbfeca8 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sun, 15 Dec 2019 13:02:06 +0000 Subject: [PATCH 1094/2161] cpuidle: kirkwood: convert to devm_platform_ioremap_resource() commit 85c3ebd4a0510c0700e4f3e205bbb46757a53feb upstream. Use devm_platform_ioremap_resource() to simplify code. Signed-off-by: Yangtao Li Reviewed-by: Andrew Lunn Acked-by: Daniel Lezcano Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/cpuidle-kirkwood.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/cpuidle/cpuidle-kirkwood.c b/drivers/cpuidle/cpuidle-kirkwood.c index d23d8f468c12..511c4f46027a 100644 --- a/drivers/cpuidle/cpuidle-kirkwood.c +++ b/drivers/cpuidle/cpuidle-kirkwood.c @@ -55,10 +55,7 @@ static struct cpuidle_driver kirkwood_idle_driver = { /* Initialize CPU idle by registering the idle states */ static int kirkwood_cpuidle_probe(struct platform_device *pdev) { - struct resource *res; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - ddr_operation_base = devm_ioremap_resource(&pdev->dev, res); + ddr_operation_base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(ddr_operation_base)) return PTR_ERR(ddr_operation_base); -- Gitee From 6528bad1955b61c69003214f2028e0931dacfba2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 2 Jan 2020 21:42:16 +0100 Subject: [PATCH 1095/2161] cpuidle: Drop unused cpuidle_driver_ref/unref() functions commit 577a2f41f4c7aced4fed41b20ee77cedd8c197cf upstream. The cpuidle_driver_ref() and cpuidle_driver_unref() functions are not used and the refcnt field in struct cpuidle_driver operated by them is not updated anywhere else (so it is permanently equal to 0), so drop both of them along with refcnt. Signed-off-by: Rafael J. 
Wysocki Acked-by: Daniel Lezcano Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/driver.c | 46 ---------------------------------------- include/linux/cpuidle.h | 5 ----- 2 files changed, 51 deletions(-) diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index ce6a5f80fb83..4070e573bf43 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -155,8 +155,6 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv) { int i; - drv->refcnt = 0; - /* * Use all possible CPUs as the default, because if the kernel boots * with some CPUs offline and then we online one of them, the CPU @@ -240,9 +238,6 @@ static int __cpuidle_register_driver(struct cpuidle_driver *drv) */ static void __cpuidle_unregister_driver(struct cpuidle_driver *drv) { - if (WARN_ON(drv->refcnt > 0)) - return; - if (drv->bctimer) { drv->bctimer = 0; on_each_cpu_mask(drv->cpumask, cpuidle_setup_broadcast_timer, @@ -349,47 +344,6 @@ struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev) } EXPORT_SYMBOL_GPL(cpuidle_get_cpu_driver); -/** - * cpuidle_driver_ref - get a reference to the driver. - * - * Increment the reference counter of the cpuidle driver associated with - * the current CPU. - * - * Returns a pointer to the driver, or NULL if the current CPU has no driver. - */ -struct cpuidle_driver *cpuidle_driver_ref(void) -{ - struct cpuidle_driver *drv; - - spin_lock(&cpuidle_driver_lock); - - drv = cpuidle_get_driver(); - if (drv) - drv->refcnt++; - - spin_unlock(&cpuidle_driver_lock); - return drv; -} - -/** - * cpuidle_driver_unref - puts down the refcount for the driver - * - * Decrement the reference counter of the cpuidle driver associated with - * the current CPU. - */ -void cpuidle_driver_unref(void) -{ - struct cpuidle_driver *drv; - - spin_lock(&cpuidle_driver_lock); - - drv = cpuidle_get_driver(); - if (drv && !WARN_ON(drv->refcnt <= 0)) - drv->refcnt--; - - spin_unlock(&cpuidle_driver_lock); -} - /** * cpuidle_driver_state_disabled - Disable or enable an idle state * @drv: cpuidle driver owning the state diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 1dabe36bd011..23744d49bc22 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -115,7 +115,6 @@ DECLARE_PER_CPU(struct cpuidle_device, cpuidle_dev); struct cpuidle_driver { const char *name; struct module *owner; - int refcnt; /* used by the cpuidle framework to setup the broadcast timer */ unsigned int bctimer:1; @@ -147,8 +146,6 @@ extern u64 cpuidle_poll_time(struct cpuidle_driver *drv, extern int cpuidle_register_driver(struct cpuidle_driver *drv); extern struct cpuidle_driver *cpuidle_get_driver(void); -extern struct cpuidle_driver *cpuidle_driver_ref(void); -extern void cpuidle_driver_unref(void); extern void cpuidle_driver_state_disabled(struct cpuidle_driver *drv, int idx, bool disable); extern void cpuidle_unregister_driver(struct cpuidle_driver *drv); @@ -186,8 +183,6 @@ static inline u64 cpuidle_poll_time(struct cpuidle_driver *drv, static inline int cpuidle_register_driver(struct cpuidle_driver *drv) {return -ENODEV; } static inline struct cpuidle_driver *cpuidle_get_driver(void) {return NULL; } -static inline struct cpuidle_driver *cpuidle_driver_ref(void) {return NULL; } -static inline void cpuidle_driver_unref(void) {} static inline void cpuidle_driver_state_disabled(struct cpuidle_driver *drv, int idx, bool disable) { } static inline void cpuidle_unregister_driver(struct cpuidle_driver *drv) { } -- Gitee From 
07e937a060f50c39ac51fb940d7f8b3fdd5ef952 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 29 Dec 2019 19:09:12 +0100 Subject: [PATCH 1096/2161] cpuidle: arm: Enable compile testing for some of drivers commit 53eb82b0977da8fbba472bd2275fa2ad27f50621 upstream. Some of cpuidle drivers for ARMv7 can be compile tested on this architecture because they do not depend on mach-specific bits. Enable compile testing for big.LITTLE, Kirkwood, Zynq, AT91, Exynos and mvebu cpuidle drivers. Signed-off-by: Krzysztof Kozlowski Acked-by: Daniel Lezcano Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/Kconfig.arm | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/cpuidle/Kconfig.arm b/drivers/cpuidle/Kconfig.arm index a224d33dda7f..62272ecfa771 100644 --- a/drivers/cpuidle/Kconfig.arm +++ b/drivers/cpuidle/Kconfig.arm @@ -25,7 +25,7 @@ config ARM_PSCI_CPUIDLE config ARM_BIG_LITTLE_CPUIDLE bool "Support for ARM big.LITTLE processors" - depends on ARCH_VEXPRESS_TC2_PM || ARCH_EXYNOS + depends on ARCH_VEXPRESS_TC2_PM || ARCH_EXYNOS || COMPILE_TEST depends on MCPM && !ARM64 select ARM_CPU_SUSPEND select CPU_IDLE_MULTIPLE_DRIVERS @@ -51,13 +51,13 @@ config ARM_HIGHBANK_CPUIDLE config ARM_KIRKWOOD_CPUIDLE bool "CPU Idle Driver for Marvell Kirkwood SoCs" - depends on MACH_KIRKWOOD && !ARM64 + depends on (MACH_KIRKWOOD || COMPILE_TEST) && !ARM64 help This adds the CPU Idle driver for Marvell Kirkwood SoCs. config ARM_ZYNQ_CPUIDLE bool "CPU Idle Driver for Xilinx Zynq processors" - depends on ARCH_ZYNQ && !ARM64 + depends on (ARCH_ZYNQ || COMPILE_TEST) && !ARM64 help Select this to enable cpuidle on Xilinx Zynq processors. @@ -70,19 +70,19 @@ config ARM_U8500_CPUIDLE config ARM_AT91_CPUIDLE bool "Cpu Idle Driver for the AT91 processors" default y - depends on ARCH_AT91 && !ARM64 + depends on (ARCH_AT91 || COMPILE_TEST) && !ARM64 help Select this to enable cpuidle for AT91 processors. config ARM_EXYNOS_CPUIDLE bool "Cpu Idle Driver for the Exynos processors" - depends on ARCH_EXYNOS && !ARM64 + depends on (ARCH_EXYNOS || COMPILE_TEST) && !ARM64 select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP help Select this to enable cpuidle for Exynos processors. config ARM_MVEBU_V7_CPUIDLE bool "CPU Idle Driver for mvebu v7 family processors" - depends on ARCH_MVEBU && !ARM64 + depends on (ARCH_MVEBU || COMPILE_TEST) && !ARM64 help Select this to enable cpuidle on Armada 370, 38x and XP processors. -- Gitee From caf7a785583c763b082f2b2b6d79c2b0c84ca9a9 Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Mon, 20 Jan 2020 14:24:08 +0100 Subject: [PATCH 1097/2161] cpuidle: coupled: fix warnings when compiling with W=1 commit 32014c86d4e1639b430d1f833f0848b99638ca39 upstream. Fix warnings that show up when compiling with W=1 Signed-off-by: Benjamin Gaignard Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/coupled.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c index b607278df25b..04003b90dc49 100644 --- a/drivers/cpuidle/coupled.c +++ b/drivers/cpuidle/coupled.c @@ -89,6 +89,7 @@ * @coupled_cpus: mask of cpus that are part of the coupled set * @requested_state: array of requested states for cpus in the coupled set * @ready_waiting_counts: combined count of cpus in ready or waiting loops + * @abort_barrier: synchronisation point for abort cases * @online_count: count of cpus that are online * @refcnt: reference count of cpuidle devices that are using this struct * @prevent: flag to prevent coupled idle while a cpu is hotplugging @@ -338,7 +339,7 @@ static void cpuidle_coupled_poke(int cpu) /** * cpuidle_coupled_poke_others - wake up all other cpus that may be waiting - * @dev: struct cpuidle_device for this cpu + * @this_cpu: target cpu * @coupled: the struct coupled that contains the current cpu * * Calls cpuidle_coupled_poke on all other online cpus. @@ -355,7 +356,7 @@ static void cpuidle_coupled_poke_others(int this_cpu, /** * cpuidle_coupled_set_waiting - mark this cpu as in the wait loop - * @dev: struct cpuidle_device for this cpu + * @cpu: target cpu * @coupled: the struct coupled that contains the current cpu * @next_state: the index in drv->states of the requested state for this cpu * @@ -376,7 +377,7 @@ static int cpuidle_coupled_set_waiting(int cpu, /** * cpuidle_coupled_set_not_waiting - mark this cpu as leaving the wait loop - * @dev: struct cpuidle_device for this cpu + * @cpu: target cpu * @coupled: the struct coupled that contains the current cpu * * Removes the requested idle state for the specified cpuidle device. @@ -412,7 +413,7 @@ static void cpuidle_coupled_set_done(int cpu, struct cpuidle_coupled *coupled) /** * cpuidle_coupled_clear_pokes - spin until the poke interrupt is processed - * @cpu - this cpu + * @cpu: this cpu * * Turns on interrupts and spins until any outstanding poke interrupts have * been processed and the poke bit has been cleared. -- Gitee From f1aa9a77707bce7a697946c6b59880483eecd8d9 Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Mon, 20 Jan 2020 14:33:59 +0100 Subject: [PATCH 1098/2161] cpuidle: sysfs: fix warnings when compiling with W=1 commit a09da3fbc17f36feac00fea37a225596ae302360 upstream. Fix kernel documentation comments to remove warnings when compiling with W=1. Signed-off-by: Benjamin Gaignard Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/sysfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index cd5e2bbf1a29..b41e51a978cf 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -142,6 +142,7 @@ static struct attribute_group cpuidle_attr_group = { /** * cpuidle_add_interface - add CPU global sysfs attributes + * @dev: the target device */ int cpuidle_add_interface(struct device *dev) { @@ -153,6 +154,7 @@ int cpuidle_add_interface(struct device *dev) /** * cpuidle_remove_interface - remove CPU global sysfs attributes + * @dev: the target device */ void cpuidle_remove_interface(struct device *dev) { @@ -615,7 +617,7 @@ static struct kobj_type ktype_driver_cpuidle = { /** * cpuidle_add_driver_sysfs - adds the driver name sysfs attribute - * @device: the target device + * @dev: the target device */ static int cpuidle_add_driver_sysfs(struct cpuidle_device *dev) { @@ -646,7 +648,7 @@ static int cpuidle_add_driver_sysfs(struct cpuidle_device *dev) /** * cpuidle_remove_driver_sysfs - removes the driver name sysfs attribute - * @device: the target device + * @dev: the target device */ static void cpuidle_remove_driver_sysfs(struct cpuidle_device *dev) { -- Gitee From 9dc42372aca62392b4ae7397522910d89d99d594 Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Tue, 21 Jan 2020 09:27:58 +0100 Subject: [PATCH 1099/2161] cpuidle: fix cpuidle_find_deepest_state() kerneldoc warnings commit cefb9409ff995fcc98ce44a07b549ba1b827c31b upstream. Fix cpuidle_find_deepest_state() kernel documentation to avoid warnings when compiling with W=1. Signed-off-by: Benjamin Gaignard Acked-by: Randy Dunlap Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/cpuidle.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 2b1eee61787a..9cc612387309 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -121,6 +121,9 @@ void cpuidle_use_deepest_state(u64 latency_limit_ns) * cpuidle_find_deepest_state - Find the deepest available idle state. * @drv: cpuidle driver for the given CPU. * @dev: cpuidle device for the given CPU. + * @latency_limit_ns: Idle state exit latency limit + * + * Return: the index of the deepest available idle state. */ int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, -- Gitee From 6e7afc05b4d8251785377197cf8f1e86ec6034a6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 13 Dec 2019 09:55:14 +0100 Subject: [PATCH 1100/2161] ACPI: processor: Export function to claim _CST control commit bc94638886ab21f8247d3f7f39573d3feb7d8284 upstream. The intel_idle driver will be modified to use ACPI _CST subsequently and it will need to notify the platform firmware of that if acpi_gbl_FADT.cst_control is set, so add a routine for this purpose, acpi_processor_claim_cst_control(), to acpi_processor.c (so that it is always present which is required by intel_idle) and export it to allow the ACPI processor driver (which is modular) to call it. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/acpi_processor.c | 25 +++++++++++++++++++++++++ drivers/acpi/processor_idle.c | 12 ++++-------- include/linux/acpi.h | 6 ++++++ 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index 2c4dda0787e8..8a53f3c5b70e 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -705,3 +705,28 @@ void __init acpi_processor_init(void) acpi_scan_add_handler_with_hotplug(&processor_handler, "processor"); acpi_scan_add_handler(&processor_container_handler); } + +#ifdef CONFIG_ACPI_PROCESSOR_CSTATE +/** + * acpi_processor_claim_cst_control - Request _CST control from the platform. + */ +bool acpi_processor_claim_cst_control(void) +{ + static bool cst_control_claimed; + acpi_status status; + + if (!acpi_gbl_FADT.cst_control || cst_control_claimed) + return true; + + status = acpi_os_write_port(acpi_gbl_FADT.smi_command, + acpi_gbl_FADT.cst_control, 8); + if (ACPI_FAILURE(status)) { + pr_warn("ACPI: Failed to claim processor _CST control\n"); + return false; + } + + cst_control_claimed = true; + return true; +} +EXPORT_SYMBOL_GPL(acpi_processor_claim_cst_control); +#endif /* CONFIG_ACPI_PROCESSOR_CSTATE */ diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 2ae95df2e74f..dd737d836c03 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -909,7 +909,6 @@ static int acpi_processor_setup_cstates(struct acpi_processor *pr) static inline void acpi_processor_cstate_first_run_checks(void) { - acpi_status status; static int first_run; if (first_run) @@ -921,13 +920,10 @@ static inline void acpi_processor_cstate_first_run_checks(void) max_cstate); first_run++; - if (acpi_gbl_FADT.cst_control && !nocst) { - status = acpi_os_write_port(acpi_gbl_FADT.smi_command, - acpi_gbl_FADT.cst_control, 8); - if (ACPI_FAILURE(status)) - ACPI_EXCEPTION((AE_INFO, status, - "Notifying BIOS of _CST ability failed")); - } + if (nocst) + return; + + acpi_processor_claim_cst_control(); } #else diff --git a/include/linux/acpi.h b/include/linux/acpi.h index f56ee0edd625..14aaa40323e9 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -283,6 +283,12 @@ static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id) /* Validate the processor object's proc_id */ bool acpi_duplicate_processor_id(int proc_id); +/* Processor _CTS control */ +#ifdef CONFIG_ACPI_PROCESSOR_CSTATE +bool acpi_processor_claim_cst_control(void); +#else +static inline bool acpi_processor_claim_cst_control(void) { return false; } +#endif #ifdef CONFIG_ACPI_HOTPLUG_CPU /* Arch dependent functions for cpu hotplug support */ -- Gitee From af9512fcbbdf9c8fbba55429a3d39492b88ad471 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 13 Dec 2019 09:55:24 +0100 Subject: [PATCH 1101/2161] ACPI: processor: Introduce acpi_processor_evaluate_cst() commit 987c785319b99e32602f7f86cfae3cf9b81e402b upstream. In order to separate the ACPI _CST evaluation from checks specific to the ACPI processor driver, move the majority of the acpi_processor_get_power_info_cst() function body to a new function, acpi_processor_evaluate_cst(), that will extract the C-states information from _CST output, and redefine acpi_processor_get_power_info_cst() as a wrapper around it. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/processor_idle.c | 52 +++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index dd737d836c03..e92d0e6d4cd1 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -297,21 +297,17 @@ static int acpi_processor_get_power_info_default(struct acpi_processor *pr) return 0; } -static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) +static int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, + struct acpi_processor_power *info) { + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *cst; acpi_status status; u64 count; - int current_count; + int current_count = 0; int i, ret = 0; - struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; - union acpi_object *cst; - - if (nocst) - return -ENODEV; - current_count = 0; - - status = acpi_evaluate_object(pr->handle, "_CST", NULL, &buffer); + status = acpi_evaluate_object(handle, "_CST", NULL, &buffer); if (ACPI_FAILURE(status)) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No _CST, giving up\n")); return -ENODEV; @@ -335,9 +331,6 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) goto end; } - /* Tell driver that at least _CST is supported. */ - pr->flags.has_cst = 1; - for (i = 1; i <= count; i++) { union acpi_object *element; union acpi_object *obj; @@ -383,7 +376,7 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) cx.entry_method = ACPI_CSTATE_SYSTEMIO; if (reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) { if (acpi_processor_ffh_cstate_probe - (pr->id, &cx, reg) == 0) { + (cpu, &cx, reg) == 0) { cx.entry_method = ACPI_CSTATE_FFH; } else if (cx.type == ACPI_STATE_C1) { /* @@ -432,7 +425,7 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) continue; current_count++; - memcpy(&(pr->power.states[current_count]), &cx, sizeof(cx)); + memcpy(&info->states[current_count], &cx, sizeof(cx)); /* * We support total ACPI_PROCESSOR_MAX_POWER - 1 @@ -446,12 +439,9 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) } } - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %d power states\n", - current_count)); + acpi_handle_info(handle, "Found %d idle states\n", current_count); - /* Validate number of power states discovered */ - if (current_count < 2) - ret = -EFAULT; + info->count = current_count; end: kfree(buffer.pointer); @@ -459,6 +449,28 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) return ret; } +static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) +{ + int ret; + + if (nocst) + return -ENODEV; + + ret = acpi_processor_evaluate_cst(pr->handle, pr->id, &pr->power); + if (ret) + return ret; + + /* + * It is expected that there will be at least 2 states, C1 and + * something else (C2 or C3), so fail if that is not the case. + */ + if (pr->power.count < 2) + return -EFAULT; + + pr->flags.has_cst = 1; + return 0; +} + static void acpi_processor_power_verify_c3(struct acpi_processor *pr, struct acpi_processor_cx *cx) { -- Gitee From b2444e15e90b7b3bc140c9c0cb686ec7027f74d2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 13 Dec 2019 09:55:33 +0100 Subject: [PATCH 1102/2161] ACPI: processor: Clean up acpi_processor_evaluate_cst() commit aa659a3fca79abd9a3ca3ddc535db631b01c9a02 upstream. 
Clean up acpi_processor_evaluate_cst() in multiple ways: * Rename current_count to last_index which matches the purpose of the variable better. * Consistently use acpi_handle_*() for printing messages and make the messages cleaner. * Drop redundant parens and braces. * Rewrite and clarify comments. * Rearrange checks and drop the redundant ones. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/processor_idle.c | 114 ++++++++++++++++------------------ 1 file changed, 52 insertions(+), 62 deletions(-) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index e92d0e6d4cd1..7c2fe3b2ec31 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -304,29 +304,29 @@ static int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, union acpi_object *cst; acpi_status status; u64 count; - int current_count = 0; + int last_index = 0; int i, ret = 0; status = acpi_evaluate_object(handle, "_CST", NULL, &buffer); if (ACPI_FAILURE(status)) { - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No _CST, giving up\n")); + acpi_handle_debug(handle, "No _CST\n"); return -ENODEV; } cst = buffer.pointer; - /* There must be at least 2 elements */ - if (!cst || (cst->type != ACPI_TYPE_PACKAGE) || cst->package.count < 2) { - pr_err("not enough elements in _CST\n"); + /* There must be at least 2 elements. */ + if (!cst || cst->type != ACPI_TYPE_PACKAGE || cst->package.count < 2) { + acpi_handle_warn(handle, "Invalid _CST output\n"); ret = -EFAULT; goto end; } count = cst->package.elements[0].integer.value; - /* Validate number of power states. */ + /* Validate the number of C-states. */ if (count < 1 || count != cst->package.count - 1) { - pr_err("count given by _CST is not valid\n"); + acpi_handle_warn(handle, "Inconsistent _CST data\n"); ret = -EFAULT; goto end; } @@ -337,111 +337,101 @@ static int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, struct acpi_power_register *reg; struct acpi_processor_cx cx; + /* + * If there is not enough space for all C-states, skip the + * excess ones and log a warning. + */ + if (last_index >= ACPI_PROCESSOR_MAX_POWER - 1) { + acpi_handle_warn(handle, + "No room for more idle states (limit: %d)\n", + ACPI_PROCESSOR_MAX_POWER - 1); + break; + } + memset(&cx, 0, sizeof(cx)); - element = &(cst->package.elements[i]); + element = &cst->package.elements[i]; if (element->type != ACPI_TYPE_PACKAGE) continue; if (element->package.count != 4) continue; - obj = &(element->package.elements[0]); + obj = &element->package.elements[0]; if (obj->type != ACPI_TYPE_BUFFER) continue; reg = (struct acpi_power_register *)obj->buffer.pointer; - if (reg->space_id != ACPI_ADR_SPACE_SYSTEM_IO && - (reg->space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) - continue; - - /* There should be an easy way to extract an integer... */ - obj = &(element->package.elements[1]); + obj = &element->package.elements[1]; if (obj->type != ACPI_TYPE_INTEGER) continue; cx.type = obj->integer.value; /* - * Some buggy BIOSes won't list C1 in _CST - - * Let acpi_processor_get_power_info_default() handle them later + * There are known cases in which the _CST output does not + * contain C1, so if the type of the first state found is not + * C1, leave an empty slot for C1 to be filled in later. 
*/ if (i == 1 && cx.type != ACPI_STATE_C1) - current_count++; + last_index = 1; cx.address = reg->address; - cx.index = current_count + 1; + cx.index = last_index + 1; - cx.entry_method = ACPI_CSTATE_SYSTEMIO; if (reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) { - if (acpi_processor_ffh_cstate_probe - (cpu, &cx, reg) == 0) { - cx.entry_method = ACPI_CSTATE_FFH; + if (!acpi_processor_ffh_cstate_probe(cpu, &cx, reg)) { + /* + * In the majority of cases _CST describes C1 as + * a FIXED_HARDWARE C-state, but if the command + * line forbids using MWAIT, use CSTATE_HALT for + * C1 regardless. + */ + if (cx.type == ACPI_STATE_C1 && + boot_option_idle_override == IDLE_NOMWAIT) { + cx.entry_method = ACPI_CSTATE_HALT; + snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); + } else { + cx.entry_method = ACPI_CSTATE_FFH; + } } else if (cx.type == ACPI_STATE_C1) { /* - * C1 is a special case where FIXED_HARDWARE - * can be handled in non-MWAIT way as well. - * In that case, save this _CST entry info. - * Otherwise, ignore this info and continue. + * In the special case of C1, FIXED_HARDWARE can + * be handled by executing the HLT instruction. */ cx.entry_method = ACPI_CSTATE_HALT; snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); } else { continue; } - if (cx.type == ACPI_STATE_C1 && - (boot_option_idle_override == IDLE_NOMWAIT)) { - /* - * In most cases the C1 space_id obtained from - * _CST object is FIXED_HARDWARE access mode. - * But when the option of idle=halt is added, - * the entry_method type should be changed from - * CSTATE_FFH to CSTATE_HALT. - * When the option of idle=nomwait is added, - * the C1 entry_method type should be - * CSTATE_HALT. - */ - cx.entry_method = ACPI_CSTATE_HALT; - snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); - } - } else { + } else if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_IO) { + cx.entry_method = ACPI_CSTATE_SYSTEMIO; snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI IOPORT 0x%x", cx.address); + } else { + continue; } - if (cx.type == ACPI_STATE_C1) { + if (cx.type == ACPI_STATE_C1) cx.valid = 1; - } - obj = &(element->package.elements[2]); + obj = &element->package.elements[2]; if (obj->type != ACPI_TYPE_INTEGER) continue; cx.latency = obj->integer.value; - obj = &(element->package.elements[3]); + obj = &element->package.elements[3]; if (obj->type != ACPI_TYPE_INTEGER) continue; - current_count++; - memcpy(&info->states[current_count], &cx, sizeof(cx)); - - /* - * We support total ACPI_PROCESSOR_MAX_POWER - 1 - * (From 1 through ACPI_PROCESSOR_MAX_POWER - 1) - */ - if (current_count >= (ACPI_PROCESSOR_MAX_POWER - 1)) { - pr_warn("Limiting number of power states to max (%d)\n", - ACPI_PROCESSOR_MAX_POWER); - pr_warn("Please increase ACPI_PROCESSOR_MAX_POWER if needed.\n"); - break; - } + memcpy(&info->states[++last_index], &cx, sizeof(cx)); } - acpi_handle_info(handle, "Found %d idle states\n", current_count); + acpi_handle_info(handle, "Found %d idle states\n", last_index); - info->count = current_count; + info->count = last_index; end: kfree(buffer.pointer); -- Gitee From 085052768729b4d9fe5e8db615511a49ded6d75a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 16 Dec 2019 12:07:01 +0100 Subject: [PATCH 1103/2161] ACPI: processor: Make ACPI_PROCESSOR_CSTATE depend on ACPI_PROCESSOR commit 239ed06d0eefc4ee351af69bd64742ada56c4bdb upstream. To avoid build errors when CONFIG_ACPI_PROCESSOR_CSTATE is set and CONFIG_ACPI_PROCESSOR is not (that may appear in randconfig builds), make the former depend on the latter. Acked-by: Len Brown Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index ebe1e9e5fd81..5559ef264060 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -241,6 +241,7 @@ config ACPI_CPU_FREQ_PSS config ACPI_PROCESSOR_CSTATE def_bool y + depends on ACPI_PROCESSOR depends on IA64 || X86 config ACPI_PROCESSOR_IDLE -- Gitee From 20255ddcbd767b15635d2d14754e79ae1afc329c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 13 Dec 2019 09:55:42 +0100 Subject: [PATCH 1104/2161] ACPI: processor: Export acpi_processor_evaluate_cst() commit bc94638886ab21f8247d3f7f39573d3feb7d8284 upstream. The intel_idle driver will be modified to use ACPI _CST subsequently and it will need to call acpi_processor_evaluate_cst(), so move that function to acpi_processor.c so that it is always present (which is required by intel_idle) and export it to modules to allow the ACPI processor driver (which is modular) to call it. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/acpi_processor.c | 157 ++++++++++++++++++++++++++++++++++ drivers/acpi/processor_idle.c | 142 ------------------------------ include/linux/acpi.h | 9 ++ 3 files changed, 166 insertions(+), 142 deletions(-) diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index 8a53f3c5b70e..5379bc3f275d 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -729,4 +729,161 @@ bool acpi_processor_claim_cst_control(void) return true; } EXPORT_SYMBOL_GPL(acpi_processor_claim_cst_control); + +/** + * acpi_processor_evaluate_cst - Evaluate the processor _CST control method. + * @handle: ACPI handle of the processor object containing the _CST. + * @cpu: The numeric ID of the target CPU. + * @info: Object write the C-states information into. + * + * Extract the C-state information for the given CPU from the output of the _CST + * control method under the corresponding ACPI processor object (or processor + * device object) and populate @info with it. + * + * If any ACPI_ADR_SPACE_FIXED_HARDWARE C-states are found, invoke + * acpi_processor_ffh_cstate_probe() to verify them and update the + * cpu_cstate_entry data for @cpu. + */ +int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, + struct acpi_processor_power *info) +{ + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *cst; + acpi_status status; + u64 count; + int last_index = 0; + int i, ret = 0; + + status = acpi_evaluate_object(handle, "_CST", NULL, &buffer); + if (ACPI_FAILURE(status)) { + acpi_handle_debug(handle, "No _CST\n"); + return -ENODEV; + } + + cst = buffer.pointer; + + /* There must be at least 2 elements. */ + if (!cst || cst->type != ACPI_TYPE_PACKAGE || cst->package.count < 2) { + acpi_handle_warn(handle, "Invalid _CST output\n"); + ret = -EFAULT; + goto end; + } + + count = cst->package.elements[0].integer.value; + + /* Validate the number of C-states. */ + if (count < 1 || count != cst->package.count - 1) { + acpi_handle_warn(handle, "Inconsistent _CST data\n"); + ret = -EFAULT; + goto end; + } + + for (i = 1; i <= count; i++) { + union acpi_object *element; + union acpi_object *obj; + struct acpi_power_register *reg; + struct acpi_processor_cx cx; + + /* + * If there is not enough space for all C-states, skip the + * excess ones and log a warning. 
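+ * (info->states[] receives at most ACPI_PROCESSOR_MAX_POWER - 1 entries, filled starting at index 1.)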
+ */ + if (last_index >= ACPI_PROCESSOR_MAX_POWER - 1) { + acpi_handle_warn(handle, + "No room for more idle states (limit: %d)\n", + ACPI_PROCESSOR_MAX_POWER - 1); + break; + } + + memset(&cx, 0, sizeof(cx)); + + element = &cst->package.elements[i]; + if (element->type != ACPI_TYPE_PACKAGE) + continue; + + if (element->package.count != 4) + continue; + + obj = &element->package.elements[0]; + + if (obj->type != ACPI_TYPE_BUFFER) + continue; + + reg = (struct acpi_power_register *)obj->buffer.pointer; + + obj = &element->package.elements[1]; + if (obj->type != ACPI_TYPE_INTEGER) + continue; + + cx.type = obj->integer.value; + /* + * There are known cases in which the _CST output does not + * contain C1, so if the type of the first state found is not + * C1, leave an empty slot for C1 to be filled in later. + */ + if (i == 1 && cx.type != ACPI_STATE_C1) + last_index = 1; + + cx.address = reg->address; + cx.index = last_index + 1; + + if (reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) { + if (!acpi_processor_ffh_cstate_probe(cpu, &cx, reg)) { + /* + * In the majority of cases _CST describes C1 as + * a FIXED_HARDWARE C-state, but if the command + * line forbids using MWAIT, use CSTATE_HALT for + * C1 regardless. + */ + if (cx.type == ACPI_STATE_C1 && + boot_option_idle_override == IDLE_NOMWAIT) { + cx.entry_method = ACPI_CSTATE_HALT; + snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); + } else { + cx.entry_method = ACPI_CSTATE_FFH; + } + } else if (cx.type == ACPI_STATE_C1) { + /* + * In the special case of C1, FIXED_HARDWARE can + * be handled by executing the HLT instruction. + */ + cx.entry_method = ACPI_CSTATE_HALT; + snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); + } else { + continue; + } + } else if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_IO) { + cx.entry_method = ACPI_CSTATE_SYSTEMIO; + snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI IOPORT 0x%x", + cx.address); + } else { + continue; + } + + if (cx.type == ACPI_STATE_C1) + cx.valid = 1; + + obj = &element->package.elements[2]; + if (obj->type != ACPI_TYPE_INTEGER) + continue; + + cx.latency = obj->integer.value; + + obj = &element->package.elements[3]; + if (obj->type != ACPI_TYPE_INTEGER) + continue; + + memcpy(&info->states[++last_index], &cx, sizeof(cx)); + } + + acpi_handle_info(handle, "Found %d idle states\n", last_index); + + info->count = last_index; + + end: + kfree(buffer.pointer); + + return ret; +} +EXPORT_SYMBOL_GPL(acpi_processor_evaluate_cst); #endif /* CONFIG_ACPI_PROCESSOR_CSTATE */ diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 7c2fe3b2ec31..dcc289e30166 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -297,148 +297,6 @@ static int acpi_processor_get_power_info_default(struct acpi_processor *pr) return 0; } -static int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, - struct acpi_processor_power *info) -{ - struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; - union acpi_object *cst; - acpi_status status; - u64 count; - int last_index = 0; - int i, ret = 0; - - status = acpi_evaluate_object(handle, "_CST", NULL, &buffer); - if (ACPI_FAILURE(status)) { - acpi_handle_debug(handle, "No _CST\n"); - return -ENODEV; - } - - cst = buffer.pointer; - - /* There must be at least 2 elements. */ - if (!cst || cst->type != ACPI_TYPE_PACKAGE || cst->package.count < 2) { - acpi_handle_warn(handle, "Invalid _CST output\n"); - ret = -EFAULT; - goto end; - } - - count = cst->package.elements[0].integer.value; - - /* Validate the number of C-states. 
*/ - if (count < 1 || count != cst->package.count - 1) { - acpi_handle_warn(handle, "Inconsistent _CST data\n"); - ret = -EFAULT; - goto end; - } - - for (i = 1; i <= count; i++) { - union acpi_object *element; - union acpi_object *obj; - struct acpi_power_register *reg; - struct acpi_processor_cx cx; - - /* - * If there is not enough space for all C-states, skip the - * excess ones and log a warning. - */ - if (last_index >= ACPI_PROCESSOR_MAX_POWER - 1) { - acpi_handle_warn(handle, - "No room for more idle states (limit: %d)\n", - ACPI_PROCESSOR_MAX_POWER - 1); - break; - } - - memset(&cx, 0, sizeof(cx)); - - element = &cst->package.elements[i]; - if (element->type != ACPI_TYPE_PACKAGE) - continue; - - if (element->package.count != 4) - continue; - - obj = &element->package.elements[0]; - - if (obj->type != ACPI_TYPE_BUFFER) - continue; - - reg = (struct acpi_power_register *)obj->buffer.pointer; - - obj = &element->package.elements[1]; - if (obj->type != ACPI_TYPE_INTEGER) - continue; - - cx.type = obj->integer.value; - /* - * There are known cases in which the _CST output does not - * contain C1, so if the type of the first state found is not - * C1, leave an empty slot for C1 to be filled in later. - */ - if (i == 1 && cx.type != ACPI_STATE_C1) - last_index = 1; - - cx.address = reg->address; - cx.index = last_index + 1; - - if (reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) { - if (!acpi_processor_ffh_cstate_probe(cpu, &cx, reg)) { - /* - * In the majority of cases _CST describes C1 as - * a FIXED_HARDWARE C-state, but if the command - * line forbids using MWAIT, use CSTATE_HALT for - * C1 regardless. - */ - if (cx.type == ACPI_STATE_C1 && - boot_option_idle_override == IDLE_NOMWAIT) { - cx.entry_method = ACPI_CSTATE_HALT; - snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); - } else { - cx.entry_method = ACPI_CSTATE_FFH; - } - } else if (cx.type == ACPI_STATE_C1) { - /* - * In the special case of C1, FIXED_HARDWARE can - * be handled by executing the HLT instruction. 
- */ - cx.entry_method = ACPI_CSTATE_HALT; - snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); - } else { - continue; - } - } else if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_IO) { - cx.entry_method = ACPI_CSTATE_SYSTEMIO; - snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI IOPORT 0x%x", - cx.address); - } else { - continue; - } - - if (cx.type == ACPI_STATE_C1) - cx.valid = 1; - - obj = &element->package.elements[2]; - if (obj->type != ACPI_TYPE_INTEGER) - continue; - - cx.latency = obj->integer.value; - - obj = &element->package.elements[3]; - if (obj->type != ACPI_TYPE_INTEGER) - continue; - - memcpy(&info->states[++last_index], &cx, sizeof(cx)); - } - - acpi_handle_info(handle, "Found %d idle states\n", last_index); - - info->count = last_index; - - end: - kfree(buffer.pointer); - - return ret; -} - static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) { int ret; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 14aaa40323e9..fb43a7940a68 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -284,10 +284,19 @@ static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id) /* Validate the processor object's proc_id */ bool acpi_duplicate_processor_id(int proc_id); /* Processor _CTS control */ +struct acpi_processor_power; + #ifdef CONFIG_ACPI_PROCESSOR_CSTATE bool acpi_processor_claim_cst_control(void); +int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, + struct acpi_processor_power *info); #else static inline bool acpi_processor_claim_cst_control(void) { return false; } +static inline int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, + struct acpi_processor_power *info) +{ + return -ENODEV; +} #endif #ifdef CONFIG_ACPI_HOTPLUG_CPU -- Gitee From 9241be6f7ef56c1d4a8a540e7d601a71f235d749 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 13 Dec 2019 09:55:52 +0100 Subject: [PATCH 1105/2161] intel_idle: Refactor intel_idle_cpuidle_driver_init() commit 9f3d6daf61e5156139cd05643f7f1c2a9b7b49b0 upstream. Move the C-state verification and checks from intel_idle_cpuidle_driver_init() to a separate function, intel_idle_verify_cstate(), and make the former call it after checking the CPUIDLE_FLAG_UNUSABLE state flag. Also combine the drv->states[] updates with the incrementation of drv->state_count. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 49 ++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 75fd2a7b0842..47255d3cf51f 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -944,6 +944,22 @@ static void intel_idle_s2idle(struct cpuidle_device *dev, mwait_idle_with_hints(eax, ecx); } +static bool intel_idle_verify_cstate(unsigned int mwait_hint) +{ + unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1; + unsigned int num_substates = (mwait_substates >> mwait_cstate * 4) & + MWAIT_SUBSTATE_MASK; + + /* Ignore the C-state if there are NO sub-states in CPUID for it. 
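+ * (mwait_substates holds the EDX output of the MWAIT CPUID leaf, a 4-bit sub-state count per C-state.)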
*/ + if (num_substates == 0) + return false; + + if (mwait_cstate > 2 && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + mark_tsc_unstable("TSC halts in idle states deeper than C2"); + + return true; +} + static void __setup_broadcast_timer(bool on) { if (on) @@ -1332,10 +1348,10 @@ static void __init intel_idle_cpuidle_driver_init(void) drv->state_count = 1; for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { - int num_substates, mwait_hint, mwait_cstate; + unsigned int mwait_hint; - if ((cpuidle_state_table[cstate].enter == NULL) && - (cpuidle_state_table[cstate].enter_s2idle == NULL)) + if (!cpuidle_state_table[cstate].enter && + !cpuidle_state_table[cstate].enter_s2idle) break; if (cstate + 1 > max_cstate) { @@ -1343,34 +1359,19 @@ static void __init intel_idle_cpuidle_driver_init(void) break; } - mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags); - mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint); - - /* number of sub-states for this state in CPUID.MWAIT */ - num_substates = (mwait_substates >> ((mwait_cstate + 1) * 4)) - & MWAIT_SUBSTATE_MASK; - - /* if NO sub-states for this state in CPUID, skip it */ - if (num_substates == 0) - continue; - - /* if state marked as disabled, skip it */ + /* If marked as unusable, skip this state. */ if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_UNUSABLE) { pr_debug("state %s is disabled\n", cpuidle_state_table[cstate].name); continue; } + mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags); + if (!intel_idle_verify_cstate(mwait_hint)) + continue; - if (((mwait_cstate + 1) > 2) && - !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) - mark_tsc_unstable("TSC halts in idle" - " states deeper than C2"); - - drv->states[drv->state_count] = /* structure copy */ - cpuidle_state_table[cstate]; - - drv->state_count += 1; + /* Structure copy. */ + drv->states[drv->state_count++] = cpuidle_state_table[cstate]; } if (icpu->byt_auto_demotion_disable_flag) { -- Gitee From da976cf2b6321513f1ea91cb9543563de7bdaa83 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 13 Dec 2019 09:56:01 +0100 Subject: [PATCH 1106/2161] intel_idle: Use ACPI _CST for processor models without C-state tables commit 18734958e9bfbc055805d110a38dc76307eba742 upstream. Modify the intel_idle driver to get the C-states information from ACPI _CST if the processor model is not recognized by it. The processor is still required to support MWAIT and the information from ACPI _CST will only be used if all of the C-states listed by _CST are of the ACPI_CSTATE_FFH type (which means that they are expected to be entered via MWAIT). Moreover, the driver assumes that the _CST information is the same for all CPUs in the system, so it is sufficient to evaluate _CST for one of them and extract the common list of C-states from there. Also _CST is evaluated once at the system initialization time and the driver does not respond to _CST change notifications (that can be changed in the future). The main functional difference between intel_idle with this change and the ACPI processor driver is that the former sets the target residency to be equal to the exit latency (provided by _CST) for C1-type C-states and to 3 times the exit latency value for the other C-state types, whereas the latter obtains the target residency by multiplying the exit latency by the same number (2 by default) for all C-state types. Therefore it is expected that in general using the former instead of the latter on the same system will lead to improved energy-efficiency. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 190 ++++++++++++++++++++++++++++++++------ 1 file changed, 162 insertions(+), 28 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 47255d3cf51f..28812d93d59a 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -41,6 +41,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -1111,6 +1112,129 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { {} }; +#define INTEL_CPU_FAM6_MWAIT \ + { X86_VENDOR_INTEL, 6, X86_MODEL_ANY, X86_FEATURE_MWAIT, 0 } + +static const struct x86_cpu_id intel_mwait_ids[] __initconst = { + INTEL_CPU_FAM6_MWAIT, + {} +}; + +static bool intel_idle_max_cstate_reached(int cstate) +{ + if (cstate + 1 > max_cstate) { + pr_info("max_cstate %d reached\n", max_cstate); + return true; + } + return false; +} + +#ifdef CONFIG_ACPI_PROCESSOR_CSTATE +#include + +static struct acpi_processor_power acpi_state_table; + +/** + * intel_idle_cst_usable - Check if the _CST information can be used. + * + * Check if all of the C-states listed by _CST in the max_cstate range are + * ACPI_CSTATE_FFH, which means that they should be entered via MWAIT. + */ +static bool intel_idle_cst_usable(void) +{ + int cstate, limit; + + limit = min_t(int, min_t(int, CPUIDLE_STATE_MAX, max_cstate + 1), + acpi_state_table.count); + + for (cstate = 1; cstate < limit; cstate++) { + struct acpi_processor_cx *cx = &acpi_state_table.states[cstate]; + + if (cx->entry_method != ACPI_CSTATE_FFH) + return false; + } + + return true; +} + +static bool intel_idle_acpi_cst_extract(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + struct acpi_processor *pr = per_cpu(processors, cpu); + + if (!pr) + continue; + + if (acpi_processor_evaluate_cst(pr->handle, cpu, &acpi_state_table)) + continue; + + acpi_state_table.count++; + + if (!intel_idle_cst_usable()) + continue; + + if (!acpi_processor_claim_cst_control()) { + acpi_state_table.count = 0; + return false; + } + + return true; + } + + pr_debug("ACPI _CST not found or not usable\n"); + return false; +} + +static void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) +{ + int cstate, limit = min_t(int, CPUIDLE_STATE_MAX, acpi_state_table.count); + + /* + * If limit > 0, intel_idle_cst_usable() has returned 'true', so all of + * the interesting states are ACPI_CSTATE_FFH. + */ + for (cstate = 1; cstate < limit; cstate++) { + struct acpi_processor_cx *cx; + struct cpuidle_state *state; + + if (intel_idle_max_cstate_reached(cstate)) + break; + + cx = &acpi_state_table.states[cstate]; + + state = &drv->states[drv->state_count++]; + + snprintf(state->name, CPUIDLE_NAME_LEN, "C%d_ACPI", cstate); + strlcpy(state->desc, cx->desc, CPUIDLE_DESC_LEN); + state->exit_latency = cx->latency; + /* + * For C1-type C-states use the same number for both the exit + * latency and target residency, because that is the case for + * C1 in the majority of the static C-states tables above. + * For the other types of C-states, however, set the target + * residency to 3 times the exit latency which should lead to + * a reasonable balance between energy-efficiency and + * performance in the majority of interesting cases. 
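+ * (For example, a C-state deeper than C1 with a 100 us exit latency is given a 300 us target residency.)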
+ */ + state->target_residency = cx->latency; + if (cx->type > ACPI_STATE_C1) + state->target_residency *= 3; + + state->flags = MWAIT2flg(cx->address); + if (cx->type > ACPI_STATE_C2) + state->flags |= CPUIDLE_FLAG_TLB_FLUSHED; + + state->enter = intel_idle; + state->enter_s2idle = intel_idle_s2idle; + } +} +#else /* !CONFIG_ACPI_PROCESSOR_CSTATE */ +static inline bool intel_idle_acpi_cst_extract(void) { return false; } +static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { } +#endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */ + /* * intel_idle_probe() */ @@ -1125,17 +1249,15 @@ static int __init intel_idle_probe(void) } id = x86_match_cpu(intel_idle_ids); - if (!id) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6) - pr_debug("does not run on family %d model %d\n", - boot_cpu_data.x86, boot_cpu_data.x86_model); - return -ENODEV; - } - - if (!boot_cpu_has(X86_FEATURE_MWAIT)) { - pr_debug("Please enable MWAIT in BIOS SETUP\n"); - return -ENODEV; + if (id) { + if (!boot_cpu_has(X86_FEATURE_MWAIT)) { + pr_debug("Please enable MWAIT in BIOS SETUP\n"); + return -ENODEV; + } + } else { + id = x86_match_cpu(intel_mwait_ids); + if (!id) + return -ENODEV; } if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) @@ -1151,7 +1273,10 @@ static int __init intel_idle_probe(void) pr_debug("MWAIT substates: 0x%x\n", mwait_substates); icpu = (const struct idle_cpu *)id->driver_data; - cpuidle_state_table = icpu->state_table; + if (icpu) + cpuidle_state_table = icpu->state_table; + else if (!intel_idle_acpi_cst_extract()) + return -ENODEV; pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n", boot_cpu_data.x86_model); @@ -1333,31 +1458,19 @@ static void intel_idle_state_table_update(void) } } -/* - * intel_idle_cpuidle_driver_init() - * allocate, initialize cpuidle_states - */ -static void __init intel_idle_cpuidle_driver_init(void) +static void intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) { int cstate; - struct cpuidle_driver *drv = &intel_idle_driver; - - intel_idle_state_table_update(); - - cpuidle_poll_state_init(drv); - drv->state_count = 1; for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { unsigned int mwait_hint; - if (!cpuidle_state_table[cstate].enter && - !cpuidle_state_table[cstate].enter_s2idle) + if (intel_idle_max_cstate_reached(cstate)) break; - if (cstate + 1 > max_cstate) { - pr_info("max_cstate %d reached\n", max_cstate); + if (!cpuidle_state_table[cstate].enter && + !cpuidle_state_table[cstate].enter_s2idle) break; - } /* If marked as unusable, skip this state. */ if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_UNUSABLE) { @@ -1380,6 +1493,24 @@ static void __init intel_idle_cpuidle_driver_init(void) } } +/* + * intel_idle_cpuidle_driver_init() + * allocate, initialize cpuidle_states + */ +static void __init intel_idle_cpuidle_driver_init(void) +{ + struct cpuidle_driver *drv = &intel_idle_driver; + + intel_idle_state_table_update(); + + cpuidle_poll_state_init(drv); + drv->state_count = 1; + + if (icpu) + intel_idle_init_cstates_icpu(drv); + else + intel_idle_init_cstates_acpi(drv); +} /* * intel_idle_cpu_init() @@ -1398,6 +1529,9 @@ static int intel_idle_cpu_init(unsigned int cpu) return -EIO; } + if (!icpu) + return 0; + if (icpu->auto_demotion_disable_flags) auto_demotion_disable(); -- Gitee From 273c3d4f6673b6201fc2c022d9f7397cf97bedd3 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Fri, 13 Dec 2019 09:56:13 +0100 Subject: [PATCH 1107/2161] cpuidle: Allow idle states to be disabled by default commit 75a80267410e38ab76c4ceb39753f96d72113781 upstream. In certain situations it may be useful to prevent some idle states from being used by default while allowing user space to enable them later on. For this purpose, introduce a new state flag, CPUIDLE_FLAG_OFF, to mark idle states that should be disabled by default, make the core set CPUIDLE_STATE_DISABLED_BY_USER for those states at the initialization time and add a new state attribute in sysfs, "default_status", to inform user space of the initial status of the given idle state ("disabled" if CPUIDLE_FLAG_OFF is set for it, "enabled" otherwise). Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/ABI/testing/sysfs-devices-system-cpu | 6 ++++++ Documentation/admin-guide/pm/cpuidle.rst | 3 +++ drivers/cpuidle/cpuidle.c | 6 +++++- drivers/cpuidle/sysfs.c | 10 ++++++++++ include/linux/cpuidle.h | 1 + 5 files changed, 25 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index c24afa60a30e..b39531a3c5bc 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -196,6 +196,12 @@ Description: does not reflect it. Likewise, if one enables a deep state but a lighter state still is disabled, then this has no effect. +What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/default_status +Date: December 2019 +KernelVersion: v5.6 +Contact: Linux power management list +Description: + (RO) The default status of this state, "enabled" or "disabled". What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/residency Date: March 2014 diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst index e70b365dbc60..311cd7cc2b75 100644 --- a/Documentation/admin-guide/pm/cpuidle.rst +++ b/Documentation/admin-guide/pm/cpuidle.rst @@ -506,6 +506,9 @@ object corresponding to it, as follows: ``disable`` Whether or not this idle state is disabled. +``default_status`` + The default status of this state, "enabled" or "disabled". + ``latency`` Exit latency of the idle state in microseconds. diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 9cc612387309..c35928ccc14d 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -576,10 +576,14 @@ static int __cpuidle_register_device(struct cpuidle_device *dev) if (!try_module_get(drv->owner)) return -EINVAL; - for (i = 0; i < drv->state_count; i++) + for (i = 0; i < drv->state_count; i++) { if (drv->states[i].flags & CPUIDLE_FLAG_UNUSABLE) dev->states_usage[i].disable |= CPUIDLE_STATE_DISABLED_BY_DRIVER; + if (drv->states[i].flags & CPUIDLE_FLAG_OFF) + dev->states_usage[i].disable |= CPUIDLE_STATE_DISABLED_BY_USER; + } + per_cpu(cpuidle_devices, dev->cpu) = dev; list_add(&dev->device_list, &cpuidle_detected_devices); diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index b41e51a978cf..55107565b319 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -329,6 +329,14 @@ static ssize_t store_state_disable(struct cpuidle_state *state, return size; } +static ssize_t show_state_default_status(struct cpuidle_state *state, + struct cpuidle_state_usage *state_usage, + char *buf) +{ + return sprintf(buf, "%s\n", + state->flags & CPUIDLE_FLAG_OFF ? 
"disabled" : "enabled"); +} + define_one_state_ro(name, show_state_name); define_one_state_ro(desc, show_state_desc); define_one_state_ro(latency, show_state_exit_latency); @@ -339,6 +347,7 @@ define_one_state_ro(time, show_state_time); define_one_state_rw(disable, show_state_disable, store_state_disable); define_one_state_ro(above, show_state_above); define_one_state_ro(below, show_state_below); +define_one_state_ro(default_status, show_state_default_status); static struct attribute *cpuidle_state_default_attrs[] = { &attr_name.attr, @@ -351,6 +360,7 @@ static struct attribute *cpuidle_state_default_attrs[] = { &attr_disable.attr, &attr_above.attr, &attr_below.attr, + &attr_default_status.attr, NULL }; diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 23744d49bc22..ec2ef63771f0 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -77,6 +77,7 @@ struct cpuidle_state { #define CPUIDLE_FLAG_COUPLED BIT(1) /* state applies to multiple cpus */ #define CPUIDLE_FLAG_TIMER_STOP BIT(2) /* timer is stopped on this state */ #define CPUIDLE_FLAG_UNUSABLE BIT(3) /* avoid using this state */ +#define CPUIDLE_FLAG_OFF BIT(4) /* disable this state by default */ struct cpuidle_device_kobj; struct cpuidle_state_kobj; -- Gitee From d0dccffaae951807fa981c860d4da07db7a30180 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 13 Dec 2019 09:56:21 +0100 Subject: [PATCH 1108/2161] intel_idle: Allow ACPI _CST to be used for selected known processors commit bff8e60a86f4960133f90ad9add9adeb082b8154 upstream. Update the intel_idle driver to get the C-states information from ACPI _CST in some cases in which the processor is known to the driver, as long as that information is available and the new use_acpi flag is set in the profile of the processor in question. In the cases when there is a specific table of C-states for the given processor in the driver, that table is used as the primary source of information on the available C-states, but if ACPI _CST is present, the C-states that are not listed by it will not be enabled by default (they still can be enabled later by user space via sysfs, though). The new CPUIDLE_FLAG_ALWAYS_ENABLE flag can be used for marking C-states that should be enabled by default even if they are not listed by ACPI _CST. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 45 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 28812d93d59a..a072b84d9595 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -80,6 +80,7 @@ struct idle_cpu { unsigned long auto_demotion_disable_flags; bool byt_auto_demotion_disable_flag; bool disable_promotion_to_c1e; + bool use_acpi; }; static const struct idle_cpu *icpu; @@ -90,6 +91,11 @@ static void intel_idle_s2idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); static struct cpuidle_state *cpuidle_state_table; +/* + * Enable this state by default even if the ACPI _CST does not list it. + */ +#define CPUIDLE_FLAG_ALWAYS_ENABLE BIT(15) + /* * Set this flag for states where the HW flushes the TLB for us * and so we don't need cross-calls to keep it consistent. 
@@ -1230,9 +1236,33 @@ static void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) state->enter_s2idle = intel_idle_s2idle; } } + +static bool intel_idle_off_by_default(u32 mwait_hint) +{ + int cstate, limit; + + /* + * If there are no _CST C-states, do not disable any C-states by + * default. + */ + if (!acpi_state_table.count) + return false; + + limit = min_t(int, CPUIDLE_STATE_MAX, acpi_state_table.count); + /* + * If limit > 0, intel_idle_cst_usable() has returned 'true', so all of + * the interesting states are ACPI_CSTATE_FFH. + */ + for (cstate = 1; cstate < limit; cstate++) { + if (acpi_state_table.states[cstate].address == mwait_hint) + return false; + } + return true; +} #else /* !CONFIG_ACPI_PROCESSOR_CSTATE */ static inline bool intel_idle_acpi_cst_extract(void) { return false; } static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { } +static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; } #endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */ /* @@ -1273,10 +1303,13 @@ static int __init intel_idle_probe(void) pr_debug("MWAIT substates: 0x%x\n", mwait_substates); icpu = (const struct idle_cpu *)id->driver_data; - if (icpu) + if (icpu) { cpuidle_state_table = icpu->state_table; - else if (!intel_idle_acpi_cst_extract()) + if (icpu->use_acpi) + intel_idle_acpi_cst_extract(); + } else if (!intel_idle_acpi_cst_extract()) { return -ENODEV; + } pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n", boot_cpu_data.x86_model); @@ -1484,7 +1517,13 @@ static void intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) continue; /* Structure copy. */ - drv->states[drv->state_count++] = cpuidle_state_table[cstate]; + drv->states[drv->state_count] = cpuidle_state_table[cstate]; + + if (icpu->use_acpi && intel_idle_off_by_default(mwait_hint) && + !(cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_ALWAYS_ENABLE)) + drv->states[drv->state_count].flags |= CPUIDLE_FLAG_OFF; + + drv->state_count++; } if (icpu->byt_auto_demotion_disable_flag) { -- Gitee From 8ee82c5dd8200066d2ad6e8d32058e3c6ff1aaea Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 13 Dec 2019 09:56:29 +0100 Subject: [PATCH 1109/2161] intel_idle: Add module parameter to prevent ACPI _CST from being used commit 4ec32d9e8e5b6d6eb491eeee3938665d8a2388fa upstream. Add a new module parameter called "no_acpi" to the intel_idle driver to allow the driver to be prevented from using ACPI _CST via kernel command line. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index a072b84d9595..26fe383bb921 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1138,6 +1138,10 @@ static bool intel_idle_max_cstate_reached(int cstate) #ifdef CONFIG_ACPI_PROCESSOR_CSTATE #include +static bool no_acpi __read_mostly; +module_param(no_acpi, bool, 0444); +MODULE_PARM_DESC(no_acpi, "Do not use ACPI _CST for building the idle states list"); + static struct acpi_processor_power acpi_state_table; /** @@ -1167,6 +1171,11 @@ static bool intel_idle_acpi_cst_extract(void) { unsigned int cpu; + if (no_acpi) { + pr_debug("Not allowed to use ACPI _CST\n"); + return false; + } + for_each_possible_cpu(cpu) { struct acpi_processor *pr = per_cpu(processors, cpu); -- Gitee From d23cc207dd0508e11bfedb4f06a9dde12b6bc0db Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Fri, 13 Dec 2019 09:56:38 +0100 Subject: [PATCH 1110/2161] intel_idle: Use ACPI _CST on server systems commit e6d4f08a677654385869ba0c39d7c9ceec47e5c5 upstream. In many cases, especially on server systems, it is desirable to avoid enabling C-states that have been disabled in the platform firmware (BIOS) setup, except for C1E. As a rule, the C-states disabled this way are not listed by ACPI _CST, so if that is used by intel_idle along with the specific table of C-states that it has for the given processor, the C-states disabled through the platform firmware will not be enabled by default by intel_idle. Accordingly, set the use_acpi flag (introduced previously) in all server processor profiles defined in intel_idle (so as to make it use ACPI _CST to decide which C-states to enable by default) and set the CPUIDLE_FLAG_ALWAYS_ENABLE flag (also introduced previously) for C1E in all C-states tables in intel_idle that contain C1 too (so that C1E is enabled regardless of whether or not it is listed by ACPI _CST). Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 70 ++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 26fe383bb921..1467490adfc3 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -131,7 +131,7 @@ static struct cpuidle_state nehalem_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, @@ -168,7 +168,7 @@ static struct cpuidle_state snb_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, @@ -303,7 +303,7 @@ static struct cpuidle_state ivb_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, @@ -348,7 +348,7 @@ static struct cpuidle_state ivt_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, .target_residency = 80, .enter = &intel_idle, @@ -385,7 +385,7 @@ static struct cpuidle_state ivt_cstates_4s[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, .target_residency = 250, .enter = &intel_idle, @@ -422,7 +422,7 @@ static struct cpuidle_state ivt_cstates_8s[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, .target_residency = 500, .enter = &intel_idle, @@ -459,7 +459,7 @@ static struct cpuidle_state hsw_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, @@ -527,7 +527,7 @@ static struct cpuidle_state bdw_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, @@ -596,7 +596,7 @@ static struct cpuidle_state skl_cstates[] = { { .name = 
"C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, @@ -665,7 +665,7 @@ static struct cpuidle_state skx_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, @@ -815,7 +815,7 @@ static struct cpuidle_state bxt_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, @@ -876,7 +876,7 @@ static struct cpuidle_state dnv_cstates[] = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, @@ -998,6 +998,13 @@ static const struct idle_cpu idle_cpu_nehalem = { .disable_promotion_to_c1e = true, }; +static const struct idle_cpu idle_cpu_nhx = { + .state_table = nehalem_cstates, + .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct idle_cpu idle_cpu_atom = { .state_table = atom_cstates, }; @@ -1016,6 +1023,12 @@ static const struct idle_cpu idle_cpu_snb = { .disable_promotion_to_c1e = true, }; +static const struct idle_cpu idle_cpu_snx = { + .state_table = snb_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct idle_cpu idle_cpu_byt = { .state_table = byt_cstates, .disable_promotion_to_c1e = true, @@ -1036,6 +1049,7 @@ static const struct idle_cpu idle_cpu_ivb = { static const struct idle_cpu idle_cpu_ivt = { .state_table = ivt_cstates, .disable_promotion_to_c1e = true, + .use_acpi = true, }; static const struct idle_cpu idle_cpu_hsw = { @@ -1043,11 +1057,23 @@ static const struct idle_cpu idle_cpu_hsw = { .disable_promotion_to_c1e = true, }; +static const struct idle_cpu idle_cpu_hsx = { + .state_table = hsw_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct idle_cpu idle_cpu_bdw = { .state_table = bdw_cstates, .disable_promotion_to_c1e = true, }; +static const struct idle_cpu idle_cpu_bdx = { + .state_table = bdw_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct idle_cpu idle_cpu_skl = { .state_table = skl_cstates, .disable_promotion_to_c1e = true, @@ -1056,15 +1082,18 @@ static const struct idle_cpu idle_cpu_skl = { static const struct idle_cpu idle_cpu_skx = { .state_table = skx_cstates, .disable_promotion_to_c1e = true, + .use_acpi = true, }; static const struct idle_cpu idle_cpu_avn = { .state_table = avn_cstates, .disable_promotion_to_c1e = true, + .use_acpi = true, }; static const struct idle_cpu idle_cpu_knl = { .state_table = knl_cstates, + .use_acpi = true, }; static const struct idle_cpu idle_cpu_bxt = { @@ -1075,20 +1104,21 @@ static const struct idle_cpu idle_cpu_bxt = { static const struct idle_cpu idle_cpu_dnv = { .state_table = dnv_cstates, .disable_promotion_to_c1e = true, + .use_acpi = true, }; static const struct x86_cpu_id intel_idle_ids[] __initconst = { - INTEL_CPU_FAM6(NEHALEM_EP, idle_cpu_nehalem), + INTEL_CPU_FAM6(NEHALEM_EP, idle_cpu_nhx), INTEL_CPU_FAM6(NEHALEM, idle_cpu_nehalem), INTEL_CPU_FAM6(NEHALEM_G, idle_cpu_nehalem), INTEL_CPU_FAM6(WESTMERE, idle_cpu_nehalem), - INTEL_CPU_FAM6(WESTMERE_EP, 
idle_cpu_nehalem), - INTEL_CPU_FAM6(NEHALEM_EX, idle_cpu_nehalem), + INTEL_CPU_FAM6(WESTMERE_EP, idle_cpu_nhx), + INTEL_CPU_FAM6(NEHALEM_EX, idle_cpu_nhx), INTEL_CPU_FAM6(ATOM_BONNELL, idle_cpu_atom), INTEL_CPU_FAM6(ATOM_BONNELL_MID, idle_cpu_lincroft), - INTEL_CPU_FAM6(WESTMERE_EX, idle_cpu_nehalem), + INTEL_CPU_FAM6(WESTMERE_EX, idle_cpu_nhx), INTEL_CPU_FAM6(SANDYBRIDGE, idle_cpu_snb), - INTEL_CPU_FAM6(SANDYBRIDGE_X, idle_cpu_snb), + INTEL_CPU_FAM6(SANDYBRIDGE_X, idle_cpu_snx), INTEL_CPU_FAM6(ATOM_SALTWELL, idle_cpu_atom), INTEL_CPU_FAM6(ATOM_SILVERMONT, idle_cpu_byt), INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, idle_cpu_tangier), @@ -1096,14 +1126,14 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { INTEL_CPU_FAM6(IVYBRIDGE, idle_cpu_ivb), INTEL_CPU_FAM6(IVYBRIDGE_X, idle_cpu_ivt), INTEL_CPU_FAM6(HASWELL, idle_cpu_hsw), - INTEL_CPU_FAM6(HASWELL_X, idle_cpu_hsw), + INTEL_CPU_FAM6(HASWELL_X, idle_cpu_hsx), INTEL_CPU_FAM6(HASWELL_L, idle_cpu_hsw), INTEL_CPU_FAM6(HASWELL_G, idle_cpu_hsw), INTEL_CPU_FAM6(ATOM_SILVERMONT_D, idle_cpu_avn), INTEL_CPU_FAM6(BROADWELL, idle_cpu_bdw), INTEL_CPU_FAM6(BROADWELL_G, idle_cpu_bdw), - INTEL_CPU_FAM6(BROADWELL_X, idle_cpu_bdw), - INTEL_CPU_FAM6(BROADWELL_D, idle_cpu_bdw), + INTEL_CPU_FAM6(BROADWELL_X, idle_cpu_bdx), + INTEL_CPU_FAM6(BROADWELL_D, idle_cpu_bdx), INTEL_CPU_FAM6(SKYLAKE_L, idle_cpu_skl), INTEL_CPU_FAM6(SKYLAKE, idle_cpu_skl), INTEL_CPU_FAM6(KABYLAKE_L, idle_cpu_skl), -- Gitee From 0050529af8da6524ca8464a86a150c2e0877e1d3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 15 Jan 2020 10:54:58 +0100 Subject: [PATCH 1111/2161] Documentation: admin-guide: PM: Add intel_idle document commit a3299182216397a0b943d2549d1997f4eba2bdd2 upstream. Add an admin-guide document for the intel_idle driver to describe how it works: how it enumerates idle states, what happens during the initialization of it, how it can be controlled via the kernel command line and so on. Signed-off-by: Rafael J. Wysocki Reviewed-by: Randy Dunlap Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/admin-guide/pm/intel_idle.rst | 246 ++++++++++++++++++ .../admin-guide/pm/working-state.rst | 1 + 2 files changed, 247 insertions(+) create mode 100644 Documentation/admin-guide/pm/intel_idle.rst diff --git a/Documentation/admin-guide/pm/intel_idle.rst b/Documentation/admin-guide/pm/intel_idle.rst new file mode 100644 index 000000000000..afbf778035f8 --- /dev/null +++ b/Documentation/admin-guide/pm/intel_idle.rst @@ -0,0 +1,246 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +============================================== +``intel_idle`` CPU Idle Time Management Driver +============================================== + +:Copyright: |copy| 2020 Intel Corporation + +:Author: Rafael J. Wysocki + + +General Information +=================== + +``intel_idle`` is a part of the +:doc:`CPU idle time management subsystem ` in the Linux kernel +(``CPUIdle``). It is the default CPU idle time management driver for the +Nehalem and later generations of Intel processors, but the level of support for +a particular processor model in it depends on whether or not it recognizes that +processor model and may also depend on information coming from the platform +firmware. [To understand ``intel_idle`` it is necessary to know how ``CPUIdle`` +works in general, so this is the time to get familiar with :doc:`cpuidle` if you +have not done that yet.] 
+ +``intel_idle`` uses the ``MWAIT`` instruction to inform the processor that the +logical CPU executing it is idle and so it may be possible to put some of the +processor's functional blocks into low-power states. That instruction takes two +arguments (passed in the ``EAX`` and ``ECX`` registers of the target CPU), the +first of which, referred to as a *hint*, can be used by the processor to +determine what can be done (for details refer to Intel Software Developer’s +Manual [1]_). Accordingly, ``intel_idle`` refuses to work with processors in +which the support for the ``MWAIT`` instruction has been disabled (for example, +via the platform firmware configuration menu) or which do not support that +instruction at all. + +``intel_idle`` is not modular, so it cannot be unloaded, which means that the +only way to pass early-configuration-time parameters to it is via the kernel +command line. + + +.. _intel-idle-enumeration-of-states: + +Enumeration of Idle States +========================== + +Each ``MWAIT`` hint value is interpreted by the processor as a license to +reconfigure itself in a certain way in order to save energy. The processor +configurations (with reduced power draw) resulting from that are referred to +as C-states (in the ACPI terminology) or idle states. The list of meaningful +``MWAIT`` hint values and idle states (i.e. low-power configurations of the +processor) corresponding to them depends on the processor model and it may also +depend on the configuration of the platform. + +In order to create a list of available idle states required by the ``CPUIdle`` +subsystem (see :ref:`idle-states-representation` in :doc:`cpuidle`), +``intel_idle`` can use two sources of information: static tables of idle states +for different processor models included in the driver itself and the ACPI tables +of the system. The former are always used if the processor model at hand is +recognized by ``intel_idle`` and the latter are used if that is required for +the given processor model (which is the case for all server processor models +recognized by ``intel_idle``) or if the processor model is not recognized. + +If the ACPI tables are going to be used for building the list of available idle +states, ``intel_idle`` first looks for a ``_CST`` object under one of the ACPI +objects corresponding to the CPUs in the system (refer to the ACPI specification +[2]_ for the description of ``_CST`` and its output package). Because the +``CPUIdle`` subsystem expects that the list of idle states supplied by the +driver will be suitable for all of the CPUs handled by it and ``intel_idle`` is +registered as the ``CPUIdle`` driver for all of the CPUs in the system, the +driver looks for the first ``_CST`` object returning at least one valid idle +state description and such that all of the idle states included in its return +package are of the FFH (Functional Fixed Hardware) type, which means that the +``MWAIT`` instruction is expected to be used to tell the processor that it can +enter one of them. The return package of that ``_CST`` is then assumed to be +applicable to all of the other CPUs in the system and the idle state +descriptions extracted from it are stored in a preliminary list of idle states +coming from the ACPI tables. [This step is skipped if ``intel_idle`` is +configured to ignore the ACPI tables; see `below `_.] 
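As a concrete illustration of the hint encoding discussed above, the short standalone C sketch below packs and unpacks ``MWAIT`` hints the same way the driver's ``MWAIT2flg()``/``flg2MWAIT()`` macros (visible in later hunks of this series) do. It is not part of any patch here: the mask and shift constants are copied from ``arch/x86/include/asm/mwait.h``, and the sample hint values are only illustrative (``0x01`` is C1E in the state tables of the previous patch; ``0x20`` is typically C6)::

    /*
     * Standalone illustration of the MWAIT hint encoding used by intel_idle.
     * Constant values mirror arch/x86/include/asm/mwait.h; this is not a
     * kernel build unit.
     */
    #include <stdio.h>

    #define MWAIT_SUBSTATE_MASK  0xf
    #define MWAIT_CSTATE_MASK    0xf
    #define MWAIT_SUBSTATE_SIZE  4

    /* Same packing as the driver's MWAIT2flg()/flg2MWAIT() macros. */
    #define MWAIT2flg(eax)   (((eax) & 0xFF) << 24)
    #define flg2MWAIT(flags) (((flags) >> 24) & 0xFF)

    int main(void)
    {
        /* Hint 0x01 is C1E in the state tables above; 0x20 is typically C6. */
        unsigned int hints[] = { 0x00, 0x01, 0x10, 0x20 };
        unsigned int i;

        for (i = 0; i < sizeof(hints) / sizeof(hints[0]); i++) {
            unsigned int flags = MWAIT2flg(hints[i]);
            unsigned int eax = flg2MWAIT(flags);
            unsigned int cstate_field = (eax >> MWAIT_SUBSTATE_SIZE) &
                                        MWAIT_CSTATE_MASK;
            unsigned int substate = eax & MWAIT_SUBSTATE_MASK;

            printf("hint 0x%02x -> flags 0x%08x, C-state field %u, sub-state %u\n",
                   eax, flags, cstate_field, substate);
        }
        return 0;
    }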
+ +Next, the first (index 0) entry in the list of available idle states is +initialized to represent a "polling idle state" (a pseudo-idle state in which +the target CPU continuously fetches and executes instructions), and the +subsequent (real) idle state entries are populated as follows. + +If the processor model at hand is recognized by ``intel_idle``, there is a +(static) table of idle state descriptions for it in the driver. In that case, +the "internal" table is the primary source of information on idle states and the +information from it is copied to the final list of available idle states. If +using the ACPI tables for the enumeration of idle states is not required +(depending on the processor model), all of the listed idle state are enabled by +default (so all of them will be taken into consideration by ``CPUIdle`` +governors during CPU idle state selection). Otherwise, some of the listed idle +states may not be enabled by default if there are no matching entries in the +preliminary list of idle states coming from the ACPI tables. In that case user +space still can enable them later (on a per-CPU basis) with the help of +the ``disable`` idle state attribute in ``sysfs`` (see +:ref:`idle-states-representation` in :doc:`cpuidle`). This basically means that +the idle states "known" to the driver may not be enabled by default if they have +not been exposed by the platform firmware (through the ACPI tables). + +If the given processor model is not recognized by ``intel_idle``, but it +supports ``MWAIT``, the preliminary list of idle states coming from the ACPI +tables is used for building the final list that will be supplied to the +``CPUIdle`` core during driver registration. For each idle state in that list, +the description, ``MWAIT`` hint and exit latency are copied to the corresponding +entry in the final list of idle states. The name of the idle state represented +by it (to be returned by the ``name`` idle state attribute in ``sysfs``) is +"CX_ACPI", where X is the index of that idle state in the final list (note that +the minimum value of X is 1, because 0 is reserved for the "polling" state), and +its target residency is based on the exit latency value. Specifically, for +C1-type idle states the exit latency value is also used as the target residency +(for compatibility with the majority of the "internal" tables of idle states for +various processor models recognized by ``intel_idle``) and for the other idle +state types (C2 and C3) the target residency value is 3 times the exit latency +(again, that is because it reflects the target residency to exit latency ratio +in the majority of cases for the processor models recognized by ``intel_idle``). +All of the idle states in the final list are enabled by default in this case. + + +.. _intel-idle-initialization: + +Initialization +============== + +The initialization of ``intel_idle`` starts with checking if the kernel command +line options forbid the use of the ``MWAIT`` instruction. If that is the case, +an error code is returned right away. + +The next step is to check whether or not the processor model is known to the +driver, which determines the idle states enumeration method (see +`above `_), and whether or not the processor +supports ``MWAIT`` (the initialization fails if that is not the case). Then, +the ``MWAIT`` support in the processor is enumerated through ``CPUID`` and the +driver initialization fails if the level of support is not as expected (for +example, if the total number of ``MWAIT`` substates returned is 0). 
+ +Next, if the driver is not configured to ignore the ACPI tables (see +`below `_), the idle states information provided by the +platform firmware is extracted from them. + +Then, ``CPUIdle`` device objects are allocated for all CPUs and the list of +available idle states is created as explained +`above `_. + +Finally, ``intel_idle`` is registered with the help of cpuidle_register_driver() +as the ``CPUIdle`` driver for all CPUs in the system and a CPU online callback +for configuring individual CPUs is registered via cpuhp_setup_state(), which +(among other things) causes the callback routine to be invoked for all of the +CPUs present in the system at that time (each CPU executes its own instance of +the callback routine). That routine registers a ``CPUIdle`` device for the CPU +running it (which enables the ``CPUIdle`` subsystem to operate that CPU) and +optionally performs some CPU-specific initialization actions that may be +required for the given processor model. + + +.. _intel-idle-parameters: + +Kernel Command Line Options and Module Parameters +================================================= + +The *x86* architecture support code recognizes three kernel command line +options related to CPU idle time management: ``idle=poll``, ``idle=halt``, +and ``idle=nomwait``. If any of them is present in the kernel command line, the +``MWAIT`` instruction is not allowed to be used, so the initialization of +``intel_idle`` will fail. + +Apart from that there are two module parameters recognized by ``intel_idle`` +itself that can be set via the kernel command line (they cannot be updated via +sysfs, so that is the only way to change their values). + +The ``max_cstate`` parameter value is the maximum idle state index in the list +of idle states supplied to the ``CPUIdle`` core during the registration of the +driver. It is also the maximum number of regular (non-polling) idle states that +can be used by ``intel_idle``, so the enumeration of idle states is terminated +after finding that number of usable idle states (the other idle states that +potentially might have been used if ``max_cstate`` had been greater are not +taken into consideration at all). Setting ``max_cstate`` can prevent +``intel_idle`` from exposing idle states that are regarded as "too deep" for +some reason to the ``CPUIdle`` core, but it does so by making them effectively +invisible until the system is shut down and started again which may not always +be desirable. In practice, it is only really necessary to do that if the idle +states in question cannot be enabled during system startup, because in the +working state of the system the CPU power management quality of service (PM +QoS) feature can be used to prevent ``CPUIdle`` from touching those idle states +even if they have been enumerated (see :ref:`cpu-pm-qos` in :doc:`cpuidle`). +Setting ``max_cstate`` to 0 causes the ``intel_idle`` initialization to fail. + +The ``noacpi`` module parameter (which is recognized by ``intel_idle`` if the +kernel has been configured with ACPI support), can be set to make the driver +ignore the system's ACPI tables entirely (it is unset by default). + + +.. _intel-idle-core-and-package-idle-states: + +Core and Package Levels of Idle States +====================================== + +Typically, in a processor supporting the ``MWAIT`` instruction there are (at +least) two levels of idle states (or C-states). 
One level, referred to as +"core C-states", covers individual cores in the processor, whereas the other +level, referred to as "package C-states", covers the entire processor package +and it may also involve other components of the system (GPUs, memory +controllers, I/O hubs etc.). + +Some of the ``MWAIT`` hint values allow the processor to use core C-states only +(most importantly, that is the case for the ``MWAIT`` hint value corresponding +to the ``C1`` idle state), but the majority of them give it a license to put +the target core (i.e. the core containing the logical CPU executing ``MWAIT`` +with the given hint value) into a specific core C-state and then (if possible) +to enter a specific package C-state at the deeper level. For example, the +``MWAIT`` hint value representing the ``C3`` idle state allows the processor to +put the target core into the low-power state referred to as "core ``C3``" (or +``CC3``), which happens if all of the logical CPUs (SMT siblings) in that core +have executed ``MWAIT`` with the ``C3`` hint value (or with a hint value +representing a deeper idle state), and in addition to that (in the majority of +cases) it gives the processor a license to put the entire package (possibly +including some non-CPU components such as a GPU or a memory controller) into the +low-power state referred to as "package ``C3``" (or ``PC3``), which happens if +all of the cores have gone into the ``CC3`` state and (possibly) some additional +conditions are satisfied (for instance, if the GPU is covered by ``PC3``, it may +be required to be in a certain GPU-specific low-power state for ``PC3`` to be +reachable). + +As a rule, there is no simple way to make the processor use core C-states only +if the conditions for entering the corresponding package C-states are met, so +the logical CPU executing ``MWAIT`` with a hint value that is not core-level +only (like for ``C1``) must always assume that this may cause the processor to +enter a package C-state. [That is why the exit latency and target residency +values corresponding to the majority of ``MWAIT`` hint values in the "internal" +tables of idle states in ``intel_idle`` reflect the properties of package +C-states.] If using package C-states is not desirable at all, either +:ref:`PM QoS ` or the ``max_cstate`` module parameter of +``intel_idle`` described `above `_ must be used to +restrict the range of permissible idle states to the ones with core-level only +``MWAIT`` hint values (like ``C1``). + + +References +========== + +.. [1] *Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 2B*, + https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-2b-manual.html + +.. [2] *Advanced Configuration and Power Interface (ACPI) Specification*, + https://uefi.org/specifications diff --git a/Documentation/admin-guide/pm/working-state.rst b/Documentation/admin-guide/pm/working-state.rst index fc298eb1234b..88f717e59a42 100644 --- a/Documentation/admin-guide/pm/working-state.rst +++ b/Documentation/admin-guide/pm/working-state.rst @@ -8,6 +8,7 @@ Working-State Power Management :maxdepth: 2 cpuidle + intel_idle cpufreq intel_pstate intel_epb -- Gitee From e0683971a5a3aedefe91ff4c22eba3a0350b8f81 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 10 Jan 2020 11:43:23 +0100 Subject: [PATCH 1112/2161] intel_idle: Eliminate __setup_broadcast_timer() commit cbd2c4c25d7e0ba7afe42f2397865b164d1109eb upstream. 
The __setup_broadcast_timer() static function is only called in one place and "true" is passed to it as the argument in there, so effectively it is a wrapper arround tick_broadcast_enable(). To simplify the code, call tick_broadcast_enable() directly instead of __setup_broadcast_timer() and drop the latter. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 1467490adfc3..33e353bdd4b6 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -967,14 +967,6 @@ static bool intel_idle_verify_cstate(unsigned int mwait_hint) return true; } -static void __setup_broadcast_timer(bool on) -{ - if (on) - tick_broadcast_enable(); - else - tick_broadcast_disable(); -} - static void auto_demotion_disable(void) { unsigned long long msr_bits; @@ -1624,7 +1616,7 @@ static int intel_idle_cpu_online(unsigned int cpu) struct cpuidle_device *dev; if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) - __setup_broadcast_timer(true); + tick_broadcast_enable(); /* * Some systems can hotplug a cpu at runtime after -- Gitee From 82a29e8cbf15725cac7020b31c45a9d8853ee694 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 10 Jan 2020 11:44:58 +0100 Subject: [PATCH 1113/2161] intel_idle: Fold intel_idle_probe() into intel_idle_init() commit a6c86e336282c67a0b04f64b3b5574794249e3e1 upstream. There is no particular reason why intel_idle_probe() needs to be a separate function and folding it into intel_idle_init() causes the code to be somewhat easier to follow, so do just that. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 97 +++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 55 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 33e353bdd4b6..690309c78dfb 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1296,58 +1296,6 @@ static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { } static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; } #endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */ -/* - * intel_idle_probe() - */ -static int __init intel_idle_probe(void) -{ - unsigned int eax, ebx, ecx; - const struct x86_cpu_id *id; - - if (max_cstate == 0) { - pr_debug("disabled\n"); - return -EPERM; - } - - id = x86_match_cpu(intel_idle_ids); - if (id) { - if (!boot_cpu_has(X86_FEATURE_MWAIT)) { - pr_debug("Please enable MWAIT in BIOS SETUP\n"); - return -ENODEV; - } - } else { - id = x86_match_cpu(intel_mwait_ids); - if (!id) - return -ENODEV; - } - - if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) - return -ENODEV; - - cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); - - if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || - !(ecx & CPUID5_ECX_INTERRUPT_BREAK) || - !mwait_substates) - return -ENODEV; - - pr_debug("MWAIT substates: 0x%x\n", mwait_substates); - - icpu = (const struct idle_cpu *)id->driver_data; - if (icpu) { - cpuidle_state_table = icpu->state_table; - if (icpu->use_acpi) - intel_idle_acpi_cst_extract(); - } else if (!intel_idle_acpi_cst_extract()) { - return -ENODEV; - } - - pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n", - boot_cpu_data.x86_model); - - return 0; -} - /* * intel_idle_cpuidle_devices_uninit() * Unregisters the cpuidle devices. @@ -1632,15 +1580,54 @@ static int intel_idle_cpu_online(unsigned int cpu) static int __init intel_idle_init(void) { + const struct x86_cpu_id *id; + unsigned int eax, ebx, ecx; int retval; /* Do not load intel_idle at all for now if idle= is passed */ if (boot_option_idle_override != IDLE_NO_OVERRIDE) return -ENODEV; - retval = intel_idle_probe(); - if (retval) - return retval; + if (max_cstate == 0) { + pr_debug("disabled\n"); + return -EPERM; + } + + id = x86_match_cpu(intel_idle_ids); + if (id) { + if (!boot_cpu_has(X86_FEATURE_MWAIT)) { + pr_debug("Please enable MWAIT in BIOS SETUP\n"); + return -ENODEV; + } + } else { + id = x86_match_cpu(intel_mwait_ids); + if (!id) + return -ENODEV; + } + + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return -ENODEV; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); + + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || + !(ecx & CPUID5_ECX_INTERRUPT_BREAK) || + !mwait_substates) + return -ENODEV; + + pr_debug("MWAIT substates: 0x%x\n", mwait_substates); + + icpu = (const struct idle_cpu *)id->driver_data; + if (icpu) { + cpuidle_state_table = icpu->state_table; + if (icpu->use_acpi) + intel_idle_acpi_cst_extract(); + } else if (!intel_idle_acpi_cst_extract()) { + return -ENODEV; + } + + pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n", + boot_cpu_data.x86_model); intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device); if (intel_idle_cpuidle_devices == NULL) -- Gitee From 03a04444eaaa9ffedcabdda113977e9d200c836e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 10 Jan 2020 11:45:49 +0100 Subject: [PATCH 1114/2161] intel_idle: Clean up NULL pointer check in intel_idle_init() commit 533da74a8c8d75af7958f4aa1abc944f061d1d85 upstream. 
Instead of comparing intel_idle_cpuidle_devices with NULL apply the "!" (not) operator to it when checking it against NULL. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 690309c78dfb..fddb72e5f6fa 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1630,7 +1630,7 @@ static int __init intel_idle_init(void) boot_cpu_data.x86_model); intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device); - if (intel_idle_cpuidle_devices == NULL) + if (!intel_idle_cpuidle_devices) return -ENOMEM; intel_idle_cpuidle_driver_init(); -- Gitee From 6121652fb8207a1235458aff6a01b97118e55883 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 10 Jan 2020 11:48:25 +0100 Subject: [PATCH 1115/2161] intel_idle: Rearrange intel_idle_cpuidle_driver_init() commit 3d3a1ae9b4bedc3e8c88ff4baeb0a195cb447fa1 upstream. Notice that intel_idle_state_table_update() only needs to be called if icpu is not NULL, so fold it into intel_idle_init_cstates_icpu(), and pass a pointer to the driver object to intel_idle_cpuidle_driver_init() as an argument instead of referencing it locally in there. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index fddb72e5f6fa..c535b0b55ea5 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1447,16 +1447,12 @@ static void sklh_idle_state_table_update(void) skl_cstates[5].flags |= CPUIDLE_FLAG_UNUSABLE; /* C8-SKL */ skl_cstates[6].flags |= CPUIDLE_FLAG_UNUSABLE; /* C9-SKL */ } -/* - * intel_idle_state_table_update() - * - * Update the default state_table for this CPU-id - */ -static void intel_idle_state_table_update(void) +static void intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) { - switch (boot_cpu_data.x86_model) { + int cstate; + switch (boot_cpu_data.x86_model) { case INTEL_FAM6_IVYBRIDGE_X: ivt_idle_state_table_update(); break; @@ -1468,11 +1464,6 @@ static void intel_idle_state_table_update(void) sklh_idle_state_table_update(); break; } -} - -static void intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) -{ - int cstate; for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { unsigned int mwait_hint; @@ -1515,12 +1506,8 @@ static void intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) * intel_idle_cpuidle_driver_init() * allocate, initialize cpuidle_states */ -static void __init intel_idle_cpuidle_driver_init(void) +static void __init intel_idle_cpuidle_driver_init(struct cpuidle_driver *drv) { - struct cpuidle_driver *drv = &intel_idle_driver; - - intel_idle_state_table_update(); - cpuidle_poll_state_init(drv); drv->state_count = 1; @@ -1633,7 +1620,8 @@ static int __init intel_idle_init(void) if (!intel_idle_cpuidle_devices) return -ENOMEM; - intel_idle_cpuidle_driver_init(); + intel_idle_cpuidle_driver_init(&intel_idle_driver); + retval = cpuidle_register_driver(&intel_idle_driver); if (retval) { struct cpuidle_driver *drv = cpuidle_get_driver(); -- Gitee From 7bad8caa9f40084be72953a1313df9b13deb345a Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Fri, 10 Jan 2020 11:49:58 +0100 Subject: [PATCH 1116/2161] intel_idle: Move and clean up intel_idle_cpuidle_devices_uninit() commit 0755a9bd9963ec3d611129cf2cdbb3e041ccbd6a upstream. Move intel_idle_cpuidle_devices_uninit() closer to its caller, intel_idle_init(), add the __init modifier to its header, drop a redundant local variable from it and fix up its kerneldoc comment. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index c535b0b55ea5..9841046dfd32 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1296,21 +1296,6 @@ static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { } static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; } #endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */ -/* - * intel_idle_cpuidle_devices_uninit() - * Unregisters the cpuidle devices. - */ -static void intel_idle_cpuidle_devices_uninit(void) -{ - int i; - struct cpuidle_device *dev; - - for_each_online_cpu(i) { - dev = per_cpu_ptr(intel_idle_cpuidle_devices, i); - cpuidle_unregister_device(dev); - } -} - /* * ivt_idle_state_table_update(void) * @@ -1565,6 +1550,17 @@ static int intel_idle_cpu_online(unsigned int cpu) return 0; } +/** + * intel_idle_cpuidle_devices_uninit - Unregister all cpuidle devices. + */ +static void __init intel_idle_cpuidle_devices_uninit(void) +{ + int i; + + for_each_online_cpu(i) + cpuidle_unregister_device(per_cpu_ptr(intel_idle_cpuidle_devices, i)); +} + static int __init intel_idle_init(void) { const struct x86_cpu_id *id; -- Gitee From ad4894cf9dbbd5ddb89976d8fa12d664c0364038 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 10 Jan 2020 11:51:22 +0100 Subject: [PATCH 1117/2161] intel_idle: Annotate initialization code and data structures commit 095928ae484b9f765fb4967c6cf76a2f86fc9787 upstream. Annotate the functions that are only used at the initialization time with __init and the data structures used by them with __initdata or __initconst. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 9841046dfd32..b796eaea8715 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1148,7 +1148,7 @@ static const struct x86_cpu_id intel_mwait_ids[] __initconst = { {} }; -static bool intel_idle_max_cstate_reached(int cstate) +static bool __init intel_idle_max_cstate_reached(int cstate) { if (cstate + 1 > max_cstate) { pr_info("max_cstate %d reached\n", max_cstate); @@ -1164,7 +1164,7 @@ static bool no_acpi __read_mostly; module_param(no_acpi, bool, 0444); MODULE_PARM_DESC(no_acpi, "Do not use ACPI _CST for building the idle states list"); -static struct acpi_processor_power acpi_state_table; +static struct acpi_processor_power acpi_state_table __initdata; /** * intel_idle_cst_usable - Check if the _CST information can be used. @@ -1172,7 +1172,7 @@ static struct acpi_processor_power acpi_state_table; * Check if all of the C-states listed by _CST in the max_cstate range are * ACPI_CSTATE_FFH, which means that they should be entered via MWAIT. 
*/ -static bool intel_idle_cst_usable(void) +static bool __init intel_idle_cst_usable(void) { int cstate, limit; @@ -1189,7 +1189,7 @@ static bool intel_idle_cst_usable(void) return true; } -static bool intel_idle_acpi_cst_extract(void) +static bool __init intel_idle_acpi_cst_extract(void) { unsigned int cpu; @@ -1224,7 +1224,7 @@ static bool intel_idle_acpi_cst_extract(void) return false; } -static void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) +static void __init intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { int cstate, limit = min_t(int, CPUIDLE_STATE_MAX, acpi_state_table.count); @@ -1268,7 +1268,7 @@ static void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) } } -static bool intel_idle_off_by_default(u32 mwait_hint) +static bool __init intel_idle_off_by_default(u32 mwait_hint) { int cstate, limit; @@ -1302,7 +1302,7 @@ static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; } * Tune IVT multi-socket targets * Assumption: num_sockets == (max_package_num + 1) */ -static void ivt_idle_state_table_update(void) +static void __init ivt_idle_state_table_update(void) { /* IVT uses a different table for 1-2, 3-4, and > 4 sockets */ int cpu, package_num, num_sockets = 1; @@ -1329,10 +1329,11 @@ static void ivt_idle_state_table_update(void) * Translate IRTL (Interrupt Response Time Limit) MSR to usec */ -static unsigned int irtl_ns_units[] = { - 1, 32, 1024, 32768, 1048576, 33554432, 0, 0 }; +static const unsigned int irtl_ns_units[] __initconst = { + 1, 32, 1024, 32768, 1048576, 33554432, 0, 0 +}; -static unsigned long long irtl_2_usec(unsigned long long irtl) +static unsigned long long __init irtl_2_usec(unsigned long long irtl) { unsigned long long ns; @@ -1349,7 +1350,7 @@ static unsigned long long irtl_2_usec(unsigned long long irtl) * On BXT, we trust the IRTL to show the definitive maximum latency * We use the same value for target_residency. */ -static void bxt_idle_state_table_update(void) +static void __init bxt_idle_state_table_update(void) { unsigned long long msr; unsigned int usec; @@ -1396,7 +1397,7 @@ static void bxt_idle_state_table_update(void) * On SKL-H (model 0x5e) disable C8 and C9 if: * C10 is enabled and SGX disabled */ -static void sklh_idle_state_table_update(void) +static void __init sklh_idle_state_table_update(void) { unsigned long long msr; unsigned int eax, ebx, ecx, edx; @@ -1433,7 +1434,7 @@ static void sklh_idle_state_table_update(void) skl_cstates[6].flags |= CPUIDLE_FLAG_UNUSABLE; /* C9-SKL */ } -static void intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) +static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) { int cstate; -- Gitee From a87fd773ae7454974514d1860258e0459f3b7bd7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 10 Jan 2020 11:52:32 +0100 Subject: [PATCH 1118/2161] intel_idle: Move 3 functions closer to their callers commit 1aefbd7aeb7695c1a9e6ece9a1612ee88e1ea3f8 upstream. Move intel_idle_verify_cstate(), auto_demotion_disable() and c1e_promotion_disable() closer to their callers. While at it, annotate intel_idle_verify_cstate() with __init, as it is only used during the initialization of the driver. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 67 ++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index b796eaea8715..27c50fba26d5 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -951,39 +951,6 @@ static void intel_idle_s2idle(struct cpuidle_device *dev, mwait_idle_with_hints(eax, ecx); } -static bool intel_idle_verify_cstate(unsigned int mwait_hint) -{ - unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1; - unsigned int num_substates = (mwait_substates >> mwait_cstate * 4) & - MWAIT_SUBSTATE_MASK; - - /* Ignore the C-state if there are NO sub-states in CPUID for it. */ - if (num_substates == 0) - return false; - - if (mwait_cstate > 2 && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) - mark_tsc_unstable("TSC halts in idle states deeper than C2"); - - return true; -} - -static void auto_demotion_disable(void) -{ - unsigned long long msr_bits; - - rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits); - msr_bits &= ~(icpu->auto_demotion_disable_flags); - wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits); -} -static void c1e_promotion_disable(void) -{ - unsigned long long msr_bits; - - rdmsrl(MSR_IA32_POWER_CTL, msr_bits); - msr_bits &= ~0x2; - wrmsrl(MSR_IA32_POWER_CTL, msr_bits); -} - static const struct idle_cpu idle_cpu_nehalem = { .state_table = nehalem_cstates, .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, @@ -1434,6 +1401,22 @@ static void __init sklh_idle_state_table_update(void) skl_cstates[6].flags |= CPUIDLE_FLAG_UNUSABLE; /* C9-SKL */ } +static bool __init intel_idle_verify_cstate(unsigned int mwait_hint) +{ + unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1; + unsigned int num_substates = (mwait_substates >> mwait_cstate * 4) & + MWAIT_SUBSTATE_MASK; + + /* Ignore the C-state if there are NO sub-states in CPUID for it. */ + if (num_substates == 0) + return false; + + if (mwait_cstate > 2 && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + mark_tsc_unstable("TSC halts in idle states deeper than C2"); + + return true; +} + static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) { int cstate; @@ -1503,6 +1486,24 @@ static void __init intel_idle_cpuidle_driver_init(struct cpuidle_driver *drv) intel_idle_init_cstates_acpi(drv); } +static void auto_demotion_disable(void) +{ + unsigned long long msr_bits; + + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits); + msr_bits &= ~(icpu->auto_demotion_disable_flags); + wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits); +} + +static void c1e_promotion_disable(void) +{ + unsigned long long msr_bits; + + rdmsrl(MSR_IA32_POWER_CTL, msr_bits); + msr_bits &= ~0x2; + wrmsrl(MSR_IA32_POWER_CTL, msr_bits); +} + /* * intel_idle_cpu_init() * allocate, initialize, register cpuidle_devices -- Gitee From 8fb3eb6ca2f099f99d5f3b0b19e27be968838aea Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 17 Jan 2020 11:46:24 +0100 Subject: [PATCH 1119/2161] intel_idle: Clean up irtl_2_usec() commit 86e9466ae620824dee861f0f6ed933fcd0b449f3 upstream. Move the irtl_ns_units[] definition into irtl_2_usec() which is the only user of it, use div_u64() for the division in there (as the divisor is small enough) and use the NSEC_PER_USEC symbol for the divisor. Also convert the irtl_2_usec() comment to a proper kerneldo one. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 27c50fba26d5..9f38ff02a7b8 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1292,16 +1292,17 @@ static void __init ivt_idle_state_table_update(void) /* else, 1 and 2 socket systems use default ivt_cstates */ } -/* - * Translate IRTL (Interrupt Response Time Limit) MSR to usec +/** + * irtl_2_usec - IRTL to microseconds conversion. + * @irtl: IRTL MSR value. + * + * Translate the IRTL (Interrupt Response Time Limit) MSR value to microseconds. */ - -static const unsigned int irtl_ns_units[] __initconst = { - 1, 32, 1024, 32768, 1048576, 33554432, 0, 0 -}; - static unsigned long long __init irtl_2_usec(unsigned long long irtl) { + static const unsigned int irtl_ns_units[] __initconst = { + 1, 32, 1024, 32768, 1048576, 33554432, 0, 0 + }; unsigned long long ns; if (!irtl) @@ -1309,8 +1310,9 @@ static unsigned long long __init irtl_2_usec(unsigned long long irtl) ns = irtl_ns_units[(irtl >> 10) & 0x7]; - return div64_u64((irtl & 0x3FF) * ns, 1000); + return div_u64((irtl & 0x3FF) * ns, NSEC_PER_USEC); } + /* * bxt_idle_state_table_update(void) * -- Gitee From 399f0e4b90986f21ee180475f8266fb874e53705 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 3 Feb 2020 11:57:08 +0100 Subject: [PATCH 1120/2161] intel_idle: Introduce 'use_acpi' module parameter commit 3a5be9b8f43346a24f31c0017cb2566a6b2c72c5 upstream. For diagnostics, it is generally useful to be able to make intel_idle take the system's ACPI tables into consideration even if that is not required for the processor model in there, so introduce a new module parameter, 'use_acpi', to make that happen and update the documentation to cover it. While at it, fix the 'no_acpi' module parameter name in the documentation. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/admin-guide/pm/intel_idle.rst | 13 +++++++++---- drivers/idle/intel_idle.c | 11 +++++++++-- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/pm/intel_idle.rst b/Documentation/admin-guide/pm/intel_idle.rst index afbf778035f8..8998598746a4 100644 --- a/Documentation/admin-guide/pm/intel_idle.rst +++ b/Documentation/admin-guide/pm/intel_idle.rst @@ -60,6 +60,9 @@ of the system. The former are always used if the processor model at hand is recognized by ``intel_idle`` and the latter are used if that is required for the given processor model (which is the case for all server processor models recognized by ``intel_idle``) or if the processor model is not recognized. +[There is a module parameter that can be used to make the driver use the ACPI +tables with any processor model recognized by it; see +`below `_.] If the ACPI tables are going to be used for building the list of available idle states, ``intel_idle`` first looks for a ``_CST`` object under one of the ACPI @@ -165,7 +168,7 @@ and ``idle=nomwait``. If any of them is present in the kernel command line, the ``MWAIT`` instruction is not allowed to be used, so the initialization of ``intel_idle`` will fail. 
-Apart from that there are two module parameters recognized by ``intel_idle`` +Apart from that there are three module parameters recognized by ``intel_idle`` itself that can be set via the kernel command line (they cannot be updated via sysfs, so that is the only way to change their values). @@ -186,9 +189,11 @@ QoS) feature can be used to prevent ``CPUIdle`` from touching those idle states even if they have been enumerated (see :ref:`cpu-pm-qos` in :doc:`cpuidle`). Setting ``max_cstate`` to 0 causes the ``intel_idle`` initialization to fail. -The ``noacpi`` module parameter (which is recognized by ``intel_idle`` if the -kernel has been configured with ACPI support), can be set to make the driver -ignore the system's ACPI tables entirely (it is unset by default). +The ``no_acpi`` and ``use_acpi`` module parameters (recognized by ``intel_idle`` +if the kernel has been configured with ACPI support) can be set to make the +driver ignore the system's ACPI tables entirely or use them for all of the +recognized processor models, respectively (they both are unset by default and +``use_acpi`` has no effect if ``no_acpi`` is set). .. _intel-idle-core-and-package-idle-states: diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 9f38ff02a7b8..01cae77864f2 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1131,6 +1131,10 @@ static bool no_acpi __read_mostly; module_param(no_acpi, bool, 0444); MODULE_PARM_DESC(no_acpi, "Do not use ACPI _CST for building the idle states list"); +static bool force_use_acpi __read_mostly; /* No effect if no_acpi is set. */ +module_param_named(use_acpi, force_use_acpi, bool, 0444); +MODULE_PARM_DESC(use_acpi, "Use ACPI _CST for building the idle states list"); + static struct acpi_processor_power acpi_state_table __initdata; /** @@ -1258,6 +1262,8 @@ static bool __init intel_idle_off_by_default(u32 mwait_hint) return true; } #else /* !CONFIG_ACPI_PROCESSOR_CSTATE */ +#define force_use_acpi (false) + static inline bool intel_idle_acpi_cst_extract(void) { return false; } static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { } static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; } @@ -1460,7 +1466,8 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) /* Structure copy. */ drv->states[drv->state_count] = cpuidle_state_table[cstate]; - if (icpu->use_acpi && intel_idle_off_by_default(mwait_hint) && + if ((icpu->use_acpi || force_use_acpi) && + intel_idle_off_by_default(mwait_hint) && !(cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_ALWAYS_ENABLE)) drv->states[drv->state_count].flags |= CPUIDLE_FLAG_OFF; @@ -1607,7 +1614,7 @@ static int __init intel_idle_init(void) icpu = (const struct idle_cpu *)id->driver_data; if (icpu) { cpuidle_state_table = icpu->state_table; - if (icpu->use_acpi) + if (icpu->use_acpi || force_use_acpi) intel_idle_acpi_cst_extract(); } else if (!intel_idle_acpi_cst_extract()) { return -ENODEV; -- Gitee From eec0fd323bcb618e52d7089bb3d92142b7826051 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 3 Feb 2020 11:57:18 +0100 Subject: [PATCH 1121/2161] intel_idle: Introduce 'states_off' module parameter commit 4dcb78ee579cdf90e30c5a0223f6f160ea37182d upstream. In certain system configurations it may not be desirable to use some C-states assumed to be available by intel_idle and the driver needs to be prevented from using them even before the cpuidle sysfs interface becomes accessible to user space. 
Currently, the only way to achieve that is by setting the 'max_cstate' module parameter to a value lower than the index of the shallowest of the C-states in question, but that may be overly intrusive, because it effectively makes all of the idle states deeper than the 'max_cstate' one go away (and the C-state to avoid may be in the middle of the range normally regarded as available). To allow that limitation to be overcome, introduce a new module parameter called 'states_off' to represent a list of idle states to be disabled by default in the form of a bitmask and update the documentation to cover it. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/admin-guide/pm/intel_idle.rst | 19 ++++++++++++++++- drivers/idle/intel_idle.c | 23 ++++++++++++++++++--- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/pm/intel_idle.rst b/Documentation/admin-guide/pm/intel_idle.rst index 8998598746a4..89309e1b0e48 100644 --- a/Documentation/admin-guide/pm/intel_idle.rst +++ b/Documentation/admin-guide/pm/intel_idle.rst @@ -168,7 +168,7 @@ and ``idle=nomwait``. If any of them is present in the kernel command line, the ``MWAIT`` instruction is not allowed to be used, so the initialization of ``intel_idle`` will fail. -Apart from that there are three module parameters recognized by ``intel_idle`` +Apart from that there are four module parameters recognized by ``intel_idle`` itself that can be set via the kernel command line (they cannot be updated via sysfs, so that is the only way to change their values). @@ -195,6 +195,23 @@ driver ignore the system's ACPI tables entirely or use them for all of the recognized processor models, respectively (they both are unset by default and ``use_acpi`` has no effect if ``no_acpi`` is set). +The value of the ``states_off`` module parameter (0 by default) represents a +list of idle states to be disabled by default in the form of a bitmask. + +Namely, the positions of the bits that are set in the ``states_off`` value are +the indices of idle states to be disabled by default (as reflected by the names +of the corresponding idle state directories in ``sysfs``, :file:`state0`, +:file:`state1` ... :file:`state` ..., where ```` is the index of the given +idle state; see :ref:`idle-states-representation` in :doc:`cpuidle`). + +For example, if ``states_off`` is equal to 3, the driver will disable idle +states 0 and 1 by default, and if it is equal to 8, idle state 3 will be +disabled by default and so on (bit positions beyond the maximum idle state index +are ignored). + +The idle states disabled this way can be enabled (on a per-CPU basis) from user +space via ``sysfs``. + .. 
_intel-idle-core-and-package-idle-states: diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 01cae77864f2..9556e4c843e1 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -63,6 +63,7 @@ static struct cpuidle_driver intel_idle_driver = { }; /* intel_idle.max_cstate=0 disables driver */ static int max_cstate = CPUIDLE_STATE_MAX - 1; +static unsigned int disabled_states_mask; static unsigned int mwait_substates; @@ -1234,6 +1235,9 @@ static void __init intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) if (cx->type > ACPI_STATE_C2) state->flags |= CPUIDLE_FLAG_TLB_FLUSHED; + if (disabled_states_mask & BIT(cstate)) + state->flags |= CPUIDLE_FLAG_OFF; + state->enter = intel_idle; state->enter_s2idle = intel_idle_s2idle; } @@ -1466,9 +1470,10 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) /* Structure copy. */ drv->states[drv->state_count] = cpuidle_state_table[cstate]; - if ((icpu->use_acpi || force_use_acpi) && - intel_idle_off_by_default(mwait_hint) && - !(cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_ALWAYS_ENABLE)) + if ((disabled_states_mask & BIT(drv->state_count)) || + ((icpu->use_acpi || force_use_acpi) && + intel_idle_off_by_default(mwait_hint) && + !(cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_ALWAYS_ENABLE))) drv->states[drv->state_count].flags |= CPUIDLE_FLAG_OFF; drv->state_count++; @@ -1487,6 +1492,10 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) static void __init intel_idle_cpuidle_driver_init(struct cpuidle_driver *drv) { cpuidle_poll_state_init(drv); + + if (disabled_states_mask & BIT(0)) + drv->states[0].flags |= CPUIDLE_FLAG_OFF; + drv->state_count = 1; if (icpu) @@ -1667,3 +1676,11 @@ device_initcall(intel_idle_init); * is the easiest way (currently) to continue doing that. */ module_param(max_cstate, int, 0444); +/* + * The positions of the bits that are set in this number are the indices of the + * idle states to be disabled by default (as reflected by the names of the + * corresponding idle state directories in sysfs, "state0", "state1" ... + * "state" ..., where is the index of the given state). + */ +module_param_named(states_off, disabled_states_mask, uint, 0444); +MODULE_PARM_DESC(states_off, "Mask of disabled idle states"); -- Gitee From 9a0fc6a2f6817b10102d0b2846f217cac89051ff Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 5 Feb 2020 02:08:31 +0100 Subject: [PATCH 1122/2161] cpuidle: Documentation: Clean up PM QoS description commit f06572ef476d368a239f0238ecf7b00b9cdbf5bf upstream. Clean up the language in one paragraph in the PM QoS description in Documentation/admin-guide/pm/cpuidle.rst. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/admin-guide/pm/cpuidle.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst index 311cd7cc2b75..6a06dc473dd6 100644 --- a/Documentation/admin-guide/pm/cpuidle.rst +++ b/Documentation/admin-guide/pm/cpuidle.rst @@ -632,16 +632,16 @@ class priority list and destroyed. If that happens, the priority list mechanism will be used, again, to determine the new effective value for the whole list and that value will become the new real constraint. 
-In turn, for each CPU there is only one resume latency PM QoS request -associated with the :file:`power/pm_qos_resume_latency_us` file under +In turn, for each CPU there is one resume latency PM QoS request associated with +the :file:`power/pm_qos_resume_latency_us` file under :file:`/sys/devices/system/cpu/cpu/` in ``sysfs`` and writing to it causes this single PM QoS request to be updated regardless of which user space process does that. In other words, this PM QoS request is shared by the entire user space, so access to the file associated with it needs to be arbitrated to avoid confusion. [Arguably, the only legitimate use of this mechanism in practice is to pin a process to the CPU in question and let it use the -``sysfs`` interface to control the resume latency constraint for it.] It -still only is a request, however. It is a member of a priority list used to +``sysfs`` interface to control the resume latency constraint for it.] It is +still only a request, however. It is an entry in a priority list used to determine the effective value to be set as the resume latency constraint for the CPU in question every time the list of requests is updated this way or another (there may be other requests coming from kernel code in that list). -- Gitee From 80b7fe88b5e31a8459ae06c3f96c609d76341287 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Feb 2020 18:40:54 +0100 Subject: [PATCH 1123/2161] intel_idle: Simplify LAPIC timer reliability checks commit 40ab82e08d789d8f4437fec63142b40120c1c50f upstream. The lapic_timer_always_reliable variable really takes only two values and some arithmetic in intel_idle() related to comparing it with the target C-state's MWAIT hint value is unnecessary. Simplify the code by replacing lapic_timer_always_reliable with a bool variable lapic_timer_always_reliable and dropping the LAPIC_TIMER_ALWAYS_RELIABLE symbol along with the excess computations in intel_idle(). While at it, add a comment explaining the branch taken in intel_idle() if the LAPIC timer is only reliable in C1 and modify the related debug message in intel_idle_init() accordingly (the modification of this message in the only expected functional impact of the change made here). Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 9556e4c843e1..37ad03f20226 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -66,10 +66,7 @@ static int max_cstate = CPUIDLE_STATE_MAX - 1; static unsigned int disabled_states_mask; static unsigned int mwait_substates; - -#define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF -/* Reliable LAPIC Timer States, bit 1 for C1 etc. 
*/ -static unsigned int lapic_timer_reliable_states = (1 << 1); /* Default to only C1 */ +static bool lapic_timer_always_reliable; struct idle_cpu { struct cpuidle_state *state_table; @@ -908,7 +905,6 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, unsigned long ecx = 1; /* break on interrupt flag */ struct cpuidle_state *state = &drv->states[index]; unsigned long eax = flg2MWAIT(state->flags); - unsigned int cstate; bool uninitialized_var(tick); int cpu = smp_processor_id(); @@ -919,13 +915,16 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED) leave_mm(cpu); - if (!static_cpu_has(X86_FEATURE_ARAT)) { - cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & - MWAIT_CSTATE_MASK) + 1; - tick = false; - if (!(lapic_timer_reliable_states & (1 << (cstate)))) { + if (!static_cpu_has(X86_FEATURE_ARAT) && !lapic_timer_always_reliable) { + /* + * Switch over to one-shot tick broadcast if the target C-state + * is deeper than C1. + */ + if ((eax >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) { tick = true; tick_broadcast_enter(); + } else { + tick = false; } } @@ -1555,7 +1554,7 @@ static int intel_idle_cpu_online(unsigned int cpu) { struct cpuidle_device *dev; - if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) + if (!lapic_timer_always_reliable) tick_broadcast_enable(); /* @@ -1647,15 +1646,15 @@ static int __init intel_idle_init(void) } if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ - lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; + lapic_timer_always_reliable = true; retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online", intel_idle_cpu_online, NULL); if (retval < 0) goto hp_setup_fail; - pr_debug("lapic_timer_reliable_states 0x%x\n", - lapic_timer_reliable_states); + pr_debug("Local APIC timer is reliable in %s\n", + lapic_timer_always_reliable ? "all C-states" : "C1"); return 0; -- Gitee From f5229731efd9ab536c28e083849acfc8ad89a4e8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Feb 2020 18:41:06 +0100 Subject: [PATCH 1124/2161] intel_idle: Clean up definitions of cpuidle callbacks commit bc721c1e4517a5110c4b0ae2e80345d3db6425fc upstream. Add proper kerneldoc descriptions to intel_idle() and intel_idle_s2idle(), annotate the latter with __cpuidle and reorder the declarations of local variables in both of them to reflect the mwait_idle_with_hints() arguments order. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 37ad03f20226..8cff52dad76e 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -892,19 +892,28 @@ static struct cpuidle_state dnv_cstates[] = { }; /** - * intel_idle - * @dev: cpuidle_device - * @drv: cpuidle driver - * @index: index of cpuidle state + * intel_idle - Ask the processor to enter the given idle state. + * @dev: cpuidle device of the target CPU. + * @drv: cpuidle driver (assumed to point to intel_idle_driver). + * @index: Target idle state index. + * + * Use the MWAIT instruction to notify the processor that the CPU represented by + * @dev is idle and it can try to enter the idle state corresponding to @index. 
+ * + * If the local APIC timer is not known to be reliable in the target idle state, + * enable one-shot tick broadcasting for the target CPU before executing MWAIT. + * + * Optionally call leave_mm() for the target CPU upfront to avoid wakeups due to + * flushing user TLBs. * * Must be called under local_irq_disable(). */ static __cpuidle int intel_idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - unsigned long ecx = 1; /* break on interrupt flag */ struct cpuidle_state *state = &drv->states[index]; unsigned long eax = flg2MWAIT(state->flags); + unsigned long ecx = 1; /* break on interrupt flag */ bool uninitialized_var(tick); int cpu = smp_processor_id(); @@ -937,16 +946,22 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, } /** - * intel_idle_s2idle - simplified "enter" callback routine for suspend-to-idle - * @dev: cpuidle_device - * @drv: cpuidle driver - * @index: state index + * intel_idle_s2idle - Ask the processor to enter the given idle state. + * @dev: cpuidle device of the target CPU. + * @drv: cpuidle driver (assumed to point to intel_idle_driver). + * @index: Target idle state index. + * + * Use the MWAIT instruction to notify the processor that the CPU represented by + * @dev is idle and it can try to enter the idle state corresponding to @index. + * + * Invoked as a suspend-to-idle callback routine with frozen user space, frozen + * scheduler tick and suspended scheduler clock on the target CPU. */ -static void intel_idle_s2idle(struct cpuidle_device *dev, - struct cpuidle_driver *drv, int index) +static __cpuidle void intel_idle_s2idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) { - unsigned long ecx = 1; /* break on interrupt flag */ unsigned long eax = flg2MWAIT(drv->states[index].flags); + unsigned long ecx = 1; /* break on interrupt flag */ mwait_idle_with_hints(eax, ecx); } -- Gitee From 22dfb4bf46206f7f85cb27a13bdd646d8e9ea854 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Feb 2020 18:41:15 +0100 Subject: [PATCH 1125/2161] intel_idle: Relocate definitions of cpuidle callbacks commit 30a996fbb359ed53536a055af84a54223beabf91 upstream. Move the definitions of intel_idle() and intel_idle_s2idle() before the definitions of cpuidle_state structures referring to them to avoid having to use additional declarations of them (and drop those declarations). No functional impact. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 154 +++++++++++++++++++------------------- 1 file changed, 75 insertions(+), 79 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 8cff52dad76e..4d18f708a7d4 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -83,10 +83,6 @@ struct idle_cpu { static const struct idle_cpu *icpu; static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; -static int intel_idle(struct cpuidle_device *dev, - struct cpuidle_driver *drv, int index); -static void intel_idle_s2idle(struct cpuidle_device *dev, - struct cpuidle_driver *drv, int index); static struct cpuidle_state *cpuidle_state_table; /* @@ -112,6 +108,81 @@ static struct cpuidle_state *cpuidle_state_table; #define flg2MWAIT(flags) (((flags) >> 24) & 0xFF) #define MWAIT2flg(eax) ((eax & 0xFF) << 24) +/** + * intel_idle - Ask the processor to enter the given idle state. + * @dev: cpuidle device of the target CPU. + * @drv: cpuidle driver (assumed to point to intel_idle_driver). 
+ * @index: Target idle state index. + * + * Use the MWAIT instruction to notify the processor that the CPU represented by + * @dev is idle and it can try to enter the idle state corresponding to @index. + * + * If the local APIC timer is not known to be reliable in the target idle state, + * enable one-shot tick broadcasting for the target CPU before executing MWAIT. + * + * Optionally call leave_mm() for the target CPU upfront to avoid wakeups due to + * flushing user TLBs. + * + * Must be called under local_irq_disable(). + */ +static __cpuidle int intel_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + struct cpuidle_state *state = &drv->states[index]; + unsigned long eax = flg2MWAIT(state->flags); + unsigned long ecx = 1; /* break on interrupt flag */ + bool uninitialized_var(tick); + int cpu = smp_processor_id(); + + /* + * leave_mm() to avoid costly and often unnecessary wakeups + * for flushing the user TLB's associated with the active mm. + */ + if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED) + leave_mm(cpu); + + if (!static_cpu_has(X86_FEATURE_ARAT) && !lapic_timer_always_reliable) { + /* + * Switch over to one-shot tick broadcast if the target C-state + * is deeper than C1. + */ + if ((eax >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) { + tick = true; + tick_broadcast_enter(); + } else { + tick = false; + } + } + + mwait_idle_with_hints(eax, ecx); + + if (!static_cpu_has(X86_FEATURE_ARAT) && tick) + tick_broadcast_exit(); + + return index; +} + +/** + * intel_idle_s2idle - Ask the processor to enter the given idle state. + * @dev: cpuidle device of the target CPU. + * @drv: cpuidle driver (assumed to point to intel_idle_driver). + * @index: Target idle state index. + * + * Use the MWAIT instruction to notify the processor that the CPU represented by + * @dev is idle and it can try to enter the idle state corresponding to @index. + * + * Invoked as a suspend-to-idle callback routine with frozen user space, frozen + * scheduler tick and suspended scheduler clock on the target CPU. + */ +static __cpuidle void intel_idle_s2idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + unsigned long eax = flg2MWAIT(drv->states[index].flags); + unsigned long ecx = 1; /* break on interrupt flag */ + + mwait_idle_with_hints(eax, ecx); +} + /* * States are indexed by the cstate number, * which is also the index into the MWAIT hint array. @@ -891,81 +962,6 @@ static struct cpuidle_state dnv_cstates[] = { .enter = NULL } }; -/** - * intel_idle - Ask the processor to enter the given idle state. - * @dev: cpuidle device of the target CPU. - * @drv: cpuidle driver (assumed to point to intel_idle_driver). - * @index: Target idle state index. - * - * Use the MWAIT instruction to notify the processor that the CPU represented by - * @dev is idle and it can try to enter the idle state corresponding to @index. - * - * If the local APIC timer is not known to be reliable in the target idle state, - * enable one-shot tick broadcasting for the target CPU before executing MWAIT. - * - * Optionally call leave_mm() for the target CPU upfront to avoid wakeups due to - * flushing user TLBs. - * - * Must be called under local_irq_disable(). 
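The flg2MWAIT()/MWAIT2flg() macros relocated above pack the MWAIT hint into the top byte of the cpuidle state flags, leaving the lower 24 bits free for CPUIDLE_FLAG_* values. The out-of-tree check below is illustrative only: the macro bodies are copied from the driver, and CPUIDLE_FLAG_TLB_FLUSHED is written as 0x10000 (BIT(16)), which is its value at this point in the series.

/*
 * Round-trip check for the hint-in-flags encoding, outside the kernel.
 */
#include <assert.h>
#include <stdio.h>

#define flg2MWAIT(flags)	(((flags) >> 24) & 0xFF)
#define MWAIT2flg(eax)		((eax & 0xFF) << 24)
#define CPUIDLE_FLAG_TLB_FLUSHED	0x10000		/* BIT(16) */

int main(void)
{
	unsigned int flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED;

	/* The hint survives unchanged in bits 31:24 ... */
	assert(flg2MWAIT(flags) == 0x20);
	/* ... and the flag bit is untouched, since it lives below bit 24. */
	assert(flags & CPUIDLE_FLAG_TLB_FLUSHED);

	printf("flags=%#x hint=%#x\n", flags, flg2MWAIT(flags));
	return 0;
}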
- */ -static __cpuidle int intel_idle(struct cpuidle_device *dev, - struct cpuidle_driver *drv, int index) -{ - struct cpuidle_state *state = &drv->states[index]; - unsigned long eax = flg2MWAIT(state->flags); - unsigned long ecx = 1; /* break on interrupt flag */ - bool uninitialized_var(tick); - int cpu = smp_processor_id(); - - /* - * leave_mm() to avoid costly and often unnecessary wakeups - * for flushing the user TLB's associated with the active mm. - */ - if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED) - leave_mm(cpu); - - if (!static_cpu_has(X86_FEATURE_ARAT) && !lapic_timer_always_reliable) { - /* - * Switch over to one-shot tick broadcast if the target C-state - * is deeper than C1. - */ - if ((eax >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) { - tick = true; - tick_broadcast_enter(); - } else { - tick = false; - } - } - - mwait_idle_with_hints(eax, ecx); - - if (!static_cpu_has(X86_FEATURE_ARAT) && tick) - tick_broadcast_exit(); - - return index; -} - -/** - * intel_idle_s2idle - Ask the processor to enter the given idle state. - * @dev: cpuidle device of the target CPU. - * @drv: cpuidle driver (assumed to point to intel_idle_driver). - * @index: Target idle state index. - * - * Use the MWAIT instruction to notify the processor that the CPU represented by - * @dev is idle and it can try to enter the idle state corresponding to @index. - * - * Invoked as a suspend-to-idle callback routine with frozen user space, frozen - * scheduler tick and suspended scheduler clock on the target CPU. - */ -static __cpuidle void intel_idle_s2idle(struct cpuidle_device *dev, - struct cpuidle_driver *drv, int index) -{ - unsigned long eax = flg2MWAIT(drv->states[index].flags); - unsigned long ecx = 1; /* break on interrupt flag */ - - mwait_idle_with_hints(eax, ecx); -} - static const struct idle_cpu idle_cpu_nehalem = { .state_table = nehalem_cstates, .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, -- Gitee From 97ae48c7d0a8ae2c934ae378099cb5b094891f32 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Feb 2020 18:41:24 +0100 Subject: [PATCH 1126/2161] intel_idle: Add __initdata annotations to init time variables commit 7f843dd71258ca0e01674be1008eca98917f3f9f upstream. Annotate static variables cpuidle_state_table and mwait_substates with __initdata, because they are only used during the initialization of the driver. Also notice that static variable icpu could be annotated analogously and the structure pointed to by it could be __initconst, but two of its fields are accessed via icpu in intel_idle_cpu_init() and auto_demotion_disable(), so introduce two new static variables, auto_demotion_disable_flags and disable_promotion_to_c1e, to hold the values of these fields, set them during the initialization and use them in those functions instead of accessing the source data structure via icpu. That allows icpu to be annotated with __initdata, so do that, and it will also allow some __initconst annotations to be added subsequently. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 4d18f708a7d4..78e519cddfd8 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -65,7 +65,11 @@ static struct cpuidle_driver intel_idle_driver = { static int max_cstate = CPUIDLE_STATE_MAX - 1; static unsigned int disabled_states_mask; -static unsigned int mwait_substates; +static unsigned int mwait_substates __initdata; + +static unsigned long auto_demotion_disable_flags; +static bool disable_promotion_to_c1e; + static bool lapic_timer_always_reliable; struct idle_cpu { @@ -81,9 +85,9 @@ struct idle_cpu { bool use_acpi; }; -static const struct idle_cpu *icpu; +static const struct idle_cpu *icpu __initdata; static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; -static struct cpuidle_state *cpuidle_state_table; +static struct cpuidle_state *cpuidle_state_table __initdata; /* * Enable this state by default even if the ACPI _CST does not list it. @@ -1519,7 +1523,7 @@ static void auto_demotion_disable(void) unsigned long long msr_bits; rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits); - msr_bits &= ~(icpu->auto_demotion_disable_flags); + msr_bits &= ~auto_demotion_disable_flags; wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits); } @@ -1549,13 +1553,10 @@ static int intel_idle_cpu_init(unsigned int cpu) return -EIO; } - if (!icpu) - return 0; - - if (icpu->auto_demotion_disable_flags) + if (auto_demotion_disable_flags) auto_demotion_disable(); - if (icpu->disable_promotion_to_c1e) + if (disable_promotion_to_c1e) c1e_promotion_disable(); return 0; @@ -1633,6 +1634,8 @@ static int __init intel_idle_init(void) icpu = (const struct idle_cpu *)id->driver_data; if (icpu) { cpuidle_state_table = icpu->state_table; + auto_demotion_disable_flags = icpu->auto_demotion_disable_flags; + disable_promotion_to_c1e = icpu->disable_promotion_to_c1e; if (icpu->use_acpi || force_use_acpi) intel_idle_acpi_cst_extract(); } else if (!intel_idle_acpi_cst_extract()) { -- Gitee From 4e5a22a4ca322d79bd83425dad9a437310a65bbe Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Feb 2020 18:45:06 +0100 Subject: [PATCH 1127/2161] intel_idle: Annotate init time data structures commit ab1a8522d81ed79f66ba87a02046ef7523f0d702 upstream. Add __initdata or __initconst annotations to the static data structures that are only used during the initialization of the driver. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 78 +++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 78e519cddfd8..a5d578ee8f84 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -192,7 +192,7 @@ static __cpuidle void intel_idle_s2idle(struct cpuidle_device *dev, * which is also the index into the MWAIT hint array. * Thus C0 is a dummy. 
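The point of the two new statics introduced in PATCH 1126 above is that only those scalars are needed after boot, so the per-model idle_cpu descriptions can become init-only data and be released together with the rest of the .init sections. The sketch below is a standalone model of that pattern, not kernel code: the __initconst stub, the field values and the names driver_init()/example_model are illustrative only; in the kernel the annotation places the object in init memory that is freed after boot.

/*
 * Standalone sketch: copy the run-time fields out of an init-only table.
 */
#include <stdbool.h>
#include <stdio.h>

#define __initconst		/* kernel: lands in .init.rodata, freed after init */

struct idle_cpu_model {
	unsigned long auto_demotion_disable_flags;
	bool disable_promotion_to_c1e;
};

/* Consulted only during init; may be discarded afterwards. */
static const struct idle_cpu_model example_model __initconst = {
	.auto_demotion_disable_flags = 0x06,	/* made-up value */
	.disable_promotion_to_c1e = true,
};

/* Plain statics keep only what the run-time paths need. */
static unsigned long auto_demotion_disable_flags;
static bool disable_promotion_to_c1e;

static void driver_init(const struct idle_cpu_model *m)
{
	auto_demotion_disable_flags = m->auto_demotion_disable_flags;
	disable_promotion_to_c1e = m->disable_promotion_to_c1e;
}

int main(void)
{
	driver_init(&example_model);
	printf("flags=%#lx c1e_promotion_off=%d\n",
	       auto_demotion_disable_flags, disable_promotion_to_c1e);
	return 0;
}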
*/ -static struct cpuidle_state nehalem_cstates[] = { +static struct cpuidle_state nehalem_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -229,7 +229,7 @@ static struct cpuidle_state nehalem_cstates[] = { .enter = NULL } }; -static struct cpuidle_state snb_cstates[] = { +static struct cpuidle_state snb_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -274,7 +274,7 @@ static struct cpuidle_state snb_cstates[] = { .enter = NULL } }; -static struct cpuidle_state byt_cstates[] = { +static struct cpuidle_state byt_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -319,7 +319,7 @@ static struct cpuidle_state byt_cstates[] = { .enter = NULL } }; -static struct cpuidle_state cht_cstates[] = { +static struct cpuidle_state cht_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -364,7 +364,7 @@ static struct cpuidle_state cht_cstates[] = { .enter = NULL } }; -static struct cpuidle_state ivb_cstates[] = { +static struct cpuidle_state ivb_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -409,7 +409,7 @@ static struct cpuidle_state ivb_cstates[] = { .enter = NULL } }; -static struct cpuidle_state ivt_cstates[] = { +static struct cpuidle_state ivt_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -446,7 +446,7 @@ static struct cpuidle_state ivt_cstates[] = { .enter = NULL } }; -static struct cpuidle_state ivt_cstates_4s[] = { +static struct cpuidle_state ivt_cstates_4s[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -483,7 +483,7 @@ static struct cpuidle_state ivt_cstates_4s[] = { .enter = NULL } }; -static struct cpuidle_state ivt_cstates_8s[] = { +static struct cpuidle_state ivt_cstates_8s[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -520,7 +520,7 @@ static struct cpuidle_state ivt_cstates_8s[] = { .enter = NULL } }; -static struct cpuidle_state hsw_cstates[] = { +static struct cpuidle_state hsw_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -588,7 +588,7 @@ static struct cpuidle_state hsw_cstates[] = { { .enter = NULL } }; -static struct cpuidle_state bdw_cstates[] = { +static struct cpuidle_state bdw_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -657,7 +657,7 @@ static struct cpuidle_state bdw_cstates[] = { .enter = NULL } }; -static struct cpuidle_state skl_cstates[] = { +static struct cpuidle_state skl_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -726,7 +726,7 @@ static struct cpuidle_state skl_cstates[] = { .enter = NULL } }; -static struct cpuidle_state skx_cstates[] = { +static struct cpuidle_state skx_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -755,7 +755,7 @@ static struct cpuidle_state skx_cstates[] = { .enter = NULL } }; -static struct cpuidle_state atom_cstates[] = { +static struct cpuidle_state atom_cstates[] __initdata = { { .name = "C1E", .desc = "MWAIT 0x00", @@ -791,7 +791,7 @@ static struct cpuidle_state atom_cstates[] = { { .enter = NULL } }; -static struct cpuidle_state tangier_cstates[] = { +static struct cpuidle_state tangier_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -835,7 +835,7 @@ static struct cpuidle_state tangier_cstates[] = { { .enter = NULL } }; -static struct cpuidle_state avn_cstates[] = { +static struct cpuidle_state avn_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -855,7 +855,7 @@ static struct cpuidle_state avn_cstates[] = { { .enter = NULL } }; -static struct cpuidle_state knl_cstates[] = { +static struct cpuidle_state 
knl_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -876,7 +876,7 @@ static struct cpuidle_state knl_cstates[] = { .enter = NULL } }; -static struct cpuidle_state bxt_cstates[] = { +static struct cpuidle_state bxt_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -937,7 +937,7 @@ static struct cpuidle_state bxt_cstates[] = { .enter = NULL } }; -static struct cpuidle_state dnv_cstates[] = { +static struct cpuidle_state dnv_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -966,116 +966,116 @@ static struct cpuidle_state dnv_cstates[] = { .enter = NULL } }; -static const struct idle_cpu idle_cpu_nehalem = { +static const struct idle_cpu idle_cpu_nehalem __initconst = { .state_table = nehalem_cstates, .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, .disable_promotion_to_c1e = true, }; -static const struct idle_cpu idle_cpu_nhx = { +static const struct idle_cpu idle_cpu_nhx __initconst = { .state_table = nehalem_cstates, .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, .disable_promotion_to_c1e = true, .use_acpi = true, }; -static const struct idle_cpu idle_cpu_atom = { +static const struct idle_cpu idle_cpu_atom __initconst = { .state_table = atom_cstates, }; -static const struct idle_cpu idle_cpu_tangier = { +static const struct idle_cpu idle_cpu_tangier __initconst = { .state_table = tangier_cstates, }; -static const struct idle_cpu idle_cpu_lincroft = { +static const struct idle_cpu idle_cpu_lincroft __initconst = { .state_table = atom_cstates, .auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE, }; -static const struct idle_cpu idle_cpu_snb = { +static const struct idle_cpu idle_cpu_snb __initconst = { .state_table = snb_cstates, .disable_promotion_to_c1e = true, }; -static const struct idle_cpu idle_cpu_snx = { +static const struct idle_cpu idle_cpu_snx __initconst = { .state_table = snb_cstates, .disable_promotion_to_c1e = true, .use_acpi = true, }; -static const struct idle_cpu idle_cpu_byt = { +static const struct idle_cpu idle_cpu_byt __initconst = { .state_table = byt_cstates, .disable_promotion_to_c1e = true, .byt_auto_demotion_disable_flag = true, }; -static const struct idle_cpu idle_cpu_cht = { +static const struct idle_cpu idle_cpu_cht __initconst = { .state_table = cht_cstates, .disable_promotion_to_c1e = true, .byt_auto_demotion_disable_flag = true, }; -static const struct idle_cpu idle_cpu_ivb = { +static const struct idle_cpu idle_cpu_ivb __initconst = { .state_table = ivb_cstates, .disable_promotion_to_c1e = true, }; -static const struct idle_cpu idle_cpu_ivt = { +static const struct idle_cpu idle_cpu_ivt __initconst = { .state_table = ivt_cstates, .disable_promotion_to_c1e = true, .use_acpi = true, }; -static const struct idle_cpu idle_cpu_hsw = { +static const struct idle_cpu idle_cpu_hsw __initconst = { .state_table = hsw_cstates, .disable_promotion_to_c1e = true, }; -static const struct idle_cpu idle_cpu_hsx = { +static const struct idle_cpu idle_cpu_hsx __initconst = { .state_table = hsw_cstates, .disable_promotion_to_c1e = true, .use_acpi = true, }; -static const struct idle_cpu idle_cpu_bdw = { +static const struct idle_cpu idle_cpu_bdw __initconst = { .state_table = bdw_cstates, .disable_promotion_to_c1e = true, }; -static const struct idle_cpu idle_cpu_bdx = { +static const struct idle_cpu idle_cpu_bdx __initconst = { .state_table = bdw_cstates, .disable_promotion_to_c1e = true, .use_acpi = true, }; -static const struct idle_cpu idle_cpu_skl = { +static const struct 
idle_cpu idle_cpu_skl __initconst = { .state_table = skl_cstates, .disable_promotion_to_c1e = true, }; -static const struct idle_cpu idle_cpu_skx = { +static const struct idle_cpu idle_cpu_skx __initconst = { .state_table = skx_cstates, .disable_promotion_to_c1e = true, .use_acpi = true, }; -static const struct idle_cpu idle_cpu_avn = { +static const struct idle_cpu idle_cpu_avn __initconst = { .state_table = avn_cstates, .disable_promotion_to_c1e = true, .use_acpi = true, }; -static const struct idle_cpu idle_cpu_knl = { +static const struct idle_cpu idle_cpu_knl __initconst = { .state_table = knl_cstates, .use_acpi = true, }; -static const struct idle_cpu idle_cpu_bxt = { +static const struct idle_cpu idle_cpu_bxt __initconst = { .state_table = bxt_cstates, .disable_promotion_to_c1e = true, }; -static const struct idle_cpu idle_cpu_dnv = { +static const struct idle_cpu idle_cpu_dnv __initconst = { .state_table = dnv_cstates, .disable_promotion_to_c1e = true, .use_acpi = true, -- Gitee From b7e165c2a474a1ee3ff16c96e642cb689c5e7507 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Feb 2020 18:45:18 +0100 Subject: [PATCH 1128/2161] intel_idle: Reorder declarations of static variables commit 6eb0443ac89d8f256a89bb2a14c8238b86a92a61 upstream. Reorder declarations of static variables so that the __initdata ones are declared together. No functional impact. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index a5d578ee8f84..fac1d04ed062 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -65,7 +65,7 @@ static struct cpuidle_driver intel_idle_driver = { static int max_cstate = CPUIDLE_STATE_MAX - 1; static unsigned int disabled_states_mask; -static unsigned int mwait_substates __initdata; +static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; static unsigned long auto_demotion_disable_flags; static bool disable_promotion_to_c1e; @@ -86,9 +86,10 @@ struct idle_cpu { }; static const struct idle_cpu *icpu __initdata; -static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; static struct cpuidle_state *cpuidle_state_table __initdata; +static unsigned int mwait_substates __initdata; + /* * Enable this state by default even if the ACPI _CST does not list it. */ -- Gitee From e3b1416d93903f8101c5c7a991f64a34e2c0dd31 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Feb 2020 18:45:29 +0100 Subject: [PATCH 1129/2161] intel_idle: Clean up kerneldoc comments for multiple functions commit 6eacb15fef4ef90b8c6467bd80b8be2de1fe1b3a upstream. Turn the description comments of some functions in the intel_idle driver into proper kerneldoc ones and clean them up. No functional impact. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 40 ++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index fac1d04ed062..317146eb6e85 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1288,11 +1288,11 @@ static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { } static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; } #endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */ -/* - * ivt_idle_state_table_update(void) +/** + * ivt_idle_state_table_update - Tune the idle states table for Ivy Town. * - * Tune IVT multi-socket targets - * Assumption: num_sockets == (max_package_num + 1) + * Tune IVT multi-socket targets. + * Assumption: num_sockets == (max_package_num + 1). */ static void __init ivt_idle_state_table_update(void) { @@ -1338,11 +1338,11 @@ static unsigned long long __init irtl_2_usec(unsigned long long irtl) return div_u64((irtl & 0x3FF) * ns, NSEC_PER_USEC); } -/* - * bxt_idle_state_table_update(void) +/** + * bxt_idle_state_table_update - Fix up the Broxton idle states table. * - * On BXT, we trust the IRTL to show the definitive maximum latency - * We use the same value for target_residency. + * On BXT, trust the IRTL (Interrupt Response Time Limit) MSR to show the + * definitive maximum latency and use the same value for target_residency. */ static void __init bxt_idle_state_table_update(void) { @@ -1385,11 +1385,11 @@ static void __init bxt_idle_state_table_update(void) } } -/* - * sklh_idle_state_table_update(void) + +/** + * sklh_idle_state_table_update - Fix up the Sky Lake idle states table. * - * On SKL-H (model 0x5e) disable C8 and C9 if: - * C10 is enabled and SGX disabled + * On SKL-H (model 0x5e) skip C8 and C9 if C10 is enabled and SGX disabled. */ static void __init sklh_idle_state_table_update(void) { @@ -1500,9 +1500,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) } } -/* - * intel_idle_cpuidle_driver_init() - * allocate, initialize cpuidle_states +/** + * intel_idle_cpuidle_driver_init - Create the list of available idle states. + * @drv: cpuidle driver structure to initialize. */ static void __init intel_idle_cpuidle_driver_init(struct cpuidle_driver *drv) { @@ -1537,10 +1537,12 @@ static void c1e_promotion_disable(void) wrmsrl(MSR_IA32_POWER_CTL, msr_bits); } -/* - * intel_idle_cpu_init() - * allocate, initialize, register cpuidle_devices - * @cpu: cpu/core to initialize +/** + * intel_idle_cpu_init - Register the target CPU with the cpuidle core. + * @cpu: CPU to initialize. + * + * Register a cpuidle device object for @cpu and update its MSRs in accordance + * with the processor model flags. */ static int intel_idle_cpu_init(unsigned int cpu) { -- Gitee From cdca66856e8b091ec98cc918c36505bb7b7a4bb7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Feb 2020 18:45:39 +0100 Subject: [PATCH 1130/2161] intel_idle: Define CPUIDLE_FLAG_TLB_FLUSHED as BIT(16) commit a472e4b5921e01c4ef84dd87d2bb30f1a6a896d4 upstream. Use the BIT() macro for defining CPUIDLE_FLAG_TLB_FLUSHED instead of the hex bit encoding of the same value. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 317146eb6e85..0d3d01c4db0c 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -101,7 +101,7 @@ static unsigned int mwait_substates __initdata; * If this flag is set, SW flushes the TLB, so even if the * HW doesn't do the flushing, this flag is safe to use. */ -#define CPUIDLE_FLAG_TLB_FLUSHED 0x10000 +#define CPUIDLE_FLAG_TLB_FLUSHED BIT(16) /* * MWAIT takes an 8-bit "hint" in EAX "suggesting" -- Gitee From 2f7765e920b5c416e5cf168575123e8f24175e1e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Feb 2020 18:45:49 +0100 Subject: [PATCH 1131/2161] intel_idle: Update copyright notice, known limitations and version commit 317e5ec3ecaaa2d461d3044eb6cb9cc3a59de2f2 upstream. Update the copyright notice in intel_idle.c to cover the recent changes, drop the description of a "known limitation" that is not a limitation any more and bump up the driver version number. No functional impact. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 0d3d01c4db0c..810ac2cbabe9 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -2,8 +2,9 @@ /* * intel_idle.c - native hardware idle loop for modern Intel processors * - * Copyright (c) 2013, Intel Corporation. + * Copyright (c) 2013 - 2020, Intel Corporation. * Len Brown + * Rafael J. Wysocki */ /* @@ -25,11 +26,6 @@ /* * Known limitations * - * The driver currently initializes for_each_online_cpu() upon modprobe. - * It it unaware of subsequent processors hot-added to the system. - * This means that if you boot with maxcpus=n and later online - * processors above n, those processors will use C1 only. - * * ACPI has a .suspend hack to turn off deep c-statees during suspend * to avoid complications with the lapic timer workaround. * Have not seen issues with suspend, but may need same workaround here. @@ -55,7 +51,7 @@ #include #include -#define INTEL_IDLE_VERSION "0.4.1" +#define INTEL_IDLE_VERSION "0.5.1" static struct cpuidle_driver intel_idle_driver = { .name = "intel_idle", -- Gitee From d4b791fa37db9823daf147cffc9244b4bd2f291d Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Wed, 4 Mar 2020 12:32:48 +0100 Subject: [PATCH 1132/2161] cpuidle: haltpoll: allow force loading on hosts without the REALTIME hint commit dd52551fb786aa7371bf9fe922573deafc26287e upstream. Before commit 1328edca4a14 ("cpuidle-haltpoll: Enable kvm guest polling when dedicated physical CPUs are available") the cpuidle-haltpoll driver could also be used in scenarios when the host does not advertise the KVM_HINTS_REALTIME hint. While the behavior introduced by the aforementioned commit makes sense as the default there are cases where the old behavior is desired, for example, when other kernel changes triggered by presence by this hint are unwanted, for some workloads where the latency benefit from polling overweights the loss from idle CPU capacity that otherwise would be available, or just when running under older Qemu versions that lack this hint. Let's provide a typical "force" module parameter that allows restoring the old behavior. Signed-off-by: Maciej S. 
Szmigiero Reviewed-by: Marcelo Tosatti Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/cpuidle-haltpoll.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c index b0ce9bc78113..db124bc1ca2c 100644 --- a/drivers/cpuidle/cpuidle-haltpoll.c +++ b/drivers/cpuidle/cpuidle-haltpoll.c @@ -18,6 +18,10 @@ #include #include +static bool force __read_mostly; +module_param(force, bool, 0444); +MODULE_PARM_DESC(force, "Load unconditionally"); + static struct cpuidle_device __percpu *haltpoll_cpuidle_devices; static enum cpuhp_state haltpoll_hp_state; @@ -90,6 +94,11 @@ static void haltpoll_uninit(void) haltpoll_cpuidle_devices = NULL; } +static bool haltpool_want(void) +{ + return kvm_para_has_hint(KVM_HINTS_REALTIME) || force; +} + static int __init haltpoll_init(void) { int ret; @@ -101,8 +110,7 @@ static int __init haltpoll_init(void) cpuidle_poll_state_init(drv); - if (!kvm_para_available() || - !kvm_para_has_hint(KVM_HINTS_REALTIME)) + if (!kvm_para_available() || !haltpool_want()) return -ENODEV; ret = cpuidle_register_driver(drv); -- Gitee From 23cec22439eb590fe85ca8dfffa89efea37d2b05 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Jun 2020 13:58:28 +0200 Subject: [PATCH 1133/2161] intel_idle: Eliminate redundant static variable commit dab20177b626198603176604c0f85ea1d67044ef upstream. The value of the lapic_timer_always_reliable static variable in the intel_idle driver reflects the boot_cpu_has(X86_FEATURE_ARAT) value and so it also reflects the static_cpu_has(X86_FEATURE_ARAT) value. Hence, the lapic_timer_always_reliable check in intel_idle() is redundant and apart from this lapic_timer_always_reliable is only used in two places in which boot_cpu_has(X86_FEATURE_ARAT) can be used directly. Eliminate the lapic_timer_always_reliable variable in accordance with the above observations. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 810ac2cbabe9..a81a8e06e351 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -66,8 +66,6 @@ static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; static unsigned long auto_demotion_disable_flags; static bool disable_promotion_to_c1e; -static bool lapic_timer_always_reliable; - struct idle_cpu { struct cpuidle_state *state_table; @@ -142,7 +140,7 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED) leave_mm(cpu); - if (!static_cpu_has(X86_FEATURE_ARAT) && !lapic_timer_always_reliable) { + if (!static_cpu_has(X86_FEATURE_ARAT)) { /* * Switch over to one-shot tick broadcast if the target C-state * is deeper than C1. 
@@ -1565,7 +1563,7 @@ static int intel_idle_cpu_online(unsigned int cpu) { struct cpuidle_device *dev; - if (!lapic_timer_always_reliable) + if (!boot_cpu_has(X86_FEATURE_ARAT)) tick_broadcast_enable(); /* @@ -1658,16 +1656,13 @@ static int __init intel_idle_init(void) goto init_driver_fail; } - if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ - lapic_timer_always_reliable = true; - retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online", intel_idle_cpu_online, NULL); if (retval < 0) goto hp_setup_fail; pr_debug("Local APIC timer is reliable in %s\n", - lapic_timer_always_reliable ? "all C-states" : "C1"); + boot_cpu_has(X86_FEATURE_ARAT) ? "all C-states" : "C1"); return 0; -- Gitee From b290dfb82ae4fb3ed6da60c4649e895ceda62639 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Fri, 10 Jul 2020 12:12:01 +0800 Subject: [PATCH 1134/2161] intel_idle: Customize IceLake server support commit a472ad2bcea479ba068880125d7273fc95c14b70 upstream. On ICX platform, the C1E auto-promotion is enabled by default. As a result, the CPU might fall into C1E more offen than previous platforms. Besides, the C1E is not exposed to sysfs on ICX, which is inconsistent with previous server platforms. So disable C1E auto-promotion and expose C1E as a separate idle state, so the C1E and C6 can be disabled via sysfs when necessary. Beside C1 and C1E, the exit latency of C6 was measured by a dedicated tool. However the exit latency(41us) exposed by _CST is much smaller than the one we measured(128us). This is probably due to the _CST uses the exit latency when woken up from PC0+C6, rather than PC6+C6 when C6 was measured. Choose the latter as we need the longest latency in theory. Reported-by: kernel test robot Tested-by: Artem Bityutskiy Acked-by: Artem Bityutskiy Reviewed-by: Zhang Rui Signed-off-by: Chen Yu Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index a81a8e06e351..13effdf71525 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -750,6 +750,35 @@ static struct cpuidle_state skx_cstates[] __initdata = { .enter = NULL } }; +static struct cpuidle_state icx_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 4, + .target_residency = 4, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 128, + .target_residency = 384, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + static struct cpuidle_state atom_cstates[] __initdata = { { .name = "C1E", @@ -1054,6 +1083,12 @@ static const struct idle_cpu idle_cpu_skx __initconst = { .use_acpi = true, }; +static const struct idle_cpu idle_cpu_icx __initconst = { + .state_table = icx_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct idle_cpu idle_cpu_avn __initconst = { .state_table = avn_cstates, .disable_promotion_to_c1e = true, @@ -1108,6 +1143,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { INTEL_CPU_FAM6(KABYLAKE_L, idle_cpu_skl), INTEL_CPU_FAM6(KABYLAKE, idle_cpu_skl), INTEL_CPU_FAM6(SKYLAKE_X, idle_cpu_skx), + INTEL_CPU_FAM6(ICELAKE_X, idle_cpu_icx), INTEL_CPU_FAM6(XEON_PHI_KNL, idle_cpu_knl), INTEL_CPU_FAM6(XEON_PHI_KNM, idle_cpu_knl), INTEL_CPU_FAM6(ATOM_GOLDMONT, idle_cpu_bxt), -- Gitee From 199d31bfa5e10e857bb6b3114242359172a59653 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Jun 2020 13:09:38 -0700 Subject: [PATCH 1135/2161] treewide: Remove uninitialized_var() usage for intel_idle.c commit opencloudos. Using uninitialized_var() is dangerous as it papers over real bugs[1] (or can in the future), and suppresses unrelated compiler warnings (e.g. "unused variable"). If the compiler thinks it is uninitialized, either simply initialize the variable or make compiler changes. In preparation for removing[2] the[3] macro[4], remove all remaining needless uses with the following script: git grep '\buninitialized_var\b' | cut -d: -f1 | sort -u | \ xargs perl -pi -e \ 's/\buninitialized_var\(([^\)]+)\)/\1/g; s:\s*/\* (GCC be quiet|to make compiler happy) \*/$::g;' drivers/video/fbdev/riva/riva_hw.c was manually tweaked to avoid pathological white-space. No outstanding warnings were found building allmodconfig with GCC 9.3.0 for x86_64, i386, arm64, arm, powerpc, powerpc64le, s390x, mips, sparc64, alpha, and m68k. 
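Background on the macro being removed here: uninitialized_var() followed the old GCC definition in <linux/compiler-gcc.h>, a self-assignment that silences -Wmaybe-uninitialized without actually initializing anything. The small out-of-tree example below demonstrates the trap; maybe_set(), the value 3 and the variable names are made up for illustration.

#include <stdio.h>

#define uninitialized_var(x) x = x	/* old style: warning suppressed, value still indeterminate */

static int maybe_set(int flag, int *out)
{
	if (flag) {
		*out = 3;
		return 1;
	}
	return 0;			/* *out left untouched */
}

int main(void)
{
	int val = 0;			/* the replacement pattern: a real initialization */
	int uninitialized_var(old);	/* expands to "int old = old;" */

	if (maybe_set(1, &val))
		printf("val=%d\n", val);

	/*
	 * If maybe_set() ever returned 0 here, "old" would be read while
	 * holding garbage, yet the compiler stays silent because of the
	 * self-assignment above.
	 */
	if (maybe_set(1, &old))
		printf("old=%d\n", old);

	return 0;
}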
[1] https://lore.kernel.org/lkml/20200603174714.192027-1-glider@google.com/ [2] https://lore.kernel.org/lkml/CA+55aFw+Vbj0i=1TGqCR5vQkCzWJ0QxK6CernOU6eedsudAixw@mail.gmail.com/ [3] https://lore.kernel.org/lkml/CA+55aFwgbgqhbp1fkxvRKEpzyR5J8n1vKT1VZdz9knmPuXhOeg@mail.gmail.com/ [4] https://lore.kernel.org/lkml/CA+55aFz2500WfbKXAx8s67wrm9=yVJu65TpLgN_ybYNv0VEOKA@mail.gmail.com/ Reviewed-by: Leon Romanovsky # drivers/infiniband and mlx4/mlx5 Acked-by: Jason Gunthorpe # IB Acked-by: Kalle Valo # wireless drivers Reviewed-by: Chao Yu # erofs Signed-off-by: Kees Cook Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 13effdf71525..e6bcb4ebf04a 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -130,7 +130,7 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state = &drv->states[index]; unsigned long eax = flg2MWAIT(state->flags); unsigned long ecx = 1; /* break on interrupt flag */ - bool uninitialized_var(tick); + bool tick; int cpu = smp_processor_id(); /* -- Gitee From 54851dd08ea1330a6fd25fa0caa7298f381b1f52 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 25 Apr 2022 20:59:28 +0800 Subject: [PATCH 1136/2161] cpuidle: Make CPUIDLE_FLAG_TLB_FLUSHED generic commit bf9282dc26e7fe2a0736edc568762f0f05d12416 upstream. This allows moving the leave_mm() call into generic code before rcu_idle_enter(). Gets rid of more trace_*_rcuidle() users. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Thomas Gleixner Acked-by: Rafael J. Wysocki Tested-by: Marco Elver Link: https://lkml.kernel.org/r/20200821085348.369441600@infradead.org Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/mmu.h | 1 + arch/x86/mm/tlb.c | 13 ++----------- drivers/cpuidle/cpuidle.c | 4 ++++ drivers/idle/intel_idle.c | 16 ---------------- include/linux/cpuidle.h | 13 +++++++------ include/linux/mmu_context.h | 5 +++++ 6 files changed, 19 insertions(+), 33 deletions(-) diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index e78c7db87801..d232de747837 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -63,5 +63,6 @@ typedef struct { } void leave_mm(int cpu); +#define leave_mm leave_mm #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 851359b7edc5..14b44ac910ad 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -422,21 +422,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); load_new_mm_cr3(next->pgd, new_asid, true); - /* - * NB: This gets called via leave_mm() in the idle path - * where RCU functions differently. Tracing normally - * uses RCU, so we need to use the _rcuidle variant. - * - * (There is no good reason for this. The idle code should - * be rearranged to call this before rcu_idle_enter().) - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ load_new_mm_cr3(next->pgd, new_asid, false); - /* See above wrt _rcuidle. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); } /* Make sure we write CR3 before loaded_mm. 
*/ diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index c35928ccc14d..0acdbc7b0ef3 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "cpuidle.h" @@ -225,6 +226,9 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, broadcast = false; } + if (target_state->flags & CPUIDLE_FLAG_TLB_FLUSHED) + leave_mm(dev->cpu); + /* Take note of the planned idle state. */ sched_idle_set_state(target_state); diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index e6bcb4ebf04a..cf6d1f7a0871 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -89,14 +89,6 @@ static unsigned int mwait_substates __initdata; */ #define CPUIDLE_FLAG_ALWAYS_ENABLE BIT(15) -/* - * Set this flag for states where the HW flushes the TLB for us - * and so we don't need cross-calls to keep it consistent. - * If this flag is set, SW flushes the TLB, so even if the - * HW doesn't do the flushing, this flag is safe to use. - */ -#define CPUIDLE_FLAG_TLB_FLUSHED BIT(16) - /* * MWAIT takes an 8-bit "hint" in EAX "suggesting" * the C-state (top nibble) and sub-state (bottom nibble) @@ -131,14 +123,6 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, unsigned long eax = flg2MWAIT(state->flags); unsigned long ecx = 1; /* break on interrupt flag */ bool tick; - int cpu = smp_processor_id(); - - /* - * leave_mm() to avoid costly and often unnecessary wakeups - * for flushing the user TLB's associated with the active mm. - */ - if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED) - leave_mm(cpu); if (!static_cpu_has(X86_FEATURE_ARAT)) { /* diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index ec2ef63771f0..9386ec4f21d1 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -72,12 +72,13 @@ struct cpuidle_state { }; /* Idle State Flags */ -#define CPUIDLE_FLAG_NONE (0x00) -#define CPUIDLE_FLAG_POLLING BIT(0) /* polling state */ -#define CPUIDLE_FLAG_COUPLED BIT(1) /* state applies to multiple cpus */ -#define CPUIDLE_FLAG_TIMER_STOP BIT(2) /* timer is stopped on this state */ -#define CPUIDLE_FLAG_UNUSABLE BIT(3) /* avoid using this state */ -#define CPUIDLE_FLAG_OFF BIT(4) /* disable this state by default */ +#define CPUIDLE_FLAG_NONE (0x00) +#define CPUIDLE_FLAG_POLLING BIT(0) /* polling state */ +#define CPUIDLE_FLAG_COUPLED BIT(1) /* state applies to multiple cpus */ +#define CPUIDLE_FLAG_TIMER_STOP BIT(2) /* timer is stopped on this state */ +#define CPUIDLE_FLAG_UNUSABLE BIT(3) /* avoid using this state */ +#define CPUIDLE_FLAG_OFF BIT(4) /* disable this state by default */ +#define CPUIDLE_FLAG_TLB_FLUSHED BIT(5) /* idle-state flushes TLBs */ struct cpuidle_device_kobj; struct cpuidle_state_kobj; diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index d9a543a9e1cc..2494b640d5c7 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -3,6 +3,7 @@ #define _LINUX_MMU_CONTEXT_H #include +#include struct mm_struct; @@ -14,4 +15,8 @@ void unuse_mm(struct mm_struct *mm); # define switch_mm_irqs_off switch_mm #endif +#ifndef leave_mm +static inline void leave_mm(int cpu) { } +#endif + #endif -- Gitee From 162a5802c0a0371aa4ad5e0120cef6368a56b07a Mon Sep 17 00:00:00 2001 From: Alexander Monakov Date: Mon, 12 Oct 2020 15:50:33 +0300 Subject: [PATCH 1137/2161] intel_idle: mention assumption that WBINVD is not needed commit 8bb2e2a887afdf8a39e68fa0dccf82a168aae655 upstream. 
Intel SDM does not explicitly say that entering a C-state via MWAIT will implicitly flush CPU caches as appropriate for that C-state. However, documentation for individual Intel CPU generations does mention this behavior. Since intel_idle binds to any Intel CPU with MWAIT, list this assumption of MWAIT behavior. In passing, reword opening comment to make it clear that the driver can load on any old and future Intel CPU with MWAIT. Signed-off-by: Alexander Monakov Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index cf6d1f7a0871..5d263497ff58 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -8,7 +8,7 @@ */ /* - * intel_idle is a cpuidle driver that loads on specific Intel processors + * intel_idle is a cpuidle driver that loads on all Intel CPUs with MWAIT * in lieu of the legacy ACPI processor_idle driver. The intent is to * make Linux more efficient on these processors, as intel_idle knows * more than ACPI, as well as make Linux more immune to ACPI BIOS bugs. @@ -20,7 +20,11 @@ * All CPUs have same idle states as boot CPU * * Chipset BM_STS (bus master status) bit is a NOP - * for preventing entry into deep C-stats + * for preventing entry into deep C-states + * + * CPU will flush caches as needed when entering a C-state via MWAIT + * (in contrast to entering ACPI C3, in which case the WBINVD + * instruction needs to be executed to flush the caches) */ /* -- Gitee From 7c1e47f601b419e0498c1fb141b40f54ea5e3ef3 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 15 Oct 2020 16:44:27 +0200 Subject: [PATCH 1138/2161] cpuidle: Remove pointless stub commit bae314dd5d8dfdd90ee584003a0f8c06e1bf3ea2 upstream. The cpuidle.h header is declaring a function with an empty stub for the cpuidle disabled case, but that function is only called by cpuidle governors which depend on cpuidle anyway. In other words, the function is only called when cpuidle is enabled, so there is no need for the stub. Remove the pointless stub. Signed-off-by: Daniel Lezcano [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/cpuidle.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 9386ec4f21d1..48a1840a7e3d 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -266,13 +266,8 @@ struct cpuidle_governor { void (*reflect) (struct cpuidle_device *dev, int index); }; -#ifdef CONFIG_CPU_IDLE extern int cpuidle_register_governor(struct cpuidle_governor *gov); extern s64 cpuidle_governor_latency_req(unsigned int cpu); -#else -static inline int cpuidle_register_governor(struct cpuidle_governor *gov) -{return 0;} -#endif #define __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, \ idx, \ -- Gitee From d309e3dab6380c92db56f25e6cbb0959407fe042 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 16 Oct 2020 17:28:32 +0200 Subject: [PATCH 1139/2161] intel_idle: Ignore _CST if control cannot be taken from the platform commit 75af76d0a34e048651a6af311781d7206b6964c7 upstream. e6d4f08a6776 ("intel_idle: Use ACPI _CST on server systems") avoids enabling c-states that have been disabled by the platform with the exception of C1E. Unfortunately, BIOS implementations are not always consistent in terms of how capabilities are advertised and control cannot always be handed over. 
If control cannot be handed over then intel_idle reports that "ACPI _CST not found or not usable" but does not clear acpi_state_table.count meaning the information is still partially used. This patch ignores ACPI information if CST control cannot be requested from the platform. This was only observed on a number of Haswell platforms that had identical CPUs but not identical BIOS versions. While this problem may be rare overall, 24 separate test cases bisected to this specific commit across 4 separate test machines and is worth addressing. If the situation occurs, the kernel behaves as it did before commit e6d4f08a6776 and uses any c-states that are discovered. The affected test cases were all ones that involved a small number of processes -- exec microbenchmark, pipe microbenchmark, git test suite, netperf, tbench with one client and system call microbenchmark. Each case benefits from being able to use turboboost which is prevented if the lower c-states are unavailable. This may mask real regressions specific to older hardware so it is worth addressing. C-state status before and after the patch 5.9.0-vanilla POLL latency:0 disabled:0 default:enabled 5.9.0-vanilla C1 latency:2 disabled:0 default:enabled 5.9.0-vanilla C1E latency:10 disabled:0 default:enabled 5.9.0-vanilla C3 latency:33 disabled:1 default:disabled 5.9.0-vanilla C6 latency:133 disabled:1 default:disabled 5.9.0-ignore-cst-v1r1 POLL latency:0 disabled:0 default:enabled 5.9.0-ignore-cst-v1r1 C1 latency:2 disabled:0 default:enabled 5.9.0-ignore-cst-v1r1 C1E latency:10 disabled:0 default:enabled 5.9.0-ignore-cst-v1r1 C3 latency:33 disabled:0 default:enabled 5.9.0-ignore-cst-v1r1 C6 latency:133 disabled:0 default:enabled Patch enables C3/C6. Netperf UDP_STREAM netperf-udp 5.5.0 5.9.0 vanilla ignore-cst-v1r1 Hmean send-64 193.41 ( 0.00%) 226.54 * 17.13%* Hmean send-128 392.16 ( 0.00%) 450.54 * 14.89%* Hmean send-256 769.94 ( 0.00%) 881.85 * 14.53%* Hmean send-1024 2994.21 ( 0.00%) 3468.95 * 15.85%* Hmean send-2048 5725.60 ( 0.00%) 6628.99 * 15.78%* Hmean send-3312 8468.36 ( 0.00%) 10288.02 * 21.49%* Hmean send-4096 10135.46 ( 0.00%) 12387.57 * 22.22%* Hmean send-8192 17142.07 ( 0.00%) 19748.11 * 15.20%* Hmean send-16384 28539.71 ( 0.00%) 30084.45 * 5.41%* Fixes: e6d4f08a6776 ("intel_idle: Use ACPI _CST on server systems") Signed-off-by: Mel Gorman Cc: 5.6+ # 5.6+ Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 5d263497ff58..3e5e7ab3ff41 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1217,14 +1217,13 @@ static bool __init intel_idle_acpi_cst_extract(void) if (!intel_idle_cst_usable()) continue; - if (!acpi_processor_claim_cst_control()) { - acpi_state_table.count = 0; - return false; - } + if (!acpi_processor_claim_cst_control()) + break; return true; } + acpi_state_table.count = 0; pr_debug("ACPI _CST not found or not usable\n"); return false; } -- Gitee From 317b195587d440e0ee876d11cdd4291f7bb17172 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Sun, 25 Oct 2020 00:29:53 +0800 Subject: [PATCH 1140/2161] intel_idle: Fix max_cstate for processor models without C-state tables commit 4e0ba5577dba686f96c1c10ef4166380667fdec7 upstream. Currently intel_idle driver gets the c-state information from ACPI _CST if the processor model is not recognized by it. 
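The indexing mismatch behind this fix is spelled out just below: ACPI _CST entries are numbered from 1, while the driver's limit check was written for its own 0-based tables, so passing the raw _CST index made intel_idle.max_cstate=1 drop even C1. The standalone sketch that follows models the two behaviours; the limit test mirrors intel_idle_max_cstate_reached() in simplified form, the state names are placeholders, and the POLL state shown in the commit message is added separately by the driver, so it does not appear here.

/*
 * Standalone model of the max_cstate check before and after the fix.
 */
#include <stdio.h>

static int max_cstate = 1;		/* as with intel_idle.max_cstate=1 */

static int max_cstate_reached(int cstate)
{
	return cstate + 1 > max_cstate;	/* simplified driver check */
}

int main(void)
{
	const char *acpi_states[] = { "(unused)", "C1_ACPI", "C2_ACPI", "C3_ACPI" };
	int count = 3;			/* _CST reports states 1..3 */

	for (int pass = 0; pass < 2; pass++) {
		int fixed = pass;	/* 0: old behaviour, 1: with "cstate - 1" */

		printf("%s:\n", fixed ? "after the fix" : "before the fix");
		for (int cstate = 1; cstate <= count; cstate++) {
			if (max_cstate_reached(fixed ? cstate - 1 : cstate))
				break;
			printf("  registering %s\n", acpi_states[cstate]);
		}
	}
	return 0;
}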
However the c-state in _CST starts with index 1 which is different from the index in intel_idle driver's internal c-state table. While intel_idle_max_cstate_reached() was previously introduced to deal with intel_idle driver's internal c-state table, re-using this function directly on _CST is incorrect. Fix this by subtracting 1 from the index when checking max_cstate in the _CST case. For example, append intel_idle.max_cstate=1 in boot command line, Before the patch: grep . /sys/devices/system/cpu/cpu0/cpuidle/state*/name POLL After the patch: grep . /sys/devices/system/cpu/cpu0/cpuidle/state*/name /sys/devices/system/cpu/cpu0/cpuidle/state0/name:POLL /sys/devices/system/cpu/cpu0/cpuidle/state1/name:C1_ACPI Fixes: 18734958e9bf ("intel_idle: Use ACPI _CST for processor models without C-state tables") Reported-by: Pengfei Xu Cc: 5.6+ # 5.6+ Signed-off-by: Chen Yu [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 3e5e7ab3ff41..f5194f854ac9 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1240,7 +1240,7 @@ static void __init intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) struct acpi_processor_cx *cx; struct cpuidle_state *state; - if (intel_idle_max_cstate_reached(cstate)) + if (intel_idle_max_cstate_reached(cstate - 1)) break; cx = &acpi_state_table.states[cstate]; -- Gitee From 4c00dfa91fff1ec773288588acd65f54c756878b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Nov 2020 11:28:35 +0100 Subject: [PATCH 1141/2161] intel_idle: Fix intel_idle() vs tracing commit 6e1d2bc675bd57640f5658a4a657ae488db4c204 upstream. cpuidle->enter() callbacks should not call into tracing because RCU has already been disabled. Instead of doing the broadcast thing itself, simply advertise to the cpuidle core that those states stop the timer. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J. Wysocki Link: https://lkml.kernel.org/r/20201123143510.GR3021@hirez.programming.kicks-ass.net Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index f5194f854ac9..36cc9981f339 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -126,26 +126,9 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state = &drv->states[index]; unsigned long eax = flg2MWAIT(state->flags); unsigned long ecx = 1; /* break on interrupt flag */ - bool tick; - - if (!static_cpu_has(X86_FEATURE_ARAT)) { - /* - * Switch over to one-shot tick broadcast if the target C-state - * is deeper than C1. - */ - if ((eax >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) { - tick = true; - tick_broadcast_enter(); - } else { - tick = false; - } - } mwait_idle_with_hints(eax, ecx); - if (!static_cpu_has(X86_FEATURE_ARAT) && tick) - tick_broadcast_exit(); - return index; } @@ -1228,6 +1211,20 @@ static bool __init intel_idle_acpi_cst_extract(void) return false; } +static bool __init intel_idle_state_needs_timer_stop(struct cpuidle_state *state) +{ + unsigned long eax = flg2MWAIT(state->flags); + + if (boot_cpu_has(X86_FEATURE_ARAT)) + return false; + + /* + * Switch over to one-shot tick broadcast if the target C-state + * is deeper than C1. 
+ */ + return !!((eax >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK); +} + static void __init intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { int cstate, limit = min_t(int, CPUIDLE_STATE_MAX, acpi_state_table.count); @@ -1270,6 +1267,9 @@ static void __init intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) if (disabled_states_mask & BIT(cstate)) state->flags |= CPUIDLE_FLAG_OFF; + if (intel_idle_state_needs_timer_stop(state)) + state->flags |= CPUIDLE_FLAG_TIMER_STOP; + state->enter = intel_idle; state->enter_s2idle = intel_idle_s2idle; } @@ -1508,6 +1508,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) !(cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_ALWAYS_ENABLE))) drv->states[drv->state_count].flags |= CPUIDLE_FLAG_OFF; + if (intel_idle_state_needs_timer_stop(&drv->states[drv->state_count])) + drv->states[drv->state_count].flags |= CPUIDLE_FLAG_TIMER_STOP; + drv->state_count++; } -- Gitee From e78d64ef6a891d34654f038b0c1d1906afb514e4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Nov 2020 12:54:34 +0100 Subject: [PATCH 1142/2161] intel_idle: Build fix commit 4d916140bf28ff027997144ea1bb4299e1536f87 upstream. Because CONFIG_ soup. Fixes: 6e1d2bc675bd ("intel_idle: Fix intel_idle() vs tracing") Reported-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201130115402.GO3040@hirez.programming.kicks-ass.net Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 36cc9981f339..69c20484eaab 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1141,6 +1141,20 @@ static bool __init intel_idle_max_cstate_reached(int cstate) return false; } +static bool __init intel_idle_state_needs_timer_stop(struct cpuidle_state *state) +{ + unsigned long eax = flg2MWAIT(state->flags); + + if (boot_cpu_has(X86_FEATURE_ARAT)) + return false; + + /* + * Switch over to one-shot tick broadcast if the target C-state + * is deeper than C1. + */ + return !!((eax >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK); +} + #ifdef CONFIG_ACPI_PROCESSOR_CSTATE #include @@ -1211,20 +1225,6 @@ static bool __init intel_idle_acpi_cst_extract(void) return false; } -static bool __init intel_idle_state_needs_timer_stop(struct cpuidle_state *state) -{ - unsigned long eax = flg2MWAIT(state->flags); - - if (boot_cpu_has(X86_FEATURE_ARAT)) - return false; - - /* - * Switch over to one-shot tick broadcast if the target C-state - * is deeper than C1. - */ - return !!((eax >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK); -} - static void __init intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { int cstate, limit = min_t(int, CPUIDLE_STATE_MAX, acpi_state_table.count); -- Gitee From f52e31408bca540ed1f2689d86286f7406b4c4ba Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 27 Dec 2020 12:11:16 +0200 Subject: [PATCH 1143/2161] intel_idle: add SnowRidge C-state table commit 9cf93f056f783f986c19f40d5304d1bcffa0fc0d upstream. Add C-state table for the SnowRidge SoC which is found on Intel Jacobsville platforms. The following has been changed. 1. C1E latency changed from 10us to 15us. It was measured using the open source "wult" tool (the "nic" method, 15us is the 99.99th percentile). 2. C1E power break even changed from 20us to 25us, which may result in less C1E residency in some workloads. 3. C6 latency changed from 50us to 130us. 
Measured the same way as C1E. The C6 C-state is supported only by some SnowRidge revisions, so add a C-state table commentary about this. On SnowRidge, C6 support is enumerated via the usual mechanism: "mwait" leaf of the "cpuid" instruction. The 'intel_idle' driver does check this leaf, so even though C6 is present in the table, the driver will only use it if the CPU does support it. Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 41 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 69c20484eaab..9fbc1cb49a30 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -961,6 +961,39 @@ static struct cpuidle_state dnv_cstates[] __initdata = { .enter = NULL } }; +/* + * Note, depending on HW and FW revision, SnowRidge SoC may or may not support + * C6, and this is indicated in the CPUID mwait leaf. + */ +static struct cpuidle_state snr_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 2, + .target_residency = 2, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 15, + .target_residency = 25, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 130, + .target_residency = 500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + static const struct idle_cpu idle_cpu_nehalem __initconst = { .state_table = nehalem_cstates, .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, @@ -1082,6 +1115,12 @@ static const struct idle_cpu idle_cpu_dnv __initconst = { .use_acpi = true, }; +static const struct idle_cpu idle_cpu_snr __initconst = { + .state_table = snr_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct x86_cpu_id intel_idle_ids[] __initconst = { INTEL_CPU_FAM6(NEHALEM_EP, idle_cpu_nhx), INTEL_CPU_FAM6(NEHALEM, idle_cpu_nehalem), @@ -1120,7 +1159,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { INTEL_CPU_FAM6(ATOM_GOLDMONT, idle_cpu_bxt), INTEL_CPU_FAM6(ATOM_GOLDMONT_PLUS, idle_cpu_bxt), INTEL_CPU_FAM6(ATOM_GOLDMONT_D, idle_cpu_dnv), - INTEL_CPU_FAM6(ATOM_TREMONT_D, idle_cpu_dnv), + INTEL_CPU_FAM6(ATOM_TREMONT_D, idle_cpu_snr), {} }; -- Gitee From d89061f3c4d9434732413bec2abe740d945b5c1d Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 15 Jan 2021 15:56:46 -0800 Subject: [PATCH 1144/2161] intel_idle: remove definition of DEBUG commit 651bc5816c39e57833fea4478c8ecfb72ad47e44 upstream. Defining DEBUG should only be done in development. So remove DEBUG. Signed-off-by: Tom Rix Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 9fbc1cb49a30..f239d8ca698f 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -37,7 +37,7 @@ */ /* un-comment DEBUG to enable pr_debug() statements */ -#define DEBUG +/* #define DEBUG */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -- Gitee From 7d25c5a5e7d53cd9393ba3f0a4d6da7a354b1b05 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 8 Mar 2021 16:31:46 +0200 Subject: [PATCH 1145/2161] intel_idle: update ICX C6 data commit d484b8bfc6fa71a088e4ac85d9ce11aa0385867e upstream. Change IceLake Xeon C6 latency from 128 us to 170 us. The latency was measured with the "wult" tool and corresponds to the 99.99th percentile when measuring with the "nic" method. Note, the 128 us figure correspond to the median latency, but in intel_idle we use the "worst case" latency figure instead. C6 target residency was increased from 384 us to 600 us, which may result in less C6 residency in some workloads. This value was tested and compared to values 384, and 1000. Value 600 is a reasonable tradeoff between power and performance. Signed-off-by: Artem Bityutskiy Acked-by: Zhang Rui Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index f239d8ca698f..5698df6d208b 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -742,8 +742,8 @@ static struct cpuidle_state icx_cstates[] __initdata = { .name = "C6", .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 128, - .target_residency = 384, + .exit_latency = 170, + .target_residency = 600, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -- Gitee From 4e7dc2c28cf4116a933a2c26a5e7d83a68d476f3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Mar 2021 20:37:12 +0200 Subject: [PATCH 1146/2161] cpuidle: menu: Take negative "sleep length" values into account commit 060e3535adf5c961b01421b9fdaddab8dd43ba85 upstream. Make the menu governor check the tick_nohz_get_next_hrtimer() return value so as to avoid dealing with negative "sleep length" values and make it use that value directly when the tick is stopped. While at it, rename local variable delta_next in menu_select() to delta_tick which better reflects its purpose. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/governors/menu.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index b0a7ad566081..c3aa8d6ccee3 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -271,7 +271,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, u64 predicted_ns; u64 interactivity_req; unsigned long nr_iowaiters; - ktime_t delta_next; + ktime_t delta, delta_tick; int i, idx; if (data->needs_update) { @@ -280,7 +280,12 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } /* determine the expected residency time, round up */ - data->next_timer_ns = tick_nohz_get_sleep_length(&delta_next); + delta = tick_nohz_get_sleep_length(&delta_tick); + if (unlikely(delta < 0)) { + delta = 0; + delta_tick = 0; + } + data->next_timer_ns = delta; nr_iowaiters = nr_iowait_cpu(dev->cpu); data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters); @@ -318,7 +323,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * state selection. */ if (predicted_ns < TICK_NSEC) - predicted_ns = delta_next; + predicted_ns = data->next_timer_ns; } else { /* * Use the performance multiplier and the user-configurable @@ -377,7 +382,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * stuck in the shallow one for too long. */ if (drv->states[idx].target_residency_ns < TICK_NSEC && - s->target_residency_ns <= delta_next) + s->target_residency_ns <= delta_tick) idx = i; return idx; @@ -399,7 +404,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, predicted_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) { *stop_tick = false; - if (idx > 0 && drv->states[idx].target_residency_ns > delta_next) { + if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) { /* * The tick is not going to be stopped and the target * residency of the state to be returned is not within @@ -411,7 +416,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, continue; idx = i; - if (drv->states[i].target_residency_ns <= delta_next) + if (drv->states[i].target_residency_ns <= delta_tick) break; } } -- Gitee From 018622d0c830562d885d16209a588d34d643271b Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 7 Apr 2021 09:10:28 +0300 Subject: [PATCH 1147/2161] intel_idle: add Iclelake-D support commit 22141d5f411895bb1b0df2a6b05f702e11e63918 upstream. This patch adds Icelake Xeon D support to the intel_idle driver. Since Icelake D and Icelake SP C-state characteristics the same, we use Icelake SP C-states table for Icelake D as well. Signed-off-by: Artem Bityutskiy Acked-by: Chen Yu Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 5698df6d208b..256a8751d5b1 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1154,6 +1154,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { INTEL_CPU_FAM6(KABYLAKE, idle_cpu_skl), INTEL_CPU_FAM6(SKYLAKE_X, idle_cpu_skx), INTEL_CPU_FAM6(ICELAKE_X, idle_cpu_icx), + INTEL_CPU_FAM6(ICELAKE_D, idle_cpu_icx), INTEL_CPU_FAM6(XEON_PHI_KNL, idle_cpu_knl), INTEL_CPU_FAM6(XEON_PHI_KNM, idle_cpu_knl), INTEL_CPU_FAM6(ATOM_GOLDMONT, idle_cpu_bxt), -- Gitee From 1b03f3533640ac67af1a108fe7f4433b43443b2d Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Fri, 28 May 2021 11:20:54 +0800 Subject: [PATCH 1148/2161] intel_idle: Adjust the SKX C6 parameters if PC6 is disabled commit 64233338499126c5c31e07165735ab5441c7e45a upstream. Because cpuidle assumes worst-case C-state parameters, PC6 parameters are used for describing C6, which is worst-case for requesting CC6. When PC6 is enabled, this is appropriate. But if PC6 is disabled in the BIOS, the exit latency and target residency should be adjusted accordingly. Exit latency: Previously the C6 exit latency was measured as the PC6 exit latency. With PC6 disabled, the C6 exit latency should be the one of CC6. Target residency: With PC6 disabled, the idle duration within [CC6, PC6) would make the idle governor choose C1E over C6. This would cause low energy-efficiency. We should lower the bar to request C6 when PC6 is disabled. To fill this gap, check if PC6 is disabled in the BIOS in the MSR_PKG_CST_CONFIG_CONTROL(0xe2) register. If so, use the CC6 exit latency for C6 and set target_residency to 3 times of the new exit latency. [This is consistent with how intel_idle driver uses _CST to calculate the target_residency.] As a result, the OS would be more likely to choose C6 over C1E when PC6 is disabled, which is reasonable, because if C6 is enabled, it implies that the user cares about energy, so choosing C6 more frequently makes sense. The new CC6 exit latency of 92us was measured with wult[1] on SKX via NIC wakeup as the 99.99th percentile. Also CLX and CPX both have the same CPU model number as SkX, but their CC6 exit latencies are similar to the SKX one, 96us and 89us respectively, so reuse the SKX value for them. There is a concern that it might be better to use a more generic approach instead of optimizing every platform. However, if the required code complexity and different PC6 bit interpretation on different platforms are taken into account, tuning the code per platform seems to be an acceptable tradeoff. Link: https://intel.github.io/wult/ # [1] Suggested-by: Len Brown Signed-off-by: Chen Yu Reviewed-by: Artem Bityutskiy [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 256a8751d5b1..a0c243465d2f 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1485,6 +1485,36 @@ static void __init sklh_idle_state_table_update(void) skl_cstates[6].flags |= CPUIDLE_FLAG_UNUSABLE; /* C9-SKL */ } +/** + * skx_idle_state_table_update - Adjust the Sky Lake/Cascade Lake + * idle states table. 
+ */ +static void __init skx_idle_state_table_update(void) +{ + unsigned long long msr; + + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr); + + /* + * 000b: C0/C1 (no package C-state support) + * 001b: C2 + * 010b: C6 (non-retention) + * 011b: C6 (retention) + * 111b: No Package C state limits. + */ + if ((msr & 0x7) < 2) { + /* + * Uses the CC6 + PC0 latency and 3 times of + * latency for target_residency if the PC6 + * is disabled in BIOS. This is consistent + * with how intel_idle driver uses _CST + * to set the target_residency. + */ + skx_cstates[2].exit_latency = 92; + skx_cstates[2].target_residency = 276; + } +} + static bool __init intel_idle_verify_cstate(unsigned int mwait_hint) { unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1; @@ -1516,6 +1546,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) case INTEL_FAM6_SKYLAKE: sklh_idle_state_table_update(); break; + case INTEL_FAM6_SKYLAKE_X: + skx_idle_state_table_update(); + break; } for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { -- Gitee From 1f88f79e6faa29b3f7b6fbe3701b6e6edca0c106 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 17 Sep 2021 10:20:22 +0300 Subject: [PATCH 1149/2161] intel_idle: enable interrupts before C1 on Xeons commit c227233ad64c77e57db738ab0e46439db71822a3 upstream. Enable local interrupts before requesting C1 on the last two generations of Intel Xeon platforms: Sky Lake, Cascade Lake, Cooper Lake, Ice Lake. This decreases average C1 interrupt latency by about 5-10%, as measured with the 'wult' tool. The '->enter()' function of the driver enters C-states with local interrupts disabled by executing the 'monitor' and 'mwait' pair of instructions. If an interrupt happens, the CPU exits the C-state and continues executing instructions after 'mwait'. It does not jump to the interrupt handler, because local interrupts are disabled. The cpuidle subsystem enables interrupts a bit later, after doing some housekeeping. With this patch, we enable local interrupts before requesting C1. In this case, if the CPU wakes up because of an interrupt, it will jump to the interrupt handler right away. The cpuidle housekeeping will be done after the pending interrupt(s) are handled. Enabling interrupts before entering a C-state has measurable impact for faster C-states, like C1. Deeper, but slower C-states like C6 do not really benefit from this sort of change, because their latency is a lot higher comparing to the delay added by cpuidle housekeeping. This change was also tested with cyclictest and dbench. In case of Ice Lake, the average cyclictest latency decreased by 5.1%, and the average 'dbench' throughput increased by about 0.8%. Both tests were run for 4 hours with only C1 enabled (all other idle states, including 'POLL', were disabled). CPU frequency was pinned to HFM, and uncore frequency was pinned to the maximum value. The other platforms had similar single-digit percentage improvements. It is worth noting that this patch affects 'cpuidle' statistics a tiny bit. Before this patch, C1 residency did not include the interrupt handling time, but with this patch, it will include it. This is similar to what happens in case of the 'POLL' state, which also runs with interrupts enabled. Suggested-by: Len Brown Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index a0c243465d2f..c5eb07150db4 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -88,6 +88,12 @@ static struct cpuidle_state *cpuidle_state_table __initdata; static unsigned int mwait_substates __initdata; +/* + * Enable interrupts before entering the C-state. On some platforms and for + * some C-states, this may measurably decrease interrupt latency. + */ +#define CPUIDLE_FLAG_IRQ_ENABLE BIT(14) + /* * Enable this state by default even if the ACPI _CST does not list it. */ @@ -127,6 +133,9 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, unsigned long eax = flg2MWAIT(state->flags); unsigned long ecx = 1; /* break on interrupt flag */ + if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) + local_irq_enable(); + mwait_idle_with_hints(eax, ecx); return index; @@ -696,7 +705,7 @@ static struct cpuidle_state skx_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", - .flags = MWAIT2flg(0x00), + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE, .exit_latency = 2, .target_residency = 2, .enter = &intel_idle, @@ -725,7 +734,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", - .flags = MWAIT2flg(0x00), + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE, .exit_latency = 1, .target_residency = 1, .enter = &intel_idle, -- Gitee From cfeca695297007569b559567af14d2058400df88 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 2 Mar 2022 10:15:58 +0200 Subject: [PATCH 1150/2161] intel_idle: add SPR support commit 9edf3c0ffef0ec1bed8300315852b5c6a0997130 upstream. Add Sapphire Rapids Xeon support. Up until very recently, the C1 and C1E C-states were independent, but this has changed in some new chips, including Sapphire Rapids Xeon (SPR). In these chips the C1 and C1E states cannot be enabled at the same time. The "C1E promotion" bit in 'MSR_IA32_POWER_CTL' also has its semantics changed a bit. Here are the C1, C1E, and "C1E promotion" bit rules on Xeons before SPR. 1. If C1E promotion bit is disabled. a. C1 requests end up with C1 C-state. b. C1E requests end up with C1E C-state. 2. If C1E promotion bit is enabled. a. C1 requests end up with C1E C-state. b. C1E requests end up with C1E C-state. Here are the C1, C1E, and "C1E promotion" bit rules on Sapphire Rapids Xeon. 1. If C1E promotion bit is disabled. a. C1 requests end up with C1 C-state. b. C1E requests end up with C1 C-state. 2. If C1E promotion bit is enabled. a. C1 requests end up with C1E C-state. b. C1E requests end up with C1E C-state. Before SPR Xeon, the 'intel_idle' driver was disabling C1E promotion and was exposing C1 and C1E as independent C-states. But on SPR, C1 and C1E cannot be enabled at the same time. This patch adds both C1 and C1E states. However, C1E is marked as with the "CPUIDLE_FLAG_UNUSABLE" flag, which means that in won't be registered by default. The C1E promotion bit will be cleared, which means that by default only C1 and C6 will be registered on SPR. The next patch will add an option for enabling C1E and disabling C1 on SPR. Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 47 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index c5eb07150db4..df854c0bc6cd 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -759,6 +759,46 @@ static struct cpuidle_state icx_cstates[] __initdata = { .enter = NULL } }; +/* + * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice + * versa. On SPR C1E is enabled only if "C1E promotion" bit is set in + * MSR_IA32_POWER_CTL. But in this case there effectively no C1, because C1 + * requests are promoted to C1E. If the "C1E promotion" bit is cleared, then + * both C1 and C1E requests end up with C1, so there is effectively no C1E. + * + * By default we enable C1 and disable C1E by marking it with + * 'CPUIDLE_FLAG_UNUSABLE'. + */ +static struct cpuidle_state spr_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE | \ + CPUIDLE_FLAG_UNUSABLE, + .exit_latency = 2, + .target_residency = 4, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 290, + .target_residency = 800, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + static struct cpuidle_state atom_cstates[] __initdata = { { .name = "C1E", @@ -1102,6 +1142,12 @@ static const struct idle_cpu idle_cpu_icx __initconst = { .use_acpi = true, }; +static const struct idle_cpu idle_cpu_spr __initconst = { + .state_table = spr_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct idle_cpu idle_cpu_avn __initconst = { .state_table = avn_cstates, .disable_promotion_to_c1e = true, @@ -1164,6 +1210,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { INTEL_CPU_FAM6(SKYLAKE_X, idle_cpu_skx), INTEL_CPU_FAM6(ICELAKE_X, idle_cpu_icx), INTEL_CPU_FAM6(ICELAKE_D, idle_cpu_icx), + INTEL_CPU_FAM6(SAPPHIRERAPIDS_X, idle_cpu_spr), INTEL_CPU_FAM6(XEON_PHI_KNL, idle_cpu_knl), INTEL_CPU_FAM6(XEON_PHI_KNM, idle_cpu_knl), INTEL_CPU_FAM6(ATOM_GOLDMONT, idle_cpu_bxt), -- Gitee From 1632e0267cb6abd1b9f3f38117388d210d3a4fba Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 2 Mar 2022 10:15:59 +0200 Subject: [PATCH 1151/2161] intel_idle: add 'preferred_cstates' module argument commit da0e58c038e60e7e65d30813ebdfe91687aa8a24 upstream. On Sapphire Rapids Xeon (SPR) the C1 and C1E states are basically mutually exclusive - only one of them can be enabled. By default, 'intel_idle' driver enables C1 and disables C1E. However, some users prefer to use C1E instead of C1, because it saves more energy. This patch adds a new module parameter ('preferred_cstates') for enabling C1E and disabling C1. Here is the idea behind it. 1. This option has effect only for "mutually exclusive" C-states like C1 and C1E on SPR. 2. It does not have any effect on independent C-states, which do not require other C-states to be disabled (most states on most platforms as of today). 3. For mutually exclusive C-states, the 'intel_idle' driver always has a reasonable default, such as enabling C1 on SPR by default. 
On other platforms, the default may be different. 4. Users can override the default using the 'preferred_cstates' parameter. 5. The parameter accepts the preferred C-states bit-mask, similarly to the existing 'states_off' parameter. 6. This parameter is not limited to C1/C1E, and leaves room for supporting other mutually exclusive C-states, if they come in the future. Today 'intel_idle' can only be compiled-in, which means that on SPR, in order to disable C1 and enable C1E, users should boot with the following kernel argument: intel_idle.preferred_cstates=4 Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 46 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index df854c0bc6cd..22970be8644c 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -64,6 +64,7 @@ static struct cpuidle_driver intel_idle_driver = { /* intel_idle.max_cstate=0 disables driver */ static int max_cstate = CPUIDLE_STATE_MAX - 1; static unsigned int disabled_states_mask; +static unsigned int preferred_states_mask; static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; @@ -1401,6 +1402,8 @@ static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { } static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; } #endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */ +static void c1e_promotion_enable(void); + /** * ivt_idle_state_table_update - Tune the idle states table for Ivy Town. * @@ -1571,6 +1574,26 @@ static void __init skx_idle_state_table_update(void) } } +/** + * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table. + */ +static void __init spr_idle_state_table_update(void) +{ + /* Check if user prefers C1E over C1. */ + if (preferred_states_mask & BIT(2)) { + if (preferred_states_mask & BIT(1)) + /* Both can't be enabled, stick to the defaults. */ + return; + + spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE; + spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE; + + /* Enable C1E using the "C1E promotion" bit. */ + c1e_promotion_enable(); + disable_promotion_to_c1e = false; + } +} + static bool __init intel_idle_verify_cstate(unsigned int mwait_hint) { unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1; @@ -1605,6 +1628,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) case INTEL_FAM6_SKYLAKE_X: skx_idle_state_table_update(); break; + case INTEL_FAM6_SAPPHIRERAPIDS_X: + spr_idle_state_table_update(); + break; } for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { @@ -1677,6 +1703,15 @@ static void auto_demotion_disable(void) wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits); } +static void c1e_promotion_enable(void) +{ + unsigned long long msr_bits; + + rdmsrl(MSR_IA32_POWER_CTL, msr_bits); + msr_bits |= 0x2; + wrmsrl(MSR_IA32_POWER_CTL, msr_bits); +} + static void c1e_promotion_disable(void) { unsigned long long msr_bits; @@ -1846,3 +1881,14 @@ module_param(max_cstate, int, 0444); */ module_param_named(states_off, disabled_states_mask, uint, 0444); MODULE_PARM_DESC(states_off, "Mask of disabled idle states"); +/* + * Some platforms come with mutually exclusive C-states, so that if one is + * enabled, the other C-states must not be used. Example: C1 and C1E on + * Sapphire Rapids platform. 
This parameter allows for selecting the + * preferred C-states among the groups of mutually exclusive C-states - the + * selected C-states will be registered, the other C-states from the mutually + * exclusive group won't be registered. If the platform has no mutually + * exclusive C-states, this parameter has no effect. + */ +module_param_named(preferred_cstates, preferred_states_mask, uint, 0444); +MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states"); -- Gitee From 19fcc7b034609eb7b0011f2e3ca5d38c72c8cfc9 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 2 Mar 2022 10:16:00 +0200 Subject: [PATCH 1152/2161] intel_idle: add core C6 optimization for SPR commit 3a9cf77b60dc9839b6674943bb7c9dcd524b6294 upstream. Add a Sapphire Rapids Xeon C6 optimization, similar to what we have for Sky Lake Xeon: if package C6 is disabled, adjust C6 exit latency and target residency to match core C6 values, instead of using the default package C6 values. Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 22970be8644c..195fc3b041b2 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1579,6 +1579,8 @@ static void __init skx_idle_state_table_update(void) */ static void __init spr_idle_state_table_update(void) { + unsigned long long msr; + /* Check if user prefers C1E over C1. */ if (preferred_states_mask & BIT(2)) { if (preferred_states_mask & BIT(1)) @@ -1592,6 +1594,19 @@ static void __init spr_idle_state_table_update(void) c1e_promotion_enable(); disable_promotion_to_c1e = false; } + + /* + * By default, the C6 state assumes the worst-case scenario of package + * C6. However, if PC6 is disabled, we update the numbers to match + * core C6. + */ + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr); + + /* Limit value 2 and above allow for PC6. */ + if ((msr & 0x7) < 2) { + spr_cstates[2].exit_latency = 190; + spr_cstates[2].target_residency = 600; + } } static bool __init intel_idle_verify_cstate(unsigned int mwait_hint) -- Gitee From 22b0cab523e703ac9bd1c72f002eebbc9b80acbe Mon Sep 17 00:00:00 2001 From: Yihao Wu Date: Wed, 8 Apr 2020 18:11:36 +0800 Subject: [PATCH 1153/2161] cpuidle-haltpoll: Fix small typo commit 4902f7fcb3bcb4ce088db97bfd194401a784cc60 upstream. Fix a spelling typo in cpuidle-haltpoll.c. Signed-off-by: Yihao Wu [ rjw: Subject & changelog ] Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/cpuidle-haltpoll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c index db124bc1ca2c..fcc53215bac8 100644 --- a/drivers/cpuidle/cpuidle-haltpoll.c +++ b/drivers/cpuidle/cpuidle-haltpoll.c @@ -94,7 +94,7 @@ static void haltpoll_uninit(void) haltpoll_cpuidle_devices = NULL; } -static bool haltpool_want(void) +static bool haltpoll_want(void) { return kvm_para_has_hint(KVM_HINTS_REALTIME) || force; } @@ -110,7 +110,7 @@ static int __init haltpoll_init(void) cpuidle_poll_state_init(drv); - if (!kvm_para_available() || !haltpool_want()) + if (!kvm_para_available() || !haltpoll_want()) return -ENODEV; ret = cpuidle_register_driver(drv); -- Gitee From 6d1b218fa89616c1609ab0062246fd8d6f611f0f Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 4 Mar 2022 15:53:28 +0800 Subject: [PATCH 1154/2161] cpuidle: haltpoll: Call cpuidle_poll_state_init() later commit 659b66e98bb38dc6300dca3c9ebebeba194b575b upstream. Call cpuidle_poll_state_init() only if it is needed to avoid doing useless work. Signed-off-by: Li RongQing [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/cpuidle/cpuidle-haltpoll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c index fcc53215bac8..3a39a7f48b77 100644 --- a/drivers/cpuidle/cpuidle-haltpoll.c +++ b/drivers/cpuidle/cpuidle-haltpoll.c @@ -108,11 +108,11 @@ static int __init haltpoll_init(void) if (boot_option_idle_override != IDLE_NO_OVERRIDE) return -ENODEV; - cpuidle_poll_state_init(drv); - if (!kvm_para_available() || !haltpoll_want()) return -ENODEV; + cpuidle_poll_state_init(drv); + ret = cpuidle_register_driver(drv); if (ret < 0) return ret; -- Gitee From 441b59ed0fe5f25da871f7e2dd0695d026d0326c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 15 Mar 2022 20:35:47 +0100 Subject: [PATCH 1155/2161] cpuidle: intel_idle: Update intel_idle() kerneldoc comment commit a335b1e6bb29300d3bc6749763a4298627e594ba upstream. Commit bf9282dc26e7 ("cpuidle: Make CPUIDLE_FLAG_TLB_FLUSHED generic") moved the leave_mm() call away from intel_idle(), but it didn't update its kerneldoc comment accordingly, so do that now. Fixes: bf9282dc26e7 ("cpuidle: Make CPUIDLE_FLAG_TLB_FLUSHED generic") Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 195fc3b041b2..c11e5faa05f4 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -122,9 +122,6 @@ static unsigned int mwait_substates __initdata; * If the local APIC timer is not known to be reliable in the target idle state, * enable one-shot tick broadcasting for the target CPU before executing MWAIT. * - * Optionally call leave_mm() for the target CPU upfront to avoid wakeups due to - * flushing user TLBs. - * * Must be called under local_irq_disable(). */ static __cpuidle int intel_idle(struct cpuidle_device *dev, -- Gitee From 0be002f303752f2335d405b442a375d9eed16d23 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 15 Mar 2022 20:36:42 +0100 Subject: [PATCH 1156/2161] cpuidle: intel_idle: Drop redundant backslash at line end commit 03eb65224e5711e7a2f34b500d44866b322a249a upstream. 
Drop a redundant backslash character at the end of a line in the spr_cstates[] definition. Signed-off-by: Rafael J. Wysocki Acked-by: Artem Bityutskiy Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index c11e5faa05f4..b43a785bfc04 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -779,7 +779,7 @@ static struct cpuidle_state spr_cstates[] __initdata = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE | \ + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE | CPUIDLE_FLAG_UNUSABLE, .exit_latency = 2, .target_residency = 4, -- Gitee From 68ebddbd561e71c62e55fe9d055d930f0ca04e74 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Fri, 13 Dec 2019 19:44:34 +0800 Subject: [PATCH 1157/2161] PCI/AER: Log which device prevents error recovery commit 01daacfb9035e5b86d43a01f11a0614648f306c1 upstream. PCI error recovery will fail if any device under the Root Port doesn't have an error_detected callback. Currently only the failure result is printed, which is not enough to identify the driver that lacks the callback. Log a message to identify the device with no error_detected callback. [bhelgaas: tweak log message] Link: https://lore.kernel.org/r/1576237474-32021-1-git-send-email-yangyicong@hisilicon.com Signed-off-by: Yicong Yang Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pcie/err.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 82750614fc91..d49c94a9d4b0 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -61,10 +61,12 @@ static int report_error_detected(struct pci_dev *dev, * error callbacks of "any" device in the subtree, and will * exit in the disconnected error state. */ - if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) + if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) { vote = PCI_ERS_RESULT_NO_AER_DRIVER; - else + pci_info(dev, "AER: Can't recover (no error_detected callback)\n"); + } else { vote = PCI_ERS_RESULT_NONE; + } } else { err_handler = dev->driver->err_handler; vote = err_handler->error_detected(dev, state); -- Gitee From cd3c204120c33bbd550bbc656101067d4339465a Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Dec 2019 16:46:05 -0600 Subject: [PATCH 1158/2161] PCI/AER: Factor message prefixes with dev_fmt() commit 8d077c3ce0109c406c265cafc334258caee47e6d upstream. Define dev_fmt() with the common prefix of log messages so we don't have to repeat it in every printk. No functional change intended. 
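For illustration only (this sketch is not part of the patch, and the "demo" names are hypothetical): recovery can only consult a driver that registers struct pci_error_handlers with an error_detected() callback, roughly like:

    #include <linux/pci.h>

    /* A driver opts into PCIe error recovery by supplying error handlers;
     * without error_detected(), recovery of the subtree fails as described
     * above.
     */
    static pci_ers_result_t demo_error_detected(struct pci_dev *pdev,
                                                enum pci_channel_state state)
    {
            if (state == pci_channel_io_frozen)
                    return PCI_ERS_RESULT_NEED_RESET;
            return PCI_ERS_RESULT_CAN_RECOVER;
    }

    static const struct pci_error_handlers demo_err_handler = {
            .error_detected = demo_error_detected,
    };

    static struct pci_driver demo_driver = {
            .name        = "demo",
            /* .probe and .id_table omitted from this sketch */
            .err_handler = &demo_err_handler,
    };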
Link: https://lore.kernel.org/r/20191213225709.GA213811@google.com Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pcie/err.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index d49c94a9d4b0..633c2a6ff15d 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -10,6 +10,8 @@ * Zhang Yanmin (yanmin.zhang@intel.com) */ +#define dev_fmt(fmt) "AER: " fmt + #include #include #include @@ -63,7 +65,7 @@ static int report_error_detected(struct pci_dev *dev, */ if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) { vote = PCI_ERS_RESULT_NO_AER_DRIVER; - pci_info(dev, "AER: Can't recover (no error_detected callback)\n"); + pci_info(dev, "can't recover (no error_detected callback)\n"); } else { vote = PCI_ERS_RESULT_NONE; } @@ -200,12 +202,12 @@ void pcie_do_recovery(struct pci_dev *dev, pci_aer_clear_device_status(dev); pci_aer_clear_nonfatal_status(dev); - pci_info(dev, "AER: Device recovery successful\n"); + pci_info(dev, "device recovery successful\n"); return; failed: pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT); /* TODO: Should kernel panic here? */ - pci_info(dev, "AER: Device recovery failed\n"); + pci_info(dev, "device recovery failed\n"); } -- Gitee From 46b62e5738702729952a4d965a14ec30d976dd7c Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 28 Apr 2022 15:45:02 +0800 Subject: [PATCH 1159/2161] PCI/ERR: Combine pci_channel_io_frozen cases commit b5dfbeacf74865a8d62a4f70f501cdc61510f8e0 upstream. pcie_do_recovery() had two "if (state == pci_channel_io_frozen)" cases right after each other. Combine them to make this easier to read. No functional change intended. Link: https://lore.kernel.org/r/20200317170654.GA23125@infradead.org [bhelgaas: split from https://lore.kernel.org/r/a255fcb3a3fdebcd90f84e08b555f1786eb8eba2.1585000084.git.sathyanarayanan.kuppuswamy@linux.intel.com] Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pcie/err.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 633c2a6ff15d..42fe70009787 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -173,9 +173,6 @@ void pcie_do_recovery(struct pci_dev *dev, } else { pci_walk_bus(bus, report_normal_detected, &status); } - if (state == pci_channel_io_frozen && - reset_link(dev) != PCI_ERS_RESULT_RECOVERED) - goto failed; if (status == PCI_ERS_RESULT_CAN_RECOVER) { status = PCI_ERS_RESULT_RECOVERED; -- Gitee From 4422706f74164f7983a8d9f266ebf9f5d90211ca Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Mon, 23 Mar 2020 17:26:03 -0700 Subject: [PATCH 1160/2161] PCI/ERR: Return status of pcie_do_recovery() commit e8e5ff2aeec19ade42f0535f4b554a3f6e1a58f7 upstream. As per the DPC Enhancements ECN [1], sec 4.5.1, table 4-4, if the OS supports Error Disconnect Recover (EDR), it must invalidate the software state associated with child devices of the port without attempting to access the child device hardware. In addition, if the OS supports DPC, it must attempt to recover the child devices if the port implements the DPC Capability. If the OS continues operation, the OS must inform the firmware of the status of the recovery operation via the _OST method. Return the result of pcie_do_recovery() so we can report it to firmware via _OST. 
[1] Downstream Port Containment Related Enhancements ECN, Jan 28, 2019, affecting PCI Firmware Specification, Rev. 3.2 https://members.pcisig.com/wg/PCI-SIG/document/12888 Link: https://lore.kernel.org/r/eb60ec89448769349c6722954ffbf2de163155b5.1585000084.git.sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pci.h | 5 +++-- drivers/pci/pcie/err.c | 10 ++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index b5176c7579e5..1095e4096c0d 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -555,8 +555,9 @@ static inline int pci_dev_specific_disable_acs_redir(struct pci_dev *dev) #endif /* PCI error reporting and recovery */ -void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state, - pci_ers_result_t (*reset_link)(struct pci_dev *pdev)); +pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, + enum pci_channel_state state, + pci_ers_result_t (*reset_link)(struct pci_dev *pdev)); bool pcie_wait_for_link(struct pci_dev *pdev, bool active); #ifdef CONFIG_PCIEASPM diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 42fe70009787..14bb8f54723e 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -146,9 +146,9 @@ static int report_resume(struct pci_dev *dev, void *data) return 0; } -void pcie_do_recovery(struct pci_dev *dev, - enum pci_channel_state state, - pci_ers_result_t (*reset_link)(struct pci_dev *pdev)) +pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, + enum pci_channel_state state, + pci_ers_result_t (*reset_link)(struct pci_dev *pdev)) { pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER; struct pci_bus *bus; @@ -200,11 +200,13 @@ void pcie_do_recovery(struct pci_dev *dev, pci_aer_clear_device_status(dev); pci_aer_clear_nonfatal_status(dev); pci_info(dev, "device recovery successful\n"); - return; + return status; failed: pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT); /* TODO: Should kernel panic here? */ pci_info(dev, "device recovery failed\n"); + + return status; } -- Gitee From eb1689e97efc01cb6ef19ebbab06dd721b3834b6 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 28 Apr 2022 16:16:31 +0800 Subject: [PATCH 1161/2161] PCI/AER: Add pci_aer_raw_clear_status() to unconditionally clear Error Status commit 20e15e673b05a045fdbe534d40edf948e1b0b1af upstream. Per the SFI _OSC and DPC Updates ECN [1] implementation note flowchart, the OS seems to be expected to clear AER status even if it doesn't have ownership of the AER capability. Unlike the DPC capability, where a DPC ECN [2] specifies a window when the OS is allowed to access DPC registers even if it doesn't have ownership, there is no clear model for AER. Add pci_aer_raw_clear_status() to clear the AER error status registers unconditionally. This is intended for use only by the EDR path (see [2]). [1] System Firmware Intermediary (SFI) _OSC and DPC Updates ECN, Feb 24, 2020, affecting PCI Firmware Specification, Rev. 3.2 https://members.pcisig.com/wg/PCI-SIG/document/14076 [2] Downstream Port Containment Related Enhancements ECN, Jan 28, 2019, affecting PCI Firmware Specification, Rev. 
3.2 https://members.pcisig.com/wg/PCI-SIG/document/12888 [bhelgaas: changelog] Link: https://lore.kernel.org/r/c19ad28f3633cce67448609e89a75635da0da07d.1585000084.git.sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pci.h | 2 ++ drivers/pci/pcie/aer.c | 22 ++++++++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 1095e4096c0d..e855e24ad79b 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -682,6 +682,7 @@ extern const struct attribute_group aer_stats_attr_group; void pci_aer_clear_fatal_status(struct pci_dev *dev); void pci_aer_clear_device_status(struct pci_dev *dev); int pci_aer_clear_status(struct pci_dev *dev); +int pci_aer_raw_clear_status(struct pci_dev *dev); #else static inline void pci_no_aer(void) { } static inline void pci_aer_init(struct pci_dev *d) { } @@ -689,6 +690,7 @@ static inline void pci_aer_exit(struct pci_dev *d) { } static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } static inline void pci_aer_clear_device_status(struct pci_dev *dev) { } static inline int pci_aer_clear_status(struct pci_dev *dev) { return -EINVAL; } +static inline int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; } #endif #ifdef CONFIG_ACPI diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index b37ca1f96b5c..f4274d301235 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -420,7 +420,16 @@ void pci_aer_clear_fatal_status(struct pci_dev *dev) pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status); } -int pci_aer_clear_status(struct pci_dev *dev) +/** + * pci_aer_raw_clear_status - Clear AER error registers. + * @dev: the PCI device + * + * Clearing AER error status registers unconditionally, regardless of + * whether they're owned by firmware or the OS. + * + * Returns 0 on success, or negative on failure. + */ +int pci_aer_raw_clear_status(struct pci_dev *dev) { int pos; u32 status; @@ -433,9 +442,6 @@ int pci_aer_clear_status(struct pci_dev *dev) if (!pos) return -EIO; - if (pcie_aer_get_firmware_first(dev)) - return -EIO; - port_type = pci_pcie_type(dev); if (port_type == PCI_EXP_TYPE_ROOT_PORT) { pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, &status); @@ -451,6 +457,14 @@ int pci_aer_clear_status(struct pci_dev *dev) return 0; } +int pci_aer_clear_status(struct pci_dev *dev) +{ + if (pcie_aer_get_firmware_first(dev)) + return -EIO; + + return pci_aer_raw_clear_status(dev); +} + void pci_save_aer_state(struct pci_dev *dev) { struct pci_cap_saved_state *save_state; -- Gitee From 70996e1cca062513b7d2f9e6d23d21b8c8ab0077 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 15 Apr 2020 17:38:32 -0700 Subject: [PATCH 1162/2161] PCI/EDR: Log only ACPI_NOTIFY_DISCONNECT_RECOVER events commit af03958da0678c3162ae534829cabf9f67f0d950 upstream. Previously we logged *all* ACPI SYSTEM-level events, which may include lots of non-EDR events. Move the message so we only log those related to EDR. 
Link: https://lore.kernel.org/r/01afb4e01efbe455de0c445bef6cf3ffc59340d2.1586996350.git.sathyanarayanan.kuppuswamy@linux.intel.com [bhelgaas: drop the pci_dbg() of all events since ACPI can log those already] Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pcie/edr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pcie/edr.c b/drivers/pci/pcie/edr.c index 594622a6cb16..a6b9b479b97a 100644 --- a/drivers/pci/pcie/edr.c +++ b/drivers/pci/pcie/edr.c @@ -148,11 +148,11 @@ static void edr_handle_event(acpi_handle handle, u32 event, void *data) pci_ers_result_t estate = PCI_ERS_RESULT_DISCONNECT; u16 status; - pci_info(pdev, "ACPI event %#x received\n", event); - if (event != ACPI_NOTIFY_DISCONNECT_RECOVER) return; + pci_info(pdev, "EDR event received\n"); + /* Locate the port which issued EDR event */ edev = acpi_dpc_port_get(pdev); if (!edev) { -- Gitee From 86fa519a6455cf146f72ac76a8d5efbd57850485 Mon Sep 17 00:00:00 2001 From: Alexandru Gagniuc Date: Mon, 27 Apr 2020 18:25:13 -0500 Subject: [PATCH 1163/2161] PCI/AER: Use only _OSC to determine AER ownership commit c100beb9ccfb98e2474586a4006483cbf770c823 upstream. Per the PCI Firmware spec, r3.2, sec 4.5.1, the OS can request control of AER via bit 3 of the _OSC Control Field. In the returned value of the Control Field: The firmware sets [bit 3] to 1 to grant control over PCI Express Advanced Error Reporting. ... after control is transferred to the operating system, firmware must not modify the Advanced Error Reporting Capability. If control of this feature was requested and denied or was not requested, firmware returns this bit set to 0. Previously the pci_root driver looked at the HEST FIRMWARE_FIRST bit to determine whether to request ownership of the AER Capability. This was based on ACPI spec v6.3, sec 18.3.2.4, and similar sections, which say things like: Bit [0] - FIRMWARE_FIRST: If set, indicates that system firmware will handle errors from this source first. Bit [1] - GLOBAL: If set, indicates that the settings contained in this structure apply globally to all PCI Express Devices. These ACPI references don't say anything about ownership of the AER Capability. Remove use of the FIRMWARE_FIRST bit and rely only on the _OSC bit to determine whether we have control of the AER Capability. 
Link: https://lore.kernel.org/r/20181115231605.24352-1-mr.nuke.me@gmail.com/ v1 Link: https://lore.kernel.org/r/20190326172343.28946-1-mr.nuke.me@gmail.com/ v2 Link: https://lore.kernel.org/r/67af2931705bed9a588b5a39d369cb70b9942190.1587925636.git.sathyanarayanan.kuppuswamy@linux.intel.com [bhelgaas: commit log, note: Alex posted this identical patch 18 months ago, and I failed to apply it then, so I made him the author, added links to his postings, and added his Signed-off-by] Signed-off-by: Alexandru Gagniuc Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Reviewed-by: Jon Derrick Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/pci_root.c | 9 ++------- drivers/pci/pcie/aer.c | 26 +------------------------- include/linux/pci-acpi.h | 6 ------ 3 files changed, 3 insertions(+), 38 deletions(-) diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 0cb9df5462c3..57b7ba6d9ad1 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -483,13 +483,8 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, if (IS_ENABLED(CONFIG_HOTPLUG_PCI_SHPC)) control |= OSC_PCI_SHPC_NATIVE_HP_CONTROL; - if (pci_aer_available()) { - if (aer_acpi_firmware_first()) - dev_info(&device->dev, - "PCIe AER handled by firmware\n"); - else - control |= OSC_PCI_EXPRESS_AER_CONTROL; - } + if (pci_aer_available()) + control |= OSC_PCI_EXPRESS_AER_CONTROL; /* * Per the Downstream Port Containment Related Enhancements ECN to diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index f4274d301235..efc26773cc6d 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -318,30 +318,6 @@ int pcie_aer_get_firmware_first(struct pci_dev *dev) aer_set_firmware_first(dev); return dev->__aer_firmware_first; } - -static bool aer_firmware_first; - -/** - * aer_acpi_firmware_first - Check if APEI should control AER. - */ -bool aer_acpi_firmware_first(void) -{ - static bool parsed = false; - struct aer_hest_parse_info info = { - .pci_dev = NULL, /* Check all PCIe devices */ - .firmware_first = 0, - }; - - if (pcie_ports_native) - return false; - - if (!parsed) { - apei_hest_parse(aer_hest_parse, &info); - aer_firmware_first = info.firmware_first; - parsed = true; - } - return aer_firmware_first; -} #endif #define PCI_EXP_AER_FLAGS (PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE | \ @@ -1523,7 +1499,7 @@ static struct pcie_port_service_driver aerdriver = { */ int __init pcie_aer_init(void) { - if (!pci_aer_available() || aer_acpi_firmware_first()) + if (!pci_aer_available()) return -ENXIO; return pcie_port_service_register(&aerdriver); } diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 2d155bfb8fbf..11c98875538a 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -125,10 +125,4 @@ static inline void acpi_pci_add_bus(struct pci_bus *bus) { } static inline void acpi_pci_remove_bus(struct pci_bus *bus) { } #endif /* CONFIG_ACPI */ -#ifdef CONFIG_ACPI_APEI -extern bool aer_acpi_firmware_first(void); -#else -static inline bool aer_acpi_firmware_first(void) { return false; } -#endif - #endif /* _PCI_ACPI_H_ */ -- Gitee From 1a3061e05f819f5a69632096f7266814568bd48b Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 26 May 2020 16:18:29 -0700 Subject: [PATCH 1164/2161] PCI/AER: Remove HEST/FIRMWARE_FIRST parsing for AER ownership commit 708b2000362476c9c7a3571c0cc774dffb91836a upstream. 
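To illustrate the resulting ownership model, here is a sketch that assumes code living inside drivers/pci (pcie_ports_native is declared in the private portdrv.h header); the logic mirrors the pcie_aer_is_native() helper introduced later in this series:

    #include <linux/pci.h>

    /* The OS may touch the AER capability only if the device has one and
     * either the user forced native handling (pcie_ports=native) or
     * firmware granted AER control via _OSC (host->native_aer).
     */
    static bool demo_aer_is_native(struct pci_dev *dev)
    {
            struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);

            if (!dev->aer_cap)
                    return false;

            return pcie_ports_native || host->native_aer;
    }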
Commit c100beb9ccfb ("PCI/AER: Use only _OSC to determine AER ownership") removed the use of HEST in determining AER ownership, but the AER driver still used HEST to verify AER ownership in some of its APIs. Per the ACPI spec v6.3, sec 18.3.2.4, some HEST table entries contain a FIRMWARE_FIRST bit, but that bit does not tell us anything about ownership of the AER capability. Remove parsing of HEST to look for FIRMWARE_FIRST. Add pcie_aer_is_native() for the places that need to know whether the OS owns the AER capability. [bhelgaas: commit log, reorder patch, remove unused __aer_firmware_first] Link: https://lore.kernel.org/r/9a37f53a4e6ff4942ff8e18dbb20b00e16c47341.1590534843.git.sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pcie/aer.c | 118 ++++--------------------------------- drivers/pci/pcie/dpc.c | 2 +- drivers/pci/pcie/portdrv.h | 13 +--- include/linux/pci.h | 2 - 4 files changed, 14 insertions(+), 121 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index efc26773cc6d..803273ba30db 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -217,118 +217,22 @@ void pcie_ecrc_get_policy(char *str) } #endif /* CONFIG_PCIE_ECRC */ -#ifdef CONFIG_ACPI_APEI -static inline int hest_match_pci(struct acpi_hest_aer_common *p, - struct pci_dev *pci) -{ - return ACPI_HEST_SEGMENT(p->bus) == pci_domain_nr(pci->bus) && - ACPI_HEST_BUS(p->bus) == pci->bus->number && - p->device == PCI_SLOT(pci->devfn) && - p->function == PCI_FUNC(pci->devfn); -} - -static inline bool hest_match_type(struct acpi_hest_header *hest_hdr, - struct pci_dev *dev) -{ - u16 hest_type = hest_hdr->type; - u8 pcie_type = pci_pcie_type(dev); - - if ((hest_type == ACPI_HEST_TYPE_AER_ROOT_PORT && - pcie_type == PCI_EXP_TYPE_ROOT_PORT) || - (hest_type == ACPI_HEST_TYPE_AER_ENDPOINT && - pcie_type == PCI_EXP_TYPE_ENDPOINT) || - (hest_type == ACPI_HEST_TYPE_AER_BRIDGE && - (dev->class >> 16) == PCI_BASE_CLASS_BRIDGE)) - return true; - return false; -} - -struct aer_hest_parse_info { - struct pci_dev *pci_dev; - int firmware_first; -}; - -static int hest_source_is_pcie_aer(struct acpi_hest_header *hest_hdr) -{ - if (hest_hdr->type == ACPI_HEST_TYPE_AER_ROOT_PORT || - hest_hdr->type == ACPI_HEST_TYPE_AER_ENDPOINT || - hest_hdr->type == ACPI_HEST_TYPE_AER_BRIDGE) - return 1; - return 0; -} - -static int aer_hest_parse(struct acpi_hest_header *hest_hdr, void *data) -{ - struct aer_hest_parse_info *info = data; - struct acpi_hest_aer_common *p; - int ff; - - if (!hest_source_is_pcie_aer(hest_hdr)) - return 0; - - p = (struct acpi_hest_aer_common *)(hest_hdr + 1); - ff = !!(p->flags & ACPI_HEST_FIRMWARE_FIRST); - - /* - * If no specific device is supplied, determine whether - * FIRMWARE_FIRST is set for *any* PCIe device. 
- */ - if (!info->pci_dev) { - info->firmware_first |= ff; - return 0; - } - - /* Otherwise, check the specific device */ - if (p->flags & ACPI_HEST_GLOBAL) { - if (hest_match_type(hest_hdr, info->pci_dev)) - info->firmware_first = ff; - } else - if (hest_match_pci(p, info->pci_dev)) - info->firmware_first = ff; - - return 0; -} - -static void aer_set_firmware_first(struct pci_dev *pci_dev) -{ - int rc; - struct aer_hest_parse_info info = { - .pci_dev = pci_dev, - .firmware_first = 0, - }; - - rc = apei_hest_parse(aer_hest_parse, &info); - - if (rc) - pci_dev->__aer_firmware_first = 0; - else - pci_dev->__aer_firmware_first = info.firmware_first; - pci_dev->__aer_firmware_first_valid = 1; -} +#define PCI_EXP_AER_FLAGS (PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE | \ + PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE) -int pcie_aer_get_firmware_first(struct pci_dev *dev) +int pcie_aer_is_native(struct pci_dev *dev) { - if (!pci_is_pcie(dev)) - return 0; + struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); - if (pcie_ports_native) + if (!dev->aer_cap) return 0; - if (!dev->__aer_firmware_first_valid) - aer_set_firmware_first(dev); - return dev->__aer_firmware_first; + return pcie_ports_native || host->native_aer; } -#endif - -#define PCI_EXP_AER_FLAGS (PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE | \ - PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE) int pci_enable_pcie_error_reporting(struct pci_dev *dev) { - if (pcie_aer_get_firmware_first(dev)) - return -EIO; - - if (!dev->aer_cap) + if (!pcie_aer_is_native(dev)) return -EIO; return pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_AER_FLAGS); @@ -337,7 +241,7 @@ EXPORT_SYMBOL_GPL(pci_enable_pcie_error_reporting); int pci_disable_pcie_error_reporting(struct pci_dev *dev) { - if (pcie_aer_get_firmware_first(dev)) + if (!pcie_aer_is_native(dev)) return -EIO; return pcie_capability_clear_word(dev, PCI_EXP_DEVCTL, @@ -362,7 +266,7 @@ int pci_aer_clear_nonfatal_status(struct pci_dev *dev) if (!pos) return -EIO; - if (pcie_aer_get_firmware_first(dev)) + if (!pcie_aer_is_native(dev)) return -EIO; /* Clear status bits for ERR_NONFATAL errors only */ @@ -385,7 +289,7 @@ void pci_aer_clear_fatal_status(struct pci_dev *dev) if (!pos) return; - if (pcie_aer_get_firmware_first(dev)) + if (!pcie_aer_is_native(dev)) return; /* Clear status bits for ERR_FATAL errors only */ @@ -435,7 +339,7 @@ int pci_aer_raw_clear_status(struct pci_dev *dev) int pci_aer_clear_status(struct pci_dev *dev) { - if (pcie_aer_get_firmware_first(dev)) + if (!pcie_aer_is_native(dev)) return -EIO; return pci_aer_raw_clear_status(dev); diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index 5081e4c78795..8ce228265218 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -349,7 +349,7 @@ static int dpc_probe(struct pcie_device *dev) int status; u16 ctl, cap; - if (pcie_aer_get_firmware_first(pdev) && !pcie_ports_dpc_native) + if (!pcie_aer_is_native(pdev) && !pcie_ports_dpc_native) return -ENOTSUPP; status = devm_request_threaded_irq(device, dev->irq, dpc_irq, diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 64b5e081cdb2..af7cf237432a 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -29,8 +29,10 @@ extern bool pcie_ports_dpc_native; #ifdef CONFIG_PCIEAER int pcie_aer_init(void); +int pcie_aer_is_native(struct pci_dev *dev); #else static inline int pcie_aer_init(void) { return 0; } +static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; } #endif #ifdef CONFIG_HOTPLUG_PCI_PCIE @@ -147,16 
+149,5 @@ static inline bool pcie_pme_no_msi(void) { return false; } static inline void pcie_pme_interrupt_enable(struct pci_dev *dev, bool en) {} #endif /* !CONFIG_PCIE_PME */ -#ifdef CONFIG_ACPI_APEI -int pcie_aer_get_firmware_first(struct pci_dev *pci_dev); -#else -static inline int pcie_aer_get_firmware_first(struct pci_dev *pci_dev) -{ - if (pci_dev->__aer_firmware_first_valid) - return pci_dev->__aer_firmware_first; - return 0; -} -#endif - struct device *pcie_port_find_device(struct pci_dev *dev, u32 service); #endif /* _PORTDRV_H_ */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 4a8c82dc0e70..b808b12d0761 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -421,8 +421,6 @@ struct pci_dev { * itself internal but devices downstream from it are external. */ unsigned int external_facing:1; - unsigned int __aer_firmware_first_valid:1; - unsigned int __aer_firmware_first:1; unsigned int broken_intx_masking:1; /* INTx masking can't be used */ unsigned int io_window_1k:1; /* Intel bridge 1K I/O windows */ unsigned int irq_managed:1; -- Gitee From e096cef6642bf100a881d30f5ac67a2718843393 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 16 Jul 2020 17:34:30 -0500 Subject: [PATCH 1165/2161] PCI/ERR: Rename pci_aer_clear_device_status() to pcie_clear_device_status() commit 600a5b4fc8e8f6dad098cd50e0e727cb2a16be46 upstream. pci_aer_clear_device_status() clears the error bits in the PCIe Device Status Register (PCI_EXP_DEVSTA). Every PCIe device has this register, regardless of whether it supports AER. Rename pci_aer_clear_device_status() to pcie_clear_device_status() to make clear that it is PCIe-specific but not AER-specific. Move it to drivers/pci/pci.c, again since it's not AER-specific. No functional change intended. Link: https://lore.kernel.org/r/20200717195619.766662-1-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pci.c | 8 ++++++++ drivers/pci/pci.h | 3 +-- drivers/pci/pcie/aer.c | 10 +--------- drivers/pci/pcie/err.c | 2 +- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b90c7319dbc5..3917d15e744a 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1970,6 +1970,14 @@ int pci_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) } EXPORT_SYMBOL_GPL(pci_set_pcie_reset_state); +void pcie_clear_device_status(struct pci_dev *dev) +{ + u16 sta; + + pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta); + pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta); +} + /** * pcie_clear_root_pme_status - Clear root port PME interrupt status. * @dev: PCIe root port or event collector. 
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index e855e24ad79b..99ec1ebfc27b 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -92,6 +92,7 @@ void pci_refresh_power_state(struct pci_dev *dev); void pci_power_up(struct pci_dev *dev); void pci_disable_enabled_device(struct pci_dev *dev); int pci_finish_runtime_suspend(struct pci_dev *dev); +void pcie_clear_device_status(struct pci_dev *dev); void pcie_clear_root_pme_status(struct pci_dev *dev); bool pci_check_pme_status(struct pci_dev *dev); void pci_pme_wakeup_bus(struct pci_bus *bus); @@ -680,7 +681,6 @@ void pci_aer_init(struct pci_dev *dev); void pci_aer_exit(struct pci_dev *dev); extern const struct attribute_group aer_stats_attr_group; void pci_aer_clear_fatal_status(struct pci_dev *dev); -void pci_aer_clear_device_status(struct pci_dev *dev); int pci_aer_clear_status(struct pci_dev *dev); int pci_aer_raw_clear_status(struct pci_dev *dev); #else @@ -688,7 +688,6 @@ static inline void pci_no_aer(void) { } static inline void pci_aer_init(struct pci_dev *d) { } static inline void pci_aer_exit(struct pci_dev *d) { } static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } -static inline void pci_aer_clear_device_status(struct pci_dev *dev) { } static inline int pci_aer_clear_status(struct pci_dev *dev) { return -EINVAL; } static inline int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; } #endif diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 803273ba30db..472c8974f6ab 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -249,14 +249,6 @@ int pci_disable_pcie_error_reporting(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting); -void pci_aer_clear_device_status(struct pci_dev *dev) -{ - u16 sta; - - pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta); - pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta); -} - int pci_aer_clear_nonfatal_status(struct pci_dev *dev) { int pos; @@ -946,7 +938,7 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info) if (pos) pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS, info->status); - pci_aer_clear_device_status(dev); + pcie_clear_device_status(dev); } else if (info->severity == AER_NONFATAL) pcie_do_recovery(dev, pci_channel_io_normal, aer_root_reset); else if (info->severity == AER_FATAL) diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 14bb8f54723e..24a864fdf3cd 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -197,7 +197,7 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, pci_dbg(dev, "broadcast resume message\n"); pci_walk_bus(bus, report_resume, &status); - pci_aer_clear_device_status(dev); + pcie_clear_device_status(dev); pci_aer_clear_nonfatal_status(dev); pci_info(dev, "device recovery successful\n"); return status; -- Gitee From 4404ae22d44c333f32f8355595cbb48605fb0fad Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Mon, 22 Jun 2020 19:35:23 +0800 Subject: [PATCH 1166/2161] PCI/ERR: Clear PCIe Device Status errors only if OS owns AER commit 068c29a248b6ddbfdf7bb150b547569759620d36 upstream. pcie_clear_device_status() resets the error bits in the PCIe Device Status Register (PCI_EXP_DEVSTA). Previously we did this unconditionally, but on ACPI systems, the _OSC AER bit negotiates control of the AER capability. 
Per sec 4.5.1 of the System Firmware Intermediary _OSC and DPC Updates ECN [1], this bit also covers other error enable/status bits including the following: Correctable Error Reporting Enable Non-Fatal Error Reporting Enable Fatal Error Reporting Enable Unsupported Request Reporting Enable These bits are all in the PCIe Device Control register (the ECN omitted "Reporting", but I think that's a typo), so by implication the _OSC AER bit also applies to the error status bits in the PCIe Device Status register: Correctable Error Detected Non-Fatal Error Detected Fatal Error Detected Unsupported Request Detected Clear the PCIe Device Status error bits only when the OS controls the AER capability and related error enable/status bits. If platform firmware controls the AER capability, firmware is responsible for clearing these bits. One call path leading here is: ghes_do_proc ghes_handle_aer aer_recover_queue schedule_work(&aer_recover_work) ... aer_recover_work_func pcie_do_recovery pcie_clear_device_status [1] System Firmware Intermediary (SFI) _OSC and DPC Updates ECN, Feb 24, 2020, affecting PCI Firmware Specification, Rev. 3.2 https://members.pcisig.com/wg/PCI-SIG/document/14076 [bhelgaas: commit log, move test from pcie_clear_device_status() to callers] Link: https://lore.kernel.org/r/20200622113523.891666-1-Jonathan.Cameron@huawei.com Signed-off-by: Jonathan Cameron Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pcie/aer.c | 3 ++- drivers/pci/pcie/err.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 472c8974f6ab..bac0e0403349 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -938,7 +938,8 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info) if (pos) pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS, info->status); - pcie_clear_device_status(dev); + if (pcie_aer_is_native(dev)) + pcie_clear_device_status(dev); } else if (info->severity == AER_NONFATAL) pcie_do_recovery(dev, pci_channel_io_normal, aer_root_reset); else if (info->severity == AER_FATAL) diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 24a864fdf3cd..f0cfd3df376b 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -197,7 +197,8 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, pci_dbg(dev, "broadcast resume message\n"); pci_walk_bus(bus, report_resume, &status); - pcie_clear_device_status(dev); + if (pcie_aer_is_native(dev)) + pcie_clear_device_status(dev); pci_aer_clear_nonfatal_status(dev); pci_info(dev, "device recovery successful\n"); return status; -- Gitee From 6c1c54df88d9abf13da351b852765be552becffc Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 4 Jan 2021 15:02:58 -0800 Subject: [PATCH 1167/2161] PCI/ERR: Retain status from error notification commit 387c72cdd7fb6bef650fb078d0f6ae9682abf631 upstream. Overwriting the frozen detected status with the result of the link reset loses the NEED_RESET result that drivers are depending on for error handling to report the .slot_reset() callback. Retain this status so that subsequent error handling has the correct flow. 
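For illustration, the driver-side contract that this preserves looks roughly like the sketch below. The mydrv_* names are hypothetical and the exact callback parameter types vary by kernel version; the point is that a driver returning PCI_ERS_RESULT_NEED_RESET from .error_detected() relies on the core retaining that status so that its .slot_reset() callback is invoked.

static pci_ers_result_t mydrv_error_detected(struct pci_dev *pdev,
					     pci_channel_state_t state)
{
	/* Request a reset before recovery can proceed. */
	return PCI_ERS_RESULT_NEED_RESET;
}

static pci_ers_result_t mydrv_slot_reset(struct pci_dev *pdev)
{
	/* Re-initialize the device; reached because NEED_RESET was retained. */
	return PCI_ERS_RESULT_RECOVERED;
}

static const struct pci_error_handlers mydrv_err_handler = {
	.error_detected	= mydrv_error_detected,
	.slot_reset	= mydrv_slot_reset,
};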
Link: https://lore.kernel.org/r/20210104230300.1277180-4-kbusch@kernel.org Reported-by: Hinko Kocevar Tested-by: Hedi Berriche Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas Acked-by: Sean V Kelley Acked-by: Hedi Berriche Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pcie/err.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index f0cfd3df376b..fb86b1803cd1 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -165,8 +165,7 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, pci_dbg(dev, "broadcast error_detected message\n"); if (state == pci_channel_io_frozen) { pci_walk_bus(bus, report_frozen_detected, &status); - status = reset_link(dev); - if (status != PCI_ERS_RESULT_RECOVERED) { + if (reset_link(dev) != PCI_ERS_RESULT_RECOVERED) { pci_warn(dev, "link reset failed\n"); goto failed; } -- Gitee From ab0dc98a208b1776291267720d2e71fe968bfdcc Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 28 Apr 2022 17:34:18 +0800 Subject: [PATCH 1168/2161] config: enable CONFIG_PCIE_DPC commit opencloudos. Some pcie device maybe take PCIE ghes on SPR platform. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/configs/oc.config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/configs/oc.config b/arch/x86/configs/oc.config index 8c32b556976f..f37fc4e31fed 100644 --- a/arch/x86/configs/oc.config +++ b/arch/x86/configs/oc.config @@ -101,6 +101,8 @@ CONFIG_ACPI_HMAT=y CONFIG_ACPI_APEI=y CONFIG_ACPI_APEI_GHES=y CONFIG_ACPI_APEI_PCIEAER=y +CONFIG_PCIE_DPC=y +CONFIG_PCIE_EDR=y CONFIG_ACPI_APEI_MEMORY_FAILURE=y CONFIG_ACPI_APEI_EINJ=m CONFIG_ACPI_APEI_ERST_DEBUG=m -- Gitee From 69389ad4e3e9726c737b602437a9997f7571adfd Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 12 May 2022 13:38:34 +0800 Subject: [PATCH 1169/2161] Merge branches 'iommu/fixes', 'arm/mediatek', 'arm/smmu', 'arm/exynos', 'unisoc', 'x86/vt-d', 'x86/amd' and 'core' into next commit 49d11527e560f7b62bd740d42e01d895e1d7a606 upstream. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 20f7625ca45a..7bea7d1e647e 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -32,5 +32,4 @@ obj-$(CONFIG_S390_IOMMU) += s390-iommu.o obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o -obj-$(CONFIG_IOMMU_SVA_LIB) += iommu-sva-lib.o -obj-$(CONFIG_IOMMU_SVA_LIB) += io-pgfault.o +obj-$(CONFIG_IOMMU_SVA_LIB) += iommu-sva-lib.o io-pgfault.o -- Gitee From 3c51d826a720fb0a27c8e126cc757668103ad953 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 12 May 2022 13:39:46 +0800 Subject: [PATCH 1170/2161] iommu/sva: Rename CONFIG_IOMMU_SVA_LIB to CONFIG_IOMMU_SVA commit 7ba564722d98e3e7bc3922ad4f2885ca0336674e upstream. This CONFIG option originally only referred to the Shared Virtual Address (SVA) library. But it is now also used for non-library portions of code. Drop the "_LIB" suffix so that there is just one configuration option for all code relating to SVA. 
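The same rename applies to any code guarded by the old symbol. A minimal sketch with a hypothetical my_handle_iopf() helper (not part of this patch) shows the convention after the change, mirroring the iommu-sva-lib.h hunk below:

#ifdef CONFIG_IOMMU_SVA
int my_handle_iopf(struct iommu_fault *fault, void *cookie);
#else /* CONFIG_IOMMU_SVA */
static inline int my_handle_iopf(struct iommu_fault *fault, void *cookie)
{
	return -ENODEV;
}
#endif /* CONFIG_IOMMU_SVA */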
Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: Thomas Gleixner Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20220207230254.3342514-2-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/Kconfig | 4 ++-- drivers/iommu/Makefile | 2 +- drivers/iommu/intel/Kconfig | 2 +- drivers/iommu/iommu-sva-lib.h | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 5230bfeaa032..765b68b96ab4 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -102,8 +102,8 @@ config IOMMU_DMA select IRQ_MSI_IOMMU select NEED_SG_DMA_LENGTH -# Shared Virtual Addressing library -config IOMMU_SVA_LIB +# Shared Virtual Addressing +config IOMMU_SVA bool select IOASID diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 7bea7d1e647e..ca05f1026d95 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -32,4 +32,4 @@ obj-$(CONFIG_S390_IOMMU) += s390-iommu.o obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o -obj-$(CONFIG_IOMMU_SVA_LIB) += iommu-sva-lib.o io-pgfault.o +obj-$(CONFIG_IOMMU_SVA) += iommu-sva-lib.o io-pgfault.o diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index bb2f780edb4f..85271bf23f1d 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -47,7 +47,7 @@ config INTEL_IOMMU_SVM select PCI_PRI select MMU_NOTIFIER select IOASID - select IOMMU_SVA_LIB + select IOMMU_SVA help Shared Virtual Memory (SVM) provides a facility for devices to access DMA resources through process address space by diff --git a/drivers/iommu/iommu-sva-lib.h b/drivers/iommu/iommu-sva-lib.h index 031155010ca8..95dc3ebc1928 100644 --- a/drivers/iommu/iommu-sva-lib.h +++ b/drivers/iommu/iommu-sva-lib.h @@ -17,7 +17,7 @@ struct device; struct iommu_fault; struct iopf_queue; -#ifdef CONFIG_IOMMU_SVA_LIB +#ifdef CONFIG_IOMMU_SVA int iommu_queue_iopf(struct iommu_fault *fault, void *cookie); int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev); @@ -28,7 +28,7 @@ struct iopf_queue *iopf_queue_alloc(const char *name); void iopf_queue_free(struct iopf_queue *queue); int iopf_queue_discard_partial(struct iopf_queue *queue); -#else /* CONFIG_IOMMU_SVA_LIB */ +#else /* CONFIG_IOMMU_SVA */ static inline int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) { return -ENODEV; @@ -64,5 +64,5 @@ static inline int iopf_queue_discard_partial(struct iopf_queue *queue) { return -ENODEV; } -#endif /* CONFIG_IOMMU_SVA_LIB */ +#endif /* CONFIG_IOMMU_SVA */ #endif /* _IOMMU_SVA_LIB_H */ -- Gitee From 3730849ef57474596aa729173b6315466a347e8b Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 12 Mar 2021 21:07:15 -0800 Subject: [PATCH 1171/2161] mm/fork: clear PASID for new mm commit 82e69a121be4b1597ce758534816a8ee04c8b761 upstream. When a new mm is created, its PASID should be cleared, i.e. the PASID is initialized to its init state 0 on both ARM and X86. This patch was part of the series introducing mm->pasid, but got lost along the way [1]. It still makes sense to have it, because each address space has a different PASID. And the IOMMU code in iommu_sva_alloc_pasid() expects the pasid field of a new mm struct to be cleared. 
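For reference, the check in iommu_sva_alloc_pasid() that relies on a cleared field looks roughly like this at this point in the series (quoted from the pre-rework code that a later patch in this series removes); a stale, non-zero mm->pasid would be misread as an existing allocation:

	mutex_lock(&iommu_sva_lock);
	if (mm->pasid) {
		/* reuse the PASID that an earlier bind already allocated */
		if (mm->pasid >= min && mm->pasid <= max)
			ioasid_get(mm->pasid);
		else
			ret = -EOVERFLOW;
	} else {
		pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm);
		if (pasid == INVALID_IOASID)
			ret = -ENOMEM;
		else
			mm->pasid = pasid;
	}
	mutex_unlock(&iommu_sva_lock);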
[1] https://lore.kernel.org/linux-iommu/YDgh53AcQHT+T3L0@otcwcpicx3.sc.intel.com/ Link: https://lkml.kernel.org/r/20210302103837.2562625-1-jean-philippe@linaro.org Signed-off-by: Fenghua Yu Signed-off-by: Jean-Philippe Brucker Reviewed-by: Tony Luck Cc: Jacob Pan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/mm_types.h | 1 + kernel/fork.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cb1e045256c4..5c7a9cf2d50b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -23,6 +23,7 @@ #endif #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) +#define INIT_PASID 0 struct address_space; struct mem_cgroup; diff --git a/kernel/fork.c b/kernel/fork.c index 2511cd4718e7..b4e063593f74 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1004,6 +1004,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif } +static void mm_init_pasid(struct mm_struct *mm) +{ +#ifdef CONFIG_IOMMU_SUPPORT + mm->pasid = INIT_PASID; +#endif +} + static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES @@ -1032,6 +1039,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + mm_init_pasid(mm); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); init_tlb_flush_pending(mm); -- Gitee From b4c0198fa34b422b18e8f8a710de98ce74a64ad0 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 7 Feb 2022 15:02:45 -0800 Subject: [PATCH 1172/2161] mm: Change CONFIG option for mm->pasid field commit 7a853c2d5951419fdf3c1c9d2b6f5a38f6a6857d upstream. This currently depends on CONFIG_IOMMU_SUPPORT. But it is only needed when CONFIG_IOMMU_SVA option is enabled. Change the CONFIG guards around definition and initialization of mm->pasid field. Suggested-by: Jacob Pan Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: Thomas Gleixner Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20220207230254.3342514-3-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/mm_types.h | 2 +- kernel/fork.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5c7a9cf2d50b..604e40941d8d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -533,7 +533,7 @@ struct mm_struct { #endif struct work_struct async_put_work; -#ifdef CONFIG_IOMMU_SUPPORT +#ifdef CONFIG_IOMMU_SVA u32 pasid; #endif } __randomize_layout; diff --git a/kernel/fork.c b/kernel/fork.c index b4e063593f74..1e239a1d0683 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1006,7 +1006,7 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) static void mm_init_pasid(struct mm_struct *mm) { -#ifdef CONFIG_IOMMU_SUPPORT +#ifdef CONFIG_IOMMU_SVA mm->pasid = INIT_PASID; #endif } -- Gitee From f06d261bef86fd71a4a17f95c21de2560e54fd96 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 7 Feb 2022 15:02:46 -0800 Subject: [PATCH 1173/2161] iommu/ioasid: Introduce a helper to check for valid PASIDs commit 7a5fbc9bcba5325a45297a4ba00091f39a63a1ed upstream. Define a pasid_valid() helper to check if a given PASID is valid. [ bp: Massage commit message. 
] Suggested-by: Ashok Raj Suggested-by: Jacob Pan Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: Thomas Gleixner Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20220207230254.3342514-4-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/ioasid.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index e9dacd4b9f6b..2237f64dbaae 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -41,6 +41,10 @@ void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, int ioasid_register_allocator(struct ioasid_allocator_ops *allocator); void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator); int ioasid_set_data(ioasid_t ioasid, void *data); +static inline bool pasid_valid(ioasid_t ioasid) +{ + return ioasid != INVALID_IOASID; +} #else /* !CONFIG_IOASID */ static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, @@ -78,5 +82,10 @@ static inline int ioasid_set_data(ioasid_t ioasid, void *data) return -ENOTSUPP; } +static inline bool pasid_valid(ioasid_t ioasid) +{ + return false; +} + #endif /* CONFIG_IOASID */ #endif /* __LINUX_IOASID_H */ -- Gitee From 20e1c21f1b062eb67ecab88768e9aa69e9089d7d Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 12 May 2022 14:51:32 +0800 Subject: [PATCH 1174/2161] kernel/fork: Initialize mm's PASID commit a6cbd44093ef305b02ad5f80ed54abf0148a696c upstream. A new mm doesn't have a PASID yet when it's created. Initialize the mm's PASID on fork() or for init_mm to INVALID_IOASID (-1). INIT_PASID (0) is reserved for kernel legacy DMA PASID. It cannot be allocated to a user process. Initializing the process's PASID to 0 may cause confusion that's why the process uses the reserved kernel legacy DMA PASID. Initializing the PASID to INVALID_IOASID (-1) explicitly tells the process doesn't have a valid PASID yet. Even though the only user of mm_pasid_init() is in fork.c, define it in as the first of three mm/pasid life cycle functions (init/set/drop) to keep these all together. 
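For reference, the three helpers end up looking like this once the series is applied; mm_pasid_init() is added by this patch, while mm_pasid_set() and mm_pasid_drop() arrive in a later patch of this series, all under CONFIG_IOMMU_SVA in <linux/sched/mm.h>:

static inline void mm_pasid_init(struct mm_struct *mm)
{
	mm->pasid = INVALID_IOASID;
}

/* Associate a PASID with an mm_struct: */
static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid)
{
	mm->pasid = pasid;
}

static inline void mm_pasid_drop(struct mm_struct *mm)
{
	if (pasid_valid(mm->pasid)) {
		ioasid_free(mm->pasid);
		mm->pasid = INVALID_IOASID;
	}
}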
Suggested-by: Dave Hansen Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220207230254.3342514-5-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/sched/mm.h | 10 ++++++++++ kernel/fork.c | 10 ++-------- mm/init-mm.c | 4 ++++ 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 3a1d899019af..c9fef831ba21 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -8,6 +8,7 @@ #include #include #include +#include /* * Routines for handling mm_structs @@ -393,4 +394,13 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) } #endif +#ifdef CONFIG_IOMMU_SVA +static inline void mm_pasid_init(struct mm_struct *mm) +{ + mm->pasid = INVALID_IOASID; +} +#else +static inline void mm_pasid_init(struct mm_struct *mm) {} +#endif + #endif /* _LINUX_SCHED_MM_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 1e239a1d0683..b3026a2fbda8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -94,6 +94,7 @@ #include #include #include +#include #include #include @@ -1004,13 +1005,6 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif } -static void mm_init_pasid(struct mm_struct *mm) -{ -#ifdef CONFIG_IOMMU_SVA - mm->pasid = INIT_PASID; -#endif -} - static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES @@ -1039,7 +1033,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); - mm_init_pasid(mm); + mm_pasid_init(mm); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); init_tlb_flush_pending(mm); diff --git a/mm/init-mm.c b/mm/init-mm.c index 19603302a77f..2c31799d3514 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -37,5 +38,8 @@ struct mm_struct init_mm = { .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, .cpu_bitmap = CPU_BITS_NONE, +#ifdef CONFIG_IOMMU_SVA + .pasid = INVALID_IOASID, +#endif INIT_MM_CONTEXT(init_mm) }; -- Gitee From 8a2e25120326b9d8d2f7abf7769571c59ce25328 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 28 Jan 2022 18:44:33 +0800 Subject: [PATCH 1175/2161] iommu: Fix some W=1 warnings commit 30209b93177a75843673de92771716c941c20ef5 upstream. The code is mostly free of W=1 warning, so fix the following: drivers/iommu/iommu.c:996: warning: expecting prototype for iommu_group_for_each_dev(). Prototype was for __iommu_group_for_each_dev() instead drivers/iommu/iommu.c:3048: warning: Function parameter or member 'drvdata' not described in 'iommu_sva_bind_device' drivers/iommu/ioasid.c:354: warning: Function parameter or member 'ioasid' not described in 'ioasid_get' drivers/iommu/omap-iommu.c:1098: warning: expecting prototype for omap_iommu_suspend_prepare(). 
Prototype was for omap_iommu_prepare() instead Signed-off-by: John Garry Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/1643366673-26803-1-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid.c | 1 + drivers/iommu/iommu.c | 24 ++++++++++++------------ drivers/iommu/omap-iommu.c | 2 +- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 50ee27bbd04e..06fee7416816 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -349,6 +349,7 @@ EXPORT_SYMBOL_GPL(ioasid_alloc); /** * ioasid_get - obtain a reference to the IOASID + * @ioasid: the ID to get */ void ioasid_get(ioasid_t ioasid) { diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 68d5e6007e4f..d9e6776151c6 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -926,17 +926,6 @@ static int iommu_group_device_count(struct iommu_group *group) return ret; } -/** - * iommu_group_for_each_dev - iterate over each device in the group - * @group: the group - * @data: caller opaque data to be passed to callback function - * @fn: caller supplied callback function - * - * This function is called by group users to iterate over group devices. - * Callers should hold a reference count to the group during callback. - * The group->mutex is held across callbacks, which will block calls to - * iommu_group_add/remove_device. - */ static int __iommu_group_for_each_dev(struct iommu_group *group, void *data, int (*fn)(struct device *, void *)) { @@ -951,7 +940,17 @@ static int __iommu_group_for_each_dev(struct iommu_group *group, void *data, return ret; } - +/** + * iommu_group_for_each_dev - iterate over each device in the group + * @group: the group + * @data: caller opaque data to be passed to callback function + * @fn: caller supplied callback function + * + * This function is called by group users to iterate over group devices. + * Callers should hold a reference count to the group during callback. + * The group->mutex is held across callbacks, which will block calls to + * iommu_group_add/remove_device. + */ int iommu_group_for_each_dev(struct iommu_group *group, void *data, int (*fn)(struct device *, void *)) { @@ -2983,6 +2982,7 @@ EXPORT_SYMBOL_GPL(iommu_aux_get_pasid); * iommu_sva_bind_device() - Bind a process address space to a device * @dev: the device * @mm: the mm to bind, caller must hold a reference to it + * @drvdata: opaque data pointer to pass to bind callback * * Create a bond between device and address space, allowing the device to access * the mm using the returned PASID. If a bond already exists between @device and diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 09c6e1c680db..39c9d1eded70 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1097,7 +1097,7 @@ static __maybe_unused int omap_iommu_runtime_resume(struct device *dev) } /** - * omap_iommu_suspend_prepare - prepare() dev_pm_ops implementation + * omap_iommu_prepare - prepare() dev_pm_ops implementation * @dev: iommu device * * This function performs the necessary checks to determine if the IOMMU -- Gitee From d09cdeffbb8752baab671e331875cdebba4af7c3 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 7 Feb 2022 15:02:48 -0800 Subject: [PATCH 1176/2161] iommu/sva: Assign a PASID to mm on PASID allocation and free it on mm exit commit 701fac40384f07197b106136012804c3cae0b3de upstream. PASIDs are process-wide. 
It was attempted to use refcounted PASIDs to free them when the last thread drops the refcount. This turned out to be complex and error prone. Given the fact that the PASID space is 20 bits, which allows up to 1M processes to have a PASID associated concurrently, PASID resource exhaustion is not a realistic concern. Therefore, it was decided to simplify the approach and stick with lazy on demand PASID allocation, but drop the eager free approach and make an allocated PASID's lifetime bound to the lifetime of the process. Get rid of the refcounting mechanisms and replace/rename the interfaces to reflect this new approach. [ bp: Massage commit message. ] Suggested-by: Dave Hansen Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: Lu Baolu Reviewed-by: Jacob Pan Reviewed-by: Thomas Gleixner Acked-by: Joerg Roedel Link: https://lore.kernel.org/r/20220207230254.3342514-6-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 4 ++-- drivers/iommu/intel/svm.c | 9 -------- drivers/iommu/ioasid.c | 39 ++++------------------------------- drivers/iommu/iommu-sva-lib.c | 39 +++++++++++------------------------ drivers/iommu/iommu-sva-lib.h | 1 - include/linux/ioasid.h | 12 ++--------- include/linux/sched/mm.h | 16 ++++++++++++++ kernel/fork.c | 1 + 8 files changed, 37 insertions(+), 84 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2859acd610b2..a402b806f46d 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4719,7 +4719,7 @@ static int aux_domain_add_dev(struct dmar_domain *domain, link_failed: spin_unlock_irqrestore(&device_domain_lock, flags); if (list_empty(&domain->subdevices) && domain->default_pasid > 0) - ioasid_put(domain->default_pasid); + ioasid_free(domain->default_pasid); return ret; } @@ -4749,7 +4749,7 @@ static void aux_domain_remove_dev(struct dmar_domain *domain, spin_unlock_irqrestore(&device_domain_lock, flags); if (list_empty(&domain->subdevices) && domain->default_pasid > 0) - ioasid_put(domain->default_pasid); + ioasid_free(domain->default_pasid); } static int prepare_domain_attach_device(struct iommu_domain *domain, diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index e013198cdba7..82fec4389dd5 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -529,11 +529,6 @@ static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm, return iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1); } -static void intel_svm_free_pasid(struct mm_struct *mm) -{ - iommu_sva_free_pasid(mm); -} - static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, struct device *dev, struct mm_struct *mm, @@ -684,8 +679,6 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) kfree(svm); } } - /* Drop a PASID reference and free it if no reference. */ - intel_svm_free_pasid(mm); } out: return ret; @@ -1070,8 +1063,6 @@ struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void } sva = intel_svm_bind_mm(iommu, dev, mm, flags); - if (IS_ERR_OR_NULL(sva)) - intel_svm_free_pasid(mm); mutex_unlock(&pasid_mutex); return sva; diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 06fee7416816..a786c034907c 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -2,7 +2,7 @@ /* * I/O Address Space ID allocator. There is one global IOASID space, split into * subsets. 
Users create a subset with DECLARE_IOASID_SET, then allocate and - * free IOASIDs with ioasid_alloc and ioasid_put. + * free IOASIDs with ioasid_alloc() and ioasid_free(). */ #include #include @@ -15,7 +15,6 @@ struct ioasid_data { struct ioasid_set *set; void *private; struct rcu_head rcu; - refcount_t refs; }; /* @@ -315,7 +314,6 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, data->set = set; data->private = private; - refcount_set(&data->refs, 1); /* * Custom allocator needs allocator data to perform platform specific @@ -348,35 +346,11 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, EXPORT_SYMBOL_GPL(ioasid_alloc); /** - * ioasid_get - obtain a reference to the IOASID - * @ioasid: the ID to get - */ -void ioasid_get(ioasid_t ioasid) -{ - struct ioasid_data *ioasid_data; - - spin_lock(&ioasid_allocator_lock); - ioasid_data = xa_load(&active_allocator->xa, ioasid); - if (ioasid_data) - refcount_inc(&ioasid_data->refs); - else - WARN_ON(1); - spin_unlock(&ioasid_allocator_lock); -} -EXPORT_SYMBOL_GPL(ioasid_get); - -/** - * ioasid_put - Release a reference to an ioasid + * ioasid_free - Free an ioasid * @ioasid: the ID to remove - * - * Put a reference to the IOASID, free it when the number of references drops to - * zero. - * - * Return: %true if the IOASID was freed, %false otherwise. */ -bool ioasid_put(ioasid_t ioasid) +void ioasid_free(ioasid_t ioasid) { - bool free = false; struct ioasid_data *ioasid_data; spin_lock(&ioasid_allocator_lock); @@ -386,10 +360,6 @@ bool ioasid_put(ioasid_t ioasid) goto exit_unlock; } - free = refcount_dec_and_test(&ioasid_data->refs); - if (!free) - goto exit_unlock; - active_allocator->ops->free(ioasid, active_allocator->ops->pdata); /* Custom allocator needs additional steps to free the xa element */ if (active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) { @@ -399,9 +369,8 @@ bool ioasid_put(ioasid_t ioasid) exit_unlock: spin_unlock(&ioasid_allocator_lock); - return free; } -EXPORT_SYMBOL_GPL(ioasid_put); +EXPORT_SYMBOL_GPL(ioasid_free); /** * ioasid_find - Find IOASID data diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva-lib.c index bd41405d34e9..106506143896 100644 --- a/drivers/iommu/iommu-sva-lib.c +++ b/drivers/iommu/iommu-sva-lib.c @@ -18,8 +18,7 @@ static DECLARE_IOASID_SET(iommu_sva_pasid); * * Try to allocate a PASID for this mm, or take a reference to the existing one * provided it fits within the [@min, @max] range. On success the PASID is - * available in mm->pasid, and must be released with iommu_sva_free_pasid(). - * @min must be greater than 0, because 0 indicates an unused mm->pasid. + * available in mm->pasid and will be available for the lifetime of the mm. * * Returns 0 on success and < 0 on error. */ @@ -33,38 +32,24 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max) return -EINVAL; mutex_lock(&iommu_sva_lock); - if (mm->pasid) { - if (mm->pasid >= min && mm->pasid <= max) - ioasid_get(mm->pasid); - else + /* Is a PASID already associated with this mm? 
*/ + if (pasid_valid(mm->pasid)) { + if (mm->pasid < min || mm->pasid >= max) ret = -EOVERFLOW; - } else { - pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm); - if (pasid == INVALID_IOASID) - ret = -ENOMEM; - else - mm->pasid = pasid; + goto out; } + + pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm); + if (!pasid_valid(pasid)) + ret = -ENOMEM; + else + mm_pasid_set(mm, pasid); +out: mutex_unlock(&iommu_sva_lock); return ret; } EXPORT_SYMBOL_GPL(iommu_sva_alloc_pasid); -/** - * iommu_sva_free_pasid - Release the mm's PASID - * @mm: the mm - * - * Drop one reference to a PASID allocated with iommu_sva_alloc_pasid() - */ -void iommu_sva_free_pasid(struct mm_struct *mm) -{ - mutex_lock(&iommu_sva_lock); - if (ioasid_put(mm->pasid)) - mm->pasid = 0; - mutex_unlock(&iommu_sva_lock); -} -EXPORT_SYMBOL_GPL(iommu_sva_free_pasid); - /* ioasid_find getter() requires a void * argument */ static bool __mmget_not_zero(void *mm) { diff --git a/drivers/iommu/iommu-sva-lib.h b/drivers/iommu/iommu-sva-lib.h index 95dc3ebc1928..8909ea1094e3 100644 --- a/drivers/iommu/iommu-sva-lib.h +++ b/drivers/iommu/iommu-sva-lib.h @@ -9,7 +9,6 @@ #include int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max); -void iommu_sva_free_pasid(struct mm_struct *mm); struct mm_struct *iommu_sva_find(ioasid_t pasid); /* I/O Page fault */ diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index 2237f64dbaae..af1c9d62e642 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -34,8 +34,7 @@ struct ioasid_allocator_ops { #if IS_ENABLED(CONFIG_IOASID) ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, void *private); -void ioasid_get(ioasid_t ioasid); -bool ioasid_put(ioasid_t ioasid); +void ioasid_free(ioasid_t ioasid); void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, bool (*getter)(void *)); int ioasid_register_allocator(struct ioasid_allocator_ops *allocator); @@ -53,14 +52,7 @@ static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, return INVALID_IOASID; } -static inline void ioasid_get(ioasid_t ioasid) -{ -} - -static inline bool ioasid_put(ioasid_t ioasid) -{ - return false; -} +static inline void ioasid_free(ioasid_t ioasid) { } static inline void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, bool (*getter)(void *)) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index c9fef831ba21..5122f54f01b1 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -399,8 +399,24 @@ static inline void mm_pasid_init(struct mm_struct *mm) { mm->pasid = INVALID_IOASID; } + +/* Associate a PASID with an mm_struct: */ +static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid) +{ + mm->pasid = pasid; +} + +static inline void mm_pasid_drop(struct mm_struct *mm) +{ + if (pasid_valid(mm->pasid)) { + ioasid_free(mm->pasid); + mm->pasid = INVALID_IOASID; + } +} #else static inline void mm_pasid_init(struct mm_struct *mm) {} +static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid) {} +static inline void mm_pasid_drop(struct mm_struct *mm) {} #endif #endif /* _LINUX_SCHED_MM_H */ diff --git a/kernel/fork.c b/kernel/fork.c index b3026a2fbda8..ebbdc39597cf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1099,6 +1099,7 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); + mm_pasid_drop(mm); mmdrop(mm); } -- Gitee From 66dc35d24661356ec97adac170b807576a9ebc9a Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 7 Feb 2022 15:02:49 -0800 Subject: 
[PATCH 1177/2161] x86/fpu: Clear PASID when copying fpstate commit dc7507ddce593cbd9c93330024a5658db6f8ef73 upstream. The kernel must allocate a Process Address Space ID (PASID) on behalf of each process which will use ENQCMD and program it into the new MSR to communicate the process identity to platform hardware. ENQCMD uses the PASID stored in this MSR to tag requests from this process. The PASID state must be cleared on fork() since fork creates a new address space. For clone(), it would be functionally OK to copy the PASID. However, clearing it is _also_ functionally OK since any PASID use will trigger the #GP handler to populate the MSR. Copying the PASID state has two main downsides: * It requires differentiating fork() and clone() in the code, both in the FPU code and keeping tsk->pasid_activated consistent. * It guarantees that the PASID is out of its init state, which incurs small but non-zero cost on every XSAVE/XRSTOR. The main downside of clearing the PASID at fpstate copy is the future, one-time #GP for the thread. Use the simplest approach: clear the PASID state both on clone() and fork(). Rely on the #GP handler for MSR population in children. Also, just clear the PASID bit from xfeatures if XSAVE is supported. This will have no effect on systems that do not have PASID support. It is virtually zero overhead because 'dst_fpu' was just written and the whole thing is cache hot. Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220207230254.3342514-7-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 31d7dedb386e..5f0a01b8fad8 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -605,6 +605,13 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags) fpu_inherit_perms(dst_fpu); fpregs_unlock(); + /* + * Children never inherit PASID state. + * Force it to have its init value: + */ + if (use_xsave()) + dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID; + trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); -- Gitee From d25e2fc55582bbbada163eeab87c4e2e20cdacd3 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 16 Mar 2020 21:28:05 -0400 Subject: [PATCH 1178/2161] psi: Move PF_MEMSTALL out of task->flags commit 1066d1b6974e095d5a6c472ad9180a957b496cd6 upstream. The task->flags is a 32-bits flag, in which 31 bits have already been consumed. So it is hardly to introduce other new per process flag. Currently there're still enough spaces in the bit-field section of task_struct, so we can define the memstall state as a single bit in task_struct instead. This patch also removes an out-of-date comment pointed by Matthew. 
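The calling convention of the memstall API is unchanged by this move; callers still bracket stalled sections and save/restore the per-task state through the flags argument, as in this sketch (the caller shown is illustrative, not part of the diff):

	unsigned long pflags;

	psi_memstall_enter(&pflags);
	/* memory-stalled section, e.g. direct reclaim or a thrashing wait */
	psi_memstall_leave(&pflags);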
Suggested-by: Johannes Weiner Signed-off-by: Yafang Shao Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/1584408485-1921-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Kairui Song Reviewed-by: Alex Shi Signed-off-by: Xinghui Li --- include/linux/sched.h | 6 ++++-- kernel/sched/psi.c | 12 ++++++------ kernel/sched/stats.h | 10 +++++----- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d7999c06015f..b151fe06ae8f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -792,9 +792,12 @@ struct task_struct { unsigned frozen:1; #endif #ifdef CONFIG_BLK_CGROUP - /* to be used once the psi infrastructure lands upstream. */ unsigned use_memdelay:1; #endif +#ifdef CONFIG_PSI + /* Stalled due to lack of memory */ + unsigned in_memstall:1; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ @@ -1495,7 +1498,6 @@ extern struct pid *cad_pid; #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ -#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ #define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 9154e745f097..10eeebf1808e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -818,17 +818,17 @@ void psi_memstall_enter(unsigned long *flags) if (static_branch_likely(&psi_disabled)) return; - *flags = current->flags & PF_MEMSTALL; + *flags = current->in_memstall; if (*flags) return; /* - * PF_MEMSTALL setting & accounting needs to be atomic wrt + * in_memstall setting & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we can * race with CPU migration. */ rq = this_rq_lock_irq(&rf); - current->flags |= PF_MEMSTALL; + current->in_memstall = 1; psi_task_change(current, 0, TSK_MEMSTALL); rq_unlock_irq(rq, &rf); @@ -851,13 +851,13 @@ void psi_memstall_leave(unsigned long *flags) if (*flags) return; /* - * PF_MEMSTALL clearing & accounting needs to be atomic wrt + * in_memstall clearing & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we could * race with CPU migration. 
*/ rq = this_rq_lock_irq(&rf); - current->flags &= ~PF_MEMSTALL; + current->in_memstall = 0; psi_task_change(current, TSK_MEMSTALL, 0); rq_unlock_irq(rq, &rf); @@ -921,7 +921,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) else if (task->in_iowait) task_flags = TSK_IOWAIT; - if (task->flags & PF_MEMSTALL) + if (task->in_memstall) task_flags |= TSK_MEMSTALL; if (task_flags) diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 4543af7b140c..46213bb177c5 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -71,7 +71,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) return; if (!wakeup || p->sched_psi_wake_requeue) { - if (p->flags & PF_MEMSTALL) + if (p->in_memstall) set |= TSK_MEMSTALL; if (p->sched_psi_wake_requeue) p->sched_psi_wake_requeue = 0; @@ -91,7 +91,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) return; if (!sleep) { - if (p->flags & PF_MEMSTALL) + if (p->in_memstall) clear |= TSK_MEMSTALL; } else { if (p->in_iowait) @@ -110,14 +110,14 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) * deregister its sleep-persistent psi states from the old * queue, and let psi_enqueue() know it has to requeue. */ - if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) { + if (unlikely(p->in_iowait || p->in_memstall)) { struct rq_flags rf; struct rq *rq; int clear = 0; if (p->in_iowait) clear |= TSK_IOWAIT; - if (p->flags & PF_MEMSTALL) + if (p->in_memstall) clear |= TSK_MEMSTALL; rq = __task_rq_lock(p, &rf); @@ -132,7 +132,7 @@ static inline void psi_task_tick(struct rq *rq) if (static_branch_likely(&psi_disabled)) return; - if (unlikely(rq->curr->flags & PF_MEMSTALL)) + if (unlikely(rq->curr->in_memstall)) psi_memstall_tick(rq->curr, cpu_of(rq)); } #else /* CONFIG_PSI */ -- Gitee From 24e778001da08b8f7c70379c5e25d5247f03a9a4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Feb 2022 15:02:50 -0800 Subject: [PATCH 1179/2161] sched: Define and initialize a flag to identify valid PASID in the task commit a3d29e8291b622780eb6e4e3eeaf2b24ec78fd43 upstream. Add a new single bit field to the task structure to track whether this task has initialized the IA32_PASID MSR to the mm's PASID. Initialize the field to zero when creating a new task with fork/clone. Signed-off-by: Peter Zijlstra Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220207230254.3342514-8-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/sched.h | 3 +++ kernel/fork.c | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index b151fe06ae8f..ae223f2470a7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -798,6 +798,9 @@ struct task_struct { /* Stalled due to lack of memory */ unsigned in_memstall:1; #endif +#ifdef CONFIG_IOMMU_SVA + unsigned pasid_activated:1; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. 
*/ diff --git a/kernel/fork.c b/kernel/fork.c index ebbdc39597cf..41de611bae6b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -953,6 +953,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->use_memdelay = 0; #endif +#ifdef CONFIG_IOMMU_SVA + tsk->pasid_activated = 0; +#endif + #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif -- Gitee From 5eecee38f760198fe11028a1e11bc6db9301eb55 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 7 Feb 2022 15:02:51 -0800 Subject: [PATCH 1180/2161] x86/traps: Demand-populate PASID MSR via #GP commit fa6af69f38d3f409bedc55d0112eec36ed526d4b upstream. All tasks start with PASID state disabled. This means that the first time they execute an ENQCMD instruction they will take a #GP fault. Modify the #GP fault handler to check if the "mm" for the task has already been allocated a PASID. If so, try to fix the #GP fault by loading the IA32_PASID MSR. Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220207230254.3342514-9-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/traps.c | 55 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bd1065cef1e6..9b741cd9f778 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -39,6 +39,7 @@ #include #include #include +#include #if defined(CONFIG_EDAC) #include @@ -462,12 +463,66 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL); } +/* + * The unprivileged ENQCMD instruction generates #GPs if the + * IA32_PASID MSR has not been populated. If possible, populate + * the MSR from a PASID previously allocated to the mm. + */ +static bool try_fixup_enqcmd_gp(void) +{ +#ifdef CONFIG_IOMMU_SVA + u32 pasid; + + /* + * MSR_IA32_PASID is managed using XSAVE. Directly + * writing to the MSR is only possible when fpregs + * are valid and the fpstate is not. This is + * guaranteed when handling a userspace exception + * in *before* interrupts are re-enabled. + */ + lockdep_assert_irqs_disabled(); + + /* + * Hardware without ENQCMD will not generate + * #GPs that can be fixed up here. + */ + if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) + return false; + + pasid = current->mm->pasid; + + /* + * If the mm has not been allocated a + * PASID, the #GP can not be fixed up. + */ + if (!pasid_valid(pasid)) + return false; + + /* + * Did this thread already have its PASID activated? + * If so, the #GP must be from something else. + */ + if (current->pasid_activated) + return false; + + wrmsrl(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); + current->pasid_activated = 1; + + return true; +#else + return false; +#endif +} + dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) { const char *desc = "general protection fault"; struct task_struct *tsk; + if (user_mode(regs) && try_fixup_enqcmd_gp()) + return; + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); cond_local_irq_enable(regs); -- Gitee From 69d1beb6aae9e4b2bcc9dcd229f5bb418e1d1809 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 7 Feb 2022 15:02:52 -0800 Subject: [PATCH 1181/2161] x86/cpufeatures: Re-enable ENQCMD commit 7c1ef59145f1c8bf9a2cc7a6ebf2fd56bbb440de upstream. The ENQCMD feature can only be used if CONFIG_INTEL_IOMMU_SVM is set. 
Add X86_FEATURE_ENQCMD to the disabled features mask as appropriate so that cpu_feature_enabled() can be used to check the feature. [ bp: Massage commit message. ] Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220207230254.3342514-10-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/disabled-features.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 0c6adf09fae5..6cec2044e9b1 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -62,8 +62,11 @@ # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) #endif -/* Force disable because it's broken beyond repair */ -#define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) +#ifdef CONFIG_INTEL_IOMMU_SVM +# define DISABLE_ENQCMD 0 +#else +# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) +#endif #ifdef CONFIG_X86_SGX # define DISABLE_SGX 0 -- Gitee From f0e5a18be0e19084a199c87ef46e9a5ee313d160 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 12 May 2022 16:58:38 +0800 Subject: [PATCH 1182/2161] tools/objtool: Check for use of the ENQCMD instruction in the kernel commit 6e3133d901e89a4ba83ce7ebd8c27bbeaa9ed1f2 upstream. The ENQCMD instruction implicitly accesses the PASID_MSR to fill in the pasid field of the descriptor being submitted to an accelerator. But there is no precise (and stable across kernel changes) point at which the PASID_MSR is updated from the value for one task to the next. Kernel code that uses accelerators must always use the ENQCMDS instruction which does not access the PASID_MSR. Check for use of the ENQCMD instruction in the kernel and warn on its usage. Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20220207230254.3342514-11-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/objtool/arch/x86/decode.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index a62e032863a8..0c538eb96cd7 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -73,9 +73,10 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, { struct insn insn; int x86_64, sign; - unsigned char op1, op2, rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, - rex_x = 0, modrm = 0, modrm_mod = 0, modrm_rm = 0, - modrm_reg = 0, sib = 0; + unsigned char op1, op2, op3, + rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, rex_x = 0, + modrm = 0, modrm_mod = 0, modrm_rm = 0, modrm_reg = 0, + sib = 0; x86_64 = is_x86_64(elf); if (x86_64 == -1) @@ -97,6 +98,7 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, op1 = insn.opcode.bytes[0]; op2 = insn.opcode.bytes[1]; + op3 = insn.opcode.bytes[2]; if (insn.rex_prefix.nbytes) { rex = insn.rex_prefix.bytes[0]; @@ -384,6 +386,14 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, /* nopl/nopw */ *type = INSN_NOP; + } else if (op2 == 0x38 && op3 == 0xf8) { + if (insn.prefixes.nbytes == 1 && + insn.prefixes.bytes[0] == 0xf2) { + /* ENQCMD cannot be used in the kernel. 
*/ + WARN("ENQCMD instruction at %s:%lx", sec->name, + offset); + } + } else if (op2 == 0xa0 || op2 == 0xa8) { /* push fs/gs */ -- Gitee From 161a83a047d6b67a9a378b304c5cda8475209270 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 7 Feb 2022 15:02:54 -0800 Subject: [PATCH 1183/2161] Documentation/x86: Update documentation for SVA (Shared Virtual Addressing) commit 83aa52ffed5d35a08e24452d0471e1684075cdf8 upstream. Adjust the documentation to the new way how a PASID is being allocated, freed and fixed up. Based on a patch by Ashok Raj [ bp: Massage commit message, fix htmldocs build warning ] Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Acked-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220207230254.3342514-12-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/x86/sva.rst | 53 ++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 12 deletions(-) diff --git a/Documentation/x86/sva.rst b/Documentation/x86/sva.rst index 076efd51ef1f..2e9b8b0f9a0f 100644 --- a/Documentation/x86/sva.rst +++ b/Documentation/x86/sva.rst @@ -104,18 +104,47 @@ The MSR must be configured on each logical CPU before any application thread can interact with a device. Threads that belong to the same process share the same page tables, thus the same MSR value. -PASID is cleared when a process is created. The PASID allocation and MSR -programming may occur long after a process and its threads have been created. -One thread must call iommu_sva_bind_device() to allocate the PASID for the -process. If a thread uses ENQCMD without the MSR first being populated, a #GP -will be raised. The kernel will update the PASID MSR with the PASID for all -threads in the process. A single process PASID can be used simultaneously -with multiple devices since they all share the same address space. - -One thread can call iommu_sva_unbind_device() to free the allocated PASID. -The kernel will clear the PASID MSR for all threads belonging to the process. - -New threads inherit the MSR value from the parent. +PASID Life Cycle Management +=========================== + +PASID is initialized as INVALID_IOASID (-1) when a process is created. + +Only processes that access SVA-capable devices need to have a PASID +allocated. This allocation happens when a process opens/binds an SVA-capable +device but finds no PASID for this process. Subsequent binds of the same, or +other devices will share the same PASID. + +Although the PASID is allocated to the process by opening a device, +it is not active in any of the threads of that process. It's loaded to the +IA32_PASID MSR lazily when a thread tries to submit a work descriptor +to a device using the ENQCMD. + +That first access will trigger a #GP fault because the IA32_PASID MSR +has not been initialized with the PASID value assigned to the process +when the device was opened. The Linux #GP handler notes that a PASID has +been allocated for the process, and so initializes the IA32_PASID MSR +and returns so that the ENQCMD instruction is re-executed. + +On fork(2) or exec(2) the PASID is removed from the process as it no +longer has the same address space that it had when the device was opened. + +On clone(2) the new task shares the same address space, so will be +able to use the PASID allocated to the process. 
The IA32_PASID is not +preemptively initialized as the PASID value might not be allocated yet or +the kernel does not know whether this thread is going to access the device +and the cleared IA32_PASID MSR reduces context switch overhead by xstate +init optimization. Since #GP faults have to be handled on any threads that +were created before the PASID was assigned to the mm of the process, newly +created threads might as well be treated in a consistent way. + +Due to complexity of freeing the PASID and clearing all IA32_PASID MSRs in +all threads in unbind, free the PASID lazily only on mm exit. + +If a process does a close(2) of the device file descriptor and munmap(2) +of the device MMIO portal, then the driver will unbind the device. The +PASID is still marked VALID in the PASID_MSR for any threads in the +process that accessed the device. But this is harmless as without the +MMIO portal they cannot submit new work to the device. Relationships ============= -- Gitee From bfecbffe8ef690bdb8e67789237cc38562a8328e Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Fri, 10 Dec 2021 14:55:03 -0800 Subject: [PATCH 1184/2161] signal: Skip the altstack update when not needed commit 6c3118c32129b4197999a8928ba776bcabd0f5c4 upstream. == Background == Support for large, "dynamic" fpstates was recently merged. This included code to ensure that sigaltstacks are sufficiently sized for these large states. A new lock was added to remove races between enabling large features and setting up sigaltstacks. == Problem == The new lock (sigaltstack_lock()) is acquired in the sigreturn path before restoring the old sigaltstack. Unfortunately, contention on the new lock causes a measurable signal handling performance regression [1]. However, the common case is that no *changes* are made to the sigaltstack state at sigreturn. == Solution == do_sigaltstack() acquires sigaltstack_lock() and is used for both sys_sigaltstack() and restoring the sigaltstack in sys_sigreturn(). Check for changes to the sigaltstack before taking the lock. If no changes were made, return before acquiring the lock. This removes lock contention from the common-case sigreturn path. [1] https://lore.kernel.org/lkml/20211207012128.GA16074@xsang-OptiPlex-9020/ Fixes: 3aac3ebea08f ("x86/signal: Implement sigaltstack size validation") Reported-by: kernel test robot Signed-off-by: Chang S. Bae Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20211210225503.12734-1-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/signal.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/signal.c b/kernel/signal.c index 0227c3a62345..509e31ae24ab 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4047,6 +4047,15 @@ do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp, ss_mode != 0)) return -EINVAL; + /* + * Return before taking any locks if no actual + * sigaltstack changes were requested. + */ + if (t->sas_ss_sp == (unsigned long)ss_sp && + t->sas_ss_size == ss_size && + t->sas_ss_flags == ss_flags) + return 0; + sigaltstack_lock(); if (ss_mode == SS_DISABLE) { ss_size = 0; -- Gitee From 8654ee7cf26648a3fbe90464f4debffa43a49e7a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 14 Feb 2022 13:05:49 +0100 Subject: [PATCH 1185/2161] x86/ptrace: Fix xfpregs_set()'s incorrect xmm clearing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 44cad52cc14ae10062f142ec16ede489bccf4469 upstream. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xfpregs_set() handles 32-bit REGSET_XFP and 64-bit REGSET_FP. The actual code treats these regsets as modern FX state (i.e. the beginning part of XSTATE). The declarations of the regsets thought they were the legacy i387 format. The code thought they were the 32-bit (no xmm8..15) variant of XSTATE and, for good measure, made the high bits disappear by zeroing the wrong part of the buffer. The latter broke ptrace, and everything else confused anyone trying to understand the code. In particular, the nonsense definitions of the regsets confused me when I wrote this code. Clean this all up. Change the declarations to match reality (which shouldn't change the generated code, let alone the ABI) and fix xfpregs_set() to clear the correct bits and to only do so for 32-bit callers. Fixes: 6164331d15f7 ("x86/fpu: Rewrite xfpregs_set()") Reported-by: Luís Ferreira Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Link: https://bugzilla.kernel.org/show_bug.cgi?id=215524 Link: https://lore.kernel.org/r/YgpFnZpF01WwR8wU@zn.tnic Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/regset.c | 9 ++++----- arch/x86/kernel/ptrace.c | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 437d7c930c0b..75ffaef8c299 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -91,11 +91,9 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, const void *kbuf, const void __user *ubuf) { struct fpu *fpu = &target->thread.fpu; - struct user32_fxsr_struct newstate; + struct fxregs_state newstate; int ret; - BUILD_BUG_ON(sizeof(newstate) != sizeof(struct fxregs_state)); - if (!cpu_feature_enabled(X86_FEATURE_FXSR)) return -ENODEV; @@ -116,9 +114,10 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, /* Copy the state */ memcpy(&fpu->fpstate->regs.fxsave, &newstate, sizeof(newstate)); - /* Clear xmm8..15 */ + /* Clear xmm8..15 for 32-bit callers */ BUILD_BUG_ON(sizeof(fpu->__fpstate.regs.fxsave.xmm_space) != 16 * 16); - memset(&fpu->fpstate->regs.fxsave.xmm_space[8], 0, 8 * 16); + if (in_ia32_syscall()) + memset(&fpu->fpstate->regs.fxsave.xmm_space[8*4], 0, 8 * 16); /* Mark FP and SSE as in use when XSAVE is enabled */ if (use_xsave()) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index ace1834c9b0f..5ffe13fcf45e 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1216,7 +1216,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = { }, [REGSET_FP] = { .core_note_type = NT_PRFPREG, - .n = sizeof(struct user_i387_struct) / sizeof(long), + .n = sizeof(struct fxregs_state) / sizeof(long), .size = sizeof(long), .align = sizeof(long), .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, @@ -1263,7 +1263,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = { }, [REGSET_XFP] = { .core_note_type = NT_PRXFPREG, - .n = sizeof(struct user32_fxsr_struct) / sizeof(u32), + .n = sizeof(struct fxregs_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, -- Gitee From 8129e4d697acd62cca49264b45c2e5e7744cbd28 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 3 Feb 2022 11:43:07 -0800 Subject: [PATCH 1186/2161] x86/cpufeatures: Put the AMX macros in the word 
18 block commit fa31a4d669bd471e9510db1abf9b91e1a6be6ff7 upstream. These macros are for bits in CPUID.(EAX=7,ECX=0):EDX, not for bits in CPUID(EAX=7,ECX=1):EAX. Put them with their brethren. [ bp: Sort word 18 bits properly, as caught by Like Xu ] Signed-off-by: Jim Mattson Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20220203194308.2469117-1-jmattson@google.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/cpufeatures.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0d29142d0c0c..2e27817e087f 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -290,9 +290,6 @@ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ -#define X86_FEATURE_AMX_BF16 (18*32+22) /* AMX bf16 Support */ -#define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */ -#define X86_FEATURE_AMX_INT8 (18*32+25) /* AMX int8 Support */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ @@ -373,7 +370,10 @@ #define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ +#define X86_FEATURE_AMX_BF16 (18*32+22) /* AMX bf16 Support */ #define X86_FEATURE_AVX512_FP16 (18*32+23) /* AVX512 FP16 */ +#define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */ +#define X86_FEATURE_AMX_INT8 (18*32+25) /* AMX int8 Support */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ -- Gitee From fceb7224698efe072217370be44789fa0d1e64d6 Mon Sep 17 00:00:00 2001 From: Yang Zhong Date: Sat, 29 Jan 2022 09:36:46 -0800 Subject: [PATCH 1187/2161] x86/fpu/xstate: Fix the ARCH_REQ_XCOMP_PERM implementation commit 063452fd94d153d4eb38ad58f210f3d37a09cca4 upstream. ARCH_REQ_XCOMP_PERM is supposed to add the requested feature to the permission bitmap of thread_group_leader()->fpu. But the code overwrites the bitmap with the requested feature bit only rather than adding it. Fix the code to add the requested feature bit to the master bitmask. Fixes: db8268df0983 ("x86/arch_prctl: Add controls for dynamic XSTATE components") Signed-off-by: Yang Zhong Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Cc: Paolo Bonzini Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220129173647.27981-2-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index f25e4605b6a5..fa2960e3c054 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1667,7 +1667,7 @@ static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) perm = guest ? 
&fpu->guest_perm : &fpu->perm; /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ - WRITE_ONCE(perm->__state_perm, requested); + WRITE_ONCE(perm->__state_perm, mask); /* Protected by sighand lock */ perm->__state_size = ksize; perm->__user_state_size = usize; -- Gitee From 4d8c239ffafa848e49ba15e08b385a339f96b5f4 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Sat, 29 Jan 2022 09:36:47 -0800 Subject: [PATCH 1188/2161] selftests/x86/amx: Update the ARCH_REQ_XCOMP_PERM test commit 20df737561484cb2d42e537663c03a7311d2b3c1 upstream. Update the arch_prctl test to check the permission bitmap whether the requested feature is added as expected or not. Every non-dynamic feature that is enabled is permitted already for use. TILECFG is not dynamic feature. Ensure the bit is always on from ARCH_GET_XCOMP_PERM. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220129173647.27981-3-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/x86/amx.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/x86/amx.c b/tools/testing/selftests/x86/amx.c index 3615ef4a48bb..2189f0322d8b 100644 --- a/tools/testing/selftests/x86/amx.c +++ b/tools/testing/selftests/x86/amx.c @@ -368,9 +368,16 @@ static void req_xtiledata_perm(void) static void validate_req_xcomp_perm(enum expected_result exp) { - unsigned long bitmask; + unsigned long bitmask, expected_bitmask; long rc; + rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask); + if (rc) { + fatal_error("prctl(ARCH_GET_XCOMP_PERM) error: %ld", rc); + } else if (!(bitmask & XFEATURE_MASK_XTILECFG)) { + fatal_error("ARCH_GET_XCOMP_PERM returns XFEATURE_XTILECFG off."); + } + rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA); if (exp == FAIL_EXPECTED) { if (rc) { @@ -383,10 +390,15 @@ static void validate_req_xcomp_perm(enum expected_result exp) fatal_error("ARCH_REQ_XCOMP_PERM saw unexpected failure.\n"); } + expected_bitmask = bitmask | XFEATURE_MASK_XTILEDATA; + rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask); if (rc) { fatal_error("prctl(ARCH_GET_XCOMP_PERM) error: %ld", rc); - } else if (bitmask & XFEATURE_MASK_XTILE) { + } else if (bitmask != expected_bitmask) { + fatal_error("ARCH_REQ_XCOMP_PERM set a wrong bitmask: %lx, expected: %lx.\n", + bitmask, expected_bitmask); + } else { printf("\tARCH_REQ_XCOMP_PERM is successful.\n"); } } -- Gitee From bd27b3c36e572b9444412b5a7b80a35c3e1524b1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 24 Mar 2022 14:47:08 +0100 Subject: [PATCH 1189/2161] x86/fpu: Remove redundant XCOMP_BV initialization commit a9f84fb7158fea60cbcadef5c0166fb22b469091 upstream. fpu_copy_uabi_to_guest_fpstate() initializes the XCOMP_BV field in the XSAVE header. That's a leftover from the old KVM FPU buffer handling code. Since d69c1382e1b7 ("x86/kvm: Convert FPU handling to a single swap buffer") KVM uses the FPU core allocation code, which initializes the XCOMP_BV field already. 
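For context, the setup being dropped boils down to stamping the compacted-format marker and the enabled-feature mask into the XSAVE header. A minimal sketch of that operation, assuming the kernel-internal names (XCOMP_BV_COMPACTED_FORMAT is bit 63 of XCOMP_BV):

  /* Sketch only: what the now-redundant XCOMP_BV initialization amounts to. */
  static inline void xcomp_bv_init_sketch(struct xregs_state *xsave, u64 features)
  {
          /* XCOMP_BV is only consulted for the compacted (XSAVES) format. */
          if (cpu_feature_enabled(X86_FEATURE_XSAVES))
                  xsave->header.xcomp_bv = features | XCOMP_BV_COMPACTED_FORMAT;
  }

Because the guest fpstate allocation path already performs this once, repeating it when copying UABI state in adds nothing.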
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220324134623.408932232@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/core.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 5f0a01b8fad8..1d9a52ae2b83 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -408,9 +408,6 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU); *vpkru = xpkru->pkru; } - - /* Ensure that XCOMP_BV is set up for XSAVES */ - xstate_init_xcomp_bv(&kstate->regs.xsave, kstate->xfeatures); return 0; } EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); -- Gitee From e9493e00781c40581f370e7e0964932f699d7c77 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 24 Mar 2022 14:47:09 +0100 Subject: [PATCH 1190/2161] x86/fpu: Remove unused supervisor only offsets commit d47f71f6de7970d504748d1a60a11c51af5bce47 upstream. No users. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220324134623.465066249@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index fa2960e3c054..710e194e1394 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -84,8 +84,6 @@ static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_comp_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] __ro_after_init = - { [ 0 ... XFEATURE_MAX - 1] = -1}; /* * Return whether the system supports a given xfeature. @@ -325,33 +323,6 @@ static void __init setup_xstate_comp_offsets(void) } } -/* - * Setup offsets of a supervisor-state-only XSAVES buffer: - * - * The offsets stored in xstate_comp_offsets[] only work for one specific - * value of the Requested Feature BitMap (RFBM). In cases where a different - * RFBM value is used, a different set of offsets is required. This set of - * offsets is for when RFBM=xfeatures_mask_supervisor(). - */ -static void __init setup_supervisor_only_offsets(void) -{ - unsigned int next_offset; - int i; - - next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; - - for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { - if (!xfeature_is_supervisor(i)) - continue; - - if (xfeature_is_aligned(i)) - next_offset = ALIGN(next_offset, 64); - - xstate_supervisor_only_offsets[i] = next_offset; - next_offset += xstate_sizes[i]; - } -} - /* * Print out xstate component offsets and sizes */ @@ -952,7 +923,6 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) setup_init_fpu_buf(); setup_xstate_comp_offsets(); - setup_supervisor_only_offsets(); /* * Paranoia check whether something in the setup modified the -- Gitee From 09bbe38789d4658050492f6c9ff819bfa6040682 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 24 Mar 2022 14:47:11 +0100 Subject: [PATCH 1191/2161] x86/fpu/xsave: Initialize offset/size cache early commit 35a77d4503d9d9d0e19e3a2a0d3fc9ab09fb6857 upstream. Reading XSTATE feature information from CPUID over and over does not make sense. The information has to be cached anyway, so it can be done early. 
Prepare for runtime calculation of XSTATE offsets and allow consolidation of the size calculation functions in a later step. Rename the function while at it as it does not setup any features. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220324134623.519411939@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 710e194e1394..940bf3c25c11 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -181,7 +181,7 @@ static bool xfeature_enabled(enum xfeature xfeature) * Record the offsets and sizes of various xstates contained * in the XSAVE state memory layout. */ -static void __init setup_xstate_features(void) +static void __init setup_xstate_cache(void) { u32 eax, ebx, ecx, edx, i; /* start at the beginning of the "extended state" */ @@ -391,7 +391,6 @@ static void __init setup_init_fpu_buf(void) if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; - setup_xstate_features(); print_xstate_features(); xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features); @@ -907,6 +906,10 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); + + /* Cache size, offset and flags for initialization */ + setup_xstate_cache(); + err = init_xstate_size(); if (err) goto out_disable; -- Gitee From 65d98365014676ccf94c95fcca74aba477c87890 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 24 Mar 2022 14:47:12 +0100 Subject: [PATCH 1192/2161] x86/fpu: Cache xfeature flags from CPUID commit 6afbb58cc2251c1d83472ca3005638206e73b6b8 upstream. In preparation for runtime calculation of XSAVE offsets cache the feature flags for each XSTATE component during feature enumeration via CPUID(0xD). EDX has two relevant bits: 0 Supervisor component 1 Feature storage must be 64 byte aligned These bits are currently only evaluated during init, but the alignment bit must be cached to make runtime calculation of XSAVE offsets efficient. Cache the full EDX content and use it for the existing alignment and supervisor checks. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220324134623.573656209@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 49 ++++++++++-------------------------- 1 file changed, 13 insertions(+), 36 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 940bf3c25c11..6f214d6f490e 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -84,6 +84,10 @@ static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_comp_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; + +#define XSTATE_FLAG_SUPERVISOR BIT(0) +#define XSTATE_FLAG_ALIGNED64 BIT(1) /* * Return whether the system supports a given xfeature. 
@@ -123,17 +127,14 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) } EXPORT_SYMBOL_GPL(cpu_has_xfeatures); -static bool xfeature_is_supervisor(int xfeature_nr) +static bool xfeature_is_aligned64(int xfeature_nr) { - /* - * Extended State Enumeration Sub-leaves (EAX = 0DH, ECX = n, n > 1) - * returns ECX[0] set to (1) for a supervisor state, and cleared (0) - * for a user state. - */ - u32 eax, ebx, ecx, edx; + return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64; +} - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); - return ecx & 1; +static bool xfeature_is_supervisor(int xfeature_nr) +{ + return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR; } /* @@ -204,6 +205,7 @@ static void __init setup_xstate_cache(void) cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); xstate_sizes[i] = eax; + xstate_flags[i] = ecx; /* * If an xfeature is supervisor state, the offset in EBX is @@ -262,31 +264,6 @@ static void __init print_xstate_features(void) WARN_ON(nr >= XFEATURE_MAX); \ } while (0) -/* - * We could cache this like xstate_size[], but we only use - * it here, so it would be a waste of space. - */ -static int xfeature_is_aligned(int xfeature_nr) -{ - u32 eax, ebx, ecx, edx; - - CHECK_XFEATURE(xfeature_nr); - - if (!xfeature_enabled(xfeature_nr)) { - WARN_ONCE(1, "Checking alignment of disabled xfeature %d\n", - xfeature_nr); - return 0; - } - - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); - /* - * The value returned by ECX[1] indicates the alignment - * of state component 'i' when the compacted format - * of the extended region of an XSAVE area is used: - */ - return !!(ecx & 2); -} - /* * This function sets up offsets and sizes of all extended states in * xsave area. This supports both standard format and compacted format @@ -315,7 +292,7 @@ static void __init setup_xstate_comp_offsets(void) next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { - if (xfeature_is_aligned(i)) + if (xfeature_is_aligned64(i)) next_offset = ALIGN(next_offset, 64); xstate_comp_offsets[i] = next_offset; @@ -620,7 +597,7 @@ static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) for_each_extended_xfeature(i, xfeatures) { /* Align from the end of the previous feature */ - if (xfeature_is_aligned(i)) + if (xfeature_is_aligned64(i)) size = ALIGN(size, 64); /* * In compacted format the enabled features are packed, -- Gitee From bdd7339305e864be442e7d5f12a59c1908884b81 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 24 Mar 2022 14:47:13 +0100 Subject: [PATCH 1193/2161] x86/fpu/xsave: Handle compacted offsets correctly with supervisor states commit 7aa5128b5fea26cf224766303ea3b8df343f9a87 upstream. So far the cached fixed compacted offsets worked, but with (re-)enabling of ENQCMD this does no longer work with KVM fpstate. KVM does not have supervisor features enabled for the guest FPU, which means that KVM has then a different XSAVE area layout than the host FPU state. This in turn breaks the copy from/to UABI functions when invoked for a guest state. Remove the pre-calculated compacted offsets and calculate the offset of each component at runtime based on the XCOMP_BV field in the XSAVE header. The runtime overhead is not interesting because these copy from/to UABI functions are not used in critical fast paths. KVM uses them to save and restore FPU state during migration. The host uses them for ptrace and for the slow path of 32bit signal handling. 
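Condensed, the lookup introduced by this change walks only the components that are both enabled in xcomp_bv and numbered below the one being asked for, honouring 64-byte alignment where required. A sketch of that idea using the kernel's cached size/alignment helpers (legacy FP/SSE and the non-compacted format keep their fixed offsets and are omitted here):

  /* Sketch: a compacted offset depends only on the lower-numbered bits of xcomp_bv. */
  static unsigned int compacted_offset_sketch(u64 xcomp_bv, int xfeature)
  {
          unsigned int offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;    /* 512 + 64 bytes */
          int i;

          for_each_extended_xfeature(i, xcomp_bv) {
                  if (xfeature_is_aligned64(i))
                          offs = ALIGN(offs, 64);
                  if (i == xfeature)
                          break;
                  offs += xstate_sizes[i];
          }
          return offs;
  }

A guest fpstate whose xcomp_bv lacks the host-only supervisor bits therefore places every higher-numbered component at a smaller offset than the host fpstate does, which is why a single cached offset table cannot serve both.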
Fixes: 7c1ef59145f1 ("x86/cpufeatures: Re-enable ENQCMD") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220324134623.627636809@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 86 +++++++++++++++++------------------- 1 file changed, 41 insertions(+), 45 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 6f214d6f490e..b67ec14fe841 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -82,8 +82,6 @@ static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_comp_offsets[XFEATURE_MAX] __ro_after_init = - { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; #define XSTATE_FLAG_SUPERVISOR BIT(0) @@ -137,6 +135,33 @@ static bool xfeature_is_supervisor(int xfeature_nr) return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR; } +static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) +{ + unsigned int offs, i; + + /* + * Non-compacted format and legacy features use the cached fixed + * offsets. + */ + if (!cpu_feature_enabled(X86_FEATURE_XSAVES) || xfeature <= XFEATURE_SSE) + return xstate_offsets[xfeature]; + + /* + * Compacted format offsets depend on the actual content of the + * compacted xsave area which is determined by the xcomp_bv header + * field. + */ + offs = FXSAVE_SIZE + XSAVE_HDR_SIZE; + for_each_extended_xfeature(i, xcomp_bv) { + if (xfeature_is_aligned64(i)) + offs = ALIGN(offs, 64); + if (i == xfeature) + break; + offs += xstate_sizes[i]; + } + return offs; +} + /* * Enable the extended processor state save/restore feature. * Called once per CPU onlining. @@ -264,42 +289,6 @@ static void __init print_xstate_features(void) WARN_ON(nr >= XFEATURE_MAX); \ } while (0) -/* - * This function sets up offsets and sizes of all extended states in - * xsave area. This supports both standard format and compacted format - * of the xsave area. - */ -static void __init setup_xstate_comp_offsets(void) -{ - unsigned int next_offset; - int i; - - /* - * The FP xstates and SSE xstates are legacy states. They are always - * in the fixed offsets in the xsave area in either compacted form - * or standard form. 
- */ - xstate_comp_offsets[XFEATURE_FP] = 0; - xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state, - xmm_space); - - if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) { - for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) - xstate_comp_offsets[i] = xstate_offsets[i]; - return; - } - - next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; - - for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { - if (xfeature_is_aligned64(i)) - next_offset = ALIGN(next_offset, 64); - - xstate_comp_offsets[i] = next_offset; - next_offset += xstate_sizes[i]; - } -} - /* * Print out xstate component offsets and sizes */ @@ -309,7 +298,8 @@ static void __init print_xstate_offset_size(void) for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", - i, xstate_comp_offsets[i], i, xstate_sizes[i]); + i, xfeature_get_offset(fpu_kernel_cfg.max_features, i), + i, xstate_sizes[i]); } } @@ -902,7 +892,6 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) fpu_user_cfg.max_features); setup_init_fpu_buf(); - setup_xstate_comp_offsets(); /* * Paranoia check whether something in the setup modified the @@ -957,13 +946,19 @@ void fpu__resume_cpu(void) */ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { - if (!xfeature_enabled(xfeature_nr)) { - WARN_ON_FPU(1); + u64 xcomp_bv = xsave->header.xcomp_bv; + + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; + + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { + if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) + return NULL; } - return (void *)xsave + xstate_comp_offsets[xfeature_nr]; + return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr); } + /* * Given the xsave area and a state inside, this function returns the * address of the state. @@ -994,8 +989,9 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) * We should not ever be requesting features that we * have not enabled. */ - WARN_ONCE(!(fpu_kernel_cfg.max_features & BIT_ULL(xfeature_nr)), - "get of unsupported state"); + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) + return NULL; + /* * This assumes the last 'xsave*' instruction to * have requested that 'xfeature_nr' be saved. -- Gitee From 8ac62c299d78994053e803c645ece56b399c34ae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 24 Mar 2022 14:47:14 +0100 Subject: [PATCH 1194/2161] x86/fpu/xstate: Handle supervisor states in XSTATE permissions commit 781c64bfcb735960717d1cb45428047ff6a5030c upstream. The size calculation in __xstate_request_perm() fails to take supervisor states into account because the permission bitmap is only relevant for user states. Up to 5.17 this does not matter because there are no supervisor states supported, but the (re-)enabling of ENQCMD makes them available. 
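The distinction matters because the permission bitmap is part of the user-visible ABI and only ever carries user states, while the kernel fpstate buffer also has to hold the enabled supervisor components. As a rough sketch of what "take supervisor states into account" means when sizing the host buffer (helper name illustrative; the kernel's own xfeatures_mask_supervisor() serves this purpose):

  /* Sketch: supervisor components are enabled features user space never requests. */
  static inline u64 supervisor_features_sketch(void)
  {
          return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_ALL;
  }

For host requests these bits are ORed into the mask before the kernel state size is calculated; guest fpstates have no supervisor features enabled, so they are left untouched.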
Fixes: 7c1ef59145f1 ("x86/cpufeatures: Re-enable ENQCMD") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220324134623.681768598@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b67ec14fe841..b8c4d6ee9d17 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1599,6 +1599,9 @@ static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) /* Calculate the resulting kernel state size */ mask = permitted | requested; + /* Take supervisor states into account on the host */ + if (!guest) + mask |= xfeatures_mask_supervisor(); ksize = xstate_calculate_size(mask, compacted); /* Calculate the resulting user state size */ -- Gitee From a4028bdf16213bfb5c1f3a1b3ad9940def6bb526 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2022 20:43:21 +0200 Subject: [PATCH 1195/2161] x86/fpu/xstate: Consolidate size calculations commit d6d6d50f1e801a790a242c80eeda261e36c43b7b upstream. Use the offset calculation to do the size calculation which avoids yet another series of CPUID instructions for each invocation. [ Fix the FP/SSE only case which missed to take the xstate header into account, as Reported-by: kernel test robot ] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/87o81pgbp2.ffs@tglx Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/fpu/xstate.c | 49 ++++++------------------------------ 1 file changed, 8 insertions(+), 41 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b8c4d6ee9d17..e13b4fea4d68 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -386,25 +386,6 @@ static void __init setup_init_fpu_buf(void) fxsave(&init_fpstate.regs.fxsave); } -static int xfeature_uncompacted_offset(int xfeature_nr) -{ - u32 eax, ebx, ecx, edx; - - /* - * Only XSAVES supports supervisor states and it uses compacted - * format. Checking a supervisor state's uncompacted offset is - * an error. - */ - if (XFEATURE_MASK_SUPERVISOR_ALL & BIT_ULL(xfeature_nr)) { - WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr); - return -1; - } - - CHECK_XFEATURE(xfeature_nr); - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); - return ebx; -} - int xfeature_size(int xfeature_nr) { u32 eax, ebx, ecx, edx; @@ -582,29 +563,15 @@ static bool __init check_xstate_against_struct(int nr) static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) { - unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; - int i; + unsigned int topmost = fls64(xfeatures) - 1; + unsigned int offset = xstate_offsets[topmost]; - for_each_extended_xfeature(i, xfeatures) { - /* Align from the end of the previous feature */ - if (xfeature_is_aligned64(i)) - size = ALIGN(size, 64); - /* - * In compacted format the enabled features are packed, - * i.e. disabled features do not occupy space. - * - * In non-compacted format the offsets are fixed and - * disabled states still occupy space in the memory buffer. 
- */ - if (!compacted) - size = xfeature_uncompacted_offset(i); - /* - * Add the feature size even for non-compacted format - * to make the end result correct - */ - size += xfeature_size(i); - } - return size; + if (topmost <= XFEATURE_SSE) + return sizeof(struct xregs_state); + + if (compacted) + offset = xfeature_get_offset(xfeatures, topmost); + return offset + xstate_sizes[topmost]; } /* -- Gitee From 4494ea5b57dd58210e369d617a149bb4c1b0eca6 Mon Sep 17 00:00:00 2001 From: Erik Kaneda Date: Fri, 4 Jun 2021 14:26:06 -0700 Subject: [PATCH 1196/2161] ACPICA: iASL: add disassembler support for PRMT commit d71df85aacd26fe4ac5fbfd383e01e7552ccfcc3 upstream. ACPICA commit f70e7593e37c9e29f19be8ad3ef93f3f34799368 Link: https://github.com/acpica/acpica/commit/f70e7593 Signed-off-by: Erik Kaneda Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/acpi/actbl2.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h index d068272f6e56..ca256f073699 100644 --- a/include/acpi/actbl2.h +++ b/include/acpi/actbl2.h @@ -39,6 +39,7 @@ #define ACPI_SIG_PDTT "PDTT" /* Platform Debug Trigger Table */ #define ACPI_SIG_PMTT "PMTT" /* Platform Memory Topology Table */ #define ACPI_SIG_PPTT "PPTT" /* Processor Properties Topology Table */ +#define ACPI_SIG_PRMT "PRMT" /* Platform Runtime Mechanism Table */ #define ACPI_SIG_RASF "RASF" /* RAS Feature table */ #define ACPI_SIG_SBST "SBST" /* Smart Battery Specification Table */ #define ACPI_SIG_SDEI "SDEI" /* Software Delegated Exception Interface Table */ @@ -1538,6 +1539,43 @@ struct acpi_pptt_id { u16 spin_rev; }; +/******************************************************************************* + * + * PRMT - Platform Runtime Mechanism Table + * Version 1 + * + ******************************************************************************/ + +struct acpi_table_prmt { + struct acpi_table_header header; /* Common ACPI table header */ +}; + +struct acpi_table_prmt_header { + u8 platform_guid[16]; + u32 module_info_offset; + u32 module_info_count; +}; + +struct acpi_prmt_module_info { + u16 revision; + u16 length; + u8 module_guid[16]; + u16 major_rev; + u16 minor_rev; + u16 handler_info_count; + u32 handler_info_offset; + u64 mmio_list_pointer; +}; + +struct acpi_prmt_handler_info { + u16 revision; + u16 length; + u8 handler_guid[16]; + u64 handler_address; + u64 static_data_buffer_address; + u64 acpi_param_buffer_address; +}; + /******************************************************************************* * * RASF - RAS Feature Table (ACPI 5.0) -- Gitee From 68aed671e8d9803a00c1624eae5924dfd8bc8938 Mon Sep 17 00:00:00 2001 From: Erik Kaneda Date: Thu, 4 Jun 2020 13:44:20 -0700 Subject: [PATCH 1197/2161] ACPICA: iASL: add new OperationRegion subtype keyword PlatformRtMechanism commit f083906fa9c1643e5c16b887cc29099fdfc3e875 upstream. ACPICA commit 2c2eefa827bd37297f5f9ca4b263fcba829aaf3f Link: https://github.com/acpica/acpica/commit/2c2eefa8 Signed-off-by: Erik Kaneda Signed-off-by: Bob Moore Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/acpica/dbdisply.c | 2 ++ drivers/acpi/acpica/utdecode.c | 3 ++- include/acpi/actypes.h | 3 ++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/acpica/dbdisply.c b/drivers/acpi/acpica/dbdisply.c index 30ab62b0fec8..33b3b8f9b219 100644 --- a/drivers/acpi/acpica/dbdisply.c +++ b/drivers/acpi/acpica/dbdisply.c @@ -51,6 +51,8 @@ static acpi_adr_space_type acpi_gbl_space_id_list[] = { ACPI_ADR_SPACE_IPMI, ACPI_ADR_SPACE_GPIO, ACPI_ADR_SPACE_GSBUS, + ACPI_ADR_SPACE_PLATFORM_COMM, + ACPI_ADR_SPACE_PLATFORM_RT, ACPI_ADR_SPACE_DATA_TABLE, ACPI_ADR_SPACE_FIXED_HARDWARE }; diff --git a/drivers/acpi/acpica/utdecode.c b/drivers/acpi/acpica/utdecode.c index 65beaa237669..1a41bd048eed 100644 --- a/drivers/acpi/acpica/utdecode.c +++ b/drivers/acpi/acpica/utdecode.c @@ -78,7 +78,8 @@ const char *acpi_gbl_region_types[ACPI_NUM_PREDEFINED_REGIONS] = { "IPMI", /* 0x07 */ "GeneralPurposeIo", /* 0x08 */ "GenericSerialBus", /* 0x09 */ - "PlatformCommChannel" /* 0x0A */ + "PlatformCommChannel", /* 0x0A */ + "PlatformRtMechanism" /* 0x0B */ }; const char *acpi_ut_get_region_name(u8 space_id) diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index 9373662cdb44..2b4276f73da2 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -815,8 +815,9 @@ typedef u8 acpi_adr_space_type; #define ACPI_ADR_SPACE_GPIO (acpi_adr_space_type) 8 #define ACPI_ADR_SPACE_GSBUS (acpi_adr_space_type) 9 #define ACPI_ADR_SPACE_PLATFORM_COMM (acpi_adr_space_type) 10 +#define ACPI_ADR_SPACE_PLATFORM_RT (acpi_adr_space_type) 11 -#define ACPI_NUM_PREDEFINED_REGIONS 11 +#define ACPI_NUM_PREDEFINED_REGIONS 12 /* * Special Address Spaces -- Gitee From e5073f162cc65eba1eee0dfec10877dc4fe413da Mon Sep 17 00:00:00 2001 From: Erik Kaneda Date: Fri, 4 Jun 2021 14:26:07 -0700 Subject: [PATCH 1198/2161] ACPICA: Add support for PlatformRtMechanism OperationRegion handler commit 04da290dd22c806c401913bcc1ed6356599b09c3 upstream. ACPICA commit cdf48b141d7da38e47fe4020310033ddd1971f9e Writing a buffer to a PlatformRtMechanism FieldUnit invokes a bidirectional transaction. The input buffer contains 26 bytes containing 9 bytes of status, a command byte and a 16-byte UUID. This change will will simply pass this incoming buffer to a handler registered by the OS. Link: https://github.com/acpica/acpica/commit/cdf48b14 Signed-off-by: Erik Kaneda Signed-off-by: Bob Moore Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/acpica/acutils.h | 2 ++ drivers/acpi/acpica/exfield.c | 8 +++++-- drivers/acpi/acpica/exserial.c | 12 ++++++++++ drivers/acpi/acpica/utuuid.c | 41 ++++++++++++++++++++++++++++++++++ include/acpi/acconfig.h | 2 ++ 5 files changed, 63 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/acpica/acutils.h b/drivers/acpi/acpica/acutils.h index 601808be86d1..201e2d0dc01e 100644 --- a/drivers/acpi/acpica/acutils.h +++ b/drivers/acpi/acpica/acutils.h @@ -735,6 +735,8 @@ const char *acpi_ah_match_uuid(u8 *data); */ #if (defined ACPI_ASL_COMPILER || defined ACPI_EXEC_APP || defined ACPI_HELP_APP) void acpi_ut_convert_string_to_uuid(char *in_string, u8 *uuid_buffer); + +acpi_status acpi_ut_convert_uuid_to_string(char *uuid_buffer, char *out_string); #endif #endif /* _ACUTILS_H */ diff --git a/drivers/acpi/acpica/exfield.c b/drivers/acpi/acpica/exfield.c index d3d2dbfba680..4de2c103a0f2 100644 --- a/drivers/acpi/acpica/exfield.c +++ b/drivers/acpi/acpica/exfield.c @@ -138,7 +138,9 @@ acpi_ex_read_data_from_field(struct acpi_walk_state *walk_state, || obj_desc->field.region_obj->region.space_id == ACPI_ADR_SPACE_GSBUS || obj_desc->field.region_obj->region.space_id == - ACPI_ADR_SPACE_IPMI)) { + ACPI_ADR_SPACE_IPMI + || obj_desc->field.region_obj->region.space_id == + ACPI_ADR_SPACE_PLATFORM_RT)) { /* SMBus, GSBus, IPMI serial */ @@ -295,7 +297,9 @@ acpi_ex_write_data_to_field(union acpi_operand_object *source_desc, || obj_desc->field.region_obj->region.space_id == ACPI_ADR_SPACE_GSBUS || obj_desc->field.region_obj->region.space_id == - ACPI_ADR_SPACE_IPMI)) { + ACPI_ADR_SPACE_IPMI + || obj_desc->field.region_obj->region.space_id == + ACPI_ADR_SPACE_PLATFORM_RT)) { /* SMBus, GSBus, IPMI serial */ diff --git a/drivers/acpi/acpica/exserial.c b/drivers/acpi/acpica/exserial.c index c5aa4b0deb70..fd36aaada6a0 100644 --- a/drivers/acpi/acpica/exserial.c +++ b/drivers/acpi/acpica/exserial.c @@ -195,6 +195,12 @@ acpi_ex_read_serial_bus(union acpi_operand_object *obj_desc, function = ACPI_READ | (accessor_type << 16); break; + case ACPI_ADR_SPACE_PLATFORM_RT: + + buffer_length = ACPI_PRM_INPUT_BUFFER_SIZE; + function = ACPI_READ; + break; + default: return_ACPI_STATUS(AE_AML_INVALID_SPACE_ID); } @@ -311,6 +317,12 @@ acpi_ex_write_serial_bus(union acpi_operand_object *source_desc, function = ACPI_WRITE | (accessor_type << 16); break; + case ACPI_ADR_SPACE_PLATFORM_RT: + + buffer_length = ACPI_PRM_INPUT_BUFFER_SIZE; + function = ACPI_WRITE; + break; + default: return_ACPI_STATUS(AE_AML_INVALID_SPACE_ID); } diff --git a/drivers/acpi/acpica/utuuid.c b/drivers/acpi/acpica/utuuid.c index 0a7cf8007643..1ccd9ceac51a 100644 --- a/drivers/acpi/acpica/utuuid.c +++ b/drivers/acpi/acpica/utuuid.c @@ -61,4 +61,45 @@ void acpi_ut_convert_string_to_uuid(char *in_string, u8 *uuid_buffer) 1]); } } + +/******************************************************************************* + * + * FUNCTION: acpi_ut_convert_uuid_to_string + * + * PARAMETERS: uuid_buffer - 16-byte UUID buffer + * out_string - 36-byte formatted UUID string + * + * RETURN: Status + * + * DESCRIPTION: Convert 16-byte UUID buffer to 36-byte formatted UUID string + * out_string must be 37 bytes to include null terminator. 
+ * + ******************************************************************************/ + +acpi_status acpi_ut_convert_uuid_to_string(char *uuid_buffer, char *out_string) +{ + u32 i; + + if (!uuid_buffer || !out_string) { + return (AE_BAD_PARAMETER); + } + + for (i = 0; i < UUID_BUFFER_LENGTH; i++) { + out_string[acpi_gbl_map_to_uuid_offset[i]] = + acpi_ut_hex_to_ascii_char(uuid_buffer[i], 4); + + out_string[acpi_gbl_map_to_uuid_offset[i] + 1] = + acpi_ut_hex_to_ascii_char(uuid_buffer[i], 0); + } + + /* Insert required hyphens (dashes) */ + + out_string[UUID_HYPHEN1_OFFSET] = + out_string[UUID_HYPHEN2_OFFSET] = + out_string[UUID_HYPHEN3_OFFSET] = + out_string[UUID_HYPHEN4_OFFSET] = '-'; + + out_string[UUID_STRING_LENGTH] = 0; /* Null terminate */ + return (AE_OK); +} #endif diff --git a/include/acpi/acconfig.h b/include/acpi/acconfig.h index 42d573172e53..a42a21c45d33 100644 --- a/include/acpi/acconfig.h +++ b/include/acpi/acconfig.h @@ -188,6 +188,8 @@ #define ACPI_MAX_GSBUS_DATA_SIZE 255 #define ACPI_MAX_GSBUS_BUFFER_SIZE ACPI_SERIAL_HEADER_SIZE + ACPI_MAX_GSBUS_DATA_SIZE +#define ACPI_PRM_INPUT_BUFFER_SIZE 26 + /* _sx_d and _sx_w control methods */ #define ACPI_NUM_sx_d_METHODS 4 -- Gitee From 23c5a2d6f6c545ea8cd82230cda5e16335bd5045 Mon Sep 17 00:00:00 2001 From: Erik Kaneda Date: Wed, 9 Jun 2021 20:41:51 -0700 Subject: [PATCH 1199/2161] ACPICA: Add PRMT module header to facilitate parsing commit 9f8c7baedabc9693fbd7890f8fda40578bde4f73 upstream. ACPICA commit bd46cb07e614fd85ea69e54c1f6f0ae0a5fb20ab This structure is used in to parse PRMT in other Operating Systems that relies on using subtable headers in order to parse ACPI tables. Although the PRMT doesn't have "subtables" it has a list of module information structures that act as subtables. Link: https://github.com/acpica/acpica/commit/bd46cb07 Signed-off-by: Erik Kaneda Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/acpi/actbl2.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h index ca256f073699..0470387f9b72 100644 --- a/include/acpi/actbl2.h +++ b/include/acpi/actbl2.h @@ -1556,6 +1556,11 @@ struct acpi_table_prmt_header { u32 module_info_count; }; +struct acpi_prmt_module_header { + u16 revision; + u16 length; +}; + struct acpi_prmt_module_info { u16 revision; u16 length; -- Gitee From ad03ca994ff0746ea484384b7ed5e1391a715a62 Mon Sep 17 00:00:00 2001 From: Erik Kaneda Date: Wed, 9 Jun 2021 20:41:52 -0700 Subject: [PATCH 1200/2161] ACPI: PRM: implement OperationRegion handler for the PlatformRtMechanism subtype commit cefc7ca46235f01d5233e3abd4b79452af01d9e9 upstream. Platform Runtime Mechanism (PRM) is a firmware interface that exposes a set of binary executables that can either be called from the AML interpreter or device drivers by bypassing the AML interpreter. This change implements the AML interpreter path. According to the specification [1], PRM services are listed in an ACPI table called the PRMT. This patch parses module and handler information listed in the PRMT and registers the PlatformRtMechanism OpRegion handler before ACPI tables are loaded. Each service is defined by a 16-byte GUID and called from writing a 26-byte ASL buffer containing the identifier to a FieldUnit object defined inside a PlatformRtMechanism OperationRegion. 
OperationRegion (PRMR, PlatformRtMechanism, 0, 26) Field (PRMR, BufferAcc, NoLock, Preserve) { PRMF, 208 // Write to this field to invoke the OperationRegion Handler } The 26-byte ASL buffer is defined as the following: Byte Offset Byte Length Description ============================================================= 0 1 PRM OperationRegion handler status 1 8 PRM service status 9 1 PRM command 10 16 PRM handler GUID The ASL caller fills out a 26-byte buffer containing the PRM command and the PRM handler GUID like so: /* Local0 is the PRM data buffer */ Local0 = buffer (26){} /* Create byte fields over the buffer */ CreateByteField (Local0, 0x9, CMD) CreateField (Local0, 0x50, 0x80, GUID) /* Fill in the command and data fields of the data buffer */ CMD = 0 // run command GUID = ToUUID("xxxx-xx-xxx-xxxx") /* * Invoke PRM service with an ID that matches GUID and save the * result. */ Local0 = (\_SB.PRMT.PRMF = Local0) Byte offset 0 - 8 are written by the handler as a status passed back to AML and used by ASL like so: /* Create byte fields over the buffer */ CreateByteField (Local0, 0x0, PSTA) CreateQWordField (Local0, 0x1, USTA) In this ASL code, PSTA contains a status from the OperationRegion and USTA contains a status from the PRM service. The 26-byte buffer is recieved by acpi_platformrt_space_handler. This handler will look at the command value and the handler guid and take the approperiate actions. Command value Action ===================================================================== 0 Run the PRM service indicated by the PRM handler GUID (bytes 10-26) 1 Prevent PRM runtime updates from happening to the service's parent module 2 Allow PRM updates from happening to the service's parent module This patch enables command value 0. Link: https://uefi.org/sites/default/files/resources/Platform%20Runtime%20Mechanism%20-%20with%20legal%20notice.pdf # [1] Signed-off-by: Erik Kaneda [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/Kconfig | 5 + drivers/acpi/Makefile | 1 + drivers/acpi/bus.c | 2 + drivers/acpi/prmt.c | 303 ++++++++++++++++++++++++++++++++++++++++++ drivers/acpi/tables.c | 9 ++ include/linux/acpi.h | 1 + include/linux/prmt.h | 7 + 7 files changed, 328 insertions(+) create mode 100644 drivers/acpi/prmt.c create mode 100644 include/linux/prmt.h diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 5559ef264060..3df2594eac1b 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -594,3 +594,8 @@ config X86_PM_TIMER You should nearly always say Y here because many modern systems require this timer. 
+ +config ACPI_PRMT + bool "Platform Runtime Mechanism Support" + depends on EFI && X86_64 + default y diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index ef1ac4d127da..dbfa2448060f 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -61,6 +61,7 @@ acpi-y += acpi_lpat.o acpi-$(CONFIG_ACPI_LPIT) += acpi_lpit.o acpi-$(CONFIG_ACPI_GENERIC_GSI) += irq.o acpi-$(CONFIG_ACPI_WATCHDOG) += acpi_watchdog.o +acpi-$(CONFIG_ACPI_PRMT) += prmt.o # Address translation acpi-$(CONFIG_ACPI_ADXL) += acpi_adxl.o diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 54002670cb7a..c3c8934f61bd 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "internal.h" @@ -1238,6 +1239,7 @@ static int __init acpi_init(void) acpi_kobj = NULL; } + init_prmt(); result = acpi_bus_init(); if (result) { disable_acpi(); diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c new file mode 100644 index 000000000000..33c274698d07 --- /dev/null +++ b/drivers/acpi/prmt.c @@ -0,0 +1,303 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Author: Erik Kaneda + * Copyright 2020 Intel Corporation + * + * prmt.c + * + * Each PRM service is an executable that is run in a restricted environment + * that is invoked by writing to the PlatformRtMechanism OperationRegion from + * AML bytecode. + * + * init_prmt initializes the Platform Runtime Mechanism (PRM) services by + * processing data in the PRMT as well as registering an ACPI OperationRegion + * handler for the PlatformRtMechanism subtype. + * + */ +#include +#include +#include +#include +#include + +#pragma pack(1) +struct prm_mmio_addr_range { + u64 phys_addr; + u64 virt_addr; + u32 length; +}; + +struct prm_mmio_info { + u64 mmio_count; + struct prm_mmio_addr_range addr_ranges[]; +}; + +struct prm_buffer { + u8 prm_status; + u64 efi_status; + u8 prm_cmd; + guid_t handler_guid; +}; + +struct prm_context_buffer { + char signature[ACPI_NAMESEG_SIZE]; + u16 revision; + u16 reserved; + guid_t identifier; + u64 static_data_buffer; + struct prm_mmio_info *mmio_ranges; +}; +#pragma pack() + + +LIST_HEAD(prm_module_list); + +struct prm_handler_info { + guid_t guid; + u64 handler_addr; + u64 static_data_buffer_addr; + u64 acpi_param_buffer_addr; + + struct list_head handler_list; +}; + +struct prm_module_info { + guid_t guid; + u16 major_rev; + u16 minor_rev; + u16 handler_count; + struct prm_mmio_info *mmio_info; + bool updatable; + + struct list_head module_list; + struct prm_handler_info handlers[]; +}; + + +static u64 efi_pa_va_lookup(u64 pa) +{ + efi_memory_desc_t *md; + u64 pa_offset = pa & ~PAGE_MASK; + u64 page = pa & PAGE_MASK; + + for_each_efi_memory_desc(md) { + if (md->phys_addr < pa && pa < md->phys_addr + PAGE_SIZE * md->num_pages) + return pa_offset + md->virt_addr + page - md->phys_addr; + } + + return 0; +} + + +#define get_first_handler(a) ((struct acpi_prmt_handler_info *) ((char *) (a) + a->handler_info_offset)) +#define get_next_handler(a) ((struct acpi_prmt_handler_info *) (sizeof(struct acpi_prmt_handler_info) + (char *) a)) + +static int __init +acpi_parse_prmt(union acpi_subtable_headers *header, const unsigned long end) +{ + struct acpi_prmt_module_info *module_info; + struct acpi_prmt_handler_info *handler_info; + struct prm_handler_info *th; + struct prm_module_info *tm; + u64 mmio_count = 0; + u64 cur_handler = 0; + u32 module_info_size = 0; + u64 mmio_range_size = 0; + void *temp_mmio; + + module_info = (struct acpi_prmt_module_info *) header; + module_info_size = 
struct_size(tm, handlers, module_info->handler_info_count); + tm = kmalloc(module_info_size, GFP_KERNEL); + + guid_copy(&tm->guid, (guid_t *) module_info->module_guid); + tm->major_rev = module_info->major_rev; + tm->minor_rev = module_info->minor_rev; + tm->handler_count = module_info->handler_info_count; + tm->updatable = true; + + if (module_info->mmio_list_pointer) { + /* + * Each module is associated with a list of addr + * ranges that it can use during the service + */ + mmio_count = *(u64 *) memremap(module_info->mmio_list_pointer, 8, MEMREMAP_WB); + mmio_range_size = struct_size(tm->mmio_info, addr_ranges, mmio_count); + tm->mmio_info = kmalloc(mmio_range_size, GFP_KERNEL); + temp_mmio = memremap(module_info->mmio_list_pointer, mmio_range_size, MEMREMAP_WB); + memmove(tm->mmio_info, temp_mmio, mmio_range_size); + } else { + mmio_range_size = struct_size(tm->mmio_info, addr_ranges, mmio_count); + tm->mmio_info = kmalloc(mmio_range_size, GFP_KERNEL); + tm->mmio_info->mmio_count = 0; + } + + INIT_LIST_HEAD(&tm->module_list); + list_add(&tm->module_list, &prm_module_list); + + handler_info = get_first_handler(module_info); + do { + th = &tm->handlers[cur_handler]; + + guid_copy(&th->guid, (guid_t *)handler_info->handler_guid); + th->handler_addr = efi_pa_va_lookup(handler_info->handler_address); + th->static_data_buffer_addr = efi_pa_va_lookup(handler_info->static_data_buffer_address); + th->acpi_param_buffer_addr = efi_pa_va_lookup(handler_info->acpi_param_buffer_address); + } while (++cur_handler < tm->handler_count && (handler_info = get_next_handler(handler_info))); + + return 0; +} + +#define GET_MODULE 0 +#define GET_HANDLER 1 + +static void *find_guid_info(const guid_t *guid, u8 mode) +{ + struct prm_handler_info *cur_handler; + struct prm_module_info *cur_module; + int i = 0; + + list_for_each_entry(cur_module, &prm_module_list, module_list) { + for (i = 0; i < cur_module->handler_count; ++i) { + cur_handler = &cur_module->handlers[i]; + if (guid_equal(guid, &cur_handler->guid)) { + if (mode == GET_MODULE) + return (void *)cur_module; + else + return (void *)cur_handler; + } + } + } + + return NULL; +} + + +static struct prm_module_info *find_prm_module(const guid_t *guid) +{ + return (struct prm_module_info *)find_guid_info(guid, GET_MODULE); +} + +static struct prm_handler_info *find_prm_handler(const guid_t *guid) +{ + return (struct prm_handler_info *) find_guid_info(guid, GET_HANDLER); +} + +/* In-coming PRM commands */ + +#define PRM_CMD_RUN_SERVICE 0 +#define PRM_CMD_START_TRANSACTION 1 +#define PRM_CMD_END_TRANSACTION 2 + +/* statuses that can be passed back to ASL */ + +#define PRM_HANDLER_SUCCESS 0 +#define PRM_HANDLER_ERROR 1 +#define INVALID_PRM_COMMAND 2 +#define PRM_HANDLER_GUID_NOT_FOUND 3 +#define UPDATE_LOCK_ALREADY_HELD 4 +#define UPDATE_UNLOCK_WITHOUT_LOCK 5 + +/* + * This is the PlatformRtMechanism opregion space handler. + * @function: indicates the read/write. In fact as the PlatformRtMechanism + * message is driven by command, only write is meaningful. + * + * @addr : not used + * @bits : not used. + * @value : it is an in/out parameter. It points to the PRM message buffer. 
+ * @handler_context: not used + */ +static acpi_status acpi_platformrt_space_handler(u32 function, + acpi_physical_address addr, + u32 bits, acpi_integer *value, + void *handler_context, + void *region_context) +{ + struct prm_buffer *buffer = ACPI_CAST_PTR(struct prm_buffer, value); + struct prm_handler_info *handler; + struct prm_module_info *module; + efi_status_t status; + struct prm_context_buffer context; + + /* + * The returned acpi_status will always be AE_OK. Error values will be + * saved in the first byte of the PRM message buffer to be used by ASL. + */ + switch (buffer->prm_cmd) { + case PRM_CMD_RUN_SERVICE: + + handler = find_prm_handler(&buffer->handler_guid); + module = find_prm_module(&buffer->handler_guid); + if (!handler || !module) + goto invalid_guid; + + ACPI_COPY_NAMESEG(context.signature, "PRMC"); + context.revision = 0x0; + context.reserved = 0x0; + context.identifier = handler->guid; + context.static_data_buffer = handler->static_data_buffer_addr; + context.mmio_ranges = module->mmio_info; + + status = efi_call_virt_pointer(handler, handler_addr, + handler->acpi_param_buffer_addr, + &context); + if (status == EFI_SUCCESS) { + buffer->prm_status = PRM_HANDLER_SUCCESS; + } else { + buffer->prm_status = PRM_HANDLER_ERROR; + buffer->efi_status = status; + } + break; + + case PRM_CMD_START_TRANSACTION: + + module = find_prm_module(&buffer->handler_guid); + if (!module) + goto invalid_guid; + + if (module->updatable) + module->updatable = false; + else + buffer->prm_status = UPDATE_LOCK_ALREADY_HELD; + break; + + case PRM_CMD_END_TRANSACTION: + + module = find_prm_module(&buffer->handler_guid); + if (!module) + goto invalid_guid; + + if (module->updatable) + buffer->prm_status = UPDATE_UNLOCK_WITHOUT_LOCK; + else + module->updatable = true; + break; + + default: + + buffer->prm_status = INVALID_PRM_COMMAND; + break; + } + + return AE_OK; + +invalid_guid: + buffer->prm_status = PRM_HANDLER_GUID_NOT_FOUND; + return AE_OK; +} + +void __init init_prmt(void) +{ + acpi_status status; + int mc = acpi_table_parse_entries(ACPI_SIG_PRMT, sizeof(struct acpi_table_prmt) + + sizeof (struct acpi_table_prmt_header), + 0, acpi_parse_prmt, 0); + pr_info("PRM: found %u modules\n", mc); + + status = acpi_install_address_space_handler(ACPI_ROOT_OBJECT, + ACPI_ADR_SPACE_PLATFORM_RT, + &acpi_platformrt_space_handler, + NULL, NULL); + if (ACPI_FAILURE(status)) + pr_alert("PRM: OperationRegion handler could not be installed\n"); +} diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index b2cafa37df7c..1abb5f92f127 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -39,6 +39,7 @@ static int acpi_apic_instance __initdata; enum acpi_subtable_type { ACPI_SUBTABLE_COMMON, ACPI_SUBTABLE_HMAT, + ACPI_SUBTABLE_PRMT, }; struct acpi_subtable_entry { @@ -222,6 +223,8 @@ acpi_get_entry_type(struct acpi_subtable_entry *entry) return entry->hdr->common.type; case ACPI_SUBTABLE_HMAT: return entry->hdr->hmat.type; + case ACPI_SUBTABLE_PRMT: + return 0; } return 0; } @@ -234,6 +237,8 @@ acpi_get_entry_length(struct acpi_subtable_entry *entry) return entry->hdr->common.length; case ACPI_SUBTABLE_HMAT: return entry->hdr->hmat.length; + case ACPI_SUBTABLE_PRMT: + return entry->hdr->prmt.length; } return 0; } @@ -246,6 +251,8 @@ acpi_get_subtable_header_length(struct acpi_subtable_entry *entry) return sizeof(entry->hdr->common); case ACPI_SUBTABLE_HMAT: return sizeof(entry->hdr->hmat); + case ACPI_SUBTABLE_PRMT: + return sizeof(entry->hdr->prmt); } return 0; } @@ -255,6 +262,8 @@ 
acpi_get_subtable_type(char *id) { if (strncmp(id, ACPI_SIG_HMAT, 4) == 0) return ACPI_SUBTABLE_HMAT; + if (strncmp(id, ACPI_SIG_PRMT, 4) == 0) + return ACPI_SUBTABLE_PRMT; return ACPI_SUBTABLE_COMMON; } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index fb43a7940a68..7563577f337c 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -132,6 +132,7 @@ enum acpi_address_range_id { union acpi_subtable_headers { struct acpi_subtable_header common; struct acpi_hmat_structure hmat; + struct acpi_prmt_module_header prmt; }; typedef int (*acpi_tbl_table_handler)(struct acpi_table_header *table); diff --git a/include/linux/prmt.h b/include/linux/prmt.h new file mode 100644 index 000000000000..24da8364b919 --- /dev/null +++ b/include/linux/prmt.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifdef CONFIG_ACPI_PRMT +void init_prmt(void); +#else +static inline void init_prmt(void) { } +#endif -- Gitee From 78608f003fc1ca42e02040a066e8da155140b04f Mon Sep 17 00:00:00 2001 From: Erik Kaneda Date: Wed, 9 Jun 2021 20:41:53 -0700 Subject: [PATCH 1201/2161] ACPI: Add \_SB._OSC bit for PRM commit opencloudos. Signed-off-by: Erik Kaneda Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/bus.c | 1 + include/linux/acpi.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index c3c8934f61bd..0ad5f1e55096 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -303,6 +303,7 @@ static void acpi_bus_osc_support(void) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_HOTPLUG_OST_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PCLPI_SUPPORT; + capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PRM_SUPPORT; #ifdef CONFIG_X86 if (boot_cpu_has(X86_FEATURE_HWP)) { diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 7563577f337c..26556d201796 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -530,6 +530,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); #define OSC_SB_PCLPI_SUPPORT 0x00000080 #define OSC_SB_OSLPI_SUPPORT 0x00000100 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT 0x00001000 +#define OSC_SB_PRM_SUPPORT 0x00020000 extern bool osc_sb_apei_support_acked; extern bool osc_pc_lpi_support_confirmed; -- Gitee From 8d5510869ac5f84e98a3c9d94f542368384a2bcb Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Mon, 16 May 2022 17:07:31 +0800 Subject: [PATCH 1202/2161] ACPI: Correct \_SB._OSC bit definition for PRM commit opencloudos. According to Platform Runtime Mechanism Specification v1.0 [1], Page 42, \_SB._OSC bit 21 is used to indicate OS support for PRM. Update the definition of the PRM support bit in the code to match the specification. Link: https://uefi.org/sites/default/files/resources/Platform%20Runtime%20Mechanism%20-%20with%20legal%20notice.pdf # [1] Fixes: 60faa8f1ac6e ("ACPI: Add \_SB._OSC bit for PRM") Signed-off-by: Aubrey Li [ rjw: Changelog edits ] Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/acpi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 26556d201796..dd0f2e4b79a7 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -530,7 +530,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); #define OSC_SB_PCLPI_SUPPORT 0x00000080 #define OSC_SB_OSLPI_SUPPORT 0x00000100 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT 0x00001000 -#define OSC_SB_PRM_SUPPORT 0x00020000 +#define OSC_SB_PRM_SUPPORT 0x00200000 extern bool osc_sb_apei_support_acked; extern bool osc_pc_lpi_support_confirmed; -- Gitee From 43b02cd10b1659986007598507636beaf8c832fc Mon Sep 17 00:00:00 2001 From: Aubrey Li Date: Fri, 2 Jul 2021 15:03:50 +0800 Subject: [PATCH 1203/2161] ACPI: Do not singal PRM support if not enabled commit 392ed6a789803fbfd49994e95fe99cd07b07eb87 upstream. If the OS confirms PRM (Platform Runtime Mechanism) support through the \_SB._OSC PRM bit, the BIOS may start relying on the presence of PRM support in the OS, so prevent the PRM bit from being set in the \_SB._OSC capabilities bitmask when PRM support is not built in so as to avoid confusing the BIOS in that case. Fixes: 60faa8f1ac6e ("ACPI: Add \_SB._OSC bit for PRM") Signed-off-by: Aubrey Li [ rjw: Rewrite subject and changelog, replace #ifdef with if (IS_ENABLED()) ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/bus.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 0ad5f1e55096..ccfa8d8f4834 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -303,7 +303,8 @@ static void acpi_bus_osc_support(void) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_HOTPLUG_OST_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PCLPI_SUPPORT; - capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PRM_SUPPORT; + if (IS_ENABLED(CONFIG_ACPI_PRMT)) + capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PRM_SUPPORT; #ifdef CONFIG_X86 if (boot_cpu_has(X86_FEATURE_HWP)) { -- Gitee From 2f754b7417f9cc71bd010f4e348c689d5c852b11 Mon Sep 17 00:00:00 2001 From: Aubrey Li Date: Fri, 6 Aug 2021 08:46:24 +0800 Subject: [PATCH 1204/2161] ACPI: PRM: Deal with table not present or no module found commit 2bbfa0addd63fd06756b7af8bf146ae166e2abf5 upstream. On the system PRMT table is not present, dmesg output: $ dmesg | grep PRM [ 1.532237] ACPI: PRMT not present [ 1.532237] PRM: found 4294967277 modules The result of acpi_table_parse_entries need to be checked and return immediately if PRMT table is not present or no PRM module found. Signed-off-by: Aubrey Li Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/prmt.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c index 33c274698d07..9395a0fe038f 100644 --- a/drivers/acpi/prmt.c +++ b/drivers/acpi/prmt.c @@ -292,6 +292,12 @@ void __init init_prmt(void) int mc = acpi_table_parse_entries(ACPI_SIG_PRMT, sizeof(struct acpi_table_prmt) + sizeof (struct acpi_table_prmt_header), 0, acpi_parse_prmt, 0); + /* + * Return immediately if PRMT table is not present or no PRM module found. 
+ */ + if (mc <= 0) + return; + pr_info("PRM: found %u modules\n", mc); status = acpi_install_address_space_handler(ACPI_ROOT_OBJECT, -- Gitee From 8162c2990521c2f8a362ec6679be81f1addcbf06 Mon Sep 17 00:00:00 2001 From: Aubrey Li Date: Wed, 8 Sep 2021 18:55:45 +0800 Subject: [PATCH 1205/2161] ACPI: PRM: Find PRMT table before parsing it commit 3265cc3ec52e75fc8daf189954cebda27ad26b2e upstream. Find and verify PRMT before parsing it, which eliminates a warning on machines without PRMT: [ 7.197173] ACPI: PRMT not present Fixes: cefc7ca46235 ("ACPI: PRM: implement OperationRegion handler for the PlatformRtMechanism subtype") Signed-off-by: Aubrey Li Tested-by: Paul Menzel Cc: 5.14+ # 5.14+ [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/prmt.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c index 9395a0fe038f..960ed630168c 100644 --- a/drivers/acpi/prmt.c +++ b/drivers/acpi/prmt.c @@ -288,10 +288,18 @@ static acpi_status acpi_platformrt_space_handler(u32 function, void __init init_prmt(void) { + struct acpi_table_header *tbl; acpi_status status; - int mc = acpi_table_parse_entries(ACPI_SIG_PRMT, sizeof(struct acpi_table_prmt) + + int mc; + + status = acpi_get_table(ACPI_SIG_PRMT, 0, &tbl); + if (ACPI_FAILURE(status)) + return; + + mc = acpi_table_parse_entries(ACPI_SIG_PRMT, sizeof(struct acpi_table_prmt) + sizeof (struct acpi_table_prmt_header), 0, acpi_parse_prmt, 0); + acpi_put_table(tbl); /* * Return immediately if PRMT table is not present or no PRM module found. */ -- Gitee From 8ed1ba6cdd1db98cbacbd48aeae17ccdc48b9f96 Mon Sep 17 00:00:00 2001 From: Aubrey Li Date: Wed, 20 Oct 2021 11:23:17 +0800 Subject: [PATCH 1206/2161] ACPI: PRM: Handle memory allocation and memory remap failure commit c52ca713279da78881d36b49acd36288a46bbec8 upstream. Handle memory allocation and memory remap failure in acpi_parse_prmt() when system runs out of memory to avoid the potential NULL pointer dereference errors. Signed-off-by: Aubrey Li Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/prmt.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c index 960ed630168c..6633cdc5d4ac 100644 --- a/drivers/acpi/prmt.c +++ b/drivers/acpi/prmt.c @@ -99,7 +99,7 @@ acpi_parse_prmt(union acpi_subtable_headers *header, const unsigned long end) struct acpi_prmt_handler_info *handler_info; struct prm_handler_info *th; struct prm_module_info *tm; - u64 mmio_count = 0; + u64 *mmio_count; u64 cur_handler = 0; u32 module_info_size = 0; u64 mmio_range_size = 0; @@ -108,6 +108,8 @@ acpi_parse_prmt(union acpi_subtable_headers *header, const unsigned long end) module_info = (struct acpi_prmt_module_info *) header; module_info_size = struct_size(tm, handlers, module_info->handler_info_count); tm = kmalloc(module_info_size, GFP_KERNEL); + if (!tm) + goto parse_prmt_out1; guid_copy(&tm->guid, (guid_t *) module_info->module_guid); tm->major_rev = module_info->major_rev; @@ -120,14 +122,24 @@ acpi_parse_prmt(union acpi_subtable_headers *header, const unsigned long end) * Each module is associated with a list of addr * ranges that it can use during the service */ - mmio_count = *(u64 *) memremap(module_info->mmio_list_pointer, 8, MEMREMAP_WB); - mmio_range_size = struct_size(tm->mmio_info, addr_ranges, mmio_count); + mmio_count = (u64 *) memremap(module_info->mmio_list_pointer, 8, MEMREMAP_WB); + if (!mmio_count) + goto parse_prmt_out2; + + mmio_range_size = struct_size(tm->mmio_info, addr_ranges, *mmio_count); tm->mmio_info = kmalloc(mmio_range_size, GFP_KERNEL); + if (!tm->mmio_info) + goto parse_prmt_out3; + temp_mmio = memremap(module_info->mmio_list_pointer, mmio_range_size, MEMREMAP_WB); + if (!temp_mmio) + goto parse_prmt_out4; memmove(tm->mmio_info, temp_mmio, mmio_range_size); } else { - mmio_range_size = struct_size(tm->mmio_info, addr_ranges, mmio_count); - tm->mmio_info = kmalloc(mmio_range_size, GFP_KERNEL); + tm->mmio_info = kmalloc(sizeof(*tm->mmio_info), GFP_KERNEL); + if (!tm->mmio_info) + goto parse_prmt_out2; + tm->mmio_info->mmio_count = 0; } @@ -145,6 +157,15 @@ acpi_parse_prmt(union acpi_subtable_headers *header, const unsigned long end) } while (++cur_handler < tm->handler_count && (handler_info = get_next_handler(handler_info))); return 0; + +parse_prmt_out4: + kfree(tm->mmio_info); +parse_prmt_out3: + memunmap(mmio_count); +parse_prmt_out2: + kfree(tm); +parse_prmt_out1: + return -ENOMEM; } #define GET_MODULE 0 -- Gitee From 838c82719792200c197445bc067718d6f1e308ec Mon Sep 17 00:00:00 2001 From: Aubrey Li Date: Fri, 6 Aug 2021 09:57:24 +0800 Subject: [PATCH 1207/2161] ACPI/PRM: Obtain PRM handler revision from firmware commit opencloudos. PRM handler revision is useful for the debug purpose of PRM information dump and PRM runtime update. Combined with PRM module revision, if the version of a PRM handler to be updated is not greater than the current version, the update of this PRM handler will be ignored. 
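The gating policy described here boils down to a strictly-greater comparison on the new u16 rev field. A minimal sketch, with a hypothetical helper name (the real module-level check, prm_compare_version(), only lands later in this series):

    /*
     * Illustrative helper, not part of this patch: accept a handler
     * update only when its revision is strictly newer than the one
     * currently registered for the same handler GUID.
     */
    static bool prm_handler_update_allowed(u16 cur_rev, u16 new_rev)
    {
            return new_rev > cur_rev;
    }
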
Signed-off-by: Aubrey Li Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/prmt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c index 6633cdc5d4ac..f9a0f7fd71c6 100644 --- a/drivers/acpi/prmt.c +++ b/drivers/acpi/prmt.c @@ -54,6 +54,7 @@ LIST_HEAD(prm_module_list); struct prm_handler_info { guid_t guid; + u16 rev; u64 handler_addr; u64 static_data_buffer_addr; u64 acpi_param_buffer_addr; @@ -151,6 +152,7 @@ acpi_parse_prmt(union acpi_subtable_headers *header, const unsigned long end) th = &tm->handlers[cur_handler]; guid_copy(&th->guid, (guid_t *)handler_info->handler_guid); + th->rev = handler_info->revision; th->handler_addr = efi_pa_va_lookup(handler_info->handler_address); th->static_data_buffer_addr = efi_pa_va_lookup(handler_info->static_data_buffer_address); th->acpi_param_buffer_addr = efi_pa_va_lookup(handler_info->acpi_param_buffer_address); -- Gitee From 683c6874d94083ea1df9860ad13592fc8644b9a4 Mon Sep 17 00:00:00 2001 From: Aubrey Li Date: Wed, 25 Aug 2021 13:49:51 +0800 Subject: [PATCH 1208/2161] ACPI/PRM: Add module GUID match to find a module commit opencloudos. The current method to find a PRM module or a PRM handler is to match the target PRM handler GUID. Add a module GUID matching method to find a module. Signed-off-by: Aubrey Li Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/prmt.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c index f9a0f7fd71c6..5e629cbc69f0 100644 --- a/drivers/acpi/prmt.c +++ b/drivers/acpi/prmt.c @@ -180,6 +180,14 @@ static void *find_guid_info(const guid_t *guid, u8 mode) int i = 0; list_for_each_entry(cur_module, &prm_module_list, module_list) { + /* + * Module GUID match + */ + if (mode == GET_MODULE && guid_equal(guid, &cur_module->guid)) + return (void *)cur_module; + /* + * Handler GUID match + */ for (i = 0; i < cur_module->handler_count; ++i) { cur_handler = &cur_module->handlers[i]; if (guid_equal(guid, &cur_handler->guid)) { -- Gitee From cd3e357b470af6eee1cf395beee4c871c35181b0 Mon Sep 17 00:00:00 2001 From: Aubrey Li Date: Mon, 11 Oct 2021 14:52:15 +0800 Subject: [PATCH 1209/2161] ACPI/PRM: Add a sysfs interface for runtime update commit opencloudos. Add a sysfs interface for PRM runtime update. The current PRM module information can be obtain by: $ cat /sys/firmware/acpi/prm_update Module GUID : 7a3871b6-2820-4bc8-9b7d-7811ba3be347 Major Rev : 1 Minor Rev : 0 Handler Count: 1 Handler GUID: 1de4055d-d2f3-4e11-b7d9-7d6c19173fee Rev: 0 Module GUID : 501e9589-8e8e-4daf-a466-09422613199d Major Rev : 1 Minor Rev : 0 Handler Count: 3 Handler GUID: 77cfbb8c-8f09-4c3b-b7e8-103a65101c60 Rev: 0 Handler GUID: 5402385f-f04b-44d8-984c-bef829ac2ae0 Rev: 0 Handler GUID: f6a58d47-e04f-4f5a-86b8-2a50d4aa109b Rev: 0 PRM image will be loaded from /lib/firmware/prm.efi into the kernel memory. Then invoke PRM runtime update by: $echo 1 > /sys/firmware/acpi/prm_update PRM module and handler export descriptors will be dumped for sanity check. 
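Writing 1 to the node makes the kernel pull the image from /lib/firmware through the firmware loader. A minimal sketch of that contract, assuming the blob keeps the prm.efi name used here and using a hypothetical function name:

    #include <linux/device.h>
    #include <linux/firmware.h>
    #include <linux/printk.h>

    /* Sketch only, not the driver's code: fetch and release a blob. */
    static int load_prm_blob(struct device *dev)
    {
            const struct firmware *fw;
            int err;

            /* _direct variant: fail fast, no user-mode helper fallback */
            err = request_firmware_direct(&fw, "prm.efi", dev);
            if (err)
                    return err;

            /* fw->data and fw->size stay valid until release_firmware() */
            pr_info("PRM: loaded %zu bytes\n", fw->size);

            release_firmware(fw);
            return 0;
    }

The patch below wires this same flow into prm_update_store() through the fake platform device it registers, dumping (and, in later patches, parsing) fw->data before the buffer is released.
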
The export descriptor information can be obtained from kernel log: [ 375.024671] PRM: runtime update [ 375.029099] Platform guid : d7eaf54d-c9b9-4075-89f0-71943dbcfa61 [ 375.029106] Module signature : PRM_MEDT [ 375.029109] Module revision : 0 [ 375.029111] Module identifier: 501e9589-8e8e-4daf-a466-09422613199d [ 375.029113] Handler count : 3 [ 375.029114] handler guid : 77cfbb8c-8f09-4c3b-b7e8-103a65101c60 [ 375.029116] handler name : PrmHandler1 [ 375.029118] handler guid : 5402385f-f04b-44d8-984c-bef829ac2ae0 [ 375.029120] handler name : PrmHandler2 [ 375.029122] handler guid : f6a58d47-e04f-4f5a-86b8-2a50d4aa109b [ 375.029123] handler name : PrmHandlerN Signed-off-by: Aubrey Li Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/prmt.c | 146 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 145 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c index 5e629cbc69f0..8cd7d51b6107 100644 --- a/drivers/acpi/prmt.c +++ b/drivers/acpi/prmt.c @@ -18,6 +18,9 @@ #include #include #include +#include +#include +#include #include #pragma pack(1) @@ -47,8 +50,21 @@ struct prm_context_buffer { u64 static_data_buffer; struct prm_mmio_info *mmio_ranges; }; -#pragma pack() +struct prm_handler_export_descriptor { + guid_t handler_guid; + char handler_name[128]; +}; + +struct prm_module_export_descriptor { + char signature[8]; + u16 revision; + u16 handler_count; + guid_t platform_guid; + guid_t identifier; + struct prm_handler_export_descriptor handlers[]; +}; +#pragma pack() LIST_HEAD(prm_module_list); @@ -74,6 +90,15 @@ struct prm_module_info { struct prm_handler_info handlers[]; }; +/* fake device for request_firmware */ +static struct platform_device *prm_pdev; + +enum prm_state { + PRM_STS_OK = 0, + PRM_STS_NFOUND, + PRM_STS_ERR, + PRM_STS_MAX, +}; static u64 efi_pa_va_lookup(u64 pa) { @@ -317,6 +342,118 @@ static acpi_status acpi_platformrt_space_handler(u32 function, return AE_OK; } +static int prm_dump_raw_image(const u8 *data, int size) +{ + struct prm_module_export_descriptor *med; + struct prm_handler_export_descriptor *hed; + unsigned char *signature = "PRM_MEDT"; + int i, med_offset, sig_size = 8; + + /* + * Scan "PRM_MEDT" string for module export descriptor structure. 
+ */ + for (i = 0; i <= size - sig_size; i++) { + if (memcmp(data + i, signature, sig_size) == 0) { + med_offset = i; + pr_info("PRM: found module export descriptor structure, offset = 0x%x\n", i); + break; + } + } + + if (i > size - sig_size) { + pr_err("PRM: no module export descriptor structure found\n"); + return PRM_STS_NFOUND; + } + + med = (struct prm_module_export_descriptor *)(data + med_offset); + pr_info("Platform guid : %pUl\n", &med->platform_guid); + pr_info("Module signature : %s\n", med->signature); + pr_info("Module revision : %d\n", med->revision); + pr_info("Module identifier: %pUl\n", &med->identifier); + pr_info("Handler count : %d\n", med->handler_count); + + hed = med->handlers; + for(i = 0; i < med->handler_count; i++) { + pr_info(" handler guid : %pUl\n", &hed->handler_guid); + pr_info(" handler name : %s\n", hed->handler_name); + hed++; + } + + return PRM_STS_OK; +} + +static int prm_load_image(struct device *dev) +{ + const struct firmware *firmware; + char name[16]; + int ret; + + sprintf(name, "prm.efi"); + + if (request_firmware_direct(&firmware, name, dev)) { + pr_err("PRM: image %s load failed\n", name); + return PRM_STS_NFOUND; + } + + ret = prm_dump_raw_image(firmware->data, firmware->size); + if (ret != PRM_STS_OK) { + pr_err("PRM: dump raw image error\n"); + goto load_out; + } + +load_out: + release_firmware(firmware); + return ret; +} + +static ssize_t prm_update_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct prm_handler_info *cur_handler; + struct prm_module_info *cur_module; + char *s = buf; + int i = 0; + + list_for_each_entry(cur_module, &prm_module_list, module_list) { + s += sprintf(s, "Module GUID : %pUl\n", &cur_module->guid); + s += sprintf(s, "Major Rev : %d\n", cur_module->major_rev); + s += sprintf(s, "Minor Rev : %d\n", cur_module->minor_rev); + s += sprintf(s, "Handler Count: %d\n", cur_module->handler_count); + for (i = 0; i < cur_module->handler_count; ++i) { + cur_handler = &cur_module->handlers[i]; + s += sprintf(s, " Handler GUID: %pUl\n", &cur_handler->guid); + s += sprintf(s, " Rev: %d\n", cur_handler->rev); + } + s += sprintf(s, "\n"); + } + + return (s - buf); +} + +static ssize_t prm_update_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t size) +{ + unsigned long val; + ssize_t ret = 0; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + if (val != 1) + return size; + + pr_info("PRM: runtime update\n"); + ret = prm_load_image(&prm_pdev->dev); + if (ret == PRM_STS_OK) + ret = size; + + return ret; +} + +static const struct kobj_attribute prm_update_attr = +__ATTR(prm_update, 0644, prm_update_show, prm_update_store); + void __init init_prmt(void) { struct acpi_table_header *tbl; @@ -339,10 +476,17 @@ void __init init_prmt(void) pr_info("PRM: found %u modules\n", mc); + prm_pdev = platform_device_register_simple("prm", -1, NULL, 0); + if (IS_ERR(prm_pdev)) + return; + status = acpi_install_address_space_handler(ACPI_ROOT_OBJECT, ACPI_ADR_SPACE_PLATFORM_RT, &acpi_platformrt_space_handler, NULL, NULL); if (ACPI_FAILURE(status)) pr_alert("PRM: OperationRegion handler could not be installed\n"); + + if (sysfs_create_file(acpi_kobj, &prm_update_attr.attr)) + pr_err("PRM: failed to create prm sysfs entry\n"); } -- Gitee From 1d4c55086e3136fb510a5fbff4802106fbd3b1e7 Mon Sep 17 00:00:00 2001 From: Aubrey Li Date: Wed, 25 Aug 2021 16:01:15 +0800 Subject: [PATCH 1210/2161] ACPI/PRM: find self update handler commit opencloudos. 
Find self update PRM handler by the specific PRM handler GUID. Signed-off-by: Aubrey Li Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/prmt.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c index 8cd7d51b6107..0b36781dd7bb 100644 --- a/drivers/acpi/prmt.c +++ b/drivers/acpi/prmt.c @@ -382,6 +382,22 @@ static int prm_dump_raw_image(const u8 *data, int size) return PRM_STS_OK; } +#define PRM_UPDATE_HANDLER_GUID EFI_GUID(0xa13cdd48, 0xc822, 0x4913, 0xb0, 0xcf, 0x3c, 0xb4, 0x68, 0x23, 0xd3, 0x76) + +static int prm_parse_image(const u8 *data, int size) +{ + struct prm_handler_info *update_handler; + efi_guid_t update_handler_guid = PRM_UPDATE_HANDLER_GUID; + + update_handler = find_prm_handler(&update_handler_guid); + if (!update_handler) { + pr_err("PRM: failed to find self update handler\n"); + return PRM_STS_NFOUND; + } + pr_info("PRM: found self update handler\n"); + return PRM_STS_OK; +} + static int prm_load_image(struct device *dev) { const struct firmware *firmware; @@ -401,6 +417,8 @@ static int prm_load_image(struct device *dev) goto load_out; } + ret = prm_parse_image(firmware->data, firmware->size); + load_out: release_firmware(firmware); return ret; -- Gitee From 64892d15f30016026805c64b999a02d490c9566e Mon Sep 17 00:00:00 2001 From: Aubrey Li Date: Wed, 20 Oct 2021 16:33:44 +0800 Subject: [PATCH 1211/2161] ACPI/PRM: load and parse PRM image commit opencloudos. There are 3 phases of BIOS-OS communications to load and parse PRM image: - phase 1: get loaded image size - phase 2: parse loaded image - phase 3: collect module info After BIOS-OS communications, PRM image is loaded with module info. Signed-off-by: Aubrey Li Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/prmt.c | 216 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 216 insertions(+) diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c index 0b36781dd7bb..997e6796284a 100644 --- a/drivers/acpi/prmt.c +++ b/drivers/acpi/prmt.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #pragma pack(1) @@ -64,6 +66,26 @@ struct prm_module_export_descriptor { guid_t identifier; struct prm_handler_export_descriptor handlers[]; }; + +struct prm_image_section { + u64 base; + u64 size; + u64 attr; +}; + +struct prm_handler_param_buffer { + u32 phase; + u32 reserved; + u64 raw_base; + u64 raw_size; + u64 loaded_base; + u64 loaded_size; + u64 seclist_base; + u64 seclist_size; + guid_t module_guid; + u64 module_info_base; + u64 module_info_size; +}; #pragma pack() LIST_HEAD(prm_module_list); @@ -100,6 +122,13 @@ enum prm_state { PRM_STS_MAX, }; +enum prm_phase { + PRM_PHASE_GET_SIZE = 0, + PRM_PHASE_LOAD_IMAGE, + PRM_PHASE_GET_MODINFO, + PRM_PHASE_MAX, +}; + static u64 efi_pa_va_lookup(u64 pa) { efi_memory_desc_t *md; @@ -382,12 +411,150 @@ static int prm_dump_raw_image(const u8 *data, int size) return PRM_STS_OK; } +/* + * PE/COFF image section attribute definition. 
+ */ +#define PRM_IMAGE_SEC_CODE BIT(5) +#define PRM_IMAGE_SEC_MEM_X BIT(29) +#define PRM_IMAGE_SEC_MEM_R BIT(30) +#define PRM_IMAGE_SEC_MEM_W BIT(31) +/* + * BIOS-OS communication phase 1: get loaded image size and section list size + * - input: + * raw image base + * raw image size + * - output: + * loaded image size + * section list size + */ +static int efi_get_loaded_size(struct prm_handler_info *handler, + struct prm_handler_param_buffer *param_buffer) +{ + efi_status_t status; + + param_buffer->phase = PRM_PHASE_GET_SIZE; + /* + * Return loaded image size + */ + status = efi_call_virt_pointer(handler, handler_addr, param_buffer); + if (status != EFI_SUCCESS) { + pr_err("PRM: failed to query loaded image size or module info size, 0x%lx\n", status); + return PRM_STS_ERR; + } + pr_info("PRM: phase:%d, raw_base:0x%llx, raw_size:0x%llx, loaded_size:0x%llx, seclist_size:0x%llx\n", + param_buffer->phase, param_buffer->raw_base, param_buffer->raw_size, + param_buffer->loaded_size, param_buffer->seclist_size); + + return PRM_STS_OK; +} + +/* + * BIOS-OS communication phase 2: parse loaded image + * - input: + * loaded image base + * loaded image size + * section list base + * section list size + * - output: + * section list data structure + * module GUID + * module info size + */ +static int efi_parse_loaded_image(struct prm_handler_info *handler, + struct prm_handler_param_buffer *param_buffer) +{ + struct prm_module_info *module_info; + efi_status_t status; + + param_buffer->phase = PRM_PHASE_LOAD_IMAGE; + param_buffer->loaded_base = (u64)module_alloc(param_buffer->loaded_size); + if (!param_buffer->loaded_base) + return PRM_STS_ERR; + + param_buffer->seclist_base = (u64)kmalloc(param_buffer->seclist_size, GFP_KERNEL); + if (!param_buffer->seclist_base) + goto phase2_out1; + /* + * Return module GUID and module info size + */ + status = efi_call_virt_pointer(handler, handler_addr, param_buffer); + if (status != EFI_SUCCESS) { + pr_err("PRM: failed to parse loaded image\n"); + goto phase2_out2; + } + module_info = find_prm_module(¶m_buffer->module_guid); + if (!module_info) { + pr_err("PRM: no existing module found, update module GUID:%pUl\n", + ¶m_buffer->module_guid); + goto phase2_out2; + } + pr_info("PRM: phase:%d, module_GUID:%pUl, module_info_size:0x%llx\n", + param_buffer->phase, ¶m_buffer->module_guid, param_buffer->module_info_size); + + return PRM_STS_OK; + +phase2_out2: + kfree((void *)param_buffer->seclist_base); +phase2_out1: + module_memfree((void *)param_buffer->loaded_base); + return PRM_STS_ERR; +} + +/* + * BIOS-OS communication phase 3: collect module info + * - input: + * module info base + * module info size + * - output: + * module info data structure + */ +static int efi_collect_module_info(struct prm_handler_info *handler, + struct prm_handler_param_buffer *param_buffer) +{ + struct acpi_prmt_module_info *acpi_module_info; + struct acpi_prmt_handler_info *acpi_handler_info; + efi_status_t status; + int i; + + param_buffer->phase = PRM_PHASE_GET_MODINFO; + param_buffer->module_info_base = (u64)kmalloc(param_buffer->module_info_size, GFP_KERNEL); + if (!param_buffer->module_info_base) + return PRM_STS_ERR; + /* + * Return code section data structure + */ + status = efi_call_virt_pointer(handler, handler_addr, param_buffer); + if (status != EFI_SUCCESS) { + pr_err("PRM: failed to generate module info data structure, 0x%lx\n", status); + kfree((void *)param_buffer->module_info_base); + return PRM_STS_ERR; + } + pr_info("PRM: phase:%d\n", param_buffer->phase); + 
acpi_module_info = (struct acpi_prmt_module_info *)param_buffer->module_info_base; + pr_info("Module GUID : %pUl\n", acpi_module_info->module_guid); + pr_info("Major Rev : %d\n", acpi_module_info->major_rev); + pr_info("Minor Rev : %d\n", acpi_module_info->minor_rev); + pr_info("Handler Count: %d\n", acpi_module_info->handler_info_count); + + acpi_handler_info = get_first_handler(acpi_module_info); + for (i = 0; i < acpi_module_info->handler_info_count; i++) { + pr_info("Handler GUID: %pUl\n", acpi_handler_info->handler_guid); + pr_info("Handler addr: 0x%llx\n", acpi_handler_info->handler_address); + acpi_handler_info++; + } + return PRM_STS_OK; +} + #define PRM_UPDATE_HANDLER_GUID EFI_GUID(0xa13cdd48, 0xc822, 0x4913, 0xb0, 0xcf, 0x3c, 0xb4, 0x68, 0x23, 0xd3, 0x76) static int prm_parse_image(const u8 *data, int size) { struct prm_handler_info *update_handler; + struct prm_handler_param_buffer *param_buffer; + struct prm_image_section *sec; efi_guid_t update_handler_guid = PRM_UPDATE_HANDLER_GUID; + size_t param_size; + int num_pages, num_secs, ret, i; update_handler = find_prm_handler(&update_handler_guid); if (!update_handler) { @@ -395,7 +562,56 @@ static int prm_parse_image(const u8 *data, int size) return PRM_STS_NFOUND; } pr_info("PRM: found self update handler\n"); + + param_size = sizeof(struct prm_handler_param_buffer); + param_buffer = (struct prm_handler_param_buffer *)kmalloc(param_size, GFP_KERNEL); + if (!param_buffer) + return PRM_STS_ERR; + + memset(param_buffer, 0, param_size); + + param_buffer->raw_base = (u64)data; + param_buffer->raw_size = size; + /* + * BIOS-OS communication phase 1: get loaded image size + */ + ret = efi_get_loaded_size(update_handler, param_buffer); + if (ret != PRM_STS_OK) + goto parse_image_out; + /* + * BIOS-OS communication phase 2: parse loaded image + */ + ret = efi_parse_loaded_image(update_handler, param_buffer); + if (ret != PRM_STS_OK) + goto parse_image_out; + /* + * BIOS-OS communication phase 3: get module info + */ + ret = efi_collect_module_info(update_handler, param_buffer); + if (ret != PRM_STS_OK) + goto parse_image_out; + /* + * PRM image load and parse completed, fix page table for code sections. + * First make the page read-only, and only then make it executable to + * prevent it from being W+X in between. + */ + num_secs = param_buffer->seclist_size / sizeof(struct prm_image_section); + sec = (struct prm_image_section *)param_buffer->seclist_base; + for (i = 0; i < num_secs; i++) { + pr_info("PRM: sec_base:0x%llx, sec_size:0x%llx, sec_attr:0x%llx\n", + sec->base, sec->size, sec->attr); + if (sec->attr & PRM_IMAGE_SEC_CODE) { + num_pages = DIV_ROUND_UP(sec->size, PAGE_SIZE); + set_memory_ro((unsigned long)sec->base, num_pages); + set_memory_x((unsigned long)sec->base, num_pages); + } + sec++; + } return PRM_STS_OK; + +parse_image_out: + kfree(param_buffer); + return PRM_STS_ERR; } static int prm_load_image(struct device *dev) -- Gitee From ef4a0e4846b79a2734365cc9cb19ac7b93b0fe2c Mon Sep 17 00:00:00 2001 From: Aubrey Li Date: Wed, 20 Oct 2021 16:36:49 +0800 Subject: [PATCH 1212/2161] ACPI/PRM: update module info data structure commit opencloudos. Parse module info data structure from ACPI and update system PRM module list if the version number meets the requirement. 
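Two details in the diff below are worth calling out: an update is accepted only when the (major, minor) pair increases monotonically, and the live entry in prm_module_list is swapped in place with list_replace() so lookups keep working across the update. A stripped-down sketch of the swap, with illustrative names rather than the driver's own:

    #include <linux/list.h>
    #include <linux/slab.h>
    #include <linux/types.h>

    struct mod_entry {
            struct list_head node;
            u16 major_rev;
            u16 minor_rev;
    };

    /* Put @new where @old currently sits, then free the old entry. */
    static void mod_entry_swap(struct mod_entry *old, struct mod_entry *new)
    {
            /* list_replace() copies old's links into new->node */
            list_replace(&old->node, &new->node);
            kfree(old);
    }

The driver performs this swap under prm_mutex, which the same patch introduces around the ACPI OperationRegion handler and the sysfs update/show paths, so no traversal can observe a half-replaced entry.
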
Signed-off-by: Aubrey Li Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/prmt.c | 94 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c index 997e6796284a..9dc59c741696 100644 --- a/drivers/acpi/prmt.c +++ b/drivers/acpi/prmt.c @@ -112,6 +112,7 @@ struct prm_module_info { struct prm_handler_info handlers[]; }; +static DEFINE_MUTEX(prm_mutex); /* fake device for request_firmware */ static struct platform_device *prm_pdev; @@ -336,6 +337,7 @@ static acpi_status acpi_platformrt_space_handler(u32 function, case PRM_CMD_START_TRANSACTION: + mutex_lock(&prm_mutex); module = find_prm_module(&buffer->handler_guid); if (!module) goto invalid_guid; @@ -356,6 +358,7 @@ static acpi_status acpi_platformrt_space_handler(u32 function, buffer->prm_status = UPDATE_UNLOCK_WITHOUT_LOCK; else module->updatable = true; + mutex_unlock(&prm_mutex); break; default: @@ -545,6 +548,86 @@ static int efi_collect_module_info(struct prm_handler_info *handler, return PRM_STS_OK; } +static int prm_compare_version(struct prm_module_info *old_mod, + struct prm_module_info *new_mod) +{ + /* + * PRM updates are always applied in monotonically increasing fashion. + */ + if (new_mod->major_rev > old_mod->major_rev) + return PRM_STS_OK; + else if (new_mod->major_rev == old_mod->major_rev) { + if (new_mod->minor_rev > old_mod->minor_rev) + return PRM_STS_OK; + } + return PRM_STS_ERR; +} + +static int prm_update_module(struct prm_handler_param_buffer *param_buffer) +{ + struct acpi_prmt_module_info *acpi_mod; + struct acpi_prmt_handler_info *acpi_handler; + struct prm_module_info *old_mod, *new_mod; + struct prm_handler_info *old_handler, *new_handler; + int mod_size, i; + + acpi_mod = (struct acpi_prmt_module_info *)param_buffer->module_info_base; + old_mod = find_prm_module((guid_t *)acpi_mod->module_guid); + /* + * Don't update PRM module if it's already in transaction. + */ + if (!old_mod->updatable) + return PRM_STS_ERR; + + mod_size = struct_size(new_mod, handlers, acpi_mod->handler_info_count); + new_mod = (struct prm_module_info *)kmalloc(mod_size, GFP_KERNEL); + if (!new_mod) + return PRM_STS_ERR; + + guid_copy(&new_mod->guid, (guid_t *)acpi_mod->module_guid); + new_mod->major_rev = acpi_mod->major_rev; + new_mod->minor_rev = acpi_mod->minor_rev; + new_mod->handler_count = acpi_mod->handler_info_count; + new_mod->updatable = true; + + /* + * Don't update PRM module if its version number is not + * greater than the current PRM module. 
+ */ + if (prm_compare_version(old_mod, new_mod) != PRM_STS_OK) { + pr_err("PRM: version number not meet the requirement\n"); + kfree(new_mod); + return PRM_STS_ERR; + } + /* + * Inherit mmio resource from existing module + */ + new_mod->mmio_info = old_mod->mmio_info; + + acpi_handler = get_first_handler(acpi_mod); + + for (i = 0; i < acpi_mod->handler_info_count; i++) { + old_handler = &old_mod->handlers[i]; + new_handler = &new_mod->handlers[i]; + + guid_copy(&new_handler->guid, (guid_t *)acpi_handler->handler_guid); + new_handler->rev = acpi_handler->revision; + new_handler->handler_addr = acpi_handler->handler_address; + /* + * Inherit static data buffer and acpi parameter buffer from existing handler + */ + new_handler->static_data_buffer_addr = old_handler->static_data_buffer_addr; + new_handler->acpi_param_buffer_addr = old_handler->acpi_param_buffer_addr; + acpi_handler = get_next_handler(acpi_handler); + pr_info("PRM: count:%d,old addr:0x%llx, new addr:0x%llx\n",i, old_handler->handler_addr, new_handler->handler_addr); + } + + INIT_LIST_HEAD(&new_mod->module_list); + list_replace(&old_mod->module_list, &new_mod->module_list); + kfree(old_mod); + return PRM_STS_OK; +} + #define PRM_UPDATE_HANDLER_GUID EFI_GUID(0xa13cdd48, 0xc822, 0x4913, 0xb0, 0xcf, 0x3c, 0xb4, 0x68, 0x23, 0xd3, 0x76) static int prm_parse_image(const u8 *data, int size) @@ -607,7 +690,11 @@ static int prm_parse_image(const u8 *data, int size) } sec++; } - return PRM_STS_OK; + /* + * Update system PRM module list + */ + ret = prm_update_module(param_buffer); + return ret; parse_image_out: kfree(param_buffer); @@ -627,6 +714,7 @@ static int prm_load_image(struct device *dev) return PRM_STS_NFOUND; } + mutex_lock(&prm_mutex); ret = prm_dump_raw_image(firmware->data, firmware->size); if (ret != PRM_STS_OK) { pr_err("PRM: dump raw image error\n"); @@ -634,8 +722,8 @@ static int prm_load_image(struct device *dev) } ret = prm_parse_image(firmware->data, firmware->size); - load_out: + mutex_unlock(&prm_mutex); release_firmware(firmware); return ret; } @@ -648,6 +736,7 @@ static ssize_t prm_update_show(struct kobject *kobj, char *s = buf; int i = 0; + mutex_lock(&prm_mutex); list_for_each_entry(cur_module, &prm_module_list, module_list) { s += sprintf(s, "Module GUID : %pUl\n", &cur_module->guid); s += sprintf(s, "Major Rev : %d\n", cur_module->major_rev); @@ -660,6 +749,7 @@ static ssize_t prm_update_show(struct kobject *kobj, } s += sprintf(s, "\n"); } + mutex_unlock(&prm_mutex); return (s - buf); } -- Gitee From 5d047c7387f0cd47a6daeb0fea8f50b0da7bc580 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 18 May 2022 15:07:30 +0800 Subject: [PATCH 1213/2161] Documentation: Introduce Platform Firmware Runtime Update documentation commit opencloudos. Add the Platform Firmware Runtime Update/Telemetry documentation. Signed-off-by: Chen Yu Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/x86/pfru.rst | 98 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 Documentation/x86/pfru.rst diff --git a/Documentation/x86/pfru.rst b/Documentation/x86/pfru.rst new file mode 100644 index 000000000000..321729f46737 --- /dev/null +++ b/Documentation/x86/pfru.rst @@ -0,0 +1,98 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +======================================================== +The Linux Platform Firmware Runtime Update and Telemetry +======================================================== + +According to the specification of [1], +certain computing systems require high Service Level Agreements (SLAs) where +system reboot fewer firmware updates are required to deploy firmware changes +to address bug fixes, security updates and to debug and root cause issues. This +technology is called Intel Seamless Update. The management mode (MM), +UEFI runtime services and ACPI services handle most of the system runtime +functions. Changing the MM code execution during runtime is called MM Runtime +Update. Since the "MM" acronyms might be misunderstood as "Memory Management", +this driver uses "Platform Firmware Runtime Update"(PFRU) + +PFRU provides the following facilities: Performs a runtime firmware driver update +and activate. Ability to inject firmware code at runtime, for dynamic instrumentation. +PFRU Telemetry is a service which allows Runtime Update handler to produce telemetry +data to upper layer OS consumer at runtime. The OS provides interfaces to let the +users query the telemetry data via read operations. The specification specifies the +interface and recommended policy to extract the data, the format and use are left to +individual OEM's and BIOS implementations on what that data represents. + +PFRU interfaces +===================== + +The user space tool manipulates on /dev/pfru/update for code injection and +driver update. PFRU stands for Platform Firmware Runtime Update, and the /dev/pfru +directory might be reserved for future usage. + + 1. mmap the capsule file + fd_capsule = open("capsule.cap", O_RDONLY); + fstat(fd_capsule, &stat); + addr = mmap(0, stat.st_size, PROT_READ, fd_capsule); + + 2. Get the capability information(version control, etc) from BIOS via + read() and do sanity check in user space. + fd_update = open("/dev/pfru/update", O_RDWR); + read(fd_update, &cap, sizeof(cap)); + sanity_check(&cap); + + 3. Write the capsule file to runtime update communication buffer + //kernel might return error if capsule file size is longer than + //communication buffer + write(fd_update, addr, stat.st_size); + + 4. Stage the code injection + ioctl(fd_update, PFRU_IOC_STATGE); + + 5. Activate the code injection + ioctl(fd_update, PFRU_IOC_ACTIVATE); + + 6. Stage and activate the code injection + ioctl(fd_update, PFRU_IOC_STAGE_ACTIVATE); + + PFRU_IOC_STATGE: Stage a capsule image from communication buffer + and perform authentication. + PFRU_IOC_ACTIVATE: Activate a previous staged capsule image. + PFRU_IOC_STAGE_ACTIVATE: Perform both stage and activation actions. + +PFRU Telemetry +============= + +The user space tool manipulates on /dev/pfru/telemetry for PFRU telemetry log. +Sample code: + + 1. Open telemetry device + fd_log = open("/dev/pfru/telemetry", O_RDWR); + + 2. Get log level, log type, revision_id via one ioctl invoke + ioctl(fd_log, PFRU_IOC_GET_LOG_INFO, &info); + + 3. Set log level, log type, revision_id + ioctl(fd_log, PFRU_IOC_SET_LOG_INFO, &info); + + 4. ioctl(fd_log, PFRU_IOC_GET_DATA_INFO, &data_info); + Query the information of PFRU telemetry log buffer. The user is + responsible for parsing the result per the specification. + + 5. Read the telemetry data: + read(fd_log, buf, data_info.size); + +Please refer to tools/testing/selftests/pfru/pfru_test.c for detail. + +According to [1], the telemetry +buffer is a wrap around buffer. 
If the telemetry buffer gets full, most recent +log data will overwrite old log data. Besides, it is required in the spec that +the read of telemetry should support both full data retrieval and delta telemetry +data retrieval. Since this requirement is more likely a policy we leave this +implementation in user space. That is to say, it is recommended for the user +to double-read the telemetry parameters such as chunk1_size, chunk2_size, +rollover_cnt in data_info structure to make sure that there is no more data appended +while the user is reading the buffer. Besides, only after the runtime update has +been run at least once, the telemetry log would have valid data, otherwise errno code +of EBUSY would be returned. + +[1] https://uefi.org/sites/default/files/resources/Intel_MM_OS_Interface_Spec_Rev100.pdf -- Gitee From ebf51a6bf10a6c675cb183a8b11634c1965576c0 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 18 May 2022 15:03:11 +0800 Subject: [PATCH 1214/2161] efi: Introduce EFI_FIRMWARE_MANAGEMENT_CAPSULE_HEADER and corresponding structures commit 1882de7fc56c2b0ea91dd9fd9922d434fc3feb15 upstream. Platform Firmware Runtime Update image starts with UEFI headers, and the headers are defined in UEFI specification, but some of them have not been defined in the kernel yet. For example, the header layout of a capsule file looks like this: EFI_CAPSULE_HEADER EFI_FIRMWARE_MANAGEMENT_CAPSULE_HEADER EFI_FIRMWARE_MANAGEMENT_CAPSULE_IMAGE_HEADER EFI_FIRMWARE_IMAGE_AUTHENTICATION These structures would be used by the Platform Firmware Runtime Update driver to parse the format of capsule file to verify if the corresponding version number is valid. The EFI_CAPSULE_HEADER has been defined in the kernel, however the rest are not, thus introduce corresponding UEFI structures accordingly. The reason why efi_manage_capsule_header_t and efi_manage_capsule_image_header_t are packedi might be that: According to the uefi spec, [Figure 23-6 Firmware Management and Firmware Image Management headers] EFI_FIRMWARE_MANAGEMENT_CAPSULE_HEADER is located at the lowest offset within the body of the capsule. And this structure is designed to be unaligned to save space, because in this way the adjacent drivers and binary payload elements could start on byte boundary with no padding. And the EFI_FIRMWARE_MANAGEMENT_CAPSULE_IMAGE_HEADER is at the head of each payload, so packing this structure also makes room for more data. Signed-off-by: Chen Yu Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/efi.h | 50 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/include/linux/efi.h b/include/linux/efi.h index c82ef0eba4f8..401e98551376 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -146,6 +146,56 @@ struct efi_boot_memmap { unsigned long *buff_size; }; +#pragma pack(1) + +/* EFI_FIRMWARE_MANAGEMENT_CAPSULE_HEADER */ +typedef struct { + u32 ver; + u16 emb_drv_cnt; + u16 payload_cnt; + /* + * Variable array indicated by number of + * (emb_drv_cnt + payload_cnt) + */ + u64 offset_list[]; +} efi_manage_capsule_header_t; + +/* EFI_FIRMWARE_MANAGEMENT_CAPSULE_IMAGE_HEADER */ +typedef struct { + u32 ver; + guid_t image_type_id; + u8 image_index; + u8 reserved_bytes[3]; + u32 image_size; + u32 vendor_code_size; + /* ver = 2. */ + u64 hw_ins; + /* ver = v3. 
*/ + u64 capsule_support; +} efi_manage_capsule_image_header_t; + +#pragma pack() + +/* WIN_CERTIFICATE */ +typedef struct { + u32 len; + u16 rev; + u16 cert_type; +} win_cert_t; + +/* WIN_CERTIFICATE_UEFI_GUID */ +typedef struct { + win_cert_t hdr; + guid_t cert_type; + u8 cert_data[]; +} win_cert_uefi_guid_t; + +/* EFI_FIRMWARE_IMAGE_AUTHENTICATIO */ +typedef struct { + u64 mon_count; + win_cert_uefi_guid_t auth_info; +} efi_image_auth_t; + /* * EFI capsule flags */ -- Gitee From bbdf79552e7158d86b068d3701d4f945fc63ff88 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Tue, 7 Sep 2021 23:27:42 +0800 Subject: [PATCH 1215/2161] drivers/acpi: Introduce Platform Firmware Runtime Update device driver commit 0fde088a9be4114cda4e4560293977a51815877d Intel-BKC. Introduce the pfru_update driver which can be used for Platform Firmware Runtime code injection and driver update. The user is expected to provide the update firmware in the form of capsule file, and pass it to the driver via ioctl. Then the driver would hand this capsule file to the Platform Firmware Runtime Update via the ACPI device _DSM method. At last the low level Management Mode would do the firmware update. Signed-off-by: Chen Yu Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/ioctl/ioctl-number.rst | 1 + drivers/acpi/Kconfig | 1 + drivers/acpi/Makefile | 1 + drivers/acpi/pfru/Kconfig | 15 + drivers/acpi/pfru/Makefile | 2 + drivers/acpi/pfru/pfru_update.c | 544 +++++++++++++++++++++++++++ include/uapi/linux/pfru.h | 106 ++++++ 7 files changed, 670 insertions(+) create mode 100644 drivers/acpi/pfru/Kconfig create mode 100644 drivers/acpi/pfru/Makefile create mode 100644 drivers/acpi/pfru/pfru_update.c create mode 100644 include/uapi/linux/pfru.h diff --git a/Documentation/ioctl/ioctl-number.rst b/Documentation/ioctl/ioctl-number.rst index d1702f9e7c39..9250d2aa5fc2 100644 --- a/Documentation/ioctl/ioctl-number.rst +++ b/Documentation/ioctl/ioctl-number.rst @@ -352,6 +352,7 @@ Code Seq# Include File Comments 0xE5 00-3F linux/fuse.h 0xEC 00-01 drivers/platform/chrome/cros_ec_dev.h ChromeOS EC driver +0xEE 00-1F uapi/linux/pfru.h Platform Firmware Runtime Update and Telemetry 0xF3 00-3F drivers/usb/misc/sisusbvga/sisusb.h sisfb (in development) 0xF4 00-1F video/mbxfb.h mbxfb diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 3df2594eac1b..0bd025566712 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -478,6 +478,7 @@ source "drivers/acpi/hmat/Kconfig" source "drivers/acpi/apei/Kconfig" source "drivers/acpi/dptf/Kconfig" +source "drivers/acpi/pfru/Kconfig" config ACPI_WATCHDOG bool diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index dbfa2448060f..d129670daea3 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -95,6 +95,7 @@ obj-$(CONFIG_ACPI_CPPC_LIB) += cppc_acpi.o obj-$(CONFIG_ACPI_SPCR_TABLE) += spcr.o obj-$(CONFIG_ACPI_DEBUGGER_USER) += acpi_dbg.o obj-$(CONFIG_ACPI_PPTT) += pptt.o +obj-$(CONFIG_ACPI_PFRU) += pfru/ # processor has its own "processor." 
module_param namespace processor-y := processor_driver.o diff --git a/drivers/acpi/pfru/Kconfig b/drivers/acpi/pfru/Kconfig new file mode 100644 index 000000000000..3f31b7d95f3b --- /dev/null +++ b/drivers/acpi/pfru/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0 +config ACPI_PFRU + tristate "ACPI Platform Firmware Runtime Update (PFRU)" + depends on 64BIT + help + In order to reduce the system reboot times and update the platform firmware + in time, Platform Firmware Runtime Update is leveraged to patch the system + without reboot. This driver supports Platform Firmware Runtime Update, + which is composed of two parts: code injection and driver update. + + For more information, see: + + + To compile this driver as a module, choose M here: + the module will be called pfru_update. diff --git a/drivers/acpi/pfru/Makefile b/drivers/acpi/pfru/Makefile new file mode 100644 index 000000000000..098cbe80cf3d --- /dev/null +++ b/drivers/acpi/pfru/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_ACPI_PFRU) += pfru_update.o diff --git a/drivers/acpi/pfru/pfru_update.c b/drivers/acpi/pfru/pfru_update.c new file mode 100644 index 000000000000..826935514509 --- /dev/null +++ b/drivers/acpi/pfru/pfru_update.c @@ -0,0 +1,544 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ACPI Platform Firmware Runtime Update Device Driver + * + * Copyright (C) 2021 Intel Corporation + * Author: Chen Yu + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct pfru_device { + guid_t uuid; + int rev_id; + struct device *dev; +}; + +static struct pfru_device pfru_dev; +static struct pfru_device *get_pfru_dev(void) +{ + return &pfru_dev; +} + +static int query_capability(struct update_cap_info *cap) +{ + union acpi_object *out_obj; + struct pfru_device *pf_dev; + acpi_handle handle; + int i, ret = -EINVAL; + + pf_dev = get_pfru_dev(); + handle = ACPI_HANDLE(pf_dev->dev); + + out_obj = acpi_evaluate_dsm_typed(handle, &pf_dev->uuid, + pf_dev->rev_id, FUNC_QUERY_UPDATE_CAP, + NULL, ACPI_TYPE_PACKAGE); + if (!out_obj) + return -EINVAL; + + for (i = 0; i < out_obj->package.count; i++) { + union acpi_object *obj = &out_obj->package.elements[i]; + + switch (i) { + case 0: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + cap->status = obj->integer.value; + break; + case 1: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + cap->update_cap = obj->integer.value; + break; + case 2: + if (obj->type != ACPI_TYPE_BUFFER) + goto free_acpi_buffer; + memcpy(&cap->code_type, obj->buffer.pointer, + obj->buffer.length); + break; + case 3: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + cap->fw_version = obj->integer.value; + break; + case 4: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + cap->code_rt_version = obj->integer.value; + break; + case 5: + if (obj->type != ACPI_TYPE_BUFFER) + goto free_acpi_buffer; + memcpy(&cap->drv_type, obj->buffer.pointer, + obj->buffer.length); + break; + case 6: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + cap->drv_rt_version = obj->integer.value; + break; + case 7: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + cap->drv_svn = obj->integer.value; + break; + case 8: + if (obj->type != ACPI_TYPE_BUFFER) + goto free_acpi_buffer; + memcpy(&cap->platform_id, obj->buffer.pointer, + obj->buffer.length); + break; + case 9: + if (obj->type != ACPI_TYPE_BUFFER) + goto 
free_acpi_buffer; + memcpy(&cap->oem_id, obj->buffer.pointer, + obj->buffer.length); + break; + } + } + ret = 0; + +free_acpi_buffer: + ACPI_FREE(out_obj); + + return ret; +} + +static int query_buffer(struct com_buf_info *info) +{ + union acpi_object *out_obj; + struct pfru_device *pf_dev; + acpi_handle handle; + int i, ret = -EINVAL; + + pf_dev = get_pfru_dev(); + handle = ACPI_HANDLE(pf_dev->dev); + + out_obj = acpi_evaluate_dsm_typed(handle, &pf_dev->uuid, + pf_dev->rev_id, FUNC_QUERY_BUF, + NULL, ACPI_TYPE_PACKAGE); + if (!out_obj) + return -EINVAL; + + for (i = 0; i < out_obj->package.count; i++) { + union acpi_object *obj = &out_obj->package.elements[i]; + + switch (i) { + case 0: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + info->status = obj->integer.value; + break; + case 1: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + info->ext_status = obj->integer.value; + break; + case 2: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + info->addr_lo = obj->integer.value; + break; + case 3: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + info->addr_hi = obj->integer.value; + break; + case 4: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + info->buf_size = obj->integer.value; + break; + } + } + ret = 0; + +free_acpi_buffer: + ACPI_FREE(out_obj); + + return ret; +} + +static int get_image_type(efi_manage_capsule_image_header_t *img_hdr, + int *type) +{ + int ret; + guid_t code_inj_id, drv_update_id, *image_type_id; + + ret = guid_parse(PFRU_CODE_INJ_UUID, &code_inj_id); + if (ret) + return ret; + ret = guid_parse(PFRU_DRV_UPDATE_UUID, &drv_update_id); + if (ret) + return ret; + /* check whether this is a code injection or driver update */ + image_type_id = &img_hdr->image_type_id; + if (guid_equal(image_type_id, &code_inj_id)) + *type = CODE_INJECT_TYPE; + else if (guid_equal(image_type_id, &drv_update_id)) + *type = DRIVER_UPDATE_TYPE; + else + return -EINVAL; + + return 0; +} + +/* + * The (u64 hw_ins) was introduced in UEFI spec version 2, + * and (u64 capsule_support) was introduced in version 3. + * The size needs to be adjusted accordingly. + */ +static int adjust_efi_size(efi_manage_capsule_image_header_t *img_hdr, + int *size) +{ + int tmp_size = *size; + + tmp_size += sizeof(efi_manage_capsule_image_header_t); + switch (img_hdr->ver) { + case 1: + tmp_size -= 2 * sizeof(u64); + break; + case 2: + tmp_size -= sizeof(u64); + break; + default: + /* only support version 1 and 2 */ + return -EINVAL; + } + *size = tmp_size; + + return 0; +} + +/* + * Sanity check if the capsule image has a newer version than current one. + * Return: true if it is valid, false otherwise. 
+ */ +static bool valid_version(const void *data, struct update_cap_info *cap) +{ + struct payload_hdr *payload_hdr; + efi_capsule_header_t *cap_hdr; + efi_manage_capsule_header_t *m_hdr; + efi_manage_capsule_image_header_t *m_img_hdr; + efi_image_auth_t *auth; + int type, size, ret; + + cap_hdr = (efi_capsule_header_t *)data; + size = cap_hdr->headersize; + m_hdr = (efi_manage_capsule_header_t *)(data + size); + /* + * Current data structure size plus variable array indicated + * by number of (emb_drv_cnt + payload_cnt) + */ + size += sizeof(efi_manage_capsule_header_t) + + (m_hdr->emb_drv_cnt + m_hdr->payload_cnt) * sizeof(u64); + m_img_hdr = (efi_manage_capsule_image_header_t *)(data + size); + + ret = get_image_type(m_img_hdr, &type); + if (ret) + return false; + + ret = adjust_efi_size(m_img_hdr, &size); + if (ret) + return false; + + auth = (efi_image_auth_t *)(data + size); + size += sizeof(u64) + auth->auth_info.hdr.len; + payload_hdr = (struct payload_hdr *)(data + size); + + /* Finally, compare the version. */ + if (type == CODE_INJECT_TYPE) + return payload_hdr->rt_ver >= cap->code_rt_version; + else + return payload_hdr->rt_ver >= cap->drv_rt_version; +} + +static void parse_update_result(struct updated_result *result) +{ + pr_debug("Update result:\n"); + pr_debug("Status:%d\n", result->status); + pr_debug("Extended Status:%d\n", result->ext_status); + pr_debug("Authentication Time Low:%ld\n", result->low_auth_time); + pr_debug("Authentication Time High:%ld\n", result->high_auth_time); + pr_debug("Execution Time Low:%ld\n", result->low_exec_time); + pr_debug("Execution Time High:%ld\n", result->high_exec_time); +} + +static int start_acpi_update(int action) +{ + union acpi_object *out_obj, in_obj, in_buf; + struct updated_result update_result; + acpi_handle handle; + struct pfru_device *pf_dev; + int i, ret = -EINVAL; + + pf_dev = get_pfru_dev(); + memset(&in_obj, 0, sizeof(in_obj)); + memset(&in_buf, 0, sizeof(in_buf)); + in_obj.type = ACPI_TYPE_PACKAGE; + in_obj.package.count = 1; + in_obj.package.elements = &in_buf; + in_buf.type = ACPI_TYPE_INTEGER; + in_buf.integer.value = action; + + handle = ACPI_HANDLE(pf_dev->dev); + out_obj = acpi_evaluate_dsm_typed(handle, &pf_dev->uuid, + pf_dev->rev_id, FUNC_START, + &in_obj, ACPI_TYPE_PACKAGE); + if (!out_obj) + return -EINVAL; + + for (i = 0; i < out_obj->package.count; i++) { + union acpi_object *obj = &out_obj->package.elements[i]; + + switch (i) { + case 0: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + update_result.status = obj->integer.value; + break; + case 1: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + update_result.ext_status = obj->integer.value; + break; + case 2: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + update_result.low_auth_time = obj->integer.value; + break; + case 3: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + update_result.high_auth_time = obj->integer.value; + break; + case 4: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + update_result.low_exec_time = obj->integer.value; + break; + case 5: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + update_result.high_exec_time = obj->integer.value; + break; + } + } + parse_update_result(&update_result); + ret = 0; + +free_acpi_buffer: + ACPI_FREE(out_obj); + + return ret; +} + +static long pfru_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + void __user *p; + int ret = 0, rev; + struct pfru_device *pf_dev; + + pf_dev = get_pfru_dev(); + 
+ p = (void __user *)arg; + + switch (cmd) { + case PFRU_IOC_SET_REV: + if (copy_from_user(&rev, p, sizeof(unsigned int))) + return -EFAULT; + if (!valid_revid(rev)) + return -EFAULT; + pf_dev->rev_id = rev; + break; + case PFRU_IOC_STAGE: + ret = start_acpi_update(START_STAGE); + if (ret) + return ret; + break; + case PFRU_IOC_ACTIVATE: + ret = start_acpi_update(START_ACTIVATE); + if (ret) + return ret; + break; + case PFRU_IOC_STAGE_ACTIVATE: + ret = start_acpi_update(START_STAGE_ACTIVATE); + if (ret) + return ret; + break; + default: + ret = -ENOIOCTLCMD; + break; + } + + return ret; +} + +#ifdef CONFIG_COMPAT +static long compat_pfru_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + return pfru_ioctl(filep, cmd, arg); +} +#endif + +static int pfru_open(struct inode *inode, struct file *file) +{ + return capable(CAP_SYS_RAWIO) ? stream_open(inode, file) : -EPERM; +} + +static ssize_t pfru_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + struct update_cap_info cap; + struct com_buf_info info; + phys_addr_t phy_addr; + struct iov_iter iter; + struct iovec iov; + char *buf_ptr; + int ret; + + ret = query_buffer(&info); + if (ret) + return ret; + + if (len > info.buf_size) + return -EINVAL; + + iov.iov_base = (void __user *)buf; + iov.iov_len = len; + iov_iter_init(&iter, WRITE, &iov, 1, len); + + /* map the communication buffer */ + phy_addr = (phys_addr_t)(info.addr_lo | (info.addr_hi << 32)); + buf_ptr = memremap(phy_addr, info.buf_size, MEMREMAP_WB); + if (IS_ERR(buf_ptr)) + return PTR_ERR(buf_ptr); + if (!copy_from_iter_full(buf_ptr, len, &iter)) { + pr_err("error! could not read capsule file\n"); + ret = -EINVAL; + goto unmap; + } + + /* Check if the capsule header has a valid version number. 
*/ + ret = query_capability(&cap); + if (ret) + goto unmap; + + if (cap.status != DSM_SUCCEED) { + ret = -EBUSY; + goto unmap; + } + if (!valid_version(buf_ptr, &cap)) { + ret = -EINVAL; + goto unmap; + } + ret = 0; +unmap: + memunmap(buf_ptr); + + return ret ?: len; +} + +static ssize_t pfru_read(struct file *filp, char __user *ubuf, + size_t size, loff_t *off) +{ + struct update_cap_info cap; + int ret; + + ret = query_capability(&cap); + if (ret) + return ret; + + size = min_t(size_t, size, sizeof(cap)); + + if (copy_to_user(ubuf, &cap, size)) + return -EFAULT; + + return size; +} + +static const struct file_operations acpi_pfru_fops = { + .owner = THIS_MODULE, + .write = pfru_write, + .read = pfru_read, + .open = pfru_open, + .unlocked_ioctl = pfru_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_pfru_ioctl, +#endif + .llseek = noop_llseek, +}; + +static struct miscdevice pfru_misc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "pfru_update", + .nodename = "pfru/update", + .fops = &acpi_pfru_fops, +}; + +static int acpi_pfru_remove(struct platform_device *pdev) +{ + misc_deregister(&pfru_misc_dev); + + return 0; +} + +static int acpi_pfru_probe(struct platform_device *pdev) +{ + acpi_handle handle; + struct pfru_device *pf_dev; + int ret; + + pf_dev = get_pfru_dev(); + + ret = guid_parse(PFRU_UUID, &pf_dev->uuid); + if (ret) + return ret; + /* default rev id is 1 */ + pf_dev->rev_id = 1; + pf_dev->dev = &pdev->dev; + handle = ACPI_HANDLE(pf_dev->dev); + if (!acpi_has_method(handle, "_DSM")) { + pr_err("Missing _DSM\n"); + return -ENODEV; + } + + ret = misc_register(&pfru_misc_dev); + if (ret) + return ret; + + return 0; +} + +static const struct acpi_device_id acpi_pfru_ids[] = { + {"INTC1080", 0}, + {} +}; +MODULE_DEVICE_TABLE(acpi, acpi_pfru_ids); + +static struct platform_driver acpi_pfru_driver = { + .driver = { + .name = "pfru_update", + .acpi_match_table = acpi_pfru_ids, + }, + .probe = acpi_pfru_probe, + .remove = acpi_pfru_remove, +}; +module_platform_driver(acpi_pfru_driver); + +MODULE_DESCRIPTION("Platform Firmware Runtime Update device driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/uapi/linux/pfru.h b/include/uapi/linux/pfru.h new file mode 100644 index 000000000000..81eb8ad5a57e --- /dev/null +++ b/include/uapi/linux/pfru.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Platform Firmware Runtime Update header + * + * Copyright(c) 2021 Intel Corporation. All rights reserved. 
+ */ +#ifndef __PFRU_H__ +#define __PFRU_H__ + +#include +#include + +#define PFRU_UUID "ECF9533B-4A3C-4E89-939E-C77112601C6D" +#define PFRU_CODE_INJ_UUID "B2F84B79-7B6E-4E45-885F-3FB9BB185402" +#define PFRU_DRV_UPDATE_UUID "4569DD8C-75F1-429A-A3D6-24DE8097A0DF" + +#define FUNC_STANDARD_QUERY 0 +#define FUNC_QUERY_UPDATE_CAP 1 +#define FUNC_QUERY_BUF 2 +#define FUNC_START 3 + +#define CODE_INJECT_TYPE 1 +#define DRIVER_UPDATE_TYPE 2 + +#define REVID_1 1 +#define REVID_2 2 + +#define PFRU_MAGIC 0xEE + +#define PFRU_IOC_SET_REV _IOW(PFRU_MAGIC, 0x01, unsigned int) +#define PFRU_IOC_STAGE _IOW(PFRU_MAGIC, 0x02, unsigned int) +#define PFRU_IOC_ACTIVATE _IOW(PFRU_MAGIC, 0x03, unsigned int) +#define PFRU_IOC_STAGE_ACTIVATE _IOW(PFRU_MAGIC, 0x04, unsigned int) + +static inline int valid_revid(int id) +{ + return (id == REVID_1) || (id == REVID_2); +} + +/* Capsule file payload header */ +struct payload_hdr { + __u32 sig; + __u32 hdr_version; + __u32 hdr_size; + __u32 hw_ver; + __u32 rt_ver; + guid_t platform_id; +}; + +enum start_action { + START_STAGE, + START_ACTIVATE, + START_STAGE_ACTIVATE, +}; + +enum dsm_status { + DSM_SUCCEED, + DSM_FUNC_NOT_SUPPORT, + DSM_INVAL_INPUT, + DSM_HARDWARE_ERR, + DSM_RETRY_SUGGESTED, + DSM_UNKNOWN, + DSM_FUNC_SPEC_ERR, +}; + +struct update_cap_info { + enum dsm_status status; + int update_cap; + + guid_t code_type; + int fw_version; + int code_rt_version; + + guid_t drv_type; + int drv_rt_version; + int drv_svn; + + guid_t platform_id; + guid_t oem_id; + + char oem_info[]; +}; + +struct com_buf_info { + enum dsm_status status; + enum dsm_status ext_status; + unsigned long addr_lo; + unsigned long addr_hi; + int buf_size; +}; + +struct capsulate_buf_info { + unsigned long src; + int size; +}; + +struct updated_result { + enum dsm_status status; + enum dsm_status ext_status; + unsigned long low_auth_time; + unsigned long high_auth_time; + unsigned long low_exec_time; + unsigned long high_exec_time; +}; + +#endif /* __PFRU_H__ */ -- Gitee From 9ebf31e3f6bfc043efff4d42bedd3e034ee479b5 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Tue, 7 Sep 2021 23:39:40 +0800 Subject: [PATCH 1216/2161] drivers/acpi: Introduce Platform Firmware Runtime Update Telemetry commit 0fde088a9be4114cda4e4560293977a51815877d Intel-BKC. Platform Firmware Runtime Update(PFRU) Telemetry Service is part of RoT (Root of Trust), which allows PFRU handler and other PFRU drivers to produce telemetry data to upper layer OS consumer at runtime. The linux provides interfaces for the user to query the parameters of telemetry data, and the user could read out the telemetry data accordingly. 
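From user space the interface described here reduces to one ioctl that reports the buffer geometry followed by a plain read. A hedged sketch against the uapi added by this series; the /dev/pfru/telemetry node name and the PFRU_LOG_IOC_* / struct telem_data_info identifiers are taken from this patch set and should be treated as assumptions against any other kernel:

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/pfru.h>

    int main(void)
    {
            struct telem_data_info info;
            ssize_t n;
            char *buf;
            int fd;

            fd = open("/dev/pfru/telemetry", O_RDWR);
            if (fd < 0)
                    return 1;

            /* Query chunk layout and sizes before reading the log */
            if (ioctl(fd, PFRU_LOG_IOC_GET_DATA_INFO, &info) < 0)
                    goto err;

            buf = malloc(info.max_data_size);
            if (!buf)
                    goto err;

            n = read(fd, buf, info.max_data_size);
            if (n > 0)
                    fwrite(buf, 1, n, stdout);

            free(buf);
            close(fd);
            return 0;
    err:
            close(fd);
            return 1;
    }

As the log sample below shows, the data itself is free-form firmware text, so the reader only needs to size the buffer correctly and re-check parameters such as rollover_cnt if it wants to detect a concurrent producer.
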
Typical log looks like: [SmmRuntimeUpdateHandler.ProcessSmmRuntimeUpdate] ProcessSmmRuntimeUpdate = START, Action = 2 [SmmRuntimeUpdateHandler.ProcessSmmRuntimeUpdate] FwVersion = 0, CodeInjectionVersion = 1 [ShadowSmmRuntimeUpdateImage] Image = 0x74D9B000, ImageSize = 0x1172 [ProcessSmmRuntimeUpdate] ShadowSmmRuntimeUpdateImage.Status = Success [ValidateSmmRuntimeUpdateImage] CapsuleHeader.CapsuleGuid = 6DCBD5ED-E82D-4C44-BDA1-7194199AD92A [ValidateSmmRuntimeUpdateImage] FmpCapHeader.Version = 1 [ValidateSmmRuntimeUpdateImage] FmpCapImageHeader.UpdateImageTypeId = B2F84B79-7B6E-4E45-885F-3FB9BB185402 [ValidateSmmRuntimeUpdateImage] SmmRuntimeUpdateVerifyImageWithDenylist.Status = Success [ValidateSmmRuntimeUpdateImage] SmmRuntimeUpdateVerifyImageWithAllowlist.Status = Success [SmmCodeInjectionVerifyPayloadHeader] PayloadHeader.Signature = 0x31494353 [SmmCodeInjectionVerifyPayloadHeader] PayloadHeader.PlatformId = 63462139-A8B1-AA4E-9024-F2BB53EA4723 [SmmCodeInjectionVerifyPayloadHeader] PayloadHeader.SupportedSmmFirmwareVersion = 0, PayloadHeader.SmmCodeInjectionRuntimeVersion = 1 [ProcessSmmRuntimeUpdate] ValidateSmmRuntimeUpdateImage.Status = Success CPU CSR[0B102D28] Before = 7FBF830E CPU CSR[0B102D28] After = 7FBF8310 [ProcessSmmRuntimeUpdate] ProcessSmmCodeInjection.Status = Success [SmmRuntimeUpdateHandler.ProcessSmmRuntimeUpdate] ProcessSmmRuntimeUpdate = End, Status = Success Signed-off-by: Chen Yu Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/pfru/Kconfig | 14 + drivers/acpi/pfru/Makefile | 1 + drivers/acpi/pfru/pfru_telemetry.c | 412 +++++++++++++++++++++++++++++ include/uapi/linux/pfru.h | 46 ++++ 4 files changed, 473 insertions(+) create mode 100644 drivers/acpi/pfru/pfru_telemetry.c diff --git a/drivers/acpi/pfru/Kconfig b/drivers/acpi/pfru/Kconfig index 3f31b7d95f3b..e2934058884e 100644 --- a/drivers/acpi/pfru/Kconfig +++ b/drivers/acpi/pfru/Kconfig @@ -13,3 +13,17 @@ config ACPI_PFRU To compile this driver as a module, choose M here: the module will be called pfru_update. + +config ACPI_PFRU_TELEMETRY + tristate "ACPI Platform Firmware Runtime Update Telemetry Service" + depends on ACPI_PFRU + help + PFRU(Platform Firmware Runtime Update) Telemetry Service is part of + RoT(Root of Trust), which allows Platform Firmware Runtime Update handler + and other PFRU drivers to produce telemetry data to upper layer OS consumer + at runtime. + + For more information, see: + + + If unsure, please say N. 
diff --git a/drivers/acpi/pfru/Makefile b/drivers/acpi/pfru/Makefile index 098cbe80cf3d..30060ba320ea 100644 --- a/drivers/acpi/pfru/Makefile +++ b/drivers/acpi/pfru/Makefile @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_ACPI_PFRU) += pfru_update.o +obj-$(CONFIG_ACPI_PFRU_TELEMETRY) += pfru_telemetry.o diff --git a/drivers/acpi/pfru/pfru_telemetry.c b/drivers/acpi/pfru/pfru_telemetry.c new file mode 100644 index 000000000000..726f2a03dd11 --- /dev/null +++ b/drivers/acpi/pfru/pfru_telemetry.c @@ -0,0 +1,412 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ACPI Platform Firmware Runtime Update + * Telemetry Service Device Driver + * + * Copyright (C) 2021 Intel Corporation + * Author: Chen Yu + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct pfru_telem_device { + struct device *dev; + guid_t uuid; + struct telem_info info; +}; + +static struct pfru_telem_device telem_dev; +static struct pfru_telem_device *get_pfru_telem_dev(void) +{ + return &telem_dev; +} + +static int get_pfru_data_info(struct telem_data_info *data_info, + int log_type) +{ + struct pfru_telem_device *pf_telem_dev; + union acpi_object *out_obj, in_obj, in_buf; + acpi_handle handle; + int i, ret = -EINVAL; + + pf_telem_dev = get_pfru_telem_dev(); + handle = ACPI_HANDLE(pf_telem_dev->dev); + + memset(&in_obj, 0, sizeof(in_obj)); + memset(&in_buf, 0, sizeof(in_buf)); + in_obj.type = ACPI_TYPE_PACKAGE; + in_obj.package.count = 1; + in_obj.package.elements = &in_buf; + in_buf.type = ACPI_TYPE_INTEGER; + in_buf.integer.value = log_type; + + out_obj = acpi_evaluate_dsm_typed(handle, &pf_telem_dev->uuid, + pf_telem_dev->info.log_revid, FUNC_GET_DATA, + &in_obj, ACPI_TYPE_PACKAGE); + if (!out_obj) { + pr_err("Failed to invoke _DSM\n"); + return -EINVAL; + } + + for (i = 0; i < out_obj->package.count; i++) { + union acpi_object *obj = &out_obj->package.elements[i]; + + switch (i) { + case 0: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->status = obj->integer.value; + break; + case 1: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->ext_status = obj->integer.value; + break; + case 2: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->max_data_size = obj->integer.value; + break; + case 3: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->chunk1_addr_lo = obj->integer.value; + break; + case 4: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->chunk1_addr_hi = obj->integer.value; + break; + case 5: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->chunk1_size = obj->integer.value; + break; + case 6: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->chunk2_addr_lo = obj->integer.value; + break; + case 7: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->chunk2_addr_hi = obj->integer.value; + break; + case 8: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->chunk2_size = obj->integer.value; + break; + case 9: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->rollover_cnt = obj->integer.value; + break; + case 10: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->reset_cnt = obj->integer.value; + break; + } + } + ret = 0; + +free_acpi_buffer: + ACPI_FREE(out_obj); + + return ret; +} + +static int set_pfru_log_level(int level) +{ + union acpi_object 
*out_obj, *obj, in_obj, in_buf; + struct pfru_telem_device *pf_telem_dev; + enum dsm_status status; + acpi_handle handle; + int ret = -EINVAL; + + pf_telem_dev = get_pfru_telem_dev(); + handle = ACPI_HANDLE(pf_telem_dev->dev); + + memset(&in_obj, 0, sizeof(in_obj)); + memset(&in_buf, 0, sizeof(in_buf)); + in_obj.type = ACPI_TYPE_PACKAGE; + in_obj.package.count = 1; + in_obj.package.elements = &in_buf; + in_buf.type = ACPI_TYPE_INTEGER; + in_buf.integer.value = level; + + out_obj = acpi_evaluate_dsm_typed(handle, &pf_telem_dev->uuid, + pf_telem_dev->info.log_revid, FUNC_SET_LEV, + &in_obj, ACPI_TYPE_PACKAGE); + if (!out_obj) + return -EINVAL; + + obj = &out_obj->package.elements[0]; + status = obj->integer.value; + if (status) { + pr_err("get MM telemetry level error status %d\n", + status); + goto free_acpi_buffer; + } + + obj = &out_obj->package.elements[1]; + status = obj->integer.value; + if (status) { + pr_err("get MM telemetry level error extend status %d\n", + status); + goto free_acpi_buffer; + } + ret = 0; + +free_acpi_buffer: + ACPI_FREE(out_obj); + + return ret; +} + +static int get_pfru_log_level(int *level) +{ + struct pfru_telem_device *pf_telem_dev; + union acpi_object *out_obj, *obj; + enum dsm_status status; + acpi_handle handle; + int ret = -EINVAL; + + pf_telem_dev = get_pfru_telem_dev(); + handle = ACPI_HANDLE(pf_telem_dev->dev); + out_obj = acpi_evaluate_dsm_typed(handle, &pf_telem_dev->uuid, + pf_telem_dev->info.log_revid, FUNC_GET_LEV, + NULL, ACPI_TYPE_PACKAGE); + if (!out_obj) + return -EINVAL; + + obj = &out_obj->package.elements[0]; + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + status = obj->integer.value; + if (status) { + pr_err("get MM telemetry level error status %d\n", + status); + goto free_acpi_buffer; + } + + obj = &out_obj->package.elements[1]; + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + status = obj->integer.value; + if (status) { + pr_err("get MM telemetry level error status %d\n", + status); + goto free_acpi_buffer; + } + + obj = &out_obj->package.elements[2]; + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + *level = obj->integer.value; + + ret = 0; + +free_acpi_buffer: + ACPI_FREE(out_obj); + + return ret; +} + +static int valid_log_level(int level) +{ + return (level == LOG_ERR) || (level == LOG_WARN) || + (level == LOG_INFO) || (level == LOG_VERB); +} + +static int valid_log_type(int type) +{ + return (type == LOG_EXEC_IDX) || (type == LOG_HISTORY_IDX); +} + +static long pfru_telemetry_ioctl(struct file *file, unsigned int cmd, unsigned long arg) + +{ + struct pfru_telem_device *pf_telem_dev; + struct telem_data_info data_info; + struct telem_info info; + void __user *p; + int ret = 0; + + pf_telem_dev = get_pfru_telem_dev(); + p = (void __user *)arg; + + switch (cmd) { + case PFRU_LOG_IOC_SET_INFO: + if (copy_from_user(&info, p, sizeof(info))) + return -EFAULT; + if (valid_revid(info.log_revid)) + pf_telem_dev->info.log_revid = info.log_revid; + + if (valid_log_level(info.log_level)) { + ret = set_pfru_log_level(info.log_level); + if (ret) + return ret; + pf_telem_dev->info.log_level = info.log_level; + } + if (valid_log_type(info.log_type)) + pf_telem_dev->info.log_type = info.log_type; + break; + case PFRU_LOG_IOC_GET_INFO: + ret = get_pfru_log_level(&info.log_level); + if (ret) + return ret; + info.log_type = pf_telem_dev->info.log_type; + info.log_revid = pf_telem_dev->info.log_revid; + if (copy_to_user(p, &info, sizeof(info))) + ret = -EFAULT; + break; + case PFRU_LOG_IOC_GET_DATA_INFO: + 
ret = get_pfru_data_info(&data_info, pf_telem_dev->info.log_type); + if (ret) + return ret; + if (copy_to_user(p, &data_info, sizeof(struct telem_data_info))) + ret = -EFAULT; + break; + default: + ret = -ENOIOCTLCMD; + break; + } + return ret; +} + +static ssize_t pfru_telemetry_read(struct file *filp, char __user *ubuf, + size_t size, loff_t *off) +{ + struct pfru_telem_device *pf_telem_dev; + struct telem_data_info info; + phys_addr_t base_addr; + int buf_size, ret; + char *buf_ptr; + + if (*off < 0) + return -EINVAL; + + pf_telem_dev = get_pfru_telem_dev(); + + ret = get_pfru_data_info(&info, pf_telem_dev->info.log_type); + if (ret) { + pr_err("Could not get telemetry data info %d\n", ret); + return ret; + } + + base_addr = (phys_addr_t)(info.chunk2_addr_lo | + (info.chunk2_addr_hi << 32)); + if (!base_addr) { + pr_err("Telemetry data not ready\n"); + return -EBUSY; + } + + buf_size = info.max_data_size; + if (*off >= buf_size) + return 0; + + buf_ptr = memremap(base_addr, buf_size, MEMREMAP_WB); + if (IS_ERR(buf_ptr)) + return PTR_ERR(buf_ptr); + + size = min_t(size_t, size, buf_size - *off); + + ret = -EFAULT; + if (copy_to_user(ubuf, buf_ptr + *off, size)) + goto out; + ret = 0; +out: + memunmap(buf_ptr); + + return ret ? ret : size; +} + +#ifdef CONFIG_COMPAT +static long compat_pfru_telemetry_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + return pfru_telemetry_ioctl(filep, cmd, arg); +} +#endif + +static const struct file_operations acpi_pfru_telemetry_fops = { + .owner = THIS_MODULE, + .read = pfru_telemetry_read, + .unlocked_ioctl = pfru_telemetry_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_pfru_telemetry_ioctl, +#endif + .llseek = noop_llseek, +}; + +static struct miscdevice pfru_telemetry_misc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "pfru_telemetry", + .nodename = "pfru/telemetry", + .fops = &acpi_pfru_telemetry_fops, +}; + +static int acpi_pfru_telemetry_remove(struct platform_device *pdev) +{ + misc_deregister(&pfru_telemetry_misc_dev); + + return 0; +} + +static int acpi_pfru_telemetry_probe(struct platform_device *pdev) +{ + struct pfru_telem_device *pf_telem_dev; + acpi_handle handle; + int ret; + + pf_telem_dev = get_pfru_telem_dev(); + + ret = guid_parse(PFRU_TELEMETRY_UUID, &pf_telem_dev->uuid); + if (ret) + return ret; + + pf_telem_dev->info.log_revid = 1; + pf_telem_dev->dev = &pdev->dev; + handle = ACPI_HANDLE(pf_telem_dev->dev); + if (!acpi_has_method(handle, "_DSM")) { + pr_err("Missing _DSM\n"); + return -ENODEV; + } + + ret = misc_register(&pfru_telemetry_misc_dev); + if (ret) + return ret; + + return 0; +} + +static const struct acpi_device_id acpi_pfru_telemetry_ids[] = { + {"INTC1081", 0}, + {} +}; +MODULE_DEVICE_TABLE(acpi, acpi_pfru_telemetry_ids); + +static struct platform_driver acpi_pfru_telemetry_driver = { + .driver = { + .name = "pfru_telemetry", + .acpi_match_table = acpi_pfru_telemetry_ids, + }, + .probe = acpi_pfru_telemetry_probe, + .remove = acpi_pfru_telemetry_remove, +}; +module_platform_driver(acpi_pfru_telemetry_driver); + +MODULE_DESCRIPTION("Platform Firmware Runtime Update Telemetry Service device driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/uapi/linux/pfru.h b/include/uapi/linux/pfru.h index 81eb8ad5a57e..b4d5c0078cfb 100644 --- a/include/uapi/linux/pfru.h +++ b/include/uapi/linux/pfru.h @@ -103,4 +103,50 @@ struct updated_result { unsigned long high_exec_time; }; +#define PFRU_TELEMETRY_UUID "75191659-8178-4D9D-B88F-AC5E5E93E8BF" + +/* Telemetry structures. 
*/ +struct telem_data_info { + enum dsm_status status; + enum dsm_status ext_status; + /* Maximum supported size of data of + * all Data Chunks combined. + */ + unsigned long chunk1_addr_lo; + unsigned long chunk1_addr_hi; + unsigned long chunk2_addr_lo; + unsigned long chunk2_addr_hi; + int max_data_size; + int chunk1_size; + int chunk2_size; + int rollover_cnt; + int reset_cnt; +}; + +struct telem_info { + int log_level; + int log_type; + int log_revid; +}; + +/* Two logs: history and execution log */ +#define LOG_EXEC_IDX 0 +#define LOG_HISTORY_IDX 1 +#define NR_LOG_TYPE 2 + +#define LOG_ERR 0 +#define LOG_WARN 1 +#define LOG_INFO 2 +#define LOG_VERB 4 + +#define FUNC_SET_LEV 1 +#define FUNC_GET_LEV 2 +#define FUNC_GET_DATA 3 + +#define LOG_NAME_SIZE 10 + +#define PFRU_LOG_IOC_SET_INFO _IOW(PFRU_MAGIC, 0x05, struct telem_info) +#define PFRU_LOG_IOC_GET_INFO _IOR(PFRU_MAGIC, 0x06, struct telem_info) +#define PFRU_LOG_IOC_GET_DATA_INFO _IOR(PFRU_MAGIC, 0x07, struct telem_data_info) + #endif /* __PFRU_H__ */ -- Gitee From 1e03d1f6e22308791f0e8eed14abe9736cd34b74 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Tue, 7 Sep 2021 23:40:30 +0800 Subject: [PATCH 1217/2161] selftests/pfru: add test for Platform Firmware Runtime Update and Telemetry commit opencloudos. Introduce a simple test for Platform Firmware Runtime Update and Telemetry drivers. It is based on ioctl to either update firmware driver or code injection, and read corresponding PFRU Telemetry log into user space. For example: ./pfru_test -h usage: pfru_test [OPTIONS] code injection: -l, --load -s, --stage -a, --activate -u, --update [stage and activate] -q, --query -d, --revid update telemetry: -G, --getloginfo -T, --type(0:execution, 1:history) -L, --level(0, 1, 2, 4) -R, --read -D, --revid log ./pfru_test -G log_level:4 log_type:0 log_revid:2 max_data_size:65536 chunk1_size:0 chunk2_size:1401 rollover_cnt:0 reset_cnt:4 ./pfru_test -q code injection image type:794bf8b2-6e7b-454e-885f-3fb9bb185402 fw_version:0 code_rt_version:1 driver update image type:0e5f0b14-f849-7945-ad81-bc7b6d2bb245 drv_rt_version:0 drv_svn:0 platform id:39214663-b1a8-4eaa-9024-f2bb53ea4723 oem id:a36db54f-ea2a-e14e-b7c4-b5780e51ba3d Tested-by: Dou Shengnan Signed-off-by: Chen Yu Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/pfru/Makefile | 7 + tools/testing/selftests/pfru/config | 2 + tools/testing/selftests/pfru/pfru.h | 152 +++++++++++ tools/testing/selftests/pfru/pfru_test.c | 324 +++++++++++++++++++++++ 5 files changed, 486 insertions(+) create mode 100644 tools/testing/selftests/pfru/Makefile create mode 100644 tools/testing/selftests/pfru/config create mode 100644 tools/testing/selftests/pfru/pfru.h create mode 100644 tools/testing/selftests/pfru/pfru_test.c diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 6ede894d5c50..23b2e36ab645 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -43,6 +43,7 @@ TARGETS += seccomp TARGETS += sgx TARGETS += sigaltstack TARGETS += size +TARGETS += pfru TARGETS += sparc64 TARGETS += splice TARGETS += static_keys diff --git a/tools/testing/selftests/pfru/Makefile b/tools/testing/selftests/pfru/Makefile new file mode 100644 index 000000000000..c61916ccf637 --- /dev/null +++ b/tools/testing/selftests/pfru/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0+ + +CFLAGS += -Wall -O2 +LDLIBS := -luuid + +TEST_GEN_PROGS := pfru_test +include ../lib.mk diff --git 
a/tools/testing/selftests/pfru/config b/tools/testing/selftests/pfru/config new file mode 100644 index 000000000000..37f53609acbd --- /dev/null +++ b/tools/testing/selftests/pfru/config @@ -0,0 +1,2 @@ +CONFIG_ACPI_PFRU=m +CONFIG_ACPI_PFRU_TELEMETRY=m diff --git a/tools/testing/selftests/pfru/pfru.h b/tools/testing/selftests/pfru/pfru.h new file mode 100644 index 000000000000..8cd4ed80b161 --- /dev/null +++ b/tools/testing/selftests/pfru/pfru.h @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Platform Firmware Runtime Update header + * + * Copyright(c) 2021 Intel Corporation. All rights reserved. + */ +#ifndef __PFRU_H__ +#define __PFRU_H__ + +#include +#include + +#define PFRU_UUID "ECF9533B-4A3C-4E89-939E-C77112601C6D" +#define PFRU_CODE_INJ_UUID "B2F84B79-7B6E-4E45-885F-3FB9BB185402" +#define PFRU_DRV_UPDATE_UUID "4569DD8C-75F1-429A-A3D6-24DE8097A0DF" + +#define FUNC_STANDARD_QUERY 0 +#define FUNC_QUERY_UPDATE_CAP 1 +#define FUNC_QUERY_BUF 2 +#define FUNC_START 3 + +#define CODE_INJECT_TYPE 1 +#define DRIVER_UPDATE_TYPE 2 + +#define REVID_1 1 +#define REVID_2 2 + +#define PFRU_MAGIC 0xEE + +#define PFRU_IOC_SET_REV _IOW(PFRU_MAGIC, 0x01, unsigned int) +#define PFRU_IOC_STAGE _IOW(PFRU_MAGIC, 0x02, unsigned int) +#define PFRU_IOC_ACTIVATE _IOW(PFRU_MAGIC, 0x03, unsigned int) +#define PFRU_IOC_STAGE_ACTIVATE _IOW(PFRU_MAGIC, 0x04, unsigned int) + +static inline int valid_revid(int id) +{ + return (id == REVID_1) || (id == REVID_2); +} + +/* Capsule file payload header */ +struct payload_hdr { + __u32 sig; + __u32 hdr_version; + __u32 hdr_size; + __u32 hw_ver; + __u32 rt_ver; + uuid_t platform_id; +}; + +enum start_action { + START_STAGE, + START_ACTIVATE, + START_STAGE_ACTIVATE, +}; + +enum dsm_status { + DSM_SUCCEED, + DSM_FUNC_NOT_SUPPORT, + DSM_INVAL_INPUT, + DSM_HARDWARE_ERR, + DSM_RETRY_SUGGESTED, + DSM_UNKNOWN, + DSM_FUNC_SPEC_ERR, +}; + +struct update_cap_info { + enum dsm_status status; + int update_cap; + + uuid_t code_type; + int fw_version; + int code_rt_version; + + uuid_t drv_type; + int drv_rt_version; + int drv_svn; + + uuid_t platform_id; + uuid_t oem_id; + + char oem_info[]; +}; + +struct com_buf_info { + enum dsm_status status; + enum dsm_status ext_status; + unsigned long addr_lo; + unsigned long addr_hi; + int buf_size; +}; + +struct capsulate_buf_info { + unsigned long src; + int size; +}; + +struct updated_result { + enum dsm_status status; + enum dsm_status ext_status; + unsigned long low_auth_time; + unsigned long high_auth_time; + unsigned long low_exec_time; + unsigned long high_exec_time; +}; + +#define PFRU_TELEMETRY_UUID "75191659-8178-4D9D-B88F-AC5E5E93E8BF" + +/* Telemetry structures. */ +struct telem_data_info { + enum dsm_status status; + enum dsm_status ext_status; + /* Maximum supported size of data of + * all Data Chunks combined. 
+ */ + unsigned long chunk1_addr_lo; + unsigned long chunk1_addr_hi; + unsigned long chunk2_addr_lo; + unsigned long chunk2_addr_hi; + int max_data_size; + int chunk1_size; + int chunk2_size; + int rollover_cnt; + int reset_cnt; +}; + +struct telem_info { + int log_level; + int log_type; + int log_revid; +}; + +/* Two logs: history and execution log */ +#define LOG_EXEC_IDX 0 +#define LOG_HISTORY_IDX 1 +#define NR_LOG_TYPE 2 + +#define LOG_ERR 0 +#define LOG_WARN 1 +#define LOG_INFO 2 +#define LOG_VERB 4 + +#define FUNC_SET_LEV 1 +#define FUNC_GET_LEV 2 +#define FUNC_GET_DATA 3 + +#define LOG_NAME_SIZE 10 + +#define PFRU_LOG_IOC_SET_INFO _IOW(PFRU_MAGIC, 0x05, struct telem_info) +#define PFRU_LOG_IOC_GET_INFO _IOR(PFRU_MAGIC, 0x06, struct telem_info) +#define PFRU_LOG_IOC_GET_DATA_INFO _IOR(PFRU_MAGIC, 0x07, struct telem_data_info) + +#endif /* __PFRU_H__ */ diff --git a/tools/testing/selftests/pfru/pfru_test.c b/tools/testing/selftests/pfru/pfru_test.c new file mode 100644 index 000000000000..aae77e6bdad2 --- /dev/null +++ b/tools/testing/selftests/pfru/pfru_test.c @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Tests Runtime Update/Telemetry (see Documentation/x86/pfru_update.rst) + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "pfru.h" + +#define MAX_LOG_SIZE 65536 + +struct update_cap_info cap_info; +struct com_buf_info buf_info; +struct capsulate_buf_info image_info; +struct telem_data_info data_info; +char *capsule_name; +int action, query_cap, log_type, log_level, log_read, log_getinfo, + revid, log_revid; +int set_log_level, set_log_type, + set_revid, set_log_revid; + +char *progname; + +static int valid_log_level(int level) +{ + return (level == LOG_ERR) || (level == LOG_WARN) || + (level == LOG_INFO) || (level == LOG_VERB); +} + +static int valid_log_type(int type) +{ + return (type == LOG_EXEC_IDX) || (type == LOG_HISTORY_IDX); +} + +static void help(void) +{ + fprintf(stderr, + "usage: %s [OPTIONS]\n" + " code injection:\n" + " -l, --load\n" + " -s, --stage\n" + " -a, --activate\n" + " -u, --update [stage and activate]\n" + " -q, --query\n" + " -d, --revid update\n" + " telemetry:\n" + " -G, --getloginfo\n" + " -T, --type(0:execution, 1:history)\n" + " -L, --level(0, 1, 2, 4)\n" + " -R, --read\n" + " -D, --revid log\n", + progname); +} + +char *option_string = "l:sauqd:GT:L:RD:h"; +static struct option long_options[] = { + {"load", required_argument, 0, 'l'}, + {"stage", no_argument, 0, 's'}, + {"activate", no_argument, 0, 'a'}, + {"update", no_argument, 0, 'u'}, + {"query", no_argument, 0, 'q'}, + {"getloginfo", no_argument, 0, 'G'}, + {"type", required_argument, 0, 'T'}, + {"level", required_argument, 0, 'L'}, + {"read", no_argument, 0, 'R'}, + {"setrev", required_argument, 0, 'd'}, + {"setrevlog", required_argument, 0, 'D'}, + {"help", no_argument, 0, 'h'}, + {} +}; + +static void parse_options(int argc, char **argv) +{ + char *pathname; + int c; + + pathname = strdup(argv[0]); + progname = basename(pathname); + + while (1) { + int option_index = 0; + + c = getopt_long(argc, argv, option_string, + long_options, &option_index); + if (c == -1) + break; + switch (c) { + case 'l': + capsule_name = optarg; + break; + case 's': + action = 1; + break; + case 'a': + action = 2; + break; + case 'u': + action = 3; + break; + case 'q': + query_cap = 1; + break; + case 'G': + log_getinfo = 1; + break; + case 'T': + log_type = atoi(optarg); + set_log_type = 1; + break; + case 'L': + 
log_level = atoi(optarg); + set_log_level = 1; + break; + case 'R': + log_read = 1; + break; + case 'd': + revid = atoi(optarg); + set_revid = 1; + break; + case 'D': + log_revid = atoi(optarg); + set_log_revid = 1; + break; + case 'h': + help(); + break; + default: + break; + } + } +} + +void print_cap(struct update_cap_info *cap) +{ + char *uuid = malloc(37); + + if (!uuid) { + perror("Can not allocate uuid buffer\n"); + exit(1); + } + uuid_unparse(cap->code_type, uuid); + printf("code injection image type:%s\n", uuid); + printf("fw_version:%d\n", cap->fw_version); + printf("code_rt_version:%d\n", cap->code_rt_version); + + uuid_unparse(cap->drv_type, uuid); + printf("driver update image type:%s\n", uuid); + printf("drv_rt_version:%d\n", cap->drv_rt_version); + printf("drv_svn:%d\n", cap->drv_svn); + + uuid_unparse(cap->platform_id, uuid); + printf("platform id:%s\n", uuid); + uuid_unparse(cap->oem_id, uuid); + printf("oem id:%s\n", uuid); + + free(uuid); +} + +int main(int argc, char *argv[]) +{ + int fd_update, fd_log, fd_capsule; + struct telem_data_info data_info; + struct telem_info info; + struct update_cap_info cap; + void *addr_map_capsule; + struct stat st; + char *log_buf; + int ret = 0; + + parse_options(argc, argv); + + fd_log = open("/dev/pfru/telemetry", O_RDWR); + if (fd_log < 0) { + perror("Cannot open telemetry device..."); + return 1; + } + fd_update = open("/dev/pfru/update", O_RDWR); + if (fd_update < 0) { + perror("Cannot open code injection device..."); + return 1; + } + + if (query_cap) { + ret = read(fd_update, &cap, sizeof(cap)); + if (ret == -1) { + perror("Read error."); + return 1; + } + print_cap(&cap); + } + + if (log_getinfo) { + ret = ioctl(fd_log, PFRU_LOG_IOC_GET_DATA_INFO, &data_info); + if (ret) { + perror("Get log data info failed."); + return 1; + } + ret = ioctl(fd_log, PFRU_LOG_IOC_GET_INFO, &info); + if (ret) { + perror("Get log info failed."); + return 1; + } + printf("log_level:%d\n", info.log_level); + printf("log_type:%d\n", info.log_type); + printf("log_revid:%d\n", info.log_revid); + printf("max_data_size:%d\n", data_info.max_data_size); + printf("chunk1_size:%d\n", data_info.chunk1_size); + printf("chunk2_size:%d\n", data_info.chunk2_size); + printf("rollover_cnt:%d\n", data_info.rollover_cnt); + printf("reset_cnt:%d\n", data_info.reset_cnt); + + return 0; + } + + info.log_level = -1; + info.log_type = -1; + info.log_revid = -1; + + if (set_log_level) { + if (!valid_log_level(log_level)) { + printf("Invalid log level %d\n", + log_level); + } else { + info.log_level = log_level; + } + } + if (set_log_type) { + if (!valid_log_type(log_type)) { + printf("Invalid log type %d\n", + log_type); + } else { + info.log_type = log_type; + } + } + if (set_log_revid) { + if (!valid_revid(log_revid)) { + printf("Invalid log revid %d\n", + log_revid); + } else { + info.log_revid = log_revid; + } + } + + ret = ioctl(fd_log, PFRU_LOG_IOC_SET_INFO, &info); + if (ret) { + perror("Log information set failed.(log_level, log_type, log_revid)"); + return 1; + } + + if (set_revid) { + ret = ioctl(fd_update, PFRU_IOC_SET_REV, &revid); + if (ret) { + perror("mru update revid set failed"); + return 1; + } + printf("mru update revid set to %d\n", revid); + } + + if (capsule_name) { + fd_capsule = open(capsule_name, O_RDONLY); + if (fd_capsule < 0) { + perror("Can not open capsule file..."); + return 1; + } + if (fstat(fd_capsule, &st) < 0) { + perror("Can not fstat capsule file..."); + return 1; + } + addr_map_capsule = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, + 
fd_capsule, 0); + if (addr_map_capsule == MAP_FAILED) { + perror("Failed to mmap capsule file."); + return 1; + } + ret = write(fd_update, (char *)addr_map_capsule, st.st_size); + printf("Load %d bytes of capsule file into the system\n", + ret); + if (ret == -1) { + perror("Failed to load capsule file"); + return 1; + } + munmap(addr_map_capsule, st.st_size); + printf("Load done.\n"); + } + + if (action) { + if (action == 1) + ret = ioctl(fd_update, PFRU_IOC_STAGE, NULL); + else if (action == 2) + ret = ioctl(fd_update, PFRU_IOC_ACTIVATE, NULL); + else if (action == 3) + ret = ioctl(fd_update, PFRU_IOC_STAGE_ACTIVATE, NULL); + else + return 1; + printf("Update finished, return %d\n", ret); + } + + if (log_read) { + log_buf = malloc(MAX_LOG_SIZE + 1); + if (!log_buf) { + perror("log_buf allocate failed."); + return 1; + } + ret = read(fd_log, log_buf, MAX_LOG_SIZE); + if (ret == -1) { + perror("Read error."); + return 1; + } + log_buf[ret] = '\0'; + printf("%s\n", log_buf); + free(log_buf); + } + + return 0; +} -- Gitee From 5255a04d15f52d29cf0be4d6937752292c2da3cb Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 18 May 2022 16:32:49 +0800 Subject: [PATCH 1218/2161] drivers/acpi: pfru: fix not found commit opencloudos. Replace with . had not been introduced. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/pfru/pfru_telemetry.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/pfru/pfru_telemetry.c b/drivers/acpi/pfru/pfru_telemetry.c index 726f2a03dd11..5eadf70d91e9 100644 --- a/drivers/acpi/pfru/pfru_telemetry.c +++ b/drivers/acpi/pfru/pfru_telemetry.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include -- Gitee From 151a3c03eceefacbead0d173e4daa0d71f2fef77 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 18 May 2022 16:40:20 +0800 Subject: [PATCH 1219/2161] selftests/pfru: fix typedef of __u32 missing in userspace lib commit opencloudos. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/pfru/pfru.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/pfru/pfru.h b/tools/testing/selftests/pfru/pfru.h index 8cd4ed80b161..655fc0bd7f6c 100644 --- a/tools/testing/selftests/pfru/pfru.h +++ b/tools/testing/selftests/pfru/pfru.h @@ -37,6 +37,8 @@ static inline int valid_revid(int id) return (id == REVID_1) || (id == REVID_2); } +typedef unsigned int __u32; + /* Capsule file payload header */ struct payload_hdr { __u32 sig; -- Gitee From 62f1a3b5108137312b69e0e75f797464f3d3e55f Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 6 Jan 2022 15:54:48 +0800 Subject: [PATCH 1220/2161] ACPI: pfr_update: Fix return value check in pfru_write() commit 31834aaa4e2a26d8d1f6b36703bb35cfdb8fc98c upstream. In case of error, memremap() returns NULL pointer not ERR_PTR(). The IS_ERR() test in the return value check should be replaced with NULL test. Fixes: 0db89fa243e5 ("ACPI: Introduce Platform Firmware Runtime Update device driver") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Acked-by: Chen Yu Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/pfru/pfru_update.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/pfru/pfru_update.c b/drivers/acpi/pfru/pfru_update.c index 826935514509..ac6ac0152e4d 100644 --- a/drivers/acpi/pfru/pfru_update.c +++ b/drivers/acpi/pfru/pfru_update.c @@ -425,8 +425,8 @@ static ssize_t pfru_write(struct file *file, const char __user *buf, /* map the communication buffer */ phy_addr = (phys_addr_t)(info.addr_lo | (info.addr_hi << 32)); buf_ptr = memremap(phy_addr, info.buf_size, MEMREMAP_WB); - if (IS_ERR(buf_ptr)) - return PTR_ERR(buf_ptr); + if (!buf_ptr) + return -ENOMEM; if (!copy_from_iter_full(buf_ptr, len, &iter)) { pr_err("error! could not read capsule file\n"); ret = -EINVAL; -- Gitee From fc7d8991d22a1e04c86eed679bff480736c86b78 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 18 May 2022 17:24:27 +0800 Subject: [PATCH 1221/2161] ACPI: pfr_telemetry: Fix info leak in pfrt_log_ioctl() commit 7bf2e4d5ca1c94a9b0f730498b3d01768a72dcbd upstream. The "data_info" struct is copied to the user. It has a 4 byte struct hole after the last struct member so we need to memset that to avoid copying uninitialized stack data to the user. Fixes: b0013e037a8b ("ACPI: Introduce Platform Firmware Runtime Telemetry driver") Signed-off-by: Dan Carpenter Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/pfru/pfru_telemetry.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/pfru/pfru_telemetry.c b/drivers/acpi/pfru/pfru_telemetry.c index 5eadf70d91e9..5b80eee891c5 100644 --- a/drivers/acpi/pfru/pfru_telemetry.c +++ b/drivers/acpi/pfru/pfru_telemetry.c @@ -42,6 +42,7 @@ static int get_pfru_data_info(struct telem_data_info *data_info, pf_telem_dev = get_pfru_telem_dev(); handle = ACPI_HANDLE(pf_telem_dev->dev); + memset(data_info, 0, sizeof(*data_info)); memset(&in_obj, 0, sizeof(in_obj)); memset(&in_buf, 0, sizeof(in_buf)); in_obj.type = ACPI_TYPE_PACKAGE; -- Gitee From bed91dcca21359c9c29ecd935b262e06ff12e1b9 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 15 Jan 2020 17:28:51 +0800 Subject: [PATCH 1222/2161] x86/resctrl: Add task resctrl information display commit e79f15a4598c1f3f3f7f3319ca308c63c91fdaf2 upstream. Monitoring tools that want to find out which resctrl control and monitor groups a task belongs to must currently read the "tasks" file in every group until they locate the process ID. Add an additional file /proc/{pid}/cpu_resctrl_groups to provide this information: 1) res: mon: resctrl is not available. 2) res:/ mon: Task is part of the root resctrl control group, and it is not associated to any monitor group. 3) res:/ mon:mon0 Task is part of the root resctrl control group and monitor group mon0. 4) res:group0 mon: Task is part of resctrl control group group0, and it is not associated to any monitor group. 5) res:group0 mon:mon1 Task is part of resctrl control group group0 and monitor group mon1. 
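As an illustration (the PID and group names are hypothetical), a monitoring tool can now resolve a task's groups with a single read instead of walking the "tasks" file of every resctrl group:

    $ cat /proc/1234/cpu_resctrl_groups
    res:group0
    mon:mon1

This output corresponds to case 5) above: the task belongs to control group group0 and monitor group mon1.
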
Signed-off-by: Chen Yu Signed-off-by: Borislav Petkov Tested-by: Jinshi Chen Link: https://lkml.kernel.org/r/20200115092851.14761-1-yu.c.chen@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/Kconfig | 1 + arch/x86/kernel/cpu/resctrl/rdtgroup.c | 86 ++++++++++++++++++++++++++ fs/proc/Kconfig | 4 ++ fs/proc/base.c | 7 +++ include/linux/resctrl.h | 14 +++++ 5 files changed, 112 insertions(+) create mode 100644 include/linux/resctrl.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a69c3289f325..18cdd5f87c82 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -457,6 +457,7 @@ config X86_CPU_RESCTRL bool "x86 CPU resource control support" depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD) select KERNFS + select PROC_CPU_RESCTRL if PROC_FS help Enable x86 CPU resource control support. diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 28f786289fce..9612955a15a2 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -728,6 +728,92 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, return ret; } +#ifdef CONFIG_PROC_CPU_RESCTRL + +/* + * A task can only be part of one resctrl control group and of one monitor + * group which is associated to that control group. + * + * 1) res: + * mon: + * + * resctrl is not available. + * + * 2) res:/ + * mon: + * + * Task is part of the root resctrl control group, and it is not associated + * to any monitor group. + * + * 3) res:/ + * mon:mon0 + * + * Task is part of the root resctrl control group and monitor group mon0. + * + * 4) res:group0 + * mon: + * + * Task is part of resctrl control group group0, and it is not associated + * to any monitor group. + * + * 5) res:group0 + * mon:mon1 + * + * Task is part of resctrl control group group0 and monitor group mon1. + */ +int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) +{ + struct rdtgroup *rdtg; + int ret = 0; + + mutex_lock(&rdtgroup_mutex); + + /* Return empty if resctrl has not been mounted. */ + if (!static_branch_unlikely(&rdt_enable_key)) { + seq_puts(s, "res:\nmon:\n"); + goto unlock; + } + + list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) { + struct rdtgroup *crg; + + /* + * Task information is only relevant for shareable + * and exclusive groups. + */ + if (rdtg->mode != RDT_MODE_SHAREABLE && + rdtg->mode != RDT_MODE_EXCLUSIVE) + continue; + + if (rdtg->closid != tsk->closid) + continue; + + seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", + rdtg->kn->name); + seq_puts(s, "mon:"); + list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, + mon.crdtgrp_list) { + if (tsk->rmid != crg->mon.rmid) + continue; + seq_printf(s, "%s", crg->kn->name); + break; + } + seq_putc(s, '\n'); + goto unlock; + } + /* + * The above search should succeed. Otherwise return + * with an error. 
+ */ + ret = -ENOENT; +unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} +#endif + static int rdt_last_cmd_status_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index cb5629bd5fff..ae96a339d24d 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -103,3 +103,7 @@ config PROC_CHILDREN config PROC_PID_ARCH_STATUS def_bool n depends on PROC_FS + +config PROC_CPU_RESCTRL + def_bool n + depends on PROC_FS diff --git a/fs/proc/base.c b/fs/proc/base.c index 519f2d89ec9f..54480df1c66e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -94,6 +94,7 @@ #include #include #include +#include #include #include "internal.h" #include "fd.h" @@ -3060,6 +3061,9 @@ static const struct pid_entry tgid_base_stuff[] = { #endif #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), +#endif +#ifdef CONFIG_PROC_CPU_RESCTRL + ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), #endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), @@ -3461,6 +3465,9 @@ static const struct pid_entry tid_base_stuff[] = { #endif #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), +#endif +#ifdef CONFIG_PROC_CPU_RESCTRL + ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), #endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h new file mode 100644 index 000000000000..daf5cf64c6a6 --- /dev/null +++ b/include/linux/resctrl.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RESCTRL_H +#define _RESCTRL_H + +#ifdef CONFIG_PROC_CPU_RESCTRL + +int proc_resctrl_show(struct seq_file *m, + struct pid_namespace *ns, + struct pid *pid, + struct task_struct *tsk); + +#endif + +#endif /* _RESCTRL_H */ -- Gitee From 133ddd6002dbe4f3ee06ea2341c9c2bed35776d5 Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Thu, 9 Jan 2020 00:28:06 +0800 Subject: [PATCH 1223/2161] x86/resctrl: Clean up unused function parameter in mkdir path commit 32ada3b9e04c6f6d4916967bd8bbe2450ad5bc5e upstream. Commit 334b0f4e9b1b ("x86/resctrl: Fix a deadlock due to inaccurate reference") changed the argument to rdtgroup_kn_lock_live()/rdtgroup_kn_unlock() within mkdir_rdt_prepare(). That change resulted in an unused function parameter to mkdir_rdt_prepare(). Clean up the unused function parameter in mkdir_rdt_prepare() and its callers rdtgroup_mkdir_mon() and rdtgroup_mkdir_ctrl_mon(). Signed-off-by: Xiaochen Shen Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Reviewed-by: Tony Luck Acked-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1578500886-21771-5-git-send-email-xiaochen.shen@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 9612955a15a2..ddc184841ea8 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2725,7 +2725,6 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) } static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, - struct kernfs_node *prgrp_kn, const char *name, umode_t mode, enum rdt_group_type rtype, struct rdtgroup **r) { @@ -2836,15 +2835,12 @@ static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) * to monitor a subset of tasks and cpus in its parent ctrl_mon group. 
*/ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, - struct kernfs_node *prgrp_kn, - const char *name, - umode_t mode) + const char *name, umode_t mode) { struct rdtgroup *rdtgrp, *prgrp; int ret; - ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP, - &rdtgrp); + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp); if (ret) return ret; @@ -2866,7 +2862,6 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, * to allocate and monitor resources. */ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, - struct kernfs_node *prgrp_kn, const char *name, umode_t mode) { struct rdtgroup *rdtgrp; @@ -2874,8 +2869,7 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, u32 closid; int ret; - ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP, - &rdtgrp); + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp); if (ret) return ret; @@ -2949,14 +2943,14 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, * subdirectory */ if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) - return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode); + return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); /* * If RDT monitoring is supported and the parent directory is a valid * "mon_groups" directory, add a monitoring subdirectory. */ if (rdt_mon_capable && is_mon_groups(parent_kn, name)) - return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode); + return rdtgroup_mkdir_mon(parent_kn, name, mode); return -EPERM; } -- Gitee From db2b35ffc947eef925d34a7483eac587952cb89c Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 16 Jan 2020 13:32:34 -0800 Subject: [PATCH 1224/2161] selftests/resctrl: Add README for resctrl tests commit 034c7678dd2c05fb08e6fa3b0ac527ead8f1b11b upstream. resctrl tests will be implemented. README is added for the tool first. Co-developed-by: Babu Moger Signed-off-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/README | 53 ++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 tools/testing/selftests/resctrl/README diff --git a/tools/testing/selftests/resctrl/README b/tools/testing/selftests/resctrl/README new file mode 100644 index 000000000000..6e5a0ffa18e8 --- /dev/null +++ b/tools/testing/selftests/resctrl/README @@ -0,0 +1,53 @@ +resctrl_tests - resctrl file system test suit + +Authors: + Fenghua Yu + Sai Praneeth Prakhya , + +resctrl_tests tests various resctrl functionalities and interfaces including +both software and hardware. + +Currently it supports Memory Bandwidth Monitoring test and Memory Bandwidth +Allocation test on Intel RDT hardware. More tests will be added in the future. +And the test suit can be extended to cover AMD QoS and ARM MPAM hardware +as well. + +BUILD +----- + +Run "make" to build executable file "resctrl_tests". + +RUN +--- + +To use resctrl_tests, root or sudoer privileges are required. This is because +the test needs to mount resctrl file system and change contents in the file +system. + +Executing the test without any parameter will run all supported tests: + + sudo ./resctrl_tests + +OVERVIEW OF EXECUTION +--------------------- + +A test case has four stages: + + - setup: mount resctrl file system, create group, setup schemata, move test + process pids to tasks, start benchmark. 
+ - execute: let benchmark run + - verify: get resctrl data and verify the data with another source, e.g. + perf event. + - teardown: umount resctrl and clear temporary files. + +ARGUMENTS +--------- + +Parameter '-h' shows usage information. + +usage: resctrl_tests [-h] [-b "benchmark_cmd [options]"] [-t test list] [-n no_of_bits] + -b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CQM default benchmark is builtin fill_buf + -t test list: run tests specified in the test list, e.g. -t mbm, mba, cqm, cat + -n no_of_bits: run cache tests using specified no of bits in cache bit mask + -p cpu_no: specify CPU number to run the test. 1 is default + -h: help -- Gitee From 1f3c311592c4b448f0f27e04154d822c774ea3ab Mon Sep 17 00:00:00 2001 From: Sai Praneeth Prakhya Date: Thu, 16 Jan 2020 13:32:35 -0800 Subject: [PATCH 1225/2161] selftests/resctrl: Add basic resctrl file system operations and data commit 591a6e8588fc1dbdc04355e8ad7b0be43d221c9c upstream. The basic resctrl file system operations and data are added for future usage by resctrl selftest tool. Signed-off-by: Sai Praneeth Prakhya Co-developed-by: Babu Moger Signed-off-by: Babu Moger Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/Makefile | 12 + tools/testing/selftests/resctrl/resctrl.h | 50 +++ tools/testing/selftests/resctrl/resctrlfs.c | 462 ++++++++++++++++++++ 3 files changed, 524 insertions(+) create mode 100644 tools/testing/selftests/resctrl/Makefile create mode 100644 tools/testing/selftests/resctrl/resctrl.h create mode 100644 tools/testing/selftests/resctrl/resctrlfs.c diff --git a/tools/testing/selftests/resctrl/Makefile b/tools/testing/selftests/resctrl/Makefile new file mode 100644 index 000000000000..76bbd6d3e4a8 --- /dev/null +++ b/tools/testing/selftests/resctrl/Makefile @@ -0,0 +1,12 @@ +CC = $(CROSS_COMPILE)gcc +CFLAGS = -g -Wall +SRCS=$(wildcard *.c) +OBJS=$(SRCS:.c=.o) + +$(OBJS): $(SRCS) + $(CC) $(CFLAGS) -c $(SRCS) + +.PHONY: clean + +clean: + $(RM) $(OBJS) diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h new file mode 100644 index 000000000000..ba98cd6efc64 --- /dev/null +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#define _GNU_SOURCE +#ifndef RESCTRL_H +#define RESCTRL_H +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RESCTRL_PATH "/sys/fs/resctrl" +#define PHYS_ID_PATH "/sys/devices/system/cpu/cpu" + +#define PARENT_EXIT(err_msg) \ + do { \ + perror(err_msg); \ + kill(ppid, SIGKILL); \ + exit(EXIT_FAILURE); \ + } while (0) + +pid_t bm_pid, ppid; + +int remount_resctrlfs(bool mum_resctrlfs); +int get_resource_id(int cpu_no, int *resource_id); +int validate_bw_report_request(char *bw_report); +bool validate_resctrl_feature_request(char *resctrl_val); +char *fgrep(FILE *inf, const char *str); +int taskset_benchmark(pid_t bm_pid, int cpu_no); +void run_benchmark(int signum, siginfo_t *info, void *ucontext); +int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, + char *resctrl_val); +int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp, + char *resctrl_val); +int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, + int group_fd, unsigned long flags); +int run_fill_buf(unsigned long span, int 
malloc_and_init_memory, int memflush, + int op, char *resctrl_va); + +#endif /* RESCTRL_H */ diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c new file mode 100644 index 000000000000..1b125f9d8e5d --- /dev/null +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -0,0 +1,462 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Basic resctrl file system operations + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya , + * Fenghua Yu + */ +#include "resctrl.h" + +int tests_run; + +static int find_resctrl_mount(char *buffer) +{ + FILE *mounts; + char line[256], *fs, *mntpoint; + + mounts = fopen("/proc/mounts", "r"); + if (!mounts) { + perror("/proc/mounts"); + return -ENXIO; + } + while (!feof(mounts)) { + if (!fgets(line, 256, mounts)) + break; + fs = strtok(line, " \t"); + if (!fs) + continue; + mntpoint = strtok(NULL, " \t"); + if (!mntpoint) + continue; + fs = strtok(NULL, " \t"); + if (!fs) + continue; + if (strcmp(fs, "resctrl")) + continue; + + fclose(mounts); + if (buffer) + strncpy(buffer, mntpoint, 256); + + return 0; + } + + fclose(mounts); + + return -ENOENT; +} + +/* + * remount_resctrlfs - Remount resctrl FS at /sys/fs/resctrl + * @mum_resctrlfs: Should the resctrl FS be remounted? + * + * If not mounted, mount it. + * If mounted and mum_resctrlfs then remount resctrl FS. + * If mounted and !mum_resctrlfs then noop + * + * Return: 0 on success, non-zero on failure + */ +int remount_resctrlfs(bool mum_resctrlfs) +{ + char mountpoint[256]; + int ret; + + ret = find_resctrl_mount(mountpoint); + if (ret) + strcpy(mountpoint, RESCTRL_PATH); + + if (!ret && mum_resctrlfs && umount(mountpoint)) { + printf("not ok unmounting \"%s\"\n", mountpoint); + perror("# umount"); + tests_run++; + } + + if (!ret && !mum_resctrlfs) + return 0; + + ret = mount("resctrl", RESCTRL_PATH, "resctrl", 0, NULL); + printf("%sok mounting resctrl to \"%s\"\n", ret ? "not " : "", + RESCTRL_PATH); + if (ret) + perror("# mount"); + + tests_run++; + + return ret; +} + +int umount_resctrlfs(void) +{ + if (umount(RESCTRL_PATH)) { + perror("# Unable to umount resctrl"); + + return errno; + } + + return 0; +} + +/* + * get_resource_id - Get socket number/l3 id for a specified CPU + * @cpu_no: CPU number + * @resource_id: Socket number or l3_id + * + * Return: >= 0 on success, < 0 on failure. + */ +int get_resource_id(int cpu_no, int *resource_id) +{ + char phys_pkg_path[1024]; + FILE *fp; + + sprintf(phys_pkg_path, "%s%d/topology/physical_package_id", + PHYS_ID_PATH, cpu_no); + fp = fopen(phys_pkg_path, "r"); + if (!fp) { + perror("Failed to open physical_package_id"); + + return -1; + } + if (fscanf(fp, "%d", resource_id) <= 0) { + perror("Could not get socket number or l3 id"); + fclose(fp); + + return -1; + } + fclose(fp); + + return 0; +} + +/* + * taskset_benchmark - Taskset PID (i.e. benchmark) to a specified cpu + * @bm_pid: PID that should be binded + * @cpu_no: CPU number at which the PID would be binded + * + * Return: 0 on success, non-zero on failure + */ +int taskset_benchmark(pid_t bm_pid, int cpu_no) +{ + cpu_set_t my_set; + + CPU_ZERO(&my_set); + CPU_SET(cpu_no, &my_set); + + if (sched_setaffinity(bm_pid, sizeof(cpu_set_t), &my_set)) { + perror("Unable to taskset benchmark"); + + return -1; + } + + return 0; +} + +/* + * run_benchmark - Run a specified benchmark or fill_buf (default benchmark) + * in specified signal. Direct benchmark stdio to /dev/null. 
+ * @signum: signal number + * @info: signal info + * @ucontext: user context in signal handling + * + * Return: void + */ +void run_benchmark(int signum, siginfo_t *info, void *ucontext) +{ + unsigned long span; + int operation, ret; + char **benchmark_cmd; + FILE *fp; + + benchmark_cmd = info->si_ptr; + + /* + * Direct stdio of child to /dev/null, so that only parent writes to + * stdio (console) + */ + fp = freopen("/dev/null", "w", stdout); + if (!fp) + PARENT_EXIT("Unable to direct benchmark status to /dev/null"); + + if (strcmp(benchmark_cmd[0], "fill_buf") == 0) { + /* Execute default fill_buf benchmark */ + span = strtoul(benchmark_cmd[1], NULL, 10); + operation = atoi(benchmark_cmd[4]); + if (run_fill_buf(span, 1, 1, operation, NULL)) + fprintf(stderr, "Error in running fill buffer\n"); + } else { + /* Execute specified benchmark */ + ret = execvp(benchmark_cmd[0], benchmark_cmd); + if (ret) + perror("wrong\n"); + } + + fclose(stdout); + PARENT_EXIT("Unable to run specified benchmark"); +} + +/* + * create_grp - Create a group only if one doesn't exist + * @grp_name: Name of the group + * @grp: Full path and name of the group + * @parent_grp: Full path and name of the parent group + * + * Return: 0 on success, non-zero on failure + */ +static int create_grp(const char *grp_name, char *grp, const char *parent_grp) +{ + int found_grp = 0; + struct dirent *ep; + DIR *dp; + + /* Check if requested grp exists or not */ + dp = opendir(parent_grp); + if (dp) { + while ((ep = readdir(dp)) != NULL) { + if (strcmp(ep->d_name, grp_name) == 0) + found_grp = 1; + } + closedir(dp); + } else { + perror("Unable to open resctrl for group"); + + return -1; + } + + /* Requested grp doesn't exist, hence create it */ + if (found_grp == 0) { + if (mkdir(grp, 0) == -1) { + perror("Unable to create group"); + + return -1; + } + } + + return 0; +} + +static int write_pid_to_tasks(char *tasks, pid_t pid) +{ + FILE *fp; + + fp = fopen(tasks, "w"); + if (!fp) { + perror("Failed to open tasks file"); + + return -1; + } + if (fprintf(fp, "%d\n", pid) < 0) { + perror("Failed to wr pid to tasks file"); + fclose(fp); + + return -1; + } + fclose(fp); + + return 0; +} + +/* + * write_bm_pid_to_resctrl - Write a PID (i.e. benchmark) to resctrl FS + * @bm_pid: PID that should be written + * @ctrlgrp: Name of the control monitor group (con_mon grp) + * @mongrp: Name of the monitor group (mon grp) + * @resctrl_val: Resctrl feature (Eg: mbm, mba.. etc) + * + * If a con_mon grp is requested, create it and write pid to it, otherwise + * write pid to root con_mon grp. + * If a mon grp is requested, create it and write pid to it, otherwise + * pid is not written, this means that pid is in con_mon grp and hence + * should consult con_mon grp's mon_data directory for results. 
+ * + * Return: 0 on success, non-zero on failure + */ +int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp, + char *resctrl_val) +{ + char controlgroup[128], monitorgroup[512], monitorgroup_p[256]; + char tasks[1024]; + int ret = 0; + + if (strlen(ctrlgrp)) + sprintf(controlgroup, "%s/%s", RESCTRL_PATH, ctrlgrp); + else + sprintf(controlgroup, "%s", RESCTRL_PATH); + + /* Create control and monitoring group and write pid into it */ + ret = create_grp(ctrlgrp, controlgroup, RESCTRL_PATH); + if (ret) + goto out; + sprintf(tasks, "%s/tasks", controlgroup); + ret = write_pid_to_tasks(tasks, bm_pid); + if (ret) + goto out; + + /* Create mon grp and write pid into it for "mbm" test */ + if ((strcmp(resctrl_val, "mbm") == 0)) { + if (mongrp) { + sprintf(monitorgroup_p, "%s/mon_groups", controlgroup); + sprintf(monitorgroup, "%s/%s", monitorgroup_p, mongrp); + ret = create_grp(mongrp, monitorgroup, monitorgroup_p); + if (ret) + goto out; + + sprintf(tasks, "%s/mon_groups/%s/tasks", + controlgroup, mongrp); + ret = write_pid_to_tasks(tasks, bm_pid); + if (ret) + goto out; + } + } + +out: + printf("%sok writing benchmark parameters to resctrl FS\n", + ret ? "not " : ""); + if (ret) + perror("# writing to resctrlfs"); + + tests_run++; + + return ret; +} + +/* + * write_schemata - Update schemata of a con_mon grp + * @ctrlgrp: Name of the con_mon grp + * @schemata: Schemata that should be updated to + * @cpu_no: CPU number that the benchmark PID is binded to + * @resctrl_val: Resctrl feature (Eg: mbm, mba.. etc) + * + * Update schemata of a con_mon grp *only* if requested resctrl feature is + * allocation type + * + * Return: 0 on success, non-zero on failure + */ +int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) +{ + char controlgroup[1024], schema[1024], reason[64]; + int resource_id, ret = 0; + FILE *fp; + + if (strcmp(resctrl_val, "mba") != 0) + return -ENOENT; + + if (!schemata) { + printf("# Skipping empty schemata update\n"); + + return -1; + } + + if (get_resource_id(cpu_no, &resource_id) < 0) { + sprintf(reason, "Failed to get resource id"); + ret = -1; + + goto out; + } + + if (strlen(ctrlgrp) != 0) + sprintf(controlgroup, "%s/%s/schemata", RESCTRL_PATH, ctrlgrp); + else + sprintf(controlgroup, "%s/schemata", RESCTRL_PATH); + + sprintf(schema, "%s%d%c%s", "MB:", resource_id, '=', schemata); + + fp = fopen(controlgroup, "w"); + if (!fp) { + sprintf(reason, "Failed to open control group"); + ret = -1; + + goto out; + } + + if (fprintf(fp, "%s\n", schema) < 0) { + sprintf(reason, "Failed to write schemata in control group"); + fclose(fp); + ret = -1; + + goto out; + } + fclose(fp); + +out: + printf("%sok Write schema \"%s\" to resctrl FS%s%s\n", + ret ? "not " : "", schema, ret ? " # " : "", + ret ? reason : ""); + tests_run++; + + return ret; +} + +char *fgrep(FILE *inf, const char *str) +{ + char line[256]; + int slen = strlen(str); + + while (!feof(inf)) { + if (!fgets(line, 256, inf)) + break; + if (strncmp(line, str, slen)) + continue; + + return strdup(line); + } + + return NULL; +} + +/* + * validate_resctrl_feature_request - Check if requested feature is valid. 
+ * @resctrl_val: Requested feature + * + * Return: 0 on success, non-zero on failure + */ +bool validate_resctrl_feature_request(char *resctrl_val) +{ + FILE *inf = fopen("/proc/cpuinfo", "r"); + bool found = false; + char *res; + + if (!inf) + return false; + + res = fgrep(inf, "flags"); + + if (res) { + char *s = strchr(res, ':'); + + found = s && !strstr(s, resctrl_val); + free(res); + } + fclose(inf); + + return found; +} + +int validate_bw_report_request(char *bw_report) +{ + if (strcmp(bw_report, "reads") == 0) + return 0; + if (strcmp(bw_report, "writes") == 0) + return 0; + if (strcmp(bw_report, "nt-writes") == 0) { + strcpy(bw_report, "writes"); + return 0; + } + if (strcmp(bw_report, "total") == 0) + return 0; + + fprintf(stderr, "Requested iMC B/W report type unavailable\n"); + + return -1; +} + +int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, + int group_fd, unsigned long flags) +{ + int ret; + + ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, + group_fd, flags); + return ret; +} -- Gitee From b46427be981fff9310489f1b38ba8738e5ca04b7 Mon Sep 17 00:00:00 2001 From: Sai Praneeth Prakhya Date: Thu, 16 Jan 2020 13:32:36 -0800 Subject: [PATCH 1226/2161] selftests/resctrl: Read memory bandwidth from perf IMC counter and from resctrl file system commit 1d3f08687d76c0a80be35bbab6d64b1a8f2730e8 upstream. Total memory bandwidth can be monitored from perf IMC counter and from resctrl file system. Later the two will be compared to verify the total memory bandwidth read from resctrl is correct. Signed-off-by: Sai Praneeth Prakhya Co-developed-by: Babu Moger Signed-off-by: Babu Moger Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/resctrl_val.c | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 tools/testing/selftests/resctrl/resctrl_val.c diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c new file mode 100644 index 000000000000..0ca4c9252516 --- /dev/null +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory bandwidth monitoring and allocation library + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya , + * Fenghua Yu + */ +#include "resctrl.h" + +#define UNCORE_IMC "uncore_imc" +#define READ_FILE_NAME "events/cas_count_read" +#define WRITE_FILE_NAME "events/cas_count_write" +#define DYN_PMU_PATH "/sys/bus/event_source/devices" +#define SCALE 0.00006103515625 +#define MAX_IMCS 20 +#define MAX_TOKENS 5 +#define READ 0 +#define WRITE 1 +#define CON_MON_MBM_LOCAL_BYTES_PATH \ + "%s/%s/mon_groups/%s/mon_data/mon_L3_%02d/mbm_local_bytes" + +#define CON_MBM_LOCAL_BYTES_PATH \ + "%s/%s/mon_data/mon_L3_%02d/mbm_local_bytes" + +#define MON_MBM_LOCAL_BYTES_PATH \ + "%s/mon_groups/%s/mon_data/mon_L3_%02d/mbm_local_bytes" + +#define MBM_LOCAL_BYTES_PATH \ + "%s/mon_data/mon_L3_%02d/mbm_local_bytes" + +struct membw_read_format { + __u64 value; /* The value of the event */ + __u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */ + __u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */ + __u64 id; /* if PERF_FORMAT_ID */ +}; + +struct imc_counter_config { + __u32 type; + __u64 event; + __u64 umask; + struct perf_event_attr pe; + struct membw_read_format return_value; + int fd; +}; + +static struct imc_counter_config imc_counters_config[MAX_IMCS][2]; + +void 
membw_initialize_perf_event_attr(int i, int j) +{ + memset(&imc_counters_config[i][j].pe, 0, + sizeof(struct perf_event_attr)); + imc_counters_config[i][j].pe.type = imc_counters_config[i][j].type; + imc_counters_config[i][j].pe.size = sizeof(struct perf_event_attr); + imc_counters_config[i][j].pe.disabled = 1; + imc_counters_config[i][j].pe.inherit = 1; + imc_counters_config[i][j].pe.exclude_guest = 0; + imc_counters_config[i][j].pe.config = + imc_counters_config[i][j].umask << 8 | + imc_counters_config[i][j].event; + imc_counters_config[i][j].pe.sample_type = PERF_SAMPLE_IDENTIFIER; + imc_counters_config[i][j].pe.read_format = + PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; +} + +void membw_ioctl_perf_event_ioc_reset_enable(int i, int j) +{ + ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_RESET, 0); + ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_ENABLE, 0); +} + +void membw_ioctl_perf_event_ioc_disable(int i, int j) +{ + ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_DISABLE, 0); +} + +/* + * get_event_and_umask: Parse config into event and umask + * @cas_count_cfg: Config + * @count: iMC number + * @op: Operation (read/write) + */ +void get_event_and_umask(char *cas_count_cfg, int count, bool op) +{ + char *token[MAX_TOKENS]; + int i = 0; + + strcat(cas_count_cfg, ","); + token[0] = strtok(cas_count_cfg, "=,"); + + for (i = 1; i < MAX_TOKENS; i++) + token[i] = strtok(NULL, "=,"); + + for (i = 0; i < MAX_TOKENS; i++) { + if (!token[i]) + break; + if (strcmp(token[i], "event") == 0) { + if (op == READ) + imc_counters_config[count][READ].event = + strtol(token[i + 1], NULL, 16); + else + imc_counters_config[count][WRITE].event = + strtol(token[i + 1], NULL, 16); + } + if (strcmp(token[i], "umask") == 0) { + if (op == READ) + imc_counters_config[count][READ].umask = + strtol(token[i + 1], NULL, 16); + else + imc_counters_config[count][WRITE].umask = + strtol(token[i + 1], NULL, 16); + } + } +} -- Gitee From 40a7ceac1c08de6c3095214be51a75070d995c03 Mon Sep 17 00:00:00 2001 From: Sai Praneeth Prakhya Date: Thu, 16 Jan 2020 13:32:37 -0800 Subject: [PATCH 1227/2161] selftests/resctrl: Add callback to start a benchmark commit 7f4d257e3a2a4864268f6f0da7c85ca4f083e643 upstream. The callback starts a child process and puts the child pid in created resctrl group with specified memory bandwidth in schemata. The child starts running benchmark. Signed-off-by: Sai Praneeth Prakhya Co-developed-by: Babu Moger Signed-off-by: Babu Moger Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/resctrl.h | 29 + tools/testing/selftests/resctrl/resctrl_val.c | 574 ++++++++++++++++++ 2 files changed, 603 insertions(+) diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index ba98cd6efc64..fb42087c904d 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -3,6 +3,8 @@ #ifndef RESCTRL_H #define RESCTRL_H #include +#include +#include #include #include #include @@ -19,6 +21,7 @@ #include #include +#define MB (1024 * 1024) #define RESCTRL_PATH "/sys/fs/resctrl" #define PHYS_ID_PATH "/sys/devices/system/cpu/cpu" @@ -29,10 +32,35 @@ exit(EXIT_FAILURE); \ } while (0) +/* + * resctrl_val_param: resctrl test parameters + * @resctrl_val: Resctrl feature (Eg: mbm, mba.. 
etc) + * @ctrlgrp: Name of the control monitor group (con_mon grp) + * @mongrp: Name of the monitor group (mon grp) + * @cpu_no: CPU number to which the benchmark would be binded + * @span: Memory bytes accessed in each benchmark iteration + * @mum_resctrlfs: Should the resctrl FS be remounted? + * @filename: Name of file to which the o/p should be written + * @bw_report: Bandwidth report type (reads vs writes) + * @setup: Call back function to setup test environment + */ +struct resctrl_val_param { + char *resctrl_val; + char ctrlgrp[64]; + char mongrp[64]; + int cpu_no; + unsigned long span; + int mum_resctrlfs; + char filename[64]; + char *bw_report; + int (*setup)(int num, ...); +}; + pid_t bm_pid, ppid; int remount_resctrlfs(bool mum_resctrlfs); int get_resource_id(int cpu_no, int *resource_id); +int umount_resctrlfs(void); int validate_bw_report_request(char *bw_report); bool validate_resctrl_feature_request(char *resctrl_val); char *fgrep(FILE *inf, const char *str); @@ -46,5 +74,6 @@ int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags); int run_fill_buf(unsigned long span, int malloc_and_init_memory, int memflush, int op, char *resctrl_va); +int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param); #endif /* RESCTRL_H */ diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index 0ca4c9252516..0d1bd03ec4be 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -47,6 +47,8 @@ struct imc_counter_config { int fd; }; +static char mbm_total_path[1024]; +static int imcs; static struct imc_counter_config imc_counters_config[MAX_IMCS][2]; void membw_initialize_perf_event_attr(int i, int j) @@ -115,3 +117,575 @@ void get_event_and_umask(char *cas_count_cfg, int count, bool op) } } } + +static int open_perf_event(int i, int cpu_no, int j) +{ + imc_counters_config[i][j].fd = + perf_event_open(&imc_counters_config[i][j].pe, -1, cpu_no, -1, + PERF_FLAG_FD_CLOEXEC); + + if (imc_counters_config[i][j].fd == -1) { + fprintf(stderr, "Error opening leader %llx\n", + imc_counters_config[i][j].pe.config); + + return -1; + } + + return 0; +} + +/* Get type and config (read and write) of an iMC counter */ +static int read_from_imc_dir(char *imc_dir, int count) +{ + char cas_count_cfg[1024], imc_counter_cfg[1024], imc_counter_type[1024]; + FILE *fp; + + /* Get type of iMC counter */ + sprintf(imc_counter_type, "%s%s", imc_dir, "type"); + fp = fopen(imc_counter_type, "r"); + if (!fp) { + perror("Failed to open imc counter type file"); + + return -1; + } + if (fscanf(fp, "%u", &imc_counters_config[count][READ].type) <= 0) { + perror("Could not get imc type"); + fclose(fp); + + return -1; + } + fclose(fp); + + imc_counters_config[count][WRITE].type = + imc_counters_config[count][READ].type; + + /* Get read config */ + sprintf(imc_counter_cfg, "%s%s", imc_dir, READ_FILE_NAME); + fp = fopen(imc_counter_cfg, "r"); + if (!fp) { + perror("Failed to open imc config file"); + + return -1; + } + if (fscanf(fp, "%s", cas_count_cfg) <= 0) { + perror("Could not get imc cas count read"); + fclose(fp); + + return -1; + } + fclose(fp); + + get_event_and_umask(cas_count_cfg, count, READ); + + /* Get write config */ + sprintf(imc_counter_cfg, "%s%s", imc_dir, WRITE_FILE_NAME); + fp = fopen(imc_counter_cfg, "r"); + if (!fp) { + perror("Failed to open imc config file"); + + return -1; + } + if (fscanf(fp, "%s", cas_count_cfg) <= 0) { + 
perror("Could not get imc cas count write"); + fclose(fp); + + return -1; + } + fclose(fp); + + get_event_and_umask(cas_count_cfg, count, WRITE); + + return 0; +} + +/* + * A system can have 'n' number of iMC (Integrated Memory Controller) + * counters, get that 'n'. For each iMC counter get it's type and config. + * Also, each counter has two configs, one for read and the other for write. + * A config again has two parts, event and umask. + * Enumerate all these details into an array of structures. + * + * Return: >= 0 on success. < 0 on failure. + */ +static int num_of_imcs(void) +{ + unsigned int count = 0; + char imc_dir[512]; + struct dirent *ep; + int ret; + DIR *dp; + + dp = opendir(DYN_PMU_PATH); + if (dp) { + while ((ep = readdir(dp))) { + if (strstr(ep->d_name, UNCORE_IMC)) { + sprintf(imc_dir, "%s/%s/", DYN_PMU_PATH, + ep->d_name); + ret = read_from_imc_dir(imc_dir, count); + if (ret) { + closedir(dp); + + return ret; + } + count++; + } + } + closedir(dp); + if (count == 0) { + perror("Unable find iMC counters!\n"); + + return -1; + } + } else { + perror("Unable to open PMU directory!\n"); + + return -1; + } + + return count; +} + +static int initialize_mem_bw_imc(void) +{ + int imc, j; + + imcs = num_of_imcs(); + if (imcs <= 0) + return imcs; + + /* Initialize perf_event_attr structures for all iMC's */ + for (imc = 0; imc < imcs; imc++) { + for (j = 0; j < 2; j++) + membw_initialize_perf_event_attr(imc, j); + } + + return 0; +} + +/* + * get_mem_bw_imc: Memory band width as reported by iMC counters + * @cpu_no: CPU number that the benchmark PID is binded to + * @bw_report: Bandwidth report type (reads, writes) + * + * Memory B/W utilized by a process on a socket can be calculated using + * iMC counters. Perf events are used to read these counters. + * + * Return: >= 0 on success. < 0 on failure. 
+ */ +static float get_mem_bw_imc(int cpu_no, char *bw_report) +{ + float reads, writes, of_mul_read, of_mul_write; + int imc, j, ret; + + /* Start all iMC counters to log values (both read and write) */ + reads = 0, writes = 0, of_mul_read = 1, of_mul_write = 1; + for (imc = 0; imc < imcs; imc++) { + for (j = 0; j < 2; j++) { + ret = open_perf_event(imc, cpu_no, j); + if (ret) + return -1; + } + for (j = 0; j < 2; j++) + membw_ioctl_perf_event_ioc_reset_enable(imc, j); + } + + sleep(1); + + /* Stop counters after a second to get results (both read and write) */ + for (imc = 0; imc < imcs; imc++) { + for (j = 0; j < 2; j++) + membw_ioctl_perf_event_ioc_disable(imc, j); + } + + /* + * Get results which are stored in struct type imc_counter_config + * Take over flow into consideration before calculating total b/w + */ + for (imc = 0; imc < imcs; imc++) { + struct imc_counter_config *r = + &imc_counters_config[imc][READ]; + struct imc_counter_config *w = + &imc_counters_config[imc][WRITE]; + + if (read(r->fd, &r->return_value, + sizeof(struct membw_read_format)) == -1) { + perror("Couldn't get read b/w through iMC"); + + return -1; + } + + if (read(w->fd, &w->return_value, + sizeof(struct membw_read_format)) == -1) { + perror("Couldn't get write bw through iMC"); + + return -1; + } + + __u64 r_time_enabled = r->return_value.time_enabled; + __u64 r_time_running = r->return_value.time_running; + + if (r_time_enabled != r_time_running) + of_mul_read = (float)r_time_enabled / + (float)r_time_running; + + __u64 w_time_enabled = w->return_value.time_enabled; + __u64 w_time_running = w->return_value.time_running; + + if (w_time_enabled != w_time_running) + of_mul_write = (float)w_time_enabled / + (float)w_time_running; + reads += r->return_value.value * of_mul_read * SCALE; + writes += w->return_value.value * of_mul_write * SCALE; + } + + for (imc = 0; imc < imcs; imc++) { + close(imc_counters_config[imc][READ].fd); + close(imc_counters_config[imc][WRITE].fd); + } + + if (strcmp(bw_report, "reads") == 0) + return reads; + + if (strcmp(bw_report, "writes") == 0) + return writes; + + return (reads + writes); +} + +void set_mbm_path(const char *ctrlgrp, const char *mongrp, int resource_id) +{ + if (ctrlgrp && mongrp) + sprintf(mbm_total_path, CON_MON_MBM_LOCAL_BYTES_PATH, + RESCTRL_PATH, ctrlgrp, mongrp, resource_id); + else if (!ctrlgrp && mongrp) + sprintf(mbm_total_path, MON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH, + mongrp, resource_id); + else if (ctrlgrp && !mongrp) + sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH, + ctrlgrp, resource_id); + else if (!ctrlgrp && !mongrp) + sprintf(mbm_total_path, MBM_LOCAL_BYTES_PATH, RESCTRL_PATH, + resource_id); +} + +/* + * initialize_mem_bw_resctrl: Appropriately populate "mbm_total_path" + * @ctrlgrp: Name of the control monitor group (con_mon grp) + * @mongrp: Name of the monitor group (mon grp) + * @cpu_no: CPU number that the benchmark PID is binded to + * @resctrl_val: Resctrl feature (Eg: mbm, mba.. 
etc) + */ +static void initialize_mem_bw_resctrl(const char *ctrlgrp, const char *mongrp, + int cpu_no, char *resctrl_val) +{ + int resource_id; + + if (get_resource_id(cpu_no, &resource_id) < 0) { + perror("Could not get resource_id"); + return; + } + + if (strcmp(resctrl_val, "mbm") == 0) + set_mbm_path(ctrlgrp, mongrp, resource_id); + + if ((strcmp(resctrl_val, "mba") == 0)) { + if (ctrlgrp) + sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH, + RESCTRL_PATH, ctrlgrp, resource_id); + else + sprintf(mbm_total_path, MBM_LOCAL_BYTES_PATH, + RESCTRL_PATH, resource_id); + } +} + +/* + * Get MBM Local bytes as reported by resctrl FS + * For MBM, + * 1. If con_mon grp and mon grp are given, then read from con_mon grp's mon grp + * 2. If only con_mon grp is given, then read from con_mon grp + * 3. If both are not given, then read from root con_mon grp + * For MBA, + * 1. If con_mon grp is given, then read from it + * 2. If con_mon grp is not given, then read from root con_mon grp + */ +static unsigned long get_mem_bw_resctrl(void) +{ + unsigned long mbm_total = 0; + FILE *fp; + + fp = fopen(mbm_total_path, "r"); + if (!fp) { + perror("Failed to open total bw file"); + + return -1; + } + if (fscanf(fp, "%lu", &mbm_total) <= 0) { + perror("Could not get mbm local bytes"); + fclose(fp); + + return -1; + } + fclose(fp); + + return mbm_total; +} + +pid_t bm_pid, ppid; + +static void ctrlc_handler(int signum, siginfo_t *info, void *ptr) +{ + kill(bm_pid, SIGKILL); + printf("Ending\n\n"); + + exit(EXIT_SUCCESS); +} + +/* + * print_results_bw: the memory bandwidth results are stored in a file + * @filename: file that stores the results + * @bm_pid: child pid that runs benchmark + * @bw_imc: perf imc counter value + * @bw_resc: memory bandwidth value + * + * Return: 0 on success. non-zero on failure. + */ +static int print_results_bw(char *filename, int bm_pid, float bw_imc, + unsigned long bw_resc) +{ + unsigned long diff = fabs(bw_imc - bw_resc); + FILE *fp; + + if (strcmp(filename, "stdio") == 0 || strcmp(filename, "stderr") == 0) { + printf("Pid: %d \t Mem_BW_iMC: %f \t ", bm_pid, bw_imc); + printf("Mem_BW_resc: %lu \t Difference: %lu\n", bw_resc, diff); + } else { + fp = fopen(filename, "a"); + if (!fp) { + perror("Cannot open results file"); + + return errno; + } + if (fprintf(fp, "Pid: %d \t Mem_BW_iMC: %f \t Mem_BW_resc: %lu \t Difference: %lu\n", + bm_pid, bw_imc, bw_resc, diff) <= 0) { + fclose(fp); + perror("Could not log results."); + + return errno; + } + fclose(fp); + } + + return 0; +} + +static int +measure_vals(struct resctrl_val_param *param, unsigned long *bw_resc_start) +{ + unsigned long bw_imc, bw_resc, bw_resc_end; + int ret; + + /* + * Measure memory bandwidth from resctrl and from + * another source which is perf imc value or could + * be something else if perf imc event is not available. + * Compare the two values to validate resctrl value. + * It takes 1sec to measure the data. + */ + bw_imc = get_mem_bw_imc(param->cpu_no, param->bw_report); + if (bw_imc <= 0) + return bw_imc; + + bw_resc_end = get_mem_bw_resctrl(); + if (bw_resc_end <= 0) + return bw_resc_end; + + bw_resc = (bw_resc_end - *bw_resc_start) / MB; + ret = print_results_bw(param->filename, bm_pid, bw_imc, bw_resc); + if (ret) + return ret; + + *bw_resc_start = bw_resc_end; + + return 0; +} + +/* + * resctrl_val: execute benchmark and measure memory bandwidth on + * the benchmark + * @benchmark_cmd: benchmark command and its arguments + * @param: parameters passed to resctrl_val() + * + * Return: 0 on success. 
non-zero on failure. + */ +int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) +{ + char *resctrl_val = param->resctrl_val; + unsigned long bw_resc_start = 0; + struct sigaction sigact; + int ret = 0, pipefd[2]; + char pipe_message = 0; + union sigval value; + + if (strcmp(param->filename, "") == 0) + sprintf(param->filename, "stdio"); + + if ((strcmp(resctrl_val, "mba")) == 0 || + (strcmp(resctrl_val, "mbm")) == 0) { + ret = validate_bw_report_request(param->bw_report); + if (ret) + return ret; + } + + ret = remount_resctrlfs(param->mum_resctrlfs); + if (ret) + return ret; + + /* + * If benchmark wasn't successfully started by child, then child should + * kill parent, so save parent's pid + */ + ppid = getpid(); + + if (pipe(pipefd)) { + perror("# Unable to create pipe"); + + return -1; + } + + /* + * Fork to start benchmark, save child's pid so that it can be killed + * when needed + */ + bm_pid = fork(); + if (bm_pid == -1) { + perror("# Unable to fork"); + + return -1; + } + + if (bm_pid == 0) { + /* + * Mask all signals except SIGUSR1, parent uses SIGUSR1 to + * start benchmark + */ + sigfillset(&sigact.sa_mask); + sigdelset(&sigact.sa_mask, SIGUSR1); + + sigact.sa_sigaction = run_benchmark; + sigact.sa_flags = SA_SIGINFO; + + /* Register for "SIGUSR1" signal from parent */ + if (sigaction(SIGUSR1, &sigact, NULL)) + PARENT_EXIT("Can't register child for signal"); + + /* Tell parent that child is ready */ + close(pipefd[0]); + pipe_message = 1; + if (write(pipefd[1], &pipe_message, sizeof(pipe_message)) < + sizeof(pipe_message)) { + perror("# failed signaling parent process"); + close(pipefd[1]); + return -1; + } + close(pipefd[1]); + + /* Suspend child until delivery of "SIGUSR1" from parent */ + sigsuspend(&sigact.sa_mask); + + PARENT_EXIT("Child is done"); + } + + printf("# benchmark PID: %d\n", bm_pid); + + /* + * Register CTRL-C handler for parent, as it has to kill benchmark + * before exiting + */ + sigact.sa_sigaction = ctrlc_handler; + sigemptyset(&sigact.sa_mask); + sigact.sa_flags = SA_SIGINFO; + if (sigaction(SIGINT, &sigact, NULL) || + sigaction(SIGHUP, &sigact, NULL)) { + perror("# sigaction"); + ret = errno; + goto out; + } + + value.sival_ptr = benchmark_cmd; + + /* Taskset benchmark to specified cpu */ + ret = taskset_benchmark(bm_pid, param->cpu_no); + if (ret) + goto out; + + /* Write benchmark to specified control&monitoring grp in resctrl FS */ + ret = write_bm_pid_to_resctrl(bm_pid, param->ctrlgrp, param->mongrp, + resctrl_val); + if (ret) + goto out; + + if ((strcmp(resctrl_val, "mbm") == 0) || + (strcmp(resctrl_val, "mba") == 0)) { + ret = initialize_mem_bw_imc(); + if (ret) + goto out; + + initialize_mem_bw_resctrl(param->ctrlgrp, param->mongrp, + param->cpu_no, resctrl_val); + } + + /* Parent waits for child to be ready. */ + close(pipefd[1]); + while (pipe_message != 1) { + if (read(pipefd[0], &pipe_message, sizeof(pipe_message)) < + sizeof(pipe_message)) { + perror("# failed reading message from child process"); + close(pipefd[0]); + goto out; + } + } + close(pipefd[0]); + + /* Signal child to start benchmark */ + if (sigqueue(bm_pid, SIGUSR1, value) == -1) { + perror("# sigqueue SIGUSR1 to child"); + ret = errno; + goto out; + } + + /* Give benchmark enough time to fully run */ + sleep(1); + + /* Test runs until the callback setup() tells the test to stop. 
*/ + while (1) { + if (strcmp(resctrl_val, "mbm") == 0) { + ret = param->setup(1, param); + if (ret) { + ret = 0; + break; + } + + ret = measure_vals(param, &bw_resc_start); + if (ret) + break; + } else if ((strcmp(resctrl_val, "mba") == 0)) { + ret = param->setup(1, param); + if (ret) { + ret = 0; + break; + } + + ret = measure_vals(param, &bw_resc_start); + if (ret) + break; + } else { + break; + } + } + +out: + kill(bm_pid, SIGKILL); + umount_resctrlfs(); + + return ret; +} -- Gitee From 153b7f384b2755f73cdc1a70e5a894cea9eac949 Mon Sep 17 00:00:00 2001 From: Sai Praneeth Prakhya Date: Thu, 16 Jan 2020 13:32:38 -0800 Subject: [PATCH 1228/2161] selftests/resctrl: Add built in benchmark commit a2561b12fe392e0024e3187d325dbcbde7b99c04 upstream. Built-in benchmark fill_buf generates stressful memory bandwidth and cache traffic. Later it will be used as a default benchmark by various resctrl tests such as MBA (Memory Bandwidth Allocation) and MBM (Memory Bandwidth Monitoring) tests. Signed-off-by: Sai Praneeth Prakhya Co-developed-by: Babu Moger Signed-off-by: Babu Moger Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/fill_buf.c | 207 +++++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 tools/testing/selftests/resctrl/fill_buf.c diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c new file mode 100644 index 000000000000..b76bf6a4a0a5 --- /dev/null +++ b/tools/testing/selftests/resctrl/fill_buf.c @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fill_buf benchmark + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya , + * Fenghua Yu + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "resctrl.h" + +#define CL_SIZE (64) +#define PAGE_SIZE (4 * 1024) +#define MB (1024 * 1024) + +static unsigned char *startptr; + +static void sb(void) +{ +#if defined(__i386) || defined(__x86_64) + asm volatile("sfence\n\t" + : : : "memory"); +#endif +} + +static void ctrl_handler(int signo) +{ + free(startptr); + printf("\nEnding\n"); + sb(); + exit(EXIT_SUCCESS); +} + +static void cl_flush(void *p) +{ +#if defined(__i386) || defined(__x86_64) + asm volatile("clflush (%0)\n\t" + : : "r"(p) : "memory"); +#endif +} + +static void mem_flush(void *p, size_t s) +{ + char *cp = (char *)p; + size_t i = 0; + + s = s / CL_SIZE; /* mem size in cache llines */ + + for (i = 0; i < s; i++) + cl_flush(&cp[i * CL_SIZE]); + + sb(); +} + +static void *malloc_and_init_memory(size_t s) +{ + uint64_t *p64; + size_t s64; + + void *p = memalign(PAGE_SIZE, s); + + p64 = (uint64_t *)p; + s64 = s / sizeof(uint64_t); + + while (s64 > 0) { + *p64 = (uint64_t)rand(); + p64 += (CL_SIZE / sizeof(uint64_t)); + s64 -= (CL_SIZE / sizeof(uint64_t)); + } + + return p; +} + +static int fill_one_span_read(unsigned char *start_ptr, unsigned char *end_ptr) +{ + unsigned char sum, *p; + + sum = 0; + p = start_ptr; + while (p < end_ptr) { + sum += *p; + p += (CL_SIZE / 2); + } + + return sum; +} + +static +void fill_one_span_write(unsigned char *start_ptr, unsigned char *end_ptr) +{ + unsigned char *p; + + p = start_ptr; + while (p < end_ptr) { + *p = '1'; + p += (CL_SIZE / 2); + } +} + +static int fill_cache_read(unsigned char *start_ptr, unsigned char *end_ptr, + char *resctrl_val) +{ + int ret = 0; + FILE *fp; + + while (1) + ret = fill_one_span_read(start_ptr, end_ptr); + + 
/* Consume read result so that reading memory is not optimized out. */ + fp = fopen("/dev/null", "w"); + if (!fp) + perror("Unable to write to /dev/null"); + fprintf(fp, "Sum: %d ", ret); + fclose(fp); + + return 0; +} + +static int fill_cache_write(unsigned char *start_ptr, unsigned char *end_ptr, + char *resctrl_val) +{ + while (1) + fill_one_span_write(start_ptr, end_ptr); + + return 0; +} + +static int +fill_cache(unsigned long long buf_size, int malloc_and_init, int memflush, + int op, char *resctrl_val) +{ + unsigned char *start_ptr, *end_ptr; + unsigned long long i; + int ret; + + if (malloc_and_init) + start_ptr = malloc_and_init_memory(buf_size); + else + start_ptr = malloc(buf_size); + + if (!start_ptr) + return -1; + + startptr = start_ptr; + end_ptr = start_ptr + buf_size; + + /* + * It's better to touch the memory once to avoid any compiler + * optimizations + */ + if (!malloc_and_init) { + for (i = 0; i < buf_size; i++) + *start_ptr++ = (unsigned char)rand(); + } + + start_ptr = startptr; + + /* Flush the memory before using to avoid "cache hot pages" effect */ + if (memflush) + mem_flush(start_ptr, buf_size); + + if (op == 0) + ret = fill_cache_read(start_ptr, end_ptr, resctrl_val); + else + ret = fill_cache_write(start_ptr, end_ptr, resctrl_val); + + if (ret) { + printf("\n Errror in fill cache read/write...\n"); + return -1; + } + + free(startptr); + + return 0; +} + +int run_fill_buf(unsigned long span, int malloc_and_init_memory, + int memflush, int op, char *resctrl_val) +{ + unsigned long long cache_size = span; + int ret; + + /* set up ctrl-c handler */ + if (signal(SIGINT, ctrl_handler) == SIG_ERR) + printf("Failed to catch SIGINT!\n"); + if (signal(SIGHUP, ctrl_handler) == SIG_ERR) + printf("Failed to catch SIGHUP!\n"); + + ret = fill_cache(cache_size, malloc_and_init_memory, memflush, op, + resctrl_val); + if (ret) { + printf("\n Errror in fill cache\n"); + return -1; + } + + return 0; +} -- Gitee From 8ac18a92c0c6a86c53e809e07a0c960dd621d728 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 16 Jan 2020 13:32:39 -0800 Subject: [PATCH 1229/2161] selftests/resctrl: Add MBM test commit ecdbb911f22d6cc5d422571dedf048b889d71417 upstream. MBM (Memory Bandwidth Monitoring) test is the first implemented selftest. It starts a stressful memory bandwidth benchmark and assigns the bandwidth pid in a resctrl monitoring group. Read and compare perf IMC counter and MBM total bytes for the benchmark. The numbers should be close enough to pass the test. Default benchmark is built-in fill_buf. But users can specify their own benchmark by option "-b". We can add memory bandwidth monitoring for multiple processes in the future. 
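As a rough illustration only (not part of the upstream change; a sketch written against the struct resctrl_val_param and resctrl_val() interfaces added earlier in this series, with every field value below a placeholder), a test is expected to fill in its parameters plus a per-iteration setup callback and hand both to the common runner:

	#include "resctrl.h"

	/* Illustrative per-iteration hook; returning non-zero ends the run. */
	static int example_setup(int num, ...)
	{
		return 0;
	}

	static int example_mbm_run(int span, int cpu_no, char *bw_report,
				   char **benchmark_cmd)
	{
		struct resctrl_val_param param = {
			.resctrl_val	= "mbm",
			.ctrlgrp	= "c1",		/* control/monitor group name */
			.mongrp		= "m1",		/* monitor group name */
			.cpu_no		= cpu_no,	/* CPU the benchmark is bound to */
			.span		= span,		/* memory accessed per iteration */
			.mum_resctrlfs	= 1,		/* remount the resctrl FS */
			.filename	= "result_example",
			.bw_report	= bw_report,	/* e.g. "reads" or "writes" */
			.setup		= example_setup,
		};

		/* Forks the benchmark, binds it to cpu_no, and compares iMC vs. resctrl bandwidth. */
		return resctrl_val(benchmark_cmd, &param);
	}
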
Co-developed-by: Sai Praneeth Prakhya Signed-off-by: Sai Praneeth Prakhya Co-developed-by: Babu Moger Signed-off-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/Makefile | 7 +- tools/testing/selftests/resctrl/mbm_test.c | 145 ++++++++++++++++++ tools/testing/selftests/resctrl/resctrl.h | 6 + .../testing/selftests/resctrl/resctrl_tests.c | 132 ++++++++++++++++ tools/testing/selftests/resctrl/resctrl_val.c | 2 + tools/testing/selftests/resctrl/resctrlfs.c | 77 ++++++++++ 6 files changed, 368 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/resctrl/mbm_test.c create mode 100644 tools/testing/selftests/resctrl/resctrl_tests.c diff --git a/tools/testing/selftests/resctrl/Makefile b/tools/testing/selftests/resctrl/Makefile index 76bbd6d3e4a8..d585cc1948cc 100644 --- a/tools/testing/selftests/resctrl/Makefile +++ b/tools/testing/selftests/resctrl/Makefile @@ -3,10 +3,15 @@ CFLAGS = -g -Wall SRCS=$(wildcard *.c) OBJS=$(SRCS:.c=.o) +all: resctrl_tests + $(OBJS): $(SRCS) $(CC) $(CFLAGS) -c $(SRCS) +resctrl_tests: $(OBJS) + $(CC) $(CFLAGS) -o $@ $^ + .PHONY: clean clean: - $(RM) $(OBJS) + $(RM) $(OBJS) resctrl_tests diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c new file mode 100644 index 000000000000..4700f7453f81 --- /dev/null +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory Bandwidth Monitoring (MBM) test + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya , + * Fenghua Yu + */ +#include "resctrl.h" + +#define RESULT_FILE_NAME "result_mbm" +#define MAX_DIFF 300 +#define NUM_OF_RUNS 5 + +static void +show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, int span) +{ + unsigned long avg_bw_imc = 0, avg_bw_resc = 0; + unsigned long sum_bw_imc = 0, sum_bw_resc = 0; + long avg_diff = 0; + int runs; + + /* + * Discard the first value which is inaccurate due to monitoring setup + * transition phase. + */ + for (runs = 1; runs < NUM_OF_RUNS ; runs++) { + sum_bw_imc += bw_imc[runs]; + sum_bw_resc += bw_resc[runs]; + } + + avg_bw_imc = sum_bw_imc / 4; + avg_bw_resc = sum_bw_resc / 4; + avg_diff = avg_bw_resc - avg_bw_imc; + + printf("%sok MBM: diff within %d%%\n", + labs(avg_diff) > MAX_DIFF ? "not " : "", MAX_DIFF); + tests_run++; + printf("# avg_diff: %lu\n", labs(avg_diff)); + printf("# Span (MB): %d\n", span); + printf("# avg_bw_imc: %lu\n", avg_bw_imc); + printf("# avg_bw_resc: %lu\n", avg_bw_resc); +} + +static int check_results(int span) +{ + unsigned long bw_imc[NUM_OF_RUNS], bw_resc[NUM_OF_RUNS]; + char temp[1024], *token_array[8]; + char output[] = RESULT_FILE_NAME; + int runs; + FILE *fp; + + printf("# Checking for pass/fail\n"); + + fp = fopen(output, "r"); + if (!fp) { + perror(output); + + return errno; + } + + runs = 0; + while (fgets(temp, sizeof(temp), fp)) { + char *token = strtok(temp, ":\t"); + int i = 0; + + while (token) { + token_array[i++] = token; + token = strtok(NULL, ":\t"); + } + + bw_resc[runs] = strtoul(token_array[5], NULL, 0); + bw_imc[runs] = strtoul(token_array[3], NULL, 0); + runs++; + } + + show_bw_info(bw_imc, bw_resc, span); + + fclose(fp); + + return 0; +} + +static int mbm_setup(int num, ...) 
+{ + struct resctrl_val_param *p; + static int num_of_runs; + va_list param; + int ret = 0; + + /* Run NUM_OF_RUNS times */ + if (num_of_runs++ >= NUM_OF_RUNS) + return -1; + + va_start(param, num); + p = va_arg(param, struct resctrl_val_param *); + va_end(param); + + /* Set up shemata with 100% allocation on the first run. */ + if (num_of_runs == 0) + ret = write_schemata(p->ctrlgrp, "100", p->cpu_no, + p->resctrl_val); + + return ret; +} + +void mbm_test_cleanup(void) +{ + remove(RESULT_FILE_NAME); +} + +int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd) +{ + struct resctrl_val_param param = { + .resctrl_val = "mbm", + .ctrlgrp = "c1", + .mongrp = "m1", + .span = span, + .cpu_no = cpu_no, + .mum_resctrlfs = 1, + .filename = RESULT_FILE_NAME, + .bw_report = bw_report, + .setup = mbm_setup + }; + int ret; + + remove(RESULT_FILE_NAME); + + if (!validate_resctrl_feature_request("mbm")) + return -1; + + ret = resctrl_val(benchmark_cmd, ¶m); + if (ret) + return ret; + + ret = check_results(span); + if (ret) + return ret; + + mbm_test_cleanup(); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index fb42087c904d..c92bc50c6751 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -57,7 +57,10 @@ struct resctrl_val_param { }; pid_t bm_pid, ppid; +int tests_run; +bool check_resctrlfs_support(void); +int filter_dmesg(void); int remount_resctrlfs(bool mum_resctrlfs); int get_resource_id(int cpu_no, int *resource_id); int umount_resctrlfs(void); @@ -75,5 +78,8 @@ int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int run_fill_buf(unsigned long span, int malloc_and_init_memory, int memflush, int op, char *resctrl_va); int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param); +int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd); +void tests_cleanup(void); +void mbm_test_cleanup(void); #endif /* RESCTRL_H */ diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c new file mode 100644 index 000000000000..496c8030fe43 --- /dev/null +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Resctrl tests + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya , + * Fenghua Yu + */ +#include "resctrl.h" + +#define BENCHMARK_ARGS 64 +#define BENCHMARK_ARG_SIZE 64 + +static void cmd_help(void) +{ + printf("usage: resctrl_tests [-h] [-b \"benchmark_cmd [options]\"] [-t test list]\n"); + printf("\t-b benchmark_cmd [options]: run specified benchmark\n"); + printf("\t default benchmark is builtin fill_buf\n"); + printf("\t-t test list: run tests specified in the test list, "); + printf("e.g. 
-t mbm,mba\n"); + printf("\t-h: help\n"); +} + +void tests_cleanup(void) +{ + mbm_test_cleanup(); +} + +int main(int argc, char **argv) +{ + int res, c, cpu_no = 1, span = 250, argc_new = argc, i, ben_ind; + char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64]; + char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE]; + bool has_ben = false, mbm_test = true; + int ben_count; + + for (i = 0; i < argc; i++) { + if (strcmp(argv[i], "-b") == 0) { + ben_ind = i + 1; + ben_count = argc - ben_ind; + argc_new = ben_ind - 1; + has_ben = true; + break; + } + } + + while ((c = getopt(argc_new, argv, "ht:b:")) != -1) { + char *token; + + switch (c) { + case 't': + token = strtok(optarg, ","); + + mbm_test = false; + while (token) { + if (!strcmp(token, "mbm")) { + mbm_test = true; + } else { + printf("invalid argument\n"); + + return -1; + } + token = strtok(NULL, ":\t"); + } + break; + case 'p': + cpu_no = atoi(optarg); + break; + case 'h': + cmd_help(); + + return 0; + default: + printf("invalid argument\n"); + + return -1; + } + } + + printf("TAP version 13\n"); + + /* + * Typically we need root privileges, because: + * 1. We write to resctrl FS + * 2. We execute perf commands + */ + if (geteuid() != 0) + printf("# WARNING: not running as root, tests may fail.\n"); + + if (has_ben) { + /* Extract benchmark command from command line. */ + for (i = ben_ind; i < argc; i++) { + benchmark_cmd[i - ben_ind] = benchmark_cmd_area[i]; + sprintf(benchmark_cmd[i - ben_ind], "%s", argv[i]); + } + benchmark_cmd[ben_count] = NULL; + } else { + /* If no benchmark is given by "-b" argument, use fill_buf. */ + for (i = 0; i < 6; i++) + benchmark_cmd[i] = benchmark_cmd_area[i]; + + strcpy(benchmark_cmd[0], "fill_buf"); + sprintf(benchmark_cmd[1], "%d", span); + strcpy(benchmark_cmd[2], "1"); + strcpy(benchmark_cmd[3], "1"); + strcpy(benchmark_cmd[4], "0"); + strcpy(benchmark_cmd[5], ""); + benchmark_cmd[6] = NULL; + } + + sprintf(bw_report, "reads"); + sprintf(bm_type, "fill_buf"); + + check_resctrlfs_support(); + filter_dmesg(); + + if (mbm_test) { + printf("# Starting MBM BW change ...\n"); + if (!has_ben) + sprintf(benchmark_cmd[5], "%s", "mba"); + res = mbm_bw_change(span, cpu_no, bw_report, benchmark_cmd); + printf("%sok MBM: bw change\n", res ? "not " : ""); + mbm_test_cleanup(); + tests_run++; + } + + printf("1..%d\n", tests_run); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index 0d1bd03ec4be..4f7bd5a4d86e 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -435,6 +435,8 @@ pid_t bm_pid, ppid; static void ctrlc_handler(int signum, siginfo_t *info, void *ptr) { kill(bm_pid, SIGKILL); + umount_resctrlfs(); + tests_cleanup(); printf("Ending\n\n"); exit(EXIT_SUCCESS); diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 1b125f9d8e5d..07c8394b427f 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -388,6 +388,41 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) return ret; } +bool check_resctrlfs_support(void) +{ + FILE *inf = fopen("/proc/filesystems", "r"); + DIR *dp; + char *res; + bool ret = false; + + if (!inf) + return false; + + res = fgrep(inf, "nodev\tresctrl\n"); + + if (res) { + ret = true; + free(res); + } + + fclose(inf); + + printf("%sok kernel supports resctrl filesystem\n", ret ? 
"" : "not "); + tests_run++; + + dp = opendir(RESCTRL_PATH); + printf("%sok resctrl mountpoint \"%s\" exists\n", + dp ? "" : "not ", RESCTRL_PATH); + if (dp) + closedir(dp); + tests_run++; + + printf("# resctrl filesystem %s mounted\n", + find_resctrl_mount(NULL) ? "not" : "is"); + + return ret; +} + char *fgrep(FILE *inf, const char *str) { char line[256]; @@ -433,6 +468,48 @@ bool validate_resctrl_feature_request(char *resctrl_val) return found; } +int filter_dmesg(void) +{ + char line[1024]; + FILE *fp; + int pipefds[2]; + pid_t pid; + int ret; + + ret = pipe(pipefds); + if (ret) { + perror("pipe"); + return ret; + } + pid = fork(); + if (pid == 0) { + close(pipefds[0]); + dup2(pipefds[1], STDOUT_FILENO); + execlp("dmesg", "dmesg", NULL); + perror("executing dmesg"); + exit(1); + } + close(pipefds[1]); + fp = fdopen(pipefds[0], "r"); + if (!fp) { + perror("fdopen(pipe)"); + kill(pid, SIGTERM); + + return -1; + } + + while (fgets(line, 1024, fp)) { + if (strstr(line, "intel_rdt:")) + printf("# dmesg: %s", line); + if (strstr(line, "resctrl:")) + printf("# dmesg: %s", line); + } + fclose(fp); + waitpid(pid, NULL, 0); + + return 0; +} + int validate_bw_report_request(char *bw_report) { if (strcmp(bw_report, "reads") == 0) -- Gitee From e1c7e80293e80c5dda2c1ca7af1f85487eb6f3ab Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 16 Jan 2020 13:32:40 -0800 Subject: [PATCH 1230/2161] selftests/resctrl: Add MBA test commit 01fee6b4d1f93247e806dddb0065b88317949085 upstream. MBA (Memory Bandwidth Allocation) test starts a stressful memory bandwidth benchmark and allocates memory bandwidth from 100% down to 10% for the benchmark process. For each allocation, compare perf IMC counter and mbm total bytes from resctrl. The difference between the two values should be within a threshold to pass the test. Default benchmark is built-in fill_buf. But users can specify their own benchmark by option "-b". We can add memory bandwidth allocation for multiple processes in the future. Co-developed-by: Sai Praneeth Prakhya Signed-off-by: Sai Praneeth Prakhya Co-developed-by: Babu Moger Signed-off-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/mba_test.c | 171 ++++++++++++++++++ tools/testing/selftests/resctrl/resctrl.h | 2 + .../testing/selftests/resctrl/resctrl_tests.c | 16 +- 3 files changed, 188 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/resctrl/mba_test.c diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c new file mode 100644 index 000000000000..7bf8eaa6204b --- /dev/null +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory Bandwidth Allocation (MBA) test + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya , + * Fenghua Yu + */ +#include "resctrl.h" + +#define RESULT_FILE_NAME "result_mba" +#define NUM_OF_RUNS 5 +#define MAX_DIFF 300 +#define ALLOCATION_MAX 100 +#define ALLOCATION_MIN 10 +#define ALLOCATION_STEP 10 + +/* + * Change schemata percentage from 100 to 10%. Write schemata to specified + * con_mon grp, mon_grp in resctrl FS. + * For each allocation, run 5 times in order to get average values. + */ +static int mba_setup(int num, ...) 
+{ + static int runs_per_allocation, allocation = 100; + struct resctrl_val_param *p; + char allocation_str[64]; + va_list param; + + va_start(param, num); + p = va_arg(param, struct resctrl_val_param *); + va_end(param); + + if (runs_per_allocation >= NUM_OF_RUNS) + runs_per_allocation = 0; + + /* Only set up schemata once every NUM_OF_RUNS of allocations */ + if (runs_per_allocation++ != 0) + return 0; + + if (allocation < ALLOCATION_MIN || allocation > ALLOCATION_MAX) + return -1; + + sprintf(allocation_str, "%d", allocation); + + write_schemata(p->ctrlgrp, allocation_str, p->cpu_no, p->resctrl_val); + allocation -= ALLOCATION_STEP; + + return 0; +} + +static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) +{ + int allocation, runs; + bool failed = false; + + printf("# Results are displayed in (MB)\n"); + /* Memory bandwidth from 100% down to 10% */ + for (allocation = 0; allocation < ALLOCATION_MAX / ALLOCATION_STEP; + allocation++) { + unsigned long avg_bw_imc, avg_bw_resc; + unsigned long sum_bw_imc = 0, sum_bw_resc = 0; + unsigned long avg_diff; + + /* + * The first run is discarded due to inaccurate value from + * phase transition. + */ + for (runs = NUM_OF_RUNS * allocation + 1; + runs < NUM_OF_RUNS * allocation + NUM_OF_RUNS ; runs++) { + sum_bw_imc += bw_imc[runs]; + sum_bw_resc += bw_resc[runs]; + } + + avg_bw_imc = sum_bw_imc / (NUM_OF_RUNS - 1); + avg_bw_resc = sum_bw_resc / (NUM_OF_RUNS - 1); + avg_diff = labs((long)(avg_bw_resc - avg_bw_imc)); + + printf("%sok MBA schemata percentage %u smaller than %d %%\n", + avg_diff > MAX_DIFF ? "not " : "", + ALLOCATION_MAX - ALLOCATION_STEP * allocation, + MAX_DIFF); + tests_run++; + printf("# avg_diff: %lu\n", avg_diff); + printf("# avg_bw_imc: %lu\n", avg_bw_imc); + printf("# avg_bw_resc: %lu\n", avg_bw_resc); + if (avg_diff > MAX_DIFF) + failed = true; + } + + printf("%sok schemata change using MBA%s\n", failed ? "not " : "", + failed ? 
" # at least one test failed" : ""); + tests_run++; +} + +static int check_results(void) +{ + char *token_array[8], output[] = RESULT_FILE_NAME, temp[512]; + unsigned long bw_imc[1024], bw_resc[1024]; + int runs; + FILE *fp; + + fp = fopen(output, "r"); + if (!fp) { + perror(output); + + return errno; + } + + runs = 0; + while (fgets(temp, sizeof(temp), fp)) { + char *token = strtok(temp, ":\t"); + int fields = 0; + + while (token) { + token_array[fields++] = token; + token = strtok(NULL, ":\t"); + } + + /* Field 3 is perf imc value */ + bw_imc[runs] = strtoul(token_array[3], NULL, 0); + /* Field 5 is resctrl value */ + bw_resc[runs] = strtoul(token_array[5], NULL, 0); + runs++; + } + + fclose(fp); + + show_mba_info(bw_imc, bw_resc); + + return 0; +} + +void mba_test_cleanup(void) +{ + remove(RESULT_FILE_NAME); +} + +int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd) +{ + struct resctrl_val_param param = { + .resctrl_val = "mba", + .ctrlgrp = "c1", + .mongrp = "m1", + .cpu_no = cpu_no, + .mum_resctrlfs = 1, + .filename = RESULT_FILE_NAME, + .bw_report = bw_report, + .setup = mba_setup + }; + int ret; + + remove(RESULT_FILE_NAME); + + if (!validate_resctrl_feature_request("mba")) + return -1; + + ret = resctrl_val(benchmark_cmd, ¶m); + if (ret) + return ret; + + ret = check_results(); + if (ret) + return ret; + + mba_test_cleanup(); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index c92bc50c6751..a6196ab6b59f 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -81,5 +81,7 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param); int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd); void tests_cleanup(void); void mbm_test_cleanup(void); +int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd); +void mba_test_cleanup(void); #endif /* RESCTRL_H */ diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 496c8030fe43..0ed953a97db2 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -26,6 +26,7 @@ static void cmd_help(void) void tests_cleanup(void) { mbm_test_cleanup(); + mba_test_cleanup(); } int main(int argc, char **argv) @@ -33,7 +34,7 @@ int main(int argc, char **argv) int res, c, cpu_no = 1, span = 250, argc_new = argc, i, ben_ind; char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64]; char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE]; - bool has_ben = false, mbm_test = true; + bool has_ben = false, mbm_test = true, mba_test = true; int ben_count; for (i = 0; i < argc; i++) { @@ -54,9 +55,12 @@ int main(int argc, char **argv) token = strtok(optarg, ","); mbm_test = false; + mba_test = false; while (token) { if (!strcmp(token, "mbm")) { mbm_test = true; + } else if (!strcmp(token, "mba")) { + mba_test = true; } else { printf("invalid argument\n"); @@ -126,6 +130,16 @@ int main(int argc, char **argv) tests_run++; } + if (mba_test) { + printf("# Starting MBA Schemata change ...\n"); + if (!has_ben) + sprintf(benchmark_cmd[1], "%d", span); + res = mba_schemata_change(cpu_no, bw_report, benchmark_cmd); + printf("%sok MBA: schemata change\n", res ? 
"not " : ""); + mba_test_cleanup(); + tests_run++; + } + printf("1..%d\n", tests_run); return 0; -- Gitee From 1000e110c693ab69616b1d33045fd5c8045feae0 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 16 Jan 2020 13:32:41 -0800 Subject: [PATCH 1231/2161] selftests/resctrl: Add Cache QoS Monitoring (CQM) selftest commit 78941183d1b151317beb37b25690b7d87fe2596d upstream. Cache QoS Monitoring (CQM) selftest starts stressful cache benchmark with specified size of memory to access the cache. Last Level cache occupancy reported by CQM should be close to the size of the memory. Co-developed-by: Sai Praneeth Prakhya Signed-off-by: Sai Praneeth Prakhya Co-developed-by: Babu Moger Signed-off-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/cache.c | 99 +++++++++ tools/testing/selftests/resctrl/cqm_test.c | 176 ++++++++++++++++ tools/testing/selftests/resctrl/resctrl.h | 16 ++ .../testing/selftests/resctrl/resctrl_tests.c | 31 ++- tools/testing/selftests/resctrl/resctrl_val.c | 63 +++++- tools/testing/selftests/resctrl/resctrlfs.c | 193 +++++++++++++++++- 6 files changed, 558 insertions(+), 20 deletions(-) create mode 100644 tools/testing/selftests/resctrl/cache.c create mode 100644 tools/testing/selftests/resctrl/cqm_test.c diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c new file mode 100644 index 000000000000..d7719219ab51 --- /dev/null +++ b/tools/testing/selftests/resctrl/cache.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "resctrl.h" + +struct read_format { + __u64 nr; /* The number of events */ + struct { + __u64 value; /* The value of the event */ + } values[2]; +}; + +char llc_occup_path[1024]; + +/* + * Get LLC Occupancy as reported by RESCTRL FS + * For CQM, + * 1. If con_mon grp and mon grp given, then read from mon grp in + * con_mon grp + * 2. If only con_mon grp given, then read from con_mon grp + * 3. If both not given, then read from root con_mon grp + * For CAT, + * 1. If con_mon grp given, then read from it + * 2. If con_mon grp not given, then read from root con_mon grp + * + * Return: =0 on success. <0 on failure. + */ +static int get_llc_occu_resctrl(unsigned long *llc_occupancy) +{ + FILE *fp; + + fp = fopen(llc_occup_path, "r"); + if (!fp) { + perror("Failed to open results file"); + + return errno; + } + if (fscanf(fp, "%lu", llc_occupancy) <= 0) { + perror("Could not get llc occupancy"); + fclose(fp); + + return -1; + } + fclose(fp); + + return 0; +} + +/* + * print_results_cache: the cache results are stored in a file + * @filename: file that stores the results + * @bm_pid: child pid that runs benchmark + * @llc_value: perf miss value / + * llc occupancy value reported by resctrl FS + * + * Return: 0 on success. non-zero on failure. + */ +static int print_results_cache(char *filename, int bm_pid, + unsigned long llc_value) +{ + FILE *fp; + + if (strcmp(filename, "stdio") == 0 || strcmp(filename, "stderr") == 0) { + printf("Pid: %d \t LLC_value: %lu\n", bm_pid, + llc_value); + } else { + fp = fopen(filename, "a"); + if (!fp) { + perror("Cannot open results file"); + + return errno; + } + fprintf(fp, "Pid: %d \t llc_value: %lu\n", bm_pid, llc_value); + fclose(fp); + } + + return 0; +} + +int measure_cache_vals(struct resctrl_val_param *param, int bm_pid) +{ + unsigned long llc_occu_resc = 0, llc_value = 0; + int ret; + + /* + * Measure llc occupancy from resctrl. 
+ */ + if (!strcmp(param->resctrl_val, "cqm")) { + ret = get_llc_occu_resctrl(&llc_occu_resc); + if (ret < 0) + return ret; + llc_value = llc_occu_resc; + } + ret = print_results_cache(param->filename, bm_pid, llc_value); + if (ret) + return ret; + + return 0; +} diff --git a/tools/testing/selftests/resctrl/cqm_test.c b/tools/testing/selftests/resctrl/cqm_test.c new file mode 100644 index 000000000000..c8756152bd61 --- /dev/null +++ b/tools/testing/selftests/resctrl/cqm_test.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cache Monitoring Technology (CQM) test + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya , + * Fenghua Yu + */ +#include "resctrl.h" +#include + +#define RESULT_FILE_NAME "result_cqm" +#define NUM_OF_RUNS 5 +#define MAX_DIFF 2000000 +#define MAX_DIFF_PERCENT 15 + +int count_of_bits; +char cbm_mask[256]; +unsigned long long_mask; +unsigned long cache_size; + +static int cqm_setup(int num, ...) +{ + struct resctrl_val_param *p; + va_list param; + + va_start(param, num); + p = va_arg(param, struct resctrl_val_param *); + va_end(param); + + /* Run NUM_OF_RUNS times */ + if (p->num_of_runs >= NUM_OF_RUNS) + return -1; + + p->num_of_runs++; + + return 0; +} + +static void show_cache_info(unsigned long sum_llc_occu_resc, int no_of_bits, + unsigned long span) +{ + unsigned long avg_llc_occu_resc = 0; + float diff_percent; + long avg_diff = 0; + bool res; + + avg_llc_occu_resc = sum_llc_occu_resc / (NUM_OF_RUNS - 1); + avg_diff = (long)abs(span - avg_llc_occu_resc); + + diff_percent = (((float)span - avg_llc_occu_resc) / span) * 100; + + if ((abs((int)diff_percent) <= MAX_DIFF_PERCENT) || + (abs(avg_diff) <= MAX_DIFF)) + res = true; + else + res = false; + + printf("%sok CQM: diff within %d, %d\%%\n", res ? 
"" : "not", + MAX_DIFF, (int)MAX_DIFF_PERCENT); + + printf("# diff: %ld\n", avg_diff); + printf("# percent diff=%d\n", abs((int)diff_percent)); + printf("# Results are displayed in (Bytes)\n"); + printf("# Number of bits: %d\n", no_of_bits); + printf("# Avg_llc_occu_resc: %lu\n", avg_llc_occu_resc); + printf("# llc_occu_exp (span): %lu\n", span); + + tests_run++; +} + +static int check_results(struct resctrl_val_param *param, int no_of_bits) +{ + char *token_array[8], temp[512]; + unsigned long sum_llc_occu_resc = 0; + int runs = 0; + FILE *fp; + + printf("# checking for pass/fail\n"); + fp = fopen(param->filename, "r"); + if (!fp) { + perror("# Error in opening file\n"); + + return errno; + } + + while (fgets(temp, 1024, fp)) { + char *token = strtok(temp, ":\t"); + int fields = 0; + + while (token) { + token_array[fields++] = token; + token = strtok(NULL, ":\t"); + } + + /* Field 3 is llc occ resc value */ + if (runs > 0) + sum_llc_occu_resc += strtoul(token_array[3], NULL, 0); + runs++; + } + fclose(fp); + show_cache_info(sum_llc_occu_resc, no_of_bits, param->span); + + return 0; +} + +void cqm_test_cleanup(void) +{ + remove(RESULT_FILE_NAME); +} + +int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd) +{ + int ret, mum_resctrlfs; + + cache_size = 0; + mum_resctrlfs = 1; + + ret = remount_resctrlfs(mum_resctrlfs); + if (ret) + return ret; + + if (!validate_resctrl_feature_request("cqm")) + return -1; + + ret = get_cbm_mask("L3"); + if (ret) + return ret; + + long_mask = strtoul(cbm_mask, NULL, 16); + + ret = get_cache_size(cpu_no, "L3", &cache_size); + if (ret) + return ret; + printf("cache size :%lu\n", cache_size); + + count_of_bits = count_bits(long_mask); + + if (n < 1 || n > count_of_bits) { + printf("Invalid input value for numbr_of_bits n!\n"); + printf("Please Enter value in range 1 to %d\n", count_of_bits); + return -1; + } + + struct resctrl_val_param param = { + .resctrl_val = "cqm", + .ctrlgrp = "c1", + .mongrp = "m1", + .cpu_no = cpu_no, + .mum_resctrlfs = 0, + .filename = RESULT_FILE_NAME, + .mask = ~(long_mask << n) & long_mask, + .span = cache_size * n / count_of_bits, + .num_of_runs = 0, + .setup = cqm_setup, + }; + + if (strcmp(benchmark_cmd[0], "fill_buf") == 0) + sprintf(benchmark_cmd[1], "%lu", param.span); + + remove(RESULT_FILE_NAME); + + ret = resctrl_val(benchmark_cmd, ¶m); + if (ret) + return ret; + + ret = check_results(¶m, n); + if (ret) + return ret; + + cqm_test_cleanup(); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index a6196ab6b59f..d326c7e95862 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -18,12 +18,16 @@ #include #include #include +#include +#include +#include #include #include #define MB (1024 * 1024) #define RESCTRL_PATH "/sys/fs/resctrl" #define PHYS_ID_PATH "/sys/devices/system/cpu/cpu" +#define CBM_MASK_PATH "/sys/fs/resctrl/info" #define PARENT_EXIT(err_msg) \ do { \ @@ -53,12 +57,16 @@ struct resctrl_val_param { int mum_resctrlfs; char filename[64]; char *bw_report; + unsigned long mask; + int num_of_runs; int (*setup)(int num, ...); }; pid_t bm_pid, ppid; int tests_run; +char llc_occup_path[1024]; + bool check_resctrlfs_support(void); int filter_dmesg(void); int remount_resctrlfs(bool mum_resctrlfs); @@ -83,5 +91,13 @@ void tests_cleanup(void); void mbm_test_cleanup(void); int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd); void mba_test_cleanup(void); +int get_cbm_mask(char 
*cache_type); +int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size); +void ctrlc_handler(int signum, siginfo_t *info, void *ptr); +int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd); +unsigned int count_bits(unsigned long n); +void cqm_test_cleanup(void); +int get_core_sibling(int cpu_no); +int measure_cache_vals(struct resctrl_val_param *param, int bm_pid); #endif /* RESCTRL_H */ diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 0ed953a97db2..70179a61df3b 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -15,11 +15,13 @@ static void cmd_help(void) { - printf("usage: resctrl_tests [-h] [-b \"benchmark_cmd [options]\"] [-t test list]\n"); - printf("\t-b benchmark_cmd [options]: run specified benchmark\n"); + printf("usage: resctrl_tests [-h] [-b \"benchmark_cmd [options]\"] [-t test list] [-n no_of_bits]\n"); + printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CQM"); printf("\t default benchmark is builtin fill_buf\n"); printf("\t-t test list: run tests specified in the test list, "); - printf("e.g. -t mbm,mba\n"); + printf("e.g. -t mbm, mba, cqm\n"); + printf("\t-n no_of_bits: run cache tests using specified no of bits in cache bit mask\n"); + printf("\t-p cpu_no: specify CPU number to run the test. 1 is default\n"); printf("\t-h: help\n"); } @@ -27,15 +29,16 @@ void tests_cleanup(void) { mbm_test_cleanup(); mba_test_cleanup(); + cqm_test_cleanup(); } int main(int argc, char **argv) { - int res, c, cpu_no = 1, span = 250, argc_new = argc, i, ben_ind; + bool has_ben = false, mbm_test = true, mba_test = true, cqm_test = true; + int res, c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 5; char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64]; char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE]; - bool has_ben = false, mbm_test = true, mba_test = true; - int ben_count; + int ben_ind, ben_count; for (i = 0; i < argc; i++) { if (strcmp(argv[i], "-b") == 0) { @@ -56,11 +59,14 @@ int main(int argc, char **argv) mbm_test = false; mba_test = false; + cqm_test = false; while (token) { if (!strcmp(token, "mbm")) { mbm_test = true; } else if (!strcmp(token, "mba")) { mba_test = true; + } else if (!strcmp(token, "cqm")) { + cqm_test = true; } else { printf("invalid argument\n"); @@ -72,6 +78,9 @@ int main(int argc, char **argv) case 'p': cpu_no = atoi(optarg); break; + case 'n': + no_of_bits = atoi(optarg); + break; case 'h': cmd_help(); @@ -140,6 +149,16 @@ int main(int argc, char **argv) tests_run++; } + if (cqm_test) { + printf("# Starting CQM test ...\n"); + if (!has_ben) + sprintf(benchmark_cmd[5], "%s", "cqm"); + res = cqm_resctrl_val(cpu_no, no_of_bits, benchmark_cmd); + printf("%sok CQM: test\n", res ? 
"not " : ""); + cqm_test_cleanup(); + tests_run++; + } + printf("1..%d\n", tests_run); return 0; diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index 4f7bd5a4d86e..520fea3606d1 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -31,6 +31,18 @@ #define MBM_LOCAL_BYTES_PATH \ "%s/mon_data/mon_L3_%02d/mbm_local_bytes" +#define CON_MON_LCC_OCCUP_PATH \ + "%s/%s/mon_groups/%s/mon_data/mon_L3_%02d/llc_occupancy" + +#define CON_LCC_OCCUP_PATH \ + "%s/%s/mon_data/mon_L3_%02d/llc_occupancy" + +#define MON_LCC_OCCUP_PATH \ + "%s/mon_groups/%s/mon_data/mon_L3_%02d/llc_occupancy" + +#define LCC_OCCUP_PATH \ + "%s/mon_data/mon_L3_%02d/llc_occupancy" + struct membw_read_format { __u64 value; /* The value of the event */ __u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */ @@ -432,7 +444,7 @@ static unsigned long get_mem_bw_resctrl(void) pid_t bm_pid, ppid; -static void ctrlc_handler(int signum, siginfo_t *info, void *ptr) +void ctrlc_handler(int signum, siginfo_t *info, void *ptr) { kill(bm_pid, SIGKILL); umount_resctrlfs(); @@ -480,6 +492,42 @@ static int print_results_bw(char *filename, int bm_pid, float bw_imc, return 0; } +static void set_cqm_path(const char *ctrlgrp, const char *mongrp, char sock_num) +{ + if (strlen(ctrlgrp) && strlen(mongrp)) + sprintf(llc_occup_path, CON_MON_LCC_OCCUP_PATH, RESCTRL_PATH, + ctrlgrp, mongrp, sock_num); + else if (!strlen(ctrlgrp) && strlen(mongrp)) + sprintf(llc_occup_path, MON_LCC_OCCUP_PATH, RESCTRL_PATH, + mongrp, sock_num); + else if (strlen(ctrlgrp) && !strlen(mongrp)) + sprintf(llc_occup_path, CON_LCC_OCCUP_PATH, RESCTRL_PATH, + ctrlgrp, sock_num); + else if (!strlen(ctrlgrp) && !strlen(mongrp)) + sprintf(llc_occup_path, LCC_OCCUP_PATH, RESCTRL_PATH, sock_num); +} + +/* + * initialize_llc_occu_resctrl: Appropriately populate "llc_occup_path" + * @ctrlgrp: Name of the control monitor group (con_mon grp) + * @mongrp: Name of the monitor group (mon grp) + * @cpu_no: CPU number that the benchmark PID is binded to + * @resctrl_val: Resctrl feature (Eg: cat, cqm.. etc) + */ +static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp, + int cpu_no, char *resctrl_val) +{ + int resource_id; + + if (get_resource_id(cpu_no, &resource_id) < 0) { + perror("# Unable to resource_id"); + return; + } + + if (strcmp(resctrl_val, "cqm") == 0) + set_cqm_path(ctrlgrp, mongrp, resource_id); +} + static int measure_vals(struct resctrl_val_param *param, unsigned long *bw_resc_start) { @@ -634,7 +682,9 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) initialize_mem_bw_resctrl(param->ctrlgrp, param->mongrp, param->cpu_no, resctrl_val); - } + } else if (strcmp(resctrl_val, "cqm") == 0) + initialize_llc_occu_resctrl(param->ctrlgrp, param->mongrp, + param->cpu_no, resctrl_val); /* Parent waits for child to be ready. */ close(pipefd[1]); @@ -660,7 +710,8 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) /* Test runs until the callback setup() tells the test to stop. 
*/ while (1) { - if (strcmp(resctrl_val, "mbm") == 0) { + if ((strcmp(resctrl_val, "mbm") == 0) || + (strcmp(resctrl_val, "mba") == 0)) { ret = param->setup(1, param); if (ret) { ret = 0; @@ -670,14 +721,14 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) ret = measure_vals(param, &bw_resc_start); if (ret) break; - } else if ((strcmp(resctrl_val, "mba") == 0)) { + } else if (strcmp(resctrl_val, "cqm") == 0) { ret = param->setup(1, param); if (ret) { ret = 0; break; } - - ret = measure_vals(param, &bw_resc_start); + sleep(1); + ret = measure_cache_vals(param, bm_pid); if (ret) break; } else { diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 07c8394b427f..9c050a6c1723 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -49,6 +49,8 @@ static int find_resctrl_mount(char *buffer) return -ENOENT; } +char cbm_mask[256]; + /* * remount_resctrlfs - Remount resctrl FS at /sys/fs/resctrl * @mum_resctrlfs: Should the resctrl FS be remounted? @@ -130,6 +132,145 @@ int get_resource_id(int cpu_no, int *resource_id) return 0; } +/* + * get_cache_size - Get cache size for a specified CPU + * @cpu_no: CPU number + * @cache_type: Cache level L2/L3 + * @cache_size: pointer to cache_size + * + * Return: = 0 on success, < 0 on failure. + */ +int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size) +{ + char cache_path[1024], cache_str[64]; + int length, i, cache_num; + FILE *fp; + + if (!strcmp(cache_type, "L3")) { + cache_num = 3; + } else if (!strcmp(cache_type, "L2")) { + cache_num = 2; + } else { + perror("Invalid cache level"); + return -1; + } + + sprintf(cache_path, "/sys/bus/cpu/devices/cpu%d/cache/index%d/size", + cpu_no, cache_num); + fp = fopen(cache_path, "r"); + if (!fp) { + perror("Failed to open cache size"); + + return -1; + } + if (fscanf(fp, "%s", cache_str) <= 0) { + perror("Could not get cache_size"); + fclose(fp); + + return -1; + } + fclose(fp); + + length = (int)strlen(cache_str); + + *cache_size = 0; + + for (i = 0; i < length; i++) { + if ((cache_str[i] >= '0') && (cache_str[i] <= '9')) + + *cache_size = *cache_size * 10 + (cache_str[i] - '0'); + + else if (cache_str[i] == 'K') + + *cache_size = *cache_size * 1024; + + else if (cache_str[i] == 'M') + + *cache_size = *cache_size * 1024 * 1024; + + else + break; + } + + return 0; +} + +#define CORE_SIBLINGS_PATH "/sys/bus/cpu/devices/cpu" + +/* + * get_cbm_mask - Get cbm mask for given cache + * @cache_type: Cache level L2/L3 + * + * Mask is stored in cbm_mask which is global variable. + * + * Return: = 0 on success, < 0 on failure. + */ +int get_cbm_mask(char *cache_type) +{ + char cbm_mask_path[1024]; + FILE *fp; + + sprintf(cbm_mask_path, "%s/%s/cbm_mask", CBM_MASK_PATH, cache_type); + + fp = fopen(cbm_mask_path, "r"); + if (!fp) { + perror("Failed to open cache level"); + + return -1; + } + if (fscanf(fp, "%s", cbm_mask) <= 0) { + perror("Could not get max cbm_mask"); + fclose(fp); + + return -1; + } + fclose(fp); + + return 0; +} + +/* + * get_core_sibling - Get sibling core id from the same socket for given CPU + * @cpu_no: CPU number + * + * Return: > 0 on success, < 0 on failure. 
+ */ +int get_core_sibling(int cpu_no) +{ + char core_siblings_path[1024], cpu_list_str[64]; + int sibling_cpu_no = -1; + FILE *fp; + + sprintf(core_siblings_path, "%s%d/topology/core_siblings_list", + CORE_SIBLINGS_PATH, cpu_no); + + fp = fopen(core_siblings_path, "r"); + if (!fp) { + perror("Failed to open core siblings path"); + + return -1; + } + if (fscanf(fp, "%s", cpu_list_str) <= 0) { + perror("Could not get core_siblings list"); + fclose(fp); + + return -1; + } + fclose(fp); + + char *token = strtok(cpu_list_str, "-,"); + + while (token) { + sibling_cpu_no = atoi(token); + /* Skipping core 0 as we don't want to run test on core 0 */ + if (sibling_cpu_no != 0) + break; + token = strtok(NULL, "-,"); + } + + return sibling_cpu_no; +} + /* * taskset_benchmark - Taskset PID (i.e. benchmark) to a specified cpu * @bm_pid: PID that should be binded @@ -164,9 +305,10 @@ int taskset_benchmark(pid_t bm_pid, int cpu_no) */ void run_benchmark(int signum, siginfo_t *info, void *ucontext) { - unsigned long span; - int operation, ret; + int operation, ret, malloc_and_init_memory, memflush; + unsigned long span, buffer_span; char **benchmark_cmd; + char resctrl_val[64]; FILE *fp; benchmark_cmd = info->si_ptr; @@ -182,8 +324,18 @@ void run_benchmark(int signum, siginfo_t *info, void *ucontext) if (strcmp(benchmark_cmd[0], "fill_buf") == 0) { /* Execute default fill_buf benchmark */ span = strtoul(benchmark_cmd[1], NULL, 10); + malloc_and_init_memory = atoi(benchmark_cmd[2]); + memflush = atoi(benchmark_cmd[3]); operation = atoi(benchmark_cmd[4]); - if (run_fill_buf(span, 1, 1, operation, NULL)) + sprintf(resctrl_val, "%s", benchmark_cmd[5]); + + if (strcmp(resctrl_val, "cqm") != 0) + buffer_span = span * MB; + else + buffer_span = span; + + if (run_fill_buf(buffer_span, malloc_and_init_memory, memflush, + operation, resctrl_val)) fprintf(stderr, "Error in running fill buffer\n"); } else { /* Execute specified benchmark */ @@ -210,6 +362,14 @@ static int create_grp(const char *grp_name, char *grp, const char *parent_grp) struct dirent *ep; DIR *dp; + /* + * At this point, we are guaranteed to have resctrl FS mounted and if + * length of grp_name == 0, it means, user wants to use root con_mon + * grp, so do nothing + */ + if (strlen(grp_name) == 0) + return 0; + /* Check if requested grp exists or not */ dp = opendir(parent_grp); if (dp) { @@ -293,9 +453,10 @@ int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp, if (ret) goto out; - /* Create mon grp and write pid into it for "mbm" test */ - if ((strcmp(resctrl_val, "mbm") == 0)) { - if (mongrp) { + /* Create mon grp and write pid into it for "mbm" and "cqm" test */ + if ((strcmp(resctrl_val, "cqm") == 0) || + (strcmp(resctrl_val, "mbm") == 0)) { + if (strlen(mongrp)) { sprintf(monitorgroup_p, "%s/mon_groups", controlgroup); sprintf(monitorgroup, "%s/%s", monitorgroup_p, mongrp); ret = create_grp(mongrp, monitorgroup, monitorgroup_p); @@ -339,7 +500,8 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) int resource_id, ret = 0; FILE *fp; - if (strcmp(resctrl_val, "mba") != 0) + if ((strcmp(resctrl_val, "mba") != 0) && + (strcmp(resctrl_val, "cqm") != 0)) return -ENOENT; if (!schemata) { @@ -360,7 +522,10 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) else sprintf(controlgroup, "%s/schemata", RESCTRL_PATH); - sprintf(schema, "%s%d%c%s", "MB:", resource_id, '=', schemata); + if (!strcmp(resctrl_val, "cqm")) + sprintf(schema, "%s%d%c%s", "L3:", resource_id, '=', 
schemata); + if (strcmp(resctrl_val, "mba") == 0) + sprintf(schema, "%s%d%c%s", "MB:", resource_id, '=', schemata); fp = fopen(controlgroup, "w"); if (!fp) { @@ -537,3 +702,15 @@ int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, group_fd, flags); return ret; } + +unsigned int count_bits(unsigned long n) +{ + unsigned int count = 0; + + while (n) { + count += n & 1; + n >>= 1; + } + + return count; +} -- Gitee From ef774a8a532a994a2d06fa5399093b02436c6275 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 16 Jan 2020 13:32:42 -0800 Subject: [PATCH 1232/2161] selftests/resctrl: Add Cache Allocation Technology (CAT) selftest commit 790bf585b0eeec9aa0e680ba090142b98da7f948 upstream. Cache Allocation Technology (CAT) selftest allocates a portion of last level cache and starts a benchmark to read each cache line in this portion of cache. Measure the cache misses in perf and the misses should be equal to the number of cache lines in this portion of cache. We don't use CQM to calculate cache usage because some CAT enabled platforms don't have CQM. Co-developed-by: Sai Praneeth Prakhya Signed-off-by: Sai Praneeth Prakhya Co-developed-by: Babu Moger Signed-off-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/cache.c | 175 +++++++++++- tools/testing/selftests/resctrl/cat_test.c | 250 ++++++++++++++++++ tools/testing/selftests/resctrl/fill_buf.c | 10 +- tools/testing/selftests/resctrl/resctrl.h | 3 + .../testing/selftests/resctrl/resctrl_tests.c | 15 +- tools/testing/selftests/resctrl/resctrlfs.c | 3 +- 6 files changed, 451 insertions(+), 5 deletions(-) create mode 100644 tools/testing/selftests/resctrl/cat_test.c diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c index d7719219ab51..38dbf4962e33 100644 --- a/tools/testing/selftests/resctrl/cache.c +++ b/tools/testing/selftests/resctrl/cache.c @@ -10,8 +10,105 @@ struct read_format { } values[2]; }; +static struct perf_event_attr pea_llc_miss; +static struct read_format rf_cqm; +static int fd_lm; char llc_occup_path[1024]; +static void initialize_perf_event_attr(void) +{ + pea_llc_miss.type = PERF_TYPE_HARDWARE; + pea_llc_miss.size = sizeof(struct perf_event_attr); + pea_llc_miss.read_format = PERF_FORMAT_GROUP; + pea_llc_miss.exclude_kernel = 1; + pea_llc_miss.exclude_hv = 1; + pea_llc_miss.exclude_idle = 1; + pea_llc_miss.exclude_callchain_kernel = 1; + pea_llc_miss.inherit = 1; + pea_llc_miss.exclude_guest = 1; + pea_llc_miss.disabled = 1; +} + +static void ioctl_perf_event_ioc_reset_enable(void) +{ + ioctl(fd_lm, PERF_EVENT_IOC_RESET, 0); + ioctl(fd_lm, PERF_EVENT_IOC_ENABLE, 0); +} + +static int perf_event_open_llc_miss(pid_t pid, int cpu_no) +{ + fd_lm = perf_event_open(&pea_llc_miss, pid, cpu_no, -1, + PERF_FLAG_FD_CLOEXEC); + if (fd_lm == -1) { + perror("Error opening leader"); + ctrlc_handler(0, NULL, NULL); + return -1; + } + + return 0; +} + +static int initialize_llc_perf(void) +{ + memset(&pea_llc_miss, 0, sizeof(struct perf_event_attr)); + memset(&rf_cqm, 0, sizeof(struct read_format)); + + /* Initialize perf_event_attr structures for HW_CACHE_MISSES */ + initialize_perf_event_attr(); + + pea_llc_miss.config = PERF_COUNT_HW_CACHE_MISSES; + + rf_cqm.nr = 1; + + return 0; +} + +static int reset_enable_llc_perf(pid_t pid, int cpu_no) +{ + int ret = 0; + + ret = perf_event_open_llc_miss(pid, cpu_no); + if (ret < 0) + return ret; + + /* Start counters to log values */ + 
ioctl_perf_event_ioc_reset_enable(); + + return 0; +} + +/* + * get_llc_perf: llc cache miss through perf events + * @cpu_no: CPU number that the benchmark PID is binded to + * + * Perf events like HW_CACHE_MISSES could be used to validate number of + * cache lines allocated. + * + * Return: =0 on success. <0 on failure. + */ +static int get_llc_perf(unsigned long *llc_perf_miss) +{ + __u64 total_misses; + + /* Stop counters after one span to get miss rate */ + + ioctl(fd_lm, PERF_EVENT_IOC_DISABLE, 0); + + if (read(fd_lm, &rf_cqm, sizeof(struct read_format)) == -1) { + perror("Could not get llc misses through perf"); + + return -1; + } + + total_misses = rf_cqm.values[0].value; + + close(fd_lm); + + *llc_perf_miss = total_misses; + + return 0; +} + /* * Get LLC Occupancy as reported by RESCTRL FS * For CQM, @@ -79,9 +176,19 @@ static int print_results_cache(char *filename, int bm_pid, int measure_cache_vals(struct resctrl_val_param *param, int bm_pid) { - unsigned long llc_occu_resc = 0, llc_value = 0; + unsigned long llc_perf_miss = 0, llc_occu_resc = 0, llc_value = 0; int ret; + /* + * Measure cache miss from perf. + */ + if (!strcmp(param->resctrl_val, "cat")) { + ret = get_llc_perf(&llc_perf_miss); + if (ret < 0) + return ret; + llc_value = llc_perf_miss; + } + /* * Measure llc occupancy from resctrl. */ @@ -97,3 +204,69 @@ int measure_cache_vals(struct resctrl_val_param *param, int bm_pid) return 0; } + +/* + * cache_val: execute benchmark and measure LLC occupancy resctrl + * and perf cache miss for the benchmark + * @param: parameters passed to cache_val() + * + * Return: 0 on success. non-zero on failure. + */ +int cat_val(struct resctrl_val_param *param) +{ + int malloc_and_init_memory = 1, memflush = 1, operation = 0, ret = 0; + char *resctrl_val = param->resctrl_val; + pid_t bm_pid; + + if (strcmp(param->filename, "") == 0) + sprintf(param->filename, "stdio"); + + bm_pid = getpid(); + + /* Taskset benchmark to specified cpu */ + ret = taskset_benchmark(bm_pid, param->cpu_no); + if (ret) + return ret; + + /* Write benchmark to specified con_mon grp, mon_grp in resctrl FS*/ + ret = write_bm_pid_to_resctrl(bm_pid, param->ctrlgrp, param->mongrp, + resctrl_val); + if (ret) + return ret; + + if ((strcmp(resctrl_val, "cat") == 0)) { + ret = initialize_llc_perf(); + if (ret) + return ret; + } + + /* Test runs until the callback setup() tells the test to stop. 
*/ + while (1) { + if (strcmp(resctrl_val, "cat") == 0) { + ret = param->setup(1, param); + if (ret) { + ret = 0; + break; + } + ret = reset_enable_llc_perf(bm_pid, param->cpu_no); + if (ret) + break; + + if (run_fill_buf(param->span, malloc_and_init_memory, + memflush, operation, resctrl_val)) { + fprintf(stderr, "Error-running fill buffer\n"); + ret = -1; + break; + } + + sleep(1); + ret = measure_cache_vals(param, bm_pid); + if (ret) + break; + } else { + break; + } + } + + return ret; +} diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c new file mode 100644 index 000000000000..c194fcc42407 --- /dev/null +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cache Allocation Technology (CAT) test + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya , + * Fenghua Yu + */ +#include "resctrl.h" +#include + +#define RESULT_FILE_NAME1 "result_cat1" +#define RESULT_FILE_NAME2 "result_cat2" +#define NUM_OF_RUNS 5 +#define MAX_DIFF_PERCENT 4 +#define MAX_DIFF 1000000 + +int count_of_bits; +char cbm_mask[256]; +unsigned long long_mask; +unsigned long cache_size; + +/* + * Change schemata. Write schemata to specified + * con_mon grp, mon_grp in resctrl FS. + * Run 5 times in order to get average values. + */ +static int cat_setup(int num, ...) +{ + struct resctrl_val_param *p; + char schemata[64]; + va_list param; + int ret = 0; + + va_start(param, num); + p = va_arg(param, struct resctrl_val_param *); + va_end(param); + + /* Run NUM_OF_RUNS times */ + if (p->num_of_runs >= NUM_OF_RUNS) + return -1; + + if (p->num_of_runs == 0) { + sprintf(schemata, "%lx", p->mask); + ret = write_schemata(p->ctrlgrp, schemata, p->cpu_no, + p->resctrl_val); + } + p->num_of_runs++; + + return ret; +} + +static void show_cache_info(unsigned long sum_llc_perf_miss, int no_of_bits, + unsigned long span) +{ + unsigned long allocated_cache_lines = span / 64; + unsigned long avg_llc_perf_miss = 0; + float diff_percent; + + avg_llc_perf_miss = sum_llc_perf_miss / (NUM_OF_RUNS - 1); + diff_percent = ((float)allocated_cache_lines - avg_llc_perf_miss) / + allocated_cache_lines * 100; + + printf("%sok CAT: cache miss rate within %d%%\n", + abs((int)diff_percent) > MAX_DIFF_PERCENT ? "not " : "", + MAX_DIFF_PERCENT); + tests_run++; + printf("# Percent diff=%d\n", abs((int)diff_percent)); + printf("# Number of bits: %d\n", no_of_bits); + printf("# Avg_llc_perf_miss: %lu\n", avg_llc_perf_miss); + printf("# Allocated cache lines: %lu\n", allocated_cache_lines); +} + +static int check_results(struct resctrl_val_param *param) +{ + char *token_array[8], temp[512]; + unsigned long sum_llc_perf_miss = 0; + int runs = 0, no_of_bits = 0; + FILE *fp; + + printf("# Checking for pass/fail\n"); + fp = fopen(param->filename, "r"); + if (!fp) { + perror("# Cannot open file"); + + return errno; + } + + while (fgets(temp, sizeof(temp), fp)) { + char *token = strtok(temp, ":\t"); + int fields = 0; + + while (token) { + token_array[fields++] = token; + token = strtok(NULL, ":\t"); + } + /* + * Discard the first value which is inaccurate due to monitoring + * setup transition phase. 
+ */ + if (runs > 0) + sum_llc_perf_miss += strtoul(token_array[3], NULL, 0); + runs++; + } + + fclose(fp); + no_of_bits = count_bits(param->mask); + + show_cache_info(sum_llc_perf_miss, no_of_bits, param->span); + + return 0; +} + +void cat_test_cleanup(void) +{ + remove(RESULT_FILE_NAME1); + remove(RESULT_FILE_NAME2); +} + +int cat_perf_miss_val(int cpu_no, int n, char *cache_type) +{ + unsigned long l_mask, l_mask_1; + int ret, pipefd[2], sibling_cpu_no; + char pipe_message; + pid_t bm_pid; + + cache_size = 0; + + ret = remount_resctrlfs(true); + if (ret) + return ret; + + if (!validate_resctrl_feature_request("cat")) + return -1; + + /* Get default cbm mask for L3/L2 cache */ + ret = get_cbm_mask(cache_type); + if (ret) + return ret; + + long_mask = strtoul(cbm_mask, NULL, 16); + + /* Get L3/L2 cache size */ + ret = get_cache_size(cpu_no, cache_type, &cache_size); + if (ret) + return ret; + printf("cache size :%lu\n", cache_size); + + /* Get max number of bits from default-cabm mask */ + count_of_bits = count_bits(long_mask); + + if (n < 1 || n > count_of_bits - 1) { + printf("Invalid input value for no_of_bits n!\n"); + printf("Please Enter value in range 1 to %d\n", + count_of_bits - 1); + return -1; + } + + /* Get core id from same socket for running another thread */ + sibling_cpu_no = get_core_sibling(cpu_no); + if (sibling_cpu_no < 0) + return -1; + + struct resctrl_val_param param = { + .resctrl_val = "cat", + .cpu_no = cpu_no, + .mum_resctrlfs = 0, + .setup = cat_setup, + }; + + l_mask = long_mask >> n; + l_mask_1 = ~l_mask & long_mask; + + /* Set param values for parent thread which will be allocated bitmask + * with (max_bits - n) bits + */ + param.span = cache_size * (count_of_bits - n) / count_of_bits; + strcpy(param.ctrlgrp, "c2"); + strcpy(param.mongrp, "m2"); + strcpy(param.filename, RESULT_FILE_NAME2); + param.mask = l_mask; + param.num_of_runs = 0; + + if (pipe(pipefd)) { + perror("# Unable to create pipe"); + return errno; + } + + bm_pid = fork(); + + /* Set param values for child thread which will be allocated bitmask + * with n bits + */ + if (bm_pid == 0) { + param.mask = l_mask_1; + strcpy(param.ctrlgrp, "c1"); + strcpy(param.mongrp, "m1"); + param.span = cache_size * n / count_of_bits; + strcpy(param.filename, RESULT_FILE_NAME1); + param.num_of_runs = 0; + param.cpu_no = sibling_cpu_no; + } + + remove(param.filename); + + ret = cat_val(¶m); + if (ret) + return ret; + + ret = check_results(¶m); + if (ret) + return ret; + + if (bm_pid == 0) { + /* Tell parent that child is ready */ + close(pipefd[0]); + pipe_message = 1; + if (write(pipefd[1], &pipe_message, sizeof(pipe_message)) < + sizeof(pipe_message)) { + close(pipefd[1]); + perror("# failed signaling parent process"); + return errno; + } + + close(pipefd[1]); + while (1) + ; + } else { + /* Parent waits for child to be ready. 
*/ + close(pipefd[1]); + pipe_message = 0; + while (pipe_message != 1) { + if (read(pipefd[0], &pipe_message, + sizeof(pipe_message)) < sizeof(pipe_message)) { + perror("# failed reading from child process"); + break; + } + } + close(pipefd[0]); + kill(bm_pid, SIGKILL); + } + + cat_test_cleanup(); + if (bm_pid) + umount_resctrlfs(); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c index b76bf6a4a0a5..84d2a8b9657a 100644 --- a/tools/testing/selftests/resctrl/fill_buf.c +++ b/tools/testing/selftests/resctrl/fill_buf.c @@ -113,8 +113,11 @@ static int fill_cache_read(unsigned char *start_ptr, unsigned char *end_ptr, int ret = 0; FILE *fp; - while (1) + while (1) { ret = fill_one_span_read(start_ptr, end_ptr); + if (!strcmp(resctrl_val, "cat")) + break; + } /* Consume read result so that reading memory is not optimized out. */ fp = fopen("/dev/null", "w"); @@ -129,8 +132,11 @@ static int fill_cache_read(unsigned char *start_ptr, unsigned char *end_ptr, static int fill_cache_write(unsigned char *start_ptr, unsigned char *end_ptr, char *resctrl_val) { - while (1) + while (1) { fill_one_span_write(start_ptr, end_ptr); + if (!strcmp(resctrl_val, "cat")) + break; + } return 0; } diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index d326c7e95862..e8399389f3ad 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -94,6 +94,9 @@ void mba_test_cleanup(void); int get_cbm_mask(char *cache_type); int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size); void ctrlc_handler(int signum, siginfo_t *info, void *ptr); +int cat_val(struct resctrl_val_param *param); +void cat_test_cleanup(void); +int cat_perf_miss_val(int cpu_no, int no_of_bits, char *cache_type); int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd); unsigned int count_bits(unsigned long n); void cqm_test_cleanup(void); diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 70179a61df3b..7f75b0b01bb5 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -19,7 +19,7 @@ static void cmd_help(void) printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CQM"); printf("\t default benchmark is builtin fill_buf\n"); printf("\t-t test list: run tests specified in the test list, "); - printf("e.g. -t mbm, mba, cqm\n"); + printf("e.g. -t mbm, mba, cqm, cat\n"); printf("\t-n no_of_bits: run cache tests using specified no of bits in cache bit mask\n"); printf("\t-p cpu_no: specify CPU number to run the test. 
1 is default\n"); printf("\t-h: help\n"); @@ -30,6 +30,7 @@ void tests_cleanup(void) mbm_test_cleanup(); mba_test_cleanup(); cqm_test_cleanup(); + cat_test_cleanup(); } int main(int argc, char **argv) @@ -39,6 +40,7 @@ int main(int argc, char **argv) char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64]; char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE]; int ben_ind, ben_count; + bool cat_test = true; for (i = 0; i < argc; i++) { if (strcmp(argv[i], "-b") == 0) { @@ -60,6 +62,7 @@ int main(int argc, char **argv) mbm_test = false; mba_test = false; cqm_test = false; + cat_test = false; while (token) { if (!strcmp(token, "mbm")) { mbm_test = true; @@ -67,6 +70,8 @@ int main(int argc, char **argv) mba_test = true; } else if (!strcmp(token, "cqm")) { cqm_test = true; + } else if (!strcmp(token, "cat")) { + cat_test = true; } else { printf("invalid argument\n"); @@ -159,6 +164,14 @@ int main(int argc, char **argv) tests_run++; } + if (cat_test) { + printf("# Starting CAT test ...\n"); + res = cat_perf_miss_val(cpu_no, no_of_bits, "L3"); + printf("%sok CAT: test\n", res ? "not " : ""); + tests_run++; + cat_test_cleanup(); + } + printf("1..%d\n", tests_run); return 0; diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 9c050a6c1723..8772c8c4d5d0 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -501,6 +501,7 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) FILE *fp; if ((strcmp(resctrl_val, "mba") != 0) && + (strcmp(resctrl_val, "cat") != 0) && (strcmp(resctrl_val, "cqm") != 0)) return -ENOENT; @@ -522,7 +523,7 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) else sprintf(controlgroup, "%s/schemata", RESCTRL_PATH); - if (!strcmp(resctrl_val, "cqm")) + if (!strcmp(resctrl_val, "cat") || !strcmp(resctrl_val, "cqm")) sprintf(schema, "%s%d%c%s", "L3:", resource_id, '=', schemata); if (strcmp(resctrl_val, "mba") == 0) sprintf(schema, "%s%d%c%s", "MB:", resource_id, '=', schemata); -- Gitee From 00323243d0f8b266332ee160df66f497fdef904f Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Thu, 16 Jan 2020 13:32:43 -0800 Subject: [PATCH 1233/2161] selftests/resctrl: Add vendor detection mechanism commit 53f74fbec9f069241aa6f4c4d908229c77ee83e2 upstream. RESCTRL feature is supported both on Intel and AMD now. Some features are implemented differently. Add vendor detection mechanism. Use the vendor check where there are differences. 
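For reference, the detection added below keys off the "vendor_id" line that /proc/cpuinfo prints for each CPU; on AMD parts that line reads (whitespace may vary):

    vendor_id	: AuthenticAMD

and "GenuineIntel" on Intel parts, which is why an fgrep for "vendor_id" followed by a string compare against ": AuthenticAMD\n" is sufficient here.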
Signed-off-by: Babu Moger Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/resctrl.h | 1 + .../testing/selftests/resctrl/resctrl_tests.c | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index e8399389f3ad..39bf59c6b9c5 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -66,6 +66,7 @@ pid_t bm_pid, ppid; int tests_run; char llc_occup_path[1024]; +bool is_amd; bool check_resctrlfs_support(void); int filter_dmesg(void); diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 7f75b0b01bb5..884e918b1e97 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -13,6 +13,27 @@ #define BENCHMARK_ARGS 64 #define BENCHMARK_ARG_SIZE 64 +bool is_amd; + +void detect_amd(void) +{ + FILE *inf = fopen("/proc/cpuinfo", "r"); + char *res; + + if (!inf) + return; + + res = fgrep(inf, "vendor_id"); + + if (res) { + char *s = strchr(res, ':'); + + is_amd = s && !strcmp(s, ": AuthenticAMD\n"); + free(res); + } + fclose(inf); +} + static void cmd_help(void) { printf("usage: resctrl_tests [-h] [-b \"benchmark_cmd [options]\"] [-t test list] [-n no_of_bits]\n"); @@ -107,6 +128,9 @@ int main(int argc, char **argv) if (geteuid() != 0) printf("# WARNING: not running as root, tests may fail.\n"); + /* Detect AMD vendor */ + detect_amd(); + if (has_ben) { /* Extract benchmark command from command line. */ for (i = ben_ind; i < argc; i++) { -- Gitee From dc8ad2d20d2d56948340a13f3bcb834947bf4a80 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Thu, 16 Jan 2020 13:32:44 -0800 Subject: [PATCH 1234/2161] selftests/resctrl: Use cache index3 id for AMD schemata masks commit c0327e1d7c42adb31a9541de6812f740893177a1 upstream. AMD uses the cache l3 boundary for schemata masks. Update it accordigly. Signed-off-by: Babu Moger Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/resctrlfs.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 8772c8c4d5d0..19c0ec4045a4 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -113,8 +113,13 @@ int get_resource_id(int cpu_no, int *resource_id) char phys_pkg_path[1024]; FILE *fp; - sprintf(phys_pkg_path, "%s%d/topology/physical_package_id", - PHYS_ID_PATH, cpu_no); + if (is_amd) + sprintf(phys_pkg_path, "%s%d/cache/index3/id", + PHYS_ID_PATH, cpu_no); + else + sprintf(phys_pkg_path, "%s%d/topology/physical_package_id", + PHYS_ID_PATH, cpu_no); + fp = fopen(phys_pkg_path, "r"); if (!fp) { perror("Failed to open physical_package_id"); -- Gitee From 21174ce5991f282989c902d1b9bc45e280107c85 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Thu, 16 Jan 2020 13:32:45 -0800 Subject: [PATCH 1235/2161] selftests/resctrl: Disable MBA and MBM tests for AMD commit 85f553d24ada5399cab730e3199a970e2cc78c82 upstream. For now, disable MBA and MBM tests for AMD. Deciding test pass/fail is not clear right now. We can enable when we have some clarity. 
Signed-off-by: Babu Moger Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/cat_test.c | 4 ++-- tools/testing/selftests/resctrl/resctrl_tests.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index c194fcc42407..5da43767b973 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -64,8 +64,8 @@ static void show_cache_info(unsigned long sum_llc_perf_miss, int no_of_bits, allocated_cache_lines * 100; printf("%sok CAT: cache miss rate within %d%%\n", - abs((int)diff_percent) > MAX_DIFF_PERCENT ? "not " : "", - MAX_DIFF_PERCENT); + !is_amd && abs((int)diff_percent) > MAX_DIFF_PERCENT ? + "not " : "", MAX_DIFF_PERCENT); tests_run++; printf("# Percent diff=%d\n", abs((int)diff_percent)); printf("# Number of bits: %d\n", no_of_bits); diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 884e918b1e97..425cc85ac883 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -158,7 +158,7 @@ int main(int argc, char **argv) check_resctrlfs_support(); filter_dmesg(); - if (mbm_test) { + if (!is_amd && mbm_test) { printf("# Starting MBM BW change ...\n"); if (!has_ben) sprintf(benchmark_cmd[5], "%s", "mba"); @@ -168,7 +168,7 @@ int main(int argc, char **argv) tests_run++; } - if (mba_test) { + if (!is_amd && mba_test) { printf("# Starting MBA Schemata change ...\n"); if (!has_ben) sprintf(benchmark_cmd[1], "%d", span); -- Gitee From 2d965669c0a854cfd64a1a404a4e3a4395763164 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 12 Feb 2020 10:38:29 +0000 Subject: [PATCH 1236/2161] selftests/resctrl: fix spelling mistake "Errror" -> "Error" commit 14f4283aa3e69a68e0a2e3bdb8a651a39bf18c52 upstream. There are two spelling mistakes in error messages. Fix these. Signed-off-by: Colin Ian King Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/fill_buf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c index 84d2a8b9657a..79c611c99a3d 100644 --- a/tools/testing/selftests/resctrl/fill_buf.c +++ b/tools/testing/selftests/resctrl/fill_buf.c @@ -181,7 +181,7 @@ fill_cache(unsigned long long buf_size, int malloc_and_init, int memflush, ret = fill_cache_write(start_ptr, end_ptr, resctrl_val); if (ret) { - printf("\n Errror in fill cache read/write...\n"); + printf("\n Error in fill cache read/write...\n"); return -1; } @@ -205,7 +205,7 @@ int run_fill_buf(unsigned long span, int malloc_and_init_memory, ret = fill_cache(cache_size, malloc_and_init_memory, memflush, op, resctrl_val); if (ret) { - printf("\n Errror in fill cache\n"); + printf("\n Error in fill cache\n"); return -1; } -- Gitee From dca57cbb39f8482fab8767b92f6892630b646aa1 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 5 May 2020 15:36:12 -0700 Subject: [PATCH 1237/2161] x86/resctrl: Rename asm/resctrl_sched.h to asm/resctrl.h commit 8dd97c65185c5a63c668e5bd8a861c04f47a35ed upstream. asm/resctrl_sched.h is dedicated to the code used for configuration of the CPU resource control state when a task is scheduled. 
Rename resctrl_sched.h to resctrl.h in preparation of additions that will no longer make this file dedicated to work done during scheduling. No functional change. Suggested-by: Borislav Petkov Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/6914e0ef880b539a82a6d889f9423496d471ad1d.1588715690.git.reinette.chatre@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/{resctrl_sched.h => resctrl.h} | 6 +++--- arch/x86/kernel/cpu/resctrl/core.c | 2 +- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2 +- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 2 +- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) rename arch/x86/include/asm/{resctrl_sched.h => resctrl.h} (96%) diff --git a/arch/x86/include/asm/resctrl_sched.h b/arch/x86/include/asm/resctrl.h similarity index 96% rename from arch/x86/include/asm/resctrl_sched.h rename to arch/x86/include/asm/resctrl.h index f6b7fe2833cc..99b5728e441d 100644 --- a/arch/x86/include/asm/resctrl_sched.h +++ b/arch/x86/include/asm/resctrl.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_RESCTRL_SCHED_H -#define _ASM_X86_RESCTRL_SCHED_H +#ifndef _ASM_X86_RESCTRL_H +#define _ASM_X86_RESCTRL_H #ifdef CONFIG_X86_CPU_RESCTRL @@ -90,4 +90,4 @@ static inline void resctrl_sched_in(void) {} #endif /* CONFIG_X86_CPU_RESCTRL */ -#endif /* _ASM_X86_RESCTRL_SCHED_H */ +#endif /* _ASM_X86_RESCTRL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 87a34b6e06a2..00e67073c586 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include "internal.h" /* Mutex to protect rdtgroup access. */ diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index d7623e1b927d..4bd28b388a1a 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include "../../events/perf_event.h" /* For X86_CONFIG() */ diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index ddc184841ea8..a19e5ca8f117 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -29,7 +29,7 @@ #include -#include +#include #include "internal.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 7b06156abbbd..670e5c1b6b86 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include "process.h" diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 0b85ea864049..8e148bef009b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -54,7 +54,7 @@ #include #include #include -#include +#include #include #include #ifdef CONFIG_IA32_EMULATION -- Gitee From e935f9cbb2328fe5e566f73251707571c98abc5a Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 5 May 2020 15:36:13 -0700 Subject: [PATCH 1238/2161] x86/cpu: Move resctrl CPUID code to resctrl/ commit 0118ad82c2a64ebcf15d7565ed35361407efadfa upstream. The function determining a platform's support and properties of cache occupancy and memory bandwidth monitoring (properties of X86_FEATURE_CQM_LLC) can be found among the common CPU code. 
After the feature's properties is populated in the per-CPU data the resctrl subsystem is the only consumer (via boot_cpu_data). Move the function that obtains the CPU information used by resctrl to the resctrl subsystem and rename it from init_cqm() to resctrl_cpu_detect(). The function continues to be called from the common CPU code. This move is done in preparation of the addition of some vendor specific code. No functional change. Suggested-by: Borislav Petkov Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/38433b99f9d16c8f4ee796f8cc42b871531fa203.1588715690.git.reinette.chatre@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/resctrl.h | 3 +++ arch/x86/kernel/cpu/common.c | 27 ++------------------------- arch/x86/kernel/cpu/resctrl/core.c | 24 ++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 99b5728e441d..07603064df8f 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -84,9 +84,12 @@ static inline void resctrl_sched_in(void) __resctrl_sched_in(); } +void resctrl_cpu_detect(struct cpuinfo_x86 *c); + #else static inline void resctrl_sched_in(void) {} +static inline void resctrl_cpu_detect(struct cpuinfo_x86 *c) {} #endif /* CONFIG_X86_CPU_RESCTRL */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3b05135ec124..c4258c7f225d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -59,6 +59,7 @@ #include #endif #include +#include #ifdef CONFIG_NUMA_AWARE_SPINLOCKS #include @@ -889,30 +890,6 @@ static void init_speculation_control(struct cpuinfo_x86 *c) } } -static void init_cqm(struct cpuinfo_x86 *c) -{ - if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { - c->x86_cache_max_rmid = -1; - c->x86_cache_occ_scale = -1; - return; - } - - /* will be overridden if occupancy monitoring exists */ - c->x86_cache_max_rmid = cpuid_ebx(0xf); - - if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || - cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || - cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { - u32 eax, ebx, ecx, edx; - - /* QoS sub-leaf, EAX=0Fh, ECX=1 */ - cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); - - c->x86_cache_max_rmid = ecx; - c->x86_cache_occ_scale = ebx; - } -} - void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -980,7 +957,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) init_scattered_cpuid_features(c); init_speculation_control(c); - init_cqm(c); + resctrl_cpu_detect(c); /* * Clear/Set all flags overridden by options, after probe. 
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 00e67073c586..aa1c7606eb01 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -960,6 +960,30 @@ static __init void rdt_init_res_defs(void) static enum cpuhp_state rdt_online; +void resctrl_cpu_detect(struct cpuinfo_x86 *c) +{ + if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + return; + } + + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = cpuid_ebx(0xf); + + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || + cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + u32 eax, ebx, ecx, edx; + + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); + + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; + } +} + static int __init resctrl_late_init(void) { struct rdt_resource *r; -- Gitee From 286b182d0418bd002ba8139a3dcd6cac50aaae23 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 5 May 2020 15:36:14 -0700 Subject: [PATCH 1239/2161] x86/resctrl: Remove unnecessary RMID checks commit f0d339db56478e3bcd98d5e985d3d69cacf27549 upstream. The cache and memory bandwidth monitoring properties are read using CPUID on every CPU. After the information is read from the system a sanity check is run to (1) ensure that the RMID data is initialized for the boot CPU in case the information was not available on the boot CPU and (2) the boot CPU's RMID is set to the minimum of RMID obtained from all CPUs. Every known platform that supports resctrl has the same maximum RMID on all CPUs. Both sanity checks found in x86_init_cache_qos() can thus safely be removed. Suggested-by: Borislav Petkov Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/c9a3b60d34091840c8b0bd1c6fab15e5ba92cb17.1588715690.git.reinette.chatre@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/common.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c4258c7f225d..39557f130216 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1476,20 +1476,6 @@ static void generic_identify(struct cpuinfo_x86 *c) #endif } -static void x86_init_cache_qos(struct cpuinfo_x86 *c) -{ - /* - * The heavy lifting of max_rmid and cache_occ_scale are handled - * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu - * in case CQM bits really aren't there in this CPU. - */ - if (c != &boot_cpu_data) { - boot_cpu_data.x86_cache_max_rmid = - min(boot_cpu_data.x86_cache_max_rmid, - c->x86_cache_max_rmid); - } -} - /* * Validate that ACPI/mptables have the same information about the * effective APIC id and update the package map. @@ -1609,7 +1595,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) #endif x86_init_rdrand(c); - x86_init_cache_qos(c); setup_pku(c); /* -- Gitee From 7a6d1618b11fcdafb6054b274a40eb834f1c13f9 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 5 May 2020 15:36:15 -0700 Subject: [PATCH 1240/2161] x86/resctrl: Query LLC monitoring properties once during boot commit 923f3a2b48bdccb6a1d1f0dd48de03de7ad936d9 upstream. Cache and memory bandwidth monitoring are features that are part of x86 CPU resource control that is supported by the resctrl subsystem. 
The monitoring properties are obtained via CPUID from every CPU and only used within the resctrl subsystem where the properties are only read from boot_cpu_data. Obtain the monitoring properties once, placed in boot_cpu_data, via the ->c_bsp_init() helpers of the vendors that support X86_FEATURE_CQM_LLC. Suggested-by: Borislav Petkov Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/6d74a6ac3e69f4b7a8b4115835f9455faf0f468d.1588715690.git.reinette.chatre@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/amd.c | 3 +++ arch/x86/kernel/cpu/common.c | 2 -- arch/x86/kernel/cpu/intel.c | 7 +++++++ arch/x86/kernel/cpu/resctrl/core.c | 1 + 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index eea739a66a4b..e18408e98b84 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 # include @@ -574,6 +575,8 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; } } + + resctrl_cpu_detect(c); } static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 39557f130216..37787f7f36d3 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -59,7 +59,6 @@ #include #endif #include -#include #ifdef CONFIG_NUMA_AWARE_SPINLOCKS #include @@ -957,7 +956,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c) init_scattered_cpuid_features(c); init_speculation_control(c); - resctrl_cpu_detect(c); /* * Clear/Set all flags overridden by options, after probe. diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index f3f42e216992..91fd4b7c25f8 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -19,6 +19,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include @@ -341,6 +342,11 @@ static void early_init_intel(struct cpuinfo_x86 *c) detect_ht_early(c); } +static void bsp_init_intel(struct cpuinfo_x86 *c) +{ + resctrl_cpu_detect(c); +} + #ifdef CONFIG_X86_32 /* * Early probe support logic for ppro memory erratum #50 @@ -1025,6 +1031,7 @@ static const struct cpu_dev intel_cpu_dev = { #endif .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, + .c_bsp_init = bsp_init_intel, .c_init = init_intel, .c_x86_vendor = X86_VENDOR_INTEL, }; diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index aa1c7606eb01..a95e015ebd66 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -960,6 +960,7 @@ static __init void rdt_init_res_defs(void) static enum cpuhp_state rdt_online; +/* Runs once on the BSP during boot. */ void resctrl_cpu_detect(struct cpuinfo_x86 *c) { if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { -- Gitee From 2ff045a6972b26832fddfcf2f3a89d462199671a Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 5 May 2020 15:36:16 -0700 Subject: [PATCH 1241/2161] x86/resctrl: Maintain MBM counter width per resource commit 46637d4570e108d1f6721cfa2cca1d078882761a upstream. The original Memory Bandwidth Monitoring (MBM) architectural definition defines counters of up to 62 bits in the IA32_QM_CTR MSR, and the first-generation MBM implementation uses 24 bit counters. Software is required to poll at 1 second or faster to ensure that data is retrieved before a counter rollover occurs more than once under worst conditions. 
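For illustration, the rollover handling itself reduces to a shift trick that works for any counter width; below is a minimal stand-alone sketch (plain user-space C, not the kernel code) of the computation that mbm_overflow_count() performs:

    #include <stdio.h>
    #include <stdint.h>

    /* Delta of two counter samples that wrap at 2^width (width <= 63). */
    static uint64_t overflow_count(uint64_t prev, uint64_t cur, unsigned int width)
    {
            uint64_t shift = 64 - width;

            /*
             * Shifting both samples up to the top bits makes the unsigned
             * subtraction wrap at 2^width, so a single rollover between
             * reads is accounted for correctly.
             */
            return ((cur << shift) - (prev << shift)) >> shift;
    }

    int main(void)
    {
            /* 24-bit counter: prev near the top, cur just past a wrap. */
            printf("%llu\n",
                   (unsigned long long)overflow_count(0xFFFFFA, 0x4, 24));
            /* prints 10: the counter advanced by 10 chunks across the wrap */
            return 0;
    }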
As system bandwidths scale the software requirement is maintained with the introduction of a per-resource enumerable MBM counter width. In preparation for supporting hardware with an enumerable MBM counter width the current globally static MBM counter width is moved to a per-resource MBM counter width. Currently initialized to 24 always to result in no functional change. In essence there is one function, mbm_overflow_count() that needs to know the counter width to handle rollovers. The static value used within mbm_overflow_count() will be replaced with a value discovered from the hardware. Support for learning the MBM counter width from hardware is added in the change that follows. Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/e36743b9800f16ce600f86b89127391f61261f23.1588715690.git.reinette.chatre@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 8 +++++--- arch/x86/kernel/cpu/resctrl/internal.h | 7 +++++-- arch/x86/kernel/cpu/resctrl/monitor.c | 21 +++++++++++++-------- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 2 +- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 055c8613b531..934c8fb8a64a 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -495,14 +495,16 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, return ret; } -void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, - struct rdtgroup *rdtgrp, int evtid, int first) +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first) { /* * setup the parameters to send to the IPI to read the data. 
*/ rr->rgrp = rdtgrp; rr->evtid = evtid; + rr->r = r; rr->d = d; rr->val = 0; rr->first = first; @@ -539,7 +541,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) goto out; } - mon_event_read(&rr, d, rdtgrp, evtid, false); + mon_event_read(&rr, r, d, rdtgrp, evtid, false); if (rr.val & RMID_VAL_ERROR) seq_puts(m, "Error\n"); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 499cb2e727a0..a1a458654b40 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -88,6 +88,7 @@ union mon_data_bits { struct rmid_read { struct rdtgroup *rgrp; + struct rdt_resource *r; struct rdt_domain *d; int evtid; bool first; @@ -461,6 +462,7 @@ struct rdt_resource { struct list_head evt_list; int num_rmid; unsigned int mon_scale; + unsigned int mbm_width; unsigned long fflags; }; @@ -588,8 +590,9 @@ void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id); void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, struct rdt_domain *d); -void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, - struct rdtgroup *rdtgrp, int evtid, int first); +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first); void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms); void mbm_handle_overflow(struct work_struct *work); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 50f683ecd2c6..8a3309b2a442 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -214,11 +214,11 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr) +static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) { u64 shift, chunks; - shift = 64 - rdt_resources_all[RDT_RESOURCE_MBA].membw.mbm_width; + shift = 64 - width; chunks = (cur_msr << shift) - (prev_msr << shift); return chunks >>= shift; } @@ -257,7 +257,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) return 0; } - chunks = mbm_overflow_count(m->prev_msr, tval); + chunks = mbm_overflow_count(m->prev_msr, tval, rr->r->mbm_width); m->chunks += chunks; m->prev_msr = tval; @@ -279,7 +279,7 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) return; - chunks = mbm_overflow_count(m->prev_bw_msr, tval); + chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width); cur_bw = (chunks * r->mon_scale) >> 20; if (m->delta_comp) @@ -432,11 +432,12 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) } } -static void mbm_update(struct rdt_domain *d, int rmid) +static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) { struct rmid_read rr; rr.first = false; + rr.r = r; rr.d = d; /* @@ -508,6 +509,7 @@ void mbm_handle_overflow(struct work_struct *work) struct rdtgroup *prgrp, *crgrp; int cpu = smp_processor_id(); struct list_head *head; + struct rdt_resource *r; struct rdt_domain *d; mutex_lock(&rdtgroup_mutex); @@ -515,16 +517,18 @@ void mbm_handle_overflow(struct work_struct *work) if (!static_branch_likely(&rdt_mon_enable_key)) goto out_unlock; - d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + d = get_domain_from_cpu(cpu, r); if (!d) goto out_unlock; list_for_each_entry(prgrp, &rdt_all_groups, 
rdtgroup_list) { - mbm_update(d, prgrp->mon.rmid); + mbm_update(r, d, prgrp->mon.rmid); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(d, crgrp->mon.rmid); + mbm_update(r, d, crgrp->mon.rmid); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); @@ -617,6 +621,7 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) r->mon_scale = boot_cpu_data.x86_cache_occ_scale; r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + r->mbm_width = MBM_CNTR_WIDTH; /* * A reasonable upper limit on the max threshold is the number diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index a19e5ca8f117..f4efb77775ee 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2455,7 +2455,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, goto out_destroy; if (is_mbm_event(mevt->evtid)) - mon_event_read(&rr, d, prgrp, mevt->evtid, true); + mon_event_read(&rr, r, d, prgrp, mevt->evtid, true); } kernfs_activate(kn); return 0; -- Gitee From 33b4d5ca2acc05629be20da167226854533765e5 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 5 May 2020 15:36:17 -0700 Subject: [PATCH 1242/2161] x86/resctrl: Support CPUID enumeration of MBM counter width commit f3d44f18b0662327c42128b9d3604489bdb6e36f upstream. The original Memory Bandwidth Monitoring (MBM) architectural definition defines counters of up to 62 bits in the IA32_QM_CTR MSR while the first-generation MBM implementation uses statically defined 24 bit counters. Expand the MBM CPUID enumeration properties to include the MBM counter width. The previously undefined EAX output register contains, in bits [7:0], the MBM counter width encoded as an offset from 24 bits. Enumerating this property is only specified for Intel CPUs. 
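As a quick user-space illustration of that enumeration (gcc/clang on x86; this sketch assumes the sub-leaf is implemented, otherwise the offset simply reads back as zero):

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

            /* QoS monitoring sub-leaf: EAX=0xF, ECX=1 */
            __cpuid_count(0xf, 1, eax, ebx, ecx, edx);

            /* EAX[7:0] is the counter width as an offset from the 24-bit base. */
            printf("MBM counter width: %u bits\n", 24 + (eax & 0xff));
            return 0;
    }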
Suggested-by: Borislav Petkov Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/afa3af2f753f6bc301fb743bc8944e749cb24afa.1588715690.git.reinette.chatre@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/processor.h | 3 ++- arch/x86/kernel/cpu/resctrl/core.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 111cfb4c1deb..584414a798d5 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -99,9 +99,10 @@ struct cpuinfo_x86 { /* in KB - valid for CPUS which support this call: */ unsigned int x86_cache_size; int x86_cache_alignment; /* In bytes */ - /* Cache QoS architectural values: */ + /* Cache QoS architectural values, valid only on the BSP: */ int x86_cache_max_rmid; /* max index */ int x86_cache_occ_scale; /* scale to bytes */ + int x86_cache_mbm_width_offset; int x86_power; unsigned long loops_per_jiffy; /* cpuid returned max cores value: */ diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index a95e015ebd66..e4a9a43f1d88 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -966,6 +966,7 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { c->x86_cache_max_rmid = -1; c->x86_cache_occ_scale = -1; + c->x86_cache_mbm_width_offset = -1; return; } @@ -982,6 +983,10 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) c->x86_cache_max_rmid = ecx; c->x86_cache_occ_scale = ebx; + if (c->x86_vendor == X86_VENDOR_INTEL) + c->x86_cache_mbm_width_offset = eax & 0xff; + else + c->x86_cache_mbm_width_offset = -1; } } -- Gitee From def8257c90047a98465676bab546a9e6917e992e Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 5 May 2020 15:36:18 -0700 Subject: [PATCH 1243/2161] x86/resctrl: Support wider MBM counters commit 0c4d5ba1b998e713815b7790d3db6ced0ae49489 upstream. The original Memory Bandwidth Monitoring (MBM) architectural definition defines counters of up to 62 bits in the IA32_QM_CTR MSR while the first-generation MBM implementation uses statically defined 24 bit counters. The MBM CPUID enumeration properties have been expanded to include the MBM counter width, encoded as an offset from 24 bits. While eight bits are available for the counter width offset IA32_QM_CTR MSR only supports 62 bit counters. Add a sanity check, with warning printed when encountered, to ensure counters cannot exceed the 62 bit limit. 
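For reference, with the 24-bit base width this caps the enumerable offset at 62 - 24 = 38 bits, which is exactly what the MBM_CNTR_WIDTH_OFFSET_MAX definition in the hunk below encodes.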
Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/69d52abd5b14794d3a0f05ba7c755ed1f4c0d5ed.1588715690.git.reinette.chatre@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/internal.h | 8 +++++++- arch/x86/kernel/cpu/resctrl/monitor.c | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index a1a458654b40..26f4d8571676 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -31,7 +31,7 @@ #define CQM_LIMBOCHECK_INTERVAL 1000 -#define MBM_CNTR_WIDTH 24 +#define MBM_CNTR_WIDTH_BASE 24 #define MBM_CNTR_WIDTH_AMD 44 #define MBM_OVERFLOW_INTERVAL 1000 #define MAX_MBA_BW 100u @@ -41,6 +41,12 @@ #define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_UNAVAIL BIT_ULL(62) +/* + * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for + * data to be returned. The counter width is discovered from the hardware + * as an offset from MBM_CNTR_WIDTH_BASE. + */ +#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) struct rdt_fs_context { diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 8a3309b2a442..6e9efb81fcab 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -616,12 +616,18 @@ static void l3_mon_evt_init(struct rdt_resource *r) int rdt_get_mon_l3_config(struct rdt_resource *r) { + unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; unsigned int cl_size = boot_cpu_data.x86_cache_size; int ret; r->mon_scale = boot_cpu_data.x86_cache_occ_scale; r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; - r->mbm_width = MBM_CNTR_WIDTH; + r->mbm_width = MBM_CNTR_WIDTH_BASE; + + if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) + r->mbm_width += mbm_offset; + else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX) + pr_warn("Ignoring impossible MBM counter offset\n"); /* * A reasonable upper limit on the max threshold is the number -- Gitee From d63c2614066a8940614597602ce976fa421b0ec1 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Thu, 4 Jun 2020 14:45:16 -0500 Subject: [PATCH 1244/2161] x86/resctrl: Fix memory bandwidth counter width for AMD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 2c18bd525c47f882f033b0a813ecd09c93e1ecdf upstream. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Memory bandwidth is calculated reading the monitoring counter at two intervals and calculating the delta. It is the software’s responsibility to read the count often enough to avoid having the count roll over _twice_ between reads. The current code hardcodes the bandwidth monitoring counter's width to 24 bits for AMD. This is due to default base counter width which is 24. Currently, AMD does not implement the CPUID 0xF.[ECX=1]:EAX to adjust the counter width. But, the AMD hardware supports much wider bandwidth counter with the default width of 44 bits. Kernel reads these monitoring counters every 1 second and adjusts the counter value for overflow. With 24 bits and scale value of 64 for AMD, it can only measure up to 1GB/s without overflowing. For the rates above 1GB/s this will fail to measure the bandwidth. Fix the issue setting the default width to 44 bits by adjusting the offset. AMD future products will implement CPUID 0xF.[ECX=1]:EAX. 
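For reference, the arithmetic behind the 1GB/s figure above, using the AMD scale value of 64 bytes per chunk:

    24-bit counter: 2^24 chunks * 64 bytes = 2^30 bytes, so the counter wraps after 1 GiB;
    44-bit counter: 2^44 chunks * 64 bytes = 2^50 bytes, so it wraps only after 1 PiB.

With the 1 second read interval used by the kernel, a 24 bit width therefore cannot represent rates above roughly 1 GiB/s, while 44 bits leaves ample headroom.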
[ bp: Let the line stick out and drop {}-brackets around a single statement. ] Fixes: 4d05bf71f157 ("x86/resctrl: Introduce AMD QOS feature") Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/159129975546.62538.5656031125604254041.stgit@naples-babu.amd.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 10 +++++----- arch/x86/kernel/cpu/resctrl/internal.h | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index e4a9a43f1d88..fb47bdbc1254 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -260,7 +260,7 @@ static bool __get_mem_config_intel(struct rdt_resource *r) r->num_closid = edx.split.cos_max + 1; r->membw.max_delay = eax.split.max_delay + 1; r->default_ctrl = MAX_MBA_BW; - r->membw.mbm_width = MBM_CNTR_WIDTH; + r->membw.mbm_width = MBM_CNTR_WIDTH_BASE; if (ecx & MBA_IS_LINEAR) { r->membw.delay_linear = true; r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay; @@ -983,10 +983,10 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) c->x86_cache_max_rmid = ecx; c->x86_cache_occ_scale = ebx; - if (c->x86_vendor == X86_VENDOR_INTEL) - c->x86_cache_mbm_width_offset = eax & 0xff; - else - c->x86_cache_mbm_width_offset = -1; + c->x86_cache_mbm_width_offset = eax & 0xff; + + if (c->x86_vendor == X86_VENDOR_AMD && !c->x86_cache_mbm_width_offset) + c->x86_cache_mbm_width_offset = MBM_CNTR_WIDTH_OFFSET_AMD; } } diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 26f4d8571676..5b9fbeca0054 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -38,6 +38,7 @@ #define MBA_IS_LINEAR 0x4 #define MBA_MAX_MBPS U32_MAX #define MAX_MBA_BW_AMD 0x800 +#define MBM_CNTR_WIDTH_OFFSET_AMD 20 #define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_UNAVAIL BIT_ULL(62) -- Gitee From f3094df0f16afa3966d90879d3ec1df627865871 Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Wed, 21 Sep 2022 16:13:24 +0800 Subject: [PATCH 1245/2161] Revert "x86/resctrl: Fix memory bandwidth counter width for AMD" commit opencloudos. This reverts commit 5900a592c5db56ccdc9a545ef3d9bfe620fe663e. Because we port the upstream fix patch in next patch. 
Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 2 -- arch/x86/kernel/cpu/resctrl/internal.h | 2 -- arch/x86/kernel/cpu/resctrl/monitor.c | 3 +-- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index fb47bdbc1254..6a9df71c1b9e 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -260,7 +260,6 @@ static bool __get_mem_config_intel(struct rdt_resource *r) r->num_closid = edx.split.cos_max + 1; r->membw.max_delay = eax.split.max_delay + 1; r->default_ctrl = MAX_MBA_BW; - r->membw.mbm_width = MBM_CNTR_WIDTH_BASE; if (ecx & MBA_IS_LINEAR) { r->membw.delay_linear = true; r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay; @@ -290,7 +289,6 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) /* AMD does not use delay */ r->membw.delay_linear = false; - r->membw.mbm_width = MBM_CNTR_WIDTH_AMD; r->membw.min_bw = 0; r->membw.bw_gran = 1; /* Max value is 2048, Data width should be 4 in decimal */ diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 5b9fbeca0054..2c79eaab2e2a 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -375,7 +375,6 @@ struct rdt_cache { * @min_bw: Minimum memory bandwidth percentage user can request * @bw_gran: Granularity at which the memory bandwidth is allocated * @delay_linear: True if memory B/W delay is in linear scale - * @mbm_width: memory B/W monitor counter width * @mba_sc: True if MBA software controller(mba_sc) is enabled * @mb_map: Mapping of memory B/W percentage to memory B/W delay */ @@ -384,7 +383,6 @@ struct rdt_membw { u32 min_bw; u32 bw_gran; u32 delay_linear; - u32 mbm_width; bool mba_sc; u32 *mb_map; }; diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 6e9efb81fcab..ddd91344682c 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -216,9 +216,8 @@ void free_rmid(u32 rmid) static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) { - u64 shift, chunks; + u64 shift = 64 - width, chunks; - shift = 64 - width; chunks = (cur_msr << shift) - (prev_msr << shift); return chunks >>= shift; } -- Gitee From 4e5c9ca8e222f179d00a34e6fd11855011e63416 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 8 Jul 2020 16:39:21 +0000 Subject: [PATCH 1246/2161] x86/resctrl: Remove struct rdt_membw::max_delay commit e89f85b9171665c917dca59920884f3d4fe0b1ef upstream. max_delay is used by x86's __get_mem_config_intel() as a local variable. Remove it, replacing it with a local variable. 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Link: https://lkml.kernel.org/r/20200708163929.2783-3-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 8 ++++---- arch/x86/kernel/cpu/resctrl/internal.h | 3 --- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 6a9df71c1b9e..9225ee5cca6f 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -254,16 +254,16 @@ static bool __get_mem_config_intel(struct rdt_resource *r) { union cpuid_0x10_3_eax eax; union cpuid_0x10_x_edx edx; - u32 ebx, ecx; + u32 ebx, ecx, max_delay; cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); r->num_closid = edx.split.cos_max + 1; - r->membw.max_delay = eax.split.max_delay + 1; + max_delay = eax.split.max_delay + 1; r->default_ctrl = MAX_MBA_BW; if (ecx & MBA_IS_LINEAR) { r->membw.delay_linear = true; - r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay; - r->membw.bw_gran = MAX_MBA_BW - r->membw.max_delay; + r->membw.min_bw = MAX_MBA_BW - max_delay; + r->membw.bw_gran = MAX_MBA_BW - max_delay; } else { if (!rdt_get_mb_table(r)) return false; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 2c79eaab2e2a..fdce3b3feaf3 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -370,8 +370,6 @@ struct rdt_cache { /** * struct rdt_membw - Memory bandwidth allocation related data - * @max_delay: Max throttle delay. Delay is the hardware - * representation for memory bandwidth. * @min_bw: Minimum memory bandwidth percentage user can request * @bw_gran: Granularity at which the memory bandwidth is allocated * @delay_linear: True if memory B/W delay is in linear scale @@ -379,7 +377,6 @@ struct rdt_cache { * @mb_map: Mapping of memory B/W percentage to memory B/W delay */ struct rdt_membw { - u32 max_delay; u32 min_bw; u32 bw_gran; u32 delay_linear; -- Gitee From af633d84defcbe52974f5e98fdbeb52361e70611 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 8 Jul 2020 16:39:22 +0000 Subject: [PATCH 1247/2161] x86/resctrl: Fix stale comment commit ae0fbedd2a18cd82a2c0c5605a0a60865bc54576 upstream. The comment in rdtgroup_init() refers to the non existent function rdt_mount(), which has now been renamed rdt_get_tree(). Fix the comment. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Link: https://lkml.kernel.org/r/20200708163929.2783-4-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index f4efb77775ee..f47ceedb4e32 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -3169,7 +3169,7 @@ int __init rdtgroup_init(void) * It may also be ok since that would enable debugging of RDT before * resctrl is mounted. * The reason why the debugfs directory is created here and not in - * rdt_mount() is because rdt_mount() takes rdtgroup_mutex and + * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and * during the debugfs directory creation also &sb->s_type->i_mutex_key * (the lockdep class of inode->i_rwsem). Other filesystem * interactions (eg. 
SyS_getdents) have the lock ordering: -- Gitee From ca37efc7352fca66e2ecc8f430bfea52275d9397 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 8 Jul 2020 16:39:23 +0000 Subject: [PATCH 1248/2161] x86/resctrl: Use container_of() in delayed_work handlers commit f995801ba3a0660cb352c8beb794379c82781ca3 upstream. mbm_handle_overflow() and cqm_handle_limbo() are both provided with the domain's work_struct when called, but use get_domain_from_cpu() to find the domain, along with the appropriate error handling. container_of() saves some list walking and bitmap testing, use that instead. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Link: https://lkml.kernel.org/r/20200708163929.2783-5-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/monitor.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index ddd91344682c..a98519a3a2e6 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -475,19 +475,13 @@ void cqm_handle_limbo(struct work_struct *work) mutex_lock(&rdtgroup_mutex); r = &rdt_resources_all[RDT_RESOURCE_L3]; - d = get_domain_from_cpu(cpu, r); - - if (!d) { - pr_warn_once("Failure to get domain for limbo worker\n"); - goto out_unlock; - } + d = container_of(work, struct rdt_domain, cqm_limbo.work); __check_limbo(d, false); if (has_busy_rmid(r, d)) schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); -out_unlock: mutex_unlock(&rdtgroup_mutex); } @@ -517,10 +511,7 @@ void mbm_handle_overflow(struct work_struct *work) goto out_unlock; r = &rdt_resources_all[RDT_RESOURCE_L3]; - - d = get_domain_from_cpu(cpu, r); - if (!d) - goto out_unlock; + d = container_of(work, struct rdt_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { mbm_update(r, d, prgrp->mon.rmid); -- Gitee From cb24c4da09456b96a2226d5febec35e3839c253e Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 8 Jul 2020 16:39:24 +0000 Subject: [PATCH 1249/2161] x86/resctrl: Include pid.h commit a21a4391f20c0ab45db452e22bc3e8afe8b36e46 upstream. We are about to disturb the header soup. This header uses struct pid and struct pid_namespace. Include their header. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Link: https://lkml.kernel.org/r/20200708163929.2783-6-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/resctrl.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index daf5cf64c6a6..9b05af9b3e28 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -2,6 +2,8 @@ #ifndef _RESCTRL_H #define _RESCTRL_H +#include + #ifdef CONFIG_PROC_CPU_RESCTRL int proc_resctrl_show(struct seq_file *m, -- Gitee From e29e27fe20d246375e2304cc08d3f36c839d9288 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 8 Jul 2020 16:39:25 +0000 Subject: [PATCH 1250/2161] x86/resctrl: Use is_closid_match() in more places commit e6b2fac36fcc0b73cbef063d700a9841850e37a0 upstream. rdtgroup_tasks_assigned() and show_rdt_tasks() loop over threads testing for a CTRL/MON group match by closid/rmid with the provided rdtgrp. Further down the file are helpers to do this, move these further up and make use of them here. These helpers additionally check for alloc/mon capable. 
This is harmless as rdtgroup_mkdir() tests these capable flags before allowing the config directories to be created. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Link: https://lkml.kernel.org/r/20200708163929.2783-7-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 30 ++++++++++++-------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index f47ceedb4e32..7403f3ea744a 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -591,6 +591,18 @@ static int __rdtgroup_move_task(struct task_struct *tsk, return 0; } +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_alloc_capable && + (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_mon_capable && + (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); +} + /** * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group * @r: Resource group @@ -606,8 +618,7 @@ int rdtgroup_tasks_assigned(struct rdtgroup *r) rcu_read_lock(); for_each_process_thread(p, t) { - if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || - (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) { + if (is_closid_match(t, r) || is_rmid_match(t, r)) { ret = 1; break; } @@ -705,8 +716,7 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) rcu_read_lock(); for_each_process_thread(p, t) { - if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || - (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) + if (is_closid_match(t, r) || is_rmid_match(t, r)) seq_printf(s, "%d\n", t->pid); } rcu_read_unlock(); @@ -2232,18 +2242,6 @@ static int reset_all_ctrls(struct rdt_resource *r) return 0; } -static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (rdt_alloc_capable && - (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); -} - -static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (rdt_mon_capable && - (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); -} - /* * Move tasks from one to the other group. If @from is NULL, then all tasks * in the systems are moved unconditionally (used for teardown). -- Gitee From 9477d2104961adb7c85309590a56dbbdcb9f62be Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 8 Jul 2020 16:39:26 +0000 Subject: [PATCH 1251/2161] x86/resctrl: Add struct rdt_membw::arch_needs_linear to explain AMD/Intel MBA difference commit 41215b7947f1b1b86fd77a7bebd2320599aea7bd upstream. The configuration values user-space provides to the resctrl filesystem are ABI. To make this work on another architecture, all the ABI bits should be moved out of /arch/x86 and under /fs. To do this, the differences between AMD and Intel CPUs needs to be explained to resctrl via resource properties, instead of function pointers that let the arch code accept subtly different values on different platforms/architectures. For MBA, Intel CPUs reject configuration attempts for non-linear resources, whereas AMD ignore this field as its MBA resource is never linear. To merge the parse/validate functions, this difference needs to be explained. Add struct rdt_membw::arch_needs_linear to indicate the arch code needs the linear property to be true to configure this resource. 
AMD can set this and delay_linear to false. Intel can set arch_needs_linear to true to keep the existing "No support for non-linear MB domains" error message for affected platforms. [ bp: convert "we" etc to passive voice. ] Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Link: https://lkml.kernel.org/r/20200708163929.2783-8-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 4 +++- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 8 +++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 9225ee5cca6f..92bb202f7c00 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -260,6 +260,7 @@ static bool __get_mem_config_intel(struct rdt_resource *r) r->num_closid = edx.split.cos_max + 1; max_delay = eax.split.max_delay + 1; r->default_ctrl = MAX_MBA_BW; + r->membw.arch_needs_linear = true; if (ecx & MBA_IS_LINEAR) { r->membw.delay_linear = true; r->membw.min_bw = MAX_MBA_BW - max_delay; @@ -267,6 +268,7 @@ static bool __get_mem_config_intel(struct rdt_resource *r) } else { if (!rdt_get_mb_table(r)) return false; + r->membw.arch_needs_linear = false; } r->data_width = 3; @@ -287,7 +289,7 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) r->default_ctrl = MAX_MBA_BW_AMD; /* AMD does not use delay */ - r->membw.delay_linear = false; + r->membw.arch_needs_linear = false; r->membw.min_bw = 0; r->membw.bw_gran = 1; diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 934c8fb8a64a..e3bcd77add2b 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -33,6 +33,12 @@ static bool bw_validate_amd(char *buf, unsigned long *data, unsigned long bw; int ret; + /* temporary: always false on AMD */ + if (!r->membw.delay_linear && r->membw.arch_needs_linear) { + rdt_last_cmd_puts("No support for non-linear MB domains\n"); + return false; + } + ret = kstrtoul(buf, 10, &bw); if (ret) { rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf); @@ -82,7 +88,7 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) /* * Only linear delay values is supported for current Intel SKUs. */ - if (!r->membw.delay_linear) { + if (!r->membw.delay_linear && r->membw.arch_needs_linear) { rdt_last_cmd_puts("No support for non-linear MB domains\n"); return false; } -- Gitee From 0f58fac1ca17cbe77a21cad81a410e9387ef7c76 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 8 Jul 2020 16:39:27 +0000 Subject: [PATCH 1252/2161] x86/resctrl: Merge AMD/Intel parse_bw() calls commit 5df3ca9334d5603e4afbb95953d0affb37dcf86b upstream. Now after arch_needs_linear has been added, the parse_bw() calls are almost the same between AMD and Intel. The difference is '!is_mba_sc()', which is not checked on AMD. This will always be true on AMD CPUs as mba_sc cannot be enabled as is_mba_linear() is false. Removing this duplication means user-space visible behaviour and error messages are not validated or generated in different places. 
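To make the converged path concrete, here is a compressed stand-alone sketch (not the kernel code itself) of a single bandwidth check keyed off arch_needs_linear; the struct and the error string only mirror the kernel ones and the program is illustrative.

#include <stdbool.h>
#include <stdio.h>

struct membw_props {
	bool delay_linear;      /* hardware reports a linear delay scale        */
	bool arch_needs_linear; /* arch only accepts configs on linear hardware */
};

/* One shared check instead of per-vendor copies: Intel sets
 * arch_needs_linear so non-linear hardware is rejected, AMD clears it so
 * the same test never fires.
 */
static bool bw_config_supported(const struct membw_props *m)
{
	if (!m->delay_linear && m->arch_needs_linear) {
		puts("No support for non-linear MB domains");
		return false;
	}
	return true;
}

int main(void)
{
	struct membw_props intel_nonlinear = { .delay_linear = false, .arch_needs_linear = true };
	struct membw_props amd = { .delay_linear = false, .arch_needs_linear = false };

	printf("intel, non-linear hardware: %d\n", bw_config_supported(&intel_nonlinear)); /* 0 */
	printf("amd: %d\n", bw_config_supported(&amd)); /* 1 */
	return 0;
}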
Reviewed-by : Babu Moger Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Link: https://lkml.kernel.org/r/20200708163929.2783-9-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 3 +- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 57 +---------------------- arch/x86/kernel/cpu/resctrl/internal.h | 6 +-- 3 files changed, 5 insertions(+), 61 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 92bb202f7c00..ea0b4f640a3f 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -168,6 +168,7 @@ struct rdt_resource rdt_resources_all[] = { .name = "MB", .domains = domain_init(RDT_RESOURCE_MBA), .cache_level = 3, + .parse_ctrlval = parse_bw, .format_str = "%d=%*u", .fflags = RFTYPE_RES_MB, }, @@ -925,7 +926,6 @@ static __init void rdt_init_res_defs_intel(void) else if (r->rid == RDT_RESOURCE_MBA) { r->msr_base = MSR_IA32_MBA_THRTL_BASE; r->msr_update = mba_wrmsr_intel; - r->parse_ctrlval = parse_bw_intel; } } } @@ -945,7 +945,6 @@ static __init void rdt_init_res_defs_amd(void) else if (r->rid == RDT_RESOURCE_MBA) { r->msr_base = MSR_IA32_MBA_BW_BASE; r->msr_update = mba_wrmsr_amd; - r->parse_ctrlval = parse_bw_amd; } } } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index e3bcd77add2b..b0e24cb6f85c 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -21,59 +21,6 @@ #include #include "internal.h" -/* - * Check whether MBA bandwidth percentage value is correct. The value is - * checked against the minimum and maximum bandwidth values specified by - * the hardware. The allocated bandwidth percentage is rounded to the next - * control step available on the hardware. - */ -static bool bw_validate_amd(char *buf, unsigned long *data, - struct rdt_resource *r) -{ - unsigned long bw; - int ret; - - /* temporary: always false on AMD */ - if (!r->membw.delay_linear && r->membw.arch_needs_linear) { - rdt_last_cmd_puts("No support for non-linear MB domains\n"); - return false; - } - - ret = kstrtoul(buf, 10, &bw); - if (ret) { - rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf); - return false; - } - - if (bw < r->membw.min_bw || bw > r->default_ctrl) { - rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw, - r->membw.min_bw, r->default_ctrl); - return false; - } - - *data = roundup(bw, (unsigned long)r->membw.bw_gran); - return true; -} - -int parse_bw_amd(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d) -{ - unsigned long bw_val; - - if (d->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->id); - return -EINVAL; - } - - if (!bw_validate_amd(data->buf, &bw_val, r)) - return -EINVAL; - - d->new_ctrl = bw_val; - d->have_new_ctrl = true; - - return 0; -} - /* * Check whether MBA bandwidth percentage value is correct. 
The value is * checked against the minimum and max bandwidth values specified by the @@ -110,8 +57,8 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return true; } -int parse_bw_intel(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d) +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d) { unsigned long bw_val; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index fdce3b3feaf3..3b018ea5c0df 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -470,10 +470,8 @@ struct rdt_resource { int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw_intel(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d); -int parse_bw_amd(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d); +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d); extern struct mutex rdtgroup_mutex; -- Gitee From bb4589da3aa4e9568884b0f0703316a1012625d4 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 8 Jul 2020 16:39:28 +0000 Subject: [PATCH 1253/2161] x86/resctrl: Add struct rdt_cache::arch_has_{sparse, empty}_bitmaps commit 41215b7947f1b1b86fd77a7bebd2320599aea7bd upstream. Intel CPUs expect the cache bitmap provided by user-space to have on a single span of 1s, whereas AMD can support bitmaps like 0xf00f. Arm's MPAM support also allows sparse bitmaps. Similarly, Intel CPUs check at least one bit set, whereas AMD CPUs are quite happy with an empty bitmap. Arm's MPAM allows an empty bitmap. To move resctrl out to /fs/, platform differences like this need to be explained. Add two resource properties arch_has_{empty,sparse}_bitmaps. Test these around the relevant parts of cbm_validate(). Merging the validate calls causes AMD to gain the min_cbm_bits test needed for Haswell, but as it always sets this value to 1, it will never match. [ bp: Massage commit message. 
] Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Babu Moger Reviewed-by: Reinette Chatre Link: https://lkml.kernel.org/r/20200708163929.2783-10-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 14 ++++---- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 39 ++++++----------------- arch/x86/kernel/cpu/resctrl/internal.h | 8 ++--- 3 files changed, 22 insertions(+), 39 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index ea0b4f640a3f..b39cdd638746 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -921,9 +921,10 @@ static __init void rdt_init_res_defs_intel(void) r->rid == RDT_RESOURCE_L3CODE || r->rid == RDT_RESOURCE_L2 || r->rid == RDT_RESOURCE_L2DATA || - r->rid == RDT_RESOURCE_L2CODE) - r->cbm_validate = cbm_validate_intel; - else if (r->rid == RDT_RESOURCE_MBA) { + r->rid == RDT_RESOURCE_L2CODE) { + r->cache.arch_has_sparse_bitmaps = false; + r->cache.arch_has_empty_bitmaps = false; + } else if (r->rid == RDT_RESOURCE_MBA) { r->msr_base = MSR_IA32_MBA_THRTL_BASE; r->msr_update = mba_wrmsr_intel; } @@ -940,9 +941,10 @@ static __init void rdt_init_res_defs_amd(void) r->rid == RDT_RESOURCE_L3CODE || r->rid == RDT_RESOURCE_L2 || r->rid == RDT_RESOURCE_L2DATA || - r->rid == RDT_RESOURCE_L2CODE) - r->cbm_validate = cbm_validate_amd; - else if (r->rid == RDT_RESOURCE_MBA) { + r->rid == RDT_RESOURCE_L2CODE) { + r->cache.arch_has_sparse_bitmaps = true; + r->cache.arch_has_empty_bitmaps = true; + } else if (r->rid == RDT_RESOURCE_MBA) { r->msr_base = MSR_IA32_MBA_BW_BASE; r->msr_update = mba_wrmsr_amd; } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index b0e24cb6f85c..c877642e8a14 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -76,12 +76,14 @@ int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, } /* - * Check whether a cache bit mask is valid. The SDM says: + * Check whether a cache bit mask is valid. + * For Intel the SDM says: * Please note that all (and only) contiguous '1' combinations * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). * Additionally Haswell requires at least two bits set. + * AMD allows non-contiguous bitmasks. */ -bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) +static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) { unsigned long first_bit, zero_bit, val; unsigned int cbm_len = r->cache.cbm_len; @@ -93,7 +95,8 @@ bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) return false; } - if (val == 0 || val > r->default_ctrl) { + if ((!r->cache.arch_has_empty_bitmaps && val == 0) || + val > r->default_ctrl) { rdt_last_cmd_puts("Mask out of range\n"); return false; } @@ -101,7 +104,9 @@ bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) first_bit = find_first_bit(&val, cbm_len); zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) { + /* Are non-contiguous bitmaps allowed? */ + if (!r->cache.arch_has_sparse_bitmaps && + (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); return false; } @@ -116,30 +121,6 @@ bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) return true; } -/* - * Check whether a cache bit mask is valid. 
AMD allows non-contiguous - * bitmasks - */ -bool cbm_validate_amd(char *buf, u32 *data, struct rdt_resource *r) -{ - unsigned long val; - int ret; - - ret = kstrtoul(buf, 16, &val); - if (ret) { - rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); - return false; - } - - if (val > r->default_ctrl) { - rdt_last_cmd_puts("Mask out of range\n"); - return false; - } - - *data = val; - return true; -} - /* * Read one cache bit mask (hex). Check that it is valid for the current * resource type. @@ -165,7 +146,7 @@ int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, return -EINVAL; } - if (!r->cbm_validate(data->buf, &cbm_val, r)) + if (!cbm_validate(data->buf, &cbm_val, r)) return -EINVAL; if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 3b018ea5c0df..2024d6e7ed41 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -359,6 +359,8 @@ struct msr_param { * in a cache bit mask * @shareable_bits: Bitmask of shareable resource with other * executing entities + * @arch_has_sparse_bitmaps: True if a bitmap like f00f is valid. + * @arch_has_empty_bitmaps: True if the '0' bitmap is valid. */ struct rdt_cache { unsigned int cbm_len; @@ -366,6 +368,8 @@ struct rdt_cache { unsigned int cbm_idx_mult; unsigned int cbm_idx_offset; unsigned int shareable_bits; + bool arch_has_sparse_bitmaps; + bool arch_has_empty_bitmaps; }; /** @@ -433,7 +437,6 @@ struct rdt_parse_data { * @cache: Cache allocation related data * @format_str: Per resource format string to show domain value * @parse_ctrlval: Per resource function pointer to parse control values - * @cbm_validate Cache bitmask validate function * @evt_list: List of monitoring events * @num_rmid: Number of RMIDs available * @mon_scale: cqm counter * mon_scale = occupancy in bytes @@ -460,7 +463,6 @@ struct rdt_resource { int (*parse_ctrlval)(struct rdt_parse_data *data, struct rdt_resource *r, struct rdt_domain *d); - bool (*cbm_validate)(char *buf, u32 *data, struct rdt_resource *r); struct list_head evt_list; int num_rmid; unsigned int mon_scale; @@ -603,8 +605,6 @@ void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); void cqm_handle_limbo(struct work_struct *work); bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); -bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r); -bool cbm_validate_amd(char *buf, u32 *data, struct rdt_resource *r); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ -- Gitee From 04f052983c4cba8539bacccff02892379c2a901b Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 8 Jul 2020 16:39:29 +0000 Subject: [PATCH 1254/2161] cacheinfo: Move resctrl's get_cache_id() to the cacheinfo header file commit 709c4362725abb5fa1e36fd94893a9b0d049df82 upstream. resctrl/core.c defines get_cache_id() for use in its cpu-hotplug callbacks. This gets the id attribute of the cache at the corresponding level of a CPU. Later rework means this private function needs to be shared. Move it to the header file. The name conflicts with a different definition in intel_cacheinfo.c, name it get_cpu_cacheinfo_id() to show its relation with get_cpu_cacheinfo(). Now this is visible on other architectures, check the id attribute has actually been set. 
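A brief usage sketch may help: once the helper lives in the header, any hotplug path can map a CPU to the id of its cache at a given level. The caller below is hypothetical (example_pick_domain_id() does not exist in the tree); only get_cpu_cacheinfo_id() comes from this patch, and error handling is trimmed.

/* Map @cpu to the id of its level-@cache_level cache, or -1 if the
 * firmware did not enumerate an id (the CACHE_ID check added above).
 */
static int example_pick_domain_id(int cpu, int cache_level)
{
	int id = get_cpu_cacheinfo_id(cpu, cache_level);

	if (id < 0)
		return -1;

	/* ... look up or allocate the resctrl domain keyed by @id ... */
	return id;
}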
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Babu Moger Reviewed-by: Reinette Chatre Link: https://lkml.kernel.org/r/20200708163929.2783-11-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 17 ++--------------- include/linux/cacheinfo.h | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index b39cdd638746..3d1462aca69c 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -349,19 +349,6 @@ static void rdt_get_cdp_l2_config(void) rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2CODE); } -static int get_cache_id(int cpu, int level) -{ - struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); - int i; - - for (i = 0; i < ci->num_leaves; i++) { - if (ci->info_list[i].level == level) - return ci->info_list[i].id; - } - - return -1; -} - static void mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) { @@ -559,7 +546,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) */ static void domain_add_cpu(int cpu, struct rdt_resource *r) { - int id = get_cache_id(cpu, r->cache_level); + int id = get_cpu_cacheinfo_id(cpu, r->cache_level); struct list_head *add_pos = NULL; struct rdt_domain *d; @@ -605,7 +592,7 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) static void domain_remove_cpu(int cpu, struct rdt_resource *r) { - int id = get_cache_id(cpu, r->cache_level); + int id = get_cpu_cacheinfo_id(cpu, r->cache_level); struct rdt_domain *d; d = rdt_find_domain(r, id, NULL); diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 46b92cd61d0c..4f72b47973c3 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -3,6 +3,7 @@ #define _LINUX_CACHEINFO_H #include +#include #include #include @@ -119,4 +120,24 @@ int acpi_find_last_cache_level(unsigned int cpu); const struct attribute_group *cache_get_priv_group(struct cacheinfo *this_leaf); +/* + * Get the id of the cache associated with @cpu at level @level. + * cpuhp lock must be held. + */ +static inline int get_cpu_cacheinfo_id(int cpu, int level) +{ + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); + int i; + + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == level) { + if (ci->info_list[i].attributes & CACHE_ID) + return ci->info_list[i].id; + return -1; + } + } + + return -1; +} + #endif /* _LINUX_CACHEINFO_H */ -- Gitee From 4682b192b0c57c2e6e7098f3bfb700aa69f656ef Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 24 Aug 2020 12:11:20 -0700 Subject: [PATCH 1255/2161] x86/resctrl: Enumerate per-thread MBA controls commit e48cb1a3fb9165009fe318e1db2fde10d303c1d3 upstream. Some systems support per-thread Memory Bandwidth Allocation (MBA) which applies a throttling delay value to each hardware thread instead of to a core. Per-thread MBA is enumerated by CPUID. No feature flag is shown in /proc/cpuinfo. User applications need to check a resctrl throttling mode info file to know if the feature is supported. 
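Because no flag is exported through /proc/cpuinfo, a user-space check reduces to reading the info file introduced by the next patch in this series. A minimal sketch, assuming resctrl is mounted at /sys/fs/resctrl and the MB resource is present:

#include <stdio.h>

int main(void)
{
	char mode[32] = "unknown";
	FILE *f = fopen("/sys/fs/resctrl/info/MB/thread_throttle_mode", "r");

	if (f) {
		(void)fscanf(f, "%31s", mode); /* "unknown" is kept on failure */
		fclose(f);
	}
	/* Prints "per-thread" on parts with per-thread MBA, "max" otherwise. */
	printf("MBA throttle mode: %s\n", mode);
	return 0;
}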
Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Babu Moger Reviewed-by: Reinette Chatre Link: https://lkml.kernel.org/r/1598296281-127595-2-git-send-email-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/cpuid-deps.c | 1 + arch/x86/kernel/cpu/scattered.c | 1 + 3 files changed, 3 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2e27817e087f..3772f818d378 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -286,6 +286,7 @@ #define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ +#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 4e2d4809e7b0..91ae604925f3 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -71,6 +71,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, + { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index adf9b71386ef..5cc4b5d43f45 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -35,6 +35,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, + { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, -- Gitee From d3205446a65ed89b65036677c51cb86b402c5435 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 24 Aug 2020 12:11:21 -0700 Subject: [PATCH 1256/2161] x86/resctrl: Enable user to view thread or core throttling mode commit 29b6bd41ee24f69a85666b9f68d500b382d408fd upstream. Early Intel hardware implementations of Memory Bandwidth Allocation (MBA) could only control bandwidth at the processor core level. This meant that when two processes with different bandwidth allocations ran simultaneously on the same core the hardware had to resolve this difference. It did so by applying the higher throttling value (lower bandwidth) to both processes. Newer implementations can apply different throttling values to each thread on a core. Introduce a new resctrl file, "thread_throttle_mode", on Intel systems that shows to the user how throttling values are allocated, per-core or per-thread. On systems that support per-core throttling, the file will display "max". On newer systems that support per-thread throttling, the file will display "per-thread". 
AMD confirmed in [1] that AMD bandwidth allocation is already at thread level but that the AMD implementation does not use a memory delay throttle mode. So to avoid confusion the thread throttling mode would be UNDEFINED on AMD systems and the "thread_throttle_mode" file will not be visible. Originally-by: Reinette Chatre Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Link: https://lkml.kernel.org/r/1598296281-127595-3-git-send-email-fenghua.yu@intel.com Link: [1] https://lore.kernel.org/lkml/18d277fd-6523-319c-d560-66b63ff606b8@amd.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/x86/resctrl_ui.rst | 18 ++++++++- arch/x86/kernel/cpu/resctrl/core.c | 6 +++ arch/x86/kernel/cpu/resctrl/internal.h | 19 +++++++++ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 53 +++++++++++++++++++++++++- 4 files changed, 93 insertions(+), 3 deletions(-) diff --git a/Documentation/x86/resctrl_ui.rst b/Documentation/x86/resctrl_ui.rst index 5368cedfb530..e59b7b93a9b4 100644 --- a/Documentation/x86/resctrl_ui.rst +++ b/Documentation/x86/resctrl_ui.rst @@ -138,6 +138,18 @@ with respect to allocation: non-linear. This field is purely informational only. +"thread_throttle_mode": + Indicator on Intel systems of how tasks running on threads + of a physical core are throttled in cases where they + request different memory bandwidth percentages: + + "max": + the smallest percentage is applied + to all threads + "per-thread": + bandwidth percentages are directly applied to + the threads running on the core + If RDT monitoring is available there will be an "L3_MON" directory with the following files: @@ -364,8 +376,10 @@ to the next control step available on the hardware. The bandwidth throttling is a core specific mechanism on some of Intel SKUs. Using a high bandwidth and a low bandwidth setting on two threads -sharing a core will result in both threads being throttled to use the -low bandwidth. The fact that Memory bandwidth allocation(MBA) is a core +sharing a core may result in both threads being throttled to use the +low bandwidth (see "thread_throttle_mode"). 
+ +The fact that Memory bandwidth allocation(MBA) may be a core specific mechanism where as memory bandwidth monitoring(MBM) is done at the package level may lead to confusion when users try to apply control via the MBA and then monitor the bandwidth to see if the controls are diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 3d1462aca69c..4274618b4222 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -273,6 +273,12 @@ static bool __get_mem_config_intel(struct rdt_resource *r) } r->data_width = 3; + if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA)) + r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD; + else + r->membw.throttle_mode = THREAD_THROTTLE_MAX; + thread_throttle_mode_init(); + r->alloc_capable = true; r->alloc_enabled = true; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 2024d6e7ed41..454b49533d51 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -372,11 +372,27 @@ struct rdt_cache { bool arch_has_empty_bitmaps; }; +/** + * enum membw_throttle_mode - System's memory bandwidth throttling mode + * @THREAD_THROTTLE_UNDEFINED: Not relevant to the system + * @THREAD_THROTTLE_MAX: Memory bandwidth is throttled at the core + * always using smallest bandwidth percentage + * assigned to threads, aka "max throttling" + * @THREAD_THROTTLE_PER_THREAD: Memory bandwidth is throttled at the thread + */ +enum membw_throttle_mode { + THREAD_THROTTLE_UNDEFINED = 0, + THREAD_THROTTLE_MAX, + THREAD_THROTTLE_PER_THREAD, +}; + /** * struct rdt_membw - Memory bandwidth allocation related data * @min_bw: Minimum memory bandwidth percentage user can request * @bw_gran: Granularity at which the memory bandwidth is allocated * @delay_linear: True if memory B/W delay is in linear scale + * @throttle_mode: Bandwidth throttling mode when threads request + * different memory bandwidths * @mba_sc: True if MBA software controller(mba_sc) is enabled * @mb_map: Mapping of memory B/W percentage to memory B/W delay */ @@ -384,8 +400,10 @@ struct rdt_membw { u32 min_bw; u32 bw_gran; u32 delay_linear; + enum membw_throttle_mode throttle_mode; bool mba_sc; u32 *mb_map; + }; static inline bool is_llc_occupancy_enabled(void) @@ -606,5 +624,6 @@ void cqm_handle_limbo(struct work_struct *work); bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); +void __init thread_throttle_mode_init(void); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 7403f3ea744a..8fc0c2126003 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1026,6 +1026,19 @@ static int max_threshold_occ_show(struct kernfs_open_file *of, return 0; } +static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD) + seq_puts(seq, "per-thread\n"); + else + seq_puts(seq, "max\n"); + + return 0; +} + static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -1522,6 +1535,17 @@ static struct rftype res_common_files[] = { .seq_show = rdt_delay_linear_show, .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, + /* + * Platform 
specific which (if any) capabilities are provided by + * thread_throttle_mode. Defer "fflags" initialization to platform + * discovery. + */ + { + .name = "thread_throttle_mode", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_thread_throttle_mode_show, + }, { .name = "max_threshold_occupancy", .mode = 0644, @@ -1592,7 +1616,7 @@ static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) lockdep_assert_held(&rdtgroup_mutex); for (rft = rfts; rft < rfts + len; rft++) { - if ((fflags & rft->fflags) == rft->fflags) { + if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { ret = rdtgroup_add_file(kn, rft); if (ret) goto error; @@ -1609,6 +1633,33 @@ static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) return ret; } +static struct rftype *rdtgroup_get_rftype_by_name(const char *name) +{ + struct rftype *rfts, *rft; + int len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + return rft; + } + + return NULL; +} + +void __init thread_throttle_mode_init(void) +{ + struct rftype *rft; + + rft = rdtgroup_get_rftype_by_name("thread_throttle_mode"); + if (!rft) + return; + + rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB; +} + /** * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file * @r: The resource group with which the file is associated. -- Gitee From c5772d34942391975a3031a8251f1ee8a07acb16 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 14 Oct 2020 00:49:27 +0000 Subject: [PATCH 1257/2161] x86/resctrl: Correct MBM total and local values commit 4868a61d498af3627c3b571aa48107a845001e51 upstream. Intel Memory Bandwidth Monitoring (MBM) counters may report system memory bandwidth incorrectly on some Intel processors. The errata SKX99 for Skylake server, BDF102 for Broadwell server, and the correction factor table are documented in Documentation/x86/resctrl.rst. Intel MBM counters track metrics according to the assigned Resource Monitor ID (RMID) for that logical core. The IA32_QM_CTR register (MSR 0xC8E) used to report these metrics, may report incorrect system bandwidth for certain RMID values. Due to the errata, system memory bandwidth may not match what is reported. To work around the errata, correct MBM total and local readings using a correction factor table. If rmid > rmid threshold, MBM total and local values should be multiplied by the correction factor. [ bp: Mark mbm_cf_table[] __initdata. 
] Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20201014004927.1839452-3-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 4 ++ arch/x86/kernel/cpu/resctrl/internal.h | 1 + arch/x86/kernel/cpu/resctrl/monitor.c | 82 +++++++++++++++++++++++++- 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 4274618b4222..c0a6ec72b9f6 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -887,6 +887,10 @@ static __init void __check_quirks_intel(void) set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); else set_rdt_options("!l3cat"); + fallthrough; + case INTEL_FAM6_BROADWELL_X: + intel_rdt_mbm_apply_quirk(); + break; } } diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 454b49533d51..861f59293b30 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -616,6 +616,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms); void mbm_handle_overflow(struct work_struct *work); +void __init intel_rdt_mbm_apply_quirk(void); bool is_mba_sc(struct rdt_resource *r); void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm); u32 delay_bw_map(unsigned long bw, struct rdt_resource *r); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index a98519a3a2e6..7ac31210e452 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -64,6 +64,69 @@ unsigned int rdt_mon_features; */ unsigned int resctrl_cqm_threshold; +#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) + +/* + * The correction factor table is documented in Documentation/x86/resctrl.rst. + * If rmid > rmid threshold, MBM total and local values should be multiplied + * by the correction factor. + * + * The original table is modified for better code: + * + * 1. The threshold 0 is changed to rmid count - 1 so don't do correction + * for the case. + * 2. MBM total and local correction table indexed by core counter which is + * equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27. + * 3. The correction factor is normalized to 2^20 (1048576) so it's faster + * to calculate corrected value by shifting: + * corrected_value = (original_value * correction_factor) >> 20 + */ +static const struct mbm_correction_factor_table { + u32 rmidthreshold; + u64 cf; +} mbm_cf_table[] __initdata = { + {7, CF(1.000000)}, + {15, CF(1.000000)}, + {15, CF(0.969650)}, + {31, CF(1.000000)}, + {31, CF(1.066667)}, + {31, CF(0.969650)}, + {47, CF(1.142857)}, + {63, CF(1.000000)}, + {63, CF(1.185115)}, + {63, CF(1.066553)}, + {79, CF(1.454545)}, + {95, CF(1.000000)}, + {95, CF(1.230769)}, + {95, CF(1.142857)}, + {95, CF(1.066667)}, + {127, CF(1.000000)}, + {127, CF(1.254863)}, + {127, CF(1.185255)}, + {151, CF(1.000000)}, + {127, CF(1.066667)}, + {167, CF(1.000000)}, + {159, CF(1.454334)}, + {183, CF(1.000000)}, + {127, CF(0.969744)}, + {191, CF(1.280246)}, + {191, CF(1.230921)}, + {215, CF(1.000000)}, + {191, CF(1.143118)}, +}; + +static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX; +static u64 mbm_cf __read_mostly; + +static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) +{ + /* Correct MBM value. 
*/ + if (rmid > mbm_cf_rmidthreshold) + val = (val * mbm_cf) >> 20; + + return val; +} + static inline struct rmid_entry *__rmid_entry(u32 rmid) { struct rmid_entry *entry; @@ -260,7 +323,8 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) m->chunks += chunks; m->prev_msr = tval; - rr->val += m->chunks; + rr->val += get_corrected_mbm_count(rmid, m->chunks); + return 0; } @@ -279,7 +343,7 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) return; chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width); - cur_bw = (chunks * r->mon_scale) >> 20; + cur_bw = (get_corrected_mbm_count(rmid, chunks) * r->mon_scale) >> 20; if (m->delta_comp) m->delta_bw = abs(cur_bw - m->prev_bw); @@ -642,3 +706,17 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) return 0; } + +void __init intel_rdt_mbm_apply_quirk(void) +{ + int cf_index; + + cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1; + if (cf_index >= ARRAY_SIZE(mbm_cf_table)) { + pr_info("No MBM correction factor available\n"); + return; + } + + mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; + mbm_cf = mbm_cf_table[cf_index].cf; +} -- Gitee From 153e13d424d127e9809108ae5df29b452db96159 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 10 Aug 2020 08:55:08 +0100 Subject: [PATCH 1258/2161] x86/resctrl: Fix spelling in user-visible warning messages commit 93921baa3f6ff77e57d7e772165aa7bd709b5387 upstream. Fix spelling mistake "Could't" -> "Couldn't" in user-visible warning messages. [ bp: Massage commit message; s/cpu/CPU/g ] Signed-off-by: Colin Ian King Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200810075508.46490-1-colin.king@canonical.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index c0a6ec72b9f6..bbd83035a88a 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -558,7 +558,7 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) d = rdt_find_domain(r, id, &add_pos); if (IS_ERR(d)) { - pr_warn("Could't find cache id for cpu %d\n", cpu); + pr_warn("Couldn't find cache id for CPU %d\n", cpu); return; } @@ -603,7 +603,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) d = rdt_find_domain(r, id, NULL); if (IS_ERR_OR_NULL(d)) { - pr_warn("Could't find cache id for cpu %d\n", cpu); + pr_warn("Couldn't find cache id for CPU %d\n", cpu); return; } -- Gitee From f93286505440b61dcbae7d12577f735284d74fa8 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Mon, 30 Nov 2020 09:57:20 -0600 Subject: [PATCH 1259/2161] x86/resctrl: Fix AMD L3 QOS CDP enable/disable commit fae3a13d2a3d49a89391889808428cf1e72afbd7 upstream. When the AMD QoS feature CDP (code and data prioritization) is enabled or disabled, the CDP bit in MSR 0000_0C81 is written on one of the CPUs in an L3 domain (core complex). That is not correct - the CDP bit needs to be updated on all the logical CPUs in the domain. This was not spelled out clearly in the spec earlier. The specification has been updated and the updated document, "AMD64 Technology Platform Quality of Service Extensions Publication # 56375 Revision: 1.02 Issue Date: October 2020" is available now. Refer the section: Code and Data Prioritization. Fix the issue by adding a new flag arch_has_per_cpu_cfg in rdt_cache data structure. 
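The effect of the new flag on the update path is easiest to see in isolation. The sketch below mirrors the set_cache_qos_cfg() hunk further down and is illustrative only (the example_* name is hypothetical): with per-CPU register scope every CPU of the domain is added to the update mask, otherwise one representative CPU per domain is enough.

static void example_build_qos_cfg_mask(struct rdt_resource *r,
				       struct rdt_domain *d,
				       struct cpumask *cpu_mask)
{
	int cpu;

	if (r->cache.arch_has_per_cpu_cfg) {
		/* QOS_CFG has CPU scope (AMD CDP): update every CPU. */
		for_each_cpu(cpu, &d->cpu_mask)
			cpumask_set_cpu(cpu, cpu_mask);
	} else {
		/* QOS_CFG has domain scope: any one CPU will do. */
		cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
	}
}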
The documentation can be obtained at: https://developer.amd.com/wp-content/resources/56375.pdf Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 [ bp: Massage commit message. ] Fixes: 4d05bf71f157 ("x86/resctrl: Introduce AMD QOS feature") Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Link: https://lkml.kernel.org/r/160675180380.15628.3309402017215002347.stgit@bmoger-ubuntu Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 4 ++++ arch/x86/kernel/cpu/resctrl/internal.h | 3 +++ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 9 +++++++-- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index bbd83035a88a..7ca6a483118b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -564,6 +564,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) if (d) { cpumask_set_cpu(cpu, &d->cpu_mask); + if (r->cache.arch_has_per_cpu_cfg) + rdt_domain_reconfigure_cdp(r); return; } @@ -921,6 +923,7 @@ static __init void rdt_init_res_defs_intel(void) r->rid == RDT_RESOURCE_L2CODE) { r->cache.arch_has_sparse_bitmaps = false; r->cache.arch_has_empty_bitmaps = false; + r->cache.arch_has_per_cpu_cfg = false; } else if (r->rid == RDT_RESOURCE_MBA) { r->msr_base = MSR_IA32_MBA_THRTL_BASE; r->msr_update = mba_wrmsr_intel; @@ -941,6 +944,7 @@ static __init void rdt_init_res_defs_amd(void) r->rid == RDT_RESOURCE_L2CODE) { r->cache.arch_has_sparse_bitmaps = true; r->cache.arch_has_empty_bitmaps = true; + r->cache.arch_has_per_cpu_cfg = true; } else if (r->rid == RDT_RESOURCE_MBA) { r->msr_base = MSR_IA32_MBA_BW_BASE; r->msr_update = mba_wrmsr_amd; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 861f59293b30..87c32a5d9ebf 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -361,6 +361,8 @@ struct msr_param { * executing entities * @arch_has_sparse_bitmaps: True if a bitmap like f00f is valid. * @arch_has_empty_bitmaps: True if the '0' bitmap is valid. + * @arch_has_per_cpu_cfg: True if QOS_CFG register for this cache + * level has CPU scope. */ struct rdt_cache { unsigned int cbm_len; @@ -370,6 +372,7 @@ struct rdt_cache { unsigned int shareable_bits; bool arch_has_sparse_bitmaps; bool arch_has_empty_bitmaps; + bool arch_has_per_cpu_cfg; }; /** diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 8fc0c2126003..dca0ea7952b0 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1890,8 +1890,13 @@ static int set_cache_qos_cfg(int level, bool enable) r_l = &rdt_resources_all[level]; list_for_each_entry(d, &r_l->domains, list) { - /* Pick one CPU from each domain instance to update MSR */ - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + if (r_l->cache.arch_has_per_cpu_cfg) + /* Pick all the CPUs in the domain instance */ + for_each_cpu(cpu, &d->cpu_mask) + cpumask_set_cpu(cpu, cpu_mask); + else + /* Pick one CPU from each domain instance to update MSR */ + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); } cpu = get_cpu(); /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. 
*/ -- Gitee From bdc6821c70e284672b6a5236760420bb2006879e Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 14 Oct 2020 00:49:26 +0000 Subject: [PATCH 1260/2161] Documentation/x86: Rename resctrl_ui.rst and add two errata to the file commit d1b22e36e3188f47dbf8aaef54d9e79df8e91f4c upstream. Intel Memory Bandwidth Monitoring (MBM) counters may report system memory bandwidth incorrectly on some Intel processors. This is documented in erratum SKX99, erratum BDF102 and in the RDT reference manual, see Documentation/x86/index.rst. To work around the errata, MBM total and local readings are corrected using a correction factor table. Since the correction factor table is not publicly documented anywhere, document the table and the errata in Documentation/x86/resctrl.rst for future reference. [ bp: Move web links to the doc, massage. ] Suggested-by: Borislav Petkov Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20201014004927.1839452-2-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/x86/index.rst | 2 +- .../x86/{resctrl_ui.rst => resctrl.rst} | 93 +++++++++++++++++++ 2 files changed, 94 insertions(+), 1 deletion(-) rename Documentation/x86/{resctrl_ui.rst => resctrl.rst} (90%) diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst index d86094a3c14e..8c748ee1726b 100644 --- a/Documentation/x86/index.rst +++ b/Documentation/x86/index.rst @@ -26,7 +26,7 @@ x86-specific Documentation pti mds microcode - resctrl_ui + resctrl tsx_async_abort usb-legacy-support i386/index diff --git a/Documentation/x86/resctrl_ui.rst b/Documentation/x86/resctrl.rst similarity index 90% rename from Documentation/x86/resctrl_ui.rst rename to Documentation/x86/resctrl.rst index e59b7b93a9b4..71a531061e4e 100644 --- a/Documentation/x86/resctrl_ui.rst +++ b/Documentation/x86/resctrl.rst @@ -1209,3 +1209,96 @@ View the llc occupancy snapshot:: # cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy 11234000 + +Intel RDT Errata +================ + +Intel MBM Counters May Report System Memory Bandwidth Incorrectly +----------------------------------------------------------------- + +Errata SKX99 for Skylake server and BDF102 for Broadwell server. + +Problem: Intel Memory Bandwidth Monitoring (MBM) counters track metrics +according to the assigned Resource Monitor ID (RMID) for that logical +core. The IA32_QM_CTR register (MSR 0xC8E), used to report these +metrics, may report incorrect system bandwidth for certain RMID values. + +Implication: Due to the errata, system memory bandwidth may not match +what is reported.
+ +Workaround: MBM total and local readings are corrected according to the +following correction factor table: + ++---------------+---------------+---------------+-----------------+ +|core count |rmid count |rmid threshold |correction factor| ++---------------+---------------+---------------+-----------------+ +|1 |8 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|2 |16 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|3 |24 |15 |0.969650 | ++---------------+---------------+---------------+-----------------+ +|4 |32 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|6 |48 |31 |0.969650 | ++---------------+---------------+---------------+-----------------+ +|7 |56 |47 |1.142857 | ++---------------+---------------+---------------+-----------------+ +|8 |64 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|9 |72 |63 |1.185115 | ++---------------+---------------+---------------+-----------------+ +|10 |80 |63 |1.066553 | ++---------------+---------------+---------------+-----------------+ +|11 |88 |79 |1.454545 | ++---------------+---------------+---------------+-----------------+ +|12 |96 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|13 |104 |95 |1.230769 | ++---------------+---------------+---------------+-----------------+ +|14 |112 |95 |1.142857 | ++---------------+---------------+---------------+-----------------+ +|15 |120 |95 |1.066667 | ++---------------+---------------+---------------+-----------------+ +|16 |128 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|17 |136 |127 |1.254863 | ++---------------+---------------+---------------+-----------------+ +|18 |144 |127 |1.185255 | ++---------------+---------------+---------------+-----------------+ +|19 |152 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|20 |160 |127 |1.066667 | ++---------------+---------------+---------------+-----------------+ +|21 |168 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|22 |176 |159 |1.454334 | ++---------------+---------------+---------------+-----------------+ +|23 |184 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|24 |192 |127 |0.969744 | ++---------------+---------------+---------------+-----------------+ +|25 |200 |191 |1.280246 | ++---------------+---------------+---------------+-----------------+ +|26 |208 |191 |1.230921 | ++---------------+---------------+---------------+-----------------+ +|27 |216 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|28 |224 |191 |1.143118 | ++---------------+---------------+---------------+-----------------+ + +If rmid > rmid threshold, MBM total and local values should be multiplied +by the correction factor. + +See: + +1. Erratum SKX99 in Intel Xeon Processor Scalable Family Specification Update: +http://web.archive.org/web/20200716124958/https://www.intel.com/content/www/us/en/processors/xeon/scalable/xeon-scalable-spec-update.html + +2. Erratum BDF102 in Intel Xeon E5-2600 v4 Processor Product Family Specification Update: +http://web.archive.org/web/20191125200531/https://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/xeon-e5-v4-spec-update.pdf + +3. 
The errata in Intel Resource Director Technology (Intel RDT) on 2nd Generation Intel Xeon Scalable Processors Reference Manual: +https://software.intel.com/content/www/us/en/develop/articles/intel-resource-director-technology-rdt-reference-manual.html + +for further information. -- Gitee From 0db4dca2837dc5ab4301cf5a7ac94fc2e3662776 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Wed, 11 Nov 2020 00:02:28 +0100 Subject: [PATCH 1261/2161] x86/resctrl: Constify kernfs_ops commit 2002d2951398317d0f46e64ae6d8dd58ed541c6d upstream. The only usage of the kf_ops field in the rftype struct is to pass it as argument to __kernfs_create_file(), which accepts a pointer to const. Make it a pointer to const. This makes it possible to make rdtgroup_kf_single_ops and kf_mondata_ops const, which allows the compiler to put them in read-only memory. Signed-off-by: Rikard Falkeborn Signed-off-by: Borislav Petkov Acked-by: Reinette Chatre Link: https://lkml.kernel.org/r/20201110230228.801785-1-rikard.falkeborn@gmail.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/internal.h | 2 +- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 87c32a5d9ebf..9f71461e151c 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -265,7 +265,7 @@ void __exit rdtgroup_exit(void); struct rftype { char *name; umode_t mode; - struct kernfs_ops *kf_ops; + const struct kernfs_ops *kf_ops; unsigned long flags; unsigned long fflags; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index dca0ea7952b0..5b721b163542 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -240,13 +240,13 @@ static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, return -EINVAL; } -static struct kernfs_ops rdtgroup_kf_single_ops = { +static const struct kernfs_ops rdtgroup_kf_single_ops = { .atomic_write_len = PAGE_SIZE, .write = rdtgroup_file_write, .seq_show = rdtgroup_seqfile_show, }; -static struct kernfs_ops kf_mondata_ops = { +static const struct kernfs_ops kf_mondata_ops = { .atomic_write_len = PAGE_SIZE, .seq_show = rdtgroup_mondata_show, }; -- Gitee From 73d3b05a64fad114d5eeebc0cd49909aaf759041 Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Tue, 1 Dec 2020 02:06:58 +0800 Subject: [PATCH 1262/2161] x86/resctrl: Clean up unused function parameter in rmdir path commit 32ada3b9e04c6f6d4916967bd8bbe2450ad5bc5e upstream. Commit fd8d9db3559a ("x86/resctrl: Remove superfluous kernfs_get() calls to prevent refcount leak") removed superfluous kernfs_get() calls in rdtgroup_ctrl_remove() and rdtgroup_rmdir_ctrl(). That change resulted in an unused function parameter to these two functions. Clean up the unused function parameter in rdtgroup_ctrl_remove(), rdtgroup_rmdir_mon() and their callers rdtgroup_rmdir_ctrl() and rdtgroup_rmdir(). 
Signed-off-by: Xiaochen Shen Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Link: https://lkml.kernel.org/r/1606759618-13181-1-git-send-email-xiaochen.shen@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 5b721b163542..2ab0fd017313 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -3009,8 +3009,7 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, return -EPERM; } -static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, - cpumask_var_t tmpmask) +static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) { struct rdtgroup *prdtgrp = rdtgrp->mon.parent; int cpu; @@ -3042,8 +3041,7 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, return 0; } -static int rdtgroup_ctrl_remove(struct kernfs_node *kn, - struct rdtgroup *rdtgrp) +static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) { rdtgrp->flags = RDT_DELETED; list_del(&rdtgrp->rdtgroup_list); @@ -3052,8 +3050,7 @@ static int rdtgroup_ctrl_remove(struct kernfs_node *kn, return 0; } -static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, - cpumask_var_t tmpmask) +static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) { int cpu; @@ -3080,7 +3077,7 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, closid_free(rdtgrp->closid); free_rmid(rdtgrp->mon.rmid); - rdtgroup_ctrl_remove(kn, rdtgrp); + rdtgroup_ctrl_remove(rdtgrp); /* * Free all the child monitor group rmids. @@ -3117,13 +3114,13 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) rdtgrp != &rdtgroup_default) { if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - ret = rdtgroup_ctrl_remove(kn, rdtgrp); + ret = rdtgroup_ctrl_remove(rdtgrp); } else { - ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); + ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); } } else if (rdtgrp->type == RDTMON_GROUP && is_mon_groups(parent_kn, kn->name)) { - ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); + ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); } else { ret = -EPERM; } -- Gitee From 3c55c488d158373dab51076a3fef5ca528479c21 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:14 +0000 Subject: [PATCH 1263/2161] x86/resctrl: Split struct rdt_resource commit 63c8b1231929b8aa80abc753c1c91b6b49e2c0b0 upstream. resctrl is the defacto Linux ABI for SoC resource partitioning features. To support it on another architecture, it needs to be abstracted from the features provided by Intel RDT and AMD PQoS, and moved to /fs/. struct rdt_resource contains a mix of architecture private details and properties of the filesystem interface user-space uses. Start by splitting struct rdt_resource, into an architecture private 'hw' struct, which contains the common resctrl structure that would be used by any architecture. The foreach helpers are most commonly used by the filesystem code, and should return the common resctrl structure. for_each_rdt_resource() is changed to walk the common structure in its parent arch private structure. Move as much of the structure as possible into the common structure in the core code's header file. 
The x86 hardware accessors remain part of the architecture private code, as do num_closid, mon_scale and mbm_width. mon_scale and mbm_width are used to detect overflow of the hardware counters, and convert them from their native size to bytes. Any cross-architecture abstraction should be in terms of bytes, making these properties private. The hardware's num_closid is kept in the private structure to force the filesystem code to use a helper to access it. MPAM would return a single value for the system, regardless of the resource. Using the helper prevents this field from being confused with the version of num_closid that is being exposed to user-space (added in a later patch). After this split, filesystem code touching a 'hw' struct indicates where an abstraction is needed. Splitting this structure only moves types around, and should not lead to any change in behaviour. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-2-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 257 +++++++++++++--------- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 14 +- arch/x86/kernel/cpu/resctrl/internal.h | 144 +++--------- arch/x86/kernel/cpu/resctrl/monitor.c | 32 +-- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 4 +- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 68 +++--- include/linux/resctrl.h | 110 +++++++++ 7 files changed, 359 insertions(+), 270 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 7ca6a483118b..80d1657c773a 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -57,120 +57,134 @@ static void mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); -#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) +#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.domains) -struct rdt_resource rdt_resources_all[] = { +struct rdt_hw_resource rdt_resources_all[] = { [RDT_RESOURCE_L3] = { - .rid = RDT_RESOURCE_L3, - .name = "L3", - .domains = domain_init(RDT_RESOURCE_L3), + .r_resctrl = { + .rid = RDT_RESOURCE_L3, + .name = "L3", + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 1, + .cbm_idx_offset = 0, + }, + .domains = domain_init(RDT_RESOURCE_L3), + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, + }, .msr_base = MSR_IA32_L3_CBM_BASE, .msr_update = cat_wrmsr, - .cache_level = 3, - .cache = { - .min_cbm_bits = 1, - .cbm_idx_mult = 1, - .cbm_idx_offset = 0, - }, - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, }, [RDT_RESOURCE_L3DATA] = { - .rid = RDT_RESOURCE_L3DATA, - .name = "L3DATA", - .domains = domain_init(RDT_RESOURCE_L3DATA), + .r_resctrl = { + .rid = RDT_RESOURCE_L3DATA, + .name = "L3DATA", + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 0, + }, + .domains = domain_init(RDT_RESOURCE_L3DATA), + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, + }, .msr_base = MSR_IA32_L3_CBM_BASE, .msr_update = cat_wrmsr, - .cache_level = 3, - .cache = { - .min_cbm_bits = 1, - .cbm_idx_mult = 2, - .cbm_idx_offset = 0, - }, - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, }, [RDT_RESOURCE_L3CODE] = { - .rid = RDT_RESOURCE_L3CODE, - .name = "L3CODE", - .domains = 
domain_init(RDT_RESOURCE_L3CODE), + .r_resctrl = { + .rid = RDT_RESOURCE_L3CODE, + .name = "L3CODE", + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 1, + }, + .domains = domain_init(RDT_RESOURCE_L3CODE), + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, + }, .msr_base = MSR_IA32_L3_CBM_BASE, .msr_update = cat_wrmsr, - .cache_level = 3, - .cache = { - .min_cbm_bits = 1, - .cbm_idx_mult = 2, - .cbm_idx_offset = 1, - }, - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, }, [RDT_RESOURCE_L2] = { - .rid = RDT_RESOURCE_L2, - .name = "L2", - .domains = domain_init(RDT_RESOURCE_L2), + .r_resctrl = { + .rid = RDT_RESOURCE_L2, + .name = "L2", + .cache_level = 2, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 1, + .cbm_idx_offset = 0, + }, + .domains = domain_init(RDT_RESOURCE_L2), + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, + }, .msr_base = MSR_IA32_L2_CBM_BASE, .msr_update = cat_wrmsr, - .cache_level = 2, - .cache = { - .min_cbm_bits = 1, - .cbm_idx_mult = 1, - .cbm_idx_offset = 0, - }, - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, }, [RDT_RESOURCE_L2DATA] = { - .rid = RDT_RESOURCE_L2DATA, - .name = "L2DATA", - .domains = domain_init(RDT_RESOURCE_L2DATA), + .r_resctrl = { + .rid = RDT_RESOURCE_L2DATA, + .name = "L2DATA", + .cache_level = 2, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 0, + }, + .domains = domain_init(RDT_RESOURCE_L2DATA), + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, + }, .msr_base = MSR_IA32_L2_CBM_BASE, .msr_update = cat_wrmsr, - .cache_level = 2, - .cache = { - .min_cbm_bits = 1, - .cbm_idx_mult = 2, - .cbm_idx_offset = 0, - }, - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, }, [RDT_RESOURCE_L2CODE] = { - .rid = RDT_RESOURCE_L2CODE, - .name = "L2CODE", - .domains = domain_init(RDT_RESOURCE_L2CODE), + .r_resctrl = { + .rid = RDT_RESOURCE_L2CODE, + .name = "L2CODE", + .cache_level = 2, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 1, + }, + .domains = domain_init(RDT_RESOURCE_L2CODE), + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, + }, .msr_base = MSR_IA32_L2_CBM_BASE, .msr_update = cat_wrmsr, - .cache_level = 2, - .cache = { - .min_cbm_bits = 1, - .cbm_idx_mult = 2, - .cbm_idx_offset = 1, - }, - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, }, [RDT_RESOURCE_MBA] = { - .rid = RDT_RESOURCE_MBA, - .name = "MB", - .domains = domain_init(RDT_RESOURCE_MBA), - .cache_level = 3, - .parse_ctrlval = parse_bw, - .format_str = "%d=%*u", - .fflags = RFTYPE_RES_MB, + .r_resctrl = { + .rid = RDT_RESOURCE_MBA, + .name = "MB", + .cache_level = 3, + .domains = domain_init(RDT_RESOURCE_MBA), + .parse_ctrlval = parse_bw, + .format_str = "%d=%*u", + .fflags = RFTYPE_RES_MB, + }, }, }; @@ -199,7 +213,8 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) */ static inline void cache_alloc_hsw_probe(void) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3]; + struct rdt_resource *r = &hw_res->r_resctrl; u32 l, h, max_cbm = BIT_MASK(20) - 1; if (wrmsr_safe(MSR_IA32_L3_CBM_BASE, max_cbm, 0)) @@ -211,7 +226,7 @@ static inline void cache_alloc_hsw_probe(void) if (l != max_cbm) 
return; - r->num_closid = 4; + hw_res->num_closid = 4; r->default_ctrl = max_cbm; r->cache.cbm_len = 20; r->cache.shareable_bits = 0xc0000; @@ -225,7 +240,7 @@ static inline void cache_alloc_hsw_probe(void) bool is_mba_sc(struct rdt_resource *r) { if (!r) - return rdt_resources_all[RDT_RESOURCE_MBA].membw.mba_sc; + return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.mba_sc; return r->membw.mba_sc; } @@ -253,12 +268,13 @@ static inline bool rdt_get_mb_table(struct rdt_resource *r) static bool __get_mem_config_intel(struct rdt_resource *r) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_3_eax eax; union cpuid_0x10_x_edx edx; u32 ebx, ecx, max_delay; cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); - r->num_closid = edx.split.cos_max + 1; + hw_res->num_closid = edx.split.cos_max + 1; max_delay = eax.split.max_delay + 1; r->default_ctrl = MAX_MBA_BW; r->membw.arch_needs_linear = true; @@ -287,12 +303,13 @@ static bool __get_mem_config_intel(struct rdt_resource *r) static bool __rdt_get_mem_config_amd(struct rdt_resource *r) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_3_eax eax; union cpuid_0x10_x_edx edx; u32 ebx, ecx; cpuid_count(0x80000020, 1, &eax.full, &ebx, &ecx, &edx.full); - r->num_closid = edx.split.cos_max + 1; + hw_res->num_closid = edx.split.cos_max + 1; r->default_ctrl = MAX_MBA_BW_AMD; /* AMD does not use delay */ @@ -311,12 +328,13 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_1_eax eax; union cpuid_0x10_x_edx edx; u32 ebx, ecx; cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); - r->num_closid = edx.split.cos_max + 1; + hw_res->num_closid = edx.split.cos_max + 1; r->cache.cbm_len = eax.split.cbm_len + 1; r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; r->cache.shareable_bits = ebx & r->default_ctrl; @@ -327,10 +345,12 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) static void rdt_get_cdp_config(int level, int type) { - struct rdt_resource *r_l = &rdt_resources_all[level]; - struct rdt_resource *r = &rdt_resources_all[type]; + struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl; + struct rdt_hw_resource *hw_res_l = resctrl_to_arch_res(r_l); + struct rdt_resource *r = &rdt_resources_all[type].r_resctrl; + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - r->num_closid = r_l->num_closid / 2; + hw_res->num_closid = hw_res_l->num_closid / 2; r->cache.cbm_len = r_l->cache.cbm_len; r->default_ctrl = r_l->default_ctrl; r->cache.shareable_bits = r_l->cache.shareable_bits; @@ -359,9 +379,10 @@ static void mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) { unsigned int i; + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); for (i = m->low; i < m->high; i++) - wrmsrl(r->msr_base + i, d->ctrl_val[i]); + wrmsrl(hw_res->msr_base + i, d->ctrl_val[i]); } /* @@ -383,19 +404,21 @@ mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) { unsigned int i; + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); /* Write the delay values for mba. 
*/ for (i = m->low; i < m->high; i++) - wrmsrl(r->msr_base + i, delay_bw_map(d->ctrl_val[i], r)); + wrmsrl(hw_res->msr_base + i, delay_bw_map(d->ctrl_val[i], r)); } static void cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) { unsigned int i; + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); for (i = m->low; i < m->high; i++) - wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); + wrmsrl(hw_res->msr_base + cbm_idx(r, i), d->ctrl_val[i]); } struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) @@ -414,13 +437,14 @@ struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) void rdt_ctrl_update(void *arg) { struct msr_param *m = arg; + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); struct rdt_resource *r = m->res; int cpu = smp_processor_id(); struct rdt_domain *d; d = get_domain_from_cpu(cpu, r); if (d) { - r->msr_update(d, m, r); + hw_res->msr_update(d, m, r); return; } pr_warn_once("cpu %d not found in any domain for resource %s\n", @@ -462,6 +486,7 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); int i; /* @@ -470,7 +495,7 @@ void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) * For Memory Allocation: Set b/w requested to 100% * and the bandwidth in MBps to U32_MAX */ - for (i = 0; i < r->num_closid; i++, dc++, dm++) { + for (i = 0; i < hw_res->num_closid; i++, dc++, dm++) { *dc = r->default_ctrl; *dm = MBA_MAX_MBPS; } @@ -478,14 +503,15 @@ void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct msr_param m; u32 *dc, *dm; - dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL); + dc = kmalloc_array(hw_res->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL); if (!dc) return -ENOMEM; - dm = kmalloc_array(r->num_closid, sizeof(*d->mbps_val), GFP_KERNEL); + dm = kmalloc_array(hw_res->num_closid, sizeof(*d->mbps_val), GFP_KERNEL); if (!dm) { kfree(dc); return -ENOMEM; @@ -496,8 +522,8 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) setup_default_ctrlval(r, dc, dm); m.low = 0; - m.high = r->num_closid; - r->msr_update(d, &m, r); + m.high = hw_res->num_closid; + hw_res->msr_update(d, &m, r); return 0; } @@ -649,7 +675,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) return; } - if (r == &rdt_resources_all[RDT_RESOURCE_L3]) { + if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) { if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { cancel_delayed_work(&d->mbm_over); mbm_setup_overflow_handler(d, 0); @@ -821,19 +847,22 @@ static bool __init rdt_cpu_has(int flag) static __init bool get_mem_config(void) { + struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA]; + if (!rdt_cpu_has(X86_FEATURE_MBA)) return false; if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - return __get_mem_config_intel(&rdt_resources_all[RDT_RESOURCE_MBA]); + return __get_mem_config_intel(&hw_res->r_resctrl); else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return __rdt_get_mem_config_amd(&rdt_resources_all[RDT_RESOURCE_MBA]); + return __rdt_get_mem_config_amd(&hw_res->r_resctrl); return false; } static __init bool get_rdt_alloc_resources(void) { + struct rdt_resource *r; bool ret = false; if (rdt_alloc_capable) @@ -843,14 +872,16 @@ 
static __init bool get_rdt_alloc_resources(void) return false; if (rdt_cpu_has(X86_FEATURE_CAT_L3)) { - rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]); + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + rdt_get_cache_alloc_cfg(1, r); if (rdt_cpu_has(X86_FEATURE_CDP_L3)) rdt_get_cdp_l3_config(); ret = true; } if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { /* CPUID 0x10.2 fields are same format at 0x10.1 */ - rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]); + r = &rdt_resources_all[RDT_RESOURCE_L2].r_resctrl; + rdt_get_cache_alloc_cfg(2, r); if (rdt_cpu_has(X86_FEATURE_CDP_L2)) rdt_get_cdp_l2_config(); ret = true; @@ -864,6 +895,8 @@ static __init bool get_rdt_alloc_resources(void) static __init bool get_rdt_mon_resources(void) { + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) @@ -874,7 +907,7 @@ static __init bool get_rdt_mon_resources(void) if (!rdt_mon_features) return false; - return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]); + return !rdt_get_mon_l3_config(r); } static __init void __check_quirks_intel(void) @@ -912,9 +945,12 @@ static __init bool get_rdt_resources(void) static __init void rdt_init_res_defs_intel(void) { + struct rdt_hw_resource *hw_res; struct rdt_resource *r; for_each_rdt_resource(r) { + hw_res = resctrl_to_arch_res(r); + if (r->rid == RDT_RESOURCE_L3 || r->rid == RDT_RESOURCE_L3DATA || r->rid == RDT_RESOURCE_L3CODE || @@ -925,17 +961,20 @@ static __init void rdt_init_res_defs_intel(void) r->cache.arch_has_empty_bitmaps = false; r->cache.arch_has_per_cpu_cfg = false; } else if (r->rid == RDT_RESOURCE_MBA) { - r->msr_base = MSR_IA32_MBA_THRTL_BASE; - r->msr_update = mba_wrmsr_intel; + hw_res->msr_base = MSR_IA32_MBA_THRTL_BASE; + hw_res->msr_update = mba_wrmsr_intel; } } } static __init void rdt_init_res_defs_amd(void) { + struct rdt_hw_resource *hw_res; struct rdt_resource *r; for_each_rdt_resource(r) { + hw_res = resctrl_to_arch_res(r); + if (r->rid == RDT_RESOURCE_L3 || r->rid == RDT_RESOURCE_L3DATA || r->rid == RDT_RESOURCE_L3CODE || @@ -946,8 +985,8 @@ static __init void rdt_init_res_defs_amd(void) r->cache.arch_has_empty_bitmaps = true; r->cache.arch_has_per_cpu_cfg = true; } else if (r->rid == RDT_RESOURCE_MBA) { - r->msr_base = MSR_IA32_MBA_BW_BASE; - r->msr_update = mba_wrmsr_amd; + hw_res->msr_base = MSR_IA32_MBA_BW_BASE; + hw_res->msr_update = mba_wrmsr_amd; } } } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index c877642e8a14..3f0c33d5b658 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -284,10 +284,12 @@ int update_domains(struct rdt_resource *r, int closid) static int rdtgroup_parse_resource(char *resname, char *tok, struct rdtgroup *rdtgrp) { + struct rdt_hw_resource *hw_res; struct rdt_resource *r; for_each_alloc_enabled_rdt_resource(r) { - if (!strcmp(resname, r->name) && rdtgrp->closid < r->num_closid) + hw_res = resctrl_to_arch_res(r); + if (!strcmp(resname, r->name) && rdtgrp->closid < hw_res->num_closid) return parse_line(tok, r, rdtgrp); } rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); @@ -394,6 +396,7 @@ static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { + struct rdt_hw_resource *hw_res; 
struct rdtgroup *rdtgrp; struct rdt_resource *r; int ret = 0; @@ -418,7 +421,8 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, } else { closid = rdtgrp->closid; for_each_alloc_enabled_rdt_resource(r) { - if (closid < r->num_closid) + hw_res = resctrl_to_arch_res(r); + if (closid < hw_res->num_closid) show_doms(s, r, closid); } } @@ -449,6 +453,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, int rdtgroup_mondata_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; + struct rdt_hw_resource *hw_res; u32 resid, evtid, domid; struct rdtgroup *rdtgrp; struct rdt_resource *r; @@ -468,7 +473,8 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) domid = md.u.domid; evtid = md.u.evtid; - r = &rdt_resources_all[resid]; + hw_res = &rdt_resources_all[resid]; + r = &hw_res->r_resctrl; d = rdt_find_domain(r, domid, NULL); if (IS_ERR_OR_NULL(d)) { ret = -ENOENT; @@ -482,7 +488,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) else if (rr.val & RMID_VAL_UNAVAIL) seq_puts(m, "Unavailable\n"); else - seq_printf(m, "%llu\n", rr.val * r->mon_scale); + seq_printf(m, "%llu\n", rr.val * hw_res->mon_scale); out: rdtgroup_kn_unlock(of->kn); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 9f71461e151c..9453b6cc83d9 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_RESCTRL_INTERNAL_H #define _ASM_X86_RESCTRL_INTERNAL_H +#include #include #include #include @@ -349,66 +350,6 @@ struct msr_param { int high; }; -/** - * struct rdt_cache - Cache allocation related data - * @cbm_len: Length of the cache bit mask - * @min_cbm_bits: Minimum number of consecutive bits to be set - * @cbm_idx_mult: Multiplier of CBM index - * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: - * closid * cbm_idx_multi + cbm_idx_offset - * in a cache bit mask - * @shareable_bits: Bitmask of shareable resource with other - * executing entities - * @arch_has_sparse_bitmaps: True if a bitmap like f00f is valid. - * @arch_has_empty_bitmaps: True if the '0' bitmap is valid. - * @arch_has_per_cpu_cfg: True if QOS_CFG register for this cache - * level has CPU scope. 
- */ -struct rdt_cache { - unsigned int cbm_len; - unsigned int min_cbm_bits; - unsigned int cbm_idx_mult; - unsigned int cbm_idx_offset; - unsigned int shareable_bits; - bool arch_has_sparse_bitmaps; - bool arch_has_empty_bitmaps; - bool arch_has_per_cpu_cfg; -}; - -/** - * enum membw_throttle_mode - System's memory bandwidth throttling mode - * @THREAD_THROTTLE_UNDEFINED: Not relevant to the system - * @THREAD_THROTTLE_MAX: Memory bandwidth is throttled at the core - * always using smallest bandwidth percentage - * assigned to threads, aka "max throttling" - * @THREAD_THROTTLE_PER_THREAD: Memory bandwidth is throttled at the thread - */ -enum membw_throttle_mode { - THREAD_THROTTLE_UNDEFINED = 0, - THREAD_THROTTLE_MAX, - THREAD_THROTTLE_PER_THREAD, -}; - -/** - * struct rdt_membw - Memory bandwidth allocation related data - * @min_bw: Minimum memory bandwidth percentage user can request - * @bw_gran: Granularity at which the memory bandwidth is allocated - * @delay_linear: True if memory B/W delay is in linear scale - * @throttle_mode: Bandwidth throttling mode when threads request - * different memory bandwidths - * @mba_sc: True if MBA software controller(mba_sc) is enabled - * @mb_map: Mapping of memory B/W percentage to memory B/W delay - */ -struct rdt_membw { - u32 min_bw; - u32 bw_gran; - u32 delay_linear; - enum membw_throttle_mode throttle_mode; - bool mba_sc; - u32 *mb_map; - -}; - static inline bool is_llc_occupancy_enabled(void) { return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); @@ -441,56 +382,29 @@ struct rdt_parse_data { }; /** - * struct rdt_resource - attributes of an RDT resource - * @rid: The index of the resource - * @alloc_enabled: Is allocation enabled on this machine - * @mon_enabled: Is monitoring enabled for this feature - * @alloc_capable: Is allocation available on this machine - * @mon_capable: Is monitor feature available on this machine - * @name: Name to use in "schemata" file - * @num_closid: Number of CLOSIDs available - * @cache_level: Which cache level defines scope of this resource - * @default_ctrl: Specifies default cache cbm or memory B/W percent. + * struct rdt_hw_resource - arch private attributes of a resctrl resource + * @r_resctrl: Attributes of the resource used directly by resctrl. + * @num_closid: Maximum number of closid this hardware can support. * @msr_base: Base MSR address for CBMs * @msr_update: Function pointer to update QOS MSRs - * @data_width: Character width of data when displaying - * @domains: All domains for this resource - * @cache: Cache allocation related data - * @format_str: Per resource format string to show domain value - * @parse_ctrlval: Per resource function pointer to parse control values - * @evt_list: List of monitoring events - * @num_rmid: Number of RMIDs available * @mon_scale: cqm counter * mon_scale = occupancy in bytes - * @fflags: flags to choose base and info files + * @mbm_width: Monitor width, to detect and correct for overflow. 
*/ -struct rdt_resource { - int rid; - bool alloc_enabled; - bool mon_enabled; - bool alloc_capable; - bool mon_capable; - char *name; +struct rdt_hw_resource { + struct rdt_resource r_resctrl; int num_closid; - int cache_level; - u32 default_ctrl; unsigned int msr_base; void (*msr_update) (struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); - int data_width; - struct list_head domains; - struct rdt_cache cache; - struct rdt_membw membw; - const char *format_str; - int (*parse_ctrlval)(struct rdt_parse_data *data, - struct rdt_resource *r, - struct rdt_domain *d); - struct list_head evt_list; - int num_rmid; unsigned int mon_scale; unsigned int mbm_width; - unsigned long fflags; }; +static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r) +{ + return container_of(r, struct rdt_hw_resource, r_resctrl); +} + int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, struct rdt_domain *d); int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, @@ -498,7 +412,7 @@ int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, extern struct mutex rdtgroup_mutex; -extern struct rdt_resource rdt_resources_all[]; +extern struct rdt_hw_resource rdt_resources_all[]; extern struct rdtgroup rdtgroup_default; DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); @@ -517,33 +431,41 @@ enum { RDT_NUM_RESOURCES, }; +static inline struct rdt_resource *resctrl_inc(struct rdt_resource *res) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(res); + + hw_res++; + return &hw_res->r_resctrl; +} + +/* + * To return the common struct rdt_resource, which is contained in struct + * rdt_hw_resource, walk the resctrl member of struct rdt_hw_resource. + */ #define for_each_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) + for (r = &rdt_resources_all[0].r_resctrl; \ + r <= &rdt_resources_all[RDT_NUM_RESOURCES - 1].r_resctrl; \ + r = resctrl_inc(r)) #define for_each_capable_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ + for_each_rdt_resource(r) \ if (r->alloc_capable || r->mon_capable) #define for_each_alloc_capable_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ + for_each_rdt_resource(r) \ if (r->alloc_capable) #define for_each_mon_capable_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ + for_each_rdt_resource(r) \ if (r->mon_capable) #define for_each_alloc_enabled_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ + for_each_rdt_resource(r) \ if (r->alloc_enabled) #define for_each_mon_enabled_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ + for_each_rdt_resource(r) \ if (r->mon_enabled) /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 7ac31210e452..5838952c2211 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -174,7 +174,7 @@ void __check_limbo(struct rdt_domain *d, bool force_free) struct rdt_resource *r; u32 crmid = 1, nrmid; - r = &rdt_resources_all[RDT_RESOURCE_L3]; + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; /* * Skip RMID 0 and start from RMID 1 and check all the RMIDs that @@ -232,7 +232,7 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) int cpu; u64 val; - r = 
&rdt_resources_all[RDT_RESOURCE_L3]; + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; entry->busy = 0; cpu = get_cpu(); @@ -287,6 +287,7 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) static int __mon_event_count(u32 rmid, struct rmid_read *rr) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r); struct mbm_state *m; u64 chunks, tval; @@ -319,7 +320,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) return 0; } - chunks = mbm_overflow_count(m->prev_msr, tval, rr->r->mbm_width); + chunks = mbm_overflow_count(m->prev_msr, tval, hw_res->mbm_width); m->chunks += chunks; m->prev_msr = tval; @@ -334,7 +335,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) */ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r); struct mbm_state *m = &rr->d->mbm_local[rmid]; u64 tval, cur_bw, chunks; @@ -342,8 +343,8 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) return; - chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width); - cur_bw = (get_corrected_mbm_count(rmid, chunks) * r->mon_scale) >> 20; + chunks = mbm_overflow_count(m->prev_bw_msr, tval, hw_res->mbm_width); + cur_bw = (get_corrected_mbm_count(rmid, chunks) * hw_res->mon_scale) >> 20; if (m->delta_comp) m->delta_bw = abs(cur_bw - m->prev_bw); @@ -416,6 +417,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) { u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val; struct mbm_state *pmbm_data, *cmbm_data; + struct rdt_hw_resource *hw_r_mba; u32 cur_bw, delta_bw, user_bw; struct rdt_resource *r_mba; struct rdt_domain *dom_mba; @@ -425,7 +427,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) if (!is_mbm_local_enabled()) return; - r_mba = &rdt_resources_all[RDT_RESOURCE_MBA]; + hw_r_mba = &rdt_resources_all[RDT_RESOURCE_MBA]; + r_mba = &hw_r_mba->r_resctrl; closid = rgrp->closid; rmid = rgrp->mon.rmid; pmbm_data = &dom_mbm->mbm_local[rmid]; @@ -474,7 +477,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) return; } - cur_msr = r_mba->msr_base + closid; + cur_msr = hw_r_mba->msr_base + closid; wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba)); dom_mba->ctrl_val[closid] = new_msr_val; @@ -538,7 +541,7 @@ void cqm_handle_limbo(struct work_struct *work) mutex_lock(&rdtgroup_mutex); - r = &rdt_resources_all[RDT_RESOURCE_L3]; + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; d = container_of(work, struct rdt_domain, cqm_limbo.work); __check_limbo(d, false); @@ -574,7 +577,7 @@ void mbm_handle_overflow(struct work_struct *work) if (!static_branch_likely(&rdt_mon_enable_key)) goto out_unlock; - r = &rdt_resources_all[RDT_RESOURCE_L3]; + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; d = container_of(work, struct rdt_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { @@ -671,15 +674,16 @@ static void l3_mon_evt_init(struct rdt_resource *r) int rdt_get_mon_l3_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); unsigned int cl_size = boot_cpu_data.x86_cache_size; int ret; - r->mon_scale = boot_cpu_data.x86_cache_occ_scale; + hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale; r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; - r->mbm_width = 
MBM_CNTR_WIDTH_BASE; + hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) - r->mbm_width += mbm_offset; + hw_res->mbm_width += mbm_offset; else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX) pr_warn("Ignoring impossible MBM counter offset\n"); @@ -693,7 +697,7 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) resctrl_cqm_threshold = cl_size * 1024 / r->num_rmid; /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ - resctrl_cqm_threshold /= r->mon_scale; + resctrl_cqm_threshold /= hw_res->mon_scale; ret = dom_data_init(r); if (ret) diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 4bd28b388a1a..e154ea812a4a 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -684,8 +684,8 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) * resource, the portion of cache used by it should be made * unavailable to all future allocations from both resources. */ - if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled || - rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) { + if (rdt_resources_all[RDT_RESOURCE_L3DATA].r_resctrl.alloc_enabled || + rdt_resources_all[RDT_RESOURCE_L2DATA].r_resctrl.alloc_enabled) { rdt_last_cmd_puts("CDP enabled\n"); return -EINVAL; } diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 2ab0fd017313..875232b3acd4 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -100,12 +100,15 @@ int closids_supported(void) static void closid_init(void) { + struct rdt_hw_resource *hw_res; struct rdt_resource *r; int rdt_min_closid = 32; /* Compute rdt_min_closid across all resources */ - for_each_alloc_enabled_rdt_resource(r) - rdt_min_closid = min(rdt_min_closid, r->num_closid); + for_each_alloc_enabled_rdt_resource(r) { + hw_res = resctrl_to_arch_res(r); + rdt_min_closid = min(rdt_min_closid, hw_res->num_closid); + } closid_free_map = BIT_MASK(rdt_min_closid) - 1; @@ -843,8 +846,10 @@ static int rdt_num_closids_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct rdt_resource *r = of->kn->parent->priv; + struct rdt_hw_resource *hw_res; - seq_printf(seq, "%d\n", r->num_closid); + hw_res = resctrl_to_arch_res(r); + seq_printf(seq, "%d\n", hw_res->num_closid); return 0; } @@ -1020,8 +1025,9 @@ static int max_threshold_occ_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct rdt_resource *r = of->kn->parent->priv; + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - seq_printf(seq, "%u\n", resctrl_cqm_threshold * r->mon_scale); + seq_printf(seq, "%u\n", resctrl_cqm_threshold * hw_res->mon_scale); return 0; } @@ -1042,7 +1048,7 @@ static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct rdt_resource *r = of->kn->parent->priv; + struct rdt_hw_resource *hw_res; unsigned int bytes; int ret; @@ -1053,7 +1059,8 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, if (bytes > (boot_cpu_data.x86_cache_size * 1024)) return -EINVAL; - resctrl_cqm_threshold = bytes / r->mon_scale; + hw_res = resctrl_to_arch_res(of->kn->parent->priv); + resctrl_cqm_threshold = bytes / hw_res->mon_scale; return nbytes; } @@ -1111,16 +1118,16 @@ static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d, switch (r->rid) { case 
RDT_RESOURCE_L3DATA: - _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3CODE]; + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3CODE].r_resctrl; break; case RDT_RESOURCE_L3CODE: - _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3DATA]; + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3DATA].r_resctrl; break; case RDT_RESOURCE_L2DATA: - _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2CODE]; + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2CODE].r_resctrl; break; case RDT_RESOURCE_L2CODE: - _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2DATA]; + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2DATA].r_resctrl; break; default: ret = -ENOENT; @@ -1867,7 +1874,7 @@ static void l2_qos_cfg_update(void *arg) static inline bool is_mba_linear(void) { - return rdt_resources_all[RDT_RESOURCE_MBA].membw.delay_linear; + return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.delay_linear; } static int set_cache_qos_cfg(int level, bool enable) @@ -1888,7 +1895,7 @@ static int set_cache_qos_cfg(int level, bool enable) if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; - r_l = &rdt_resources_all[level]; + r_l = &rdt_resources_all[level].r_resctrl; list_for_each_entry(d, &r_l->domains, list) { if (r_l->cache.arch_has_per_cpu_cfg) /* Pick all the CPUs in the domain instance */ @@ -1917,10 +1924,10 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r) if (!r->alloc_capable) return; - if (r == &rdt_resources_all[RDT_RESOURCE_L2DATA]) + if (r == &rdt_resources_all[RDT_RESOURCE_L2DATA].r_resctrl) l2_qos_cfg_update(&r->alloc_enabled); - if (r == &rdt_resources_all[RDT_RESOURCE_L3DATA]) + if (r == &rdt_resources_all[RDT_RESOURCE_L3DATA].r_resctrl) l3_qos_cfg_update(&r->alloc_enabled); } @@ -1932,7 +1939,7 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r) */ static int set_mba_sc(bool mba_sc) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA]; + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; struct rdt_domain *d; if (!is_mbm_enabled() || !is_mba_linear() || @@ -1948,9 +1955,9 @@ static int set_mba_sc(bool mba_sc) static int cdp_enable(int level, int data_type, int code_type) { - struct rdt_resource *r_ldata = &rdt_resources_all[data_type]; - struct rdt_resource *r_lcode = &rdt_resources_all[code_type]; - struct rdt_resource *r_l = &rdt_resources_all[level]; + struct rdt_resource *r_ldata = &rdt_resources_all[data_type].r_resctrl; + struct rdt_resource *r_lcode = &rdt_resources_all[code_type].r_resctrl; + struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl; int ret; if (!r_l->alloc_capable || !r_ldata->alloc_capable || @@ -1980,13 +1987,13 @@ static int cdpl2_enable(void) static void cdp_disable(int level, int data_type, int code_type) { - struct rdt_resource *r = &rdt_resources_all[level]; + struct rdt_resource *r = &rdt_resources_all[level].r_resctrl; r->alloc_enabled = r->alloc_capable; - if (rdt_resources_all[data_type].alloc_enabled) { - rdt_resources_all[data_type].alloc_enabled = false; - rdt_resources_all[code_type].alloc_enabled = false; + if (rdt_resources_all[data_type].r_resctrl.alloc_enabled) { + rdt_resources_all[data_type].r_resctrl.alloc_enabled = false; + rdt_resources_all[code_type].r_resctrl.alloc_enabled = false; set_cache_qos_cfg(level, false); } } @@ -2003,9 +2010,9 @@ static void cdpl2_disable(void) static void cdp_disable_all(void) { - if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) + if (rdt_resources_all[RDT_RESOURCE_L3DATA].r_resctrl.alloc_enabled) cdpl3_disable(); - if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) + if 
(rdt_resources_all[RDT_RESOURCE_L2DATA].r_resctrl.alloc_enabled) cdpl2_disable(); } @@ -2153,7 +2160,7 @@ static int rdt_get_tree(struct fs_context *fc) static_branch_enable_cpuslocked(&rdt_enable_key); if (is_mbm_enabled()) { - r = &rdt_resources_all[RDT_RESOURCE_L3]; + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; list_for_each_entry(dom, &r->domains, list) mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); } @@ -2262,6 +2269,7 @@ static int rdt_init_fs_context(struct fs_context *fc) static int reset_all_ctrls(struct rdt_resource *r) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct msr_param msr_param; cpumask_var_t cpu_mask; struct rdt_domain *d; @@ -2272,7 +2280,7 @@ static int reset_all_ctrls(struct rdt_resource *r) msr_param.res = r; msr_param.low = 0; - msr_param.high = r->num_closid; + msr_param.high = hw_res->num_closid; /* * Disable resource control for this resource by setting all @@ -2282,7 +2290,7 @@ static int reset_all_ctrls(struct rdt_resource *r) list_for_each_entry(d, &r->domains, list) { cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); - for (i = 0; i < r->num_closid; i++) + for (i = 0; i < hw_res->num_closid; i++) d->ctrl_val[i] = r->default_ctrl; } cpu = get_cpu(); @@ -3133,13 +3141,13 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) { - if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) + if (rdt_resources_all[RDT_RESOURCE_L3DATA].r_resctrl.alloc_enabled) seq_puts(seq, ",cdp"); - if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) + if (rdt_resources_all[RDT_RESOURCE_L2DATA].r_resctrl.alloc_enabled) seq_puts(seq, ",cdpl2"); - if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA])) + if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl)) seq_puts(seq, ",mba_MBps"); return 0; diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 9b05af9b3e28..5ccf36b7dbbf 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -2,6 +2,8 @@ #ifndef _RESCTRL_H #define _RESCTRL_H +#include +#include #include #ifdef CONFIG_PROC_CPU_RESCTRL @@ -13,4 +15,112 @@ int proc_resctrl_show(struct seq_file *m, #endif +struct rdt_domain; + +/** + * struct resctrl_cache - Cache allocation related data + * @cbm_len: Length of the cache bit mask + * @min_cbm_bits: Minimum number of consecutive bits to be set + * @cbm_idx_mult: Multiplier of CBM index + * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: + * closid * cbm_idx_multi + cbm_idx_offset + * in a cache bit mask + * @shareable_bits: Bitmask of shareable resource with other + * executing entities + * @arch_has_sparse_bitmaps: True if a bitmap like f00f is valid. + * @arch_has_empty_bitmaps: True if the '0' bitmap is valid. + * @arch_has_per_cpu_cfg: True if QOS_CFG register for this cache + * level has CPU scope. 
+ */ +struct resctrl_cache { + unsigned int cbm_len; + unsigned int min_cbm_bits; + unsigned int cbm_idx_mult; // TODO remove this + unsigned int cbm_idx_offset; // TODO remove this + unsigned int shareable_bits; + bool arch_has_sparse_bitmaps; + bool arch_has_empty_bitmaps; + bool arch_has_per_cpu_cfg; +}; + +/** + * enum membw_throttle_mode - System's memory bandwidth throttling mode + * @THREAD_THROTTLE_UNDEFINED: Not relevant to the system + * @THREAD_THROTTLE_MAX: Memory bandwidth is throttled at the core + * always using smallest bandwidth percentage + * assigned to threads, aka "max throttling" + * @THREAD_THROTTLE_PER_THREAD: Memory bandwidth is throttled at the thread + */ +enum membw_throttle_mode { + THREAD_THROTTLE_UNDEFINED = 0, + THREAD_THROTTLE_MAX, + THREAD_THROTTLE_PER_THREAD, +}; + +/** + * struct resctrl_membw - Memory bandwidth allocation related data + * @min_bw: Minimum memory bandwidth percentage user can request + * @bw_gran: Granularity at which the memory bandwidth is allocated + * @delay_linear: True if memory B/W delay is in linear scale + * @arch_needs_linear: True if we can't configure non-linear resources + * @throttle_mode: Bandwidth throttling mode when threads request + * different memory bandwidths + * @mba_sc: True if MBA software controller(mba_sc) is enabled + * @mb_map: Mapping of memory B/W percentage to memory B/W delay + */ +struct resctrl_membw { + u32 min_bw; + u32 bw_gran; + u32 delay_linear; + bool arch_needs_linear; + enum membw_throttle_mode throttle_mode; + bool mba_sc; + u32 *mb_map; +}; + +struct rdt_parse_data; + +/** + * struct rdt_resource - attributes of a resctrl resource + * @rid: The index of the resource + * @alloc_enabled: Is allocation enabled on this machine + * @mon_enabled: Is monitoring enabled for this feature + * @alloc_capable: Is allocation available on this machine + * @mon_capable: Is monitor feature available on this machine + * @num_rmid: Number of RMIDs available + * @cache_level: Which cache level defines scope of this resource + * @cache: Cache allocation related data + * @membw: If the component has bandwidth controls, their properties. + * @domains: All domains for this resource + * @name: Name to use in "schemata" file. + * @data_width: Character width of data when displaying + * @default_ctrl: Specifies default cache cbm or memory B/W percent. + * @format_str: Per resource format string to show domain value + * @parse_ctrlval: Per resource function pointer to parse control values + * @evt_list: List of monitoring events + * @fflags: flags to choose base and info files + */ +struct rdt_resource { + int rid; + bool alloc_enabled; + bool mon_enabled; + bool alloc_capable; + bool mon_capable; + int num_rmid; + int cache_level; + struct resctrl_cache cache; + struct resctrl_membw membw; + struct list_head domains; + char *name; + int data_width; + u32 default_ctrl; + const char *format_str; + int (*parse_ctrlval)(struct rdt_parse_data *data, + struct rdt_resource *r, + struct rdt_domain *d); + struct list_head evt_list; + unsigned long fflags; + +}; + #endif /* _RESCTRL_H */ -- Gitee From 1e930c70640bdc518031cb3cd976d1725688dd0f Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 21 Dec 2020 08:00:09 -0800 Subject: [PATCH 1264/2161] x86/resctrl: Add printf attribute to log function commit 3ff4ec0e281d0b234917e6e3033dd3067a5ea945 upstream. Mark the function with the __printf attribute to allow the compiler to more thoroughly typecheck its arguments against a format string with -Wformat and similar flags. 
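For reference, the kernel's __printf(a, b) marker is shorthand for the compiler's format(printf, a, b) attribute: argument a is the format string and the variadic arguments start at argument b. A small stand-alone sketch of the effect (gcc/clang; the macro is redefined locally so the example builds on its own):

        #include <stdarg.h>
        #include <stdio.h>

        #define __printf(a, b) __attribute__((format(printf, a, b)))

        __printf(1, 2)
        static void log_msg(const char *fmt, ...)
        {
                va_list ap;

                va_start(ap, fmt);
                vfprintf(stderr, fmt, ap);
                va_end(ap);
        }

        int main(void)
        {
                log_msg("mask %lx\n", 0xf00fUL);        /* type-checked: OK */
                /* log_msg("mask %lx\n", "oops");  <- would warn under -Wformat */
                return 0;
        }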
[ bp: Massage commit message. ] Signed-off-by: Tom Rix Signed-off-by: Borislav Petkov Acked-by: Reinette Chatre Link: https://lkml.kernel.org/r/20201221160009.3752017-1-trix@redhat.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/internal.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 9453b6cc83d9..0652fc13ecf8 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -494,6 +494,7 @@ union cpuid_0x10_x_edx { void rdt_last_cmd_clear(void); void rdt_last_cmd_puts(const char *s); +__printf(1, 2) void rdt_last_cmd_printf(const char *fmt, ...); void rdt_ctrl_update(void *arg); -- Gitee From 3b106a727990475c77b55b3579e6b6fd6c0997ba Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 17 Dec 2020 14:31:20 -0800 Subject: [PATCH 1265/2161] x86/resctrl: Use task_curr() instead of task_struct->on_cpu to prevent unnecessary IPI commit e0ad6dc8969f790f14bddcfd7ea284b7e5f88a16 upstream. James reported in [1] that there could be two tasks running on the same CPU with task_struct->on_cpu set. Using task_struct->on_cpu as a test if a task is running on a CPU may thus match the old task for a CPU while the scheduler is running and IPI it unnecessarily. task_curr() is the correct helper to use. While doing so move the #ifdef check of the CONFIG_SMP symbol to be a C conditional used to determine if this helper should be used to ensure the code is always checked for correctness by the compiler. [1] https://lore.kernel.org/lkml/a782d2f3-d2f6-795f-f4b1-9462205fd581@arm.com Reported-by: James Morse Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/e9e68ce1441a73401e08b641cc3b9a3cf13fe6d4.1608243147.git.reinette.chatre@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 875232b3acd4..3355172596fc 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2326,19 +2326,15 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, t->closid = to->closid; t->rmid = to->mon.rmid; -#ifdef CONFIG_SMP /* - * This is safe on x86 w/o barriers as the ordering - * of writing to task_cpu() and t->on_cpu is - * reverse to the reading here. The detection is - * inaccurate as tasks might move or schedule - * before the smp function call takes place. In - * such a case the function call is pointless, but + * If the task is on a CPU, set the CPU in the mask. + * The detection is inaccurate as tasks might move or + * schedule before the smp function call takes place. + * In such a case the function call is pointless, but * there is no other side effect. */ - if (mask && t->on_cpu) + if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) cpumask_set_cpu(task_cpu(t), mask); -#endif } } read_unlock(&tasklist_lock); -- Gitee From e22fbcc63501700cef917f87a7f9425f0b980885 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Thu, 17 Dec 2020 14:31:21 -0800 Subject: [PATCH 1266/2161] x86/resctrl: Apply READ_ONCE/WRITE_ONCE to task_struct.{rmid,closid} commit 6d3b47ddffed70006cf4ba360eef61e9ce097d8f upstream. A CPU's current task can have its {closid, rmid} fields read locally while they are being concurrently written to from another CPU. 
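In plain C terms the hazard is a data race on an ordinary integer field: without annotation the compiler may reload, cache, or split the access, so the reader can observe a torn value. A hedged sketch of the annotation pattern, using simplified stand-ins for the kernel's READ_ONCE()/WRITE_ONCE() (the real macros do more; this only captures the single, untorn volatile access):

        /* Simplified stand-ins; relies on the gcc/clang typeof extension. */
        #define READ_ONCE(x)     (*(const volatile typeof(x) *)&(x))
        #define WRITE_ONCE(x, v) (*(volatile typeof(x) *)&(x) = (v))

        struct task {
                unsigned int closid;
                unsigned int rmid;
        };

        /* Writer: another CPU moving the task into a different group. */
        static void move_task(struct task *t, unsigned int closid, unsigned int rmid)
        {
                WRITE_ONCE(t->closid, closid);
                WRITE_ONCE(t->rmid, rmid);
        }

        /* Reader: the context-switch path on the task's own CPU. */
        static unsigned int current_closid(struct task *t)
        {
                return READ_ONCE(t->closid);
        }

        int main(void)
        {
                struct task t = { .closid = 1, .rmid = 1 };

                move_task(&t, 2, 3);
                return current_closid(&t) == 2 ? 0 : 1;
        }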
This can happen anytime __resctrl_sched_in() races with either __rdtgroup_move_task() or rdt_move_group_tasks(). Prevent load / store tearing for those accesses by giving them the READ_ONCE() / WRITE_ONCE() treatment. Signed-off-by: Valentin Schneider Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/9921fda88ad81afb9885b517fbe864a2bc7c35a9.1608243147.git.reinette.chatre@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/resctrl.h | 11 +++++++---- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 10 +++++----- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 07603064df8f..d60ed0668a59 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -56,19 +56,22 @@ static void __resctrl_sched_in(void) struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); u32 closid = state->default_closid; u32 rmid = state->default_rmid; + u32 tmp; /* * If this task has a closid/rmid assigned, use it. * Else use the closid/rmid assigned to this cpu. */ if (static_branch_likely(&rdt_alloc_enable_key)) { - if (current->closid) - closid = current->closid; + tmp = READ_ONCE(current->closid); + if (tmp) + closid = tmp; } if (static_branch_likely(&rdt_mon_enable_key)) { - if (current->rmid) - rmid = current->rmid; + tmp = READ_ONCE(current->rmid); + if (tmp) + rmid = tmp; } if (closid != state->cur_closid || rmid != state->cur_rmid) { diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 3355172596fc..cc0a069a4bca 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -566,11 +566,11 @@ static int __rdtgroup_move_task(struct task_struct *tsk, */ if (rdtgrp->type == RDTCTRL_GROUP) { - tsk->closid = rdtgrp->closid; - tsk->rmid = rdtgrp->mon.rmid; + WRITE_ONCE(tsk->closid, rdtgrp->closid); + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); } else if (rdtgrp->type == RDTMON_GROUP) { if (rdtgrp->mon.parent->closid == tsk->closid) { - tsk->rmid = rdtgrp->mon.rmid; + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); } else { rdt_last_cmd_puts("Can't move task to different control group\n"); return -EINVAL; @@ -2323,8 +2323,8 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, for_each_process_thread(p, t) { if (!from || is_closid_match(t, from) || is_rmid_match(t, from)) { - t->closid = to->closid; - t->rmid = to->mon.rmid; + WRITE_ONCE(t->closid, to->closid); + WRITE_ONCE(t->rmid, to->mon.rmid); /* * If the task is on a CPU, set the CPU in the mask. -- Gitee From e13f6339004cd8e6fdb4b02e50dd17bb6011021b Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:35 +0000 Subject: [PATCH 1267/2161] selftests/resctrl: Enable gcc checks to detect buffer overflows commit a9d26a302dea29eb84f491b1340a57e56c631a71 upstream. David reported a buffer overflow error in the check_results() function of the cmt unit test and he suggested enabling _FORTIFY_SOURCE gcc compiler option to automatically detect any such errors. Feature Test Macros man page describes_FORTIFY_SOURCE as below "Defining this macro causes some lightweight checks to be performed to detect some buffer overflow errors when employing various string and memory manipulation functions (for example, memcpy, memset, stpcpy, strcpy, strncpy, strcat, strncat, sprintf, snprintf, vsprintf, vsnprintf, gets, and wide character variants thereof). 
For some functions, argument consistency is checked; for example, a check is made that open has been supplied with a mode argument when the specified flags include O_CREAT. Not all problems are detected, just some common cases. If _FORTIFY_SOURCE is set to 1, with compiler optimization level 1 (gcc -O1) and above, checks that shouldn't change the behavior of conforming programs are performed. With _FORTIFY_SOURCE set to 2, some more checking is added, but some conforming programs might fail. Some of the checks can be performed at compile time (via macros logic implemented in header files), and result in compiler warnings; other checks take place at run time, and result in a run-time error if the check fails. Use of this macro requires compiler support, available with gcc since version 4.0." Fix the buffer overflow error in the check_results() function of the cmt unit test and enable _FORTIFY_SOURCE gcc check to catch any future buffer overflow errors. Reported-by: David Binderman Suggested-by: David Binderman Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/Makefile | 2 +- tools/testing/selftests/resctrl/cqm_test.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/resctrl/Makefile b/tools/testing/selftests/resctrl/Makefile index d585cc1948cc..6bcee2ec91a9 100644 --- a/tools/testing/selftests/resctrl/Makefile +++ b/tools/testing/selftests/resctrl/Makefile @@ -1,5 +1,5 @@ CC = $(CROSS_COMPILE)gcc -CFLAGS = -g -Wall +CFLAGS = -g -Wall -O2 -D_FORTIFY_SOURCE=2 SRCS=$(wildcard *.c) OBJS=$(SRCS:.c=.o) diff --git a/tools/testing/selftests/resctrl/cqm_test.c b/tools/testing/selftests/resctrl/cqm_test.c index c8756152bd61..5e7308ac63be 100644 --- a/tools/testing/selftests/resctrl/cqm_test.c +++ b/tools/testing/selftests/resctrl/cqm_test.c @@ -86,7 +86,7 @@ static int check_results(struct resctrl_val_param *param, int no_of_bits) return errno; } - while (fgets(temp, 1024, fp)) { + while (fgets(temp, sizeof(temp), fp)) { char *token = strtok(temp, ":\t"); int fields = 0; -- Gitee From 4569dd2eee222c1442cd48a9eed81a2a13d510a8 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:36 +0000 Subject: [PATCH 1268/2161] selftests/resctrl: Fix compilation issues for global variables commit 8236c51d85a64643588505a6791e022cc8d84864 upstream. Reinette reported following compilation issue on Fedora 32, gcc version 10.1.1 /usr/bin/ld: cqm_test.o:/cqm_test.c:22: multiple definition of `cache_size'; cat_test.o:/cat_test.c:23: first defined here The same issue is reported for long_mask, cbm_mask, count_of_bits etc variables as well. Compiler isn't happy because these variables are defined globally in two .c files namely cqm_test.c and cat_test.c and the compiler during compilation finds that the variable is already defined (multiple definition error). Taking a closer look at the usage of these variables reveals that these variables are used only locally in functions such as cqm_resctrl_val() (defined in cqm_test.c) and cat_perf_miss_val() (defined in cat_test.c). These variables are not shared between those functions. So, there is no need for these variables to be global. Hence, fix this issue by making them static variables. 
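The underlying C issue: each of those files provided its own external definition of the same identifier, and while older toolchains silently merged such duplicates as common symbols, gcc 10 builds with -fno-common by default, which is why the hard multiple-definition error surfaces on gcc 10.1.1. Internal linkage per file (static), or a single definition plus extern declarations, are the two standard fixes; a short illustrative sketch (file and variable names from the selftests, content reduced to the relevant line):

        /* cat_test.c */
        static unsigned long cache_size;  /* internal linkage: no symbol exported */

        /* cqm_test.c */
        static unsigned long cache_size;  /* an independent object: nothing to clash */

        /*
         * A variable that really is shared between files (handled in the next
         * patch of this series) instead keeps exactly one definition in a .c
         * file and is only *declared* in the shared header:
         *
         *     resctrl.h:        extern int tests_run;
         *     one .c file only: int tests_run;
         */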
Reported-by: Reinette Chatre Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/cat_test.c | 10 +++++----- tools/testing/selftests/resctrl/cqm_test.c | 10 +++++----- tools/testing/selftests/resctrl/resctrl.h | 2 +- tools/testing/selftests/resctrl/resctrlfs.c | 10 +++++----- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 5da43767b973..bdeeb5772592 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -17,10 +17,10 @@ #define MAX_DIFF_PERCENT 4 #define MAX_DIFF 1000000 -int count_of_bits; -char cbm_mask[256]; -unsigned long long_mask; -unsigned long cache_size; +static int count_of_bits; +static char cbm_mask[256]; +static unsigned long long_mask; +static unsigned long cache_size; /* * Change schemata. Write schemata to specified @@ -136,7 +136,7 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type) return -1; /* Get default cbm mask for L3/L2 cache */ - ret = get_cbm_mask(cache_type); + ret = get_cbm_mask(cache_type, cbm_mask); if (ret) return ret; diff --git a/tools/testing/selftests/resctrl/cqm_test.c b/tools/testing/selftests/resctrl/cqm_test.c index 5e7308ac63be..de33d1c0466e 100644 --- a/tools/testing/selftests/resctrl/cqm_test.c +++ b/tools/testing/selftests/resctrl/cqm_test.c @@ -16,10 +16,10 @@ #define MAX_DIFF 2000000 #define MAX_DIFF_PERCENT 15 -int count_of_bits; -char cbm_mask[256]; -unsigned long long_mask; -unsigned long cache_size; +static int count_of_bits; +static char cbm_mask[256]; +static unsigned long long_mask; +static unsigned long cache_size; static int cqm_setup(int num, ...) { @@ -125,7 +125,7 @@ int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd) if (!validate_resctrl_feature_request("cqm")) return -1; - ret = get_cbm_mask("L3"); + ret = get_cbm_mask("L3", cbm_mask); if (ret) return ret; diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 39bf59c6b9c5..959c71e39bdc 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -92,7 +92,7 @@ void tests_cleanup(void); void mbm_test_cleanup(void); int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd); void mba_test_cleanup(void); -int get_cbm_mask(char *cache_type); +int get_cbm_mask(char *cache_type, char *cbm_mask); int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size); void ctrlc_handler(int signum, siginfo_t *info, void *ptr); int cat_val(struct resctrl_val_param *param); diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 19c0ec4045a4..2a16100c9c3f 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -49,8 +49,6 @@ static int find_resctrl_mount(char *buffer) return -ENOENT; } -char cbm_mask[256]; - /* * remount_resctrlfs - Remount resctrl FS at /sys/fs/resctrl * @mum_resctrlfs: Should the resctrl FS be remounted? @@ -205,16 +203,18 @@ int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size) /* * get_cbm_mask - Get cbm mask for given cache * @cache_type: Cache level L2/L3 - * - * Mask is stored in cbm_mask which is global variable. + * @cbm_mask: cbm_mask returned as a string * * Return: = 0 on success, < 0 on failure. 
*/ -int get_cbm_mask(char *cache_type) +int get_cbm_mask(char *cache_type, char *cbm_mask) { char cbm_mask_path[1024]; FILE *fp; + if (!cbm_mask) + return -1; + sprintf(cbm_mask_path, "%s/%s/cbm_mask", CBM_MASK_PATH, cache_type); fp = fopen(cbm_mask_path, "r"); -- Gitee From 37652ac2885a27180345db29a1c4a39277adaad3 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:37 +0000 Subject: [PATCH 1269/2161] selftests/resctrl: Fix compilation issues for other global variables commit 896016d2ad051811ff9c9c087393adc063322fbc upstream. Reinette reported following compilation issue on Fedora 32, gcc version 10.1.1 /usr/bin/ld: resctrl_tests.o:/resctrl.h:65: multiple definition of `bm_pid'; cache.o:/resctrl.h:65: first defined here Other variables are ppid, tests_run, llc_occup_path, is_amd. Compiler isn't happy because these variables are defined globally in two .c files but are not declared as extern. To fix issues for the global variables, declare them as extern. Chang Log: - Split this patch from v4's patch 1 (Shuah). Reported-by: Reinette Chatre Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/resctrl.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 959c71e39bdc..12b77182cb44 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -62,11 +62,11 @@ struct resctrl_val_param { int (*setup)(int num, ...); }; -pid_t bm_pid, ppid; -int tests_run; +extern pid_t bm_pid, ppid; +extern int tests_run; -char llc_occup_path[1024]; -bool is_amd; +extern char llc_occup_path[1024]; +extern bool is_amd; bool check_resctrlfs_support(void); int filter_dmesg(void); -- Gitee From d739fe75d0a0a39908d6a9452dd284f8ea5cc4ae Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:38 +0000 Subject: [PATCH 1270/2161] selftests/resctrl: Clean up resctrl features check commit 2428673638ea28fa93d2a38b1c3e8d70122b00ee upstream. Checking resctrl features call strcmp() to compare feature strings (e.g. "mba", "cat" etc). The checkings are error prone and don't have good coding style. Define the constant strings in macros and call strncmp() to solve the potential issues. Suggested-by: Shuah Khan Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/cache.c | 8 +++---- tools/testing/selftests/resctrl/cat_test.c | 2 +- tools/testing/selftests/resctrl/cqm_test.c | 2 +- tools/testing/selftests/resctrl/fill_buf.c | 4 ++-- tools/testing/selftests/resctrl/mba_test.c | 2 +- tools/testing/selftests/resctrl/mbm_test.c | 2 +- tools/testing/selftests/resctrl/resctrl.h | 5 +++++ .../testing/selftests/resctrl/resctrl_tests.c | 12 +++++----- tools/testing/selftests/resctrl/resctrl_val.c | 22 +++++++++---------- tools/testing/selftests/resctrl/resctrlfs.c | 17 +++++++------- 10 files changed, 41 insertions(+), 35 deletions(-) diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c index 38dbf4962e33..5922cc1b0386 100644 --- a/tools/testing/selftests/resctrl/cache.c +++ b/tools/testing/selftests/resctrl/cache.c @@ -182,7 +182,7 @@ int measure_cache_vals(struct resctrl_val_param *param, int bm_pid) /* * Measure cache miss from perf. 
*/ - if (!strcmp(param->resctrl_val, "cat")) { + if (!strncmp(param->resctrl_val, CAT_STR, sizeof(CAT_STR))) { ret = get_llc_perf(&llc_perf_miss); if (ret < 0) return ret; @@ -192,7 +192,7 @@ int measure_cache_vals(struct resctrl_val_param *param, int bm_pid) /* * Measure llc occupancy from resctrl. */ - if (!strcmp(param->resctrl_val, "cqm")) { + if (!strncmp(param->resctrl_val, CQM_STR, sizeof(CQM_STR))) { ret = get_llc_occu_resctrl(&llc_occu_resc); if (ret < 0) return ret; @@ -234,7 +234,7 @@ int cat_val(struct resctrl_val_param *param) if (ret) return ret; - if ((strcmp(resctrl_val, "cat") == 0)) { + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) { ret = initialize_llc_perf(); if (ret) return ret; @@ -242,7 +242,7 @@ int cat_val(struct resctrl_val_param *param) /* Test runs until the callback setup() tells the test to stop. */ while (1) { - if (strcmp(resctrl_val, "cat") == 0) { + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) { ret = param->setup(1, param); if (ret) { ret = 0; diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index bdeeb5772592..20823725daca 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -164,7 +164,7 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type) return -1; struct resctrl_val_param param = { - .resctrl_val = "cat", + .resctrl_val = CAT_STR, .cpu_no = cpu_no, .mum_resctrlfs = 0, .setup = cat_setup, diff --git a/tools/testing/selftests/resctrl/cqm_test.c b/tools/testing/selftests/resctrl/cqm_test.c index de33d1c0466e..271752e9ef5b 100644 --- a/tools/testing/selftests/resctrl/cqm_test.c +++ b/tools/testing/selftests/resctrl/cqm_test.c @@ -145,7 +145,7 @@ int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd) } struct resctrl_val_param param = { - .resctrl_val = "cqm", + .resctrl_val = CQM_STR, .ctrlgrp = "c1", .mongrp = "m1", .cpu_no = cpu_no, diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c index 79c611c99a3d..51e5cf22632f 100644 --- a/tools/testing/selftests/resctrl/fill_buf.c +++ b/tools/testing/selftests/resctrl/fill_buf.c @@ -115,7 +115,7 @@ static int fill_cache_read(unsigned char *start_ptr, unsigned char *end_ptr, while (1) { ret = fill_one_span_read(start_ptr, end_ptr); - if (!strcmp(resctrl_val, "cat")) + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) break; } @@ -134,7 +134,7 @@ static int fill_cache_write(unsigned char *start_ptr, unsigned char *end_ptr, { while (1) { fill_one_span_write(start_ptr, end_ptr); - if (!strcmp(resctrl_val, "cat")) + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) break; } diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c index 7bf8eaa6204b..6449fbd96096 100644 --- a/tools/testing/selftests/resctrl/mba_test.c +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -141,7 +141,7 @@ void mba_test_cleanup(void) int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd) { struct resctrl_val_param param = { - .resctrl_val = "mba", + .resctrl_val = MBA_STR, .ctrlgrp = "c1", .mongrp = "m1", .cpu_no = cpu_no, diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c index 4700f7453f81..ec6cfe01c9c2 100644 --- a/tools/testing/selftests/resctrl/mbm_test.c +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -114,7 +114,7 @@ void mbm_test_cleanup(void) int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd) { 
struct resctrl_val_param param = { - .resctrl_val = "mbm", + .resctrl_val = MBM_STR, .ctrlgrp = "c1", .mongrp = "m1", .span = span, diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 12b77182cb44..36da6136af96 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -62,6 +62,11 @@ struct resctrl_val_param { int (*setup)(int num, ...); }; +#define MBM_STR "mbm" +#define MBA_STR "mba" +#define CQM_STR "cqm" +#define CAT_STR "cat" + extern pid_t bm_pid, ppid; extern int tests_run; diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 425cc85ac883..4b109a59f72d 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -85,13 +85,13 @@ int main(int argc, char **argv) cqm_test = false; cat_test = false; while (token) { - if (!strcmp(token, "mbm")) { + if (!strncmp(token, MBM_STR, sizeof(MBM_STR))) { mbm_test = true; - } else if (!strcmp(token, "mba")) { + } else if (!strncmp(token, MBA_STR, sizeof(MBA_STR))) { mba_test = true; - } else if (!strcmp(token, "cqm")) { + } else if (!strncmp(token, CQM_STR, sizeof(CQM_STR))) { cqm_test = true; - } else if (!strcmp(token, "cat")) { + } else if (!strncmp(token, CAT_STR, sizeof(CAT_STR))) { cat_test = true; } else { printf("invalid argument\n"); @@ -161,7 +161,7 @@ int main(int argc, char **argv) if (!is_amd && mbm_test) { printf("# Starting MBM BW change ...\n"); if (!has_ben) - sprintf(benchmark_cmd[5], "%s", "mba"); + sprintf(benchmark_cmd[5], "%s", MBA_STR); res = mbm_bw_change(span, cpu_no, bw_report, benchmark_cmd); printf("%sok MBM: bw change\n", res ? "not " : ""); mbm_test_cleanup(); @@ -181,7 +181,7 @@ int main(int argc, char **argv) if (cqm_test) { printf("# Starting CQM test ...\n"); if (!has_ben) - sprintf(benchmark_cmd[5], "%s", "cqm"); + sprintf(benchmark_cmd[5], "%s", CQM_STR); res = cqm_resctrl_val(cpu_no, no_of_bits, benchmark_cmd); printf("%sok CQM: test\n", res ? 
"not " : ""); cqm_test_cleanup(); diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index 520fea3606d1..aed71fd0713b 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -397,10 +397,10 @@ static void initialize_mem_bw_resctrl(const char *ctrlgrp, const char *mongrp, return; } - if (strcmp(resctrl_val, "mbm") == 0) + if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) set_mbm_path(ctrlgrp, mongrp, resource_id); - if ((strcmp(resctrl_val, "mba") == 0)) { + if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) { if (ctrlgrp) sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH, ctrlgrp, resource_id); @@ -524,7 +524,7 @@ static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp, return; } - if (strcmp(resctrl_val, "cqm") == 0) + if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) set_cqm_path(ctrlgrp, mongrp, resource_id); } @@ -579,8 +579,8 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) if (strcmp(param->filename, "") == 0) sprintf(param->filename, "stdio"); - if ((strcmp(resctrl_val, "mba")) == 0 || - (strcmp(resctrl_val, "mbm")) == 0) { + if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR)) || + !strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) { ret = validate_bw_report_request(param->bw_report); if (ret) return ret; @@ -674,15 +674,15 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) if (ret) goto out; - if ((strcmp(resctrl_val, "mbm") == 0) || - (strcmp(resctrl_val, "mba") == 0)) { + if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) || + !strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) { ret = initialize_mem_bw_imc(); if (ret) goto out; initialize_mem_bw_resctrl(param->ctrlgrp, param->mongrp, param->cpu_no, resctrl_val); - } else if (strcmp(resctrl_val, "cqm") == 0) + } else if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) initialize_llc_occu_resctrl(param->ctrlgrp, param->mongrp, param->cpu_no, resctrl_val); @@ -710,8 +710,8 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) /* Test runs until the callback setup() tells the test to stop. 
*/ while (1) { - if ((strcmp(resctrl_val, "mbm") == 0) || - (strcmp(resctrl_val, "mba") == 0)) { + if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) || + !strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) { ret = param->setup(1, param); if (ret) { ret = 0; @@ -721,7 +721,7 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) ret = measure_vals(param, &bw_resc_start); if (ret) break; - } else if (strcmp(resctrl_val, "cqm") == 0) { + } else if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) { ret = param->setup(1, param); if (ret) { ret = 0; diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 2a16100c9c3f..4174e48e06d1 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -334,7 +334,7 @@ void run_benchmark(int signum, siginfo_t *info, void *ucontext) operation = atoi(benchmark_cmd[4]); sprintf(resctrl_val, "%s", benchmark_cmd[5]); - if (strcmp(resctrl_val, "cqm") != 0) + if (strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) buffer_span = span * MB; else buffer_span = span; @@ -459,8 +459,8 @@ int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp, goto out; /* Create mon grp and write pid into it for "mbm" and "cqm" test */ - if ((strcmp(resctrl_val, "cqm") == 0) || - (strcmp(resctrl_val, "mbm") == 0)) { + if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR)) || + !strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) { if (strlen(mongrp)) { sprintf(monitorgroup_p, "%s/mon_groups", controlgroup); sprintf(monitorgroup, "%s/%s", monitorgroup_p, mongrp); @@ -505,9 +505,9 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) int resource_id, ret = 0; FILE *fp; - if ((strcmp(resctrl_val, "mba") != 0) && - (strcmp(resctrl_val, "cat") != 0) && - (strcmp(resctrl_val, "cqm") != 0)) + if (strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR)) && + strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR)) && + strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) return -ENOENT; if (!schemata) { @@ -528,9 +528,10 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) else sprintf(controlgroup, "%s/schemata", RESCTRL_PATH); - if (!strcmp(resctrl_val, "cat") || !strcmp(resctrl_val, "cqm")) + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR)) || + !strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) sprintf(schema, "%s%d%c%s", "L3:", resource_id, '=', schemata); - if (strcmp(resctrl_val, "mba") == 0) + if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) sprintf(schema, "%s%d%c%s", "MB:", resource_id, '=', schemata); fp = fopen(controlgroup, "w"); -- Gitee From 9434a0cdc8ec3816e664641325164beae457ff6d Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Wed, 17 Mar 2021 02:22:39 +0000 Subject: [PATCH 1271/2161] selftests/resctrl: Ensure sibling CPU is not same as original CPU commit f5f16ae4fae9d4d51aa365a0e1d84d368bef53ea upstream. The resctrl tests can accept a CPU on which the tests are run and use default of CPU #1 if it is not provided. In the CAT test a "sibling CPU" is determined that is from the same package where another thread will be run. The current algorithm with which a "sibling CPU" is determined does not take the provided/default CPU into account and when that CPU is the first CPU in a package then the "sibling CPU" will be selected to be the same CPU since it starts by picking the first CPU from core_siblings_list. 
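To make the failure concrete (the CPU numbers are invented for illustration and the loop is paraphrased from get_core_sibling() in resctrlfs.c; "sibling_list" is a stand-in name): suppose the tests are pinned to CPU 8, the first CPU of the second package. Its core_siblings_list reads something like "8-15", and the old loop only skips CPU 0, so the very first token, CPU 8 itself, comes back as the "sibling". Comparing against cpu_no as well, as the hunk further below does, moves the loop on to a genuinely different CPU:

    /* core_siblings_list for CPU 8 might read "8-15" */
    token = strtok(sibling_list, "-,");
    while (token) {
            sibling_cpu_no = atoi(token);
            /* old test: (sibling_cpu_no != 0) picks 8 even when cpu_no == 8 */
            if (sibling_cpu_no != 0 && sibling_cpu_no != cpu_no)
                    break;          /* with the fix, 15 is picked instead */
            token = strtok(NULL, "-,");
    }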
Fix the "sibling CPU" selection by taking the provided/default CPU into account and ensuring a sibling that is a different CPU is selected. Tested-by: Babu Moger Signed-off-by: Reinette Chatre Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/resctrlfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 4174e48e06d1..bc52076bee7f 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -268,7 +268,7 @@ int get_core_sibling(int cpu_no) while (token) { sibling_cpu_no = atoi(token); /* Skipping core 0 as we don't want to run test on core 0 */ - if (sibling_cpu_no != 0) + if (sibling_cpu_no != 0 && sibling_cpu_no != cpu_no) break; token = strtok(NULL, "-,"); } -- Gitee From f703ec9956ef6b481aa3476d579bea237bb7e8b6 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:40 +0000 Subject: [PATCH 1272/2161] selftests/resctrl: Fix missing options "-n" and "-p" commit d7af3d0d515cbdf63b6c3398a3c15ecb1bc2bd38 upstream. resctrl test suite accepts command line arguments (like -b, -t, -n and -p) as documented in the help. But passing -n and -p throws an invalid option error. This happens because -n and -p are missing in the list of characters that getopt() recognizes as valid arguments. Hence, they are treated as invalid options. Fix this by adding them to the list of characters that getopt() recognizes as valid arguments. Please note that the main() function already has the logic to deal with the values passed as part of these arguments and hence no changes are needed there. Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/resctrl_tests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 4b109a59f72d..ac2269610aa9 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -73,7 +73,7 @@ int main(int argc, char **argv) } } - while ((c = getopt(argc_new, argv, "ht:b:")) != -1) { + while ((c = getopt(argc_new, argv, "ht:b:n:p:")) != -1) { char *token; switch (c) { -- Gitee From 42df15a92d29bea7a09cf2a4a99e2d473bc21f11 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:41 +0000 Subject: [PATCH 1273/2161] selftests/resctrl: Rename CQM test as CMT test commit 2f320911d9fab38597d2a32d91b4f31165e0c9b4 upstream. CMT (Cache Monitoring Technology) [1] is a H/W feature that reports cache occupancy of a process. resctrl selftest suite has a unit test to test CMT for LLC but the test is named as CQM (Cache Quality Monitoring). Furthermore, the unit test source file is named as cqm_test.c and several functions, variables, comments, preprocessors and statements widely use "cqm" as either suffix or prefix. This rampant misusage of CQM for CMT might confuse someone who is newly looking at resctrl selftests because this feature is named CMT in the Intel Software Developer's Manual. Hence, rename all the occurrences (unit test source file name, functions, variables, comments and preprocessors) of cqm with cmt. 
[1] Please see Intel SDM, Volume 3, chapter 17 and section 18 for more information on CMT: https://software.intel.com/content/www/us/en/develop/articles/intel-sdm.html Suggested-by: Reinette Chatre Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/README | 4 +-- tools/testing/selftests/resctrl/cache.c | 4 +-- .../resctrl/{cqm_test.c => cmt_test.c} | 20 +++++++------- tools/testing/selftests/resctrl/resctrl.h | 6 ++--- .../testing/selftests/resctrl/resctrl_tests.c | 26 +++++++++---------- tools/testing/selftests/resctrl/resctrl_val.c | 12 ++++----- tools/testing/selftests/resctrl/resctrlfs.c | 10 +++---- 7 files changed, 41 insertions(+), 41 deletions(-) rename tools/testing/selftests/resctrl/{cqm_test.c => cmt_test.c} (89%) diff --git a/tools/testing/selftests/resctrl/README b/tools/testing/selftests/resctrl/README index 6e5a0ffa18e8..4b36b25b6ac0 100644 --- a/tools/testing/selftests/resctrl/README +++ b/tools/testing/selftests/resctrl/README @@ -46,8 +46,8 @@ ARGUMENTS Parameter '-h' shows usage information. usage: resctrl_tests [-h] [-b "benchmark_cmd [options]"] [-t test list] [-n no_of_bits] - -b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CQM default benchmark is builtin fill_buf - -t test list: run tests specified in the test list, e.g. -t mbm, mba, cqm, cat + -b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CMT default benchmark is builtin fill_buf + -t test list: run tests specified in the test list, e.g. -t mbm, mba, cmt, cat -n no_of_bits: run cache tests using specified no of bits in cache bit mask -p cpu_no: specify CPU number to run the test. 1 is default -h: help diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c index 5922cc1b0386..2aa1b5c7d9e1 100644 --- a/tools/testing/selftests/resctrl/cache.c +++ b/tools/testing/selftests/resctrl/cache.c @@ -111,7 +111,7 @@ static int get_llc_perf(unsigned long *llc_perf_miss) /* * Get LLC Occupancy as reported by RESCTRL FS - * For CQM, + * For CMT, * 1. If con_mon grp and mon grp given, then read from mon grp in * con_mon grp * 2. If only con_mon grp given, then read from con_mon grp @@ -192,7 +192,7 @@ int measure_cache_vals(struct resctrl_val_param *param, int bm_pid) /* * Measure llc occupancy from resctrl. */ - if (!strncmp(param->resctrl_val, CQM_STR, sizeof(CQM_STR))) { + if (!strncmp(param->resctrl_val, CMT_STR, sizeof(CMT_STR))) { ret = get_llc_occu_resctrl(&llc_occu_resc); if (ret < 0) return ret; diff --git a/tools/testing/selftests/resctrl/cqm_test.c b/tools/testing/selftests/resctrl/cmt_test.c similarity index 89% rename from tools/testing/selftests/resctrl/cqm_test.c rename to tools/testing/selftests/resctrl/cmt_test.c index 271752e9ef5b..4b63838dda32 100644 --- a/tools/testing/selftests/resctrl/cqm_test.c +++ b/tools/testing/selftests/resctrl/cmt_test.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Cache Monitoring Technology (CQM) test + * Cache Monitoring Technology (CMT) test * * Copyright (C) 2018 Intel Corporation * @@ -11,7 +11,7 @@ #include "resctrl.h" #include -#define RESULT_FILE_NAME "result_cqm" +#define RESULT_FILE_NAME "result_cmt" #define NUM_OF_RUNS 5 #define MAX_DIFF 2000000 #define MAX_DIFF_PERCENT 15 @@ -21,7 +21,7 @@ static char cbm_mask[256]; static unsigned long long_mask; static unsigned long cache_size; -static int cqm_setup(int num, ...) +static int cmt_setup(int num, ...) 
{ struct resctrl_val_param *p; va_list param; @@ -58,7 +58,7 @@ static void show_cache_info(unsigned long sum_llc_occu_resc, int no_of_bits, else res = false; - printf("%sok CQM: diff within %d, %d\%%\n", res ? "" : "not", + printf("%sok CMT: diff within %d, %d\%%\n", res ? "" : "not", MAX_DIFF, (int)MAX_DIFF_PERCENT); printf("# diff: %ld\n", avg_diff); @@ -106,12 +106,12 @@ static int check_results(struct resctrl_val_param *param, int no_of_bits) return 0; } -void cqm_test_cleanup(void) +void cmt_test_cleanup(void) { remove(RESULT_FILE_NAME); } -int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd) +int cmt_resctrl_val(int cpu_no, int n, char **benchmark_cmd) { int ret, mum_resctrlfs; @@ -122,7 +122,7 @@ int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd) if (ret) return ret; - if (!validate_resctrl_feature_request("cqm")) + if (!validate_resctrl_feature_request(CMT_STR)) return -1; ret = get_cbm_mask("L3", cbm_mask); @@ -145,7 +145,7 @@ int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd) } struct resctrl_val_param param = { - .resctrl_val = CQM_STR, + .resctrl_val = CMT_STR, .ctrlgrp = "c1", .mongrp = "m1", .cpu_no = cpu_no, @@ -154,7 +154,7 @@ int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd) .mask = ~(long_mask << n) & long_mask, .span = cache_size * n / count_of_bits, .num_of_runs = 0, - .setup = cqm_setup, + .setup = cmt_setup, }; if (strcmp(benchmark_cmd[0], "fill_buf") == 0) @@ -170,7 +170,7 @@ int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd) if (ret) return ret; - cqm_test_cleanup(); + cmt_test_cleanup(); return 0; } diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 36da6136af96..1a58767a0bd2 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -64,7 +64,7 @@ struct resctrl_val_param { #define MBM_STR "mbm" #define MBA_STR "mba" -#define CQM_STR "cqm" +#define CMT_STR "cmt" #define CAT_STR "cat" extern pid_t bm_pid, ppid; @@ -103,9 +103,9 @@ void ctrlc_handler(int signum, siginfo_t *info, void *ptr); int cat_val(struct resctrl_val_param *param); void cat_test_cleanup(void); int cat_perf_miss_val(int cpu_no, int no_of_bits, char *cache_type); -int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd); +int cmt_resctrl_val(int cpu_no, int n, char **benchmark_cmd); unsigned int count_bits(unsigned long n); -void cqm_test_cleanup(void); +void cmt_test_cleanup(void); int get_core_sibling(int cpu_no); int measure_cache_vals(struct resctrl_val_param *param, int bm_pid); diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index ac2269610aa9..6b2ed2efaad4 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -37,10 +37,10 @@ void detect_amd(void) static void cmd_help(void) { printf("usage: resctrl_tests [-h] [-b \"benchmark_cmd [options]\"] [-t test list] [-n no_of_bits]\n"); - printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CQM"); + printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CMT"); printf("\t default benchmark is builtin fill_buf\n"); printf("\t-t test list: run tests specified in the test list, "); - printf("e.g. -t mbm, mba, cqm, cat\n"); + printf("e.g. -t mbm, mba, cmt, cat\n"); printf("\t-n no_of_bits: run cache tests using specified no of bits in cache bit mask\n"); printf("\t-p cpu_no: specify CPU number to run the test. 
1 is default\n"); printf("\t-h: help\n"); @@ -50,13 +50,13 @@ void tests_cleanup(void) { mbm_test_cleanup(); mba_test_cleanup(); - cqm_test_cleanup(); + cmt_test_cleanup(); cat_test_cleanup(); } int main(int argc, char **argv) { - bool has_ben = false, mbm_test = true, mba_test = true, cqm_test = true; + bool has_ben = false, mbm_test = true, mba_test = true, cmt_test = true; int res, c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 5; char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64]; char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE]; @@ -82,15 +82,15 @@ int main(int argc, char **argv) mbm_test = false; mba_test = false; - cqm_test = false; + cmt_test = false; cat_test = false; while (token) { if (!strncmp(token, MBM_STR, sizeof(MBM_STR))) { mbm_test = true; } else if (!strncmp(token, MBA_STR, sizeof(MBA_STR))) { mba_test = true; - } else if (!strncmp(token, CQM_STR, sizeof(CQM_STR))) { - cqm_test = true; + } else if (!strncmp(token, CMT_STR, sizeof(CMT_STR))) { + cmt_test = true; } else if (!strncmp(token, CAT_STR, sizeof(CAT_STR))) { cat_test = true; } else { @@ -178,13 +178,13 @@ int main(int argc, char **argv) tests_run++; } - if (cqm_test) { - printf("# Starting CQM test ...\n"); + if (cmt_test) { + printf("# Starting CMT test ...\n"); if (!has_ben) - sprintf(benchmark_cmd[5], "%s", CQM_STR); - res = cqm_resctrl_val(cpu_no, no_of_bits, benchmark_cmd); - printf("%sok CQM: test\n", res ? "not " : ""); - cqm_test_cleanup(); + sprintf(benchmark_cmd[5], "%s", CMT_STR); + res = cmt_resctrl_val(cpu_no, no_of_bits, benchmark_cmd); + printf("%sok CMT: test\n", res ? "not " : ""); + cmt_test_cleanup(); tests_run++; } diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index aed71fd0713b..17770095c98e 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -492,7 +492,7 @@ static int print_results_bw(char *filename, int bm_pid, float bw_imc, return 0; } -static void set_cqm_path(const char *ctrlgrp, const char *mongrp, char sock_num) +static void set_cmt_path(const char *ctrlgrp, const char *mongrp, char sock_num) { if (strlen(ctrlgrp) && strlen(mongrp)) sprintf(llc_occup_path, CON_MON_LCC_OCCUP_PATH, RESCTRL_PATH, @@ -512,7 +512,7 @@ static void set_cqm_path(const char *ctrlgrp, const char *mongrp, char sock_num) * @ctrlgrp: Name of the control monitor group (con_mon grp) * @mongrp: Name of the monitor group (mon grp) * @cpu_no: CPU number that the benchmark PID is binded to - * @resctrl_val: Resctrl feature (Eg: cat, cqm.. etc) + * @resctrl_val: Resctrl feature (Eg: cat, cmt.. 
etc) */ static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp, int cpu_no, char *resctrl_val) @@ -524,8 +524,8 @@ static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp, return; } - if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) - set_cqm_path(ctrlgrp, mongrp, resource_id); + if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) + set_cmt_path(ctrlgrp, mongrp, resource_id); } static int @@ -682,7 +682,7 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) initialize_mem_bw_resctrl(param->ctrlgrp, param->mongrp, param->cpu_no, resctrl_val); - } else if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) + } else if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) initialize_llc_occu_resctrl(param->ctrlgrp, param->mongrp, param->cpu_no, resctrl_val); @@ -721,7 +721,7 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) ret = measure_vals(param, &bw_resc_start); if (ret) break; - } else if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) { + } else if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) { ret = param->setup(1, param); if (ret) { ret = 0; diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index bc52076bee7f..b47f4f150189 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -334,7 +334,7 @@ void run_benchmark(int signum, siginfo_t *info, void *ucontext) operation = atoi(benchmark_cmd[4]); sprintf(resctrl_val, "%s", benchmark_cmd[5]); - if (strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) + if (strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) buffer_span = span * MB; else buffer_span = span; @@ -458,8 +458,8 @@ int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp, if (ret) goto out; - /* Create mon grp and write pid into it for "mbm" and "cqm" test */ - if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR)) || + /* Create mon grp and write pid into it for "mbm" and "cmt" test */ + if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR)) || !strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) { if (strlen(mongrp)) { sprintf(monitorgroup_p, "%s/mon_groups", controlgroup); @@ -507,7 +507,7 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) if (strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR)) && strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR)) && - strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) + strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) return -ENOENT; if (!schemata) { @@ -529,7 +529,7 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) sprintf(controlgroup, "%s/schemata", RESCTRL_PATH); if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR)) || - !strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) + !strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) sprintf(schema, "%s%d%c%s", "L3:", resource_id, '=', schemata); if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) sprintf(schema, "%s%d%c%s", "MB:", resource_id, '=', schemata); -- Gitee From 1daf978916a4e88999d2d257ae6f81f2cd81f56a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 22 Jun 2020 11:16:47 -0700 Subject: [PATCH 1274/2161] selftests: Add header documentation and helpers commit 245dd6041d0d923fb6563a0704749d3a7de16c56 upstream. Add "how to use this API" documentation to kselftest.h, and include some addition helpers and notes to make things easier to use. Additionally removes the incorrect "Bail out!" line from the standard exit path. 
The TAP13 specification says that "Bail out!" should be used when giving up before all tests have been run. For a "normal" execution run, the selftests should not report "Bail out!". Signed-off-by: Kees Cook Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/kselftest.h | 73 ++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 0ac49d91a260..2cd5eefed4d2 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -6,6 +6,37 @@ * Copyright (c) 2014 Shuah Khan * Copyright (c) 2014 Samsung Electronics Co., Ltd. * + * Using this API consists of first counting how many tests your code + * has to run, and then starting up the reporting: + * + * ksft_print_header(); + * ksft_set_plan(total_number_of_tests); + * + * For each test, report any progress, debugging, etc with: + * + * ksft_print_msg(fmt, ...); + * + * and finally report the pass/fail/skip/xfail state of the test with one of: + * + * ksft_test_result(condition, fmt, ...); + * ksft_test_result_pass(fmt, ...); + * ksft_test_result_fail(fmt, ...); + * ksft_test_result_skip(fmt, ...); + * ksft_test_result_xfail(fmt, ...); + * ksft_test_result_error(fmt, ...); + * + * When all tests are finished, clean up and exit the program with one of: + * + * ksft_exit(condition); + * ksft_exit_pass(); + * ksft_exit_fail(); + * + * If the program wants to report details on why the entire program has + * failed, it can instead exit with a message (this is usually done when + * the program is aborting before finishing all tests): + * + * ksft_exit_fail_msg(fmt, ...); + * */ #ifndef __KSELFTEST_H #define __KSELFTEST_H @@ -74,7 +105,7 @@ static inline void ksft_print_cnts(void) if (ksft_plan != ksft_test_num()) printf("# Planned tests != run tests (%u != %u)\n", ksft_plan, ksft_test_num()); - printf("# Pass %d Fail %d Xfail %d Xpass %d Skip %d Error %d\n", + printf("# Totals: pass:%d fail:%d xfail:%d xpass:%d skip:%d error:%d\n", ksft_cnt.ksft_pass, ksft_cnt.ksft_fail, ksft_cnt.ksft_xfail, ksft_cnt.ksft_xpass, ksft_cnt.ksft_xskip, ksft_cnt.ksft_error); @@ -120,6 +151,32 @@ static inline void ksft_test_result_fail(const char *msg, ...) va_end(args); } +/** + * ksft_test_result() - Report test success based on truth of condition + * + * @condition: if true, report test success, otherwise failure. + */ +#define ksft_test_result(condition, fmt, ...) do { \ + if (!!(condition)) \ + ksft_test_result_pass(fmt, ##__VA_ARGS__);\ + else \ + ksft_test_result_fail(fmt, ##__VA_ARGS__);\ + } while (0) + +static inline void ksft_test_result_xfail(const char *msg, ...) +{ + int saved_errno = errno; + va_list args; + + ksft_cnt.ksft_xfail++; + + va_start(args, msg); + printf("ok %d # XFAIL ", ksft_test_num()); + errno = saved_errno; + vprintf(msg, args); + va_end(args); +} + static inline void ksft_test_result_skip(const char *msg, ...) { int saved_errno = errno; @@ -134,6 +191,7 @@ static inline void ksft_test_result_skip(const char *msg, ...) va_end(args); } +/* TODO: how does "error" differ from "fail" or "skip"? */ static inline void ksft_test_result_error(const char *msg, ...) 
{ int saved_errno = errno; @@ -156,11 +214,22 @@ static inline int ksft_exit_pass(void) static inline int ksft_exit_fail(void) { - printf("Bail out!\n"); ksft_print_cnts(); exit(KSFT_FAIL); } +/** + * ksft_exit() - Exit selftest based on truth of condition + * + * @condition: if true, exit self test with success, otherwise fail. + */ +#define ksft_exit(condition) do { \ + if (!!(condition)) \ + ksft_exit_pass(); \ + else \ + ksft_exit_fail(); \ + } while (0) + static inline int ksft_exit_fail_msg(const char *msg, ...) { int saved_errno = errno; -- Gitee From b778179c964273ea5e6edf6f01b54541e4ac1f66 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:42 +0000 Subject: [PATCH 1275/2161] selftests/resctrl: Call kselftest APIs to log test results commit ca2f4214f9671dfc08b6c5723188e03574203dc5 upstream. Call kselftest APIs instead of using printf() to log test results for cleaner code and better future extension. Suggested-by: Shuah Khan Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/cat_test.c | 37 +++++++-------- tools/testing/selftests/resctrl/cmt_test.c | 42 ++++++++--------- tools/testing/selftests/resctrl/mba_test.c | 24 +++++----- tools/testing/selftests/resctrl/mbm_test.c | 28 ++++++------ tools/testing/selftests/resctrl/resctrl.h | 2 +- .../testing/selftests/resctrl/resctrl_tests.c | 40 +++++++++-------- tools/testing/selftests/resctrl/resctrl_val.c | 4 +- tools/testing/selftests/resctrl/resctrlfs.c | 45 +++++++------------ 8 files changed, 105 insertions(+), 117 deletions(-) diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 20823725daca..1daf911076c7 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -52,25 +52,28 @@ static int cat_setup(int num, ...) return ret; } -static void show_cache_info(unsigned long sum_llc_perf_miss, int no_of_bits, - unsigned long span) +static int show_cache_info(unsigned long sum_llc_perf_miss, int no_of_bits, + unsigned long span) { unsigned long allocated_cache_lines = span / 64; unsigned long avg_llc_perf_miss = 0; float diff_percent; + int ret; avg_llc_perf_miss = sum_llc_perf_miss / (NUM_OF_RUNS - 1); diff_percent = ((float)allocated_cache_lines - avg_llc_perf_miss) / allocated_cache_lines * 100; - printf("%sok CAT: cache miss rate within %d%%\n", - !is_amd && abs((int)diff_percent) > MAX_DIFF_PERCENT ? - "not " : "", MAX_DIFF_PERCENT); - tests_run++; - printf("# Percent diff=%d\n", abs((int)diff_percent)); - printf("# Number of bits: %d\n", no_of_bits); - printf("# Avg_llc_perf_miss: %lu\n", avg_llc_perf_miss); - printf("# Allocated cache lines: %lu\n", allocated_cache_lines); + ret = !is_amd && abs((int)diff_percent) > MAX_DIFF_PERCENT; + ksft_print_msg("Cache miss rate %swithin %d%%\n", + ret ? 
"not " : "", MAX_DIFF_PERCENT); + + ksft_print_msg("Percent diff=%d\n", abs((int)diff_percent)); + ksft_print_msg("Number of bits: %d\n", no_of_bits); + ksft_print_msg("Avg_llc_perf_miss: %lu\n", avg_llc_perf_miss); + ksft_print_msg("Allocated cache lines: %lu\n", allocated_cache_lines); + + return ret; } static int check_results(struct resctrl_val_param *param) @@ -80,7 +83,7 @@ static int check_results(struct resctrl_val_param *param) int runs = 0, no_of_bits = 0; FILE *fp; - printf("# Checking for pass/fail\n"); + ksft_print_msg("Checking for pass/fail\n"); fp = fopen(param->filename, "r"); if (!fp) { perror("# Cannot open file"); @@ -108,9 +111,7 @@ static int check_results(struct resctrl_val_param *param) fclose(fp); no_of_bits = count_bits(param->mask); - show_cache_info(sum_llc_perf_miss, no_of_bits, param->span); - - return 0; + return show_cache_info(sum_llc_perf_miss, no_of_bits, param->span); } void cat_test_cleanup(void) @@ -146,15 +147,15 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type) ret = get_cache_size(cpu_no, cache_type, &cache_size); if (ret) return ret; - printf("cache size :%lu\n", cache_size); + ksft_print_msg("Cache size :%lu\n", cache_size); /* Get max number of bits from default-cabm mask */ count_of_bits = count_bits(long_mask); if (n < 1 || n > count_of_bits - 1) { - printf("Invalid input value for no_of_bits n!\n"); - printf("Please Enter value in range 1 to %d\n", - count_of_bits - 1); + ksft_print_msg("Invalid input value for no_of_bits n!\n"); + ksft_print_msg("Please enter value in range 1 to %d\n", + count_of_bits - 1); return -1; } diff --git a/tools/testing/selftests/resctrl/cmt_test.c b/tools/testing/selftests/resctrl/cmt_test.c index 4b63838dda32..b1ab1bd1f74d 100644 --- a/tools/testing/selftests/resctrl/cmt_test.c +++ b/tools/testing/selftests/resctrl/cmt_test.c @@ -39,36 +39,33 @@ static int cmt_setup(int num, ...) return 0; } -static void show_cache_info(unsigned long sum_llc_occu_resc, int no_of_bits, - unsigned long span) +static int show_cache_info(unsigned long sum_llc_occu_resc, int no_of_bits, + unsigned long span) { unsigned long avg_llc_occu_resc = 0; float diff_percent; long avg_diff = 0; - bool res; + int ret; avg_llc_occu_resc = sum_llc_occu_resc / (NUM_OF_RUNS - 1); avg_diff = (long)abs(span - avg_llc_occu_resc); diff_percent = (((float)span - avg_llc_occu_resc) / span) * 100; - if ((abs((int)diff_percent) <= MAX_DIFF_PERCENT) || - (abs(avg_diff) <= MAX_DIFF)) - res = true; - else - res = false; + ret = (abs((int)diff_percent) > MAX_DIFF_PERCENT) && + (abs(avg_diff) > MAX_DIFF); - printf("%sok CMT: diff within %d, %d\%%\n", res ? "" : "not", - MAX_DIFF, (int)MAX_DIFF_PERCENT); + ksft_print_msg("%s cache miss diff within %d, %d\%%\n", + ret ? 
"Fail:" : "Pass:", MAX_DIFF, (int)MAX_DIFF_PERCENT); - printf("# diff: %ld\n", avg_diff); - printf("# percent diff=%d\n", abs((int)diff_percent)); - printf("# Results are displayed in (Bytes)\n"); - printf("# Number of bits: %d\n", no_of_bits); - printf("# Avg_llc_occu_resc: %lu\n", avg_llc_occu_resc); - printf("# llc_occu_exp (span): %lu\n", span); + ksft_print_msg("Diff: %ld\n", avg_diff); + ksft_print_msg("Percent diff=%d\n", abs((int)diff_percent)); + ksft_print_msg("Results are displayed in (Bytes)\n"); + ksft_print_msg("Number of bits: %d\n", no_of_bits); + ksft_print_msg("Avg_llc_occu_resc: %lu\n", avg_llc_occu_resc); + ksft_print_msg("llc_occu_exp (span): %lu\n", span); - tests_run++; + return ret; } static int check_results(struct resctrl_val_param *param, int no_of_bits) @@ -78,7 +75,7 @@ static int check_results(struct resctrl_val_param *param, int no_of_bits) int runs = 0; FILE *fp; - printf("# checking for pass/fail\n"); + ksft_print_msg("Checking for pass/fail\n"); fp = fopen(param->filename, "r"); if (!fp) { perror("# Error in opening file\n"); @@ -101,9 +98,8 @@ static int check_results(struct resctrl_val_param *param, int no_of_bits) runs++; } fclose(fp); - show_cache_info(sum_llc_occu_resc, no_of_bits, param->span); - return 0; + return show_cache_info(sum_llc_occu_resc, no_of_bits, param->span); } void cmt_test_cleanup(void) @@ -134,13 +130,13 @@ int cmt_resctrl_val(int cpu_no, int n, char **benchmark_cmd) ret = get_cache_size(cpu_no, "L3", &cache_size); if (ret) return ret; - printf("cache size :%lu\n", cache_size); + ksft_print_msg("Cache size :%lu\n", cache_size); count_of_bits = count_bits(long_mask); if (n < 1 || n > count_of_bits) { - printf("Invalid input value for numbr_of_bits n!\n"); - printf("Please Enter value in range 1 to %d\n", count_of_bits); + ksft_print_msg("Invalid input value for numbr_of_bits n!\n"); + ksft_print_msg("Please enter value in range 1 to %d\n", count_of_bits); return -1; } diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c index 6449fbd96096..f42d4ba70363 100644 --- a/tools/testing/selftests/resctrl/mba_test.c +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -56,7 +56,7 @@ static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) int allocation, runs; bool failed = false; - printf("# Results are displayed in (MB)\n"); + ksft_print_msg("Results are displayed in (MB)\n"); /* Memory bandwidth from 100% down to 10% */ for (allocation = 0; allocation < ALLOCATION_MAX / ALLOCATION_STEP; allocation++) { @@ -78,21 +78,21 @@ static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) avg_bw_resc = sum_bw_resc / (NUM_OF_RUNS - 1); avg_diff = labs((long)(avg_bw_resc - avg_bw_imc)); - printf("%sok MBA schemata percentage %u smaller than %d %%\n", - avg_diff > MAX_DIFF ? "not " : "", - ALLOCATION_MAX - ALLOCATION_STEP * allocation, - MAX_DIFF); - tests_run++; - printf("# avg_diff: %lu\n", avg_diff); - printf("# avg_bw_imc: %lu\n", avg_bw_imc); - printf("# avg_bw_resc: %lu\n", avg_bw_resc); + ksft_print_msg("%s MBA schemata percentage %u smaller than %d %%\n", + avg_diff > MAX_DIFF ? "Fail:" : "Pass:", + ALLOCATION_MAX - ALLOCATION_STEP * allocation, + MAX_DIFF); + ksft_print_msg("avg_diff: %lu\n", avg_diff); + ksft_print_msg("avg_bw_imc: %lu\n", avg_bw_imc); + ksft_print_msg("avg_bw_resc: %lu\n", avg_bw_resc); if (avg_diff > MAX_DIFF) failed = true; } - printf("%sok schemata change using MBA%s\n", failed ? "not " : "", - failed ? 
" # at least one test failed" : ""); - tests_run++; + ksft_print_msg("%s schemata change using MBA\n", + failed ? "Fail:" : "Pass:"); + if (failed) + ksft_print_msg("At least one test failed"); } static int check_results(void) diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c index ec6cfe01c9c2..0d65ba4b62b4 100644 --- a/tools/testing/selftests/resctrl/mbm_test.c +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -14,13 +14,13 @@ #define MAX_DIFF 300 #define NUM_OF_RUNS 5 -static void +static int show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, int span) { unsigned long avg_bw_imc = 0, avg_bw_resc = 0; unsigned long sum_bw_imc = 0, sum_bw_resc = 0; long avg_diff = 0; - int runs; + int runs, ret; /* * Discard the first value which is inaccurate due to monitoring setup @@ -35,13 +35,15 @@ show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, int span) avg_bw_resc = sum_bw_resc / 4; avg_diff = avg_bw_resc - avg_bw_imc; - printf("%sok MBM: diff within %d%%\n", - labs(avg_diff) > MAX_DIFF ? "not " : "", MAX_DIFF); - tests_run++; - printf("# avg_diff: %lu\n", labs(avg_diff)); - printf("# Span (MB): %d\n", span); - printf("# avg_bw_imc: %lu\n", avg_bw_imc); - printf("# avg_bw_resc: %lu\n", avg_bw_resc); + ret = labs(avg_diff) > MAX_DIFF; + ksft_print_msg("%s MBM: diff within %d%%\n", + ret ? "Fail:" : "Pass:", MAX_DIFF); + ksft_print_msg("avg_diff: %lu\n", labs(avg_diff)); + ksft_print_msg("Span (MB): %d\n", span); + ksft_print_msg("avg_bw_imc: %lu\n", avg_bw_imc); + ksft_print_msg("avg_bw_resc: %lu\n", avg_bw_resc); + + return ret; } static int check_results(int span) @@ -49,10 +51,10 @@ static int check_results(int span) unsigned long bw_imc[NUM_OF_RUNS], bw_resc[NUM_OF_RUNS]; char temp[1024], *token_array[8]; char output[] = RESULT_FILE_NAME; - int runs; + int runs, ret; FILE *fp; - printf("# Checking for pass/fail\n"); + ksft_print_msg("Checking for pass/fail\n"); fp = fopen(output, "r"); if (!fp) { @@ -76,11 +78,11 @@ static int check_results(int span) runs++; } - show_bw_info(bw_imc, bw_resc, span); + ret = show_bw_info(bw_imc, bw_resc, span); fclose(fp); - return 0; + return ret; } static int mbm_setup(int num, ...) 
diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 1a58767a0bd2..ebf88217f9de 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -23,6 +23,7 @@ #include #include #include +#include "../kselftest.h" #define MB (1024 * 1024) #define RESCTRL_PATH "/sys/fs/resctrl" @@ -68,7 +69,6 @@ struct resctrl_val_param { #define CAT_STR "cat" extern pid_t bm_pid, ppid; -extern int tests_run; extern char llc_occup_path[1024]; extern bool is_amd; diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 6b2ed2efaad4..ccc1d6987cc6 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -60,7 +60,7 @@ int main(int argc, char **argv) int res, c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 5; char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64]; char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE]; - int ben_ind, ben_count; + int ben_ind, ben_count, tests = 0; bool cat_test = true; for (i = 0; i < argc; i++) { @@ -87,12 +87,16 @@ int main(int argc, char **argv) while (token) { if (!strncmp(token, MBM_STR, sizeof(MBM_STR))) { mbm_test = true; + tests++; } else if (!strncmp(token, MBA_STR, sizeof(MBA_STR))) { mba_test = true; + tests++; } else if (!strncmp(token, CMT_STR, sizeof(CMT_STR))) { cmt_test = true; + tests++; } else if (!strncmp(token, CAT_STR, sizeof(CAT_STR))) { cat_test = true; + tests++; } else { printf("invalid argument\n"); @@ -118,7 +122,7 @@ int main(int argc, char **argv) } } - printf("TAP version 13\n"); + ksft_print_header(); /* * Typically we need root privileges, because: @@ -126,7 +130,7 @@ int main(int argc, char **argv) * 2. We execute perf commands */ if (geteuid() != 0) - printf("# WARNING: not running as root, tests may fail.\n"); + return ksft_exit_fail_msg("Not running as root, abort testing.\n"); /* Detect AMD vendor */ detect_amd(); @@ -155,48 +159,46 @@ int main(int argc, char **argv) sprintf(bw_report, "reads"); sprintf(bm_type, "fill_buf"); - check_resctrlfs_support(); + if (!check_resctrlfs_support()) + return ksft_exit_fail_msg("resctrl FS does not exist\n"); + filter_dmesg(); + ksft_set_plan(tests ? : 4); + if (!is_amd && mbm_test) { - printf("# Starting MBM BW change ...\n"); + ksft_print_msg("Starting MBM BW change ...\n"); if (!has_ben) sprintf(benchmark_cmd[5], "%s", MBA_STR); res = mbm_bw_change(span, cpu_no, bw_report, benchmark_cmd); - printf("%sok MBM: bw change\n", res ? "not " : ""); + ksft_test_result(!res, "MBM: bw change\n"); mbm_test_cleanup(); - tests_run++; } if (!is_amd && mba_test) { - printf("# Starting MBA Schemata change ...\n"); + ksft_print_msg("Starting MBA Schemata change ...\n"); if (!has_ben) sprintf(benchmark_cmd[1], "%d", span); res = mba_schemata_change(cpu_no, bw_report, benchmark_cmd); - printf("%sok MBA: schemata change\n", res ? "not " : ""); + ksft_test_result(!res, "MBA: schemata change\n"); mba_test_cleanup(); - tests_run++; } if (cmt_test) { - printf("# Starting CMT test ...\n"); + ksft_print_msg("Starting CMT test ...\n"); if (!has_ben) sprintf(benchmark_cmd[5], "%s", CMT_STR); res = cmt_resctrl_val(cpu_no, no_of_bits, benchmark_cmd); - printf("%sok CMT: test\n", res ? 
"not " : ""); + ksft_test_result(!res, "CMT: test\n"); cmt_test_cleanup(); - tests_run++; } if (cat_test) { - printf("# Starting CAT test ...\n"); + ksft_print_msg("Starting CAT test ...\n"); res = cat_perf_miss_val(cpu_no, no_of_bits, "L3"); - printf("%sok CAT: test\n", res ? "not " : ""); - tests_run++; + ksft_test_result(!res, "CAT: test\n"); cat_test_cleanup(); } - printf("1..%d\n", tests_run); - - return 0; + return ksft_exit_pass(); } diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index 17770095c98e..5dfae51133bc 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -449,7 +449,7 @@ void ctrlc_handler(int signum, siginfo_t *info, void *ptr) kill(bm_pid, SIGKILL); umount_resctrlfs(); tests_cleanup(); - printf("Ending\n\n"); + ksft_print_msg("Ending\n\n"); exit(EXIT_SUCCESS); } @@ -645,7 +645,7 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) PARENT_EXIT("Child is done"); } - printf("# benchmark PID: %d\n", bm_pid); + ksft_print_msg("Benchmark PID: %d\n", bm_pid); /* * Register CTRL-C handler for parent, as it has to kill benchmark diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index b47f4f150189..6b22a186790a 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -10,8 +10,6 @@ */ #include "resctrl.h" -int tests_run; - static int find_resctrl_mount(char *buffer) { FILE *mounts; @@ -68,23 +66,17 @@ int remount_resctrlfs(bool mum_resctrlfs) if (ret) strcpy(mountpoint, RESCTRL_PATH); - if (!ret && mum_resctrlfs && umount(mountpoint)) { - printf("not ok unmounting \"%s\"\n", mountpoint); - perror("# umount"); - tests_run++; - } + if (!ret && mum_resctrlfs && umount(mountpoint)) + ksft_print_msg("Fail: unmounting \"%s\"\n", mountpoint); if (!ret && !mum_resctrlfs) return 0; + ksft_print_msg("Mounting resctrl to \"%s\"\n", RESCTRL_PATH); ret = mount("resctrl", RESCTRL_PATH, "resctrl", 0, NULL); - printf("%sok mounting resctrl to \"%s\"\n", ret ? "not " : "", - RESCTRL_PATH); if (ret) perror("# mount"); - tests_run++; - return ret; } @@ -477,13 +469,10 @@ int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp, } out: - printf("%sok writing benchmark parameters to resctrl FS\n", - ret ? "not " : ""); + ksft_print_msg("Writing benchmark parameters to resctrl FS\n"); if (ret) perror("# writing to resctrlfs"); - tests_run++; - return ret; } @@ -511,7 +500,7 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) return -ENOENT; if (!schemata) { - printf("# Skipping empty schemata update\n"); + ksft_print_msg("Skipping empty schemata update\n"); return -1; } @@ -552,10 +541,9 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) fclose(fp); out: - printf("%sok Write schema \"%s\" to resctrl FS%s%s\n", - ret ? "not " : "", schema, ret ? " # " : "", - ret ? reason : ""); - tests_run++; + ksft_print_msg("Write schema \"%s\" to resctrl FS%s%s\n", + schema, ret ? " # " : "", + ret ? reason : ""); return ret; } @@ -579,18 +567,17 @@ bool check_resctrlfs_support(void) fclose(inf); - printf("%sok kernel supports resctrl filesystem\n", ret ? "" : "not "); - tests_run++; + ksft_print_msg("%s kernel supports resctrl filesystem\n", + ret ? "Pass:" : "Fail:"); dp = opendir(RESCTRL_PATH); - printf("%sok resctrl mountpoint \"%s\" exists\n", - dp ? 
"" : "not ", RESCTRL_PATH); + ksft_print_msg("%s resctrl mountpoint \"%s\" exists\n", + dp ? "Pass:" : "Fail:", RESCTRL_PATH); if (dp) closedir(dp); - tests_run++; - printf("# resctrl filesystem %s mounted\n", - find_resctrl_mount(NULL) ? "not" : "is"); + ksft_print_msg("resctrl filesystem %s mounted\n", + find_resctrl_mount(NULL) ? "not" : "is"); return ret; } @@ -672,9 +659,9 @@ int filter_dmesg(void) while (fgets(line, 1024, fp)) { if (strstr(line, "intel_rdt:")) - printf("# dmesg: %s", line); + ksft_print_msg("dmesg: %s", line); if (strstr(line, "resctrl:")) - printf("# dmesg: %s", line); + ksft_print_msg("dmesg: %s", line); } fclose(fp); waitpid(pid, NULL, 0); -- Gitee From ff5cacb50640df7490018d9bea4282408707dbf2 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:43 +0000 Subject: [PATCH 1276/2161] selftests/resctrl: Share show_cache_info() by CAT and CMT tests commit 03216ed7bb4de8ce707eb4de23a08516a542770f upstream. show_cache_info() functions are defined separately in CAT and CMT tests. But the functions are same for the tests and unnecessary to be defined separately. Share the function by the tests. Suggested-by: Shuah Khan Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/cache.c | 42 ++++++++++++++++++++++ tools/testing/selftests/resctrl/cat_test.c | 28 ++------------- tools/testing/selftests/resctrl/cmt_test.c | 33 ++--------------- tools/testing/selftests/resctrl/resctrl.h | 4 +++ 4 files changed, 52 insertions(+), 55 deletions(-) diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c index 2aa1b5c7d9e1..362e3a418caa 100644 --- a/tools/testing/selftests/resctrl/cache.c +++ b/tools/testing/selftests/resctrl/cache.c @@ -270,3 +270,45 @@ int cat_val(struct resctrl_val_param *param) return ret; } + +/* + * show_cache_info: show cache test result information + * @sum_llc_val: sum of LLC cache result data + * @no_of_bits: number of bits + * @cache_span: cache span in bytes for CMT or in lines for CAT + * @max_diff: max difference + * @max_diff_percent: max difference percentage + * @num_of_runs: number of runs + * @platform: show test information on this platform + * @cmt: CMT test or CAT test + * + * Return: 0 on success. non-zero on failure. + */ +int show_cache_info(unsigned long sum_llc_val, int no_of_bits, + unsigned long cache_span, unsigned long max_diff, + unsigned long max_diff_percent, unsigned long num_of_runs, + bool platform, bool cmt) +{ + unsigned long avg_llc_val = 0; + float diff_percent; + long avg_diff = 0; + int ret; + + avg_llc_val = sum_llc_val / (num_of_runs - 1); + avg_diff = (long)abs(cache_span - avg_llc_val); + diff_percent = ((float)cache_span - avg_llc_val) / cache_span * 100; + + ret = platform && abs((int)diff_percent) > max_diff_percent && + (cmt ? (abs(avg_diff) > max_diff) : true); + + ksft_print_msg("%s cache miss rate within %d%%\n", + ret ? "Fail:" : "Pass:", max_diff_percent); + + ksft_print_msg("Percent diff=%d\n", abs((int)diff_percent)); + ksft_print_msg("Number of bits: %d\n", no_of_bits); + ksft_print_msg("Average LLC val: %lu\n", avg_llc_val); + ksft_print_msg("Cache span (%s): %lu\n", cmt ? 
"bytes" : "lines", + cache_span); + + return ret; +} diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 1daf911076c7..090d3afc7a78 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -52,30 +52,6 @@ static int cat_setup(int num, ...) return ret; } -static int show_cache_info(unsigned long sum_llc_perf_miss, int no_of_bits, - unsigned long span) -{ - unsigned long allocated_cache_lines = span / 64; - unsigned long avg_llc_perf_miss = 0; - float diff_percent; - int ret; - - avg_llc_perf_miss = sum_llc_perf_miss / (NUM_OF_RUNS - 1); - diff_percent = ((float)allocated_cache_lines - avg_llc_perf_miss) / - allocated_cache_lines * 100; - - ret = !is_amd && abs((int)diff_percent) > MAX_DIFF_PERCENT; - ksft_print_msg("Cache miss rate %swithin %d%%\n", - ret ? "not " : "", MAX_DIFF_PERCENT); - - ksft_print_msg("Percent diff=%d\n", abs((int)diff_percent)); - ksft_print_msg("Number of bits: %d\n", no_of_bits); - ksft_print_msg("Avg_llc_perf_miss: %lu\n", avg_llc_perf_miss); - ksft_print_msg("Allocated cache lines: %lu\n", allocated_cache_lines); - - return ret; -} - static int check_results(struct resctrl_val_param *param) { char *token_array[8], temp[512]; @@ -111,7 +87,9 @@ static int check_results(struct resctrl_val_param *param) fclose(fp); no_of_bits = count_bits(param->mask); - return show_cache_info(sum_llc_perf_miss, no_of_bits, param->span); + return show_cache_info(sum_llc_perf_miss, no_of_bits, param->span / 64, + MAX_DIFF, MAX_DIFF_PERCENT, NUM_OF_RUNS, + !is_amd, false); } void cat_test_cleanup(void) diff --git a/tools/testing/selftests/resctrl/cmt_test.c b/tools/testing/selftests/resctrl/cmt_test.c index b1ab1bd1f74d..8968e36db99d 100644 --- a/tools/testing/selftests/resctrl/cmt_test.c +++ b/tools/testing/selftests/resctrl/cmt_test.c @@ -39,35 +39,6 @@ static int cmt_setup(int num, ...) return 0; } -static int show_cache_info(unsigned long sum_llc_occu_resc, int no_of_bits, - unsigned long span) -{ - unsigned long avg_llc_occu_resc = 0; - float diff_percent; - long avg_diff = 0; - int ret; - - avg_llc_occu_resc = sum_llc_occu_resc / (NUM_OF_RUNS - 1); - avg_diff = (long)abs(span - avg_llc_occu_resc); - - diff_percent = (((float)span - avg_llc_occu_resc) / span) * 100; - - ret = (abs((int)diff_percent) > MAX_DIFF_PERCENT) && - (abs(avg_diff) > MAX_DIFF); - - ksft_print_msg("%s cache miss diff within %d, %d\%%\n", - ret ? 
"Fail:" : "Pass:", MAX_DIFF, (int)MAX_DIFF_PERCENT); - - ksft_print_msg("Diff: %ld\n", avg_diff); - ksft_print_msg("Percent diff=%d\n", abs((int)diff_percent)); - ksft_print_msg("Results are displayed in (Bytes)\n"); - ksft_print_msg("Number of bits: %d\n", no_of_bits); - ksft_print_msg("Avg_llc_occu_resc: %lu\n", avg_llc_occu_resc); - ksft_print_msg("llc_occu_exp (span): %lu\n", span); - - return ret; -} - static int check_results(struct resctrl_val_param *param, int no_of_bits) { char *token_array[8], temp[512]; @@ -99,7 +70,9 @@ static int check_results(struct resctrl_val_param *param, int no_of_bits) } fclose(fp); - return show_cache_info(sum_llc_occu_resc, no_of_bits, param->span); + return show_cache_info(sum_llc_occu_resc, no_of_bits, param->span, + MAX_DIFF, MAX_DIFF_PERCENT, NUM_OF_RUNS, + true, true); } void cmt_test_cleanup(void) diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index ebf88217f9de..81f322245ef7 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -108,5 +108,9 @@ unsigned int count_bits(unsigned long n); void cmt_test_cleanup(void); int get_core_sibling(int cpu_no); int measure_cache_vals(struct resctrl_val_param *param, int bm_pid); +int show_cache_info(unsigned long sum_llc_val, int no_of_bits, + unsigned long cache_span, unsigned long max_diff, + unsigned long max_diff_percent, unsigned long num_of_runs, + bool platform, bool cmt); #endif /* RESCTRL_H */ -- Gitee From 67f9ce2430231e5a16b0c25fcfaab2c8ba02af39 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Wed, 17 Mar 2021 02:22:44 +0000 Subject: [PATCH 1277/2161] selftests/resctrl: Fix a printed message commit f29838e6fa131f4a323225457112fb869d15931b upstream. Add a missing newline to the printed help text to improve readability. Tested-by: Babu Moger Signed-off-by: Reinette Chatre Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/resctrl_tests.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index ccc1d6987cc6..355bd28b996a 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -37,8 +37,8 @@ void detect_amd(void) static void cmd_help(void) { printf("usage: resctrl_tests [-h] [-b \"benchmark_cmd [options]\"] [-t test list] [-n no_of_bits]\n"); - printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CMT"); - printf("\t default benchmark is builtin fill_buf\n"); + printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CMT\n"); + printf("\t default benchmark is builtin fill_buf\n"); printf("\t-t test list: run tests specified in the test list, "); printf("e.g. -t mbm, mba, cmt, cat\n"); printf("\t-n no_of_bits: run cache tests using specified no of bits in cache bit mask\n"); -- Gitee From d4d9495efb6d9a50e1cb5bc87ebf6c2d3023b678 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:45 +0000 Subject: [PATCH 1278/2161] selftests/resctrl: Add config dependencies commit b67a7665a917e7305eaa573a474c859fe4c5949e upstream. Add the config file for test dependencies. 
Suggested-by: Shuah Khan Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/config | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 tools/testing/selftests/resctrl/config diff --git a/tools/testing/selftests/resctrl/config b/tools/testing/selftests/resctrl/config new file mode 100644 index 000000000000..8d9f2deb56ed --- /dev/null +++ b/tools/testing/selftests/resctrl/config @@ -0,0 +1,2 @@ +CONFIG_X86_CPU_RESCTRL=y +CONFIG_PROC_CPU_RESCTRL=y -- Gitee From a4481ebe24e5a1395ef83b9cc36fd854d959ba15 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:46 +0000 Subject: [PATCH 1279/2161] selftests/resctrl: Check for resctrl mount point only if resctrl FS is supported commit a3611fbc6e58c147bdd409b356baf15ddf57271e upstream. check_resctrlfs_support() does the following 1. Checks if the platform supports resctrl file system or not by looking for resctrl in /proc/filesystems 2. Calls opendir() on default resctrl file system path (i.e. /sys/fs/resctrl) 3. Checks if resctrl file system is mounted or not by looking at /proc/mounts Steps 2 and 3 will fail if the platform does not support resctrl file system. So, there is no need to check for them if step 1 fails. Fix this by returning immediately if the platform does not support resctrl file system. Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/resctrlfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 6b22a186790a..87195eb78356 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -570,6 +570,9 @@ bool check_resctrlfs_support(void) ksft_print_msg("%s kernel supports resctrl filesystem\n", ret ? "Pass:" : "Fail:"); + if (!ret) + return ret; + dp = opendir(RESCTRL_PATH); ksft_print_msg("%s resctrl mountpoint \"%s\" exists\n", dp ? "Pass:" : "Fail:", RESCTRL_PATH); -- Gitee From 4930abb16481031ffcdabf04d8200dfd6c0be591 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:47 +0000 Subject: [PATCH 1280/2161] selftests/resctrl: Use resctrl/info for feature detection commit ee0415681eb661efa1eb2db7acc263f2c7df1e23 upstream. Resctrl test suite before running any unit test (like cmt, cat, mbm and mba) should first check if the feature is enabled (by kernel and not just supported by H/W) on the platform or not. validate_resctrl_feature_request() is supposed to do that. This function intends to grep for relevant flags in /proc/cpuinfo but there are several issues here 1. validate_resctrl_feature_request() calls fgrep() to get flags from /proc/cpuinfo. But, fgrep() can only return a string with maximum of 255 characters and hence the complete cpu flags are never returned. 2. The substring search logic is also busted. If strstr() finds requested resctrl feature in the cpu flags, it returns pointer to the first occurrence. But, the logic negates the return value of strstr() and hence validate_resctrl_feature_request() returns false if the feature is present in the cpu flags and returns true if the feature is not present. 3. validate_resctrl_feature_request() checks if a resctrl feature is reported in /proc/cpuinfo flags or not. Having a cpu flag means that the H/W supports the feature, but it doesn't mean that the kernel enabled it. 
A user could selectively enable only a subset of resctrl features using kernel command line arguments. Hence, /proc/cpuinfo isn't a reliable source to check if a feature is enabled or not. The 3rd issue being the major one and fixing it requires changing the way validate_resctrl_feature_request() works. Since, /proc/cpuinfo isn't the right place to check if a resctrl feature is enabled or not, a more appropriate place is /sys/fs/resctrl/info directory. Change validate_resctrl_feature_request() such that, 1. For cat, check if /sys/fs/resctrl/info/L3 directory is present or not 2. For mba, check if /sys/fs/resctrl/info/MB directory is present or not 3. For cmt, check if /sys/fs/resctrl/info/L3_MON directory is present and check if /sys/fs/resctrl/info/L3_MON/mon_features has llc_occupancy 4. For mbm, check if /sys/fs/resctrl/info/L3_MON directory is present and check if /sys/fs/resctrl/info/L3_MON/mon_features has mbm__bytes Please note that only L3_CAT, L3_CMT, MBA and MBM are supported. CDP and L2 variants can be added later. Reported-by: Reinette Chatre Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/resctrl.h | 6 ++- tools/testing/selftests/resctrl/resctrlfs.c | 52 ++++++++++++++++----- 2 files changed, 46 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 81f322245ef7..1ad10c47e31d 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -29,6 +29,10 @@ #define RESCTRL_PATH "/sys/fs/resctrl" #define PHYS_ID_PATH "/sys/devices/system/cpu/cpu" #define CBM_MASK_PATH "/sys/fs/resctrl/info" +#define L3_PATH "/sys/fs/resctrl/info/L3" +#define MB_PATH "/sys/fs/resctrl/info/MB" +#define L3_MON_PATH "/sys/fs/resctrl/info/L3_MON" +#define L3_MON_FEATURES_PATH "/sys/fs/resctrl/info/L3_MON/mon_features" #define PARENT_EXIT(err_msg) \ do { \ @@ -79,7 +83,7 @@ int remount_resctrlfs(bool mum_resctrlfs); int get_resource_id(int cpu_no, int *resource_id); int umount_resctrlfs(void); int validate_bw_report_request(char *bw_report); -bool validate_resctrl_feature_request(char *resctrl_val); +bool validate_resctrl_feature_request(const char *resctrl_val); char *fgrep(FILE *inf, const char *str); int taskset_benchmark(pid_t bm_pid, int cpu_no); void run_benchmark(int signum, siginfo_t *info, void *ucontext); diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 87195eb78356..26563175acf6 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -606,26 +606,56 @@ char *fgrep(FILE *inf, const char *str) * validate_resctrl_feature_request - Check if requested feature is valid. 
* @resctrl_val: Requested feature * - * Return: 0 on success, non-zero on failure + * Return: True if the feature is supported, else false */ -bool validate_resctrl_feature_request(char *resctrl_val) +bool validate_resctrl_feature_request(const char *resctrl_val) { - FILE *inf = fopen("/proc/cpuinfo", "r"); + struct stat statbuf; bool found = false; char *res; + FILE *inf; - if (!inf) + if (!resctrl_val) return false; - res = fgrep(inf, "flags"); - - if (res) { - char *s = strchr(res, ':'); + if (remount_resctrlfs(false)) + return false; - found = s && !strstr(s, resctrl_val); - free(res); + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) { + if (!stat(L3_PATH, &statbuf)) + return true; + } else if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) { + if (!stat(MB_PATH, &statbuf)) + return true; + } else if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) || + !strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) { + if (!stat(L3_MON_PATH, &statbuf)) { + inf = fopen(L3_MON_FEATURES_PATH, "r"); + if (!inf) + return false; + + if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) { + res = fgrep(inf, "llc_occupancy"); + if (res) { + found = true; + free(res); + } + } + + if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) { + res = fgrep(inf, "mbm_total_bytes"); + if (res) { + free(res); + res = fgrep(inf, "mbm_local_bytes"); + if (res) { + found = true; + free(res); + } + } + } + fclose(inf); + } } - fclose(inf); return found; } -- Gitee From ab9f28c5dd8c914bf4bd963c78fbc5c42101a77e Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:48 +0000 Subject: [PATCH 1281/2161] selftests/resctrl: Fix MBA/MBM results reporting format commit 06bd03a57f8c2e3a8698a7ce7dead4ef18e00902 upstream. MBM unit test starts fill_buf (default built-in benchmark) in a new con_mon group (c1, m1) and records resctrl reported mbm values and iMC (Integrated Memory Controller) values every second. It does this for five seconds (randomly chosen value) in total. It then calculates average of resctrl_mbm values and imc_mbm values and if the difference is greater than 300 MB/sec (randomly chosen value), the test treats it as a failure. MBA unit test is similar to MBM but after every run it changes schemata. Checking for a difference of 300 MB/sec doesn't look very meaningful when the mbm values are changing over a wide range. For example, below are the values running MBA test on SKL with different allocations 1. With 10% as schemata both iMC and resctrl mbm_values are around 2000 MB/sec 2. With 100% as schemata both iMC and resctrl mbm_values are around 10000 MB/sec A 300 MB/sec difference between resctrl_mbm and imc_mbm values is acceptable at 100% schemata but it isn't acceptable at 10% schemata because that's a huge difference. So, fix this by checking for percentage difference instead of absolute difference i.e. check if the difference between resctrl_mbm value and imc_mbm value is within 5% (randomly chosen value) of imc_mbm value. If the difference is greater than 5% of imc_mbm value, treat it is a failure. 
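As a standalone illustration of the percentage check described above (not the selftest's own code), the sketch below compares two bandwidth readings the same way the patch does: the absolute difference is taken relative to the iMC value and rejected if it exceeds 5%. The helper name and the sample bandwidth figures are made up for illustration; the real tests apply this to averages taken over NUM_OF_RUNS - 1 runs.

#include <stdio.h>
#include <stdlib.h>

#define MAX_DIFF_PERCENT	5

/* Return 1 if bw_resc is within MAX_DIFF_PERCENT of bw_imc, else 0. */
static int bw_within_tolerance(unsigned long bw_resc, unsigned long bw_imc)
{
	long delta = (long)bw_resc - (long)bw_imc;
	int diff_per = (int)((float)labs(delta) / bw_imc * 100);

	return diff_per <= MAX_DIFF_PERCENT;
}

int main(void)
{
	/* 300 MB/sec off at ~2000 MB/sec is 15%: fail */
	printf("2300 vs 2000: %s\n",
	       bw_within_tolerance(2300, 2000) ? "pass" : "fail");
	/* the same 300 MB/sec off at ~10000 MB/sec is only 3%: pass */
	printf("10300 vs 10000: %s\n",
	       bw_within_tolerance(10300, 10000) ? "pass" : "fail");
	return 0;
}

With the old absolute threshold of 300 MB/sec both of these cases would have been judged the same way, which is exactly the problem the patch addresses.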
Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/mba_test.c | 22 +++++++++++++--------- tools/testing/selftests/resctrl/mbm_test.c | 15 ++++++++------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c index f42d4ba70363..8842d379e886 100644 --- a/tools/testing/selftests/resctrl/mba_test.c +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -12,7 +12,7 @@ #define RESULT_FILE_NAME "result_mba" #define NUM_OF_RUNS 5 -#define MAX_DIFF 300 +#define MAX_DIFF_PERCENT 5 #define ALLOCATION_MAX 100 #define ALLOCATION_MIN 10 #define ALLOCATION_STEP 10 @@ -62,7 +62,8 @@ static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) allocation++) { unsigned long avg_bw_imc, avg_bw_resc; unsigned long sum_bw_imc = 0, sum_bw_resc = 0; - unsigned long avg_diff; + int avg_diff_per; + float avg_diff; /* * The first run is discarded due to inaccurate value from @@ -76,16 +77,19 @@ static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) avg_bw_imc = sum_bw_imc / (NUM_OF_RUNS - 1); avg_bw_resc = sum_bw_resc / (NUM_OF_RUNS - 1); - avg_diff = labs((long)(avg_bw_resc - avg_bw_imc)); + avg_diff = (float)labs(avg_bw_resc - avg_bw_imc) / avg_bw_imc; + avg_diff_per = (int)(avg_diff * 100); - ksft_print_msg("%s MBA schemata percentage %u smaller than %d %%\n", - avg_diff > MAX_DIFF ? "Fail:" : "Pass:", - ALLOCATION_MAX - ALLOCATION_STEP * allocation, - MAX_DIFF); - ksft_print_msg("avg_diff: %lu\n", avg_diff); + ksft_print_msg("%s MBA: diff within %d%% for schemata %u\n", + avg_diff_per > MAX_DIFF_PERCENT ? + "Fail:" : "Pass:", + MAX_DIFF_PERCENT, + ALLOCATION_MAX - ALLOCATION_STEP * allocation); + + ksft_print_msg("avg_diff_per: %d%%\n", avg_diff_per); ksft_print_msg("avg_bw_imc: %lu\n", avg_bw_imc); ksft_print_msg("avg_bw_resc: %lu\n", avg_bw_resc); - if (avg_diff > MAX_DIFF) + if (avg_diff_per > MAX_DIFF_PERCENT) failed = true; } diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c index 0d65ba4b62b4..651d4ac15986 100644 --- a/tools/testing/selftests/resctrl/mbm_test.c +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -11,7 +11,7 @@ #include "resctrl.h" #define RESULT_FILE_NAME "result_mbm" -#define MAX_DIFF 300 +#define MAX_DIFF_PERCENT 5 #define NUM_OF_RUNS 5 static int @@ -19,8 +19,8 @@ show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, int span) { unsigned long avg_bw_imc = 0, avg_bw_resc = 0; unsigned long sum_bw_imc = 0, sum_bw_resc = 0; - long avg_diff = 0; - int runs, ret; + int runs, ret, avg_diff_per; + float avg_diff = 0; /* * Discard the first value which is inaccurate due to monitoring setup @@ -33,12 +33,13 @@ show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, int span) avg_bw_imc = sum_bw_imc / 4; avg_bw_resc = sum_bw_resc / 4; - avg_diff = avg_bw_resc - avg_bw_imc; + avg_diff = (float)labs(avg_bw_resc - avg_bw_imc) / avg_bw_imc; + avg_diff_per = (int)(avg_diff * 100); - ret = labs(avg_diff) > MAX_DIFF; + ret = avg_diff_per > MAX_DIFF_PERCENT; ksft_print_msg("%s MBM: diff within %d%%\n", - ret ? "Fail:" : "Pass:", MAX_DIFF); - ksft_print_msg("avg_diff: %lu\n", labs(avg_diff)); + ret ? 
"Fail:" : "Pass:", MAX_DIFF_PERCENT); + ksft_print_msg("avg_diff_per: %d%%\n", avg_diff_per); ksft_print_msg("Span (MB): %d\n", span); ksft_print_msg("avg_bw_imc: %lu\n", avg_bw_imc); ksft_print_msg("avg_bw_resc: %lu\n", avg_bw_resc); -- Gitee From bdc4fc0f35c37da32c56bc89dd6e5788895924d4 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:49 +0000 Subject: [PATCH 1282/2161] selftests/resctrl: Don't hard code value of "no_of_bits" variable commit 09a67934625a5941737c566b48e4e574ac4d1d99 upstream. Cache related tests (like CAT and CMT) depend on a variable called no_of_bits to run. no_of_bits defines the number of contiguous bits that should be set in the CBM mask and a user can pass a value for no_of_bits using -n command line argument. If a user hasn't passed any value, it defaults to 5 (randomly chosen value). Hard coding no_of_bits to 5 will make the cache tests fail to run on systems that support maximum cbm mask that is less than or equal to 5 bits. Hence, don't hard code no_of_bits value. If a user passes a value for "no_of_bits" using -n option, use it. Otherwise, no_of_bits is equal to half of the maximum number of bits in the cbm mask. Please note that CMT test is still hard coded to 5 bits. It will change in subsequent patches that change CMT test. Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/cat_test.c | 5 ++++- tools/testing/selftests/resctrl/resctrl_tests.c | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 090d3afc7a78..04d706b4f10e 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -130,7 +130,10 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type) /* Get max number of bits from default-cabm mask */ count_of_bits = count_bits(long_mask); - if (n < 1 || n > count_of_bits - 1) { + if (!n) + n = count_of_bits / 2; + + if (n > count_of_bits - 1) { ksft_print_msg("Invalid input value for no_of_bits n!\n"); ksft_print_msg("Please enter value in range 1 to %d\n", count_of_bits - 1); diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 355bd28b996a..2ace464b96d1 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -57,7 +57,7 @@ void tests_cleanup(void) int main(int argc, char **argv) { bool has_ben = false, mbm_test = true, mba_test = true, cmt_test = true; - int res, c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 5; + int res, c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 0; char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64]; char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE]; int ben_ind, ben_count, tests = 0; @@ -110,6 +110,10 @@ int main(int argc, char **argv) break; case 'n': no_of_bits = atoi(optarg); + if (no_of_bits <= 0) { + printf("Bail out! 
invalid argument for no_of_bits\n"); + return -1; + } break; case 'h': cmd_help(); @@ -188,7 +192,7 @@ int main(int argc, char **argv) ksft_print_msg("Starting CMT test ...\n"); if (!has_ben) sprintf(benchmark_cmd[5], "%s", CMT_STR); - res = cmt_resctrl_val(cpu_no, no_of_bits, benchmark_cmd); + res = cmt_resctrl_val(cpu_no, 5, benchmark_cmd); ksft_test_result(!res, "CMT: test\n"); cmt_test_cleanup(); } -- Gitee From 7b6a8a7dfb0bfb406da3324f7e7f97708dfdb90d Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:50 +0000 Subject: [PATCH 1283/2161] selftests/resctrl: Modularize resctrl test suite main() function commit c9fb4e7cee1ebf38257c93f7f5c8915a1424611e upstream. Resctrl test suite main() function does the following things 1. Parses command line arguments passed by user 2. Some setup checks 3. Logic that calls into each unit test 4. Print result and clean up after running each unit test Introduce wrapper functions for steps 3 and 4 to modularize the main() function. Adding these wrapper functions makes it easier to add any logic to each individual test. Please note that this is a preparatory patch for the next one and no functional changes are intended. Suggested-by: Reinette Chatre Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../testing/selftests/resctrl/resctrl_tests.c | 88 ++++++++++++------- 1 file changed, 57 insertions(+), 31 deletions(-) diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 2ace464b96d1..e63e0d8764ef 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -54,10 +54,58 @@ void tests_cleanup(void) cat_test_cleanup(); } +static void run_mbm_test(bool has_ben, char **benchmark_cmd, int span, + int cpu_no, char *bw_report) +{ + int res; + + ksft_print_msg("Starting MBM BW change ...\n"); + if (!has_ben) + sprintf(benchmark_cmd[5], "%s", MBA_STR); + res = mbm_bw_change(span, cpu_no, bw_report, benchmark_cmd); + ksft_test_result(!res, "MBM: bw change\n"); + mbm_test_cleanup(); +} + +static void run_mba_test(bool has_ben, char **benchmark_cmd, int span, + int cpu_no, char *bw_report) +{ + int res; + + ksft_print_msg("Starting MBA Schemata change ...\n"); + if (!has_ben) + sprintf(benchmark_cmd[1], "%d", span); + res = mba_schemata_change(cpu_no, bw_report, benchmark_cmd); + ksft_test_result(!res, "MBA: schemata change\n"); + mba_test_cleanup(); +} + +static void run_cmt_test(bool has_ben, char **benchmark_cmd, int cpu_no) +{ + int res; + + ksft_print_msg("Starting CMT test ...\n"); + if (!has_ben) + sprintf(benchmark_cmd[5], "%s", CMT_STR); + res = cmt_resctrl_val(cpu_no, 5, benchmark_cmd); + ksft_test_result(!res, "CMT: test\n"); + cmt_test_cleanup(); +} + +static void run_cat_test(int cpu_no, int no_of_bits) +{ + int res; + + ksft_print_msg("Starting CAT test ...\n"); + res = cat_perf_miss_val(cpu_no, no_of_bits, "L3"); + ksft_test_result(!res, "CAT: test\n"); + cat_test_cleanup(); +} + int main(int argc, char **argv) { bool has_ben = false, mbm_test = true, mba_test = true, cmt_test = true; - int res, c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 0; + int c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 0; char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64]; char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE]; int ben_ind, ben_count, tests = 0; @@ -170,39 +218,17 @@ int main(int argc, char **argv) 
ksft_set_plan(tests ? : 4); - if (!is_amd && mbm_test) { - ksft_print_msg("Starting MBM BW change ...\n"); - if (!has_ben) - sprintf(benchmark_cmd[5], "%s", MBA_STR); - res = mbm_bw_change(span, cpu_no, bw_report, benchmark_cmd); - ksft_test_result(!res, "MBM: bw change\n"); - mbm_test_cleanup(); - } + if (!is_amd && mbm_test) + run_mbm_test(has_ben, benchmark_cmd, span, cpu_no, bw_report); - if (!is_amd && mba_test) { - ksft_print_msg("Starting MBA Schemata change ...\n"); - if (!has_ben) - sprintf(benchmark_cmd[1], "%d", span); - res = mba_schemata_change(cpu_no, bw_report, benchmark_cmd); - ksft_test_result(!res, "MBA: schemata change\n"); - mba_test_cleanup(); - } + if (!is_amd && mba_test) + run_mba_test(has_ben, benchmark_cmd, span, cpu_no, bw_report); - if (cmt_test) { - ksft_print_msg("Starting CMT test ...\n"); - if (!has_ben) - sprintf(benchmark_cmd[5], "%s", CMT_STR); - res = cmt_resctrl_val(cpu_no, 5, benchmark_cmd); - ksft_test_result(!res, "CMT: test\n"); - cmt_test_cleanup(); - } + if (cmt_test) + run_cmt_test(has_ben, benchmark_cmd, cpu_no); - if (cat_test) { - ksft_print_msg("Starting CAT test ...\n"); - res = cat_perf_miss_val(cpu_no, no_of_bits, "L3"); - ksft_test_result(!res, "CAT: test\n"); - cat_test_cleanup(); - } + if (cat_test) + run_cat_test(cpu_no, no_of_bits); return ksft_exit_pass(); } -- Gitee From b9848a57ca130586f4242d573730dc5eac1649c0 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:51 +0000 Subject: [PATCH 1284/2161] selftests/resctrl: Skip the test if requested resctrl feature is not supported commit f1dd71982d1949a988cedbf4d9f2c726ee24344f upstream. There could be two reasons why a resctrl feature might not be enabled on the platform 1. H/W might not support the feature 2. Even if the H/W supports it, the user might have disabled the feature through kernel command line arguments Hence, any resctrl unit test (like cmt, cat, mbm and mba) before starting the test will first check if the feature is enabled on the platform or not. If the feature isn't enabled, then the test returns with an error status. For example, if MBA isn't supported on a platform and if the user tries to run MBA, the output will look like this ok mounting resctrl to "/sys/fs/resctrl" not ok MBA: schemata change But, not supporting a feature isn't a test failure. So, instead of treating it as an error, use the SKIP directive of the TAP protocol. 
With the change, the output will look as below ok MBA # SKIP Hardware does not support MBA or MBA is disabled Suggested-by: Reinette Chatre Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/cat_test.c | 3 --- tools/testing/selftests/resctrl/mba_test.c | 3 --- tools/testing/selftests/resctrl/mbm_test.c | 3 --- .../testing/selftests/resctrl/resctrl_tests.c | 23 +++++++++++++++++++ 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 04d706b4f10e..cd4f68388e0f 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -111,9 +111,6 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type) if (ret) return ret; - if (!validate_resctrl_feature_request("cat")) - return -1; - /* Get default cbm mask for L3/L2 cache */ ret = get_cbm_mask(cache_type, cbm_mask); if (ret) diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c index 8842d379e886..26f12ad4c663 100644 --- a/tools/testing/selftests/resctrl/mba_test.c +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -158,9 +158,6 @@ int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd) remove(RESULT_FILE_NAME); - if (!validate_resctrl_feature_request("mba")) - return -1; - ret = resctrl_val(benchmark_cmd, ¶m); if (ret) return ret; diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c index 651d4ac15986..02b1ed03f1e5 100644 --- a/tools/testing/selftests/resctrl/mbm_test.c +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -131,9 +131,6 @@ int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd) remove(RESULT_FILE_NAME); - if (!validate_resctrl_feature_request("mbm")) - return -1; - ret = resctrl_val(benchmark_cmd, ¶m); if (ret) return ret; diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index e63e0d8764ef..fb246bc41f47 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -60,6 +60,12 @@ static void run_mbm_test(bool has_ben, char **benchmark_cmd, int span, int res; ksft_print_msg("Starting MBM BW change ...\n"); + + if (!validate_resctrl_feature_request(MBM_STR)) { + ksft_test_result_skip("Hardware does not support MBM or MBM is disabled\n"); + return; + } + if (!has_ben) sprintf(benchmark_cmd[5], "%s", MBA_STR); res = mbm_bw_change(span, cpu_no, bw_report, benchmark_cmd); @@ -73,6 +79,12 @@ static void run_mba_test(bool has_ben, char **benchmark_cmd, int span, int res; ksft_print_msg("Starting MBA Schemata change ...\n"); + + if (!validate_resctrl_feature_request(MBA_STR)) { + ksft_test_result_skip("Hardware does not support MBA or MBA is disabled\n"); + return; + } + if (!has_ben) sprintf(benchmark_cmd[1], "%d", span); res = mba_schemata_change(cpu_no, bw_report, benchmark_cmd); @@ -85,6 +97,11 @@ static void run_cmt_test(bool has_ben, char **benchmark_cmd, int cpu_no) int res; ksft_print_msg("Starting CMT test ...\n"); + if (!validate_resctrl_feature_request(CMT_STR)) { + ksft_test_result_skip("Hardware does not support CMT or CMT is disabled\n"); + return; + } + if (!has_ben) sprintf(benchmark_cmd[5], "%s", CMT_STR); res = cmt_resctrl_val(cpu_no, 5, benchmark_cmd); @@ -97,6 +114,12 @@ static void run_cat_test(int cpu_no, int 
no_of_bits) int res; ksft_print_msg("Starting CAT test ...\n"); + + if (!validate_resctrl_feature_request(CAT_STR)) { + ksft_test_result_skip("Hardware does not support CAT or CAT is disabled\n"); + return; + } + res = cat_perf_miss_val(cpu_no, no_of_bits, "L3"); ksft_test_result(!res, "CAT: test\n"); cat_test_cleanup(); -- Gitee From 8436f9852a292c93d3d403193d7e9c8b6425e683 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:52 +0000 Subject: [PATCH 1285/2161] selftests/resctrl: Fix unmount resctrl FS commit 4e5cb354c85eafe88709cefc2fdce4911fb6ac17 upstream. umount_resctrlfs() directly attempts to unmount resctrl file system without checking if resctrl FS is already mounted or not. It returns 0 on success and on failure it prints an error message and returns an error status. Calling umount_resctrlfs() when resctrl FS isn't mounted will return an error status. There could be situations where-in the caller might not know if resctrl FS is already mounted or not and the caller might still want to unmount resctrl FS if it's already mounted (For example during teardown). To support above use cases, change umount_resctrlfs() such that it now first checks if resctrl FS is already mounted or not and unmounts resctrl FS only if it's already mounted. unmount resctrl FS upon exit. For example, running only mba test on a Broadwell (BDW) machine (MBA isn't supported on BDW CPU). This happens because validate_resctrl_feature_request() would mount resctrl FS to check if mba is enabled on the platform or not and finds that the H/W doesn't support mba and hence will return false to run_mba_test(). This in turn makes the main() function return without unmounting resctrl FS. Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/resctrl_tests.c | 2 ++ tools/testing/selftests/resctrl/resctrlfs.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index fb246bc41f47..f51b5fc066a3 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -253,5 +253,7 @@ int main(int argc, char **argv) if (cat_test) run_cat_test(cpu_no, no_of_bits); + umount_resctrlfs(); + return ksft_exit_pass(); } diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 26563175acf6..ade5f2b8b843 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -82,6 +82,9 @@ int remount_resctrlfs(bool mum_resctrlfs) int umount_resctrlfs(void) { + if (find_resctrl_mount(NULL)) + return 0; + if (umount(RESCTRL_PATH)) { perror("# Unable to umount resctrl"); -- Gitee From b25409ffec9912f83b3d1eb542478e757ee79db3 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:53 +0000 Subject: [PATCH 1286/2161] selftests/resctrl: Fix incorrect parsing of iMC counters commit d81343b5eedf84be71a4313e8fd073d0c510afcf upstream. iMC (Integrated Memory Controller) counters are usually at "/sys/bus/event_source/devices/" and are named as "uncore_imc_". num_of_imcs() function tries to count number of such iMC counters so that it could appropriately initialize required number of perf_attr structures that could be used to read these iMC counters. num_of_imcs() function assumes that all the directories under this path that start with "uncore_imc" are iMC counters. 
But, on some systems there could be directories named as "uncore_imc_free_running" which aren't iMC counters. Trying to read from such directories will result in "not found file" errors and MBM/MBA tests will fail. Hence, fix the logic in num_of_imcs() such that it looks at the first character after "uncore_imc_" to check if it's a numerical digit or not. If it's a digit then the directory represents an iMC counter, else, skip the directory. Reported-by: Reinette Chatre Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/resctrl_val.c | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index 5dfae51133bc..20d457c47ded 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -221,8 +221,8 @@ static int read_from_imc_dir(char *imc_dir, int count) */ static int num_of_imcs(void) { + char imc_dir[512], *temp; unsigned int count = 0; - char imc_dir[512]; struct dirent *ep; int ret; DIR *dp; @@ -230,7 +230,25 @@ static int num_of_imcs(void) dp = opendir(DYN_PMU_PATH); if (dp) { while ((ep = readdir(dp))) { - if (strstr(ep->d_name, UNCORE_IMC)) { + temp = strstr(ep->d_name, UNCORE_IMC); + if (!temp) + continue; + + /* + * imc counters are named as "uncore_imc_", hence + * increment the pointer to point to . Note that + * sizeof(UNCORE_IMC) would count for null character as + * well and hence the last underscore character in + * uncore_imc'_' need not be counted. + */ + temp = temp + sizeof(UNCORE_IMC); + + /* + * Some directories under "DYN_PMU_PATH" could have + * names like "uncore_imc_free_running", hence, check if + * first character is a numerical digit or not. + */ + if (temp[0] >= '0' && temp[0] <= '9') { sprintf(imc_dir, "%s/%s/", DYN_PMU_PATH, ep->d_name); ret = read_from_imc_dir(imc_dir, count); -- Gitee From 20aa36d61c1bd74cc87dbd752e322fe9f7b4bff6 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:54 +0000 Subject: [PATCH 1287/2161] selftests/resctrl: Fix checking for < 0 for unsigned values commit 1205b688c92558a04d8dd4cbc2b213e0fceba5db upstream. Dan reported following static checker warnings tools/testing/selftests/resctrl/resctrl_val.c:545 measure_vals() warn: 'bw_imc' unsigned <= 0 tools/testing/selftests/resctrl/resctrl_val.c:549 measure_vals() warn: 'bw_resc_end' unsigned <= 0 These warnings are reported because 1. measure_vals() declares 'bw_imc' and 'bw_resc_end' as unsigned long variables 2. Return value of get_mem_bw_imc() and get_mem_bw_resctrl() are assigned to 'bw_imc' and 'bw_resc_end' respectively 3. The returned values are checked for <= 0 to see if the calls failed Checking for < 0 for an unsigned value doesn't make any sense. Fix this issue by changing the implementation of get_mem_bw_imc() and get_mem_bw_resctrl() such that they now accept reference to a variable and set the variable appropriately upon success and return 0, else return < 0 on error. 
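A minimal userspace sketch of the calling convention the fix moves to, kept separate from the selftest code: the measured value travels through a pointer argument while the signed return value carries only 0 or a negative error code, so an "if (ret < 0)" check is meaningful. The function and variable names below are illustrative only, not the selftest's own.

#include <stdio.h>

/* Pretend counter read; on success store the value and return 0. */
static int read_counter(unsigned long *val)
{
	*val = 123456UL;
	return 0;
}

int main(void)
{
	unsigned long bw;
	int ret;

	ret = read_counter(&bw);
	if (ret < 0)	/* valid test: ret is signed, unlike bw */
		return 1;

	printf("bw = %lu\n", bw);
	return 0;
}

Returning the measurement directly through an unsigned type, as before, left the caller with a comparison that could never see a negative error value, which is what the static checker flagged.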
Reported-by: Dan Carpenter Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/resctrl_val.c | 41 +++++++++++-------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index 20d457c47ded..95224345c78e 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -300,9 +300,9 @@ static int initialize_mem_bw_imc(void) * Memory B/W utilized by a process on a socket can be calculated using * iMC counters. Perf events are used to read these counters. * - * Return: >= 0 on success. < 0 on failure. + * Return: = 0 on success. < 0 on failure. */ -static float get_mem_bw_imc(int cpu_no, char *bw_report) +static int get_mem_bw_imc(int cpu_no, char *bw_report, float *bw_imc) { float reads, writes, of_mul_read, of_mul_write; int imc, j, ret; @@ -373,13 +373,18 @@ static float get_mem_bw_imc(int cpu_no, char *bw_report) close(imc_counters_config[imc][WRITE].fd); } - if (strcmp(bw_report, "reads") == 0) - return reads; + if (strcmp(bw_report, "reads") == 0) { + *bw_imc = reads; + return 0; + } - if (strcmp(bw_report, "writes") == 0) - return writes; + if (strcmp(bw_report, "writes") == 0) { + *bw_imc = writes; + return 0; + } - return (reads + writes); + *bw_imc = reads + writes; + return 0; } void set_mbm_path(const char *ctrlgrp, const char *mongrp, int resource_id) @@ -438,9 +443,8 @@ static void initialize_mem_bw_resctrl(const char *ctrlgrp, const char *mongrp, * 1. If con_mon grp is given, then read from it * 2. If con_mon grp is not given, then read from root con_mon grp */ -static unsigned long get_mem_bw_resctrl(void) +static int get_mem_bw_resctrl(unsigned long *mbm_total) { - unsigned long mbm_total = 0; FILE *fp; fp = fopen(mbm_total_path, "r"); @@ -449,7 +453,7 @@ static unsigned long get_mem_bw_resctrl(void) return -1; } - if (fscanf(fp, "%lu", &mbm_total) <= 0) { + if (fscanf(fp, "%lu", mbm_total) <= 0) { perror("Could not get mbm local bytes"); fclose(fp); @@ -457,7 +461,7 @@ static unsigned long get_mem_bw_resctrl(void) } fclose(fp); - return mbm_total; + return 0; } pid_t bm_pid, ppid; @@ -549,7 +553,8 @@ static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp, static int measure_vals(struct resctrl_val_param *param, unsigned long *bw_resc_start) { - unsigned long bw_imc, bw_resc, bw_resc_end; + unsigned long bw_resc, bw_resc_end; + float bw_imc; int ret; /* @@ -559,13 +564,13 @@ measure_vals(struct resctrl_val_param *param, unsigned long *bw_resc_start) * Compare the two values to validate resctrl value. * It takes 1sec to measure the data. */ - bw_imc = get_mem_bw_imc(param->cpu_no, param->bw_report); - if (bw_imc <= 0) - return bw_imc; + ret = get_mem_bw_imc(param->cpu_no, param->bw_report, &bw_imc); + if (ret < 0) + return ret; - bw_resc_end = get_mem_bw_resctrl(); - if (bw_resc_end <= 0) - return bw_resc_end; + ret = get_mem_bw_resctrl(&bw_resc_end); + if (ret < 0) + return ret; bw_resc = (bw_resc_end - *bw_resc_start) / MB; ret = print_results_bw(param->filename, bm_pid, bw_imc, bw_resc); -- Gitee From 00e196bed32ac5442ce1ca97891216ca9c8cbd5b Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:55 +0000 Subject: [PATCH 1288/2161] selftests/resctrl: Create .gitignore to include resctrl_tests commit 4808bf209efd586512e31590f19c1affbe56980e upstream. 
Create .gitignore to hold the test file resctrl_tests generated after compiling. Suggested-by: Shuah Khan Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/.gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 tools/testing/selftests/resctrl/.gitignore diff --git a/tools/testing/selftests/resctrl/.gitignore b/tools/testing/selftests/resctrl/.gitignore new file mode 100644 index 000000000000..ab68442b6bc8 --- /dev/null +++ b/tools/testing/selftests/resctrl/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +resctrl_tests -- Gitee From 3fd4e68048955ead03c09b77e9d63f9050d9af15 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 7 Apr 2021 19:57:28 +0000 Subject: [PATCH 1289/2161] selftests/resctrl: Change a few printed messages commit e75074781f1735c1976bc551e29ccf2ba9a4b17f upstream. Change a few printed messages to report test progress more clearly. Add a missing "\n" at the end of one printed message. Suggested-by: Shuah Khan Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/cache.c | 2 +- tools/testing/selftests/resctrl/mba_test.c | 6 +++--- tools/testing/selftests/resctrl/mbm_test.c | 2 +- tools/testing/selftests/resctrl/resctrlfs.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c index 362e3a418caa..68ff856d36f0 100644 --- a/tools/testing/selftests/resctrl/cache.c +++ b/tools/testing/selftests/resctrl/cache.c @@ -301,7 +301,7 @@ int show_cache_info(unsigned long sum_llc_val, int no_of_bits, ret = platform && abs((int)diff_percent) > max_diff_percent && (cmt ? (abs(avg_diff) > max_diff) : true); - ksft_print_msg("%s cache miss rate within %d%%\n", + ksft_print_msg("%s Check cache miss rate within %d%%\n", ret ? "Fail:" : "Pass:", max_diff_percent); ksft_print_msg("Percent diff=%d\n", abs((int)diff_percent)); diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c index 26f12ad4c663..1a1bdb6180cf 100644 --- a/tools/testing/selftests/resctrl/mba_test.c +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -80,7 +80,7 @@ static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) avg_diff = (float)labs(avg_bw_resc - avg_bw_imc) / avg_bw_imc; avg_diff_per = (int)(avg_diff * 100); - ksft_print_msg("%s MBA: diff within %d%% for schemata %u\n", + ksft_print_msg("%s Check MBA diff within %d%% for schemata %u\n", avg_diff_per > MAX_DIFF_PERCENT ? "Fail:" : "Pass:", MAX_DIFF_PERCENT, @@ -93,10 +93,10 @@ static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) failed = true; } - ksft_print_msg("%s schemata change using MBA\n", + ksft_print_msg("%s Check schemata change using MBA\n", failed ? 
"Fail:" : "Pass:"); if (failed) - ksft_print_msg("At least one test failed"); + ksft_print_msg("At least one test failed\n"); } static int check_results(void) diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c index 02b1ed03f1e5..8392e5c55ed0 100644 --- a/tools/testing/selftests/resctrl/mbm_test.c +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -37,7 +37,7 @@ show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, int span) avg_diff_per = (int)(avg_diff * 100); ret = avg_diff_per > MAX_DIFF_PERCENT; - ksft_print_msg("%s MBM: diff within %d%%\n", + ksft_print_msg("%s Check MBM diff within %d%%\n", ret ? "Fail:" : "Pass:", MAX_DIFF_PERCENT); ksft_print_msg("avg_diff_per: %d%%\n", avg_diff_per); ksft_print_msg("Span (MB): %d\n", span); diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index ade5f2b8b843..5f5a166ade60 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -570,14 +570,14 @@ bool check_resctrlfs_support(void) fclose(inf); - ksft_print_msg("%s kernel supports resctrl filesystem\n", + ksft_print_msg("%s Check kernel supports resctrl filesystem\n", ret ? "Pass:" : "Fail:"); if (!ret) return ret; dp = opendir(RESCTRL_PATH); - ksft_print_msg("%s resctrl mountpoint \"%s\" exists\n", + ksft_print_msg("%s Check resctrl mountpoint \"%s\" exists\n", dp ? "Pass:" : "Fail:", RESCTRL_PATH); if (dp) closedir(dp); -- Gitee From 2944d9d5602f04573d4c4738732186c14b253da3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 25 Apr 2021 14:12:29 -0700 Subject: [PATCH 1290/2161] x86/resctrl: Fix init const confusion commit 4029b9706d53e5e8db2e1cee6ecd75e60b62cd09 upstream. const variable must be initconst, not initdata. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210425211229.3157674-1-ak@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 5838952c2211..b2546803a323 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -84,7 +84,7 @@ unsigned int resctrl_cqm_threshold; static const struct mbm_correction_factor_table { u32 rmidthreshold; u64 cf; -} mbm_cf_table[] __initdata = { +} mbm_cf_table[] __initconst = { {7, CF(1.000000)}, {15, CF(1.000000)}, {15, CF(0.969650)}, -- Gitee From 132c3249d351cfdc07167525a2c8621f455f873c Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Thu, 27 May 2021 17:31:53 +0800 Subject: [PATCH 1291/2161] selftests/resctrl: Fix incorrect parsing of option "-t" commit 1421ec684a43379b2aa3cfda20b03d38282dc990 upstream. Resctrl test suite accepts command line argument "-t" to specify the unit tests to run in the test list (e.g., -t mbm,mba,cmt,cat) as documented in the help. When calling strtok() to parse the option, the incorrect delimiters argument ":\t" is used. As a result, passing "-t mbm,mba,cmt,cat" throws an invalid option error. Fix this by using delimiters argument "," instead of ":\t" for parsing of unit tests list. At the same time, remove the unnecessary "spaces" between the unit tests in help documentation to prevent confusion. 
Fixes: 790bf585b0ee ("selftests/resctrl: Add Cache Allocation Technology (CAT) selftest") Fixes: 78941183d1b1 ("selftests/resctrl: Add Cache QoS Monitoring (CQM) selftest") Fixes: ecdbb911f22d ("selftests/resctrl: Add MBM test") Fixes: 034c7678dd2c ("selftests/resctrl: Add README for resctrl tests") Cc: stable@vger.kernel.org Signed-off-by: Xiaochen Shen Reviewed-by: Tony Luck Signed-off-by: Shuah Khan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/resctrl/README | 2 +- tools/testing/selftests/resctrl/resctrl_tests.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/resctrl/README b/tools/testing/selftests/resctrl/README index 4b36b25b6ac0..3d2bbd4fa3aa 100644 --- a/tools/testing/selftests/resctrl/README +++ b/tools/testing/selftests/resctrl/README @@ -47,7 +47,7 @@ Parameter '-h' shows usage information. usage: resctrl_tests [-h] [-b "benchmark_cmd [options]"] [-t test list] [-n no_of_bits] -b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CMT default benchmark is builtin fill_buf - -t test list: run tests specified in the test list, e.g. -t mbm, mba, cmt, cat + -t test list: run tests specified in the test list, e.g. -t mbm,mba,cmt,cat -n no_of_bits: run cache tests using specified no of bits in cache bit mask -p cpu_no: specify CPU number to run the test. 1 is default -h: help diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index f51b5fc066a3..973f09a66e1e 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -40,7 +40,7 @@ static void cmd_help(void) printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CMT\n"); printf("\t default benchmark is builtin fill_buf\n"); printf("\t-t test list: run tests specified in the test list, "); - printf("e.g. -t mbm, mba, cmt, cat\n"); + printf("e.g. -t mbm,mba,cmt,cat\n"); printf("\t-n no_of_bits: run cache tests using specified no of bits in cache bit mask\n"); printf("\t-p cpu_no: specify CPU number to run the test. 1 is default\n"); printf("\t-h: help\n"); @@ -173,7 +173,7 @@ int main(int argc, char **argv) return -1; } - token = strtok(NULL, ":\t"); + token = strtok(NULL, ","); } break; case 'p': -- Gitee From a7ecf88555a2de55735f0416760d4f50ab607664 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Wed, 16 Jun 2021 20:15:30 +0200 Subject: [PATCH 1292/2161] x86/resctrl: Fix kernel-doc in pseudo_lock.c commit f9b871c89ae61d5a4c0b81659fa6819c50d4ced2 upstream. Add undocumented parameters detected by scripts/kernel-doc. Signed-off-by: Fabio M. De Francesco Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Link: https://lkml.kernel.org/r/20210616181530.4094-1-fmdefrancesco@gmail.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index e154ea812a4a..3cbfa37d6f0d 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -49,6 +49,7 @@ static struct class *pseudo_lock_class; /** * get_prefetch_disable_bits - prefetch disable bits of supported platforms + * @void: It takes no parameters. * * Capture the list of platforms that have been validated to support * pseudo-locking. 
This includes testing to ensure pseudo-locked regions @@ -162,7 +163,7 @@ static struct rdtgroup *region_find_by_minor(unsigned int minor) } /** - * pseudo_lock_pm_req - A power management QoS request list entry + * struct pseudo_lock_pm_req - A power management QoS request list entry * @list: Entry within the @pm_reqs list for a pseudo-locked region * @req: PM QoS request */ @@ -184,6 +185,7 @@ static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) /** * pseudo_lock_cstates_constrain - Restrict cores from entering C6 + * @plr: Pseudo-locked region * * To prevent the cache from being affected by power management entering * C6 has to be avoided. This is accomplished by requesting a latency @@ -196,6 +198,8 @@ static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) * the ACPI latencies need to be considered while keeping in mind that C2 * may be set to map to deeper sleep states. In this case the latency * requirement needs to prevent entering C2 also. + * + * Return: 0 on success, <0 on failure */ static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) { @@ -520,7 +524,7 @@ static int pseudo_lock_fn(void *_rdtgrp) /** * rdtgroup_monitor_in_progress - Test if monitoring in progress - * @r: resource group being queried + * @rdtgrp: resource group being queried * * Return: 1 if monitor groups have been created for this resource * group, 0 otherwise. @@ -1140,6 +1144,8 @@ static int measure_l3_residency(void *_plr) /** * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region + * @rdtgrp: Resource group to which the pseudo-locked region belongs. + * @sel: Selector of which measurement to perform on a pseudo-locked region. * * The measurement of latency to access a pseudo-locked region should be * done from a cpu that is associated with that pseudo-locked region. -- Gitee From 28c639474663acf7587d72e2aaeea17a7b61f68d Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Sat, 19 Jun 2021 00:32:06 +0200 Subject: [PATCH 1293/2161] x86/resctrl: Fix kernel-doc in internal.h commit fd2afa70eff057fab57c9e06708b68677b261a0c upstream. Add description of undocumented parameters. Issues detected by scripts/kernel-doc. Signed-off-by: Fabio M. De Francesco Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Link: https://lkml.kernel.org/r/20210618223206.29539-1-fmdefrancesco@gmail.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/internal.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 0652fc13ecf8..9b23d008354a 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -72,6 +72,7 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); * struct mon_evt - Entry in the event list of a resource * @evtid: event id * @name: name of the event + * @list: entry in &rdt_resource->evt_list */ struct mon_evt { u32 evtid; @@ -80,10 +81,13 @@ struct mon_evt { }; /** - * struct mon_data_bits - Monitoring details for each event file - * @rid: Resource id associated with the event file. 
+ * union mon_data_bits - Monitoring details for each event file + * @priv: Used to store monitoring event data in @u + * as kernfs private data + * @rid: Resource id associated with the event file * @evtid: Event id associated with the event file * @domid: The domain to which the event file belongs + * @u: Name of the bit fields struct */ union mon_data_bits { void *priv; @@ -121,6 +125,7 @@ enum rdt_group_type { * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations * allowed AND the allocations are Cache Pseudo-Locked + * @RDT_NUM_MODES: Total number of modes * * The mode of a resource group enables control over the allowed overlap * between allocations associated with different resource groups (classes @@ -144,7 +149,7 @@ enum rdtgrp_mode { /** * struct mongroup - store mon group's data in resctrl fs. - * @mon_data_kn kernlfs node for the mon_data directory + * @mon_data_kn: kernfs node for the mon_data directory * @parent: parent rdtgrp * @crdtgrp_list: child rdtgroup node list * @rmid: rmid for this rdtgroup @@ -284,11 +289,11 @@ struct rftype { /** * struct mbm_state - status for each MBM counter in each domain * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) - * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it + * @prev_msr: Value of IA32_QM_CTR for this RMID last time we read it * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting - * @prev_bw The most recent bandwidth in MBps - * @delta_bw Difference between the current and previous bandwidth - * @delta_comp Indicates whether to compute the delta_bw + * @prev_bw: The most recent bandwidth in MBps + * @delta_bw: Difference between the current and previous bandwidth + * @delta_comp: Indicates whether to compute the delta_bw */ struct mbm_state { u64 chunks; -- Gitee From 4fc571ee53e62e39496b21a973de0fbd7158d24d Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:15 +0000 Subject: [PATCH 1294/2161] x86/resctrl: Split struct rdt_domain commit 792e0f6f789bda5e31b1dbcfcc84068da36a79b1 upstream. resctrl is the defacto Linux ABI for SoC resource partitioning features. To support it on another architecture, it needs to be abstracted from the features provided by Intel RDT and AMD PQoS, and moved to /fs/. struct rdt_domain contains a mix of architecture private details and properties of the filesystem interface user-space uses. Continue by splitting struct rdt_domain, into an architecture private 'hw' struct, which contains the common resctrl structure that would be used by any architecture. The hardware values in ctrl_val and mbps_val need to be accessed via helpers to allow another architecture to convert these into a different format if necessary. After this split, filesystem code paths touching a 'hw' struct indicates where an abstraction is needed. Splitting this structure only moves types around, and should not lead to any change in behaviour. 
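The split relies on the usual embed-and-container_of idiom; the userspace sketch below mirrors it with generic placeholder names and is not the kernel code itself. Common code only ever holds a pointer to the embedded structure, and arch code recovers its private wrapper from that pointer when it needs the hardware fields, which is what keeps ctrl_val and mbps_val behind a helper.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct generic_dom {		/* what common code sees */
	int id;
};

struct arch_dom {		/* arch-private wrapper */
	struct generic_dom d;
	unsigned int ctrl_val;	/* hardware-specific detail */
};

static struct arch_dom *to_arch_dom(struct generic_dom *d)
{
	return container_of(d, struct arch_dom, d);
}

int main(void)
{
	struct arch_dom hw = { .d = { .id = 3 }, .ctrl_val = 0xfff };
	struct generic_dom *d = &hw.d;	/* only this pointer is passed around */

	printf("id=%d ctrl=0x%x\n", d->id, to_arch_dom(d)->ctrl_val);
	return 0;
}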
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-3-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 32 ++++++++++------- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 10 ++++-- arch/x86/kernel/cpu/resctrl/internal.h | 43 +++++++---------------- arch/x86/kernel/cpu/resctrl/monitor.c | 8 +++-- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 29 +++++++++------ include/linux/resctrl.h | 32 ++++++++++++++++- 6 files changed, 94 insertions(+), 60 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 80d1657c773a..f796ec53a63f 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -379,10 +379,11 @@ static void mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) { unsigned int i; + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, d->ctrl_val[i]); + wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } /* @@ -404,21 +405,23 @@ mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) { unsigned int i; + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); /* Write the delay values for mba. */ for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, delay_bw_map(d->ctrl_val[i], r)); + wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], r)); } static void cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) { unsigned int i; + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + cbm_idx(r, i), d->ctrl_val[i]); + wrmsrl(hw_res->msr_base + cbm_idx(r, i), hw_dom->ctrl_val[i]); } struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) @@ -504,21 +507,22 @@ void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); struct msr_param m; u32 *dc, *dm; - dc = kmalloc_array(hw_res->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL); + dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val), GFP_KERNEL); if (!dc) return -ENOMEM; - dm = kmalloc_array(hw_res->num_closid, sizeof(*d->mbps_val), GFP_KERNEL); + dm = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->mbps_val), GFP_KERNEL); if (!dm) { kfree(dc); return -ENOMEM; } - d->ctrl_val = dc; - d->mbps_val = dm; + hw_dom->ctrl_val = dc; + hw_dom->mbps_val = dm; setup_default_ctrlval(r, dc, dm); m.low = 0; @@ -580,6 +584,7 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) { int id = get_cpu_cacheinfo_id(cpu, r->cache_level); struct list_head *add_pos = NULL; + struct rdt_hw_domain *hw_dom; struct rdt_domain *d; d = rdt_find_domain(r, id, &add_pos); @@ -595,10 +600,11 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; } - d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu)); - if (!d) + hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu)); + if (!hw_dom) return; + d = &hw_dom->d_resctrl; d->id = id; 
cpumask_set_cpu(cpu, &d->cpu_mask); @@ -627,6 +633,7 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) static void domain_remove_cpu(int cpu, struct rdt_resource *r) { int id = get_cpu_cacheinfo_id(cpu, r->cache_level); + struct rdt_hw_domain *hw_dom; struct rdt_domain *d; d = rdt_find_domain(r, id, NULL); @@ -634,6 +641,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) pr_warn("Couldn't find cache id for CPU %d\n", cpu); return; } + hw_dom = resctrl_to_arch_dom(d); cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { @@ -666,12 +674,12 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) if (d->plr) d->plr->d = NULL; - kfree(d->ctrl_val); - kfree(d->mbps_val); + kfree(hw_dom->ctrl_val); + kfree(hw_dom->mbps_val); bitmap_free(d->rmid_busy_llc); kfree(d->mbm_total); kfree(d->mbm_local); - kfree(d); + kfree(hw_dom); return; } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 3f0c33d5b658..08eef539cb6c 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -238,6 +238,7 @@ static int parse_line(char *line, struct rdt_resource *r, int update_domains(struct rdt_resource *r, int closid) { + struct rdt_hw_domain *hw_dom; struct msr_param msr_param; cpumask_var_t cpu_mask; struct rdt_domain *d; @@ -254,7 +255,8 @@ int update_domains(struct rdt_resource *r, int closid) mba_sc = is_mba_sc(r); list_for_each_entry(d, &r->domains, list) { - dc = !mba_sc ? d->ctrl_val : d->mbps_val; + hw_dom = resctrl_to_arch_dom(d); + dc = !mba_sc ? hw_dom->ctrl_val : hw_dom->mbps_val; if (d->have_new_ctrl && d->new_ctrl != dc[closid]) { cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); dc[closid] = d->new_ctrl; @@ -375,17 +377,19 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) { + struct rdt_hw_domain *hw_dom; struct rdt_domain *dom; bool sep = false; u32 ctrl_val; seq_printf(s, "%*s:", max_name_width, r->name); list_for_each_entry(dom, &r->domains, list) { + hw_dom = resctrl_to_arch_dom(dom); if (sep) seq_puts(s, ";"); - ctrl_val = (!is_mba_sc(r) ? dom->ctrl_val[closid] : - dom->mbps_val[closid]); + ctrl_val = (!is_mba_sc(r) ? 
hw_dom->ctrl_val[closid] : + hw_dom->mbps_val[closid]); seq_printf(s, r->format_str, dom->id, max_data_width, ctrl_val); sep = true; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 9b23d008354a..79dfdd07bb15 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -305,44 +305,25 @@ struct mbm_state { }; /** - * struct rdt_domain - group of cpus sharing an RDT resource - * @list: all instances of this resource - * @id: unique id for this instance - * @cpu_mask: which cpus share this resource - * @rmid_busy_llc: - * bitmap of which limbo RMIDs are above threshold - * @mbm_total: saved state for MBM total bandwidth - * @mbm_local: saved state for MBM local bandwidth - * @mbm_over: worker to periodically read MBM h/w counters - * @cqm_limbo: worker to periodically read CQM h/w counters - * @mbm_work_cpu: - * worker cpu for MBM h/w counters - * @cqm_work_cpu: - * worker cpu for CQM h/w counters + * struct rdt_hw_domain - Arch private attributes of a set of CPUs that share + * a resource + * @d_resctrl: Properties exposed to the resctrl file system * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) * @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps - * @new_ctrl: new ctrl value to be loaded - * @have_new_ctrl: did user provide new_ctrl for this domain - * @plr: pseudo-locked region (if any) associated with domain + * + * Members of this structure are accessed via helpers that provide abstraction. */ -struct rdt_domain { - struct list_head list; - int id; - struct cpumask cpu_mask; - unsigned long *rmid_busy_llc; - struct mbm_state *mbm_total; - struct mbm_state *mbm_local; - struct delayed_work mbm_over; - struct delayed_work cqm_limbo; - int mbm_work_cpu; - int cqm_work_cpu; +struct rdt_hw_domain { + struct rdt_domain d_resctrl; u32 *ctrl_val; u32 *mbps_val; - u32 new_ctrl; - bool have_new_ctrl; - struct pseudo_lock_region *plr; }; +static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r) +{ + return container_of(r, struct rdt_hw_domain, d_resctrl); +} + /** * struct msr_param - set a range of MSRs from a domain * @res: The resource to use diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index b2546803a323..d59f3bc38f1e 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -418,6 +418,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val; struct mbm_state *pmbm_data, *cmbm_data; struct rdt_hw_resource *hw_r_mba; + struct rdt_hw_domain *hw_dom_mba; u32 cur_bw, delta_bw, user_bw; struct rdt_resource *r_mba; struct rdt_domain *dom_mba; @@ -438,11 +439,12 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) pr_warn_once("Failure to get domain for MBA update\n"); return; } + hw_dom_mba = resctrl_to_arch_dom(dom_mba); cur_bw = pmbm_data->prev_bw; - user_bw = dom_mba->mbps_val[closid]; + user_bw = hw_dom_mba->mbps_val[closid]; delta_bw = pmbm_data->delta_bw; - cur_msr_val = dom_mba->ctrl_val[closid]; + cur_msr_val = hw_dom_mba->ctrl_val[closid]; /* * For Ctrl groups read data from child monitor groups. 
@@ -479,7 +481,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) cur_msr = hw_r_mba->msr_base + closid; wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba)); - dom_mba->ctrl_val[closid] = new_msr_val; + hw_dom_mba->ctrl_val[closid] = new_msr_val; /* * Delta values are updated dynamically package wise for each diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index cc0a069a4bca..f1233553d780 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -915,7 +915,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, list_for_each_entry(dom, &r->domains, list) { if (sep) seq_putc(seq, ';'); - ctrl = dom->ctrl_val; + ctrl = resctrl_to_arch_dom(dom)->ctrl_val; sw_shareable = 0; exclusive = 0; seq_printf(seq, "%d=", dom->id); @@ -1193,7 +1193,7 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d } /* Check for overlap with other resource groups */ - ctrl = d->ctrl_val; + ctrl = resctrl_to_arch_dom(d)->ctrl_val; for (i = 0; i < closids_supported(); i++, ctrl++) { ctrl_b = *ctrl; mode = rdtgroup_mode_by_closid(i); @@ -1262,6 +1262,7 @@ bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, */ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) { + struct rdt_hw_domain *hw_dom; int closid = rdtgrp->closid; struct rdt_resource *r; bool has_cache = false; @@ -1272,7 +1273,8 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) continue; has_cache = true; list_for_each_entry(d, &r->domains, list) { - if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid], + hw_dom = resctrl_to_arch_dom(d); + if (rdtgroup_cbm_overlaps(r, d, hw_dom->ctrl_val[closid], rdtgrp->closid, false)) { rdt_last_cmd_puts("Schemata overlaps\n"); return false; @@ -1404,6 +1406,7 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, static int rdtgroup_size_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { + struct rdt_hw_domain *hw_dom; struct rdtgroup *rdtgrp; struct rdt_resource *r; struct rdt_domain *d; @@ -1438,14 +1441,15 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, sep = false; seq_printf(s, "%*s:", max_name_width, r->name); list_for_each_entry(d, &r->domains, list) { + hw_dom = resctrl_to_arch_dom(d); if (sep) seq_putc(s, ';'); if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { size = 0; } else { ctrl = (!is_mba_sc(r) ? 
- d->ctrl_val[rdtgrp->closid] : - d->mbps_val[rdtgrp->closid]); + hw_dom->ctrl_val[rdtgrp->closid] : + hw_dom->mbps_val[rdtgrp->closid]); if (r->rid == RDT_RESOURCE_MBA) size = ctrl; else @@ -1940,6 +1944,7 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r) static int set_mba_sc(bool mba_sc) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + struct rdt_hw_domain *hw_dom; struct rdt_domain *d; if (!is_mbm_enabled() || !is_mba_linear() || @@ -1947,8 +1952,10 @@ static int set_mba_sc(bool mba_sc) return -EINVAL; r->membw.mba_sc = mba_sc; - list_for_each_entry(d, &r->domains, list) - setup_default_ctrlval(r, d->ctrl_val, d->mbps_val); + list_for_each_entry(d, &r->domains, list) { + hw_dom = resctrl_to_arch_dom(d); + setup_default_ctrlval(r, hw_dom->ctrl_val, hw_dom->mbps_val); + } return 0; } @@ -2270,6 +2277,7 @@ static int rdt_init_fs_context(struct fs_context *fc) static int reset_all_ctrls(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct rdt_hw_domain *hw_dom; struct msr_param msr_param; cpumask_var_t cpu_mask; struct rdt_domain *d; @@ -2288,10 +2296,11 @@ static int reset_all_ctrls(struct rdt_resource *r) * from each domain to update the MSRs below. */ list_for_each_entry(d, &r->domains, list) { + hw_dom = resctrl_to_arch_dom(d); cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); for (i = 0; i < hw_res->num_closid; i++) - d->ctrl_val[i] = r->default_ctrl; + hw_dom->ctrl_val[i] = r->default_ctrl; } cpu = get_cpu(); /* Update CBM on this cpu if it's in cpu_mask. */ @@ -2670,7 +2679,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, d->have_new_ctrl = false; d->new_ctrl = r->cache.shareable_bits; used_b = r->cache.shareable_bits; - ctrl = d->ctrl_val; + ctrl = resctrl_to_arch_dom(d)->ctrl_val; for (i = 0; i < closids_supported(); i++, ctrl++) { if (closid_allocated(i) && i != closid) { mode = rdtgroup_mode_by_closid(i); @@ -2687,7 +2696,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, * with an exclusive group. 
*/ if (d_cdp) - peer_ctl = d_cdp->ctrl_val[i]; + peer_ctl = resctrl_to_arch_dom(d_cdp)->ctrl_val[i]; else peer_ctl = 0; used_b |= *ctrl | peer_ctl; diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 5ccf36b7dbbf..a4c89dafd7fa 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -15,7 +15,37 @@ int proc_resctrl_show(struct seq_file *m, #endif -struct rdt_domain; +/** + * struct rdt_domain - group of CPUs sharing a resctrl resource + * @list: all instances of this resource + * @id: unique id for this instance + * @cpu_mask: which CPUs share this resource + * @new_ctrl: new ctrl value to be loaded + * @have_new_ctrl: did user provide new_ctrl for this domain + * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold + * @mbm_total: saved state for MBM total bandwidth + * @mbm_local: saved state for MBM local bandwidth + * @mbm_over: worker to periodically read MBM h/w counters + * @cqm_limbo: worker to periodically read CQM h/w counters + * @mbm_work_cpu: worker CPU for MBM h/w counters + * @cqm_work_cpu: worker CPU for CQM h/w counters + * @plr: pseudo-locked region (if any) associated with domain + */ +struct rdt_domain { + struct list_head list; + int id; + struct cpumask cpu_mask; + u32 new_ctrl; + bool have_new_ctrl; + unsigned long *rmid_busy_llc; + struct mbm_state *mbm_total; + struct mbm_state *mbm_local; + struct delayed_work mbm_over; + struct delayed_work cqm_limbo; + int mbm_work_cpu; + int cqm_work_cpu; + struct pseudo_lock_region *plr; +}; /** * struct resctrl_cache - Cache allocation related data -- Gitee From 54cd4ca85185b54c0bd12e820178c6becdd7e07a Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:16 +0000 Subject: [PATCH 1295/2161] x86/resctrl: Add a separate schema list for resctrl commit cdb9ebc9178461c27d618bb1238e851da17271de upstream. Resctrl exposes schemata to user-space, which allow the control values to be specified for a group of tasks. User-visible properties of the interface, (such as the schemata names and how the values are parsed) are rooted in a struct provided by the architecture code. (struct rdt_hw_resource). Once a second architecture uses resctrl, this would allow user-visible properties to diverge between architectures. These properties should come from the resctrl code that will be common to all architectures. Resctrl has no per-schema structure, only struct rdt_{hw_,}resource. Create a struct resctrl_schema to hold the rdt_resource. Before a second architecture can be supported, this structure will also need to hold the schema name visible to user-space and the type of configuration values for resctrl. 
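As a rough illustration of the per-schema list idea, here is a standalone sketch with made-up names (the kernel version uses struct list_head, kzalloc and the resctrl_schema_all list in rdtgroup.c):

#include <stdio.h>
#include <stdlib.h>

struct resource {
	const char *name;
};

struct schema {
	struct resource *res;	/* the resource this schema configures */
	struct schema *next;
};

static struct schema *schema_all;	/* stand-in for resctrl_schema_all */

static int schema_list_create(struct resource *res, size_t nr)
{
	for (size_t i = 0; i < nr; i++) {
		struct schema *s = calloc(1, sizeof(*s));

		if (!s)
			return -1;
		s->res = &res[i];
		s->next = schema_all;	/* push onto the global list */
		schema_all = s;
	}
	return 0;
}

static void schema_list_destroy(void)
{
	while (schema_all) {
		struct schema *s = schema_all;

		schema_all = s->next;
		free(s);
	}
}

int main(void)
{
	struct resource res[] = { { "L3" }, { "MB" } };

	if (schema_list_create(res, 2))
		return 1;
	for (struct schema *s = schema_all; s; s = s->next)
		printf("schema for %s\n", s->res->name);
	schema_list_destroy();
	return 0;
}

Keeping one entry per schema, rather than per resource, is what later allows a single L2 or L3 resource to back two schemata once the CDP resources are merged.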
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-4-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/internal.h | 1 + arch/x86/kernel/cpu/resctrl/rdtgroup.c | 43 +++++++++++++++++++++++++- include/linux/resctrl.h | 11 +++++++ 3 files changed, 54 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 79dfdd07bb15..b3551795a5af 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -111,6 +111,7 @@ extern unsigned int resctrl_cqm_threshold; extern bool rdt_alloc_capable; extern bool rdt_mon_capable; extern unsigned int rdt_mon_features; +extern struct list_head resctrl_schema_all; enum rdt_group_type { RDTCTRL_GROUP = 0, diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index f1233553d780..ca1d97e45de8 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -39,6 +39,9 @@ static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); +/* list of entries for the schemata file */ +LIST_HEAD(resctrl_schema_all); + /* Kernel fs node for "info" directory under root */ static struct kernfs_node *kn_info; @@ -2109,6 +2112,35 @@ static int rdt_enable_ctx(struct rdt_fs_context *ctx) return ret; } +static int schemata_list_create(void) +{ + struct resctrl_schema *s; + struct rdt_resource *r; + + for_each_alloc_enabled_rdt_resource(r) { + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + s->res = r; + + INIT_LIST_HEAD(&s->list); + list_add(&s->list, &resctrl_schema_all); + } + + return 0; +} + +static void schemata_list_destroy(void) +{ + struct resctrl_schema *s, *tmp; + + list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) { + list_del(&s->list); + kfree(s); + } +} + static int rdt_get_tree(struct fs_context *fc) { struct rdt_fs_context *ctx = rdt_fc2context(fc); @@ -2130,11 +2162,17 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_cdp; + ret = schemata_list_create(); + if (ret) { + schemata_list_destroy(); + goto out_mba; + } + closid_init(); ret = rdtgroup_create_info_dir(rdtgroup_default.kn); if (ret < 0) - goto out_mba; + goto out_schemata_free; if (rdt_mon_capable) { ret = mongroup_create_dir(rdtgroup_default.kn, @@ -2184,6 +2222,8 @@ static int rdt_get_tree(struct fs_context *fc) kernfs_remove(kn_mongrp); out_info: kernfs_remove(kn_info); +out_schemata_free: + schemata_list_destroy(); out_mba: if (ctx->enable_mba_mbps) set_mba_sc(false); @@ -2430,6 +2470,7 @@ static void rdt_kill_sb(struct super_block *sb) rmdir_all_sub(); rdt_pseudo_lock_release(); rdtgroup_default.mode = RDT_MODE_SHAREABLE; + schemata_list_destroy(); static_branch_disable_cpuslocked(&rdt_alloc_enable_key); static_branch_disable_cpuslocked(&rdt_mon_enable_key); static_branch_disable_cpuslocked(&rdt_enable_key); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index a4c89dafd7fa..5a21d483da6a 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -153,4 +153,15 @@ struct rdt_resource { }; +/** + * struct resctrl_schema - configuration abilities of a resource presented to + * user-space + * @list: Member of resctrl_schema_all. 
+ * @res: The resource structure exported by the architecture to describe + * the hardware that is configured by this schema. + */ +struct resctrl_schema { + struct list_head list; + struct rdt_resource *res; +}; #endif /* _RESCTRL_H */ -- Gitee From d7db28afb5b4984f51a5e3abcbf4f5e27c4f6b58 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:17 +0000 Subject: [PATCH 1296/2161] x86/resctrl: Pass the schema in info dir's private pointer commit f2594492308d2a950c9f765eb719480f3b881f0a upstream. Many of resctrl's per-schema files return a value from struct rdt_resource, which they take as their 'priv' pointer. Moving properties that resctrl exposes to user-space into the core 'fs' code, (e.g. the name of the schema), means some of the functions that back the filesystem need the schema struct (to where the properties are moved), but currently take struct rdt_resource. For example, once the CDP resources are merged, struct rdt_resource no longer reflects all the properties of the schema. For the info dirs that represent a control, the information needed will be accessed via struct resctrl_schema, as this is how the resource is being used. For the monitors, its still struct rdt_resource as the monitors aren't described as schema. This difference means the type of the private pointers varies between control and monitor info dirs. Change the 'priv' pointer to point to struct resctrl_schema for the per-schema files that represent a control. The type can be determined from the fflags field. If the flags are RF_MON_INFO, its a struct rdt_resource. If the flags are RF_CTRL_INFO, its a struct resctrl_schema. No entry in res_common_files[] has both flags. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-5-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 38 +++++++++++++++++--------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index ca1d97e45de8..fbd292dc55e7 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -848,7 +848,8 @@ static int rdt_last_cmd_status_show(struct kernfs_open_file *of, static int rdt_num_closids_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; struct rdt_hw_resource *hw_res; hw_res = resctrl_to_arch_res(r); @@ -859,7 +860,8 @@ static int rdt_num_closids_show(struct kernfs_open_file *of, static int rdt_default_ctrl_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; seq_printf(seq, "%x\n", r->default_ctrl); return 0; @@ -868,7 +870,8 @@ static int rdt_default_ctrl_show(struct kernfs_open_file *of, static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; seq_printf(seq, "%u\n", r->cache.min_cbm_bits); return 0; @@ -877,7 +880,8 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, static int rdt_shareable_bits_show(struct kernfs_open_file *of, 
struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; seq_printf(seq, "%x\n", r->cache.shareable_bits); return 0; @@ -900,13 +904,14 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of, static int rdt_bit_usage_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; /* * Use unsigned long even though only 32 bits are used to ensure * test_bit() is used safely. */ unsigned long sw_shareable = 0, hw_shareable = 0; unsigned long exclusive = 0, pseudo_locked = 0; + struct rdt_resource *r = s->res; struct rdt_domain *dom; int i, hwb, swb, excl, psl; enum rdtgrp_mode mode; @@ -978,7 +983,8 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, static int rdt_min_bw_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; seq_printf(seq, "%u\n", r->membw.min_bw); return 0; @@ -1009,7 +1015,8 @@ static int rdt_mon_features_show(struct kernfs_open_file *of, static int rdt_bw_gran_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; seq_printf(seq, "%u\n", r->membw.bw_gran); return 0; @@ -1018,7 +1025,8 @@ static int rdt_bw_gran_show(struct kernfs_open_file *of, static int rdt_delay_linear_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; seq_printf(seq, "%u\n", r->membw.delay_linear); return 0; @@ -1038,7 +1046,8 @@ static int max_threshold_occ_show(struct kernfs_open_file *of, static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD) seq_puts(seq, "per-thread\n"); @@ -1771,14 +1780,14 @@ int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, return ret; } -static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, +static int rdtgroup_mkdir_info_resdir(void *priv, char *name, unsigned long fflags) { struct kernfs_node *kn_subdir; int ret; kn_subdir = kernfs_create_dir(kn_info, name, - kn_info->mode, r); + kn_info->mode, priv); if (IS_ERR(kn_subdir)) return PTR_ERR(kn_subdir); @@ -1795,6 +1804,7 @@ static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) { + struct resctrl_schema *s; struct rdt_resource *r; unsigned long fflags; char name[32]; @@ -1809,9 +1819,11 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) if (ret) goto out_destroy; - for_each_alloc_enabled_rdt_resource(r) { + /* loop over enabled controls, these are all alloc_enabled */ + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; fflags = r->fflags | RF_CTRL_INFO; - ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags); + ret = rdtgroup_mkdir_info_resdir(s, r->name, fflags); if (ret) goto out_destroy; } -- Gitee From dd7c6870d79009ef446b8885a245cbf3fbc6cc3a Mon Sep 17 00:00:00 
2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:18 +0000 Subject: [PATCH 1297/2161] x86/resctrl: Label the resources with their configuration type commit 208ab16847c562c0d53a0266b6628ef6cb5ab5c2 upstream. The names of resources are used for the schema name presented to user-space. The name used is rooted in a structure provided by the architecture code because the names are different when CDP is enabled. x86 implements this by swapping between two sets of resource structures based on their alloc_enabled flag. The type of configuration in-use is encoded in the name (and cbm_idx_offset). Once the CDP behaviour is moved into the parts of resctrl that will move to /fs/, there will be two struct resctrl_schema for one struct rdt_resource. The schema describes the type of configuration being applied to the resource. The name of the schema should be generated by resctrl, base on the type of configuration. To do this struct resctrl_schema needs to store the type of configuration in use for a schema. Create an enum resctrl_conf_type describing the options, and add it to struct resctrl_schema. The underlying resources are still separate, as cbm_idx_offset is still in use. Temporarily label all the entries in rdt_resources_all[] and copy that value to struct resctrl_schema. Copying the value ensures there is no mismatch while the filesystem parts of resctrl are modified to use the schema. Once the resources are merged, the filesystem code can assign this value based on the schema being created. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-6-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 7 +++++++ arch/x86/kernel/cpu/resctrl/internal.h | 2 ++ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 1 + include/linux/resctrl.h | 14 ++++++++++++++ 4 files changed, 24 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index f796ec53a63f..9d384bba110a 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -62,6 +62,7 @@ mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_hw_resource rdt_resources_all[] = { [RDT_RESOURCE_L3] = { + .conf_type = CDP_NONE, .r_resctrl = { .rid = RDT_RESOURCE_L3, .name = "L3", @@ -81,6 +82,7 @@ struct rdt_hw_resource rdt_resources_all[] = { }, [RDT_RESOURCE_L3DATA] = { + .conf_type = CDP_DATA, .r_resctrl = { .rid = RDT_RESOURCE_L3DATA, .name = "L3DATA", @@ -100,6 +102,7 @@ struct rdt_hw_resource rdt_resources_all[] = { }, [RDT_RESOURCE_L3CODE] = { + .conf_type = CDP_CODE, .r_resctrl = { .rid = RDT_RESOURCE_L3CODE, .name = "L3CODE", @@ -119,6 +122,7 @@ struct rdt_hw_resource rdt_resources_all[] = { }, [RDT_RESOURCE_L2] = { + .conf_type = CDP_NONE, .r_resctrl = { .rid = RDT_RESOURCE_L2, .name = "L2", @@ -138,6 +142,7 @@ struct rdt_hw_resource rdt_resources_all[] = { }, [RDT_RESOURCE_L2DATA] = { + .conf_type = CDP_DATA, .r_resctrl = { .rid = RDT_RESOURCE_L2DATA, .name = "L2DATA", @@ -157,6 +162,7 @@ struct rdt_hw_resource rdt_resources_all[] = { }, [RDT_RESOURCE_L2CODE] = { + .conf_type = CDP_CODE, .r_resctrl = { .rid = RDT_RESOURCE_L2CODE, .name = "L2CODE", @@ -176,6 +182,7 @@ struct rdt_hw_resource rdt_resources_all[] = { }, [RDT_RESOURCE_MBA] = { + .conf_type = CDP_NONE, .r_resctrl = { .rid = RDT_RESOURCE_MBA, .name = "MB", diff --git a/arch/x86/kernel/cpu/resctrl/internal.h 
b/arch/x86/kernel/cpu/resctrl/internal.h index b3551795a5af..3630ca62d132 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -370,6 +370,7 @@ struct rdt_parse_data { /** * struct rdt_hw_resource - arch private attributes of a resctrl resource + * @conf_type: The type that should be used when configuring. temporary * @r_resctrl: Attributes of the resource used directly by resctrl. * @num_closid: Maximum number of closid this hardware can support. * @msr_base: Base MSR address for CBMs @@ -378,6 +379,7 @@ struct rdt_parse_data { * @mbm_width: Monitor width, to detect and correct for overflow. */ struct rdt_hw_resource { + enum resctrl_conf_type conf_type; struct rdt_resource r_resctrl; int num_closid; unsigned int msr_base; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index fbd292dc55e7..caf9884adf78 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2135,6 +2135,7 @@ static int schemata_list_create(void) return -ENOMEM; s->res = r; + s->conf_type = resctrl_to_arch_res(r)->conf_type; INIT_LIST_HEAD(&s->list); list_add(&s->list, &resctrl_schema_all); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 5a21d483da6a..095ed48168d7 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -15,6 +15,18 @@ int proc_resctrl_show(struct seq_file *m, #endif +/** + * enum resctrl_conf_type - The type of configuration. + * @CDP_NONE: No prioritisation, both code and data are controlled or monitored. + * @CDP_CODE: Configuration applies to instruction fetches. + * @CDP_DATA: Configuration applies to reads and writes. + */ +enum resctrl_conf_type { + CDP_NONE, + CDP_CODE, + CDP_DATA, +}; + /** * struct rdt_domain - group of CPUs sharing a resctrl resource * @list: all instances of this resource @@ -157,11 +169,13 @@ struct rdt_resource { * struct resctrl_schema - configuration abilities of a resource presented to * user-space * @list: Member of resctrl_schema_all. + * @conf_type: Whether this schema is specific to code/data. * @res: The resource structure exported by the architecture to describe * the hardware that is configured by this schema. */ struct resctrl_schema { struct list_head list; + enum resctrl_conf_type conf_type; struct rdt_resource *res; }; #endif /* _RESCTRL_H */ -- Gitee From 34a0f3084d974e8d8bdd9c48c4c74119f5d4c64b Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:19 +0000 Subject: [PATCH 1298/2161] x86/resctrl: Walk the resctrl schema list instead of an arch list commit 331ebe4c43496cdc7f8d9a32d4ef59300b748435 upstream. When parsing a schema configuration value from user-space, resctrl walks the architectures rdt_resources_all[] array to find a matching struct rdt_resource. Once the CDP resources are merged there will be one resource in use by two schemata. Anything walking rdt_resources_all[] on behalf of a user-space request should walk the list of struct resctrl_schema instead. Change the users of for_each_alloc_enabled_rdt_resource() to walk the schema instead. Schemata were only created for alloc_enabled resources so these two lists are currently equivalent. schemata_list_create() and rdt_kill_sb() are ignored. The first creates the schema list, and will eventually loop over the resource indexes using an arch helper to retrieve the resource. rdt_kill_sb() will eventually make use of an arch 'reset everything' helper. 
After the filesystem code is moved, rdtgroup_pseudo_locked_in_hierarchy() remains part of the x86 specific hooks to support pseudo lock. This code walks each domain, and still does this after the separate resources are merged. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-7-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 23 +++++++++++++++-------- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 18 ++++++++++++------ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 08eef539cb6c..405b99d31ef9 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -287,10 +287,12 @@ static int rdtgroup_parse_resource(char *resname, char *tok, struct rdtgroup *rdtgrp) { struct rdt_hw_resource *hw_res; + struct resctrl_schema *s; struct rdt_resource *r; - for_each_alloc_enabled_rdt_resource(r) { - hw_res = resctrl_to_arch_res(r); + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + hw_res = resctrl_to_arch_res(s->res); if (!strcmp(resname, r->name) && rdtgrp->closid < hw_res->num_closid) return parse_line(tok, r, rdtgrp); } @@ -301,6 +303,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { + struct resctrl_schema *s; struct rdtgroup *rdtgrp; struct rdt_domain *dom; struct rdt_resource *r; @@ -331,8 +334,8 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, goto out; } - for_each_alloc_enabled_rdt_resource(r) { - list_for_each_entry(dom, &r->domains, list) + list_for_each_entry(s, &resctrl_schema_all, list) { + list_for_each_entry(dom, &s->res->domains, list) dom->have_new_ctrl = false; } @@ -353,7 +356,8 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, goto out; } - for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; ret = update_domains(r, rdtgrp->closid); if (ret) goto out; @@ -401,6 +405,7 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct rdt_hw_resource *hw_res; + struct resctrl_schema *schema; struct rdtgroup *rdtgrp; struct rdt_resource *r; int ret = 0; @@ -409,8 +414,10 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, rdtgrp = rdtgroup_kn_lock_live(of->kn); if (rdtgrp) { if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - for_each_alloc_enabled_rdt_resource(r) + list_for_each_entry(schema, &resctrl_schema_all, list) { + r = schema->res; seq_printf(s, "%s:uninitialized\n", r->name); + } } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { if (!rdtgrp->plr->d) { rdt_last_cmd_clear(); @@ -424,8 +431,8 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, } } else { closid = rdtgrp->closid; - for_each_alloc_enabled_rdt_resource(r) { - hw_res = resctrl_to_arch_res(r); + list_for_each_entry(schema, &resctrl_schema_all, list) { + hw_res = resctrl_to_arch_res(schema->res); if (closid < hw_res->num_closid) show_doms(s, r, closid); } diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index caf9884adf78..c2032aa0ca13 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -104,12 +104,12 @@ int 
closids_supported(void) static void closid_init(void) { struct rdt_hw_resource *hw_res; - struct rdt_resource *r; + struct resctrl_schema *s; int rdt_min_closid = 32; /* Compute rdt_min_closid across all resources */ - for_each_alloc_enabled_rdt_resource(r) { - hw_res = resctrl_to_arch_res(r); + list_for_each_entry(s, &resctrl_schema_all, list) { + hw_res = resctrl_to_arch_res(s->res); rdt_min_closid = min(rdt_min_closid, hw_res->num_closid); } @@ -1276,11 +1276,13 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) { struct rdt_hw_domain *hw_dom; int closid = rdtgrp->closid; + struct resctrl_schema *s; struct rdt_resource *r; bool has_cache = false; struct rdt_domain *d; - for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; if (r->rid == RDT_RESOURCE_MBA) continue; has_cache = true; @@ -1418,6 +1420,7 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, static int rdtgroup_size_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { + struct resctrl_schema *schema; struct rdt_hw_domain *hw_dom; struct rdtgroup *rdtgrp; struct rdt_resource *r; @@ -1449,7 +1452,8 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, goto out; } - for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(schema, &resctrl_schema_all, list) { + r = schema->res; sep = false; seq_printf(s, "%*s:", max_name_width, r->name); list_for_each_entry(d, &r->domains, list) { @@ -2820,10 +2824,12 @@ static void rdtgroup_init_mba(struct rdt_resource *r) /* Initialize the RDT group's allocations. */ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) { + struct resctrl_schema *s; struct rdt_resource *r; int ret; - for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; if (r->rid == RDT_RESOURCE_MBA) { rdtgroup_init_mba(r); } else { -- Gitee From aed16e023ca0d27b7cf689336155464333eb04c3 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:20 +0000 Subject: [PATCH 1299/2161] x86/resctrl: Store the effective num_closid in the schema commit 3183e87c1b797caaeb208b01c99bea8140273a16 upstream. Struct resctrl_schema holds properties that vary with the style of configuration that resctrl applies to a resource. There are already two values for the hardware's num_closid, depending on whether the architecture presents the L3 or L3CODE/L3DATA resources. As the way CDP changes the number of control groups that resctrl can create is part of the user-space interface, it should be managed by the filesystem parts of resctrl. This allows the architecture code to only describe the value the hardware supports. Add num_closid to resctrl_schema. This is the value seen by the filesystem, which may be different to the maximum value described by the arch code when CDP is enabled. These functions operate on the num_closid value that is exposed to user-space: * rdtgroup_parse_resource() * rdtgroup_schemata_show() * rdt_num_closids_show() * closid_init() Change them to use the schema value instead. schemata_list_create() sets this value, and reaches into the architecture-specific structure to get the value. This will eventually be replaced with a helper. 
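The "effective" value can be pictured with a little arithmetic. The helper below is a made-up userspace illustration of the correction that later lands in schemata_list_create() once the CDP resources are merged; the halving is an assumption drawn from CDP consuming CLOSIDs in code/data pairs:

#include <stdbool.h>
#include <stdio.h>

/* illustrative only: the schema-visible closid count vs the hardware count */
static unsigned int schema_num_closid(unsigned int hw_num_closid, bool cdp_enabled)
{
	/* with CDP every control group consumes a code and a data CLOSID */
	return cdp_enabled ? hw_num_closid / 2 : hw_num_closid;
}

int main(void)
{
	printf("no CDP:   %u control groups\n", schema_num_closid(16, false));
	printf("with CDP: %u control groups\n", schema_num_closid(16, true));
	return 0;
}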
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-8-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 9 +++------ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 13 ++++--------- include/linux/resctrl.h | 4 ++++ 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 405b99d31ef9..d10fddaef5f4 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -286,14 +286,12 @@ int update_domains(struct rdt_resource *r, int closid) static int rdtgroup_parse_resource(char *resname, char *tok, struct rdtgroup *rdtgrp) { - struct rdt_hw_resource *hw_res; struct resctrl_schema *s; struct rdt_resource *r; list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - hw_res = resctrl_to_arch_res(s->res); - if (!strcmp(resname, r->name) && rdtgrp->closid < hw_res->num_closid) + if (!strcmp(resname, r->name) && rdtgrp->closid < s->num_closid) return parse_line(tok, r, rdtgrp); } rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); @@ -404,7 +402,6 @@ static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { - struct rdt_hw_resource *hw_res; struct resctrl_schema *schema; struct rdtgroup *rdtgrp; struct rdt_resource *r; @@ -432,8 +429,8 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, } else { closid = rdtgrp->closid; list_for_each_entry(schema, &resctrl_schema_all, list) { - hw_res = resctrl_to_arch_res(schema->res); - if (closid < hw_res->num_closid) + r = schema->res; + if (closid < schema->num_closid) show_doms(s, r, closid); } } diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index c2032aa0ca13..347d52c26c72 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -103,15 +103,12 @@ int closids_supported(void) static void closid_init(void) { - struct rdt_hw_resource *hw_res; struct resctrl_schema *s; int rdt_min_closid = 32; /* Compute rdt_min_closid across all resources */ - list_for_each_entry(s, &resctrl_schema_all, list) { - hw_res = resctrl_to_arch_res(s->res); - rdt_min_closid = min(rdt_min_closid, hw_res->num_closid); - } + list_for_each_entry(s, &resctrl_schema_all, list) + rdt_min_closid = min(rdt_min_closid, s->num_closid); closid_free_map = BIT_MASK(rdt_min_closid) - 1; @@ -849,11 +846,8 @@ static int rdt_num_closids_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct resctrl_schema *s = of->kn->parent->priv; - struct rdt_resource *r = s->res; - struct rdt_hw_resource *hw_res; - hw_res = resctrl_to_arch_res(r); - seq_printf(seq, "%d\n", hw_res->num_closid); + seq_printf(seq, "%u\n", s->num_closid); return 0; } @@ -2140,6 +2134,7 @@ static int schemata_list_create(void) s->res = r; s->conf_type = resctrl_to_arch_res(r)->conf_type; + s->num_closid = resctrl_to_arch_res(r)->num_closid; INIT_LIST_HEAD(&s->list); list_add(&s->list, &resctrl_schema_all); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 095ed48168d7..59d0fa78bb69 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -172,10 +172,14 @@ struct rdt_resource { * @conf_type: Whether this schema is specific to 
code/data. * @res: The resource structure exported by the architecture to describe * the hardware that is configured by this schema. + * @num_closid: The number of closid that can be used with this schema. When + * features like CDP are enabled, this will be lower than the + * hardware supports for the resource. */ struct resctrl_schema { struct list_head list; enum resctrl_conf_type conf_type; struct rdt_resource *res; + int num_closid; }; #endif /* _RESCTRL_H */ -- Gitee From 28f21186f8fac938e5135568bc78cc83c985aa7d Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:21 +0000 Subject: [PATCH 1300/2161] x86/resctrl: Add resctrl_arch_get_num_closid() commit eb6f3187694158ca36e50083e861531488d5c1b1 upstream. To initialise struct resctrl_schema's num_closid, schemata_list_create() reaches into the architectures private structure to retrieve num_closid from the struct rdt_hw_resource. The 'half the closids' behaviour should be part of the filesystem parts of resctrl that are the same on any architecture. struct resctrl_schema's num_closid should include any correction for CDP. Having two properties called num_closid is likely to be confusing when they have different values. Add a helper to read the resource's num_closid from the arch code. This should return the number of closid that the resource supports, regardless of whether CDP is in use. Once the CDP resources are merged, schemata_list_create() can apply the correction itself. Using a type with an obvious size for the arch helper means changing the type of num_closid to u32, which matches the type already used by struct rdtgroup. reset_all_ctrls() does not use resctrl_arch_get_num_closid(), even though it sets up a structure for modifying the hardware. This function will be part of the architecture code, the maximum closid should be the maximum value the hardware has, regardless of the way resctrl is using it. All the uses of num_closid in core.c are naturally part of the architecture specific code. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-9-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 5 +++++ arch/x86/kernel/cpu/resctrl/internal.h | 8 ++++++-- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 4 ++-- include/linux/resctrl.h | 6 +++++- 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 9d384bba110a..933ea7cf606b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -444,6 +444,11 @@ struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) return NULL; } +u32 resctrl_arch_get_num_closid(struct rdt_resource *r) +{ + return resctrl_to_arch_res(r)->num_closid; +} + void rdt_ctrl_update(void *arg) { struct msr_param *m = arg; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 3630ca62d132..d11b827c4d0e 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -372,7 +372,11 @@ struct rdt_parse_data { * struct rdt_hw_resource - arch private attributes of a resctrl resource * @conf_type: The type that should be used when configuring. temporary * @r_resctrl: Attributes of the resource used directly by resctrl. - * @num_closid: Maximum number of closid this hardware can support. 
+ * @num_closid: Maximum number of closid this hardware can support, + * regardless of CDP. This is exposed via + * resctrl_arch_get_num_closid() to avoid confusion + * with struct resctrl_schema's property of the same name, + * which has been corrected for features like CDP. * @msr_base: Base MSR address for CBMs * @msr_update: Function pointer to update QOS MSRs * @mon_scale: cqm counter * mon_scale = occupancy in bytes @@ -381,7 +385,7 @@ struct rdt_parse_data { struct rdt_hw_resource { enum resctrl_conf_type conf_type; struct rdt_resource r_resctrl; - int num_closid; + u32 num_closid; unsigned int msr_base; void (*msr_update) (struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 347d52c26c72..ea6b42272d5e 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -104,7 +104,7 @@ int closids_supported(void) static void closid_init(void) { struct resctrl_schema *s; - int rdt_min_closid = 32; + u32 rdt_min_closid = 32; /* Compute rdt_min_closid across all resources */ list_for_each_entry(s, &resctrl_schema_all, list) @@ -2134,7 +2134,7 @@ static int schemata_list_create(void) s->res = r; s->conf_type = resctrl_to_arch_res(r)->conf_type; - s->num_closid = resctrl_to_arch_res(r)->num_closid; + s->num_closid = resctrl_arch_get_num_closid(r); INIT_LIST_HEAD(&s->list); list_add(&s->list, &resctrl_schema_all); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 59d0fa78bb69..b9d200592e54 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -180,6 +180,10 @@ struct resctrl_schema { struct list_head list; enum resctrl_conf_type conf_type; struct rdt_resource *res; - int num_closid; + u32 num_closid; }; + +/* The number of closid supported by this resource regardless of CDP */ +u32 resctrl_arch_get_num_closid(struct rdt_resource *r); + #endif /* _RESCTRL_H */ -- Gitee From f81900f6ea2ff460e0fccd79e61e5194dcb4f3b7 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:22 +0000 Subject: [PATCH 1301/2161] x86/resctrl: Pass the schema to resctrl filesystem functions commit 1c290682c0c9c47aa7594ffc83b9cedd20c1ec87 upstream. Once the CDP resources are merged, there will be two struct resctrl_schema for one struct rdt_resource. CDP becomes a type of configuration that belongs to the schema. Helpers like rdtgroup_cbm_overlaps() need access to the schema to query the configuration (or configurations) based on schema properties. Change these functions to take a struct schema instead of the struct rdt_resource. All the modified functions are part of the filesystem code that will move to /fs/resctrl once it is possible to support a second architecture. 
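The shape of the change can be summarised with a toy example: a filesystem helper that used to take the resource now takes the schema and derives the resource from it, so per-schema properties travel with the call. Types and names below are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

struct resource {
	const char *name;
};

struct schema {
	struct resource *res;		/* hardware resource behind this schema */
	unsigned int num_closid;	/* effective limit seen by user-space */
};

/* was: closid_in_range(struct resource *r, ...) */
static bool closid_in_range(struct schema *s, unsigned int closid)
{
	return closid < s->num_closid;	/* a schema property, not a hardware one */
}

int main(void)
{
	struct resource l3 = { "L3" };
	struct schema s = { .res = &l3, .num_closid = 8 };

	printf("%s: closid 4 ok=%d, closid 12 ok=%d\n",
	       s.res->name, closid_in_range(&s, 4), closid_in_range(&s, 12));
	return 0;
}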
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-10-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 23 +++++++++++++---------- arch/x86/kernel/cpu/resctrl/internal.h | 6 +++--- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 19 +++++++++++-------- include/linux/resctrl.h | 3 ++- 4 files changed, 29 insertions(+), 22 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index d10fddaef5f4..219b057e65b0 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -57,9 +57,10 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return true; } -int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, +int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_domain *d) { + struct rdt_resource *r = s->res; unsigned long bw_val; if (d->have_new_ctrl) { @@ -125,10 +126,11 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) * Read one cache bit mask (hex). Check that it is valid for the current * resource type. */ -int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, +int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_domain *d) { struct rdtgroup *rdtgrp = data->rdtgrp; + struct rdt_resource *r = s->res; u32 cbm_val; if (d->have_new_ctrl) { @@ -160,12 +162,12 @@ int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, * The CBM may not overlap with the CBM of another closid if * either is exclusive. */ - if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, true)) { + if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { rdt_last_cmd_puts("Overlaps with exclusive group\n"); return -EINVAL; } - if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, false)) { + if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { rdt_last_cmd_puts("Overlaps with other group\n"); @@ -185,9 +187,10 @@ int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, * separated by ";". The "id" is in decimal, and must match one of * the "id"s for this resource. 
*/ -static int parse_line(char *line, struct rdt_resource *r, +static int parse_line(char *line, struct resctrl_schema *s, struct rdtgroup *rdtgrp) { + struct rdt_resource *r = s->res; struct rdt_parse_data data; char *dom = NULL, *id; struct rdt_domain *d; @@ -213,7 +216,7 @@ static int parse_line(char *line, struct rdt_resource *r, if (d->id == dom_id) { data.buf = dom; data.rdtgrp = rdtgrp; - if (r->parse_ctrlval(&data, r, d)) + if (r->parse_ctrlval(&data, s, d)) return -EINVAL; if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { /* @@ -292,7 +295,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; if (!strcmp(resname, r->name) && rdtgrp->closid < s->num_closid) - return parse_line(tok, r, rdtgrp); + return parse_line(tok, s, rdtgrp); } rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); return -EINVAL; @@ -377,8 +380,9 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, return ret ?: nbytes; } -static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) +static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) { + struct rdt_resource *r = schema->res; struct rdt_hw_domain *hw_dom; struct rdt_domain *dom; bool sep = false; @@ -429,9 +433,8 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, } else { closid = rdtgrp->closid; list_for_each_entry(schema, &resctrl_schema_all, list) { - r = schema->res; if (closid < schema->num_closid) - show_doms(s, r, closid); + show_doms(s, schema, closid); } } } else { diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index d11b827c4d0e..c83188eec105 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -398,9 +398,9 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r return container_of(r, struct rdt_hw_resource, r_resctrl); } -int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, +int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_domain *d); -int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, +int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_domain *d); extern struct mutex rdtgroup_mutex; @@ -502,7 +502,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v); -bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, unsigned long cbm, int closid, bool exclusive); unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d, unsigned long cbm); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index ea6b42272d5e..b7a68a536e6c 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1221,7 +1221,7 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d /** * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware - * @r: Resource to which domain instance @d belongs. + * @s: Schema for the resource to which domain instance @d belongs. * @d: The domain instance for which @closid is being tested. * @cbm: Capacity bitmask being tested. * @closid: Intended closid for @cbm. 
@@ -1239,9 +1239,10 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d * * Return: true if CBM overlap detected, false if there is no overlap */ -bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, unsigned long cbm, int closid, bool exclusive) { + struct rdt_resource *r = s->res; struct rdt_resource *r_cdp; struct rdt_domain *d_cdp; @@ -1282,7 +1283,8 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) has_cache = true; list_for_each_entry(d, &r->domains, list) { hw_dom = resctrl_to_arch_dom(d); - if (rdtgroup_cbm_overlaps(r, d, hw_dom->ctrl_val[closid], + if (rdtgroup_cbm_overlaps(s, d, + hw_dom->ctrl_val[closid], rdtgrp->closid, false)) { rdt_last_cmd_puts("Schemata overlaps\n"); return false; @@ -2717,11 +2719,12 @@ static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) * Set the RDT domain up to start off with all usable allocations. That is, * all shareable and unused bits. All-zero CBM is invalid. */ -static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, +static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, u32 closid) { struct rdt_resource *r_cdp = NULL; struct rdt_domain *d_cdp = NULL; + struct rdt_resource *r = s->res; u32 used_b = 0, unused_b = 0; unsigned long tmp_cbm; enum rdtgrp_mode mode; @@ -2791,13 +2794,13 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, * If there are no more shareable bits available on any domain then * the entire allocation will fail. */ -static int rdtgroup_init_cat(struct rdt_resource *r, u32 closid) +static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) { struct rdt_domain *d; int ret; - list_for_each_entry(d, &r->domains, list) { - ret = __init_one_rdt_domain(d, r, closid); + list_for_each_entry(d, &s->res->domains, list) { + ret = __init_one_rdt_domain(d, s, closid); if (ret < 0) return ret; } @@ -2828,7 +2831,7 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) if (r->rid == RDT_RESOURCE_MBA) { rdtgroup_init_mba(r); } else { - ret = rdtgroup_init_cat(r, rdtgrp->closid); + ret = rdtgroup_init_cat(s, rdtgrp->closid); if (ret < 0) return ret; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index b9d200592e54..979592c869e6 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -121,6 +121,7 @@ struct resctrl_membw { }; struct rdt_parse_data; +struct resctrl_schema; /** * struct rdt_resource - attributes of a resctrl resource @@ -158,7 +159,7 @@ struct rdt_resource { u32 default_ctrl; const char *format_str; int (*parse_ctrlval)(struct rdt_parse_data *data, - struct rdt_resource *r, + struct resctrl_schema *s, struct rdt_domain *d); struct list_head evt_list; unsigned long fflags; -- Gitee From 7d108cd68d0696df4080f14c183880615d68d304 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:23 +0000 Subject: [PATCH 1302/2161] x86/resctrl: Swizzle rdt_resource and resctrl_schema in pseudo_lock_region commit 32150edd3fcf6ee002668878e0b010d402db29b2 upstream. struct pseudo_lock_region points to the rdt_resource. Once the resources are merged, this won't be unique. The resource name is moving into the schema, so that the filesystem portions of resctrl can generate it. Swap pseudo_lock_region's rdt_resource pointer for a schema pointer. 
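The swizzle is just one extra level of indirection; a toy version of the before/after access path, with illustrative structs only:

#include <stdio.h>

struct resource {
	unsigned int cbm_len;
};

struct schema {
	struct resource *res;
};

struct lock_region {
	struct schema *s;	/* was: struct resource *r */
	unsigned int cbm;
};

int main(void)
{
	struct resource l2 = { .cbm_len = 8 };
	struct schema sch = { .res = &l2 };
	struct lock_region plr = { .s = &sch, .cbm = 0x0f };

	/* before: plr.r->cbm_len; after: plr.s->res->cbm_len */
	printf("cbm 0x%x over %u bits\n", plr.cbm, plr.s->res->cbm_len);
	return 0;
}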
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-11-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 4 ++-- arch/x86/kernel/cpu/resctrl/internal.h | 6 +++--- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 8 ++++---- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 219b057e65b0..0ee1ded5b8ff 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -227,7 +227,7 @@ static int parse_line(char *line, struct resctrl_schema *s, * the required initialization for single * region and return. */ - rdtgrp->plr->r = r; + rdtgrp->plr->s = s; rdtgrp->plr->d = d; rdtgrp->plr->cbm = d->new_ctrl; d->plr = rdtgrp->plr; @@ -426,7 +426,7 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, ret = -ENODEV; } else { seq_printf(s, "%s:%d=%x\n", - rdtgrp->plr->r->name, + rdtgrp->plr->s->res->name, rdtgrp->plr->d->id, rdtgrp->plr->cbm); } diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index c83188eec105..abc00d5a04dd 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -164,8 +164,8 @@ struct mongroup { /** * struct pseudo_lock_region - pseudo-lock region information - * @r: RDT resource to which this pseudo-locked region - * belongs + * @s: Resctrl schema for the resource to which this + * pseudo-locked region belongs * @d: RDT domain to which this pseudo-locked region * belongs * @cbm: bitmask of the pseudo-locked region @@ -185,7 +185,7 @@ struct mongroup { * @pm_reqs: Power management QoS requests related to this region */ struct pseudo_lock_region { - struct rdt_resource *r; + struct resctrl_schema *s; struct rdt_domain *d; u32 cbm; wait_queue_head_t lock_thread_wq; diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 3cbfa37d6f0d..a50b2da3739b 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -250,7 +250,7 @@ static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) plr->line_size = 0; kfree(plr->kmem); plr->kmem = NULL; - plr->r = NULL; + plr->s = NULL; if (plr->d) plr->d->plr = NULL; plr->d = NULL; @@ -294,10 +294,10 @@ static int pseudo_lock_region_init(struct pseudo_lock_region *plr) ci = get_cpu_cacheinfo(plr->cpu); - plr->size = rdtgroup_cbm_to_size(plr->r, plr->d, plr->cbm); + plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); for (i = 0; i < ci->num_leaves; i++) { - if (ci->info_list[i].level == plr->r->cache_level) { + if (ci->info_list[i].level == plr->s->res->cache_level) { plr->line_size = ci->info_list[i].coherency_line_size; return 0; } @@ -800,7 +800,7 @@ bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm unsigned long cbm_b; if (d->plr) { - cbm_len = d->plr->r->cache.cbm_len; + cbm_len = d->plr->s->res->cache.cbm_len; cbm_b = d->plr->cbm; if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) return true; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index b7a68a536e6c..5a107f953321 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1439,8 +1439,8 @@ 
static int rdtgroup_size_show(struct kernfs_open_file *of, ret = -ENODEV; } else { seq_printf(s, "%*s:", max_name_width, - rdtgrp->plr->r->name); - size = rdtgroup_cbm_to_size(rdtgrp->plr->r, + rdtgrp->plr->s->res->name); + size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, rdtgrp->plr->d, rdtgrp->plr->cbm); seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size); -- Gitee From ffa7f5c7aaa7febdd399041fafa2cec621d20983 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:24 +0000 Subject: [PATCH 1303/2161] x86/resctrl: Add a helper to read/set the CDP configuration commit c091e90721b836c2367fa3017636d92427f3f8f7 upstream. Whether CDP is enabled for a hardware resource like the L3 cache can be found by inspecting the alloc_enabled flags of the L3CODE/L3DATA struct rdt_hw_resources, even if they aren't in use. Once these resources are merged, the flags can't be compared. Whether CDP is enabled needs tracking explicitly. If another architecture is emulating CDP the behaviour may not be per-resource. 'cdp_capable' needs to be visible to resctrl, even if its not in use, as this affects the padding of the schemata table visible to user-space. Add cdp_enabled to struct rdt_hw_resource and cdp_capable to struct rdt_resource. Add resctrl_arch_set_cdp_enabled() to let resctrl enable or disable CDP on a resource. resctrl_arch_get_cdp_enabled() lets it read the current state. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-12-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 4 ++ arch/x86/kernel/cpu/resctrl/internal.h | 11 +++- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 4 +- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 75 +++++++++++++---------- include/linux/resctrl.h | 3 +- 5 files changed, 62 insertions(+), 35 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 933ea7cf606b..af8a2ec216ba 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -368,6 +368,10 @@ static void rdt_get_cdp_config(int level, int type) * "cdp" during resctrl file system mount time. */ r->alloc_enabled = false; + rdt_resources_all[level].cdp_enabled = false; + rdt_resources_all[type].cdp_enabled = false; + r_l->cdp_capable = true; + r->cdp_capable = true; } static void rdt_get_cdp_l3_config(void) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index abc00d5a04dd..3069a38516ef 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -381,6 +381,7 @@ struct rdt_parse_data { * @msr_update: Function pointer to update QOS MSRs * @mon_scale: cqm counter * mon_scale = occupancy in bytes * @mbm_width: Monitor width, to detect and correct for overflow. 
+ * @cdp_enabled: CDP state of this resource */ struct rdt_hw_resource { enum resctrl_conf_type conf_type; @@ -391,6 +392,7 @@ struct rdt_hw_resource { struct rdt_resource *r); unsigned int mon_scale; unsigned int mbm_width; + bool cdp_enabled; }; static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r) @@ -411,7 +413,7 @@ DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); extern struct dentry *debugfs_resctrl; -enum { +enum resctrl_res_level { RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE, @@ -432,6 +434,13 @@ static inline struct rdt_resource *resctrl_inc(struct rdt_resource *res) return &hw_res->r_resctrl; } +static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) +{ + return rdt_resources_all[l].cdp_enabled; +} + +int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); + /* * To return the common struct rdt_resource, which is contained in struct * rdt_hw_resource, walk the resctrl member of struct rdt_hw_resource. diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index a50b2da3739b..730b5d4b3f7b 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -688,8 +688,8 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) * resource, the portion of cache used by it should be made * unavailable to all future allocations from both resources. */ - if (rdt_resources_all[RDT_RESOURCE_L3DATA].r_resctrl.alloc_enabled || - rdt_resources_all[RDT_RESOURCE_L2DATA].r_resctrl.alloc_enabled) { + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) || + resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) { rdt_last_cmd_puts("CDP enabled\n"); return -EINVAL; } diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 5a107f953321..5413a48b89ae 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1940,14 +1940,16 @@ static int set_cache_qos_cfg(int level, bool enable) /* Restore the qos cfg state when a domain comes online */ void rdt_domain_reconfigure_cdp(struct rdt_resource *r) { - if (!r->alloc_capable) + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + if (!r->cdp_capable) return; if (r == &rdt_resources_all[RDT_RESOURCE_L2DATA].r_resctrl) - l2_qos_cfg_update(&r->alloc_enabled); + l2_qos_cfg_update(&hw_res->cdp_enabled); if (r == &rdt_resources_all[RDT_RESOURCE_L3DATA].r_resctrl) - l3_qos_cfg_update(&r->alloc_enabled); + l3_qos_cfg_update(&hw_res->cdp_enabled); } /* @@ -1991,51 +1993,62 @@ static int cdp_enable(int level, int data_type, int code_type) r_l->alloc_enabled = false; r_ldata->alloc_enabled = true; r_lcode->alloc_enabled = true; + rdt_resources_all[level].cdp_enabled = true; + rdt_resources_all[data_type].cdp_enabled = true; + rdt_resources_all[code_type].cdp_enabled = true; } return ret; } -static int cdpl3_enable(void) -{ - return cdp_enable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, - RDT_RESOURCE_L3CODE); -} - -static int cdpl2_enable(void) -{ - return cdp_enable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, - RDT_RESOURCE_L2CODE); -} - static void cdp_disable(int level, int data_type, int code_type) { - struct rdt_resource *r = &rdt_resources_all[level].r_resctrl; + struct rdt_hw_resource *r_hw = &rdt_resources_all[level]; + struct rdt_resource *r = &r_hw->r_resctrl; r->alloc_enabled = r->alloc_capable; - if (rdt_resources_all[data_type].r_resctrl.alloc_enabled) { + if (r_hw->cdp_enabled) { 
rdt_resources_all[data_type].r_resctrl.alloc_enabled = false; rdt_resources_all[code_type].r_resctrl.alloc_enabled = false; set_cache_qos_cfg(level, false); + r_hw->cdp_enabled = false; + rdt_resources_all[data_type].cdp_enabled = false; + rdt_resources_all[code_type].cdp_enabled = false; } } -static void cdpl3_disable(void) +int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable) { - cdp_disable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE); -} + struct rdt_hw_resource *hw_res = &rdt_resources_all[l]; + enum resctrl_res_level code_type, data_type; -static void cdpl2_disable(void) -{ - cdp_disable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, RDT_RESOURCE_L2CODE); + if (!hw_res->r_resctrl.cdp_capable) + return -EINVAL; + + if (l == RDT_RESOURCE_L3) { + code_type = RDT_RESOURCE_L3CODE; + data_type = RDT_RESOURCE_L3DATA; + } else if (l == RDT_RESOURCE_L2) { + code_type = RDT_RESOURCE_L2CODE; + data_type = RDT_RESOURCE_L2DATA; + } else { + return -EINVAL; + } + + if (enable) + return cdp_enable(l, data_type, code_type); + + cdp_disable(l, data_type, code_type); + + return 0; } static void cdp_disable_all(void) { - if (rdt_resources_all[RDT_RESOURCE_L3DATA].r_resctrl.alloc_enabled) - cdpl3_disable(); - if (rdt_resources_all[RDT_RESOURCE_L2DATA].r_resctrl.alloc_enabled) - cdpl2_disable(); + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); } /* @@ -2113,10 +2126,10 @@ static int rdt_enable_ctx(struct rdt_fs_context *ctx) int ret = 0; if (ctx->enable_cdpl2) - ret = cdpl2_enable(); + ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); if (!ret && ctx->enable_cdpl3) - ret = cdpl3_enable(); + ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); if (!ret && ctx->enable_mba_mbps) ret = set_mba_sc(true); @@ -3204,10 +3217,10 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) { - if (rdt_resources_all[RDT_RESOURCE_L3DATA].r_resctrl.alloc_enabled) + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) seq_puts(seq, ",cdp"); - if (rdt_resources_all[RDT_RESOURCE_L2DATA].r_resctrl.alloc_enabled) + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) seq_puts(seq, ",cdpl2"); if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl)) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 979592c869e6..4b30571fbc8e 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -142,6 +142,7 @@ struct resctrl_schema; * @parse_ctrlval: Per resource function pointer to parse control values * @evt_list: List of monitoring events * @fflags: flags to choose base and info files + * @cdp_capable: Is the CDP feature available on this resource */ struct rdt_resource { int rid; @@ -163,7 +164,7 @@ struct rdt_resource { struct rdt_domain *d); struct list_head evt_list; unsigned long fflags; - + bool cdp_capable; }; /** -- Gitee From de29b6e978386accedc1e8415ddbdd818cf4101a Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:25 +0000 Subject: [PATCH 1304/2161] x86/resctrl: Move the schemata names into struct resctrl_schema commit e198fde3fe0892a5d1e28c0e29f1eebfb6f8c1cd upstream. resctrl 'info' directories and schema parsing use the schema name. This lives in the struct rdt_resource, and is specified by the architecture code. 
Once the CDP resources are merged, there will only be one resource (and one name) in use by two schemata. To allow the CDP CODE/DATA property to be the type of configuration the schema uses, the name should also be per-schema. Add a name field to struct resctrl_schema, and use this wherever the schema name is exposed (or read from) user-space. Calculating max_name_width for padding the schemata file also moves as this is visible to user-space. As the names in struct rdt_resource already include the CDP information, schemata_list_create() copies them. schemata_list_create() includes the length of the CDP suffix when calculating max_name_width in preparation for CDP resources being merged. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-13-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 5 ---- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 10 +++----- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 29 +++++++++++++++++++---- include/linux/resctrl.h | 2 ++ 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index af8a2ec216ba..b2bf097c1ecb 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -776,13 +776,8 @@ static int resctrl_offline_cpu(unsigned int cpu) static __init void rdt_init_padding(void) { struct rdt_resource *r; - int cl; for_each_alloc_capable_rdt_resource(r) { - cl = strlen(r->name); - if (cl > max_name_width) - max_name_width = cl; - if (r->data_width > max_data_width) max_data_width = r->data_width; } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 0ee1ded5b8ff..104b285f8a60 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -290,11 +290,9 @@ static int rdtgroup_parse_resource(char *resname, char *tok, struct rdtgroup *rdtgrp) { struct resctrl_schema *s; - struct rdt_resource *r; list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - if (!strcmp(resname, r->name) && rdtgrp->closid < s->num_closid) + if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid) return parse_line(tok, s, rdtgrp); } rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); @@ -388,7 +386,7 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo bool sep = false; u32 ctrl_val; - seq_printf(s, "%*s:", max_name_width, r->name); + seq_printf(s, "%*s:", max_name_width, schema->name); list_for_each_entry(dom, &r->domains, list) { hw_dom = resctrl_to_arch_dom(dom); if (sep) @@ -408,7 +406,6 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, { struct resctrl_schema *schema; struct rdtgroup *rdtgrp; - struct rdt_resource *r; int ret = 0; u32 closid; @@ -416,8 +413,7 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, if (rdtgrp) { if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { list_for_each_entry(schema, &resctrl_schema_all, list) { - r = schema->res; - seq_printf(s, "%s:uninitialized\n", r->name); + seq_printf(s, "%s:uninitialized\n", schema->name); } } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { if (!rdtgrp->plr->d) { diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 5413a48b89ae..921f26a6b690 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ 
b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1439,7 +1439,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, ret = -ENODEV; } else { seq_printf(s, "%*s:", max_name_width, - rdtgrp->plr->s->res->name); + rdtgrp->plr->s->name); size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, rdtgrp->plr->d, rdtgrp->plr->cbm); @@ -1451,7 +1451,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, list_for_each_entry(schema, &resctrl_schema_all, list) { r = schema->res; sep = false; - seq_printf(s, "%*s:", max_name_width, r->name); + seq_printf(s, "%*s:", max_name_width, schema->name); list_for_each_entry(d, &r->domains, list) { hw_dom = resctrl_to_arch_dom(d); if (sep) @@ -1823,7 +1823,7 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; fflags = r->fflags | RF_CTRL_INFO; - ret = rdtgroup_mkdir_info_resdir(s, r->name, fflags); + ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); if (ret) goto out_destroy; } @@ -2141,6 +2141,7 @@ static int schemata_list_create(void) { struct resctrl_schema *s; struct rdt_resource *r; + int ret, cl; for_each_alloc_enabled_rdt_resource(r) { s = kzalloc(sizeof(*s), GFP_KERNEL); @@ -2151,6 +2152,26 @@ static int schemata_list_create(void) s->conf_type = resctrl_to_arch_res(r)->conf_type; s->num_closid = resctrl_arch_get_num_closid(r); + ret = snprintf(s->name, sizeof(s->name), r->name); + if (ret >= sizeof(s->name)) { + kfree(s); + return -EINVAL; + } + + cl = strlen(s->name); + + /* + * If CDP is supported by this resource, but not enabled, + * include the suffix. This ensures the tabular format of the + * schemata file does not change between mounts of the + * filesystem. + */ + if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) + cl += 4; + + if (cl > max_name_width) + max_name_width = cl; + INIT_LIST_HEAD(&s->list); list_add(&s->list, &resctrl_schema_all); } @@ -2789,7 +2810,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, */ tmp_cbm = d->new_ctrl; if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { - rdt_last_cmd_printf("No space on %s:%d\n", r->name, d->id); + rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->id); return -ENOSPC; } d->have_new_ctrl = true; diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 4b30571fbc8e..e482ce790ce2 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -171,6 +171,7 @@ struct rdt_resource { * struct resctrl_schema - configuration abilities of a resource presented to * user-space * @list: Member of resctrl_schema_all. + * @name: The name to use in the "schemata" file. * @conf_type: Whether this schema is specific to code/data. * @res: The resource structure exported by the architecture to describe * the hardware that is configured by this schema. @@ -180,6 +181,7 @@ struct rdt_resource { */ struct resctrl_schema { struct list_head list; + char name[8]; enum resctrl_conf_type conf_type; struct rdt_resource *res; u32 num_closid; -- Gitee From 0bbbba5332911fef20932b3a1f1c9011f6c17927 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:26 +0000 Subject: [PATCH 1305/2161] x86/resctrl: Group staged configuration into a separate struct commit e8f7282552b902af3bd1f07a87d657b7f5f12ab8 upstream. When configuration changes are made, the new value is written to struct rdt_domain's new_ctrl field and the have_new_ctrl flag is set. Later new_ctrl is copied to hardware by a call to update_domains(). 
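As a rough stand-alone illustration of this stage-then-apply flow (simplified, made-up types: staged_cfg and fake_msr are stand-ins, not the kernel's structures, and the snippet is only meant to compile on its own):

#include <stdbool.h>
#include <stdio.h>

struct staged_cfg {
        unsigned int new_ctrl;  /* value parsed from a schemata string */
        bool have_new_ctrl;     /* set once the value has been validated */
};

static unsigned int fake_msr[4];        /* stands in for per-closid MSRs */

static int stage_value(struct staged_cfg *cfg, unsigned int val)
{
        if (cfg->have_new_ctrl)
                return -1;      /* same domain staged twice in one write */
        cfg->new_ctrl = val;
        cfg->have_new_ctrl = true;
        return 0;
}

static void apply_staged(struct staged_cfg *cfg, unsigned int closid)
{
        /* only touch the "hardware" when a new value was staged */
        if (cfg->have_new_ctrl && fake_msr[closid] != cfg->new_ctrl)
                fake_msr[closid] = cfg->new_ctrl;
        cfg->have_new_ctrl = false;
}

int main(void)
{
        struct staged_cfg cfg = { 0 };

        stage_value(&cfg, 0x3f);
        apply_staged(&cfg, 1);
        printf("closid 1 now holds 0x%x\n", fake_msr[1]);
        return 0;
}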
Once the CDP resources are merged, there will be one new_ctrl field in use by two struct resctrl_schema requiring a per-schema IPI to copy the value to hardware. Move new_ctrl and have_new_ctrl into a new struct resctrl_staged_config. Before the CDP resources can be merged, struct rdt_domain will need an array of these, one per type of configuration. Using the type as an index to the array will ensure that a schema configuration string can't specify the same domain twice. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-14-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 43 +++++++++++++++-------- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 22 +++++++----- include/linux/resctrl.h | 16 ++++++--- 3 files changed, 54 insertions(+), 27 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 104b285f8a60..9ddfa7607234 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -62,16 +62,17 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, { struct rdt_resource *r = s->res; unsigned long bw_val; + struct resctrl_staged_config *cfg = &d->staged_config; - if (d->have_new_ctrl) { + if (cfg->have_new_ctrl) { rdt_last_cmd_printf("Duplicate domain %d\n", d->id); return -EINVAL; } if (!bw_validate(data->buf, &bw_val, r)) return -EINVAL; - d->new_ctrl = bw_val; - d->have_new_ctrl = true; + cfg->new_ctrl = bw_val; + cfg->have_new_ctrl = true; return 0; } @@ -129,11 +130,12 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_domain *d) { + struct resctrl_staged_config *cfg = &d->staged_config; struct rdtgroup *rdtgrp = data->rdtgrp; struct rdt_resource *r = s->res; u32 cbm_val; - if (d->have_new_ctrl) { + if (cfg->have_new_ctrl) { rdt_last_cmd_printf("Duplicate domain %d\n", d->id); return -EINVAL; } @@ -175,8 +177,8 @@ int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, } } - d->new_ctrl = cbm_val; - d->have_new_ctrl = true; + cfg->new_ctrl = cbm_val; + cfg->have_new_ctrl = true; return 0; } @@ -190,6 +192,7 @@ int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, static int parse_line(char *line, struct resctrl_schema *s, struct rdtgroup *rdtgrp) { + struct resctrl_staged_config *cfg; struct rdt_resource *r = s->res; struct rdt_parse_data data; char *dom = NULL, *id; @@ -219,6 +222,7 @@ static int parse_line(char *line, struct resctrl_schema *s, if (r->parse_ctrlval(&data, s, d)) return -EINVAL; if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + cfg = &d->staged_config; /* * In pseudo-locking setup mode and just * parsed a valid CBM that should be @@ -229,7 +233,7 @@ static int parse_line(char *line, struct resctrl_schema *s, */ rdtgrp->plr->s = s; rdtgrp->plr->d = d; - rdtgrp->plr->cbm = d->new_ctrl; + rdtgrp->plr->cbm = cfg->new_ctrl; d->plr = rdtgrp->plr; return 0; } @@ -239,14 +243,27 @@ static int parse_line(char *line, struct resctrl_schema *s, return -EINVAL; } +static void apply_config(struct rdt_hw_domain *hw_dom, + struct resctrl_staged_config *cfg, int closid, + cpumask_var_t cpu_mask, bool mba_sc) +{ + struct rdt_domain *dom = &hw_dom->d_resctrl; + u32 *dc = !mba_sc ? 
hw_dom->ctrl_val : hw_dom->mbps_val; + + if (cfg->new_ctrl != dc[closid]) { + cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask); + dc[closid] = cfg->new_ctrl; + } +} + int update_domains(struct rdt_resource *r, int closid) { + struct resctrl_staged_config *cfg; struct rdt_hw_domain *hw_dom; struct msr_param msr_param; cpumask_var_t cpu_mask; struct rdt_domain *d; bool mba_sc; - u32 *dc; int cpu; if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) @@ -259,11 +276,9 @@ int update_domains(struct rdt_resource *r, int closid) mba_sc = is_mba_sc(r); list_for_each_entry(d, &r->domains, list) { hw_dom = resctrl_to_arch_dom(d); - dc = !mba_sc ? hw_dom->ctrl_val : hw_dom->mbps_val; - if (d->have_new_ctrl && d->new_ctrl != dc[closid]) { - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); - dc[closid] = d->new_ctrl; - } + cfg = &hw_dom->d_resctrl.staged_config; + if (cfg->have_new_ctrl) + apply_config(hw_dom, cfg, closid, cpu_mask, mba_sc); } /* @@ -335,7 +350,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, list_for_each_entry(s, &resctrl_schema_all, list) { list_for_each_entry(dom, &s->res->domains, list) - dom->have_new_ctrl = false; + memset(&dom->staged_config, 0, sizeof(dom->staged_config)); } while ((tok = strsep(&buf, "\n")) != NULL) { diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 921f26a6b690..37c1871d9b5a 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2757,6 +2757,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, u32 closid) { struct rdt_resource *r_cdp = NULL; + struct resctrl_staged_config *cfg; struct rdt_domain *d_cdp = NULL; struct rdt_resource *r = s->res; u32 used_b = 0, unused_b = 0; @@ -2766,8 +2767,9 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, int i; rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); - d->have_new_ctrl = false; - d->new_ctrl = r->cache.shareable_bits; + cfg = &d->staged_config; + cfg->have_new_ctrl = false; + cfg->new_ctrl = r->cache.shareable_bits; used_b = r->cache.shareable_bits; ctrl = resctrl_to_arch_dom(d)->ctrl_val; for (i = 0; i < closids_supported(); i++, ctrl++) { @@ -2791,29 +2793,29 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, peer_ctl = 0; used_b |= *ctrl | peer_ctl; if (mode == RDT_MODE_SHAREABLE) - d->new_ctrl |= *ctrl | peer_ctl; + cfg->new_ctrl |= *ctrl | peer_ctl; } } if (d->plr && d->plr->cbm > 0) used_b |= d->plr->cbm; unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); unused_b &= BIT_MASK(r->cache.cbm_len) - 1; - d->new_ctrl |= unused_b; + cfg->new_ctrl |= unused_b; /* * Force the initial CBM to be valid, user can * modify the CBM based on system availability. */ - d->new_ctrl = cbm_ensure_valid(d->new_ctrl, r); + cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r); /* * Assign the u32 CBM to an unsigned long to ensure that * bitmap_weight() does not access out-of-bound memory. */ - tmp_cbm = d->new_ctrl; + tmp_cbm = cfg->new_ctrl; if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->id); return -ENOSPC; } - d->have_new_ctrl = true; + cfg->have_new_ctrl = true; return 0; } @@ -2845,11 +2847,13 @@ static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) /* Initialize MBA resource with default values. 
*/ static void rdtgroup_init_mba(struct rdt_resource *r) { + struct resctrl_staged_config *cfg; struct rdt_domain *d; list_for_each_entry(d, &r->domains, list) { - d->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl; - d->have_new_ctrl = true; + cfg = &d->staged_config; + cfg->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl; + cfg->have_new_ctrl = true; } } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index e482ce790ce2..ff7f7d7e1348 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -27,13 +27,21 @@ enum resctrl_conf_type { CDP_DATA, }; +/** + * struct resctrl_staged_config - parsed configuration to be applied + * @new_ctrl: new ctrl value to be loaded + * @have_new_ctrl: whether the user provided new_ctrl is valid + */ +struct resctrl_staged_config { + u32 new_ctrl; + bool have_new_ctrl; +}; + /** * struct rdt_domain - group of CPUs sharing a resctrl resource * @list: all instances of this resource * @id: unique id for this instance * @cpu_mask: which CPUs share this resource - * @new_ctrl: new ctrl value to be loaded - * @have_new_ctrl: did user provide new_ctrl for this domain * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold * @mbm_total: saved state for MBM total bandwidth * @mbm_local: saved state for MBM local bandwidth @@ -42,13 +50,12 @@ enum resctrl_conf_type { * @mbm_work_cpu: worker CPU for MBM h/w counters * @cqm_work_cpu: worker CPU for CQM h/w counters * @plr: pseudo-locked region (if any) associated with domain + * @staged_config: parsed configuration to be applied */ struct rdt_domain { struct list_head list; int id; struct cpumask cpu_mask; - u32 new_ctrl; - bool have_new_ctrl; unsigned long *rmid_busy_llc; struct mbm_state *mbm_total; struct mbm_state *mbm_local; @@ -57,6 +64,7 @@ struct rdt_domain { int mbm_work_cpu; int cqm_work_cpu; struct pseudo_lock_region *plr; + struct resctrl_staged_config staged_config; }; /** -- Gitee From 89917e1bd23083a1b9d65006c4ded6e45e57774e Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:27 +0000 Subject: [PATCH 1306/2161] x86/resctrl: Allow different CODE/DATA configurations to be staged commit 75408e43509ed6207870c0e7e28656acbbc1f7fd upstream. Before the CDP resources can be merged, struct rdt_domain will need an array of struct resctrl_staged_config, one per type of configuration. Use the type as an index to the array to ensure that a schema configuration string can't specify the same domain twice. This will allow two schemata to apply configuration changes to one resource. 
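A stand-alone sketch of why indexing the staged array by configuration type catches a repeated domain within one schema line, while still letting a CODE and a DATA schema each stage the same domain once (the enum and struct names are simplified stand-ins, not the kernel's definitions):

#include <stdbool.h>
#include <stdio.h>

enum conf_type { TYPE_NONE, TYPE_CODE, TYPE_DATA, TYPE_MAX };

struct staged {
        unsigned int new_ctrl;
        bool have_new_ctrl;
};

struct fake_domain {
        struct staged staged_config[TYPE_MAX]; /* one slot per type */
};

static int stage(struct fake_domain *d, enum conf_type t, unsigned int val)
{
        struct staged *cfg = &d->staged_config[t];

        if (cfg->have_new_ctrl)
                return -1;      /* domain named twice for this type */
        cfg->new_ctrl = val;
        cfg->have_new_ctrl = true;
        return 0;
}

int main(void)
{
        static struct fake_domain d;

        printf("code: %d\n", stage(&d, TYPE_CODE, 0x0f));  /* accepted */
        printf("data: %d\n", stage(&d, TYPE_DATA, 0xf0));  /* accepted */
        printf("dup : %d\n", stage(&d, TYPE_CODE, 0xff));  /* rejected */
        return 0;
}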
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-15-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 20 ++++++++++++++------ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 5 +++-- include/linux/resctrl.h | 4 +++- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 9ddfa7607234..f29848f98846 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -60,10 +60,11 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_domain *d) { + struct resctrl_staged_config *cfg; struct rdt_resource *r = s->res; unsigned long bw_val; - struct resctrl_staged_config *cfg = &d->staged_config; + cfg = &d->staged_config[s->conf_type]; if (cfg->have_new_ctrl) { rdt_last_cmd_printf("Duplicate domain %d\n", d->id); return -EINVAL; @@ -130,11 +131,12 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_domain *d) { - struct resctrl_staged_config *cfg = &d->staged_config; struct rdtgroup *rdtgrp = data->rdtgrp; + struct resctrl_staged_config *cfg; struct rdt_resource *r = s->res; u32 cbm_val; + cfg = &d->staged_config[s->conf_type]; if (cfg->have_new_ctrl) { rdt_last_cmd_printf("Duplicate domain %d\n", d->id); return -EINVAL; @@ -192,6 +194,7 @@ int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, static int parse_line(char *line, struct resctrl_schema *s, struct rdtgroup *rdtgrp) { + enum resctrl_conf_type t = s->conf_type; struct resctrl_staged_config *cfg; struct rdt_resource *r = s->res; struct rdt_parse_data data; @@ -222,7 +225,7 @@ static int parse_line(char *line, struct resctrl_schema *s, if (r->parse_ctrlval(&data, s, d)) return -EINVAL; if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - cfg = &d->staged_config; + cfg = &d->staged_config[t]; /* * In pseudo-locking setup mode and just * parsed a valid CBM that should be @@ -261,6 +264,7 @@ int update_domains(struct rdt_resource *r, int closid) struct resctrl_staged_config *cfg; struct rdt_hw_domain *hw_dom; struct msr_param msr_param; + enum resctrl_conf_type t; cpumask_var_t cpu_mask; struct rdt_domain *d; bool mba_sc; @@ -276,9 +280,13 @@ int update_domains(struct rdt_resource *r, int closid) mba_sc = is_mba_sc(r); list_for_each_entry(d, &r->domains, list) { hw_dom = resctrl_to_arch_dom(d); - cfg = &hw_dom->d_resctrl.staged_config; - if (cfg->have_new_ctrl) + for (t = 0; t < CDP_NUM_TYPES; t++) { + cfg = &hw_dom->d_resctrl.staged_config[t]; + if (!cfg->have_new_ctrl) + continue; + apply_config(hw_dom, cfg, closid, cpu_mask, mba_sc); + } } /* @@ -350,7 +358,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, list_for_each_entry(s, &resctrl_schema_all, list) { list_for_each_entry(dom, &s->res->domains, list) - memset(&dom->staged_config, 0, sizeof(dom->staged_config)); + memset(dom->staged_config, 0, sizeof(dom->staged_config)); } while ((tok = strsep(&buf, "\n")) != NULL) { diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 37c1871d9b5a..3c1cabd74ac8 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ 
b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2756,6 +2756,7 @@ static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, u32 closid) { + enum resctrl_conf_type t = s->conf_type; struct rdt_resource *r_cdp = NULL; struct resctrl_staged_config *cfg; struct rdt_domain *d_cdp = NULL; @@ -2767,7 +2768,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, int i; rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); - cfg = &d->staged_config; + cfg = &d->staged_config[t]; cfg->have_new_ctrl = false; cfg->new_ctrl = r->cache.shareable_bits; used_b = r->cache.shareable_bits; @@ -2851,7 +2852,7 @@ static void rdtgroup_init_mba(struct rdt_resource *r) struct rdt_domain *d; list_for_each_entry(d, &r->domains, list) { - cfg = &d->staged_config; + cfg = &d->staged_config[CDP_NONE]; cfg->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl; cfg->have_new_ctrl = true; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index ff7f7d7e1348..51ba372f96cd 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -27,6 +27,8 @@ enum resctrl_conf_type { CDP_DATA, }; +#define CDP_NUM_TYPES (CDP_DATA + 1) + /** * struct resctrl_staged_config - parsed configuration to be applied * @new_ctrl: new ctrl value to be loaded @@ -64,7 +66,7 @@ struct rdt_domain { int mbm_work_cpu; int cqm_work_cpu; struct pseudo_lock_region *plr; - struct resctrl_staged_config staged_config; + struct resctrl_staged_config staged_config[CDP_NUM_TYPES]; }; /** -- Gitee From eff73d075a4076e700e227dd33002118d9ea56e4 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:28 +0000 Subject: [PATCH 1307/2161] x86/resctrl: Rename update_domains() to resctrl_arch_update_domains() commit 2e6678195d59c51b6ca234169ad3de01134d3dec upstream. update_domains() merges the staged configuration changes into the arch codes configuration array. Rename to make it clear it is part of the arch code interface to resctrl. 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-16-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 4 ++-- arch/x86/kernel/cpu/resctrl/internal.h | 1 - arch/x86/kernel/cpu/resctrl/rdtgroup.c | 2 +- include/linux/resctrl.h | 1 + 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index f29848f98846..8cde76df888a 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -259,7 +259,7 @@ static void apply_config(struct rdt_hw_domain *hw_dom, } } -int update_domains(struct rdt_resource *r, int closid) +int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) { struct resctrl_staged_config *cfg; struct rdt_hw_domain *hw_dom; @@ -380,7 +380,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - ret = update_domains(r, rdtgrp->closid); + ret = resctrl_arch_update_domains(r, rdtgrp->closid); if (ret) goto out; } diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 3069a38516ef..338e624420d2 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -526,7 +526,6 @@ void rdt_pseudo_lock_release(void); int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); -int update_domains(struct rdt_resource *r, int closid); int closids_supported(void); void closid_free(int closid); int alloc_rmid(void); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 3c1cabd74ac8..ec6970e391f2 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2875,7 +2875,7 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) return ret; } - ret = update_domains(r, rdtgrp->closid); + ret = resctrl_arch_update_domains(r, rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("Failed to initialize allocations\n"); return ret; diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 51ba372f96cd..be5881171576 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -199,5 +199,6 @@ struct resctrl_schema { /* The number of closid supported by this resource regardless of CDP */ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); +int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); #endif /* _RESCTRL_H */ -- Gitee From a58d0679de2eb34e7eeb70bea16179bd2cbeed3b Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:29 +0000 Subject: [PATCH 1308/2161] x86/resctrl: Add a helper to read a closid's configuration commit f07e9d0250577a23eb06d4334798291616c01f2d upstream. Functions like show_doms() reach into the architecture's private structure to retrieve the configuration from the struct rdt_hw_resource. The hardware configuration may look completely different to the values resctrl gets from user-space. The staged configuration and resctrl_arch_update_domains() allow the architecture to convert or translate these values. Resctrl shouldn't read or write the ctrl_val[] values directly. Add a helper to read the current configuration. 
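The general shape of such a read helper, as a stand-alone sketch: two plain arrays and a flag stand in for ctrl_val[]/mbps_val[] and is_mba_sc(), so callers never index the backing storage directly. The real helper appears in the diff below; this is only an illustration.

#include <stdbool.h>
#include <stdio.h>

#define NUM_CLOSID 8

static unsigned int ctrl_val[NUM_CLOSID];       /* hardware control values */
static unsigned int mbps_val[NUM_CLOSID];       /* software MBps targets */
static bool mba_sc;                             /* software controller on? */

static void get_config(unsigned int closid, unsigned int *value)
{
        /* hide which array backs the value behind one accessor */
        *value = mba_sc ? mbps_val[closid] : ctrl_val[closid];
}

int main(void)
{
        unsigned int v;

        ctrl_val[2] = 50;
        mbps_val[2] = 1000;

        mba_sc = false;
        get_config(2, &v);
        printf("raw control: %u\n", v);

        mba_sc = true;
        get_config(2, &v);
        printf("MBps target: %u\n", v);
        return 0;
}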
This will allow another architecture to scale the bitmaps if necessary, and possibly use controls that don't take the user-space control format at all. Of the remaining functions that access ctrl_val[] directly, apply_config() is part of the architecture-specific code, and is called via resctrl_arch_update_domains(). reset_all_ctrls() will be an architecture specific helper. update_mba_bw() manipulates both ctrl_val[], mbps_val[] and the hardware. The mbps_val[] that matches the mba_sc state of the resource is changed, but the other is left unchanged. Abstracting this is the subject of later patches that affect set_mba_sc() too. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-17-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 16 ++++++--- arch/x86/kernel/cpu/resctrl/monitor.c | 6 +++- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 43 ++++++++++------------- include/linux/resctrl.h | 2 ++ 4 files changed, 37 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 8cde76df888a..4da08ba0deda 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -401,22 +401,30 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, return ret ?: nbytes; } +void resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, u32 *value) +{ + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + + if (!is_mba_sc(r)) + *value = hw_dom->ctrl_val[closid]; + else + *value = hw_dom->mbps_val[closid]; +} + static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) { struct rdt_resource *r = schema->res; - struct rdt_hw_domain *hw_dom; struct rdt_domain *dom; bool sep = false; u32 ctrl_val; seq_printf(s, "%*s:", max_name_width, schema->name); list_for_each_entry(dom, &r->domains, list) { - hw_dom = resctrl_to_arch_dom(dom); if (sep) seq_puts(s, ";"); - ctrl_val = (!is_mba_sc(r) ? hw_dom->ctrl_val[closid] : - hw_dom->mbps_val[closid]); + resctrl_arch_get_config(r, dom, closid, &ctrl_val); seq_printf(s, r->format_str, dom->id, max_data_width, ctrl_val); sep = true; diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index d59f3bc38f1e..839ce268bdd1 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -442,8 +442,12 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) hw_dom_mba = resctrl_to_arch_dom(dom_mba); cur_bw = pmbm_data->prev_bw; - user_bw = hw_dom_mba->mbps_val[closid]; + resctrl_arch_get_config(r_mba, dom_mba, closid, &user_bw); delta_bw = pmbm_data->delta_bw; + /* + * resctrl_arch_get_config() chooses the mbps/ctrl value to return + * based on is_mba_sc(). For now, reach into the hw_dom. 
+ */ cur_msr_val = hw_dom_mba->ctrl_val[closid]; /* diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index ec6970e391f2..638361e3cc9e 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -910,27 +910,27 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, int i, hwb, swb, excl, psl; enum rdtgrp_mode mode; bool sep = false; - u32 *ctrl; + u32 ctrl_val; mutex_lock(&rdtgroup_mutex); hw_shareable = r->cache.shareable_bits; list_for_each_entry(dom, &r->domains, list) { if (sep) seq_putc(seq, ';'); - ctrl = resctrl_to_arch_dom(dom)->ctrl_val; sw_shareable = 0; exclusive = 0; seq_printf(seq, "%d=", dom->id); - for (i = 0; i < closids_supported(); i++, ctrl++) { + for (i = 0; i < closids_supported(); i++) { if (!closid_allocated(i)) continue; + resctrl_arch_get_config(r, dom, i, &ctrl_val); mode = rdtgroup_mode_by_closid(i); switch (mode) { case RDT_MODE_SHAREABLE: - sw_shareable |= *ctrl; + sw_shareable |= ctrl_val; break; case RDT_MODE_EXCLUSIVE: - exclusive |= *ctrl; + exclusive |= ctrl_val; break; case RDT_MODE_PSEUDO_LOCKSETUP: /* @@ -1188,7 +1188,6 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d { enum rdtgrp_mode mode; unsigned long ctrl_b; - u32 *ctrl; int i; /* Check for any overlap with regions used by hardware directly */ @@ -1199,9 +1198,8 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d } /* Check for overlap with other resource groups */ - ctrl = resctrl_to_arch_dom(d)->ctrl_val; - for (i = 0; i < closids_supported(); i++, ctrl++) { - ctrl_b = *ctrl; + for (i = 0; i < closids_supported(); i++) { + resctrl_arch_get_config(r, d, i, (u32 *)&ctrl_b); mode = rdtgroup_mode_by_closid(i); if (closid_allocated(i) && i != closid && mode != RDT_MODE_PSEUDO_LOCKSETUP) { @@ -1269,12 +1267,12 @@ bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, */ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) { - struct rdt_hw_domain *hw_dom; int closid = rdtgrp->closid; struct resctrl_schema *s; struct rdt_resource *r; bool has_cache = false; struct rdt_domain *d; + u32 ctrl; list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; @@ -1282,10 +1280,8 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) continue; has_cache = true; list_for_each_entry(d, &r->domains, list) { - hw_dom = resctrl_to_arch_dom(d); - if (rdtgroup_cbm_overlaps(s, d, - hw_dom->ctrl_val[closid], - rdtgrp->closid, false)) { + resctrl_arch_get_config(r, d, closid, &ctrl); + if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { rdt_last_cmd_puts("Schemata overlaps\n"); return false; } @@ -1417,7 +1413,6 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct resctrl_schema *schema; - struct rdt_hw_domain *hw_dom; struct rdtgroup *rdtgrp; struct rdt_resource *r; struct rdt_domain *d; @@ -1453,15 +1448,13 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, sep = false; seq_printf(s, "%*s:", max_name_width, schema->name); list_for_each_entry(d, &r->domains, list) { - hw_dom = resctrl_to_arch_dom(d); if (sep) seq_putc(s, ';'); if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { size = 0; } else { - ctrl = (!is_mba_sc(r) ? 
- hw_dom->ctrl_val[rdtgrp->closid] : - hw_dom->mbps_val[rdtgrp->closid]); + resctrl_arch_get_config(r, d, rdtgrp->closid, + &ctrl); if (r->rid == RDT_RESOURCE_MBA) size = ctrl; else @@ -2764,7 +2757,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, u32 used_b = 0, unused_b = 0; unsigned long tmp_cbm; enum rdtgrp_mode mode; - u32 peer_ctl, *ctrl; + u32 peer_ctl, ctrl_val; int i; rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); @@ -2772,8 +2765,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, cfg->have_new_ctrl = false; cfg->new_ctrl = r->cache.shareable_bits; used_b = r->cache.shareable_bits; - ctrl = resctrl_to_arch_dom(d)->ctrl_val; - for (i = 0; i < closids_supported(); i++, ctrl++) { + for (i = 0; i < closids_supported(); i++) { if (closid_allocated(i) && i != closid) { mode = rdtgroup_mode_by_closid(i); if (mode == RDT_MODE_PSEUDO_LOCKSETUP) @@ -2789,12 +2781,13 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, * with an exclusive group. */ if (d_cdp) - peer_ctl = resctrl_to_arch_dom(d_cdp)->ctrl_val[i]; + resctrl_arch_get_config(r_cdp, d_cdp, i, &peer_ctl); else peer_ctl = 0; - used_b |= *ctrl | peer_ctl; + resctrl_arch_get_config(r, d, i, &ctrl_val); + used_b |= ctrl_val | peer_ctl; if (mode == RDT_MODE_SHAREABLE) - cfg->new_ctrl |= *ctrl | peer_ctl; + cfg->new_ctrl |= ctrl_val | peer_ctl; } } if (d->plr && d->plr->cbm > 0) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index be5881171576..3a2309403094 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -200,5 +200,7 @@ struct resctrl_schema { /* The number of closid supported by this resource regardless of CDP */ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); +void resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, u32 *value); #endif /* _RESCTRL_H */ -- Gitee From 7d57a7a4d31a82d47b01ff28a97c72f62453de57 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:30 +0000 Subject: [PATCH 1309/2161] x86/resctrl: Pass configuration type to resctrl_arch_get_config() commit fa8f711d2f14381d1a47420b6da94b62e6484c56 upstream. The ctrl_val[] array for a struct rdt_hw_resource only holds configurations of one type. The type is implicit. Once the CDP resources are merged, the ctrl_val[] array will hold all the configurations for the hardware resource. When a particular type of configuration is needed, it must be specified explicitly. Pass the expected type from the schema into resctrl_arch_get_config(). Nothing uses this yet, but once a single ctrl_val[] array is used for the three struct rdt_hw_resources that share hardware, the type will be used to return the correct configuration value from the shared array. 
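For the overlap checks touched below, a schema's CDP peer simply uses the opposite configuration type. A minimal stand-alone illustration (simplified enum, not the kernel's definitions):

#include <stdio.h>

enum conf_type { CDP_NONE, CDP_CODE, CDP_DATA };

/* A code schema's CDP peer holds data masks, and vice versa. */
static enum conf_type cdp_peer_type(enum conf_type t)
{
        switch (t) {
        case CDP_CODE:
                return CDP_DATA;
        case CDP_DATA:
                return CDP_CODE;
        case CDP_NONE:
        default:
                return CDP_NONE;        /* no CDP peer */
        }
}

int main(void)
{
        printf("peer of CODE: %d\n", cdp_peer_type(CDP_CODE));
        printf("peer of DATA: %d\n", cdp_peer_type(CDP_DATA));
        return 0;
}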
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-18-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 5 ++-- arch/x86/kernel/cpu/resctrl/monitor.c | 2 +- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 35 +++++++++++++++-------- include/linux/resctrl.h | 3 +- 4 files changed, 29 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 4da08ba0deda..9ead0c0bf6ee 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -402,7 +402,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, } void resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, - u32 closid, u32 *value) + u32 closid, enum resctrl_conf_type type, u32 *value) { struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); @@ -424,7 +424,8 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo if (sep) seq_puts(s, ";"); - resctrl_arch_get_config(r, dom, closid, &ctrl_val); + resctrl_arch_get_config(r, dom, closid, schema->conf_type, + &ctrl_val); seq_printf(s, r->format_str, dom->id, max_data_width, ctrl_val); sep = true; diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 839ce268bdd1..082291c9197d 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -442,7 +442,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) hw_dom_mba = resctrl_to_arch_dom(dom_mba); cur_bw = pmbm_data->prev_bw; - resctrl_arch_get_config(r_mba, dom_mba, closid, &user_bw); + resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE, &user_bw); delta_bw = pmbm_data->delta_bw; /* * resctrl_arch_get_config() chooses the mbps/ctrl value to return diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 638361e3cc9e..b38f5dab1225 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -923,7 +923,8 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, for (i = 0; i < closids_supported(); i++) { if (!closid_allocated(i)) continue; - resctrl_arch_get_config(r, dom, i, &ctrl_val); + resctrl_arch_get_config(r, dom, i, s->conf_type, + &ctrl_val); mode = rdtgroup_mode_by_closid(i); switch (mode) { case RDT_MODE_SHAREABLE: @@ -1099,6 +1100,7 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of, * Used to return the result. * @d_cdp: RDT domain that shares hardware with @d (RDT domain peer) * Used to return the result. + * @peer_type: The CDP configuration type of the peer resource. * * RDT resources are managed independently and by extension the RDT domains * (RDT resource instances) are managed independently also. 
The Code and @@ -1116,7 +1118,8 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of, */ static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d, struct rdt_resource **r_cdp, - struct rdt_domain **d_cdp) + struct rdt_domain **d_cdp, + enum resctrl_conf_type *peer_type) { struct rdt_resource *_r_cdp = NULL; struct rdt_domain *_d_cdp = NULL; @@ -1125,15 +1128,19 @@ static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d, switch (r->rid) { case RDT_RESOURCE_L3DATA: _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3CODE].r_resctrl; + *peer_type = CDP_CODE; break; case RDT_RESOURCE_L3CODE: _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3DATA].r_resctrl; + *peer_type = CDP_DATA; break; case RDT_RESOURCE_L2DATA: _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2CODE].r_resctrl; + *peer_type = CDP_CODE; break; case RDT_RESOURCE_L2CODE: _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2DATA].r_resctrl; + *peer_type = CDP_DATA; break; default: ret = -ENOENT; @@ -1184,7 +1191,8 @@ static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d, * Return: false if CBM does not overlap, true if it does. */ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, - unsigned long cbm, int closid, bool exclusive) + unsigned long cbm, int closid, + enum resctrl_conf_type type, bool exclusive) { enum rdtgrp_mode mode; unsigned long ctrl_b; @@ -1199,7 +1207,7 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d /* Check for overlap with other resource groups */ for (i = 0; i < closids_supported(); i++) { - resctrl_arch_get_config(r, d, i, (u32 *)&ctrl_b); + resctrl_arch_get_config(r, d, i, type, (u32 *)&ctrl_b); mode = rdtgroup_mode_by_closid(i); if (closid_allocated(i) && i != closid && mode != RDT_MODE_PSEUDO_LOCKSETUP) { @@ -1240,17 +1248,19 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, unsigned long cbm, int closid, bool exclusive) { + enum resctrl_conf_type peer_type; struct rdt_resource *r = s->res; struct rdt_resource *r_cdp; struct rdt_domain *d_cdp; - if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, exclusive)) + if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type, + exclusive)) return true; - if (rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp) < 0) + if (rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp, &peer_type) < 0) return false; - return __rdtgroup_cbm_overlaps(r_cdp, d_cdp, cbm, closid, exclusive); + return __rdtgroup_cbm_overlaps(r_cdp, d_cdp, cbm, closid, peer_type, exclusive); } /** @@ -1280,7 +1290,7 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) continue; has_cache = true; list_for_each_entry(d, &r->domains, list) { - resctrl_arch_get_config(r, d, closid, &ctrl); + resctrl_arch_get_config(r, d, closid, s->conf_type, &ctrl); if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { rdt_last_cmd_puts("Schemata overlaps\n"); return false; @@ -1454,7 +1464,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, size = 0; } else { resctrl_arch_get_config(r, d, rdtgrp->closid, - &ctrl); + schema->conf_type, &ctrl); if (r->rid == RDT_RESOURCE_MBA) size = ctrl; else @@ -2752,6 +2762,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, enum resctrl_conf_type t = s->conf_type; struct rdt_resource *r_cdp = NULL; struct resctrl_staged_config *cfg; + enum resctrl_conf_type peer_type; struct rdt_domain *d_cdp = NULL; struct rdt_resource *r = s->res; u32 used_b = 0, 
unused_b = 0; @@ -2760,7 +2771,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, u32 peer_ctl, ctrl_val; int i; - rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); + rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp, &peer_type); cfg = &d->staged_config[t]; cfg->have_new_ctrl = false; cfg->new_ctrl = r->cache.shareable_bits; @@ -2781,10 +2792,10 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, * with an exclusive group. */ if (d_cdp) - resctrl_arch_get_config(r_cdp, d_cdp, i, &peer_ctl); + resctrl_arch_get_config(r_cdp, d_cdp, i, peer_type, &peer_ctl); else peer_ctl = 0; - resctrl_arch_get_config(r, d, i, &ctrl_val); + resctrl_arch_get_config(r, d, i, s->conf_type, &ctrl_val); used_b |= ctrl_val | peer_ctl; if (mode == RDT_MODE_SHAREABLE) cfg->new_ctrl |= ctrl_val | peer_ctl; diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 3a2309403094..69d7387b7f22 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -201,6 +201,7 @@ struct resctrl_schema { u32 resctrl_arch_get_num_closid(struct rdt_resource *r); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); void resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, - u32 closid, u32 *value); + u32 closid, enum resctrl_conf_type type, + u32 *value); #endif /* _RESCTRL_H */ -- Gitee From 7fc874ef5063fefbcffb2356785de124e1b1413e Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:31 +0000 Subject: [PATCH 1310/2161] x86/resctrl: Make ctrlval arrays the same size commit 141739aa73505539f315d15068b9c0707ab5ecb4 upstream. The CODE and DATA resources report a num_closid that is half the actual size supported by the hardware. This behaviour is visible to user-space when CDP is enabled. The CODE and DATA resources have their own ctrlval arrays which are half the size of the underlying hardware because num_closid was already adjusted. One holds the odd configurations values, the other even. Before the CDP resources can be merged, the 'half the closids' behaviour needs to be implemented by schemata_list_create(), but this causes the ctrl_val[] array to be full sized. Remove the logic from the architecture specific rdt_get_cdp_config() setup, and add it to schemata_list_create(). Functions that walk all the configurations, such as domain_setup_ctrlval() and reset_all_ctrls(), take num_closid directly from struct rdt_hw_resource also have to halve num_closid as only the lower half of each array is in use. domain_setup_ctrlval() and reset_all_ctrls() both copy struct rdt_hw_resource's num_closid to a struct msr_param. Correct the value here. This is temporary as a subsequent patch will merge all three ctrl_val[] arrays such that when CDP is in use, the CODA/DATA layout in the array matches the hardware. reset_all_ctrls()'s loop over the whole of ctrl_val[] is not touched as this is harmless, and will be required as it is once the resources are merged. 
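A stand-alone arithmetic sketch of the "half the closids" split: the arch array keeps the full hardware count while each CODE/DATA schema advertises half of it when CDP is enabled. The value 16 below is only an example, not read from any CPU.

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        unsigned int hw_num_closid = 16;        /* e.g. enumerated count */
        bool cdp_enabled = true;

        unsigned int schema_num_closid =
                cdp_enabled ? hw_num_closid / 2 : hw_num_closid;

        printf("array entries kept : %u\n", hw_num_closid);
        printf("closids per schema : %u\n", schema_num_closid);
        return 0;
}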
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-19-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 10 +++++++++- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 9 +++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index b2bf097c1ecb..33479b3a0e67 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -357,7 +357,7 @@ static void rdt_get_cdp_config(int level, int type) struct rdt_resource *r = &rdt_resources_all[type].r_resctrl; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - hw_res->num_closid = hw_res_l->num_closid / 2; + hw_res->num_closid = hw_res_l->num_closid; r->cache.cbm_len = r_l->cache.cbm_len; r->default_ctrl = r_l->default_ctrl; r->cache.shareable_bits = r_l->cache.shareable_bits; @@ -543,6 +543,14 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) m.low = 0; m.high = hw_res->num_closid; + + /* + * temporary: the array is full-size, but cat_wrmsr() still re-maps + * the index. + */ + if (hw_res->conf_type != CDP_NONE) + m.high /= 2; + hw_res->msr_update(d, &m, r); return 0; } diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index b38f5dab1225..d6e65661a8ba 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2154,6 +2154,8 @@ static int schemata_list_create(void) s->res = r; s->conf_type = resctrl_to_arch_res(r)->conf_type; s->num_closid = resctrl_arch_get_num_closid(r); + if (resctrl_arch_get_cdp_enabled(r->rid)) + s->num_closid /= 2; ret = snprintf(s->name, sizeof(s->name), r->name); if (ret >= sizeof(s->name)) { @@ -2381,6 +2383,13 @@ static int reset_all_ctrls(struct rdt_resource *r) msr_param.low = 0; msr_param.high = hw_res->num_closid; + /* + * temporary: the array is full-sized, but cat_wrmsr() still re-maps + * the index. + */ + if (hw_res->cdp_enabled) + msr_param.high /= 2; + /* * Disable resource control for this resource by setting all * CBMs in all domains to the maximum mask value. Pick one CPU -- Gitee From 19e10bb0bc4b0870a9e2091ea416e66259ea0a7e Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:32 +0000 Subject: [PATCH 1311/2161] x86/resctrl: Apply offset correction when config is staged commit 2e7df368fc9260ac2229335755de2f403ec8f08f upstream. When resctrl comes to copy the CAT MSR values from the ctrl_val[] array into hardware, it applies an offset adjustment based on the type of the resource. CODE and DATA resources have their closid mapped into an odd/even range. This mapping is based on a property of the resource. This happens once the new control value has been written to the ctrl_val[] array. Once the CDP resources are merged, there will only be a single property that needs to cover both odd/even mappings to the single ctrl_val[] array. The offset adjustment must be applied before the new value is written to the array. Move the logic from cat_wrmsr() to resctrl_arch_update_domains(). The value provided to apply_config() is now an index in the array, not the closid. The parameters provided via struct msr_param are now indexes too. As resctrl's use of closid is a u32, struct msr_param's type is changed to match. 
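A stand-alone sketch of computing the array index up front, before the staged value is stored, using the closid * mult + offset formula from the diffs above. The mult/offset numbers (2, with 0 and 1 for the two CDP halves) are illustrative rather than copied from the kernel tables.

#include <stdio.h>

static unsigned int ctrl_val[32];       /* one full-sized array */

static unsigned int cbm_idx(unsigned int closid, unsigned int mult,
                            unsigned int offset)
{
        return closid * mult + offset;
}

int main(void)
{
        unsigned int closid = 3;
        unsigned int data_idx = cbm_idx(closid, 2, 0);
        unsigned int code_idx = cbm_idx(closid, 2, 1);

        ctrl_val[data_idx] = 0x0f;      /* staged data mask lands here */
        ctrl_val[code_idx] = 0xf0;      /* staged code mask lands here */

        /* an MSR update would now cover [idx, idx + 1) per configuration */
        printf("closid %u -> data slot %u, code slot %u\n",
               closid, data_idx, code_idx);
        return 0;
}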
With this, the CODE and DATA resources only use the odd or even indexes in the array. This allows the temporary num_closid/2 fixes in domain_setup_ctrlval() and reset_all_ctrls() to be removed. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-20-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 15 +---------- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 32 +++++++++++++++++------ arch/x86/kernel/cpu/resctrl/internal.h | 4 +-- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 7 ----- 4 files changed, 27 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 33479b3a0e67..5fbfd047d5af 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -195,11 +195,6 @@ struct rdt_hw_resource rdt_resources_all[] = { }, }; -static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) -{ - return closid * r->cache.cbm_idx_mult + r->cache.cbm_idx_offset; -} - /* * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs * as they do not have CPUID enumeration support for Cache allocation. @@ -432,7 +427,7 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + cbm_idx(r, i), hw_dom->ctrl_val[i]); + wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) @@ -543,14 +538,6 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) m.low = 0; m.high = hw_res->num_closid; - - /* - * temporary: the array is full-size, but cat_wrmsr() still re-maps - * the index. - */ - if (hw_res->conf_type != CDP_NONE) - m.high /= 2; - hw_res->msr_update(d, &m, r); return 0; } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 9ead0c0bf6ee..fdb0e11a78dc 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -246,17 +246,29 @@ static int parse_line(char *line, struct resctrl_schema *s, return -EINVAL; } -static void apply_config(struct rdt_hw_domain *hw_dom, - struct resctrl_staged_config *cfg, int closid, +static u32 cbm_idx(struct rdt_resource *r, unsigned int closid) +{ + if (r->rid == RDT_RESOURCE_MBA) + return closid; + + return closid * r->cache.cbm_idx_mult + r->cache.cbm_idx_offset; +} + +static bool apply_config(struct rdt_hw_domain *hw_dom, + struct resctrl_staged_config *cfg, u32 idx, cpumask_var_t cpu_mask, bool mba_sc) { struct rdt_domain *dom = &hw_dom->d_resctrl; u32 *dc = !mba_sc ? 
hw_dom->ctrl_val : hw_dom->mbps_val; - if (cfg->new_ctrl != dc[closid]) { + if (cfg->new_ctrl != dc[idx]) { cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask); - dc[closid] = cfg->new_ctrl; + dc[idx] = cfg->new_ctrl; + + return true; } + + return false; } int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) @@ -269,11 +281,12 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) struct rdt_domain *d; bool mba_sc; int cpu; + u32 idx; if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; - msr_param.low = closid; + msr_param.low = cbm_idx(r, closid); msr_param.high = msr_param.low + 1; msr_param.res = r; @@ -285,7 +298,9 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) if (!cfg->have_new_ctrl) continue; - apply_config(hw_dom, cfg, closid, cpu_mask, mba_sc); + idx = cbm_idx(r, closid); + if (!apply_config(hw_dom, cfg, idx, cpu_mask, mba_sc)) + continue; } } @@ -405,11 +420,12 @@ void resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type type, u32 *value) { struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + u32 idx = cbm_idx(r, closid); if (!is_mba_sc(r)) - *value = hw_dom->ctrl_val[closid]; + *value = hw_dom->ctrl_val[idx]; else - *value = hw_dom->mbps_val[closid]; + *value = hw_dom->mbps_val[idx]; } static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 338e624420d2..ef43f83de009 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -333,8 +333,8 @@ static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r) */ struct msr_param { struct rdt_resource *res; - int low; - int high; + u32 low; + u32 high; }; static inline bool is_llc_occupancy_enabled(void) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index d6e65661a8ba..685b1cd9562e 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2383,13 +2383,6 @@ static int reset_all_ctrls(struct rdt_resource *r) msr_param.low = 0; msr_param.high = hw_res->num_closid; - /* - * temporary: the array is full-sized, but cat_wrmsr() still re-maps - * the index. - */ - if (hw_res->cdp_enabled) - msr_param.high /= 2; - /* * Disable resource control for this resource by setting all * CBMs in all domains to the maximum mask value. Pick one CPU -- Gitee From 92d784c9a6c3763d795dfb775234fcf6464bd835 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:33 +0000 Subject: [PATCH 1312/2161] x86/resctrl: Calculate the index from the configuration type commit 2b8dd4ab65dad1251822fbf74fb0d5623e4eaee0 upstream. resctrl uses cbm_idx() to map a closid to an index in the configuration array. This is based on a multiplier and offset that are held in the resource. To merge the resources, the resctrl arch code needs to calculate the index from something else, as there will only be one resource. Decide based on the staged configuration type. This makes the static mult and offset parameters redundant. 
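As a sketch of the closid-to-index mapping that is now decided by the staged configuration type (stand-alone C; the enum and function names are stand-ins, only the closid * 2 and closid * 2 + 1 arithmetic mirrors get_config_index() in the diff that follows):

#include <stdio.h>

enum conf_type { TYPE_NONE, TYPE_DATA, TYPE_CODE };  /* stands in for CDP_* */

static unsigned int config_index(unsigned int closid, enum conf_type type)
{
	switch (type) {
	case TYPE_CODE:
		return closid * 2 + 1;   /* odd slots of the array  */
	case TYPE_DATA:
		return closid * 2;       /* even slots of the array */
	default:
		return closid;           /* CDP disabled: identity  */
	}
}

int main(void)
{
	unsigned int closid;

	for (closid = 0; closid < 4; closid++)
		printf("closid %u -> DATA slot %u, CODE slot %u\n", closid,
		       config_index(closid, TYPE_DATA),
		       config_index(closid, TYPE_CODE));
	return 0;
}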
[ bp: Remove superfluous brackets in get_config_index() ] Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-21-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 12 ----------- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 25 ++++++++++++++--------- include/linux/resctrl.h | 6 ------ 3 files changed, 15 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 5fbfd047d5af..938d88a2d9c6 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -69,8 +69,6 @@ struct rdt_hw_resource rdt_resources_all[] = { .cache_level = 3, .cache = { .min_cbm_bits = 1, - .cbm_idx_mult = 1, - .cbm_idx_offset = 0, }, .domains = domain_init(RDT_RESOURCE_L3), .parse_ctrlval = parse_cbm, @@ -89,8 +87,6 @@ struct rdt_hw_resource rdt_resources_all[] = { .cache_level = 3, .cache = { .min_cbm_bits = 1, - .cbm_idx_mult = 2, - .cbm_idx_offset = 0, }, .domains = domain_init(RDT_RESOURCE_L3DATA), .parse_ctrlval = parse_cbm, @@ -109,8 +105,6 @@ struct rdt_hw_resource rdt_resources_all[] = { .cache_level = 3, .cache = { .min_cbm_bits = 1, - .cbm_idx_mult = 2, - .cbm_idx_offset = 1, }, .domains = domain_init(RDT_RESOURCE_L3CODE), .parse_ctrlval = parse_cbm, @@ -129,8 +123,6 @@ struct rdt_hw_resource rdt_resources_all[] = { .cache_level = 2, .cache = { .min_cbm_bits = 1, - .cbm_idx_mult = 1, - .cbm_idx_offset = 0, }, .domains = domain_init(RDT_RESOURCE_L2), .parse_ctrlval = parse_cbm, @@ -149,8 +141,6 @@ struct rdt_hw_resource rdt_resources_all[] = { .cache_level = 2, .cache = { .min_cbm_bits = 1, - .cbm_idx_mult = 2, - .cbm_idx_offset = 0, }, .domains = domain_init(RDT_RESOURCE_L2DATA), .parse_ctrlval = parse_cbm, @@ -169,8 +159,6 @@ struct rdt_hw_resource rdt_resources_all[] = { .cache_level = 2, .cache = { .min_cbm_bits = 1, - .cbm_idx_mult = 2, - .cbm_idx_offset = 1, }, .domains = domain_init(RDT_RESOURCE_L2CODE), .parse_ctrlval = parse_cbm, diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index fdb0e11a78dc..92d79c88b965 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -246,12 +246,17 @@ static int parse_line(char *line, struct resctrl_schema *s, return -EINVAL; } -static u32 cbm_idx(struct rdt_resource *r, unsigned int closid) +static u32 get_config_index(u32 closid, enum resctrl_conf_type type) { - if (r->rid == RDT_RESOURCE_MBA) + switch (type) { + default: + case CDP_NONE: return closid; - - return closid * r->cache.cbm_idx_mult + r->cache.cbm_idx_offset; + case CDP_CODE: + return closid * 2 + 1; + case CDP_DATA: + return closid * 2; + } } static bool apply_config(struct rdt_hw_domain *hw_dom, @@ -286,10 +291,6 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; - msr_param.low = cbm_idx(r, closid); - msr_param.high = msr_param.low + 1; - msr_param.res = r; - mba_sc = is_mba_sc(r); list_for_each_entry(d, &r->domains, list) { hw_dom = resctrl_to_arch_dom(d); @@ -298,9 +299,13 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) if (!cfg->have_new_ctrl) continue; - idx = cbm_idx(r, closid); + idx = get_config_index(closid, t); if (!apply_config(hw_dom, cfg, idx, cpu_mask, mba_sc)) continue; + + msr_param.low = idx; + 
msr_param.high = msr_param.low + 1; + msr_param.res = r; } } @@ -420,7 +425,7 @@ void resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type type, u32 *value) { struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); - u32 idx = cbm_idx(r, closid); + u32 idx = get_config_index(closid, type); if (!is_mba_sc(r)) *value = hw_dom->ctrl_val[idx]; diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 69d7387b7f22..18dd764af0dd 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -73,10 +73,6 @@ struct rdt_domain { * struct resctrl_cache - Cache allocation related data * @cbm_len: Length of the cache bit mask * @min_cbm_bits: Minimum number of consecutive bits to be set - * @cbm_idx_mult: Multiplier of CBM index - * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: - * closid * cbm_idx_multi + cbm_idx_offset - * in a cache bit mask * @shareable_bits: Bitmask of shareable resource with other * executing entities * @arch_has_sparse_bitmaps: True if a bitmap like f00f is valid. @@ -87,8 +83,6 @@ struct rdt_domain { struct resctrl_cache { unsigned int cbm_len; unsigned int min_cbm_bits; - unsigned int cbm_idx_mult; // TODO remove this - unsigned int cbm_idx_offset; // TODO remove this unsigned int shareable_bits; bool arch_has_sparse_bitmaps; bool arch_has_empty_bitmaps; -- Gitee From 80b9a5e3537469f08ab4b155ae6bbc06d2cda31d Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:34 +0000 Subject: [PATCH 1313/2161] x86/resctrl: Merge the ctrl_val arrays commit 43ac1dbf6101722944758f364ea39859d5db3ce0 upstream. Each struct rdt_hw_resource has its own ctrl_val[] array. When CDP is enabled, two resources are in use, each with its own ctrl_val[] array that holds half of the configuration used by hardware. One uses the odd slots, the other the even. rdt_cdp_peer_get() is the helper to find the alternate resource, its domain, and corresponding entry in the other ctrl_val[] array. Once the CDP resources are merged there will be one struct rdt_hw_resource and one ctrl_val[] array for each hardware resource. This will include changes to rdt_cdp_peer_get(), making it hard to bisect any issue. Merge the ctrl_val[] arrays for three CODE/DATA/NONE resources first. Doing this before merging the resources temporarily complicates allocating and freeing the ctrl_val arrays. Add a helper to allocate the ctrl_val array, that returns the value on the L2 or L3 resource if it already exists. This gets removed once the resources are merged, and there really is only one ctrl_val[] array. 
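As an illustration of the allocate-once-then-reuse behaviour described above (a user-space sketch; get_shared_array() and its caller are invented names, and the real helper works on the L2/L3 domain's arrays and is deleted again later in the series):

#include <stdlib.h>

/* Return the shared array, allocating it only if no alias has done so yet. */
static unsigned int *get_shared_array(unsigned int **slot, size_t n)
{
	if (*slot)
		return *slot;                /* peer already allocated it   */

	*slot = calloc(n, sizeof(**slot));
	return *slot;                        /* NULL on allocation failure  */
}

int main(void)
{
	unsigned int *ctrl_val = NULL;
	unsigned int *code = get_shared_array(&ctrl_val, 16);   /* allocates */
	unsigned int *data = get_shared_array(&ctrl_val, 16);   /* reuses    */
	int ok = code && code == data;

	free(ctrl_val);                      /* freed once, by one owner    */
	return ok ? 0 : 1;
}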
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-22-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 65 ++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 938d88a2d9c6..a79a9efd4f1f 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -503,6 +503,57 @@ void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) } } +static u32 *alloc_ctrlval_array(struct rdt_resource *r, struct rdt_domain *d, + bool mba_sc) +{ + /* these are for the underlying hardware, they may not match r/d */ + struct rdt_domain *underlying_domain; + struct rdt_hw_resource *hw_res; + struct rdt_hw_domain *hw_dom; + bool remapped; + + switch (r->rid) { + case RDT_RESOURCE_L3DATA: + case RDT_RESOURCE_L3CODE: + hw_res = &rdt_resources_all[RDT_RESOURCE_L3]; + remapped = true; + break; + case RDT_RESOURCE_L2DATA: + case RDT_RESOURCE_L2CODE: + hw_res = &rdt_resources_all[RDT_RESOURCE_L2]; + remapped = true; + break; + default: + hw_res = resctrl_to_arch_res(r); + remapped = false; + } + + /* + * If we changed the resource, we need to search for the underlying + * domain. Doing this for all resources would make it tricky to add the + * first resource, as domains aren't added to a resource list until + * after the ctrlval arrays have been allocated. + */ + if (remapped) + underlying_domain = rdt_find_domain(&hw_res->r_resctrl, d->id, + NULL); + else + underlying_domain = d; + hw_dom = resctrl_to_arch_dom(underlying_domain); + + if (mba_sc) { + if (hw_dom->mbps_val) + return hw_dom->mbps_val; + return kmalloc_array(hw_res->num_closid, + sizeof(*hw_dom->mbps_val), GFP_KERNEL); + } else { + if (hw_dom->ctrl_val) + return hw_dom->ctrl_val; + return kmalloc_array(hw_res->num_closid, + sizeof(*hw_dom->ctrl_val), GFP_KERNEL); + } +} + static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); @@ -510,11 +561,11 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) struct msr_param m; u32 *dc, *dm; - dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val), GFP_KERNEL); + dc = alloc_ctrlval_array(r, d, false); if (!dc) return -ENOMEM; - dm = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->mbps_val), GFP_KERNEL); + dm = alloc_ctrlval_array(r, d, true); if (!dm) { kfree(dc); return -ENOMEM; @@ -673,8 +724,14 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) if (d->plr) d->plr->d = NULL; - kfree(hw_dom->ctrl_val); - kfree(hw_dom->mbps_val); + /* temporary: these four don't have a unique ctrlval array */ + if (r->rid != RDT_RESOURCE_L3CODE && + r->rid != RDT_RESOURCE_L3DATA && + r->rid != RDT_RESOURCE_L2CODE && + r->rid != RDT_RESOURCE_L2DATA) { + kfree(hw_dom->ctrl_val); + kfree(hw_dom->mbps_val); + } bitmap_free(d->rmid_busy_llc); kfree(d->mbm_total); kfree(d->mbm_local); -- Gitee From 61eb5cae69ef3b8f114c0f458aa297e0493aafe5 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:35 +0000 Subject: [PATCH 1314/2161] x86/resctrl: Remove rdt_cdp_peer_get() commit fbc06c69805976e1b5c7e6bd0b89c5b0f5282cdf upstream. When CDP is enabled, rdt_cdp_peer_get() finds the alternative CODE/DATA resource and returns the alternative domain. 
This is used to determine if bitmaps overlap when there are aliased entries in the two struct rdt_hw_resources. Now that the ctrl_val[] used by the CODE/DATA resources is the same, the search for an alternate resource/domain is not needed. Replace rdt_cdp_peer_get() with resctrl_peer_type(), which returns the alternative type. This can be passed to resctrl_arch_get_config() with the same resource and domain. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-23-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 99 ++++---------------------- 1 file changed, 14 insertions(+), 85 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 685b1cd9562e..a606ecdbc0ec 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1092,82 +1092,17 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of, return 0; } -/** - * rdt_cdp_peer_get - Retrieve CDP peer if it exists - * @r: RDT resource to which RDT domain @d belongs - * @d: Cache instance for which a CDP peer is requested - * @r_cdp: RDT resource that shares hardware with @r (RDT resource peer) - * Used to return the result. - * @d_cdp: RDT domain that shares hardware with @d (RDT domain peer) - * Used to return the result. - * @peer_type: The CDP configuration type of the peer resource. - * - * RDT resources are managed independently and by extension the RDT domains - * (RDT resource instances) are managed independently also. The Code and - * Data Prioritization (CDP) RDT resources, while managed independently, - * could refer to the same underlying hardware. For example, - * RDT_RESOURCE_L2CODE and RDT_RESOURCE_L2DATA both refer to the L2 cache. - * - * When provided with an RDT resource @r and an instance of that RDT - * resource @d rdt_cdp_peer_get() will return if there is a peer RDT - * resource and the exact instance that shares the same hardware. - * - * Return: 0 if a CDP peer was found, <0 on error or if no CDP peer exists. - * If a CDP peer was found, @r_cdp will point to the peer RDT resource - * and @d_cdp will point to the peer RDT domain. - */ -static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d, - struct rdt_resource **r_cdp, - struct rdt_domain **d_cdp, - enum resctrl_conf_type *peer_type) +static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) { - struct rdt_resource *_r_cdp = NULL; - struct rdt_domain *_d_cdp = NULL; - int ret = 0; - - switch (r->rid) { - case RDT_RESOURCE_L3DATA: - _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3CODE].r_resctrl; - *peer_type = CDP_CODE; - break; - case RDT_RESOURCE_L3CODE: - _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3DATA].r_resctrl; - *peer_type = CDP_DATA; - break; - case RDT_RESOURCE_L2DATA: - _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2CODE].r_resctrl; - *peer_type = CDP_CODE; - break; - case RDT_RESOURCE_L2CODE: - _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2DATA].r_resctrl; - *peer_type = CDP_DATA; - break; + switch (my_type) { + case CDP_CODE: + return CDP_DATA; + case CDP_DATA: + return CDP_CODE; default: - ret = -ENOENT; - goto out; - } - - /* - * When a new CPU comes online and CDP is enabled then the new - * RDT domains (if any) associated with both CDP RDT resources - * are added in the same CPU online routine while the - * rdtgroup_mutex is held. 
It should thus not happen for one - * RDT domain to exist and be associated with its RDT CDP - * resource but there is no RDT domain associated with the - * peer RDT CDP resource. Hence the WARN. - */ - _d_cdp = rdt_find_domain(_r_cdp, d->id, NULL); - if (WARN_ON(IS_ERR_OR_NULL(_d_cdp))) { - _r_cdp = NULL; - _d_cdp = NULL; - ret = -EINVAL; + case CDP_NONE: + return CDP_NONE; } - -out: - *r_cdp = _r_cdp; - *d_cdp = _d_cdp; - - return ret; } /** @@ -1248,19 +1183,16 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, unsigned long cbm, int closid, bool exclusive) { - enum resctrl_conf_type peer_type; + enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); struct rdt_resource *r = s->res; - struct rdt_resource *r_cdp; - struct rdt_domain *d_cdp; if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type, exclusive)) return true; - if (rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp, &peer_type) < 0) + if (!resctrl_arch_get_cdp_enabled(r->rid)) return false; - - return __rdtgroup_cbm_overlaps(r_cdp, d_cdp, cbm, closid, peer_type, exclusive); + return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive); } /** @@ -2761,11 +2693,9 @@ static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, u32 closid) { + enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); enum resctrl_conf_type t = s->conf_type; - struct rdt_resource *r_cdp = NULL; struct resctrl_staged_config *cfg; - enum resctrl_conf_type peer_type; - struct rdt_domain *d_cdp = NULL; struct rdt_resource *r = s->res; u32 used_b = 0, unused_b = 0; unsigned long tmp_cbm; @@ -2773,7 +2703,6 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, u32 peer_ctl, ctrl_val; int i; - rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp, &peer_type); cfg = &d->staged_config[t]; cfg->have_new_ctrl = false; cfg->new_ctrl = r->cache.shareable_bits; @@ -2793,8 +2722,8 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, * usage to ensure there is no overlap * with an exclusive group. */ - if (d_cdp) - resctrl_arch_get_config(r_cdp, d_cdp, i, peer_type, &peer_ctl); + if (resctrl_arch_get_cdp_enabled(r->rid)) + resctrl_arch_get_config(r, d, i, peer_type, &peer_ctl); else peer_ctl = 0; resctrl_arch_get_config(r, d, i, s->conf_type, &ctrl_val); -- Gitee From 2c89b86dcc7da19b7039b2fc7657c454e6228123 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:36 +0000 Subject: [PATCH 1315/2161] x86/resctrl: Expand resctrl_arch_update_domains()'s msr_param range commit 327364d5b6b6f8c89d2d6253a986d80323512890 upstream. resctrl_arch_update_domains() specifies the one closid that has been modified and needs copying to the hardware. resctrl_arch_update_domains() takes a struct rdt_resource and a closid as arguments, but copies all the staged configurations for that closid into the ctrl_val[] array. resctrl_arch_update_domains() is called once per schema, but once the resources and domains are merged, the second call of a L2CODE/L2DATA pair will find no staged configurations, as they were previously applied. The msr_param of the first call only has one index, so would only have update the hardware for the last staged configuration. To avoid a second round of IPIs when changing L2CODE and L2DATA in one go, expand the range of the msr_param if multiple staged configurations are found. 
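As a stand-alone sketch of the range widening described above (hypothetical indexes and names; only the seed-then-min()/max() pattern mirrors the msr_param handling in the hunk that follows):

#include <stdio.h>

struct range {
	unsigned int low, high;   /* half-open window [low, high) */
	int valid;
};

/* Widen the window so it covers a newly staged index. */
static void cover(struct range *r, unsigned int idx)
{
	if (!r->valid) {
		r->low = idx;
		r->high = idx + 1;
		r->valid = 1;
	} else {
		if (idx < r->low)
			r->low = idx;
		if (idx + 1 > r->high)
			r->high = idx + 1;
	}
}

int main(void)
{
	struct range r = { 0, 0, 0 };
	unsigned int staged[] = { 5, 4 };   /* e.g. CODE slot then DATA slot */
	unsigned int i;

	for (i = 0; i < 2; i++)
		cover(&r, staged[i]);

	printf("write MSRs [%u, %u)\n", r.low, r.high);   /* prints [4, 6) */
	return 0;
}

With both configurations staged, a single pass over [low, high) covers them, which is what avoids the second round of IPIs.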
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-24-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 92d79c88b965..a487cf7ff04e 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -292,6 +292,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) return -ENOMEM; mba_sc = is_mba_sc(r); + msr_param.res = NULL; list_for_each_entry(d, &r->domains, list) { hw_dom = resctrl_to_arch_dom(d); for (t = 0; t < CDP_NUM_TYPES; t++) { @@ -303,9 +304,14 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) if (!apply_config(hw_dom, cfg, idx, cpu_mask, mba_sc)) continue; - msr_param.low = idx; - msr_param.high = msr_param.low + 1; - msr_param.res = r; + if (!msr_param.res) { + msr_param.low = idx; + msr_param.high = msr_param.low + 1; + msr_param.res = r; + } else { + msr_param.low = min(msr_param.low, idx); + msr_param.high = max(msr_param.high, idx + 1); + } } } -- Gitee From a5317eefd7f45932469195c755e4b9d40445bd8e Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 28 Jul 2021 17:06:37 +0000 Subject: [PATCH 1316/2161] x86/resctrl: Merge the CDP resources commit 5c3b63cdba441c6a530b974ff73b14161d96a0c3 upstream. resctrl uses struct rdt_resource to describe the available hardware resources. The domains of the CDP aliases share a single ctrl_val[] array. The only differences between the struct rdt_hw_resource aliases is the name and conf_type. The name from struct rdt_hw_resource is visible to user-space. To support another architecture, as many user-visible details should be handled in the filesystem parts of the code that is common to all architectures. The name and conf_type go together. Remove conf_type and the CDP aliases. When CDP is supported and enabled, schemata_list_create() can create two schemata using the single resource, generating the CODE/DATA suffix to the schema name itself. This allows the alloc_ctrlval_array() and complications around free()ing the ctrl_val arrays to be removed. 
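As a minimal sketch of the suffix handling described above (user-space C with an invented helper; only the "%s%s" construction of the CODE/DATA schema names mirrors schemata_list_add() in the hunks that follow):

#include <stdio.h>

static void add_schema(const char *res_name, const char *suffix)
{
	char name[16];

	snprintf(name, sizeof(name), "%s%s", res_name, suffix);
	printf("schema: %s\n", name);
}

int main(void)
{
	int cdp_enabled = 1;

	if (cdp_enabled) {
		/* One resource, two schemata. */
		add_schema("L3", "CODE");
		add_schema("L3", "DATA");
	} else {
		add_schema("L3", "");
	}
	return 0;
}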
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lkml.kernel.org/r/20210728170637.25610-25-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 178 ++----------------------- arch/x86/kernel/cpu/resctrl/internal.h | 6 - arch/x86/kernel/cpu/resctrl/rdtgroup.c | 140 +++++++++---------- 3 files changed, 85 insertions(+), 239 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index a79a9efd4f1f..5eb1b58676b2 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -62,7 +62,6 @@ mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_hw_resource rdt_resources_all[] = { [RDT_RESOURCE_L3] = { - .conf_type = CDP_NONE, .r_resctrl = { .rid = RDT_RESOURCE_L3, .name = "L3", @@ -78,45 +77,8 @@ struct rdt_hw_resource rdt_resources_all[] = { .msr_base = MSR_IA32_L3_CBM_BASE, .msr_update = cat_wrmsr, }, - [RDT_RESOURCE_L3DATA] = - { - .conf_type = CDP_DATA, - .r_resctrl = { - .rid = RDT_RESOURCE_L3DATA, - .name = "L3DATA", - .cache_level = 3, - .cache = { - .min_cbm_bits = 1, - }, - .domains = domain_init(RDT_RESOURCE_L3DATA), - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, - }, - .msr_base = MSR_IA32_L3_CBM_BASE, - .msr_update = cat_wrmsr, - }, - [RDT_RESOURCE_L3CODE] = - { - .conf_type = CDP_CODE, - .r_resctrl = { - .rid = RDT_RESOURCE_L3CODE, - .name = "L3CODE", - .cache_level = 3, - .cache = { - .min_cbm_bits = 1, - }, - .domains = domain_init(RDT_RESOURCE_L3CODE), - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, - }, - .msr_base = MSR_IA32_L3_CBM_BASE, - .msr_update = cat_wrmsr, - }, [RDT_RESOURCE_L2] = { - .conf_type = CDP_NONE, .r_resctrl = { .rid = RDT_RESOURCE_L2, .name = "L2", @@ -132,45 +94,8 @@ struct rdt_hw_resource rdt_resources_all[] = { .msr_base = MSR_IA32_L2_CBM_BASE, .msr_update = cat_wrmsr, }, - [RDT_RESOURCE_L2DATA] = - { - .conf_type = CDP_DATA, - .r_resctrl = { - .rid = RDT_RESOURCE_L2DATA, - .name = "L2DATA", - .cache_level = 2, - .cache = { - .min_cbm_bits = 1, - }, - .domains = domain_init(RDT_RESOURCE_L2DATA), - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, - }, - .msr_base = MSR_IA32_L2_CBM_BASE, - .msr_update = cat_wrmsr, - }, - [RDT_RESOURCE_L2CODE] = - { - .conf_type = CDP_CODE, - .r_resctrl = { - .rid = RDT_RESOURCE_L2CODE, - .name = "L2CODE", - .cache_level = 2, - .cache = { - .min_cbm_bits = 1, - }, - .domains = domain_init(RDT_RESOURCE_L2CODE), - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, - }, - .msr_base = MSR_IA32_L2_CBM_BASE, - .msr_update = cat_wrmsr, - }, [RDT_RESOURCE_MBA] = { - .conf_type = CDP_NONE, .r_resctrl = { .rid = RDT_RESOURCE_MBA, .name = "MB", @@ -333,40 +258,24 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) r->alloc_enabled = true; } -static void rdt_get_cdp_config(int level, int type) +static void rdt_get_cdp_config(int level) { - struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl; - struct rdt_hw_resource *hw_res_l = resctrl_to_arch_res(r_l); - struct rdt_resource *r = &rdt_resources_all[type].r_resctrl; - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - - hw_res->num_closid = hw_res_l->num_closid; - r->cache.cbm_len = r_l->cache.cbm_len; - r->default_ctrl = r_l->default_ctrl; - r->cache.shareable_bits = 
r_l->cache.shareable_bits; - r->data_width = (r->cache.cbm_len + 3) / 4; - r->alloc_capable = true; /* * By default, CDP is disabled. CDP can be enabled by mount parameter * "cdp" during resctrl file system mount time. */ - r->alloc_enabled = false; rdt_resources_all[level].cdp_enabled = false; - rdt_resources_all[type].cdp_enabled = false; - r_l->cdp_capable = true; - r->cdp_capable = true; + rdt_resources_all[level].r_resctrl.cdp_capable = true; } static void rdt_get_cdp_l3_config(void) { - rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA); - rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3CODE); + rdt_get_cdp_config(RDT_RESOURCE_L3); } static void rdt_get_cdp_l2_config(void) { - rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA); - rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2CODE); + rdt_get_cdp_config(RDT_RESOURCE_L2); } static void @@ -503,57 +412,6 @@ void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) } } -static u32 *alloc_ctrlval_array(struct rdt_resource *r, struct rdt_domain *d, - bool mba_sc) -{ - /* these are for the underlying hardware, they may not match r/d */ - struct rdt_domain *underlying_domain; - struct rdt_hw_resource *hw_res; - struct rdt_hw_domain *hw_dom; - bool remapped; - - switch (r->rid) { - case RDT_RESOURCE_L3DATA: - case RDT_RESOURCE_L3CODE: - hw_res = &rdt_resources_all[RDT_RESOURCE_L3]; - remapped = true; - break; - case RDT_RESOURCE_L2DATA: - case RDT_RESOURCE_L2CODE: - hw_res = &rdt_resources_all[RDT_RESOURCE_L2]; - remapped = true; - break; - default: - hw_res = resctrl_to_arch_res(r); - remapped = false; - } - - /* - * If we changed the resource, we need to search for the underlying - * domain. Doing this for all resources would make it tricky to add the - * first resource, as domains aren't added to a resource list until - * after the ctrlval arrays have been allocated. 
- */ - if (remapped) - underlying_domain = rdt_find_domain(&hw_res->r_resctrl, d->id, - NULL); - else - underlying_domain = d; - hw_dom = resctrl_to_arch_dom(underlying_domain); - - if (mba_sc) { - if (hw_dom->mbps_val) - return hw_dom->mbps_val; - return kmalloc_array(hw_res->num_closid, - sizeof(*hw_dom->mbps_val), GFP_KERNEL); - } else { - if (hw_dom->ctrl_val) - return hw_dom->ctrl_val; - return kmalloc_array(hw_res->num_closid, - sizeof(*hw_dom->ctrl_val), GFP_KERNEL); - } -} - static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); @@ -561,11 +419,13 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) struct msr_param m; u32 *dc, *dm; - dc = alloc_ctrlval_array(r, d, false); + dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val), + GFP_KERNEL); if (!dc) return -ENOMEM; - dm = alloc_ctrlval_array(r, d, true); + dm = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->mbps_val), + GFP_KERNEL); if (!dm) { kfree(dc); return -ENOMEM; @@ -724,14 +584,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) if (d->plr) d->plr->d = NULL; - /* temporary: these four don't have a unique ctrlval array */ - if (r->rid != RDT_RESOURCE_L3CODE && - r->rid != RDT_RESOURCE_L3DATA && - r->rid != RDT_RESOURCE_L2CODE && - r->rid != RDT_RESOURCE_L2DATA) { - kfree(hw_dom->ctrl_val); - kfree(hw_dom->mbps_val); - } + kfree(hw_dom->ctrl_val); + kfree(hw_dom->mbps_val); bitmap_free(d->rmid_busy_llc); kfree(d->mbm_total); kfree(d->mbm_local); @@ -1011,11 +865,7 @@ static __init void rdt_init_res_defs_intel(void) hw_res = resctrl_to_arch_res(r); if (r->rid == RDT_RESOURCE_L3 || - r->rid == RDT_RESOURCE_L3DATA || - r->rid == RDT_RESOURCE_L3CODE || - r->rid == RDT_RESOURCE_L2 || - r->rid == RDT_RESOURCE_L2DATA || - r->rid == RDT_RESOURCE_L2CODE) { + r->rid == RDT_RESOURCE_L2) { r->cache.arch_has_sparse_bitmaps = false; r->cache.arch_has_empty_bitmaps = false; r->cache.arch_has_per_cpu_cfg = false; @@ -1035,11 +885,7 @@ static __init void rdt_init_res_defs_amd(void) hw_res = resctrl_to_arch_res(r); if (r->rid == RDT_RESOURCE_L3 || - r->rid == RDT_RESOURCE_L3DATA || - r->rid == RDT_RESOURCE_L3CODE || - r->rid == RDT_RESOURCE_L2 || - r->rid == RDT_RESOURCE_L2DATA || - r->rid == RDT_RESOURCE_L2CODE) { + r->rid == RDT_RESOURCE_L2) { r->cache.arch_has_sparse_bitmaps = true; r->cache.arch_has_empty_bitmaps = true; r->cache.arch_has_per_cpu_cfg = true; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index ef43f83de009..2f92015bf129 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -370,7 +370,6 @@ struct rdt_parse_data { /** * struct rdt_hw_resource - arch private attributes of a resctrl resource - * @conf_type: The type that should be used when configuring. temporary * @r_resctrl: Attributes of the resource used directly by resctrl. * @num_closid: Maximum number of closid this hardware can support, * regardless of CDP. 
This is exposed via @@ -384,7 +383,6 @@ struct rdt_parse_data { * @cdp_enabled: CDP state of this resource */ struct rdt_hw_resource { - enum resctrl_conf_type conf_type; struct rdt_resource r_resctrl; u32 num_closid; unsigned int msr_base; @@ -415,11 +413,7 @@ extern struct dentry *debugfs_resctrl; enum resctrl_res_level { RDT_RESOURCE_L3, - RDT_RESOURCE_L3DATA, - RDT_RESOURCE_L3CODE, RDT_RESOURCE_L2, - RDT_RESOURCE_L2DATA, - RDT_RESOURCE_L2CODE, RDT_RESOURCE_MBA, /* Must be the last */ diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index a606ecdbc0ec..4bae879e2812 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1880,10 +1880,10 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r) if (!r->cdp_capable) return; - if (r == &rdt_resources_all[RDT_RESOURCE_L2DATA].r_resctrl) + if (r->rid == RDT_RESOURCE_L2) l2_qos_cfg_update(&hw_res->cdp_enabled); - if (r == &rdt_resources_all[RDT_RESOURCE_L3DATA].r_resctrl) + if (r->rid == RDT_RESOURCE_L3) l3_qos_cfg_update(&hw_res->cdp_enabled); } @@ -1912,68 +1912,42 @@ static int set_mba_sc(bool mba_sc) return 0; } -static int cdp_enable(int level, int data_type, int code_type) +static int cdp_enable(int level) { - struct rdt_resource *r_ldata = &rdt_resources_all[data_type].r_resctrl; - struct rdt_resource *r_lcode = &rdt_resources_all[code_type].r_resctrl; struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl; int ret; - if (!r_l->alloc_capable || !r_ldata->alloc_capable || - !r_lcode->alloc_capable) + if (!r_l->alloc_capable) return -EINVAL; ret = set_cache_qos_cfg(level, true); - if (!ret) { - r_l->alloc_enabled = false; - r_ldata->alloc_enabled = true; - r_lcode->alloc_enabled = true; + if (!ret) rdt_resources_all[level].cdp_enabled = true; - rdt_resources_all[data_type].cdp_enabled = true; - rdt_resources_all[code_type].cdp_enabled = true; - } + return ret; } -static void cdp_disable(int level, int data_type, int code_type) +static void cdp_disable(int level) { struct rdt_hw_resource *r_hw = &rdt_resources_all[level]; - struct rdt_resource *r = &r_hw->r_resctrl; - - r->alloc_enabled = r->alloc_capable; if (r_hw->cdp_enabled) { - rdt_resources_all[data_type].r_resctrl.alloc_enabled = false; - rdt_resources_all[code_type].r_resctrl.alloc_enabled = false; set_cache_qos_cfg(level, false); r_hw->cdp_enabled = false; - rdt_resources_all[data_type].cdp_enabled = false; - rdt_resources_all[code_type].cdp_enabled = false; } } int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable) { struct rdt_hw_resource *hw_res = &rdt_resources_all[l]; - enum resctrl_res_level code_type, data_type; if (!hw_res->r_resctrl.cdp_capable) return -EINVAL; - if (l == RDT_RESOURCE_L3) { - code_type = RDT_RESOURCE_L3CODE; - data_type = RDT_RESOURCE_L3DATA; - } else if (l == RDT_RESOURCE_L2) { - code_type = RDT_RESOURCE_L2CODE; - data_type = RDT_RESOURCE_L2DATA; - } else { - return -EINVAL; - } - if (enable) - return cdp_enable(l, data_type, code_type); + return cdp_enable(l); - cdp_disable(l, data_type, code_type); + cdp_disable(l); return 0; } @@ -2072,50 +2046,82 @@ static int rdt_enable_ctx(struct rdt_fs_context *ctx) return ret; } -static int schemata_list_create(void) +static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type) { struct resctrl_schema *s; - struct rdt_resource *r; + const char *suffix = ""; int ret, cl; - for_each_alloc_enabled_rdt_resource(r) { - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; 
- - s->res = r; - s->conf_type = resctrl_to_arch_res(r)->conf_type; - s->num_closid = resctrl_arch_get_num_closid(r); - if (resctrl_arch_get_cdp_enabled(r->rid)) - s->num_closid /= 2; - - ret = snprintf(s->name, sizeof(s->name), r->name); - if (ret >= sizeof(s->name)) { - kfree(s); - return -EINVAL; - } - - cl = strlen(s->name); + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; - /* - * If CDP is supported by this resource, but not enabled, - * include the suffix. This ensures the tabular format of the - * schemata file does not change between mounts of the - * filesystem. - */ - if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) - cl += 4; + s->res = r; + s->num_closid = resctrl_arch_get_num_closid(r); + if (resctrl_arch_get_cdp_enabled(r->rid)) + s->num_closid /= 2; - if (cl > max_name_width) - max_name_width = cl; + s->conf_type = type; + switch (type) { + case CDP_CODE: + suffix = "CODE"; + break; + case CDP_DATA: + suffix = "DATA"; + break; + case CDP_NONE: + suffix = ""; + break; + } - INIT_LIST_HEAD(&s->list); - list_add(&s->list, &resctrl_schema_all); + ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix); + if (ret >= sizeof(s->name)) { + kfree(s); + return -EINVAL; } + cl = strlen(s->name); + + /* + * If CDP is supported by this resource, but not enabled, + * include the suffix. This ensures the tabular format of the + * schemata file does not change between mounts of the filesystem. + */ + if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) + cl += 4; + + if (cl > max_name_width) + max_name_width = cl; + + INIT_LIST_HEAD(&s->list); + list_add(&s->list, &resctrl_schema_all); + return 0; } +static int schemata_list_create(void) +{ + struct rdt_resource *r; + int ret = 0; + + for_each_alloc_enabled_rdt_resource(r) { + if (resctrl_arch_get_cdp_enabled(r->rid)) { + ret = schemata_list_add(r, CDP_CODE); + if (ret) + break; + + ret = schemata_list_add(r, CDP_DATA); + } else { + ret = schemata_list_add(r, CDP_NONE); + } + + if (ret) + break; + } + + return ret; +} + static void schemata_list_destroy(void) { struct resctrl_schema *s, *tmp; -- Gitee From 77055d5bdcd1c88f8a5ce924470f8dd550cf5493 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 11 Aug 2021 16:38:31 +0000 Subject: [PATCH 1317/2161] x86/resctrl: Make resctrl_arch_get_config() return its value commit 111136e69c9df50c3ca7d4e3977344b8a2d0d947 upstream. resctrl_arch_get_config() has no return, but does pass a single value back via one of its arguments. Return the value instead. 
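As a before/after sketch of the calling-convention change described above (hypothetical table and function names; only the out-parameter versus plain-return shape is the point):

#include <stdio.h>

/* Old style: the single value is passed back via an out-parameter. */
static void get_config_old(unsigned int idx, unsigned int *value)
{
	static const unsigned int ctrl_val[] = { 0xff, 0x0f, 0xf0 };

	*value = ctrl_val[idx];
}

/* New style: only one value comes back, so just return it. */
static unsigned int get_config_new(unsigned int idx)
{
	static const unsigned int ctrl_val[] = { 0xff, 0x0f, 0xf0 };

	return ctrl_val[idx];
}

int main(void)
{
	unsigned int v;

	get_config_old(1, &v);
	printf("old: %#x, new: %#x\n", v, get_config_new(1));
	return 0;
}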
Suggested-by: Borislav Petkov Signed-off-by: James Morse Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210811163831.14917-1-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 13 ++++++------- arch/x86/kernel/cpu/resctrl/monitor.c | 2 +- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 20 ++++++++++++-------- include/linux/resctrl.h | 5 ++--- 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index a487cf7ff04e..87666275eed9 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -427,16 +427,15 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, return ret ?: nbytes; } -void resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, - u32 closid, enum resctrl_conf_type type, u32 *value) +u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, enum resctrl_conf_type type) { struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); u32 idx = get_config_index(closid, type); if (!is_mba_sc(r)) - *value = hw_dom->ctrl_val[idx]; - else - *value = hw_dom->mbps_val[idx]; + return hw_dom->ctrl_val[idx]; + return hw_dom->mbps_val[idx]; } static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) @@ -451,8 +450,8 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo if (sep) seq_puts(s, ";"); - resctrl_arch_get_config(r, dom, closid, schema->conf_type, - &ctrl_val); + ctrl_val = resctrl_arch_get_config(r, dom, closid, + schema->conf_type); seq_printf(s, r->format_str, dom->id, max_data_width, ctrl_val); sep = true; diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 082291c9197d..e9f78eafe635 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -442,7 +442,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) hw_dom_mba = resctrl_to_arch_dom(dom_mba); cur_bw = pmbm_data->prev_bw; - resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE, &user_bw); + user_bw = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); delta_bw = pmbm_data->delta_bw; /* * resctrl_arch_get_config() chooses the mbps/ctrl value to return diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 4bae879e2812..60f0dc5e955a 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -923,8 +923,8 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, for (i = 0; i < closids_supported(); i++) { if (!closid_allocated(i)) continue; - resctrl_arch_get_config(r, dom, i, s->conf_type, - &ctrl_val); + ctrl_val = resctrl_arch_get_config(r, dom, i, + s->conf_type); mode = rdtgroup_mode_by_closid(i); switch (mode) { case RDT_MODE_SHAREABLE: @@ -1142,7 +1142,7 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d /* Check for overlap with other resource groups */ for (i = 0; i < closids_supported(); i++) { - resctrl_arch_get_config(r, d, i, type, (u32 *)&ctrl_b); + ctrl_b = resctrl_arch_get_config(r, d, i, type); mode = rdtgroup_mode_by_closid(i); if (closid_allocated(i) && i != closid && mode != RDT_MODE_PSEUDO_LOCKSETUP) { @@ -1222,7 +1222,8 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) continue; has_cache = true; list_for_each_entry(d, 
&r->domains, list) { - resctrl_arch_get_config(r, d, closid, s->conf_type, &ctrl); + ctrl = resctrl_arch_get_config(r, d, closid, + s->conf_type); if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { rdt_last_cmd_puts("Schemata overlaps\n"); return false; @@ -1395,8 +1396,9 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { size = 0; } else { - resctrl_arch_get_config(r, d, rdtgrp->closid, - schema->conf_type, &ctrl); + ctrl = resctrl_arch_get_config(r, d, + rdtgrp->closid, + schema->conf_type); if (r->rid == RDT_RESOURCE_MBA) size = ctrl; else @@ -2729,10 +2731,12 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, * with an exclusive group. */ if (resctrl_arch_get_cdp_enabled(r->rid)) - resctrl_arch_get_config(r, d, i, peer_type, &peer_ctl); + peer_ctl = resctrl_arch_get_config(r, d, i, + peer_type); else peer_ctl = 0; - resctrl_arch_get_config(r, d, i, s->conf_type, &ctrl_val); + ctrl_val = resctrl_arch_get_config(r, d, i, + s->conf_type); used_b |= ctrl_val | peer_ctl; if (mode == RDT_MODE_SHAREABLE) cfg->new_ctrl |= ctrl_val | peer_ctl; diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 18dd764af0dd..21deb5212bbd 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -194,8 +194,7 @@ struct resctrl_schema { /* The number of closid supported by this resource regardless of CDP */ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); -void resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, - u32 closid, enum resctrl_conf_type type, - u32 *value); +u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, enum resctrl_conf_type type); #endif /* _RESCTRL_H */ -- Gitee From 5791d5a076cd535b188ebf9fade12b7a94e2b8cb Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 17 Sep 2021 16:59:58 +0000 Subject: [PATCH 1318/2161] x86/resctrl: Free the ctrlval arrays when domain_setup_mon_state() fails commit 64e87d4bd3201bf8a4685083ee4daf5c0d001452 upstream. domain_add_cpu() is called whenever a CPU is brought online. The earlier call to domain_setup_ctrlval() allocates the control value arrays. If domain_setup_mon_state() fails, the control value arrays are not freed. Add the missing kfree() calls. Fixes: 1bd2a63b4f0de ("x86/intel_rdt/mba_sc: Add initialization support") Fixes: edf6fa1c4a951 ("x86/intel_rdt/cqm: Add RMID (Resource monitoring ID) management") Signed-off-by: James Morse Signed-off-by: Borislav Petkov Acked-by: Reinette Chatre Cc: Link: https://lkml.kernel.org/r/20210917165958.28313-1-james.morse@arm.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 5eb1b58676b2..5dab0ff55fff 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -526,6 +526,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) } if (r->mon_capable && domain_setup_mon_state(r, d)) { + kfree(d->ctrl_val); + kfree(d->mbps_val); kfree(d); return; } -- Gitee From 6ddd3b16c1913d6da885349b49eec8f183de8455 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 17 Sep 2021 16:59:58 +0000 Subject: [PATCH 1319/2161] x86/resctrl: Free the ctrlval arrays when domain_setup_mon_state() fails commit 64e87d4bd3201bf8a4685083ee4daf5c0d001452 upstream. 
domain_add_cpu() is called whenever a CPU is brought online. The earlier call to domain_setup_ctrlval() allocates the control value arrays. If domain_setup_mon_state() fails, the control value arrays are not freed. Add the missing kfree() calls. Fixes: 1bd2a63b4f0de ("x86/intel_rdt/mba_sc: Add initialization support") Fixes: edf6fa1c4a951 ("x86/intel_rdt/cqm: Add RMID (Resource monitoring ID) management") Signed-off-by: James Morse Signed-off-by: Borislav Petkov Acked-by: Reinette Chatre Cc: Link: https://lkml.kernel.org/r/20210917165958.28313-1-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 5dab0ff55fff..2b2a9b4f3d4c 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -526,8 +526,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) } if (r->mon_capable && domain_setup_mon_state(r, d)) { - kfree(d->ctrl_val); - kfree(d->mbps_val); + kfree(hw_dom->ctrl_val); + kfree(hw_dom->mbps_val); kfree(d); return; } -- Gitee From 0f7e33eb9ac6c2edf3e695e594098a614049a28e Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 17 Sep 2021 16:59:24 +0000 Subject: [PATCH 1320/2161] x86/resctrl: Fix kfree() of the wrong type in domain_add_cpu() commit d4ebfca26dfab2803c31d68a30225be31b2f9ecf upstream. Commit in Fixes separated the architecture specific and filesystem parts of the resctrl domain structures. This left the error paths in domain_add_cpu() kfree()ing the memory with the wrong type. This will cause a problem if someone adds a new member to struct rdt_hw_domain meaning d_resctrl is no longer the first member. Fixes: 792e0f6f789b ("x86/resctrl: Split struct rdt_domain") Signed-off-by: James Morse Signed-off-by: Borislav Petkov Acked-by: Reinette Chatre Link: https://lkml.kernel.org/r/20210917165924.28254-1-james.morse@arm.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 2b2a9b4f3d4c..53857855aa0e 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -521,14 +521,14 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) rdt_domain_reconfigure_cdp(r); if (r->alloc_capable && domain_setup_ctrlval(r, d)) { - kfree(d); + kfree(hw_dom); return; } if (r->mon_capable && domain_setup_mon_state(r, d)) { kfree(hw_dom->ctrl_val); kfree(hw_dom->mbps_val); - kfree(d); + kfree(hw_dom); return; } -- Gitee From 456e574f269be47f373b9c09a792ad178266c4a0 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 7 Dec 2021 22:37:35 +0000 Subject: [PATCH 1321/2161] x86/resctrl: Remove redundant assignment to variable chunks commit df0114f1f8711dbf481324c44cf5a8349130b913 upstream. The variable chunks is being shifted right and re-assinged the shifted value which is then returned. Since chunks is not being read afterwards the assignment is redundant and the >>= operator can be replaced with a shift >> operator instead. 
Signed-off-by: Colin Ian King Signed-off-by: Dave Hansen Reviewed-by: Fenghua Yu Link: https://lkml.kernel.org/r/20211207223735.35173-1-colin.i.king@gmail.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/resctrl/monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index e9f78eafe635..2a968ee87fb5 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -282,7 +282,7 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) u64 shift = 64 - width, chunks; chunks = (cur_msr << shift) - (prev_msr << shift); - return chunks >>= shift; + return chunks >> shift; } static int __mon_event_count(u32 rmid, struct rmid_read *rr) -- Gitee From d667389c395c8ef7944c3c69f246c0b053b4d530 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 22 Jul 2021 18:55:12 +0800 Subject: [PATCH 1322/2161] dma-debug: use memory_intersects() directly commit 1d7db834a027ecfb5ac93be2e8bb45bfcbbf6eaa upstream. There is already a memory_intersects() helper in sections.h, use memory_intersects() directly instead of private overlap(). Signed-off-by: Kefeng Wang Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/debug.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index cb6425e52bf7..b1a9a1eff8de 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1118,20 +1118,10 @@ static void check_for_stack(struct device *dev, } } -static inline bool overlap(void *addr, unsigned long len, void *start, void *end) -{ - unsigned long a1 = (unsigned long)addr; - unsigned long b1 = a1 + len; - unsigned long a2 = (unsigned long)start; - unsigned long b2 = (unsigned long)end; - - return !(b1 <= a2 || a1 >= b2); -} - static void check_for_illegal_area(struct device *dev, void *addr, unsigned long len) { - if (overlap(addr, len, _stext, _etext) || - overlap(addr, len, __start_rodata, __end_rodata)) + if (memory_intersects(_stext, _etext, addr, len) || + memory_intersects(__start_rodata, __end_rodata, addr, len)) err_printk(dev, NULL, "device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len); } -- Gitee From 0b7b509b4ece73945fb95c3c6e6dfb8b47181712 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 24 May 2022 21:06:08 +0800 Subject: [PATCH 1323/2161] dma-mapping: allow map_sg() ops to return negative error codes commit fffe3cc8c2194f60c4af4fac7f27d25e8828f001 upstream. Allow dma_map_sgtable() to pass errors from the map_sg() ops. This will be required for returning appropriate error codes when mapping P2PDMA memory. Introduce __dma_map_sg_attrs() which will return the raw error code from the map_sg operation (whether it be negative or zero). Then add a dma_map_sg_attrs() wrapper to convert any negative errors to zero to satisfy the existing calling convention. dma_map_sgtable() defines three error codes that .map_sg implementations are allowed to return: -EINVAL, -ENOMEM and -EIO. The latter of which is a generic return for cases that are passing DMA_MAPPING_ERROR through. dma_map_sgtable() will convert a zero error return for old map_sg() ops into a -EIO return and return any negative errors as reported. This allows map_sg implementations to start returning multiple negative error codes. Legacy map_sg implementations can continue to return zero until they are all converted. 
Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/dma-mapping.h | 40 +++++------------- kernel/dma/mapping.c | 82 +++++++++++++++++++++++++++++++++---- 2 files changed, 84 insertions(+), 38 deletions(-) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index b231ea32648f..7d1f46821f6e 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -99,8 +99,9 @@ struct dma_map_ops { size_t size, enum dma_data_direction dir, unsigned long attrs); /* - * map_sg returns 0 on error and a value > 0 on success. - * It should never return a value < 0. + * map_sg should return a negative error code on error. See + * dma_map_sgtable() for a list of appropriate error codes + * and their meanings. */ int (*map_sg)(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, @@ -233,6 +234,8 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); +int dma_map_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs); dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, @@ -290,6 +293,11 @@ static inline void dma_unmap_sg_attrs(struct device *dev, unsigned long attrs) { } +static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs) +{ + return -EOPNOTSUPP; +} static inline dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -425,34 +433,6 @@ static inline void dma_sync_single_range_for_device(struct device *dev, return dma_sync_single_for_device(dev, addr + offset, size, dir); } -/** - * dma_map_sgtable - Map the given buffer for DMA - * @dev: The device for which to perform the DMA operation - * @sgt: The sg_table object describing the buffer - * @dir: DMA direction - * @attrs: Optional DMA attributes for the map operation - * - * Maps a buffer described by a scatterlist stored in the given sg_table - * object for the @dir DMA operation by the @dev device. After success the - * ownership for the buffer is transferred to the DMA domain. One has to - * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the - * ownership of the buffer back to the CPU domain before touching the - * buffer by the CPU. - * - * Returns 0 on success or -EINVAL on error during mapping the buffer. - */ -static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt, - enum dma_data_direction dir, unsigned long attrs) -{ - int nents; - - nents = dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); - if (nents <= 0) - return -EINVAL; - sgt->nents = nents; - return 0; -} - /** * dma_unmap_sgtable - Unmap the given buffer for DMA * @dev: The device for which to perform the DMA operation diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index fa63e1da86f5..5b73e55a6bae 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -169,12 +169,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, } EXPORT_SYMBOL(dma_unmap_page_attrs); -/* - * dma_maps_sg_attrs returns 0 on error and > 0 on success. 
- * It should never return a value < 0. - */ -int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, unsigned long attrs) +static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); int ents; @@ -184,13 +180,83 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); else ents = ops->map_sg(dev, sg, nents, dir, attrs); - BUG_ON(ents < 0); - debug_dma_map_sg(dev, sg, nents, ents, dir); + + if (ents > 0) + debug_dma_map_sg(dev, sg, nents, ents, dir); + else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && + ents != -EIO && ents != 0)) + return -EIO; return ents; } + +/** + * dma_map_sg_attrs - Map the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sg: The sg_table object describing the buffer + * @dir: DMA direction + * @attrs: Optional DMA attributes for the map operation + * + * Maps a buffer described by a scatterlist passed in the sg argument with + * nents segments for the @dir DMA operation by the @dev device. + * + * Returns the number of mapped entries (which can be less than nents) + * on success. Zero is returned for any error. + * + * dma_unmap_sg_attrs() should be used to unmap the buffer with the + * original sg and original nents (not the value returned by this funciton). + */ +int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + int ret; + + ret = __dma_map_sg_attrs(dev, sg, nents, dir, attrs); + if (ret < 0) + return 0; + return ret; +} EXPORT_SYMBOL(dma_map_sg_attrs); +/** + * dma_map_sgtable - Map the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * @attrs: Optional DMA attributes for the map operation + * + * Maps a buffer described by a scatterlist stored in the given sg_table + * object for the @dir DMA operation by the @dev device. After success, the + * ownership for the buffer is transferred to the DMA domain. One has to + * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the + * ownership of the buffer back to the CPU domain before touching the + * buffer by the CPU. + * + * Returns 0 on success or a negative error code on error. The following + * error codes are supported with the given meaning: + * + * -EINVAL - An invalid argument, unaligned access or other error + * in usage. Will not succeed if retried. + * -ENOMEM - Insufficient resources (like memory or IOVA space) to + * complete the mapping. Should succeed if retried later. + * -EIO - Legacy error code with an unknown meaning. eg. this is + * returned if a lower level call returned DMA_MAPPING_ERROR. 
+ */ +int dma_map_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs) +{ + int nents; + + nents = __dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); + if (nents == 0) + return -EIO; + if (nents < 0) + return nents; + sgt->nents = nents; + return 0; +} +EXPORT_SYMBOL_GPL(dma_map_sgtable); + void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) -- Gitee From 3567bf71c1a50a0ccea600a87d1d7783831d0e37 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 29 Jul 2021 14:15:20 -0600 Subject: [PATCH 1324/2161] dma-direct: return appropriate error code from dma_direct_map_sg() commit c81be74e7d790f090d3c8a52e20a334dc2506a3f upstream. Now that the map_sg() op expects error codes instead of return zero on error, convert dma_direct_map_sg() to return an error code. Per the documentation for dma_map_sgtable(), -EIO is returned due to an DMA_MAPPING_ERROR with unknown cause. Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/direct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index c1198dd1b949..a4365f4ced00 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -414,7 +414,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, out_unmap: dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return 0; + return -EIO; } dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, -- Gitee From c976ee6b3cd2f35e68dca1058128579cb32ac4d4 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 24 May 2022 21:19:46 +0800 Subject: [PATCH 1325/2161] iommu: Mark __iommu_map_sg() as static commit 9930264fd997dad4c2cd8914cec7acab7c0ed984 upstream. Now __iommu_map_sg() is used only in iommu.c file, so mark it static. Signed-off-by: Baolin Wang Acked-by: Will Deacon Link: https://lore.kernel.org/r/ab722e9970739929738066777b8ee7930e32abd5.1591930156.git.baolin.wang@linux.alibaba.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d9e6776151c6..30f93ab32e8c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2532,9 +2532,9 @@ size_t iommu_unmap_fast(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_unmap_fast); -size_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova, - struct scatterlist *sg, unsigned int nents, int prot, - gfp_t gfp) +static size_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova, + struct scatterlist *sg, unsigned int nents, int prot, + gfp_t gfp) { const struct iommu_ops *ops = domain->ops; size_t len = 0, mapped = 0; -- Gitee From f00a5e24941a1f328b40a5d87dcfde3cf09bd8dd Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 25 May 2022 10:51:03 +0800 Subject: [PATCH 1326/2161] iommu: return full error code from iommu_map_sg[_atomic]() commit ad8f36e4b6b1c826a0daa5fda2c5839205b5aa8b upstream. Convert to ssize_t return code so the return code from __iommu_map() can be returned all the way down through dma_iommu_map_sg(). 
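As an aside, this is roughly what the updated convention looks like from a .map_sg implementation's side (hypothetical driver code, not part of this diff; example_map_sg() and example_hw_map() are invented stand-ins):

  #include <linux/dma-mapping.h>
  #include <linux/scatterlist.h>

  /* hypothetical low-level mapper standing in for real hardware setup */
  static dma_addr_t example_hw_map(struct device *dev, phys_addr_t phys,
                                   size_t len, enum dma_data_direction dir);

  static int example_map_sg(struct device *dev, struct scatterlist *sgl,
                            int nents, enum dma_data_direction dir,
                            unsigned long attrs)
  {
      struct scatterlist *s;
      int i;

      for_each_sg(sgl, s, nents, i) {
          sg_dma_address(s) = example_hw_map(dev, sg_phys(s), s->length, dir);
          if (sg_dma_address(s) == DMA_MAPPING_ERROR) {
              /* a real op would also unmap the i segments mapped so far */
              return -EIO;    /* opaque low-level failure */
          }
          sg_dma_len(s) = s->length;
      }
      return nents;
  }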
Signed-off-by: Logan Gunthorpe Cc: Joerg Roedel Cc: Will Deacon Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 15 +++++++-------- include/linux/iommu.h | 22 +++++++++++----------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 30f93ab32e8c..4ab9e38a76ce 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2532,9 +2532,9 @@ size_t iommu_unmap_fast(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_unmap_fast); -static size_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova, - struct scatterlist *sg, unsigned int nents, int prot, - gfp_t gfp) +static ssize_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova, + struct scatterlist *sg, unsigned int nents, int prot, + gfp_t gfp) { const struct iommu_ops *ops = domain->ops; size_t len = 0, mapped = 0; @@ -2575,19 +2575,18 @@ static size_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova, /* undo mappings already done */ iommu_unmap(domain, iova, mapped); - return 0; - + return ret; } -size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, - struct scatterlist *sg, unsigned int nents, int prot) +ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, + struct scatterlist *sg, unsigned int nents, int prot) { might_sleep(); return __iommu_map_sg(domain, iova, sg, nents, prot, GFP_KERNEL); } EXPORT_SYMBOL_GPL(iommu_map_sg); -size_t iommu_map_sg_atomic(struct iommu_domain *domain, unsigned long iova, +ssize_t iommu_map_sg_atomic(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, unsigned int nents, int prot) { return __iommu_map_sg(domain, iova, sg, nents, prot, GFP_ATOMIC); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 4972c1e20848..b33962b36042 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -470,11 +470,11 @@ extern size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, extern size_t iommu_unmap_fast(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); -extern size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, - struct scatterlist *sg,unsigned int nents, int prot); -extern size_t iommu_map_sg_atomic(struct iommu_domain *domain, - unsigned long iova, struct scatterlist *sg, - unsigned int nents, int prot); +extern ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, + struct scatterlist *sg, unsigned int nents, int prot); +extern ssize_t iommu_map_sg_atomic(struct iommu_domain *domain, + unsigned long iova, struct scatterlist *sg, + unsigned int nents, int prot); extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova); extern void iommu_set_fault_handler(struct iommu_domain *domain, iommu_fault_handler_t handler, void *token); @@ -754,18 +754,18 @@ static inline size_t iommu_unmap_fast(struct iommu_domain *domain, return 0; } -static inline size_t iommu_map_sg(struct iommu_domain *domain, - unsigned long iova, struct scatterlist *sg, - unsigned int nents, int prot) +static inline ssize_t iommu_map_sg(struct iommu_domain *domain, + unsigned long iova, struct scatterlist *sg, + unsigned int nents, int prot) { - return 0; + return -ENODEV; } -static inline size_t iommu_map_sg_atomic(struct iommu_domain *domain, +static inline ssize_t iommu_map_sg_atomic(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, unsigned int nents, int 
prot) { - return 0; + return -ENODEV; } static inline void iommu_flush_iotlb_all(struct iommu_domain *domain) -- Gitee From ee69dd7e716fc0a45023f117344d031692c842d5 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 6 Jan 2021 21:35:06 +0800 Subject: [PATCH 1327/2161] iova: Make has_iova_flush_queue() private commit 9cc0aaeb96e7f894d4735f069174948c1516fea7 upstream. Function has_iova_flush_queue() has no users outside iova.c, so make it private. Signed-off-by: John Garry Acked-by: Will Deacon Link: https://lore.kernel.org/r/1609940111-28563-2-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iova.c | 2 +- include/linux/iova.h | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index ff59d8aa3041..21fc92cee7e8 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -55,7 +55,7 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule, } EXPORT_SYMBOL_GPL(init_iova_domain); -bool has_iova_flush_queue(struct iova_domain *iovad) +static bool has_iova_flush_queue(struct iova_domain *iovad) { return !!iovad->fq; } diff --git a/include/linux/iova.h b/include/linux/iova.h index a0637abffee8..781b96ac706f 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -155,7 +155,6 @@ struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to); void init_iova_domain(struct iova_domain *iovad, unsigned long granule, unsigned long start_pfn); -bool has_iova_flush_queue(struct iova_domain *iovad); int init_iova_flush_queue(struct iova_domain *iovad, iova_flush_cb flush_cb, iova_entry_dtor entry_dtor); struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); @@ -236,11 +235,6 @@ static inline void init_iova_domain(struct iova_domain *iovad, { } -static inline bool has_iova_flush_queue(struct iova_domain *iovad) -{ - return false; -} - static inline int init_iova_flush_queue(struct iova_domain *iovad, iova_flush_cb flush_cb, iova_entry_dtor entry_dtor) -- Gitee From 594bd67023c346b134b7e494d3e1aac3352a70a4 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 25 May 2022 11:46:15 +0800 Subject: [PATCH 1328/2161] iova: Delete copy_reserved_iova() commit 622106190175dbac2b0b0ee7d4275c474e5fe051 upstream. Since commit c588072bba6b ("iommu/vt-d: Convert intel iommu driver to the iommu ops"), function copy_reserved_iova() is not referenced, so delete it. Signed-off-by: John Garry Acked-by: Will Deacon Link: https://lore.kernel.org/r/1609940111-28563-3-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iova.c | 30 ------------------------------ include/linux/iova.h | 6 ------ 2 files changed, 36 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 21fc92cee7e8..2fe1b61f2e52 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -712,36 +712,6 @@ reserve_iova(struct iova_domain *iovad, } EXPORT_SYMBOL_GPL(reserve_iova); -/** - * copy_reserved_iova - copies the reserved between domains - * @from: - source doamin from where to copy - * @to: - destination domin where to copy - * This function copies reserved iova's from one doamin to - * other. 
- */ -void -copy_reserved_iova(struct iova_domain *from, struct iova_domain *to) -{ - unsigned long flags; - struct rb_node *node; - - spin_lock_irqsave(&from->iova_rbtree_lock, flags); - for (node = rb_first(&from->rbroot); node; node = rb_next(node)) { - struct iova *iova = rb_entry(node, struct iova, node); - struct iova *new_iova; - - if (iova->pfn_lo == IOVA_ANCHOR) - continue; - - new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi); - if (!new_iova) - pr_err("Reserve iova range %lx@%lx failed\n", - iova->pfn_lo, iova->pfn_lo); - } - spin_unlock_irqrestore(&from->iova_rbtree_lock, flags); -} -EXPORT_SYMBOL_GPL(copy_reserved_iova); - struct iova * split_and_remove_iova(struct iova_domain *iovad, struct iova *iova, unsigned long pfn_lo, unsigned long pfn_hi) diff --git a/include/linux/iova.h b/include/linux/iova.h index 781b96ac706f..5e20ff08a435 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -152,7 +152,6 @@ unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn, bool flush_rcache); struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, unsigned long pfn_hi); -void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to); void init_iova_domain(struct iova_domain *iovad, unsigned long granule, unsigned long start_pfn); int init_iova_flush_queue(struct iova_domain *iovad, @@ -224,11 +223,6 @@ static inline struct iova *reserve_iova(struct iova_domain *iovad, return NULL; } -static inline void copy_reserved_iova(struct iova_domain *from, - struct iova_domain *to) -{ -} - static inline void init_iova_domain(struct iova_domain *iovad, unsigned long granule, unsigned long start_pfn) -- Gitee From 6cbaf894b0a1887df720665adc8b8d545065bc9c Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 25 May 2022 11:53:05 +0800 Subject: [PATCH 1329/2161] iova: Stop exporting some more functions commit 2cf7dbff0a955f546a1d2c132b94f9d5b837b714 upstream. The following functions are not referenced outside dma-iommu.c (and iova.c), which can only be built-in: - init_iova_flush_queue() - free_iova_fast() - queue_iova() - alloc_iova_fast() So stop exporting them. Signed-off-by: John Garry Acked-by: Will Deacon Link: https://lore.kernel.org/r/1609940111-28563-4-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iova.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 2fe1b61f2e52..de4e31e05eab 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -112,7 +112,6 @@ int init_iova_flush_queue(struct iova_domain *iovad, return 0; } -EXPORT_SYMBOL_GPL(init_iova_flush_queue); static struct rb_node * __get_cached_rbnode(struct iova_domain *iovad, unsigned long limit_pfn) @@ -453,7 +452,6 @@ alloc_iova_fast(struct iova_domain *iovad, unsigned long size, return new_iova->pfn_lo; } -EXPORT_SYMBOL_GPL(alloc_iova_fast); /** * free_iova_fast - free iova pfn range into rcache @@ -600,7 +598,6 @@ void queue_iova(struct iova_domain *iovad, mod_timer(&iovad->fq_timer, jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT)); } -EXPORT_SYMBOL_GPL(queue_iova); /** * put_iova_domain - destroys the iova doamin -- Gitee From 3d423b549c696ac0773134590fd2e761dcf7db6e Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 6 Jan 2021 21:35:09 +0800 Subject: [PATCH 1330/2161] iommu: Stop exporting iommu_map_sg_atomic() commit b91910a83d041d87115068c773438575d8279534 upstream. 
Function iommu_map_sg_atomic() is only referenced in dma-iommu.c, which can only be built-in, so stop exporting. Signed-off-by: John Garry Acked-by: Will Deacon Link: https://lore.kernel.org/r/1609940111-28563-5-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 4ab9e38a76ce..b99df2c045ee 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2591,7 +2591,6 @@ ssize_t iommu_map_sg_atomic(struct iommu_domain *domain, unsigned long iova, { return __iommu_map_sg(domain, iova, sg, nents, prot, GFP_ATOMIC); } -EXPORT_SYMBOL_GPL(iommu_map_sg_atomic); int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t paddr, u64 size, int prot) -- Gitee From 240769b63604886b7ba1a8175c21c0e95009a472 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 6 Jan 2021 21:35:10 +0800 Subject: [PATCH 1331/2161] iommu: Delete iommu_domain_window_disable() commit ab0a7119ba67be9e377b195d2b9baa9fb8b3b53e upstream. Function iommu_domain_window_disable() is not referenced in the tree, so delete it. Signed-off-by: John Garry Acked-by: Will Deacon Link: https://lore.kernel.org/r/1609940111-28563-6-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 9 --------- include/linux/iommu.h | 6 ------ 2 files changed, 15 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b99df2c045ee..8b696b9a70a7 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2603,15 +2603,6 @@ int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, } EXPORT_SYMBOL_GPL(iommu_domain_window_enable); -void iommu_domain_window_disable(struct iommu_domain *domain, u32 wnd_nr) -{ - if (unlikely(domain->ops->domain_window_disable == NULL)) - return; - - return domain->ops->domain_window_disable(domain, wnd_nr); -} -EXPORT_SYMBOL_GPL(iommu_domain_window_disable); - /** * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework * @domain: the iommu domain where the fault has happened diff --git a/include/linux/iommu.h b/include/linux/iommu.h index b33962b36042..30fcc1e7a622 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -554,7 +554,6 @@ int iommu_enable_nesting(struct iommu_domain *domain); extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t offset, u64 size, int prot); -extern void iommu_domain_window_disable(struct iommu_domain *domain, u32 wnd_nr); extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags); @@ -784,11 +783,6 @@ static inline int iommu_domain_window_enable(struct iommu_domain *domain, return -ENODEV; } -static inline void iommu_domain_window_disable(struct iommu_domain *domain, - u32 wnd_nr) -{ -} - static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { return 0; -- Gitee From fdb07fa5232d917ea7281ffd10e4ebdf18614dc4 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 6 Jan 2021 21:35:11 +0800 Subject: [PATCH 1332/2161] iommu: Delete iommu_dev_has_feature() commit 262948f8ba573dc9c61650df8f23eaea7d43bc61 upstream. Function iommu_dev_has_feature() has never been referenced in the tree, and there does not appear to be anything coming soon to use it, so delete it. 
Signed-off-by: John Garry Acked-by: Will Deacon Link: https://lore.kernel.org/r/1609940111-28563-7-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 11 ----------- include/linux/iommu.h | 7 ------- 2 files changed, 18 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 8b696b9a70a7..1c750d3e2a40 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2868,17 +2868,6 @@ EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids); /* * Per device IOMMU features. */ -bool iommu_dev_has_feature(struct device *dev, enum iommu_dev_features feat) -{ - const struct iommu_ops *ops = dev->bus->iommu_ops; - - if (ops && ops->dev_has_feat) - return ops->dev_has_feat(dev, feat); - - return false; -} -EXPORT_SYMBOL_GPL(iommu_dev_has_feature); - int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat) { if (dev->iommu && dev->iommu->iommu_dev) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 30fcc1e7a622..bf1a5866b2d6 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -664,7 +664,6 @@ static inline void dev_iommu_priv_set(struct device *dev, void *priv) int iommu_probe_device(struct device *dev); void iommu_release_device(struct device *dev); -bool iommu_dev_has_feature(struct device *dev, enum iommu_dev_features f); int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f); int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features f); bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features f); @@ -1013,12 +1012,6 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode) return NULL; } -static inline bool -iommu_dev_has_feature(struct device *dev, enum iommu_dev_features feat) -{ - return false; -} - static inline bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features feat) { -- Gitee From c221264e9e9d4dfcefe86ac23a35991e65e4b951 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 25 May 2022 12:09:17 +0800 Subject: [PATCH 1333/2161] dma-iommu: use static-key to minimize the impact in the fast-path commit a8e8af35c9f4f75f981a95488c7066d31bac4bef upstream. Let's move out the is_kdump_kernel() check from iommu_dma_deferred_attach() to iommu_dma_init(), and use the static-key in the fast-path to minimize the impact in the normal case. 
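The static-key mechanics follow this generic pattern (illustrative sketch only; the example_* names are hypothetical):

  #include <linux/jump_label.h>
  #include <linux/crash_dump.h>
  #include <linux/init.h>

  static DEFINE_STATIC_KEY_FALSE(example_slow_path_enabled);

  static void example_do_slow_work(void);     /* hypothetical helper */

  static int __init example_init(void)
  {
      if (is_kdump_kernel())                  /* decided once at boot */
          static_branch_enable(&example_slow_path_enabled);
      return 0;
  }

  static void example_hot_path(void)
  {
      if (static_branch_unlikely(&example_slow_path_enabled))
          example_do_slow_work();
  }

Because the branch site is patched at runtime, the check costs roughly a no-op in the common case where the key is never enabled, which is what makes it acceptable in the DMA fast path.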
Co-developed-by: Robin Murphy Signed-off-by: Lianbo Jiang Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/20210126115337.20068-2-lijiang@redhat.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 9389aeb7660d..86cb49ee65fd 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -50,6 +50,7 @@ struct iommu_dma_cookie { struct iommu_domain *fq_domain; }; +static DEFINE_STATIC_KEY_FALSE(iommu_deferred_attach_enabled); bool iommu_dma_forcedac __read_mostly; static int __init iommu_dma_forcedac_setup(char *str) @@ -372,9 +373,6 @@ static int iommu_dma_deferred_attach(struct device *dev, { const struct iommu_ops *ops = domain->ops; - if (!is_kdump_kernel()) - return 0; - if (unlikely(ops->is_attach_deferred && ops->is_attach_deferred(domain, dev))) return iommu_attach_device(domain, dev); @@ -498,7 +496,8 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, size_t iova_off = iova_offset(iovad, phys); dma_addr_t iova; - if (unlikely(iommu_dma_deferred_attach(dev, domain))) + if (static_branch_unlikely(&iommu_deferred_attach_enabled) && + iommu_dma_deferred_attach(dev, domain)) return DMA_MAPPING_ERROR; size = iova_align(iovad, size + iova_off); @@ -607,7 +606,8 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size, *dma_handle = DMA_MAPPING_ERROR; - if (unlikely(iommu_dma_deferred_attach(dev, domain))) + if (static_branch_unlikely(&iommu_deferred_attach_enabled) && + iommu_dma_deferred_attach(dev, domain)) return NULL; min_size = alloc_sizes & -alloc_sizes; @@ -852,7 +852,8 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, unsigned long mask = dma_get_seg_boundary(dev); int i; - if (unlikely(iommu_dma_deferred_attach(dev, domain))) + if (static_branch_unlikely(&iommu_deferred_attach_enabled) && + iommu_dma_deferred_attach(dev, domain)) return 0; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) @@ -1260,6 +1261,9 @@ void iommu_dma_compose_msi_msg(struct msi_desc *desc, static int iommu_dma_init(void) { + if (is_kdump_kernel()) + static_branch_enable(&iommu_deferred_attach_enabled); + return iova_cache_get(); } arch_initcall(iommu_dma_init); -- Gitee From 4ebf92abb1de8f3b38c935abc28bb037fe256e9f Mon Sep 17 00:00:00 2001 From: Lianbo Jiang Date: Tue, 26 Jan 2021 19:53:37 +0800 Subject: [PATCH 1334/2161] iommu: use the __iommu_attach_device() directly for deferred attach commit 3ab657291638ea267654c3e4798161b2cee6ae01 upstream. Currently, because domain attach allows to be deferred from iommu driver to device driver, and when iommu initializes, the devices on the bus will be scanned and the default groups will be allocated. Due to the above changes, some devices could be added to the same group as below: [ 3.859417] pci 0000:01:00.0: Adding to iommu group 16 [ 3.864572] pci 0000:01:00.1: Adding to iommu group 16 [ 3.869738] pci 0000:02:00.0: Adding to iommu group 17 [ 3.874892] pci 0000:02:00.1: Adding to iommu group 17 But when attaching these devices, it doesn't allow that a group has more than one device, otherwise it will return an error. This conflicts with the deferred attaching. Unfortunately, it has two devices in the same group for my side, for example: [ 9.627014] iommu_group_device_count(): device name[0]:0000:01:00.0 [ 9.633545] iommu_group_device_count(): device name[1]:0000:01:00.1 ... 
[ 10.255609] iommu_group_device_count(): device name[0]:0000:02:00.0 [ 10.262144] iommu_group_device_count(): device name[1]:0000:02:00.1 Finally, which caused the failure of tg3 driver when tg3 driver calls the dma_alloc_coherent() to allocate coherent memory in the tg3_test_dma(). [ 9.660310] tg3 0000:01:00.0: DMA engine test failed, aborting [ 9.754085] tg3: probe of 0000:01:00.0 failed with error -12 [ 9.997512] tg3 0000:01:00.1: DMA engine test failed, aborting [ 10.043053] tg3: probe of 0000:01:00.1 failed with error -12 [ 10.288905] tg3 0000:02:00.0: DMA engine test failed, aborting [ 10.334070] tg3: probe of 0000:02:00.0 failed with error -12 [ 10.578303] tg3 0000:02:00.1: DMA engine test failed, aborting [ 10.622629] tg3: probe of 0000:02:00.1 failed with error -12 In addition, the similar situations also occur in other drivers such as the bnxt_en driver. That can be reproduced easily in kdump kernel when SME is active. Let's move the handling currently in iommu_dma_deferred_attach() into the iommu core code so that it can call the __iommu_attach_device() directly instead of the iommu_attach_device(). The external interface iommu_attach_device() is not suitable for handling this situation. Signed-off-by: Lianbo Jiang Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/20210126115337.20068-3-lijiang@redhat.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 18 +++--------------- drivers/iommu/iommu.c | 10 ++++++++++ include/linux/iommu.h | 1 + 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 86cb49ee65fd..b22a7049bd60 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -368,18 +368,6 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, return iova_reserve_iommu_regions(dev, domain); } -static int iommu_dma_deferred_attach(struct device *dev, - struct iommu_domain *domain) -{ - const struct iommu_ops *ops = domain->ops; - - if (unlikely(ops->is_attach_deferred && - ops->is_attach_deferred(domain, dev))) - return iommu_attach_device(domain, dev); - - return 0; -} - /** * dma_info_to_prot - Translate DMA API directions and attributes to IOMMU API * page flags. 
@@ -497,7 +485,7 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, dma_addr_t iova; if (static_branch_unlikely(&iommu_deferred_attach_enabled) && - iommu_dma_deferred_attach(dev, domain)) + iommu_deferred_attach(dev, domain)) return DMA_MAPPING_ERROR; size = iova_align(iovad, size + iova_off); @@ -607,7 +595,7 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size, *dma_handle = DMA_MAPPING_ERROR; if (static_branch_unlikely(&iommu_deferred_attach_enabled) && - iommu_dma_deferred_attach(dev, domain)) + iommu_deferred_attach(dev, domain)) return NULL; min_size = alloc_sizes & -alloc_sizes; @@ -853,7 +841,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int i; if (static_branch_unlikely(&iommu_deferred_attach_enabled) && - iommu_dma_deferred_attach(dev, domain)) + iommu_deferred_attach(dev, domain)) return 0; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 1c750d3e2a40..98ee77889328 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1981,6 +1981,16 @@ int iommu_attach_device(struct iommu_domain *domain, struct device *dev) } EXPORT_SYMBOL_GPL(iommu_attach_device); +int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) +{ + const struct iommu_ops *ops = domain->ops; + + if (ops->is_attach_deferred && ops->is_attach_deferred(domain, dev)) + return __iommu_attach_device(domain, dev); + + return 0; +} + /* * Check flags and other user provided data for valid combinations. We also * make sure no reserved fields or unused flags are set. This is to ensure diff --git a/include/linux/iommu.h b/include/linux/iommu.h index bf1a5866b2d6..dcd19a19b62e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -406,6 +406,7 @@ int iommu_device_sysfs_add(struct iommu_device *iommu, void iommu_device_sysfs_remove(struct iommu_device *iommu); int iommu_device_link(struct iommu_device *iommu, struct device *link); void iommu_device_unlink(struct iommu_device *iommu, struct device *link); +int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain); static inline void iommu_device_set_ops(struct iommu_device *iommu, const struct iommu_ops *ops) -- Gitee From 8c360b5c8a38e4bbb828c38e264dcfdf3bff515a Mon Sep 17 00:00:00 2001 From: Tom Murphy Date: Tue, 24 Nov 2020 16:20:53 +0800 Subject: [PATCH 1335/2161] iommu: Allow the dma-iommu api to use bounce buffers commit 82612d66d51d3bacdd789e31d2e875d2494b7514 upstream. Allow the dma-iommu api to use bounce buffers for untrusted devices. This is a copy of the intel bounce buffer code. 
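One detail worth noting in the hunk below: the bounce decision uses iova_offset(iovad, phys | org_size), which tests the start address and the length for granule alignment in a single comparison. A hedged sketch of the idea (illustrative only, not part of this diff):

  #include <linux/iova.h>

  static bool example_needs_bounce(struct iova_domain *iovad,
                                   phys_addr_t phys, size_t size)
  {
      /*
       * Non-zero if either the start or the length is not granule
       * aligned. With a 4 KiB granule:
       *   phys=0x1000, size=0x1000 -> 0     (aligned, no bounce)
       *   phys=0x1200, size=0x1000 -> 0x200 (bounce)
       *   phys=0x1000, size=0x0800 -> 0x800 (bounce)
       */
      return iova_offset(iovad, phys | size) != 0;
  }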
Co-developed-by: Lu Baolu Signed-off-by: Tom Murphy Signed-off-by: Lu Baolu Tested-by: Logan Gunthorpe Link: https://lore.kernel.org/r/20201124082057.2614359-4-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 162 +++++++++++++++++++++++++++++++++++--- 1 file changed, 149 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index b22a7049bd60..0a93093abb7e 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -21,9 +21,11 @@ #include #include #include +#include #include #include #include +#include struct iommu_dma_msi_page { struct list_head list; @@ -475,6 +477,31 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, iommu_dma_free_iova(cookie, dma_addr, size); } +static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + phys_addr_t phys; + + phys = iommu_iova_to_phys(domain, dma_addr); + if (WARN_ON(!phys)) + return; + + __iommu_dma_unmap(dev, dma_addr, size); + + if (unlikely(is_swiotlb_buffer(phys))) + swiotlb_tbl_unmap_single(dev, phys, size, + iova_align(iovad, size), dir, attrs); +} + +static bool dev_is_untrusted(struct device *dev) +{ + return dev_is_pci(dev) && to_pci_dev(dev)->untrusted; +} + static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, size_t size, int prot, dma_addr_t dma_mask) { @@ -501,6 +528,54 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, return iova + iova_off; } +static dma_addr_t __iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys, + size_t org_size, dma_addr_t dma_mask, bool coherent, + enum dma_data_direction dir, unsigned long attrs) +{ + int prot = dma_info_to_prot(dir, coherent, attrs); + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + size_t aligned_size = org_size; + void *padding_start; + size_t padding_size; + dma_addr_t iova; + + /* + * If both the physical buffer start address and size are + * page aligned, we don't need to use a bounce page. + */ + if (IS_ENABLED(CONFIG_SWIOTLB) && dev_is_untrusted(dev) && + iova_offset(iovad, phys | org_size)) { + aligned_size = iova_align(iovad, org_size); + phys = swiotlb_tbl_map_single(dev, phys, org_size, + aligned_size, dir, attrs); + + if (phys == DMA_MAPPING_ERROR) + return DMA_MAPPING_ERROR; + + /* Cleanup the padding area. 
*/ + padding_start = phys_to_virt(phys); + padding_size = aligned_size; + + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (dir == DMA_TO_DEVICE || + dir == DMA_BIDIRECTIONAL)) { + padding_start += org_size; + padding_size -= org_size; + } + + memset(padding_start, 0, padding_size); + } + + iova = __iommu_dma_map(dev, phys, aligned_size, prot, dma_mask); + if ((iova == DMA_MAPPING_ERROR) && is_swiotlb_buffer(phys)) + swiotlb_tbl_unmap_single(dev, phys, org_size, + aligned_size, dir, attrs); + + return iova; +} + static void __iommu_dma_free_pages(struct page **pages, int count) { while (count--) @@ -674,11 +749,15 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev, { phys_addr_t phys; - if (dev_is_dma_coherent(dev)) + if (dev_is_dma_coherent(dev) && !dev_is_untrusted(dev)) return; phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); - arch_sync_dma_for_cpu(phys, size, dir); + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_for_cpu(phys, size, dir); + + if (is_swiotlb_buffer(phys)) + swiotlb_tbl_sync_single(dev, phys, size, dir, SYNC_FOR_CPU); } static void iommu_dma_sync_single_for_device(struct device *dev, @@ -686,11 +765,15 @@ static void iommu_dma_sync_single_for_device(struct device *dev, { phys_addr_t phys; - if (dev_is_dma_coherent(dev)) + if (dev_is_dma_coherent(dev) && !dev_is_untrusted(dev)) return; phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); - arch_sync_dma_for_device(phys, size, dir); + if (is_swiotlb_buffer(phys)) + swiotlb_tbl_sync_single(dev, phys, size, dir, SYNC_FOR_DEVICE); + + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_for_device(phys, size, dir); } static void iommu_dma_sync_sg_for_cpu(struct device *dev, @@ -700,11 +783,17 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg; int i; - if (dev_is_dma_coherent(dev)) + if (dev_is_dma_coherent(dev) && !dev_is_untrusted(dev)) return; - for_each_sg(sgl, sg, nelems, i) - arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); + for_each_sg(sgl, sg, nelems, i) { + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); + + if (is_swiotlb_buffer(sg_phys(sg))) + swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length, + dir, SYNC_FOR_CPU); + } } static void iommu_dma_sync_sg_for_device(struct device *dev, @@ -714,11 +803,17 @@ static void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg; int i; - if (dev_is_dma_coherent(dev)) + if (dev_is_dma_coherent(dev) && !dev_is_untrusted(dev)) return; - for_each_sg(sgl, sg, nelems, i) - arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); + for_each_sg(sgl, sg, nelems, i) { + if (is_swiotlb_buffer(sg_phys(sg))) + swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length, + dir, SYNC_FOR_DEVICE); + + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); + } } static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, @@ -727,10 +822,10 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, { phys_addr_t phys = page_to_phys(page) + offset; bool coherent = dev_is_dma_coherent(dev); - int prot = dma_info_to_prot(dir, coherent, attrs); dma_addr_t dma_handle; - dma_handle = __iommu_dma_map(dev, phys, size, prot, dma_get_mask(dev)); + dma_handle = __iommu_dma_map_swiotlb(dev, phys, size, dma_get_mask(dev), + coherent, dir, attrs); if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && dma_handle != DMA_MAPPING_ERROR) arch_sync_dma_for_device(phys, size, dir); @@ -742,7 +837,7 @@ static void 
iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, { if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) iommu_dma_sync_single_for_cpu(dev, dma_handle, size, dir); - __iommu_dma_unmap(dev, dma_handle, size); + __iommu_dma_unmap_swiotlb(dev, dma_handle, size, dir, attrs); } /* @@ -820,6 +915,39 @@ static void __invalidate_sg(struct scatterlist *sg, int nents) } } +static void iommu_dma_unmap_sg_swiotlb(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + struct scatterlist *s; + int i; + + for_each_sg(sg, s, nents, i) + __iommu_dma_unmap_swiotlb(dev, sg_dma_address(s), + sg_dma_len(s), dir, attrs); +} + +static int iommu_dma_map_sg_swiotlb(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + struct scatterlist *s; + int i; + + for_each_sg(sg, s, nents, i) { + sg_dma_address(s) = __iommu_dma_map_swiotlb(dev, sg_phys(s), + s->length, dma_get_mask(dev), + dev_is_dma_coherent(dev), dir, attrs); + if (sg_dma_address(s) == DMA_MAPPING_ERROR) + goto out_unmap; + sg_dma_len(s) = s->length; + } + + return nents; + +out_unmap: + iommu_dma_unmap_sg_swiotlb(dev, sg, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); + return 0; +} + /* * The DMA API client is passing in a scatterlist which could describe * any old buffer layout, but the IOMMU API requires everything to be @@ -847,6 +975,9 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) iommu_dma_sync_sg_for_device(dev, sg, nents, dir); + if (dev_is_untrusted(dev)) + return iommu_dma_map_sg_swiotlb(dev, sg, nents, dir, attrs); + /* * Work out how much IOVA space we need, and align the segments to * IOVA granules for the IOMMU driver to handle. With some clever @@ -916,6 +1047,11 @@ static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) iommu_dma_sync_sg_for_cpu(dev, sg, nents, dir); + if (dev_is_untrusted(dev)) { + iommu_dma_unmap_sg_swiotlb(dev, sg, nents, dir, attrs); + return; + } + /* * The scatterlist segments are mapped into a single * contiguous IOVA allocation, so this is incredibly easy. -- Gitee From ac86ed7daa8228b6d204b111b229f60fbd387352 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 25 May 2022 15:28:49 +0800 Subject: [PATCH 1336/2161] iommu: Add gfp parameter to iommu_ops::map(part2) commit opencloudos. 
Append changes in drivers/iommu/dma-iommu.c and drivers/iommu/virtio-iommu.c Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 6 +++--- drivers/iommu/virtio-iommu.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 0a93093abb7e..f4843aa4e322 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -521,7 +521,7 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, if (!iova) return DMA_MAPPING_ERROR; - if (iommu_map(domain, iova, phys - iova_off, size, prot)) { + if (iommu_map_atomic(domain, iova, phys - iova_off, size, prot)) { iommu_dma_free_iova(cookie, iova, size); return DMA_MAPPING_ERROR; } @@ -705,7 +705,7 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size, arch_dma_prep_coherent(sg_page(sg), sg->length); } - if (iommu_map_sg(domain, iova, sgt.sgl, sgt.orig_nents, ioprot) + if (iommu_map_sg_atomic(domain, iova, sgt.sgl, sgt.orig_nents, ioprot) < size) goto out_free_sg; @@ -1025,7 +1025,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, * We'll leave any physical concatenation to the IOMMU driver's * implementation - it knows better than we do. */ - if (iommu_map_sg(domain, iova, sg, nents, prot) < iova_len) + if (iommu_map_sg_atomic(domain, iova, sg, nents, prot) < iova_len) goto out_free_iova; return __finalise_sg(dev, sg, nents, iova); diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 670231300d17..07600b05f2b6 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -707,7 +707,7 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) } static int viommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot) + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { int ret; u32 flags; -- Gitee From 2e99def2dcfd04c7de31c0dc7b8bf94db29818d4 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 29 Jul 2021 14:15:22 -0600 Subject: [PATCH 1337/2161] iommu/dma: return error code from iommu_dma_map_sg() commit dabb16f67215918c48cf3ff1fc4bc36ca421da2b upstream. Return appropriate error codes EINVAL or ENOMEM from iommup_dma_map_sg(). If lower level code returns ENOMEM, then we return it, other errors are coalesced into EINVAL. iommu_dma_map_sg_swiotlb() returns -EIO as its an unknown error from a call that returns DMA_MAPPING_ERROR. 
Signed-off-by: Logan Gunthorpe Cc: Joerg Roedel Cc: Will Deacon Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index f4843aa4e322..446de21d3ecd 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -945,7 +945,7 @@ static int iommu_dma_map_sg_swiotlb(struct device *dev, struct scatterlist *sg, out_unmap: iommu_dma_unmap_sg_swiotlb(dev, sg, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return 0; + return -EIO; } /* @@ -966,11 +966,13 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, dma_addr_t iova; size_t iova_len = 0; unsigned long mask = dma_get_seg_boundary(dev); + ssize_t ret; int i; - if (static_branch_unlikely(&iommu_deferred_attach_enabled) && - iommu_deferred_attach(dev, domain)) - return 0; + if (static_branch_unlikely(&iommu_deferred_attach_enabled)) { + ret = iommu_deferred_attach(dev, domain); + goto out; + } if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) iommu_dma_sync_sg_for_device(dev, sg, nents, dir); @@ -1018,14 +1020,17 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, } iova = iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev); - if (!iova) + if (!iova) { + ret = -ENOMEM; goto out_restore_sg; + } /* * We'll leave any physical concatenation to the IOMMU driver's * implementation - it knows better than we do. */ - if (iommu_map_sg_atomic(domain, iova, sg, nents, prot) < iova_len) + ret = iommu_map_sg_atomic(domain, iova, sg, nents, prot); + if (ret < iova_len) goto out_free_iova; return __finalise_sg(dev, sg, nents, iova); @@ -1034,7 +1039,10 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, iommu_dma_free_iova(cookie, iova, iova_len); out_restore_sg: __invalidate_sg(sg, nents); - return 0; +out: + if (ret != -ENOMEM) + return -EINVAL; + return ret; } static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, -- Gitee From 87c08c27d5dd410e159c2db84c6a4cc61e6e77bb Mon Sep 17 00:00:00 2001 From: Martin Oliveira Date: Thu, 29 Jul 2021 14:15:36 -0600 Subject: [PATCH 1338/2161] x86/amd_gart: return error code from gart_map_sg() commit fcacc8a614391d3bf009319509c846fef47d9352 upstream. The .map_sg() op now expects an error code instead of zero on failure. So make __dma_map_cont() return a valid errno (which is then propagated to gart_map_sg() via dma_map_cont()) and return it in case of failure. Also, return -EINVAL in case of invalid nents. Signed-off-by: Martin Oliveira Signed-off-by: Logan Gunthorpe Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Niklas Schnelle Cc: Thomas Bogendoerfer Cc: Michael Ellerman Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/amd_gart_64.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 631af911dea6..8555d9a62924 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -332,7 +332,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, int i; if (iommu_start == -1) - return -1; + return -ENOMEM; for_each_sg(start, s, nelems, i) { unsigned long pages, addr; @@ -381,13 +381,13 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *s, *ps, *start_sg, *sgmap; - int need = 0, nextneed, i, out, start; + int need = 0, nextneed, i, out, start, ret; unsigned long pages = 0; unsigned int seg_size; unsigned int max_seg_size; if (nents == 0) - return 0; + return -EINVAL; out = 0; start = 0; @@ -415,8 +415,9 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, if (!iommu_merge || !nextneed || !need || s->offset || (s->length + seg_size > max_seg_size) || (ps->offset + ps->length) % PAGE_SIZE) { - if (dma_map_cont(dev, start_sg, i - start, - sgmap, pages, need) < 0) + ret = dma_map_cont(dev, start_sg, i - start, + sgmap, pages, need); + if (ret < 0) goto error; out++; @@ -433,7 +434,8 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE); ps = s; } - if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) + ret = dma_map_cont(dev, start_sg, i - start, sgmap, pages, need); + if (ret < 0) goto error; out++; flush_gart(); @@ -459,7 +461,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, iommu_full(dev, pages << PAGE_SHIFT, dir); for_each_sg(sg, s, nents, i) s->dma_address = DMA_MAPPING_ERROR; - return 0; + return ret; } /* allocate and map a coherent mapping */ -- Gitee From b3fd675c6e3da0c41e2eb03578dee8f23a77c2c2 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 29 Jul 2021 14:15:37 -0600 Subject: [PATCH 1339/2161] x86/amd_gart: don't set failed sg dma_address to DMA_MAPPING_ERROR commit 183dc86335e69f516b94f169afd784cf97cb8421 upstream. Setting the ->dma_address to DMA_MAPPING_ERROR is not part of the ->map_sg calling convention, so remove it. Link: https://lore.kernel.org/linux-mips/20210716063241.GC13345@lst.de/ Suggested-by: Christoph Hellwig Signed-off-by: Logan Gunthorpe Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Niklas Schnelle Cc: Thomas Bogendoerfer Cc: Michael Ellerman Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/amd_gart_64.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 8555d9a62924..5181d87fd122 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -459,8 +459,6 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, panic("dma_map_sg: overflow on %lu pages\n", pages); iommu_full(dev, pages << PAGE_SHIFT, dir); - for_each_sg(sg, s, nents, i) - s->dma_address = DMA_MAPPING_ERROR; return ret; } -- Gitee From 964e90befed333b36d7ee1fe526c79b7c0c93908 Mon Sep 17 00:00:00 2001 From: Martin Oliveira Date: Thu, 29 Jul 2021 14:15:38 -0600 Subject: [PATCH 1340/2161] dma-mapping: return error code from dma_dummy_map_sg() commit 6506932b3268001ff15c081aceb47c7f9aa9fb2d upstream. The .map_sg() op now expects an error code instead of zero on failure. The only errno to return is -EINVAL in the case when DMA is not supported. Signed-off-by: Martin Oliveira Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/dummy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c index 05607642c888..01eb048892b3 100644 --- a/kernel/dma/dummy.c +++ b/kernel/dma/dummy.c @@ -22,7 +22,7 @@ static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir, unsigned long attrs) { - return 0; + return -EINVAL; } static int dma_dummy_supported(struct device *hwdev, u64 mask) -- Gitee From 8b9afc9464592d23d856ff321f3fc72be37deab9 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 9 Aug 2021 17:13:31 +0200 Subject: [PATCH 1341/2161] dma-mapping: disallow .map_sg operations from returning zero on error commit d03c54419274f96434e2ad74e59e67ec6d54ca86 upstream. Now that all the .map_sg operations have been converted to returning proper error codes, drop the code to handle a zero return value, add a warning if a zero is returned. Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/mapping.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 5b73e55a6bae..752e5378f964 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -184,7 +184,7 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, if (ents > 0) debug_dma_map_sg(dev, sg, nents, ents, dir); else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && - ents != -EIO && ents != 0)) + ents != -EIO)) return -EIO; return ents; @@ -248,8 +248,6 @@ int dma_map_sgtable(struct device *dev, struct sg_table *sgt, int nents; nents = __dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); - if (nents == 0) - return -EIO; if (nents < 0) return nents; sgt->nents = nents; -- Gitee From 1ae87b854c64c8f43eae20979995e120456a5e7f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 17:19:36 +0200 Subject: [PATCH 1342/2161] dma-mapping: return an unsigned int from dma_map_sg{,_attrs} commit 2a047e0662aee1bd773e0415accd785ad26a9398 upstream. These can only return 0 for failure or the number of entries, so turn the return value into an unsigned int. 
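A minimal caller-side sketch of the unchanged convention (hypothetical driver code, not part of this diff):

  #include <linux/dma-mapping.h>
  #include <linux/scatterlist.h>

  static int example_submit(struct device *dev, struct scatterlist *sgl,
                            int nents)
  {
      unsigned int mapped;

      mapped = dma_map_sg(dev, sgl, nents, DMA_TO_DEVICE);
      if (!mapped)
          return -ENOMEM;

      /* ... program the hardware with the 'mapped' segments ... */

      /* unmap with the original sgl and nents, not with 'mapped' */
      dma_unmap_sg(dev, sgl, nents, DMA_TO_DEVICE);
      return 0;
  }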
Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/dma-mapping.h | 9 +++++---- kernel/dma/mapping.c | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 7d1f46821f6e..590349b6feb9 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -229,8 +229,8 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, unsigned long attrs); void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); -int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, unsigned long attrs); +unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); @@ -283,8 +283,9 @@ static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { } -static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, unsigned long attrs) +static inline unsigned int dma_map_sg_attrs(struct device *dev, + struct scatterlist *sg, int nents, enum dma_data_direction dir, + unsigned long attrs) { return 0; } diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 752e5378f964..d8a00b97eae5 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -206,7 +206,7 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, * dma_unmap_sg_attrs() should be used to unmap the buffer with the * original sg and original nents (not the value returned by this funciton). */ -int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, +unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { int ret; -- Gitee From 7d1bf54f49f3fdff877afe7622053838e6c42a02 Mon Sep 17 00:00:00 2001 From: Thomas Tai Date: Thu, 17 Sep 2020 18:43:03 +0200 Subject: [PATCH 1343/2161] dma-direct: Fix potential NULL pointer dereference commit f959dcd6ddfd29235030e8026471ac1b022ad2b0 upstream. When booting the kernel v5.9-rc4 on a VM, the kernel would panic when printing a warning message in swiotlb_map(). The dev->dma_mask must not be a NULL pointer when calling the dma mapping layer. A NULL pointer check can potentially avoid the panic. 
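For context, the mask is normally set up during probe, so the new warning should only fire for drivers that skipped that step. A hedged example of the usual sequence (hypothetical code, not part of this diff):

  #include <linux/dma-mapping.h>

  static int example_probe(struct device *dev)
  {
      int ret;

      ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
      if (ret)
          ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
      if (ret)
          return ret;    /* no usable DMA addressing */

      /* dma_map_page()/dma_map_sg() are safe to call from here on */
      return 0;
  }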
Signed-off-by: Thomas Tai Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/dma-direct.h | 3 --- kernel/dma/mapping.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 77b264d20ba2..9f24c1789177 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -60,9 +60,6 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size, { dma_addr_t end = addr + size - 1; - if (!dev->dma_mask) - return false; - if (is_ram && !IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) && min(addr, end) < phys_to_dma(dev, PFN_PHYS(min_low_pfn))) return false; diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index d8a00b97eae5..c77fc5b8e26c 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -145,6 +145,10 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); + + if (WARN_ON_ONCE(!dev->dma_mask)) + return DMA_MAPPING_ERROR; + if (dma_map_direct(dev, ops)) addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else @@ -176,6 +180,10 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int ents; BUG_ON(!valid_dma_direction(dir)); + + if (WARN_ON_ONCE(!dev->dma_mask)) + return 0; + if (dma_map_direct(dev, ops)) ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); else @@ -278,6 +286,9 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, BUG_ON(!valid_dma_direction(dir)); + if (WARN_ON_ONCE(!dev->dma_mask)) + return DMA_MAPPING_ERROR; + /* Don't allow RAM to be mapped */ if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) return DMA_MAPPING_ERROR; -- Gitee From bb0cabc8385fcfb29b8e087c894a544321a05f06 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 26 Sep 2020 16:39:36 +0200 Subject: [PATCH 1344/2161] dma-direct check for highmem pages in dma_direct_alloc_pages commit 08a89c28304ae74e4c7422f784359e41a37e3e7c upstream. Check for highmem pages from CMA, just like in the dma_direct_alloc path. Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/direct.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index a4365f4ced00..0c6ef78c32d4 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -305,6 +305,17 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, page = __dma_direct_alloc_pages(dev, size, gfp); if (!page) return NULL; + if (PageHighMem(page)) { + /* + * Depending on the cma= arguments and per-arch setup + * dma_alloc_contiguous could return highmem pages. + * Without remapping there is no way to return them here, + * so log an error and fail. + */ + dev_info(dev, "Rejecting highmem page from CMA.\n"); + goto out_free_pages; + } + ret = page_address(page); if (force_dma_unencrypted(dev)) { if (set_memory_decrypted((unsigned long)ret, -- Gitee From 5407cdf1b8396fcad357340661ec1eeceae2b890 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 Oct 2020 11:06:09 +0200 Subject: [PATCH 1345/2161] dma-direct: factor out a dma_direct_alloc_from_pool helper commit 5b138c534fda57c2ebc1e6de72578aa1d70788a6 upstream. This ensures dma_direct_alloc_pages will use the right gfp mask, as well as keeping the code for that common between the two allocators. 
Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/direct.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 0c6ef78c32d4..3609af362eaa 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -143,6 +143,22 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, return page; } +static void *dma_direct_alloc_from_pool(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + struct page *page; + u64 phys_mask; + void *ret; + + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); + page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok); + if (!page) + return NULL; + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); + return ret; +} + void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { @@ -159,17 +175,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, if (attrs & DMA_ATTR_NO_WARN) gfp |= __GFP_NOWARN; - if (dma_should_alloc_from_pool(dev, gfp, attrs)) { - u64 phys_mask; - - gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_mask); - page = dma_alloc_from_pool(dev, size, &ret, gfp, - dma_coherent_ok); - if (!page) - return NULL; - goto done; - } + if (dma_should_alloc_from_pool(dev, gfp, attrs)) + return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); /* we always manually zero the memory once we are done */ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); @@ -294,13 +301,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; - if (dma_should_alloc_from_pool(dev, gfp, 0)) { - page = dma_alloc_from_pool(dev, size, &ret, gfp, - dma_coherent_ok); - if (!page) - return NULL; - goto done; - } + if (dma_should_alloc_from_pool(dev, gfp, 0)) + return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); page = __dma_direct_alloc_pages(dev, size, gfp); if (!page) @@ -323,7 +325,6 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, goto out_free_pages; } memset(ret, 0, size); -done: *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return page; out_free_pages: -- Gitee From 347da6b5ca16b7e4d377671d66a5637798c290b8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 Oct 2020 11:04:08 +0200 Subject: [PATCH 1346/2161] dma-direct: simplify the DMA_ATTR_NO_KERNEL_MAPPING handling commit 849facea92fa68d9292f9b06d7c4ee9e7a06b8dc upstream. Use and entirely separate code path for the DMA_ATTR_NO_KERNEL_MAPPING path. This avoids any confusion about the ret type, and avoids lots of attr checks and helpers that can be significantly simplified now. It also ensures that common handling is applied to architetures still using the arch alloc/free hooks. 
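[Illustrative note, not part of the patch] A sketch of the caller-visible contract for DMA_ATTR_NO_KERNEL_MAPPING: the returned "CPU address" is an opaque cookie (for dma-direct, the struct page pointer described above) and must never be dereferenced, only handed back to dma_free_attrs(). The struct and function names are hypothetical.

#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <linux/gfp.h>

/* Illustrative sketch: a buffer that only the device ever touches. */
struct my_dev_buf {
        void *cookie;           /* opaque, never dereferenced by the CPU */
        dma_addr_t dma;         /* what the device is programmed with */
        size_t size;
};

static int my_buf_alloc(struct device *dev, struct my_dev_buf *buf)
{
        buf->cookie = dma_alloc_attrs(dev, buf->size, &buf->dma, GFP_KERNEL,
                                      DMA_ATTR_NO_KERNEL_MAPPING);
        return buf->cookie ? 0 : -ENOMEM;
}

static void my_buf_free(struct device *dev, struct my_dev_buf *buf)
{
        dma_free_attrs(dev, buf->size, buf->cookie, buf->dma,
                       DMA_ATTR_NO_KERNEL_MAPPING);
}
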
Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/dma-noncoherent.h | 13 ----- kernel/dma/direct.c | 100 +++++++++++++------------------- 2 files changed, 39 insertions(+), 74 deletions(-) diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h index 431cbba6cf35..73252358824d 100644 --- a/include/linux/dma-noncoherent.h +++ b/include/linux/dma-noncoherent.h @@ -21,19 +21,6 @@ static inline bool dev_is_dma_coherent(struct device *dev) } #endif /* CONFIG_ARCH_HAS_DMA_COHERENCE_H */ -/* - * Check if an allocation needs to be marked uncached to be coherent. - */ -static __always_inline bool dma_alloc_need_uncached(struct device *dev, - unsigned long attrs) -{ - if (dev_is_dma_coherent(dev)) - return false; - if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) - return false; - return true; -} - void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 3609af362eaa..822e618067ce 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -71,39 +71,6 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } -/* - * Decrypting memory is allowed to block, so if this device requires - * unencrypted memory it must come from atomic pools. - */ -static inline bool dma_should_alloc_from_pool(struct device *dev, gfp_t gfp, - unsigned long attrs) -{ - if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) - return false; - if (gfpflags_allow_blocking(gfp)) - return false; - if (force_dma_unencrypted(dev)) - return true; - if (!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) - return false; - if (dma_alloc_need_uncached(dev, attrs)) - return true; - return false; -} - -static inline bool dma_should_free_from_pool(struct device *dev, - unsigned long attrs) -{ - if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) - return true; - if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && - !force_dma_unencrypted(dev)) - return false; - if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) - return true; - return false; -} - static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp) { @@ -166,35 +133,45 @@ void *dma_direct_alloc(struct device *dev, size_t size, void *ret; int err; - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs)) - return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); - size = PAGE_ALIGN(size); if (attrs & DMA_ATTR_NO_WARN) gfp |= __GFP_NOWARN; - if (dma_should_alloc_from_pool(dev, gfp, attrs)) - return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); - - /* we always manually zero the memory once we are done */ - page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); - if (!page) - return NULL; - if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + if (!page) + return NULL; /* remove any dirty cache lines on the kernel alias */ if (!PageHighMem(page)) arch_dma_prep_coherent(page, size); + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); /* return the page pointer as the opaque cookie */ - ret = page; - goto done; + return page; } + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !dev_is_dma_coherent(dev)) + return arch_dma_alloc(dev, size, 
dma_handle, gfp, attrs); + + /* + * Remapping or decrypting memory may block. If either is required and + * we can't block, allocate the memory from the atomic pools. + */ + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && + !gfpflags_allow_blocking(gfp) && + (force_dma_unencrypted(dev) || + (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev)))) + return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); + + /* we always manually zero the memory once we are done */ + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + if (!page) + return NULL; + if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs)) || + !dev_is_dma_coherent(dev)) || (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) { /* remove any dirty cache lines on the kernel alias */ arch_dma_prep_coherent(page, size); @@ -237,7 +214,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, memset(ret, 0, size); if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - dma_alloc_need_uncached(dev, attrs)) { + !dev_is_dma_coherent(dev)) { arch_dma_prep_coherent(page, size); ret = arch_dma_set_uncached(ret, size); if (IS_ERR(ret)) @@ -265,25 +242,25 @@ void dma_direct_free(struct device *dev, size_t size, { unsigned int page_order = get_order(size); + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && + !force_dma_unencrypted(dev)) { + /* cpu_addr is a struct page cookie, not a kernel address */ + dma_free_contiguous(dev, cpu_addr, size); + return; + } + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs)) { + !dev_is_dma_coherent(dev)) { arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); return; } /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ - if (dma_should_free_from_pool(dev, attrs) && + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) return; - if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && - !force_dma_unencrypted(dev)) { - /* cpu_addr is a struct page cookie, not a kernel address */ - dma_free_contiguous(dev, cpu_addr, size); - return; - } - if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); @@ -301,7 +278,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; - if (dma_should_alloc_from_pool(dev, gfp, 0)) + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && + force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); page = __dma_direct_alloc_pages(dev, size, gfp); @@ -340,7 +318,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *vaddr = page_address(page); /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ - if (dma_should_free_from_pool(dev, 0) && + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && dma_free_from_pool(dev, vaddr, size)) return; -- Gitee From e31f967f572e136982f205e43b1901ebfbc6971a Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 27 Oct 2021 11:47:57 -0600 Subject: [PATCH 1347/2161] iommu/dma: Fix incorrect error return on iommu deferred attach commit ac315f96b3bd6f6b8f18f387816c7c2cc6b32e02 upstream. scsi_dma_map() was reporting a failure during boot on an AMD machine with the IOMMU enabled. scsi_dma_map failed: request for 36 bytes! The issue was tracked down to a mistake in logic: should not return an error if iommu_deferred_attach() returns zero. 
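[Illustrative note, not part of the patch] The mistake is a common error-handling bug class: jumping to the exit label unconditionally, so success is treated like failure. A minimal sketch with hypothetical prepare()/work() stubs:

/* Illustrative sketch: 0 means success for both stubs. */
static int prepare(void) { return 0; }
static int work(void)    { return 0; }

static int op_buggy(void)
{
        int ret = prepare();

        goto out;       /* wrong: skips work() even when ret == 0 */
out:
        return ret;
}

static int op_fixed(void)
{
        int ret = prepare();

        if (ret)        /* only take the exit path on a real error */
                goto out;
        ret = work();
out:
        return ret;
}
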
Reported-by: Marshall Midden Fixes: dabb16f67215 ("iommu/dma: return error code from iommu_dma_map_sg()") Link: https://lore.kernel.org/all/CAD2CkAWjS8=kKwEEN4cgVNjyFORUibzEiCUA-X+SMtbo0JoMmA@mail.gmail.com Signed-off-by: Logan Gunthorpe Cc: Joerg Roedel Cc: Will Deacon Link: https://lore.kernel.org/r/20211027174757.119755-1-logang@deltatee.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 446de21d3ecd..dc55c0166c5a 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -971,7 +971,8 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, if (static_branch_unlikely(&iommu_deferred_attach_enabled)) { ret = iommu_deferred_attach(dev, domain); - goto out; + if (ret) + goto out; } if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) -- Gitee From b7dd068bd1d9c4a5ed2d79dd306fc56782dfecc2 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Thu, 14 Oct 2021 13:38:31 +0800 Subject: [PATCH 1348/2161] iommu/vt-d: Do not falsely log intel_iommu is unsupported kernel option commit 5240aed2cd2594fb392239f11b9681e5e1591619 upstream. Handling of intel_iommu kernel command line option should return "true" to indicate option is valid and so avoid logging it as unknown by the core parsing code. Also log unknown sub-options at the notice level to let user know of potential typos or similar. Reported-by: Eero Tamminen Signed-off-by: Tvrtko Ursulin Link: https://lore.kernel.org/r/20210831112947.310080-1-tvrtko.ursulin@linux.intel.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20211014053839.727419-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index a402b806f46d..5139aa0acda0 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -390,6 +390,7 @@ static int __init intel_iommu_setup(char *str) { if (!str) return -EINVAL; + while (*str) { if (!strncmp(str, "on", 2)) { dmar_disabled = 0; @@ -419,13 +420,16 @@ static int __init intel_iommu_setup(char *str) } else if (!strncmp(str, "tboot_noforce", 13)) { pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); intel_iommu_tboot_noforce = 1; + } else { + pr_notice("Unknown option - '%s'\n", str); } str += strcspn(str, ","); while (*str == ',') str++; } - return 0; + + return 1; } __setup("intel_iommu=", intel_iommu_setup); -- Gitee From f77dcaac92b10876639edbf9716d38010e351909 Mon Sep 17 00:00:00 2001 From: Kyung Min Park Date: Thu, 14 Oct 2021 13:38:32 +0800 Subject: [PATCH 1349/2161] iommu/vt-d: Dump DMAR translation structure when DMA fault occurs commit 914ff7719e8adaf87131dd7cd0a3520afcf89516 upstream. When the dmar translation fault happens, the kernel prints a single line fault reason with corresponding hexadecimal code defined in the Intel VT-d specification. Currently, when user wants to debug the translation fault in detail, debugfs is used for dumping the dmar_translation_struct, which is not available when the kernel failed to boot. Dump the DMAR translation structure, pagewalk the IO page table and print the page table entry when the fault happens. This takes effect only when CONFIG_DMAR_DEBUG is enabled. 
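[Illustrative note, not part of the patch] Debug-only helpers like this dump are usually exposed through the stub pattern sketched below, mirroring the CONFIG_DMAR_DEBUG stubs the patch adds to <linux/dmar.h>: a real prototype under the config option, an empty static inline otherwise, so call sites need no #ifdefs. All names here are hypothetical.

/* Illustrative sketch of a header for a config-gated debug helper. */
struct my_iommu;        /* stand-in for the real hardware descriptor */

#ifdef CONFIG_MY_IOMMU_DEBUG
void my_fault_dump(struct my_iommu *iommu, unsigned long long addr);
#else
static inline void my_fault_dump(struct my_iommu *iommu,
                                 unsigned long long addr) {}
#endif
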
Signed-off-by: Kyung Min Park Link: https://lore.kernel.org/r/20210815203845.31287-1-kyung.min.park@intel.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20211014053839.727419-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/Kconfig | 4 ++ drivers/iommu/intel/dmar.c | 10 +++- drivers/iommu/intel/iommu.c | 113 ++++++++++++++++++++++++++++++++++++ include/linux/dmar.h | 8 +++ 4 files changed, 133 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 85271bf23f1d..6436039b3512 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -6,6 +6,9 @@ config DMAR_TABLE config DMAR_PERF bool +config DMAR_DEBUG + bool + config INTEL_IOMMU bool "Support for Intel IOMMU using DMA Remapping Devices" depends on PCI_MSI && ACPI && (X86 || IA64) @@ -30,6 +33,7 @@ config INTEL_IOMMU_DEBUGFS bool "Export Intel IOMMU internals in Debugfs" depends on IOMMU_DEBUGFS select DMAR_PERF + select DMAR_DEBUG help !!!WARNING!!! diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index f42bf569fabe..1f20539cb228 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1941,12 +1941,16 @@ static int dmar_fault_do_one(struct intel_iommu *iommu, int type, reason = dmar_get_fault_reason(fault_reason, &fault_type); - if (fault_type == INTR_REMAP) + if (fault_type == INTR_REMAP) { pr_err("[INTR-REMAP] Request device [%02x:%02x.%d] fault index 0x%llx [fault reason 0x%02x] %s\n", source_id >> 8, PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr >> 48, fault_reason, reason); - else if (pasid == INVALID_IOASID) + + return 0; + } + + if (pasid == INVALID_IOASID) pr_err("[%s NO_PASID] Request device [%02x:%02x.%d] fault addr 0x%llx [fault reason 0x%02x] %s\n", type ? 
"DMA Read" : "DMA Write", source_id >> 8, PCI_SLOT(source_id & 0xFF), @@ -1959,6 +1963,8 @@ static int dmar_fault_do_one(struct intel_iommu *iommu, int type, PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason); + dmar_fault_dump_ptes(iommu, source_id, addr, pasid); + return 0; } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 5139aa0acda0..23c1bb44ca09 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -157,6 +157,8 @@ static struct intel_iommu **g_iommus; static void __init check_tylersburg_isoch(void); static int rwbf_quirk; +static inline struct device_domain_info * +dmar_search_domain_by_dev_info(int segment, int bus, int devfn); /* * set to 1 to panic kernel if can't successfully enable VT-d @@ -974,6 +976,117 @@ static void free_context_table(struct intel_iommu *iommu) spin_unlock_irqrestore(&iommu->lock, flags); } +#ifdef CONFIG_DMAR_DEBUG +static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn) +{ + struct device_domain_info *info; + struct dma_pte *parent, *pte; + struct dmar_domain *domain; + int offset, level; + + info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn); + if (!info || !info->domain) { + pr_info("device [%02x:%02x.%d] not probed\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + return; + } + + domain = info->domain; + level = agaw_to_level(domain->agaw); + parent = domain->pgd; + if (!parent) { + pr_info("no page table setup\n"); + return; + } + + while (1) { + offset = pfn_level_offset(pfn, level); + pte = &parent[offset]; + if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { + pr_info("PTE not present at level %d\n", level); + break; + } + + pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); + + if (level == 1) + break; + + parent = phys_to_virt(dma_pte_addr(pte)); + level--; + } +} + +void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, + unsigned long long addr, u32 pasid) +{ + struct pasid_dir_entry *dir, *pde; + struct pasid_entry *entries, *pte; + struct context_entry *ctx_entry; + struct root_entry *rt_entry; + u8 devfn = source_id & 0xff; + u8 bus = source_id >> 8; + int i, dir_index, index; + + pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); + + /* root entry dump */ + rt_entry = &iommu->root_entry[bus]; + if (!rt_entry) { + pr_info("root table entry is not present\n"); + return; + } + + if (sm_supported(iommu)) + pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", + rt_entry->hi, rt_entry->lo); + else + pr_info("root entry: 0x%016llx", rt_entry->lo); + + /* context entry dump */ + ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); + if (!ctx_entry) { + pr_info("context table entry is not present\n"); + return; + } + + pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", + ctx_entry->hi, ctx_entry->lo); + + /* legacy mode does not require PASID entries */ + if (!sm_supported(iommu)) + goto pgtable_walk; + + /* get the pointer to pasid directory entry */ + dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); + if (!dir) { + pr_info("pasid directory entry is not present\n"); + return; + } + /* For request-without-pasid, get the pasid from context entry */ + if (intel_iommu_sm && pasid == INVALID_IOASID) + pasid = PASID_RID2PASID; + + dir_index = pasid >> PASID_PDE_SHIFT; + pde = &dir[dir_index]; + pr_info("pasid dir entry: 0x%016llx\n", pde->val); + + /* get the pointer to the pasid table entry */ + entries = get_pasid_table_from_pde(pde); + if (!entries) { + 
pr_info("pasid table entry is not present\n"); + return; + } + index = pasid & PASID_PTE_MASK; + pte = &entries[index]; + for (i = 0; i < ARRAY_SIZE(pte->val); i++) + pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); + +pgtable_walk: + pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn); +} +#endif + static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, unsigned long pfn, int *target_level) { diff --git a/include/linux/dmar.h b/include/linux/dmar.h index e04436a7ff27..45e903d84733 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -131,6 +131,14 @@ static inline int dmar_res_noop(struct acpi_dmar_header *hdr, void *arg) return 0; } +#ifdef CONFIG_DMAR_DEBUG +void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, + unsigned long long addr, u32 pasid); +#else +static inline void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, + unsigned long long addr, u32 pasid) {} +#endif + #ifdef CONFIG_INTEL_IOMMU extern int iommu_detected, no_iommu; extern int intel_iommu_init(void); -- Gitee From 7caf70687651085d6a85e276c09b642fa6c2ca3d Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 25 May 2022 20:35:41 +0800 Subject: [PATCH 1350/2161] iommu: Indicate queued flushes via gather data commit 7a7c5badf85806eab75e31ab8d45021f1545b0e3 upstream. Since iommu_iotlb_gather exists to help drivers optimise flushing for a given unmap request, it is also the logical place to indicate whether the unmap is strict or not, and thus help them further optimise for whether to expect a sync or a flush_all subsequently. As part of that, it also seems fair to make the flush queue code take responsibility for enforcing the really subtle ordering requirement it brings, so that we don't need to worry about forgetting that if new drivers want to add flush queue support, and can consolidate the existing versions. While we're adding to the kerneldoc, also fill in some info for @freelist which was overlooked previously. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/bf5f8e2ad84e48c712ccbf80fa8c610594c7595f.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 1 + drivers/iommu/iova.c | 7 +++++++ include/linux/iommu.h | 8 +++++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index dc55c0166c5a..3a3ed162f3b6 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -468,6 +468,7 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, dma_addr -= iova_off; size = iova_align(iovad, size + iova_off); iommu_iotlb_gather_init(&iotlb_gather); + iotlb_gather.queued = cookie->fq_domain; unmapped = iommu_unmap_fast(domain, dma_addr, size, &iotlb_gather); WARN_ON(unmapped != size); diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index de4e31e05eab..9768130b4e07 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -569,6 +569,13 @@ void queue_iova(struct iova_domain *iovad, unsigned long flags; unsigned idx; + /* + * Order against the IOMMU driver's pagetable update from unmapping + * @pte, to guarantee that iova_domain_flush() observes that if called + * from a different CPU before we release the lock below. 
+ */ + smp_wmb(); + spin_lock_irqsave(&fq->lock, flags); /* diff --git a/include/linux/iommu.h b/include/linux/iommu.h index dcd19a19b62e..04bfbbe21b16 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -191,16 +191,22 @@ enum iommu_dev_features { * @start: IOVA representing the start of the range to be flushed * @end: IOVA representing the end of the range to be flushed (exclusive) * @pgsize: The interval at which to perform the flush + * @freelist: Removed pages to free after sync + * @queued: Indicates that the flush will be queued * * This structure is intended to be updated by multiple calls to the * ->unmap() function in struct iommu_ops before eventually being passed - * into ->iotlb_sync(). + * into ->iotlb_sync(). Drivers can add pages to @freelist to be freed after + * ->iotlb_sync() or ->iotlb_flush_all() have cleared all cached references to + * them. @queued is set to indicate when ->iotlb_flush_all() will be called + * later instead of ->iotlb_sync(), so drivers may optimise accordingly. */ struct iommu_iotlb_gather { unsigned long start; unsigned long end; size_t pgsize; struct page *freelist; + bool queued; }; /** -- Gitee From 498598f7573a13b774203e6bbf8ae3696633f1e6 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:30 +0100 Subject: [PATCH 1351/2161] iommu: Introduce explicit type for non-strict DMA domains commit bf3aed4660c6e3c44c69f07d8927ee5a22a952ac upstream. Promote the difference between strict and non-strict DMA domains from an internal detail to a distinct domain feature and type, to pave the road for exposing it through the sysfs default domain interface. Reviewed-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/08cd2afaf6b63c58ad49acec3517c9b32c2bb946.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 2 +- drivers/iommu/iommu.c | 8 ++++++-- include/linux/iommu.h | 11 +++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 3a3ed162f3b6..878ad51e6364 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1298,7 +1298,7 @@ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size) * The IOMMU core code allocates the default DMA domain, which the * underlying IOMMU driver needs to support via the dma-iommu layer. 
*/ - if (domain->type == IOMMU_DOMAIN_DMA) { + if (iommu_is_dma_domain(domain)) { if (iommu_dma_init_domain(domain, dma_base, size, dev)) goto out_err; dev->dma_ops = &iommu_dma_ops; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 98ee77889328..25d873527ba0 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -123,6 +123,7 @@ static const char *iommu_domain_type_str(unsigned int t) case IOMMU_DOMAIN_UNMANAGED: return "Unmanaged"; case IOMMU_DOMAIN_DMA: + case IOMMU_DOMAIN_DMA_FQ: return "Translated"; default: return "Unknown"; @@ -520,6 +521,9 @@ static ssize_t iommu_group_show_type(struct iommu_group *group, case IOMMU_DOMAIN_DMA: type = "DMA\n"; break; + case IOMMU_DOMAIN_DMA_FQ: + type = "DMA-FQ\n"; + break; } } strcpy(buf, type); @@ -731,7 +735,7 @@ static int iommu_create_device_direct_mappings(struct iommu_group *group, unsigned long pg_size; int ret = 0; - if (!domain || domain->type != IOMMU_DOMAIN_DMA) + if (!domain || !iommu_is_dma_domain(domain)) return 0; BUG_ON(!domain->pgsize_bitmap); @@ -1919,7 +1923,7 @@ static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, domain->pgsize_bitmap = bus->iommu_ops->pgsize_bitmap; /* Temporarily avoid -EEXIST while drivers still get their own cookies */ - if (type == IOMMU_DOMAIN_DMA && !domain->iova_cookie && iommu_get_dma_cookie(domain)) { + if (iommu_is_dma_domain(domain) && !domain->iova_cookie && iommu_get_dma_cookie(domain)) { iommu_domain_free(domain); domain = NULL; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 04bfbbe21b16..6a655c0ff8e5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -67,6 +67,7 @@ struct iommu_domain_geometry { #define __IOMMU_DOMAIN_DMA_API (1U << 1) /* Domain for use in DMA-API implementation */ #define __IOMMU_DOMAIN_PT (1U << 2) /* Domain is identity mapped */ +#define __IOMMU_DOMAIN_DMA_FQ (1U << 3) /* DMA-API uses flush queue */ /* * This are the possible domain-types @@ -79,12 +80,17 @@ struct iommu_domain_geometry { * IOMMU_DOMAIN_DMA - Internally used for DMA-API implementations. * This flag allows IOMMU drivers to implement * certain optimizations for these domains + * IOMMU_DOMAIN_DMA_FQ - As above, but definitely using batched TLB + * invalidation. */ #define IOMMU_DOMAIN_BLOCKED (0U) #define IOMMU_DOMAIN_IDENTITY (__IOMMU_DOMAIN_PT) #define IOMMU_DOMAIN_UNMANAGED (__IOMMU_DOMAIN_PAGING) #define IOMMU_DOMAIN_DMA (__IOMMU_DOMAIN_PAGING | \ __IOMMU_DOMAIN_DMA_API) +#define IOMMU_DOMAIN_DMA_FQ (__IOMMU_DOMAIN_PAGING | \ + __IOMMU_DOMAIN_DMA_API | \ + __IOMMU_DOMAIN_DMA_FQ) struct iommu_domain { unsigned type; @@ -96,6 +102,11 @@ struct iommu_domain { struct iommu_dma_cookie *iova_cookie; }; +static inline bool iommu_is_dma_domain(struct iommu_domain *domain) +{ + return domain->type & __IOMMU_DOMAIN_DMA_API; +} + enum iommu_cap { IOMMU_CAP_CACHE_COHERENCY, /* IOMMU can enforce cache coherent DMA transactions */ -- Gitee From db7be5867c29550fd91f941e96613b570d775be6 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Fri, 18 Jun 2021 17:20:59 +0200 Subject: [PATCH 1352/2161] iommu/dma: Pass address limit rather than size to iommu_setup_dma_ops() commit ac6d704679d343e55615551f19e9b2e18d68518b upstream. Passing a 64-bit address width to iommu_setup_dma_ops() is valid on virtual platforms, but isn't currently possible. The overflow check in iommu_dma_init_domain() prevents this even when @dma_base isn't 0. Pass a limit address instead of a size, so callers don't have to fake a size to work around the check. 
The base and limit parameters are being phased out, because: * they are redundant for x86 callers. dma-iommu already reserves the first page, and the upper limit is already in domain->geometry. * they can now be obtained from dev->dma_range_map on Arm. But removing them on Arm isn't completely straightforward so is left for future work. As an intermediate step, simplify the x86 callers by passing dummy limits. Signed-off-by: Jean-Philippe Brucker Reviewed-by: Eric Auger Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/20210618152059.1194210-5-jean-philippe@linaro.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 2 +- drivers/iommu/dma-iommu.c | 12 ++++++------ drivers/iommu/intel/iommu.c | 5 +---- include/linux/dma-iommu.h | 4 ++-- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index f67b24486288..64007b76c277 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2205,7 +2205,7 @@ static void amd_iommu_probe_finalize(struct device *dev) /* Domains are initialized for this device - have a look what we ended up with */ domain = iommu_get_domain_for_dev(dev); if (domain->type == IOMMU_DOMAIN_DMA) - iommu_setup_dma_ops(dev, IOVA_START_PFN << PAGE_SHIFT, 0); + iommu_setup_dma_ops(dev, 0, U64_MAX); } static void amd_iommu_release_device(struct device *dev) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 878ad51e6364..17c7a027a8f5 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -305,16 +305,16 @@ static void iommu_dma_flush_iotlb_all(struct iova_domain *iovad) * iommu_dma_init_domain - Initialise a DMA mapping domain * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() * @base: IOVA at which the mappable address space starts - * @size: Size of IOVA space + * @limit: Last address of the IOVA space * @dev: Device the domain is being initialised for * - * @base and @size should be exact multiples of IOMMU page granularity to + * @base and @limit + 1 should be exact multiples of IOMMU page granularity to * avoid rounding surprises. If necessary, we reserve the page at address 0 * to ensure it is an invalid IOVA. It is safe to reinitialise a domain, but * any change which could make prior IOVAs invalid will fail. */ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, - u64 size, struct device *dev) + dma_addr_t limit, struct device *dev) { struct iommu_dma_cookie *cookie = domain->iova_cookie; unsigned long order, base_pfn; @@ -333,7 +333,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, /* Check the domain allows at least some access to the device... */ if (domain->geometry.force_aperture) { if (base > domain->geometry.aperture_end || - base + size <= domain->geometry.aperture_start) { + limit < domain->geometry.aperture_start) { pr_warn("specified DMA range outside IOMMU capability\n"); return -EFAULT; } @@ -1287,7 +1287,7 @@ static const struct dma_map_ops iommu_dma_ops = { * The IOMMU core code allocates the default DMA domain, which the underlying * IOMMU driver needs to support via the dma-iommu layer. 
*/ -void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size) +void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); @@ -1299,7 +1299,7 @@ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size) * underlying IOMMU driver needs to support via the dma-iommu layer. */ if (iommu_is_dma_domain(domain)) { - if (iommu_dma_init_domain(domain, dma_base, size, dev)) + if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev)) goto out_err; dev->dma_ops = &iommu_dma_ops; } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 23c1bb44ca09..f56e1b2b8a9b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5294,13 +5294,10 @@ static void intel_iommu_release_device(struct device *dev) static void intel_iommu_probe_finalize(struct device *dev) { - dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT; struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct dmar_domain *dmar_domain = to_dmar_domain(domain); if (domain && domain->type == IOMMU_DOMAIN_DMA) - iommu_setup_dma_ops(dev, base, - __DOMAIN_MAX_ADDR(dmar_domain->gaw) - base); + iommu_setup_dma_ops(dev, 0, U64_MAX); else set_dma_ops(dev, NULL); } diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 927e22639c34..411a7b7cb600 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -19,7 +19,7 @@ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base); void iommu_put_dma_cookie(struct iommu_domain *domain); /* Setup call for arch DMA mapping code */ -void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size); +void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit); /* The DMA API isn't _quite_ the whole story, though... */ /* @@ -47,7 +47,7 @@ struct msi_msg; struct device; static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base, - u64 size) + u64 dma_limit) { } -- Gitee From e53e3ab316f76abc1396bb3d509de6b6f1d6b80c Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 10 Jun 2021 10:01:14 +0800 Subject: [PATCH 1353/2161] iommu/vt-d: Define counter explicitly as unsigned int commit 9739ba327c01e26f672661ea751132c29a54d3d9 upstream. Avoid below checkpatch warning. WARNING: Prefer 'unsigned int' to bare use of 'unsigned' + unsigned iommu_refcnt[DMAR_UNITS_SUPPORTED]; Fixes: 29a27719abaa ("iommu/vt-d: Replace iommu_bmp with a refcount") Signed-off-by: Parav Pandit Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210530075053.264218-1-parav@nvidia.com Link: https://lore.kernel.org/r/20210610020115.1637656-23-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/intel-iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index eb85c4ce1616..f2bdaad1b784 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -537,7 +537,7 @@ struct context_entry { struct dmar_domain { int nid; /* node id */ - unsigned iommu_refcnt[DMAR_UNITS_SUPPORTED]; + unsigned int iommu_refcnt[DMAR_UNITS_SUPPORTED]; /* Refcount of devices per iommu */ -- Gitee From 7d17663eab071cb2349e0a689320526b84352833 Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Mon, 10 May 2021 19:53:02 +0800 Subject: [PATCH 1354/2161] iommu/iova: Put free_iova_mem() outside of spinlock iova_rbtree_lock commit 7978724f399ae7eba5b6d36ae5a7224d5bf3859a upstream. 
It is not necessary to put free_iova_mem() inside of spinlock/unlock iova_rbtree_lock which only leads to more completion for the spinlock. It has a small promote on the performance after the change. And also rename private_free_iova() as remove_iova() because the function will not free iova after that change. Signed-off-by: Xiang Chen Reviewed-by: John Garry Acked-by: Robin Murphy Link: https://lore.kernel.org/r/1620647582-194621-1-git-send-email-chenxiang66@hisilicon.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iova.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 9768130b4e07..f2a43d989fca 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -346,12 +346,11 @@ private_find_iova(struct iova_domain *iovad, unsigned long pfn) return NULL; } -static void private_free_iova(struct iova_domain *iovad, struct iova *iova) +static void remove_iova(struct iova_domain *iovad, struct iova *iova) { assert_spin_locked(&iovad->iova_rbtree_lock); __cached_rbnode_delete_update(iovad, iova); rb_erase(&iova->node, &iovad->rbroot); - free_iova_mem(iova); } /** @@ -386,8 +385,9 @@ __free_iova(struct iova_domain *iovad, struct iova *iova) unsigned long flags; spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); - private_free_iova(iovad, iova); + remove_iova(iovad, iova); spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); + free_iova_mem(iova); } EXPORT_SYMBOL_GPL(__free_iova); @@ -406,10 +406,13 @@ free_iova(struct iova_domain *iovad, unsigned long pfn) spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); iova = private_find_iova(iovad, pfn); - if (iova) - private_free_iova(iovad, iova); + if (!iova) { + spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); + return; + } + remove_iova(iovad, iova); spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); - + free_iova_mem(iova); } EXPORT_SYMBOL_GPL(free_iova); @@ -805,7 +808,8 @@ iova_magazine_free_pfns(struct iova_magazine *mag, struct iova_domain *iovad) if (WARN_ON(!iova)) continue; - private_free_iova(iovad, iova); + remove_iova(iovad, iova); + free_iova_mem(iova); } spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); -- Gitee From b4845a81cfe3d738d74d093aa2a37c5bc340fd34 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 26 May 2022 11:50:37 +0800 Subject: [PATCH 1355/2161] iommu: Remove unused of_get_dma_window() commit bb6bfd79d9bc69f0808a4156ec3ca9fb78694039 upstream. of_get_dma_window() was added in 2012 and removed in 2014 in commit 891846516317 ("memory: Add NVIDIA Tegra memory controller support"). Remove it and simplify the header to use forward declarations for structs rather than includes. Cc: Joerg Roedel Cc: Will Deacon Cc: Frank Rowand Cc: iommu@lists.linux-foundation.org Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20210527193710.1281746-1-robh@kernel.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/of_iommu.c | 68 ---------------------------------------- include/linux/of_iommu.h | 17 ++-------- 2 files changed, 3 insertions(+), 82 deletions(-) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 614a93aa5305..e2b9d3e5675f 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -16,74 +16,6 @@ #define NO_IOMMU 1 -/** - * of_get_dma_window - Parse *dma-window property and returns 0 if found. 
- * - * @dn: device node - * @prefix: prefix for property name if any - * @index: index to start to parse - * @busno: Returns busno if supported. Otherwise pass NULL - * @addr: Returns address that DMA starts - * @size: Returns the range that DMA can handle - * - * This supports different formats flexibly. "prefix" can be - * configured if any. "busno" and "index" are optionally - * specified. Set 0(or NULL) if not used. - */ -int of_get_dma_window(struct device_node *dn, const char *prefix, int index, - unsigned long *busno, dma_addr_t *addr, size_t *size) -{ - const __be32 *dma_window, *end; - int bytes, cur_index = 0; - char propname[NAME_MAX], addrname[NAME_MAX], sizename[NAME_MAX]; - - if (!dn || !addr || !size) - return -EINVAL; - - if (!prefix) - prefix = ""; - - snprintf(propname, sizeof(propname), "%sdma-window", prefix); - snprintf(addrname, sizeof(addrname), "%s#dma-address-cells", prefix); - snprintf(sizename, sizeof(sizename), "%s#dma-size-cells", prefix); - - dma_window = of_get_property(dn, propname, &bytes); - if (!dma_window) - return -ENODEV; - end = dma_window + bytes / sizeof(*dma_window); - - while (dma_window < end) { - u32 cells; - const void *prop; - - /* busno is one cell if supported */ - if (busno) - *busno = be32_to_cpup(dma_window++); - - prop = of_get_property(dn, addrname, NULL); - if (!prop) - prop = of_get_property(dn, "#address-cells", NULL); - - cells = prop ? be32_to_cpup(prop) : of_n_addr_cells(dn); - if (!cells) - return -EINVAL; - *addr = of_read_number(dma_window, cells); - dma_window += cells; - - prop = of_get_property(dn, sizename, NULL); - cells = prop ? be32_to_cpup(prop) : of_n_size_cells(dn); - if (!cells) - return -EINVAL; - *size = of_read_number(dma_window, cells); - dma_window += cells; - - if (cur_index++ == index) - break; - } - return 0; -} -EXPORT_SYMBOL_GPL(of_get_dma_window); - static int of_iommu_xlate(struct device *dev, struct of_phandle_args *iommu_spec) { diff --git a/include/linux/of_iommu.h b/include/linux/of_iommu.h index f3d40dd7bb66..fc7fd92927dd 100644 --- a/include/linux/of_iommu.h +++ b/include/linux/of_iommu.h @@ -2,28 +2,17 @@ #ifndef __OF_IOMMU_H #define __OF_IOMMU_H -#include -#include -#include +struct device; +struct device_node; +struct iommu_ops; #ifdef CONFIG_OF_IOMMU -extern int of_get_dma_window(struct device_node *dn, const char *prefix, - int index, unsigned long *busno, dma_addr_t *addr, - size_t *size); - extern const struct iommu_ops *of_iommu_configure(struct device *dev, struct device_node *master_np); #else -static inline int of_get_dma_window(struct device_node *dn, const char *prefix, - int index, unsigned long *busno, dma_addr_t *addr, - size_t *size) -{ - return -EINVAL; -} - static inline const struct iommu_ops *of_iommu_configure(struct device *dev, struct device_node *master_np) { -- Gitee From d40ee04b326bdd93ac970a83e8420aacc351346d Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 26 May 2022 15:16:35 +0800 Subject: [PATCH 1356/2161] iommu/vt-d: Prepare for multiple DMA domain types commit 78ca078459d779319a22a8e2a18fc491da8a229b upstream. In preparation for the strict vs. non-strict decision for DMA domains to be expressed in the domain type, make sure we expose our flush queue awareness by accepting the new domain type, and test the specific feature flag where we want to identify DMA domains in general. The DMA ops reset/setup can simply be made unconditional, since iommu-dma already knows only to touch DMA domains. 
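[Illustrative note, not part of the patch] The key move is testing a shared feature bit rather than an exact domain type, so the new flush-queue type is accepted everywhere automatically. A self-contained sketch with hypothetical FLAG_/TYPE_ names:

/* Illustrative sketch: match by capability bit, not by exact value. */
#define FLAG_PAGING     (1U << 0)
#define FLAG_DMA_API    (1U << 1)
#define FLAG_FQ         (1U << 3)

#define TYPE_DMA        (FLAG_PAGING | FLAG_DMA_API)
#define TYPE_DMA_FQ     (FLAG_PAGING | FLAG_DMA_API | FLAG_FQ)

static int is_dma_api_type(unsigned int type)
{
        /*
         * An exact compare (type == TYPE_DMA) would reject TYPE_DMA_FQ;
         * testing the shared bit accepts both, which is what
         * iommu_is_dma_domain() does for the real domain types.
         */
        return (type & FLAG_DMA_API) != 0;
}
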
Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/31a8ef868d593a2f3826a6a120edee81815375a7.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index f56e1b2b8a9b..f8c31d5bd35b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -561,7 +561,7 @@ struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) int iommu_id; /* si_domain and vm domain should not get here. */ - if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA)) + if (WARN_ON(!iommu_is_dma_domain(&domain->domain))) return NULL; for_each_domain_iommu(iommu_id, domain) @@ -1124,7 +1124,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; if (domain_use_first_level(domain)) { pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US; - if (domain->domain.type == IOMMU_DOMAIN_DMA) + if (iommu_is_dma_domain(&domain->domain)) pteval |= DMA_FL_PTE_ACCESS; } if (cmpxchg64(&pte->val, 0ULL, pteval)) @@ -4657,7 +4657,7 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) switch (type) { case IOMMU_DOMAIN_DMA: - /* fallthrough */ + case IOMMU_DOMAIN_DMA_FQ: case IOMMU_DOMAIN_UNMANAGED: dmar_domain = alloc_domain(0); if (!dmar_domain) { @@ -5294,12 +5294,8 @@ static void intel_iommu_release_device(struct device *dev) static void intel_iommu_probe_finalize(struct device *dev) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - - if (domain && domain->type == IOMMU_DOMAIN_DMA) - iommu_setup_dma_ops(dev, 0, U64_MAX); - else - set_dma_ops(dev, NULL); + set_dma_ops(dev, NULL); + iommu_setup_dma_ops(dev, 0, U64_MAX); } static void intel_iommu_get_resv_regions(struct device *device, -- Gitee From e5c48d88de5a1de63005c584c91cbe62065892cf Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 26 May 2022 16:03:49 +0800 Subject: [PATCH 1357/2161] iommu: Handle freelists when using deferred flushing in iommu drivers(part2) commit 2a2b8eaa5b25668a6f717f94b55f4e3aaf87629d upstream. Append changes for drivers/iommu/dma-iommu.c. 
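[Illustrative note, not part of the patch] The freelist plumbing implements a deferred free: pages unlinked from the IO page table are chained up and released only after the IOTLB flush, so a stale cached translation can never reach memory that has already been reused. The sketch below uses hypothetical types and callbacks; the real code chains struct page via page->freelist and frees them in the flush-queue entry destructor.

/* Illustrative sketch of the deferred-free ordering. */
struct deferred_page {
        struct deferred_page *next;     /* link lives inside the page itself */
};

static void free_after_flush(struct deferred_page *list,
                             void (*flush_iotlb)(void),
                             void (*free_page_mem)(void *page))
{
        flush_iotlb();                  /* invalidate cached translations first */

        while (list) {
                struct deferred_page *p = list;

                list = list->next;
                free_page_mem(p);       /* now safe: the node is the page */
        }
}
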
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 17c7a027a8f5..2b2d47c88f99 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -64,6 +64,19 @@ static int __init iommu_dma_forcedac_setup(char *str) return ret; } early_param("iommu.forcedac", iommu_dma_forcedac_setup); + +static void iommu_dma_entry_dtor(unsigned long data) +{ + struct page *freelist = (struct page *)data; + + while (freelist) { + unsigned long p = (unsigned long)page_address(freelist); + + freelist = freelist->freelist; + free_page(p); + } +} + static inline size_t cookie_msi_granule(struct iommu_dma_cookie *cookie) { if (cookie->type == IOMMU_DMA_IOVA_COOKIE) @@ -358,7 +371,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, if (!cookie->fq_domain && !iommu_domain_get_attr(domain, DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, &attr) && attr) { if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all, - NULL)) + iommu_dma_entry_dtor)) pr_warn("iova flush queue initialization failed\n"); else cookie->fq_domain = domain; @@ -440,7 +453,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, } static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie, - dma_addr_t iova, size_t size) + dma_addr_t iova, size_t size, struct page *freelist) { struct iova_domain *iovad = &cookie->iovad; @@ -449,7 +462,8 @@ static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie, cookie->msi_iova -= size; else if (cookie->fq_domain) /* non-strict mode */ queue_iova(iovad, iova_pfn(iovad, iova), - size >> iova_shift(iovad), 0); + size >> iova_shift(iovad), + (unsigned long)freelist); else free_iova_fast(iovad, iova_pfn(iovad, iova), size >> iova_shift(iovad)); @@ -475,7 +489,7 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, if (!cookie->fq_domain) iommu_iotlb_sync(domain, &iotlb_gather); - iommu_dma_free_iova(cookie, dma_addr, size); + iommu_dma_free_iova(cookie, dma_addr, size, iotlb_gather.freelist); } static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr, @@ -523,7 +537,7 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, return DMA_MAPPING_ERROR; if (iommu_map_atomic(domain, iova, phys - iova_off, size, prot)) { - iommu_dma_free_iova(cookie, iova, size); + iommu_dma_free_iova(cookie, iova, size, NULL); return DMA_MAPPING_ERROR; } return iova + iova_off; @@ -724,7 +738,7 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size, out_free_sg: sg_free_table(&sgt); out_free_iova: - iommu_dma_free_iova(cookie, iova, size); + iommu_dma_free_iova(cookie, iova, size, NULL); out_free_pages: __iommu_dma_free_pages(pages, count); return NULL; @@ -1038,7 +1052,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, return __finalise_sg(dev, sg, nents, iova); out_free_iova: - iommu_dma_free_iova(cookie, iova, iova_len); + iommu_dma_free_iova(cookie, iova, iova_len, NULL); out_restore_sg: __invalidate_sg(sg, nents); out: @@ -1342,7 +1356,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, return msi_page; out_free_iova: - iommu_dma_free_iova(cookie, iova, size); + iommu_dma_free_iova(cookie, iova, size, NULL); out_free_page: kfree(msi_page); return NULL; -- Gitee From a886801ea6def99bc643c95f4b888f433c6630bd Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 25 Feb 
2021 14:14:54 +0800 Subject: [PATCH 1358/2161] iommu: Don't use lazy flush for untrusted device commit 82c3cefb9f1652e7470f442ff96c613e8c8ed8f4 upstream. The lazy IOTLB flushing setup leaves a time window, in which the device can still access some system memory, which has already been unmapped by the device driver. It's not suitable for untrusted devices. A malicious device might use this to attack the system by obtaining data that it shouldn't obtain. Fixes: c588072bba6b5 ("iommu/vt-d: Convert intel iommu driver to the iommu ops") Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210225061454.2864009-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 2b2d47c88f99..be28d559a2c4 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -314,6 +314,11 @@ static void iommu_dma_flush_iotlb_all(struct iova_domain *iovad) domain->ops->flush_iotlb_all(domain); } +static bool dev_is_untrusted(struct device *dev) +{ + return dev_is_pci(dev) && to_pci_dev(dev)->untrusted; +} + /** * iommu_dma_init_domain - Initialise a DMA mapping domain * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() @@ -368,8 +373,9 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, init_iova_domain(iovad, 1UL << order, base_pfn); - if (!cookie->fq_domain && !iommu_domain_get_attr(domain, - DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, &attr) && attr) { + if (!cookie->fq_domain && (!dev || !dev_is_untrusted(dev)) && + !iommu_domain_get_attr(domain, DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, &attr) && + attr) { if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all, iommu_dma_entry_dtor)) pr_warn("iova flush queue initialization failed\n"); @@ -512,11 +518,6 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr, iova_align(iovad, size), dir, attrs); } -static bool dev_is_untrusted(struct device *dev) -{ - return dev_is_pci(dev) && to_pci_dev(dev)->untrusted; -} - static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, size_t size, int prot, dma_addr_t dma_mask) { -- Gitee From b7ff9f1243893266c22b2f4fcd16c6963f3e19aa Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 5 Mar 2021 16:35:22 +0000 Subject: [PATCH 1359/2161] iommu/iova: Add rbtree entry helper commit 7ae31cec5b70e301788b95de543abb56748dcfb6 upstream. Repeating the rb_entry() boilerplate all over the place gets old fast. Before adding yet more instances, add a little hepler to tidy it up. 
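[Illustrative note, not part of the patch] The helper wraps the standard container_of() idiom: rb_entry() recovers the enclosing object from a pointer to its embedded rb_node, and a one-line typed wrapper keeps that boilerplate out of every caller. struct thing below is hypothetical.

#include <linux/rbtree.h>

/* Illustrative sketch of a typed rb_entry() wrapper. */
struct thing {
        unsigned long key;
        struct rb_node node;    /* embedded tree linkage */
};

static struct thing *to_thing(struct rb_node *node)
{
        return rb_entry(node, struct thing, node);
}
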
Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/03931d86c0ad71f44b29394e3a8d38bfc32349cd.1614962123.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iova.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index f2a43d989fca..e4c38846961c 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -27,6 +27,11 @@ static void fq_destroy_all_entries(struct iova_domain *iovad); static void fq_flush_timeout(struct timer_list *t); static void free_global_cached_iovas(struct iova_domain *iovad); +static struct iova *to_iova(struct rb_node *node) +{ + return rb_entry(node, struct iova, node); +} + void init_iova_domain(struct iova_domain *iovad, unsigned long granule, unsigned long start_pfn) @@ -136,7 +141,7 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free) { struct iova *cached_iova; - cached_iova = rb_entry(iovad->cached32_node, struct iova, node); + cached_iova = to_iova(iovad->cached32_node); if (free == cached_iova || (free->pfn_hi < iovad->dma_32bit_pfn && free->pfn_lo >= cached_iova->pfn_lo)) { @@ -144,7 +149,7 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free) iovad->max32_alloc_size = iovad->dma_32bit_pfn; } - cached_iova = rb_entry(iovad->cached_node, struct iova, node); + cached_iova = to_iova(iovad->cached_node); if (free->pfn_lo >= cached_iova->pfn_lo) iovad->cached_node = rb_next(&free->node); } @@ -159,7 +164,7 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova, new = (start) ? &start : &(root->rb_node); /* Figure out where to put new node */ while (*new) { - struct iova *this = rb_entry(*new, struct iova, node); + struct iova *this = to_iova(*new); parent = *new; @@ -198,7 +203,7 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad, goto iova32_full; curr = __get_cached_rbnode(iovad, limit_pfn); - curr_iova = rb_entry(curr, struct iova, node); + curr_iova = to_iova(curr); retry_pfn = curr_iova->pfn_hi + 1; retry: @@ -207,7 +212,7 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad, new_pfn = (high_pfn - size) & align_mask; prev = curr; curr = rb_prev(curr); - curr_iova = rb_entry(curr, struct iova, node); + curr_iova = to_iova(curr); } while (curr && new_pfn <= curr_iova->pfn_hi && new_pfn >= low_pfn); if (high_pfn < size || new_pfn < low_pfn) { @@ -215,7 +220,7 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad, high_pfn = limit_pfn; low_pfn = retry_pfn; curr = &iovad->anchor.node; - curr_iova = rb_entry(curr, struct iova, node); + curr_iova = to_iova(curr); goto retry; } iovad->max32_alloc_size = size; @@ -333,7 +338,7 @@ private_find_iova(struct iova_domain *iovad, unsigned long pfn) assert_spin_locked(&iovad->iova_rbtree_lock); while (node) { - struct iova *iova = rb_entry(node, struct iova, node); + struct iova *iova = to_iova(node); if (pfn < iova->pfn_lo) node = node->rb_left; @@ -629,7 +634,7 @@ static int __is_range_overlap(struct rb_node *node, unsigned long pfn_lo, unsigned long pfn_hi) { - struct iova *iova = rb_entry(node, struct iova, node); + struct iova *iova = to_iova(node); if ((pfn_lo <= iova->pfn_hi) && (pfn_hi >= iova->pfn_lo)) return 1; @@ -697,7 +702,7 @@ reserve_iova(struct iova_domain *iovad, spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) { if (__is_range_overlap(node, pfn_lo, pfn_hi)) { - iova 
= rb_entry(node, struct iova, node); + iova = to_iova(node); __adjust_overlap_range(iova, &pfn_lo, &pfn_hi); if ((pfn_lo >= iova->pfn_lo) && (pfn_hi <= iova->pfn_hi)) -- Gitee From 3b0b1d5e116025bb07de38cf94d123b3cd8ee129 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 5 Mar 2021 16:35:23 +0000 Subject: [PATCH 1360/2161] iommu/iova: Improve restart logic commit 371d7955e3102fe38daf06de4ed9bfd29864354b upstream. When restarting after searching below the cached node fails, resetting the start point to the anchor node is often overly pessimistic. If allocations are made with mixed limits - particularly in the case of the opportunistic 32-bit allocation for PCI devices - this could mean significant time wasted walking through the whole populated upper range just to reach the initial limit. We can improve on that by implementing a proper tree traversal to find the first node above the relevant limit, and set the exact start point. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/076b3484d1e5057b95d8c387c894bd6ad2514043.1614962123.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iova.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index e4c38846961c..d5a93d4bf16c 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -154,6 +154,43 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free) iovad->cached_node = rb_next(&free->node); } +static struct rb_node *iova_find_limit(struct iova_domain *iovad, unsigned long limit_pfn) +{ + struct rb_node *node, *next; + /* + * Ideally what we'd like to judge here is whether limit_pfn is close + * enough to the highest-allocated IOVA that starting the allocation + * walk from the anchor node will be quicker than this initial work to + * find an exact starting point (especially if that ends up being the + * anchor node anyway). This is an incredibly crude approximation which + * only really helps the most likely case, but is at least trivially easy. + */ + if (limit_pfn > iovad->dma_32bit_pfn) + return &iovad->anchor.node; + + node = iovad->rbroot.rb_node; + while (to_iova(node)->pfn_hi < limit_pfn) + node = node->rb_right; + +search_left: + while (node->rb_left && to_iova(node->rb_left)->pfn_lo >= limit_pfn) + node = node->rb_left; + + if (!node->rb_left) + return node; + + next = node->rb_left; + while (next->rb_right) { + next = next->rb_right; + if (to_iova(next)->pfn_lo >= limit_pfn) { + node = next; + goto search_left; + } + } + + return node; +} + /* Insert the iova into domain rbtree by holding writer lock */ static void iova_insert_rbtree(struct rb_root *root, struct iova *iova, @@ -219,7 +256,7 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad, if (low_pfn == iovad->start_pfn && retry_pfn < limit_pfn) { high_pfn = limit_pfn; low_pfn = retry_pfn; - curr = &iovad->anchor.node; + curr = iova_find_limit(iovad, limit_pfn); curr_iova = to_iova(curr); goto retry; } -- Gitee From af261d251491c6d389bcfddd6ad03be73836d901 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 25 Mar 2021 20:29:58 +0800 Subject: [PATCH 1361/2161] iova: Add CPU hotplug handler to flush rcaches commit f598a497bc7dfbec60270bca8b8408db3d23ac07 upstream. Like the Intel IOMMU driver already does, flush the per-IOVA domain CPU rcache when a CPU goes offline - there's no point in keeping it. 
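[Illustrative note, not part of the patch] This uses the cpuhp "multi instance" pattern: one hotplug state registered once with a dead-CPU callback, then each domain added as an instance so its per-CPU cache is dropped when that CPU goes offline. The patch uses the fixed CPUHP_IOMMU_IOVA_DEAD slot; the sketch below assumes a dynamically allocated slot and all my_* names are hypothetical.

#include <linux/cpuhotplug.h>
#include <linux/list.h>

/* Illustrative sketch of per-instance CPU-dead teardown. */
struct my_domain {
        struct hlist_node cpuhp_dead;   /* instance linkage */
        /* ... per-CPU caches ... */
};

static enum cpuhp_state my_cpuhp_state;

static int my_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
        struct my_domain *dom = hlist_entry_safe(node, struct my_domain,
                                                 cpuhp_dead);
        /* drop dom's cached entries for 'cpu' here */
        (void)dom;
        return 0;
}

static int my_subsys_init(void)
{
        int ret = cpuhp_setup_state_multi(CPUHP_BP_PREPARE_DYN,
                                          "my/domain:dead", NULL, my_cpu_dead);
        if (ret < 0)
                return ret;
        my_cpuhp_state = ret;   /* remember the allocated slot */
        return 0;
}

static void my_domain_init(struct my_domain *dom)
{
        cpuhp_state_add_instance_nocalls(my_cpuhp_state, &dom->cpuhp_dead);
}
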
Reviewed-by: Robin Murphy Signed-off-by: John Garry Link: https://lore.kernel.org/r/1616675401-151997-2-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iova.c | 30 +++++++++++++++++++++++++++++- include/linux/cpuhotplug.h | 1 + include/linux/iova.h | 1 + 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index d5a93d4bf16c..fa4a670258e5 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -25,6 +25,17 @@ static void init_iova_rcaches(struct iova_domain *iovad); static void free_iova_rcaches(struct iova_domain *iovad); static void fq_destroy_all_entries(struct iova_domain *iovad); static void fq_flush_timeout(struct timer_list *t); + +static int iova_cpuhp_dead(unsigned int cpu, struct hlist_node *node) +{ + struct iova_domain *iovad; + + iovad = hlist_entry_safe(node, struct iova_domain, cpuhp_dead); + + free_cpu_cached_iovas(cpu, iovad); + return 0; +} + static void free_global_cached_iovas(struct iova_domain *iovad); static struct iova *to_iova(struct rb_node *node) @@ -56,6 +67,7 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule, iovad->anchor.pfn_lo = iovad->anchor.pfn_hi = IOVA_ANCHOR; rb_link_node(&iovad->anchor.node, NULL, &iovad->rbroot.rb_node); rb_insert_color(&iovad->anchor.node, &iovad->rbroot); + cpuhp_state_add_instance_nocalls(CPUHP_IOMMU_IOVA_DEAD, &iovad->cpuhp_dead); init_iova_rcaches(iovad); } EXPORT_SYMBOL_GPL(init_iova_domain); @@ -301,10 +313,21 @@ int iova_cache_get(void) { mutex_lock(&iova_cache_mutex); if (!iova_cache_users) { + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_IOMMU_IOVA_DEAD, "iommu/iova:dead", NULL, + iova_cpuhp_dead); + if (ret) { + mutex_unlock(&iova_cache_mutex); + pr_err("Couldn't register cpuhp handler\n"); + return ret; + } + iova_cache = kmem_cache_create( "iommu_iova", sizeof(struct iova), 0, SLAB_HWCACHE_ALIGN, NULL); if (!iova_cache) { + cpuhp_remove_multi_state(CPUHP_IOMMU_IOVA_DEAD); mutex_unlock(&iova_cache_mutex); pr_err("Couldn't create iova cache\n"); return -ENOMEM; @@ -326,8 +349,10 @@ void iova_cache_put(void) return; } iova_cache_users--; - if (!iova_cache_users) + if (!iova_cache_users) { + cpuhp_remove_multi_state(CPUHP_IOMMU_IOVA_DEAD); kmem_cache_destroy(iova_cache); + } mutex_unlock(&iova_cache_mutex); } EXPORT_SYMBOL_GPL(iova_cache_put); @@ -660,6 +685,9 @@ void put_iova_domain(struct iova_domain *iovad) { struct iova *iova, *tmp; + cpuhp_state_remove_instance_nocalls(CPUHP_IOMMU_IOVA_DEAD, + &iovad->cpuhp_dead); + free_iova_flush_queue(iovad); free_iova_rcaches(iovad); rbtree_postorder_for_each_entry_safe(iova, tmp, &iovad->rbroot, node) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index e45b04ba04af..96bf167ec28b 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -56,6 +56,7 @@ enum cpuhp_state { CPUHP_PAGE_ALLOC_DEAD, CPUHP_NET_DEV_DEAD, CPUHP_PCI_XGENE_DEAD, + CPUHP_IOMMU_IOVA_DEAD, CPUHP_LUSTRE_CFS_DEAD, CPUHP_AP_ARM_CACHE_B15_RAC_DEAD, CPUHP_PADATA_DEAD, diff --git a/include/linux/iova.h b/include/linux/iova.h index 5e20ff08a435..05d66e36f0bd 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -95,6 +95,7 @@ struct iova_domain { flush-queues */ atomic_t fq_timer_on; /* 1 when timer is active, 0 when not */ + struct hlist_node cpuhp_dead; }; static inline unsigned long iova_size(struct iova *iova) -- Gitee From 57d60800473affc4a4477449a3742234c87428d2 Mon Sep 17 00:00:00 2001 From: Zhuo 
Chen Date: Thu, 26 May 2022 17:08:35 +0800 Subject: [PATCH 1362/2161] iommu: Add iommu_dma_free_cpu_cached_iovas() commit 230309d08b871e439f8618db3610f2cc9b5f7c72 upstream. Add a iommu_dma_free_cpu_cached_iovas function to allow drivers which use the dma-iommu ops to free cached cpu iovas. Signed-off-by: Tom Murphy Signed-off-by: Lu Baolu Tested-by: Logan Gunthorpe Link: https://lore.kernel.org/r/20201124082057.2614359-3-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 9 +++++++++ include/linux/dma-iommu.h | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index be28d559a2c4..183b14cd319d 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -65,6 +65,15 @@ static int __init iommu_dma_forcedac_setup(char *str) } early_param("iommu.forcedac", iommu_dma_forcedac_setup); +void iommu_dma_free_cpu_cached_iovas(unsigned int cpu, + struct iommu_domain *domain) +{ + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + + free_cpu_cached_iovas(cpu, iovad); +} + static void iommu_dma_entry_dtor(unsigned long data) { struct page *freelist = (struct page *)data; diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 411a7b7cb600..d688639ca5b3 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -37,6 +37,9 @@ void iommu_dma_compose_msi_msg(struct msi_desc *desc, void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list); +void iommu_dma_free_cpu_cached_iovas(unsigned int cpu, + struct iommu_domain *domain); + extern bool iommu_dma_forcedac; #else /* CONFIG_IOMMU_DMA */ @@ -80,5 +83,10 @@ static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_he { } +static inline void iommu_dma_free_cpu_cached_iovas(unsigned int cpu, + struct iommu_domain *domain) +{ +} + #endif /* CONFIG_IOMMU_DMA */ #endif /* __DMA_IOMMU_H */ -- Gitee From 9e0b2496b3bf8b0efc1da8af4a3414eb52d53151 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 25 Mar 2021 20:30:00 +0800 Subject: [PATCH 1363/2161] iommu: Delete iommu_dma_free_cpu_cached_iovas() commit 149448b353e2517ecc6eced7d9f46e9f3e08b89e upstream. Function iommu_dma_free_cpu_cached_iovas() no longer has any caller, so delete it. With that, function free_cpu_cached_iovas() may be made static. 
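With no external callers left, the flush is reached only through the hotplug callback registered above. The body of such a flush follows the usual per-CPU pattern: resolve the dead CPU's slot with per_cpu_ptr() and drain it under that slot's lock. A condensed sketch of that shape (hypothetical foo_* types, not the actual rcache code):

#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo_cpu_slot {
	spinlock_t lock;
	unsigned int nr;
	void *objs[32];
};

struct foo_cache {
	struct foo_cpu_slot __percpu *cpu_slots;
};

/* Drain the slot that belonged to a CPU which has gone offline. */
static void foo_cache_flush_cpu(struct foo_cache *cache, unsigned int cpu)
{
	struct foo_cpu_slot *slot = per_cpu_ptr(cache->cpu_slots, cpu);
	unsigned long flags;

	spin_lock_irqsave(&slot->lock, flags);
	while (slot->nr)
		kfree(slot->objs[--slot->nr]);
	spin_unlock_irqrestore(&slot->lock, flags);
}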
Signed-off-by: John Garry Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/1616675401-151997-4-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 9 --------- drivers/iommu/iova.c | 3 ++- include/linux/dma-iommu.h | 5 ----- include/linux/iova.h | 5 ----- 4 files changed, 2 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 183b14cd319d..be28d559a2c4 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -65,15 +65,6 @@ static int __init iommu_dma_forcedac_setup(char *str) } early_param("iommu.forcedac", iommu_dma_forcedac_setup); -void iommu_dma_free_cpu_cached_iovas(unsigned int cpu, - struct iommu_domain *domain) -{ - struct iommu_dma_cookie *cookie = domain->iova_cookie; - struct iova_domain *iovad = &cookie->iovad; - - free_cpu_cached_iovas(cpu, iovad); -} - static void iommu_dma_entry_dtor(unsigned long data) { struct page *freelist = (struct page *)data; diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index fa4a670258e5..f7e2e694ad43 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -22,6 +22,7 @@ static unsigned long iova_rcache_get(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn); static void init_iova_rcaches(struct iova_domain *iovad); +static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad); static void free_iova_rcaches(struct iova_domain *iovad); static void fq_destroy_all_entries(struct iova_domain *iovad); static void fq_flush_timeout(struct timer_list *t); @@ -1094,7 +1095,7 @@ static void free_iova_rcaches(struct iova_domain *iovad) /* * free all the IOVA ranges cached by a cpu (used when cpu is unplugged) */ -void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad) +static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad) { struct iova_cpu_rcache *cpu_rcache; struct iova_rcache *rcache; diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index d688639ca5b3..758ca4694257 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -83,10 +83,5 @@ static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_he { } -static inline void iommu_dma_free_cpu_cached_iovas(unsigned int cpu, - struct iommu_domain *domain) -{ -} - #endif /* CONFIG_IOMMU_DMA */ #endif /* __DMA_IOMMU_H */ diff --git a/include/linux/iova.h b/include/linux/iova.h index 05d66e36f0bd..aee26846dcf9 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -161,7 +161,6 @@ struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); void put_iova_domain(struct iova_domain *iovad); struct iova *split_and_remove_iova(struct iova_domain *iovad, struct iova *iova, unsigned long pfn_lo, unsigned long pfn_hi); -void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad); #else static inline int iova_cache_get(void) { @@ -255,10 +254,6 @@ static inline struct iova *split_and_remove_iova(struct iova_domain *iovad, return NULL; } -static inline void free_cpu_cached_iovas(unsigned int cpu, - struct iova_domain *iovad) -{ -} #endif #endif -- Gitee From e823aaf421795f51f49c5a54ed2d01f7d40a81ca Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 26 May 2022 17:23:52 +0800 Subject: [PATCH 1364/2161] iommu: Stop exporting free_iova_fast() commit 6e1ea50a065ed1ff8a27cde3e1876ed7dfda97fd upstream. 
Function free_iova_fast() is only referenced by dma-iommu.c, which can only be in-built, so stop exporting it. This was missed in an earlier tidy-up patch. Signed-off-by: John Garry Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/1616675401-151997-5-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iova.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index f7e2e694ad43..8cca16d1361f 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -540,7 +540,6 @@ free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size) free_iova(iovad, pfn); } -EXPORT_SYMBOL_GPL(free_iova_fast); #define fq_ring_for_each(i, fq) \ for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) % IOVA_FQ_SIZE) -- Gitee From 411755fcb801a4e47b4ef6404008df1c90471d48 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 26 May 2022 17:26:16 +0800 Subject: [PATCH 1365/2161] iommu: Fix comment for struct iommu_fwspec commit 0d35309ab5e080095190965aa7cfc3ca8fb88af9 upstream. Commit 986d5ecc5699 ("iommu: Move fwspec->iommu_priv to struct dev_iommu") removed iommu_priv from fwspec and commit 5702ee24182f ("ACPI/IORT: Check ATS capability in root complex nodes") added @flags. Update the struct doc. Acked-by: Jonathan Cameron Acked-by: Will Deacon Signed-off-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20210401154718.307519-2-jean-philippe@linaro.org Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 6a655c0ff8e5..4d6eb60af2e7 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -627,7 +627,7 @@ struct iommu_group *fsl_mc_device_group(struct device *dev); * struct iommu_fwspec - per-device IOMMU instance data * @ops: ops for this device's IOMMU * @iommu_fwnode: firmware handle for this device's IOMMU - * @iommu_priv: IOMMU driver private data for this device + * @flags: IOMMU_FWSPEC_* flags * @num_ids: number of associated device IDs * @ids: IDs which this device may present to the IOMMU */ -- Gitee From 11ef5c166aba4c1a78aa4aee6da89999047efbb5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Apr 2021 17:52:37 +0200 Subject: [PATCH 1366/2161] iommu: remove the unused domain_window_disable method commit 47685cb202d1aff6f70a2bb91e8271392fefea84 upstream. domain_window_disable is wired up by fsl_pamu, but never actually called. 
Signed-off-by: Christoph Hellwig Acked-by: Will Deacon Acked-by: Li Yang Link: https://lore.kernel.org/r/20210401155256.298656-2-hch@lst.de Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/fsl_pamu_domain.c | 48 --------------------------------- include/linux/iommu.h | 2 -- 2 files changed, 50 deletions(-) diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c index 06828e2698d5..be9102454e1c 100644 --- a/drivers/iommu/fsl_pamu_domain.c +++ b/drivers/iommu/fsl_pamu_domain.c @@ -473,53 +473,6 @@ static int update_domain_mapping(struct fsl_dma_domain *dma_domain, u32 wnd_nr) return ret; } -static int disable_domain_win(struct fsl_dma_domain *dma_domain, u32 wnd_nr) -{ - struct device_domain_info *info; - int ret = 0; - - list_for_each_entry(info, &dma_domain->devices, link) { - if (dma_domain->win_cnt == 1 && dma_domain->enabled) { - ret = pamu_disable_liodn(info->liodn); - if (!ret) - dma_domain->enabled = 0; - } else { - ret = pamu_disable_spaace(info->liodn, wnd_nr); - } - } - - return ret; -} - -static void fsl_pamu_window_disable(struct iommu_domain *domain, u32 wnd_nr) -{ - struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain); - unsigned long flags; - int ret; - - spin_lock_irqsave(&dma_domain->domain_lock, flags); - if (!dma_domain->win_arr) { - pr_debug("Number of windows not configured\n"); - spin_unlock_irqrestore(&dma_domain->domain_lock, flags); - return; - } - - if (wnd_nr >= dma_domain->win_cnt) { - pr_debug("Invalid window index\n"); - spin_unlock_irqrestore(&dma_domain->domain_lock, flags); - return; - } - - if (dma_domain->win_arr[wnd_nr].valid) { - ret = disable_domain_win(dma_domain, wnd_nr); - if (!ret) { - dma_domain->win_arr[wnd_nr].valid = 0; - dma_domain->mapped--; - } - } - - spin_unlock_irqrestore(&dma_domain->domain_lock, flags); -} static int fsl_pamu_window_enable(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t paddr, u64 size, int prot) @@ -1044,7 +997,6 @@ static const struct iommu_ops fsl_pamu_ops = { .attach_dev = fsl_pamu_attach_device, .detach_dev = fsl_pamu_detach_device, .domain_window_enable = fsl_pamu_window_enable, - .domain_window_disable = fsl_pamu_window_disable, .iova_to_phys = fsl_pamu_iova_to_phys, .domain_set_attr = fsl_pamu_set_domain_attr, .domain_get_attr = fsl_pamu_get_domain_attr, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 4d6eb60af2e7..0e2bb79bde15 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -246,7 +246,6 @@ struct iommu_iotlb_gather { * @put_resv_regions: Free list of reserved regions for a device * @apply_resv_region: Temporary helper call-back for iova reserved ranges * @domain_window_enable: Configure and enable a particular window for a domain - * @domain_window_disable: Disable a particular window for a domain * @of_xlate: add OF master IDs to iommu grouping * @is_attach_deferred: Check if domain attach should be deferred from iommu * driver init to device driver init (default no) @@ -308,7 +307,6 @@ struct iommu_ops { /* Window handling functions */ int (*domain_window_enable)(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t paddr, u64 size, int prot); - void (*domain_window_disable)(struct iommu_domain *domain, u32 wnd_nr); int (*of_xlate)(struct device *dev, struct of_phandle_args *args); bool (*is_attach_deferred)(struct iommu_domain *domain, struct device *dev); -- Gitee From 9a27361d969b904441d89e783ac4858499b0243f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Apr 2021 17:52:50 
+0200 Subject: [PATCH 1367/2161] iommu: remove DOMAIN_ATTR_PAGING commit 9fb5fad562fa0a41c84691714d99c23f54168a9e upstream. DOMAIN_ATTR_PAGING is never used. Signed-off-by: Christoph Hellwig Acked-by: Will Deacon Acked-by: Li Yang Link: https://lore.kernel.org/r/20210401155256.298656-15-hch@lst.de Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 5 ----- include/linux/iommu.h | 1 - 2 files changed, 6 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 25d873527ba0..2d2351f1499a 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2675,7 +2675,6 @@ int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr attr, void *data) { struct iommu_domain_geometry *geometry; - bool *paging; int ret = 0; switch (attr) { @@ -2683,10 +2682,6 @@ int iommu_domain_get_attr(struct iommu_domain *domain, geometry = data; *geometry = domain->geometry; - break; - case DOMAIN_ATTR_PAGING: - paging = data; - *paging = (domain->pgsize_bitmap != 0UL); break; default: if (!domain->ops->domain_get_attr) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0e2bb79bde15..d97b3d8fcad0 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -129,7 +129,6 @@ enum iommu_cap { enum iommu_attr { DOMAIN_ATTR_GEOMETRY, - DOMAIN_ATTR_PAGING, DOMAIN_ATTR_WINDOWS, DOMAIN_ATTR_FSL_PAMU_STASH, DOMAIN_ATTR_FSL_PAMU_ENABLE, -- Gitee From f2a3c9b06d6d9033837ec13a69cbf23d20cb2172 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Apr 2021 17:52:51 +0200 Subject: [PATCH 1368/2161] iommu: remove DOMAIN_ATTR_GEOMETRY commit bc9a05eef113e75cfa792fdf24dae011bc3d5294 upstream. The geometry information can be trivially queried from the iommu_domain struture. Signed-off-by: Christoph Hellwig Acked-by: Will Deacon Acked-by: Li Yang Link: https://lore.kernel.org/r/20210401155256.298656-16-hch@lst.de Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 20 +++----------------- drivers/vfio/vfio_iommu_type1.c | 26 ++++++++++++-------------- include/linux/iommu.h | 1 - 3 files changed, 15 insertions(+), 32 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 2d2351f1499a..676d5a4fa072 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2674,23 +2674,9 @@ core_initcall(iommu_init); int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr attr, void *data) { - struct iommu_domain_geometry *geometry; - int ret = 0; - - switch (attr) { - case DOMAIN_ATTR_GEOMETRY: - geometry = data; - *geometry = domain->geometry; - - break; - default: - if (!domain->ops->domain_get_attr) - return -EINVAL; - - ret = domain->ops->domain_get_attr(domain, attr, data); - } - - return ret; + if (!domain->ops->domain_get_attr) + return -EINVAL; + return domain->ops->domain_get_attr(domain, attr, data); } EXPORT_SYMBOL_GPL(iommu_domain_get_attr); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 15a780f4d75e..1e81f45337b4 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -1763,7 +1763,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, int ret; bool resv_msi, msi_remap; phys_addr_t resv_msi_base = 0; - struct iommu_domain_geometry geo; + struct iommu_domain_geometry *geo; LIST_HEAD(iova_copy); LIST_HEAD(group_resv_regions); @@ -1840,10 +1840,9 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, goto out_domain; /* Get aperture info */ - 
iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo); - - if (vfio_iommu_aper_conflict(iommu, geo.aperture_start, - geo.aperture_end)) { + geo = &domain->domain->geometry; + if (vfio_iommu_aper_conflict(iommu, geo->aperture_start, + geo->aperture_end)) { ret = -EINVAL; goto out_detach; } @@ -1866,8 +1865,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (ret) goto out_detach; - ret = vfio_iommu_aper_resize(&iova_copy, geo.aperture_start, - geo.aperture_end); + ret = vfio_iommu_aper_resize(&iova_copy, geo->aperture_start, + geo->aperture_end); if (ret) goto out_detach; @@ -1992,7 +1991,6 @@ static void vfio_iommu_aper_expand(struct vfio_iommu *iommu, struct list_head *iova_copy) { struct vfio_domain *domain; - struct iommu_domain_geometry geo; struct vfio_iova *node; dma_addr_t start = 0; dma_addr_t end = (dma_addr_t)~0; @@ -2001,12 +1999,12 @@ static void vfio_iommu_aper_expand(struct vfio_iommu *iommu, return; list_for_each_entry(domain, &iommu->domain_list, next) { - iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, - &geo); - if (geo.aperture_start > start) - start = geo.aperture_start; - if (geo.aperture_end < end) - end = geo.aperture_end; + struct iommu_domain_geometry *geo = &domain->domain->geometry; + + if (geo->aperture_start > start) + start = geo->aperture_start; + if (geo->aperture_end < end) + end = geo->aperture_end; } /* Modify aperture limits. The new aper is either same or bigger */ diff --git a/include/linux/iommu.h b/include/linux/iommu.h index d97b3d8fcad0..fdc9aefc5076 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -128,7 +128,6 @@ enum iommu_cap { */ enum iommu_attr { - DOMAIN_ATTR_GEOMETRY, DOMAIN_ATTR_WINDOWS, DOMAIN_ATTR_FSL_PAMU_STASH, DOMAIN_ATTR_FSL_PAMU_ENABLE, -- Gitee From 2d9d741644bba70bc9630e9b9114cac7b7ec0156 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Apr 2021 17:52:53 +0200 Subject: [PATCH 1369/2161] iommu: remove iommu_set_cmd_line_dma_api and iommu_cmd_line_dma_api commit 3189713a1b84ac02cce3217955ae68d0d67b15b7 upstream. Don't obsfucate the trivial bit flag check. Signed-off-by: Christoph Hellwig Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210401155256.298656-18-hch@lst.de Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 676d5a4fa072..48226d49aeab 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -73,16 +73,6 @@ static const char * const iommu_group_resv_type_string[] = { #define IOMMU_CMD_LINE_DMA_API BIT(0) -static void iommu_set_cmd_line_dma_api(void) -{ - iommu_cmd_line |= IOMMU_CMD_LINE_DMA_API; -} - -static bool iommu_cmd_line_dma_api(void) -{ - return !!(iommu_cmd_line & IOMMU_CMD_LINE_DMA_API); -} - static int iommu_alloc_default_domain(struct iommu_group *group, struct device *dev); static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, @@ -132,9 +122,7 @@ static const char *iommu_domain_type_str(unsigned int t) static int __init iommu_subsys_init(void) { - bool cmd_line = iommu_cmd_line_dma_api(); - - if (!cmd_line) { + if (!(iommu_cmd_line & IOMMU_CMD_LINE_DMA_API)) { if (IS_ENABLED(CONFIG_IOMMU_DEFAULT_PASSTHROUGH)) iommu_set_default_passthrough(false); else @@ -148,7 +136,8 @@ static int __init iommu_subsys_init(void) pr_info("Default domain type: %s %s\n", iommu_domain_type_str(iommu_def_domain_type), - cmd_line ? 
"(set via kernel command line)" : ""); + (iommu_cmd_line & IOMMU_CMD_LINE_DMA_API) ? + "(set via kernel command line)" : ""); return 0; } @@ -2764,16 +2753,14 @@ EXPORT_SYMBOL_GPL(iommu_alloc_resv_region); void iommu_set_default_passthrough(bool cmd_line) { if (cmd_line) - iommu_set_cmd_line_dma_api(); - + iommu_cmd_line |= IOMMU_CMD_LINE_DMA_API; iommu_def_domain_type = IOMMU_DOMAIN_IDENTITY; } void iommu_set_default_translated(bool cmd_line) { if (cmd_line) - iommu_set_cmd_line_dma_api(); - + iommu_cmd_line |= IOMMU_CMD_LINE_DMA_API; iommu_def_domain_type = IOMMU_DOMAIN_DMA; } -- Gitee From c8fab6fe24d4f9213c8ac67a84c06b47d9a43288 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 26 May 2022 20:16:58 +0800 Subject: [PATCH 1370/2161] iommu: remove DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE commit a250c23f15c21c556becd4986f453255e545807c upstream. Instead make the global iommu_dma_strict paramete in iommu.c canonical by exporting helpers to get and set it and use those directly in the drivers. This make sure that the iommu.strict parameter also works for the AMD and Intel IOMMU drivers on x86. As those default to lazy flushing a new IOMMU_CMD_LINE_STRICT is used to turn the value into a tristate to represent the default if not overriden by an explicit parameter. [ported on top of the other iommu_attr changes and added a few small missing bits] Signed-off-by: Robin Murphy . Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210401155256.298656-19-hch@lst.de Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 23 +------------ drivers/iommu/dma-iommu.c | 9 ++---- drivers/iommu/intel/iommu.c | 64 +++++++------------------------------ drivers/iommu/iommu.c | 30 +++++++++++------ include/linux/iommu.h | 4 ++- 5 files changed, 39 insertions(+), 91 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 64007b76c277..686888ae870e 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2234,26 +2234,6 @@ static struct iommu_group *amd_iommu_device_group(struct device *dev) return acpihid_device_group(dev); } -static int amd_iommu_domain_get_attr(struct iommu_domain *domain, - enum iommu_attr attr, void *data) -{ - switch (domain->type) { - case IOMMU_DOMAIN_UNMANAGED: - return -ENODEV; - case IOMMU_DOMAIN_DMA: - switch (attr) { - case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: - *(int *)data = !amd_iommu_unmap_flush; - return 0; - default: - return -ENODEV; - } - break; - default: - return -EINVAL; - } -} - /***************************************************************************** * * The next functions belong to the dma_ops mapping/unmapping code. 
@@ -2312,7 +2292,7 @@ int __init amd_iommu_init_dma_ops(void) pr_info("IO/TLB flush on unmap enabled\n"); else pr_info("Lazy IO/TLB flushing enabled\n"); - + iommu_set_dma_strict(amd_iommu_unmap_flush); return 0; } @@ -2710,7 +2690,6 @@ const struct iommu_ops amd_iommu_ops = { .release_device = amd_iommu_release_device, .probe_finalize = amd_iommu_probe_finalize, .device_group = amd_iommu_device_group, - .domain_get_attr = amd_iommu_domain_get_attr, .get_resv_regions = amd_iommu_get_resv_regions, .put_resv_regions = amd_iommu_put_resv_regions, .is_attach_deferred = amd_iommu_is_attach_deferred, diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index be28d559a2c4..48cf554f77fc 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -307,10 +307,7 @@ static void iommu_dma_flush_iotlb_all(struct iova_domain *iovad) cookie = container_of(iovad, struct iommu_dma_cookie, iovad); domain = cookie->fq_domain; - /* - * The IOMMU driver supporting DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE - * implies that ops->flush_iotlb_all must be non-NULL. - */ + domain->ops->flush_iotlb_all(domain); } @@ -337,7 +334,6 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, struct iommu_dma_cookie *cookie = domain->iova_cookie; unsigned long order, base_pfn; struct iova_domain *iovad; - int attr; if (!cookie || cookie->type != IOMMU_DMA_IOVA_COOKIE) return -EINVAL; @@ -374,8 +370,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, init_iova_domain(iovad, 1UL << order, base_pfn); if (!cookie->fq_domain && (!dev || !dev_is_untrusted(dev)) && - !iommu_domain_get_attr(domain, DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, &attr) && - attr) { + domain->ops->flush_iotlb_all && !iommu_get_dma_strict(domain)) { if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all, iommu_dma_entry_dtor)) pr_warn("iova flush queue initialization failed\n"); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index f8c31d5bd35b..96c810408702 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3489,57 +3489,6 @@ static inline int iommu_domain_cache_init(void) return ret; } -static bool domain_use_flush_queue(void) -{ - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - bool r = true; - - if (intel_iommu_strict) - return false; - - /* - * The flush queue implementation does not perform page-selective - * invalidations that are required for efficient TLB flushes in virtual - * environments. The benefit of batching is likely to be much lower than - * the overhead of synchronizing the virtual and physical IOMMU - * page-tables. 
- */ - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (!cap_caching_mode(iommu->cap)) - continue; - - pr_warn_once("IOMMU batching is disabled due to virtualization"); - r = false; - break; - } - rcu_read_unlock(); - - return r; -} - -static int -intel_iommu_domain_get_attr(struct iommu_domain *domain, - enum iommu_attr attr, void *data) -{ - switch (domain->type) { - case IOMMU_DOMAIN_UNMANAGED: - return -ENODEV; - case IOMMU_DOMAIN_DMA: - switch (attr) { - case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: - *(int *)data = domain_use_flush_queue(); - return 0; - default: - return -ENODEV; - } - break; - default: - return -EINVAL; - } -} - static inline int iommu_devinfo_cache_init(void) { int ret = 0; @@ -4522,6 +4471,17 @@ int __init intel_iommu_init(void) down_read(&dmar_global_lock); for_each_active_iommu(iommu, drhd) { + /* + * The flush queue implementation does not perform + * page-selective invalidations that are required for efficient + * TLB flushes in virtual environments. The benefit of batching + * is likely to be much lower than the overhead of synchronizing + * the virtual and physical IOMMU page-tables. + */ + if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) { + pr_warn("IOMMU batching is disabled due to virtualization"); + intel_iommu_strict = 1; + } iommu_device_sysfs_add(&iommu->iommu, NULL, intel_iommu_groups, "%s", iommu->name); @@ -4530,6 +4490,7 @@ int __init intel_iommu_init(void) } up_read(&dmar_global_lock); + iommu_set_dma_strict(intel_iommu_strict); bus_set_iommu(&pci_bus_type, &intel_iommu_ops); if (si_domain && !hw_pass_through) register_memory_notifier(&intel_iommu_memory_nb); @@ -5669,7 +5630,6 @@ const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .domain_alloc = intel_iommu_domain_alloc, .domain_free = intel_iommu_domain_free, - .domain_get_attr = intel_iommu_domain_get_attr, .enable_nesting = intel_iommu_enable_nesting, .attach_dev = intel_iommu_attach_device, .detach_dev = intel_iommu_detach_device, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 48226d49aeab..3093adf6137b 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -72,6 +72,7 @@ static const char * const iommu_group_resv_type_string[] = { }; #define IOMMU_CMD_LINE_DMA_API BIT(0) +#define IOMMU_CMD_LINE_STRICT BIT(1) static int iommu_alloc_default_domain(struct iommu_group *group, struct device *dev); @@ -320,10 +321,29 @@ early_param("iommu.passthrough", iommu_set_def_domain_type); static int __init iommu_dma_setup(char *str) { - return kstrtobool(str, &iommu_dma_strict); + int ret = kstrtobool(str, &iommu_dma_strict); + + if (!ret) + iommu_cmd_line |= IOMMU_CMD_LINE_STRICT; + return ret; } early_param("iommu.strict", iommu_dma_setup); +void iommu_set_dma_strict(bool strict) +{ + if (strict || !(iommu_cmd_line & IOMMU_CMD_LINE_STRICT)) + iommu_dma_strict = strict; +} + +bool iommu_get_dma_strict(struct iommu_domain *domain) +{ + /* only allow lazy flushing for DMA domains */ + if (domain->type == IOMMU_DOMAIN_DMA) + return iommu_dma_strict; + return true; +} +EXPORT_SYMBOL_GPL(iommu_get_dma_strict); + static ssize_t iommu_group_attr_show(struct kobject *kobj, struct attribute *__attr, char *buf) { @@ -1489,14 +1509,6 @@ static int iommu_group_alloc_default_domain(struct bus_type *bus, group->default_domain = dom; if (!group->domain) group->domain = dom; - - if (!iommu_dma_strict) { - int attr = 1; - iommu_domain_set_attr(dom, - DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, - &attr); - } - return 0; } diff --git a/include/linux/iommu.h 
b/include/linux/iommu.h index fdc9aefc5076..c1424b780dc9 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -132,7 +132,6 @@ enum iommu_attr { DOMAIN_ATTR_FSL_PAMU_STASH, DOMAIN_ATTR_FSL_PAMU_ENABLE, DOMAIN_ATTR_FSL_PAMUV1, - DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, DOMAIN_ATTR_MAX, }; @@ -569,6 +568,9 @@ extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t offset, u64 size, int prot); +void iommu_set_dma_strict(bool val); +bool iommu_get_dma_strict(struct iommu_domain *domain); + extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags); -- Gitee From 9a3d12450495dccde2bd0b151c7ff292535a9714 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 26 May 2022 20:42:44 +0800 Subject: [PATCH 1371/2161] iommu: remove iommu_domain_{get,set}_attr commit 7876a83ffe8c23c7049a63c747a7b96cafaf10a4 upstream. Remove the now unused iommu attr infrastructure. Signed-off-by: Christoph Hellwig Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210401155256.298656-21-hch@lst.de Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 26 -------------------------- include/linux/iommu.h | 22 ---------------------- 2 files changed, 48 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 3093adf6137b..72cafc13701d 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2672,32 +2672,6 @@ static int __init iommu_init(void) } core_initcall(iommu_init); -int iommu_domain_get_attr(struct iommu_domain *domain, - enum iommu_attr attr, void *data) -{ - if (!domain->ops->domain_get_attr) - return -EINVAL; - return domain->ops->domain_get_attr(domain, attr, data); -} -EXPORT_SYMBOL_GPL(iommu_domain_get_attr); - -int iommu_domain_set_attr(struct iommu_domain *domain, - enum iommu_attr attr, void *data) -{ - int ret = 0; - - switch (attr) { - default: - if (domain->ops->domain_set_attr == NULL) - return -EINVAL; - - ret = domain->ops->domain_set_attr(domain, attr, data); - } - - return ret; -} -EXPORT_SYMBOL_GPL(iommu_domain_set_attr); - int iommu_enable_nesting(struct iommu_domain *domain) { if (domain->type != IOMMU_DOMAIN_UNMANAGED) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c1424b780dc9..54f885b6eec1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -236,8 +236,6 @@ struct iommu_iotlb_gather { * @probe_finalize: Do final setup work after the device is added to an IOMMU * group and attached to the groups domain * @device_group: find iommu group for a particular device - * @domain_get_attr: Query domain attributes - * @domain_set_attr: Change domain attributes * @enable_nesting: Enable nesting * @get_resv_regions: Request list of reserved regions for a device * @put_resv_regions: Free list of reserved regions for a device @@ -288,10 +286,6 @@ struct iommu_ops { void (*release_device)(struct device *dev); void (*probe_finalize)(struct device *dev); struct iommu_group *(*device_group)(struct device *dev); - int (*domain_get_attr)(struct iommu_domain *domain, - enum iommu_attr attr, void *data); - int (*domain_set_attr)(struct iommu_domain *domain, - enum iommu_attr attr, void *data); int (*enable_nesting)(struct iommu_domain *domain); /* Request/Free a list of reserved regions for a device */ @@ -557,10 +551,6 @@ extern int iommu_page_response(struct device *dev, extern int iommu_group_id(struct iommu_group *group); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); -extern 
int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr, - void *data); -extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr, - void *data); int iommu_enable_nesting(struct iommu_domain *domain); /* Window handling function prototypes */ @@ -938,18 +928,6 @@ static inline int iommu_group_id(struct iommu_group *group) return -ENODEV; } -static inline int iommu_domain_get_attr(struct iommu_domain *domain, - enum iommu_attr attr, void *data) -{ - return -EINVAL; -} - -static inline int iommu_domain_set_attr(struct iommu_domain *domain, - enum iommu_attr attr, void *data) -{ - return -EINVAL; -} - static inline int iommu_device_register(struct iommu_device *iommu) { return -ENODEV; -- Gitee From c462c922edd8d711dc91562d51472ecf7410dae8 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:27 +0100 Subject: [PATCH 1372/2161] iommu/dma: Remove redundant "!dev" checks commit 8d971243a9a7d73a410450951a21f298f1394dc7 upstream. iommu_dma_init_domain() is now only called from iommu_setup_dma_ops(), which has already assumed dev to be non-NULL. Reviewed-by: John Garry Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/06024523c080364390016550065e3cfe8031367e.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 48cf554f77fc..1b6e94793e11 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -369,7 +369,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, init_iova_domain(iovad, 1UL << order, base_pfn); - if (!cookie->fq_domain && (!dev || !dev_is_untrusted(dev)) && + if (!cookie->fq_domain && !dev_is_untrusted(dev) && domain->ops->flush_iotlb_all && !iommu_get_dma_strict(domain)) { if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all, iommu_dma_entry_dtor)) @@ -378,9 +378,6 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, cookie->fq_domain = domain; } - if (!dev) - return 0; - return iova_reserve_iommu_regions(dev, domain); } -- Gitee From a597efd9c72469b4ce526ee7bfc027b50f254861 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:43 -0700 Subject: [PATCH 1373/2161] iommu: Add an unmap_pages() op for IOMMU drivers commit cacffb7f7b45ba7649eedea4c196c6e9f1863bf3 upstream. Add a callback for IOMMU drivers to provide a path for the IOMMU framework to call into an IOMMU driver, which can call into the io-pgtable code, to unmap a virtually contiguous range of pages of the same size. For IOMMU drivers that do not specify an unmap_pages() callback, the existing logic of unmapping memory one page block at a time will be used. Signed-off-by: Isaac J. 
Manjarres Suggested-by: Will Deacon Signed-off-by: Will Deacon Acked-by: Lu Baolu Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-3-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/iommu.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 54f885b6eec1..e953971b5c0a 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -226,6 +226,7 @@ struct iommu_iotlb_gather { * @detach_dev: detach device from an iommu domain * @map: map a physically contiguous memory region to an iommu domain * @unmap: unmap a physically contiguous memory region from an iommu domain + * @unmap_pages: unmap a number of pages of the same size from an iommu domain * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain * @iotlb_sync_map: Sync mappings created recently using @map to the hardware * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush @@ -276,6 +277,9 @@ struct iommu_ops { phys_addr_t paddr, size_t size, int prot, gfp_t gfp); size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); + size_t (*unmap_pages)(struct iommu_domain *domain, unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *iotlb_gather); void (*flush_iotlb_all)(struct iommu_domain *domain); void (*iotlb_sync_map)(struct iommu_domain *domain, unsigned long iova, size_t size); -- Gitee From d050b9e4259dd534213f328ff9384fb61e90b7cc Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Jun 2021 06:38:47 -0700 Subject: [PATCH 1374/2161] iommu: Split 'addr_merge' argument to iommu_pgsize() into separate parts commit 89d5b9601f70b72f524fdb7bc6cf1b1e8e20e867 upstream. The 'addr_merge' parameter to iommu_pgsize() is a fabricated address intended to describe the alignment requirements to consider when choosing an appropriate page size. On the iommu_map() path, this address is the logical OR of the virtual and physical addresses. Subsequent improvements to iommu_pgsize() will need to check the alignment of the virtual and physical components of 'addr_merge' independently, so pass them in as separate parameters and reconstruct 'addr_merge' locally. No functional change. Signed-off-by: Will Deacon Signed-off-by: Isaac J. 
Manjarres Signed-off-by: Georgi Djakov Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1623850736-389584-7-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 72cafc13701d..d250dfb3c1dd 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2362,12 +2362,13 @@ phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) } EXPORT_SYMBOL_GPL(iommu_iova_to_phys); -static size_t iommu_pgsize(struct iommu_domain *domain, - unsigned long addr_merge, size_t size) +static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size) { unsigned int pgsize_idx; unsigned long pgsizes; size_t pgsize; + unsigned long addr_merge = paddr | iova; /* Page sizes supported by the hardware and small enough for @size */ pgsizes = domain->pgsize_bitmap & GENMASK(__fls(size), 0); @@ -2420,7 +2421,7 @@ int __iommu_map(struct iommu_domain *domain, unsigned long iova, pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size); while (size) { - size_t pgsize = iommu_pgsize(domain, iova | paddr, size); + size_t pgsize = iommu_pgsize(domain, iova, paddr, size); pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx\n", iova, &paddr, pgsize); @@ -2508,8 +2509,9 @@ static size_t __iommu_unmap(struct iommu_domain *domain, * or we hit an area that isn't mapped. */ while (unmapped < size) { - size_t pgsize = iommu_pgsize(domain, iova, size - unmapped); + size_t pgsize; + pgsize = iommu_pgsize(domain, iova, iova, size - unmapped); unmapped_page = ops->unmap(domain, iova, pgsize, iotlb_gather); if (!unmapped_page) break; -- Gitee From bdca9a79e2383f9065f589bf920c418b0cc44adc Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Jun 2021 06:38:48 -0700 Subject: [PATCH 1375/2161] iommu: Hook up '->unmap_pages' driver callback commit b1d99dc5f983b4ae8ab9841981c5dbd5530f6344 upstream. Extend iommu_pgsize() to populate an optional 'count' parameter so that we can direct unmapping operation to the ->unmap_pages callback if it has been provided by the driver. Signed-off-by: Will Deacon Signed-off-by: Isaac J. 
Manjarres Signed-off-by: Georgi Djakov Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1623850736-389584-8-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 59 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d250dfb3c1dd..b5e0d17753e6 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2363,11 +2363,11 @@ phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) EXPORT_SYMBOL_GPL(iommu_iova_to_phys); static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size) + phys_addr_t paddr, size_t size, size_t *count) { - unsigned int pgsize_idx; + unsigned int pgsize_idx, pgsize_idx_next; unsigned long pgsizes; - size_t pgsize; + size_t offset, pgsize, pgsize_next; unsigned long addr_merge = paddr | iova; /* Page sizes supported by the hardware and small enough for @size */ @@ -2383,7 +2383,36 @@ static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova, /* Pick the biggest page size remaining */ pgsize_idx = __fls(pgsizes); pgsize = BIT(pgsize_idx); + if (!count) + return pgsize; + /* Find the next biggest support page size, if it exists */ + pgsizes = domain->pgsize_bitmap & ~GENMASK(pgsize_idx, 0); + if (!pgsizes) + goto out_set_count; + + pgsize_idx_next = __ffs(pgsizes); + pgsize_next = BIT(pgsize_idx_next); + + /* + * There's no point trying a bigger page size unless the virtual + * and physical addresses are similarly offset within the larger page. + */ + if ((iova ^ paddr) & (pgsize_next - 1)) + goto out_set_count; + + /* Calculate the offset to the next page size alignment boundary */ + offset = pgsize_next - (addr_merge & (pgsize_next - 1)); + + /* + * If size is big enough to accommodate the larger page, reduce + * the number of smaller pages. + */ + if (offset + pgsize_next <= size) + size = offset; + +out_set_count: + *count = size >> pgsize_idx; return pgsize; } @@ -2421,7 +2450,7 @@ int __iommu_map(struct iommu_domain *domain, unsigned long iova, pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size); while (size) { - size_t pgsize = iommu_pgsize(domain, iova, paddr, size); + size_t pgsize = iommu_pgsize(domain, iova, paddr, size, NULL); pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx\n", iova, &paddr, pgsize); @@ -2472,6 +2501,19 @@ int iommu_map_atomic(struct iommu_domain *domain, unsigned long iova, } EXPORT_SYMBOL_GPL(iommu_map_atomic); +static size_t __iommu_unmap_pages(struct iommu_domain *domain, + unsigned long iova, size_t size, + struct iommu_iotlb_gather *iotlb_gather) +{ + const struct iommu_ops *ops = domain->ops; + size_t pgsize, count; + + pgsize = iommu_pgsize(domain, iova, iova, size, &count); + return ops->unmap_pages ? + ops->unmap_pages(domain, iova, pgsize, count, iotlb_gather) : + ops->unmap(domain, iova, pgsize, iotlb_gather); +} + static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather) @@ -2481,7 +2523,7 @@ static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long orig_iova = iova; unsigned int min_pagesz; - if (unlikely(ops->unmap == NULL || + if (unlikely(!(ops->unmap || ops->unmap_pages) || domain->pgsize_bitmap == 0UL)) return 0; @@ -2509,10 +2551,9 @@ static size_t __iommu_unmap(struct iommu_domain *domain, * or we hit an area that isn't mapped. 
*/ while (unmapped < size) { - size_t pgsize; - - pgsize = iommu_pgsize(domain, iova, iova, size - unmapped); - unmapped_page = ops->unmap(domain, iova, pgsize, iotlb_gather); + unmapped_page = __iommu_unmap_pages(domain, iova, + size - unmapped, + iotlb_gather); if (!unmapped_page) break; -- Gitee From 921ea3e9f427a4560386199a9fbc90661c9afb8d Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Tue, 30 Jun 2020 10:17:56 +0200 Subject: [PATCH 1376/2161] iommu: Move sg_table wrapper out of CONFIG_IOMMU_SUPPORT commit ca37faf3d7005b5588f045edfac1d82799c408a7 upstream. Move the recently added sg_table wrapper out of CONFIG_IOMMU_SUPPORT to let the client code copile also when IOMMU support is disabled. Fixes: 48530d9fab0d ("iommu: add generic helper for mapping sgtable objects") Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200630081756.18526-1-m.szyprowski@samsung.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/iommu.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e953971b5c0a..30a7b140457f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -490,22 +490,6 @@ extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t io extern void iommu_set_fault_handler(struct iommu_domain *domain, iommu_fault_handler_t handler, void *token); -/** - * iommu_map_sgtable - Map the given buffer to the IOMMU domain - * @domain: The IOMMU domain to perform the mapping - * @iova: The start address to map the buffer - * @sgt: The sg_table object describing the buffer - * @prot: IOMMU protection bits - * - * Creates a mapping at @iova for the buffer described by a scatterlist - * stored in the given sg_table object in the provided IOMMU domain. - */ -static inline size_t iommu_map_sgtable(struct iommu_domain *domain, - unsigned long iova, struct sg_table *sgt, int prot) -{ - return iommu_map_sg(domain, iova, sgt->sgl, sgt->orig_nents, prot); -} - extern void iommu_get_resv_regions(struct device *dev, struct list_head *list); extern void iommu_put_resv_regions(struct device *dev, struct list_head *list); extern void generic_iommu_put_resv_regions(struct device *dev, @@ -1093,6 +1077,22 @@ static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) } #endif /* CONFIG_IOMMU_API */ +/** + * iommu_map_sgtable - Map the given buffer to the IOMMU domain + * @domain: The IOMMU domain to perform the mapping + * @iova: The start address to map the buffer + * @sgt: The sg_table object describing the buffer + * @prot: IOMMU protection bits + * + * Creates a mapping at @iova for the buffer described by a scatterlist + * stored in the given sg_table object in the provided IOMMU domain. + */ +static inline size_t iommu_map_sgtable(struct iommu_domain *domain, + unsigned long iova, struct sg_table *sgt, int prot) +{ + return iommu_map_sg(domain, iova, sgt->sgl, sgt->orig_nents, prot); +} + #ifdef CONFIG_IOMMU_DEBUGFS extern struct dentry *iommu_debugfs_dir; void iommu_debugfs_setup(void); -- Gitee From 48df749b27352b7e4a8fac9fe36a6e13495b5ad0 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 27 May 2022 12:18:03 +0800 Subject: [PATCH 1377/2161] iommu: Make some functions static commit 1b0b2a84c979347f5385a8cb3901d3a46569d57c upstream. The sparse tool complains as follows: drivers/iommu/iommu.c:386:5: warning: symbol 'iommu_insert_resv_region' was not declared. Should it be static? 
drivers/iommu/iommu.c:2182:5: warning: symbol '__iommu_map' was not declared. Should it be static? Those functions are not used outside of iommu.c, so mark them static. Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Link: https://lore.kernel.org/r/20200713142542.50294-1-weiyongjun1@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b5e0d17753e6..34140d9541a8 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -400,8 +400,8 @@ static ssize_t iommu_group_show_name(struct iommu_group *group, char *buf) * Elements are sorted by start address and overlapping segments * of the same type are merged. */ -int iommu_insert_resv_region(struct iommu_resv_region *new, - struct list_head *regions) +static int iommu_insert_resv_region(struct iommu_resv_region *new, + struct list_head *regions) { struct iommu_resv_region *iter, *tmp, *nr, *top; LIST_HEAD(stack); @@ -2416,8 +2416,8 @@ static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova, return pgsize; } -int __iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +static int __iommu_map(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { const struct iommu_ops *ops = domain->ops; unsigned long orig_iova = iova; -- Gitee From 5643a449b350766952f81088d7eef2106c7c6a5a Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 20 Jul 2021 10:06:14 +0800 Subject: [PATCH 1378/2161] iommu/vt-d: Implement map/unmap_pages() iommu_ops callback commit 3f34f125977685e591def32984c77e23a86075b0 upstream. Implement the map_pages() and unmap_pages() callback for the Intel IOMMU driver to allow calls from iommu core to map and unmap multiple pages of the same size in one call. With map/unmap_pages() implemented, the prior map/unmap callbacks are deprecated. 
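The driver-side contract for the new callbacks is straightforward: the core only passes a pgsize that the driver advertises in pgsize_bitmap, with pgcount such pages covering a virtually contiguous (and, for map, physically contiguous) range, so a driver whose lower layer already takes a byte length can wrap it directly. A rough sketch of such an implementation (hypothetical foo_* helpers and to_foo_domain(); the actual Intel version follows in the diff below):

#include <linux/bitops.h>
#include <linux/iommu.h>

static size_t foo_unmap_pages(struct iommu_domain *domain, unsigned long iova,
			      size_t pgsize, size_t pgcount,
			      struct iommu_iotlb_gather *gather)
{
	size_t size = pgcount << __ffs(pgsize);	/* pgsize is a power of two */

	return foo_pgtable_unmap(to_foo_domain(domain), iova, size, gather);
}

static int foo_map_pages(struct iommu_domain *domain, unsigned long iova,
			 phys_addr_t paddr, size_t pgsize, size_t pgcount,
			 int prot, gfp_t gfp, size_t *mapped)
{
	size_t size = pgcount << __ffs(pgsize);
	int ret;

	ret = foo_pgtable_map(to_foo_domain(domain), iova, paddr, size,
			      prot, gfp);
	if (!ret)
		*mapped = size;		/* report how much was actually mapped */
	return ret;
}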
Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210720020615.4144323-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 96c810408702..2a77034001f8 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5150,6 +5150,28 @@ static int intel_iommu_map(struct iommu_domain *domain, hpa >> VTD_PAGE_SHIFT, size, prot); } +static int intel_iommu_map_pages(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped) +{ + unsigned long pgshift = __ffs(pgsize); + size_t size = pgcount << pgshift; + int ret; + + if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) + return -EINVAL; + + if (!IS_ALIGNED(iova | paddr, pgsize)) + return -EINVAL; + + ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); + if (!ret && mapped) + *mapped = size; + + return ret; +} + static size_t intel_iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *gather) @@ -5179,6 +5201,17 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain, return size; } +static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, + unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *gather) +{ + unsigned long pgshift = __ffs(pgsize); + size_t size = pgcount << pgshift; + + return intel_iommu_unmap(domain, iova, size, gather); +} + static void intel_iommu_tlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { @@ -5636,9 +5669,9 @@ const struct iommu_ops intel_iommu_ops = { .aux_attach_dev = intel_iommu_aux_attach_device, .aux_detach_dev = intel_iommu_aux_detach_device, .aux_get_pasid = intel_iommu_aux_get_pasid, - .map = intel_iommu_map, + .map_pages = intel_iommu_map_pages, + .unmap_pages = intel_iommu_unmap_pages, .iotlb_sync_map = intel_iommu_iotlb_sync_map, - .unmap = intel_iommu_unmap, .flush_iotlb_all = intel_flush_iotlb_all, .iotlb_sync = intel_iommu_tlb_sync, .iova_to_phys = intel_iommu_iova_to_phys, -- Gitee From 55d11ccd06587f9c43c947f115cf3d8508e961a1 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:49 -0700 Subject: [PATCH 1379/2161] iommu: Add support for the map_pages() callback commit 647c57764b37c0ddfa6bf775b03d5e5cc407086b upstream. Since iommu_pgsize can calculate how many pages of the same size can be mapped/unmapped before the next largest page size boundary, add support for invoking an IOMMU driver's map_pages() callback, if it provides one. Signed-off-by: Isaac J. 
Manjarres Suggested-by: Will Deacon Signed-off-by: Georgi Djakov Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1623850736-389584-9-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 34140d9541a8..2b1762a9a1bd 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2416,6 +2416,30 @@ static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova, return pgsize; } +static int __iommu_map_pages(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot, + gfp_t gfp, size_t *mapped) +{ + const struct iommu_ops *ops = domain->ops; + size_t pgsize, count; + int ret; + + pgsize = iommu_pgsize(domain, iova, paddr, size, &count); + + pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx count %zu\n", + iova, &paddr, pgsize, count); + + if (ops->map_pages) { + ret = ops->map_pages(domain, iova, paddr, pgsize, count, prot, + gfp, mapped); + } else { + ret = ops->map(domain, iova, paddr, pgsize, prot, gfp); + *mapped = ret ? 0 : pgsize; + } + + return ret; +} + static int __iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { @@ -2426,7 +2450,7 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t orig_paddr = paddr; int ret = 0; - if (unlikely(ops->map == NULL || + if (unlikely(!(ops->map || ops->map_pages) || domain->pgsize_bitmap == 0UL)) return -ENODEV; @@ -2450,18 +2474,21 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size); while (size) { - size_t pgsize = iommu_pgsize(domain, iova, paddr, size, NULL); + size_t mapped = 0; - pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx\n", - iova, &paddr, pgsize); - ret = ops->map(domain, iova, paddr, pgsize, prot, gfp); + ret = __iommu_map_pages(domain, iova, paddr, size, prot, gfp, + &mapped); + /* + * Some pages may have been mapped, even if an error occurred, + * so we should account for those so they can be unmapped. + */ + size -= mapped; if (ret) break; - iova += pgsize; - paddr += pgsize; - size -= pgsize; + iova += mapped; + paddr += mapped; } /* unroll mapping in case something went wrong */ -- Gitee From 4725a2f155d988590f3e6a54d7402a65032d2149 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:45 -0700 Subject: [PATCH 1380/2161] iommu: Add a map_pages() op for IOMMU drivers commit 910c4406ccc9613de0a54abf910edc4bf8a575c0 upstream. Add a callback for IOMMU drivers to provide a path for the IOMMU framework to call into an IOMMU driver, which can call into the io-pgtable code, to map a physically contiguous rnage of pages of the same size. For IOMMU drivers that do not specify a map_pages() callback, the existing logic of mapping memory one page block at a time will be used. Signed-off-by: Isaac J. 
Manjarres Suggested-by: Will Deacon Acked-by: Lu Baolu Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-5-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/iommu.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 30a7b140457f..c74059e33c9e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -225,6 +225,8 @@ struct iommu_iotlb_gather { * @attach_dev: attach device to an iommu domain * @detach_dev: detach device from an iommu domain * @map: map a physically contiguous memory region to an iommu domain + * @map_pages: map a physically contiguous set of pages of the same size to + * an iommu domain. * @unmap: unmap a physically contiguous memory region from an iommu domain * @unmap_pages: unmap a number of pages of the same size from an iommu domain * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain @@ -275,6 +277,9 @@ struct iommu_ops { void (*detach_dev)(struct iommu_domain *domain, struct device *dev); int (*map)(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp); + int (*map_pages)(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped); size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); size_t (*unmap_pages)(struct iommu_domain *domain, unsigned long iova, -- Gitee From 1a51d97d19bb96e9163c82140c6476640d43cac8 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 12 Jul 2021 19:12:15 +0800 Subject: [PATCH 1381/2161] iommu: Deprecate Intel and AMD cmdline methods to enable strict mode commit 1d479f160c500249d8fa4d21e7d2b7aaffc04daf upstream. Now that the x86 drivers support iommu.strict, deprecate the custom methods. Signed-off-by: John Garry Acked-by: Robin Murphy Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1626088340-5838-2-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/admin-guide/kernel-parameters.txt | 9 ++------- drivers/iommu/amd_iommu_init.c | 4 +++- drivers/iommu/intel/iommu.c | 1 + 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 66045c063f51..1c31f6183a97 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -290,10 +290,7 @@ amd_iommu= [HW,X86-64] Pass parameters to the AMD IOMMU driver in the system. Possible values are: - fullflush - enable flushing of IO/TLB entries when - they are unmapped. Otherwise they are - flushed before they will be reused, which - is a lot of faster + fullflush - Deprecated, equivalent to iommu.strict=1 off - do not initialize any AMD IOMMU found in the system force_isolation - Force device isolation for all @@ -1729,9 +1726,7 @@ this case, gfx device will use physical address for DMA. strict [Default Off] - With this option on every unmap_single operation will - result in a hardware IOTLB flush operation as opposed - to batching them for performance. + Deprecated, equivalent to iommu.strict=1. sp_off [Default Off] By default, super page will be supported if Intel IOMMU has the capability. 
With this option, super page will diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 7498a82e5d98..7f964b8afddb 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -3011,8 +3011,10 @@ static int __init parse_amd_iommu_intr(char *str) static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { - if (strncmp(str, "fullflush", 9) == 0) + if (strncmp(str, "fullflush", 9) == 0) { + pr_warn("amd_iommu=fullflush deprecated; use iommu.strict=1 instead\n"); amd_iommu_unmap_flush = true; + } if (strncmp(str, "off", 3) == 0) amd_iommu_disabled = true; if (strncmp(str, "on", 2) == 0) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2a77034001f8..4311c32c1cf6 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -408,6 +408,7 @@ static int __init intel_iommu_setup(char *str) pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); iommu_dma_forcedac = true; } else if (!strncmp(str, "strict", 6)) { + pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); pr_info("Disable batched IOTLB flush\n"); intel_iommu_strict = 1; } else if (!strncmp(str, "sp_off", 6)) { -- Gitee From fa4ad5ce0e38f3c249ec204a4d4302fb726287b8 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 12 Jul 2021 19:12:16 +0800 Subject: [PATCH 1382/2161] iommu: Print strict or lazy mode at init time commit d8577d2e331dc06f5871afa6b76035dd8b74c903 upstream. As well as the default domain type, it's useful to know whether strict or lazy for DMA domains, so add this info in a separate print. The (strict/lazy) mode may also be set via the iommu.strict early param, but this will be processed prior to iommu_subsys_init(), so that print will be accurate for drivers which don't set the mode via custom means. For the drivers which set the mode via custom means - AMD and Intel drivers - they maintain prints to inform a change in policy or that custom cmdline methods to change policy are deprecated. Signed-off-by: John Garry Reviewed-by: Robin Murphy Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1626088340-5838-3-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 2b1762a9a1bd..a2953be801e4 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -140,6 +140,11 @@ static int __init iommu_subsys_init(void) (iommu_cmd_line & IOMMU_CMD_LINE_DMA_API) ? "(set via kernel command line)" : ""); + pr_info("DMA domain TLB invalidation policy: %s mode %s\n", + iommu_dma_strict ? "strict" : "lazy", + (iommu_cmd_line & IOMMU_CMD_LINE_STRICT) ? + "(set via kernel command line)" : ""); + return 0; } subsys_initcall(iommu_subsys_init); -- Gitee From d8fbcdf7cf8aad508ac2409940f576f7c88e3f1b Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 27 May 2022 15:56:26 +0800 Subject: [PATCH 1383/2161] iommu: Enhance IOMMU default DMA mode build options commit 712d8f205835c1d170e85d7811da3d9f36d8fab7 upstream. First, add build options IOMMU_DEFAULT_{LAZY|STRICT}, so that we have the opportunity to set {lazy|strict} mode as default at build time. Then put the two config options in a choice, as they are mutually exclusive.
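For reference, the way such a build-time choice reaches the code is the IS_ENABLED() pattern used by the iommu.c hunk later in this patch; a minimal standalone sketch of that pattern (illustrative, not taken verbatim from the diff):

    #include <linux/cache.h>
    #include <linux/kconfig.h>
    #include <linux/types.h>

    /*
     * The Kconfig choice only selects the compile-time initial value;
     * iommu.strict= on the kernel command line can still override it at boot.
     */
    static bool iommu_dma_strict __read_mostly =
                    IS_ENABLED(CONFIG_IOMMU_DEFAULT_STRICT);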
[jpg: Make choice between strict and lazy only (and not passthrough)] Signed-off-by: Zhen Lei Signed-off-by: John Garry Reviewed-by: Robin Murphy Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1626088340-5838-4-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../admin-guide/kernel-parameters.txt | 4 +- drivers/iommu/Kconfig | 40 +++++++++++++++++++ drivers/iommu/iommu.c | 2 +- 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1c31f6183a97..320e14dbbc26 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1828,10 +1828,10 @@ throughput at the cost of reduced device isolation. Will fall back to strict mode if not supported by the relevant IOMMU driver. - 1 - Strict mode (default). + 1 - Strict mode. DMA unmap operations invalidate IOMMU hardware TLBs synchronously. - + unset - Use value of CONFIG_IOMMU_DEFAULT_{LAZY,STRICT}. iommu.passthrough= [ARM64, X86] Configure DMA to bypass the IOMMU by default. Format: { "0" | "1" } diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 765b68b96ab4..bf72ef91768c 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -90,6 +90,46 @@ config IOMMU_DEFAULT_PASSTHROUGH If unsure, say N here. +choice + prompt "IOMMU default DMA IOTLB invalidation mode" + depends on IOMMU_DMA + + default IOMMU_DEFAULT_STRICT + help + This option allows an IOMMU DMA IOTLB invalidation mode to be + chosen at build time, to override the default mode of each ARCH, + removing the need to pass in kernel parameters through command line. + It is still possible to provide common boot params to override this + config. + + If unsure, keep the default. + +config IOMMU_DEFAULT_STRICT + bool "strict" + help + For every IOMMU DMA unmap operation, the flush operation of IOTLB and + the free operation of IOVA are guaranteed to be done in the unmap + function. + +config IOMMU_DEFAULT_LAZY + bool "lazy" + help + Support lazy mode, where for every IOMMU DMA unmap operation, the + flush operation of IOTLB and the free operation of IOVA are deferred. + They are only guaranteed to be done before the related IOVA will be + reused. + + The isolation provided in this mode is not as secure as STRICT mode, + such that a vulnerable time window may be created between the DMA + unmap and the mappings cached in the IOMMU IOTLB or device TLB + finally being invalidated, where the device could still access the + memory which has already been unmapped by the device driver. + However this mode may provide better performance in high throughput + scenarios, and is still considerably more secure than passthrough + mode or no IOMMU. 
+ +endchoice + config OF_IOMMU def_bool y depends on OF && IOMMU_API diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index a2953be801e4..7ae3432be20e 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -32,7 +32,7 @@ static struct kset *iommu_group_kset; static DEFINE_IDA(iommu_group_ida); static unsigned int iommu_def_domain_type __read_mostly; -static bool iommu_dma_strict __read_mostly = true; +static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_STRICT); static u32 iommu_cmd_line __read_mostly; struct iommu_group { -- Gitee From e660dfd7568568726068a5c71af08effabb43d42 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Mon, 12 Jul 2021 19:12:18 +0800 Subject: [PATCH 1384/2161] iommu/vt-d: Add support for IOMMU default DMA mode build options commit d0e108b8e962e0aae65dc74ffda63b93b6ef32f4 upstream. Make IOMMU_DEFAULT_LAZY default for when INTEL_IOMMU config is set, as is current behaviour. Also delete global flag intel_iommu_strict: - In intel_iommu_setup(), call iommu_set_dma_strict(true) directly. Also remove the print, as iommu_subsys_init() prints the mode and we have already marked this param as deprecated. - For cap_caching_mode() check in intel_iommu_setup(), call iommu_set_dma_strict(true) directly; also reword the accompanying print with a level downgrade and also add the missing '\n'. - For Ironlake GPU, again call iommu_set_dma_strict(true) directly and keep the accompanying print. [jpg: Remove intel_iommu_strict] Signed-off-by: Zhen Lei Signed-off-by: John Garry Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1626088340-5838-5-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/Kconfig | 1 + drivers/iommu/intel/iommu.c | 15 ++++++--------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index bf72ef91768c..d6bc761048f5 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -94,6 +94,7 @@ choice prompt "IOMMU default DMA IOTLB invalidation mode" depends on IOMMU_DMA + default IOMMU_DEFAULT_LAZY if INTEL_IOMMU default IOMMU_DEFAULT_STRICT help This option allows an IOMMU DMA IOTLB invalidation mode to be diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 4311c32c1cf6..447bbbb65926 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -338,7 +338,6 @@ int intel_iommu_enabled = 0; EXPORT_SYMBOL_GPL(intel_iommu_enabled); static int dmar_map_gfx = 1; -static int intel_iommu_strict; static int intel_iommu_superpage = 1; static int iommu_identity_mapping; static int iommu_skip_te_disable; @@ -409,8 +408,7 @@ static int __init intel_iommu_setup(char *str) iommu_dma_forcedac = true; } else if (!strncmp(str, "strict", 6)) { pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); - pr_info("Disable batched IOTLB flush\n"); - intel_iommu_strict = 1; + iommu_set_dma_strict(true); } else if (!strncmp(str, "sp_off", 6)) { pr_info("Disable supported super page\n"); intel_iommu_superpage = 0; @@ -4479,9 +4477,9 @@ int __init intel_iommu_init(void) * is likely to be much lower than the overhead of synchronizing * the virtual and physical IOMMU page-tables. 
*/ - if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) { - pr_warn("IOMMU batching is disabled due to virtualization"); - intel_iommu_strict = 1; + if (cap_caching_mode(iommu->cap)) { + pr_info_once("IOMMU batching disallowed due to virtualization\n"); + iommu_set_dma_strict(true); } iommu_device_sysfs_add(&iommu->iommu, NULL, intel_iommu_groups, @@ -4491,7 +4489,6 @@ int __init intel_iommu_init(void) } up_read(&dmar_global_lock); - iommu_set_dma_strict(intel_iommu_strict); bus_set_iommu(&pci_bus_type, &intel_iommu_ops); if (si_domain && !hw_pass_through) register_memory_notifier(&intel_iommu_memory_nb); @@ -5791,8 +5788,8 @@ static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) } else if (dmar_map_gfx) { /* we have to ensure the gfx device is idle before we flush */ pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); - intel_iommu_strict = 1; - } + iommu_set_dma_strict(true); + } } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); -- Gitee From 118891e79a777b6d7aaa3cf5c8c5e3417f699169 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 27 May 2022 16:40:05 +0800 Subject: [PATCH 1385/2161] iommu/amd: Add support for IOMMU default DMA mode build options commit 02252b3bfe9f98770a8d902925711676ff5bd766 upstream. Make IOMMU_DEFAULT_LAZY default for when AMD_IOMMU config is set, which matches current behaviour. For "fullflush" param, just call iommu_set_dma_strict(true) directly. Since we get a strict vs lazy mode print already in iommu_subsys_init(), and maintain a deprecation print when "fullflush" param is passed, drop the prints in amd_iommu_init_dma_ops(). Finally drop global flag amd_iommu_unmap_flush, as it has no longer has any purpose. [jpg: Rebase for relocated file and drop amd_iommu_unmap_flush] Signed-off-by: Zhen Lei Signed-off-by: John Garry Link: https://lore.kernel.org/r/1626088340-5838-6-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/Kconfig | 2 +- drivers/iommu/amd_iommu.c | 6 ------ drivers/iommu/amd_iommu_init.c | 3 +-- drivers/iommu/amd_iommu_types.h | 6 ------ 4 files changed, 2 insertions(+), 15 deletions(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index d6bc761048f5..a14550d0f9c9 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -94,7 +94,7 @@ choice prompt "IOMMU default DMA IOTLB invalidation mode" depends on IOMMU_DMA - default IOMMU_DEFAULT_LAZY if INTEL_IOMMU + default IOMMU_DEFAULT_LAZY if (AMD_IOMMU || INTEL_IOMMU) default IOMMU_DEFAULT_STRICT help This option allows an IOMMU DMA IOTLB invalidation mode to be diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 686888ae870e..3287416f3cf4 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2287,12 +2287,6 @@ int __init amd_iommu_init_dma_ops(void) { swiotlb = (iommu_default_passthrough() || sme_me_mask) ? 
1 : 0; iommu_detected = 1; - - if (amd_iommu_unmap_flush) - pr_info("IO/TLB flush on unmap enabled\n"); - else - pr_info("Lazy IO/TLB flushing enabled\n"); - iommu_set_dma_strict(amd_iommu_unmap_flush); return 0; } diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 7f964b8afddb..51fccc5c4134 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -157,7 +157,6 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ -bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ @@ -3013,7 +3012,7 @@ static int __init parse_amd_iommu_options(char *str) for (; *str; ++str) { if (strncmp(str, "fullflush", 9) == 0) { pr_warn("amd_iommu=fullflush deprecated; use iommu.strict=1 instead\n"); - amd_iommu_unmap_flush = true; + iommu_set_dma_strict(true); } if (strncmp(str, "off", 3) == 0) amd_iommu_disabled = true; diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 34847774f1bb..7dd125f63ce5 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -738,12 +738,6 @@ extern u16 amd_iommu_last_bdf; /* allocation bitmap for domain ids */ extern unsigned long *amd_iommu_pd_alloc_bitmap; -/* - * If true, the addresses will be flushed on unmap time, not when - * they are reused - */ -extern bool amd_iommu_unmap_flush; - /* Smallest max PASID supported by any IOMMU in the system */ extern u32 amd_iommu_max_pasid; -- Gitee From 0b3c1911e0d10c6cfa5553ebd6c47cc247ee585a Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 27 May 2022 16:47:59 +0800 Subject: [PATCH 1386/2161] iommu: Remove mode argument from iommu_set_dma_strict() commit 308723e3580027f0cd7c86a5edfe6b5acb6863d2 upstream. We only ever now set strict mode enabled in iommu_set_dma_strict(), so just remove the argument. 
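After this change the setter is a one-way switch; a minimal sketch of its new shape together with a representative caller (example_parse_opt() is an illustrative stand-in for the amd_iommu=/intel_iommu= option parsers touched below):

    #include <linux/init.h>
    #include <linux/string.h>
    #include <linux/types.h>

    static bool iommu_dma_strict;

    /* The policy can now only be tightened; there is no way to pass "false". */
    void iommu_set_dma_strict(void)
    {
            iommu_dma_strict = true;
    }

    /* Illustrative stand-in for the option parsers updated below. */
    static int __init example_parse_opt(char *str)
    {
            if (strncmp(str, "fullflush", 9) == 0)
                    iommu_set_dma_strict();
            return 1;
    }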
Signed-off-by: John Garry Reviewed-by: Robin Murphy Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1626088340-5838-7-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu_init.c | 2 +- drivers/iommu/intel/iommu.c | 6 +++--- drivers/iommu/iommu.c | 5 ++--- include/linux/iommu.h | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 51fccc5c4134..52fbe315cdec 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -3012,7 +3012,7 @@ static int __init parse_amd_iommu_options(char *str) for (; *str; ++str) { if (strncmp(str, "fullflush", 9) == 0) { pr_warn("amd_iommu=fullflush deprecated; use iommu.strict=1 instead\n"); - iommu_set_dma_strict(true); + iommu_set_dma_strict(); } if (strncmp(str, "off", 3) == 0) amd_iommu_disabled = true; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 447bbbb65926..2f3b2033586d 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -408,7 +408,7 @@ static int __init intel_iommu_setup(char *str) iommu_dma_forcedac = true; } else if (!strncmp(str, "strict", 6)) { pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); - iommu_set_dma_strict(true); + iommu_set_dma_strict(); } else if (!strncmp(str, "sp_off", 6)) { pr_info("Disable supported super page\n"); intel_iommu_superpage = 0; @@ -4479,7 +4479,7 @@ int __init intel_iommu_init(void) */ if (cap_caching_mode(iommu->cap)) { pr_info_once("IOMMU batching disallowed due to virtualization\n"); - iommu_set_dma_strict(true); + iommu_set_dma_strict(); } iommu_device_sysfs_add(&iommu->iommu, NULL, intel_iommu_groups, @@ -5788,7 +5788,7 @@ static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) } else if (dmar_map_gfx) { /* we have to ensure the gfx device is idle before we flush */ pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); - iommu_set_dma_strict(true); + iommu_set_dma_strict(); } } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 7ae3432be20e..ea9749349b84 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -334,10 +334,9 @@ static int __init iommu_dma_setup(char *str) } early_param("iommu.strict", iommu_dma_setup); -void iommu_set_dma_strict(bool strict) +void iommu_set_dma_strict(void) { - if (strict || !(iommu_cmd_line & IOMMU_CMD_LINE_STRICT)) - iommu_dma_strict = strict; + iommu_dma_strict = true; } bool iommu_get_dma_strict(struct iommu_domain *domain) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c74059e33c9e..9b3bf7361b96 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -551,7 +551,7 @@ extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t offset, u64 size, int prot); -void iommu_set_dma_strict(bool val); +void iommu_set_dma_strict(void); bool iommu_get_dma_strict(struct iommu_domain *domain); extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, -- Gitee From d4bfeca71bd312415c92d5ff7dff6bf5c2a0b7d3 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 27 May 2022 17:04:21 +0800 Subject: [PATCH 1387/2161] iommu: Streamline iommu_iova_to_phys() commit 13b6eb6e1c98c47f7e0d6c74e8b22cfe189a84dd upstream. 
If people are going to insist on calling iommu_iova_to_phys() pointlessly and expecting it to work, we can at least do ourselves a favour by handling those cases in the core code, rather than repeatedly across an inconsistent handful of drivers. Since all the existing drivers implement the internal callback, and any future ones are likely to want to work with iommu-dma which relies on iova_to_phys a fair bit, we may as well remove that currently-redundant check as well and consider it mandatory. Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/f564f3f6ff731b898ff7a898919bf871c2c7745a.1626354264.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 3 --- drivers/iommu/iommu.c | 5 ++++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 3287416f3cf4..83ce245471b8 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2542,9 +2542,6 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, unsigned long offset_mask, pte_pgsize; u64 *pte, __pte; - if (domain->mode == PAGE_MODE_NONE) - return iova; - pte = fetch_pte(domain, iova, &pte_pgsize); if (!pte || !IOMMU_PTE_PRESENT(*pte)) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index ea9749349b84..a6451d6178e9 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2359,7 +2359,10 @@ EXPORT_SYMBOL_GPL(iommu_detach_group); phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { - if (unlikely(domain->ops->iova_to_phys == NULL)) + if (domain->type == IOMMU_DOMAIN_IDENTITY) + return iova; + + if (domain->type == IOMMU_DOMAIN_BLOCKED) return 0; return domain->ops->iova_to_phys(domain, iova); -- Gitee From b325cf383f0f962420b597d6e0632ce70b6db332 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 27 May 2022 17:17:35 +0800 Subject: [PATCH 1388/2161] iommu: Express DMA strictness via the domain type commit c208916fe6c7b84e3ec95cd91853039596eeb2cf upstream. Eliminate the iommu_get_dma_strict() indirection and pipe the information through the domain type from the beginning. Besides the flow simplification this also has several nice side-effects: - Automatically implies strict mode for untrusted devices by virtue of their IOMMU_DOMAIN_DMA override. - Ensures that we only end up using flush queues for drivers which are aware of them and can actually benefit. - Allows us to handle flush queue init failure by falling back to strict mode instead of leaving it to possibly blow up later. 
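With strictness folded into the domain type as described above, a check for lazy invalidation reduces to a type test; a minimal sketch, assuming the IOMMU_DOMAIN_DMA_FQ type this series relies on (the helper name is illustrative, not from the patch):

    #include <linux/iommu.h>
    #include <linux/types.h>

    /* Lazy (flush-queue) invalidation is now a property of the domain itself. */
    static bool domain_uses_flush_queue(struct iommu_domain *domain)
    {
            return domain->type == IOMMU_DOMAIN_DMA_FQ;
    }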
Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/47083d69155577f1367877b1594921948c366eb3.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 10 ++++++---- drivers/iommu/iommu.c | 14 +++++--------- include/linux/iommu.h | 1 - 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 1b6e94793e11..bf32ebb14431 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -369,13 +369,15 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, init_iova_domain(iovad, 1UL << order, base_pfn); - if (!cookie->fq_domain && !dev_is_untrusted(dev) && - domain->ops->flush_iotlb_all && !iommu_get_dma_strict(domain)) { + /* If the FQ fails we can simply fall back to strict mode */ + if (domain->type == IOMMU_DOMAIN_DMA_FQ && !cookie->fq_domain) { if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all, - iommu_dma_entry_dtor)) + iommu_dma_entry_dtor)) { pr_warn("iova flush queue initialization failed\n"); - else + domain->type = IOMMU_DOMAIN_DMA; + } else { cookie->fq_domain = domain; + } } return iova_reserve_iommu_regions(dev, domain); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index a6451d6178e9..ce4b355aed8a 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -135,6 +135,9 @@ static int __init iommu_subsys_init(void) } } + if (!iommu_default_passthrough() && !iommu_dma_strict) + iommu_def_domain_type = IOMMU_DOMAIN_DMA_FQ; + pr_info("Default domain type: %s %s\n", iommu_domain_type_str(iommu_def_domain_type), (iommu_cmd_line & IOMMU_CMD_LINE_DMA_API) ? @@ -337,17 +340,10 @@ early_param("iommu.strict", iommu_dma_setup); void iommu_set_dma_strict(void) { iommu_dma_strict = true; + if (iommu_def_domain_type == IOMMU_DOMAIN_DMA_FQ) + iommu_def_domain_type = IOMMU_DOMAIN_DMA; } -bool iommu_get_dma_strict(struct iommu_domain *domain) -{ - /* only allow lazy flushing for DMA domains */ - if (domain->type == IOMMU_DOMAIN_DMA) - return iommu_dma_strict; - return true; -} -EXPORT_SYMBOL_GPL(iommu_get_dma_strict); - static ssize_t iommu_group_attr_show(struct kobject *kobj, struct attribute *__attr, char *buf) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 9b3bf7361b96..1d78bd135d44 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -552,7 +552,6 @@ extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, int prot); void iommu_set_dma_strict(void); -bool iommu_get_dma_strict(struct iommu_domain *domain); extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags); -- Gitee From cbededae98f83d7df2ae8517e3134128cbc7bfd8 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:36 +0100 Subject: [PATCH 1389/2161] iommu: Only log strictness for DMA domains commit 7cf8a638678c89e9ff62520c5f90c4919b3b2af7 upstream. When passthrough is enabled, the default strictness policy becomes irrelevant, since any subsequent runtime override to a DMA domain type now embodies an explicit choice of strictness as well. Save on noise by only logging the default policy when it is meaningfully in effect. 
Reviewed-by: John Garry Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/9d2bcba880c6d517d0751ed8bd4960853030b4d7.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index ce4b355aed8a..66e0e9941798 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -143,10 +143,11 @@ static int __init iommu_subsys_init(void) (iommu_cmd_line & IOMMU_CMD_LINE_DMA_API) ? "(set via kernel command line)" : ""); - pr_info("DMA domain TLB invalidation policy: %s mode %s\n", - iommu_dma_strict ? "strict" : "lazy", - (iommu_cmd_line & IOMMU_CMD_LINE_STRICT) ? - "(set via kernel command line)" : ""); + if (!iommu_default_passthrough()) + pr_info("DMA domain TLB invalidation policy: %s mode %s\n", + iommu_dma_strict ? "strict" : "lazy", + (iommu_cmd_line & IOMMU_CMD_LINE_STRICT) ? + "(set via kernel command line)" : ""); return 0; } -- Gitee From afa5581d3be995199788350e9c9a8106bd2feb97 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 27 May 2022 17:29:17 +0800 Subject: [PATCH 1390/2161] iommu: Merge strictness and domain type configs commit e96763ec42ceb7fc4f1e80b8647bc3ef53b5d286 upstream. To parallel the sysfs behaviour, merge the new build-time option for DMA domain strictness into the default domain type choice. Suggested-by: Joerg Roedel Reviewed-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: John Garry Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/d04af35b9c0f2a1d39605d7a9b451f5e1f0c7736.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../admin-guide/kernel-parameters.txt | 4 +- drivers/iommu/Kconfig | 78 +++++++++---------- drivers/iommu/iommu.c | 2 +- 3 files changed, 43 insertions(+), 41 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 320e14dbbc26..37eec51acc5f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1831,7 +1831,9 @@ 1 - Strict mode. DMA unmap operations invalidate IOMMU hardware TLBs synchronously. - unset - Use value of CONFIG_IOMMU_DEFAULT_{LAZY,STRICT}. + unset - Use value of CONFIG_IOMMU_DEFAULT_DMA_{LAZY,STRICT}. + Note: on x86, strict mode specified via one of the + legacy driver-specific options takes precedence. iommu.passthrough= [ARM64, X86] Configure DMA to bypass the IOMMU by default. Format: { "0" | "1" } diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index a14550d0f9c9..6506e59523d0 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -79,55 +79,55 @@ config IOMMU_DEBUGFS debug/iommu directory, and then populate a subdirectory with entries as required. -config IOMMU_DEFAULT_PASSTHROUGH - bool "IOMMU passthrough by default" +choice + prompt "IOMMU default domain type" depends on IOMMU_API + default IOMMU_DEFAULT_DMA_LAZY if AMD_IOMMU || INTEL_IOMMU + default IOMMU_DEFAULT_DMA_STRICT help - Enable passthrough by default, removing the need to pass in - iommu.passthrough=on or iommu=pt through command line. If this - is enabled, you can still disable with iommu.passthrough=off - or iommu=nopt depending on the architecture. - - If unsure, say N here. + Choose the type of IOMMU domain used to manage DMA API usage by + device drivers. 
The options here typically represent different + levels of tradeoff between robustness/security and performance, + depending on the IOMMU driver. Not all IOMMUs support all options. + This choice can be overridden at boot via the command line, and for + some devices also at runtime via sysfs. -choice - prompt "IOMMU default DMA IOTLB invalidation mode" - depends on IOMMU_DMA + If unsure, keep the default. - default IOMMU_DEFAULT_LAZY if (AMD_IOMMU || INTEL_IOMMU) - default IOMMU_DEFAULT_STRICT +config IOMMU_DEFAULT_DMA_STRICT + bool "Translated - Strict" help - This option allows an IOMMU DMA IOTLB invalidation mode to be - chosen at build time, to override the default mode of each ARCH, - removing the need to pass in kernel parameters through command line. - It is still possible to provide common boot params to override this - config. + Trusted devices use translation to restrict their access to only + DMA-mapped pages, with strict TLB invalidation on unmap. Equivalent + to passing "iommu.passthrough=0 iommu.strict=1" on the command line. - If unsure, keep the default. + Untrusted devices always use this mode, with an additional layer of + bounce-buffering such that they cannot gain access to any unrelated + data within a mapped page. -config IOMMU_DEFAULT_STRICT - bool "strict" +config IOMMU_DEFAULT_DMA_LAZY + bool "Translated - Lazy" help - For every IOMMU DMA unmap operation, the flush operation of IOTLB and - the free operation of IOVA are guaranteed to be done in the unmap - function. + Trusted devices use translation to restrict their access to only + DMA-mapped pages, but with "lazy" batched TLB invalidation. This + mode allows higher performance with some IOMMUs due to reduced TLB + flushing, but at the cost of reduced isolation since devices may be + able to access memory for some time after it has been unmapped. + Equivalent to passing "iommu.passthrough=0 iommu.strict=0" on the + command line. + + If this mode is not supported by the IOMMU driver, the effective + runtime default will fall back to IOMMU_DEFAULT_DMA_STRICT. -config IOMMU_DEFAULT_LAZY - bool "lazy" +config IOMMU_DEFAULT_PASSTHROUGH + bool "Passthrough" help - Support lazy mode, where for every IOMMU DMA unmap operation, the - flush operation of IOTLB and the free operation of IOVA are deferred. - They are only guaranteed to be done before the related IOVA will be - reused. - - The isolation provided in this mode is not as secure as STRICT mode, - such that a vulnerable time window may be created between the DMA - unmap and the mappings cached in the IOMMU IOTLB or device TLB - finally being invalidated, where the device could still access the - memory which has already been unmapped by the device driver. - However this mode may provide better performance in high throughput - scenarios, and is still considerably more secure than passthrough - mode or no IOMMU. + Trusted devices are identity-mapped, giving them unrestricted access + to memory with minimal performance overhead. Equivalent to passing + "iommu.passthrough=1" (historically "iommu=pt") on the command line. + + If this mode is not supported by the IOMMU driver, the effective + runtime default will fall back to IOMMU_DEFAULT_DMA_STRICT. 
endchoice diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 66e0e9941798..daab3b55b14f 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -32,7 +32,7 @@ static struct kset *iommu_group_kset; static DEFINE_IDA(iommu_group_ida); static unsigned int iommu_def_domain_type __read_mostly; -static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_STRICT); +static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_DMA_STRICT); static u32 iommu_cmd_line __read_mostly; struct iommu_group { -- Gitee From 03eaee0b9e0e751097466d3cc454abbe99ae6cb7 Mon Sep 17 00:00:00 2001 From: Sai Praneeth Prakhya Date: Tue, 24 Nov 2020 21:06:02 +0800 Subject: [PATCH 1391/2161] iommu: Add support to change default domain of an iommu group commit 08a27c1c3ecf5e1da193ce5f8fc97c3be16e75f0 upstream. Presently, the default domain of an iommu group is allocated during boot time and it cannot be changed later. So, the device would typically be either in identity (also known as pass_through) mode or the device would be in DMA mode as long as the machine is up and running. There is no way to change the default domain type dynamically i.e. after booting, a device cannot switch between identity mode and DMA mode. But, assume a use case wherein the user trusts the device and believes that the OS is secure enough and hence wants *only* this device to bypass IOMMU (so that it could be high performing) whereas all the other devices to go through IOMMU (so that the system is protected). Presently, this use case is not supported. It will be helpful if there is some way to change the default domain of an iommu group dynamically. Hence, add such support. A privileged user could request the kernel to change the default domain type of a iommu group by writing to "/sys/kernel/iommu_groups//type" file. Presently, only three values are supported 1. identity: all the DMA transactions from the device in this group are *not* translated by the iommu 2. DMA: all the DMA transactions from the device in this group are translated by the iommu 3. auto: change to the type the device was booted with Note: 1. Default domain of an iommu group with two or more devices cannot be changed. 2. The device in the iommu group shouldn't be bound to any driver. 3. The device shouldn't be assigned to user for direct access. 4. The change request will fail if any device in the group has a mandatory default domain type and the requested one conflicts with that. Please see "Documentation/ABI/testing/sysfs-kernel-iommu_groups" for more information. 
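The request is honoured only under narrow preconditions (a single-device group whose device has no driver bound); a compact sketch of those checks, using the same helpers the hunk below calls inside drivers/iommu/iommu.c (the wrapper function itself is illustrative, not part of the patch):

    #include <linux/device.h>
    #include <linux/iommu.h>

    /* Illustrative wrapper around the checks performed in the sysfs path. */
    static bool group_may_change_def_domain(struct iommu_group *group,
                                            struct device *dev)
    {
            if (iommu_group_device_count(group) != 1)
                    return false;   /* multi-device groups are not supported */
            if (device_is_bound(dev))
                    return false;   /* the driver must be unbound first */
            return true;
    }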
Signed-off-by: Sai Praneeth Prakhya Signed-off-by: Lu Baolu Cc: Christoph Hellwig Cc: Joerg Roedel Cc: Ashok Raj Cc: Will Deacon Cc: Sohil Mehta Cc: Robin Murphy Cc: Jacob Pan Link: https://lore.kernel.org/r/20201124130604.2912899-3-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 230 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 229 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index daab3b55b14f..c8b9e7dd6690 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -87,6 +87,8 @@ static void __iommu_detach_group(struct iommu_domain *domain, static int iommu_create_device_direct_mappings(struct iommu_group *group, struct device *dev); static struct iommu_group *iommu_group_get_for_dev(struct device *dev); +static ssize_t iommu_group_store_type(struct iommu_group *group, + const char *buf, size_t count); #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \ struct iommu_group_attribute iommu_group_attr_##_name = \ @@ -546,7 +548,8 @@ static IOMMU_GROUP_ATTR(name, S_IRUGO, iommu_group_show_name, NULL); static IOMMU_GROUP_ATTR(reserved_regions, 0444, iommu_group_show_resv_regions, NULL); -static IOMMU_GROUP_ATTR(type, 0444, iommu_group_show_type, NULL); +static IOMMU_GROUP_ATTR(type, 0644, iommu_group_show_type, + iommu_group_store_type); static void iommu_group_release(struct kobject *kobj) { @@ -3091,3 +3094,228 @@ u32 iommu_sva_get_pasid(struct iommu_sva *handle) return ops->sva_get_pasid(handle); } EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); + +/* + * Changes the default domain of an iommu group that has *only* one device + * + * @group: The group for which the default domain should be changed + * @prev_dev: The device in the group (this is used to make sure that the device + * hasn't changed after the caller has called this function) + * @type: The type of the new default domain that gets associated with the group + * + * Returns 0 on success and error code on failure + * + * Note: + * 1. Presently, this function is called only when user requests to change the + * group's default domain type through /sys/kernel/iommu_groups//type + * Please take a closer look if intended to use for other purposes. + */ +static int iommu_change_dev_def_domain(struct iommu_group *group, + struct device *prev_dev, int type) +{ + struct iommu_domain *prev_dom; + struct group_device *grp_dev; + int ret, dev_def_dom; + struct device *dev; + + if (!group) + return -EINVAL; + + mutex_lock(&group->mutex); + + if (group->default_domain != group->domain) { + dev_err_ratelimited(prev_dev, "Group not assigned to default domain\n"); + ret = -EBUSY; + goto out; + } + + /* + * iommu group wasn't locked while acquiring device lock in + * iommu_group_store_type(). So, make sure that the device count hasn't + * changed while acquiring device lock. + * + * Changing default domain of an iommu group with two or more devices + * isn't supported because there could be a potential deadlock. Consider + * the following scenario. T1 is trying to acquire device locks of all + * the devices in the group and before it could acquire all of them, + * there could be another thread T2 (from different sub-system and use + * case) that has already acquired some of the device locks and might be + * waiting for T1 to release other device locks. 
+ */ + if (iommu_group_device_count(group) != 1) { + dev_err_ratelimited(prev_dev, "Cannot change default domain: Group has more than one device\n"); + ret = -EINVAL; + goto out; + } + + /* Since group has only one device */ + grp_dev = list_first_entry(&group->devices, struct group_device, list); + dev = grp_dev->dev; + + if (prev_dev != dev) { + dev_err_ratelimited(prev_dev, "Cannot change default domain: Device has been changed\n"); + ret = -EBUSY; + goto out; + } + + prev_dom = group->default_domain; + if (!prev_dom) { + ret = -EINVAL; + goto out; + } + + dev_def_dom = iommu_get_def_domain_type(dev); + if (!type) { + /* + * If the user hasn't requested any specific type of domain and + * if the device supports both the domains, then default to the + * domain the device was booted with + */ + type = dev_def_dom ? : iommu_def_domain_type; + } else if (dev_def_dom && type != dev_def_dom) { + dev_err_ratelimited(prev_dev, "Device cannot be in %s domain\n", + iommu_domain_type_str(type)); + ret = -EINVAL; + goto out; + } + + /* + * Switch to a new domain only if the requested domain type is different + * from the existing default domain type + */ + if (prev_dom->type == type) { + ret = 0; + goto out; + } + + /* Sets group->default_domain to the newly allocated domain */ + ret = iommu_group_alloc_default_domain(dev->bus, group, type); + if (ret) + goto out; + + ret = iommu_create_device_direct_mappings(group, dev); + if (ret) + goto free_new_domain; + + ret = __iommu_attach_device(group->default_domain, dev); + if (ret) + goto free_new_domain; + + group->domain = group->default_domain; + + /* + * Release the mutex here because ops->probe_finalize() call-back of + * some vendor IOMMU drivers calls arm_iommu_attach_device() which + * in-turn might call back into IOMMU core code, where it tries to take + * group->mutex, resulting in a deadlock. + */ + mutex_unlock(&group->mutex); + + /* Make sure dma_ops is appropriatley set */ + iommu_group_do_probe_finalize(dev, group->default_domain); + iommu_domain_free(prev_dom); + return 0; + +free_new_domain: + iommu_domain_free(group->default_domain); + group->default_domain = prev_dom; + group->domain = prev_dom; + +out: + mutex_unlock(&group->mutex); + + return ret; +} + +/* + * Changing the default domain through sysfs requires the users to ubind the + * drivers from the devices in the iommu group. Return failure if this doesn't + * meet. + * + * We need to consider the race between this and the device release path. + * device_lock(dev) is used here to guarantee that the device release path + * will not be entered at the same time. + */ +static ssize_t iommu_group_store_type(struct iommu_group *group, + const char *buf, size_t count) +{ + struct group_device *grp_dev; + struct device *dev; + int ret, req_type; + + if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO)) + return -EACCES; + + if (WARN_ON(!group)) + return -EINVAL; + + if (sysfs_streq(buf, "identity")) + req_type = IOMMU_DOMAIN_IDENTITY; + else if (sysfs_streq(buf, "DMA")) + req_type = IOMMU_DOMAIN_DMA; + else if (sysfs_streq(buf, "auto")) + req_type = 0; + else + return -EINVAL; + + /* + * Lock/Unlock the group mutex here before device lock to + * 1. Make sure that the iommu group has only one device (this is a + * prerequisite for step 2) + * 2. 
Get struct *dev which is needed to lock device + */ + mutex_lock(&group->mutex); + if (iommu_group_device_count(group) != 1) { + mutex_unlock(&group->mutex); + pr_err_ratelimited("Cannot change default domain: Group has more than one device\n"); + return -EINVAL; + } + + /* Since group has only one device */ + grp_dev = list_first_entry(&group->devices, struct group_device, list); + dev = grp_dev->dev; + get_device(dev); + + /* + * Don't hold the group mutex because taking group mutex first and then + * the device lock could potentially cause a deadlock as below. Assume + * two threads T1 and T2. T1 is trying to change default domain of an + * iommu group and T2 is trying to hot unplug a device or release [1] VF + * of a PCIe device which is in the same iommu group. T1 takes group + * mutex and before it could take device lock assume T2 has taken device + * lock and is yet to take group mutex. Now, both the threads will be + * waiting for the other thread to release lock. Below, lock order was + * suggested. + * device_lock(dev); + * mutex_lock(&group->mutex); + * iommu_change_dev_def_domain(); + * mutex_unlock(&group->mutex); + * device_unlock(dev); + * + * [1] Typical device release path + * device_lock() from device/driver core code + * -> bus_notifier() + * -> iommu_bus_notifier() + * -> iommu_release_device() + * -> ops->release_device() vendor driver calls back iommu core code + * -> mutex_lock() from iommu core code + */ + mutex_unlock(&group->mutex); + + /* Check if the device in the group still has a driver bound to it */ + device_lock(dev); + if (device_is_bound(dev)) { + pr_err_ratelimited("Device is still bound to driver\n"); + ret = -EBUSY; + goto out; + } + + ret = iommu_change_dev_def_domain(group, dev, req_type); + ret = ret ?: count; + +out: + device_unlock(dev); + put_device(dev); + + return ret; +} -- Gitee From dfc3fe164c8b47393e5c9360b1b7db12a11f0805 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Thu, 13 May 2021 15:58:15 +0800 Subject: [PATCH 1392/2161] iommu: Delete a duplicate check in iommu_change_dev_def_domain() commit e86b041ffeff554c9458882ebabc2aba5c864186 upstream. Function iommu_group_store_type() is the only caller of the static function iommu_change_dev_def_domain() and has performed "if (WARN_ON(!group))" detection before calling it. So the one here is redundant. Signed-off-by: Zhen Lei Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210513075815.6382-1-thunder.leizhen@huawei.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index c8b9e7dd6690..c54ba0f7aa33 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3118,9 +3118,6 @@ static int iommu_change_dev_def_domain(struct iommu_group *group, int ret, dev_def_dom; struct device *dev; - if (!group) - return -EINVAL; - mutex_lock(&group->mutex); if (group->default_domain != group->domain) { -- Gitee From 398cd2eca8291da6a64cc1b74f33f7f9a389fef2 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 19 Jul 2022 16:53:16 +0800 Subject: [PATCH 1393/2161] iommu: Restore intel_iommu_strict and remove iommu_set_dma_strict() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. 
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu_init.c | 3 ++- drivers/iommu/intel/iommu.c | 18 ++++-------------- include/linux/iommu.h | 2 -- 3 files changed, 6 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 52fbe315cdec..7f964b8afddb 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -157,6 +157,7 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ +bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ @@ -3012,7 +3013,7 @@ static int __init parse_amd_iommu_options(char *str) for (; *str; ++str) { if (strncmp(str, "fullflush", 9) == 0) { pr_warn("amd_iommu=fullflush deprecated; use iommu.strict=1 instead\n"); - iommu_set_dma_strict(); + amd_iommu_unmap_flush = true; } if (strncmp(str, "off", 3) == 0) amd_iommu_disabled = true; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2f3b2033586d..dc5bb532a9b7 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -338,6 +338,7 @@ int intel_iommu_enabled = 0; EXPORT_SYMBOL_GPL(intel_iommu_enabled); static int dmar_map_gfx = 1; +static int intel_iommu_strict; static int intel_iommu_superpage = 1; static int iommu_identity_mapping; static int iommu_skip_te_disable; @@ -407,8 +408,8 @@ static int __init intel_iommu_setup(char *str) pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); iommu_dma_forcedac = true; } else if (!strncmp(str, "strict", 6)) { - pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); - iommu_set_dma_strict(); + pr_info("Disable batched IOTLB flush\n"); + intel_iommu_strict = 1; } else if (!strncmp(str, "sp_off", 6)) { pr_info("Disable supported super page\n"); intel_iommu_superpage = 0; @@ -4470,17 +4471,6 @@ int __init intel_iommu_init(void) down_read(&dmar_global_lock); for_each_active_iommu(iommu, drhd) { - /* - * The flush queue implementation does not perform - * page-selective invalidations that are required for efficient - * TLB flushes in virtual environments. The benefit of batching - * is likely to be much lower than the overhead of synchronizing - * the virtual and physical IOMMU page-tables. 
- */ - if (cap_caching_mode(iommu->cap)) { - pr_info_once("IOMMU batching disallowed due to virtualization\n"); - iommu_set_dma_strict(); - } iommu_device_sysfs_add(&iommu->iommu, NULL, intel_iommu_groups, "%s", iommu->name); @@ -5788,7 +5778,7 @@ static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) } else if (dmar_map_gfx) { /* we have to ensure the gfx device is idle before we flush */ pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); - iommu_set_dma_strict(); + intel_iommu_strict = 1; } } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1d78bd135d44..9fc1fc037349 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -551,8 +551,6 @@ extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t offset, u64 size, int prot); -void iommu_set_dma_strict(void); - extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags); -- Gitee From e0c8c6661ed55f91c701da0548c9b340d66c1b78 Mon Sep 17 00:00:00 2001 From: Sai Praneeth Prakhya Date: Tue, 24 Nov 2020 21:06:04 +0800 Subject: [PATCH 1394/2161] iommu: Document usage of "/sys/kernel/iommu_groups//type" file commit 63a816749d8670e4a92da5aecfb91238821a3d97 upstream. The default domain type of an iommu group can be changed by writing to "/sys/kernel/iommu_groups//type" file. Hence, document it's usage and more importantly spell out its limitations. Signed-off-by: Sai Praneeth Prakhya Signed-off-by: Lu Baolu Cc: Christoph Hellwig Cc: Joerg Roedel Cc: Ashok Raj Cc: Will Deacon Cc: Sohil Mehta Cc: Robin Murphy Cc: Jacob Pan Link: https://lore.kernel.org/r/20201124130604.2912899-5-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../ABI/testing/sysfs-kernel-iommu_groups | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-iommu_groups b/Documentation/ABI/testing/sysfs-kernel-iommu_groups index 017f5bc3920c..407b1628d7fd 100644 --- a/Documentation/ABI/testing/sysfs-kernel-iommu_groups +++ b/Documentation/ABI/testing/sysfs-kernel-iommu_groups @@ -33,3 +33,32 @@ Description: In case an RMRR is used only by graphics or USB devices it is now exposed as "direct-relaxable" instead of "direct". In device assignment use case, for instance, those RMRR are considered to be relaxable and safe. + +What: /sys/kernel/iommu_groups//type +Date: November 2020 +KernelVersion: v5.11 +Contact: Sai Praneeth Prakhya +Description: /sys/kernel/iommu_groups//type shows the type of default + domain in use by iommu for this group. See include/linux/iommu.h + for possible values. A privileged user could request kernel to + change the group type by writing to this file. Presently, only + three types of request are supported: + 1. DMA: All the DMA transactions from the device in this group + are translated by the iommu. + 2. identity: All the DMA transactions from the device in this + group are *not* translated by the iommu. + 3. auto: Change to the type the device was booted with. + Note: + ----- + The default domain type of a group may be modified only when + 1. The group has *only* one device + 2. The device in the group is not bound to any device driver. + So, the users must unbind the appropriate driver before + changing the default domain type. 
+ Caution: + -------- + Unbinding a device driver will take away the driver's control + over the device and if done on devices that host root file + system could lead to catastrophic effects (the users might + need to reboot the machine to get it to normal state). So, it's + expected that the users understand what they're doing. -- Gitee From 5c3f6ec2811fc42c15dcfa606f5358f0b532ba0c Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 26 Nov 2020 17:06:03 +0800 Subject: [PATCH 1395/2161] iommu: Fix htmldocs warnings in sysfs-kernel-iommu_groups commit 62c9917d9c1041ba175ccf1bc4c010efc0188a2e upstream. Below warnings are fixed: Documentation/ABI/testing/sysfs-kernel-iommu_groups:38: WARNING: Unexpected indentation. Documentation/ABI/testing/sysfs-kernel-iommu_groups:38: WARNING: Block quote ends without a blank line; unexpected unindent. Documentation/ABI/testing/sysfs-kernel-iommu_groups:38: WARNING: Enumerated list ends without a blank line; unexpected unindent. Documentation/ABI/testing/sysfs-kernel-iommu_groups:38: WARNING: Unexpected indentation. Documentation/ABI/testing/sysfs-kernel-iommu_groups:38: WARNING: Block quote ends without a blank line; unexpected unindent. Fixes: 63a816749d86 ("iommu: Document usage of "/sys/kernel/iommu_groups//type" file") Reported-by: Stephen Rothwell Link: https://lore.kernel.org/linux-next/20201126174851.200e0e58@canb.auug.org.au/ Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20201126090603.1511589-1-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../ABI/testing/sysfs-kernel-iommu_groups | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-iommu_groups b/Documentation/ABI/testing/sysfs-kernel-iommu_groups index 407b1628d7fd..0fedbb0f94e4 100644 --- a/Documentation/ABI/testing/sysfs-kernel-iommu_groups +++ b/Documentation/ABI/testing/sysfs-kernel-iommu_groups @@ -40,23 +40,24 @@ KernelVersion: v5.11 Contact: Sai Praneeth Prakhya Description: /sys/kernel/iommu_groups//type shows the type of default domain in use by iommu for this group. See include/linux/iommu.h - for possible values. A privileged user could request kernel to - change the group type by writing to this file. Presently, only - three types of request are supported: - 1. DMA: All the DMA transactions from the device in this group - are translated by the iommu. - 2. identity: All the DMA transactions from the device in this - group are *not* translated by the iommu. - 3. auto: Change to the type the device was booted with. - Note: - ----- + for possible read values. A privileged user could request kernel to + change the group type by writing to this file. Valid write values: + + ======== ====================================================== + DMA All the DMA transactions from the device in this group + are translated by the iommu. + identity All the DMA transactions from the device in this group + are not translated by the iommu. + auto Change to the type the device was booted with. + ======== ====================================================== + The default domain type of a group may be modified only when - 1. The group has *only* one device - 2. The device in the group is not bound to any device driver. - So, the users must unbind the appropriate driver before - changing the default domain type. - Caution: - -------- + + - The group has only one device. + - The device in the group is not bound to any device driver. 
+ So, the users must unbind the appropriate driver before + changing the default domain type. + Unbinding a device driver will take away the driver's control over the device and if done on devices that host root file system could lead to catastrophic effects (the users might -- Gitee From 48aec41349d601ca8a1769eb96adff9b5a11e455 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 19 May 2021 10:51:44 +0200 Subject: [PATCH 1396/2161] docs: ABI: iommu: remove duplicated definition for sysfs-kernel-iommu_groups commit 5286bd25e2095d71d248fbfd2a55ca4333dfd77e upstream. ./scripts/get_abi.pl is reporting a duplicated definition for /sys/kernel/iommu_groups/reserved_regions, both at the same file: Warning: /sys/kernel/iommu_groups/reserved_regions is defined 2 times: Documentation/ABI/testing/sysfs-kernel-iommu_groups:15 Documentation/ABI/testing/sysfs-kernel-iommu_groups:27 Fix it by merging those into an unified entry. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/ec33e8e9b8f120232ffb3b9fcc99c97b87f242e3.1621413933.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/ABI/testing/sysfs-kernel-iommu_groups | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-iommu_groups b/Documentation/ABI/testing/sysfs-kernel-iommu_groups index 0fedbb0f94e4..eae2f1c1e11e 100644 --- a/Documentation/ABI/testing/sysfs-kernel-iommu_groups +++ b/Documentation/ABI/testing/sysfs-kernel-iommu_groups @@ -25,14 +25,10 @@ Description: /sys/kernel/iommu_groups/reserved_regions list IOVA the base IOVA, the second is the end IOVA and the third field describes the type of the region. -What: /sys/kernel/iommu_groups/reserved_regions -Date: June 2019 -KernelVersion: v5.3 -Contact: Eric Auger -Description: In case an RMRR is used only by graphics or USB devices - it is now exposed as "direct-relaxable" instead of "direct". - In device assignment use case, for instance, those RMRR - are considered to be relaxable and safe. + Since kernel 5.3, in case an RMRR is used only by graphics or + USB devices it is now exposed as "direct-relaxable" instead + of "direct". In device assignment use case, for instance, + those RMRR are considered to be relaxable and safe. What: /sys/kernel/iommu_groups//type Date: November 2020 -- Gitee From 58ba645ada3371cbdaf61f7dec9f2751201a93ed Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:35 +0100 Subject: [PATCH 1397/2161] iommu: Expose DMA domain strictness via sysfs commit 26225bea1d84b090b378838a1797daec62d4ad0e upstream. The sysfs interface for default domain types exists primarily so users can choose the performance/security tradeoff relevant to their own workload. As such, the choice between the policies for DMA domains fits perfectly as an additional point on that scale - downgrading a particular device from a strict default to non-strict may be enough to let it reach the desired level of performance, while still retaining more peace of mind than with a wide-open identity domain. Now that we've abstracted non-strict mode as a distinct type of DMA domain, allow it to be chosen through the user interface as well. 
Reviewed-by: Lu Baolu Reviewed-by: John Garry Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/0e08da5ed4069fd3473cfbadda758ca983becdbf.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/ABI/testing/sysfs-kernel-iommu_groups | 6 +++++- drivers/iommu/iommu.c | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-iommu_groups b/Documentation/ABI/testing/sysfs-kernel-iommu_groups index eae2f1c1e11e..b15af6a5bc08 100644 --- a/Documentation/ABI/testing/sysfs-kernel-iommu_groups +++ b/Documentation/ABI/testing/sysfs-kernel-iommu_groups @@ -42,8 +42,12 @@ Description: /sys/kernel/iommu_groups//type shows the type of default ======== ====================================================== DMA All the DMA transactions from the device in this group are translated by the iommu. + DMA-FQ As above, but using batched invalidation to lazily + remove translations after use. This may offer reduced + overhead at the cost of reduced memory protection. identity All the DMA transactions from the device in this group - are not translated by the iommu. + are not translated by the iommu. Maximum performance + but zero protection. auto Change to the type the device was booted with. ======== ====================================================== diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index c54ba0f7aa33..53f2f68255d0 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3250,6 +3250,8 @@ static ssize_t iommu_group_store_type(struct iommu_group *group, req_type = IOMMU_DOMAIN_IDENTITY; else if (sysfs_streq(buf, "DMA")) req_type = IOMMU_DOMAIN_DMA; + else if (sysfs_streq(buf, "DMA-FQ")) + req_type = IOMMU_DOMAIN_DMA_FQ; else if (sysfs_streq(buf, "auto")) req_type = 0; else -- Gitee From 03623dffede7f1b824c9048d5416fc5b8f7ec18b Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:38 +0100 Subject: [PATCH 1398/2161] iommu: Allow enabling non-strict mode dynamically commit 452e69b58c2889e5546edb92d9e66285410f7463 upstream. Allocating and enabling a flush queue is in fact something we can reasonably do while a DMA domain is active, without having to rebuild it from scratch. Thus we can allow a strict -> non-strict transition from sysfs without requiring to unbind the device's driver, which is of particular interest to users who want to make selective relaxations to critical devices like the one serving their root filesystem. Disabling and draining a queue also seems technically possible to achieve without rebuilding the whole domain, but would certainly be more involved. Furthermore there's not such a clear use-case for tightening up security *after* the device may already have done whatever it is that you don't trust it not to do, so we only consider the relaxation case. 
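As a reading aid (a condensed sketch of the hunks that follow, not a verbatim
quote), the live strict -> non-strict transition reduces to a publish/consume
pairing: the flush queue is fully initialised before the domain's fq_domain
pointer is published, and each unmap samples that pointer once to decide
whether it may defer the IOTLB flush. Here flush_cb and entry_dtor stand in
for the existing iommu_dma_flush_iotlb_all and iommu_dma_entry_dtor callbacks.

    /* writer side: sysfs-triggered upgrade of a live DMA domain */
    if (!init_iova_flush_queue(&cookie->iovad, flush_cb, entry_dtor)) {
            smp_wmb();                             /* order queue setup before publication */
            WRITE_ONCE(cookie->fq_domain, domain); /* readers may now queue freed IOVAs */
    }

    /* reader side: __iommu_dma_unmap() decides once per unmap */
    iotlb_gather.queued = READ_ONCE(cookie->fq_domain);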
Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/d652966348c78457c38bf18daf369272a4ebc2c9.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 47 ++++++++++++++++++++++++++------------- drivers/iommu/iommu.c | 17 ++++++++++---- drivers/iommu/iova.c | 11 ++++----- include/linux/dma-iommu.h | 6 +++++ 4 files changed, 57 insertions(+), 24 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index bf32ebb14431..33762aaa6e76 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -316,6 +316,30 @@ static bool dev_is_untrusted(struct device *dev) return dev_is_pci(dev) && to_pci_dev(dev)->untrusted; } +/* sysfs updates are serialised by the mutex of the group owning @domain */ +int iommu_dma_init_fq(struct iommu_domain *domain) +{ + struct iommu_dma_cookie *cookie = domain->iova_cookie; + int ret; + + if (cookie->fq_domain) + return 0; + + ret = init_iova_flush_queue(&cookie->iovad, iommu_dma_flush_iotlb_all, + iommu_dma_entry_dtor); + if (ret) { + pr_warn("iova flush queue initialization failed\n"); + return ret; + } + /* + * Prevent incomplete iovad->fq being observable. Pairs with path from + * __iommu_dma_unmap() through iommu_dma_free_iova() to queue_iova() + */ + smp_wmb(); + WRITE_ONCE(cookie->fq_domain, domain); + return 0; +} + /** * iommu_dma_init_domain - Initialise a DMA mapping domain * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() @@ -370,15 +394,8 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, init_iova_domain(iovad, 1UL << order, base_pfn); /* If the FQ fails we can simply fall back to strict mode */ - if (domain->type == IOMMU_DOMAIN_DMA_FQ && !cookie->fq_domain) { - if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all, - iommu_dma_entry_dtor)) { - pr_warn("iova flush queue initialization failed\n"); - domain->type = IOMMU_DOMAIN_DMA; - } else { - cookie->fq_domain = domain; - } - } + if (domain->type == IOMMU_DOMAIN_DMA_FQ && iommu_dma_init_fq(domain)) + domain->type = IOMMU_DOMAIN_DMA; return iova_reserve_iommu_regions(dev, domain); } @@ -453,17 +470,17 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, } static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie, - dma_addr_t iova, size_t size, struct page *freelist) + dma_addr_t iova, size_t size, struct iommu_iotlb_gather *gather) { struct iova_domain *iovad = &cookie->iovad; /* The MSI case is only ever cleaning up its most recent allocation */ if (cookie->type == IOMMU_DMA_MSI_COOKIE) cookie->msi_iova -= size; - else if (cookie->fq_domain) /* non-strict mode */ + else if (gather && gather->queued) queue_iova(iovad, iova_pfn(iovad, iova), size >> iova_shift(iovad), - (unsigned long)freelist); + (unsigned long)gather->freelist); else free_iova_fast(iovad, iova_pfn(iovad, iova), size >> iova_shift(iovad)); @@ -482,14 +499,14 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, dma_addr -= iova_off; size = iova_align(iovad, size + iova_off); iommu_iotlb_gather_init(&iotlb_gather); - iotlb_gather.queued = cookie->fq_domain; + iotlb_gather.queued = READ_ONCE(cookie->fq_domain); unmapped = iommu_unmap_fast(domain, dma_addr, size, &iotlb_gather); WARN_ON(unmapped != size); - if (!cookie->fq_domain) + if (!iotlb_gather.queued) iommu_iotlb_sync(domain, &iotlb_gather); - iommu_dma_free_iova(cookie, dma_addr, size, iotlb_gather.freelist); + iommu_dma_free_iova(cookie, dma_addr, 
size, &iotlb_gather); } static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 53f2f68255d0..1da12f82e838 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3185,6 +3185,14 @@ static int iommu_change_dev_def_domain(struct iommu_group *group, goto out; } + /* We can bring up a flush queue without tearing down the domain */ + if (type == IOMMU_DOMAIN_DMA_FQ && prev_dom->type == IOMMU_DOMAIN_DMA) { + ret = iommu_dma_init_fq(prev_dom); + if (!ret) + prev_dom->type = IOMMU_DOMAIN_DMA_FQ; + goto out; + } + /* Sets group->default_domain to the newly allocated domain */ ret = iommu_group_alloc_default_domain(dev->bus, group, type); if (ret) @@ -3225,9 +3233,9 @@ static int iommu_change_dev_def_domain(struct iommu_group *group, } /* - * Changing the default domain through sysfs requires the users to ubind the - * drivers from the devices in the iommu group. Return failure if this doesn't - * meet. + * Changing the default domain through sysfs requires the users to unbind the + * drivers from the devices in the iommu group, except for a DMA -> DMA-FQ + * transition. Return failure if this isn't met. * * We need to consider the race between this and the device release path. * device_lock(dev) is used here to guarantee that the device release path @@ -3303,7 +3311,8 @@ static ssize_t iommu_group_store_type(struct iommu_group *group, /* Check if the device in the group still has a driver bound to it */ device_lock(dev); - if (device_is_bound(dev)) { + if (device_is_bound(dev) && !(req_type == IOMMU_DOMAIN_DMA_FQ && + group->default_domain->type == IOMMU_DOMAIN_DMA)) { pr_err_ratelimited("Device is still bound to driver\n"); ret = -EBUSY; goto out; diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 8cca16d1361f..8c0813fafd2b 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -121,8 +121,6 @@ int init_iova_flush_queue(struct iova_domain *iovad, spin_lock_init(&fq->lock); } - smp_wmb(); - iovad->fq = queue; timer_setup(&iovad->fq_timer, fq_flush_timeout, 0); @@ -635,17 +633,20 @@ void queue_iova(struct iova_domain *iovad, unsigned long pfn, unsigned long pages, unsigned long data) { - struct iova_fq *fq = raw_cpu_ptr(iovad->fq); + struct iova_fq *fq; unsigned long flags; unsigned idx; /* * Order against the IOMMU driver's pagetable update from unmapping * @pte, to guarantee that iova_domain_flush() observes that if called - * from a different CPU before we release the lock below. + * from a different CPU before we release the lock below. Full barrier + * so it also pairs with iommu_dma_init_fq() to avoid seeing partially + * written fq state here. */ - smp_wmb(); + smp_mb(); + fq = raw_cpu_ptr(iovad->fq); spin_lock_irqsave(&fq->lock, flags); /* diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 758ca4694257..24607dc3c2ac 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -20,6 +20,7 @@ void iommu_put_dma_cookie(struct iommu_domain *domain); /* Setup call for arch DMA mapping code */ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit); +int iommu_dma_init_fq(struct iommu_domain *domain); /* The DMA API isn't _quite_ the whole story, though... 
*/ /* @@ -54,6 +55,11 @@ static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base, { } +static inline int iommu_dma_init_fq(struct iommu_domain *domain) +{ + return -EINVAL; +} + static inline int iommu_get_dma_cookie(struct iommu_domain *domain) { return -ENODEV; -- Gitee From 97c1b8c869946d7fac7f96bcbcd2917006f90af6 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 14 Oct 2021 13:38:33 +0800 Subject: [PATCH 1399/2161] iommu/vt-d: Remove duplicate identity domain flag commit b34380a6d767c54480a937951e6189a7f9699443 upstream. The iommu_domain data structure already has the "type" field to keep the type of a domain. It's unnecessary to have the DOMAIN_FLAG_STATIC_IDENTITY flag in the vt-d implementation. This cleans it up with no functionality change. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210926114535.923263-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20211014053839.727419-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 9 ++++----- include/linux/intel-iommu.h | 3 --- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index dc5bb532a9b7..7abaf0502b18 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -506,7 +506,7 @@ static inline void free_devinfo_mem(void *vaddr) static inline int domain_type_is_si(struct dmar_domain *domain) { - return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY; + return domain->domain.type == IOMMU_DOMAIN_IDENTITY; } static inline bool domain_use_first_level(struct dmar_domain *domain) @@ -1974,7 +1974,7 @@ static bool first_level_by_default(void) return scalable_mode_support() && intel_cap_flts_sanity(); } -static struct dmar_domain *alloc_domain(int flags) +static struct dmar_domain *alloc_domain(unsigned int type) { struct dmar_domain *domain; @@ -1984,7 +1984,6 @@ static struct dmar_domain *alloc_domain(int flags) memset(domain, 0, sizeof(*domain)); domain->nid = NUMA_NO_NODE; - domain->flags = flags; if (first_level_by_default()) domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; domain->has_iotlb_device = false; @@ -2803,7 +2802,7 @@ static int __init si_domain_init(int hw) struct device *dev; int i, nid, ret; - si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY); + si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); if (!si_domain) return -EFAULT; @@ -4608,7 +4607,7 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) case IOMMU_DOMAIN_DMA: case IOMMU_DOMAIN_DMA_FQ: case IOMMU_DOMAIN_UNMANAGED: - dmar_domain = alloc_domain(0); + dmar_domain = alloc_domain(type); if (!dmar_domain) { pr_err("Can't allocate dmar_domain\n"); return NULL; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index f2bdaad1b784..1c9225c94330 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -517,9 +517,6 @@ struct context_entry { u64 hi; }; -/* si_domain contains mulitple devices */ -#define DOMAIN_FLAG_STATIC_IDENTITY BIT(0) - /* * When VT-d works in the scalable mode, it allows DMA translation to * happen through either first level or second level page table. This -- Gitee From 0443fd9db7cbdd68bcd8f4ad57fe110d624406c5 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 14 Oct 2021 13:38:34 +0800 Subject: [PATCH 1400/2161] iommu/vt-d: Check FL and SL capability sanity in scalable mode commit 7afd7f6aa21a2929aff3a059b741933ee1819c6b upstream. 
An iommu domain could be allocated and mapped before it's attached to any device. This requires that in scalable mode, when the domain is allocated, the format (FL or SL) of the page table must be determined. In order to achieve this, the platform should support consistent SL or FL capabilities on all IOMMU's. This adds a check for this and aborts IOMMU probing if it doesn't meet this requirement. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210926114535.923263-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20211014053839.727419-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/cap_audit.c | 13 +++++++++++++ drivers/iommu/intel/cap_audit.h | 1 + 2 files changed, 14 insertions(+) diff --git a/drivers/iommu/intel/cap_audit.c b/drivers/iommu/intel/cap_audit.c index b12e421a2f1a..b39d223926a4 100644 --- a/drivers/iommu/intel/cap_audit.c +++ b/drivers/iommu/intel/cap_audit.c @@ -163,6 +163,14 @@ static int cap_audit_static(struct intel_iommu *iommu, enum cap_audit_type type) check_irq_capabilities(iommu, i); } + /* + * If the system is sane to support scalable mode, either SL or FL + * should be sane. + */ + if (intel_cap_smts_sanity() && + !intel_cap_flts_sanity() && !intel_cap_slts_sanity()) + return -EOPNOTSUPP; + out: rcu_read_unlock(); return 0; @@ -203,3 +211,8 @@ bool intel_cap_flts_sanity(void) { return ecap_flts(intel_iommu_ecap_sanity); } + +bool intel_cap_slts_sanity(void) +{ + return ecap_slts(intel_iommu_ecap_sanity); +} diff --git a/drivers/iommu/intel/cap_audit.h b/drivers/iommu/intel/cap_audit.h index 74cfccae0e81..d07b75938961 100644 --- a/drivers/iommu/intel/cap_audit.h +++ b/drivers/iommu/intel/cap_audit.h @@ -111,6 +111,7 @@ bool intel_cap_smts_sanity(void); bool intel_cap_pasid_sanity(void); bool intel_cap_nest_sanity(void); bool intel_cap_flts_sanity(void); +bool intel_cap_slts_sanity(void); static inline bool scalable_mode_support(void) { -- Gitee From eb8ba865f200c4c949c3b98199baab362c9754ed Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 14 Oct 2021 13:38:35 +0800 Subject: [PATCH 1401/2161] iommu/vt-d: Use second level for GPA->HPA translation commit 032c5ee40e9fc68ed650a3f86f23259376ec93fc upstream. The IOMMU VT-d implementation uses the first level for GPA->HPA translation by default. Although both the first level and the second level could handle the DMA translation, they're different in some way. For example, the second level translation has separate controls for the Access/Dirty page tracking. With the first level translation, there's no such control. On the other hand, the second level translation has the page-level control for forcing snoop, but the first level only has global control with pasid granularity. This uses the second level for GPA->HPA translation so that we can provide a consistent hardware interface for use cases like dirty page tracking for live migration. 
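Restated for clarity (an illustrative sketch, not part of the patch), the
page-table format now chosen by default is:

    /*
     * Sketch of first_level_by_default() after this change, with the
     * capability checks passed in explicitly for readability:
     *
     *   legacy (non-scalable) mode          -> second level
     *   only one format sane (FL xor SL)    -> whichever is sane
     *   both sane, IOMMU_DOMAIN_UNMANAGED   -> second level (the GPA->HPA case)
     *   both sane, any other domain type    -> first level
     */
    static bool first_level_by_default_sketch(unsigned int type, bool scalable,
                                              bool fl_sane, bool sl_sane)
    {
            if (!scalable)
                    return false;
            if (fl_sane ^ sl_sane)
                    return fl_sane;
            return type != IOMMU_DOMAIN_UNMANAGED;
    }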
Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210926114535.923263-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20211014053839.727419-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 7abaf0502b18..9538b2370c01 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1969,9 +1969,18 @@ static void free_dmar_iommu(struct intel_iommu *iommu) * Check and return whether first level is used by default for * DMA translation. */ -static bool first_level_by_default(void) +static bool first_level_by_default(unsigned int type) { - return scalable_mode_support() && intel_cap_flts_sanity(); + /* Only SL is available in legacy mode */ + if (!scalable_mode_support()) + return false; + + /* Only level (either FL or SL) is available, just use it */ + if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) + return intel_cap_flts_sanity(); + + /* Both levels are available, decide it based on domain type */ + return type != IOMMU_DOMAIN_UNMANAGED; } static struct dmar_domain *alloc_domain(unsigned int type) @@ -1984,7 +1993,7 @@ static struct dmar_domain *alloc_domain(unsigned int type) memset(domain, 0, sizeof(*domain)); domain->nid = NUMA_NO_NODE; - if (first_level_by_default()) + if (first_level_by_default(type)) domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; domain->has_iotlb_device = false; INIT_LIST_HEAD(&domain->devices); -- Gitee From 1d80ac83027e6d6946f0f2a489793e00ca104896 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 30 May 2022 15:22:12 +0800 Subject: [PATCH 1402/2161] iommu/vt-d: Clean up unused PASID updating functions commit 00ecd5401349a5e23d2ad768a23b110030165fac upstream. update_pasid() and its call chain are currently unused in the tree because Thomas disabled the ENQCMD feature. The feature will be re-enabled shortly using a different approach and update_pasid() and its call chain will not be used in the new approach. Remove the useless functions. 
Signed-off-by: Fenghua Yu Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/20210920192349.2602141-1-fenghua.yu@intel.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20211014053839.727419-8-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 2 -- drivers/iommu/intel/svm.c | 24 +----------------------- 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index febe35e181c2..ad26131af8bb 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -96,8 +96,6 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); */ #define PASID_DISABLED 0 -static inline void update_pasid(void) { } - /* Trap handling */ extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern void fpu_sync_fpstate(struct fpu *fpu); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 82fec4389dd5..11dd207ed230 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -505,21 +505,6 @@ int intel_svm_unbind_gpasid(struct device *dev, u32 pasid) return ret; } -static void _load_pasid(void *unused) -{ - update_pasid(); -} - -static void load_pasid(struct mm_struct *mm, u32 pasid) -{ - mutex_lock(&mm->context.lock); - - /* Update PASID MSR on all CPUs running the mm's tasks. */ - on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true); - - mutex_unlock(&mm->context.lock); -} - static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm, unsigned int flags) { @@ -609,10 +594,6 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, if (ret) goto free_sdev; - /* The newly allocated pasid is loaded to the mm. */ - if (!(flags & SVM_FLAG_SUPERVISOR_MODE) && list_empty(&svm->devs)) - load_pasid(mm, svm->pasid); - list_add_rcu(&sdev->list, &svm->devs); success: return &sdev->sva; @@ -665,11 +646,8 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { - if (svm->notifier.ops) { + if (svm->notifier.ops) mmu_notifier_unregister(&svm->notifier, mm); - /* Clear mm's pasid. */ - load_pasid(mm, PASID_DISABLED); - } pasid_private_remove(svm->pasid); /* We mandate that no page faults may be outstanding * for the PASID when intel_svm_unbind_mm() is called. -- Gitee From 5efa473bf906dbe17a6bb71b3fec807263caa075 Mon Sep 17 00:00:00 2001 From: "Longpeng(Mike)" Date: Thu, 14 Oct 2021 13:38:38 +0800 Subject: [PATCH 1403/2161] iommu/vt-d: Convert the return type of first_pte_in_page to bool commit 37c8041a818dec2cea22ea48a3f90d512a9e1fcd upstream. The first_pte_in_page() returns true or false, so let's convert its return type to bool. In addition, use 'IS_ALIGNED' to make the code more readable and neater. 
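For reference (an equivalence note, not part of the patch, and assuming the
usual definition of VTD_PAGE_MASK as the complement of VTD_PAGE_SIZE - 1):
the two forms test the same condition, namely that the PTE pointer has no
offset bits set within its page.

    /* before: offset bits of the PTE address must all be zero */
    return !((unsigned long)pte & ~VTD_PAGE_MASK);

    /* after: the same test, spelled with the generic helper */
    return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE);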
Signed-off-by: Longpeng(Mike) Link: https://lore.kernel.org/r/20211008000433.1115-1-longpeng2@huawei.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20211014053839.727419-9-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/intel-iommu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 1c9225c94330..30f03b0b27a6 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -704,9 +704,9 @@ static inline bool dma_pte_superpage(struct dma_pte *pte) return (pte->val & DMA_PTE_LARGE_PAGE); } -static inline int first_pte_in_page(struct dma_pte *pte) +static inline bool first_pte_in_page(struct dma_pte *pte) { - return !((unsigned long)pte & ~VTD_PAGE_MASK); + return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE); } extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev); -- Gitee From 7bb1cc115955b613b03ad5053afa9ad170a159b0 Mon Sep 17 00:00:00 2001 From: "Longpeng(Mike)" Date: Thu, 14 Oct 2021 13:38:39 +0800 Subject: [PATCH 1404/2161] iommu/vt-d: Avoid duplicate removing in __domain_mapping() commit 9906b9352a35427508d974db3a496eb3c49e2010 upstream. The __domain_mapping() always removes the pages in the range from 'iov_pfn' to 'end_pfn', but the 'end_pfn' is always the last pfn of the range that the caller wants to map. This would introduce too many duplicated removing and leads the map operation take too long, for example: Map iova=0x100000,nr_pages=0x7d61800 iov_pfn: 0x100000, end_pfn: 0x7e617ff iov_pfn: 0x140000, end_pfn: 0x7e617ff iov_pfn: 0x180000, end_pfn: 0x7e617ff iov_pfn: 0x1c0000, end_pfn: 0x7e617ff iov_pfn: 0x200000, end_pfn: 0x7e617ff ... it takes about 50ms in total. We can reduce the cost by recalculate the 'end_pfn' and limit it to the boundary of the end of this pte page. Map iova=0x100000,nr_pages=0x7d61800 iov_pfn: 0x100000, end_pfn: 0x13ffff iov_pfn: 0x140000, end_pfn: 0x17ffff iov_pfn: 0x180000, end_pfn: 0x1bffff iov_pfn: 0x1c0000, end_pfn: 0x1fffff iov_pfn: 0x200000, end_pfn: 0x23ffff ... it only need 9ms now. This also removes a meaningless BUG_ON() in __domain_mapping(). 
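A worked example of the new bound (illustrative numbers chosen to match the
trace above): with 2 MiB superpages, lvl_pages is 512 and a freshly started
PTE page holds 512 entries, so the first iteration clears at most
512 * 512 = 0x40000 pages (the 0x100000 -> 0x13ffff step shown) rather than
sweeping to 0x7e617ff every time. A self-contained sketch of the calculation:

    /* sketch: pages one iteration may clear, bounded by the current PTE page */
    static unsigned long pages_to_remove_sketch(unsigned long nr_pages,
                                                unsigned long nr_pte_left, /* PTEs left in this PTE page */
                                                unsigned long lvl_pages)   /* pages covered per PTE */
    {
            unsigned long cap = nr_pte_left * lvl_pages;

            return nr_pages < cap ? nr_pages : cap;
    }

    /*
     * e.g. nr_pages = 0x7d61800, nr_pte_left = 512, lvl_pages = 512
     *      -> 0x40000 pages, so end_pfn = 0x100000 + 0x40000 - 1 = 0x13ffff
     */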
Signed-off-by: Longpeng(Mike) Tested-by: Liujunjie Link: https://lore.kernel.org/r/20211008000433.1115-1-longpeng2@huawei.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20211014053839.727419-10-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 11 ++++++----- include/linux/intel-iommu.h | 6 ++++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 9538b2370c01..e417663e0afb 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2457,12 +2457,17 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, return -ENOMEM; first_pte = pte; + lvl_pages = lvl_to_nr_pages(largepage_lvl); + /* It is large page*/ if (largepage_lvl > 1) { unsigned long end_pfn; + unsigned long pages_to_remove; pteval |= DMA_PTE_LARGE_PAGE; - end_pfn = ((iov_pfn + nr_pages) & level_mask(largepage_lvl)) - 1; + pages_to_remove = min_t(unsigned long, nr_pages, + nr_pte_to_next_page(pte) * lvl_pages); + end_pfn = iov_pfn + pages_to_remove - 1; switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); } else { pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; @@ -2484,10 +2489,6 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, WARN_ON(1); } - lvl_pages = lvl_to_nr_pages(largepage_lvl); - - BUG_ON(nr_pages < lvl_pages); - nr_pages -= lvl_pages; iov_pfn += lvl_pages; phys_pfn += lvl_pages; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 30f03b0b27a6..728f127f3386 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -709,6 +709,12 @@ static inline bool first_pte_in_page(struct dma_pte *pte) return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE); } +static inline int nr_pte_to_next_page(struct dma_pte *pte) +{ + return first_pte_in_page(pte) ? BIT_ULL(VTD_STRIDE_SHIFT) : + (struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte; +} + extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev); extern int dmar_find_matched_atsr_unit(struct pci_dev *dev); -- Gitee From ac67798fea2004b95d9b397d086df81baaaf8f2c Mon Sep 17 00:00:00 2001 From: Jianxiong Gao Date: Mon, 1 Feb 2021 10:30:15 -0800 Subject: [PATCH 1405/2161] driver core: add a min_align_mask field to struct device_dma_parameters commit 36950f2da1ea4cb683be174f6f581e25b2d33e71 upstream. Some devices rely on the address offset in a page to function correctly (NVMe driver as an example). These devices may use a different page size than the Linux kernel. The address offset has to be preserved upon mapping, and in order to do so, we need to record the page_offset_mask first. Signed-off-by: Jianxiong Gao Signed-off-by: Christoph Hellwig Acked-by: Greg Kroah-Hartman Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/device.h | 1 + include/linux/dma-mapping.h | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/linux/device.h b/include/linux/device.h index 9264517f5095..8ec5dc0b3cbd 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -995,6 +995,7 @@ struct device_dma_parameters { * sg limitations. 
*/ unsigned int max_segment_size; + unsigned int min_align_mask; unsigned long segment_boundary_mask; }; diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 590349b6feb9..177a3efe4dff 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -629,6 +629,22 @@ static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask) return -EIO; } +static inline unsigned int dma_get_min_align_mask(struct device *dev) +{ + if (dev->dma_parms) + return dev->dma_parms->min_align_mask; + return 0; +} + +static inline int dma_set_min_align_mask(struct device *dev, + unsigned int min_align_mask) +{ + if (WARN_ON_ONCE(!dev->dma_parms)) + return -EIO; + dev->dma_parms->min_align_mask = min_align_mask; + return 0; +} + static inline int dma_get_cache_alignment(void) { #ifdef ARCH_DMA_MINALIGN -- Gitee From d7b60813ae34d3b480061c8b1ddb6a9dd8485b6d Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Thu, 10 Dec 2020 01:25:15 +0000 Subject: [PATCH 1406/2161] x86,swiotlb: Adjust SWIOTLB bounce buffer size for SEV guests commit e998879d4fb7991856916972168cf27c0d86ed12 upstream. For SEV, all DMA to and from guest has to use shared (un-encrypted) pages. SEV uses SWIOTLB to make this happen without requiring changes to device drivers. However, depending on the workload being run, the default 64MB of it might not be enough and it may run out of buffers to use for DMA, resulting in I/O errors and/or performance degradation for high I/O workloads. Adjust the default size of SWIOTLB for SEV guests using a percentage of the total memory available to guest for the SWIOTLB buffers. Adds a new sev_setup_arch() function which is invoked from setup_arch() and it calls into a new swiotlb generic code function swiotlb_adjust_size() to do the SWIOTLB buffer adjustment. 
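To make the sizing rule concrete (illustrative arithmetic only): 6% of a 1 GiB
guest is roughly 61 MiB, which is rounded up to the 64 MiB floor, while 6% of
a 32 GiB guest is roughly 1.9 GiB, which is capped at 1 GiB so the buffers stay
within low memory. A sketch of the clamp:

    /* sketch of the SEV SWIOTLB sizing policy described above */
    static unsigned long long sev_swiotlb_size_sketch(unsigned long long total_mem)
    {
            unsigned long long size = total_mem * 6 / 100;

            if (size < (64ULL << 20))       /* IO_TLB_DEFAULT_SIZE */
                    size = 64ULL << 20;
            if (size > (1ULL << 30))        /* SZ_1G */
                    size = 1ULL << 30;
            return size;
    }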
v5 fixed build errors and warnings as Reported-by: kbuild test robot Signed-off-by: Ashish Kalra Co-developed-by: Borislav Petkov Signed-off-by: Borislav Petkov Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/mem_encrypt.h | 2 ++ arch/x86/kernel/setup.c | 6 ++++++ arch/x86/mm/mem_encrypt.c | 31 ++++++++++++++++++++++++++++++ include/linux/swiotlb.h | 8 ++++++++ kernel/dma/swiotlb.c | 20 +++++++++++++++++-- 5 files changed, 65 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 848ce43b9040..6c36956452ca 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -36,6 +36,7 @@ void __init sme_map_bootdata(char *real_mode_data); void __init sme_unmap_bootdata(char *real_mode_data); void __init sme_early_init(void); +void __init sev_setup_arch(void); void __init sme_encrypt_kernel(struct boot_params *bp); void __init sme_enable(struct boot_params *bp); @@ -65,6 +66,7 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { } static inline void __init sme_unmap_bootdata(char *real_mode_data) { } static inline void __init sme_early_init(void) { } +static inline void __init sev_setup_arch(void) { } static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } static inline void __init sme_enable(struct boot_params *bp) { } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b2a50dd90874..c89294189679 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1121,6 +1121,12 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); e820__memblock_setup(); + /* + * Needs to run after memblock setup because it needs the physical + * memory size. + */ + sev_setup_arch(); + reserve_bios_regions(); if (efi_enabled(EFI_MEMMAP)) { diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index ebf70fb3f821..b5b1491a26cc 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -196,6 +196,37 @@ void __init sme_early_init(void) swiotlb_force = SWIOTLB_FORCE; } +void __init sev_setup_arch(void) +{ + phys_addr_t total_mem = memblock_phys_mem_size(); + unsigned long size; + + if (!sev_active()) + return; + + /* + * For SEV, all DMA has to occur via shared/unencrypted pages. + * SEV uses SWIOTLB to make this happen without changing device + * drivers. However, depending on the workload being run, the + * default 64MB of SWIOTLB may not be enough and SWIOTLB may + * run out of buffers for DMA, resulting in I/O errors and/or + * performance degradation especially with high I/O workloads. + * + * Adjust the default size of SWIOTLB for SEV guests using + * a percentage of guest memory for SWIOTLB buffers. + * Also, as the SWIOTLB bounce buffer memory is allocated + * from low memory, ensure that the adjusted size is within + * the limits of low available memory. 
+ * + * The percentage of guest memory used here for SWIOTLB buffers + * is more of an approximation of the static adjustment which + * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6% + */ + size = total_mem * 6 / 100; + size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G); + swiotlb_adjust_size(size); +} + static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) { pgprot_t old_prot, new_prot; diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index d9a911efbb88..af3f9c1a9af0 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -30,6 +30,9 @@ enum swiotlb_force { */ #define IO_TLB_SHIFT 11 +/* default to 64MB */ +#define IO_TLB_DEFAULT_SIZE (64UL<<20) + extern void swiotlb_init(int verbose); int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); extern unsigned long swiotlb_nr_tbl(void); @@ -77,6 +80,7 @@ void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); bool is_swiotlb_active(void); +void __init swiotlb_adjust_size(unsigned long new_size); #else #define swiotlb_force SWIOTLB_NO_FORCE static inline bool is_swiotlb_buffer(phys_addr_t paddr) @@ -99,6 +103,10 @@ static inline bool is_swiotlb_active(void) { return false; } + +static inline void swiotlb_adjust_size(unsigned long new_size) +{ +} #endif /* CONFIG_SWIOTLB */ extern void swiotlb_print_info(void); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b2f9527fada5..c8b92a521039 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -152,8 +152,6 @@ void swiotlb_set_max_segment(unsigned int val) max_segment = rounddown(val, PAGE_SIZE); } -/* default to 64MB */ -#define IO_TLB_DEFAULT_SIZE (64UL<<20) unsigned long swiotlb_size_or_default(void) { unsigned long size; @@ -163,6 +161,24 @@ unsigned long swiotlb_size_or_default(void) return size ? size : (IO_TLB_DEFAULT_SIZE); } +void __init swiotlb_adjust_size(unsigned long new_size) +{ + unsigned long size; + + /* + * If swiotlb parameter has not been specified, give a chance to + * architectures such as those supporting memory encryption to + * adjust/expand SWIOTLB size for their use. + */ + if (!io_tlb_nslabs) { + size = ALIGN(new_size, 1 << IO_TLB_SHIFT); + io_tlb_nslabs = size >> IO_TLB_SHIFT; + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + + pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); + } +} + void swiotlb_print_info(void) { unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; -- Gitee From 279f0530d7184fd72516489a9ace5927b92d4b18 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Feb 2021 11:18:40 +0100 Subject: [PATCH 1407/2161] swiotlb: add a IO_TLB_SIZE define commit b5d7ccb7aac3895c2138fe0980a109116ce15eff upstream. Add a new IO_TLB_SIZE define instead open coding it using IO_TLB_SHIFT all over. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/swiotlb.h | 1 + kernel/dma/swiotlb.c | 14 +++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index af3f9c1a9af0..bed5c2c31323 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -29,6 +29,7 @@ enum swiotlb_force { * controllable. 
*/ #define IO_TLB_SHIFT 11 +#define IO_TLB_SIZE (1 << IO_TLB_SHIFT) /* default to 64MB */ #define IO_TLB_DEFAULT_SIZE (64UL<<20) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index c8b92a521039..68b2e8f0b6ec 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -171,7 +171,7 @@ void __init swiotlb_adjust_size(unsigned long new_size) * adjust/expand SWIOTLB size for their use. */ if (!io_tlb_nslabs) { - size = ALIGN(new_size, 1 << IO_TLB_SHIFT); + size = ALIGN(new_size, IO_TLB_SIZE); io_tlb_nslabs = size >> IO_TLB_SHIFT; io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); @@ -491,20 +491,20 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, tbl_dma_addr &= mask; - offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + offset_slots = ALIGN(tbl_dma_addr, IO_TLB_SIZE) >> IO_TLB_SHIFT; /* * Carefully handle integer overflow which can occur when mask == ~0UL. */ max_slots = mask + 1 - ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT + ? ALIGN(mask + 1, IO_TLB_SIZE) >> IO_TLB_SHIFT : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); /* * For mappings greater than or equal to a page, we limit the stride * (and hence alignment) to a page size. */ - nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT; if (alloc_size >= PAGE_SIZE) stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); else @@ -598,7 +598,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_data_direction dir, unsigned long attrs) { unsigned long flags; - int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + int i, count, nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT; int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; @@ -649,7 +649,7 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, if (orig_addr == INVALID_PHYS_ADDR) return; - orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1); + orig_addr += (unsigned long)tlb_addr & (IO_TLB_SIZE - 1); switch (target) { case SYNC_FOR_CPU: @@ -707,7 +707,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, size_t swiotlb_max_mapping_size(struct device *dev) { - return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE; + return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE; } bool is_swiotlb_active(void) -- Gitee From 979deb560dbac2bdb8935a2f7d19379462680135 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Feb 2021 10:11:20 +0100 Subject: [PATCH 1408/2161] swiotlb: factor out an io_tlb_offset helper commit c7fbeca757fe74135d8b6a4c8ddaef76f5775d68 upstream. Replace the very genericly named OFFSET macro with a little inline helper that hardcodes the alignment to the only value ever passed. 
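For reference (an equivalence note, not part of the patch): the new helper is
the old masking expression with the alignment fixed to IO_TLB_SEGSIZE, the
only value ever passed.

    /* old, generic form */
    #define OFFSET(val, align)  ((unsigned long)((val) & ((align) - 1)))

    /* new helper: same mask, alignment hardcoded */
    static inline unsigned long io_tlb_offset(unsigned long val)
    {
            return val & (IO_TLB_SEGSIZE - 1);
    }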
Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/swiotlb.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 68b2e8f0b6ec..d46467799da4 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -50,9 +50,6 @@ #define CREATE_TRACE_POINTS #include -#define OFFSET(val,align) ((unsigned long) \ - ( (val) & ( (align) - 1))) - #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) /* @@ -192,6 +189,11 @@ void swiotlb_print_info(void) bytes >> 20); } +static inline unsigned long io_tlb_offset(unsigned long val) +{ + return val & (IO_TLB_SEGSIZE - 1); +} + /* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. This function allows the architecture to @@ -241,7 +243,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) __func__, alloc_size, PAGE_SIZE); for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; @@ -375,7 +377,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) goto cleanup4; for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; @@ -546,7 +548,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, for (i = index; i < (int) (index + nslots); i++) io_tlb_list[i] = 0; - for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && + io_tlb_list[i]; i--) io_tlb_list[i] = ++count; tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT); @@ -632,7 +636,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * Step 2: merge the returned slots with the preceding slots, * if available (non zero) */ - for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && + io_tlb_list[i]; i--) io_tlb_list[i] = ++count; io_tlb_used -= nslots; -- Gitee From b2561debb13cab8cc439f2faba65f5fa78ef676f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Feb 2021 11:19:34 +0100 Subject: [PATCH 1409/2161] swiotlb: factor out a nr_slots helper commit c32a77fd18780a5192dfb6eec69f239faebf28fd upstream. Factor out a helper to find the number of slots for a given size. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/swiotlb.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d46467799da4..08ba358e69ae 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -194,6 +194,11 @@ static inline unsigned long io_tlb_offset(unsigned long val) return val & (IO_TLB_SEGSIZE - 1); } +static inline unsigned long nr_slots(u64 val) +{ + return DIV_ROUND_UP(val, IO_TLB_SIZE); +} + /* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. 
This function allows the architecture to @@ -493,20 +498,20 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, tbl_dma_addr &= mask; - offset_slots = ALIGN(tbl_dma_addr, IO_TLB_SIZE) >> IO_TLB_SHIFT; + offset_slots = nr_slots(tbl_dma_addr); /* * Carefully handle integer overflow which can occur when mask == ~0UL. */ max_slots = mask + 1 - ? ALIGN(mask + 1, IO_TLB_SIZE) >> IO_TLB_SHIFT + ? nr_slots(mask + 1) : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); /* * For mappings greater than or equal to a page, we limit the stride * (and hence alignment) to a page size. */ - nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT; + nslots = nr_slots(alloc_size); if (alloc_size >= PAGE_SIZE) stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); else @@ -602,7 +607,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_data_direction dir, unsigned long attrs) { unsigned long flags; - int i, count, nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT; + int i, count, nslots = nr_slots(alloc_size); int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; -- Gitee From b759880267868f97eada5fdc29be39203c2d1f38 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Feb 2021 10:13:40 +0100 Subject: [PATCH 1410/2161] swiotlb: clean up swiotlb_tbl_unmap_single commit ca10d0f8e530600ec63c603dbace2c30927d70b7 upstream. Remove a layer of pointless indentation, replace a hard to follow ternary expression with a plain if/else. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/swiotlb.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 08ba358e69ae..2fcab352906c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -626,28 +626,29 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * with slots below and above the pool being returned. */ spin_lock_irqsave(&io_tlb_lock, flags); - { - count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? 
- io_tlb_list[index + nslots] : 0); - /* - * Step 1: return the slots to the free list, merging the - * slots with superceeding slots - */ - for (i = index + nslots - 1; i >= index; i--) { - io_tlb_list[i] = ++count; - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - } - /* - * Step 2: merge the returned slots with the preceding slots, - * if available (non zero) - */ - for (i = index - 1; - io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && - io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; + if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE)) + count = io_tlb_list[index + nslots]; + else + count = 0; - io_tlb_used -= nslots; + /* + * Step 1: return the slots to the free list, merging the slots with + * superceeding slots + */ + for (i = index + nslots - 1; i >= index; i--) { + io_tlb_list[i] = ++count; + io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } + + /* + * Step 2: merge the returned slots with the preceding slots, if + * available (non zero) + */ + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && io_tlb_list[i]; + i--) + io_tlb_list[i] = ++count; + io_tlb_used -= nslots; spin_unlock_irqrestore(&io_tlb_lock, flags); } -- Gitee From c827e7991b5e84fb79bf4e4aa06deb421651b6bf Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Fri, 11 Feb 2022 02:12:52 +0100 Subject: [PATCH 1411/2161] swiotlb: fix info leak with DMA_FROM_DEVICE commit ddbd89deb7d32b1fbb879f48d68fda1a8ac58e8e upstream. The problem I'm addressing was discovered by the LTP test covering cve-2018-1000204. A short description of what happens follows: 1) The test case issues a command code 00 (TEST UNIT READY) via the SG_IO interface with: dxfer_len == 524288, dxdfer_dir == SG_DXFER_FROM_DEV and a corresponding dxferp. The peculiar thing about this is that TUR is not reading from the device. 2) In sg_start_req() the invocation of blk_rq_map_user() effectively bounces the user-space buffer. As if the device was to transfer into it. Since commit a45b599ad808 ("scsi: sg: allocate with __GFP_ZERO in sg_build_indirect()") we make sure this first bounce buffer is allocated with GFP_ZERO. 3) For the rest of the story we keep ignoring that we have a TUR, so the device won't touch the buffer we prepare as if the we had a DMA_FROM_DEVICE type of situation. My setup uses a virtio-scsi device and the buffer allocated by SG is mapped by the function virtqueue_add_split() which uses DMA_FROM_DEVICE for the "in" sgs (here scatter-gather and not scsi generics). This mapping involves bouncing via the swiotlb (we need swiotlb to do virtio in protected guest like s390 Secure Execution, or AMD SEV). 4) When the SCSI TUR is done, we first copy back the content of the second (that is swiotlb) bounce buffer (which most likely contains some previous IO data), to the first bounce buffer, which contains all zeros. Then we copy back the content of the first bounce buffer to the user-space buffer. 5) The test case detects that the buffer, which it zero-initialized, ain't all zeros and fails. One can argue that this is an swiotlb problem, because without swiotlb we leak all zeros, and the swiotlb should be transparent in a sense that it does not affect the outcome (if all other participants are well behaved). Copying the content of the original buffer into the swiotlb buffer is the only way I can think of to make swiotlb transparent in such scenarios. 
So let's do just that if in doubt, but allow the driver to tell us that the whole mapped buffer is going to be overwritten, in which case we can preserve the old behavior and avoid the performance impact of the extra bounce. Signed-off-by: Halil Pasic Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- Documentation/DMA-attributes.txt | 10 ++++++++++ include/linux/dma-mapping.h | 8 ++++++++ kernel/dma/swiotlb.c | 3 ++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/Documentation/DMA-attributes.txt b/Documentation/DMA-attributes.txt index 6ec8de3fde83..e50b1a4835d2 100644 --- a/Documentation/DMA-attributes.txt +++ b/Documentation/DMA-attributes.txt @@ -148,3 +148,13 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). + +DMA_ATTR_PRIVILEGED +------------------- + +Some advanced peripherals such as remote processors and GPUs perform +accesses to DMA buffers in both privileged "supervisor" and unprivileged +"user" modes. This attribute is used to indicate to the DMA-mapping +subsystem that the buffer is fully accessible at the elevated privilege +level (and ideally inaccessible or at least read-only at the +lesser-privileged levels). diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 177a3efe4dff..d360c60ee1eb 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -66,6 +66,14 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) +/* + * This is a hint to the DMA-mapping subsystem that the device is expected + * to overwrite the entire mapped size, thus the caller does not require any + * of the previous buffer contents to be preserved. This allows + * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. + */ +#define DMA_ATTR_OVERWRITE (1UL << 10) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. * It can be given to a device to use as a DMA source or target. A CPU cannot diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 2fcab352906c..23c18f6f69b5 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -593,7 +593,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, for (i = 0; i < nslots; i++) io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) + (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || + dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; -- Gitee From 2c3b81603a6afdb9b1d6236007eb61e04256ad26 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 28 Mar 2022 11:37:05 -0700 Subject: [PATCH 1412/2161] Reinstate some of "swiotlb: rework "fix info leak with DMA_FROM_DEVICE"" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 901c7280ca0d5e2b4a8929fbe0bfb007ac2a6544 upstream. Halil Pasic points out [1] that the full revert of that commit (revert in bddac7c1e02b), and that a partial revert that only reverts the problematic case, but still keeps some of the cleanups is probably better.  And that partial revert [2] had already been verified by Oleksandr Natalenko to also fix the issue, I had just missed that in the long discussion. 
So let's reinstate the cleanups from commit aa6f8dcbab47 ("swiotlb: rework "fix info leak with DMA_FROM_DEVICE""), and effectively only revert the part that caused problems. Link: https://lore.kernel.org/all/20220328013731.017ae3e3.pasic@linux.ibm.com/ [1] Link: https://lore.kernel.org/all/20220324055732.GB12078@lst.de/ [2] Link: https://lore.kernel.org/all/4386660.LvFx2qVVIh@natalenko.name/ [3] Suggested-by: Halil Pasic Tested-by: Oleksandr Natalenko Cc: Christoph Hellwig" Signed-off-by: Linus Torvalds [OP: backport to 5.4: adjusted context] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- Documentation/DMA-attributes.txt | 10 ---------- include/linux/dma-mapping.h | 8 -------- kernel/dma/swiotlb.c | 13 ++++++++----- 3 files changed, 8 insertions(+), 23 deletions(-) diff --git a/Documentation/DMA-attributes.txt b/Documentation/DMA-attributes.txt index e50b1a4835d2..6ec8de3fde83 100644 --- a/Documentation/DMA-attributes.txt +++ b/Documentation/DMA-attributes.txt @@ -148,13 +148,3 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). - -DMA_ATTR_PRIVILEGED -------------------- - -Some advanced peripherals such as remote processors and GPUs perform -accesses to DMA buffers in both privileged "supervisor" and unprivileged -"user" modes. This attribute is used to indicate to the DMA-mapping -subsystem that the buffer is fully accessible at the elevated privilege -level (and ideally inaccessible or at least read-only at the -lesser-privileged levels). diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index d360c60ee1eb..177a3efe4dff 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -66,14 +66,6 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) -/* - * This is a hint to the DMA-mapping subsystem that the device is expected - * to overwrite the entire mapped size, thus the caller does not require any - * of the previous buffer contents to be preserved. This allows - * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. - */ -#define DMA_ATTR_OVERWRITE (1UL << 10) - /* * A dma_addr_t can hold any valid DMA or bus address for the platform. * It can be given to a device to use as a DMA source or target. A CPU cannot diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 23c18f6f69b5..88d7540fdf0b 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -592,11 +592,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, */ for (i = 0; i < nslots; i++) io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || - dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE); - + /* + * When dir == DMA_FROM_DEVICE we could omit the copy from the orig + * to the tlb buffer, if we knew for sure the device will + * overwirte the entire current content. But we don't. Thus + * unconditional bounce may prevent leaking swiotlb content (i.e. + * kernel memory) to user-space. 
+ */ + swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } -- Gitee From 210ff6b75e146baf9190ee8073b5ff8f8438d75b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Feb 2021 11:08:35 +0100 Subject: [PATCH 1413/2161] swiotlb: refactor swiotlb_tbl_map_single commit 26a7e094783d482f3e125f09945a5bb1d867b2e6 upstream. Split out a bunch of a self-contained helpers to make the function easier to follow. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/swiotlb.c | 176 +++++++++++++++++++++---------------------- 1 file changed, 87 insertions(+), 89 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 88d7540fdf0b..28a3838f624f 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -468,130 +468,128 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, } } -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, - size_t mapping_size, size_t alloc_size, - enum dma_data_direction dir, unsigned long attrs) -{ - dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start); - unsigned long flags; - phys_addr_t tlb_addr; - unsigned int nslots, stride, index, wrap; - int i; - unsigned long mask; - unsigned long offset_slots; - unsigned long max_slots; - unsigned long tmp_io_tlb_used; +#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT)) - if (no_iotlb_memory) - panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); - - if (mem_encrypt_active()) - pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); - - if (mapping_size > alloc_size) { - dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", - mapping_size, alloc_size); - return (phys_addr_t)DMA_MAPPING_ERROR; - } - - mask = dma_get_seg_boundary(hwdev); +/* + * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. + */ +static inline unsigned long get_max_slots(unsigned long boundary_mask) +{ + if (boundary_mask == ~0UL) + return 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); + return nr_slots(boundary_mask + 1); +} - tbl_dma_addr &= mask; +static unsigned int wrap_index(unsigned int index) +{ + if (index >= io_tlb_nslabs) + return 0; + return index; +} - offset_slots = nr_slots(tbl_dma_addr); +/* + * Find a suitable number of IO TLB entries size that will fit this request and + * allocate a buffer from that IO TLB pool. + */ +static int find_slots(struct device *dev, size_t alloc_size) +{ + unsigned long boundary_mask = dma_get_seg_boundary(dev); + dma_addr_t tbl_dma_addr = + phys_to_dma_unencrypted(dev, io_tlb_start) & boundary_mask; + unsigned long max_slots = get_max_slots(boundary_mask); + unsigned int nslots = nr_slots(alloc_size), stride = 1; + unsigned int index, wrap, count = 0, i; + unsigned long flags; - /* - * Carefully handle integer overflow which can occur when mask == ~0UL. - */ - max_slots = mask + 1 - ? nr_slots(mask + 1) - : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); + BUG_ON(!nslots); /* * For mappings greater than or equal to a page, we limit the stride * (and hence alignment) to a page size. 
*/ - nslots = nr_slots(alloc_size); if (alloc_size >= PAGE_SIZE) - stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); - else - stride = 1; - - BUG_ON(!nslots); + stride <<= (PAGE_SHIFT - IO_TLB_SHIFT); - /* - * Find suitable number of IO TLB entries size that will fit this - * request and allocate a buffer from that IO TLB pool. - */ spin_lock_irqsave(&io_tlb_lock, flags); - if (unlikely(nslots > io_tlb_nslabs - io_tlb_used)) goto not_found; - index = ALIGN(io_tlb_index, stride); - if (index >= io_tlb_nslabs) - index = 0; - wrap = index; - + index = wrap = wrap_index(ALIGN(io_tlb_index, stride)); do { - while (iommu_is_span_boundary(index, nslots, offset_slots, - max_slots)) { - index += stride; - if (index >= io_tlb_nslabs) - index = 0; - if (index == wrap) - goto not_found; - } - /* * If we find a slot that indicates we have 'nslots' number of * contiguous buffers, we allocate the buffers from that slot * and mark the entries as '0' indicating unavailable. */ - if (io_tlb_list[index] >= nslots) { - int count = 0; - - for (i = index; i < (int) (index + nslots); i++) - io_tlb_list[i] = 0; - for (i = index - 1; - io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && - io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; - tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT); - - /* - * Update the indices to avoid searching in the next - * round. - */ - io_tlb_index = ((index + nslots) < io_tlb_nslabs - ? (index + nslots) : 0); - - goto found; + if (!iommu_is_span_boundary(index, nslots, + nr_slots(tbl_dma_addr), + max_slots)) { + if (io_tlb_list[index] >= nslots) + goto found; } - index += stride; - if (index >= io_tlb_nslabs) - index = 0; + index = wrap_index(index + stride); } while (index != wrap); not_found: - tmp_io_tlb_used = io_tlb_used; - spin_unlock_irqrestore(&io_tlb_lock, flags); - if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) - dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", - alloc_size, io_tlb_nslabs, tmp_io_tlb_used); - return (phys_addr_t)DMA_MAPPING_ERROR; + return -1; found: + for (i = index; i < index + nslots; i++) + io_tlb_list[i] = 0; + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && + io_tlb_list[i]; i--) + io_tlb_list[i] = ++count; + + /* + * Update the indices to avoid searching in the next round. + */ + if (index + nslots < io_tlb_nslabs) + io_tlb_index = index + nslots; + else + io_tlb_index = 0; io_tlb_used += nslots; + spin_unlock_irqrestore(&io_tlb_lock, flags); + return index; +} + +phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs) +{ + unsigned int index, i; + phys_addr_t tlb_addr; + + if (no_iotlb_memory) + panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); + + if (mem_encrypt_active()) + pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); + if (mapping_size > alloc_size) { + dev_warn_once(dev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", + mapping_size, alloc_size); + return (phys_addr_t)DMA_MAPPING_ERROR; + } + + index = find_slots(dev, alloc_size); + if (index == -1) { + if (!(attrs & DMA_ATTR_NO_WARN)) + dev_warn_ratelimited(dev, + "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", + alloc_size, io_tlb_nslabs, io_tlb_used); + return (phys_addr_t)DMA_MAPPING_ERROR; + } /* * Save away the mapping from the original address to the DMA address. 
* This is needed when we sync the memory. Then we sync the buffer if * needed. */ - for (i = 0; i < nslots; i++) - io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); + for (i = 0; i < nr_slots(alloc_size); i++) + io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i); + + tlb_addr = slot_addr(io_tlb_start, index); /* * When dir == DMA_FROM_DEVICE we could omit the copy from the orig * to the tlb buffer, if we knew for sure the device will -- Gitee From acce186fe93746bd1a3c5493189ad6f7649d805a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Feb 2021 10:44:16 +0100 Subject: [PATCH 1414/2161] swiotlb: don't modify orig_addr in swiotlb_tbl_sync_single commit 16fc3cef33a04632ab6b31758abdd77563a20759 upstream. swiotlb_tbl_map_single currently nevers sets a tlb_addr that is not aligned to the tlb bucket size. But we're going to add such a case soon, for which this adjustment would be bogus. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/swiotlb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 28a3838f624f..ea79b6072536 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -663,7 +663,6 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, if (orig_addr == INVALID_PHYS_ADDR) return; - orig_addr += (unsigned long)tlb_addr & (IO_TLB_SIZE - 1); switch (target) { case SYNC_FOR_CPU: -- Gitee From 3cda7e37a8206210ce80684884dbe3df99153c2d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Feb 2021 14:39:44 -0500 Subject: [PATCH 1415/2161] swiotlb: respect min_align_mask commit 1f221a0d0dbf0e48ef3a9c62871281d6a7819f05 upstream. Respect the min_align_mask in struct device_dma_parameters in swiotlb. There are two parts to it: 1) for the lower bits of the alignment inside the io tlb slot, just extent the size of the allocation and leave the start of the slot empty 2) for the high bits ensure we find a slot that matches the high bits of the alignment to avoid wasting too much memory Based on an earlier patch from Jianxiong Gao . Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/swiotlb.c | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index ea79b6072536..c9fbf60eb715 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -470,6 +470,14 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, #define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT)) +/* + * Return the offset into a iotlb slot required to keep the device happy. + */ +static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) +{ + return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1); +} + /* * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. */ @@ -491,24 +499,29 @@ static unsigned int wrap_index(unsigned int index) * Find a suitable number of IO TLB entries size that will fit this request and * allocate a buffer from that IO TLB pool. 
*/ -static int find_slots(struct device *dev, size_t alloc_size) +static int find_slots(struct device *dev, phys_addr_t orig_addr, + size_t alloc_size) { unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, io_tlb_start) & boundary_mask; unsigned long max_slots = get_max_slots(boundary_mask); - unsigned int nslots = nr_slots(alloc_size), stride = 1; + unsigned int iotlb_align_mask = + dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1); + unsigned int nslots = nr_slots(alloc_size), stride; unsigned int index, wrap, count = 0, i; unsigned long flags; BUG_ON(!nslots); /* - * For mappings greater than or equal to a page, we limit the stride - * (and hence alignment) to a page size. + * For mappings with an alignment requirement don't bother looping to + * unaligned slots once we found an aligned one. For allocations of + * PAGE_SIZE or larger only look for page aligned allocations. */ + stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1; if (alloc_size >= PAGE_SIZE) - stride <<= (PAGE_SHIFT - IO_TLB_SHIFT); + stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT)); spin_lock_irqsave(&io_tlb_lock, flags); if (unlikely(nslots > io_tlb_nslabs - io_tlb_used)) @@ -516,6 +529,12 @@ static int find_slots(struct device *dev, size_t alloc_size) index = wrap = wrap_index(ALIGN(io_tlb_index, stride)); do { + if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) != + (orig_addr & iotlb_align_mask)) { + index = wrap_index(index + 1); + continue; + } + /* * If we find a slot that indicates we have 'nslots' number of * contiguous buffers, we allocate the buffers from that slot @@ -558,6 +577,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, enum dma_data_direction dir, unsigned long attrs) { + unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned int index, i; phys_addr_t tlb_addr; @@ -573,7 +593,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return (phys_addr_t)DMA_MAPPING_ERROR; } - index = find_slots(dev, alloc_size); + index = find_slots(dev, orig_addr, alloc_size + offset); if (index == -1) { if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, @@ -586,10 +606,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * This is needed when we sync the memory. Then we sync the buffer if * needed. 
*/ - for (i = 0; i < nr_slots(alloc_size); i++) + for (i = 0; i < nr_slots(alloc_size + offset); i++) io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i); - tlb_addr = slot_addr(io_tlb_start, index); + tlb_addr = slot_addr(io_tlb_start, index) + offset; /* * When dir == DMA_FROM_DEVICE we could omit the copy from the orig * to the tlb buffer, if we knew for sure the device will @@ -609,8 +629,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_data_direction dir, unsigned long attrs) { unsigned long flags; - int i, count, nslots = nr_slots(alloc_size); - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); + int i, count, nslots = nr_slots(alloc_size + offset); + int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; /* -- Gitee From 1dc70aa3cc9bc9f90a082332389b47ff9f3c637e Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 20 Jul 2022 19:56:10 +0800 Subject: [PATCH 1416/2161] nvme-core: replace ctrl page size with a macro commit 6c3c05b087ada8947cd31895f67e433070446234 upstream. Just introduce NVME_CTRL_PAGE_SHIFT and NVME_CTRL_PAGE_SIZE, as there is not side-effect for other part. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/nvme/host/nvme.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 1ab962f4db33..78998c6b84e3 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -28,6 +28,14 @@ extern unsigned int admin_timeout; #define NVME_DEFAULT_KATO 5 #define NVME_KATO_GRACE 10 +/* + * Default to a 4K page size, with the intention to update this + * path in the future to accommodate architectures with differing + * kernel and IO page sizes. + */ +#define NVME_CTRL_PAGE_SHIFT 12 +#define NVME_CTRL_PAGE_SIZE (1 << NVME_CTRL_PAGE_SHIFT) + extern struct workqueue_struct *nvme_wq; extern struct workqueue_struct *nvme_reset_wq; extern struct workqueue_struct *nvme_delete_wq; -- Gitee From 9b6e8966ddbf1245227a14be4a8d68ec99a76bd8 Mon Sep 17 00:00:00 2001 From: Jianxiong Gao Date: Mon, 1 Feb 2021 10:30:17 -0800 Subject: [PATCH 1417/2161] nvme-pci: set min_align_mask commit 3d2d861eb03e8ee96dc430a54361c900cbe28afd upstream. The PRP addressing scheme requires all PRP entries except for the first one to have a zero offset into the NVMe controller pages (which can be different from the Linux PAGE_SIZE). Use the min_align_mask device parameter to ensure that swiotlb does not change the address of the buffer modulo the device page size to ensure that the PRPs won't be malformed. Signed-off-by: Jianxiong Gao Signed-off-by: Christoph Hellwig Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/nvme/host/pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6d25aa4e3622..4bc498fae1f6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2696,6 +2696,7 @@ static void nvme_reset_work(struct work_struct *work) * Don't limit the IOMMU merged segment size. 
*/ dma_set_max_seg_size(dev->dev, 0xffffffff); + dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1); mutex_unlock(&dev->shutdown_lock); -- Gitee From d5f88cc4cf0f5423cec6c068791151db03c8ef01 Mon Sep 17 00:00:00 2001 From: Martin Radev Date: Tue, 12 Jan 2021 16:07:29 +0100 Subject: [PATCH 1418/2161] swiotlb: Validate bounce size in the sync/unmap path commit daf9514fd5eb098d7d6f3a1247cb8cc48fc94155 upstream. The size of the buffer being bounced is not checked if it happens to be larger than the size of the mapped buffer. Because the size can be controlled by a device, as it's the case with virtio devices, this can lead to memory corruption. This patch saves the remaining buffer memory for each slab and uses that information for validation in the sync/unmap paths before swiotlb_bounce is called. Validating this argument is important under the threat models of AMD SEV-SNP and Intel TDX, where the HV is considered untrusted. Signed-off-by: Martin Radev Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/swiotlb.c | 53 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index c9fbf60eb715..459428912727 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -99,6 +99,11 @@ static unsigned int max_segment; #define INVALID_PHYS_ADDR (~(phys_addr_t)0) static phys_addr_t *io_tlb_orig_addr; +/* + * The mapped buffer's size should be validated during a sync operation. + */ +static size_t *io_tlb_orig_size; + /* * Protect the above data structures in the map and unmap calls */ @@ -247,9 +252,16 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); + alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t)); + io_tlb_orig_size = memblock_alloc(alloc_size, PAGE_SIZE); + if (!io_tlb_orig_size) + panic("%s: Failed to allocate %zu bytes align=0x%lx\n", + __func__, alloc_size, PAGE_SIZE); + for (i = 0; i < io_tlb_nslabs; i++) { io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + io_tlb_orig_size[i] = 0; } io_tlb_index = 0; no_iotlb_memory = false; @@ -370,7 +382,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) * between io_tlb_start and io_tlb_end. 
*/ io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, - get_order(io_tlb_nslabs * sizeof(int))); + get_order(io_tlb_nslabs * sizeof(int))); if (!io_tlb_list) goto cleanup3; @@ -381,9 +393,18 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) if (!io_tlb_orig_addr) goto cleanup4; + io_tlb_orig_size = (size_t *) + __get_free_pages(GFP_KERNEL, + get_order(io_tlb_nslabs * + sizeof(size_t))); + if (!io_tlb_orig_size) + goto cleanup5; + + for (i = 0; i < io_tlb_nslabs; i++) { io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + io_tlb_orig_size[i] = 0; } io_tlb_index = 0; no_iotlb_memory = false; @@ -396,6 +417,10 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) return 0; +cleanup5: + free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * + sizeof(phys_addr_t))); + cleanup4: free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * sizeof(int))); @@ -411,6 +436,8 @@ void __init swiotlb_exit(void) return; if (late_alloc) { + free_pages((unsigned long)io_tlb_orig_size, + get_order(io_tlb_nslabs * sizeof(size_t))); free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * sizeof(phys_addr_t))); free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * @@ -420,6 +447,8 @@ void __init swiotlb_exit(void) } else { memblock_free_late(__pa(io_tlb_orig_addr), PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); + memblock_free_late(__pa(io_tlb_orig_size), + PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t))); memblock_free_late(__pa(io_tlb_list), PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); memblock_free_late(io_tlb_start, @@ -606,9 +635,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * This is needed when we sync the memory. Then we sync the buffer if * needed. */ - for (i = 0; i < nr_slots(alloc_size + offset); i++) + for (i = 0; i < nr_slots(alloc_size + offset); i++) { io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i); - + io_tlb_orig_size[index+i] = alloc_size - (i << IO_TLB_SHIFT); + } tlb_addr = slot_addr(io_tlb_start, index) + offset; /* * When dir == DMA_FROM_DEVICE we could omit the copy from the orig @@ -621,6 +651,17 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return tlb_addr; } +static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_size, size_t *size) +{ + if (*size > orig_size) { + /* Warn and truncate mapping_size */ + dev_WARN_ONCE(hwdev, 1, + "Attempt for buffer overflow. Original size: %zu. Mapping size: %zu.\n", + orig_size, *size); + *size = orig_size; + } +} + /* * tlb_addr is the physical address of the bounce buffer to unmap. 
*/ @@ -634,6 +675,8 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; + validate_sync_size_and_truncate(hwdev, io_tlb_orig_size[index], &mapping_size); + /* * First, sync the memory before unmapping the entry */ @@ -661,6 +704,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, for (i = index + nslots - 1; i >= index; i--) { io_tlb_list[i] = ++count; io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + io_tlb_orig_size[i] = 0; } /* @@ -680,11 +724,14 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_sync_target target) { int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + size_t orig_size = io_tlb_orig_size[index]; phys_addr_t orig_addr = io_tlb_orig_addr[index]; if (orig_addr == INVALID_PHYS_ADDR) return; + validate_sync_size_and_truncate(hwdev, orig_size, &size); + switch (target) { case SYNC_FOR_CPU: if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) -- Gitee From ae108dba4c541c0ff557cd6a0e2b93f0aff0b136 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Mar 2021 08:44:24 +0100 Subject: [PATCH 1419/2161] swiotlb: remove the alloc_size parameter to swiotlb_tbl_unmap_single commit 2973073a80b46daebc352c31d09d95d16cf6876e upstream. Now that swiotlb remembers the allocation size there is no need to pass it back to swiotlb_tbl_unmap_single. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 11 +++------- include/linux/dma-direct.h | 2 +- include/linux/swiotlb.h | 1 - kernel/dma/swiotlb.c | 45 +++++++++++++++++++------------------- 4 files changed, 27 insertions(+), 32 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 33762aaa6e76..760b5606fc50 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -514,8 +514,6 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr, unsigned long attrs) { struct iommu_domain *domain = iommu_get_dma_domain(dev); - struct iommu_dma_cookie *cookie = domain->iova_cookie; - struct iova_domain *iovad = &cookie->iovad; phys_addr_t phys; phys = iommu_iova_to_phys(domain, dma_addr); @@ -525,8 +523,7 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr, __iommu_dma_unmap(dev, dma_addr, size); if (unlikely(is_swiotlb_buffer(phys))) - swiotlb_tbl_unmap_single(dev, phys, size, - iova_align(iovad, size), dir, attrs); + swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); } static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, @@ -596,10 +593,8 @@ static dma_addr_t __iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys, } iova = __iommu_dma_map(dev, phys, aligned_size, prot, dma_mask); - if ((iova == DMA_MAPPING_ERROR) && is_swiotlb_buffer(phys)) - swiotlb_tbl_unmap_single(dev, phys, org_size, - aligned_size, dir, attrs); - + if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(phys)) + swiotlb_tbl_unmap_single(dev, phys, org_size, dir, attrs); return iova; } diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 9f24c1789177..c2b62f684a0a 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -182,6 +182,6 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, dma_direct_sync_single_for_cpu(dev, addr, size, dir); if (unlikely(is_swiotlb_buffer(phys))) - 
swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); + swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); } #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index bed5c2c31323..c780697a6cc0 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -56,7 +56,6 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, extern void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, size_t mapping_size, - size_t alloc_size, enum dma_data_direction dir, unsigned long attrs); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 459428912727..d7dfcf94272a 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -102,7 +102,7 @@ static phys_addr_t *io_tlb_orig_addr; /* * The mapped buffer's size should be validated during a sync operation. */ -static size_t *io_tlb_orig_size; +static size_t *io_tlb_alloc_size; /* * Protect the above data structures in the map and unmap calls @@ -253,15 +253,15 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) __func__, alloc_size, PAGE_SIZE); alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t)); - io_tlb_orig_size = memblock_alloc(alloc_size, PAGE_SIZE); - if (!io_tlb_orig_size) + io_tlb_alloc_size = memblock_alloc(alloc_size, PAGE_SIZE); + if (!io_tlb_alloc_size) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); for (i = 0; i < io_tlb_nslabs; i++) { io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - io_tlb_orig_size[i] = 0; + io_tlb_alloc_size[i] = 0; } io_tlb_index = 0; no_iotlb_memory = false; @@ -393,18 +393,18 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) if (!io_tlb_orig_addr) goto cleanup4; - io_tlb_orig_size = (size_t *) + io_tlb_alloc_size = (size_t *) __get_free_pages(GFP_KERNEL, get_order(io_tlb_nslabs * sizeof(size_t))); - if (!io_tlb_orig_size) + if (!io_tlb_alloc_size) goto cleanup5; for (i = 0; i < io_tlb_nslabs; i++) { io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - io_tlb_orig_size[i] = 0; + io_tlb_alloc_size[i] = 0; } io_tlb_index = 0; no_iotlb_memory = false; @@ -436,7 +436,7 @@ void __init swiotlb_exit(void) return; if (late_alloc) { - free_pages((unsigned long)io_tlb_orig_size, + free_pages((unsigned long)io_tlb_alloc_size, get_order(io_tlb_nslabs * sizeof(size_t))); free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * sizeof(phys_addr_t))); @@ -447,7 +447,7 @@ void __init swiotlb_exit(void) } else { memblock_free_late(__pa(io_tlb_orig_addr), PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); - memblock_free_late(__pa(io_tlb_orig_size), + memblock_free_late(__pa(io_tlb_alloc_size), PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t))); memblock_free_late(__pa(io_tlb_list), PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); @@ -637,7 +637,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, */ for (i = 0; i < nr_slots(alloc_size + offset); i++) { io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i); - io_tlb_orig_size[index+i] = alloc_size - (i << IO_TLB_SHIFT); + io_tlb_alloc_size[index+i] = alloc_size - (i << IO_TLB_SHIFT); } tlb_addr = slot_addr(io_tlb_start, index) + offset; /* @@ -651,14 +651,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return tlb_addr; } -static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_size, size_t *size) +static void 
validate_sync_size_and_truncate(struct device *hwdev, size_t alloc_size, size_t *size) { - if (*size > orig_size) { + if (*size > alloc_size) { /* Warn and truncate mapping_size */ dev_WARN_ONCE(hwdev, 1, "Attempt for buffer overflow. Original size: %zu. Mapping size: %zu.\n", - orig_size, *size); - *size = orig_size; + alloc_size, *size); + *size = alloc_size; } } @@ -666,16 +666,17 @@ static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_si * tlb_addr is the physical address of the bounce buffer to unmap. */ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t mapping_size, size_t alloc_size, - enum dma_data_direction dir, unsigned long attrs) + size_t mapping_size, enum dma_data_direction dir, + unsigned long attrs) { unsigned long flags; unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); - int i, count, nslots = nr_slots(alloc_size + offset); int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; + size_t alloc_size = io_tlb_alloc_size[index]; + int i, count, nslots = nr_slots(alloc_size + offset); - validate_sync_size_and_truncate(hwdev, io_tlb_orig_size[index], &mapping_size); + validate_sync_size_and_truncate(hwdev, alloc_size, &mapping_size); /* * First, sync the memory before unmapping the entry @@ -704,7 +705,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, for (i = index + nslots - 1; i >= index; i--) { io_tlb_list[i] = ++count; io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - io_tlb_orig_size[i] = 0; + io_tlb_alloc_size[i] = 0; } /* @@ -724,13 +725,13 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_sync_target target) { int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; - size_t orig_size = io_tlb_orig_size[index]; + size_t alloc_size = io_tlb_alloc_size[index]; phys_addr_t orig_addr = io_tlb_orig_addr[index]; if (orig_addr == INVALID_PHYS_ADDR) return; - validate_sync_size_and_truncate(hwdev, orig_size, &size); + validate_sync_size_and_truncate(hwdev, alloc_size, &size); switch (target) { case SYNC_FOR_CPU: @@ -773,7 +774,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, /* Ensure that the address returned is DMA'ble */ dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr); if (unlikely(!dma_capable(dev, dma_addr, size, true))) { - swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir, + swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); dev_WARN_ONCE(dev, 1, "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", -- Gitee From 96ea98fce78048f7e4e8763fadf788bc01e22fd8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Mar 2021 08:44:25 +0100 Subject: [PATCH 1420/2161] swiotlb: move orig addr and size validation into swiotlb_bounce commit 2bdba622c351259317b0294c6e9fe243b2404316 upstream. Move the code to find and validate the original buffer address and size from the callers into swiotlb_bounce. This means a tiny bit of extra work in the swiotlb_map path, but avoids code duplication and a leads to a better code structure. 
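The consolidated check is easy to model outside the kernel. The sketch below is a minimal userspace stand-in, not kernel code: plain arrays play the role of io_tlb_orig_addr[] and io_tlb_alloc_size[], and bounce() shows the pattern this patch settles on, where the helper derives the slot index from the bounce address itself and clamps any oversized, device-controlled length before copying.

/*
 * Userspace model only -- not kernel code. The array names mirror the patch;
 * the scaffolding around them is invented for illustration.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define IO_TLB_SHIFT	11		/* 2 KiB slots, as in swiotlb */
#define NSLABS		8

static uintptr_t io_tlb_start = 0x100000;
static uintptr_t io_tlb_orig_addr[NSLABS];	/* 0 stands in for INVALID_PHYS_ADDR */
static size_t    io_tlb_alloc_size[NSLABS];

/* All lookup and validation now live in the bounce helper itself. */
static void bounce(uintptr_t tlb_addr, size_t size)
{
	size_t index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
	size_t alloc_size = io_tlb_alloc_size[index];

	if (io_tlb_orig_addr[index] == 0)	/* slot not mapped: nothing to do */
		return;
	if (size > alloc_size) {
		printf("clamping %zu to %zu\n", size, alloc_size);
		size = alloc_size;	/* device-controlled length is not trusted */
	}
	printf("copy %zu bytes for slot %zu\n", size, index);
}

int main(void)
{
	io_tlb_orig_addr[2] = 0x800000;
	io_tlb_alloc_size[2] = 4096;

	bounce(io_tlb_start + (2 << IO_TLB_SHIFT), 8192);	/* oversized sync gets clamped */
	return 0;
}

Keeping the lookup and the clamp in one place is what lets the callers in swiotlb_tbl_unmap_single() and the sync paths shrink in the hunks that follow.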
Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/swiotlb.c | 59 +++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d7dfcf94272a..80ec08fb78a7 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -460,12 +460,25 @@ void __init swiotlb_exit(void) /* * Bounce: copy the swiotlb buffer from or back to the original dma location */ -static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir) +static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size, + enum dma_data_direction dir) { + int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + size_t alloc_size = io_tlb_alloc_size[index]; + phys_addr_t orig_addr = io_tlb_orig_addr[index]; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = phys_to_virt(tlb_addr); + if (orig_addr == INVALID_PHYS_ADDR) + return; + + if (size > alloc_size) { + dev_WARN_ONCE(dev, 1, + "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu.\n", + alloc_size, size); + size = alloc_size; + } + if (PageHighMem(pfn_to_page(pfn))) { /* The buffer does not have a mapping. Map it in and copy */ unsigned int offset = orig_addr & ~PAGE_MASK; @@ -647,21 +660,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * unconditional bounce may prevent leaking swiotlb content (i.e. * kernel memory) to user-space. */ - swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE); + swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } -static void validate_sync_size_and_truncate(struct device *hwdev, size_t alloc_size, size_t *size) -{ - if (*size > alloc_size) { - /* Warn and truncate mapping_size */ - dev_WARN_ONCE(hwdev, 1, - "Attempt for buffer overflow. Original size: %zu. Mapping size: %zu.\n", - alloc_size, *size); - *size = alloc_size; - } -} - /* * tlb_addr is the physical address of the bounce buffer to unmap. 
*/ @@ -672,19 +674,15 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, unsigned long flags; unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; - size_t alloc_size = io_tlb_alloc_size[index]; - int i, count, nslots = nr_slots(alloc_size + offset); - - validate_sync_size_and_truncate(hwdev, alloc_size, &mapping_size); + int nslots = nr_slots(io_tlb_alloc_size[index] + offset); + int count, i; /* * First, sync the memory before unmapping the entry */ - if (orig_addr != INVALID_PHYS_ADDR && - !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) - swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_FROM_DEVICE); + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(hwdev, tlb_addr, mapping_size, DMA_FROM_DEVICE); /* * Return the buffer to the free list by setting the corresponding @@ -724,27 +722,16 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir, enum dma_sync_target target) { - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; - size_t alloc_size = io_tlb_alloc_size[index]; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; - - if (orig_addr == INVALID_PHYS_ADDR) - return; - - validate_sync_size_and_truncate(hwdev, alloc_size, &size); - switch (target) { case SYNC_FOR_CPU: if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_FROM_DEVICE); + swiotlb_bounce(hwdev, tlb_addr, size, DMA_FROM_DEVICE); else BUG_ON(dir != DMA_TO_DEVICE); break; case SYNC_FOR_DEVICE: if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_TO_DEVICE); + swiotlb_bounce(hwdev, tlb_addr, size, DMA_TO_DEVICE); else BUG_ON(dir != DMA_FROM_DEVICE); break; -- Gitee From ee07aa625dbf249381b702591b4722e8308daf59 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Mar 2021 08:44:26 +0100 Subject: [PATCH 1421/2161] swiotlb: split swiotlb_tbl_sync_single commit 80808d273a3f075196d1a26463f65d4c9d2891c8 upstream. Split swiotlb_tbl_sync_single into two separate funtions for the to device and to cpu synchronization. 
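For code following this API change, the fragment below contrasts the two calling conventions. It is not taken from the tree; the wrapper functions are hypothetical and only the two swiotlb_* calls correspond to real interfaces (the enum-based one removed here, the direction-specific one added).

/* Hypothetical caller, shown only to illustrate the interface change. */

/* Before: one entry point, direction of the bounce selected by an enum. */
static void sync_for_device_old(struct device *dev, phys_addr_t paddr,
				size_t size, enum dma_data_direction dir)
{
	swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);
}

/* After: the bounce direction is encoded in the function name. */
static void sync_for_device_new(struct device *dev, phys_addr_t paddr,
				size_t size, enum dma_data_direction dir)
{
	swiotlb_sync_single_for_device(dev, paddr, size, dir);
}

Encoding the direction in the name removes the enum dma_sync_target plumbing and leaves each helper with a single BUG_ON for a mismatched direction, as the hunks below show.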
Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 12 ++++++------ include/linux/dma-direct.h | 4 ++-- include/linux/swiotlb.h | 17 ++++------------- kernel/dma/direct.c | 8 ++++---- kernel/dma/swiotlb.c | 34 +++++++++++++++------------------- 5 files changed, 31 insertions(+), 44 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 760b5606fc50..11af9b916f67 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -779,7 +779,7 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev, arch_sync_dma_for_cpu(phys, size, dir); if (is_swiotlb_buffer(phys)) - swiotlb_tbl_sync_single(dev, phys, size, dir, SYNC_FOR_CPU); + swiotlb_sync_single_for_cpu(dev, phys, size, dir); } static void iommu_dma_sync_single_for_device(struct device *dev, @@ -792,7 +792,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev, phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); if (is_swiotlb_buffer(phys)) - swiotlb_tbl_sync_single(dev, phys, size, dir, SYNC_FOR_DEVICE); + swiotlb_sync_single_for_device(dev, phys, size, dir); if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(phys, size, dir); @@ -813,8 +813,8 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev, arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); if (is_swiotlb_buffer(sg_phys(sg))) - swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length, - dir, SYNC_FOR_CPU); + swiotlb_sync_single_for_cpu(dev, sg_phys(sg), + sg->length, dir); } } @@ -830,8 +830,8 @@ static void iommu_dma_sync_sg_for_device(struct device *dev, for_each_sg(sgl, sg, nelems, i) { if (is_swiotlb_buffer(sg_phys(sg))) - swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length, - dir, SYNC_FOR_DEVICE); + swiotlb_sync_single_for_device(dev, sg_phys(sg), + sg->length, dir); if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index c2b62f684a0a..d63ea014a272 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -128,7 +128,7 @@ static inline void dma_direct_sync_single_for_device(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, addr); if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); + swiotlb_sync_single_for_device(dev, paddr, size, dir); if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(paddr, size, dir); @@ -145,7 +145,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev, } if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); + swiotlb_sync_single_for_cpu(dev, paddr, size, dir); } static inline dma_addr_t dma_direct_map_page(struct device *dev, diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index c780697a6cc0..484e41f835f1 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -41,14 +41,6 @@ unsigned long swiotlb_size_or_default(void); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); extern void __init swiotlb_update_mem_attributes(void); -/* - * Enumeration for sync targets - */ -enum dma_sync_target { - SYNC_FOR_CPU = 0, - SYNC_FOR_DEVICE = 1, -}; - phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, size_t mapping_size, size_t alloc_size, enum dma_data_direction dir, unsigned long attrs); @@ -59,11 +51,10 @@ extern void swiotlb_tbl_unmap_single(struct 
device *hwdev, enum dma_data_direction dir, unsigned long attrs); -extern void swiotlb_tbl_sync_single(struct device *hwdev, - phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir, - enum dma_sync_target target); - +void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir); +void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir); dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys, size_t size, enum dma_data_direction dir, unsigned long attrs); diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 822e618067ce..b75b40a80ef8 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -340,8 +340,8 @@ void dma_direct_sync_sg_for_device(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, sg->length, - dir, SYNC_FOR_DEVICE); + swiotlb_sync_single_for_device(dev, paddr, sg->length, + dir); if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(paddr, sg->length, @@ -366,8 +366,8 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, arch_sync_dma_for_cpu(paddr, sg->length, dir); if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, sg->length, dir, - SYNC_FOR_CPU); + swiotlb_sync_single_for_cpu(dev, paddr, sg->length, + dir); } if (!dev_is_dma_coherent(dev)) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 80ec08fb78a7..870a58138592 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -718,26 +718,22 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, spin_unlock_irqrestore(&io_tlb_lock, flags); } -void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir, - enum dma_sync_target target) +void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir) { - switch (target) { - case SYNC_FOR_CPU: - if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(hwdev, tlb_addr, size, DMA_FROM_DEVICE); - else - BUG_ON(dir != DMA_TO_DEVICE); - break; - case SYNC_FOR_DEVICE: - if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(hwdev, tlb_addr, size, DMA_TO_DEVICE); - else - BUG_ON(dir != DMA_FROM_DEVICE); - break; - default: - BUG(); - } + if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) + swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); + else + BUG_ON(dir != DMA_FROM_DEVICE); +} + +void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir) +{ + if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) + swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE); + else + BUG_ON(dir != DMA_TO_DEVICE); } /* -- Gitee From 1210a53bf06d1cb921d62107fb413d0f2ca8acce Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Thu, 22 Apr 2021 16:14:53 +0800 Subject: [PATCH 1422/2161] swiotlb: Fix the type of index commit 95b079d8215b83b37fa59341fda92fcb9392f14a upstream. Fix the type of index from unsigned int to int since find_slots() might return -1. 
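The failure mode is easier to see in a self-contained sketch (illustrative only, not kernel code, with find_slots() reduced to a stub that always fails): with an unsigned index, the equality test against -1 only succeeds through wrap-around, and a range check such as index < 0 can never fire.

#include <stdio.h>

static int find_slots(void)
{
	return -1;			/* no free slots */
}

int main(void)
{
	unsigned int uindex = find_slots();	/* -1 wraps to UINT_MAX */
	int index = find_slots();

	if (uindex == -1)		/* true, but only via unsigned wrap-around */
		puts("unsigned: equality check happens to work");
	if (uindex < 0)			/* always false: the error path is unreachable */
		puts("unsigned: never printed");
	if (index < 0)			/* a signed index keeps the failure explicit */
		puts("signed: failure detected");
	return 0;
}

Making the local a plain int keeps the -1 sentinel well-formed and avoids the sign-compare trap.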
Fixes: 26a7e094783d ("swiotlb: refactor swiotlb_tbl_map_single") Reviewed-by: Christoph Hellwig Signed-off-by: Claire Chang Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/dma/swiotlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 870a58138592..0927626ba06d 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -620,7 +620,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, enum dma_data_direction dir, unsigned long attrs) { unsigned int offset = swiotlb_align_offset(dev, orig_addr); - unsigned int index, i; + unsigned int i; + int index; phys_addr_t tlb_addr; if (no_iotlb_memory) -- Gitee From 4f3167ec57467e2aec548bee9919e7af0b0871b8 Mon Sep 17 00:00:00 2001 From: David Stevens Date: Wed, 29 Sep 2021 11:32:54 +0900 Subject: [PATCH 1423/2161] iommu/dma: Fix sync_sg with swiotlb commit 08ae5d4a1ae96b72222e7b02d072bb997ff29dac upstream. The is_swiotlb_buffer function takes the physical address of the swiotlb buffer, not the physical address of the original buffer. The sglist contains the physical addresses of the original buffer, so for the sync_sg functions to work properly when a bounce buffer might have been used, we need to use iommu_iova_to_phys to look up the physical address. This is what sync_single does, so call that function on each sglist segment. The previous code mostly worked because swiotlb does the transfer on map and unmap. However, any callers which use DMA_ATTR_SKIP_CPU_SYNC with sglists or which call sync_sg would not have had anything copied to the bounce buffer. Fixes: 82612d66d51d ("iommu: Allow the iommu/dma api to use bounce buffers") Signed-off-by: David Stevens Reviewed-by: Robin Murphy Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210929023300.335969-2-stevensd@google.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 11af9b916f67..2d10ba70ec17 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -805,17 +805,13 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg; int i; - if (dev_is_dma_coherent(dev) && !dev_is_untrusted(dev)) - return; - - for_each_sg(sgl, sg, nelems, i) { - if (!dev_is_dma_coherent(dev)) + if (dev_is_untrusted(dev)) + for_each_sg(sgl, sg, nelems, i) + iommu_dma_sync_single_for_cpu(dev, sg_dma_address(sg), + sg->length, dir); + else if (!dev_is_dma_coherent(dev)) + for_each_sg(sgl, sg, nelems, i) arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); - - if (is_swiotlb_buffer(sg_phys(sg))) - swiotlb_sync_single_for_cpu(dev, sg_phys(sg), - sg->length, dir); - } } static void iommu_dma_sync_sg_for_device(struct device *dev, @@ -825,17 +821,14 @@ static void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg; int i; - if (dev_is_dma_coherent(dev) && !dev_is_untrusted(dev)) - return; - - for_each_sg(sgl, sg, nelems, i) { - if (is_swiotlb_buffer(sg_phys(sg))) - swiotlb_sync_single_for_device(dev, sg_phys(sg), - sg->length, dir); - - if (!dev_is_dma_coherent(dev)) + if (dev_is_untrusted(dev)) + for_each_sg(sgl, sg, nelems, i) + iommu_dma_sync_single_for_device(dev, + sg_dma_address(sg), + sg->length, dir); + else if (!dev_is_dma_coherent(dev)) + for_each_sg(sgl, sg, nelems, i) 
arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); - } } static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, -- Gitee From fdfb7ebe643894f014e85da2eec877beeb1f34db Mon Sep 17 00:00:00 2001 From: David Stevens Date: Wed, 29 Sep 2021 11:32:55 +0900 Subject: [PATCH 1424/2161] iommu/dma: Fix arch_sync_dma for map commit 06e620345d544e559b2961cb5a676ec9c80c8950 upstream. When calling arch_sync_dma, we need to pass it the memory that's actually being used for dma. When using swiotlb bounce buffers, this is the bounce buffer. Move arch_sync_dma into the __iommu_dma_map_swiotlb helper, so it can use the bounce buffer address if necessary. Now that iommu_dma_map_sg delegates to a function which takes care of architectural syncing in the untrusted device case, the call to iommu_dma_sync_sg_for_device can be moved so it only occurs for trusted devices. Doing the sync for untrusted devices before mapping never really worked, since it needs to be able to target swiotlb buffers. This also moves the architectural sync to before the call to __iommu_dma_map, to guarantee that untrusted devices can't see stale data they shouldn't see. Fixes: 82612d66d51d ("iommu: Allow the iommu/dma api to use bounce buffers") Signed-off-by: David Stevens Reviewed-by: Christoph Hellwig Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/20210929023300.335969-3-stevensd@google.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 2d10ba70ec17..5640140e8bcc 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -592,6 +592,9 @@ static dma_addr_t __iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys, memset(padding_start, 0, padding_size); } + if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(phys, org_size, dir); + iova = __iommu_dma_map(dev, phys, aligned_size, prot, dma_mask); if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(phys)) swiotlb_tbl_unmap_single(dev, phys, org_size, dir, attrs); @@ -837,14 +840,9 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, { phys_addr_t phys = page_to_phys(page) + offset; bool coherent = dev_is_dma_coherent(dev); - dma_addr_t dma_handle; - dma_handle = __iommu_dma_map_swiotlb(dev, phys, size, dma_get_mask(dev), + return __iommu_dma_map_swiotlb(dev, phys, size, dma_get_mask(dev), coherent, dir, attrs); - if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - dma_handle != DMA_MAPPING_ERROR) - arch_sync_dma_for_device(phys, size, dir); - return dma_handle; } static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, @@ -990,12 +988,12 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, goto out; } - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - iommu_dma_sync_sg_for_device(dev, sg, nents, dir); - if (dev_is_untrusted(dev)) return iommu_dma_map_sg_swiotlb(dev, sg, nents, dir, attrs); + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + iommu_dma_sync_sg_for_device(dev, sg, nents, dir); + /* * Work out how much IOVA space we need, and align the segments to * IOVA granules for the IOMMU driver to handle. 
With some clever -- Gitee From aba3adf75e775a6512c61540572458340bed220b Mon Sep 17 00:00:00 2001 From: David Stevens Date: Wed, 29 Sep 2021 11:32:56 +0900 Subject: [PATCH 1425/2161] iommu/dma: Skip extra sync during unmap w/swiotlb commit ee9d4097cc145dcaebedf6b113d17c91c21333a0 upstream. Calling the iommu_dma_sync_*_for_cpu functions during unmap can cause two copies out of the swiotlb buffer. Do the arch sync directly in __iommu_dma_unmap_swiotlb instead to avoid this. This makes the call to iommu_dma_sync_sg_for_cpu for untrusted devices in iommu_dma_unmap_sg no longer necessary, so move that invocation later in the function. Signed-off-by: David Stevens Reviewed-by: Christoph Hellwig Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/20210929023300.335969-4-stevensd@google.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 5640140e8bcc..5d93c9fb55fa 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -520,6 +520,9 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr, if (WARN_ON(!phys)) return; + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev)) + arch_sync_dma_for_cpu(phys, size, dir); + __iommu_dma_unmap(dev, dma_addr, size); if (unlikely(is_swiotlb_buffer(phys))) @@ -848,8 +851,6 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs) { - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - iommu_dma_sync_single_for_cpu(dev, dma_handle, size, dir); __iommu_dma_unmap_swiotlb(dev, dma_handle, size, dir, attrs); } @@ -1066,14 +1067,14 @@ static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, struct scatterlist *tmp; int i; - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - iommu_dma_sync_sg_for_cpu(dev, sg, nents, dir); - if (dev_is_untrusted(dev)) { iommu_dma_unmap_sg_swiotlb(dev, sg, nents, dir, attrs); return; } + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + iommu_dma_sync_sg_for_cpu(dev, sg, nents, dir); + /* * The scatterlist segments are mapped into a single * contiguous IOVA allocation, so this is incredibly easy. -- Gitee From 64a152de7805ba55464d29e47d46709d1264bcca Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 28 Sep 2021 17:22:29 -0500 Subject: [PATCH 1426/2161] iommu/dma: Use kvcalloc() instead of kvzalloc() commit ab6f4b001c8c726b5e0bb01429710dc61f13e24d upstream. Use 2-factor argument form kvcalloc() instead of kvzalloc(). Link: https://github.com/KSPP/linux/issues/162 Signed-off-by: Gustavo A. R. 
Silva Acked-by: Robin Murphy Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20210928222229.GA280355@embeddedor Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 5d93c9fb55fa..34e4bfbbd9e1 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -621,7 +621,7 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev, if (!order_mask) return NULL; - pages = kvzalloc(count * sizeof(*pages), GFP_KERNEL); + pages = kvcalloc(count, sizeof(*pages), GFP_KERNEL); if (!pages) return NULL; -- Gitee From 8a50e72476df8c39184f951dd7ee7effab79d8c5 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 31 May 2022 16:50:37 +0800 Subject: [PATCH 1427/2161] Revert "iommu/sva: Assign a PASID to mm on PASID allocation and free it on mm exit" commit opencloudos. This reverts commit 0ed3e1e3c618b8069f5b57aac5340e3efd1ea2bc. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 4 ++-- drivers/iommu/intel/svm.c | 9 ++++++++ drivers/iommu/ioasid.c | 39 +++++++++++++++++++++++++++++++---- drivers/iommu/iommu-sva-lib.c | 39 ++++++++++++++++++++++++----------- drivers/iommu/iommu-sva-lib.h | 1 + include/linux/ioasid.h | 12 +++++++++-- include/linux/sched/mm.h | 16 -------------- kernel/fork.c | 1 - 8 files changed, 84 insertions(+), 37 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e417663e0afb..ce551bb88695 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4794,7 +4794,7 @@ static int aux_domain_add_dev(struct dmar_domain *domain, link_failed: spin_unlock_irqrestore(&device_domain_lock, flags); if (list_empty(&domain->subdevices) && domain->default_pasid > 0) - ioasid_free(domain->default_pasid); + ioasid_put(domain->default_pasid); return ret; } @@ -4824,7 +4824,7 @@ static void aux_domain_remove_dev(struct dmar_domain *domain, spin_unlock_irqrestore(&device_domain_lock, flags); if (list_empty(&domain->subdevices) && domain->default_pasid > 0) - ioasid_free(domain->default_pasid); + ioasid_put(domain->default_pasid); } static int prepare_domain_attach_device(struct iommu_domain *domain, diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 11dd207ed230..4f9c98249fd9 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -514,6 +514,11 @@ static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm, return iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1); } +static void intel_svm_free_pasid(struct mm_struct *mm) +{ + iommu_sva_free_pasid(mm); +} + static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, struct device *dev, struct mm_struct *mm, @@ -657,6 +662,8 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) kfree(svm); } } + /* Drop a PASID reference and free it if no reference. */ + intel_svm_free_pasid(mm); } out: return ret; @@ -1041,6 +1048,8 @@ struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void } sva = intel_svm_bind_mm(iommu, dev, mm, flags); + if (IS_ERR_OR_NULL(sva)) + intel_svm_free_pasid(mm); mutex_unlock(&pasid_mutex); return sva; diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index a786c034907c..06fee7416816 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -2,7 +2,7 @@ /* * I/O Address Space ID allocator. 
There is one global IOASID space, split into * subsets. Users create a subset with DECLARE_IOASID_SET, then allocate and - * free IOASIDs with ioasid_alloc() and ioasid_free(). + * free IOASIDs with ioasid_alloc and ioasid_put. */ #include #include @@ -15,6 +15,7 @@ struct ioasid_data { struct ioasid_set *set; void *private; struct rcu_head rcu; + refcount_t refs; }; /* @@ -314,6 +315,7 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, data->set = set; data->private = private; + refcount_set(&data->refs, 1); /* * Custom allocator needs allocator data to perform platform specific @@ -346,11 +348,35 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, EXPORT_SYMBOL_GPL(ioasid_alloc); /** - * ioasid_free - Free an ioasid + * ioasid_get - obtain a reference to the IOASID + * @ioasid: the ID to get + */ +void ioasid_get(ioasid_t ioasid) +{ + struct ioasid_data *ioasid_data; + + spin_lock(&ioasid_allocator_lock); + ioasid_data = xa_load(&active_allocator->xa, ioasid); + if (ioasid_data) + refcount_inc(&ioasid_data->refs); + else + WARN_ON(1); + spin_unlock(&ioasid_allocator_lock); +} +EXPORT_SYMBOL_GPL(ioasid_get); + +/** + * ioasid_put - Release a reference to an ioasid * @ioasid: the ID to remove + * + * Put a reference to the IOASID, free it when the number of references drops to + * zero. + * + * Return: %true if the IOASID was freed, %false otherwise. */ -void ioasid_free(ioasid_t ioasid) +bool ioasid_put(ioasid_t ioasid) { + bool free = false; struct ioasid_data *ioasid_data; spin_lock(&ioasid_allocator_lock); @@ -360,6 +386,10 @@ void ioasid_free(ioasid_t ioasid) goto exit_unlock; } + free = refcount_dec_and_test(&ioasid_data->refs); + if (!free) + goto exit_unlock; + active_allocator->ops->free(ioasid, active_allocator->ops->pdata); /* Custom allocator needs additional steps to free the xa element */ if (active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) { @@ -369,8 +399,9 @@ void ioasid_free(ioasid_t ioasid) exit_unlock: spin_unlock(&ioasid_allocator_lock); + return free; } -EXPORT_SYMBOL_GPL(ioasid_free); +EXPORT_SYMBOL_GPL(ioasid_put); /** * ioasid_find - Find IOASID data diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva-lib.c index 106506143896..bd41405d34e9 100644 --- a/drivers/iommu/iommu-sva-lib.c +++ b/drivers/iommu/iommu-sva-lib.c @@ -18,7 +18,8 @@ static DECLARE_IOASID_SET(iommu_sva_pasid); * * Try to allocate a PASID for this mm, or take a reference to the existing one * provided it fits within the [@min, @max] range. On success the PASID is - * available in mm->pasid and will be available for the lifetime of the mm. + * available in mm->pasid, and must be released with iommu_sva_free_pasid(). + * @min must be greater than 0, because 0 indicates an unused mm->pasid. * * Returns 0 on success and < 0 on error. */ @@ -32,24 +33,38 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max) return -EINVAL; mutex_lock(&iommu_sva_lock); - /* Is a PASID already associated with this mm? 
*/ - if (pasid_valid(mm->pasid)) { - if (mm->pasid < min || mm->pasid >= max) + if (mm->pasid) { + if (mm->pasid >= min && mm->pasid <= max) + ioasid_get(mm->pasid); + else ret = -EOVERFLOW; - goto out; + } else { + pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm); + if (pasid == INVALID_IOASID) + ret = -ENOMEM; + else + mm->pasid = pasid; } - - pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm); - if (!pasid_valid(pasid)) - ret = -ENOMEM; - else - mm_pasid_set(mm, pasid); -out: mutex_unlock(&iommu_sva_lock); return ret; } EXPORT_SYMBOL_GPL(iommu_sva_alloc_pasid); +/** + * iommu_sva_free_pasid - Release the mm's PASID + * @mm: the mm + * + * Drop one reference to a PASID allocated with iommu_sva_alloc_pasid() + */ +void iommu_sva_free_pasid(struct mm_struct *mm) +{ + mutex_lock(&iommu_sva_lock); + if (ioasid_put(mm->pasid)) + mm->pasid = 0; + mutex_unlock(&iommu_sva_lock); +} +EXPORT_SYMBOL_GPL(iommu_sva_free_pasid); + /* ioasid_find getter() requires a void * argument */ static bool __mmget_not_zero(void *mm) { diff --git a/drivers/iommu/iommu-sva-lib.h b/drivers/iommu/iommu-sva-lib.h index 8909ea1094e3..95dc3ebc1928 100644 --- a/drivers/iommu/iommu-sva-lib.h +++ b/drivers/iommu/iommu-sva-lib.h @@ -9,6 +9,7 @@ #include int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max); +void iommu_sva_free_pasid(struct mm_struct *mm); struct mm_struct *iommu_sva_find(ioasid_t pasid); /* I/O Page fault */ diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index af1c9d62e642..2237f64dbaae 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -34,7 +34,8 @@ struct ioasid_allocator_ops { #if IS_ENABLED(CONFIG_IOASID) ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, void *private); -void ioasid_free(ioasid_t ioasid); +void ioasid_get(ioasid_t ioasid); +bool ioasid_put(ioasid_t ioasid); void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, bool (*getter)(void *)); int ioasid_register_allocator(struct ioasid_allocator_ops *allocator); @@ -52,7 +53,14 @@ static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, return INVALID_IOASID; } -static inline void ioasid_free(ioasid_t ioasid) { } +static inline void ioasid_get(ioasid_t ioasid) +{ +} + +static inline bool ioasid_put(ioasid_t ioasid) +{ + return false; +} static inline void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, bool (*getter)(void *)) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 5122f54f01b1..c9fef831ba21 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -399,24 +399,8 @@ static inline void mm_pasid_init(struct mm_struct *mm) { mm->pasid = INVALID_IOASID; } - -/* Associate a PASID with an mm_struct: */ -static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid) -{ - mm->pasid = pasid; -} - -static inline void mm_pasid_drop(struct mm_struct *mm) -{ - if (pasid_valid(mm->pasid)) { - ioasid_free(mm->pasid); - mm->pasid = INVALID_IOASID; - } -} #else static inline void mm_pasid_init(struct mm_struct *mm) {} -static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid) {} -static inline void mm_pasid_drop(struct mm_struct *mm) {} #endif #endif /* _LINUX_SCHED_MM_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 41de611bae6b..09cc35f385c5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1103,7 +1103,6 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); - mm_pasid_drop(mm); mmdrop(mm); } -- Gitee From 
009713fd95a23a435fb437f1d714a9ed0451f847 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 19 Aug 2020 11:18:37 -0700 Subject: [PATCH 1428/2161] iommu/ioasid: Rename ioasid_set_data() commit ec5ae90c7ad3be7696a00e132beb4ef7aa6b4083 Intel-BKC. Rename ioasid_set_data() to ioasid_attach_data() to avoid confusion with struct ioasid_set. ioasid_set is a group of IOASIDs that share a common token. Reviewed-by: Jean-Philippe Brucker Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 6 +++--- drivers/iommu/ioasid.c | 6 +++--- include/linux/ioasid.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 4f9c98249fd9..9db8161156c3 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -398,7 +398,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, svm->gpasid = data->gpasid; svm->flags |= SVM_FLAG_GUEST_PASID; } - pasid_private_add(data->hpasid, svm); + ioasid_attach_data(data->hpasid, svm); INIT_LIST_HEAD_RCU(&svm->devs); mmput(svm->mm); } @@ -452,7 +452,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, list_add_rcu(&sdev->list, &svm->devs); out: if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) { - pasid_private_remove(data->hpasid); + ioasid_attach_data(data->hpasid, NULL); kfree(svm); } @@ -495,7 +495,7 @@ int intel_svm_unbind_gpasid(struct device *dev, u32 pasid) * the unbind, IOMMU driver will get notified * and perform cleanup. */ - pasid_private_remove(pasid); + ioasid_attach_data(pasid, NULL); kfree(svm); } } diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 06fee7416816..1e91de190112 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -259,14 +259,14 @@ void ioasid_unregister_allocator(struct ioasid_allocator_ops *ops) EXPORT_SYMBOL_GPL(ioasid_unregister_allocator); /** - * ioasid_set_data - Set private data for an allocated ioasid + * ioasid_attach_data - Set private data for an allocated ioasid * @ioasid: the ID to set data * @data: the private data * * For IOASID that is already allocated, private data can be set * via this API. Future lookup can be done via ioasid_find. 
*/ -int ioasid_set_data(ioasid_t ioasid, void *data) +int ioasid_attach_data(ioasid_t ioasid, void *data) { struct ioasid_data *ioasid_data; int ret = 0; @@ -288,7 +288,7 @@ int ioasid_set_data(ioasid_t ioasid, void *data) return ret; } -EXPORT_SYMBOL_GPL(ioasid_set_data); +EXPORT_SYMBOL_GPL(ioasid_attach_data); /** * ioasid_alloc - Allocate an IOASID diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index 2237f64dbaae..edc5ebe15da8 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -40,7 +40,7 @@ void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, bool (*getter)(void *)); int ioasid_register_allocator(struct ioasid_allocator_ops *allocator); void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator); -int ioasid_set_data(ioasid_t ioasid, void *data); +int ioasid_attach_data(ioasid_t ioasid, void *data); static inline bool pasid_valid(ioasid_t ioasid) { return ioasid != INVALID_IOASID; @@ -77,7 +77,7 @@ static inline void ioasid_unregister_allocator(struct ioasid_allocator_ops *allo { } -static inline int ioasid_set_data(ioasid_t ioasid, void *data) +static inline int ioasid_attach_data(ioasid_t ioasid, void *data) { return -ENOTSUPP; } -- Gitee From 09e51adc3b9fb0b18259f222721ba3bfaec376b1 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 22 Sep 2020 14:51:19 -0700 Subject: [PATCH 1429/2161] iommu/ioasid: Add a separate function for detach data commit 2859277290bf9685cd8c7627683f47bc9de7a525 Intel-BKC. IOASID private data can be cleared by ioasid_attach_data() with a NULL data pointer. A common use case is for a caller to free the data afterward. ioasid_attach_data() calls synchronize_rcu() before return such that free data can be sure without outstanding readers. However, since synchronize_rcu() may sleep, ioasid_attach_data() cannot be used under spinlocks. This patch adds ioasid_detach_data() as a separate API where synchronize_rcu() is called only in this case. ioasid_attach_data() can then be used under spinlocks. In addition, this change makes the API symmetrical. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 4 +-- drivers/iommu/ioasid.c | 54 +++++++++++++++++++++++++++++++-------- include/linux/ioasid.h | 5 +++- 3 files changed, 50 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 9db8161156c3..73402720541d 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -452,7 +452,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, list_add_rcu(&sdev->list, &svm->devs); out: if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) { - ioasid_attach_data(data->hpasid, NULL); + ioasid_detach_data(data->hpasid); kfree(svm); } @@ -495,7 +495,7 @@ int intel_svm_unbind_gpasid(struct device *dev, u32 pasid) * the unbind, IOMMU driver will get notified * and perform cleanup. 
*/ - ioasid_attach_data(pasid, NULL); + ioasid_detach_data(pasid); kfree(svm); } } diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 1e91de190112..b5aaf50561a9 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -273,23 +273,57 @@ int ioasid_attach_data(ioasid_t ioasid, void *data) spin_lock(&ioasid_allocator_lock); ioasid_data = xa_load(&active_allocator->xa, ioasid); - if (ioasid_data) - rcu_assign_pointer(ioasid_data->private, data); - else + + if (!ioasid_data) { ret = -ENOENT; - spin_unlock(&ioasid_allocator_lock); + goto done_unlock; + } - /* - * Wait for readers to stop accessing the old private data, so the - * caller can free it. - */ - if (!ret) - synchronize_rcu(); + if (ioasid_data->private) { + ret = -EBUSY; + goto done_unlock; + } + rcu_assign_pointer(ioasid_data->private, data); + +done_unlock: + spin_unlock(&ioasid_allocator_lock); return ret; } EXPORT_SYMBOL_GPL(ioasid_attach_data); +/** + * ioasid_detach_data - Clear the private data of an ioasid + * + * @ioasid: the IOASIDD to clear private data + */ +void ioasid_detach_data(ioasid_t ioasid) +{ + struct ioasid_data *ioasid_data; + + spin_lock(&ioasid_allocator_lock); + ioasid_data = xa_load(&active_allocator->xa, ioasid); + + if (!ioasid_data) { + pr_warn("IOASID %u not found to detach data from\n", ioasid); + goto done_unlock; + } + + if (ioasid_data->private) { + rcu_assign_pointer(ioasid_data->private, NULL); + goto done_unlock; + } + +done_unlock: + spin_unlock(&ioasid_allocator_lock); + /* + * Wait for readers to stop accessing the old private data, + * so the caller can free it. + */ + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(ioasid_detach_data); + /** * ioasid_alloc - Allocate an IOASID * @set: the IOASID set diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index edc5ebe15da8..41cb5cbd2ff4 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -45,7 +45,7 @@ static inline bool pasid_valid(ioasid_t ioasid) { return ioasid != INVALID_IOASID; } - +void ioasid_detach_data(ioasid_t ioasid); #else /* !CONFIG_IOASID */ static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, void *private) @@ -87,5 +87,8 @@ static inline bool pasid_valid(ioasid_t ioasid) return false; } +static inline void ioasid_detach_data(ioasid_t ioasid) +{ +} #endif /* CONFIG_IOASID */ #endif /* __LINUX_IOASID_H */ -- Gitee From b123521251f4a384ad1d1a7659513b2e56d2bc80 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 23 Sep 2020 13:24:24 -0700 Subject: [PATCH 1430/2161] iommu/ioasid: Support setting system-wide capacity commit 5f764744b6d08c5539a4a9838cd5c45ba5b2db39 Intel-BKC. IOASID is a system-wide resource that could vary on different systems. The default capacity is 20 bits as defined in the PCI-E specifications. This patch adds a function to allow adjusting system IOASID capacity. For VT-d this is set during boot as part of the Intel IOMMU initialization. APIs also added to support runtime capacity reservation, potentially by cgroups. 
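Aside for readers: the reserve/cancel bookkeeping described above reduces to simple arithmetic on an available-capacity counter. The sketch below is a minimal user-space model of that logic only, not the kernel code: the file name and the reserve_capacity()/cancel_capacity() helpers are invented for the illustration, the spinlock and EXPORT_SYMBOL machinery are dropped, and it only mirrors the behaviour visible in the hunks that follow (-ENOSPC on over-reservation, -EINVAL when more is returned than was taken, and 0 meaning "reserve everything left").

/* capacity_model.c - user-space model of the IOASID capacity bookkeeping
 * added by this patch.  Single-threaded, so the kernel spinlock is omitted;
 * names mirror the patch but this is an illustration, not driver code. */
#include <assert.h>
#include <errno.h>
#include <stdio.h>

#define PCI_PASID_MAX 0x100000          /* 20-bit PASID space (PCIe default) */

typedef unsigned int ioasid_t;

static ioasid_t capacity = PCI_PASID_MAX;
static ioasid_t capacity_avail = PCI_PASID_MAX;

/* Reserve nr IDs from the shared pool; 0 means "everything left". */
static int reserve_capacity(ioasid_t nr)
{
        if (nr > capacity_avail)
                return -ENOSPC;
        if (!nr)
                nr = capacity_avail;
        capacity_avail -= nr;
        return (int)nr;                 /* number of IDs actually reserved */
}

/* Return nr IDs to the pool; reject returning more than was ever taken. */
static int cancel_capacity(ioasid_t nr)
{
        if (nr + capacity_avail > capacity)
                return -EINVAL;
        capacity_avail += nr;
        return (int)capacity_avail;     /* remaining capacity after the return */
}

int main(void)
{
        assert(reserve_capacity(0x1000) == 0x1000);        /* a VM takes 4096 PASIDs */
        assert(capacity_avail == PCI_PASID_MAX - 0x1000);
        assert(reserve_capacity(PCI_PASID_MAX) == -ENOSPC); /* over-ask is refused */
        assert(cancel_capacity(0x2000) == -EINVAL);         /* can't return more than taken */
        assert(cancel_capacity(0x1000) == PCI_PASID_MAX);   /* pool is whole again */
        printf("capacity model OK, avail=%#x\n", capacity_avail);
        return 0;
}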
Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 5 +++ drivers/iommu/ioasid.c | 70 +++++++++++++++++++++++++++++++++++++ include/linux/ioasid.h | 18 ++++++++++ 3 files changed, 93 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index ce551bb88695..809ef55c5ac5 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -3428,6 +3429,10 @@ static int __init init_dmars(void) if (ret) goto free_iommu; + /* PASID is needed for scalable mode irrespective to SVM */ + if (intel_iommu_sm) + ioasid_install_capacity(intel_pasid_max_id); + /* * for each drhd * enable fault log diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index b5aaf50561a9..6bd966fc1ae4 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -10,6 +10,10 @@ #include #include +/* Default to PCIe standard 20 bit PASID */ +#define PCI_PASID_MAX 0x100000 +static ioasid_t ioasid_capacity = PCI_PASID_MAX; +static ioasid_t ioasid_capacity_avail = PCI_PASID_MAX; struct ioasid_data { ioasid_t id; struct ioasid_set *set; @@ -258,6 +262,72 @@ void ioasid_unregister_allocator(struct ioasid_allocator_ops *ops) } EXPORT_SYMBOL_GPL(ioasid_unregister_allocator); +void ioasid_install_capacity(ioasid_t total) +{ + spin_lock(&ioasid_allocator_lock); + if (ioasid_capacity && ioasid_capacity != PCI_PASID_MAX) { + pr_warn("IOASID capacity is already set.\n"); + goto done_unlock; + } + ioasid_capacity = ioasid_capacity_avail = total; +done_unlock: + spin_unlock(&ioasid_allocator_lock); +} +EXPORT_SYMBOL_GPL(ioasid_install_capacity); + +/** + * @brief Reserve capacity from the system pool + * + * @param nr_ioasid Number of IOASIDs requested to be reserved, 0 means + * reserve all remaining IDs. + * + * @return the remaining capacity on success, or errno + */ +int ioasid_reserve_capacity(ioasid_t nr_ioasid) +{ + int ret = 0; + + spin_lock(&ioasid_allocator_lock); + if (nr_ioasid > ioasid_capacity_avail) { + ret = -ENOSPC; + goto done_unlock; + } + if (!nr_ioasid) + nr_ioasid = ioasid_capacity_avail; + ioasid_capacity_avail -= nr_ioasid; + ret = nr_ioasid; +done_unlock: + spin_unlock(&ioasid_allocator_lock); + return ret; +} +EXPORT_SYMBOL_GPL(ioasid_reserve_capacity); + +/** + * @brief Return capacity to the system pool + * We trust the caller not to return more than it has reserved, we could + * also track reservation if needed. 
+ * + * @param nr_ioasid Number of IOASIDs requested to be returned + * + * @return the remaining capacity on success, or errno + */ +int ioasid_cancel_capacity(ioasid_t nr_ioasid) +{ + int ret = 0; + + spin_lock(&ioasid_allocator_lock); + if (nr_ioasid + ioasid_capacity_avail > ioasid_capacity) { + ret = -EINVAL; + goto done_unlock; + } + ioasid_capacity_avail += nr_ioasid; + ret = ioasid_capacity_avail; +done_unlock: + spin_unlock(&ioasid_allocator_lock); + return ret; +} +EXPORT_SYMBOL_GPL(ioasid_cancel_capacity); + /** * ioasid_attach_data - Set private data for an allocated ioasid * @ioasid: the ID to set data diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index 41cb5cbd2ff4..bd0a36e16e68 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -32,6 +32,10 @@ struct ioasid_allocator_ops { #define DECLARE_IOASID_SET(name) struct ioasid_set name = { 0 } #if IS_ENABLED(CONFIG_IOASID) +void ioasid_install_capacity(ioasid_t total); +int ioasid_reserve_capacity(ioasid_t nr_ioasid); +int ioasid_cancel_capacity(ioasid_t nr_ioasid); + ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, void *private); void ioasid_get(ioasid_t ioasid); @@ -47,6 +51,20 @@ static inline bool pasid_valid(ioasid_t ioasid) } void ioasid_detach_data(ioasid_t ioasid); #else /* !CONFIG_IOASID */ +static inline void ioasid_install_capacity(ioasid_t total) +{ +} + +static inline int ioasid_reserve_capacity(ioasid_t nr_ioasid) +{ + return -ENOSPC; +} + +static inline int ioasid_cancel_capacity(ioasid_t nr_ioasid) +{ + return -EINVAL; +} + static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, void *private) { -- Gitee From 379569eb50a9b1c53059350e47b83acc68cdf6fb Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 31 May 2022 17:30:10 +0800 Subject: [PATCH 1431/2161] iommu/ioasid: Delete comment line to make next patch happy commit opencloudos. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 6bd966fc1ae4..28681b99340b 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -453,7 +453,6 @@ EXPORT_SYMBOL_GPL(ioasid_alloc); /** * ioasid_get - obtain a reference to the IOASID - * @ioasid: the ID to get */ void ioasid_get(ioasid_t ioasid) { -- Gitee From 6f5521bca6c14f9c77a0da6cf597578e926c6c92 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 31 May 2022 18:03:06 +0800 Subject: [PATCH 1432/2161] iommu/ioasid: Redefine IOASID set and allocation APIs commit b489a9e49697bf962544b94114d63ea58556904e Intel-BKC. ioasid_set was introduced as an arbitrary token that is shared by a group of IOASIDs. For example, two IOASIDs allocated via the same ioasid_set pointer belong to the same set. For guest SVA usages, system-wide IOASID resources need to be partitioned such that each VM can have its own quota and being managed separately. ioasid_set is the perfect candidate for meeting such requirements. This patch redefines and extends ioasid_set with the following new fields: - Quota - Reference count - Storage of its namespace - The token is now stored in the ioasid_set with types Basic ioasid_set level APIs are introduced that wire up these new data. Existing users of IOASID APIs are converted where a host IOASID set is allocated for bare-metal usages. Including VT-d driver and iommu-sva-lib. 
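Aside: the central rule this patch introduces is that every allocation is charged against the owning set's quota, and a set can only go away once it holds no IOASIDs. The toy below models just that accounting in user space; the toy_* names are invented, token types, notifiers and RCU are left out, and handing the quota back to the system pool on set free is a simplification chosen for a balanced example rather than a claim about what the patch does.

/* set_quota_model.c - toy model of the per-set quota rules: a set is carved
 * out of the system-wide pool, allocations inside it are bounded by its
 * quota, and the set can only be freed once it is empty.  User-space
 * illustration only. */
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

typedef unsigned int ioasid_t;
#define INVALID_IOASID ((ioasid_t)-1)

static ioasid_t system_avail = 0x100000;   /* assumed 20-bit PASID space */
static ioasid_t next_id = 1;               /* stand-in for the real allocator */

struct toy_set {
        ioasid_t quota;        /* max IDs this set may hold */
        ioasid_t nr_ioasids;   /* currently allocated in the set */
};

static struct toy_set *toy_set_alloc(ioasid_t quota)
{
        struct toy_set *set;

        if (quota > system_avail)
                return NULL;                    /* models the -ENOSPC case */
        set = calloc(1, sizeof(*set));
        if (!set)
                return NULL;
        set->quota = quota;
        system_avail -= quota;                  /* quota is charged up front */
        return set;
}

static ioasid_t toy_alloc(struct toy_set *set)
{
        if (set->nr_ioasids >= set->quota)
                return INVALID_IOASID;          /* set out of quota */
        set->nr_ioasids++;
        return next_id++;
}

static void toy_free_id(struct toy_set *set)
{
        if (set->nr_ioasids)
                set->nr_ioasids--;
}

static int toy_set_free(struct toy_set *set)
{
        if (set->nr_ioasids)
                return -EBUSY;                  /* still has live IOASIDs */
        system_avail += set->quota;             /* simplification in this toy */
        free(set);
        return 0;
}

int main(void)
{
        struct toy_set *vm = toy_set_alloc(2);  /* e.g. a guest with quota 2 */

        assert(vm);
        assert(toy_alloc(vm) != INVALID_IOASID);
        assert(toy_alloc(vm) != INVALID_IOASID);
        assert(toy_alloc(vm) == INVALID_IOASID); /* third allocation hits the quota */
        assert(toy_set_free(vm) == -EBUSY);      /* cannot free a non-empty set */
        toy_free_id(vm);
        toy_free_id(vm);
        assert(toy_set_free(vm) == 0);           /* empty set can be released */
        printf("set quota model OK, system_avail=%#x\n", system_avail);
        return 0;
}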
Signed-off-by: Liu Yi L Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 45 +++++- drivers/iommu/intel/pasid.h | 4 + drivers/iommu/intel/svm.c | 93 ++++++++--- drivers/iommu/ioasid.c | 287 +++++++++++++++++++++++++++++----- drivers/iommu/iommu-sva-lib.c | 19 ++- include/linux/ioasid.h | 68 +++++++- 6 files changed, 439 insertions(+), 77 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 809ef55c5ac5..190b1e6ca274 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -88,6 +88,27 @@ #define LEVEL_STRIDE (9) #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) +/* + * This bitmap is used to advertise the page sizes our hardware support + * to the IOMMU core, which will then use this information to split + * physically contiguous memory regions it is mapping into page sizes + * that we support. + * + * Traditionally the IOMMU core just handed us the mappings directly, + * after making sure the size is an order of a 4KiB page and that the + * mapping has natural alignment. + * + * To retain this behavior, we currently advertise that we support + * all page sizes that are an order of 4KiB. + * + * If at some point we'd like to utilize the IOMMU core's new behavior, + * we could change this to advertise the real page sizes we support. + */ +#define INTEL_IOMMU_PGSIZES (~0xFFFUL) + +/* PASIDs used by host SVM */ +struct ioasid_set *host_pasid_set; + static inline int agaw_to_level(int agaw) { return agaw + 2; @@ -160,6 +181,7 @@ static void __init check_tylersburg_isoch(void); static int rwbf_quirk; static inline struct device_domain_info * dmar_search_domain_by_dev_info(int segment, int bus, int devfn); +static bool scalable_mode_support(void); /* * set to 1 to panic kernel if can't successfully enable VT-d @@ -3238,8 +3260,8 @@ static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) * Sanity check the ioasid owner is done at upper layer, e.g. VFIO * We can only free the PASID when all the devices are unbound. 
*/ - if (ioasid_find(NULL, ioasid, NULL)) { - pr_alert("Cannot free active IOASID %d\n", ioasid); + if (IS_ERR(ioasid_find(host_pasid_set, ioasid, NULL))) { + pr_err("IOASID %d to be freed but not in system set\n", ioasid); return; } vcmd_free_pasid(iommu, ioasid); @@ -3430,8 +3452,17 @@ static int __init init_dmars(void) goto free_iommu; /* PASID is needed for scalable mode irrespective to SVM */ - if (intel_iommu_sm) + if (scalable_mode_support()) { ioasid_install_capacity(intel_pasid_max_id); + /* We should not run out of IOASIDs at boot */ + host_pasid_set = ioasid_set_alloc(NULL, PID_MAX_DEFAULT, + IOASID_SET_TYPE_NULL); + if (IS_ERR_OR_NULL(host_pasid_set)) { + pr_err("Failed to allocate host PASID set %lu\n", + PTR_ERR(host_pasid_set)); + intel_iommu_sm = 0; + } + } /* * for each drhd @@ -3478,7 +3509,7 @@ static int __init init_dmars(void) disable_dmar_iommu(iommu); free_dmar_iommu(iommu); } - + ioasid_set_free(host_pasid_set); kfree(g_iommus); error: @@ -4742,7 +4773,7 @@ static int aux_domain_add_dev(struct dmar_domain *domain, u32 pasid; /* No private data needed for the default pasid */ - pasid = ioasid_alloc(NULL, PASID_MIN, + pasid = ioasid_alloc(host_pasid_set, PASID_MIN, pci_max_pasids(to_pci_dev(dev)) - 1, NULL); if (pasid == INVALID_IOASID) { @@ -4799,7 +4830,7 @@ static int aux_domain_add_dev(struct dmar_domain *domain, link_failed: spin_unlock_irqrestore(&device_domain_lock, flags); if (list_empty(&domain->subdevices) && domain->default_pasid > 0) - ioasid_put(domain->default_pasid); + ioasid_put(host_pasid_set, domain->default_pasid); return ret; } @@ -4829,7 +4860,7 @@ static void aux_domain_remove_dev(struct dmar_domain *domain, spin_unlock_irqrestore(&device_domain_lock, flags); if (list_empty(&domain->subdevices) && domain->default_pasid > 0) - ioasid_put(domain->default_pasid); + ioasid_put(host_pasid_set, domain->default_pasid); } static int prepare_domain_attach_device(struct iommu_domain *domain, diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index cec210b6348b..5d588e11937c 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -105,6 +105,10 @@ static inline u16 pasid_pte_get_pgtt(struct pasid_entry *pte) } extern unsigned int intel_pasid_max_id; +extern struct ioasid_set *host_pasid_set; +int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp); +void intel_pasid_free_id(u32 pasid); +void *intel_pasid_lookup_id(u32 pasid); int intel_pasid_alloc_table(struct device *dev); void intel_pasid_free_table(struct device *dev); struct pasid_table *intel_pasid_get_table(struct device *dev); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 73402720541d..1f2888854d54 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -282,7 +282,9 @@ static const struct mmu_notifier_ops intel_mmuops = { static DEFINE_MUTEX(pasid_mutex); -static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid, +static int pasid_to_svm_sdev(struct device *dev, + struct ioasid_set *set, + unsigned int pasid, struct intel_svm **rsvm, struct intel_svm_dev **rsdev) { @@ -296,7 +298,7 @@ static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid, if (pasid == INVALID_IOASID || pasid >= PASID_MAX) return -EINVAL; - svm = pasid_private_find(pasid); + svm = ioasid_find(set, pasid, NULL); if (IS_ERR(svm)) return PTR_ERR(svm); @@ -364,7 +366,8 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, dmar_domain = to_dmar_domain(domain); mutex_lock(&pasid_mutex); - ret = 
pasid_to_svm_sdev(dev, data->hpasid, &svm, &sdev); + ret = pasid_to_svm_sdev(dev, NULL, + data->hpasid, &svm, &sdev); if (ret) goto out; @@ -471,7 +474,7 @@ int intel_svm_unbind_gpasid(struct device *dev, u32 pasid) return -EINVAL; mutex_lock(&pasid_mutex); - ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev); + ret = pasid_to_svm_sdev(dev, NULL, pasid, &svm, &sdev); if (ret) goto out; @@ -587,17 +590,63 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, sdev->qdep = 0; } - /* Setup the pasid table: */ - sflags = (flags & SVM_FLAG_SUPERVISOR_MODE) ? - PASID_FLAG_SUPERVISOR_MODE : 0; - sflags |= cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; - spin_lock_irqsave(&iommu->lock, iflags); - ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, mm->pasid, - FLPT_DEFAULT_DID, sflags); - spin_unlock_irqrestore(&iommu->lock, iflags); + /* Finish the setup now we know we're keeping it */ + sdev->users = 1; + sdev->ops = ops; + init_rcu_head(&sdev->rcu); - if (ret) - goto free_sdev; + if (!svm) { + svm = kzalloc(sizeof(*svm), GFP_KERNEL); + if (!svm) { + ret = -ENOMEM; + kfree(sdev); + goto out; + } + + if (pasid_max > intel_pasid_max_id) + pasid_max = intel_pasid_max_id; + + /* Do not use PASID 0, reserved for RID to PASID */ + svm->pasid = ioasid_alloc(host_pasid_set, PASID_MIN, + pasid_max - 1, svm); + if (svm->pasid == INVALID_IOASID) { + kfree(svm); + kfree(sdev); + ret = -ENOSPC; + goto out; + } + svm->notifier.ops = &intel_mmuops; + svm->mm = mm; + svm->flags = flags; + INIT_LIST_HEAD_RCU(&svm->devs); + INIT_LIST_HEAD(&svm->list); + ret = -ENOMEM; + if (mm) { + ret = mmu_notifier_register(&svm->notifier, mm); + if (ret) { + ioasid_put(host_pasid_set, svm->pasid); + kfree(svm); + kfree(sdev); + goto out; + } + } + + spin_lock_irqsave(&iommu->lock, iflags); + ret = intel_pasid_setup_first_level(iommu, dev, + mm ? mm->pgd : init_mm.pgd, + svm->pasid, FLPT_DEFAULT_DID, + (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) | + (cpu_feature_enabled(X86_FEATURE_LA57) ? + PASID_FLAG_FL5LP : 0)); + spin_unlock_irqrestore(&iommu->lock, iflags); + if (ret) { + if (mm) + mmu_notifier_unregister(&svm->notifier, mm); + ioasid_put(host_pasid_set, svm->pasid); + kfree(svm); + kfree(sdev); + goto out; + } list_add_rcu(&sdev->list, &svm->devs); success: @@ -629,7 +678,8 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) if (!iommu) goto out; - ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev); + ret = pasid_to_svm_sdev(dev, host_pasid_set, + pasid, &svm, &sdev); if (ret) goto out; mm = svm->mm; @@ -651,9 +701,13 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { - if (svm->notifier.ops) - mmu_notifier_unregister(&svm->notifier, mm); - pasid_private_remove(svm->pasid); + ioasid_put(host_pasid_set, svm->pasid); + if (svm->mm) { + mmu_notifier_unregister(&svm->notifier, svm->mm); + /* Clear mm's pasid. */ + load_pasid(svm->mm, PASID_DISABLED); + } + list_del(&svm->list); /* We mandate that no page faults may be outstanding * for the PASID when intel_svm_unbind_mm() is called. * If that is not obeyed, subtle errors will happen. 
@@ -1120,7 +1174,8 @@ int intel_svm_page_response(struct device *dev, goto out; } - ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev); + ret = pasid_to_svm_sdev(dev, host_pasid_set, + prm->pasid, &svm, &sdev); if (ret || !sdev) { ret = -ENODEV; goto out; diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 28681b99340b..4199bc539a15 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* * I/O Address Space ID allocator. There is one global IOASID space, split into - * subsets. Users create a subset with DECLARE_IOASID_SET, then allocate and - * free IOASIDs with ioasid_alloc and ioasid_put. + * sets. Users create a set with ioasid_set_alloc, then allocate/free IDs + * with ioasid_alloc, ioasid_put, and ioasid_free. */ #include #include @@ -14,6 +14,7 @@ #define PCI_PASID_MAX 0x100000 static ioasid_t ioasid_capacity = PCI_PASID_MAX; static ioasid_t ioasid_capacity_avail = PCI_PASID_MAX; +static DEFINE_XARRAY_ALLOC(ioasid_sets); struct ioasid_data { ioasid_t id; struct ioasid_set *set; @@ -394,6 +395,151 @@ void ioasid_detach_data(ioasid_t ioasid) } EXPORT_SYMBOL_GPL(ioasid_detach_data); +static inline bool ioasid_set_is_valid(struct ioasid_set *set) +{ + return xa_load(&ioasid_sets, set->id) == set; +} + +/** + * ioasid_set_alloc - Allocate a new IOASID set for a given token + * + * @token: An optional arbitrary number that can be associated with the + * IOASID set. @token can be NULL if the type is + * IOASID_SET_TYPE_NULL + * @quota: Quota allowed in this set, 0 indicates no limit for the set + * @type: The type of the token used to create the IOASID set + * + * IOASID is limited system-wide resource that requires quota management. + * Token will be stored in the ioasid_set returned. A reference will be taken + * on the newly created set. Subsequent IOASID allocation within the set need + * to use the returned ioasid_set pointer. + */ +struct ioasid_set *ioasid_set_alloc(void *token, ioasid_t quota, int type) +{ + struct ioasid_set *set; + unsigned long index; + ioasid_t id; + + if (type >= IOASID_SET_TYPE_NR) + return ERR_PTR(-EINVAL); + + /* No limit for the set, use whatever is available on the system */ + if (!quota) + quota = ioasid_capacity_avail; + + spin_lock(&ioasid_allocator_lock); + if (quota > ioasid_capacity_avail) { + pr_warn("Out of IOASID capacity! ask %d, avail %d\n", + quota, ioasid_capacity_avail); + set = ERR_PTR(-ENOSPC); + goto exit_unlock; + } + + /* + * Token is only unique within its types but right now we have only + * mm type. If we have more token types, we have to match type as well. 
+ */ + switch (type) { + case IOASID_SET_TYPE_MM: + if (!token) { + set = ERR_PTR(-EINVAL); + goto exit_unlock; + } + /* Search existing set tokens, reject duplicates */ + xa_for_each(&ioasid_sets, index, set) { + if (set->token == token && set->type == IOASID_SET_TYPE_MM) { + set = ERR_PTR(-EEXIST); + goto exit_unlock; + } + } + break; + case IOASID_SET_TYPE_NULL: + if (!token) + break; + fallthrough; + default: + pr_err("Invalid token and IOASID type\n"); + set = ERR_PTR(-EINVAL); + goto exit_unlock; + } + + set = kzalloc(sizeof(*set), GFP_ATOMIC); + if (!set) { + set = ERR_PTR(-ENOMEM); + goto exit_unlock; + } + + if (xa_alloc(&ioasid_sets, &id, set, + XA_LIMIT(0, ioasid_capacity_avail), + GFP_ATOMIC)) { + kfree(set); + set = ERR_PTR(-ENOSPC); + goto exit_unlock; + } + + set->token = token; + set->type = type; + set->quota = quota; + set->id = id; + atomic_set(&set->nr_ioasids, 0); + /* + * Per set XA is used to store private IDs within the set, get ready + * for ioasid_set private ID and system-wide IOASID allocation + * results. + */ + xa_init(&set->xa); + ioasid_capacity_avail -= quota; + +exit_unlock: + spin_unlock(&ioasid_allocator_lock); + + return set; +} +EXPORT_SYMBOL_GPL(ioasid_set_alloc); + +static int ioasid_set_free_locked(struct ioasid_set *set) +{ + int ret = 0; + + if (!ioasid_set_is_valid(set)) { + ret = -EINVAL; + goto exit_done; + } + + if (atomic_read(&set->nr_ioasids)) { + ret = -EBUSY; + goto exit_done; + } + + WARN_ON(!xa_empty(&set->xa)); + /* + * Token got released right away after the ioasid_set is freed. + * If a new set is created immediately with the newly released token, + * it will not allocate the same IOASIDs unless they are reclaimed. + */ + xa_erase(&ioasid_sets, set->id); + kfree_rcu(set, rcu); +exit_done: + return ret; +}; + +/** + * @brief Free an ioasid_set if empty. Restore pending notification list. + * + * @param set to be freed + * @return + */ +int ioasid_set_free(struct ioasid_set *set) +{ + int ret = 0; + + spin_lock(&ioasid_allocator_lock); + ret = ioasid_set_free_locked(set); + spin_unlock(&ioasid_allocator_lock); + return ret; +} +EXPORT_SYMBOL_GPL(ioasid_set_free); + /** * ioasid_alloc - Allocate an IOASID * @set: the IOASID set @@ -411,11 +557,21 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, { struct ioasid_data *data; void *adata; - ioasid_t id; + ioasid_t id = INVALID_IOASID; + + spin_lock(&ioasid_allocator_lock); + /* Check if the IOASID set has been allocated and initialized */ + if (!ioasid_set_is_valid(set)) + goto done_unlock; + + if (set->quota <= atomic_read(&set->nr_ioasids)) { + pr_err("IOASID set out of quota %d\n", set->quota); + goto done_unlock; + } data = kzalloc(sizeof(*data), GFP_ATOMIC); if (!data) - return INVALID_IOASID; + goto done_unlock; data->set = set; data->private = private; @@ -425,7 +581,6 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, * Custom allocator needs allocator data to perform platform specific * operations. */ - spin_lock(&ioasid_allocator_lock); adata = active_allocator->flags & IOASID_ALLOCATOR_CUSTOM ? 
active_allocator->ops->pdata : data; id = active_allocator->ops->alloc(min, max, adata); if (id == INVALID_IOASID) { @@ -442,67 +597,121 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, } data->id = id; - spin_unlock(&ioasid_allocator_lock); - return id; + /* Store IOASID in the per set data */ + if (xa_err(xa_store(&set->xa, id, data, GFP_ATOMIC))) { + pr_err("Failed to store ioasid %d in set\n", id); + active_allocator->ops->free(id, active_allocator->ops->pdata); + goto exit_free; + } + atomic_inc(&set->nr_ioasids); + goto done_unlock; exit_free: - spin_unlock(&ioasid_allocator_lock); kfree(data); - return INVALID_IOASID; +done_unlock: + spin_unlock(&ioasid_allocator_lock); + return id; } EXPORT_SYMBOL_GPL(ioasid_alloc); +static void ioasid_do_free_locked(struct ioasid_data *data) +{ + struct ioasid_data *ioasid_data; + + active_allocator->ops->free(data->id, active_allocator->ops->pdata); + /* Custom allocator needs additional steps to free the xa element */ + if (active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) { + ioasid_data = xa_erase(&active_allocator->xa, data->id); + kfree_rcu(ioasid_data, rcu); + } + atomic_dec(&data->set->nr_ioasids); + xa_erase(&data->set->xa, data->id); + /* Destroy the set if empty */ + if (!atomic_read(&data->set->nr_ioasids)) + ioasid_set_free_locked(data->set); +} + +int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid) +{ + struct ioasid_data *data; + + data = xa_load(&active_allocator->xa, ioasid); + if (!data) { + pr_err("Trying to get unknown IOASID %u\n", ioasid); + return -EINVAL; + } + + /* Check set ownership if the set is non-null */ + if (set && data->set != set) { + pr_err("Trying to get IOASID %u outside the set\n", ioasid); + /* data found but does not belong to the set */ + return -EACCES; + } + refcount_inc(&data->refs); + + return 0; +} +EXPORT_SYMBOL_GPL(ioasid_get_locked); + /** * ioasid_get - obtain a reference to the IOASID + * @set: the ioasid_set to check permission against if not NULL + * @ioasid: the IOASID to get reference + * + * + * Return: 0 on success, error if failed. */ -void ioasid_get(ioasid_t ioasid) +int ioasid_get(struct ioasid_set *set, ioasid_t ioasid) { - struct ioasid_data *ioasid_data; + int ret; spin_lock(&ioasid_allocator_lock); - ioasid_data = xa_load(&active_allocator->xa, ioasid); - if (ioasid_data) - refcount_inc(&ioasid_data->refs); - else - WARN_ON(1); + ret = ioasid_get_locked(set, ioasid); spin_unlock(&ioasid_allocator_lock); + + return ret; } EXPORT_SYMBOL_GPL(ioasid_get); +bool ioasid_put_locked(struct ioasid_set *set, ioasid_t ioasid) +{ + struct ioasid_data *data; + + data = xa_load(&active_allocator->xa, ioasid); + if (!data) { + pr_err("Trying to put unknown IOASID %u\n", ioasid); + return false; + } + if (set && data->set != set) { + pr_err("Trying to drop IOASID %u outside the set\n", ioasid); + return false; + } + if (!refcount_dec_and_test(&data->refs)) + return false; + + ioasid_do_free_locked(data); + + return true; +} +EXPORT_SYMBOL_GPL(ioasid_put_locked); + /** * ioasid_put - Release a reference to an ioasid - * @ioasid: the ID to remove + * @set: the ioasid_set to check permission against if not NULL + * @ioasid: the IOASID to drop reference * * Put a reference to the IOASID, free it when the number of references drops to * zero. * * Return: %true if the IOASID was freed, %false otherwise. 
*/ -bool ioasid_put(ioasid_t ioasid) +bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid) { - bool free = false; - struct ioasid_data *ioasid_data; + bool ret; spin_lock(&ioasid_allocator_lock); - ioasid_data = xa_load(&active_allocator->xa, ioasid); - if (!ioasid_data) { - pr_err("Trying to free unknown IOASID %u\n", ioasid); - goto exit_unlock; - } - - free = refcount_dec_and_test(&ioasid_data->refs); - if (!free) - goto exit_unlock; - - active_allocator->ops->free(ioasid, active_allocator->ops->pdata); - /* Custom allocator needs additional steps to free the xa element */ - if (active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) { - ioasid_data = xa_erase(&active_allocator->xa, ioasid); - kfree_rcu(ioasid_data, rcu); - } - -exit_unlock: + ret = ioasid_put_locked(set, ioasid); spin_unlock(&ioasid_allocator_lock); - return free; + return ret; } EXPORT_SYMBOL_GPL(ioasid_put); diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva-lib.c index bd41405d34e9..7f97a03a135b 100644 --- a/drivers/iommu/iommu-sva-lib.c +++ b/drivers/iommu/iommu-sva-lib.c @@ -8,7 +8,16 @@ #include "iommu-sva-lib.h" static DEFINE_MUTEX(iommu_sva_lock); -static DECLARE_IOASID_SET(iommu_sva_pasid); +static struct ioasid_set *iommu_sva_pasid; + +/* Must be called before PASID allocations can occur */ +void iommu_sva_init(void) +{ + if (iommu_sva_pasid) + return; + iommu_sva_pasid = ioasid_set_alloc(NULL, 0, IOASID_SET_TYPE_NULL); +} +EXPORT_SYMBOL_GPL(iommu_sva_init); /** * iommu_sva_alloc_pasid - Allocate a PASID for the mm @@ -35,11 +44,11 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max) mutex_lock(&iommu_sva_lock); if (mm->pasid) { if (mm->pasid >= min && mm->pasid <= max) - ioasid_get(mm->pasid); + ioasid_get(iommu_sva_pasid, mm->pasid); else ret = -EOVERFLOW; } else { - pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm); + pasid = ioasid_alloc(iommu_sva_pasid, min, max, mm); if (pasid == INVALID_IOASID) ret = -ENOMEM; else @@ -59,7 +68,7 @@ EXPORT_SYMBOL_GPL(iommu_sva_alloc_pasid); void iommu_sva_free_pasid(struct mm_struct *mm) { mutex_lock(&iommu_sva_lock); - if (ioasid_put(mm->pasid)) + if (ioasid_put(iommu_sva_pasid, mm->pasid)) mm->pasid = 0; mutex_unlock(&iommu_sva_lock); } @@ -81,6 +90,6 @@ static bool __mmget_not_zero(void *mm) */ struct mm_struct *iommu_sva_find(ioasid_t pasid) { - return ioasid_find(&iommu_sva_pasid, pasid, __mmget_not_zero); + return ioasid_find(iommu_sva_pasid, pasid, __mmget_not_zero); } EXPORT_SYMBOL_GPL(iommu_sva_find); diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index bd0a36e16e68..0521742c765e 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -4,14 +4,43 @@ #include #include +#include +#include #define INVALID_IOASID ((ioasid_t)-1) typedef unsigned int ioasid_t; typedef ioasid_t (*ioasid_alloc_fn_t)(ioasid_t min, ioasid_t max, void *data); typedef void (*ioasid_free_fn_t)(ioasid_t ioasid, void *data); +/* IOASID set types */ +enum ioasid_set_type { + IOASID_SET_TYPE_NULL = 1, /* Set token is NULL */ + IOASID_SET_TYPE_MM, /* Set token is a mm_struct pointer + * i.e. associated with a process + */ + IOASID_SET_TYPE_NR, +}; + +/** + * struct ioasid_set - Meta data about ioasid_set + * @nh: List of notifiers private to that set + * @xa: XArray to store ioasid_set private IDs, can be used for + * guest-host IOASID mapping, or just a private IOASID namespace. 
+ * @token: Unique to identify an IOASID set + * @type: Token types + * @quota: Max number of IOASIDs can be allocated within the set + * @nr_ioasids: Number of IOASIDs currently allocated in the set + * @id: ID of the set + */ struct ioasid_set { - int dummy; + struct atomic_notifier_head nh; + struct xarray xa; + void *token; + int type; + int quota; + atomic_t nr_ioasids; + int id; + struct rcu_head rcu; }; /** @@ -29,17 +58,20 @@ struct ioasid_allocator_ops { void *pdata; }; -#define DECLARE_IOASID_SET(name) struct ioasid_set name = { 0 } - #if IS_ENABLED(CONFIG_IOASID) void ioasid_install_capacity(ioasid_t total); int ioasid_reserve_capacity(ioasid_t nr_ioasid); int ioasid_cancel_capacity(ioasid_t nr_ioasid); +struct ioasid_set *ioasid_set_alloc(void *token, ioasid_t quota, int type); +int ioasid_set_free(struct ioasid_set *set); +struct ioasid_set *ioasid_find_mm_set(struct mm_struct *token); ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, void *private); -void ioasid_get(ioasid_t ioasid); -bool ioasid_put(ioasid_t ioasid); +int ioasid_get(struct ioasid_set *set, ioasid_t ioasid); +int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid); +bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid); +bool ioasid_put_locked(struct ioasid_set *set, ioasid_t ioasid); void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, bool (*getter)(void *)); int ioasid_register_allocator(struct ioasid_allocator_ops *allocator); @@ -71,11 +103,33 @@ static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, return INVALID_IOASID; } -static inline void ioasid_get(ioasid_t ioasid) +static inline struct ioasid_set *ioasid_set_alloc(void *token, ioasid_t quota, + ioasid_set_type type) { + return ERR_PTR(-ENOTSUPP); +} + +static inline struct ioasid_set *ioasid_find_mm_set(struct mm_struct *token) +{ + return NULL; +} + +static inline int ioasid_get(struct ioasid_set *set, ioasid_t ioasid) +{ + return -ENOTSUPP; +} + +static inline int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid) +{ + return -ENOTSUPP; +} + +static inline bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid) +{ + return false; } -static inline bool ioasid_put(ioasid_t ioasid) +static inline bool ioasid_put_locked(struct ioasid_set *set, ioasid_t ioasid) { return false; } -- Gitee From e20fe666ea28b4b06e179d5e235189a66b973cb5 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 15 Jul 2022 16:11:27 +0800 Subject: [PATCH 1433/2161] iommu: Resolve merge conflict for intel_svm_bind_mm() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. Resolve merge conflict for intel_svm_bind_mm(). 
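Aside, for reading the converted SVM code below: the previous patch changed the reference API so that ioasid_get() takes the owning set and returns an errno, while ioasid_put() reports whether the last reference was dropped. The user-space sketch below illustrates only that contract; the toy_* names and the bare pointer used as a set token are invented for the example and carry none of the locking or error reporting of the real code.

/* refcount_model.c - illustration of the set-scoped get/put semantics:
 * get fails if the IOASID belongs to a different set, and put reports
 * whether the final reference was dropped.  Not kernel code; refcount_t
 * is replaced by a plain counter and there is no locking. */
#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_ioasid {
        const void *set;       /* owning set token */
        unsigned int refs;     /* starts at 1 when allocated */
};

static int toy_get(struct toy_ioasid *id, const void *set)
{
        if (set && id->set != set)
                return -EACCES;    /* caller is not in the owning set */
        id->refs++;
        return 0;
}

/* Returns true when the last reference is gone and the ID would be freed. */
static bool toy_put(struct toy_ioasid *id, const void *set)
{
        if (set && id->set != set)
                return false;
        if (--id->refs)
                return false;
        /* here the real code releases the ID back to the allocator and set */
        return true;
}

int main(void)
{
        int set_a, set_b;                        /* opaque stand-ins for ioasid_set */
        struct toy_ioasid id = { .set = &set_a, .refs = 1 };

        assert(toy_get(&id, &set_b) == -EACCES); /* wrong set is rejected */
        assert(toy_get(&id, &set_a) == 0);       /* refs: 1 -> 2 */
        assert(toy_get(&id, NULL) == 0);         /* NULL set skips the ownership check */
        assert(!toy_put(&id, &set_a));           /* 3 -> 2, not yet freed */
        assert(!toy_put(&id, &set_a));           /* 2 -> 1 */
        assert(toy_put(&id, &set_a));            /* 1 -> 0, reported as freed */
        printf("get/put model OK\n");
        return 0;
}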
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 192 ++++++++++++++++++++------------------ 1 file changed, 103 insertions(+), 89 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 1f2888854d54..06d993e21511 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -522,67 +522,79 @@ static void intel_svm_free_pasid(struct mm_struct *mm) iommu_sva_free_pasid(mm); } -static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, - struct device *dev, - struct mm_struct *mm, - unsigned int flags) +/* Caller must hold pasid_mutex, mm reference */ +static int +intel_svm_bind_mm(struct device *dev, unsigned int flags, + struct mm_struct *mm, struct intel_svm_dev **sd) { - struct device_domain_info *info = get_domain_info(dev); - unsigned long iflags, sflags; + struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); + struct intel_svm *svm = NULL, *t; + struct device_domain_info *info; struct intel_svm_dev *sdev; - struct intel_svm *svm; - int ret = 0; + unsigned long iflags; + int pasid_max; + int ret; - svm = pasid_private_find(mm->pasid); - if (!svm) { - svm = kzalloc(sizeof(*svm), GFP_KERNEL); - if (!svm) - return ERR_PTR(-ENOMEM); + if (!iommu || dmar_disabled) + return -EINVAL; - svm->pasid = mm->pasid; - svm->mm = mm; - svm->flags = flags; - INIT_LIST_HEAD_RCU(&svm->devs); + if (!intel_svm_capable(iommu)) + return -ENOTSUPP; - if (!(flags & SVM_FLAG_SUPERVISOR_MODE)) { - svm->notifier.ops = &intel_mmuops; - ret = mmu_notifier_register(&svm->notifier, mm); - if (ret) { - kfree(svm); - return ERR_PTR(ret); - } - } + if (dev_is_pci(dev)) { + pasid_max = pci_max_pasids(to_pci_dev(dev)); + if (pasid_max < 0) + return -EINVAL; + } else + pasid_max = 1 << 20; - ret = pasid_private_add(svm->pasid, svm); - if (ret) { - if (svm->notifier.ops) - mmu_notifier_unregister(&svm->notifier, mm); - kfree(svm); - return ERR_PTR(ret); + /* Bind supervisor PASID shuld have mm = NULL */ + if (flags & SVM_FLAG_SUPERVISOR_MODE) { + if (!ecap_srs(iommu->ecap) || mm) { + pr_err("Supervisor PASID with user provided mm.\n"); + return -EINVAL; } } - /* Find the matching device in svm list */ - sdev = svm_lookup_device_by_dev(svm, dev); - if (sdev) { - sdev->users++; - goto success; + list_for_each_entry(t, &global_svm_list, list) { + if (t->mm != mm) + continue; + + svm = t; + if (svm->pasid >= pasid_max) { + dev_warn(dev, + "Limited PASID width. 
Cannot use existing PASID %d\n", + svm->pasid); + ret = -ENOSPC; + goto out; + } + + /* Find the matching device in svm list */ + for_each_svm_dev(sdev, svm, dev) { + sdev->users++; + goto success; + } + + break; } sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); if (!sdev) { ret = -ENOMEM; - goto free_svm; + goto out; } - sdev->dev = dev; sdev->iommu = iommu; + + ret = intel_iommu_enable_pasid(iommu, dev); + if (ret) { + kfree(sdev); + goto out; + } + + info = get_domain_info(dev); sdev->did = FLPT_DEFAULT_DID; sdev->sid = PCI_DEVID(info->bus, info->devfn); - sdev->users = 1; - sdev->pasid = svm->pasid; - sdev->sva.dev = dev; - init_rcu_head(&sdev->rcu); if (info->ats_enabled) { sdev->dev_iotlb = 1; sdev->qdep = info->ats_qdep; @@ -592,7 +604,6 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, /* Finish the setup now we know we're keeping it */ sdev->users = 1; - sdev->ops = ops; init_rcu_head(&sdev->rcu); if (!svm) { @@ -648,21 +659,38 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, goto out; } + list_add_tail(&svm->list, &global_svm_list); + if (mm) { + /* The newly allocated pasid is loaded to the mm. */ + load_pasid(mm, svm->pasid); + } + } else { + /* + * Binding a new device with existing PASID, need to setup + * the PASID entry. + */ + spin_lock_irqsave(&iommu->lock, iflags); + ret = intel_pasid_setup_first_level(iommu, dev, + mm ? mm->pgd : init_mm.pgd, + svm->pasid, FLPT_DEFAULT_DID, + (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) | + (cpu_feature_enabled(X86_FEATURE_LA57) ? + PASID_FLAG_FL5LP : 0)); + spin_unlock_irqrestore(&iommu->lock, iflags); + if (ret) { + kfree(sdev); + goto out; + } + } list_add_rcu(&sdev->list, &svm->devs); success: - return &sdev->sva; - -free_sdev: - kfree(sdev); -free_svm: - if (list_empty(&svm->devs)) { - if (svm->notifier.ops) - mmu_notifier_unregister(&svm->notifier, mm); - pasid_private_remove(mm->pasid); - kfree(svm); - } - - return ERR_PTR(ret); + sdev->pasid = svm->pasid; + sdev->sva.dev = dev; + if (sd) + *sd = sdev; + ret = 0; +out: + return ret; } /* Caller must hold pasid_mutex */ @@ -671,7 +699,6 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) struct intel_svm_dev *sdev; struct intel_iommu *iommu; struct intel_svm *svm; - struct mm_struct *mm; int ret = -EINVAL; iommu = device_to_iommu(dev, NULL, NULL); @@ -682,7 +709,6 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) pasid, &svm, &sdev); if (ret) goto out; - mm = svm->mm; if (sdev) { sdev->users--; @@ -716,8 +742,6 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) kfree(svm); } } - /* Drop a PASID reference and free it if no reference. */ - intel_svm_free_pasid(mm); } out: return ret; @@ -1068,42 +1092,31 @@ static irqreturn_t prq_event_thread(int irq, void *d) return IRQ_RETVAL(handled); } -struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) +#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva) +struct iommu_sva * +intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) { - struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); + struct iommu_sva *sva = ERR_PTR(-EINVAL); + struct intel_svm_dev *sdev = NULL; unsigned int flags = 0; - struct iommu_sva *sva; int ret; + /* + * TODO: Consolidate with generic iommu-sva bind after it is merged. + * It will require shared SVM data structures, i.e. combine io_mm + * and intel_svm etc. 
+ */ if (drvdata) flags = *(unsigned int *)drvdata; - - if (flags & SVM_FLAG_SUPERVISOR_MODE) { - if (!ecap_srs(iommu->ecap)) { - dev_err(dev, "%s: Supervisor PASID not supported\n", - iommu->name); - return ERR_PTR(-EOPNOTSUPP); - } - - if (mm) { - dev_err(dev, "%s: Supervisor PASID with user provided mm\n", - iommu->name); - return ERR_PTR(-EINVAL); - } - - mm = &init_mm; - } - mutex_lock(&pasid_mutex); - ret = intel_svm_alloc_pasid(dev, mm, flags); - if (ret) { - mutex_unlock(&pasid_mutex); - return ERR_PTR(ret); - } + ret = intel_svm_bind_mm(dev, flags, mm, &sdev); + if (ret) + sva = ERR_PTR(ret); + else if (sdev) + sva = &sdev->sva; + else + WARN(!sdev, "SVM bind succeeded with no sdev!\n"); - sva = intel_svm_bind_mm(iommu, dev, mm, flags); - if (IS_ERR_OR_NULL(sva)) - intel_svm_free_pasid(mm); mutex_unlock(&pasid_mutex); return sva; @@ -1111,9 +1124,10 @@ struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void void intel_svm_unbind(struct iommu_sva *sva) { - struct intel_svm_dev *sdev = to_intel_svm_dev(sva); + struct intel_svm_dev *sdev; mutex_lock(&pasid_mutex); + sdev = to_intel_svm_dev(sva); intel_svm_unbind_mm(sdev->dev, sdev->pasid); mutex_unlock(&pasid_mutex); } -- Gitee From 48491c42e4a610e8272598024fb91e5d13c39f76 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 20 Jul 2022 16:25:33 +0800 Subject: [PATCH 1434/2161] iommu: Resolve merge conflict for load_pasid() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. Resolve merge conflict for load_pasid(), Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 06d993e21511..1e04183bab6a 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -508,18 +508,22 @@ int intel_svm_unbind_gpasid(struct device *dev, u32 pasid) return ret; } -static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm, - unsigned int flags) +static void _load_pasid(void *unused) { - ioasid_t max_pasid = dev_is_pci(dev) ? - pci_max_pasids(to_pci_dev(dev)) : intel_pasid_max_id; - - return iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1); +// update_pasid(); } -static void intel_svm_free_pasid(struct mm_struct *mm) +static void load_pasid(struct mm_struct *mm, u32 pasid) { - iommu_sva_free_pasid(mm); + mutex_lock(&mm->context.lock); + + /* Synchronize with READ_ONCE in update_pasid(). */ + smp_store_release(&mm->pasid, pasid); + + /* Update PASID MSR on all CPUs running the mm's tasks. */ + on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true); + + mutex_unlock(&mm->context.lock); } /* Caller must hold pasid_mutex, mm reference */ -- Gitee From aaef2cc169298104da32b6b98bc9aae66d76cbd0 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 17 Feb 2021 13:42:05 -0800 Subject: [PATCH 1435/2161] iommu/ioasid: Add free function and states commit fc3cbe05486c2ecaa322c2ee68f2362e5704a124 Intel-BKC. When an actively used IOASID is freed due to exceptions, users must be notified to perform the cleanup. The IOASID shall be put in a pending state until all users completed their cleanup work. This patch adds ioasid_free() function to let the caller initiate the freeing process. Both ioasid_free() and ioasid_put() decrements reference counts. Unlike ioasid_put(), the ioasid_free() function also transition the IOASID to the free pending state where further ioasid_get() is prohibited. 
This paves the way for FREE event notifications that will be introduced next. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid.c | 73 ++++++++++++++++++++++++++++++++++++++++++ include/linux/ioasid.h | 5 +++ 2 files changed, 78 insertions(+) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 4199bc539a15..df998d0cfd99 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -15,8 +15,26 @@ static ioasid_t ioasid_capacity = PCI_PASID_MAX; static ioasid_t ioasid_capacity_avail = PCI_PASID_MAX; static DEFINE_XARRAY_ALLOC(ioasid_sets); + +enum ioasid_state { + IOASID_STATE_IDLE, + IOASID_STATE_ACTIVE, + IOASID_STATE_FREE_PENDING, +}; + +/** + * struct ioasid_data - Meta data about ioasid + * + * @id: Unique ID + * @refs: Number of active users + * @state: Track state of the IOASID + * @set: ioasid_set of the IOASID belongs to + * @private: Private data associated with the IOASID + * @rcu: For free after RCU grace period + */ struct ioasid_data { ioasid_t id; + enum ioasid_state state; struct ioasid_set *set; void *private; struct rcu_head rcu; @@ -596,6 +614,7 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, goto exit_free; } data->id = id; + data->state = IOASID_STATE_IDLE; /* Store IOASID in the per set data */ if (xa_err(xa_store(&set->xa, id, data, GFP_ATOMIC))) { @@ -630,6 +649,56 @@ static void ioasid_do_free_locked(struct ioasid_data *data) ioasid_set_free_locked(data->set); } +static void ioasid_free_locked(struct ioasid_set *set, ioasid_t ioasid) +{ + struct ioasid_data *data; + + data = xa_load(&active_allocator->xa, ioasid); + if (!data) { + pr_err("Trying to free unknown IOASID %u\n", ioasid); + return; + } + if (data->set != set) { + pr_warn("Cannot free IOASID %u due to set ownership\n", ioasid); + return; + } + /* Check if the set exists */ + if (WARN_ON(!xa_load(&ioasid_sets, data->set->id))) + return; + + /* Free is already in progress */ + if (data->state == IOASID_STATE_FREE_PENDING) + return; + + data->state = IOASID_STATE_FREE_PENDING; + /* + * If the refcount is 1, it means there is no other users of the IOASID + * other than IOASID core itself. There is no need to notify anyone. + */ + if (!refcount_dec_and_test(&data->refs)) + return; + + ioasid_do_free_locked(data); +} + +/** + * ioasid_free - Drop reference on an IOASID. Free if refcount drops to 0, + * including free from its set and system-wide list. + * @set: The ioasid_set to check permission with. If not NULL, IOASID + * free will fail if the set does not match. + * @ioasid: The IOASID to remove + * + * TODO: return true if all references dropped, false if async work is in + * progress, IOASID is in FREE_PENDING state. wait queue to be used for blocking + * free task. 
+ */ +void ioasid_free(struct ioasid_set *set, ioasid_t ioasid) +{ + spin_lock(&ioasid_allocator_lock); + ioasid_free_locked(set, ioasid); + spin_unlock(&ioasid_allocator_lock); +} +EXPORT_SYMBOL_GPL(ioasid_free); int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid) { struct ioasid_data *data; @@ -639,6 +708,10 @@ int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid) pr_err("Trying to get unknown IOASID %u\n", ioasid); return -EINVAL; } + if (data->state == IOASID_STATE_FREE_PENDING) { + pr_err("Trying to get IOASID being freed%u\n", ioasid); + return -EBUSY; + } /* Check set ownership if the set is non-null */ if (set && data->set != set) { diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index 0521742c765e..39233abd694f 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -72,6 +72,7 @@ int ioasid_get(struct ioasid_set *set, ioasid_t ioasid); int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid); bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid); bool ioasid_put_locked(struct ioasid_set *set, ioasid_t ioasid); +void ioasid_free(struct ioasid_set *set, ioasid_t ioasid); void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, bool (*getter)(void *)); int ioasid_register_allocator(struct ioasid_allocator_ops *allocator); @@ -109,6 +110,10 @@ static inline struct ioasid_set *ioasid_set_alloc(void *token, ioasid_t quota, return ERR_PTR(-ENOTSUPP); } +static inline void ioasid_free(struct ioasid_set *set, ioasid_t ioasid) +{ +} + static inline struct ioasid_set *ioasid_find_mm_set(struct mm_struct *token) { return NULL; -- Gitee From c0cd167c80aedad929fba0ffbe0615b79221118a Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 31 May 2022 20:17:33 +0800 Subject: [PATCH 1436/2161] iommu/ioasid: Add ioasid_set iterator helper functions commit aeda6ac61b8e008ce14ce5a9389bf7eb88eef411 Intel-BKC. Users of an ioasid_set may not keep track of all the IOASIDs allocated under the set. When collective actions are needed for each IOASIDs, it is useful to iterate over all the IOASIDs within the set. For example, when the ioasid_set is freed, the user might perform the same cleanup operation on each IOASID. This patch adds an API to iterate all the IOASIDs within the set. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid.c | 84 ++++++++++++++++++++++++++++++++++++++++++ include/linux/ioasid.h | 22 ++++++++++- 2 files changed, 105 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index df998d0cfd99..9e5703c07177 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -699,6 +699,61 @@ void ioasid_free(struct ioasid_set *set, ioasid_t ioasid) spin_unlock(&ioasid_allocator_lock); } EXPORT_SYMBOL_GPL(ioasid_free); + +/** + * ioasid_free_all_in_set + * + * @brief + * Free all PASIDs from system-wide IOASID pool, all subscribers gets + * notified and do cleanup of their own. + * Note that some references of the IOASIDs within the set can still + * be held after the free call. This is OK in that the IOASIDs will be + * marked inactive, the only operations can be done is ioasid_put. + * No need to track IOASID set states since there is no reclaim phase. + * + * @param + * struct ioasid_set where all IOASIDs within the set will be freed. 
+ */ +void ioasid_free_all_in_set(struct ioasid_set *set) +{ + struct ioasid_data *entry; + unsigned long index; + + if (!ioasid_set_is_valid(set)) + return; + + if (xa_empty(&set->xa)) + return; + + if (!atomic_read(&set->nr_ioasids)) + return; + spin_lock(&ioasid_allocator_lock); + xa_for_each(&set->xa, index, entry) { + ioasid_free_locked(set, index); + /* Free from per set private pool */ + xa_erase(&set->xa, index); + } + spin_unlock(&ioasid_allocator_lock); +} +EXPORT_SYMBOL_GPL(ioasid_free_all_in_set); + +/** + * ioasid_set_for_each_ioasid + * @brief + * Iterate over all the IOASIDs within the set + */ +void ioasid_set_for_each_ioasid(struct ioasid_set *set, + void (*fn)(ioasid_t id, void *data), + void *data) +{ + struct ioasid_data *entry; + unsigned long index; + + xa_for_each(&set->xa, index, entry) + fn(index, data); +} +EXPORT_SYMBOL_GPL(ioasid_set_for_each_ioasid); + int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid) { struct ioasid_data *data; @@ -788,6 +843,35 @@ bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid) } EXPORT_SYMBOL_GPL(ioasid_put); +/** + * @brief + * Find the ioasid_set of an IOASID. As long as the IOASID is valid, + * the set must be valid since the refcounting is based on the number of IOASID + * in the set. + * + * @param ioasid + * @return struct ioasid_set* + */ +struct ioasid_set *ioasid_find_set(ioasid_t ioasid) +{ + struct ioasid_allocator_data *idata; + struct ioasid_data *ioasid_data; + struct ioasid_set *set = NULL; + + rcu_read_lock(); + idata = rcu_dereference(active_allocator); + ioasid_data = xa_load(&idata->xa, ioasid); + if (!ioasid_data) { + set = ERR_PTR(-ENOENT); + goto unlock; + } + set = ioasid_data->set; +unlock: + rcu_read_unlock(); + return set; +} +EXPORT_SYMBOL_GPL(ioasid_find_set); + /** * ioasid_find - Find IOASID data * @set: the IOASID set diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index 39233abd694f..130cfc0ff12d 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -73,16 +73,21 @@ int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid); bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid); bool ioasid_put_locked(struct ioasid_set *set, ioasid_t ioasid); void ioasid_free(struct ioasid_set *set, ioasid_t ioasid); +void ioasid_free_all_in_set(struct ioasid_set *set); void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, bool (*getter)(void *)); +struct ioasid_set *ioasid_find_set(ioasid_t ioasid); int ioasid_register_allocator(struct ioasid_allocator_ops *allocator); void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator); int ioasid_attach_data(ioasid_t ioasid, void *data); +void ioasid_detach_data(ioasid_t ioasid); +void ioasid_set_for_each_ioasid(struct ioasid_set *sdata, + void (*fn)(ioasid_t id, void *data), + void *data); static inline bool pasid_valid(ioasid_t ioasid) { return ioasid != INVALID_IOASID; } -void ioasid_detach_data(ioasid_t ioasid); #else /* !CONFIG_IOASID */ static inline void ioasid_install_capacity(ioasid_t total) { @@ -167,5 +172,20 @@ static inline bool pasid_valid(ioasid_t ioasid) static inline void ioasid_detach_data(ioasid_t ioasid) { } + +static inline void ioasid_free_all_in_set(struct ioasid_set *set) +{ +} + +static inline struct ioasid_set *ioasid_find_set(ioasid_t ioasid) +{ + return ERR_PTR(-ENOTSUPP); +} + +static inline void ioasid_set_for_each_ioasid(struct ioasid_set *sdata, + void (*fn)(ioasid_t id, void *data), + void *data) +{ +} #endif /* CONFIG_IOASID */ #endif /* __LINUX_IOASID_H */ -- Gitee From 
cb16f12f7e6354c81a963a737789eac4401c8dd6 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 20 Aug 2020 10:15:39 -0700 Subject: [PATCH 1437/2161] iommu/ioasid: Introduce ioasid_set private ID commit 89ff132ab5d84be9508f5879f7b2bbf5b4ce3563 Intel-BKC. When an IOASID set is used for guest SVA, each VM will acquire its ioasid_set for IOASID allocations. IOASIDs within the VM must have a host/physical IOASID backing, and the mapping between guest and host IOASIDs can be non-identical. The IOASID set private ID (SPID) is introduced in this patch to be used as the guest IOASID. However, the concept of an ioasid_set-specific namespace is generic, hence the name SPID. As the SPID namespace is within the IOASID set, the IOASID core can provide lookup services in both directions. An SPID may not be available when its IOASID is allocated; the mapping between SPID and IOASID is usually established when a guest page table is bound to a host PASID. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid.c | 104 +++++++++++++++++++++++++++++++++++++++++ include/linux/ioasid.h | 18 +++++++ 2 files changed, 122 insertions(+) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 9e5703c07177..ab3de175eedd 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -26,6 +26,7 @@ enum ioasid_state { * struct ioasid_data - Meta data about ioasid * * @id: Unique ID + * @spid: Private ID unique within a set * @refs: Number of active users * @state: Track state of the IOASID * @set: ioasid_set of the IOASID belongs to @@ -34,6 +35,7 @@ enum ioasid_state { */ struct ioasid_data { ioasid_t id; + ioasid_t spid; enum ioasid_state state; struct ioasid_set *set; void *private; @@ -413,6 +415,107 @@ void ioasid_detach_data(ioasid_t ioasid) } EXPORT_SYMBOL_GPL(ioasid_detach_data); +static ioasid_t ioasid_find_by_spid_locked(struct ioasid_set *set, ioasid_t spid, bool get) +{ + ioasid_t ioasid = INVALID_IOASID; + struct ioasid_data *entry; + unsigned long index; + + if (!xa_load(&ioasid_sets, set->id)) { + pr_warn("Invalid set\n"); + goto done; + } + + xa_for_each(&set->xa, index, entry) { + if (spid == entry->spid) { + if (get) + refcount_inc(&entry->refs); + ioasid = index; + } + } +done: + return ioasid; +} + +/** + * ioasid_attach_spid - Attach ioasid_set private ID to an IOASID + * + * @ioasid: the system-wide IOASID to attach + * @spid: the ioasid_set private ID of @ioasid + * + * After attaching an SPID, future lookups can be done via ioasid_find_by_spid().
+ */ +int ioasid_attach_spid(ioasid_t ioasid, ioasid_t spid) +{ + struct ioasid_data *data; + int ret = 0; + + if (spid == INVALID_IOASID) + return -EINVAL; + + spin_lock(&ioasid_allocator_lock); + data = xa_load(&active_allocator->xa, ioasid); + + if (!data) { + pr_err("No IOASID entry %d to attach SPID %d\n", + ioasid, spid); + ret = -ENOENT; + goto done_unlock; + } + /* Check if SPID is unique within the set */ + if (ioasid_find_by_spid_locked(data->set, spid, false) != INVALID_IOASID) { + ret = -EINVAL; + goto done_unlock; + } + data->spid = spid; + +done_unlock: + spin_unlock(&ioasid_allocator_lock); + return ret; +} +EXPORT_SYMBOL_GPL(ioasid_attach_spid); + +void ioasid_detach_spid(ioasid_t ioasid) +{ + struct ioasid_data *data; + + spin_lock(&ioasid_allocator_lock); + data = xa_load(&active_allocator->xa, ioasid); + + if (!data || data->spid == INVALID_IOASID) { + pr_err("Invalid IOASID entry %d to detach\n", ioasid); + goto done_unlock; + } + data->spid = INVALID_IOASID; + +done_unlock: + spin_unlock(&ioasid_allocator_lock); +} +EXPORT_SYMBOL_GPL(ioasid_detach_spid); + +/** + * ioasid_find_by_spid - Find the system-wide IOASID by a set private ID and + * its set. + * + * @set: the ioasid_set to search within + * @spid: the set private ID + * @get: flag indicates whether to take a reference once found + * + * Given a set private ID and its IOASID set, find the system-wide IOASID. Take + * a reference upon finding the matching IOASID if @get is true. Return + * INVALID_IOASID if the IOASID is not found in the set or the set is not valid. + */ +ioasid_t ioasid_find_by_spid(struct ioasid_set *set, ioasid_t spid, bool get) +{ + ioasid_t ioasid; + + spin_lock(&ioasid_allocator_lock); + ioasid = ioasid_find_by_spid_locked(set, spid, get); + spin_unlock(&ioasid_allocator_lock); + return ioasid; +} +EXPORT_SYMBOL_GPL(ioasid_find_by_spid); + static inline bool ioasid_set_is_valid(struct ioasid_set *set) { return xa_load(&ioasid_sets, set->id) == set; @@ -615,6 +718,7 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, } data->id = id; data->state = IOASID_STATE_IDLE; + data->spid = INVALID_IOASID; /* Store IOASID in the per set data */ if (xa_err(xa_store(&set->xa, id, data, GFP_ATOMIC))) { diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index 130cfc0ff12d..5631804d7b39 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -81,6 +81,9 @@ int ioasid_register_allocator(struct ioasid_allocator_ops *allocator); void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator); int ioasid_attach_data(ioasid_t ioasid, void *data); void ioasid_detach_data(ioasid_t ioasid); +int ioasid_attach_spid(ioasid_t ioasid, ioasid_t spid); +void ioasid_detach_spid(ioasid_t ioasid); +ioasid_t ioasid_find_by_spid(struct ioasid_set *set, ioasid_t spid, bool get); void ioasid_set_for_each_ioasid(struct ioasid_set *sdata, void (*fn)(ioasid_t id, void *data), void *data); @@ -182,6 +185,21 @@ static inline struct ioasid_set *ioasid_find_set(ioasid_t ioasid) return ERR_PTR(-ENOTSUPP); } +static inline int ioasid_attach_spid(ioasid_t ioasid, ioasid_t spid) +{ + return -ENOTSUPP; +} + +static inline void ioasid_detach_spid(ioasid_t ioasid) +{ +} + +static inline ioasid_t ioasid_find_by_spid(struct ioasid_set *set, + ioasid_t spid, bool get) +{ + return INVALID_IOASID; +} + static inline void ioasid_set_for_each_ioasid(struct ioasid_set *sdata, void (*fn)(ioasid_t id, void *data), void *data) -- Gitee From c87ec6cb2689d94da13c65b51d262dfa0e783b09 Mon Sep 17 
00:00:00 2001 From: Jacob Pan Date: Thu, 20 Aug 2020 10:40:08 -0700 Subject: [PATCH 1438/2161] iommu/ioasid: Introduce notification APIs commit 49e9b48954a6ce8e99716eb9e2241fc77d0d6dd3 Intel-BKC. Relations among IOASID users largely follow a publisher-subscriber pattern. E.g. to support guest SVA on Intel Scalable I/O Virtualization (SIOV) enabled platforms, VFIO, IOMMU, device drivers, KVM are all users of IOASIDs. When a state change occurs, VFIO publishes the change event that needs to be processed by other users/subscribers. This patch introduced two types of notifications: global and per ioasid_set. The latter is intended for users who only needs to handle events related to the IOASID of a given set. For more information, refer to the kernel documentation at Documentation/ioasid.rst. Signed-off-by: Liu Yi L Signed-off-by: Wu Hao Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid.c | 111 +++++++++++++++++++++++++++++++++++++++-- include/linux/ioasid.h | 54 ++++++++++++++++++++ 2 files changed, 161 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index ab3de175eedd..c361a0a855c6 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -10,12 +10,33 @@ #include #include +/* + * An IOASID can have multiple consumers where each consumer may have + * hardware contexts associated with the IOASID. + * When a status change occurs, like on IOASID deallocation, notifier chains + * are used to keep the consumers in sync. + * This is a publisher-subscriber pattern where publisher can change the + * state of each IOASID, e.g. alloc/free, bind IOASID to a device and mm. + * On the other hand, subscribers get notified for the state change and + * keep local states in sync. + */ +static ATOMIC_NOTIFIER_HEAD(ioasid_notifier); +static DEFINE_SPINLOCK(ioasid_nb_lock); + /* Default to PCIe standard 20 bit PASID */ #define PCI_PASID_MAX 0x100000 static ioasid_t ioasid_capacity = PCI_PASID_MAX; static ioasid_t ioasid_capacity_avail = PCI_PASID_MAX; static DEFINE_XARRAY_ALLOC(ioasid_sets); +struct ioasid_set_nb { + struct list_head list; + struct notifier_block *nb; + void *token; + struct ioasid_set *set; + bool active; +}; + enum ioasid_state { IOASID_STATE_IDLE, IOASID_STATE_ACTIVE, @@ -415,6 +436,38 @@ void ioasid_detach_data(ioasid_t ioasid) } EXPORT_SYMBOL_GPL(ioasid_detach_data); +/** + * ioasid_notify - Send notification on a given IOASID for status change. + * + * @data: The IOASID data to which the notification will send + * @cmd: Notification event sent by IOASID external users, can be + * IOASID_BIND or IOASID_UNBIND. + * + * @flags: Special instructions, e.g. 
notify within a set or global by + * IOASID_NOTIFY_FLAG_SET or IOASID_NOTIFY_FLAG_ALL flags + * Caller must hold ioasid_allocator_lock and reference to the IOASID + */ +static int ioasid_notify(struct ioasid_data *data, + enum ioasid_notify_val cmd, unsigned int flags) +{ + struct ioasid_nb_args args = { 0 }; + int ret = 0; + + if (flags & ~(IOASID_NOTIFY_FLAG_ALL | IOASID_NOTIFY_FLAG_SET)) + return -EINVAL; + + args.id = data->id; + args.set = data->set; + args.pdata = data->private; + args.spid = data->spid; + if (flags & IOASID_NOTIFY_FLAG_ALL) + ret = atomic_notifier_call_chain(&ioasid_notifier, cmd, &args); + if (flags & IOASID_NOTIFY_FLAG_SET) + ret = atomic_notifier_call_chain(&data->set->nh, cmd, &args); + + return ret; +} + static ioasid_t ioasid_find_by_spid_locked(struct ioasid_set *set, ioasid_t spid, bool get) { ioasid_t ioasid = INVALID_IOASID; @@ -468,7 +521,7 @@ int ioasid_attach_spid(ioasid_t ioasid, ioasid_t spid) goto done_unlock; } data->spid = spid; - + ioasid_notify(data, IOASID_NOTIFY_BIND, IOASID_NOTIFY_FLAG_SET); done_unlock: spin_unlock(&ioasid_allocator_lock); return ret; @@ -486,8 +539,8 @@ void ioasid_detach_spid(ioasid_t ioasid) pr_err("Invalid IOASID entry %d to detach\n", ioasid); goto done_unlock; } + ioasid_notify(data, IOASID_NOTIFY_UNBIND, IOASID_NOTIFY_FLAG_SET); data->spid = INVALID_IOASID; - done_unlock: spin_unlock(&ioasid_allocator_lock); } @@ -603,6 +656,8 @@ struct ioasid_set *ioasid_set_alloc(void *token, ioasid_t quota, int type) set->quota = quota; set->id = id; atomic_set(&set->nr_ioasids, 0); + ATOMIC_INIT_NOTIFIER_HEAD(&set->nh); + /* * Per set XA is used to store private IDs within the set, get ready * for ioasid_set private ID and system-wide IOASID allocation @@ -655,7 +710,9 @@ int ioasid_set_free(struct ioasid_set *set) int ret = 0; spin_lock(&ioasid_allocator_lock); + spin_lock(&ioasid_nb_lock); ret = ioasid_set_free_locked(set); + spin_unlock(&ioasid_nb_lock); spin_unlock(&ioasid_allocator_lock); return ret; } @@ -727,6 +784,7 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, goto exit_free; } atomic_inc(&set->nr_ioasids); + ioasid_notify(data, IOASID_NOTIFY_ALLOC, IOASID_NOTIFY_FLAG_SET); goto done_unlock; exit_free: kfree(data); @@ -779,9 +837,11 @@ static void ioasid_free_locked(struct ioasid_set *set, ioasid_t ioasid) * If the refcount is 1, it means there is no other users of the IOASID * other than IOASID core itself. There is no need to notify anyone. */ - if (!refcount_dec_and_test(&data->refs)) + if (!refcount_dec_and_test(&data->refs)) { + ioasid_notify(data, IOASID_NOTIFY_FREE, + IOASID_NOTIFY_FLAG_SET | IOASID_NOTIFY_FLAG_ALL); return; - + } ioasid_do_free_locked(data); } @@ -832,15 +892,39 @@ void ioasid_free_all_in_set(struct ioasid_set *set) if (!atomic_read(&set->nr_ioasids)) return; spin_lock(&ioasid_allocator_lock); + spin_lock(&ioasid_nb_lock); xa_for_each(&set->xa, index, entry) { ioasid_free_locked(set, index); /* Free from per set private pool */ xa_erase(&set->xa, index); } + spin_unlock(&ioasid_nb_lock); spin_unlock(&ioasid_allocator_lock); } EXPORT_SYMBOL_GPL(ioasid_free_all_in_set); +/* + * ioasid_find_mm_set - Retrieve IOASID set with mm token + * Take a reference of the set if found. 
+ */ +struct ioasid_set *ioasid_find_mm_set(struct mm_struct *token) +{ + struct ioasid_set *set; + unsigned long index; + + spin_lock(&ioasid_allocator_lock); + + xa_for_each(&ioasid_sets, index, set) { + if (set->type == IOASID_SET_TYPE_MM && set->token == token) + goto exit_unlock; + } + set = NULL; +exit_unlock: + spin_unlock(&ioasid_allocator_lock); + return set; +} +EXPORT_SYMBOL_GPL(ioasid_find_mm_set); + /** * ioasid_set_for_each_ioasid * @brief @@ -1020,6 +1104,25 @@ void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, } EXPORT_SYMBOL_GPL(ioasid_find); +int ioasid_register_notifier(struct ioasid_set *set, struct notifier_block *nb) +{ + if (set) + return atomic_notifier_chain_register(&set->nh, nb); + else + return atomic_notifier_chain_register(&ioasid_notifier, nb); +} +EXPORT_SYMBOL_GPL(ioasid_register_notifier); + +void ioasid_unregister_notifier(struct ioasid_set *set, + struct notifier_block *nb) +{ + if (set) + atomic_notifier_chain_unregister(&set->nh, nb); + else + atomic_notifier_chain_unregister(&ioasid_notifier, nb); +} +EXPORT_SYMBOL_GPL(ioasid_unregister_notifier); + MODULE_AUTHOR("Jean-Philippe Brucker "); MODULE_AUTHOR("Jacob Pan "); MODULE_DESCRIPTION("IO Address Space ID (IOASID) allocator"); diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index 5631804d7b39..b1abce9e7b39 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -58,6 +58,47 @@ struct ioasid_allocator_ops { void *pdata; }; +/* Notification data when IOASID status changed */ +enum ioasid_notify_val { + IOASID_NOTIFY_ALLOC = 1, + IOASID_NOTIFY_FREE, + IOASID_NOTIFY_BIND, + IOASID_NOTIFY_UNBIND, +}; + +#define IOASID_NOTIFY_FLAG_ALL BIT(0) +#define IOASID_NOTIFY_FLAG_SET BIT(1) +/** + * enum ioasid_notifier_prios - IOASID event notification order + * + * When status of an IOASID changes, users might need to take actions to + * reflect the new state. For example, when an IOASID is freed due to + * exception, the hardware context in virtual CPU, DMA device, and IOMMU + * shall be cleared and drained. Order is required to prevent life cycle + * problems. + */ +enum ioasid_notifier_prios { + IOASID_PRIO_LAST, + IOASID_PRIO_DEVICE, + IOASID_PRIO_IOMMU, + IOASID_PRIO_CPU, +}; + +/** + * struct ioasid_nb_args - Argument provided by IOASID core when notifier + * is called. 
+ * @id: The IOASID being notified + * @spid: The set private ID associated with the IOASID + * @set: The IOASID set of @id + * @pdata: The private data attached to the IOASID + */ +struct ioasid_nb_args { + ioasid_t id; + ioasid_t spid; + struct ioasid_set *set; + void *pdata; +}; + #if IS_ENABLED(CONFIG_IOASID) void ioasid_install_capacity(ioasid_t total); int ioasid_reserve_capacity(ioasid_t nr_ioasid); @@ -84,6 +125,10 @@ void ioasid_detach_data(ioasid_t ioasid); int ioasid_attach_spid(ioasid_t ioasid, ioasid_t spid); void ioasid_detach_spid(ioasid_t ioasid); ioasid_t ioasid_find_by_spid(struct ioasid_set *set, ioasid_t spid, bool get); +int ioasid_register_notifier(struct ioasid_set *set, + struct notifier_block *nb); +void ioasid_unregister_notifier(struct ioasid_set *set, + struct notifier_block *nb); void ioasid_set_for_each_ioasid(struct ioasid_set *sdata, void (*fn)(ioasid_t id, void *data), void *data); @@ -153,6 +198,15 @@ static inline void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, return NULL; } +static inline int ioasid_register_notifier(struct notifier_block *nb) +{ + return -ENOTSUPP; +} + +static inline void ioasid_unregister_notifier(struct notifier_block *nb) +{ +} + static inline int ioasid_register_allocator(struct ioasid_allocator_ops *allocator) { return -ENOTSUPP; -- Gitee From fbf40d974da1a648d45f33484410b01773000b2d Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 24 Sep 2020 15:45:16 -0700 Subject: [PATCH 1439/2161] iommu/ioasid: Support mm token type ioasid_set notifications MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 0fa1e030423f8aea6f9b26aaa9d6940f9b47f415 Intel-BKC. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As a system-wide resource, IOASID is often shared by multiple kernel subsystems that are independent of each other. However, at the ioasid_set level, these kernel subsystems must communicate with each other for ownership checking, event notifications, etc. For example, on Intel Scalable IO Virtualization (SIOV) enabled platforms, KVM and VFIO instances under the same process/guest must be aware of a shared IOASID set. IOASID_SET_TYPE_MM token type was introduced to explicitly mark an IOASID set that belongs to a process, thus use the same mm_struct pointer as a token. Users of the same process can then identify with each other based on this token. This patch introduces MM token specific event registration APIs. Event subscribers such as KVM instances can register IOASID event handler without the knowledge of its ioasid_set. Event handlers are registered based on its mm_struct pointer as a token. In case when subscribers register handler *prior* to the creation of the ioasid_set, the handler’s notification block is stored in a pending list within IOASID core. Once the ioasid_set of the MM token is created, the notification block will be registered by the IOASID core. Signed-off-by: Liu Yi L Signed-off-by: Wu Hao Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid.c | 142 +++++++++++++++++++++++++++++++++++++++++ include/linux/ioasid.h | 18 ++++++ 2 files changed, 160 insertions(+) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index c361a0a855c6..69981447de40 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -21,6 +21,8 @@ * keep local states in sync. 
*/ static ATOMIC_NOTIFIER_HEAD(ioasid_notifier); +/* List to hold pending notification block registrations */ +static LIST_HEAD(ioasid_nb_pending_list); static DEFINE_SPINLOCK(ioasid_nb_lock); /* Default to PCIe standard 20 bit PASID */ @@ -574,6 +576,27 @@ static inline bool ioasid_set_is_valid(struct ioasid_set *set) return xa_load(&ioasid_sets, set->id) == set; } +static void ioasid_add_pending_nb(struct ioasid_set *set) +{ + struct ioasid_set_nb *curr; + + if (set->type != IOASID_SET_TYPE_MM) + return; + /* + * Check if there are any pending nb requests for the given token, if so + * add them to the notifier chain. + */ + spin_lock(&ioasid_nb_lock); + list_for_each_entry(curr, &ioasid_nb_pending_list, list) { + if (curr->token == set->token && !curr->active) { + atomic_notifier_chain_register(&set->nh, curr->nb); + curr->set = set; + curr->active = true; + } + } + spin_unlock(&ioasid_nb_lock); +} + /** * ioasid_set_alloc - Allocate a new IOASID set for a given token * @@ -658,6 +681,11 @@ struct ioasid_set *ioasid_set_alloc(void *token, ioasid_t quota, int type) atomic_set(&set->nr_ioasids, 0); ATOMIC_INIT_NOTIFIER_HEAD(&set->nh); + /* + * Check if there are any pending nb requests for the given token, if so + * add them to the notifier chain. + */ + ioasid_add_pending_nb(set); /* * Per set XA is used to store private IDs within the set, get ready * for ioasid_set private ID and system-wide IOASID allocation @@ -675,6 +703,7 @@ EXPORT_SYMBOL_GPL(ioasid_set_alloc); static int ioasid_set_free_locked(struct ioasid_set *set) { + struct ioasid_set_nb *curr; int ret = 0; if (!ioasid_set_is_valid(set)) { @@ -688,6 +717,16 @@ static int ioasid_set_free_locked(struct ioasid_set *set) } WARN_ON(!xa_empty(&set->xa)); + /* Restore pending status of the set NBs */ + list_for_each_entry(curr, &ioasid_nb_pending_list, list) { + if (curr->token == set->token) { + if (curr->active) + curr->active = false; + else + pr_warn("Set token exists but not active!\n"); + } + } + /* * Token got released right away after the ioasid_set is freed. * If a new set is created immediately with the newly released token, @@ -1116,6 +1155,22 @@ EXPORT_SYMBOL_GPL(ioasid_register_notifier); void ioasid_unregister_notifier(struct ioasid_set *set, struct notifier_block *nb) { + struct ioasid_set_nb *curr; + + spin_lock(&ioasid_nb_lock); + /* + * Pending list is registered with a token without an ioasid_set, + * therefore should not be unregistered directly. + */ + list_for_each_entry(curr, &ioasid_nb_pending_list, list) { + if (curr->nb == nb) { + pr_warn("Cannot unregister NB from pending list\n"); + spin_unlock(&ioasid_nb_lock); + return; + } + } + spin_unlock(&ioasid_nb_lock); + if (set) atomic_notifier_chain_unregister(&set->nh, nb); else @@ -1123,6 +1178,93 @@ void ioasid_unregister_notifier(struct ioasid_set *set, } EXPORT_SYMBOL_GPL(ioasid_unregister_notifier); +/** + * ioasid_register_notifier_mm - Register a notifier block on the IOASID set + * created by the mm_struct pointer as the token + * + * @mm: the mm_struct token of the ioasid_set + * @nb: notfier block to be registered on the ioasid_set + * + * This a variant of ioasid_register_notifier() where the caller intends to + * listen to IOASID events belong the ioasid_set created under the same + * process. Caller is not aware of the ioasid_set, no need to hold reference + * of the ioasid_set. 
+ */ +int ioasid_register_notifier_mm(struct mm_struct *mm, struct notifier_block *nb) +{ + struct ioasid_set_nb *curr; + struct ioasid_set *set; + int ret = 0; + + spin_lock(&ioasid_nb_lock); + /* Check for duplicates, nb is unique per set */ + list_for_each_entry(curr, &ioasid_nb_pending_list, list) { + if (curr->token == mm && curr->nb == nb) { + ret = -EBUSY; + goto exit_unlock; + } + } + curr = kzalloc(sizeof(*curr), GFP_ATOMIC); + if (!curr) { + ret = -ENOMEM; + goto exit_unlock; + } + /* Check if the token has an existing set */ + set = ioasid_find_mm_set(mm); + if (!set) { + /* Add to the rsvd list as inactive */ + curr->active = false; + } else { + /* REVISIT: Only register empty set for now. Can add an option + * in the future to playback existing PASIDs. + */ + if (atomic_read(&set->nr_ioasids)) { + pr_warn("IOASID set %d not empty %d\n", set->id, + atomic_read(&set->nr_ioasids)); + ret = -EBUSY; + goto exit_free; + } + curr->token = mm; + curr->nb = nb; + curr->active = true; + curr->set = set; + + /* Set already created, add to the notifier chain */ + atomic_notifier_chain_register(&set->nh, nb); + } + + list_add(&curr->list, &ioasid_nb_pending_list); + goto exit_unlock; +exit_free: + kfree(curr); +exit_unlock: + spin_unlock(&ioasid_nb_lock); + return ret; +} +EXPORT_SYMBOL_GPL(ioasid_register_notifier_mm); + +void ioasid_unregister_notifier_mm(struct mm_struct *mm, struct notifier_block *nb) +{ + struct ioasid_set_nb *curr; + + spin_lock(&ioasid_nb_lock); + list_for_each_entry(curr, &ioasid_nb_pending_list, list) { + if (curr->token == mm && curr->nb == nb) { + list_del(&curr->list); + spin_unlock(&ioasid_nb_lock); + if (curr->active) { + atomic_notifier_chain_unregister(&curr->set->nh, + nb); + } + kfree(curr); + return; + } + } + pr_warn("No ioasid set found for mm token %llx\n", (u64)mm); + spin_unlock(&ioasid_nb_lock); +} +EXPORT_SYMBOL_GPL(ioasid_unregister_notifier_mm); + MODULE_AUTHOR("Jean-Philippe Brucker "); MODULE_AUTHOR("Jacob Pan "); MODULE_DESCRIPTION("IO Address Space ID (IOASID) allocator"); diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index b1abce9e7b39..c65c23895e51 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -132,6 +132,8 @@ void ioasid_unregister_notifier(struct ioasid_set *set, void ioasid_set_for_each_ioasid(struct ioasid_set *sdata, void (*fn)(ioasid_t id, void *data), void *data); +int ioasid_register_notifier_mm(struct mm_struct *mm, struct notifier_block *nb); +void ioasid_unregister_notifier_mm(struct mm_struct *mm, struct notifier_block *nb); static inline bool pasid_valid(ioasid_t ioasid) { return ioasid != INVALID_IOASID; @@ -259,5 +261,21 @@ static inline void ioasid_set_for_each_ioasid(struct ioasid_set *sdata, void *data) { } + +static inline int ioasid_register_notifier_mm(struct mm_struct *mm, + struct notifier_block *nb) +{ + return -ENOTSUPP; +} + +static inline void ioasid_unregister_notifier_mm(struct mm_struct *mm, + struct notifier_block *nb) +{ +} + +static inline bool ioasid_queue_work(struct work_struct *work) +{ + return false; +} #endif /* CONFIG_IOASID */ #endif /* __LINUX_IOASID_H */ -- Gitee From 2ee2b5b98a8c380b193c74ca7b66103e63a4f592 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Mon, 22 Feb 2021 16:43:57 -0800 Subject: [PATCH 1440/2161] iommu/ioasid: Add ownership check in guest bind commit 45041fce9a1ad70705682cdd7d41fad938ab2e39 Intel-BKC. Bind guest page table call comes with an IOASID provided by the userspace. 
To prevent attacks by malicious users, we must ensure the IOASID was allocated under the same process. This patch adds a new API that will perform an ownership check that is based on whether the IOASID belongs to the ioasid_set allocated with the mm_struct pointer as a token. Signed-off-by: Liu Yi L Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid.c | 37 +++++++++++++++++++++++++++++++++++++ drivers/iommu/iommu.c | 16 ++++++++++++++-- include/linux/ioasid.h | 6 ++++++ 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 69981447de40..069e938ba70f 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -9,6 +9,7 @@ #include #include #include +#include /* * An IOASID can have multiple consumers where each consumer may have @@ -1027,6 +1028,42 @@ int ioasid_get(struct ioasid_set *set, ioasid_t ioasid) } EXPORT_SYMBOL_GPL(ioasid_get); +/** + * ioasid_get_if_owned - obtain a reference to the IOASID if the IOASID belongs + * to the ioasid_set with the current mm as token + * @ioasid: the IOASID to get reference + * + * + * Return: 0 on success, error if failed. + */ +int ioasid_get_if_owned(ioasid_t ioasid) +{ + struct ioasid_set *set; + int ret; + + spin_lock(&ioasid_allocator_lock); + set = ioasid_find_set(ioasid); + if (IS_ERR_OR_NULL(set)) { + ret = -ENOENT; + goto done_unlock; + } + if (set->type != IOASID_SET_TYPE_MM) { + ret = -EINVAL; + goto done_unlock; + } + if (current->mm != set->token) { + ret = -EPERM; + goto done_unlock; + } + + ret = ioasid_get_locked(set, ioasid); +done_unlock: + spin_unlock(&ioasid_allocator_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(ioasid_get_if_owned); + bool ioasid_put_locked(struct ioasid_set *set, ioasid_t ioasid) { struct ioasid_data *data; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 1da12f82e838..2302066d3f99 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2179,7 +2179,13 @@ int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, struct device *dev, if (ret) return ret; - return domain->ops->sva_bind_gpasid(domain, dev, &data); + ret = ioasid_get_if_owned(data.hpasid); + if (ret) + return ret; + ret = domain->ops->sva_bind_gpasid(domain, dev, &data); + ioasid_put(NULL, data.hpasid); + + return ret; } EXPORT_SYMBOL_GPL(iommu_uapi_sva_bind_gpasid); @@ -2206,7 +2212,13 @@ int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev if (ret) return ret; - return iommu_sva_unbind_gpasid(domain, dev, data.hpasid); + ret = ioasid_get_if_owned(data.hpasid); + if (ret) + return ret; + ret = iommu_sva_unbind_gpasid(domain, dev, data.hpasid); + ioasid_put(NULL, data.hpasid); + + return ret; } EXPORT_SYMBOL_GPL(iommu_uapi_sva_unbind_gpasid); diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index c65c23895e51..567a4a0f3e9d 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -111,6 +111,7 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, void *private); int ioasid_get(struct ioasid_set *set, ioasid_t ioasid); int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid); +int ioasid_get_if_owned(ioasid_t ioasid); bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid); bool ioasid_put_locked(struct ioasid_set *set, ioasid_t ioasid); void ioasid_free(struct ioasid_set *set, ioasid_t ioasid); @@ -184,6 +185,11 @@ static inline int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid) return -ENOTSUPP; } +static inline 
int ioasid_get_if_owned(ioasid_t ioasid) +{ + return -ENOTSUPP; +} + static inline bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid) { return false; -- Gitee From b224ac9e8a97ee4c3b4e932a753ca89491268f2d Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 25 Sep 2020 10:05:08 -0700 Subject: [PATCH 1441/2161] iommu/vt-d: Remove mm reference for guest SVA commit 80a97be3a6dc48e62b5563e87f7009beea95de3d Intel-BKC. Now that IOASID core keeps track of the IOASID to mm_struct ownership in the forms of ioasid_set with IOASID_SET_TYPE_MM token type, there is no need to keep the same mapping in VT-d driver specific data. Native SVM usage is not affected by the change. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 1e04183bab6a..785a6684a6d4 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -390,12 +390,6 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, ret = -ENOMEM; goto out; } - /* REVISIT: upper layer/VFIO can track host process that bind - * the PASID. ioasid_set = mm might be sufficient for vfio to - * check pasid VMM ownership. We can drop the following line - * once VFIO and IOASID set check is in place. - */ - svm->mm = get_task_mm(current); svm->pasid = data->hpasid; if (data->flags & IOMMU_SVA_GPASID_VAL) { svm->gpasid = data->gpasid; @@ -403,7 +397,6 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, } ioasid_attach_data(data->hpasid, svm); INIT_LIST_HEAD_RCU(&svm->devs); - mmput(svm->mm); } sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); if (!sdev) { -- Gitee From d54b65970ed85456c2cbb33b57d8b415bb5bfca3 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 17 Feb 2021 17:25:53 -0800 Subject: [PATCH 1442/2161] iommu/ioasid: Add a workqueue for cleanup work commit c7d3b3f9c90ec1b33dafcba0ab4f10e51bef9736 Intel-BKC. An IOASID can have multiple users, such as IOMMU driver, KVM, and device drivers. The atomic IOASID notifier is used to inform users of IOASID state change. For example, the IOASID_NOTIFY_UNBIND event is issued when the IOASID is no longer bound to an address space. This requires ordered actions among users to tear down their contexts. Not all work can be handled in the atomic notifier handler. This patch introduces a shared, ordered workqueue for all IOASID users who wish to perform work asynchronously upon notification. 
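As a rough sketch (not part of this patch), a subscriber that cannot finish its teardown in atomic notifier context could combine the notifier and workqueue APIs introduced in this series along the following lines; the my_* names are hypothetical, and INIT_WORK() is assumed to have been done when the context was set up:

/* Illustrative consumer: defer sleepable teardown out of the atomic notifier. */
struct my_pasid_ctx {
	ioasid_t pasid;
	struct work_struct work;	/* INIT_WORK(&ctx->work, my_pasid_cleanup_fn) at setup time */
};

static void my_pasid_cleanup_fn(struct work_struct *work)
{
	struct my_pasid_ctx *ctx = container_of(work, struct my_pasid_ctx, work);

	/* Sleepable context: tear down hardware/software state for ctx->pasid here. */
}

static int my_pasid_event(struct notifier_block *nb, unsigned long code, void *data)
{
	struct ioasid_nb_args *args = data;
	struct my_pasid_ctx *ctx = args->pdata;	/* private data attached by the owner, may be NULL */

	if (code != IOASID_NOTIFY_FREE || !ctx)
		return NOTIFY_DONE;
	/* Atomic context: only queue the deferred work on the shared ordered workqueue. */
	if (!ioasid_queue_work(&ctx->work))
		pr_warn("cleanup already queued for PASID %u\n", args->id);
	return NOTIFY_OK;
}

Such a handler would be registered with ioasid_register_notifier() or, for per-process sets, ioasid_register_notifier_mm().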
Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid.c | 25 +++++++++++++++++++++++++ include/linux/ioasid.h | 1 + 2 files changed, 26 insertions(+) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 069e938ba70f..762bd6bd794e 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -32,6 +32,9 @@ static ioasid_t ioasid_capacity = PCI_PASID_MAX; static ioasid_t ioasid_capacity_avail = PCI_PASID_MAX; static DEFINE_XARRAY_ALLOC(ioasid_sets); +/* Workqueue for IOASID users to do cleanup upon notification */ +static struct workqueue_struct *ioasid_wq; + struct ioasid_set_nb { struct list_head list; struct notifier_block *nb; @@ -1280,6 +1283,12 @@ int ioasid_register_notifier_mm(struct mm_struct *mm, struct notifier_block *nb) } EXPORT_SYMBOL_GPL(ioasid_register_notifier_mm); +bool ioasid_queue_work(struct work_struct *work) +{ + return queue_work(ioasid_wq, work); +} +EXPORT_SYMBOL_GPL(ioasid_queue_work); + void ioasid_unregister_notifier_mm(struct mm_struct *mm, struct notifier_block *nb) { struct ioasid_set_nb *curr; @@ -1302,7 +1311,23 @@ void ioasid_unregister_notifier_mm(struct mm_struct *mm, struct notifier_block * } EXPORT_SYMBOL_GPL(ioasid_unregister_notifier_mm); +static int __init ioasid_init(void) +{ + ioasid_wq = alloc_ordered_workqueue("ioasid_wq", 0); + if (!ioasid_wq) + return -ENOMEM; + + return 0; +} + +static void __exit ioasid_cleanup(void) +{ + destroy_workqueue(ioasid_wq); +} + MODULE_AUTHOR("Jean-Philippe Brucker "); MODULE_AUTHOR("Jacob Pan "); MODULE_DESCRIPTION("IO Address Space ID (IOASID) allocator"); MODULE_LICENSE("GPL"); +module_init(ioasid_init); +module_exit(ioasid_cleanup); diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index 567a4a0f3e9d..0ac162406c6a 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -135,6 +135,7 @@ void ioasid_set_for_each_ioasid(struct ioasid_set *sdata, void *data); int ioasid_register_notifier_mm(struct mm_struct *mm, struct notifier_block *nb); void ioasid_unregister_notifier_mm(struct mm_struct *mm, struct notifier_block *nb); +bool ioasid_queue_work(struct work_struct *work); static inline bool pasid_valid(ioasid_t ioasid) { return ioasid != INVALID_IOASID; -- Gitee From 5828b25e7998a673ee4565f93fb1d782e074147f Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 20 Aug 2020 11:32:50 -0700 Subject: [PATCH 1443/2161] iommu/vt-d: Listen to IOASID notifications commit d4d79602698d3d452aeaf9f4f573516daf971127 Intel-BKC. On Intel Scalable I/O Virtualization (SIOV) enabled platforms, IOMMU driver is one of the users of IOASIDs. In normal flow, callers will perform IOASID allocation, bind, unbind, and free in order. However, for guest SVA, IOASID free could come before unbind as guest is untrusted. This patch registers IOASID notification handler such that IOMMU driver can perform PASID teardown upon receiving an unexpected IOASID free event. 
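For orientation, the host-side ordering this handler guards against can be sketched as follows (illustrative comments only, not code from this series); the notifier added here only has to act when step 4 arrives before step 3:

/* Normal life cycle of a guest-SVA PASID on the host (simplified): */
/* 1. PASID allocated from the VM's ioasid_set, e.g. by VFIO on the guest's behalf */
/* 2. guest page table bound to the host PASID via iommu_uapi_sva_bind_gpasid() */
/* 3. guest tears the mapping down via iommu_uapi_sva_unbind_gpasid() */
/* 4. PASID returned to the pool with ioasid_free() */
/* An untrusted guest may trigger 4 before 3; the IOASID_NOTIFY_FREE handler below
 * then queues intel_svm_free_async_fn() to unbind and drain on the guest's behalf. */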
Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 2 + drivers/iommu/intel/svm.c | 113 +++++++++++++++++++++++++++++++++++- include/linux/intel-iommu.h | 3 + 3 files changed, 116 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 190b1e6ca274..88123c0dc03b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3461,6 +3461,8 @@ static int __init init_dmars(void) pr_err("Failed to allocate host PASID set %lu\n", PTR_ERR(host_pasid_set)); intel_iommu_sm = 0; + } else { + intel_svm_add_pasid_notifier(); } } diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 785a6684a6d4..55971f610001 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -190,6 +190,104 @@ static inline bool intel_svm_capable(struct intel_iommu *iommu) return iommu->flags & VTD_FLAG_SVM_CAPABLE; } +static inline void intel_svm_drop_pasid(ioasid_t pasid) +{ + /* + * Detaching SPID results in UNBIND notification on the set, we must + * do this before dropping the IOASID reference, otherwise the + * notification chain may get destroyed. + */ + ioasid_detach_spid(pasid); + ioasid_detach_data(pasid); + ioasid_put(NULL, pasid); +} + +static DEFINE_MUTEX(pasid_mutex); +#define pasid_lock_held() lock_is_held(&pasid_mutex.dep_map) + +static void intel_svm_free_async_fn(struct work_struct *work) +{ + struct intel_svm *svm = container_of(work, struct intel_svm, work); + struct intel_svm_dev *sdev; + + /* + * Unbind all devices associated with this PASID which is + * being freed by other users such as VFIO. + */ + mutex_lock(&pasid_mutex); + list_for_each_entry_rcu(sdev, &svm->devs, list, pasid_lock_held()) { + /* Does not poison forward pointer */ + list_del_rcu(&sdev->list); + spin_lock(&sdev->iommu->lock); + intel_pasid_tear_down_entry(sdev->iommu, sdev->dev, + svm->pasid, true); + intel_svm_drain_prq(sdev->dev, svm->pasid); + spin_unlock(&sdev->iommu->lock); + kfree_rcu(sdev, rcu); + } + /* + * We may not be the last user to drop the reference but since + * the PASID is in FREE_PENDING state, no one can get new reference. + * Therefore, we can safely free the private data svm. + */ + intel_svm_drop_pasid(svm->pasid); + + /* + * Free before unbind can only happen with host PASIDs used for + * guest SVM. We get here because ioasid_free is called with + * outstanding references. So we need to drop the reference + * such that the PASID can be reclaimed. unbind_gpasid() after this + * will not result in dropping refcount since the private data is + * already detached. + */ + kfree(svm); + + mutex_unlock(&pasid_mutex); +} + + +static int pasid_status_change(struct notifier_block *nb, + unsigned long code, void *data) +{ + struct ioasid_nb_args *args = (struct ioasid_nb_args *)data; + struct intel_svm *svm = (struct intel_svm *)args->pdata; + int ret = NOTIFY_DONE; + + /* + * Notification private data is a choice of vendor driver when the + * IOASID is allocated or attached after allocation. When the data + * type changes, we must make modifications here accordingly. + */ + if (code == IOASID_NOTIFY_FREE) { + /* + * If PASID UNBIND happens before FREE, private data of the + * IOASID should be NULL, then we don't need to do anything. 
+ */ + if (!svm) + goto done; + if (args->id != svm->pasid) { + pr_warn("Notify PASID does not match data %d : %d\n", + args->id, svm->pasid); + goto done; + } + if (!ioasid_queue_work(&svm->work)) + pr_warn("Cleanup work already queued\n"); + return NOTIFY_OK; + } +done: + return ret; +} + +static struct notifier_block pasid_nb = { + .notifier_call = pasid_status_change, +}; + +void intel_svm_add_pasid_notifier(void) +{ + /* Listen to all PASIDs, not specific to a set */ + ioasid_register_notifier(NULL, &pasid_nb); +} + void intel_svm_check(struct intel_iommu *iommu) { if (!pasid_supported(iommu)) @@ -280,7 +378,11 @@ static const struct mmu_notifier_ops intel_mmuops = { .invalidate_range = intel_invalidate_range, }; -static DEFINE_MUTEX(pasid_mutex); +static LIST_HEAD(global_svm_list); + +#define for_each_svm_dev(sdev, svm, d) \ + list_for_each_entry((sdev), &(svm)->devs, list) \ + if ((d) != (sdev)->dev) {} else static int pasid_to_svm_sdev(struct device *dev, struct ioasid_set *set, @@ -394,8 +496,15 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, if (data->flags & IOMMU_SVA_GPASID_VAL) { svm->gpasid = data->gpasid; svm->flags |= SVM_FLAG_GUEST_PASID; + ioasid_attach_spid(data->hpasid, data->gpasid); } ioasid_attach_data(data->hpasid, svm); + ioasid_get(NULL, svm->pasid); + /* + * Set up cleanup async work in case IOASID core notify us PASID + * is freed before unbind. + */ + INIT_WORK(&svm->work, intel_svm_free_async_fn); INIT_LIST_HEAD_RCU(&svm->devs); } sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); @@ -491,7 +600,7 @@ int intel_svm_unbind_gpasid(struct device *dev, u32 pasid) * the unbind, IOMMU driver will get notified * and perform cleanup. */ - ioasid_detach_data(pasid); + intel_svm_drop_pasid(pasid); kfree(svm); } } diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 728f127f3386..2cb2ffa2ac22 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -771,6 +771,7 @@ void intel_svm_unbind(struct iommu_sva *handle); u32 intel_svm_get_pasid(struct iommu_sva *handle); int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg); +void intel_svm_add_pasid_notifier(void); struct intel_svm_dev { struct list_head list; @@ -794,6 +795,8 @@ struct intel_svm { u32 pasid; int gpasid; /* In case that guest PASID is different from host PASID */ struct list_head devs; + struct list_head list; + struct work_struct work; /* For deferred clean up */ }; #else static inline void intel_svm_check(struct intel_iommu *iommu) {} -- Gitee From 42cc370dcf9c16b9f6c7d136e7cdea0f92521824 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 1 Jun 2022 12:21:23 +0800 Subject: [PATCH 1444/2161] cgroup: Introduce ioasids controller commit 72d6878a1c013c048823bf66cce9c00f20d10956 Intel-BKC. IOASIDs are used to associate DMA requests with virtual address spaces. They are a system-wide limited resource made available to the userspace applications. Let it be VMs or user-space device drivers. This RFC patch introduces a cgroup controller to address the following problems: 1. Some user applications exhaust all the available IOASIDs thus depriving others of the same host. 2. System admins need to provision VMs based on their needs for IOASIDs, e.g. the number of VMs with assigned devices that perform DMA requests with PASID. This patch is nowhere near its completion, it merely provides the basic functionality for resource distribution and cgroup hierarchy organizational changes. 
Since this is part of a greater effort to enable Shared Virtual Address (SVA) virtualization. We would like to have a direction check and collect feedback early. For details, please refer to the documentation: Documentation/admin-guide/cgroup-v1/ioasids.rst Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/cgroup_subsys.h | 4 + include/linux/ioasid.h | 18 ++ init/Kconfig | 7 + kernel/cgroup/Makefile | 1 + kernel/cgroup/ioasids.c | 345 ++++++++++++++++++++++++++++++++++ 5 files changed, 375 insertions(+) create mode 100644 kernel/cgroup/ioasids.c diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index acb77dcff3b4..cda75ecdcdcb 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -57,6 +57,10 @@ SUBSYS(hugetlb) SUBSYS(pids) #endif +#if IS_ENABLED(CONFIG_CGROUP_IOASIDS) +SUBSYS(ioasids) +#endif + #if IS_ENABLED(CONFIG_CGROUP_RDMA) SUBSYS(rdma) #endif diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index 0ac162406c6a..03d3dd1d14e0 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -135,12 +135,30 @@ void ioasid_set_for_each_ioasid(struct ioasid_set *sdata, void *data); int ioasid_register_notifier_mm(struct mm_struct *mm, struct notifier_block *nb); void ioasid_unregister_notifier_mm(struct mm_struct *mm, struct notifier_block *nb); +#ifdef CONFIG_CGROUP_IOASIDS +int ioasid_cg_charge(struct ioasid_set *set); +void ioasid_cg_uncharge(struct ioasid_set *set); +#else +/* No cgroup control, allocation will proceed until run out total pool */ +static inline int ioasid_cg_charge(struct ioasid_set *set) +{ + return 0; +} + +static inline int ioasid_cg_uncharge(struct ioasid_set *set) +{ + return 0; +} +#endif /* CGROUP_IOASIDS */ bool ioasid_queue_work(struct work_struct *work); + static inline bool pasid_valid(ioasid_t ioasid) { return ioasid != INVALID_IOASID; } + #else /* !CONFIG_IOASID */ + static inline void ioasid_install_capacity(ioasid_t total) { } diff --git a/init/Kconfig b/init/Kconfig index 0babb0a34952..2b14dada8301 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -936,6 +936,13 @@ config CGROUP_PIDS since the PIDs limit only affects a process's ability to fork, not to attach to a cgroup. +config CGROUP_IOASIDS + bool "IOASIDs controller" + depends on IOASID + help + Provides enforcement of IO Address Space ID limits in the scope of a + cgroup. + config CGROUP_RDMA bool "RDMA controller" help diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index cc3765e92885..b3720c2e4bd1 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -3,6 +3,7 @@ obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o freezer.o obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o +obj-$(CONFIG_CGROUP_IOASIDS) += ioasids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_DEBUG) += debug.o diff --git a/kernel/cgroup/ioasids.c b/kernel/cgroup/ioasids.c new file mode 100644 index 000000000000..ac43813da6ad --- /dev/null +++ b/kernel/cgroup/ioasids.c @@ -0,0 +1,345 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * IO Address Space ID limiting controller for cgroups. 
+ * + */ +#define pr_fmt(fmt) "ioasids_cg: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#define IOASIDS_MAX_STR "max" +static DEFINE_MUTEX(ioasids_cg_lock); + +struct ioasids_cgroup { + struct cgroup_subsys_state css; + atomic64_t counter; + atomic64_t limit; + struct cgroup_file events_file; + /* Number of times allocations failed because limit was hit. */ + atomic64_t events_limit; +}; + +static struct ioasids_cgroup *css_ioasids(struct cgroup_subsys_state *css) +{ + return container_of(css, struct ioasids_cgroup, css); +} + +static struct ioasids_cgroup *parent_ioasids(struct ioasids_cgroup *ioasids) +{ + return css_ioasids(ioasids->css.parent); +} + +static struct cgroup_subsys_state * +ioasids_css_alloc(struct cgroup_subsys_state *parent) +{ + struct ioasids_cgroup *ioasids; + + ioasids = kzalloc(sizeof(struct ioasids_cgroup), GFP_KERNEL); + if (!ioasids) + return ERR_PTR(-ENOMEM); + + atomic64_set(&ioasids->counter, 0); + atomic64_set(&ioasids->limit, 0); + atomic64_set(&ioasids->events_limit, 0); + return &ioasids->css; +} + +static void ioasids_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_ioasids(css)); +} + +/** + * ioasids_cancel - uncharge the local IOASID count + * @ioasids: the ioasid cgroup state + * @num: the number of ioasids to cancel + * + */ +static void ioasids_cancel(struct ioasids_cgroup *ioasids, int num) +{ + WARN_ON_ONCE(atomic64_add_negative(-num, &ioasids->counter)); +} + +/** + * ioasids_uncharge - hierarchically uncharge the ioasid count + * @ioasids: the ioasid cgroup state + * @num: the number of ioasids to uncharge + */ +static void ioasids_uncharge(struct ioasids_cgroup *ioasids, int num) +{ + struct ioasids_cgroup *p; + + for (p = ioasids; parent_ioasids(p); p = parent_ioasids(p)) + ioasids_cancel(p, num); +} + +/** + * ioasids_charge - hierarchically charge the ioasid count + * @ioasids: the ioasid cgroup state + * @num: the number of ioasids to charge + */ +static void ioasids_charge(struct ioasids_cgroup *ioasids, int num) +{ + struct ioasids_cgroup *p; + + for (p = ioasids; parent_ioasids(p); p = parent_ioasids(p)) + atomic64_add(num, &p->counter); +} + +/** + * ioasids_try_charge - hierarchically try to charge the ioasid count + * @ioasids: the ioasid cgroup state + * @num: the number of ioasids to charge + */ +static int ioasids_try_charge(struct ioasids_cgroup *ioasids, int num) +{ + struct ioasids_cgroup *p, *q; + + for (p = ioasids; parent_ioasids(p); p = parent_ioasids(p)) { + int64_t new = atomic64_add_return(num, &p->counter); + int64_t limit = atomic64_read(&p->limit); + + if (new > limit) + goto revert; + } + + return 0; + +revert: + for (q = ioasids; q != p; q = parent_ioasids(q)) + ioasids_cancel(q, num); + ioasids_cancel(p, num); + cgroup_file_notify(&ioasids->events_file); + + return -EAGAIN; +} + + +/** + * ioasid_cg_charge - Check and charge IOASIDs cgroup + * + * @set: IOASID set used for allocation + * + * The IOASID quota is managed per cgroup, all process based allocations + * must be validated per cgroup hierarchy. + * Return 0 if a single IOASID can be allocated or error if failed in various + * checks. 
+ */ +int ioasid_cg_charge(struct ioasid_set *set) +{ + struct mm_struct *mm = get_task_mm(current); + struct cgroup_subsys_state *css; + struct ioasids_cgroup *ioasids; + int ret = 0; + + /* Must be called with a valid mm, not during process exit */ + if (set->type != IOASID_SET_TYPE_MM) + return ret; + if (!mm) + return -EINVAL; + /* We only charge user process allocated PASIDs */ + if (set->type != IOASID_SET_TYPE_MM) { + ret = -EINVAL; + goto exit_drop; + } + if (set->token != mm) { + pr_err("No permisson to allocate IOASID\n"); + ret = -EPERM; + goto exit_drop; + } + rcu_read_lock(); + css = task_css(current, ioasids_cgrp_id); + ioasids = css_ioasids(css); + rcu_read_unlock(); + ret = ioasids_try_charge(ioasids, 1); + if (ret) + pr_warn("%s: Unable to charge IOASID %d\n", __func__, ret); +exit_drop: + mmput_async(mm); + return ret; +} +EXPORT_SYMBOL_GPL(ioasid_cg_charge); + +/* Uncharge IOASIDs cgroup after freeing an IOASID */ +void ioasid_cg_uncharge(struct ioasid_set *set) +{ + struct cgroup_subsys_state *css; + struct ioasids_cgroup *ioasids; + struct mm_struct *mm; + + /* We only charge user process allocated PASIDs */ + if (set->type != IOASID_SET_TYPE_MM) + return; + mm = set->token; + if (!mmget_not_zero(mm)) { + pr_err("MM defunct! Cannot uncharge IOASID\n"); + return; + } + rcu_read_lock(); + css = task_css(current, ioasids_cgrp_id); + ioasids = css_ioasids(css); + rcu_read_unlock(); + ioasids_uncharge(ioasids, 1); + mmput_async(mm); +} +EXPORT_SYMBOL_GPL(ioasid_cg_uncharge); + +static int ioasids_can_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *dst_css; + static struct ioasid_set *set; + struct task_struct *leader; + + /* + * IOASIDs are managed at per process level, we only support domain mode + * in task management model. Loop through all processes by each thread + * leader, charge the leader's css. 
+ */ + cgroup_taskset_for_each_leader(leader, dst_css, tset) { + struct ioasids_cgroup *ioasids = css_ioasids(dst_css); + struct cgroup_subsys_state *old_css; + struct ioasids_cgroup *old_ioasids; + struct mm_struct *mm = get_task_mm(leader); + + set = ioasid_find_mm_set(mm); + mmput(mm); + if (!set) + continue; + + old_css = task_css(leader, ioasids_cgrp_id); + old_ioasids = css_ioasids(old_css); + + ioasids_charge(ioasids, atomic_read(&set->nr_ioasids)); + ioasids_uncharge(old_ioasids, atomic_read(&set->nr_ioasids)); + } + + return 0; +} + +static void ioasids_cancel_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *dst_css; + struct task_struct *task; + + cgroup_taskset_for_each(task, dst_css, tset) { + struct ioasids_cgroup *ioasids = css_ioasids(dst_css); + struct cgroup_subsys_state *old_css; + struct ioasids_cgroup *old_ioasids; + + old_css = task_css(task, ioasids_cgrp_id); + old_ioasids = css_ioasids(old_css); + + ioasids_charge(old_ioasids, 1); + ioasids_uncharge(ioasids, 1); + } +} + +static ssize_t ioasids_max_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct cgroup_subsys_state *css = of_css(of); + struct ioasids_cgroup *ioasids = css_ioasids(css); + int64_t limit, limit_cur; + int err; + + mutex_lock(&ioasids_cg_lock); + /* Check whether we are growing or shrinking */ + limit_cur = atomic64_read(&ioasids->limit); + buf = strstrip(buf); + if (!strcmp(buf, IOASIDS_MAX_STR)) { + /* Returns how many IOASIDs was in the pool */ + limit = ioasid_reserve_capacity(0); + ioasid_reserve_capacity(limit - limit_cur); + goto set_limit; + } + err = kstrtoll(buf, 0, &limit); + if (err) + goto done_unlock; + + err = nbytes; + /* Check whether we are growing or shrinking */ + limit_cur = atomic64_read(&ioasids->limit); + if (limit < 0 || limit == limit_cur) { + err = -EINVAL; + goto done_unlock; + } + if (limit < limit_cur) + err = ioasid_cancel_capacity(limit_cur - limit); + else + err = ioasid_reserve_capacity(limit - limit_cur); + if (err < 0) + goto done_unlock; + +set_limit: + err = nbytes; + atomic64_set(&ioasids->limit, limit); +done_unlock: + mutex_unlock(&ioasids_cg_lock); + return err; +} + +static int ioasids_max_show(struct seq_file *sf, void *v) +{ + struct cgroup_subsys_state *css = seq_css(sf); + struct ioasids_cgroup *ioasids = css_ioasids(css); + int64_t limit = atomic64_read(&ioasids->limit); + + seq_printf(sf, "%lld\n", limit); + + return 0; +} + +static s64 ioasids_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct ioasids_cgroup *ioasids = css_ioasids(css); + + return atomic64_read(&ioasids->counter); +} + +static int ioasids_events_show(struct seq_file *sf, void *v) +{ + struct ioasids_cgroup *ioasids = css_ioasids(seq_css(sf)); + + seq_printf(sf, "max %lld\n", (s64)atomic64_read(&ioasids->events_limit)); + return 0; +} + +static struct cftype ioasids_files[] = { + { + .name = "max", + .write = ioasids_max_write, + .seq_show = ioasids_max_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "current", + .read_s64 = ioasids_current_read, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "events", + .seq_show = ioasids_events_show, + .file_offset = offsetof(struct ioasids_cgroup, events_file), + .flags = CFTYPE_NOT_ON_ROOT, + }, + { } /* terminate */ +}; + +struct cgroup_subsys ioasids_cgrp_subsys = { + .css_alloc = ioasids_css_alloc, + .css_free = ioasids_css_free, + .can_attach = ioasids_can_attach, + .cancel_attach = ioasids_cancel_attach, + .legacy_cftypes = ioasids_files, + 
.dfl_cftypes = ioasids_files, + .threaded = false, +}; + -- Gitee From bb646984f83c19ce069b53faf105f434cbd54f93 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 18 Feb 2021 09:10:00 -0800 Subject: [PATCH 1445/2161] iommu/ioasid: Consult IOASIDs cgroup for allocation commit 094f065a4ed32287d6d1f60893de325238b0cea5 Intel-BKC. Once IOASIDs cgroup is active, we must consult the limitation set up by the cgroups during allocation. Freeing IOASIDs also need to return the quota back to the cgroup. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 762bd6bd794e..ea061c1f66e4 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -782,7 +782,10 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, spin_lock(&ioasid_allocator_lock); /* Check if the IOASID set has been allocated and initialized */ - if (!ioasid_set_is_valid(set)) + if (!set || !ioasid_set_is_valid(set)) + goto done_unlock; + + if (set->type == IOASID_SET_TYPE_MM && ioasid_cg_charge(set)) goto done_unlock; if (set->quota <= atomic_read(&set->nr_ioasids)) { @@ -831,6 +834,7 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, goto done_unlock; exit_free: kfree(data); + ioasid_cg_uncharge(set); done_unlock: spin_unlock(&ioasid_allocator_lock); return id; @@ -848,6 +852,7 @@ static void ioasid_do_free_locked(struct ioasid_data *data) kfree_rcu(ioasid_data, rcu); } atomic_dec(&data->set->nr_ioasids); + ioasid_cg_uncharge(data->set); xa_erase(&data->set->xa, data->id); /* Destroy the set if empty */ if (!atomic_read(&data->set->nr_ioasids)) -- Gitee From 3a285a19d4babf1535015f25520248118ec560b6 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 17 Feb 2021 17:41:54 -0800 Subject: [PATCH 1446/2161] docs: cgroup-v1: Add IOASIDs controller commit bf826e84ce53d63c4274edb8b178ce44f3398b32 Intel-BKC. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/admin-guide/cgroup-v1/index.rst | 1 + .../admin-guide/cgroup-v1/ioasids.rst | 110 ++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 Documentation/admin-guide/cgroup-v1/ioasids.rst diff --git a/Documentation/admin-guide/cgroup-v1/index.rst b/Documentation/admin-guide/cgroup-v1/index.rst index 10bf48bae0b0..9498e09f6c4e 100644 --- a/Documentation/admin-guide/cgroup-v1/index.rst +++ b/Documentation/admin-guide/cgroup-v1/index.rst @@ -13,6 +13,7 @@ Control Groups version 1 devices freezer-subsystem hugetlb + ioasids memcg_test memory net_cls diff --git a/Documentation/admin-guide/cgroup-v1/ioasids.rst b/Documentation/admin-guide/cgroup-v1/ioasids.rst new file mode 100644 index 000000000000..b30eb41bf1be --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/ioasids.rst @@ -0,0 +1,110 @@ +======================================== +I/O Address Space ID (IOASID) Controller +======================================== + +Acronyms +-------- +PASID: + Process Address Space ID, defined by PCIe +SVA: + Shared Virtual Address + +Introduction +------------ + +IOASIDs are used to associate DMA requests with virtual address spaces. As +a system-wide limited¹ resource, its constraints are managed by the IOASIDs +cgroup subsystem. The specific use cases are: + +1. Some user applications exhaust all the available IOASIDs thus depriving + others of the same host. + +2. 
System admins need to provision VMs based on their needs for IOASIDs, + e.g. the number of VMs with assigned devices that perform DMA requests + with PASID. + +The IOASID subsystem consists of three components: + +- IOASID core: provides APIs for allocation, pool management, + notifications and refcounting. See Documentation/driver-api/ioasid.rst + for details +- IOASID user: provides user allocation interface via /dev/ioasid +- IOASID cgroup controller: manage resource distribution + +Resource Distribution Model +--------------------------- +IOASID allocation is process-based in that IOASIDs are tied to page tables², +the threaded model is not supported. The allocation is rejected by the +cgroup hierarchy once a limit is reached. However, organizational changes +such as moving processes across cgroups are exempted. Therefore, it is +possible to have ioasids.current > ioasids.max. It is not possible to do +further allocation after the organizational change that exceeds the max. + +The system capacity of the IOASIDs is default to PCIe PASID size of 20 bits. +IOASID core provides API to adjust the system capacity based on platforms. +IOASIDs are used by both user applications (e.g. VMs and userspace drivers) +and kernel (e.g. supervisor SVA). However, only user allocation is subject +to cgroup constraints. Host kernel allocates a pool of IOASIDs where its +quota is subtracted from the system capacity. IOASIDs cgroup consults with +the IOASID core for available capacity when a new cgroup limit is granted. +Upon creation, no IOASID allocation is allowed by the user processes within +the new cgroup. + +Usage +----- +CGroup filesystem has the following IOASIDs controller specific entries: +:: + + ioasids.current + ioasids.events + ioasids.max + +To use the IOASIDs controller, set ioasids.max to the limit of the number +of IOASIDs that can be allocated. The file ioasids.current shows the current +number of IOASIDs allocated within the cgroup. + +Example +-------- +1. Mount the cgroup2 FS :: + + $ mount -t cgroup2 none /mnt/cg2/ + +2. Add ioasids controller :: + + $ echo '+ioasids' > /mnt/cg2/cgroup.subtree_control + +3. Create a hierarchy, set non-zero limit (default 0) :: + + $ mkdir /mnt/cg2/test1 + $ echo 5 > /mnt/cg2/test1/ioasids.max + +4. Allocate IOASIDs within limit should succeed :: + + $echo $$ > /mnt/cg2/test1/cgroup.procs + Do IOASID allocation via /dev/ioasid + ioasids.current:1 + ioasids.max:5 + +5. Attempt to allocate IOASIDs beyond limit should fail :: + + ioasids.current:5 + ioasids.max:5 + +6. Attach a new process with IOASID already allocated to a cgroup could +result in ioasids.current > ioasids.max, e.g. process with PID 1234 under +a cgroup with IOASIDs controller has one IOASID allocated, moving it to +test1 cgroup :: + + $echo 1234 > /mnt/cg2/test1/cgroup.procs + ioasids.current:6 + ioasids.max:5 + +Notes +----- +¹ When IOASID is used for PCI Express PASID, the range is limited to the +PASID size of 20 bits. For a device that its resources can be shared across +the platform, the IOASID namespace must be system-wide in order to uniquely +identify DMA request with PASID inside the device. + +² The primary use case is SVA, where CPU page tables are shared with DMA via +IOMMU. -- Gitee From 68f6eca763fd041d1ffc0587aa41e82dce4977d7 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 1 Jun 2022 14:38:31 +0800 Subject: [PATCH 1447/2161] ioasid: Add /dev/ioasid for userspace commit ff6ff5e316b6bd83ea35dff8d88e7a3daa4000de Intel-BKC. 
I/O Address Space IDs (IOASIDs) is used to tag DMA requests to target multiple DMA address spaces for physical devices. Its PCI terminology is called PASID (Process Address Space ID). Platforms with PASID support can provide PASID granularity DMA isolation, which is very useful for efficient and secure device sharing (SVA, subdevice passthrough, etc.). Today only kernel drivers are allowed to allocate IOASIDs [1]. This patch aims to extend this capability to userspace as required in device pass- through scenarios. For example, a userspace driver may want to create its own DMA address spaces besides the default IOVA address space established by the kernel on the assigned device (e.g. vDPA control vq [2] and guest SVA [3]), thus need to get IOASIDs from the kernel IOASID allocator for tagging. In concept, each device can have its own IOASID space, thus it's also possible for userspace driver to manage a private IOASID space itself, say, when PF/VF is assigned. However it doesn't work for subdevice pass- through, as multiple subdevices under the same parent device share a single IOASID space thus IOASIDs must be centrally managed by the kernel in such case. This patch introduces a /dev/ioasid interface for this purpose (per discussion in [4]). An IOASID is just a number before it is tagged to a specific DMA address space. The actual IOASID tagging (to DMA requests) and association (with DMA address spaces) operations from userspace are scrutinized by specific device passthrough frameworks, which must ensure that a malicious driver cannot program arbitrary IOASIDs to its assigned device to access DMA address spaces that don't belong to it, this is out of the scope of this patch (a reference VFIO implementation will be posted soon). Open: PCIe PASID is 20bit implying a space with 1M IOASIDs. although it's plenty there was an open [4] on whether this user interface is open to all processes or only selective processes (e.g. with device assigned). In this patchseries, a cgroup controller is introduced to manage IOASID quota that a process is allowed to use. A cgroup-enabled system may by default set quota=0 to disallow IOASID allocation for most processes, and then having the virt management stack to adjust the quota for a process which gets device assigned. But yeah, we are also willing to hear more suggestions. 
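For illustration only (an editorial sketch, not part of the patch), a user process would be expected to drive this interface roughly as below, assuming the uAPI header added by this patch is installed as <linux/ioasid.h>:

    /* sketch.c: illustrative flow against /dev/ioasid; error handling trimmed */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/ioasid.h>

    int main(void)
    {
        struct ioasid_alloc_request req;
        struct ioasid_info info;
        int fd, ioasid;

        fd = open("/dev/ioasid", O_RDWR);   /* only one open per process */
        if (fd < 0)
            return 1;

        if (ioctl(fd, IOASID_GET_API_VERSION) != IOASID_API_VERSION)
            goto out;

        memset(&info, 0, sizeof(info));
        info.argsz = sizeof(info);
        if (ioctl(fd, IOASID_GET_INFO, &info))
            goto out;
        printf("supported IOASID bits: %u\n", info.ioasid_bits);

        memset(&req, 0, sizeof(req));
        req.argsz = sizeof(req);
        req.range.min = 1;
        req.range.max = 0xFFFFF;            /* stay within the 20-bit PASID space */
        ioasid = ioctl(fd, IOASID_REQUEST_ALLOC, &req);
        if (ioasid < 0)
            goto out;
        printf("allocated IOASID %d\n", ioasid);

        ioctl(fd, IOASID_REQUEST_FREE, &ioasid);
    out:
        close(fd);
        return 0;
    }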
[1] https://lore.kernel.org/linux-iommu/1565900005-62508-8-git-send-email-jacob.jun.pan@linux.intel.com/ [2] https://lore.kernel.org/kvm/20201216064818.48239-1-jasowang@redhat.com/ [3] https://lore.kernel.org/linux-iommu/1599734733-6431-1-git-send-email-yi.l.liu@intel.com/ [4] https://lore.kernel.org/kvm/20201014171055.328a52f4@w520.home/ Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/userspace-api/index.rst | 1 + Documentation/userspace-api/ioasid.rst | 49 ++++ drivers/iommu/Kconfig | 5 + drivers/iommu/Makefile | 1 + drivers/iommu/intel/Kconfig | 1 + drivers/iommu/ioasid_user.c | 298 +++++++++++++++++++++++++ include/linux/ioasid.h | 26 +++ include/linux/miscdevice.h | 1 + include/uapi/linux/ioasid.h | 98 ++++++++ 9 files changed, 480 insertions(+) create mode 100644 Documentation/userspace-api/ioasid.rst create mode 100644 drivers/iommu/ioasid_user.c create mode 100644 include/uapi/linux/ioasid.h diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index ad494da40009..b7c95a13f134 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -21,6 +21,7 @@ place where this information is gathered. unshare spec_ctrl accelerators/ocxl + ioasid .. only:: subproject and html diff --git a/Documentation/userspace-api/ioasid.rst b/Documentation/userspace-api/ioasid.rst new file mode 100644 index 000000000000..879d6cbae858 --- /dev/null +++ b/Documentation/userspace-api/ioasid.rst @@ -0,0 +1,49 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. ioasid: + +===================================== +IOASID Userspace API +===================================== + +The IOASID UAPI is used for userspace IOASID allocation/free requests, +thus IOASID management is centralized in the IOASID core[1] in the kernel. The +primary use case is guest Shared Virtual Address (SVA) today. + +Requests such as allocation/free can be issued by the users and managed +on a per-process basis through the ioasid core. Upon opening ("/dev/ioasid"), +a process obtains a unique handle associated with the process's mm_struct. +This handle is mapped to an FD in the userspace. Only a single open is +allowed per process. + +File descriptors can be transferred across processes by employing fork() or +UNIX domain socket. FDs obtained by transfer cannot be used to perform +IOASID requests. The following behaviors are recommended for the +applications: + + - forked children close the parent's IOASID FDs immediately, open new + /dev/ioasid FDs if IOASID allocation is desired + + - do not share FDs via UNIX domain socket, e.g. via sendmsg + +================ +Userspace APIs +================ + +/dev/ioasid provides below ioctls: + +*) IOASID_GET_API_VERSION: returns the API version, userspace should check + the API version first with the one it has embedded. +*) IOASID_GET_INFO: returns the information on the /dev/ioasid. + - ioasid_bits: the ioasid bit width supported by this uAPI, userspace + should check the ioasid_bits returned by this ioctl with the ioasid + bits it wants and should fail if it's smaller than the one that + userspace wants, otherwise, allocation will be failed. +*) IOASID_REQUEST_ALLOC: returns an IOASID which is allocated in kernel within + the specified ioasid range. +*) IOASID_REQUEST_FREE: free an IOASID per userspace's request. + +For detailed definition, please see include/uapi/linux/ioasid.h. + +.. 
contents:: :local: + +[1] Documentation/driver-api/ioasid.rst diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 6506e59523d0..b140e8c55354 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -7,6 +7,11 @@ config IOMMU_IOVA config IOASID tristate +config IOASID_USER + tristate + depends on IOASID + default n + # IOMMU_API always gets selected by whoever wants it. config IOMMU_API bool diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index ca05f1026d95..02a9d77435fa 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o obj-$(CONFIG_IOASID) += ioasid.o +obj-$(CONFIG_IOASID_USER) += ioasid_user.o obj-$(CONFIG_IOMMU_IOVA) += iova.o obj-$(CONFIG_OF_IOMMU) += of_iommu.o obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 6436039b3512..88eb316da2ba 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -18,6 +18,7 @@ config INTEL_IOMMU select DMAR_TABLE select SWIOTLB select IOASID + select IOASID_USER select IOMMU_DMA select PCI_ATS help diff --git a/drivers/iommu/ioasid_user.c b/drivers/iommu/ioasid_user.c new file mode 100644 index 000000000000..90513a32e9d7 --- /dev/null +++ b/drivers/iommu/ioasid_user.c @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Support IOASID allocation/free from user space. + * + * Copyright (C) 2021 Intel Corporation. + * Author: Liu Yi L + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "Liu Yi L " +#define DRIVER_DESC "IOASID management for user space" + +/* Current user ioasid uapi supports 31 bits */ +#define IOASID_BITS 31 + +struct ioasid_user_token { + unsigned long long val; +}; + +struct ioasid_user { + struct kref kref; + struct ioasid_set *ioasid_set; + struct mutex lock; + struct list_head next; + struct ioasid_user_token token; +}; + +static struct mutex ioasid_user_lock; +static struct list_head ioasid_user_list; + +/* called with ioasid_user_lock held */ +static void ioasid_user_release(struct kref *kref) +{ + struct ioasid_user *iuser = container_of(kref, struct ioasid_user, kref); + + ioasid_free_all_in_set(iuser->ioasid_set); + list_del(&iuser->next); + mutex_unlock(&ioasid_user_lock); + ioasid_set_free(iuser->ioasid_set); + kfree(iuser); +} + +void ioasid_user_put(struct ioasid_user *iuser) +{ + kref_put_mutex(&iuser->kref, ioasid_user_release, &ioasid_user_lock); +} +EXPORT_SYMBOL_GPL(ioasid_user_put); + +static void ioasid_user_get(struct ioasid_user *iuser) +{ + kref_get(&iuser->kref); +} + +struct ioasid_user *ioasid_user_get_from_task(struct task_struct *task) +{ + struct mm_struct *mm = get_task_mm(task); + unsigned long long val = (unsigned long long)mm; + struct ioasid_user *iuser; + bool found = false; + + if (!mm) + return NULL; + + mutex_lock(&ioasid_user_lock); + /* Search existing ioasid_user with current mm pointer */ + list_for_each_entry(iuser, &ioasid_user_list, next) { + if (iuser->token.val == val) { + ioasid_user_get(iuser); + found = true; + break; + } + } + + mmput(mm); + + mutex_unlock(&ioasid_user_lock); + return found ? 
iuser : NULL; +} +EXPORT_SYMBOL_GPL(ioasid_user_get_from_task); + +void ioasid_user_for_each_id(struct ioasid_user *iuser, void *data, + void (*fn)(ioasid_t id, void *data)) +{ + mutex_lock(&iuser->lock); + ioasid_set_for_each_ioasid(iuser->ioasid_set, fn, data); + mutex_unlock(&iuser->lock); +} +EXPORT_SYMBOL_GPL(ioasid_user_for_each_id); + +static int ioasid_fops_open(struct inode *inode, struct file *filep) +{ + struct mm_struct *mm = get_task_mm(current); + unsigned long long val = (unsigned long long)mm; + struct ioasid_set *iset; + struct ioasid_user *iuser; + int ret = 0; + + mutex_lock(&ioasid_user_lock); + /* Only allow one single open per process */ + list_for_each_entry(iuser, &ioasid_user_list, next) { + if (iuser->token.val == val) { + ret = -EBUSY; + goto out; + } + } + + iuser = kzalloc(sizeof(*iuser), GFP_KERNEL); + if (!iuser) { + ret = -ENOMEM; + goto out; + } + + /* + * IOASID core provides a 'IOASID set' concept to track all + * IOASIDs associated with a token. Here we use mm_struct as + * the token and create a IOASID set per mm_struct. All the + * containers of the process share the same IOASID set. + */ + iset = ioasid_set_alloc(mm, 0, IOASID_SET_TYPE_MM); + if (IS_ERR(iset)) { + kfree(iuser); + ret = PTR_ERR(iset); + goto out; + } + + iuser->ioasid_set = iset; + kref_init(&iuser->kref); + iuser->token.val = val; + mutex_init(&iuser->lock); + filep->private_data = iuser; + + list_add(&iuser->next, &ioasid_user_list); +out: + mutex_unlock(&ioasid_user_lock); + mmput(mm); + return ret; +} + +static int ioasid_fops_release(struct inode *inode, struct file *filep) +{ + struct ioasid_user *iuser = filep->private_data; + + filep->private_data = NULL; + + ioasid_user_put(iuser); + + return 0; +} + +static int ioasid_get_info(struct ioasid_user *iuser, unsigned long arg) +{ + struct ioasid_info info; + unsigned long minsz; + + minsz = offsetofend(struct ioasid_info, ioasid_bits); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz || info.flags) + return -EINVAL; + + info.ioasid_bits = IOASID_BITS; + + return copy_to_user((void __user *)arg, &info, minsz) ? 
-EFAULT : 0; +} + +static int ioasid_alloc_request(struct ioasid_user *iuser, unsigned long arg) +{ + struct ioasid_alloc_request req; + unsigned long minsz; + ioasid_t ioasid; + + minsz = offsetofend(struct ioasid_alloc_request, range); + + if (copy_from_user(&req, (void __user *)arg, minsz)) + return -EFAULT; + + if (req.argsz < minsz || req.flags) + return -EINVAL; + + if (req.range.min > req.range.max || + req.range.min >= (1 << IOASID_BITS) || + req.range.max >= (1 << IOASID_BITS)) + return -EINVAL; + + ioasid = ioasid_alloc(iuser->ioasid_set, req.range.min, + req.range.max, NULL); + + if (ioasid == INVALID_IOASID) + return -EINVAL; + + return ioasid; + +} + +static int ioasid_free_request(struct ioasid_user *iuser, unsigned long arg) +{ + int ioasid; + + if (copy_from_user(&ioasid, (void __user *)arg, sizeof(ioasid))) + return -EFAULT; + + if (ioasid < 0) + return -EINVAL; + + ioasid_free(iuser->ioasid_set, ioasid); + + return 0; +} + +static long ioasid_fops_unl_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + struct ioasid_user *iuser = filep->private_data; + long ret = -EINVAL; + + if (!iuser) + return ret; + + mutex_lock(&iuser->lock); + + switch (cmd) { + case IOASID_GET_API_VERSION: + ret = IOASID_API_VERSION; + break; + case IOASID_GET_INFO: + ret = ioasid_get_info(iuser, arg); + break; + case IOASID_REQUEST_ALLOC: + ret = ioasid_alloc_request(iuser, arg); + break; + case IOASID_REQUEST_FREE: + ret = ioasid_free_request(iuser, arg); + break; + default: + pr_err("Unsupported cmd %u\n", cmd); + break; + } + + mutex_unlock(&iuser->lock); + return ret; +} + +static const struct file_operations ioasid_user_fops = { + .owner = THIS_MODULE, + .open = ioasid_fops_open, + .release = ioasid_fops_release, + .unlocked_ioctl = ioasid_fops_unl_ioctl, +}; + +static struct miscdevice ioasid_user = { + .minor = IOASID_MINOR, + .name = "ioasid_user", + .fops = &ioasid_user_fops, + .nodename = "ioasid", + .mode = S_IRUGO | S_IWUGO, +}; + + +static int __init ioasid_user_init(void) +{ + int ret; + + ret = misc_register(&ioasid_user); + if (ret) { + pr_err("ioasid_user: misc device register failed\n"); + return ret; + } + + mutex_init(&ioasid_user_lock); + INIT_LIST_HEAD(&ioasid_user_list); + return 0; +} + +static void __exit ioasid_user_exit(void) +{ + WARN_ON(!list_empty(&ioasid_user_list)); + misc_deregister(&ioasid_user); +} + +module_init(ioasid_user_init); +module_exit(ioasid_user_exit); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index 03d3dd1d14e0..d6e0f20ef7d3 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -6,6 +6,7 @@ #include #include #include +#include #define INVALID_IOASID ((ioasid_t)-1) typedef unsigned int ioasid_t; @@ -152,6 +153,31 @@ static inline int ioasid_cg_uncharge(struct ioasid_set *set) #endif /* CGROUP_IOASIDS */ bool ioasid_queue_work(struct work_struct *work); +/* IOASID userspace support */ +struct ioasid_user; +#if IS_ENABLED(CONFIG_IOASID_USER) +extern struct ioasid_user *ioasid_user_get_from_task(struct task_struct *task); +extern void ioasid_user_put(struct ioasid_user *iuser); +extern void ioasid_user_for_each_id(struct ioasid_user *iuser, void *data, + void (*fn)(ioasid_t id, void *data)); + +#else /* CONFIG_IOASID_USER */ +static inline struct ioasid_user * +ioasid_user_get_from_task(struct task_struct *task) +{ + return ERR_PTR(-ENOTTY); +} + +static inline void 
ioasid_user_put(struct ioasid_user *iuser) +{ +} + +static inline void ioasid_user_for_each_id(struct ioasid_user *iuser, void *data, + void (*fn)(ioasid_t id, void *data)) +{ +} +#endif /* CONFIG_IOASID_USER */ + static inline bool pasid_valid(ioasid_t ioasid) { return ioasid != INVALID_IOASID; diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index b06b75776a32..bb2012cd8e6d 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -21,6 +21,7 @@ #define APOLLO_MOUSE_MINOR 7 /* unused */ #define PC110PAD_MINOR 9 /* unused */ /*#define ADB_MOUSE_MINOR 10 FIXME OBSOLETE */ +#define IOASID_MINOR 129 /* /dev/ioasid */ #define WATCHDOG_MINOR 130 /* Watchdog timer */ #define TEMP_MINOR 131 /* Temperature Sensor */ #define APM_MINOR_DEV 134 diff --git a/include/uapi/linux/ioasid.h b/include/uapi/linux/ioasid.h new file mode 100644 index 000000000000..1529070c0317 --- /dev/null +++ b/include/uapi/linux/ioasid.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * PASID (Processor Address Space ID) is a PCIe concept for tagging + * address spaces in DMA requests. When system-wide PASID allocation + * is required by the underlying iommu driver (e.g. Intel VT-d), this + * provides an interface for userspace to request ioasid alloc/free + * for its assigned devices. + * + * Copyright (C) 2021 Intel Corporation. All rights reserved. + * Author: Liu Yi L + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef _UAPI_IOASID_H +#define _UAPI_IOASID_H + +#include +#include +#include + +#define IOASID_API_VERSION 0 + + +/* Kernel & User level defines for IOASID IOCTLs. */ + +#define IOASID_TYPE ('i') +#define IOASID_BASE 100 + +/* -------- IOCTLs for IOASID file descriptor (/dev/ioasid) -------- */ + +/** + * IOASID_GET_API_VERSION - _IO(IOASID_TYPE, IOASID_BASE + 0) + * + * Report the version of the IOASID API. This allows us to bump the entire + * API version should we later need to add or change features in incompatible + * ways. + * Return: IOASID_API_VERSION + * Availability: Always + */ +#define IOASID_GET_API_VERSION _IO(IOASID_TYPE, IOASID_BASE + 0) + +/** + * IOASID_GET_INFO - _IOR(IOASID_TYPE, IOASID_BASE + 1, struct ioasid_info) + * + * Retrieve information about the IOASID object. Fills in provided + * struct ioasid_info. Caller sets argsz. + * + * @argsz: user filled size of this data. + * @flags: currently reserved for future extension. must set to 0. + * @ioasid_bits: maximum supported PASID bits, 0 represents no PASID + * support. + + * Availability: Always + */ +struct ioasid_info { + __u32 argsz; + __u32 flags; + __u32 ioasid_bits; +}; +#define IOASID_GET_INFO _IO(IOASID_TYPE, IOASID_BASE + 1) + +/** + * IOASID_REQUEST_ALLOC - _IOWR(IOASID_TYPE, IOASID_BASE + 2, + * struct ioasid_request) + * + * Alloc a PASID within @range. @range is [min, max], which means both + * @min and @max are inclusive. + * User space should provide min, max no more than the ioasid bits reports + * in ioasid_info via IOASID_GET_INFO. + * + * @argsz: user filled size of this data. + * @flags: currently reserved for future extension. must set to 0. + * @range: allocated ioasid is expected in the range. 
+ * + * returns: allocated ID on success, -errno on failure + */ +struct ioasid_alloc_request { + __u32 argsz; + __u32 flags; + struct { + __u32 min; + __u32 max; + } range; +}; +#define IOASID_REQUEST_ALLOC _IO(IOASID_TYPE, IOASID_BASE + 2) + +/** + * IOASID_REQUEST_FREE - _IOWR(IOASID_TYPE, IOASID_BASE + 3, int) + * + * Free a PASID. + * + * returns: 0 on success, -errno on failure + */ +#define IOASID_REQUEST_FREE _IO(IOASID_TYPE, IOASID_BASE + 3) + +#endif /* _UAPI_IOASID_H */ -- Gitee From ffdbfa34bc68f2bdc37ce7ccc082e4c4d9f5c1da Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Mon, 1 Mar 2021 16:33:54 +0800 Subject: [PATCH 1448/2161] iommu/vt-d: Check ownership in sva cache invalidate commit 3fb8c9c438ef38d31e18134df4c1488f7c3388e0 Intel-BKC. Change-Id: I4f945e6d361b41633828cac2c34d0efb14f133d5 Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 88123c0dc03b..2e91034d775b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5084,6 +5084,10 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) pasid = inv_info->granu.addr_info.pasid; + ret = ioasid_get_if_owned(pasid); + if (ret) + goto out_unlock; + switch (BIT(cache_type)) { case IOMMU_CACHE_INV_TYPE_IOTLB: /* HW will ignore LSB bits based on address mask */ @@ -5140,6 +5144,7 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, cache_type); ret = -EINVAL; } + ioasid_put(NULL, pasid); } out_unlock: spin_unlock(&iommu->lock); -- Gitee From 55c367139e6ad6bcc5b9f9bc678b9adebc946402 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Tue, 14 Apr 2020 05:10:07 -0700 Subject: [PATCH 1449/2161] iommu: Report domain nesting info commit 156d70f99a517d8dc467a47d6a8661c3bef56cff Intel-BKC. IOMMUs that support nesting translation needs report the capability info to userspace. It gives information about requirements the userspace needs to implement plus other features characterizing the physical implementation. This patch introduces a new IOMMU UAPI struct that gives information about the nesting capabilities and features. This struct is supposed to be returned by iommu_domain_get_attr() with DOMAIN_ATTR_NESTING attribute parameter, with one domain whose type has been set to DOMAIN_ATTR_NESTING. v7 -> v8: *) add padding in struct iommu_nesting_info_vtd *) describe future extension rules for struct iommu_nesting_info in iommu.rst. *) remove SYSWIDE_PASID v6 -> v7: *) rephrase the commit message, replace the @data[] field in struct iommu_nesting_info with union per comments from Eric Auger. v5 -> v6: *) rephrase the feature notes per comments from Eric Auger. *) rename @size of struct iommu_nesting_info to @argsz. v4 -> v5: *) address comments from Eric Auger. v3 -> v4: *) split the SMMU driver changes to be a separate patch *) move the @addr_width and @pasid_bits from vendor specific part to generic part. *) tweak the description for the @features field of struct iommu_nesting_info. *) add description on the @data[] field of struct iommu_nesting_info v2 -> v3: *) remvoe cap/ecap_mask in iommu_nesting_info. *) reuse DOMAIN_ATTR_NESTING to get nesting info. *) return an empty iommu_nesting_info for SMMU drivers per Jean' suggestion. 
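As an editorial aside (not part of the patch), whichever channel eventually carries this structure to userspace (VFIO is the assumed consumer), the receiver would interpret it along these lines; the helper name below is made up:

    /* illustrative consumer of struct iommu_nesting_info from the uAPI header */
    #include <stddef.h>
    #include <stdio.h>
    #include <linux/iommu.h>

    static int example_parse_nesting_info(const struct iommu_nesting_info *info,
                                          size_t buf_size)
    {
        if (info->argsz > buf_size)
            return -1;   /* kernel describes a newer, larger layout than we hold */

        printf("format %u addr_width %u pasid_bits %u\n",
               info->format, info->addr_width, info->pasid_bits);

        if (info->features & IOMMU_NESTING_FEAT_BIND_PGTBL)
            printf("must bind guest page tables through IOMMU UAPI\n");
        if (info->features & IOMMU_NESTING_FEAT_CACHE_INVLD)
            printf("must forward first-level cache invalidations through IOMMU UAPI\n");

        if (info->format == IOMMU_PASID_FORMAT_INTEL_VTD)
            printf("vtd cap %#llx ecap %#llx\n",
                   (unsigned long long)info->vendor.vtd.cap_reg,
                   (unsigned long long)info->vendor.vtd.ecap_reg);
        return 0;
    }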
Cc: Kevin Tian CC: Jacob Pan Cc: Alex Williamson Cc: Eric Auger Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Lu Baolu Signed-off-by: Liu Yi L Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/userspace-api/iommu.rst | 5 +- include/uapi/linux/iommu.h | 72 +++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/Documentation/userspace-api/iommu.rst b/Documentation/userspace-api/iommu.rst index d3108c1519d5..ad06bb94aad5 100644 --- a/Documentation/userspace-api/iommu.rst +++ b/Documentation/userspace-api/iommu.rst @@ -26,6 +26,7 @@ supported user-kernel APIs are as follows: 2. Bind/Unbind guest PASID table (e.g. ARM SMMU) 3. Invalidate IOMMU caches upon guest requests 4. Report errors to the guest and serve page requests +5. Read iommu_nesting_info from kernel Requirements ============ @@ -96,7 +97,9 @@ kernel. Simply recompiling existing code with newer kernel header should not be an issue in that only existing flags are used. IOMMU vendor driver should report the below features to IOMMU UAPI -consumers (e.g. via VFIO). +consumers (e.g. via VFIO). The feature list is passed by struct +iommu_nesting_info. The future extension to this structure follows +the rule defined in section "Extension Rules & Precautions". 1. IOMMU_NESTING_FEAT_SYSWIDE_PASID 2. IOMMU_NESTING_FEAT_BIND_PGTBL diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 59178fc229ca..e333b7224436 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -339,4 +339,76 @@ struct iommu_gpasid_bind_data { } vendor; }; +/* + * struct iommu_nesting_info_vtd - Intel VT-d specific nesting info. + * + * @flags: VT-d specific flags. Currently reserved for future + * extension. must be set to 0. + * @cap_reg: Describe basic capabilities as defined in VT-d capability + * register. + * @ecap_reg: Describe the extended capabilities as defined in VT-d + * extended capability register. + */ +struct iommu_nesting_info_vtd { + __u32 flags; + __u8 padding[12]; + __u64 cap_reg; + __u64 ecap_reg; +}; + +/* + * struct iommu_nesting_info - Information for nesting-capable IOMMU. + * userspace should check it before using + * nesting capability. + * + * @argsz: size of the whole structure. + * @flags: currently reserved for future extension. must set to 0. + * @format: PASID table entry format, the same definition as struct + * iommu_gpasid_bind_data @format. + * @features: supported nesting features. + * @addr_width: the output addr width of first level/stage translation + * @pasid_bits: maximum supported PASID bits, 0 represents no PASID + * support. + * @vendor: vendor specific data, structure type can be deduced from + * @format field. + * + * +===============+======================================================+ + * | feature | Notes | + * +===============+======================================================+ + * | BIND_PGTBL | IOMMU vendor driver sets it to mandate userspace to | + * | | bind the first level/stage page table to associated | + * | | PASID (either the one specified in bind request or | + * | | the default PASID of iommu domain), through IOMMU | + * | | UAPI. | + * +---------------+------------------------------------------------------+ + * | CACHE_INVLD | IOMMU vendor driver sets it to mandate userspace to | + * | | explicitly invalidate the IOMMU cache through IOMMU | + * | | UAPI according to vendor-specific requirement when | + * | | changing the 1st level/stage page table. 
| + * +---------------+------------------------------------------------------+ + * + * data struct types defined for @format: + * +================================+=====================================+ + * | @format | data struct | + * +================================+=====================================+ + * | IOMMU_PASID_FORMAT_INTEL_VTD | struct iommu_nesting_info_vtd | + * +--------------------------------+-------------------------------------+ + * + */ +struct iommu_nesting_info { + __u32 argsz; + __u32 flags; + __u32 format; +#define IOMMU_NESTING_FEAT_BIND_PGTBL (1 << 0) +#define IOMMU_NESTING_FEAT_CACHE_INVLD (1 << 1) + __u32 features; + __u16 addr_width; + __u16 pasid_bits; + __u8 padding[12]; + /* Vendor specific data */ + union { + struct iommu_nesting_info_vtd vtd; + } vendor; +}; + #endif /* _UAPI_IOMMU_H */ -- Gitee From 2d5d252621ffd092f5c85ae47046e64c3c8eb2b5 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Tue, 14 Apr 2020 05:10:07 -0700 Subject: [PATCH 1450/2161] iommu/vt-d: Support reporting nesting capability info commit 24952f3cc4ff598504c91489148528a9daee60f9 Intel-BKC. This patch reports nesting info when iommu_domain_get_attr() is called with DOMAIN_ATTR_NESTING and one domain with nesting set. v7 -> v8: *) tweak per latest code base v6 -> v7: *) split the patch in v6 into two patches: [PATCH v7 15/16] iommu/vt-d: Only support nesting when nesting caps are consistent across iommu units [PATCH v7 16/16] iommu/vt-d: Support reporting nesting capability info v2 -> v3: *) remove cap/ecap_mask in iommu_nesting_info. Cc: Kevin Tian CC: Jacob Pan Cc: Alex Williamson Cc: Eric Auger Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Lu Baolu Signed-off-by: Liu Yi L Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/cap_audit.h | 7 ++ drivers/iommu/intel/iommu.c | 117 ++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+) diff --git a/drivers/iommu/intel/cap_audit.h b/drivers/iommu/intel/cap_audit.h index d07b75938961..86f2c8e907f7 100644 --- a/drivers/iommu/intel/cap_audit.h +++ b/drivers/iommu/intel/cap_audit.h @@ -60,6 +60,13 @@ #define ECAP_QI_MASK BIT_ULL(1) #define ECAP_C_MASK BIT_ULL(0) +/* Capabilities related to nested translation */ +#define VTD_CAP_MASK (CAP_FL1GP_MASK | CAP_FL5LP_MASK) + +#define VTD_ECAP_MASK (ECAP_PRS_MASK | ECAP_ERS_MASK | \ + ECAP_SRS_MASK | ECAP_EAFS_MASK | \ + ECAP_PASID_MASK) + /* * u64 intel_iommu_cap_sanity, intel_iommu_ecap_sanity will be adjusted as each * IOMMU gets audited. diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2e91034d775b..16a393dd6be4 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5649,6 +5649,123 @@ static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain, return attach_deferred(dev); } +static bool domain_use_flush_queue(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + bool r = true; + + if (intel_iommu_strict) + return false; + + /* + * The flush queue implementation does not perform page-selective + * invalidations that are required for efficient TLB flushes in virtual + * environments. The benefit of batching is likely to be much lower than + * the overhead of synchronizing the virtual and physical IOMMU + * page-tables. 
+ */ + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + if (!cap_caching_mode(iommu->cap)) + continue; + + pr_warn_once("IOMMU batching is disabled due to virtualization"); + r = false; + break; + } + rcu_read_unlock(); + + return r; +} + +static int intel_iommu_get_nesting_info(struct iommu_domain *domain, + struct iommu_nesting_info *info) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + u64 cap = VTD_CAP_MASK, ecap = VTD_ECAP_MASK; + struct device_domain_info *domain_info; + struct iommu_nesting_info_vtd vtd; + unsigned int size; + + if (!info) + return -EINVAL; + + if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE)) + return -ENODEV; + + size = sizeof(struct iommu_nesting_info); + /* + * if provided buffer size is smaller than expected, should + * return 0 and also the expected buffer size to caller. + */ + if (info->argsz < size) { + info->argsz = size; + return 0; + } + + /* + * arbitrary select the first domain_info as all nesting + * related capabilities should be consistent across iommu + * units. + */ + domain_info = list_first_entry(&dmar_domain->devices, + struct device_domain_info, link); + cap &= domain_info->iommu->cap; + ecap &= domain_info->iommu->ecap; + + info->addr_width = dmar_domain->gaw; + info->format = IOMMU_PASID_FORMAT_INTEL_VTD; + info->features = IOMMU_NESTING_FEAT_BIND_PGTBL | + IOMMU_NESTING_FEAT_CACHE_INVLD; + info->pasid_bits = ilog2(intel_pasid_max_id); + memset(&info->padding, 0x0, 12); + + vtd.flags = 0; + memset(&vtd.padding, 0x0, 12); + vtd.cap_reg = cap & VTD_CAP_MASK; + vtd.ecap_reg = ecap & VTD_ECAP_MASK; + + memcpy(&info->vendor.vtd, &vtd, sizeof(vtd)); + return 0; +} + +static int +intel_iommu_domain_get_attr(struct iommu_domain *domain, + enum iommu_attr attr, void *data) +{ + switch (domain->type) { + case IOMMU_DOMAIN_UNMANAGED: + switch (attr) { + case DOMAIN_ATTR_NESTING: + { + struct iommu_nesting_info *info = + (struct iommu_nesting_info *)data; + unsigned long flags; + int ret; + + spin_lock_irqsave(&device_domain_lock, flags); + ret = intel_iommu_get_nesting_info(domain, info); + spin_unlock_irqrestore(&device_domain_lock, flags); + return ret; + } + default: + return -ENODEV; + } + case IOMMU_DOMAIN_DMA: + switch (attr) { + case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: + *(int *)data = domain_use_flush_queue(); + return 0; + default: + return -ENODEV; + } + break; + default: + return -EINVAL; + } +} + /* * Check that the device does not live on an external facing PCI port that is * marked as untrusted. Such devices should not be able to apply quirks and -- Gitee From 5c25746ba4315dc42567fea41cdff3ad618adfe8 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 28 Jun 2022 17:29:42 +0800 Subject: [PATCH 1451/2161] vfio/type1: Add back iommu_domain_set_attr(, DOMAIN_ATTR_NESTING, ); commit 642615a73cfffc66b1175e27e9e6bcf4d1806eff Intel-BKC. 
Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 1e81f45337b4..660ed3f679e6 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -1830,7 +1830,10 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, } if (iommu->nesting) { - ret = iommu_enable_nesting(domain->domain); + int attr = 1; + + ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING, + &attr); if (ret) goto out_domain; } -- Gitee From bb74736b2805be19f9bb69019801ca8945d06d50 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 6 Jul 2022 17:44:43 +0800 Subject: [PATCH 1452/2161] iommu: remove iommu_enable_nesting() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. Remove iommu_enable_nesting(). Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 11 +++++------ include/linux/iommu.h | 5 ++++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 2302066d3f99..9757eb225dae 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2761,15 +2761,14 @@ static int __init iommu_init(void) } core_initcall(iommu_init); -int iommu_enable_nesting(struct iommu_domain *domain) +int iommu_domain_get_attr(struct iommu_domain *domain, + enum iommu_attr attr, void *data) { - if (domain->type != IOMMU_DOMAIN_UNMANAGED) + if (!domain->ops->domain_get_attr) return -EINVAL; - if (!domain->ops->enable_nesting) - return -EINVAL; - return domain->ops->enable_nesting(domain); + return domain->ops->domain_get_attr(domain, attr, data); } -EXPORT_SYMBOL_GPL(iommu_enable_nesting); +EXPORT_SYMBOL_GPL(iommu_domain_get_attr); void iommu_get_resv_regions(struct device *dev, struct list_head *list) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 9fc1fc037349..981c95c4fa52 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -544,7 +544,10 @@ extern int iommu_page_response(struct device *dev, extern int iommu_group_id(struct iommu_group *group); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); -int iommu_enable_nesting(struct iommu_domain *domain); +extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr, + void *data); +extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr, + void *data); /* Window handling function prototypes */ extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, -- Gitee From 0de61a7e3cd5561f365657e90c222f1c88e9976c Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 20 Jul 2022 11:40:30 +0800 Subject: [PATCH 1453/2161] Revert "iommu: remove DOMAIN_ATTR_NESTING" commit opencloudos. 
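Taken together with the previous patch, this revert restores the attribute-based query path; a kernel-side caller could then probe nesting capability roughly as sketched below. The helper is hypothetical and not part of the patch; its argsz handling mirrors intel_iommu_get_nesting_info() earlier in this series:

    #include <linux/iommu.h>

    /* hypothetical caller of the restored get_attr path */
    static int example_query_nesting(struct iommu_domain *domain)
    {
        struct iommu_nesting_info info = { .argsz = sizeof(info) };
        int ret;

        ret = iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, &info);
        if (ret)
            return ret;          /* e.g. -ENODEV if the domain is not in nesting mode */

        if (info.argsz > sizeof(info))
            return -E2BIG;       /* kernel knows a newer, larger layout */

        pr_info("nesting: format %u features %#x pasid_bits %u\n",
                info.format, info.features, info.pasid_bits);
        return 0;
    }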
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 31 ++++++++++++++++++++++--------- drivers/iommu/iommu.c | 17 +++++++++++++++++ include/linux/iommu.h | 21 +++++++++++++++++++-- 3 files changed, 58 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 16a393dd6be4..f5293a9c205b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5784,19 +5784,32 @@ static bool risky_device(struct pci_dev *pdev) } static int -intel_iommu_enable_nesting(struct iommu_domain *domain) +intel_iommu_domain_set_attr(struct iommu_domain *domain, + enum iommu_attr attr, void *data) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); unsigned long flags; - int ret = -ENODEV; + int ret = 0; - spin_lock_irqsave(&device_domain_lock, flags); - if (list_empty(&dmar_domain->devices)) { - dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; - dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; - ret = 0; + if (domain->type != IOMMU_DOMAIN_UNMANAGED) + return -EINVAL; + + switch (attr) { + case DOMAIN_ATTR_NESTING: + spin_lock_irqsave(&device_domain_lock, flags); + if (nested_mode_support() && + list_empty(&dmar_domain->devices)) { + dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; + dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; + } else { + ret = -ENODEV; + } + spin_unlock_irqrestore(&device_domain_lock, flags); + break; + default: + ret = -EINVAL; + break; } - spin_unlock_irqrestore(&device_domain_lock, flags); return ret; } @@ -5820,7 +5833,7 @@ const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .domain_alloc = intel_iommu_domain_alloc, .domain_free = intel_iommu_domain_free, - .enable_nesting = intel_iommu_enable_nesting, + .domain_set_attr = intel_iommu_domain_set_attr, .attach_dev = intel_iommu_attach_device, .detach_dev = intel_iommu_detach_device, .aux_attach_dev = intel_iommu_aux_attach_device, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 9757eb225dae..6424696a9ffb 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2770,6 +2770,23 @@ int iommu_domain_get_attr(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_domain_get_attr); +int iommu_domain_set_attr(struct iommu_domain *domain, + enum iommu_attr attr, void *data) +{ + int ret = 0; + + switch (attr) { + default: + if (domain->ops->domain_set_attr == NULL) + return -EINVAL; + + ret = domain->ops->domain_set_attr(domain, attr, data); + } + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_domain_set_attr); + void iommu_get_resv_regions(struct device *dev, struct list_head *list) { const struct iommu_ops *ops = dev->bus->iommu_ops; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 981c95c4fa52..e9850e3623f5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -132,6 +132,7 @@ enum iommu_attr { DOMAIN_ATTR_FSL_PAMU_STASH, DOMAIN_ATTR_FSL_PAMU_ENABLE, DOMAIN_ATTR_FSL_PAMUV1, + DOMAIN_ATTR_NESTING, /* two stages of translation */ DOMAIN_ATTR_MAX, }; @@ -239,7 +240,8 @@ struct iommu_iotlb_gather { * @probe_finalize: Do final setup work after the device is added to an IOMMU * group and attached to the groups domain * @device_group: find iommu group for a particular device - * @enable_nesting: Enable nesting + * @domain_get_attr: Query domain attributes + * @domain_set_attr: Change domain attributes * @get_resv_regions: Request list of reserved regions for a device * @put_resv_regions: Free list of reserved regions for a device * @apply_resv_region: Temporary helper 
call-back for iova reserved ranges @@ -295,7 +297,10 @@ struct iommu_ops { void (*release_device)(struct device *dev); void (*probe_finalize)(struct device *dev); struct iommu_group *(*device_group)(struct device *dev); - int (*enable_nesting)(struct iommu_domain *domain); + int (*domain_get_attr)(struct iommu_domain *domain, + enum iommu_attr attr, void *data); + int (*domain_set_attr)(struct iommu_domain *domain, + enum iommu_attr attr, void *data); /* Request/Free a list of reserved regions for a device */ void (*get_resv_regions)(struct device *dev, struct list_head *list); @@ -921,6 +926,18 @@ static inline int iommu_group_id(struct iommu_group *group) return -ENODEV; } +static inline int iommu_domain_get_attr(struct iommu_domain *domain, + enum iommu_attr attr, void *data) +{ + return -EINVAL; +} + +static inline int iommu_domain_set_attr(struct iommu_domain *domain, + enum iommu_attr attr, void *data) +{ + return -EINVAL; +} + static inline int iommu_device_register(struct iommu_device *iommu) { return -ENODEV; -- Gitee From bc32d05fc3d5b9ed5e51df6b5bb85a0c3ab92cf7 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 19 Jul 2022 20:22:33 +0800 Subject: [PATCH 1454/2161] Revert "iommu: remove DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE" commit opencloudos. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 21 +++++++++++++++++---- drivers/iommu/intel/iommu.c | 1 + drivers/iommu/iommu.c | 15 +++++++++------ include/linux/iommu.h | 1 + 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 34e4bfbbd9e1..4c39daa72895 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -307,7 +307,10 @@ static void iommu_dma_flush_iotlb_all(struct iova_domain *iovad) cookie = container_of(iovad, struct iommu_dma_cookie, iovad); domain = cookie->fq_domain; - + /* + * The IOMMU driver supporting DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE + * implies that ops->flush_iotlb_all must be non-NULL. 
+ */ domain->ops->flush_iotlb_all(domain); } @@ -358,6 +361,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, struct iommu_dma_cookie *cookie = domain->iova_cookie; unsigned long order, base_pfn; struct iova_domain *iovad; + int attr; if (!cookie || cookie->type != IOMMU_DMA_IOVA_COOKIE) return -EINVAL; @@ -393,9 +397,18 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, init_iova_domain(iovad, 1UL << order, base_pfn); - /* If the FQ fails we can simply fall back to strict mode */ - if (domain->type == IOMMU_DOMAIN_DMA_FQ && iommu_dma_init_fq(domain)) - domain->type = IOMMU_DOMAIN_DMA; + if (!cookie->fq_domain && (!dev || !dev_is_untrusted(dev)) && + !iommu_domain_get_attr(domain, DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, &attr) && + attr) { + if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all, + iommu_dma_entry_dtor)) + pr_warn("iova flush queue initialization failed\n"); + else + cookie->fq_domain = domain; + } + + if (!dev) + return 0; return iova_reserve_iommu_regions(dev, domain); } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index f5293a9c205b..d3c66a0bbc5f 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5834,6 +5834,7 @@ const struct iommu_ops intel_iommu_ops = { .domain_alloc = intel_iommu_domain_alloc, .domain_free = intel_iommu_domain_free, .domain_set_attr = intel_iommu_domain_set_attr, + .domain_get_attr = intel_iommu_domain_get_attr, .attach_dev = intel_iommu_attach_device, .detach_dev = intel_iommu_detach_device, .aux_attach_dev = intel_iommu_aux_attach_device, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 6424696a9ffb..0ac0362fa6e5 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -72,7 +72,6 @@ static const char * const iommu_group_resv_type_string[] = { }; #define IOMMU_CMD_LINE_DMA_API BIT(0) -#define IOMMU_CMD_LINE_STRICT BIT(1) static int iommu_alloc_default_domain(struct iommu_group *group, struct device *dev); @@ -332,11 +331,7 @@ early_param("iommu.passthrough", iommu_set_def_domain_type); static int __init iommu_dma_setup(char *str) { - int ret = kstrtobool(str, &iommu_dma_strict); - - if (!ret) - iommu_cmd_line |= IOMMU_CMD_LINE_STRICT; - return ret; + return kstrtobool(str, &iommu_dma_strict); } early_param("iommu.strict", iommu_dma_setup); @@ -1513,6 +1508,14 @@ static int iommu_group_alloc_default_domain(struct bus_type *bus, group->default_domain = dom; if (!group->domain) group->domain = dom; + + if (!iommu_dma_strict) { + int attr = 1; + iommu_domain_set_attr(dom, + DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, + &attr); + } + return 0; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e9850e3623f5..2eb7d238466a 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -132,6 +132,7 @@ enum iommu_attr { DOMAIN_ATTR_FSL_PAMU_STASH, DOMAIN_ATTR_FSL_PAMU_ENABLE, DOMAIN_ATTR_FSL_PAMUV1, + DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, DOMAIN_ATTR_NESTING, /* two stages of translation */ DOMAIN_ATTR_MAX, }; -- Gitee From cb8e040a51fe77f8282dea0140c937f427124e6a Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 19 Jul 2022 20:42:00 +0800 Subject: [PATCH 1455/2161] Revert "iommu: remove iommu_set_cmd_line_dma_api and iommu_cmd_line_dma_api" commit opencloudos. 
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 0ac0362fa6e5..fd62adf25d5c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -73,6 +73,16 @@ static const char * const iommu_group_resv_type_string[] = { #define IOMMU_CMD_LINE_DMA_API BIT(0) +static void iommu_set_cmd_line_dma_api(void) +{ + iommu_cmd_line |= IOMMU_CMD_LINE_DMA_API; +} + +static bool iommu_cmd_line_dma_api(void) +{ + return !!(iommu_cmd_line & IOMMU_CMD_LINE_DMA_API); +} + static int iommu_alloc_default_domain(struct iommu_group *group, struct device *dev); static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, @@ -124,31 +134,23 @@ static const char *iommu_domain_type_str(unsigned int t) static int __init iommu_subsys_init(void) { - if (!(iommu_cmd_line & IOMMU_CMD_LINE_DMA_API)) { + bool cmd_line = iommu_cmd_line_dma_api(); + + if (!cmd_line) { if (IS_ENABLED(CONFIG_IOMMU_DEFAULT_PASSTHROUGH)) iommu_set_default_passthrough(false); else iommu_set_default_translated(false); - if (iommu_default_passthrough() && mem_encrypt_active()) { + if (iommu_default_passthrough()) { pr_info("Memory encryption detected - Disabling default IOMMU Passthrough\n"); iommu_set_default_translated(false); } } - if (!iommu_default_passthrough() && !iommu_dma_strict) - iommu_def_domain_type = IOMMU_DOMAIN_DMA_FQ; - pr_info("Default domain type: %s %s\n", iommu_domain_type_str(iommu_def_domain_type), - (iommu_cmd_line & IOMMU_CMD_LINE_DMA_API) ? - "(set via kernel command line)" : ""); - - if (!iommu_default_passthrough()) - pr_info("DMA domain TLB invalidation policy: %s mode %s\n", - iommu_dma_strict ? "strict" : "lazy", - (iommu_cmd_line & IOMMU_CMD_LINE_STRICT) ? - "(set via kernel command line)" : ""); + cmd_line ? "(set via kernel command line)" : ""); return 0; } @@ -2847,14 +2849,16 @@ EXPORT_SYMBOL_GPL(iommu_alloc_resv_region); void iommu_set_default_passthrough(bool cmd_line) { if (cmd_line) - iommu_cmd_line |= IOMMU_CMD_LINE_DMA_API; + iommu_set_cmd_line_dma_api(); + iommu_def_domain_type = IOMMU_DOMAIN_IDENTITY; } void iommu_set_default_translated(bool cmd_line) { if (cmd_line) - iommu_cmd_line |= IOMMU_CMD_LINE_DMA_API; + iommu_set_cmd_line_dma_api(); + iommu_def_domain_type = IOMMU_DOMAIN_DMA; } -- Gitee From 9dd53cec0e8ba1c5e8d36345c296ffe09e587bf6 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 4 Oct 2019 04:51:29 -0700 Subject: [PATCH 1456/2161] iommu: Fix an issue in iommu_page_response() flags check commit 52cc7aafd5fe56b723f3b7814404814ab04d94f2 Intel-BKC. original code fails when LAST_PAGE is set in flags. 
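To make the failure concrete, here is a stand-alone illustration. IOMMU_PAGE_RESP_PASID_VALID is bit 0 of the response flags in the IOMMU uAPI; the LAST_PAGE bit value used below is assumed purely for the example:

    #include <stdio.h>

    #define IOMMU_PAGE_RESP_PASID_VALID  (1 << 0)   /* as in the uAPI */
    #define EXAMPLE_PAGE_RESP_LAST_PAGE  (1 << 1)   /* hypothetical bit value */

    int main(void)
    {
        unsigned int flags = IOMMU_PAGE_RESP_PASID_VALID |
                             EXAMPLE_PAGE_RESP_LAST_PAGE;

        /* old check: rejects any bit beyond PASID_VALID, so LAST_PAGE fails */
        printf("old check rejects: %d\n", !!(flags & ~IOMMU_PAGE_RESP_PASID_VALID));

        /* new check: only requires PASID_VALID to be present */
        printf("new check rejects: %d\n", !(flags & IOMMU_PAGE_RESP_PASID_VALID));
        return 0;
    }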
Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index fd62adf25d5c..ab5af2ad55a0 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1220,9 +1220,11 @@ int iommu_page_response(struct device *dev, return -EINVAL; if (msg->version != IOMMU_PAGE_RESP_VERSION_1 || - msg->flags & ~IOMMU_PAGE_RESP_PASID_VALID) + !(msg->flags & IOMMU_PAGE_RESP_PASID_VALID)) { + dev_dbg(dev, "%s:Invalid ver %x: flags %x\n", + __func__, msg->version, msg->flags); return -EINVAL; - + } /* Only send response if there is a fault report pending */ mutex_lock(¶m->fault_param->lock); if (list_empty(¶m->fault_param->faults)) { -- Gitee From 94fbd7f6230658786f43e94aa3455f93b2e84205 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Wed, 30 Sep 2020 06:46:41 -0700 Subject: [PATCH 1457/2161] iommu/vt-d: Fix bug in page response commit 2c220905a4e234c84368488aa4237871080de923 Intel-BKC. Should use NULL as pasid_set for pasid_to_svm_sdev(). host_pasid_set is used for native SVM. The ideal implementation is to use the pasid_set stored in domain, which needs to wait for DOMAIN_ATTR_IOASID_SET. Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 55971f610001..79657c38c37a 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -1294,35 +1294,13 @@ int intel_svm_page_response(struct device *dev, goto out; } - ret = pasid_to_svm_sdev(dev, host_pasid_set, + ret = pasid_to_svm_sdev(dev, NULL, prm->pasid, &svm, &sdev); if (ret || !sdev) { ret = -ENODEV; goto out; } - /* - * For responses from userspace, need to make sure that the - * pasid has been bound to its mm. - */ - if (svm->flags & SVM_FLAG_GUEST_MODE) { - struct mm_struct *mm; - - mm = get_task_mm(current); - if (!mm) { - ret = -EINVAL; - goto out; - } - - if (mm != svm->mm) { - ret = -ENODEV; - mmput(mm); - goto out; - } - - mmput(mm); - } - /* * Per VT-d spec. v3.0 ch7.7, system software must respond * with page group response if private data is present (PDP) -- Gitee From 3cdde76c5e21e14eea3f761c723a7982fb599734 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Wed, 30 Sep 2020 06:46:41 -0700 Subject: [PATCH 1458/2161] iommu/vt-d: Loop subdevices in iommu_domain_get_attr commit e853665eba6bebcba77bf7afea08a0a967680bdf Intel-BKC. This was found by the failure of iommu_domain_get_attr() when passthru mdev to guest. There is no device in domain->devices, thus unable to get the cap and ecap of iommu unit. But this domain actually has one sub-device which is attached via aux-manner. This patch fixes the issue. Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index d3c66a0bbc5f..a79ec06901b3 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5709,8 +5709,21 @@ static int intel_iommu_get_nesting_info(struct iommu_domain *domain, * related capabilities should be consistent across iommu * units. 
*/ - domain_info = list_first_entry(&dmar_domain->devices, - struct device_domain_info, link); + /* + * Check full-device list first, and then sub-device list + */ + if (!list_empty(&dmar_domain->devices)) + domain_info = list_first_entry(&dmar_domain->devices, + struct device_domain_info, link); + else if (!list_empty(&dmar_domain->subdevices)) { + struct subdev_domain_info *sinfo; + + sinfo = list_first_entry(&dmar_domain->subdevices, + struct subdev_domain_info, link_domain); + domain_info = get_domain_info(sinfo->pdev); + } else + return -ENODEV; + cap &= domain_info->iommu->cap; ecap &= domain_info->iommu->ecap; -- Gitee From 850628e491ae853d1dafc7a21550e9796cd084d4 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 5 Jun 2019 19:13:55 +0100 Subject: [PATCH 1459/2161] iommu: handle page response timeout commit 47e3e90db9c324cdc4801ed22489ead71e7a82a6 Intel-BKC. When IO page faults are reported outside IOMMU subsystem, the page request handler may fail for various reasons. E.g. a guest received page requests but did not have a chance to run for a long time. The irresponsive behavior could hold off limited resources on the pending device. There can be hardware or credit based software solutions as suggested in the PCI ATS Ch-4. To provide a basic safty net this patch introduces a per device deferrable timer which monitors the longest pending page fault that requires a response. Proper action such as sending failure response code could be taken when timer expires but not included in this patch. We need to consider the life cycle of page groupd ID to prevent confusion with reused group ID by a device. For now, a warning message provides clue of such failure. Signed-off-by: Jacob Pan Signed-off-by: Ashok Raj Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 55 +++++++++++++++++++++++++++++++++++++++++++ include/linux/iommu.h | 4 ++++ 2 files changed, 59 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index ab5af2ad55a0..595e53292980 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1055,6 +1055,39 @@ int iommu_group_unregister_notifier(struct iommu_group *group, } EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier); +static void iommu_dev_fault_timer_fn(struct timer_list *t) +{ + struct iommu_fault_param *fparam = from_timer(fparam, t, timer); + struct iommu_fault_event *evt; + struct iommu_fault_page_request *prm; + + u64 now; + + now = get_jiffies_64(); + + /* The goal is to ensure driver or guest page fault handler(via vfio) + * send page response on time. Otherwise, limited queue resources + * may be occupied by some irresponsive guests or drivers. + * When per device pending fault list is not empty, we periodically checks + * if any anticipated page response time has expired. + * + * TODO: + * We could do the following if response time expires: + * 1. send page response code FAILURE to all pending PRQ + * 2. inform device driver or vfio + * 3. drain in-flight page requests and responses for this device + * 4. clear pending fault list such that driver can unregister fault + * handler(otherwise blocked when pending faults are present). 
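Stripped of the surrounding fault-reporting code, the per-device watchdog pattern this patch introduces looks like the sketch below (names are illustrative and locking is omitted; the real changes follow in iommu_report_device_fault() and iommu_page_response()):

    #include <linux/jiffies.h>
    #include <linux/list.h>
    #include <linux/timer.h>

    struct example_fault_param {
        struct list_head  faults;    /* pending faults awaiting a response */
        struct timer_list timer;     /* deferrable watchdog */
        unsigned long     timeout;   /* e.g. 10 * HZ */
    };

    static void example_timer_fn(struct timer_list *t)
    {
        struct example_fault_param *p = from_timer(p, t, timer);

        /* walk p->faults and warn about entries whose expiry has passed ... */
        mod_timer(t, jiffies + p->timeout);   /* keep watching remaining faults */
    }

    static void example_queue_fault(struct example_fault_param *p)
    {
        if (list_empty(&p->faults))           /* first pending fault arms the timer */
            mod_timer(&p->timer, jiffies + p->timeout);
        /* ... add the fault to p->faults ... */
    }

    static void example_init(struct example_fault_param *p)
    {
        INIT_LIST_HEAD(&p->faults);
        p->timeout = 10 * HZ;
        timer_setup(&p->timer, example_timer_fn, TIMER_DEFERRABLE);
    }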
+ */ + list_for_each_entry(evt, &fparam->faults, list) { + prm = &evt->fault.prm; + if (time_after64(now, evt->expire)) + pr_err("Page response time expired!, pasid %d gid %d exp %llu now %llu\n", + prm->pasid, prm->grpid, evt->expire, now); + } + mod_timer(t, now + prq_timeout); +} + /** * iommu_register_device_fault_handler() - Register a device fault handler * @dev: the device @@ -1102,6 +1135,9 @@ int iommu_register_device_fault_handler(struct device *dev, mutex_init(¶m->fault_param->lock); INIT_LIST_HEAD(¶m->fault_param->faults); + if (prq_timeout) + timer_setup(¶m->fault_param->timer, iommu_dev_fault_timer_fn, + TIMER_DEFERRABLE); done_unlock: mutex_unlock(¶m->lock); @@ -1163,7 +1199,9 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) struct dev_iommu *param = dev->iommu; struct iommu_fault_event *evt_pending = NULL; struct iommu_fault_param *fparam; + struct timer_list *tmr; int ret = 0; + u64 exp; if (!param || !evt) return -EINVAL; @@ -1184,7 +1222,17 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) ret = -ENOMEM; goto done_unlock; } + /* Keep track of response expiration time */ + exp = get_jiffies_64() + prq_timeout; + evt_pending->expire = exp; mutex_lock(&fparam->lock); + if (list_empty(&fparam->faults)) { + /* First pending event, start timer */ + tmr = &dev->iommu->fault_param->timer; + WARN_ON(timer_pending(tmr)); + mod_timer(tmr, exp); + } + list_add_tail(&evt_pending->list, &fparam->faults); mutex_unlock(&fparam->lock); } @@ -1262,6 +1310,13 @@ int iommu_page_response(struct device *dev, break; } + /* stop response timer if no more pending request */ + if (list_empty(¶m->fault_param->faults) && + timer_pending(¶m->fault_param->timer)) { + pr_debug("no pending PRQ, stop timer\n"); + del_timer(¶m->fault_param->timer); + } + done_unlock: mutex_unlock(¶m->fault_param->lock); return ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2eb7d238466a..44c72963fdb5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -377,10 +377,12 @@ struct iommu_device { * * @fault: fault descriptor * @list: pending fault event list, used for tracking responses + * @expire: time limit in jiffies will wait for page response */ struct iommu_fault_event { struct iommu_fault fault; struct list_head list; + u64 expire; }; /** @@ -389,11 +391,13 @@ struct iommu_fault_event { * @data: handler private data * @faults: holds the pending faults which needs response * @lock: protect pending faults list + * @timer: track page request pending time limit */ struct iommu_fault_param { iommu_dev_fault_handler_t handler; void *data; struct list_head faults; + struct timer_list timer; struct mutex lock; }; -- Gitee From dcbacae77a32ceec59078c2abf438f7b6b6075a0 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 5 Jun 2019 19:13:55 +0100 Subject: [PATCH 1460/2161] trace/iommu: Add sva trace events commit 1561cd6f2d95d0992dd362a0c747f2b1bec406af Intel-BKC. For development only, trace I/O page faults and responses. 
Signed-off-by: Jacob Pan [JPB: removed the invalidate trace event, that will be added later] Signed-off-by: Jean-Philippe Brucker Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/trace/events/iommu.h | 84 ++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/include/trace/events/iommu.h b/include/trace/events/iommu.h index 72b4582322ff..767b92c03f19 100644 --- a/include/trace/events/iommu.h +++ b/include/trace/events/iommu.h @@ -12,6 +12,8 @@ #define _TRACE_IOMMU_H #include +#include +#include struct device; @@ -161,6 +163,88 @@ DEFINE_EVENT(iommu_error, io_page_fault, TP_ARGS(dev, iova, flags) ); + +TRACE_EVENT(dev_fault, + + TP_PROTO(struct device *dev, struct iommu_fault *evt), + + TP_ARGS(dev, evt), + + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __field(int, type) + __field(int, reason) + __field(u64, addr) + __field(u64, fetch_addr) + __field(u32, pasid) + __field(u32, grpid) + __field(u32, flags) + __field(u32, prot) + ), + + TP_fast_assign( + __assign_str(device, dev_name(dev)); + __entry->type = evt->type; + if (evt->type == IOMMU_FAULT_DMA_UNRECOV) { + __entry->reason = evt->event.reason; + __entry->flags = evt->event.flags; + __entry->pasid = evt->event.pasid; + __entry->grpid = 0; + __entry->prot = evt->event.perm; + __entry->addr = evt->event.addr; + __entry->fetch_addr = evt->event.fetch_addr; + } else { + __entry->reason = 0; + __entry->flags = evt->prm.flags; + __entry->pasid = evt->prm.pasid; + __entry->grpid = evt->prm.grpid; + __entry->prot = evt->prm.perm; + __entry->addr = evt->prm.addr; + __entry->fetch_addr = 0; + } + ), + + TP_printk("IOMMU:%s type=%d reason=%d addr=0x%016llx fetch=0x%016llx pasid=%d group=%d flags=%x prot=%d", + __get_str(device), + __entry->type, + __entry->reason, + __entry->addr, + __entry->fetch_addr, + __entry->pasid, + __entry->grpid, + __entry->flags, + __entry->prot + ) +); + +TRACE_EVENT(dev_page_response, + + TP_PROTO(struct device *dev, struct iommu_fault_page_response *msg), + + TP_ARGS(dev, msg), + + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __field(int, code) + __field(u32, pasid) + __field(u32, grpid) + ), + + TP_fast_assign( + __assign_str(device, dev_name(dev)); + __entry->code = msg->code; + __entry->pasid = msg->pasid; + __entry->grpid = msg->grpid; + ), + + TP_printk("IOMMU:%s code=%d pasid=%d group=%d", + __get_str(device), + __entry->code, + __entry->pasid, + __entry->grpid + ) +); + #endif /* _TRACE_IOMMU_H */ /* This part must be outside protection */ -- Gitee From e51bb4e95d054312aade31dc81ccf87caee04a1b Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 5 Jun 2019 19:13:56 +0100 Subject: [PATCH 1461/2161] iommu: Use device fault trace event commit 0e6876e3fbcf4782e860679c332b0afa476849d8 Intel-BKC. For performance and debugging purposes, these trace events help analyzing device faults that interact with IOMMU subsystem. E.g. 
IOMMU:0000:00:0a.0 type=2 reason=0 addr=0x00000000007ff000 pasid=1 group=1 last=0 prot=1 Signed-off-by: Jacob Pan [JPB: removed invalidate event, that will be added later] Signed-off-by: Jean-Philippe Brucker Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 2 ++ include/trace/events/iommu.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 595e53292980..fe8c37faf2bf 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1244,6 +1244,7 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) mutex_unlock(&fparam->lock); kfree(evt_pending); } + trace_dev_fault(dev, &evt->fault); done_unlock: mutex_unlock(¶m->lock); return ret; @@ -1305,6 +1306,7 @@ int iommu_page_response(struct device *dev, } ret = domain->ops->page_response(dev, evt, msg); + trace_dev_page_response(dev, msg); list_del(&evt->list); kfree(evt); break; diff --git a/include/trace/events/iommu.h b/include/trace/events/iommu.h index 767b92c03f19..7a7801bc0562 100644 --- a/include/trace/events/iommu.h +++ b/include/trace/events/iommu.h @@ -219,7 +219,7 @@ TRACE_EVENT(dev_fault, TRACE_EVENT(dev_page_response, - TP_PROTO(struct device *dev, struct iommu_fault_page_response *msg), + TP_PROTO(struct device *dev, struct iommu_page_response *msg), TP_ARGS(dev, msg), -- Gitee From b7de90954f861b10358b8638dfdd28f974dab9f4 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 1 Jun 2022 15:15:54 +0800 Subject: [PATCH 1462/2161] iommu: Add a timeout parameter for PRQ response commit b80b4efdba6dddfdcfbee79a9fee963f2fb43019 Intel-BKC. When an I/O page request is processed outside the IOMMU subsystem, response can be delayed or lost. Add a tunable setup parameter such that user can choose the timeout for IOMMU to track pending page requests. This timeout mechanism is a basic safety net which can be implemented in conjunction with credit based or device level page response exception handling. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../admin-guide/kernel-parameters.txt | 8 +++++ drivers/iommu/iommu.c | 34 ++++++++++++++++--- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 37eec51acc5f..cf2c0aa40f18 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1841,6 +1841,14 @@ 1 - Bypass the IOMMU for DMA. unset - Use value of CONFIG_IOMMU_DEFAULT_PASSTHROUGH. + iommu.prq_timeout= + Timeout in seconds to wait for page response + of a pending page request. + Format: + Default: 10 + 0 - no timeout tracking + 1 to 100 - allowed range + io7= [HW] IO7 for Marvel based alpha systems See comment before marvel_specify_io7 in arch/alpha/kernel/core_marvel.c. diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index fe8c37faf2bf..67d3d6fbd091 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -35,6 +35,19 @@ static unsigned int iommu_def_domain_type __read_mostly; static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_DMA_STRICT); static u32 iommu_cmd_line __read_mostly; +/* + * Timeout to wait for page response of a pending page request. This is + * intended as a basic safty net in case a pending page request is not + * responded for an exceptionally long time. Device may also implement + * its own protection mechanism against this exception. 
+ * Units are in jiffies with a range between 1 - 100 seconds equivalent. + * Default to 10 seconds. + * Setting 0 means no timeout tracking. + */ +#define IOMMU_PAGE_RESPONSE_MAX_TIMEOUT (HZ * 100) +#define IOMMU_PAGE_RESPONSE_DEF_TIMEOUT (HZ * 10) +static unsigned long prq_timeout = IOMMU_PAGE_RESPONSE_DEF_TIMEOUT; + struct iommu_group { struct kobject kobj; struct kobject *devices_kobj; @@ -337,12 +350,25 @@ static int __init iommu_dma_setup(char *str) } early_param("iommu.strict", iommu_dma_setup); -void iommu_set_dma_strict(void) +static int __init iommu_set_prq_timeout(char *str) { - iommu_dma_strict = true; - if (iommu_def_domain_type == IOMMU_DOMAIN_DMA_FQ) - iommu_def_domain_type = IOMMU_DOMAIN_DMA; + int ret; + unsigned long timeout; + + if (!str) + return -EINVAL; + + ret = kstrtoul(str, 10, &timeout); + if (ret) + return ret; + timeout = timeout * HZ; + if (timeout > IOMMU_PAGE_RESPONSE_MAX_TIMEOUT) + return -EINVAL; + prq_timeout = timeout; + + return 0; } +early_param("iommu.prq_timeout", iommu_set_prq_timeout); static ssize_t iommu_group_attr_show(struct kobject *kobj, struct attribute *__attr, char *buf) -- Gitee From fb1920095db24f32dd62f6a5fcc8354d117ca047 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 3 Oct 2019 14:05:06 -0700 Subject: [PATCH 1463/2161] iommu/uapi: use PCI standard page response code commit bf3fdb7eeb7d47e1a0da186dcc14ffb84a94c9fc Intel-BKC. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index e333b7224436..1566dfe83f6f 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -134,7 +134,7 @@ struct iommu_fault { enum iommu_page_response_code { IOMMU_PAGE_RESP_SUCCESS = 0, IOMMU_PAGE_RESP_INVALID, - IOMMU_PAGE_RESP_FAILURE, + IOMMU_PAGE_RESP_FAILURE = 0xf, }; /** -- Gitee From 5f7d160283bf782d2c9064e849323a67c10e2c4f Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Sat, 27 Jun 2020 20:45:18 -0700 Subject: [PATCH 1464/2161] iommu: Pass user pointer to iommu_page_response() commit 7c998219d6a4bcc85879ebdb3f75b42f649c8c86 Intel-BKC. This is aligned with latest IOMMU UAPI data passing design. Refer to iommu.rst under Doc. 
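With this change the caller no longer decodes struct iommu_page_response itself; it forwards the raw user pointer and the IOMMU core performs the argsz-based copy and validation. A minimal, illustrative caller (the ioctl wrapper below is hypothetical):

    /* Hypothetical ioctl path in a consumer of this API: only forward the
     * user pointer; the core copies and sanity-checks the structure.
     */
    static long example_page_response_ioctl(struct device *dev, unsigned long arg)
    {
            return iommu_page_response(dev, (void __user *)arg);
    }

A later patch in this series extends the same call with an iommu_domain argument for the mdev case.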
Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 64 +++++++++++++++++++++++++++++++++---------- include/linux/iommu.h | 6 ++-- 2 files changed, 52 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 67d3d6fbd091..1146fc2af52b 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1277,16 +1277,52 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) } EXPORT_SYMBOL_GPL(iommu_report_device_fault); +static int iommu_page_response_prepare_msg(void __user *udata, + struct iommu_page_response *msg) +{ + unsigned long minsz, maxsz; + + /* Current kernel data size is the max to be copied from user */ + maxsz = sizeof(struct iommu_page_response); + memset((void *)msg, 0, maxsz); + minsz = offsetofend(struct iommu_page_response, code); + + if (copy_from_user(msg, udata, minsz)) + return -EFAULT; + + if (msg->argsz < minsz) + return -EINVAL; + + if (msg->argsz > maxsz) + msg->argsz = maxsz; + + if (msg->version != IOMMU_PAGE_RESP_VERSION_1 || + !(msg->flags & IOMMU_PAGE_RESP_PASID_VALID)) { + pr_debug("%s:Invalid ver %x: flags %x\n", + __func__, msg->version, msg->flags); + return -EINVAL; + } + + /* Copy the remaining user data _after_ minsz if there is */ + if ((msg->argsz - minsz) && + copy_from_user((void *)msg + minsz, udata + minsz, + msg->argsz - minsz)) + return -EFAULT; + + return 0; +} + int iommu_page_response(struct device *dev, - struct iommu_page_response *msg) + void __user *uinfo) { bool needs_pasid; int ret = -EINVAL; + struct iommu_page_response msg; struct iommu_fault_event *evt; struct iommu_fault_page_request *prm; struct dev_iommu *param = dev->iommu; - bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID; struct iommu_domain *domain = iommu_get_domain_for_dev(dev); + bool has_pasid; if (!domain || !domain->ops->page_response) return -ENODEV; @@ -1294,12 +1330,12 @@ int iommu_page_response(struct device *dev, if (!param || !param->fault_param) return -EINVAL; - if (msg->version != IOMMU_PAGE_RESP_VERSION_1 || - !(msg->flags & IOMMU_PAGE_RESP_PASID_VALID)) { - dev_dbg(dev, "%s:Invalid ver %x: flags %x\n", - __func__, msg->version, msg->flags); - return -EINVAL; - } + ret = iommu_page_response_prepare_msg(uinfo, &msg); + if (ret) + return ret; + + has_pasid = msg.flags & IOMMU_PAGE_RESP_PASID_VALID; + /* Only send response if there is a fault report pending */ mutex_lock(¶m->fault_param->lock); if (list_empty(¶m->fault_param->faults)) { @@ -1312,7 +1348,7 @@ int iommu_page_response(struct device *dev, */ list_for_each_entry(evt, ¶m->fault_param->faults, list) { prm = &evt->fault.prm; - if (prm->grpid != msg->grpid) + if (prm->grpid != msg.grpid) continue; /* @@ -1322,17 +1358,17 @@ int iommu_page_response(struct device *dev, * response. */ needs_pasid = prm->flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; - if (needs_pasid && (!has_pasid || msg->pasid != prm->pasid)) + if (needs_pasid && (!has_pasid || msg.pasid != prm->pasid)) continue; if (!needs_pasid && has_pasid) { /* No big deal, just clear it. 
*/ - msg->flags &= ~IOMMU_PAGE_RESP_PASID_VALID; - msg->pasid = 0; + msg.flags &= ~IOMMU_PAGE_RESP_PASID_VALID; + msg.pasid = 0; } - ret = domain->ops->page_response(dev, evt, msg); - trace_dev_page_response(dev, msg); + ret = domain->ops->page_response(dev, evt, &msg); + trace_dev_page_response(dev, &msg); list_del(&evt->list); kfree(evt); break; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 44c72963fdb5..bfb75b7181fd 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -548,8 +548,7 @@ extern int iommu_unregister_device_fault_handler(struct device *dev); extern int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt); -extern int iommu_page_response(struct device *dev, - struct iommu_page_response *msg); +extern int iommu_page_response(struct device *dev, void __user *uinfo); extern int iommu_group_id(struct iommu_group *group); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); @@ -920,8 +919,7 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) return -ENODEV; } -static inline int iommu_page_response(struct device *dev, - struct iommu_page_response *msg) +static inline int iommu_page_response(struct device *dev, void __user *uinfo) { return -ENODEV; } -- Gitee From 2d6ec3d5aaac5021040d1d4a0c8d04f0f8d1626e Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 4 Oct 2019 04:51:29 -0700 Subject: [PATCH 1465/2161] iommu: add domain argument to page response commit 999841cd78d3fe17bf4423c030eb3829312dadf2 Intel-BKC. With mdev, page response needs pdev domain that is different than the device's own domain. Extend iommu_page_response() to support such case. Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/Makefile | 2 +- drivers/iommu/intel/svm.c | 3 ++- drivers/iommu/iommu.c | 8 ++++---- include/linux/intel-iommu.h | 3 ++- include/linux/iommu.h | 11 ++++++++--- 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 02a9d77435fa..12abbd02c84a 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -33,4 +33,4 @@ obj-$(CONFIG_S390_IOMMU) += s390-iommu.o obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o -obj-$(CONFIG_IOMMU_SVA) += iommu-sva-lib.o io-pgfault.o +obj-$(CONFIG_IOMMU_SVA) += iommu-sva-lib.o diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 79657c38c37a..5977a7327246 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -1251,7 +1251,8 @@ u32 intel_svm_get_pasid(struct iommu_sva *sva) return pasid; } -int intel_svm_page_response(struct device *dev, +int intel_svm_page_response(struct iommu_domain *domain, + struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg) { diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 1146fc2af52b..0c1a859a50e1 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1278,7 +1278,7 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) EXPORT_SYMBOL_GPL(iommu_report_device_fault); static int iommu_page_response_prepare_msg(void __user *udata, - struct iommu_page_response *msg) + struct iommu_page_response *msg) { unsigned long minsz, maxsz; @@ -1312,7 +1312,8 @@ static int iommu_page_response_prepare_msg(void __user *udata, return 0; } -int iommu_page_response(struct device *dev, +int 
iommu_page_response(struct iommu_domain *domain, + struct device *dev, void __user *uinfo) { bool needs_pasid; @@ -1321,7 +1322,6 @@ int iommu_page_response(struct device *dev, struct iommu_fault_event *evt; struct iommu_fault_page_request *prm; struct dev_iommu *param = dev->iommu; - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); bool has_pasid; if (!domain || !domain->ops->page_response) @@ -1367,7 +1367,7 @@ int iommu_page_response(struct device *dev, msg.pasid = 0; } - ret = domain->ops->page_response(dev, evt, &msg); + ret = domain->ops->page_response(domain, dev, evt, &msg); trace_dev_page_response(dev, &msg); list_del(&evt->list); kfree(evt); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 2cb2ffa2ac22..7c55209ad591 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -769,7 +769,8 @@ struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata); void intel_svm_unbind(struct iommu_sva *handle); u32 intel_svm_get_pasid(struct iommu_sva *handle); -int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, +int intel_svm_page_response(struct iommu_domain *domain, struct device *dev, + struct iommu_fault_event *evt, struct iommu_page_response *msg); void intel_svm_add_pasid_notifier(void); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index bfb75b7181fd..167aff169405 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -333,7 +333,8 @@ struct iommu_ops { void (*sva_unbind)(struct iommu_sva *handle); u32 (*sva_get_pasid)(struct iommu_sva *handle); - int (*page_response)(struct device *dev, + int (*page_response)(struct iommu_domain *domain, + struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg); int (*cache_invalidate)(struct iommu_domain *domain, struct device *dev, @@ -548,7 +549,9 @@ extern int iommu_unregister_device_fault_handler(struct device *dev); extern int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt); -extern int iommu_page_response(struct device *dev, void __user *uinfo); +extern int iommu_page_response(struct iommu_domain *domain, + struct device *dev, + void __user *uinfo); extern int iommu_group_id(struct iommu_group *group); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); @@ -919,7 +922,9 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) return -ENODEV; } -static inline int iommu_page_response(struct device *dev, void __user *uinfo) +static inline int iommu_page_response(struct iommu_domain *domain, + struct device *dev, + void __user *uinfo) { return -ENODEV; } -- Gitee From 2f0c9cc56d6fc6319bacb7bd281ddacdf37a2afd Mon Sep 17 00:00:00 2001 From: "Liu, Yi L" Date: Thu, 1 Oct 2020 07:55:50 -0700 Subject: [PATCH 1466/2161] iommu/vt-d: Check pasid ownership in intel_svm_page_response commit be78cbefee7bdc409720ec416f74ab5d53e46cd4 Intel-BKC. This patch needs to wait for DOMAIN_ATTR_IOASID_SET is ready. 
Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 5977a7327246..b264668cfc06 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -1257,6 +1257,7 @@ int intel_svm_page_response(struct iommu_domain *domain, struct iommu_page_response *msg) { struct iommu_fault_page_request *prm; + struct dmar_domain *dmar_domain; struct intel_svm_dev *sdev = NULL; struct intel_svm *svm = NULL; struct intel_iommu *iommu; @@ -1295,7 +1296,8 @@ int intel_svm_page_response(struct iommu_domain *domain, goto out; } - ret = pasid_to_svm_sdev(dev, NULL, + dmar_domain = to_dmar_domain(domain); + ret = pasid_to_svm_sdev(dev, NULL, // dmar_domain->pasid_set, prm->pasid, &svm, &sdev); if (ret || !sdev) { ret = -ENODEV; -- Gitee From dcab5ee472eee207ff10369d0457c41342a5877f Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 1 Jun 2022 15:36:11 +0800 Subject: [PATCH 1467/2161] iommu: add place holder for PASID suspend commit 2458897cfa46d4f23f32b3b03b905f2bf155ecbc Intel-BKC. APIs to suspend all operations of a given PASID-device combo while maintaining the SVA bound. PRQ will be drained. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/iommu.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 167aff169405..1cb181e6d315 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -268,6 +268,8 @@ struct iommu_iotlb_gather { * - 0: use the default setting * @pgsize_bitmap: bitmap of all possible supported page sizes * @owner: Driver module providing these ops + * @sva_suspend_pasid: stop activities related to a pasid but maintain the bond + * @sva_resume_pasid: start activities related to a pasid */ struct iommu_ops { bool (*capable)(enum iommu_cap); @@ -344,6 +346,10 @@ struct iommu_ops { int (*sva_unbind_gpasid)(struct device *dev, u32 pasid); + void (*sva_suspend_pasid)(struct device *dev, u32 pasid); + + void (*sva_resume_pasid)(struct device *dev, u32 pasid); + int (*def_domain_type)(struct device *dev); unsigned long pgsize_bitmap; @@ -1101,6 +1107,14 @@ static inline int iommu_sva_unbind_gpasid(struct iommu_domain *domain, return -ENODEV; } +static inline void sva_suspend_pasid(struct device *dev, u32 pasid) +{ +} + +static inline void (*sva_resume_pasid)(struct device *dev, u32 pasid) +{ +} + static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) { return NULL; -- Gitee From da2156bfb9e246a784025207cc6fe679d79b16c1 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 4 Oct 2019 05:26:09 -0700 Subject: [PATCH 1468/2161] iommu: add fault reporting flags commit ab7bef6caa7cee2db90932a620ff2d42dd68a971 Intel-BKC. 
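The hunk below turns the fault flags from plain values into individual bits, so a single fault can carry several access attributes at once. An illustrative check built on the new definitions (helper name is made up):

    /* Illustrative only: with bit flags the attributes combine freely. */
    static bool example_fault_is_priv_write(u32 flags)
    {
            return (flags & IOMMU_FAULT_WRITE) && (flags & IOMMU_FAULT_PRIV);
    }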
Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/iommu.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1cb181e6d315..657655082113 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -49,8 +49,10 @@ struct iommu_fault_event; struct iommu_dma_cookie; /* iommu fault flags */ -#define IOMMU_FAULT_READ 0x0 -#define IOMMU_FAULT_WRITE 0x1 +#define IOMMU_FAULT_READ (1 << 0) +#define IOMMU_FAULT_WRITE (1 << 1) +#define IOMMU_FAULT_EXEC (1 << 2) +#define IOMMU_FAULT_PRIV (1 << 3) typedef int (*iommu_fault_handler_t)(struct iommu_domain *, struct device *, unsigned long, int, void *); -- Gitee From 6eccdade909f64f27a5939414c2963122a3930e7 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 4 Oct 2019 10:44:28 -0700 Subject: [PATCH 1469/2161] iommu/vt-d: iommu/vt-d: report non-recoverable faults to device commit c59707c3b1ceb99560a7ff57aed93f1383abf8c6 Intel-BKC. Currently, dmar fault IRQ handler does nothing more than rate limited printk, no critical hardware handling need to be done in IRQ context. For some use case such as vIOMMU, it might be useful to report non-recoverable faults outside host IOMMU subsystem. DMAR fault can come from both DMA and interrupt remapping which has to be set up early before threaded IRQ is available. This patch adds an option and a workqueue such that when faults are requested, DMAR fault IRQ handler can use the IOMMU fault reporting API to report. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 151 +++++++++++++++++++++++++++++++++++- drivers/iommu/intel/iommu.c | 4 +- include/linux/dmar.h | 2 +- include/linux/intel-iommu.h | 1 + 4 files changed, 152 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 1f20539cb228..4322fc5d99fc 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1160,6 +1160,12 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) return err; } +static inline void dmar_free_fault_wq(struct intel_iommu *iommu) +{ + if (iommu->fault_wq) + destroy_workqueue(iommu->fault_wq); +} + static void free_iommu(struct intel_iommu *iommu) { if (intel_iommu_enabled && !iommu->drhd->ignored) { @@ -1176,6 +1182,7 @@ static void free_iommu(struct intel_iommu *iommu) free_irq(iommu->irq, iommu); dmar_free_hwirq(iommu->irq); iommu->irq = 0; + dmar_free_fault_wq(iommu); } if (iommu->qi) { @@ -1848,6 +1855,31 @@ static const char *irq_remap_fault_reasons[] = "Blocked an interrupt request due to source-id verification failure", }; +/* fault data and status */ +enum intel_iommu_fault_reason { + INTEL_IOMMU_FAULT_REASON_SW, + INTEL_IOMMU_FAULT_REASON_ROOT_NOT_PRESENT, + INTEL_IOMMU_FAULT_REASON_CONTEXT_NOT_PRESENT, + INTEL_IOMMU_FAULT_REASON_CONTEXT_INVALID, + INTEL_IOMMU_FAULT_REASON_BEYOND_ADDR_WIDTH, + INTEL_IOMMU_FAULT_REASON_PTE_WRITE_ACCESS, + INTEL_IOMMU_FAULT_REASON_PTE_READ_ACCESS, + INTEL_IOMMU_FAULT_REASON_NEXT_PT_INVALID, + INTEL_IOMMU_FAULT_REASON_ROOT_ADDR_INVALID, + INTEL_IOMMU_FAULT_REASON_CONTEXT_PTR_INVALID, + INTEL_IOMMU_FAULT_REASON_NONE_ZERO_RTP, + INTEL_IOMMU_FAULT_REASON_NONE_ZERO_CTP, + INTEL_IOMMU_FAULT_REASON_NONE_ZERO_PTE, + NR_INTEL_IOMMU_FAULT_REASON, +}; + +/* fault reasons that are allowed to be reported outside IOMMU subsystem */ +#define INTEL_IOMMU_FAULT_REASON_ALLOWED \ + ((1ULL << INTEL_IOMMU_FAULT_REASON_BEYOND_ADDR_WIDTH) | \ + (1ULL << 
INTEL_IOMMU_FAULT_REASON_PTE_WRITE_ACCESS) | \ + (1ULL << INTEL_IOMMU_FAULT_REASON_PTE_READ_ACCESS)) + + static const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type) { if (fault_reason >= 0x20 && (fault_reason - 0x20 < @@ -1932,12 +1964,85 @@ void dmar_msi_read(int irq, struct msi_msg *msg) raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } +static enum iommu_fault_reason to_iommu_fault_reason(u8 reason) +{ + if (reason >= NR_INTEL_IOMMU_FAULT_REASON) { + pr_warn("unknown DMAR fault reason %d\n", reason); + return IOMMU_FAULT_REASON_UNKNOWN; + } + switch (reason) { + case INTEL_IOMMU_FAULT_REASON_BEYOND_ADDR_WIDTH: + return IOMMU_FAULT_REASON_OOR_ADDRESS; + case INTEL_IOMMU_FAULT_REASON_NEXT_PT_INVALID: + case INTEL_IOMMU_FAULT_REASON_PTE_WRITE_ACCESS: + case INTEL_IOMMU_FAULT_REASON_PTE_READ_ACCESS: + return IOMMU_FAULT_REASON_PERMISSION; + /* REVISIT: Internal IOMMU fault reasons are reported as + * unknown to the device. Need to sort throuh all SM reasons. + */ + case INTEL_IOMMU_FAULT_REASON_SW: + case INTEL_IOMMU_FAULT_REASON_ROOT_NOT_PRESENT: + case INTEL_IOMMU_FAULT_REASON_CONTEXT_NOT_PRESENT: + case INTEL_IOMMU_FAULT_REASON_CONTEXT_INVALID: + case INTEL_IOMMU_FAULT_REASON_ROOT_ADDR_INVALID: + case INTEL_IOMMU_FAULT_REASON_CONTEXT_PTR_INVALID: + default: + return IOMMU_FAULT_REASON_UNKNOWN; + } +} + +struct dmar_fault_work { + struct work_struct fault_work; + struct intel_iommu *iommu; + u64 addr; + int type; + int fault_type; + enum intel_iommu_fault_reason reason; + u16 sid; +}; + +static void report_fault_to_device(struct work_struct *work) +{ + struct dmar_fault_work *dfw = container_of(work, struct dmar_fault_work, + fault_work); + struct iommu_fault_event event; + struct pci_dev *pdev; + u8 bus, devfn; + + memset(&event, 0, sizeof(struct iommu_fault_event)); + bus = PCI_BUS_NUM(dfw->sid); + devfn = PCI_DEVFN(PCI_SLOT(dfw->sid), PCI_FUNC(dfw->sid)); + /* + * we need to check if the fault reporting is requested for the + * offending device. + */ + pdev = pci_get_domain_bus_and_slot(dfw->iommu->segment, bus, devfn); + if (!pdev) { + pr_warn("No PCI device found for source ID %x\n", dfw->sid); + goto free_work; + } + /* + * unrecoverable fault is reported per IOMMU, notifier handler can + * resolve PCI device based on source ID. + */ + event.fault.event.reason = to_iommu_fault_reason(dfw->reason); + event.fault.event.addr = dfw->addr; + event.fault.type = IOMMU_FAULT_DMA_UNRECOV; + event.fault.event.perm = dfw->type ? 
IOMMU_READ : IOMMU_WRITE; + iommu_report_device_fault(&pdev->dev, &event); + pci_dev_put(pdev); + +free_work: + kfree(dfw); +} + static int dmar_fault_do_one(struct intel_iommu *iommu, int type, u8 fault_reason, u32 pasid, u16 source_id, unsigned long long addr) { const char *reason; int fault_type; + struct dmar_fault_work *dfw; reason = dmar_get_fault_reason(fault_reason, &fault_type); @@ -1965,6 +2070,26 @@ static int dmar_fault_do_one(struct intel_iommu *iommu, int type, dmar_fault_dump_ptes(iommu, source_id, addr, pasid); + /* check if fault reason is permitted to report outside IOMMU */ + if (!((1 << fault_reason) & INTEL_IOMMU_FAULT_REASON_ALLOWED)) + return 0; + + dfw = kmalloc(sizeof(*dfw), GFP_ATOMIC); + if (!dfw) + return -ENOMEM; + + INIT_WORK(&dfw->fault_work, report_fault_to_device); + dfw->addr = addr; + dfw->type = type; + dfw->fault_type = fault_type; + dfw->reason = fault_reason; + dfw->sid = source_id; + dfw->iommu = iommu; + if (!queue_work(iommu->fault_wq, &dfw->fault_work)) { + kfree(dfw); + return -EBUSY; + } + return 0; } @@ -2048,10 +2173,28 @@ irqreturn_t dmar_fault(int irq, void *dev_id) return IRQ_HANDLED; } -int dmar_set_interrupt(struct intel_iommu *iommu) +static int dmar_set_fault_wq(struct intel_iommu *iommu) +{ + if (iommu->fault_wq) + return 0; + + iommu->fault_wq = alloc_ordered_workqueue(iommu->name, 0); + if (!iommu->fault_wq) + return -ENOMEM; + + return 0; +} + +int dmar_set_interrupt(struct intel_iommu *iommu, bool queue_fault) { int irq, ret; + /* fault can be reported back to device drivers via a wq */ + if (queue_fault) { + ret = dmar_set_fault_wq(iommu); + if (ret) + pr_err("Failed to create fault handling workqueue\n"); + } /* * Check if the fault interrupt is already initialized. */ @@ -2067,8 +2210,10 @@ int dmar_set_interrupt(struct intel_iommu *iommu) } ret = request_irq(irq, dmar_fault, IRQF_NO_THREAD, iommu->name, iommu); - if (ret) + if (ret) { pr_err("Can't request irq\n"); + dmar_free_fault_wq(iommu); + } return ret; } @@ -2082,7 +2227,7 @@ int __init enable_drhd_fault_handling(void) */ for_each_iommu(iommu, drhd) { u32 fault_status; - int ret = dmar_set_interrupt(iommu); + int ret = dmar_set_interrupt(iommu, false); if (ret) { pr_err("DRHD %Lx: failed to enable fault, interrupt, ret %d\n", diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index a79ec06901b3..5d22b439db46 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3499,7 +3499,7 @@ static int __init init_dmars(void) goto free_iommu; } #endif - ret = dmar_set_interrupt(iommu); + ret = dmar_set_interrupt(iommu, true); if (ret) goto free_iommu; } @@ -4018,7 +4018,7 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) goto disable_iommu; } #endif - ret = dmar_set_interrupt(iommu); + ret = dmar_set_interrupt(iommu, true); if (ret) goto disable_iommu; diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 45e903d84733..18b57996da62 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -288,7 +288,7 @@ extern void dmar_msi_unmask(struct irq_data *data); extern void dmar_msi_mask(struct irq_data *data); extern void dmar_msi_read(int irq, struct msi_msg *msg); extern void dmar_msi_write(int irq, struct msi_msg *msg); -extern int dmar_set_interrupt(struct intel_iommu *iommu); +extern int dmar_set_interrupt(struct intel_iommu *iommu, bool queue_fault); extern irqreturn_t dmar_fault(int irq, void *dev_id); extern int dmar_alloc_hwirq(int id, int node, void *arg); extern void dmar_free_hwirq(int irq); diff --git 
a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 7c55209ad591..2aa779a7c410 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -615,6 +615,7 @@ struct intel_iommu { struct iommu_device iommu; /* IOMMU core code handle */ int node; u32 flags; /* Software defined flags */ + struct workqueue_struct *fault_wq; /* Reporting IOMMU fault to device */ struct dmar_drhd_unit *drhd; void *perf_statistic; -- Gitee From 2ce59f759c6644b80f4a09b88a776614bcb51e1f Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 3 Oct 2019 15:37:09 -0700 Subject: [PATCH 1470/2161] iommu: support multiple fault handler data per device commit d6443ccc901124b8a7dbf454a17dea89be23a4c1 Intel-BKC. With the introduction of mdev, fault reporting must be supported at per PASID-dev granularity. Introduce APIs to support multiple data per handler. Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 167 +++++++++++++++++++++++++++++++++++++++++- include/linux/iommu.h | 27 ++++++- 2 files changed, 189 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 0c1a859a50e1..3c06e6e8d3f8 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1137,6 +1137,7 @@ int iommu_register_device_fault_handler(struct device *dev, void *data) { struct dev_iommu *param = dev->iommu; + struct iommu_fault_handler_data *hdata; int ret = 0; if (!param) @@ -1156,8 +1157,23 @@ int iommu_register_device_fault_handler(struct device *dev, ret = -ENOMEM; goto done_unlock; } + param->fault_param->handler = handler; - param->fault_param->data = data; + + hdata = kzalloc(sizeof(struct iommu_fault_handler_data), GFP_KERNEL); + if (!hdata) { + kfree(param->fault_param); + put_device(dev); + ret = -ENOMEM; + goto done_unlock; + } + + INIT_LIST_HEAD(¶m->fault_param->data); + /* Default handler data uses reserved vector 0 */ + hdata->data = data; + dev_dbg(dev, "Add IOMMU default handler data %llx\n", (u64)data); + list_add(&hdata->list, ¶m->fault_param->data); + mutex_init(¶m->fault_param->lock); INIT_LIST_HEAD(¶m->fault_param->faults); @@ -1171,6 +1187,111 @@ int iommu_register_device_fault_handler(struct device *dev, } EXPORT_SYMBOL_GPL(iommu_register_device_fault_handler); + +/** + * iommu_add_device_fault_data() - add handler specific data + * + * For devices with partitioned resources, we may need to have multiple + * handler data that can be identified by IOMMU driver. This function + * allows device drivers to add handler specific data associated with + * a vector. When IOMMU detects device fault and its vector, handlers + * can be invoked with the matching data. + * For page request service related to DMA request with PASID, the vector + * is the PASID and the data is PASID associated data such as a mediated + * device. Vector 0 is researved for default handler data when no per vector + * data is added to device handler data list. + * + * @dev: the device + * @vector: identifies fault reporting data + * @data: opaque device handler data associated with the fault + */ +int iommu_add_device_fault_data(struct device *dev, + int vector, void *data) +{ + struct dev_iommu *param = dev->iommu; + struct iommu_fault_handler_data *hdata; + int ret = 0; + + dev_dbg(dev, "%s: vector: %d data: %llx\n", __func__, vector, (u64)data); + /* + * Fault handler must have been registered before adding handler data. + * Vector 0 is reserved for default data associated with handler. 
+ */ + if (!param || !param->fault_param || !vector) + return -EINVAL; + + mutex_lock(¶m->lock); + + /* vector must be unique, check if we have the same vector already */ + list_for_each_entry(hdata, ¶m->fault_param->data, list) { + if (hdata->vector == vector) { + dev_err(dev, "IOMMU fault handler data exists for vector %d\n", vector); + ret = -EINVAL; + goto unlock; + } + } + + hdata = kzalloc(sizeof(struct iommu_fault_handler_data), GFP_KERNEL); + if (!hdata) { + ret = -ENOMEM; + goto unlock; + } + hdata->vector = vector; + hdata->data = data; + dev_dbg(dev, "Added IOMMU fault handler data %llx for vector %d\n", + (u64)data, vector); + list_add_tail(&hdata->list, ¶m->fault_param->data); + +unlock: + mutex_unlock(¶m->lock); + return ret; +} +EXPORT_SYMBOL_GPL(iommu_add_device_fault_data); + +/** + * iommu_delete_device_fault_data() - delete handler specific data + * + * For devices with partitioned resources, we may need to have multiple + * handler data that can be identified by IOMMU driver. This function + * allows device drivers to add handler specific data associated with + * a vector. When IOMMU detects device fault and its vector, handlers + * can be invoked with the matching data. + * For page request service related to DMA request with PASID, the vector + * is the PASID and the data is PASID associated data such as a mediated + * device. + * @dev: the device + * @vector: identifies fault reporting data to be removed + */ +void iommu_delete_device_fault_data(struct device *dev, int vector) +{ + struct dev_iommu *param = dev->iommu; + struct iommu_fault_handler_data *hdata, *tmp; + + dev_dbg(dev, "%s: vector:%d\n", __func__, vector); + /* + * Fault handler must have been registered before adding handler data. + * Vector 0 is reserved for default data associated with handler. 
+ */ + if (!param || !param->fault_param || !vector) + return; + + mutex_lock(¶m->lock); + + list_for_each_entry_safe(hdata, tmp, ¶m->fault_param->data, list) { + if (hdata->vector == vector) { + list_del(&hdata->list); + kfree(hdata); + dev_dbg(dev, "Deleted IOMMU fault handler data for vector %d\n", vector); + goto unlock; + } + } + dev_err(dev, "Failed to find handler data for vector %d\n", vector); + +unlock: + mutex_unlock(¶m->lock); +} +EXPORT_SYMBOL_GPL(iommu_delete_device_fault_data); + /** * iommu_unregister_device_fault_handler() - Unregister the device fault handler * @dev: the device @@ -1184,6 +1305,7 @@ int iommu_unregister_device_fault_handler(struct device *dev) { struct dev_iommu *param = dev->iommu; int ret = 0; + struct iommu_fault_handler_data *hdata, *tmp; if (!param) return -EINVAL; @@ -1197,6 +1319,14 @@ int iommu_unregister_device_fault_handler(struct device *dev) if (!list_empty(¶m->fault_param->faults)) { ret = -EBUSY; goto unlock; + + } + /* TODO: Free handler data if any */ + list_for_each_entry_safe(hdata, tmp, ¶m->fault_param->data, list) { + dev_dbg(dev, "%s: free handler data %llx vector %d\n", __func__, + (u64)hdata->data, hdata->vector); + list_del(&hdata->list); + kfree(hdata); } kfree(param->fault_param); @@ -1224,8 +1354,10 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) { struct dev_iommu *param = dev->iommu; struct iommu_fault_event *evt_pending = NULL; + struct iommu_fault_handler_data *hdata; struct iommu_fault_param *fparam; struct timer_list *tmr; + void *handler_data = NULL; int ret = 0; u64 exp; @@ -1263,7 +1395,38 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) mutex_unlock(&fparam->lock); } - ret = fparam->handler(&evt->fault, fparam->data); + if (!evt->vector) { + hdata = list_first_entry(&fparam->data, + struct iommu_fault_handler_data, list); + handler_data = hdata->data; + dev_dbg(dev, "%s:default handler data %llx\n", + __func__, (u64)handler_data); + } else { + /* Find data for matching vector */ + list_for_each_entry(hdata, ¶m->fault_param->data, list) { + dev_dbg(dev, "Searching handler data vector %d to match %llu\n", + hdata->vector, evt->vector); + + if (hdata->vector == evt->vector) { + handler_data = hdata->data; + dev_dbg(dev, "IOMMU report data %llx on fault vector %llu\n", + (u64)handler_data, evt->vector); + break; + } + } + } + if (!handler_data) { + dev_err(dev, "No valid handler data for vector %llu\n", evt->vector); + if (evt_pending) + list_del(&evt_pending->list); + ret = -ENODEV; + goto done_unlock; + } + dev_dbg(dev, "%s: calling handler with data %llx\n", + __func__, (u64)handler_data); + + ret = fparam->handler(&evt->fault, handler_data); + trace_dev_fault(dev, &evt->fault); if (ret && evt_pending) { mutex_lock(&fparam->lock); list_del(&evt_pending->list); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 657655082113..cdfe7b52d4ea 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -392,19 +392,26 @@ struct iommu_fault_event { struct iommu_fault fault; struct list_head list; u64 expire; + u64 vector; +}; + +struct iommu_fault_handler_data { + u32 vector; + void *data; + struct list_head list; }; /** * struct iommu_fault_param - per-device IOMMU fault data * @handler: Callback function to handle IOMMU faults at device level - * @data: handler private data - * @faults: holds the pending faults which needs response + * @data: handler private data list + * @faults: holds the pending faults which needs response, e.g. 
page response. * @lock: protect pending faults list * @timer: track page request pending time limit */ struct iommu_fault_param { iommu_dev_fault_handler_t handler; - void *data; + struct list_head data; struct list_head faults; struct timer_list timer; struct mutex lock; @@ -557,6 +564,9 @@ extern int iommu_unregister_device_fault_handler(struct device *dev); extern int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt); +extern int iommu_add_device_fault_data(struct device *dev, + int vector, void *data); +extern void iommu_delete_device_fault_data(struct device *dev, int vector); extern int iommu_page_response(struct iommu_domain *domain, struct device *dev, void __user *uinfo); @@ -930,6 +940,17 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) return -ENODEV; } +static inline +int iommu_add_device_fault_data(struct device *dev, int vector, void *data) +{ + return -ENODEV; +} + +static inline +void iommu_delete_device_fault_data(struct device *dev, int vector) +{ +} + static inline int iommu_page_response(struct iommu_domain *domain, struct device *dev, void __user *uinfo) -- Gitee From 9cba23677edb29a4d03fa5b8a4ef5f0320f25dd4 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 4 Oct 2019 05:24:18 -0700 Subject: [PATCH 1471/2161] iommu: make sure unregistration of fault handler never fails commit 34dfc72d9c9535c09dc064a43a3f0fb392042b00 Intel-BKC. Add a temorary fix to flush pending faults, proper code needs to be validated in various race conditions. REVISIT! Signed-off-by: Liu Yi L Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 64 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 3c06e6e8d3f8..d2d60d417f5f 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1305,6 +1305,7 @@ int iommu_unregister_device_fault_handler(struct device *dev) { struct dev_iommu *param = dev->iommu; int ret = 0; + struct iommu_fault_event *evt, *next; struct iommu_fault_handler_data *hdata, *tmp; if (!param) @@ -1317,9 +1318,24 @@ int iommu_unregister_device_fault_handler(struct device *dev) /* we cannot unregister handler if there are pending faults */ if (!list_empty(¶m->fault_param->faults)) { + /* + * REVISIT: We should not run into pending faults if we do unbind first. + * the proper termination flow will ensure no pending faults as follows: + * 1. pasid disable and tlb flush + * 2. unbind, free, flush and drain + * 3. unregister fault handler. 
+ */ + mutex_lock(¶m->fault_param->lock); + list_for_each_entry_safe(evt, next, ¶m->fault_param->faults, list) { + dev_dbg(dev, "%s, free fault event: 0x%lx\n", __func__, (unsigned long) evt); + list_del(&evt->list); + kfree(evt); + } + mutex_unlock(¶m->fault_param->lock); +/* ret = -EBUSY; goto unlock; - +*/ } /* TODO: Free handler data if any */ list_for_each_entry_safe(hdata, tmp, ¶m->fault_param->data, list) { @@ -2481,6 +2497,52 @@ EXPORT_SYMBOL_GPL(iommu_uapi_sva_bind_gpasid); int iommu_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { + pr_warn("%s: FIXME need to clear all pending faults!\n", __func__); +#if 0 + struct dev_iommu *param = dev->iommu; + /* FIXME: clear all pending page requests */ + struct iommu_page_response msg; + int ret = -EINVAL; + struct iommu_fault_event *evt, *tmp; + + if (!domain || !domain->ops->page_response) + return -ENODEV; + + /* + * Device dev_iommu should have been allocated when device is + * added to its iommu_group. + */ + if (!param || !param->fault_param) + return -EINVAL; + + /* Only send response if there is a fault report pending */ + mutex_lock(¶m->fault_param->lock); + if (list_empty(¶m->fault_param->faults)) { + pr_warn("no pending PRQ, drop response\n"); + goto done_unlock; + } + + /* Clear all pending page requests and return response code INVALID */ + list_for_each_entry_safe(evt, tmp, ¶m->fault_param->faults, list) { + if (evt->fault.prm.pasid == pasid) { + memcpy(&msg.private_data, &evt->private_data, sizeof(evt->private_data)); + msg.pasid = pasid; + msg.flags |= IOMMU_PAGE_RESP_PASID_VALID; + msg.priv_data_present = 1; + msg.grpid = evt->fault.prm.grpid; + msg.code = IOMMU_PAGE_RESP_INVALID; + trace_dev_page_response(dev, &msg); + dev_dbg(dev, "Clear pending PRQ for PASID %d grp %d resp code IR\n", + pasid, msg.grpid); + ret = domain->ops->page_response(dev, &msg, evt); + list_del(&evt->list); + kfree(evt); + } + } + +done_unlock: + mutex_unlock(¶m->fault_param->lock); +#endif if (unlikely(!domain->ops->sva_unbind_gpasid)) return -ENODEV; -- Gitee From 079388508e486b679c3b75ddb28c82f4e61106dc Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Sat, 27 Jun 2020 23:17:26 -0700 Subject: [PATCH 1472/2161] iommu/vt-d: Bind PASID with per-mdev fault_handler_data commit 02640a5be755eb1c2b37e9c524665c626a119320 Intel-BKC. For mdevs (a.k.a sub-devices), iommu driver should bind PASID with the fault_hanlder_data as PRQ reporting is done per PASID-dev granularity. Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 4 ++-- drivers/iommu/intel/svm.c | 17 +++++++++++++++-- drivers/iommu/iommu.c | 11 +++++++++-- include/linux/intel-iommu.h | 8 ++++++-- include/linux/iommu.h | 11 ++++++++--- 5 files changed, 40 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 5d22b439db46..3b4cd30bdc2a 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4692,8 +4692,8 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) * Check whether a @domain could be attached to the @dev through the * aux-domain attach/detach APIs. 
*/ -static inline bool -is_aux_domain(struct device *dev, struct iommu_domain *domain) +inline bool is_aux_domain(struct device *dev, + struct iommu_domain *domain) { struct device_domain_info *info = get_domain_info(dev); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index b264668cfc06..8e3bf39a05df 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -422,8 +422,10 @@ static int pasid_to_svm_sdev(struct device *dev, return 0; } -int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, - struct iommu_gpasid_bind_data *data) +int intel_svm_bind_gpasid(struct iommu_domain *domain, + struct device *dev, + struct iommu_gpasid_bind_data *data, + void *fault_data) { struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); struct intel_svm_dev *sdev = NULL; @@ -497,6 +499,12 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, svm->gpasid = data->gpasid; svm->flags |= SVM_FLAG_GUEST_PASID; ioasid_attach_spid(data->hpasid, data->gpasid); + /* + * Partial assignment needs to add fault data per-pasid + */ + if (is_aux_domain(dev, domain) && fault_data) + iommu_add_device_fault_data(dev, data->hpasid, + fault_data); } ioasid_attach_data(data->hpasid, svm); ioasid_get(NULL, svm->pasid); @@ -588,6 +596,11 @@ int intel_svm_unbind_gpasid(struct device *dev, u32 pasid) intel_pasid_tear_down_entry(iommu, dev, svm->pasid, false); intel_svm_drain_prq(dev, svm->pasid); + /* + * Partial assignment needs to delete fault data + */ + if (is_aux_domain(dev, domain)) + iommu_delete_device_fault_data(dev, pasid); kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d2d60d417f5f..80118ab270c2 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2471,8 +2471,15 @@ static int iommu_sva_prepare_bind_data(void __user *udata, return iommu_check_bind_data(data); } + +/* + * Caller could provide fault_data to differentiate future page + * requests from the device. This is helpful for page request + * handling for partial assignments of physical devices. e.g. + * mediated device assingment or other sub-device solution. 
+ */ int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, struct device *dev, - void __user *udata) + void __user *udata, void *fault_data) { struct iommu_gpasid_bind_data data = { 0 }; int ret; @@ -2487,7 +2494,7 @@ int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, struct device *dev, ret = ioasid_get_if_owned(data.hpasid); if (ret) return ret; - ret = domain->ops->sva_bind_gpasid(domain, dev, &data); + ret = domain->ops->sva_bind_gpasid(domain, dev, &data, fault_data); ioasid_put(NULL, data.hpasid); return ret; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 2aa779a7c410..101e2028837d 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -763,9 +763,13 @@ struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); extern void intel_svm_check(struct intel_iommu *iommu); extern int intel_svm_enable_prq(struct intel_iommu *iommu); extern int intel_svm_finish_prq(struct intel_iommu *iommu); +inline bool is_aux_domain(struct device *dev, + struct iommu_domain *domain); int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, - struct iommu_gpasid_bind_data *data); -int intel_svm_unbind_gpasid(struct device *dev, u32 pasid); + struct iommu_gpasid_bind_data *data, + void *fault_data); +int intel_svm_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, u32 pasid); struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata); void intel_svm_unbind(struct iommu_sva *handle); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index cdfe7b52d4ea..aef6b58a62c5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -344,7 +344,9 @@ struct iommu_ops { int (*cache_invalidate)(struct iommu_domain *domain, struct device *dev, struct iommu_cache_invalidate_info *inv_info); int (*sva_bind_gpasid)(struct iommu_domain *domain, - struct device *dev, struct iommu_gpasid_bind_data *data); + struct device *dev, + struct iommu_gpasid_bind_data *data, + void *fault_data); int (*sva_unbind_gpasid)(struct device *dev, u32 pasid); @@ -496,7 +498,9 @@ extern int iommu_uapi_cache_invalidate(struct iommu_domain *domain, void __user *uinfo); extern int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, - struct device *dev, void __user *udata); + struct device *dev, + void __user *udata, + void *fault_data); extern int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, void __user *udata); extern int iommu_sva_unbind_gpasid(struct iommu_domain *domain, @@ -1112,7 +1116,8 @@ iommu_uapi_cache_invalidate(struct iommu_domain *domain, } static inline int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, - struct device *dev, void __user *udata) + struct device *dev, void __user *udata, + void *fault_data) { return -ENODEV; } -- Gitee From d19dc8d4be8ad6b686443b38bc90e781e83a9153 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Wed, 30 Sep 2020 06:46:41 -0700 Subject: [PATCH 1473/2161] iommu/vt-d: Report PRQ at per PASID-dev granularity commit a64c0b29c8331ce0c407e4eb57d63e8b66b93416 Intel-BKC. For mdev, iommu driver should pass the PASID info to iommu PRQ reportiing framework, thus can get the corresponding fault_hanlder_data and trigger fault hanlder. Say a page request received in iommu driver. It will get the BDF and PASID info from the request. Then PRQ reporting framework gets the corresponding fault_hanlder_data with BDF+PASID and then trigger the fault handler. 
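A condensed sketch of the dispatch described above, using the structures added earlier in this series: the vector recorded in the fault event (the PASID for page requests) selects which registered handler data is handed to the device's fault handler. Function name is illustrative.

    static void *example_select_handler_data(struct iommu_fault_param *fparam,
                                             struct iommu_fault_event *evt)
    {
            struct iommu_fault_handler_data *hdata;

            if (!evt->vector)       /* vector 0: default handler data */
                    return list_first_entry(&fparam->data,
                                            struct iommu_fault_handler_data,
                                            list)->data;

            list_for_each_entry(hdata, &fparam->data, list)
                    if (hdata->vector == evt->vector)
                            return hdata->data;

            return NULL;            /* no match: the real path fails the report */
    }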
Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 8e3bf39a05df..28f3f5f6dd13 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -1023,6 +1023,7 @@ static int prq_to_iommu_prot(struct page_req_dsc *req) static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, struct page_req_dsc *desc) { + struct device_domain_info *info; struct iommu_fault_event event; if (!dev || !dev_is_pci(dev)) @@ -1061,6 +1062,16 @@ static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, event.fault.prm.private_data[0] = ktime_to_ns(ktime_get()); } + /* + * If the device supports PASID granu scalable mode, reports the + * PASID as vector such that handlers can be dispatched with per + * vector data. + */ + info = get_domain_info(dev); + if (!list_empty(&info->subdevices)) { + dev_dbg(dev, "Aux domain present, assign vector %d\n", desc->pasid); + event.vector = desc->pasid; + } return iommu_report_device_fault(dev, &event); } -- Gitee From 87ca40d014e112e8b7f192f8b613a1aa5452ce56 Mon Sep 17 00:00:00 2001 From: "Liu, Yi L" Date: Sun, 4 Oct 2020 00:07:46 -0700 Subject: [PATCH 1474/2161] iommu: Add debug log in iommu_unregister_device_fault_handler() commit afce51aae289582f54def5f7a5c61d6910755605 Intel-BKC. Signed-off-by: Liu, Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 80118ab270c2..6b320b150f6e 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1325,6 +1325,7 @@ int iommu_unregister_device_fault_handler(struct device *dev) * 2. unbind, free, flush and drain * 3. unregister fault handler. */ + printk("%s, there is pending faults on dev: %s, here we force to free the fault events and unregister the fault handler, but this changes should be reverted when page response path is ready\n", __func__, dev_name(dev)); mutex_lock(¶m->fault_param->lock); list_for_each_entry_safe(evt, next, ¶m->fault_param->faults, list) { dev_dbg(dev, "%s, free fault event: 0x%lx\n", __func__, (unsigned long) evt); -- Gitee From bd024cdff09799b66ac3e7f31f8750edb29de09b Mon Sep 17 00:00:00 2001 From: "Sun, Yi Y" Date: Wed, 25 Nov 2020 16:37:42 +0800 Subject: [PATCH 1475/2161] iommu/vt-d: Support bind/unbind guest pt to def PASID (!VFIO) commit ee046e4fae242321050e8afadcdef7b37085680e Intel-BKC. For guest IOVA support under nesting IOMMU, hypervisor needs to bind/ unbind guest's IOVA page table to host. For such bind/unbind request from user space, host should figure out a target PASID (host default PASID) to be bound. This patch adds a flag to indicate host IOMMU driver if it needs to use default PASID in host. 
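From the caller's point of view the new flag simply marks the bind request as "use the host default PASID"; the host then resolves it to RID2PASID or the aux-domain default PASID via domain_get_pasid(). A minimal illustration against the uapi structure extended at the end of this patch (mandatory fields such as argsz and version are omitted, the format value is an assumption, and guest_pgd is a placeholder):

    static void example_fill_bind_data(struct iommu_gpasid_bind_data *data,
                                       u64 guest_pgd)
    {
            *data = (struct iommu_gpasid_bind_data) {
                    .format     = IOMMU_PASID_FORMAT_INTEL_VTD,     /* assumed */
                    .addr_width = 48,
                    .flags      = IOMMU_SVA_HPASID_DEF,     /* host picks default PASID */
                    .gpgd       = guest_pgd,                /* guest page-table root */
                    /* .hpasid stays 0: the driver fills it via domain_get_pasid() */
            };
    }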
FIXME: pasid_set = NULL; // dmar_domain->pasid_set; Change-Id: I71301dd022be8970abad9a51e7fe52650bc9055b Signed-off-by: Sun, Yi Y Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 11 ++++++++ drivers/iommu/intel/svm.c | 56 ++++++++++++++++++++++++++++--------- drivers/iommu/iommu.c | 8 +++--- include/linux/intel-iommu.h | 3 +- include/linux/iommu.h | 9 ++++-- include/uapi/linux/iommu.h | 1 + 6 files changed, 67 insertions(+), 21 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 3b4cd30bdc2a..2037b4a84297 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3463,6 +3463,9 @@ static int __init init_dmars(void) intel_iommu_sm = 0; } else { intel_svm_add_pasid_notifier(); + /* TODO: where to free? */ + ioasid_alloc(host_pasid_set, PASID_RID2PASID, + PASID_RID2PASID, NULL); } } @@ -5643,6 +5646,14 @@ intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) dmar_domain->default_pasid : -EINVAL; } +int domain_get_pasid(struct iommu_domain *domain, struct device *dev) +{ + if (is_aux_domain(dev, domain)) + return intel_iommu_aux_get_pasid(domain, dev); + + return PASID_RID2PASID; +} + static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain, struct device *dev) { diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 28f3f5f6dd13..3283ac951959 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -190,14 +190,15 @@ static inline bool intel_svm_capable(struct intel_iommu *iommu) return iommu->flags & VTD_FLAG_SVM_CAPABLE; } -static inline void intel_svm_drop_pasid(ioasid_t pasid) +static inline void intel_svm_drop_pasid(ioasid_t pasid, u64 flags) { /* * Detaching SPID results in UNBIND notification on the set, we must * do this before dropping the IOASID reference, otherwise the * notification chain may get destroyed. */ - ioasid_detach_spid(pasid); + if (!(flags & IOMMU_SVA_HPASID_DEF)) + ioasid_detach_spid(pasid); ioasid_detach_data(pasid); ioasid_put(NULL, pasid); } @@ -230,7 +231,7 @@ static void intel_svm_free_async_fn(struct work_struct *work) * the PASID is in FREE_PENDING state, no one can get new reference. * Therefore, we can safely free the private data svm. */ - intel_svm_drop_pasid(svm->pasid); + intel_svm_drop_pasid(svm->pasid, 0); /* * Free before unbind can only happen with host PASIDs used for @@ -401,8 +402,13 @@ static int pasid_to_svm_sdev(struct device *dev, return -EINVAL; svm = ioasid_find(set, pasid, NULL); - if (IS_ERR(svm)) - return PTR_ERR(svm); + if (IS_ERR(svm)) { + if (pasid == PASID_RID2PASID) { + svm = NULL; + } else { + return PTR_ERR(svm); + } + } if (!svm) goto out; @@ -434,6 +440,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct intel_svm *svm = NULL; unsigned long iflags; int ret = 0; + struct ioasid_set *pasid_set; if (WARN_ON(!iommu) || !data) return -EINVAL; @@ -456,21 +463,29 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX) return -EINVAL; + dmar_domain = to_dmar_domain(domain); + pasid_set = NULL; //dmar_domain->pasid_set; + /* * We only check host PASID range, we have no knowledge to check * guest PASID range. 
*/ - if (data->hpasid <= 0 || data->hpasid >= PASID_MAX) + if (data->flags & IOMMU_SVA_HPASID_DEF) { + ret = domain_get_pasid(domain, dev); + if (ret < 0) + return ret; + data->hpasid = ret; + /* TODO: may consider to use NULL because host_pasid_set is native scope */ + pasid_set = host_pasid_set; + } else if (data->hpasid <= 0 || data->hpasid >= PASID_MAX) return -EINVAL; info = get_domain_info(dev); if (!info) return -EINVAL; - dmar_domain = to_dmar_domain(domain); - mutex_lock(&pasid_mutex); - ret = pasid_to_svm_sdev(dev, NULL, + ret = pasid_to_svm_sdev(dev, pasid_set, data->hpasid, &svm, &sdev); if (ret) goto out; @@ -498,7 +513,8 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, if (data->flags & IOMMU_SVA_GPASID_VAL) { svm->gpasid = data->gpasid; svm->flags |= SVM_FLAG_GUEST_PASID; - ioasid_attach_spid(data->hpasid, data->gpasid); + if (!(data->flags & IOMMU_SVA_HPASID_DEF)) + ioasid_attach_spid(data->hpasid, data->gpasid); /* * Partial assignment needs to add fault data per-pasid */ @@ -573,18 +589,32 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, return ret; } -int intel_svm_unbind_gpasid(struct device *dev, u32 pasid) +int intel_svm_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, u32 pasid, u64 user_flags) { struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); struct intel_svm_dev *sdev; struct intel_svm *svm; int ret; + struct dmar_domain *dmar_domain; + struct ioasid_set *pasid_set; if (WARN_ON(!iommu)) return -EINVAL; + dmar_domain = to_dmar_domain(domain); + pasid_set = NULL; // dmar_domain->pasid_set; + + if (user_flags & IOMMU_SVA_HPASID_DEF) { + ret = domain_get_pasid(domain, dev); + if (ret < 0) + return ret; + pasid = ret; + pasid_set = host_pasid_set; + } + mutex_lock(&pasid_mutex); - ret = pasid_to_svm_sdev(dev, NULL, pasid, &svm, &sdev); + ret = pasid_to_svm_sdev(dev, pasid_set, pasid, &svm, &sdev); if (ret) goto out; @@ -613,7 +643,7 @@ int intel_svm_unbind_gpasid(struct device *dev, u32 pasid) * the unbind, IOMMU driver will get notified * and perform cleanup. 
*/ - intel_svm_drop_pasid(pasid); + intel_svm_drop_pasid(pasid, user_flags); kfree(svm); } } diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 6b320b150f6e..e915548f1e28 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2427,7 +2427,7 @@ static int iommu_check_bind_data(struct iommu_gpasid_bind_data *data) return -EINVAL; /* Check all flags */ - mask = IOMMU_SVA_GPASID_VAL; + mask = IOMMU_SVA_GPASID_VAL | IOMMU_SVA_HPASID_DEF; if (data->flags & ~mask) return -EINVAL; @@ -2503,7 +2503,7 @@ int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, struct device *dev, EXPORT_SYMBOL_GPL(iommu_uapi_sva_bind_gpasid); int iommu_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, - ioasid_t pasid) + ioasid_t pasid, u64 flags) { pr_warn("%s: FIXME need to clear all pending faults!\n", __func__); #if 0 @@ -2554,7 +2554,7 @@ int iommu_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, if (unlikely(!domain->ops->sva_unbind_gpasid)) return -ENODEV; - return domain->ops->sva_unbind_gpasid(dev, pasid); + return domain->ops->sva_unbind_gpasid(domain, dev, pasid, flags); } EXPORT_SYMBOL_GPL(iommu_sva_unbind_gpasid); @@ -2574,7 +2574,7 @@ int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev ret = ioasid_get_if_owned(data.hpasid); if (ret) return ret; - ret = iommu_sva_unbind_gpasid(domain, dev, data.hpasid); + ret = iommu_sva_unbind_gpasid(domain, dev, data.hpasid, data.flags); ioasid_put(NULL, data.hpasid); return ret; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 101e2028837d..4a79fa33bd6f 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -758,6 +758,7 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev); struct dmar_domain *find_domain(struct device *dev); struct device_domain_info *get_domain_info(struct device *dev); struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); +int domain_get_pasid(struct iommu_domain *domain, struct device *dev); #ifdef CONFIG_INTEL_IOMMU_SVM extern void intel_svm_check(struct intel_iommu *iommu); @@ -769,7 +770,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, struct iommu_gpasid_bind_data *data, void *fault_data); int intel_svm_unbind_gpasid(struct iommu_domain *domain, - struct device *dev, u32 pasid); + struct device *dev, u32 pasid, u64 user_flags); struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata); void intel_svm_unbind(struct iommu_sva *handle); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index aef6b58a62c5..a177d8d8e062 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -348,7 +348,8 @@ struct iommu_ops { struct iommu_gpasid_bind_data *data, void *fault_data); - int (*sva_unbind_gpasid)(struct device *dev, u32 pasid); + int (*sva_unbind_gpasid)(struct iommu_domain *domain, + struct device *dev, u32 pasid, u64 flags); void (*sva_suspend_pasid)(struct device *dev, u32 pasid); @@ -504,7 +505,8 @@ extern int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, extern int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, void __user *udata); extern int iommu_sva_unbind_gpasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid); + struct device *dev, ioasid_t pasid, + u64 flags); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern int 
iommu_map(struct iommu_domain *domain, unsigned long iova, @@ -1130,7 +1132,8 @@ static inline int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, static inline int iommu_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, - ioasid_t pasid) + ioasid_t pasid, + u64 flags) { return -ENODEV; } diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 1566dfe83f6f..6695e91c64f4 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -328,6 +328,7 @@ struct iommu_gpasid_bind_data { __u32 format; __u32 addr_width; #define IOMMU_SVA_GPASID_VAL (1 << 0) /* guest PASID valid */ +#define IOMMU_SVA_HPASID_DEF (1 << 1) /* use default host PASID */ __u64 flags; __u64 gpgd; __u64 hpasid; -- Gitee From 342e0cc9dfe3b49a28c0541aece354a8e09310c2 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Tue, 1 Dec 2020 18:11:08 +0800 Subject: [PATCH 1476/2161] iommu/vt-d: Only reserve PASID#0 for native driver commit e7076b09428633e45452cf834098fbe061abf88c Intel-BKC. This is a temparay fix. Without this fix, will always see an ioasid allocation failure in guest if scalable mode viommu is exposed. Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2037b4a84297..4db9b7260062 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3464,8 +3464,16 @@ static int __init init_dmars(void) } else { intel_svm_add_pasid_notifier(); /* TODO: where to free? */ - ioasid_alloc(host_pasid_set, PASID_RID2PASID, - PASID_RID2PASID, NULL); + /* If do this allocation in guest, then it may encounter + * failure as guest allocation will go into host, while + * PASID#0 should have been allocated before VM boot. So + * Add this check. And in future, we may want to let ioasid + * provide a way to only reserve PASID #0 in its own ioasid + * space. + */ + if (!cap_caching_mode(iommu->cap)) + ioasid_alloc(host_pasid_set, PASID_RID2PASID, + PASID_RID2PASID, NULL); } } -- Gitee From d21cd4bba67e91dc58648037e959eef06e69e036 Mon Sep 17 00:00:00 2001 From: "Sun, Yi Y" Date: Wed, 25 Nov 2020 16:43:36 +0800 Subject: [PATCH 1477/2161] iommu/vt-d: Ignore pte present check if it is not nested PASID entry commit 600ec25cb73620394835f6628aa5a5f0b0298f32 Intel-BKC. We do not bind same PASID to the same device twice. But for non-nested PASID entry, we can ignore present bit check. Because we need modify the entry to do nested setup. Change-Id: Ife4448664c7f6e46bae327f29f94911df7f9c134 Signed-off-by: Sun, Yi Y Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 7 ++++--- drivers/iommu/intel/pasid.h | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 980503957972..24c4a161c688 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -788,10 +788,11 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, return -EINVAL; /* - * Caller must ensure PASID entry is not in use, i.e. not bind the - * same PASID to the same device twice. + * PASID entries with nesting translation type should not be set + * multiple times. If caller tries to setup nesting for a PASID + * entry which is already nested mode, should fail it. 
*/ - if (pasid_pte_is_present(pte)) + if (pasid_pte_is_present(pte) && pasid_pte_is_nested(pte)) return -EBUSY; pasid_clear_entry(pte); diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 5d588e11937c..b8677047de43 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -98,10 +98,10 @@ static inline bool pasid_pte_is_present(struct pasid_entry *pte) return READ_ONCE(pte->val[0]) & PASID_PTE_PRESENT; } -/* Get PGTT field of a PASID table entry */ -static inline u16 pasid_pte_get_pgtt(struct pasid_entry *pte) +/* Check if PGTT bits of a PASID table entry is nested. */ +static inline bool pasid_pte_is_nested(struct pasid_entry *pte) { - return (u16)((READ_ONCE(pte->val[0]) >> 6) & 0x7); + return ((READ_ONCE(pte->val[0]) >> 6) & 0x7) == PASID_ENTRY_PGTT_NESTED; } extern unsigned int intel_pasid_max_id; -- Gitee From c888ba42ad997d9c8824115c996b2aeff273a98f Mon Sep 17 00:00:00 2001 From: "Sun, Yi Y" Date: Wed, 25 Nov 2020 16:53:56 +0800 Subject: [PATCH 1478/2161] iommu/vt-d: Support cache_invalidation request for gIOVA case commit a9e982372af1a7e0137583d8940791fef9a4ffc2 Intel-BKC. This patch supports IOMMU cache invalidation for gIOVA page table. Such invalidation request from guest requires host IOMMU driver to figure out proper PASID (e.g. default PASID of aux-domain and etc.). From internal code based on 5.11-rc5, YiLiu used the ioasid_get_if_owned() to check ownership. Change-Id: Id49239b5a612d3f1afd6b543dbfc67e749cec6ed Signed-off-by: Sun, Yi Y Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 4db9b7260062..9944493c1519 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5088,12 +5088,21 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, * PASID is stored in different locations based on the * granularity. */ - if (inv_info->granularity == IOMMU_INV_GRANU_PASID && - (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) - pasid = inv_info->granu.pasid_info.pasid; - else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && - (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) - pasid = inv_info->granu.addr_info.pasid; + if (inv_info->granularity == IOMMU_INV_GRANU_PASID) { + if (inv_info->granu.pasid_info.flags & + IOMMU_INV_PASID_FLAGS_PASID) { + pasid = inv_info->granu.pasid_info.pasid; + } else { + pasid = domain_get_pasid(domain, dev); + } + } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) { + if (inv_info->granu.addr_info.flags & + IOMMU_INV_ADDR_FLAGS_PASID) { + pasid = inv_info->granu.addr_info.pasid; + } else { + pasid = domain_get_pasid(domain, dev); + } + } ret = ioasid_get_if_owned(pasid); if (ret) -- Gitee From 31ace7ad2de6acf6945ef44ee0f1b0ba7ab9a733 Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Fri, 14 Aug 2020 15:46:44 +0800 Subject: [PATCH 1479/2161] iommu/vt-d: do not clear pte if pgtt is nesting commit 49b299309a566c5955a8198e530ced6dfa22d1a2 Intel-BKC. If the pte pgtt is nesting mode, we should not clear it but only reset the pgtt to slt only. Otherwise, there will be page request without pasid happens when the scalable mode guest reboots to legacy mode guest. The root cause is that there will be no chance to setup second level page table on host when legacy guest booting. 
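For reference, the PGTT encodings tested by pasid_pte_is_nested() above and by
the intel_pasid_clear_entry() change in this patch are the standard values from
drivers/iommu/intel/pasid.h; they are reproduced here for context only and are
not part of this series:

    /* PGTT field: scalable-mode PASID table entry, val[0] bits 8:6 */
    #define PASID_ENTRY_PGTT_FL_ONLY    (1)     /* first-level only  */
    #define PASID_ENTRY_PGTT_SL_ONLY    (2)     /* second-level only */
    #define PASID_ENTRY_PGTT_NESTED     (3)     /* nested            */
    #define PASID_ENTRY_PGTT_PT         (4)     /* pass-through      */

With these values, downgrading a nested entry (011b) to second-level only
(010b) amounts to clearing bit 6 of val[0] (the 0xfffffffffffffebf mask used in
intel_pasid_clear_entry() clears bit 6 and the already-zero bit 8).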
Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 23 +++++++++++++++++++---- drivers/iommu/intel/svm.c | 1 + include/linux/intel-iommu.h | 1 + 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 24c4a161c688..b5e69b3bfd57 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -246,14 +246,31 @@ static inline void pasid_clear_entry_with_fpd(struct pasid_entry *pe) } static void -intel_pasid_clear_entry(struct device *dev, u32 pasid, bool fault_ignore) +intel_pasid_clear_entry(struct intel_iommu *iommu, struct device *dev, + u32 pasid, bool fault_ignore) { struct pasid_entry *pe; + u64 pe_val; + bool nested; pe = intel_pasid_get_entry(dev, pasid); if (WARN_ON(!pe)) return; + /* + * The guest may reboot from scalable mode to legacy mode. During this + * phase, there is no chance to setup SLT. So, we should only reset PGTT + * from NESTED to SL and keep other bits when unbind gpasid is executed. + */ + pe_val = READ_ONCE(pe->val[0]); + nested = (((pe_val >> 6) & 0x7) == PASID_ENTRY_PGTT_NESTED) ? true : false; + if (nested && (iommu->flags & VTD_FLAG_PGTT_SL_ONLY)) { + pe_val &= 0xfffffffffffffebf; + WRITE_ONCE(pe->val[0], pe_val); + iommu->flags &= ~VTD_FLAG_PGTT_SL_ONLY; + return; + } + if (fault_ignore && pasid_pte_is_present(pe)) pasid_clear_entry_with_fpd(pe); else @@ -458,9 +475,7 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, return; did = pasid_get_domain_id(pte); - pgtt = pasid_pte_get_pgtt(pte); - - intel_pasid_clear_entry(dev, pasid, fault_ignore); + intel_pasid_clear_entry(iommu, dev, pasid, fault_ignore); if (!ecap_coherent(iommu->ecap)) clflush_cache_range(pte, sizeof(*pte)); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 3283ac951959..2d8ad25630ff 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -623,6 +623,7 @@ int intel_svm_unbind_gpasid(struct iommu_domain *domain, sdev->users--; if (!sdev->users) { list_del_rcu(&sdev->list); + iommu->flags |= VTD_FLAG_PGTT_SL_ONLY; intel_pasid_tear_down_entry(iommu, dev, svm->pasid, false); intel_svm_drain_prq(dev, svm->pasid); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 4a79fa33bd6f..bef216e95f06 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -478,6 +478,7 @@ enum { #define VTD_FLAG_TRANS_PRE_ENABLED (1 << 0) #define VTD_FLAG_IRQ_REMAP_PRE_ENABLED (1 << 1) #define VTD_FLAG_SVM_CAPABLE (1 << 2) +#define VTD_FLAG_PGTT_SL_ONLY (1 << 3) extern int intel_iommu_sm; extern spinlock_t device_domain_lock; -- Gitee From 3f59173a96e22e3ce7ca7012da320b1dcb3aa3ca Mon Sep 17 00:00:00 2001 From: "Sun, Yi Y" Date: Sat, 28 Nov 2020 14:21:26 +0800 Subject: [PATCH 1480/2161] iommu/vt-d: A temporary patch to cover old guest commit 27e141e4d73d51dc7f28da71d7ef246466184f90 Intel-BKC. We should set rid_priv to 1. Otherwise the PRQ w/o pasid generated. The root cause is that the old guest PTE U/S bit is not set to 1. Ideally, the guest kernel should set it. 
Change-Id: I4067aed15cc78a314853596240c580f30968430c Signed-off-by: Sun, Yi Y Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 9944493c1519..ecb70469f733 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2134,6 +2134,11 @@ static inline void context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) { context->hi |= pasid & ((1 << 20) - 1); + /* TODO: rid_priv should be 1. Otherwise the PRQ w/o pasid generated. + * The root cause is that the guest PTE U/S bit is not set to 1. Ideally, + * the guest kernel should set it. + */ + context->hi |= (1 << 20); } /* -- Gitee From 2c4506064c7c6e45488c90f8ab0e5dafb90adc87 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Thu, 18 Feb 2021 20:32:36 +0800 Subject: [PATCH 1481/2161] iommu/vt-d: Two bug fix to svm.c commit 4982e28dfaf3a1cab28603619c6fc62f66667cf5 Intel-BKC. One happened in bind_gpasid() path, another happened when handling prq. Change-Id: I533a8155662da7896e3bdd073e8cad7c5efa5f16 Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 45 +++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 2d8ad25630ff..6b980efe3aab 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -1183,34 +1183,33 @@ static irqreturn_t prq_event_thread(int irq, void *d) goto bad_req; } - if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) { - pr_err("IOMMU: %s: Page request in Privilege Mode\n", - iommu->name); + /* + * If prq is to be handled outside iommu driver via receiver of + * the fault notifiers, we skip the page response here. + */ + if (svm->flags & SVM_FLAG_GUEST_MODE) { + if (sdev && !intel_svm_prq_report(sdev->dev, req)) + goto prq_advance; + else + goto bad_req; + + /* Since we're using init_mm.pgd directly, we should never take + * any faults on kernel addresses. */ + if (!svm->mm) goto bad_req; } + /* If address is not canonical, return invalid response */ + if (!is_canonical_address(address)) + goto bad_req; - if (unlikely(req->exe_req && req->rd_req)) { - pr_err("IOMMU: %s: Execution request not supported\n", - iommu->name); + /* If the mm is already defunct, don't handle faults. */ + if (!mmget_not_zero(svm->mm)) goto bad_req; - } - if (!svm || svm->pasid != req->pasid) { - rcu_read_lock(); - svm = pasid_private_find(req->pasid); - /* - * It can't go away, because the driver is not permitted - * to unbind the mm while any page faults are outstanding. - */ - svm = pasid_private_find(req->pasid); - if (IS_ERR_OR_NULL(svm) || (svm->flags & SVM_FLAG_SUPERVISOR_MODE)) - goto bad_req; - } - if (!sdev || sdev->sid != req->rid) { - sdev = svm_lookup_device_by_sid(svm, req->rid); - if (!sdev) - goto bad_req; - } + down_read(&svm->mm->mmap_sem); + vma = find_extend_vma(svm->mm, address); + if (!vma || address < vma->vm_start) + goto invalid; sdev->prq_seq_number++; -- Gitee From fc5b7d69cae374df0f143ed315207ec16268e82d Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 20 Jul 2022 16:37:20 +0800 Subject: [PATCH 1482/2161] iommu: Resolve merge conflict for intel_svm_prq_report() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. 
Resolve merge conflict for intel_svm_prq_report(), Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 6b980efe3aab..d17b967bfd5e 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -1051,8 +1051,8 @@ static int prq_to_iommu_prot(struct page_req_dsc *req) return prot; } -static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, - struct page_req_dsc *desc) +static int +intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc) { struct device_domain_info *info; struct iommu_fault_event event; @@ -1083,14 +1083,8 @@ static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, */ event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA; - event.fault.prm.private_data[0] = desc->priv_data[0]; - event.fault.prm.private_data[1] = desc->priv_data[1]; - } else if (dmar_latency_enabled(iommu, DMAR_LATENCY_PRQ)) { - /* - * If the private data fields are not used by hardware, use it - * to monitor the prq handle latency. - */ - event.fault.prm.private_data[0] = ktime_to_ns(ktime_get()); + memcpy(event.fault.prm.private_data, desc->priv_data, + sizeof(desc->priv_data)); } /* -- Gitee From bc56fb7f204e60424aca81c60992bc6594e20f98 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 4 Feb 2021 13:04:32 -0800 Subject: [PATCH 1483/2161] iommu/vt-d: Enable write protect for supervisor SVM commit f68c7f539b6e9712e941212ab95a1feb5a0bf3b3 upstream. Write protect bit, when set, inhibits supervisor writes to the read-only pages. In supervisor shared virtual addressing (SVA), where page tables are shared between CPU and DMA, IOMMU PASID entry WPE bit should match CR0.WP bit in the CPU. This patch sets WPE bit for supervisor PASIDs if CR0.WP is set. Signed-off-by: Sanjay Kumar Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index b5e69b3bfd57..8041aeb41d29 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -513,7 +513,6 @@ static void pasid_flush_caches(struct intel_iommu *iommu, static inline int pasid_enable_wpe(struct pasid_entry *pte) { -#ifdef CONFIG_X86 unsigned long cr0 = read_cr0(); /* CR0.WP is normally set but just to be sure */ @@ -521,7 +520,6 @@ static inline int pasid_enable_wpe(struct pasid_entry *pte) pr_err_ratelimited("No CPU write protect!\n"); return -EINVAL; } -#endif pasid_set_wpe(pte); return 0; -- Gitee From 101c8b58dbdf678e9b2354de86af0b9e5f789691 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 4 Feb 2021 13:29:35 -0800 Subject: [PATCH 1484/2161] iommu/vt-d: Enable write protect propagation from guest commit bb0f61533dfd6aa815a2719720c77d13f840b683 upstream. Write protect bit, when set, inhibits supervisor writes to the read-only pages. In guest supervisor shared virtual addressing (SVA), write-protect should be honored upon guest bind supervisor PASID request. This patch extends the VT-d portion of the IOMMU UAPI to include WP bit. WPE bit of the supervisor PASID entry will be set to match CPU CR0.WP bit. 
Signed-off-by: Sanjay Kumar Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 8041aeb41d29..899a570eafd0 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -732,8 +732,10 @@ intel_pasid_setup_bind_data(struct intel_iommu *iommu, struct pasid_entry *pte, } pasid_set_sre(pte); /* Enable write protect WP if guest requested */ - if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_WPE) - pasid_set_wpe(pte); + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_WPE) { + if (pasid_enable_wpe(pte)) + return -EINVAL; + } } if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_EAFE) { -- Gitee From 927f01f32212934235a0ea4b6ce138450cc95ec5 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 11 Feb 2021 15:18:52 -0800 Subject: [PATCH 1485/2161] iommu/vt-d: Reject unsupported page request modes commit 78a523fe73b81b4447beb2d6c78c9fafae24eebb upstream. When supervisor/privilige mode SVM is used, we bind init_mm.pgd with a supervisor PASID. There should not be any page fault for init_mm. Execution request with DMA read is also not supported. This patch checks PRQ descriptor for both unsupported configurations, reject them both with invalid responses. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index d17b967bfd5e..b69c3dc8cdaf 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -1160,15 +1160,36 @@ static irqreturn_t prq_event_thread(int irq, void *d) while (head != tail) { req = &iommu->prq[head / sizeof(*req)]; address = (u64)req->addr << VTD_PAGE_SHIFT; - - if (unlikely(!req->pasid_present)) { - pr_err("IOMMU: %s: Page request without PASID\n", - iommu->name); -bad_req: - svm = NULL; - sdev = NULL; - handle_bad_prq_event(iommu, req, QI_RESP_INVALID); - goto prq_advance; + if (!req->pasid_present) { + pr_err("%s: Page request without PASID: %08llx %08llx\n", + iommu->name, ((unsigned long long *)req)[0], + ((unsigned long long *)req)[1]); + goto no_pasid; + } + /* We shall not receive page request for supervisor SVM */ + if (req->pm_req && (req->rd_req | req->wr_req)) { + pr_err("Unexpected page request in Privilege Mode"); + /* No need to find the matching sdev as for bad_req */ + goto no_pasid; + } + /* DMA read with exec requeset is not supported. */ + if (req->exe_req && req->rd_req) { + pr_err("Execution request not supported\n"); + goto no_pasid; + } + if (!svm || svm->pasid != req->pasid) { + rcu_read_lock(); + svm = ioasid_find(NULL, req->pasid, NULL); + /* It *can't* go away, because the driver is not permitted + * to unbind the mm while any page faults are outstanding. + * So we only need RCU to protect the internal idr code. 
*/ + rcu_read_unlock(); + if (IS_ERR_OR_NULL(svm)) { + pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n", + iommu->name, req->pasid, ((unsigned long long *)req)[0], + ((unsigned long long *)req)[1]); + goto no_pasid; + } } if (unlikely(!is_canonical_address(address))) { -- Gitee From 4f2a7ce698f41d775a382a5bbca408d5615846f5 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 11 Feb 2021 15:25:05 -0800 Subject: [PATCH 1486/2161] iommu/vt-d: Calculate and set flags for handle_mm_fault MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 396bd6f3d9f659d7ce324806bf3cd6677385f8fd upstream. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Page requests are originated from the user page fault. Therefore, we shall set FAULT_FLAG_USER.  FAULT_FLAG_REMOTE indicates that we are walking an mm which is not guaranteed to be the same as the current->mm and should not be subject to protection key enforcement. Therefore, we should set FAULT_FLAG_REMOTE to avoid faults when both SVM and PKEY are used. References: commit 1b2ee1266ea6 ("mm/core: Do not enforce PKEY permissions on remote mm access") Reviewed-by: Raj Ashok Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index b69c3dc8cdaf..e40e8b70b3fb 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -1144,9 +1144,8 @@ static irqreturn_t prq_event_thread(int irq, void *d) struct intel_svm_dev *sdev = NULL; struct intel_iommu *iommu = d; struct intel_svm *svm = NULL; - struct page_req_dsc *req; - int head, tail, handled; - u64 address; + int head, tail, handled = 0; + unsigned int flags = 0; /* * Clear PPR bit before reading head/tail registers, to ensure that @@ -1228,12 +1227,13 @@ static irqreturn_t prq_event_thread(int irq, void *d) sdev->prq_seq_number++; - /* - * If prq is to be handled outside iommu driver via receiver of - * the fault notifiers, we skip the page response here. - */ - if (intel_svm_prq_report(iommu, sdev->dev, req)) - handle_bad_prq_event(iommu, req, QI_RESP_INVALID); + flags = FAULT_FLAG_USER | FAULT_FLAG_REMOTE; + if (req->wr_req) + flags |= FAULT_FLAG_WRITE; + + ret = handle_mm_fault(vma, address, flags); + if (ret & VM_FAULT_ERROR) + goto invalid; trace_prq_report(iommu, sdev->dev, req->qw_0, req->qw_1, req->priv_data[0], req->priv_data[1], -- Gitee From 36dc79d97cd7597b1f8c683362614a0b213d75d6 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Mon, 1 Mar 2021 15:35:56 +0800 Subject: [PATCH 1487/2161] Add back quota to avoid no space issue when trying to open /dev/ioasid in the second time. commit 3d7a234c2e6141de7a24ba885b5ed95dfe3e8f6e Intel-BKC. Need to fix the issue in ioasid.c. Change-Id: Id7c6e252d33c12471306a8c234aef6623ebc1aef Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid_user.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/ioasid_user.c b/drivers/iommu/ioasid_user.c index 90513a32e9d7..95e306cf7f27 100644 --- a/drivers/iommu/ioasid_user.c +++ b/drivers/iommu/ioasid_user.c @@ -126,10 +126,11 @@ static int ioasid_fops_open(struct inode *inode, struct file *filep) * the token and create a IOASID set per mm_struct. All the * containers of the process share the same IOASID set. 
*/ - iset = ioasid_set_alloc(mm, 0, IOASID_SET_TYPE_MM); + iset = ioasid_set_alloc(mm, 1000, IOASID_SET_TYPE_MM); if (IS_ERR(iset)) { kfree(iuser); ret = PTR_ERR(iset); + printk("%s, ret: %d\n", __func__, ret); goto out; } -- Gitee From 416429e557f98f98208e5dca64b0c8db8f9304eb Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Fri, 19 Mar 2021 13:54:11 +0800 Subject: [PATCH 1488/2161] iommu/vt-d: Fix gpasid bind/unbind issue commit 52df39844906fcd2b9b16e8a981777327c22b6c6 Intel-BKC. When gpasid 0 binding/unbinding, ioasid_get_if_owned() returns errors because the set type is not valid. We should change api to ioasid_get() to fix it. When doing cache invalidation, we have similar issue. Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 9 ++++++++- drivers/iommu/iommu.c | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index ecb70469f733..05fb80174e4d 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5046,6 +5046,7 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, u16 did, sid; int ret = 0; u64 size = 0; + bool default_pasid = false; if (!inv_info || !dmar_domain) return -EINVAL; @@ -5099,6 +5100,7 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, pasid = inv_info->granu.pasid_info.pasid; } else { pasid = domain_get_pasid(domain, dev); + default_pasid = true; } } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) { if (inv_info->granu.addr_info.flags & @@ -5106,10 +5108,15 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, pasid = inv_info->granu.addr_info.pasid; } else { pasid = domain_get_pasid(domain, dev); + default_pasid = true; } } - ret = ioasid_get_if_owned(pasid); + if (default_pasid) + ret = ioasid_get(NULL, pasid); + else + ret = ioasid_get_if_owned(pasid); + if (ret) goto out_unlock; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index e915548f1e28..442982cad779 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2492,9 +2492,10 @@ int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, struct device *dev, if (ret) return ret; - ret = ioasid_get_if_owned(data.hpasid); + ret = ioasid_get(NULL, data.hpasid); if (ret) return ret; + ret = domain->ops->sva_bind_gpasid(domain, dev, &data, fault_data); ioasid_put(NULL, data.hpasid); @@ -2571,7 +2572,7 @@ int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev if (ret) return ret; - ret = ioasid_get_if_owned(data.hpasid); + ret = ioasid_get(NULL, data.hpasid); if (ret) return ret; ret = iommu_sva_unbind_gpasid(domain, dev, data.hpasid, data.flags); -- Gitee From b3c236cfe083143df242c5e86fbc9378e22dd418 Mon Sep 17 00:00:00 2001 From: Wu Hao Date: Wed, 31 Mar 2021 12:40:32 +0800 Subject: [PATCH 1489/2161] iommu/ioasid: fix ioaisd_notifier_register_mm commit de4a1d400ad45279f14659b689d001abf393b700 Intel-BKC. still set token and nb for pending registration. 
Signed-off-by: Wu Hao Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index ea061c1f66e4..471bce096501 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -1249,11 +1249,16 @@ int ioasid_register_notifier_mm(struct mm_struct *mm, struct notifier_block *nb) goto exit_unlock; } } + curr = kzalloc(sizeof(*curr), GFP_ATOMIC); if (!curr) { ret = -ENOMEM; goto exit_unlock; } + + curr->token = mm; + curr->nb = nb; + /* Check if the token has an existing set */ set = ioasid_find_mm_set(mm); if (!set) { @@ -1269,8 +1274,6 @@ int ioasid_register_notifier_mm(struct mm_struct *mm, struct notifier_block *nb) ret = -EBUSY; goto exit_free; } - curr->token = mm; - curr->nb = nb; curr->active = true; curr->set = set; -- Gitee From be3a4bef855c57d789f7b2ec7d4b8eec6b9b222c Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Fri, 9 Apr 2021 22:19:51 +0800 Subject: [PATCH 1490/2161] iommu/vt-d: Fix pasid bind issue commit 4a6c6b7b179544b3483c492927f26bbdcc6514f9 Intel-BKC. We need set the original hpasid back after binding gpasid. Otherwise, iommu layer will report error because the hpasid is different for ioasid_get/ioasid_put. Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index e40e8b70b3fb..5d675ab224c4 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -441,6 +441,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, unsigned long iflags; int ret = 0; struct ioasid_set *pasid_set; + u64 hpasid_org; if (WARN_ON(!iommu) || !data) return -EINVAL; @@ -474,6 +475,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, ret = domain_get_pasid(domain, dev); if (ret < 0) return ret; + hpasid_org = data->hpasid; data->hpasid = ret; /* TODO: may consider to use NULL because host_pasid_set is native scope */ pasid_set = host_pasid_set; @@ -585,6 +587,9 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, kfree(svm); } + if (data->flags & IOMMU_SVA_HPASID_DEF) + data->hpasid = hpasid_org; + mutex_unlock(&pasid_mutex); return ret; } -- Gitee From 4527b0b7b70bfe8b9a7cd0aa2403dafcabe4ea2e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 18 May 2021 13:43:46 +0800 Subject: [PATCH 1491/2161] iommu/vt-d: Add common code for dmar latency performance monitors commit 55ee5e67a59a1b6f388d7a1c7b24022145f47a3e upstream. The execution time of some operations is very performance critical, such as cache invalidation and PRQ processing time. This adds some common code to monitor the execution time range of those operations. The interfaces include enabling/disabling, checking status, updating sampling data and providing a common string format for users. 
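The sampling pattern expected from callers of this common code can be sketched
as follows; the names are taken from the PRQ handler hunks later in this series
and may differ slightly in other kernel versions:

    s64 start_ktime = 0;

    /* Only pay for the ktime read when this monitor is enabled. */
    if (dmar_latency_enabled(iommu, DMAR_LATENCY_PRQ))
            start_ktime = ktime_to_ns(ktime_get());

    /* ... the monitored operation, e.g. handling one page request ... */

    if (start_ktime)
            dmar_latency_update(iommu, DMAR_LATENCY_PRQ,
                                ktime_to_ns(ktime_get()) - start_ktime);

dmar_latency_snapshot() then formats the accumulated per-IOMMU counters into a
string for consumers such as the debugfs interface added in the next patch.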
Signed-off-by: Fenghua Yu Signed-off-by: Lu Baolu Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/perf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c index 0e8e03252d92..faaa96dda437 100644 --- a/drivers/iommu/intel/perf.c +++ b/drivers/iommu/intel/perf.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* +/** * perf.c - performance monitor * * Copyright (C) 2021 Intel Corporation @@ -141,14 +141,14 @@ int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) if (val == UINT_MAX) val = 0; else - val = div_u64(val, 1000); + val /= 1000; break; case COUNTS_MAX: - val = div_u64(val, 1000); + val /= 1000; break; case COUNTS_SUM: if (lstat[i].samples) - val = div_u64(val, (lstat[i].samples * 1000)); + val /= (lstat[i].samples * 1000); else val = 0; break; -- Gitee From 08ce2eb804a181333618d34f9e8ad915b3d4da97 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 18 May 2021 13:47:23 +0800 Subject: [PATCH 1492/2161] iommu/vt-d: Expose latency monitor data through debugfs commit 456bb0b97f00fe8defba155c0a4c48d951635395 upstream. A debugfs interface /sys/kernel/debug/iommu/intel/dmar_perf_latency is created to control and show counts of execution time ranges for various types per DMAR. The interface may help debug any potential performance issue. By default, the interface is disabled. Possible write value of /sys/kernel/debug/iommu/intel/dmar_perf_latency 0 - disable sampling all latency data 1 - enable sampling IOTLB invalidation latency data 2 - enable sampling devTLB invalidation latency data 3 - enable sampling intr entry cache invalidation latency data 4 - enable sampling prq handling latency data Read /sys/kernel/debug/iommu/intel/dmar_perf_latency gives a snapshot of sampling result of all enabled monitors. 
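A typical debugging session against this interface looks like the following
(illustrative only; the write values are the ones listed above):

    # enable sampling of PRQ handling latency on all active IOMMUs
    echo 4 > /sys/kernel/debug/iommu/intel/dmar_perf_latency

    # ... run the workload under test ...

    # read back a per-IOMMU snapshot of the collected latency data
    cat /sys/kernel/debug/iommu/intel/dmar_perf_latency

    # disable sampling of all latency data again
    echo 0 > /sys/kernel/debug/iommu/intel/dmar_perf_latency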
Signed-off-by: Fenghua Yu Signed-off-by: Lu Baolu Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/Kconfig | 2 +- drivers/iommu/intel/debugfs.c | 107 ++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 88eb316da2ba..80a95acdd03a 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -32,7 +32,7 @@ if INTEL_IOMMU config INTEL_IOMMU_DEBUGFS bool "Export Intel IOMMU internals in Debugfs" - depends on IOMMU_DEBUGFS + depends on INTEL_IOMMU && IOMMU_DEBUGFS select DMAR_PERF select DMAR_DEBUG help diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index 62e23ff3c987..f4c6ef39068e 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -647,6 +647,111 @@ static const struct file_operations dmar_perf_latency_fops = { .release = single_release, }; +static void latency_show_one(struct seq_file *m, struct intel_iommu *iommu, + struct dmar_drhd_unit *drhd) +{ + int ret; + + seq_printf(m, "IOMMU: %s Register Base Address: %llx\n", + iommu->name, drhd->reg_base_addr); + + ret = dmar_latency_snapshot(iommu, debug_buf, DEBUG_BUFFER_SIZE); + if (ret < 0) + seq_printf(m, "Failed to get latency snapshot"); + else + seq_puts(m, debug_buf); + seq_puts(m, "\n"); +} + +static int latency_show(struct seq_file *m, void *v) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + latency_show_one(m, iommu, drhd); + rcu_read_unlock(); + + return 0; +} + +static int dmar_perf_latency_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, latency_show, NULL); +} + +static ssize_t dmar_perf_latency_write(struct file *filp, + const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + int counting; + char buf[64]; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (kstrtoint(buf, 0, &counting)) + return -EINVAL; + + switch (counting) { + case 0: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + dmar_latency_disable(iommu, DMAR_LATENCY_INV_IOTLB); + dmar_latency_disable(iommu, DMAR_LATENCY_INV_DEVTLB); + dmar_latency_disable(iommu, DMAR_LATENCY_INV_IEC); + dmar_latency_disable(iommu, DMAR_LATENCY_PRQ); + } + rcu_read_unlock(); + break; + case 1: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + dmar_latency_enable(iommu, DMAR_LATENCY_INV_IOTLB); + rcu_read_unlock(); + break; + case 2: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + dmar_latency_enable(iommu, DMAR_LATENCY_INV_DEVTLB); + rcu_read_unlock(); + break; + case 3: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + dmar_latency_enable(iommu, DMAR_LATENCY_INV_IEC); + rcu_read_unlock(); + break; + case 4: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + dmar_latency_enable(iommu, DMAR_LATENCY_PRQ); + rcu_read_unlock(); + break; + default: + return -EINVAL; + } + + *ppos += cnt; + return cnt; +} + +static const struct file_operations dmar_perf_latency_fops = { + .open = dmar_perf_latency_open, + .write = dmar_perf_latency_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + void __init intel_iommu_debugfs_init(void) { struct dentry *intel_iommu_debug = debugfs_create_dir("intel", @@ -665,6 +770,8 @@ void __init intel_iommu_debugfs_init(void) 
debugfs_create_file("ir_translation_struct", 0444, intel_iommu_debug, NULL, &ir_translation_struct_fops); #endif + debugfs_create_file("qi_done", 0644, intel_iommu_debug, + NULL, &qi_done_fops); debugfs_create_file("dmar_perf_latency", 0644, intel_iommu_debug, NULL, &dmar_perf_latency_fops); } -- Gitee From 12288bc396fba26843602432b1ad28e91e52a931 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 18 May 2021 13:54:57 +0800 Subject: [PATCH 1493/2161] iommu/vt-d: Add PRQ handling latency sampling commit 0f4834ab255bf488f20544e120713decfa77843e upstream. The execution time for page fault request handling is performance critical and needs to be monitored. This adds code to sample the execution time of page fault request handling. Signed-off-by: Lu Baolu Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 48 +++++++++++++++++++++++- drivers/iommu/intel/svm.c | 73 ++++++++++++++++++++++++------------- include/linux/intel-svm.h | 4 +- 3 files changed, 95 insertions(+), 30 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 05fb80174e4d..9ba3bddc1aa1 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -55,6 +55,17 @@ #define ROOT_SIZE VTD_PAGE_SIZE #define CONTEXT_SIZE VTD_PAGE_SIZE +/* PRS_Allocation */ +static int prs_allocation = 32; + +/* PRQ_Size, use large enough size to avoid PRQ overflow, on SPR the sum of + * DSA, IAX/A, QAT = 512 + 256 + 64, round up to power of 2 is 1024. + * Since we process the PRQ once then move QH, we need to double that + * to 2048 entries. Each 4K page can hold 128 entries. So we need 64KB + * in total to prevent PRQ overflow in the worst case. + * For default 32 entries per dev,8 KB should be enough. + */ +int prq_size_page_order = 1; #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) @@ -442,6 +453,26 @@ static int __init intel_iommu_setup(char *str) } else if (!strncmp(str, "sm_off", 6)) { pr_info("Scalable mode is disallowed\n"); intel_iommu_sm = 0; + } else if (!strncmp(str, "prq_size_4kb", 12)) { + prq_size_page_order = 0; + } else if (!strncmp(str, "prq_size_8kb", 12)) { + prq_size_page_order = 1; + } else if (!strncmp(str, "prq_size_16kb", 13)) { + prq_size_page_order = 2; + } else if (!strncmp(str, "prq_size_32kb", 13)) { + prq_size_page_order = 3; + } else if (!strncmp(str, "prq_size_64kb", 13)) { + prq_size_page_order = 4; + } else if (!strncmp(str, "prs_allocation_32", 17)) { + prs_allocation = 32; + } else if (!strncmp(str, "prs_allocation_64", 17)) { + prs_allocation = 64; + } else if (!strncmp(str, "prs_allocation_128", 18)) { + prs_allocation = 128; + } else if (!strncmp(str, "prs_allocation_256", 18)) { + prs_allocation = 256; + } else if (!strncmp(str, "prs_allocation_512", 18)) { + prs_allocation = 512; } else if (!strncmp(str, "tboot_noforce", 13)) { pr_info("Intel-IOMMU: not forcing on after tboot. 
This could expose security risk for tboot\n"); intel_iommu_tboot_noforce = 1; @@ -458,6 +489,21 @@ static int __init intel_iommu_setup(char *str) } __setup("intel_iommu=", intel_iommu_setup); +static int __init intel_prs_allocation_setup(char *str) +{ + if (!str) + return -EINVAL; + + if (kstrtoint(str, 10, &prs_allocation) < 0) + pr_info("prs_allocation: wrong parameter %s %d\n", str, prs_allocation); + + + pr_info("prs_allocation set to %d\n", prs_allocation); + + return 1; +} +__setup("prs_allocation=", intel_prs_allocation_setup); + static struct kmem_cache *iommu_domain_cache; static struct kmem_cache *iommu_devinfo_cache; @@ -1660,7 +1706,7 @@ static void iommu_enable_dev_iotlb(struct device_domain_info *info) if (info->pri_supported && (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) && - !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH)) + !pci_reset_pri(pdev) && !pci_enable_pri(pdev, prs_allocation)) info->pri_enabled = 1; #endif if (info->ats_supported && pci_ats_page_aligned(pdev) && diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 5d675ab224c4..65f7d9f89b76 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -25,12 +25,13 @@ #include "pasid.h" #include "perf.h" -#include "../iommu-sva-lib.h" static irqreturn_t prq_event_thread(int irq, void *d); static void intel_svm_drain_prq(struct device *dev, u32 pasid); #define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva) +extern int prq_size_page_order; + static DEFINE_XARRAY_ALLOC(pasid_private_array); static int pasid_private_add(ioasid_t pasid, void *priv) { @@ -88,7 +89,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) struct page *pages; int irq, ret; - pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER); + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, prq_size_page_order); if (!pages) { pr_warn("IOMMU: %s: Failed to allocate page request queue\n", iommu->name); @@ -126,7 +127,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) } dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); + dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | prq_size_page_order); init_completion(&iommu->prq_complete); @@ -157,12 +158,7 @@ int intel_svm_finish_prq(struct intel_iommu *iommu) iommu->pr_irq = 0; } - if (iommu->iopf_queue) { - iopf_queue_free(iommu->iopf_queue); - iommu->iopf_queue = NULL; - } - - free_pages((unsigned long)iommu->prq, PRQ_ORDER); + free_pages((unsigned long)iommu->prq, prq_size_page_order); iommu->prq = NULL; return 0; @@ -1000,20 +996,6 @@ static void intel_svm_drain_prq(struct device *dev, u32 pasid) goto prq_retry; } - /* - * A work in IO page fault workqueue may try to lock pasid_mutex now. - * Holding pasid_mutex while waiting in iopf_queue_flush_dev() for - * all works in the workqueue to finish may cause deadlock. - * - * It's unnecessary to hold pasid_mutex in iopf_queue_flush_dev(). - * Unlock it to allow the works to be handled while waiting for - * them to finish. - */ - lockdep_assert_held(&pasid_mutex); - mutex_unlock(&pasid_mutex); - iopf_queue_flush_dev(dev); - mutex_lock(&pasid_mutex); - /* * Perform steps described in VT-d spec CH7.10 to drain page * requests and responses in hardware. 
@@ -1151,6 +1133,10 @@ static irqreturn_t prq_event_thread(int irq, void *d) struct intel_svm *svm = NULL; int head, tail, handled = 0; unsigned int flags = 0; + s64 start_ktime = 0; + + if (dmar_latency_enabled(iommu, DMAR_LATENCY_PRQ)) + start_ktime = ktime_to_ns(ktime_get()); /* * Clear PPR bit before reading head/tail registers, to ensure that @@ -1240,9 +1226,44 @@ static irqreturn_t prq_event_thread(int irq, void *d) if (ret & VM_FAULT_ERROR) goto invalid; - trace_prq_report(iommu, sdev->dev, req->qw_0, req->qw_1, - req->priv_data[0], req->priv_data[1], - sdev->prq_seq_number); + result = QI_RESP_SUCCESS; +invalid: + up_read(&svm->mm->mmap_sem); + mmput(svm->mm); +bad_req: + /* We get here in the error case where the PASID lookup failed, + and these can be NULL. Do not use them below this point! */ + sdev = NULL; + svm = NULL; +no_pasid: + if (req->lpig || req->priv_data_present) { + /* + * Per VT-d spec. v3.0 ch7.7, system software must + * respond with page group response if private data + * is present (PDP) or last page in group (LPIG) bit + * is set. This is an additional VT-d feature beyond + * PCI ATS spec. + */ + resp.qw0 = QI_PGRP_PASID(req->pasid) | + QI_PGRP_DID(req->rid) | + QI_PGRP_PASID_P(req->pasid_present) | + QI_PGRP_PDP(req->priv_data_present) | + QI_PGRP_RESP_CODE(result) | + QI_PGRP_RESP_TYPE; + resp.qw1 = QI_PGRP_IDX(req->prg_index) | + QI_PGRP_LPIG(req->lpig); + resp.qw2 = 0; + resp.qw3 = 0; + + if (req->priv_data_present) + memcpy(&resp.qw2, req->priv_data, + sizeof(req->priv_data)); + qi_submit_sync(iommu, &resp, 1, 0); + } + + if (start_ktime) + dmar_latency_update(iommu, DMAR_LATENCY_PRQ, + ktime_to_ns(ktime_get()) - start_ktime); prq_advance: head = (head + sizeof(*req)) & PRQ_RING_MASK; } diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 57cceecbe37f..7c0dfe29ff4a 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -15,9 +15,7 @@ #define SVM_REQ_PRIV (1<<0) /* Page Request Queue depth */ -#define PRQ_ORDER 2 -#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) -#define PRQ_DEPTH ((0x1000 << PRQ_ORDER) >> 5) +#define PRQ_RING_MASK ((0x1000 << prq_size_page_order) - 0x20) /* * The SVM_FLAG_SUPERVISOR_MODE flag requests a PASID which can be used only -- Gitee From b924b872356cf456147fce8ef0364f62a96553e3 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 20 Jul 2022 16:12:56 +0800 Subject: [PATCH 1494/2161] iommu: Resolve merge conflict for intel_svm_enable_prq() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. 
Resolve merge conflict for intel_svm_enable_prq(), Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 32 +++++++------------------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 65f7d9f89b76..20957132a1f7 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -85,7 +85,6 @@ svm_lookup_device_by_dev(struct intel_svm *svm, struct device *dev) int intel_svm_enable_prq(struct intel_iommu *iommu) { - struct iopf_queue *iopfq; struct page *pages; int irq, ret; @@ -102,20 +101,13 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n", iommu->name); ret = -EINVAL; - goto free_prq; + err: + free_pages((unsigned long)iommu->prq, prq_size_page_order); + iommu->prq = NULL; + return ret; } iommu->pr_irq = irq; - snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name), - "dmar%d-iopfq", iommu->seq_id); - iopfq = iopf_queue_alloc(iommu->iopfq_name); - if (!iopfq) { - pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name); - ret = -ENOMEM; - goto free_hwirq; - } - iommu->iopf_queue = iopfq; - snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id); ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT, @@ -123,7 +115,9 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) if (ret) { pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n", iommu->name); - goto free_iopfq; + dmar_free_hwirq(irq); + iommu->pr_irq = 0; + goto err; } dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); @@ -132,18 +126,6 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) init_completion(&iommu->prq_complete); return 0; - -free_iopfq: - iopf_queue_free(iommu->iopf_queue); - iommu->iopf_queue = NULL; -free_hwirq: - dmar_free_hwirq(irq); - iommu->pr_irq = 0; -free_prq: - free_pages((unsigned long)iommu->prq, PRQ_ORDER); - iommu->prq = NULL; - - return ret; } int intel_svm_finish_prq(struct intel_iommu *iommu) -- Gitee From cc2100d88cf036415d7d56efcc887934f638a002 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 12 Jul 2022 11:43:08 +0800 Subject: [PATCH 1495/2161] iommu: restore prq_event_thread() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. Restore prq_event_thread(). Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 52 ++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 20957132a1f7..4057b41744ea 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -909,6 +909,22 @@ struct page_req_dsc { u64 priv_data[2]; }; +static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req) +{ + unsigned long requested = 0; + + if (req->exe_req) + requested |= VM_EXEC; + + if (req->rd_req) + requested |= VM_READ; + + if (req->wr_req) + requested |= VM_WRITE; + + return (requested & ~vma->vm_flags) != 0; +} + static bool is_canonical_address(u64 addr) { int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); @@ -1120,17 +1136,23 @@ static irqreturn_t prq_event_thread(int irq, void *d) if (dmar_latency_enabled(iommu, DMAR_LATENCY_PRQ)) start_ktime = ktime_to_ns(ktime_get()); - /* - * Clear PPR bit before reading head/tail registers, to ensure that - * we get a new interrupt if needed. 
- */ + /* Clear PPR bit before reading head/tail registers, to + * ensure that we get a new interrupt if needed. */ writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG); tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; - handled = (head != tail); while (head != tail) { + struct vm_area_struct *vma; + struct page_req_dsc *req; + struct qi_desc resp; + int result; + vm_fault_t ret; + u64 address; + + handled = 1; req = &iommu->prq[head / sizeof(*req)]; + result = QI_RESP_INVALID; address = (u64)req->addr << VTD_PAGE_SHIFT; if (!req->pasid_present) { pr_err("%s: Page request without PASID: %08llx %08llx\n", @@ -1164,10 +1186,18 @@ static irqreturn_t prq_event_thread(int irq, void *d) } } - if (unlikely(!is_canonical_address(address))) { - pr_err("IOMMU: %s: Address is not canonical\n", - iommu->name); - goto bad_req; + if (!sdev || sdev->sid != req->rid) { + struct intel_svm_dev *t; + + sdev = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(t, &svm->devs, list) { + if (t->sid == req->rid) { + sdev = t; + break; + } + } + rcu_read_unlock(); } /* @@ -1198,7 +1228,8 @@ static irqreturn_t prq_event_thread(int irq, void *d) if (!vma || address < vma->vm_start) goto invalid; - sdev->prq_seq_number++; + if (access_error(vma, req)) + goto invalid; flags = FAULT_FLAG_USER | FAULT_FLAG_REMOTE; if (req->wr_req) @@ -1262,7 +1293,6 @@ static irqreturn_t prq_event_thread(int irq, void *d) head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; if (head == tail) { - iopf_queue_discard_partial(iommu->iopf_queue); writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared", iommu->name); -- Gitee From 432c30e03d614d8782ea2074b19c9ff1be98543d Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 1 Jun 2022 17:06:35 +0800 Subject: [PATCH 1496/2161] Revert "iommu/vt-d: A temporary patch to cover old guest" commit 5007f4ae730981ba6275d6c7532dec1e47d8dad1 Intel-BKC. This reverts commit ddd4b634f8eae89e55728abac02768c73b46069d. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 9ba3bddc1aa1..86d26a7c243e 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2180,11 +2180,6 @@ static inline void context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) { context->hi |= pasid & ((1 << 20) - 1); - /* TODO: rid_priv should be 1. Otherwise the PRQ w/o pasid generated. - * The root cause is that the guest PTE U/S bit is not set to 1. Ideally, - * the guest kernel should set it. - */ - context->hi |= (1 << 20); } /* -- Gitee From 7a5b414aa375af49e1149b554e2827f2bfe889aa Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Tue, 6 Jul 2021 20:19:26 +0800 Subject: [PATCH 1497/2161] ioasid: Fix unexpected ioasid_set free commit 77fc8a8430e8a58afeb0f9b392cf435ed74467f4 Intel-BKC. This patch fixes a problem in which ioasid_set is free when the last ioasid in it is free. I think the motivation of this atomatic ioasid_set free is based on the consideration when an ioasid_set_free() is called but there is still ioasid within it. To reclaim the ioasid_set later, this logic was added. 
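In sketch form, the lifecycle after this change (as implemented by the hunks
below) is:

    /*
     * ioasid_set_free() while IOASIDs are still allocated in the set:
     *     returns -EBUSY and marks set->free_pending = true.
     *
     * ioasid_free() of the last IOASID in that set:
     *     ioasid_do_free_locked() observes free_pending && nr_ioasids == 0
     *     and only then calls ioasid_set_free_locked() to release the set.
     */

A set is therefore no longer released implicitly just because it became empty;
an explicit, possibly still pending, free request is required.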
Cc: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid.c | 6 +++++- include/linux/ioasid.h | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 471bce096501..5fdbb2619cca 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -717,6 +717,8 @@ static int ioasid_set_free_locked(struct ioasid_set *set) if (atomic_read(&set->nr_ioasids)) { ret = -EBUSY; + set->free_pending = true; + pr_info("Set marked as free_pending, will be released when the last ioasid reclaimed!\n"); goto exit_done; } @@ -855,8 +857,10 @@ static void ioasid_do_free_locked(struct ioasid_data *data) ioasid_cg_uncharge(data->set); xa_erase(&data->set->xa, data->id); /* Destroy the set if empty */ - if (!atomic_read(&data->set->nr_ioasids)) + if (data->set->free_pending && !atomic_read(&data->set->nr_ioasids)) { + pr_info("%s free set set->id: %u\n", __func__, data->set->id); ioasid_set_free_locked(data->set); + } } static void ioasid_free_locked(struct ioasid_set *set, ioasid_t ioasid) diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index d6e0f20ef7d3..af37ea4bda85 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -41,6 +41,7 @@ struct ioasid_set { int quota; atomic_t nr_ioasids; int id; + bool free_pending; struct rcu_head rcu; }; -- Gitee From ef29da87cf1615ced1ccfcb983c911059637dea9 Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Date: Mon, 12 Jul 2021 15:13:15 +0800 Subject: [PATCH 1498/2161] iommu/vt-d: Global devTLB flush when present context entry changed commit 37764b952e1b39053defc7ebe5dcd8c4e3e78de9 upstream. This fixes a bug in context cache clear operation. The code was not following the correct invalidation flow. A global device TLB invalidation should be added after the IOTLB invalidation. At the same time, it uses the domain ID from the context entry. But in scalable mode, the domain ID is in PASID table entry, not context entry. Fixes: 7373a8cc38197 ("iommu/vt-d: Setup context and enable RID2PASID support") Cc: stable@vger.kernel.org # v5.0+ Signed-off-by: Sanjay Kumar Signed-off-by: Lu Baolu Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 86d26a7c243e..2bcfdc129a65 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4656,7 +4656,8 @@ static void __dmar_remove_one_dev_info(struct device_domain_info *info) PASID_RID2PASID, false); iommu_disable_dev_iotlb(info); - domain_context_clear(info); + if (!dev_is_real_dma_subdevice(info->dev)) + domain_context_clear(info); intel_pasid_free_table(info->dev); } -- Gitee From 0f09f7a87829b769eccbd59697f89ac0f796d4a3 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 7 Jul 2022 16:16:02 +0800 Subject: [PATCH 1499/2161] iommu: introduce flush_iotlb_all() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. Introduce flush_iotlb_all(). 
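The helper folds the PASID-cache, IOTLB and device-TLB invalidations into a
single call; the intended call pattern, as used by the following patches in
this series, is roughly:

    /* FL-only and pass-through entries: PASID-selective PIOTLB flush. */
    flush_iotlb_all(iommu, dev, did, pasid, 0);

    /* SL-only and nested entries: domain-selective IOTLB flush. */
    flush_iotlb_all(iommu, dev, did, pasid, DMA_TLB_DSI_FLUSH);

In both cases the PASID cache is invalidated first, and the device TLB is only
flushed when the IOMMU does not report caching mode.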
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 899a570eafd0..ce71ba236fc8 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -461,6 +461,21 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu, qi_flush_dev_iotlb_pasid(iommu, sid, pfsid, pasid, qdep, 0, 64 - VTD_PAGE_SHIFT); } +static void +flush_iotlb_all(struct intel_iommu *iommu, struct device *dev, + u16 did, u32 pasid, u64 type) +{ + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + + if (type) + iommu->flush.flush_iotlb(iommu, did, 0, 0, type); + else + qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); + + if (!cap_caching_mode(iommu->cap)) + devtlb_invalidation_with_pasid(iommu, dev, pasid); +} + void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool fault_ignore) { -- Gitee From 21cdbba6237db74ec0614b63b3a589f5d816efad Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Date: Wed, 11 Aug 2021 03:22:01 -0700 Subject: [PATCH 1500/2161] Fix incorrect invalidation when tearing down a pasid table entry. commit 9a4130d8d00ae2c2223983827d2e0485ffa7f051 Intel-BKC. Use domain selective invalidation for SL-Only and Nested PGTT types. Use pasid-selective invalidation for FL-Only and Pass-Through PGTT types. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index ce71ba236fc8..5b67f1c1f114 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -480,7 +480,9 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool fault_ignore) { struct pasid_entry *pte; - u16 did, pgtt; + u16 did; + u64 pe_val; + u16 pgtt_type; pte = intel_pasid_get_entry(dev, pasid); if (WARN_ON(!pte)) @@ -490,21 +492,20 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, return; did = pasid_get_domain_id(pte); - intel_pasid_clear_entry(iommu, dev, pasid, fault_ignore); + + pe_val = READ_ONCE(pte->val[0]); + pgtt_type = (pe_val >> 6) & 0x7; + + intel_pasid_clear_entry(iommu, dev, pasid, fault_ignore, keep_pte); if (!ecap_coherent(iommu->ecap)) clflush_cache_range(pte, sizeof(*pte)); - pasid_cache_invalidation_with_pasid(iommu, did, pasid); - - if (pgtt == PASID_ENTRY_PGTT_PT || pgtt == PASID_ENTRY_PGTT_FL_ONLY) - qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); + if (pgtt_type == PASID_ENTRY_PGTT_FL_ONLY || + pgtt_type == PASID_ENTRY_PGTT_PT) + flush_iotlb_all(iommu, dev, did, pasid, 0); else - iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); - - /* Device IOTLB doesn't need to be flushed in caching mode. */ - if (!cap_caching_mode(iommu->cap)) - devtlb_invalidation_with_pasid(iommu, dev, pasid); + flush_iotlb_all(iommu, dev, did, pasid, DMA_TLB_DSI_FLUSH); } /* -- Gitee From 845b3785b2f8e458aad5005c326032f747543dde Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 15 Jul 2022 11:33:13 +0800 Subject: [PATCH 1501/2161] iommu: Resolve merge conflicts for intel_pasid_tear_down_entry() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. 
Resolve merge conflicts for intel_pasid_tear_down_entry(), Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 6 +++--- drivers/iommu/intel/pasid.c | 4 ++-- drivers/iommu/intel/pasid.h | 2 +- drivers/iommu/intel/svm.c | 11 ++++++----- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2bcfdc129a65..b4a414565183 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4650,10 +4650,10 @@ static void __dmar_remove_one_dev_info(struct device_domain_info *info) iommu = info->iommu; domain = info->domain; - if (info->dev && !dev_is_real_dma_subdevice(info->dev)) { + if (info->dev) { if (dev_is_pci(info->dev) && sm_supported(iommu)) intel_pasid_tear_down_entry(iommu, info->dev, - PASID_RID2PASID, false); + PASID_RID2PASID, false, false); iommu_disable_dev_iotlb(info); if (!dev_is_real_dma_subdevice(info->dev)) @@ -4912,7 +4912,7 @@ static void aux_domain_remove_dev(struct dmar_domain *domain, if (!auxiliary_unlink_device(domain, dev)) { spin_lock(&iommu->lock); intel_pasid_tear_down_entry(iommu, dev, - domain->default_pasid, false); + domain->default_pasid, false, false); domain_detach_iommu(domain, iommu); spin_unlock(&iommu->lock); } diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 5b67f1c1f114..4556296fdd5f 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -477,7 +477,7 @@ flush_iotlb_all(struct intel_iommu *iommu, struct device *dev, } void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, - u32 pasid, bool fault_ignore) + u32 pasid, bool fault_ignore, bool keep_pte) { struct pasid_entry *pte; u16 did; @@ -488,7 +488,7 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, if (WARN_ON(!pte)) return; - if (!pasid_pte_is_present(pte)) + if (!(pte->val[0] & PASID_PTE_PRESENT)) return; did = pasid_get_domain_id(pte); diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index b8677047de43..4b6574fcfb86 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -127,7 +127,7 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct dmar_domain *domain, int addr_width); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, - bool fault_ignore); + bool fault_ignore, bool keep_pte); int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid); void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid); #endif /* __INTEL_PASID_H */ diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 4057b41744ea..d58d8196f916 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -199,7 +199,7 @@ static void intel_svm_free_async_fn(struct work_struct *work) list_del_rcu(&sdev->list); spin_lock(&sdev->iommu->lock); intel_pasid_tear_down_entry(sdev->iommu, sdev->dev, - svm->pasid, true); + svm->pasid, true, false); intel_svm_drain_prq(sdev->dev, svm->pasid); spin_unlock(&sdev->iommu->lock); kfree_rcu(sdev, rcu); @@ -347,7 +347,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) rcu_read_lock(); list_for_each_entry_rcu(sdev, &svm->devs, list) intel_pasid_tear_down_entry(sdev->iommu, sdev->dev, - svm->pasid, true); + svm->pasid, true, false); rcu_read_unlock(); } @@ -581,6 +581,7 @@ int intel_svm_unbind_gpasid(struct iommu_domain *domain, int ret; struct dmar_domain *dmar_domain; struct ioasid_set *pasid_set; + bool keep_pte = false; if 
(WARN_ON(!iommu)) return -EINVAL; @@ -594,6 +595,7 @@ int intel_svm_unbind_gpasid(struct iommu_domain *domain, return ret; pasid = ret; pasid_set = host_pasid_set; + keep_pte = true; } mutex_lock(&pasid_mutex); @@ -606,9 +608,8 @@ int intel_svm_unbind_gpasid(struct iommu_domain *domain, sdev->users--; if (!sdev->users) { list_del_rcu(&sdev->list); - iommu->flags |= VTD_FLAG_PGTT_SL_ONLY; intel_pasid_tear_down_entry(iommu, dev, - svm->pasid, false); + svm->pasid, false, keep_pte); intel_svm_drain_prq(dev, svm->pasid); /* * Partial assignment needs to delete fault data @@ -855,7 +856,7 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) * large and has to be physically contiguous. So it's * hard to be as defensive as we might like. */ intel_pasid_tear_down_entry(iommu, dev, - svm->pasid, false); + svm->pasid, false, false); intel_svm_drain_prq(dev, svm->pasid); kfree_rcu(sdev, rcu); -- Gitee From e283755ca292210a135f8caf5eee334f5ed7b150 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 19 Jul 2022 11:00:26 +0800 Subject: [PATCH 1502/2161] iommu: Resolve merge conflict for intel_pasid_clear_entry() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. Resolve merge conflict for intel_pasid_clear_entry(), Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 4556296fdd5f..7fd83a7d7e97 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -247,7 +247,7 @@ static inline void pasid_clear_entry_with_fpd(struct pasid_entry *pe) static void intel_pasid_clear_entry(struct intel_iommu *iommu, struct device *dev, - u32 pasid, bool fault_ignore) + u32 pasid, bool fault_ignore, bool keep_pte) { struct pasid_entry *pe; u64 pe_val; @@ -264,10 +264,9 @@ intel_pasid_clear_entry(struct intel_iommu *iommu, struct device *dev, */ pe_val = READ_ONCE(pe->val[0]); nested = (((pe_val >> 6) & 0x7) == PASID_ENTRY_PGTT_NESTED) ? true : false; - if (nested && (iommu->flags & VTD_FLAG_PGTT_SL_ONLY)) { + if (nested && keep_pte) { pe_val &= 0xfffffffffffffebf; WRITE_ONCE(pe->val[0], pe_val); - iommu->flags &= ~VTD_FLAG_PGTT_SL_ONLY; return; } -- Gitee From c5cd3db9017081d451fa74ace218ce46f6421166 Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Date: Wed, 11 Aug 2021 03:24:12 -0700 Subject: [PATCH 1503/2161] Fix missing invalidation when changing a present SL-only PGTT type to a Nested type. commit fe02987055265674c417442dfd12f2a073119b4a Intel-BKC. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 7fd83a7d7e97..dfdbb2bd0757 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -800,6 +800,7 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, u64 pgd_val; int agaw; u16 did; + bool pasid_present; if (!ecap_nest(iommu->ecap)) { pr_err_ratelimited("IOMMU: %s: No nested translation support\n", @@ -822,11 +823,18 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, * multiple times. If caller tries to setup nesting for a PASID * entry which is already nested mode, should fail it. 
*/ - if (pasid_pte_is_present(pte) && pasid_pte_is_nested(pte)) + pasid_present = pasid_pte_is_present(pte); + + if (pasid_present && pasid_pte_is_nested(pte)) return -EBUSY; pasid_clear_entry(pte); + did = domain->iommu_did[iommu->seq_id]; + + if (pasid_present) + flush_iotlb_all(iommu, dev, did, pasid, DMA_TLB_DSI_FLUSH); + /* Sanity checking performed by caller to make sure address * width matching in two dimensions: * 1. CPU vs. IOMMU @@ -879,7 +887,6 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, pasid_set_slptr(pte, pgd_val); pasid_set_fault_enable(pte); - did = domain->iommu_did[iommu->seq_id]; pasid_set_domain_id(pte, did); pasid_set_address_width(pte, agaw); -- Gitee From 72e50545652e871a6bc008d36b76506d1f499c68 Mon Sep 17 00:00:00 2001 From: Kumar Sanjay K Date: Mon, 11 Jan 2021 01:18:17 +0800 Subject: [PATCH 1504/2161] iommu/vt-d: Add an option force using SL for IOVA commit 2441627644e0fb621a5b5c2ada684158fd324f26 Intel-BKC. change IOVA mapping from FL to SL via an intel_iommu=iova_sl command line parameter. Signed-off-by: Kumar Sanjay K Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index b4a414565183..73eb40b2d4b0 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -368,6 +368,9 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); +/* == 0 --> use FL for IOVA (default), != 0 --> use SL for IOVA */ +static int default_iova = 0; + int intel_iommu_enabled = 0; EXPORT_SYMBOL_GPL(intel_iommu_enabled); @@ -476,6 +479,12 @@ static int __init intel_iommu_setup(char *str) } else if (!strncmp(str, "tboot_noforce", 13)) { pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); intel_iommu_tboot_noforce = 1; + } else if (!strncmp(str, "qi_done_no_cpu_relax", 19)) { + pr_info("Intel-IOMMU: no cpu_relax() in qi\n"); + qi_done_no_cpu_relax = 1; + } else if (!strncmp(str, "iova_sl", 7)) { + pr_info("Intel-IOMMU: default SL IOVA enabled\n"); + default_iova = 1; } else { pr_notice("Unknown option - '%s'\n", str); } @@ -2040,6 +2049,10 @@ static void free_dmar_iommu(struct intel_iommu *iommu) */ static bool first_level_by_default(unsigned int type) { + /* Change IOVA mapping to SL instead of FL */ + if (default_iova) + return false; + /* Only SL is available in legacy mode */ if (!scalable_mode_support()) return false; -- Gitee From d46bf49dccdddcbfaea2c5a5bcf6f22591255fa9 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 13 Jul 2022 20:39:41 +0800 Subject: [PATCH 1505/2161] iommu: introduce qi_done_no_cpu_relax commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. Introduce qi_done_no_cpu_relax. 
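
The wait-loop change this patch makes in qi_submit_sync() is easy to see in isolation. Below is an illustrative standalone sketch, not driver code; q_lock()/q_unlock()/relax() and qi_done() are stand-ins for the raw spinlock, cpu_relax() and the completion check in the hunk that follows.

#include <stdbool.h>
#include <stdio.h>

static bool qi_done_no_cpu_relax;	/* set from the command line in the real driver */

static void q_lock(void)   { }	/* stand-in for raw_spin_lock(&qi->q_lock) */
static void q_unlock(void) { }	/* stand-in for raw_spin_unlock(&qi->q_lock) */
static void relax(void)    { }	/* stand-in for cpu_relax() */

static int polls_left = 3;
static bool qi_done(void)
{
	return --polls_left <= 0;	/* pretend the descriptor completes after a few polls */
}

static void wait_for_completion(void)
{
	q_lock();
	while (!qi_done()) {
		if (!qi_done_no_cpu_relax) {
			/* Default behaviour: drop the lock and back off between polls. */
			q_unlock();
			relax();
			q_lock();
		}
		/* With the option set, keep spinning with the lock held. */
	}
	q_unlock();
}

int main(void)
{
	wait_for_completion();
	puts("descriptor completed");
	return 0;
}
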
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 10 +++++++--- include/linux/intel-iommu.h | 2 ++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 4322fc5d99fc..333aa51ee309 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -68,6 +68,8 @@ static void free_iommu(struct intel_iommu *iommu); extern const struct iommu_ops intel_iommu_ops; +int qi_done_no_cpu_relax __read_mostly; + static void dmar_register_drhd_unit(struct dmar_drhd_unit *drhd) { /* @@ -1435,9 +1437,11 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, if (rc) break; - raw_spin_unlock(&qi->q_lock); - cpu_relax(); - raw_spin_lock(&qi->q_lock); + if (!qi_done_no_cpu_relax) { + raw_spin_unlock(&qi->q_lock); + cpu_relax(); + raw_spin_lock(&qi->q_lock); + } } for (i = 0; i < count; i++) diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index bef216e95f06..8b8ca219d799 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -810,6 +810,8 @@ struct intel_svm { static inline void intel_svm_check(struct intel_iommu *iommu) {} #endif +extern int qi_done_no_cpu_relax; + #ifdef CONFIG_INTEL_IOMMU_DEBUGFS void intel_iommu_debugfs_init(void); #else -- Gitee From 57026040ac272fb434c6afa221f7f4acb0bd0b05 Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Date: Thu, 22 Jul 2021 21:11:11 +0800 Subject: [PATCH 1506/2161] iommu/vt-d: Pass the correct domain to intel_pasid_setup_slade() commit aacaa3cbead439ab4ee1844642594ed94bebafe9 Intel-BKC. Signed-off-by: Sanjay Kumar Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 236 ++++++++++++++++++++++++++++++++++++ drivers/iommu/intel/pasid.c | 37 ++++++ drivers/iommu/intel/pasid.h | 2 + 3 files changed, 275 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 73eb40b2d4b0..89f5a919cf0e 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5937,6 +5937,242 @@ static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, } } +static int __setup_slade(struct iommu_domain *domain, + struct device_domain_info *info, bool enable) +{ + u32 pasid; + int ret = 0; + + pasid = domain_get_pasid(domain, info->dev); + + spin_lock(&info->iommu->lock); + ret = intel_pasid_setup_slade(info->dev, to_dmar_domain(domain), pasid, enable); + spin_unlock(&info->iommu->lock); + + return ret; +} + +static int +__domain_clear_dirty_log(struct dmar_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift); + +static int intel_iommu_set_hwdbm(struct iommu_domain *domain, bool enable, + unsigned long iova, size_t size) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct device_domain_info *info; + struct subdev_domain_info *sinfo; + unsigned long flags; + int ret = 0; + + if (domain_use_first_level(dmar_domain)) { + /* FL supports A/D bits by default. */ + /* TODO: shall we clear FLT existing D bits? 
*/ + if (enable) + __domain_clear_dirty_log(dmar_domain, iova, size, NULL, + iova, VTD_PAGE_SHIFT); + return 0; + } + + if (!slad_support()) { + pr_err("Don't support SLAD\n"); + return -EINVAL; + } + + spin_lock_irqsave(&device_domain_lock, flags); + list_for_each_entry(info, &dmar_domain->devices, link) { + ret = __setup_slade(domain, info, enable); + if (ret) + goto out; + } + + list_for_each_entry(sinfo, &dmar_domain->subdevices, link_domain) { + info = get_domain_info(sinfo->pdev); + ret = __setup_slade(domain, info, enable); + if (ret) + break; + } + +out: + spin_unlock_irqrestore(&device_domain_lock, flags); + + return ret; +} + +static int +__domain_sync_dirty_log(struct dmar_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift) +{ + struct dma_pte *pte = NULL; + unsigned long nr_pages = size >> VTD_PAGE_SHIFT; + unsigned long lvl_pages = 0; + unsigned long iov_pfn = iova >> VTD_PAGE_SHIFT; + unsigned long offset; + unsigned int largepage_lvl = 0; + unsigned int nbits; + + if (bitmap_pgshift != VTD_PAGE_SHIFT) + return -EINVAL; + + while (nr_pages > 0) { + largepage_lvl = hardware_largepage_caps(domain, iov_pfn, + 0, nr_pages); + + pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); + if (!pte || !dma_pte_present(pte)) + return -EINVAL; + + lvl_pages = lvl_to_nr_pages(largepage_lvl); + BUG_ON(nr_pages < lvl_pages); + + if (!(pte->val & DMA_PTE_WRITE)) { + pr_warn("The 0x%lx pte is READ ONLY.\n", iova); + goto skip; + } + + if (!domain_use_first_level(domain)) { + if (!(pte->val & BIT(9))) { + pr_debug("SL: The 0x%lx pte is not dirty.\n", iova); + goto skip; + } + } else { + if (!(pte->val & BIT(6))) { + pr_debug("FL: The 0x%lx pte is not dirty.\n", iova); + goto skip; + } + } + + if (bitmap) { + nbits = lvl_pages; + offset = (iova - base_iova) >> bitmap_pgshift; + bitmap_set(bitmap, offset, nbits); + } + +skip: + nr_pages -= lvl_pages; + iov_pfn += lvl_pages; + iova += lvl_pages * VTD_PAGE_SHIFT; + } + + return 0; +} + +/* + * Make sure the quering iova has been mapped and is being used. Otherwise, + * there may be fatal error if another thread free the visiting pages. 
+ */ +static int intel_iommu_sync_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + if (!domain_use_first_level(dmar_domain) && !slad_support()) { + pr_err("Don't support SLAD\n"); + return -EINVAL; + } + + __domain_sync_dirty_log(dmar_domain, iova, size, bitmap, + base_iova, bitmap_pgshift); + + return 0; +} + +static int +__domain_clear_dirty_log(struct dmar_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift) +{ + struct dma_pte *pte = NULL; + unsigned long nr_pages = size >> VTD_PAGE_SHIFT; + unsigned long lvl_pages = 0; + unsigned long iov_pfn = iova >> VTD_PAGE_SHIFT; + unsigned long offset; + unsigned int largepage_lvl = 0; + unsigned int nbits; + int iommu_id, i; + + if (bitmap_pgshift != VTD_PAGE_SHIFT) + return -EINVAL; + + while (nr_pages > 0) { + largepage_lvl = hardware_largepage_caps(domain, iov_pfn, + 0, nr_pages); + + pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); + if (!pte || !dma_pte_present(pte)) + return -EINVAL; + + lvl_pages = lvl_to_nr_pages(largepage_lvl); + BUG_ON(nr_pages < lvl_pages); + + if (!(pte->val & DMA_PTE_WRITE)) { + pr_warn("The 0x%lx pte is READ ONLY.\n", iova); + goto skip; + } + + /* Ensure all corresponding bits are set */ + if (bitmap) { + nbits = lvl_pages; + offset = (iova - base_iova) >> bitmap_pgshift; + for (i = offset; i < offset + nbits; i++) { + if (!test_bit(i, bitmap)) + goto skip; + } + } + + if (!domain_use_first_level(domain)) + test_and_clear_bit(9, (unsigned long *)&pte->val); + else + test_and_clear_bit(6, (unsigned long *)&pte->val); + + for_each_domain_iommu(iommu_id, domain) + iommu_flush_iotlb_psi(g_iommus[iommu_id], domain, + iov_pfn, lvl_pages, 1, 0); + +skip: + nr_pages -= lvl_pages; + iov_pfn += lvl_pages; + iova += lvl_pages * VTD_PAGE_SHIFT; + } + + return 0; +} + +/* + * Make sure the clearing iova has been mapped and is being used. Otherwise, + * there may be fatal error if another thread free the visiting pages. + */ +static int intel_iommu_clear_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + int ret = 0; + + if (!domain_use_first_level(dmar_domain) && !slad_support()) { + pr_err("Don't support SLAD\n"); + return -EINVAL; + } + + ret = __domain_clear_dirty_log(dmar_domain, iova, size, bitmap, + base_iova, bitmap_pgshift); + + return ret; +} + const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .domain_alloc = intel_iommu_domain_alloc, diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index dfdbb2bd0757..cb8e041f39f5 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -898,3 +898,40 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, return ret; } + +/** + * intel_pasid_setup_slade() - Set up Second Level Access/Dirty bit Enable + * field in PASID entry for scalable mode pasid table. 
+ * + * @dev: Device to be set up for translation + * @domain: Domain info for setting up slad enabling + * @pasid: PASID to be programmed in the device PASID table + * @value: Value set to the entry + */ +int intel_pasid_setup_slade(struct device *dev, struct dmar_domain *domain, + u32 pasid, bool value) +{ + struct device_domain_info *info = get_domain_info(dev); + struct intel_iommu *iommu; + struct pasid_entry *pte; + u16 did; + + if (!info || !info->iommu) + return -ENODEV; + + iommu = info->iommu; + did = domain->iommu_did[iommu->seq_id]; + + pte = intel_pasid_get_entry(dev, pasid); + if (WARN_ON(!pte)) + return -ENODEV; + + if (!pasid_pte_is_present(pte)) + return -EINVAL; + + pasid_set_slade(pte, value); + + flush_iotlb_all(iommu, dev, did, pasid, DMA_TLB_DSI_FLUSH); + + return 0; +} diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 4b6574fcfb86..3cc8757074c9 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -130,4 +130,6 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, bool fault_ignore, bool keep_pte); int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid); void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid); +int intel_pasid_setup_slade(struct device *dev, struct dmar_domain *domain, + u32 pasid, bool value); #endif /* __INTEL_PASID_H */ -- Gitee From eebc461bf4cd7369bdf8fae88af7f1963c8d5fc7 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 18 Jul 2022 17:39:25 +0800 Subject: [PATCH 1507/2161] iommu: Resolve merge conflicts for slad_support() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. Resolve merge conflicts for slad_support(), Remove check for pasid-pte in intel_pasid_setup_pass_through(). Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/cap_audit.c | 6 ++++++ drivers/iommu/intel/cap_audit.h | 6 ++++++ drivers/iommu/intel/pasid.c | 18 ++++++++++++++---- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/cap_audit.c b/drivers/iommu/intel/cap_audit.c index b39d223926a4..eace338c7dba 100644 --- a/drivers/iommu/intel/cap_audit.c +++ b/drivers/iommu/intel/cap_audit.c @@ -212,6 +212,12 @@ bool intel_cap_flts_sanity(void) return ecap_flts(intel_iommu_ecap_sanity); } +bool intel_cap_slad_sanity(void) +{ + return ecap_slts(intel_iommu_ecap_sanity) && + ecap_slads(intel_iommu_ecap_sanity); +} + bool intel_cap_slts_sanity(void) { return ecap_slts(intel_iommu_ecap_sanity); diff --git a/drivers/iommu/intel/cap_audit.h b/drivers/iommu/intel/cap_audit.h index 86f2c8e907f7..66c59d7e713c 100644 --- a/drivers/iommu/intel/cap_audit.h +++ b/drivers/iommu/intel/cap_audit.h @@ -118,6 +118,7 @@ bool intel_cap_smts_sanity(void); bool intel_cap_pasid_sanity(void); bool intel_cap_nest_sanity(void); bool intel_cap_flts_sanity(void); +bool intel_cap_slad_sanity(void); bool intel_cap_slts_sanity(void); static inline bool scalable_mode_support(void) @@ -135,4 +136,9 @@ static inline bool nested_mode_support(void) return scalable_mode_support() && intel_cap_nest_sanity(); } +static inline bool slad_support(void) +{ + return scalable_mode_support() && intel_cap_slad_sanity(); +} + int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index cb8e041f39f5..7d722a254c1f 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -20,6 +20,7 @@ #include #include "pasid.h" +#include "cap_audit.h" /* * Intel IOMMU system wide PASID name space: @@ -418,6 
+419,15 @@ pasid_set_eafe(struct pasid_entry *pe) pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7); } +/* + * Setup Second Level Access/Dirty bit Enable field (Bit 9) of a + * scalable mode PASID entry. + */ +static inline void pasid_set_slade(struct pasid_entry *pe, bool value) +{ + pasid_set_bits(&pe->val[0], 1 << 9, value << 9); +} + static void pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid) @@ -684,6 +694,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, */ if (pasid != PASID_RID2PASID) pasid_set_sre(pte); + if (slad_support()) + pasid_set_slade(pte, true); pasid_set_present(pte); pasid_flush_caches(iommu, pte, pasid, did); @@ -706,10 +718,6 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return -ENODEV; } - /* Caller must ensure PASID entry is not in use. */ - if (pasid_pte_is_present(pte)) - return -EBUSY; - pasid_clear_entry(pte); pasid_set_domain_id(pte, did); pasid_set_address_width(pte, iommu->agaw); @@ -893,6 +901,8 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); + if (slad_support()) + pasid_set_slade(pte, true); pasid_set_present(pte); pasid_flush_caches(iommu, pte, pasid, did); -- Gitee From e3bb7a6dc33cb6c091d532f6e31278f196a03d8c Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Date: Thu, 22 Jul 2021 21:15:31 +0800 Subject: [PATCH 1508/2161] iommu/vt-d: Fix bugs in __domain_sync_dirty_log() commit a53b8312555edab93379ec995763b891a1932806 Intel-BKC. Signed-off-by: Sanjay Kumar Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 89f5a919cf0e..b767b4351ed9 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -6021,8 +6021,7 @@ __domain_sync_dirty_log(struct dmar_domain *domain, return -EINVAL; while (nr_pages > 0) { - largepage_lvl = hardware_largepage_caps(domain, iov_pfn, - 0, nr_pages); + largepage_lvl = 0; pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); if (!pte || !dma_pte_present(pte)) @@ -6032,7 +6031,7 @@ __domain_sync_dirty_log(struct dmar_domain *domain, BUG_ON(nr_pages < lvl_pages); if (!(pte->val & DMA_PTE_WRITE)) { - pr_warn("The 0x%lx pte is READ ONLY.\n", iova); + pr_debug("The 0x%lx pte is READ ONLY.\n", iova); goto skip; } @@ -6057,7 +6056,7 @@ __domain_sync_dirty_log(struct dmar_domain *domain, skip: nr_pages -= lvl_pages; iov_pfn += lvl_pages; - iova += lvl_pages * VTD_PAGE_SHIFT; + iova += lvl_pages * VTD_PAGE_SIZE; } return 0; -- Gitee From 12f07e6b8dbfc546a16db9d8c8c49e00ee5a8ed9 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Thu, 22 Jul 2021 22:29:36 +0800 Subject: [PATCH 1509/2161] iommu/vt-d: Fix an issue in intel_svm_free_async_fn() commit 724dca3a9c8962bb66d60fa19bc529d18dabc907 Intel-BKC. For subdevices, intel_svm_free_async_fn() should be the same cleanup work as intel_sva_unbind_gpasid(). Deleting the per-pasid fault_data is part of it. Without this, the next time the same pasid was used, it will report failure when trying to bind it to a fault_data. 
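
One of the fixes in the __domain_sync_dirty_log() hunk above is purely arithmetic: the walk advanced the IOVA cursor by lvl_pages * VTD_PAGE_SHIFT (a multiply by 12) rather than lvl_pages * VTD_PAGE_SIZE (a multiply by 4096), so the cursor fell far behind the page-frame counter. A tiny standalone check, illustrative only and using made-up values, shows the difference for one 2 MiB large-page step.

#include <stdint.h>
#include <stdio.h>

#define VTD_PAGE_SHIFT 12
#define VTD_PAGE_SIZE  (1ull << VTD_PAGE_SHIFT)

int main(void)
{
	uint64_t iova = 0x100000;	/* hypothetical starting IOVA */
	uint64_t lvl_pages = 512;	/* one 2 MiB large-page step */

	printf("buggy step:   %#llx\n",
	       (unsigned long long)(iova + lvl_pages * VTD_PAGE_SHIFT));	/* 0x101800 */
	printf("correct step: %#llx\n",
	       (unsigned long long)(iova + lvl_pages * VTD_PAGE_SIZE));	/* 0x300000 */
	return 0;
}
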
Signed-off-by: Liu Yi L Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 7 +++++++ include/linux/intel-iommu.h | 1 + 2 files changed, 8 insertions(+) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index d58d8196f916..354d7e6c55e9 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -202,6 +202,11 @@ static void intel_svm_free_async_fn(struct work_struct *work) svm->pasid, true, false); intel_svm_drain_prq(sdev->dev, svm->pasid); spin_unlock(&sdev->iommu->lock); + /* + * Partial assignment needs to delete fault data + */ + if (is_aux_domain(sdev->dev, &sdev->domain->domain)) + iommu_delete_device_fault_data(sdev->dev, svm->pasid); kfree_rcu(sdev, rcu); } /* @@ -519,6 +524,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, sdev->dev = dev; sdev->sid = PCI_DEVID(info->bus, info->devfn); sdev->iommu = iommu; + sdev->domain = dmar_domain; /* Only count users if device has aux domains */ if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)) @@ -736,6 +742,7 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, sdev->qdep = 0; } + sdev->domain = info->domain; /* Finish the setup now we know we're keeping it */ sdev->users = 1; init_rcu_head(&sdev->rcu); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 8b8ca219d799..558993a08b26 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -786,6 +786,7 @@ struct intel_svm_dev { struct rcu_head rcu; struct device *dev; struct intel_iommu *iommu; + struct dmar_domain *domain; struct iommu_sva sva; unsigned long prq_seq_number; u32 pasid; -- Gitee From 9f981ed504baed09f82e81d883cdeee18dce0a59 Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Date: Tue, 27 Jul 2021 15:09:43 -0700 Subject: [PATCH 1510/2161] VT-d/SLAD: Fix a bug in the loop iterator commit cc62e7a4c0d4bc57887c5403932d2c41fb044844 Intel-BKC. VT-d/SLAD: Fix a bug in the loop iterator during clearing the SL A/D bits. Also move the invalidation call out of the loop to reduce number of IOTLB invalidations. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index b767b4351ed9..2fe9da12b525 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -6100,6 +6100,8 @@ __domain_clear_dirty_log(struct dmar_domain *domain, unsigned int largepage_lvl = 0; unsigned int nbits; int iommu_id, i; + unsigned long start_pfn = iov_pfn; + bool cleared = false; if (bitmap_pgshift != VTD_PAGE_SHIFT) return -EINVAL; @@ -6135,16 +6137,18 @@ __domain_clear_dirty_log(struct dmar_domain *domain, else test_and_clear_bit(6, (unsigned long *)&pte->val); - for_each_domain_iommu(iommu_id, domain) - iommu_flush_iotlb_psi(g_iommus[iommu_id], domain, - iov_pfn, lvl_pages, 1, 0); - + cleared = true; skip: nr_pages -= lvl_pages; iov_pfn += lvl_pages; - iova += lvl_pages * VTD_PAGE_SHIFT; + iova += lvl_pages * VTD_PAGE_SIZE; } + if (cleared) + for_each_domain_iommu(iommu_id, domain) + iommu_flush_iotlb_psi(g_iommus[iommu_id], domain, + start_pfn, size >> VTD_PAGE_SHIFT, 1, 0); + return 0; } -- Gitee From 1bdd4a0ad84c0237329493c74998861318044157 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 19 Jul 2022 14:38:17 +0800 Subject: [PATCH 1511/2161] iommu: Remove intel_iommu_enable{disable}_sva() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. 
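
The loop-iterator fix above also batches the IOTLB invalidation: rather than flushing inside the walk, the code records that something was cleared and issues one flush for the whole range afterwards. A minimal standalone sketch of that pattern, illustrative only, with flush_range() as a stand-in for iommu_flush_iotlb_psi() and a plain boolean array standing in for the second-level dirty bits:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static void flush_range(size_t start, size_t npages)
{
	printf("flush [%zu, %zu)\n", start, start + npages);
}

static void clear_dirty(bool *dirty, size_t start, size_t npages)
{
	bool cleared = false;
	size_t i;

	for (i = start; i < start + npages; i++) {
		if (dirty[i]) {
			dirty[i] = false;
			cleared = true;	/* defer the flush to the end of the walk */
		}
	}

	if (cleared)
		flush_range(start, npages);	/* one invalidation for the whole range */
}

int main(void)
{
	bool dirty[8] = { false, true, false, true, false, false, true, false };

	clear_dirty(dirty, 0, 8);
	return 0;
}
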
Remove now unused intel_iommu_enable_sva() and intel_iommu_disable_sva(). Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 42 ------------------------------------- 1 file changed, 42 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2fe9da12b525..41a7eff3c767 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5574,48 +5574,6 @@ static int intel_iommu_disable_auxd(struct device *dev) return 0; } -static int intel_iommu_enable_sva(struct device *dev) -{ - struct device_domain_info *info = get_domain_info(dev); - struct intel_iommu *iommu; - int ret; - - if (!info || dmar_disabled) - return -EINVAL; - - iommu = info->iommu; - if (!iommu) - return -EINVAL; - - if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) - return -ENODEV; - - if (intel_iommu_enable_pasid(iommu, dev)) - return -ENODEV; - - if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled) - return -EINVAL; - - ret = iopf_queue_add_device(iommu->iopf_queue, dev); - if (!ret) - ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); - - return ret; -} - -static int intel_iommu_disable_sva(struct device *dev) -{ - struct device_domain_info *info = get_domain_info(dev); - struct intel_iommu *iommu = info->iommu; - int ret; - - ret = iommu_unregister_device_fault_handler(dev); - if (!ret) - ret = iopf_queue_remove_device(iommu->iopf_queue, dev); - - return ret; -} - /* * A PCI express designated vendor specific extended capability is defined * in the section 3.7 of Intel scalable I/O virtualization technical spec -- Gitee From 58982acf8fa4e882d589f4b225ea7a3ceb295db3 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 2 Jun 2022 11:54:46 +0800 Subject: [PATCH 1512/2161] Revert "iommu/vt-d: Disable SVM when ATS/PRI/PASID are not enabled in the device" commit 6d392710e87f61b34280c97454b84dfb90b41633 Intel-BKC. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 41a7eff3c767..67699887585c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5642,11 +5642,8 @@ intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) case IOMMU_DEV_FEAT_IOPF: return intel_iommu_dev_has_feat(dev, feat) ? 0 : -ENODEV; - case IOMMU_DEV_FEAT_SVA: - return intel_iommu_enable_sva(dev); - - default: - return -ENODEV; + if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) + return 0; } } -- Gitee From fe55b268be4eb12ae2bb74a7468976b89ed88416 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 14 Jul 2022 17:31:59 +0800 Subject: [PATCH 1513/2161] iommu: Resolve merge conflict for intel_iommu_dev_enable_feat() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. 
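
For orientation, the enable-feature dispatch as it reads after the conflict resolution below can be modeled outside the kernel. This is an illustrative stand-in, not the driver function: the dev_info fields stand in for get_domain_info() and the VTD_FLAG_SVM_CAPABLE flag.

#include <stdbool.h>
#include <errno.h>
#include <stdio.h>

enum dev_feature { FEAT_AUX, FEAT_SVA, FEAT_HWDBM };

struct dev_info {
	bool present;		/* stand-in for get_domain_info(dev) != NULL */
	bool svm_capable;	/* stand-in for VTD_FLAG_SVM_CAPABLE */
};

static int enable_aux(struct dev_info *info) { (void)info; return 0; }

static int dev_enable_feat(struct dev_info *info, enum dev_feature feat)
{
	if (feat == FEAT_AUX)
		return enable_aux(info);

	if (feat == FEAT_SVA) {
		if (!info->present)
			return -EINVAL;
		return info->svm_capable ? 0 : -ENODEV;
	}

	return -ENODEV;	/* HWDBM is reported via has_feat(), not enabled here */
}

int main(void)
{
	struct dev_info info = { .present = true, .svm_capable = true };

	printf("SVA enable:   %d\n", dev_enable_feat(&info, FEAT_SVA));	/* 0 */
	printf("HWDBM enable: %d\n", dev_enable_feat(&info, FEAT_HWDBM));	/* -19 */
	return 0;
}
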
Resolve merge conflict for intel_iommu_dev_enable_feat(), Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 67699887585c..00c736ff2fc2 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5635,16 +5635,20 @@ intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) static int intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) { - switch (feat) { - case IOMMU_DEV_FEAT_AUX: + if (feat == IOMMU_DEV_FEAT_AUX) return intel_iommu_enable_auxd(dev); - case IOMMU_DEV_FEAT_IOPF: - return intel_iommu_dev_has_feat(dev, feat) ? 0 : -ENODEV; + if (feat == IOMMU_DEV_FEAT_SVA) { + struct device_domain_info *info = get_domain_info(dev); + + if (!info) + return -EINVAL; if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) return 0; } + + return -ENODEV; } static int -- Gitee From fb7903b699e119d89b73abc7d05442a1310949ea Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 18 Jul 2022 16:27:14 +0800 Subject: [PATCH 1514/2161] iommu: Resolve merge conflict for intel_iommu_dev_has_feat() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. Resolve merge conflict for intel_iommu_dev_has_feat(), Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 32 ++++++++++++++++---------------- include/linux/iommu.h | 21 ++++----------------- 2 files changed, 20 insertions(+), 33 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 00c736ff2fc2..3f36b6b5533c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5605,8 +5605,6 @@ static int siov_find_pci_dvsec(struct pci_dev *pdev) static bool intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) { - struct device_domain_info *info = get_domain_info(dev); - if (feat == IOMMU_DEV_FEAT_AUX) { int ret; @@ -5621,13 +5619,21 @@ intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) return !!siov_find_pci_dvsec(to_pci_dev(dev)); } - if (feat == IOMMU_DEV_FEAT_IOPF) - return info && info->pri_supported; + if (feat == IOMMU_DEV_FEAT_SVA) { + struct device_domain_info *info = get_domain_info(dev); - if (feat == IOMMU_DEV_FEAT_SVA) return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) && info->pasid_supported && info->pri_supported && info->ats_supported; + } + + if (feat == IOMMU_DEV_FEAT_HWDBM) { + struct device_domain_info *info = get_domain_info(dev); + + /* FL supports dirty bit by default. 
*/ + return domain_use_first_level(info->domain) || + (!domain_use_first_level(info->domain) && slad_support()); + } return false; } @@ -5654,19 +5660,10 @@ intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) static int intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) { - switch (feat) { - case IOMMU_DEV_FEAT_AUX: + if (feat == IOMMU_DEV_FEAT_AUX) return intel_iommu_disable_auxd(dev); - case IOMMU_DEV_FEAT_IOPF: - return 0; - - case IOMMU_DEV_FEAT_SVA: - return intel_iommu_disable_sva(dev); - - default: - return -ENODEV; - } + return -ENODEV; } static bool @@ -5677,6 +5674,9 @@ intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat) if (feat == IOMMU_DEV_FEAT_AUX) return scalable_mode_support() && info && info->auxd_enabled; + if (feat == IOMMU_DEV_FEAT_HWDBM) + return intel_iommu_dev_has_feat(dev, feat); + return false; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a177d8d8e062..7ab2f61ff471 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -173,24 +173,11 @@ struct iommu_resv_region { enum iommu_resv_type type; }; -/** - * enum iommu_dev_features - Per device IOMMU features - * @IOMMU_DEV_FEAT_AUX: Auxiliary domain feature - * @IOMMU_DEV_FEAT_SVA: Shared Virtual Addresses - * @IOMMU_DEV_FEAT_IOPF: I/O Page Faults such as PRI or Stall. Generally - * enabling %IOMMU_DEV_FEAT_SVA requires - * %IOMMU_DEV_FEAT_IOPF, but some devices manage I/O Page - * Faults themselves instead of relying on the IOMMU. When - * supported, this feature must be enabled before and - * disabled after %IOMMU_DEV_FEAT_SVA. - * - * Device drivers query whether a feature is supported using - * iommu_dev_has_feature(), and enable it using iommu_dev_enable_feature(). - */ +/* Per device IOMMU features */ enum iommu_dev_features { - IOMMU_DEV_FEAT_AUX, - IOMMU_DEV_FEAT_SVA, - IOMMU_DEV_FEAT_IOPF, + IOMMU_DEV_FEAT_AUX, /* Aux-domain feature */ + IOMMU_DEV_FEAT_SVA, /* Shared Virtual Addresses */ + IOMMU_DEV_FEAT_HWDBM, /* Hardware Dirty Bit Management */ }; #define IOMMU_PASID_INVALID (-1U) -- Gitee From 5aec659348c170c077d1c76b8080cacc75d2927f Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Sat, 23 Oct 2021 07:12:51 +0800 Subject: [PATCH 1515/2161] iommu/vtd: Fix some gpasid related issues commit 6c89ec91ff5e0ae1a92999922153df2033a5dbfa Intel-BKC. During tests, we found the NIC pf/vf (SRIOV) passthrough cannot work correctly. The NIC cannot get IP in VM. One reason is the iommu driver returns error when binding gpasid. There are some unnecessary check for such devices. 
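
The relaxed check described above can be modeled in isolation. The sketch below is illustrative only: PASID_MAX and the IOMMU_SVA_HPASID_DEF bit value are stand-ins mirroring the hunk that follows, where a guest-IOVA bind against the default host PASID no longer requires the device to expose the full 20-bit PASID space.

#include <errno.h>
#include <stdio.h>

#define PASID_MAX		0x100000	/* full 20-bit PASID space */
#define IOMMU_SVA_HPASID_DEF	(1u << 3)	/* stand-in bit: bind to the default host PASID */

static int check_gpasid_bind(unsigned int flags, int dev_max_pasids)
{
	/* Except for gIOVA (default host PASID) binds, require 20-bit PASIDs. */
	if (!(flags & IOMMU_SVA_HPASID_DEF) && dev_max_pasids != PASID_MAX)
		return -EINVAL;
	return 0;
}

int main(void)
{
	/* Legacy VF with no PASID capability, bound for gIOVA: now accepted. */
	printf("%d\n", check_gpasid_bind(IOMMU_SVA_HPASID_DEF, 0));	/* 0 */
	/* Same device asked for a real guest-PASID bind: still rejected. */
	printf("%d\n", check_gpasid_bind(0, 0));			/* -22 */
	return 0;
}
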
Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 354d7e6c55e9..891180234813 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -443,8 +443,9 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, if (!dev_is_pci(dev)) return -ENOTSUPP; - /* VT-d supports devices with full 20 bit PASIDs only */ - if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX) + /* Except gIOVA binding, VT-d supports devices with full 20 bit PASIDs only */ + if ((data->flags & IOMMU_SVA_HPASID_DEF) == 0 && + pci_max_pasids(to_pci_dev(dev)) != PASID_MAX) return -EINVAL; dmar_domain = to_dmar_domain(domain); @@ -530,12 +531,16 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)) sdev->users = 1; - /* Set up device context entry for PASID if not enabled already */ - ret = intel_iommu_enable_pasid(iommu, sdev->dev); - if (ret) { - dev_err_ratelimited(dev, "Failed to enable PASID capability\n"); - kfree(sdev); - goto out; + /* For legacy device passthr giova usage, do not enable pasid */ + if ((data->flags & IOMMU_SVA_HPASID_DEF) == 0 && + pci_max_pasids(to_pci_dev(dev)) == PASID_MAX) { + /* Set up device context entry for PASID if not enabled already */ + ret = intel_iommu_enable_pasid(iommu, sdev->dev); + if (ret) { + dev_err_ratelimited(dev, "Failed to enable PASID capability\n"); + kfree(sdev); + goto out; + } } /* -- Gitee From 31c3eeba9ffa97e3018563a372c0786edb4d0427 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 10 Nov 2021 16:06:05 -0800 Subject: [PATCH 1516/2161] iommu/vtd: temp fix sm cap audit commit 0151c87c8146d3f67400833e46e1951f2195e78c Intel-BKC. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/cap_audit.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/intel/cap_audit.h b/drivers/iommu/intel/cap_audit.h index 66c59d7e713c..738d07030cee 100644 --- a/drivers/iommu/intel/cap_audit.h +++ b/drivers/iommu/intel/cap_audit.h @@ -123,6 +123,7 @@ bool intel_cap_slts_sanity(void); static inline bool scalable_mode_support(void) { + return true; return (intel_iommu_sm && intel_cap_smts_sanity()); } -- Gitee From b8b17b5f9c38ebd371113720ff2e39068a1a9f48 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Sat, 28 Sep 2019 02:43:08 +0300 Subject: [PATCH 1517/2161] PCI: Add PCI_STD_NUM_BARS for the number of standard BARs commit c9c13ba428ef90a9b408a6cdf874e14ab5754516 upstream. Code that iterates over all standard PCI BARs typically uses PCI_STD_RESOURCE_END. However, that requires the unusual test "i <= PCI_STD_RESOURCE_END" rather than something the typical "i < PCI_STD_NUM_BARS". Add a definition for PCI_STD_NUM_BARS and change loops to use the more idiomatic C style to help avoid fencepost errors. Link: https://lore.kernel.org/r/20190927234026.23342-1-efremov@linux.com Link: https://lore.kernel.org/r/20190927234308.23935-1-efremov@linux.com Link: https://lore.kernel.org/r/20190916204158.6889-3-efremov@linux.com Signed-off-by: Denis Efremov Signed-off-by: Bjorn Helgaas Acked-by: Sebastian Ott # arch/s390/ Acked-by: Bartlomiej Zolnierkiewicz # video/fbdev/ Acked-by: Gustavo Pimentel # pci/controller/dwc/ Acked-by: Jack Wang # scsi/pm8001/ Acked-by: Martin K. 
Petersen # scsi/pm8001/ Acked-by: Ulf Hansson # memstick/ Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/alpha/kernel/pci-sysfs.c | 8 ++--- arch/s390/include/asm/pci.h | 5 +-- arch/s390/include/asm/pci_clp.h | 6 ++-- arch/s390/pci/pci.c | 16 +++++----- arch/s390/pci/pci_clp.c | 6 ++-- arch/x86/pci/common.c | 2 +- arch/x86/pci/intel_mid_pci.c | 2 +- drivers/ata/pata_atp867x.c | 2 +- drivers/ata/sata_nv.c | 2 +- drivers/memstick/host/jmb38x_ms.c | 2 +- drivers/misc/pci_endpoint_test.c | 8 ++--- drivers/net/ethernet/intel/e1000/e1000.h | 1 - drivers/net/ethernet/intel/e1000/e1000_main.c | 2 +- drivers/net/ethernet/intel/ixgb/ixgb.h | 1 - drivers/net/ethernet/intel/ixgb/ixgb_main.c | 2 +- .../net/ethernet/stmicro/stmmac/stmmac_pci.c | 4 +-- .../net/ethernet/synopsys/dwc-xlgmac-pci.c | 2 +- drivers/pci/controller/dwc/pci-dra7xx.c | 2 +- .../pci/controller/dwc/pci-layerscape-ep.c | 2 +- drivers/pci/controller/dwc/pcie-artpec6.c | 2 +- .../pci/controller/dwc/pcie-designware-plat.c | 2 +- drivers/pci/controller/dwc/pcie-designware.h | 2 +- drivers/pci/controller/pci-hyperv.c | 10 +++--- drivers/pci/endpoint/functions/pci-epf-test.c | 10 +++--- drivers/pci/pci-sysfs.c | 4 +-- drivers/pci/pci.c | 13 ++++---- drivers/pci/proc.c | 4 +-- drivers/pci/quirks.c | 4 +-- drivers/rapidio/devices/tsi721.c | 2 +- drivers/scsi/pm8001/pm8001_hwi.c | 2 +- drivers/scsi/pm8001/pm8001_init.c | 2 +- drivers/staging/gasket/gasket_constants.h | 3 -- drivers/staging/gasket/gasket_core.c | 12 +++---- drivers/staging/gasket/gasket_core.h | 4 +-- drivers/tty/serial/8250/8250_pci.c | 8 ++--- drivers/usb/core/hcd-pci.c | 2 +- drivers/usb/host/pci-quirks.c | 2 +- drivers/vfio/pci/vfio_pci.c | 11 ++++--- drivers/vfio/pci/vfio_pci_config.c | 32 ++++++++++--------- drivers/vfio/pci/vfio_pci_private.h | 4 +-- drivers/video/fbdev/core/fbmem.c | 4 +-- drivers/video/fbdev/efifb.c | 2 +- include/linux/pci-epc.h | 2 +- include/linux/pci.h | 2 +- include/uapi/linux/pci_regs.h | 1 + lib/devres.c | 2 +- 46 files changed, 110 insertions(+), 113 deletions(-) diff --git a/arch/alpha/kernel/pci-sysfs.c b/arch/alpha/kernel/pci-sysfs.c index f94c732fedeb..0021580d79ad 100644 --- a/arch/alpha/kernel/pci-sysfs.c +++ b/arch/alpha/kernel/pci-sysfs.c @@ -71,10 +71,10 @@ static int pci_mmap_resource(struct kobject *kobj, struct pci_bus_region bar; int i; - for (i = 0; i < PCI_ROM_RESOURCE; i++) + for (i = 0; i < PCI_STD_NUM_BARS; i++) if (res == &pdev->resource[i]) break; - if (i >= PCI_ROM_RESOURCE) + if (i >= PCI_STD_NUM_BARS) return -ENODEV; if (res->flags & IORESOURCE_MEM && iomem_is_exclusive(res->start)) @@ -115,7 +115,7 @@ void pci_remove_resource_files(struct pci_dev *pdev) { int i; - for (i = 0; i < PCI_ROM_RESOURCE; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { struct bin_attribute *res_attr; res_attr = pdev->res_attr[i]; @@ -232,7 +232,7 @@ int pci_create_resource_files(struct pci_dev *pdev) int retval; /* Expose the PCI resources from this device as files */ - for (i = 0; i < PCI_ROM_RESOURCE; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { /* skip empty resources */ if (!pci_resource_len(pdev, i)) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 6087a4e9b2bf..b05187ce5dbd 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -2,9 +2,6 @@ #ifndef __ASM_S390_PCI_H #define __ASM_S390_PCI_H -/* must be set before including pci_clp.h */ -#define PCI_BAR_COUNT 6 - #include #include #include @@ -138,7 +135,7 @@ struct zpci_dev { char res_name[16]; bool mio_capable; - struct 
zpci_bar_struct bars[PCI_BAR_COUNT]; + struct zpci_bar_struct bars[PCI_STD_NUM_BARS]; u64 start_dma; /* Start of available DMA addresses */ u64 end_dma; /* End of available DMA addresses */ diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index 50359172cc48..bd2cb4ea7d93 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -77,7 +77,7 @@ struct mio_info { struct { u64 wb; u64 wt; - } addr[PCI_BAR_COUNT]; + } addr[PCI_STD_NUM_BARS]; u32 reserved[6]; } __packed; @@ -98,9 +98,9 @@ struct clp_rsp_query_pci { u16 util_str_avail : 1; /* utility string available? */ u16 pfgid : 8; /* pci function group id */ u32 fid; /* pci function id */ - u8 bar_size[PCI_BAR_COUNT]; + u8 bar_size[PCI_STD_NUM_BARS]; u16 pchid; - __le32 bar[PCI_BAR_COUNT]; + __le32 bar[PCI_STD_NUM_BARS]; u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */ u32 : 16; u8 fmb_len; diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 6105b1b6e49b..b5af0b891326 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -43,7 +43,7 @@ static DECLARE_BITMAP(zpci_domain, ZPCI_NR_DEVICES); static DEFINE_SPINLOCK(zpci_domain_lock); #define ZPCI_IOMAP_ENTRIES \ - min(((unsigned long) ZPCI_NR_DEVICES * PCI_BAR_COUNT / 2), \ + min(((unsigned long) ZPCI_NR_DEVICES * PCI_STD_NUM_BARS / 2), \ ZPCI_IOMAP_MAX_ENTRIES) static DEFINE_SPINLOCK(zpci_iomap_lock); @@ -294,7 +294,7 @@ static void __iomem *pci_iomap_range_mio(struct pci_dev *pdev, int bar, void __iomem *pci_iomap_range(struct pci_dev *pdev, int bar, unsigned long offset, unsigned long max) { - if (!pci_resource_len(pdev, bar) || bar >= PCI_BAR_COUNT) + if (bar >= PCI_STD_NUM_BARS || !pci_resource_len(pdev, bar)) return NULL; if (static_branch_likely(&have_mio)) @@ -324,7 +324,7 @@ static void __iomem *pci_iomap_wc_range_mio(struct pci_dev *pdev, int bar, void __iomem *pci_iomap_wc_range(struct pci_dev *pdev, int bar, unsigned long offset, unsigned long max) { - if (!pci_resource_len(pdev, bar) || bar >= PCI_BAR_COUNT) + if (bar >= PCI_STD_NUM_BARS || !pci_resource_len(pdev, bar)) return NULL; if (static_branch_likely(&have_mio)) @@ -416,7 +416,7 @@ static void zpci_map_resources(struct pci_dev *pdev) resource_size_t len; int i; - for (i = 0; i < PCI_BAR_COUNT; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { len = pci_resource_len(pdev, i); if (!len) continue; @@ -451,7 +451,7 @@ static void zpci_unmap_resources(struct pci_dev *pdev) if (zpci_use_mio(zdev)) return; - for (i = 0; i < PCI_BAR_COUNT; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { len = pci_resource_len(pdev, i); if (!len) continue; @@ -514,7 +514,7 @@ static int zpci_setup_bus_resources(struct zpci_dev *zdev, snprintf(zdev->res_name, sizeof(zdev->res_name), "PCI Bus %04x:%02x", zdev->domain, ZPCI_BUS_NR); - for (i = 0; i < PCI_BAR_COUNT; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (!zdev->bars[i].size) continue; entry = zpci_alloc_iomap(zdev); @@ -551,7 +551,7 @@ static void zpci_cleanup_bus_resources(struct zpci_dev *zdev) { int i; - for (i = 0; i < PCI_BAR_COUNT; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (!zdev->bars[i].size || !zdev->bars[i].res) continue; @@ -573,7 +573,7 @@ int pcibios_add_device(struct pci_dev *pdev) pdev->dev.dma_ops = &s390_pci_dma_ops; zpci_map_resources(pdev); - for (i = 0; i < PCI_BAR_COUNT; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { res = &pdev->resource[i]; if (res->parent || !res->flags) continue; diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 
20e093f86329..25208fa95426 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -145,7 +145,7 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, { int i; - for (i = 0; i < PCI_BAR_COUNT; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { zdev->bars[i].val = le32_to_cpu(response->bar[i]); zdev->bars[i].size = response->bar_size[i]; } @@ -164,8 +164,8 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, sizeof(zdev->util_str)); } zdev->mio_capable = response->mio_addr_avail; - for (i = 0; i < PCI_BAR_COUNT; i++) { - if (!(response->mio.valid & (1 << (PCI_BAR_COUNT - i - 1)))) + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (!(response->mio.valid & (1 << (PCI_STD_NUM_BARS - i - 1)))) continue; zdev->bars[i].mio_wb = (void __iomem *) response->mio.addr[i].wb; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index f1063d5b2462..e142ae2e8e1c 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -136,7 +136,7 @@ static void pcibios_fixup_device_resources(struct pci_dev *dev) * resource so the kernel doesn't attempt to assign * it later on in pci_assign_unassigned_resources */ - for (bar = 0; bar <= PCI_STD_RESOURCE_END; bar++) { + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { bar_r = &dev->resource[bar]; if (bar_r->start == 0 && bar_r->end != 0) { bar_r->flags = 0; diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index d6ac842c9505..0b2c6ec9bd34 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -383,7 +383,7 @@ static void pci_fixed_bar_fixup(struct pci_dev *dev) PCI_DEVFN(2, 2) == dev->devfn) return; - for (i = 0; i < PCI_ROM_RESOURCE; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { pci_read_config_dword(dev, offset + 8 + (i * 4), &size); dev->resource[i].end = dev->resource[i].start + size - 1; dev->resource[i].flags |= IORESOURCE_PCI_FIXED; diff --git a/drivers/ata/pata_atp867x.c b/drivers/ata/pata_atp867x.c index cfd0cf2cbca6..e01a3a6e4d46 100644 --- a/drivers/ata/pata_atp867x.c +++ b/drivers/ata/pata_atp867x.c @@ -422,7 +422,7 @@ static int atp867x_ata_pci_sff_init_host(struct ata_host *host) #ifdef ATP867X_DEBUG atp867x_check_res(pdev); - for (i = 0; i < PCI_ROM_RESOURCE; i++) + for (i = 0; i < PCI_STD_NUM_BARS; i++) printk(KERN_DEBUG "ATP867X: iomap[%d]=0x%llx\n", i, (unsigned long long)(host->iomap[i])); #endif diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c index 0514aa7e80e3..20190f66ced9 100644 --- a/drivers/ata/sata_nv.c +++ b/drivers/ata/sata_nv.c @@ -2329,7 +2329,7 @@ static int nv_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) // Make sure this is a SATA controller by counting the number of bars // (NVIDIA SATA controllers will always have six bars). Otherwise, // it's an IDE controller and we ignore it. 
- for (bar = 0; bar < 6; bar++) + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) if (pci_resource_start(pdev, bar) == 0) return -ENODEV; diff --git a/drivers/memstick/host/jmb38x_ms.c b/drivers/memstick/host/jmb38x_ms.c index 64fff6abe60e..719f54c58920 100644 --- a/drivers/memstick/host/jmb38x_ms.c +++ b/drivers/memstick/host/jmb38x_ms.c @@ -848,7 +848,7 @@ static int jmb38x_ms_count_slots(struct pci_dev *pdev) { int cnt, rc = 0; - for (cnt = 0; cnt < PCI_ROM_RESOURCE; ++cnt) { + for (cnt = 0; cnt < PCI_STD_NUM_BARS; ++cnt) { if (!(IORESOURCE_MEM & pci_resource_flags(pdev, cnt))) break; diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c index 1154f0435b0a..32e9f267d84f 100644 --- a/drivers/misc/pci_endpoint_test.c +++ b/drivers/misc/pci_endpoint_test.c @@ -94,7 +94,7 @@ enum pci_barno { struct pci_endpoint_test { struct pci_dev *pdev; void __iomem *base; - void __iomem *bar[6]; + void __iomem *bar[PCI_STD_NUM_BARS]; struct completion irq_raised; int last_irq; int num_irqs; @@ -693,7 +693,7 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev, if (!pci_endpoint_test_request_irq(test)) goto err_disable_irq; - for (bar = BAR_0; bar <= BAR_5; bar++) { + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { if (pci_resource_flags(pdev, bar) & IORESOURCE_MEM) { base = pci_ioremap_bar(pdev, bar); if (!base) { @@ -746,7 +746,7 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev, ida_simple_remove(&pci_endpoint_test_ida, id); err_iounmap: - for (bar = BAR_0; bar <= BAR_5; bar++) { + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { if (test->bar[bar]) pci_iounmap(pdev, test->bar[bar]); } @@ -777,7 +777,7 @@ static void pci_endpoint_test_remove(struct pci_dev *pdev) misc_deregister(&test->miscdev); kfree(misc_device->name); ida_simple_remove(&pci_endpoint_test_ida, id); - for (bar = BAR_0; bar <= BAR_5; bar++) { + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { if (test->bar[bar]) pci_iounmap(pdev, test->bar[bar]); } diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h index c40729b2c184..7fad2f24dcad 100644 --- a/drivers/net/ethernet/intel/e1000/e1000.h +++ b/drivers/net/ethernet/intel/e1000/e1000.h @@ -45,7 +45,6 @@ #define BAR_0 0 #define BAR_1 1 -#define BAR_5 5 #define INTEL_E1000_ETHERNET_DEVICE(device_id) {\ PCI_DEVICE(PCI_VENDOR_ID_INTEL, device_id)} diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c index a2ee28e487a6..82a83de8384c 100644 --- a/drivers/net/ethernet/intel/e1000/e1000_main.c +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c @@ -982,7 +982,7 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_ioremap; if (adapter->need_ioport) { - for (i = BAR_1; i <= BAR_5; i++) { + for (i = BAR_1; i < PCI_STD_NUM_BARS; i++) { if (pci_resource_len(pdev, i) == 0) continue; if (pci_resource_flags(pdev, i) & IORESOURCE_IO) { diff --git a/drivers/net/ethernet/intel/ixgb/ixgb.h b/drivers/net/ethernet/intel/ixgb/ixgb.h index e85271b68410..681d44cc9784 100644 --- a/drivers/net/ethernet/intel/ixgb/ixgb.h +++ b/drivers/net/ethernet/intel/ixgb/ixgb.h @@ -42,7 +42,6 @@ #define BAR_0 0 #define BAR_1 1 -#define BAR_5 5 struct ixgb_adapter; #include "ixgb_hw.h" diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_main.c b/drivers/net/ethernet/intel/ixgb/ixgb_main.c index 0940a0da16f2..3d8c051dd327 100644 --- a/drivers/net/ethernet/intel/ixgb/ixgb_main.c +++ b/drivers/net/ethernet/intel/ixgb/ixgb_main.c @@ -412,7 +412,7 @@ ixgb_probe(struct 
pci_dev *pdev, const struct pci_device_id *ent) goto err_ioremap; } - for (i = BAR_1; i <= BAR_5; i++) { + for (i = BAR_1; i < PCI_STD_NUM_BARS; i++) { if (pci_resource_len(pdev, i) == 0) continue; if (pci_resource_flags(pdev, i) & IORESOURCE_IO) { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index 292045f4581f..8237dbc3e991 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ -489,7 +489,7 @@ static int stmmac_pci_probe(struct pci_dev *pdev, } /* Get the base address of device */ - for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (pci_resource_len(pdev, i) == 0) continue; ret = pcim_iomap_regions(pdev, BIT(i), pci_name(pdev)); @@ -532,7 +532,7 @@ static void stmmac_pci_remove(struct pci_dev *pdev) if (priv->plat->stmmac_clk) clk_unregister_fixed_rate(priv->plat->stmmac_clk); - for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (pci_resource_len(pdev, i) == 0) continue; pcim_iounmap_regions(pdev, BIT(i)); diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac-pci.c b/drivers/net/ethernet/synopsys/dwc-xlgmac-pci.c index 386bafe74c3f..fa8604d7b797 100644 --- a/drivers/net/ethernet/synopsys/dwc-xlgmac-pci.c +++ b/drivers/net/ethernet/synopsys/dwc-xlgmac-pci.c @@ -34,7 +34,7 @@ static int xlgmac_probe(struct pci_dev *pcidev, const struct pci_device_id *id) return ret; } - for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (pci_resource_len(pcidev, i) == 0) continue; ret = pcim_iomap_regions(pcidev, BIT(i), XLGMAC_DRV_NAME); diff --git a/drivers/pci/controller/dwc/pci-dra7xx.c b/drivers/pci/controller/dwc/pci-dra7xx.c index 4234ddb4722f..b20651cea09f 100644 --- a/drivers/pci/controller/dwc/pci-dra7xx.c +++ b/drivers/pci/controller/dwc/pci-dra7xx.c @@ -353,7 +353,7 @@ static void dra7xx_pcie_ep_init(struct dw_pcie_ep *ep) struct dra7xx_pcie *dra7xx = to_dra7xx_pcie(pci); enum pci_barno bar; - for (bar = BAR_0; bar <= BAR_5; bar++) + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) dw_pcie_ep_reset_bar(pci, bar); dra7xx_pcie_enable_wrapper_interrupts(dra7xx); diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c b/drivers/pci/controller/dwc/pci-layerscape-ep.c index ca9aa4501e7e..0d151cead1b7 100644 --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c @@ -58,7 +58,7 @@ static void ls_pcie_ep_init(struct dw_pcie_ep *ep) struct dw_pcie *pci = to_dw_pcie_from_ep(ep); enum pci_barno bar; - for (bar = BAR_0; bar <= BAR_5; bar++) + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) dw_pcie_ep_reset_bar(pci, bar); } diff --git a/drivers/pci/controller/dwc/pcie-artpec6.c b/drivers/pci/controller/dwc/pcie-artpec6.c index d00252bd8fae..9e2482bd7b6d 100644 --- a/drivers/pci/controller/dwc/pcie-artpec6.c +++ b/drivers/pci/controller/dwc/pcie-artpec6.c @@ -422,7 +422,7 @@ static void artpec6_pcie_ep_init(struct dw_pcie_ep *ep) artpec6_pcie_wait_for_phy(artpec6_pcie); artpec6_pcie_set_nfts(artpec6_pcie); - for (bar = BAR_0; bar <= BAR_5; bar++) + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) dw_pcie_ep_reset_bar(pci, bar); } diff --git a/drivers/pci/controller/dwc/pcie-designware-plat.c b/drivers/pci/controller/dwc/pcie-designware-plat.c index b58fdcbc664b..73646b677aff 100644 --- a/drivers/pci/controller/dwc/pcie-designware-plat.c +++ b/drivers/pci/controller/dwc/pcie-designware-plat.c @@ -70,7 +70,7 @@ 
static void dw_plat_pcie_ep_init(struct dw_pcie_ep *ep) struct dw_pcie *pci = to_dw_pcie_from_ep(ep); enum pci_barno bar; - for (bar = BAR_0; bar <= BAR_5; bar++) + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) dw_pcie_ep_reset_bar(pci, bar); } diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index 5a18e94e52c8..5accdd6bc388 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -214,7 +214,7 @@ struct dw_pcie_ep { phys_addr_t phys_base; size_t addr_size; size_t page_size; - u8 bar_to_atu[6]; + u8 bar_to_atu[PCI_STD_NUM_BARS]; phys_addr_t *outbound_addr; unsigned long *ib_window_map; unsigned long *ob_window_map; diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index cb799e458e93..09e4f249728e 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -307,7 +307,7 @@ struct pci_bus_relations { struct pci_q_res_req_response { struct vmpacket_descriptor hdr; s32 status; /* negative values are failures */ - u32 probed_bar[6]; + u32 probed_bar[PCI_STD_NUM_BARS]; } __packed; struct pci_set_power { @@ -539,7 +539,7 @@ struct hv_pci_dev { * What would be observed if one wrote 0xFFFFFFFF to a BAR and then * read it back, for each of the BAR offsets within config space. */ - u32 probed_bar[6]; + u32 probed_bar[PCI_STD_NUM_BARS]; }; struct hv_pci_compl { @@ -1602,7 +1602,7 @@ static void survey_child_resources(struct hv_pcibus_device *hbus) * so it's sufficient to just add them up without tracking alignment. */ list_for_each_entry(hpdev, &hbus->children, list_entry) { - for (i = 0; i < 6; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO) dev_err(&hbus->hdev->device, "There's an I/O BAR in this list!\n"); @@ -1676,7 +1676,7 @@ static void prepopulate_bars(struct hv_pcibus_device *hbus) /* Pick addresses for the BARs. 
*/ do { list_for_each_entry(hpdev, &hbus->children, list_entry) { - for (i = 0; i < 6; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { bar_val = hpdev->probed_bar[i]; if (bar_val == 0) continue; @@ -1833,7 +1833,7 @@ static void q_resource_requirements(void *context, struct pci_response *resp, "query resource requirements failed: %x\n", resp->status); } else { - for (i = 0; i < 6; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { completion->hpdev->probed_bar[i] = q_res_req->probed_bar[i]; } diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c index 1cfe3687a211..5d74f81ddfe4 100644 --- a/drivers/pci/endpoint/functions/pci-epf-test.c +++ b/drivers/pci/endpoint/functions/pci-epf-test.c @@ -44,7 +44,7 @@ static struct workqueue_struct *kpcitest_workqueue; struct pci_epf_test { - void *reg[6]; + void *reg[PCI_STD_NUM_BARS]; struct pci_epf *epf; enum pci_barno test_reg_bar; struct delayed_work cmd_handler; @@ -377,7 +377,7 @@ static void pci_epf_test_unbind(struct pci_epf *epf) cancel_delayed_work(&epf_test->cmd_handler); pci_epc_stop(epc); - for (bar = BAR_0; bar <= BAR_5; bar++) { + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { epf_bar = &epf->bar[bar]; if (epf_test->reg[bar]) { @@ -400,7 +400,7 @@ static int pci_epf_test_set_bar(struct pci_epf *epf) epc_features = epf_test->epc_features; - for (bar = BAR_0; bar <= BAR_5; bar += add) { + for (bar = 0; bar < PCI_STD_NUM_BARS; bar += add) { epf_bar = &epf->bar[bar]; /* * pci_epc_set_bar() sets PCI_BASE_ADDRESS_MEM_TYPE_64 @@ -450,7 +450,7 @@ static int pci_epf_test_alloc_space(struct pci_epf *epf) } epf_test->reg[test_reg_bar] = base; - for (bar = BAR_0; bar <= BAR_5; bar += add) { + for (bar = 0; bar < PCI_STD_NUM_BARS; bar += add) { epf_bar = &epf->bar[bar]; add = (epf_bar->flags & PCI_BASE_ADDRESS_MEM_TYPE_64) ? 
2 : 1; @@ -478,7 +478,7 @@ static void pci_epf_configure_bar(struct pci_epf *epf, bool bar_fixed_64bit; int i; - for (i = BAR_0; i <= BAR_5; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { epf_bar = &epf->bar[i]; bar_fixed_64bit = !!(epc_features->bar_fixed_64bit & (1 << i)); if (bar_fixed_64bit) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index e401f040f157..95097ce25378 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1124,7 +1124,7 @@ static void pci_remove_resource_files(struct pci_dev *pdev) { int i; - for (i = 0; i < PCI_ROM_RESOURCE; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { struct bin_attribute *res_attr; res_attr = pdev->res_attr[i]; @@ -1195,7 +1195,7 @@ static int pci_create_resource_files(struct pci_dev *pdev) int retval; /* Expose the PCI resources from this device as files */ - for (i = 0; i < PCI_ROM_RESOURCE; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { /* skip empty resources */ if (!pci_resource_len(pdev, i)) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 3917d15e744a..239018fc7f42 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -674,7 +674,7 @@ struct resource *pci_find_resource(struct pci_dev *dev, struct resource *res) { int i; - for (i = 0; i < PCI_ROM_RESOURCE; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { struct resource *r = &dev->resource[i]; if (r->start && resource_contains(r, res)) @@ -3773,7 +3773,7 @@ void pci_release_selected_regions(struct pci_dev *pdev, int bars) { int i; - for (i = 0; i < 6; i++) + for (i = 0; i < PCI_STD_NUM_BARS; i++) if (bars & (1 << i)) pci_release_region(pdev, i); } @@ -3784,7 +3784,7 @@ static int __pci_request_selected_regions(struct pci_dev *pdev, int bars, { int i; - for (i = 0; i < 6; i++) + for (i = 0; i < PCI_STD_NUM_BARS; i++) if (bars & (1 << i)) if (__pci_request_region(pdev, i, res_name, excl)) goto err_out; @@ -3832,7 +3832,7 @@ EXPORT_SYMBOL(pci_request_selected_regions_exclusive); void pci_release_regions(struct pci_dev *pdev) { - pci_release_selected_regions(pdev, (1 << 6) - 1); + pci_release_selected_regions(pdev, (1 << PCI_STD_NUM_BARS) - 1); } EXPORT_SYMBOL(pci_release_regions); @@ -3851,7 +3851,8 @@ EXPORT_SYMBOL(pci_release_regions); */ int pci_request_regions(struct pci_dev *pdev, const char *res_name) { - return pci_request_selected_regions(pdev, ((1 << 6) - 1), res_name); + return pci_request_selected_regions(pdev, + ((1 << PCI_STD_NUM_BARS) - 1), res_name); } EXPORT_SYMBOL(pci_request_regions); @@ -3873,7 +3874,7 @@ EXPORT_SYMBOL(pci_request_regions); int pci_request_regions_exclusive(struct pci_dev *pdev, const char *res_name) { return pci_request_selected_regions_exclusive(pdev, - ((1 << 6) - 1), res_name); + ((1 << PCI_STD_NUM_BARS) - 1), res_name); } EXPORT_SYMBOL(pci_request_regions_exclusive); diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c index 5495537c60c2..6ef74bf5013f 100644 --- a/drivers/pci/proc.c +++ b/drivers/pci/proc.c @@ -258,13 +258,13 @@ static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma) } /* Make sure the caller is mapping a real resource for this device */ - for (i = 0; i < PCI_ROM_RESOURCE; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (dev->resource[i].flags & res_bit && pci_mmap_fits(dev, i, vma, PCI_MMAP_PROCFS)) break; } - if (i >= PCI_ROM_RESOURCE) + if (i >= PCI_STD_NUM_BARS) return -ENODEV; if (fpriv->mmap_state == pci_mmap_mem && diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 5632bab94c24..60274a61bb48 100644 --- a/drivers/pci/quirks.c +++ 
b/drivers/pci/quirks.c @@ -474,7 +474,7 @@ static void quirk_extend_bar_to_page(struct pci_dev *dev) { int i; - for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { struct resource *r = &dev->resource[i]; if (r->flags & IORESOURCE_MEM && resource_size(r) < PAGE_SIZE) { @@ -1809,7 +1809,7 @@ static void quirk_alder_ioapic(struct pci_dev *pdev) * The next five BARs all seem to be rubbish, so just clean * them out. */ - for (i = 1; i < 6; i++) + for (i = 1; i < PCI_STD_NUM_BARS; i++) memset(&pdev->resource[i], 0, sizeof(pdev->resource[i])); } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_EESSC, quirk_alder_ioapic); diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index 125a173bed45..4dd31dd9feea 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -2755,7 +2755,7 @@ static int tsi721_probe(struct pci_dev *pdev, { int i; - for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { tsi_debug(INIT, &pdev->dev, "res%d %pR", i, &pdev->resource[i]); } diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c index 68a8217032d0..1a3661d6be06 100644 --- a/drivers/scsi/pm8001/pm8001_hwi.c +++ b/drivers/scsi/pm8001/pm8001_hwi.c @@ -1186,7 +1186,7 @@ static void pm8001_hw_chip_rst(struct pm8001_hba_info *pm8001_ha) void pm8001_chip_iounmap(struct pm8001_hba_info *pm8001_ha) { s8 bar, logical = 0; - for (bar = 0; bar < 6; bar++) { + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { /* ** logical BARs for SPC: ** bar 0 and 1 - logical BAR0 diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c index 8882ba33ca87..50b6b0d24839 100644 --- a/drivers/scsi/pm8001/pm8001_init.c +++ b/drivers/scsi/pm8001/pm8001_init.c @@ -401,7 +401,7 @@ static int pm8001_ioremap(struct pm8001_hba_info *pm8001_ha) pdev = pm8001_ha->pdev; /* map pci mem (PMC pci base 0-3)*/ - for (bar = 0; bar < 6; bar++) { + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { /* ** logical BARs for SPC: ** bar 0 and 1 - logical BAR0 diff --git a/drivers/staging/gasket/gasket_constants.h b/drivers/staging/gasket/gasket_constants.h index 50d87c7b178c..9ea9c8833f27 100644 --- a/drivers/staging/gasket/gasket_constants.h +++ b/drivers/staging/gasket/gasket_constants.h @@ -13,9 +13,6 @@ /* The maximum devices per each type. */ #define GASKET_DEV_MAX 256 -/* The number of supported (and possible) PCI BARs. */ -#define GASKET_NUM_BARS 6 - /* The number of supported Gasket page tables per device. 
*/ #define GASKET_MAX_NUM_PAGE_TABLES 1 diff --git a/drivers/staging/gasket/gasket_core.c b/drivers/staging/gasket/gasket_core.c index 6f9c0d18d9ce..26e516997ea2 100644 --- a/drivers/staging/gasket/gasket_core.c +++ b/drivers/staging/gasket/gasket_core.c @@ -371,7 +371,7 @@ static int gasket_setup_pci(struct pci_dev *pci_dev, { int i, mapped_bars, ret; - for (i = 0; i < GASKET_NUM_BARS; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { ret = gasket_map_pci_bar(gasket_dev, i); if (ret) { mapped_bars = i; @@ -393,7 +393,7 @@ static void gasket_cleanup_pci(struct gasket_dev *gasket_dev) { int i; - for (i = 0; i < GASKET_NUM_BARS; i++) + for (i = 0; i < PCI_STD_NUM_BARS; i++) gasket_unmap_pci_bar(gasket_dev, i); } @@ -493,7 +493,7 @@ static ssize_t gasket_sysfs_data_show(struct device *device, (enum gasket_sysfs_attribute_type)gasket_attr->data.attr_type; switch (sysfs_type) { case ATTR_BAR_OFFSETS: - for (i = 0; i < GASKET_NUM_BARS; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { bar_desc = &driver_desc->bar_descriptions[i]; if (bar_desc->size == 0) continue; @@ -505,7 +505,7 @@ static ssize_t gasket_sysfs_data_show(struct device *device, } break; case ATTR_BAR_SIZES: - for (i = 0; i < GASKET_NUM_BARS; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { bar_desc = &driver_desc->bar_descriptions[i]; if (bar_desc->size == 0) continue; @@ -556,7 +556,7 @@ static ssize_t gasket_sysfs_data_show(struct device *device, ret = snprintf(buf, PAGE_SIZE, "%d\n", gasket_dev->reset_count); break; case ATTR_USER_MEM_RANGES: - for (i = 0; i < GASKET_NUM_BARS; ++i) { + for (i = 0; i < PCI_STD_NUM_BARS; ++i) { current_written = gasket_write_mappable_regions(buf, driver_desc, i); @@ -736,7 +736,7 @@ static int gasket_get_bar_index(const struct gasket_dev *gasket_dev, const struct gasket_driver_desc *driver_desc; driver_desc = gasket_dev->internal_desc->driver_desc; - for (i = 0; i < GASKET_NUM_BARS; ++i) { + for (i = 0; i < PCI_STD_NUM_BARS; ++i) { struct gasket_bar_desc bar_desc = driver_desc->bar_descriptions[i]; diff --git a/drivers/staging/gasket/gasket_core.h b/drivers/staging/gasket/gasket_core.h index be44ac1e3118..c417acadb0d5 100644 --- a/drivers/staging/gasket/gasket_core.h +++ b/drivers/staging/gasket/gasket_core.h @@ -268,7 +268,7 @@ struct gasket_dev { char kobj_name[GASKET_NAME_MAX]; /* Virtual address of mapped BAR memory range. */ - struct gasket_bar_data bar_data[GASKET_NUM_BARS]; + struct gasket_bar_data bar_data[PCI_STD_NUM_BARS]; /* Coherent buffer. */ struct gasket_coherent_buffer coherent_buffer; @@ -369,7 +369,7 @@ struct gasket_driver_desc { /* Set of 6 bar descriptions that describe all PCIe bars. * Note that BUS/AXI devices (i.e. non PCI devices) use those. */ - struct gasket_bar_desc bar_descriptions[GASKET_NUM_BARS]; + struct gasket_bar_desc bar_descriptions[PCI_STD_NUM_BARS]; /* * Coherent buffer description. 
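The conversions above all land on the same loop shape, so a short illustration may help; it is a sketch for context only, not part of the patch, and the helper name example_walk_std_bars() is invented. PCI_STD_NUM_BARS (added later in this patch to include/uapi/linux/pci_regs.h with the value 6) names the bound instead of relying on the numeric coincidence that PCI_ROM_RESOURCE and PCI_STD_RESOURCE_END + 1 also equal 6:

#include <linux/pci.h>

/*
 * Visit only the six standard BARs of a type 0 header; the expansion ROM
 * resource is outside the loop by construction, and the bound now says so.
 */
static void example_walk_std_bars(struct pci_dev *pdev)
{
        int bar;

        for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) {
                if (!pci_resource_len(pdev, bar))
                        continue;       /* unimplemented BAR */
                dev_info(&pdev->dev, "BAR %d: %pR\n", bar,
                         &pdev->resource[bar]);
        }
}

The same bound also works as an array size, which is what the bar_to_atu[], probed_bar[] and barmap[] changes in this patch rely on.
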
diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c index 8814ff38aa67..ee0f469d4d5f 100644 --- a/drivers/tty/serial/8250/8250_pci.c +++ b/drivers/tty/serial/8250/8250_pci.c @@ -48,8 +48,6 @@ struct f815xxa_data { int idx; }; -#define PCI_NUM_BAR_RESOURCES 6 - struct serial_private { struct pci_dev *dev; unsigned int nr; @@ -89,7 +87,7 @@ setup_port(struct serial_private *priv, struct uart_8250_port *port, { struct pci_dev *dev = priv->dev; - if (bar >= PCI_NUM_BAR_RESOURCES) + if (bar >= PCI_STD_NUM_BARS) return -EINVAL; if (pci_resource_flags(dev, bar) & IORESOURCE_MEM) { @@ -3797,7 +3795,7 @@ serial_pci_guess_board(struct pci_dev *dev, struct pciserial_board *board) return -ENODEV; num_iomem = num_port = 0; - for (i = 0; i < PCI_NUM_BAR_RESOURCES; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (pci_resource_flags(dev, i) & IORESOURCE_IO) { num_port++; if (first_port == -1) @@ -3825,7 +3823,7 @@ serial_pci_guess_board(struct pci_dev *dev, struct pciserial_board *board) */ first_port = -1; num_port = 0; - for (i = 0; i < PCI_NUM_BAR_RESOURCES; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (pci_resource_flags(dev, i) & IORESOURCE_IO && pci_resource_len(dev, i) == 8 && (first_port == -1 || (first_port + num_port) == i)) { diff --git a/drivers/usb/core/hcd-pci.c b/drivers/usb/core/hcd-pci.c index 9e26b0143a59..9ae2a7a93df2 100644 --- a/drivers/usb/core/hcd-pci.c +++ b/drivers/usb/core/hcd-pci.c @@ -234,7 +234,7 @@ int usb_hcd_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) /* UHCI */ int region; - for (region = 0; region < PCI_ROM_RESOURCE; region++) { + for (region = 0; region < PCI_STD_NUM_BARS; region++) { if (!(pci_resource_flags(dev, region) & IORESOURCE_IO)) continue; diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c index f6d04491df60..6c7f0a876b96 100644 --- a/drivers/usb/host/pci-quirks.c +++ b/drivers/usb/host/pci-quirks.c @@ -728,7 +728,7 @@ static void quirk_usb_handoff_uhci(struct pci_dev *pdev) if (!pio_enabled(pdev)) return; - for (i = 0; i < PCI_ROM_RESOURCE; i++) + for (i = 0; i < PCI_STD_NUM_BARS; i++) if ((pci_resource_flags(pdev, i) & IORESOURCE_IO)) { base = pci_resource_start(pdev, i); break; diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index a603f363835c..1e835be334a0 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -111,11 +111,13 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev) { struct resource *res; - int bar; + int i; struct vfio_pci_dummy_resource *dummy_res; - for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) { - res = vdev->pdev->resource + bar; + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + int bar = i + PCI_STD_RESOURCES; + + res = &vdev->pdev->resource[bar]; if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP)) goto no_mmap; @@ -399,7 +401,8 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) vfio_config_free(vdev); - for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + bar = i + PCI_STD_RESOURCES; if (!vdev->barmap[bar]) continue; pci_iounmap(pdev, vdev->barmap[bar]); diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index bf32997c557f..5076d0155bc3 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -464,30 +464,32 @@ static void vfio_bar_fixup(struct vfio_pci_device *vdev) { struct pci_dev *pdev = 
vdev->pdev; int i; - __le32 *bar; + __le32 *vbar; u64 mask; - bar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0]; + vbar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0]; - for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++, bar++) { - if (!pci_resource_start(pdev, i)) { - *bar = 0; /* Unmapped by host = unimplemented to user */ + for (i = 0; i < PCI_STD_NUM_BARS; i++, vbar++) { + int bar = i + PCI_STD_RESOURCES; + + if (!pci_resource_start(pdev, bar)) { + *vbar = 0; /* Unmapped by host = unimplemented to user */ continue; } - mask = ~(pci_resource_len(pdev, i) - 1); + mask = ~(pci_resource_len(pdev, bar) - 1); - *bar &= cpu_to_le32((u32)mask); - *bar |= vfio_generate_bar_flags(pdev, i); + *vbar &= cpu_to_le32((u32)mask); + *vbar |= vfio_generate_bar_flags(pdev, bar); - if (*bar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) { - bar++; - *bar &= cpu_to_le32((u32)(mask >> 32)); + if (*vbar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) { + vbar++; + *vbar &= cpu_to_le32((u32)(mask >> 32)); i++; } } - bar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS]; + vbar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS]; /* * NB. REGION_INFO will have reported zero size if we weren't able @@ -497,14 +499,14 @@ static void vfio_bar_fixup(struct vfio_pci_device *vdev) if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1); mask |= PCI_ROM_ADDRESS_ENABLE; - *bar &= cpu_to_le32((u32)mask); + *vbar &= cpu_to_le32((u32)mask); } else if (pdev->resource[PCI_ROM_RESOURCE].flags & IORESOURCE_ROM_SHADOW) { mask = ~(0x20000 - 1); mask |= PCI_ROM_ADDRESS_ENABLE; - *bar &= cpu_to_le32((u32)mask); + *vbar &= cpu_to_le32((u32)mask); } else - *bar = 0; + *vbar = 0; vdev->bardirty = false; } diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 987b4d311fde..dc50a3aa508c 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -91,8 +91,8 @@ struct vfio_pci_mmap_vma { struct vfio_pci_device { struct pci_dev *pdev; - void __iomem *barmap[PCI_STD_RESOURCE_END + 1]; - bool bar_mmap_supported[PCI_STD_RESOURCE_END + 1]; + void __iomem *barmap[PCI_STD_NUM_BARS]; + bool bar_mmap_supported[PCI_STD_NUM_BARS]; u8 *pci_config_map; u8 *vconfig; struct perm_bits *msi_perm; diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index bf76dadbed87..e3807846e548 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -1774,7 +1774,7 @@ int remove_conflicting_pci_framebuffers(struct pci_dev *pdev, int res_id, const int err, idx, bar; bool res_id_found = false; - for (idx = 0, bar = 0; bar < PCI_ROM_RESOURCE; bar++) { + for (idx = 0, bar = 0; bar < PCI_STD_NUM_BARS; bar++) { if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) continue; idx++; @@ -1784,7 +1784,7 @@ int remove_conflicting_pci_framebuffers(struct pci_dev *pdev, int res_id, const if (!ap) return -ENOMEM; - for (idx = 0, bar = 0; bar < PCI_ROM_RESOURCE; bar++) { + for (idx = 0, bar = 0; bar < PCI_STD_NUM_BARS; bar++) { if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) continue; ap->ranges[idx].base = pci_resource_start(pdev, bar); diff --git a/drivers/video/fbdev/efifb.c b/drivers/video/fbdev/efifb.c index e0cbf5b3d217..1cad8379bc61 100644 --- a/drivers/video/fbdev/efifb.c +++ b/drivers/video/fbdev/efifb.c @@ -653,7 +653,7 @@ static void efifb_fixup_resources(struct pci_dev *dev) if (!base) return; - for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { struct 
resource *res = &dev->resource[i]; if (!(res->flags & IORESOURCE_MEM)) diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 0c12d69dde92..c8e39607dbb7 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -120,7 +120,7 @@ struct pci_epc_features { unsigned int msix_capable : 1; u8 reserved_bar; u8 bar_fixed_64bit; - u64 bar_fixed_size[BAR_5 + 1]; + u64 bar_fixed_size[PCI_STD_NUM_BARS]; size_t align; }; diff --git a/include/linux/pci.h b/include/linux/pci.h index b808b12d0761..4f61bbd9f943 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -83,7 +83,7 @@ enum pci_mmap_state { enum { /* #0-5: standard PCI resources */ PCI_STD_RESOURCES, - PCI_STD_RESOURCE_END = 5, + PCI_STD_RESOURCE_END = PCI_STD_RESOURCES + PCI_STD_NUM_BARS - 1, /* #6: expansion ROM resource */ PCI_ROM_RESOURCE, diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 340cbf95065f..4d02ca9da6eb 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -34,6 +34,7 @@ * of which the first 64 bytes are standardized as follows: */ #define PCI_STD_HEADER_SIZEOF 64 +#define PCI_STD_NUM_BARS 6 /* Number of standard BARs */ #define PCI_VENDOR_ID 0x00 /* 16 bits */ #define PCI_DEVICE_ID 0x02 /* 16 bits */ #define PCI_COMMAND 0x04 /* 16 bits */ diff --git a/lib/devres.c b/lib/devres.c index 77c80ca9e485..36a467ec7f49 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -290,7 +290,7 @@ EXPORT_SYMBOL(devm_ioport_unmap); /* * PCI iomap devres */ -#define PCIM_IOMAP_MAX PCI_ROM_RESOURCE +#define PCIM_IOMAP_MAX PCI_STD_NUM_BARS struct pcim_iomap_devres { void __iomem *table[PCIM_IOMAP_MAX]; -- Gitee From cb978c4c55c38bca0b8e55f51cd170b90d158495 Mon Sep 17 00:00:00 2001 From: Ben Luo Date: Thu, 3 Oct 2019 11:49:42 +0800 Subject: [PATCH 1518/2161] vfio/type1: remove hugepage checks in is_invalid_reserved_pfn() commit 026948f01eac4dda130aa623b7414375633fe7c1 upstream. Currently, no hugepage split code can transfer the reserved bit from head to tail during the split, so checking the head can't make a difference in a racing condition with hugepage spliting. The buddy wouldn't allow a driver to allocate an hugepage if any subpage is reserved in the e820 map at boot, if any driver sets the reserved bit of head page before mapping the hugepage in userland, it needs to set the reserved bit in all subpages to be safe. Signed-off-by: Ben Luo Reviewed-by: Andrea Arcangeli Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 660ed3f679e6..01a99fa81563 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -295,31 +295,13 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async) * Some mappings aren't backed by a struct page, for example an mmap'd * MMIO range for our own or another device. These use a different * pfn conversion and shouldn't be tracked as locked pages. + * For compound pages, any driver that sets the reserved bit in head + * page needs to set the reserved bit in all subpages to be safe. 
*/ static bool is_invalid_reserved_pfn(unsigned long pfn) { - if (pfn_valid(pfn)) { - bool reserved; - struct page *tail = pfn_to_page(pfn); - struct page *head = compound_head(tail); - reserved = !!(PageReserved(head)); - if (head != tail) { - /* - * "head" is not a dangling pointer - * (compound_head takes care of that) - * but the hugepage may have been split - * from under us (and we may not hold a - * reference count on the head page so it can - * be reused before we run PageReferenced), so - * we've to check PageTail before returning - * what we just read. - */ - smp_rmb(); - if (PageTail(tail)) - return reserved; - } - return PageReserved(tail); - } + if (pfn_valid(pfn)) + return PageReserved(pfn_to_page(pfn)); return true; } -- Gitee From 567404155edca98d6fa55b96c2696310e95e6f2f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 11 Sep 2018 17:23:00 +0200 Subject: [PATCH 1519/2161] compat_ioctl: move drivers to compat_ptr_ioctl commit 407e9ef72476e64937ebec44cc835e03a25fb408 upstream. Each of these drivers has a copy of the same trivial helper function to convert the pointer argument and then call the native ioctl handler. We now have a generic implementation of that, so use it. Acked-by: Greg Kroah-Hartman Acked-by: Michael S. Tsirkin Acked-by: David S. Miller Acked-by: Jarkko Sakkinen Reviewed-by: Jarkko Sakkinen Reviewed-by: Jason Gunthorpe Reviewed-by: Jiri Kosina Reviewed-by: Stefan Hajnoczi Reviewed-by: Cornelia Huck Signed-off-by: Arnd Bergmann Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/char/ppdev.c | 12 +--------- drivers/char/tpm/tpm_vtpm_proxy.c | 12 +--------- drivers/firewire/core-cdev.c | 12 +--------- drivers/hid/usbhid/hiddev.c | 11 +-------- drivers/hwtracing/stm/core.c | 12 +--------- drivers/misc/mei/main.c | 22 +---------------- drivers/mtd/ubi/cdev.c | 36 +++------------------------- drivers/net/tap.c | 12 +--------- drivers/staging/pi433/pi433_if.c | 12 +--------- drivers/usb/core/devio.c | 16 +------------ drivers/vfio/vfio.c | 39 +++---------------------------- drivers/vhost/net.c | 12 +--------- drivers/vhost/scsi.c | 12 +--------- drivers/vhost/test.c | 12 +--------- drivers/vhost/vsock.c | 12 +--------- fs/fat/file.c | 13 +---------- 16 files changed, 20 insertions(+), 237 deletions(-) diff --git a/drivers/char/ppdev.c b/drivers/char/ppdev.c index 34bb88fe0b0a..2c2381a806ae 100644 --- a/drivers/char/ppdev.c +++ b/drivers/char/ppdev.c @@ -678,14 +678,6 @@ static long pp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return ret; } -#ifdef CONFIG_COMPAT -static long pp_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - return pp_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); -} -#endif - static int pp_open(struct inode *inode, struct file *file) { unsigned int minor = iminor(inode); @@ -794,9 +786,7 @@ static const struct file_operations pp_fops = { .write = pp_write, .poll = pp_poll, .unlocked_ioctl = pp_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = pp_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .open = pp_open, .release = pp_release, }; diff --git a/drivers/char/tpm/tpm_vtpm_proxy.c b/drivers/char/tpm/tpm_vtpm_proxy.c index 2f6e087ec496..91c772e38bb5 100644 --- a/drivers/char/tpm/tpm_vtpm_proxy.c +++ b/drivers/char/tpm/tpm_vtpm_proxy.c @@ -670,20 +670,10 @@ static long vtpmx_fops_ioctl(struct file *f, unsigned int ioctl, } } -#ifdef CONFIG_COMPAT -static long vtpmx_fops_compat_ioctl(struct file *f, unsigned int ioctl, - unsigned long arg) -{ - return vtpmx_fops_ioctl(f, 
ioctl, (unsigned long)compat_ptr(arg)); -} -#endif - static const struct file_operations vtpmx_fops = { .owner = THIS_MODULE, .unlocked_ioctl = vtpmx_fops_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = vtpmx_fops_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 1da7ba18d399..c777088f5828 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -1646,14 +1646,6 @@ static long fw_device_op_ioctl(struct file *file, return dispatch_ioctl(file->private_data, cmd, (void __user *)arg); } -#ifdef CONFIG_COMPAT -static long fw_device_op_compat_ioctl(struct file *file, - unsigned int cmd, unsigned long arg) -{ - return dispatch_ioctl(file->private_data, cmd, compat_ptr(arg)); -} -#endif - static int fw_device_op_mmap(struct file *file, struct vm_area_struct *vma) { struct client *client = file->private_data; @@ -1795,7 +1787,5 @@ const struct file_operations fw_device_ops = { .mmap = fw_device_op_mmap, .release = fw_device_op_release, .poll = fw_device_op_poll, -#ifdef CONFIG_COMPAT - .compat_ioctl = fw_device_op_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, }; diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c index 4711fb191a07..4f97e6c12059 100644 --- a/drivers/hid/usbhid/hiddev.c +++ b/drivers/hid/usbhid/hiddev.c @@ -845,13 +845,6 @@ static long hiddev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return r; } -#ifdef CONFIG_COMPAT -static long hiddev_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - return hiddev_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); -} -#endif - static const struct file_operations hiddev_fops = { .owner = THIS_MODULE, .read = hiddev_read, @@ -861,9 +854,7 @@ static const struct file_operations hiddev_fops = { .release = hiddev_release, .unlocked_ioctl = hiddev_ioctl, .fasync = hiddev_fasync, -#ifdef CONFIG_COMPAT - .compat_ioctl = hiddev_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; diff --git a/drivers/hwtracing/stm/core.c b/drivers/hwtracing/stm/core.c index 603b83ac5085..2712e699ba08 100644 --- a/drivers/hwtracing/stm/core.c +++ b/drivers/hwtracing/stm/core.c @@ -832,23 +832,13 @@ stm_char_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } -#ifdef CONFIG_COMPAT -static long -stm_char_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - return stm_char_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); -} -#else -#define stm_char_compat_ioctl NULL -#endif - static const struct file_operations stm_fops = { .open = stm_char_open, .release = stm_char_release, .write = stm_char_write, .mmap = stm_char_mmap, .unlocked_ioctl = stm_char_ioctl, - .compat_ioctl = stm_char_compat_ioctl, + .compat_ioctl = compat_ptr_ioctl, .llseek = no_llseek, }; diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c index 7310b476323c..133fa8cbb1c8 100644 --- a/drivers/misc/mei/main.c +++ b/drivers/misc/mei/main.c @@ -532,24 +532,6 @@ static long mei_ioctl(struct file *file, unsigned int cmd, unsigned long data) return rets; } -/** - * mei_compat_ioctl - the compat IOCTL function - * - * @file: pointer to file structure - * @cmd: ioctl command - * @data: pointer to mei message structure - * - * Return: 0 on success , <0 on error - */ -#ifdef CONFIG_COMPAT -static long mei_compat_ioctl(struct file *file, - unsigned int cmd, unsigned long data) -{ - return mei_ioctl(file, cmd, (unsigned 
long)compat_ptr(data)); -} -#endif - - /** * mei_poll - the poll function * @@ -898,9 +880,7 @@ static const struct file_operations mei_fops = { .owner = THIS_MODULE, .read = mei_read, .unlocked_ioctl = mei_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = mei_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .open = mei_open, .release = mei_release, .write = mei_write, diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c index 1b77fff9f892..cc9a28cf9d82 100644 --- a/drivers/mtd/ubi/cdev.c +++ b/drivers/mtd/ubi/cdev.c @@ -1078,36 +1078,6 @@ static long ctrl_cdev_ioctl(struct file *file, unsigned int cmd, return err; } -#ifdef CONFIG_COMPAT -static long vol_cdev_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - unsigned long translated_arg = (unsigned long)compat_ptr(arg); - - return vol_cdev_ioctl(file, cmd, translated_arg); -} - -static long ubi_cdev_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - unsigned long translated_arg = (unsigned long)compat_ptr(arg); - - return ubi_cdev_ioctl(file, cmd, translated_arg); -} - -static long ctrl_cdev_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - unsigned long translated_arg = (unsigned long)compat_ptr(arg); - - return ctrl_cdev_ioctl(file, cmd, translated_arg); -} -#else -#define vol_cdev_compat_ioctl NULL -#define ubi_cdev_compat_ioctl NULL -#define ctrl_cdev_compat_ioctl NULL -#endif - /* UBI volume character device operations */ const struct file_operations ubi_vol_cdev_operations = { .owner = THIS_MODULE, @@ -1118,7 +1088,7 @@ const struct file_operations ubi_vol_cdev_operations = { .write = vol_cdev_write, .fsync = vol_cdev_fsync, .unlocked_ioctl = vol_cdev_ioctl, - .compat_ioctl = vol_cdev_compat_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; /* UBI character device operations */ @@ -1126,13 +1096,13 @@ const struct file_operations ubi_cdev_operations = { .owner = THIS_MODULE, .llseek = no_llseek, .unlocked_ioctl = ubi_cdev_ioctl, - .compat_ioctl = ubi_cdev_compat_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; /* UBI control character device operations */ const struct file_operations ubi_ctrl_cdev_operations = { .owner = THIS_MODULE, .unlocked_ioctl = ctrl_cdev_ioctl, - .compat_ioctl = ctrl_cdev_compat_ioctl, + .compat_ioctl = compat_ptr_ioctl, .llseek = no_llseek, }; diff --git a/drivers/net/tap.c b/drivers/net/tap.c index f285422a8071..1fdea03273d4 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1122,14 +1122,6 @@ static long tap_ioctl(struct file *file, unsigned int cmd, } } -#ifdef CONFIG_COMPAT -static long tap_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - return tap_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); -} -#endif - static const struct file_operations tap_fops = { .owner = THIS_MODULE, .open = tap_open, @@ -1139,9 +1131,7 @@ static const struct file_operations tap_fops = { .poll = tap_poll, .llseek = no_llseek, .unlocked_ioctl = tap_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = tap_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, }; static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) diff --git a/drivers/staging/pi433/pi433_if.c b/drivers/staging/pi433/pi433_if.c index 40c6f4e7632f..313d22f6210f 100644 --- a/drivers/staging/pi433/pi433_if.c +++ b/drivers/staging/pi433/pi433_if.c @@ -928,16 +928,6 @@ pi433_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return 0; } -#ifdef CONFIG_COMPAT -static long -pi433_compat_ioctl(struct file *filp, unsigned int 
cmd, unsigned long arg) -{ - return pi433_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); -} -#else -#define pi433_compat_ioctl NULL -#endif /* CONFIG_COMPAT */ - /*-------------------------------------------------------------------------*/ static int pi433_open(struct inode *inode, struct file *filp) @@ -1094,7 +1084,7 @@ static const struct file_operations pi433_fops = { .write = pi433_write, .read = pi433_read, .unlocked_ioctl = pi433_ioctl, - .compat_ioctl = pi433_compat_ioctl, + .compat_ioctl = compat_ptr_ioctl, .open = pi433_open, .release = pi433_release, .llseek = no_llseek, diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 35e89460b9ca..41ce8ffa5d1b 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -2707,18 +2707,6 @@ static long usbdev_ioctl(struct file *file, unsigned int cmd, return ret; } -#ifdef CONFIG_COMPAT -static long usbdev_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - int ret; - - ret = usbdev_do_ioctl(file, cmd, compat_ptr(arg)); - - return ret; -} -#endif - /* No kernel lock - fine */ static __poll_t usbdev_poll(struct file *file, struct poll_table_struct *wait) @@ -2742,9 +2730,7 @@ const struct file_operations usbdev_file_operations = { .read = usbdev_read, .poll = usbdev_poll, .unlocked_ioctl = usbdev_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = usbdev_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .mmap = usbdev_mmap, .open = usbdev_open, .release = usbdev_release, diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 388597930b64..c8482624ca34 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1184,15 +1184,6 @@ static long vfio_fops_unl_ioctl(struct file *filep, return ret; } -#ifdef CONFIG_COMPAT -static long vfio_fops_compat_ioctl(struct file *filep, - unsigned int cmd, unsigned long arg) -{ - arg = (unsigned long)compat_ptr(arg); - return vfio_fops_unl_ioctl(filep, cmd, arg); -} -#endif /* CONFIG_COMPAT */ - static int vfio_fops_open(struct inode *inode, struct file *filep) { struct vfio_container *container; @@ -1275,9 +1266,7 @@ static const struct file_operations vfio_fops = { .read = vfio_fops_read, .write = vfio_fops_write, .unlocked_ioctl = vfio_fops_unl_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = vfio_fops_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .mmap = vfio_fops_mmap, }; @@ -1556,15 +1545,6 @@ static long vfio_group_fops_unl_ioctl(struct file *filep, return ret; } -#ifdef CONFIG_COMPAT -static long vfio_group_fops_compat_ioctl(struct file *filep, - unsigned int cmd, unsigned long arg) -{ - arg = (unsigned long)compat_ptr(arg); - return vfio_group_fops_unl_ioctl(filep, cmd, arg); -} -#endif /* CONFIG_COMPAT */ - static int vfio_group_fops_open(struct inode *inode, struct file *filep) { struct vfio_group *group; @@ -1620,9 +1600,7 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) static const struct file_operations vfio_group_fops = { .owner = THIS_MODULE, .unlocked_ioctl = vfio_group_fops_unl_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = vfio_group_fops_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .open = vfio_group_fops_open, .release = vfio_group_fops_release, }; @@ -1687,24 +1665,13 @@ static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) return device->ops->mmap(device->device_data, vma); } -#ifdef CONFIG_COMPAT -static long vfio_device_fops_compat_ioctl(struct file *filep, - unsigned int cmd, unsigned long arg) -{ - arg = (unsigned long)compat_ptr(arg); 
- return vfio_device_fops_unl_ioctl(filep, cmd, arg); -} -#endif /* CONFIG_COMPAT */ - static const struct file_operations vfio_device_fops = { .owner = THIS_MODULE, .release = vfio_device_fops_release, .read = vfio_device_fops_read, .write = vfio_device_fops_write, .unlocked_ioctl = vfio_device_fops_unl_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = vfio_device_fops_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .mmap = vfio_device_fops_mmap, }; diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index d6f9f491e69f..1734b15ebbcd 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1744,14 +1744,6 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl, } } -#ifdef CONFIG_COMPAT -static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl, - unsigned long arg) -{ - return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); -} -#endif - static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; @@ -1787,9 +1779,7 @@ static const struct file_operations vhost_net_fops = { .write_iter = vhost_net_chr_write_iter, .poll = vhost_net_chr_poll, .unlocked_ioctl = vhost_net_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = vhost_net_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .open = vhost_net_open, .llseek = noop_llseek, }; diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 98c484149ac7..02fc1a5c0ab0 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1715,21 +1715,11 @@ vhost_scsi_ioctl(struct file *f, } } -#ifdef CONFIG_COMPAT -static long vhost_scsi_compat_ioctl(struct file *f, unsigned int ioctl, - unsigned long arg) -{ - return vhost_scsi_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); -} -#endif - static const struct file_operations vhost_scsi_fops = { .owner = THIS_MODULE, .release = vhost_scsi_release, .unlocked_ioctl = vhost_scsi_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = vhost_scsi_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .open = vhost_scsi_open, .llseek = noop_llseek, }; diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c index 056308008288..e37c92d4d7ad 100644 --- a/drivers/vhost/test.c +++ b/drivers/vhost/test.c @@ -304,21 +304,11 @@ static long vhost_test_ioctl(struct file *f, unsigned int ioctl, } } -#ifdef CONFIG_COMPAT -static long vhost_test_compat_ioctl(struct file *f, unsigned int ioctl, - unsigned long arg) -{ - return vhost_test_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); -} -#endif - static const struct file_operations vhost_test_fops = { .owner = THIS_MODULE, .release = vhost_test_release, .unlocked_ioctl = vhost_test_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = vhost_test_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .open = vhost_test_open, .llseek = noop_llseek, }; diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index f21f5bfbb78d..1f370e5be448 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -814,23 +814,13 @@ static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl, } } -#ifdef CONFIG_COMPAT -static long vhost_vsock_dev_compat_ioctl(struct file *f, unsigned int ioctl, - unsigned long arg) -{ - return vhost_vsock_dev_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); -} -#endif - static const struct file_operations vhost_vsock_fops = { .owner = THIS_MODULE, .open = vhost_vsock_dev_open, .release = vhost_vsock_dev_release, .llseek = noop_llseek, .unlocked_ioctl = vhost_vsock_dev_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = 
vhost_vsock_dev_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, }; static struct miscdevice vhost_vsock_misc = { diff --git a/fs/fat/file.c b/fs/fat/file.c index 4614c0ba5f1c..bdc4503c00a3 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -172,15 +172,6 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } } -#ifdef CONFIG_COMPAT -static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) - -{ - return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); -} -#endif - static int fat_file_release(struct inode *inode, struct file *filp) { if ((filp->f_mode & FMODE_WRITE) && @@ -215,9 +206,7 @@ const struct file_operations fat_file_operations = { .mmap = generic_file_mmap, .release = fat_file_release, .unlocked_ioctl = fat_generic_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = fat_generic_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, -- Gitee From 03229ed266bf8fa0ebe018f5b2e8f430e714d005 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 8 Jun 2022 19:49:46 +0800 Subject: [PATCH 1520/2161] remove ioremap_nocache and devm_ioremap_nocache commit 4bdc0d676a643140bdf17dbf7eafedee3d496a3c upstream. ioremap has provided non-cached semantics by default since the Linux 2.6 days, so remove the additional ioremap_nocache interface. Signed-off-by: Christoph Hellwig Acked-by: Arnd Bergmann Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_rdwr.c | 2 +- drivers/vfio/platform/reset/vfio_platform_amdxgbe.c | 4 ++-- drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c | 2 +- drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c | 2 +- drivers/vfio/platform/vfio_platform_common.c | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 83f81d24df78..916b184df3a5 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -246,7 +246,7 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, switch ((u32)pos) { case 0xa0000 ... 
0xbffff: count = min(count, (size_t)(0xc0000 - pos)); - iomem = ioremap_nocache(0xa0000, 0xbffff - 0xa0000 + 1); + iomem = ioremap(0xa0000, 0xbffff - 0xa0000 + 1); off = pos - 0xa0000; rsrc = VGA_RSRC_LEGACY_MEM; is_ioport = false; diff --git a/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c b/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c index 2d2babe21b2f..40d4fb9276ba 100644 --- a/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c +++ b/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c @@ -54,13 +54,13 @@ static int vfio_platform_amdxgbe_reset(struct vfio_platform_device *vdev) if (!xgmac_regs->ioaddr) { xgmac_regs->ioaddr = - ioremap_nocache(xgmac_regs->addr, xgmac_regs->size); + ioremap(xgmac_regs->addr, xgmac_regs->size); if (!xgmac_regs->ioaddr) return -ENOMEM; } if (!xpcs_regs->ioaddr) { xpcs_regs->ioaddr = - ioremap_nocache(xpcs_regs->addr, xpcs_regs->size); + ioremap(xpcs_regs->addr, xpcs_regs->size); if (!xpcs_regs->ioaddr) return -ENOMEM; } diff --git a/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c b/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c index 16165a62b86d..96064ef8f629 100644 --- a/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c +++ b/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c @@ -82,7 +82,7 @@ static int vfio_platform_bcmflexrm_reset(struct vfio_platform_device *vdev) /* Map FlexRM ring registers if not mapped */ if (!reg->ioaddr) { - reg->ioaddr = ioremap_nocache(reg->addr, reg->size); + reg->ioaddr = ioremap(reg->addr, reg->size); if (!reg->ioaddr) return -ENOMEM; } diff --git a/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c b/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c index f67bab547501..09a9453b75c5 100644 --- a/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c +++ b/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c @@ -52,7 +52,7 @@ static int vfio_platform_calxedaxgmac_reset(struct vfio_platform_device *vdev) if (!reg->ioaddr) { reg->ioaddr = - ioremap_nocache(reg->addr, reg->size); + ioremap(reg->addr, reg->size); if (!reg->ioaddr) return -ENOMEM; } diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 152e5188183c..fb4b385191f2 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -408,7 +408,7 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg, if (!reg->ioaddr) { reg->ioaddr = - ioremap_nocache(reg->addr, reg->size); + ioremap(reg->addr, reg->size); if (!reg->ioaddr) return -ENOMEM; @@ -485,7 +485,7 @@ static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg, if (!reg->ioaddr) { reg->ioaddr = - ioremap_nocache(reg->addr, reg->size); + ioremap(reg->addr, reg->size); if (!reg->ioaddr) return -ENOMEM; -- Gitee From 8f3fc881097bb639dd0bb0f630edcdb235d1b57e Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 29 Dec 2019 16:42:56 +0100 Subject: [PATCH 1521/2161] vfio: vfio_pci_nvlink2: use mmgrab commit bb3d3cf928d4cc450aa817a9c71f4324ec68fd63 upstream. Mmgrab was introduced in commit f1f1007644ff ("mm: add new mmgrab() helper") and most of the kernel was updated to use it. Update a remaining file. 
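Aside, in case the helper is unfamiliar: mmgrab() pins the mm_struct itself by taking a reference on mm_count, exactly as the open-coded atomic_inc() did, and is released with mmdrop(); it does not pin the address space the way mm_users references do. A minimal sketch of that pairing, with an invented holder structure and invented function names:

#include <linux/sched/mm.h>

/* Invented example: keep an mm_struct alive for a long-lived consumer. */
struct example_mm_holder {
        struct mm_struct *mm;
};

static void example_holder_attach(struct example_mm_holder *h,
                                  struct mm_struct *mm)
{
        h->mm = mm;
        mmgrab(mm);             /* was: atomic_inc(&mm->mm_count); */
}

static void example_holder_detach(struct example_mm_holder *h)
{
        mmdrop(h->mm);          /* drop the reference taken by mmgrab() */
        h->mm = NULL;
}

With that context, the mechanical conversion described next is easier to read.
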
The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) @@ expression e; @@ - atomic_inc(&e->mm_count); + mmgrab(e); Signed-off-by: Julia Lawall Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_nvlink2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci_nvlink2.c b/drivers/vfio/pci/vfio_pci_nvlink2.c index 08f17839c2fe..dbaffa7fcb2f 100644 --- a/drivers/vfio/pci/vfio_pci_nvlink2.c +++ b/drivers/vfio/pci/vfio_pci_nvlink2.c @@ -161,7 +161,7 @@ static int vfio_pci_nvgpu_mmap(struct vfio_pci_device *vdev, data->useraddr = vma->vm_start; data->mm = current->mm; - atomic_inc(&data->mm->mm_count); + mmgrab(data->mm); ret = (int) mm_iommu_newdev(data->mm, data->useraddr, vma_pages(vma), data->gpu_hpa, &data->mem); -- Gitee From 98174596c75c8635a2a84d7f5301e24da9dd436f Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 29 Dec 2019 16:42:57 +0100 Subject: [PATCH 1522/2161] vfio/spapr_tce: use mmgrab commit 7a49de995ebbd8711d52d6a3806faa8c5db0eac1 upstream. Mmgrab was introduced in commit f1f1007644ff ("mm: add new mmgrab() helper") and most of the kernel was updated to use it. Update a remaining file. The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) @@ expression e; @@ - atomic_inc(&e->mm_count); + mmgrab(e); Signed-off-by: Julia Lawall Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_spapr_tce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 26cef65b41e7..16b3adc508db 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -79,7 +79,7 @@ static long tce_iommu_mm_set(struct tce_container *container) } BUG_ON(!current->mm); container->mm = current->mm; - atomic_inc(&container->mm->mm_count); + mmgrab(container->mm); return 0; } -- Gitee From e84536432aed0cd6edb175552ac9a390a540b023 Mon Sep 17 00:00:00 2001 From: "Ben Dooks (Codethink)" Date: Wed, 18 Dec 2019 12:31:19 +0000 Subject: [PATCH 1523/2161] vfio/mdev: make create attribute static commit e10b4f6cd89f3ac2044c8f6b4b0ddc1a48f65e09 upstream. The create attribute is not exported, so make it static to avoid the following sparse warning: drivers/vfio/mdev/mdev_sysfs.c:77:1: warning: symbol 'mdev_type_attr_create' was not declared. Should it be static? Signed-off-by: Ben Dooks (Codethink) Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/mdev_sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index ca9476883f15..367ff5412a38 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -74,7 +74,7 @@ static ssize_t create_store(struct kobject *kobj, struct device *dev, return count; } -MDEV_TYPE_ATTR_WO(create); +static MDEV_TYPE_ATTR_WO(create); static void mdev_type_release(struct kobject *kobj) { -- Gitee From 151894e16b4b20cd74554fcb768daf7771acba96 Mon Sep 17 00:00:00 2001 From: "Ben Dooks (Codethink)" Date: Wed, 18 Dec 2019 13:35:25 +0000 Subject: [PATCH 1524/2161] vfio: platform: fix __iomem in vfio_platform_amdxgbe.c commit 7b5372ba04ca1caabed1470d4ec23001cde2eb91 upstream. 
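Background for the sparse output quoted next: sparse tracks CPU and MMIO address spaces separately through the __iomem annotation, so a pointer obtained from ioremap() has to carry that marker all the way down to the accessor calls. A minimal sketch of an annotated accessor pair, with invented names; it is illustration only, not code from this driver:

#include <linux/io.h>

/* Invented accessor pair: the __iomem annotation is what sparse checks. */
static u32 example_reg_read(void __iomem *base, unsigned int offset)
{
        return ioread32(base + offset);
}

static void example_reg_write(void __iomem *base, unsigned int offset, u32 val)
{
        iowrite32(val, base + offset);
}
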
The ioaddr should have __iomem marker on it, so add that to fix the following sparse warnings: drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:33:44: warning: incorrect type in argument 2 (different address spaces) drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:33:44: expected void volatile [noderef] *addr drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:33:44: got void * drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:34:33: warning: incorrect type in argument 1 (different address spaces) drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:34:33: expected void const volatile [noderef] *addr drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:34:33: got void * drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:44:44: warning: incorrect type in argument 2 (different address spaces) drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:44:44: expected void volatile [noderef] *addr drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:44:44: got void * drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:45:33: warning: incorrect type in argument 2 (different address spaces) drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:45:33: expected void volatile [noderef] *addr drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:45:33: got void * drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:69:41: warning: incorrect type in argument 1 (different address spaces) drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:69:41: expected void *ioaddr drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:69:41: got void [noderef] *ioaddr drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:71:30: warning: incorrect type in argument 1 (different address spaces) drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:71:30: expected void *ioaddr drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:71:30: got void [noderef] *ioaddr drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:76:49: warning: incorrect type in argument 1 (different address spaces) drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:76:49: expected void *ioaddr drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:76:49: got void [noderef] *ioaddr drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:85:37: warning: incorrect type in argument 1 (different address spaces) drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:85:37: expected void *ioaddr drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:85:37: got void [noderef] *ioaddr drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:87:30: warning: incorrect type in argument 1 (different address spaces) drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:87:30: expected void *ioaddr drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:87:30: got void [noderef] *ioaddr drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:90:30: warning: incorrect type in argument 1 (different address spaces) drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:90:30: expected void *ioaddr drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:90:30: got void [noderef] *ioaddr drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:93:30: warning: incorrect type in argument 1 (different address spaces) drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:93:30: expected void *ioaddr drivers/vfio/platform/reset/vfio_platform_amdxgbe.c:93:30: got void [noderef] *ioaddr Signed-off-by: Ben Dooks (Codethink) Acked-by: Eric Auger Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/platform/reset/vfio_platform_amdxgbe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c b/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c index 40d4fb9276ba..abdca900802d 100644 --- a/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c +++ b/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c @@ -24,7 +24,7 @@ #define MDIO_AN_INT 0x8002 #define MDIO_AN_INTMASK 0x8001 -static unsigned int xmdio_read(void *ioaddr, unsigned int mmd, +static unsigned int xmdio_read(void __iomem *ioaddr, unsigned int mmd, unsigned int reg) { unsigned int mmd_address, value; @@ -35,7 +35,7 @@ static unsigned int xmdio_read(void *ioaddr, unsigned int mmd, return value; } -static void xmdio_write(void *ioaddr, unsigned int mmd, +static void xmdio_write(void __iomem *ioaddr, unsigned int mmd, unsigned int reg, unsigned int value) { unsigned int mmd_address; -- Gitee From 2142dc24b772cca233aae00b297a1f4cf3914030 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 30 Jan 2020 22:12:39 -0800 Subject: [PATCH 1525/2161] vfio: fix FOLL_LONGTERM use, simplify get_user_pages_remote() call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3567813eae5e9b4d02dc227e2060e85abc912045 upstream. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update VFIO to take advantage of the recently loosened restriction on FOLL_LONGTERM with get_user_pages_remote(). Also, now it is possible to fix a bug: the VFIO caller is logically a FOLL_LONGTERM user, but it wasn't setting FOLL_LONGTERM. Also, remove an unnessary pair of calls that were releasing and reacquiring the mmap_sem. There is no need to avoid holding mmap_sem just in order to call page_to_pfn(). Also, now that the the DAX check ("if a VMA is DAX, don't allow long term pinning") is in the internals of get_user_pages_remote() and __gup_longterm_locked(), there's no need for it at the VFIO call site. So remove it. Link: http://lkml.kernel.org/r/20200107224558.2362728-8-jhubbard@nvidia.com Signed-off-by: John Hubbard Tested-by: Alex Williamson Acked-by: Alex Williamson Reviewed-by: Jason Gunthorpe Reviewed-by: Ira Weiny Suggested-by: Jason Gunthorpe Cc: Dan Williams Cc: Jerome Glisse Cc: Aneesh Kumar K.V Cc: Björn Töpel Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Hans Verkuil Cc: Jan Kara Cc: Jens Axboe Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Leon Romanovsky Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 01a99fa81563..4c11e1d6adcf 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -359,7 +359,6 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, { struct page *page[1]; struct vm_area_struct *vma; - struct vm_area_struct *vmas[1]; unsigned int flags = 0; int ret; @@ -367,33 +366,14 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, flags |= FOLL_WRITE; down_read(&mm->mmap_sem); - if (mm == current->mm) { - ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page, - vmas); - } else { - ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, - vmas, NULL); - /* - * The lifetime of a vaddr_get_pfn() page pin is - * userspace-controlled. 
In the fs-dax case this could - * lead to indefinite stalls in filesystem operations. - * Disallow attempts to pin fs-dax pages via this - * interface. - */ - if (ret > 0 && vma_is_fsdax(vmas[0])) { - ret = -EOPNOTSUPP; - put_page(page[0]); - } - } - up_read(&mm->mmap_sem); - + ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags | FOLL_LONGTERM, + page, NULL, NULL); if (ret == 1) { *pfn = page_to_pfn(page[0]); - return 0; + ret = 0; + goto done; } - down_read(&mm->mmap_sem); - vaddr = untagged_addr(vaddr); retry: @@ -407,7 +387,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, if (!ret && !is_invalid_reserved_pfn(*pfn)) ret = -EFAULT; } - +done: up_read(&mm->mmap_sem); return ret; } -- Gitee From d0fe8462fe1cf94c277f01d880bb03accf9256fb Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 30 Jan 2020 22:13:24 -0800 Subject: [PATCH 1526/2161] vfio, mm: pin_user_pages (FOLL_PIN) and put_user_page() conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 19fed0dae94dbe66235c116a0e058712d8bc291c upstream. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Change vfio from get_user_pages_remote(), to pin_user_pages_remote(). 2. Because all FOLL_PIN-acquired pages must be released via put_user_page(), also convert the put_page() call over to put_user_pages_dirty_lock(). Note that this effectively changes the code's behavior in vfio_iommu_type1.c: put_pfn(): it now ultimately calls set_page_dirty_lock(), instead of set_page_dirty(). This is probably more accurate. As Christoph Hellwig put it, "set_page_dirty() is only safe if we are dealing with a file backed page where we have reference on the inode it hangs off." [1] [1] https://lore.kernel.org/r/20190723153640.GB720@lst.de Link: http://lkml.kernel.org/r/20200107224558.2362728-20-jhubbard@nvidia.com Signed-off-by: John Hubbard Tested-by: Alex Williamson Acked-by: Alex Williamson Cc: Aneesh Kumar K.V Cc: Björn Töpel Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Dan Williams Cc: Hans Verkuil Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Jerome Glisse Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Leon Romanovsky Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 4c11e1d6adcf..7f51ea3bdffa 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -310,9 +310,8 @@ static int put_pfn(unsigned long pfn, int prot) { if (!is_invalid_reserved_pfn(pfn)) { struct page *page = pfn_to_page(pfn); - if (prot & IOMMU_WRITE) - SetPageDirty(page); - put_page(page); + + put_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE); return 1; } return 0; -- Gitee From f22075add1a638ad216352b8c9d952505c180cf1 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Tue, 24 Mar 2020 09:27:56 -0600 Subject: [PATCH 1527/2161] vfio: allow external user to get vfio group from device commit c0560f51cf77472f4ed113539b0a02ca6cda7961 upstream. external user calls vfio_group_get_external_user_from_dev() with a device pointer to get the VFIO group associated with this device. The VFIO group is checked to be vialbe and have IOMMU set. 
Then container user counter is increased and VFIO group reference is hold to prevent the VFIO group from disposal before external user exits. when the external user finishes using of the VFIO group, it calls vfio_group_put_external_user() to dereference the VFIO group and the container user counter. Suggested-by: Alex Williamson Signed-off-by: Yan Zhao Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio.c | 38 ++++++++++++++++++++++++++++++++++++++ include/linux/vfio.h | 2 ++ 2 files changed, 40 insertions(+) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index c8482624ca34..97b972bfb735 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1720,6 +1720,44 @@ struct vfio_group *vfio_group_get_external_user(struct file *filep) } EXPORT_SYMBOL_GPL(vfio_group_get_external_user); +/** + * External user API, exported by symbols to be linked dynamically. + * The external user passes in a device pointer + * to verify that: + * - A VFIO group is assiciated with the device; + * - IOMMU is set for the group. + * If both checks passed, vfio_group_get_external_user_from_dev() + * increments the container user counter to prevent the VFIO group + * from disposal before external user exits and returns the pointer + * to the VFIO group. + * + * When the external user finishes using the VFIO group, it calls + * vfio_group_put_external_user() to release the VFIO group and + * decrement the container user counter. + * + * @dev [in] : device + * Return error PTR or pointer to VFIO group. + */ + +struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev) +{ + struct vfio_group *group; + int ret; + + group = vfio_group_get_from_dev(dev); + if (!group) + return ERR_PTR(-ENODEV); + + ret = vfio_group_add_container_user(group); + if (ret) { + vfio_group_put(group); + return ERR_PTR(ret); + } + + return group; +} +EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev); + void vfio_group_put_external_user(struct vfio_group *group) { vfio_group_try_dissolve_container(group); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e42a711a2800..fb71e0ac0e76 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -94,6 +94,8 @@ extern void vfio_unregister_iommu_driver( */ extern struct vfio_group *vfio_group_get_external_user(struct file *filep); extern void vfio_group_put_external_user(struct vfio_group *group); +extern struct vfio_group *vfio_group_get_external_user_from_dev(struct device + *dev); extern bool vfio_external_group_match_file(struct vfio_group *group, struct file *filep); extern int vfio_external_user_iommu_id(struct vfio_group *group); -- Gitee From 3924fda65f103c1c0dbee67bff6bbd816bedb5ee Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Tue, 24 Mar 2020 09:27:57 -0600 Subject: [PATCH 1528/2161] vfio: introduce vfio_dma_rw to read/write a range of IOVAs commit 8d46c0cca5f4dc0538173d62cd36b1119b5105bc upstream. vfio_dma_rw will read/write a range of user space memory pointed to by IOVA into/from a kernel buffer without enforcing pinning the user space memory. TODO: mark the IOVAs to user space memory dirty if they are written in vfio_dma_rw(). 
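A hedged sketch of the intended caller pattern, using only the interfaces added in this series; the helper name example_read_guest_buffer() and the driver it would live in are hypothetical. The group reference is taken before the access and dropped afterwards, as the comments added in vfio.c describe:

#include <linux/device.h>
#include <linux/err.h>
#include <linux/vfio.h>

/* Hypothetical consumer: read len bytes at a guest IOVA into a kernel buffer. */
static int example_read_guest_buffer(struct device *dev, dma_addr_t iova,
                                     void *buf, size_t len)
{
        struct vfio_group *group;
        int ret;

        group = vfio_group_get_external_user_from_dev(dev);
        if (IS_ERR(group))
                return PTR_ERR(group);

        /* write == false: copy from the IOVA range into the kernel buffer */
        ret = vfio_dma_rw(group, iova, buf, len, false);

        vfio_group_put_external_user(group);
        return ret;
}
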
Cc: Kevin Tian Signed-off-by: Yan Zhao Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio.c | 49 +++++++++++++++++++++ drivers/vfio/vfio_iommu_type1.c | 76 +++++++++++++++++++++++++++++++++ include/linux/vfio.h | 5 +++ 3 files changed, 130 insertions(+) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 97b972bfb735..6997f711b925 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1999,6 +1999,55 @@ int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage) } EXPORT_SYMBOL(vfio_unpin_pages); + +/* + * This interface allows the CPUs to perform some sort of virtual DMA on + * behalf of the device. + * + * CPUs read/write from/into a range of IOVAs pointing to user space memory + * into/from a kernel buffer. + * + * As the read/write of user space memory is conducted via the CPUs and is + * not a real device DMA, it is not necessary to pin the user space memory. + * + * The caller needs to call vfio_group_get_external_user() or + * vfio_group_get_external_user_from_dev() prior to calling this interface, + * so as to prevent the VFIO group from disposal in the middle of the call. + * But it can keep the reference to the VFIO group for several calls into + * this interface. + * After finishing using of the VFIO group, the caller needs to release the + * VFIO group by calling vfio_group_put_external_user(). + * + * @group [in] : VFIO group + * @user_iova [in] : base IOVA of a user space buffer + * @data [in] : pointer to kernel buffer + * @len [in] : kernel buffer length + * @write : indicate read or write + * Return error code on failure or 0 on success. + */ +int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova, + void *data, size_t len, bool write) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + int ret = 0; + + if (!group || !data || len <= 0) + return -EINVAL; + + container = group->container; + driver = container->iommu_driver; + + if (likely(driver && driver->ops->dma_rw)) + ret = driver->ops->dma_rw(container->iommu_data, + user_iova, data, len, write); + else + ret = -ENOTTY; + + return ret; +} +EXPORT_SYMBOL(vfio_dma_rw); + static int vfio_register_iommu_notifier(struct vfio_group *group, unsigned long *events, struct notifier_block *nb) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 7f51ea3bdffa..6808f2711b21 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -2413,6 +2414,80 @@ static int vfio_iommu_type1_unregister_notifier(void *iommu_data, return blocking_notifier_chain_unregister(&iommu->notifier, nb); } +static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, + dma_addr_t user_iova, void *data, + size_t count, bool write, + size_t *copied) +{ + struct mm_struct *mm; + unsigned long vaddr; + struct vfio_dma *dma; + bool kthread = current->mm == NULL; + size_t offset; + + *copied = 0; + + dma = vfio_find_dma(iommu, user_iova, 1); + if (!dma) + return -EINVAL; + + if ((write && !(dma->prot & IOMMU_WRITE)) || + !(dma->prot & IOMMU_READ)) + return -EPERM; + + mm = get_task_mm(dma->task); + + if (!mm) + return -EPERM; + + if (kthread) + use_mm(mm); + else if (current->mm != mm) + goto out; + + offset = user_iova - dma->iova; + + if (count > dma->size - offset) + count = dma->size - offset; + + vaddr = dma->vaddr + offset; + + if (write) + *copied = __copy_to_user((void __user *)vaddr, data, + 
count) ? 0 : count; + else + *copied = __copy_from_user(data, (void __user *)vaddr, + count) ? 0 : count; + if (kthread) + unuse_mm(mm); +out: + mmput(mm); + return *copied ? 0 : -EFAULT; +} + +static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova, + void *data, size_t count, bool write) +{ + struct vfio_iommu *iommu = iommu_data; + int ret = 0; + size_t done; + + mutex_lock(&iommu->lock); + while (count > 0) { + ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data, + count, write, &done); + if (ret) + break; + + count -= done; + data += done; + user_iova += done; + } + + mutex_unlock(&iommu->lock); + return ret; +} + static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { .name = "vfio-iommu-type1", .owner = THIS_MODULE, @@ -2425,6 +2500,7 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { .unpin_pages = vfio_iommu_type1_unpin_pages, .register_notifier = vfio_iommu_type1_register_notifier, .unregister_notifier = vfio_iommu_type1_unregister_notifier, + .dma_rw = vfio_iommu_type1_dma_rw, }; static int __init vfio_iommu_type1_init(void) diff --git a/include/linux/vfio.h b/include/linux/vfio.h index fb71e0ac0e76..34b2fdf4de6e 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -82,6 +82,8 @@ struct vfio_iommu_driver_ops { struct notifier_block *nb); int (*unregister_notifier)(void *iommu_data, struct notifier_block *nb); + int (*dma_rw)(void *iommu_data, dma_addr_t user_iova, + void *data, size_t count, bool write); }; extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); @@ -109,6 +111,9 @@ extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage); +extern int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova, + void *data, size_t len, bool write); + /* each type has independent events */ enum vfio_notify_type { VFIO_IOMMU_NOTIFY = 0, -- Gitee From 130757cd556cab890e8e406b459943a327ddb63f Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Tue, 24 Mar 2020 09:27:57 -0600 Subject: [PATCH 1529/2161] vfio: avoid inefficient operations on VFIO group in vfio_pin/unpin_pages commit 40280cf7e8ca7d31bb0a9d626f36f458fec32815 upstream. vfio_group_pin_pages() and vfio_group_unpin_pages() are introduced to avoid inefficient search/check/ref/deref opertions associated with VFIO group as those in each calling into vfio_pin_pages() and vfio_unpin_pages(). VFIO group is taken as arg directly. The callers combine search/check/ref/deref operations associated with VFIO group by calling vfio_group_get_external_user()/vfio_group_get_external_user_from_dev() beforehand, and vfio_group_put_external_user() afterwards. Suggested-by: Alex Williamson Signed-off-by: Yan Zhao Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio.c | 91 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/vfio.h | 6 +++ 2 files changed, 97 insertions(+) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 6997f711b925..210fcf426643 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1999,6 +1999,97 @@ int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage) } EXPORT_SYMBOL(vfio_unpin_pages); +/* + * Pin a set of guest IOVA PFNs and return their associated host PFNs for a + * VFIO group. 
+ * + * The caller needs to call vfio_group_get_external_user() or + * vfio_group_get_external_user_from_dev() prior to calling this interface, + * so as to prevent the VFIO group from disposal in the middle of the call. + * But it can keep the reference to the VFIO group for several calls into + * this interface. + * After finishing using of the VFIO group, the caller needs to release the + * VFIO group by calling vfio_group_put_external_user(). + * + * @group [in] : VFIO group + * @user_iova_pfn [in] : array of user/guest IOVA PFNs to be pinned. + * @npage [in] : count of elements in user_iova_pfn array. + * This count should not be greater + * VFIO_PIN_PAGES_MAX_ENTRIES. + * @prot [in] : protection flags + * @phys_pfn [out] : array of host PFNs + * Return error or number of pages pinned. + */ +int vfio_group_pin_pages(struct vfio_group *group, + unsigned long *user_iova_pfn, int npage, + int prot, unsigned long *phys_pfn) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + int ret; + + if (!group || !user_iova_pfn || !phys_pfn || !npage) + return -EINVAL; + + if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) + return -E2BIG; + + container = group->container; + driver = container->iommu_driver; + if (likely(driver && driver->ops->pin_pages)) + ret = driver->ops->pin_pages(container->iommu_data, + user_iova_pfn, npage, + prot, phys_pfn); + else + ret = -ENOTTY; + + return ret; +} +EXPORT_SYMBOL(vfio_group_pin_pages); + +/* + * Unpin a set of guest IOVA PFNs for a VFIO group. + * + * The caller needs to call vfio_group_get_external_user() or + * vfio_group_get_external_user_from_dev() prior to calling this interface, + * so as to prevent the VFIO group from disposal in the middle of the call. + * But it can keep the reference to the VFIO group for several calls into + * this interface. + * After finishing using of the VFIO group, the caller needs to release the + * VFIO group by calling vfio_group_put_external_user(). + * + * @group [in] : vfio group + * @user_iova_pfn [in] : array of user/guest IOVA PFNs to be unpinned. + * @npage [in] : count of elements in user_iova_pfn array. + * This count should not be greater than + * VFIO_PIN_PAGES_MAX_ENTRIES. + * Return error or number of pages unpinned. 
+ */ +int vfio_group_unpin_pages(struct vfio_group *group, + unsigned long *user_iova_pfn, int npage) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + int ret; + + if (!group || !user_iova_pfn || !npage) + return -EINVAL; + + if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) + return -E2BIG; + + container = group->container; + driver = container->iommu_driver; + if (likely(driver && driver->ops->unpin_pages)) + ret = driver->ops->unpin_pages(container->iommu_data, + user_iova_pfn, npage); + else + ret = -ENOTTY; + + return ret; +} +EXPORT_SYMBOL(vfio_group_unpin_pages); + /* * This interface allows the CPUs to perform some sort of virtual DMA on diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 34b2fdf4de6e..be2bd358b952 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -111,6 +111,12 @@ extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage); +extern int vfio_group_pin_pages(struct vfio_group *group, + unsigned long *user_iova_pfn, int npage, + int prot, unsigned long *phys_pfn); +extern int vfio_group_unpin_pages(struct vfio_group *group, + unsigned long *user_iova_pfn, int npage); + extern int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova, void *data, size_t len, bool write); -- Gitee From 70a942857ea7dd0b5eaac8570d260d52bb76e5c9 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 24 Mar 2020 09:28:25 -0600 Subject: [PATCH 1530/2161] vfio: Include optional device match in vfio_device_ops callbacks commit 5f3874c2a2310d9bd6969ca6764961d27a843b9d upstream. Allow bus drivers to provide their own callback to match a device to the user provided string. Reviewed-by: Cornelia Huck Reviewed-by: Kevin Tian Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio.c | 20 ++++++++++++++++---- include/linux/vfio.h | 4 ++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 210fcf426643..765e0e5d83ed 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -875,11 +875,23 @@ EXPORT_SYMBOL_GPL(vfio_device_get_from_dev); static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, char *buf) { - struct vfio_device *it, *device = NULL; + struct vfio_device *it, *device = ERR_PTR(-ENODEV); mutex_lock(&group->device_lock); list_for_each_entry(it, &group->device_list, group_next) { - if (!strcmp(dev_name(it->dev), buf)) { + int ret; + + if (it->ops->match) { + ret = it->ops->match(it->device_data, buf); + if (ret < 0) { + device = ERR_PTR(ret); + break; + } + } else { + ret = !strcmp(dev_name(it->dev), buf); + } + + if (ret) { device = it; vfio_device_get(device); break; @@ -1430,8 +1442,8 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) return -EPERM; device = vfio_device_get_from_name(group, buf); - if (!device) - return -ENODEV; + if (IS_ERR(device)) + return PTR_ERR(device); ret = device->ops->open(device->device_data); if (ret) { diff --git a/include/linux/vfio.h b/include/linux/vfio.h index be2bd358b952..5d92ee15d098 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -26,6 +26,9 @@ * operations documented below * @mmap: Perform mmap(2) on a region of the device file descriptor * @request: Request for the bus driver to release the device + * @match: Optional device name match callback (return: 0 for no-match, >0 for + * match, -errno for abort (ex. 
match with insufficient or incorrect + * additional args) */ struct vfio_device_ops { char *name; @@ -39,6 +42,7 @@ struct vfio_device_ops { unsigned long arg); int (*mmap)(void *device_data, struct vm_area_struct *vma); void (*request)(void *device_data, unsigned int count); + int (*match)(void *device_data, char *buf); }; extern struct iommu_group *vfio_iommu_group_get(struct device *dev); -- Gitee From 179475494ea0b4705dab515585ae084378ce7851 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 24 Mar 2020 09:28:26 -0600 Subject: [PATCH 1531/2161] vfio/pci: Implement match ops commit 467c084f9ad393899dc7ce527dd03ee588640ae1 upstream. This currently serves the same purpose as the default implementation but will be expanded for additional functionality. Reviewed-by: Cornelia Huck Reviewed-by: Kevin Tian Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 1e835be334a0..a82829e9ac05 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -1558,6 +1558,13 @@ static void vfio_pci_request(void *device_data, unsigned int count) mutex_unlock(&vdev->igate); } +static int vfio_pci_match(void *device_data, char *buf) +{ + struct vfio_pci_device *vdev = device_data; + + return !strcmp(pci_name(vdev->pdev), buf); +} + static const struct vfio_device_ops vfio_pci_ops = { .name = "vfio-pci", .open = vfio_pci_open, @@ -1567,6 +1574,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .write = vfio_pci_write, .mmap = vfio_pci_mmap, .request = vfio_pci_request, + .match = vfio_pci_match, }; static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev); -- Gitee From 6847ecdcab7a91e0fe698f85025c2baaad3d94a5 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 24 Mar 2020 09:28:27 -0600 Subject: [PATCH 1532/2161] vfio/pci: Introduce VF token commit cc20d7999000996557333910bcc99399b7244cd9 upstream. If we enable SR-IOV on a vfio-pci owned PF, the resulting VFs are not fully isolated from the PF. The PF can always cause a denial of service to the VF, even if by simply resetting itself. The degree to which a PF can access the data passed through a VF or interfere with its operation is dependent on a given SR-IOV implementation. Therefore we want to avoid a scenario where an existing vfio-pci based userspace driver might assume the PF driver is trusted, for example assigning a PF to one VM and VF to another with some expectation of isolation. IOMMU grouping could be a solution to this, but imposes an unnecessarily strong relationship between PF and VF drivers if they need to operate with the same IOMMU context. Instead we introduce a "VF token", which is essentially just a shared secret between PF and VF drivers, implemented as a UUID. The VF token can be set by a vfio-pci based PF driver and must be known by the vfio-pci based VF driver in order to gain access to the device. This allows the degree to which this VF token is considered secret to be determined by the applications and environment. For example a VM might generate a random UUID known only internally to the hypervisor while a userspace networking appliance might use a shared, or even well know, UUID among the application drivers. To incorporate this VF token, the VFIO_GROUP_GET_DEVICE_FD interface is extended to accept key=value pairs in addition to the device name. 
This allows us to most easily deny user access to the device without risk that existing userspace drivers assume region offsets, IRQs, and other device features, leading to more elaborate error paths. The format of these options are expected to take the form: "$DEVICE_NAME $OPTION1=$VALUE1 $OPTION2=$VALUE2" Where the device name is always provided first for compatibility and additional options are specified in a space separated list. The relation between and requirements for the additional options will be vfio bus driver dependent, however unknown or unused option within this schema should return error. This allow for future use of unknown options as well as a positive indication to the user that an option is used. An example VF token option would take this form: "0000:03:00.0 vf_token=2ab74924-c335-45f4-9b16-8569e5b08258" When accessing a VF where the PF is making use of vfio-pci, the user MUST provide the current vf_token. When accessing a PF, the user MUST provide the current vf_token IF there are active VF users or MAY provide a vf_token in order to set the current VF token when no VF users are active. The former requirement assures VF users that an unassociated driver cannot usurp the PF device. These semantics also imply that a VF token MUST be set by a PF driver before VF drivers can access their device, the default token is random and mechanisms to read the token are not provided in order to protect the VF token of previous users. Use of the vf_token option outside of these cases will return an error, as discussed above. Reviewed-by: Cornelia Huck Reviewed-by: Kevin Tian Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 198 +++++++++++++++++++++++++++- drivers/vfio/pci/vfio_pci_private.h | 8 ++ 2 files changed, 205 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index a82829e9ac05..8194aa367736 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -466,6 +466,44 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) vfio_pci_set_power_state(vdev, PCI_D3hot); } +static struct pci_driver vfio_pci_driver; + +static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev, + struct vfio_device **pf_dev) +{ + struct pci_dev *physfn = pci_physfn(vdev->pdev); + + if (!vdev->pdev->is_virtfn) + return NULL; + + *pf_dev = vfio_device_get_from_dev(&physfn->dev); + if (!*pf_dev) + return NULL; + + if (pci_dev_driver(physfn) != &vfio_pci_driver) { + vfio_device_put(*pf_dev); + return NULL; + } + + return vfio_device_data(*pf_dev); +} + +static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val) +{ + struct vfio_device *pf_dev; + struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev); + + if (!pf_vdev) + return; + + mutex_lock(&pf_vdev->vf_token->lock); + pf_vdev->vf_token->users += val; + WARN_ON(pf_vdev->vf_token->users < 0); + mutex_unlock(&pf_vdev->vf_token->lock); + + vfio_device_put(pf_dev); +} + static void vfio_pci_release(void *device_data) { struct vfio_pci_device *vdev = device_data; @@ -473,6 +511,7 @@ static void vfio_pci_release(void *device_data) mutex_lock(&vdev->reflck->lock); if (!(--vdev->refcnt)) { + vfio_pci_vf_token_user_add(vdev, -1); vfio_spapr_pci_eeh_release(vdev->pdev); vfio_pci_disable(vdev); mutex_lock(&vdev->igate); @@ -511,6 +550,7 @@ static int vfio_pci_open(void *device_data) goto error; vfio_spapr_pci_eeh_open(vdev->pdev); + vfio_pci_vf_token_user_add(vdev, 1); } vdev->refcnt++; 
error: @@ -1558,11 +1598,148 @@ static void vfio_pci_request(void *device_data, unsigned int count) mutex_unlock(&vdev->igate); } +static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, + bool vf_token, uuid_t *uuid) +{ + /* + * There's always some degree of trust or collaboration between SR-IOV + * PF and VFs, even if just that the PF hosts the SR-IOV capability and + * can disrupt VFs with a reset, but often the PF has more explicit + * access to deny service to the VF or access data passed through the + * VF. We therefore require an opt-in via a shared VF token (UUID) to + * represent this trust. This both prevents that a VF driver might + * assume the PF driver is a trusted, in-kernel driver, and also that + * a PF driver might be replaced with a rogue driver, unknown to in-use + * VF drivers. + * + * Therefore when presented with a VF, if the PF is a vfio device and + * it is bound to the vfio-pci driver, the user needs to provide a VF + * token to access the device, in the form of appending a vf_token to + * the device name, for example: + * + * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3" + * + * When presented with a PF which has VFs in use, the user must also + * provide the current VF token to prove collaboration with existing + * VF users. If VFs are not in use, the VF token provided for the PF + * device will act to set the VF token. + * + * If the VF token is provided but unused, an error is generated. + */ + if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token) + return 0; /* No VF token provided or required */ + + if (vdev->pdev->is_virtfn) { + struct vfio_device *pf_dev; + struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev); + bool match; + + if (!pf_vdev) { + if (!vf_token) + return 0; /* PF is not vfio-pci, no VF token */ + + pci_info_ratelimited(vdev->pdev, + "VF token incorrectly provided, PF not bound to vfio-pci\n"); + return -EINVAL; + } + + if (!vf_token) { + vfio_device_put(pf_dev); + pci_info_ratelimited(vdev->pdev, + "VF token required to access device\n"); + return -EACCES; + } + + mutex_lock(&pf_vdev->vf_token->lock); + match = uuid_equal(uuid, &pf_vdev->vf_token->uuid); + mutex_unlock(&pf_vdev->vf_token->lock); + + vfio_device_put(pf_dev); + + if (!match) { + pci_info_ratelimited(vdev->pdev, + "Incorrect VF token provided for device\n"); + return -EACCES; + } + } else if (vdev->vf_token) { + mutex_lock(&vdev->vf_token->lock); + if (vdev->vf_token->users) { + if (!vf_token) { + mutex_unlock(&vdev->vf_token->lock); + pci_info_ratelimited(vdev->pdev, + "VF token required to access device\n"); + return -EACCES; + } + + if (!uuid_equal(uuid, &vdev->vf_token->uuid)) { + mutex_unlock(&vdev->vf_token->lock); + pci_info_ratelimited(vdev->pdev, + "Incorrect VF token provided for device\n"); + return -EACCES; + } + } else if (vf_token) { + uuid_copy(&vdev->vf_token->uuid, uuid); + } + + mutex_unlock(&vdev->vf_token->lock); + } else if (vf_token) { + pci_info_ratelimited(vdev->pdev, + "VF token incorrectly provided, not a PF or VF\n"); + return -EINVAL; + } + + return 0; +} + +#define VF_TOKEN_ARG "vf_token=" + static int vfio_pci_match(void *device_data, char *buf) { struct vfio_pci_device *vdev = device_data; + bool vf_token = false; + uuid_t uuid; + int ret; + + if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev)))) + return 0; /* No match */ + + if (strlen(buf) > strlen(pci_name(vdev->pdev))) { + buf += strlen(pci_name(vdev->pdev)); + + if (*buf != ' ') + return 0; /* No match: non-whitespace after name */ + + 
while (*buf) { + if (*buf == ' ') { + buf++; + continue; + } + + if (!vf_token && !strncmp(buf, VF_TOKEN_ARG, + strlen(VF_TOKEN_ARG))) { + buf += strlen(VF_TOKEN_ARG); + + if (strlen(buf) < UUID_STRING_LEN) + return -EINVAL; - return !strcmp(pci_name(vdev->pdev), buf); + ret = uuid_parse(buf, &uuid); + if (ret) + return ret; + + vf_token = true; + buf += UUID_STRING_LEN; + } else { + /* Unknown/duplicate option */ + return -EINVAL; + } + } + } + + ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid); + if (ret) + return ret; + + return 1; /* Match */ } static const struct vfio_device_ops vfio_pci_ops = { @@ -1638,6 +1815,19 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; } + if (pdev->is_physfn) { + vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL); + if (!vdev->vf_token) { + vfio_pci_reflck_put(vdev->reflck); + vfio_del_group_dev(&pdev->dev); + vfio_iommu_group_put(group, &pdev->dev); + kfree(vdev); + return -ENOMEM; + } + mutex_init(&vdev->vf_token->lock); + uuid_gen(&vdev->vf_token->uuid); + } + if (vfio_pci_is_vga(pdev)) { vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode); vga_set_legacy_decoding(pdev, @@ -1671,6 +1861,12 @@ static void vfio_pci_remove(struct pci_dev *pdev) if (!vdev) return; + if (vdev->vf_token) { + WARN_ON(vdev->vf_token->users); + mutex_destroy(&vdev->vf_token->lock); + kfree(vdev->vf_token); + } + vfio_pci_reflck_put(vdev->reflck); vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index dc50a3aa508c..6ade9e208b59 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -12,6 +12,7 @@ #include #include #include +#include #ifndef VFIO_PCI_PRIVATE_H #define VFIO_PCI_PRIVATE_H @@ -89,6 +90,12 @@ struct vfio_pci_mmap_vma { struct list_head vma_next; }; +struct vfio_pci_vf_token { + struct mutex lock; + uuid_t uuid; + int users; +}; + struct vfio_pci_device { struct pci_dev *pdev; void __iomem *barmap[PCI_STD_NUM_BARS]; @@ -127,6 +134,7 @@ struct vfio_pci_device { struct list_head dummy_resources_list; struct mutex ioeventfds_lock; struct list_head ioeventfds_list; + struct vfio_pci_vf_token *vf_token; struct mutex vma_lock; struct list_head vma_list; struct rw_semaphore memory_lock; -- Gitee From 809f1ac3d151897bc4576b13c7f97e7ed1505449 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 24 Mar 2020 09:28:27 -0600 Subject: [PATCH 1533/2161] vfio: Introduce VFIO_DEVICE_FEATURE ioctl and first user commit 43eeeecc8ed5fa05652d68032a8bfb1308ee9baa upstream. The VFIO_DEVICE_FEATURE ioctl is meant to be a general purpose, device agnostic ioctl for setting, retrieving, and probing device features. This implementation provides a 16-bit field for specifying a feature index, where the data porition of the ioctl is determined by the semantics for the given feature. Additional flag bits indicate the direction and nature of the operation; SET indicates user data is provided into the device feature, GET indicates the device feature is written out into user data. The PROBE flag augments determining whether the given feature is supported, and if provided, whether the given operation on the feature is supported. The first user of this ioctl is for setting the vfio-pci VF token, where the user provides a shared secret key (UUID) on a SR-IOV PF device, which users must provide when opening associated VF devices. 
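As a rough, hypothetical userspace sketch (not part of the patch), setting the VF token on an already-opened vfio-pci PF device file descriptor with the ioctl and feature defined below could look like this; the helper name and error handling are illustrative only:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/*
 * Illustrative only: program the VF token of a vfio-pci PF.
 * @device_fd is the PF's VFIO device fd, @uuid the 16 raw bytes of the
 * shared-secret UUID.
 */
static int example_set_vf_token(int device_fd, const unsigned char uuid[16])
{
	char buf[sizeof(struct vfio_device_feature) + 16];
	struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;

	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_PCI_VF_TOKEN;
	memcpy(feature->data, uuid, 16);

	return ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);
}

VF users would then pass the same UUID as the vf_token= option when opening their devices through VFIO_GROUP_GET_DEVICE_FD.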
Reviewed-by: Cornelia Huck Reviewed-by: Kevin Tian Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 59 +++++++++++++++++++++++++++++++++++++ include/uapi/linux/vfio.h | 37 +++++++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 8194aa367736..b7c7faf585ec 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -1256,6 +1256,65 @@ static long vfio_pci_ioctl(void *device_data, return vfio_pci_ioeventfd(vdev, ioeventfd.offset, ioeventfd.data, count, ioeventfd.fd); + } else if (cmd == VFIO_DEVICE_FEATURE) { + struct vfio_device_feature feature; + uuid_t uuid; + + minsz = offsetofend(struct vfio_device_feature, flags); + + if (copy_from_user(&feature, (void __user *)arg, minsz)) + return -EFAULT; + + if (feature.argsz < minsz) + return -EINVAL; + + /* Check unknown flags */ + if (feature.flags & ~(VFIO_DEVICE_FEATURE_MASK | + VFIO_DEVICE_FEATURE_SET | + VFIO_DEVICE_FEATURE_GET | + VFIO_DEVICE_FEATURE_PROBE)) + return -EINVAL; + + /* GET & SET are mutually exclusive except with PROBE */ + if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) && + (feature.flags & VFIO_DEVICE_FEATURE_SET) && + (feature.flags & VFIO_DEVICE_FEATURE_GET)) + return -EINVAL; + + switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) { + case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: + if (!vdev->vf_token) + return -ENOTTY; + + /* + * We do not support GET of the VF Token UUID as this + * could expose the token of the previous device user. + */ + if (feature.flags & VFIO_DEVICE_FEATURE_GET) + return -EINVAL; + + if (feature.flags & VFIO_DEVICE_FEATURE_PROBE) + return 0; + + /* Don't SET unless told to do so */ + if (!(feature.flags & VFIO_DEVICE_FEATURE_SET)) + return -EINVAL; + + if (feature.argsz < minsz + sizeof(uuid)) + return -EINVAL; + + if (copy_from_user(&uuid, (void __user *)(arg + minsz), + sizeof(uuid))) + return -EFAULT; + + mutex_lock(&vdev->vf_token->lock); + uuid_copy(&vdev->vf_token->uuid, &uuid); + mutex_unlock(&vdev->vf_token->lock); + + return 0; + default: + return -ENOTTY; + } } return -ENOTTY; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index cabc93118f9c..459583f20f34 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -707,6 +707,43 @@ struct vfio_device_ioeventfd { #define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16) +/** + * VFIO_DEVICE_FEATURE - _IORW(VFIO_TYPE, VFIO_BASE + 17, + * struct vfio_device_feature) + * + * Get, set, or probe feature data of the device. The feature is selected + * using the FEATURE_MASK portion of the flags field. Support for a feature + * can be probed by setting both the FEATURE_MASK and PROBE bits. A probe + * may optionally include the GET and/or SET bits to determine read vs write + * access of the feature respectively. Probing a feature will return success + * if the feature is supported and all of the optionally indicated GET/SET + * methods are supported. The format of the data portion of the structure is + * specific to the given feature. The data portion is not required for + * probing. GET and SET are mutually exclusive, except for use with PROBE. + * + * Return 0 on success, -errno on failure. 
+ */ +struct vfio_device_feature { + __u32 argsz; + __u32 flags; +#define VFIO_DEVICE_FEATURE_MASK (0xffff) /* 16-bit feature index */ +#define VFIO_DEVICE_FEATURE_GET (1 << 16) /* Get feature into data[] */ +#define VFIO_DEVICE_FEATURE_SET (1 << 17) /* Set feature from data[] */ +#define VFIO_DEVICE_FEATURE_PROBE (1 << 18) /* Probe feature support */ + __u8 data[]; +}; + +#define VFIO_DEVICE_FEATURE _IO(VFIO_TYPE, VFIO_BASE + 17) + +/* + * Provide support for setting a PCI VF Token, which is used as a shared + * secret between PF and VF drivers. This feature may only be set on a + * PCI SR-IOV PF when SR-IOV is enabled on the PF and there are no existing + * open VFs. Data provided when setting this feature is a 16-byte array + * (__u8 b[16]), representing a UUID. + */ +#define VFIO_DEVICE_FEATURE_PCI_VF_TOKEN (0) + /* -------- API for Type1 VFIO IOMMU -------- */ /** -- Gitee From da7e7eba7a45e4f94bf59de4a59e3ac3d8ad9230 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 24 Mar 2020 09:28:28 -0600 Subject: [PATCH 1534/2161] vfio/pci: Add sriov_configure support commit 137e5531351db258eff58ea28f4dc8fdf7ca2990 upstream. With the VF Token interface we can now expect that a vfio userspace driver must be in collaboration with the PF driver, an unwitting userspace driver will not be able to get past the GET_DEVICE_FD step in accessing the device. We can now move on to actually allowing SR-IOV to be enabled by vfio-pci on the PF. Support for this is not enabled by default in this commit, but it does provide a module option for this to be enabled (enable_sriov=1). Enabling VFs is rather straightforward, except we don't want to risk that a VF might get autoprobed and bound to other drivers, so a bus notifier is used to "capture" VFs to vfio-pci using the driver_override support. We assume any later action to bind the device to other drivers is condoned by the system admin and allow it with a log warning. vfio-pci will disable SR-IOV on a PF before releasing the device, allowing a VF driver to be assured other drivers cannot take over the PF and that any other userspace driver must know the shared VF token. This support also does not provide a mechanism for the PF userspace driver itself to manipulate SR-IOV through the vfio API. With this patch SR-IOV can only be enabled via the host sysfs interface and the PF driver user cannot create or remove VFs. Reviewed-by: Cornelia Huck Reviewed-by: Kevin Tian Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 106 +++++++++++++++++++++++++--- drivers/vfio/pci/vfio_pci_private.h | 2 + 2 files changed, 97 insertions(+), 11 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index b7c7faf585ec..9acfe0ea94fb 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -55,6 +55,12 @@ module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(disable_idle_d3, "Disable using the PCI D3 low power state for idle, unused devices"); +static bool enable_sriov; +#ifdef CONFIG_PCI_IOV +module_param(enable_sriov, bool, 0644); +MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV configuration. 
Enabling SR-IOV on a PF typically requires support of the userspace PF driver, enabling VFs without such support may result in non-functional VFs or PF."); +#endif + static inline bool vfio_vga_disabled(void) { #ifdef CONFIG_VFIO_PCI_VGA @@ -1815,6 +1821,35 @@ static const struct vfio_device_ops vfio_pci_ops = { static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev); static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck); +static struct pci_driver vfio_pci_driver; + +static int vfio_pci_bus_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct vfio_pci_device *vdev = container_of(nb, + struct vfio_pci_device, nb); + struct device *dev = data; + struct pci_dev *pdev = to_pci_dev(dev); + struct pci_dev *physfn = pci_physfn(pdev); + + if (action == BUS_NOTIFY_ADD_DEVICE && + pdev->is_virtfn && physfn == vdev->pdev) { + pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n", + pci_name(pdev)); + pdev->driver_override = kasprintf(GFP_KERNEL, "%s", + vfio_pci_ops.name); + } else if (action == BUS_NOTIFY_BOUND_DRIVER && + pdev->is_virtfn && physfn == vdev->pdev) { + struct pci_driver *drv = pci_dev_driver(pdev); + + if (drv && drv != &vfio_pci_driver) + pci_warn(vdev->pdev, + "VF %s bound to driver %s while PF bound to vfio-pci\n", + pci_name(pdev), drv->name); + } + + return 0; +} static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { @@ -1826,12 +1861,12 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return -EINVAL; /* - * Prevent binding to PFs with VFs enabled, this too easily allows - * userspace instance with VFs and PFs from the same device, which - * cannot work. Disabling SR-IOV here would initiate removing the - * VFs, which would unbind the driver, which is prone to blocking - * if that VF is also in use by vfio-pci. Just reject these PFs - * and let the user sort it out. + * Prevent binding to PFs with VFs enabled, the VFs might be in use + * by the host or other users. We cannot capture the VFs if they + * already exist, nor can we track VF users. Disabling SR-IOV here + * would initiate removing the VFs, which would unbind the driver, + * which is prone to blocking if that VF is also in use by vfio-pci. + * Just reject these PFs and let the user sort it out. 
*/ if (pci_num_vf(pdev)) { pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n"); @@ -1883,6 +1918,18 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) kfree(vdev); return -ENOMEM; } + + vdev->nb.notifier_call = vfio_pci_bus_notifier; + ret = bus_register_notifier(&pci_bus_type, &vdev->nb); + if (ret) { + kfree(vdev->vf_token); + vfio_pci_reflck_put(vdev->reflck); + vfio_del_group_dev(&pdev->dev); + vfio_iommu_group_put(group, &pdev->dev); + kfree(vdev); + return ret; + } + mutex_init(&vdev->vf_token->lock); uuid_gen(&vdev->vf_token->uuid); } @@ -1916,6 +1963,8 @@ static void vfio_pci_remove(struct pci_dev *pdev) { struct vfio_pci_device *vdev; + pci_disable_sriov(pdev); + vdev = vfio_del_group_dev(&pdev->dev); if (!vdev) return; @@ -1926,6 +1975,9 @@ static void vfio_pci_remove(struct pci_dev *pdev) kfree(vdev->vf_token); } + if (vdev->nb.notifier_call) + bus_unregister_notifier(&pci_bus_type, &vdev->nb); + vfio_pci_reflck_put(vdev->reflck); vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); @@ -1974,16 +2026,48 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, return PCI_ERS_RESULT_CAN_RECOVER; } +static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) +{ + struct vfio_pci_device *vdev; + struct vfio_device *device; + int ret = 0; + + might_sleep(); + + if (!enable_sriov) + return -ENOENT; + + device = vfio_device_get_from_dev(&pdev->dev); + if (!device) + return -ENODEV; + + vdev = vfio_device_data(device); + if (!vdev) { + vfio_device_put(device); + return -ENODEV; + } + + if (nr_virtfn == 0) + pci_disable_sriov(pdev); + else + ret = pci_enable_sriov(pdev, nr_virtfn); + + vfio_device_put(device); + + return ret < 0 ? ret : nr_virtfn; +} + static const struct pci_error_handlers vfio_err_handlers = { .error_detected = vfio_pci_aer_err_detected, }; static struct pci_driver vfio_pci_driver = { - .name = "vfio-pci", - .id_table = NULL, /* only dynamic ids */ - .probe = vfio_pci_probe, - .remove = vfio_pci_remove, - .err_handler = &vfio_err_handlers, + .name = "vfio-pci", + .id_table = NULL, /* only dynamic ids */ + .probe = vfio_pci_probe, + .remove = vfio_pci_remove, + .sriov_configure = vfio_pci_sriov_configure, + .err_handler = &vfio_err_handlers, }; static DEFINE_MUTEX(reflck_lock); diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 6ade9e208b59..6a70c0538b53 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -13,6 +13,7 @@ #include #include #include +#include #ifndef VFIO_PCI_PRIVATE_H #define VFIO_PCI_PRIVATE_H @@ -135,6 +136,7 @@ struct vfio_pci_device { struct mutex ioeventfds_lock; struct list_head ioeventfds_list; struct vfio_pci_vf_token *vf_token; + struct notifier_block nb; struct mutex vma_lock; struct list_head vma_list; struct rw_semaphore memory_lock; -- Gitee From bd8d8679d3d16ba036b9ec033685e8ae20fe79c3 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 24 Mar 2020 09:28:29 -0600 Subject: [PATCH 1535/2161] vfio/pci: Remove dev_fmt definition commit 959e1b75cc87e5215622d8344d9d6ede8666480e upstream. It currently results in messages like: "vfio-pci 0000:03:00.0: vfio_pci: ..." Which is quite a bit redundant. 
Reviewed-by: Cornelia Huck Reviewed-by: Kevin Tian Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 9acfe0ea94fb..8b8d4761f9ed 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -9,7 +9,6 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#define dev_fmt pr_fmt #include #include -- Gitee From 5127d7be671194cc9ea0a53eb8e9bf8e4f83b418 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 24 Mar 2020 09:28:29 -0600 Subject: [PATCH 1536/2161] vfio/pci: Cleanup .probe() exit paths commit b66574a3fb32e2bac5bf3589dae6d335b7a70689 upstream. The cleanup is getting a tad long. Reviewed-by: Cornelia Huck Reviewed-by: Kevin Tian Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 54 +++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 8b8d4761f9ed..2200190c9c3a 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -1878,8 +1878,8 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); if (!vdev) { - vfio_iommu_group_put(group, &pdev->dev); - return -ENOMEM; + ret = -ENOMEM; + goto out_group_put; } vdev->pdev = pdev; @@ -1894,43 +1894,27 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) init_rwsem(&vdev->memory_lock); ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); - if (ret) { - vfio_iommu_group_put(group, &pdev->dev); - kfree(vdev); - return ret; - } + if (ret) + goto out_free; ret = vfio_pci_reflck_attach(vdev); - if (ret) { - vfio_del_group_dev(&pdev->dev); - vfio_iommu_group_put(group, &pdev->dev); - kfree(vdev); - return ret; - } + if (ret) + goto out_del_group_dev; if (pdev->is_physfn) { vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL); if (!vdev->vf_token) { - vfio_pci_reflck_put(vdev->reflck); - vfio_del_group_dev(&pdev->dev); - vfio_iommu_group_put(group, &pdev->dev); - kfree(vdev); - return -ENOMEM; - } - - vdev->nb.notifier_call = vfio_pci_bus_notifier; - ret = bus_register_notifier(&pci_bus_type, &vdev->nb); - if (ret) { - kfree(vdev->vf_token); - vfio_pci_reflck_put(vdev->reflck); - vfio_del_group_dev(&pdev->dev); - vfio_iommu_group_put(group, &pdev->dev); - kfree(vdev); - return ret; + ret = -ENOMEM; + goto out_reflck; } mutex_init(&vdev->vf_token->lock); uuid_gen(&vdev->vf_token->uuid); + + vdev->nb.notifier_call = vfio_pci_bus_notifier; + ret = bus_register_notifier(&pci_bus_type, &vdev->nb); + if (ret) + goto out_vf_token; } if (vfio_pci_is_vga(pdev)) { @@ -1956,6 +1940,18 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) } return ret; + +out_vf_token: + kfree(vdev->vf_token); +out_reflck: + vfio_pci_reflck_put(vdev->reflck); +out_del_group_dev: + vfio_del_group_dev(&pdev->dev); +out_free: + kfree(vdev); +out_group_put: + vfio_iommu_group_put(group, &pdev->dev); + return ret; } static void vfio_pci_remove(struct pci_dev *pdev) -- Gitee From 6bfc94af3596c8183157cb4a8ae80c1f4200e8e3 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Tue, 31 Mar 2020 15:12:46 +1100 Subject: [PATCH 1537/2161] vfio-pci/nvlink2: Allow fallback to ibm,mmio-atsd[0] commit 00bc5095547ef9878324167fe0c3e56f7b388159 upstream. 
Older versions of skiboot only provide a single value in the device tree property "ibm,mmio-atsd", even when multiple Address Translation Shoot Down (ATSD) registers are present. This prevents NVLink2 devices (other than the first) from being used with vfio-pci because vfio-pci expects to be able to assign a dedicated ATSD register to each NVLink2 device. However, ATSD registers can be shared among devices. This change allows vfio-pci to fall back to sharing the register at index 0 if necessary. Fixes: 7f92891778df ("vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] subdriver") Signed-off-by: Sam Bobroff Reviewed-by: Alexey Kardashevskiy Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_nvlink2.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_nvlink2.c b/drivers/vfio/pci/vfio_pci_nvlink2.c index dbaffa7fcb2f..cf2e5b7512c2 100644 --- a/drivers/vfio/pci/vfio_pci_nvlink2.c +++ b/drivers/vfio/pci/vfio_pci_nvlink2.c @@ -425,8 +425,14 @@ int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev) if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index, &mmio_atsd)) { - dev_warn(&vdev->pdev->dev, "No available ATSD found\n"); - mmio_atsd = 0; + if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", 0, + &mmio_atsd)) { + dev_warn(&vdev->pdev->dev, "No available ATSD found\n"); + mmio_atsd = 0; + } else { + dev_warn(&vdev->pdev->dev, + "Using fallback ibm,mmio-atsd[0] for ATSD.\n"); + } } if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) { -- Gitee From 3c8aaf02577c8faa4e42c9f3ca1bade220da3145 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 1 Apr 2020 11:27:24 +0100 Subject: [PATCH 1538/2161] vfio: Ignore -ENODEV when getting MSI cookie commit f44efca0493ddc1e8731047c234ec8e475943077 upstream. When we try to get an MSI cookie for a VFIO device, that can fail if CONFIG_IOMMU_DMA is not set. In this case iommu_get_msi_cookie() returns -ENODEV, and that should not be fatal. Ignore that case and proceed with the initialisation. This fixes VFIO with a platform device on the Calxeda Midway (no MSIs). Fixes: f6810c15cf973f ("iommu/arm-smmu: Clean up early-probing workarounds") Signed-off-by: Andre Przywara Acked-by: Robin Murphy Reviewed-by: Eric Auger Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 6808f2711b21..772300ac5d13 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -1890,7 +1890,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (resv_msi) { ret = iommu_get_msi_cookie(domain->domain, resv_msi_base); - if (ret) + if (ret && ret != -ENODEV) goto out_detach; } -- Gitee From 4129157f6d1f447a86aa84fa722ee5062a4c3db1 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Wed, 8 Apr 2020 03:11:21 -0400 Subject: [PATCH 1539/2161] vfio: checking of validity of user vaddr in vfio_dma_rw commit 205323b8ceac57b0ac9d7dbc4d6fcdb18aa802ec upstream. instead of calling __copy_to/from_user(), use copy_to_from_user() to ensure vaddr range is a valid user address range before accessing them. 
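For background (not part of the patch), the distinction this fix relies on is that the double-underscore variants skip the access_ok() range validation and are therefore only safe on addresses the caller has already checked; copy_to_user()/copy_from_user() perform that check themselves. A minimal kernel-side sketch of the checked pattern, with a hypothetical helper name:

#include <linux/uaccess.h>

/*
 * Illustrative only: copy @count bytes from the user address @vaddr into
 * @data.  copy_from_user() validates the user range before copying, whereas
 * __copy_from_user() assumes the caller already did.  The return value is
 * the number of bytes that could not be copied (0 on success).
 */
static unsigned long example_checked_copy(void *data, unsigned long vaddr,
					  unsigned long count)
{
	return copy_from_user(data, (const void __user *)vaddr, count);
}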
Fixes: 8d46c0cca5f4 ("vfio: introduce vfio_dma_rw to read/write a range of IOVAs") Signed-off-by: Yan Zhao Reported-by: Kees Cook Reviewed-by: Kees Cook Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 772300ac5d13..fb32e3e963c3 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2453,10 +2453,10 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, vaddr = dma->vaddr + offset; if (write) - *copied = __copy_to_user((void __user *)vaddr, data, + *copied = copy_to_user((void __user *)vaddr, data, count) ? 0 : count; else - *copied = __copy_from_user(data, (void __user *)vaddr, + *copied = copy_from_user(data, (void __user *)vaddr, count) ? 0 : count; if (kthread) unuse_mm(mm); -- Gitee From 6d88d4f572dd2cd11c7c6aea7bef99cb4a798f5d Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Fri, 29 May 2020 02:00:47 +0530 Subject: [PATCH 1540/2161] vfio: UAPI for migration interface for device state commit a8a24f3f6e38103b77cf399c38eb54e1219d00d6 upstream. - Defined MIGRATION region type and sub-type. - Defined vfio_device_migration_info structure which will be placed at the 0th offset of migration region to get/set VFIO device related information. Defined members of structure and usage on read/write access. - Defined device states and state transition details. - Defined sequence to be followed while saving and resuming VFIO device. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Cornelia Huck Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/vfio.h | 228 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 228 insertions(+) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 459583f20f34..d8b015b79434 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -305,6 +305,7 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff) #define VFIO_REGION_TYPE_GFX (1) #define VFIO_REGION_TYPE_CCW (2) +#define VFIO_REGION_TYPE_MIGRATION (3) /* sub-types for VFIO_REGION_TYPE_PCI_* */ @@ -379,6 +380,233 @@ struct vfio_region_gfx_edid { /* sub-types for VFIO_REGION_TYPE_CCW */ #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD (1) +/* sub-types for VFIO_REGION_TYPE_MIGRATION */ +#define VFIO_REGION_SUBTYPE_MIGRATION (1) + +/* + * The structure vfio_device_migration_info is placed at the 0th offset of + * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related + * migration information. Field accesses from this structure are only supported + * at their native width and alignment. Otherwise, the result is undefined and + * vendor drivers should return an error. + * + * device_state: (read/write) + * - The user application writes to this field to inform the vendor driver + * about the device state to be transitioned to. + * - The vendor driver should take the necessary actions to change the + * device state. After successful transition to a given state, the + * vendor driver should return success on write(device_state, state) + * system call. If the device state transition fails, the vendor driver + * should return an appropriate -errno for the fault condition. 
+ * - On the user application side, if the device state transition fails, + * that is, if write(device_state, state) returns an error, read + * device_state again to determine the current state of the device from + * the vendor driver. + * - The vendor driver should return previous state of the device unless + * the vendor driver has encountered an internal error, in which case + * the vendor driver may report the device_state VFIO_DEVICE_STATE_ERROR. + * - The user application must use the device reset ioctl to recover the + * device from VFIO_DEVICE_STATE_ERROR state. If the device is + * indicated to be in a valid device state by reading device_state, the + * user application may attempt to transition the device to any valid + * state reachable from the current state or terminate itself. + * + * device_state consists of 3 bits: + * - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear, + * it indicates the _STOP state. When the device state is changed to + * _STOP, driver should stop the device before write() returns. + * - If bit 1 is set, it indicates the _SAVING state, which means that the + * driver should start gathering device state information that will be + * provided to the VFIO user application to save the device's state. + * - If bit 2 is set, it indicates the _RESUMING state, which means that + * the driver should prepare to resume the device. Data provided through + * the migration region should be used to resume the device. + * Bits 3 - 31 are reserved for future use. To preserve them, the user + * application should perform a read-modify-write operation on this + * field when modifying the specified bits. + * + * +------- _RESUMING + * |+------ _SAVING + * ||+----- _RUNNING + * ||| + * 000b => Device Stopped, not saving or resuming + * 001b => Device running, which is the default state + * 010b => Stop the device & save the device state, stop-and-copy state + * 011b => Device running and save the device state, pre-copy state + * 100b => Device stopped and the device state is resuming + * 101b => Invalid state + * 110b => Error state + * 111b => Invalid state + * + * State transitions: + * + * _RESUMING _RUNNING Pre-copy Stop-and-copy _STOP + * (100b) (001b) (011b) (010b) (000b) + * 0. Running or default state + * | + * + * 1. Normal Shutdown (optional) + * |------------------------------------->| + * + * 2. Save the state or suspend + * |------------------------->|---------->| + * + * 3. Save the state during live migration + * |----------->|------------>|---------->| + * + * 4. Resuming + * |<---------| + * + * 5. Resumed + * |--------->| + * + * 0. Default state of VFIO device is _RUNNNG when the user application starts. + * 1. During normal shutdown of the user application, the user application may + * optionally change the VFIO device state from _RUNNING to _STOP. This + * transition is optional. The vendor driver must support this transition but + * must not require it. + * 2. When the user application saves state or suspends the application, the + * device state transitions from _RUNNING to stop-and-copy and then to _STOP. + * On state transition from _RUNNING to stop-and-copy, driver must stop the + * device, save the device state and send it to the application through the + * migration region. The sequence to be followed for such transition is given + * below. + * 3. In live migration of user application, the state transitions from _RUNNING + * to pre-copy, to stop-and-copy, and to _STOP. 
+ * On state transition from _RUNNING to pre-copy, the driver should start + * gathering the device state while the application is still running and send + * the device state data to application through the migration region. + * On state transition from pre-copy to stop-and-copy, the driver must stop + * the device, save the device state and send it to the user application + * through the migration region. + * Vendor drivers must support the pre-copy state even for implementations + * where no data is provided to the user before the stop-and-copy state. The + * user must not be required to consume all migration data before the device + * transitions to a new state, including the stop-and-copy state. + * The sequence to be followed for above two transitions is given below. + * 4. To start the resuming phase, the device state should be transitioned from + * the _RUNNING to the _RESUMING state. + * In the _RESUMING state, the driver should use the device state data + * received through the migration region to resume the device. + * 5. After providing saved device data to the driver, the application should + * change the state from _RESUMING to _RUNNING. + * + * reserved: + * Reads on this field return zero and writes are ignored. + * + * pending_bytes: (read only) + * The number of pending bytes still to be migrated from the vendor driver. + * + * data_offset: (read only) + * The user application should read data_offset field from the migration + * region. The user application should read the device data from this + * offset within the migration region during the _SAVING state or write + * the device data during the _RESUMING state. See below for details of + * sequence to be followed. + * + * data_size: (read/write) + * The user application should read data_size to get the size in bytes of + * the data copied in the migration region during the _SAVING state and + * write the size in bytes of the data copied in the migration region + * during the _RESUMING state. + * + * The format of the migration region is as follows: + * ------------------------------------------------------------------ + * |vfio_device_migration_info| data section | + * | | /////////////////////////////// | + * ------------------------------------------------------------------ + * ^ ^ + * offset 0-trapped part data_offset + * + * The structure vfio_device_migration_info is always followed by the data + * section in the region, so data_offset will always be nonzero. The offset + * from where the data is copied is decided by the kernel driver. The data + * section can be trapped, mmapped, or partitioned, depending on how the kernel + * driver defines the data section. The data section partition can be defined + * as mapped by the sparse mmap capability. If mmapped, data_offset must be + * page aligned, whereas initial section which contains the + * vfio_device_migration_info structure, might not end at the offset, which is + * page aligned. The user is not required to access through mmap regardless + * of the capabilities of the region mmap. + * The vendor driver should determine whether and how to partition the data + * section. The vendor driver should return data_offset accordingly. + * + * The sequence to be followed while in pre-copy state and stop-and-copy state + * is as follows: + * a. Read pending_bytes, indicating the start of a new iteration to get device + * data. Repeated read on pending_bytes at this stage should have no side + * effects. 
+ * If pending_bytes == 0, the user application should not iterate to get data + * for that device. + * If pending_bytes > 0, perform the following steps. + * b. Read data_offset, indicating that the vendor driver should make data + * available through the data section. The vendor driver should return this + * read operation only after data is available from (region + data_offset) + * to (region + data_offset + data_size). + * c. Read data_size, which is the amount of data in bytes available through + * the migration region. + * Read on data_offset and data_size should return the offset and size of + * the current buffer if the user application reads data_offset and + * data_size more than once here. + * d. Read data_size bytes of data from (region + data_offset) from the + * migration region. + * e. Process the data. + * f. Read pending_bytes, which indicates that the data from the previous + * iteration has been read. If pending_bytes > 0, go to step b. + * + * The user application can transition from the _SAVING|_RUNNING + * (pre-copy state) to the _SAVING (stop-and-copy) state regardless of the + * number of pending bytes. The user application should iterate in _SAVING + * (stop-and-copy) until pending_bytes is 0. + * + * The sequence to be followed while _RESUMING device state is as follows: + * While data for this device is available, repeat the following steps: + * a. Read data_offset from where the user application should write data. + * b. Write migration data starting at the migration region + data_offset for + * the length determined by data_size from the migration source. + * c. Write data_size, which indicates to the vendor driver that data is + * written in the migration region. Vendor driver must return this write + * operations on consuming data. Vendor driver should apply the + * user-provided migration region data to the device resume state. + * + * If an error occurs during the above sequences, the vendor driver can return + * an error code for next read() or write() operation, which will terminate the + * loop. The user application should then take the next necessary action, for + * example, failing migration or terminating the user application. + * + * For the user application, data is opaque. The user application should write + * data in the same order as the data is received and the data should be of + * same transaction size at the source. + */ + +struct vfio_device_migration_info { + __u32 device_state; /* VFIO device state */ +#define VFIO_DEVICE_STATE_STOP (0) +#define VFIO_DEVICE_STATE_RUNNING (1 << 0) +#define VFIO_DEVICE_STATE_SAVING (1 << 1) +#define VFIO_DEVICE_STATE_RESUMING (1 << 2) +#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \ + VFIO_DEVICE_STATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING) + +#define VFIO_DEVICE_STATE_VALID(state) \ + (state & VFIO_DEVICE_STATE_RESUMING ? 
\ + (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1) + +#define VFIO_DEVICE_STATE_IS_ERROR(state) \ + ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING)) + +#define VFIO_DEVICE_STATE_SET_ERROR(state) \ + ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_SATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING) + + __u32 reserved; + __u64 pending_bytes; + __u64 data_offset; + __u64 data_size; +}; + /* * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped * which allows direct access to non-MSIX registers which happened to be within -- Gitee From 8f8bfcf2df415192856ea7476dd8fa2314c64df3 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Fri, 29 May 2020 02:00:48 +0530 Subject: [PATCH 1541/2161] vfio iommu: Remove atomicity of ref_count of pinned pages commit 6581708586c812460ea4798216d9dd2897ad105c upstream. vfio_pfn.ref_count is always updated while holding iommu->lock, using atomic variable is overkill. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Eric Auger Reviewed-by: Cornelia Huck Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index fb32e3e963c3..0ee88ee3adb8 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -113,7 +113,7 @@ struct vfio_pfn { struct rb_node node; dma_addr_t iova; /* Device address */ unsigned long pfn; /* Host pfn */ - atomic_t ref_count; + unsigned int ref_count; }; struct vfio_regions { @@ -234,7 +234,7 @@ static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova, vpfn->iova = iova; vpfn->pfn = pfn; - atomic_set(&vpfn->ref_count, 1); + vpfn->ref_count = 1; vfio_link_pfn(dma, vpfn); return 0; } @@ -252,7 +252,7 @@ static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova); if (vpfn) - atomic_inc(&vpfn->ref_count); + vpfn->ref_count++; return vpfn; } @@ -260,7 +260,8 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn) { int ret = 0; - if (atomic_dec_and_test(&vpfn->ref_count)) { + vpfn->ref_count--; + if (!vpfn->ref_count) { ret = put_pfn(vpfn->pfn, dma->prot); vfio_remove_from_pfn_list(dma, vpfn); } -- Gitee From 27240b56724760ab87fd7c6b7d590b82dfb97ab1 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Fri, 29 May 2020 02:00:49 +0530 Subject: [PATCH 1542/2161] vfio iommu: Cache pgsize_bitmap in struct vfio_iommu commit cade075f265b25c3d6c13486e1bf0fbed9935a4a upstream. Calculate and cache pgsize_bitmap when iommu->domain_list is updated and iommu->external_domain is set for mdev device. Add iommu->lock protection when cached pgsize_bitmap is accessed. 
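A minimal sketch of the consumer pattern this caching enables (editorial illustration, not part of the patch; the map/unmap hunks below follow the same shape, and the helper name here is hypothetical):

/*
 * Illustrative helper only: with the bitmap cached in struct vfio_iommu,
 * callers that already hold iommu->lock can derive the minimum supported
 * page size without re-walking domain_list.
 */
static size_t vfio_iommu_min_pgsize(struct vfio_iommu *iommu)
{
	/* Caller must hold iommu->lock so domain attach/detach cannot race. */
	return (size_t)1 << __ffs(iommu->pgsize_bitmap);
}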
Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Cornelia Huck Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 89 ++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 40 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 0ee88ee3adb8..a8f8d5a0fc09 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -70,6 +70,7 @@ struct vfio_iommu { struct rb_root dma_list; struct blocking_notifier_head notifier; unsigned int dma_avail; + uint64_t pgsize_bitmap; bool v2; bool nesting; }; @@ -848,15 +849,14 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma) iommu->dma_avail++; } -static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu) +static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu) { struct vfio_domain *domain; - unsigned long bitmap = ULONG_MAX; - mutex_lock(&iommu->lock); + iommu->pgsize_bitmap = ULONG_MAX; + list_for_each_entry(domain, &iommu->domain_list, next) - bitmap &= domain->domain->pgsize_bitmap; - mutex_unlock(&iommu->lock); + iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap; /* * In case the IOMMU supports page sizes smaller than PAGE_SIZE @@ -866,12 +866,10 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu) * granularity while iommu driver can use the sub-PAGE_SIZE size * to map the buffer. */ - if (bitmap & ~PAGE_MASK) { - bitmap &= PAGE_MASK; - bitmap |= PAGE_SIZE; + if (iommu->pgsize_bitmap & ~PAGE_MASK) { + iommu->pgsize_bitmap &= PAGE_MASK; + iommu->pgsize_bitmap |= PAGE_SIZE; } - - return bitmap; } static int vfio_dma_do_unmap(struct vfio_iommu *iommu, @@ -882,19 +880,28 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, size_t unmapped = 0; int ret = 0, retries = 0; - mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1; + mutex_lock(&iommu->lock); + + mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1; + + if (unmap->iova & mask) { + ret = -EINVAL; + goto unlock; + } + + if (!unmap->size || unmap->size & mask) { + ret = -EINVAL; + goto unlock; + } - if (unmap->iova & mask) - return -EINVAL; - if (!unmap->size || unmap->size & mask) - return -EINVAL; if (unmap->iova + unmap->size - 1 < unmap->iova || - unmap->size > SIZE_MAX) - return -EINVAL; + unmap->size > SIZE_MAX) { + ret = -EINVAL; + goto unlock; + } WARN_ON(mask & PAGE_MASK); again: - mutex_lock(&iommu->lock); /* * vfio-iommu-type1 (v1) - User mappings were coalesced together to @@ -973,6 +980,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, blocking_notifier_call_chain(&iommu->notifier, VFIO_IOMMU_NOTIFY_DMA_UNMAP, &nb_unmap); + mutex_lock(&iommu->lock); goto again; } unmapped += dma->size; @@ -1088,24 +1096,28 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, if (map->size != size || map->vaddr != vaddr || map->iova != iova) return -EINVAL; - mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1; - - WARN_ON(mask & PAGE_MASK); - /* READ/WRITE from device perspective */ if (map->flags & VFIO_DMA_MAP_FLAG_WRITE) prot |= IOMMU_WRITE; if (map->flags & VFIO_DMA_MAP_FLAG_READ) prot |= IOMMU_READ; - if (!prot || !size || (size | iova | vaddr) & mask) - return -EINVAL; + mutex_lock(&iommu->lock); - /* Don't allow IOVA or virtual address wrap */ - if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) - return -EINVAL; + mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1; - mutex_lock(&iommu->lock); + 
WARN_ON(mask & PAGE_MASK); + + if (!prot || !size || (size | iova | vaddr) & mask) { + ret = -EINVAL; + goto out_unlock; + } + + /* Don't allow IOVA or virtual address wrap */ + if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) { + ret = -EINVAL; + goto out_unlock; + } if (vfio_find_dma(iommu, iova, size)) { ret = -EEXIST; @@ -1772,6 +1784,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (!iommu->external_domain) { INIT_LIST_HEAD(&domain->group_list); iommu->external_domain = domain; + vfio_update_pgsize_bitmap(iommu); } else { kfree(domain); } @@ -1896,6 +1909,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, } list_add(&domain->next, &iommu->domain_list); + vfio_update_pgsize_bitmap(iommu); done: /* Delete the old one and insert new iova list */ vfio_iommu_iova_insert_copy(iommu, &iova_copy); @@ -2091,6 +2105,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, list_del(&domain->next); kfree(domain); vfio_iommu_aper_expand(iommu, &iova_copy); + vfio_update_pgsize_bitmap(iommu); } break; } @@ -2222,8 +2237,6 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu, size_t size; int iovas = 0, i = 0, ret; - mutex_lock(&iommu->lock); - list_for_each_entry(iova, &iommu->iova_list, list) iovas++; @@ -2232,17 +2245,14 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu, * Return 0 as a container with a single mdev device * will have an empty list */ - ret = 0; - goto out_unlock; + return 0; } size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges)); cap_iovas = kzalloc(size, GFP_KERNEL); - if (!cap_iovas) { - ret = -ENOMEM; - goto out_unlock; - } + if (!cap_iovas) + return -ENOMEM; cap_iovas->nr_iovas = iovas; @@ -2255,8 +2265,6 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu, ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size); kfree(cap_iovas); -out_unlock: - mutex_unlock(&iommu->lock); return ret; } @@ -2319,12 +2327,13 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, info.cap_offset = 0; /* output, no-recopy necessary */ } + mutex_lock(&iommu->lock); info.flags = VFIO_IOMMU_INFO_PGSIZES; - info.iova_pgsizes = vfio_pgsize_bitmap(iommu); + info.iova_pgsizes = iommu->pgsize_bitmap; ret = vfio_iommu_iova_build_caps(iommu, &caps); - + mutex_unlock(&iommu->lock); if (!ret) ret = vfio_iommu_dma_avail_build_caps(iommu, &caps); -- Gitee From 6460433310b009e1063379134c867f859c80d83e Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Fri, 29 May 2020 02:00:50 +0530 Subject: [PATCH 1543/2161] vfio iommu: Add ioctl definition for dirty pages tracking commit b704fd14a06f678f080e44db28061f1bcb1f595c upstream. IOMMU container maintains a list of all pages pinned by vfio_pin_pages API. All pages pinned by vendor driver through this API should be considered as dirty during migration. When container consists of IOMMU capable device and all pages are pinned and mapped, then all pages are marked dirty. Added support to start/stop dirtied pages tracking and to get bitmap of all dirtied pages for requested IO virtual address range. 
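For illustration, a minimal user-space sketch of the flow these flags enable (an editorial example, not part of this patch; it assumes an open type1 v2 container fd, that pgsize is the smallest IOMMU page size, that iova/size are pgsize-aligned and fully cover existing mappings, and that tracking has been started before GET_BITMAP is used):

/* Illustrative user-space usage of VFIO_IOMMU_DIRTY_PAGES (sketch only). */
#include <linux/vfio.h>
#include <sys/ioctl.h>
#include <stdlib.h>

static int dirty_tracking_ctl(int container, __u32 flag)
{
	struct vfio_iommu_type1_dirty_bitmap dirty = {
		.argsz = sizeof(dirty),
		.flags = flag,	/* _START or _STOP */
	};

	return ioctl(container, VFIO_IOMMU_DIRTY_PAGES, &dirty);
}

static int get_dirty_bitmap(int container, __u64 iova, __u64 size, __u64 pgsize)
{
	size_t argsz = sizeof(struct vfio_iommu_type1_dirty_bitmap) +
		       sizeof(struct vfio_iommu_type1_dirty_bitmap_get);
	struct vfio_iommu_type1_dirty_bitmap *dirty = calloc(1, argsz);
	struct vfio_iommu_type1_dirty_bitmap_get *range;
	__u64 npages = size / pgsize;
	int ret;

	if (!dirty)
		return -1;

	range = (void *)dirty->data;
	dirty->argsz = argsz;
	dirty->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
	range->iova = iova;
	range->size = size;
	range->bitmap.pgsize = pgsize;
	/* One bit per page, zero-filled, rounded up to a multiple of 64 bits. */
	range->bitmap.size = ((npages + 63) / 64) * 8;
	range->bitmap.data = calloc(1, range->bitmap.size);
	if (!range->bitmap.data) {
		free(dirty);
		return -1;
	}

	ret = ioctl(container, VFIO_IOMMU_DIRTY_PAGES, dirty);
	/*
	 * On success, bit N set means the page at iova + N * pgsize is dirty.
	 * A real caller would consume the bitmap before freeing it.
	 */

	free(range->bitmap.data);
	free(dirty);
	return ret;
}

A migration manager would issue dirty_tracking_ctl() with VFIO_IOMMU_DIRTY_PAGES_FLAG_START when migration begins, call get_dirty_bitmap() on each pre-copy iteration, and issue the _STOP flag after the final pass.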
Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Cornelia Huck Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/vfio.h | 57 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index d8b015b79434..c628aa719886 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1048,6 +1048,12 @@ struct vfio_iommu_type1_dma_map { #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) +struct vfio_bitmap { + __u64 pgsize; /* page size for bitmap in bytes */ + __u64 size; /* in bytes */ + __u64 __user *data; /* one bit per page */ +}; + /** * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14, * struct vfio_dma_unmap) @@ -1074,6 +1080,57 @@ struct vfio_iommu_type1_dma_unmap { #define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15) #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16) +/** + * VFIO_IOMMU_DIRTY_PAGES - _IOWR(VFIO_TYPE, VFIO_BASE + 17, + * struct vfio_iommu_type1_dirty_bitmap) + * IOCTL is used for dirty pages logging. + * Caller should set flag depending on which operation to perform, details as + * below: + * + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_START flag set, instructs + * the IOMMU driver to log pages that are dirtied or potentially dirtied by + * the device; designed to be used when a migration is in progress. Dirty pages + * are logged until logging is disabled by user application by calling the IOCTL + * with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag. + * + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag set, instructs + * the IOMMU driver to stop logging dirtied pages. + * + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP flag set + * returns the dirty pages bitmap for IOMMU container for a given IOVA range. + * The user must specify the IOVA range and the pgsize through the structure + * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface + * supports getting a bitmap of the smallest supported pgsize only and can be + * modified in future to get a bitmap of any specified supported pgsize. The + * user must provide a zeroed memory area for the bitmap memory and specify its + * size in bitmap.size. One bit is used to represent one page consecutively + * starting from iova offset. The user should provide page size in bitmap.pgsize + * field. A bit set in the bitmap indicates that the page at that offset from + * iova is dirty. The caller must set argsz to a value including the size of + * structure vfio_iommu_type1_dirty_bitmap_get, but excluding the size of the + * actual bitmap. If dirty pages logging is not enabled, an error will be + * returned. + * + * Only one of the flags _START, _STOP and _GET may be specified at a time. 
+ * + */ +struct vfio_iommu_type1_dirty_bitmap { + __u32 argsz; + __u32 flags; +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2) + __u8 data[]; +}; + +struct vfio_iommu_type1_dirty_bitmap_get { + __u64 iova; /* IO virtual address */ + __u64 size; /* Size of iova range */ + struct vfio_bitmap bitmap; +}; + +#define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17) + /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ /* -- Gitee From a453e2d25acd05a53da4a6669061a6241426eb52 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Fri, 29 May 2020 02:00:51 +0530 Subject: [PATCH 1544/2161] vfio iommu: Implementation of ioctl for dirty pages tracking commit d6a4c185660cb978f76e46e3f5f6b08dc00ecc8f upstream. VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations: - Start dirty pages tracking while migration is active - Stop dirty pages tracking. - Get dirty pages bitmap. Its user space application's responsibility to copy content of dirty pages from source to destination during migration. To prevent DoS attack, memory for bitmap is allocated per vfio_dma structure. Bitmap size is calculated considering smallest supported page size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled Bitmap is populated for already pinned pages when bitmap is allocated for a vfio_dma with the smallest supported page size. Update bitmap from pinning functions when tracking is enabled. When user application queries bitmap, check if requested page size is same as page size used to populated bitmap. If it is equal, copy bitmap, but if not equal, return error. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Yan Zhao Fixed error reported by build bot by changing pgsize type from uint64_t to size_t. Reported-by: kbuild test robot Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 314 +++++++++++++++++++++++++++++++- 1 file changed, 308 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index a8f8d5a0fc09..147c4b61394e 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -73,6 +73,7 @@ struct vfio_iommu { uint64_t pgsize_bitmap; bool v2; bool nesting; + bool dirty_page_tracking; }; struct vfio_domain { @@ -93,6 +94,7 @@ struct vfio_dma { bool lock_cap; /* capable(CAP_IPC_LOCK) */ struct task_struct *task; struct rb_root pfn_list; /* Ex-user pinned pfn list */ + unsigned long *bitmap; }; struct vfio_group { @@ -127,6 +129,19 @@ struct vfio_regions { #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \ (!list_empty(&iommu->domain_list)) +#define DIRTY_BITMAP_BYTES(n) (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE) + +/* + * Input argument of number of bits to bitmap_set() is unsigned integer, which + * further casts to signed integer for unaligned multi-bit operation, + * __bitmap_set(). + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte, + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page + * system. 
+ */ +#define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX) +#define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX) + static int put_pfn(unsigned long pfn, int prot); /* @@ -177,6 +192,80 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old) rb_erase(&old->node, &iommu->dma_list); } + +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize) +{ + uint64_t npages = dma->size / pgsize; + + if (npages > DIRTY_BITMAP_PAGES_MAX) + return -EINVAL; + + /* + * Allocate extra 64 bits that are used to calculate shift required for + * bitmap_shift_left() to manipulate and club unaligned number of pages + * in adjacent vfio_dma ranges. + */ + dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64), + GFP_KERNEL); + if (!dma->bitmap) + return -ENOMEM; + + return 0; +} + +static void vfio_dma_bitmap_free(struct vfio_dma *dma) +{ + kfree(dma->bitmap); + dma->bitmap = NULL; +} + +static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize) +{ + struct rb_node *p; + + for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) { + struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node); + + bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1); + } +} + +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize) +{ + struct rb_node *n; + + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); + int ret; + + ret = vfio_dma_bitmap_alloc(dma, pgsize); + if (ret) { + struct rb_node *p; + + for (p = rb_prev(n); p; p = rb_prev(p)) { + struct vfio_dma *dma = rb_entry(n, + struct vfio_dma, node); + + vfio_dma_bitmap_free(dma); + } + return ret; + } + vfio_dma_populate_bitmap(dma, pgsize); + } + return 0; +} + +static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu) +{ + struct rb_node *n; + + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); + + vfio_dma_bitmap_free(dma); + } +} + /* * Helper Functions for host iova-pfn list */ @@ -610,6 +699,17 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, vfio_lock_acct(dma, -1, true); goto pin_unwind; } + + if (iommu->dirty_page_tracking) { + unsigned long pgshift = __ffs(iommu->pgsize_bitmap); + + /* + * Bitmap populated with the smallest supported page + * size + */ + bitmap_set(dma->bitmap, + (iova - dma->iova) >> pgshift, 1); + } } ret = i; @@ -845,6 +945,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma) vfio_unmap_unpin(iommu, dma, true); vfio_unlink_dma(iommu, dma); put_task_struct(dma->task); + vfio_dma_bitmap_free(dma); kfree(dma); iommu->dma_avail++; } @@ -872,6 +973,94 @@ static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu) } } +static int update_user_bitmap(u64 __user *bitmap, struct vfio_dma *dma, + dma_addr_t base_iova, size_t pgsize) +{ + unsigned long pgshift = __ffs(pgsize); + unsigned long nbits = dma->size >> pgshift; + unsigned long bit_offset = (dma->iova - base_iova) >> pgshift; + unsigned long copy_offset = bit_offset / BITS_PER_LONG; + unsigned long shift = bit_offset % BITS_PER_LONG; + unsigned long leftover; + + /* mark all pages dirty if all pages are pinned and mapped. 
*/ + if (dma->iommu_mapped) + bitmap_set(dma->bitmap, 0, nbits); + + if (shift) { + bitmap_shift_left(dma->bitmap, dma->bitmap, shift, + nbits + shift); + + if (copy_from_user(&leftover, + (const void *)(bitmap + copy_offset), + sizeof(leftover))) + return -EFAULT; + + bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift); + } + + if (copy_to_user((void *)(bitmap + copy_offset), dma->bitmap, + DIRTY_BITMAP_BYTES(nbits + shift))) + return -EFAULT; + + return 0; +} + +static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, + dma_addr_t iova, size_t size, size_t pgsize) +{ + struct vfio_dma *dma; + struct rb_node *n; + unsigned long pgshift = __ffs(pgsize); + int ret; + + /* + * GET_BITMAP request must fully cover vfio_dma mappings. Multiple + * vfio_dma mappings may be clubbed by specifying large ranges, but + * there must not be any previous mappings bisected by the range. + * An error will be returned if these conditions are not met. + */ + dma = vfio_find_dma(iommu, iova, 1); + if (dma && dma->iova != iova) + return -EINVAL; + + dma = vfio_find_dma(iommu, iova + size - 1, 0); + if (dma && dma->iova + dma->size != iova + size) + return -EINVAL; + + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); + + if (dma->iova < iova) + continue; + + if (dma->iova > iova + size - 1) + break; + + ret = update_user_bitmap(bitmap, dma, iova, pgsize); + if (ret) + return ret; + + /* + * Re-populate bitmap to include all pinned pages which are + * considered as dirty but exclude pages which are unpinned and + * pages which are marked dirty by vfio_dma_rw() + */ + bitmap_clear(dma->bitmap, 0, dma->size >> pgshift); + vfio_dma_populate_bitmap(dma, pgsize); + } + return 0; +} + +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size) +{ + if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) || + (bitmap_size < DIRTY_BITMAP_BYTES(npages))) + return -EINVAL; + + return 0; +} + static int vfio_dma_do_unmap(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_unmap *unmap) { @@ -1089,7 +1278,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, unsigned long vaddr = map->vaddr; size_t size = map->size; int ret = 0, prot = 0; - uint64_t mask; + size_t pgsize; struct vfio_dma *dma; /* Verify that none of our __u64 fields overflow */ @@ -1104,11 +1293,11 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, mutex_lock(&iommu->lock); - mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1; + pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap); - WARN_ON(mask & PAGE_MASK); + WARN_ON((pgsize - 1) & PAGE_MASK); - if (!prot || !size || (size | iova | vaddr) & mask) { + if (!prot || !size || (size | iova | vaddr) & (pgsize - 1)) { ret = -EINVAL; goto out_unlock; } @@ -1185,6 +1374,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, else ret = vfio_pin_map_dma(iommu, dma, size); + if (!ret && iommu->dirty_page_tracking) { + ret = vfio_dma_bitmap_alloc(dma, pgsize); + if (ret) + vfio_remove_dma(iommu, dma); + } + out_unlock: mutex_unlock(&iommu->lock); return ret; @@ -2395,6 +2590,104 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, return copy_to_user((void __user *)arg, &unmap, minsz) ? 
-EFAULT : 0; + } else if (cmd == VFIO_IOMMU_DIRTY_PAGES) { + struct vfio_iommu_type1_dirty_bitmap dirty; + uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START | + VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP | + VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; + int ret = 0; + + if (!iommu->v2) + return -EACCES; + + minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, + flags); + + if (copy_from_user(&dirty, (void __user *)arg, minsz)) + return -EFAULT; + + if (dirty.argsz < minsz || dirty.flags & ~mask) + return -EINVAL; + + /* only one flag should be set at a time */ + if (__ffs(dirty.flags) != __fls(dirty.flags)) + return -EINVAL; + + if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) { + size_t pgsize; + + mutex_lock(&iommu->lock); + pgsize = 1 << __ffs(iommu->pgsize_bitmap); + if (!iommu->dirty_page_tracking) { + ret = vfio_dma_bitmap_alloc_all(iommu, pgsize); + if (!ret) + iommu->dirty_page_tracking = true; + } + mutex_unlock(&iommu->lock); + return ret; + } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) { + mutex_lock(&iommu->lock); + if (iommu->dirty_page_tracking) { + iommu->dirty_page_tracking = false; + vfio_dma_bitmap_free_all(iommu); + } + mutex_unlock(&iommu->lock); + return 0; + } else if (dirty.flags & + VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) { + struct vfio_iommu_type1_dirty_bitmap_get range; + unsigned long pgshift; + size_t data_size = dirty.argsz - minsz; + size_t iommu_pgsize; + + if (!data_size || data_size < sizeof(range)) + return -EINVAL; + + if (copy_from_user(&range, (void __user *)(arg + minsz), + sizeof(range))) + return -EFAULT; + + if (range.iova + range.size < range.iova) + return -EINVAL; + if (!access_ok((void __user *)range.bitmap.data, + range.bitmap.size)) + return -EINVAL; + + pgshift = __ffs(range.bitmap.pgsize); + ret = verify_bitmap_size(range.size >> pgshift, + range.bitmap.size); + if (ret) + return ret; + + mutex_lock(&iommu->lock); + + iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap); + + /* allow only smallest supported pgsize */ + if (range.bitmap.pgsize != iommu_pgsize) { + ret = -EINVAL; + goto out_unlock; + } + if (range.iova & (iommu_pgsize - 1)) { + ret = -EINVAL; + goto out_unlock; + } + if (!range.size || range.size & (iommu_pgsize - 1)) { + ret = -EINVAL; + goto out_unlock; + } + + if (iommu->dirty_page_tracking) + ret = vfio_iova_dirty_bitmap(range.bitmap.data, + iommu, range.iova, range.size, + range.bitmap.pgsize); + else + ret = -EINVAL; +out_unlock: + mutex_unlock(&iommu->lock); + + return ret; + } } return -ENOTTY; @@ -2462,10 +2755,19 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, vaddr = dma->vaddr + offset; - if (write) + if (write) { *copied = copy_to_user((void __user *)vaddr, data, count) ? 0 : count; - else + if (*copied && iommu->dirty_page_tracking) { + unsigned long pgshift = __ffs(iommu->pgsize_bitmap); + /* + * Bitmap populated with the smallest supported page + * size + */ + bitmap_set(dma->bitmap, offset >> pgshift, + *copied >> pgshift); + } + } else *copied = copy_from_user(data, (void __user *)vaddr, count) ? 0 : count; if (kthread) -- Gitee From 69c02956427bbf110435aec2850aecc07153a245 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Fri, 29 May 2020 02:00:52 +0530 Subject: [PATCH 1545/2161] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap commit 331e33d2960c8292bad8b02578fcfac18f721517 upstream. DMA mapped pages, including those pinned by mdev vendor drivers, might get unpinned and unmapped while migration is active and device is still running. 
For example, in pre-copy phase while guest driver could access those pages, host device or vendor driver can dirty these mapped pages. Such pages should be marked dirty so as to maintain memory consistency for a user making use of dirty page tracking. To get bitmap during unmap, user should allocate memory for bitmap, set it all zeros, set size of allocated memory, set page size to be considered for bitmap and set flag VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Cornelia Huck Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 61 +++++++++++++++++++++++++++------ include/uapi/linux/vfio.h | 11 ++++++ 2 files changed, 61 insertions(+), 11 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 147c4b61394e..2f6f0bfadfa3 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -1062,23 +1062,25 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size) } static int vfio_dma_do_unmap(struct vfio_iommu *iommu, - struct vfio_iommu_type1_dma_unmap *unmap) + struct vfio_iommu_type1_dma_unmap *unmap, + struct vfio_bitmap *bitmap) { - uint64_t mask; struct vfio_dma *dma, *dma_last = NULL; - size_t unmapped = 0; + size_t unmapped = 0, pgsize; int ret = 0, retries = 0; + unsigned long pgshift; mutex_lock(&iommu->lock); - mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1; + pgshift = __ffs(iommu->pgsize_bitmap); + pgsize = (size_t)1 << pgshift; - if (unmap->iova & mask) { + if (unmap->iova & (pgsize - 1)) { ret = -EINVAL; goto unlock; } - if (!unmap->size || unmap->size & mask) { + if (!unmap->size || unmap->size & (pgsize - 1)) { ret = -EINVAL; goto unlock; } @@ -1089,9 +1091,15 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, goto unlock; } - WARN_ON(mask & PAGE_MASK); -again: + /* When dirty tracking is enabled, allow only min supported pgsize */ + if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) && + (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) { + ret = -EINVAL; + goto unlock; + } + WARN_ON((pgsize - 1) & PAGE_MASK); +again: /* * vfio-iommu-type1 (v1) - User mappings were coalesced together to * avoid tracking individual mappings. 
This means that the granularity @@ -1172,6 +1180,14 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, mutex_lock(&iommu->lock); goto again; } + + if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { + ret = update_user_bitmap(bitmap->data, dma, + unmap->iova, pgsize); + if (ret) + break; + } + unmapped += dma->size; vfio_remove_dma(iommu, dma); } @@ -2574,17 +2590,40 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, } else if (cmd == VFIO_IOMMU_UNMAP_DMA) { struct vfio_iommu_type1_dma_unmap unmap; - long ret; + struct vfio_bitmap bitmap = { 0 }; + int ret; minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); if (copy_from_user(&unmap, (void __user *)arg, minsz)) return -EFAULT; - if (unmap.argsz < minsz || unmap.flags) + if (unmap.argsz < minsz || + unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) return -EINVAL; - ret = vfio_dma_do_unmap(iommu, &unmap); + if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { + unsigned long pgshift; + + if (unmap.argsz < (minsz + sizeof(bitmap))) + return -EINVAL; + + if (copy_from_user(&bitmap, + (void __user *)(arg + minsz), + sizeof(bitmap))) + return -EFAULT; + + if (!access_ok((void __user *)bitmap.data, bitmap.size)) + return -EINVAL; + + pgshift = __ffs(bitmap.pgsize); + ret = verify_bitmap_size(unmap.size >> pgshift, + bitmap.size); + if (ret) + return ret; + } + + ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap); if (ret) return ret; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index c628aa719886..5046f69ffde0 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1063,12 +1063,23 @@ struct vfio_bitmap { * field. No guarantee is made to the user that arbitrary unmaps of iova * or size different from those used in the original mapping call will * succeed. + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap + * before unmapping IO virtual addresses. When this flag is set, the user must + * provide a struct vfio_bitmap in data[]. User must provide zero-allocated + * memory via vfio_bitmap.data and its size in the vfio_bitmap.size field. + * A bit in the bitmap represents one page, of user provided page size in + * vfio_bitmap.pgsize field, consecutively starting from iova offset. Bit set + * indicates that the page at that offset from iova is dirty. A Bitmap of the + * pages in the range of unmapped size is returned in the user-provided + * vfio_bitmap.data. */ struct vfio_iommu_type1_dma_unmap { __u32 argsz; __u32 flags; +#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0) __u64 iova; /* IO virtual address */ __u64 size; /* Size of mapping (bytes) */ + __u8 data[]; }; #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) -- Gitee From ce8d1b7607b2cdb59cc4a071efe1211d99f193aa Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Fri, 29 May 2020 02:00:53 +0530 Subject: [PATCH 1546/2161] vfio iommu: Add migration capability to report supported features commit ad721705d09c62f0d108a6b4f59867ebfd592c90 upstream. Added migration capability in IOMMU info chain. User application should check IOMMU info chain for migration capability to use dirty page tracking feature provided by kernel module. User application must check page sizes supported and maximum dirty bitmap size returned by this capability structure for ioctls used to get dirty bitmap. 
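For reference, a sketch of how user space might discover this capability (editorial illustration, not part of the patch; it assumes the usual two-call VFIO_IOMMU_GET_INFO pattern and the vfio_info_cap_header chain layout from <linux/vfio.h>):

/* Illustrative user-space discovery of the migration capability (sketch). */
#include <linux/vfio.h>
#include <sys/ioctl.h>
#include <stdlib.h>

static int query_migration_cap(int container, __u64 *pgsizes, __u64 *max_bitmap)
{
	struct vfio_iommu_type1_info *info;
	__u32 argsz = sizeof(*info), off;
	int ret = -1;

	info = calloc(1, argsz);
	if (!info)
		return -1;
	info->argsz = argsz;

	if (ioctl(container, VFIO_IOMMU_GET_INFO, info))
		goto out;

	if (info->argsz > argsz) {	/* capability chain did not fit */
		void *tmp;

		argsz = info->argsz;
		tmp = realloc(info, argsz);
		if (!tmp)
			goto out;
		info = tmp;
		info->argsz = argsz;
		if (ioctl(container, VFIO_IOMMU_GET_INFO, info))
			goto out;
	}

	if (!(info->flags & VFIO_IOMMU_INFO_CAPS))
		goto out;

	for (off = info->cap_offset; off; ) {
		struct vfio_info_cap_header *hdr =
			(struct vfio_info_cap_header *)((char *)info + off);

		if (hdr->id == VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION) {
			struct vfio_iommu_type1_info_cap_migration *mig =
				(void *)hdr;

			*pgsizes = mig->pgsize_bitmap;
			*max_bitmap = mig->max_dirty_bitmap_size;
			ret = 0;
			break;
		}
		off = hdr->next;	/* offset from the start of the buffer */
	}
out:
	free(info);
	return ret;
}

A consumer would typically cache pgsize_bitmap and max_dirty_bitmap_size and use them to size the buffers it later passes to VFIO_IOMMU_DIRTY_PAGES.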
Signed-off-by: Kirti Wankhede Reviewed-by: Cornelia Huck Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 23 ++++++++++++++++++++++- include/uapi/linux/vfio.h | 23 +++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 2f6f0bfadfa3..862179cb07da 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2497,6 +2497,22 @@ static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu, return ret; } +static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu, + struct vfio_info_cap *caps) +{ + struct vfio_iommu_type1_info_cap_migration cap_mig; + + cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION; + cap_mig.header.version = 1; + + cap_mig.flags = 0; + /* support minimum pgsize */ + cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap); + cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX; + + return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig)); +} + static long vfio_iommu_type1_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { @@ -2543,8 +2559,13 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, info.iova_pgsizes = iommu->pgsize_bitmap; - ret = vfio_iommu_iova_build_caps(iommu, &caps); + ret = vfio_iommu_migration_build_caps(iommu, &caps); + + if (!ret) + ret = vfio_iommu_iova_build_caps(iommu, &caps); + mutex_unlock(&iommu->lock); + if (!ret) ret = vfio_iommu_dma_avail_build_caps(iommu, &caps); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 5046f69ffde0..80c1d67a92ac 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1028,6 +1028,29 @@ struct vfio_iommu_type1_info_dma_avail { __u32 avail; }; +/* + * The migration capability allows to report supported features for migration. + * + * The structures below define version 1 of this capability. + * + * The existence of this capability indicates that IOMMU kernel driver supports + * dirty page logging. + * + * pgsize_bitmap: Kernel driver returns bitmap of supported page sizes for dirty + * page logging. + * max_dirty_bitmap_size: Kernel driver returns maximum supported dirty bitmap + * size in bytes that can be used by user applications when getting the dirty + * bitmap. + */ +#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION 1 + +struct vfio_iommu_type1_info_cap_migration { + struct vfio_info_cap_header header; + __u32 flags; + __u64 pgsize_bitmap; + __u64 max_dirty_bitmap_size; /* in bytes */ +}; + #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) /** -- Gitee From 6944790350a67f7b28e9ca7a4c6425ca7eefd58e Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Fri, 29 May 2020 02:00:54 +0530 Subject: [PATCH 1547/2161] vfio: Selective dirty page tracking if IOMMU backed device pins pages commit 95fc87b44104d9a524ff3e975bbfbd7c1f1a2dd5 upstream. Added a check such that only singleton IOMMU groups can pin pages. >From the point when vendor driver pins any pages, consider IOMMU group dirty page scope to be limited to pinned pages. To optimize to avoid walking list often, added flag pinned_page_dirty_scope to indicate if all of the vfio_groups for each vfio_domain in the domain_list dirty page scope is limited to pinned pages. This flag is updated on first pinned pages request for that IOMMU group and on attaching/detaching group. 
Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio.c | 13 +++- drivers/vfio/vfio_iommu_type1.c | 103 +++++++++++++++++++++++++++++--- include/linux/vfio.h | 4 +- 3 files changed, 109 insertions(+), 11 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 765e0e5d83ed..580099afeaff 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -85,6 +85,7 @@ struct vfio_group { atomic_t opened; wait_queue_head_t container_q; bool noiommu; + unsigned int dev_counter; struct kvm *kvm; struct blocking_notifier_head notifier; }; @@ -555,6 +556,7 @@ struct vfio_device *vfio_group_create_device(struct vfio_group *group, mutex_lock(&group->device_lock); list_add(&device->group_next, &group->device_list); + group->dev_counter++; mutex_unlock(&group->device_lock); return device; @@ -567,6 +569,7 @@ static void vfio_device_release(struct kref *kref) struct vfio_group *group = device->group; list_del(&device->group_next); + group->dev_counter--; mutex_unlock(&group->device_lock); dev_set_drvdata(device->dev, NULL); @@ -1945,6 +1948,9 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, if (!group) return -ENODEV; + if (group->dev_counter > 1) + return -EINVAL; + ret = vfio_group_add_container_user(group); if (ret) goto err_pin_pages; @@ -1952,7 +1958,8 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, container = group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->pin_pages)) - ret = driver->ops->pin_pages(container->iommu_data, user_pfn, + ret = driver->ops->pin_pages(container->iommu_data, + group->iommu_group, user_pfn, npage, prot, phys_pfn); else ret = -ENOTTY; @@ -2050,8 +2057,8 @@ int vfio_group_pin_pages(struct vfio_group *group, driver = container->iommu_driver; if (likely(driver && driver->ops->pin_pages)) ret = driver->ops->pin_pages(container->iommu_data, - user_iova_pfn, npage, - prot, phys_pfn); + group->iommu_group, user_iova_pfn, + npage, prot, phys_pfn); else ret = -ENOTTY; diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 862179cb07da..e6df843d7072 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -74,6 +74,7 @@ struct vfio_iommu { bool v2; bool nesting; bool dirty_page_tracking; + bool pinned_page_dirty_scope; }; struct vfio_domain { @@ -101,6 +102,7 @@ struct vfio_group { struct iommu_group *iommu_group; struct list_head next; bool mdev_group; /* An mdev group */ + bool pinned_page_dirty_scope; }; struct vfio_iova { @@ -144,6 +146,10 @@ struct vfio_regions { static int put_pfn(unsigned long pfn, int prot); +static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, + struct iommu_group *iommu_group); + +static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu); /* * This code handles mapping and unmapping of user data buffers * into DMA'ble space using the IOMMU @@ -633,11 +639,13 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova, } static int vfio_iommu_type1_pin_pages(void *iommu_data, + struct iommu_group *iommu_group, unsigned long *user_pfn, int npage, int prot, unsigned long *phys_pfn) { struct vfio_iommu *iommu = iommu_data; + struct vfio_group *group; int i, j, ret; unsigned long remote_vaddr; struct vfio_dma *dma; @@ -711,8 +719,14 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, (iova - dma->iova) >> 
pgshift, 1); } } - ret = i; + + group = vfio_iommu_find_iommu_group(iommu, iommu_group); + if (!group->pinned_page_dirty_scope) { + group->pinned_page_dirty_scope = true; + update_pinned_page_dirty_scope(iommu); + } + goto pin_done; pin_unwind: @@ -973,8 +987,9 @@ static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu) } } -static int update_user_bitmap(u64 __user *bitmap, struct vfio_dma *dma, - dma_addr_t base_iova, size_t pgsize) +static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, + struct vfio_dma *dma, dma_addr_t base_iova, + size_t pgsize) { unsigned long pgshift = __ffs(pgsize); unsigned long nbits = dma->size >> pgshift; @@ -983,8 +998,11 @@ static int update_user_bitmap(u64 __user *bitmap, struct vfio_dma *dma, unsigned long shift = bit_offset % BITS_PER_LONG; unsigned long leftover; - /* mark all pages dirty if all pages are pinned and mapped. */ - if (dma->iommu_mapped) + /* + * mark all pages dirty if any IOMMU capable device is not able + * to report dirty pages and all pages are pinned and mapped. + */ + if (!iommu->pinned_page_dirty_scope && dma->iommu_mapped) bitmap_set(dma->bitmap, 0, nbits); if (shift) { @@ -1037,7 +1055,7 @@ static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, if (dma->iova > iova + size - 1) break; - ret = update_user_bitmap(bitmap, dma, iova, pgsize); + ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize); if (ret) return ret; @@ -1182,7 +1200,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, } if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { - ret = update_user_bitmap(bitmap->data, dma, + ret = update_user_bitmap(bitmap->data, iommu, dma, unmap->iova, pgsize); if (ret) break; @@ -1595,6 +1613,51 @@ static struct vfio_group *find_iommu_group(struct vfio_domain *domain, return NULL; } +static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, + struct iommu_group *iommu_group) +{ + struct vfio_domain *domain; + struct vfio_group *group = NULL; + + list_for_each_entry(domain, &iommu->domain_list, next) { + group = find_iommu_group(domain, iommu_group); + if (group) + return group; + } + + if (iommu->external_domain) + group = find_iommu_group(iommu->external_domain, iommu_group); + + return group; +} + +static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu) +{ + struct vfio_domain *domain; + struct vfio_group *group; + + list_for_each_entry(domain, &iommu->domain_list, next) { + list_for_each_entry(group, &domain->group_list, next) { + if (!group->pinned_page_dirty_scope) { + iommu->pinned_page_dirty_scope = false; + return; + } + } + } + + if (iommu->external_domain) { + domain = iommu->external_domain; + list_for_each_entry(group, &domain->group_list, next) { + if (!group->pinned_page_dirty_scope) { + iommu->pinned_page_dirty_scope = false; + return; + } + } + } + + iommu->pinned_page_dirty_scope = true; +} + static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions, phys_addr_t *base) { @@ -2002,6 +2065,16 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, list_add(&group->next, &iommu->external_domain->group_list); + /* + * Non-iommu backed group cannot dirty memory directly, + * it can only use interfaces that provide dirty + * tracking. + * The iommu scope can only be promoted with the + * addition of a dirty tracking group. 
+ */ + group->pinned_page_dirty_scope = true; + if (!iommu->pinned_page_dirty_scope) + update_pinned_page_dirty_scope(iommu); mutex_unlock(&iommu->lock); return 0; @@ -2124,6 +2197,13 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, done: /* Delete the old one and insert new iova list */ vfio_iommu_iova_insert_copy(iommu, &iova_copy); + + /* + * An iommu backed group can dirty memory directly and therefore + * demotes the iommu scope until it declares itself dirty tracking + * capable via the page pinning interface. + */ + iommu->pinned_page_dirty_scope = false; mutex_unlock(&iommu->lock); vfio_iommu_resv_free(&group_resv_regions); @@ -2258,6 +2338,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, struct vfio_iommu *iommu = iommu_data; struct vfio_domain *domain; struct vfio_group *group; + bool update_dirty_scope = false; LIST_HEAD(iova_copy); mutex_lock(&iommu->lock); @@ -2265,6 +2346,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, if (iommu->external_domain) { group = find_iommu_group(iommu->external_domain, iommu_group); if (group) { + update_dirty_scope = !group->pinned_page_dirty_scope; list_del(&group->next); kfree(group); @@ -2294,6 +2376,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, continue; vfio_iommu_detach_group(domain, group); + update_dirty_scope = !group->pinned_page_dirty_scope; list_del(&group->next); kfree(group); /* @@ -2327,6 +2410,12 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, vfio_iommu_iova_free(&iova_copy); detach_group_done: + /* + * Removal of a group without dirty tracking may allow the iommu scope + * to be promoted. + */ + if (update_dirty_scope) + update_pinned_page_dirty_scope(iommu); mutex_unlock(&iommu->lock); } diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 5d92ee15d098..38d3c6a8dc7e 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -76,7 +76,9 @@ struct vfio_iommu_driver_ops { struct iommu_group *group); void (*detach_group)(void *iommu_data, struct iommu_group *group); - int (*pin_pages)(void *iommu_data, unsigned long *user_pfn, + int (*pin_pages)(void *iommu_data, + struct iommu_group *group, + unsigned long *user_pfn, int npage, int prot, unsigned long *phys_pfn); int (*unpin_pages)(void *iommu_data, -- Gitee From 204a160597a212a8caabc7afc41d653036ef2fad Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Wed, 3 Jun 2020 00:12:36 +0530 Subject: [PATCH 1548/2161] vfio iommu: Use shift operation for 64-bit integer division commit cd0bb41ea86027d3a5c3555772692323cbc05bc1 upstream. Fixes compilation error with ARCH=i386. 
Error fixed by this commit: ld: drivers/vfio/vfio_iommu_type1.o: in function `vfio_dma_populate_bitmap': >> vfio_iommu_type1.c:(.text+0x666): undefined reference to `__udivdi3' Fixes: d6a4c185660c ("vfio iommu: Implementation of ioctl for dirty pages tracking") Reported-by: kbuild test robot Signed-off-by: Kirti Wankhede Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index e6df843d7072..11ef17f50ca6 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -228,11 +228,12 @@ static void vfio_dma_bitmap_free(struct vfio_dma *dma) static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize) { struct rb_node *p; + unsigned long pgshift = __ffs(pgsize); for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) { struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node); - bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1); + bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1); } } -- Gitee From 22987153b00aacbc62d15296feebc654e00cbf7b Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Wed, 3 Jun 2020 00:12:37 +0530 Subject: [PATCH 1549/2161] vfio iommu: typecast corrections commit c8e9df4744a3e4d897a6ac6e71a38aa8d7b65aa0 upstream. Fixes sparse warnings by adding '__user' in typecast for copy_[from,to]_user() Fixes: d6a4c185660c ("vfio iommu: Implementation of ioctl for dirty pages tracking") Reported-by: kbuild test robot Signed-off-by: Kirti Wankhede Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 11ef17f50ca6..abc921ebc328 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -1011,14 +1011,14 @@ static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, nbits + shift); if (copy_from_user(&leftover, - (const void *)(bitmap + copy_offset), + (void __user *)(bitmap + copy_offset), sizeof(leftover))) return -EFAULT; bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift); } - if (copy_to_user((void *)(bitmap + copy_offset), dma->bitmap, + if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap, DIRTY_BITMAP_BYTES(nbits + shift))) return -EFAULT; -- Gitee From caff72471d6eb6d8687b67d0bdb9d16ffb01bb39 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 18 Jun 2020 13:07:13 -0600 Subject: [PATCH 1550/2161] vfio/type1: Fix migration info capability ID commit f751820bc319a30d31a80fe511d88642aaefcdee upstream. ID 1 is already used by the IOVA range capability, use ID 2. Reported-by: Liu Yi L Fixes: ad721705d09c ("vfio iommu: Add migration capability to report supported features") Reviewed-by: Kirti Wankhede Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/vfio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 80c1d67a92ac..1831fb7beb12 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1042,7 +1042,7 @@ struct vfio_iommu_type1_info_dma_avail { * size in bytes that can be used by user applications when getting the dirty * bitmap. 
*/ -#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION 1 +#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION 2 struct vfio_iommu_type1_info_cap_migration { struct vfio_info_cap_header header; -- Gitee From 49a6a7d967cfef975c7a4c09be92ce4f820f8d8a Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 27 Jul 2020 13:43:36 -0600 Subject: [PATCH 1551/2161] vfio: Cleanup allowed driver naming commit 26afdd98821bd36e4e1de88dd00a8c70d83accdf upstream. No functional change, avoid non-inclusive naming schemes. Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 580099afeaff..262ab0efd06c 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -627,9 +627,10 @@ static struct vfio_device *vfio_group_get_device(struct vfio_group *group, * that error notification via MSI can be affected for platforms that handle * MSI within the same IOVA space as DMA. */ -static const char * const vfio_driver_whitelist[] = { "pci-stub" }; +static const char * const vfio_driver_allowed[] = { "pci-stub" }; -static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv) +static bool vfio_dev_driver_allowed(struct device *dev, + struct device_driver *drv) { if (dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(dev); @@ -638,8 +639,8 @@ static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv) return true; } - return match_string(vfio_driver_whitelist, - ARRAY_SIZE(vfio_driver_whitelist), + return match_string(vfio_driver_allowed, + ARRAY_SIZE(vfio_driver_allowed), drv->name) >= 0; } @@ -648,7 +649,7 @@ static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv) * one of the following states: * - driver-less * - bound to a vfio driver - * - bound to a whitelisted driver + * - bound to an otherwise allowed driver * - a PCI interconnect device * * We use two methods to determine whether a device is bound to a vfio @@ -674,7 +675,7 @@ static int vfio_dev_viable(struct device *dev, void *data) } mutex_unlock(&group->unbound_lock); - if (!ret || !drv || vfio_dev_whitelisted(dev, drv)) + if (!ret || !drv || vfio_dev_driver_allowed(dev, drv)) return 0; device = vfio_group_get_device(group, dev); -- Gitee From 8bc8559f849d339068d4c2344b199adf4adc34f1 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 27 Jul 2020 13:43:37 -0600 Subject: [PATCH 1552/2161] vfio/pci: Add Intel X550 to hidden INTx devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bf3551e150e5a231ac60f2bd750f63ab5484010a upstream. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Intel document 333717-008, "Intel® Ethernet Controller X550 Specification Update", version 2.7, dated June 2020, includes errata #22, added in version 2.1, May 2016, indicating X550 NICs suffer from the same implementation deficiency as the 700-series NICs: "The Interrupt Status bit in the Status register of the PCIe configuration space is not implemented and is not set as described in the PCIe specification." Without the interrupt status bit, vfio-pci cannot determine when these devices signal INTx. They are therefore added to the nointx quirk. 
Cc: Jesse Brandeburg Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 2200190c9c3a..06482665f629 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -205,6 +205,8 @@ static bool vfio_pci_nointx(struct pci_dev *pdev) case 0x1580 ... 0x1581: case 0x1583 ... 0x158b: case 0x37d0 ... 0x37d2: + /* X550 */ + case 0x1563: return true; default: return false; -- Gitee From 1f60c65327602efb869c8d4a31eace72537d0e31 Mon Sep 17 00:00:00 2001 From: Xiang Zheng Date: Mon, 27 Jul 2020 13:43:37 -0600 Subject: [PATCH 1553/2161] vfio/type1: Add conditional rescheduling after iommu map failed commit e1907d6752a26ccfcda1573b5a7542b056553046 upstream. Commit c5e6688752c2 ("vfio/type1: Add conditional rescheduling") missed a "cond_resched()" in vfio_iommu_map if iommu map failed. This is a very tiny optimization and the case can hardly happen. Signed-off-by: Xiang Zheng Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index abc921ebc328..113e9538b9a3 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -1238,8 +1238,10 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova, return 0; unwind: - list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) + list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) { iommu_unmap(d->domain, iova, npage << PAGE_SHIFT); + cond_resched(); + } return ret; } -- Gitee From 0af749108330a2ca6ce586dda677fb8c03312e63 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 27 Jul 2020 13:43:38 -0600 Subject: [PATCH 1554/2161] vfio/spapr_tce: convert get_user_pages() --> pin_user_pages() commit 9d532f286914add61e02312881733a90355f153c upstream. This code was using get_user_pages*(), in a "Case 2" scenario (DMA/RDMA), using the categorization from [1]. That means that it's time to convert the get_user_pages*() + put_page() calls to pin_user_pages*() + unpin_user_pages() calls. There is some helpful background in [2]: basically, this is a small part of fixing a long-standing disconnect between pinning pages, and file systems' use of those pages. 
[1] Documentation/core-api/pin_user_pages.rst [2] "Explicit pinning of user-space pages": https://lwn.net/Articles/807108/ Cc: Alex Williamson Cc: Cornelia Huck Cc: kvm@vger.kernel.org Signed-off-by: John Hubbard Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_spapr_tce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 16b3adc508db..fe888b5dcc00 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -383,7 +383,7 @@ static void tce_iommu_unuse_page(struct tce_container *container, struct page *page; page = pfn_to_page(hpa >> PAGE_SHIFT); - put_page(page); + unpin_user_page(page); } static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container, @@ -486,7 +486,7 @@ static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa) struct page *page = NULL; enum dma_data_direction direction = iommu_tce_direction(tce); - if (get_user_pages_fast(tce & PAGE_MASK, 1, + if (pin_user_pages_fast(tce & PAGE_MASK, 1, direction != DMA_TO_DEVICE ? FOLL_WRITE : 0, &page) != 1) return -EFAULT; -- Gitee From 6be500233c84de5304989ccd64c519015ba4a6b1 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 27 Jul 2020 13:43:38 -0600 Subject: [PATCH 1555/2161] vfio/pci: Hold igate across releasing eventfd contexts commit 924b51abf961f1d888143fcfa3d63c92dc517679 upstream. No need to release and immediately re-acquire igate while clearing out the eventfd ctxs. Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 06482665f629..59f2c10ba406 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -521,14 +521,12 @@ static void vfio_pci_release(void *device_data) vfio_pci_vf_token_user_add(vdev, -1); vfio_spapr_pci_eeh_release(vdev->pdev); vfio_pci_disable(vdev); + mutex_lock(&vdev->igate); if (vdev->err_trigger) { eventfd_ctx_put(vdev->err_trigger); vdev->err_trigger = NULL; } - mutex_unlock(&vdev->igate); - - mutex_lock(&vdev->igate); if (vdev->req_trigger) { eventfd_ctx_put(vdev->req_trigger); vdev->req_trigger = NULL; -- Gitee From 1e96aa231cdbfa146c047a91eb940458415128f1 Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Mon, 27 Jul 2020 13:43:39 -0600 Subject: [PATCH 1556/2161] PCI: Add Intel QuickAssist device IDs commit 8b7beaf9f185249f29912b5e2d7bc4147c5c2a6a upstream. Add device IDs for the following Intel QuickAssist devices: DH895XCC, C3XXX and C62X. The defines in this patch are going to be referenced in two independent drivers, qat and vfio-pci. 
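As a hypothetical illustration only (neither driver is required to use an ID table this way), a match entry built from one of the new defines could look like:

#include <linux/pci.h>

/* Hypothetical table; shown only to illustrate use of the new defines. */
static const struct pci_device_id example_qat_ids[] = {
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_QAT_DH895XCC) },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF) },
	{ 0, }
};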
Signed-off-by: Giovanni Cabiddu Acked-by: Bjorn Helgaas Reviewed-by: Fiona Trahe Reviewed-by: Andy Shevchenko Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/pci_ids.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 0ad57693f392..f3166b1425ca 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2659,6 +2659,8 @@ #define PCI_DEVICE_ID_INTEL_80332_1 0x0332 #define PCI_DEVICE_ID_INTEL_80333_0 0x0370 #define PCI_DEVICE_ID_INTEL_80333_1 0x0372 +#define PCI_DEVICE_ID_INTEL_QAT_DH895XCC 0x0435 +#define PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF 0x0443 #define PCI_DEVICE_ID_INTEL_82375 0x0482 #define PCI_DEVICE_ID_INTEL_82424 0x0483 #define PCI_DEVICE_ID_INTEL_82378 0x0484 @@ -2708,6 +2710,8 @@ #define PCI_DEVICE_ID_INTEL_ALPINE_RIDGE_4C_NHI 0x1577 #define PCI_DEVICE_ID_INTEL_ALPINE_RIDGE_4C_BRIDGE 0x1578 #define PCI_DEVICE_ID_INTEL_80960_RP 0x1960 +#define PCI_DEVICE_ID_INTEL_QAT_C3XXX 0x19e2 +#define PCI_DEVICE_ID_INTEL_QAT_C3XXX_VF 0x19e3 #define PCI_DEVICE_ID_INTEL_82840_HB 0x1a21 #define PCI_DEVICE_ID_INTEL_82845_HB 0x1a30 #define PCI_DEVICE_ID_INTEL_IOAT 0x1a38 @@ -2924,6 +2928,8 @@ #define PCI_DEVICE_ID_INTEL_IOAT_JSF7 0x3717 #define PCI_DEVICE_ID_INTEL_IOAT_JSF8 0x3718 #define PCI_DEVICE_ID_INTEL_IOAT_JSF9 0x3719 +#define PCI_DEVICE_ID_INTEL_QAT_C62X 0x37c8 +#define PCI_DEVICE_ID_INTEL_QAT_C62X_VF 0x37c9 #define PCI_DEVICE_ID_INTEL_ICH10_0 0x3a14 #define PCI_DEVICE_ID_INTEL_ICH10_1 0x3a16 #define PCI_DEVICE_ID_INTEL_ICH10_2 0x3a18 -- Gitee From 381e07b7657c1257aca47a3c583194334b4a2700 Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Mon, 27 Jul 2020 13:43:40 -0600 Subject: [PATCH 1557/2161] vfio/pci: Add device denylist commit 1f97970e6c8e60349b045599f4525608db895009 upstream. Add denylist of devices that by default are not probed by vfio-pci. Devices in this list may be susceptible to untrusted application, even if the IOMMU is enabled. To be accessed via vfio-pci, the user has to explicitly disable the denylist. The denylist can be disabled via the module parameter disable_denylist. Signed-off-by: Giovanni Cabiddu Reviewed-by: Cornelia Huck Reviewed-by: Fiona Trahe Reviewed-by: Andy Shevchenko Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 59f2c10ba406..8baa23e88b5a 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -60,6 +60,10 @@ module_param(enable_sriov, bool, 0644); MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV configuration. Enabling SR-IOV on a PF typically requires support of the userspace PF driver, enabling VFs without such support may result in non-functional VFs or PF."); #endif +static bool disable_denylist; +module_param(disable_denylist, bool, 0444); +MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. 
Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users."); + static inline bool vfio_vga_disabled(void) { #ifdef CONFIG_VFIO_PCI_VGA @@ -69,6 +73,29 @@ static inline bool vfio_vga_disabled(void) #endif } +static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev) +{ + return false; +} + +static bool vfio_pci_is_denylisted(struct pci_dev *pdev) +{ + if (!vfio_pci_dev_in_denylist(pdev)) + return false; + + if (disable_denylist) { + pci_warn(pdev, + "device denylist disabled - allowing device %04x:%04x.\n", + pdev->vendor, pdev->device); + return false; + } + + pci_warn(pdev, "%04x:%04x exists in vfio-pci device denylist, driver probing disallowed.\n", + pdev->vendor, pdev->device); + + return true; +} + /* * Our VGA arbiter participation is limited since we don't know anything * about the device itself. However, if the device is the only VGA device @@ -1856,6 +1883,9 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) struct iommu_group *group; int ret; + if (vfio_pci_is_denylisted(pdev)) + return -EINVAL; + if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) return -EINVAL; @@ -2346,6 +2376,9 @@ static int __init vfio_pci_init(void) vfio_pci_fill_ids(); + if (disable_denylist) + pr_warn("device denylist disabled.\n"); + return 0; out_driver: -- Gitee From f866402377b71c33232871aedc775b49a4696ad3 Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Mon, 27 Jul 2020 13:43:40 -0600 Subject: [PATCH 1558/2161] vfio/pci: Add QAT devices to denylist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 50173329c8cc0c892eaa7a9d0f0692ac39cd7b04 upstream. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current generation of Intel® QuickAssist Technology devices are not designed to run in an untrusted environment because of the following issues reported in the document "Intel® QuickAssist Technology (Intel® QAT) Software for Linux" (document number 336211-014): QATE-39220 - GEN - Intel® QAT API submissions with bad addresses that trigger DMA to invalid or unmapped addresses can cause a platform hang QATE-7495 - GEN - An incorrectly formatted request to Intel® QAT can hang the entire Intel® QAT Endpoint The document is downloadable from https://01.org/intel-quickassist-technology at the following link: https://01.org/sites/default/files/downloads/336211-014-qatforlinux-releasenotes-hwv1.7_0.pdf This patch adds the following QAT devices to the denylist: DH895XCC, C3XXX and C62X. 
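With these entries in place, vfio_pci_probe() refuses to bind any of the listed functions by default. Binding one of them anyway requires explicitly opting out at module load time via the disable_denylist parameter introduced in the previous patch (for example "modprobe vfio-pci disable_denylist=1", an illustrative invocation, not text from this series); the parameter is read-only at runtime (permission 0444), and the driver warns both at module init and at probe time when the denylist is bypassed.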
Signed-off-by: Giovanni Cabiddu Reviewed-by: Fiona Trahe Reviewed-by: Andy Shevchenko Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 8baa23e88b5a..f862ba663b9a 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -75,6 +75,21 @@ static inline bool vfio_vga_disabled(void) static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev) { + switch (pdev->vendor) { + case PCI_VENDOR_ID_INTEL: + switch (pdev->device) { + case PCI_DEVICE_ID_INTEL_QAT_C3XXX: + case PCI_DEVICE_ID_INTEL_QAT_C3XXX_VF: + case PCI_DEVICE_ID_INTEL_QAT_C62X: + case PCI_DEVICE_ID_INTEL_QAT_C62X_VF: + case PCI_DEVICE_ID_INTEL_QAT_DH895XCC: + case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF: + return true; + default: + return false; + } + } + return false; } -- Gitee From ea0fd67100838871351da62f65522c8b21a7df68 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Sun, 12 Jul 2020 04:20:56 -0700 Subject: [PATCH 1559/2161] vfio/type1: Refactor vfio_iommu_type1_ioctl() commit ccd59dce1a21f473518bf273bdf5b182bab955b3 upstream. This patch refactors the vfio_iommu_type1_ioctl() to use switch instead of if-else, and each command got a helper function. Cc: Kevin Tian CC: Jacob Pan Cc: Alex Williamson Cc: Eric Auger Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Lu Baolu Reviewed-by: Eric Auger Suggested-by: Christoph Hellwig Signed-off-by: Liu Yi L Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 398 +++++++++++++++++--------------- 1 file changed, 215 insertions(+), 183 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 113e9538b9a3..f34c78ecdb41 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2511,6 +2511,23 @@ static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu) return ret; } +static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu, + unsigned long arg) +{ + switch (arg) { + case VFIO_TYPE1_IOMMU: + case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_NESTING_IOMMU: + return 1; + case VFIO_DMA_CC_IOMMU: + if (!iommu) + return 0; + return vfio_domains_have_iommu_cache(iommu); + default: + return 0; + } +} + static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps, struct vfio_iommu_type1_info_cap_iova_range *cap_iovas, size_t size) @@ -2605,244 +2622,259 @@ static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu, return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig)); } -static long vfio_iommu_type1_ioctl(void *iommu_data, - unsigned int cmd, unsigned long arg) +static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, + unsigned long arg) { - struct vfio_iommu *iommu = iommu_data; + struct vfio_iommu_type1_info info; unsigned long minsz; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + unsigned long capsz; + int ret; - if (cmd == VFIO_CHECK_EXTENSION) { - switch (arg) { - case VFIO_TYPE1_IOMMU: - case VFIO_TYPE1v2_IOMMU: - case VFIO_TYPE1_NESTING_IOMMU: - return 1; - case VFIO_DMA_CC_IOMMU: - if (!iommu) - return 0; - return vfio_domains_have_iommu_cache(iommu); - default: - return 0; - } - } else if (cmd == VFIO_IOMMU_GET_INFO) { - struct vfio_iommu_type1_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - unsigned long capsz; - int ret; - - minsz = offsetofend(struct 
vfio_iommu_type1_info, iova_pgsizes); + minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); - /* For backward compatibility, cannot require this */ - capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); + /* For backward compatibility, cannot require this */ + capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; + if (info.argsz < minsz) + return -EINVAL; - if (info.argsz >= capsz) { - minsz = capsz; - info.cap_offset = 0; /* output, no-recopy necessary */ - } + if (info.argsz >= capsz) { + minsz = capsz; + info.cap_offset = 0; /* output, no-recopy necessary */ + } - mutex_lock(&iommu->lock); - info.flags = VFIO_IOMMU_INFO_PGSIZES; + mutex_lock(&iommu->lock); + info.flags = VFIO_IOMMU_INFO_PGSIZES; - info.iova_pgsizes = iommu->pgsize_bitmap; + info.iova_pgsizes = iommu->pgsize_bitmap; - ret = vfio_iommu_migration_build_caps(iommu, &caps); + ret = vfio_iommu_migration_build_caps(iommu, &caps); - if (!ret) - ret = vfio_iommu_iova_build_caps(iommu, &caps); + if (!ret) + ret = vfio_iommu_iova_build_caps(iommu, &caps); - mutex_unlock(&iommu->lock); + mutex_unlock(&iommu->lock); - if (!ret) - ret = vfio_iommu_dma_avail_build_caps(iommu, &caps); + if (!ret) + ret = vfio_iommu_dma_avail_build_caps(iommu, &caps); - if (ret) - return ret; + if (ret) + return ret; - if (caps.size) { - info.flags |= VFIO_IOMMU_INFO_CAPS; + if (caps.size) { + info.flags |= VFIO_IOMMU_INFO_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(info); + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + + sizeof(info), caps.buf, + caps.size)) { + kfree(caps.buf); + return -EFAULT; } - - kfree(caps.buf); + info.cap_offset = sizeof(info); } - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; + kfree(caps.buf); + } - } else if (cmd == VFIO_IOMMU_MAP_DMA) { - struct vfio_iommu_type1_dma_map map; - uint32_t mask = VFIO_DMA_MAP_FLAG_READ | - VFIO_DMA_MAP_FLAG_WRITE; + return copy_to_user((void __user *)arg, &info, minsz) ? 
+ -EFAULT : 0; +} - minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); +static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu, + unsigned long arg) +{ + struct vfio_iommu_type1_dma_map map; + unsigned long minsz; + uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; - if (copy_from_user(&map, (void __user *)arg, minsz)) - return -EFAULT; + minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); - if (map.argsz < minsz || map.flags & ~mask) - return -EINVAL; + if (copy_from_user(&map, (void __user *)arg, minsz)) + return -EFAULT; - return vfio_dma_do_map(iommu, &map); + if (map.argsz < minsz || map.flags & ~mask) + return -EINVAL; - } else if (cmd == VFIO_IOMMU_UNMAP_DMA) { - struct vfio_iommu_type1_dma_unmap unmap; - struct vfio_bitmap bitmap = { 0 }; - int ret; + return vfio_dma_do_map(iommu, &map); +} - minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); +static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu, + unsigned long arg) +{ + struct vfio_iommu_type1_dma_unmap unmap; + struct vfio_bitmap bitmap = { 0 }; + unsigned long minsz; + int ret; - if (copy_from_user(&unmap, (void __user *)arg, minsz)) - return -EFAULT; + minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); - if (unmap.argsz < minsz || - unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) - return -EINVAL; + if (copy_from_user(&unmap, (void __user *)arg, minsz)) + return -EFAULT; - if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { - unsigned long pgshift; + if (unmap.argsz < minsz || + unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) + return -EINVAL; - if (unmap.argsz < (minsz + sizeof(bitmap))) - return -EINVAL; + if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { + unsigned long pgshift; - if (copy_from_user(&bitmap, - (void __user *)(arg + minsz), - sizeof(bitmap))) - return -EFAULT; + if (unmap.argsz < (minsz + sizeof(bitmap))) + return -EINVAL; - if (!access_ok((void __user *)bitmap.data, bitmap.size)) - return -EINVAL; + if (copy_from_user(&bitmap, + (void __user *)(arg + minsz), + sizeof(bitmap))) + return -EFAULT; - pgshift = __ffs(bitmap.pgsize); - ret = verify_bitmap_size(unmap.size >> pgshift, - bitmap.size); - if (ret) - return ret; - } + if (!access_ok((void __user *)bitmap.data, bitmap.size)) + return -EINVAL; - ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap); + pgshift = __ffs(bitmap.pgsize); + ret = verify_bitmap_size(unmap.size >> pgshift, + bitmap.size); if (ret) return ret; + } + + ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap); + if (ret) + return ret; - return copy_to_user((void __user *)arg, &unmap, minsz) ? + return copy_to_user((void __user *)arg, &unmap, minsz) ? 
-EFAULT : 0; - } else if (cmd == VFIO_IOMMU_DIRTY_PAGES) { - struct vfio_iommu_type1_dirty_bitmap dirty; - uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START | - VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP | - VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; - int ret = 0; +} - if (!iommu->v2) - return -EACCES; +static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu, + unsigned long arg) +{ + struct vfio_iommu_type1_dirty_bitmap dirty; + uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START | + VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP | + VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; + unsigned long minsz; + int ret = 0; - minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, - flags); + if (!iommu->v2) + return -EACCES; - if (copy_from_user(&dirty, (void __user *)arg, minsz)) - return -EFAULT; + minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags); - if (dirty.argsz < minsz || dirty.flags & ~mask) - return -EINVAL; + if (copy_from_user(&dirty, (void __user *)arg, minsz)) + return -EFAULT; - /* only one flag should be set at a time */ - if (__ffs(dirty.flags) != __fls(dirty.flags)) - return -EINVAL; + if (dirty.argsz < minsz || dirty.flags & ~mask) + return -EINVAL; - if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) { - size_t pgsize; + /* only one flag should be set at a time */ + if (__ffs(dirty.flags) != __fls(dirty.flags)) + return -EINVAL; - mutex_lock(&iommu->lock); - pgsize = 1 << __ffs(iommu->pgsize_bitmap); - if (!iommu->dirty_page_tracking) { - ret = vfio_dma_bitmap_alloc_all(iommu, pgsize); - if (!ret) - iommu->dirty_page_tracking = true; - } - mutex_unlock(&iommu->lock); - return ret; - } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) { - mutex_lock(&iommu->lock); - if (iommu->dirty_page_tracking) { - iommu->dirty_page_tracking = false; - vfio_dma_bitmap_free_all(iommu); - } - mutex_unlock(&iommu->lock); - return 0; - } else if (dirty.flags & - VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) { - struct vfio_iommu_type1_dirty_bitmap_get range; - unsigned long pgshift; - size_t data_size = dirty.argsz - minsz; - size_t iommu_pgsize; - - if (!data_size || data_size < sizeof(range)) - return -EINVAL; - - if (copy_from_user(&range, (void __user *)(arg + minsz), - sizeof(range))) - return -EFAULT; + if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) { + size_t pgsize; - if (range.iova + range.size < range.iova) - return -EINVAL; - if (!access_ok((void __user *)range.bitmap.data, - range.bitmap.size)) - return -EINVAL; + mutex_lock(&iommu->lock); + pgsize = 1 << __ffs(iommu->pgsize_bitmap); + if (!iommu->dirty_page_tracking) { + ret = vfio_dma_bitmap_alloc_all(iommu, pgsize); + if (!ret) + iommu->dirty_page_tracking = true; + } + mutex_unlock(&iommu->lock); + return ret; + } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) { + mutex_lock(&iommu->lock); + if (iommu->dirty_page_tracking) { + iommu->dirty_page_tracking = false; + vfio_dma_bitmap_free_all(iommu); + } + mutex_unlock(&iommu->lock); + return 0; + } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) { + struct vfio_iommu_type1_dirty_bitmap_get range; + unsigned long pgshift; + size_t data_size = dirty.argsz - minsz; + size_t iommu_pgsize; - pgshift = __ffs(range.bitmap.pgsize); - ret = verify_bitmap_size(range.size >> pgshift, - range.bitmap.size); - if (ret) - return ret; + if (!data_size || data_size < sizeof(range)) + return -EINVAL; - mutex_lock(&iommu->lock); + if (copy_from_user(&range, (void __user *)(arg + minsz), + sizeof(range))) + return -EFAULT; - iommu_pgsize = (size_t)1 << 
__ffs(iommu->pgsize_bitmap); + if (range.iova + range.size < range.iova) + return -EINVAL; + if (!access_ok((void __user *)range.bitmap.data, + range.bitmap.size)) + return -EINVAL; - /* allow only smallest supported pgsize */ - if (range.bitmap.pgsize != iommu_pgsize) { - ret = -EINVAL; - goto out_unlock; - } - if (range.iova & (iommu_pgsize - 1)) { - ret = -EINVAL; - goto out_unlock; - } - if (!range.size || range.size & (iommu_pgsize - 1)) { - ret = -EINVAL; - goto out_unlock; - } + pgshift = __ffs(range.bitmap.pgsize); + ret = verify_bitmap_size(range.size >> pgshift, + range.bitmap.size); + if (ret) + return ret; - if (iommu->dirty_page_tracking) - ret = vfio_iova_dirty_bitmap(range.bitmap.data, - iommu, range.iova, range.size, - range.bitmap.pgsize); - else - ret = -EINVAL; -out_unlock: - mutex_unlock(&iommu->lock); + mutex_lock(&iommu->lock); - return ret; + iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap); + + /* allow only smallest supported pgsize */ + if (range.bitmap.pgsize != iommu_pgsize) { + ret = -EINVAL; + goto out_unlock; + } + if (range.iova & (iommu_pgsize - 1)) { + ret = -EINVAL; + goto out_unlock; } + if (!range.size || range.size & (iommu_pgsize - 1)) { + ret = -EINVAL; + goto out_unlock; + } + + if (iommu->dirty_page_tracking) + ret = vfio_iova_dirty_bitmap(range.bitmap.data, + iommu, range.iova, + range.size, + range.bitmap.pgsize); + else + ret = -EINVAL; +out_unlock: + mutex_unlock(&iommu->lock); + + return ret; } - return -ENOTTY; + return -EINVAL; +} + +static long vfio_iommu_type1_ioctl(void *iommu_data, + unsigned int cmd, unsigned long arg) +{ + struct vfio_iommu *iommu = iommu_data; + + switch (cmd) { + case VFIO_CHECK_EXTENSION: + return vfio_iommu_type1_check_extension(iommu, arg); + case VFIO_IOMMU_GET_INFO: + return vfio_iommu_type1_get_info(iommu, arg); + case VFIO_IOMMU_MAP_DMA: + return vfio_iommu_type1_map_dma(iommu, arg); + case VFIO_IOMMU_UNMAP_DMA: + return vfio_iommu_type1_unmap_dma(iommu, arg); + case VFIO_IOMMU_DIRTY_PAGES: + return vfio_iommu_type1_dirty_pages(iommu, arg); + default: + return -ENOTTY; + } } static int vfio_iommu_type1_register_notifier(void *iommu_data, -- Gitee From 31b5416a85290ee24eb3d3ce2f222fda322775a5 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 17 Aug 2020 11:08:18 -0600 Subject: [PATCH 1560/2161] vfio-pci: Avoid recursive read-lock usage commit bc93b9ae0151ae5ad5b8504cdc598428ea99570b upstream. A down_read on memory_lock is held when performing read/write accesses to MMIO BAR space, including across the copy_to/from_user() callouts which may fault. If the user buffer for these copies resides in an mmap of device MMIO space, the mmap fault handler will acquire a recursive read-lock on memory_lock. Avoid this by reducing the lock granularity. Sequential accesses requiring multiple ioread/iowrite cycles are expected to be rare, therefore typical accesses should not see additional overhead. VGA MMIO accesses are expected to be non-fatal regardless of the PCI memory enable bit to allow legacy probing, this behavior remains with a comment added. ioeventfds are now included in memory access testing, with writes dropped while memory space is disabled. 
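In other words: if the user hands vfio_pci_bar_rw() a buffer that is itself an mmap of the same device's MMIO space, the copy_to_user()/copy_from_user() call can fault while memory_lock is still read-held, and the mmap fault handler then tries to take the same rwsem again; with a writer queued in between, that recursive down_read() can deadlock. Taking the lock only around each individual ioread/iowrite (the VFIO_IOWRITE/VFIO_IOREAD helpers below) keeps it from being held across the faulting copies.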
Fixes: abafbc551fdd ("vfio-pci: Invalidate mmaps and block MMIO access on disabled memory") Reported-by: Zhiyi Guo Tested-by: Zhiyi Guo Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_private.h | 2 + drivers/vfio/pci/vfio_pci_rdwr.c | 120 ++++++++++++++++++++++------ 2 files changed, 98 insertions(+), 24 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 6a70c0538b53..a10eb64d30ec 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -33,12 +33,14 @@ struct vfio_pci_ioeventfd { struct list_head next; + struct vfio_pci_device *vdev; struct virqfd *virqfd; void __iomem *addr; uint64_t data; loff_t pos; int bar; int count; + bool test_mem; }; struct vfio_pci_irq_ctx { diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 916b184df3a5..9e353c484ace 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -37,17 +37,70 @@ #define vfio_ioread8 ioread8 #define vfio_iowrite8 iowrite8 +#define VFIO_IOWRITE(size) \ +static int vfio_pci_iowrite##size(struct vfio_pci_device *vdev, \ + bool test_mem, u##size val, void __iomem *io) \ +{ \ + if (test_mem) { \ + down_read(&vdev->memory_lock); \ + if (!__vfio_pci_memory_enabled(vdev)) { \ + up_read(&vdev->memory_lock); \ + return -EIO; \ + } \ + } \ + \ + vfio_iowrite##size(val, io); \ + \ + if (test_mem) \ + up_read(&vdev->memory_lock); \ + \ + return 0; \ +} + +VFIO_IOWRITE(8) +VFIO_IOWRITE(16) +VFIO_IOWRITE(32) +#ifdef iowrite64 +VFIO_IOWRITE(64) +#endif + +#define VFIO_IOREAD(size) \ +static int vfio_pci_ioread##size(struct vfio_pci_device *vdev, \ + bool test_mem, u##size *val, void __iomem *io) \ +{ \ + if (test_mem) { \ + down_read(&vdev->memory_lock); \ + if (!__vfio_pci_memory_enabled(vdev)) { \ + up_read(&vdev->memory_lock); \ + return -EIO; \ + } \ + } \ + \ + *val = vfio_ioread##size(io); \ + \ + if (test_mem) \ + up_read(&vdev->memory_lock); \ + \ + return 0; \ +} + +VFIO_IOREAD(8) +VFIO_IOREAD(16) +VFIO_IOREAD(32) + /* * Read or write from an __iomem region (MMIO or I/O port) with an excluded * range which is inaccessible. The excluded range drops writes and fills * reads with -1. This is intended for handling MSI-X vector tables and * leftover space for ROM BARs. 
*/ -static ssize_t do_io_rw(void __iomem *io, char __user *buf, +static ssize_t do_io_rw(struct vfio_pci_device *vdev, bool test_mem, + void __iomem *io, char __user *buf, loff_t off, size_t count, size_t x_start, size_t x_end, bool iswrite) { ssize_t done = 0; + int ret; while (count) { size_t fillable, filled; @@ -66,9 +119,15 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf, if (copy_from_user(&val, buf, 4)) return -EFAULT; - vfio_iowrite32(val, io + off); + ret = vfio_pci_iowrite32(vdev, test_mem, + val, io + off); + if (ret) + return ret; } else { - val = vfio_ioread32(io + off); + ret = vfio_pci_ioread32(vdev, test_mem, + &val, io + off); + if (ret) + return ret; if (copy_to_user(buf, &val, 4)) return -EFAULT; @@ -82,9 +141,15 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf, if (copy_from_user(&val, buf, 2)) return -EFAULT; - vfio_iowrite16(val, io + off); + ret = vfio_pci_iowrite16(vdev, test_mem, + val, io + off); + if (ret) + return ret; } else { - val = vfio_ioread16(io + off); + ret = vfio_pci_ioread16(vdev, test_mem, + &val, io + off); + if (ret) + return ret; if (copy_to_user(buf, &val, 2)) return -EFAULT; @@ -98,9 +163,15 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf, if (copy_from_user(&val, buf, 1)) return -EFAULT; - vfio_iowrite8(val, io + off); + ret = vfio_pci_iowrite8(vdev, test_mem, + val, io + off); + if (ret) + return ret; } else { - val = vfio_ioread8(io + off); + ret = vfio_pci_ioread8(vdev, test_mem, + &val, io + off); + if (ret) + return ret; if (copy_to_user(buf, &val, 1)) return -EFAULT; @@ -178,14 +249,6 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, count = min(count, (size_t)(end - pos)); - if (res->flags & IORESOURCE_MEM) { - down_read(&vdev->memory_lock); - if (!__vfio_pci_memory_enabled(vdev)) { - up_read(&vdev->memory_lock); - return -EIO; - } - } - if (bar == PCI_ROM_RESOURCE) { /* * The ROM can fill less space than the BAR, so we start the @@ -213,7 +276,8 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, x_end = vdev->msix_offset + vdev->msix_size; } - done = do_io_rw(io, buf, pos, count, x_start, x_end, iswrite); + done = do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos, + count, x_start, x_end, iswrite); if (done >= 0) *ppos += done; @@ -221,9 +285,6 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, if (bar == PCI_ROM_RESOURCE) pci_unmap_rom(pdev, io); out: - if (res->flags & IORESOURCE_MEM) - up_read(&vdev->memory_lock); - return done; } @@ -278,7 +339,12 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, return ret; } - done = do_io_rw(iomem, buf, off, count, 0, 0, iswrite); + /* + * VGA MMIO is a legacy, non-BAR resource that hopefully allows + * probing, so we don't currently worry about access in relation + * to the memory enable bit in the command register. 
+ */ + done = do_io_rw(vdev, false, iomem, buf, off, count, 0, 0, iswrite); vga_put(vdev->pdev, rsrc); @@ -296,17 +362,21 @@ static int vfio_pci_ioeventfd_handler(void *opaque, void *unused) switch (ioeventfd->count) { case 1: - vfio_iowrite8(ioeventfd->data, ioeventfd->addr); + vfio_pci_iowrite8(ioeventfd->vdev, ioeventfd->test_mem, + ioeventfd->data, ioeventfd->addr); break; case 2: - vfio_iowrite16(ioeventfd->data, ioeventfd->addr); + vfio_pci_iowrite16(ioeventfd->vdev, ioeventfd->test_mem, + ioeventfd->data, ioeventfd->addr); break; case 4: - vfio_iowrite32(ioeventfd->data, ioeventfd->addr); + vfio_pci_iowrite32(ioeventfd->vdev, ioeventfd->test_mem, + ioeventfd->data, ioeventfd->addr); break; #ifdef iowrite64 case 8: - vfio_iowrite64(ioeventfd->data, ioeventfd->addr); + vfio_pci_iowrite64(ioeventfd->vdev, ioeventfd->test_mem, + ioeventfd->data, ioeventfd->addr); break; #endif } @@ -378,11 +448,13 @@ long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, goto out_unlock; } + ioeventfd->vdev = vdev; ioeventfd->addr = vdev->barmap[bar] + pos; ioeventfd->data = data; ioeventfd->pos = pos; ioeventfd->bar = bar; ioeventfd->count = count; + ioeventfd->test_mem = vdev->pdev->resource[bar].flags & IORESOURCE_MEM; ret = vfio_virqfd_enable(ioeventfd, vfio_pci_ioeventfd_handler, NULL, NULL, &ioeventfd->virqfd, fd); -- Gitee From 2145bdbd7e6107cb574415b86b876597d4d01290 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 9 Jun 2022 17:39:40 +0800 Subject: [PATCH 1561/2161] treewide: Use fallthrough pseudo-keyword(drivers/vfio) commit df561f6688fef775baa341a0f5d960becd248b11 upstream. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 2 +- drivers/vfio/vfio_iommu_type1.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index f862ba663b9a..eab925b1e30e 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -988,7 +988,7 @@ static long vfio_pci_ioctl(void *device_data, case VFIO_PCI_ERR_IRQ_INDEX: if (pci_is_pcie(vdev->pdev)) break; - /* fall through */ + fallthrough; default: return -EINVAL; } diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index f34c78ecdb41..24ac8b40c43d 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2435,7 +2435,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) break; case VFIO_TYPE1_NESTING_IOMMU: iommu->nesting = true; - /* fall through */ + fallthrough; case VFIO_TYPE1v2_IOMMU: iommu->v2 = true; break; -- Gitee From 34c31181309dcd285bb502435a790625570a5a5e Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Mon, 21 Sep 2020 12:51:15 +0800 Subject: [PATCH 1562/2161] vfio/pci: Remove redundant declaration of vfio_pci_driver commit eac7cc21c4e49ab7dfeef22e29abdee8432face9 upstream. It was added by commit 137e5531351d ("vfio/pci: Add sriov_configure support") but duplicates a forward declaration earlier in the file. Remove it. 
Signed-off-by: Zenghui Yu Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index eab925b1e30e..e5e9149fc151 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -1862,7 +1862,6 @@ static const struct vfio_device_ops vfio_pci_ops = { static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev); static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck); -static struct pci_driver vfio_pci_driver; static int vfio_pci_bus_notifier(struct notifier_block *nb, unsigned long action, void *data) -- Gitee From 98363edd652da70ab36a0e674282ccccd77e0541 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Mon, 21 Sep 2020 12:51:16 +0800 Subject: [PATCH 1563/2161] vfio/pci: Don't regenerate vconfig for all BARs if !bardirty commit 1c0f68252a6a8a793ff3352e3580664d18ec5955 upstream. Now we regenerate vconfig for all the BARs via vfio_bar_fixup(), every time any offset of any of them are read. Though BARs aren't re-read regularly, the regeneration can be avoided if no BARs had been written since they were last read, in which case vdev->bardirty is false. Let's return immediately in vfio_bar_fixup() if bardirty is false. Suggested-by: Alex Williamson Signed-off-by: Zenghui Yu Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_config.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 5076d0155bc3..a402adee8a21 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -467,6 +467,9 @@ static void vfio_bar_fixup(struct vfio_pci_device *vdev) __le32 *vbar; u64 mask; + if (!vdev->bardirty) + return; + vbar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0]; for (i = 0; i < PCI_STD_NUM_BARS; i++, vbar++) { -- Gitee From 3761c48a921b350c6bb1739dce4ae2d670089332 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Wed, 16 Sep 2020 10:28:33 +0800 Subject: [PATCH 1564/2161] vfio: add a singleton check for vfio_group_pin_pages commit 7ef32e52368f62a4e041a4f0abefb4fb64e7fd4a upstream. Page pinning is used both to translate and pin device mappings for DMA purpose, as well as to indicate to the IOMMU backend to limit the dirty page scope to those pages that have been pinned, in the case of an IOMMU backed device. To support this, the vfio_pin_pages() interface limits itself to only singleton groups such that the IOMMU backend can consider dirty page scope only at the group level. Implement the same requirement for the vfio_group_pin_pages() interface. 
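The rationale mirrors the existing check in vfio_pin_pages(): dirty-page scope is tracked per group, so if more than one device shares the group (group->dev_counter > 1), pinning through any one of them would silently narrow the reported dirty scope for the others; rejecting the call with -EINVAL in that case is the conservative choice.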
Fixes: 95fc87b44104 ("vfio: Selective dirty page tracking if IOMMU backed device pins pages") Signed-off-by: Yan Zhao Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 262ab0efd06c..532bcaf28c11 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -2051,6 +2051,9 @@ int vfio_group_pin_pages(struct vfio_group *group, if (!group || !user_iova_pfn || !phys_pfn || !npage) return -EINVAL; + if (group->dev_counter > 1) + return -EINVAL; + if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) return -E2BIG; -- Gitee From a42aafe475f42167220b8cc66fc3278bbe029202 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Wed, 16 Sep 2020 10:29:27 +0800 Subject: [PATCH 1565/2161] vfio: fix a missed vfio group put in vfio_pin_pages commit 28b130244061863cf0437b7af1625fb45ec1a71e upstream. When error occurs, need to put vfio group after a successful get. Fixes: 95fc87b44104 ("vfio: Selective dirty page tracking if IOMMU backed device pins pages") Signed-off-by: Yan Zhao Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 532bcaf28c11..2151bc7f87ab 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1949,8 +1949,10 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, if (!group) return -ENODEV; - if (group->dev_counter > 1) - return -EINVAL; + if (group->dev_counter > 1) { + ret = -EINVAL; + goto err_pin_pages; + } ret = vfio_group_add_container_user(group); if (ret) -- Gitee From 55f89ba35e0e6a39d29acd14295e8c548b7e218d Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Wed, 16 Sep 2020 10:30:05 +0800 Subject: [PATCH 1566/2161] vfio/type1: fix dirty bitmap calculation in vfio_dma_rw commit 2c5af98592f65517170c7bcc714566590d3f7397 upstream. The count of dirtied pages is not only determined by count of copied pages, but also by the start offset. e.g. if offset = PAGE_SIZE - 1, and *copied=2, the dirty pages count is 2, instead of 1 or 0. Fixes: d6a4c185660c ("vfio iommu: Implementation of ioctl for dirty pages tracking") Signed-off-by: Yan Zhao Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 24ac8b40c43d..e05b9af5b013 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2949,7 +2949,8 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, * size */ bitmap_set(dma->bitmap, offset >> pgshift, - *copied >> pgshift); + ((offset + *copied - 1) >> pgshift) - + (offset >> pgshift) + 1); } } else *copied = copy_from_user(data, (void __user *)vaddr, -- Gitee From a1760bc00a945f2af4a8b5f3372dd53f7cfc040a Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 10 Sep 2020 20:25:08 +0800 Subject: [PATCH 1567/2161] vfio: Fix typo of the device_state commit c12fa88c6d16ed3865072d91154cff6fd1cd9cd4 upstream. A typo fix ("_RUNNNG" => "_RUNNING") in comment block of the uapi header. 
Signed-off-by: Zenghui Yu Reviewed-by: Cornelia Huck Reviewed-by: Kirti Wankhede Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/vfio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 1831fb7beb12..00b703f3d1df 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -460,7 +460,7 @@ struct vfio_region_gfx_edid { * 5. Resumed * |--------->| * - * 0. Default state of VFIO device is _RUNNNG when the user application starts. + * 0. Default state of VFIO device is _RUNNING when the user application starts. * 1. During normal shutdown of the user application, the user application may * optionally change the VFIO device state from _RUNNING to _STOP. This * transition is optional. The vendor driver must support this transition but -- Gitee From 4fc82bab240b89edd877c03645d69853f10f0cf8 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Wed, 7 Oct 2020 14:56:22 -0400 Subject: [PATCH 1568/2161] vfio: Introduce capability definitions for VFIO_DEVICE_GET_INFO commit 0c633f0be1dc70a6db46d90dba4cdae82073350a upstream. Allow the VFIO_DEVICE_GET_INFO ioctl to include a capability chain. Add a flag indicating capability chain support, and introduce the definitions for the first set of capabilities which are specified to s390 zPCI devices. Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/vfio.h | 11 +++++ include/uapi/linux/vfio_zdev.h | 78 ++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 include/uapi/linux/vfio_zdev.h diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 00b703f3d1df..d0e23b53ac89 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -201,8 +201,10 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */ #define VFIO_DEVICE_FLAGS_CCW (1 << 4) /* vfio-ccw device */ #define VFIO_DEVICE_FLAGS_AP (1 << 5) /* vfio-ap device */ +#define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ + __u32 cap_offset; /* Offset within info struct of first cap */ }; #define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7) @@ -218,6 +220,15 @@ struct vfio_device_info { #define VFIO_DEVICE_API_CCW_STRING "vfio-ccw" #define VFIO_DEVICE_API_AP_STRING "vfio-ap" +/* + * The following capabilities are unique to s390 zPCI devices. Their contents + * are further-defined in vfio_zdev.h + */ +#define VFIO_DEVICE_INFO_CAP_ZPCI_BASE 1 +#define VFIO_DEVICE_INFO_CAP_ZPCI_GROUP 2 +#define VFIO_DEVICE_INFO_CAP_ZPCI_UTIL 3 +#define VFIO_DEVICE_INFO_CAP_ZPCI_PFIP 4 + /** * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8, * struct vfio_region_info) diff --git a/include/uapi/linux/vfio_zdev.h b/include/uapi/linux/vfio_zdev.h new file mode 100644 index 000000000000..b4309397b6b2 --- /dev/null +++ b/include/uapi/linux/vfio_zdev.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * VFIO Region definitions for ZPCI devices + * + * Copyright IBM Corp. 
2020 + * + * Author(s): Pierre Morel + * Matthew Rosato + */ + +#ifndef _VFIO_ZDEV_H_ +#define _VFIO_ZDEV_H_ + +#include +#include + +/** + * VFIO_DEVICE_INFO_CAP_ZPCI_BASE - Base PCI Function information + * + * This capability provides a set of descriptive information about the + * associated PCI function. + */ +struct vfio_device_info_cap_zpci_base { + struct vfio_info_cap_header header; + __u64 start_dma; /* Start of available DMA addresses */ + __u64 end_dma; /* End of available DMA addresses */ + __u16 pchid; /* Physical Channel ID */ + __u16 vfn; /* Virtual function number */ + __u16 fmb_length; /* Measurement Block Length (in bytes) */ + __u8 pft; /* PCI Function Type */ + __u8 gid; /* PCI function group ID */ +}; + +/** + * VFIO_DEVICE_INFO_CAP_ZPCI_GROUP - Base PCI Function Group information + * + * This capability provides a set of descriptive information about the group of + * PCI functions that the associated device belongs to. + */ +struct vfio_device_info_cap_zpci_group { + struct vfio_info_cap_header header; + __u64 dasm; /* DMA Address space mask */ + __u64 msi_addr; /* MSI address */ + __u64 flags; +#define VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH 1 /* Program-specified TLB refresh */ + __u16 mui; /* Measurement Block Update Interval */ + __u16 noi; /* Maximum number of MSIs */ + __u16 maxstbl; /* Maximum Store Block Length */ + __u8 version; /* Supported PCI Version */ +}; + +/** + * VFIO_DEVICE_INFO_CAP_ZPCI_UTIL - Utility String + * + * This capability provides the utility string for the associated device, which + * is a device identifier string made up of EBCDID characters. 'size' specifies + * the length of 'util_str'. + */ +struct vfio_device_info_cap_zpci_util { + struct vfio_info_cap_header header; + __u32 size; + __u8 util_str[]; +}; + +/** + * VFIO_DEVICE_INFO_CAP_ZPCI_PFIP - PCI Function Path + * + * This capability provides the PCI function path string, which is an identifier + * that describes the internal hardware path of the device. 'size' specifies + * the length of 'pfip'. + */ +struct vfio_device_info_cap_zpci_pfip { + struct vfio_info_cap_header header; + __u32 size; + __u8 pfip[]; +}; + +#endif -- Gitee From ab1679bedba0f29402ce75509c82f8206626cc62 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Wed, 7 Oct 2020 14:56:23 -0400 Subject: [PATCH 1569/2161] vfio-pci/zdev: Add zPCI capabilities to VFIO_DEVICE_GET_INFO commit e6b817d4b8217a9528fcfd59719b924ab8a5ff23 upstream. Define a new configuration entry VFIO_PCI_ZDEV for VFIO/PCI. When this s390-only feature is configured we add capabilities to the VFIO_DEVICE_GET_INFO ioctl that describe features of the associated zPCI device and its underlying hardware. This patch is based on work previously done by Pierre Morel. 
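A hypothetical userspace sketch (not part of this patch) of how a consumer such as QEMU could locate the new zPCI base capability; it assumes 'device_fd' is an open VFIO device file descriptor and that 'info' was already allocated with argsz large enough for the capability chain (i.e. after a first VFIO_DEVICE_GET_INFO call that reported the required size):

    #include <stddef.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>
    #include <linux/vfio_zdev.h>

    static struct vfio_device_info_cap_zpci_base *
    find_zpci_base_cap(int device_fd, struct vfio_device_info *info)
    {
    	struct vfio_info_cap_header *hdr;
    	__u32 off;

    	if (ioctl(device_fd, VFIO_DEVICE_GET_INFO, info))
    		return NULL;
    	if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS) || !info->cap_offset)
    		return NULL;

    	/* Capability offsets are relative to the start of 'info'. */
    	for (off = info->cap_offset; off; off = hdr->next) {
    		hdr = (struct vfio_info_cap_header *)((char *)info + off);
    		if (hdr->id == VFIO_DEVICE_INFO_CAP_ZPCI_BASE)
    			return (struct vfio_device_info_cap_zpci_base *)hdr;
    	}
    	return NULL;
    }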
Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/Kconfig | 12 +++ drivers/vfio/pci/Makefile | 1 + drivers/vfio/pci/vfio_pci.c | 37 +++++++ drivers/vfio/pci/vfio_pci_private.h | 12 +++ drivers/vfio/pci/vfio_pci_zdev.c | 143 ++++++++++++++++++++++++++++ 5 files changed, 205 insertions(+) create mode 100644 drivers/vfio/pci/vfio_pci_zdev.c diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 4abddbebd4b2..e06ff98a08a6 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -45,3 +45,15 @@ config VFIO_PCI_NVLINK2 depends on VFIO_PCI && PPC_POWERNV && SPAPR_TCE_IOMMU help VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs + +config VFIO_PCI_ZDEV + bool "VFIO PCI ZPCI device CLP support" + depends on VFIO_PCI && S390 + default y + help + Enabling this option exposes VFIO capabilities containing hardware + configuration for zPCI devices. This enables userspace (e.g. QEMU) + to supply proper configuration values instead of hard-coded defaults + for zPCI devices passed through via VFIO on s390. + + Say Y here. \ No newline at end of file diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index f027f8a0e89c..781e0809d6ee 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -3,5 +3,6 @@ vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o +vfio-pci-$(CONFIG_VFIO_PCI_ZDEV) += vfio_pci_zdev.o obj-$(CONFIG_VFIO_PCI) += vfio-pci.o diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index e5e9149fc151..b88755270db1 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -805,15 +805,25 @@ static long vfio_pci_ioctl(void *device_data, if (cmd == VFIO_DEVICE_GET_INFO) { struct vfio_device_info info; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + unsigned long capsz; minsz = offsetofend(struct vfio_device_info, num_irqs); + /* For backward compatibility, cannot require this */ + capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); + if (copy_from_user(&info, (void __user *)arg, minsz)) return -EFAULT; if (info.argsz < minsz) return -EINVAL; + if (info.argsz >= capsz) { + minsz = capsz; + info.cap_offset = 0; + } + info.flags = VFIO_DEVICE_FLAGS_PCI; if (vdev->reset_works) @@ -822,6 +832,33 @@ static long vfio_pci_ioctl(void *device_data, info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; info.num_irqs = VFIO_PCI_NUM_IRQS; + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV)) { + int ret = vfio_pci_info_zdev_add_caps(vdev, &caps); + + if (ret && ret != -ENODEV) { + pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n"); + return ret; + } + } + + if (caps.size) { + info.flags |= VFIO_DEVICE_FLAGS_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + + sizeof(info), caps.buf, + caps.size)) { + kfree(caps.buf); + return -EFAULT; + } + info.cap_offset = sizeof(info); + } + + kfree(caps.buf); + } + return copy_to_user((void __user *)arg, &info, minsz) ? 
-EFAULT : 0; diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index a10eb64d30ec..de2945440fab 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -213,4 +213,16 @@ static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev) return -ENODEV; } #endif + +#ifdef CONFIG_VFIO_PCI_ZDEV +extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev, + struct vfio_info_cap *caps); +#else +static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev, + struct vfio_info_cap *caps) +{ + return -ENODEV; +} +#endif + #endif /* VFIO_PCI_PRIVATE_H */ diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c new file mode 100644 index 000000000000..229685634031 --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_zdev.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * VFIO ZPCI devices support + * + * Copyright (C) IBM Corp. 2020. All rights reserved. + * Author(s): Pierre Morel + * Matthew Rosato + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include "vfio_pci_private.h" + +/* + * Add the Base PCI Function information to the device info region. + */ +static int zpci_base_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev, + struct vfio_info_cap *caps) +{ + struct vfio_device_info_cap_zpci_base cap = { + .header.id = VFIO_DEVICE_INFO_CAP_ZPCI_BASE, + .header.version = 1, + .start_dma = zdev->start_dma, + .end_dma = zdev->end_dma, + .pchid = zdev->pchid, + .vfn = zdev->vfn, + .fmb_length = zdev->fmb_length, + .pft = zdev->pft, + .gid = zdev->pfgid + }; + + return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); +} + +/* + * Add the Base PCI Function Group information to the device info region. + */ +static int zpci_group_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev, + struct vfio_info_cap *caps) +{ + struct vfio_device_info_cap_zpci_group cap = { + .header.id = VFIO_DEVICE_INFO_CAP_ZPCI_GROUP, + .header.version = 1, + .dasm = zdev->dma_mask, + .msi_addr = zdev->msi_addr, + .flags = VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH, + .mui = zdev->fmb_update, + .noi = zdev->max_msi, + .maxstbl = ZPCI_MAX_WRITE_SIZE, + .version = zdev->version + }; + + return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); +} + +/* + * Add the device utility string to the device info region. + */ +static int zpci_util_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev, + struct vfio_info_cap *caps) +{ + struct vfio_device_info_cap_zpci_util *cap; + int cap_size = sizeof(*cap) + CLP_UTIL_STR_LEN; + int ret; + + cap = kmalloc(cap_size, GFP_KERNEL); + + cap->header.id = VFIO_DEVICE_INFO_CAP_ZPCI_UTIL; + cap->header.version = 1; + cap->size = CLP_UTIL_STR_LEN; + memcpy(cap->util_str, zdev->util_str, cap->size); + + ret = vfio_info_add_capability(caps, &cap->header, cap_size); + + kfree(cap); + + return ret; +} + +/* + * Add the function path string to the device info region. 
+ */ +static int zpci_pfip_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev, + struct vfio_info_cap *caps) +{ + struct vfio_device_info_cap_zpci_pfip *cap; + int cap_size = sizeof(*cap) + CLP_PFIP_NR_SEGMENTS; + int ret; + + cap = kmalloc(cap_size, GFP_KERNEL); + + cap->header.id = VFIO_DEVICE_INFO_CAP_ZPCI_PFIP; + cap->header.version = 1; + cap->size = CLP_PFIP_NR_SEGMENTS; + memcpy(cap->pfip, zdev->pfip, cap->size); + + ret = vfio_info_add_capability(caps, &cap->header, cap_size); + + kfree(cap); + + return ret; +} + +/* + * Add all supported capabilities to the VFIO_DEVICE_GET_INFO capability chain. + */ +int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev, + struct vfio_info_cap *caps) +{ + struct zpci_dev *zdev = to_zpci(vdev->pdev); + int ret; + + if (!zdev) + return -ENODEV; + + ret = zpci_base_cap(zdev, vdev, caps); + if (ret) + return ret; + + ret = zpci_group_cap(zdev, vdev, caps); + if (ret) + return ret; + + if (zdev->util_str_avail) { + ret = zpci_util_cap(zdev, vdev, caps); + if (ret) + return ret; + } + + ret = zpci_pfip_cap(zdev, vdev, caps); + + return ret; +} -- Gitee From 128ac17aeb2f485c67ad97a17eaba354d0bf95de Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 22 Oct 2020 20:24:17 +0800 Subject: [PATCH 1570/2161] vfio/type1: Use the new helper to find vfio_group commit 572f64c71e0fe30089699b22ce0ca3d4bf452ce9 upstream. When attaching a new group to the container, let's use the new helper vfio_iommu_find_iommu_group() to check if it's already attached. There is no functional change. Also take this chance to add a missing blank line. Signed-off-by: Zenghui Yu Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index e05b9af5b013..fed45cb5eaa7 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2005,6 +2005,7 @@ static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu, list_splice_tail(iova_copy, iova); } + static int vfio_iommu_type1_attach_group(void *iommu_data, struct iommu_group *iommu_group) { @@ -2021,18 +2022,10 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, mutex_lock(&iommu->lock); - list_for_each_entry(d, &iommu->domain_list, next) { - if (find_iommu_group(d, iommu_group)) { - mutex_unlock(&iommu->lock); - return -EINVAL; - } - } - - if (iommu->external_domain) { - if (find_iommu_group(iommu->external_domain, iommu_group)) { - mutex_unlock(&iommu->lock); - return -EINVAL; - } + /* Check for duplicates */ + if (vfio_iommu_find_iommu_group(iommu, iommu_group)) { + mutex_unlock(&iommu->lock); + return -EINVAL; } group = kzalloc(sizeof(*group), GFP_KERNEL); -- Gitee From 0ede5cc977fe352092727e7171f4fe1ff31773f8 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 2 Nov 2020 15:02:00 -0700 Subject: [PATCH 1571/2161] vfio/pci: Implement ioeventfd thread handler for contended memory lock commit 38565c93c8a1306dc5f245572a545fbea908ac41 upstream. The ioeventfd is called under spinlock with interrupts disabled, therefore if the memory lock is contended defer code that might sleep to a thread context. 
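The virqfd layer already provides the needed split: the primary handler runs from the eventfd wakeup in atomic context and, when it returns a non-zero value, the work is re-dispatched to the registered thread callback in process context. The change below relies on that contract, using down_read_trylock() in the handler and falling back to the new vfio_pci_ioeventfd_thread() (which may sleep on memory_lock) only when the trylock fails.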
Fixes: bc93b9ae0151 ("vfio-pci: Avoid recursive read-lock usage") Link: https://bugzilla.kernel.org/show_bug.cgi?id=209253#c1 Reported-by: Ian Pilcher Tested-by: Ian Pilcher Tested-by: Justin Gatzen Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_rdwr.c | 43 ++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 9e353c484ace..a0b5fc8e46f4 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -356,34 +356,60 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, return done; } -static int vfio_pci_ioeventfd_handler(void *opaque, void *unused) +static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd, + bool test_mem) { - struct vfio_pci_ioeventfd *ioeventfd = opaque; - switch (ioeventfd->count) { case 1: - vfio_pci_iowrite8(ioeventfd->vdev, ioeventfd->test_mem, + vfio_pci_iowrite8(ioeventfd->vdev, test_mem, ioeventfd->data, ioeventfd->addr); break; case 2: - vfio_pci_iowrite16(ioeventfd->vdev, ioeventfd->test_mem, + vfio_pci_iowrite16(ioeventfd->vdev, test_mem, ioeventfd->data, ioeventfd->addr); break; case 4: - vfio_pci_iowrite32(ioeventfd->vdev, ioeventfd->test_mem, + vfio_pci_iowrite32(ioeventfd->vdev, test_mem, ioeventfd->data, ioeventfd->addr); break; #ifdef iowrite64 case 8: - vfio_pci_iowrite64(ioeventfd->vdev, ioeventfd->test_mem, + vfio_pci_iowrite64(ioeventfd->vdev, test_mem, ioeventfd->data, ioeventfd->addr); break; #endif } +} + +static int vfio_pci_ioeventfd_handler(void *opaque, void *unused) +{ + struct vfio_pci_ioeventfd *ioeventfd = opaque; + struct vfio_pci_device *vdev = ioeventfd->vdev; + + if (ioeventfd->test_mem) { + if (!down_read_trylock(&vdev->memory_lock)) + return 1; /* Lock contended, use thread */ + if (!__vfio_pci_memory_enabled(vdev)) { + up_read(&vdev->memory_lock); + return 0; + } + } + + vfio_pci_ioeventfd_do_write(ioeventfd, false); + + if (ioeventfd->test_mem) + up_read(&vdev->memory_lock); return 0; } +static void vfio_pci_ioeventfd_thread(void *opaque, void *unused) +{ + struct vfio_pci_ioeventfd *ioeventfd = opaque; + + vfio_pci_ioeventfd_do_write(ioeventfd, ioeventfd->test_mem); +} + long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, uint64_t data, int count, int fd) { @@ -457,7 +483,8 @@ long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, ioeventfd->test_mem = vdev->pdev->resource[bar].flags & IORESOURCE_MEM; ret = vfio_virqfd_enable(ioeventfd, vfio_pci_ioeventfd_handler, - NULL, NULL, &ioeventfd->virqfd, fd); + vfio_pci_ioeventfd_thread, NULL, + &ioeventfd->virqfd, fd); if (ret) { kfree(ioeventfd); goto out_unlock; -- Gitee From bf0aae58fa72c1d1956f0d611e9949a8dabe5583 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 27 Oct 2020 13:55:21 +0000 Subject: [PATCH 1572/2161] eventfd: Export eventfd_ctx_do_read() commit 28f1326710555bbe666f64452d08f2d7dd657cae upstream. Where events are consumed in the kernel, for example by KVM's irqfd_wakeup() and VFIO's virqfd_wakeup(), they currently lack a mechanism to drain the eventfd's counter. Since the wait queue is already locked while the wakeup functions are invoked, all they really need to do is call eventfd_ctx_do_read(). Add a check for the lock, and export it for them. 
Signed-off-by: David Woodhouse Message-Id: <20201027135523.646811-2-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- fs/eventfd.c | 5 ++++- include/linux/eventfd.h | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/eventfd.c b/fs/eventfd.c index 78e41c7c3d05..26b3d821e916 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -181,11 +181,14 @@ static __poll_t eventfd_poll(struct file *file, poll_table *wait) return events; } -static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) +void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) { + lockdep_assert_held(&ctx->wqh.lock); + *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; ctx->count -= *cnt; } +EXPORT_SYMBOL_GPL(eventfd_ctx_do_read); /** * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index dc4fd8a6644d..fa0a524baed0 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -41,6 +41,7 @@ struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); +void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); DECLARE_PER_CPU(int, eventfd_wake_count); @@ -82,6 +83,11 @@ static inline bool eventfd_signal_count(void) return false; } +static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) +{ + +} + #endif #endif /* _LINUX_EVENTFD_H */ -- Gitee From 909c047110394d82e8d9ac232de7135c6b6e0559 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 27 Oct 2020 13:55:22 +0000 Subject: [PATCH 1573/2161] vfio/virqfd: Drain events from eventfd in virqfd_wakeup() commit b1b397aeef8177f4f7bd91a0d5fa708f4752a499 upstream. Don't allow the events to accumulate in the eventfd counter, drain them as they are handled. Signed-off-by: David Woodhouse Message-Id: <20201027135523.646811-3-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini Acked-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/virqfd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c index 997cb5d0a657..414e98d82b02 100644 --- a/drivers/vfio/virqfd.c +++ b/drivers/vfio/virqfd.c @@ -46,6 +46,9 @@ static int virqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void __poll_t flags = key_to_poll(key); if (flags & EPOLLIN) { + u64 cnt; + eventfd_ctx_do_read(virqfd->eventfd, &cnt); + /* An event has been signaled, call function */ if ((!virqfd->handler || virqfd->handler(virqfd->opaque, virqfd->data)) && -- Gitee From d8e4c4cf2db5e595b27054bdb33fda7be1814c3a Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Thu, 3 Dec 2020 22:35:11 +0100 Subject: [PATCH 1574/2161] vfio-mdev: Wire in a request handler for mdev parent commit a15ac665b9e9c90b1557499f2a46c1e89d29154a upstream. While performing some destructive tests with vfio-ccw, where the paths to a device are forcible removed and thus the device itself is unreachable, it is rather easy to end up in an endless loop in vfio_del_group_dev() due to the lack of a request callback for the associated device. In this example, one MDEV (77c) is used by a guest, while another (77b) is not. 
The symptom is that the iommu is detached from the mdev for 77b, but not 77c, until that guest is shutdown: [ 238.794867] vfio_ccw 0.0.077b: MDEV: Unregistering [ 238.794996] vfio_mdev 11f2d2bc-4083-431d-a023-eff72715c4f0: Removing from iommu group 2 [ 238.795001] vfio_mdev 11f2d2bc-4083-431d-a023-eff72715c4f0: MDEV: detaching iommu [ 238.795036] vfio_ccw 0.0.077c: MDEV: Unregistering ...silence... Let's wire in the request call back to the mdev device, so that a device being physically removed from the host can be (gracefully?) handled by the parent device at the time the device is removed. Add a message when registering the device if a driver doesn't provide this callback, so a clue is given that this same loop may be encountered in a similar situation, and a message when this occurs instead of the awkward silence noted above. Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/mdev_core.c | 4 ++++ drivers/vfio/mdev/vfio_mdev.c | 13 +++++++++++++ include/linux/mdev.h | 4 ++++ 3 files changed, 21 insertions(+) diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index b558d4cfd082..6de97d25a3f8 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -154,6 +154,10 @@ int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops) if (!dev) return -EINVAL; + /* Not mandatory, but its absence could be a problem */ + if (!ops->request) + dev_info(dev, "Driver cannot be asked to release device\n"); + mutex_lock(&parent_list_lock); /* Check for duplicate */ diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index 30964a4e0a28..b52eea128549 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -98,6 +98,18 @@ static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma) return parent->ops->mmap(mdev, vma); } +static void vfio_mdev_request(void *device_data, unsigned int count) +{ + struct mdev_device *mdev = device_data; + struct mdev_parent *parent = mdev->parent; + + if (parent->ops->request) + parent->ops->request(mdev, count); + else if (count == 0) + dev_notice(mdev_dev(mdev), + "No mdev vendor driver request callback support, blocked until released by user\n"); +} + static const struct vfio_device_ops vfio_mdev_dev_ops = { .name = "vfio-mdev", .open = vfio_mdev_open, @@ -106,6 +118,7 @@ static const struct vfio_device_ops vfio_mdev_dev_ops = { .read = vfio_mdev_read, .write = vfio_mdev_write, .mmap = vfio_mdev_mmap, + .request = vfio_mdev_request, }; static int vfio_mdev_probe(struct device *dev) diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 0ce30ca78db0..9004375c462e 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -72,6 +72,9 @@ struct device *mdev_get_iommu_device(struct device *dev); * @mmap: mmap callback * @mdev: mediated device structure * @vma: vma structure + * @request: request callback to release device + * @mdev: mediated device structure + * @count: request sequence number * Parent device that support mediated device should be registered with mdev * module with mdev_parent_ops structure. 
**/ @@ -92,6 +95,7 @@ struct mdev_parent_ops { long (*ioctl)(struct mdev_device *mdev, unsigned int cmd, unsigned long arg); int (*mmap)(struct mdev_device *mdev, struct vm_area_struct *vma); + void (*request)(struct mdev_device *mdev, unsigned int count); }; /* interface for exporting mdev supported type attributes */ -- Gitee From c82043e721f22c84a1fe6936599dfb4e5228891e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 9 Dec 2020 22:36:39 +0200 Subject: [PATCH 1575/2161] vfio: platform: Switch to use platform_get_mem_or_io() commit feaba5932b6f4bfc875c874a3b7a28c7f05f5a77 upstream. Switch to use new platform_get_mem_or_io() instead of home grown analogue. Cc: Alex Williamson Cc: Cornelia Huck Cc: kvm@vger.kernel.org Acked-by: Eric Auger Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20201209203642.27648-2-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/platform/vfio_platform.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 1e2769010089..9fb6818cea12 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -25,19 +25,8 @@ static struct resource *get_platform_resource(struct vfio_platform_device *vdev, int num) { struct platform_device *dev = (struct platform_device *) vdev->opaque; - int i; - for (i = 0; i < dev->num_resources; i++) { - struct resource *r = &dev->resource[i]; - - if (resource_type(r) & (IORESOURCE_MEM|IORESOURCE_IO)) { - if (!num) - return r; - - num--; - } - } - return NULL; + return platform_get_mem_or_io(dev, num); } static int get_platform_irq(struct vfio_platform_device *vdev, int i) -- Gitee From d61ae552b2df6795e81f451f9574b2f95c8ba371 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 9 Dec 2020 09:44:44 +0800 Subject: [PATCH 1576/2161] vfio/type1: Add vfio_group_iommu_domain() commit bdfae1c9a913930eae5ea506733aa7c285e12a06 upstream. Add the API for getting the domain from a vfio group. This could be used by the physical device drivers which rely on the vfio/mdev framework for mediated device user level access. The typical use case like below: unsigned int pasid; struct vfio_group *vfio_group; struct iommu_domain *iommu_domain; struct device *dev = mdev_dev(mdev); struct device *iommu_device = mdev_get_iommu_device(dev); if (!iommu_device || !iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX)) return -EINVAL; vfio_group = vfio_group_get_external_user_from_dev(dev); if (IS_ERR_OR_NULL(vfio_group)) return -EFAULT; iommu_domain = vfio_group_iommu_domain(vfio_group); if (IS_ERR_OR_NULL(iommu_domain)) { vfio_group_put_external_user(vfio_group); return -EFAULT; } pasid = iommu_aux_get_pasid(iommu_domain, iommu_device); if (pasid < 0) { vfio_group_put_external_user(vfio_group); return -EFAULT; } /* Program device context with pasid value. */ ... 
Signed-off-by: Lu Baolu Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio.c | 18 ++++++++++++++++++ drivers/vfio/vfio_iommu_type1.c | 24 ++++++++++++++++++++++++ include/linux/vfio.h | 4 ++++ 3 files changed, 46 insertions(+) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 2151bc7f87ab..4ad8a35667a7 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -2331,6 +2331,24 @@ int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type, } EXPORT_SYMBOL(vfio_unregister_notifier); +struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + + if (!group) + return ERR_PTR(-EINVAL); + + container = group->container; + driver = container->iommu_driver; + if (likely(driver && driver->ops->group_iommu_domain)) + return driver->ops->group_iommu_domain(container->iommu_data, + group->iommu_group); + + return ERR_PTR(-ENOTTY); +} +EXPORT_SYMBOL_GPL(vfio_group_iommu_domain); + /** * Module/class support */ diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index fed45cb5eaa7..8912aa9f13b5 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2978,6 +2978,29 @@ static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova, return ret; } +static struct iommu_domain * +vfio_iommu_type1_group_iommu_domain(void *iommu_data, + struct iommu_group *iommu_group) +{ + struct iommu_domain *domain = ERR_PTR(-ENODEV); + struct vfio_iommu *iommu = iommu_data; + struct vfio_domain *d; + + if (!iommu || !iommu_group) + return ERR_PTR(-EINVAL); + + mutex_lock(&iommu->lock); + list_for_each_entry(d, &iommu->domain_list, next) { + if (find_iommu_group(d, iommu_group)) { + domain = d->domain; + break; + } + } + mutex_unlock(&iommu->lock); + + return domain; +} + static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { .name = "vfio-iommu-type1", .owner = THIS_MODULE, @@ -2991,6 +3014,7 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { .register_notifier = vfio_iommu_type1_register_notifier, .unregister_notifier = vfio_iommu_type1_unregister_notifier, .dma_rw = vfio_iommu_type1_dma_rw, + .group_iommu_domain = vfio_iommu_type1_group_iommu_domain, }; static int __init vfio_iommu_type1_init(void) diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 38d3c6a8dc7e..f45940b38a02 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -90,6 +90,8 @@ struct vfio_iommu_driver_ops { struct notifier_block *nb); int (*dma_rw)(void *iommu_data, dma_addr_t user_iova, void *data, size_t count, bool write); + struct iommu_domain *(*group_iommu_domain)(void *iommu_data, + struct iommu_group *group); }; extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); @@ -126,6 +128,8 @@ extern int vfio_group_unpin_pages(struct vfio_group *group, extern int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova, void *data, size_t len, bool write); +extern struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group); + /* each type has independent events */ enum vfio_notify_type { VFIO_IOMMU_NOTIFY = 0, -- Gitee From 92a6b7faf7506f908e36c01d0bf962ee3d2780f5 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 29 Jan 2021 08:54:04 -0800 Subject: [PATCH 1577/2161] vfio: option to unmap all commit c98fe7c2a2038af2bc24f039ccdb5a65c22ba11a upstream. 
For the UNMAP_DMA ioctl, delete all mappings if VFIO_DMA_UNMAP_FLAG_ALL is set. Define the corresponding VFIO_UNMAP_ALL extension. Signed-off-by: Steve Sistare Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/vfio.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index d0e23b53ac89..65dcd3012054 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -46,6 +46,9 @@ */ #define VFIO_NOIOMMU_IOMMU 8 +/* Supports VFIO_DMA_UNMAP_FLAG_ALL */ +#define VFIO_UNMAP_ALL 9 + /* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between @@ -1097,6 +1100,7 @@ struct vfio_bitmap { * field. No guarantee is made to the user that arbitrary unmaps of iova * or size different from those used in the original mapping call will * succeed. + * * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap * before unmapping IO virtual addresses. When this flag is set, the user must * provide a struct vfio_bitmap in data[]. User must provide zero-allocated @@ -1106,11 +1110,15 @@ struct vfio_bitmap { * indicates that the page at that offset from iova is dirty. A Bitmap of the * pages in the range of unmapped size is returned in the user-provided * vfio_bitmap.data. + * + * If flags & VFIO_DMA_UNMAP_FLAG_ALL, unmap all addresses. iova and size + * must be 0. This cannot be combined with the get-dirty-bitmap flag. */ struct vfio_iommu_type1_dma_unmap { __u32 argsz; __u32 flags; #define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0) +#define VFIO_DMA_UNMAP_FLAG_ALL (1 << 1) __u64 iova; /* IO virtual address */ __u64 size; /* Size of mapping (bytes) */ __u8 data[]; -- Gitee From 90f975f3dac0684531b936a62da7c72fa7d72678 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 29 Jan 2021 08:54:05 -0800 Subject: [PATCH 1578/2161] vfio/type1: unmap cleanup commit 0f53afa12baec8c00f5d1d6afb49325ada105253 upstream. Minor changes in vfio_dma_do_unmap to improve readability, which also simplify the subsequent unmap-all patch. No functional change. 
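The cleanup boils down to the usual single-exit shape: take local copies of the ioctl arguments, preload ret with -EINVAL once, and let every argument check be a bare goto. A rough sketch of the shape only (the real function body follows in the diff below):

	int ret = -EINVAL;
	dma_addr_t iova = unmap->iova;
	unsigned long size = unmap->size;

	mutex_lock(&iommu->lock);

	if (iova & (pgsize - 1))
		goto unlock;		/* no per-check ret assignment needed */
	if (!size || size & (pgsize - 1))
		goto unlock;

	/* ... main work ... */
	ret = 0;			/* success is set exactly once */
unlock:
	mutex_unlock(&iommu->lock);
	return ret;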
Signed-off-by: Steve Sistare Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 38 +++++++++++++-------------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 8912aa9f13b5..8f825f583145 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -1086,34 +1086,28 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, { struct vfio_dma *dma, *dma_last = NULL; size_t unmapped = 0, pgsize; - int ret = 0, retries = 0; + int ret = -EINVAL, retries = 0; unsigned long pgshift; + dma_addr_t iova = unmap->iova; + unsigned long size = unmap->size; mutex_lock(&iommu->lock); pgshift = __ffs(iommu->pgsize_bitmap); pgsize = (size_t)1 << pgshift; - if (unmap->iova & (pgsize - 1)) { - ret = -EINVAL; + if (iova & (pgsize - 1)) goto unlock; - } - if (!unmap->size || unmap->size & (pgsize - 1)) { - ret = -EINVAL; + if (!size || size & (pgsize - 1)) goto unlock; - } - if (unmap->iova + unmap->size - 1 < unmap->iova || - unmap->size > SIZE_MAX) { - ret = -EINVAL; + if (iova + size - 1 < iova || size > SIZE_MAX) goto unlock; - } /* When dirty tracking is enabled, allow only min supported pgsize */ if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) && (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) { - ret = -EINVAL; goto unlock; } @@ -1151,20 +1145,18 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, * mappings within the range. */ if (iommu->v2) { - dma = vfio_find_dma(iommu, unmap->iova, 1); - if (dma && dma->iova != unmap->iova) { - ret = -EINVAL; + dma = vfio_find_dma(iommu, iova, 1); + if (dma && dma->iova != iova) goto unlock; - } - dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0); - if (dma && dma->iova + dma->size != unmap->iova + unmap->size) { - ret = -EINVAL; + + dma = vfio_find_dma(iommu, iova + size - 1, 0); + if (dma && dma->iova + dma->size != iova + size) goto unlock; - } } - while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { - if (!iommu->v2 && unmap->iova > dma->iova) + ret = 0; + while ((dma = vfio_find_dma(iommu, iova, size))) { + if (!iommu->v2 && iova > dma->iova) break; /* * Task with same address space who mapped this iova range is @@ -1202,7 +1194,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { ret = update_user_bitmap(bitmap->data, iommu, dma, - unmap->iova, pgsize); + iova, pgsize); if (ret) break; } -- Gitee From b6d4c3547b6771b347b6117ca96bbcafc3c8bb93 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 29 Jan 2021 08:54:06 -0800 Subject: [PATCH 1579/2161] vfio/type1: implement unmap all commit c196509953741a8a472127c80070a0795a072f87 upstream. Implement VFIO_DMA_UNMAP_FLAG_ALL. 
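As a usage illustration only (not part of the patch), a userspace caller would first probe for the extension and then issue one unmap with iova and size left at zero; the container fd is assumed to have been opened and configured in the usual way:

	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	/* container_fd: an open, IOMMU-enabled /dev/vfio/vfio container */
	int unmap_everything(int container_fd)
	{
		struct vfio_iommu_type1_dma_unmap unmap = {
			.argsz = sizeof(unmap),
			.flags = VFIO_DMA_UNMAP_FLAG_ALL,
			.iova  = 0,	/* must be 0 with the ALL flag */
			.size  = 0,	/* must be 0 with the ALL flag */
		};

		if (!ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_UNMAP_ALL))
			return -1;	/* kernel does not support the flag */

		return ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
	}

The flag cannot be combined with VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP, so a caller that needs the final dirty state must fetch it before unmapping everything.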
Signed-off-by: Steve Sistare Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 8f825f583145..b151f521a32f 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -1090,6 +1090,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, unsigned long pgshift; dma_addr_t iova = unmap->iova; unsigned long size = unmap->size; + bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL; mutex_lock(&iommu->lock); @@ -1099,8 +1100,13 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, if (iova & (pgsize - 1)) goto unlock; - if (!size || size & (pgsize - 1)) + if (unmap_all) { + if (iova || size) + goto unlock; + size = SIZE_MAX; + } else if (!size || size & (pgsize - 1)) { goto unlock; + } if (iova + size - 1 < iova || size > SIZE_MAX) goto unlock; @@ -1144,7 +1150,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, * will only return success and a size of zero if there were no * mappings within the range. */ - if (iommu->v2) { + if (iommu->v2 && !unmap_all) { dma = vfio_find_dma(iommu, iova, 1); if (dma && dma->iova != iova) goto unlock; @@ -2503,6 +2509,7 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu, case VFIO_TYPE1_IOMMU: case VFIO_TYPE1v2_IOMMU: case VFIO_TYPE1_NESTING_IOMMU: + case VFIO_UNMAP_ALL: return 1; case VFIO_DMA_CC_IOMMU: if (!iommu) @@ -2696,6 +2703,8 @@ static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu, { struct vfio_iommu_type1_dma_unmap unmap; struct vfio_bitmap bitmap = { 0 }; + uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP | + VFIO_DMA_UNMAP_FLAG_ALL; unsigned long minsz; int ret; @@ -2704,8 +2713,11 @@ static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu, if (copy_from_user(&unmap, (void __user *)arg, minsz)) return -EFAULT; - if (unmap.argsz < minsz || - unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) + if (unmap.argsz < minsz || unmap.flags & ~mask) + return -EINVAL; + + if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) && + (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL)) return -EINVAL; if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { -- Gitee From 1e955b34515b0f63f9eaefc94f30238d8ba5ad81 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 29 Jan 2021 08:54:07 -0800 Subject: [PATCH 1580/2161] vfio: interfaces to update vaddr commit 441e8106a238920f28e2e79cff462044551f60e2 upstream. Define interfaces that allow the underlying memory object of an iova range to be mapped to a new host virtual address in the host process: - VFIO_DMA_UNMAP_FLAG_VADDR for VFIO_IOMMU_UNMAP_DMA - VFIO_DMA_MAP_FLAG_VADDR flag for VFIO_IOMMU_MAP_DMA - VFIO_UPDATE_VADDR extension for VFIO_CHECK_EXTENSION Unmap vaddr invalidates the host virtual address in an iova range, and blocks vfio translation of host virtual addresses. DMA to already-mapped pages continues. Map vaddr updates the base VA and resumes translation. See comments in uapi/linux/vfio.h for more details. 
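Purely as an illustration of the intended flow (not part of the patch), and assuming a container fd plus an existing mapping created earlier with VFIO_IOMMU_MAP_DMA, a user such as a live-updating VMM would invalidate the old vaddr, re-create the backing memory at a new address, and then hand the new vaddr back for the same iova range:

	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	int update_vaddr(int container_fd, __u64 iova, __u64 size, void *new_vaddr)
	{
		struct vfio_iommu_type1_dma_unmap unmap = {
			.argsz = sizeof(unmap),
			.flags = VFIO_DMA_UNMAP_FLAG_VADDR,
			.iova  = iova,
			.size  = size,
		};
		struct vfio_iommu_type1_dma_map map = {
			.argsz = sizeof(map),
			.flags = VFIO_DMA_MAP_FLAG_VADDR,	/* READ/WRITE must be 0 */
			.vaddr = (__u64)(unsigned long)new_vaddr,
			.iova  = iova,		/* must match the original mapping */
			.size  = size,		/* must match the original mapping */
		};

		if (!ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR))
			return -1;

		/* 1. invalidate: in-flight DMA keeps going, new translations block */
		if (ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap))
			return -1;

		/* 2. caller re-establishes the same memory object at new_vaddr
		 *    (e.g. mmap of the same shared file) -- elided here        */

		/* 3. resume translation using the updated vaddr */
		return ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
	}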
Signed-off-by: Steve Sistare Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/vfio.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 65dcd3012054..335c0d95ffca 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -49,6 +49,9 @@ /* Supports VFIO_DMA_UNMAP_FLAG_ALL */ #define VFIO_UNMAP_ALL 9 +/* Supports the vaddr flag for DMA map and unmap */ +#define VFIO_UPDATE_VADDR 10 + /* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between @@ -1072,12 +1075,22 @@ struct vfio_iommu_type1_info_cap_migration { * * Map process virtual addresses to IO virtual addresses using the * provided struct vfio_dma_map. Caller sets argsz. READ &/ WRITE required. + * + * If flags & VFIO_DMA_MAP_FLAG_VADDR, update the base vaddr for iova, and + * unblock translation of host virtual addresses in the iova range. The vaddr + * must have previously been invalidated with VFIO_DMA_UNMAP_FLAG_VADDR. To + * maintain memory consistency within the user application, the updated vaddr + * must address the same memory object as originally mapped. Failure to do so + * will result in user memory corruption and/or device misbehavior. iova and + * size must match those in the original MAP_DMA call. Protection is not + * changed, and the READ & WRITE flags must be 0. */ struct vfio_iommu_type1_dma_map { __u32 argsz; __u32 flags; #define VFIO_DMA_MAP_FLAG_READ (1 << 0) /* readable from device */ #define VFIO_DMA_MAP_FLAG_WRITE (1 << 1) /* writable from device */ +#define VFIO_DMA_MAP_FLAG_VADDR (1 << 2) __u64 vaddr; /* Process virtual address */ __u64 iova; /* IO virtual address */ __u64 size; /* Size of mapping (bytes) */ @@ -1113,12 +1126,18 @@ struct vfio_bitmap { * * If flags & VFIO_DMA_UNMAP_FLAG_ALL, unmap all addresses. iova and size * must be 0. This cannot be combined with the get-dirty-bitmap flag. + * + * If flags & VFIO_DMA_UNMAP_FLAG_VADDR, do not unmap, but invalidate host + * virtual addresses in the iova range. Tasks that attempt to translate an + * iova's vaddr will block. DMA to already-mapped pages continues. This + * cannot be combined with the get-dirty-bitmap flag. */ struct vfio_iommu_type1_dma_unmap { __u32 argsz; __u32 flags; #define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0) #define VFIO_DMA_UNMAP_FLAG_ALL (1 << 1) +#define VFIO_DMA_UNMAP_FLAG_VADDR (1 << 2) __u64 iova; /* IO virtual address */ __u64 size; /* Size of mapping (bytes) */ __u8 data[]; -- Gitee From 36c8c1b474c6431e645b9df2c64a300d5ba8fbf6 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 29 Jan 2021 08:54:08 -0800 Subject: [PATCH 1581/2161] vfio/type1: massage unmap iteration commit 40ae9b807b89e60f86a40f33b931e6cde7eed2b7 upstream. Modify the iteration in vfio_dma_do_unmap so it does not depend on deletion of each dma entry. Add a variant of vfio_find_dma that returns the entry with the lowest iova in the search range to initialize the iteration. No externally visible change, but this behavior is needed in the subsequent update-vaddr patch. 
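The new iteration is the common rbtree idiom of finding the lowest overlapping entry once and then walking with rb_next(), taking the successor before the current entry can be removed. Reduced to its core (the real loop below also handles the !v2 case and the notifier retry path):

	struct rb_node *n = vfio_find_dma_first_node(iommu, iova, size);

	while (n) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);

		if (dma->iova >= iova + size)
			break;			/* walked past the range */

		n = rb_next(n);			/* grab the successor first... */

		/* ... so unlinking 'dma' here cannot invalidate the cursor */
		vfio_remove_dma(iommu, dma);
	}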
Signed-off-by: Steve Sistare Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 35 ++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index b151f521a32f..5bc12d609a80 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -174,6 +174,31 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, return NULL; } +static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu, + dma_addr_t start, size_t size) +{ + struct rb_node *res = NULL; + struct rb_node *node = iommu->dma_list.rb_node; + struct vfio_dma *dma_res = NULL; + + while (node) { + struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); + + if (start < dma->iova + dma->size) { + res = node; + dma_res = dma; + if (start >= dma->iova) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + if (res && size && dma_res->iova >= start + size) + res = NULL; + return res; +} + static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new) { struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL; @@ -1091,6 +1116,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova = unmap->iova; unsigned long size = unmap->size; bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL; + struct rb_node *n; mutex_lock(&iommu->lock); @@ -1161,7 +1187,13 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, } ret = 0; - while ((dma = vfio_find_dma(iommu, iova, size))) { + n = vfio_find_dma_first_node(iommu, iova, size); + + while (n) { + dma = rb_entry(n, struct vfio_dma, node); + if (dma->iova >= iova + size) + break; + if (!iommu->v2 && iova > dma->iova) break; /* @@ -1206,6 +1238,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, } unmapped += dma->size; + n = rb_next(n); vfio_remove_dma(iommu, dma); } -- Gitee From 446967e979a9dd9c116431697f0738aae5246794 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 29 Jan 2021 08:54:09 -0800 Subject: [PATCH 1582/2161] vfio/type1: implement interfaces to update vaddr commit c3cbab24db3860d68924d8a3f752a97d3cca1623 upstream. Implement VFIO_DMA_UNMAP_FLAG_VADDR, VFIO_DMA_MAP_FLAG_VADDR, and VFIO_UPDATE_VADDR. This is a partial implementation. Blocking is added in a subsequent patch. 
Signed-off-by: Steve Sistare Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 59 +++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 5bc12d609a80..9e2533908ce6 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -70,6 +70,7 @@ struct vfio_iommu { struct rb_root dma_list; struct blocking_notifier_head notifier; unsigned int dma_avail; + unsigned int vaddr_invalid_count; uint64_t pgsize_bitmap; bool v2; bool nesting; @@ -93,6 +94,7 @@ struct vfio_dma { int prot; /* IOMMU_READ/WRITE */ bool iommu_mapped; bool lock_cap; /* capable(CAP_IPC_LOCK) */ + bool vaddr_invalid; struct task_struct *task; struct rb_root pfn_list; /* Ex-user pinned pfn list */ unsigned long *bitmap; @@ -986,6 +988,8 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma) vfio_unlink_dma(iommu, dma); put_task_struct(dma->task); vfio_dma_bitmap_free(dma); + if (dma->vaddr_invalid) + iommu->vaddr_invalid_count--; kfree(dma); iommu->dma_avail++; } @@ -1116,7 +1120,8 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova = unmap->iova; unsigned long size = unmap->size; bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL; - struct rb_node *n; + bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR; + struct rb_node *n, *first_n; mutex_lock(&iommu->lock); @@ -1187,7 +1192,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, } ret = 0; - n = vfio_find_dma_first_node(iommu, iova, size); + n = first_n = vfio_find_dma_first_node(iommu, iova, size); while (n) { dma = rb_entry(n, struct vfio_dma, node); @@ -1203,6 +1208,27 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, if (dma->task->mm != current->mm) break; + if (invalidate_vaddr) { + if (dma->vaddr_invalid) { + struct rb_node *last_n = n; + + for (n = first_n; n != last_n; n = rb_next(n)) { + dma = rb_entry(n, + struct vfio_dma, node); + dma->vaddr_invalid = false; + iommu->vaddr_invalid_count--; + } + ret = -EINVAL; + unmapped = 0; + break; + } + dma->vaddr_invalid = true; + iommu->vaddr_invalid_count++; + unmapped += dma->size; + n = rb_next(n); + continue; + } + if (!RB_EMPTY_ROOT(&dma->pfn_list)) { struct vfio_iommu_type1_dma_unmap nb_unmap; @@ -1342,6 +1368,7 @@ static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu, static int vfio_dma_do_map(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_map *map) { + bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR; dma_addr_t iova = map->iova; unsigned long vaddr = map->vaddr; size_t size = map->size; @@ -1359,13 +1386,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, if (map->flags & VFIO_DMA_MAP_FLAG_READ) prot |= IOMMU_READ; + if ((prot && set_vaddr) || (!prot && !set_vaddr)) + return -EINVAL; + mutex_lock(&iommu->lock); pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap); WARN_ON((pgsize - 1) & PAGE_MASK); - if (!prot || !size || (size | iova | vaddr) & (pgsize - 1)) { + if (!size || (size | iova | vaddr) & (pgsize - 1)) { ret = -EINVAL; goto out_unlock; } @@ -1376,7 +1406,20 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, goto out_unlock; } - if (vfio_find_dma(iommu, iova, size)) { + dma = vfio_find_dma(iommu, iova, size); + if (set_vaddr) { + if (!dma) { + ret = -ENOENT; + } else if (!dma->vaddr_invalid || dma->iova != iova || + dma->size != size) { + ret = -EINVAL; + } else { + 
dma->vaddr = vaddr; + dma->vaddr_invalid = false; + iommu->vaddr_invalid_count--; + } + goto out_unlock; + } else if (dma) { ret = -EEXIST; goto out_unlock; } @@ -2543,6 +2586,7 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu, case VFIO_TYPE1v2_IOMMU: case VFIO_TYPE1_NESTING_IOMMU: case VFIO_UNMAP_ALL: + case VFIO_UPDATE_VADDR: return 1; case VFIO_DMA_CC_IOMMU: if (!iommu) @@ -2718,7 +2762,8 @@ static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu, { struct vfio_iommu_type1_dma_map map; unsigned long minsz; - uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE | + VFIO_DMA_MAP_FLAG_VADDR; minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); @@ -2737,6 +2782,7 @@ static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_unmap unmap; struct vfio_bitmap bitmap = { 0 }; uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP | + VFIO_DMA_UNMAP_FLAG_VADDR | VFIO_DMA_UNMAP_FLAG_ALL; unsigned long minsz; int ret; @@ -2750,7 +2796,8 @@ static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu, return -EINVAL; if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) && - (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL)) + (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL | + VFIO_DMA_UNMAP_FLAG_VADDR))) return -EINVAL; if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { -- Gitee From 3b90c5da621c4216e4babdcebb6a7e4ccc8a6686 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 29 Jan 2021 08:54:10 -0800 Subject: [PATCH 1583/2161] vfio: iommu driver notify callback commit ec5e32940cc9d2a8a321cb7d756fb6ae45702d03 upstream. Define a vfio_iommu_driver_ops notify callback, for sending events to the driver. Drivers are not required to provide the callback, and may ignore any events. The handling of events is driver specific. Define the CONTAINER_CLOSE event, called when the container's file descriptor is closed. This event signifies that no further state changes will occur via container ioctl's. 
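For illustration only, a backend that cares about these events fills in the new hook when registering its ops; one that does not can leave .notify NULL. A minimal hypothetical backend (all foo_* names are made up):

	static void foo_iommu_notify(void *iommu_data,
				     enum vfio_iommu_notify_type event)
	{
		struct foo_iommu *iommu = iommu_data;

		/* only one event is defined so far */
		if (event == VFIO_IOMMU_CONTAINER_CLOSE)
			foo_iommu_quiesce(iommu);	/* hypothetical helper */
	}

	static const struct vfio_iommu_driver_ops foo_iommu_ops = {
		.name		= "vfio-iommu-foo",
		.owner		= THIS_MODULE,
		.open		= foo_iommu_open,
		.release	= foo_iommu_release,
		/* ... remaining callbacks ... */
		.notify		= foo_iommu_notify,	/* optional */
	};

	/* registered as usual from the backend's module init */
	vfio_register_iommu_driver(&foo_iommu_ops);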
Signed-off-by: Steve Sistare Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio.c | 5 +++++ include/linux/vfio.h | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 4ad8a35667a7..38779e6fd80c 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1220,6 +1220,11 @@ static int vfio_fops_open(struct inode *inode, struct file *filep) static int vfio_fops_release(struct inode *inode, struct file *filep) { struct vfio_container *container = filep->private_data; + struct vfio_iommu_driver *driver = container->iommu_driver; + + if (driver && driver->ops->notify) + driver->ops->notify(container->iommu_data, + VFIO_IOMMU_CONTAINER_CLOSE); filep->private_data = NULL; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index f45940b38a02..b7e18bde5aa8 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -57,6 +57,11 @@ extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); extern void vfio_device_put(struct vfio_device *device); extern void *vfio_device_data(struct vfio_device *device); +/* events for the backend driver notify callback */ +enum vfio_iommu_notify_type { + VFIO_IOMMU_CONTAINER_CLOSE = 0, +}; + /** * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks */ @@ -92,6 +97,8 @@ struct vfio_iommu_driver_ops { void *data, size_t count, bool write); struct iommu_domain *(*group_iommu_domain)(void *iommu_data, struct iommu_group *group); + void (*notify)(void *iommu_data, + enum vfio_iommu_notify_type event); }; extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); -- Gitee From 29a545c49792f80a7066275d78ca777fb2f4c791 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 29 Jan 2021 08:54:11 -0800 Subject: [PATCH 1584/2161] vfio/type1: implement notify callback commit 487ace1340536ea9b8daf8c783a397a467fe55fc upstream. Implement a notify callback that remembers if the container's file descriptor has been closed. 
Signed-off-by: Steve Sistare Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 9e2533908ce6..6b89a625d8c1 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -76,6 +76,7 @@ struct vfio_iommu { bool nesting; bool dirty_page_tracking; bool pinned_page_dirty_scope; + bool container_open; }; struct vfio_domain { @@ -2515,6 +2516,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) INIT_LIST_HEAD(&iommu->iova_list); iommu->dma_list = RB_ROOT; iommu->dma_avail = dma_entry_limit; + iommu->container_open = true; mutex_init(&iommu->lock); BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier); @@ -3085,6 +3087,18 @@ vfio_iommu_type1_group_iommu_domain(void *iommu_data, return domain; } +static void vfio_iommu_type1_notify(void *iommu_data, + enum vfio_iommu_notify_type event) +{ + struct vfio_iommu *iommu = iommu_data; + + if (event != VFIO_IOMMU_CONTAINER_CLOSE) + return; + mutex_lock(&iommu->lock); + iommu->container_open = false; + mutex_unlock(&iommu->lock); +} + static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { .name = "vfio-iommu-type1", .owner = THIS_MODULE, @@ -3099,6 +3113,7 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { .unregister_notifier = vfio_iommu_type1_unregister_notifier, .dma_rw = vfio_iommu_type1_dma_rw, .group_iommu_domain = vfio_iommu_type1_group_iommu_domain, + .notify = vfio_iommu_type1_notify, }; static int __init vfio_iommu_type1_init(void) -- Gitee From 18274c08ebb979a4ce39bf5f69da2dff12978e7e Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 29 Jan 2021 08:54:12 -0800 Subject: [PATCH 1585/2161] vfio/type1: block on invalid vaddr commit 898b9eaeb3fe5eb1ac510f2c4775c8277206752c upstream. Block translation of host virtual address while an iova range has an invalid vaddr. 
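The blocking itself is the standard wait-queue pattern: drop the iommu lock, sleep killably until a vaddr update or container close wakes the queue, then retake the lock and let the caller re-check. Its skeleton, with the reasons spelled out (the full vfio_wait() below also reports whether it slept):

	DEFINE_WAIT(wait);

	prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
	mutex_unlock(&iommu->lock);		/* never sleep holding the lock */
	schedule();				/* woken by wake_up_all() on map,
						 * remove or container close */
	mutex_lock(&iommu->lock);
	finish_wait(&iommu->vaddr_wait, &wait);

	if (kthread_should_stop() || !iommu->container_open ||
	    fatal_signal_pending(current))
		return -EFAULT;			/* give up rather than hang forever */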
Signed-off-by: Steve Sistare Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 95 +++++++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 6b89a625d8c1..4c26afcb5733 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -72,6 +73,7 @@ struct vfio_iommu { unsigned int dma_avail; unsigned int vaddr_invalid_count; uint64_t pgsize_bitmap; + wait_queue_head_t vaddr_wait; bool v2; bool nesting; bool dirty_page_tracking; @@ -147,6 +149,8 @@ struct vfio_regions { #define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX) #define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX) +#define WAITED 1 + static int put_pfn(unsigned long pfn, int prot); static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, @@ -518,6 +522,61 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, return ret; } +static int vfio_wait(struct vfio_iommu *iommu) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE); + mutex_unlock(&iommu->lock); + schedule(); + mutex_lock(&iommu->lock); + finish_wait(&iommu->vaddr_wait, &wait); + if (kthread_should_stop() || !iommu->container_open || + fatal_signal_pending(current)) { + return -EFAULT; + } + return WAITED; +} + +/* + * Find dma struct and wait for its vaddr to be valid. iommu lock is dropped + * if the task waits, but is re-locked on return. Return result in *dma_p. + * Return 0 on success with no waiting, WAITED on success if waited, and -errno + * on error. + */ +static int vfio_find_dma_valid(struct vfio_iommu *iommu, dma_addr_t start, + size_t size, struct vfio_dma **dma_p) +{ + int ret; + + do { + *dma_p = vfio_find_dma(iommu, start, size); + if (!*dma_p) + ret = -EINVAL; + else if (!(*dma_p)->vaddr_invalid) + ret = 0; + else + ret = vfio_wait(iommu); + } while (ret > 0); + + return ret; +} + +/* + * Wait for all vaddr in the dma_list to become valid. iommu lock is dropped + * if the task waits, but is re-locked on return. Return 0 on success with no + * waiting, WAITED on success if waited, and -errno on error. + */ +static int vfio_wait_all_valid(struct vfio_iommu *iommu) +{ + int ret = 0; + + while (iommu->vaddr_invalid_count && ret >= 0) + ret = vfio_wait(iommu); + + return ret; +} + /* * Attempt to pin pages. We really don't want to track all the pfns and * the iommu can only map chunks of consecutive pfns anyway, so get the @@ -679,6 +738,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, unsigned long remote_vaddr; struct vfio_dma *dma; bool do_accounting; + dma_addr_t iova; if (!iommu || !user_pfn || !phys_pfn) return -EINVAL; @@ -689,6 +749,22 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, mutex_lock(&iommu->lock); + /* + * Wait for all necessary vaddr's to be valid so they can be used in + * the main loop without dropping the lock, to avoid racing vs unmap. 
+ */ +again: + if (iommu->vaddr_invalid_count) { + for (i = 0; i < npage; i++) { + iova = user_pfn[i] << PAGE_SHIFT; + ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma); + if (ret < 0) + goto pin_done; + if (ret == WAITED) + goto again; + } + } + /* Fail if notifier list is empty */ if (!iommu->notifier.head) { ret = -EINVAL; @@ -703,7 +779,6 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu); for (i = 0; i < npage; i++) { - dma_addr_t iova; struct vfio_pfn *vpfn; iova = user_pfn[i] << PAGE_SHIFT; @@ -989,8 +1064,10 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma) vfio_unlink_dma(iommu, dma); put_task_struct(dma->task); vfio_dma_bitmap_free(dma); - if (dma->vaddr_invalid) + if (dma->vaddr_invalid) { iommu->vaddr_invalid_count--; + wake_up_all(&iommu->vaddr_wait); + } kfree(dma); iommu->dma_avail++; } @@ -1418,6 +1495,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, dma->vaddr = vaddr; dma->vaddr_invalid = false; iommu->vaddr_invalid_count--; + wake_up_all(&iommu->vaddr_wait); } goto out_unlock; } else if (dma) { @@ -1517,6 +1595,10 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; int ret; + ret = vfio_wait_all_valid(iommu); + if (ret < 0) + return ret; + /* Arbitrarily pick the first domain in the list for lookups */ if (!list_empty(&iommu->domain_list)) d = list_first_entry(&iommu->domain_list, @@ -2519,6 +2601,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) iommu->container_open = true; mutex_init(&iommu->lock); BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier); + init_waitqueue_head(&iommu->vaddr_wait); return iommu; } @@ -2990,12 +3073,13 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, struct vfio_dma *dma; bool kthread = current->mm == NULL; size_t offset; + int ret; *copied = 0; - dma = vfio_find_dma(iommu, user_iova, 1); - if (!dma) - return -EINVAL; + ret = vfio_find_dma_valid(iommu, user_iova, 1, &dma); + if (ret < 0) + return ret; if ((write && !(dma->prot & IOMMU_WRITE)) || !(dma->prot & IOMMU_READ)) @@ -3097,6 +3181,7 @@ static void vfio_iommu_type1_notify(void *iommu_data, mutex_lock(&iommu->lock); iommu->container_open = false; mutex_unlock(&iommu->lock); + wake_up_all(&iommu->vaddr_wait); } static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { -- Gitee From eb9815e0093d3dfa90de25ea51b975e53946b7eb Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Fri, 22 Jan 2021 17:26:34 +0800 Subject: [PATCH 1586/2161] vfio/iommu_type1: Populate full dirty when detach non-pinned group commit d0a78f91761fcd837da1e7a4b0f8368873adc646 upstream. If a group with non-pinned-page dirty scope is detached with dirty logging enabled, we should fully populate the dirty bitmaps at the time it's removed since we don't know the extent of its previous DMA, nor will the group be present to trigger the full bitmap when the user retrieves the dirty bitmap. 
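The "fully populate" step is just a bitmap_set() over every tracked mapping, one bit per minimum-size IOMMU page; with a 4KiB minimum page size, for example, a 2MiB mapping contributes 512 set bits. In outline (this mirrors the helper added below):

	unsigned long pgshift = __ffs(iommu->pgsize_bitmap);	/* e.g. 12 for 4KiB */
	struct rb_node *n;

	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);

		/* dma->size >> pgshift == number of pages == number of bits */
		bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
	}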
Fixes: d6a4c185660c ("vfio iommu: Implementation of ioctl for dirty pages tracking") Suggested-by: Alex Williamson Signed-off-by: Keqian Zhu Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 4c26afcb5733..f5c46756ff7d 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -269,6 +269,18 @@ static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize) } } +static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu) +{ + struct rb_node *n; + unsigned long pgshift = __ffs(iommu->pgsize_bitmap); + + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); + + bitmap_set(dma->bitmap, 0, dma->size >> pgshift); + } +} + static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize) { struct rb_node *n; @@ -2567,8 +2579,11 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, * Removal of a group without dirty tracking may allow the iommu scope * to be promoted. */ - if (update_dirty_scope) + if (update_dirty_scope) { update_pinned_page_dirty_scope(iommu); + if (iommu->dirty_page_tracking) + vfio_iommu_populate_bitmap_full(iommu); + } mutex_unlock(&iommu->lock); } -- Gitee From d029588189dcf9ab119df3ba8202a104736c1f62 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 10 Jun 2022 10:42:18 +0800 Subject: [PATCH 1587/2161] vfio/iommu_type1: Mantain a counter for non_pinned_groups commit 010321565a7d2cd73eeed6f589693ce752e154bd upstream. With this counter, we never need to traverse all groups to update pinned_scope of vfio_iommu. Suggested-by: Alex Williamson Signed-off-by: Keqian Zhu Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 40 +++++---------------------------- 1 file changed, 5 insertions(+), 35 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index f5c46756ff7d..71a631743ee1 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -73,11 +73,11 @@ struct vfio_iommu { unsigned int dma_avail; unsigned int vaddr_invalid_count; uint64_t pgsize_bitmap; + uint64_t num_non_pinned_groups; wait_queue_head_t vaddr_wait; bool v2; bool nesting; bool dirty_page_tracking; - bool pinned_page_dirty_scope; bool container_open; }; @@ -156,7 +156,6 @@ static int put_pfn(unsigned long pfn, int prot); static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, struct iommu_group *iommu_group); -static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu); /* * This code handles mapping and unmapping of user data buffers * into DMA'ble space using the IOMMU @@ -840,7 +839,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, group = vfio_iommu_find_iommu_group(iommu, iommu_group); if (!group->pinned_page_dirty_scope) { group->pinned_page_dirty_scope = true; - update_pinned_page_dirty_scope(iommu); + iommu->num_non_pinned_groups--; } goto pin_done; @@ -1122,7 +1121,7 @@ static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, * mark all pages dirty if any IOMMU capable device is not able * to report dirty pages and all pages are pinned and mapped. 
*/ - if (!iommu->pinned_page_dirty_scope && dma->iommu_mapped) + if (iommu->num_non_pinned_groups && dma->iommu_mapped) bitmap_set(dma->bitmap, 0, nbits); if (shift) { @@ -1803,33 +1802,6 @@ static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, return group; } -static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu) -{ - struct vfio_domain *domain; - struct vfio_group *group; - - list_for_each_entry(domain, &iommu->domain_list, next) { - list_for_each_entry(group, &domain->group_list, next) { - if (!group->pinned_page_dirty_scope) { - iommu->pinned_page_dirty_scope = false; - return; - } - } - } - - if (iommu->external_domain) { - domain = iommu->external_domain; - list_for_each_entry(group, &domain->group_list, next) { - if (!group->pinned_page_dirty_scope) { - iommu->pinned_page_dirty_scope = false; - return; - } - } - } - - iommu->pinned_page_dirty_scope = true; -} - static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions, phys_addr_t *base) { @@ -2238,8 +2210,6 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, * addition of a dirty tracking group. */ group->pinned_page_dirty_scope = true; - if (!iommu->pinned_page_dirty_scope) - update_pinned_page_dirty_scope(iommu); mutex_unlock(&iommu->lock); return 0; @@ -2368,7 +2338,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, * demotes the iommu scope until it declares itself dirty tracking * capable via the page pinning interface. */ - iommu->pinned_page_dirty_scope = false; + iommu->num_non_pinned_groups++; mutex_unlock(&iommu->lock); vfio_iommu_resv_free(&group_resv_regions); @@ -2580,7 +2550,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, * to be promoted. */ if (update_dirty_scope) { - update_pinned_page_dirty_scope(iommu); + iommu->num_non_pinned_groups--; if (iommu->dirty_page_tracking) vfio_iommu_populate_bitmap_full(iommu); } -- Gitee From 4c1fecb350983443d9a191c28d84282af5583fae Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 24 Jan 2021 16:35:41 +0100 Subject: [PATCH 1588/2161] vfio/pci: Fix handling of pci use accessor return codes commit 37a682ffbe2ab31b51123b67bd30ab42d1131cc1 upstream. The pci user accessors return negative errno's on error. 
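For context (illustration only): the low-level config accessors return positive PCIBIOS_* codes that still need pcibios_err_to_errno(), whereas the pci_user_*() accessors used here already hand back a negative errno, so their result can be returned as-is. Side by side:

	u8 val;
	int err;

	/* low-level accessor: PCIBIOS_* code, convert before returning */
	err = pci_read_config_byte(pdev, pos, &val);
	if (err)
		return pcibios_err_to_errno(err);

	/* user accessor: already a negative errno, pass it straight back */
	err = pci_user_read_config_byte(pdev, pos, &val);
	if (err)
		return err;

As the note below points out, pcibios_err_to_errno() also passes a negative errno through unchanged, which is why this is a cleanup rather than a behavioural fix.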
Signed-off-by: Heiner Kallweit Reviewed-by: Cornelia Huck [aw: drop Fixes tag, pcibios_err_to_errno() behaves correctly for -errno] Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_igd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c index 53d97f459252..e66dfb0178ed 100644 --- a/drivers/vfio/pci/vfio_pci_igd.c +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -127,7 +127,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, ret = pci_user_read_config_byte(pdev, pos, &val); if (ret) - return pcibios_err_to_errno(ret); + return ret; if (copy_to_user(buf + count - size, &val, 1)) return -EFAULT; @@ -141,7 +141,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, ret = pci_user_read_config_word(pdev, pos, &val); if (ret) - return pcibios_err_to_errno(ret); + return ret; val = cpu_to_le16(val); if (copy_to_user(buf + count - size, &val, 2)) @@ -156,7 +156,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, ret = pci_user_read_config_dword(pdev, pos, &val); if (ret) - return pcibios_err_to_errno(ret); + return ret; val = cpu_to_le32(val); if (copy_to_user(buf + count - size, &val, 4)) @@ -171,7 +171,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, ret = pci_user_read_config_word(pdev, pos, &val); if (ret) - return pcibios_err_to_errno(ret); + return ret; val = cpu_to_le16(val); if (copy_to_user(buf + count - size, &val, 2)) @@ -186,7 +186,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, ret = pci_user_read_config_byte(pdev, pos, &val); if (ret) - return pcibios_err_to_errno(ret); + return ret; if (copy_to_user(buf + count - size, &val, 1)) return -EFAULT; -- Gitee From 76aad26d67a7d54348c027f22d6f163c0cd1d47f Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 1 Feb 2021 16:28:24 +0000 Subject: [PATCH 1589/2161] vfio-pci/zdev: remove unused vdev argument commit 46c474666094448ee843266adfc38bde43ef8f68 upstream. Zdev static functions do not use vdev argument. Remove it. Signed-off-by: Max Gurtovoy Reviewed-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_zdev.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c index 229685634031..53084521cc80 100644 --- a/drivers/vfio/pci/vfio_pci_zdev.c +++ b/drivers/vfio/pci/vfio_pci_zdev.c @@ -24,8 +24,7 @@ /* * Add the Base PCI Function information to the device info region. */ -static int zpci_base_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev, - struct vfio_info_cap *caps) +static int zpci_base_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps) { struct vfio_device_info_cap_zpci_base cap = { .header.id = VFIO_DEVICE_INFO_CAP_ZPCI_BASE, @@ -45,8 +44,7 @@ static int zpci_base_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev, /* * Add the Base PCI Function Group information to the device info region. 
*/ -static int zpci_group_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev, - struct vfio_info_cap *caps) +static int zpci_group_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps) { struct vfio_device_info_cap_zpci_group cap = { .header.id = VFIO_DEVICE_INFO_CAP_ZPCI_GROUP, @@ -66,8 +64,7 @@ static int zpci_group_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev, /* * Add the device utility string to the device info region. */ -static int zpci_util_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev, - struct vfio_info_cap *caps) +static int zpci_util_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps) { struct vfio_device_info_cap_zpci_util *cap; int cap_size = sizeof(*cap) + CLP_UTIL_STR_LEN; @@ -90,8 +87,7 @@ static int zpci_util_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev, /* * Add the function path string to the device info region. */ -static int zpci_pfip_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev, - struct vfio_info_cap *caps) +static int zpci_pfip_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps) { struct vfio_device_info_cap_zpci_pfip *cap; int cap_size = sizeof(*cap) + CLP_PFIP_NR_SEGMENTS; @@ -123,21 +119,21 @@ int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev, if (!zdev) return -ENODEV; - ret = zpci_base_cap(zdev, vdev, caps); + ret = zpci_base_cap(zdev, caps); if (ret) return ret; - ret = zpci_group_cap(zdev, vdev, caps); + ret = zpci_group_cap(zdev, caps); if (ret) return ret; if (zdev->util_str_avail) { - ret = zpci_util_cap(zdev, vdev, caps); + ret = zpci_util_cap(zdev, caps); if (ret) return ret; } - ret = zpci_pfip_cap(zdev, vdev, caps); + ret = zpci_pfip_cap(zdev, caps); return ret; } -- Gitee From 57bc1f996823f7da175afa6c9389d943001957a2 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 1 Feb 2021 16:28:25 +0000 Subject: [PATCH 1590/2161] vfio-pci/zdev: fix possible segmentation fault issue commit 7e31d6dc2c78b2a0ba0039ca97ca98a581e8db82 upstream. In case allocation fails, we must behave correctly and exit with error. Fixes: e6b817d4b821 ("vfio-pci/zdev: Add zPCI capabilities to VFIO_DEVICE_GET_INFO") Signed-off-by: Max Gurtovoy Reviewed-by: Cornelia Huck Reviewed-by: Matthew Rosato Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_zdev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c index 53084521cc80..7b011b62c766 100644 --- a/drivers/vfio/pci/vfio_pci_zdev.c +++ b/drivers/vfio/pci/vfio_pci_zdev.c @@ -71,6 +71,8 @@ static int zpci_util_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps) int ret; cap = kmalloc(cap_size, GFP_KERNEL); + if (!cap) + return -ENOMEM; cap->header.id = VFIO_DEVICE_INFO_CAP_ZPCI_UTIL; cap->header.version = 1; @@ -94,6 +96,8 @@ static int zpci_pfip_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps) int ret; cap = kmalloc(cap_size, GFP_KERNEL); + if (!cap) + return -ENOMEM; cap->header.id = VFIO_DEVICE_INFO_CAP_ZPCI_PFIP; cap->header.version = 1; -- Gitee From f2791f45ec3c68d9c7db2ba374410e9aad8a27c6 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 18 Feb 2021 10:44:35 +0000 Subject: [PATCH 1591/2161] vfio/pci: remove CONFIG_VFIO_PCI_ZDEV from Kconfig commit b9abef43a08ef7faa33477cccb0c08c64eb2b8bf upstream. In case we're running on s390 system always expose the capabilities for configuration of zPCI devices. In case we're running on different platform, continue as usual. 
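After this change the pieces fit together roughly as follows (the non-s390 stub is paraphrased here, not quoted from the hunks below): the helper is built whenever CONFIG_S390 is set, the stub returns -ENODEV, and the caller treats -ENODEV as "no zPCI capabilities to add" rather than an error:

	/* vfio_pci_private.h */
	#ifdef CONFIG_S390
	extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev,
					       struct vfio_info_cap *caps);
	#else
	static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev,
						      struct vfio_info_cap *caps)
	{
		return -ENODEV;		/* "not applicable", not a failure */
	}
	#endif

	/* vfio_pci.c caller */
	ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
	if (ret && ret != -ENODEV) {
		pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n");
		return ret;
	}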
Signed-off-by: Max Gurtovoy Reviewed-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/Kconfig | 14 +------------- drivers/vfio/pci/Makefile | 2 +- drivers/vfio/pci/vfio_pci.c | 12 +++++------- drivers/vfio/pci/vfio_pci_private.h | 2 +- 4 files changed, 8 insertions(+), 22 deletions(-) diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index e06ff98a08a6..2232a4a4215b 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -44,16 +44,4 @@ config VFIO_PCI_NVLINK2 def_bool y depends on VFIO_PCI && PPC_POWERNV && SPAPR_TCE_IOMMU help - VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs - -config VFIO_PCI_ZDEV - bool "VFIO PCI ZPCI device CLP support" - depends on VFIO_PCI && S390 - default y - help - Enabling this option exposes VFIO capabilities containing hardware - configuration for zPCI devices. This enables userspace (e.g. QEMU) - to supply proper configuration values instead of hard-coded defaults - for zPCI devices passed through via VFIO on s390. - - Say Y here. \ No newline at end of file + VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs \ No newline at end of file diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 781e0809d6ee..eff97a7cd9f1 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -3,6 +3,6 @@ vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o -vfio-pci-$(CONFIG_VFIO_PCI_ZDEV) += vfio_pci_zdev.o +vfio-pci-$(CONFIG_S390) += vfio_pci_zdev.o obj-$(CONFIG_VFIO_PCI) += vfio-pci.o diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index b88755270db1..79d7af95c071 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -807,6 +807,7 @@ static long vfio_pci_ioctl(void *device_data, struct vfio_device_info info; struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; unsigned long capsz; + int ret; minsz = offsetofend(struct vfio_device_info, num_irqs); @@ -832,13 +833,10 @@ static long vfio_pci_ioctl(void *device_data, info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; info.num_irqs = VFIO_PCI_NUM_IRQS; - if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV)) { - int ret = vfio_pci_info_zdev_add_caps(vdev, &caps); - - if (ret && ret != -ENODEV) { - pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n"); - return ret; - } + ret = vfio_pci_info_zdev_add_caps(vdev, &caps); + if (ret && ret != -ENODEV) { + pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n"); + return ret; } if (caps.size) { diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index de2945440fab..3c3e93320fd9 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -214,7 +214,7 @@ static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev) } #endif -#ifdef CONFIG_VFIO_PCI_ZDEV +#ifdef CONFIG_S390 extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev, struct vfio_info_cap *caps); #else -- Gitee From f05d787ec10ebb7ad4d0631337d5eba2312703e0 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 10 Jun 2022 11:42:56 +0800 Subject: [PATCH 1592/2161] vfio/type1: Change success value of vaddr_get_pfn() commit be16c1fd99f41abebc0bf965d5d29cd18c9d271e upstream. vaddr_get_pfn() simply returns 0 on success. 
Have it report the number of pfns successfully gotten instead, whether from page pinning or follow_fault_pfn(), which will be used later when batching pinning. Change the last check in vfio_pin_pages_remote() for consistency with the other two. Signed-off-by: Daniel Jordan Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 71a631743ee1..d1655d52a52e 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -495,6 +495,10 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, return ret; } +/* + * Returns the positive number of pfns successfully obtained or a negative + * error code. + */ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, int prot, unsigned long *pfn) { @@ -511,7 +515,6 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, page, NULL, NULL); if (ret == 1) { *pfn = page_to_pfn(page[0]); - ret = 0; goto done; } @@ -525,8 +528,12 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, if (ret == -EAGAIN) goto retry; - if (!ret && !is_invalid_reserved_pfn(*pfn)) - ret = -EFAULT; + if (!ret) { + if (is_invalid_reserved_pfn(*pfn)) + ret = 1; + else + ret = -EFAULT; + } } done: up_read(&mm->mmap_sem); @@ -607,7 +614,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, return -ENODEV; ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, pfn_base); - if (ret) + if (ret < 0) return ret; pinned++; @@ -634,7 +641,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage; pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn); - if (ret) + if (ret < 0) break; if (pfn != *pfn_base + pinned || @@ -660,7 +667,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, ret = vfio_lock_acct(dma, lock_acct, false); unpin_out: - if (ret) { + if (ret < 0) { if (!rsvd) { for (pfn = *pfn_base ; pinned ; pfn++, pinned--) put_pfn(pfn, dma->prot); @@ -704,7 +711,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, return -ENODEV; ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base); - if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) { + if (ret == 1 && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) { ret = vfio_lock_acct(dma, 1, true); if (ret) { put_pfn(*pfn_base, dma->prot); -- Gitee From 03e0b39a5618216e45bd05f2d589d2d86ff80d08 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 10 Jun 2022 14:42:14 +0800 Subject: [PATCH 1593/2161] vfio/type1: Prepare for batched pinning with struct vfio_batch commit 4b6c33b3229678e38a6b0bbd4367d4b91366b523 upstream. Get ready to pin more pages at once with struct vfio_batch, which represents a batch of pinned pages. The struct has a fallback page pointer to avoid two unlikely scenarios: pointlessly allocating a page if disable_hugepages is enabled or failing the whole pinning operation if the kernel can't allocate memory. vaddr_get_pfn() becomes vaddr_get_pfns() to prepare for handling multiple pages, though for now only one page is stored in the pages array. 
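A page-sized array of page pointers gives the batch its capacity: with 4KiB pages and 8-byte pointers that is 512 entries, which is where the "batches of 512" in the next patch comes from. If the allocation fails, or hugepages are disabled, the embedded fallback_page keeps the old one-page-at-a-time behaviour instead of failing the pin outright. Roughly:

	/* PAGE_SIZE / sizeof(struct page *) == 4096 / 8 == 512 on a typical
	 * 64-bit configuration */
	#define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))

	batch->pages = (struct page **)__get_free_page(GFP_KERNEL);
	if (batch->pages) {
		batch->capacity = VFIO_BATCH_MAX_CAPACITY;
	} else {
		/* degrade gracefully: single-page batches, no allocation */
		batch->pages = &batch->fallback_page;
		batch->capacity = 1;
	}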
Signed-off-by: Daniel Jordan Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 71 +++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index d1655d52a52e..1298eee75b3b 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -103,6 +103,12 @@ struct vfio_dma { unsigned long *bitmap; }; +struct vfio_batch { + struct page **pages; /* for pin_user_pages_remote */ + struct page *fallback_page; /* if pages alloc fails */ + int capacity; /* length of pages array */ +}; + struct vfio_group { struct iommu_group *iommu_group; struct list_head next; @@ -495,14 +501,39 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, return ret; } +#define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *)) + +static void vfio_batch_init(struct vfio_batch *batch) +{ + if (unlikely(disable_hugepages)) + goto fallback; + + batch->pages = (struct page **) __get_free_page(GFP_KERNEL); + if (!batch->pages) + goto fallback; + + batch->capacity = VFIO_BATCH_MAX_CAPACITY; + return; + +fallback: + batch->pages = &batch->fallback_page; + batch->capacity = 1; +} + +static void vfio_batch_fini(struct vfio_batch *batch) +{ + if (batch->capacity == VFIO_BATCH_MAX_CAPACITY) + free_page((unsigned long)batch->pages); +} + /* * Returns the positive number of pfns successfully obtained or a negative * error code. */ -static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, - int prot, unsigned long *pfn) +static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, + long npages, int prot, unsigned long *pfn, + struct page **pages) { - struct page *page[1]; struct vm_area_struct *vma; unsigned int flags = 0; int ret; @@ -511,10 +542,10 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, flags |= FOLL_WRITE; down_read(&mm->mmap_sem); - ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags | FOLL_LONGTERM, - page, NULL, NULL); - if (ret == 1) { - *pfn = page_to_pfn(page[0]); + ret = get_user_pages_remote(NULL, mm, vaddr, npages, flags | FOLL_LONGTERM, + pages, NULL, NULL); + if (ret > 0) { + *pfn = page_to_pfn(pages[0]); goto done; } @@ -602,7 +633,7 @@ static int vfio_wait_all_valid(struct vfio_iommu *iommu) */ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, long npage, unsigned long *pfn_base, - unsigned long limit) + unsigned long limit, struct vfio_batch *batch) { unsigned long pfn = 0; long ret, pinned = 0, lock_acct = 0; @@ -613,7 +644,8 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, if (!current->mm) return -ENODEV; - ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, pfn_base); + ret = vaddr_get_pfns(current->mm, vaddr, 1, dma->prot, pfn_base, + batch->pages); if (ret < 0) return ret; @@ -640,7 +672,8 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, /* Lock all the consecutive pages from pfn_base */ for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage; pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { - ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn); + ret = vaddr_get_pfns(current->mm, vaddr, 1, dma->prot, &pfn, + batch->pages); if (ret < 0) break; @@ -703,6 +736,7 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, unsigned long 
*pfn_base, bool do_accounting) { + struct page *pages[1]; struct mm_struct *mm; int ret; @@ -710,7 +744,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, if (!mm) return -ENODEV; - ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base); + ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages); if (ret == 1 && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) { ret = vfio_lock_acct(dma, 1, true); if (ret) { @@ -1404,15 +1438,19 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma, { dma_addr_t iova = dma->iova; unsigned long vaddr = dma->vaddr; + struct vfio_batch batch; size_t size = map_size; long npage; unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; int ret = 0; + vfio_batch_init(&batch); + while (size) { /* Pin a contiguous chunk of memory */ npage = vfio_pin_pages_remote(dma, vaddr + dma->size, - size >> PAGE_SHIFT, &pfn, limit); + size >> PAGE_SHIFT, &pfn, limit, + &batch); if (npage <= 0) { WARN_ON(!npage); ret = (int)npage; @@ -1432,6 +1470,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma, dma->size += npage << PAGE_SHIFT; } + vfio_batch_fini(&batch); dma->iommu_mapped = true; if (ret) @@ -1608,6 +1647,7 @@ static int vfio_bus_type(struct device *dev, void *data) static int vfio_iommu_replay(struct vfio_iommu *iommu, struct vfio_domain *domain) { + struct vfio_batch batch; struct vfio_domain *d = NULL; struct rb_node *n; unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -1622,6 +1662,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, d = list_first_entry(&iommu->domain_list, struct vfio_domain, next); + vfio_batch_init(&batch); + n = rb_first(&iommu->dma_list); for (; n; n = rb_next(n)) { @@ -1669,7 +1711,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, npage = vfio_pin_pages_remote(dma, vaddr, n >> PAGE_SHIFT, - &pfn, limit); + &pfn, limit, + &batch); if (npage <= 0) { WARN_ON(!npage); ret = (int)npage; @@ -1702,6 +1745,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, dma->iommu_mapped = true; } + vfio_batch_fini(&batch); return 0; unwind: @@ -1742,6 +1786,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, } } + vfio_batch_fini(&batch); return ret; } -- Gitee From b2c110e3d92edaf2b9da19a3b27c49b485de6127 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Fri, 19 Feb 2021 11:13:05 -0500 Subject: [PATCH 1594/2161] vfio/type1: Batch page pinning commit 4d83de6da265cd84e74c19d876055fa5f261cde4 upstream. Pinning one 4K page at a time is inefficient, so do it in batches of 512 instead. This is just an optimization with no functional change intended, and in particular the driver still calls iommu_map() with the largest physically contiguous range possible. Add two fields in vfio_batch to remember where to start between calls to vfio_pin_pages_remote(), and use vfio_batch_unpin() to handle remaining pages in the batch in case of error. qemu pins pages for guests around 8% faster on my test system, a two-node Broadwell server with 128G memory per node. The qemu process was bound to one node with its allocations constrained there as well. base test guest ---------------- ---------------- mem (GB) speedup avg sec (std) avg sec (std) 1 7.4% 0.61 (0.00) 0.56 (0.00) 2 8.3% 0.93 (0.00) 0.85 (0.00) 4 8.4% 1.46 (0.00) 1.34 (0.00) 8 8.6% 2.54 (0.01) 2.32 (0.00) 16 8.3% 4.66 (0.00) 4.27 (0.01) 32 8.3% 8.94 (0.01) 8.20 (0.01) 64 8.2% 17.47 (0.01) 16.04 (0.03) 120 8.5% 32.45 (0.13) 29.69 (0.01) perf diff confirms less time spent in pup. 
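The shape of the new loop is easier to see in a simplified, self-contained model (userspace C with hypothetical names; the real code additionally pins user pages, checks that each pfn is contiguous with the previous one, and does locked-memory accounting): an outer loop refills a fixed-capacity batch whenever it runs empty, and an inner loop consumes the batch entry by entry. The perf comparison below shows where the remaining time goes.

/* Simplified refill-and-consume batching model; illustration only. */
#include <stdio.h>

#define BATCH_CAPACITY 512

struct batch {
	long entries[BATCH_CAPACITY];
	int size;	/* valid entries left in the batch */
	int offset;	/* next entry to consume */
};

/* Stand-in for get_user_pages_remote(): "pins" up to req entries. */
static int fill_batch(struct batch *b, long next_pfn, long req)
{
	long n = req < BATCH_CAPACITY ? req : BATCH_CAPACITY;

	for (long i = 0; i < n; i++)
		b->entries[i] = next_pfn + i;
	b->size = n;
	b->offset = 0;
	return n;
}

int main(void)
{
	struct batch b = { .size = 0, .offset = 0 };
	long npage = 1200, pfn = 0x1000, pinned = 0;

	while (npage) {
		if (!b.size)			/* empty batch: refill it */
			fill_batch(&b, pfn + pinned, npage);

		while (b.size) {		/* consume the batch */
			pinned++;
			npage--;
			b.offset++;
			b.size--;
		}
	}
	printf("pinned %ld pages in batches of up to %d\n",
	       pinned, BATCH_CAPACITY);
	return 0;
}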
Here are the top ten functions: Baseline Delta Abs Symbol 78.63% +6.64% clear_page_erms 1.50% -1.50% __gup_longterm_locked 1.27% -0.78% __get_user_pages +0.76% kvm_zap_rmapp.constprop.0 0.54% -0.53% vmacache_find 0.55% -0.51% get_pfnblock_flags_mask 0.48% -0.48% __get_user_pages_remote +0.39% slot_rmap_walk_next +0.32% vfio_pin_map_dma +0.26% kvm_handle_hva_range ... Suggested-by: Matthew Wilcox (Oracle) Signed-off-by: Daniel Jordan Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 135 +++++++++++++++++++++----------- 1 file changed, 89 insertions(+), 46 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 1298eee75b3b..197f744157d9 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -107,6 +107,8 @@ struct vfio_batch { struct page **pages; /* for pin_user_pages_remote */ struct page *fallback_page; /* if pages alloc fails */ int capacity; /* length of pages array */ + int size; /* of batch currently */ + int offset; /* of next entry in pages */ }; struct vfio_group { @@ -505,6 +507,9 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, static void vfio_batch_init(struct vfio_batch *batch) { + batch->size = 0; + batch->offset = 0; + if (unlikely(disable_hugepages)) goto fallback; @@ -520,6 +525,17 @@ static void vfio_batch_init(struct vfio_batch *batch) batch->capacity = 1; } +static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma) +{ + while (batch->size) { + unsigned long pfn = page_to_pfn(batch->pages[batch->offset]); + + put_pfn(pfn, dma->prot); + batch->offset++; + batch->size--; + } +} + static void vfio_batch_fini(struct vfio_batch *batch) { if (batch->capacity == VFIO_BATCH_MAX_CAPACITY) @@ -635,65 +651,88 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, long npage, unsigned long *pfn_base, unsigned long limit, struct vfio_batch *batch) { - unsigned long pfn = 0; + unsigned long pfn; + struct mm_struct *mm = current->mm; long ret, pinned = 0, lock_acct = 0; bool rsvd; dma_addr_t iova = vaddr - dma->vaddr + dma->iova; /* This code path is only user initiated */ - if (!current->mm) + if (!mm) return -ENODEV; - ret = vaddr_get_pfns(current->mm, vaddr, 1, dma->prot, pfn_base, - batch->pages); - if (ret < 0) - return ret; - - pinned++; - rsvd = is_invalid_reserved_pfn(*pfn_base); - - /* - * Reserved pages aren't counted against the user, externally pinned - * pages are already counted against the user. - */ - if (!rsvd && !vfio_find_vpfn(dma, iova)) { - if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) { - put_pfn(*pfn_base, dma->prot); - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, - limit << PAGE_SHIFT); - return -ENOMEM; - } - lock_acct++; + if (batch->size) { + /* Leftover pages in batch from an earlier call. */ + *pfn_base = page_to_pfn(batch->pages[batch->offset]); + pfn = *pfn_base; + rsvd = is_invalid_reserved_pfn(*pfn_base); + } else { + *pfn_base = 0; } - if (unlikely(disable_hugepages)) - goto out; + while (npage) { + if (!batch->size) { + /* Empty batch, so refill it. 
*/ + long req_pages = min_t(long, npage, batch->capacity); - /* Lock all the consecutive pages from pfn_base */ - for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage; - pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { - ret = vaddr_get_pfns(current->mm, vaddr, 1, dma->prot, &pfn, - batch->pages); - if (ret < 0) - break; + ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot, + &pfn, batch->pages); + if (ret < 0) + goto unpin_out; - if (pfn != *pfn_base + pinned || - rsvd != is_invalid_reserved_pfn(pfn)) { - put_pfn(pfn, dma->prot); - break; + batch->size = ret; + batch->offset = 0; + + if (!*pfn_base) { + *pfn_base = pfn; + rsvd = is_invalid_reserved_pfn(*pfn_base); + } } - if (!rsvd && !vfio_find_vpfn(dma, iova)) { - if (!dma->lock_cap && - current->mm->locked_vm + lock_acct + 1 > limit) { - put_pfn(pfn, dma->prot); - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", - __func__, limit << PAGE_SHIFT); - ret = -ENOMEM; - goto unpin_out; + /* + * pfn is preset for the first iteration of this inner loop and + * updated at the end to handle a VM_PFNMAP pfn. In that case, + * batch->pages isn't valid (there's no struct page), so allow + * batch->pages to be touched only when there's more than one + * pfn to check, which guarantees the pfns are from a + * !VM_PFNMAP vma. + */ + while (true) { + if (pfn != *pfn_base + pinned || + rsvd != is_invalid_reserved_pfn(pfn)) + goto out; + + /* + * Reserved pages aren't counted against the user, + * externally pinned pages are already counted against + * the user. + */ + if (!rsvd && !vfio_find_vpfn(dma, iova)) { + if (!dma->lock_cap && + mm->locked_vm + lock_acct + 1 > limit) { + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", + __func__, limit << PAGE_SHIFT); + ret = -ENOMEM; + goto unpin_out; + } + lock_acct++; } - lock_acct++; + + pinned++; + npage--; + vaddr += PAGE_SIZE; + iova += PAGE_SIZE; + batch->offset++; + batch->size--; + + if (!batch->size) + break; + + pfn = page_to_pfn(batch->pages[batch->offset]); } + + if (unlikely(disable_hugepages)) + break; } out: @@ -701,10 +740,11 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, unpin_out: if (ret < 0) { - if (!rsvd) { + if (pinned && !rsvd) { for (pfn = *pfn_base ; pinned ; pfn++, pinned--) put_pfn(pfn, dma->prot); } + vfio_batch_unpin(batch, dma); return ret; } @@ -1463,6 +1503,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma, if (ret) { vfio_unpin_pages_remote(dma, iova + dma->size, pfn, npage, true); + vfio_batch_unpin(&batch, dma); break; } @@ -1726,11 +1767,13 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, ret = iommu_map(domain->domain, iova, phys, size, dma->prot | domain->prot); if (ret) { - if (!dma->iommu_mapped) + if (!dma->iommu_mapped) { vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT, size >> PAGE_SHIFT, true); + vfio_batch_unpin(&batch, dma); + } goto unwind; } -- Gitee From 365f50c27facdf19f8180a6d3c233cd63b1e7aa8 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Thu, 25 Feb 2021 11:25:02 -0800 Subject: [PATCH 1595/2161] vfio/type1: fix unmap all on ILP32 commit 7dc4b2fdb27242faf40fc20ef83372b7033af050 upstream. Some ILP32 architectures support mapping a 32-bit vaddr within a 64-bit iova space. The unmap-all code uses 32-bit SIZE_MAX as an upper bound on the extent of the mappings within iova space, so mappings above 4G cannot be found and unmapped. Use U64_MAX instead, and use u64 for size variables. This also fixes a static analysis bug found by the kernel test robot running smatch for ILP32. 
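The width problem is easy to reproduce with explicit integer types (a standalone demo, not kernel code): on ILP32, SIZE_MAX is only 32 bits wide, so a range walk bounded by it can never reach a mapping above 4G, while a 64-bit bound covers the whole iova space.

/* Portable model of the ILP32 issue with explicit integer widths. */
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

int main(void)
{
	uint64_t iova = 6ULL << 30;	/* a mapping above 4G */
	uint32_t bound32 = UINT32_MAX;	/* plays the role of SIZE_MAX on ILP32 */
	uint64_t bound64 = UINT64_MAX;	/* plays the role of U64_MAX */

	/* The unmap-all walk visits mappings with iova inside [0, bound). */
	printf("32-bit bound sees iova 0x%" PRIx64 ": %s\n", iova,
	       iova < (uint64_t)bound32 ? "yes" : "no");
	printf("64-bit bound sees iova 0x%" PRIx64 ": %s\n", iova,
	       iova < bound64 ? "yes" : "no");
	return 0;
}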
Fixes: 0f53afa12bae ("vfio/type1: unmap cleanup") Fixes: c19650995374 ("vfio/type1: implement unmap all") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Steve Sistare Message-Id: <1614281102-230747-1-git-send-email-steven.sistare@oracle.com> Link: https://lore.kernel.org/linux-mm/20210222141043.GW2222@kadam Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 197f744157d9..8cfa3706a30d 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -189,7 +189,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, } static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu, - dma_addr_t start, size_t size) + dma_addr_t start, u64 size) { struct rb_node *res = NULL; struct rb_node *node = iommu->dma_list.rb_node; @@ -1288,7 +1288,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, int ret = -EINVAL, retries = 0; unsigned long pgshift; dma_addr_t iova = unmap->iova; - unsigned long size = unmap->size; + u64 size = unmap->size; bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL; bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR; struct rb_node *n, *first_n; @@ -1304,14 +1304,12 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, if (unmap_all) { if (iova || size) goto unlock; - size = SIZE_MAX; - } else if (!size || size & (pgsize - 1)) { + size = U64_MAX; + } else if (!size || size & (pgsize - 1) || + iova + size - 1 < iova || size > SIZE_MAX) { goto unlock; } - if (iova + size - 1 < iova || size > SIZE_MAX) - goto unlock; - /* When dirty tracking is enabled, allow only min supported pgsize */ if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) && (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) { -- Gitee From 7bd4091a92dd9ed03a482d8359bc7c54871ff5fc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 23 Feb 2021 15:17:47 -0400 Subject: [PATCH 1596/2161] vfio-platform: Add COMPILE_TEST to VFIO_PLATFORM commit d3d72a6dfffd3fcaac969786118162b596227f70 upstream. x86 can build platform bus code too, so vfio-platform and all the platform reset implementations compile successfully on x86. Signed-off-by: Jason Gunthorpe Message-Id: <2-v1-df057e0f92c3+91-vfio_arm_compile_test_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/platform/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig index dc1a3c44f2c6..233efde219cc 100644 --- a/drivers/vfio/platform/Kconfig +++ b/drivers/vfio/platform/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config VFIO_PLATFORM tristate "VFIO support for platform devices" - depends on VFIO && EVENTFD && (ARM || ARM64) + depends on VFIO && EVENTFD && (ARM || ARM64 || COMPILE_TEST) select VFIO_VIRQFD help Support for platform devices with VFIO. This is required to make -- Gitee From 5e48d3adca94bd5638177c7261d17d279c36cba9 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Mon, 8 Mar 2021 12:24:52 -0500 Subject: [PATCH 1597/2161] vfio/type1: fix vaddr_get_pfns() return in vfio_pin_page_external() commit 4ab4fcfce5b540227d80eb32f1db45ab615f7c92 upstream. 
vaddr_get_pfns() now returns the positive number of pfns successfully gotten instead of zero. vfio_pin_page_external() might return 1 to vfio_iommu_type1_pin_pages(), which will treat it as an error, if vaddr_get_pfns() is successful but vfio_pin_page_external() doesn't reach vfio_lock_acct(). Fix it up in vfio_pin_page_external(). Found by inspection. Fixes: be16c1fd99f4 ("vfio/type1: Change success value of vaddr_get_pfn()") Signed-off-by: Daniel Jordan Message-Id: <20210308172452.38864-1-daniel.m.jordan@oracle.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 8cfa3706a30d..548fa2070f3e 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -785,7 +785,12 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, return -ENODEV; ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages); - if (ret == 1 && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) { + if (ret != 1) + goto out; + + ret = 0; + + if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) { ret = vfio_lock_acct(dma, 1, true); if (ret) { put_pfn(*pfn_base, dma->prot); @@ -797,6 +802,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, } } +out: mmput(mm); return ret; } -- Gitee From 18b177f3351e09b1cf2b38969e461aefe82fea9c Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 24 Mar 2021 21:05:52 -0400 Subject: [PATCH 1598/2161] vfio/type1: Empty batch for pfnmap pages commit 60c988bc154108bd522a21289e389143006a1ff0 upstream. When vfio_pin_pages_remote() returns with a partial batch consisting of a single VM_PFNMAP pfn, a subsequent call will unfortunately try restoring it from batch->pages, resulting in vfio mapping the wrong page and unbalancing the page refcount. Prevent the function from returning with this kind of partial batch to avoid the issue. There's no explicit check for a VM_PFNMAP pfn because it's awkward to do so, so infer it from characteristics of the batch instead. This may result in occasional false positives but keeps the code simpler. Fixes: 4d83de6da265 ("vfio/type1: Batch page pinning") Link: https://lkml.kernel.org/r/20210323133254.33ed9161@omen.home.shazbot.org/ Reported-by: Alex Williamson Suggested-by: Alex Williamson Signed-off-by: Daniel Jordan Message-Id: <20210325010552.185481-1-daniel.m.jordan@oracle.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 548fa2070f3e..94c89b1c3d7f 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -739,6 +739,12 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, ret = vfio_lock_acct(dma, lock_acct, false); unpin_out: + if (batch->size == 1 && !batch->offset) { + /* May be a VM_PFNMAP pfn, which the batch can't remember. 
*/ + put_pfn(pfn, dma->prot); + batch->size = 0; + } + if (ret < 0) { if (pinned && !rsvd) { for (pfn = *pfn_base ; pinned ; pfn++, pinned--) -- Gitee From 7c8377c1d1373ec07bec50f8404c61bb85336bbe Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Tue, 30 Mar 2021 09:54:21 -0600 Subject: [PATCH 1599/2161] vfio: pci: Spello fix in the file vfio_pci.c commit fbc9d37161b7d7e0e5f838445bbeff721dc4fc8d upstream. s/permision/permission/ Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Message-Id: <20210314052925.3560-1-unixbhaskar@gmail.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 79d7af95c071..5808ecd6ee08 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -2413,7 +2413,7 @@ static int __init vfio_pci_init(void) { int ret; - /* Allocate shared config space permision data used by all devices */ + /* Allocate shared config space permission data used by all devices */ ret = vfio_pci_init_perm_bits(); if (ret) return ret; -- Gitee From d44273526c2e16f97e670a39af6b40e1bf9669e9 Mon Sep 17 00:00:00 2001 From: Zhou Wang Date: Tue, 30 Mar 2021 09:54:21 -0600 Subject: [PATCH 1600/2161] vfio/pci: Remove an unnecessary blank line in vfio_pci_enable commit 36f0be5a30bbe6f1783db459449f2d88c5cd1e34 upstream. This blank line is unnecessary, so remove it. Signed-off-by: Zhou Wang Message-Id: <1615808073-178604-1-git-send-email-wangzhou1@hisilicon.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 5808ecd6ee08..dfd3570c0c81 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -378,7 +378,6 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) vdev->has_vga = true; - if (vfio_pci_is_vga(pdev) && pdev->vendor == PCI_VENDOR_ID_INTEL && IS_ENABLED(CONFIG_VFIO_PCI_IGD)) { -- Gitee From 38ed3c212610df89728b59a5a81c1729523029c8 Mon Sep 17 00:00:00 2001 From: Fred Gao Date: Tue, 30 Mar 2021 09:54:21 -0600 Subject: [PATCH 1601/2161] vfio/pci: Add support for opregion v2.1+ commit bab2c1990b78b90d7e1cffbb05ccf1009a55f0d3 upstream. Before opregion version 2.0 VBT data is stored in opregion mailbox #4, but when VBT data exceeds 6KB size and cannot be within mailbox #4 then from opregion v2.0+, Extended VBT region, next to opregion is used to hold the VBT data, so the total size will be opregion size plus extended VBT region size. Since opregion v2.0 with physical host VBT address would not be practically available for end user and guest can not directly access host physical address, so it is not supported. 
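The sizing rules above can be summarised in a small, hypothetical helper (illustration only, not the driver function): versions below 2.0, or images without RVDA/RVDS, map only the base 8KB OpRegion; version 2.0 with a physical RVDA is rejected; version 2.1+ requires the VBT to sit immediately after the OpRegion and extends the mapping by RVDS bytes.

/* Hypothetical model of the OpRegion sizing rules described above:
 * returns the number of bytes to map, or a negative errno. */
#include <errno.h>
#include <stdio.h>

#define OPREGION_SIZE (8 * 1024)

static long opregion_map_size(unsigned int version, unsigned long long rvda,
			      unsigned int rvds)
{
	long size = OPREGION_SIZE;

	if (version < 0x0200 || !rvda || !rvds)
		return size;		/* no extended VBT region */

	if (version == 0x0200)
		return -EINVAL;		/* RVDA is a host physical address */

	if (rvda != (unsigned long long)size)
		return -EINVAL;		/* VBT must follow the OpRegion */

	return size + rvds;		/* OpRegion plus extended VBT */
}

int main(void)
{
	/* v2.1 with a 10KB extended VBT directly after the OpRegion. */
	printf("map %ld bytes\n",
	       opregion_map_size(0x0201, OPREGION_SIZE, 10 * 1024));
	return 0;
}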
Cc: Zhenyu Wang Signed-off-by: Swee Yee Fonn Signed-off-by: Fred Gao Message-Id: <20210325170953.24549-1-fred.gao@intel.com> Reviewed-by: Zhenyu Wang Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_igd.c | 53 +++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c index e66dfb0178ed..228df565e9bc 100644 --- a/drivers/vfio/pci/vfio_pci_igd.c +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -21,6 +21,10 @@ #define OPREGION_SIZE (8 * 1024) #define OPREGION_PCI_ADDR 0xfc +#define OPREGION_RVDA 0x3ba +#define OPREGION_RVDS 0x3c2 +#define OPREGION_VERSION 0x16 + static size_t vfio_pci_igd_rw(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { @@ -58,6 +62,7 @@ static int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev) u32 addr, size; void *base; int ret; + u16 version; ret = pci_read_config_dword(vdev->pdev, OPREGION_PCI_ADDR, &addr); if (ret) @@ -83,6 +88,54 @@ static int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev) size *= 1024; /* In KB */ + /* + * Support opregion v2.1+ + * When VBT data exceeds 6KB size and cannot be within mailbox #4, then + * the Extended VBT region next to opregion is used to hold the VBT data. + * RVDA (Relative Address of VBT Data from Opregion Base) and RVDS + * (Raw VBT Data Size) from opregion structure member are used to hold the + * address from region base and size of VBT data. RVDA/RVDS are not + * defined before opregion 2.0. + * + * opregion 2.1+: RVDA is unsigned, relative offset from + * opregion base, and should point to the end of opregion. + * otherwise, exposing to userspace to allow read access to everything between + * the OpRegion and VBT is not safe. + * RVDS is defined as size in bytes. + * + * opregion 2.0: rvda is the physical VBT address. + * Since rvda is HPA it cannot be directly used in guest. + * And it should not be practically available for end user,so it is not supported. + */ + version = le16_to_cpu(*(__le16 *)(base + OPREGION_VERSION)); + if (version >= 0x0200) { + u64 rvda; + u32 rvds; + + rvda = le64_to_cpu(*(__le64 *)(base + OPREGION_RVDA)); + rvds = le32_to_cpu(*(__le32 *)(base + OPREGION_RVDS)); + if (rvda && rvds) { + /* no support for opregion v2.0 with physical VBT address */ + if (version == 0x0200) { + memunmap(base); + pci_err(vdev->pdev, + "IGD assignment does not support opregion v2.0 with an extended VBT region\n"); + return -EINVAL; + } + + if (rvda != size) { + memunmap(base); + pci_err(vdev->pdev, + "Extended VBT does not follow opregion on version 0x%04x\n", + version); + return -EINVAL; + } + + /* region size for opregion v2.0+: opregion and VBT size. */ + size += rvds; + } + } + if (size != OPREGION_SIZE) { memunmap(base); base = memremap(addr, size, MEMREMAP_WB); -- Gitee From a7fb355ac9fabc17c10e31f2e488a0a718b6541d Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Tue, 30 Mar 2021 09:54:21 -0600 Subject: [PATCH 1602/2161] vfio/type1: fix a couple of spelling mistakes commit 06d738c8ab56f3de68f785154b87dae1ec94c823 upstream. 
There are several spelling mistakes, as follows: userpsace ==> userspace Accouting ==> Accounting exlude ==> exclude Signed-off-by: Zhen Lei Reviewed-by: Eric Auger Message-Id: <20210326083528.1329-2-thunder.leizhen@huawei.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 94c89b1c3d7f..048fb7b5a442 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -16,7 +16,7 @@ * IOMMU to support the IOMMU API and have few to no restrictions around * the IOVA range that can be mapped. The Type1 IOMMU is currently * optimized for relatively static mappings of a userspace process with - * userpsace pages pinned into memory. We also assume devices and IOMMU + * userspace pages pinned into memory. We also assume devices and IOMMU * domains are PCI based as the IOMMU API is still centered around a * device/bus interface rather than a group interface. */ @@ -877,7 +877,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, /* * If iommu capable domain exist in the container then all pages are - * already pinned and accounted. Accouting should be done if there is no + * already pinned and accounted. Accounting should be done if there is no * iommu capable domain in the container. */ do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu); @@ -2177,7 +2177,7 @@ static int vfio_iommu_resv_exclude(struct list_head *iova, continue; /* * Insert a new node if current node overlaps with the - * reserve region to exlude that from valid iova range. + * reserve region to exclude that from valid iova range. * Note that, new node is inserted before the current * node and finally the current node is deleted keeping * the list updated and sorted. -- Gitee From d26f2c17300720ff19dc6d67ece39b45896430ae Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Tue, 30 Mar 2021 09:54:21 -0600 Subject: [PATCH 1603/2161] vfio/mdev: Fix spelling mistake "interal" -> "internal" commit d0a7541dd9998c3b911943fee3bb726d9c2d89c3 upstream. There is a spelling mistake in a comment, fix it. Signed-off-by: Zhen Lei Reviewed-by: Eric Auger Message-Id: <20210326083528.1329-3-thunder.leizhen@huawei.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/mdev_private.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index 7d922950caaf..4d62b76c4734 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Mediated device interal definitions + * Mediated device internal definitions * * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. * Author: Neo Jia -- Gitee From 30b2b521ab7588928d4a404da91e0da5f60b7d9a Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Tue, 30 Mar 2021 09:54:21 -0600 Subject: [PATCH 1604/2161] vfio/pci: fix a couple of spelling mistakes commit d0915b32917c3afb390c0eca1a77fd656527d5dc upstream. 
There are several spelling mistakes, as follows: thru ==> through presense ==> presence Signed-off-by: Zhen Lei Reviewed-by: Eric Auger Message-Id: <20210326083528.1329-4-thunder.leizhen@huawei.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_config.c | 2 +- drivers/vfio/pci/vfio_pci_nvlink2.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index a402adee8a21..d57f037f65b8 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -101,7 +101,7 @@ static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = { /* * Read/Write Permission Bits - one bit for each bit in capability * Any field can be read if it exists, but what is read depends on - * whether the field is 'virtualized', or just pass thru to the + * whether the field is 'virtualized', or just pass through to the * hardware. Any virtualized field is also virtualized for writes. * Writes are only permitted if they have a 1 bit here. */ diff --git a/drivers/vfio/pci/vfio_pci_nvlink2.c b/drivers/vfio/pci/vfio_pci_nvlink2.c index cf2e5b7512c2..cd275a8e5155 100644 --- a/drivers/vfio/pci/vfio_pci_nvlink2.c +++ b/drivers/vfio/pci/vfio_pci_nvlink2.c @@ -219,7 +219,7 @@ int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev) unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM; /* - * PCI config space does not tell us about NVLink presense but + * PCI config space does not tell us about NVLink presence but * platform does, use this. */ npu_dev = pnv_pci_get_npu_dev(vdev->pdev, 0); @@ -402,7 +402,7 @@ int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev) u32 link_speed = 0xff; /* - * PCI config space does not tell us about NVLink presense but + * PCI config space does not tell us about NVLink presence but * platform does, use this. */ if (!pnv_pci_get_gpu_dev(vdev->pdev)) -- Gitee From cab1ac483d519aedba62d8c6ca0246d48643c4a3 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Tue, 30 Mar 2021 09:54:22 -0600 Subject: [PATCH 1605/2161] vfio/platform: Fix spelling mistake "registe" -> "register" commit f5c858ec2b1d2a2656d78a5efe37cfcf568fce31 upstream. There is a spelling mistake in a comment, fix it. Signed-off-by: Zhen Lei Acked-by: Eric Auger Message-Id: <20210326083528.1329-5-thunder.leizhen@huawei.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c b/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c index 09a9453b75c5..63cc7f0b2e4a 100644 --- a/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c +++ b/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c @@ -26,7 +26,7 @@ #define XGMAC_DMA_CONTROL 0x00000f18 /* Ctrl (Operational Mode) */ #define XGMAC_DMA_INTR_ENA 0x00000f1c /* Interrupt Enable */ -/* DMA Control registe defines */ +/* DMA Control register defines */ #define DMA_CONTROL_ST 0x00002000 /* Start/Stop Transmission */ #define DMA_CONTROL_SR 0x00000002 /* Start/Stop Receive */ -- Gitee From 4cd53435bffa9b56ae7c976ca553d608e0620618 Mon Sep 17 00:00:00 2001 From: Shenming Lu Date: Tue, 6 Apr 2021 21:50:09 +0800 Subject: [PATCH 1606/2161] vfio/type1: Remove the almost unused check in vfio_iommu_type1_unpin_pages commit a536019d3e7d85a901c5e6a2f2894c0aa0acaefa upstream. 
The check i > npage at the end of vfio_iommu_type1_unpin_pages is unused unless npage < 0, but if npage < 0, this function will return npage, which should return -EINVAL instead. So let's just check the parameter npage at the start of the function. By the way, replace unpin_exit with break. Signed-off-by: Shenming Lu Message-Id: <20210406135009.1707-1-lushenming@huawei.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 048fb7b5a442..e0243886c418 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -960,7 +960,7 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data, bool do_accounting; int i; - if (!iommu || !user_pfn) + if (!iommu || !user_pfn || npage <= 0) return -EINVAL; /* Supported for v2 version only */ @@ -977,13 +977,13 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data, iova = user_pfn[i] << PAGE_SHIFT; dma = vfio_find_dma(iommu, iova, PAGE_SIZE); if (!dma) - goto unpin_exit; + break; + vfio_unpin_page_external(dma, iova, do_accounting); } -unpin_exit: mutex_unlock(&iommu->lock); - return i > npage ? npage : (i > 0 ? i : -EINVAL); + return i > 0 ? i : -EINVAL; } static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain, -- Gitee From 05cce75d9f748d45262cc7f6216429035ac9f573 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Mar 2021 09:53:05 -0600 Subject: [PATCH 1607/2161] vfio: Remove extra put/gets around vfio_device->group commit e572bfb2b6a83b05acd30c03010e661b1967960f upstream. The vfio_device->group value has a get obtained during vfio_add_group_dev() which gets moved from the stack to vfio_device->group in vfio_group_create_device(). The reference remains until we reach the end of vfio_del_group_dev() when it is put back. Thus anything that already has a kref on the vfio_device is guaranteed a valid group pointer. Remove all the extra reference traffic. It is tricky to see, but the get at the start of vfio_del_group_dev() is actually pairing with the put hidden inside vfio_device_put() a few lines below. A later patch merges vfio_group_create_device() into vfio_add_group_dev() which makes the ownership and error flow on the create side easier to follow. 
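The ownership rule the cleanup relies on, reduced to a toy model (hypothetical names, plain counters instead of krefs): the reference taken while creating the device is stored in the device rather than re-taken on every get, so it is dropped exactly once at teardown and per-device get/put no longer has to touch the group count.

/* Toy model of "reference moved into the object"; illustration only. */
#include <assert.h>
#include <stdio.h>

struct group { int refs; };
struct device { struct group *group; };

static void group_get(struct group *g) { g->refs++; }
static void group_put(struct group *g) { g->refs--; }

static void device_create(struct device *d, struct group *g)
{
	group_get(g);		/* caller's reference... */
	d->group = g;		/* ...now owned by the device */
}

static void device_destroy(struct device *d)
{
	group_put(d->group);	/* single put pairs with device_create() */
	d->group = NULL;
}

int main(void)
{
	struct group g = { .refs = 1 };
	struct device d;

	device_create(&d, &g);
	assert(g.refs == 2);	/* one for the owner, one held by the device */
	device_destroy(&d);
	assert(g.refs == 1);
	printf("group refs back to %d\n", g.refs);
	return 0;
}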
Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Reviewed-by: Max Gurtovoy Reviewed-by: Cornelia Huck Reviewed-by: Eric Auger Signed-off-by: Jason Gunthorpe Message-Id: <1-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 38779e6fd80c..15d8e678e556 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -546,14 +546,12 @@ struct vfio_device *vfio_group_create_device(struct vfio_group *group, kref_init(&device->kref); device->dev = dev; + /* Our reference on group is moved to the device */ device->group = group; device->ops = ops; device->device_data = device_data; dev_set_drvdata(dev, device); - /* No need to get group_lock, caller has group reference */ - vfio_group_get(group); - mutex_lock(&group->device_lock); list_add(&device->group_next, &group->device_list); group->dev_counter++; @@ -585,13 +583,11 @@ void vfio_device_put(struct vfio_device *device) { struct vfio_group *group = device->group; kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock); - vfio_group_put(group); } EXPORT_SYMBOL_GPL(vfio_device_put); static void vfio_device_get(struct vfio_device *device) { - vfio_group_get(device->group); kref_get(&device->kref); } @@ -841,14 +837,6 @@ int vfio_add_group_dev(struct device *dev, vfio_group_put(group); return PTR_ERR(device); } - - /* - * Drop all but the vfio_device reference. The vfio_device holds - * a reference to the vfio_group, which holds a reference to the - * iommu_group. - */ - vfio_group_put(group); - return 0; } EXPORT_SYMBOL_GPL(vfio_add_group_dev); @@ -928,12 +916,6 @@ void *vfio_del_group_dev(struct device *dev) unsigned int i = 0; bool interrupted = false; - /* - * The group exists so long as we have a device reference. Get - * a group reference and use it to scan for the device going away. - */ - vfio_group_get(group); - /* * When the device is removed from the group, the group suddenly * becomes non-viable; the device has a driver (until the unbind @@ -1008,6 +990,7 @@ void *vfio_del_group_dev(struct device *dev) if (list_empty(&group->device_list)) wait_event(group->container_q, !group->container); + /* Matches the get in vfio_group_create_device() */ vfio_group_put(group); return device_data; -- Gitee From 79c0918a6f67902f686a6e30cd86370c5ae86c2b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Mar 2021 09:53:05 -0600 Subject: [PATCH 1608/2161] vfio: Simplify the lifetime logic for vfio_device commit 5e42c999445bd0ae86e35affeb3e7c473d74a893 upstream. The vfio_device is using a 'sleep until all refs go to zero' pattern for its lifetime, but it is indirectly coded by repeatedly scanning the group list waiting for the device to be removed on its own. Switch this around to be a direct representation, use a refcount to count the number of places that are blocking destruction and sleep directly on a completion until that counter goes to zero. kfree the device after other accesses have been excluded in vfio_del_group_dev(). This is a fairly common Linux idiom. Due to this we can now remove kref_put_mutex(), which is very rarely used in the kernel. Here it is being used to prevent a zero ref device from being seen in the group list. Instead allow the zero ref device to continue to exist in the device_list and use refcount_inc_not_zero() to exclude it once refs go to zero. 
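The idiom in isolation, sketched with the generic kernel primitives (illustration only, not the vfio code): the final refcount_dec_and_test() completes a completion that the remover sleeps on, and lookups use refcount_inc_not_zero() so an object whose count has already hit zero is skipped even though it is still visible on a list.

/* Sketch of the "block until refs drop to zero" lifetime idiom. */
#include <linux/refcount.h>
#include <linux/completion.h>
#include <linux/slab.h>

struct obj {
	refcount_t refcount;
	struct completion comp;
};

static struct obj *obj_create(void)
{
	struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

	if (!o)
		return NULL;
	refcount_set(&o->refcount, 1);	/* creator's reference */
	init_completion(&o->comp);
	return o;
}

/* Lookup path: a zero-ref object still on the list is skipped. */
static bool obj_try_get(struct obj *o)
{
	return refcount_inc_not_zero(&o->refcount);
}

static void obj_put(struct obj *o)
{
	if (refcount_dec_and_test(&o->refcount))
		complete(&o->comp);	/* wake the remover */
}

/* Removal path: drop the creator's reference, sleep until every other
 * user has called obj_put(), and only then free. */
static void obj_destroy(struct obj *o)
{
	obj_put(o);
	wait_for_completion(&o->comp);
	kfree(o);
}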
This patch is organized so the next patch will be able to alter the API to allow drivers to provide the kfree. Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Reviewed-by: Eric Auger Signed-off-by: Jason Gunthorpe Message-Id: <2-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio.c | 79 ++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 54 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 15d8e678e556..32660e8a69ae 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -46,7 +46,6 @@ static struct vfio { struct mutex group_lock; struct cdev group_cdev; dev_t group_devt; - wait_queue_head_t release_q; } vfio; struct vfio_iommu_driver { @@ -91,7 +90,8 @@ struct vfio_group { }; struct vfio_device { - struct kref kref; + refcount_t refcount; + struct completion comp; struct device *dev; const struct vfio_device_ops *ops; struct vfio_group *group; @@ -544,7 +544,8 @@ struct vfio_device *vfio_group_create_device(struct vfio_group *group, if (!device) return ERR_PTR(-ENOMEM); - kref_init(&device->kref); + refcount_set(&device->refcount, 1); + init_completion(&device->comp); device->dev = dev; /* Our reference on group is moved to the device */ device->group = group; @@ -560,35 +561,17 @@ struct vfio_device *vfio_group_create_device(struct vfio_group *group, return device; } -static void vfio_device_release(struct kref *kref) -{ - struct vfio_device *device = container_of(kref, - struct vfio_device, kref); - struct vfio_group *group = device->group; - - list_del(&device->group_next); - group->dev_counter--; - mutex_unlock(&group->device_lock); - - dev_set_drvdata(device->dev, NULL); - - kfree(device); - - /* vfio_del_group_dev may be waiting for this device */ - wake_up(&vfio.release_q); -} - /* Device reference always implies a group reference */ void vfio_device_put(struct vfio_device *device) { - struct vfio_group *group = device->group; - kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock); + if (refcount_dec_and_test(&device->refcount)) + complete(&device->comp); } EXPORT_SYMBOL_GPL(vfio_device_put); -static void vfio_device_get(struct vfio_device *device) +static bool vfio_device_try_get(struct vfio_device *device) { - kref_get(&device->kref); + return refcount_inc_not_zero(&device->refcount); } static struct vfio_device *vfio_group_get_device(struct vfio_group *group, @@ -598,8 +581,7 @@ static struct vfio_device *vfio_group_get_device(struct vfio_group *group, mutex_lock(&group->device_lock); list_for_each_entry(device, &group->device_list, group_next) { - if (device->dev == dev) { - vfio_device_get(device); + if (device->dev == dev && vfio_device_try_get(device)) { mutex_unlock(&group->device_lock); return device; } @@ -883,9 +865,8 @@ static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, ret = !strcmp(dev_name(it->dev), buf); } - if (ret) { + if (ret && vfio_device_try_get(it)) { device = it; - vfio_device_get(device); break; } } @@ -908,13 +889,13 @@ EXPORT_SYMBOL_GPL(vfio_device_data); * removed. Open file descriptors for the device... 
*/ void *vfio_del_group_dev(struct device *dev) { - DEFINE_WAIT_FUNC(wait, woken_wake_function); struct vfio_device *device = dev_get_drvdata(dev); struct vfio_group *group = device->group; void *device_data = device->device_data; struct vfio_unbound_dev *unbound; unsigned int i = 0; bool interrupted = false; + long rc; /* * When the device is removed from the group, the group suddenly @@ -935,32 +916,18 @@ void *vfio_del_group_dev(struct device *dev) WARN_ON(!unbound); vfio_device_put(device); - - /* - * If the device is still present in the group after the above - * 'put', then it is in use and we need to request it from the - * bus driver. The driver may in turn need to request the - * device from the user. We send the request on an arbitrary - * interval with counter to allow the driver to take escalating - * measures to release the device if it has the ability to do so. - */ - add_wait_queue(&vfio.release_q, &wait); - - do { - device = vfio_group_get_device(group, dev); - if (!device) - break; - + rc = try_wait_for_completion(&device->comp); + while (rc <= 0) { if (device->ops->request) device->ops->request(device_data, i++); - vfio_device_put(device); - if (interrupted) { - wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10); + rc = wait_for_completion_timeout(&device->comp, + HZ * 10); } else { - wait_woken(&wait, TASK_INTERRUPTIBLE, HZ * 10); - if (signal_pending(current)) { + rc = wait_for_completion_interruptible_timeout( + &device->comp, HZ * 10); + if (rc < 0) { interrupted = true; dev_warn(dev, "Device is currently in use, task" @@ -969,10 +936,13 @@ void *vfio_del_group_dev(struct device *dev) current->comm, task_pid_nr(current)); } } + } - } while (1); + mutex_lock(&group->device_lock); + list_del(&device->group_next); + group->dev_counter--; + mutex_unlock(&group->device_lock); - remove_wait_queue(&vfio.release_q, &wait); /* * In order to support multiple devices per group, devices can be * plucked from the group while other devices in the group are still @@ -992,6 +962,8 @@ void *vfio_del_group_dev(struct device *dev) /* Matches the get in vfio_group_create_device() */ vfio_group_put(group); + dev_set_drvdata(dev, NULL); + kfree(device); return device_data; } @@ -2362,7 +2334,6 @@ static int __init vfio_init(void) mutex_init(&vfio.iommu_drivers_lock); INIT_LIST_HEAD(&vfio.group_list); INIT_LIST_HEAD(&vfio.iommu_drivers_list); - init_waitqueue_head(&vfio.release_q); ret = misc_register(&vfio_dev); if (ret) { -- Gitee From ad154063acfb8f8ad3a7b1b36816dadcfd893715 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Mar 2021 09:53:05 -0600 Subject: [PATCH 1609/2161] vfio: Split creation of a vfio_device into init and register ops commit 0bfc6a4ea63c2adac71a824397ef48f28dbc5e47 upstream. This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. 
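What the embedding buys, sketched with a hypothetical driver structure: because struct vfio_device is a member of the driver's own object rather than a separately allocated handle, code holding the vfio_device can recover the full driver type with container_of() instead of casting an opaque void *.

/* Sketch of the container_of pattern the embedding enables
 * (hypothetical my_driver_device; illustration only). */
#include <linux/kernel.h>
#include <linux/vfio.h>

struct my_driver_device {
	struct vfio_device vdev;	/* embedded, same lifetime */
	int private_state;
};

static struct my_driver_device *to_my_device(struct vfio_device *vdev)
{
	return container_of(vdev, struct my_driver_device, vdev);
}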
The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig Reviewed-by: Liu Yi L Reviewed-by: Cornelia Huck Reviewed-by: Max Gurtovoy Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Signed-off-by: Jason Gunthorpe Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/driver-api/vfio.rst | 31 ++++---- drivers/vfio/vfio.c | 125 ++++++++++++++++-------------- include/linux/vfio.h | 16 ++++ 3 files changed, 99 insertions(+), 73 deletions(-) diff --git a/Documentation/driver-api/vfio.rst b/Documentation/driver-api/vfio.rst index f1a4d3c3ba0b..d3a02300913a 100644 --- a/Documentation/driver-api/vfio.rst +++ b/Documentation/driver-api/vfio.rst @@ -249,18 +249,23 @@ VFIO bus driver API VFIO bus drivers, such as vfio-pci make use of only a few interfaces into VFIO core. When devices are bound and unbound to the driver, -the driver should call vfio_add_group_dev() and vfio_del_group_dev() -respectively:: - - extern int vfio_add_group_dev(struct device *dev, - const struct vfio_device_ops *ops, - void *device_data); - - extern void *vfio_del_group_dev(struct device *dev); - -vfio_add_group_dev() indicates to the core to begin tracking the -iommu_group of the specified dev and register the dev as owned by -a VFIO bus driver. The driver provides an ops structure for callbacks +the driver should call vfio_register_group_dev() and +vfio_unregister_group_dev() respectively:: + + void vfio_init_group_dev(struct vfio_device *device, + struct device *dev, + const struct vfio_device_ops *ops, + void *device_data); + int vfio_register_group_dev(struct vfio_device *device); + void vfio_unregister_group_dev(struct vfio_device *device); + +The driver should embed the vfio_device in its own structure and call +vfio_init_group_dev() to pre-configure it before going to registration. +vfio_register_group_dev() indicates to the core to begin tracking the +iommu_group of the specified dev and register the dev as owned by a VFIO bus +driver. Once vfio_register_group_dev() returns it is possible for userspace to +start accessing the driver, thus the driver should ensure it is completely +ready before calling it. The driver provides an ops structure for callbacks similar to a file operations structure:: struct vfio_device_ops { @@ -276,7 +281,7 @@ similar to a file operations structure:: }; Each function is passed the device_data that was originally registered -in the vfio_add_group_dev() call above. This allows the bus driver +in the vfio_register_group_dev() call above. This allows the bus driver an easy place to store its opaque, private data. 
The open/release callbacks are issued when a new file descriptor is created for a device (via VFIO_GROUP_GET_DEVICE_FD). The ioctl interface provides diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 32660e8a69ae..2ea430de505b 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -89,16 +89,6 @@ struct vfio_group { struct blocking_notifier_head notifier; }; -struct vfio_device { - refcount_t refcount; - struct completion comp; - struct device *dev; - const struct vfio_device_ops *ops; - struct vfio_group *group; - struct list_head group_next; - void *device_data; -}; - #ifdef CONFIG_VFIO_NOIOMMU static bool noiommu __read_mostly; module_param_named(enable_unsafe_noiommu_mode, @@ -532,35 +522,6 @@ static struct vfio_group *vfio_group_get_from_dev(struct device *dev) /** * Device objects - create, release, get, put, search */ -static -struct vfio_device *vfio_group_create_device(struct vfio_group *group, - struct device *dev, - const struct vfio_device_ops *ops, - void *device_data) -{ - struct vfio_device *device; - - device = kzalloc(sizeof(*device), GFP_KERNEL); - if (!device) - return ERR_PTR(-ENOMEM); - - refcount_set(&device->refcount, 1); - init_completion(&device->comp); - device->dev = dev; - /* Our reference on group is moved to the device */ - device->group = group; - device->ops = ops; - device->device_data = device_data; - dev_set_drvdata(dev, device); - - mutex_lock(&group->device_lock); - list_add(&device->group_next, &group->device_list); - group->dev_counter++; - mutex_unlock(&group->device_lock); - - return device; -} - /* Device reference always implies a group reference */ void vfio_device_put(struct vfio_device *device) { @@ -779,14 +740,23 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb, /** * VFIO driver API */ -int vfio_add_group_dev(struct device *dev, - const struct vfio_device_ops *ops, void *device_data) +void vfio_init_group_dev(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops, void *device_data) +{ + init_completion(&device->comp); + device->dev = dev; + device->ops = ops; + device->device_data = device_data; +} +EXPORT_SYMBOL_GPL(vfio_init_group_dev); + +int vfio_register_group_dev(struct vfio_device *device) { + struct vfio_device *existing_device; struct iommu_group *iommu_group; struct vfio_group *group; - struct vfio_device *device; - iommu_group = iommu_group_get(dev); + iommu_group = iommu_group_get(device->dev); if (!iommu_group) return -EINVAL; @@ -805,21 +775,50 @@ int vfio_add_group_dev(struct device *dev, iommu_group_put(iommu_group); } - device = vfio_group_get_device(group, dev); - if (device) { - dev_WARN(dev, "Device already exists on group %d\n", + existing_device = vfio_group_get_device(group, device->dev); + if (existing_device) { + dev_WARN(device->dev, "Device already exists on group %d\n", iommu_group_id(iommu_group)); - vfio_device_put(device); + vfio_device_put(existing_device); vfio_group_put(group); return -EBUSY; } - device = vfio_group_create_device(group, dev, ops, device_data); - if (IS_ERR(device)) { - vfio_group_put(group); - return PTR_ERR(device); - } + /* Our reference on group is moved to the device */ + device->group = group; + + /* Refcounting can't start until the driver calls register */ + refcount_set(&device->refcount, 1); + + mutex_lock(&group->device_lock); + list_add(&device->group_next, &group->device_list); + group->dev_counter++; + mutex_unlock(&group->device_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_register_group_dev); + +int 
vfio_add_group_dev(struct device *dev, const struct vfio_device_ops *ops, + void *device_data) +{ + struct vfio_device *device; + int ret; + + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) + return -ENOMEM; + + vfio_init_group_dev(device, dev, ops, device_data); + ret = vfio_register_group_dev(device); + if (ret) + goto err_kfree; + dev_set_drvdata(dev, device); return 0; + +err_kfree: + kfree(device); + return ret; } EXPORT_SYMBOL_GPL(vfio_add_group_dev); @@ -887,11 +886,9 @@ EXPORT_SYMBOL_GPL(vfio_device_data); /* * Decrement the device reference count and wait for the device to be * removed. Open file descriptors for the device... */ -void *vfio_del_group_dev(struct device *dev) +void vfio_unregister_group_dev(struct vfio_device *device) { - struct vfio_device *device = dev_get_drvdata(dev); struct vfio_group *group = device->group; - void *device_data = device->device_data; struct vfio_unbound_dev *unbound; unsigned int i = 0; bool interrupted = false; @@ -908,7 +905,7 @@ void *vfio_del_group_dev(struct device *dev) */ unbound = kzalloc(sizeof(*unbound), GFP_KERNEL); if (unbound) { - unbound->dev = dev; + unbound->dev = device->dev; mutex_lock(&group->unbound_lock); list_add(&unbound->unbound_next, &group->unbound_list); mutex_unlock(&group->unbound_lock); @@ -919,7 +916,7 @@ void *vfio_del_group_dev(struct device *dev) rc = try_wait_for_completion(&device->comp); while (rc <= 0) { if (device->ops->request) - device->ops->request(device_data, i++); + device->ops->request(device->device_data, i++); if (interrupted) { rc = wait_for_completion_timeout(&device->comp, @@ -929,7 +926,7 @@ void *vfio_del_group_dev(struct device *dev) &device->comp, HZ * 10); if (rc < 0) { interrupted = true; - dev_warn(dev, + dev_warn(device->dev, "Device is currently in use, task" " \"%s\" (%d) " "blocked until device is released", @@ -960,11 +957,19 @@ void *vfio_del_group_dev(struct device *dev) if (list_empty(&group->device_list)) wait_event(group->container_q, !group->container); - /* Matches the get in vfio_group_create_device() */ + /* Matches the get in vfio_register_group_dev() */ vfio_group_put(group); +} +EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); + +void *vfio_del_group_dev(struct device *dev) +{ + struct vfio_device *device = dev_get_drvdata(dev); + void *device_data = device->device_data; + + vfio_unregister_group_dev(device); dev_set_drvdata(dev, NULL); kfree(device); - return device_data; } EXPORT_SYMBOL_GPL(vfio_del_group_dev); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index b7e18bde5aa8..ad8b579d67d3 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -15,6 +15,18 @@ #include #include +struct vfio_device { + struct device *dev; + const struct vfio_device_ops *ops; + struct vfio_group *group; + + /* Members below here are private, not for driver use */ + refcount_t refcount; + struct completion comp; + struct list_head group_next; + void *device_data; +}; + /** * struct vfio_device_ops - VFIO bus driver device callbacks * @@ -48,11 +60,15 @@ struct vfio_device_ops { extern struct iommu_group *vfio_iommu_group_get(struct device *dev); extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev); +void vfio_init_group_dev(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops, void *device_data); +int vfio_register_group_dev(struct vfio_device *device); extern int vfio_add_group_dev(struct device *dev, const struct vfio_device_ops *ops, void *device_data); extern void *vfio_del_group_dev(struct device 
*dev); +void vfio_unregister_group_dev(struct vfio_device *device); extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); extern void vfio_device_put(struct vfio_device *device); extern void *vfio_device_data(struct vfio_device *device); -- Gitee From c91f5dd4bd209fb8c6e5bbbde6afe8500b5e97cf Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Mar 2021 09:53:06 -0600 Subject: [PATCH 1610/2161] vfio/platform: Use vfio_init/register/unregister_group_dev commit cb6164586814bbe41f55559ae5505d8373d6f781 upstream. platform already allocates a struct vfio_platform_device with exactly the same lifetime as vfio_device, switch to the new API and embed vfio_device in vfio_platform_device. Reviewed-by: Christoph Hellwig Reviewed-by: Cornelia Huck Acked-by: Eric Auger Tested-by: Eric Auger Signed-off-by: Jason Gunthorpe Message-Id: <4-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/platform/vfio_platform.c | 20 ++++++++-------- drivers/vfio/platform/vfio_platform_common.c | 23 +++++++------------ drivers/vfio/platform/vfio_platform_private.h | 5 ++-- 3 files changed, 20 insertions(+), 28 deletions(-) diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 9fb6818cea12..e4027799a154 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -54,23 +54,21 @@ static int vfio_platform_probe(struct platform_device *pdev) vdev->reset_required = reset_required; ret = vfio_platform_probe_common(vdev, &pdev->dev); - if (ret) + if (ret) { kfree(vdev); - - return ret; + return ret; + } + dev_set_drvdata(&pdev->dev, vdev); + return 0; } static int vfio_platform_remove(struct platform_device *pdev) { - struct vfio_platform_device *vdev; - - vdev = vfio_platform_remove_common(&pdev->dev); - if (vdev) { - kfree(vdev); - return 0; - } + struct vfio_platform_device *vdev = dev_get_drvdata(&pdev->dev); - return -EINVAL; + vfio_platform_remove_common(vdev); + kfree(vdev); + return 0; } static struct platform_driver vfio_platform_driver = { diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index fb4b385191f2..6eb749250ee4 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -659,8 +659,7 @@ int vfio_platform_probe_common(struct vfio_platform_device *vdev, struct iommu_group *group; int ret; - if (!vdev) - return -EINVAL; + vfio_init_group_dev(&vdev->vdev, dev, &vfio_platform_ops, vdev); ret = vfio_platform_acpi_probe(vdev, dev); if (ret) @@ -685,13 +684,13 @@ int vfio_platform_probe_common(struct vfio_platform_device *vdev, goto put_reset; } - ret = vfio_add_group_dev(dev, &vfio_platform_ops, vdev); + ret = vfio_register_group_dev(&vdev->vdev); if (ret) goto put_iommu; mutex_init(&vdev->igate); - pm_runtime_enable(vdev->device); + pm_runtime_enable(dev); return 0; put_iommu: @@ -702,19 +701,13 @@ int vfio_platform_probe_common(struct vfio_platform_device *vdev, } EXPORT_SYMBOL_GPL(vfio_platform_probe_common); -struct vfio_platform_device *vfio_platform_remove_common(struct device *dev) +void vfio_platform_remove_common(struct vfio_platform_device *vdev) { - struct vfio_platform_device *vdev; - - vdev = vfio_del_group_dev(dev); + vfio_unregister_group_dev(&vdev->vdev); - if (vdev) { - pm_runtime_disable(vdev->device); - vfio_platform_put_reset(vdev); - vfio_iommu_group_put(dev->iommu_group, dev); - } - - return 
vdev; + pm_runtime_disable(vdev->device); + vfio_platform_put_reset(vdev); + vfio_iommu_group_put(vdev->vdev.dev->iommu_group, vdev->vdev.dev); } EXPORT_SYMBOL_GPL(vfio_platform_remove_common); diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index 289089910643..a5ba82c8cbc3 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -9,6 +9,7 @@ #include #include +#include #define VFIO_PLATFORM_OFFSET_SHIFT 40 #define VFIO_PLATFORM_OFFSET_MASK (((u64)(1) << VFIO_PLATFORM_OFFSET_SHIFT) - 1) @@ -42,6 +43,7 @@ struct vfio_platform_region { }; struct vfio_platform_device { + struct vfio_device vdev; struct vfio_platform_region *regions; u32 num_regions; struct vfio_platform_irq *irqs; @@ -80,8 +82,7 @@ struct vfio_platform_reset_node { extern int vfio_platform_probe_common(struct vfio_platform_device *vdev, struct device *dev); -extern struct vfio_platform_device *vfio_platform_remove_common - (struct device *dev); +void vfio_platform_remove_common(struct vfio_platform_device *vdev); extern int vfio_platform_irq_init(struct vfio_platform_device *vdev); extern void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev); -- Gitee From 4ba0890ba890e52c31dbeadcccea946bd1a29685 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Mar 2021 09:53:06 -0600 Subject: [PATCH 1611/2161] vfio/pci: Move VGA and VF initialization to functions commit 61e90817482871b614133c0f20feb1aba2faec86 upstream. vfio_pci_probe() is quite complicated, with optional VF and VGA sub components. Move these into clear init/uninit functions and have a linear flow in probe/remove. This fixes a few little buglets: - vfio_pci_remove() is in the wrong order, vga_client_register() removes a notifier and is after kfree(vdev), but the notifier refers to vdev, so it can use after free in a race. - vga_client_register() can fail but was ignored Organize things so destruction order is the reverse of creation order. 
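The probe/remove structure the series is converging on, reduced to a compilable skeleton (hypothetical init_a/b/c steps, userspace stand-ins): every init stage has a matching uninit, probe() unwinds with gotos in reverse order on failure, and remove() repeats the same uninit sequence in reverse creation order.

/* Skeleton of the linear init/unwind pattern; illustration only. */
#include <stdio.h>

struct demo_device { int a, b, c; };

static int init_a(struct demo_device *d) { d->a = 1; return 0; }
static int init_b(struct demo_device *d) { d->b = 1; return 0; }
static int init_c(struct demo_device *d) { d->c = 1; return 0; }
static void uninit_a(struct demo_device *d) { d->a = 0; }
static void uninit_b(struct demo_device *d) { d->b = 0; }
static void uninit_c(struct demo_device *d) { d->c = 0; }

static int demo_probe(struct demo_device *d)
{
	int ret;

	ret = init_a(d);
	if (ret)
		return ret;
	ret = init_b(d);
	if (ret)
		goto err_uninit_a;
	ret = init_c(d);
	if (ret)
		goto err_uninit_b;
	return 0;

err_uninit_b:
	uninit_b(d);
err_uninit_a:
	uninit_a(d);
	return ret;
}

static void demo_remove(struct demo_device *d)
{
	/* Reverse of creation order, mirroring the error path above. */
	uninit_c(d);
	uninit_b(d);
	uninit_a(d);
}

int main(void)
{
	struct demo_device d = { 0 };

	if (!demo_probe(&d))
		demo_remove(&d);
	printf("a=%d b=%d c=%d\n", d.a, d.b, d.c);
	return 0;
}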
Fixes: ecaa1f6a0154 ("vfio-pci: Add VGA arbiter client") Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Reviewed-by: Max Gurtovoy Reviewed-by: Cornelia Huck Reviewed-by: Eric Auger Signed-off-by: Jason Gunthorpe Message-Id: <7-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 116 +++++++++++++++++++++++------------- 1 file changed, 74 insertions(+), 42 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index dfd3570c0c81..7ed57dc7037e 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -1925,6 +1925,68 @@ static int vfio_pci_bus_notifier(struct notifier_block *nb, return 0; } +static int vfio_pci_vf_init(struct vfio_pci_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + int ret; + + if (!pdev->is_physfn) + return 0; + + vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL); + if (!vdev->vf_token) + return -ENOMEM; + + mutex_init(&vdev->vf_token->lock); + uuid_gen(&vdev->vf_token->uuid); + + vdev->nb.notifier_call = vfio_pci_bus_notifier; + ret = bus_register_notifier(&pci_bus_type, &vdev->nb); + if (ret) { + kfree(vdev->vf_token); + return ret; + } + return 0; +} + +static void vfio_pci_vf_uninit(struct vfio_pci_device *vdev) +{ + if (!vdev->vf_token) + return; + + bus_unregister_notifier(&pci_bus_type, &vdev->nb); + WARN_ON(vdev->vf_token->users); + mutex_destroy(&vdev->vf_token->lock); + kfree(vdev->vf_token); +} + +static int vfio_pci_vga_init(struct vfio_pci_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + int ret; + + if (!vfio_pci_is_vga(pdev)) + return 0; + + ret = vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode); + if (ret) + return ret; + vga_set_legacy_decoding(pdev, vfio_pci_set_vga_decode(vdev, false)); + return 0; +} + +static void vfio_pci_vga_uninit(struct vfio_pci_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + + if (!vfio_pci_is_vga(pdev)) + return; + vga_client_register(pdev, NULL, NULL, NULL); + vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | + VGA_RSRC_LEGACY_IO | + VGA_RSRC_LEGACY_MEM); +} + static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct vfio_pci_device *vdev; @@ -1978,28 +2040,12 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) ret = vfio_pci_reflck_attach(vdev); if (ret) goto out_del_group_dev; - - if (pdev->is_physfn) { - vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL); - if (!vdev->vf_token) { - ret = -ENOMEM; - goto out_reflck; - } - - mutex_init(&vdev->vf_token->lock); - uuid_gen(&vdev->vf_token->uuid); - - vdev->nb.notifier_call = vfio_pci_bus_notifier; - ret = bus_register_notifier(&pci_bus_type, &vdev->nb); - if (ret) - goto out_vf_token; - } - - if (vfio_pci_is_vga(pdev)) { - vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode); - vga_set_legacy_decoding(pdev, - vfio_pci_set_vga_decode(vdev, false)); - } + ret = vfio_pci_vf_init(vdev); + if (ret) + goto out_reflck; + ret = vfio_pci_vga_init(vdev); + if (ret) + goto out_vf; vfio_pci_probe_power_state(vdev); @@ -2019,8 +2065,8 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; -out_vf_token: - kfree(vdev->vf_token); +out_vf: + vfio_pci_vf_uninit(vdev); out_reflck: vfio_pci_reflck_put(vdev->reflck); out_del_group_dev: @@ -2042,33 +2088,19 @@ static void vfio_pci_remove(struct pci_dev *pdev) if (!vdev) return; - if (vdev->vf_token) 
{ - WARN_ON(vdev->vf_token->users); - mutex_destroy(&vdev->vf_token->lock); - kfree(vdev->vf_token); - } - - if (vdev->nb.notifier_call) - bus_unregister_notifier(&pci_bus_type, &vdev->nb); - + vfio_pci_vf_uninit(vdev); vfio_pci_reflck_put(vdev->reflck); + vfio_pci_vga_uninit(vdev); vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); - kfree(vdev->region); - mutex_destroy(&vdev->ioeventfds_lock); if (!disable_idle_d3) vfio_pci_set_power_state(vdev, PCI_D0); + mutex_destroy(&vdev->ioeventfds_lock); + kfree(vdev->region); kfree(vdev->pm_save); kfree(vdev); - - if (vfio_pci_is_vga(pdev)) { - vga_client_register(pdev, NULL, NULL, NULL); - vga_set_legacy_decoding(pdev, - VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | - VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM); - } } static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, -- Gitee From e83587741d62f713a9a65d3925fd6ef0b6523a70 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Mar 2021 09:53:06 -0600 Subject: [PATCH 1612/2161] vfio/pci: Re-order vfio_pci_probe() commit 4aeec3984ddc853f7c65903bde472ffdef738bae upstream. vfio_add_group_dev() must be called only after all of the private data in vdev is fully setup and ready, otherwise there could be races with user space instantiating a device file descriptor and starting to call ops. For instance vfio_pci_reflck_attach() sets vdev->reflck and vfio_pci_open(), called by fops open, unconditionally derefs it, which will crash if things get out of order. Fixes: cc20d7999000 ("vfio/pci: Introduce VF token") Fixes: e309df5b0c9e ("vfio/pci: Parallelize device open and release") Fixes: 6eb7018705de ("vfio-pci: Move idle devices to D3hot power state") Fixes: ecaa1f6a0154 ("vfio-pci: Add VGA arbiter client") Reviewed-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Reviewed-by: Eric Auger Signed-off-by: Jason Gunthorpe Message-Id: <8-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 7ed57dc7037e..5ad3de496a61 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -2033,13 +2033,9 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&vdev->vma_list); init_rwsem(&vdev->memory_lock); - ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); - if (ret) - goto out_free; - ret = vfio_pci_reflck_attach(vdev); if (ret) - goto out_del_group_dev; + goto out_free; ret = vfio_pci_vf_init(vdev); if (ret) goto out_reflck; @@ -2063,15 +2059,20 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) vfio_pci_set_power_state(vdev, PCI_D3hot); } - return ret; + ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); + if (ret) + goto out_power; + return 0; +out_power: + if (!disable_idle_d3) + vfio_pci_set_power_state(vdev, PCI_D0); out_vf: vfio_pci_vf_uninit(vdev); out_reflck: vfio_pci_reflck_put(vdev->reflck); -out_del_group_dev: - vfio_del_group_dev(&pdev->dev); out_free: + kfree(vdev->pm_save); kfree(vdev); out_group_put: vfio_iommu_group_put(group, &pdev->dev); -- Gitee From 5ee94c2e04ba63de4a09b2402617edb344758487 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Mar 2021 09:53:07 -0600 Subject: [PATCH 1613/2161] vfio/pci: Use vfio_init/register/unregister_group_dev commit 
6b018e203d5effc97961cd9477687fe09e2fe79f upstream. pci already allocates a struct vfio_pci_device with exactly the same lifetime as vfio_device, switch to the new API and embed vfio_device in vfio_pci_device. Reviewed-by: Christoph Hellwig Reviewed-by: Liu Yi L Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Reviewed-by: Eric Auger Signed-off-by: Jason Gunthorpe Message-Id: <9-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 10 +++++----- drivers/vfio/pci/vfio_pci_private.h | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 5ad3de496a61..abe4de80ffba 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -2022,6 +2022,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto out_group_put; } + vfio_init_group_dev(&vdev->vdev, &pdev->dev, &vfio_pci_ops, vdev); vdev->pdev = pdev; vdev->irq_type = VFIO_PCI_NUM_IRQS; mutex_init(&vdev->igate); @@ -2059,9 +2060,10 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) vfio_pci_set_power_state(vdev, PCI_D3hot); } - ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); + ret = vfio_register_group_dev(&vdev->vdev); if (ret) goto out_power; + dev_set_drvdata(&pdev->dev, vdev); return 0; out_power: @@ -2081,13 +2083,11 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) static void vfio_pci_remove(struct pci_dev *pdev) { - struct vfio_pci_device *vdev; + struct vfio_pci_device *vdev = dev_get_drvdata(&pdev->dev); pci_disable_sriov(pdev); - vdev = vfio_del_group_dev(&pdev->dev); - if (!vdev) - return; + vfio_unregister_group_dev(&vdev->vdev); vfio_pci_vf_uninit(vdev); vfio_pci_reflck_put(vdev->reflck); diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 3c3e93320fd9..d32e74019bf7 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -100,6 +100,7 @@ struct vfio_pci_vf_token { }; struct vfio_pci_device { + struct vfio_device vdev; struct pci_dev *pdev; void __iomem *barmap[PCI_STD_NUM_BARS]; bool bar_mmap_supported[PCI_STD_NUM_BARS]; -- Gitee From 68828b68080f4f0f7c221561ac7e451390b4b26a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Mar 2021 09:53:07 -0600 Subject: [PATCH 1614/2161] vfio/mdev: Use vfio_init/register/unregister_group_dev commit 1ae1b20f6f2c67659c963e5fe58f9b4a47df9f12 upstream. mdev gets little benefit because it doesn't actually do anything, however it is the last user, so move the vfio_init/register/unregister_group_dev() code here for now. 
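In practice this gives a VFIO bus driver the probe/remove shape sketched below, mirroring the vfio_mdev hunks that follow. The example_* names are placeholders, and the device_data argument of vfio_init_group_dev() shown here is dropped again by a later patch in this series:

#include <linux/device.h>
#include <linux/slab.h>
#include <linux/vfio.h>

static int example_probe(struct device *dev, void *drvdata,
			 const struct vfio_device_ops *ops)
{
	struct vfio_device *vdev;
	int ret;

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev)
		return -ENOMEM;

	vfio_init_group_dev(vdev, dev, ops, drvdata);
	ret = vfio_register_group_dev(vdev);
	if (ret) {
		kfree(vdev);
		return ret;
	}
	dev_set_drvdata(dev, vdev);
	return 0;
}

static void example_remove(struct device *dev)
{
	struct vfio_device *vdev = dev_get_drvdata(dev);

	vfio_unregister_group_dev(vdev);
	kfree(vdev);
}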
Reviewed-by: Christoph Hellwig Reviewed-by: Liu Yi L Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Message-Id: <10-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/vfio_mdev.c | 20 ++++++++++++++++-- drivers/vfio/vfio.c | 39 ++--------------------------------- include/linux/vfio.h | 5 ----- 3 files changed, 20 insertions(+), 44 deletions(-) diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index b52eea128549..4043cc91f952 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -124,13 +124,29 @@ static const struct vfio_device_ops vfio_mdev_dev_ops = { static int vfio_mdev_probe(struct device *dev) { struct mdev_device *mdev = to_mdev_device(dev); + struct vfio_device *vdev; + int ret; - return vfio_add_group_dev(dev, &vfio_mdev_dev_ops, mdev); + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); + if (!vdev) + return -ENOMEM; + + vfio_init_group_dev(vdev, &mdev->dev, &vfio_mdev_dev_ops, mdev); + ret = vfio_register_group_dev(vdev); + if (ret) { + kfree(vdev); + return ret; + } + dev_set_drvdata(&mdev->dev, vdev); + return 0; } static void vfio_mdev_remove(struct device *dev) { - vfio_del_group_dev(dev); + struct vfio_device *vdev = dev_get_drvdata(dev); + + vfio_unregister_group_dev(vdev); + kfree(vdev); } static struct mdev_driver vfio_mdev_driver = { diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 2ea430de505b..180b4ab02d11 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -99,8 +99,8 @@ MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. Thi /* * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe * and remove functions, any use cases other than acquiring the first - * reference for the purpose of calling vfio_add_group_dev() or removing - * that symmetric reference after vfio_del_group_dev() should use the raw + * reference for the purpose of calling vfio_register_group_dev() or removing + * that symmetric reference after vfio_unregister_group_dev() should use the raw * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put() * removes the device from the dummy group and cannot be nested. */ @@ -799,29 +799,6 @@ int vfio_register_group_dev(struct vfio_device *device) } EXPORT_SYMBOL_GPL(vfio_register_group_dev); -int vfio_add_group_dev(struct device *dev, const struct vfio_device_ops *ops, - void *device_data) -{ - struct vfio_device *device; - int ret; - - device = kzalloc(sizeof(*device), GFP_KERNEL); - if (!device) - return -ENOMEM; - - vfio_init_group_dev(device, dev, ops, device_data); - ret = vfio_register_group_dev(device); - if (ret) - goto err_kfree; - dev_set_drvdata(dev, device); - return 0; - -err_kfree: - kfree(device); - return ret; -} -EXPORT_SYMBOL_GPL(vfio_add_group_dev); - /** * Get a reference to the vfio_device for a device. 
Even if the * caller thinks they own the device, they could be racing with a @@ -962,18 +939,6 @@ void vfio_unregister_group_dev(struct vfio_device *device) } EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); -void *vfio_del_group_dev(struct device *dev) -{ - struct vfio_device *device = dev_get_drvdata(dev); - void *device_data = device->device_data; - - vfio_unregister_group_dev(device); - dev_set_drvdata(dev, NULL); - kfree(device); - return device_data; -} -EXPORT_SYMBOL_GPL(vfio_del_group_dev); - /** * VFIO base fd, /dev/vfio/vfio */ diff --git a/include/linux/vfio.h b/include/linux/vfio.h index ad8b579d67d3..4995faf51efe 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -63,11 +63,6 @@ extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev); void vfio_init_group_dev(struct vfio_device *device, struct device *dev, const struct vfio_device_ops *ops, void *device_data); int vfio_register_group_dev(struct vfio_device *device); -extern int vfio_add_group_dev(struct device *dev, - const struct vfio_device_ops *ops, - void *device_data); - -extern void *vfio_del_group_dev(struct device *dev); void vfio_unregister_group_dev(struct vfio_device *device); extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); extern void vfio_device_put(struct vfio_device *device); -- Gitee From da56ad539437b7f1a19caf3b9688158186af8bc5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Mar 2021 09:53:07 -0600 Subject: [PATCH 1615/2161] vfio/mdev: Make to_mdev_device() into a static inline commit 66873b5fa738ca02b5c075ca4a410b13d88e6e9a upstream. The macro wrongly uses 'dev' as both the macro argument and the member name, which means it fails compilation if any caller uses a word other than 'dev' as the single argument. Fix this defect by making it into proper static inline, which is more clear and typesafe anyhow. Fixes: 99e3123e3d72 ("vfio-mdev: Make mdev_device private and abstract interfaces") Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Message-Id: <11-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/mdev_private.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index 4d62b76c4734..fdbb3bee99a9 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -35,7 +35,10 @@ struct mdev_device { bool active; }; -#define to_mdev_device(dev) container_of(dev, struct mdev_device, dev) +static inline struct mdev_device *to_mdev_device(struct device *dev) +{ + return container_of(dev, struct mdev_device, dev); +} #define dev_is_mdev(d) ((d)->bus == &mdev_bus_type) struct mdev_type { -- Gitee From 4f1ebd9d2c963845c9085edb3423eb2d934a8d0c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Mar 2021 09:53:08 -0600 Subject: [PATCH 1616/2161] vfio: Make vfio_device_ops pass a 'struct vfio_device *' instead of 'void *' commit 6df62c5b05f4ad6876815ea8b8775905a090224a upstream. This is the standard kernel pattern, the ops associated with a struct get the struct pointer in for typesafety. The expected design is to use container_of to cleanly go from the subsystem level type to the driver level type without having any type erasure in a void *. 
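The pattern being described is that the bus driver embeds struct vfio_device inside its own per-device structure (same lifetime, as established earlier in this series) and each ops callback recovers the driver's type with container_of(). A trimmed sketch with illustrative foo_* names:

#include <linux/vfio.h>

struct foo_device {
	struct vfio_device vdev;	/* embedded, same lifetime as foo_device */
	bool opened;			/* driver-private state follows */
};

static int foo_open(struct vfio_device *core_vdev)
{
	struct foo_device *foo =
		container_of(core_vdev, struct foo_device, vdev);

	/* operate on the driver's own type directly, no void * round trip */
	foo->opened = true;
	return 0;
}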
Reviewed-by: Dan Williams Reviewed-by: Christoph Hellwig Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Message-Id: <12-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/driver-api/vfio.rst | 18 ++++---- drivers/vfio/mdev/vfio_mdev.c | 33 +++++++------- drivers/vfio/pci/vfio_pci.c | 47 ++++++++++++-------- drivers/vfio/platform/vfio_platform_common.c | 33 ++++++++------ drivers/vfio/vfio.c | 20 ++++----- include/linux/vfio.h | 16 +++---- 6 files changed, 95 insertions(+), 72 deletions(-) diff --git a/Documentation/driver-api/vfio.rst b/Documentation/driver-api/vfio.rst index d3a02300913a..3337f337293a 100644 --- a/Documentation/driver-api/vfio.rst +++ b/Documentation/driver-api/vfio.rst @@ -269,20 +269,22 @@ ready before calling it. The driver provides an ops structure for callbacks similar to a file operations structure:: struct vfio_device_ops { - int (*open)(void *device_data); - void (*release)(void *device_data); - ssize_t (*read)(void *device_data, char __user *buf, + int (*open)(struct vfio_device *vdev); + void (*release)(struct vfio_device *vdev); + ssize_t (*read)(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos); - ssize_t (*write)(void *device_data, const char __user *buf, + ssize_t (*write)(struct vfio_device *vdev, + const char __user *buf, size_t size, loff_t *ppos); - long (*ioctl)(void *device_data, unsigned int cmd, + long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); - int (*mmap)(void *device_data, struct vm_area_struct *vma); + int (*mmap)(struct vfio_device *vdev, + struct vm_area_struct *vma); }; -Each function is passed the device_data that was originally registered +Each function is passed the vdev that was originally registered in the vfio_register_group_dev() call above. This allows the bus driver -an easy place to store its opaque, private data. The open/release +to obtain its private data using container_of(). The open/release callbacks are issued when a new file descriptor is created for a device (via VFIO_GROUP_GET_DEVICE_FD). The ioctl interface provides a direct pass through for VFIO_DEVICE_* ioctls. 
The read/write/mmap diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index 4043cc91f952..11b3e15403ba 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -21,10 +21,11 @@ #define DRIVER_AUTHOR "NVIDIA Corporation" #define DRIVER_DESC "VFIO based driver for Mediated device" -static int vfio_mdev_open(void *device_data) +static int vfio_mdev_open(struct vfio_device *core_vdev) { - struct mdev_device *mdev = device_data; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); struct mdev_parent *parent = mdev->parent; + int ret; if (unlikely(!parent->ops->open)) @@ -40,9 +41,9 @@ static int vfio_mdev_open(void *device_data) return ret; } -static void vfio_mdev_release(void *device_data) +static void vfio_mdev_release(struct vfio_device *core_vdev) { - struct mdev_device *mdev = device_data; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); struct mdev_parent *parent = mdev->parent; if (likely(parent->ops->release)) @@ -51,10 +52,10 @@ static void vfio_mdev_release(void *device_data) module_put(THIS_MODULE); } -static long vfio_mdev_unlocked_ioctl(void *device_data, +static long vfio_mdev_unlocked_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { - struct mdev_device *mdev = device_data; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); struct mdev_parent *parent = mdev->parent; if (unlikely(!parent->ops->ioctl)) @@ -63,10 +64,10 @@ static long vfio_mdev_unlocked_ioctl(void *device_data, return parent->ops->ioctl(mdev, cmd, arg); } -static ssize_t vfio_mdev_read(void *device_data, char __user *buf, +static ssize_t vfio_mdev_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos) { - struct mdev_device *mdev = device_data; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); struct mdev_parent *parent = mdev->parent; if (unlikely(!parent->ops->read)) @@ -75,10 +76,11 @@ static ssize_t vfio_mdev_read(void *device_data, char __user *buf, return parent->ops->read(mdev, buf, count, ppos); } -static ssize_t vfio_mdev_write(void *device_data, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t vfio_mdev_write(struct vfio_device *core_vdev, + const char __user *buf, size_t count, + loff_t *ppos) { - struct mdev_device *mdev = device_data; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); struct mdev_parent *parent = mdev->parent; if (unlikely(!parent->ops->write)) @@ -87,9 +89,10 @@ static ssize_t vfio_mdev_write(void *device_data, const char __user *buf, return parent->ops->write(mdev, buf, count, ppos); } -static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma) +static int vfio_mdev_mmap(struct vfio_device *core_vdev, + struct vm_area_struct *vma) { - struct mdev_device *mdev = device_data; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); struct mdev_parent *parent = mdev->parent; if (unlikely(!parent->ops->mmap)) @@ -98,9 +101,9 @@ static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma) return parent->ops->mmap(mdev, vma); } -static void vfio_mdev_request(void *device_data, unsigned int count) +static void vfio_mdev_request(struct vfio_device *core_vdev, unsigned int count) { - struct mdev_device *mdev = device_data; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); struct mdev_parent *parent = mdev->parent; if (parent->ops->request) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index abe4de80ffba..8ae70fdf8a62 100644 --- 
a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -552,9 +552,10 @@ static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val) vfio_device_put(pf_dev); } -static void vfio_pci_release(void *device_data) +static void vfio_pci_release(struct vfio_device *core_vdev) { - struct vfio_pci_device *vdev = device_data; + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); mutex_lock(&vdev->reflck->lock); @@ -580,9 +581,10 @@ static void vfio_pci_release(void *device_data) module_put(THIS_MODULE); } -static int vfio_pci_open(void *device_data) +static int vfio_pci_open(struct vfio_device *core_vdev) { - struct vfio_pci_device *vdev = device_data; + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); int ret = 0; if (!try_module_get(THIS_MODULE)) @@ -796,10 +798,11 @@ struct vfio_devices { int max_index; }; -static long vfio_pci_ioctl(void *device_data, +static long vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { - struct vfio_pci_device *vdev = device_data; + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); unsigned long minsz; if (cmd == VFIO_DEVICE_GET_INFO) { @@ -1401,11 +1404,10 @@ static long vfio_pci_ioctl(void *device_data, return -ENOTTY; } -static ssize_t vfio_pci_rw(void *device_data, char __user *buf, +static ssize_t vfio_pci_rw(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); - struct vfio_pci_device *vdev = device_data; if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) return -EINVAL; @@ -1433,22 +1435,28 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf, return -EINVAL; } -static ssize_t vfio_pci_read(void *device_data, char __user *buf, +static ssize_t vfio_pci_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos) { + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); + if (!count) return 0; - return vfio_pci_rw(device_data, buf, count, ppos, false); + return vfio_pci_rw(vdev, buf, count, ppos, false); } -static ssize_t vfio_pci_write(void *device_data, const char __user *buf, +static ssize_t vfio_pci_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos) { + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); + if (!count) return 0; - return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true); + return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true); } /* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ @@ -1647,9 +1655,10 @@ static const struct vm_operations_struct vfio_pci_mmap_ops = { .fault = vfio_pci_mmap_fault, }; -static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) +static int vfio_pci_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) { - struct vfio_pci_device *vdev = device_data; + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); struct pci_dev *pdev = vdev->pdev; unsigned int index; u64 phys_len, req_len, pgoff, req_start; @@ -1717,9 +1726,10 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) return 0; } -static void vfio_pci_request(void *device_data, unsigned int count) +static void vfio_pci_request(struct vfio_device *core_vdev, unsigned int count) { - struct vfio_pci_device *vdev = device_data; + 
struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); struct pci_dev *pdev = vdev->pdev; mutex_lock(&vdev->igate); @@ -1833,9 +1843,10 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, #define VF_TOKEN_ARG "vf_token=" -static int vfio_pci_match(void *device_data, char *buf) +static int vfio_pci_match(struct vfio_device *core_vdev, char *buf) { - struct vfio_pci_device *vdev = device_data; + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); bool vf_token = false; uuid_t uuid; int ret; diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 6eb749250ee4..f5f6b537084a 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -218,9 +218,10 @@ static int vfio_platform_call_reset(struct vfio_platform_device *vdev, return -EINVAL; } -static void vfio_platform_release(void *device_data) +static void vfio_platform_release(struct vfio_device *core_vdev) { - struct vfio_platform_device *vdev = device_data; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); mutex_lock(&driver_lock); @@ -244,9 +245,10 @@ static void vfio_platform_release(void *device_data) module_put(vdev->parent_module); } -static int vfio_platform_open(void *device_data) +static int vfio_platform_open(struct vfio_device *core_vdev) { - struct vfio_platform_device *vdev = device_data; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); int ret; if (!try_module_get(vdev->parent_module)) @@ -293,10 +295,12 @@ static int vfio_platform_open(void *device_data) return ret; } -static long vfio_platform_ioctl(void *device_data, +static long vfio_platform_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { - struct vfio_platform_device *vdev = device_data; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + unsigned long minsz; if (cmd == VFIO_DEVICE_GET_INFO) { @@ -455,10 +459,11 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg, return -EFAULT; } -static ssize_t vfio_platform_read(void *device_data, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t vfio_platform_read(struct vfio_device *core_vdev, + char __user *buf, size_t count, loff_t *ppos) { - struct vfio_platform_device *vdev = device_data; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); unsigned int index = VFIO_PLATFORM_OFFSET_TO_INDEX(*ppos); loff_t off = *ppos & VFIO_PLATFORM_OFFSET_MASK; @@ -531,10 +536,11 @@ static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg, return -EFAULT; } -static ssize_t vfio_platform_write(void *device_data, const char __user *buf, +static ssize_t vfio_platform_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos) { - struct vfio_platform_device *vdev = device_data; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); unsigned int index = VFIO_PLATFORM_OFFSET_TO_INDEX(*ppos); loff_t off = *ppos & VFIO_PLATFORM_OFFSET_MASK; @@ -573,9 +579,10 @@ static int vfio_platform_mmap_mmio(struct vfio_platform_region region, req_len, vma->vm_page_prot); } -static int vfio_platform_mmap(void *device_data, struct vm_area_struct *vma) +static int vfio_platform_mmap(struct vfio_device *core_vdev, struct 
vm_area_struct *vma) { - struct vfio_platform_device *vdev = device_data; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); unsigned int index; index = vma->vm_pgoff >> (VFIO_PLATFORM_OFFSET_SHIFT - PAGE_SHIFT); diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 180b4ab02d11..e6f5109fba48 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -832,7 +832,7 @@ static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, int ret; if (it->ops->match) { - ret = it->ops->match(it->device_data, buf); + ret = it->ops->match(it, buf); if (ret < 0) { device = ERR_PTR(ret); break; @@ -893,7 +893,7 @@ void vfio_unregister_group_dev(struct vfio_device *device) rc = try_wait_for_completion(&device->comp); while (rc <= 0) { if (device->ops->request) - device->ops->request(device->device_data, i++); + device->ops->request(device, i++); if (interrupted) { rc = wait_for_completion_timeout(&device->comp, @@ -1379,7 +1379,7 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) if (IS_ERR(device)) return PTR_ERR(device); - ret = device->ops->open(device->device_data); + ret = device->ops->open(device); if (ret) { vfio_device_put(device); return ret; @@ -1391,7 +1391,7 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) */ ret = get_unused_fd_flags(O_CLOEXEC); if (ret < 0) { - device->ops->release(device->device_data); + device->ops->release(device); vfio_device_put(device); return ret; } @@ -1401,7 +1401,7 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) if (IS_ERR(filep)) { put_unused_fd(ret); ret = PTR_ERR(filep); - device->ops->release(device->device_data); + device->ops->release(device); vfio_device_put(device); return ret; } @@ -1558,7 +1558,7 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) { struct vfio_device *device = filep->private_data; - device->ops->release(device->device_data); + device->ops->release(device); vfio_group_try_dissolve_container(device->group); @@ -1575,7 +1575,7 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, if (unlikely(!device->ops->ioctl)) return -EINVAL; - return device->ops->ioctl(device->device_data, cmd, arg); + return device->ops->ioctl(device, cmd, arg); } static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf, @@ -1586,7 +1586,7 @@ static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf, if (unlikely(!device->ops->read)) return -EINVAL; - return device->ops->read(device->device_data, buf, count, ppos); + return device->ops->read(device, buf, count, ppos); } static ssize_t vfio_device_fops_write(struct file *filep, @@ -1598,7 +1598,7 @@ static ssize_t vfio_device_fops_write(struct file *filep, if (unlikely(!device->ops->write)) return -EINVAL; - return device->ops->write(device->device_data, buf, count, ppos); + return device->ops->write(device, buf, count, ppos); } static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) @@ -1608,7 +1608,7 @@ static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) if (unlikely(!device->ops->mmap)) return -EINVAL; - return device->ops->mmap(device->device_data, vma); + return device->ops->mmap(device, vma); } static const struct file_operations vfio_device_fops = { diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 4995faf51efe..784c34c0a287 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -44,17 +44,17 @@ struct vfio_device { 
*/ struct vfio_device_ops { char *name; - int (*open)(void *device_data); - void (*release)(void *device_data); - ssize_t (*read)(void *device_data, char __user *buf, + int (*open)(struct vfio_device *vdev); + void (*release)(struct vfio_device *vdev); + ssize_t (*read)(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos); - ssize_t (*write)(void *device_data, const char __user *buf, + ssize_t (*write)(struct vfio_device *vdev, const char __user *buf, size_t count, loff_t *size); - long (*ioctl)(void *device_data, unsigned int cmd, + long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); - int (*mmap)(void *device_data, struct vm_area_struct *vma); - void (*request)(void *device_data, unsigned int count); - int (*match)(void *device_data, char *buf); + int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); + void (*request)(struct vfio_device *vdev, unsigned int count); + int (*match)(struct vfio_device *vdev, char *buf); }; extern struct iommu_group *vfio_iommu_group_get(struct device *dev); -- Gitee From 7e2b660e27ed7f5c0b0bda36bd84928b5c03a7da Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Mar 2021 09:53:08 -0600 Subject: [PATCH 1617/2161] vfio/pci: Replace uses of vfio_device_data() with container_of commit 07d47b4222d5d1cd933f01587dda00398d8daf40 upstream. This tidies a few confused places that think they can have a refcount on the vfio_device but the device_data could be NULL, that isn't possible by design. Most of the change falls out when struct vfio_devices is updated to just store the struct vfio_pci_device itself. This wasn't possible before because there was no easy way to get from the 'struct vfio_pci_device' to the 'struct vfio_device' to put back the refcount. Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Message-Id: <13-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 67 +++++++++++++------------------------ 1 file changed, 24 insertions(+), 43 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 8ae70fdf8a62..59e87869bf93 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -516,30 +516,29 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) static struct pci_driver vfio_pci_driver; -static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev, - struct vfio_device **pf_dev) +static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev) { struct pci_dev *physfn = pci_physfn(vdev->pdev); + struct vfio_device *pf_dev; if (!vdev->pdev->is_virtfn) return NULL; - *pf_dev = vfio_device_get_from_dev(&physfn->dev); - if (!*pf_dev) + pf_dev = vfio_device_get_from_dev(&physfn->dev); + if (!pf_dev) return NULL; if (pci_dev_driver(physfn) != &vfio_pci_driver) { - vfio_device_put(*pf_dev); + vfio_device_put(pf_dev); return NULL; } - return vfio_device_data(*pf_dev); + return container_of(pf_dev, struct vfio_pci_device, vdev); } static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val) { - struct vfio_device *pf_dev; - struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev); + struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev); if (!pf_vdev) return; @@ -549,7 +548,7 @@ static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val) WARN_ON(pf_vdev->vf_token->users < 0); mutex_unlock(&pf_vdev->vf_token->lock); - 
vfio_device_put(pf_dev); + vfio_device_put(&pf_vdev->vdev); } static void vfio_pci_release(struct vfio_device *core_vdev) @@ -793,7 +792,7 @@ int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, } struct vfio_devices { - struct vfio_device **devices; + struct vfio_pci_device **devices; int cur_index; int max_index; }; @@ -1282,9 +1281,7 @@ static long vfio_pci_ioctl(struct vfio_device *core_vdev, goto hot_reset_release; for (; mem_idx < devs.cur_index; mem_idx++) { - struct vfio_pci_device *tmp; - - tmp = vfio_device_data(devs.devices[mem_idx]); + struct vfio_pci_device *tmp = devs.devices[mem_idx]; ret = down_write_trylock(&tmp->memory_lock); if (!ret) { @@ -1299,17 +1296,13 @@ static long vfio_pci_ioctl(struct vfio_device *core_vdev, hot_reset_release: for (i = 0; i < devs.cur_index; i++) { - struct vfio_device *device; - struct vfio_pci_device *tmp; - - device = devs.devices[i]; - tmp = vfio_device_data(device); + struct vfio_pci_device *tmp = devs.devices[i]; if (i < mem_idx) up_write(&tmp->memory_lock); else mutex_unlock(&tmp->vma_lock); - vfio_device_put(device); + vfio_device_put(&tmp->vdev); } kfree(devs.devices); @@ -1780,8 +1773,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, return 0; /* No VF token provided or required */ if (vdev->pdev->is_virtfn) { - struct vfio_device *pf_dev; - struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev); + struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev); bool match; if (!pf_vdev) { @@ -1794,7 +1786,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, } if (!vf_token) { - vfio_device_put(pf_dev); + vfio_device_put(&pf_vdev->vdev); pci_info_ratelimited(vdev->pdev, "VF token required to access device\n"); return -EACCES; @@ -1804,7 +1796,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, match = uuid_equal(uuid, &pf_vdev->vf_token->uuid); mutex_unlock(&pf_vdev->vf_token->lock); - vfio_device_put(pf_dev); + vfio_device_put(&pf_vdev->vdev); if (!match) { pci_info_ratelimited(vdev->pdev, @@ -2125,11 +2117,7 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, if (device == NULL) return PCI_ERS_RESULT_DISCONNECT; - vdev = vfio_device_data(device); - if (vdev == NULL) { - vfio_device_put(device); - return PCI_ERS_RESULT_DISCONNECT; - } + vdev = container_of(device, struct vfio_pci_device, vdev); mutex_lock(&vdev->igate); @@ -2145,7 +2133,6 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) { - struct vfio_pci_device *vdev; struct vfio_device *device; int ret = 0; @@ -2158,12 +2145,6 @@ static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) if (!device) return -ENODEV; - vdev = vfio_device_data(device); - if (!vdev) { - vfio_device_put(device); - return -ENODEV; - } - if (nr_virtfn == 0) pci_disable_sriov(pdev); else @@ -2223,7 +2204,7 @@ static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data) return 0; } - vdev = vfio_device_data(device); + vdev = container_of(device, struct vfio_pci_device, vdev); if (vdev->reflck) { vfio_pci_reflck_get(vdev->reflck); @@ -2285,7 +2266,7 @@ static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data) return -EBUSY; } - vdev = vfio_device_data(device); + vdev = container_of(device, struct vfio_pci_device, vdev); /* Fault if the device is not unused */ if (vdev->refcnt) { @@ -2293,7 +2274,7 @@ static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data) return -EBUSY; } - 
devs->devices[devs->cur_index++] = device; + devs->devices[devs->cur_index++] = vdev; return 0; } @@ -2315,7 +2296,7 @@ static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data) return -EBUSY; } - vdev = vfio_device_data(device); + vdev = container_of(device, struct vfio_pci_device, vdev); /* * Locking multiple devices is prone to deadlock, runaway and @@ -2326,7 +2307,7 @@ static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data) return -EBUSY; } - devs->devices[devs->cur_index++] = device; + devs->devices[devs->cur_index++] = vdev; return 0; } @@ -2374,7 +2355,7 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev) /* Does at least one need a reset? */ for (i = 0; i < devs.cur_index; i++) { - tmp = vfio_device_data(devs.devices[i]); + tmp = devs.devices[i]; if (tmp->needs_reset) { ret = pci_reset_bus(vdev->pdev); break; @@ -2383,7 +2364,7 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev) put_devs: for (i = 0; i < devs.cur_index; i++) { - tmp = vfio_device_data(devs.devices[i]); + tmp = devs.devices[i]; /* * If reset was successful, affected devices no longer need @@ -2399,7 +2380,7 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev) vfio_pci_set_power_state(tmp, PCI_D3hot); } - vfio_device_put(devs.devices[i]); + vfio_device_put(&tmp->vdev); } kfree(devs.devices); -- Gitee From df5be74ef37d54193f30894b9069c19bbb4f270d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Mar 2021 09:53:08 -0600 Subject: [PATCH 1618/2161] vfio: Remove device_data from the vfio bus driver API commit 1e04ec14204dec28131855d8dd160c3d55d12797 upstream. There are no longer any users, so it can go away. Everything is using container_of now. Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Reviewed-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe Message-Id: <14-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/driver-api/vfio.rst | 3 +-- drivers/vfio/mdev/vfio_mdev.c | 2 +- drivers/vfio/pci/vfio_pci.c | 2 +- drivers/vfio/platform/vfio_platform_common.c | 2 +- drivers/vfio/vfio.c | 12 +----------- include/linux/vfio.h | 4 +--- 6 files changed, 6 insertions(+), 19 deletions(-) diff --git a/Documentation/driver-api/vfio.rst b/Documentation/driver-api/vfio.rst index 3337f337293a..decc68cb8114 100644 --- a/Documentation/driver-api/vfio.rst +++ b/Documentation/driver-api/vfio.rst @@ -254,8 +254,7 @@ vfio_unregister_group_dev() respectively:: void vfio_init_group_dev(struct vfio_device *device, struct device *dev, - const struct vfio_device_ops *ops, - void *device_data); + const struct vfio_device_ops *ops); int vfio_register_group_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index 11b3e15403ba..ae7e322fbe3c 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -134,7 +134,7 @@ static int vfio_mdev_probe(struct device *dev) if (!vdev) return -ENOMEM; - vfio_init_group_dev(vdev, &mdev->dev, &vfio_mdev_dev_ops, mdev); + vfio_init_group_dev(vdev, &mdev->dev, &vfio_mdev_dev_ops); ret = vfio_register_group_dev(vdev); if (ret) { kfree(vdev); diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 59e87869bf93..1504ba6a63c8 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -2025,7 +2025,7 @@ static int 
vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto out_group_put; } - vfio_init_group_dev(&vdev->vdev, &pdev->dev, &vfio_pci_ops, vdev); + vfio_init_group_dev(&vdev->vdev, &pdev->dev, &vfio_pci_ops); vdev->pdev = pdev; vdev->irq_type = VFIO_PCI_NUM_IRQS; mutex_init(&vdev->igate); diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index f5f6b537084a..361e5b57e369 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -666,7 +666,7 @@ int vfio_platform_probe_common(struct vfio_platform_device *vdev, struct iommu_group *group; int ret; - vfio_init_group_dev(&vdev->vdev, dev, &vfio_platform_ops, vdev); + vfio_init_group_dev(&vdev->vdev, dev, &vfio_platform_ops); ret = vfio_platform_acpi_probe(vdev, dev); if (ret) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index e6f5109fba48..5e631c359ef2 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -741,12 +741,11 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb, * VFIO driver API */ void vfio_init_group_dev(struct vfio_device *device, struct device *dev, - const struct vfio_device_ops *ops, void *device_data) + const struct vfio_device_ops *ops) { init_completion(&device->comp); device->dev = dev; device->ops = ops; - device->device_data = device_data; } EXPORT_SYMBOL_GPL(vfio_init_group_dev); @@ -851,15 +850,6 @@ static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, return device; } -/* - * Caller must hold a reference to the vfio_device - */ -void *vfio_device_data(struct vfio_device *device) -{ - return device->device_data; -} -EXPORT_SYMBOL_GPL(vfio_device_data); - /* * Decrement the device reference count and wait for the device to be * removed. Open file descriptors for the device... */ diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 784c34c0a287..a2c5b30e1763 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -24,7 +24,6 @@ struct vfio_device { refcount_t refcount; struct completion comp; struct list_head group_next; - void *device_data; }; /** @@ -61,12 +60,11 @@ extern struct iommu_group *vfio_iommu_group_get(struct device *dev); extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev); void vfio_init_group_dev(struct vfio_device *device, struct device *dev, - const struct vfio_device_ops *ops, void *device_data); + const struct vfio_device_ops *ops); int vfio_register_group_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); extern void vfio_device_put(struct vfio_device *device); -extern void *vfio_device_data(struct vfio_device *device); /* events for the backend driver notify callback */ enum vfio_iommu_notify_type { -- Gitee From 37c96cb437853b7ffd5f3c51e3cf0d36f47275ca Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Apr 2021 16:40:24 -0300 Subject: [PATCH 1619/2161] vfio/mdev: Fix missing static's on MDEV_TYPE_ATTR's commit 6cbf507fd08b52901d62bf11f3507e80f84c0db4 upstream. 
These should always be prefixed with static, otherwise compilation will fail on non-modular builds with ld: samples/vfio-mdev/mbochs.o:(.data+0x2e0): multiple definition of `mdev_type_attr_name'; samples/vfio-mdev/mdpy.o:(.data+0x240): first defined here Fixes: a5e6e6505f38 ("sample: vfio bochs vbe display (host device for bochs-drm)") Fixes: d61fc96f47fd ("sample: vfio mdev display - host device") Signed-off-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Message-Id: <1-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- samples/vfio-mdev/mbochs.c | 10 +++++----- samples/vfio-mdev/mdpy.c | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index ac5c8c17b1ff..93f42631452e 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1349,7 +1349,7 @@ static const struct attribute_group mdev_dev_group = { .attrs = mdev_dev_attrs, }; -const struct attribute_group *mdev_dev_groups[] = { +static const struct attribute_group *mdev_dev_groups[] = { &mdev_dev_group, NULL, }; @@ -1359,7 +1359,7 @@ name_show(struct kobject *kobj, struct device *dev, char *buf) { return sprintf(buf, "%s\n", kobj->name); } -MDEV_TYPE_ATTR_RO(name); +static MDEV_TYPE_ATTR_RO(name); static ssize_t description_show(struct kobject *kobj, struct device *dev, char *buf) @@ -1369,7 +1369,7 @@ description_show(struct kobject *kobj, struct device *dev, char *buf) return sprintf(buf, "virtual display, %d MB video memory\n", type ? type->mbytes : 0); } -MDEV_TYPE_ATTR_RO(description); +static MDEV_TYPE_ATTR_RO(description); static ssize_t available_instances_show(struct kobject *kobj, struct device *dev, char *buf) @@ -1379,14 +1379,14 @@ available_instances_show(struct kobject *kobj, struct device *dev, char *buf) return sprintf(buf, "%d\n", count); } -MDEV_TYPE_ATTR_RO(available_instances); +static MDEV_TYPE_ATTR_RO(available_instances); static ssize_t device_api_show(struct kobject *kobj, struct device *dev, char *buf) { return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); } -MDEV_TYPE_ATTR_RO(device_api); +static MDEV_TYPE_ATTR_RO(device_api); static struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 9894693f3be1..d4ec2b52ca49 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -658,7 +658,7 @@ static const struct attribute_group mdev_dev_group = { .attrs = mdev_dev_attrs, }; -const struct attribute_group *mdev_dev_groups[] = { +static const struct attribute_group *mdev_dev_groups[] = { &mdev_dev_group, NULL, }; @@ -668,7 +668,7 @@ name_show(struct kobject *kobj, struct device *dev, char *buf) { return sprintf(buf, "%s\n", kobj->name); } -MDEV_TYPE_ATTR_RO(name); +static MDEV_TYPE_ATTR_RO(name); static ssize_t description_show(struct kobject *kobj, struct device *dev, char *buf) @@ -679,21 +679,21 @@ description_show(struct kobject *kobj, struct device *dev, char *buf) type ? type->width : 0, type ? 
type->height : 0); } -MDEV_TYPE_ATTR_RO(description); +static MDEV_TYPE_ATTR_RO(description); static ssize_t available_instances_show(struct kobject *kobj, struct device *dev, char *buf) { return sprintf(buf, "%d\n", max_devices - mdpy_count); } -MDEV_TYPE_ATTR_RO(available_instances); +static MDEV_TYPE_ATTR_RO(available_instances); static ssize_t device_api_show(struct kobject *kobj, struct device *dev, char *buf) { return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); } -MDEV_TYPE_ATTR_RO(device_api); +static MDEV_TYPE_ATTR_RO(device_api); static struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, -- Gitee From 1f73d0f09c8de222655b6f18d0269e585161e406 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Apr 2021 16:40:26 -0300 Subject: [PATCH 1620/2161] vfio/mdev: Add missing typesafety around mdev_device commit 2a3d15f270efa50d78d8a32d895e9d5396668f3a upstream. The mdev API should accept and pass a 'struct mdev_device *' in all places, not pass a 'struct device *' and cast it internally with to_mdev_device(). Particularly in its struct mdev_driver functions, the whole point of a bus's struct device_driver wrapper is to provide type safety compared to the default struct device_driver. Further, the driver core standard is for bus drivers to expose their device structure in their public headers that can be used with container_of() inlines and '&foo->dev' to go between the class levels, and '&foo->dev' to be used with dev_err/etc driver core helper functions. Move 'struct mdev_device' to mdev.h Once done this allows moving some one instruction exported functions to static inlines, which in turns allows removing one of the two grotesque symbol_get()'s related to mdev in the core code. Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Message-Id: <3-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../driver-api/vfio-mediated-device.rst | 4 +- drivers/vfio/mdev/mdev_core.c | 64 ++----------------- drivers/vfio/mdev/mdev_driver.c | 4 +- drivers/vfio/mdev/mdev_private.h | 23 +------ drivers/vfio/mdev/mdev_sysfs.c | 26 ++++---- drivers/vfio/mdev/vfio_mdev.c | 7 +- drivers/vfio/vfio_iommu_type1.c | 25 ++------ include/linux/mdev.h | 58 +++++++++++++---- 8 files changed, 83 insertions(+), 128 deletions(-) diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index 25eb7d5b834b..c43c1dc33333 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -105,8 +105,8 @@ structure to represent a mediated device's driver:: */ struct mdev_driver { const char *name; - int (*probe) (struct device *dev); - void (*remove) (struct device *dev); + int (*probe) (struct mdev_device *dev); + void (*remove) (struct mdev_device *dev); struct device_driver driver; }; diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 6de97d25a3f8..057922a1707e 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -33,36 +33,6 @@ struct device *mdev_parent_dev(struct mdev_device *mdev) } EXPORT_SYMBOL(mdev_parent_dev); -void *mdev_get_drvdata(struct mdev_device *mdev) -{ - return mdev->driver_data; -} -EXPORT_SYMBOL(mdev_get_drvdata); - -void mdev_set_drvdata(struct mdev_device *mdev, void *data) -{ - mdev->driver_data = data; -} -EXPORT_SYMBOL(mdev_set_drvdata); - -struct device 
*mdev_dev(struct mdev_device *mdev) -{ - return &mdev->dev; -} -EXPORT_SYMBOL(mdev_dev); - -struct mdev_device *mdev_from_dev(struct device *dev) -{ - return dev_is_mdev(dev) ? to_mdev_device(dev) : NULL; -} -EXPORT_SYMBOL(mdev_from_dev); - -const guid_t *mdev_uuid(struct mdev_device *mdev) -{ - return &mdev->uuid; -} -EXPORT_SYMBOL(mdev_uuid); - /* Should be called holding parent_list_lock */ static struct mdev_parent *__find_parent_device(struct device *dev) { @@ -107,7 +77,7 @@ static void mdev_device_remove_common(struct mdev_device *mdev) int ret; type = to_mdev_type(mdev->type_kobj); - mdev_remove_sysfs_files(&mdev->dev, type); + mdev_remove_sysfs_files(mdev, type); device_del(&mdev->dev); parent = mdev->parent; lockdep_assert_held(&parent->unreg_sem); @@ -122,12 +92,10 @@ static void mdev_device_remove_common(struct mdev_device *mdev) static int mdev_device_remove_cb(struct device *dev, void *data) { - if (dev_is_mdev(dev)) { - struct mdev_device *mdev; + struct mdev_device *mdev = mdev_from_dev(dev); - mdev = to_mdev_device(dev); + if (mdev) mdev_device_remove_common(mdev); - } return 0; } @@ -332,7 +300,7 @@ int mdev_device_create(struct kobject *kobj, if (ret) goto add_fail; - ret = mdev_create_sysfs_files(&mdev->dev, type); + ret = mdev_create_sysfs_files(mdev, type); if (ret) goto sysfs_fail; @@ -354,13 +322,11 @@ int mdev_device_create(struct kobject *kobj, return ret; } -int mdev_device_remove(struct device *dev) +int mdev_device_remove(struct mdev_device *mdev) { - struct mdev_device *mdev, *tmp; + struct mdev_device *tmp; struct mdev_parent *parent; - mdev = to_mdev_device(dev); - mutex_lock(&mdev_list_lock); list_for_each_entry(tmp, &mdev_list, next) { if (tmp == mdev) @@ -390,24 +356,6 @@ int mdev_device_remove(struct device *dev) return 0; } -int mdev_set_iommu_device(struct device *dev, struct device *iommu_device) -{ - struct mdev_device *mdev = to_mdev_device(dev); - - mdev->iommu_device = iommu_device; - - return 0; -} -EXPORT_SYMBOL(mdev_set_iommu_device); - -struct device *mdev_get_iommu_device(struct device *dev) -{ - struct mdev_device *mdev = to_mdev_device(dev); - - return mdev->iommu_device; -} -EXPORT_SYMBOL(mdev_get_iommu_device); - static int __init mdev_init(void) { return mdev_bus_register(); diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c index 0d3223aee20b..44c3ba7e56d9 100644 --- a/drivers/vfio/mdev/mdev_driver.c +++ b/drivers/vfio/mdev/mdev_driver.c @@ -48,7 +48,7 @@ static int mdev_probe(struct device *dev) return ret; if (drv && drv->probe) { - ret = drv->probe(dev); + ret = drv->probe(mdev); if (ret) mdev_detach_iommu(mdev); } @@ -62,7 +62,7 @@ static int mdev_remove(struct device *dev) struct mdev_device *mdev = to_mdev_device(dev); if (drv && drv->remove) - drv->remove(dev); + drv->remove(mdev); mdev_detach_iommu(mdev); diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index fdbb3bee99a9..97e2225f7f49 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -24,23 +24,6 @@ struct mdev_parent { struct rw_semaphore unreg_sem; }; -struct mdev_device { - struct device dev; - struct mdev_parent *parent; - guid_t uuid; - void *driver_data; - struct list_head next; - struct kobject *type_kobj; - struct device *iommu_device; - bool active; -}; - -static inline struct mdev_device *to_mdev_device(struct device *dev) -{ - return container_of(dev, struct mdev_device, dev); -} -#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type) - struct mdev_type { struct kobject 
kobj; struct kobject *devices_kobj; @@ -57,11 +40,11 @@ struct mdev_type { int parent_create_sysfs_files(struct mdev_parent *parent); void parent_remove_sysfs_files(struct mdev_parent *parent); -int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type); -void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type); +int mdev_create_sysfs_files(struct mdev_device *mdev, struct mdev_type *type); +void mdev_remove_sysfs_files(struct mdev_device *mdev, struct mdev_type *type); int mdev_device_create(struct kobject *kobj, struct device *dev, const guid_t *uuid); -int mdev_device_remove(struct device *dev); +int mdev_device_remove(struct mdev_device *dev); #endif /* MDEV_PRIVATE_H */ diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 367ff5412a38..18114f3e090a 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -225,6 +225,7 @@ int parent_create_sysfs_files(struct mdev_parent *parent) static ssize_t remove_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { + struct mdev_device *mdev = to_mdev_device(dev); unsigned long val; if (kstrtoul(buf, 0, &val) < 0) @@ -233,7 +234,7 @@ static ssize_t remove_store(struct device *dev, struct device_attribute *attr, if (val && device_remove_file_self(dev, attr)) { int ret; - ret = mdev_device_remove(dev); + ret = mdev_device_remove(mdev); if (ret) return ret; } @@ -248,34 +249,37 @@ static const struct attribute *mdev_device_attrs[] = { NULL, }; -int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type) +int mdev_create_sysfs_files(struct mdev_device *mdev, struct mdev_type *type) { + struct kobject *kobj = &mdev->dev.kobj; int ret; - ret = sysfs_create_link(type->devices_kobj, &dev->kobj, dev_name(dev)); + ret = sysfs_create_link(type->devices_kobj, kobj, dev_name(&mdev->dev)); if (ret) return ret; - ret = sysfs_create_link(&dev->kobj, &type->kobj, "mdev_type"); + ret = sysfs_create_link(kobj, &type->kobj, "mdev_type"); if (ret) goto type_link_failed; - ret = sysfs_create_files(&dev->kobj, mdev_device_attrs); + ret = sysfs_create_files(kobj, mdev_device_attrs); if (ret) goto create_files_failed; return ret; create_files_failed: - sysfs_remove_link(&dev->kobj, "mdev_type"); + sysfs_remove_link(kobj, "mdev_type"); type_link_failed: - sysfs_remove_link(type->devices_kobj, dev_name(dev)); + sysfs_remove_link(type->devices_kobj, dev_name(&mdev->dev)); return ret; } -void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type) +void mdev_remove_sysfs_files(struct mdev_device *mdev, struct mdev_type *type) { - sysfs_remove_files(&dev->kobj, mdev_device_attrs); - sysfs_remove_link(&dev->kobj, "mdev_type"); - sysfs_remove_link(type->devices_kobj, dev_name(dev)); + struct kobject *kobj = &mdev->dev.kobj; + + sysfs_remove_files(kobj, mdev_device_attrs); + sysfs_remove_link(kobj, "mdev_type"); + sysfs_remove_link(type->devices_kobj, dev_name(&mdev->dev)); } diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index ae7e322fbe3c..91b7b8b9eb9c 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -124,9 +124,8 @@ static const struct vfio_device_ops vfio_mdev_dev_ops = { .request = vfio_mdev_request, }; -static int vfio_mdev_probe(struct device *dev) +static int vfio_mdev_probe(struct mdev_device *mdev) { - struct mdev_device *mdev = to_mdev_device(dev); struct vfio_device *vdev; int ret; @@ -144,9 +143,9 @@ static int vfio_mdev_probe(struct device *dev) return 0; } 
-static void vfio_mdev_remove(struct device *dev) +static void vfio_mdev_remove(struct mdev_device *mdev) { - struct vfio_device *vdev = dev_get_drvdata(dev); + struct vfio_device *vdev = dev_get_drvdata(&mdev->dev); vfio_unregister_group_dev(vdev); kfree(vdev); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index e0243886c418..7628d45b55c3 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -1933,28 +1933,13 @@ static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions, return ret; } -static struct device *vfio_mdev_get_iommu_device(struct device *dev) -{ - struct device *(*fn)(struct device *dev); - struct device *iommu_device; - - fn = symbol_get(mdev_get_iommu_device); - if (fn) { - iommu_device = fn(dev); - symbol_put(mdev_get_iommu_device); - - return iommu_device; - } - - return NULL; -} - static int vfio_mdev_attach_domain(struct device *dev, void *data) { + struct mdev_device *mdev = to_mdev_device(dev); struct iommu_domain *domain = data; struct device *iommu_device; - iommu_device = vfio_mdev_get_iommu_device(dev); + iommu_device = mdev_get_iommu_device(mdev); if (iommu_device) { if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX)) return iommu_aux_attach_device(domain, iommu_device); @@ -1967,10 +1952,11 @@ static int vfio_mdev_attach_domain(struct device *dev, void *data) static int vfio_mdev_detach_domain(struct device *dev, void *data) { + struct mdev_device *mdev = to_mdev_device(dev); struct iommu_domain *domain = data; struct device *iommu_device; - iommu_device = vfio_mdev_get_iommu_device(dev); + iommu_device = mdev_get_iommu_device(mdev); if (iommu_device) { if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX)) iommu_aux_detach_device(domain, iommu_device); @@ -2018,9 +2004,10 @@ static bool vfio_bus_is_mdev(struct bus_type *bus) static int vfio_mdev_iommu_device(struct device *dev, void *data) { + struct mdev_device *mdev = to_mdev_device(dev); struct device **old = data, *new; - new = vfio_mdev_get_iommu_device(dev); + new = mdev_get_iommu_device(mdev); if (!new || (*old && *old != new)) return -EINVAL; diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 9004375c462e..230b57b6784c 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -10,7 +10,21 @@ #ifndef MDEV_H #define MDEV_H -struct mdev_device; +struct mdev_device { + struct device dev; + struct mdev_parent *parent; + guid_t uuid; + void *driver_data; + struct list_head next; + struct kobject *type_kobj; + struct device *iommu_device; + bool active; +}; + +static inline struct mdev_device *to_mdev_device(struct device *dev) +{ + return container_of(dev, struct mdev_device, dev); +} /* * Called by the parent device driver to set the device which represents @@ -19,12 +33,17 @@ struct mdev_device; * * @dev: the mediated device that iommu will isolate. * @iommu_device: a pci device which represents the iommu for @dev. - * - * Return 0 for success, otherwise negative error value. 
*/ -int mdev_set_iommu_device(struct device *dev, struct device *iommu_device); +static inline void mdev_set_iommu_device(struct mdev_device *mdev, + struct device *iommu_device) +{ + mdev->iommu_device = iommu_device; +} -struct device *mdev_get_iommu_device(struct device *dev); +static inline struct device *mdev_get_iommu_device(struct mdev_device *mdev) +{ + return mdev->iommu_device; +} /** * struct mdev_parent_ops - Structure to be registered for each parent device to @@ -126,16 +145,25 @@ struct mdev_type_attribute mdev_type_attr_##_name = \ **/ struct mdev_driver { const char *name; - int (*probe)(struct device *dev); - void (*remove)(struct device *dev); + int (*probe)(struct mdev_device *dev); + void (*remove)(struct mdev_device *dev); struct device_driver driver; }; #define to_mdev_driver(drv) container_of(drv, struct mdev_driver, driver) -void *mdev_get_drvdata(struct mdev_device *mdev); -void mdev_set_drvdata(struct mdev_device *mdev, void *data); -const guid_t *mdev_uuid(struct mdev_device *mdev); +static inline void *mdev_get_drvdata(struct mdev_device *mdev) +{ + return mdev->driver_data; +} +static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data) +{ + mdev->driver_data = data; +} +static inline const guid_t *mdev_uuid(struct mdev_device *mdev) +{ + return &mdev->uuid; +} extern struct bus_type mdev_bus_type; @@ -146,7 +174,13 @@ int mdev_register_driver(struct mdev_driver *drv, struct module *owner); void mdev_unregister_driver(struct mdev_driver *drv); struct device *mdev_parent_dev(struct mdev_device *mdev); -struct device *mdev_dev(struct mdev_device *mdev); -struct mdev_device *mdev_from_dev(struct device *dev); +static inline struct device *mdev_dev(struct mdev_device *mdev) +{ + return &mdev->dev; +} +static inline struct mdev_device *mdev_from_dev(struct device *dev) +{ + return dev->bus == &mdev_bus_type ? to_mdev_device(dev) : NULL; +} #endif /* MDEV_H */ -- Gitee From 3e58476b5eb7fe4aad5ece4ebe4026e8c459f5ed Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Apr 2021 16:40:27 -0300 Subject: [PATCH 1621/2161] vfio/mdev: Simplify driver registration commit 91b9969d9c6bb7c02253bbfc536bfd892f636fdc upstream. This is only done once, we don't need to generate code to initialize a structure stored in the ELF .data segment. Fill in the three required .driver members directly instead of copying data into them during mdev_register_driver(). Further the to_mdev_driver() function doesn't belong in a public header, just inline it into the two places that need it. Finally, we can now clearly see that 'drv' derived from dev->driver cannot be NULL, firstly because the driver core forbids it, and secondly because NULL won't pass through the container_of(). Remove the dead code. 
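As an illustration (an editor's sketch, not part of the upstream patch), this is roughly what a hypothetical mdev driver looks like after the change: the struct device_driver fields are filled in statically by the driver itself and mdev_register_driver() takes only the driver pointer. All foo_* names are invented for the example.

    #include <linux/mdev.h>
    #include <linux/module.h>

    static int foo_mdev_probe(struct mdev_device *mdev)
    {
            /* allocate per-instance state, register with vfio, ... */
            return 0;
    }

    static void foo_mdev_remove(struct mdev_device *mdev)
    {
            /* undo whatever foo_mdev_probe() set up */
    }

    static struct mdev_driver foo_mdev_driver = {
            .driver = {
                    .name     = "foo_mdev",
                    .owner    = THIS_MODULE,
                    .mod_name = KBUILD_MODNAME,
            },
            .probe  = foo_mdev_probe,
            .remove = foo_mdev_remove,
    };

    static int __init foo_mdev_init(void)
    {
            /* no THIS_MODULE argument any more; .driver above is complete */
            return mdev_register_driver(&foo_mdev_driver);
    }
    module_init(foo_mdev_init);

    static void __exit foo_mdev_exit(void)
    {
            mdev_unregister_driver(&foo_mdev_driver);
    }
    module_exit(foo_mdev_exit);

    MODULE_LICENSE("GPL v2");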
Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Message-Id: <4-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/driver-api/vfio-mediated-device.rst | 5 +---- drivers/vfio/mdev/mdev_driver.c | 15 +++++++-------- drivers/vfio/mdev/vfio_mdev.c | 8 ++++++-- include/linux/mdev.h | 6 +----- 4 files changed, 15 insertions(+), 19 deletions(-) diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index c43c1dc33333..1779b85f014e 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -98,13 +98,11 @@ structure to represent a mediated device's driver:: /* * struct mdev_driver [2] - Mediated device's driver - * @name: driver name * @probe: called when new device created * @remove: called when device removed * @driver: device driver structure */ struct mdev_driver { - const char *name; int (*probe) (struct mdev_device *dev); void (*remove) (struct mdev_device *dev); struct device_driver driver; @@ -115,8 +113,7 @@ to register and unregister itself with the core driver: * Register:: - extern int mdev_register_driver(struct mdev_driver *drv, - struct module *owner); + extern int mdev_register_driver(struct mdev_driver *drv); * Unregister:: diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c index 44c3ba7e56d9..041699571b7e 100644 --- a/drivers/vfio/mdev/mdev_driver.c +++ b/drivers/vfio/mdev/mdev_driver.c @@ -39,7 +39,8 @@ static void mdev_detach_iommu(struct mdev_device *mdev) static int mdev_probe(struct device *dev) { - struct mdev_driver *drv = to_mdev_driver(dev->driver); + struct mdev_driver *drv = + container_of(dev->driver, struct mdev_driver, driver); struct mdev_device *mdev = to_mdev_device(dev); int ret; @@ -47,7 +48,7 @@ static int mdev_probe(struct device *dev) if (ret) return ret; - if (drv && drv->probe) { + if (drv->probe) { ret = drv->probe(mdev); if (ret) mdev_detach_iommu(mdev); @@ -58,10 +59,11 @@ static int mdev_probe(struct device *dev) static int mdev_remove(struct device *dev) { - struct mdev_driver *drv = to_mdev_driver(dev->driver); + struct mdev_driver *drv = + container_of(dev->driver, struct mdev_driver, driver); struct mdev_device *mdev = to_mdev_device(dev); - if (drv && drv->remove) + if (drv->remove) drv->remove(mdev); mdev_detach_iommu(mdev); @@ -79,16 +81,13 @@ EXPORT_SYMBOL_GPL(mdev_bus_type); /** * mdev_register_driver - register a new MDEV driver * @drv: the driver to register - * @owner: module owner of driver to be registered * * Returns a negative value on error, otherwise 0. 
**/ -int mdev_register_driver(struct mdev_driver *drv, struct module *owner) +int mdev_register_driver(struct mdev_driver *drv) { /* initialize common driver fields */ - drv->driver.name = drv->name; drv->driver.bus = &mdev_bus_type; - drv->driver.owner = owner; /* register with core */ return driver_register(&drv->driver); diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index 91b7b8b9eb9c..cc9507ed85a1 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -152,14 +152,18 @@ static void vfio_mdev_remove(struct mdev_device *mdev) } static struct mdev_driver vfio_mdev_driver = { - .name = "vfio_mdev", + .driver = { + .name = "vfio_mdev", + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + }, .probe = vfio_mdev_probe, .remove = vfio_mdev_remove, }; static int __init vfio_mdev_init(void) { - return mdev_register_driver(&vfio_mdev_driver, THIS_MODULE); + return mdev_register_driver(&vfio_mdev_driver); } static void __exit vfio_mdev_exit(void) diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 230b57b6784c..9838e4495470 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -137,21 +137,17 @@ struct mdev_type_attribute mdev_type_attr_##_name = \ /** * struct mdev_driver - Mediated device driver - * @name: driver name * @probe: called when new device created * @remove: called when device removed * @driver: device driver structure * **/ struct mdev_driver { - const char *name; int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); struct device_driver driver; }; -#define to_mdev_driver(drv) container_of(drv, struct mdev_driver, driver) - static inline void *mdev_get_drvdata(struct mdev_device *mdev) { return mdev->driver_data; @@ -170,7 +166,7 @@ extern struct bus_type mdev_bus_type; int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops); void mdev_unregister_device(struct device *dev); -int mdev_register_driver(struct mdev_driver *drv, struct module *owner); +int mdev_register_driver(struct mdev_driver *drv); void mdev_unregister_driver(struct mdev_driver *drv); struct device *mdev_parent_dev(struct mdev_device *mdev); -- Gitee From 85e437be92594c7ceeca2e3563dae9376479961b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Apr 2021 16:40:28 -0300 Subject: [PATCH 1622/2161] vfio/mdev: Use struct mdev_type in struct mdev_device commit 417fd5bf242d7691c15fe0bd705ab76c69276572 upstream. The kobj pointer in mdev_device is actually pointing at a struct mdev_type. Use the proper type so things are understandable. There are a number of places that are confused and passing both the mdev and the mtype as function arguments, fix these to derive the mtype directly from the mdev to remove the redundancy. 
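To illustrate the resulting calling convention (an editor's sketch, not part of the patch; it assumes code inside drivers/vfio/mdev/ where mdev_private.h is visible, and foo_log_type is a hypothetical helper), functions that used to take a redundant type argument now derive it from the mdev itself:

    static void foo_log_type(struct mdev_device *mdev)
    {
            struct mdev_type *type = mdev->type;    /* was: to_mdev_type(mdev->type_kobj) */

            dev_dbg(&mdev->dev, "created from type %s\n", kobject_name(&type->kobj));
    }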
Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Message-Id: <5-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/mdev_core.c | 16 ++++++---------- drivers/vfio/mdev/mdev_private.h | 7 +++---- drivers/vfio/mdev/mdev_sysfs.c | 11 ++++++----- include/linux/mdev.h | 4 +++- 4 files changed, 18 insertions(+), 20 deletions(-) diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 057922a1707e..5ca0efa5266b 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -73,11 +73,9 @@ static void mdev_put_parent(struct mdev_parent *parent) static void mdev_device_remove_common(struct mdev_device *mdev) { struct mdev_parent *parent; - struct mdev_type *type; int ret; - type = to_mdev_type(mdev->type_kobj); - mdev_remove_sysfs_files(mdev, type); + mdev_remove_sysfs_files(mdev); device_del(&mdev->dev); parent = mdev->parent; lockdep_assert_held(&parent->unreg_sem); @@ -241,13 +239,11 @@ static void mdev_device_release(struct device *dev) mdev_device_free(mdev); } -int mdev_device_create(struct kobject *kobj, - struct device *dev, const guid_t *uuid) +int mdev_device_create(struct mdev_type *type, const guid_t *uuid) { int ret; struct mdev_device *mdev, *tmp; struct mdev_parent *parent; - struct mdev_type *type = to_mdev_type(kobj); parent = mdev_get_parent(type->parent); if (!parent) @@ -285,14 +281,14 @@ int mdev_device_create(struct kobject *kobj, } device_initialize(&mdev->dev); - mdev->dev.parent = dev; + mdev->dev.parent = parent->dev; mdev->dev.bus = &mdev_bus_type; mdev->dev.release = mdev_device_release; dev_set_name(&mdev->dev, "%pUl", uuid); mdev->dev.groups = parent->ops->mdev_attr_groups; - mdev->type_kobj = kobj; + mdev->type = type; - ret = parent->ops->create(kobj, mdev); + ret = parent->ops->create(&type->kobj, mdev); if (ret) goto ops_create_fail; @@ -300,7 +296,7 @@ int mdev_device_create(struct kobject *kobj, if (ret) goto add_fail; - ret = mdev_create_sysfs_files(mdev, type); + ret = mdev_create_sysfs_files(mdev); if (ret) goto sysfs_fail; diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index 97e2225f7f49..f12e34e16ab9 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -40,11 +40,10 @@ struct mdev_type { int parent_create_sysfs_files(struct mdev_parent *parent); void parent_remove_sysfs_files(struct mdev_parent *parent); -int mdev_create_sysfs_files(struct mdev_device *mdev, struct mdev_type *type); -void mdev_remove_sysfs_files(struct mdev_device *mdev, struct mdev_type *type); +int mdev_create_sysfs_files(struct mdev_device *mdev); +void mdev_remove_sysfs_files(struct mdev_device *mdev); -int mdev_device_create(struct kobject *kobj, - struct device *dev, const guid_t *uuid); +int mdev_device_create(struct mdev_type *kobj, const guid_t *uuid); int mdev_device_remove(struct mdev_device *dev); #endif /* MDEV_PRIVATE_H */ diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 18114f3e090a..bcfe48d56e8a 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -67,7 +67,7 @@ static ssize_t create_store(struct kobject *kobj, struct device *dev, if (ret) return ret; - ret = mdev_device_create(kobj, dev, &uuid); + ret = mdev_device_create(to_mdev_type(kobj), &uuid); if (ret) return ret; @@ -249,8 +249,9 @@ static const struct attribute *mdev_device_attrs[] = { 
NULL, }; -int mdev_create_sysfs_files(struct mdev_device *mdev, struct mdev_type *type) +int mdev_create_sysfs_files(struct mdev_device *mdev) { + struct mdev_type *type = mdev->type; struct kobject *kobj = &mdev->dev.kobj; int ret; @@ -271,15 +272,15 @@ int mdev_create_sysfs_files(struct mdev_device *mdev, struct mdev_type *type) create_files_failed: sysfs_remove_link(kobj, "mdev_type"); type_link_failed: - sysfs_remove_link(type->devices_kobj, dev_name(&mdev->dev)); + sysfs_remove_link(mdev->type->devices_kobj, dev_name(&mdev->dev)); return ret; } -void mdev_remove_sysfs_files(struct mdev_device *mdev, struct mdev_type *type) +void mdev_remove_sysfs_files(struct mdev_device *mdev) { struct kobject *kobj = &mdev->dev.kobj; sysfs_remove_files(kobj, mdev_device_attrs); sysfs_remove_link(kobj, "mdev_type"); - sysfs_remove_link(type->devices_kobj, dev_name(&mdev->dev)); + sysfs_remove_link(mdev->type->devices_kobj, dev_name(&mdev->dev)); } diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 9838e4495470..0792476310a9 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -10,13 +10,15 @@ #ifndef MDEV_H #define MDEV_H +struct mdev_type; + struct mdev_device { struct device dev; struct mdev_parent *parent; guid_t uuid; void *driver_data; struct list_head next; - struct kobject *type_kobj; + struct mdev_type *type; struct device *iommu_device; bool active; }; -- Gitee From ab494e2d895164cbe962b5fbd35a5b4d3db1bd70 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Apr 2021 16:40:29 -0300 Subject: [PATCH 1623/2161] vfio/mdev: Expose mdev_get/put_parent to mdev_private.h commit a9f8111d0b5f445d853345e6917c1781573e4ba9 upstream. The next patch will use these in mdev_sysfs.c While here remove the now dead code checks for NULL, a mdev_type can never have a NULL parent. 
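The moved helpers follow the usual kref pattern. For readers unfamiliar with it, a generic, self-contained sketch of that pattern (hypothetical foo_* names, not the mdev code itself):

    #include <linux/kernel.h>
    #include <linux/kref.h>
    #include <linux/slab.h>

    struct foo_parent {
            struct kref ref;
            /* ... */
    };

    static void foo_release_parent(struct kref *kref)
    {
            struct foo_parent *parent = container_of(kref, struct foo_parent, ref);

            kfree(parent);
    }

    static void foo_get_parent(struct foo_parent *parent)
    {
            kref_get(&parent->ref);
    }

    static void foo_put_parent(struct foo_parent *parent)
    {
            kref_put(&parent->ref, foo_release_parent);     /* release runs on last put */
    }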
Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Message-Id: <6-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/mdev_core.c | 23 +++-------------------- drivers/vfio/mdev/mdev_private.h | 12 ++++++++++++ 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 5ca0efa5266b..7ec21c907397 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -45,7 +45,7 @@ static struct mdev_parent *__find_parent_device(struct device *dev) return NULL; } -static void mdev_release_parent(struct kref *kref) +void mdev_release_parent(struct kref *kref) { struct mdev_parent *parent = container_of(kref, struct mdev_parent, ref); @@ -55,20 +55,6 @@ static void mdev_release_parent(struct kref *kref) put_device(dev); } -static struct mdev_parent *mdev_get_parent(struct mdev_parent *parent) -{ - if (parent) - kref_get(&parent->ref); - - return parent; -} - -static void mdev_put_parent(struct mdev_parent *parent) -{ - if (parent) - kref_put(&parent->ref, mdev_release_parent); -} - /* Caller must hold parent unreg_sem read or write lock */ static void mdev_device_remove_common(struct mdev_device *mdev) { @@ -243,12 +229,9 @@ int mdev_device_create(struct mdev_type *type, const guid_t *uuid) { int ret; struct mdev_device *mdev, *tmp; - struct mdev_parent *parent; - - parent = mdev_get_parent(type->parent); - if (!parent) - return -EINVAL; + struct mdev_parent *parent = type->parent; + mdev_get_parent(parent); mutex_lock(&mdev_list_lock); /* Check for duplicate */ diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index f12e34e16ab9..fddab240ccc3 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -46,4 +46,16 @@ void mdev_remove_sysfs_files(struct mdev_device *mdev); int mdev_device_create(struct mdev_type *kobj, const guid_t *uuid); int mdev_device_remove(struct mdev_device *dev); +void mdev_release_parent(struct kref *kref); + +static inline void mdev_get_parent(struct mdev_parent *parent) +{ + kref_get(&parent->ref); +} + +static inline void mdev_put_parent(struct mdev_parent *parent) +{ + kref_put(&parent->ref, mdev_release_parent); +} + #endif /* MDEV_PRIVATE_H */ -- Gitee From 61c028bd8732f8db5ba45d907b0dbd1ca24247fc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Apr 2021 16:40:30 -0300 Subject: [PATCH 1624/2161] vfio/mdev: Add missing reference counting to mdev_type commit 9a302449a58d45d0ef2aab686f64b35919bc604c upstream. struct mdev_type holds a pointer to the kref'd object struct mdev_parent, but doesn't hold the kref. The lifetime of the parent becomes implicit because parent_remove_sysfs_files() is supposed to remove all the access before the parent can be freed, but this is very hard to reason about. Make it obviously correct by adding the missing get. 
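In other words: whoever stores a pointer to a kref-counted object takes a reference when storing it and drops it when the holder itself is released. A condensed sketch of that rule, reusing the hypothetical foo_parent helpers from the previous note (illustration only):

    struct foo_child {
            struct foo_parent *parent;      /* holds a reference for its whole lifetime */
    };

    static struct foo_child *foo_child_create(struct foo_parent *parent)
    {
            struct foo_child *child = kzalloc(sizeof(*child), GFP_KERNEL);

            if (!child)
                    return NULL;
            child->parent = parent;
            foo_get_parent(parent);         /* pairs with the put in foo_child_release() */
            return child;
    }

    static void foo_child_release(struct foo_child *child)
    {
            foo_put_parent(child->parent);  /* pairs with the get in foo_child_create() */
            kfree(child);
    }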
Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Message-Id: <7-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/mdev_sysfs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index bcfe48d56e8a..8c169d12ba7d 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -81,6 +81,8 @@ static void mdev_type_release(struct kobject *kobj) struct mdev_type *type = to_mdev_type(kobj); pr_debug("Releasing group %s\n", kobj->name); + /* Pairs with the get in add_mdev_supported_type() */ + mdev_put_parent(type->parent); kfree(type); } @@ -106,6 +108,8 @@ static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, type->kobj.kset = parent->mdev_types_kset; type->parent = parent; + /* Pairs with the put in mdev_type_release() */ + mdev_get_parent(parent); ret = kobject_init_and_add(&type->kobj, &mdev_type_ktype, NULL, "%s-%s", dev_driver_string(parent->dev), -- Gitee From 8ab53b3ba1ea01d1390e33c5e5a553bdabb526be Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Apr 2021 16:40:31 -0300 Subject: [PATCH 1625/2161] vfio/mdev: Reorganize mdev_device_create() commit fbd0e2b0c3d0b2eeaef471c9fe19ae5a7b2ee970 upstream. Once the memory for the struct mdev_device is allocated it should immediately be device_initialize()'d and filled in so that put_device() can always be used to undo the allocation. Place the mdev_get/put_parent() so that they are clearly protecting the mdev->parent pointer. Move the final put to the release function so that the lifetime rules are trivial to understand. Update the goto labels to follow the normal convention. Remove mdev_device_free() as the release function via device_put() is now usable in all cases. 
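The conversion follows the standard driver-core allocation idiom: call device_initialize() as soon as the allocation succeeds, so every later error path can be unwound with a single put_device() that funnels the final kfree() through the ->release() callback. A self-contained sketch of that idiom with hypothetical foo_* names (not the actual mdev code):

    #include <linux/device.h>
    #include <linux/slab.h>

    struct foo_device {
            struct device dev;
    };

    static void foo_dev_release(struct device *dev)
    {
            kfree(container_of(dev, struct foo_device, dev));
    }

    static int foo_device_create(struct device *parent, const char *name)
    {
            struct foo_device *foo;
            int ret;

            foo = kzalloc(sizeof(*foo), GFP_KERNEL);
            if (!foo)
                    return -ENOMEM;

            device_initialize(&foo->dev);
            foo->dev.parent = parent;
            foo->dev.release = foo_dev_release;

            ret = dev_set_name(&foo->dev, "%s", name);
            if (ret)
                    goto out_put_device;

            ret = device_add(&foo->dev);
            if (ret)
                    goto out_put_device;

            return 0;

    out_put_device:
            /* balances device_initialize(); frees the allocation via ->release() */
            put_device(&foo->dev);
            return ret;
    }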
Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Message-Id: <8-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/mdev_core.c | 60 ++++++++++++++++------------------- 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 7ec21c907397..f7559835b061 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -71,7 +71,6 @@ static void mdev_device_remove_common(struct mdev_device *mdev) /* Balances with device_initialize() */ put_device(&mdev->dev); - mdev_put_parent(parent); } static int mdev_device_remove_cb(struct device *dev, void *data) @@ -208,8 +207,13 @@ void mdev_unregister_device(struct device *dev) } EXPORT_SYMBOL(mdev_unregister_device); -static void mdev_device_free(struct mdev_device *mdev) +static void mdev_device_release(struct device *dev) { + struct mdev_device *mdev = to_mdev_device(dev); + + /* Pairs with the get in mdev_device_create() */ + mdev_put_parent(mdev->parent); + mutex_lock(&mdev_list_lock); list_del(&mdev->next); mutex_unlock(&mdev_list_lock); @@ -218,70 +222,61 @@ static void mdev_device_free(struct mdev_device *mdev) kfree(mdev); } -static void mdev_device_release(struct device *dev) -{ - struct mdev_device *mdev = to_mdev_device(dev); - - mdev_device_free(mdev); -} - int mdev_device_create(struct mdev_type *type, const guid_t *uuid) { int ret; struct mdev_device *mdev, *tmp; struct mdev_parent *parent = type->parent; - mdev_get_parent(parent); mutex_lock(&mdev_list_lock); /* Check for duplicate */ list_for_each_entry(tmp, &mdev_list, next) { if (guid_equal(&tmp->uuid, uuid)) { mutex_unlock(&mdev_list_lock); - ret = -EEXIST; - goto mdev_fail; + return -EEXIST; } } mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) { mutex_unlock(&mdev_list_lock); - ret = -ENOMEM; - goto mdev_fail; + return -ENOMEM; } + device_initialize(&mdev->dev); + mdev->dev.parent = parent->dev; + mdev->dev.bus = &mdev_bus_type; + mdev->dev.release = mdev_device_release; + mdev->dev.groups = parent->ops->mdev_attr_groups; + mdev->type = type; + mdev->parent = parent; + /* Pairs with the put in mdev_device_release() */ + mdev_get_parent(parent); + guid_copy(&mdev->uuid, uuid); list_add(&mdev->next, &mdev_list); mutex_unlock(&mdev_list_lock); - mdev->parent = parent; + dev_set_name(&mdev->dev, "%pUl", uuid); /* Check if parent unregistration has started */ if (!down_read_trylock(&parent->unreg_sem)) { - mdev_device_free(mdev); ret = -ENODEV; - goto mdev_fail; + goto out_put_device; } - device_initialize(&mdev->dev); - mdev->dev.parent = parent->dev; - mdev->dev.bus = &mdev_bus_type; - mdev->dev.release = mdev_device_release; - dev_set_name(&mdev->dev, "%pUl", uuid); - mdev->dev.groups = parent->ops->mdev_attr_groups; - mdev->type = type; - ret = parent->ops->create(&type->kobj, mdev); if (ret) - goto ops_create_fail; + goto out_unlock; ret = device_add(&mdev->dev); if (ret) - goto add_fail; + goto out_remove; ret = mdev_create_sysfs_files(mdev); if (ret) - goto sysfs_fail; + goto out_del; mdev->active = true; dev_dbg(&mdev->dev, "MDEV: created\n"); @@ -289,15 +284,14 @@ int mdev_device_create(struct mdev_type *type, const guid_t *uuid) return 0; -sysfs_fail: +out_del: device_del(&mdev->dev); -add_fail: +out_remove: parent->ops->remove(mdev); -ops_create_fail: +out_unlock: up_read(&parent->unreg_sem); +out_put_device: 
put_device(&mdev->dev); -mdev_fail: - mdev_put_parent(parent); return ret; } -- Gitee From 1bff566a9e53414297e9cae641d95642fcd79969 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Apr 2021 16:40:32 -0300 Subject: [PATCH 1626/2161] vfio/mdev: Add missing error handling to dev_set_name() commit 18d731242d5c67c0783126c42d3f85870cec2df5 upstream. This can fail, and seems to be a popular target for syzkaller error injection. Check the error return and unwind with put_device(). Fixes: 7b96953bc640 ("vfio: Mediated device Core driver") Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Reviewed-by: Max Gurtovoy Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Message-Id: <9-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/mdev_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index f7559835b061..4caedb3d4fbf 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -258,7 +258,9 @@ int mdev_device_create(struct mdev_type *type, const guid_t *uuid) list_add(&mdev->next, &mdev_list); mutex_unlock(&mdev_list_lock); - dev_set_name(&mdev->dev, "%pUl", uuid); + ret = dev_set_name(&mdev->dev, "%pUl", uuid); + if (ret) + goto out_put_device; /* Check if parent unregistration has started */ if (!down_read_trylock(&parent->unreg_sem)) { -- Gitee From 57885ff4ef93c9429539ae829f9b783784b697d9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Apr 2021 16:40:33 -0300 Subject: [PATCH 1627/2161] vfio/mdev: Remove duplicate storage of parent in mdev_device commit fbea43239074e16c91048f5ce70378664efbdb99 upstream. mdev_device->type->parent is the same thing. The struct mdev_device was relying on the kref on the mdev_parent to also indirectly hold a kref on the mdev_type pointer. Now that the type holds a kref on the parent we can directly kref the mdev_type and remove this implicit relationship. 
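The resulting lifetime chain, restated as an editor's sketch with a hypothetical helper (mdev core context): the mdev pins its mdev_type with kobject_get() until the mdev's ->release() runs, and the type in turn pins the parent via the kref added earlier in the series, so the parent stays reachable through the type for the whole life of the mdev.

    static struct mdev_parent *foo_mdev_to_parent(struct mdev_device *mdev)
    {
            /* valid for the whole lifetime of the mdev; no cached mdev->parent needed */
            return mdev->type->parent;
    }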
Reviewed-by: Christoph Hellwig Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Message-Id: <10-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/mdev_core.c | 13 +++++-------- drivers/vfio/mdev/vfio_mdev.c | 14 +++++++------- include/linux/mdev.h | 1 - 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 4caedb3d4fbf..2a20bdaf6142 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -29,7 +29,7 @@ static DEFINE_MUTEX(mdev_list_lock); struct device *mdev_parent_dev(struct mdev_device *mdev) { - return mdev->parent->dev; + return mdev->type->parent->dev; } EXPORT_SYMBOL(mdev_parent_dev); @@ -58,12 +58,11 @@ void mdev_release_parent(struct kref *kref) /* Caller must hold parent unreg_sem read or write lock */ static void mdev_device_remove_common(struct mdev_device *mdev) { - struct mdev_parent *parent; + struct mdev_parent *parent = mdev->type->parent; int ret; mdev_remove_sysfs_files(mdev); device_del(&mdev->dev); - parent = mdev->parent; lockdep_assert_held(&parent->unreg_sem); ret = parent->ops->remove(mdev); if (ret) @@ -212,7 +211,7 @@ static void mdev_device_release(struct device *dev) struct mdev_device *mdev = to_mdev_device(dev); /* Pairs with the get in mdev_device_create() */ - mdev_put_parent(mdev->parent); + kobject_put(&mdev->type->kobj); mutex_lock(&mdev_list_lock); list_del(&mdev->next); @@ -250,9 +249,8 @@ int mdev_device_create(struct mdev_type *type, const guid_t *uuid) mdev->dev.release = mdev_device_release; mdev->dev.groups = parent->ops->mdev_attr_groups; mdev->type = type; - mdev->parent = parent; /* Pairs with the put in mdev_device_release() */ - mdev_get_parent(parent); + kobject_get(&type->kobj); guid_copy(&mdev->uuid, uuid); list_add(&mdev->next, &mdev_list); @@ -300,7 +298,7 @@ int mdev_device_create(struct mdev_type *type, const guid_t *uuid) int mdev_device_remove(struct mdev_device *mdev) { struct mdev_device *tmp; - struct mdev_parent *parent; + struct mdev_parent *parent = mdev->type->parent; mutex_lock(&mdev_list_lock); list_for_each_entry(tmp, &mdev_list, next) { @@ -321,7 +319,6 @@ int mdev_device_remove(struct mdev_device *mdev) mdev->active = false; mutex_unlock(&mdev_list_lock); - parent = mdev->parent; /* Check if parent unregistration has started */ if (!down_read_trylock(&parent->unreg_sem)) return -ENODEV; diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index cc9507ed85a1..922729071c5a 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -24,7 +24,7 @@ static int vfio_mdev_open(struct vfio_device *core_vdev) { struct mdev_device *mdev = to_mdev_device(core_vdev->dev); - struct mdev_parent *parent = mdev->parent; + struct mdev_parent *parent = mdev->type->parent; int ret; @@ -44,7 +44,7 @@ static int vfio_mdev_open(struct vfio_device *core_vdev) static void vfio_mdev_release(struct vfio_device *core_vdev) { struct mdev_device *mdev = to_mdev_device(core_vdev->dev); - struct mdev_parent *parent = mdev->parent; + struct mdev_parent *parent = mdev->type->parent; if (likely(parent->ops->release)) parent->ops->release(mdev); @@ -56,7 +56,7 @@ static long vfio_mdev_unlocked_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { struct mdev_device *mdev = to_mdev_device(core_vdev->dev); - struct mdev_parent *parent = mdev->parent; + struct mdev_parent *parent = 
mdev->type->parent; if (unlikely(!parent->ops->ioctl)) return -EINVAL; @@ -68,7 +68,7 @@ static ssize_t vfio_mdev_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos) { struct mdev_device *mdev = to_mdev_device(core_vdev->dev); - struct mdev_parent *parent = mdev->parent; + struct mdev_parent *parent = mdev->type->parent; if (unlikely(!parent->ops->read)) return -EINVAL; @@ -81,7 +81,7 @@ static ssize_t vfio_mdev_write(struct vfio_device *core_vdev, loff_t *ppos) { struct mdev_device *mdev = to_mdev_device(core_vdev->dev); - struct mdev_parent *parent = mdev->parent; + struct mdev_parent *parent = mdev->type->parent; if (unlikely(!parent->ops->write)) return -EINVAL; @@ -93,7 +93,7 @@ static int vfio_mdev_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) { struct mdev_device *mdev = to_mdev_device(core_vdev->dev); - struct mdev_parent *parent = mdev->parent; + struct mdev_parent *parent = mdev->type->parent; if (unlikely(!parent->ops->mmap)) return -EINVAL; @@ -104,7 +104,7 @@ static int vfio_mdev_mmap(struct vfio_device *core_vdev, static void vfio_mdev_request(struct vfio_device *core_vdev, unsigned int count) { struct mdev_device *mdev = to_mdev_device(core_vdev->dev); - struct mdev_parent *parent = mdev->parent; + struct mdev_parent *parent = mdev->type->parent; if (parent->ops->request) parent->ops->request(mdev, count); diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 0792476310a9..f222e4e11a96 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -14,7 +14,6 @@ struct mdev_type; struct mdev_device { struct device dev; - struct mdev_parent *parent; guid_t uuid; void *driver_data; struct list_head next; -- Gitee From 01c4966664649a1b01b113c3238e36e44ac31c02 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Apr 2021 16:40:34 -0300 Subject: [PATCH 1628/2161] vfio/mdev: Add mdev/mtype_get_type_group_id() commit 15fcc44be0c7afa2945b1896a96ac2ddf09f1fa7 upstream. This returns the index in the supported_type_groups array that is associated with the mdev_type attached to the struct mdev_device or its containing struct kobject. Each mdev_device can be spawned from exactly one mdev_type, which in turn originates from exactly one supported_type_group. Drivers are using weird string calculations to try and get back to this index, providing a direct access to the index removes a bunch of wonky driver code. mdev_type->group can be deleted as the group is obtained using the type_group_id. Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Message-Id: <11-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/mdev_core.c | 20 ++++++++++++++++++++ drivers/vfio/mdev/mdev_private.h | 2 +- drivers/vfio/mdev/mdev_sysfs.c | 15 +++++++++------ include/linux/mdev.h | 3 +++ 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 2a20bdaf6142..5ae06f951a09 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -33,6 +33,26 @@ struct device *mdev_parent_dev(struct mdev_device *mdev) } EXPORT_SYMBOL(mdev_parent_dev); +/* + * Return the index in supported_type_groups that this mdev_device was created + * from. 
+ */ +unsigned int mdev_get_type_group_id(struct mdev_device *mdev) +{ + return mdev->type->type_group_id; +} +EXPORT_SYMBOL(mdev_get_type_group_id); + +/* + * Used in mdev_type_attribute sysfs functions to return the index in the + * supported_type_groups that the sysfs is called from. + */ +unsigned int mtype_get_type_group_id(struct kobject *mtype_kobj) +{ + return container_of(mtype_kobj, struct mdev_type, kobj)->type_group_id; +} +EXPORT_SYMBOL(mtype_get_type_group_id); + /* Should be called holding parent_list_lock */ static struct mdev_parent *__find_parent_device(struct device *dev) { diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index fddab240ccc3..6999c89db7b1 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -29,7 +29,7 @@ struct mdev_type { struct kobject *devices_kobj; struct mdev_parent *parent; struct list_head next; - struct attribute_group *group; + unsigned int type_group_id; }; #define to_mdev_type_attr(_attr) \ diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 8c169d12ba7d..712fbc78b12e 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -92,9 +92,11 @@ static struct kobj_type mdev_type_ktype = { }; static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, - struct attribute_group *group) + unsigned int type_group_id) { struct mdev_type *type; + struct attribute_group *group = + parent->ops->supported_type_groups[type_group_id]; int ret; if (!group->name) { @@ -110,6 +112,7 @@ static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, type->parent = parent; /* Pairs with the put in mdev_type_release() */ mdev_get_parent(parent); + type->type_group_id = type_group_id; ret = kobject_init_and_add(&type->kobj, &mdev_type_ktype, NULL, "%s-%s", dev_driver_string(parent->dev), @@ -135,8 +138,6 @@ static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, ret = -ENOMEM; goto attrs_failed; } - - type->group = group; return type; attrs_failed: @@ -151,8 +152,11 @@ static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, static void remove_mdev_supported_type(struct mdev_type *type) { + struct attribute_group *group = + type->parent->ops->supported_type_groups[type->type_group_id]; + sysfs_remove_files(&type->kobj, - (const struct attribute **)type->group->attrs); + (const struct attribute **)group->attrs); kobject_put(type->devices_kobj); sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr); kobject_del(&type->kobj); @@ -166,8 +170,7 @@ static int add_mdev_supported_type_groups(struct mdev_parent *parent) for (i = 0; parent->ops->supported_type_groups[i]; i++) { struct mdev_type *type; - type = add_mdev_supported_type(parent, - parent->ops->supported_type_groups[i]); + type = add_mdev_supported_type(parent, i); if (IS_ERR(type)) { struct mdev_type *ltype, *tmp; diff --git a/include/linux/mdev.h b/include/linux/mdev.h index f222e4e11a96..3a0743bb5273 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -46,6 +46,9 @@ static inline struct device *mdev_get_iommu_device(struct mdev_device *mdev) return mdev->iommu_device; } +unsigned int mdev_get_type_group_id(struct mdev_device *mdev); +unsigned int mtype_get_type_group_id(struct kobject *mtype_kobj); + /** * struct mdev_parent_ops - Structure to be registered for each parent device to * register the device to mdev module. 
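As an illustration (editor's sketch, not part of the patch), a hypothetical vendor driver whose foo_types[] array is declared in the same order as its supported_type_groups[] can now index it directly. Note that at this point in the series the sysfs-side helper still takes the type's kobject; a later patch retypes it to struct mdev_type *.

    static const struct foo_type {
            const char *name;
            unsigned int max_instances;
    } foo_types[] = {
            { .name = "small", .max_instances = 4 },
            { .name = "large", .max_instances = 1 },
    };

    static int foo_create(struct kobject *kobj, struct mdev_device *mdev)
    {
            const struct foo_type *type = &foo_types[mdev_get_type_group_id(mdev)];

            /* size the new instance from *type instead of parsing kobj->name */
            if (!type->max_instances)
                    return -EINVAL;
            return 0;
    }

    static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf)
    {
            return sprintf(buf, "%s\n", foo_types[mtype_get_type_group_id(kobj)].name);
    }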
-- Gitee From 9ede77ed7b303e72c6775e55de201d1a7f53078a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Apr 2021 16:40:35 -0300 Subject: [PATCH 1629/2161] vfio/mtty: Use mdev_get_type_group_id() commit c594b26ff78e2cb315101ade73155baf868158eb upstream. The type_group_id directly gives the single or dual port index, no need for string searching. Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Message-Id: <12-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- samples/vfio-mdev/mtty.c | 50 ++++++---------------------------------- 1 file changed, 7 insertions(+), 43 deletions(-) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index ce84a300a4da..191a587a8d5a 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -711,23 +711,7 @@ static ssize_t mdev_access(struct mdev_device *mdev, u8 *buf, size_t count, static int mtty_create(struct kobject *kobj, struct mdev_device *mdev) { struct mdev_state *mdev_state; - char name[MTTY_STRING_LEN]; - int nr_ports = 0, i; - - if (!mdev) - return -EINVAL; - - for (i = 0; i < 2; i++) { - snprintf(name, MTTY_STRING_LEN, "%s-%d", - dev_driver_string(mdev_parent_dev(mdev)), i + 1); - if (!strcmp(kobj->name, name)) { - nr_ports = i + 1; - break; - } - } - - if (!nr_ports) - return -EINVAL; + int nr_ports = mdev_get_type_group_id(mdev) + 1; mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); if (mdev_state == NULL) @@ -1311,18 +1295,11 @@ static const struct attribute_group *mdev_dev_groups[] = { static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf) { - char name[MTTY_STRING_LEN]; - int i; - const char *name_str[2] = {"Single port serial", "Dual port serial"}; + static const char *name_str[2] = { "Single port serial", + "Dual port serial" }; - for (i = 0; i < 2; i++) { - snprintf(name, MTTY_STRING_LEN, "%s-%d", - dev_driver_string(dev), i + 1); - if (!strcmp(kobj->name, name)) - return sprintf(buf, "%s\n", name_str[i]); - } - - return -EINVAL; + return sysfs_emit(buf, "%s\n", + name_str[mtype_get_type_group_id(kobj)]); } static MDEV_TYPE_ATTR_RO(name); @@ -1330,22 +1307,9 @@ static MDEV_TYPE_ATTR_RO(name); static ssize_t available_instances_show(struct kobject *kobj, struct device *dev, char *buf) { - char name[MTTY_STRING_LEN]; - int i; struct mdev_state *mds; - int ports = 0, used = 0; - - for (i = 0; i < 2; i++) { - snprintf(name, MTTY_STRING_LEN, "%s-%d", - dev_driver_string(dev), i + 1); - if (!strcmp(kobj->name, name)) { - ports = i + 1; - break; - } - } - - if (!ports) - return -EINVAL; + unsigned int ports = mtype_get_type_group_id(kobj) + 1; + int used = 0; list_for_each_entry(mds, &mdev_devices_list, next) used += mds->nr_ports; -- Gitee From f9217ab56d1f8d2bc34fbf9df03a882b2e1b405a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Apr 2021 16:40:36 -0300 Subject: [PATCH 1630/2161] vfio/mdpy: Use mdev_get_type_group_id() commit adc9d1f6f5db811f5269cfc66c48fc0cab6c041c upstream. The mdpy_types array is parallel to the supported_type_groups array, so the type_group_id indexes both. Instead of doing string searching just directly index with type_group_id in all places. 
Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Message-Id: <13-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- samples/vfio-mdev/mdpy.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index d4ec2b52ca49..08c15f9f06a8 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -99,16 +99,6 @@ struct mdev_state { void *memblk; }; -static const struct mdpy_type *mdpy_find_type(struct kobject *kobj) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mdpy_types); i++) - if (strcmp(mdpy_types[i].name, kobj->name) == 0) - return mdpy_types + i; - return NULL; -} - static void mdpy_create_config_space(struct mdev_state *mdev_state) { STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID], @@ -228,7 +218,8 @@ static int mdpy_reset(struct mdev_device *mdev) static int mdpy_create(struct kobject *kobj, struct mdev_device *mdev) { - const struct mdpy_type *type = mdpy_find_type(kobj); + const struct mdpy_type *type = + &mdpy_types[mdev_get_type_group_id(mdev)]; struct device *dev = mdev_dev(mdev); struct mdev_state *mdev_state; u32 fbsize; @@ -246,8 +237,6 @@ static int mdpy_create(struct kobject *kobj, struct mdev_device *mdev) return -ENOMEM; } - if (!type) - type = &mdpy_types[0]; fbsize = roundup_pow_of_two(type->width * type->height * type->bytepp); mdev_state->memblk = vmalloc_user(fbsize); @@ -256,8 +245,8 @@ static int mdpy_create(struct kobject *kobj, struct mdev_device *mdev) kfree(mdev_state); return -ENOMEM; } - dev_info(dev, "%s: %s (%dx%d)\n", - __func__, kobj->name, type->width, type->height); + dev_info(dev, "%s: %s (%dx%d)\n", __func__, type->name, type->width, + type->height); mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; @@ -673,7 +662,8 @@ static MDEV_TYPE_ATTR_RO(name); static ssize_t description_show(struct kobject *kobj, struct device *dev, char *buf) { - const struct mdpy_type *type = mdpy_find_type(kobj); + const struct mdpy_type *type = + &mdpy_types[mtype_get_type_group_id(kobj)]; return sprintf(buf, "virtual display, %dx%d framebuffer\n", type ? type->width : 0, -- Gitee From 02d2a1e1ba5d0bc6f09311dcf3c3492b3ca9a142 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Apr 2021 16:40:37 -0300 Subject: [PATCH 1631/2161] vfio/mbochs: Use mdev_get_type_group_id() commit 3d3a360e5706169a60fd4f26a9cec7da196a41c9 upstream. The mbochs_types array is parallel to the supported_type_groups array, so the type_group_id indexes both. Instead of doing string searching just directly index with type_group_id in all places. 
Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Message-Id: <14-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- samples/vfio-mdev/mbochs.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 93f42631452e..32b57ce55d55 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -205,16 +205,6 @@ static struct page *__mbochs_get_page(struct mdev_state *mdev_state, static struct page *mbochs_get_page(struct mdev_state *mdev_state, pgoff_t pgoff); -static const struct mbochs_type *mbochs_find_type(struct kobject *kobj) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mbochs_types); i++) - if (strcmp(mbochs_types[i].name, kobj->name) == 0) - return mbochs_types + i; - return NULL; -} - static void mbochs_create_config_space(struct mdev_state *mdev_state) { STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID], @@ -518,7 +508,8 @@ static int mbochs_reset(struct mdev_device *mdev) static int mbochs_create(struct kobject *kobj, struct mdev_device *mdev) { - const struct mbochs_type *type = mbochs_find_type(kobj); + const struct mbochs_type *type = + &mbochs_types[mdev_get_type_group_id(mdev)]; struct device *dev = mdev_dev(mdev); struct mdev_state *mdev_state; @@ -544,7 +535,7 @@ static int mbochs_create(struct kobject *kobj, struct mdev_device *mdev) goto err_mem; dev_info(dev, "%s: %s, %d MB, %ld pages\n", __func__, - kobj->name, type->mbytes, mdev_state->pagecount); + type->name, type->mbytes, mdev_state->pagecount); mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; @@ -1364,7 +1355,8 @@ static MDEV_TYPE_ATTR_RO(name); static ssize_t description_show(struct kobject *kobj, struct device *dev, char *buf) { - const struct mbochs_type *type = mbochs_find_type(kobj); + const struct mbochs_type *type = + &mbochs_types[mtype_get_type_group_id(kobj)]; return sprintf(buf, "virtual display, %d MB video memory\n", type ? type->mbytes : 0); @@ -1374,7 +1366,8 @@ static MDEV_TYPE_ATTR_RO(description); static ssize_t available_instances_show(struct kobject *kobj, struct device *dev, char *buf) { - const struct mbochs_type *type = mbochs_find_type(kobj); + const struct mbochs_type *type = + &mbochs_types[mtype_get_type_group_id(kobj)]; int count = (max_mbytes - mbochs_used_mbytes) / type->mbytes; return sprintf(buf, "%d\n", count); -- Gitee From e70b25a5dea6136003e71b95ca5838272732ce8a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Apr 2021 16:40:40 -0300 Subject: [PATCH 1632/2161] vfio/mdev: Remove kobj from mdev_parent_ops->create() commit c2ef2f50ad0ccf5460bf4824bc6669240b6c7936 upstream. The kobj here is a type-erased version of mdev_type, which is already stored in the struct mdev_device being passed in. It was only ever used to compute the type_group_id, which is now extracted directly from the mdev. 
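Continuing the hypothetical foo_* driver sketched under the type_group_id patch above (illustration only, not part of the patch), its create callback after this change loses the kobject argument entirely and works from the mdev alone:

    static int foo_create(struct mdev_device *mdev)
    {
            const struct foo_type *type = &foo_types[mdev_get_type_group_id(mdev)];

            dev_info(mdev_dev(mdev), "creating a '%s' instance\n", type->name);
            return 0;
    }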
Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Message-Id: <17-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/mdev_core.c | 2 +- include/linux/mdev.h | 3 +-- samples/vfio-mdev/mbochs.c | 2 +- samples/vfio-mdev/mdpy.c | 2 +- samples/vfio-mdev/mtty.c | 2 +- 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 5ae06f951a09..10eff33ce1f2 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -286,7 +286,7 @@ int mdev_device_create(struct mdev_type *type, const guid_t *uuid) goto out_put_device; } - ret = parent->ops->create(&type->kobj, mdev); + ret = parent->ops->create(mdev); if (ret) goto out_unlock; diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 3a0743bb5273..ba513e34844b 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -61,7 +61,6 @@ unsigned int mtype_get_type_group_id(struct kobject *mtype_kobj); * @create: Called to allocate basic resources in parent device's * driver for a particular mediated device. It is * mandatory to provide create ops. - * @kobj: kobject of type for which 'create' is called. * @mdev: mdev_device structure on of mediated device * that is being created * Returns integer: success (0) or error (< 0) @@ -107,7 +106,7 @@ struct mdev_parent_ops { const struct attribute_group **mdev_attr_groups; struct attribute_group **supported_type_groups; - int (*create)(struct kobject *kobj, struct mdev_device *mdev); + int (*create)(struct mdev_device *mdev); int (*remove)(struct mdev_device *mdev); int (*open)(struct mdev_device *mdev); void (*release)(struct mdev_device *mdev); diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 32b57ce55d55..ed26dbe063bd 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -506,7 +506,7 @@ static int mbochs_reset(struct mdev_device *mdev) return 0; } -static int mbochs_create(struct kobject *kobj, struct mdev_device *mdev) +static int mbochs_create(struct mdev_device *mdev) { const struct mbochs_type *type = &mbochs_types[mdev_get_type_group_id(mdev)]; diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 08c15f9f06a8..da88fd7dd423 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -216,7 +216,7 @@ static int mdpy_reset(struct mdev_device *mdev) return 0; } -static int mdpy_create(struct kobject *kobj, struct mdev_device *mdev) +static int mdpy_create(struct mdev_device *mdev) { const struct mdpy_type *type = &mdpy_types[mdev_get_type_group_id(mdev)]; diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 191a587a8d5a..f2e36c06ac6a 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -708,7 +708,7 @@ static ssize_t mdev_access(struct mdev_device *mdev, u8 *buf, size_t count, return ret; } -static int mtty_create(struct kobject *kobj, struct mdev_device *mdev) +static int mtty_create(struct mdev_device *mdev) { struct mdev_state *mdev_state; int nr_ports = mdev_get_type_group_id(mdev) + 1; -- Gitee From 4804eaccc2985db994a5a507ad7edb4b6c162c7f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Apr 2021 16:40:41 -0300 Subject: [PATCH 1633/2161] vfio/mdev: Correct the function signatures for the mdev_type_attributes commit 9169cff168ff262b4b78597f542e23843d0c494a upstream. 
The driver core standard is to pass in the properly typed object, the properly typed attribute and the buffer data. It stems from the root kobject method: ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *attr,..) Each subclass of kobject should provide their own function with the same signature but more specific types, eg struct device uses: ssize_t (*show)(struct device *dev, struct device_attribute *attr,..) In this case the existing signature is: ssize_t (*show)(struct kobject *kobj, struct device *dev,..) Where kobj is a 'struct mdev_type *' and dev is 'mdev_type->parent->dev'. Change the mdev_type related sysfs attribute functions to: ssize_t (*show)(struct mdev_type *mtype, struct mdev_type_attribute *attr,..) In order to restore type safety and match the driver core standard There are no current users of 'attr', but if it is ever needed it would be hard to add in retroactively, so do it now. Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Message-Id: <18-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/mdev_core.c | 14 ++++++++++++-- drivers/vfio/mdev/mdev_sysfs.c | 11 ++++++----- include/linux/mdev.h | 11 +++++++---- samples/vfio-mdev/mbochs.c | 26 +++++++++++++++----------- samples/vfio-mdev/mdpy.c | 24 ++++++++++++++---------- samples/vfio-mdev/mtty.c | 18 +++++++++--------- 6 files changed, 63 insertions(+), 41 deletions(-) diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 10eff33ce1f2..2a85d6fcb7dd 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -47,12 +47,22 @@ EXPORT_SYMBOL(mdev_get_type_group_id); * Used in mdev_type_attribute sysfs functions to return the index in the * supported_type_groups that the sysfs is called from. 
*/ -unsigned int mtype_get_type_group_id(struct kobject *mtype_kobj) +unsigned int mtype_get_type_group_id(struct mdev_type *mtype) { - return container_of(mtype_kobj, struct mdev_type, kobj)->type_group_id; + return mtype->type_group_id; } EXPORT_SYMBOL(mtype_get_type_group_id); +/* + * Used in mdev_type_attribute sysfs functions to return the parent struct + * device + */ +struct device *mtype_get_parent_dev(struct mdev_type *mtype) +{ + return mtype->parent->dev; +} +EXPORT_SYMBOL(mtype_get_parent_dev); + /* Should be called holding parent_list_lock */ static struct mdev_parent *__find_parent_device(struct device *dev) { diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 712fbc78b12e..f5cf1931c54e 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -26,7 +26,7 @@ static ssize_t mdev_type_attr_show(struct kobject *kobj, ssize_t ret = -EIO; if (attr->show) - ret = attr->show(kobj, type->parent->dev, buf); + ret = attr->show(type, attr, buf); return ret; } @@ -39,7 +39,7 @@ static ssize_t mdev_type_attr_store(struct kobject *kobj, ssize_t ret = -EIO; if (attr->store) - ret = attr->store(&type->kobj, type->parent->dev, buf, count); + ret = attr->store(type, attr, buf, count); return ret; } @@ -48,8 +48,9 @@ static const struct sysfs_ops mdev_type_sysfs_ops = { .store = mdev_type_attr_store, }; -static ssize_t create_store(struct kobject *kobj, struct device *dev, - const char *buf, size_t count) +static ssize_t create_store(struct mdev_type *mtype, + struct mdev_type_attribute *attr, const char *buf, + size_t count) { char *str; guid_t uuid; @@ -67,7 +68,7 @@ static ssize_t create_store(struct kobject *kobj, struct device *dev, if (ret) return ret; - ret = mdev_device_create(to_mdev_type(kobj), &uuid); + ret = mdev_device_create(mtype, &uuid); if (ret) return ret; diff --git a/include/linux/mdev.h b/include/linux/mdev.h index ba513e34844b..de12d3a3ded0 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -47,7 +47,8 @@ static inline struct device *mdev_get_iommu_device(struct mdev_device *mdev) } unsigned int mdev_get_type_group_id(struct mdev_device *mdev); -unsigned int mtype_get_type_group_id(struct kobject *mtype_kobj); +unsigned int mtype_get_type_group_id(struct mdev_type *mtype); +struct device *mtype_get_parent_dev(struct mdev_type *mtype); /** * struct mdev_parent_ops - Structure to be registered for each parent device to @@ -123,9 +124,11 @@ struct mdev_parent_ops { /* interface for exporting mdev supported type attributes */ struct mdev_type_attribute { struct attribute attr; - ssize_t (*show)(struct kobject *kobj, struct device *dev, char *buf); - ssize_t (*store)(struct kobject *kobj, struct device *dev, - const char *buf, size_t count); + ssize_t (*show)(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf); + ssize_t (*store)(struct mdev_type *mtype, + struct mdev_type_attribute *attr, const char *buf, + size_t count); }; #define MDEV_TYPE_ATTR(_name, _mode, _show, _store) \ diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index ed26dbe063bd..b033576d1d3f 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1345,37 +1345,41 @@ static const struct attribute_group *mdev_dev_groups[] = { NULL, }; -static ssize_t -name_show(struct kobject *kobj, struct device *dev, char *buf) +static ssize_t name_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) { - return sprintf(buf, "%s\n", kobj->name); + const struct mbochs_type 
*type = + &mbochs_types[mtype_get_type_group_id(mtype)]; + + return sprintf(buf, "%s\n", type->name); } static MDEV_TYPE_ATTR_RO(name); -static ssize_t -description_show(struct kobject *kobj, struct device *dev, char *buf) +static ssize_t description_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) { const struct mbochs_type *type = - &mbochs_types[mtype_get_type_group_id(kobj)]; + &mbochs_types[mtype_get_type_group_id(mtype)]; return sprintf(buf, "virtual display, %d MB video memory\n", type ? type->mbytes : 0); } static MDEV_TYPE_ATTR_RO(description); -static ssize_t -available_instances_show(struct kobject *kobj, struct device *dev, char *buf) +static ssize_t available_instances_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, + char *buf) { const struct mbochs_type *type = - &mbochs_types[mtype_get_type_group_id(kobj)]; + &mbochs_types[mtype_get_type_group_id(mtype)]; int count = (max_mbytes - mbochs_used_mbytes) / type->mbytes; return sprintf(buf, "%d\n", count); } static MDEV_TYPE_ATTR_RO(available_instances); -static ssize_t device_api_show(struct kobject *kobj, struct device *dev, - char *buf) +static ssize_t device_api_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) { return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); } diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index da88fd7dd423..885b88ea20e2 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -652,18 +652,21 @@ static const struct attribute_group *mdev_dev_groups[] = { NULL, }; -static ssize_t -name_show(struct kobject *kobj, struct device *dev, char *buf) +static ssize_t name_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) { - return sprintf(buf, "%s\n", kobj->name); + const struct mdpy_type *type = + &mdpy_types[mtype_get_type_group_id(mtype)]; + + return sprintf(buf, "%s\n", type->name); } static MDEV_TYPE_ATTR_RO(name); -static ssize_t -description_show(struct kobject *kobj, struct device *dev, char *buf) +static ssize_t description_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) { const struct mdpy_type *type = - &mdpy_types[mtype_get_type_group_id(kobj)]; + &mdpy_types[mtype_get_type_group_id(mtype)]; return sprintf(buf, "virtual display, %dx%d framebuffer\n", type ? 
type->width : 0, @@ -671,15 +674,16 @@ description_show(struct kobject *kobj, struct device *dev, char *buf) } static MDEV_TYPE_ATTR_RO(description); -static ssize_t -available_instances_show(struct kobject *kobj, struct device *dev, char *buf) +static ssize_t available_instances_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, + char *buf) { return sprintf(buf, "%d\n", max_devices - mdpy_count); } static MDEV_TYPE_ATTR_RO(available_instances); -static ssize_t device_api_show(struct kobject *kobj, struct device *dev, - char *buf) +static ssize_t device_api_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) { return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); } diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index f2e36c06ac6a..b9b24be4abda 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1292,23 +1292,24 @@ static const struct attribute_group *mdev_dev_groups[] = { NULL, }; -static ssize_t -name_show(struct kobject *kobj, struct device *dev, char *buf) +static ssize_t name_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) { static const char *name_str[2] = { "Single port serial", "Dual port serial" }; return sysfs_emit(buf, "%s\n", - name_str[mtype_get_type_group_id(kobj)]); + name_str[mtype_get_type_group_id(mtype)]); } static MDEV_TYPE_ATTR_RO(name); -static ssize_t -available_instances_show(struct kobject *kobj, struct device *dev, char *buf) +static ssize_t available_instances_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, + char *buf) { struct mdev_state *mds; - unsigned int ports = mtype_get_type_group_id(kobj) + 1; + unsigned int ports = mtype_get_type_group_id(mtype) + 1; int used = 0; list_for_each_entry(mds, &mdev_devices_list, next) @@ -1319,9 +1320,8 @@ available_instances_show(struct kobject *kobj, struct device *dev, char *buf) static MDEV_TYPE_ATTR_RO(available_instances); - -static ssize_t device_api_show(struct kobject *kobj, struct device *dev, - char *buf) +static ssize_t device_api_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) { return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); } -- Gitee From 2481f834831f4ee052c571c16b58d8ab6183887b Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 13 May 2021 18:01:55 -0500 Subject: [PATCH 1634/2161] vfio/iommu_type1: Use struct_size() for kzalloc() commit 78b238147e4d241bc1681d2559477c995f9dcb0a upstream. Make use of the struct_size() helper instead of an open-coded version, in order to avoid any potential type mistakes or integer overflows that, in the worst scenario, could lead to heap overflows. This code was detected with the help of Coccinelle and, audited and fixed manually. Signed-off-by: Gustavo A. R. 
Silva Message-Id: <20210513230155.GA217517@embeddedor> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 7628d45b55c3..48d435d7b969 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2799,7 +2799,7 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu, return 0; } - size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges)); + size = struct_size(cap_iovas, iova_ranges, iovas); cap_iovas = kzalloc(size, GFP_KERNEL); if (!cap_iovas) -- Gitee From 11471a1606f8e67e4779e673b8cc0abce8372ec6 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 18 May 2021 22:21:31 +0300 Subject: [PATCH 1635/2161] vfio/platform: fix module_put call in error flow commit dc51ff91cf2d1e9a2d941da483602f71d4a51472 upstream. The ->parent_module is the one that use in try_module_get. It should also be the one the we use in module_put during vfio_platform_open(). Fixes: 32a2d71c4e80 ("vfio: platform: introduce vfio-platform-base module") Signed-off-by: Max Gurtovoy Message-Id: <20210518192133.59195-1-mgurtovoy@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Sasha Levin Signed-off-by: Xinghui Li --- drivers/vfio/platform/vfio_platform_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 361e5b57e369..470fcf7dac56 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -291,7 +291,7 @@ static int vfio_platform_open(struct vfio_device *core_vdev) vfio_platform_regions_cleanup(vdev); err_reg: mutex_unlock(&driver_lock); - module_put(THIS_MODULE); + module_put(vdev->parent_module); return ret; } -- Gitee From 5279b9e9e0bf4e8553a5262d92670ee057faa42a Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 18 May 2021 22:21:32 +0300 Subject: [PATCH 1636/2161] vfio: centralize module refcount in subsystem layer commit 9dcf01d95721261844d8c07c142efc143f7d38e3 upstream. Remove code duplication and move module refcounting to the subsystem module. 
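For clarity, the core-side pattern introduced by the vfio.c hunks below boils down to the following sketch. The helper names are invented here for illustration only; in the actual patch the same logic is inlined into vfio_group_get_device_fd() and vfio_device_fops_release().

static int vfio_open_device_pinned(struct vfio_device *device)
{
	int ret;

	/* pin the module that registered this vfio_device */
	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	ret = device->ops->open(device);
	if (ret)
		module_put(device->dev->driver->owner);	/* drop on failure */
	return ret;
}

static void vfio_release_device_pinned(struct vfio_device *device)
{
	device->ops->release(device);
	module_put(device->dev->driver->owner);		/* drop on close */
}

With the core holding the reference while the device is open, the mdev, pci and platform back ends no longer need to take a module reference in their own open paths.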
Signed-off-by: Max Gurtovoy Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20210518192133.59195-2-mgurtovoy@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/vfio_mdev.c | 13 +------------ drivers/vfio/pci/vfio_pci.c | 7 ------- drivers/vfio/platform/vfio_platform_common.c | 6 ------ drivers/vfio/vfio.c | 10 ++++++++++ 4 files changed, 11 insertions(+), 25 deletions(-) diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index 922729071c5a..5ef4815609ed 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -26,19 +26,10 @@ static int vfio_mdev_open(struct vfio_device *core_vdev) struct mdev_device *mdev = to_mdev_device(core_vdev->dev); struct mdev_parent *parent = mdev->type->parent; - int ret; - if (unlikely(!parent->ops->open)) return -EINVAL; - if (!try_module_get(THIS_MODULE)) - return -ENODEV; - - ret = parent->ops->open(mdev); - if (ret) - module_put(THIS_MODULE); - - return ret; + return parent->ops->open(mdev); } static void vfio_mdev_release(struct vfio_device *core_vdev) @@ -48,8 +39,6 @@ static void vfio_mdev_release(struct vfio_device *core_vdev) if (likely(parent->ops->release)) parent->ops->release(mdev); - - module_put(THIS_MODULE); } static long vfio_mdev_unlocked_ioctl(struct vfio_device *core_vdev, diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 1504ba6a63c8..e03d560b79c6 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -576,8 +576,6 @@ static void vfio_pci_release(struct vfio_device *core_vdev) } mutex_unlock(&vdev->reflck->lock); - - module_put(THIS_MODULE); } static int vfio_pci_open(struct vfio_device *core_vdev) @@ -586,9 +584,6 @@ static int vfio_pci_open(struct vfio_device *core_vdev) container_of(core_vdev, struct vfio_pci_device, vdev); int ret = 0; - if (!try_module_get(THIS_MODULE)) - return -ENODEV; - mutex_lock(&vdev->reflck->lock); if (!vdev->refcnt) { @@ -602,8 +597,6 @@ static int vfio_pci_open(struct vfio_device *core_vdev) vdev->refcnt++; error: mutex_unlock(&vdev->reflck->lock); - if (ret) - module_put(THIS_MODULE); return ret; } diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 470fcf7dac56..703164df7637 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -241,8 +241,6 @@ static void vfio_platform_release(struct vfio_device *core_vdev) } mutex_unlock(&driver_lock); - - module_put(vdev->parent_module); } static int vfio_platform_open(struct vfio_device *core_vdev) @@ -251,9 +249,6 @@ static int vfio_platform_open(struct vfio_device *core_vdev) container_of(core_vdev, struct vfio_platform_device, vdev); int ret; - if (!try_module_get(vdev->parent_module)) - return -ENODEV; - mutex_lock(&driver_lock); if (!vdev->refcnt) { @@ -291,7 +286,6 @@ static int vfio_platform_open(struct vfio_device *core_vdev) vfio_platform_regions_cleanup(vdev); err_reg: mutex_unlock(&driver_lock); - module_put(vdev->parent_module); return ret; } diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 5e631c359ef2..02cc51ce6891 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1369,8 +1369,14 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) if (IS_ERR(device)) return PTR_ERR(device); + if (!try_module_get(device->dev->driver->owner)) { + vfio_device_put(device); + return -ENODEV; + } + ret = device->ops->open(device); if (ret) { + 
module_put(device->dev->driver->owner); vfio_device_put(device); return ret; } @@ -1382,6 +1388,7 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) ret = get_unused_fd_flags(O_CLOEXEC); if (ret < 0) { device->ops->release(device); + module_put(device->dev->driver->owner); vfio_device_put(device); return ret; } @@ -1392,6 +1399,7 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) put_unused_fd(ret); ret = PTR_ERR(filep); device->ops->release(device); + module_put(device->dev->driver->owner); vfio_device_put(device); return ret; } @@ -1550,6 +1558,8 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) device->ops->release(device); + module_put(device->dev->driver->owner); + vfio_group_try_dissolve_container(device->group); vfio_device_put(device); -- Gitee From 7e9357d98219de60e765a8eff427dcf7bbf5e1c3 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 18 May 2021 22:21:33 +0300 Subject: [PATCH 1637/2161] vfio/platform: remove unneeded parent_module attribute commit 3b62a62429b26709895846180c93f0c21547f7ac upstream. The vfio core driver is now taking refcount on the provider drivers, remove redundant parent_module attribute from vfio_platform_device structure. Signed-off-by: Max Gurtovoy Acked-by: Eric Auger Link: https://lore.kernel.org/r/20210518192133.59195-3-mgurtovoy@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/platform/vfio_amba.c | 1 - drivers/vfio/platform/vfio_platform.c | 1 - drivers/vfio/platform/vfio_platform_private.h | 1 - 3 files changed, 3 deletions(-) diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index 9636a2afaecd..a309ec7a019b 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -59,7 +59,6 @@ static int vfio_amba_probe(struct amba_device *adev, const struct amba_id *id) vdev->flags = VFIO_DEVICE_FLAGS_AMBA; vdev->get_resource = get_amba_resource; vdev->get_irq = get_amba_irq; - vdev->parent_module = THIS_MODULE; vdev->reset_required = false; ret = vfio_platform_probe_common(vdev, &adev->dev); diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index e4027799a154..68a1c87066d7 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -50,7 +50,6 @@ static int vfio_platform_probe(struct platform_device *pdev) vdev->flags = VFIO_DEVICE_FLAGS_PLATFORM; vdev->get_resource = get_platform_resource; vdev->get_irq = get_platform_irq; - vdev->parent_module = THIS_MODULE; vdev->reset_required = reset_required; ret = vfio_platform_probe_common(vdev, &pdev->dev); diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index a5ba82c8cbc3..dfb834c13659 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -50,7 +50,6 @@ struct vfio_platform_device { u32 num_irqs; int refcnt; struct mutex igate; - struct module *parent_module; const char *compat; const char *acpihid; struct module *reset_module; -- Gitee From 33e4fef51b9528193f82903db6017a1f3f4a3529 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 13 Jun 2022 17:12:08 +0800 Subject: [PATCH 1638/2161] driver core: Pull required checks into driver_probe_device() commit 204db60c83574559a8e08ce4bbd0029d56b8ab2e upstream. 
Checking if the dev is dead or if the dev is already bound is a required precondition to invoking driver_probe_device(). All the call chains leading here duplicate these checks. Add it directly to driver_probe_device() so the precondition is clear and remove the checks from device_driver_attach() and __driver_attach_async_helper(). The other call chain going through __device_attach_driver() does have these same checks but they are inlined into logic higher up the call stack and can't be removed. The sysfs uAPI call chain starting at bind_store() is a bit confused because it reads dev->driver unlocked and returns -ENODEV if it is !NULL, otherwise it reads it again under lock and returns 0 if it is !NULL. Fix this to always return -EBUSY and always read dev->driver under its lock. Done in preparation for the next patches which will add additional callers to driver_probe_device() and will need these checks as well. Signed-off-by: Jason Gunthorpe [hch: drop the extra checks in device_driver_attach and bind_store] Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/20210617142218.1877096-2-hch@lst.de Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/base/bus.c | 2 +- drivers/base/dd.c | 32 ++++++++++---------------------- 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/drivers/base/bus.c b/drivers/base/bus.c index a1d1e8256324..f682afdfc1b8 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -209,7 +209,7 @@ static ssize_t bind_store(struct device_driver *drv, const char *buf, int err = -ENODEV; dev = bus_find_device_by_name(bus, NULL, buf); - if (dev && dev->driver == NULL && driver_match_device(drv, dev)) { + if (dev && driver_match_device(drv, dev)) { err = device_driver_attach(drv, dev); if (err > 0) { diff --git a/drivers/base/dd.c b/drivers/base/dd.c index cf7e5b4afc1b..d794a27949b2 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -698,8 +698,9 @@ EXPORT_SYMBOL_GPL(wait_for_device_probe); * @drv: driver to bind a device to * @dev: device to try to bind to the driver * - * This function returns -ENODEV if the device is not registered, - * 1 if the device is bound successfully and 0 otherwise. + * This function returns -ENODEV if the device is not registered, -EBUSY if it + * already has a driver, and 1 if the device is bound successfully and 0 + * otherwise. * * This function must be called with @dev lock held. When called for a * USB interface, @dev->parent lock must be held as well. @@ -710,8 +711,10 @@ int driver_probe_device(struct device_driver *drv, struct device *dev) { int ret = 0; - if (!device_is_registered(dev)) + if (dev->p->dead || !device_is_registered(dev)) return -ENODEV; + if (dev->driver) + return -EBUSY; pr_debug("bus: '%s': %s: matched device %s with driver %s\n", drv->bus->name, __func__, dev_name(dev), drv->name); @@ -991,17 +994,10 @@ static void __device_driver_unlock(struct device *dev, struct device *parent) */ int device_driver_attach(struct device_driver *drv, struct device *dev) { - int ret = 0; + int ret; __device_driver_lock(dev, dev->parent); - - /* - * If device has been removed or someone has already successfully - * bound a driver before us just skip the driver probe call. 
- */ - if (!dev->p->dead && !dev->driver) - ret = driver_probe_device(drv, dev); - + ret = driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); return ret; @@ -1011,19 +1007,11 @@ static void __driver_attach_async_helper(void *_dev, async_cookie_t cookie) { struct device *dev = _dev; struct device_driver *drv; - int ret = 0; + int ret; __device_driver_lock(dev, dev->parent); - drv = dev->p->async_driver; - - /* - * If device has been removed or someone has already successfully - * bound a driver before us just skip the driver probe call. - */ - if (!dev->p->dead && !dev->driver) - ret = driver_probe_device(drv, dev); - + ret = driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); dev_dbg(dev, "driver %s async attach completed: %d\n", drv->name, ret); -- Gitee From 2ff7e0ea3d4b2421652aa75eaa7ff30198e938a4 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 13 Jun 2022 17:53:58 +0800 Subject: [PATCH 1639/2161] driver core: Better distinguish probe errors in really_probe commit e1499647c69c72c4583273e773d8c2786cb4bee9 upstream. really_probe tries to special case errors from ->probe, but due to all other initialization added to the function over time now a lot of internal errors hit that code path as well. Untangle that by adding a new probe_err local variable and apply the special casing only to that. Signed-off-by: Christoph Hellwig Reviewed-by: Cornelia Huck Reviewed-by: Greg Kroah-Hartman Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20210617142218.1877096-3-hch@lst.de Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/base/dd.c | 72 +++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index d794a27949b2..4e37677d8702 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -491,12 +491,44 @@ static void driver_deferred_probe_add_trigger(struct device *dev, driver_deferred_probe_trigger(); } + +static int call_driver_probe(struct device *dev, struct device_driver *drv) +{ + int ret = 0; + + if (dev->bus->probe) + ret = dev->bus->probe(dev); + else if (drv->probe) + ret = drv->probe(dev); + + switch (ret) { + case 0: + break; + case -EPROBE_DEFER: + /* Driver requested deferred probing */ + dev_dbg(dev, "Driver %s requests probe deferral\n", drv->name); + break; + case -ENODEV: + case -ENXIO: + pr_debug("%s: probe of %s rejects match %d\n", + drv->name, dev_name(dev), ret); + break; + default: + /* driver matched but the probe failed */ + pr_warn("%s: probe of %s failed with error %d\n", + drv->name, dev_name(dev), ret); + break; + } + + return ret; +} + static int really_probe(struct device *dev, struct device_driver *drv) { - int ret = -EPROBE_DEFER; int local_trigger_count = atomic_read(&deferred_trigger_count); bool test_remove = IS_ENABLED(CONFIG_DEBUG_TEST_DRIVER_REMOVE) && !drv->suppress_bind_attrs; + int ret = -EPROBE_DEFER, probe_ret = 0; if (defer_all_probes) { /* @@ -550,14 +582,14 @@ static int really_probe(struct device *dev, struct device_driver *drv) goto probe_failed; } - if (dev->bus->probe) { - ret = dev->bus->probe(dev); - if (ret) - goto probe_failed; - } else if (drv->probe) { - ret = drv->probe(dev); - if (ret) - goto probe_failed; + probe_ret = call_driver_probe(dev, drv); + if (probe_ret) { + /* + * Ignore errors returned by ->probe so that the next driver can + * try its luck. 
+ */ + ret = 0; + goto probe_failed; } if (device_add_groups(dev, drv->dev_groups)) { @@ -618,28 +650,8 @@ static int really_probe(struct device *dev, struct device_driver *drv) pm_runtime_reinit(dev); dev_pm_set_driver_flags(dev, 0); - switch (ret) { - case -EPROBE_DEFER: - /* Driver requested deferred probing */ - dev_dbg(dev, "Driver %s requests probe deferral\n", drv->name); + if (probe_ret == -EPROBE_DEFER) driver_deferred_probe_add_trigger(dev, local_trigger_count); - break; - case -ENODEV: - case -ENXIO: - pr_debug("%s: probe of %s rejects match %d\n", - drv->name, dev_name(dev), ret); - break; - default: - /* driver matched but the probe failed */ - printk(KERN_WARNING - "%s: probe of %s failed with error %d\n", - drv->name, dev_name(dev), ret); - } - /* - * Ignore errors returned by ->probe so that the next driver can try - * its luck. - */ - ret = 0; done: atomic_dec(&probe_count); wake_up_all(&probe_waitqueue); -- Gitee From 2e9c51c3becc8e7006a8f6f5a1e39edee13fd291 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Jun 2021 16:22:11 +0200 Subject: [PATCH 1640/2161] driver core: Flow the return code from ->probe() through to sysfs bind commit ef6dcbdd8eb2f44dce70a3abecc32d43cc5f3e64 upstream. Currently really_probe() returns 1 on success and 0 if the probe() call fails. This return code arrangement is designed to be useful for __device_attach_driver() which is walking the device list and trying every driver. 0 means to keep trying. However, it is not useful for the other places that call through to really_probe() that do actually want to see the probe() return code. For instance bind_store() would be better to return the actual error code from the driver's probe method, not discarding it and returning -ENODEV. Reorganize things so that really_probe() returns the error code from ->probe as a (inverted) positive number, and 0 for successful attach. With this, __device_attach_driver can ignore the (positive) probe errors, return 1 to exit the loop for a successful binding and pass on the other negative errors, while device_driver_attach simplify inverts the positive errors and returns all errors to the sysfs code. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/20210617142218.1877096-4-hch@lst.de Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/base/bus.c | 6 +----- drivers/base/dd.c | 29 ++++++++++++++++++++--------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/drivers/base/bus.c b/drivers/base/bus.c index f682afdfc1b8..7974bd7c6c13 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -211,13 +211,9 @@ static ssize_t bind_store(struct device_driver *drv, const char *buf, dev = bus_find_device_by_name(bus, NULL, buf); if (dev && driver_match_device(drv, dev)) { err = device_driver_attach(drv, dev); - - if (err > 0) { + if (!err) { /* success */ err = count; - } else if (err == 0) { - /* driver didn't accept device */ - err = -ENODEV; } } put_device(dev); diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 4e37677d8702..e03ee6def988 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -585,10 +585,10 @@ static int really_probe(struct device *dev, struct device_driver *drv) probe_ret = call_driver_probe(dev, drv); if (probe_ret) { /* - * Ignore errors returned by ->probe so that the next driver can - * try its luck. 
+ * Return probe errors as positive values so that the callers + * can distinguish them from other errors. */ - ret = 0; + ret = -probe_ret; goto probe_failed; } @@ -624,7 +624,6 @@ static int really_probe(struct device *dev, struct device_driver *drv) dev->pm_domain->sync(dev); driver_bound(dev); - ret = 1; pr_debug("bus: '%s': %s: bound device %s to driver %s\n", drv->bus->name, __func__, dev_name(dev), drv->name); goto done; @@ -711,8 +710,8 @@ EXPORT_SYMBOL_GPL(wait_for_device_probe); * @dev: device to try to bind to the driver * * This function returns -ENODEV if the device is not registered, -EBUSY if it - * already has a driver, and 1 if the device is bound successfully and 0 - * otherwise. + * already has a driver, 0 if the device is bound successfully and a positive + * (inverted) error code for failures from the ->probe method. * * This function must be called with @dev lock held. When called for a * USB interface, @dev->parent lock must be held as well. @@ -846,7 +845,14 @@ static int __device_attach_driver(struct device_driver *drv, void *_data) if (data->check_async && async_allowed != data->want_async) return 0; - return driver_probe_device(drv, dev); + /* + * Ignore errors returned by ->probe so that the next driver can try + * its luck. + */ + ret = driver_probe_device(drv, dev); + if (ret < 0) + return ret; + return ret == 0; } static void __device_attach_async_helper(void *_dev, async_cookie_t cookie) @@ -1002,7 +1008,7 @@ static void __device_driver_unlock(struct device *dev, struct device *parent) * @dev: Device to attach it to * * Manually attach driver to a device. Will acquire both @dev lock and - * @dev->parent lock if needed. + * @dev->parent lock if needed. Returns 0 on success, -ERR on failure. */ int device_driver_attach(struct device_driver *drv, struct device *dev) { @@ -1012,6 +1018,9 @@ int device_driver_attach(struct device_driver *drv, struct device *dev) ret = driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); + /* also return probe errors as normal negative errnos */ + if (ret > 0) + ret = -ret; return ret; } @@ -1077,7 +1086,9 @@ static int __driver_attach(struct device *dev, void *data) return 0; } - device_driver_attach(drv, dev); + __device_driver_lock(dev, dev->parent); + driver_probe_device(drv, dev); + __device_driver_unlock(dev, dev->parent); return 0; } -- Gitee From 3c72bceca0b973b0c017a1b226c6c3daea13a183 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 13 Jun 2022 18:35:32 +0800 Subject: [PATCH 1641/2161] driver core: Don't return EPROBE_DEFER to userspace during sysfs bind commit 45ddcb42949f825f0caa25352e825cede94b6aba upstream. EPROBE_DEFER is an internal kernel error code and it should not be leaked to userspace via the bind_store() sysfs. Userspace doesn't have this constant and cannot understand it. Further, it doesn't really make sense to have userspace trigger a deferred probe via bind_store(), which could eventually succeed, while simultaneously returning an error back. Resolve this by splitting driver_probe_device so that the version used by the sysfs binding that turns EPROBE_DEFER into -EAGAIN, while the one used for internally binding keeps the error code, and calls driver_deferred_probe_add where needed. This also allows to nicely split out the defer_all_probes / probe_count checks so that they actually allow for full device_{block,unblock}_probing protection while not bothering the sysfs bind case. 
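Pieced together from the hunks below, the sysfs-facing helper ends up looking roughly like this; the internal binding path keeps the raw error and lets driver_probe_device() queue the device for deferred probing instead.

int device_driver_attach(struct device_driver *drv, struct device *dev)
{
	int ret;

	__device_driver_lock(dev, dev->parent);
	ret = __driver_probe_device(drv, dev);
	__device_driver_unlock(dev, dev->parent);

	/* probe errors come back as positive values; flip them for callers */
	if (ret > 0)
		ret = -ret;
	/* never leak the kernel-internal defer code to userspace */
	if (ret == -EPROBE_DEFER)
		return -EAGAIN;
	return ret;
}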
Signed-off-by: Christoph Hellwig Reviewed-by: Cornelia Huck Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20210617142218.1877096-5-hch@lst.de Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/base/base.h | 1 - drivers/base/dd.c | 85 ++++++++++++++++++++++++--------------------- 2 files changed, 45 insertions(+), 41 deletions(-) diff --git a/drivers/base/base.h b/drivers/base/base.h index 0d32544b6f91..807503530e90 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -121,7 +121,6 @@ extern void device_release_driver_internal(struct device *dev, struct device *parent); extern void driver_detach(struct device_driver *drv); -extern int driver_probe_device(struct device_driver *drv, struct device *dev); extern void driver_deferred_probe_del(struct device *dev); static inline int driver_match_device(struct device_driver *drv, struct device *dev) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index e03ee6def988..b580c6c2d05e 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -482,16 +482,6 @@ EXPORT_SYMBOL_GPL(device_bind_driver); static atomic_t probe_count = ATOMIC_INIT(0); static DECLARE_WAIT_QUEUE_HEAD(probe_waitqueue); -static void driver_deferred_probe_add_trigger(struct device *dev, - int local_trigger_count) -{ - driver_deferred_probe_add(dev); - /* Did a trigger occur while probing? Need to re-trigger if yes */ - if (local_trigger_count != atomic_read(&deferred_trigger_count)) - driver_deferred_probe_trigger(); -} - - static int call_driver_probe(struct device *dev, struct device_driver *drv) { int ret = 0; @@ -525,10 +515,9 @@ static int call_driver_probe(struct device *dev, struct device_driver *drv) static int really_probe(struct device *dev, struct device_driver *drv) { - int local_trigger_count = atomic_read(&deferred_trigger_count); bool test_remove = IS_ENABLED(CONFIG_DEBUG_TEST_DRIVER_REMOVE) && !drv->suppress_bind_attrs; - int ret = -EPROBE_DEFER, probe_ret = 0; + int ret; if (defer_all_probes) { /* @@ -537,17 +526,13 @@ static int really_probe(struct device *dev, struct device_driver *drv) * wait_for_device_probe() right after that to avoid any races. */ dev_dbg(dev, "Driver %s force probe deferral\n", drv->name); - driver_deferred_probe_add(dev); - return ret; + return -EPROBE_DEFER; } ret = device_links_check_suppliers(dev); - if (ret == -EPROBE_DEFER) - driver_deferred_probe_add_trigger(dev, local_trigger_count); if (ret) return ret; - atomic_inc(&probe_count); pr_debug("bus: '%s': %s: probing driver %s with device %s\n", drv->bus->name, __func__, drv->name, dev_name(dev)); if (!list_empty(&dev->devres_head)) { @@ -582,13 +567,13 @@ static int really_probe(struct device *dev, struct device_driver *drv) goto probe_failed; } - probe_ret = call_driver_probe(dev, drv); - if (probe_ret) { + ret = call_driver_probe(dev, drv); + if (ret) { /* * Return probe errors as positive values so that the callers * can distinguish them from other errors. 
*/ - ret = -probe_ret; + ret = -ret; goto probe_failed; } @@ -649,11 +634,7 @@ static int really_probe(struct device *dev, struct device_driver *drv) pm_runtime_reinit(dev); dev_pm_set_driver_flags(dev, 0); - if (probe_ret == -EPROBE_DEFER) - driver_deferred_probe_add_trigger(dev, local_trigger_count); done: - atomic_dec(&probe_count); - wake_up_all(&probe_waitqueue); return ret; } @@ -704,21 +685,7 @@ void wait_for_device_probe(void) } EXPORT_SYMBOL_GPL(wait_for_device_probe); -/** - * driver_probe_device - attempt to bind device & driver together - * @drv: driver to bind a device to - * @dev: device to try to bind to the driver - * - * This function returns -ENODEV if the device is not registered, -EBUSY if it - * already has a driver, 0 if the device is bound successfully and a positive - * (inverted) error code for failures from the ->probe method. - * - * This function must be called with @dev lock held. When called for a - * USB interface, @dev->parent lock must be held as well. - * - * If the device has a parent, runtime-resume the parent before driver probing. - */ -int driver_probe_device(struct device_driver *drv, struct device *dev) +static int __driver_probe_device(struct device_driver *drv, struct device *dev) { int ret = 0; @@ -748,6 +715,42 @@ int driver_probe_device(struct device_driver *drv, struct device *dev) return ret; } +/** + * driver_probe_device - attempt to bind device & driver together + * @drv: driver to bind a device to + * @dev: device to try to bind to the driver + * + * This function returns -ENODEV if the device is not registered, -EBUSY if it + * already has a driver, 0 if the device is bound successfully and a positive + * (inverted) error code for failures from the ->probe method. + * + * This function must be called with @dev lock held. When called for a + * USB interface, @dev->parent lock must be held as well. + * + * If the device has a parent, runtime-resume the parent before driver probing. + */ +static int driver_probe_device(struct device_driver *drv, struct device *dev) +{ + int trigger_count = atomic_read(&deferred_trigger_count); + int ret; + + atomic_inc(&probe_count); + ret = __driver_probe_device(drv, dev); + if (ret == -EPROBE_DEFER || ret == EPROBE_DEFER) { + driver_deferred_probe_add(dev); + + /* + * Did a trigger occur while probing? Need to re-trigger if yes + */ + if (trigger_count != atomic_read(&deferred_trigger_count) && + !defer_all_probes) + driver_deferred_probe_trigger(); + } + atomic_dec(&probe_count); + wake_up_all(&probe_waitqueue); + return ret; +} + static inline bool cmdline_requested_async_probing(const char *drv_name) { return parse_option_str(async_probe_drv_names, drv_name); @@ -1015,12 +1018,14 @@ int device_driver_attach(struct device_driver *drv, struct device *dev) int ret; __device_driver_lock(dev, dev->parent); - ret = driver_probe_device(drv, dev); + ret = __driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); /* also return probe errors as normal negative errnos */ if (ret > 0) ret = -ret; + if (ret == -EPROBE_DEFER) + return -EAGAIN; return ret; } -- Gitee From ffba8e7beeb72c77f68d464726f4b132b01f799e Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 13 Jun 2022 18:48:28 +0800 Subject: [PATCH 1642/2161] driver core: Export device_driver_attach() commit 0d9f837c6958a4c14e6bcb5c5edf6c851d65f507 upstream. This is intended as a replacement API for device_bind_driver(). It has at least the following benefits: - Internal locking. 
Few of the users of device_bind_driver() follow the locking rules - Calls device driver probe() internally. Notably this means that devm support for probe works correctly as probe() error will call devres_release_all() - struct device_driver -> dev_groups is supported - Simplified calling convention, no need to manually call probe(). The general usage is for situations that already know what driver to bind and need to ensure the bind is synchronized with other logic. Call device_driver_attach() after device_add(). If probe() returns a failure then this will be preserved up through to the error return of device_driver_attach(). Signed-off-by: Jason Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Cornelia Huck Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20210617142218.1877096-6-hch@lst.de Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/base/base.h | 1 - drivers/base/dd.c | 3 +++ include/linux/device.h | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/base/base.h b/drivers/base/base.h index 807503530e90..6ead2acf1ce9 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -133,7 +133,6 @@ extern int driver_add_groups(struct device_driver *drv, const struct attribute_group **groups); extern void driver_remove_groups(struct device_driver *drv, const struct attribute_group **groups); -int device_driver_attach(struct device_driver *drv, struct device *dev); void device_driver_detach(struct device *dev); extern char *make_class_name(const char *name, struct kobject *kobj); diff --git a/drivers/base/dd.c b/drivers/base/dd.c index b580c6c2d05e..f98d5411f9b2 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -464,6 +464,8 @@ static void driver_sysfs_remove(struct device *dev) * from a driver's probe() method.) * * This function must be called with the device lock held. + * + * Callers should prefer to use device_driver_attach() instead. */ int device_bind_driver(struct device *dev) { @@ -1028,6 +1030,7 @@ int device_driver_attach(struct device_driver *drv, struct device *dev) return -EAGAIN; return ret; } +EXPORT_SYMBOL_GPL(device_driver_attach); static void __driver_attach_async_helper(void *_dev, async_cookie_t cookie) { diff --git a/include/linux/device.h b/include/linux/device.h index 8ec5dc0b3cbd..6b8ff7cf36df 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1568,6 +1568,8 @@ static inline void *dev_get_platdata(const struct device *dev) * Manual binding of a device to driver. See drivers/base/bus.c * for information on use. */ +int __must_check device_driver_attach(struct device_driver *drv, + struct device *dev); extern int __must_check device_bind_driver(struct device *dev); extern void device_release_driver(struct device *dev); extern int __must_check device_attach(struct device *dev); -- Gitee From c341a9310bdbb601e9d8c22f37511eec5c5c8621 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 17 Jun 2021 16:22:14 +0200 Subject: [PATCH 1643/2161] vfio/mdev: Remove CONFIG_VFIO_MDEV_DEVICE commit af3ab3f9b986cdbc1b97b8a3341ce78851edb0dd upstream. For some reason the vfio_mdev shim mdev_driver has its own module and kconfig. As the next patch requires access to it from mdev.ko merge the two modules together and remove VFIO_MDEV_DEVICE. A later patch deletes this driver entirely. 
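Reassembled from the hunks below, the combined module's init path now registers the shim driver itself, roughly:

static int __init mdev_init(void)
{
	int rc;

	rc = mdev_bus_register();
	if (rc)
		return rc;

	/* vfio_mdev now lives in mdev.ko, so register its driver here */
	rc = mdev_register_driver(&vfio_mdev_driver);
	if (rc)
		goto err_bus;
	return 0;

err_bus:
	mdev_bus_unregister();
	return rc;
}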
Signed-off-by: Jason Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Cornelia Huck Reviewed-by: Greg Kroah-Hartman Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20210617142218.1877096-7-hch@lst.de Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/Kconfig | 7 ------- drivers/vfio/mdev/Makefile | 3 +-- drivers/vfio/mdev/mdev_core.c | 16 ++++++++++++++-- drivers/vfio/mdev/mdev_private.h | 2 ++ drivers/vfio/mdev/vfio_mdev.c | 24 +----------------------- samples/Kconfig | 6 +++--- 6 files changed, 21 insertions(+), 37 deletions(-) diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig index 5da27f2100f9..763c877a1318 100644 --- a/drivers/vfio/mdev/Kconfig +++ b/drivers/vfio/mdev/Kconfig @@ -9,10 +9,3 @@ config VFIO_MDEV See Documentation/driver-api/vfio-mediated-device.rst for more details. If you don't know what do here, say N. - -config VFIO_MDEV_DEVICE - tristate "VFIO driver for Mediated devices" - depends on VFIO && VFIO_MDEV - default n - help - VFIO based driver for Mediated devices. diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile index 101516fdf375..ff9ecd802125 100644 --- a/drivers/vfio/mdev/Makefile +++ b/drivers/vfio/mdev/Makefile @@ -1,6 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o +mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o vfio_mdev.o obj-$(CONFIG_VFIO_MDEV) += mdev.o -obj-$(CONFIG_VFIO_MDEV_DEVICE) += vfio_mdev.o diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 2a85d6fcb7dd..ff8c1a845166 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -360,11 +360,24 @@ int mdev_device_remove(struct mdev_device *mdev) static int __init mdev_init(void) { - return mdev_bus_register(); + int rc; + + rc = mdev_bus_register(); + if (rc) + return rc; + rc = mdev_register_driver(&vfio_mdev_driver); + if (rc) + goto err_bus; + return 0; +err_bus: + mdev_bus_unregister(); + return rc; } static void __exit mdev_exit(void) { + mdev_unregister_driver(&vfio_mdev_driver); + if (mdev_bus_compat_class) class_compat_unregister(mdev_bus_compat_class); @@ -378,4 +391,3 @@ MODULE_VERSION(DRIVER_VERSION); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); -MODULE_SOFTDEP("post: vfio_mdev"); diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index 6999c89db7b1..afbad7b0a14a 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -37,6 +37,8 @@ struct mdev_type { #define to_mdev_type(_kobj) \ container_of(_kobj, struct mdev_type, kobj) +extern struct mdev_driver vfio_mdev_driver; + int parent_create_sysfs_files(struct mdev_parent *parent); void parent_remove_sysfs_files(struct mdev_parent *parent); diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index 5ef4815609ed..39ef7489fe47 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -17,10 +17,6 @@ #include "mdev_private.h" -#define DRIVER_VERSION "0.1" -#define DRIVER_AUTHOR "NVIDIA Corporation" -#define DRIVER_DESC "VFIO based driver for Mediated device" - static int vfio_mdev_open(struct vfio_device *core_vdev) { struct mdev_device *mdev = to_mdev_device(core_vdev->dev); @@ -140,7 +136,7 @@ static void vfio_mdev_remove(struct mdev_device *mdev) kfree(vdev); } -static struct mdev_driver vfio_mdev_driver = { +struct mdev_driver vfio_mdev_driver = { .driver = { .name = "vfio_mdev", .owner 
= THIS_MODULE, @@ -149,21 +145,3 @@ static struct mdev_driver vfio_mdev_driver = { .probe = vfio_mdev_probe, .remove = vfio_mdev_remove, }; - -static int __init vfio_mdev_init(void) -{ - return mdev_register_driver(&vfio_mdev_driver); -} - -static void __exit vfio_mdev_exit(void) -{ - mdev_unregister_driver(&vfio_mdev_driver); -} - -module_init(vfio_mdev_init) -module_exit(vfio_mdev_exit) - -MODULE_VERSION(DRIVER_VERSION); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR(DRIVER_AUTHOR); -MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/samples/Kconfig b/samples/Kconfig index c8dacb4dda80..e6663b91dbae 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -118,14 +118,14 @@ config SAMPLE_SECCOMP config SAMPLE_VFIO_MDEV_MTTY tristate "Build VFIO mtty example mediated device sample code -- loadable modules only" - depends on VFIO_MDEV_DEVICE && m + depends on VFIO_MDEV && m help Build a virtual tty sample driver for use as a VFIO mediated device config SAMPLE_VFIO_MDEV_MDPY tristate "Build VFIO mdpy example mediated device sample code -- loadable modules only" - depends on VFIO_MDEV_DEVICE && m + depends on VFIO_MDEV && m help Build a virtual display sample driver for use as a VFIO mediated device. It is a simple framebuffer and supports @@ -142,7 +142,7 @@ config SAMPLE_VFIO_MDEV_MDPY_FB config SAMPLE_VFIO_MDEV_MBOCHS tristate "Build VFIO mdpy example mediated device sample code -- loadable modules only" - depends on VFIO_MDEV_DEVICE && m + depends on VFIO_MDEV && m select DMA_SHARED_BUFFER help Build a virtual display sample driver for use as a VFIO -- Gitee From 026a8b6ec23065fa88db06eea21aa065d2d6daab Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 17 Jun 2021 16:22:15 +0200 Subject: [PATCH 1644/2161] vfio/mdev: Allow the mdev_parent_ops to specify the device driver to bind commit 88a21f265ce50a17e6e71e3fb4467625cf234c5a upstream. This allows a mdev driver to opt out of using vfio_mdev.c, instead the driver will provide a 'struct mdev_driver' and register directly with the driver core. 
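A minimal sketch of what opting out looks like, assuming the my_* names and the type groups are defined elsewhere (the mtty sample is converted along exactly these lines later in this series): the parent driver supplies a struct mdev_driver and points mdev_parent_ops->device_driver at it, and mdev_device_create() binds each new instance to that driver through device_driver_attach().

static int  my_mdev_probe(struct mdev_device *mdev);	/* replaces create() */
static void my_mdev_remove(struct mdev_device *mdev);	/* replaces remove() */

static struct mdev_driver my_mdev_driver = {
	.driver = {
		.name  = "my_mdev",
		.owner = THIS_MODULE,
	},
	.probe  = my_mdev_probe,
	.remove = my_mdev_remove,
};

static const struct mdev_parent_ops my_parent_ops = {
	.owner			= THIS_MODULE,
	.device_driver		= &my_mdev_driver,
	.supported_type_groups	= my_type_groups,
};

/* registration itself is unchanged: mdev_register_device(dev, &my_parent_ops) */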
Much of mdev_parent_ops becomes unused in this mode: - create()/remove() are done via the mdev_driver probe()/remove() - mdev_attr_groups becomes mdev_driver driver.dev_groups - Wrapper function callbacks are replaced with the same ones from struct vfio_device_ops Signed-off-by: Jason Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Cornelia Huck Reviewed-by: Greg Kroah-Hartman Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20210617142218.1877096-8-hch@lst.de Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../driver-api/vfio-mediated-device.rst | 35 +++++++------------ drivers/vfio/mdev/mdev_core.c | 30 +++++++++++----- drivers/vfio/mdev/mdev_driver.c | 10 ++++++ include/linux/mdev.h | 2 ++ 4 files changed, 46 insertions(+), 31 deletions(-) diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index 1779b85f014e..9f26079cacae 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -93,7 +93,7 @@ interfaces: Registration Interface for a Mediated Bus Driver ------------------------------------------------ -The registration interface for a mediated bus driver provides the following +The registration interface for a mediated device driver provides the following structure to represent a mediated device's driver:: /* @@ -136,37 +136,26 @@ The structures in the mdev_parent_ops structure are as follows: * dev_attr_groups: attributes of the parent device * mdev_attr_groups: attributes of the mediated device * supported_config: attributes to define supported configurations +* device_driver: device driver to bind for mediated device instances -The functions in the mdev_parent_ops structure are as follows: +The mdev_parent_ops also still has various functions pointers. Theses exist +for historical reasons only and shall not be used for new drivers. -* create: allocate basic resources in a driver for a mediated device -* remove: free resources in a driver when a mediated device is destroyed - -(Note that mdev-core provides no implicit serialization of create/remove -callbacks per mdev parent device, per mdev type, or any other categorization. -Vendor drivers are expected to be fully asynchronous in this respect or -provide their own internal resource protection.) - -The callbacks in the mdev_parent_ops structure are as follows: - -* open: open callback of mediated device -* close: close callback of mediated device -* ioctl: ioctl callback of mediated device -* read : read emulation callback -* write: write emulation callback -* mmap: mmap emulation callback - -A driver should use the mdev_parent_ops structure in the function call to -register itself with the mdev core driver:: +When a driver wants to add the GUID creation sysfs to an existing device it has +probe'd to then it should call:: extern int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops); -However, the mdev_parent_ops structure is not required in the function call -that a driver should use to unregister itself with the mdev core driver:: +This will provide the 'mdev_supported_types/XX/create' files which can then be +used to trigger the creation of a mdev_device. The created mdev_device will be +attached to the specified driver. + +When the driver needs to remove itself it calls:: extern void mdev_unregister_device(struct device *dev); +Which will unbind and destroy all the created mdevs and remove the sysfs files. 
Mediated Device Management Interface Through sysfs ================================================== diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index ff8c1a845166..e4581ec093a6 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -94,9 +94,11 @@ static void mdev_device_remove_common(struct mdev_device *mdev) mdev_remove_sysfs_files(mdev); device_del(&mdev->dev); lockdep_assert_held(&parent->unreg_sem); - ret = parent->ops->remove(mdev); - if (ret) - dev_err(&mdev->dev, "Remove failed: err=%d\n", ret); + if (parent->ops->remove) { + ret = parent->ops->remove(mdev); + if (ret) + dev_err(&mdev->dev, "Remove failed: err=%d\n", ret); + } /* Balances with device_initialize() */ put_device(&mdev->dev); @@ -127,7 +129,9 @@ int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops) char *envp[] = { env_string, NULL }; /* check for mandatory ops */ - if (!ops || !ops->create || !ops->remove || !ops->supported_type_groups) + if (!ops || !ops->supported_type_groups) + return -EINVAL; + if (!ops->device_driver && (!ops->create || !ops->remove)) return -EINVAL; dev = get_device(dev); @@ -256,6 +260,7 @@ int mdev_device_create(struct mdev_type *type, const guid_t *uuid) int ret; struct mdev_device *mdev, *tmp; struct mdev_parent *parent = type->parent; + struct mdev_driver *drv = parent->ops->device_driver; mutex_lock(&mdev_list_lock); @@ -296,14 +301,22 @@ int mdev_device_create(struct mdev_type *type, const guid_t *uuid) goto out_put_device; } - ret = parent->ops->create(mdev); - if (ret) - goto out_unlock; + if (parent->ops->create) { + ret = parent->ops->create(mdev); + if (ret) + goto out_unlock; + } ret = device_add(&mdev->dev); if (ret) goto out_remove; + if (!drv) + drv = &vfio_mdev_driver; + ret = device_driver_attach(&drv->driver, &mdev->dev); + if (ret) + goto out_del; + ret = mdev_create_sysfs_files(mdev); if (ret) goto out_del; @@ -317,7 +330,8 @@ int mdev_device_create(struct mdev_type *type, const guid_t *uuid) out_del: device_del(&mdev->dev); out_remove: - parent->ops->remove(mdev); + if (parent->ops->remove) + parent->ops->remove(mdev); out_unlock: up_read(&parent->unreg_sem); out_put_device: diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c index 041699571b7e..c368ec824e2b 100644 --- a/drivers/vfio/mdev/mdev_driver.c +++ b/drivers/vfio/mdev/mdev_driver.c @@ -71,10 +71,20 @@ static int mdev_remove(struct device *dev) return 0; } +static int mdev_match(struct device *dev, struct device_driver *drv) +{ + /* + * No drivers automatically match. Drivers are only bound by explicit + * device_driver_attach() + */ + return 0; +} + struct bus_type mdev_bus_type = { .name = "mdev", .probe = mdev_probe, .remove = mdev_remove, + .match = mdev_match, }; EXPORT_SYMBOL_GPL(mdev_bus_type); diff --git a/include/linux/mdev.h b/include/linux/mdev.h index de12d3a3ded0..ae9b409bf634 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -55,6 +55,7 @@ struct device *mtype_get_parent_dev(struct mdev_type *mtype); * register the device to mdev module. * * @owner: The module owner. + * @device_driver: Which device driver to probe() on newly created devices * @dev_attr_groups: Attributes of the parent device. * @mdev_attr_groups: Attributes of the mediated device. * @supported_type_groups: Attributes to define supported types. 
It is mandatory @@ -103,6 +104,7 @@ struct device *mtype_get_parent_dev(struct mdev_type *mtype); **/ struct mdev_parent_ops { struct module *owner; + struct mdev_driver *device_driver; const struct attribute_group **dev_attr_groups; const struct attribute_group **mdev_attr_groups; struct attribute_group **supported_type_groups; -- Gitee From b4912b81470ae10165808d180bdcc13d9ea4137c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 17 Jun 2021 16:22:16 +0200 Subject: [PATCH 1645/2161] vfio/mtty: Convert to use vfio_register_group_dev() commit 09177ac9192198bec24a81c822ebeef4197c3c8b upstream. This is straightforward conversion, the mdev_state is actually serving as the vfio_device and we can replace all the mdev_get_drvdata()'s and the wonky dead code with a simple container_of() Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20210617142218.1877096-9-hch@lst.de Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- samples/vfio-mdev/mtty.c | 185 ++++++++++++++++++--------------------- 1 file changed, 83 insertions(+), 102 deletions(-) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index b9b24be4abda..faf9b8e8873a 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -127,6 +127,7 @@ struct serial_port { /* State of each mdev device */ struct mdev_state { + struct vfio_device vdev; int irq_fd; struct eventfd_ctx *intx_evtfd; struct eventfd_ctx *msi_evtfd; @@ -150,6 +151,8 @@ static const struct file_operations vd_fops = { .owner = THIS_MODULE, }; +static const struct vfio_device_ops mtty_dev_ops; + /* function prototypes */ static int mtty_trigger_interrupt(struct mdev_state *mdev_state); @@ -631,22 +634,15 @@ static void mdev_read_base(struct mdev_state *mdev_state) } } -static ssize_t mdev_access(struct mdev_device *mdev, u8 *buf, size_t count, +static ssize_t mdev_access(struct mdev_state *mdev_state, u8 *buf, size_t count, loff_t pos, bool is_write) { - struct mdev_state *mdev_state; unsigned int index; loff_t offset; int ret = 0; - if (!mdev || !buf) - return -EINVAL; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) { - pr_err("%s mdev_state not found\n", __func__); + if (!buf) return -EINVAL; - } mutex_lock(&mdev_state->ops_lock); @@ -708,15 +704,18 @@ static ssize_t mdev_access(struct mdev_device *mdev, u8 *buf, size_t count, return ret; } -static int mtty_create(struct mdev_device *mdev) +static int mtty_probe(struct mdev_device *mdev) { struct mdev_state *mdev_state; int nr_ports = mdev_get_type_group_id(mdev) + 1; + int ret; mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); if (mdev_state == NULL) return -ENOMEM; + vfio_init_group_dev(&mdev_state->vdev, &mdev->dev, &mtty_dev_ops); + mdev_state->nr_ports = nr_ports; mdev_state->irq_index = -1; mdev_state->s[0].max_fifo_size = MAX_FIFO_SIZE; @@ -731,7 +730,6 @@ static int mtty_create(struct mdev_device *mdev) mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; - mdev_set_drvdata(mdev, mdev_state); mtty_create_config_space(mdev_state); @@ -739,50 +737,40 @@ static int mtty_create(struct mdev_device *mdev) list_add(&mdev_state->next, &mdev_devices_list); mutex_unlock(&mdev_list_lock); + ret = vfio_register_group_dev(&mdev_state->vdev); + if (ret) { + kfree(mdev_state); + return ret; + } + dev_set_drvdata(&mdev->dev, mdev_state); return 0; } -static int mtty_remove(struct mdev_device *mdev) 
+static void mtty_remove(struct mdev_device *mdev) { - struct mdev_state *mds, *tmp_mds; - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - int ret = -EINVAL; + struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); + vfio_unregister_group_dev(&mdev_state->vdev); mutex_lock(&mdev_list_lock); - list_for_each_entry_safe(mds, tmp_mds, &mdev_devices_list, next) { - if (mdev_state == mds) { - list_del(&mdev_state->next); - mdev_set_drvdata(mdev, NULL); - kfree(mdev_state->vconfig); - kfree(mdev_state); - ret = 0; - break; - } - } + list_del(&mdev_state->next); mutex_unlock(&mdev_list_lock); - return ret; + kfree(mdev_state->vconfig); + kfree(mdev_state); } -static int mtty_reset(struct mdev_device *mdev) +static int mtty_reset(struct mdev_state *mdev_state) { - struct mdev_state *mdev_state; - - if (!mdev) - return -EINVAL; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -EINVAL; - pr_info("%s: called\n", __func__); return 0; } -static ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, +static ssize_t mtty_read(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -792,7 +780,7 @@ static ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, if (count >= 4 && !(*ppos % 4)) { u32 val; - ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -804,7 +792,7 @@ static ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, } else if (count >= 2 && !(*ppos % 2)) { u16 val; - ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -816,7 +804,7 @@ static ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, } else { u8 val; - ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -839,9 +827,11 @@ static ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, return -EFAULT; } -static ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, +static ssize_t mtty_write(struct vfio_device *vdev, const char __user *buf, size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -854,7 +844,7 @@ static ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -866,7 +856,7 @@ static ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -878,7 +868,7 @@ static ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -896,19 +886,11 @@ static ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, return -EFAULT; } -static int 
mtty_set_irqs(struct mdev_device *mdev, uint32_t flags, +static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags, unsigned int index, unsigned int start, unsigned int count, void *data) { int ret = 0; - struct mdev_state *mdev_state; - - if (!mdev) - return -EINVAL; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -EINVAL; mutex_lock(&mdev_state->ops_lock); switch (index) { @@ -1024,21 +1006,13 @@ static int mtty_trigger_interrupt(struct mdev_state *mdev_state) return ret; } -static int mtty_get_region_info(struct mdev_device *mdev, +static int mtty_get_region_info(struct mdev_state *mdev_state, struct vfio_region_info *region_info, u16 *cap_type_id, void **cap_type) { unsigned int size = 0; - struct mdev_state *mdev_state; u32 bar_index; - if (!mdev) - return -EINVAL; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -EINVAL; - bar_index = region_info->index; if (bar_index >= VFIO_PCI_NUM_REGIONS) return -EINVAL; @@ -1073,8 +1047,7 @@ static int mtty_get_region_info(struct mdev_device *mdev, return 0; } -static int mtty_get_irq_info(struct mdev_device *mdev, - struct vfio_irq_info *irq_info) +static int mtty_get_irq_info(struct vfio_irq_info *irq_info) { switch (irq_info->index) { case VFIO_PCI_INTX_IRQ_INDEX: @@ -1098,8 +1071,7 @@ static int mtty_get_irq_info(struct mdev_device *mdev, return 0; } -static int mtty_get_device_info(struct mdev_device *mdev, - struct vfio_device_info *dev_info) +static int mtty_get_device_info(struct vfio_device_info *dev_info) { dev_info->flags = VFIO_DEVICE_FLAGS_PCI; dev_info->num_regions = VFIO_PCI_NUM_REGIONS; @@ -1108,19 +1080,13 @@ static int mtty_get_device_info(struct mdev_device *mdev, return 0; } -static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, +static long mtty_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); int ret = 0; unsigned long minsz; - struct mdev_state *mdev_state; - - if (!mdev) - return -EINVAL; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -ENODEV; switch (cmd) { case VFIO_DEVICE_GET_INFO: @@ -1135,7 +1101,7 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, if (info.argsz < minsz) return -EINVAL; - ret = mtty_get_device_info(mdev, &info); + ret = mtty_get_device_info(&info); if (ret) return ret; @@ -1160,7 +1126,7 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, if (info.argsz < minsz) return -EINVAL; - ret = mtty_get_region_info(mdev, &info, &cap_type_id, + ret = mtty_get_region_info(mdev_state, &info, &cap_type_id, &cap_type); if (ret) return ret; @@ -1184,7 +1150,7 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, (info.index >= mdev_state->dev_info.num_irqs)) return -EINVAL; - ret = mtty_get_irq_info(mdev, &info); + ret = mtty_get_irq_info(&info); if (ret) return ret; @@ -1218,25 +1184,25 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, return PTR_ERR(data); } - ret = mtty_set_irqs(mdev, hdr.flags, hdr.index, hdr.start, + ret = mtty_set_irqs(mdev_state, hdr.flags, hdr.index, hdr.start, hdr.count, data); kfree(ptr); return ret; } case VFIO_DEVICE_RESET: - return mtty_reset(mdev); + return mtty_reset(mdev_state); } return -ENOTTY; } -static int mtty_open(struct mdev_device *mdev) +static int mtty_open(struct vfio_device *vdev) { pr_info("%s\n", __func__); return 0; } -static void mtty_close(struct mdev_device *mdev) +static void mtty_close(struct 
vfio_device *mdev) { pr_info("%s\n", __func__); } @@ -1351,18 +1317,31 @@ static struct attribute_group *mdev_type_groups[] = { NULL, }; +static const struct vfio_device_ops mtty_dev_ops = { + .name = "vfio-mtty", + .open = mtty_open, + .release = mtty_close, + .read = mtty_read, + .write = mtty_write, + .ioctl = mtty_ioctl, +}; + +static struct mdev_driver mtty_driver = { + .driver = { + .name = "mtty", + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + .dev_groups = mdev_dev_groups, + }, + .probe = mtty_probe, + .remove = mtty_remove, +}; + static const struct mdev_parent_ops mdev_fops = { .owner = THIS_MODULE, + .device_driver = &mtty_driver, .dev_attr_groups = mtty_dev_groups, - .mdev_attr_groups = mdev_dev_groups, .supported_type_groups = mdev_type_groups, - .create = mtty_create, - .remove = mtty_remove, - .open = mtty_open, - .release = mtty_close, - .read = mtty_read, - .write = mtty_write, - .ioctl = mtty_ioctl, }; static void mtty_device_release(struct device *dev) @@ -1393,12 +1372,16 @@ static int __init mtty_dev_init(void) pr_info("major_number:%d\n", MAJOR(mtty_dev.vd_devt)); + ret = mdev_register_driver(&mtty_driver); + if (ret) + goto err_cdev; + mtty_dev.vd_class = class_create(THIS_MODULE, MTTY_CLASS_NAME); if (IS_ERR(mtty_dev.vd_class)) { pr_err("Error: failed to register mtty_dev class\n"); ret = PTR_ERR(mtty_dev.vd_class); - goto failed1; + goto err_driver; } mtty_dev.dev.class = mtty_dev.vd_class; @@ -1407,28 +1390,25 @@ static int __init mtty_dev_init(void) ret = device_register(&mtty_dev.dev); if (ret) - goto failed2; + goto err_class; ret = mdev_register_device(&mtty_dev.dev, &mdev_fops); if (ret) - goto failed3; + goto err_device; mutex_init(&mdev_list_lock); INIT_LIST_HEAD(&mdev_devices_list); + return 0; - goto all_done; - -failed3: - +err_device: device_unregister(&mtty_dev.dev); -failed2: +err_class: class_destroy(mtty_dev.vd_class); - -failed1: +err_driver: + mdev_unregister_driver(&mtty_driver); +err_cdev: cdev_del(&mtty_dev.vd_cdev); unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK + 1); - -all_done: return ret; } @@ -1439,6 +1419,7 @@ static void __exit mtty_dev_exit(void) device_unregister(&mtty_dev.dev); idr_destroy(&mtty_dev.vd_idr); + mdev_unregister_driver(&mtty_driver); cdev_del(&mtty_dev.vd_cdev); unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK + 1); class_destroy(mtty_dev.vd_class); -- Gitee From 6209b5f4ff3bf0d9351d9c613497c7667e1ec4b6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Apr 2021 22:58:01 -0700 Subject: [PATCH 1646/2161] samples/vfio-mdev/mdpy: use remap_vmalloc_range commit 8c2acfe8c1df1c8baacbeee4c519683ae3f3d722 upstream. Patch series "remap_vmalloc_range cleanups". This series removes an open coded instance of remap_vmalloc_range and removes the unused remap_vmalloc_range_partial export. This patch (of 2): Use remap_vmalloc_range instead of open coding it using remap_vmalloc_range_partial. 
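For illustration, a minimal sketch of the resulting idiom: an mmap handler exposing a buffer obtained with vmalloc_user(). The example_state, memblk and memsize names below are placeholders, not code from this patch.

    #include <linux/vmalloc.h>
    #include <linux/mm.h>

    struct example_state {
            void *memblk;                   /* allocated with vmalloc_user() */
            size_t memsize;
    };

    static int example_mmap(struct example_state *state,
                            struct vm_area_struct *vma)
    {
            if (vma->vm_end - vma->vm_start > state->memsize)
                    return -EINVAL;

            /* One call maps the whole buffer; pgoff 0 starts at its base. */
            return remap_vmalloc_range(vma, state->memblk, 0);
    }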
Link: https://lkml.kernel.org/r/20210301082235.932968-1-hch@lst.de Link: https://lkml.kernel.org/r/20210301082235.932968-2-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Kirti Wankhede Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- samples/vfio-mdev/mdpy.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 885b88ea20e2..f0c0e7209719 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -406,9 +406,7 @@ static int mdpy_mmap(struct mdev_device *mdev, struct vm_area_struct *vma) if ((vma->vm_flags & VM_SHARED) == 0) return -EINVAL; - return remap_vmalloc_range_partial(vma, vma->vm_start, - mdev_state->memblk, 0, - vma->vm_end - vma->vm_start); + return remap_vmalloc_range(vma, mdev_state->memblk, 0); } static int mdpy_get_region_info(struct mdev_device *mdev, -- Gitee From 3c4c0b7496990857a1c8fb882e1c4b902119f17d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 17 Jun 2021 16:22:17 +0200 Subject: [PATCH 1647/2161] vfio/mdpy: Convert to use vfio_register_group_dev() commit 437e41368c01fba8c220d7ca2f6b9d7fde92beee upstream. This is straightforward conversion, the mdev_state is actually serving as the vfio_device and we can replace all the mdev_get_drvdata()'s and the wonky dead code with a simple container_of(). Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20210617142218.1877096-10-hch@lst.de Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- samples/vfio-mdev/mdpy.c | 159 ++++++++++++++++++++++----------------- 1 file changed, 88 insertions(+), 71 deletions(-) diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index f0c0e7209719..8f6a2ae52550 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -85,9 +85,11 @@ static struct class *mdpy_class; static struct cdev mdpy_cdev; static struct device mdpy_dev; static u32 mdpy_count; +static const struct vfio_device_ops mdpy_dev_ops; /* State of each mdev device */ struct mdev_state { + struct vfio_device vdev; u8 *vconfig; u32 bar_mask; struct mutex ops_lock; @@ -162,11 +164,9 @@ static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset, } } -static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, - loff_t pos, bool is_write) +static ssize_t mdev_access(struct mdev_state *mdev_state, char *buf, + size_t count, loff_t pos, bool is_write) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - struct device *dev = mdev_dev(mdev); int ret = 0; mutex_lock(&mdev_state->ops_lock); @@ -187,8 +187,9 @@ static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, memcpy(buf, mdev_state->memblk, count); } else { - dev_info(dev, "%s: %s @0x%llx (unhandled)\n", - __func__, is_write ? "WR" : "RD", pos); + dev_info(mdev_state->vdev.dev, + "%s: %s @0x%llx (unhandled)\n", __func__, + is_write ? 
"WR" : "RD", pos); ret = -1; goto accessfailed; } @@ -202,9 +203,8 @@ static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, return ret; } -static int mdpy_reset(struct mdev_device *mdev) +static int mdpy_reset(struct mdev_state *mdev_state) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); u32 stride, i; /* initialize with gray gradient */ @@ -216,13 +216,14 @@ static int mdpy_reset(struct mdev_device *mdev) return 0; } -static int mdpy_create(struct mdev_device *mdev) +static int mdpy_probe(struct mdev_device *mdev) { const struct mdpy_type *type = &mdpy_types[mdev_get_type_group_id(mdev)]; struct device *dev = mdev_dev(mdev); struct mdev_state *mdev_state; u32 fbsize; + int ret; if (mdpy_count >= max_devices) return -ENOMEM; @@ -230,6 +231,7 @@ static int mdpy_create(struct mdev_device *mdev) mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); if (mdev_state == NULL) return -ENOMEM; + vfio_init_group_dev(&mdev_state->vdev, &mdev->dev, &mdpy_dev_ops); mdev_state->vconfig = kzalloc(MDPY_CONFIG_SPACE_SIZE, GFP_KERNEL); if (mdev_state->vconfig == NULL) { @@ -250,36 +252,41 @@ static int mdpy_create(struct mdev_device *mdev) mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; - mdev_set_drvdata(mdev, mdev_state); - mdev_state->type = type; mdev_state->memsize = fbsize; mdpy_create_config_space(mdev_state); - mdpy_reset(mdev); + mdpy_reset(mdev_state); mdpy_count++; + + ret = vfio_register_group_dev(&mdev_state->vdev); + if (ret) { + kfree(mdev_state); + return ret; + } + dev_set_drvdata(&mdev->dev, mdev_state); return 0; } -static int mdpy_remove(struct mdev_device *mdev) +static void mdpy_remove(struct mdev_device *mdev) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - struct device *dev = mdev_dev(mdev); + struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); - dev_info(dev, "%s\n", __func__); + dev_info(&mdev->dev, "%s\n", __func__); - mdev_set_drvdata(mdev, NULL); + vfio_unregister_group_dev(&mdev_state->vdev); vfree(mdev_state->memblk); kfree(mdev_state->vconfig); kfree(mdev_state); mdpy_count--; - return 0; } -static ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf, +static ssize_t mdpy_read(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -289,8 +296,8 @@ static ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf, if (count >= 4 && !(*ppos % 4)) { u32 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), - *ppos, false); + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), + *ppos, false); if (ret <= 0) goto read_err; @@ -301,7 +308,7 @@ static ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf, } else if (count >= 2 && !(*ppos % 2)) { u16 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -313,7 +320,7 @@ static ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf, } else { u8 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -336,9 +343,11 @@ static ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf, return -EFAULT; } -static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf, +static ssize_t mdpy_write(struct vfio_device *vdev, const char __user *buf, size_t count, loff_t 
*ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -351,7 +360,7 @@ static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -363,7 +372,7 @@ static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -375,7 +384,7 @@ static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -393,9 +402,10 @@ static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf, return -EFAULT; } -static int mdpy_mmap(struct mdev_device *mdev, struct vm_area_struct *vma) +static int mdpy_mmap(struct vfio_device *vdev, struct vm_area_struct *vma) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); if (vma->vm_pgoff != MDPY_MEMORY_BAR_OFFSET >> PAGE_SHIFT) return -EINVAL; @@ -409,16 +419,10 @@ static int mdpy_mmap(struct mdev_device *mdev, struct vm_area_struct *vma) return remap_vmalloc_range(vma, mdev_state->memblk, 0); } -static int mdpy_get_region_info(struct mdev_device *mdev, +static int mdpy_get_region_info(struct mdev_state *mdev_state, struct vfio_region_info *region_info, u16 *cap_type_id, void **cap_type) { - struct mdev_state *mdev_state; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -EINVAL; - if (region_info->index >= VFIO_PCI_NUM_REGIONS && region_info->index != MDPY_DISPLAY_REGION) return -EINVAL; @@ -447,15 +451,13 @@ static int mdpy_get_region_info(struct mdev_device *mdev, return 0; } -static int mdpy_get_irq_info(struct mdev_device *mdev, - struct vfio_irq_info *irq_info) +static int mdpy_get_irq_info(struct vfio_irq_info *irq_info) { irq_info->count = 0; return 0; } -static int mdpy_get_device_info(struct mdev_device *mdev, - struct vfio_device_info *dev_info) +static int mdpy_get_device_info(struct vfio_device_info *dev_info) { dev_info->flags = VFIO_DEVICE_FLAGS_PCI; dev_info->num_regions = VFIO_PCI_NUM_REGIONS; @@ -463,11 +465,9 @@ static int mdpy_get_device_info(struct mdev_device *mdev, return 0; } -static int mdpy_query_gfx_plane(struct mdev_device *mdev, +static int mdpy_query_gfx_plane(struct mdev_state *mdev_state, struct vfio_device_gfx_plane_info *plane) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) { if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE | VFIO_GFX_PLANE_TYPE_REGION)) @@ -496,14 +496,13 @@ static int mdpy_query_gfx_plane(struct mdev_device *mdev, return 0; } -static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, +static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { int ret = 0; unsigned long minsz; - struct mdev_state *mdev_state; - - mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); switch (cmd) { case 
VFIO_DEVICE_GET_INFO: @@ -518,7 +517,7 @@ static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, if (info.argsz < minsz) return -EINVAL; - ret = mdpy_get_device_info(mdev, &info); + ret = mdpy_get_device_info(&info); if (ret) return ret; @@ -543,7 +542,7 @@ static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, if (info.argsz < minsz) return -EINVAL; - ret = mdpy_get_region_info(mdev, &info, &cap_type_id, + ret = mdpy_get_region_info(mdev_state, &info, &cap_type_id, &cap_type); if (ret) return ret; @@ -567,7 +566,7 @@ static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, (info.index >= mdev_state->dev_info.num_irqs)) return -EINVAL; - ret = mdpy_get_irq_info(mdev, &info); + ret = mdpy_get_irq_info(&info); if (ret) return ret; @@ -590,7 +589,7 @@ static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, if (plane.argsz < minsz) return -EINVAL; - ret = mdpy_query_gfx_plane(mdev, &plane); + ret = mdpy_query_gfx_plane(mdev_state, &plane); if (ret) return ret; @@ -604,12 +603,12 @@ static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, return -EINVAL; case VFIO_DEVICE_RESET: - return mdpy_reset(mdev); + return mdpy_reset(mdev_state); } return -ENOTTY; } -static int mdpy_open(struct mdev_device *mdev) +static int mdpy_open(struct vfio_device *vdev) { if (!try_module_get(THIS_MODULE)) return -ENODEV; @@ -617,7 +616,7 @@ static int mdpy_open(struct mdev_device *mdev) return 0; } -static void mdpy_close(struct mdev_device *mdev) +static void mdpy_close(struct vfio_device *vdev) { module_put(THIS_MODULE); } @@ -626,8 +625,7 @@ static ssize_t resolution_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct mdev_device *mdev = mdev_from_dev(dev); - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = dev_get_drvdata(dev); return sprintf(buf, "%dx%d\n", mdev_state->type->width, @@ -717,18 +715,30 @@ static struct attribute_group *mdev_type_groups[] = { NULL, }; +static const struct vfio_device_ops mdpy_dev_ops = { + .open = mdpy_open, + .release = mdpy_close, + .read = mdpy_read, + .write = mdpy_write, + .ioctl = mdpy_ioctl, + .mmap = mdpy_mmap, +}; + +static struct mdev_driver mdpy_driver = { + .driver = { + .name = "mdpy", + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + .dev_groups = mdev_dev_groups, + }, + .probe = mdpy_probe, + .remove = mdpy_remove, +}; + static const struct mdev_parent_ops mdev_fops = { .owner = THIS_MODULE, - .mdev_attr_groups = mdev_dev_groups, + .device_driver = &mdpy_driver, .supported_type_groups = mdev_type_groups, - .create = mdpy_create, - .remove = mdpy_remove, - .open = mdpy_open, - .release = mdpy_close, - .read = mdpy_read, - .write = mdpy_write, - .ioctl = mdpy_ioctl, - .mmap = mdpy_mmap, }; static const struct file_operations vd_fops = { @@ -753,11 +763,15 @@ static int __init mdpy_dev_init(void) cdev_add(&mdpy_cdev, mdpy_devt, MINORMASK + 1); pr_info("%s: major %d\n", __func__, MAJOR(mdpy_devt)); + ret = mdev_register_driver(&mdpy_driver); + if (ret) + goto err_cdev; + mdpy_class = class_create(THIS_MODULE, MDPY_CLASS_NAME); if (IS_ERR(mdpy_class)) { pr_err("Error: failed to register mdpy_dev class\n"); ret = PTR_ERR(mdpy_class); - goto failed1; + goto err_driver; } mdpy_dev.class = mdpy_class; mdpy_dev.release = mdpy_device_release; @@ -765,19 +779,21 @@ static int __init mdpy_dev_init(void) ret = device_register(&mdpy_dev); if (ret) - goto failed2; + goto err_class; ret = mdev_register_device(&mdpy_dev, &mdev_fops); if (ret) - goto 
failed3; + goto err_device; return 0; -failed3: +err_device: device_unregister(&mdpy_dev); -failed2: +err_class: class_destroy(mdpy_class); -failed1: +err_driver: + mdev_unregister_driver(&mdpy_driver); +err_cdev: cdev_del(&mdpy_cdev); unregister_chrdev_region(mdpy_devt, MINORMASK + 1); return ret; @@ -789,6 +805,7 @@ static void __exit mdpy_dev_exit(void) mdev_unregister_device(&mdpy_dev); device_unregister(&mdpy_dev); + mdev_unregister_driver(&mdpy_driver); cdev_del(&mdpy_cdev); unregister_chrdev_region(mdpy_devt, MINORMASK + 1); class_destroy(mdpy_class); -- Gitee From c25b2c8d22cd633c612f6704ea24612efef60e42 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 29 Apr 2021 12:53:27 +0300 Subject: [PATCH 1648/2161] vfio/mdev: remove unnecessary NULL check in mbochs_create() commit 698f99ed5e06946764c3be035ce9d62a2691e08c upstream. Originally "type" could be NULL and these checks were required, but we recently changed how "type" is assigned and that's no longer the case. Now "type" points to an element in the middle of a non-NULL array. Removing the checks does not affect runtime at all, but it makes the code a little bit simpler to read. Fixes: 3d3a360e570616 ("vfio/mbochs: Use mdev_get_type_group_id()") Signed-off-by: Dan Carpenter Reviewed-by: Jason Gunthorpe Message-Id: <20210429095327.GY1981@kadam> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- samples/vfio-mdev/mbochs.c | 2 -- samples/vfio-mdev/mdpy.c | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index b033576d1d3f..aca894460955 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -513,8 +513,6 @@ static int mbochs_create(struct mdev_device *mdev) struct device *dev = mdev_dev(mdev); struct mdev_state *mdev_state; - if (!type) - type = &mbochs_types[0]; if (type->mbytes + mbochs_used_mbytes > max_mbytes) return -ENOMEM; diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 8f6a2ae52550..7e9c9df0f05b 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -665,8 +665,7 @@ static ssize_t description_show(struct mdev_type *mtype, &mdpy_types[mtype_get_type_group_id(mtype)]; return sprintf(buf, "virtual display, %dx%d framebuffer\n", - type ? type->width : 0, - type ? type->height : 0); + type->width, type->height); } static MDEV_TYPE_ATTR_RO(description); -- Gitee From d12d9cf93f3fd2a6ebb0417c4e9834e21916dc13 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 17 Jun 2021 16:22:18 +0200 Subject: [PATCH 1649/2161] vfio/mbochs: Convert to use vfio_register_group_dev() commit 681c1615f8914451cfd432ad30e2f307b6490542 upstream. This is straightforward conversion, the mdev_state is actually serving as the vfio_device and we can replace all the mdev_get_drvdata()'s and the wonky dead code with a simple container_of(). 
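As a rough sketch of the shape shared by these conversions (generic names, not the exact sample code): the per-device state embeds the vfio_device, and each vfio_device_ops callback recovers it with container_of() instead of a drvdata lookup.

    #include <linux/vfio.h>

    struct example_state {
            struct vfio_device vdev;        /* must be embedded */
            /* ... driver-private fields ... */
    };

    static ssize_t example_read(struct vfio_device *vdev, char __user *buf,
                                size_t count, loff_t *ppos)
    {
            struct example_state *state =
                    container_of(vdev, struct example_state, vdev);

            /* ... access state ... */
            return count;
    }

On the probe side this pairs with vfio_init_group_dev() on &state->vdev followed by vfio_register_group_dev(), and remove unregisters before freeing, as in the diff below.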
Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20210617142218.1877096-11-hch@lst.de Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- samples/vfio-mdev/mbochs.c | 163 +++++++++++++++++++++---------------- 1 file changed, 91 insertions(+), 72 deletions(-) diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index aca894460955..10685332c564 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -130,6 +130,7 @@ static struct class *mbochs_class; static struct cdev mbochs_cdev; static struct device mbochs_dev; static int mbochs_used_mbytes; +static const struct vfio_device_ops mbochs_dev_ops; struct vfio_region_info_ext { struct vfio_region_info base; @@ -160,6 +161,7 @@ struct mbochs_dmabuf { /* State of each mdev device */ struct mdev_state { + struct vfio_device vdev; u8 *vconfig; u64 bar_mask[3]; u32 memory_bar_mask; @@ -425,11 +427,9 @@ static void handle_edid_blob(struct mdev_state *mdev_state, u16 offset, memcpy(buf, mdev_state->edid_blob + offset, count); } -static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, - loff_t pos, bool is_write) +static ssize_t mdev_access(struct mdev_state *mdev_state, char *buf, + size_t count, loff_t pos, bool is_write) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - struct device *dev = mdev_dev(mdev); struct page *pg; loff_t poff; char *map; @@ -478,7 +478,7 @@ static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, put_page(pg); } else { - dev_dbg(dev, "%s: %s @0x%llx (unhandled)\n", + dev_dbg(mdev_state->vdev.dev, "%s: %s @0x%llx (unhandled)\n", __func__, is_write ? "WR" : "RD", pos); ret = -1; goto accessfailed; @@ -493,9 +493,8 @@ static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, return ret; } -static int mbochs_reset(struct mdev_device *mdev) +static int mbochs_reset(struct mdev_state *mdev_state) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); u32 size64k = mdev_state->memsize / (64 * 1024); int i; @@ -506,12 +505,13 @@ static int mbochs_reset(struct mdev_device *mdev) return 0; } -static int mbochs_create(struct mdev_device *mdev) +static int mbochs_probe(struct mdev_device *mdev) { const struct mbochs_type *type = &mbochs_types[mdev_get_type_group_id(mdev)]; struct device *dev = mdev_dev(mdev); struct mdev_state *mdev_state; + int ret = -ENOMEM; if (type->mbytes + mbochs_used_mbytes > max_mbytes) return -ENOMEM; @@ -519,6 +519,7 @@ static int mbochs_create(struct mdev_device *mdev) mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); if (mdev_state == NULL) return -ENOMEM; + vfio_init_group_dev(&mdev_state->vdev, &mdev->dev, &mbochs_dev_ops); mdev_state->vconfig = kzalloc(MBOCHS_CONFIG_SPACE_SIZE, GFP_KERNEL); if (mdev_state->vconfig == NULL) @@ -537,7 +538,6 @@ static int mbochs_create(struct mdev_device *mdev) mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; - mdev_set_drvdata(mdev, mdev_state); INIT_LIST_HEAD(&mdev_state->dmabufs); mdev_state->next_id = 1; @@ -547,32 +547,38 @@ static int mbochs_create(struct mdev_device *mdev) mdev_state->edid_regs.edid_offset = MBOCHS_EDID_BLOB_OFFSET; mdev_state->edid_regs.edid_max_size = sizeof(mdev_state->edid_blob); mbochs_create_config_space(mdev_state); - mbochs_reset(mdev); + mbochs_reset(mdev_state); mbochs_used_mbytes += type->mbytes; + + ret = vfio_register_group_dev(&mdev_state->vdev); + if (ret) 
+ goto err_mem; + dev_set_drvdata(&mdev->dev, mdev_state); return 0; err_mem: kfree(mdev_state->vconfig); kfree(mdev_state); - return -ENOMEM; + return ret; } -static int mbochs_remove(struct mdev_device *mdev) +static void mbochs_remove(struct mdev_device *mdev) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); mbochs_used_mbytes -= mdev_state->type->mbytes; - mdev_set_drvdata(mdev, NULL); + vfio_unregister_group_dev(&mdev_state->vdev); kfree(mdev_state->pages); kfree(mdev_state->vconfig); kfree(mdev_state); - return 0; } -static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf, +static ssize_t mbochs_read(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -582,7 +588,7 @@ static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf, if (count >= 4 && !(*ppos % 4)) { u32 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -594,7 +600,7 @@ static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf, } else if (count >= 2 && !(*ppos % 2)) { u16 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -606,7 +612,7 @@ static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf, } else { u8 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -629,9 +635,11 @@ static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf, return -EFAULT; } -static ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf, +static ssize_t mbochs_write(struct vfio_device *vdev, const char __user *buf, size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -644,7 +652,7 @@ static ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -656,7 +664,7 @@ static ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -668,7 +676,7 @@ static ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -754,9 +762,10 @@ static const struct vm_operations_struct mbochs_region_vm_ops = { .fault = mbochs_region_vm_fault, }; -static int mbochs_mmap(struct mdev_device *mdev, struct vm_area_struct *vma) +static int mbochs_mmap(struct vfio_device *vdev, struct vm_area_struct *vma) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); if (vma->vm_pgoff != 
MBOCHS_MEMORY_BAR_OFFSET >> PAGE_SHIFT) return -EINVAL; @@ -978,7 +987,7 @@ mbochs_dmabuf_find_by_id(struct mdev_state *mdev_state, u32 id) static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf) { struct mdev_state *mdev_state = dmabuf->mdev_state; - struct device *dev = mdev_dev(mdev_state->mdev); + struct device *dev = mdev_state->vdev.dev; DEFINE_DMA_BUF_EXPORT_INFO(exp_info); struct dma_buf *buf; @@ -1006,15 +1015,10 @@ static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf) return 0; } -static int mbochs_get_region_info(struct mdev_device *mdev, +static int mbochs_get_region_info(struct mdev_state *mdev_state, struct vfio_region_info_ext *ext) { struct vfio_region_info *region_info = &ext->base; - struct mdev_state *mdev_state; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -EINVAL; if (region_info->index >= MBOCHS_NUM_REGIONS) return -EINVAL; @@ -1062,15 +1066,13 @@ static int mbochs_get_region_info(struct mdev_device *mdev, return 0; } -static int mbochs_get_irq_info(struct mdev_device *mdev, - struct vfio_irq_info *irq_info) +static int mbochs_get_irq_info(struct vfio_irq_info *irq_info) { irq_info->count = 0; return 0; } -static int mbochs_get_device_info(struct mdev_device *mdev, - struct vfio_device_info *dev_info) +static int mbochs_get_device_info(struct vfio_device_info *dev_info) { dev_info->flags = VFIO_DEVICE_FLAGS_PCI; dev_info->num_regions = MBOCHS_NUM_REGIONS; @@ -1078,11 +1080,9 @@ static int mbochs_get_device_info(struct mdev_device *mdev, return 0; } -static int mbochs_query_gfx_plane(struct mdev_device *mdev, +static int mbochs_query_gfx_plane(struct mdev_state *mdev_state, struct vfio_device_gfx_plane_info *plane) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - struct device *dev = mdev_dev(mdev); struct mbochs_dmabuf *dmabuf; struct mbochs_mode mode; int ret; @@ -1136,18 +1136,16 @@ static int mbochs_query_gfx_plane(struct mdev_device *mdev, done: if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY && mdev_state->active_id != plane->dmabuf_id) { - dev_dbg(dev, "%s: primary: %d => %d\n", __func__, - mdev_state->active_id, plane->dmabuf_id); + dev_dbg(mdev_state->vdev.dev, "%s: primary: %d => %d\n", + __func__, mdev_state->active_id, plane->dmabuf_id); mdev_state->active_id = plane->dmabuf_id; } mutex_unlock(&mdev_state->ops_lock); return 0; } -static int mbochs_get_gfx_dmabuf(struct mdev_device *mdev, - u32 id) +static int mbochs_get_gfx_dmabuf(struct mdev_state *mdev_state, u32 id) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); struct mbochs_dmabuf *dmabuf; mutex_lock(&mdev_state->ops_lock); @@ -1169,9 +1167,11 @@ static int mbochs_get_gfx_dmabuf(struct mdev_device *mdev, return dma_buf_fd(dmabuf->buf, 0); } -static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, - unsigned long arg) +static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd, + unsigned long arg) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); int ret = 0; unsigned long minsz, outsz; @@ -1188,7 +1188,7 @@ static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, if (info.argsz < minsz) return -EINVAL; - ret = mbochs_get_device_info(mdev, &info); + ret = mbochs_get_device_info(&info); if (ret) return ret; @@ -1212,7 +1212,7 @@ static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, if (outsz > sizeof(info)) return -EINVAL; - ret = mbochs_get_region_info(mdev, &info); + ret = mbochs_get_region_info(mdev_state, &info); if (ret) return ret; @@ -1235,7 +1235,7 
@@ static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, (info.index >= VFIO_PCI_NUM_IRQS)) return -EINVAL; - ret = mbochs_get_irq_info(mdev, &info); + ret = mbochs_get_irq_info(&info); if (ret) return ret; @@ -1258,7 +1258,7 @@ static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, if (plane.argsz < minsz) return -EINVAL; - ret = mbochs_query_gfx_plane(mdev, &plane); + ret = mbochs_query_gfx_plane(mdev_state, &plane); if (ret) return ret; @@ -1275,19 +1275,19 @@ static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, if (get_user(dmabuf_id, (__u32 __user *)arg)) return -EFAULT; - return mbochs_get_gfx_dmabuf(mdev, dmabuf_id); + return mbochs_get_gfx_dmabuf(mdev_state, dmabuf_id); } case VFIO_DEVICE_SET_IRQS: return -EINVAL; case VFIO_DEVICE_RESET: - return mbochs_reset(mdev); + return mbochs_reset(mdev_state); } return -ENOTTY; } -static int mbochs_open(struct mdev_device *mdev) +static int mbochs_open(struct vfio_device *vdev) { if (!try_module_get(THIS_MODULE)) return -ENODEV; @@ -1295,9 +1295,10 @@ static int mbochs_open(struct mdev_device *mdev) return 0; } -static void mbochs_close(struct mdev_device *mdev) +static void mbochs_close(struct vfio_device *vdev) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); struct mbochs_dmabuf *dmabuf, *tmp; mutex_lock(&mdev_state->ops_lock); @@ -1321,8 +1322,7 @@ static ssize_t memory_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct mdev_device *mdev = mdev_from_dev(dev); - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = dev_get_drvdata(dev); return sprintf(buf, "%d MB\n", mdev_state->type->mbytes); } @@ -1413,18 +1413,30 @@ static struct attribute_group *mdev_type_groups[] = { NULL, }; +static const struct vfio_device_ops mbochs_dev_ops = { + .open = mbochs_open, + .release = mbochs_close, + .read = mbochs_read, + .write = mbochs_write, + .ioctl = mbochs_ioctl, + .mmap = mbochs_mmap, +}; + +static struct mdev_driver mbochs_driver = { + .driver = { + .name = "mbochs", + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + .dev_groups = mdev_dev_groups, + }, + .probe = mbochs_probe, + .remove = mbochs_remove, +}; + static const struct mdev_parent_ops mdev_fops = { .owner = THIS_MODULE, - .mdev_attr_groups = mdev_dev_groups, + .device_driver = &mbochs_driver, .supported_type_groups = mdev_type_groups, - .create = mbochs_create, - .remove = mbochs_remove, - .open = mbochs_open, - .release = mbochs_close, - .read = mbochs_read, - .write = mbochs_write, - .ioctl = mbochs_ioctl, - .mmap = mbochs_mmap, }; static const struct file_operations vd_fops = { @@ -1449,11 +1461,15 @@ static int __init mbochs_dev_init(void) cdev_add(&mbochs_cdev, mbochs_devt, MINORMASK + 1); pr_info("%s: major %d\n", __func__, MAJOR(mbochs_devt)); + ret = mdev_register_driver(&mbochs_driver); + if (ret) + goto err_cdev; + mbochs_class = class_create(THIS_MODULE, MBOCHS_CLASS_NAME); if (IS_ERR(mbochs_class)) { pr_err("Error: failed to register mbochs_dev class\n"); ret = PTR_ERR(mbochs_class); - goto failed1; + goto err_driver; } mbochs_dev.class = mbochs_class; mbochs_dev.release = mbochs_device_release; @@ -1461,19 +1477,21 @@ static int __init mbochs_dev_init(void) ret = device_register(&mbochs_dev); if (ret) - goto failed2; + goto err_class; ret = mdev_register_device(&mbochs_dev, &mdev_fops); if (ret) - goto failed3; + goto err_device; return 0; -failed3: +err_device: 
device_unregister(&mbochs_dev); -failed2: +err_class: class_destroy(mbochs_class); -failed1: +err_driver: + mdev_unregister_driver(&mbochs_driver); +err_cdev: cdev_del(&mbochs_cdev); unregister_chrdev_region(mbochs_devt, MINORMASK + 1); return ret; @@ -1485,6 +1503,7 @@ static void __exit mbochs_dev_exit(void) mdev_unregister_device(&mbochs_dev); device_unregister(&mbochs_dev); + mdev_unregister_driver(&mbochs_driver); cdev_del(&mbochs_cdev); unregister_chrdev_region(mbochs_devt, MINORMASK + 1); class_destroy(mbochs_class); -- Gitee From aa03308c27a157c18fbec27e829f624a9c353afb Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 8 Jun 2021 14:28:41 +0300 Subject: [PATCH 1650/2161] vfio/iommu_type1: rename vfio_group struck to vfio_iommu_group commit c7396f2eac2bf9d767d9cf49bd26224fbb894aaf upstream. The vfio_group structure is already defined in vfio module so in order to improve code readability and for simplicity, rename the vfio_group structure in vfio_iommu_type1 module to vfio_iommu_group. Signed-off-by: Max Gurtovoy Link: https://lore.kernel.org/r/20210608112841.51897-1-mgurtovoy@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 34 +++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 48d435d7b969..a27a96b7b8f6 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -111,7 +111,7 @@ struct vfio_batch { int offset; /* of next entry in pages */ }; -struct vfio_group { +struct vfio_iommu_group { struct iommu_group *iommu_group; struct list_head next; bool mdev_group; /* An mdev group */ @@ -161,8 +161,9 @@ struct vfio_regions { static int put_pfn(unsigned long pfn, int prot); -static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, - struct iommu_group *iommu_group); +static struct vfio_iommu_group* +vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, + struct iommu_group *iommu_group); /* * This code handles mapping and unmapping of user data buffers @@ -837,7 +838,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, unsigned long *phys_pfn) { struct vfio_iommu *iommu = iommu_data; - struct vfio_group *group; + struct vfio_iommu_group *group; int i, j, ret; unsigned long remote_vaddr; struct vfio_dma *dma; @@ -1876,10 +1877,10 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain) __free_pages(pages, order); } -static struct vfio_group *find_iommu_group(struct vfio_domain *domain, - struct iommu_group *iommu_group) +static struct vfio_iommu_group *find_iommu_group(struct vfio_domain *domain, + struct iommu_group *iommu_group) { - struct vfio_group *g; + struct vfio_iommu_group *g; list_for_each_entry(g, &domain->group_list, next) { if (g->iommu_group == iommu_group) @@ -1889,11 +1890,12 @@ static struct vfio_group *find_iommu_group(struct vfio_domain *domain, return NULL; } -static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, - struct iommu_group *iommu_group) +static struct vfio_iommu_group* +vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, + struct iommu_group *iommu_group) { struct vfio_domain *domain; - struct vfio_group *group = NULL; + struct vfio_iommu_group *group = NULL; list_for_each_entry(domain, &iommu->domain_list, next) { group = find_iommu_group(domain, iommu_group); @@ -1968,7 +1970,7 @@ static int vfio_mdev_detach_domain(struct device *dev, void *data) } static int 
vfio_iommu_attach_group(struct vfio_domain *domain, - struct vfio_group *group) + struct vfio_iommu_group *group) { if (group->mdev_group) return iommu_group_for_each_dev(group->iommu_group, @@ -1979,7 +1981,7 @@ static int vfio_iommu_attach_group(struct vfio_domain *domain, } static void vfio_iommu_detach_group(struct vfio_domain *domain, - struct vfio_group *group) + struct vfio_iommu_group *group) { if (group->mdev_group) iommu_group_for_each_dev(group->iommu_group, domain->domain, @@ -2243,7 +2245,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, struct iommu_group *iommu_group) { struct vfio_iommu *iommu = iommu_data; - struct vfio_group *group; + struct vfio_iommu_group *group; struct vfio_domain *domain, *d; struct bus_type *bus = NULL; int ret; @@ -2522,7 +2524,7 @@ static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu, struct list_head *iova_copy) { struct vfio_domain *d; - struct vfio_group *g; + struct vfio_iommu_group *g; struct vfio_iova *node; dma_addr_t start, end; LIST_HEAD(resv_regions); @@ -2564,7 +2566,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, { struct vfio_iommu *iommu = iommu_data; struct vfio_domain *domain; - struct vfio_group *group; + struct vfio_iommu_group *group; bool update_dirty_scope = false; LIST_HEAD(iova_copy); @@ -2685,7 +2687,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) static void vfio_release_domain(struct vfio_domain *domain, bool external) { - struct vfio_group *group, *group_tmp; + struct vfio_iommu_group *group, *group_tmp; list_for_each_entry_safe(group, group_tmp, &domain->group_list, next) { -- Gitee From 87ac3c72cb447875e6d6d8ee8547dc8ddc6714d6 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 22 Jun 2021 19:37:10 +0100 Subject: [PATCH 1651/2161] vfio/mdpy: Fix memory leak of object mdev_state->vconfig commit 0af5160edb87b1868eba514422d3991628a018f8 upstream. In the case where the call to vfio_register_group_dev fails the error return path kfree's mdev_state but not mdev_state->vconfig. Fix this by kfree'ing mdev_state->vconfig before returning. Addresses-Coverity: ("Resource leak") Fixes: 437e41368c01 ("vfio/mdpy: Convert to use vfio_register_group_dev()") Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20210622183710.28954-1-colin.king@canonical.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- samples/vfio-mdev/mdpy.c | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 7e9c9df0f05b..393c9df6f6a0 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -261,6 +261,7 @@ static int mdpy_probe(struct mdev_device *mdev) ret = vfio_register_group_dev(&mdev_state->vdev); if (ret) { + kfree(mdev_state->vconfig); kfree(mdev_state); return ret; } -- Gitee From a80014d719beb9222848f3b52a2072bab033aafe Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Tue, 22 Jun 2021 19:28:23 -0700 Subject: [PATCH 1652/2161] PCI: Export pci_dev_trylock() and pci_dev_unlock() commit e3a9b1212b9d6cb20751196e338f4a5138d539d3 upstream. Other places in the kernel use this form, and so just provide a common path for it. 
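A short sketch of the kind of caller the export enables; the function name is made up, but the vfio-pci conversion in the next patch has this exact shape.

    #include <linux/pci.h>

    static void example_try_reset(struct pci_dev *pdev)
    {
            if (!pci_dev_trylock(pdev))     /* returns 1 on success, 0 on contention */
                    return;

            if (!__pci_reset_function_locked(pdev))
                    dev_info(&pdev->dev, "reset done\n");

            pci_dev_unlock(pdev);
    }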
Acked-by: Bjorn Helgaas Signed-off-by: Luis Chamberlain Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/20210623022824.308041-2-mcgrof@kernel.org Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pci.c | 6 ++++-- include/linux/pci.h | 3 +++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 239018fc7f42..be2540ff6641 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4905,7 +4905,7 @@ static void pci_dev_lock(struct pci_dev *dev) } /* Return 1 on successful lock, 0 on contention */ -static int pci_dev_trylock(struct pci_dev *dev) +int pci_dev_trylock(struct pci_dev *dev) { if (pci_cfg_access_trylock(dev)) { if (device_trylock(&dev->dev)) @@ -4915,12 +4915,14 @@ static int pci_dev_trylock(struct pci_dev *dev) return 0; } +EXPORT_SYMBOL_GPL(pci_dev_trylock); -static void pci_dev_unlock(struct pci_dev *dev) +void pci_dev_unlock(struct pci_dev *dev) { device_unlock(&dev->dev); pci_cfg_access_unlock(dev); } +EXPORT_SYMBOL_GPL(pci_dev_unlock); static void pci_dev_save_and_disable(struct pci_dev *dev) { diff --git a/include/linux/pci.h b/include/linux/pci.h index 4f61bbd9f943..284b723d1f08 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1604,6 +1604,9 @@ void pci_cfg_access_lock(struct pci_dev *dev); bool pci_cfg_access_trylock(struct pci_dev *dev); void pci_cfg_access_unlock(struct pci_dev *dev); +int pci_dev_trylock(struct pci_dev *dev); +void pci_dev_unlock(struct pci_dev *dev); + /* * PCI domain support. Sometimes called PCI segment (eg by ACPI), * a PCI domain is defined to be a set of PCI buses which share -- Gitee From 46dbf22c307d0054c144a70da7c466fd50939085 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Tue, 22 Jun 2021 19:28:24 -0700 Subject: [PATCH 1653/2161] vfio: use the new pci_dev_trylock() helper to simplify try lock commit 742b4c0d1efe7a7640ad17f1bbf696a1305f6495 upstream. Use the new pci_dev_trylock() helper to simplify our locking. Signed-off-by: Luis Chamberlain Reviewed-by: Cornelia Huck Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20210623022824.308041-3-mcgrof@kernel.org Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index e03d560b79c6..56f68096b344 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -495,13 +495,10 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) * We can not use the "try" reset interface here, which will * overwrite the previously restored configuration information. */ - if (vdev->reset_works && pci_cfg_access_trylock(pdev)) { - if (device_trylock(&pdev->dev)) { - if (!__pci_reset_function_locked(pdev)) - vdev->needs_reset = false; - device_unlock(&pdev->dev); - } - pci_cfg_access_unlock(pdev); + if (vdev->reset_works && pci_dev_trylock(pdev)) { + if (!__pci_reset_function_locked(pdev)) + vdev->needs_reset = false; + pci_dev_unlock(pdev); } pci_restore_state(pdev); -- Gitee From 0337f1daea1dd30eaabcf704d6d73614a95bfa35 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 25 Jun 2021 12:56:04 -0300 Subject: [PATCH 1654/2161] vfio/mtty: Delete mdev_devices_list commit 0dd1b7fc3e7d30802d5839f6bf8957023b437ad4 upstream. Dan points out that an error case left things on this list. It is also missing locking in available_instances_show(). 
Further study shows the list isn't needed at all, just store the total ports in use in an atomic and delete the whole thing. Reported-by: Dan Carpenter Fixes: 09177ac91921 ("vfio/mtty: Convert to use vfio_register_group_dev()") Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v1-0bc56b362ca7+62-mtty_used_ports_jgg@nvidia.com Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- samples/vfio-mdev/mtty.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index faf9b8e8873a..ffbaf07a17ea 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -144,8 +144,7 @@ struct mdev_state { int nr_ports; }; -static struct mutex mdev_list_lock; -static struct list_head mdev_devices_list; +static atomic_t mdev_used_ports; static const struct file_operations vd_fops = { .owner = THIS_MODULE, @@ -733,15 +732,13 @@ static int mtty_probe(struct mdev_device *mdev) mtty_create_config_space(mdev_state); - mutex_lock(&mdev_list_lock); - list_add(&mdev_state->next, &mdev_devices_list); - mutex_unlock(&mdev_list_lock); - ret = vfio_register_group_dev(&mdev_state->vdev); if (ret) { kfree(mdev_state); return ret; } + atomic_add(mdev_state->nr_ports, &mdev_used_ports); + dev_set_drvdata(&mdev->dev, mdev_state); return 0; } @@ -750,10 +747,8 @@ static void mtty_remove(struct mdev_device *mdev) { struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); + atomic_sub(mdev_state->nr_ports, &mdev_used_ports); vfio_unregister_group_dev(&mdev_state->vdev); - mutex_lock(&mdev_list_lock); - list_del(&mdev_state->next); - mutex_unlock(&mdev_list_lock); kfree(mdev_state->vconfig); kfree(mdev_state); @@ -1274,14 +1269,10 @@ static ssize_t available_instances_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - struct mdev_state *mds; unsigned int ports = mtype_get_type_group_id(mtype) + 1; - int used = 0; - list_for_each_entry(mds, &mdev_devices_list, next) - used += mds->nr_ports; - - return sprintf(buf, "%d\n", (MAX_MTTYS - used)/ports); + return sprintf(buf, "%d\n", + (MAX_MTTYS - atomic_read(&mdev_used_ports)) / ports); } static MDEV_TYPE_ATTR_RO(available_instances); @@ -1395,9 +1386,6 @@ static int __init mtty_dev_init(void) ret = mdev_register_device(&mtty_dev.dev, &mdev_fops); if (ret) goto err_device; - - mutex_init(&mdev_list_lock); - INIT_LIST_HEAD(&mdev_devices_list); return 0; err_device: -- Gitee From c6431e7fdf4ab243320c36aaf635d6fd03d32c80 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 25 Jun 2021 15:20:06 -0600 Subject: [PATCH 1655/2161] vfio/mtty: Enforce available_instances commit 97d0a6874478802b68e3bea7aa9b9a333d257182 upstream. The sample mtty mdev driver doesn't actually enforce the number of device instances it claims are available. Implement this properly. 
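The loop below is a distilled sketch of the reservation scheme in the diff; MAX_UNITS and the function name are illustrative, and the reserved capacity must be handed back with atomic_add() on every failure path and on remove.

    #include <linux/atomic.h>
    #include <linux/errno.h>

    #define MAX_UNITS 24                    /* arbitrary for the sketch */

    static atomic_t avail_units = ATOMIC_INIT(MAX_UNITS);

    static int example_reserve(int want)
    {
            int cur = atomic_read(&avail_units);

            do {
                    if (cur < want)
                            return -ENOSPC;
            } while (!atomic_try_cmpxchg(&avail_units, &cur, cur - want));

            return 0;
    }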
Link: https://lore.kernel.org/r/162465624894.3338367.12935940647049917981.stgit@omen Reviewed-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Reviewed by: Kirti Wankhede Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- samples/vfio-mdev/mtty.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index ffbaf07a17ea..8b26fecc4afe 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -144,7 +144,7 @@ struct mdev_state { int nr_ports; }; -static atomic_t mdev_used_ports; +static atomic_t mdev_avail_ports = ATOMIC_INIT(MAX_MTTYS); static const struct file_operations vd_fops = { .owner = THIS_MODULE, @@ -707,11 +707,20 @@ static int mtty_probe(struct mdev_device *mdev) { struct mdev_state *mdev_state; int nr_ports = mdev_get_type_group_id(mdev) + 1; + int avail_ports = atomic_read(&mdev_avail_ports); int ret; + do { + if (avail_ports < nr_ports) + return -ENOSPC; + } while (!atomic_try_cmpxchg(&mdev_avail_ports, + &avail_ports, avail_ports - nr_ports)); + mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); - if (mdev_state == NULL) + if (mdev_state == NULL) { + atomic_add(nr_ports, &mdev_avail_ports); return -ENOMEM; + } vfio_init_group_dev(&mdev_state->vdev, &mdev->dev, &mtty_dev_ops); @@ -724,6 +733,7 @@ static int mtty_probe(struct mdev_device *mdev) if (mdev_state->vconfig == NULL) { kfree(mdev_state); + atomic_add(nr_ports, &mdev_avail_ports); return -ENOMEM; } @@ -735,9 +745,9 @@ static int mtty_probe(struct mdev_device *mdev) ret = vfio_register_group_dev(&mdev_state->vdev); if (ret) { kfree(mdev_state); + atomic_add(nr_ports, &mdev_avail_ports); return ret; } - atomic_add(mdev_state->nr_ports, &mdev_used_ports); dev_set_drvdata(&mdev->dev, mdev_state); return 0; @@ -746,12 +756,13 @@ static int mtty_probe(struct mdev_device *mdev) static void mtty_remove(struct mdev_device *mdev) { struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); + int nr_ports = mdev_state->nr_ports; - atomic_sub(mdev_state->nr_ports, &mdev_used_ports); vfio_unregister_group_dev(&mdev_state->vdev); kfree(mdev_state->vconfig); kfree(mdev_state); + atomic_add(nr_ports, &mdev_avail_ports); } static int mtty_reset(struct mdev_state *mdev_state) @@ -1271,8 +1282,7 @@ static ssize_t available_instances_show(struct mdev_type *mtype, { unsigned int ports = mtype_get_type_group_id(mtype) + 1; - return sprintf(buf, "%d\n", - (MAX_MTTYS - atomic_read(&mdev_used_ports)) / ports); + return sprintf(buf, "%d\n", atomic_read(&mdev_avail_ports) / ports); } static MDEV_TYPE_ATTR_RO(available_instances); -- Gitee From 08dec5b5ad55a5f8b63eed57e7866cbb666e91c2 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:38:50 -0700 Subject: [PATCH 1656/2161] mm: add vma_lookup(), update find_vma_intersection() comments commit ce6d42f2e4a2d98898419743b037a95661e3ac9d upstream. Patch series "mm: Add vma_lookup()", v2. Many places in the kernel use find_vma() to get a vma and then check the start address of the vma to ensure the next vma was not returned. Other places use the find_vma_intersection() call with add, addr + 1 as the range; looking for just the vma at a specific address. The third use of find_vma() is by developers who do not know that the function starts searching at the provided address upwards for the next vma. This results in a bug that is often overlooked for a long time. 
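For illustration (not part of the patch), the fragile idiom described above shown next to the helper that replaces it; both assume the caller already holds the mmap lock.

    #include <linux/mm.h>

    static struct vm_area_struct *lookup_old(struct mm_struct *mm,
                                             unsigned long addr)
    {
            /* find_vma() returns the next VMA at or above addr ... */
            struct vm_area_struct *vma = find_vma(mm, addr);

            /* ... so addr itself is mapped only if this extra check passes. */
            if (vma && addr < vma->vm_start)
                    vma = NULL;

            return vma;
    }

    static struct vm_area_struct *lookup_new(struct mm_struct *mm,
                                             unsigned long addr)
    {
            return vma_lookup(mm, addr);    /* same semantics, one call */
    }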
Adding the new vma_lookup() function will allow for cleaner code by removing the find_vma() calls which check limits, making find_vma_intersection() calls of a single address to be shorter, and potentially reduce the incorrect uses of find_vma(). This patch (of 22): Many places in the kernel use find_vma() to get a vma and then check the start address of the vma to ensure the next vma was not returned. Other places use the find_vma_intersection() call with add, addr + 1 as the range; looking for just the vma at a specific address. The third use of find_vma() is by developers who do not know that the function starts searching at the provided address upwards for the next vma. This results in a bug that is often overlooked for a long time. Adding the new vma_lookup() function will allow for cleaner code by removing the find_vma() calls which check limits, making find_vma_intersection() calls of a single address to be shorter, and potentially reduce the incorrect uses of find_vma(). Also change find_vma_intersection() comments and declaration to be of the correct length and add kernel documentation style comment. Link: https://lkml.kernel.org/r/20210521174745.2219620-1-Liam.Howlett@Oracle.com Link: https://lkml.kernel.org/r/20210521174745.2219620-2-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: David Miller Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/mm.h | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 60d9d7c5a10b..1cc662ad866b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2489,17 +2489,45 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); -/* Look up the first VMA which intersects the interval start_addr..end_addr-1, - NULL if none. Assume start_addr < end_addr. */ -static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) +/** + * find_vma_intersection() - Look up the first VMA which intersects the interval + * @mm: The process address space. + * @start_addr: The inclusive start user address. + * @end_addr: The exclusive end user address. + * + * Returns: The first VMA within the provided range, %NULL otherwise. Assumes + * start_addr < end_addr. + */ +static inline +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) { - struct vm_area_struct * vma = find_vma(mm,start_addr); + struct vm_area_struct *vma = find_vma(mm, start_addr); if (vma && end_addr <= vma->vm_start) vma = NULL; return vma; } +/** + * vma_lookup() - Find a VMA at a specific address + * @mm: The process address space. + * @addr: The user address. + * + * Return: The vm_area_struct at the given address, %NULL otherwise. 
+ */ +static inline +struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = find_vma(mm, addr); + + if (vma && addr < vma->vm_start) + vma = NULL; + + return vma; +} + static inline unsigned long vm_start_gap(struct vm_area_struct *vma) { unsigned long vm_start = vma->vm_start; -- Gitee From 2388bf3c4760136a038c815ca03faadf6acd1da3 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:20 -0700 Subject: [PATCH 1657/2161] vfio: use vma_lookup() instead of find_vma_intersection() commit 85715d6809014870a8a4d498b292fc5711a969e7 upstream. vma_lookup() finds the vma of a specific address with a cleaner interface and is more readable. Link: https://lkml.kernel.org/r/20210521174745.2219620-12-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/gpu/drm/i915/i915_vma.c | 4 ++-- drivers/vfio/vfio_iommu_type1.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index e0e677b2a3a9..9439cd995afc 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -230,7 +230,7 @@ vma_create(struct drm_i915_gem_object *obj, } static struct i915_vma * -vma_lookup(struct drm_i915_gem_object *obj, +i915_vma_lookup(struct drm_i915_gem_object *obj, struct i915_address_space *vm, const struct i915_ggtt_view *view) { @@ -280,7 +280,7 @@ i915_vma_instance(struct drm_i915_gem_object *obj, GEM_BUG_ON(vm->closed); spin_lock(&obj->vma.lock); - vma = vma_lookup(obj, vm, view); + vma = i915_vma_lookup(obj, vm, view); spin_unlock(&obj->vma.lock); /* vma_create() will resolve the race if another creates the vma */ diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index a27a96b7b8f6..8fd94cf11a9c 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -569,7 +569,7 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, vaddr = untagged_addr(vaddr); retry: - vma = find_vma_intersection(mm, vaddr, vaddr + 1); + vma = vma_lookup(mm, vaddr); if (vma && vma->vm_flags & VM_PFNMAP) { ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE); -- Gitee From 04bcbf7f7218f1c3d217f64092d99f2e426d26d8 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 21 Jul 2021 10:05:48 -0300 Subject: [PATCH 1658/2161] vfio/pci: Make vfio_pci_regops->rw() return ssize_t commit e7500b3ede2c66380a7e9faa6a81e6df2f8e4e55 upstream. The only implementation of this in IGD returns a -ERRNO which is implicitly cast through a size_t and then casted again and returned as a ssize_t in vfio_pci_rw(). Fix the vfio_pci_regops->rw() return type to be ssize_t so all is consistent. 
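A small sketch of the failure mode being closed, with hypothetical handler names: a negative errno pushed through an unsigned size_t wraps to a huge positive value, so the caller's usual "< 0" check can never fire.

    #include <linux/errno.h>
    #include <linux/types.h>

    /* Shown only for contrast: -EINVAL wraps to SIZE_MAX - 21 here. */
    static __maybe_unused size_t broken_rw(void) { return -EINVAL; }

    static ssize_t fixed_rw(void) { return -EINVAL; }  /* stays negative */

    static ssize_t example_caller(void)
    {
            ssize_t ret = fixed_rw();

            if (ret < 0)            /* meaningful only with a signed return type */
                    return ret;

            return 0;
    }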
Fixes: 28541d41c9e0 ("vfio/pci: Add infrastructure for additional device specific regions") Signed-off-by: Yishai Hadas Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Reviewed-by: Max Gurtovoy Link: https://lore.kernel.org/r/0-v3-5db12d1bf576+c910-vfio_rw_jgg@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_igd.c | 10 +++++----- drivers/vfio/pci/vfio_pci_private.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c index 228df565e9bc..aa0a29fd2762 100644 --- a/drivers/vfio/pci/vfio_pci_igd.c +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -25,8 +25,8 @@ #define OPREGION_RVDS 0x3c2 #define OPREGION_VERSION 0x16 -static size_t vfio_pci_igd_rw(struct vfio_pci_device *vdev, char __user *buf, - size_t count, loff_t *ppos, bool iswrite) +static ssize_t vfio_pci_igd_rw(struct vfio_pci_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) { unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; void *base = vdev->region[i].data; @@ -160,9 +160,9 @@ static int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev) return ret; } -static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, - char __user *buf, size_t count, loff_t *ppos, - bool iswrite) +static ssize_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, + char __user *buf, size_t count, loff_t *ppos, + bool iswrite) { unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; struct pci_dev *pdev = vdev->region[i].data; diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index d32e74019bf7..4025f6a27fcb 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -56,7 +56,7 @@ struct vfio_pci_device; struct vfio_pci_region; struct vfio_pci_regops { - size_t (*rw)(struct vfio_pci_device *vdev, char __user *buf, + ssize_t (*rw)(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); void (*release)(struct vfio_pci_device *vdev, struct vfio_pci_region *region); -- Gitee From 1b387941449b0aedac8cc84cc9feca14efab1539 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Jul 2021 16:35:23 +0200 Subject: [PATCH 1659/2161] vfio/mdev: turn mdev_init into a subsys_initcall MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 15a5896e61acb7cbad5efd9cf807a4d9a5e8315d upstream. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this setups with buіlt-in mdev and mdev-drivers fail to register like this: [1.903149] Driver 'intel_vgpu_mdev' was unable to register with bus_type 'mdev' because the bus was not initialized. 
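A sketch of the initcall ordering the fix relies on, using a generic bus rather than the real mdev core: subsys_initcall() runs before the module_init()/device_initcall() level, so a built-in bus registered there already exists when built-in drivers register against it.

    #include <linux/device.h>
    #include <linux/init.h>

    static struct bus_type example_bus = {
            .name = "example",
    };

    static int __init example_bus_init(void)
    {
            return bus_register(&example_bus);
    }
    subsys_initcall(example_bus_init);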
Signed-off-by: Christoph Hellwig Reviewed-by: Cornelia Huck Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20210726143524.155779-2-hch@lst.de Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/mdev_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index e4581ec093a6..b16606ebafa1 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -398,7 +398,7 @@ static void __exit mdev_exit(void) mdev_bus_unregister(); } -module_init(mdev_init) +subsys_initcall(mdev_init) module_exit(mdev_exit) MODULE_VERSION(DRIVER_VERSION); -- Gitee From 794b73b22b4bd5f7a7b4f4a426207b64c5ccc2a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Jul 2021 16:35:24 +0200 Subject: [PATCH 1660/2161] vfio/mdev: don't warn if ->request is not set commit 3fb1712d85962f81265b5018922a2da13cdf6033 upstream. Only a single driver actually sets the ->request method, so don't print a scary warning if it isn't. Signed-off-by: Christoph Hellwig Reviewed-by: Cornelia Huck Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20210726143524.155779-3-hch@lst.de Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/mdev_core.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index b16606ebafa1..b314101237fe 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -138,10 +138,6 @@ int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops) if (!dev) return -EINVAL; - /* Not mandatory, but its absence could be a problem */ - if (!ops->request) - dev_info(dev, "Driver cannot be asked to release device\n"); - mutex_lock(&parent_list_lock); /* Check for duplicate */ -- Gitee From ea58e35bfb7c2d818c46dd5fc0987f000dbcfcde Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Aug 2021 22:18:57 -0300 Subject: [PATCH 1661/2161] vfio/samples: Remove module get/put commit e1706f0764f8bea42db8d33df5d7f9e754b89693 upstream. The patch to move the get/put to core and the patch to convert the samples to use vfio_device crossed in a way that this was missed. When both patches are together the samples do not need their own get/put. 
Fixes: 437e41368c01 ("vfio/mdpy: Convert to use vfio_register_group_dev()") Fixes: 681c1615f891 ("vfio/mbochs: Convert to use vfio_register_group_dev()") Reviewed-by: Cornelia Huck Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Reviewed-by: Max Gurtovoy Link: https://lore.kernel.org/r/1-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- samples/vfio-mdev/mbochs.c | 4 ---- samples/vfio-mdev/mdpy.c | 4 ---- 2 files changed, 8 deletions(-) diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 10685332c564..62526c659d68 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1289,9 +1289,6 @@ static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd, static int mbochs_open(struct vfio_device *vdev) { - if (!try_module_get(THIS_MODULE)) - return -ENODEV; - return 0; } @@ -1315,7 +1312,6 @@ static void mbochs_close(struct vfio_device *vdev) mbochs_put_pages(mdev_state); mutex_unlock(&mdev_state->ops_lock); - module_put(THIS_MODULE); } static ssize_t diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 393c9df6f6a0..a7d4ed28d664 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -611,15 +611,11 @@ static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd, static int mdpy_open(struct vfio_device *vdev) { - if (!try_module_get(THIS_MODULE)) - return -ENODEV; - return 0; } static void mdpy_close(struct vfio_device *vdev) { - module_put(THIS_MODULE); } static ssize_t -- Gitee From 2a6d98fd2db408aa091e392bd3d03db4ed774711 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Aug 2021 22:18:58 -0300 Subject: [PATCH 1662/2161] vfio/mbochs: Fix missing error unwind of mbochs_used_mbytes commit de5494af4815a4c9328536c72741229b7de88e7f upstream. Convert mbochs to use an atomic scheme for this like mtty was changed into. The atomic fixes various race conditions with probing. Add the missing error unwind. Also add the missing kfree of mdev_state->pages. 
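The reservation loop added here re-reads the remaining budget and retries the compare-and-swap until it either claims the requested megabytes or observes that not enough are left; every failure path and the remove path then add the same amount back. A standalone sketch of that pattern with C11 atomics (userspace names, not the kernel atomic_t API):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int avail_mbytes;        /* shared budget, like mbochs_avail_mbytes */

    /* Claim 'want' MB or fail; mirrors the try_cmpxchg loop in mbochs_probe(). */
    static bool reserve_mbytes(int want)
    {
            int avail = atomic_load(&avail_mbytes);

            do {
                    if (avail < want)
                            return false;      /* the driver returns -ENOSPC here */
            } while (!atomic_compare_exchange_weak(&avail_mbytes, &avail,
                                                   avail - want));
            return true;
    }

    /* Error unwind and remove path: give the reservation back. */
    static void unreserve_mbytes(int want)
    {
            atomic_fetch_add(&avail_mbytes, want);
    }

    int main(void)
    {
            atomic_store(&avail_mbytes, 64);                  /* like max_mbytes */
            printf("reserve 32 -> %d\n", reserve_mbytes(32)); /* 1 */
            printf("reserve 48 -> %d\n", reserve_mbytes(48)); /* 0, only 32 left */
            unreserve_mbytes(32);
            printf("reserve 48 -> %d\n", reserve_mbytes(48)); /* 1 */
            return 0;
    }

The compare-exchange reloads the expected value on failure, so concurrent probes either split the budget correctly or bail out; that is the race the plain integer counter could not close.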
Fixes: 681c1615f891 ("vfio/mbochs: Convert to use vfio_register_group_dev()") Reported-by: Cornelia Huck Co-developed-by: Alex Williamson Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/2-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- samples/vfio-mdev/mbochs.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 62526c659d68..350acacc4180 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -129,7 +129,7 @@ static dev_t mbochs_devt; static struct class *mbochs_class; static struct cdev mbochs_cdev; static struct device mbochs_dev; -static int mbochs_used_mbytes; +static atomic_t mbochs_avail_mbytes; static const struct vfio_device_ops mbochs_dev_ops; struct vfio_region_info_ext { @@ -507,18 +507,22 @@ static int mbochs_reset(struct mdev_state *mdev_state) static int mbochs_probe(struct mdev_device *mdev) { + int avail_mbytes = atomic_read(&mbochs_avail_mbytes); const struct mbochs_type *type = &mbochs_types[mdev_get_type_group_id(mdev)]; struct device *dev = mdev_dev(mdev); struct mdev_state *mdev_state; int ret = -ENOMEM; - if (type->mbytes + mbochs_used_mbytes > max_mbytes) - return -ENOMEM; + do { + if (avail_mbytes < type->mbytes) + return -ENOSPC; + } while (!atomic_try_cmpxchg(&mbochs_avail_mbytes, &avail_mbytes, + avail_mbytes - type->mbytes)); mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); if (mdev_state == NULL) - return -ENOMEM; + goto err_avail; vfio_init_group_dev(&mdev_state->vdev, &mdev->dev, &mbochs_dev_ops); mdev_state->vconfig = kzalloc(MBOCHS_CONFIG_SPACE_SIZE, GFP_KERNEL); @@ -549,17 +553,17 @@ static int mbochs_probe(struct mdev_device *mdev) mbochs_create_config_space(mdev_state); mbochs_reset(mdev_state); - mbochs_used_mbytes += type->mbytes; - ret = vfio_register_group_dev(&mdev_state->vdev); if (ret) goto err_mem; dev_set_drvdata(&mdev->dev, mdev_state); return 0; - err_mem: + kfree(mdev_state->pages); kfree(mdev_state->vconfig); kfree(mdev_state); +err_avail: + atomic_add(type->mbytes, &mbochs_avail_mbytes); return ret; } @@ -567,8 +571,8 @@ static void mbochs_remove(struct mdev_device *mdev) { struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); - mbochs_used_mbytes -= mdev_state->type->mbytes; vfio_unregister_group_dev(&mdev_state->vdev); + atomic_add(mdev_state->type->mbytes, &mbochs_avail_mbytes); kfree(mdev_state->pages); kfree(mdev_state->vconfig); kfree(mdev_state); @@ -1366,7 +1370,7 @@ static ssize_t available_instances_show(struct mdev_type *mtype, { const struct mbochs_type *type = &mbochs_types[mtype_get_type_group_id(mtype)]; - int count = (max_mbytes - mbochs_used_mbytes) / type->mbytes; + int count = atomic_read(&mbochs_avail_mbytes) / type->mbytes; return sprintf(buf, "%d\n", count); } @@ -1448,6 +1452,8 @@ static int __init mbochs_dev_init(void) { int ret = 0; + atomic_set(&mbochs_avail_mbytes, max_mbytes); + ret = alloc_chrdev_region(&mbochs_devt, 0, MINORMASK + 1, MBOCHS_NAME); if (ret < 0) { pr_err("Error: failed to register mbochs_dev, err: %d\n", ret); -- Gitee From ddc588d06e97e345b6b225b1443725442059ff9e Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 5 Aug 2021 22:18:59 -0300 Subject: [PATCH 1663/2161] vfio: Introduce a vfio_uninit_group_dev() API call commit ae03c3771b8cbbed3802ad1153d896c32015c520 upstream. 
This pairs with vfio_init_group_dev() and allows undoing any state that is stored in the vfio_device unrelated to registration. Add appropriately placed calls to all the drivers. The following patch will use this to add pre-registration state for the device set. Signed-off-by: Max Gurtovoy Reviewed-by: Cornelia Huck Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/driver-api/vfio.rst | 4 ++- drivers/vfio/mdev/vfio_mdev.c | 13 +++++++--- drivers/vfio/pci/vfio_pci.c | 6 +++-- drivers/vfio/platform/vfio_platform_common.c | 7 +++-- drivers/vfio/vfio.c | 5 ++++ include/linux/vfio.h | 1 + samples/vfio-mdev/mbochs.c | 2 ++ samples/vfio-mdev/mdpy.c | 25 ++++++++++-------- samples/vfio-mdev/mtty.c | 27 ++++++++++++-------- 9 files changed, 60 insertions(+), 30 deletions(-) diff --git a/Documentation/driver-api/vfio.rst b/Documentation/driver-api/vfio.rst index decc68cb8114..d8265ec8cca6 100644 --- a/Documentation/driver-api/vfio.rst +++ b/Documentation/driver-api/vfio.rst @@ -255,11 +255,13 @@ vfio_unregister_group_dev() respectively:: void vfio_init_group_dev(struct vfio_device *device, struct device *dev, const struct vfio_device_ops *ops); + void vfio_uninit_group_dev(struct vfio_device *device); int vfio_register_group_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); The driver should embed the vfio_device in its own structure and call -vfio_init_group_dev() to pre-configure it before going to registration. +vfio_init_group_dev() to pre-configure it before going to registration +and call vfio_uninit_group_dev() after completing the un-registration. vfio_register_group_dev() indicates to the core to begin tracking the iommu_group of the specified dev and register the dev as owned by a VFIO bus driver. 
Once vfio_register_group_dev() returns it is possible for userspace to diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index 39ef7489fe47..a5c77ccb24f7 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -120,12 +120,16 @@ static int vfio_mdev_probe(struct mdev_device *mdev) vfio_init_group_dev(vdev, &mdev->dev, &vfio_mdev_dev_ops); ret = vfio_register_group_dev(vdev); - if (ret) { - kfree(vdev); - return ret; - } + if (ret) + goto out_uninit; + dev_set_drvdata(&mdev->dev, vdev); return 0; + +out_uninit: + vfio_uninit_group_dev(vdev); + kfree(vdev); + return ret; } static void vfio_mdev_remove(struct mdev_device *mdev) @@ -133,6 +137,7 @@ static void vfio_mdev_remove(struct mdev_device *mdev) struct vfio_device *vdev = dev_get_drvdata(&mdev->dev); vfio_unregister_group_dev(vdev); + vfio_uninit_group_dev(vdev); kfree(vdev); } diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 56f68096b344..b18508543045 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -2029,7 +2029,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) ret = vfio_pci_reflck_attach(vdev); if (ret) - goto out_free; + goto out_uninit; ret = vfio_pci_vf_init(vdev); if (ret) goto out_reflck; @@ -2066,7 +2066,8 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) vfio_pci_vf_uninit(vdev); out_reflck: vfio_pci_reflck_put(vdev->reflck); -out_free: +out_uninit: + vfio_uninit_group_dev(&vdev->vdev); kfree(vdev->pm_save); kfree(vdev); out_group_put: @@ -2084,6 +2085,7 @@ static void vfio_pci_remove(struct pci_dev *pdev) vfio_pci_vf_uninit(vdev); vfio_pci_reflck_put(vdev->reflck); + vfio_uninit_group_dev(&vdev->vdev); vfio_pci_vga_uninit(vdev); vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 703164df7637..bdde8605178c 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -667,7 +667,7 @@ int vfio_platform_probe_common(struct vfio_platform_device *vdev, ret = vfio_platform_of_probe(vdev, dev); if (ret) - return ret; + goto out_uninit; vdev->device = dev; @@ -675,7 +675,7 @@ int vfio_platform_probe_common(struct vfio_platform_device *vdev, if (ret && vdev->reset_required) { dev_err(dev, "No reset function found for device %s\n", vdev->name); - return ret; + goto out_uninit; } group = vfio_iommu_group_get(dev); @@ -698,6 +698,8 @@ int vfio_platform_probe_common(struct vfio_platform_device *vdev, vfio_iommu_group_put(group, dev); put_reset: vfio_platform_put_reset(vdev); +out_uninit: + vfio_uninit_group_dev(&vdev->vdev); return ret; } EXPORT_SYMBOL_GPL(vfio_platform_probe_common); @@ -708,6 +710,7 @@ void vfio_platform_remove_common(struct vfio_platform_device *vdev) pm_runtime_disable(vdev->device); vfio_platform_put_reset(vdev); + vfio_uninit_group_dev(&vdev->vdev); vfio_iommu_group_put(vdev->vdev.dev->iommu_group, vdev->vdev.dev); } EXPORT_SYMBOL_GPL(vfio_platform_remove_common); diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 02cc51ce6891..cc375df0fd5d 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -749,6 +749,11 @@ void vfio_init_group_dev(struct vfio_device *device, struct device *dev, } EXPORT_SYMBOL_GPL(vfio_init_group_dev); +void vfio_uninit_group_dev(struct vfio_device *device) +{ +} +EXPORT_SYMBOL_GPL(vfio_uninit_group_dev); + int 
vfio_register_group_dev(struct vfio_device *device) { struct vfio_device *existing_device; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index a2c5b30e1763..b0875cf8e496 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -61,6 +61,7 @@ extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev); void vfio_init_group_dev(struct vfio_device *device, struct device *dev, const struct vfio_device_ops *ops); +void vfio_uninit_group_dev(struct vfio_device *device); int vfio_register_group_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 350acacc4180..5ba33cc39948 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -559,6 +559,7 @@ static int mbochs_probe(struct mdev_device *mdev) dev_set_drvdata(&mdev->dev, mdev_state); return 0; err_mem: + vfio_uninit_group_dev(&mdev_state->vdev); kfree(mdev_state->pages); kfree(mdev_state->vconfig); kfree(mdev_state); @@ -572,6 +573,7 @@ static void mbochs_remove(struct mdev_device *mdev) struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); vfio_unregister_group_dev(&mdev_state->vdev); + vfio_uninit_group_dev(&mdev_state->vdev); atomic_add(mdev_state->type->mbytes, &mbochs_avail_mbytes); kfree(mdev_state->pages); kfree(mdev_state->vconfig); diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index a7d4ed28d664..57334034cde6 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -235,17 +235,16 @@ static int mdpy_probe(struct mdev_device *mdev) mdev_state->vconfig = kzalloc(MDPY_CONFIG_SPACE_SIZE, GFP_KERNEL); if (mdev_state->vconfig == NULL) { - kfree(mdev_state); - return -ENOMEM; + ret = -ENOMEM; + goto err_state; } fbsize = roundup_pow_of_two(type->width * type->height * type->bytepp); mdev_state->memblk = vmalloc_user(fbsize); if (!mdev_state->memblk) { - kfree(mdev_state->vconfig); - kfree(mdev_state); - return -ENOMEM; + ret = -ENOMEM; + goto err_vconfig; } dev_info(dev, "%s: %s (%dx%d)\n", __func__, type->name, type->width, type->height); @@ -260,13 +259,18 @@ static int mdpy_probe(struct mdev_device *mdev) mdpy_count++; ret = vfio_register_group_dev(&mdev_state->vdev); - if (ret) { - kfree(mdev_state->vconfig); - kfree(mdev_state); - return ret; - } + if (ret) + goto err_mem; dev_set_drvdata(&mdev->dev, mdev_state); return 0; +err_mem: + vfree(mdev_state->memblk); +err_vconfig: + kfree(mdev_state->vconfig); +err_state: + vfio_uninit_group_dev(&mdev_state->vdev); + kfree(mdev_state); + return ret; } static void mdpy_remove(struct mdev_device *mdev) @@ -278,6 +282,7 @@ static void mdpy_remove(struct mdev_device *mdev) vfio_unregister_group_dev(&mdev_state->vdev); vfree(mdev_state->memblk); kfree(mdev_state->vconfig); + vfio_uninit_group_dev(&mdev_state->vdev); kfree(mdev_state); mdpy_count--; diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 8b26fecc4afe..37cc9067e160 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -718,8 +718,8 @@ static int mtty_probe(struct mdev_device *mdev) mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); if (mdev_state == NULL) { - atomic_add(nr_ports, &mdev_avail_ports); - return -ENOMEM; + ret = -ENOMEM; + goto err_nr_ports; } vfio_init_group_dev(&mdev_state->vdev, &mdev->dev, &mtty_dev_ops); @@ -732,9 +732,8 @@ static int mtty_probe(struct mdev_device *mdev) mdev_state->vconfig = 
kzalloc(MTTY_CONFIG_SPACE_SIZE, GFP_KERNEL); if (mdev_state->vconfig == NULL) { - kfree(mdev_state); - atomic_add(nr_ports, &mdev_avail_ports); - return -ENOMEM; + ret = -ENOMEM; + goto err_state; } mutex_init(&mdev_state->ops_lock); @@ -743,14 +742,19 @@ static int mtty_probe(struct mdev_device *mdev) mtty_create_config_space(mdev_state); ret = vfio_register_group_dev(&mdev_state->vdev); - if (ret) { - kfree(mdev_state); - atomic_add(nr_ports, &mdev_avail_ports); - return ret; - } - + if (ret) + goto err_vconfig; dev_set_drvdata(&mdev->dev, mdev_state); return 0; + +err_vconfig: + kfree(mdev_state->vconfig); +err_state: + vfio_uninit_group_dev(&mdev_state->vdev); + kfree(mdev_state); +err_nr_ports: + atomic_add(nr_ports, &mdev_avail_ports); + return ret; } static void mtty_remove(struct mdev_device *mdev) @@ -761,6 +765,7 @@ static void mtty_remove(struct mdev_device *mdev) vfio_unregister_group_dev(&mdev_state->vdev); kfree(mdev_state->vconfig); + vfio_uninit_group_dev(&mdev_state->vdev); kfree(mdev_state); atomic_add(nr_ports, &mdev_avail_ports); } -- Gitee From 38c6c746b459348944a9f87fce2e80b0e7e19d76 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Aug 2021 22:19:00 -0300 Subject: [PATCH 1664/2161] vfio: Provide better generic support for open/release vfio_device_ops commit 2fd585f4ed9de9b9259e95affdd7d8cde06b48c3 upstream. Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. 
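A compressed, standalone model of the two mechanisms introduced here may help: devices registered with the same set_id pointer share one set object, and the set's lock plus an open count turn per-FD open/release into first-open/last-close callbacks. The sketch below uses invented names, a linked list in place of the xarray, and no locking around the registry itself, so it only shows the shape of the design, not the real implementation:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Toy stand-in for the registry of device-set singletons. */
    struct toy_dev_set {
            void *set_id;                /* key: a pointer that outlives the devices */
            pthread_mutex_t lock;        /* held around first-open/last-close work */
            int device_count;
            struct toy_dev_set *next;
    };

    struct toy_device {
            struct toy_dev_set *set;
            int open_count;
    };

    static struct toy_dev_set *all_sets;

    /* One singleton set per distinct set_id, shared by all users of that key. */
    static struct toy_dev_set *assign_device_set(struct toy_device *dev, void *set_id)
    {
            struct toy_dev_set *s;

            for (s = all_sets; s; s = s->next)
                    if (s->set_id == set_id)
                            goto found;

            s = calloc(1, sizeof(*s));
            s->set_id = set_id;
            pthread_mutex_init(&s->lock, NULL);
            s->next = all_sets;
            all_sets = s;
    found:
            s->device_count++;
            dev->set = s;
            return s;
    }

    /* FD-open path: the callback runs only for the first opener. */
    static void device_fd_open(struct toy_device *dev)
    {
            pthread_mutex_lock(&dev->set->lock);
            if (++dev->open_count == 1)
                    printf("open_device() for %p\n", (void *)dev);
            pthread_mutex_unlock(&dev->set->lock);
    }

    /* FD-release path: the callback runs only for the last closer. */
    static void device_fd_release(struct toy_device *dev)
    {
            pthread_mutex_lock(&dev->set->lock);
            if (--dev->open_count == 0)
                    printf("close_device() for %p\n", (void *)dev);
            pthread_mutex_unlock(&dev->set->lock);
    }

    int main(void)
    {
            struct toy_device a = { 0 }, b = { 0 };
            int slot;                          /* pretend &slot is a pci_slot */

            assign_device_set(&a, &slot);      /* a and b share one set */
            assign_device_set(&b, &slot);
            printf("shared set: %s\n", a.set == b.set ? "yes" : "no");

            device_fd_open(&a);                /* first FD: open_device() runs */
            device_fd_open(&a);                /* second FD: no callback */
            device_fd_release(&a);
            device_fd_release(&a);             /* last close: close_device() runs */
            return 0;
    }

Keying the set by a plain pointer works because the slot, bus or device structure chosen as the key is guaranteed to outlive every vfio_device registered into the set.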
Signed-off-by: Yishai Hadas Reviewed-by: Cornelia Huck Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/vfio_mdev.c | 26 +++++- drivers/vfio/vfio.c | 149 +++++++++++++++++++++++++++++----- include/linux/mdev.h | 2 + include/linux/vfio.h | 21 +++++ 4 files changed, 174 insertions(+), 24 deletions(-) diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index a5c77ccb24f7..e12196ffd487 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -17,13 +17,33 @@ #include "mdev_private.h" +static int vfio_mdev_open_device(struct vfio_device *core_vdev) +{ + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); + struct mdev_parent *parent = mdev->type->parent; + + if (unlikely(!parent->ops->open_device)) + return 0; + + return parent->ops->open_device(mdev); +} + +static void vfio_mdev_close_device(struct vfio_device *core_vdev) +{ + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); + struct mdev_parent *parent = mdev->type->parent; + + if (likely(parent->ops->close_device)) + parent->ops->close_device(mdev); +} + static int vfio_mdev_open(struct vfio_device *core_vdev) { struct mdev_device *mdev = to_mdev_device(core_vdev->dev); struct mdev_parent *parent = mdev->type->parent; if (unlikely(!parent->ops->open)) - return -EINVAL; + return 0; return parent->ops->open(mdev); } @@ -44,7 +64,7 @@ static long vfio_mdev_unlocked_ioctl(struct vfio_device *core_vdev, struct mdev_parent *parent = mdev->type->parent; if (unlikely(!parent->ops->ioctl)) - return -EINVAL; + return 0; return parent->ops->ioctl(mdev, cmd, arg); } @@ -100,6 +120,8 @@ static void vfio_mdev_request(struct vfio_device *core_vdev, unsigned int count) static const struct vfio_device_ops vfio_mdev_dev_ops = { .name = "vfio-mdev", + .open_device = vfio_mdev_open_device, + .close_device = vfio_mdev_close_device, .open = vfio_mdev_open, .release = vfio_mdev_release, .ioctl = vfio_mdev_unlocked_ioctl, diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index cc375df0fd5d..9cc17768c425 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -96,6 +96,79 @@ module_param_named(enable_unsafe_noiommu_mode, MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. 
(default: false)"); #endif +static DEFINE_XARRAY(vfio_device_set_xa); + +int vfio_assign_device_set(struct vfio_device *device, void *set_id) +{ + unsigned long idx = (unsigned long)set_id; + struct vfio_device_set *new_dev_set; + struct vfio_device_set *dev_set; + + if (WARN_ON(!set_id)) + return -EINVAL; + + /* + * Atomically acquire a singleton object in the xarray for this set_id + */ + xa_lock(&vfio_device_set_xa); + dev_set = xa_load(&vfio_device_set_xa, idx); + if (dev_set) + goto found_get_ref; + xa_unlock(&vfio_device_set_xa); + + new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL); + if (!new_dev_set) + return -ENOMEM; + mutex_init(&new_dev_set->lock); + INIT_LIST_HEAD(&new_dev_set->device_list); + new_dev_set->set_id = set_id; + + xa_lock(&vfio_device_set_xa); + dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set, + GFP_KERNEL); + if (!dev_set) { + dev_set = new_dev_set; + goto found_get_ref; + } + + kfree(new_dev_set); + if (xa_is_err(dev_set)) { + xa_unlock(&vfio_device_set_xa); + return xa_err(dev_set); + } + +found_get_ref: + dev_set->device_count++; + xa_unlock(&vfio_device_set_xa); + mutex_lock(&dev_set->lock); + device->dev_set = dev_set; + list_add_tail(&device->dev_set_list, &dev_set->device_list); + mutex_unlock(&dev_set->lock); + return 0; +} +EXPORT_SYMBOL_GPL(vfio_assign_device_set); + +static void vfio_release_device_set(struct vfio_device *device) +{ + struct vfio_device_set *dev_set = device->dev_set; + + if (!dev_set) + return; + + mutex_lock(&dev_set->lock); + list_del(&device->dev_set_list); + mutex_unlock(&dev_set->lock); + + xa_lock(&vfio_device_set_xa); + if (!--dev_set->device_count) { + __xa_erase(&vfio_device_set_xa, + (unsigned long)dev_set->set_id); + mutex_destroy(&dev_set->lock); + kfree(dev_set); + } + xa_unlock(&vfio_device_set_xa); +} + /* * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe * and remove functions, any use cases other than acquiring the first @@ -751,6 +824,7 @@ EXPORT_SYMBOL_GPL(vfio_init_group_dev); void vfio_uninit_group_dev(struct vfio_device *device) { + vfio_release_device_set(device); } EXPORT_SYMBOL_GPL(vfio_uninit_group_dev); @@ -760,6 +834,13 @@ int vfio_register_group_dev(struct vfio_device *device) struct iommu_group *iommu_group; struct vfio_group *group; + /* + * If the driver doesn't specify a set then the device is added to a + * singleton set just for itself. 
+ */ + if (!device->dev_set) + vfio_assign_device_set(device, device); + iommu_group = iommu_group_get(device->dev); if (!iommu_group) return -EINVAL; @@ -1361,7 +1442,8 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) { struct vfio_device *device; struct file *filep; - int ret; + int fdno; + int ret = 0; if (0 == atomic_read(&group->container_users) || !group->container->iommu_driver || !vfio_group_viable(group)) @@ -1375,38 +1457,38 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) return PTR_ERR(device); if (!try_module_get(device->dev->driver->owner)) { - vfio_device_put(device); - return -ENODEV; + ret = -ENODEV; + goto err_device_put; } - ret = device->ops->open(device); - if (ret) { - module_put(device->dev->driver->owner); - vfio_device_put(device); - return ret; + mutex_lock(&device->dev_set->lock); + device->open_count++; + if (device->open_count == 1 && device->ops->open_device) { + ret = device->ops->open_device(device); + if (ret) + goto err_undo_count; + } + mutex_unlock(&device->dev_set->lock); + + if (device->ops->open) { + ret = device->ops->open(device); + if (ret) + goto err_close_device; } /* * We can't use anon_inode_getfd() because we need to modify * the f_mode flags directly to allow more than just ioctls */ - ret = get_unused_fd_flags(O_CLOEXEC); - if (ret < 0) { - device->ops->release(device); - module_put(device->dev->driver->owner); - vfio_device_put(device); - return ret; - } + fdno = ret = get_unused_fd_flags(O_CLOEXEC); + if (ret < 0) + goto err_release; filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, device, O_RDWR); if (IS_ERR(filep)) { - put_unused_fd(ret); ret = PTR_ERR(filep); - device->ops->release(device); - module_put(device->dev->driver->owner); - vfio_device_put(device); - return ret; + goto err_fd; } /* @@ -1418,12 +1500,28 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) atomic_inc(&group->container_users); - fd_install(ret, filep); + fd_install(fdno, filep); if (group->noiommu) dev_warn(device->dev, "vfio-noiommu device opened by user " "(%s:%d)\n", current->comm, task_pid_nr(current)); + return fdno; +err_fd: + put_unused_fd(fdno); +err_release: + if (device->ops->release) + device->ops->release(device); +err_close_device: + mutex_lock(&device->dev_set->lock); + if (device->open_count == 1 && device->ops->close_device) + device->ops->close_device(device); +err_undo_count: + device->open_count--; + mutex_unlock(&device->dev_set->lock); + module_put(device->dev->driver->owner); +err_device_put: + vfio_device_put(device); return ret; } @@ -1561,7 +1659,13 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) { struct vfio_device *device = filep->private_data; - device->ops->release(device); + if (device->ops->release) + device->ops->release(device); + + mutex_lock(&device->dev_set->lock); + if (!--device->open_count && device->ops->close_device) + device->ops->close_device(device); + mutex_unlock(&device->dev_set->lock); module_put(device->dev->driver->owner); @@ -2364,6 +2468,7 @@ static void __exit vfio_cleanup(void) class_destroy(vfio.class); vfio.class = NULL; misc_deregister(&vfio_dev); + xa_destroy(&vfio_device_set_xa); } module_init(vfio_init); diff --git a/include/linux/mdev.h b/include/linux/mdev.h index ae9b409bf634..322b38f16b3d 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -111,6 +111,8 @@ struct mdev_parent_ops { int (*create)(struct mdev_device *mdev); int (*remove)(struct mdev_device *mdev); + int 
(*open_device)(struct mdev_device *mdev); + void (*close_device)(struct mdev_device *mdev); int (*open)(struct mdev_device *mdev); void (*release)(struct mdev_device *mdev); ssize_t (*read)(struct mdev_device *mdev, char __user *buf, diff --git a/include/linux/vfio.h b/include/linux/vfio.h index b0875cf8e496..f0e6a72875e4 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -15,13 +15,28 @@ #include #include +/* + * VFIO devices can be placed in a set, this allows all devices to share this + * structure and the VFIO core will provide a lock that is held around + * open_device()/close_device() for all devices in the set. + */ +struct vfio_device_set { + void *set_id; + struct mutex lock; + struct list_head device_list; + unsigned int device_count; +}; + struct vfio_device { struct device *dev; const struct vfio_device_ops *ops; struct vfio_group *group; + struct vfio_device_set *dev_set; + struct list_head dev_set_list; /* Members below here are private, not for driver use */ refcount_t refcount; + unsigned int open_count; struct completion comp; struct list_head group_next; }; @@ -29,6 +44,8 @@ struct vfio_device { /** * struct vfio_device_ops - VFIO bus driver device callbacks * + * @open_device: Called when the first file descriptor is opened for this device + * @close_device: Opposite of open_device * @open: Called when userspace creates new file descriptor for device * @release: Called when userspace releases file descriptor for device * @read: Perform read(2) on device file descriptor @@ -43,6 +60,8 @@ struct vfio_device { */ struct vfio_device_ops { char *name; + int (*open_device)(struct vfio_device *vdev); + void (*close_device)(struct vfio_device *vdev); int (*open)(struct vfio_device *vdev); void (*release)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, @@ -67,6 +86,8 @@ void vfio_unregister_group_dev(struct vfio_device *device); extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); extern void vfio_device_put(struct vfio_device *device); +int vfio_assign_device_set(struct vfio_device *device, void *set_id); + /* events for the backend driver notify callback */ enum vfio_iommu_notify_type { VFIO_IOMMU_CONTAINER_CLOSE = 0, -- Gitee From 3573b001db208e50edc9948cbac6879c0ef54a09 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Aug 2021 22:19:01 -0300 Subject: [PATCH 1665/2161] vfio/samples: Delete useless open/close commit 17a1e4fa3f7f07f541c751745b5aa6f2fcab9a48 upstream. The core code no longer requires these ops to be defined, so delete these empty functions and leave the op as NULL. mtty's functions only log a pointless message, delete that entirely. 
Signed-off-by: Yishai Hadas Reviewed-by: Cornelia Huck Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- samples/vfio-mdev/mbochs.c | 6 ------ samples/vfio-mdev/mdpy.c | 11 ----------- samples/vfio-mdev/mtty.c | 13 ------------- 3 files changed, 30 deletions(-) diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 5ba33cc39948..9e2b30c4d30a 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1293,11 +1293,6 @@ static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd, return -ENOTTY; } -static int mbochs_open(struct vfio_device *vdev) -{ - return 0; -} - static void mbochs_close(struct vfio_device *vdev) { struct mdev_state *mdev_state = @@ -1416,7 +1411,6 @@ static struct attribute_group *mdev_type_groups[] = { }; static const struct vfio_device_ops mbochs_dev_ops = { - .open = mbochs_open, .release = mbochs_close, .read = mbochs_read, .write = mbochs_write, diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 57334034cde6..8d1a80a0722a 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -614,15 +614,6 @@ static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd, return -ENOTTY; } -static int mdpy_open(struct vfio_device *vdev) -{ - return 0; -} - -static void mdpy_close(struct vfio_device *vdev) -{ -} - static ssize_t resolution_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -717,8 +708,6 @@ static struct attribute_group *mdev_type_groups[] = { }; static const struct vfio_device_ops mdpy_dev_ops = { - .open = mdpy_open, - .release = mdpy_close, .read = mdpy_read, .write = mdpy_write, .ioctl = mdpy_ioctl, diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 37cc9067e160..5983cdb16e3d 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1207,17 +1207,6 @@ static long mtty_ioctl(struct vfio_device *vdev, unsigned int cmd, return -ENOTTY; } -static int mtty_open(struct vfio_device *vdev) -{ - pr_info("%s\n", __func__); - return 0; -} - -static void mtty_close(struct vfio_device *mdev) -{ - pr_info("%s\n", __func__); -} - static ssize_t sample_mtty_dev_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1325,8 +1314,6 @@ static struct attribute_group *mdev_type_groups[] = { static const struct vfio_device_ops mtty_dev_ops = { .name = "vfio-mtty", - .open = mtty_open, - .release = mtty_close, .read = mtty_read, .write = mtty_write, .ioctl = mtty_ioctl, -- Gitee From 89abeb2ae0a02c58a319312b4455f2493f6f29f3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Aug 2021 22:19:03 -0300 Subject: [PATCH 1666/2161] vfio/platform: Use open_device() instead of open coding a refcnt scheme commit ab7e5e34a9f661f5649f48d4531c4a75713ff7cf upstream. Platform simply wants to run some code when the device is first opened/last closed. Use the core framework and locking for this. Aside from removing a bit of code this narrows the locking scope from a global lock. 
Reviewed-by: Cornelia Huck Reviewed-by: Eric Auger Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/7-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/platform/vfio_platform_common.c | 79 ++++++++----------- drivers/vfio/platform/vfio_platform_private.h | 1 - 2 files changed, 32 insertions(+), 48 deletions(-) diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index bdde8605178c..6af7ce7d619c 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -218,65 +218,52 @@ static int vfio_platform_call_reset(struct vfio_platform_device *vdev, return -EINVAL; } -static void vfio_platform_release(struct vfio_device *core_vdev) +static void vfio_platform_close_device(struct vfio_device *core_vdev) { struct vfio_platform_device *vdev = container_of(core_vdev, struct vfio_platform_device, vdev); + const char *extra_dbg = NULL; + int ret; - mutex_lock(&driver_lock); - - if (!(--vdev->refcnt)) { - const char *extra_dbg = NULL; - int ret; - - ret = vfio_platform_call_reset(vdev, &extra_dbg); - if (ret && vdev->reset_required) { - dev_warn(vdev->device, "reset driver is required and reset call failed in release (%d) %s\n", - ret, extra_dbg ? extra_dbg : ""); - WARN_ON(1); - } - pm_runtime_put(vdev->device); - vfio_platform_regions_cleanup(vdev); - vfio_platform_irq_cleanup(vdev); + ret = vfio_platform_call_reset(vdev, &extra_dbg); + if (WARN_ON(ret && vdev->reset_required)) { + dev_warn( + vdev->device, + "reset driver is required and reset call failed in release (%d) %s\n", + ret, extra_dbg ? extra_dbg : ""); } - - mutex_unlock(&driver_lock); + pm_runtime_put(vdev->device); + vfio_platform_regions_cleanup(vdev); + vfio_platform_irq_cleanup(vdev); } -static int vfio_platform_open(struct vfio_device *core_vdev) +static int vfio_platform_open_device(struct vfio_device *core_vdev) { struct vfio_platform_device *vdev = container_of(core_vdev, struct vfio_platform_device, vdev); + const char *extra_dbg = NULL; int ret; - mutex_lock(&driver_lock); - - if (!vdev->refcnt) { - const char *extra_dbg = NULL; - - ret = vfio_platform_regions_init(vdev); - if (ret) - goto err_reg; + ret = vfio_platform_regions_init(vdev); + if (ret) + return ret; - ret = vfio_platform_irq_init(vdev); - if (ret) - goto err_irq; + ret = vfio_platform_irq_init(vdev); + if (ret) + goto err_irq; - ret = pm_runtime_get_sync(vdev->device); - if (ret < 0) - goto err_rst; + ret = pm_runtime_get_sync(vdev->device); + if (ret < 0) + goto err_rst; - ret = vfio_platform_call_reset(vdev, &extra_dbg); - if (ret && vdev->reset_required) { - dev_warn(vdev->device, "reset driver is required and reset call failed in open (%d) %s\n", - ret, extra_dbg ? extra_dbg : ""); - goto err_rst; - } + ret = vfio_platform_call_reset(vdev, &extra_dbg); + if (ret && vdev->reset_required) { + dev_warn( + vdev->device, + "reset driver is required and reset call failed in open (%d) %s\n", + ret, extra_dbg ? 
extra_dbg : ""); + goto err_rst; } - - vdev->refcnt++; - - mutex_unlock(&driver_lock); return 0; err_rst: @@ -284,8 +271,6 @@ static int vfio_platform_open(struct vfio_device *core_vdev) vfio_platform_irq_cleanup(vdev); err_irq: vfio_platform_regions_cleanup(vdev); -err_reg: - mutex_unlock(&driver_lock); return ret; } @@ -616,8 +601,8 @@ static int vfio_platform_mmap(struct vfio_device *core_vdev, struct vm_area_stru static const struct vfio_device_ops vfio_platform_ops = { .name = "vfio-platform", - .open = vfio_platform_open, - .release = vfio_platform_release, + .open_device = vfio_platform_open_device, + .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, .read = vfio_platform_read, .write = vfio_platform_write, diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index dfb834c13659..520d2a8e8375 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -48,7 +48,6 @@ struct vfio_platform_device { u32 num_regions; struct vfio_platform_irq *irqs; u32 num_irqs; - int refcnt; struct mutex igate; const char *compat; const char *acpihid; -- Gitee From 0e619ab2a953a4a2dc6afc85a557e931a8ae18e1 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 5 Aug 2021 22:19:04 -0300 Subject: [PATCH 1667/2161] vfio/pci: Move to the device set infrastructure commit 2cd8b14aaa667f5c6b7918e8d769872fa9acb0ac upstream. PCI wants to have the usual open/close_device() logic with the slight twist that the open/close_device() must be done under a singelton lock shared by all of the vfio_devices that are in the PCI "reset group". The reset group, and thus the device set, is determined by what devices pci_reset_bus() touches, which is either the entire bus or only the slot. Rely on the core code to do everything reflck was doing and delete reflck entirely. 
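The set key for a PCI device is therefore chosen from whatever pci_reset_bus() would actually touch. A toy, standalone restatement of that choice (the struct and all names are invented for illustration; the authoritative logic is the probe hunk below):

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy stand-in for the topology facts the probe path consults. */
    struct toy_pci_dev {
            bool on_root_bus;          /* pci_is_root_bus(pdev->bus) */
            bool slot_resettable;      /* !pci_probe_reset_slot(pdev->slot) */
            void *slot;                /* pdev->slot */
            void *bus;                 /* pdev->bus */
            void *self;                /* the device's own vfio_device */
    };

    static void *reset_group_key(const struct toy_pci_dev *d)
    {
            if (d->on_root_bus)
                    return d->self;    /* no bus/slot reset possible: a private set */
            if (d->slot_resettable)
                    return d->slot;    /* slot reset reaches only this slot */
            return d->bus;             /* otherwise the whole bus resets together */
    }

    int main(void)
    {
            int slot, bus;
            struct toy_pci_dev a = { false, true,  &slot, &bus, &a };
            struct toy_pci_dev b = { false, true,  &slot, &bus, &b };
            struct toy_pci_dev c = { false, false, NULL,  &bus, &c };

            /* a and b share a slot, so they share a set; c groups by bus. */
            printf("a,b same set: %s\n",
                   reset_group_key(&a) == reset_group_key(&b) ? "yes" : "no");
            printf("a,c same set: %s\n",
                   reset_group_key(&a) == reset_group_key(&c) ? "yes" : "no");
            return 0;
    }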
Signed-off-by: Yishai Hadas Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/8-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 162 +++++++--------------------- drivers/vfio/pci/vfio_pci_private.h | 7 -- 2 files changed, 37 insertions(+), 132 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index b18508543045..5f37d18ecd51 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -548,53 +548,40 @@ static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val) vfio_device_put(&pf_vdev->vdev); } -static void vfio_pci_release(struct vfio_device *core_vdev) +static void vfio_pci_close_device(struct vfio_device *core_vdev) { struct vfio_pci_device *vdev = container_of(core_vdev, struct vfio_pci_device, vdev); - mutex_lock(&vdev->reflck->lock); - - if (!(--vdev->refcnt)) { - vfio_pci_vf_token_user_add(vdev, -1); - vfio_spapr_pci_eeh_release(vdev->pdev); - vfio_pci_disable(vdev); + vfio_pci_vf_token_user_add(vdev, -1); + vfio_spapr_pci_eeh_release(vdev->pdev); + vfio_pci_disable(vdev); - mutex_lock(&vdev->igate); - if (vdev->err_trigger) { - eventfd_ctx_put(vdev->err_trigger); - vdev->err_trigger = NULL; - } - if (vdev->req_trigger) { - eventfd_ctx_put(vdev->req_trigger); - vdev->req_trigger = NULL; - } - mutex_unlock(&vdev->igate); + mutex_lock(&vdev->igate); + if (vdev->err_trigger) { + eventfd_ctx_put(vdev->err_trigger); + vdev->err_trigger = NULL; } - - mutex_unlock(&vdev->reflck->lock); + if (vdev->req_trigger) { + eventfd_ctx_put(vdev->req_trigger); + vdev->req_trigger = NULL; + } + mutex_unlock(&vdev->igate); } -static int vfio_pci_open(struct vfio_device *core_vdev) +static int vfio_pci_open_device(struct vfio_device *core_vdev) { struct vfio_pci_device *vdev = container_of(core_vdev, struct vfio_pci_device, vdev); int ret = 0; - mutex_lock(&vdev->reflck->lock); - - if (!vdev->refcnt) { - ret = vfio_pci_enable(vdev); - if (ret) - goto error; + ret = vfio_pci_enable(vdev); + if (ret) + return ret; - vfio_spapr_pci_eeh_open(vdev->pdev); - vfio_pci_vf_token_user_add(vdev, 1); - } - vdev->refcnt++; -error: - mutex_unlock(&vdev->reflck->lock); - return ret; + vfio_spapr_pci_eeh_open(vdev->pdev); + vfio_pci_vf_token_user_add(vdev, 1); + return 0; } static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) @@ -1877,8 +1864,8 @@ static int vfio_pci_match(struct vfio_device *core_vdev, char *buf) static const struct vfio_device_ops vfio_pci_ops = { .name = "vfio-pci", - .open = vfio_pci_open, - .release = vfio_pci_release, + .open_device = vfio_pci_open_device, + .close_device = vfio_pci_close_device, .ioctl = vfio_pci_ioctl, .read = vfio_pci_read, .write = vfio_pci_write, @@ -1887,9 +1874,6 @@ static const struct vfio_device_ops vfio_pci_ops = { .match = vfio_pci_match, }; -static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev); -static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck); - static int vfio_pci_bus_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -2027,12 +2011,23 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&vdev->vma_list); init_rwsem(&vdev->memory_lock); - ret = vfio_pci_reflck_attach(vdev); + if (pci_is_root_bus(pdev->bus)) { + ret = vfio_assign_device_set(&vdev->vdev, vdev); + } else if 
(!pci_probe_reset_slot(pdev->slot)) { + ret = vfio_assign_device_set(&vdev->vdev, pdev->slot); + } else { + /* + * If there is no slot reset support for this device, the whole + * bus needs to be grouped together to support bus-wide resets. + */ + ret = vfio_assign_device_set(&vdev->vdev, pdev->bus); + } + if (ret) goto out_uninit; ret = vfio_pci_vf_init(vdev); if (ret) - goto out_reflck; + goto out_uninit; ret = vfio_pci_vga_init(vdev); if (ret) goto out_vf; @@ -2064,8 +2059,6 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) vfio_pci_set_power_state(vdev, PCI_D0); out_vf: vfio_pci_vf_uninit(vdev); -out_reflck: - vfio_pci_reflck_put(vdev->reflck); out_uninit: vfio_uninit_group_dev(&vdev->vdev); kfree(vdev->pm_save); @@ -2084,7 +2077,6 @@ static void vfio_pci_remove(struct pci_dev *pdev) vfio_unregister_group_dev(&vdev->vdev); vfio_pci_vf_uninit(vdev); - vfio_pci_reflck_put(vdev->reflck); vfio_uninit_group_dev(&vdev->vdev); vfio_pci_vga_uninit(vdev); @@ -2160,86 +2152,6 @@ static struct pci_driver vfio_pci_driver = { .err_handler = &vfio_err_handlers, }; -static DEFINE_MUTEX(reflck_lock); - -static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void) -{ - struct vfio_pci_reflck *reflck; - - reflck = kzalloc(sizeof(*reflck), GFP_KERNEL); - if (!reflck) - return ERR_PTR(-ENOMEM); - - kref_init(&reflck->kref); - mutex_init(&reflck->lock); - - return reflck; -} - -static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck) -{ - kref_get(&reflck->kref); -} - -static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data) -{ - struct vfio_pci_reflck **preflck = data; - struct vfio_device *device; - struct vfio_pci_device *vdev; - - device = vfio_device_get_from_dev(&pdev->dev); - if (!device) - return 0; - - if (pci_dev_driver(pdev) != &vfio_pci_driver) { - vfio_device_put(device); - return 0; - } - - vdev = container_of(device, struct vfio_pci_device, vdev); - - if (vdev->reflck) { - vfio_pci_reflck_get(vdev->reflck); - *preflck = vdev->reflck; - vfio_device_put(device); - return 1; - } - - vfio_device_put(device); - return 0; -} - -static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev) -{ - bool slot = !pci_probe_reset_slot(vdev->pdev->slot); - - mutex_lock(&reflck_lock); - - if (pci_is_root_bus(vdev->pdev->bus) || - vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find, - &vdev->reflck, slot) <= 0) - vdev->reflck = vfio_pci_reflck_alloc(); - - mutex_unlock(&reflck_lock); - - return PTR_ERR_OR_ZERO(vdev->reflck); -} - -static void vfio_pci_reflck_release(struct kref *kref) -{ - struct vfio_pci_reflck *reflck = container_of(kref, - struct vfio_pci_reflck, - kref); - - kfree(reflck); - mutex_unlock(&reflck_lock); -} - -static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck) -{ - kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock); -} - static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data) { struct vfio_devices *devs = data; @@ -2261,7 +2173,7 @@ static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data) vdev = container_of(device, struct vfio_pci_device, vdev); /* Fault if the device is not unused */ - if (vdev->refcnt) { + if (device->open_count) { vfio_device_put(device); return -EBUSY; } @@ -2310,7 +2222,7 @@ static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data) * - At least one of the affected devices is marked dirty via * needs_reset (such as by lack of FLR support) * Then attempt to perform that bus or slot reset. 
Callers are required - * to hold vdev->reflck->lock, protecting the bus/slot reset group from + * to hold vdev->dev_set->lock, protecting the bus/slot reset group from * concurrent opens. A vfio_device reference is acquired for each device * to prevent unbinds during the reset operation. * diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 4025f6a27fcb..9e4f8dcf4e46 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -83,11 +83,6 @@ struct vfio_pci_dummy_resource { struct list_head res_next; }; -struct vfio_pci_reflck { - struct kref kref; - struct mutex lock; -}; - struct vfio_pci_mmap_vma { struct vm_area_struct *vma; struct list_head vma_next; @@ -130,8 +125,6 @@ struct vfio_pci_device { bool needs_pm_restore; struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; - struct vfio_pci_reflck *reflck; - int refcnt; int ioeventfds_nr; struct eventfd_ctx *err_trigger; struct eventfd_ctx *req_trigger; -- Gitee From c9b417a8190b1b2813087d1a16ee7623aa927e16 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Aug 2021 22:19:05 -0300 Subject: [PATCH 1668/2161] vfio/pci: Change vfio_pci_try_bus_reset() to use the dev_set commit a882c16a2b7ef6e0ab3f0d1e41345b667893cbfd upstream. vfio_pci_try_bus_reset() is triggering a reset of the entire_dev set if any device within it has accumulated a needs_reset. This reset can only be done once all of the drivers operating the PCI devices to be reset are in a known safe state. Make this clearer by directly operating on the dev_set instead of the vfio_pci_device. Rename the function to vfio_pci_dev_set_try_reset(). Use the device list inside the dev_set to check that all drivers are in a safe state instead of working backwards from the pci_device. The dev_set->lock directly prevents devices from joining/leaving the set, or changing their state, which further implies the pci_device cannot change drivers or that the vfio_device be freed, eliminating the need for get/put's. If a pci_device to be reset is not in the dev_set then the reset cannot be used as we can't know what the state of that driver is. Directly measure this by checking that every pci_device is in the dev_set - which effectively proves that VFIO drivers are attached to everything. Remove the odd interaction around vfio_pci_set_power_state() - have the only caller avoid its redundant vfio_pci_set_power_state() instead of avoiding it inside vfio_pci_dev_set_try_reset(). This restructuring corrects a call to pci_dev_driver() without holding the device_lock() and removes a hard wiring to &vfio_pci_driver. 
Signed-off-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/9-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 174 +++++++++++++++++------------------- 1 file changed, 82 insertions(+), 92 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 5f37d18ecd51..101eeaedc52c 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -223,7 +223,7 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev) } } -static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev); +static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set); static void vfio_pci_disable(struct vfio_pci_device *vdev); static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data); @@ -422,6 +422,9 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; int i, bar; + /* For needs_reset */ + lockdep_assert_held(&vdev->vdev.dev_set->lock); + /* Stop the device from further DMA */ pci_clear_master(pdev); @@ -505,9 +508,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) out: pci_disable_device(pdev); - vfio_pci_try_bus_reset(vdev); - - if (!disable_idle_d3) + if (!vfio_pci_dev_set_try_reset(vdev->vdev.dev_set) && !disable_idle_d3) vfio_pci_set_power_state(vdev, PCI_D3hot); } @@ -2152,7 +2153,7 @@ static struct pci_driver vfio_pci_driver = { .err_handler = &vfio_err_handlers, }; -static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data) +static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data) { struct vfio_devices *devs = data; struct vfio_device *device; @@ -2172,8 +2173,11 @@ static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data) vdev = container_of(device, struct vfio_pci_device, vdev); - /* Fault if the device is not unused */ - if (device->open_count) { + /* + * Locking multiple devices is prone to deadlock, runaway and + * unwind if we hit contention. + */ + if (!vfio_pci_zap_and_vma_lock(vdev, true)) { vfio_device_put(device); return -EBUSY; } @@ -2182,112 +2186,98 @@ static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data) return 0; } -static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data) +static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data) { - struct vfio_devices *devs = data; - struct vfio_device *device; - struct vfio_pci_device *vdev; + struct vfio_device_set *dev_set = data; + struct vfio_device *cur; - if (devs->cur_index == devs->max_index) - return -ENOSPC; - - device = vfio_device_get_from_dev(&pdev->dev); - if (!device) - return -EINVAL; + list_for_each_entry(cur, &dev_set->device_list, dev_set_list) + if (cur->dev == &pdev->dev) + return 0; + return -EBUSY; +} - if (pci_dev_driver(pdev) != &vfio_pci_driver) { - vfio_device_put(device); - return -EBUSY; - } +/* + * vfio-core considers a group to be viable and will create a vfio_device even + * if some devices are bound to drivers like pci-stub or pcieport. Here we + * require all PCI devices to be inside our dev_set since that ensures they stay + * put and that every driver controlling the device can co-ordinate with the + * device reset. + * + * Returns the pci_dev to pass to pci_reset_bus() if every PCI device to be + * reset is inside the dev_set, and pci_reset_bus() can succeed. NULL otherwise. 
+ */ +static struct pci_dev * +vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set) +{ + struct pci_dev *pdev; - vdev = container_of(device, struct vfio_pci_device, vdev); + lockdep_assert_held(&dev_set->lock); /* - * Locking multiple devices is prone to deadlock, runaway and - * unwind if we hit contention. + * By definition all PCI devices in the dev_set share the same PCI + * reset, so any pci_dev will have the same outcomes for + * pci_probe_reset_*() and pci_reset_bus(). */ - if (!vfio_pci_zap_and_vma_lock(vdev, true)) { - vfio_device_put(device); - return -EBUSY; - } + pdev = list_first_entry(&dev_set->device_list, struct vfio_pci_device, + vdev.dev_set_list)->pdev; - devs->devices[devs->cur_index++] = vdev; - return 0; + /* pci_reset_bus() is supported */ + if (pci_probe_reset_slot(pdev->slot) && pci_probe_reset_bus(pdev->bus)) + return NULL; + + if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_is_device_in_set, + dev_set, + !pci_probe_reset_slot(pdev->slot))) + return NULL; + return pdev; +} + +static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set) +{ + struct vfio_pci_device *cur; + bool needs_reset = false; + + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { + /* No VFIO device in the set can have an open device FD */ + if (cur->vdev.open_count) + return false; + needs_reset |= cur->needs_reset; + } + return needs_reset; } /* - * If a bus or slot reset is available for the provided device and: + * If a bus or slot reset is available for the provided dev_set and: * - All of the devices affected by that bus or slot reset are unused - * (!refcnt) * - At least one of the affected devices is marked dirty via * needs_reset (such as by lack of FLR support) - * Then attempt to perform that bus or slot reset. Callers are required - * to hold vdev->dev_set->lock, protecting the bus/slot reset group from - * concurrent opens. A vfio_device reference is acquired for each device - * to prevent unbinds during the reset operation. - * - * NB: vfio-core considers a group to be viable even if some devices are - * bound to drivers like pci-stub or pcieport. Here we require all devices - * to be bound to vfio_pci since that's the only way we can be sure they - * stay put. + * Then attempt to perform that bus or slot reset. + * Returns true if the dev_set was reset. */ -static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev) +static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set) { - struct vfio_devices devs = { .cur_index = 0 }; - int i = 0, ret = -EINVAL; - bool slot = false; - struct vfio_pci_device *tmp; - - if (!pci_probe_reset_slot(vdev->pdev->slot)) - slot = true; - else if (pci_probe_reset_bus(vdev->pdev->bus)) - return; - - if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, - &i, slot) || !i) - return; - - devs.max_index = i; - devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL); - if (!devs.devices) - return; - - if (vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_get_unused_devs, - &devs, slot)) - goto put_devs; - - /* Does at least one need a reset? 
*/ - for (i = 0; i < devs.cur_index; i++) { - tmp = devs.devices[i]; - if (tmp->needs_reset) { - ret = pci_reset_bus(vdev->pdev); - break; - } - } + struct vfio_pci_device *cur; + struct pci_dev *pdev; + int ret; -put_devs: - for (i = 0; i < devs.cur_index; i++) { - tmp = devs.devices[i]; + if (!vfio_pci_dev_set_needs_reset(dev_set)) + return false; - /* - * If reset was successful, affected devices no longer need - * a reset and we should return all the collateral devices - * to low power. If not successful, we either didn't reset - * the bus or timed out waiting for it, so let's not touch - * the power state. - */ - if (!ret) { - tmp->needs_reset = false; + pdev = vfio_pci_dev_set_resettable(dev_set); + if (!pdev) + return false; - if (tmp != vdev && !disable_idle_d3) - vfio_pci_set_power_state(tmp, PCI_D3hot); - } + ret = pci_reset_bus(pdev); + if (ret) + return false; - vfio_device_put(&tmp->vdev); + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { + cur->needs_reset = false; + if (!disable_idle_d3) + vfio_pci_set_power_state(cur, PCI_D3hot); } - - kfree(devs.devices); + return true; } static void __exit vfio_pci_cleanup(void) -- Gitee From da86c39ce23c2f43d1a248a9d4a566876d428533 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Aug 2021 22:19:06 -0300 Subject: [PATCH 1669/2161] vfio/pci: Reorganize VFIO_DEVICE_PCI_HOT_RESET to use the device set commit db44c17458fb54880b9a65479e464b64c365a87d upstream. Like vfio_pci_dev_set_try_reset() this code wants to reset all of the devices in the "reset group" which is the same membership as the device set. Instead of trying to reconstruct the device set from the PCI list go directly from the device set's device list to execute the reset. The same basic structure as vfio_pci_dev_set_try_reset() is used. The 'vfio_devices' struct is replaced with the device set linked list and we simply sweep it multiple times under the lock. This eliminates a memory allocation and get/put traffic and another improperly locked test of pci_dev_driver(). 
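Sweeping the set still has to take one lock per member without deadlocking, so the code keeps the try-lock-and-unwind shape: grab what you can, and if any member is contended, release everything already held and report busy. A standalone sketch of that shape with POSIX mutexes (invented names; the kernel path applies the same idea to each device's vma_lock and memory_lock):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define NMEMB 3

    static pthread_mutex_t memory_lock[NMEMB];   /* one lock per device in the set */

    /* Take every lock or none: first contended lock unwinds and reports busy. */
    static bool lock_all_or_none(void)
    {
            int i, taken;

            for (taken = 0; taken < NMEMB; taken++) {
                    if (pthread_mutex_trylock(&memory_lock[taken]) != 0) {
                            for (i = 0; i < taken; i++)
                                    pthread_mutex_unlock(&memory_lock[i]);
                            return false;
                    }
            }
            return true;
    }

    static void unlock_all(void)
    {
            int i;

            for (i = 0; i < NMEMB; i++)
                    pthread_mutex_unlock(&memory_lock[i]);
    }

    int main(void)
    {
            int i;

            for (i = 0; i < NMEMB; i++)
                    pthread_mutex_init(&memory_lock[i], NULL);

            if (lock_all_or_none()) {
                    printf("all members locked: would reset the set now\n");
                    unlock_all();
            }

            pthread_mutex_lock(&memory_lock[1]);            /* simulate a busy device */
            printf("retry while one is busy: %s\n",
                   lock_all_or_none() ? "locked" : "-EBUSY");
            pthread_mutex_unlock(&memory_lock[1]);
            return 0;
    }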
Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/10-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 213 +++++++++++++++--------------------- 1 file changed, 89 insertions(+), 124 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 101eeaedc52c..d0ed0b9df45d 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -223,9 +223,11 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev) } } +struct vfio_pci_group_info; static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set); static void vfio_pci_disable(struct vfio_pci_device *vdev); -static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data); +static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, + struct vfio_pci_group_info *groups); /* * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND @@ -661,37 +663,11 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) return 0; } -struct vfio_pci_group_entry { - struct vfio_group *group; - int id; -}; - struct vfio_pci_group_info { int count; - struct vfio_pci_group_entry *groups; + struct vfio_group **groups; }; -static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data) -{ - struct vfio_pci_group_info *info = data; - struct iommu_group *group; - int id, i; - - group = iommu_group_get(&pdev->dev); - if (!group) - return -EPERM; - - id = iommu_group_id(group); - - for (i = 0; i < info->count; i++) - if (info->groups[i].id == id) - break; - - iommu_group_put(group); - - return (i == info->count) ? -EINVAL : 0; -} - static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) { for (; pdev; pdev = pdev->bus->self) @@ -769,12 +745,6 @@ int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, return 0; } -struct vfio_devices { - struct vfio_pci_device **devices; - int cur_index; - int max_index; -}; - static long vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { @@ -1143,11 +1113,10 @@ static long vfio_pci_ioctl(struct vfio_device *core_vdev, } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { struct vfio_pci_hot_reset hdr; int32_t *group_fds; - struct vfio_pci_group_entry *groups; + struct vfio_group **groups; struct vfio_pci_group_info info; - struct vfio_devices devs = { .cur_index = 0 }; bool slot = false; - int i, group_idx, mem_idx = 0, count = 0, ret = 0; + int group_idx, count = 0, ret = 0; minsz = offsetofend(struct vfio_pci_hot_reset, count); @@ -1214,9 +1183,7 @@ static long vfio_pci_ioctl(struct vfio_device *core_vdev, break; } - groups[group_idx].group = group; - groups[group_idx].id = - vfio_external_user_iommu_id(group); + groups[group_idx] = group; } kfree(group_fds); @@ -1228,64 +1195,11 @@ static long vfio_pci_ioctl(struct vfio_device *core_vdev, info.count = hdr.count; info.groups = groups; - /* - * Test whether all the affected devices are contained - * by the set of groups provided by the user.
- */ - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_validate_devs, - &info, slot); - if (ret) - goto hot_reset_release; - - devs.max_index = count; - devs.devices = kcalloc(count, sizeof(struct vfio_device *), - GFP_KERNEL); - if (!devs.devices) { - ret = -ENOMEM; - goto hot_reset_release; - } - - /* - * We need to get memory_lock for each device, but devices - * can share mmap_sem, therefore we need to zap and hold - * the vma_lock for each device, and only then get each - * memory_lock. - */ - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_try_zap_and_vma_lock_cb, - &devs, slot); - if (ret) - goto hot_reset_release; - - for (; mem_idx < devs.cur_index; mem_idx++) { - struct vfio_pci_device *tmp = devs.devices[mem_idx]; - - ret = down_write_trylock(&tmp->memory_lock); - if (!ret) { - ret = -EBUSY; - goto hot_reset_release; - } - mutex_unlock(&tmp->vma_lock); - } - - /* User has access, do the reset */ - ret = pci_reset_bus(vdev->pdev); + ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info); hot_reset_release: - for (i = 0; i < devs.cur_index; i++) { - struct vfio_pci_device *tmp = devs.devices[i]; - - if (i < mem_idx) - up_write(&tmp->memory_lock); - else - mutex_unlock(&tmp->vma_lock); - vfio_device_put(&tmp->vdev); - } - kfree(devs.devices); - for (group_idx--; group_idx >= 0; group_idx--) - vfio_group_put_external_user(groups[group_idx].group); + vfio_group_put_external_user(groups[group_idx]); kfree(groups); return ret; @@ -2153,37 +2067,15 @@ static struct pci_driver vfio_pci_driver = { .err_handler = &vfio_err_handlers, }; -static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data) +static bool vfio_dev_in_groups(struct vfio_pci_device *vdev, + struct vfio_pci_group_info *groups) { - struct vfio_devices *devs = data; - struct vfio_device *device; - struct vfio_pci_device *vdev; - - if (devs->cur_index == devs->max_index) - return -ENOSPC; - - device = vfio_device_get_from_dev(&pdev->dev); - if (!device) - return -EINVAL; - - if (pci_dev_driver(pdev) != &vfio_pci_driver) { - vfio_device_put(device); - return -EBUSY; - } - - vdev = container_of(device, struct vfio_pci_device, vdev); + unsigned int i; - /* - * Locking multiple devices is prone to deadlock, runaway and - * unwind if we hit contention. - */ - if (!vfio_pci_zap_and_vma_lock(vdev, true)) { - vfio_device_put(device); - return -EBUSY; - } - - devs->devices[devs->cur_index++] = vdev; - return 0; + for (i = 0; i < groups->count; i++) + if (groups->groups[i] == vdev->vdev.group) + return true; + return false; } static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data) @@ -2233,6 +2125,79 @@ vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set) return pdev; } +/* + * We need to get memory_lock for each device, but devices can share mmap_sem, + * therefore we need to zap and hold the vma_lock for each device, and only then + * get each memory_lock. 
+ */ +static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, + struct vfio_pci_group_info *groups) +{ + struct vfio_pci_device *cur_mem; + struct vfio_pci_device *cur_vma; + struct vfio_pci_device *cur; + struct pci_dev *pdev; + bool is_mem = true; + int ret; + + mutex_lock(&dev_set->lock); + cur_mem = list_first_entry(&dev_set->device_list, + struct vfio_pci_device, vdev.dev_set_list); + + pdev = vfio_pci_dev_set_resettable(dev_set); + if (!pdev) { + ret = -EINVAL; + goto err_unlock; + } + + list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) { + /* + * Test whether all the affected devices are contained by the + * set of groups provided by the user. + */ + if (!vfio_dev_in_groups(cur_vma, groups)) { + ret = -EINVAL; + goto err_undo; + } + + /* + * Locking multiple devices is prone to deadlock, runaway and + * unwind if we hit contention. + */ + if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) { + ret = -EBUSY; + goto err_undo; + } + } + cur_vma = NULL; + + list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) { + if (!down_write_trylock(&cur_mem->memory_lock)) { + ret = -EBUSY; + goto err_undo; + } + mutex_unlock(&cur_mem->vma_lock); + } + cur_mem = NULL; + + ret = pci_reset_bus(pdev); + +err_undo: + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { + if (cur == cur_mem) + is_mem = false; + if (cur == cur_vma) + break; + if (is_mem) + up_write(&cur->memory_lock); + else + mutex_unlock(&cur->vma_lock); + } +err_unlock: + mutex_unlock(&dev_set->lock); + return ret; +} + static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set) { struct vfio_pci_device *cur; -- Gitee From 3233d67cfc2976c02bc5b4f1fd3878e18cee60be Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Aug 2021 22:19:07 -0300 Subject: [PATCH 1670/2161] vfio/mbochs: Fix close when multiple device FDs are open commit 3cb24827147b75557bddc5b39d63897786935b14 upstream. mbochs_close() iterates over global device state and frees it. Currently this is done every time a device FD is closed, but if multiple device FDs are open this could corrupt other still active FDs. Change this to use close_device() so it only runs on the last close. 
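The conversion is safe for mbochs' global state because the vfio core only calls close_device() when the last open file descriptor for the device goes away (see the open_count handling visible in the vfio.c hunks of the following patch). A simplified sketch of that first-open/last-close accounting, with illustrative names rather than the core's exact code:

#include <linux/mutex.h>

struct demo_device;

struct demo_ops {
	int (*open_device)(struct demo_device *dev);	/* first FD only */
	void (*close_device)(struct demo_device *dev);	/* last FD only */
};

struct demo_device {
	struct mutex lock;		/* plays the role of dev_set->lock */
	unsigned int open_count;
	const struct demo_ops *ops;
};

static int demo_open(struct demo_device *dev)
{
	int ret = 0;

	mutex_lock(&dev->lock);
	if (dev->open_count == 0 && dev->ops->open_device)
		ret = dev->ops->open_device(dev);
	if (!ret)
		dev->open_count++;
	mutex_unlock(&dev->lock);
	return ret;
}

static void demo_close(struct demo_device *dev)
{
	mutex_lock(&dev->lock);
	if (--dev->open_count == 0 && dev->ops->close_device)
		dev->ops->close_device(dev);	/* global teardown runs once */
	mutex_unlock(&dev->lock);
}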
Reviewed-by: Cornelia Huck Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/11-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- samples/vfio-mdev/mbochs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 9e2b30c4d30a..8fcb640375f5 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1293,7 +1293,7 @@ static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd, return -ENOTTY; } -static void mbochs_close(struct vfio_device *vdev) +static void mbochs_close_device(struct vfio_device *vdev) { struct mdev_state *mdev_state = container_of(vdev, struct mdev_state, vdev); @@ -1411,7 +1411,7 @@ static struct attribute_group *mdev_type_groups[] = { }; static const struct vfio_device_ops mbochs_dev_ops = { - .release = mbochs_close, + .close_device = mbochs_close_device, .read = mbochs_read, .write = mbochs_write, .ioctl = mbochs_ioctl, -- Gitee From 2d2f6b25095412ba69946986469f2e885216ac00 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Aug 2021 22:19:10 -0300 Subject: [PATCH 1671/2161] vfio: Remove struct vfio_device_ops open/release commit eb24c1007e6852e024dc33b0dd9617b8500a1291 upstream. Nothing uses this anymore, delete it. Signed-off-by: Yishai Hadas Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/14-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/vfio_mdev.c | 22 ---------------------- drivers/vfio/vfio.c | 14 +------------- include/linux/mdev.h | 7 ------- include/linux/vfio.h | 4 ---- 4 files changed, 1 insertion(+), 46 deletions(-) diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index e12196ffd487..7a9883048216 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -37,26 +37,6 @@ static void vfio_mdev_close_device(struct vfio_device *core_vdev) parent->ops->close_device(mdev); } -static int vfio_mdev_open(struct vfio_device *core_vdev) -{ - struct mdev_device *mdev = to_mdev_device(core_vdev->dev); - struct mdev_parent *parent = mdev->type->parent; - - if (unlikely(!parent->ops->open)) - return 0; - - return parent->ops->open(mdev); -} - -static void vfio_mdev_release(struct vfio_device *core_vdev) -{ - struct mdev_device *mdev = to_mdev_device(core_vdev->dev); - struct mdev_parent *parent = mdev->type->parent; - - if (likely(parent->ops->release)) - parent->ops->release(mdev); -} - static long vfio_mdev_unlocked_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { @@ -122,8 +102,6 @@ static const struct vfio_device_ops vfio_mdev_dev_ops = { .name = "vfio-mdev", .open_device = vfio_mdev_open_device, .close_device = vfio_mdev_close_device, - .open = vfio_mdev_open, - .release = vfio_mdev_release, .ioctl = vfio_mdev_unlocked_ioctl, .read = vfio_mdev_read, .write = vfio_mdev_write, diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 9cc17768c425..3c034fe14ccb 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1470,19 +1470,13 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) } mutex_unlock(&device->dev_set->lock); - if (device->ops->open) { - ret = device->ops->open(device); - if (ret) - goto err_close_device; - } - /* * We can't use 
anon_inode_getfd() because we need to modify * the f_mode flags directly to allow more than just ioctls */ fdno = ret = get_unused_fd_flags(O_CLOEXEC); if (ret < 0) - goto err_release; + goto err_close_device; filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, device, O_RDWR); @@ -1509,9 +1503,6 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) err_fd: put_unused_fd(fdno); -err_release: - if (device->ops->release) - device->ops->release(device); err_close_device: mutex_lock(&device->dev_set->lock); if (device->open_count == 1 && device->ops->close_device) @@ -1659,9 +1650,6 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) { struct vfio_device *device = filep->private_data; - if (device->ops->release) - device->ops->release(device); - mutex_lock(&device->dev_set->lock); if (!--device->open_count && device->ops->close_device) device->ops->close_device(device); diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 322b38f16b3d..93a7842c8018 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -72,11 +72,6 @@ struct device *mtype_get_parent_dev(struct mdev_type *mtype); * @mdev: mdev_device device structure which is being * destroyed * Returns integer: success (0) or error (< 0) - * @open: Open mediated device. - * @mdev: mediated device. - * Returns integer: success (0) or error (< 0) - * @release: release mediated device - * @mdev: mediated device. * @read: Read emulation callback * @mdev: mediated device structure * @buf: read buffer @@ -113,8 +108,6 @@ struct mdev_parent_ops { int (*remove)(struct mdev_device *mdev); int (*open_device)(struct mdev_device *mdev); void (*close_device)(struct mdev_device *mdev); - int (*open)(struct mdev_device *mdev); - void (*release)(struct mdev_device *mdev); ssize_t (*read)(struct mdev_device *mdev, char __user *buf, size_t count, loff_t *ppos); ssize_t (*write)(struct mdev_device *mdev, const char __user *buf, diff --git a/include/linux/vfio.h b/include/linux/vfio.h index f0e6a72875e4..b53a9557884a 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -46,8 +46,6 @@ struct vfio_device { * * @open_device: Called when the first file descriptor is opened for this device * @close_device: Opposite of open_device - * @open: Called when userspace creates new file descriptor for device - * @release: Called when userspace releases file descriptor for device * @read: Perform read(2) on device file descriptor * @write: Perform write(2) on device file descriptor * @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_* @@ -62,8 +60,6 @@ struct vfio_device_ops { char *name; int (*open_device)(struct vfio_device *vdev); void (*close_device)(struct vfio_device *vdev); - int (*open)(struct vfio_device *vdev); - void (*release)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t (*write)(struct vfio_device *vdev, const char __user *buf, -- Gitee From e1d3fd49097c524f953de116716bd95417ec0b6f Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Sun, 22 Aug 2021 12:36:43 +0800 Subject: [PATCH 1672/2161] vfio: platform: reset: Convert to SPDX identifier commit ab78130e6e99e74c2c6d5670fa3e7f290e07c2c5 upstream. 
use SPDX-License-Identifier instead of a verbose license text Signed-off-by: Cai Huoqing Link: https://lore.kernel.org/r/20210822043643.2040-1-caihuoqing@baidu.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c b/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c index 96064ef8f629..1131ebe4837d 100644 --- a/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c +++ b/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2017 Broadcom - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation version 2. - * - * This program is distributed "as is" WITHOUT ANY WARRANTY of any - * kind, whether express or implied; without even the implied warranty - * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. */ /* -- Gitee From fd14ece705a5ae3fdaf0cd3c93145225cd4b8fbc Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Tue, 24 Aug 2021 08:37:49 +0800 Subject: [PATCH 1673/2161] vfio-pci/zdev: Remove repeated verbose license text commit 29848a034ac749622f5ef4686f9e48d69254f4dc upstream. The SPDX and verbose license text are redundant, however in this case the verbose license indicates a GPL v2 only while SPDX specifies v2+. Remove the verbose license and correct SPDX to the more restricted version. Signed-off-by: Cai Huoqing Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20210824003749.1039-1-caihuoqing@baidu.com [aw: commit log] Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_zdev.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c index 7b011b62c766..104fcf6658db 100644 --- a/drivers/vfio/pci/vfio_pci_zdev.c +++ b/drivers/vfio/pci/vfio_pci_zdev.c @@ -1,15 +1,10 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-only /* * VFIO ZPCI devices support * * Copyright (C) IBM Corp. 2020. All rights reserved. * Author(s): Pierre Morel * Matthew Rosato - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * */ #include #include -- Gitee From bb2663382a2586c121b4566a45da4260c3db0293 Mon Sep 17 00:00:00 2001 From: Anthony Yznaga Date: Mon, 23 Aug 2021 09:35:50 -0700 Subject: [PATCH 1674/2161] vfio/type1: Fix vfio_find_dma_valid return commit ffc95d1b8edb80d2dab77f2e8a823c8d93b06419 upstream. vfio_find_dma_valid is defined to return WAITED on success if it was necessary to wait. However, the loop forgets the WAITED value returned by vfio_wait() and returns 0 in a later iteration. Fix it. 
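The function's contract is to report WAITED whenever it had to sleep before the DMA range became valid, and the old loop silently discarded that indicator by reassigning ret on the next pass. A stripped-down illustration of the corrected control flow; ctx, obj, lookup(), object_ready() and wait_for_update() are stand-ins, not the driver's real symbols:

#include <linux/errno.h>

#define WAITED 1

struct ctx;
struct obj;

/* Illustrative prototypes standing in for vfio_find_dma()/vfio_wait(). */
struct obj *lookup(struct ctx *c);
int object_ready(struct obj *o);
int wait_for_update(struct ctx *c);	/* WAITED on success, -errno on failure */

/*
 * Same shape as the fixed vfio_find_dma_valid(): returning 'ret' directly
 * preserves WAITED once any iteration had to sleep, whereas reassigning it
 * inside the loop (the old code) could report 0 to the caller instead.
 */
int find_valid(struct ctx *c, struct obj **out)
{
	int ret = 0;			/* stays 0 if we never had to wait */

	do {
		*out = lookup(c);
		if (!*out)
			return -EINVAL;	/* hard failure */
		if (object_ready(*out))
			return ret;	/* 0, or WAITED if we slept earlier */
		ret = wait_for_update(c);
	} while (ret == WAITED);

	return ret;			/* propagate the wait error */
}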
Signed-off-by: Anthony Yznaga Reviewed-by: Steve Sistare Link: https://lore.kernel.org/r/1629736550-2388-1-git-send-email-anthony.yznaga@oracle.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 8fd94cf11a9c..659f86358120 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -613,17 +613,17 @@ static int vfio_wait(struct vfio_iommu *iommu) static int vfio_find_dma_valid(struct vfio_iommu *iommu, dma_addr_t start, size_t size, struct vfio_dma **dma_p) { - int ret; + int ret = 0; do { *dma_p = vfio_find_dma(iommu, start, size); if (!*dma_p) - ret = -EINVAL; + return -EINVAL; else if (!(*dma_p)->vaddr_invalid) - ret = 0; + return ret; else ret = vfio_wait(iommu); - } while (ret > 0); + } while (ret == WAITED); return ret; } -- Gitee From 400bdda9e86f6f92366a8e73d6553e249785bb5c Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 26 Aug 2021 13:39:00 +0300 Subject: [PATCH 1675/2161] vfio/pci: Rename vfio_pci.c to vfio_pci_core.c commit 1cbd70fe37870b938463fd0a4a07e45fc4a3db3c upstream. This is a preparation patch for separating the vfio_pci driver to a subsystem driver and a generic pci driver. This patch doesn't change any logic. Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20210826103912.128972-2-yishaih@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/Makefile | 2 +- drivers/vfio/pci/{vfio_pci.c => vfio_pci_core.c} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename drivers/vfio/pci/{vfio_pci.c => vfio_pci_core.c} (100%) diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index eff97a7cd9f1..bbf8d7c8fc45 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o +vfio-pci-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o vfio-pci-$(CONFIG_S390) += vfio_pci_zdev.o diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci_core.c similarity index 100% rename from drivers/vfio/pci/vfio_pci.c rename to drivers/vfio/pci/vfio_pci_core.c -- Gitee From e1adde3dabb020a5c2e62fb50fc7db5095e281f0 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 26 Aug 2021 13:39:01 +0300 Subject: [PATCH 1676/2161] vfio/pci: Rename vfio_pci_private.h to vfio_pci_core.h commit 9a389938695a068a3149c2d21a16c34b63ca002f upstream. This is a preparation patch for separating the vfio_pci driver to a subsystem driver and a generic pci driver. This patch doesn't change any logic. 
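Both renames are mechanical, but they prepare the layering that a later patch in this series (PATCH 1679/2161) makes explicit when vfio_pci_device becomes vfio_pci_core_device: the core structure embeds struct vfio_device, and a vendor-specific driver can later embed the core structure in turn, walking back out with container_of(). A hedged sketch of that layering; only the embedding of vdev inside the core structure is taken from this series, while my_vendor_device and my_open_device() are hypothetical:

#include <linux/kernel.h>	/* container_of() */
#include <linux/vfio.h>
#include <linux/pci.h>
#include "vfio_pci_core.h"	/* struct vfio_pci_core_device, after the rename */

/* Hypothetical variant driver built on top of the core structure. */
struct my_vendor_device {
	struct vfio_pci_core_device core;
	/* vendor-private state ... */
};

static int my_open_device(struct vfio_device *core_vdev)
{
	struct vfio_pci_core_device *core =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	struct my_vendor_device *my =
		container_of(core, struct my_vendor_device, core);

	/* shared vfio-pci state is reachable via 'core', vendor state via 'my' */
	(void)my;
	return 0;
}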
Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20210826103912.128972-3-yishaih@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_config.c | 2 +- drivers/vfio/pci/vfio_pci_core.c | 2 +- drivers/vfio/pci/{vfio_pci_private.h => vfio_pci_core.h} | 6 +++--- drivers/vfio/pci/vfio_pci_igd.c | 2 +- drivers/vfio/pci/vfio_pci_intrs.c | 2 +- drivers/vfio/pci/vfio_pci_rdwr.c | 2 +- drivers/vfio/pci/vfio_pci_zdev.c | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) rename drivers/vfio/pci/{vfio_pci_private.h => vfio_pci_core.h} (98%) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index d57f037f65b8..44fbc35f1a9d 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -26,7 +26,7 @@ #include #include -#include "vfio_pci_private.h" +#include "vfio_pci_core.h" /* Fake capability ID for standard config space */ #define PCI_CAP_ID_BASIC 0 diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index d0ed0b9df45d..e03241c4acf5 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -28,7 +28,7 @@ #include #include -#include "vfio_pci_private.h" +#include "vfio_pci_core.h" #define DRIVER_VERSION "0.2" #define DRIVER_AUTHOR "Alex Williamson " diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_core.h similarity index 98% rename from drivers/vfio/pci/vfio_pci_private.h rename to drivers/vfio/pci/vfio_pci_core.h index 9e4f8dcf4e46..44f01832de23 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_core.h @@ -15,8 +15,8 @@ #include #include -#ifndef VFIO_PCI_PRIVATE_H -#define VFIO_PCI_PRIVATE_H +#ifndef VFIO_PCI_CORE_H +#define VFIO_PCI_CORE_H #define VFIO_PCI_OFFSET_SHIFT 40 @@ -219,4 +219,4 @@ static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev, } #endif -#endif /* VFIO_PCI_PRIVATE_H */ +#endif /* VFIO_PCI_CORE_H */ diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c index aa0a29fd2762..d57c409b4033 100644 --- a/drivers/vfio/pci/vfio_pci_igd.c +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -15,7 +15,7 @@ #include #include -#include "vfio_pci_private.h" +#include "vfio_pci_core.h" #define OPREGION_SIGNATURE "IntelGraphicsMem" #define OPREGION_SIZE (8 * 1024) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 869dce5f134d..df1e8c8c274c 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -20,7 +20,7 @@ #include #include -#include "vfio_pci_private.h" +#include "vfio_pci_core.h" /* * INTx diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index a0b5fc8e46f4..667e82726e75 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -17,7 +17,7 @@ #include #include -#include "vfio_pci_private.h" +#include "vfio_pci_core.h" #ifdef __LITTLE_ENDIAN #define vfio_ioread64 ioread64 diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c index 104fcf6658db..51e4092984c2 100644 --- a/drivers/vfio/pci/vfio_pci_zdev.c +++ b/drivers/vfio/pci/vfio_pci_zdev.c @@ -14,7 +14,7 @@ #include #include -#include "vfio_pci_private.h" +#include "vfio_pci_core.h" /* * Add the Base PCI Function information to the device info region. 
-- Gitee From f7d8c708d243d82e55ebb82ae7a6e52047734fc2 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 14 Jun 2022 21:15:53 +0800 Subject: [PATCH 1677/2161] vfio/pci: remove vfio_pci_nvlink2 commit b392a198917020cac996fd207355211ecfcfad84 upstream. This driver never had any open userspace (which for VFIO would include VM kernel drivers) that use it, and thus should never have been added by our normal userspace ABI rules. Signed-off-by: Christoph Hellwig Acked-by: Greg Kroah-Hartman Message-Id: <20210326061311.1497642-2-hch@lst.de> Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/Kconfig | 6 ----- drivers/vfio/pci/Makefile | 1 - drivers/vfio/pci/vfio_pci_core.c | 18 --------------- drivers/vfio/pci/vfio_pci_core.h | 14 ------------ include/uapi/linux/vfio.h | 38 ++++---------------------------- 5 files changed, 4 insertions(+), 73 deletions(-) diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 2232a4a4215b..53ce78d7d07b 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -39,9 +39,3 @@ config VFIO_PCI_IGD and LPC bridge config space. To enable Intel IGD assignment through vfio-pci, say Y. - -config VFIO_PCI_NVLINK2 - def_bool y - depends on VFIO_PCI && PPC_POWERNV && SPAPR_TCE_IOMMU - help - VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs \ No newline at end of file diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index bbf8d7c8fc45..66a40488e967 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -2,7 +2,6 @@ vfio-pci-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o -vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o vfio-pci-$(CONFIG_S390) += vfio_pci_zdev.o obj-$(CONFIG_VFIO_PCI) += vfio-pci.o diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index e03241c4acf5..5e0ba9a3cc2c 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -390,24 +390,6 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) } } - if (pdev->vendor == PCI_VENDOR_ID_NVIDIA && - IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { - ret = vfio_pci_nvdia_v100_nvlink2_init(vdev); - if (ret && ret != -ENODEV) { - pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n"); - goto disable_exit; - } - } - - if (pdev->vendor == PCI_VENDOR_ID_IBM && - IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { - ret = vfio_pci_ibm_npu2_init(vdev); - if (ret && ret != -ENODEV) { - pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n"); - goto disable_exit; - } - } - vfio_pci_probe_mmaps(vdev); return 0; diff --git a/drivers/vfio/pci/vfio_pci_core.h b/drivers/vfio/pci/vfio_pci_core.h index 44f01832de23..06440bf095a5 100644 --- a/drivers/vfio/pci/vfio_pci_core.h +++ b/drivers/vfio/pci/vfio_pci_core.h @@ -193,20 +193,6 @@ static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev) return -ENODEV; } #endif -#ifdef CONFIG_VFIO_PCI_NVLINK2 -extern int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev); -extern int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev); -#else -static inline int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev) -{ - return -ENODEV; -} - -static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev) -{ - return -ENODEV; -} -#endif #ifdef CONFIG_S390 extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev, diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 
335c0d95ffca..30be43622c6d 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -332,17 +332,10 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) /* 10de vendor PCI sub-types */ -/* - * NVIDIA GPU NVlink2 RAM is coherent RAM mapped onto the host address space. - */ -#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM (1) +/* subtype 1 was VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, don't use */ /* 1014 vendor PCI sub-types */ -/* - * IBM NPU NVlink2 ATSD (Address Translation Shootdown) register of NPU - * to do TLB invalidation on a GPU. - */ -#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD (1) +/* subtype 1 was VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, don't use */ /* sub-types for VFIO_REGION_TYPE_GFX */ #define VFIO_REGION_SUBTYPE_GFX_EDID (1) @@ -634,32 +627,9 @@ struct vfio_device_migration_info { */ #define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE 3 -/* - * Capability with compressed real address (aka SSA - small system address) - * where GPU RAM is mapped on a system bus. Used by a GPU for DMA routing - * and by the userspace to associate a NVLink bridge with a GPU. - */ -#define VFIO_REGION_INFO_CAP_NVLINK2_SSATGT 4 - -struct vfio_region_info_cap_nvlink2_ssatgt { - struct vfio_info_cap_header header; - __u64 tgt; -}; +/* subtype 4 was VFIO_REGION_INFO_CAP_NVLINK2_SSATGT, don't use */ -/* - * Capability with an NVLink link speed. The value is read by - * the NVlink2 bridge driver from the bridge's "ibm,nvlink-speed" - * property in the device tree. The value is fixed in the hardware - * and failing to provide the correct value results in the link - * not working with no indication from the driver why. - */ -#define VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD 5 - -struct vfio_region_info_cap_nvlink2_lnkspd { - struct vfio_info_cap_header header; - __u32 link_speed; - __u32 __pad; -}; +/* subtype 5 was VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD, don't use */ /** * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, -- Gitee From 9f03c2505b09013bf7ebe71e6ce603c65c671b05 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 28 Jun 2021 14:08:12 -0600 Subject: [PATCH 1678/2161] vfio/pci: Handle concurrent vma faults commit 6a45ece4c9af473555f01f0f8b97eba56e3c7d0d upstream. io_remap_pfn_range() will trigger a BUG_ON if it encounters a populated pte within the mapping range. This can occur because we map the entire vma on fault and multiple faults can be blocked behind the vma_lock. This leads to traces like the one reported below. We can use our vma_list to test whether a given vma is mapped to avoid this issue. [ 1591.733256] kernel BUG at mm/memory.c:2177! 
[ 1591.739515] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP [ 1591.747381] Modules linked in: vfio_iommu_type1 vfio_pci vfio_virqfd vfio pv680_mii(O) [ 1591.760536] CPU: 2 PID: 227 Comm: lcore-worker-2 Tainted: G O 5.11.0-rc3+ #1 [ 1591.770735] Hardware name: , BIOS HixxxxFPGA 1P B600 V121-1 [ 1591.778872] pstate: 40400009 (nZcv daif +PAN -UAO -TCO BTYPE=--) [ 1591.786134] pc : remap_pfn_range+0x214/0x340 [ 1591.793564] lr : remap_pfn_range+0x1b8/0x340 [ 1591.799117] sp : ffff80001068bbd0 [ 1591.803476] x29: ffff80001068bbd0 x28: 0000042eff6f0000 [ 1591.810404] x27: 0000001100910000 x26: 0000001300910000 [ 1591.817457] x25: 0068000000000fd3 x24: ffffa92f1338e358 [ 1591.825144] x23: 0000001140000000 x22: 0000000000000041 [ 1591.832506] x21: 0000001300910000 x20: ffffa92f141a4000 [ 1591.839520] x19: 0000001100a00000 x18: 0000000000000000 [ 1591.846108] x17: 0000000000000000 x16: ffffa92f11844540 [ 1591.853570] x15: 0000000000000000 x14: 0000000000000000 [ 1591.860768] x13: fffffc0000000000 x12: 0000000000000880 [ 1591.868053] x11: ffff0821bf3d01d0 x10: ffff5ef2abd89000 [ 1591.875932] x9 : ffffa92f12ab0064 x8 : ffffa92f136471c0 [ 1591.883208] x7 : 0000001140910000 x6 : 0000000200000000 [ 1591.890177] x5 : 0000000000000001 x4 : 0000000000000001 [ 1591.896656] x3 : 0000000000000000 x2 : 0168044000000fd3 [ 1591.903215] x1 : ffff082126261880 x0 : fffffc2084989868 [ 1591.910234] Call trace: [ 1591.914837] remap_pfn_range+0x214/0x340 [ 1591.921765] vfio_pci_mmap_fault+0xac/0x130 [vfio_pci] [ 1591.931200] __do_fault+0x44/0x12c [ 1591.937031] handle_mm_fault+0xcc8/0x1230 [ 1591.942475] do_page_fault+0x16c/0x484 [ 1591.948635] do_translation_fault+0xbc/0xd8 [ 1591.954171] do_mem_abort+0x4c/0xc0 [ 1591.960316] el0_da+0x40/0x80 [ 1591.965585] el0_sync_handler+0x168/0x1b0 [ 1591.971608] el0_sync+0x174/0x180 [ 1591.978312] Code: eb1b027f 540000c0 f9400022 b4fffe02 (d4210000) Fixes: 11c4cd07ba11 ("vfio-pci: Fault mmaps to enable vma tracking") Reported-by: Zeng Tao Suggested-by: Zeng Tao Link: https://lore.kernel.org/r/162497742783.3883260.3282953006487785034.stgit@omen Signed-off-by: Alex Williamson Signed-off-by: Sasha Levin Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_core.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 5e0ba9a3cc2c..a59bef0297e6 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1488,6 +1488,7 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct vfio_pci_device *vdev = vma->vm_private_data; + struct vfio_pci_mmap_vma *mmap_vma; vm_fault_t ret = VM_FAULT_NOPAGE; mutex_lock(&vdev->vma_lock); @@ -1495,24 +1496,36 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) if (!__vfio_pci_memory_enabled(vdev)) { ret = VM_FAULT_SIGBUS; - mutex_unlock(&vdev->vma_lock); goto up_out; } - if (__vfio_pci_add_vma(vdev, vma)) { - ret = VM_FAULT_OOM; - mutex_unlock(&vdev->vma_lock); - goto up_out; + /* + * We populate the whole vma on fault, so we need to test whether + * the vma has already been mapped, such as for concurrent faults + * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if + * we ask it to fill the same range again. 
+ */ + list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { + if (mmap_vma->vma == vma) + goto up_out; } - mutex_unlock(&vdev->vma_lock); - if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - vma->vm_end - vma->vm_start, vma->vm_page_prot)) + vma->vm_end - vma->vm_start, + vma->vm_page_prot)) { ret = VM_FAULT_SIGBUS; + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); + goto up_out; + } + + if (__vfio_pci_add_vma(vdev, vma)) { + ret = VM_FAULT_OOM; + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); + } up_out: up_read(&vdev->memory_lock); + mutex_unlock(&vdev->vma_lock); return ret; } -- Gitee From 3f727b330cd0364df0952cebef3fbd428e2a2d05 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 26 Aug 2021 13:39:02 +0300 Subject: [PATCH 1679/2161] vfio/pci: Rename vfio_pci_device to vfio_pci_core_device commit 536475109c82841126ca341ef0f138e7298880c1 upstream. This is a preparation patch for separating the vfio_pci driver to a subsystem driver and a generic pci driver. This patch doesn't change any logic. The new vfio_pci_core_device structure will be the main structure of the core driver and later on vfio_pci_device structure will be the main structure of the generic vfio_pci driver. Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20210826103912.128972-4-yishaih@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_config.c | 68 ++++++++--------- drivers/vfio/pci/vfio_pci_core.c | 118 +++++++++++++++-------------- drivers/vfio/pci/vfio_pci_core.h | 52 ++++++------- drivers/vfio/pci/vfio_pci_igd.c | 17 +++-- drivers/vfio/pci/vfio_pci_intrs.c | 40 +++++----- drivers/vfio/pci/vfio_pci_rdwr.c | 16 ++-- drivers/vfio/pci/vfio_pci_zdev.c | 2 +- 7 files changed, 158 insertions(+), 155 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 44fbc35f1a9d..619638fa55a3 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -108,9 +108,9 @@ static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = { struct perm_bits { u8 *virt; /* read/write virtual data, not hw */ u8 *write; /* writeable bits */ - int (*readfn)(struct vfio_pci_device *vdev, int pos, int count, + int (*readfn)(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 *val); - int (*writefn)(struct vfio_pci_device *vdev, int pos, int count, + int (*writefn)(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val); }; @@ -171,7 +171,7 @@ static int vfio_user_config_write(struct pci_dev *pdev, int offset, return ret; } -static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos, +static int vfio_default_config_read(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 *val) { @@ -197,7 +197,7 @@ static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos, return count; } -static int vfio_default_config_write(struct vfio_pci_device *vdev, int pos, +static int vfio_default_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) { @@ -244,7 +244,7 @@ static int vfio_default_config_write(struct vfio_pci_device *vdev, int pos, } /* Allow direct read from hardware, except for capability next pointer */ -static int vfio_direct_config_read(struct vfio_pci_device 
*vdev, int pos, +static int vfio_direct_config_read(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 *val) { @@ -269,7 +269,7 @@ static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos, } /* Raw access skips any kind of virtualization */ -static int vfio_raw_config_write(struct vfio_pci_device *vdev, int pos, +static int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) { @@ -282,7 +282,7 @@ static int vfio_raw_config_write(struct vfio_pci_device *vdev, int pos, return count; } -static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos, +static int vfio_raw_config_read(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 *val) { @@ -296,7 +296,7 @@ static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos, } /* Virt access uses only virtualization */ -static int vfio_virt_config_write(struct vfio_pci_device *vdev, int pos, +static int vfio_virt_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) { @@ -304,7 +304,7 @@ static int vfio_virt_config_write(struct vfio_pci_device *vdev, int pos, return count; } -static int vfio_virt_config_read(struct vfio_pci_device *vdev, int pos, +static int vfio_virt_config_read(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 *val) { @@ -396,7 +396,7 @@ static inline void p_setd(struct perm_bits *p, int off, u32 virt, u32 write) } /* Caller should hold memory_lock semaphore */ -bool __vfio_pci_memory_enabled(struct vfio_pci_device *vdev) +bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]); @@ -413,7 +413,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_device *vdev) * Restore the *real* BARs after we detect a FLR or backdoor reset. * (backdoor = some device specific technique that we didn't catch) */ -static void vfio_bar_restore(struct vfio_pci_device *vdev) +static void vfio_bar_restore(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; u32 *rbar = vdev->rbar; @@ -460,7 +460,7 @@ static __le32 vfio_generate_bar_flags(struct pci_dev *pdev, int bar) * Pretend we're hardware and tweak the values of the *virtual* PCI BARs * to reflect the hardware capabilities. This implements BAR sizing. 
*/ -static void vfio_bar_fixup(struct vfio_pci_device *vdev) +static void vfio_bar_fixup(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; int i; @@ -514,7 +514,7 @@ static void vfio_bar_fixup(struct vfio_pci_device *vdev) vdev->bardirty = false; } -static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos, +static int vfio_basic_config_read(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 *val) { @@ -536,7 +536,7 @@ static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos, } /* Test whether BARs match the value we think they should contain */ -static bool vfio_need_bar_restore(struct vfio_pci_device *vdev) +static bool vfio_need_bar_restore(struct vfio_pci_core_device *vdev) { int i = 0, pos = PCI_BASE_ADDRESS_0, ret; u32 bar; @@ -552,7 +552,7 @@ static bool vfio_need_bar_restore(struct vfio_pci_device *vdev) return false; } -static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos, +static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) { @@ -692,7 +692,7 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm) return 0; } -static int vfio_pm_config_write(struct vfio_pci_device *vdev, int pos, +static int vfio_pm_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) { @@ -747,7 +747,7 @@ static int __init init_pci_cap_pm_perm(struct perm_bits *perm) return 0; } -static int vfio_vpd_config_write(struct vfio_pci_device *vdev, int pos, +static int vfio_vpd_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) { @@ -829,7 +829,7 @@ static int __init init_pci_cap_pcix_perm(struct perm_bits *perm) return 0; } -static int vfio_exp_config_write(struct vfio_pci_device *vdev, int pos, +static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) { @@ -913,7 +913,7 @@ static int __init init_pci_cap_exp_perm(struct perm_bits *perm) return 0; } -static int vfio_af_config_write(struct vfio_pci_device *vdev, int pos, +static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) { @@ -1072,7 +1072,7 @@ int __init vfio_pci_init_perm_bits(void) return ret; } -static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos) +static int vfio_find_cap_start(struct vfio_pci_core_device *vdev, int pos) { u8 cap; int base = (pos >= PCI_CFG_SPACE_SIZE) ? 
PCI_CFG_SPACE_SIZE : @@ -1089,7 +1089,7 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos) return pos; } -static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos, +static int vfio_msi_config_read(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 *val) { @@ -1109,7 +1109,7 @@ static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos, return vfio_default_config_read(vdev, pos, count, perm, offset, val); } -static int vfio_msi_config_write(struct vfio_pci_device *vdev, int pos, +static int vfio_msi_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) { @@ -1189,7 +1189,7 @@ static int init_pci_cap_msi_perm(struct perm_bits *perm, int len, u16 flags) } /* Determine MSI CAP field length; initialize msi_perms on 1st call per vdev */ -static int vfio_msi_cap_len(struct vfio_pci_device *vdev, u8 pos) +static int vfio_msi_cap_len(struct vfio_pci_core_device *vdev, u8 pos) { struct pci_dev *pdev = vdev->pdev; int len, ret; @@ -1222,7 +1222,7 @@ static int vfio_msi_cap_len(struct vfio_pci_device *vdev, u8 pos) } /* Determine extended capability length for VC (2 & 9) and MFVC */ -static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos) +static int vfio_vc_cap_len(struct vfio_pci_core_device *vdev, u16 pos) { struct pci_dev *pdev = vdev->pdev; u32 tmp; @@ -1263,7 +1263,7 @@ static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos) return len; } -static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) +static int vfio_cap_len(struct vfio_pci_core_device *vdev, u8 cap, u8 pos) { struct pci_dev *pdev = vdev->pdev; u32 dword; @@ -1338,7 +1338,7 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) return 0; } -static int vfio_ext_cap_len(struct vfio_pci_device *vdev, u16 ecap, u16 epos) +static int vfio_ext_cap_len(struct vfio_pci_core_device *vdev, u16 ecap, u16 epos) { struct pci_dev *pdev = vdev->pdev; u8 byte; @@ -1412,7 +1412,7 @@ static int vfio_ext_cap_len(struct vfio_pci_device *vdev, u16 ecap, u16 epos) return 0; } -static int vfio_fill_vconfig_bytes(struct vfio_pci_device *vdev, +static int vfio_fill_vconfig_bytes(struct vfio_pci_core_device *vdev, int offset, int size) { struct pci_dev *pdev = vdev->pdev; @@ -1459,7 +1459,7 @@ static int vfio_fill_vconfig_bytes(struct vfio_pci_device *vdev, return ret; } -static int vfio_cap_init(struct vfio_pci_device *vdev) +static int vfio_cap_init(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; u8 *map = vdev->pci_config_map; @@ -1549,7 +1549,7 @@ static int vfio_cap_init(struct vfio_pci_device *vdev) return 0; } -static int vfio_ecap_init(struct vfio_pci_device *vdev) +static int vfio_ecap_init(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; u8 *map = vdev->pci_config_map; @@ -1669,7 +1669,7 @@ static const struct pci_device_id known_bogus_vf_intx_pin[] = { * for each area requiring emulated bits, but the array of pointers * would be comparable in size (at least for standard config space). 
*/ -int vfio_config_init(struct vfio_pci_device *vdev) +int vfio_config_init(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; u8 *map, *vconfig; @@ -1773,7 +1773,7 @@ int vfio_config_init(struct vfio_pci_device *vdev) return pcibios_err_to_errno(ret); } -void vfio_config_free(struct vfio_pci_device *vdev) +void vfio_config_free(struct vfio_pci_core_device *vdev) { kfree(vdev->vconfig); vdev->vconfig = NULL; @@ -1790,7 +1790,7 @@ void vfio_config_free(struct vfio_pci_device *vdev) * Find the remaining number of bytes in a dword that match the given * position. Stop at either the end of the capability or the dword boundary. */ -static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev, +static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_core_device *vdev, loff_t pos) { u8 cap = vdev->pci_config_map[pos]; @@ -1802,7 +1802,7 @@ static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev, return i; } -static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, +static ssize_t vfio_config_do_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { struct pci_dev *pdev = vdev->pdev; @@ -1885,7 +1885,7 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, return ret; } -ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, char __user *buf, +ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { size_t done = 0; diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index a59bef0297e6..619425eb0e47 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -121,7 +121,7 @@ static bool vfio_pci_is_denylisted(struct pci_dev *pdev) */ static unsigned int vfio_pci_set_vga_decode(void *opaque, bool single_vga) { - struct vfio_pci_device *vdev = opaque; + struct vfio_pci_core_device *vdev = opaque; struct pci_dev *tmp = NULL, *pdev = vdev->pdev; unsigned char max_busnr; unsigned int decodes; @@ -155,7 +155,7 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; } -static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev) +static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev) { struct resource *res; int i; @@ -225,7 +225,7 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev) struct vfio_pci_group_info; static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set); -static void vfio_pci_disable(struct vfio_pci_device *vdev); +static void vfio_pci_disable(struct vfio_pci_core_device *vdev); static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, struct vfio_pci_group_info *groups); @@ -260,7 +260,7 @@ static bool vfio_pci_nointx(struct pci_dev *pdev) return false; } -static void vfio_pci_probe_power_state(struct vfio_pci_device *vdev) +static void vfio_pci_probe_power_state(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; u16 pmcsr; @@ -280,7 +280,7 @@ static void vfio_pci_probe_power_state(struct vfio_pci_device *vdev) * by PM capability emulation and separately from pci_dev internal saved state * to avoid it being overwritten and consumed around other resets. 
*/ -int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state) +int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state) { struct pci_dev *pdev = vdev->pdev; bool needs_restore = false, needs_save = false; @@ -311,7 +311,7 @@ int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state) return ret; } -static int vfio_pci_enable(struct vfio_pci_device *vdev) +static int vfio_pci_enable(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; int ret; @@ -399,7 +399,7 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) return ret; } -static void vfio_pci_disable(struct vfio_pci_device *vdev) +static void vfio_pci_disable(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; struct vfio_pci_dummy_resource *dummy_res, *tmp; @@ -498,7 +498,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) static struct pci_driver vfio_pci_driver; -static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev) +static struct vfio_pci_core_device *get_pf_vdev(struct vfio_pci_core_device *vdev) { struct pci_dev *physfn = pci_physfn(vdev->pdev); struct vfio_device *pf_dev; @@ -515,12 +515,12 @@ static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev) return NULL; } - return container_of(pf_dev, struct vfio_pci_device, vdev); + return container_of(pf_dev, struct vfio_pci_core_device, vdev); } -static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val) +static void vfio_pci_vf_token_user_add(struct vfio_pci_core_device *vdev, int val) { - struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev); + struct vfio_pci_core_device *pf_vdev = get_pf_vdev(vdev); if (!pf_vdev) return; @@ -535,8 +535,8 @@ static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val) static void vfio_pci_close_device(struct vfio_device *core_vdev) { - struct vfio_pci_device *vdev = - container_of(core_vdev, struct vfio_pci_device, vdev); + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); vfio_pci_vf_token_user_add(vdev, -1); vfio_spapr_pci_eeh_release(vdev->pdev); @@ -556,8 +556,8 @@ static void vfio_pci_close_device(struct vfio_device *core_vdev) static int vfio_pci_open_device(struct vfio_device *core_vdev) { - struct vfio_pci_device *vdev = - container_of(core_vdev, struct vfio_pci_device, vdev); + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); int ret = 0; ret = vfio_pci_enable(vdev); @@ -569,7 +569,7 @@ static int vfio_pci_open_device(struct vfio_device *core_vdev) return 0; } -static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) +static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type) { if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { u8 pin; @@ -690,7 +690,7 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, return walk.ret; } -static int msix_mmappable_cap(struct vfio_pci_device *vdev, +static int msix_mmappable_cap(struct vfio_pci_core_device *vdev, struct vfio_info_cap *caps) { struct vfio_info_cap_header header = { @@ -701,7 +701,7 @@ static int msix_mmappable_cap(struct vfio_pci_device *vdev, return vfio_info_add_capability(caps, &header, sizeof(header)); } -int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, +int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, unsigned int type, unsigned int subtype, const struct vfio_pci_regops *ops, size_t size, u32 flags, void *data) @@ 
-730,8 +730,8 @@ int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, static long vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { - struct vfio_pci_device *vdev = - container_of(core_vdev, struct vfio_pci_device, vdev); + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); unsigned long minsz; if (cmd == VFIO_DEVICE_GET_INFO) { @@ -1271,7 +1271,7 @@ static long vfio_pci_ioctl(struct vfio_device *core_vdev, return -ENOTTY; } -static ssize_t vfio_pci_rw(struct vfio_pci_device *vdev, char __user *buf, +static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); @@ -1305,8 +1305,8 @@ static ssize_t vfio_pci_rw(struct vfio_pci_device *vdev, char __user *buf, static ssize_t vfio_pci_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos) { - struct vfio_pci_device *vdev = - container_of(core_vdev, struct vfio_pci_device, vdev); + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); if (!count) return 0; @@ -1317,8 +1317,8 @@ static ssize_t vfio_pci_read(struct vfio_device *core_vdev, char __user *buf, static ssize_t vfio_pci_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos) { - struct vfio_pci_device *vdev = - container_of(core_vdev, struct vfio_pci_device, vdev); + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); if (!count) return 0; @@ -1327,7 +1327,7 @@ static ssize_t vfio_pci_write(struct vfio_device *core_vdev, const char __user * } /* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ -static int vfio_pci_zap_and_vma_lock(struct vfio_pci_device *vdev, bool try) +static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try) { struct vfio_pci_mmap_vma *mmap_vma, *tmp; @@ -1417,14 +1417,14 @@ static int vfio_pci_zap_and_vma_lock(struct vfio_pci_device *vdev, bool try) } } -void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_device *vdev) +void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev) { vfio_pci_zap_and_vma_lock(vdev, false); down_write(&vdev->memory_lock); mutex_unlock(&vdev->vma_lock); } -u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_device *vdev) +u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev) { u16 cmd; @@ -1437,14 +1437,14 @@ u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_device *vdev) return cmd; } -void vfio_pci_memory_unlock_and_restore(struct vfio_pci_device *vdev, u16 cmd) +void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd) { pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd); up_write(&vdev->memory_lock); } /* Caller holds vma_lock */ -static int __vfio_pci_add_vma(struct vfio_pci_device *vdev, +static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev, struct vm_area_struct *vma) { struct vfio_pci_mmap_vma *mmap_vma; @@ -1470,7 +1470,7 @@ static void vfio_pci_mmap_open(struct vm_area_struct *vma) static void vfio_pci_mmap_close(struct vm_area_struct *vma) { - struct vfio_pci_device *vdev = vma->vm_private_data; + struct vfio_pci_core_device *vdev = vma->vm_private_data; struct vfio_pci_mmap_vma *mmap_vma; mutex_lock(&vdev->vma_lock); @@ -1487,7 +1487,7 @@ static void vfio_pci_mmap_close(struct vm_area_struct *vma) static vm_fault_t vfio_pci_mmap_fault(struct 
vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct vfio_pci_device *vdev = vma->vm_private_data; + struct vfio_pci_core_device *vdev = vma->vm_private_data; struct vfio_pci_mmap_vma *mmap_vma; vm_fault_t ret = VM_FAULT_NOPAGE; @@ -1537,8 +1537,8 @@ static const struct vm_operations_struct vfio_pci_mmap_ops = { static int vfio_pci_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) { - struct vfio_pci_device *vdev = - container_of(core_vdev, struct vfio_pci_device, vdev); + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); struct pci_dev *pdev = vdev->pdev; unsigned int index; u64 phys_len, req_len, pgoff, req_start; @@ -1608,8 +1608,8 @@ static int vfio_pci_mmap(struct vfio_device *core_vdev, struct vm_area_struct *v static void vfio_pci_request(struct vfio_device *core_vdev, unsigned int count) { - struct vfio_pci_device *vdev = - container_of(core_vdev, struct vfio_pci_device, vdev); + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); struct pci_dev *pdev = vdev->pdev; mutex_lock(&vdev->igate); @@ -1628,7 +1628,7 @@ static void vfio_pci_request(struct vfio_device *core_vdev, unsigned int count) mutex_unlock(&vdev->igate); } -static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, +static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, bool vf_token, uuid_t *uuid) { /* @@ -1660,7 +1660,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, return 0; /* No VF token provided or required */ if (vdev->pdev->is_virtfn) { - struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev); + struct vfio_pci_core_device *pf_vdev = get_pf_vdev(vdev); bool match; if (!pf_vdev) { @@ -1724,8 +1724,8 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, static int vfio_pci_match(struct vfio_device *core_vdev, char *buf) { - struct vfio_pci_device *vdev = - container_of(core_vdev, struct vfio_pci_device, vdev); + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); bool vf_token = false; uuid_t uuid; int ret; @@ -1787,8 +1787,8 @@ static const struct vfio_device_ops vfio_pci_ops = { static int vfio_pci_bus_notifier(struct notifier_block *nb, unsigned long action, void *data) { - struct vfio_pci_device *vdev = container_of(nb, - struct vfio_pci_device, nb); + struct vfio_pci_core_device *vdev = container_of(nb, + struct vfio_pci_core_device, nb); struct device *dev = data; struct pci_dev *pdev = to_pci_dev(dev); struct pci_dev *physfn = pci_physfn(pdev); @@ -1812,7 +1812,7 @@ static int vfio_pci_bus_notifier(struct notifier_block *nb, return 0; } -static int vfio_pci_vf_init(struct vfio_pci_device *vdev) +static int vfio_pci_vf_init(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; int ret; @@ -1836,7 +1836,7 @@ static int vfio_pci_vf_init(struct vfio_pci_device *vdev) return 0; } -static void vfio_pci_vf_uninit(struct vfio_pci_device *vdev) +static void vfio_pci_vf_uninit(struct vfio_pci_core_device *vdev) { if (!vdev->vf_token) return; @@ -1847,7 +1847,7 @@ static void vfio_pci_vf_uninit(struct vfio_pci_device *vdev) kfree(vdev->vf_token); } -static int vfio_pci_vga_init(struct vfio_pci_device *vdev) +static int vfio_pci_vga_init(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; int ret; @@ -1862,7 +1862,7 @@ static int vfio_pci_vga_init(struct vfio_pci_device *vdev) return 0; } -static void vfio_pci_vga_uninit(struct vfio_pci_device *vdev) 
+static void vfio_pci_vga_uninit(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; @@ -1876,7 +1876,7 @@ static void vfio_pci_vga_uninit(struct vfio_pci_device *vdev) static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - struct vfio_pci_device *vdev; + struct vfio_pci_core_device *vdev; struct iommu_group *group; int ret; @@ -1980,7 +1980,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) static void vfio_pci_remove(struct pci_dev *pdev) { - struct vfio_pci_device *vdev = dev_get_drvdata(&pdev->dev); + struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev); pci_disable_sriov(pdev); @@ -2004,14 +2004,14 @@ static void vfio_pci_remove(struct pci_dev *pdev) static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state) { - struct vfio_pci_device *vdev; + struct vfio_pci_core_device *vdev; struct vfio_device *device; device = vfio_device_get_from_dev(&pdev->dev); if (device == NULL) return PCI_ERS_RESULT_DISCONNECT; - vdev = container_of(device, struct vfio_pci_device, vdev); + vdev = container_of(device, struct vfio_pci_core_device, vdev); mutex_lock(&vdev->igate); @@ -2062,7 +2062,7 @@ static struct pci_driver vfio_pci_driver = { .err_handler = &vfio_err_handlers, }; -static bool vfio_dev_in_groups(struct vfio_pci_device *vdev, +static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev, struct vfio_pci_group_info *groups) { unsigned int i; @@ -2106,7 +2106,8 @@ vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set) * reset, so any pci_dev will have the same outcomes for * pci_probe_reset_*() and pci_reset_bus(). */ - pdev = list_first_entry(&dev_set->device_list, struct vfio_pci_device, + pdev = list_first_entry(&dev_set->device_list, + struct vfio_pci_core_device, vdev.dev_set_list)->pdev; /* pci_reset_bus() is supported */ @@ -2128,16 +2129,17 @@ vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set) static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, struct vfio_pci_group_info *groups) { - struct vfio_pci_device *cur_mem; - struct vfio_pci_device *cur_vma; - struct vfio_pci_device *cur; + struct vfio_pci_core_device *cur_mem; + struct vfio_pci_core_device *cur_vma; + struct vfio_pci_core_device *cur; struct pci_dev *pdev; bool is_mem = true; int ret; mutex_lock(&dev_set->lock); cur_mem = list_first_entry(&dev_set->device_list, - struct vfio_pci_device, vdev.dev_set_list); + struct vfio_pci_core_device, + vdev.dev_set_list); pdev = vfio_pci_dev_set_resettable(dev_set); if (!pdev) { @@ -2195,7 +2197,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set) { - struct vfio_pci_device *cur; + struct vfio_pci_core_device *cur; bool needs_reset = false; list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { @@ -2217,7 +2219,7 @@ static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set) */ static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set) { - struct vfio_pci_device *cur; + struct vfio_pci_core_device *cur; struct pci_dev *pdev; int ret; diff --git a/drivers/vfio/pci/vfio_pci_core.h b/drivers/vfio/pci/vfio_pci_core.h index 06440bf095a5..120526fb321f 100644 --- a/drivers/vfio/pci/vfio_pci_core.h +++ b/drivers/vfio/pci/vfio_pci_core.h @@ -33,7 +33,7 @@ struct vfio_pci_ioeventfd { struct list_head next; - struct vfio_pci_device *vdev; + struct vfio_pci_core_device *vdev; struct virqfd 
*virqfd; void __iomem *addr; uint64_t data; @@ -52,18 +52,18 @@ struct vfio_pci_irq_ctx { struct irq_bypass_producer producer; }; -struct vfio_pci_device; +struct vfio_pci_core_device; struct vfio_pci_region; struct vfio_pci_regops { - ssize_t (*rw)(struct vfio_pci_device *vdev, char __user *buf, + ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); - void (*release)(struct vfio_pci_device *vdev, + void (*release)(struct vfio_pci_core_device *vdev, struct vfio_pci_region *region); - int (*mmap)(struct vfio_pci_device *vdev, + int (*mmap)(struct vfio_pci_core_device *vdev, struct vfio_pci_region *region, struct vm_area_struct *vma); - int (*add_capability)(struct vfio_pci_device *vdev, + int (*add_capability)(struct vfio_pci_core_device *vdev, struct vfio_pci_region *region, struct vfio_info_cap *caps); }; @@ -94,7 +94,7 @@ struct vfio_pci_vf_token { int users; }; -struct vfio_pci_device { +struct vfio_pci_core_device { struct vfio_device vdev; struct pci_dev *pdev; void __iomem *barmap[PCI_STD_NUM_BARS]; @@ -144,61 +144,61 @@ struct vfio_pci_device { #define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev))) #define irq_is(vdev, type) (vdev->irq_type == type) -extern void vfio_pci_intx_mask(struct vfio_pci_device *vdev); -extern void vfio_pci_intx_unmask(struct vfio_pci_device *vdev); +extern void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); +extern void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); -extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, +extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, void *data); -extern ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, +extern ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); -extern ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, +extern ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); -extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, +extern ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); -extern long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, +extern long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, uint64_t data, int count, int fd); extern int vfio_pci_init_perm_bits(void); extern void vfio_pci_uninit_perm_bits(void); -extern int vfio_config_init(struct vfio_pci_device *vdev); -extern void vfio_config_free(struct vfio_pci_device *vdev); +extern int vfio_config_init(struct vfio_pci_core_device *vdev); +extern void vfio_config_free(struct vfio_pci_core_device *vdev); -extern int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, +extern int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, unsigned int type, unsigned int subtype, const struct vfio_pci_regops *ops, size_t size, u32 flags, void *data); -extern int vfio_pci_set_power_state(struct vfio_pci_device *vdev, +extern int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state); -extern bool __vfio_pci_memory_enabled(struct vfio_pci_device *vdev); -extern void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_device +extern bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); +extern void 
vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev); -extern u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_device *vdev); -extern void vfio_pci_memory_unlock_and_restore(struct vfio_pci_device *vdev, +extern u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev); +extern void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd); #ifdef CONFIG_VFIO_PCI_IGD -extern int vfio_pci_igd_init(struct vfio_pci_device *vdev); +extern int vfio_pci_igd_init(struct vfio_pci_core_device *vdev); #else -static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev) +static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) { return -ENODEV; } #endif #ifdef CONFIG_S390 -extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev, +extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, struct vfio_info_cap *caps); #else -static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev, +static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, struct vfio_info_cap *caps) { return -ENODEV; diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c index d57c409b4033..a324ca7e6b5a 100644 --- a/drivers/vfio/pci/vfio_pci_igd.c +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -25,8 +25,9 @@ #define OPREGION_RVDS 0x3c2 #define OPREGION_VERSION 0x16 -static ssize_t vfio_pci_igd_rw(struct vfio_pci_device *vdev, char __user *buf, - size_t count, loff_t *ppos, bool iswrite) +static ssize_t vfio_pci_igd_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, loff_t *ppos, + bool iswrite) { unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; void *base = vdev->region[i].data; @@ -45,7 +46,7 @@ static ssize_t vfio_pci_igd_rw(struct vfio_pci_device *vdev, char __user *buf, return count; } -static void vfio_pci_igd_release(struct vfio_pci_device *vdev, +static void vfio_pci_igd_release(struct vfio_pci_core_device *vdev, struct vfio_pci_region *region) { memunmap(region->data); @@ -56,7 +57,7 @@ static const struct vfio_pci_regops vfio_pci_igd_regops = { .release = vfio_pci_igd_release, }; -static int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev) +static int vfio_pci_igd_opregion_init(struct vfio_pci_core_device *vdev) { __le32 *dwordp = (__le32 *)(vdev->vconfig + OPREGION_PCI_ADDR); u32 addr, size; @@ -160,7 +161,7 @@ static int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev) return ret; } -static ssize_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, +static ssize_t vfio_pci_igd_cfg_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { @@ -253,7 +254,7 @@ static ssize_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, return count; } -static void vfio_pci_igd_cfg_release(struct vfio_pci_device *vdev, +static void vfio_pci_igd_cfg_release(struct vfio_pci_core_device *vdev, struct vfio_pci_region *region) { struct pci_dev *pdev = region->data; @@ -266,7 +267,7 @@ static const struct vfio_pci_regops vfio_pci_igd_cfg_regops = { .release = vfio_pci_igd_cfg_release, }; -static int vfio_pci_igd_cfg_init(struct vfio_pci_device *vdev) +static int vfio_pci_igd_cfg_init(struct vfio_pci_core_device *vdev) { struct pci_dev *host_bridge, *lpc_bridge; int ret; @@ -314,7 +315,7 @@ static int vfio_pci_igd_cfg_init(struct vfio_pci_device *vdev) return 0; } -int vfio_pci_igd_init(struct vfio_pci_device *vdev) +int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) { int ret; 
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index df1e8c8c274c..945ddbdf4d11 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -27,13 +27,13 @@ */ static void vfio_send_intx_eventfd(void *opaque, void *unused) { - struct vfio_pci_device *vdev = opaque; + struct vfio_pci_core_device *vdev = opaque; if (likely(is_intx(vdev) && !vdev->virq_disabled)) eventfd_signal(vdev->ctx[0].trigger, 1); } -void vfio_pci_intx_mask(struct vfio_pci_device *vdev) +void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; unsigned long flags; @@ -73,7 +73,7 @@ void vfio_pci_intx_mask(struct vfio_pci_device *vdev) */ static int vfio_pci_intx_unmask_handler(void *opaque, void *unused) { - struct vfio_pci_device *vdev = opaque; + struct vfio_pci_core_device *vdev = opaque; struct pci_dev *pdev = vdev->pdev; unsigned long flags; int ret = 0; @@ -107,7 +107,7 @@ static int vfio_pci_intx_unmask_handler(void *opaque, void *unused) return ret; } -void vfio_pci_intx_unmask(struct vfio_pci_device *vdev) +void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev) { if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0) vfio_send_intx_eventfd(vdev, NULL); @@ -115,7 +115,7 @@ void vfio_pci_intx_unmask(struct vfio_pci_device *vdev) static irqreturn_t vfio_intx_handler(int irq, void *dev_id) { - struct vfio_pci_device *vdev = dev_id; + struct vfio_pci_core_device *vdev = dev_id; unsigned long flags; int ret = IRQ_NONE; @@ -139,7 +139,7 @@ static irqreturn_t vfio_intx_handler(int irq, void *dev_id) return ret; } -static int vfio_intx_enable(struct vfio_pci_device *vdev) +static int vfio_intx_enable(struct vfio_pci_core_device *vdev) { if (!is_irq_none(vdev)) return -EINVAL; @@ -168,7 +168,7 @@ static int vfio_intx_enable(struct vfio_pci_device *vdev) return 0; } -static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd) +static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd) { struct pci_dev *pdev = vdev->pdev; unsigned long irqflags = IRQF_SHARED; @@ -223,7 +223,7 @@ static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd) return 0; } -static void vfio_intx_disable(struct vfio_pci_device *vdev) +static void vfio_intx_disable(struct vfio_pci_core_device *vdev) { vfio_virqfd_disable(&vdev->ctx[0].unmask); vfio_virqfd_disable(&vdev->ctx[0].mask); @@ -244,7 +244,7 @@ static irqreturn_t vfio_msihandler(int irq, void *arg) return IRQ_HANDLED; } -static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix) +static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec, bool msix) { struct pci_dev *pdev = vdev->pdev; unsigned int flag = msix ? 
PCI_IRQ_MSIX : PCI_IRQ_MSI; @@ -285,7 +285,7 @@ static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix) return 0; } -static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, +static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, int vector, int fd, bool msix) { struct pci_dev *pdev = vdev->pdev; @@ -364,7 +364,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, return 0; } -static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start, +static int vfio_msi_set_block(struct vfio_pci_core_device *vdev, unsigned start, unsigned count, int32_t *fds, bool msix) { int i, j, ret = 0; @@ -385,7 +385,7 @@ static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start, return ret; } -static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix) +static void vfio_msi_disable(struct vfio_pci_core_device *vdev, bool msix) { struct pci_dev *pdev = vdev->pdev; int i; @@ -417,7 +417,7 @@ static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix) /* * IOCTL support */ -static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev, +static int vfio_pci_set_intx_unmask(struct vfio_pci_core_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) { @@ -444,7 +444,7 @@ static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev, return 0; } -static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev, +static int vfio_pci_set_intx_mask(struct vfio_pci_core_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) { @@ -464,7 +464,7 @@ static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev, return 0; } -static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev, +static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) { @@ -507,7 +507,7 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev, return 0; } -static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev, +static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) { @@ -613,7 +613,7 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, return -EINVAL; } -static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, +static int vfio_pci_set_err_trigger(struct vfio_pci_core_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) { @@ -624,7 +624,7 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, count, flags, data); } -static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev, +static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) { @@ -635,11 +635,11 @@ static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev, count, flags, data); } -int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, +int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, void *data) { - int (*func)(struct vfio_pci_device *vdev, unsigned index, + int (*func)(struct vfio_pci_core_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) = NULL; diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 667e82726e75..8fff4689dd44 100644 --- 
a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -38,7 +38,7 @@ #define vfio_iowrite8 iowrite8 #define VFIO_IOWRITE(size) \ -static int vfio_pci_iowrite##size(struct vfio_pci_device *vdev, \ +static int vfio_pci_iowrite##size(struct vfio_pci_core_device *vdev, \ bool test_mem, u##size val, void __iomem *io) \ { \ if (test_mem) { \ @@ -65,7 +65,7 @@ VFIO_IOWRITE(64) #endif #define VFIO_IOREAD(size) \ -static int vfio_pci_ioread##size(struct vfio_pci_device *vdev, \ +static int vfio_pci_ioread##size(struct vfio_pci_core_device *vdev, \ bool test_mem, u##size *val, void __iomem *io) \ { \ if (test_mem) { \ @@ -94,7 +94,7 @@ VFIO_IOREAD(32) * reads with -1. This is intended for handling MSI-X vector tables and * leftover space for ROM BARs. */ -static ssize_t do_io_rw(struct vfio_pci_device *vdev, bool test_mem, +static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, void __iomem *io, char __user *buf, loff_t off, size_t count, size_t x_start, size_t x_end, bool iswrite) @@ -200,7 +200,7 @@ static ssize_t do_io_rw(struct vfio_pci_device *vdev, bool test_mem, return done; } -static int vfio_pci_setup_barmap(struct vfio_pci_device *vdev, int bar) +static int vfio_pci_setup_barmap(struct vfio_pci_core_device *vdev, int bar) { struct pci_dev *pdev = vdev->pdev; int ret; @@ -224,7 +224,7 @@ static int vfio_pci_setup_barmap(struct vfio_pci_device *vdev, int bar) return 0; } -ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, +ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { struct pci_dev *pdev = vdev->pdev; @@ -288,7 +288,7 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, return done; } -ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, +ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { int ret; @@ -384,7 +384,7 @@ static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd, static int vfio_pci_ioeventfd_handler(void *opaque, void *unused) { struct vfio_pci_ioeventfd *ioeventfd = opaque; - struct vfio_pci_device *vdev = ioeventfd->vdev; + struct vfio_pci_core_device *vdev = ioeventfd->vdev; if (ioeventfd->test_mem) { if (!down_read_trylock(&vdev->memory_lock)) @@ -410,7 +410,7 @@ static void vfio_pci_ioeventfd_thread(void *opaque, void *unused) vfio_pci_ioeventfd_do_write(ioeventfd, ioeventfd->test_mem); } -long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, +long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, uint64_t data, int count, int fd) { struct pci_dev *pdev = vdev->pdev; diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c index 51e4092984c2..d9a669f41d90 100644 --- a/drivers/vfio/pci/vfio_pci_zdev.c +++ b/drivers/vfio/pci/vfio_pci_zdev.c @@ -109,7 +109,7 @@ static int zpci_pfip_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps) /* * Add all supported capabilities to the VFIO_DEVICE_GET_INFO capability chain. 
*/ -int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev, +int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, struct vfio_info_cap *caps) { struct zpci_dev *zdev = to_zpci(vdev->pdev); -- Gitee From e9694bb5501612ffd5eb23cf0f34848e8d471c13 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 26 Aug 2021 13:39:03 +0300 Subject: [PATCH 1680/2161] vfio/pci: Rename ops functions to fit core namings commit bf9fdc9a74cf61fe9f646c43eac4823481f1e20a upstream. This is another preparation patch for separating the vfio_pci driver to a subsystem driver and a generic pci driver. This patch doesn't change any logic. Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20210826103912.128972-5-yishaih@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_core.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 619425eb0e47..7f6707cb8c58 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -533,7 +533,7 @@ static void vfio_pci_vf_token_user_add(struct vfio_pci_core_device *vdev, int va vfio_device_put(&pf_vdev->vdev); } -static void vfio_pci_close_device(struct vfio_device *core_vdev) +static void vfio_pci_core_close_device(struct vfio_device *core_vdev) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); @@ -554,7 +554,7 @@ static void vfio_pci_close_device(struct vfio_device *core_vdev) mutex_unlock(&vdev->igate); } -static int vfio_pci_open_device(struct vfio_device *core_vdev) +static int vfio_pci_core_open_device(struct vfio_device *core_vdev) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); @@ -727,7 +727,7 @@ int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, return 0; } -static long vfio_pci_ioctl(struct vfio_device *core_vdev, +static long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { struct vfio_pci_core_device *vdev = @@ -1302,7 +1302,7 @@ static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf, return -EINVAL; } -static ssize_t vfio_pci_read(struct vfio_device *core_vdev, char __user *buf, +static ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos) { struct vfio_pci_core_device *vdev = @@ -1314,7 +1314,7 @@ static ssize_t vfio_pci_read(struct vfio_device *core_vdev, char __user *buf, return vfio_pci_rw(vdev, buf, count, ppos, false); } -static ssize_t vfio_pci_write(struct vfio_device *core_vdev, const char __user *buf, +static ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos) { struct vfio_pci_core_device *vdev = @@ -1535,7 +1535,7 @@ static const struct vm_operations_struct vfio_pci_mmap_ops = { .fault = vfio_pci_mmap_fault, }; -static int vfio_pci_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) +static int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); @@ -1606,7 +1606,7 @@ static int vfio_pci_mmap(struct vfio_device *core_vdev, struct vm_area_struct *v return 0; } -static void vfio_pci_request(struct vfio_device *core_vdev, unsigned int count) 
+static void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); @@ -1722,7 +1722,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, #define VF_TOKEN_ARG "vf_token=" -static int vfio_pci_match(struct vfio_device *core_vdev, char *buf) +static int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); @@ -1774,14 +1774,14 @@ static int vfio_pci_match(struct vfio_device *core_vdev, char *buf) static const struct vfio_device_ops vfio_pci_ops = { .name = "vfio-pci", - .open_device = vfio_pci_open_device, - .close_device = vfio_pci_close_device, - .ioctl = vfio_pci_ioctl, - .read = vfio_pci_read, - .write = vfio_pci_write, - .mmap = vfio_pci_mmap, - .request = vfio_pci_request, - .match = vfio_pci_match, + .open_device = vfio_pci_core_open_device, + .close_device = vfio_pci_core_close_device, + .ioctl = vfio_pci_core_ioctl, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, }; static int vfio_pci_bus_notifier(struct notifier_block *nb, -- Gitee From c5d5a41c0b4b3e399faa2f95e45ba9fcc64364df Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 26 Aug 2021 13:39:04 +0300 Subject: [PATCH 1681/2161] vfio/pci: Include vfio header in vfio_pci_core.h commit c39f8fa76cdd0c96f82fa785a0d6c92afe8f4a77 upstream. The vfio_device structure is embedded into the vfio_pci_core_device structure, so there is no reason for not including the header file in the vfio_pci_core header as well. Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20210826103912.128972-6-yishaih@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_core.c | 1 - drivers/vfio/pci/vfio_pci_core.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 7f6707cb8c58..34f939acd4d9 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/vfio/pci/vfio_pci_core.h b/drivers/vfio/pci/vfio_pci_core.h index 120526fb321f..2162066ac0a8 100644 --- a/drivers/vfio/pci/vfio_pci_core.h +++ b/drivers/vfio/pci/vfio_pci_core.h @@ -10,6 +10,7 @@ #include #include +#include #include #include #include -- Gitee From 107efc0e1c7c1a26c33d1dfcbcefb14b11b91d24 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 26 Aug 2021 13:39:05 +0300 Subject: [PATCH 1682/2161] vfio/pci: Split the pci_driver code out of vfio_pci_core.c commit ff53edf6d6ab0970f86595b3f5d179df51848723 upstream. Split the vfio_pci driver into two logical parts, the 'struct pci_driver' (vfio_pci.c) which implements "Generic VFIO support for any PCI device" and a library of code (vfio_pci_core.c) that helps implementing a struct vfio_device on top of a PCI device. vfio_pci.ko continues to present the same interface under sysfs and this change should have no functional impact. Following patches will turn vfio_pci and vfio_pci_core into a separate module. 
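To make the intent of the split concrete, here is a minimal sketch of the kind of vendor-specific driver that this change, together with the follow-up patches below, is meant to enable. All my_vendor_* names are hypothetical; only the vfio_pci_core_* helpers and the vfio_device_ops callbacks come from this series, and a real driver would supply its own id table, setup hooks and module init/exit (mirroring the vfio_pci.c added below).

/* Illustrative sketch only -- not part of this patch. */
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>

#include "vfio_pci_core.h"

static int my_vendor_open_device(struct vfio_device *core_vdev)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	/*
	 * Vendor-specific setup (extra regions, quirks, ...) would go here;
	 * on failure, undo with vfio_pci_core_disable(vdev) and return.
	 */

	vfio_pci_core_finish_enable(vdev);
	return 0;
}

static const struct vfio_device_ops my_vendor_vfio_pci_ops = {
	.name		= "my-vendor-vfio-pci",
	.open_device	= my_vendor_open_device,
	.close_device	= vfio_pci_core_close_device,
	.ioctl		= vfio_pci_core_ioctl,
	.read		= vfio_pci_core_read,
	.write		= vfio_pci_core_write,
	.mmap		= vfio_pci_core_mmap,
	.request	= vfio_pci_core_request,
	.match		= vfio_pci_core_match,
};

static int my_vendor_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct vfio_pci_core_device *vdev;
	int ret;

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev)
		return -ENOMEM;

	vfio_pci_core_init_device(vdev, pdev, &my_vendor_vfio_pci_ops);

	ret = vfio_pci_core_register_device(vdev);
	if (ret) {
		vfio_pci_core_uninit_device(vdev);
		kfree(vdev);
	}
	return ret;
}

static void my_vendor_remove(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);

	vfio_pci_core_unregister_device(vdev);
	vfio_pci_core_uninit_device(vdev);
	kfree(vdev);
}

static struct pci_driver my_vendor_vfio_pci_driver = {
	.name		= "my-vendor-vfio-pci",
	.id_table	= NULL,	/* a real driver would list the devices it supports */
	.probe		= my_vendor_probe,
	.remove		= my_vendor_remove,
	.err_handler	= &vfio_pci_core_err_handlers,
};
/*
 * Module init/exit would mirror the vfio_pci.c added below: call
 * vfio_pci_core_init() before pci_register_driver(), and
 * vfio_pci_core_cleanup() after pci_unregister_driver().
 */
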
This is a preparation for allowing another module to provide the pci_driver and allow that module to customize how VFIO is setup, inject its own operations, and easily extend vendor specific functionality. At this point the vfio_pci_core still contains a lot of vfio_pci functionality mixed into it. Following patches will move more of the large scale items out, but another cleanup series will be needed to get everything. Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20210826103912.128972-7-yishaih@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/Makefile | 2 +- drivers/vfio/pci/vfio_pci.c | 223 +++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_core.c | 271 +++++++------------------------ drivers/vfio/pci/vfio_pci_core.h | 23 +++ 4 files changed, 304 insertions(+), 215 deletions(-) create mode 100644 drivers/vfio/pci/vfio_pci.c diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 66a40488e967..8aa517b4b671 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -vfio-pci-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o +vfio-pci-y := vfio_pci.o vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o vfio-pci-$(CONFIG_S390) += vfio_pci_zdev.o diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c new file mode 100644 index 000000000000..4e31bd3001ad --- /dev/null +++ b/drivers/vfio/pci/vfio_pci.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved + * + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson + * + * Derived from original vfio: + * Copyright 2010 Cisco Systems, Inc. All rights reserved. + * Author: Tom Lyon, pugs@cisco.com + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vfio_pci_core.h" + +#define DRIVER_AUTHOR "Alex Williamson " +#define DRIVER_DESC "VFIO PCI - User Level meta-driver" + +static char ids[1024] __initdata; +module_param_string(ids, ids, sizeof(ids), 0); +MODULE_PARM_DESC(ids, "Initial PCI IDs to add to the vfio driver, format is \"vendor:device[:subvendor[:subdevice[:class[:class_mask]]]]\" and multiple comma separated entries can be specified"); + +static bool enable_sriov; +#ifdef CONFIG_PCI_IOV +module_param(enable_sriov, bool, 0644); +MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV configuration. Enabling SR-IOV on a PF typically requires support of the userspace PF driver, enabling VFs without such support may result in non-functional VFs or PF."); +#endif + +static bool disable_denylist; +module_param(disable_denylist, bool, 0444); +MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. 
Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users."); + +static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev) +{ + switch (pdev->vendor) { + case PCI_VENDOR_ID_INTEL: + switch (pdev->device) { + case PCI_DEVICE_ID_INTEL_QAT_C3XXX: + case PCI_DEVICE_ID_INTEL_QAT_C3XXX_VF: + case PCI_DEVICE_ID_INTEL_QAT_C62X: + case PCI_DEVICE_ID_INTEL_QAT_C62X_VF: + case PCI_DEVICE_ID_INTEL_QAT_DH895XCC: + case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF: + return true; + default: + return false; + } + } + + return false; +} + +static bool vfio_pci_is_denylisted(struct pci_dev *pdev) +{ + if (!vfio_pci_dev_in_denylist(pdev)) + return false; + + if (disable_denylist) { + pci_warn(pdev, + "device denylist disabled - allowing device %04x:%04x.\n", + pdev->vendor, pdev->device); + return false; + } + + pci_warn(pdev, "%04x:%04x exists in vfio-pci device denylist, driver probing disallowed.\n", + pdev->vendor, pdev->device); + + return true; +} + +static const struct vfio_device_ops vfio_pci_ops = { + .name = "vfio-pci", + .open_device = vfio_pci_core_open_device, + .close_device = vfio_pci_core_close_device, + .ioctl = vfio_pci_core_ioctl, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, +}; + +static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct vfio_pci_core_device *vdev; + int ret; + + if (vfio_pci_is_denylisted(pdev)) + return -EINVAL; + + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); + if (!vdev) + return -ENOMEM; + vfio_pci_core_init_device(vdev, pdev, &vfio_pci_ops); + + ret = vfio_pci_core_register_device(vdev); + if (ret) + goto out_free; + return 0; + +out_free: + vfio_pci_core_uninit_device(vdev); + kfree(vdev); + return ret; +} + +static void vfio_pci_remove(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev); + + vfio_pci_core_unregister_device(vdev); + vfio_pci_core_uninit_device(vdev); + kfree(vdev); +} + +static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) +{ + if (!enable_sriov) + return -ENOENT; + + return vfio_pci_core_sriov_configure(pdev, nr_virtfn); +} + +static struct pci_driver vfio_pci_driver = { + .name = "vfio-pci", + .id_table = NULL, /* only dynamic ids */ + .probe = vfio_pci_probe, + .remove = vfio_pci_remove, + .sriov_configure = vfio_pci_sriov_configure, + .err_handler = &vfio_pci_core_err_handlers, +}; + +static void __init vfio_pci_fill_ids(void) +{ + char *p, *id; + int rc; + + /* no ids passed actually */ + if (ids[0] == '\0') + return; + + /* add ids specified in the module parameter */ + p = ids; + while ((id = strsep(&p, ","))) { + unsigned int vendor, device, subvendor = PCI_ANY_ID, + subdevice = PCI_ANY_ID, class = 0, class_mask = 0; + int fields; + + if (!strlen(id)) + continue; + + fields = sscanf(id, "%x:%x:%x:%x:%x:%x", + &vendor, &device, &subvendor, &subdevice, + &class, &class_mask); + + if (fields < 2) { + pr_warn("invalid id string \"%s\"\n", id); + continue; + } + + rc = pci_add_dynid(&vfio_pci_driver, vendor, device, + subvendor, subdevice, class, class_mask, 0); + if (rc) + pr_warn("failed to add dynamic id [%04x:%04x[%04x:%04x]] class %#08x/%08x (%d)\n", + vendor, device, subvendor, subdevice, + class, class_mask, rc); + else + pr_info("add [%04x:%04x[%04x:%04x]] class %#08x/%08x\n", + vendor, device, subvendor, subdevice, + class, class_mask); + } +} 
+ +static int __init vfio_pci_init(void) +{ + int ret; + + ret = vfio_pci_core_init(); + if (ret) + return ret; + + /* Register and scan for devices */ + ret = pci_register_driver(&vfio_pci_driver); + if (ret) + goto out; + + vfio_pci_fill_ids(); + + if (disable_denylist) + pr_warn("device denylist disabled.\n"); + + return 0; + +out: + vfio_pci_core_cleanup(); + return ret; +} +module_init(vfio_pci_init); + +static void __exit vfio_pci_cleanup(void) +{ + pci_unregister_driver(&vfio_pci_driver); + vfio_pci_core_cleanup(); +} +module_exit(vfio_pci_cleanup); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 34f939acd4d9..efaab3363e2b 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -8,8 +8,6 @@ * Author: Tom Lyon, pugs@cisco.com */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include #include #include @@ -29,14 +27,6 @@ #include "vfio_pci_core.h" -#define DRIVER_VERSION "0.2" -#define DRIVER_AUTHOR "Alex Williamson " -#define DRIVER_DESC "VFIO PCI - User Level meta-driver" - -static char ids[1024] __initdata; -module_param_string(ids, ids, sizeof(ids), 0); -MODULE_PARM_DESC(ids, "Initial PCI IDs to add to the vfio driver, format is \"vendor:device[:subvendor[:subdevice[:class[:class_mask]]]]\" and multiple comma separated entries can be specified"); - static bool nointxmask; module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(nointxmask, @@ -53,16 +43,6 @@ module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(disable_idle_d3, "Disable using the PCI D3 low power state for idle, unused devices"); -static bool enable_sriov; -#ifdef CONFIG_PCI_IOV -module_param(enable_sriov, bool, 0644); -MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV configuration. Enabling SR-IOV on a PF typically requires support of the userspace PF driver, enabling VFs without such support may result in non-functional VFs or PF."); -#endif - -static bool disable_denylist; -module_param(disable_denylist, bool, 0444); -MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users."); - static inline bool vfio_vga_disabled(void) { #ifdef CONFIG_VFIO_PCI_VGA @@ -72,44 +52,6 @@ static inline bool vfio_vga_disabled(void) #endif } -static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev) -{ - switch (pdev->vendor) { - case PCI_VENDOR_ID_INTEL: - switch (pdev->device) { - case PCI_DEVICE_ID_INTEL_QAT_C3XXX: - case PCI_DEVICE_ID_INTEL_QAT_C3XXX_VF: - case PCI_DEVICE_ID_INTEL_QAT_C62X: - case PCI_DEVICE_ID_INTEL_QAT_C62X_VF: - case PCI_DEVICE_ID_INTEL_QAT_DH895XCC: - case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF: - return true; - default: - return false; - } - } - - return false; -} - -static bool vfio_pci_is_denylisted(struct pci_dev *pdev) -{ - if (!vfio_pci_dev_in_denylist(pdev)) - return false; - - if (disable_denylist) { - pci_warn(pdev, - "device denylist disabled - allowing device %04x:%04x.\n", - pdev->vendor, pdev->device); - return false; - } - - pci_warn(pdev, "%04x:%04x exists in vfio-pci device denylist, driver probing disallowed.\n", - pdev->vendor, pdev->device); - - return true; -} - /* * Our VGA arbiter participation is limited since we don't know anything * about the device itself. 
However, if the device is the only VGA device @@ -495,8 +437,6 @@ static void vfio_pci_disable(struct vfio_pci_core_device *vdev) vfio_pci_set_power_state(vdev, PCI_D3hot); } -static struct pci_driver vfio_pci_driver; - static struct vfio_pci_core_device *get_pf_vdev(struct vfio_pci_core_device *vdev) { struct pci_dev *physfn = pci_physfn(vdev->pdev); @@ -509,7 +449,7 @@ static struct vfio_pci_core_device *get_pf_vdev(struct vfio_pci_core_device *vde if (!pf_dev) return NULL; - if (pci_dev_driver(physfn) != &vfio_pci_driver) { + if (pci_dev_driver(physfn) != pci_dev_driver(vdev->pdev)) { vfio_device_put(pf_dev); return NULL; } @@ -532,7 +472,7 @@ static void vfio_pci_vf_token_user_add(struct vfio_pci_core_device *vdev, int va vfio_device_put(&pf_vdev->vdev); } -static void vfio_pci_core_close_device(struct vfio_device *core_vdev) +void vfio_pci_core_close_device(struct vfio_device *core_vdev) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); @@ -553,7 +493,7 @@ static void vfio_pci_core_close_device(struct vfio_device *core_vdev) mutex_unlock(&vdev->igate); } -static int vfio_pci_core_open_device(struct vfio_device *core_vdev) +int vfio_pci_core_open_device(struct vfio_device *core_vdev) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); @@ -726,8 +666,8 @@ int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, return 0; } -static long vfio_pci_core_ioctl(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg) +long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, + unsigned long arg) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); @@ -1301,8 +1241,8 @@ static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf, return -EINVAL; } -static ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, - size_t count, loff_t *ppos) +ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, + size_t count, loff_t *ppos) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); @@ -1313,8 +1253,8 @@ static ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *bu return vfio_pci_rw(vdev, buf, count, ppos, false); } -static ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, - size_t count, loff_t *ppos) +ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, + size_t count, loff_t *ppos) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); @@ -1534,7 +1474,7 @@ static const struct vm_operations_struct vfio_pci_mmap_ops = { .fault = vfio_pci_mmap_fault, }; -static int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) +int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); @@ -1605,7 +1545,7 @@ static int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_stru return 0; } -static void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count) +void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); @@ -1721,7 +1661,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, #define 
VF_TOKEN_ARG "vf_token=" -static int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf) +int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); @@ -1771,18 +1711,6 @@ static int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf) return 1; /* Match */ } -static const struct vfio_device_ops vfio_pci_ops = { - .name = "vfio-pci", - .open_device = vfio_pci_core_open_device, - .close_device = vfio_pci_core_close_device, - .ioctl = vfio_pci_core_ioctl, - .read = vfio_pci_core_read, - .write = vfio_pci_core_write, - .mmap = vfio_pci_core_mmap, - .request = vfio_pci_core_request, - .match = vfio_pci_core_match, -}; - static int vfio_pci_bus_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -1797,15 +1725,16 @@ static int vfio_pci_bus_notifier(struct notifier_block *nb, pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n", pci_name(pdev)); pdev->driver_override = kasprintf(GFP_KERNEL, "%s", - vfio_pci_ops.name); + vdev->vdev.ops->name); } else if (action == BUS_NOTIFY_BOUND_DRIVER && pdev->is_virtfn && physfn == vdev->pdev) { struct pci_driver *drv = pci_dev_driver(pdev); - if (drv && drv != &vfio_pci_driver) + if (drv && drv != pci_dev_driver(vdev->pdev)) pci_warn(vdev->pdev, - "VF %s bound to driver %s while PF bound to vfio-pci\n", - pci_name(pdev), drv->name); + "VF %s bound to driver %s while PF bound to driver %s\n", + pci_name(pdev), drv->name, + pci_dev_driver(vdev->pdev)->name); } return 0; @@ -1873,15 +1802,39 @@ static void vfio_pci_vga_uninit(struct vfio_pci_core_device *vdev) VGA_RSRC_LEGACY_MEM); } -static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, + struct pci_dev *pdev, + const struct vfio_device_ops *vfio_pci_ops) { - struct vfio_pci_core_device *vdev; + vfio_init_group_dev(&vdev->vdev, &pdev->dev, vfio_pci_ops); + vdev->pdev = pdev; + vdev->irq_type = VFIO_PCI_NUM_IRQS; + mutex_init(&vdev->igate); + spin_lock_init(&vdev->irqlock); + mutex_init(&vdev->ioeventfds_lock); + INIT_LIST_HEAD(&vdev->dummy_resources_list); + INIT_LIST_HEAD(&vdev->ioeventfds_list); + mutex_init(&vdev->vma_lock); + INIT_LIST_HEAD(&vdev->vma_list); + init_rwsem(&vdev->memory_lock); +} + +void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev) +{ + mutex_destroy(&vdev->igate); + mutex_destroy(&vdev->ioeventfds_lock); + mutex_destroy(&vdev->vma_lock); + vfio_uninit_group_dev(&vdev->vdev); + kfree(vdev->region); + kfree(vdev->pm_save); +} + +int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; struct iommu_group *group; int ret; - if (vfio_pci_is_denylisted(pdev)) - return -EINVAL; - if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) return -EINVAL; @@ -1902,24 +1855,6 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!group) return -EINVAL; - vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); - if (!vdev) { - ret = -ENOMEM; - goto out_group_put; - } - - vfio_init_group_dev(&vdev->vdev, &pdev->dev, &vfio_pci_ops); - vdev->pdev = pdev; - vdev->irq_type = VFIO_PCI_NUM_IRQS; - mutex_init(&vdev->igate); - spin_lock_init(&vdev->irqlock); - mutex_init(&vdev->ioeventfds_lock); - INIT_LIST_HEAD(&vdev->dummy_resources_list); - INIT_LIST_HEAD(&vdev->ioeventfds_list); - mutex_init(&vdev->vma_lock); - INIT_LIST_HEAD(&vdev->vma_list); - init_rwsem(&vdev->memory_lock); - if 
(pci_is_root_bus(pdev->bus)) { ret = vfio_assign_device_set(&vdev->vdev, vdev); } else if (!pci_probe_reset_slot(pdev->slot)) { @@ -1933,10 +1868,10 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) } if (ret) - goto out_uninit; + goto out_group_put; ret = vfio_pci_vf_init(vdev); if (ret) - goto out_uninit; + goto out_group_put; ret = vfio_pci_vga_init(vdev); if (ret) goto out_vf; @@ -1968,36 +1903,26 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) vfio_pci_set_power_state(vdev, PCI_D0); out_vf: vfio_pci_vf_uninit(vdev); -out_uninit: - vfio_uninit_group_dev(&vdev->vdev); - kfree(vdev->pm_save); - kfree(vdev); out_group_put: vfio_iommu_group_put(group, &pdev->dev); return ret; } -static void vfio_pci_remove(struct pci_dev *pdev) +void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) { - struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev); + struct pci_dev *pdev = vdev->pdev; pci_disable_sriov(pdev); vfio_unregister_group_dev(&vdev->vdev); vfio_pci_vf_uninit(vdev); - vfio_uninit_group_dev(&vdev->vdev); vfio_pci_vga_uninit(vdev); vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); if (!disable_idle_d3) vfio_pci_set_power_state(vdev, PCI_D0); - - mutex_destroy(&vdev->ioeventfds_lock); - kfree(vdev->region); - kfree(vdev->pm_save); - kfree(vdev); } static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, @@ -2024,16 +1949,11 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, return PCI_ERS_RESULT_CAN_RECOVER; } -static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) +int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn) { struct vfio_device *device; int ret = 0; - might_sleep(); - - if (!enable_sriov) - return -ENOENT; - device = vfio_device_get_from_dev(&pdev->dev); if (!device) return -ENODEV; @@ -2048,19 +1968,10 @@ static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) return ret < 0 ? 
ret : nr_virtfn; } -static const struct pci_error_handlers vfio_err_handlers = { +const struct pci_error_handlers vfio_pci_core_err_handlers = { .error_detected = vfio_pci_aer_err_detected, }; -static struct pci_driver vfio_pci_driver = { - .name = "vfio-pci", - .id_table = NULL, /* only dynamic ids */ - .probe = vfio_pci_probe, - .remove = vfio_pci_remove, - .sriov_configure = vfio_pci_sriov_configure, - .err_handler = &vfio_err_handlers, -}; - static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev, struct vfio_pci_group_info *groups) { @@ -2241,83 +2152,15 @@ static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set) return true; } -static void __exit vfio_pci_cleanup(void) +/* This will become the __exit function of vfio_pci_core.ko */ +void vfio_pci_core_cleanup(void) { - pci_unregister_driver(&vfio_pci_driver); vfio_pci_uninit_perm_bits(); } -static void __init vfio_pci_fill_ids(void) +/* This will become the __init function of vfio_pci_core.ko */ +int __init vfio_pci_core_init(void) { - char *p, *id; - int rc; - - /* no ids passed actually */ - if (ids[0] == '\0') - return; - - /* add ids specified in the module parameter */ - p = ids; - while ((id = strsep(&p, ","))) { - unsigned int vendor, device, subvendor = PCI_ANY_ID, - subdevice = PCI_ANY_ID, class = 0, class_mask = 0; - int fields; - - if (!strlen(id)) - continue; - - fields = sscanf(id, "%x:%x:%x:%x:%x:%x", - &vendor, &device, &subvendor, &subdevice, - &class, &class_mask); - - if (fields < 2) { - pr_warn("invalid id string \"%s\"\n", id); - continue; - } - - rc = pci_add_dynid(&vfio_pci_driver, vendor, device, - subvendor, subdevice, class, class_mask, 0); - if (rc) - pr_warn("failed to add dynamic id [%04x:%04x[%04x:%04x]] class %#08x/%08x (%d)\n", - vendor, device, subvendor, subdevice, - class, class_mask, rc); - else - pr_info("add [%04x:%04x[%04x:%04x]] class %#08x/%08x\n", - vendor, device, subvendor, subdevice, - class, class_mask); - } -} - -static int __init vfio_pci_init(void) -{ - int ret; - /* Allocate shared config space permission data used by all devices */ - ret = vfio_pci_init_perm_bits(); - if (ret) - return ret; - - /* Register and scan for devices */ - ret = pci_register_driver(&vfio_pci_driver); - if (ret) - goto out_driver; - - vfio_pci_fill_ids(); - - if (disable_denylist) - pr_warn("device denylist disabled.\n"); - - return 0; - -out_driver: - vfio_pci_uninit_perm_bits(); - return ret; + return vfio_pci_init_perm_bits(); } - -module_init(vfio_pci_init); -module_exit(vfio_pci_cleanup); - -MODULE_VERSION(DRIVER_VERSION); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR(DRIVER_AUTHOR); -MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/vfio/pci/vfio_pci_core.h b/drivers/vfio/pci/vfio_pci_core.h index 2162066ac0a8..ab6244c67bbf 100644 --- a/drivers/vfio/pci/vfio_pci_core.h +++ b/drivers/vfio/pci/vfio_pci_core.h @@ -206,4 +206,27 @@ static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, } #endif +/* Will be exported for vfio pci drivers usage */ +void vfio_pci_core_cleanup(void); +int vfio_pci_core_init(void); +void vfio_pci_core_close_device(struct vfio_device *core_vdev); +int vfio_pci_core_open_device(struct vfio_device *core_vdev); +void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, + struct pci_dev *pdev, + const struct vfio_device_ops *vfio_pci_ops); +int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev); +void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev); +void vfio_pci_core_unregister_device(struct 
vfio_pci_core_device *vdev); +int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn); +extern const struct pci_error_handlers vfio_pci_core_err_handlers; +long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, + unsigned long arg); +ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, + size_t count, loff_t *ppos); +ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, + size_t count, loff_t *ppos); +int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma); +void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count); +int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf); + #endif /* VFIO_PCI_CORE_H */ -- Gitee From 8ee5771211f24f9a90be282d3be5a5eb57091a86 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 26 Aug 2021 13:39:06 +0300 Subject: [PATCH 1683/2161] vfio/pci: Move igd initialization to vfio_pci.c commit 2fb89f56a624fd74e6e15154f3e9fdceca98b784 upstream. igd is related to the vfio_pci pci_driver implementation, move it out of vfio_pci_core.c. This is preparation for splitting vfio_pci.ko into 2 drivers. Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20210826103912.128972-8-yishaih@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 29 +++++++++++++++++++++++- drivers/vfio/pci/vfio_pci_core.c | 39 ++++---------------------------- drivers/vfio/pci/vfio_pci_core.h | 9 +++++++- 3 files changed, 41 insertions(+), 36 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 4e31bd3001ad..2729b777a56d 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -82,9 +82,36 @@ static bool vfio_pci_is_denylisted(struct pci_dev *pdev) return true; } +static int vfio_pci_open_device(struct vfio_device *core_vdev) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + struct pci_dev *pdev = vdev->pdev; + int ret; + + ret = vfio_pci_core_enable(vdev); + if (ret) + return ret; + + if (vfio_pci_is_vga(pdev) && + pdev->vendor == PCI_VENDOR_ID_INTEL && + IS_ENABLED(CONFIG_VFIO_PCI_IGD)) { + ret = vfio_pci_igd_init(vdev); + if (ret && ret != -ENODEV) { + pci_warn(pdev, "Failed to setup Intel IGD regions\n"); + vfio_pci_core_disable(vdev); + return ret; + } + } + + vfio_pci_core_finish_enable(vdev); + + return 0; +} + static const struct vfio_device_ops vfio_pci_ops = { .name = "vfio-pci", - .open_device = vfio_pci_core_open_device, + .open_device = vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, .read = vfio_pci_core_read, diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index efaab3363e2b..d7128519a602 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -91,11 +91,6 @@ static unsigned int vfio_pci_set_vga_decode(void *opaque, bool single_vga) return decodes; } -static inline bool vfio_pci_is_vga(struct pci_dev *pdev) -{ - return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; -} - static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev) { struct resource *res; @@ -166,7 +161,6 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev) struct vfio_pci_group_info; static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set); -static void vfio_pci_disable(struct 
vfio_pci_core_device *vdev); static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, struct vfio_pci_group_info *groups); @@ -252,7 +246,7 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat return ret; } -static int vfio_pci_enable(struct vfio_pci_core_device *vdev) +int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; int ret; @@ -321,26 +315,11 @@ static int vfio_pci_enable(struct vfio_pci_core_device *vdev) if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) vdev->has_vga = true; - if (vfio_pci_is_vga(pdev) && - pdev->vendor == PCI_VENDOR_ID_INTEL && - IS_ENABLED(CONFIG_VFIO_PCI_IGD)) { - ret = vfio_pci_igd_init(vdev); - if (ret && ret != -ENODEV) { - pci_warn(pdev, "Failed to setup Intel IGD regions\n"); - goto disable_exit; - } - } - - vfio_pci_probe_mmaps(vdev); return 0; - -disable_exit: - vfio_pci_disable(vdev); - return ret; } -static void vfio_pci_disable(struct vfio_pci_core_device *vdev) +void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; struct vfio_pci_dummy_resource *dummy_res, *tmp; @@ -479,7 +458,7 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) vfio_pci_vf_token_user_add(vdev, -1); vfio_spapr_pci_eeh_release(vdev->pdev); - vfio_pci_disable(vdev); + vfio_pci_core_disable(vdev); mutex_lock(&vdev->igate); if (vdev->err_trigger) { @@ -493,19 +472,11 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) mutex_unlock(&vdev->igate); } -int vfio_pci_core_open_device(struct vfio_device *core_vdev) +void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev) { - struct vfio_pci_core_device *vdev = - container_of(core_vdev, struct vfio_pci_core_device, vdev); - int ret = 0; - - ret = vfio_pci_enable(vdev); - if (ret) - return ret; - + vfio_pci_probe_mmaps(vdev); vfio_spapr_pci_eeh_open(vdev->pdev); vfio_pci_vf_token_user_add(vdev, 1); - return 0; } static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type) diff --git a/drivers/vfio/pci/vfio_pci_core.h b/drivers/vfio/pci/vfio_pci_core.h index ab6244c67bbf..8dd5b8286138 100644 --- a/drivers/vfio/pci/vfio_pci_core.h +++ b/drivers/vfio/pci/vfio_pci_core.h @@ -210,7 +210,6 @@ static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, void vfio_pci_core_cleanup(void); int vfio_pci_core_init(void); void vfio_pci_core_close_device(struct vfio_device *core_vdev); -int vfio_pci_core_open_device(struct vfio_device *core_vdev); void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, struct pci_dev *pdev, const struct vfio_device_ops *vfio_pci_ops); @@ -228,5 +227,13 @@ ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *bu int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma); void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count); int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf); +int vfio_pci_core_enable(struct vfio_pci_core_device *vdev); +void vfio_pci_core_disable(struct vfio_pci_core_device *vdev); +void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev); + +static inline bool vfio_pci_is_vga(struct pci_dev *pdev) +{ + return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; +} #endif /* VFIO_PCI_CORE_H */ -- Gitee From 4bf60e48f83744ea70d4ca345b416b44d24c00e3 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 26 Aug 2021 13:39:07 +0300 Subject: [PATCH 1684/2161] vfio/pci: Move module parameters to 
vfio_pci.c commit c61302aa48f7c46b5c9d893109488af951be12e4 upstream. This is a preparation before splitting vfio_pci.ko to 2 modules. As module parameters are a kind of uAPI they need to stay on vfio_pci.ko to avoid a user visible impact. For now continue to keep the implementation of these options in vfio_pci_core.c. Arguably they are vfio_pci functionality, but further splitting of vfio_pci_core.c will be better done in another series Signed-off-by: Yishai Hadas Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20210826103912.128972-9-yishaih@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 23 +++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_core.c | 20 ++++++++------------ drivers/vfio/pci/vfio_pci_core.h | 2 ++ 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 2729b777a56d..163e560c4495 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -34,6 +34,22 @@ static char ids[1024] __initdata; module_param_string(ids, ids, sizeof(ids), 0); MODULE_PARM_DESC(ids, "Initial PCI IDs to add to the vfio driver, format is \"vendor:device[:subvendor[:subdevice[:class[:class_mask]]]]\" and multiple comma separated entries can be specified"); +static bool nointxmask; +module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(nointxmask, + "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag."); + +#ifdef CONFIG_VFIO_PCI_VGA +static bool disable_vga; +module_param(disable_vga, bool, S_IRUGO); +MODULE_PARM_DESC(disable_vga, "Disable VGA resource access through vfio-pci"); +#endif + +static bool disable_idle_d3; +module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(disable_idle_d3, + "Disable using the PCI D3 low power state for idle, unused devices"); + static bool enable_sriov; #ifdef CONFIG_PCI_IOV module_param(enable_sriov, bool, 0644); @@ -215,6 +231,13 @@ static void __init vfio_pci_fill_ids(void) static int __init vfio_pci_init(void) { int ret; + bool is_disable_vga = true; + +#ifdef CONFIG_VFIO_PCI_VGA + is_disable_vga = disable_vga; +#endif + + vfio_pci_core_set_params(nointxmask, is_disable_vga, disable_idle_d3); ret = vfio_pci_core_init(); if (ret) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index d7128519a602..c428a6a1d37c 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -28,20 +28,8 @@ #include "vfio_pci_core.h" static bool nointxmask; -module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(nointxmask, - "Disable support for PCI 2.3 style INTx masking. 
If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag."); - -#ifdef CONFIG_VFIO_PCI_VGA static bool disable_vga; -module_param(disable_vga, bool, S_IRUGO); -MODULE_PARM_DESC(disable_vga, "Disable VGA resource access through vfio-pci"); -#endif - static bool disable_idle_d3; -module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(disable_idle_d3, - "Disable using the PCI D3 low power state for idle, unused devices"); static inline bool vfio_vga_disabled(void) { @@ -2123,6 +2111,14 @@ static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set) return true; } +void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga, + bool is_disable_idle_d3) +{ + nointxmask = is_nointxmask; + disable_vga = is_disable_vga; + disable_idle_d3 = is_disable_idle_d3; +} + /* This will become the __exit function of vfio_pci_core.ko */ void vfio_pci_core_cleanup(void) { diff --git a/drivers/vfio/pci/vfio_pci_core.h b/drivers/vfio/pci/vfio_pci_core.h index 8dd5b8286138..6192de8e4d13 100644 --- a/drivers/vfio/pci/vfio_pci_core.h +++ b/drivers/vfio/pci/vfio_pci_core.h @@ -209,6 +209,8 @@ static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, /* Will be exported for vfio pci drivers usage */ void vfio_pci_core_cleanup(void); int vfio_pci_core_init(void); +void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, + bool is_disable_idle_d3); void vfio_pci_core_close_device(struct vfio_device *core_vdev); void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, struct pci_dev *pdev, -- Gitee From a886b797cb45c14df28d37c9b1b85345584c1fae Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 26 Aug 2021 13:39:08 +0300 Subject: [PATCH 1685/2161] PCI: Add 'override_only' field to struct pci_device_id commit 343b7258687ecfbb363bfda8833a7cf641aac524 upstream. Add 'override_only' field to struct pci_device_id to be used as part of pci_match_device(). When set, a driver only matches the entry when dev->driver_override is set to that driver. In addition, add a helper macro named 'PCI_DEVICE_DRIVER_OVERRIDE' to enable setting some data on it. Next patch from this series will use the above functionality. Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe Acked-by: Bjorn Helgaas Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20210826103912.128972-10-yishaih@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/PCI/pci.rst | 1 + drivers/pci/pci-driver.c | 28 +++++++++++++++++++++------- include/linux/mod_devicetable.h | 2 ++ include/linux/pci.h | 15 +++++++++++++++ 4 files changed, 39 insertions(+), 7 deletions(-) diff --git a/Documentation/PCI/pci.rst b/Documentation/PCI/pci.rst index 6864f9a70f5f..0070a28dd4fd 100644 --- a/Documentation/PCI/pci.rst +++ b/Documentation/PCI/pci.rst @@ -103,6 +103,7 @@ need pass only as many optional fields as necessary: - subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF) - class and classmask fields default to 0 - driver_data defaults to 0UL. + - override_only field defaults to 0. Note that driver_data must match the value used by any of the pci_device_id entries defined in the driver. 
This makes the driver_data field mandatory diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 5ea612a15550..2308f13fc9d8 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -252,7 +252,7 @@ static const struct pci_device_id *pci_match_device(struct pci_driver *drv, struct pci_dev *dev) { struct pci_dynid *dynid; - const struct pci_device_id *found_id = NULL; + const struct pci_device_id *found_id = NULL, *ids; /* When driver_override is set, only bind to the matching driver */ if (dev->driver_override && strcmp(dev->driver_override, drv->name)) @@ -268,14 +268,28 @@ static const struct pci_device_id *pci_match_device(struct pci_driver *drv, } spin_unlock(&drv->dynids.lock); - if (!found_id) - found_id = pci_match_id(drv->id_table, dev); + if (found_id) + return found_id; - /* driver_override will always match, send a dummy id */ - if (!found_id && dev->driver_override) - found_id = &pci_device_id_any; + for (ids = drv->id_table; (found_id = pci_match_id(ids, dev)); + ids = found_id + 1) { + /* + * The match table is split based on driver_override. + * In case override_only was set, enforce driver_override + * matching. + */ + if (found_id->override_only) { + if (dev->driver_override) + return found_id; + } else { + return found_id; + } + } - return found_id; + /* driver_override will always match, send a dummy id */ + if (dev->driver_override) + return &pci_device_id_any; + return NULL; } struct drv_dev_and_id { diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 4c56404e53a7..0fff236b287f 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -34,12 +34,14 @@ typedef unsigned long kernel_ulong_t; * Best practice is to use driver_data as an index * into a static list of equivalent device types, * instead of using it as a pointer. + * @override_only: Match only when dev->driver_override is this driver. */ struct pci_device_id { __u32 vendor, device; /* Vendor and device ID or PCI_ANY_ID*/ __u32 subvendor, subdevice; /* Subsystem ID's or PCI_ANY_ID */ __u32 class, class_mask; /* (class,subclass,prog-if) triplet */ kernel_ulong_t driver_data; /* Data private to the driver */ + __u32 override_only; }; diff --git a/include/linux/pci.h b/include/linux/pci.h index 284b723d1f08..862adf3be046 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -883,6 +883,21 @@ struct pci_driver { .vendor = (vend), .device = (dev), \ .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID +/** + * PCI_DEVICE_DRIVER_OVERRIDE - macro used to describe a PCI device with + * override_only flags. + * @vend: the 16 bit PCI Vendor ID + * @dev: the 16 bit PCI Device ID + * @driver_override: the 32 bit PCI Device override_only + * + * This macro is used to create a struct pci_device_id that matches only a + * driver_override device. The subvendor and subdevice fields will be set to + * PCI_ANY_ID. + */ +#define PCI_DEVICE_DRIVER_OVERRIDE(vend, dev, driver_override) \ + .vendor = (vend), .device = (dev), .subvendor = PCI_ANY_ID, \ + .subdevice = PCI_ANY_ID, .override_only = (driver_override) + /** * PCI_DEVICE_SUB - macro used to describe a specific PCI device with subsystem * @vend: the 16 bit PCI Vendor ID -- Gitee From 05f64d6843afefa12443b405ef556d1a9611417d Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 26 Aug 2021 13:39:09 +0300 Subject: [PATCH 1686/2161] PCI / VFIO: Add 'override_only' support for VFIO PCI sub system commit cc6711b0bf36de068b10490198d05ac168377989 upstream. 
Expose an 'override_only' helper macro (i.e.
PCI_DRIVER_OVERRIDE_DEVICE_VFIO) for VFIO PCI sub system and add the
required code to prefix its matching entries with "vfio_" in the
modules.alias file.

It allows VFIO device drivers to include match entries in the
modules.alias file produced by kbuild that are not used for normal
driver autoprobing and module autoloading. Drivers using these match
entries can be connected to the PCI device manually, by userspace,
using the existing driver_override sysfs.

For example the resulting modules.alias may have:

  alias pci:v000015B3d00001021sv*sd*bc*sc*i* mlx5_core
  alias vfio_pci:v000015B3d00001021sv*sd*bc*sc*i* mlx5_vfio_pci
  alias vfio_pci:v*d*sv*sd*bc*sc*i* vfio_pci

In this example mlx5_core and mlx5_vfio_pci match to the same PCI
device. The kernel will autoload and autobind to mlx5_core but the
kernel and udev mechanisms will ignore mlx5_vfio_pci.

When userspace wants to change a device to the VFIO subsystem it can
implement a generic algorithm:

1) Identify the sysfs path to the device:
   /sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0

2) Get the modalias string from the kernel:
   $ cat /sys/bus/pci/devices/0000:01:00.0/modalias
   pci:v000015B3d00001021sv000015B3sd00000001bc02sc00i00

3) Prefix it with vfio_:
   vfio_pci:v000015B3d00001021sv000015B3sd00000001bc02sc00i00

4) Search modules.alias for the above string and select the entry that
   has the fewest *'s:
   alias vfio_pci:v000015B3d00001021sv*sd*bc*sc*i* mlx5_vfio_pci

5) modprobe the matched module name:
   $ modprobe mlx5_vfio_pci

6) cat the matched module name to driver_override:
   echo mlx5_vfio_pci > /sys/bus/pci/devices/0000:01:00.0/driver_override

7) unbind device from original module:
   echo 0000:01:00.0 > /sys/bus/pci/devices/0000:01:00.0/driver/unbind

8) probe PCI drivers (or explicitly bind to mlx5_vfio_pci):
   echo 0000:01:00.0 > /sys/bus/pci/drivers_probe

The algorithm is independent of bus type. In future the other buses
with VFIO device drivers, like platform and ACPI, can use this
algorithm as well.

This patch is the infrastructure to provide the information in the
modules.alias to userspace. Convert the only VFIO pci_driver, which
results in one new line in the modules.alias:

  alias vfio_pci:v*d*sv*sd*bc*sc*i* vfio_pci

Later series introduce additional HW specific VFIO PCI drivers, such as
mlx5_vfio_pci.
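To make the new aliases concrete, a hypothetical device-specific driver
(the mlx5_vfio_pci name and the 15b3:1021 IDs are only the example used
above, not part of this series) would produce its "vfio_pci:" entry from
an ID table such as:

static const struct pci_device_id mlx5_vfio_pci_table[] = {
	/* file2alias turns this into "alias vfio_pci:v000015B3d00001021sv*sd*bc*sc*i*" */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(0x15b3, 0x1021) },
	{}
};
MODULE_DEVICE_TABLE(pci, mlx5_vfio_pci_table);

Because override_only is set in the entry, pci_match_device() accepts it
only after userspace has written the driver name to the device's
driver_override attribute, so normal autoprobing never binds it.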
Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe Acked-by: Bjorn Helgaas # for pci.h Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20210826103912.128972-11-yishaih@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci.c | 9 ++++++++- include/linux/mod_devicetable.h | 4 ++++ include/linux/pci.h | 14 ++++++++++++++ scripts/mod/devicetable-offsets.c | 1 + scripts/mod/file2alias.c | 17 +++++++++++++++-- 5 files changed, 42 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 163e560c4495..85fd638a5955 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -178,9 +178,16 @@ static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) return vfio_pci_core_sriov_configure(pdev, nr_virtfn); } +static const struct pci_device_id vfio_pci_table[] = { + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_ANY_ID, PCI_ANY_ID) }, /* match all by default */ + {} +}; + +MODULE_DEVICE_TABLE(pci, vfio_pci_table); + static struct pci_driver vfio_pci_driver = { .name = "vfio-pci", - .id_table = NULL, /* only dynamic ids */ + .id_table = vfio_pci_table, .probe = vfio_pci_probe, .remove = vfio_pci_remove, .sriov_configure = vfio_pci_sriov_configure, diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 0fff236b287f..f655de60b72b 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -16,6 +16,10 @@ typedef unsigned long kernel_ulong_t; #define PCI_ANY_ID (~0) +enum { + PCI_ID_F_VFIO_DRIVER_OVERRIDE = 1, +}; + /** * struct pci_device_id - PCI device ID structure * @vendor: Vendor ID to match (or PCI_ANY_ID) diff --git a/include/linux/pci.h b/include/linux/pci.h index 862adf3be046..894bd747285d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -898,6 +898,20 @@ struct pci_driver { .vendor = (vend), .device = (dev), .subvendor = PCI_ANY_ID, \ .subdevice = PCI_ANY_ID, .override_only = (driver_override) +/** + * PCI_DRIVER_OVERRIDE_DEVICE_VFIO - macro used to describe a VFIO + * "driver_override" PCI device. + * @vend: the 16 bit PCI Vendor ID + * @dev: the 16 bit PCI Device ID + * + * This macro is used to create a struct pci_device_id that matches a + * specific device. The subvendor and subdevice fields will be set to + * PCI_ANY_ID and the driver_override will be set to + * PCI_ID_F_VFIO_DRIVER_OVERRIDE. + */ +#define PCI_DRIVER_OVERRIDE_DEVICE_VFIO(vend, dev) \ + PCI_DEVICE_DRIVER_OVERRIDE(vend, dev, PCI_ID_F_VFIO_DRIVER_OVERRIDE) + /** * PCI_DEVICE_SUB - macro used to describe a specific PCI device with subsystem * @vend: the 16 bit PCI Vendor ID diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c index 054405b90ba4..58bcceabff7e 100644 --- a/scripts/mod/devicetable-offsets.c +++ b/scripts/mod/devicetable-offsets.c @@ -42,6 +42,7 @@ int main(void) DEVID_FIELD(pci_device_id, subdevice); DEVID_FIELD(pci_device_id, class); DEVID_FIELD(pci_device_id, class_mask); + DEVID_FIELD(pci_device_id, override_only); DEVID(ccw_device_id); DEVID_FIELD(ccw_device_id, match_flags); diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index c91eba751804..e8d4827f9ca6 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -426,7 +426,7 @@ static int do_ieee1394_entry(const char *filename, return 1; } -/* Looks like: pci:vNdNsvNsdNbcNscNiN. */ +/* Looks like: pci:vNdNsvNsdNbcNscNiN or _pci:vNdNsvNsdNbcNscNiN. 
*/ static int do_pci_entry(const char *filename, void *symval, char *alias) { @@ -440,8 +440,21 @@ static int do_pci_entry(const char *filename, DEF_FIELD(symval, pci_device_id, subdevice); DEF_FIELD(symval, pci_device_id, class); DEF_FIELD(symval, pci_device_id, class_mask); + DEF_FIELD(symval, pci_device_id, override_only); + + switch (override_only) { + case 0: + strcpy(alias, "pci:"); + break; + case PCI_ID_F_VFIO_DRIVER_OVERRIDE: + strcpy(alias, "vfio_pci:"); + break; + default: + warn("Unknown PCI driver_override alias %08X\n", + override_only); + return 0; + } - strcpy(alias, "pci:"); ADD(alias, "v", vendor != PCI_ANY_ID, vendor); ADD(alias, "d", device != PCI_ANY_ID, device); ADD(alias, "sv", subvendor != PCI_ANY_ID, subvendor); -- Gitee From 54646ae53ed910d6ec60cf1334d27087f1a2389f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 15 May 2021 12:08:56 -0700 Subject: [PATCH 1687/2161] vfio/pci: zap_vma_ptes() needs MMU commit 2a55ca37350171d9b43d561528f23d4130097255 upstream. zap_vma_ptes() is only available when CONFIG_MMU is set/enabled. Without CONFIG_MMU, vfio_pci.o has build errors, so make VFIO_PCI depend on MMU. riscv64-linux-ld: drivers/vfio/pci/vfio_pci.o: in function `vfio_pci_mmap_open': vfio_pci.c:(.text+0x1ec): undefined reference to `zap_vma_ptes' riscv64-linux-ld: drivers/vfio/pci/vfio_pci.o: in function `.L0 ': vfio_pci.c:(.text+0x165c): undefined reference to `zap_vma_ptes' Fixes: 11c4cd07ba11 ("vfio-pci: Fault mmaps to enable vma tracking") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Alex Williamson Cc: Cornelia Huck Cc: kvm@vger.kernel.org Cc: Jason Gunthorpe Cc: Eric Auger Message-Id: <20210515190856.2130-1-rdunlap@infradead.org> Signed-off-by: Alex Williamson Signed-off-by: Sasha Levin Signed-off-by: Xinghui Li --- drivers/vfio/pci/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 53ce78d7d07b..5e2e1b9a9fd3 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -2,6 +2,7 @@ config VFIO_PCI tristate "VFIO support for PCI devices" depends on VFIO && PCI && EVENTFD + depends on MMU select VFIO_VIRQFD select IRQ_BYPASS_MANAGER help -- Gitee From 00f2fa1b6477de94dddd38d302516743d353e38b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 26 Aug 2021 13:39:10 +0300 Subject: [PATCH 1688/2161] vfio: Use select for eventfd commit ca4ddaac7fa710a250bbd650cc719425bec973a0 upstream. If VFIO_VIRQFD is required then turn on eventfd automatically. The majority of kconfig users of the EVENTFD use select not depends on. 
Signed-off-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20210826103912.128972-12-yishaih@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/Kconfig | 3 ++- drivers/vfio/pci/Kconfig | 2 +- drivers/vfio/platform/Kconfig | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 503ed2f3fbb5..d4513539e0f9 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -16,7 +16,8 @@ config VFIO_SPAPR_EEH config VFIO_VIRQFD tristate - depends on VFIO && EVENTFD + depends on VFIO + select EVENTFD default n menuconfig VFIO diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 5e2e1b9a9fd3..d208a95a2767 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config VFIO_PCI tristate "VFIO support for PCI devices" - depends on VFIO && PCI && EVENTFD + depends on VFIO && PCI depends on MMU select VFIO_VIRQFD select IRQ_BYPASS_MANAGER diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig index 233efde219cc..d161d22df4c1 100644 --- a/drivers/vfio/platform/Kconfig +++ b/drivers/vfio/platform/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config VFIO_PLATFORM tristate "VFIO support for platform devices" - depends on VFIO && EVENTFD && (ARM || ARM64 || COMPILE_TEST) + depends on VFIO && (ARM || ARM64 || COMPILE_TEST) select VFIO_VIRQFD help Support for platform devices with VFIO. This is required to make -- Gitee From 8a1851dfe74a187842ce6ef526c53ec47db1984a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 16 Jul 2021 15:39:12 -0300 Subject: [PATCH 1689/2161] vfio: Use config not menuconfig for VFIO_NOIOMMU commit 26c22cfde5dd6e63f25c48458b0185dcb0fbb2fd upstream. VFIO_NOIOMMU is supposed to be an element in the VFIO menu, not start a new menu. Correct this copy-paste mistake. Fixes: 03a76b60f8ba ("vfio: Include No-IOMMU mode") Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/0-v1-3f0b685c3679+478-vfio_menuconfig_jgg@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Sasha Levin Signed-off-by: Xinghui Li --- drivers/vfio/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index d4513539e0f9..067f5aefb098 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -30,7 +30,7 @@ menuconfig VFIO If you don't know what to do here, say N. -menuconfig VFIO_NOIOMMU +config VFIO_NOIOMMU bool "VFIO No-IOMMU support" depends on VFIO help -- Gitee From fe10f4a8ce0abd96f8251d2a97895c2e688338bc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 26 Aug 2021 13:39:11 +0300 Subject: [PATCH 1690/2161] vfio: Use kconfig if XX/endif blocks instead of repeating 'depends on' commit 85c94dcffcb775bafffd6e966db49253e1b789d9 upstream. This results in less kconfig wordage and a simpler understanding of the required "depends on" to create the menu structure. The next patch increases the nesting level a lot so this is a nice preparatory simplification. 
Signed-off-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20210826103912.128972-13-yishaih@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/Kconfig | 28 ++++++++++++++-------------- drivers/vfio/mdev/Kconfig | 1 - drivers/vfio/pci/Kconfig | 11 ++++++----- drivers/vfio/platform/Kconfig | 6 ++++-- drivers/vfio/platform/reset/Kconfig | 4 +--- 5 files changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 067f5aefb098..2ad5fe473227 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -1,12 +1,22 @@ # SPDX-License-Identifier: GPL-2.0-only +menuconfig VFIO + tristate "VFIO Non-Privileged userspace driver framework" + select IOMMU_API + select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64) + help + VFIO provides a framework for secure userspace device drivers. + See Documentation/driver-api/vfio.rst for more details. + + If you don't know what to do here, say N. + +if VFIO config VFIO_IOMMU_TYPE1 tristate - depends on VFIO default n config VFIO_IOMMU_SPAPR_TCE tristate - depends on VFIO && SPAPR_TCE_IOMMU + depends on SPAPR_TCE_IOMMU default VFIO config VFIO_SPAPR_EEH @@ -16,23 +26,11 @@ config VFIO_SPAPR_EEH config VFIO_VIRQFD tristate - depends on VFIO select EVENTFD default n -menuconfig VFIO - tristate "VFIO Non-Privileged userspace driver framework" - select IOMMU_API - select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64) - help - VFIO provides a framework for secure userspace device drivers. - See Documentation/driver-api/vfio.rst for more details. - - If you don't know what to do here, say N. - config VFIO_NOIOMMU bool "VFIO No-IOMMU support" - depends on VFIO help VFIO is built on the ability to isolate devices using the IOMMU. Only with an IOMMU can userspace access to DMA capable devices be @@ -48,4 +46,6 @@ config VFIO_NOIOMMU source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" source "drivers/vfio/mdev/Kconfig" +endif + source "virt/lib/Kconfig" diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig index 763c877a1318..646dbed44eb2 100644 --- a/drivers/vfio/mdev/Kconfig +++ b/drivers/vfio/mdev/Kconfig @@ -2,7 +2,6 @@ config VFIO_MDEV tristate "Mediated device driver framework" - depends on VFIO default n help Provides a framework to virtualize devices. diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index d208a95a2767..afdab7d71e98 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config VFIO_PCI tristate "VFIO support for PCI devices" - depends on VFIO && PCI + depends on PCI depends on MMU select VFIO_VIRQFD select IRQ_BYPASS_MANAGER @@ -11,9 +11,10 @@ config VFIO_PCI If you don't know what to do here, say N. +if VFIO_PCI config VFIO_PCI_VGA bool "VFIO PCI support for VGA devices" - depends on VFIO_PCI && X86 && VGA_ARB + depends on X86 && VGA_ARB help Support for VGA extension to VFIO PCI. This exposes an additional region on VGA devices for accessing legacy VGA addresses used by @@ -22,16 +23,14 @@ config VFIO_PCI_VGA If you don't know what to do here, say N. 
config VFIO_PCI_MMAP - depends on VFIO_PCI def_bool y if !S390 config VFIO_PCI_INTX - depends on VFIO_PCI def_bool y if !S390 config VFIO_PCI_IGD bool "VFIO PCI extensions for Intel graphics (GVT-d)" - depends on VFIO_PCI && X86 + depends on X86 default y help Support for Intel IGD specific extensions to enable direct @@ -40,3 +39,5 @@ config VFIO_PCI_IGD and LPC bridge config space. To enable Intel IGD assignment through vfio-pci, say Y. + +endif diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig index d161d22df4c1..72de406281bf 100644 --- a/drivers/vfio/platform/Kconfig +++ b/drivers/vfio/platform/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config VFIO_PLATFORM tristate "VFIO support for platform devices" - depends on VFIO && (ARM || ARM64 || COMPILE_TEST) + depends on ARM || ARM64 || COMPILE_TEST select VFIO_VIRQFD help Support for platform devices with VFIO. This is required to make @@ -10,9 +10,10 @@ config VFIO_PLATFORM If you don't know what to do here, say N. +if VFIO_PLATFORM config VFIO_AMBA tristate "VFIO support for AMBA devices" - depends on VFIO_PLATFORM && ARM_AMBA + depends on ARM_AMBA help Support for ARM AMBA devices with VFIO. This is required to make use of ARM AMBA devices present on the system using the VFIO @@ -21,3 +22,4 @@ config VFIO_AMBA If you don't know what to do here, say N. source "drivers/vfio/platform/reset/Kconfig" +endif diff --git a/drivers/vfio/platform/reset/Kconfig b/drivers/vfio/platform/reset/Kconfig index 1edbe9ee7356..12f5f3d80387 100644 --- a/drivers/vfio/platform/reset/Kconfig +++ b/drivers/vfio/platform/reset/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config VFIO_PLATFORM_CALXEDAXGMAC_RESET tristate "VFIO support for calxeda xgmac reset" - depends on VFIO_PLATFORM help Enables the VFIO platform driver to handle reset for Calxeda xgmac @@ -9,7 +8,6 @@ config VFIO_PLATFORM_CALXEDAXGMAC_RESET config VFIO_PLATFORM_AMDXGBE_RESET tristate "VFIO support for AMD XGBE reset" - depends on VFIO_PLATFORM help Enables the VFIO platform driver to handle reset for AMD XGBE @@ -17,7 +15,7 @@ config VFIO_PLATFORM_AMDXGBE_RESET config VFIO_PLATFORM_BCMFLEXRM_RESET tristate "VFIO support for Broadcom FlexRM reset" - depends on VFIO_PLATFORM && (ARCH_BCM_IPROC || COMPILE_TEST) + depends on ARCH_BCM_IPROC || COMPILE_TEST default ARCH_BCM_IPROC help Enables the VFIO platform driver to handle reset for Broadcom FlexRM -- Gitee From bbaa204ea831294ab32f4ecd0e3cdbe3f6bc7412 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 26 Aug 2021 13:39:12 +0300 Subject: [PATCH 1691/2161] vfio/pci: Introduce vfio_pci_core.ko commit 7fa005caa35ed92563b9e9d88d319b2623763a77 upstream. Now that vfio_pci has been split into two source modules, one focusing on the "struct pci_driver" (vfio_pci.c) and a toolbox library of code (vfio_pci_core.c), complete the split and move them into two different kernel modules. As before vfio_pci.ko continues to present the same interface under sysfs and this change will have no functional impact. Splitting into another module and adding exports allows creating new HW specific VFIO PCI drivers that can implement device specific functionality, such as VFIO migration interfaces or specialized device requirements. 
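As a rough illustration of what such a HW specific driver could look
like on top of the vfio_pci_core_* symbols exported below, a minimal,
hypothetical "foo-vfio-pci" variant might be structured as follows
(sketch only; the foo_* names are invented and the device specific
setup is elided):

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/vfio_pci_core.h>

static int foo_vfio_pci_open_device(struct vfio_device *core_vdev)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	/* device specific region/migration setup would go here */

	vfio_pci_core_finish_enable(vdev);
	return 0;
}

static const struct vfio_device_ops foo_vfio_pci_ops = {
	.name		= "foo-vfio-pci",
	.open_device	= foo_vfio_pci_open_device,
	.close_device	= vfio_pci_core_close_device,
	.ioctl		= vfio_pci_core_ioctl,
	.read		= vfio_pci_core_read,
	.write		= vfio_pci_core_write,
	.mmap		= vfio_pci_core_mmap,
	.request	= vfio_pci_core_request,
	.match		= vfio_pci_core_match,
};

static int foo_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct vfio_pci_core_device *vdev;
	int ret;

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev)
		return -ENOMEM;

	/* initialize the core state, then hand the device to VFIO */
	vfio_pci_core_init_device(vdev, pdev, &foo_vfio_pci_ops);

	ret = vfio_pci_core_register_device(vdev);
	if (ret) {
		vfio_pci_core_uninit_device(vdev);
		kfree(vdev);
		return ret;
	}
	dev_set_drvdata(&pdev->dev, vdev);
	return 0;
}

The remove path and the struct pci_driver wiring would mirror
vfio_pci.c, with the driver's own PCI_DRIVER_OVERRIDE_DEVICE_VFIO() id
table so that it is only ever bound through driver_override.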
Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20210826103912.128972-14-yishaih@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/Kconfig | 33 +++++++++------- drivers/vfio/pci/Makefile | 8 ++-- drivers/vfio/pci/vfio_pci.c | 14 ++----- drivers/vfio/pci/vfio_pci_config.c | 2 +- drivers/vfio/pci/vfio_pci_core.c | 39 ++++++++++++++++--- drivers/vfio/pci/vfio_pci_igd.c | 2 +- drivers/vfio/pci/vfio_pci_intrs.c | 2 +- drivers/vfio/pci/vfio_pci_rdwr.c | 2 +- drivers/vfio/pci/vfio_pci_zdev.c | 2 +- .../pci => include/linux}/vfio_pci_core.h | 2 - 10 files changed, 64 insertions(+), 42 deletions(-) rename {drivers/vfio/pci => include/linux}/vfio_pci_core.h (99%) diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index afdab7d71e98..860424ccda1b 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -1,19 +1,28 @@ # SPDX-License-Identifier: GPL-2.0-only -config VFIO_PCI - tristate "VFIO support for PCI devices" - depends on PCI - depends on MMU +if PCI && MMU +config VFIO_PCI_CORE + tristate select VFIO_VIRQFD select IRQ_BYPASS_MANAGER + +config VFIO_PCI_MMAP + def_bool y if !S390 + +config VFIO_PCI_INTX + def_bool y if !S390 + +config VFIO_PCI + tristate "Generic VFIO support for any PCI device" + select VFIO_PCI_CORE help - Support for the PCI VFIO bus driver. This is required to make - use of PCI drivers using the VFIO framework. + Support for the generic PCI VFIO bus driver which can connect any + PCI device to the VFIO framework. If you don't know what to do here, say N. if VFIO_PCI config VFIO_PCI_VGA - bool "VFIO PCI support for VGA devices" + bool "Generic VFIO PCI support for VGA devices" depends on X86 && VGA_ARB help Support for VGA extension to VFIO PCI. This exposes an additional @@ -22,14 +31,8 @@ config VFIO_PCI_VGA If you don't know what to do here, say N. -config VFIO_PCI_MMAP - def_bool y if !S390 - -config VFIO_PCI_INTX - def_bool y if !S390 - config VFIO_PCI_IGD - bool "VFIO PCI extensions for Intel graphics (GVT-d)" + bool "Generic VFIO PCI extensions for Intel graphics (GVT-d)" depends on X86 default y help @@ -39,5 +42,5 @@ config VFIO_PCI_IGD and LPC bridge config space. To enable Intel IGD assignment through vfio-pci, say Y. 
- +endif endif diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 8aa517b4b671..349d68d242b4 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,7 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only -vfio-pci-y := vfio_pci.o vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o -vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o -vfio-pci-$(CONFIG_S390) += vfio_pci_zdev.o +vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o +vfio-pci-core-$(CONFIG_S390) += vfio_pci_zdev.o +obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o +vfio-pci-y := vfio_pci.o +vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o obj-$(CONFIG_VFIO_PCI) += vfio-pci.o diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 85fd638a5955..a5ce92beb655 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -25,7 +25,7 @@ #include #include -#include "vfio_pci_core.h" +#include #define DRIVER_AUTHOR "Alex Williamson " #define DRIVER_DESC "VFIO PCI - User Level meta-driver" @@ -153,6 +153,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) ret = vfio_pci_core_register_device(vdev); if (ret) goto out_free; + dev_set_drvdata(&pdev->dev, vdev); return 0; out_free: @@ -246,14 +247,10 @@ static int __init vfio_pci_init(void) vfio_pci_core_set_params(nointxmask, is_disable_vga, disable_idle_d3); - ret = vfio_pci_core_init(); - if (ret) - return ret; - /* Register and scan for devices */ ret = pci_register_driver(&vfio_pci_driver); if (ret) - goto out; + return ret; vfio_pci_fill_ids(); @@ -261,17 +258,12 @@ static int __init vfio_pci_init(void) pr_warn("device denylist disabled.\n"); return 0; - -out: - vfio_pci_core_cleanup(); - return ret; } module_init(vfio_pci_init); static void __exit vfio_pci_cleanup(void) { pci_unregister_driver(&vfio_pci_driver); - vfio_pci_core_cleanup(); } module_exit(vfio_pci_cleanup); diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 619638fa55a3..9be98bce8405 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -26,7 +26,7 @@ #include #include -#include "vfio_pci_core.h" +#include /* Fake capability ID for standard config space */ #define PCI_CAP_ID_BASIC 0 diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index c428a6a1d37c..aac578e6adcb 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -8,6 +8,8 @@ * Author: Tom Lyon, pugs@cisco.com */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -25,7 +27,10 @@ #include #include -#include "vfio_pci_core.h" +#include + +#define DRIVER_AUTHOR "Alex Williamson " +#define DRIVER_DESC "core driver for VFIO based PCI devices" static bool nointxmask; static bool disable_vga; @@ -306,6 +311,7 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) return 0; } +EXPORT_SYMBOL_GPL(vfio_pci_core_enable); void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) { @@ -403,6 +409,7 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) if (!vfio_pci_dev_set_try_reset(vdev->vdev.dev_set) && !disable_idle_d3) vfio_pci_set_power_state(vdev, PCI_D3hot); } +EXPORT_SYMBOL_GPL(vfio_pci_core_disable); static struct vfio_pci_core_device *get_pf_vdev(struct vfio_pci_core_device *vdev) { @@ -459,6 +466,7 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) } mutex_unlock(&vdev->igate); } 
+EXPORT_SYMBOL_GPL(vfio_pci_core_close_device); void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev) { @@ -466,6 +474,7 @@ void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev) vfio_spapr_pci_eeh_open(vdev->pdev); vfio_pci_vf_token_user_add(vdev, 1); } +EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable); static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type) { @@ -624,6 +633,7 @@ int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, return 0; } +EXPORT_SYMBOL_GPL(vfio_pci_register_dev_region); long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) @@ -1168,6 +1178,7 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, return -ENOTTY; } +EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl); static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) @@ -1211,6 +1222,7 @@ ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, return vfio_pci_rw(vdev, buf, count, ppos, false); } +EXPORT_SYMBOL_GPL(vfio_pci_core_read); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos) @@ -1223,6 +1235,7 @@ ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *bu return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true); } +EXPORT_SYMBOL_GPL(vfio_pci_core_write); /* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try) @@ -1503,6 +1516,7 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma return 0; } +EXPORT_SYMBOL_GPL(vfio_pci_core_mmap); void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count) { @@ -1525,6 +1539,7 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count) mutex_unlock(&vdev->igate); } +EXPORT_SYMBOL_GPL(vfio_pci_core_request); static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, bool vf_token, uuid_t *uuid) @@ -1669,6 +1684,7 @@ int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf) return 1; /* Match */ } +EXPORT_SYMBOL_GPL(vfio_pci_core_match); static int vfio_pci_bus_notifier(struct notifier_block *nb, unsigned long action, void *data) @@ -1777,6 +1793,7 @@ void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, INIT_LIST_HEAD(&vdev->vma_list); init_rwsem(&vdev->memory_lock); } +EXPORT_SYMBOL_GPL(vfio_pci_core_init_device); void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev) { @@ -1787,6 +1804,7 @@ void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev) kfree(vdev->region); kfree(vdev->pm_save); } +EXPORT_SYMBOL_GPL(vfio_pci_core_uninit_device); int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) { @@ -1854,7 +1872,6 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) ret = vfio_register_group_dev(&vdev->vdev); if (ret) goto out_power; - dev_set_drvdata(&pdev->dev, vdev); return 0; out_power: @@ -1866,6 +1883,7 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) vfio_iommu_group_put(group, &pdev->dev); return ret; } +EXPORT_SYMBOL_GPL(vfio_pci_core_register_device); void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) { @@ -1883,6 +1901,7 @@ void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) if (!disable_idle_d3) vfio_pci_set_power_state(vdev, PCI_D0); } 
+EXPORT_SYMBOL_GPL(vfio_pci_core_unregister_device); static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state) @@ -1926,10 +1945,12 @@ int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn) return ret < 0 ? ret : nr_virtfn; } +EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure); const struct pci_error_handlers vfio_pci_core_err_handlers = { .error_detected = vfio_pci_aer_err_detected, }; +EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers); static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev, struct vfio_pci_group_info *groups) @@ -2118,16 +2139,22 @@ void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga, disable_vga = is_disable_vga; disable_idle_d3 = is_disable_idle_d3; } +EXPORT_SYMBOL_GPL(vfio_pci_core_set_params); -/* This will become the __exit function of vfio_pci_core.ko */ -void vfio_pci_core_cleanup(void) +static void vfio_pci_core_cleanup(void) { vfio_pci_uninit_perm_bits(); } -/* This will become the __init function of vfio_pci_core.ko */ -int __init vfio_pci_core_init(void) +static int __init vfio_pci_core_init(void) { /* Allocate shared config space permission data used by all devices */ return vfio_pci_init_perm_bits(); } + +module_init(vfio_pci_core_init); +module_exit(vfio_pci_core_cleanup); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c index a324ca7e6b5a..7ca4109bba48 100644 --- a/drivers/vfio/pci/vfio_pci_igd.c +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -15,7 +15,7 @@ #include #include -#include "vfio_pci_core.h" +#include #define OPREGION_SIGNATURE "IntelGraphicsMem" #define OPREGION_SIZE (8 * 1024) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 945ddbdf4d11..6069a11fb51a 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -20,7 +20,7 @@ #include #include -#include "vfio_pci_core.h" +#include /* * INTx diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 8fff4689dd44..57d3b2cbbd8e 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -17,7 +17,7 @@ #include #include -#include "vfio_pci_core.h" +#include #ifdef __LITTLE_ENDIAN #define vfio_ioread64 ioread64 diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c index d9a669f41d90..ea4c0d2b0663 100644 --- a/drivers/vfio/pci/vfio_pci_zdev.c +++ b/drivers/vfio/pci/vfio_pci_zdev.c @@ -14,7 +14,7 @@ #include #include -#include "vfio_pci_core.h" +#include /* * Add the Base PCI Function information to the device info region. 
diff --git a/drivers/vfio/pci/vfio_pci_core.h b/include/linux/vfio_pci_core.h similarity index 99% rename from drivers/vfio/pci/vfio_pci_core.h rename to include/linux/vfio_pci_core.h index 6192de8e4d13..ec0fb70be2d3 100644 --- a/drivers/vfio/pci/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -207,8 +207,6 @@ static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, #endif /* Will be exported for vfio pci drivers usage */ -void vfio_pci_core_cleanup(void); -int vfio_pci_core_init(void); void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, bool is_disable_idle_d3); void vfio_pci_core_close_device(struct vfio_device *core_vdev); -- Gitee From 3a0ca5b7cdc9dbba6aac4c38950fba836bc5300d Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 21 Jul 2021 12:25:20 -0700 Subject: [PATCH 1692/2161] dmaengine: idxd: Change license on idxd.h to LGPL commit 25905f602fdb0cfa147017056636768a7aa1ff6f upstream. This file was given GPL-2.0 license. But LGPL-2.1 makes more sense as it needs to be used by libraries outside of the kernel source tree. Signed-off-by: Tony Luck Signed-off-by: Linus Torvalds Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/idxd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 3a6c2fe812d4..c750eac09fc9 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: LGPL-2.1 WITH Linux-syscall-note */ /* Copyright(c) 2019 Intel Corporation. All rights rsvd. */ #ifndef _USR_IDXD_H_ #define _USR_IDXD_H_ -- Gitee From 77445a46b0958d3675fde8c10f604dc5c35ced81 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 17 Nov 2021 10:03:51 -0700 Subject: [PATCH 1693/2161] dmaengine: idxd: fix calling wq quiesce inside spinlock commit fa51b16d05583c7aebbc06330afb50276243d198 upstream. Dan reports that smatch has found idxd_wq_quiesce() is being called inside the idxd->dev_lock. idxd_wq_quiesce() calls wait_for_completion() and therefore it can sleep. Move the call outside of the spinlock as it does not need device lock. Fixes: 5b0c68c473a1 ("dmaengine: idxd: support reporting of halt interrupt") Reported-by: Dan Carpenter Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163716858508.1721911.15051495873516709923.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 17f2f8a31b63..cf2c8bc4f147 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -137,10 +137,10 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) INIT_WORK(&idxd->work, idxd_device_reinit); queue_work(idxd->wq, &idxd->work); } else { - spin_lock(&idxd->dev_lock); idxd->state = IDXD_DEV_HALTED; idxd_wqs_quiesce(idxd); idxd_wqs_unmap_portal(idxd); + spin_lock(&idxd->dev_lock); idxd_device_clear_state(idxd); dev_err(&idxd->pdev->dev, "idxd halted, need %s.\n", -- Gitee From 4bf302159cab31d9cbc06d1a5cd33d91fb6b49a0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 8 Dec 2021 10:01:27 -0700 Subject: [PATCH 1694/2161] dmaengine: idxd: fix missed completion on abort path commit 8affd8a4b5ce356c8900cfb037674f3a4a11fbdb upstream. 
Ming reported that with the abort path of the descriptor submission,
there can be a window where a completed descriptor can be missed to be
completed by the irq completion thread:

 CPU A                          CPU B
 Submit (successful)
                                Submit (fail)
                                irq_process_work_list() // empty
                                llist_abort_desc() // remove all descs from pending list
 irq_process_pending_llist() // empty
 exit idxd_wq_thread() with no processing

Add opportunistic descriptor completion in the abort path in order to
remove the missed completion.

Fixes: 6b4b87f2c31a ("dmaengine: idxd: fix submission race window")
Reported-by: Ming Li
Signed-off-by: Dave Jiang
Link: https://lore.kernel.org/r/163898288714.443911.16084982766671976640.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Vinod Koul
Signed-off-by: Chen Zhuo
Signed-off-by: Xinghui Li
---
 drivers/dma/idxd/submit.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c
index de76fb4abac2..83452fbbb168 100644
--- a/drivers/dma/idxd/submit.c
+++ b/drivers/dma/idxd/submit.c
@@ -106,6 +106,7 @@ static void llist_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie,
 {
 	struct idxd_desc *d, *t, *found = NULL;
 	struct llist_node *head;
+	LIST_HEAD(flist);
 
 	desc->completion->status = IDXD_COMP_DESC_ABORT;
 	/*
@@ -120,7 +121,11 @@ static void llist_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie,
 			found = desc;
 			continue;
 		}
-		list_add_tail(&desc->list, &ie->work_list);
+
+		if (d->completion->status)
+			list_add_tail(&d->list, &flist);
+		else
+			list_add_tail(&d->list, &ie->work_list);
 	}
 }
 
@@ -130,6 +135,17 @@ static void llist_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie,
 
 	if (found)
 		complete_desc(found, IDXD_COMPLETE_ABORT);
+
+	/*
+	 * complete_desc() will return desc to allocator and the desc can be
+	 * acquired by a different process and the desc->list can be modified.
+	 * Delete desc from list so the list trasversing does not get corrupted
+	 * by the other process.
+	 */
+	list_for_each_entry_safe(d, t, &flist, list) {
+		list_del_init(&d->list);
+		complete_desc(d, IDXD_COMPLETE_NORMAL);
+	}
 }
 
 int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc)
-- 
Gitee

From f5b1051e11ddab9d1e30f80ef07f44fa551601fd Mon Sep 17 00:00:00 2001
From: Dave Jiang
Date: Tue, 26 Oct 2021 14:36:02 -0700
Subject: [PATCH 1695/2161] dmaengine: idxd: rework descriptor free path on failure

commit 5d78abb6fbc974d601dd365b9ce39f320fb5ba79 upstream.

Refactor the completion function to allow skipping of descriptor
freeing on the submission failure path. This completely removes
descriptor freeing from the submit failure path and leave the
responsibility to the caller.
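The resulting ownership rule can be summarized with a small, purely
illustrative caller inside drivers/dma/idxd (only the idxd_* helpers
are real; foo_issue_request() is an invented name):

/* assumes the post-patch semantics: submit no longer frees on error */
static int foo_issue_request(struct idxd_wq *wq)
{
	struct idxd_desc *desc;
	int rc;

	desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK);
	if (IS_ERR(desc))
		return PTR_ERR(desc);

	/* ... fill in desc->hw ... */

	rc = idxd_submit_desc(wq, desc);
	if (rc < 0) {
		idxd_free_desc(wq, desc);	/* caller-side free on failure */
		return rc;
	}
	return 0;
}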
Reviewed-by: Kevin Tian Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163528416222.3925689.12859769271667814762.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/dma.c | 10 ++++++++-- drivers/dma/idxd/idxd.h | 8 +------- drivers/dma/idxd/init.c | 9 +++------ drivers/dma/idxd/irq.c | 8 ++++---- drivers/dma/idxd/submit.c | 22 ++++++++-------------- 5 files changed, 24 insertions(+), 33 deletions(-) diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index c39e9483206a..1ea663215909 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -21,7 +21,8 @@ static inline struct idxd_wq *to_idxd_wq(struct dma_chan *c) } void idxd_dma_complete_txd(struct idxd_desc *desc, - enum idxd_complete_type comp_type) + enum idxd_complete_type comp_type, + bool free_desc) { struct dma_async_tx_descriptor *tx; struct dmaengine_result res; @@ -44,6 +45,9 @@ void idxd_dma_complete_txd(struct idxd_desc *desc, tx->callback = NULL; tx->callback_result = NULL; } + + if (free_desc) + idxd_free_desc(desc->wq, desc); } static void op_flag_setup(unsigned long flags, u32 *desc_flags) @@ -153,8 +157,10 @@ static dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx) cookie = dma_cookie_assign(tx); rc = idxd_submit_desc(wq, desc); - if (rc < 0) + if (rc < 0) { + idxd_free_desc(wq, desc); return rc; + } return cookie; } diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 0cf8d3145870..3d600f8ee90b 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -579,7 +579,7 @@ int idxd_register_dma_channel(struct idxd_wq *wq); void idxd_unregister_dma_channel(struct idxd_wq *wq); void idxd_parse_completion_status(u8 status, enum dmaengine_tx_result *res); void idxd_dma_complete_txd(struct idxd_desc *desc, - enum idxd_complete_type comp_type); + enum idxd_complete_type comp_type, bool free_desc); /* cdev */ int idxd_cdev_register(void); @@ -603,10 +603,4 @@ static inline void perfmon_init(void) {} static inline void perfmon_exit(void) {} #endif -static inline void complete_desc(struct idxd_desc *desc, enum idxd_complete_type reason) -{ - idxd_dma_complete_txd(desc, reason); - idxd_free_desc(desc->wq, desc); -} - #endif diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 7bf03f371ce1..4373b48cdc91 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -717,10 +717,8 @@ static void idxd_flush_pending_llist(struct idxd_irq_entry *ie) if (!head) return; - llist_for_each_entry_safe(desc, itr, head, llnode) { - idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT); - idxd_free_desc(desc->wq, desc); - } + llist_for_each_entry_safe(desc, itr, head, llnode) + idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT, true); } static void idxd_flush_work_list(struct idxd_irq_entry *ie) @@ -729,8 +727,7 @@ static void idxd_flush_work_list(struct idxd_irq_entry *ie) list_for_each_entry_safe(desc, iter, &ie->work_list, list) { list_del(&desc->list); - idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT); - idxd_free_desc(desc->wq, desc); + idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT, true); } } diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index cf2c8bc4f147..47b6fc894a23 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -195,11 +195,11 @@ static void irq_process_pending_llist(struct idxd_irq_entry *irq_entry) * and 0xff, which DSA_COMP_STATUS_MASK can mask out. 
*/ if (unlikely(desc->completion->status == IDXD_COMP_DESC_ABORT)) { - complete_desc(desc, IDXD_COMPLETE_ABORT); + idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT, true); continue; } - complete_desc(desc, IDXD_COMPLETE_NORMAL); + idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL, true); } else { spin_lock(&irq_entry->list_lock); list_add_tail(&desc->list, @@ -238,11 +238,11 @@ static void irq_process_work_list(struct idxd_irq_entry *irq_entry) * and 0xff, which DSA_COMP_STATUS_MASK can mask out. */ if (unlikely(desc->completion->status == IDXD_COMP_DESC_ABORT)) { - complete_desc(desc, IDXD_COMPLETE_ABORT); + idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT, true); continue; } - complete_desc(desc, IDXD_COMPLETE_NORMAL); + idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL, true); } } diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 83452fbbb168..e5c3689851ea 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -134,17 +134,17 @@ static void llist_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie, spin_unlock(&ie->list_lock); if (found) - complete_desc(found, IDXD_COMPLETE_ABORT); + idxd_dma_complete_txd(found, IDXD_COMPLETE_ABORT, false); /* - * complete_desc() will return desc to allocator and the desc can be - * acquired by a different process and the desc->list can be modified. - * Delete desc from list so the list trasversing does not get corrupted - * by the other process. + * completing the descriptor will return desc to allocator and + * the desc can be acquired by a different process and the + * desc->list can be modified. Delete desc from list so the + * list trasversing does not get corrupted by the other process. */ list_for_each_entry_safe(d, t, &flist, list) { list_del_init(&d->list); - complete_desc(d, IDXD_COMPLETE_NORMAL); + idxd_dma_complete_txd(found, IDXD_COMPLETE_ABORT, true); } } @@ -155,15 +155,11 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) void __iomem *portal; int rc; - if (idxd->state != IDXD_DEV_ENABLED) { - idxd_free_desc(wq, desc); + if (idxd->state != IDXD_DEV_ENABLED) return -EIO; - } - if (!percpu_ref_tryget_live(&wq->wq_active)) { - idxd_free_desc(wq, desc); + if (!percpu_ref_tryget_live(&wq->wq_active)) return -ENXIO; - } portal = idxd_wq_portal_addr(wq); @@ -198,8 +194,6 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) /* abort operation frees the descriptor */ if (ie) llist_abort_desc(wq, ie, desc); - else - idxd_free_desc(wq, desc); return rc; } } -- Gitee From df6634263d339f54c1b1974e9da9469dee3d554a Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 26 Oct 2021 14:36:10 -0700 Subject: [PATCH 1696/2161] dmaengine: idxd: int handle management refactoring commit 8b67426e05584e956775f4b134596b56bc0d35e0 upstream. Attach int_handle to irq_entry. This removes the separate management of int handles and reduces the confusion of interating through int handles that is off by 1 count. 
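The mapping this settles on can be sketched as follows (illustrative
only; the helper name is invented, the real assignment is
wq->ie = &idxd->irq_entries[wq->id + 1] in __drv_enable_wq() below):

/*
 * irq_entries[0]        - misc/error interrupt, no WQ, no int handle
 * irq_entries[i], i > 0 - completion vector associated with wqs[i - 1]
 */
static struct idxd_irq_entry *wq_irq_entry(struct idxd_wq *wq)
{
	return wq->ie;	/* set once at WQ enable, no +1 arithmetic at submit */
}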
Reviewed-by: Kevin Tian Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163528417065.3925689.11505755433684476288.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 8 ++++ drivers/dma/idxd/idxd.h | 10 ++++- drivers/dma/idxd/init.c | 86 ++++++++++++++++++++------------------- drivers/dma/idxd/submit.c | 6 +-- drivers/dma/idxd/sysfs.c | 1 - 5 files changed, 64 insertions(+), 47 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index fab412349f7f..f381319615fd 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -1206,6 +1206,13 @@ int __drv_enable_wq(struct idxd_wq *wq) goto err; } + /* + * Device has 1 misc interrupt and N interrupts for descriptor completion. To + * assign WQ to interrupt, we will take the N+1 interrupt since vector 0 is + * for the misc interrupt. + */ + wq->ie = &idxd->irq_entries[wq->id + 1]; + rc = idxd_wq_enable(wq); if (rc < 0) { dev_dbg(dev, "wq %d enabling failed: %d\n", wq->id, rc); @@ -1256,6 +1263,7 @@ void __drv_disable_wq(struct idxd_wq *wq) idxd_wq_drain(wq); idxd_wq_reset(wq); + wq->ie = NULL; wq->client_count = 0; } diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 3d600f8ee90b..355159d4ee68 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include "registers.h" @@ -64,6 +65,7 @@ extern struct idxd_device_driver idxd_drv; extern struct idxd_device_driver idxd_dmaengine_drv; extern struct idxd_device_driver idxd_user_drv; +#define INVALID_INT_HANDLE -1 struct idxd_irq_entry { struct idxd_device *idxd; int id; @@ -75,6 +77,9 @@ struct idxd_irq_entry { * and irq thread processing error descriptor. */ spinlock_t list_lock; + int int_handle; + struct idxd_wq *wq; + ioasid_t pasid; }; struct idxd_group { @@ -171,6 +176,7 @@ struct idxd_wq { struct wait_queue_head err_queue; struct idxd_device *idxd; int id; + struct idxd_irq_entry *ie; enum idxd_wq_type type; struct idxd_group *group; int client_count; @@ -266,6 +272,8 @@ struct idxd_device { unsigned int pasid; int num_groups; + int irq_cnt; + bool request_int_handles; u32 msix_perm_offset; u32 wqcfg_offset; @@ -292,8 +300,6 @@ struct idxd_device { struct workqueue_struct *wq; struct work_struct work; - int *int_handles; - struct idxd_pmu *idxd_pmu; }; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 4373b48cdc91..2dbff722e207 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -81,6 +81,7 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) dev_err(dev, "Not MSI-X interrupt capable.\n"); return -ENOSPC; } + idxd->irq_cnt = msixcnt; rc = pci_alloc_irq_vectors(pdev, msixcnt, msixcnt, PCI_IRQ_MSIX); if (rc != msixcnt) { @@ -103,7 +104,18 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) for (i = 0; i < msixcnt; i++) { idxd->irq_entries[i].id = i; idxd->irq_entries[i].idxd = idxd; + /* + * Association of WQ should be assigned starting with irq_entry 1. 
+ * irq_entry 0 is for misc interrupts and has no wq association + */ + if (i > 0) + idxd->irq_entries[i].wq = idxd->wqs[i - 1]; idxd->irq_entries[i].vector = pci_irq_vector(pdev, i); + idxd->irq_entries[i].int_handle = INVALID_INT_HANDLE; + if (device_pasid_enabled(idxd) && i > 0) + idxd->irq_entries[i].pasid = idxd->pasid; + else + idxd->irq_entries[i].pasid = INVALID_IOASID; spin_lock_init(&idxd->irq_entries[i].list_lock); } @@ -135,22 +147,14 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) } dev_dbg(dev, "Allocated idxd-msix %d for vector %d\n", i, irq_entry->vector); - if (idxd->hw.cmd_cap & BIT(IDXD_CMD_REQUEST_INT_HANDLE)) { - /* - * The MSIX vector enumeration starts at 1 with vector 0 being the - * misc interrupt that handles non I/O completion events. The - * interrupt handles are for IMS enumeration on guest. The misc - * interrupt vector does not require a handle and therefore we start - * the int_handles at index 0. Since 'i' starts at 1, the first - * int_handles index will be 0. - */ - rc = idxd_device_request_int_handle(idxd, i, &idxd->int_handles[i - 1], + if (idxd->request_int_handles) { + rc = idxd_device_request_int_handle(idxd, i, &irq_entry->int_handle, IDXD_IRQ_MSIX); if (rc < 0) { free_irq(irq_entry->vector, irq_entry); goto err_wq_irqs; } - dev_dbg(dev, "int handle requested: %u\n", idxd->int_handles[i - 1]); + dev_dbg(dev, "int handle requested: %u\n", irq_entry->int_handle); } } @@ -161,9 +165,15 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) while (--i >= 0) { irq_entry = &idxd->irq_entries[i]; free_irq(irq_entry->vector, irq_entry); - if (i != 0) - idxd_device_release_int_handle(idxd, - idxd->int_handles[i], IDXD_IRQ_MSIX); + if (irq_entry->int_handle != INVALID_INT_HANDLE) { + idxd_device_release_int_handle(idxd, irq_entry->int_handle, + IDXD_IRQ_MSIX); + irq_entry->int_handle = INVALID_INT_HANDLE; + irq_entry->pasid = INVALID_IOASID; + } + irq_entry->vector = -1; + irq_entry->wq = NULL; + irq_entry->idxd = NULL; } err_misc_irq: /* Disable error interrupt generation */ @@ -179,21 +189,19 @@ static void idxd_cleanup_interrupts(struct idxd_device *idxd) { struct pci_dev *pdev = idxd->pdev; struct idxd_irq_entry *irq_entry; - int i, msixcnt; - - msixcnt = pci_msix_vec_count(pdev); - if (msixcnt <= 0) - return; - - irq_entry = &idxd->irq_entries[0]; - free_irq(irq_entry->vector, irq_entry); - - for (i = 1; i < msixcnt; i++) { + int i; + for (i = 0; i < idxd->irq_cnt; i++) { irq_entry = &idxd->irq_entries[i]; - if (idxd->hw.cmd_cap & BIT(IDXD_CMD_RELEASE_INT_HANDLE)) - idxd_device_release_int_handle(idxd, idxd->int_handles[i], + if (irq_entry->int_handle != INVALID_INT_HANDLE) { + idxd_device_release_int_handle(idxd, irq_entry->int_handle, IDXD_IRQ_MSIX); + irq_entry->int_handle = INVALID_INT_HANDLE; + irq_entry->pasid = INVALID_IOASID; + } + irq_entry->vector = -1; + irq_entry->wq = NULL; + irq_entry->idxd = NULL; free_irq(irq_entry->vector, irq_entry); } @@ -379,13 +387,6 @@ static int idxd_setup_internals(struct idxd_device *idxd) init_waitqueue_head(&idxd->cmd_waitq); - if (idxd->hw.cmd_cap & BIT(IDXD_CMD_REQUEST_INT_HANDLE)) { - idxd->int_handles = kcalloc_node(idxd->max_wqs, sizeof(int), GFP_KERNEL, - dev_to_node(dev)); - if (!idxd->int_handles) - return -ENOMEM; - } - rc = idxd_setup_wqs(idxd); if (rc < 0) goto err_wqs; @@ -416,7 +417,6 @@ static int idxd_setup_internals(struct idxd_device *idxd) for (i = 0; i < idxd->max_wqs; i++) put_device(wq_confdev(idxd->wqs[i])); err_wqs: - kfree(idxd->int_handles); return rc; } @@ -451,6 
+451,10 @@ static void idxd_read_caps(struct idxd_device *idxd) dev_dbg(dev, "cmd_cap: %#x\n", idxd->hw.cmd_cap); } + /* reading command capabilities */ + if (idxd->hw.cmd_cap & BIT(IDXD_CMD_REQUEST_INT_HANDLE)) + idxd->request_int_handles = true; + idxd->max_xfer_bytes = 1ULL << idxd->hw.gen_cap.max_xfer_shift; dev_dbg(dev, "max xfer size: %llu bytes\n", idxd->max_xfer_bytes); idxd->max_batch_size = 1U << idxd->hw.gen_cap.max_batch_shift; @@ -748,15 +752,15 @@ static void idxd_release_int_handles(struct idxd_device *idxd) struct device *dev = &idxd->pdev->dev; int i, rc; - for (i = 0; i < idxd->num_wq_irqs; i++) { - if (idxd->hw.cmd_cap & BIT(IDXD_CMD_RELEASE_INT_HANDLE)) { - rc = idxd_device_release_int_handle(idxd, idxd->int_handles[i], - IDXD_IRQ_MSIX); + for (i = 1; i < idxd->irq_cnt; i++) { + struct idxd_irq_entry *ie = &idxd->irq_entries[i]; + + if (ie->int_handle != INVALID_INT_HANDLE) { + rc = idxd_device_release_int_handle(idxd, ie->int_handle, IDXD_IRQ_MSIX); if (rc < 0) - dev_warn(dev, "irq handle %d release failed\n", - idxd->int_handles[i]); + dev_warn(dev, "irq handle %d release failed\n", ie->int_handle); else - dev_dbg(dev, "int handle requested: %u\n", idxd->int_handles[i]); + dev_dbg(dev, "int handle released: %u\n", ie->int_handle); } } } diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index e5c3689851ea..960fbee684b2 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -25,10 +25,10 @@ static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu) * On host, MSIX vecotr 0 is used for misc interrupt. Therefore when we match * vector 1:1 to the WQ id, we need to add 1 */ - if (!idxd->int_handles) + if (wq->ie->int_handle == INVALID_INT_HANDLE) desc->hw->int_handle = wq->id + 1; else - desc->hw->int_handle = idxd->int_handles[wq->id]; + desc->hw->int_handle = wq->ie->int_handle; return desc; } @@ -175,7 +175,7 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) * that we designated the descriptor to. */ if (desc->hw->flags & IDXD_OP_FLAG_RCI) { - ie = &idxd->irq_entries[wq->id + 1]; + ie = wq->ie; llist_add(&desc->llnode, &ie->pending_llist); } diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index a9025be940db..90857e776273 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1269,7 +1269,6 @@ static void idxd_conf_device_release(struct device *dev) kfree(idxd->wqs); kfree(idxd->engines); kfree(idxd->irq_entries); - kfree(idxd->int_handles); ida_free(&idxd_ida, idxd->id); kfree(idxd); } -- Gitee From 67bdaf24c1928ad5d2ee62fa00433a2aa2336413 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 26 Oct 2021 14:36:17 -0700 Subject: [PATCH 1697/2161] dmaengine: idxd: move interrupt handle assignment commit eb0cf33a91b46cd50b590d032471f7f977d5a92a upstream. In preparation of supporting interrupt handle revoke event, move the interrupt handle assignment to right before the descriptor to be submitted. This allows the interrupt handle revoke logic to assign the latest interrupt handle on submission. 
Reviewed-by: Kevin Tian Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163528417767.3925689.7730411152122952808.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/submit.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 960fbee684b2..c3dc450134d8 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -21,15 +21,6 @@ static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu) if (device_pasid_enabled(idxd)) desc->hw->pasid = idxd->pasid; - /* - * On host, MSIX vecotr 0 is used for misc interrupt. Therefore when we match - * vector 1:1 to the WQ id, we need to add 1 - */ - if (wq->ie->int_handle == INVALID_INT_HANDLE) - desc->hw->int_handle = wq->id + 1; - else - desc->hw->int_handle = wq->ie->int_handle; - return desc; } @@ -176,6 +167,11 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) */ if (desc->hw->flags & IDXD_OP_FLAG_RCI) { ie = wq->ie; + if (ie->int_handle == INVALID_INT_HANDLE) + desc->hw->int_handle = ie->id; + else + desc->hw->int_handle = ie->int_handle; + llist_add(&desc->llnode, &ie->pending_llist); } -- Gitee From 822428af826160b63975e4d849f096919ffd5390 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 26 Oct 2021 14:36:23 -0700 Subject: [PATCH 1698/2161] dmaengine: idxd: add helper for per interrupt handle drain commit 46c6df1c958e55558212cfa94cad201eae48d684 upstream. The helper is called at the completion of the interrupt handle refresh event. It issues drain descriptors to each of the wq with associated interrupt handle. The drain descriptor will have interrupt request set but without completion record. This will ensure all descriptors with incorrect interrupt completion handle get drained and a completion interrupt is triggered for the guest driver to process them. Reviewed-by: Kevin Tian Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163528418315.3925689.7944718440052849626.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/irq.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 47b6fc894a23..f980486db590 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -55,6 +55,45 @@ static void idxd_device_reinit(struct work_struct *work) idxd_device_clear_state(idxd); } +/* + * The function sends a drain descriptor for the interrupt handle. The drain ensures + * all descriptors with this interrupt handle is flushed and the interrupt + * will allow the cleanup of the outstanding descriptors. + */ +static void idxd_int_handle_revoke_drain(struct idxd_irq_entry *ie) +{ + struct idxd_wq *wq = ie->wq; + struct idxd_device *idxd = ie->idxd; + struct device *dev = &idxd->pdev->dev; + struct dsa_hw_desc desc = {}; + void __iomem *portal; + int rc; + + /* Issue a simple drain operation with interrupt but no completion record */ + desc.flags = IDXD_OP_FLAG_RCI; + desc.opcode = DSA_OPCODE_DRAIN; + desc.priv = 1; + + if (ie->pasid != INVALID_IOASID) + desc.pasid = ie->pasid; + desc.int_handle = ie->int_handle; + portal = idxd_wq_portal_addr(wq); + + /* + * The wmb() makes sure that the descriptor is all there before we + * issue. 
+ */ + wmb(); + if (wq_dedicated(wq)) { + iosubmit_cmds512(portal, &desc, 1); + } else { + rc = enqcmds(portal, &desc); + /* This should not fail unless hardware failed. */ + if (rc < 0) + dev_warn(dev, "Failed to submit drain desc on wq %d\n", wq->id); + } +} + static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) { struct device *dev = &idxd->pdev->dev; -- Gitee From e71e095d592cb4a624061738810c0b26f3c2ddf8 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 26 Oct 2021 14:36:29 -0700 Subject: [PATCH 1699/2161] dmaengine: idxd: create locked version of idxd_quiesce() call commit bd5970a0d01f8e45af9b2e2cf1d245b84ea757ba upstream. Add a locked version of idxd_quiesce() call so that the quiesce can be called with a lock in situations where the lock is not held by the caller. In the driver probe/remove path, the lock is already held, so the raw version can be called w/o locking. Reviewed-by: Kevin Tian Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163528418980.3925689.5841907054957931211.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 10 +++++++++- drivers/dma/idxd/dma.c | 4 ++-- drivers/dma/idxd/idxd.h | 1 + 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index f381319615fd..943e9967627b 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -411,12 +411,20 @@ int idxd_wq_init_percpu_ref(struct idxd_wq *wq) return 0; } -void idxd_wq_quiesce(struct idxd_wq *wq) +void __idxd_wq_quiesce(struct idxd_wq *wq) { + lockdep_assert_held(&wq->wq_lock); percpu_ref_kill(&wq->wq_active); wait_for_completion(&wq->wq_dead); } +void idxd_wq_quiesce(struct idxd_wq *wq) +{ + mutex_lock(&wq->wq_lock); + __idxd_wq_quiesce(wq); + mutex_unlock(&wq->wq_lock); +} + /* Device control bits */ static inline bool idxd_is_enabled(struct idxd_device *idxd) { diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index 1ea663215909..375dbae18583 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -316,7 +316,7 @@ static int idxd_dmaengine_drv_probe(struct idxd_dev *idxd_dev) return 0; err_dma: - idxd_wq_quiesce(wq); + __idxd_wq_quiesce(wq); percpu_ref_exit(&wq->wq_active); err_ref: idxd_wq_free_resources(wq); @@ -333,7 +333,7 @@ static void idxd_dmaengine_drv_remove(struct idxd_dev *idxd_dev) struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); mutex_lock(&wq->wq_lock); - idxd_wq_quiesce(wq); + __idxd_wq_quiesce(wq); idxd_unregister_dma_channel(wq); idxd_wq_free_resources(wq); __drv_disable_wq(wq); diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 355159d4ee68..970701738c8a 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -570,6 +570,7 @@ int idxd_wq_map_portal(struct idxd_wq *wq); void idxd_wq_unmap_portal(struct idxd_wq *wq); int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid); int idxd_wq_disable_pasid(struct idxd_wq *wq); +void __idxd_wq_quiesce(struct idxd_wq *wq); void idxd_wq_quiesce(struct idxd_wq *wq); int idxd_wq_init_percpu_ref(struct idxd_wq *wq); -- Gitee From ae51d217378bd2237d5625b46f0bcdf00346fd02 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 26 Oct 2021 14:36:36 -0700 Subject: [PATCH 1700/2161] dmaengine: idxd: handle invalid interrupt handle descriptors commit f6d442f7088cbf5e2ac4561aca6888380239d5b9 upstream. Handle a descriptor that has been marked with invalid interrupt handle error in status. 
Create a work item that will resubmit the descriptor. This typically happens when the driver has handled the revoke interrupt handle event and has a new interrupt handle. Reviewed-by: Kevin Tian Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163528419601.3925689.4166517602890523193.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/dma.c | 14 ++++++++---- drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/irq.c | 50 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 4 deletions(-) diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index 375dbae18583..2ce873994e33 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -24,18 +24,24 @@ void idxd_dma_complete_txd(struct idxd_desc *desc, enum idxd_complete_type comp_type, bool free_desc) { + struct idxd_device *idxd = desc->wq->idxd; struct dma_async_tx_descriptor *tx; struct dmaengine_result res; int complete = 1; - if (desc->completion->status == DSA_COMP_SUCCESS) + if (desc->completion->status == DSA_COMP_SUCCESS) { res.result = DMA_TRANS_NOERROR; - else if (desc->completion->status) + } else if (desc->completion->status) { + if (idxd->request_int_handles && comp_type != IDXD_COMPLETE_ABORT && + desc->completion->status == DSA_COMP_INT_HANDLE_INVAL && + idxd_queue_int_handle_resubmit(desc)) + return; res.result = DMA_TRANS_WRITE_FAILED; - else if (comp_type == IDXD_COMPLETE_ABORT) + } else if (comp_type == IDXD_COMPLETE_ABORT) { res.result = DMA_TRANS_ABORTED; - else + } else { complete = 0; + } tx = &desc->txd; if (complete && tx->cookie) { diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 970701738c8a..82c4915f58a2 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -524,6 +524,7 @@ void idxd_unregister_devices(struct idxd_device *idxd); int idxd_register_driver(void); void idxd_unregister_driver(void); void idxd_wqs_quiesce(struct idxd_device *idxd); +bool idxd_queue_int_handle_resubmit(struct idxd_desc *desc); /* device interrupt control */ void idxd_msix_perm_setup(struct idxd_device *idxd); diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index f980486db590..8193f6f30719 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -22,6 +22,11 @@ struct idxd_fault { struct idxd_device *idxd; }; +struct idxd_resubmit { + struct work_struct work; + struct idxd_desc *desc; +}; + static void idxd_device_reinit(struct work_struct *work) { struct idxd_device *idxd = container_of(work, struct idxd_device, work); @@ -216,6 +221,51 @@ irqreturn_t idxd_misc_thread(int vec, void *data) return IRQ_HANDLED; } +static void idxd_int_handle_resubmit_work(struct work_struct *work) +{ + struct idxd_resubmit *irw = container_of(work, struct idxd_resubmit, work); + struct idxd_desc *desc = irw->desc; + struct idxd_wq *wq = desc->wq; + int rc; + + desc->completion->status = 0; + rc = idxd_submit_desc(wq, desc); + if (rc < 0) { + dev_dbg(&wq->idxd->pdev->dev, "Failed to resubmit desc %d to wq %d.\n", + desc->id, wq->id); + /* + * If the error is not -EAGAIN, it means the submission failed due to wq + * has been killed instead of ENQCMDS failure. Here the driver needs to + * notify the submitter of the failure by reporting abort status. + * + * -EAGAIN comes from ENQCMDS failure. idxd_submit_desc() will handle the + * abort. 
+ */ + if (rc != -EAGAIN) { + desc->completion->status = IDXD_COMP_DESC_ABORT; + idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT, false); + } + idxd_free_desc(wq, desc); + } + kfree(irw); +} + +bool idxd_queue_int_handle_resubmit(struct idxd_desc *desc) +{ + struct idxd_wq *wq = desc->wq; + struct idxd_device *idxd = wq->idxd; + struct idxd_resubmit *irw; + + irw = kzalloc(sizeof(*irw), GFP_KERNEL); + if (!irw) + return false; + + irw->desc = desc; + INIT_WORK(&irw->work, idxd_int_handle_resubmit_work); + queue_work(idxd->wq, &irw->work); + return true; +} + static void irq_process_pending_llist(struct idxd_irq_entry *irq_entry) { struct idxd_desc *desc, *t; -- Gitee From 0793ea028f466e1b5f54cac6c5490d29a1f4affe Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 26 Oct 2021 14:36:41 -0700 Subject: [PATCH 1701/2161] dmaengine: idxd: handle interrupt handle revoked event commit 56fc39f5a36794c4f27f5fee047b641eac3f5b89 upstream. "Interrupt handle revoked" is an event that happens when the driver is running on a guest kernel and the VM is migrated to a new machine. The device will trigger an interrupt that signals to the guest driver that the interrupt handles need to be replaced. The misc irq thread function calls a helper function to handle the event. The function uses the WQ percpu_ref to quiesce the kernel submissions. It then replaces the interrupt handles by requesting interrupt handle command for each I/O MSIX vector. Once the handle is updated, the driver will unblock the submission path to allow new submissions. The submitter will attempt to acquire a percpu_ref before submission. When the request fails, it will wait on the wq_resurrect 'completion'. The driver does anticipate the possibility of descriptors being submitted before the WQ percpu_ref is killed. If a descriptor has already been submitted, it will return with incorrect interrupt handle status. The descriptor will be re-submitted with the new interrupt handle on the completion path. For descriptors with incorrect interrupt handles, completion interrupt won't be triggered. At the completion of the interrupt handle refresh, the handling function will call idxd_int_handle_refresh_drain() to issue drain descriptors to each of the wq with associated interrupt handle. The drain descriptor will have interrupt request set but without completion record. This will ensure all descriptors with incorrect interrupt completion handle get drained and a completion interrupt is triggered for the guest driver to process them. 
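For reference, the quiesce/replace/resume sequence described above reduces to a percpu_ref paired with two completions. Below is a minimal sketch of that pattern; it is not part of the patch, and the struct and function names (my_wq, my_wq_replace_int_handle) are illustrative only.

#include <linux/kernel.h>
#include <linux/completion.h>
#include <linux/percpu-refcount.h>

struct my_wq {
	struct percpu_ref active;
	struct completion dead;		/* completed by the ref release callback */
	struct completion resurrect;	/* submitters park here while paused */
};

static void my_wq_ref_release(struct percpu_ref *ref)
{
	struct my_wq *wq = container_of(ref, struct my_wq, active);

	complete(&wq->dead);
}

static int my_wq_init(struct my_wq *wq)
{
	init_completion(&wq->dead);
	init_completion(&wq->resurrect);
	/* PERCPU_REF_ALLOW_REINIT so the ref can be revived after a kill */
	return percpu_ref_init(&wq->active, my_wq_ref_release,
			       PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
}

static void my_wq_replace_int_handle(struct my_wq *wq, int *handle, int new_handle)
{
	reinit_completion(&wq->dead);
	reinit_completion(&wq->resurrect);
	percpu_ref_kill(&wq->active);	/* new percpu_ref_tryget_live() now fails */
	wait_for_completion(&wq->dead);	/* in-flight submitters have dropped out */

	*handle = new_handle;		/* swap in the refreshed interrupt handle */

	percpu_ref_reinit(&wq->active);	/* accept submissions again */
	complete_all(&wq->resurrect);	/* wake submitters parked on the pause */
}

A submitter that fails percpu_ref_tryget_live() simply waits on the resurrect completion and retries once, which is what the submit.c hunk in this patch does.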
Reviewed-by: Kevin Tian Co-Developed-by: Sanjay Kumar Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163528420189.3925689.18212568593220415551.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 6 +- drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/init.c | 1 + drivers/dma/idxd/irq.c | 137 +++++++++++++++++++++++++++++++++++ drivers/dma/idxd/registers.h | 1 + drivers/dma/idxd/submit.c | 10 ++- 6 files changed, 152 insertions(+), 4 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 943e9967627b..1dc5245107df 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -404,17 +404,21 @@ int idxd_wq_init_percpu_ref(struct idxd_wq *wq) int rc; memset(&wq->wq_active, 0, sizeof(wq->wq_active)); - rc = percpu_ref_init(&wq->wq_active, idxd_wq_ref_release, 0, GFP_KERNEL); + rc = percpu_ref_init(&wq->wq_active, idxd_wq_ref_release, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); if (rc < 0) return rc; reinit_completion(&wq->wq_dead); + reinit_completion(&wq->wq_resurrect); return 0; } void __idxd_wq_quiesce(struct idxd_wq *wq) { lockdep_assert_held(&wq->wq_lock); + reinit_completion(&wq->wq_resurrect); percpu_ref_kill(&wq->wq_active); + complete_all(&wq->wq_resurrect); wait_for_completion(&wq->wq_dead); } diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 82c4915f58a2..51e79201636c 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -171,6 +171,7 @@ struct idxd_wq { u32 portal_offset; struct percpu_ref wq_active; struct completion wq_dead; + struct completion wq_resurrect; struct idxd_dev idxd_dev; struct idxd_cdev *idxd_cdev; struct wait_queue_head err_queue; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 2dbff722e207..912839bf0be3 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -245,6 +245,7 @@ static int idxd_setup_wqs(struct idxd_device *idxd) mutex_init(&wq->wq_lock); init_waitqueue_head(&wq->err_queue); init_completion(&wq->wq_dead); + init_completion(&wq->wq_resurrect); wq->max_xfer_bytes = idxd->max_xfer_bytes; wq->max_batch_size = idxd->max_batch_size; wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev)); diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 8193f6f30719..8a9eaeffc6f5 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "../dmaengine.h" #include "idxd.h" @@ -27,6 +28,11 @@ struct idxd_resubmit { struct idxd_desc *desc; }; +struct idxd_int_handle_revoke { + struct work_struct work; + struct idxd_device *idxd; +}; + static void idxd_device_reinit(struct work_struct *work) { struct idxd_device *idxd = container_of(work, struct idxd_device, work); @@ -99,6 +105,120 @@ static void idxd_int_handle_revoke_drain(struct idxd_irq_entry *ie) } } +static void idxd_abort_invalid_int_handle_descs(struct idxd_irq_entry *ie) +{ + LIST_HEAD(flist); + struct idxd_desc *d, *t; + struct llist_node *head; + + spin_lock(&ie->list_lock); + head = llist_del_all(&ie->pending_llist); + if (head) { + llist_for_each_entry_safe(d, t, head, llnode) + list_add_tail(&d->list, &ie->work_list); + } + + list_for_each_entry_safe(d, t, &ie->work_list, list) { + if (d->completion->status == DSA_COMP_INT_HANDLE_INVAL) + list_move_tail(&d->list, &flist); + } + spin_unlock(&ie->list_lock); + + list_for_each_entry_safe(d, t, &flist, list) { + list_del(&d->list); + 
idxd_dma_complete_txd(d, IDXD_COMPLETE_ABORT, true); + } +} + +static void idxd_int_handle_revoke(struct work_struct *work) +{ + struct idxd_int_handle_revoke *revoke = + container_of(work, struct idxd_int_handle_revoke, work); + struct idxd_device *idxd = revoke->idxd; + struct pci_dev *pdev = idxd->pdev; + struct device *dev = &pdev->dev; + int i, new_handle, rc; + + if (!idxd->request_int_handles) { + kfree(revoke); + dev_warn(dev, "Unexpected int handle refresh interrupt.\n"); + return; + } + + /* + * The loop attempts to acquire new interrupt handle for all interrupt + * vectors that supports a handle. If a new interrupt handle is acquired and the + * wq is kernel type, the driver will kill the percpu_ref to pause all + * ongoing descriptor submissions. The interrupt handle is then changed. + * After change, the percpu_ref is revived and all the pending submissions + * are woken to try again. A drain is sent to for the interrupt handle + * at the end to make sure all invalid int handle descriptors are processed. + */ + for (i = 1; i < idxd->irq_cnt; i++) { + struct idxd_irq_entry *ie = &idxd->irq_entries[i]; + struct idxd_wq *wq = ie->wq; + + rc = idxd_device_request_int_handle(idxd, i, &new_handle, IDXD_IRQ_MSIX); + if (rc < 0) { + dev_warn(dev, "get int handle %d failed: %d\n", i, rc); + /* + * Failed to acquire new interrupt handle. Kill the WQ + * and release all the pending submitters. The submitters will + * get error return code and handle appropriately. + */ + ie->int_handle = INVALID_INT_HANDLE; + idxd_wq_quiesce(wq); + idxd_abort_invalid_int_handle_descs(ie); + continue; + } + + /* No change in interrupt handle, nothing needs to be done */ + if (ie->int_handle == new_handle) + continue; + + if (wq->state != IDXD_WQ_ENABLED || wq->type != IDXD_WQT_KERNEL) { + /* + * All the MSIX interrupts are allocated at once during probe. + * Therefore we need to update all interrupts even if the WQ + * isn't supporting interrupt operations. + */ + ie->int_handle = new_handle; + continue; + } + + mutex_lock(&wq->wq_lock); + reinit_completion(&wq->wq_resurrect); + + /* Kill percpu_ref to pause additional descriptor submissions */ + percpu_ref_kill(&wq->wq_active); + + /* Wait for all submitters quiesce before we change interrupt handle */ + wait_for_completion(&wq->wq_dead); + + ie->int_handle = new_handle; + + /* Revive percpu ref and wake up all the waiting submitters */ + percpu_ref_reinit(&wq->wq_active); + complete_all(&wq->wq_resurrect); + mutex_unlock(&wq->wq_lock); + + /* + * The delay here is to wait for all possible MOVDIR64B that + * are issued before percpu_ref_kill() has happened to have + * reached the PCIe domain before the drain is issued. The driver + * needs to ensure that the drain descriptor issued does not pass + * all the other issued descriptors that contain the invalid + * interrupt handle in order to ensure that the drain descriptor + * interrupt will allow the cleanup of all the descriptors with + * invalid interrupt handle. 
+ */ + if (wq_dedicated(wq)) + udelay(100); + idxd_int_handle_revoke_drain(ie); + } + kfree(revoke); +} + static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) { struct device *dev = &idxd->pdev->dev; @@ -145,6 +265,23 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) err = true; } + if (cause & IDXD_INTC_INT_HANDLE_REVOKED) { + struct idxd_int_handle_revoke *revoke; + + val |= IDXD_INTC_INT_HANDLE_REVOKED; + + revoke = kzalloc(sizeof(*revoke), GFP_ATOMIC); + if (revoke) { + revoke->idxd = idxd; + INIT_WORK(&revoke->work, idxd_int_handle_revoke); + queue_work(idxd->wq, &revoke->work); + + } else { + dev_err(dev, "Failed to allocate work for int handle revoke\n"); + idxd_wqs_quiesce(idxd); + } + } + if (cause & IDXD_INTC_CMD) { val |= IDXD_INTC_CMD; complete(idxd->cmd_done); diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 262c8220adbd..8e396698c22b 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -158,6 +158,7 @@ enum idxd_device_reset_type { #define IDXD_INTC_OCCUPY 0x04 #define IDXD_INTC_PERFMON_OVFL 0x08 #define IDXD_INTC_HALT_STATE 0x10 +#define IDXD_INTC_INT_HANDLE_REVOKED 0x80000000 #define IDXD_CMD_OFFSET 0xa0 union idxd_command_reg { diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index c3dc450134d8..2060a9c2de9c 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -143,14 +143,18 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) { struct idxd_device *idxd = wq->idxd; struct idxd_irq_entry *ie = NULL; + u32 desc_flags = desc->hw->flags; void __iomem *portal; int rc; if (idxd->state != IDXD_DEV_ENABLED) return -EIO; - if (!percpu_ref_tryget_live(&wq->wq_active)) - return -ENXIO; + if (!percpu_ref_tryget_live(&wq->wq_active)) { + wait_for_completion(&wq->wq_resurrect); + if (!percpu_ref_tryget_live(&wq->wq_active)) + return -ENXIO; + } portal = idxd_wq_portal_addr(wq); @@ -165,7 +169,7 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) * Pending the descriptor to the lockless list for the irq_entry * that we designated the descriptor to. */ - if (desc->hw->flags & IDXD_OP_FLAG_RCI) { + if (desc_flags & IDXD_OP_FLAG_RCI) { ie = wq->ie; if (ie->int_handle == INVALID_INT_HANDLE) desc->hw->int_handle = ie->id; -- Gitee From d8a9dd4ba7d794b1e099dabe61e090652a19df17 Mon Sep 17 00:00:00 2001 From: Adrian Larumbe Date: Mon, 1 Nov 2021 18:08:23 +0000 Subject: [PATCH 1702/2161] dmaengine: Add documentation for new memcpy scatter-gather function commit 58fe107660483138a7a77acd673b911016e4ad31 upstream. Documentation describes semantics, limitations and a typical use case scenario. Signed-off-by: Adrian Larumbe Link: https://lore.kernel.org/r/20211101180825.241048-2-adrianml@alumnos.upm.es Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../driver-api/dmaengine/provider.rst | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/Documentation/driver-api/dmaengine/provider.rst b/Documentation/driver-api/dmaengine/provider.rst index 3b6bb2bc5d1f..16cf096527ee 100644 --- a/Documentation/driver-api/dmaengine/provider.rst +++ b/Documentation/driver-api/dmaengine/provider.rst @@ -162,6 +162,29 @@ Currently, the types available are: - The device is able to do memory to memory copies +- - DMA_MEMCPY_SG + + - The device supports memory to memory scatter-gather transfers. 
+ + - Even though a plain memcpy can look like a particular case of a + scatter-gather transfer, with a single chunk to copy, it's a distinct + transaction type in the mem2mem transfer case. This is because some very + simple devices might be able to do contiguous single-chunk memory copies, + but have no support for more complex SG transfers. + + - No matter what the overall size of the combined chunks for source and + destination is, only as many bytes as the smallest of the two will be + transmitted. That means the number and size of the scatter-gather buffers in + both lists need not be the same, and that the operation functionally is + equivalent to a ``strncpy`` where the ``count`` argument equals the smallest + total size of the two scatter-gather list buffers. + + - It's usually used for copying pixel data between host memory and + memory-mapped GPU device memory, such as found on modern PCI video graphics + cards. The most immediate example is the OpenGL API function + ``glReadPielx()``, which might require a verbatim copy of a huge framebuffer + from local device memory onto host memory. + - DMA_XOR - The device is able to perform XOR operations on memory areas -- Gitee From 04d66ddb5f7d40ce455c075cd975c496e03466f4 Mon Sep 17 00:00:00 2001 From: Adrian Larumbe Date: Mon, 1 Nov 2021 18:08:24 +0000 Subject: [PATCH 1703/2161] dmaengine: Add core function and capability check for DMA_MEMCPY_SG commit 3218910fd5858842a1dd98ce92b602f0878f8210 upstream. This is the old DMA_SG interface that was removed in commit c678fa66341c ("dmaengine: remove DMA_SG as it is dead code in kernel"). It has been renamed to DMA_MEMCPY_SG to better match the MEMSET and MEMSET_SG naming convention. It should only be used for mem2mem copies, either main system memory or CPU-addressable device memory (like video memory on a PCI graphics card). Bringing back this interface was prompted by the need to use the Xilinx CDMA device for mem2mem SG transfers. 
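As a rough illustration of how a client might consume the new capability, the sketch below requests a DMA_MEMCPY_SG-capable channel and runs one transfer through the dmaengine_prep_dma_memcpy_sg() helper introduced by this patch. It is not part of the patch; the caller is hypothetical, the scatterlists are assumed to be already DMA-mapped, and error handling is trimmed.

#include <linux/dmaengine.h>
#include <linux/scatterlist.h>
#include <linux/err.h>

static int my_sg_copy(struct scatterlist *dst_sg, unsigned int dst_nents,
		      struct scatterlist *src_sg, unsigned int src_nents)
{
	struct dma_async_tx_descriptor *tx;
	struct dma_chan *chan;
	dma_cap_mask_t mask;
	dma_cookie_t cookie;
	int ret = 0;

	dma_cap_zero(mask);
	dma_cap_set(DMA_MEMCPY_SG, mask);

	chan = dma_request_chan_by_mask(&mask);	/* any channel advertising the cap */
	if (IS_ERR(chan))
		return PTR_ERR(chan);

	tx = dmaengine_prep_dma_memcpy_sg(chan, dst_sg, dst_nents,
					  src_sg, src_nents, DMA_PREP_INTERRUPT);
	if (!tx) {
		ret = -EIO;
		goto out;
	}

	cookie = dmaengine_submit(tx);
	dma_async_issue_pending(chan);
	if (dma_sync_wait(chan, cookie) != DMA_COMPLETE)	/* simplest completion wait */
		ret = -EIO;
out:
	dma_release_channel(chan);
	return ret;
}

Note that, per the documentation added in the previous patch, only as many bytes as the smaller of the two scatter-gather lists describe are copied.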
Signed-off-by: Adrian Larumbe Link: https://lore.kernel.org/r/20211101180825.241048-3-adrianml@alumnos.upm.es Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.c | 7 +++++++ include/linux/dmaengine.h | 20 ++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index d9f7c097cfd6..2cfa8458b51b 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -1159,6 +1159,13 @@ int dma_async_device_register(struct dma_device *device) return -EIO; } + if (dma_has_cap(DMA_MEMCPY_SG, device->cap_mask) && !device->device_prep_dma_memcpy_sg) { + dev_err(device->dev, + "Device claims capability %s, but op is not defined\n", + "DMA_MEMCPY_SG"); + return -EIO; + } + if (dma_has_cap(DMA_XOR, device->cap_mask) && !device->device_prep_dma_xor) { dev_err(device->dev, "Device claims capability %s, but op is not defined\n", diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index b99e78ddc930..97ad078d1288 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -50,6 +50,7 @@ enum dma_status { */ enum dma_transaction_type { DMA_MEMCPY, + DMA_MEMCPY_SG, DMA_XOR, DMA_PQ, DMA_XOR_VAL, @@ -864,6 +865,11 @@ struct dma_device { struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)( struct dma_chan *chan, dma_addr_t dst, dma_addr_t src, size_t len, unsigned long flags); + struct dma_async_tx_descriptor *(*device_prep_dma_memcpy_sg)( + struct dma_chan *chan, + struct scatterlist *dst_sg, unsigned int dst_nents, + struct scatterlist *src_sg, unsigned int src_nents, + unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_xor)( struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src, unsigned int src_cnt, size_t len, unsigned long flags); @@ -1023,6 +1029,20 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memcpy( len, flags); } +static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memcpy_sg( + struct dma_chan *chan, + struct scatterlist *dst_sg, unsigned int dst_nents, + struct scatterlist *src_sg, unsigned int src_nents, + unsigned long flags) +{ + if (!chan || !chan->device || !chan->device->device_prep_dma_memcpy_sg) + return NULL; + + return chan->device->device_prep_dma_memcpy_sg(chan, dst_sg, dst_nents, + src_sg, src_nents, + flags); +} + static inline bool dmaengine_is_metadata_mode_supported(struct dma_chan *chan, enum dma_desc_metadata_mode mode) { -- Gitee From ab7a71a2d989d847a03b7ffba3d26a23b34f8864 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 26 Oct 2021 14:45:34 -0700 Subject: [PATCH 1704/2161] dmaengine: idxd: set defaults for wq configs commit 92452a72ebdf1225aa37690d3648f2af6d0b4fca upstream. Add default values for wq size, max_xfer_size and max_batch_size. These values should provide a general guidance for the wq configuration when the user does not specify any specific values. 
Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163528473483.3926048.7950067926287180976.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 13 +++++-------- drivers/dma/idxd/idxd.h | 4 ++++ drivers/dma/idxd/init.c | 4 ++-- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 1dc5245107df..36e213a8108d 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -390,6 +390,8 @@ static void idxd_wq_disable_cleanup(struct idxd_wq *wq) clear_bit(WQ_FLAG_DEDICATED, &wq->flags); clear_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags); memset(wq->name, 0, WQ_NAME_SIZE); + wq->max_xfer_bytes = WQ_DEFAULT_MAX_XFER; + wq->max_batch_size = WQ_DEFAULT_MAX_BATCH; } static void idxd_wq_ref_release(struct percpu_ref *ref) @@ -839,15 +841,12 @@ static int idxd_wq_config_write(struct idxd_wq *wq) wq->wqcfg->bits[i] = ioread32(idxd->reg_base + wq_offset); } + if (wq->size == 0 && wq->type != IDXD_WQT_NONE) + wq->size = WQ_DEFAULT_QUEUE_DEPTH; + /* byte 0-3 */ wq->wqcfg->wq_size = wq->size; - if (wq->size == 0) { - idxd->cmd_status = IDXD_SCMD_WQ_NO_SIZE; - dev_warn(dev, "Incorrect work queue size: 0\n"); - return -EINVAL; - } - /* bytes 4-7 */ wq->wqcfg->wq_thresh = wq->threshold; @@ -993,8 +992,6 @@ static int idxd_wqs_setup(struct idxd_device *idxd) if (!wq->group) continue; - if (!wq->size) - continue; if (wq_shared(wq) && !device_swq_supported(idxd)) { idxd->cmd_status = IDXD_SCMD_WQ_NO_SWQ_SUPPORT; diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 51e79201636c..89e98d69115b 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -150,6 +150,10 @@ struct idxd_cdev { #define WQ_NAME_SIZE 1024 #define WQ_TYPE_SIZE 10 +#define WQ_DEFAULT_QUEUE_DEPTH 16 +#define WQ_DEFAULT_MAX_XFER SZ_2M +#define WQ_DEFAULT_MAX_BATCH 32 + enum idxd_op_type { IDXD_OP_BLOCK = 0, IDXD_OP_NONBLOCK = 1, diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 912839bf0be3..94ecd4bf0f0e 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -246,8 +246,8 @@ static int idxd_setup_wqs(struct idxd_device *idxd) init_waitqueue_head(&wq->err_queue); init_completion(&wq->wq_dead); init_completion(&wq->wq_resurrect); - wq->max_xfer_bytes = idxd->max_xfer_bytes; - wq->max_batch_size = idxd->max_batch_size; + wq->max_xfer_bytes = WQ_DEFAULT_MAX_XFER; + wq->max_batch_size = WQ_DEFAULT_MAX_BATCH; wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev)); if (!wq->wqcfg) { put_device(conf_dev); -- Gitee From 7299334a58a81f34777f4a7068e421d4eee51392 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 29 Nov 2021 10:19:38 -0700 Subject: [PATCH 1705/2161] dmaengine: idxd: add knob for enqcmds retries commit 7930d85535751bc8b05c6731c6b79d874671f13c upstream. Add a sysfs knob to allow tuning of retries for the kernel ENQCMDS descriptor submission. While on host, it is not as likely that ENQCMDS return busy during normal operations due to the driver controlling the number of descriptors allocated for submission. However, when the driver is operating as a guest driver, the chance of retry goes up significantly due to sharing a wq with multiple VMs. A default value is provided with the system admin being able to tune the value on a per WQ basis. 
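For illustration only, the knob can be tuned from userspace through sysfs. This snippet is not part of the patch; the wq instance name "wq0.0" is just an example and must be replaced with the actual shared wq device name.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* example path; substitute the real shared wq device name */
	const char *attr = "/sys/bus/dsa/devices/wq0.0/enqcmds_retries";
	const char *val = "48";			/* the driver caps the value at 64 */
	int fd = open(attr, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, val, strlen(val)) != (ssize_t)strlen(val))
		perror("write");
	close(fd);
	return 0;
}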
Suggested-by: Sanjay Kumar Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163820629464.2702134.7577370098568297574.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../ABI/stable/sysfs-driver-dma-idxd | 7 ++++ drivers/dma/idxd/device.c | 1 + drivers/dma/idxd/idxd.h | 5 +++ drivers/dma/idxd/init.c | 1 + drivers/dma/idxd/irq.c | 2 +- drivers/dma/idxd/submit.c | 31 ++++++++++++---- drivers/dma/idxd/sysfs.c | 36 +++++++++++++++++++ 7 files changed, 75 insertions(+), 8 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd index c67c63bacd3c..6bbaeace2aa6 100644 --- a/Documentation/ABI/stable/sysfs-driver-dma-idxd +++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd @@ -220,6 +220,13 @@ Contact: dmaengine@vger.kernel.org Description: Show the current number of entries in this WQ if WQ Occupancy Support bit WQ capabilities is 1. +What: /sys/bus/dsa/devices/wq./enqcmds_retries +Date Oct 29, 2021 +KernelVersion: 5.17.0 +Contact: dmaengine@vger.kernel.org +Description: Indicate the number of retires for an enqcmds submission on a shared wq. + A max value to set attribute is capped at 64. + What: /sys/bus/dsa/devices/engine./group_id Date: Oct 25, 2019 KernelVersion: 5.6.0 diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 36e213a8108d..5a50ee6f6881 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -387,6 +387,7 @@ static void idxd_wq_disable_cleanup(struct idxd_wq *wq) wq->threshold = 0; wq->priority = 0; wq->ats_dis = 0; + wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES; clear_bit(WQ_FLAG_DEDICATED, &wq->flags); clear_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags); memset(wq->name, 0, WQ_NAME_SIZE); diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 89e98d69115b..6b9bfdc557fe 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -52,6 +52,9 @@ enum idxd_type { #define IDXD_NAME_SIZE 128 #define IDXD_PMU_EVENT_MAX 64 +#define IDXD_ENQCMDS_RETRIES 32 +#define IDXD_ENQCMDS_MAX_RETRIES 64 + struct idxd_device_driver { const char *name; enum idxd_dev_type *type; @@ -173,6 +176,7 @@ struct idxd_dma_chan { struct idxd_wq { void __iomem *portal; u32 portal_offset; + unsigned int enqcmds_retries; struct percpu_ref wq_active; struct completion wq_dead; struct completion wq_resurrect; @@ -584,6 +588,7 @@ int idxd_wq_init_percpu_ref(struct idxd_wq *wq); int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc); struct idxd_desc *idxd_alloc_desc(struct idxd_wq *wq, enum idxd_op_type optype); void idxd_free_desc(struct idxd_wq *wq, struct idxd_desc *desc); +int idxd_enqcmds(struct idxd_wq *wq, void __iomem *portal, const void *desc); /* dmaengine */ int idxd_register_dma_device(struct idxd_device *idxd); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 94ecd4bf0f0e..8b3afce9ea67 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -248,6 +248,7 @@ static int idxd_setup_wqs(struct idxd_device *idxd) init_completion(&wq->wq_resurrect); wq->max_xfer_bytes = WQ_DEFAULT_MAX_XFER; wq->max_batch_size = WQ_DEFAULT_MAX_BATCH; + wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES; wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev)); if (!wq->wqcfg) { put_device(conf_dev); diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 8a9eaeffc6f5..925171e9738c 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -98,7 +98,7 @@ static void 
idxd_int_handle_revoke_drain(struct idxd_irq_entry *ie) if (wq_dedicated(wq)) { iosubmit_cmds512(portal, &desc, 1); } else { - rc = enqcmds(portal, &desc); + rc = idxd_enqcmds(wq, portal, &desc); /* This should not fail unless hardware failed. */ if (rc < 0) dev_warn(dev, "Failed to submit drain desc on wq %d\n", wq->id); diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 2060a9c2de9c..11ac06be1f0a 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -139,6 +139,29 @@ static void llist_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie, } } +/* + * ENQCMDS typically fail when the WQ is inactive or busy. On host submission, the driver + * has better control of number of descriptors being submitted to a shared wq by limiting + * the number of driver allocated descriptors to the wq size. However, when the swq is + * exported to a guest kernel, it may be shared with multiple guest kernels. This means + * the likelihood of getting busy returned on the swq when submitting goes significantly up. + * Having a tunable retry mechanism allows the driver to keep trying for a bit before giving + * up. The sysfs knob can be tuned by the system administrator. + */ +int idxd_enqcmds(struct idxd_wq *wq, void __iomem *portal, const void *desc) +{ + int rc, retries = 0; + + do { + rc = enqcmds(portal, desc); + if (rc == 0) + break; + cpu_relax(); + } while (retries++ < wq->enqcmds_retries); + + return rc; +} + int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) { struct idxd_device *idxd = wq->idxd; @@ -182,13 +205,7 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) if (wq_dedicated(wq)) { iosubmit_cmds512(portal, desc->hw, 1); } else { - /* - * It's not likely that we would receive queue full rejection - * since the descriptor allocation gates at wq size. If we - * receive a -EAGAIN, that means something went wrong such as the - * device is not accepting descriptor at all. 
- */ - rc = enqcmds(portal, desc->hw); + rc = idxd_enqcmds(wq, portal, desc->hw); if (rc < 0) { percpu_ref_put(&wq->wq_active); /* abort operation frees the descriptor */ diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 90857e776273..c0fec88ff6c1 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -945,6 +945,41 @@ static ssize_t wq_occupancy_show(struct device *dev, struct device_attribute *at static struct device_attribute dev_attr_wq_occupancy = __ATTR(occupancy, 0444, wq_occupancy_show, NULL); +static ssize_t wq_enqcmds_retries_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + if (wq_dedicated(wq)) + return -EOPNOTSUPP; + + return sysfs_emit(buf, "%u\n", wq->enqcmds_retries); +} + +static ssize_t wq_enqcmds_retries_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + int rc; + unsigned int retries; + + if (wq_dedicated(wq)) + return -EOPNOTSUPP; + + rc = kstrtouint(buf, 10, &retries); + if (rc < 0) + return rc; + + if (retries > IDXD_ENQCMDS_MAX_RETRIES) + retries = IDXD_ENQCMDS_MAX_RETRIES; + + wq->enqcmds_retries = retries; + return count; +} + +static struct device_attribute dev_attr_wq_enqcmds_retries = + __ATTR(enqcmds_retries, 0644, wq_enqcmds_retries_show, wq_enqcmds_retries_store); + static struct attribute *idxd_wq_attributes[] = { &dev_attr_wq_clients.attr, &dev_attr_wq_state.attr, @@ -961,6 +996,7 @@ static struct attribute *idxd_wq_attributes[] = { &dev_attr_wq_max_batch_size.attr, &dev_attr_wq_ats_disable.attr, &dev_attr_wq_occupancy.attr, + &dev_attr_wq_enqcmds_retries.attr, NULL, }; -- Gitee From 77feb1a9a632440fa9aaade4fcea70fdda12f14c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 13 Dec 2021 11:51:23 -0700 Subject: [PATCH 1706/2161] dmaengine: idxd: embed irq_entry in idxd_wq struct commit ec0d64231615e50539d83516b974e7947d45fbce upstream. With irq_entry already being associated with the wq in a 1:1 relationship, embed the irq_entry in the idxd_wq struct and remove back pointers for idxe_wq and idxd_device. In the process of this work, clean up the interrupt handle assignment so that there's no decision to be made during submit call on where interrupt handle value comes from. Set the interrupt handle during irq request initialization time. irq_entry 0 is designated as special and is tied to the device itself. 
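The conversion leans on the standard embed-and-container_of idiom: because each irq_entry now lives inside its wq, the parent is recovered in O(1) without a back pointer. A minimal sketch with illustrative, non-driver names:

#include <linux/kernel.h>

struct my_irq_entry {
	int id;
	int vector;
};

struct my_wq {
	int id;
	struct my_irq_entry ie;		/* embedded, 1:1 with the wq */
};

static inline struct my_wq *my_ie_to_wq(struct my_irq_entry *ie)
{
	return container_of(ie, struct my_wq, ie);
}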
Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163942148362.2412839.12055447853311267866.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 18 +++--- drivers/dma/idxd/idxd.h | 22 +++++-- drivers/dma/idxd/init.c | 119 +++++++++++++++----------------------- drivers/dma/idxd/irq.c | 10 ++-- drivers/dma/idxd/submit.c | 8 +-- drivers/dma/idxd/sysfs.c | 1 - 6 files changed, 79 insertions(+), 99 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 5a50ee6f6881..8233a29f859d 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -21,8 +21,11 @@ static void idxd_wq_disable_cleanup(struct idxd_wq *wq); /* Interrupt control bits */ void idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id) { - struct irq_data *data = irq_get_irq_data(idxd->irq_entries[vec_id].vector); + struct idxd_irq_entry *ie; + struct irq_data *data; + ie = idxd_get_ie(idxd, vec_id); + data = irq_get_irq_data(ie->vector); pci_msi_mask_irq(data); } @@ -38,8 +41,11 @@ void idxd_mask_msix_vectors(struct idxd_device *idxd) void idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id) { - struct irq_data *data = irq_get_irq_data(idxd->irq_entries[vec_id].vector); + struct idxd_irq_entry *ie; + struct irq_data *data; + ie = idxd_get_ie(idxd, vec_id); + data = irq_get_irq_data(ie->vector); pci_msi_unmask_irq(data); } @@ -1216,13 +1222,6 @@ int __drv_enable_wq(struct idxd_wq *wq) goto err; } - /* - * Device has 1 misc interrupt and N interrupts for descriptor completion. To - * assign WQ to interrupt, we will take the N+1 interrupt since vector 0 is - * for the misc interrupt. - */ - wq->ie = &idxd->irq_entries[wq->id + 1]; - rc = idxd_wq_enable(wq); if (rc < 0) { dev_dbg(dev, "wq %d enabling failed: %d\n", wq->id, rc); @@ -1273,7 +1272,6 @@ void __drv_disable_wq(struct idxd_wq *wq) idxd_wq_drain(wq); idxd_wq_reset(wq); - wq->ie = NULL; wq->client_count = 0; } diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 6b9bfdc557fe..d77be03dd8b0 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -70,7 +70,6 @@ extern struct idxd_device_driver idxd_user_drv; #define INVALID_INT_HANDLE -1 struct idxd_irq_entry { - struct idxd_device *idxd; int id; int vector; struct llist_head pending_llist; @@ -81,7 +80,6 @@ struct idxd_irq_entry { */ spinlock_t list_lock; int int_handle; - struct idxd_wq *wq; ioasid_t pasid; }; @@ -185,7 +183,7 @@ struct idxd_wq { struct wait_queue_head err_queue; struct idxd_device *idxd; int id; - struct idxd_irq_entry *ie; + struct idxd_irq_entry ie; enum idxd_wq_type type; struct idxd_group *group; int client_count; @@ -266,6 +264,7 @@ struct idxd_device { int id; int major; u32 cmd_status; + struct idxd_irq_entry ie; /* misc irq, msix 0 */ struct pci_dev *pdev; void __iomem *reg_base; @@ -302,8 +301,6 @@ struct idxd_device { union sw_err_reg sw_err; wait_queue_head_t cmd_waitq; - int num_wq_irqs; - struct idxd_irq_entry *irq_entries; struct idxd_dma_dev *idxd_dma; struct workqueue_struct *wq; @@ -395,6 +392,21 @@ static inline void idxd_dev_set_type(struct idxd_dev *idev, int type) idev->type = type; } +static inline struct idxd_irq_entry *idxd_get_ie(struct idxd_device *idxd, int idx) +{ + return (idx == 0) ? 
&idxd->ie : &idxd->wqs[idx - 1]->ie; +} + +static inline struct idxd_wq *ie_to_wq(struct idxd_irq_entry *ie) +{ + return container_of(ie, struct idxd_wq, ie); +} + +static inline struct idxd_device *ie_to_idxd(struct idxd_irq_entry *ie) +{ + return container_of(ie, struct idxd_device, ie); +} + extern struct bus_type dsa_bus_type; extern bool support_enqcmd; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 8b3afce9ea67..29c732a94027 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -72,7 +72,7 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) { struct pci_dev *pdev = idxd->pdev; struct device *dev = &pdev->dev; - struct idxd_irq_entry *irq_entry; + struct idxd_irq_entry *ie; int i, msixcnt; int rc = 0; @@ -90,72 +90,54 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) } dev_dbg(dev, "Enabled %d msix vectors\n", msixcnt); - /* - * We implement 1 completion list per MSI-X entry except for - * entry 0, which is for errors and others. - */ - idxd->irq_entries = kcalloc_node(msixcnt, sizeof(struct idxd_irq_entry), - GFP_KERNEL, dev_to_node(dev)); - if (!idxd->irq_entries) { - rc = -ENOMEM; - goto err_irq_entries; - } - - for (i = 0; i < msixcnt; i++) { - idxd->irq_entries[i].id = i; - idxd->irq_entries[i].idxd = idxd; - /* - * Association of WQ should be assigned starting with irq_entry 1. - * irq_entry 0 is for misc interrupts and has no wq association - */ - if (i > 0) - idxd->irq_entries[i].wq = idxd->wqs[i - 1]; - idxd->irq_entries[i].vector = pci_irq_vector(pdev, i); - idxd->irq_entries[i].int_handle = INVALID_INT_HANDLE; - if (device_pasid_enabled(idxd) && i > 0) - idxd->irq_entries[i].pasid = idxd->pasid; - else - idxd->irq_entries[i].pasid = INVALID_IOASID; - spin_lock_init(&idxd->irq_entries[i].list_lock); - } - idxd_msix_perm_setup(idxd); - irq_entry = &idxd->irq_entries[0]; - rc = request_threaded_irq(irq_entry->vector, NULL, idxd_misc_thread, - 0, "idxd-misc", irq_entry); + ie = idxd_get_ie(idxd, 0); + ie->vector = pci_irq_vector(pdev, 0); + rc = request_threaded_irq(ie->vector, NULL, idxd_misc_thread, 0, "idxd-misc", ie); if (rc < 0) { dev_err(dev, "Failed to allocate misc interrupt.\n"); goto err_misc_irq; } - dev_dbg(dev, "Allocated idxd-misc handler on msix vector %d\n", irq_entry->vector); + dev_dbg(dev, "Allocated idxd-misc handler on msix vector %d\n", ie->vector); - /* first MSI-X entry is not for wq interrupts */ - idxd->num_wq_irqs = msixcnt - 1; + for (i = 0; i < idxd->max_wqs; i++) { + int msix_idx = i + 1; + + ie = idxd_get_ie(idxd, msix_idx); - for (i = 1; i < msixcnt; i++) { - irq_entry = &idxd->irq_entries[i]; + /* MSIX vector 0 special, wq irq entry starts at 1 */ + ie->id = msix_idx; + ie->vector = pci_irq_vector(pdev, msix_idx); + ie->int_handle = INVALID_INT_HANDLE; + if (device_pasid_enabled(idxd) && i > 0) + ie->pasid = idxd->pasid; + else + ie->pasid = INVALID_IOASID; + spin_lock_init(&ie->list_lock); + init_llist_head(&ie->pending_llist); + INIT_LIST_HEAD(&ie->work_list); - init_llist_head(&idxd->irq_entries[i].pending_llist); - INIT_LIST_HEAD(&idxd->irq_entries[i].work_list); - rc = request_threaded_irq(irq_entry->vector, NULL, - idxd_wq_thread, 0, "idxd-portal", irq_entry); + rc = request_threaded_irq(ie->vector, NULL, idxd_wq_thread, 0, "idxd-portal", ie); if (rc < 0) { - dev_err(dev, "Failed to allocate irq %d.\n", irq_entry->vector); + dev_err(dev, "Failed to allocate irq %d.\n", ie->vector); goto err_wq_irqs; } - dev_dbg(dev, "Allocated idxd-msix %d for vector %d\n", i, irq_entry->vector); 
+ dev_dbg(dev, "Allocated idxd-msix %d for vector %d\n", i, ie->vector); if (idxd->request_int_handles) { - rc = idxd_device_request_int_handle(idxd, i, &irq_entry->int_handle, + rc = idxd_device_request_int_handle(idxd, i, &ie->int_handle, IDXD_IRQ_MSIX); if (rc < 0) { - free_irq(irq_entry->vector, irq_entry); + free_irq(ie->vector, ie); goto err_wq_irqs; } - dev_dbg(dev, "int handle requested: %u\n", irq_entry->int_handle); + dev_dbg(dev, "int handle requested: %u\n", ie->int_handle); + } else { + ie->int_handle = msix_idx; } + } idxd_unmask_error_interrupts(idxd); @@ -163,23 +145,19 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) err_wq_irqs: while (--i >= 0) { - irq_entry = &idxd->irq_entries[i]; - free_irq(irq_entry->vector, irq_entry); - if (irq_entry->int_handle != INVALID_INT_HANDLE) { - idxd_device_release_int_handle(idxd, irq_entry->int_handle, - IDXD_IRQ_MSIX); - irq_entry->int_handle = INVALID_INT_HANDLE; - irq_entry->pasid = INVALID_IOASID; + ie = &idxd->wqs[i]->ie; + free_irq(ie->vector, ie); + if (ie->int_handle != INVALID_INT_HANDLE) { + idxd_device_release_int_handle(idxd, ie->int_handle, IDXD_IRQ_MSIX); + ie->int_handle = INVALID_INT_HANDLE; + ie->pasid = INVALID_IOASID; } - irq_entry->vector = -1; - irq_entry->wq = NULL; - irq_entry->idxd = NULL; + ie->vector = -1; } err_misc_irq: /* Disable error interrupt generation */ idxd_mask_error_interrupts(idxd); idxd_msix_perm_clear(idxd); - err_irq_entries: pci_free_irq_vectors(pdev); dev_err(dev, "No usable interrupts\n"); return rc; @@ -188,21 +166,18 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) static void idxd_cleanup_interrupts(struct idxd_device *idxd) { struct pci_dev *pdev = idxd->pdev; - struct idxd_irq_entry *irq_entry; + struct idxd_irq_entry *ie; int i; for (i = 0; i < idxd->irq_cnt; i++) { - irq_entry = &idxd->irq_entries[i]; - if (irq_entry->int_handle != INVALID_INT_HANDLE) { - idxd_device_release_int_handle(idxd, irq_entry->int_handle, - IDXD_IRQ_MSIX); - irq_entry->int_handle = INVALID_INT_HANDLE; - irq_entry->pasid = INVALID_IOASID; + ie = idxd_get_ie(idxd, i); + if (ie->int_handle != INVALID_INT_HANDLE) { + idxd_device_release_int_handle(idxd, ie->int_handle, IDXD_IRQ_MSIX); + ie->int_handle = INVALID_INT_HANDLE; + ie->pasid = INVALID_IOASID; } - irq_entry->vector = -1; - irq_entry->wq = NULL; - irq_entry->idxd = NULL; - free_irq(irq_entry->vector, irq_entry); + free_irq(ie->vector, ie); + ie->vector = -1; } idxd_mask_error_interrupts(idxd); @@ -755,7 +730,7 @@ static void idxd_release_int_handles(struct idxd_device *idxd) int i, rc; for (i = 1; i < idxd->irq_cnt; i++) { - struct idxd_irq_entry *ie = &idxd->irq_entries[i]; + struct idxd_irq_entry *ie = idxd_get_ie(idxd, i); if (ie->int_handle != INVALID_INT_HANDLE) { rc = idxd_device_release_int_handle(idxd, ie->int_handle, IDXD_IRQ_MSIX); @@ -783,7 +758,7 @@ static void idxd_shutdown(struct pci_dev *pdev) idxd_mask_error_interrupts(idxd); for (i = 0; i < msixcnt; i++) { - irq_entry = &idxd->irq_entries[i]; + irq_entry = idxd_get_ie(idxd, i); synchronize_irq(irq_entry->vector); if (i == 0) continue; @@ -815,7 +790,7 @@ static void idxd_remove(struct pci_dev *pdev) idxd_disable_system_pasid(idxd); for (i = 0; i < msixcnt; i++) { - irq_entry = &idxd->irq_entries[i]; + irq_entry = idxd_get_ie(idxd, i); free_irq(irq_entry->vector, irq_entry); } idxd_msix_perm_clear(idxd); diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 925171e9738c..a1316f341dd6 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ 
-73,8 +73,8 @@ static void idxd_device_reinit(struct work_struct *work) */ static void idxd_int_handle_revoke_drain(struct idxd_irq_entry *ie) { - struct idxd_wq *wq = ie->wq; - struct idxd_device *idxd = ie->idxd; + struct idxd_wq *wq = ie_to_wq(ie); + struct idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; struct dsa_hw_desc desc = {}; void __iomem *portal; @@ -155,8 +155,8 @@ static void idxd_int_handle_revoke(struct work_struct *work) * at the end to make sure all invalid int handle descriptors are processed. */ for (i = 1; i < idxd->irq_cnt; i++) { - struct idxd_irq_entry *ie = &idxd->irq_entries[i]; - struct idxd_wq *wq = ie->wq; + struct idxd_irq_entry *ie = idxd_get_ie(idxd, i); + struct idxd_wq *wq = ie_to_wq(ie); rc = idxd_device_request_int_handle(idxd, i, &new_handle, IDXD_IRQ_MSIX); if (rc < 0) { @@ -338,7 +338,7 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) irqreturn_t idxd_misc_thread(int vec, void *data) { struct idxd_irq_entry *irq_entry = data; - struct idxd_device *idxd = irq_entry->idxd; + struct idxd_device *idxd = ie_to_idxd(irq_entry); int rc; u32 cause; diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 11ac06be1f0a..e289fd48711a 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -193,12 +193,8 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) * that we designated the descriptor to. */ if (desc_flags & IDXD_OP_FLAG_RCI) { - ie = wq->ie; - if (ie->int_handle == INVALID_INT_HANDLE) - desc->hw->int_handle = ie->id; - else - desc->hw->int_handle = ie->int_handle; - + ie = &wq->ie; + desc->hw->int_handle = ie->int_handle; llist_add(&desc->llnode, &ie->pending_llist); } diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index c0fec88ff6c1..13404532131b 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1304,7 +1304,6 @@ static void idxd_conf_device_release(struct device *dev) kfree(idxd->groups); kfree(idxd->wqs); kfree(idxd->engines); - kfree(idxd->irq_entries); ida_free(&idxd_ida, idxd->id); kfree(idxd); } -- Gitee From 7e7ac82f4b4f99d479c75abd3cc733db2206172d Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 13 Dec 2021 11:51:29 -0700 Subject: [PATCH 1707/2161] dmaengine: idxd: fix descriptor flushing locking commit 23a50c8035655c5a1d9b52c878b3ebf7b6b83eea upstream. The descriptor flushing for shutdown is not holding the irq_entry list lock. If there's ongoing interrupt completion handling, this can corrupt the list. Add locking to protect list walking. Also refactor the code so it's more compact. 
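The fix follows the usual splice-under-lock idiom: everything pending is moved onto a private list while the lock is held, and the descriptors are completed outside it. A minimal sketch with hypothetical names, not taken verbatim from the driver:

#include <linux/list.h>
#include <linux/llist.h>
#include <linux/spinlock.h>

struct my_desc {
	struct llist_node llnode;	/* lockless pending-list linkage */
	struct list_head list;		/* ordered work-list linkage */
};

static void my_flush_pending(spinlock_t *lock, struct llist_head *pending,
			     struct list_head *work)
{
	struct my_desc *d, *t;
	struct llist_node *head;
	LIST_HEAD(flist);

	spin_lock(lock);
	head = llist_del_all(pending);
	if (head) {
		llist_for_each_entry_safe(d, t, head, llnode)
			list_add_tail(&d->list, work);
	}
	list_for_each_entry_safe(d, t, work, list)
		list_move_tail(&d->list, &flist);
	spin_unlock(lock);

	list_for_each_entry_safe(d, t, &flist, list) {
		list_del(&d->list);
		/* complete or abort the descriptor here, outside the lock */
	}
}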
Fixes: 8f47d1a5e545 ("dmaengine: idxd: connect idxd to dmaengine subsystem") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163942148935.2412839.18282664745572777280.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 29c732a94027..03c735727f68 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -689,26 +689,28 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return rc; } -static void idxd_flush_pending_llist(struct idxd_irq_entry *ie) +static void idxd_flush_pending_descs(struct idxd_irq_entry *ie) { struct idxd_desc *desc, *itr; struct llist_node *head; + LIST_HEAD(flist); + enum idxd_complete_type ctype; + spin_lock(&ie->list_lock); head = llist_del_all(&ie->pending_llist); - if (!head) - return; - - llist_for_each_entry_safe(desc, itr, head, llnode) - idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT, true); -} + if (head) { + llist_for_each_entry_safe(desc, itr, head, llnode) + list_add_tail(&desc->list, &ie->work_list); + } -static void idxd_flush_work_list(struct idxd_irq_entry *ie) -{ - struct idxd_desc *desc, *iter; + list_for_each_entry_safe(desc, itr, &ie->work_list, list) + list_move_tail(&desc->list, &flist); + spin_unlock(&ie->list_lock); - list_for_each_entry_safe(desc, iter, &ie->work_list, list) { + list_for_each_entry_safe(desc, itr, &flist, list) { list_del(&desc->list); - idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT, true); + ctype = desc->completion->status ? IDXD_COMPLETE_NORMAL : IDXD_COMPLETE_ABORT; + idxd_dma_complete_txd(desc, ctype, true); } } @@ -762,8 +764,7 @@ static void idxd_shutdown(struct pci_dev *pdev) synchronize_irq(irq_entry->vector); if (i == 0) continue; - idxd_flush_pending_llist(irq_entry); - idxd_flush_work_list(irq_entry); + idxd_flush_pending_descs(irq_entry); } flush_workqueue(idxd->wq); } -- Gitee From a197db9a7f1dfdd001c2d8a8fa3cc876fa5978eb Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 13 Dec 2021 11:51:34 -0700 Subject: [PATCH 1708/2161] dmaengine: idxd: change MSIX allocation based on per wq activation commit 403a2e236538c6b479ea5bfc8b75a75540cfba6b upstream. Change the driver where WQ interrupt is requested only when wq is being enabled. This new scheme set things up so that request_threaded_irq() is only called when a kernel wq type is being enabled. This also sets up for future interrupt request where different interrupt handler such as wq occupancy interrupt can be setup instead of the wq completion interrupt. Not calling request_irq() until the WQ actually needs an irq also prevents wasting of CPU irq vectors on x86 systems, which is a limited resource. idxd_flush_pending_descs() is moved to device.c since descriptor flushing is now part of wq disable rather than shutdown(). 
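The new scheme pairs request_threaded_irq() at wq enable with free_irq() at wq disable, instead of claiming every vector at probe. A minimal sketch of that pairing with hypothetical names, mirroring the driver's own threaded-IRQ call for an MSI-X vector:

#include <linux/interrupt.h>
#include <linux/pci.h>

struct my_wq_irq {
	int vector;
};

static irqreturn_t my_wq_thread(int irq, void *data)
{
	/* process descriptor completions for this wq */
	return IRQ_HANDLED;
}

static int my_wq_request_irq(struct pci_dev *pdev, struct my_wq_irq *ie,
			     int msix_idx)
{
	ie->vector = pci_irq_vector(pdev, msix_idx);
	return request_threaded_irq(ie->vector, NULL, my_wq_thread, 0,
				    "my-wq-portal", ie);
}

static void my_wq_free_irq(struct my_wq_irq *ie)
{
	synchronize_irq(ie->vector);	/* let a running handler finish */
	free_irq(ie->vector, ie);
	ie->vector = -1;
}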
Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163942149487.2412839.6691222855803875848.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 161 +++++++++++++++++++++++--------------- drivers/dma/idxd/dma.c | 12 +++ drivers/dma/idxd/idxd.h | 7 +- drivers/dma/idxd/init.c | 133 ++++--------------------------- drivers/dma/idxd/irq.c | 3 + include/uapi/linux/idxd.h | 1 + 6 files changed, 132 insertions(+), 185 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 8233a29f859d..280b41417f41 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -19,36 +19,6 @@ static void idxd_device_wqs_clear_state(struct idxd_device *idxd); static void idxd_wq_disable_cleanup(struct idxd_wq *wq); /* Interrupt control bits */ -void idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id) -{ - struct idxd_irq_entry *ie; - struct irq_data *data; - - ie = idxd_get_ie(idxd, vec_id); - data = irq_get_irq_data(ie->vector); - pci_msi_mask_irq(data); -} - -void idxd_mask_msix_vectors(struct idxd_device *idxd) -{ - struct pci_dev *pdev = idxd->pdev; - int msixcnt = pci_msix_vec_count(pdev); - int i; - - for (i = 0; i < msixcnt; i++) - idxd_mask_msix_vector(idxd, i); -} - -void idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id) -{ - struct idxd_irq_entry *ie; - struct irq_data *data; - - ie = idxd_get_ie(idxd, vec_id); - data = irq_get_irq_data(ie->vector); - pci_msi_unmask_irq(data); -} - void idxd_unmask_error_interrupts(struct idxd_device *idxd) { union genctrl_reg genctrl; @@ -593,7 +563,6 @@ void idxd_device_reset(struct idxd_device *idxd) idxd_device_clear_state(idxd); idxd->state = IDXD_DEV_DISABLED; idxd_unmask_error_interrupts(idxd); - idxd_msix_perm_setup(idxd); spin_unlock(&idxd->dev_lock); } @@ -732,36 +701,6 @@ void idxd_device_clear_state(struct idxd_device *idxd) idxd_device_wqs_clear_state(idxd); } -void idxd_msix_perm_setup(struct idxd_device *idxd) -{ - union msix_perm mperm; - int i, msixcnt; - - msixcnt = pci_msix_vec_count(idxd->pdev); - if (msixcnt < 0) - return; - - mperm.bits = 0; - mperm.pasid = idxd->pasid; - mperm.pasid_en = device_pasid_enabled(idxd); - for (i = 1; i < msixcnt; i++) - iowrite32(mperm.bits, idxd->reg_base + idxd->msix_perm_offset + i * 8); -} - -void idxd_msix_perm_clear(struct idxd_device *idxd) -{ - union msix_perm mperm; - int i, msixcnt; - - msixcnt = pci_msix_vec_count(idxd->pdev); - if (msixcnt < 0) - return; - - mperm.bits = 0; - for (i = 1; i < msixcnt; i++) - iowrite32(mperm.bits, idxd->reg_base + idxd->msix_perm_offset + i * 8); -} - static void idxd_group_config_write(struct idxd_group *group) { struct idxd_device *idxd = group->idxd; @@ -1158,6 +1097,106 @@ int idxd_device_load_config(struct idxd_device *idxd) return 0; } +static void idxd_flush_pending_descs(struct idxd_irq_entry *ie) +{ + struct idxd_desc *desc, *itr; + struct llist_node *head; + LIST_HEAD(flist); + enum idxd_complete_type ctype; + + spin_lock(&ie->list_lock); + head = llist_del_all(&ie->pending_llist); + if (head) { + llist_for_each_entry_safe(desc, itr, head, llnode) + list_add_tail(&desc->list, &ie->work_list); + } + + list_for_each_entry_safe(desc, itr, &ie->work_list, list) + list_move_tail(&desc->list, &flist); + spin_unlock(&ie->list_lock); + + list_for_each_entry_safe(desc, itr, &flist, list) { + list_del(&desc->list); + ctype = desc->completion->status ? 
IDXD_COMPLETE_NORMAL : IDXD_COMPLETE_ABORT; + idxd_dma_complete_txd(desc, ctype, true); + } +} + +static void idxd_device_set_perm_entry(struct idxd_device *idxd, + struct idxd_irq_entry *ie) +{ + union msix_perm mperm; + + if (ie->pasid == INVALID_IOASID) + return; + + mperm.bits = 0; + mperm.pasid = ie->pasid; + mperm.pasid_en = 1; + iowrite32(mperm.bits, idxd->reg_base + idxd->msix_perm_offset + ie->id * 8); +} + +static void idxd_device_clear_perm_entry(struct idxd_device *idxd, + struct idxd_irq_entry *ie) +{ + iowrite32(0, idxd->reg_base + idxd->msix_perm_offset + ie->id * 8); +} + +void idxd_wq_free_irq(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct idxd_irq_entry *ie = &wq->ie; + + synchronize_irq(ie->vector); + free_irq(ie->vector, ie); + idxd_flush_pending_descs(ie); + if (idxd->request_int_handles) + idxd_device_release_int_handle(idxd, ie->int_handle, IDXD_IRQ_MSIX); + idxd_device_clear_perm_entry(idxd, ie); + ie->vector = -1; + ie->int_handle = INVALID_INT_HANDLE; + ie->pasid = INVALID_IOASID; +} + +int idxd_wq_request_irq(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct pci_dev *pdev = idxd->pdev; + struct device *dev = &pdev->dev; + struct idxd_irq_entry *ie; + int rc; + + ie = &wq->ie; + ie->vector = pci_irq_vector(pdev, ie->id); + ie->pasid = device_pasid_enabled(idxd) ? idxd->pasid : INVALID_IOASID; + idxd_device_set_perm_entry(idxd, ie); + + rc = request_threaded_irq(ie->vector, NULL, idxd_wq_thread, 0, "idxd-portal", ie); + if (rc < 0) { + dev_err(dev, "Failed to request irq %d.\n", ie->vector); + goto err_irq; + } + + if (idxd->request_int_handles) { + rc = idxd_device_request_int_handle(idxd, ie->id, &ie->int_handle, + IDXD_IRQ_MSIX); + if (rc < 0) + goto err_int_handle; + } else { + ie->int_handle = ie->id; + } + + return 0; + +err_int_handle: + ie->int_handle = INVALID_INT_HANDLE; + free_irq(ie->vector, ie); +err_irq: + idxd_device_clear_perm_entry(idxd, ie); + ie->pasid = INVALID_IOASID; + return rc; +} + int __drv_enable_wq(struct idxd_wq *wq) { struct idxd_device *idxd = wq->idxd; diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index 2ce873994e33..bfff59617d04 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -289,6 +289,14 @@ static int idxd_dmaengine_drv_probe(struct idxd_dev *idxd_dev) mutex_lock(&wq->wq_lock); wq->type = IDXD_WQT_KERNEL; + + rc = idxd_wq_request_irq(wq); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_WQ_IRQ_ERR; + dev_dbg(dev, "WQ %d irq setup failed: %d\n", wq->id, rc); + goto err_irq; + } + rc = __drv_enable_wq(wq); if (rc < 0) { dev_dbg(dev, "Enable wq %d failed: %d\n", wq->id, rc); @@ -329,6 +337,8 @@ static int idxd_dmaengine_drv_probe(struct idxd_dev *idxd_dev) err_res_alloc: __drv_disable_wq(wq); err: + idxd_wq_free_irq(wq); +err_irq: wq->type = IDXD_WQT_NONE; mutex_unlock(&wq->wq_lock); return rc; @@ -344,6 +354,8 @@ static void idxd_dmaengine_drv_remove(struct idxd_dev *idxd_dev) idxd_wq_free_resources(wq); __drv_disable_wq(wq); percpu_ref_exit(&wq->wq_active); + idxd_wq_free_irq(wq); + wq->type = IDXD_WQT_NONE; mutex_unlock(&wq->wq_lock); } diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index d77be03dd8b0..6353e762286d 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -548,15 +548,10 @@ void idxd_wqs_quiesce(struct idxd_device *idxd); bool idxd_queue_int_handle_resubmit(struct idxd_desc *desc); /* device interrupt control */ -void idxd_msix_perm_setup(struct idxd_device *idxd); -void idxd_msix_perm_clear(struct 
idxd_device *idxd); irqreturn_t idxd_misc_thread(int vec, void *data); irqreturn_t idxd_wq_thread(int irq, void *data); void idxd_mask_error_interrupts(struct idxd_device *idxd); void idxd_unmask_error_interrupts(struct idxd_device *idxd); -void idxd_mask_msix_vectors(struct idxd_device *idxd); -void idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id); -void idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id); /* device control */ int idxd_register_idxd_drv(void); @@ -595,6 +590,8 @@ int idxd_wq_disable_pasid(struct idxd_wq *wq); void __idxd_wq_quiesce(struct idxd_wq *wq); void idxd_wq_quiesce(struct idxd_wq *wq); int idxd_wq_init_percpu_ref(struct idxd_wq *wq); +void idxd_wq_free_irq(struct idxd_wq *wq); +int idxd_wq_request_irq(struct idxd_wq *wq); /* submission */ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 03c735727f68..3505efb7ae71 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -90,7 +90,6 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) } dev_dbg(dev, "Enabled %d msix vectors\n", msixcnt); - idxd_msix_perm_setup(idxd); ie = idxd_get_ie(idxd, 0); ie->vector = pci_irq_vector(pdev, 0); @@ -99,65 +98,26 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) dev_err(dev, "Failed to allocate misc interrupt.\n"); goto err_misc_irq; } - - dev_dbg(dev, "Allocated idxd-misc handler on msix vector %d\n", ie->vector); + dev_dbg(dev, "Requested idxd-misc handler on msix vector %d\n", ie->vector); for (i = 0; i < idxd->max_wqs; i++) { int msix_idx = i + 1; ie = idxd_get_ie(idxd, msix_idx); - - /* MSIX vector 0 special, wq irq entry starts at 1 */ ie->id = msix_idx; - ie->vector = pci_irq_vector(pdev, msix_idx); ie->int_handle = INVALID_INT_HANDLE; - if (device_pasid_enabled(idxd) && i > 0) - ie->pasid = idxd->pasid; - else - ie->pasid = INVALID_IOASID; + ie->pasid = INVALID_IOASID; + spin_lock_init(&ie->list_lock); init_llist_head(&ie->pending_llist); INIT_LIST_HEAD(&ie->work_list); - - rc = request_threaded_irq(ie->vector, NULL, idxd_wq_thread, 0, "idxd-portal", ie); - if (rc < 0) { - dev_err(dev, "Failed to allocate irq %d.\n", ie->vector); - goto err_wq_irqs; - } - - dev_dbg(dev, "Allocated idxd-msix %d for vector %d\n", i, ie->vector); - if (idxd->request_int_handles) { - rc = idxd_device_request_int_handle(idxd, i, &ie->int_handle, - IDXD_IRQ_MSIX); - if (rc < 0) { - free_irq(ie->vector, ie); - goto err_wq_irqs; - } - dev_dbg(dev, "int handle requested: %u\n", ie->int_handle); - } else { - ie->int_handle = msix_idx; - } - } idxd_unmask_error_interrupts(idxd); return 0; - err_wq_irqs: - while (--i >= 0) { - ie = &idxd->wqs[i]->ie; - free_irq(ie->vector, ie); - if (ie->int_handle != INVALID_INT_HANDLE) { - idxd_device_release_int_handle(idxd, ie->int_handle, IDXD_IRQ_MSIX); - ie->int_handle = INVALID_INT_HANDLE; - ie->pasid = INVALID_IOASID; - } - ie->vector = -1; - } err_misc_irq: - /* Disable error interrupt generation */ idxd_mask_error_interrupts(idxd); - idxd_msix_perm_clear(idxd); pci_free_irq_vectors(pdev); dev_err(dev, "No usable interrupts\n"); return rc; @@ -167,20 +127,15 @@ static void idxd_cleanup_interrupts(struct idxd_device *idxd) { struct pci_dev *pdev = idxd->pdev; struct idxd_irq_entry *ie; - int i; + int msixcnt; - for (i = 0; i < idxd->irq_cnt; i++) { - ie = idxd_get_ie(idxd, i); - if (ie->int_handle != INVALID_INT_HANDLE) { - idxd_device_release_int_handle(idxd, ie->int_handle, IDXD_IRQ_MSIX); - ie->int_handle = 
INVALID_INT_HANDLE; - ie->pasid = INVALID_IOASID; - } - free_irq(ie->vector, ie); - ie->vector = -1; - } + msixcnt = pci_msix_vec_count(pdev); + if (msixcnt <= 0) + return; + ie = idxd_get_ie(idxd, 0); idxd_mask_error_interrupts(idxd); + free_irq(ie->vector, ie); pci_free_irq_vectors(pdev); } @@ -592,8 +547,6 @@ static int idxd_probe(struct idxd_device *idxd) if (rc) goto err_config; - dev_dbg(dev, "IDXD interrupt setup complete.\n"); - idxd->major = idxd_cdev_get_major(idxd); rc = perfmon_pmu_init(idxd); @@ -689,31 +642,6 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return rc; } -static void idxd_flush_pending_descs(struct idxd_irq_entry *ie) -{ - struct idxd_desc *desc, *itr; - struct llist_node *head; - LIST_HEAD(flist); - enum idxd_complete_type ctype; - - spin_lock(&ie->list_lock); - head = llist_del_all(&ie->pending_llist); - if (head) { - llist_for_each_entry_safe(desc, itr, head, llnode) - list_add_tail(&desc->list, &ie->work_list); - } - - list_for_each_entry_safe(desc, itr, &ie->work_list, list) - list_move_tail(&desc->list, &flist); - spin_unlock(&ie->list_lock); - - list_for_each_entry_safe(desc, itr, &flist, list) { - list_del(&desc->list); - ctype = desc->completion->status ? IDXD_COMPLETE_NORMAL : IDXD_COMPLETE_ABORT; - idxd_dma_complete_txd(desc, ctype, true); - } -} - void idxd_wqs_quiesce(struct idxd_device *idxd) { struct idxd_wq *wq; @@ -726,46 +654,19 @@ void idxd_wqs_quiesce(struct idxd_device *idxd) } } -static void idxd_release_int_handles(struct idxd_device *idxd) -{ - struct device *dev = &idxd->pdev->dev; - int i, rc; - - for (i = 1; i < idxd->irq_cnt; i++) { - struct idxd_irq_entry *ie = idxd_get_ie(idxd, i); - - if (ie->int_handle != INVALID_INT_HANDLE) { - rc = idxd_device_release_int_handle(idxd, ie->int_handle, IDXD_IRQ_MSIX); - if (rc < 0) - dev_warn(dev, "irq handle %d release failed\n", ie->int_handle); - else - dev_dbg(dev, "int handle released: %u\n", ie->int_handle); - } - } -} - static void idxd_shutdown(struct pci_dev *pdev) { struct idxd_device *idxd = pci_get_drvdata(pdev); - int rc, i; struct idxd_irq_entry *irq_entry; - int msixcnt = pci_msix_vec_count(pdev); + int rc; rc = idxd_device_disable(idxd); if (rc) dev_err(&pdev->dev, "Disabling device failed\n"); - dev_dbg(&pdev->dev, "%s called\n", __func__); - idxd_mask_msix_vectors(idxd); + irq_entry = &idxd->ie; + synchronize_irq(irq_entry->vector); idxd_mask_error_interrupts(idxd); - - for (i = 0; i < msixcnt; i++) { - irq_entry = idxd_get_ie(idxd, i); - synchronize_irq(irq_entry->vector); - if (i == 0) - continue; - idxd_flush_pending_descs(irq_entry); - } flush_workqueue(idxd->wq); } @@ -773,8 +674,6 @@ static void idxd_remove(struct pci_dev *pdev) { struct idxd_device *idxd = pci_get_drvdata(pdev); struct idxd_irq_entry *irq_entry; - int msixcnt = pci_msix_vec_count(pdev); - int i; idxd_unregister_devices(idxd); /* @@ -790,12 +689,8 @@ static void idxd_remove(struct pci_dev *pdev) if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); - for (i = 0; i < msixcnt; i++) { - irq_entry = idxd_get_ie(idxd, i); - free_irq(irq_entry->vector, irq_entry); - } - idxd_msix_perm_clear(idxd); - idxd_release_int_handles(idxd); + irq_entry = idxd_get_ie(idxd, 0); + free_irq(irq_entry->vector, irq_entry); pci_free_irq_vectors(pdev); pci_iounmap(pdev, idxd->reg_base); iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA); diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index a1316f341dd6..743ead5ebc57 100644 --- a/drivers/dma/idxd/irq.c +++ 
b/drivers/dma/idxd/irq.c @@ -158,6 +158,9 @@ static void idxd_int_handle_revoke(struct work_struct *work) struct idxd_irq_entry *ie = idxd_get_ie(idxd, i); struct idxd_wq *wq = ie_to_wq(ie); + if (ie->int_handle == INVALID_INT_HANDLE) + continue; + rc = idxd_device_request_int_handle(idxd, i, &new_handle, IDXD_IRQ_MSIX); if (rc < 0) { dev_warn(dev, "get int handle %d failed: %d\n", i, rc); diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index c750eac09fc9..a8f0ff75c430 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -28,6 +28,7 @@ enum idxd_scmd_stat { IDXD_SCMD_WQ_NONE_CONFIGURED = 0x800d0000, IDXD_SCMD_WQ_NO_SIZE = 0x800e0000, IDXD_SCMD_WQ_NO_PRIV = 0x800f0000, + IDXD_SCMD_WQ_IRQ_ERR = 0x80100000, }; #define IDXD_SCMD_SOFTERR_MASK 0x80000000 -- Gitee From 5b9bf5d7d93ee9f35f62eff1369c0e9dd9ae16fb Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 14 Dec 2021 13:15:17 -0700 Subject: [PATCH 1709/2161] dmaengine: idxd: fix wq settings post wq disable commit 0f225705cf6536826318180831e18a74595efc8d upstream. By the spec, wq size and group association is not changeable unless device is disabled. Exclude clearing the shadow copy on wq disable/reset. This allows wq type to be changed after disable to be re-enabled. Move the size and group association to its own cleanup and only call it during device disable. Fixes: 0dcfe41e9a4c ("dmanegine: idxd: cleanup all device related bits after disabling device") Reported-by: Lucas Van Tested-by: Lucas Van Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163951291732.2987775.13576571320501115257.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 280b41417f41..3fce7629daa7 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -358,8 +358,6 @@ static void idxd_wq_disable_cleanup(struct idxd_wq *wq) lockdep_assert_held(&wq->wq_lock); memset(wq->wqcfg, 0, idxd->wqcfg_size); wq->type = IDXD_WQT_NONE; - wq->size = 0; - wq->group = NULL; wq->threshold = 0; wq->priority = 0; wq->ats_dis = 0; @@ -371,6 +369,15 @@ static void idxd_wq_disable_cleanup(struct idxd_wq *wq) wq->max_batch_size = WQ_DEFAULT_MAX_BATCH; } +static void idxd_wq_device_reset_cleanup(struct idxd_wq *wq) +{ + lockdep_assert_held(&wq->wq_lock); + + idxd_wq_disable_cleanup(wq); + wq->size = 0; + wq->group = NULL; +} + static void idxd_wq_ref_release(struct percpu_ref *ref) { struct idxd_wq *wq = container_of(ref, struct idxd_wq, wq_active); @@ -689,6 +696,7 @@ static void idxd_device_wqs_clear_state(struct idxd_device *idxd) if (wq->state == IDXD_WQ_ENABLED) { idxd_wq_disable_cleanup(wq); + idxd_wq_device_reset_cleanup(wq); wq->state = IDXD_WQ_DISABLED; } } -- Gitee From c34347646bfde8f6b090380512f451f401ccaf77 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 14 Dec 2021 13:23:09 -0700 Subject: [PATCH 1710/2161] dmaengine: idxd: change bandwidth token to read buffers commit 7ed6f1b85fb613e5e44ef3e14d73f2dc96860935 upstream. DSA spec v1.2 has changed the term of "bandwidth tokens" to "read buffers" in order to make the concept clearer. Deprecate bandwidth token naming in the driver and convert to read buffers in order to match with the spec and reduce confusion when reading the spec. 
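The rename is mechanical; the main mappings (not exhaustive) are:

    idxd->max_tokens        -> idxd->max_rdbufs
    idxd->token_limit       -> idxd->rdbuf_limit
    idxd->nr_tokens         -> idxd->nr_rdbufs
    group->use_token_limit  -> group->use_rdbuf_limit
    group->tokens_allowed   -> group->rdbufs_allowed
    group->tokens_reserved  -> group->rdbufs_reserved
    group_cap.total_tokens  -> group_cap.total_rdbufs
    group_cap.token_en      -> group_cap.rdbuf_ctrl
    gencfg.token_limit      -> gencfg.rdbuf_limit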
Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163951338932.2988321.6162640806935567317.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 25 +++++++++++---------- drivers/dma/idxd/idxd.h | 12 +++++------ drivers/dma/idxd/init.c | 6 +++--- drivers/dma/idxd/registers.h | 14 ++++++------ drivers/dma/idxd/sysfs.c | 42 ++++++++++++++++++------------------ 5 files changed, 49 insertions(+), 50 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 3fce7629daa7..573ad8b86804 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -678,9 +678,9 @@ static void idxd_groups_clear_state(struct idxd_device *idxd) memset(&group->grpcfg, 0, sizeof(group->grpcfg)); group->num_engines = 0; group->num_wqs = 0; - group->use_token_limit = false; - group->tokens_allowed = 0; - group->tokens_reserved = 0; + group->use_rdbuf_limit = false; + group->rdbufs_allowed = 0; + group->rdbufs_reserved = 0; group->tc_a = -1; group->tc_b = -1; } @@ -748,10 +748,10 @@ static int idxd_groups_config_write(struct idxd_device *idxd) int i; struct device *dev = &idxd->pdev->dev; - /* Setup bandwidth token limit */ - if (idxd->hw.gen_cap.config_en && idxd->token_limit) { + /* Setup bandwidth rdbuf limit */ + if (idxd->hw.gen_cap.config_en && idxd->rdbuf_limit) { reg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET); - reg.token_limit = idxd->token_limit; + reg.rdbuf_limit = idxd->rdbuf_limit; iowrite32(reg.bits, idxd->reg_base + IDXD_GENCFG_OFFSET); } @@ -889,13 +889,12 @@ static void idxd_group_flags_setup(struct idxd_device *idxd) group->tc_b = group->grpcfg.flags.tc_b = 1; else group->grpcfg.flags.tc_b = group->tc_b; - group->grpcfg.flags.use_token_limit = group->use_token_limit; - group->grpcfg.flags.tokens_reserved = group->tokens_reserved; - if (group->tokens_allowed) - group->grpcfg.flags.tokens_allowed = - group->tokens_allowed; + group->grpcfg.flags.use_rdbuf_limit = group->use_rdbuf_limit; + group->grpcfg.flags.rdbufs_reserved = group->rdbufs_reserved; + if (group->rdbufs_allowed) + group->grpcfg.flags.rdbufs_allowed = group->rdbufs_allowed; else - group->grpcfg.flags.tokens_allowed = idxd->max_tokens; + group->grpcfg.flags.rdbufs_allowed = idxd->max_rdbufs; } } @@ -1086,7 +1085,7 @@ int idxd_device_load_config(struct idxd_device *idxd) int i, rc; reg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET); - idxd->token_limit = reg.token_limit; + idxd->rdbuf_limit = reg.rdbuf_limit; for (i = 0; i < idxd->max_groups; i++) { struct idxd_group *group = idxd->groups[i]; diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 6353e762286d..da72eb15f610 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -90,9 +90,9 @@ struct idxd_group { int id; int num_engines; int num_wqs; - bool use_token_limit; - u8 tokens_allowed; - u8 tokens_reserved; + bool use_rdbuf_limit; + u8 rdbufs_allowed; + u8 rdbufs_reserved; int tc_a; int tc_b; }; @@ -292,11 +292,11 @@ struct idxd_device { u32 max_batch_size; int max_groups; int max_engines; - int max_tokens; + int max_rdbufs; int max_wqs; int max_wq_size; - int token_limit; - int nr_tokens; /* non-reserved tokens */ + int rdbuf_limit; + int nr_rdbufs; /* non-reserved read buffers */ unsigned int wqcfg_size; union sw_err_reg sw_err; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 3505efb7ae71..08a5f4310188 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -400,9 +400,9 @@ 
static void idxd_read_caps(struct idxd_device *idxd) dev_dbg(dev, "group_cap: %#llx\n", idxd->hw.group_cap.bits); idxd->max_groups = idxd->hw.group_cap.num_groups; dev_dbg(dev, "max groups: %u\n", idxd->max_groups); - idxd->max_tokens = idxd->hw.group_cap.total_tokens; - dev_dbg(dev, "max tokens: %u\n", idxd->max_tokens); - idxd->nr_tokens = idxd->max_tokens; + idxd->max_rdbufs = idxd->hw.group_cap.total_rdbufs; + dev_dbg(dev, "max read buffers: %u\n", idxd->max_rdbufs); + idxd->nr_rdbufs = idxd->max_rdbufs; /* read engine capabilities */ idxd->hw.engine_cap.bits = diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 8e396698c22b..aa642aecdc0b 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -64,9 +64,9 @@ union wq_cap_reg { union group_cap_reg { struct { u64 num_groups:8; - u64 total_tokens:8; - u64 token_en:1; - u64 token_limit:1; + u64 total_rdbufs:8; /* formerly total_tokens */ + u64 rdbuf_ctrl:1; /* formerly token_en */ + u64 rdbuf_limit:1; /* formerly token_limit */ u64 rsvd:46; }; u64 bits; @@ -110,7 +110,7 @@ union offsets_reg { #define IDXD_GENCFG_OFFSET 0x80 union gencfg_reg { struct { - u32 token_limit:8; + u32 rdbuf_limit:8; u32 rsvd:4; u32 user_int_en:1; u32 rsvd2:19; @@ -288,10 +288,10 @@ union group_flags { u32 tc_a:3; u32 tc_b:3; u32 rsvd:1; - u32 use_token_limit:1; - u32 tokens_reserved:8; + u32 use_rdbuf_limit:1; + u32 rdbufs_reserved:8; u32 rsvd2:4; - u32 tokens_allowed:8; + u32 rdbufs_allowed:8; u32 rsvd3:4; }; u32 bits; diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 13404532131b..6f1ebf08878a 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -99,17 +99,17 @@ struct device_type idxd_engine_device_type = { /* Group attributes */ -static void idxd_set_free_tokens(struct idxd_device *idxd) +static void idxd_set_free_rdbufs(struct idxd_device *idxd) { - int i, tokens; + int i, rdbufs; - for (i = 0, tokens = 0; i < idxd->max_groups; i++) { + for (i = 0, rdbufs = 0; i < idxd->max_groups; i++) { struct idxd_group *g = idxd->groups[i]; - tokens += g->tokens_reserved; + rdbufs += g->rdbufs_reserved; } - idxd->nr_tokens = idxd->max_tokens - tokens; + idxd->nr_rdbufs = idxd->max_rdbufs - rdbufs; } static ssize_t group_tokens_reserved_show(struct device *dev, @@ -118,7 +118,7 @@ static ssize_t group_tokens_reserved_show(struct device *dev, { struct idxd_group *group = confdev_to_group(dev); - return sysfs_emit(buf, "%u\n", group->tokens_reserved); + return sysfs_emit(buf, "%u\n", group->rdbufs_reserved); } static ssize_t group_tokens_reserved_store(struct device *dev, @@ -143,14 +143,14 @@ static ssize_t group_tokens_reserved_store(struct device *dev, if (idxd->state == IDXD_DEV_ENABLED) return -EPERM; - if (val > idxd->max_tokens) + if (val > idxd->max_rdbufs) return -EINVAL; - if (val > idxd->nr_tokens + group->tokens_reserved) + if (val > idxd->nr_rdbufs + group->rdbufs_reserved) return -EINVAL; - group->tokens_reserved = val; - idxd_set_free_tokens(idxd); + group->rdbufs_reserved = val; + idxd_set_free_rdbufs(idxd); return count; } @@ -164,7 +164,7 @@ static ssize_t group_tokens_allowed_show(struct device *dev, { struct idxd_group *group = confdev_to_group(dev); - return sysfs_emit(buf, "%u\n", group->tokens_allowed); + return sysfs_emit(buf, "%u\n", group->rdbufs_allowed); } static ssize_t group_tokens_allowed_store(struct device *dev, @@ -190,10 +190,10 @@ static ssize_t group_tokens_allowed_store(struct device *dev, return -EPERM; if (val < 4 * group->num_engines || - val > 
group->tokens_reserved + idxd->nr_tokens) + val > group->rdbufs_reserved + idxd->nr_rdbufs) return -EINVAL; - group->tokens_allowed = val; + group->rdbufs_allowed = val; return count; } @@ -207,7 +207,7 @@ static ssize_t group_use_token_limit_show(struct device *dev, { struct idxd_group *group = confdev_to_group(dev); - return sysfs_emit(buf, "%u\n", group->use_token_limit); + return sysfs_emit(buf, "%u\n", group->use_rdbuf_limit); } static ssize_t group_use_token_limit_store(struct device *dev, @@ -232,10 +232,10 @@ static ssize_t group_use_token_limit_store(struct device *dev, if (idxd->state == IDXD_DEV_ENABLED) return -EPERM; - if (idxd->token_limit == 0) + if (idxd->rdbuf_limit == 0) return -EPERM; - group->use_token_limit = !!val; + group->use_rdbuf_limit = !!val; return count; } @@ -1197,7 +1197,7 @@ static ssize_t max_tokens_show(struct device *dev, { struct idxd_device *idxd = confdev_to_idxd(dev); - return sysfs_emit(buf, "%u\n", idxd->max_tokens); + return sysfs_emit(buf, "%u\n", idxd->max_rdbufs); } static DEVICE_ATTR_RO(max_tokens); @@ -1206,7 +1206,7 @@ static ssize_t token_limit_show(struct device *dev, { struct idxd_device *idxd = confdev_to_idxd(dev); - return sysfs_emit(buf, "%u\n", idxd->token_limit); + return sysfs_emit(buf, "%u\n", idxd->rdbuf_limit); } static ssize_t token_limit_store(struct device *dev, @@ -1227,13 +1227,13 @@ static ssize_t token_limit_store(struct device *dev, if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) return -EPERM; - if (!idxd->hw.group_cap.token_limit) + if (!idxd->hw.group_cap.rdbuf_limit) return -EPERM; - if (val > idxd->hw.group_cap.total_tokens) + if (val > idxd->hw.group_cap.total_rdbufs) return -EINVAL; - idxd->token_limit = val; + idxd->rdbuf_limit = val; return count; } static DEVICE_ATTR_RW(token_limit); -- Gitee From 42de5ecbef3ba0d96e338306f764638a259cafdf Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 14 Dec 2021 13:23:14 -0700 Subject: [PATCH 1711/2161] dmaengine: idxd: deprecate token sysfs attributes for read buffers commit fde212e44f45e491f8e3875084b587c0c2189078 upstream. The following sysfs attributes will be obsolete due to the name change of tokens to read buffers: max_tokens token_limit group/tokens_allowed group/tokens_reserved group/use_token_limit Create new entries and have old entry print warning of deprecation. New attributes to replace the token ones: max_read_buffers read_buffer_limit group/read_buffers_allowed group/read_buffers_reserved group/use_read_buffer_limit Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/163951339488.2988321.2424012059911316373.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../ABI/stable/sysfs-driver-dma-idxd | 47 ++++-- drivers/dma/idxd/sysfs.c | 145 ++++++++++++++---- 2 files changed, 153 insertions(+), 39 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd index 6bbaeace2aa6..e1293bfa4187 100644 --- a/Documentation/ABI/stable/sysfs-driver-dma-idxd +++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd @@ -41,14 +41,14 @@ KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The maximum number of groups can be created under this device. 
-What: /sys/bus/dsa/devices/dsa/max_tokens -Date: Oct 25, 2019 -KernelVersion: 5.6.0 +What: /sys/bus/dsa/devices/dsa/max_read_buffers +Date: Dec 10, 2021 +KernelVersion: 5.17.0 Contact: dmaengine@vger.kernel.org -Description: The total number of bandwidth tokens supported by this device. - The bandwidth tokens represent resources within the DSA +Description: The total number of read buffers supported by this device. + The read buffers represent resources within the DSA implementation, and these resources are allocated by engines to - support operations. + support operations. See DSA spec v1.2 9.2.4 Total Read Buffers. What: /sys/bus/dsa/devices/dsa/max_transfer_size Date: Oct 25, 2019 @@ -115,13 +115,13 @@ KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: To indicate if this device is configurable or not. -What: /sys/bus/dsa/devices/dsa/token_limit -Date: Oct 25, 2019 -KernelVersion: 5.6.0 +What: /sys/bus/dsa/devices/dsa/read_buffer_limit +Date: Dec 10, 2021 +KernelVersion: 5.17.0 Contact: dmaengine@vger.kernel.org -Description: The maximum number of bandwidth tokens that may be in use at +Description: The maximum number of read buffers that may be in use at one time by operations that access low bandwidth memory in the - device. + device. See DSA spec v1.2 9.2.8 GENCFG on Global Read Buffer Limit. What: /sys/bus/dsa/devices/dsa/cmd_status Date: Aug 28, 2020 @@ -224,7 +224,7 @@ What: /sys/bus/dsa/devices/wq./enqcmds_retries Date Oct 29, 2021 KernelVersion: 5.17.0 Contact: dmaengine@vger.kernel.org -Description: Indicate the number of retires for an enqcmds submission on a shared wq. +Description: Indicate the number of retires for an enqcmds submission on a sharedwq. A max value to set attribute is capped at 64. What: /sys/bus/dsa/devices/engine./group_id @@ -232,3 +232,26 @@ Date: Oct 25, 2019 KernelVersion: 5.6.0 Contact: dmaengine@vger.kernel.org Description: The group that this engine belongs to. + +What: /sys/bus/dsa/devices/group./use_read_buffer_limit +Date: Dec 10, 2021 +KernelVersion: 5.17.0 +Contact: dmaengine@vger.kernel.org +Description: Enable the use of global read buffer limit for the group. See DSA + spec v1.2 9.2.18 GRPCFG Use Global Read Buffer Limit. + +What: /sys/bus/dsa/devices/group./read_buffers_allowed +Date: Dec 10, 2021 +KernelVersion: 5.17.0 +Contact: dmaengine@vger.kernel.org +Description: Indicates max number of read buffers that may be in use at one time + by all engines in the group. See DSA spec v1.2 9.2.18 GRPCFG Read + Buffers Allowed. + +What: /sys/bus/dsa/devices/group./read_buffers_reserved +Date: Dec 10, 2021 +KernelVersion: 5.17.0 +Contact: dmaengine@vger.kernel.org +Description: Indicates the number of Read Buffers reserved for the use of + engines in the group. See DSA spec v1.2 9.2.18 GRPCFG Read Buffers + Reserved. 
diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 6f1ebf08878a..7e19ab92b61a 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -112,18 +112,26 @@ static void idxd_set_free_rdbufs(struct idxd_device *idxd) idxd->nr_rdbufs = idxd->max_rdbufs - rdbufs; } -static ssize_t group_tokens_reserved_show(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t group_read_buffers_reserved_show(struct device *dev, + struct device_attribute *attr, + char *buf) { struct idxd_group *group = confdev_to_group(dev); return sysfs_emit(buf, "%u\n", group->rdbufs_reserved); } -static ssize_t group_tokens_reserved_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t group_tokens_reserved_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + dev_warn_once(dev, "attribute deprecated, see read_buffers_reserved.\n"); + return group_read_buffers_reserved_show(dev, attr, buf); +} + +static ssize_t group_read_buffers_reserved_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { struct idxd_group *group = confdev_to_group(dev); struct idxd_device *idxd = group->idxd; @@ -154,22 +162,42 @@ static ssize_t group_tokens_reserved_store(struct device *dev, return count; } +static ssize_t group_tokens_reserved_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + dev_warn_once(dev, "attribute deprecated, see read_buffers_reserved.\n"); + return group_read_buffers_reserved_store(dev, attr, buf, count); +} + static struct device_attribute dev_attr_group_tokens_reserved = __ATTR(tokens_reserved, 0644, group_tokens_reserved_show, group_tokens_reserved_store); -static ssize_t group_tokens_allowed_show(struct device *dev, - struct device_attribute *attr, - char *buf) +static struct device_attribute dev_attr_group_read_buffers_reserved = + __ATTR(read_buffers_reserved, 0644, group_read_buffers_reserved_show, + group_read_buffers_reserved_store); + +static ssize_t group_read_buffers_allowed_show(struct device *dev, + struct device_attribute *attr, + char *buf) { struct idxd_group *group = confdev_to_group(dev); return sysfs_emit(buf, "%u\n", group->rdbufs_allowed); } -static ssize_t group_tokens_allowed_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t group_tokens_allowed_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + dev_warn_once(dev, "attribute deprecated, see read_buffers_allowed.\n"); + return group_read_buffers_allowed_show(dev, attr, buf); +} + +static ssize_t group_read_buffers_allowed_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { struct idxd_group *group = confdev_to_group(dev); struct idxd_device *idxd = group->idxd; @@ -197,22 +225,42 @@ static ssize_t group_tokens_allowed_store(struct device *dev, return count; } +static ssize_t group_tokens_allowed_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + dev_warn_once(dev, "attribute deprecated, see read_buffers_allowed.\n"); + return group_read_buffers_allowed_store(dev, attr, buf, count); +} + static struct device_attribute dev_attr_group_tokens_allowed = __ATTR(tokens_allowed, 0644, group_tokens_allowed_show, group_tokens_allowed_store); -static ssize_t group_use_token_limit_show(struct device *dev, - struct device_attribute *attr, - char *buf) +static struct 
device_attribute dev_attr_group_read_buffers_allowed = + __ATTR(read_buffers_allowed, 0644, group_read_buffers_allowed_show, + group_read_buffers_allowed_store); + +static ssize_t group_use_read_buffer_limit_show(struct device *dev, + struct device_attribute *attr, + char *buf) { struct idxd_group *group = confdev_to_group(dev); return sysfs_emit(buf, "%u\n", group->use_rdbuf_limit); } -static ssize_t group_use_token_limit_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t group_use_token_limit_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + dev_warn_once(dev, "attribute deprecated, see use_read_buffer_limit.\n"); + return group_use_read_buffer_limit_show(dev, attr, buf); +} + +static ssize_t group_use_read_buffer_limit_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { struct idxd_group *group = confdev_to_group(dev); struct idxd_device *idxd = group->idxd; @@ -239,10 +287,22 @@ static ssize_t group_use_token_limit_store(struct device *dev, return count; } +static ssize_t group_use_token_limit_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + dev_warn_once(dev, "attribute deprecated, see use_read_buffer_limit.\n"); + return group_use_read_buffer_limit_store(dev, attr, buf, count); +} + static struct device_attribute dev_attr_group_use_token_limit = __ATTR(use_token_limit, 0644, group_use_token_limit_show, group_use_token_limit_store); +static struct device_attribute dev_attr_group_use_read_buffer_limit = + __ATTR(use_read_buffer_limit, 0644, group_use_read_buffer_limit_show, + group_use_read_buffer_limit_store); + static ssize_t group_engines_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -387,8 +447,11 @@ static struct attribute *idxd_group_attributes[] = { &dev_attr_group_work_queues.attr, &dev_attr_group_engines.attr, &dev_attr_group_use_token_limit.attr, + &dev_attr_group_use_read_buffer_limit.attr, &dev_attr_group_tokens_allowed.attr, + &dev_attr_group_read_buffers_allowed.attr, &dev_attr_group_tokens_reserved.attr, + &dev_attr_group_read_buffers_reserved.attr, &dev_attr_group_traffic_class_a.attr, &dev_attr_group_traffic_class_b.attr, NULL, @@ -1192,26 +1255,42 @@ static ssize_t errors_show(struct device *dev, } static DEVICE_ATTR_RO(errors); -static ssize_t max_tokens_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t max_read_buffers_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct idxd_device *idxd = confdev_to_idxd(dev); return sysfs_emit(buf, "%u\n", idxd->max_rdbufs); } -static DEVICE_ATTR_RO(max_tokens); -static ssize_t token_limit_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t max_tokens_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + dev_warn_once(dev, "attribute deprecated, see max_read_buffers.\n"); + return max_read_buffers_show(dev, attr, buf); +} + +static DEVICE_ATTR_RO(max_tokens); /* deprecated */ +static DEVICE_ATTR_RO(max_read_buffers); + +static ssize_t read_buffer_limit_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct idxd_device *idxd = confdev_to_idxd(dev); return sysfs_emit(buf, "%u\n", idxd->rdbuf_limit); } -static ssize_t token_limit_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t token_limit_show(struct device *dev, + struct device_attribute *attr, 
char *buf) +{ + dev_warn_once(dev, "attribute deprecated, see read_buffer_limit.\n"); + return read_buffer_limit_show(dev, attr, buf); +} + +static ssize_t read_buffer_limit_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { struct idxd_device *idxd = confdev_to_idxd(dev); unsigned long val; @@ -1236,7 +1315,17 @@ static ssize_t token_limit_store(struct device *dev, idxd->rdbuf_limit = val; return count; } -static DEVICE_ATTR_RW(token_limit); + +static ssize_t token_limit_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + dev_warn_once(dev, "attribute deprecated, see read_buffer_limit\n"); + return read_buffer_limit_store(dev, attr, buf, count); +} + +static DEVICE_ATTR_RW(token_limit); /* deprecated */ +static DEVICE_ATTR_RW(read_buffer_limit); static ssize_t cdev_major_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1282,7 +1371,9 @@ static struct attribute *idxd_device_attributes[] = { &dev_attr_state.attr, &dev_attr_errors.attr, &dev_attr_max_tokens.attr, + &dev_attr_max_read_buffers.attr, &dev_attr_token_limit.attr, + &dev_attr_read_buffer_limit.attr, &dev_attr_cdev_major.attr, &dev_attr_cmd_status.attr, NULL, -- Gitee From 3715d70d4d23580623de9607ea4cbb916fa5ea34 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 24 Jan 2022 09:20:33 -0700 Subject: [PATCH 1712/2161] dmaengine: idxd: restore traffic class defaults after wq reset commit ea7c8f598c323f6ebaf9ddae01fb2a981fe8c56a upstream. When clearing the group configurations, the driver fails to restore the default setting for DSA 1.x based devices. Add defaults in idxd_groups_clear_state() for traffic class configuration. Fixes: ade8a86b512c ("dmaengine: idxd: Set defaults for GRPCFG traffic class") Reported-by: Binuraj Ravindran Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/164304123369.824298.6952463420266592087.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 573ad8b86804..3061fe857d69 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -681,8 +681,13 @@ static void idxd_groups_clear_state(struct idxd_device *idxd) group->use_rdbuf_limit = false; group->rdbufs_allowed = 0; group->rdbufs_reserved = 0; - group->tc_a = -1; - group->tc_b = -1; + if (idxd->hw.version < DEVICE_VERSION_2 && !tc_override) { + group->tc_a = 1; + group->tc_b = 1; + } else { + group->tc_a = -1; + group->tc_b = -1; + } } } -- Gitee From 320e6425c1d7c5790ba97ebe4fa292c7070d8b6e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 15 Jan 2022 08:24:20 +0100 Subject: [PATCH 1713/2161] dmaengine: idxd: Remove useless DMA-32 fallback configuration commit b6f2f0352c0302076dcd0e0297592ea7d37f2242 upstream. As stated in [1], dma_set_mask() with a 64-bit mask never fails if dev->dma_mask is non-NULL. So, if it fails, the 32 bits case will also fail for the same reason. Simplify code and remove some dead code accordingly. 
[1]: https://lore.kernel.org/linux-kernel/YL3vSPK5DXTNvgdx@infradead.org/#t Signed-off-by: Christophe JAILLET Acked-by: Dave Jiang Link: https://lore.kernel.org/r/009c80294dba72858cd8a6ed2ed81041df1b1e82.1642231430.git.christophe.jaillet@wanadoo.fr Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 08a5f4310188..993a5dcca24f 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -604,8 +604,6 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev_dbg(dev, "Set DMA masks\n"); rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); - if (rc) - rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); if (rc) goto err; -- Gitee From b9963f1563ca691df8bc117763707021f9ddeeae Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 5 Apr 2022 14:53:39 -0700 Subject: [PATCH 1714/2161] dmaengine: idxd: fix device cleanup on disable commit 12e45e89556d7a532120f976081e9e7582addd2b upstream. There are certain parts of WQ that needs to be cleaned up even after WQ is disabled during the device disable. Those are the unchangeable parts for a WQ when the device is still enabled. Move the cleanup outside of WQ state check. Remove idxd_wq_disable_cleanup() inside idxd_wq_device_reset_cleanup() since only the unchangeable parts need to be cleared. Fixes: 0f225705cf65 ("dmaengine: idxd: fix wq settings post wq disable") Reported-by: Tony Zhu Tested-by: Tony Zhu Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/164919561905.1455025.13542366389944678346.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 3061fe857d69..5a0535a0f850 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -373,7 +373,6 @@ static void idxd_wq_device_reset_cleanup(struct idxd_wq *wq) { lockdep_assert_held(&wq->wq_lock); - idxd_wq_disable_cleanup(wq); wq->size = 0; wq->group = NULL; } @@ -701,9 +700,9 @@ static void idxd_device_wqs_clear_state(struct idxd_device *idxd) if (wq->state == IDXD_WQ_ENABLED) { idxd_wq_disable_cleanup(wq); - idxd_wq_device_reset_cleanup(wq); wq->state = IDXD_WQ_DISABLED; } + idxd_wq_device_reset_cleanup(wq); } } -- Gitee From 9a6c7fe7944eae933921b9d6d7c12de9c3a570de Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 18 Apr 2022 14:31:10 -0700 Subject: [PATCH 1715/2161] dmaengine: idxd: match type for retries var in idxd_enqcmds() commit 5d9d16e5aa0cf023e600bf716239fd9caa2d4148 upstream. wq->enqcmds_retries is defined as unsigned int. However, retries on the stack is defined as int. Change retries to unsigned int to compare the same type. 
Fixes: 7930d8553575 ("dmaengine: idxd: add knob for enqcmds retries") Suggested-by: Thiago Macieira Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/165031747059.3658198.6035308204505664375.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/submit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index e289fd48711a..554b0602d2e9 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -150,7 +150,8 @@ static void llist_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie, */ int idxd_enqcmds(struct idxd_wq *wq, void __iomem *portal, const void *desc) { - int rc, retries = 0; + unsigned int retries = 0; + int rc; do { rc = enqcmds(portal, desc); -- Gitee From 6256d59f034218c1362ffd96551fbdbf1f99de6e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 18 Apr 2022 14:33:21 -0700 Subject: [PATCH 1716/2161] dmaengine: idxd: fix retry value to be constant for duration of function call commit bc3452cdfc468a65965d0ac397c940acb787ea4d upstream. When retries is compared to wq->enqcmds_retries each loop of idxd_enqcmds(), wq->enqcmds_retries can potentially changed by user. Assign the value of retries to wq->enqcmds_retries during initialization so it is the original value set when entering the function. Fixes: 7930d8553575 ("dmaengine: idxd: add knob for enqcmds retries") Suggested-by: Dave Hansen Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/165031760154.3658664.1983547716619266558.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/submit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 554b0602d2e9..c01db23e3333 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -150,7 +150,7 @@ static void llist_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie, */ int idxd_enqcmds(struct idxd_wq *wq, void __iomem *portal, const void *desc) { - unsigned int retries = 0; + unsigned int retries = wq->enqcmds_retries; int rc; do { @@ -158,7 +158,7 @@ int idxd_enqcmds(struct idxd_wq *wq, void __iomem *portal, const void *desc) if (rc == 0) break; cpu_relax(); - } while (retries++ < wq->enqcmds_retries); + } while (retries--); return rc; } -- Gitee From f8b0d190e7944613f3a1343e7f3bb6d42fecd0d1 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 11 Apr 2022 15:08:55 -0700 Subject: [PATCH 1717/2161] dmaengine: idxd: add RO check for wq max_batch_size write commit 66903461ffed0b66fc3e0200082d4e09365aacdc upstream. Block wq_max_batch_size_store() when the device is configured as read-only and not configurable. 
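The added guard mirrors the existing wq-state check (as in the hunk below):

    if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
        return -EPERM;      /* device configuration is read-only */

    if (wq->state != IDXD_WQ_DISABLED)
        return -EPERM;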
Fixes: e7184b159dd3 ("dmaengine: idxd: add support for configurable max wq batch size") Reported-by: Bernice Zhang Tested-by: Bernice Zhang Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/164971493551.2201159.1942042593642155209.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 7e19ab92b61a..6c41d429bd89 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -939,6 +939,9 @@ static ssize_t wq_max_batch_size_store(struct device *dev, struct device_attribu u64 batch_size; int rc; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + if (wq->state != IDXD_WQ_DISABLED) return -EPERM; -- Gitee From d56146ead42f0e7b38eeef79b83314a62ba0d537 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 11 Apr 2022 15:08:01 -0700 Subject: [PATCH 1718/2161] dmaengine: idxd: add RO check for wq max_transfer_size write commit 66903461ffed0b66fc3e0200082d4e09365aacdc upstream. Block wq_max_transfer_size_store() when the device is configured as read-only and not configurable. Fixes: d7aad5550eca ("dmaengine: idxd: add support for configurable max wq xfer size") Reported-by: Bernice Zhang Tested-by: Bernice Zhang Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/164971488154.2200913.10706665404118545941.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 6c41d429bd89..dfd549685c46 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -905,6 +905,9 @@ static ssize_t wq_max_transfer_size_store(struct device *dev, struct device_attr u64 xfer_size; int rc; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + if (wq->state != IDXD_WQ_DISABLED) return -EPERM; -- Gitee From cb97dbb0a3c92864c177b148d5fad93c0776ff29 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 11 Apr 2022 15:06:34 -0700 Subject: [PATCH 1719/2161] dmaengine: idxd: skip clearing device context when device is read-only commit 1cd8e751d96c43ece3f6842ac2244a37d9332c3a upstream. If the device shows up as read-only configuration, skip the clearing of the state as the context must be preserved for device re-enable after being disabled. 
Fixes: 0dcfe41e9a4c ("dmanegine: idxd: cleanup all device related bits after disabling device") Reported-by: Tony Zhu Tested-by: Tony Zhu Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/164971479479.2200566.13980022473526292759.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 5a0535a0f850..f652da6ab47d 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -708,6 +708,9 @@ static void idxd_device_wqs_clear_state(struct idxd_device *idxd) void idxd_device_clear_state(struct idxd_device *idxd) { + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return; + idxd_groups_clear_state(idxd); idxd_engines_clear_state(idxd); idxd_device_wqs_clear_state(idxd); -- Gitee From 4517f7168d0a860f684bd3336a65392a93290c17 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 22 Jun 2022 11:28:55 +0800 Subject: [PATCH 1720/2161] x86/irq: Add DEV_MSI allocation type commit 5ea8b29ab30976e197727fc83ddf810891fbf03f Intel-BKC. For the upcoming device MSI support a new allocation type is required. Reviewed-by: Tony Luck Signed-off-by: Thomas Gleixner Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/hw_irq.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 6e1194387b18..b466be8673b8 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -62,6 +62,7 @@ enum irq_alloc_type { X86_IRQ_ALLOC_TYPE_PCI_MSIX, X86_IRQ_ALLOC_TYPE_DMAR, X86_IRQ_ALLOC_TYPE_UV, + X86_IRQ_ALLOC_TYPE_DEV_MSI, }; struct ioapic_alloc_info { -- Gitee From dc050991b03a72b982543008166fd161e6dc0ac9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 30 Oct 2020 15:01:43 -0700 Subject: [PATCH 1721/2161] x86/msi: Rename and rework pci_msi_prepare() to cover non-PCI MSI commit 44dbe1c1654f119fd27faf0dba6ee268d118805b Intel-BKC. Rename it to x86_msi_prepare() and handle the allocation type setup depending on the device type. Add a new arch_msi_prepare define which will be utilized by the upcoming device MSI support. Define it to NULL if not provided by an architecture in the generic MSI header. One arch specific function for MSI support is truly enough. 
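Condensed, the renamed hook simply dispatches on the device type (see the diff below):

    int x86_msi_prepare(struct irq_domain *domain, struct device *dev,
                        int nvec, msi_alloc_info_t *arg)
    {
        init_irq_alloc_info(arg, NULL);
        if (dev_is_pci(dev))
            pci_msi_prepare(dev, arg);  /* MSI or MSI-X */
        else
            dev_msi_prepare(dev, arg);  /* X86_IRQ_ALLOC_TYPE_DEV_MSI */
        return 0;
    }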
Reviewed-by: Tony Luck Signed-off-by: Thomas Gleixner Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/msi.h | 4 +++- arch/x86/kernel/apic/msi.c | 27 ++++++++++++++++++++------- drivers/pci/controller/pci-hyperv.c | 2 +- include/linux/msi.h | 4 ++++ 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h index 322fd905da9c..2db0c2bc88a8 100644 --- a/arch/x86/include/asm/msi.h +++ b/arch/x86/include/asm/msi.h @@ -6,9 +6,11 @@ typedef struct irq_alloc_info msi_alloc_info_t; -int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, +int x86_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg); +#define arch_msi_prepare x86_msi_prepare + /* Structs and defines for the X86 specific MSI message format */ typedef struct x86_msi_data { diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 7ced252da3d2..5691a2bc6d25 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -155,26 +155,39 @@ static struct irq_chip pci_msi_controller = { IRQCHIP_AFFINITY_PRE_STARTUP, }; -int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, - msi_alloc_info_t *arg) +static void pci_msi_prepare(struct device *dev, msi_alloc_info_t *arg) { - struct pci_dev *pdev = to_pci_dev(dev); - struct msi_desc *desc = first_pci_msi_entry(pdev); + struct msi_desc *desc = first_msi_entry(dev); - init_irq_alloc_info(arg, NULL); if (desc->msi_attrib.is_msix) { arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; } else { arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; } +} + +static void dev_msi_prepare(struct device *dev, msi_alloc_info_t *arg) +{ + arg->type = X86_IRQ_ALLOC_TYPE_DEV_MSI; +} + +int x86_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, + msi_alloc_info_t *arg) +{ + init_irq_alloc_info(arg, NULL); + + if (dev_is_pci(dev)) + pci_msi_prepare(dev, arg); + else + dev_msi_prepare(dev, arg); return 0; } -EXPORT_SYMBOL_GPL(pci_msi_prepare); +EXPORT_SYMBOL_GPL(x86_msi_prepare); static struct msi_domain_ops pci_msi_domain_ops = { - .msi_prepare = pci_msi_prepare, + .msi_prepare = x86_msi_prepare, }; static struct msi_domain_info pci_msi_domain_info = { diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 09e4f249728e..1688d2ae00cd 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1511,7 +1511,7 @@ static struct irq_chip hv_msi_irq_chip = { }; static struct msi_domain_ops hv_msi_ops = { - .msi_prepare = pci_msi_prepare, + .msi_prepare = arch_msi_prepare, .msi_free = hv_msi_free, }; diff --git a/include/linux/msi.h b/include/linux/msi.h index fe1ab1ba2551..4b58edfd6dbe 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -446,4 +446,8 @@ static inline struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) } #endif /* CONFIG_PCI_MSI_IRQ_DOMAIN */ +#ifndef arch_msi_prepare +# define arch_msi_prepare NULL +#endif + #endif /* LINUX_MSI_H */ -- Gitee From 860ce2687f85d3f47da75e5bc33af050da61a33d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 21 Aug 2020 02:24:59 +0200 Subject: [PATCH 1722/2161] platform-msi: Provide default irq_chip:: Ack commit 44c5ba11fc7fe476275d84cc41c95cf6301cd483 Intel-BKC. 
For the upcoming device MSI support it's required to have a default irq_chip::ack implementation (irq_chip_ack_parent) so the drivers do not need to care. Reviewed-by: Tony Luck Signed-off-by: Thomas Gleixner Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/base/platform-msi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c index 8da314b81eab..4f68251d1932 100644 --- a/drivers/base/platform-msi.c +++ b/drivers/base/platform-msi.c @@ -95,6 +95,8 @@ static void platform_msi_update_chip_ops(struct msi_domain_info *info) chip->irq_mask = irq_chip_mask_parent; if (!chip->irq_unmask) chip->irq_unmask = irq_chip_unmask_parent; + if (!chip->irq_ack) + chip->irq_ack = irq_chip_ack_parent; if (!chip->irq_eoi) chip->irq_eoi = irq_chip_eoi_parent; if (!chip->irq_set_affinity) -- Gitee From 0e2776e21203d2f13bc85074be0a316eabb1cb3c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 18 May 2021 11:17:26 +0200 Subject: [PATCH 1723/2161] genirq: Export affinity setter for modules commit 4d80d6ca5d77fde9880da8466e5b64f250e5bf82 upstream. Perf modules abuse irq_set_affinity_hint() to set the affinity of system PMU interrupts just because irq_set_affinity() was not exported. The fact that irq_set_affinity_hint() actually sets the affinity is a non-documented side effect and the name is clearly saying it's a hint. To clean this up, export the real affinity setter. Signed-off-by: Thomas Gleixner Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210518093117.968251441@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/interrupt.h | 35 ++--------------------------------- kernel/irq/manage.c | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index cf34dbaf2c11..569f87d699cc 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -301,39 +301,8 @@ struct irq_affinity_desc { extern cpumask_var_t irq_default_affinity; -/* Internal implementation. Use the helpers below */ -extern int __irq_set_affinity(unsigned int irq, const struct cpumask *cpumask, - bool force); - -/** - * irq_set_affinity - Set the irq affinity of a given irq - * @irq: Interrupt to set affinity - * @cpumask: cpumask - * - * Fails if cpumask does not contain an online CPU - */ -static inline int -irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) -{ - return __irq_set_affinity(irq, cpumask, false); -} - -/** - * irq_force_affinity - Force the irq affinity of a given irq - * @irq: Interrupt to set affinity - * @cpumask: cpumask - * - * Same as irq_set_affinity, but without checking the mask against - * online cpus. - * - * Solely for low level cpu hotplug code, where we need to make per - * cpu interrupts affine before the cpu becomes online. 
- */ -static inline int -irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) -{ - return __irq_set_affinity(irq, cpumask, true); -} +extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask); +extern int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 918fe0593386..39cee1fea51e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -332,7 +332,8 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, return ret; } -int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) +static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, + bool force) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -347,6 +348,36 @@ int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) return ret; } +/** + * irq_set_affinity - Set the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @cpumask: cpumask + * + * Fails if cpumask does not contain an online CPU + */ +int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) +{ + return __irq_set_affinity(irq, cpumask, false); +} +EXPORT_SYMBOL_GPL(irq_set_affinity); + +/** + * irq_force_affinity - Force the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @cpumask: cpumask + * + * Same as irq_set_affinity, but without checking the mask against + * online cpus. + * + * Solely for low level cpu hotplug code, where we need to make per + * cpu interrupts affine before the cpu becomes online. + */ +int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) +{ + return __irq_set_affinity(irq, cpumask, true); +} +EXPORT_SYMBOL_GPL(irq_force_affinity); + int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { unsigned long flags; -- Gitee From 2ab622c3a7cf641610e0b09374136dde0833246a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Aug 2020 19:06:40 +0200 Subject: [PATCH 1724/2161] genirq/proc: Take buslock on affinity write commit a99ffe1a97b9a6151c2f8913fad6da2face81ef8 Intel-BKC. Until now interrupt chips which support setting affinity are not locking the associated bus lock for two reasons: - All chips which support affinity setting do not use buslock because they just can operated directly on the hardware. - All chips which use buslock do not support affinity setting because their interrupt chips are not capable. These chips are usually connected over a bus like I2C, SPI etc. and have an interrupt output which is conneted to CPU interrupt of some sort. So there is no way to set the affinity on the chip itself. Upcoming hardware which is PCIE based sports a non standard MSI(X) variant which stores the MSI message in RAM which is associated to e.g. a device queue. The device manages this RAM and writes have to be issued via command queues or similar mechanisms which is obviously not possible from interrupt disabled, raw spinlock held context. The buslock mechanism of irq chips can be utilized to support that. The affinity write to the chip writes to shadow state, marks it pending and the irq chip's irq_bus_sync_unlock() callback handles the command queue and wait for completion similar to the other chip operations on I2C or SPI buses. Change the locking in irq_set_affinity() to bus_lock/unlock to help with that. 
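For illustration only (not part of this change), an irq chip for such command-queue based hardware could implement the scheme roughly as sketched below. The my_* names and the two command-queue helpers are hypothetical placeholders, not an existing driver API:

    #include <linux/irq.h>
    #include <linux/mutex.h>
    #include <linux/cpumask.h>

    /* Hypothetical stand-ins for the device's command-queue handling */
    static void my_queue_affinity_cmd(struct irq_data *d,
                                      const struct cpumask *mask) { /* issue command */ }
    static void my_wait_affinity_cmd(struct irq_data *d) { /* wait for completion */ }

    struct my_msi_chip_data {
            struct mutex buslock;          /* serializes command-queue access */
            struct cpumask pending_mask;   /* shadow affinity state */
            bool affinity_pending;
    };

    static void my_msi_irq_bus_lock(struct irq_data *d)
    {
            struct my_msi_chip_data *cd = irq_data_get_irq_chip_data(d);

            /* Sleepable context, taken before desc->lock */
            mutex_lock(&cd->buslock);
    }

    static int my_msi_irq_set_affinity(struct irq_data *d,
                                       const struct cpumask *mask, bool force)
    {
            struct my_msi_chip_data *cd = irq_data_get_irq_chip_data(d);

            /* Runs under desc->lock with interrupts disabled: only update
             * the shadow state and mark it pending. */
            cpumask_copy(&cd->pending_mask, mask);
            cd->affinity_pending = true;
            return IRQ_SET_MASK_OK;
    }

    static void my_msi_irq_bus_sync_unlock(struct irq_data *d)
    {
            struct my_msi_chip_data *cd = irq_data_get_irq_chip_data(d);

            /* Sleepable again: flush the pending update through the
             * command queue and wait for completion. */
            if (cd->affinity_pending) {
                    my_queue_affinity_cmd(d, &cd->pending_mask);
                    my_wait_affinity_cmd(d);
                    cd->affinity_pending = false;
            }
            mutex_unlock(&cd->buslock);
    }

    static struct irq_chip my_msi_irq_chip = {
            .name                = "my-msi",
            .irq_bus_lock        = my_msi_irq_bus_lock,
            .irq_bus_sync_unlock = my_msi_irq_bus_sync_unlock,
            .irq_set_affinity    = my_msi_irq_set_affinity,
    };

Such a chip only works once irq_set_affinity() uses the bus_lock/bus_sync_unlock pair instead of taking desc->lock directly, which is exactly what the hunk below changes.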
There are a few other callers than the proc interface, but none of them is affected by this change as none of them affects an irq chip with bus lock support. Reviewed-by: Tony Luck Signed-off-by: Thomas Gleixner Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/irq/manage.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 39cee1fea51e..255266df1aa1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -335,16 +335,16 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; unsigned long flags; int ret; + desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return -EINVAL; - raw_spin_lock_irqsave(&desc->lock, flags); ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force); - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_busunlock(desc, flags); return ret; } -- Gitee From 0063cf7b83a07afc34738c7cbd1deb6c961013b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Tue, 26 May 2020 21:39:05 +0000 Subject: [PATCH 1725/2161] PCI: Rename _DSM constants to align with spec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3910ebaca8eae0cb9d41a20efe1bcb375ec64dfb upstream. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename PCI-related _DSM constants to align them with the PCI Firmware Spec, r3.2, sec 4.6. No functional change intended. Link: https://lore.kernel.org/r/20200526213905.2479381-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/acpi/pci_root.c | 2 +- drivers/pci/pci-acpi.c | 4 ++-- drivers/pci/pci-label.c | 4 ++-- include/linux/pci-acpi.h | 10 ++++++---- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 57b7ba6d9ad1..ddc8baa34864 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -933,7 +933,7 @@ struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root, * assignments made by firmware for this host bridge. 
*/ obj = acpi_evaluate_dsm(ACPI_HANDLE(bus->bridge), &pci_acpi_dsm_guid, 1, - IGNORE_PCI_BOOT_CONFIG_DSM, NULL); + DSM_PCI_PRESERVE_BOOT_CONFIG, NULL); if (obj && obj->type == ACPI_TYPE_INTEGER && obj->integer.value == 0) host_bridge->preserve_config = 1; ACPI_FREE(obj); diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 6c6045939fdf..088ca501e319 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -1138,7 +1138,7 @@ void acpi_pci_add_bus(struct pci_bus *bus) return; obj = acpi_evaluate_dsm(ACPI_HANDLE(bus->bridge), &pci_acpi_dsm_guid, 3, - RESET_DELAY_DSM, NULL); + DSM_PCI_POWER_ON_RESET_DELAY, NULL); if (!obj) return; @@ -1203,7 +1203,7 @@ static void pci_acpi_optimize_delay(struct pci_dev *pdev, pdev->d3cold_delay = 0; obj = acpi_evaluate_dsm(handle, &pci_acpi_dsm_guid, 3, - FUNCTION_DELAY_DSM, NULL); + DSM_PCI_DEVICE_READINESS_DURATIONS, NULL); if (!obj) return; diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c index a5910f942857..707dd9808676 100644 --- a/drivers/pci/pci-label.c +++ b/drivers/pci/pci-label.c @@ -178,7 +178,7 @@ static int dsm_get_label(struct device *dev, char *buf, return -1; obj = acpi_evaluate_dsm(handle, &pci_acpi_dsm_guid, 0x2, - DEVICE_LABEL_DSM, NULL); + DSM_PCI_DEVICE_NAME, NULL); if (!obj) return -1; @@ -218,7 +218,7 @@ static bool device_has_dsm(struct device *dev) return false; return !!acpi_check_dsm(handle, &pci_acpi_dsm_guid, 0x2, - 1 << DEVICE_LABEL_DSM); + 1 << DSM_PCI_DEVICE_NAME); } static umode_t acpi_index_string_exist(struct kobject *kobj, diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 11c98875538a..769cb9c9fa28 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -107,10 +107,12 @@ static inline void acpiphp_check_host_bridge(struct acpi_device *adev) { } #endif extern const guid_t pci_acpi_dsm_guid; -#define IGNORE_PCI_BOOT_CONFIG_DSM 0x05 -#define DEVICE_LABEL_DSM 0x07 -#define RESET_DELAY_DSM 0x08 -#define FUNCTION_DELAY_DSM 0x09 + +/* _DSM Definitions for PCI */ +#define DSM_PCI_PRESERVE_BOOT_CONFIG 0x05 +#define DSM_PCI_DEVICE_NAME 0x07 +#define DSM_PCI_POWER_ON_RESET_DELAY 0x08 +#define DSM_PCI_DEVICE_READINESS_DURATIONS 0x09 #ifdef CONFIG_PCIE_EDR void pci_acpi_add_edr_notifier(struct pci_dev *pdev); -- Gitee From 04c6aedcfcf728f6869590a8f5928f53c9237b46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Tue, 27 Apr 2021 10:39:16 -0500 Subject: [PATCH 1726/2161] PCI/sysfs: Rename device_has_dsm() to device_has_acpi_name() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 1017275d2e43dba68527e0e69f4cc12d2b0f8966 upstream. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename device_has_dsm() to device_has_acpi_name() to better reflect its purpose and move it earlier so it's available for a future SMBIOS .is_visible() function. No functional change intended. 
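To make the intent concrete, a later SMBIOS .is_visible() hook of the kind this move prepares for could look roughly like the sketch below; it is purely illustrative (the example_* names are made up) and is not the code that was eventually merged:

    static umode_t example_smbios_attr_is_visible(struct kobject *kobj,
                                                  struct attribute *a, int n)
    {
            struct device *dev = kobj_to_dev(kobj);

            /* Hide the SMBIOS label attributes when ACPI already
             * provides a name for this device. */
            return device_has_acpi_name(dev) ? 0 : a->mode;
    }

    static struct attribute *example_smbios_attrs[] = {
            /* SMBIOS label/index attributes would be listed here */
            NULL,
    };

    static const struct attribute_group example_smbios_group = {
            .attrs      = example_smbios_attrs,
            .is_visible = example_smbios_attr_is_visible,
    };

Because device_has_acpi_name() now sits at the top of pci-label.c and compiles to a plain "return false" without CONFIG_ACPI, such a group callback can use it without additional #ifdefs.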
[bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20210416205856.3234481-6-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pci-label.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c index 707dd9808676..e16dd34d2095 100644 --- a/drivers/pci/pci-label.c +++ b/drivers/pci/pci-label.c @@ -33,6 +33,22 @@ #include #include "pci.h" +static bool device_has_acpi_name(struct device *dev) +{ +#ifdef CONFIG_ACPI + acpi_handle handle; + + handle = ACPI_HANDLE(dev); + if (!handle) + return false; + + return acpi_check_dsm(handle, &pci_acpi_dsm_guid, 0x2, + 1 << DSM_PCI_DEVICE_NAME); +#else + return false; +#endif +} + #ifdef CONFIG_DMI enum smbios_attr_enum { SMBIOS_ATTR_NONE = 0, @@ -209,18 +225,6 @@ static int dsm_get_label(struct device *dev, char *buf, return len; } -static bool device_has_dsm(struct device *dev) -{ - acpi_handle handle; - - handle = ACPI_HANDLE(dev); - if (!handle) - return false; - - return !!acpi_check_dsm(handle, &pci_acpi_dsm_guid, 0x2, - 1 << DSM_PCI_DEVICE_NAME); -} - static umode_t acpi_index_string_exist(struct kobject *kobj, struct attribute *attr, int n) { @@ -228,7 +232,7 @@ static umode_t acpi_index_string_exist(struct kobject *kobj, dev = kobj_to_dev(kobj); - if (device_has_dsm(dev)) + if (device_has_acpi_name(dev)) return S_IRUGO; return 0; @@ -287,16 +291,11 @@ static inline int pci_remove_acpi_index_label_files(struct pci_dev *pdev) { return -1; } - -static inline bool device_has_dsm(struct device *dev) -{ - return false; -} #endif void pci_create_firmware_label_files(struct pci_dev *pdev) { - if (device_has_dsm(&pdev->dev)) + if (device_has_acpi_name(&pdev->dev)) pci_create_acpi_index_label_files(pdev); else pci_create_smbiosname_file(pdev); @@ -304,7 +303,7 @@ void pci_create_firmware_label_files(struct pci_dev *pdev) void pci_remove_firmware_label_files(struct pci_dev *pdev) { - if (device_has_dsm(&pdev->dev)) + if (device_has_acpi_name(&pdev->dev)) pci_remove_acpi_index_label_files(pdev); else pci_remove_smbiosname_file(pdev); -- Gitee From 6cec30d9227f8ff6b62e686c8befb4e95fce225c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Tue, 27 Apr 2021 15:04:44 -0500 Subject: [PATCH 1727/2161] PCI/sysfs: Tidy SMBIOS & ACPI label attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 362fb766264a1d62254ad950304fa1d97172bb44 upstream. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update coding style to reduce distraction. No functional change intended. 
[bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20210416205856.3234481-6-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pci-label.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c index e16dd34d2095..64788f821c2f 100644 --- a/drivers/pci/pci-label.c +++ b/drivers/pci/pci-label.c @@ -36,9 +36,8 @@ static bool device_has_acpi_name(struct device *dev) { #ifdef CONFIG_ACPI - acpi_handle handle; + acpi_handle handle = ACPI_HANDLE(dev); - handle = ACPI_HANDLE(dev); if (!handle) return false; @@ -61,13 +60,9 @@ static size_t find_smbios_instance_string(struct pci_dev *pdev, char *buf, { const struct dmi_device *dmi; struct dmi_dev_onboard *donboard; - int domain_nr; - int bus; - int devfn; - - domain_nr = pci_domain_nr(pdev->bus); - bus = pdev->bus->number; - devfn = pdev->devfn; + int domain_nr = pci_domain_nr(pdev->bus); + int bus = pdev->bus->number; + int devfn = pdev->devfn; dmi = NULL; while ((dmi = dmi_find_device(DMI_DEV_TYPE_DEV_ONBOARD, @@ -185,11 +180,10 @@ static void dsm_label_utf16s_to_utf8s(union acpi_object *obj, char *buf) static int dsm_get_label(struct device *dev, char *buf, enum acpi_attr_enum attr) { - acpi_handle handle; + acpi_handle handle = ACPI_HANDLE(dev); union acpi_object *obj, *tmp; int len = -1; - handle = ACPI_HANDLE(dev); if (!handle) return -1; -- Gitee From 79fe9fcf04157f118e1d743db58e6107c2a1ec3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Thu, 3 Jun 2021 00:01:12 +0000 Subject: [PATCH 1728/2161] PCI/sysfs: Fix dsm_label_utf16s_to_utf8s() buffer overrun MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bdcdaa13ad96f1a530711c29e6d4b8311eff767c upstream. "utf16s_to_utf8s(..., buf, PAGE_SIZE)" puts up to PAGE_SIZE bytes into "buf" and returns the number of bytes it actually put there. If it wrote PAGE_SIZE bytes, the newline added by dsm_label_utf16s_to_utf8s() would overrun "buf". Reduce the size available for utf16s_to_utf8s() to use so there is always space for the newline. [bhelgaas: reorder patch in series, commit log] Fixes: 6058989bad05 ("PCI: Export ACPI _DSM provided firmware instance number and string name to sysfs") Link: https://lore.kernel.org/r/20210603000112.703037-7-kw@linux.com Reported-by: Joe Perches Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas Signed-off-by: Sasha Levin Signed-off-by: Xinghui Li --- drivers/pci/pci-label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c index 64788f821c2f..f1ae7cb3b441 100644 --- a/drivers/pci/pci-label.c +++ b/drivers/pci/pci-label.c @@ -173,7 +173,7 @@ static void dsm_label_utf16s_to_utf8s(union acpi_object *obj, char *buf) len = utf16s_to_utf8s((const wchar_t *)obj->buffer.pointer, obj->buffer.length, UTF16_LITTLE_ENDIAN, - buf, PAGE_SIZE); + buf, PAGE_SIZE - 1); buf[len] = '\n'; } -- Gitee From 8d6212d5793071db870e7980fcce966fa859a87f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Thu, 3 Jun 2021 00:01:08 +0000 Subject: [PATCH 1729/2161] PCI/sysfs: Rely on lengths from scnprintf(), dsm_label_utf16s_to_utf8s() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 316ae33051215f92c72fe13bc1bfc4e513a26700 upstream. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scnprintf() returns the number of bytes written into the buffer. Change dsm_label_utf16s_to_utf8s() to do the same. Rely on those values instead of using strlen() to compute the buffer length. No functional change intended. [bhelgaas: reorder patch in series, len++ to include newline added by dsm_label_utf16s_to_utf8s(), commit log] Link: https://lore.kernel.org/r/20210603000112.703037-3-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas Reviewed-by: Logan Gunthorpe Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pci-label.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c index f1ae7cb3b441..4057f719146d 100644 --- a/drivers/pci/pci-label.c +++ b/drivers/pci/pci-label.c @@ -167,14 +167,17 @@ enum acpi_attr_enum { ACPI_ATTR_INDEX_SHOW, }; -static void dsm_label_utf16s_to_utf8s(union acpi_object *obj, char *buf) +static int dsm_label_utf16s_to_utf8s(union acpi_object *obj, char *buf) { int len; + len = utf16s_to_utf8s((const wchar_t *)obj->buffer.pointer, obj->buffer.length, UTF16_LITTLE_ENDIAN, buf, PAGE_SIZE - 1); - buf[len] = '\n'; + buf[len++] = '\n'; + + return len; } static int dsm_get_label(struct device *dev, char *buf, @@ -182,7 +185,7 @@ static int dsm_get_label(struct device *dev, char *buf, { acpi_handle handle = ACPI_HANDLE(dev); union acpi_object *obj, *tmp; - int len = -1; + int len = 0; if (!handle) return -1; @@ -203,20 +206,19 @@ static int dsm_get_label(struct device *dev, char *buf, * this entry must return a null string. */ if (attr == ACPI_ATTR_INDEX_SHOW) { - scnprintf(buf, PAGE_SIZE, "%llu\n", tmp->integer.value); + len = scnprintf(buf, PAGE_SIZE, "%llu\n", tmp->integer.value); } else if (attr == ACPI_ATTR_LABEL_SHOW) { if (tmp[1].type == ACPI_TYPE_STRING) - scnprintf(buf, PAGE_SIZE, "%s\n", + len = scnprintf(buf, PAGE_SIZE, "%s\n", tmp[1].string.pointer); else if (tmp[1].type == ACPI_TYPE_BUFFER) - dsm_label_utf16s_to_utf8s(tmp + 1, buf); + len = dsm_label_utf16s_to_utf8s(tmp + 1, buf); } - len = strlen(buf) > 0 ? strlen(buf) : -1; } ACPI_FREE(obj); - return len; + return len > 0 ? len : -1; } static umode_t acpi_index_string_exist(struct kobject *kobj, -- Gitee From 9f42e8a19e24af003398100151ea317c432b4b23 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 22 Jun 2022 20:57:04 +0800 Subject: [PATCH 1730/2161] PCI/sysfs: Use sysfs_emit() and sysfs_emit_at() in "show" functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit ad025f8e46f3dbf09b1bf8d7a5b4ce858df74544 upstream. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysfs_emit() and sysfs_emit_at() functions were introduced to make it less ambiguous which function is preferred when writing to the output buffer in a device attribute's "show" callback [1]. Convert the PCI sysfs object "show" functions from sprintf(), snprintf() and scnprintf() to sysfs_emit() and sysfs_emit_at() accordingly, as the latter is aware of the PAGE_SIZE buffer and correctly returns the number of bytes written into the buffer. No functional change intended. [1] Documentation/filesystems/sysfs.rst Related commit: ad025f8e46f3 ("PCI/sysfs: Use sysfs_emit() and sysfs_emit_at() in "show" functions"). 
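As a minimal illustration of the conversion pattern (hypothetical attributes, not taken from the hunks below): sysfs_emit() replaces the single sprintf()/snprintf() call and bounds the output to PAGE_SIZE itself, while sysfs_emit_at() covers "show" callbacks that build the buffer in several steps and need a running offset:

    #include <linux/device.h>
    #include <linux/pci.h>
    #include <linux/sysfs.h>

    /* Single value */
    static ssize_t example_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
    {
            struct pci_dev *pdev = to_pci_dev(dev);

            return sysfs_emit(buf, "%u\n", pdev->devfn);
    }

    /* Several lines: sysfs_emit_at() appends at the given offset and
     * returns the number of bytes it added, so the accumulated length
     * is also the value to return. */
    static ssize_t example_list_show(struct device *dev,
                                     struct device_attribute *attr, char *buf)
    {
            size_t len = 0;
            int i;

            for (i = 0; i < 4; i++)
                    len += sysfs_emit_at(buf, len, "entry %d\n", i);

            return len;
    }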
Link: https://lore.kernel.org/r/20210603000112.703037-2-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas Reviewed-by: Logan Gunthorpe Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/hotplug/pci_hotplug_core.c | 8 +++--- drivers/pci/hotplug/rpadlpar_sysfs.c | 4 +-- drivers/pci/hotplug/shpchp_sysfs.c | 38 ++++++++++++++------------ drivers/pci/iov.c | 12 ++++---- drivers/pci/msi.c | 8 +++--- drivers/pci/p2pdma.c | 7 ++--- drivers/pci/pci-label.c | 6 ++-- drivers/pci/pci.c | 2 +- drivers/pci/pcie/aer.c | 20 ++++++++------ drivers/pci/pcie/aspm.c | 4 +-- drivers/pci/slot.c | 18 ++++++------ drivers/pci/switch/switchtec.c | 12 ++++---- 12 files changed, 72 insertions(+), 67 deletions(-) diff --git a/drivers/pci/hotplug/pci_hotplug_core.c b/drivers/pci/hotplug/pci_hotplug_core.c index 5ac31f683b85..058d5937d8a9 100644 --- a/drivers/pci/hotplug/pci_hotplug_core.c +++ b/drivers/pci/hotplug/pci_hotplug_core.c @@ -73,7 +73,7 @@ static ssize_t power_read_file(struct pci_slot *pci_slot, char *buf) if (retval) return retval; - return sprintf(buf, "%d\n", value); + return sysfs_emit(buf, "%d\n", value); } static ssize_t power_write_file(struct pci_slot *pci_slot, const char *buf, @@ -130,7 +130,7 @@ static ssize_t attention_read_file(struct pci_slot *pci_slot, char *buf) if (retval) return retval; - return sprintf(buf, "%d\n", value); + return sysfs_emit(buf, "%d\n", value); } static ssize_t attention_write_file(struct pci_slot *pci_slot, const char *buf, @@ -175,7 +175,7 @@ static ssize_t latch_read_file(struct pci_slot *pci_slot, char *buf) if (retval) return retval; - return sprintf(buf, "%d\n", value); + return sysfs_emit(buf, "%d\n", value); } static struct pci_slot_attribute hotplug_slot_attr_latch = { @@ -192,7 +192,7 @@ static ssize_t presence_read_file(struct pci_slot *pci_slot, char *buf) if (retval) return retval; - return sprintf(buf, "%d\n", value); + return sysfs_emit(buf, "%d\n", value); } static struct pci_slot_attribute hotplug_slot_attr_presence = { diff --git a/drivers/pci/hotplug/rpadlpar_sysfs.c b/drivers/pci/hotplug/rpadlpar_sysfs.c index dbfa0b55d31a..068b7810a574 100644 --- a/drivers/pci/hotplug/rpadlpar_sysfs.c +++ b/drivers/pci/hotplug/rpadlpar_sysfs.c @@ -50,7 +50,7 @@ static ssize_t add_slot_store(struct kobject *kobj, struct kobj_attribute *attr, static ssize_t add_slot_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "0\n"); + return sysfs_emit(buf, "0\n"); } static ssize_t remove_slot_store(struct kobject *kobj, @@ -80,7 +80,7 @@ static ssize_t remove_slot_store(struct kobject *kobj, static ssize_t remove_slot_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "0\n"); + return sysfs_emit(buf, "0\n"); } static struct kobj_attribute add_slot_attr = diff --git a/drivers/pci/hotplug/shpchp_sysfs.c b/drivers/pci/hotplug/shpchp_sysfs.c index 45658bb5c554..64beed7a26be 100644 --- a/drivers/pci/hotplug/shpchp_sysfs.c +++ b/drivers/pci/hotplug/shpchp_sysfs.c @@ -24,50 +24,54 @@ static ssize_t show_ctrl(struct device *dev, struct device_attribute *attr, char *buf) { struct pci_dev *pdev; - char *out = buf; int index, busnr; struct resource *res; struct pci_bus *bus; + size_t len = 0; pdev = to_pci_dev(dev); bus = pdev->subordinate; - out += sprintf(buf, "Free resources: memory\n"); + len += sysfs_emit_at(buf, len, "Free resources: memory\n"); pci_bus_for_each_resource(bus, res, index) { if (res && (res->flags & IORESOURCE_MEM) && !(res->flags & 
IORESOURCE_PREFETCH)) { - out += sprintf(out, "start = %8.8llx, length = %8.8llx\n", - (unsigned long long)res->start, - (unsigned long long)resource_size(res)); + len += sysfs_emit_at(buf, len, + "start = %8.8llx, length = %8.8llx\n", + (unsigned long long)res->start, + (unsigned long long)resource_size(res)); } } - out += sprintf(out, "Free resources: prefetchable memory\n"); + len += sysfs_emit_at(buf, len, "Free resources: prefetchable memory\n"); pci_bus_for_each_resource(bus, res, index) { if (res && (res->flags & IORESOURCE_MEM) && (res->flags & IORESOURCE_PREFETCH)) { - out += sprintf(out, "start = %8.8llx, length = %8.8llx\n", - (unsigned long long)res->start, - (unsigned long long)resource_size(res)); + len += sysfs_emit_at(buf, len, + "start = %8.8llx, length = %8.8llx\n", + (unsigned long long)res->start, + (unsigned long long)resource_size(res)); } } - out += sprintf(out, "Free resources: IO\n"); + len += sysfs_emit_at(buf, len, "Free resources: IO\n"); pci_bus_for_each_resource(bus, res, index) { if (res && (res->flags & IORESOURCE_IO)) { - out += sprintf(out, "start = %8.8llx, length = %8.8llx\n", - (unsigned long long)res->start, - (unsigned long long)resource_size(res)); + len += sysfs_emit_at(buf, len, + "start = %8.8llx, length = %8.8llx\n", + (unsigned long long)res->start, + (unsigned long long)resource_size(res)); } } - out += sprintf(out, "Free resources: bus numbers\n"); + len += sysfs_emit_at(buf, len, "Free resources: bus numbers\n"); for (busnr = bus->busn_res.start; busnr <= bus->busn_res.end; busnr++) { if (!pci_find_bus(pci_domain_nr(bus), busnr)) break; } if (busnr < bus->busn_res.end) - out += sprintf(out, "start = %8.8x, length = %8.8x\n", - busnr, (int)(bus->busn_res.end - busnr)); + len += sysfs_emit_at(buf, len, + "start = %8.8x, length = %8.8x\n", + busnr, (int)(bus->busn_res.end - busnr)); - return out - buf; + return len; } static DEVICE_ATTR(ctrl, S_IRUGO, show_ctrl, NULL); diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index b3b802bcefa0..e3436cccb6eb 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -246,7 +246,7 @@ static ssize_t sriov_totalvfs_show(struct device *dev, { struct pci_dev *pdev = to_pci_dev(dev); - return sprintf(buf, "%u\n", pci_sriov_get_totalvfs(pdev)); + return sysfs_emit(buf, "%u\n", pci_sriov_get_totalvfs(pdev)); } static ssize_t sriov_numvfs_show(struct device *dev, @@ -261,7 +261,7 @@ static ssize_t sriov_numvfs_show(struct device *dev, num_vfs = pdev->sriov->num_VFs; device_unlock(&pdev->dev); - return sprintf(buf, "%u\n", num_vfs); + return sysfs_emit(buf, "%u\n", num_vfs); } /* @@ -340,7 +340,7 @@ static ssize_t sriov_offset_show(struct device *dev, { struct pci_dev *pdev = to_pci_dev(dev); - return sprintf(buf, "%u\n", pdev->sriov->offset); + return sysfs_emit(buf, "%u\n", pdev->sriov->offset); } static ssize_t sriov_stride_show(struct device *dev, @@ -349,7 +349,7 @@ static ssize_t sriov_stride_show(struct device *dev, { struct pci_dev *pdev = to_pci_dev(dev); - return sprintf(buf, "%u\n", pdev->sriov->stride); + return sysfs_emit(buf, "%u\n", pdev->sriov->stride); } static ssize_t sriov_vf_device_show(struct device *dev, @@ -358,7 +358,7 @@ static ssize_t sriov_vf_device_show(struct device *dev, { struct pci_dev *pdev = to_pci_dev(dev); - return sprintf(buf, "%x\n", pdev->sriov->vf_device); + return sysfs_emit(buf, "%x\n", pdev->sriov->vf_device); } static ssize_t sriov_drivers_autoprobe_show(struct device *dev, @@ -367,7 +367,7 @@ static ssize_t sriov_drivers_autoprobe_show(struct device *dev, { struct 
pci_dev *pdev = to_pci_dev(dev); - return sprintf(buf, "%u\n", pdev->sriov->drivers_autoprobe); + return sysfs_emit(buf, "%u\n", pdev->sriov->drivers_autoprobe); } static ssize_t sriov_drivers_autoprobe_store(struct device *dev, diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 4fa71f1dcc0e..22802292702d 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -471,11 +471,11 @@ static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, return retval; entry = irq_get_msi_desc(irq); - if (entry) - return sprintf(buf, "%s\n", - entry->msi_attrib.is_msix ? "msix" : "msi"); + if (!entry) + return -ENODEV; - return -ENODEV; + return sysfs_emit(buf, "%s\n", + entry->msi_attrib.is_msix ? "msix" : "msi"); } static int populate_msi_sysfs(struct pci_dev *pdev) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 0608aae72ccc..338a46382069 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -53,7 +53,7 @@ static ssize_t size_show(struct device *dev, struct device_attribute *attr, if (pdev->p2pdma->pool) size = gen_pool_size(pdev->p2pdma->pool); - return snprintf(buf, PAGE_SIZE, "%zd\n", size); + return sysfs_emit(buf, "%zd\n", size); } static DEVICE_ATTR_RO(size); @@ -66,7 +66,7 @@ static ssize_t available_show(struct device *dev, struct device_attribute *attr, if (pdev->p2pdma->pool) avail = gen_pool_avail(pdev->p2pdma->pool); - return snprintf(buf, PAGE_SIZE, "%zd\n", avail); + return sysfs_emit(buf, "%zd\n", avail); } static DEVICE_ATTR_RO(available); @@ -75,8 +75,7 @@ static ssize_t published_show(struct device *dev, struct device_attribute *attr, { struct pci_dev *pdev = to_pci_dev(dev); - return snprintf(buf, PAGE_SIZE, "%d\n", - pdev->p2pdma->p2pmem_published); + return sysfs_emit(buf, "%d\n", pdev->p2pdma->p2pmem_published); } static DEVICE_ATTR_RO(published); diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c index 4057f719146d..8bc827d7d70e 100644 --- a/drivers/pci/pci-label.c +++ b/drivers/pci/pci-label.c @@ -206,11 +206,11 @@ static int dsm_get_label(struct device *dev, char *buf, * this entry must return a null string. 
*/ if (attr == ACPI_ATTR_INDEX_SHOW) { - len = scnprintf(buf, PAGE_SIZE, "%llu\n", tmp->integer.value); + len = sysfs_emit(buf, "%llu\n", tmp->integer.value); } else if (attr == ACPI_ATTR_LABEL_SHOW) { if (tmp[1].type == ACPI_TYPE_STRING) - len = scnprintf(buf, PAGE_SIZE, "%s\n", - tmp[1].string.pointer); + len = sysfs_emit(buf, "%s\n", + tmp[1].string.pointer); else if (tmp[1].type == ACPI_TYPE_BUFFER) len = dsm_label_utf16s_to_utf8s(tmp + 1, buf); } diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index be2540ff6641..bcf2dad2face 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -6321,7 +6321,7 @@ static ssize_t resource_alignment_show(struct bus_type *bus, char *buf) spin_lock(&resource_alignment_lock); if (resource_alignment_param) - count = snprintf(buf, PAGE_SIZE, "%s", resource_alignment_param); + count = sysfs_emit(buf, "%s", resource_alignment_param); spin_unlock(&resource_alignment_lock); /* diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index bac0e0403349..6325f8399d41 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -524,21 +524,23 @@ static const char *aer_agent_string[] = { char *buf) \ { \ unsigned int i; \ - char *str = buf; \ struct pci_dev *pdev = to_pci_dev(dev); \ u64 *stats = pdev->aer_stats->stats_array; \ + size_t len = 0; \ \ for (i = 0; i < ARRAY_SIZE(strings_array); i++) { \ if (strings_array[i]) \ - str += sprintf(str, "%s %llu\n", \ - strings_array[i], stats[i]); \ + len += sysfs_emit_at(buf, len, "%s %llu\n", \ + strings_array[i], \ + stats[i]); \ else if (stats[i]) \ - str += sprintf(str, #stats_array "_bit[%d] %llu\n",\ - i, stats[i]); \ + len += sysfs_emit_at(buf, len, \ + #stats_array "_bit[%d] %llu\n",\ + i, stats[i]); \ } \ - str += sprintf(str, "TOTAL_%s %llu\n", total_string, \ - pdev->aer_stats->total_field); \ - return str-buf; \ + len += sysfs_emit_at(buf, len, "TOTAL_%s %llu\n", total_string, \ + pdev->aer_stats->total_field); \ + return len; \ } \ static DEVICE_ATTR_RO(name) @@ -558,7 +560,7 @@ aer_stats_dev_attr(aer_dev_nonfatal, dev_nonfatal_errs, char *buf) \ { \ struct pci_dev *pdev = to_pci_dev(dev); \ - return sprintf(buf, "%llu\n", pdev->aer_stats->field); \ + return sysfs_emit(buf, "%llu\n", pdev->aer_stats->field); \ } \ static DEVICE_ATTR_RO(name) diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 7624c71011c6..5e47b98669df 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -1192,7 +1192,7 @@ static ssize_t link_state_show(struct device *dev, struct pci_dev *pci_device = to_pci_dev(dev); struct pcie_link_state *link_state = pci_device->link_state; - return sprintf(buf, "%d\n", link_state->aspm_enabled); + return sysfs_emit(buf, "%d\n", link_state->aspm_enabled); } static ssize_t link_state_store(struct device *dev, @@ -1231,7 +1231,7 @@ static ssize_t clk_ctl_show(struct device *dev, struct pci_dev *pci_device = to_pci_dev(dev); struct pcie_link_state *link_state = pci_device->link_state; - return sprintf(buf, "%d\n", link_state->clkpm_enabled); + return sysfs_emit(buf, "%d\n", link_state->clkpm_enabled); } static ssize_t clk_ctl_store(struct device *dev, diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c index 1e3ed6ec0a4a..794504e854ce 100644 --- a/drivers/pci/slot.c +++ b/drivers/pci/slot.c @@ -39,14 +39,14 @@ static const struct sysfs_ops pci_slot_sysfs_ops = { static ssize_t address_read_file(struct pci_slot *slot, char *buf) { if (slot->number == 0xff) - return sprintf(buf, "%04x:%02x\n", - pci_domain_nr(slot->bus), - slot->bus->number); - else - 
return sprintf(buf, "%04x:%02x:%02x\n", - pci_domain_nr(slot->bus), - slot->bus->number, - slot->number); + return sysfs_emit(buf, "%04x:%02x\n", + pci_domain_nr(slot->bus), + slot->bus->number); + + return sysfs_emit(buf, "%04x:%02x:%02x\n", + pci_domain_nr(slot->bus), + slot->bus->number, + slot->number); } /* these strings match up with the values in pci_bus_speed */ @@ -87,7 +87,7 @@ static ssize_t bus_speed_read(enum pci_bus_speed speed, char *buf) else speed_string = "Unknown"; - return sprintf(buf, "%s\n", speed_string); + return sysfs_emit(buf, "%s\n", speed_string); } static ssize_t max_speed_read_file(struct pci_slot *slot, char *buf) diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index 2c9c3061894b..39768a4dde23 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -278,7 +278,7 @@ static ssize_t device_version_show(struct device *dev, ver = ioread32(&stdev->mmio_sys_info->device_version); - return sprintf(buf, "%x\n", ver); + return sysfs_emit(buf, "%x\n", ver); } static DEVICE_ATTR_RO(device_version); @@ -290,7 +290,7 @@ static ssize_t fw_version_show(struct device *dev, ver = ioread32(&stdev->mmio_sys_info->firmware_version); - return sprintf(buf, "%08x\n", ver); + return sysfs_emit(buf, "%08x\n", ver); } static DEVICE_ATTR_RO(fw_version); @@ -334,7 +334,7 @@ static ssize_t component_id_show(struct device *dev, struct switchtec_dev *stdev = to_stdev(dev); int id = ioread16(&stdev->mmio_sys_info->component_id); - return sprintf(buf, "PM%04X\n", id); + return sysfs_emit(buf, "PM%04X\n", id); } static DEVICE_ATTR_RO(component_id); @@ -344,7 +344,7 @@ static ssize_t component_revision_show(struct device *dev, struct switchtec_dev *stdev = to_stdev(dev); int rev = ioread8(&stdev->mmio_sys_info->component_revision); - return sprintf(buf, "%d\n", rev); + return sysfs_emit(buf, "%d\n", rev); } static DEVICE_ATTR_RO(component_revision); @@ -353,7 +353,7 @@ static ssize_t partition_show(struct device *dev, { struct switchtec_dev *stdev = to_stdev(dev); - return sprintf(buf, "%d\n", stdev->partition); + return sysfs_emit(buf, "%d\n", stdev->partition); } static DEVICE_ATTR_RO(partition); @@ -362,7 +362,7 @@ static ssize_t partition_count_show(struct device *dev, { struct switchtec_dev *stdev = to_stdev(dev); - return sprintf(buf, "%d\n", stdev->partition_count); + return sysfs_emit(buf, "%d\n", stdev->partition_count); } static DEVICE_ATTR_RO(partition_count); -- Gitee From 7c5b40f77671b18deacd1c1c764de724b77c895a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:44 +0200 Subject: [PATCH 1731/2161] PCI/MSI: Do not set invalid bits in MSI mask commit 361fd37397f77578735907341579397d5bed0a2d upstream. msi_mask_irq() takes a mask and a flags argument. The mask argument is used to mask out bits from the cached mask and the flags argument to set bits. Some places invoke it with a flags argument which sets bits which are not used by the device, i.e. when the device supports up to 8 vectors a full unmask in some places sets the mask to 0xFFFFFF00. While devices probably do not care, it's still bad practice. 
Fixes: 7ba1930db02f ("PCI MSI: Unmask MSI if setup failed") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.568173099@linutronix.de Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 22802292702d..b214b80a6129 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -643,21 +643,21 @@ static int msi_capability_init(struct pci_dev *dev, int nvec, /* Configure MSI capability structure */ ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI); if (ret) { - msi_mask_irq(entry, mask, ~mask); + msi_mask_irq(entry, mask, 0); free_msi_irqs(dev); return ret; } ret = msi_verify_entries(dev); if (ret) { - msi_mask_irq(entry, mask, ~mask); + msi_mask_irq(entry, mask, 0); free_msi_irqs(dev); return ret; } ret = populate_msi_sysfs(dev); if (ret) { - msi_mask_irq(entry, mask, ~mask); + msi_mask_irq(entry, mask, 0); free_msi_irqs(dev); return ret; } @@ -931,7 +931,7 @@ static void pci_msi_shutdown(struct pci_dev *dev) /* Return the device with MSI unmasked as initial states */ mask = msi_mask(desc->msi_attrib.multi_cap); /* Keep cached state to be restored */ - __pci_msi_desc_mask_irq(desc, mask, ~mask); + __pci_msi_desc_mask_irq(desc, mask, 0); /* Restore dev->irq to its default pin-assertion IRQ */ dev->irq = desc->msi_attrib.default_irq; -- Gitee From 773789cb61b00420eeaae8eb7f4d8cabdd5389a3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:54 +0200 Subject: [PATCH 1732/2161] PCI/MSI: Consolidate error handling in msi_capability_init() commit 8eb5ce3f78a5e5d3f1a12248f6b7dc64ebf71da6 upstream. Three error exits doing exactly the same ask for a common error exit point. Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20210729222543.098828720@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index b214b80a6129..9893525fd342 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -642,25 +642,16 @@ static int msi_capability_init(struct pci_dev *dev, int nvec, /* Configure MSI capability structure */ ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI); - if (ret) { - msi_mask_irq(entry, mask, 0); - free_msi_irqs(dev); - return ret; - } + if (ret) + goto err; ret = msi_verify_entries(dev); - if (ret) { - msi_mask_irq(entry, mask, 0); - free_msi_irqs(dev); - return ret; - } + if (ret) + goto err; ret = populate_msi_sysfs(dev); - if (ret) { - msi_mask_irq(entry, mask, 0); - free_msi_irqs(dev); - return ret; - } + if (ret) + goto err; /* Set MSI enabled bits */ pci_intx_for_msi(dev, 0); @@ -670,6 +661,11 @@ static int msi_capability_init(struct pci_dev *dev, int nvec, pcibios_free_irq(dev); dev->irq = entry->irq; return 0; + +err: + msi_mask_irq(entry, mask, 0); + free_msi_irqs(dev); + return ret; } static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries) -- Gitee From 839825480d97be1a895db813db18edba07763332 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Fri, 13 Aug 2021 15:56:27 +1200 Subject: [PATCH 1733/2161] genirq/msi: Move MSI sysfs handling from PCI to MSI core commit 2f170814bdd26289e9daaa4ae359290f854e5dcf upstream. 
Move PCI's MSI sysfs code to the irq core so that other busses such as platform can reuse it. Signed-off-by: Barry Song Signed-off-by: Thomas Gleixner Acked-by: Greg Kroah-Hartman Acked-by: Bjorn Helgaas Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20210813035628.6844-2-21cnbao@gmail.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 125 ++++------------------------------------- include/linux/msi.h | 4 ++ kernel/irq/msi.c | 134 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+), 115 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 9893525fd342..d1292b070f53 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -366,9 +366,7 @@ static void free_msi_irqs(struct pci_dev *dev) { struct list_head *msi_list = dev_to_msi_list(&dev->dev); struct msi_desc *entry, *tmp; - struct attribute **msi_attrs; - struct device_attribute *dev_attr; - int i, count = 0; + int i; for_each_pci_msi_entry(entry, dev) if (entry->irq) @@ -388,18 +386,7 @@ static void free_msi_irqs(struct pci_dev *dev) } if (dev->msi_irq_groups) { - sysfs_remove_groups(&dev->dev.kobj, dev->msi_irq_groups); - msi_attrs = dev->msi_irq_groups[0]->attrs; - while (msi_attrs[count]) { - dev_attr = container_of(msi_attrs[count], - struct device_attribute, attr); - kfree(dev_attr->attr.name); - kfree(dev_attr); - ++count; - } - kfree(msi_attrs); - kfree(dev->msi_irq_groups[0]); - kfree(dev->msi_irq_groups); + msi_destroy_sysfs(&dev->dev, dev->msi_irq_groups); dev->msi_irq_groups = NULL; } } @@ -459,102 +446,6 @@ void pci_restore_msi_state(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_restore_msi_state); -static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct msi_desc *entry; - unsigned long irq; - int retval; - - retval = kstrtoul(attr->attr.name, 10, &irq); - if (retval) - return retval; - - entry = irq_get_msi_desc(irq); - if (!entry) - return -ENODEV; - - return sysfs_emit(buf, "%s\n", - entry->msi_attrib.is_msix ? 
"msix" : "msi"); -} - -static int populate_msi_sysfs(struct pci_dev *pdev) -{ - struct attribute **msi_attrs; - struct attribute *msi_attr; - struct device_attribute *msi_dev_attr; - struct attribute_group *msi_irq_group; - const struct attribute_group **msi_irq_groups; - struct msi_desc *entry; - int ret = -ENOMEM; - int num_msi = 0; - int count = 0; - int i; - - /* Determine how many msi entries we have */ - for_each_pci_msi_entry(entry, pdev) - num_msi += entry->nvec_used; - if (!num_msi) - return 0; - - /* Dynamically create the MSI attributes for the PCI device */ - msi_attrs = kcalloc(num_msi + 1, sizeof(void *), GFP_KERNEL); - if (!msi_attrs) - return -ENOMEM; - for_each_pci_msi_entry(entry, pdev) { - for (i = 0; i < entry->nvec_used; i++) { - msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL); - if (!msi_dev_attr) - goto error_attrs; - msi_attrs[count] = &msi_dev_attr->attr; - - sysfs_attr_init(&msi_dev_attr->attr); - msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d", - entry->irq + i); - if (!msi_dev_attr->attr.name) - goto error_attrs; - msi_dev_attr->attr.mode = S_IRUGO; - msi_dev_attr->show = msi_mode_show; - ++count; - } - } - - msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL); - if (!msi_irq_group) - goto error_attrs; - msi_irq_group->name = "msi_irqs"; - msi_irq_group->attrs = msi_attrs; - - msi_irq_groups = kcalloc(2, sizeof(void *), GFP_KERNEL); - if (!msi_irq_groups) - goto error_irq_group; - msi_irq_groups[0] = msi_irq_group; - - ret = sysfs_create_groups(&pdev->dev.kobj, msi_irq_groups); - if (ret) - goto error_irq_groups; - pdev->msi_irq_groups = msi_irq_groups; - - return 0; - -error_irq_groups: - kfree(msi_irq_groups); -error_irq_group: - kfree(msi_irq_group); -error_attrs: - count = 0; - msi_attr = msi_attrs[count]; - while (msi_attr) { - msi_dev_attr = container_of(msi_attr, struct device_attribute, attr); - kfree(msi_attr->name); - kfree(msi_dev_attr); - ++count; - msi_attr = msi_attrs[count]; - } - kfree(msi_attrs); - return ret; -} - static struct msi_desc * msi_setup_entry(struct pci_dev *dev, int nvec, struct irq_affinity *affd) { @@ -649,9 +540,11 @@ static int msi_capability_init(struct pci_dev *dev, int nvec, if (ret) goto err; - ret = populate_msi_sysfs(dev); - if (ret) + dev->msi_irq_groups = msi_populate_sysfs(&dev->dev); + if (IS_ERR(dev->msi_irq_groups)) { + ret = PTR_ERR(dev->msi_irq_groups); goto err; + } /* Set MSI enabled bits */ pci_intx_for_msi(dev, 0); @@ -807,9 +700,11 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, msix_program_entries(dev, entries); - ret = populate_msi_sysfs(dev); - if (ret) + dev->msi_irq_groups = msi_populate_sysfs(&dev->dev); + if (IS_ERR(dev->msi_irq_groups)) { + ret = PTR_ERR(dev->msi_irq_groups); goto out_free; + } /* Set MSI-X enabled bits and unmask the function */ pci_intx_for_msi(dev, 0); diff --git a/include/linux/msi.h b/include/linux/msi.h index 4b58edfd6dbe..a22fde7bbfae 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -237,6 +237,10 @@ u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag); void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data); +const struct attribute_group **msi_populate_sysfs(struct device *dev); +void msi_destroy_sysfs(struct device *dev, + const struct attribute_group **msi_irq_groups); + /* * The arch hooks to setup up msi irqs. 
Default functions are implemented * as weak symbols so that they /can/ be overriden by architecture specific diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 5396232895b0..0b732c0d93e9 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internals.h" @@ -69,6 +70,139 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) } EXPORT_SYMBOL_GPL(get_cached_msi_msg); +static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct msi_desc *entry; + bool is_msix = false; + unsigned long irq; + int retval; + + retval = kstrtoul(attr->attr.name, 10, &irq); + if (retval) + return retval; + + entry = irq_get_msi_desc(irq); + if (!entry) + return -ENODEV; + + if (dev_is_pci(dev)) + is_msix = entry->msi_attrib.is_msix; + + return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi"); +} + +/** + * msi_populate_sysfs - Populate msi_irqs sysfs entries for devices + * @dev: The device(PCI, platform etc) who will get sysfs entries + * + * Return attribute_group ** so that specific bus MSI can save it to + * somewhere during initilizing msi irqs. If devices has no MSI irq, + * return NULL; if it fails to populate sysfs, return ERR_PTR + */ +const struct attribute_group **msi_populate_sysfs(struct device *dev) +{ + const struct attribute_group **msi_irq_groups; + struct attribute **msi_attrs, *msi_attr; + struct device_attribute *msi_dev_attr; + struct attribute_group *msi_irq_group; + struct msi_desc *entry; + int ret = -ENOMEM; + int num_msi = 0; + int count = 0; + int i; + + /* Determine how many msi entries we have */ + for_each_msi_entry(entry, dev) + num_msi += entry->nvec_used; + if (!num_msi) + return NULL; + + /* Dynamically create the MSI attributes for the device */ + msi_attrs = kcalloc(num_msi + 1, sizeof(void *), GFP_KERNEL); + if (!msi_attrs) + return ERR_PTR(-ENOMEM); + + for_each_msi_entry(entry, dev) { + for (i = 0; i < entry->nvec_used; i++) { + msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL); + if (!msi_dev_attr) + goto error_attrs; + msi_attrs[count] = &msi_dev_attr->attr; + + sysfs_attr_init(&msi_dev_attr->attr); + msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d", + entry->irq + i); + if (!msi_dev_attr->attr.name) + goto error_attrs; + msi_dev_attr->attr.mode = 0444; + msi_dev_attr->show = msi_mode_show; + ++count; + } + } + + msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL); + if (!msi_irq_group) + goto error_attrs; + msi_irq_group->name = "msi_irqs"; + msi_irq_group->attrs = msi_attrs; + + msi_irq_groups = kcalloc(2, sizeof(void *), GFP_KERNEL); + if (!msi_irq_groups) + goto error_irq_group; + msi_irq_groups[0] = msi_irq_group; + + ret = sysfs_create_groups(&dev->kobj, msi_irq_groups); + if (ret) + goto error_irq_groups; + + return msi_irq_groups; + +error_irq_groups: + kfree(msi_irq_groups); +error_irq_group: + kfree(msi_irq_group); +error_attrs: + count = 0; + msi_attr = msi_attrs[count]; + while (msi_attr) { + msi_dev_attr = container_of(msi_attr, struct device_attribute, attr); + kfree(msi_attr->name); + kfree(msi_dev_attr); + ++count; + msi_attr = msi_attrs[count]; + } + kfree(msi_attrs); + return ERR_PTR(ret); +} + +/** + * msi_destroy_sysfs - Destroy msi_irqs sysfs entries for devices + * @dev: The device(PCI, platform etc) who will remove sysfs entries + * @msi_irq_groups: attribute_group for device msi_irqs entries + */ +void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups) +{ + struct 
device_attribute *dev_attr; + struct attribute **msi_attrs; + int count = 0; + + if (msi_irq_groups) { + sysfs_remove_groups(&dev->kobj, msi_irq_groups); + msi_attrs = msi_irq_groups[0]->attrs; + while (msi_attrs[count]) { + dev_attr = container_of(msi_attrs[count], + struct device_attribute, attr); + kfree(dev_attr->attr.name); + kfree(dev_attr); + ++count; + } + kfree(msi_attrs); + kfree(msi_irq_groups[0]); + kfree(msi_irq_groups); + } +} + #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN static inline void irq_chip_write_msi_msg(struct irq_data *data, struct msi_msg *msg) -- Gitee From e7248a0b851bbef4fc0d55daa98048a97c57375f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:40 +0200 Subject: [PATCH 1734/2161] PCI/MSI: Enable and mask MSI-X early commit 438553958ba19296663c6d6583d208dfb6792830 upstream. The ordering of MSI-X enable in hardware is dysfunctional: 1) MSI-X is disabled in the control register 2) Various setup functions 3) pci_msi_setup_msi_irqs() is invoked which ends up accessing the MSI-X table entries 4) MSI-X is enabled and masked in the control register with the comment that enabling is required for some hardware to access the MSI-X table Step #4 obviously contradicts #3. The history of this is an issue with the NIU hardware. When #4 was introduced the table access actually happened in msix_program_entries() which was invoked after enabling and masking MSI-X. This was changed in commit d71d6432e105 ("PCI/MSI: Kill redundant call of irq_set_msi_desc() for MSI-X interrupts") which removed the table write from msix_program_entries(). Interestingly enough nobody noticed and either NIU still works or it did not get any testing with a kernel 3.19 or later. Nevertheless this is inconsistent and there is no reason why MSI-X can't be enabled and masked in the control register early on, i.e. move step #4 above to step #1. This preserves the NIU workaround and has no side effects on other hardware. Fixes: d71d6432e105 ("PCI/MSI: Kill redundant call of irq_set_msi_desc() for MSI-X interrupts") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Ashok Raj Reviewed-by: Marc Zyngier Acked-by: Bjorn Helgaas Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.344136412@linutronix.de Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index d1292b070f53..98abe8780f8c 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -668,18 +668,25 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, u16 control; void __iomem *base; - /* Ensure MSI-X is disabled while it is set up */ - pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); + /* + * Some devices require MSI-X to be enabled before the MSI-X + * registers can be accessed. Mask all the vectors to prevent + * interrupts coming in before they're fully set up. 
+ */ + pci_msix_clear_and_set_ctrl(dev, 0, PCI_MSIX_FLAGS_MASKALL | + PCI_MSIX_FLAGS_ENABLE); pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control); /* Request & Map MSI-X table region */ base = msix_map_region(dev, msix_table_size(control)); - if (!base) - return -ENOMEM; + if (!base) { + ret = -ENOMEM; + goto out_disable; + } ret = msix_setup_entries(dev, base, entries, nvec, affd); if (ret) - return ret; + goto out_disable; ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSIX); if (ret) @@ -690,14 +697,6 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, if (ret) goto out_free; - /* - * Some devices require MSI-X to be enabled before we can touch the - * MSI-X registers. We need to mask all the vectors to prevent - * interrupts coming in before they're fully set up. - */ - pci_msix_clear_and_set_ctrl(dev, 0, - PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE); - msix_program_entries(dev, entries); dev->msi_irq_groups = msi_populate_sysfs(&dev->dev); @@ -734,6 +733,9 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, out_free: free_msi_irqs(dev); +out_disable: + pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); + return ret; } -- Gitee From d5d8d838a011c3b27ff29fbdb26f0e8b076dfbec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:41 +0200 Subject: [PATCH 1735/2161] PCI/MSI: Mask all unused MSI-X entries commit 7d5ec3d3612396dc6d4b76366d20ab9fc06f399f upstream. When MSI-X is enabled the ordering of calls is: msix_map_region(); msix_setup_entries(); pci_msi_setup_msi_irqs(); msix_program_entries(); This has a few interesting issues: 1) msix_setup_entries() allocates the MSI descriptors and initializes them except for the msi_desc:masked member which is left zero initialized. 2) pci_msi_setup_msi_irqs() allocates the interrupt descriptors and sets up the MSI interrupts which ends up in pci_write_msi_msg() unless the interrupt chip provides its own irq_write_msi_msg() function. 3) msix_program_entries() does not do what the name suggests. It solely updates the entries array (if not NULL) and initializes the masked member for each MSI descriptor by reading the hardware state and then masks the entry. Obviously this has some issues: 1) The uninitialized masked member of msi_desc prevents the enforcement of masking the entry in pci_write_msi_msg() depending on the cached masked bit. Aside of that half initialized data is a NONO in general 2) msix_program_entries() only ensures that the actually allocated entries are masked. This is wrong as experimentation with crash testing and crash kernel kexec has shown. This limited testing unearthed that when the production kernel had more entries in use and unmasked when it crashed and the crash kernel allocated a smaller amount of entries, then a full scan of all entries found unmasked entries which were in use in the production kernel. This is obviously a device or emulation issue as the device reset should mask all MSI-X table entries, but obviously that's just part of the paper specification. Cure this by: 1) Masking all table entries in hardware 2) Initializing msi_desc::masked in msix_setup_entries() 3) Removing the mask dance in msix_program_entries() 4) Renaming msix_program_entries() to msix_update_entries() to reflect the purpose of that function. 
As the masking of unused entries has never been done the Fixes tag refers to a commit in: git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Acked-by: Bjorn Helgaas Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.403833459@linutronix.de Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 45 +++++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 98abe8780f8c..db5c91e6a157 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -587,6 +587,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, { struct irq_affinity_desc *curmsk, *masks = NULL; struct msi_desc *entry; + void __iomem *addr; int ret, i; int vec_count = pci_msix_vec_count(dev); @@ -607,6 +608,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry->msi_attrib.is_msix = 1; entry->msi_attrib.is_64 = 1; + if (entries) entry->msi_attrib.entry_nr = entries[i].entry; else @@ -618,6 +620,10 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry->msi_attrib.default_irq = dev->irq; entry->mask_base = base; + addr = pci_msix_desc_addr(entry); + if (addr) + entry->masked = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL); + list_add_tail(&entry->list, dev_to_msi_list(&dev->dev)); if (masks) curmsk++; @@ -628,26 +634,25 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, return ret; } -static void msix_program_entries(struct pci_dev *dev, - struct msix_entry *entries) +static void msix_update_entries(struct pci_dev *dev, struct msix_entry *entries) { struct msi_desc *entry; - int i = 0; - void __iomem *desc_addr; for_each_pci_msi_entry(entry, dev) { - if (entries) - entries[i++].vector = entry->irq; + if (entries) { + entries->vector = entry->irq; + entries++; + } + } +} - desc_addr = pci_msix_desc_addr(entry); - if (desc_addr) - entry->masked = readl(desc_addr + - PCI_MSIX_ENTRY_VECTOR_CTRL); - else - entry->masked = 0; +static void msix_mask_all(void __iomem *base, int tsize) +{ + u32 ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT; + int i; - msix_mask_irq(entry, 1); - } + for (i = 0; i < tsize; i++, base += PCI_MSIX_ENTRY_SIZE) + writel(ctrl, base + PCI_MSIX_ENTRY_VECTOR_CTRL); } /** @@ -664,9 +669,9 @@ static void msix_program_entries(struct pci_dev *dev, static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, int nvec, struct irq_affinity *affd) { - int ret; - u16 control; void __iomem *base; + int ret, tsize; + u16 control; /* * Some devices require MSI-X to be enabled before the MSI-X @@ -678,12 +683,16 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control); /* Request & Map MSI-X table region */ - base = msix_map_region(dev, msix_table_size(control)); + tsize = msix_table_size(control); + base = msix_map_region(dev, tsize); if (!base) { ret = -ENOMEM; goto out_disable; } + /* Ensure that all table entries are masked. 
*/ + msix_mask_all(base, tsize); + ret = msix_setup_entries(dev, base, entries, nvec, affd); if (ret) goto out_disable; @@ -697,7 +706,7 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, if (ret) goto out_free; - msix_program_entries(dev, entries); + msix_update_entries(dev, entries); dev->msi_irq_groups = msi_populate_sysfs(&dev->dev); if (IS_ERR(dev->msi_irq_groups)) { -- Gitee From ae14ca6a64c9f55c748b921b4f55cc69519ec9aa Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Tue, 12 Oct 2021 15:15:56 +0800 Subject: [PATCH 1736/2161] PCI/MSI: Handle msi_populate_sysfs() errors correctly commit 2b94b6b79b7c24092a6169db9e83c4565be0db42 upstream. Previously, when msi_populate_sysfs() failed, we saved the error return value as dev->msi_irq_groups, which leads to a page fault when free_msi_irqs() calls msi_destroy_sysfs(). To prevent this, leave dev->msi_irq_groups alone when msi_populate_sysfs() fails. Found by the Hulk Robot when injecting a memory allocation fault in msi_populate_sysfs(): BUG: unable to handle page fault for address: fffffffffffffff4 ... Call Trace: msi_destroy_sysfs+0x30/0xa0 free_msi_irqs+0x11d/0x1b0 Fixes: 2f170814bdd2 ("genirq/msi: Move MSI sysfs handling from PCI to MSI core") Link: https://lore.kernel.org/r/20211012071556.939137-1-wanghai38@huawei.com Reported-by: Hulk Robot Signed-off-by: Wang Hai Signed-off-by: Bjorn Helgaas Acked-by: Barry Song Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index db5c91e6a157..190946b7b285 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -515,6 +515,7 @@ static int msi_verify_entries(struct pci_dev *dev) static int msi_capability_init(struct pci_dev *dev, int nvec, struct irq_affinity *affd) { + const struct attribute_group **groups; struct msi_desc *entry; int ret; unsigned mask; @@ -540,12 +541,14 @@ static int msi_capability_init(struct pci_dev *dev, int nvec, if (ret) goto err; - dev->msi_irq_groups = msi_populate_sysfs(&dev->dev); - if (IS_ERR(dev->msi_irq_groups)) { - ret = PTR_ERR(dev->msi_irq_groups); + groups = msi_populate_sysfs(&dev->dev); + if (IS_ERR(groups)) { + ret = PTR_ERR(groups); goto err; } + dev->msi_irq_groups = groups; + /* Set MSI enabled bits */ pci_intx_for_msi(dev, 0); pci_msi_set_enable(dev, 1); @@ -669,6 +672,7 @@ static void msix_mask_all(void __iomem *base, int tsize) static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, int nvec, struct irq_affinity *affd) { + const struct attribute_group **groups; void __iomem *base; int ret, tsize; u16 control; @@ -708,12 +712,14 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, msix_update_entries(dev, entries); - dev->msi_irq_groups = msi_populate_sysfs(&dev->dev); - if (IS_ERR(dev->msi_irq_groups)) { - ret = PTR_ERR(dev->msi_irq_groups); + groups = msi_populate_sysfs(&dev->dev); + if (IS_ERR(groups)) { + ret = PTR_ERR(groups); goto out_free; } + dev->msi_irq_groups = groups; + /* Set MSI-X enabled bits and unmask the function */ pci_intx_for_msi(dev, 0); dev->msix_enabled = 1; -- Gitee From 2864663b7eab8974f70774947ca052fbbe89feb3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Aug 2020 13:29:56 +0200 Subject: [PATCH 1737/2161] genirq/msi: Provide and use msi_domain_set_default_info_flags() commit d4a4db1e8108a2a78183490484bc9adda867cc0f Intel-BKC. 
MSI interrupts have some common flags which should be set not only for PCI/MSI interrupts. Move the PCI/MSI flag setting into a common function so it can be reused. Reviewed-by: Tony Luck Signed-off-by: Thomas Gleixner Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 7 +------ include/linux/msi.h | 1 + kernel/irq/msi.c | 24 ++++++++++++++++++++++++ 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 190946b7b285..d3c1cd3dd718 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1397,12 +1397,7 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) pci_msi_domain_update_chip_ops(info); - info->flags |= MSI_FLAG_ACTIVATE_EARLY; - if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) - info->flags |= MSI_FLAG_MUST_REACTIVATE; - - /* PCI-MSI is oneshot-safe */ - info->chip->flags |= IRQCHIP_ONESHOT_SAFE; + msi_domain_set_default_info_flags(info); domain = msi_create_irq_domain(fwnode, info, parent); if (!domain) diff --git a/include/linux/msi.h b/include/linux/msi.h index a22fde7bbfae..ffcbdba035e8 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -431,6 +431,7 @@ int platform_msi_domain_alloc(struct irq_domain *domain, unsigned int virq, void platform_msi_domain_free(struct irq_domain *domain, unsigned int virq, unsigned int nvec); void *platform_msi_get_host_data(struct irq_domain *domain); +void msi_domain_set_default_info_flags(struct msi_domain_info *info); #endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 0b732c0d93e9..c2543cf88358 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -204,6 +204,30 @@ void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_ir } #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN +void msi_domain_set_default_info_flags(struct msi_domain_info *info) +{ + /* Required so that a device latches a valid MSI message on startup */ + info->flags |= MSI_FLAG_ACTIVATE_EARLY; + + /* + * Interrupt reservation mode allows to stear the MSI message of an + * inactive device to a special (usually spurious interrupt) target. + * This allows to prevent interrupt vector exhaustion e.g. on x86. + * But (PCI)MSI interrupts are activated early - see above - so the + * interrupt request/startup sequence would not try to allocate a + * usable vector which means that the device interrupts would end + * up on the special vector and issue spurious interrupt messages. + * Setting the reactivation flag ensures that when the interrupt + * is requested the activation is invoked again so that a real + * vector can be allocated. + */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) + info->flags |= MSI_FLAG_MUST_REACTIVATE; + + /* MSI is oneshot-safe at least in theory */ + info->chip->flags |= IRQCHIP_ONESHOT_SAFE; +} + static inline void irq_chip_write_msi_msg(struct irq_data *data, struct msi_msg *msg) { -- Gitee From ea94362ea0b306c57a7b8b6679209c70e07ea678 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 21 Aug 2020 02:25:00 +0200 Subject: [PATCH 1738/2161] platform-msi: Add device MSI infrastructure commit 2f4841d50818106b263e60c4fbb6ead6bd770634 Intel-BKC. Add device specific MSI domain infrastructure for devices which have their own resource management and interrupt chip. 
These devices are not related to PCI and contrary to platform MSI they do not share a common resource and interrupt chip. They provide their own domain specific resource management and interrupt chip. This utilizes the new alloc/free override in a non evil way which avoids having yet another set of specialized alloc/free functions. Just using msi_domain_alloc/free_irqs() is sufficient While initially it was suggested and tried to piggyback device MSI on platform MSI, the better variant is to reimplement platform MSI on top of device MSI. Reviewed-by: Tony Luck Signed-off-by: Thomas Gleixner Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/base/platform-msi.c | 131 ++++++++++++++++++++++++++++++++++++ include/linux/irqdomain.h | 1 + include/linux/msi.h | 24 +++++++ kernel/irq/Kconfig | 4 ++ 4 files changed, 160 insertions(+) diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c index 4f68251d1932..6aafd9c43c3a 100644 --- a/drivers/base/platform-msi.c +++ b/drivers/base/platform-msi.c @@ -412,3 +412,134 @@ int platform_msi_domain_alloc(struct irq_domain *domain, unsigned int virq, return err; } + +#ifdef CONFIG_DEVICE_MSI +/* + * Device specific MSI domain infrastructure for devices which have their + * own resource management and interrupt chip. These devices are not + * related to PCI and contrary to platform MSI they do not share a common + * resource and interrupt chip. They provide their own domain specific + * resource management and interrupt chip. + */ + +static void device_msi_free_msi_entries(struct device *dev) +{ + struct list_head *msi_list = dev_to_msi_list(dev); + struct msi_desc *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, msi_list, list) { + list_del(&entry->list); + free_msi_entry(entry); + } +} + +/** + * device_msi_free_irqs - Free MSI interrupts assigned to a device + * @dev: Pointer to the device + * + * Frees the interrupt and the MSI descriptors. + */ +static void device_msi_free_irqs(struct irq_domain *domain, struct device *dev) +{ + __msi_domain_free_irqs(domain, dev); + device_msi_free_msi_entries(dev); +} + +/** + * device_msi_alloc_irqs - Allocate MSI interrupts for a device + * @dev: Pointer to the device + * @nvec: Number of vectors + * + * Allocates the required number of MSI descriptors and the corresponding + * interrupt descriptors. 
+ */ +static int device_msi_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec) +{ + int i, ret = -ENOMEM; + + for (i = 0; i < nvec; i++) { + struct msi_desc *entry = alloc_msi_entry(dev, 1, NULL); + + if (!entry) + goto fail; + list_add_tail(&entry->list, dev_to_msi_list(dev)); + } + + ret = __msi_domain_alloc_irqs(domain, dev, nvec); + if (!ret) + return 0; +fail: + device_msi_free_msi_entries(dev); + return ret; +} + +static void device_msi_update_dom_ops(struct msi_domain_info *info) +{ + if (!info->ops->domain_alloc_irqs) + info->ops->domain_alloc_irqs = device_msi_alloc_irqs; + if (!info->ops->domain_free_irqs) + info->ops->domain_free_irqs = device_msi_free_irqs; + if (!info->ops->msi_prepare) + info->ops->msi_prepare = arch_msi_prepare; +} + +/** + * device_msi_create_msi_irq_domain - Create an irq domain for devices + * @fwnode: Firmware node of the interrupt controller + * @info: MSI domain info to configure the new domain + * @parent: Parent domain + */ +struct irq_domain *device_msi_create_irq_domain(struct fwnode_handle *fn, + struct msi_domain_info *info, + struct irq_domain *parent) +{ + struct irq_domain *domain; + + if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) + platform_msi_update_chip_ops(info); + + if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) + device_msi_update_dom_ops(info); + + msi_domain_set_default_info_flags(info); + + domain = msi_create_irq_domain(fn, info, parent); + if (domain) + irq_domain_update_bus_token(domain, DOMAIN_BUS_DEVICE_MSI); + return domain; +} + +#ifdef CONFIG_PCI +#include + +/** + * pci_subdevice_msi_create_irq_domain - Create an irq domain for subdevices + * @pdev: Pointer to PCI device for which the subdevice domain is created + * @info: MSI domain info to configure the new domain + */ +struct irq_domain *pci_subdevice_msi_create_irq_domain(struct pci_dev *pdev, + struct msi_domain_info *info) +{ + struct irq_domain *domain, *pdev_msi; + struct fwnode_handle *fn; + + /* + * Retrieve the MSI domain of the underlying PCI device's MSI + * domain. The PCI device domain's parent domain is also the parent + * domain of the new subdevice domain. 
+ */ + pdev_msi = dev_get_msi_domain(&pdev->dev); + if (!pdev_msi) + return NULL; + + fn = irq_domain_alloc_named_fwnode(dev_name(&pdev->dev)); + if (!fn) + return NULL; + domain = device_msi_create_irq_domain(fn, info, pdev_msi->parent); + if (!domain) + irq_domain_free_fwnode(fn); + return domain; +} +EXPORT_SYMBOL_GPL(pci_subdevice_msi_create_irq_domain); +#endif /* CONFIG_PCI */ +#endif /* CONFIG_DEVICE_MSI */ diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index a7619bb43c33..20fb384aef46 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -85,6 +85,7 @@ enum irq_domain_bus_token { DOMAIN_BUS_FSL_MC_MSI, DOMAIN_BUS_TI_SCI_INTA_MSI, DOMAIN_BUS_VMD_MSI, + DOMAIN_BUS_DEVICE_MSI, }; /** diff --git a/include/linux/msi.h b/include/linux/msi.h index ffcbdba035e8..b6cd408fc2e4 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -94,6 +94,18 @@ struct ti_sci_inta_msi_desc { u16 dev_index; }; +/** + * device_msi_desc - Device MSI specific MSI descriptor data + * @priv: Pointer to device specific private data + * @priv_iomem: Pointer to device specific private io memory + * @hwirq: The hardware irq number in the device domain + */ +struct device_msi_desc { + void *priv; + void __iomem *priv_iomem; + u16 hwirq; +}; + /** * struct msi_desc - Descriptor structure for MSI based interrupts * @list: List head for management @@ -166,6 +178,7 @@ struct msi_desc { struct platform_msi_desc platform; struct fsl_mc_msi_desc fsl_mc; struct ti_sci_inta_msi_desc inta; + struct device_msi_desc device_msi; }; }; @@ -434,6 +447,17 @@ void *platform_msi_get_host_data(struct irq_domain *domain); void msi_domain_set_default_info_flags(struct msi_domain_info *info); #endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ +#ifdef CONFIG_DEVICE_MSI +struct irq_domain *device_msi_create_irq_domain(struct fwnode_handle *fn, + struct msi_domain_info *info, + struct irq_domain *parent); + +# ifdef CONFIG_PCI +struct irq_domain *pci_subdevice_msi_create_irq_domain(struct pci_dev *pdev, + struct msi_domain_info *info); +# endif +#endif /* CONFIG_DEVICE_MSI */ + #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg *msg); struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 8fc2539eb08e..f1acfe93c8cf 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -97,6 +97,10 @@ config GENERIC_MSI_IRQ_DOMAIN select IRQ_DOMAIN_HIERARCHY select GENERIC_MSI_IRQ +config DEVICE_MSI + bool + select GENERIC_MSI_IRQ_DOMAIN + config IRQ_MSI_IOMMU bool -- Gitee From 2c6b6b61cc1f76d832a283088d4679c1727734af Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:57 +0200 Subject: [PATCH 1739/2161] irqdomain/msi: Allow to override msi_domain_alloc/free_irqs() commit 43e9e705dd57c466c4bfe32ab8c17db537b89297 upstream. To support MSI irq domains which do not fit at all into the regular MSI irqdomain scheme, like the XEN MSI interrupt management for PV/HVM/DOM0, it's necessary to allow to override the alloc/free implementation. This is a preperatory step to switch X86 away from arch_*_msi_irqs() and store the irq domain pointer right in struct device. No functional change for existing MSI irq domain users. Aside of the evil XEN wrapper this is also useful for special MSI domains which need to do extra alloc/free work before/after calling the generic core function. Work like allocating/freeing MSI descriptors, MSI storage space etc. 
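For illustration, a specialized parent implementation that needs extra bookkeeping around allocation could hook the new callbacks along the lines of the sketch below. This is hypothetical driver-side code, not part of the patch; only the msi_domain_ops members and __msi_domain_alloc_irqs()/__msi_domain_free_irqs() come from this change, the my_* names are made up.

        #include <linux/irqdomain.h>
        #include <linux/msi.h>

        /* Hypothetical wrapper: do domain-specific work around the core calls. */
        static int my_msi_domain_alloc_irqs(struct irq_domain *domain,
                                            struct device *dev, int nvec)
        {
                /*
                 * Allocate MSI descriptors, device message storage, etc.
                 * for @nvec vectors here, then hand over to the core.
                 */
                return __msi_domain_alloc_irqs(domain, dev, nvec);
        }

        static void my_msi_domain_free_irqs(struct irq_domain *domain,
                                            struct device *dev)
        {
                __msi_domain_free_irqs(domain, dev);
                /* Release whatever the alloc side set up for @dev here. */
        }

        static struct msi_domain_ops my_msi_domain_ops = {
                .domain_alloc_irqs = my_msi_domain_alloc_irqs,
                .domain_free_irqs  = my_msi_domain_free_irqs,
                /* all other callbacks fall back to the defaults */
        };

This is the same shape used by the device MSI infrastructure in PATCH 1738 above (device_msi_alloc_irqs()/device_msi_free_irqs()).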
Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20200826112333.526797548@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/msi.h | 27 +++++++++++++++++ kernel/irq/msi.c | 70 +++++++++++++++++++++++++++++++-------------- 2 files changed, 75 insertions(+), 22 deletions(-) diff --git a/include/linux/msi.h b/include/linux/msi.h index b6cd408fc2e4..053ce4b2765c 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -323,6 +323,10 @@ struct msi_domain_info; * @msi_finish: Optional callback to finalize the allocation * @set_desc: Set the msi descriptor for an interrupt * @handle_error: Optional error handler if the allocation fails + * @domain_alloc_irqs: Optional function to override the default allocation + * function. + * @domain_free_irqs: Optional function to override the default free + * function. * * @get_hwirq, @msi_init and @msi_free are callbacks used by * msi_create_irq_domain() and related interfaces @@ -330,6 +334,22 @@ struct msi_domain_info; * @msi_check, @msi_prepare, @msi_finish, @set_desc and @handle_error * are callbacks used by msi_domain_alloc_irqs() and related * interfaces which are based on msi_desc. + * + * @domain_alloc_irqs, @domain_free_irqs can be used to override the + * default allocation/free functions (__msi_domain_alloc/free_irqs). This + * is initially for a wrapper around XENs seperate MSI universe which can't + * be wrapped into the regular irq domains concepts by mere mortals. This + * allows to universally use msi_domain_alloc/free_irqs without having to + * special case XEN all over the place. + * + * Contrary to other operations @domain_alloc_irqs and @domain_free_irqs + * are set to the default implementation if NULL and even when + * MSI_FLAG_USE_DEF_DOM_OPS is not set to avoid breaking existing users and + * because these callbacks are obviously mandatory. + * + * This is NOT meant to be abused, but it can be useful to build wrappers + * for specialized MSI irq domains which need extra work before and after + * calling __msi_domain_alloc_irqs()/__msi_domain_free_irqs(). 
*/ struct msi_domain_ops { irq_hw_number_t (*get_hwirq)(struct msi_domain_info *info, @@ -352,6 +372,10 @@ struct msi_domain_ops { struct msi_desc *desc); int (*handle_error)(struct irq_domain *domain, struct msi_desc *desc, int error); + int (*domain_alloc_irqs)(struct irq_domain *domain, + struct device *dev, int nvec); + void (*domain_free_irqs)(struct irq_domain *domain, + struct device *dev); }; /** @@ -409,8 +433,11 @@ int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent); +int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, + int nvec); int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec); +void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev); void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev); struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index c2543cf88358..5185da5c5222 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -387,11 +387,13 @@ static int msi_domain_ops_check(struct irq_domain *domain, } static struct msi_domain_ops msi_domain_ops_default = { - .get_hwirq = msi_domain_ops_get_hwirq, - .msi_init = msi_domain_ops_init, - .msi_check = msi_domain_ops_check, - .msi_prepare = msi_domain_ops_prepare, - .set_desc = msi_domain_ops_set_desc, + .get_hwirq = msi_domain_ops_get_hwirq, + .msi_init = msi_domain_ops_init, + .msi_check = msi_domain_ops_check, + .msi_prepare = msi_domain_ops_prepare, + .set_desc = msi_domain_ops_set_desc, + .domain_alloc_irqs = __msi_domain_alloc_irqs, + .domain_free_irqs = __msi_domain_free_irqs, }; static void msi_domain_update_dom_ops(struct msi_domain_info *info) @@ -403,6 +405,14 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info) return; } + if (ops->domain_alloc_irqs == NULL) + ops->domain_alloc_irqs = msi_domain_ops_default.domain_alloc_irqs; + if (ops->domain_free_irqs == NULL) + ops->domain_free_irqs = msi_domain_ops_default.domain_free_irqs; + + if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS)) + return; + if (ops->get_hwirq == NULL) ops->get_hwirq = msi_domain_ops_default.get_hwirq; if (ops->msi_init == NULL) @@ -436,8 +446,7 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, { struct irq_domain *domain; - if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) - msi_domain_update_dom_ops(info); + msi_domain_update_dom_ops(info); if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) msi_domain_update_chip_ops(info); @@ -544,17 +553,8 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit; } -/** - * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain - * @domain: The domain to allocate from - * @dev: Pointer to device struct of the device for which the interrupts - * are allocated - * @nvec: The number of interrupts to allocate - * - * Returns 0 on success or an error code. 
- */ -int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, - int nvec) +int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, + int nvec) { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; @@ -644,12 +644,24 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, } /** - * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev - * @domain: The domain to managing the interrupts + * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain + * @domain: The domain to allocate from * @dev: Pointer to device struct of the device for which the interrupts - * are free + * are allocated + * @nvec: The number of interrupts to allocate + * + * Returns 0 on success or an error code. */ -void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, + int nvec) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + + return ops->domain_alloc_irqs(domain, dev, nvec); +} + +void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) { struct msi_desc *desc; @@ -666,6 +678,20 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) } } +/** + * __msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev + * @domain: The domain to managing the interrupts + * @dev: Pointer to device struct of the device for which the interrupts + * are free + */ +void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + + return ops->domain_free_irqs(domain, dev); +} + /** * msi_get_domain_info - Get the MSI interrupt domain info for @domain * @domain: The interrupt domain to retrieve data from -- Gitee From 1dbd189844c5fd3c54de81cc9eef509af7cfd860 Mon Sep 17 00:00:00 2001 From: Bixuan Cui Date: Tue, 18 May 2021 11:31:17 +0800 Subject: [PATCH 1740/2161] genirq/msi: Ensure deactivation on teardown commit dbbc93576e03fbe24b365fab0e901eb442237a8a upstream. msi_domain_alloc_irqs() invokes irq_domain_activate_irq(), but msi_domain_free_irqs() does not enforce deactivation before tearing down the interrupts. This happens when PCI/MSI interrupts are set up and never used before being torn down again, e.g. in error handling pathes. The only place which cleans that up is the error handling path in msi_domain_alloc_irqs(). Move the cleanup from msi_domain_alloc_irqs() into msi_domain_free_irqs() to cure that. 
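The situation being fixed is easiest to see from a driver error path in which vectors are allocated, never requested, and then freed again. A minimal sketch follows (hypothetical driver code; the PCI helpers shown end up in msi_domain_alloc_irqs()/msi_domain_free_irqs() underneath):

        #include <linux/pci.h>

        static int my_hw_init(struct pci_dev *pdev)     /* hypothetical later step */
        {
                return -ENODEV;                         /* pretend it fails */
        }

        static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        {
                int nvec, ret;

                nvec = pci_alloc_irq_vectors(pdev, 1, 8, PCI_IRQ_MSIX);
                if (nvec < 0)
                        return nvec;

                ret = my_hw_init(pdev);
                if (ret) {
                        /*
                         * The vectors were activated early at allocation time
                         * but never passed to request_irq(), so the free path
                         * below is the only place left to deactivate them.
                         */
                        pci_free_irq_vectors(pdev);
                        return ret;
                }
                return 0;
        }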
Fixes: f3b0946d629c ("genirq/msi: Make sure PCI MSIs are activated early") Signed-off-by: Bixuan Cui Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210518033117.78104-1-cuibixuan@huawei.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- kernel/irq/msi.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 5185da5c5222..2db057f23516 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -634,11 +634,6 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, return 0; cleanup: - for_each_msi_vector(desc, i, dev) { - irq_data = irq_domain_get_irq_data(domain, i); - if (irqd_is_activated(irq_data)) - irq_domain_deactivate_irq(irq_data); - } msi_domain_free_irqs(domain, dev); return ret; } @@ -663,7 +658,15 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) { + struct irq_data *irq_data; struct msi_desc *desc; + int i; + + for_each_msi_vector(desc, i, dev) { + irq_data = irq_domain_get_irq_data(domain, i); + if (irqd_is_activated(irq_data)) + irq_domain_deactivate_irq(irq_data); + } for_each_msi_entry(desc, dev) { /* -- Gitee From 6f4e8f54caa5305c8d9f96da533952b33ca7e6d1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 21 Aug 2020 02:25:01 +0200 Subject: [PATCH 1741/2161] irqdomain/msi: Provide msi_alloc/free_store() callbacks commit edd679d4da0966b436c15ba1867b995ce582e39b Intel-BKC. For devices which don't have a standard storage for MSI messages like the upcoming IMS (Interrupt Message Store) it's required to allocate storage space before allocating interrupts and after freeing them. This could be achieved with the existing callbacks, but that would be awkward because they operate on msi_alloc_info_t which is not uniform across architectures. Also these callbacks are invoked per interrupt but the allocation might have bulk requirements depending on the device. As such devices can operate on different architectures it is simpler to have separate callbacks which operate on struct device. The resulting storage information has to be stored in struct msi_desc so the underlying irq chip implementation can retrieve it for the relevant operations. Reviewed-by: Tony Luck Signed-off-by: Thomas Gleixner Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/msi.h | 8 ++++++++ kernel/irq/msi.c | 11 +++++++++++ 2 files changed, 19 insertions(+) diff --git a/include/linux/msi.h b/include/linux/msi.h index 053ce4b2765c..57cdd3a27b12 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -327,6 +327,10 @@ struct msi_domain_info; * function. * @domain_free_irqs: Optional function to override the default free * function. 
+ * @msi_alloc_store: Optional callback to allocate storage in a device + * specific non-standard MSI store + * @msi_alloc_free: Optional callback to free storage in a device + * specific non-standard MSI store * * @get_hwirq, @msi_init and @msi_free are callbacks used by * msi_create_irq_domain() and related interfaces @@ -376,6 +380,10 @@ struct msi_domain_ops { struct device *dev, int nvec); void (*domain_free_irqs)(struct irq_domain *domain, struct device *dev); + int (*msi_alloc_store)(struct irq_domain *domain, + struct device *dev, int nvec); + void (*msi_free_store)(struct irq_domain *domain, + struct device *dev); }; /** diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 2db057f23516..dfcc065ce111 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -568,6 +568,12 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, if (ret) return ret; + if (ops->msi_alloc_store) { + ret = ops->msi_alloc_store(domain, dev, nvec); + if (ret) + return ret; + } + for_each_msi_entry(desc, dev) { ops->set_desc(&arg, desc); @@ -659,6 +665,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) { struct irq_data *irq_data; + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; struct msi_desc *desc; int i; @@ -679,6 +687,9 @@ void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) desc->irq = 0; } } + + if (ops->msi_free_store) + ops->msi_free_store(domain, dev); } /** -- Gitee From f28010f2df93010917db424de6d4e9b8086b11f8 Mon Sep 17 00:00:00 2001 From: Megha Dey Date: Mon, 1 Feb 2021 12:43:30 -0800 Subject: [PATCH 1742/2161] genirq: Set auxiliary data for an interrupt commit 459da077f38ceffaa524bd504d66739e37314fd1 Intel-BKC. Introduce a new function pointer in the irq_chip structure(irq_set_auxdata) which is responsible for updating data which is stored in a shared register or data storage. For example, the idxd driver uses the auxiliary data API to enable/set and disable PASID field that is in the IMS entry (introduced in a later patch) and that data are not typically present in MSI entry. Reviewed-by: Tony Luck Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/interrupt.h | 2 ++ include/linux/irq.h | 4 ++++ kernel/irq/manage.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 569f87d699cc..e0f44cc660e7 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -443,6 +443,8 @@ extern int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool state); +int irq_set_auxdata(unsigned int irq, unsigned int which, u64 val); + #ifdef CONFIG_IRQ_FORCED_THREADING # ifdef CONFIG_PREEMPT_RT # define force_irqthreads (true) diff --git a/include/linux/irq.h b/include/linux/irq.h index dd6653005ea2..dbb2d08e010b 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -481,6 +481,8 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * irq_request_resources * @irq_compose_msi_msg: optional to compose message content for MSI * @irq_write_msi_msg: optional to write message content for MSI + * @irq_set_auxdata: Optional function to update auxiliary data e.g. 
in + * shared registers * @irq_get_irqchip_state: return the internal state of an interrupt * @irq_set_irqchip_state: set the internal state of a interrupt * @irq_set_vcpu_affinity: optional to target a vCPU in a virtual machine @@ -528,6 +530,8 @@ struct irq_chip { void (*irq_compose_msi_msg)(struct irq_data *data, struct msi_msg *msg); void (*irq_write_msi_msg)(struct irq_data *data, struct msi_msg *msg); + int (*irq_set_auxdata)(struct irq_data *data, unsigned int which, u64 auxval); + int (*irq_get_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool *state); int (*irq_set_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool state); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 255266df1aa1..0aca7401a233 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2782,3 +2782,35 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, return err; } EXPORT_SYMBOL_GPL(irq_set_irqchip_state); + +/** + * irq_set_auxdata - Set auxiliary data + * @irq: Interrupt to update + * @which: Selector which data to update + * @auxval: Auxiliary data value + * + * Function to update auxiliary data for an interrupt, e.g. to update data + * which is stored in a shared register or data storage (e.g. IMS). + */ +int irq_set_auxdata(unsigned int irq, unsigned int which, u64 val) +{ + struct irq_desc *desc; + struct irq_data *data; + unsigned long flags; + int res = -ENODEV; + + desc = irq_get_desc_buslock(irq, &flags, 0); + if (!desc) + return -EINVAL; + + for (data = &desc->irq_data; data; data = irqd_get_parent_data(data)) { + if (data->chip->irq_set_auxdata) { + res = data->chip->irq_set_auxdata(data, which, val); + break; + } + } + + irq_put_desc_busunlock(desc, flags); + return res; +} +EXPORT_SYMBOL_GPL(irq_set_auxdata); -- Gitee From 1ee1757e768c37577f07202b64294c1ef7d091d8 Mon Sep 17 00:00:00 2001 From: Megha Dey Date: Tue, 29 Jun 2021 05:52:05 -0700 Subject: [PATCH 1743/2161] iommu/vt-d: Add DEV-MSI support commit bfd5168b96da3883ccca3cba3fc4060e9c58f2a4 Intel-BKC. Add required support in the interrupt remapping driver for devices which generate dev-msi interrupts and use the intel remapping domain as the parent domain. Set the source-id of all dev-msi interrupt requests to the parent PCI device associated with it. 
Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/irq_remapping.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index f912fe45bea2..f969570fb0a9 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1283,6 +1283,9 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, set_msi_sid(irte, pci_real_dma_dev(msi_desc_to_pci_dev(info->desc))); break; + case X86_IRQ_ALLOC_TYPE_DEV_MSI: + set_msi_sid(irte, to_pci_dev(info->desc->dev->parent)); + break; default: BUG_ON(1); break; @@ -1326,7 +1329,8 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain, if (!info || !iommu) return -EINVAL; if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI && - info->type != X86_IRQ_ALLOC_TYPE_PCI_MSIX) + info->type != X86_IRQ_ALLOC_TYPE_PCI_MSIX && + info->type != X86_IRQ_ALLOC_TYPE_DEV_MSI) return -EINVAL; /* -- Gitee From 4d9c344f9079a7cc388214b7c443ec76dc9d8bfa Mon Sep 17 00:00:00 2001 From: Megha Dey Date: Tue, 29 Jun 2021 05:55:12 -0700 Subject: [PATCH 1744/2161] iommu: Add capability IOMMU_CAP_VIOMMU_HINT commit 80cb89a09af3f244983e5d7157026830c7bf0e41 Intel-BKC. Some IOMMU specification defines some kind of hint mechanism, through which BIOS can imply that OS runs in a virtualized environment. For example, the caching mode defined in VT-d spec and NpCache capability defined in the AMD IOMMU specification. This hint could also be used outside of the IOMMU subsystem, where it could be used with other known means (CPUID, smbios) to sense whether Linux is running in a virtualized environment. Add a capability bit so that it could be used there. 
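For illustration, a caller outside the IOMMU subsystem would query the hint per bus, as in this sketch (hypothetical code; iommu_capable() and the new capability bit are the only pieces taken from this patch):

        #include <linux/device.h>
        #include <linux/iommu.h>

        /* Sketch: does the IOMMU serving @dev's bus hint at a virtualized platform? */
        static bool my_viommu_hinted(struct device *dev)
        {
                return iommu_capable(dev->bus, IOMMU_CAP_VIOMMU_HINT);
        }

PATCH 1745 below combines this check with CPUID and SMBIOS/DMI signals to decide whether the system is running on bare metal.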
Cc: Joerg Roedel Signed-off-by: Lu Baolu Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/amd_iommu.c | 2 ++ drivers/iommu/intel/iommu.c | 5 +++++ drivers/iommu/virtio-iommu.c | 9 +++++++++ include/linux/iommu.h | 2 ++ 4 files changed, 18 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 83ce245471b8..516056835533 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2562,6 +2562,8 @@ static bool amd_iommu_capable(enum iommu_cap cap) return (irq_remapping_enabled == 1); case IOMMU_CAP_NOEXEC: return false; + case IOMMU_CAP_VIOMMU_HINT: + return amd_iommu_np_cache; default: break; } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 3f36b6b5533c..4ab762b7f88b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -315,6 +315,7 @@ static inline void context_clear_entry(struct context_entry *context) */ static struct dmar_domain *si_domain; static int hw_pass_through = 1; +static int intel_caching_mode; #define for_each_domain_iommu(idx, domain) \ for (idx = 0; idx < g_num_of_iommus; idx++) \ @@ -3476,6 +3477,8 @@ static int __init init_dmars(void) if (!ecap_pass_through(iommu->ecap)) hw_pass_through = 0; + if (cap_caching_mode(iommu->cap)) + intel_caching_mode = 1; if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) { pr_info("Disable batched IOTLB flush due to virtualization"); @@ -5382,6 +5385,8 @@ static bool intel_iommu_capable(enum iommu_cap cap) return domain_update_iommu_snooping(NULL); if (cap == IOMMU_CAP_INTR_REMAP) return irq_remapping_enabled == 1; + if (cap == IOMMU_CAP_VIOMMU_HINT) + return intel_caching_mode; return false; } diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 07600b05f2b6..6b7cc71c0042 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -944,7 +944,16 @@ static int viommu_of_xlate(struct device *dev, struct of_phandle_args *args) return iommu_fwspec_add_ids(dev, args->args, 1); } +static bool viommu_capable(enum iommu_cap cap) +{ + if (cap == IOMMU_CAP_VIOMMU_HINT) + return true; + + return false; +} + static struct iommu_ops viommu_ops = { + .capable = viommu_capable, .domain_alloc = viommu_domain_alloc, .domain_free = viommu_domain_free, .attach_dev = viommu_attach_dev, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7ab2f61ff471..4a4388e0bc89 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -114,6 +114,8 @@ enum iommu_cap { transactions */ IOMMU_CAP_INTR_REMAP, /* IOMMU supports interrupt isolation */ IOMMU_CAP_NOEXEC, /* IOMMU_NOEXEC flag */ + IOMMU_CAP_VIOMMU_HINT, /* IOMMU can detect a hit for running in + VM */ }; /* -- Gitee From c3a33ae281ccb00fb2aa6c8a57b59a6438c89301 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 23 Nov 2020 08:41:21 +0800 Subject: [PATCH 1745/2161] platform-msi: Add platform check for subdevice irq domain commit e99a6435963bc19d1cbf731f8c2a71973ee7a6e2 Intel-BKC. The pci_subdevice_msi_create_irq_domain() should fail if the underlying platform is not able to support IMS (Interrupt Message Storage). Otherwise, the isolation of interrupt is not guaranteed. For x86, IMS is only supported on bare metal for now. We could enable it in the virtualization environments in the future if interrupt HYPERCALL domain is supported or the hardware has the capability of interrupt isolation for subdevices. 
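For illustration, drivers are then expected to treat subdevice MSI as optional and handle the NULL return, roughly like this sketch (hypothetical driver code; pci_subdevice_msi_create_irq_domain() is the interface added in PATCH 1738 above):

        #include <linux/msi.h>
        #include <linux/pci.h>

        /* Sketch: subdevice IMS is best-effort; NULL means fall back to MSI-X. */
        static struct irq_domain *
        my_try_subdevice_domain(struct pci_dev *pdev, struct msi_domain_info *info)
        {
                struct irq_domain *domain;

                domain = pci_subdevice_msi_create_irq_domain(pdev, info);
                if (!domain)
                        dev_info(&pdev->dev,
                                 "subdevice MSI domain unavailable (e.g. running in a VM)\n");
                return domain;
        }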
Cc: David Woodhouse Cc: Leon Romanovsky Cc: Kevin Tian Suggested-by: Thomas Gleixner Link: https://lore.kernel.org/linux-pci/87pn4nk7nn.fsf@nanos.tec.linutronix.de/ Link: https://lore.kernel.org/linux-pci/877dqrnzr3.fsf@nanos.tec.linutronix.de/ Link: https://lore.kernel.org/linux-pci/877dqqmc2h.fsf@nanos.tec.linutronix.de/ Signed-off-by: Lu Baolu Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/pci/common.c | 72 +++++++++++++++++++++++++++++++++++++ drivers/base/platform-msi.c | 8 +++++ include/linux/msi.h | 2 ++ 3 files changed, 82 insertions(+) diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index e142ae2e8e1c..ad81de4ed7b2 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include @@ -726,3 +728,73 @@ struct pci_dev *pci_real_dma_dev(struct pci_dev *dev) return dev; } #endif + +/* + * We want to figure out which context we are running in. But the hardware + * does not introduce a reliable way (instruction, CPUID leaf, MSR, whatever) + * which can be manipulated by the VMM to let the OS figure out where it runs. + * So we go with the below probably on_bare_metal() function as a replacement + * for definitely on_bare_metal() to go forward only for the very simple reason + * that this is the only option we have. + */ +static const char * const vmm_vendor_name[] = { + "QEMU", "Bochs", "KVM", "Xen", "VMware", "VMW", "VMware Inc.", + "innotek GmbH", "Oracle Corporation", "Parallels", "BHYVE" +}; + +static void read_type0_virtual_machine(const struct dmi_header *dm, void *p) +{ + u8 *data = (u8 *)dm + 0x13; + + /* BIOS Information (Type 0) */ + if (dm->type != 0 || dm->length < 0x14) + return; + + /* Bit 4 of BIOS Characteristics Extension Byte 2*/ + if (*data & BIT(4)) + *((bool *)p) = true; +} + +static bool smbios_virtual_machine(void) +{ + bool bit_present = false; + + dmi_walk(read_type0_virtual_machine, &bit_present); + + return bit_present; +} + +static bool on_bare_metal(struct device *dev) +{ + int i; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return false; + + if (smbios_virtual_machine()) + return false; + + if (iommu_capable(dev->bus, IOMMU_CAP_VIOMMU_HINT)) + return false; + + for (i = 0; i < ARRAY_SIZE(vmm_vendor_name); i++) + if (dmi_match(DMI_SYS_VENDOR, vmm_vendor_name[i])) + return false; + + pr_info("System running on bare metal, report to bugzilla.kernel.org if not the case."); + + return true; +} + +bool arch_support_pci_device_msi(struct pci_dev *pdev) +{ + /* + * When we are running in a VMM context, the device IMS could only be + * enabled when the underlying hardware supports interrupt isolation + * of the subdevice, or any mechanism (trap, hypercall) is added so + * that changes in the interrupt message store could be managed by the + * VMM. For now, we only support the device IMS when we are running on + * the bare metal. 
+ */ + return on_bare_metal(&pdev->dev); +} diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c index 6aafd9c43c3a..0e639a650ee5 100644 --- a/drivers/base/platform-msi.c +++ b/drivers/base/platform-msi.c @@ -512,6 +512,11 @@ struct irq_domain *device_msi_create_irq_domain(struct fwnode_handle *fn, #ifdef CONFIG_PCI #include +bool __weak arch_support_pci_device_msi(struct pci_dev *pdev) +{ + return false; +} + /** * pci_subdevice_msi_create_irq_domain - Create an irq domain for subdevices * @pdev: Pointer to PCI device for which the subdevice domain is created @@ -523,6 +528,9 @@ struct irq_domain *pci_subdevice_msi_create_irq_domain(struct pci_dev *pdev, struct irq_domain *domain, *pdev_msi; struct fwnode_handle *fn; + if (!arch_support_pci_device_msi(pdev)) + return NULL; + /* * Retrieve the MSI domain of the underlying PCI device's MSI * domain. The PCI device domain's parent domain is also the parent diff --git a/include/linux/msi.h b/include/linux/msi.h index 57cdd3a27b12..46ace332348a 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -493,6 +493,8 @@ struct irq_domain *pci_subdevice_msi_create_irq_domain(struct pci_dev *pdev, # endif #endif /* CONFIG_DEVICE_MSI */ +bool arch_support_pci_device_msi(struct pci_dev *pdev); + #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg *msg); struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, -- Gitee From 01eba9024f9c999792315d32eb0e758736724a09 Mon Sep 17 00:00:00 2001 From: Megha Dey Date: Tue, 29 Jun 2021 06:01:07 -0700 Subject: [PATCH 1746/2161] irqchip: Add IMS (Interrupt Message Store) driver commit 048f4019b6f7100e74fb58dca5ce00a8050c7422 Intel-BKC. Generic IMS(Interrupt Message Store) irq chips and irq domain implementations for IMS based devices which store the interrupt messages in an array in device memory. Allocation and freeing of interrupts happens via the generic msi_domain_alloc/free_irqs() interface. No special purpose IMS magic required as long as the interrupt domain is stored in the underlying device struct. The irq_set_auxdata() is used to program the pasid into the IMS entry. [Megha : Fixed compile time errors Added necessary dependencies to IMS_MSI_ARRAY config Fixed polarity of IMS_VECTOR_CTRL Added reads after writes to flush writes to device Added set_desc ops to IMS msi domain ops Tested the IMS infrastructure with the IDXD driver] Reviewed-by: Tony Luck Signed-off-by: Thomas Gleixner Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/irqchip/Kconfig | 14 ++ drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-ims-msi.c | 193 ++++++++++++++++++++++++++++ include/linux/irqchip/irq-ims-msi.h | 68 ++++++++++ 4 files changed, 276 insertions(+) create mode 100644 drivers/irqchip/irq-ims-msi.c create mode 100644 include/linux/irqchip/irq-ims-msi.h diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 6d04672975ef..634d074fb290 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -504,4 +504,18 @@ config SIFIVE_PLIC If you don't know what to do here, say Y. 
+config IMS_MSI + depends on PCI + select DEVICE_MSI + bool + +config IMS_MSI_ARRAY + bool "IMS Interrupt Message Store MSI controller for device memory storage arrays" + depends on PCI + select IMS_MSI + select GENERIC_MSI_IRQ_DOMAIN + help + Support for IMS Interrupt Message Store MSI controller + with IMS slot storage in a slot array in device memory + endmenu diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 8903d3c34916..fec6f7e01917 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -101,6 +101,7 @@ obj-$(CONFIG_CSKY_APB_INTC) += irq-csky-apb-intc.o obj-$(CONFIG_SIFIVE_PLIC) += irq-sifive-plic.o obj-$(CONFIG_IMX_IRQSTEER) += irq-imx-irqsteer.o obj-$(CONFIG_MADERA_IRQ) += irq-madera.o +obj-$(CONFIG_IMS_MSI) += irq-ims-msi.o obj-$(CONFIG_LS1X_IRQ) += irq-ls1x.o obj-$(CONFIG_TI_SCI_INTR_IRQCHIP) += irq-ti-sci-intr.o obj-$(CONFIG_TI_SCI_INTA_IRQCHIP) += irq-ti-sci-inta.o diff --git a/drivers/irqchip/irq-ims-msi.c b/drivers/irqchip/irq-ims-msi.c new file mode 100644 index 000000000000..61f2437d7c58 --- /dev/null +++ b/drivers/irqchip/irq-ims-msi.c @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: GPL-2.0 +// (C) Copyright 2021 Thomas Gleixner +/* + * Shared interrupt chips and irq domains for IMS devices + */ +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_IMS_MSI_ARRAY + +struct ims_array_data { + struct ims_array_info info; + unsigned long map[0]; +}; + +static inline void iowrite32_and_flush(u32 value, void __iomem *addr) +{ + iowrite32(value, addr); + ioread32(addr); +} + +static void ims_array_mask_irq(struct irq_data *data) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + struct ims_slot __iomem *slot = desc->device_msi.priv_iomem; + u32 __iomem *ctrl = &slot->ctrl; + + iowrite32_and_flush(ioread32(ctrl) | IMS_CTRL_VECTOR_MASKBIT, ctrl); +} + +static void ims_array_unmask_irq(struct irq_data *data) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + struct ims_slot __iomem *slot = desc->device_msi.priv_iomem; + u32 __iomem *ctrl = &slot->ctrl; + + iowrite32_and_flush(ioread32(ctrl) & ~IMS_CTRL_VECTOR_MASKBIT, ctrl); +} + +static void ims_array_write_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + struct ims_slot __iomem *slot = desc->device_msi.priv_iomem; + + iowrite32(msg->address_lo, &slot->address_lo); + iowrite32(msg->address_hi, &slot->address_hi); + iowrite32_and_flush(msg->data, &slot->data); +} + +static const struct irq_chip ims_array_msi_controller = { + .name = "IMS", + .irq_mask = ims_array_mask_irq, + .irq_unmask = ims_array_unmask_irq, + .irq_write_msi_msg = ims_array_write_msi_msg, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static void ims_array_reset_slot(struct ims_slot __iomem *slot) +{ + iowrite32(0, &slot->address_lo); + iowrite32(0, &slot->address_hi); + iowrite32(0, &slot->data); + iowrite32_and_flush(IMS_CTRL_VECTOR_MASKBIT, &slot->ctrl); +} + +static void ims_array_free_msi_store(struct irq_domain *domain, + struct device *dev) +{ + struct msi_domain_info *info = domain->host_data; + struct ims_array_data *ims = info->data; + struct msi_desc *entry; + + for_each_msi_entry(entry, dev) { + if (entry->device_msi.priv_iomem) { + clear_bit(entry->device_msi.hwirq, ims->map); + ims_array_reset_slot(entry->device_msi.priv_iomem); + entry->device_msi.priv_iomem = NULL; + entry->device_msi.hwirq = 0; + } + } +} + +static int ims_array_alloc_msi_store(struct irq_domain 
*domain, + struct device *dev, int nvec) +{ + struct msi_domain_info *info = domain->host_data; + struct ims_array_data *ims = info->data; + struct msi_desc *entry; + + for_each_msi_entry(entry, dev) { + unsigned int idx; + + idx = find_first_zero_bit(ims->map, ims->info.max_slots); + if (idx >= ims->info.max_slots) + goto fail; + set_bit(idx, ims->map); + entry->device_msi.priv_iomem = &ims->info.slots[idx]; + ims_array_reset_slot(entry->device_msi.priv_iomem); + entry->device_msi.hwirq = idx; + } + return 0; + +fail: + ims_array_free_msi_store(domain, dev); + return -ENOSPC; +} + +struct ims_array_domain_template { + struct msi_domain_ops ops; + struct msi_domain_info info; +}; + +static void ims_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) +{ + arg->desc = desc; + arg->hwirq = desc->device_msi.hwirq; +} + +static const struct ims_array_domain_template ims_array_domain_template = { + .ops = { + .msi_alloc_store = ims_array_alloc_msi_store, + .msi_free_store = ims_array_free_msi_store, + .set_desc = ims_set_desc, + }, + .info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | + MSI_FLAG_USE_DEF_CHIP_OPS, + .handler = handle_edge_irq, + .handler_name = "edge", + }, +}; + +struct irq_domain * +pci_ims_array_create_msi_irq_domain(struct pci_dev *pdev, + struct ims_array_info *ims_info) +{ + struct ims_array_domain_template *info; + struct ims_array_data *data; + struct irq_domain *domain; + struct irq_chip *chip; + unsigned int size; + + /* Allocate new domain storage */ + info = kmemdup(&ims_array_domain_template, + sizeof(ims_array_domain_template), GFP_KERNEL); + if (!info) + return NULL; + /* Link the ops */ + info->info.ops = &info->ops; + + /* Allocate ims_info along with the bitmap */ + size = sizeof(*data); + size += BITS_TO_LONGS(ims_info->max_slots) * sizeof(unsigned long); + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto err_info; + + data->info = *ims_info; + info->info.data = data; + + /* + * Allocate an interrupt chip because the core needs to be able to + * update it with default callbacks. + */ + chip = kmemdup(&ims_array_msi_controller, + sizeof(ims_array_msi_controller), GFP_KERNEL); + if (!chip) + goto err_data; + info->info.chip = chip; + + domain = pci_subdevice_msi_create_irq_domain(pdev, &info->info); + if (!domain) + goto err_chip; + + return domain; + +err_chip: + kfree(chip); +err_data: + kfree(data); +err_info: + kfree(info); + return NULL; +} +EXPORT_SYMBOL_GPL(pci_ims_array_create_msi_irq_domain); + +#endif /* CONFIG_IMS_MSI_ARRAY */ diff --git a/include/linux/irqchip/irq-ims-msi.h b/include/linux/irqchip/irq-ims-msi.h new file mode 100644 index 000000000000..9ba767fbbb3d --- /dev/null +++ b/include/linux/irqchip/irq-ims-msi.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* (C) Copyright 2021 Thomas Gleixner */ + +#ifndef _LINUX_IRQCHIP_IRQ_IMS_MSI_H +#define _LINUX_IRQCHIP_IRQ_IMS_MSI_H + +#include +#include + +/** + * ims_hw_slot - The hardware layout of an IMS based MSI message + * @address_lo: Lower 32bit address + * @address_hi: Upper 32bit address + * @data: Message data + * @ctrl: Control word + * + * This structure is used by both the device memory array and the queue + * memory variants of IMS. + */ +struct ims_slot { + u32 address_lo; + u32 address_hi; + u32 data; + u32 ctrl; +} __packed; + +/* + * The IMS control word utilizes bit 0-2 for interrupt control. The remaining + * bits can contain auxiliary data. 
+ */ +#define IMS_CONTROL_WORD_IRQMASK GENMASK(2, 0) +#define IMS_CONTROL_WORD_AUXMASK GENMASK(31, 3) + +/* Auxiliary control word data related defines */ +enum { + IMS_AUXDATA_CONTROL_WORD, +}; + +/* Bit to mask the interrupt in ims_hw_slot::ctrl */ +#define IMS_CTRL_VECTOR_MASKBIT BIT(0) +#define IMS_CTRL_PASID_ENABLE BIT(3) +#define IMS_CTRL_PASID_SHIFT 12 + +/* Set pasid and enable bit for the IMS entry */ +static inline u32 ims_ctrl_pasid_aux(unsigned int pasid, bool enable) +{ + u32 auxval = pasid << IMS_CTRL_PASID_SHIFT; + + return enable ? auxval | IMS_CTRL_PASID_ENABLE : auxval; +} + +/** + * struct ims_array_info - Information to create an IMS array domain + * @slots: Pointer to the start of the array + * @max_slots: Maximum number of slots in the array + */ +struct ims_array_info { + struct ims_slot __iomem *slots; + unsigned int max_slots; +}; + +struct pci_dev; +struct irq_domain; + +struct irq_domain *pci_ims_array_create_msi_irq_domain(struct pci_dev *pdev, + struct ims_array_info *ims_info); + +#endif -- Gitee From 82ee5cb07ea515fa22a2bcb2a50591971fa10d4c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 19 Feb 2021 14:36:44 -0700 Subject: [PATCH 1747/2161] genirq/msi: Provide helpers to return Linux IRQ/dev_msi hw IRQ number commit 84ede51031d8df62a94b4014294e81ff25b502fb Intel-BKC. Add new helpers to get the Linux IRQ number and device specific index for given device-relative vector so that the drivers don't need to allocate their own arrays to keep track of the vectors and hwirq for the multi vector device MSI case. Reviewed-by: Tony Luck Signed-off-by: Dave Jiang Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/msi.h | 2 ++ kernel/irq/msi.c | 48 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/include/linux/msi.h b/include/linux/msi.h index 46ace332348a..f829a03a1849 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -455,6 +455,8 @@ struct irq_domain *platform_msi_create_irq_domain(struct fwnode_handle *fwnode, int platform_msi_domain_alloc_irqs(struct device *dev, unsigned int nvec, irq_write_msi_msg_t write_msi_msg); void platform_msi_domain_free_irqs(struct device *dev); +int dev_msi_irq_vector(struct device *dev, unsigned int nr); +int dev_msi_hwirq(struct device *dev, unsigned int nr); /* When an MSI domain is used as an intermediate domain */ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index dfcc065ce111..8215af7dc129 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -718,4 +718,52 @@ struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain) return (struct msi_domain_info *)domain->host_data; } +/** + * get_dev_msi_entry - Get the nth device MSI entry + * @dev: device to operate on + * @nr: device-relative interrupt vector index (0-based). + * + * Return the nth dev_msi entry + */ +static struct msi_desc *get_dev_msi_entry(struct device *dev, unsigned int nr) +{ + struct msi_desc *entry; + int i = 0; + + for_each_msi_entry(entry, dev) { + if (i == nr) + return entry; + i++; + } + + WARN_ON_ONCE(!entry); + return entry; +} + +/** + * dev_msi_irq_vector - Get the Linux IRQ number of a device vector + * @dev: device to operate on + * @nr: device-relative interrupt vector index (0-based). + * + * Returns the Linux IRQ number of a device vector. 
+ */ +int dev_msi_irq_vector(struct device *dev, unsigned int nr) +{ + return get_dev_msi_entry(dev, nr)->irq; +} +EXPORT_SYMBOL_GPL(dev_msi_irq_vector); + +/** + * dev_msi_hwirq - Get the device MSI hw IRQ number of a device vector + * @dev: device to operate on + * @nr: device-relative interrupt vector index (0-based). + * + * Return the dev_msi hw IRQ number of a device vector. + */ +int dev_msi_hwirq(struct device *dev, unsigned int nr) +{ + return get_dev_msi_entry(dev, nr)->device_msi.hwirq; +} +EXPORT_SYMBOL_GPL(dev_msi_hwirq); + #endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ -- Gitee From af3544bf6d04551c7e5988203ac1aec10f152b90 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 23 Jun 2022 20:56:47 +0800 Subject: [PATCH 1748/2161] PCI/MSI: Deobfuscate virtual MSI-X commit b296ababcc4bbf8efbb603d3aec6024a78662c1b upstream. Handling of virtual MSI-X is obfuscated by letting pci_msix_desc_addr() return NULL and checking the pointer. Just use msi_desc::msi_attrib.is_virtual at the call sites and get rid of that pointer check. No functional change. Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20210729222543.151522318@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index d3c1cd3dd718..b6df3b41b566 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -193,11 +193,7 @@ static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) static void __iomem *pci_msix_desc_addr(struct msi_desc *desc) { - if (desc->msi_attrib.is_virtual) - return NULL; - - return desc->mask_base + - desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; + return desc->mask_base + desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; } /* @@ -209,14 +205,10 @@ static void __iomem *pci_msix_desc_addr(struct msi_desc *desc) */ u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag) { + void __iomem *desc_addr = pci_msix_desc_addr(desc); u32 mask_bits = desc->masked; - void __iomem *desc_addr; - - if (pci_msi_ignore_mask) - return 0; - desc_addr = pci_msix_desc_addr(desc); - if (!desc_addr) + if (pci_msi_ignore_mask || desc->msi_attrib.is_virtual) return 0; mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT; @@ -283,10 +275,8 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) if (entry->msi_attrib.is_msix) { void __iomem *base = pci_msix_desc_addr(entry); - if (!base) { - WARN_ON(1); + if (WARN_ON_ONCE(entry->msi_attrib.is_virtual)) return; - } msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR); msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR); @@ -318,7 +308,7 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) } else if (entry->msi_attrib.is_msix) { void __iomem *base = pci_msix_desc_addr(entry); - if (!base) + if (entry->msi_attrib.is_virtual) goto skip; writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR); @@ -623,9 +613,10 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry->msi_attrib.default_irq = dev->irq; entry->mask_base = base; - addr = pci_msix_desc_addr(entry); - if (addr) + if (!entry->msi_attrib.is_virtual) { + addr = pci_msix_desc_addr(entry); entry->masked = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL); + } list_add_tail(&entry->list, dev_to_msi_list(&dev->dev)); if (masks) -- Gitee From 072977fa8ef320f13cd8e512c7b238df727e2d9c Mon Sep 17 00:00:00 2001 From: Stefan Roese Date: Tue, 14 Dec 
2021 12:49:32 +0100 Subject: [PATCH 1749/2161] PCI/MSI: Mask MSI-X vectors only on success MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 83dbf898a2d45289be875deb580e93050ba67529 upstream. Masking all unused MSI-X entries is done to ensure that a crash kernel starts from a clean slate, which correponds to the reset state of the device as defined in the PCI-E specificion 3.0 and later: Vector Control for MSI-X Table Entries -------------------------------------- "00: Mask bit: When this bit is set, the function is prohibited from sending a message using this MSI-X Table entry. ... This bit’s state after reset is 1 (entry is masked)." A Marvell NVME device fails to deliver MSI interrupts after trying to enable MSI-X interrupts due to that masking. It seems to take the MSI-X mask bits into account even when MSI-X is disabled. While not specification compliant, this can be cured by moving the masking into the success path, so that the MSI-X table entries stay in device reset state when the MSI-X setup fails. [ tglx: Move it into the success path, add comment and amend changelog ] Fixes: aa8092c1d1f1 ("PCI/MSI: Mask all unused MSI-X entries") Signed-off-by: Stefan Roese Signed-off-by: Thomas Gleixner Cc: linux-pci@vger.kernel.org Cc: Bjorn Helgaas Cc: Michal Simek Cc: Marek Vasut Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211210161025.3287927-1-sr@denx.de Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index b6df3b41b566..580dbe8dacee 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -685,9 +685,6 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, goto out_disable; } - /* Ensure that all table entries are masked. */ - msix_mask_all(base, tsize); - ret = msix_setup_entries(dev, base, entries, nvec, affd); if (ret) goto out_disable; @@ -714,6 +711,16 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, /* Set MSI-X enabled bits and unmask the function */ pci_intx_for_msi(dev, 0); dev->msix_enabled = 1; + + /* + * Ensure that all table entries are masked to prevent + * stale entries from firing in a crash kernel. + * + * Done late to deal with a broken Marvell NVME device + * which takes the MSI-X mask bits into account even + * when MSI-X is disabled, which prevents MSI delivery. + */ + msix_mask_all(base, tsize); pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0); pcibios_free_irq(dev); -- Gitee From 7aab7fc583b9d5020e7318a7f3fc2611822d6b20 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 24 Jun 2022 11:49:04 +0800 Subject: [PATCH 1750/2161] PCI/MSI: Store MSI-X table address in struct pci_dev commit bf8809bd8e1062cf99094e59bff6f7debee6f22c Intel-BKC. This is a preparatory patch to enable dynamic allocation of MSI-X interrupts. Store the base address of the device MSI-X table in struct pci_dev, so that it can be retrieved easily every time new MSI-X vectors are requested for the device. No functional change. 
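For illustration, with the base cached in struct pci_dev, any later allocation path can compute a slot address from just the device and the entry number, mirroring pci_msix_desc_addr() earlier in this file (sketch, hypothetical helper):

        #include <linux/pci.h>

        /* Sketch: address of MSI-X table entry @entry_nr via the cached base. */
        static void __iomem *my_msix_entry_addr(struct pci_dev *dev,
                                                unsigned int entry_nr)
        {
                return dev->msix_table_base + entry_nr * PCI_MSIX_ENTRY_SIZE;
        }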
Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 15 +++++++-------- include/linux/pci.h | 1 + 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 580dbe8dacee..1d55c45473bb 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -574,7 +574,7 @@ static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries) return ioremap_nocache(phys_addr, nr_entries * PCI_MSIX_ENTRY_SIZE); } -static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, +static int msix_setup_entries(struct pci_dev *dev, struct msix_entry *entries, int nvec, struct irq_affinity *affd) { @@ -591,7 +591,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry = alloc_msi_entry(&dev->dev, 1, curmsk); if (!entry) { if (!i) - iounmap(base); + iounmap(dev->msix_table_base); else free_msi_irqs(dev); /* No enough memory. Don't try again */ @@ -611,7 +611,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry->msi_attrib.entry_nr >= vec_count; entry->msi_attrib.default_irq = dev->irq; - entry->mask_base = base; + entry->mask_base = dev->msix_table_base; if (!entry->msi_attrib.is_virtual) { addr = pci_msix_desc_addr(entry); @@ -664,7 +664,6 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, int nvec, struct irq_affinity *affd) { const struct attribute_group **groups; - void __iomem *base; int ret, tsize; u16 control; @@ -679,13 +678,13 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control); /* Request & Map MSI-X table region */ tsize = msix_table_size(control); - base = msix_map_region(dev, tsize); - if (!base) { + dev->msix_table_base = msix_map_region(dev, tsize); + if (!dev->msix_table_base) { ret = -ENOMEM; goto out_disable; } - ret = msix_setup_entries(dev, base, entries, nvec, affd); + ret = msix_setup_entries(dev, entries, nvec, affd); if (ret) goto out_disable; @@ -720,7 +719,7 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, * which takes the MSI-X mask bits into account even * when MSI-X is disabled, which prevents MSI delivery. */ - msix_mask_all(base, tsize); + msix_mask_all(dev->msix_table_base, tsize); pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0); pcibios_free_irq(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index 894bd747285d..4743fb19161e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -449,6 +449,7 @@ struct pci_dev { #endif #ifdef CONFIG_PCI_MSI const struct attribute_group **msi_irq_groups; + void __iomem *msix_table_base; /* Base address of device MSI-X table */ #endif struct pci_vpd *vpd; #ifdef CONFIG_PCIE_DPC -- Gitee From 685e518b71b35c2a0ccafe5ac222d764d6a31cf6 Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Tue, 27 Sep 2022 16:58:38 +0800 Subject: [PATCH 1751/2161] Revert "PCI/MSI: Mask MSI-X vectors only on success" commit opencloudos. This reverts commit 602e76c9ec72ff3d14cbb0430b2ff30ae701a4bb. Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 1d55c45473bb..7ab3bef8f5d1 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -684,6 +684,9 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, goto out_disable; } + /* Ensure that all table entries are masked. 
*/ + msix_mask_all(dev->msix_table_base, tsize); + ret = msix_setup_entries(dev, entries, nvec, affd); if (ret) goto out_disable; @@ -710,16 +713,6 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, /* Set MSI-X enabled bits and unmask the function */ pci_intx_for_msi(dev, 0); dev->msix_enabled = 1; - - /* - * Ensure that all table entries are masked to prevent - * stale entries from firing in a crash kernel. - * - * Done late to deal with a broken Marvell NVME device - * which takes the MSI-X mask bits into account even - * when MSI-X is disabled, which prevents MSI delivery. - */ - msix_mask_all(dev->msix_table_base, tsize); pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0); pcibios_free_irq(dev); -- Gitee From a1a5815b42b5b6a6ab7e909ec72000213dd83994 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Dec 2021 12:42:14 +0100 Subject: [PATCH 1752/2161] PCI/MSI: Clear PCI_MSIX_FLAGS_MASKALL on error commit 94185adbfad56815c2c8401e16d81bdb74a79201 upstream. PCI_MSIX_FLAGS_MASKALL is set in the MSI-X control register at MSI-X interrupt setup time. It's cleared on success, but the error handling path only clears the PCI_MSIX_FLAGS_ENABLE bit. That's incorrect as the reset state of the PCI_MSIX_FLAGS_MASKALL bit is zero. That can be observed via lspci: Capabilities: [b0] MSI-X: Enable- Count=67 Masked+ Clear the bit in the error path to restore the reset state. Fixes: 438553958ba1 ("PCI/MSI: Enable and mask MSI-X early") Reported-by: Stefan Roese Signed-off-by: Thomas Gleixner Tested-by: Stefan Roese Cc: linux-pci@vger.kernel.org Cc: Bjorn Helgaas Cc: Michal Simek Cc: Marek Vasut Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87tufevoqx.ffs@tglx Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 7ab3bef8f5d1..61a25bfad8a1 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -739,7 +739,7 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, free_msi_irqs(dev); out_disable: - pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); + pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE, 0); return ret; } -- Gitee From 79a1f2061382c5bf7edfbd90d041c4d9af77597b Mon Sep 17 00:00:00 2001 From: Megha Dey Date: Mon, 18 Oct 2021 12:50:04 -0700 Subject: [PATCH 1753/2161] PCI/MSI: Separate one time setup functionalities commit 72ebcfcf7a45545ebe71d913857fc7817771e5b0 Intel-BKC. This is a preparatory patch to enable dynamic allocation of MSI-X interrupts. There are some functions which needs to be performed only once even after enabling dynamic allocation of MSI-X vectors. These include setting up the 1. device MSI-X table 2. sysfs folder for the MSI/MSI-X interrupts Use the dev->msix_enabled flag to ensure that these structures are setup only once. Also, msix_capability_init() sets up the 1. device MSI-X table (called once per device) 2. MSI-X vectors requested (can be called multiple times once dynamic MSI-X is enabled) Separate out these 2 functionalities. No functional change. 
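Condensed from the hunks below, the resulting call flow is (illustrative, error handling trimmed):

    /* __pci_enable_msix(), after the split */
    if (!dev->msix_enabled) {
            if (msix_capability_init(dev))          /* once: map table, mask all entries */
                    return -ENOMEM;
    }
    return msix_setup_irqs(dev, entries, nvec, affd);   /* every request */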
Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 72 ++++++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 61a25bfad8a1..5601dbb1c59e 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -652,20 +652,12 @@ static void msix_mask_all(void __iomem *base, int tsize) /** * msix_capability_init - configure device's MSI-X capability * @dev: pointer to the pci_dev data structure of MSI-X device function - * @entries: pointer to an array of struct msix_entry entries - * @nvec: number of @entries - * @affd: Optional pointer to enable automatic affinity assignment - * - * Setup the MSI-X capability structure of device function with a - * single MSI-X IRQ. A return of zero indicates the successful setup of - * requested MSI-X entries with allocated IRQs or non-zero for otherwise. + * Setup the MSI-X capability structure of device function **/ -static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, - int nvec, struct irq_affinity *affd) +static int msix_capability_init(struct pci_dev *dev) { - const struct attribute_group **groups; - int ret, tsize; u16 control; + int tsize; /* * Some devices require MSI-X to be enabled before the MSI-X @@ -680,12 +672,29 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, tsize = msix_table_size(control); dev->msix_table_base = msix_map_region(dev, tsize); if (!dev->msix_table_base) { - ret = -ENOMEM; - goto out_disable; + pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); + return -ENOMEM; } /* Ensure that all table entries are masked. */ msix_mask_all(dev->msix_table_base, tsize); + return 0; +} + +/** + * msix_setup_irqs - setup requested number of MSI-X entries + * @dev: pointer to the pci_dev data structure of MSI-X device function + * @entries: pointer to an array of struct msix_entry entries + * @nvec: number of @entries + * @affd: Optional pointer to enable automatic affinity assignment + * + * A return of zero indicates the successful setup of the requested IRQs + * or non-zero for otherwise. 
+ **/ +static int msix_setup_irqs(struct pci_dev *dev, struct msix_entry *entries, + int nvec, struct irq_affinity *affd) +{ + int ret = 0; ret = msix_setup_entries(dev, entries, nvec, affd); if (ret) @@ -702,20 +711,25 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, msix_update_entries(dev, entries); - groups = msi_populate_sysfs(&dev->dev); - if (IS_ERR(groups)) { - ret = PTR_ERR(groups); - goto out_free; - } + if (!dev->msix_enabled) { + const struct attribute_group **groups; - dev->msi_irq_groups = groups; + groups = msi_populate_sysfs(&dev->dev); + if (IS_ERR(groups)) { + ret = PTR_ERR(groups); + goto out_free; + } - /* Set MSI-X enabled bits and unmask the function */ - pci_intx_for_msi(dev, 0); - dev->msix_enabled = 1; - pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0); + dev->msi_irq_groups = groups; + + /* Set MSI-X enabled bits and unmask the function */ + pci_intx_for_msi(dev, 0); + dev->msix_enabled = 1; + pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0); + + pcibios_free_irq(dev); + } - pcibios_free_irq(dev); return 0; out_avail: @@ -739,7 +753,8 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, free_msi_irqs(dev); out_disable: - pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE, 0); + if (!dev->msix_enabled) + pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE, 0); return ret; } @@ -898,7 +913,12 @@ static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, pci_info(dev, "can't enable MSI-X (MSI IRQ already assigned)\n"); return -EINVAL; } - return msix_capability_init(dev, entries, nvec, affd); + + if (!dev->msix_enabled) + if (msix_capability_init(dev)) + return -ENOMEM; + + return msix_setup_irqs(dev, entries, nvec, affd); } static void pci_msix_shutdown(struct pci_dev *dev) -- Gitee From a3714af8f6e0e63033eb628a05086e5b60d78bb9 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 24 Jun 2022 11:59:24 +0800 Subject: [PATCH 1754/2161] PCI/MSI: Add mutex to serialize MSI-X interrupt allocation commit 4356da3a425e0a227f4d29e33cd5da0da284155c Intel-BKC. This is a preparatory patch to enable dynamic allocation of MSI-X interrupts. After enabling dynamic allocation of MSI-X interrupts, prevent concurrent MSI-X interrupt allocations to the same device by using a mutex. Until now, the device msi_list was immutable but once dynamic allocation of MSI-X is enabled, entries can be added or removed from this list, and thus needs to be protected. No functional change. 
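The shape of the serialization, condensed from the hunks below (illustrative only):

    mutex_lock(&dev->msix_mutex);
    rc = __pci_enable_msix(dev, entries, nvec, affd, flags);   /* may append to msi_list */
    mutex_unlock(&dev->msix_mutex);

    mutex_lock(&dev->msix_mutex);
    pci_msix_shutdown(dev);
    free_msi_irqs(dev);                                        /* removes msi_list entries */
    mutex_unlock(&dev->msix_mutex);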
Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 4 ++++ drivers/pci/probe.c | 3 +++ include/linux/pci.h | 1 + 3 files changed, 8 insertions(+) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 5601dbb1c59e..9108eb74bd21 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -950,8 +950,10 @@ void pci_disable_msix(struct pci_dev *dev) if (!pci_msi_enable || !dev || !dev->msix_enabled) return; + mutex_lock(&dev->msix_mutex); pci_msix_shutdown(dev); free_msi_irqs(dev); + mutex_unlock(&dev->msix_mutex); } EXPORT_SYMBOL(pci_disable_msix); @@ -1052,7 +1054,9 @@ static int __pci_enable_msix_range(struct pci_dev *dev, return -ENOSPC; } + mutex_lock(&dev->msix_mutex); rc = __pci_enable_msix(dev, entries, nvec, affd, flags); + mutex_unlock(&dev->msix_mutex); if (rc == 0) return nvec; diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 8d33e4556486..c8d03c3f2c31 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2191,6 +2191,9 @@ struct pci_dev *pci_alloc_dev(struct pci_bus *bus) return NULL; INIT_LIST_HEAD(&dev->bus_list); +#ifdef CONFIG_PCI_MSI + mutex_init(&dev->msix_mutex); +#endif dev->dev.type = &pci_dev_type; dev->bus = pci_bus_get(bus); diff --git a/include/linux/pci.h b/include/linux/pci.h index 4743fb19161e..1455099470d5 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -450,6 +450,7 @@ struct pci_dev { #ifdef CONFIG_PCI_MSI const struct attribute_group **msi_irq_groups; void __iomem *msix_table_base; /* Base address of device MSI-X table */ + struct mutex msix_mutex; /* Serialize MSI-X interrupt allocation */ #endif struct pci_vpd *vpd; #ifdef CONFIG_PCIE_DPC -- Gitee From 4243ac8fdc201373ea03cb5f1274bf666cf3ef23 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 14 Oct 2019 16:17:05 -0500 Subject: [PATCH 1755/2161] PCI/MSI: Move power state check out of pci_msi_supported() commit 901c4ddbe277a766fd081ffbbd526d7b4bdbacdf upstream. 27e20603c54b ("PCI/MSI: Move D0 check into pci_msi_check_device()") moved the power state check into pci_msi_check_device(), which was subsequently renamed to pci_msi_supported(). This didn't change the behavior, since both callers checked the power state. However, it doesn't fit the current "pci_msi_supported()" name, which should return what the device is capable of, independent of the power state. Move the power state check back into the callers for readability. No functional change intended. Signed-off-by: Bjorn Helgaas Reviewed-by: Rafael J. 
Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 9108eb74bd21..53ae13d4f7c0 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -776,7 +776,7 @@ static int pci_msi_supported(struct pci_dev *dev, int nvec) if (!pci_msi_enable) return 0; - if (!dev || dev->no_msi || dev->current_state != PCI_D0) + if (!dev || dev->no_msi) return 0; /* @@ -887,7 +887,7 @@ static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nr_entries; int i, j; - if (!pci_msi_supported(dev, nvec)) + if (!pci_msi_supported(dev, nvec) || dev->current_state != PCI_D0) return -EINVAL; nr_entries = pci_msix_vec_count(dev); @@ -980,7 +980,7 @@ static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, int nvec; int rc; - if (!pci_msi_supported(dev, minvec)) + if (!pci_msi_supported(dev, minvec) || dev->current_state != PCI_D0) return -EINVAL; /* Check whether driver already requested MSI-X IRQs */ -- Gitee From f107a4eb7da4e0e759cc7c2e750f6c94384b6d2e Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 24 Jun 2022 12:10:07 +0800 Subject: [PATCH 1756/2161] PCI/MSI: Add bitmap to track allocated MSI-X vectors commit c80b4e4fb4e72a3b409c36d6df9f1af63a773cdc Intel-BKC. This is a preparatory patch to enable dynamic allocation of MSI-X interrupts. Add a bitmap, msix_map, to struct pci_dev to track the currently allocated MSI-X vectors of a device. Testing the validity of the 'entries' array can also be done using this bitmap. No functional change. Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 58 ++++++++++++++++++++++++++------------------- include/linux/pci.h | 1 + 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 53ae13d4f7c0..1d4bdd1c196d 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -581,7 +581,7 @@ static int msix_setup_entries(struct pci_dev *dev, struct irq_affinity_desc *curmsk, *masks = NULL; struct msi_desc *entry; void __iomem *addr; - int ret, i; + int ret, i, idx; int vec_count = pci_msix_vec_count(dev); if (affd) @@ -590,10 +590,6 @@ static int msix_setup_entries(struct pci_dev *dev, for (i = 0, curmsk = masks; i < nvec; i++) { entry = alloc_msi_entry(&dev->dev, 1, curmsk); if (!entry) { - if (!i) - iounmap(dev->msix_table_base); - else - free_msi_irqs(dev); /* No enough memory. 
Don't try again */ ret = -ENOMEM; goto out; @@ -602,10 +598,24 @@ static int msix_setup_entries(struct pci_dev *dev, entry->msi_attrib.is_msix = 1; entry->msi_attrib.is_64 = 1; - if (entries) + if (entries) { + /* Check for invalid/duplicate entries */ + if (test_bit(entries[i].entry, dev->msix_map) || + entries[i].entry >= vec_count) { + ret = -EINVAL; + goto out; + } + __set_bit(entries[i].entry, dev->msix_map); entry->msi_attrib.entry_nr = entries[i].entry; - else - entry->msi_attrib.entry_nr = i; + } else { + idx = find_first_zero_bit(dev->msix_map, vec_count); + if (idx >= vec_count) { + ret = -ENOSPC; + goto out; + } + __set_bit(idx, dev->msix_map); + entry->msi_attrib.entry_nr = idx; + } entry->msi_attrib.is_virtual = entry->msi_attrib.entry_nr >= vec_count; @@ -622,8 +632,13 @@ static int msix_setup_entries(struct pci_dev *dev, if (masks) curmsk++; } - ret = 0; + return 0; out: + if (!i) + iounmap(dev->msix_table_base); + else + free_msi_irqs(dev); + kfree(masks); return ret; } @@ -753,9 +768,10 @@ static int msix_setup_irqs(struct pci_dev *dev, struct msix_entry *entries, free_msi_irqs(dev); out_disable: - if (!dev->msix_enabled) + if (!dev->msix_enabled) { pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE, 0); - + kfree(dev->msix_map); + } return ret; } @@ -885,7 +901,6 @@ static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec, struct irq_affinity *affd, int flags) { int nr_entries; - int i, j; if (!pci_msi_supported(dev, nvec) || dev->current_state != PCI_D0) return -EINVAL; @@ -896,27 +911,19 @@ static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, if (nvec > nr_entries && !(flags & PCI_IRQ_VIRTUAL)) return nr_entries; - if (entries) { - /* Check for any invalid entries */ - for (i = 0; i < nvec; i++) { - if (entries[i].entry >= nr_entries) - return -EINVAL; /* invalid entry */ - for (j = i + 1; j < nvec; j++) { - if (entries[i].entry == entries[j].entry) - return -EINVAL; /* duplicate entry */ - } - } - } - /* Check whether driver already requested for MSI IRQ */ if (dev->msi_enabled) { pci_info(dev, "can't enable MSI-X (MSI IRQ already assigned)\n"); return -EINVAL; } - if (!dev->msix_enabled) + if (!dev->msix_enabled) { + dev->msix_map = kcalloc(BITS_TO_LONGS(nr_entries), sizeof(long), GFP_KERNEL); + if (!dev->msix_map) + return -ENOMEM; if (msix_capability_init(dev)) return -ENOMEM; + } return msix_setup_irqs(dev, entries, nvec, affd); } @@ -941,6 +948,7 @@ static void pci_msix_shutdown(struct pci_dev *dev) pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); pci_intx_for_msi(dev, 1); + kfree(dev->msix_map); dev->msix_enabled = 0; pcibios_alloc_irq(dev); } diff --git a/include/linux/pci.h b/include/linux/pci.h index 1455099470d5..839a746f26f5 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -451,6 +451,7 @@ struct pci_dev { const struct attribute_group **msi_irq_groups; void __iomem *msix_table_base; /* Base address of device MSI-X table */ struct mutex msix_mutex; /* Serialize MSI-X interrupt allocation */ + unsigned long *msix_map; /* Bitmap to track allocated MSI-X vectors */ #endif struct pci_vpd *vpd; #ifdef CONFIG_PCIE_DPC -- Gitee From 64cd6f3b3c4ecb864ca0912118e69fa3a9ba5cfc Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 27 Jun 2022 16:07:39 +0800 Subject: [PATCH 1757/2161] PCI: Remove useless comments and tidy others commit 9d8b738bb9f8008848b62f58671de3537a6d0734 upstream. Remove useless comments and tidy others for better readability. 
Whitespace changes only. Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/probe.c | 45 +++++++++++---------------------------------- 1 file changed, 11 insertions(+), 34 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index c8d03c3f2c31..d1510644ddb9 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2353,8 +2353,7 @@ void pcie_report_downtraining(struct pci_dev *dev) static void pci_init_capabilities(struct pci_dev *dev) { - /* Enhanced Allocation */ - pci_ea_init(dev); + pci_ea_init(dev); /* Enhanced Allocation */ /* Setup MSI caps & disable MSI/MSI-X interrupts */ pci_msi_setup_pci_dev(dev); @@ -2362,35 +2361,16 @@ static void pci_init_capabilities(struct pci_dev *dev) /* Buffers for saving PCIe and PCI-X capabilities */ pci_allocate_cap_save_buffers(dev); - /* Power Management */ - pci_pm_init(dev); - - /* Vital Product Data */ - pci_vpd_init(dev); - - /* Alternative Routing-ID Forwarding */ - pci_configure_ari(dev); - - /* Single Root I/O Virtualization */ - pci_iov_init(dev); - - /* Address Translation Services */ - pci_ats_init(dev); - - /* Page Request Interface */ - pci_pri_init(dev); - - /* Process Address Space ID */ - pci_pasid_init(dev); - - /* Enable ACS P2P upstream forwarding */ - pci_enable_acs(dev); - - /* Precision Time Measurement */ - pci_ptm_init(dev); - - /* Advanced Error Reporting */ - pci_aer_init(dev); + pci_pm_init(dev); /* Power Management */ + pci_vpd_init(dev); /* Vital Product Data */ + pci_configure_ari(dev); /* Alternative Routing-ID Forwarding */ + pci_iov_init(dev); /* Single Root I/O Virtualization */ + pci_ats_init(dev); /* Address Translation Services */ + pci_pri_init(dev); /* Page Request Interface */ + pci_pasid_init(dev); /* Process Address Space ID */ + pci_enable_acs(dev); /* Access Control Services */ + pci_ptm_init(dev); /* Precision Time Measurement */ + pci_aer_init(dev); /* Advanced Error Reporting */ pci_dpc_init(dev); /* Downstream Port Containment */ pcie_report_downtraining(dev); @@ -2463,13 +2443,10 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) /* Fix up broken headers */ pci_fixup_device(pci_fixup_header, dev); - /* Moved out from quirk header fixup code */ pci_reassigndev_resource_alignment(dev); - /* Clear the state_saved flag */ dev->state_saved = false; - /* Initialize various capabilities */ pci_init_capabilities(dev); /* -- Gitee From 9f502c54d1546549782d959af913cbaa18cf2d39 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 27 Jun 2022 16:15:36 +0800 Subject: [PATCH 1758/2161] PCI/MSI: Move MSI/MSI-X init to msi.c commit cbc40d5c33af289548d2481e68a38512102cdd3e upstream. Move pci_msi_setup_pci_dev(), which disables MSI and MSI-X interrupts, from probe.c to msi.c so it's with all the other MSI code and more consistent with other capability initialization. This means we must compile msi.c always, even without CONFIG_PCI_MSI, so wrap the rest of msi.c in an #ifdef and adjust the Makefile accordingly. No functional change intended. 
Link: https://lore.kernel.org/r/20201203185110.1583077-2-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Reviewed-by: Thierry Reding Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/Makefile | 3 +-- drivers/pci/msi.c | 36 ++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 2 ++ drivers/pci/probe.c | 21 ++------------------- 4 files changed, 41 insertions(+), 21 deletions(-) diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 28cdd8c0213a..b14e23004690 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_PCI) += access.o bus.o probe.o host-bridge.o \ remove.o pci.o pci-driver.o search.o \ pci-sysfs.o rom.o setup-res.o irq.o vpd.o \ - setup-bus.o vc.o mmap.o setup-irq.o + setup-bus.o vc.o mmap.o setup-irq.o msi.o ifdef CONFIG_PCI obj-$(CONFIG_PROC_FS) += proc.o @@ -17,7 +17,6 @@ obj-$(CONFIG_OF) += of.o obj-$(CONFIG_PCI_QUIRKS) += quirks.o obj-$(CONFIG_PCIEPORTBUS) += pcie/ obj-$(CONFIG_HOTPLUG_PCI) += hotplug/ -obj-$(CONFIG_PCI_MSI) += msi.o obj-$(CONFIG_PCI_ATS) += ats.o obj-$(CONFIG_PCI_IOV) += iov.o obj-$(CONFIG_PCI_BRIDGE_EMUL) += pci-bridge-emul.o diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 1d4bdd1c196d..3b53231c2d6a 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -26,6 +26,8 @@ #include "pci.h" +#ifdef CONFIG_PCI_MSI + static int pci_msi_enable = 1; int pci_msi_ignore_mask; @@ -1522,3 +1524,37 @@ bool pci_dev_has_special_msi_domain(struct pci_dev *pdev) } #endif /* CONFIG_PCI_MSI_IRQ_DOMAIN */ +#endif /* CONFIG_PCI_MSI */ + +void pci_msi_init(struct pci_dev *dev) +{ + u16 ctrl; + + /* + * Disable the MSI hardware to avoid screaming interrupts + * during boot. This is the power on reset default so + * usually this should be a noop. + */ + dev->msi_cap = pci_find_capability(dev, PCI_CAP_ID_MSI); + if (!dev->msi_cap) + return; + + pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &ctrl); + if (ctrl & PCI_MSI_FLAGS_ENABLE) + pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, + ctrl & ~PCI_MSI_FLAGS_ENABLE); +} + +void pci_msix_init(struct pci_dev *dev) +{ + u16 ctrl; + + dev->msix_cap = pci_find_capability(dev, PCI_CAP_ID_MSIX); + if (!dev->msix_cap) + return; + + pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &ctrl); + if (ctrl & PCI_MSIX_FLAGS_ENABLE) + pci_write_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, + ctrl & ~PCI_MSIX_FLAGS_ENABLE); +} diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 99ec1ebfc27b..16645c124e9b 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -105,6 +105,8 @@ void pci_config_pm_runtime_get(struct pci_dev *dev); void pci_config_pm_runtime_put(struct pci_dev *dev); void pci_pm_init(struct pci_dev *dev); void pci_ea_init(struct pci_dev *dev); +void pci_msi_init(struct pci_dev *dev); +void pci_msix_init(struct pci_dev *dev); void pci_allocate_cap_save_buffers(struct pci_dev *dev); void pci_free_cap_save_buffers(struct pci_dev *dev); bool pci_bridge_d3_possible(struct pci_dev *dev); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index d1510644ddb9..8a96b027ab86 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1658,22 +1658,6 @@ static u8 pci_hdr_type(struct pci_dev *dev) #define LEGACY_IO_RESOURCE (IORESOURCE_IO | IORESOURCE_PCI_FIXED) -static void pci_msi_setup_pci_dev(struct pci_dev *dev) -{ - /* - * Disable the MSI hardware to avoid screaming interrupts - * during boot. This is the power on reset default so - * usually this should be a noop. 
- */ - dev->msi_cap = pci_find_capability(dev, PCI_CAP_ID_MSI); - if (dev->msi_cap) - pci_msi_set_enable(dev, 0); - - dev->msix_cap = pci_find_capability(dev, PCI_CAP_ID_MSIX); - if (dev->msix_cap) - pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); -} - /** * pci_intx_mask_broken - Test PCI_COMMAND_INTX_DISABLE writability * @dev: PCI device @@ -2354,9 +2338,8 @@ void pcie_report_downtraining(struct pci_dev *dev) static void pci_init_capabilities(struct pci_dev *dev) { pci_ea_init(dev); /* Enhanced Allocation */ - - /* Setup MSI caps & disable MSI/MSI-X interrupts */ - pci_msi_setup_pci_dev(dev); + pci_msi_init(dev); /* Disable MSI */ + pci_msix_init(dev); /* Disable MSI-X */ /* Buffers for saving PCIe and PCI-X capabilities */ pci_allocate_cap_save_buffers(dev); -- Gitee From 896c0f853b5fe07d94c975388e82d64a3f20218c Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Thu, 3 Dec 2020 12:51:10 -0600 Subject: [PATCH 1759/2161] PCI/MSI: Set device flag indicating only 32-bit MSI support commit 2053230af11dc651ee3024682df12668496adad2 upstream. The MSI-X Capability requires devices to support 64-bit Message Addresses, but the MSI Capability can support either 32- or 64-bit addresses. Previously, we set dev->no_64bit_msi for a few broken devices that advertise 64-bit MSI support but don't correctly support it. In addition, check the MSI "64-bit Address Capable" bit for all devices and set dev->no_64bit_msi for devices that don't advertise 64-bit support. This allows msi_verify_entries() to catch arch code defects that assign 64-bit addresses when they're not supported. The warning is helpful to find defects like the one fixed by https://lore.kernel.org/r/20201117165312.25847-1-vidyas@nvidia.com [bhelgaas: set no_64bit_msi in pci_msi_init(), commit log] Link: https://lore.kernel.org/r/20201124105035.24573-1-vidyas@nvidia.com Link: https://lore.kernel.org/r/20201203185110.1583077-4-helgaas@kernel.org Signed-off-by: Vidya Sagar Signed-off-by: Bjorn Helgaas Reviewed-by: Thierry Reding Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 3b53231c2d6a..117898346b43 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -483,11 +483,11 @@ static int msi_verify_entries(struct pci_dev *dev) struct msi_desc *entry; for_each_pci_msi_entry(entry, dev) { - if (!dev->no_64bit_msi || !entry->msg.address_hi) - continue; - pci_err(dev, "Device has broken 64-bit MSI but arch" - " tried to assign one above 4G\n"); - return -EIO; + if (entry->msg.address_hi && dev->no_64bit_msi) { + pci_err(dev, "arch assigned 64-bit MSI address %#x%08x but device only supports 32 bits\n", + entry->msg.address_hi, entry->msg.address_lo); + return -EIO; + } } return 0; } @@ -1543,6 +1543,9 @@ void pci_msi_init(struct pci_dev *dev) if (ctrl & PCI_MSI_FLAGS_ENABLE) pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, ctrl & ~PCI_MSI_FLAGS_ENABLE); + + if (!(ctrl & PCI_MSI_FLAGS_64BIT)) + dev->no_64bit_msi = 1; } void pci_msix_init(struct pci_dev *dev) -- Gitee From 00c34fd9984f2d457b36dd8d876b714c7f6acf38 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:52 +0200 Subject: [PATCH 1760/2161] PCI/MSI: Simplify msi_verify_entries() commit a6e8b946508cda3c3bf0f9b0e133d293dc9754f6 upstream. No point in looping over all entries when 64bit addressing mode is enabled for nothing. 
Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20210729222542.992849326@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 117898346b43..99abe9ba8620 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -482,8 +482,11 @@ static int msi_verify_entries(struct pci_dev *dev) { struct msi_desc *entry; + if (!dev->no_64bit_msi) + return 0; + for_each_pci_msi_entry(entry, dev) { - if (entry->msg.address_hi && dev->no_64bit_msi) { + if (entry->msg.address_hi) { pci_err(dev, "arch assigned 64-bit MSI address %#x%08x but device only supports 32 bits\n", entry->msg.address_hi, entry->msg.address_lo); return -EIO; -- Gitee From 3e832288921bee360e505040aa80b983f155832c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:47 +0200 Subject: [PATCH 1761/2161] PCI/MSI: Protect msi_desc::masked for multi-MSI commit 77e89afc25f30abd56e76a809ee2884d7c1b63ce upstream. Multi-MSI uses a single MSI descriptor and there is a single mask register when the device supports per vector masking. To avoid reading back the mask register the value is cached in the MSI descriptor and updates are done by clearing and setting bits in the cache and writing it to the device. But nothing protects msi_desc::masked and the mask register from being modified concurrently on two different CPUs for two different Linux interrupts which belong to the same multi-MSI descriptor. Add a lock to struct device and protect any operation on the mask and the mask register with it. This makes the update of msi_desc::masked unconditional, but there is no place which requires a modification of the hardware register without updating the masked cache. msi_mask_irq() is now an empty wrapper which will be cleaned up in follow up changes. The problem goes way back to the initial support of multi-MSI, but picking the commit which introduced the mask cache is a valid cut off point (2.6.30). Fixes: f2440d9acbe8 ("PCI MSI: Refactor interrupt masking code") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.726833414@linutronix.de Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- drivers/base/core.c | 1 + drivers/pci/msi.c | 19 ++++++++++--------- include/linux/device.h | 1 + include/linux/msi.h | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 6fea8096edf1..69a7b7b22b49 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1722,6 +1722,7 @@ void device_initialize(struct device *dev) device_pm_init(dev); set_dev_node(dev, -1); #ifdef CONFIG_GENERIC_MSI_IRQ + raw_spin_lock_init(&dev->msi_lock); INIT_LIST_HEAD(&dev->msi_list); #endif INIT_LIST_HEAD(&dev->links.consumers); diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 99abe9ba8620..693911babf89 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -173,24 +173,25 @@ static inline __attribute_const__ u32 msi_mask(unsigned x) * reliably as devices without an INTx disable bit will then generate a * level IRQ which will never be cleared. 
*/ -u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) +void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) { - u32 mask_bits = desc->masked; + raw_spinlock_t *lock = &desc->dev->msi_lock; + unsigned long flags; if (pci_msi_ignore_mask || !desc->msi_attrib.maskbit) - return 0; + return; - mask_bits &= ~mask; - mask_bits |= flag; + raw_spin_lock_irqsave(lock, flags); + desc->masked &= ~mask; + desc->masked |= flag; pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->mask_pos, - mask_bits); - - return mask_bits; + desc->masked); + raw_spin_unlock_irqrestore(lock, flags); } static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) { - desc->masked = __pci_msi_desc_mask_irq(desc, mask, flag); + __pci_msi_desc_mask_irq(desc, mask, flag); } static void __iomem *pci_msix_desc_addr(struct msi_desc *desc) diff --git a/include/linux/device.h b/include/linux/device.h index 6b8ff7cf36df..c08db887cb72 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1267,6 +1267,7 @@ struct device { struct dev_pin_info *pins; #endif #ifdef CONFIG_GENERIC_MSI_IRQ + raw_spinlock_t msi_lock; struct list_head msi_list; #endif diff --git a/include/linux/msi.h b/include/linux/msi.h index f829a03a1849..0ec0bf0b305d 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -246,7 +246,7 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag); -u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag); +void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag); void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data); -- Gitee From 6f67c7514a4edeec7753c2fa801b8e04f8bf52c3 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 27 Jun 2022 16:44:03 +0800 Subject: [PATCH 1762/2161] genirq/msi: Iterate msi_list starting from a given desc commit 235988b35dba007134f4ec4d973408b8a5c0c8f5 Intel-BKC. This is a preparatory patch to enable dynamic allocation of MSI-X interrupts. With the addition of dynamic msix, the msi_list of the device is no longer immutable. To set up new vectors, we need to iterate only through the newly added entries of this list. To help with this: 1. A msi_last_list pointer is added to struct device which points to the last msi_desc's list before a new allocation. 2. New macros are introduced which iterate the msi_list from the 1st newly added msi_desc of every allocation using (1). No functional change. 
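Illustrative use of the new iterator (mirrors the hunks below; the debug print is made up):

    struct msi_desc *desc;

    /*
     * Visits only the descriptors appended by the current allocation,
     * i.e. everything after the saved dev->msi_last_list position.
     */
    for_each_new_pci_msi_entry(desc, pdev)
            pr_debug("new vector %u maps to Linux irq %d\n",
                     (unsigned int)desc->msi_attrib.entry_nr, desc->irq);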
Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/base/core.c | 1 + drivers/pci/msi.c | 14 ++++++++++---- include/linux/device.h | 2 ++ include/linux/msi.h | 12 ++++++++++++ kernel/irq/msi.c | 6 +++--- 5 files changed, 28 insertions(+), 7 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 69a7b7b22b49..ecc91f18968a 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1724,6 +1724,7 @@ void device_initialize(struct device *dev) #ifdef CONFIG_GENERIC_MSI_IRQ raw_spin_lock_init(&dev->msi_lock); INIT_LIST_HEAD(&dev->msi_list); + dev->msi_last_list = &dev->msi_list; #endif INIT_LIST_HEAD(&dev->links.consumers); INIT_LIST_HEAD(&dev->links.suppliers); diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 693911babf89..9b2a2d8b18e9 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -104,7 +104,7 @@ int __weak arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; - for_each_pci_msi_entry(entry, dev) { + for_each_new_pci_msi_entry(entry, dev) { ret = arch_setup_msi_irq(dev, entry); if (ret < 0) return ret; @@ -486,7 +486,7 @@ static int msi_verify_entries(struct pci_dev *dev) if (!dev->no_64bit_msi) return 0; - for_each_pci_msi_entry(entry, dev) { + for_each_new_pci_msi_entry(entry, dev) { if (entry->msg.address_hi) { pci_err(dev, "arch assigned 64-bit MSI address %#x%08x but device only supports 32 bits\n", entry->msg.address_hi, entry->msg.address_lo); @@ -593,6 +593,12 @@ static int msix_setup_entries(struct pci_dev *dev, if (affd) masks = irq_create_affinity_masks(nvec, affd); + /* Store pointer to the last msi_desc entry's list before adding new + * entries to the device msi_list or to the device msi_list itself + * if msi_list is empty. + */ + dev->dev.msi_last_list = dev->dev.msi_list.prev; + for (i = 0, curmsk = masks; i < nvec; i++) { entry = alloc_msi_entry(&dev->dev, 1, curmsk); if (!entry) { @@ -653,7 +659,7 @@ static void msix_update_entries(struct pci_dev *dev, struct msix_entry *entries) { struct msi_desc *entry; - for_each_pci_msi_entry(entry, dev) { + for_each_new_pci_msi_entry(entry, dev) { if (entries) { entries->vector = entry->irq; entries++; @@ -762,7 +768,7 @@ static int msix_setup_irqs(struct pci_dev *dev, struct msix_entry *entries, struct msi_desc *entry; int avail = 0; - for_each_pci_msi_entry(entry, dev) { + for_each_new_pci_msi_entry(entry, dev) { if (entry->irq != 0) avail++; } diff --git a/include/linux/device.h b/include/linux/device.h index c08db887cb72..778190ee2bb2 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1182,6 +1182,7 @@ struct dev_links_info { * @pins: For device pin management. * See Documentation/driver-api/pinctl.rst for details. * @msi_list: Hosts MSI descriptors + * @msi_last_list: Pointer to list of last msi_desc entry * @msi_domain: The generic MSI domain this device is using. * @numa_node: NUMA node this device is close to. * @dma_ops: DMA mapping operations for this device. 
@@ -1267,6 +1268,7 @@ struct device { struct dev_pin_info *pins; #endif #ifdef CONFIG_GENERIC_MSI_IRQ + struct list_head *msi_last_list; raw_spinlock_t msi_lock; struct list_head msi_list; #endif diff --git a/include/linux/msi.h b/include/linux/msi.h index 0ec0bf0b305d..ae463a66ace9 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -197,6 +197,16 @@ struct msi_desc { for (__irq = (desc)->irq; \ __irq < ((desc)->irq + (desc)->nvec_used); \ __irq++) +/* Iterate through all the msi_descs starting from a given desc */ +#define for_each_new_msi_entry(desc, dev) \ + (desc) = list_entry((dev)->msi_last_list->next, struct msi_desc, list); \ + list_for_each_entry_from((desc), dev_to_msi_list((dev)), list) +#define for_each_new_msi_vector(desc, __irq, dev) \ + for_each_new_msi_entry((desc), (dev)) \ + if ((desc)->irq) \ + for ((__irq) = (desc)->irq; \ + (__irq) < ((desc)->irq + (desc)->nvec_used); \ + (__irq)++) #ifdef CONFIG_IRQ_MSI_IOMMU static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) @@ -225,6 +235,8 @@ static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, #define first_pci_msi_entry(pdev) first_msi_entry(&(pdev)->dev) #define for_each_pci_msi_entry(desc, pdev) \ for_each_msi_entry((desc), &(pdev)->dev) +#define for_each_new_pci_msi_entry(desc, pdev) \ + for_each_new_msi_entry((desc), &(pdev)->dev) struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc); void *msi_desc_to_pci_sysdata(struct msi_desc *desc); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 8215af7dc129..9ac72daeb889 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -574,7 +574,7 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, return ret; } - for_each_msi_entry(desc, dev) { + for_each_new_msi_entry(desc, dev) { ops->set_desc(&arg, desc); virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, @@ -608,7 +608,7 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY)) goto skip_activate; - for_each_msi_vector(desc, i, dev) { + for_each_new_msi_vector(desc, i, dev) { if (desc->irq == i) { virq = desc->irq; dev_dbg(dev, "irq [%d-%d] for MSI\n", @@ -632,7 +632,7 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, * so request_irq() will assign the final vector. */ if (can_reserve) { - for_each_msi_vector(desc, i, dev) { + for_each_new_msi_vector(desc, i, dev) { irq_data = irq_domain_get_irq_data(domain, i); irqd_clr_activated(irq_data); } -- Gitee From 88e5d3dfbfcf1c30afe03f868d0b2f7e6b12f088 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 27 Jun 2022 16:49:17 +0800 Subject: [PATCH 1763/2161] driver core: Add missing kernel doc for device::msi_lock commit 7a3dc4f35bf8e1a07e5c3f8ecc8ac923f48493fe upstream. Fixes: 77e89afc25f3 ("PCI/MSI: Protect msi_desc::masked for multi-MSI") Reported-by: Stephen Rothwell Signed-off-by: Thomas Gleixner Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/device.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/device.h b/include/linux/device.h index 778190ee2bb2..756cee58704c 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1181,6 +1181,7 @@ struct dev_links_info { * along with subsystem-level and driver-level callbacks. * @pins: For device pin management. * See Documentation/driver-api/pinctl.rst for details. 
+ * @msi_lock: Lock to protect MSI mask cache and mask register * @msi_list: Hosts MSI descriptors * @msi_last_list: Pointer to list of last msi_desc entry * @msi_domain: The generic MSI domain this device is using. -- Gitee From 9875199a1a4a0c6e093cc6794fb7f57306cec006 Mon Sep 17 00:00:00 2001 From: Megha Dey Date: Mon, 18 Oct 2021 13:01:47 -0700 Subject: [PATCH 1764/2161] PCI/MSI: Enable dynamic allocation of MSI-X vectors commit ef818654dbb197aa1e2cd3baaeb0ac1b98b47e26 Intel-BKC. Introduce a new API pci_add_msix_irq_vector(), which can be called multiple times by a driver to add a new MSI-X vector to the device after some number of interrupts have already been allocated using the existing pci_alloc_irq_vectors API. If successful, the API returns the device-relative interrupt vector index (0-based) which can be passed to pci_irq_vector() to retrieve the Linux IRQ number of that device vector. It should be called only after the pci_alloc_irq_vectors(). Add a new member msix_alloc_count to keep track of the number of the MSI-X vectors currently allocated to the device. Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/PCI/msi-howto.rst | 13 +++++ drivers/pci/msi.c | 85 +++++++++++++++++++++++++++++++-- include/linux/pci.h | 5 ++ kernel/irq/msi.c | 19 ++++++-- 4 files changed, 116 insertions(+), 6 deletions(-) diff --git a/Documentation/PCI/msi-howto.rst b/Documentation/PCI/msi-howto.rst index 994cbb660ade..7ba2e9925556 100644 --- a/Documentation/PCI/msi-howto.rst +++ b/Documentation/PCI/msi-howto.rst @@ -164,6 +164,19 @@ the driver can specify that only MSI or MSI-X is acceptable:: if (nvec < 0) goto out_err; +To request additional MSI-X vectors after the probe phase, the +pci_add_msix_irq_vector() API can be used:: + + irq = pci_add_msix_irq_vectors(struct pci_dev *dev); + if (irq < 0) + goto out_err; + +Each time this API is called, one MSI-X vector gets added to the device. This +API should be called after pci_alloc_irq_vectors has been called by the driver. + +This API returns the device-relative interrupt vector index (0-based) which can +be passed to pci_irq_vector() to retrieve the corresponding Linux IRQ number. 
+ Legacy APIs ----------- diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 9b2a2d8b18e9..86e126014ef5 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -646,7 +646,7 @@ static int msix_setup_entries(struct pci_dev *dev, } return 0; out: - if (!i) + if (!i && !dev->msix_enabled) iounmap(dev->msix_table_base); else free_msi_irqs(dev); @@ -708,6 +708,44 @@ static int msix_capability_init(struct pci_dev *dev) return 0; } +static ssize_t msix_mode_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%s\n", "msix"); +} + +static int msix_add_sysfs(struct pci_dev *pdev) +{ + struct device_attribute *msi_dev_attr; + struct attribute *msi_attr; + struct msi_desc *entry; + + for_each_new_pci_msi_entry(entry, pdev) { + msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL); + if (!msi_dev_attr) + goto error_attrs; + + pdev->msi_irq_groups[0]->attrs[entry->msi_attrib.entry_nr] = &msi_dev_attr->attr; + sysfs_attr_init(&msi_dev_attr->attr); + msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d", entry->irq); + if (!msi_dev_attr->attr.name) + goto error_attrs; + msi_dev_attr->attr.mode = 0444; + msi_dev_attr->show = msix_mode_show; + } + + return 0; + +error_attrs: + for_each_new_pci_msi_entry(entry, pdev) { + msi_attr = pdev->msi_irq_groups[0]->attrs[entry->msi_attrib.entry_nr]; + msi_dev_attr = container_of(msi_attr, struct device_attribute, attr); + kfree(msi_attr->name); + kfree(msi_dev_attr); + } + return -ENOMEM; +} + /** * msix_setup_irqs - setup requested number of MSI-X entries * @dev: pointer to the pci_dev data structure of MSI-X device function @@ -755,8 +793,14 @@ static int msix_setup_irqs(struct pci_dev *dev, struct msix_entry *entries, pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0); pcibios_free_irq(dev); + } else { + ret = msix_add_sysfs(dev); + if (ret) + goto out_free; } + dev->msix_alloc_count += nvec; + return 0; out_avail: @@ -920,8 +964,10 @@ static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, nr_entries = pci_msix_vec_count(dev); if (nr_entries < 0) return nr_entries; - if (nvec > nr_entries && !(flags & PCI_IRQ_VIRTUAL)) - return nr_entries; + if (nr_entries - dev->msix_alloc_count == 0) + return -ENOSPC; + if ((nvec > nr_entries - dev->msix_alloc_count) && !(flags & PCI_IRQ_VIRTUAL)) + return nr_entries - dev->msix_alloc_count; /* Check whether driver already requested for MSI IRQ */ if (dev->msi_enabled) { @@ -961,6 +1007,7 @@ static void pci_msix_shutdown(struct pci_dev *dev) pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); pci_intx_for_msi(dev, 1); kfree(dev->msix_map); + dev->msix_alloc_count = 0; dev->msix_enabled = 0; pcibios_alloc_irq(dev); } @@ -1111,6 +1158,38 @@ int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, } EXPORT_SYMBOL(pci_enable_msix_range); +/** + * pci_add_msix_irq_vector - Add an additional MSI-X interrupt to a device + * @dev: pointer to the pci_dev data structure of MSI-X device function + * + * Add 1 MSI-X vector to the device function, after some vectors have already + * been allocated using pci_alloc_irq_vectors(). It returns a negative errno + * if an error occurs. If it succeeds, it returns the device-relative interrupt + * vector index (0-based) which can be passed to pci_irq_vector to retrieve the + * Linux IRQ number of that device vector. 
+ **/ +int pci_add_msix_irq_vector(struct pci_dev *dev) +{ + struct msi_desc *entry; + int ret; + + if (WARN_ON_ONCE(!dev->msix_enabled)) + return -EINVAL; + + mutex_lock(&dev->msix_mutex); + ret = __pci_enable_msix(dev, NULL, 1, NULL, 0); + + if (ret == 0) { + entry = list_last_entry(dev_to_msi_list(&dev->dev), struct msi_desc, list); + mutex_unlock(&dev->msix_mutex); + return entry->msi_attrib.entry_nr; + } + + mutex_unlock(&dev->msix_mutex); + return ret; +} +EXPORT_SYMBOL(pci_add_msix_irq_vector); + /** * pci_alloc_irq_vectors_affinity - allocate multiple IRQs for a device * @dev: PCI device to operate on diff --git a/include/linux/pci.h b/include/linux/pci.h index 839a746f26f5..bedd8c8e401a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -449,6 +449,7 @@ struct pci_dev { #endif #ifdef CONFIG_PCI_MSI const struct attribute_group **msi_irq_groups; + int msix_alloc_count; /* No. of MSI-X vectors allocated to device */ void __iomem *msix_table_base; /* Base address of device MSI-X table */ struct mutex msix_mutex; /* Serialize MSI-X interrupt allocation */ unsigned long *msix_map; /* Bitmap to track allocated MSI-X vectors */ @@ -1510,6 +1511,7 @@ static inline int pci_enable_msix_exact(struct pci_dev *dev, int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, unsigned int max_vecs, unsigned int flags, struct irq_affinity *affd); +int pci_add_msix_irq_vector(struct pci_dev *dev); void pci_free_irq_vectors(struct pci_dev *dev); int pci_irq_vector(struct pci_dev *dev, unsigned int nr); @@ -1542,6 +1544,9 @@ pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, return -ENOSPC; } +static inline int pci_add_msix_irq_vector(struct pci_dev *dev) +{ return -ENOSYS; } + static inline void pci_free_irq_vectors(struct pci_dev *dev) { } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 9ac72daeb889..53a4c2f601b8 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -112,9 +112,22 @@ const struct attribute_group **msi_populate_sysfs(struct device *dev) int count = 0; int i; - /* Determine how many msi entries we have */ - for_each_msi_entry(entry, dev) - num_msi += entry->nvec_used; + entry = first_msi_entry(dev); + if (entry->msi_attrib.is_msix) { + /* Since msi-x vectors can be allocated multiple times, + * allocate maximum no. of vectors supported by the device + */ +#ifdef CONFIG_PCI_MSI + num_msi = pci_msix_vec_count(to_pci_dev(dev)); +#endif + } + + if (!num_msi) { + /* Determine how many msi entries we have */ + for_each_msi_entry(entry, dev) + num_msi += entry->nvec_used; + } + if (!num_msi) return NULL; -- Gitee From 5c07e773a14ee167f7a67e485751d860870eaf7f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Nov 2021 14:53:57 +0100 Subject: [PATCH 1765/2161] PCI/MSI: Destroy sysfs before freeing entries commit 3735459037114d31e5acd9894fad9aed104231a0 upstream. free_msi_irqs() frees the MSI entries before destroying the sysfs entries which are exposing them. Nothing prevents a concurrent free while a sysfs file is read and accesses the possibly freed entry. Move the sysfs release ahead of freeing the entries. 
Fixes: 1c51b50c2995 ("PCI/MSI: Export MSI mode using attributes, not kobjects") Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87sfw5305m.ffs@tglx Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 86e126014ef5..a52e59ae56b9 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -366,6 +366,11 @@ static void free_msi_irqs(struct pci_dev *dev) for (i = 0; i < entry->nvec_used; i++) BUG_ON(irq_has_action(entry->irq + i)); + if (dev->msi_irq_groups) { + msi_destroy_sysfs(&dev->dev, dev->msi_irq_groups); + dev->msi_irq_groups = NULL; + } + pci_msi_teardown_msi_irqs(dev); list_for_each_entry_safe(entry, tmp, msi_list, list) { @@ -377,11 +382,6 @@ static void free_msi_irqs(struct pci_dev *dev) list_del(&entry->list); free_msi_entry(entry); } - - if (dev->msi_irq_groups) { - msi_destroy_sysfs(&dev->dev, dev->msi_irq_groups); - dev->msi_irq_groups = NULL; - } } static void pci_intx_for_msi(struct pci_dev *dev, int enable) -- Gitee From 0f64f29891e73176d3a5e196ad2cd3513fb071c4 Mon Sep 17 00:00:00 2001 From: Megha Dey Date: Wed, 8 Sep 2021 13:13:36 -0700 Subject: [PATCH 1766/2161] PCI/MSI: Free MSI-X interrupt with a given IRQ number commit f35723a81e0880e3923bc0b3d198bba8d814a3fd Intel-BKC. Currently, the pci_msix_disable() frees all the allocated resources associated with a PCIe device when the device is being shut down. With the introduction of dynamic allocation of MSI-X vectors, there may be cases where drivers want to free only a particular interrupt, even when the device is not being shut down. A new API, pci_free_msix_irq_vector() provides this type of interface. Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/PCI/msi-howto.rst | 7 ++++ drivers/pci/msi.c | 65 ++++++++++++++++++++++++++++++++- include/linux/msi.h | 1 + include/linux/pci.h | 5 +++ kernel/irq/msi.c | 50 ++++++++++++++++++++++--- 5 files changed, 120 insertions(+), 8 deletions(-) diff --git a/Documentation/PCI/msi-howto.rst b/Documentation/PCI/msi-howto.rst index 7ba2e9925556..43e45f46af37 100644 --- a/Documentation/PCI/msi-howto.rst +++ b/Documentation/PCI/msi-howto.rst @@ -177,6 +177,13 @@ API should be called after pci_alloc_irq_vectors has been called by the driver. This API returns the device-relative interrupt vector index (0-based) which can be passed to pci_irq_vector() to retrieve the corresponding Linux IRQ number. +To free the allocated resources associated with a particular MSI-X vector, the +pci_free_msix_irq_vector() API can be used:: + + void pci_free_msix_irq_vector(struct pci_dev *dev, unsigned int irq) + +Here, 'irq' refers to the Linux IRQ number. 
+ Legacy APIs ----------- diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index a52e59ae56b9..444cf780c1b1 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -366,13 +366,13 @@ static void free_msi_irqs(struct pci_dev *dev) for (i = 0; i < entry->nvec_used; i++) BUG_ON(irq_has_action(entry->irq + i)); + pci_msi_teardown_msi_irqs(dev); + if (dev->msi_irq_groups) { msi_destroy_sysfs(&dev->dev, dev->msi_irq_groups); dev->msi_irq_groups = NULL; } - pci_msi_teardown_msi_irqs(dev); - list_for_each_entry_safe(entry, tmp, msi_list, list) { if (entry->msi_attrib.is_msix) { if (list_is_last(&entry->list, msi_list)) @@ -384,6 +384,50 @@ static void free_msi_irqs(struct pci_dev *dev) } } +static void pci_msix_teardown_irq(struct pci_dev *dev, unsigned int irq) +{ + struct irq_domain *domain; + + domain = dev_get_msi_domain(&dev->dev); + if (domain && irq_domain_is_hierarchy(domain)) + msi_domain_free_irq(domain, &dev->dev, irq); +} + +static void free_msix_irq(struct pci_dev *dev, unsigned int irq) +{ + struct list_head *msi_list = dev_to_msi_list(&dev->dev); + struct device_attribute *dev_attr; + struct msi_desc *entry, *tmp; + struct attribute **msi_attrs; + int count; + + list_for_each_entry_safe(entry, tmp, msi_list, list) { + if (entry->irq == irq) { + if (entry->irq) { + BUG_ON(irq_has_action(entry->irq)); + pci_msix_teardown_irq(dev, irq); + } + if (dev->msi_irq_groups) { + msi_attrs = dev->msi_irq_groups[0]->attrs; + count = entry->msi_attrib.entry_nr; + if (msi_attrs[count]) { + dev_attr = container_of(msi_attrs[count], + struct device_attribute, attr); + sysfs_remove_file_from_group(&dev->dev.kobj, + &dev_attr->attr, + "msi_irqs"); + kfree(dev_attr->attr.name); + kfree(dev_attr); + msi_attrs[count] = NULL; + } + } + __clear_bit(entry->msi_attrib.entry_nr, dev->msix_map); + list_del(&entry->list); + free_msi_entry(entry); + } + } +} + static void pci_intx_for_msi(struct pci_dev *dev, int enable) { if (!(dev->dev_flags & PCI_DEV_FLAGS_MSI_INTX_DISABLE_BUG)) @@ -1024,6 +1068,23 @@ void pci_disable_msix(struct pci_dev *dev) } EXPORT_SYMBOL(pci_disable_msix); +/** + * pci_free_msix_irq_vector - free previously allocated IRQ for a device + * @dev: PCI device to operate on + * @irq: Linux IRQ number + **/ +void pci_free_msix_irq_vector(struct pci_dev *dev, unsigned int irq) +{ + if (!pci_msi_enable || !dev || !dev->msix_enabled) + return; + + mutex_lock(&dev->msix_mutex); + free_msix_irq(dev, irq); + dev->msix_alloc_count -= 1; + mutex_unlock(&dev->msix_mutex); +} +EXPORT_SYMBOL(pci_free_msix_irq_vector); + void pci_no_msi(void) { pci_msi_enable = 0; diff --git a/include/linux/msi.h b/include/linux/msi.h index ae463a66ace9..4c60a27b5372 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -459,6 +459,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec); void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev); void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev); +void msi_domain_free_irq(struct irq_domain *domain, struct device *dev, unsigned int irq); struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain); struct irq_domain *platform_msi_create_irq_domain(struct fwnode_handle *fwnode, diff --git a/include/linux/pci.h b/include/linux/pci.h index bedd8c8e401a..af7e9277acf2 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1514,6 +1514,7 @@ int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, int pci_add_msix_irq_vector(struct pci_dev *dev); void 
pci_free_irq_vectors(struct pci_dev *dev); +void pci_free_msix_irq_vector(struct pci_dev *dev, unsigned int irq); int pci_irq_vector(struct pci_dev *dev, unsigned int nr); const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev, int vec); int pci_irq_get_node(struct pci_dev *pdev, int vec); @@ -1551,6 +1552,10 @@ static inline void pci_free_irq_vectors(struct pci_dev *dev) { } +static inline void pci_free_msix_irq_vector(struct pci_dev *dev, unsigned int irq) +{ +} + static inline int pci_irq_vector(struct pci_dev *dev, unsigned int nr) { if (WARN_ON_ONCE(nr > 0)) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 53a4c2f601b8..d367f303b677 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -198,11 +198,27 @@ void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_ir { struct device_attribute *dev_attr; struct attribute **msi_attrs; + struct msi_desc *entry; int count = 0; - if (msi_irq_groups) { - sysfs_remove_groups(&dev->kobj, msi_irq_groups); - msi_attrs = msi_irq_groups[0]->attrs; + if (!msi_irq_groups) + return; + + sysfs_remove_groups(&dev->kobj, msi_irq_groups); + msi_attrs = msi_irq_groups[0]->attrs; + + entry = first_msi_entry(dev); + if (entry->msi_attrib.is_msix) { + for_each_msi_entry(entry, dev) { + if (msi_attrs[entry->msi_attrib.entry_nr]) { + dev_attr = container_of(msi_attrs[entry->msi_attrib.entry_nr], + struct device_attribute, attr); + kfree(dev_attr->attr.name); + kfree(dev_attr); + msi_attrs[entry->msi_attrib.entry_nr] = NULL; + } + } + } else { while (msi_attrs[count]) { dev_attr = container_of(msi_attrs[count], struct device_attribute, attr); @@ -210,10 +226,11 @@ void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_ir kfree(dev_attr); ++count; } - kfree(msi_attrs); - kfree(msi_irq_groups[0]); - kfree(msi_irq_groups); } + + kfree(msi_attrs); + kfree(msi_irq_groups[0]); + kfree(msi_irq_groups); } #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN @@ -719,6 +736,27 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) return ops->domain_free_irqs(domain, dev); } +/** + * msi_domain_free_irq - Free interrupt from a MSI interrupt @domain associated to @dev + * @domain: The domain to managing the interrupts + * @dev: Pointer to device struct for which the interrupt needs to be freed + * @irq: Interrupt to be freed + */ +void msi_domain_free_irq(struct irq_domain *domain, struct device *dev, unsigned int irq) +{ + struct msi_desc *desc = irq_get_msi_desc(irq); + struct irq_data *irq_data; + + if (irq) { + irq_data = irq_domain_get_irq_data(domain, irq); + if (irqd_is_activated(irq_data)) + irq_domain_deactivate_irq(irq_data); + + irq_domain_free_irqs(desc->irq, 1); + desc->irq = 0; + } +} + /** * msi_get_domain_info - Get the MSI interrupt domain info for @domain * @domain: The interrupt domain to retrieve data from -- Gitee From 8dad1bc2718def743bfa7581b1e49fb8df12b551 Mon Sep 17 00:00:00 2001 From: Megha Dey Date: Wed, 8 Sep 2021 13:42:55 -0700 Subject: [PATCH 1767/2161] PCI/MSI: Free newly allocated vectors if MSI-X allocation fails commit c3f95d74b0ab079cca591ffa58f4aed922108dc3 Intel-BKC. If an MSI-X vector allocation fails (first or subsequent allocation), all the vectors allocated thus far to the device are freed. Ensure that only the vectors allocated as part of the current allocation are freed, if that allocation fails. 
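Putting the series together, a hypothetical driver fragment using the dynamic API added earlier (the helper name, IRQ name string and handler are made up; assumes probe already called pci_alloc_irq_vectors() with PCI_IRQ_MSIX):

    #include <linux/pci.h>
    #include <linux/interrupt.h>

    static int add_one_vector(struct pci_dev *pdev, irq_handler_t handler, void *ctx)
    {
            int idx, irq, ret;

            idx = pci_add_msix_irq_vector(pdev);    /* device-relative index, or -errno */
            if (idx < 0)
                    return idx;

            irq = pci_irq_vector(pdev, idx);        /* Linux IRQ number for that index */
            ret = request_irq(irq, handler, 0, "dyn-msix-demo", ctx);
            if (ret)
                    pci_free_msix_irq_vector(pdev, irq);   /* takes the Linux IRQ */

            return ret ? ret : idx;
    }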
Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/msi.c | 37 +++++++++++++++++++++++++++++++------ include/linux/msi.h | 5 +++++ 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 444cf780c1b1..f33f7469d7ba 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -629,7 +629,7 @@ static int msix_setup_entries(struct pci_dev *dev, struct irq_affinity *affd) { struct irq_affinity_desc *curmsk, *masks = NULL; - struct msi_desc *entry; + struct msi_desc *entry, *tmp; void __iomem *addr; int ret, i, idx; int vec_count = pci_msix_vec_count(dev); @@ -690,10 +690,21 @@ static int msix_setup_entries(struct pci_dev *dev, } return 0; out: - if (!i && !dev->msix_enabled) - iounmap(dev->msix_table_base); - else - free_msi_irqs(dev); + if (!dev->msix_enabled) { + if (!i) + iounmap(dev->msix_table_base); + else + free_msi_irqs(dev); + } else { + while (i-- > 0) { + list_for_each_entry_safe_reverse(entry, tmp, dev_to_msi_list(&dev->dev), + list) { + __clear_bit(entry->msi_attrib.entry_nr, dev->msix_map); + list_del(&entry->list); + free_msi_entry(entry); + } + } + } kfree(masks); return ret; @@ -865,12 +876,26 @@ static int msix_setup_irqs(struct pci_dev *dev, struct msix_entry *entries, } out_free: - free_msi_irqs(dev); + if (ret) { + struct msi_desc *desc, *tmp; + + for_each_new_pci_msi_entry_safe(desc, tmp, dev) { + if (desc->irq) { + BUG_ON(irq_has_action(desc->irq)); + pci_msix_teardown_irq(dev, desc->irq); + } + + __clear_bit(desc->msi_attrib.entry_nr, dev->msix_map); + list_del(&desc->list); + free_msi_entry(desc); + } + } out_disable: if (!dev->msix_enabled) { pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE, 0); kfree(dev->msix_map); + iounmap(dev->msix_table_base); } return ret; } diff --git a/include/linux/msi.h b/include/linux/msi.h index 4c60a27b5372..76c9e433c482 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -207,6 +207,9 @@ struct msi_desc { for ((__irq) = (desc)->irq; \ (__irq) < ((desc)->irq + (desc)->nvec_used); \ (__irq)++) +#define for_each_new_msi_entry_safe(desc, tmp, dev) \ + (desc) = list_entry((dev)->msi_last_list->next, struct msi_desc, list); \ + list_for_each_entry_safe_from((desc), (tmp), dev_to_msi_list((dev)), list) #ifdef CONFIG_IRQ_MSI_IOMMU static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) @@ -237,6 +240,8 @@ static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, for_each_msi_entry((desc), &(pdev)->dev) #define for_each_new_pci_msi_entry(desc, pdev) \ for_each_new_msi_entry((desc), &(pdev)->dev) +#define for_each_new_pci_msi_entry_safe(desc, tmp, pdev) \ + for_each_new_msi_entry_safe((desc), (tmp), &(pdev)->dev) struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc); void *msi_desc_to_pci_sysdata(struct msi_desc *desc); -- Gitee From 1d969c419afb1e5ea17f4452106e01bb91c63cf9 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 1 Dec 2021 17:07:29 -0700 Subject: [PATCH 1768/2161] MSI: export msi_domain_alloc_irqs() symbol commit 0963865b1fb1c2685c42c6ea5957d1fe8096932b Intel-BKC. 
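Exporting the symbol lets modular code allocate interrupts from an MSI irq domain directly rather than through a bus-specific wrapper. Roughly, such a user might look like the following sketch (placeholder my_* names, error handling trimmed); note that msi_domain_free_irqs() only gains its own EXPORT_SYMBOL_GPL a few patches later in this series:

#include <linux/device.h>
#include <linux/msi.h>

static int my_dev_request_irqs(struct device *dev, int nvec)
{
	struct irq_domain *domain = dev_get_msi_domain(dev);

	if (!domain)
		return -ENODEV;

	return msi_domain_alloc_irqs(domain, dev, nvec);
}

static void my_dev_release_irqs(struct device *dev)
{
	struct irq_domain *domain = dev_get_msi_domain(dev);

	if (domain)
		msi_domain_free_irqs(domain, dev);
}
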
Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/irq/msi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index d367f303b677..c46b7de6c645 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -691,6 +691,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, return ops->domain_alloc_irqs(domain, dev, nvec); } +EXPORT_SYMBOL_GPL(msi_domain_alloc_irqs); void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) { -- Gitee From def5ba208f58936be761245f848eedd2b159194f Mon Sep 17 00:00:00 2001 From: Megha Dey Date: Wed, 15 Dec 2021 13:17:40 -0800 Subject: [PATCH 1769/2161] IMS:add aux data callback commit 3b84344cf336260d04269f13579342cb836fe167 Intel-BKC. Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/irqchip/irq-ims-msi.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/irqchip/irq-ims-msi.c b/drivers/irqchip/irq-ims-msi.c index 61f2437d7c58..6e3fc36287bc 100644 --- a/drivers/irqchip/irq-ims-msi.c +++ b/drivers/irqchip/irq-ims-msi.c @@ -52,11 +52,29 @@ static void ims_array_write_msi_msg(struct irq_data *data, struct msi_msg *msg) iowrite32_and_flush(msg->data, &slot->data); } +static int ims_array_set_auxdata(struct irq_data *data, unsigned int which, + u64 auxval) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + struct ims_slot __iomem *slot = desc->device_msi.priv_iomem; + u32 val, __iomem *ctrl = &slot->ctrl; + + if (which != IMS_AUXDATA_CONTROL_WORD) + return -EINVAL; + if (auxval & ~(u64)IMS_CONTROL_WORD_AUXMASK) + return -EINVAL; + + val = ioread32(ctrl) & IMS_CONTROL_WORD_IRQMASK; + iowrite32_and_flush(val | (u32) auxval, ctrl); + return 0; +} + static const struct irq_chip ims_array_msi_controller = { .name = "IMS", .irq_mask = ims_array_mask_irq, .irq_unmask = ims_array_unmask_irq, .irq_write_msi_msg = ims_array_write_msi_msg, + .irq_set_auxdata = ims_array_set_auxdata, .irq_retrigger = irq_chip_retrigger_hierarchy, .flags = IRQCHIP_SKIP_SET_WAKE, }; -- Gitee From c879a48ba154110fcdad8c97eca2967f3ed6576e Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Fri, 18 Dec 2020 14:00:39 +0800 Subject: [PATCH 1770/2161] genirq/msi: Initialize msi_alloc_info before calling msi_domain_prepare_irqs() commit 06fde695ee76429634c1e8c8c1154035aa61191e upstream. Since commit 5fe71d271df8 ("irqchip/gic-v3-its: Tag ITS device as shared if allocating for a proxy device"), some of the devices are wrongly marked as "shared" by the ITS driver on systems equipped with the ITS(es). The problem is that the @info->flags may not be initialized anywhere and we end up looking at random bits on the stack. That's obviously not good. We can perform the initialization in the IRQ core layer before calling msi_domain_prepare_irqs(), which is neat enough. 
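The failure mode is the classic one for on-stack structs whose fields are only ever OR-ed into by callees: whatever happened to be on the stack shows through as "set" flags. A small sketch of the pattern with a made-up struct (alloc_info, prepare() and the two callers are illustrative only, not kernel code):

struct alloc_info {
	unsigned long flags;
};

static void prepare(struct alloc_info *info)
{
	info->flags |= 0x1;	/* only sets bits, never clears old garbage */
}

static void broken_caller(void)
{
	struct alloc_info arg;		/* arg.flags starts as stack garbage */

	prepare(&arg);			/* stale bits can now look like real flags */
}

static void fixed_caller(void)
{
	struct alloc_info arg = { };	/* zero-initialized, as the fix does */

	prepare(&arg);
}
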
Fixes: 5fe71d271df8 ("irqchip/gic-v3-its: Tag ITS device as shared if allocating for a proxy device") Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201218060039.1770-1-yuzenghui@huawei.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/irq/msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index c46b7de6c645..3069099f118d 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -590,7 +590,7 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct msi_domain_ops *ops = info->ops; struct irq_data *irq_data; struct msi_desc *desc; - msi_alloc_info_t arg; + msi_alloc_info_t arg = { }; int i, ret, virq; bool can_reserve; -- Gitee From 18da21df2f63670728b94a288954169fdb1eba83 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 27 Jun 2022 17:34:00 +0800 Subject: [PATCH 1771/2161] msi: dynamic dev_msi commit 86d8cd35c5641a6807506674f72030c4d3ceaa8e Intel-BKC. Enable a single device to hold both device msi and msi/x interrupts. Signed-off-by: Megha Dey Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/base/core.c | 2 ++ drivers/base/platform-msi.c | 49 +++++++++++++++++++++++++++-- drivers/irqchip/irq-ims-msi.c | 22 +++++++++++-- include/linux/device.h | 2 ++ include/linux/msi.h | 59 ++++++++++++++++++++++++----------- kernel/irq/msi.c | 41 +++++++++++++++++++----- 6 files changed, 145 insertions(+), 30 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index ecc91f18968a..cf98e505efb7 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1724,7 +1724,9 @@ void device_initialize(struct device *dev) #ifdef CONFIG_GENERIC_MSI_IRQ raw_spin_lock_init(&dev->msi_lock); INIT_LIST_HEAD(&dev->msi_list); + INIT_LIST_HEAD(&dev->dev_msi_list); dev->msi_last_list = &dev->msi_list; + dev->dev_msi_last_list = &dev->dev_msi_list; #endif INIT_LIST_HEAD(&dev->links.consumers); INIT_LIST_HEAD(&dev->links.suppliers); diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c index 0e639a650ee5..319785758334 100644 --- a/drivers/base/platform-msi.c +++ b/drivers/base/platform-msi.c @@ -424,7 +424,7 @@ int platform_msi_domain_alloc(struct irq_domain *domain, unsigned int virq, static void device_msi_free_msi_entries(struct device *dev) { - struct list_head *msi_list = dev_to_msi_list(dev); + struct list_head *msi_list = dev_to_dev_msi_list(dev); struct msi_desc *entry, *tmp; list_for_each_entry_safe(entry, tmp, msi_list, list) { @@ -445,6 +445,25 @@ static void device_msi_free_irqs(struct irq_domain *domain, struct device *dev) device_msi_free_msi_entries(dev); } +static void device_msi_free_irq(struct irq_domain *domain, struct device *dev, unsigned int irq) +{ + struct msi_desc *entry, *tmp; + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + + if (ops->msi_free_irq) + ops->msi_free_irq(domain, dev, irq); + + list_for_each_entry_safe(entry, tmp, dev_to_dev_msi_list(dev), list) { + if (entry->irq == irq) { + list_del(&entry->list); + free_msi_entry(entry); + } + } + + __msi_domain_free_irq(domain, dev, irq); +} + /** * device_msi_alloc_irqs - Allocate MSI interrupts for a device * @dev: Pointer to the device @@ -457,12 +476,14 @@ static int device_msi_alloc_irqs(struct irq_domain *domain, struct device *dev, { int i, ret = -ENOMEM; + dev->dev_msi_last_list = dev->dev_msi_list.prev; + for (i = 0; i < nvec; i++) { struct msi_desc *entry = alloc_msi_entry(dev, 1, NULL); if (!entry) goto 
fail; - list_add_tail(&entry->list, dev_to_msi_list(dev)); + list_add_tail(&entry->list, dev_to_dev_msi_list(dev)); } ret = __msi_domain_alloc_irqs(domain, dev, nvec); @@ -473,12 +494,36 @@ static int device_msi_alloc_irqs(struct irq_domain *domain, struct device *dev, return ret; } +int device_msi_add_irq(struct irq_domain *domain, struct device *dev) +{ + struct msi_desc *entry; + + dev->dev_msi_last_list = dev->dev_msi_list.prev; + + entry = alloc_msi_entry(dev, 1, NULL); + if (!entry) + goto fail; + list_add_tail(&entry->list, dev_to_dev_msi_list(dev)); + + entry = list_last_entry(dev_to_dev_msi_list(dev), struct msi_desc, list); + if (!__msi_domain_alloc_irqs(domain, dev, 1)) + return entry->device_msi.hwirq; + + list_del(&entry->list); +fail: + free_msi_entry(entry); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(device_msi_add_irq); + static void device_msi_update_dom_ops(struct msi_domain_info *info) { if (!info->ops->domain_alloc_irqs) info->ops->domain_alloc_irqs = device_msi_alloc_irqs; if (!info->ops->domain_free_irqs) info->ops->domain_free_irqs = device_msi_free_irqs; + if (!info->ops->domain_free_irq) + info->ops->domain_free_irq = device_msi_free_irq; if (!info->ops->msi_prepare) info->ops->msi_prepare = arch_msi_prepare; } diff --git a/drivers/irqchip/irq-ims-msi.c b/drivers/irqchip/irq-ims-msi.c index 6e3fc36287bc..7c9ef211dc40 100644 --- a/drivers/irqchip/irq-ims-msi.c +++ b/drivers/irqchip/irq-ims-msi.c @@ -94,7 +94,7 @@ static void ims_array_free_msi_store(struct irq_domain *domain, struct ims_array_data *ims = info->data; struct msi_desc *entry; - for_each_msi_entry(entry, dev) { + for_each_dev_msi_entry(entry, dev) { if (entry->device_msi.priv_iomem) { clear_bit(entry->device_msi.hwirq, ims->map); ims_array_reset_slot(entry->device_msi.priv_iomem); @@ -104,6 +104,23 @@ static void ims_array_free_msi_store(struct irq_domain *domain, } } +static void ims_array_free_msi_irq(struct irq_domain *domain, + struct device *dev, unsigned int irq) +{ + struct msi_domain_info *info = domain->host_data; + struct ims_array_data *ims = info->data; + struct msi_desc *entry; + + for_each_dev_msi_entry(entry, dev) { + if (entry->irq == irq && entry->device_msi.priv_iomem) { + clear_bit(entry->device_msi.hwirq, ims->map); + ims_array_reset_slot(entry->device_msi.priv_iomem); + entry->device_msi.priv_iomem = NULL; + entry->device_msi.hwirq = 0; + } + } +} + static int ims_array_alloc_msi_store(struct irq_domain *domain, struct device *dev, int nvec) { @@ -111,7 +128,7 @@ static int ims_array_alloc_msi_store(struct irq_domain *domain, struct ims_array_data *ims = info->data; struct msi_desc *entry; - for_each_msi_entry(entry, dev) { + for_each_new_dev_msi_entry(entry, dev) { unsigned int idx; idx = find_first_zero_bit(ims->map, ims->info.max_slots); @@ -144,6 +161,7 @@ static const struct ims_array_domain_template ims_array_domain_template = { .ops = { .msi_alloc_store = ims_array_alloc_msi_store, .msi_free_store = ims_array_free_msi_store, + .msi_free_irq = ims_array_free_msi_irq, .set_desc = ims_set_desc, }, .info = { diff --git a/include/linux/device.h b/include/linux/device.h index 756cee58704c..c6e2874b8d16 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1272,6 +1272,8 @@ struct device { struct list_head *msi_last_list; raw_spinlock_t msi_lock; struct list_head msi_list; + struct list_head dev_msi_list; + struct list_head *dev_msi_last_list; #endif const struct dma_map_ops *dma_ops; diff --git a/include/linux/msi.h b/include/linux/msi.h index 76c9e433c482..88d47a9396e0 
100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -101,7 +101,6 @@ struct ti_sci_inta_msi_desc { * @hwirq: The hardware irq number in the device domain */ struct device_msi_desc { - void *priv; void __iomem *priv_iomem; u16 hwirq; }; @@ -185,31 +184,45 @@ struct msi_desc { /* Helpers to hide struct msi_desc implementation details */ #define msi_desc_to_dev(desc) ((desc)->dev) #define dev_to_msi_list(dev) (&(dev)->msi_list) +#define dev_to_dev_msi_list(dev) (&(dev)->dev_msi_list) #define first_msi_entry(dev) \ list_first_entry(dev_to_msi_list((dev)), struct msi_desc, list) -#define for_each_msi_entry(desc, dev) \ - list_for_each_entry((desc), dev_to_msi_list((dev)), list) +#define __for_each_msi_entry(desc, msi_list) \ + list_for_each_entry((desc), (msi_list), list) +#define for_each_msi_entry(desc, dev) \ + __for_each_msi_entry((desc), dev_to_msi_list((dev))) #define for_each_msi_entry_safe(desc, tmp, dev) \ list_for_each_entry_safe((desc), (tmp), dev_to_msi_list((dev)), list) -#define for_each_msi_vector(desc, __irq, dev) \ - for_each_msi_entry((desc), (dev)) \ - if ((desc)->irq) \ - for (__irq = (desc)->irq; \ - __irq < ((desc)->irq + (desc)->nvec_used); \ +#define __for_each_msi_vector(desc, __irq, msi_list) \ + __for_each_msi_entry((desc), (msi_list)) \ + if ((desc)->irq) \ + for (__irq = (desc)->irq; \ + __irq < ((desc)->irq + (desc)->nvec_used); \ __irq++) +#define for_each_msi_vector(desc, __irq, dev) \ + __for_each_msi_vector(desc, __irq, dev_to_msi_list((dev))) + /* Iterate through all the msi_descs starting from a given desc */ -#define for_each_new_msi_entry(desc, dev) \ - (desc) = list_entry((dev)->msi_last_list->next, struct msi_desc, list); \ - list_for_each_entry_from((desc), dev_to_msi_list((dev)), list) -#define for_each_new_msi_vector(desc, __irq, dev) \ - for_each_new_msi_entry((desc), (dev)) \ - if ((desc)->irq) \ - for ((__irq) = (desc)->irq; \ - (__irq) < ((desc)->irq + (desc)->nvec_used); \ +#define __for_each_new_msi_entry(desc, msi_last_list, msi_list) \ + (desc) = list_entry((msi_last_list)->next, struct msi_desc, list); \ + list_for_each_entry_from((desc), (msi_list), list) +#define for_each_new_msi_entry(desc, dev) \ + __for_each_new_msi_entry((desc), (dev)->msi_last_list, dev_to_msi_list((dev))) +#define __for_each_new_msi_vector(desc, __irq, msi_last_list, msi_list) \ + __for_each_new_msi_entry((desc), (msi_last_list), (msi_list)) \ + if ((desc)->irq) \ + for ((__irq) = (desc)->irq; \ + (__irq) < ((desc)->irq + (desc)->nvec_used); \ (__irq)++) -#define for_each_new_msi_entry_safe(desc, tmp, dev) \ - (desc) = list_entry((dev)->msi_last_list->next, struct msi_desc, list); \ - list_for_each_entry_safe_from((desc), (tmp), dev_to_msi_list((dev)), list) +#define for_each_new_msi_vector(desc, __irq, dev) \ + __for_each_new_msi_vector((desc), (__irq), (dev)->msi_last_list, dev_to_msi_list((dev))) +#define for_each_new_msi_entry_safe(desc, tmp, dev) \ + (desc) = list_entry((dev)->msi_last_list->next, struct msi_desc, list); \ + list_for_each_entry_safe_from((desc), (tmp), dev_to_msi_list((dev)), list) +#define for_each_dev_msi_entry(desc, dev) \ + list_for_each_entry((desc), dev_to_dev_msi_list((dev)), list) +#define for_each_new_dev_msi_entry(desc, dev) \ + __for_each_new_msi_entry((desc), (dev)->dev_msi_last_list, dev_to_dev_msi_list((dev))) #ifdef CONFIG_IRQ_MSI_IOMMU static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) @@ -397,10 +410,16 @@ struct msi_domain_ops { struct device *dev, int nvec); void (*domain_free_irqs)(struct 
irq_domain *domain, struct device *dev); + void (*domain_free_irq)(struct irq_domain *domain, + struct device *dev, + unsigned int irq); int (*msi_alloc_store)(struct irq_domain *domain, struct device *dev, int nvec); void (*msi_free_store)(struct irq_domain *domain, struct device *dev); + void (*msi_free_irq)(struct irq_domain *domain, + struct device *dev, unsigned int irq); + }; /** @@ -464,7 +483,9 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec); void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev); void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev); +int device_msi_add_irq(struct irq_domain *domain, struct device *dev); void msi_domain_free_irq(struct irq_domain *domain, struct device *dev, unsigned int irq); +void __msi_domain_free_irq(struct irq_domain *domain, struct device *dev, unsigned int irq); struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain); struct irq_domain *platform_msi_create_irq_domain(struct fwnode_handle *fwnode, diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 3069099f118d..b934f7a04d8f 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -424,6 +424,7 @@ static struct msi_domain_ops msi_domain_ops_default = { .set_desc = msi_domain_ops_set_desc, .domain_alloc_irqs = __msi_domain_alloc_irqs, .domain_free_irqs = __msi_domain_free_irqs, + .domain_free_irq = __msi_domain_free_irq, }; static void msi_domain_update_dom_ops(struct msi_domain_info *info) @@ -439,6 +440,8 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info) ops->domain_alloc_irqs = msi_domain_ops_default.domain_alloc_irqs; if (ops->domain_free_irqs == NULL) ops->domain_free_irqs = msi_domain_ops_default.domain_free_irqs; + if (ops->domain_free_irq == NULL) + ops->domain_free_irq = msi_domain_ops_default.domain_free_irq; if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS)) return; @@ -593,6 +596,8 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, msi_alloc_info_t arg = { }; int i, ret, virq; bool can_reserve; + struct list_head *msi_last_list; + struct list_head *msi_list; ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); if (ret) @@ -604,7 +609,15 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, return ret; } - for_each_new_msi_entry(desc, dev) { + if (domain->bus_token == DOMAIN_BUS_DEVICE_MSI) { + msi_last_list = dev->dev_msi_last_list; + msi_list = dev_to_dev_msi_list(dev); + } else { + msi_last_list = dev->msi_last_list; + msi_list = dev_to_msi_list(dev); + } + + __for_each_new_msi_entry(desc, msi_last_list, msi_list) { ops->set_desc(&arg, desc); virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, @@ -638,7 +651,7 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY)) goto skip_activate; - for_each_new_msi_vector(desc, i, dev) { + __for_each_new_msi_vector(desc, i, msi_last_list, msi_list) { if (desc->irq == i) { virq = desc->irq; dev_dbg(dev, "irq [%d-%d] for MSI\n", @@ -662,7 +675,7 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, * so request_irq() will assign the final vector. 
*/ if (can_reserve) { - for_each_new_msi_vector(desc, i, dev) { + __for_each_new_msi_vector(desc, i, msi_last_list, msi_list) { irq_data = irq_domain_get_irq_data(domain, i); irqd_clr_activated(irq_data); } @@ -700,14 +713,18 @@ void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) struct msi_domain_ops *ops = info->ops; struct msi_desc *desc; int i; + struct list_head *msi_list; + + msi_list = (domain->bus_token == DOMAIN_BUS_DEVICE_MSI) + ? dev_to_dev_msi_list(dev) : dev_to_msi_list(dev); - for_each_msi_vector(desc, i, dev) { + __for_each_msi_vector(desc, i, msi_list) { irq_data = irq_domain_get_irq_data(domain, i); if (irqd_is_activated(irq_data)) irq_domain_deactivate_irq(irq_data); } - for_each_msi_entry(desc, dev) { + __for_each_msi_entry(desc, msi_list) { /* * We might have failed to allocate an MSI early * enough that there is no IRQ associated to this @@ -736,6 +753,7 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) return ops->domain_free_irqs(domain, dev); } +EXPORT_SYMBOL_GPL(msi_domain_free_irqs); /** * msi_domain_free_irq - Free interrupt from a MSI interrupt @domain associated to @dev @@ -743,7 +761,7 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) * @dev: Pointer to device struct for which the interrupt needs to be freed * @irq: Interrupt to be freed */ -void msi_domain_free_irq(struct irq_domain *domain, struct device *dev, unsigned int irq) +void __msi_domain_free_irq(struct irq_domain *domain, struct device *dev, unsigned int irq) { struct msi_desc *desc = irq_get_msi_desc(irq); struct irq_data *irq_data; @@ -758,6 +776,15 @@ void msi_domain_free_irq(struct irq_domain *domain, struct device *dev, unsigned } } +void msi_domain_free_irq(struct irq_domain *domain, struct device *dev, unsigned int irq) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + + return ops->domain_free_irq(domain, dev, irq); +} +EXPORT_SYMBOL_GPL(msi_domain_free_irq); + /** * msi_get_domain_info - Get the MSI interrupt domain info for @domain * @domain: The interrupt domain to retrieve data from @@ -782,7 +809,7 @@ static struct msi_desc *get_dev_msi_entry(struct device *dev, unsigned int nr) struct msi_desc *entry; int i = 0; - for_each_msi_entry(entry, dev) { + for_each_dev_msi_entry(entry, dev) { if (i == nr) return entry; i++; -- Gitee From 61babcd38d6121247fdc2334ecd7fe0eefe8484f Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 17 Mar 2021 15:21:23 -0700 Subject: [PATCH 1772/2161] dmaengine: idxd: add external module driver support for dsa_bus_type commit d581defaa516d735db9446b9a47cdc43a5f2e49a Intel-BKC. Add support to allow an external driver to be registered to the dsa_bus_type and also auto-loaded. 
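Together with the uevent hook and the MODULE_ALIAS_IDXD_DEVICE()/IDXD_DEVICES_MODALIAS_FMT definitions in the diff below, an out-of-tree sub-driver can register against dsa_bus_type and be auto-loaded by udev when a matching idxd device appears. A bare-bones sketch, modeled on the iax_crypto driver introduced later in this series; the my_* names are placeholders, and the private drivers/dma/idxd/idxd.h header plus its -I include path are assumed, as in the iax Makefile:

#include <linux/module.h>

#include "idxd.h"	/* private idxd header, via -I drivers/dma/idxd */

static int my_wq_probe(struct idxd_dev *idxd_dev)
{
	/* claim the wq, allocate resources, register with a subsystem ... */
	return 0;
}

static void my_wq_remove(struct idxd_dev *idxd_dev)
{
	/* quiesce and release the wq */
}

static enum idxd_dev_type my_dev_types[] = {
	IDXD_DEV_WQ,
	IDXD_DEV_NONE,
};

static struct idxd_device_driver my_idxd_driver = {
	.probe	= my_wq_probe,
	.remove	= my_wq_remove,
	.name	= "my_subdriver",
	.type	= my_dev_types,
};

static int __init my_init(void)
{
	return idxd_driver_register(&my_idxd_driver);
}
module_init(my_init);

static void __exit my_exit(void)
{
	idxd_driver_unregister(&my_idxd_driver);
}
module_exit(my_exit);

MODULE_IMPORT_NS(IDXD);
MODULE_ALIAS_IDXD_DEVICE(0);	/* match "idxd:t0*" uevents for auto-load */
MODULE_LICENSE("GPL");
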
Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/bus.c | 6 ++++++ drivers/dma/idxd/idxd.h | 3 +++ 2 files changed, 9 insertions(+) diff --git a/drivers/dma/idxd/bus.c b/drivers/dma/idxd/bus.c index 02837f0fb3e4..ef81c3bbe92c 100644 --- a/drivers/dma/idxd/bus.c +++ b/drivers/dma/idxd/bus.c @@ -68,11 +68,17 @@ static int idxd_config_bus_remove(struct device *dev) return 0; } +static int idxd_bus_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + return add_uevent_var(env, "MODALIAS=" IDXD_DEVICES_MODALIAS_FMT, 0); +} + struct bus_type dsa_bus_type = { .name = "dsa", .match = idxd_config_bus_match, .probe = idxd_config_bus_probe, .remove = idxd_config_bus_remove, + .uevent = idxd_bus_uevent, }; EXPORT_SYMBOL_GPL(dsa_bus_type); diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index da72eb15f610..4bd764db8c94 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -528,6 +528,9 @@ static inline int idxd_wq_refcount(struct idxd_wq *wq) return wq->client_count; }; +#define MODULE_ALIAS_IDXD_DEVICE(type) MODULE_ALIAS("idxd:t" __stringify(type) "*") +#define IDXD_DEVICES_MODALIAS_FMT "idxd:t%d" + int __must_check __idxd_driver_register(struct idxd_device_driver *idxd_drv, struct module *module, const char *mod_name); #define idxd_driver_register(driver) \ -- Gitee From 9579d0ca27324ff7c12af6cc05f1466f916b1a29 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 30 Nov 2020 13:03:10 -0700 Subject: [PATCH 1773/2161] dmaengine: idxd: add rest of IAX definitions commit fbfce5c688489b8654a49a75256266ae77abe128 Intel-BKC. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/idxd.h | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index a8f0ff75c430..bce7c43657d5 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -53,6 +53,11 @@ enum idxd_scmd_stat { /* IAX */ #define IDXD_OP_FLAG_RD_SRC2_AECS 0x010000 +#define IDXD_OP_FLAG_RD_SRC2_2ND 0x020000 +#define IDXD_OP_FLAG_WR_SRC2_AECS_COMP 0x040000 +#define IDXD_OP_FLAG_WR_SRC2_AECS_OVFL 0x080000 +#define IDXD_OP_FLAG_SRC2_STS 0x100000 +#define IDXD_OP_FLAG_CRC_RFC3720 0x200000 /* Opcode */ enum dsa_opcode { @@ -81,6 +86,18 @@ enum iax_opcode { IAX_OPCODE_MEMMOVE, IAX_OPCODE_DECOMPRESS = 0x42, IAX_OPCODE_COMPRESS, + IAX_OPCODE_CRC64, + IAX_OPCODE_ZERO_DECOMP_32 = 0x48, + IAX_OPCODE_ZERO_DECOMP_16, + IAX_OPCODE_DECOMP_32 = 0x4c, + IAX_OPCODE_DECOMP_16, + IAX_OPCODE_SCAN = 0x50, + IAX_OPCODE_SET_MEMBER, + IAX_OPCODE_EXTRACT, + IAX_OPCODE_SELECT, + IAX_OPCODE_RLE_BURST, + IAX_OPCDE_FIND_UNIQUE, + IAX_OPCODE_EXPAND, }; /* Completion record status */ @@ -120,6 +137,7 @@ enum iax_completion_status { IAX_COMP_NONE = 0, IAX_COMP_SUCCESS, IAX_COMP_PAGE_FAULT_IR = 0x04, + IAX_COMP_ANALYTICS_ERROR = 0x0a, IAX_COMP_OUTBUF_OVERFLOW, IAX_COMP_BAD_OPCODE = 0x10, IAX_COMP_INVALID_FLAGS, @@ -140,7 +158,10 @@ enum iax_completion_status { IAX_COMP_WATCHDOG, IAX_COMP_INVALID_COMP_FLAG = 0x30, IAX_COMP_INVALID_FILTER_FLAG, - IAX_COMP_INVALID_NUM_ELEMS = 0x33, + IAX_COMP_INVALID_INPUT_SIZE, + IAX_COMP_INVALID_NUM_ELEMS, + IAX_COMP_INVALID_SRC1_WIDTH, + IAX_COMP_INVALID_INVERT_OUT, }; #define DSA_COMP_STATUS_MASK 0x7f @@ -319,8 +340,12 @@ struct iax_completion_record { uint32_t output_size; uint8_t output_bits; uint8_t rsvd3; - uint16_t rsvd4; - uint64_t rsvd5[4]; + uint16_t xor_csum; + uint32_t crc; + uint32_t min; + uint32_t 
max; + uint32_t sum; + uint64_t rsvd4[2]; } __attribute__((packed)); struct iax_raw_completion_record { -- Gitee From d2ace3e9750b65bf20e9c4e482ebafd866828086 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 1 Oct 2021 14:05:27 -0700 Subject: [PATCH 1774/2161] dmaengine: idxd: Export drv_enable/disable and related functions commit dbf3e023c824b280e859dc6552636a24f49f9c58 Intel-BKC. To allow idxd sub-drivers to enable and disable wqs, both with and without the wq mutex held, export them. Signed-off-by: Tom Zanussi Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index f652da6ab47d..18d0099b60c5 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -1298,6 +1298,7 @@ int __drv_enable_wq(struct idxd_wq *wq) err: return rc; } +EXPORT_SYMBOL_GPL(__drv_enable_wq); int drv_enable_wq(struct idxd_wq *wq) { @@ -1327,6 +1328,7 @@ void __drv_disable_wq(struct idxd_wq *wq) wq->client_count = 0; } +EXPORT_SYMBOL_GPL(__drv_disable_wq); void drv_disable_wq(struct idxd_wq *wq) { -- Gitee From ca3fd7e2989029f3b7b01cc47d334649007f9523 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 1 Oct 2021 11:49:57 -0700 Subject: [PATCH 1775/2161] dmaengine: idxd: Export descriptor management functions commit 5d666a0671c020c4ebc400993da14b0916f8a455 Intel-BKC. To allow idxd sub-drivers to access the descriptor management functions, export them. Signed-off-by: Tom Zanussi Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/submit.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index c01db23e3333..0536d3e74ad2 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -61,6 +61,7 @@ struct idxd_desc *idxd_alloc_desc(struct idxd_wq *wq, enum idxd_op_type optype) return __get_desc(wq, idx, cpu); } +EXPORT_SYMBOL_GPL(idxd_alloc_desc); void idxd_free_desc(struct idxd_wq *wq, struct idxd_desc *desc) { @@ -69,6 +70,7 @@ void idxd_free_desc(struct idxd_wq *wq, struct idxd_desc *desc) desc->cpu = -1; sbitmap_queue_clear(&wq->sbq, desc->id, cpu); } +EXPORT_SYMBOL_GPL(idxd_free_desc); static struct idxd_desc *list_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie, struct idxd_desc *desc) @@ -215,3 +217,4 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) percpu_ref_put(&wq->wq_active); return 0; } +EXPORT_SYMBOL_GPL(idxd_submit_desc); -- Gitee From 7e2bc6ca6c7aae66612bdbe5bf0aad4ee3004ae2 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 1 Oct 2021 13:49:54 -0700 Subject: [PATCH 1776/2161] dmaengine: idxd: Export wq resource management functions commit 0143f546db988a1435fbad3a48f6bbaed7ca9cce Intel-BKC. To allow idxd sub-drivers to access the wq resource management functions, export them. 
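With these exports, plus the enable/disable and descriptor exports from the two preceding patches, a sub-driver can bring a kernel wq up entirely from its own probe path. A condensed sketch of the call order, following the iax_crypto probe added later in this series; my_wq_bringup is a placeholder and the error unwinding is abbreviated:

#include "idxd.h"	/* struct idxd_wq and the exported helpers */

static int my_wq_bringup(struct idxd_wq *wq)
{
	int ret;

	mutex_lock(&wq->wq_lock);
	wq->type = IDXD_WQT_KERNEL;

	ret = __drv_enable_wq(wq);		/* exported two patches back */
	if (ret < 0)
		goto err;

	ret = idxd_wq_alloc_resources(wq);	/* exported here */
	if (ret < 0)
		goto err_disable;

	ret = idxd_wq_init_percpu_ref(wq);	/* exported here */
	if (ret < 0)
		goto err_free;

	mutex_unlock(&wq->wq_lock);
	return 0;

err_free:
	idxd_wq_free_resources(wq);
err_disable:
	__drv_disable_wq(wq);
err:
	wq->type = IDXD_WQT_NONE;
	mutex_unlock(&wq->wq_lock);
	return ret;
}
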
Signed-off-by: Tom Zanussi Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 18d0099b60c5..03de61c25311 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -162,6 +162,7 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) free_hw_descs(wq); return rc; } +EXPORT_SYMBOL_GPL(idxd_wq_alloc_resources); void idxd_wq_free_resources(struct idxd_wq *wq) { @@ -175,6 +176,7 @@ void idxd_wq_free_resources(struct idxd_wq *wq) dma_free_coherent(dev, wq->compls_size, wq->compls, wq->compls_addr); sbitmap_queue_free(&wq->sbq); } +EXPORT_SYMBOL_GPL(idxd_wq_free_resources); int idxd_wq_enable(struct idxd_wq *wq) { @@ -397,6 +399,7 @@ int idxd_wq_init_percpu_ref(struct idxd_wq *wq) reinit_completion(&wq->wq_resurrect); return 0; } +EXPORT_SYMBOL_GPL(idxd_wq_init_percpu_ref); void __idxd_wq_quiesce(struct idxd_wq *wq) { @@ -406,6 +409,7 @@ void __idxd_wq_quiesce(struct idxd_wq *wq) complete_all(&wq->wq_resurrect); wait_for_completion(&wq->wq_dead); } +EXPORT_SYMBOL_GPL(__idxd_wq_quiesce); void idxd_wq_quiesce(struct idxd_wq *wq) { @@ -413,6 +417,7 @@ void idxd_wq_quiesce(struct idxd_wq *wq) __idxd_wq_quiesce(wq); mutex_unlock(&wq->wq_lock); } +EXPORT_SYMBOL_GPL(idxd_wq_quiesce); /* Device control bits */ static inline bool idxd_is_enabled(struct idxd_device *idxd) -- Gitee From ad0f06152c8641c458ceaba7e0ded8882ba7c3b0 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 1 Dec 2021 12:52:15 -0800 Subject: [PATCH 1777/2161] dmaengine: idxd: Add private_data to struct idxd_wq commit 74c8bded7efeda08bc7ce59cba3c310fb0ba0ee4 Intel-BKC. Add a void * to idxd_wqs for user-defined context data. Signed-off-by: Tom Zanussi Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/idxd.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 4bd764db8c94..e0e0314e4c96 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -209,6 +209,8 @@ struct idxd_wq { u64 max_xfer_bytes; u32 max_batch_size; bool ats_dis; + + void *private_data; }; struct idxd_engine { -- Gitee From 8cdd83789b3be6b532b8f2c38133c27003973537 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 1 Dec 2021 12:55:40 -0800 Subject: [PATCH 1778/2161] lib/hexdump: Allow print_hexdump() to print rowsize of 8 commit c72052b04c9e646b9fa6eb315b055797c17c9da0 Intel-BKC. Sometimes it's useful to be able to print hexdumps with widths of 8 e.g. when structures in manuals are printed that way, it's easier to do the mapping directly when looking at data. Signed-off-by: Tom Zanussi Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- lib/hexdump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/hexdump.c b/lib/hexdump.c index 147133f8eb2f..9007d325e142 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -244,7 +244,7 @@ void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int i, linelen, remaining = len; unsigned char linebuf[32 * 3 + 2 + 32 + 1]; - if (rowsize != 16 && rowsize != 32) + if (rowsize != 8 && rowsize != 16 && rowsize != 32) rowsize = 16; for (i = 0; i < len; i += rowsize) { -- Gitee From 09606672500c800bd388b062be19a9297fcc6022 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 30 Jul 2021 07:00:18 -0700 Subject: [PATCH 1779/2161] dmaengine: idxd: revert kernel name with pasid disable commit bca2a4c0ad145e938f38a3bc7bcbb23aa313a654 Intel-BKC. 
NOT FOR UPSTREAM Signed-off-by: Tom Zanussi Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index dfd549685c46..a48928973bd4 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -839,13 +839,6 @@ static ssize_t wq_name_store(struct device *dev, if (strlen(buf) > WQ_NAME_SIZE || strlen(buf) == 0) return -EINVAL; - /* - * This is temporarily placed here until we have SVM support for - * dmaengine. - */ - if (wq->type == IDXD_WQT_KERNEL && device_pasid_enabled(wq->idxd)) - return -EOPNOTSUPP; - memset(wq->name, 0, WQ_NAME_SIZE + 1); strncpy(wq->name, buf, WQ_NAME_SIZE); strreplace(wq->name, '\n', '\0'); -- Gitee From 0437b5deeb31c4b9a86a6e81974fe67558b60275 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 21 Sep 2021 14:52:44 -0700 Subject: [PATCH 1780/2161] crypto: iax - Introduce iax hardware compression accelerator commit 749cb2d2029caad14a10e4e2fce076b10470c720 Intel-BKC. The Intel Analytics Accelerator (IAX) is a hardware accelerator that provides very high thoughput compression/decompression combined with analytic primitive functions. The primitive functions are commonly used for filtering data during analytic query processing. The IAX primarily targets applications such as big data and in-memory analytics databases as well as application-transparent usages such as memory page compression (e.g. zswap compression/decompression). The IAX supports compression/decompression compatible with the DEFLATE compression standard described in RFC 1951, which is the compression/decompression algorithm exported by this module. Users can select IAX compress/decompress acceleration by specifying 'iax_crypto' as the compression algorithm to use by whatever facility allows compression algorithms to be selected. For example, a zram drive can select iax_crypto via: # echo iax_crypto > /sys/block/zram1/comp_algorithm Similarly for zswap: # echo iax_crypto > /sys/module/zswap/parameters/compressor In order for the iax_crypto module to actually do any compression/decompression work on behalf of a facility, one or more IAX workqueues need to be bound to the iax_crypto driver. For instance, here's an example of configuring an IAX workqueue and binding it to the iax_crypto driver: # configure wq1.0 # # echo shared > /sys/bus/dsa/devices/iax1/wq1.0/mode # echo "kernel" > /sys/bus/dsa/devices/iax1/wq1.0/type # echo "iax_crypto" > /sys/bus/dsa/devices/iax1/wq1.0/name # echo 0 > /sys/bus/dsa/devices/iax1/engine1.0/group_id # enable IAX device iax1 # # echo iax1 > /sys/bus/dsa/drivers/idxd/bind # enable wq1.0 on IAX device iax1 # echo wq1.0 > /sys/bus/dsa/drivers/crypto/bind Whenever a new workqueue is bound to or unbound from the iax_crypto driver, the available workqueues are 'rebalanced' such that work submitted from a particular CPU is given to the most appropriate workqueue available. There currently isn't any way for the user to tweak the way this is done internally - if necessary, knobs can be added later for that purpose. Current best practice is to configure and bind at least one workqueue for each IAX device, but as long as there is at least one workqueue configured and bound to any IAX device in the system, the iax_crypto driver will work, albeit most likely not as efficiently. [ Based on work originally by George Powley and Jing Lin. 
] Co-developed-by: Kyung Min Park Signed-off-by: Kyung Min Park Signed-off-by: Tom Zanussi Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/crypto/Kconfig | 1 + drivers/crypto/Makefile | 1 + drivers/crypto/iax/Kconfig | 8 + drivers/crypto/iax/Makefile | 9 + drivers/crypto/iax/iax_crypto.h | 56 ++ drivers/crypto/iax/iax_crypto_main.c | 866 +++++++++++++++++++++++++++ 6 files changed, 941 insertions(+) create mode 100644 drivers/crypto/iax/Kconfig create mode 100644 drivers/crypto/iax/Makefile create mode 100644 drivers/crypto/iax/iax_crypto.h create mode 100644 drivers/crypto/iax/iax_crypto_main.c diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 1f6308cdf79a..0cf6e4d1e73f 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -605,6 +605,7 @@ config CRYPTO_DEV_MXS_DCP source "drivers/crypto/qat/Kconfig" source "drivers/crypto/cavium/cpt/Kconfig" source "drivers/crypto/cavium/nitrox/Kconfig" +source "drivers/crypto/iax/Kconfig" config CRYPTO_DEV_CAVIUM_ZIP tristate "Cavium ZIP driver" diff --git a/drivers/crypto/Makefile b/drivers/crypto/Makefile index afc4753b5d28..94a63a0f814f 100644 --- a/drivers/crypto/Makefile +++ b/drivers/crypto/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_CRYPTO_DEV_ATMEL_I2C) += atmel-i2c.o obj-$(CONFIG_CRYPTO_DEV_ATMEL_ECC) += atmel-ecc.o obj-$(CONFIG_CRYPTO_DEV_ATMEL_SHA204A) += atmel-sha204a.o obj-$(CONFIG_CRYPTO_DEV_CAVIUM_ZIP) += cavium/ +obj-$(CONFIG_CRYPTO_DEV_IAX_CRYPTO) += iax/ obj-$(CONFIG_CRYPTO_DEV_CCP) += ccp/ obj-$(CONFIG_CRYPTO_DEV_CCREE) += ccree/ obj-$(CONFIG_CRYPTO_DEV_CHELSIO) += chelsio/ diff --git a/drivers/crypto/iax/Kconfig b/drivers/crypto/iax/Kconfig new file mode 100644 index 000000000000..e546d5952486 --- /dev/null +++ b/drivers/crypto/iax/Kconfig @@ -0,0 +1,8 @@ +config CRYPTO_DEV_IAX_CRYPTO + tristate "IAX Crypto Driver" + depends on CRYPTO_DEFLATE + depends on INTEL_IDXD + default n + help + This driver supports Intel analytics accelerator hardware. + The module will be called iax_crypto. diff --git a/drivers/crypto/iax/Makefile b/drivers/crypto/iax/Makefile new file mode 100644 index 000000000000..d0f3c1af8c73 --- /dev/null +++ b/drivers/crypto/iax/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for IAX crypto device drivers +# + +ccflags-y += -I $(srctree)/drivers/dma/idxd -DDEFAULT_SYMBOL_NAMESPACE=IDXD + +obj-$(CONFIG_CRYPTO_DEV_IAX_CRYPTO) := iax_crypto.o +iax_crypto-y := iax_crypto_main.o diff --git a/drivers/crypto/iax/iax_crypto.h b/drivers/crypto/iax/iax_crypto.h new file mode 100644 index 000000000000..30771be44a0b --- /dev/null +++ b/drivers/crypto/iax/iax_crypto.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2021 Intel Corporation. All rights rsvd. 
*/ + +#ifndef __IAX_CRYPTO_H__ +#define __IAX_CRYPTO_H__ + +#include +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define IAX_DECOMP_ENABLE BIT(0) +#define IAX_DECOMP_FLUSH_OUTPUT BIT(1) +#define IAX_DECOMP_CHECK_FOR_EOB BIT(2) +#define IAX_DECOMP_STOP_ON_EOB BIT(3) +#define IAX_DECOMP_SUPPRESS_OUTPUT BIT(9) + +#define IAX_COMP_FLUSH_OUTPUT BIT(1) +#define IAX_COMP_APPEND_EOB BIT(2) + +#define IAX_COMPLETION_TIMEOUT 1000000 + +#define IAX_ANALYTICS_ERROR 0x0a +#define IAX_ERROR_COMP_BUF_OVERFLOW 0x19 +#define IAX_ERROR_WATCHDOG_EXPIRED 0x24 + +#define DYNAMIC_HDR 0x2 +#define DYNAMIC_HDR_SIZE 3 + +#define IAX_COMP_FLAGS (IAX_COMP_FLUSH_OUTPUT | \ + IAX_COMP_APPEND_EOB) + +#define IAX_DECOMP_FLAGS (IAX_DECOMP_ENABLE | \ + IAX_DECOMP_FLUSH_OUTPUT | \ + IAX_DECOMP_CHECK_FOR_EOB | \ + IAX_DECOMP_STOP_ON_EOB) + +/* + * Analytics Engine Configuration and State (AECS) contains parameters and + * internal state of the analytics engine. + */ +struct aecs_table_record { + u32 crc; + u32 xor_checksum; + u32 reserved0[5]; + u32 num_output_accum_bits; + u8 output_accum[256]; + u32 ll_sym[286]; + u32 reserved1; + u32 reserved2; + u32 d_sym[30]; + u32 reserved_padding[2]; +}; +#endif diff --git a/drivers/crypto/iax/iax_crypto_main.c b/drivers/crypto/iax/iax_crypto_main.c new file mode 100644 index 000000000000..5402ce5029a5 --- /dev/null +++ b/drivers/crypto/iax/iax_crypto_main.c @@ -0,0 +1,866 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2021 Intel Corporation. All rights rsvd. */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "registers.h" +#include "idxd.h" +#include "iax_crypto.h" + +#define IAX_CRYPTO_VER "1.0" + +#define IAX_CRYPTO_WQ_NAME "iax_crypto" +#define IAX_ALG_PRIORITY 300 +#define IAX_AECS_ALIGN 32 + +/* IAX completion timeout value in tsc units */ +static unsigned int iax_completion_timeout = IAX_COMPLETION_TIMEOUT; + +module_param_named(iax_completion_timeout, iax_completion_timeout, uint, 0644); +MODULE_PARM_DESC(iax_completion_timeout, "IAX completion timeout (1000000 cycles default)"); + +/* Verify results of IAX compress or not */ +static bool iax_verify_compress = 1; + +module_param_named(iax_verify_compress, iax_verify_compress, bool, 0644); +MODULE_PARM_DESC(iax_verify_compress, + "Verify IAX compression (value = 1) or not (value = 0)"); + +struct iax_wq { + struct list_head list; + struct idxd_wq *wq; + + struct iax_device *iax_device; +}; + +/* Representation of IAX device with wqs, populated by probe */ +struct iax_device { + struct list_head list; + struct idxd_device *idxd; + + int n_wq; + struct list_head wqs; +}; + +static LIST_HEAD(iax_devices); +static DEFINE_SPINLOCK(iax_devices_lock); + +static struct iax_device *iax_device_alloc(void) +{ + struct iax_device *iax_device; + + iax_device = kzalloc(sizeof(*iax_device), GFP_KERNEL); + if (!iax_device) + return NULL; + + INIT_LIST_HEAD(&iax_device->wqs); + + return iax_device; +} + +static void iax_device_free(struct iax_device *iax_device) +{ + struct iax_wq *iax_wq, *next; + + list_for_each_entry_safe(iax_wq, next, &iax_device->wqs, list) + list_del(&iax_wq->list); + + kfree(iax_device); +} + +static void free_iax_devices(void) +{ + struct iax_device *iax_device, *next; + + spin_lock(&iax_devices_lock); + list_for_each_entry_safe(iax_device, next, &iax_devices, list) { + list_del(&iax_device->list); + iax_device_free(iax_device); + } + spin_unlock(&iax_devices_lock); +} + +/* IAX number of iax instances found */ +static 
unsigned int nr_iax; +static unsigned int nr_cpus; +static unsigned int nr_nodes; + +/* Number of physical cpus sharing each iax instance */ +static unsigned int cpus_per_iax; + +/* Per-cpu lookup table for balanced wqs */ +static struct idxd_wq * __percpu *wq_table; + +static struct aecs_table_record aecs_table __aligned(IAX_AECS_ALIGN); + +/* + * Given a cpu, find the closest IAX instance. The idea is to try to + * choose the most appropriate IAX instance for a caller and spread + * available workqueues around to clients. + */ +static inline int cpu_to_iax(int cpu) +{ + const struct cpumask *node_cpus; + int node, n_cpus = 0, test_cpu, iax; + int nr_iax_per_node; + + nr_iax_per_node = nr_iax / nr_nodes; + + for_each_online_node(node) { + node_cpus = cpumask_of_node(node); + if (!cpumask_test_cpu(cpu, node_cpus)) + continue; + + iax = node * nr_iax_per_node; + + for_each_cpu(test_cpu, node_cpus) { + if (test_cpu == cpu) + return iax; + + n_cpus++; + if ((n_cpus % cpus_per_iax) == 0) + iax++; + } + } + + return -1; +} + +static bool iax_has_wq(struct iax_device *iax_device, struct idxd_wq *wq) +{ + struct iax_wq *iax_wq; + + list_for_each_entry(iax_wq, &iax_device->wqs, list) { + if (iax_wq->wq == wq) + return true; + } + + return false; +} + +const u32 fixed_ll_sym[286] = { + 0x40030, 0x40031, 0x40032, 0x40033, 0x40034, 0x40035, 0x40036, 0x40037, + 0x40038, 0x40039, 0x4003A, 0x4003B, 0x4003C, 0x4003D, 0x4003E, 0x4003F, + 0x40040, 0x40041, 0x40042, 0x40043, 0x40044, 0x40045, 0x40046, 0x40047, + 0x40048, 0x40049, 0x4004A, 0x4004B, 0x4004C, 0x4004D, 0x4004E, 0x4004F, + 0x40050, 0x40051, 0x40052, 0x40053, 0x40054, 0x40055, 0x40056, 0x40057, + 0x40058, 0x40059, 0x4005A, 0x4005B, 0x4005C, 0x4005D, 0x4005E, 0x4005F, + 0x40060, 0x40061, 0x40062, 0x40063, 0x40064, 0x40065, 0x40066, 0x40067, + 0x40068, 0x40069, 0x4006A, 0x4006B, 0x4006C, 0x4006D, 0x4006E, 0x4006F, + 0x40070, 0x40071, 0x40072, 0x40073, 0x40074, 0x40075, 0x40076, 0x40077, + 0x40078, 0x40079, 0x4007A, 0x4007B, 0x4007C, 0x4007D, 0x4007E, 0x4007F, + 0x40080, 0x40081, 0x40082, 0x40083, 0x40084, 0x40085, 0x40086, 0x40087, + 0x40088, 0x40089, 0x4008A, 0x4008B, 0x4008C, 0x4008D, 0x4008E, 0x4008F, + 0x40090, 0x40091, 0x40092, 0x40093, 0x40094, 0x40095, 0x40096, 0x40097, + 0x40098, 0x40099, 0x4009A, 0x4009B, 0x4009C, 0x4009D, 0x4009E, 0x4009F, + 0x400A0, 0x400A1, 0x400A2, 0x400A3, 0x400A4, 0x400A5, 0x400A6, 0x400A7, + 0x400A8, 0x400A9, 0x400AA, 0x400AB, 0x400AC, 0x400AD, 0x400AE, 0x400AF, + 0x400B0, 0x400B1, 0x400B2, 0x400B3, 0x400B4, 0x400B5, 0x400B6, 0x400B7, + 0x400B8, 0x400B9, 0x400BA, 0x400BB, 0x400BC, 0x400BD, 0x400BE, 0x400BF, + 0x48190, 0x48191, 0x48192, 0x48193, 0x48194, 0x48195, 0x48196, 0x48197, + 0x48198, 0x48199, 0x4819A, 0x4819B, 0x4819C, 0x4819D, 0x4819E, 0x4819F, + 0x481A0, 0x481A1, 0x481A2, 0x481A3, 0x481A4, 0x481A5, 0x481A6, 0x481A7, + 0x481A8, 0x481A9, 0x481AA, 0x481AB, 0x481AC, 0x481AD, 0x481AE, 0x481AF, + 0x481B0, 0x481B1, 0x481B2, 0x481B3, 0x481B4, 0x481B5, 0x481B6, 0x481B7, + 0x481B8, 0x481B9, 0x481BA, 0x481BB, 0x481BC, 0x481BD, 0x481BE, 0x481BF, + 0x481C0, 0x481C1, 0x481C2, 0x481C3, 0x481C4, 0x481C5, 0x481C6, 0x481C7, + 0x481C8, 0x481C9, 0x481CA, 0x481CB, 0x481CC, 0x481CD, 0x481CE, 0x481CF, + 0x481D0, 0x481D1, 0x481D2, 0x481D3, 0x481D4, 0x481D5, 0x481D6, 0x481D7, + 0x481D8, 0x481D9, 0x481DA, 0x481DB, 0x481DC, 0x481DD, 0x481DE, 0x481DF, + 0x481E0, 0x481E1, 0x481E2, 0x481E3, 0x481E4, 0x481E5, 0x481E6, 0x481E7, + 0x481E8, 0x481E9, 0x481EA, 0x481EB, 0x481EC, 0x481ED, 0x481EE, 0x481EF, + 0x481F0, 0x481F1, 0x481F2, 0x481F3, 
0x481F4, 0x481F5, 0x481F6, 0x481F7, + 0x481F8, 0x481F9, 0x481FA, 0x481FB, 0x481FC, 0x481FD, 0x481FE, 0x481FF, + 0x38000, 0x38001, 0x38002, 0x38003, 0x38004, 0x38005, 0x38006, 0x38007, + 0x38008, 0x38009, 0x3800A, 0x3800B, 0x3800C, 0x3800D, 0x3800E, 0x3800F, + 0x38010, 0x38011, 0x38012, 0x38013, 0x38014, 0x38015, 0x38016, 0x38017, + 0x400C0, 0x400C1, 0x400C2, 0x400C3, 0x400C4, 0x400C5 +}; + +const u32 fixed_d_sym[30] = { + 0x28000, 0x28001, 0x28002, 0x28003, 0x28004, 0x28005, 0x28006, 0x28007, + 0x28008, 0x28009, 0x2800A, 0x2800B, 0x2800C, 0x2800D, 0x2800E, 0x2800F, + 0x28010, 0x28011, 0x28012, 0x28013, 0x28014, 0x28015, 0x28016, 0x28017, + 0x28018, 0x28019, 0x2801A, 0x2801B, 0x2801C, 0x2801D +}; + +static struct iax_device *add_iax_device(struct idxd_device *idxd) +{ + struct iax_device *iax_device; + + iax_device = iax_device_alloc(); + if (!iax_device) + return NULL; + + iax_device->idxd = idxd; + + list_add_tail(&iax_device->list, &iax_devices); + + nr_iax++; + + return iax_device; +} + +static void del_iax_device(struct iax_device *iax_device) +{ + list_del(&iax_device->list); + + iax_device_free(iax_device); + + nr_iax--; +} + +static int add_iax_wq(struct iax_device *iax_device, struct idxd_wq *wq) +{ + struct iax_wq *iax_wq; + + iax_wq = kzalloc(sizeof(*iax_wq), GFP_KERNEL); + if (!iax_wq) + return -ENOMEM; + + iax_wq->wq = wq; + iax_wq->iax_device = iax_device; + wq->private_data = iax_wq; + + list_add_tail(&iax_wq->list, &iax_device->wqs); + + iax_device->n_wq++; + + pr_debug("%s: added wq %p to iax %p, n_wq %d\n", __func__, wq, iax_device, iax_device->n_wq); + + return 0; +} + +static void del_iax_wq(struct iax_device *iax_device, struct idxd_wq *wq) +{ + struct iax_wq *iax_wq; + + list_for_each_entry(iax_wq, &iax_device->wqs, list) { + if (iax_wq->wq == wq) { + list_del(&iax_wq->list); + iax_device->n_wq--; + + pr_debug("%s: removed wq %p from iax_device %p, n_wq %d, nr_iax %d\n", __func__, wq, iax_device, iax_device->n_wq, nr_iax); + + if (iax_device->n_wq == 0) { + del_iax_device(iax_device); + break; + } + } + } +} + +static int save_iax_wq(struct idxd_wq *wq) +{ + struct iax_device *iax_device, *found = NULL; + int ret = 0; + + spin_lock(&iax_devices_lock); + list_for_each_entry(iax_device, &iax_devices, list) { + if (iax_device->idxd == wq->idxd) { + /* + * Check to see that we don't already have this wq. + * Shouldn't happen but we don't control probing. 
+ */ + if (iax_has_wq(iax_device, wq)) { + pr_warn("%s: same wq probed multiple times for iax_device %p\n", __func__, iax_device); + goto out; + } + + found = iax_device; + + ret = add_iax_wq(iax_device, wq); + if (ret) + goto out; + + break; + } + } + + if (!found) { + struct iax_device *new; + + new = add_iax_device(wq->idxd); + if (!new) { + ret = -ENOMEM; + goto out; + } + + ret = add_iax_wq(new, wq); + if (ret) { + del_iax_device(new); + goto out; + } + } + + BUG_ON(nr_iax == 0); + + cpus_per_iax = nr_cpus / nr_iax; +out: + spin_unlock(&iax_devices_lock); + + return 0; +} + +static void clear_wq_table(void) +{ + int cpu; + + for (cpu = 0; cpu < nr_cpus; cpu++) + *per_cpu_ptr(wq_table, cpu) = NULL; + + pr_debug("%s: cleared wq table\n", __func__); +} + +static void remove_iax_wq(struct idxd_wq *wq) +{ + struct iax_device *iax_device; + + spin_lock(&iax_devices_lock); + list_for_each_entry(iax_device, &iax_devices, list) { + if (iax_has_wq(iax_device, wq)) { + del_iax_wq(iax_device, wq); + if (nr_iax == 0) + clear_wq_table(); + break; + } + } + spin_unlock(&iax_devices_lock); + + if (nr_iax) + cpus_per_iax = nr_cpus / nr_iax; + else + cpus_per_iax = 0; + + idxd_wq_put(wq); +} + +static struct idxd_wq *request_iax_wq(int iax) +{ + struct iax_device *iax_device, *found_device = NULL; + struct idxd_wq *bkup_wq = NULL, *found_wq = NULL; + int cur_iax = 0, cur_wq = 0, cur_bkup; + struct iax_wq *iax_wq; + + spin_lock(&iax_devices_lock); + list_for_each_entry(iax_device, &iax_devices, list) { + if (cur_iax != iax) { + cur_iax++; + continue; + } + + found_device = iax_device; + pr_debug("%s: getting wq from iax_device %p (%d)\n", __func__, found_device, cur_iax); + break; + } + + if (!found_device) { + found_device = list_first_entry_or_null(&iax_devices, + struct iax_device, list); + if (!found_device) { + pr_warn("%s: couldn't find any iax devices with wqs!\n", __func__); + goto out; + } + cur_iax = 0; + pr_debug("%s: getting wq from only iax_device %p (%d)\n", __func__, found_device, cur_iax); + } + + list_for_each_entry(iax_wq, &found_device->wqs, list) { + /* Prefer unused wq but use if we can't find one */ + if (idxd_wq_refcount(iax_wq->wq) > 0) { + idxd_wq_get(iax_wq->wq); + bkup_wq = iax_wq->wq; + cur_bkup = cur_wq; + } else { + pr_debug("%s: returning unused wq %p (%d) from iax device %p (%d)\n", __func__, iax_wq->wq, cur_wq, found_device, cur_iax); + found_wq = iax_wq->wq; + goto out; + } + cur_wq++; + } + + if (bkup_wq) { + pr_debug("%s: returning used wq %p (%d) from iax device %p (%d)\n", __func__, bkup_wq, cur_bkup, found_device, cur_iax); + idxd_wq_get(bkup_wq); + found_wq = bkup_wq; + goto out; + } + +out: + spin_unlock(&iax_devices_lock); + + return found_wq; +} + +static inline int check_completion(struct iax_completion_record *comp, + bool compress) +{ + char *op_str = compress ? 
"compress" : "decompress"; + int ret = 0; + + while (!comp->status) + cpu_relax(); + + if (comp->status != IAX_COMP_SUCCESS) { + if (comp->status == IAX_ERROR_WATCHDOG_EXPIRED) { + ret = -ETIMEDOUT; + pr_warn("%s: %s timed out, size=0x%x\n", + __func__, op_str, comp->output_size); + goto out; + } + + if (comp->status == IAX_ANALYTICS_ERROR && + comp->error_code == IAX_ERROR_COMP_BUF_OVERFLOW && + compress == true) { + ret = -E2BIG; + pr_debug("%s: compressed size > uncompressed size, not compressing, size=0x%x\n", __func__, comp->output_size); + goto out; + } + + ret = -EINVAL; + pr_err("%s: iax %s status=0x%x, error=0x%x, size=0x%x\n", + __func__, op_str, ret, comp->error_code, comp->output_size); + print_hex_dump(KERN_INFO, "cmp-rec: ", DUMP_PREFIX_OFFSET, 8, 1, comp, 64, 0); + goto out; + } +out: + return ret; +} + +static int iax_compress(struct crypto_tfm *tfm, + const u8 *src, unsigned int slen, + u8 *dst, unsigned int *dlen) +{ + struct idxd_desc *idxd_desc; + struct iax_hw_desc *desc; + u32 compression_crc; + struct idxd_wq *wq; + int ret = 0; + + wq = *per_cpu_ptr(wq_table, smp_processor_id()); + if (!wq) { + pr_err("%s: no wq configured for cpu=%d\n", __func__, smp_processor_id()); + return -ENODEV; + } + + pr_debug("%s: using wq for cpu=%d = wq %p\n", __func__, smp_processor_id(), wq); + + idxd_desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK); + if (IS_ERR(idxd_desc)) { + pr_err("%s: idxd descriptor allocation failed\n", __func__); + pr_warn("%s: iax compress failed: ret=%ld\n", __func__, PTR_ERR(desc)); + + return PTR_ERR(idxd_desc); + } + desc = idxd_desc->iax_hw; + + desc->flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | + IDXD_OP_FLAG_RD_SRC2_AECS | IDXD_OP_FLAG_CC; + desc->opcode = IAX_OPCODE_COMPRESS; + desc->src2_addr = (u64)(u64 *)&aecs_table; + desc->src2_size = sizeof(struct aecs_table_record); + desc->compr_flags = IAX_COMP_FLAGS; + desc->priv = 1; + + desc->src1_addr = (u64)src; + desc->src1_size = slen; + desc->src2_addr = (u64)(u64 *)&aecs_table; + desc->dst_addr = (u64)dst; + desc->max_dst_size = *dlen; + desc->completion_addr = (u64)idxd_desc->iax_completion; + + ret = idxd_submit_desc(wq, idxd_desc); + if (ret) { + pr_warn("%s: submit_desc failed ret=%d\n", __func__, ret); + goto err; + } + + ret = check_completion(idxd_desc->iax_completion, true); + if (ret) { + pr_warn("%s: check_completion failed ret=%d\n", __func__, ret); + goto err; + } + + *dlen = idxd_desc->iax_completion->output_size; + + idxd_free_desc(wq, idxd_desc); + + if (!iax_verify_compress) + goto out; + + compression_crc = idxd_desc->iax_completion->crc; + + idxd_desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK); + if (IS_ERR(idxd_desc)) { + pr_err("%s: idxd descriptor allocation failed\n", __func__); + pr_warn("%s: iax compress (verify) failed: ret=%ld\n", __func__, PTR_ERR(desc)); + + return PTR_ERR(desc); + } + desc = idxd_desc->iax_hw; + + /* Verify (optional) - decompress and check crc, suppress dest write */ + + desc->flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_CC; + desc->opcode = IAX_OPCODE_DECOMPRESS; + desc->max_dst_size = PAGE_SIZE; + desc->decompr_flags = IAX_DECOMP_FLAGS | IAX_DECOMP_SUPPRESS_OUTPUT; + desc->priv = 1; + + desc->src1_addr = (u64)dst; + desc->src1_size = *dlen; + desc->dst_addr = (u64)src; + desc->max_dst_size = slen; + desc->completion_addr = (u64)idxd_desc->iax_completion; + + ret = idxd_submit_desc(wq, idxd_desc); + if (ret) { + pr_warn("%s: submit_desc (verify) failed ret=%d\n", __func__, ret); + goto err; + } + + ret = check_completion(idxd_desc->iax_completion, 
true); + if (ret) { + pr_warn("%s: check_completion (verify) failed ret=%d\n", __func__, ret); + goto err; + } + + if (compression_crc != idxd_desc->iax_completion->crc) { + ret = -EINVAL; + pr_err("%s: iax comp/decomp crc mismatch: comp=0x%x, decomp=0x%x\n", __func__, + compression_crc, idxd_desc->iax_completion->crc); + print_hex_dump(KERN_INFO, "cmp-rec: ", DUMP_PREFIX_OFFSET, 8, 1, idxd_desc->iax_completion, 64, 0); + goto err; + } + + idxd_free_desc(wq, idxd_desc); +out: + return ret; +err: + idxd_free_desc(wq, idxd_desc); + pr_warn("iax compress failed: ret=%d\n", ret); + + goto out; +} + +static int iax_decompress(struct crypto_tfm *tfm, + const u8 *src, unsigned int slen, + u8 *dst, unsigned int *dlen) +{ + struct idxd_desc *idxd_desc; + struct iax_hw_desc *desc; + struct idxd_wq *wq; + int ret = 0; + + wq = *per_cpu_ptr(wq_table, smp_processor_id()); + if (!wq) { + pr_err("%s: no wq configured for cpu=%d\n", __func__, smp_processor_id()); + return -ENODEV; + } + + pr_debug("%s: using wq for cpu=%d = wq %p\n", __func__, smp_processor_id(), wq); + + idxd_desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK); + if (IS_ERR(idxd_desc)) { + pr_err("%s: idxd descriptor allocation failed\n", __func__); + pr_warn("%s: iax decompress failed: ret=%ld\n", __func__, PTR_ERR(desc)); + + return PTR_ERR(desc); + } + desc = idxd_desc->iax_hw; + + desc->flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_CC; + desc->opcode = IAX_OPCODE_DECOMPRESS; + desc->max_dst_size = PAGE_SIZE; + desc->decompr_flags = IAX_DECOMP_FLAGS; + desc->priv = 1; + + desc->src1_addr = (u64)src; + desc->dst_addr = (u64)dst; + desc->max_dst_size = *dlen; + desc->src1_size = slen; + desc->completion_addr = (u64)idxd_desc->iax_completion; + + ret = idxd_submit_desc(wq, idxd_desc); + if (ret) { + pr_warn("%s: submit_desc failed ret=%d\n", __func__, ret); + goto err; + } + + ret = check_completion(idxd_desc->iax_completion, true); + if (ret) { + pr_warn("%s: check_completion failed ret=%d\n", __func__, ret); + goto err; + } + + *dlen = idxd_desc->iax_completion->output_size; + + idxd_free_desc(wq, idxd_desc); +out: + return ret; +err: + idxd_free_desc(wq, idxd_desc); + pr_warn("iax decompress failed: ret=%d\n", ret); + + goto out; +} + +static int iax_comp_compress(struct crypto_tfm *tfm, + const u8 *src, unsigned int slen, + u8 *dst, unsigned int *dlen) +{ + int ret; + + pr_debug("%s: src %p, slen %d, dst %p, dlen %u\n", + __func__, src, slen, dst, *dlen); + + ret = iax_compress(tfm, src, slen, dst, dlen); + if (ret != 0) + pr_warn("synchronous compress failed ret=%d\n", ret); + + return ret; +} + +static int iax_comp_decompress(struct crypto_tfm *tfm, + const u8 *src, unsigned int slen, + u8 *dst, unsigned int *dlen) +{ + int ret; + + pr_debug("%s: src %p, slen %d, dst %p, dlen %u\n", + __func__, src, slen, dst, *dlen); + + ret = iax_decompress(tfm, src, slen, dst, dlen); + if (ret != 0) + pr_warn("synchronous decompress failed ret=%d\n", ret); + + return ret; +} + +static struct crypto_alg iax_comp_deflate = { + .cra_name = "deflate", + .cra_driver_name = "iax_crypto", + .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, + .cra_priority = IAX_ALG_PRIORITY, + .cra_module = THIS_MODULE, + .cra_u = { + .compress = { + .coa_compress = iax_comp_compress, + .coa_decompress = iax_comp_decompress + } + } +}; + +static int iax_register_compression_device(void) +{ + int ret; + + ret = crypto_register_alg(&iax_comp_deflate); + if (ret < 0) { + pr_err("deflate algorithm registration failed\n"); + return ret; + } + + return ret; +} + +static void 
iax_unregister_compression_device(void) +{ + crypto_unregister_alg(&iax_comp_deflate); +} + +static void iax_set_aecs(void) +{ + u32 offset; + u32 bfinal = 1; + + /* Configure aecs table using fixed Huffman table */ + aecs_table.crc = 0; + aecs_table.xor_checksum = 0; + offset = aecs_table.num_output_accum_bits / 8; + aecs_table.output_accum[offset] = DYNAMIC_HDR | bfinal; + aecs_table.num_output_accum_bits = DYNAMIC_HDR_SIZE; + + /* Add Huffman table to aecs */ + memcpy(aecs_table.ll_sym, fixed_ll_sym, sizeof(fixed_ll_sym)); + memcpy(aecs_table.d_sym, fixed_d_sym, sizeof(fixed_d_sym)); +} + +static void rebalance_wq_table(void) +{ + int node, cpu, iax; + struct idxd_wq *wq; + + if (nr_iax == 0) + return; + + pr_debug("%s: nr_nodes=%d, nr_cpus %d, nr_iax %d, cpus_per_iax %d\n", + __func__, nr_nodes, nr_cpus, nr_iax, cpus_per_iax); + + for (cpu = 0; cpu < nr_cpus; cpu++) { + iax = cpu_to_iax(cpu); + pr_debug("%s: iax=%d\n", __func__, iax); + + BUG_ON(iax == -1); + + wq = request_iax_wq(iax); + if (!wq) { + pr_err("could not get wq for iax %d!\n", iax); + return; + } + + *per_cpu_ptr(wq_table, cpu) = wq; + pr_debug("%s: assigned wq for cpu=%d, node=%d = wq %p\n", __func__, cpu, node, wq); + } +} + +static int iax_crypto_probe(struct idxd_dev *idxd_dev) +{ + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + struct idxd_device *idxd = wq->idxd; + struct idxd_driver_data *data = idxd->data; + int ret = 0; + + if (idxd->state != IDXD_DEV_ENABLED) + return -ENXIO; + + if (data->type != IDXD_TYPE_IAX) + return -ENODEV; + + mutex_lock(&wq->wq_lock); + + wq->type = IDXD_WQT_KERNEL; + + ret = __drv_enable_wq(wq); + if (ret < 0) { + pr_warn("%s: enable wq %d failed: %d\n", __func__, wq->id, ret); + ret = -ENXIO; + goto err; + } + + ret = idxd_wq_alloc_resources(wq); + if (ret < 0) { + idxd->cmd_status = IDXD_SCMD_WQ_RES_ALLOC_ERR; + pr_warn("%s: WQ resource alloc failed: ret=%d\n", __func__, ret); + goto err_alloc; + } + + ret = idxd_wq_init_percpu_ref(wq); + if (ret < 0) { + idxd->cmd_status = IDXD_SCMD_PERCPU_ERR; + pr_warn("%s: WQ percpu_ref setup failed: ret=%d\n", __func__, ret); + goto err_ref; + } + + ret = save_iax_wq(wq); + if (ret) + goto err_save; + + rebalance_wq_table(); +out: + mutex_unlock(&wq->wq_lock); + + return ret; + +err_save: + __idxd_wq_quiesce(wq); +err_ref: + idxd_wq_free_resources(wq); +err_alloc: + __drv_disable_wq(wq); +err: + wq->type = IDXD_WQT_NONE; + + goto out; +} + +static void iax_crypto_remove(struct idxd_dev *idxd_dev) +{ + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + + mutex_lock(&wq->wq_lock); + + __idxd_wq_quiesce(wq); + remove_iax_wq(wq); + __drv_disable_wq(wq); + idxd_wq_free_resources(wq); + wq->type = IDXD_WQT_NONE; + rebalance_wq_table(); + + mutex_unlock(&wq->wq_lock); +} + +static enum idxd_dev_type dev_types[] = { + IDXD_DEV_WQ, + IDXD_DEV_NONE, +}; + +static struct idxd_device_driver iax_crypto_driver = { + .probe = iax_crypto_probe, + .remove = iax_crypto_remove, + .name = "crypto", + .type = dev_types, +}; + +static int __init iax_crypto_init_module(void) +{ + int ret = 0; + + nr_cpus = num_online_cpus(); + nr_nodes = num_online_nodes(); + + iax_set_aecs(); + + wq_table = alloc_percpu(struct idxd_wq *); + if (!wq_table) + return -ENOMEM; + + ret = __idxd_driver_register(&iax_crypto_driver, THIS_MODULE, + KBUILD_MODNAME); + if (ret) { + pr_err("IAX wq sub-driver registration failed\n"); + goto err_driver_register; + } + + ret = iax_register_compression_device(); + if (ret < 0) { + pr_err("IAX compression device registration failed\n"); + goto 
err_crypto_register; + } + + pr_info("%s: initialized\n", __func__); +out: + return ret; + +err_crypto_register: + idxd_driver_unregister(&iax_crypto_driver); +err_driver_register: + free_percpu(wq_table); + + goto out; +} + +static void __exit iax_crypto_cleanup_module(void) +{ + idxd_driver_unregister(&iax_crypto_driver); + iax_unregister_compression_device(); + free_percpu(wq_table); + free_iax_devices(); + pr_info("%s: cleaned up\n", __func__); +} + +MODULE_IMPORT_NS(IDXD); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_IDXD_DEVICE(0); +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("IAX Crypto Driver"); + +module_init(iax_crypto_init_module); +module_exit(iax_crypto_cleanup_module); -- Gitee From e4a03912905ce8c3023c6b1fc9f11ac4909aeca1 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 21 Sep 2021 15:46:57 -0700 Subject: [PATCH 1781/2161] crypto: iax - Add async interface support commit 631a333ca12110d17c59005eccb3b26cf06bd3d1 Intel-BKC. Add support for the crypto async interface, so that the IAX crypto hardware can be used with facilities that require it, such as zswap. Signed-off-by: Tom Zanussi Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/crypto/iax/iax_crypto_main.c | 72 ++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/drivers/crypto/iax/iax_crypto_main.c b/drivers/crypto/iax/iax_crypto_main.c index 5402ce5029a5..5d4086d5ee04 100644 --- a/drivers/crypto/iax/iax_crypto_main.c +++ b/drivers/crypto/iax/iax_crypto_main.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "registers.h" #include "idxd.h" @@ -658,6 +659,65 @@ static struct crypto_alg iax_comp_deflate = { } }; +static int iax_comp_acompress(struct acomp_req *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + void *src, *dst; + int ret; + + src = kmap_atomic(sg_page(req->src)) + req->src->offset; + dst = kmap_atomic(sg_page(req->dst)) + req->dst->offset; + + pr_debug("%s: src %p (offset %d), slen %d, dst %p (offset %d), dlen %u\n", + __func__, src, req->src->offset, req->slen, + dst, req->dst->offset, req->dlen); + + ret = iax_compress(tfm, (const u8 *)src, req->slen, (u8 *)dst, &req->dlen); + + kunmap_atomic(src); + kunmap_atomic(dst); + + if (ret != 0) + pr_warn("asynchronous compress failed ret=%d\n", ret); + + return ret; +} + +static int iax_comp_adecompress(struct acomp_req *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + void *src, *dst; + int ret; + + src = kmap_atomic(sg_page(req->src)) + req->src->offset; + dst = kmap_atomic(sg_page(req->dst)) + req->dst->offset; + + pr_debug("%s: src %p (offset %d), slen %d, dst %p (offset %d), dlen %u\n", + __func__, src, req->src->offset, req->slen, + dst, req->dst->offset, req->dlen); + + ret = iax_decompress(tfm, (const u8 *)src, req->slen, (u8 *)dst, &req->dlen); + + kunmap_atomic(src); + kunmap_atomic(dst); + + if (ret != 0) + pr_warn("asynchronous decompress failed ret=%d\n", ret); + + return ret; +} + +static struct acomp_alg iax_acomp_deflate = { + .compress = iax_comp_acompress, + .decompress = iax_comp_adecompress, + .base = { + .cra_name = "deflate", + .cra_driver_name = "iax_crypto", + .cra_module = THIS_MODULE, + .cra_priority = IAX_ALG_PRIORITY, + } +}; + static int iax_register_compression_device(void) { int ret; @@ -668,12 +728,24 @@ static int iax_register_compression_device(void) return ret; } + ret = crypto_register_acomp(&iax_acomp_deflate); + if (ret) { + pr_err("deflate algorithm acomp registration failed (%d)\n", ret); + goto err_unregister_alg_deflate; + } + + return ret; + 
+err_unregister_alg_deflate: + crypto_unregister_alg(&iax_comp_deflate); + return ret; } static void iax_unregister_compression_device(void) { crypto_unregister_alg(&iax_comp_deflate); + crypto_unregister_acomp(&iax_acomp_deflate); } static void iax_set_aecs(void) -- Gitee From 322e097d69a7bf81f365e0dd0bef360224193be9 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 21 Sep 2021 15:59:56 -0700 Subject: [PATCH 1782/2161] crypto: iax - Add IAX crypto stats commit 813d265bb3addde033d48748f8c6432351ff7c3e Intel-BKC. Add support for optional debugfs statistics support. This is enabled by the kernel config item: IAX_CRYPTO_STATS When enabled, the IAX crypto driver will generate statistics which can be accessed at: /sys/kernel/debug/iax-crypto/ [ Based on work originally by George Powley and Jing Lin. ] Co-developed-by: Kyung Min Park Signed-off-by: Kyung Min Park Signed-off-by: Tom Zanussi Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/crypto/iax/Kconfig | 7 + drivers/crypto/iax/Makefile | 2 + drivers/crypto/iax/iax_crypto.h | 41 ++++ drivers/crypto/iax/iax_crypto_main.c | 80 ++++++-- drivers/crypto/iax/iax_crypto_stats.c | 269 ++++++++++++++++++++++++++ drivers/crypto/iax/iax_crypto_stats.h | 54 ++++++ 6 files changed, 438 insertions(+), 15 deletions(-) create mode 100644 drivers/crypto/iax/iax_crypto_stats.c create mode 100644 drivers/crypto/iax/iax_crypto_stats.h diff --git a/drivers/crypto/iax/Kconfig b/drivers/crypto/iax/Kconfig index e546d5952486..20bce91a9dbf 100644 --- a/drivers/crypto/iax/Kconfig +++ b/drivers/crypto/iax/Kconfig @@ -6,3 +6,10 @@ config CRYPTO_DEV_IAX_CRYPTO help This driver supports Intel analytics accelerator hardware. The module will be called iax_crypto. + +config CRYPTO_DEV_IAX_CRYPTO_STATS + bool "IAX Crypto Stats" + depends on CRYPTO_DEV_IAX_CRYPTO + default n + help + Enable IAX crypto stats diff --git a/drivers/crypto/iax/Makefile b/drivers/crypto/iax/Makefile index d0f3c1af8c73..532ac9002f9c 100644 --- a/drivers/crypto/iax/Makefile +++ b/drivers/crypto/iax/Makefile @@ -7,3 +7,5 @@ ccflags-y += -I $(srctree)/drivers/dma/idxd -DDEFAULT_SYMBOL_NAMESPACE=IDXD obj-$(CONFIG_CRYPTO_DEV_IAX_CRYPTO) := iax_crypto.o iax_crypto-y := iax_crypto_main.o + +iax_crypto-$(CONFIG_CRYPTO_DEV_IAX_CRYPTO_STATS) += iax_crypto_stats.o diff --git a/drivers/crypto/iax/iax_crypto.h b/drivers/crypto/iax/iax_crypto.h index 30771be44a0b..30137c27aef5 100644 --- a/drivers/crypto/iax/iax_crypto.h +++ b/drivers/crypto/iax/iax_crypto.h @@ -7,6 +7,7 @@ #include #include #include +#include "iax_crypto_stats.h" #undef pr_fmt #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -37,6 +38,32 @@ IAX_DECOMP_CHECK_FOR_EOB | \ IAX_DECOMP_STOP_ON_EOB) +struct iax_wq { + struct list_head list; + struct idxd_wq *wq; + + struct iax_device *iax_device; + + u64 comp_calls; + u64 comp_bytes; + u64 decomp_calls; + u64 decomp_bytes; +}; + +/* Representation of IAX device with wqs, populated by probe */ +struct iax_device { + struct list_head list; + struct idxd_device *idxd; + + int n_wq; + struct list_head wqs; + + u64 comp_calls; + u64 comp_bytes; + u64 decomp_calls; + u64 decomp_bytes; +}; + /* * Analytics Engine Configuration and State (AECS) contains parameters and * internal state of the analytics engine. 
@@ -53,4 +80,18 @@ struct aecs_table_record { u32 d_sym[30]; u32 reserved_padding[2]; }; + +#if defined(CONFIG_CRYPTO_DEV_IAX_CRYPTO_STATS) +void global_stats_show(struct seq_file *m); +void device_stats_show(struct seq_file *m, struct iax_device *iax_device); +void reset_iax_crypto_stats(void); +void reset_device_stats(struct iax_device *iax_device); + +#else +static inline void global_stats_show(struct seq_file *m) {} +static inline void device_stats_show(struct seq_file *m, struct iax_device *iax_device) {} +static inline void reset_iax_crypto_stats(void) {} +static inline void reset_device_stats(struct iax_device *iax_device) {} +#endif + #endif diff --git a/drivers/crypto/iax/iax_crypto_main.c b/drivers/crypto/iax/iax_crypto_main.c index 5d4086d5ee04..f139a6b23981 100644 --- a/drivers/crypto/iax/iax_crypto_main.c +++ b/drivers/crypto/iax/iax_crypto_main.c @@ -34,24 +34,40 @@ module_param_named(iax_verify_compress, iax_verify_compress, bool, 0644); MODULE_PARM_DESC(iax_verify_compress, "Verify IAX compression (value = 1) or not (value = 0)"); -struct iax_wq { - struct list_head list; - struct idxd_wq *wq; +static LIST_HEAD(iax_devices); +static DEFINE_SPINLOCK(iax_devices_lock); - struct iax_device *iax_device; -}; +int wq_stats_show(struct seq_file *m, void *v) +{ + struct iax_device *iax_device; -/* Representation of IAX device with wqs, populated by probe */ -struct iax_device { - struct list_head list; - struct idxd_device *idxd; + spin_lock(&iax_devices_lock); - int n_wq; - struct list_head wqs; -}; + global_stats_show(m); -static LIST_HEAD(iax_devices); -static DEFINE_SPINLOCK(iax_devices_lock); + list_for_each_entry(iax_device, &iax_devices, list) + device_stats_show(m, iax_device); + + spin_unlock(&iax_devices_lock); + + return 0; +} + +int iax_crypto_stats_reset(void *data, u64 value) +{ + struct iax_device *iax_device; + + reset_iax_crypto_stats(); + + spin_lock(&iax_devices_lock); + + list_for_each_entry(iax_device, &iax_devices, list) + reset_device_stats(iax_device); + + spin_unlock(&iax_devices_lock); + + return 0; +} static struct iax_device *iax_device_alloc(void) { @@ -70,8 +86,10 @@ static void iax_device_free(struct iax_device *iax_device) { struct iax_wq *iax_wq, *next; - list_for_each_entry_safe(iax_wq, next, &iax_device->wqs, list) + list_for_each_entry_safe(iax_wq, next, &iax_device->wqs, list) { list_del(&iax_wq->list); + kfree(iax_wq); // zzzz do this in original code too + } kfree(iax_device); } @@ -414,6 +432,7 @@ static inline int check_completion(struct iax_completion_record *comp, ret = -ETIMEDOUT; pr_warn("%s: %s timed out, size=0x%x\n", __func__, op_str, comp->output_size); + update_completion_timeout_errs(); goto out; } @@ -422,6 +441,7 @@ static inline int check_completion(struct iax_completion_record *comp, compress == true) { ret = -E2BIG; pr_debug("%s: compressed size > uncompressed size, not compressing, size=0x%x\n", __func__, comp->output_size); + update_completion_comp_buf_overflow_errs(); goto out; } @@ -429,6 +449,8 @@ static inline int check_completion(struct iax_completion_record *comp, pr_err("%s: iax %s status=0x%x, error=0x%x, size=0x%x\n", __func__, op_str, ret, comp->error_code, comp->output_size); print_hex_dump(KERN_INFO, "cmp-rec: ", DUMP_PREFIX_OFFSET, 8, 1, comp, 64, 0); + update_completion_einval_errs(); + goto out; } out: @@ -498,6 +520,12 @@ static int iax_compress(struct crypto_tfm *tfm, compression_crc = idxd_desc->iax_completion->crc; + /* Update stats */ + update_total_comp_calls(); + update_total_comp_bytes_out(*dlen); + 
update_wq_comp_calls(wq); + update_wq_comp_bytes(wq, *dlen); + idxd_desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK); if (IS_ERR(idxd_desc)) { pr_err("%s: idxd descriptor allocation failed\n", __func__); @@ -604,6 +632,12 @@ static int iax_decompress(struct crypto_tfm *tfm, *dlen = idxd_desc->iax_completion->output_size; idxd_free_desc(wq, idxd_desc); + + /* Update stats */ + update_total_decomp_calls(); + update_total_decomp_bytes_in(slen); + update_wq_decomp_calls(wq); + update_wq_decomp_bytes(wq, slen); out: return ret; err: @@ -617,12 +651,15 @@ static int iax_comp_compress(struct crypto_tfm *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen) { + u64 start_time_ns; int ret; pr_debug("%s: src %p, slen %d, dst %p, dlen %u\n", __func__, src, slen, dst, *dlen); + start_time_ns = ktime_get_ns(); ret = iax_compress(tfm, src, slen, dst, dlen); + update_max_comp_delay_ns(start_time_ns); if (ret != 0) pr_warn("synchronous compress failed ret=%d\n", ret); @@ -633,12 +670,15 @@ static int iax_comp_decompress(struct crypto_tfm *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen) { + u64 start_time_ns; int ret; pr_debug("%s: src %p, slen %d, dst %p, dlen %u\n", __func__, src, slen, dst, *dlen); + start_time_ns = ktime_get_ns(); ret = iax_decompress(tfm, src, slen, dst, dlen); + update_max_decomp_delay_ns(start_time_ns); if (ret != 0) pr_warn("synchronous decompress failed ret=%d\n", ret); @@ -662,6 +702,7 @@ static struct crypto_alg iax_comp_deflate = { static int iax_comp_acompress(struct acomp_req *req) { struct crypto_tfm *tfm = req->base.tfm; + u64 start_time_ns; void *src, *dst; int ret; @@ -672,7 +713,9 @@ static int iax_comp_acompress(struct acomp_req *req) __func__, src, req->src->offset, req->slen, dst, req->dst->offset, req->dlen); + start_time_ns = ktime_get_ns(); ret = iax_compress(tfm, (const u8 *)src, req->slen, (u8 *)dst, &req->dlen); + update_max_acomp_delay_ns(start_time_ns); kunmap_atomic(src); kunmap_atomic(dst); @@ -686,6 +729,7 @@ static int iax_comp_acompress(struct acomp_req *req) static int iax_comp_adecompress(struct acomp_req *req) { struct crypto_tfm *tfm = req->base.tfm; + u64 start_time_ns; void *src, *dst; int ret; @@ -696,7 +740,9 @@ static int iax_comp_adecompress(struct acomp_req *req) __func__, src, req->src->offset, req->slen, dst, req->dst->offset, req->dlen); + start_time_ns = ktime_get_ns(); ret = iax_decompress(tfm, (const u8 *)src, req->slen, (u8 *)dst, &req->dlen); + update_max_decomp_delay_ns(start_time_ns); kunmap_atomic(src); kunmap_atomic(dst); @@ -907,6 +953,9 @@ static int __init iax_crypto_init_module(void) goto err_crypto_register; } + if (iax_crypto_debugfs_init()) + pr_warn("debugfs init failed, stats not available\n"); + pr_info("%s: initialized\n", __func__); out: return ret; @@ -921,6 +970,7 @@ static int __init iax_crypto_init_module(void) static void __exit iax_crypto_cleanup_module(void) { + iax_crypto_debugfs_cleanup(); idxd_driver_unregister(&iax_crypto_driver); iax_unregister_compression_device(); free_percpu(wq_table); diff --git a/drivers/crypto/iax/iax_crypto_stats.c b/drivers/crypto/iax/iax_crypto_stats.c new file mode 100644 index 000000000000..525a20d12cc5 --- /dev/null +++ b/drivers/crypto/iax/iax_crypto_stats.c @@ -0,0 +1,269 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2021 Intel Corporation. All rights rsvd. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../dma/idxd/idxd.h" +#include +#include +#include "iax_crypto.h" +#include "iax_crypto_stats.h" + +static u64 total_comp_calls; +static u64 total_decomp_calls; +static u64 max_comp_delay_ns; +static u64 max_decomp_delay_ns; +static u64 max_acomp_delay_ns; +static u64 max_adecomp_delay_ns; +static u64 total_comp_bytes_out; +static u64 total_decomp_bytes_in; +static u64 total_completion_einval_errors; +static u64 total_completion_timeout_errors; +static u64 total_completion_comp_buf_overflow_errors; + +static struct dentry *iax_crypto_debugfs_root; + +void update_total_comp_calls(void) +{ + total_comp_calls++; +} + +void update_total_comp_bytes_out(int n) +{ + total_comp_bytes_out += n; +} + +void update_total_decomp_calls(void) +{ + total_decomp_calls++; +} + +void update_total_decomp_bytes_in(int n) +{ + total_decomp_bytes_in += n; +} + +void update_completion_einval_errs(void) +{ + total_completion_einval_errors++; +} + +void update_completion_timeout_errs(void) +{ + total_completion_timeout_errors++; +} + +void update_completion_comp_buf_overflow_errs(void) +{ + total_completion_comp_buf_overflow_errors++; +} + +void update_max_comp_delay_ns(u64 start_time_ns) +{ + u64 time_diff; + + time_diff = ktime_get_ns() - start_time_ns; + + if (time_diff > max_comp_delay_ns) + max_comp_delay_ns = time_diff; +} + +void update_max_decomp_delay_ns(u64 start_time_ns) +{ + u64 time_diff; + + time_diff = ktime_get_ns() - start_time_ns; + + if (time_diff > max_decomp_delay_ns) + max_decomp_delay_ns = time_diff; +} + +void update_max_acomp_delay_ns(u64 start_time_ns) +{ + u64 time_diff; + + time_diff = ktime_get_ns() - start_time_ns; + + if (time_diff > max_acomp_delay_ns) + max_acomp_delay_ns = time_diff; +} + +void update_max_adecomp_delay_ns(u64 start_time_ns) +{ + u64 time_diff; + + time_diff = ktime_get_ns() - start_time_ns; + + if (time_diff > max_adecomp_delay_ns) + + max_adecomp_delay_ns = time_diff; +} + +void update_wq_comp_calls(struct idxd_wq *idxd_wq) +{ + struct iax_wq *wq = idxd_wq->private_data; + + wq->comp_calls++; + wq->iax_device->comp_calls++; +} + +void update_wq_comp_bytes(struct idxd_wq *idxd_wq, int n) +{ + struct iax_wq *wq = idxd_wq->private_data; + + wq->comp_bytes += n; + wq->iax_device->comp_bytes += n; +} + +void update_wq_decomp_calls(struct idxd_wq *idxd_wq) +{ + struct iax_wq *wq = idxd_wq->private_data; + + wq->decomp_calls++; + wq->iax_device->decomp_calls++; +} + +void update_wq_decomp_bytes(struct idxd_wq *idxd_wq, int n) +{ + struct iax_wq *wq = idxd_wq->private_data; + + wq->decomp_bytes += n; + wq->iax_device->decomp_bytes += n; +} + +void reset_iax_crypto_stats(void) +{ + total_comp_calls = 0; + total_decomp_calls = 0; + max_comp_delay_ns = 0; + max_decomp_delay_ns = 0; + max_acomp_delay_ns = 0; + max_adecomp_delay_ns = 0; + total_comp_bytes_out = 0; + total_decomp_bytes_in = 0; + total_completion_einval_errors = 0; + total_completion_timeout_errors = 0; + total_completion_comp_buf_overflow_errors = 0; +} + +static void reset_wq_stats(struct iax_wq *wq) +{ + wq->comp_calls = 0; + wq->comp_bytes = 0; + wq->decomp_calls = 0; + wq->decomp_bytes = 0; +} + +void reset_device_stats(struct iax_device *iax_device) +{ + struct iax_wq *iax_wq; + + iax_device->comp_calls = 0; + iax_device->comp_bytes = 0; + iax_device->decomp_calls = 0; + iax_device->decomp_bytes = 0; + + list_for_each_entry(iax_wq, &iax_device->wqs, list) + reset_wq_stats(iax_wq); 
+} + +static void wq_show(struct seq_file *m, struct iax_wq *iax_wq) +{ + seq_printf(m, " name: %s\n", iax_wq->wq->name); + seq_printf(m, " comp_calls: %llu\n", iax_wq->comp_calls); + seq_printf(m, " comp_bytes: %llu\n", iax_wq->comp_bytes); + seq_printf(m, " decomp_calls: %llu\n", iax_wq->decomp_calls); + seq_printf(m, " decomp_bytes: %llu\n\n", iax_wq->decomp_bytes); +} + +void device_stats_show(struct seq_file *m, struct iax_device *iax_device) +{ + struct iax_wq *iax_wq; + + seq_puts(m, "iax device:\n"); + seq_printf(m, " id: %d\n", iax_device->idxd->id); + seq_printf(m, " n_wqs: %d\n", iax_device->n_wq); + seq_printf(m, " comp_calls: %llu\n", iax_device->comp_calls); + seq_printf(m, " comp_bytes: %llu\n", iax_device->comp_bytes); + seq_printf(m, " decomp_calls: %llu\n", iax_device->decomp_calls); + seq_printf(m, " decomp_bytes: %llu\n", iax_device->decomp_bytes); + seq_puts(m, " wqs:\n"); + + list_for_each_entry(iax_wq, &iax_device->wqs, list) + wq_show(m, iax_wq); +} + +void global_stats_show(struct seq_file *m) +{ + seq_puts(m, "global stats:\n"); + seq_printf(m, " total_comp_calls: %llu\n", total_comp_calls); + seq_printf(m, " total_decomp_calls: %llu\n", total_decomp_calls); + seq_printf(m, " total_comp_bytes_out: %llu\n", total_comp_bytes_out); + seq_printf(m, " total_decomp_bytes_in: %llu\n", total_decomp_bytes_in); + seq_printf(m, " total_completion_einval_errors: %llu\n", total_completion_einval_errors); + seq_printf(m, " total_completion_timeout_errors: %llu\n", total_completion_timeout_errors); + seq_printf(m, " total_completion_comp_buf_overflow_errors: %llu\n\n", total_completion_comp_buf_overflow_errors); +} + +static int wq_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, wq_stats_show, file); +} + +const struct file_operations wq_stats_fops = { + .open = wq_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +DEFINE_DEBUGFS_ATTRIBUTE(wq_stats_reset_fops, NULL, iax_crypto_stats_reset, "%llu\n"); + +int __init iax_crypto_debugfs_init(void) +{ + if (!debugfs_initialized()) + return -ENODEV; + + iax_crypto_debugfs_root = debugfs_create_dir("iax_crypto", NULL); + if (!iax_crypto_debugfs_root) + return -ENOMEM; + + debugfs_create_u64("max_comp_delay_ns", 0644, + iax_crypto_debugfs_root, &max_comp_delay_ns); + debugfs_create_u64("max_decomp_delay_ns", 0644, + iax_crypto_debugfs_root, &max_decomp_delay_ns); + debugfs_create_u64("max_acomp_delay_ns", 0644, + iax_crypto_debugfs_root, &max_comp_delay_ns); + debugfs_create_u64("max_adecomp_delay_ns", 0644, + iax_crypto_debugfs_root, &max_decomp_delay_ns); + debugfs_create_u64("total_comp_calls", 0644, + iax_crypto_debugfs_root, &total_comp_calls); + debugfs_create_u64("total_decomp_calls", 0644, + iax_crypto_debugfs_root, &total_decomp_calls); + debugfs_create_u64("total_comp_bytes_out", 0644, + iax_crypto_debugfs_root, &total_comp_bytes_out); + debugfs_create_u64("total_decomp_bytes_in", 0644, + iax_crypto_debugfs_root, &total_decomp_bytes_in); + debugfs_create_file("wq_stats", 0644, iax_crypto_debugfs_root, NULL, + &wq_stats_fops); + debugfs_create_file("wq_stats_reset", 0644, iax_crypto_debugfs_root, NULL, + &wq_stats_reset_fops); + + return 0; +} + +void __exit iax_crypto_debugfs_cleanup(void) +{ + debugfs_remove_recursive(iax_crypto_debugfs_root); +} + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/crypto/iax/iax_crypto_stats.h b/drivers/crypto/iax/iax_crypto_stats.h new file mode 100644 index 000000000000..524f444e58ff --- /dev/null +++ 
b/drivers/crypto/iax/iax_crypto_stats.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2021 Intel Corporation. All rights rsvd. */ + +#ifndef __CRYPTO_DEV_IAX_CRYPTO_STATS_H__ +#define __CRYPTO_DEV_IAX_CRYPTO_STATS_H__ + +#if defined(CONFIG_CRYPTO_DEV_IAX_CRYPTO_STATS) +int iax_crypto_debugfs_init(void); +void iax_crypto_debugfs_cleanup(void); + +void update_total_comp_calls(void); +void update_total_comp_bytes_out(int n); +void update_total_decomp_calls(void); +void update_total_decomp_bytes_in(int n); +void update_max_comp_delay_ns(u64 start_time_ns); +void update_max_decomp_delay_ns(u64 start_time_ns); +void update_max_acomp_delay_ns(u64 start_time_ns); +void update_max_adecomp_delay_ns(u64 start_time_ns); +void update_completion_einval_errs(void); +void update_completion_timeout_errs(void); +void update_completion_comp_buf_overflow_errs(void); + +void update_wq_comp_calls(struct idxd_wq *idxd_wq); +void update_wq_comp_bytes(struct idxd_wq *idxd_wq, int n); +void update_wq_decomp_calls(struct idxd_wq *idxd_wq); +void update_wq_decomp_bytes(struct idxd_wq *idxd_wq, int n); + +int wq_stats_show(struct seq_file *m, void *v); +int iax_crypto_stats_reset(void *data, u64 value); + +#else +static inline int iax_crypto_debugfs_init(void) { return 0; } +static inline void iax_crypto_debugfs_cleanup(void) {} + +static inline void update_total_comp_calls(void) {} +static inline void update_total_comp_bytes_out(int n) {} +static inline void update_total_decomp_calls(void) {} +static inline void update_total_decomp_bytes_in(int n) {} +static inline void update_max_comp_delay_ns(u64 start_time_ns) {} +static inline void update_max_decomp_delay_ns(u64 start_time_ns) {} +static inline void update_max_acomp_delay_ns(u64 start_time_ns) {} +static inline void update_max_adecomp_delay_ns(u64 start_time_ns) {} +static inline void update_completion_einval_errs(void) {} +static inline void update_completion_timeout_errs(void) {} +static inline void update_completion_comp_buf_overflow_errs(void) {} + +static inline void update_wq_comp_calls(struct idxd_wq *idxd_wq) {} +static inline void update_wq_comp_bytes(struct idxd_wq *idxd_wq, int n) {} +static inline void update_wq_decomp_calls(struct idxd_wq *idxd_wq) {} +static inline void update_wq_decomp_bytes(struct idxd_wq *idxd_wq, int n) {} + +#endif // CONFIG_CRYPTO_DEV_IAX_CRYPTO_STATS + +#endif -- Gitee From e7d5329806d843a03293ef2a58d4054f337e8be0 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 25 Nov 2021 09:14:57 +0200 Subject: [PATCH 1783/2161] perf inject: Fix itrace space allowed for new attributes commit c29d9792607e67ed8a3f6e9db0d96836d885a8c5 upstream. The space allowed for new attributes can be too small if existing header information is large. That can happen, for example, if there are very many CPUs, due to having an event ID per CPU per event being stored in the header information. Fix by adding the existing header.data_offset. Also increase the extra space allowed to 8KiB and align to a 4KiB boundary for neatness. 
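For a sense of scale (the numbers here are illustrative, not taken from the patch): with one 8-byte event ID stored per CPU per event, a 256-CPU machine tracing four events already carries 256 * 4 * 8 = 8192 bytes of IDs in the header, so a fixed 4096-byte reservation cannot be enough. A minimal sketch of the new calculation, assuming roundup() with its usual kernel semantics:

	/* Illustrative values only. */
	u64 data_offset = 17000;	/* size of the existing header */
	u64 output_data_offset;

	/* 8 KiB of headroom on top of the current header, 4 KiB aligned. */
	output_data_offset = roundup(8192 + data_offset, 4096);
	/* 8192 + 17000 = 25192, rounded up to 28672 (7 * 4096). */
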
Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20211125071457.2066863-1-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/perf/builtin-inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 0d524ef3606d..ab65f563dc65 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -682,7 +682,7 @@ static int __cmd_inject(struct perf_inject *inject) inject->tool.ordered_events = true; inject->tool.ordering_requires_timestamps = true; /* Allow space in the header for new attributes */ - output_data_offset = 4096; + output_data_offset = roundup(8192 + session->header.data_offset, 4096); if (inject->strip) strip_init(inject); } -- Gitee From 5d394093d3f1a1957113ec7cb40e15dad22c8a8a Mon Sep 17 00:00:00 2001 From: jiangkunkun Date: Thu, 11 Mar 2021 16:29:19 +0800 Subject: [PATCH 1784/2161] vfio/iommu_type1: Add HWDBM status maintanance commit da65dcaeb4447ba6e5d112921394ca62805ac184 Intel-BKC. We are going to optimize dirty log tracking based on iommu HWDBM feature, but the dirty log from iommu is useful only when all iommu backed groups are connected to iommu with HWDBM feature. This maintains a counter for this feature. Co-developed-by: Keqian Zhu Signed-off-by: Kunkun Jiang Signed-off-by: Yi Sun Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 659f86358120..278181b0ad9e 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -79,6 +79,7 @@ struct vfio_iommu { bool nesting; bool dirty_page_tracking; bool container_open; + uint64_t num_non_hwdbm_groups; }; struct vfio_domain { @@ -116,6 +117,7 @@ struct vfio_iommu_group { struct list_head next; bool mdev_group; /* An mdev group */ bool pinned_page_dirty_scope; + bool iommu_hwdbm; /* Valid for non-mdev group */ }; struct vfio_iova { @@ -1200,6 +1202,27 @@ static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu) } } +static int vfio_dev_enable_feature(struct device *dev, void *data) +{ + enum iommu_dev_features *feat = data; + + if (iommu_dev_feature_enabled(dev, *feat)) + return 0; + + return iommu_dev_enable_feature(dev, *feat); +} + +static bool vfio_group_supports_hwdbm(struct vfio_iommu_group *group) +{ + enum iommu_dev_features feat = IOMMU_DEV_FEAT_HWDBM; + + if (iommu_group_for_each_dev(group->iommu_group, &feat, + vfio_dev_enable_feature)) + return false; + + return true; +} + static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, struct vfio_dma *dma, dma_addr_t base_iova, size_t pgsize) @@ -2433,6 +2456,12 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, * capable via the page pinning interface. 
*/ iommu->num_non_pinned_groups++; + + /* Update the hwdbm status of group and iommu */ + group->iommu_hwdbm = vfio_group_supports_hwdbm(group); + if (!group->iommu_hwdbm) + iommu->num_non_hwdbm_groups++; + mutex_unlock(&iommu->lock); vfio_iommu_resv_free(&group_resv_regions); @@ -2568,6 +2597,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, struct vfio_domain *domain; struct vfio_iommu_group *group; bool update_dirty_scope = false; + bool update_iommu_hwdbm = false; LIST_HEAD(iova_copy); mutex_lock(&iommu->lock); @@ -2606,6 +2636,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, vfio_iommu_detach_group(domain, group); update_dirty_scope = !group->pinned_page_dirty_scope; + update_iommu_hwdbm = !group->iommu_hwdbm; list_del(&group->next); kfree(group); /* @@ -2648,6 +2679,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, if (iommu->dirty_page_tracking) vfio_iommu_populate_bitmap_full(iommu); } + if (update_iommu_hwdbm) + iommu->num_non_hwdbm_groups--; mutex_unlock(&iommu->lock); } -- Gitee From c9856fdecb7e354ba6cc4f2913360bb9dc82f391 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 14 Jul 2022 15:20:53 +0800 Subject: [PATCH 1785/2161] iommu: Resolve merge conflict for iommu_pgsize() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. Resolve merge conflict for iommu_pgsize(). Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 128 +++++++++++------------------------------- 1 file changed, 33 insertions(+), 95 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 442982cad779..219b4b8398ab 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2741,84 +2741,38 @@ phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) } EXPORT_SYMBOL_GPL(iommu_iova_to_phys); -static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, size_t *count) +static size_t iommu_pgsize(struct iommu_domain *domain, + unsigned long addr_merge, size_t size) { - unsigned int pgsize_idx, pgsize_idx_next; - unsigned long pgsizes; - size_t offset, pgsize, pgsize_next; - unsigned long addr_merge = paddr | iova; + unsigned int pgsize_idx; + size_t pgsize; - /* Page sizes supported by the hardware and small enough for @size */ - pgsizes = domain->pgsize_bitmap & GENMASK(__fls(size), 0); + /* Max page size that still fits into 'size' */ + pgsize_idx = __fls(size); - /* Constrain the page sizes further based on the maximum alignment */ - if (likely(addr_merge)) - pgsizes &= GENMASK(__ffs(addr_merge), 0); - - /* Make sure we have at least one suitable page size */ - BUG_ON(!pgsizes); - - /* Pick the biggest page size remaining */ - pgsize_idx = __fls(pgsizes); - pgsize = BIT(pgsize_idx); - if (!count) - return pgsize; - - /* Find the next biggest support page size, if it exists */ - pgsizes = domain->pgsize_bitmap & ~GENMASK(pgsize_idx, 0); - if (!pgsizes) - goto out_set_count; + /* need to consider alignment requirements ? */ + if (likely(addr_merge)) { + /* Max page size allowed by address */ + unsigned int align_pgsize_idx = __ffs(addr_merge); + pgsize_idx = min(pgsize_idx, align_pgsize_idx); + } - pgsize_idx_next = __ffs(pgsizes); - pgsize_next = BIT(pgsize_idx_next); + /* build a mask of acceptable page sizes */ + pgsize = (1UL << (pgsize_idx + 1)) - 1; - /* - * There's no point trying a bigger page size unless the virtual - * and physical addresses are similarly offset within the larger page. 
- */ - if ((iova ^ paddr) & (pgsize_next - 1)) - goto out_set_count; + /* throw away page sizes not supported by the hardware */ + pgsize &= domain->pgsize_bitmap; - /* Calculate the offset to the next page size alignment boundary */ - offset = pgsize_next - (addr_merge & (pgsize_next - 1)); + /* make sure we're still sane */ + BUG_ON(!pgsize); - /* - * If size is big enough to accommodate the larger page, reduce - * the number of smaller pages. - */ - if (offset + pgsize_next <= size) - size = offset; + /* pick the biggest page */ + pgsize_idx = __fls(pgsize); + pgsize = 1UL << pgsize_idx; -out_set_count: - *count = size >> pgsize_idx; return pgsize; } -static int __iommu_map_pages(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, - gfp_t gfp, size_t *mapped) -{ - const struct iommu_ops *ops = domain->ops; - size_t pgsize, count; - int ret; - - pgsize = iommu_pgsize(domain, iova, paddr, size, &count); - - pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx count %zu\n", - iova, &paddr, pgsize, count); - - if (ops->map_pages) { - ret = ops->map_pages(domain, iova, paddr, pgsize, count, prot, - gfp, mapped); - } else { - ret = ops->map(domain, iova, paddr, pgsize, prot, gfp); - *mapped = ret ? 0 : pgsize; - } - - return ret; -} - static int __iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { @@ -2829,7 +2783,7 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t orig_paddr = paddr; int ret = 0; - if (unlikely(!(ops->map || ops->map_pages) || + if (unlikely(ops->map == NULL || domain->pgsize_bitmap == 0UL)) return -ENODEV; @@ -2853,21 +2807,18 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size); while (size) { - size_t mapped = 0; + size_t pgsize = iommu_pgsize(domain, iova | paddr, size); - ret = __iommu_map_pages(domain, iova, paddr, size, prot, gfp, - &mapped); - /* - * Some pages may have been mapped, even if an error occurred, - * so we should account for those so they can be unmapped. - */ - size -= mapped; + pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx\n", + iova, &paddr, pgsize); + ret = ops->map(domain, iova, paddr, pgsize, prot, gfp); if (ret) break; - iova += mapped; - paddr += mapped; + iova += pgsize; + paddr += pgsize; + size -= pgsize; } /* unroll mapping in case something went wrong */ @@ -2907,19 +2858,6 @@ int iommu_map_atomic(struct iommu_domain *domain, unsigned long iova, } EXPORT_SYMBOL_GPL(iommu_map_atomic); -static size_t __iommu_unmap_pages(struct iommu_domain *domain, - unsigned long iova, size_t size, - struct iommu_iotlb_gather *iotlb_gather) -{ - const struct iommu_ops *ops = domain->ops; - size_t pgsize, count; - - pgsize = iommu_pgsize(domain, iova, iova, size, &count); - return ops->unmap_pages ? - ops->unmap_pages(domain, iova, pgsize, count, iotlb_gather) : - ops->unmap(domain, iova, pgsize, iotlb_gather); -} - static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather) @@ -2929,7 +2867,7 @@ static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long orig_iova = iova; unsigned int min_pagesz; - if (unlikely(!(ops->unmap || ops->unmap_pages) || + if (unlikely(ops->unmap == NULL || domain->pgsize_bitmap == 0UL)) return 0; @@ -2957,9 +2895,9 @@ static size_t __iommu_unmap(struct iommu_domain *domain, * or we hit an area that isn't mapped. 
*/ while (unmapped < size) { - unmapped_page = __iommu_unmap_pages(domain, iova, - size - unmapped, - iotlb_gather); + size_t pgsize = iommu_pgsize(domain, iova, size - unmapped); + + unmapped_page = ops->unmap(domain, iova, pgsize, iotlb_gather); if (!unmapped_page) break; -- Gitee From 702ce84b94e99bf5d4cf57b5581ba30538752bc9 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Fri, 13 Dec 2019 16:22:10 -0800 Subject: [PATCH 1786/2161] bitmap: genericize percpu bitmap region iterators commit e837dfde15a49c97dcbb059757d96c71e9e7bd54 upstream. Bitmaps are fairly popular for their space efficiency, but we don't have generic iterators available. Make percpu's bitmap region iterators available to everyone. Reviewed-by: Josef Bacik Signed-off-by: Dennis Zhou Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/bitmap.h | 35 ++++++++++++++++++++++++ mm/percpu.c | 61 +++++++++++------------------------------- 2 files changed, 51 insertions(+), 45 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 29fc933df3bf..9c31b5268f7a 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -438,6 +438,41 @@ static inline int bitmap_parse(const char *buf, unsigned int buflen, return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits); } +static inline void bitmap_next_clear_region(unsigned long *bitmap, + unsigned int *rs, unsigned int *re, + unsigned int end) +{ + *rs = find_next_zero_bit(bitmap, end, *rs); + *re = find_next_bit(bitmap, end, *rs + 1); +} + +static inline void bitmap_next_set_region(unsigned long *bitmap, + unsigned int *rs, unsigned int *re, + unsigned int end) +{ + *rs = find_next_bit(bitmap, end, *rs); + *re = find_next_zero_bit(bitmap, end, *rs + 1); +} + +/* + * Bitmap region iterators. Iterates over the bitmap between [@start, @end). + * @rs and @re should be integer variables and will be set to start and end + * index of the current clear or set region. + */ +#define bitmap_for_each_clear_region(bitmap, rs, re, start, end) \ + for ((rs) = (start), \ + bitmap_next_clear_region((bitmap), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, \ + bitmap_next_clear_region((bitmap), &(rs), &(re), (end))) + +#define bitmap_for_each_set_region(bitmap, rs, re, start, end) \ + for ((rs) = (start), \ + bitmap_next_set_region((bitmap), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, \ + bitmap_next_set_region((bitmap), &(rs), &(re), (end))) + /** * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap. * @n: u64 value diff --git a/mm/percpu.c b/mm/percpu.c index 806bc16f88eb..63a3f6f0a42d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -270,33 +270,6 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, pcpu_unit_page_offset(cpu, page_idx); } -static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end) -{ - *rs = find_next_zero_bit(bitmap, end, *rs); - *re = find_next_bit(bitmap, end, *rs + 1); -} - -static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end) -{ - *rs = find_next_bit(bitmap, end, *rs); - *re = find_next_zero_bit(bitmap, end, *rs + 1); -} - -/* - * Bitmap region iterators. Iterates over the bitmap between - * [@start, @end) in @chunk. @rs and @re should be integer variables - * and will be set to start and end index of the current free region. 
- */ -#define pcpu_for_each_unpop_region(bitmap, rs, re, start, end) \ - for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end))) - -#define pcpu_for_each_pop_region(bitmap, rs, re, start, end) \ - for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end))) - /* * The following are helper functions to help access bitmaps and convert * between bitmap offsets to address offsets. @@ -732,9 +705,8 @@ static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan) } bits = 0; - pcpu_for_each_md_free_region(chunk, bit_off, bits) { + pcpu_for_each_md_free_region(chunk, bit_off, bits) pcpu_block_update(chunk_md, bit_off, bit_off + bits); - } } /** @@ -749,7 +721,7 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) { struct pcpu_block_md *block = chunk->md_blocks + index; unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); - int rs, re, start; /* region start, region end */ + unsigned int rs, re, start; /* region start, region end */ /* promote scan_hint to contig_hint */ if (block->scan_hint) { @@ -765,10 +737,9 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) block->right_free = 0; /* iterate over free areas and update the contig hints */ - pcpu_for_each_unpop_region(alloc_map, rs, re, start, - PCPU_BITMAP_BLOCK_BITS) { + bitmap_for_each_clear_region(alloc_map, rs, re, start, + PCPU_BITMAP_BLOCK_BITS) pcpu_block_update(block, rs, re); - } } /** @@ -1041,13 +1012,13 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, int *next_off) { - int page_start, page_end, rs, re; + unsigned int page_start, page_end, rs, re; page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); rs = page_start; - pcpu_next_unpop(chunk->populated, &rs, &re, page_end); + bitmap_next_clear_region(chunk->populated, &rs, &re, page_end); if (rs >= page_end) return true; @@ -1702,13 +1673,13 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, /* populate if not all pages are already there */ if (!is_atomic) { - int page_start, page_end, rs, re; + unsigned int page_start, page_end, rs, re; page_start = PFN_DOWN(off); page_end = PFN_UP(off + size); - pcpu_for_each_unpop_region(chunk->populated, rs, re, - page_start, page_end) { + bitmap_for_each_clear_region(chunk->populated, rs, re, + page_start, page_end) { WARN_ON(chunk->immutable); ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); @@ -1858,10 +1829,10 @@ static void pcpu_balance_workfn(struct work_struct *work) spin_unlock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, &to_free, list) { - int rs, re; + unsigned int rs, re; - pcpu_for_each_pop_region(chunk->populated, rs, re, 0, - chunk->nr_pages) { + bitmap_for_each_set_region(chunk->populated, rs, re, 0, + chunk->nr_pages) { pcpu_depopulate_chunk(chunk, rs, re); spin_lock_irq(&pcpu_lock); pcpu_chunk_depopulated(chunk, rs, re); @@ -1893,7 +1864,7 @@ static void pcpu_balance_workfn(struct work_struct *work) } for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { - int nr_unpop = 0, rs, re; + unsigned int nr_unpop = 0, rs, re; if (!nr_to_pop) break; @@ -1910,9 +1881,9 @@ static void pcpu_balance_workfn(struct work_struct *work) continue; /* @chunk 
can't go away while pcpu_alloc_mutex is held */ - pcpu_for_each_unpop_region(chunk->populated, rs, re, 0, - chunk->nr_pages) { - int nr = min(re - rs, nr_to_pop); + bitmap_for_each_clear_region(chunk->populated, rs, re, 0, + chunk->nr_pages) { + int nr = min_t(int, re - rs, nr_to_pop); ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp); if (!ret) { -- Gitee From bec151566b51a645a8f6fbf1e41e612615626906 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 4 Mar 2020 15:09:20 +0100 Subject: [PATCH 1787/2161] include/bitmap.h: add new functions to documentation commit a392d26f32cdd87e09b1ea3849db79cfc4eae745 upstream. I found these functions only by chance although I was looking exactly for something like them. So, add them to the list of functions to make them more visible. Fixes: e837dfde15a4 ("bitmap: genericize percpu bitmap region iterators") Signed-off-by: Wolfram Sang Signed-off-by: Dennis Zhou Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/bitmap.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 9c31b5268f7a..d291e0736f1e 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -51,6 +51,12 @@ * bitmap_clear(dst, pos, nbits) Clear specified bit area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area * bitmap_find_next_zero_area_off(buf, len, pos, n, mask) as above + * bitmap_next_clear_region(map, &start, &end, nbits) Find next clear region + * bitmap_next_set_region(map, &start, &end, nbits) Find next set region + * bitmap_for_each_clear_region(map, rs, re, start, end) + * Iterate over all clear regions + * bitmap_for_each_set_region(map, rs, re, start, end) + * Iterate over all set regions * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) -- Gitee From 8e9ec3285c28155923b8f7257921220710180ff4 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 6 Jul 2022 20:18:00 +0800 Subject: [PATCH 1788/2161] iommu: introduce iommu_sync_dirty_log() and iommu_clear_dirty_log() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. Introduce iommu_sync_dirty_log() and iommu_clear_dirty_log(). 
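Below is a minimal sketch of how a hypothetical caller might drive the two helpers over one tracked IOVA range; only iommu_sync_dirty_log() and iommu_clear_dirty_log() come from this patch, the wrapper and its names are made up for illustration:

/*
 * Pull the hardware dirty bits for [iova, iova + size) into 'bitmap'
 * (one bit per 1 << pgshift bytes of IOVA space), then clear the
 * hardware state for the regions that were just reported so that
 * tracking continues from a clean slate.
 */
static int demo_harvest_dirty_log(struct iommu_domain *domain,
				  unsigned long iova, size_t size,
				  unsigned long *bitmap,
				  unsigned long pgshift)
{
	int ret;

	ret = iommu_sync_dirty_log(domain, iova, size, bitmap,
				   iova, pgshift);
	if (ret)
		return ret;

	/* Only regions whose bits are set in 'bitmap' get cleared. */
	return iommu_clear_dirty_log(domain, iova, size, bitmap,
				     iova, pgshift);
}
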
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 2 + drivers/iommu/iommu.c | 105 ++++++++++++++++++++++++++++++++++++ include/linux/iommu.h | 36 +++++++++++++ 3 files changed, 143 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 4ab762b7f88b..7de13e0c279d 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -6179,6 +6179,8 @@ const struct iommu_ops intel_iommu_ops = { .sva_get_pasid = intel_svm_get_pasid, .page_response = intel_svm_page_response, #endif + .sync_dirty_log = intel_iommu_sync_dirty_log, + .clear_dirty_log = intel_iommu_clear_dirty_log, }; static void quirk_iommu_igfx(struct pci_dev *dev) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 219b4b8398ab..eeb6210d50f7 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3085,6 +3085,111 @@ int iommu_domain_set_attr(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_domain_set_attr); +int iommu_sync_dirty_log(struct iommu_domain *domain, unsigned long iova, + size_t size, unsigned long *bitmap, + unsigned long base_iova, unsigned long bitmap_pgshift) +{ + const struct iommu_ops *ops = domain->ops; + unsigned int min_pagesz; + size_t pgsize; + int ret; + + min_pagesz = 1 << __ffs(domain->pgsize_bitmap); + + if (!IS_ALIGNED(iova | size, min_pagesz)) { + pr_err("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n", + iova, size, min_pagesz); + return -EINVAL; + } + + if (!ops || !ops->sync_dirty_log) { + pr_err("don't support sync dirty log\n"); + return -ENODEV; + } + + while (size) { + pgsize = iommu_pgsize(domain, iova, size); + + ret = ops->sync_dirty_log(domain, iova, pgsize, + bitmap, base_iova, bitmap_pgshift); + if (ret) + break; + + pr_debug("dirty_log_sync: iova 0x%lx pagesz 0x%zx\n", iova, + pgsize); + + iova += pgsize; + size -= pgsize; + } + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_sync_dirty_log); + +static int __iommu_clear_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift) +{ + const struct iommu_ops *ops = domain->ops; + size_t pgsize; + int ret = 0; + + if (!ops || !ops->clear_dirty_log) { + pr_err("don't support clear dirty log\n"); + return -ENODEV; + } + + while (size) { + pgsize = iommu_pgsize(domain, iova, size); + ret = ops->clear_dirty_log(domain, iova, pgsize, bitmap, + base_iova, bitmap_pgshift); + if (ret) + break; + + pr_debug("dirty_log_clear: iova 0x%lx pagesz 0x%zx\n", iova, + pgsize); + + iova += pgsize; + size -= pgsize; + } + + return ret; +} + +int iommu_clear_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, unsigned long base_iova, + unsigned long bitmap_pgshift) +{ + unsigned long riova, rsize; + unsigned int min_pagesz; + int rs, re, start, end, ret = 0; + + min_pagesz = 1 << __ffs(domain->pgsize_bitmap); + + if (!IS_ALIGNED(iova | size, min_pagesz)) { + pr_err("unaligned: iova 0x%lx min_pagesz 0x%x\n", + iova, min_pagesz); + return -EINVAL; + } + + start = (iova - base_iova) >> bitmap_pgshift; + end = start + (size >> bitmap_pgshift); + bitmap_for_each_set_region(bitmap, rs, re, start, end) { + riova = iova + (rs << bitmap_pgshift); + rsize = (re - rs) << bitmap_pgshift; + ret = __iommu_clear_dirty_log(domain, riova, rsize, bitmap, + base_iova, bitmap_pgshift); + if (ret) + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_clear_dirty_log); + void iommu_get_resv_regions(struct device *dev, struct 
list_head *list) { const struct iommu_ops *ops = dev->bus->iommu_ops; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 4a4388e0bc89..3303100e4ac5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -346,6 +346,15 @@ struct iommu_ops { int (*def_domain_type)(struct device *dev); + int (*sync_dirty_log)(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, unsigned long base_iova, + unsigned long bitmap_pgshift); + int (*clear_dirty_log)(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, unsigned long base_iova, + unsigned long bitmap_pgshift); + unsigned long pgsize_bitmap; struct module *owner; @@ -574,6 +583,15 @@ extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr, extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr, void *data); +extern int iommu_sync_dirty_log(struct iommu_domain *domain, unsigned long iova, + size_t size, unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift); +extern int iommu_clear_dirty_log(struct iommu_domain *domain, unsigned long iova, + size_t dma_size, unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift); + /* Window handling function prototypes */ extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t offset, u64 size, @@ -970,6 +988,24 @@ static inline int iommu_domain_set_attr(struct iommu_domain *domain, return -EINVAL; } +static inline int iommu_sync_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long pgshift) +{ + return -EINVAL; +} + +static inline int iommu_clear_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long pgshift) +{ + return -ENODEV; +} + static inline int iommu_device_register(struct iommu_device *iommu) { return -ENODEV; -- Gitee From a773f06a7e289abc7b955dab065de871c5239233 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 6 Jul 2022 18:19:54 +0800 Subject: [PATCH 1789/2161] iommu: introduce iommu_domain_set_hwdbm() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. Introduce iommu_domain_set_hwdbm(). 
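A minimal usage sketch (the wrapper below is hypothetical; the VFIO patch later in this series uses the helper in essentially this way, switching tracking per mapped range):

/* Switch hardware dirty-bit tracking on or off for one IOVA range. */
static void demo_hwdbm_switch(struct iommu_domain *domain,
			      unsigned long iova, size_t size, bool start)
{
	/*
	 * enable == true asks the IOMMU driver to start recording dirty
	 * state for [iova, iova + size); enable == false stops it again.
	 */
	if (iommu_domain_set_hwdbm(domain, start, iova, size))
		pr_warn("HWDBM switch failed for iova 0x%lx\n", iova);
}
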
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 1 + drivers/iommu/iommu.c | 17 +++++++++++++++++ include/linux/iommu.h | 14 ++++++++++++++ 3 files changed, 32 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 7de13e0c279d..f021503ef0fe 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -6179,6 +6179,7 @@ const struct iommu_ops intel_iommu_ops = { .sva_get_pasid = intel_svm_get_pasid, .page_response = intel_svm_page_response, #endif + .set_hwdbm = intel_iommu_set_hwdbm, .sync_dirty_log = intel_iommu_sync_dirty_log, .clear_dirty_log = intel_iommu_clear_dirty_log, }; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index eeb6210d50f7..2c9a80db0f63 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3059,6 +3059,23 @@ static int __init iommu_init(void) } core_initcall(iommu_init); +int iommu_domain_set_hwdbm(struct iommu_domain *domain, bool enable, + unsigned long iova, size_t size) +{ + const struct iommu_ops *ops = domain->ops; + int ret = 0; + + if (!ops || !ops->set_hwdbm) { + pr_err_ratelimited("Don't support set_hwdbm\n"); + return -EINVAL; + } + + ret = ops->set_hwdbm(domain, enable, iova, size); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_domain_set_hwdbm); + int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr attr, void *data) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3303100e4ac5..d215bc77f4ab 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -346,6 +346,9 @@ struct iommu_ops { int (*def_domain_type)(struct device *dev); + int (*set_hwdbm)(struct iommu_domain *domain, bool enable, + unsigned long iova, size_t size); + int (*sync_dirty_log)(struct iommu_domain *domain, unsigned long iova, size_t size, unsigned long *bitmap, unsigned long base_iova, @@ -578,6 +581,9 @@ extern int iommu_page_response(struct iommu_domain *domain, extern int iommu_group_id(struct iommu_group *group); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); +extern int iommu_domain_set_hwdbm(struct iommu_domain *domain, bool enable, + unsigned long iova, size_t size); + extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr, void *data); extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr, @@ -1006,6 +1012,14 @@ static inline int iommu_clear_dirty_log(struct iommu_domain *domain, return -ENODEV; } +static inline int iommu_domain_set_hwdbm(struct iommu_domain *domain, + bool enable, + unsigned long iova, + size_t size) +{ + return -EINVAL; +} + static inline int iommu_device_register(struct iommu_device *iommu) { return -ENODEV; -- Gitee From 8bbe2613ddd4680408e26307cfabea4bdc386c17 Mon Sep 17 00:00:00 2001 From: jiangkunkun Date: Thu, 11 Mar 2021 17:14:51 +0800 Subject: [PATCH 1790/2161] vfio/iommu_type1: Optimize dirty bitmap population based on iommu HWDBM commit 0b5a5a762e0b4e47cbb890b46826d8b228ed8061 Intel-BKC. In the past if vfio_iommu is not of pinned_page_dirty_scope and vfio_dma is iommu_mapped, we populate full dirty bitmap for this vfio_dma. Now we can try to get dirty log from iommu before make the lousy decision. 
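In outline, the selection logic in update_user_bitmap() becomes the following (a condensed paraphrase of the diff below, with error handling and the per-domain loop trimmed for brevity):

	if (!iommu->num_non_pinned_groups || !dma->iommu_mapped) {
		/* pinned-page dirty scope already keeps dma->bitmap accurate */
	} else if (!iommu->num_non_hwdbm_groups) {
		/* all groups support HWDBM: fetch the real dirty log */
		iommu_sync_dirty_log(d->domain, dma->iova, dma->size,
				     dma->bitmap, dma->iova, pgshift);
	} else {
		/* no better information: mark the whole range dirty */
		bitmap_set(dma->bitmap, 0, nbits);
	}
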
Co-developed-by: Keqian Zhu Signed-off-by: Kunkun Jiang Signed-off-by: Yi Sun Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 82 +++++++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 278181b0ad9e..d1dc398f3b4a 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -1223,6 +1223,25 @@ static bool vfio_group_supports_hwdbm(struct vfio_iommu_group *group) return true; } +static int vfio_iommu_dirty_log_clear(struct vfio_iommu *iommu, + dma_addr_t start_iova, size_t size, + unsigned long *bitmap_buffer, + dma_addr_t base_iova, size_t pgsize) +{ + struct vfio_domain *d; + unsigned long pgshift = __ffs(pgsize); + int ret; + + list_for_each_entry(d, &iommu->domain_list, next) { + ret = iommu_clear_dirty_log(d->domain, start_iova, size, + bitmap_buffer, base_iova, pgshift); + if (ret) + return ret; + } + + return 0; +} + static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, struct vfio_dma *dma, dma_addr_t base_iova, size_t pgsize) @@ -1234,13 +1253,28 @@ static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, unsigned long shift = bit_offset % BITS_PER_LONG; unsigned long leftover; + if (!iommu->num_non_pinned_groups || !dma->iommu_mapped) + goto bitmap_done; + + /* try to get dirty log from IOMMU */ + if (!iommu->num_non_hwdbm_groups) { + struct vfio_domain *d; + + list_for_each_entry(d, &iommu->domain_list, next) { + if (iommu_sync_dirty_log(d->domain, dma->iova, dma->size, + dma->bitmap, dma->iova, pgshift)) + return -EFAULT; + } + goto bitmap_done; + } + /* * mark all pages dirty if any IOMMU capable device is not able * to report dirty pages and all pages are pinned and mapped. 
*/ - if (iommu->num_non_pinned_groups && dma->iommu_mapped) - bitmap_set(dma->bitmap, 0, nbits); + bitmap_set(dma->bitmap, 0, nbits); +bitmap_done: if (shift) { bitmap_shift_left(dma->bitmap, dma->bitmap, shift, nbits + shift); @@ -1302,6 +1336,18 @@ static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, */ bitmap_clear(dma->bitmap, 0, dma->size >> pgshift); vfio_dma_populate_bitmap(dma, pgsize); + + /* Clear iommu dirty log to re-enable dirty log tracking */ + if (iommu->num_non_pinned_groups && + dma->iommu_mapped && !iommu->num_non_hwdbm_groups) { + ret = vfio_iommu_dirty_log_clear(iommu, dma->iova, + dma->size, dma->bitmap, dma->iova, + pgsize); + if (ret) { + pr_warn("dma dirty log clear failed!\n"); + return ret; + } + } } return 0; } @@ -3026,6 +3072,33 @@ static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu, -EFAULT : 0; } +static void vfio_dma_dirty_log_start(struct vfio_iommu *iommu, + struct vfio_dma *dma, + bool start) +{ + struct vfio_domain *d; + + list_for_each_entry(d, &iommu->domain_list, next) { + /* Go through all domain anyway even if we fail */ + iommu_domain_set_hwdbm(d->domain, start, dma->iova, dma->size); + } +} + +static void vfio_iommu_dirty_log_switch(struct vfio_iommu *iommu, bool start) +{ + struct rb_node *n; + + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); + + if (!dma->iommu_mapped) + continue; + + /* Go through all dma range anyway even if we fail */ + vfio_dma_dirty_log_start(iommu, dma, start); + } +} + static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu, unsigned long arg) { @@ -3058,8 +3131,10 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu, pgsize = 1 << __ffs(iommu->pgsize_bitmap); if (!iommu->dirty_page_tracking) { ret = vfio_dma_bitmap_alloc_all(iommu, pgsize); - if (!ret) + if (!ret) { iommu->dirty_page_tracking = true; + vfio_iommu_dirty_log_switch(iommu, true); + } } mutex_unlock(&iommu->lock); return ret; @@ -3068,6 +3143,7 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu, if (iommu->dirty_page_tracking) { iommu->dirty_page_tracking = false; vfio_dma_bitmap_free_all(iommu); + vfio_iommu_dirty_log_switch(iommu, false); } mutex_unlock(&iommu->lock); return 0; -- Gitee From 72216464f36d51654ccf02daaa83994a8c9e5b03 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Tue, 14 Apr 2020 05:10:07 -0700 Subject: [PATCH 1791/2161] vfio/type1: Report iommu nesting info to userspace commit 2095edb171484434b5b8e8cd9c65c0b135f97db8 Intel-BKC. This patch exports iommu nesting capability info to user space through VFIO. Userspace is expected to check this info for supported uAPIs (e.g. bind page table, cache invalidation) and the vendor specific format information for first level/stage page table that will be bound to. The nesting info is available only after container set to be NESTED type. Current implementation imposes one limitation - one nesting container should include at most one iommu group. The philosophy of vfio container is having all groups/devices within the container share the same IOMMU context. When vSVA is enabled, one IOMMU context could include one 2nd- level address space and multiple 1st-level address spaces. While the 2nd-level address space is reasonably sharable by multiple groups, blindly sharing 1st-level address spaces across all groups within the container might instead break the guest expectation. 
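As an illustration of the userspace check mentioned above, the new capability can be located by walking the VFIO_IOMMU_GET_INFO capability chain once a group has been attached to a VFIO_TYPE1_NESTING_IOMMU container. The sketch below is not part of the patch; it only relies on the uapi additions made here (VFIO_IOMMU_TYPE1_INFO_CAP_NESTING and struct vfio_iommu_type1_info_cap_nesting), assumes the caller supplies a buffer large enough for the capability chain, and trims error handling.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Return the nesting capability from the VFIO_IOMMU_GET_INFO capability
 * chain, or NULL if the container does not report one. */
static struct vfio_iommu_type1_info_cap_nesting *
vfio_get_nesting_cap(int container_fd, void *buf, size_t bufsz)
{
	struct vfio_iommu_type1_info *info = buf;
	struct vfio_info_cap_header *hdr;
	__u32 off;

	memset(buf, 0, bufsz);
	info->argsz = bufsz;

	if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, info))
		return NULL;
	/* A real caller would retry with info->argsz if the caps did not fit. */
	if (!(info->flags & VFIO_IOMMU_INFO_CAPS) || !info->cap_offset)
		return NULL;

	for (off = info->cap_offset; off; off = hdr->next) {
		hdr = (struct vfio_info_cap_header *)((char *)buf + off);
		if (hdr->id == VFIO_IOMMU_TYPE1_INFO_CAP_NESTING)
			return (struct vfio_iommu_type1_info_cap_nesting *)hdr;
	}

	return NULL;
}

The caller would then inspect the reported format and features (e.g. IOMMU_NESTING_FEAT_BIND_PGTBL) before using the nesting uAPIs introduced later in this series.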
In the future sub/super container concept might be introduced to allow partial address space sharing within an IOMMU context. But for now let's go with this restriction by requiring singleton container for using nesting iommu features. Below link has the related discussion about this decision. https://lore.kernel.org/kvm/20200515115924.37e6996d@w520.home/ This patch also changes the NESTING type container behaviour. Something that would have succeeded before will now fail: Before this series, if user asked for a VFIO_IOMMU_TYPE1_NESTING, it would have succeeded even if the SMMU didn't support stage-2, as the driver would have silently fallen back on stage-1 mappings (which work exactly the same as stage-2 only since there was no nesting supported). After the series, we do check for DOMAIN_ATTR_NESTING so if user asks for VFIO_IOMMU_TYPE1_NESTING and the SMMU doesn't support stage-2, the ioctl fails. But it should be a good fix and completely harmless. Detail can be found in below link as well. https://lore.kernel.org/kvm/20200717090900.GC4850@myrica/ v7 -> v8: *) tweak per Alex's comments against v7. *) check "iommu->nesting_info->format == 0" in attach_group() v6 -> v7: *) using vfio_info_add_capability() for adding nesting cap per suggestion from Eric. v5 -> v6: *) address comments against v5 from Eric Auger. *) don't report nesting cap to userspace if the nesting_info->format is invalid. v4 -> v5: *) address comments from Eric Auger. *) return struct iommu_nesting_info for VFIO_IOMMU_TYPE1_INFO_CAP_NESTING as cap is much "cheap", if needs extension in future, just define another cap. https://lore.kernel.org/kvm/20200708132947.5b7ee954@x1.home/ v3 -> v4: *) address comments against v3. v1 -> v2: *) added in v2 Cc: Kevin Tian CC: Jacob Pan Cc: Alex Williamson Cc: Eric Auger Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Lu Baolu Signed-off-by: Liu Yi L Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 69 +++++++++++++++++++++++++++++++++ include/uapi/linux/vfio.h | 19 +++++++++ 2 files changed, 88 insertions(+) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index d1dc398f3b4a..f5762b9069ce 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -75,6 +75,7 @@ struct vfio_iommu { uint64_t pgsize_bitmap; uint64_t num_non_pinned_groups; wait_queue_head_t vaddr_wait; + struct iommu_nesting_info *nesting_info; bool v2; bool nesting; bool dirty_page_tracking; @@ -146,6 +147,9 @@ struct vfio_regions { #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \ (!list_empty(&iommu->domain_list)) +#define CONTAINER_HAS_DOMAIN(iommu) (((iommu)->external_domain) || \ + (!list_empty(&(iommu)->domain_list))) + #define DIRTY_BITMAP_BYTES(n) (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE) /* @@ -2310,6 +2314,12 @@ static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu, list_splice_tail(iova_copy, iova); } +static void vfio_iommu_release_nesting_info(struct vfio_iommu *iommu) +{ + kfree(iommu->nesting_info); + iommu->nesting_info = NULL; +} + static int vfio_iommu_type1_attach_group(void *iommu_data, struct iommu_group *iommu_group) { @@ -2332,6 +2342,12 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, return -EINVAL; } + /* Nesting type container can include only one group */ + if (iommu->nesting && CONTAINER_HAS_DOMAIN(iommu)) { + mutex_unlock(&iommu->lock); + return -EINVAL; + } + group = kzalloc(sizeof(*group), GFP_KERNEL); domain = kzalloc(sizeof(*domain), 
GFP_KERNEL); if (!group || !domain) { @@ -2400,6 +2416,31 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (ret) goto out_domain; + /* Nesting cap info is available only after attaching */ + if (iommu->nesting) { + int size = sizeof(struct iommu_nesting_info); + + iommu->nesting_info = kzalloc(size, GFP_KERNEL); + if (!iommu->nesting_info) { + ret = -ENOMEM; + goto out_detach; + } + + /* Now get the nesting info */ + iommu->nesting_info->argsz = size; + ret = iommu_domain_get_attr(domain->domain, + DOMAIN_ATTR_NESTING, + iommu->nesting_info); + if (ret) + goto out_detach; + + /* when @format of nesting_info is 0, fail the attach */ + if (iommu->nesting_info->format == 0) { + ret = -ENOENT; + goto out_detach; + } + } + /* Get aperture info */ geo = &domain->domain->geometry; if (vfio_iommu_aper_conflict(iommu, geo->aperture_start, @@ -2514,6 +2555,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, return 0; out_detach: + vfio_iommu_release_nesting_info(iommu); vfio_iommu_detach_group(domain, group); out_domain: iommu_domain_free(domain->domain); @@ -2700,6 +2742,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, } else { vfio_iommu_unmap_unpin_reaccount(iommu); } + + vfio_iommu_release_nesting_info(iommu); } iommu_domain_free(domain->domain); list_del(&domain->next); @@ -2918,6 +2962,28 @@ static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu, return ret; } +static int vfio_iommu_nesting_build_caps(struct vfio_iommu *iommu, + struct vfio_info_cap *caps) +{ + struct vfio_iommu_type1_info_cap_nesting nesting_cap; + size_t size; + + /* when nesting_info is null, no need to go further */ + if (!iommu->nesting_info) + return 0; + + size = offsetof(struct vfio_iommu_type1_info_cap_nesting, info) + + iommu->nesting_info->argsz; + + nesting_cap.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_NESTING; + nesting_cap.header.version = 1; + + memcpy(&nesting_cap.info, iommu->nesting_info, + iommu->nesting_info->argsz); + + return vfio_info_add_capability(caps, &nesting_cap.header, size); +} + static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu, struct vfio_info_cap *caps) { @@ -2969,6 +3035,9 @@ static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, if (!ret) ret = vfio_iommu_iova_build_caps(iommu, &caps); + if (!ret) + ret = vfio_iommu_nesting_build_caps(iommu, &caps); + mutex_unlock(&iommu->lock); if (!ret) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 30be43622c6d..2f06fe8c31d3 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -14,6 +14,7 @@ #include #include +#include #define VFIO_API_VERSION 0 @@ -1038,6 +1039,24 @@ struct vfio_iommu_type1_info_cap_migration { __u64 max_dirty_bitmap_size; /* in bytes */ }; +/* + * The nesting capability allows to report the related capability + * and info for nesting iommu type. + * + * The structures below define version 1 of this capability. + * + * Nested capabilities should be checked by the userspace after + * setting VFIO_TYPE1_NESTING_IOMMU. + * + * @info: the nesting info provided by IOMMU driver. 
+ */ +#define VFIO_IOMMU_TYPE1_INFO_CAP_NESTING 4 + +struct vfio_iommu_type1_info_cap_nesting { + struct vfio_info_cap_header header; + struct iommu_nesting_info info; +}; + #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) /** -- Gitee From 85b5561b45800fa77b1a1b63db07953f2beba70e Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Tue, 21 Apr 2020 03:01:09 -0700 Subject: [PATCH 1792/2161] vfio/type1: Support binding guest page tables to PASID commit 9861f0079c060b9cb20d049959bdfe84a3ea9f07 Intel-BKC. Nesting translation allows two-levels/stages page tables, with 1st level for guest translations (e.g. GVA->GPA), 2nd level for host translations (e.g. GPA->HPA). This patch adds interface for binding guest page tables to a PASID. This PASID must have been allocated by the userspace before the binding request. e.g. allocated from /dev/ioasid. As the bind data is parsed by iommu abstract layer, so this patch doesn't have the ownership check against the PASID from userspace. It would be done in the iommu sub- system. v7 -> v8: *) adapt to /dev/ioasid *) address comments from Alex on v7. *) adapt to latest iommu_sva_unbind_gpasid() implementation. *) remove the OP check against VFIO_IOMMU_NESTING_OP_NUM as it's redundant to the default switch case in vfio_iommu_handle_pgtbl_op(). v6 -> v7: *) introduced @user in struct domain_capsule to simplify the code per Eric's suggestion. *) introduced VFIO_IOMMU_NESTING_OP_NUM for sanitizing op from userspace. *) corrected the @argsz value of unbind_data in vfio_group_unbind_gpasid_fn(). v5 -> v6: *) dropped vfio_find_nesting_group() and add vfio_get_nesting_domain_capsule(). per comment from Eric. *) use iommu_uapi_sva_bind/unbind_gpasid() and iommu_sva_unbind_gpasid() in linux/iommu.h for userspace operation and in-kernel operation. v3 -> v4: *) address comments from Alex on v3 v2 -> v3: *) use __iommu_sva_unbind_gpasid() for unbind call issued by VFIO https://lore.kernel.org/linux-iommu/1592931837-58223-6-git-send-email-jacob.jun.pan@linux.intel.com/ v1 -> v2: *) rename subject from "vfio/type1: Bind guest page tables to host" *) remove VFIO_IOMMU_BIND, introduce VFIO_IOMMU_NESTING_OP to support bind/ unbind guet page table *) replaced vfio_iommu_for_each_dev() with a group level loop since this series enforces one group per container w/ nesting type as start. 
*) rename vfio_bind/unbind_gpasid_fn() to vfio_dev_bind/unbind_gpasid_fn() *) vfio_dev_unbind_gpasid() always successful *) use vfio_mm->pasid_lock to avoid race between PASID free and page table bind/unbind Cc: Kevin Tian CC: Jacob Pan Cc: Alex Williamson Cc: Eric Auger Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Lu Baolu Signed-off-by: Jean-Philippe Brucker Signed-off-by: Liu Yi L Signed-off-by: Jacob Pan Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 156 ++++++++++++++++++++++++++++++++ include/uapi/linux/vfio.h | 35 +++++++ 2 files changed, 191 insertions(+) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index f5762b9069ce..f9d153d80693 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -165,6 +165,34 @@ struct vfio_regions { #define WAITED 1 +struct domain_capsule { + struct vfio_iommu_group *group; + struct iommu_domain *domain; + void *data; + /* set if @data contains a user pointer*/ + bool user; +}; + +/* iommu->lock must be held */ +static int vfio_prepare_nesting_domain_capsule(struct vfio_iommu *iommu, + struct domain_capsule *dc) +{ + struct vfio_domain *domain; + struct vfio_iommu_group *group; + + if (!iommu->nesting_info) + return -EINVAL; + + domain = list_first_entry(&iommu->domain_list, + struct vfio_domain, next); + group = list_first_entry(&domain->group_list, + struct vfio_iommu_group, next); + dc->group = group; + dc->domain = domain->domain; + dc->user = true; + return 0; +} + static int put_pfn(unsigned long pfn, int prot); static struct vfio_iommu_group* @@ -2678,6 +2706,51 @@ static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu, return ret; } +static int vfio_dev_bind_gpasid_fn(struct device *dev, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + unsigned long arg = *(unsigned long *)dc->data; + + return iommu_uapi_sva_bind_gpasid(dc->domain, dev, + (void __user *)arg); +} + +static int vfio_dev_unbind_gpasid_fn(struct device *dev, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + + /* + * dc->user is a toggle for the unbind operation. When user + * set, the dc->data passes in a __user pointer and requires + * to use iommu_uapi_sva_unbind_gpasid(), in which it will + * copy the unbind data from the user buffer. When user is + * clear, the dc->data passes in a pasid which is going to + * be unbind no need to copy data from userspace. 
+ */ + if (dc->user) { + unsigned long arg = *(unsigned long *)dc->data; + + iommu_uapi_sva_unbind_gpasid(dc->domain, + dev, (void __user *)arg); + } else { + ioasid_t pasid = *(ioasid_t *)dc->data; + + iommu_sva_unbind_gpasid(dc->domain, dev, pasid); + } + return 0; +} + +static void vfio_group_unbind_gpasid_fn(ioasid_t pasid, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + + dc->user = false; + dc->data = &pasid; + + iommu_group_for_each_dev(dc->group->iommu_group, + dc, vfio_dev_unbind_gpasid_fn); +} + static void vfio_iommu_type1_detach_group(void *iommu_data, struct iommu_group *iommu_group) { @@ -2722,6 +2795,27 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, if (!group) continue; + if (iommu->nesting_info && + iommu->nesting_info->features & + IOMMU_NESTING_FEAT_BIND_PGTBL) { + struct domain_capsule dc = { .group = group, + .domain = domain->domain, + .data = NULL }; + struct ioasid_user *iuser; + + /* + * For devices attached to nesting type iommu, + * VFIO should unbind page tables bound with the + * devices in the iommu group before detaching. + */ + iuser = ioasid_user_get_from_task(current); + if (!(IS_ERR(iuser) || !iuser)) { + ioasid_user_for_each_id(iuser, &dc, + vfio_group_unbind_gpasid_fn); + ioasid_user_put(iuser); + } + } + vfio_iommu_detach_group(domain, group); update_dirty_scope = !group->pinned_page_dirty_scope; update_iommu_hwdbm = !group->iommu_hwdbm; @@ -3275,6 +3369,66 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu, return -EINVAL; } +static long vfio_iommu_handle_pgtbl_op(struct vfio_iommu *iommu, + bool is_bind, unsigned long arg) +{ + struct domain_capsule dc = { .data = &arg, .user = true }; + struct iommu_nesting_info *info; + int ret; + + mutex_lock(&iommu->lock); + + info = iommu->nesting_info; + if (!info || !(info->features & IOMMU_NESTING_FEAT_BIND_PGTBL)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = vfio_prepare_nesting_domain_capsule(iommu, &dc); + if (ret) + goto out_unlock; + + if (is_bind) + ret = iommu_group_for_each_dev(dc.group->iommu_group, &dc, + vfio_dev_bind_gpasid_fn); + if (ret || !is_bind) + iommu_group_for_each_dev(dc.group->iommu_group, + &dc, vfio_dev_unbind_gpasid_fn); + +out_unlock: + mutex_unlock(&iommu->lock); + return ret; +} + +static long vfio_iommu_type1_nesting_op(struct vfio_iommu *iommu, + unsigned long arg) +{ + struct vfio_iommu_type1_nesting_op hdr; + unsigned int minsz; + int ret; + + minsz = offsetofend(struct vfio_iommu_type1_nesting_op, flags); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + if (hdr.argsz < minsz || hdr.flags & ~VFIO_NESTING_OP_MASK) + return -EINVAL; + + switch (hdr.flags & VFIO_NESTING_OP_MASK) { + case VFIO_IOMMU_NESTING_OP_BIND_PGTBL: + ret = vfio_iommu_handle_pgtbl_op(iommu, true, arg + minsz); + break; + case VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL: + ret = vfio_iommu_handle_pgtbl_op(iommu, false, arg + minsz); + break; + default: + ret = -EINVAL; + } + + return ret; +} + static long vfio_iommu_type1_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { @@ -3291,6 +3445,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, return vfio_iommu_type1_unmap_dma(iommu, arg); case VFIO_IOMMU_DIRTY_PAGES: return vfio_iommu_type1_dirty_pages(iommu, arg); + case VFIO_IOMMU_NESTING_OP: + return vfio_iommu_type1_nesting_op(iommu, arg); default: return -ENOTTY; } diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 2f06fe8c31d3..b8d225f95a13 100644 --- 
a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1192,6 +1192,41 @@ struct vfio_iommu_type1_dirty_bitmap_get { #define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17) +/** + * VFIO_IOMMU_NESTING_OP - _IOW(VFIO_TYPE, VFIO_BASE + 18, + * struct vfio_iommu_type1_nesting_op) + * + * This interface allows userspace to utilize the nesting IOMMU + * capabilities as reported in VFIO_IOMMU_TYPE1_INFO_CAP_NESTING + * cap through VFIO_IOMMU_GET_INFO. For platforms which require + * system wide PASID, PASID will be allocated by VFIO_IOMMU_PASID + * _REQUEST. + * + * @data[] types defined for each op: + * +=================+===============================================+ + * | NESTING OP | @data[] | + * +=================+===============================================+ + * | BIND_PGTBL | struct iommu_gpasid_bind_data | + * +-----------------+-----------------------------------------------+ + * | UNBIND_PGTBL | struct iommu_gpasid_bind_data | + * +-----------------+-----------------------------------------------+ + * + * returns: 0 on success, -errno on failure. + */ +struct vfio_iommu_type1_nesting_op { + __u32 argsz; + __u32 flags; +#define VFIO_NESTING_OP_MASK (0xffff) /* lower 16-bits for op */ + __u8 data[]; +}; + +enum { + VFIO_IOMMU_NESTING_OP_BIND_PGTBL, + VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL, +}; + +#define VFIO_IOMMU_NESTING_OP _IO(VFIO_TYPE, VFIO_BASE + 18) + /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ /* -- Gitee From 5e59809d2d1b9ee8407a1ed432c241f65626c398 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Tue, 14 Apr 2020 05:10:07 -0700 Subject: [PATCH 1793/2161] vfio/type1: Allow invalidating first-level/stage IOMMU cache commit 3116f7f0c27edd58822e526ba3255554507011ed Intel-BKC. This patch provides an interface allowing the userspace to invalidate IOMMU cache for first-level page table. It is required when the first level IOMMU page table is not managed by the host kernel in the nested translation setup. 
v1 -> v2: *) rename from "vfio/type1: Flush stage-1 IOMMU cache for nesting type" *) rename vfio_cache_inv_fn() to vfio_dev_cache_invalidate_fn() *) vfio_dev_cache_inv_fn() always successful *) remove VFIO_IOMMU_CACHE_INVALIDATE, and reuse VFIO_IOMMU_NESTING_OP Cc: Kevin Tian CC: Jacob Pan Cc: Alex Williamson Cc: Eric Auger Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Lu Baolu Signed-off-by: Liu Yi L Signed-off-by: Eric Auger Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 38 +++++++++++++++++++++++++++++++++ include/uapi/linux/vfio.h | 3 +++ 2 files changed, 41 insertions(+) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index f9d153d80693..33134808ed91 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -3400,6 +3400,41 @@ static long vfio_iommu_handle_pgtbl_op(struct vfio_iommu *iommu, return ret; } +static int vfio_dev_cache_invalidate_fn(struct device *dev, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + unsigned long arg = *(unsigned long *)dc->data; + + iommu_uapi_cache_invalidate(dc->domain, dev, (void __user *)arg); + return 0; +} + +static long vfio_iommu_invalidate_cache(struct vfio_iommu *iommu, + unsigned long arg) +{ + struct domain_capsule dc = { .data = &arg }; + struct iommu_nesting_info *info; + int ret; + + mutex_lock(&iommu->lock); + info = iommu->nesting_info; + if (!info || !(info->features & IOMMU_NESTING_FEAT_CACHE_INVLD)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = vfio_prepare_nesting_domain_capsule(iommu, &dc); + if (ret) + goto out_unlock; + + iommu_group_for_each_dev(dc.group->iommu_group, &dc, + vfio_dev_cache_invalidate_fn); + +out_unlock: + mutex_unlock(&iommu->lock); + return ret; +} + static long vfio_iommu_type1_nesting_op(struct vfio_iommu *iommu, unsigned long arg) { @@ -3422,6 +3457,9 @@ static long vfio_iommu_type1_nesting_op(struct vfio_iommu *iommu, case VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL: ret = vfio_iommu_handle_pgtbl_op(iommu, false, arg + minsz); break; + case VFIO_IOMMU_NESTING_OP_CACHE_INVLD: + ret = vfio_iommu_invalidate_cache(iommu, arg + minsz); + break; default: ret = -EINVAL; } diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index b8d225f95a13..c120a820763b 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1210,6 +1210,8 @@ struct vfio_iommu_type1_dirty_bitmap_get { * +-----------------+-----------------------------------------------+ * | UNBIND_PGTBL | struct iommu_gpasid_bind_data | * +-----------------+-----------------------------------------------+ + * | CACHE_INVLD | struct iommu_cache_invalidate_info | + * +-----------------+-----------------------------------------------+ * * returns: 0 on success, -errno on failure. */ @@ -1223,6 +1225,7 @@ struct vfio_iommu_type1_nesting_op { enum { VFIO_IOMMU_NESTING_OP_BIND_PGTBL, VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL, + VFIO_IOMMU_NESTING_OP_CACHE_INVLD, }; #define VFIO_IOMMU_NESTING_OP _IO(VFIO_TYPE, VFIO_BASE + 18) -- Gitee From 8ba3a4d110f9da64ebad39fc367aad7df687c206 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Tue, 14 Apr 2020 05:10:07 -0700 Subject: [PATCH 1794/2161] vfio/type1: Add vSVA support for IOMMU-backed mdevs commit f6b1fabccb6345bb8a31c3cde3be519bf87fd89d Intel-BKC. Recent years, mediated device pass-through framework (e.g. vfio-mdev) is used to achieve flexible device sharing across domains (e.g. VMs). 
Also there are hardware assisted mediated pass-through solutions from platform vendors. e.g. Intel VT-d scalable mode which supports Intel Scalable I/O Virtualization technology. Such mdevs are called IOMMU- backed mdevs as there are IOMMU enforced DMA isolation for such mdevs. In kernel, IOMMU-backed mdevs are exposed to IOMMU layer by aux-domain concept, which means mdevs are protected by an iommu domain which is auxiliary to the domain that the kernel driver primarily uses for DMA API. Details can be found in the KVM presentation as below: https://events19.linuxfoundation.org/wp-content/uploads/2017/12/Hardware-Assisted-Mediated-Pass-Through-with-VFIO-Kevin-Tian-Intel.pdf This patch extends NESTING_IOMMU ops to IOMMU-backed mdev devices. The main requirement is to use the auxiliary domain associated with mdev. v5 -> v6: *) add review-by from Eric Auger. v1 -> v2: *) check the iommu_device to ensure the handling mdev is IOMMU-backed Cc: Kevin Tian CC: Jacob Pan CC: Jun Tian Cc: Alex Williamson Cc: Eric Auger Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Lu Baolu Signed-off-by: Liu Yi L Reviewed-by: Eric Auger Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 36 ++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 33134808ed91..bf15fc99b3d1 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2706,18 +2706,38 @@ static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu, return ret; } +static struct device *vfio_get_iommu_device(struct vfio_iommu_group *group, + struct device *dev) +{ + struct mdev_device *mdev = to_mdev_device(dev); + if (group->mdev_group) + return mdev_get_iommu_device(mdev); + else + return dev; +} + static int vfio_dev_bind_gpasid_fn(struct device *dev, void *data) { struct domain_capsule *dc = (struct domain_capsule *)data; unsigned long arg = *(unsigned long *)dc->data; + struct device *iommu_device; - return iommu_uapi_sva_bind_gpasid(dc->domain, dev, + iommu_device = vfio_get_iommu_device(dc->group, dev); + if (!iommu_device) + return -EINVAL; + + return iommu_uapi_sva_bind_gpasid(dc->domain, iommu_device, (void __user *)arg); } static int vfio_dev_unbind_gpasid_fn(struct device *dev, void *data) { struct domain_capsule *dc = (struct domain_capsule *)data; + struct device *iommu_device; + + iommu_device = vfio_get_iommu_device(dc->group, dev); + if (!iommu_device) + return -EINVAL; /* * dc->user is a toggle for the unbind operation. 
When user @@ -2730,12 +2750,12 @@ static int vfio_dev_unbind_gpasid_fn(struct device *dev, void *data) if (dc->user) { unsigned long arg = *(unsigned long *)dc->data; - iommu_uapi_sva_unbind_gpasid(dc->domain, - dev, (void __user *)arg); + iommu_uapi_sva_unbind_gpasid(dc->domain, iommu_device, + (void __user *)arg); } else { ioasid_t pasid = *(ioasid_t *)dc->data; - iommu_sva_unbind_gpasid(dc->domain, dev, pasid); + iommu_sva_unbind_gpasid(dc->domain, iommu_device, pasid); } return 0; } @@ -3404,8 +3424,14 @@ static int vfio_dev_cache_invalidate_fn(struct device *dev, void *data) { struct domain_capsule *dc = (struct domain_capsule *)data; unsigned long arg = *(unsigned long *)dc->data; + struct device *iommu_device; + + iommu_device = vfio_get_iommu_device(dc->group, dev); + if (!iommu_device) + return -EINVAL; - iommu_uapi_cache_invalidate(dc->domain, dev, (void __user *)arg); + iommu_uapi_cache_invalidate(dc->domain, iommu_device, + (void __user *)arg); return 0; } -- Gitee From 6ff82bc73a95b5ee935329ce81cc941e826ff861 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Tue, 9 Nov 2021 23:50:51 +0800 Subject: [PATCH 1795/2161] vfio/mdev: Add a member for iommu fault data in mdev_device commit 25a69fdd045011cc71b7f64169239b17c3b877fa Intel-BKC. This patch adds a member to save iommu fault data in mdev_device structure. VDCM (Virtual Device Control Module) saves IOMMU-capable mdev's iommu fault data here in the mdev open or creation. vfio iommu modules may retrive the fault data after binding IOMMU-capale mdev with a PASID. Below member is added in struct mdev_device: * iommu_fault_data - A place to save the iommu fault data which is passed to fault handler to differentiate mdevs Below helpers are added to set and get iommu fault data in struct mdev_device. * mdev_set/get_iommu_fault_data() - per IOMMU-capable mdev fault data will be kept in the mdev data structure and be retrived later. As fault injection (includes PRQ) to VM is done per device (PF or sub-device), so the fault_handler_data is usually per-mdev (a.k.a. sub-device). Signed-off-by: Liu Yi L Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/mdev.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 93a7842c8018..a3ca88580e56 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -19,6 +19,7 @@ struct mdev_device { struct list_head next; struct mdev_type *type; struct device *iommu_device; + void *iommu_fault_data; bool active; }; @@ -50,6 +51,27 @@ unsigned int mdev_get_type_group_id(struct mdev_device *mdev); unsigned int mtype_get_type_group_id(struct mdev_type *mtype); struct device *mtype_get_parent_dev(struct mdev_type *mtype); +/* + * Called by the parent device driver to set the iommu fault data which + * is used for iommu fault reporting on the mdev. The vfio iommu modules + * could call mdev_get_iommu_fault_data() to retrieve fault data and add + * it to physical device's fault data list. + * + * @dev: the mediated device that iommu will report fault. + * @fault_data: the iommu fault data for @dev. + * + * Return 0 for success, otherwise negative error value. 
+ */ +static inline void mdev_set_iommu_fault_data(struct mdev_device *mdev, void *fault_data) +{ + mdev->iommu_fault_data = fault_data; +} + +static inline void *mdev_get_iommu_fault_data(struct mdev_device *mdev) +{ + return mdev->iommu_fault_data; +} + /** * struct mdev_parent_ops - Structure to be registered for each parent device to * register the device to mdev module. -- Gitee From 09a01033aeefa9bfe1fa1f7ff7ee2b0ca939ea12 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Sat, 27 Jun 2020 23:20:17 -0700 Subject: [PATCH 1796/2161] vfio/type1: Get per-mdev fault_handler_data for IOMMU-capable mdevs commit 3ab3a3fa8eed0c861370ef574a5960e8cff83b6f Intel-BKC. With the introduction of mdev, fault reporting is done at per PASID-dev granularity. This requires VFIO to get fault_handler_data from parent device and pass it to IOMMU driver, thus IOMMU driver could bind PASID with the fault_hanlder_data to fulfill the per PASID-dev granularity fault reporting rule. Signed-off-by: Liu Yi L Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index bf15fc99b3d1..fc964adafcc2 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2720,14 +2720,20 @@ static int vfio_dev_bind_gpasid_fn(struct device *dev, void *data) { struct domain_capsule *dc = (struct domain_capsule *)data; unsigned long arg = *(unsigned long *)dc->data; + struct mdev_device *mdev = to_mdev_device(dev); struct device *iommu_device; + void *iommu_fault_data = NULL; iommu_device = vfio_get_iommu_device(dc->group, dev); if (!iommu_device) return -EINVAL; + if (iommu_device != dev) + iommu_fault_data = mdev_get_iommu_fault_data(mdev); + return iommu_uapi_sva_bind_gpasid(dc->domain, iommu_device, - (void __user *)arg); + (void __user *)arg, + iommu_fault_data); } static int vfio_dev_unbind_gpasid_fn(struct device *dev, void *data) -- Gitee From 5acdfb734bf44d3842d0cf16142aee6d58eca83f Mon Sep 17 00:00:00 2001 From: Cathy Zhang Date: Wed, 10 Nov 2021 23:59:48 +0800 Subject: [PATCH 1797/2161] vfio/type1: A fix for iommu: forklift v5.12 iommu to v5.15 bkc commit 54a04b67a76754cf9ac686eec4a41ed68ad5cf5c Intel-BKC. The iommu part of below commit was merged by forklift v5.12 iommu to v5.15 bkc. But vfio is still using the old API, so it will cause compiling errors. commit af8e85ec400d4fd01b80cc8cf8654098ffbff6ba Author: Sun, Yi Y Date: Wed Nov 25 16:37:42 2020 +0800 iommu/vt-d: Support bind/unbind guest page table to default PASID For guest IOVA support under nesting IOMMU, hypervisor needs to bind/ unbind guest's IOVA page table to host. For such bind/unbind request from user space, host should figure out a target PASID (host default PASID) to be bound. This patch adds a flag to indicate host IOMMU driver if it needs to use default PASID in host. 
FIXME: pasid_set = NULL; // dmar_domain->pasid_set; Change-Id: I71301dd022be8970abad9a51e7fe52650bc9055b Signed-off-by: Sun, Yi Y Signed-off-by: Liu Yi L Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index fc964adafcc2..db3b95708f6e 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -171,6 +171,7 @@ struct domain_capsule { void *data; /* set if @data contains a user pointer*/ bool user; + u64 flags; }; /* iommu->lock must be held */ @@ -2761,7 +2762,7 @@ static int vfio_dev_unbind_gpasid_fn(struct device *dev, void *data) } else { ioasid_t pasid = *(ioasid_t *)dc->data; - iommu_sva_unbind_gpasid(dc->domain, iommu_device, pasid); + iommu_sva_unbind_gpasid(dc->domain, iommu_device, pasid, dc->flags); } return 0; } -- Gitee From 7f3d57c8f9db3089efb7a18cd024e2647f020a35 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Mon, 14 Jan 2019 15:59:02 +0800 Subject: [PATCH 1798/2161] vfio/pci: Expose PCIe PASID capability to userspace commit 92e6ee8e0ff4f6d72ac2f2ead707879105566b33 Intel-BKC. This patch exposes PCIe PASID capability to userspace and where to emulate this capability if wants to further expose it to VM. And this patch only exposes PASID capability for devices which has PCIe PASID extended struture in its configuration space. While for VFs, user space still unable to see this capability as SR-IOV spec forbides VF to implement PASID capability extended structure. It is a TODO in future. Related discussion can be found in below link: https://lore.kernel.org/kvm/20200407095801.648b1371@w520.home/ v7 -> v8: *) refine the commit message and the subject. v5 -> v6: *) add review-by from Eric Auger. v1 -> v2: *) added in v2, but it was sent in a separate patchseries before Cc: Kevin Tian CC: Jacob Pan Cc: Alex Williamson Cc: Eric Auger Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Lu Baolu Signed-off-by: Liu Yi L Reviewed-by: Eric Auger Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 9be98bce8405..a20ab95b1b89 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -95,7 +95,7 @@ static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = { [PCI_EXT_CAP_ID_LTR] = PCI_EXT_CAP_LTR_SIZEOF, [PCI_EXT_CAP_ID_SECPCI] = 0, /* not yet */ [PCI_EXT_CAP_ID_PMUX] = 0, /* not yet */ - [PCI_EXT_CAP_ID_PASID] = 0, /* not yet */ + [PCI_EXT_CAP_ID_PASID] = PCI_EXT_CAP_PASID_SIZEOF, }; /* -- Gitee From 46c7f9becf3d685377ed1dbdf0d0c7bd45109d62 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 8 Nov 2021 18:23:19 +0800 Subject: [PATCH 1799/2161] vfio/mdev: Add a member for iommu domain in mdev_device commit 39e3147641c33c320b23d902e8f545ac1138cf4a Intel-BKC. This adds a member to save iommu domain in mdev_device structure. Whenever an iommu domain is attached to the mediated device, it must be save here so that a VDCM (Virtual Device Control Module) could retreive it. Below member is added in struct mdev_device: * iommu_domain - A place to save the iommu domain attached to this mdev. Below helpers are added to set and get iommu domain in struct mdev_device. 
* mdev_set/get_iommu_domain(domain) - A iommu domain which has been attached to the iommu device in order to protect and isolate the mediated device will be kept in the mdev data structure and could be retrieved later. Cc: Ashok Raj Cc: Jacob Pan Cc: Kevin Tian Cc: Liu Yi L Suggested-by: Kevin Tian Signed-off-by: Lu Baolu Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/mdev.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/include/linux/mdev.h b/include/linux/mdev.h index a3ca88580e56..b64ad7b70d29 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -20,6 +20,7 @@ struct mdev_device { struct mdev_type *type; struct device *iommu_device; void *iommu_fault_data; + void *iommu_domain; bool active; }; @@ -72,6 +73,22 @@ static inline void *mdev_get_iommu_fault_data(struct mdev_device *mdev) return mdev->iommu_fault_data; } +/* + * Called by vfio iommu modules to save the iommu domain after a domain being + * attached to the mediated device. The vDCM (virtual device control module) + * could call mdev_get_iommu_domain() to retrieve an auxiliary domain attached + * to an mdev. + */ +static inline void mdev_set_iommu_domain(struct mdev_device *mdev, void *domain) +{ + mdev->iommu_domain = domain; +} + +static inline void *mdev_get_iommu_domain(struct mdev_device *mdev) +{ + return mdev->iommu_domain; +} + /** * struct mdev_parent_ops - Structure to be registered for each parent device to * register the device to mdev module. -- Gitee From 8d6629e7c65537c91481d59cd28c6efc3b4fd603 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 8 Nov 2021 19:25:19 +0800 Subject: [PATCH 1800/2161] vfio/type1: Save domain when attach domain to mdev commit ff2c8a3c02c8e1541e6c4b913ad5cd5b260cbe91 Intel-BKC. This saves the iommu domain in mdev on attaching a domain to it and clear it on detaching. Cc: Ashok Raj Cc: Jacob Pan Cc: Kevin Tian Cc: Liu Yi L Signed-off-by: Lu Baolu Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index db3b95708f6e..a26943d992d7 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2040,18 +2040,28 @@ static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions, static int vfio_mdev_attach_domain(struct device *dev, void *data) { struct mdev_device *mdev = to_mdev_device(dev); - struct iommu_domain *domain = data; + struct iommu_domain *domain; struct device *iommu_device; + int ret = -ENODEV; + + /* Only single domain is allowed to attach to an mdev. 
*/ + domain = mdev_get_iommu_domain(mdev); + if (domain) + return -EINVAL; + domain = data; iommu_device = mdev_get_iommu_device(mdev); if (iommu_device) { if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX)) - return iommu_aux_attach_device(domain, iommu_device); + ret = iommu_aux_attach_device(domain, iommu_device); else - return iommu_attach_device(domain, iommu_device); + ret = iommu_attach_device(domain, iommu_device); } - return -EINVAL; + if (!ret) + mdev_set_iommu_domain(mdev, domain); + + return ret; } static int vfio_mdev_detach_domain(struct device *dev, void *data) @@ -2068,6 +2078,8 @@ static int vfio_mdev_detach_domain(struct device *dev, void *data) iommu_detach_device(domain, iommu_device); } + mdev_set_iommu_domain(mdev, NULL); + return 0; } -- Gitee From 2ea5b9b28cc43d083ad8f0f1b58a40398685cc23 Mon Sep 17 00:00:00 2001 From: Cathy Zhang Date: Wed, 10 Nov 2021 16:42:58 +0800 Subject: [PATCH 1801/2161] vfio: Add new IRQ for DMA fault reporting commit fbf911818ca1df9ab7fc4e4e2f7071fff316fa2e Intel-BKC. Add a new IRQ type/subtype to get notification on nested stage DMA faults. Signed-off-by: Eric Auger Signed-off-by: Liu Yi L Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/vfio.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index c120a820763b..0085fbce9d24 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -676,6 +676,9 @@ struct vfio_irq_info { }; #define VFIO_DEVICE_GET_IRQ_INFO _IO(VFIO_TYPE, VFIO_BASE + 9) +#define VFIO_IRQ_TYPE_NESTED (1) +#define VFIO_IRQ_SUBTYPE_DMA_FAULT (1) + /** * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set) * -- Gitee From 9d0fa2b16f1e63046b555f245fec2b5b4fe06d48 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 21 Jul 2022 15:35:08 +0800 Subject: [PATCH 1802/2161] iommu: Resolve merge conflict for intel_iommu_get_nesting_info() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. 
Resolve merge conflict for intel_iommu_get_nesting_info(). Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 7 ++++++- include/uapi/linux/iommu.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index f021503ef0fe..27d2fe2e77fe 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5788,8 +5788,13 @@ static int intel_iommu_get_nesting_info(struct iommu_domain *domain, info->addr_width = dmar_domain->gaw; info->format = IOMMU_PASID_FORMAT_INTEL_VTD; + /* REVISIT: + * to be precise, may only report SYSWIDE_PASID when pasid is + * supported, also may only report page_resp when PRS is supported + */ info->features = IOMMU_NESTING_FEAT_BIND_PGTBL | - IOMMU_NESTING_FEAT_CACHE_INVLD; + IOMMU_NESTING_FEAT_CACHE_INVLD | + IOMMU_NESTING_FEAT_PAGE_RESP; info->pasid_bits = ilog2(intel_pasid_max_id); memset(&info->padding, 0x0, 12); diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 6695e91c64f4..8696459e1b9e 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -402,6 +402,7 @@ struct iommu_nesting_info { __u32 format; #define IOMMU_NESTING_FEAT_BIND_PGTBL (1 << 0) #define IOMMU_NESTING_FEAT_CACHE_INVLD (1 << 1) +#define IOMMU_NESTING_FEAT_PAGE_RESP (1 << 2) __u32 features; __u16 addr_width; __u16 pasid_bits; -- Gitee From 5793884cdd06c9cce495047ca457b0149bba7079 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Tue, 9 Nov 2021 15:30:10 +0800 Subject: [PATCH 1803/2161] vfio/pci: Add VFIO_REGION_TYPE_NESTED region type commit 12e8b6371eb4d38778732b82f75eb05db8f49cc1 Intel-BKC. Add a new specific DMA_FAULT region aiming to expose nested mode translation faults. This region is only exposed if the device is attached to a nested domain. The region has a ring buffer that contains the actual fault records plus a header allowing to handle it (tail/head indices, max capacity, entry size). At the moment the region is dimensioned for 512 fault records.
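To make the layout concrete, a minimal userspace consumer of the region could look like the sketch below. It is illustrative only and not part of the patch: dev_fd is the VFIO device file descriptor, region_offset is the offset of the DMA fault region as reported by VFIO_DEVICE_GET_REGION_INFO (that lookup is not shown), entry_size is assumed to match sizeof(struct iommu_fault), and error handling is trimmed.

#include <unistd.h>
#include <sys/types.h>
#include <linux/vfio.h>
#include <linux/iommu.h>

/* Drain pending fault records: read the header, consume records between our
 * tail and the kernel's head, then publish the new tail with the only write
 * the region accepts (4 bytes at offset 0). A real consumer would keep its
 * own tail instead of re-reading it from the header. */
static void drain_dma_faults(int dev_fd, off_t region_offset,
			     void (*handle)(const struct iommu_fault *))
{
	struct vfio_region_dma_fault header;
	struct iommu_fault evt;
	__u32 tail;

	pread(dev_fd, &header, sizeof(header), region_offset);

	for (tail = header.tail; tail != header.head;
	     tail = (tail + 1) % header.nb_entries) {
		pread(dev_fd, &evt, sizeof(evt),
		      region_offset + header.offset +
		      (off_t)tail * header.entry_size);
		handle(&evt);
	}

	pwrite(dev_fd, &tail, sizeof(tail), region_offset);
}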
Changes from Eric's original version: *) check iommu_nesting_info instead of bool Signed-off-by: Eric Auger Signed-off-by: Liu Yi L Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_core.c | 80 ++++++++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_rdwr.c | 44 ++++++++++++++++++ include/linux/vfio_pci_core.h | 6 +++ include/uapi/linux/vfio.h | 35 ++++++++++++++ 4 files changed, 165 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index aac578e6adcb..d69c43fb4b2d 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -239,6 +239,81 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat return ret; } +static void vfio_pci_dma_fault_release(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region) +{ + kfree(vdev->fault_pages); +} + +static int vfio_pci_dma_fault_add_capability(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region, + struct vfio_info_cap *caps) +{ + struct vfio_region_info_cap_fault cap = { + .header.id = VFIO_REGION_INFO_CAP_DMA_FAULT, + .header.version = 1, + .version = 1, + }; + return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); +} + +static const struct vfio_pci_regops vfio_pci_dma_fault_regops = { + .rw = vfio_pci_dma_fault_rw, + .release = vfio_pci_dma_fault_release, + .add_capability = vfio_pci_dma_fault_add_capability, +}; + +#define DMA_FAULT_RING_LENGTH 512 + +static int vfio_pci_dma_fault_init(struct vfio_pci_core_device *vdev) +{ + struct vfio_region_dma_fault *header; + struct iommu_domain *domain; + struct iommu_nesting_info info; + size_t size; + int ret; + + domain = iommu_get_domain_for_dev(&vdev->pdev->dev); + ret = iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, &info); + if (ret || !(info.features & IOMMU_NESTING_FEAT_PAGE_RESP)) { + /* + * No need go futher as no page request service support. + */ + return 0; + } + + mutex_init(&vdev->fault_queue_lock); + + /* + * We provision 1 page for the header and space for + * DMA_FAULT_RING_LENGTH fault records in the ring buffer. 
+ */ + size = ALIGN(sizeof(struct iommu_fault) * + DMA_FAULT_RING_LENGTH, PAGE_SIZE) + PAGE_SIZE; + + vdev->fault_pages = kzalloc(size, GFP_KERNEL); + if (!vdev->fault_pages) + return -ENOMEM; + + ret = vfio_pci_register_dev_region(vdev, + VFIO_REGION_TYPE_NESTED, + VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT, + &vfio_pci_dma_fault_regops, size, + VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE, + vdev->fault_pages); + if (ret) + goto out; + + header = (struct vfio_region_dma_fault *)vdev->fault_pages; + header->entry_size = sizeof(struct iommu_fault); + header->nb_entries = DMA_FAULT_RING_LENGTH; + header->offset = sizeof(struct vfio_region_dma_fault); + return 0; +out: + kfree(vdev->fault_pages); + return ret; +} + int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; @@ -308,6 +383,11 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) vdev->has_vga = true; + ret = vfio_pci_dma_fault_init(vdev); + if (ret) { + pci_disable_device(pdev); + return ret; + } return 0; } diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 57d3b2cbbd8e..e3ff97613827 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -381,6 +381,50 @@ static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd, } } +ssize_t vfio_pci_dma_fault_rw(struct vfio_pci_core_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) +{ + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + void *base = vdev->region[i].data; + int ret = -EFAULT; + + if (pos >= vdev->region[i].size) + return -EINVAL; + + count = min(count, (size_t)(vdev->region[i].size - pos)); + + mutex_lock(&vdev->fault_queue_lock); + + if (iswrite) { + struct vfio_region_dma_fault *header = + (struct vfio_region_dma_fault *)base; + u32 new_tail; + + if (pos != 0 || count != 4) { + ret = -EINVAL; + goto unlock; + } + + if (copy_from_user((void *)&new_tail, buf, count)) + goto unlock; + + if (new_tail > header->nb_entries) { + ret = -EINVAL; + goto unlock; + } + header->tail = new_tail; + } else { + if (copy_to_user(buf, base + pos, count)) + goto unlock; + } + *ppos += count; + ret = count; +unlock: + mutex_unlock(&vdev->fault_queue_lock); + return ret; +} + static int vfio_pci_ioeventfd_handler(void *opaque, void *unused) { struct vfio_pci_ioeventfd *ioeventfd = opaque; diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index ec0fb70be2d3..0a3f6318c18e 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -129,6 +129,8 @@ struct vfio_pci_core_device { int ioeventfds_nr; struct eventfd_ctx *err_trigger; struct eventfd_ctx *req_trigger; + u8 *fault_pages; + struct mutex fault_queue_lock; struct list_head dummy_resources_list; struct mutex ioeventfds_lock; struct list_head ioeventfds_list; @@ -165,6 +167,10 @@ extern ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *b extern long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, uint64_t data, int count, int fd); +extern ssize_t vfio_pci_dma_fault_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite); + extern int vfio_pci_init_perm_bits(void); extern void vfio_pci_uninit_perm_bits(void); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 0085fbce9d24..d199c9b434b2 100644 --- 
a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -324,6 +324,7 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_TYPE_GFX (1) #define VFIO_REGION_TYPE_CCW (2) #define VFIO_REGION_TYPE_MIGRATION (3) +#define VFIO_REGION_TYPE_NESTED (4) /* sub-types for VFIO_REGION_TYPE_PCI_* */ @@ -618,6 +619,9 @@ struct vfio_device_migration_info { __u64 data_size; }; +/* sub-types for VFIO_REGION_TYPE_NESTED */ +#define VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT (1) + /* * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped * which allows direct access to non-MSIX registers which happened to be within @@ -632,6 +636,37 @@ struct vfio_device_migration_info { /* subtype 5 was VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD, don't use */ +/* + * Capability exposed by the DMA fault region + * @version: ABI version + */ +#define VFIO_REGION_INFO_CAP_DMA_FAULT 6 + +struct vfio_region_info_cap_fault { + struct vfio_info_cap_header header; + __u32 version; +}; + +/* + * DMA Fault Region Layout + * @tail: index relative to the start of the ring buffer at which the + * consumer finds the next item in the buffer + * @entry_size: fault ring buffer entry size in bytes + * @nb_entries: max capacity of the fault ring buffer + * @offset: ring buffer offset relative to the start of the region + * @head: index relative to the start of the ring buffer at which the + * producer (kernel) inserts items into the buffers + */ +struct vfio_region_dma_fault { + /* Write-Only */ + __u32 tail; + /* Read-Only */ + __u32 entry_size; + __u32 nb_entries; + __u32 offset; + __u32 head; +}; + /** * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, * struct vfio_irq_info) -- Gitee From 57703175c99e76fea4e9b59bdcb8e31e6db708e3 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 9 Nov 2021 15:46:41 +0800 Subject: [PATCH 1804/2161] vfio/pci: Register an iommu fault handler commit 1fd6fbd5c30c23357f5625125e8521c41cf12384 Intel-BKC. Register an IOMMU fault handler which records faults in the DMA FAULT region ring buffer. In a subsequent patch, we will add the signaling of a specific eventfd to allow the userspace to be notified whenever a new fault as shown up. 
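The ring-full check in this handler follows the usual circular-buffer arithmetic of <linux/circ_buf.h>, which keeps one slot free and assumes a power-of-two number of entries (512 here). A consumer that wants to detect dropped faults can mirror the same math; the helpers below only illustrate that convention and are not part of the patch.

/* Number of records the consumer still has to read. */
static inline unsigned int fault_ring_cnt(unsigned int head, unsigned int tail,
					  unsigned int size)
{
	return (head - tail) & (size - 1);
}

/* Number of slots the kernel producer may still fill; once this reaches 0
 * (the ring holds size - 1 records) further faults are dropped with -ENOSPC,
 * so head == tail always means "empty". */
static inline unsigned int fault_ring_space(unsigned int head, unsigned int tail,
					    unsigned int size)
{
	return (tail - (head + 1)) & (size - 1);
}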
Signed-off-by: Eric Auger Signed-off-by: Liu Yi L Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_core.c | 45 ++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index d69c43fb4b2d..fa9f4b4128a2 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -263,6 +264,42 @@ static const struct vfio_pci_regops vfio_pci_dma_fault_regops = { .add_capability = vfio_pci_dma_fault_add_capability, }; +int vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, void *data) +{ + struct vfio_pci_core_device *vdev = (struct vfio_pci_core_device *)data; + struct vfio_region_dma_fault *reg = + (struct vfio_region_dma_fault *)vdev->fault_pages; + struct iommu_fault *new; + int head, tail, size; + int ret = 0; + + if (WARN_ON(!reg)) + return -ENOENT; + + new = (struct iommu_fault *)(vdev->fault_pages + reg->offset + + reg->head * reg->entry_size); + + if (fault->type != IOMMU_FAULT_DMA_UNRECOV) + return -ENOENT; + + mutex_lock(&vdev->fault_queue_lock); + + head = reg->head; + tail = reg->tail; + size = reg->nb_entries; + + if (CIRC_SPACE(head, tail, size) < 1) { + ret = -ENOSPC; + goto unlock; + } + + *new = *fault; + reg->head = (head + 1) % size; +unlock: + mutex_unlock(&vdev->fault_queue_lock); + return ret; +} + #define DMA_FAULT_RING_LENGTH 512 static int vfio_pci_dma_fault_init(struct vfio_pci_core_device *vdev) @@ -308,6 +345,13 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_core_device *vdev) header->entry_size = sizeof(struct iommu_fault); header->nb_entries = DMA_FAULT_RING_LENGTH; header->offset = sizeof(struct vfio_region_dma_fault); + + ret = iommu_register_device_fault_handler(&vdev->pdev->dev, + vfio_pci_iommu_dev_fault_handler, + vdev); + if (ret) /* the dma fault region is freed in vfio_pci_disable() */ + goto out; + return 0; out: kfree(vdev->fault_pages); @@ -409,6 +453,7 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, vdev->irq_type, 0, 0, NULL); + WARN_ON(iommu_unregister_device_fault_handler(&vdev->pdev->dev)); /* Device closed, don't need mutex here */ list_for_each_entry_safe(ioeventfd, ioeventfd_tmp, -- Gitee From 3d0ee52f62133d5df5a6a9a1a08e18bf94faa53a Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 9 Nov 2021 16:01:22 +0800 Subject: [PATCH 1805/2161] vfio/pci: Allow to mmap the fault queue commit d7609ef7de673b4803586fc4e7c9e4366e0c1362 Intel-BKC. The DMA FAULT region contains the fault ring buffer. There is benefit to let the userspace mmap this area. Expose this mmappable area through a sparse mmap entry and implement the mmap operation. 
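Together with the read/write path from the previous patch, a consumer can now map the ring directly and only use pread/pwrite for the header fields. A minimal sketch, not part of the patch (region_offset and region_size are assumed to come from the earlier VFIO_DEVICE_GET_REGION_INFO lookup, and entry_size is assumed to equal sizeof(struct iommu_fault); error handling is trimmed):

#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/iommu.h>

/* Map the mmappable part of the DMA fault region. Only the area starting at
 * PAGE_SIZE is exposed through the sparse-mmap capability, and with this
 * patch header->offset is also PAGE_SIZE, so the mapping starts at record 0. */
static struct iommu_fault *map_fault_ring(int dev_fd, off_t region_offset,
					  size_t region_size)
{
	long psz = sysconf(_SC_PAGESIZE);
	void *ring;

	ring = mmap(NULL, region_size - psz, PROT_READ, MAP_SHARED,
		    dev_fd, region_offset + psz);
	if (ring == MAP_FAILED)
		return NULL;

	return (struct iommu_fault *)ring;
}

The head index is still read, and the new tail still written, through the region read/write path as in the previous sketch; only the fault records themselves come from the mapping.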
Signed-off-by: Eric Auger Signed-off-by: Liu Yi L Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_core.c | 62 ++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index fa9f4b4128a2..6d4bbda71701 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -246,21 +246,76 @@ static void vfio_pci_dma_fault_release(struct vfio_pci_core_device *vdev, kfree(vdev->fault_pages); } +static int vfio_pci_dma_fault_mmap(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region, + struct vm_area_struct *vma) +{ + u64 phys_len, req_len, pgoff, req_start; + unsigned long long addr; + unsigned int ret; + + phys_len = region->size; + + req_len = vma->vm_end - vma->vm_start; + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + req_start = pgoff << PAGE_SHIFT; + + /* only the second page of the producer fault region is mmappable */ + if (req_start < PAGE_SIZE) + return -EINVAL; + + if (req_start + req_len > phys_len) + return -EINVAL; + + addr = virt_to_phys(vdev->fault_pages); + vma->vm_private_data = vdev; + vma->vm_pgoff = (addr >> PAGE_SHIFT) + pgoff; + + ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + req_len, vma->vm_page_prot); + return ret; +} + static int vfio_pci_dma_fault_add_capability(struct vfio_pci_core_device *vdev, struct vfio_pci_region *region, struct vfio_info_cap *caps) { + struct vfio_region_info_cap_sparse_mmap *sparse = NULL; struct vfio_region_info_cap_fault cap = { .header.id = VFIO_REGION_INFO_CAP_DMA_FAULT, .header.version = 1, .version = 1, }; - return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); + size_t size = sizeof(*sparse) + sizeof(*sparse->areas); + int ret; + + ret = vfio_info_add_capability(caps, &cap.header, sizeof(cap)); + if (ret) + return ret; + + sparse = kzalloc(size, GFP_KERNEL); + if (!sparse) + return -ENOMEM; + + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->nr_areas = 1; + sparse->areas[0].offset = PAGE_SIZE; + sparse->areas[0].size = region->size - PAGE_SIZE; + + ret = vfio_info_add_capability(caps, &sparse->header, size); + + /* Regardless of add status, needs to free sparse to avoid memleak */ + kfree(sparse); + + return ret; } static const struct vfio_pci_regops vfio_pci_dma_fault_regops = { .rw = vfio_pci_dma_fault_rw, .release = vfio_pci_dma_fault_release, + .mmap = vfio_pci_dma_fault_mmap, .add_capability = vfio_pci_dma_fault_add_capability, }; @@ -336,7 +391,8 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_core_device *vdev) VFIO_REGION_TYPE_NESTED, VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT, &vfio_pci_dma_fault_regops, size, - VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE, + VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP, vdev->fault_pages); if (ret) goto out; @@ -344,7 +400,7 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_core_device *vdev) header = (struct vfio_region_dma_fault *)vdev->fault_pages; header->entry_size = sizeof(struct iommu_fault); header->nb_entries = DMA_FAULT_RING_LENGTH; - header->offset = sizeof(struct vfio_region_dma_fault); + header->offset = PAGE_SIZE; ret = iommu_register_device_fault_handler(&vdev->pdev->dev, vfio_pci_iommu_dev_fault_handler, -- Gitee From 92a225ca406171992a064d5a4226a92178c2cb13 Mon Sep 17 00:00:00 2001 From: Tina Zhang Date: Wed, 19 Jun 2019 
04:15:52 -0400 Subject: [PATCH 1806/2161] vfio: Use capability chains to handle device specific irq commit fb9394f37dc0111abba195e1574722ae4d844372 Intel-BKC. Caps the number of irqs with fixed indexes and uses capability chains to chain device specific irqs. Signed-off-by: Tina Zhang Signed-off-by: Eric Auger Signed-off-by: Liu Yi L [Eric: Put cap_offset at the end of the vfio_irq_info struct, remove GFX IRQ at the moment and remove any reference to this latter in the commit message] Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/vfio.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index d199c9b434b2..728801ea7fab 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -706,11 +706,27 @@ struct vfio_irq_info { #define VFIO_IRQ_INFO_MASKABLE (1 << 1) #define VFIO_IRQ_INFO_AUTOMASKED (1 << 2) #define VFIO_IRQ_INFO_NORESIZE (1 << 3) +#define VFIO_IRQ_INFO_FLAG_CAPS (1 << 4) /* Info supports caps */ __u32 index; /* IRQ index */ __u32 count; /* Number of IRQs within this index */ + __u32 cap_offset; /* Offset within info struct of first cap */ }; #define VFIO_DEVICE_GET_IRQ_INFO _IO(VFIO_TYPE, VFIO_BASE + 9) +/* + * The irq type capability allows IRQs unique to a specific device or + * class of devices to be exposed. + * + * The structures below define version 1 of this capability. + */ +#define VFIO_IRQ_INFO_CAP_TYPE 3 + +struct vfio_irq_info_cap_type { + struct vfio_info_cap_header header; + __u32 type; /* global per bus driver */ + __u32 subtype; /* type specific */ +}; + #define VFIO_IRQ_TYPE_NESTED (1) #define VFIO_IRQ_SUBTYPE_DMA_FAULT (1) @@ -815,7 +831,8 @@ enum { VFIO_PCI_MSIX_IRQ_INDEX, VFIO_PCI_ERR_IRQ_INDEX, VFIO_PCI_REQ_IRQ_INDEX, - VFIO_PCI_NUM_IRQS + VFIO_PCI_NUM_IRQS = 5 /* Fixed user ABI, IRQ indexes >=5 use */ + /* device specific cap to define content */ }; /* -- Gitee From fbc90d9ad90131ce574390bcc7dcf3fe2949c942 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 9 Nov 2021 16:44:16 +0800 Subject: [PATCH 1807/2161] vfio/pci: Add framework for custom interrupt indices commit 8cc83ea3471985650f434d9d92e671866ab94ea9 Intel-BKC. Implement IRQ capability chain infrastructure. All interrupt indexes beyond VFIO_PCI_NUM_IRQS are handled as extended interrupts. 
They are registered with a specific type/subtype Signed-off-by: Eric Auger Signed-off-by: Liu Yi L Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_core.c | 89 ++++++++++++++++++++++++++----- drivers/vfio/pci/vfio_pci_intrs.c | 62 +++++++++++++++++++++ include/linux/vfio_pci_core.h | 14 +++++ 3 files changed, 153 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 6d4bbda71701..0a4634533e2a 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -511,6 +511,14 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) vdev->irq_type, 0, 0, NULL); WARN_ON(iommu_unregister_device_fault_handler(&vdev->pdev->dev)); + for (i = 0; i < vdev->num_ext_irqs; i++) + vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | + VFIO_IRQ_SET_ACTION_TRIGGER, + VFIO_PCI_NUM_IRQS + i, 0, 0, NULL); + vdev->num_ext_irqs = 0; + kfree(vdev->ext_irqs); + vdev->ext_irqs = NULL; + /* Device closed, don't need mutex here */ list_for_each_entry_safe(ioeventfd, ioeventfd_tmp, &vdev->ioeventfds_list, next) { @@ -695,6 +703,9 @@ static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_typ return 1; } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { return 1; + } else if (irq_type >= VFIO_PCI_NUM_IRQS && + irq_type < VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs) { + return 1; } return 0; @@ -851,7 +862,7 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, info.flags |= VFIO_DEVICE_FLAGS_RESET; info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; - info.num_irqs = VFIO_PCI_NUM_IRQS; + info.num_irqs = VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs; ret = vfio_pci_info_zdev_add_caps(vdev, &caps); if (ret && ret != -ENODEV) { @@ -1027,36 +1038,89 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { struct vfio_irq_info info; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + unsigned long capsz; minsz = offsetofend(struct vfio_irq_info, count); + /* For backward compatibility, cannot require this */ + capsz = offsetofend(struct vfio_irq_info, cap_offset); + if (copy_from_user(&info, (void __user *)arg, minsz)) return -EFAULT; - if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) + if (info.argsz < minsz || + info.index >= VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs) return -EINVAL; + if (info.argsz >= capsz) + minsz = capsz; + + info.flags = VFIO_IRQ_INFO_EVENTFD; + switch (info.index) { - case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: + case VFIO_PCI_INTX_IRQ_INDEX: + info.flags |= (VFIO_IRQ_INFO_MASKABLE | + VFIO_IRQ_INFO_AUTOMASKED); + break; + case VFIO_PCI_MSI_IRQ_INDEX ... 
VFIO_PCI_MSIX_IRQ_INDEX: case VFIO_PCI_REQ_IRQ_INDEX: + info.flags |= VFIO_IRQ_INFO_NORESIZE; break; case VFIO_PCI_ERR_IRQ_INDEX: - if (pci_is_pcie(vdev->pdev)) - break; + info.flags |= VFIO_IRQ_INFO_NORESIZE; + if (!pci_is_pcie(vdev->pdev)) + return -EINVAL; + break; fallthrough; default: - return -EINVAL; + { + struct vfio_irq_info_cap_type cap_type = { + .header.id = VFIO_IRQ_INFO_CAP_TYPE, + .header.version = 1 }; + int ret, i; + if (info.index >= VFIO_PCI_NUM_IRQS + + vdev->num_ext_irqs) + return -EINVAL; + info.index = array_index_nospec(info.index, + VFIO_PCI_NUM_IRQS + + vdev->num_ext_irqs); + i = info.index - VFIO_PCI_NUM_IRQS; + + info.flags = vdev->ext_irqs[i].flags; + cap_type.type = vdev->ext_irqs[i].type; + cap_type.subtype = vdev->ext_irqs[i].subtype; + + ret = vfio_info_add_capability(&caps, + &cap_type.header, + sizeof(cap_type)); + if (ret) + return ret; + } } info.flags = VFIO_IRQ_INFO_EVENTFD; info.count = vfio_pci_get_irq_count(vdev, info.index); - if (info.index == VFIO_PCI_INTX_IRQ_INDEX) - info.flags |= (VFIO_IRQ_INFO_MASKABLE | - VFIO_IRQ_INFO_AUTOMASKED); - else - info.flags |= VFIO_IRQ_INFO_NORESIZE; + if (caps.size) { + info.flags |= VFIO_IRQ_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + + sizeof(info), caps.buf, + caps.size)) { + kfree(caps.buf); + return -EFAULT; + } + info.cap_offset = sizeof(info); + } + + kfree(caps.buf); + } return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; @@ -1075,7 +1139,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, max = vfio_pci_get_irq_count(vdev, hdr.index); ret = vfio_set_irqs_validate_and_prepare(&hdr, max, - VFIO_PCI_NUM_IRQS, &data_size); + VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs, + &data_size); if (ret) return ret; diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 6069a11fb51a..05e021096f22 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -635,6 +636,24 @@ static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev, count, flags, data); } +static int vfio_pci_set_ext_irq_trigger(struct vfio_pci_core_device *vdev, + unsigned int index, unsigned int start, + unsigned int count, uint32_t flags, + void *data) +{ + int i; + + if (start != 0 || count > 1) + return -EINVAL; + + index = array_index_nospec(index, + VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs); + i = index - VFIO_PCI_NUM_IRQS; + + return vfio_pci_set_ctx_trigger_single(&vdev->ext_irqs[i].trigger, + count, flags, data); +} + int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, void *data) @@ -684,6 +703,13 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, break; } break; + default: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_TRIGGER: + func = vfio_pci_set_ext_irq_trigger; + break; + } + break; } if (!func) @@ -691,3 +717,39 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, return func(vdev, index, start, count, flags, data); } + +int vfio_pci_get_ext_irq_index(struct vfio_pci_core_device *vdev, + unsigned int type, unsigned int subtype) +{ + int i; + + for (i = 0; i < vdev->num_ext_irqs; i++) { + if (vdev->ext_irqs[i].type == type && 
+ vdev->ext_irqs[i].subtype == subtype) { + return i; + } + } + return -EINVAL; +} + +int vfio_pci_register_irq(struct vfio_pci_core_device *vdev, + unsigned int type, unsigned int subtype, + u32 flags) +{ + struct vfio_ext_irq *ext_irqs; + + ext_irqs = krealloc(vdev->ext_irqs, + (vdev->num_ext_irqs + 1) * sizeof(*ext_irqs), + GFP_KERNEL); + if (!ext_irqs) + return -ENOMEM; + + vdev->ext_irqs = ext_irqs; + + vdev->ext_irqs[vdev->num_ext_irqs].type = type; + vdev->ext_irqs[vdev->num_ext_irqs].subtype = subtype; + vdev->ext_irqs[vdev->num_ext_irqs].flags = flags; + vdev->ext_irqs[vdev->num_ext_irqs].trigger = NULL; + vdev->num_ext_irqs++; + return 0; +} diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 0a3f6318c18e..76a874c6e886 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -78,6 +78,13 @@ struct vfio_pci_region { u32 flags; }; +struct vfio_ext_irq { + u32 type; + u32 subtype; + u32 flags; + struct eventfd_ctx *trigger; +}; + struct vfio_pci_dummy_resource { struct resource resource; int index; @@ -108,6 +115,8 @@ struct vfio_pci_core_device { struct vfio_pci_irq_ctx *ctx; int num_ctx; int irq_type; + struct vfio_ext_irq *ext_irqs; + int num_ext_irqs; int num_regions; struct vfio_pci_region *region; u8 msi_qmax; @@ -149,6 +158,11 @@ struct vfio_pci_core_device { extern void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); extern void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); +extern int vfio_pci_register_irq(struct vfio_pci_core_device *vdev, + unsigned int type, unsigned int subtype, + u32 flags); +extern int vfio_pci_get_ext_irq_index(struct vfio_pci_core_device *vdev, + unsigned int type, unsigned int subtype); extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, unsigned index, -- Gitee From 9b70b5416c4243a2b3648b0b8dfaab6919556956 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Tue, 9 Nov 2021 16:56:14 +0800 Subject: [PATCH 1808/2161] vfio_pci: A fix to ext_irq support commit 13074da48e6b6a329f75a0dd63aa0a40fffa55f0 Intel-BKC. Two issues: a) VFIO code uses VFIO_PCI_NUM_IRQS as invalid value for vdev->irq_type field, but with the introduction of ext_irqs, VFIO_PCI_NUM_IRQS will be number 0 of ext_irqs. So the clearup in vfio_pci_disable() for vdev->irq_type should have a check. b) The ext_irqs is not exposed to userspace if nesting type iommu is configured by userspace. e.g. no scalable modev virtual Intel IOMMU. So VFIO code should check before heading to do cleanup for the ext_irqs. 
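Taken together, patches 1806 to 1808 give userspace a discovery protocol for these extended interrupts: indexes below VFIO_PCI_NUM_IRQS keep their fixed meaning, and anything at or above it must be identified by the VFIO_IRQ_INFO_CAP_TYPE capability returned through VFIO_DEVICE_GET_IRQ_INFO. The sketch below shows one way a userspace driver might locate a given type/subtype; it assumes the patched UAPI headers, and num_irqs is the count reported by VFIO_DEVICE_GET_INFO.

#include <linux/vfio.h>
#include <string.h>
#include <sys/ioctl.h>

/* Hedged sketch: return the IRQ index advertising the wanted type/subtype
 * (e.g. VFIO_IRQ_TYPE_NESTED / VFIO_IRQ_SUBTYPE_DMA_FAULT), or -1. */
static int find_ext_irq_index(int device_fd, unsigned int num_irqs,
			      __u32 type, __u32 subtype)
{
	for (unsigned int idx = VFIO_PCI_NUM_IRQS; idx < num_irqs; idx++) {
		struct {
			struct vfio_irq_info info;
			char caps[256];          /* room for the capability chain */
		} arg;

		memset(&arg, 0, sizeof(arg));
		arg.info.argsz = sizeof(arg);
		arg.info.index = idx;
		if (ioctl(device_fd, VFIO_DEVICE_GET_IRQ_INFO, &arg))
			continue;
		if (!(arg.info.flags & VFIO_IRQ_INFO_FLAG_CAPS))
			continue;

		/* cap_offset is relative to the start of the user buffer */
		for (__u32 off = arg.info.cap_offset; off; ) {
			struct vfio_info_cap_header *hdr =
				(struct vfio_info_cap_header *)((char *)&arg + off);

			if (hdr->id == VFIO_IRQ_INFO_CAP_TYPE) {
				struct vfio_irq_info_cap_type *cap =
					(struct vfio_irq_info_cap_type *)hdr;

				if (cap->type == type && cap->subtype == subtype)
					return (int)idx;
			}
			off = hdr->next;
		}
	}
	return -1;
}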
Signed-off-by: Liu Yi L Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_core.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 0a4634533e2a..32d7f036b2a2 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -506,18 +506,24 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) /* Stop the device from further DMA */ pci_clear_master(pdev); - vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | - VFIO_IRQ_SET_ACTION_TRIGGER, - vdev->irq_type, 0, 0, NULL); + if (vdev->irq_type < VFIO_PCI_NUM_IRQS) + vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | + VFIO_IRQ_SET_ACTION_TRIGGER, + vdev->irq_type, 0, 0, NULL); + WARN_ON(iommu_unregister_device_fault_handler(&vdev->pdev->dev)); - for (i = 0; i < vdev->num_ext_irqs; i++) - vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | + if (vdev->ext_irqs) { + for (i = 0; i < vdev->num_ext_irqs; i++) + if (vdev->ext_irqs[i].trigger) + vfio_pci_set_irqs_ioctl( + vdev, VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, VFIO_PCI_NUM_IRQS + i, 0, 0, NULL); - vdev->num_ext_irqs = 0; - kfree(vdev->ext_irqs); - vdev->ext_irqs = NULL; + vdev->num_ext_irqs = 0; + kfree(vdev->ext_irqs); + vdev->ext_irqs = NULL; + } /* Device closed, don't need mutex here */ list_for_each_entry_safe(ioeventfd, ioeventfd_tmp, -- Gitee From 33f33b51b86623e5a7def4557aefc1be5de89e03 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 9 Nov 2021 17:02:34 +0800 Subject: [PATCH 1809/2161] vfio/pci: Register and allow DMA FAULT IRQ signaling commit 70b9944e77cd6d885e54f49812493fd295df842c Intel-BKC. Register the VFIO_IRQ_TYPE_NESTED/VFIO_IRQ_SUBTYPE_DMA_FAULT IRQ that allows to signal a nested mode DMA fault. 
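Once the index is known (for example via the capability walk sketched earlier), userspace arms this interrupt like any other VFIO IRQ: it hands the kernel an eventfd through VFIO_DEVICE_SET_IRQS and waits on it; the eventfd_signal() this patch adds to the fault handler is what wakes the waiter. The following is a minimal, hedged sketch using the standard VFIO_DEVICE_SET_IRQS UAPI, with error handling trimmed; on success it returns the armed eventfd after the first fault has been signalled.

#include <linux/vfio.h>
#include <poll.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <unistd.h>

static int wait_for_dma_fault(int device_fd, uint32_t fault_irq_index)
{
	size_t argsz = sizeof(struct vfio_irq_set) + sizeof(int32_t);
	struct vfio_irq_set *set = calloc(1, argsz);
	struct pollfd pfd;
	uint64_t cnt;
	int32_t efd;

	if (!set)
		return -1;
	efd = eventfd(0, EFD_CLOEXEC);
	if (efd < 0) {
		free(set);
		return -1;
	}

	set->argsz = argsz;
	set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
	set->index = fault_irq_index;        /* extended index >= VFIO_PCI_NUM_IRQS */
	set->start = 0;
	set->count = 1;
	memcpy(set->data, &efd, sizeof(efd));
	if (ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set)) {
		free(set);
		close(efd);
		return -1;
	}
	free(set);

	pfd.fd = efd;
	pfd.events = POLLIN;
	if (poll(&pfd, 1, -1) == 1 &&
	    read(efd, &cnt, sizeof(cnt)) == sizeof(cnt))
		return efd;   /* a fault was queued; caller drains the ring */

	close(efd);
	return -1;
}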
Signed-off-by: Eric Auger Signed-off-by: Liu Yi L Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_core.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 32d7f036b2a2..f718da96cb47 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -324,8 +324,8 @@ int vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, void *data) struct vfio_pci_core_device *vdev = (struct vfio_pci_core_device *)data; struct vfio_region_dma_fault *reg = (struct vfio_region_dma_fault *)vdev->fault_pages; + int head, tail, size, ext_irq_index; struct iommu_fault *new; - int head, tail, size; int ret = 0; if (WARN_ON(!reg)) @@ -352,7 +352,19 @@ int vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, void *data) reg->head = (head + 1) % size; unlock: mutex_unlock(&vdev->fault_queue_lock); - return ret; + if (ret) + return ret; + + ext_irq_index = vfio_pci_get_ext_irq_index(vdev, VFIO_IRQ_TYPE_NESTED, + VFIO_IRQ_SUBTYPE_DMA_FAULT); + if (ext_irq_index < 0) + return -EINVAL; + + mutex_lock(&vdev->igate); + if (vdev->ext_irqs[ext_irq_index].trigger) + eventfd_signal(vdev->ext_irqs[ext_irq_index].trigger, 1); + mutex_unlock(&vdev->igate); + return 0; } #define DMA_FAULT_RING_LENGTH 512 @@ -408,6 +420,12 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_core_device *vdev) if (ret) /* the dma fault region is freed in vfio_pci_disable() */ goto out; + ret = vfio_pci_register_irq(vdev, VFIO_IRQ_TYPE_NESTED, + VFIO_IRQ_SUBTYPE_DMA_FAULT, + VFIO_IRQ_INFO_EVENTFD); + if (ret) /* the fault handler is also freed in vfio_pci_disable() */ + goto out; + return 0; out: kfree(vdev->fault_pages); -- Gitee From 72ab33f808c44bb64251b71ceed9d2870220d760 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 9 Nov 2021 21:59:45 +0800 Subject: [PATCH 1810/2161] vfio/pci: check fault type before reporting commit aecb358a34a19f0a9d40c85d63e0e4d5cfa3fe20 Intel-BKC. IOMMU detected device faults need to be injected into the guest if the guest owns the resource. e.g. Page requests of 1st level faults has to be handled by the guest by calling handle_mm_fault(). Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index f718da96cb47..fac6691c3a49 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -334,7 +334,9 @@ int vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, void *data) new = (struct iommu_fault *)(vdev->fault_pages + reg->offset + reg->head * reg->entry_size); - if (fault->type != IOMMU_FAULT_DMA_UNRECOV) + /* We need to send page request and relavent unrecoverable fault to userspace */ + if (fault->type != IOMMU_FAULT_DMA_UNRECOV && + fault->type != IOMMU_FAULT_PAGE_REQ) return -ENOENT; mutex_lock(&vdev->fault_queue_lock); -- Gitee From 890457d1e4725723239bca1301cb6b680781917e Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Sat, 27 Jun 2020 20:45:50 -0700 Subject: [PATCH 1811/2161] vfio/type1: Handle page response from VM commit 4ec21bad7482c0736f437a4e0fb874ecbb010889 Intel-BKC. 
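The unrecoverable faults and page requests queued above are only useful if the guest's answer can travel back, which is what the next patch provides through a new PAGE_RESP operation on the VFIO_IOMMU_NESTING_OP ioctl. Purely as an illustration of the calling convention, and not code from this series, a VMM might forward the response it built while emulating the vIOMMU roughly as below; the response is treated as an opaque struct iommu_page_response blob, and the header-followed-by-payload layout is an assumption read off the minsz arithmetic in the kernel code that follows.

#include <linux/vfio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

/* Hedged sketch: push a guest page response down to the host IOMMU through
 * the container's nesting-op ioctl added by the next patch. */
static int vfio_send_page_response(int container_fd,
				   const void *resp, size_t resp_size)
{
	size_t argsz = sizeof(struct vfio_iommu_type1_nesting_op) + resp_size;
	struct vfio_iommu_type1_nesting_op *op = calloc(1, argsz);
	int ret;

	if (!op)
		return -1;

	op->argsz = argsz;
	op->flags = VFIO_IOMMU_NESTING_OP_PAGE_RESP;   /* op is selected via flags */
	memcpy(op + 1, resp, resp_size);               /* payload right after header */

	ret = ioctl(container_fd, VFIO_IOMMU_NESTING_OP, op);
	free(op);
	return ret;
}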
Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 37 +++++++++++++++++++++++++++++++++ include/uapi/linux/vfio.h | 3 +++ 2 files changed, 40 insertions(+) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index a26943d992d7..3e18e3a4c09e 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -3480,6 +3480,40 @@ static long vfio_iommu_invalidate_cache(struct vfio_iommu *iommu, return ret; } +static int vfio_dev_page_resp_fn(struct device *dev, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + unsigned long arg = *(unsigned long *) dc->data; + + return iommu_page_response(dc->domain, dev, (void __user *) arg); +} + +static long vfio_iommu_page_response(struct vfio_iommu *iommu, + unsigned long arg) +{ + struct domain_capsule dc = { .data = &arg }; + struct iommu_nesting_info *info; + int ret; + + mutex_lock(&iommu->lock); + info = iommu->nesting_info; + if (!info || !(info->features & IOMMU_NESTING_FEAT_PAGE_RESP)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = vfio_prepare_nesting_domain_capsule(iommu, &dc); + if (ret) + goto out_unlock; + + ret = iommu_group_for_each_dev(dc.group->iommu_group, &dc, + vfio_dev_page_resp_fn); + +out_unlock: + mutex_unlock(&iommu->lock); + return ret; +} + static long vfio_iommu_type1_nesting_op(struct vfio_iommu *iommu, unsigned long arg) { @@ -3505,6 +3539,9 @@ static long vfio_iommu_type1_nesting_op(struct vfio_iommu *iommu, case VFIO_IOMMU_NESTING_OP_CACHE_INVLD: ret = vfio_iommu_invalidate_cache(iommu, arg + minsz); break; + case VFIO_IOMMU_NESTING_OP_PAGE_RESP: + ret = vfio_iommu_page_response(iommu, arg + minsz); + break; default: ret = -EINVAL; } diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 728801ea7fab..03997de836c8 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1267,6 +1267,8 @@ struct vfio_iommu_type1_dirty_bitmap_get { * +-----------------+-----------------------------------------------+ * | CACHE_INVLD | struct iommu_cache_invalidate_info | * +-----------------+-----------------------------------------------+ + * | PAGE_RESP | struct iommu_page_response | + * +-----------------+-----------------------------------------------+ * * returns: 0 on success, -errno on failure. */ @@ -1281,6 +1283,7 @@ enum { VFIO_IOMMU_NESTING_OP_BIND_PGTBL, VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL, VFIO_IOMMU_NESTING_OP_CACHE_INVLD, + VFIO_IOMMU_NESTING_OP_PAGE_RESP, }; #define VFIO_IOMMU_NESTING_OP _IO(VFIO_TYPE, VFIO_BASE + 18) -- Gitee From 81cfed4559ab6ed421c989153b8990cea7d35a50 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Sat, 27 Jun 2020 20:49:02 -0700 Subject: [PATCH 1812/2161] vfio/type1: Support page response for IOMMU-capable Mdev commit 64e63caa9932b43086dfc5523bacef680443518d Intel-BKC. VFIO should pass the physical device info for IOMMU-capable mdev. 
Details can refer to below commit: "vfio/type1: Add vSVA support for IOMMU-backed mdevs" Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 3e18e3a4c09e..0378055698dc 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -3484,8 +3484,14 @@ static int vfio_dev_page_resp_fn(struct device *dev, void *data) { struct domain_capsule *dc = (struct domain_capsule *)data; unsigned long arg = *(unsigned long *) dc->data; + struct device *iommu_device; + + iommu_device = vfio_get_iommu_device(dc->group, dev); + if (!iommu_device) + return -EINVAL; - return iommu_page_response(dc->domain, dev, (void __user *) arg); + return iommu_page_response(dc->domain, iommu_device, + (void __user *) arg); } static long vfio_iommu_page_response(struct vfio_iommu *iommu, -- Gitee From 6115444a8b6f487f6328f231413d9b1133640fa0 Mon Sep 17 00:00:00 2001 From: "Liu, Yi L" Date: Sun, 4 Oct 2020 00:08:22 -0700 Subject: [PATCH 1813/2161] vfio/type1: Add debug log for vSVA and vPRQ path commit 6a6960728728aa5fddd1bb6365ba5a9350b07770 Intel-BKC. Signed-off-by: Liu, Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 0378055698dc..6caeca1a9150 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2737,6 +2737,7 @@ static int vfio_dev_bind_gpasid_fn(struct device *dev, void *data) struct device *iommu_device; void *iommu_fault_data = NULL; + printk("%s, - arg: 0x%lx\n", __func__, arg); iommu_device = vfio_get_iommu_device(dc->group, dev); if (!iommu_device) return -EINVAL; @@ -2744,6 +2745,8 @@ static int vfio_dev_bind_gpasid_fn(struct device *dev, void *data) if (iommu_device != dev) iommu_fault_data = mdev_get_iommu_fault_data(mdev); + printk("%s: iommu_fault_data: %llx\n", __func__, (unsigned long long) iommu_fault_data); + return iommu_uapi_sva_bind_gpasid(dc->domain, iommu_device, (void __user *)arg, iommu_fault_data); @@ -2769,6 +2772,7 @@ static int vfio_dev_unbind_gpasid_fn(struct device *dev, void *data) if (dc->user) { unsigned long arg = *(unsigned long *)dc->data; + printk("%s, - arg: 0x%lx\n", __func__, arg); iommu_uapi_sva_unbind_gpasid(dc->domain, iommu_device, (void __user *)arg); } else { @@ -3445,6 +3449,7 @@ static int vfio_dev_cache_invalidate_fn(struct device *dev, void *data) unsigned long arg = *(unsigned long *)dc->data; struct device *iommu_device; + printk("%s, - arg: 0x%lx\n", __func__, arg); iommu_device = vfio_get_iommu_device(dc->group, dev); if (!iommu_device) return -EINVAL; @@ -3486,6 +3491,7 @@ static int vfio_dev_page_resp_fn(struct device *dev, void *data) unsigned long arg = *(unsigned long *) dc->data; struct device *iommu_device; + printk("%s, - arg: 0x%lx\n", __func__, arg); iommu_device = vfio_get_iommu_device(dc->group, dev); if (!iommu_device) return -EINVAL; @@ -3529,21 +3535,29 @@ static long vfio_iommu_type1_nesting_op(struct vfio_iommu *iommu, minsz = offsetofend(struct vfio_iommu_type1_nesting_op, flags); + printk("%s, - 1, arg: 0x%lx, minsz: %u\n", __func__, arg, minsz); if (copy_from_user(&hdr, (void __user *)arg, minsz)) return -EFAULT; if (hdr.argsz < minsz || hdr.flags & ~VFIO_NESTING_OP_MASK) return -EINVAL; + printk("%s, - 2\n", 
__func__); switch (hdr.flags & VFIO_NESTING_OP_MASK) { case VFIO_IOMMU_NESTING_OP_BIND_PGTBL: + printk("%s, bind - 1\n", __func__); ret = vfio_iommu_handle_pgtbl_op(iommu, true, arg + minsz); + printk("%s, bind - 2, ret: %d\n", __func__, ret); break; case VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL: + printk("%s, unbind - 1\n", __func__); ret = vfio_iommu_handle_pgtbl_op(iommu, false, arg + minsz); + printk("%s, unbind - 2, ret: %d\n", __func__, ret); break; case VFIO_IOMMU_NESTING_OP_CACHE_INVLD: + printk("%s, cache_inv - 1\n", __func__); ret = vfio_iommu_invalidate_cache(iommu, arg + minsz); + printk("%s, cache_inv - 2, ret: %d\n", __func__, ret); break; case VFIO_IOMMU_NESTING_OP_PAGE_RESP: ret = vfio_iommu_page_response(iommu, arg + minsz); -- Gitee From b23ed4f3fc93a69fb4a5e6b4e50d53fd309a0bd5 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Wed, 10 Nov 2021 00:00:20 +0800 Subject: [PATCH 1814/2161] vfio/pci: Add debug log in PRQ reporting commit 8b7aedf3f5d5ba53188fe8c31e7b6c4433b3bc52 Intel-BKC. Signed-off-by: Liu Yi L Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index fac6691c3a49..5aa8b736a9a4 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -341,6 +341,7 @@ int vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, void *data) mutex_lock(&vdev->fault_queue_lock); + dev_info(&vdev->pdev->dev, "%s, enque fault event\n", __func__); head = reg->head; tail = reg->tail; size = reg->nb_entries; @@ -363,6 +364,7 @@ int vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, void *data) return -EINVAL; mutex_lock(&vdev->igate); + dev_info(&vdev->pdev->dev, "%s, signal userspace!\n", __func__); if (vdev->ext_irqs[ext_irq_index].trigger) eventfd_signal(vdev->ext_irqs[ext_irq_index].trigger, 1); mutex_unlock(&vdev->igate); -- Gitee From bdee48559671cb26935ff5e7133fd3bd7a54676e Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Mon, 13 Jan 2020 16:20:48 +0800 Subject: [PATCH 1815/2161] vfio/pci: Emulate PASID/PRI capability for VFs commit ec648557c99507031c09afdcbc01a7f64d6c3c2a Intel-BKC. Per PCIe r5.0, sec 9.3.7.14, if a PF implements the PASID Capability, the PF PASID configuration is shared by its VFs. VFs must not implement their own PASID Capability. Per PCIe r5.0, sec 9.3.7.11, VFs must not implement the PRI Capability. If the PF implements PRI, it is shared by the VFs. On bare metal, it has been fixed by below efforts. to PASID/PRI are https://lkml.org/lkml/2019/9/5/996 https://lkml.org/lkml/2019/9/5/995 This patch adds emulated PASID/PRI capabilities for VFs when assigned to VMs via vfio-pci driver. This is required for enabling vSVA on pass-through VFs as VFs have no PASID/PRI capability structure in its configure space. TODO: this patch will be phase out for only PRF vSVA support. 
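Since the emulated capabilities are appended to the virtual config space right after the last physical extended capability, nothing special is needed to find them: the guest, or a host-side test tool reading the device's VFIO config region, simply walks the standard PCIe extended capability list from offset 0x100. The walker below is a generic hedged sketch rather than code from this series; cfg_off is the config-region file offset reported by VFIO_DEVICE_GET_REGION_INFO, and 0x1b and 0x13 are the architected PASID and PRI extended capability IDs.

#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>

/* Hedged sketch: return the config-space offset of a PCIe extended capability
 * (0 if absent, -1 on read error), reading through the VFIO config region. */
static int ext_cap_offset(int device_fd, off_t cfg_off, uint16_t wanted_id)
{
	uint16_t pos = 0x100;              /* extended capabilities start here */
	int loops = (4096 - 0x100) / 4;    /* bound the walk against bad lists */
	uint32_t hdr;

	while (pos && loops--) {
		if (pread(device_fd, &hdr, sizeof(hdr), cfg_off + pos) != sizeof(hdr))
			return -1;
		if ((hdr & 0xffff) == wanted_id)
			return pos;                /* capability header found */
		pos = (hdr >> 20) & 0xffc;         /* next pointer, dword aligned */
	}
	return 0;
}

/* e.g. ext_cap_offset(fd, cfg_off, 0x1b) for PASID, 0x13 for PRI */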
Cc: Kevin Tian CC: Jacob Pan Cc: Alex Williamson Cc: Eric Auger Cc: Jean-Philippe Brucker Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_config.c | 327 ++++++++++++++++++++++++++++- 1 file changed, 325 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index a20ab95b1b89..3604920a41e7 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -1549,11 +1549,302 @@ static int vfio_cap_init(struct vfio_pci_core_device *vdev) return 0; } +static int vfio_fill_custom_vconfig_bytes(struct vfio_pci_core_device *vdev, + int offset, uint8_t *data, int size) +{ + int ret = 0, data_offset = 0; + + while (size) { + int filled; + + if (size >= 4 && !(offset % 4)) { + __le32 *dwordp = (__le32 *)&vdev->vconfig[offset]; + u32 dword; + + memcpy(&dword, data + data_offset, 4); + *dwordp = cpu_to_le32(dword); + filled = 4; + } else if (size >= 2 && !(offset % 2)) { + __le16 *wordp = (__le16 *)&vdev->vconfig[offset]; + u16 word; + + memcpy(&word, data + data_offset, 2); + *wordp = cpu_to_le16(word); + filled = 2; + } else { + u8 *byte = &vdev->vconfig[offset]; + + memcpy(byte, data + data_offset, 1); + filled = 1; + } + + offset += filled; + data_offset += filled; + size -= filled; + } + + return ret; +} + +static int vfio_pci_get_ecap_content(struct pci_dev *pdev, + int cap, int cap_len, u8 *content) +{ + int pos, offset, len = cap_len, ret = 0; + + pos = pci_find_ext_capability(pdev, cap); + if (!pos) + return -EINVAL; + + offset = 0; + while (len) { + int fetched; + + if (len >= 4 && !(pos % 4)) { + u32 *dwordp = (u32 *) (content + offset); + u32 dword; + __le32 *dwptr = (__le32 *) &dword; + + ret = pci_read_config_dword(pdev, pos, &dword); + if (ret) + return ret; + *dwordp = le32_to_cpu(*dwptr); + fetched = 4; + } else if (len >= 2 && !(pos % 2)) { + u16 *wordp = (u16 *) (content + offset); + u16 word; + __le16 *wptr = (__le16 *) &word; + + ret = pci_read_config_word(pdev, pos, &word); + if (ret) + return ret; + *wordp = le16_to_cpu(*wptr); + fetched = 2; + } else { + u8 *byte = (u8 *) (content + offset); + + ret = pci_read_config_byte(pdev, pos, byte); + if (ret) + return ret; + fetched = 1; + } + + pos += fetched; + offset += fetched; + len -= fetched; + } + + return ret; +} + +struct vfio_pci_pasid_cap_data { + u32 id:16; + u32 version:4; + u32 next:12; + union { + u16 cap_reg_val; + struct { + u16 rsv1:1; + u16 execs:1; + u16 prvs:1; + u16 rsv2:5; + u16 pasid_bits:5; + u16 rsv3:3; + }; + } cap_reg; + union { + u16 control_reg_val; + struct { + u16 paside:1; + u16 exece:1; + u16 prve:1; + u16 rsv4:13; + }; + } control_reg; +}; + +static int vfio_pci_add_pasid_cap(struct vfio_pci_core_device *vdev, + struct pci_dev *pdev, + u16 epos, u16 *next, __le32 **prevp) +{ + u8 *map = vdev->pci_config_map; + int ecap = PCI_EXT_CAP_ID_PASID; + int len = pci_ext_cap_length[ecap]; + struct vfio_pci_pasid_cap_data pasid_cap; + struct vfio_pci_pasid_cap_data vpasid_cap; + int ret; + + /* + * If no cap filled in this function, should make sure the next + * pointer points to current epos. 
+ */ + *next = epos; + + if (!len) { + pr_info("%s: VF %s hiding PASID capability\n", + __func__, dev_name(&vdev->pdev->dev)); + ret = 0; + goto out; + } + + /* Add PASID capability*/ + ret = vfio_pci_get_ecap_content(pdev, ecap, + len, (u8 *)&pasid_cap); + if (ret) + goto out; + + if (!pasid_cap.control_reg.paside) { + pr_debug("%s: its PF's PASID capability is not enabled\n", + dev_name(&vdev->pdev->dev)); + ret = 0; + goto out; + } + + memcpy(&vpasid_cap, &pasid_cap, len); + + vpasid_cap.next = 0; + /* clear the control reg for guest */ + memset(&vpasid_cap.control_reg, 0x0, + sizeof(vpasid_cap.control_reg)); + + memset(map + epos, vpasid_cap.id, len); + ret = vfio_fill_custom_vconfig_bytes(vdev, epos, + (u8 *)&vpasid_cap, len); + if (!ret) { + /* + * Successfully filled in PASID cap, update + * the next offset in previous cap header, + * and also update caller about the offset + * of next cap if any. + */ + u32 val = epos; + **prevp &= cpu_to_le32(~(0xffcU << 20)); + **prevp |= cpu_to_le32(val << 20); + *prevp = (__le32 *)&vdev->vconfig[epos]; + *next = epos + len; + } + +out: + return ret; +} + +struct vfio_pci_pri_cap_data { + u32 id:16; + u32 version:4; + u32 next:12; + union { + u16 control_reg_val; + struct { + u16 enable:1; + u16 reset:1; + u16 rsv1:14; + }; + } control_reg; + union { + u16 status_reg_val; + struct { + u16 rf:1; + u16 uprgi:1; + u16 rsv2:6; + u16 stop:1; + u16 rsv3:6; + u16 pasid_required:1; + }; + } status_reg; + u32 prq_capacity; + u32 prq_quota; +}; + +static int vfio_pci_add_pri_cap(struct vfio_pci_core_device *vdev, + struct pci_dev *pdev, + u16 epos, u16 *next, __le32 **prevp) +{ + u8 *map = vdev->pci_config_map; + int ecap = PCI_EXT_CAP_ID_PRI; + int len = pci_ext_cap_length[ecap]; + struct vfio_pci_pri_cap_data pri_cap; + struct vfio_pci_pri_cap_data vpri_cap; + int ret; + + /* + * If no cap filled in this function, should make sure the next + * pointer points to current epos. + */ + *next = epos; + + if (!len) { + pr_info("%s: VF %s hiding PRI capability\n", + __func__, dev_name(&vdev->pdev->dev)); + ret = 0; + goto out; + } + + /* Add PASID capability*/ + ret = vfio_pci_get_ecap_content(pdev, ecap, + len, (u8 *)&pri_cap); + if (ret) + goto out; + + if (!pri_cap.control_reg.enable) { + pr_debug("%s: its PF's PRI capability is not enabled\n", + dev_name(&vdev->pdev->dev)); + ret = 0; + goto out; + } + + memcpy(&vpri_cap, &pri_cap, len); + + vpri_cap.next = 0; + /* clear the control reg for guest */ + memset(&vpri_cap.control_reg, 0x0, + sizeof(vpri_cap.control_reg)); + + memset(map + epos, vpri_cap.id, len); + ret = vfio_fill_custom_vconfig_bytes(vdev, epos, + (u8 *)&vpri_cap, len); + if (!ret) { + /* + * Successfully filled in PASID cap, update + * the next offset in previous cap header, + * and also update caller about the offset + * of next cap if any. 
+ */ + u32 val = epos; + **prevp &= cpu_to_le32(~(0xffcU << 20)); + **prevp |= cpu_to_le32(val << 20); + *prevp = (__le32 *)&vdev->vconfig[epos]; + *next = epos + len; + } + +out: + return ret; +} + +static int vfio_pci_add_emulated_cap_for_vf(struct vfio_pci_core_device *vdev, + struct pci_dev *pdev, u16 start_epos, __le32 *prev) +{ + __le32 *__prev = prev; + u16 epos = start_epos, epos_next = start_epos; + int ret = 0; + + /* Add PASID capability*/ + ret = vfio_pci_add_pasid_cap(vdev, pdev, epos, + &epos_next, &__prev); + if (ret) + return ret; + + /* Add PRI capability */ + epos = epos_next; + ret = vfio_pci_add_pri_cap(vdev, pdev, epos, + &epos_next, &__prev); + + return ret; +} + static int vfio_ecap_init(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; u8 *map = vdev->pci_config_map; - u16 epos; + u16 epos, epos_max; __le32 *prev = NULL; int loops, ret, ecaps = 0; @@ -1561,6 +1852,7 @@ static int vfio_ecap_init(struct vfio_pci_core_device *vdev) return 0; epos = PCI_CFG_SPACE_SIZE; + epos_max = PCI_CFG_SPACE_SIZE; loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF; @@ -1585,6 +1877,9 @@ static int vfio_ecap_init(struct vfio_pci_core_device *vdev) } } + if (epos_max <= (epos + len)) + epos_max = epos + len; + if (!len) { pci_info(pdev, "%s: hiding ecap %#x@%#x\n", __func__, ecap, epos); @@ -1644,6 +1939,18 @@ static int vfio_ecap_init(struct vfio_pci_core_device *vdev) if (!ecaps) *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0; +#ifdef CONFIG_PCI_ATS + if (pdev->is_virtfn) { + struct pci_dev *physfn = pdev->physfn; + + ret = vfio_pci_add_emulated_cap_for_vf(vdev, + physfn, epos_max, prev); + if (ret) + pr_info("%s, failed to add special caps for VF %s\n", + __func__, dev_name(&vdev->pdev->dev)); + } +#endif + return 0; } @@ -1802,7 +2109,22 @@ static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_core_device *vdev, return i; } +<<<<<<< HEAD static ssize_t vfio_config_do_rw(struct vfio_pci_core_device *vdev, char __user *buf, +======= +static bool vfio_pci_need_virt_perm(struct pci_dev *pdev, u8 cap_id) +{ +#ifdef CONFIG_PCI_ATS + return (pdev->is_virtfn && + (cap_id == PCI_EXT_CAP_ID_PASID || + cap_id == PCI_EXT_CAP_ID_PRI)); +#else + return false; +#endif +} + +static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, +>>>>>>> 64511739a93d... vfio/pci: Emulate PASID/PRI capability for VFs size_t count, loff_t *ppos, bool iswrite) { struct pci_dev *pdev = vdev->pdev; @@ -1835,7 +2157,8 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_core_device *vdev, char __user if (cap_id == PCI_CAP_ID_INVALID) { perm = &unassigned_perms; cap_start = *ppos; - } else if (cap_id == PCI_CAP_ID_INVALID_VIRT) { + } else if (cap_id == PCI_CAP_ID_INVALID_VIRT || + vfio_pci_need_virt_perm(pdev, cap_id)) { perm = &virt_perms; cap_start = *ppos; } else { -- Gitee From f5eb521661c036607bc33f3ce404f2be8cb490e6 Mon Sep 17 00:00:00 2001 From: Cathy Zhang Date: Wed, 10 Nov 2021 17:14:19 +0800 Subject: [PATCH 1816/2161] fix for vfio/pci: Emulate PASID/PRI capability for VFs commit fd9fd02b384aa9506fbac637f5cd6386ad6198bb Intel-BKC. 
Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_config.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 3604920a41e7..6751606c0de6 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -2109,9 +2109,6 @@ static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_core_device *vdev, return i; } -<<<<<<< HEAD -static ssize_t vfio_config_do_rw(struct vfio_pci_core_device *vdev, char __user *buf, -======= static bool vfio_pci_need_virt_perm(struct pci_dev *pdev, u8 cap_id) { #ifdef CONFIG_PCI_ATS @@ -2123,8 +2120,7 @@ static bool vfio_pci_need_virt_perm(struct pci_dev *pdev, u8 cap_id) #endif } -static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, ->>>>>>> 64511739a93d... vfio/pci: Emulate PASID/PRI capability for VFs +static ssize_t vfio_config_do_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { struct pci_dev *pdev = vdev->pdev; -- Gitee From 5bf74fe5a64621072e7092fb44151e705c44ade4 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Fri, 25 Jun 2021 20:53:09 +0800 Subject: [PATCH 1817/2161] vSVA: Refine the logs to be debug devel commit 1e23206130895908256c5a4d22be1cac26a02fe2 Intel-BKC. Signed-off-by: Liu Yi L Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_core.c | 4 ++-- drivers/vfio/vfio_iommu_type1.c | 26 +++++++++++++------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 5aa8b736a9a4..0ac0b21b7ace 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -341,7 +341,7 @@ int vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, void *data) mutex_lock(&vdev->fault_queue_lock); - dev_info(&vdev->pdev->dev, "%s, enque fault event\n", __func__); + dev_dbg(&vdev->pdev->dev, "%s, enque fault event\n", __func__); head = reg->head; tail = reg->tail; size = reg->nb_entries; @@ -364,7 +364,7 @@ int vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, void *data) return -EINVAL; mutex_lock(&vdev->igate); - dev_info(&vdev->pdev->dev, "%s, signal userspace!\n", __func__); + dev_dbg(&vdev->pdev->dev, "%s, signal userspace!\n", __func__); if (vdev->ext_irqs[ext_irq_index].trigger) eventfd_signal(vdev->ext_irqs[ext_irq_index].trigger, 1); mutex_unlock(&vdev->igate); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 6caeca1a9150..06e87979a10c 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2737,7 +2737,7 @@ static int vfio_dev_bind_gpasid_fn(struct device *dev, void *data) struct device *iommu_device; void *iommu_fault_data = NULL; - printk("%s, - arg: 0x%lx\n", __func__, arg); + pr_debug("%s, - arg: 0x%lx\n", __func__, arg); iommu_device = vfio_get_iommu_device(dc->group, dev); if (!iommu_device) return -EINVAL; @@ -2745,7 +2745,7 @@ static int vfio_dev_bind_gpasid_fn(struct device *dev, void *data) if (iommu_device != dev) iommu_fault_data = mdev_get_iommu_fault_data(mdev); - printk("%s: iommu_fault_data: %llx\n", __func__, (unsigned long long) iommu_fault_data); + pr_debug("%s: iommu_fault_data: %llx\n", __func__, (unsigned long long) iommu_fault_data); return iommu_uapi_sva_bind_gpasid(dc->domain, iommu_device, (void __user *)arg, @@ -2772,7 +2772,7 @@ static int 
vfio_dev_unbind_gpasid_fn(struct device *dev, void *data) if (dc->user) { unsigned long arg = *(unsigned long *)dc->data; - printk("%s, - arg: 0x%lx\n", __func__, arg); + pr_debug("%s, - arg: 0x%lx\n", __func__, arg); iommu_uapi_sva_unbind_gpasid(dc->domain, iommu_device, (void __user *)arg); } else { @@ -3449,7 +3449,7 @@ static int vfio_dev_cache_invalidate_fn(struct device *dev, void *data) unsigned long arg = *(unsigned long *)dc->data; struct device *iommu_device; - printk("%s, - arg: 0x%lx\n", __func__, arg); + pr_debug("%s, - arg: 0x%lx\n", __func__, arg); iommu_device = vfio_get_iommu_device(dc->group, dev); if (!iommu_device) return -EINVAL; @@ -3491,7 +3491,7 @@ static int vfio_dev_page_resp_fn(struct device *dev, void *data) unsigned long arg = *(unsigned long *) dc->data; struct device *iommu_device; - printk("%s, - arg: 0x%lx\n", __func__, arg); + pr_debug("%s, - arg: 0x%lx\n", __func__, arg); iommu_device = vfio_get_iommu_device(dc->group, dev); if (!iommu_device) return -EINVAL; @@ -3535,29 +3535,29 @@ static long vfio_iommu_type1_nesting_op(struct vfio_iommu *iommu, minsz = offsetofend(struct vfio_iommu_type1_nesting_op, flags); - printk("%s, - 1, arg: 0x%lx, minsz: %u\n", __func__, arg, minsz); + pr_debug("%s, - 1, arg: 0x%lx, minsz: %u\n", __func__, arg, minsz); if (copy_from_user(&hdr, (void __user *)arg, minsz)) return -EFAULT; if (hdr.argsz < minsz || hdr.flags & ~VFIO_NESTING_OP_MASK) return -EINVAL; - printk("%s, - 2\n", __func__); + pr_debug("%s, - 2\n", __func__); switch (hdr.flags & VFIO_NESTING_OP_MASK) { case VFIO_IOMMU_NESTING_OP_BIND_PGTBL: - printk("%s, bind - 1\n", __func__); + pr_debug("%s, bind - 1\n", __func__); ret = vfio_iommu_handle_pgtbl_op(iommu, true, arg + minsz); - printk("%s, bind - 2, ret: %d\n", __func__, ret); + pr_debug("%s, bind - 2, ret: %d\n", __func__, ret); break; case VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL: - printk("%s, unbind - 1\n", __func__); + pr_debug("%s, unbind - 1\n", __func__); ret = vfio_iommu_handle_pgtbl_op(iommu, false, arg + minsz); - printk("%s, unbind - 2, ret: %d\n", __func__, ret); + pr_debug("%s, unbind - 2, ret: %d\n", __func__, ret); break; case VFIO_IOMMU_NESTING_OP_CACHE_INVLD: - printk("%s, cache_inv - 1\n", __func__); + pr_debug("%s, cache_inv - 1\n", __func__); ret = vfio_iommu_invalidate_cache(iommu, arg + minsz); - printk("%s, cache_inv - 2, ret: %d\n", __func__, ret); + pr_debug("%s, cache_inv - 2, ret: %d\n", __func__, ret); break; case VFIO_IOMMU_NESTING_OP_PAGE_RESP: ret = vfio_iommu_page_response(iommu, arg + minsz); -- Gitee From c3268b775b27efbb4386f5afffe4e7be9287e400 Mon Sep 17 00:00:00 2001 From: Cathy Zhang Date: Wed, 10 Nov 2021 13:52:19 +0800 Subject: [PATCH 1818/2161] vfio/pci: Remove the IOMMU_NESTING_FEAT_PAGE_RESP check in vfio_pci_dma_fault_init() commit 19a564fd89c7da1ebbc0fc280c866ab5979d1960 Intel-BKC. 
Signed-off-by: Liu Yi L Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_core.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 0ac0b21b7ace..2b9d369ca97c 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -376,20 +376,9 @@ int vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, void *data) static int vfio_pci_dma_fault_init(struct vfio_pci_core_device *vdev) { struct vfio_region_dma_fault *header; - struct iommu_domain *domain; - struct iommu_nesting_info info; size_t size; int ret; - domain = iommu_get_domain_for_dev(&vdev->pdev->dev); - ret = iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, &info); - if (ret || !(info.features & IOMMU_NESTING_FEAT_PAGE_RESP)) { - /* - * No need go futher as no page request service support. - */ - return 0; - } - mutex_init(&vdev->fault_queue_lock); /* -- Gitee From 96d3b72571d1ee571bfb585f8f09552e7634331e Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Thu, 22 Jul 2021 19:42:44 +0800 Subject: [PATCH 1819/2161] vfio/type1: Fix a bug for getting dirty page commit 0ec4157beeade6dd9c9e9a0cb00cab970a312c67 Intel-BKC. Current migration is based on the mdev framework, so the devices with migration capability are mdevs. Hence the vfio_dev_has_feature() check should check the feature on its parent device. Without this fix, userspace always sees all pages as dirty and hence there is no convergence in the pre-copy phase. Signed-off-by: Liu Yi L Signed-off-by: Yi Sun Signed-off-by: Cathy Zhang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 06e87979a10c..80baad0b90e5 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -1235,21 +1235,31 @@ static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu) } } +static struct device *vfio_get_iommu_device(struct vfio_iommu_group *group, + struct device *dev); + static int vfio_dev_enable_feature(struct device *dev, void *data) { - enum iommu_dev_features *feat = data; + struct domain_capsule *dc = data; + enum iommu_dev_features *feat = dc->data; + struct device *iommu_device; + + iommu_device = vfio_get_iommu_device(dc->group, dev); + if (!iommu_device) + return -EINVAL; - if (iommu_dev_feature_enabled(dev, *feat)) + if (iommu_dev_feature_enabled(iommu_device, *feat)) return 0; - return iommu_dev_enable_feature(dev, *feat); + return iommu_dev_enable_feature(iommu_device, *feat); } static bool vfio_group_supports_hwdbm(struct vfio_iommu_group *group) { enum iommu_dev_features feat = IOMMU_DEV_FEAT_HWDBM; + struct domain_capsule dc = { .group = group, .data = &feat, }; - if (iommu_group_for_each_dev(group->iommu_group, &feat, + if (iommu_group_for_each_dev(group->iommu_group, &dc, vfio_dev_enable_feature)) return false; -- Gitee From 5df152684e9ee91f70932b004344c3b28337901d Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Date: Tue, 13 Jul 2021 15:09:50 -0700 Subject: [PATCH 1820/2161] vfio/type1: Fix a bug in IOMMU dirty bit clearing commit e5dc368e0e2f45042feca0cded8ac7aac3227f23 Intel-BKC. Because of this bug, the IOMMU dirty bit was not getting cleared, so the IOMMU driver reports a large number of dirty pages and VM migration does not converge.
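For context, the bitmap these two fixes repair is what a live-migration manager polls during pre-copy: tracking is started once with VFIO_IOMMU_DIRTY_PAGES_FLAG_START, and each iteration then requests the bitmap for an IOVA range, which also rearms the IOMMU dirty log as shown above. The sketch below is illustrative only; it relies on the standard VFIO dirty-tracking UAPI referenced in the hunks above and assumes the caller sized bitmap_out to hold one bit per pgsize page of the range.

#include <linux/vfio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

/* Hedged sketch: fetch the dirty bitmap for [iova, iova + size). */
static int vfio_get_dirty_bitmap(int container_fd, __u64 iova, __u64 size,
				 __u64 pgsize, __u64 *bitmap_out)
{
	size_t argsz = sizeof(struct vfio_iommu_type1_dirty_bitmap) +
		       sizeof(struct vfio_iommu_type1_dirty_bitmap_get);
	struct vfio_iommu_type1_dirty_bitmap *db = calloc(1, argsz);
	struct vfio_iommu_type1_dirty_bitmap_get *range;
	int ret;

	if (!db)
		return -1;

	db->argsz = argsz;
	db->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
	range = (struct vfio_iommu_type1_dirty_bitmap_get *)db->data;
	range->iova = iova;
	range->size = size;
	range->bitmap.pgsize = pgsize;                         /* e.g. 4096 */
	range->bitmap.size = ((size / pgsize + 63) / 64) * 8;  /* bytes, u64 aligned */
	range->bitmap.data = bitmap_out;

	ret = ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, db);
	free(db);
	return ret;
}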
Signed-off-by: Sanjay Kumar Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 80baad0b90e5..72c46d0e0b9c 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -1372,14 +1372,6 @@ static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, if (ret) return ret; - /* - * Re-populate bitmap to include all pinned pages which are - * considered as dirty but exclude pages which are unpinned and - * pages which are marked dirty by vfio_dma_rw() - */ - bitmap_clear(dma->bitmap, 0, dma->size >> pgshift); - vfio_dma_populate_bitmap(dma, pgsize); - /* Clear iommu dirty log to re-enable dirty log tracking */ if (iommu->num_non_pinned_groups && dma->iommu_mapped && !iommu->num_non_hwdbm_groups) { @@ -1391,6 +1383,14 @@ static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, return ret; } } + /* + * Re-populate bitmap to include all pinned pages which are + * considered as dirty but exclude pages which are unpinned and + * pages which are marked dirty by vfio_dma_rw() + */ + bitmap_clear(dma->bitmap, 0, dma->size >> pgshift); + vfio_dma_populate_bitmap(dma, pgsize); + } return 0; } -- Gitee From 14628b2ad6c9a372f8bf815f872ad0b01a69b2a9 Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Mon, 22 Nov 2021 18:56:12 +0800 Subject: [PATCH 1821/2161] vfio: Support bind/unbind guest page table to default PASID commit 6d37212237974325b998dc8ab106f5a6e8cce798 Intel-BKC. For guest IOVA support under nesting IOMMU, hypervisor needs to bind/ unbind guest's IOVA page table to host. For such bind/unbind request from user space, host should figure out a target PASID (host default PASID) to be bound. This patch adds a flag to indicate host IOMMU driver if it needs to use default PASID in host. FIXME: pasid_set = NULL; // dmar_domain->pasid_set; Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/vfio_iommu_type1.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 72c46d0e0b9c..548e23278716 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2804,6 +2804,18 @@ static void vfio_group_unbind_gpasid_fn(ioasid_t pasid, void *data) dc, vfio_dev_unbind_gpasid_fn); } +static void vfio_group_unbind_default_gpasid(ioasid_t pasid, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + + dc->user = false; + dc->data = &pasid; + dc->flags = IOMMU_SVA_HPASID_DEF; + + iommu_group_for_each_dev(dc->group->iommu_group, + dc, vfio_dev_unbind_gpasid_fn); +} + static void vfio_iommu_type1_detach_group(void *iommu_data, struct iommu_group *iommu_group) { @@ -2867,6 +2879,11 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, vfio_group_unbind_gpasid_fn); ioasid_user_put(iuser); } + /* + * We should explicitly call interface to unbind default pasid gIOVA + * page table here. + */ + vfio_group_unbind_default_gpasid(0, &dc); } vfio_iommu_detach_group(domain, group); -- Gitee From a492fb30d288c8612064d231818256c953e5022b Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 30 Jun 2022 16:55:22 +0800 Subject: [PATCH 1822/2161] kvm: x86: add ENQCMD/ENQCMDS support commit b0b204bc4c16dbd919d63be5c94a0e249d53b737 Intel-BKC. 
ENQCMD (Enqueue Command) and ENQCMDS (Enqueue Command Supervisor) instructions allow software to write command data to device's enqueue registers together with PASID for workload submission. (ENQCMD obtains the PASID from the IA32_PASID MSR, ENQCMDS obtains the PASID from command data). When ENQCMD/ENQCMDS are used in guest (non-root mode), the guest-programmed PASID must be translated to host PASID before the command data store to the device. In order to support ENQCMD/ENQCMDS virtualization, a hardware assisted Guest to Host PASID Translation feature is added to VMX. To enable this new VMX PASID Translation feature for ENQCMD/ENQCMDS virtualization, this patch introduces a new PASID translation table in KVM. It's a per-vm table which is shared by all vcpus and used by hardwaware for Guest to Host PASID translation. Kvm manages this PASID translation table according to Guest to Host PASID mapping information from IOASID notification events. These events are from IOASID core which manages Host PASID and its association with guest PASID, including IOASID_NOTIFY_BIND - capture Guest PASID to Host PASID mapping into the translation table. IOASID_NOTIFY_UNBIND - clear Guest PASID to Host PASID mapping into the translation table. VM Exit is triggered by ENQCMD/ENQCMDS in guest (non-root mode) when PASID translation failure happends. So this patch adds related VM Exit handling too. It reports this failure to user by setting status flag of the ENQCMD/ENQCMDS instruction. This patch also adds a new vmx_vm_destroy function which is used to destroy PASID translation table when vm ends. v2: remove IOASID_FREE event handling. add more comments and descriptions, cleanup duplicated code. defer PASID translaiton table initialization to cpuid update. v3: fix naming in macro and functions. rebased against IOASID notification patchset v3. v4: resolve conflicts on PASID_MAX macro. Signed-off-by: Wu Hao Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/vmx.h | 43 +++++ arch/x86/include/uapi/asm/vmx.h | 6 +- arch/x86/kvm/vmx/capabilities.h | 6 + arch/x86/kvm/vmx/vmx.c | 284 ++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.h | 5 + 5 files changed, 343 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 1835767aa335..d32f830b924e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -66,6 +66,7 @@ #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_PT_CONCEAL_VMX 0x00080000 #define SECONDARY_EXEC_XSAVES 0x00100000 +#define SECONDARY_EXEC_PASID_TRANSLATION 0x00200000 #define SECONDARY_EXEC_PT_USE_GPA 0x01000000 #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC 0x00400000 #define SECONDARY_EXEC_TSC_SCALING 0x02000000 @@ -214,6 +215,10 @@ enum vmcs_field { ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, TSC_MULTIPLIER = 0x00002032, TSC_MULTIPLIER_HIGH = 0x00002033, + PASID_DIR0 = 0x00002038, + PASID_DIR0_HIGH = 0x00002039, + PASID_DIR1 = 0x0000203a, + PASID_DIR1_HIGH = 0x0000203b, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, @@ -589,4 +594,42 @@ enum vmx_l1d_flush_state { extern enum vmx_l1d_flush_state l1tf_vmx_mitigation; +/* + * The VMCS PASID Translation Table is a two-level data structure, including + * High/Low PASID Directory and PASID Table. Different fields of the Guest + * PASID are used to locate the PASID Table Entry which has the Host PASID. 
+ * + * High PASID Directory Select - Guest PASID Bit19 + * PASID Directory Entry Index - Guest PASID Bit18-10 + * PASID Table Entry Index - Guest PASID Bit9-0 + */ +#define pasid_high_dir_select(gpasid) (((gpasid) >> 19) & 0x1) +#define pasid_de_idx(gpasid) (((gpasid) >> 10) & 0x1ff) +#define pasid_te_idx(gpasid) ((gpasid) & 0x3ff) + +#define MAX_PASID (0Xfffff) +/* + * PASID Directory Entry + * + * PAISD Table Pointer - PASID Directory Entry BitM-1 + * PASID Table Present - PASID Directory Entry Bit0 + */ +#define PASID_DE_TAB_PTR (((u64)-1) << 12) +#define PASID_DE_TAB_PRESENT (1ULL << 0) +#define PASID_DE_NUM 512 +#define pasid_de_table_ptr(pde) (*(pde) & PASID_DE_TAB_PTR) +#define pasid_de_table_present(pde) (*(pde) & PASID_DE_TAB_PRESENT) + +/* + * PASID Table Entry + * + * Host PASID Valid - PASID Table Entry Bit31 + * Host PASID - PASID Table Entry Bit19-0 + */ +#define PASID_TE_VALID (1 << 31) +#define PASID_TE_HOST_PASID (0xfffff) +#define PASID_TE_NUM 1024 +#define pasid_te_hpasid_valid(pte) (*(pte) & PASID_TE_VALID) +#define pasid_te_hpasid(pte) (*(pte) & PASID_TE_HOST_PASID) + #endif diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 3eb8411ab60e..66d8f64d75a9 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -88,6 +88,8 @@ #define EXIT_REASON_XRSTORS 64 #define EXIT_REASON_UMWAIT 67 #define EXIT_REASON_TPAUSE 68 +#define EXIT_REASON_ENQCMD_PASID 72 +#define EXIT_REASON_ENQCMDS_PASID 73 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -148,7 +150,9 @@ { EXIT_REASON_XSAVES, "XSAVES" }, \ { EXIT_REASON_XRSTORS, "XRSTORS" }, \ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ - { EXIT_REASON_TPAUSE, "TPAUSE" } + { EXIT_REASON_TPAUSE, "TPAUSE" }, \ + { EXIT_REASON_ENQCMD_PASID, "ENQCMD_PASID" }, \ + { EXIT_REASON_ENQCMDS_PASID, "ENQCMDS_PASID" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 100cacde88fc..776e473b5850 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -248,6 +248,12 @@ static inline bool vmx_xsaves_supported(void) SECONDARY_EXEC_XSAVES; } +static inline bool cpu_has_vmx_pasid_trans(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_PASID_TRANSLATION; +} + static inline bool vmx_waitpkg_supported(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a359b8d6e542..939bdbc0d47c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -2403,6 +2405,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_PT_CONCEAL_VMX | SECONDARY_EXEC_ENABLE_VMFUNC | SECONDARY_EXEC_ENCLS_EXITING; + if (boot_cpu_has(X86_FEATURE_ENQCMD)) + opt2 |= SECONDARY_EXEC_PASID_TRANSLATION; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -4164,6 +4168,11 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } } + if (cpu_has_vmx_pasid_trans()) { + if (!guest_cpuid_has(vcpu, X86_FEATURE_ENQCMD)) + exec_control &= ~SECONDARY_EXEC_PASID_TRANSLATION; + } + vmx->secondary_exec_control = exec_control; } @@ -5587,6 +5596,27 @@ static int handle_encls(struct kvm_vcpu *vcpu) return 1; } +static int 
handle_enqcmd_pasid(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + + kvm_debug_ratelimited("[%s] VM exit_qualification=0x%lx\n", __func__, + vmcs_readl(EXIT_QUALIFICATION)); + + /* + * Valid PASID translation should exist before the guest attempts to do + * ENQCMD/ENQCMDS. Otherwise, VM exit is triggered if CPU executes + * ENQCMD/ENQCMDS in non-root mode but fails to translate the guest + * PASID latched in IA32_PASID MSR. In such case, set the EFLAGS.ZF to + * indicate the failure to the guest and skip the instruction. + */ + flags = vmx_get_rflags(vcpu); + flags |= X86_EFLAGS_ZF; + vmx_set_rflags(vcpu, flags); + + return kvm_skip_emulated_instruction(vcpu); +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -5643,6 +5673,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_VMFUNC] = handle_vmx_instruction, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, + [EXIT_REASON_ENQCMD_PASID] = handle_enqcmd_pasid, + [EXIT_REASON_ENQCMDS_PASID] = handle_enqcmd_pasid, }; static const int kvm_vmx_max_exit_handlers = @@ -6868,9 +6900,247 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" +static inline struct page *vmx_pasid_dir_page(struct kvm_vmx *kvm_vmx, + ioasid_t gpasid) +{ + if (!kvm_vmx->pasid_dirs) + return NULL; + + return pasid_high_dir_select(gpasid) ? 
+ &kvm_vmx->pasid_dirs[1] : &kvm_vmx->pasid_dirs[0]; +} + +/* hold pasid_lock when invoke this function */ +static u32 *vmx_find_pasid_table_entry(struct kvm_vmx *kvm_vmx, ioasid_t gpasid) +{ + struct page *pd_page = vmx_pasid_dir_page(kvm_vmx, gpasid); + u64 *pd_base, *pde; + u32 *pt_base; + + if (!pd_page) { + kvm_err("%s: PASID directory not found\n", __func__); + return ERR_PTR(-ENOENT); + } + + pd_base = page_address(pd_page); + pde = pd_base + pasid_de_idx(gpasid); + + if (!pasid_de_table_present(pde)) + return NULL; + + pt_base = __va(pasid_de_table_ptr(pde)); + return pt_base + pasid_te_idx(gpasid); +} + +/* hold pasid_lock when invoke this function */ +static u32 *vmx_alloc_pasid_table_entry(struct kvm_vmx *kvm_vmx, + ioasid_t gpasid) +{ + struct page *pt_page, *pd_page = vmx_pasid_dir_page(kvm_vmx, gpasid); + u64 *pd_base, *pde; + u32 *pt_base; + + if (!pd_page) { + kvm_err("%s: PASID directory not found\n", __func__); + return ERR_PTR(-ENOENT); + } + + pd_base = page_address(pd_page); + pde = pd_base + pasid_de_idx(gpasid); + + pt_page = alloc_page(GFP_ATOMIC | __GFP_ZERO); + if (!pt_page) { + kvm_err("%s: fail to allocate PASID table entry\n", __func__); + return ERR_PTR(-ENOMEM); + } + + pt_base = page_address(pt_page); + *pde = (u64)page_to_phys(pt_page) | PASID_DE_TAB_PRESENT; + return pt_base + pasid_te_idx(gpasid); +} + +static int vmx_set_pasid_trans(struct kvm_vmx *kvm_vmx, ioasid_t gpasid, + ioasid_t hpasid) +{ + int ret = 0; + u32 *pte; + + spin_lock(&kvm_vmx->pasid_lock); + + pte = vmx_find_pasid_table_entry(kvm_vmx, gpasid); + if (IS_ERR(pte)) { + ret = PTR_ERR(pte); + goto done; + } else if (!pte) { + pte = vmx_alloc_pasid_table_entry(kvm_vmx, gpasid); + if (IS_ERR(pte)) { + ret = PTR_ERR(pte); + goto done; + } + } + + WARN_ON(pasid_te_hpasid_valid(pte)); + + ioasid_get_locked(NULL, hpasid); + *pte = hpasid | PASID_TE_VALID; + +done: + spin_unlock(&kvm_vmx->pasid_lock); + return ret; +} + +static int vmx_clear_pasid_trans(struct kvm_vmx *kvm_vmx, ioasid_t gpasid, + ioasid_t hpasid) +{ + ioasid_t old_hpasid; + int ret = 0; + u32 *pte; + + spin_lock(&kvm_vmx->pasid_lock); + + pte = vmx_find_pasid_table_entry(kvm_vmx, gpasid); + if (IS_ERR(pte)) { + ret = PTR_ERR(pte); + goto done; + } else if (!pte || !pasid_te_hpasid_valid(pte)) { + WARN_ON(1); + goto done; + } + + old_hpasid = pasid_te_hpasid(pte); + WARN_ON(old_hpasid != hpasid); + + *pte = 0; + ioasid_put_locked(NULL, old_hpasid); + +done: + spin_unlock(&kvm_vmx->pasid_lock); + return ret; +} + +static int vmx_handle_ioasid_event(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct ioasid_nb_args *args = (struct ioasid_nb_args *)data; + struct kvm_vmx *kvm_vmx = container_of(nb, struct kvm_vmx, pasid_nb); + ioasid_t gpasid = args->spid; + ioasid_t hpasid = args->id; + int r = -1; + + kvm_debug("%s: event %lu hpasid %u gpasid %u\n", __func__, + event, hpasid, gpasid); + + if (hpasid > MAX_PASID || gpasid > MAX_PASID) + return NOTIFY_DONE; + + switch (event) { + case IOASID_NOTIFY_BIND: + r = vmx_set_pasid_trans(kvm_vmx, gpasid, hpasid); + break; + case IOASID_NOTIFY_UNBIND: + r = vmx_clear_pasid_trans(kvm_vmx, gpasid, hpasid); + break; + } + + return r ? NOTIFY_DONE : NOTIFY_OK; +} + +#define PASID_DIRS_ORDER 1 + +static void vmx_vcpu_pasid_trans_init(struct kvm_vcpu *vcpu) +{ + struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); + struct mm_struct *mm = get_task_mm(current); + int ret = 0; + + /* + * initialize a per-vm PASID translation table and start monitoring + * IOASID events. 
+ */ + spin_lock(&kvm_vmx->pasid_lock); + if (kvm_vmx->pasid_dirs) { + /* skip this as table has already been initialized */ + goto done; + } + + kvm_vmx->pasid_dirs = alloc_pages(GFP_ATOMIC | __GFP_ZERO, + PASID_DIRS_ORDER); + if (!kvm_vmx->pasid_dirs) { + kvm_err("%s: fail to alloc PASID Directory\n", __func__); + ret = -ENOMEM; + goto done; + } + + kvm_vmx->pasid_nb.notifier_call = vmx_handle_ioasid_event; + kvm_vmx->pasid_nb.priority = IOASID_PRIO_CPU; + kvm_vmx->mm = mm; + + ret = ioasid_register_notifier_mm(kvm_vmx->mm, &kvm_vmx->pasid_nb); + if (ret) { + __free_pages(kvm_vmx->pasid_dirs, PASID_DIRS_ORDER); + kvm_vmx->pasid_dirs = NULL; + } + +done: + spin_unlock(&kvm_vmx->pasid_lock); + mmput(mm); + + if (!ret) { + vmcs_write64(PASID_DIR0, page_to_phys(&kvm_vmx->pasid_dirs[0])); + vmcs_write64(PASID_DIR1, page_to_phys(&kvm_vmx->pasid_dirs[1])); + } +} + +static void vmx_vm_pasid_tables_free(struct page *pd_page) +{ + u64 *pde, *pd_base = page_address(pd_page); + u32 *pte, *pt_base; + ioasid_t hpasid; + + /* + * before free the PASID translation table, traverse all table entries + * to make sure that kvm doesn't hold reference count of any hpasid. + */ + for (pde = pd_base; pde < pd_base + PASID_DE_NUM; pde++) { + if (!pasid_de_table_present(pde)) + continue; + + pt_base = __va(pasid_de_table_ptr(pde)); + + for (pte = pt_base; pte < pt_base + PASID_TE_NUM; pte++) { + if (pasid_te_hpasid_valid(pte)) { + hpasid = pasid_te_hpasid(pte); + + /* + * decrease the reference of this hpasid, and + * remove it from the PASID translation table. + */ + *pte = 0; + ioasid_put(NULL, hpasid); + } + } + + *pde = 0; + free_page((unsigned long)pt_base); + } +} + +static void vmx_vm_pasid_trans_destroy(struct kvm_vmx *kvm_vmx) +{ + if (!kvm_vmx->pasid_dirs) + return; + + ioasid_unregister_notifier_mm(kvm_vmx->mm, &kvm_vmx->pasid_nb); + + vmx_vm_pasid_tables_free(&kvm_vmx->pasid_dirs[0]); + vmx_vm_pasid_tables_free(&kvm_vmx->pasid_dirs[1]); + __free_pages(kvm_vmx->pasid_dirs, PASID_DIRS_ORDER); +} + static int vmx_vm_init(struct kvm *kvm) { spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); + spin_lock_init(&to_kvm_vmx(kvm)->pasid_lock); if (!ple_gap) kvm->arch.pause_in_guest = true; @@ -6901,6 +7171,12 @@ static int vmx_vm_init(struct kvm *kvm) return 0; } +static void vmx_vm_destroy(struct kvm *kvm) +{ + if (cpu_has_vmx_pasid_trans()) + vmx_vm_pasid_trans_destroy(to_kvm_vmx(kvm)); +} + static int __init vmx_check_processor_compat(void) { struct vmcs_config vmcs_conf; @@ -7146,6 +7422,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (kvm_cpu_cap_has(X86_FEATURE_XFD)) vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, !guest_cpuid_has(vcpu, X86_FEATURE_XFD)); + + if (cpu_has_vmx_pasid_trans() && + guest_cpuid_has(vcpu, X86_FEATURE_ENQCMD)) + vmx_vcpu_pasid_trans_init(vcpu); } /* @@ -7185,6 +7465,9 @@ static __init void vmx_set_cpu_caps(void) /* CPUID 0x80000001 */ if (!cpu_has_vmx_rdtscp()) kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); + + if (!cpu_has_vmx_pasid_trans()) + kvm_cpu_cap_clear(X86_FEATURE_ENQCMD); } static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) @@ -7886,6 +8169,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .has_emulated_msr = vmx_has_emulated_msr, .vm_init = vmx_vm_init, + .vm_destroy = vmx_vm_destroy, .vm_alloc = vmx_vm_alloc, .vm_free = vmx_vm_free, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 8bf935e122d3..df25f361c020 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -322,6 +322,11 @@ struct 
kvm_vmx { enum ept_pointers_status ept_pointers_match; spinlock_t ept_pointer_lock; + + struct page *pasid_dirs; + spinlock_t pasid_lock; + struct notifier_block pasid_nb; + struct mm_struct *mm; }; bool nested_vmx_allowed(struct kvm_vcpu *vcpu); -- Gitee From c72c6bc305055b2c3c8940f3fec3a8e6305c9a94 Mon Sep 17 00:00:00 2001 From: Wu Hao Date: Sat, 19 Sep 2020 22:57:25 +0800 Subject: [PATCH 1823/2161] kvm: x86: rename KVM_REQ_MCLOCK_INPROGRESS to BLOCK_VMENTRY commit f609f8d7db3a1202eab18c5511dfadd313e76790 Intel-BKC. This patch renames KVM_REQ_MCLOCK_INPROGREE to a common request KVM_REQ_BLOCK_VMENTRY to block vm entry. The purpose of this change is adding a common VCPU rendezvous mechanism (no VCPU can enter non-root mode before the desired operation is completed). v3: add rendezvous description. fixed building errors on non-x86 platforms. Signed-off-by: Wu Hao Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 5 ++++- arch/x86/kvm/x86.c | 25 +++++++++++++++++-------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8ea16df9633b..fd83db27465d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -64,7 +64,7 @@ #define KVM_REQ_PMI KVM_ARCH_REQ(11) #define KVM_REQ_SMI KVM_ARCH_REQ(12) #define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13) -#define KVM_REQ_MCLOCK_INPROGRESS \ +#define KVM_REQ_BLOCK_VMENTRY \ KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_SCAN_IOAPIC \ KVM_ARCH_REQ_FLAGS(15, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) @@ -1643,6 +1643,9 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #endif } +void kvm_make_block_vmentry_request(struct kvm *kvm); +void kvm_clear_block_vmentry_request(struct kvm *kvm); + #define put_smstate(type, buf, offset, val) \ *(type *)((buf) + (offset) - 0x7e00) = val diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 24d1dab0a54a..b046780b300c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2328,10 +2328,21 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) #endif } -void kvm_make_mclock_inprogress_request(struct kvm *kvm) +void kvm_make_block_vmentry_request(struct kvm *kvm) { - kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); + kvm_make_all_cpus_request(kvm, KVM_REQ_BLOCK_VMENTRY); } +EXPORT_SYMBOL_GPL(kvm_make_block_vmentry_request); + +void kvm_clear_block_vmentry_request(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_clear_request(KVM_REQ_BLOCK_VMENTRY, vcpu); +} +EXPORT_SYMBOL_GPL(kvm_clear_block_vmentry_request); static void kvm_gen_update_masterclock(struct kvm *kvm) { @@ -2341,7 +2352,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) struct kvm_arch *ka = &kvm->arch; spin_lock(&ka->pvclock_gtod_sync_lock); - kvm_make_mclock_inprogress_request(kvm); + kvm_make_block_vmentry_request(kvm); /* no guest entries from this point */ pvclock_update_vm_gtod_copy(kvm); @@ -2349,8 +2360,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); /* guest entries allowed */ - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); + kvm_clear_block_vmentry_request(kvm); spin_unlock(&ka->pvclock_gtod_sync_lock); #endif @@ -7139,7 +7149,7 @@ static void kvm_hyperv_tsc_notifier(void) mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) - kvm_make_mclock_inprogress_request(kvm); + 
kvm_make_block_vmentry_request(kvm); hyperv_stop_tsc_emulation(); @@ -7158,8 +7168,7 @@ static void kvm_hyperv_tsc_notifier(void) kvm_for_each_vcpu(cpu, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - kvm_for_each_vcpu(cpu, vcpu, kvm) - kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); + kvm_clear_block_vmentry_request(kvm); spin_unlock(&ka->pvclock_gtod_sync_lock); } -- Gitee From 054cc4d1a3e16c0ee8f5b1f35f5aa2294ed967f9 Mon Sep 17 00:00:00 2001 From: Wu Hao Date: Sat, 19 Sep 2020 23:07:25 +0800 Subject: [PATCH 1824/2161] kvm: x86: enqcmd: update PASID translation table in VMX root mode only. commit 09ce2f5e8779d67c6e912e147da024d3e58a01b1 Intel-BKC. PASID Translation Table for ENQCMD/ENQCMDS is a per-VM table shared by all vcpus/VMCS, it's not part of the VMCS, but linked by a pointer in the VMCS. Per SDM 24.11.4, "Software should ensure that each such data structure is modified only when no logical processor with a current VMCS that references it is in VMX non-root mode.". This patch uses kvm_make_block_vmentry_request to make sure all vcpus are in VMX root mode when modify the PASID Translation Table. v2: switch to use common BLOCK_VMENTRY request. v3: rebased Signed-off-by: Wu Hao Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmx.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 939bdbc0d47c..fa986ac16670 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6985,6 +6985,7 @@ static int vmx_set_pasid_trans(struct kvm_vmx *kvm_vmx, ioasid_t gpasid, *pte = hpasid | PASID_TE_VALID; done: + kvm_clear_block_vmentry_request(&kvm_vmx->kvm); spin_unlock(&kvm_vmx->pasid_lock); return ret; } @@ -6998,6 +6999,8 @@ static int vmx_clear_pasid_trans(struct kvm_vmx *kvm_vmx, ioasid_t gpasid, spin_lock(&kvm_vmx->pasid_lock); + kvm_make_block_vmentry_request(&kvm_vmx->kvm); + pte = vmx_find_pasid_table_entry(kvm_vmx, gpasid); if (IS_ERR(pte)) { ret = PTR_ERR(pte); @@ -7010,9 +7013,12 @@ static int vmx_clear_pasid_trans(struct kvm_vmx *kvm_vmx, ioasid_t gpasid, old_hpasid = pasid_te_hpasid(pte); WARN_ON(old_hpasid != hpasid); + kvm_make_block_vmentry_request(&kvm_vmx->kvm); + *pte = 0; ioasid_put_locked(NULL, old_hpasid); + kvm_clear_block_vmentry_request(&kvm_vmx->kvm); done: spin_unlock(&kvm_vmx->pasid_lock); return ret; -- Gitee From 4d4d65bf93ce5b79bda9c90b024fb6407988f4cf Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Mon, 21 Oct 2019 16:30:20 -0700 Subject: [PATCH 1825/2161] KVM: x86: Introduce vcpu->arch.xsaves_enabled commit 7204160eb7809345d10c983d9d1dfbd98060a56d upstream. Cache whether XSAVES is enabled in the guest by adding xsaves_enabled to vcpu->arch. 
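For context: later patches in this series consult this cached flag on the hot VM-entry/VM-exit path instead of re-walking guest CPUID each time. A minimal sketch of such a consumer, taken from the IA32_XSS switching changes further down in this series:

    if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
        vcpu->arch.xsaves_enabled &&
        vcpu->arch.ia32_xss != host_xss)
            /* guest and host disagree on IA32_XSS, swap in the guest value */
            wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);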
Reviewed-by: Jim Mattson Signed-off-by: Aaron Lewis Change-Id: If4638e0901c28a4494dad2e103e2c075e8ab5d68 Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 3 +++ arch/x86/kvm/vmx/vmx.c | 5 +++++ 3 files changed, 9 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fd83db27465d..0341e3dc5afc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -572,6 +572,7 @@ struct kvm_vcpu_arch { u64 smi_count; bool tpr_access_reporting; bool xfd_no_write_intercept; + bool xsaves_enabled; u64 ia32_xss; u64 microcode_version; u64 arch_capabilities; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0eaed742c8da..3b850437b8ab 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5978,6 +5978,9 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + boot_cpu_has(X86_FEATURE_XSAVES); + /* Update nrips enabled cache */ svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index fa986ac16670..bbcd0f204c4e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4072,6 +4072,8 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); + vcpu->arch.xsaves_enabled = xsaves_enabled; + if (!xsaves_enabled) exec_control &= ~SECONDARY_EXEC_XSAVES; @@ -7404,6 +7406,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */ + vcpu->arch.xsaves_enabled = false; + if (cpu_has_secondary_exec_ctrls()) { vmx_compute_secondary_exec_control(vmx); vmcs_set_secondary_exec_control(vmx); -- Gitee From 55605782e95df01b2d125114218e350b716d252c Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Mon, 21 Oct 2019 16:30:21 -0700 Subject: [PATCH 1826/2161] KVM: VMX: Fix conditions for guest IA32_XSS support commit c034f2aa8622e1e436563eb34c0f78ba8aa32329 upstream. Volume 4 of the SDM says that IA32_XSS is supported if CPUID(EAX=0DH,ECX=1):EAX.XSS[bit 3] is set, so only the X86_FEATURE_XSAVES check is necessary (X86_FEATURE_XSAVES is the Linux name for CPUID(EAX=0DH,ECX=1):EAX.XSS[bit 3]). 
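To make the SDM reference concrete, the enumeration bit in question can be probed directly; a minimal userspace sketch (illustration only, not part of this patch):

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* CPUID.(EAX=0DH,ECX=1):EAX bit 3 enumerates XSAVES/XRSTORS and IA32_XSS */
            if (!__get_cpuid_count(0xD, 1, &eax, &ebx, &ecx, &edx))
                    return 1;
            printf("XSAVES/IA32_XSS: %s\n",
                   (eax & (1u << 3)) ? "supported" : "not supported");
            return 0;
    }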
Fixes: 4d763b168e9c5 ("KVM: VMX: check CPUID before allowing read/write of IA32_XSS") Reviewed-by: Jim Mattson Signed-off-by: Aaron Lewis Change-Id: I9059b9f2e3595e4b09a4cdcf14b933b22ebad419 Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmx.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bbcd0f204c4e..0e1955a28bd6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1836,10 +1836,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data); case MSR_IA32_XSS: - if (!vmx_xsaves_supported() || - (!msr_info->host_initiated && - !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)))) + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) return 1; msr_info->data = vcpu->arch.ia32_xss; break; @@ -2095,10 +2093,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_XSS: - if (!vmx_xsaves_supported() || - (!msr_info->host_initiated && - !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)))) + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) return 1; /* * The only supported bit as of Skylake is bit 8, but @@ -2107,11 +2103,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data != 0) return 1; vcpu->arch.ia32_xss = data; - if (vcpu->arch.ia32_xss != host_xss) - add_atomic_switch_msr(vmx, MSR_IA32_XSS, - vcpu->arch.ia32_xss, host_xss, false); - else - clear_atomic_switch_msr(vmx, MSR_IA32_XSS); + if (vcpu->arch.xsaves_enabled) { + if (vcpu->arch.ia32_xss != host_xss) + add_atomic_switch_msr(vmx, MSR_IA32_XSS, + vcpu->arch.ia32_xss, host_xss, false); + else + clear_atomic_switch_msr(vmx, MSR_IA32_XSS); + } break; case MSR_IA32_RTIT_CTL: if ((pt_mode != PT_MODE_HOST_GUEST) || -- Gitee From d2a7769632de60e8d17d9156768b03dc07026b8f Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Mon, 21 Oct 2019 16:30:22 -0700 Subject: [PATCH 1827/2161] KVM: x86: Remove unneeded kvm_vcpu variable, guest_xcr0_loaded commit 78958563d8023db0c6d03a2fe2a64d79b47b4349 upstream. The kvm_vcpu variable, guest_xcr0_loaded, is a waste of an 'int' and a conditional branch. VMX and SVM are the only users, and both unconditionally pair kvm_load_guest_xcr0() with kvm_put_guest_xcr0() making this check unnecessary. Without this variable, the predicates in kvm_load_guest_xcr0 and kvm_put_guest_xcr0 should match. 
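For illustration, the unconditional pairing that makes the flag redundant looks like this in the vendor vcpu_run() paths (compare the svm_vcpu_run()/vmx_vcpu_run() hunks later in this series):

    kvm_load_guest_xcr0(vcpu);      /* always, immediately before VM entry */
    /* ... guest executes ... */
    kvm_put_guest_xcr0(vcpu);       /* always, immediately after VM exit */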
Suggested-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Aaron Lewis Change-Id: I7b1eb9b62969d7bbb2850f27e42f863421641b23 Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 16 +++++----------- include/linux/kvm_host.h | 1 - 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b046780b300c..946418fb3e95 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -838,12 +838,8 @@ EXPORT_SYMBOL_GPL(kvm_lmsw); void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) { if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && - !vcpu->guest_xcr0_loaded) { - /* kvm_set_xcr() also depends on this */ - if (vcpu->arch.xcr0 != host_xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); - vcpu->guest_xcr0_loaded = 1; - } + vcpu->arch.xcr0 != host_xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); if (static_cpu_has(X86_FEATURE_PKU) && (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || @@ -863,11 +859,9 @@ void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) write_pkru(vcpu->arch.host_pkru); } - if (vcpu->guest_xcr0_loaded) { - if (vcpu->arch.xcr0 != host_xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); - vcpu->guest_xcr0_loaded = 0; - } + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && + vcpu->arch.xcr0 != host_xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); } EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3d0aee229958..79b4dbb3e7fc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -278,7 +278,6 @@ struct kvm_vcpu { struct mutex mutex; struct kvm_run *run; - int guest_xcr0_loaded; struct swait_queue_head wq; struct pid __rcu *pid; int sigset_active; -- Gitee From d5762c170069878a1aa61e65268eec16e3beed32 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Mon, 21 Oct 2019 16:30:23 -0700 Subject: [PATCH 1828/2161] KVM: SVM: Use wrmsr for switching between guest and host IA32_XSS on AMD commit 312a1c87798e6b43ff533393167b3cba33645ead upstream. When the guest can execute the XSAVES/XRSTORS instructions, set the hardware IA32_XSS MSR to guest/host values on VM-entry/VM-exit. Note that vcpu->arch.ia32_xss is currently guaranteed to be 0 on AMD, since there is no way to change it. 
Suggested-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Aaron Lewis Change-Id: Id51a782462086e6d7a3ab621838e200f1c005afd Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/svm.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3b850437b8ab..dfb37f24a026 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -115,6 +115,8 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); static bool erratum_383_found __read_mostly; +static u64 __read_mostly host_xss; + static const u32 host_save_user_msrs[] = { #ifdef CONFIG_X86_64 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, @@ -1483,6 +1485,9 @@ static __init int svm_hardware_setup(void) pr_info("Virtual GIF supported\n"); } + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + svm_set_cpu_caps(); return 0; @@ -5673,6 +5678,22 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); } +static void svm_load_guest_xss(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && + vcpu->arch.xsaves_enabled && + vcpu->arch.ia32_xss != host_xss) + wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); +} + +static void svm_load_host_xss(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && + vcpu->arch.xsaves_enabled && + vcpu->arch.ia32_xss != host_xss) + wrmsrl(MSR_IA32_XSS, host_xss); +} + static enum exit_fastpath_completion svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { return EXIT_FASTPATH_NONE; @@ -5718,6 +5739,7 @@ static enum exit_fastpath_completion svm_vcpu_run(struct kvm_vcpu *vcpu) clgi(); kvm_load_guest_xcr0(vcpu); + svm_load_guest_xss(vcpu); if (lapic_in_kernel(vcpu) && vcpu->arch.apic->lapic_timer.timer_advance_ns) @@ -5867,6 +5889,7 @@ static enum exit_fastpath_completion svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(&svm->vcpu); + svm_load_host_xss(vcpu); kvm_put_guest_xcr0(vcpu); stgi(); -- Gitee From 7c2f2b35446b1ebac0d8c74ce1f6534ae7fab566 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Mon, 21 Oct 2019 16:30:24 -0700 Subject: [PATCH 1829/2161] KVM: VMX: Use wrmsr for switching between guest and host IA32_XSS on Intel commit 9753d68865c5662eee94eb8808b5ad5eb766f5ea upstream. When the guest can execute the XSAVES/XRSTORS instructions, use wrmsr to set the hardware IA32_XSS MSR to guest/host values on VM-entry/VM-exit, rather than the MSR-load areas. By using the same approach as AMD, we will be able to use a common implementation for both (in the next patch). 
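A side-by-side sketch of the two mechanisms, pieced together from the hunks in this series (the MSR-load-area line is the code being removed):

    /* old: let the VMCS MSR-load/store area swap IA32_XSS on entry/exit */
    add_atomic_switch_msr(vmx, MSR_IA32_XSS, vcpu->arch.ia32_xss, host_xss, false);

    /* new: write the MSR from software around VM entry, as the SVM path does */
    if (vcpu->arch.xsaves_enabled && vcpu->arch.ia32_xss != host_xss)
            wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);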
Reviewed-by: Jim Mattson Signed-off-by: Aaron Lewis Change-Id: I9447d104b2615c04e39e4af0c911e1e7309bf464 Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmx.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0e1955a28bd6..55f8e28f33ff 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2103,13 +2103,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data != 0) return 1; vcpu->arch.ia32_xss = data; - if (vcpu->arch.xsaves_enabled) { - if (vcpu->arch.ia32_xss != host_xss) - add_atomic_switch_msr(vmx, MSR_IA32_XSS, - vcpu->arch.ia32_xss, host_xss, false); - else - clear_atomic_switch_msr(vmx, MSR_IA32_XSS); - } break; case MSR_IA32_RTIT_CTL: if ((pt_mode != PT_MODE_HOST_GUEST) || @@ -6571,6 +6564,22 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } +static void vmx_load_guest_xss(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && + vcpu->arch.xsaves_enabled && + vcpu->arch.ia32_xss != host_xss) + wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); +} + +static void vmx_load_host_xss(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && + vcpu->arch.xsaves_enabled && + vcpu->arch.ia32_xss != host_xss) + wrmsrl(MSR_IA32_XSS, host_xss); +} + bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu) @@ -6626,6 +6635,7 @@ static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_set_interrupt_shadow(vcpu, 0); kvm_load_guest_xcr0(vcpu); + vmx_load_guest_xss(vcpu); pt_guest_enter(vmx); @@ -6714,6 +6724,7 @@ static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu) pt_guest_exit(vmx); + vmx_load_host_xss(vcpu); kvm_put_guest_xcr0(vcpu); vmx->nested.nested_run_pending = 0; -- Gitee From 0f1c3f8200f1f465d442c5d0a08498bebaf48037 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Mon, 21 Oct 2019 16:30:25 -0700 Subject: [PATCH 1830/2161] KVM: x86: Move IA32_XSS-swapping on VM-entry/VM-exit to common x86 code commit 139a12cfe1a040fd881338a7cc042bd37159ea9a upstream. Hoist the vendor-specific code related to loading the hardware IA32_XSS MSR with guest/host values on VM-entry/VM-exit to common x86 code. 
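After the move, both vendor run loops share one symmetric pair; schematically:

    kvm_load_guest_xsave_state(vcpu);   /* XCR0, IA32_XSS, PKRU -> guest values */
    /* ... VM entry, guest runs, VM exit ... */
    kvm_load_host_xsave_state(vcpu);    /* restore the host's XCR0, IA32_XSS, PKRU */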
Reviewed-by: Jim Mattson Signed-off-by: Aaron Lewis Change-Id: Ic6e3430833955b98eb9b79ae6715cf2a3fdd6d82 Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/svm.c | 27 ++------------------------- arch/x86/kvm/vmx/vmx.c | 27 ++------------------------- arch/x86/kvm/x86.c | 38 ++++++++++++++++++++++++++++---------- arch/x86/kvm/x86.h | 4 ++-- 4 files changed, 34 insertions(+), 62 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index dfb37f24a026..09ad2d6e6990 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -115,8 +115,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); static bool erratum_383_found __read_mostly; -static u64 __read_mostly host_xss; - static const u32 host_save_user_msrs[] = { #ifdef CONFIG_X86_64 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, @@ -1485,9 +1483,6 @@ static __init int svm_hardware_setup(void) pr_info("Virtual GIF supported\n"); } - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - svm_set_cpu_caps(); return 0; @@ -5678,22 +5673,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); } -static void svm_load_guest_xss(struct kvm_vcpu *vcpu) -{ - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && - vcpu->arch.xsaves_enabled && - vcpu->arch.ia32_xss != host_xss) - wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); -} - -static void svm_load_host_xss(struct kvm_vcpu *vcpu) -{ - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && - vcpu->arch.xsaves_enabled && - vcpu->arch.ia32_xss != host_xss) - wrmsrl(MSR_IA32_XSS, host_xss); -} - static enum exit_fastpath_completion svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { return EXIT_FASTPATH_NONE; @@ -5738,8 +5717,7 @@ static enum exit_fastpath_completion svm_vcpu_run(struct kvm_vcpu *vcpu) svm->vmcb->save.cr2 = vcpu->arch.cr2; clgi(); - kvm_load_guest_xcr0(vcpu); - svm_load_guest_xss(vcpu); + kvm_load_guest_xsave_state(vcpu); if (lapic_in_kernel(vcpu) && vcpu->arch.apic->lapic_timer.timer_advance_ns) @@ -5889,8 +5867,7 @@ static enum exit_fastpath_completion svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(&svm->vcpu); - svm_load_host_xss(vcpu); - kvm_put_guest_xcr0(vcpu); + kvm_load_host_xsave_state(vcpu); stgi(); /* Any pending NMI will happen here */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 55f8e28f33ff..a04e9af11f80 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -109,8 +109,6 @@ module_param(enable_apicv, bool, S_IRUGO); static bool __read_mostly nested = 1; module_param(nested, bool, S_IRUGO); -static u64 __read_mostly host_xss; - bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); @@ -6564,22 +6562,6 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } -static void vmx_load_guest_xss(struct kvm_vcpu *vcpu) -{ - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && - vcpu->arch.xsaves_enabled && - vcpu->arch.ia32_xss != host_xss) - wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); -} - -static void vmx_load_host_xss(struct kvm_vcpu *vcpu) -{ - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && - vcpu->arch.xsaves_enabled && - vcpu->arch.ia32_xss != host_xss) - wrmsrl(MSR_IA32_XSS, host_xss); -} - bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu) @@ -6634,8 +6616,7 @@ static enum exit_fastpath_completion vmx_vcpu_run(struct 
kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); - kvm_load_guest_xcr0(vcpu); - vmx_load_guest_xss(vcpu); + kvm_load_guest_xsave_state(vcpu); pt_guest_enter(vmx); @@ -6724,8 +6705,7 @@ static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu) pt_guest_exit(vmx); - vmx_load_host_xss(vcpu); - kvm_put_guest_xcr0(vcpu); + kvm_load_host_xsave_state(vcpu); vmx->nested.nested_run_pending = 0; vmx->idt_vectoring_info = 0; @@ -8025,9 +8005,6 @@ static __init int hardware_setup(void) WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); } - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 946418fb3e95..7c8e20e012f1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -184,6 +184,8 @@ struct kvm_shared_msrs { static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; static struct kvm_shared_msrs __percpu *shared_msrs; +static u64 __read_mostly host_xss; + struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, { "pf_guest", VCPU_STAT(pf_guest) }, @@ -835,11 +837,17 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) } EXPORT_SYMBOL_GPL(kvm_lmsw); -void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) { - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && - vcpu->arch.xcr0 != host_xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { + + if (vcpu->arch.xcr0 != host_xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); + + if (vcpu->arch.xsaves_enabled && + vcpu->arch.ia32_xss != host_xss) + wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); + } if (static_cpu_has(X86_FEATURE_PKU) && (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || @@ -847,9 +855,9 @@ void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) vcpu->arch.pkru != vcpu->arch.host_pkru) write_pkru(vcpu->arch.pkru); } -EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0); +EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); -void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) +void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) { if (static_cpu_has(X86_FEATURE_PKU) && (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || @@ -859,11 +867,18 @@ void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) write_pkru(vcpu->arch.host_pkru); } - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && - vcpu->arch.xcr0 != host_xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { + + if (vcpu->arch.xcr0 != host_xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); + + if (vcpu->arch.xsaves_enabled && + vcpu->arch.ia32_xss != host_xss) + wrmsrl(MSR_IA32_XSS, host_xss); + } + } -EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0); +EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { @@ -9536,6 +9551,9 @@ int kvm_arch_hardware_setup(void) kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; } + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + kvm_init_msr_list(); return 0; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 7e782c49636a..f7d85cc5027c 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -363,8 +363,8 @@ static inline bool kvm_pat_valid(u64 data) return (data | ((data & 0x0202020202020202ull) << 1)) == 
data; } -void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu); -void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu); +void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); +void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); #endif -- Gitee From 2dcc2fef7b7de22bbca578935e3f3121f12861ba Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Mon, 21 Oct 2019 16:30:26 -0700 Subject: [PATCH 1831/2161] kvm: x86: Move IA32_XSS to kvm_{get,set}_msr_common commit 864e2ab2b46db1ac266c46a7c9cefe6cc893029d upstream. Hoist support for RDMSR/WRMSR of IA32_XSS from vmx into common code so that it can be used for svm as well. Right now, kvm only allows the guest IA32_XSS to be zero, so the guest's usage of XSAVES will be exactly the same as XSAVEC. Reviewed-by: Jim Mattson Signed-off-by: Aaron Lewis Change-Id: Ie4b0f777d71e428fbee6e82071ac2d7618e9bb40 Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmx.c | 18 ------------------ arch/x86/kvm/x86.c | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a04e9af11f80..1c3be320de03 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1833,12 +1833,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data); - case MSR_IA32_XSS: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) - return 1; - msr_info->data = vcpu->arch.ia32_xss; - break; case MSR_IA32_RTIT_CTL: if (pt_mode != PT_MODE_HOST_GUEST) return 1; @@ -2090,18 +2084,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!nested_vmx_allowed(vcpu)) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); - case MSR_IA32_XSS: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) - return 1; - /* - * The only supported bit as of Skylake is bit 8, but - * it is not supported on KVM. - */ - if (data != 0) - return 1; - vcpu->arch.ia32_xss = data; - break; case MSR_IA32_RTIT_CTL: if ((pt_mode != PT_MODE_HOST_GUEST) || vmx_rtit_ctl_check(vcpu, data) || diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7c8e20e012f1..c3c7225f5d06 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2887,6 +2887,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; + case MSR_IA32_XSS: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return 1; + /* + * We do support PT if kvm_x86_ops->pt_supported(), but we do + * not support IA32_XSS[bit 8]. Guests will have to use + * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT + * MSRs. + */ + if (data != 0) + return 1; + vcpu->arch.ia32_xss = data; + break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) return 1; @@ -3236,6 +3250,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: return get_msr_mce(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); + case MSR_IA32_XSS: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return 1; + msr_info->data = vcpu->arch.ia32_xss; + break; case MSR_K7_CLK_CTL: /* * Provide expected ramp-up count for K7. 
All other -- Gitee From 0b8d17b25e7656b5edb78a6b30ace1d03e0419de Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 4 Jul 2022 14:52:16 +0800 Subject: [PATCH 1832/2161] kvm: svm: Update svm_xsaves_supported commit 52297436199dde85be557ee6bc779f5b96082f74 upstream. AMD CPUs now support XSAVES in a limited fashion (they require IA32_XSS to be zero). AMD has no equivalent of Intel's "Enable XSAVES/XRSTORS" VM-execution control. Instead, XSAVES is always available to the guest when supported on the host. Reviewed-by: Jim Mattson Signed-off-by: Aaron Lewis Change-Id: I40dc2c682eb0d38c2208d95d5eb7bbb6c47f6317 Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 09ad2d6e6990..f8c551ab49f5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6043,7 +6043,7 @@ static bool svm_mpx_supported(void) static bool svm_xsaves_supported(void) { - return false; + return boot_cpu_has(X86_FEATURE_XSAVES); } static bool svm_pt_supported(void) -- Gitee From 6162b4e6a2f61dbc786d9943889bc0f4d4f8032c Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 4 Jul 2022 16:26:50 +0800 Subject: [PATCH 1833/2161] KVM: x86: Move XSAVES CPUID adjust to VMX's KVM cpu cap update commit b3d895d5c4154156894fd1df2158d82f94fb5527 upstream. Move the clearing of the XSAVES CPUID bit into VMX, which has a separate VMCS control to enable XSAVES in non-root, to eliminate the last ugly renmant of the undesirable "unsigned f_* = *_supported ? F(*) : 0" pattern in the common CPUID handling code. Drop ->xsaves_supported(), CPUID adjustment was the only user. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/cpuid.c | 4 ---- arch/x86/kvm/svm.c | 6 ------ arch/x86/kvm/vmx/vmx.c | 5 ++++- 4 files changed, 4 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0341e3dc5afc..85b65e8b0073 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1132,7 +1132,6 @@ struct kvm_x86_ops { enum x86_intercept_stage stage); void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); bool (*mpx_supported)(void); - bool (*xsaves_supported)(void); bool (*pt_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 079df2910971..5a2b7b46f293 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -691,10 +691,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) goto out; cpuid_entry_override(entry, CPUID_D_1_EAX); - - if (!kvm_x86_ops->xsaves_supported()) - cpuid_entry_clear(entry, X86_FEATURE_XSAVES); - if (entry->eax & (F(XSAVES)|F(XSAVEC))) entry->ebx = xstate_required_size(supported, true); else diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f8c551ab49f5..7c7f9bc538bc 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6041,11 +6041,6 @@ static bool svm_mpx_supported(void) return false; } -static bool svm_xsaves_supported(void) -{ - return boot_cpu_has(X86_FEATURE_XSAVES); -} - static bool svm_pt_supported(void) { return false; @@ -7357,7 +7352,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .rdtscp_supported = svm_rdtscp_supported, .mpx_supported = svm_mpx_supported, - 
.xsaves_supported = svm_xsaves_supported, .pt_supported = svm_pt_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1c3be320de03..44f7e4c37d42 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7444,6 +7444,10 @@ static __init void vmx_set_cpu_caps(void) if (vmx_umip_emulated()) kvm_cpu_cap_set(X86_FEATURE_UMIP); + /* CPUID 0xD.1 */ + if (!vmx_xsaves_supported()) + kvm_cpu_cap_clear(X86_FEATURE_XSAVES); + /* CPUID 0x80000001 */ if (!cpu_has_vmx_rdtscp()) kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); @@ -8246,7 +8250,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, .mpx_supported = vmx_mpx_supported, - .xsaves_supported = vmx_xsaves_supported, .pt_supported = vmx_pt_supported, .request_immediate_exit = vmx_request_immediate_exit, -- Gitee From 90ee5c4a8a4ba340011da42d54b1080e86a27694 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 4 Jul 2022 16:46:02 +0800 Subject: [PATCH 1834/2161] KVM: x86: Add a helper to check kernel support when setting cpu cap commit 8721f5b061eb18c4bb3b77be3ec1c2811ca574ba upstream. Add a helper, kvm_cpu_cap_check_and_set(), to query boot_cpu_has() as part of setting a KVM cpu capability. VMX in particular has a number of features that are dependent on both a VMCS capability and kernel support. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.h | 5 +++++ arch/x86/kvm/svm.c | 3 +-- arch/x86/kvm/vmx/vmx.c | 18 ++++++++---------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a6f51510a30b..d2bd1b4e5d51 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -345,4 +345,9 @@ static __always_inline bool kvm_cpu_cap_has(unsigned int x86_feature) return !!kvm_cpu_cap_get(x86_feature); } +static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature) +{ + if (boot_cpu_has(x86_feature)) + kvm_cpu_cap_set(x86_feature); +} #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7c7f9bc538bc..b88df7784570 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1366,8 +1366,7 @@ static __init void svm_set_cpu_caps(void) /* CPUID 0x8000000A */ /* Support next_rip if host supports it */ - if (boot_cpu_has(X86_FEATURE_NRIPS)) - kvm_cpu_cap_set(X86_FEATURE_NRIPS); + kvm_cpu_cap_check_and_set(X86_FEATURE_NRIPS); if (npt_enabled) kvm_cpu_cap_set(X86_FEATURE_NPT); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 44f7e4c37d42..4a4e7049633e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7428,18 +7428,16 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_VMX); /* CPUID 0x7 */ - if (boot_cpu_has(X86_FEATURE_MPX) && kvm_mpx_supported()) - kvm_cpu_cap_set(X86_FEATURE_MPX); - if (boot_cpu_has(X86_FEATURE_INVPCID) && cpu_has_vmx_invpcid()) - kvm_cpu_cap_set(X86_FEATURE_INVPCID); - if (boot_cpu_has(X86_FEATURE_INTEL_PT) && - vmx_pt_supported()) - kvm_cpu_cap_set(X86_FEATURE_INTEL_PT); + if (kvm_mpx_supported()) + kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); + if (cpu_has_vmx_invpcid()) + kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID); + if (vmx_pt_supported()) + kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); /* PKU is not yet implemented for shadow paging. 
*/ - if (enable_ept && boot_cpu_has(X86_FEATURE_PKU) && - boot_cpu_has(X86_FEATURE_OSPKE)) - kvm_cpu_cap_set(X86_FEATURE_PKU); + if (enable_ept && boot_cpu_has(X86_FEATURE_OSPKE)) + kvm_cpu_cap_check_and_set(X86_FEATURE_PKU); if (vmx_umip_emulated()) kvm_cpu_cap_set(X86_FEATURE_UMIP); -- Gitee From 57185a158344c427d4a9001adfbf4587a6a8f7a5 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 4 Jul 2022 17:31:20 +0800 Subject: [PATCH 1835/2161] KVM: CPUID: add support for supervisor states commit 408e9a318f57ba8be82ba01e98cc271b97392187 upstream. Current CPUID 0xd enumeration code does not support supervisor states, because KVM only supports setting IA32_XSS to zero. Change it instead to use a new variable supported_xss, to be set from the hardware_setup callback which is in charge of CPU capabilities. Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 32 +++++++++++++++++++------------- arch/x86/kvm/svm.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 1 + arch/x86/kvm/x86.c | 13 +++++++++---- arch/x86/kvm/x86.h | 2 ++ 5 files changed, 33 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 5a2b7b46f293..eeba492fad5e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -692,15 +692,22 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) cpuid_entry_override(entry, CPUID_D_1_EAX); if (entry->eax & (F(XSAVES)|F(XSAVEC))) - entry->ebx = xstate_required_size(supported, true); - else + entry->ebx = xstate_required_size(supported | supported_xss, + true); + else { + WARN_ON_ONCE(supported_xss != 0); entry->ebx = 0; - /* Saving XSS controlled state via XSAVES isn't supported. */ - entry->ecx = 0; - entry->edx = 0; + } + entry->ecx &= supported_xss; + entry->edx &= supported_xss >> 32; for (idx = 2; idx < 64; ++idx) { - if (!(supported & BIT_ULL(idx))) + bool s_state; + if (supported & BIT_ULL(idx)) + s_state = false; + else if (supported_xss & BIT_ULL(idx)) + s_state = true; + else continue; entry = do_host_cpuid(array, function, idx); @@ -708,18 +715,17 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) goto out; /* - * The @supported check above should have filtered out - * invalid sub-leafs as well as sub-leafs managed by - * IA32_XSS MSR. Only XCR0-managed sub-leafs should + * The supported check above should have filtered out + * invalid sub-leafs. Only valid sub-leafs should * reach this point, and they should have a non-zero - * save state size. + * save state size. Furthermore, check whether the + * processor agrees with supported_xcr0/supported_xss + * on whether this is an XCR0- or IA32_XSS-managed area. 
*/ - if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 1))) { + if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) { --array->nent; continue; } - - entry->ecx = 0; entry->edx = 0; } break; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b88df7784570..663f510a5af0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1360,6 +1360,8 @@ static __init void svm_set_cpu_caps(void) { kvm_set_cpu_caps(); + supported_xss = 0; + /* CPUID 0x80000001 */ if (nested) kvm_cpu_cap_set(X86_FEATURE_SVM); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4a4e7049633e..c7bb66d42b1d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7443,6 +7443,7 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_UMIP); /* CPUID 0xD.1 */ + supported_xss = 0; if (!vmx_xsaves_supported()) kvm_cpu_cap_clear(X86_FEATURE_XSAVES); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c3c7225f5d06..5b7fa70e0dbb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -185,6 +185,8 @@ static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; static struct kvm_shared_msrs __percpu *shared_msrs; static u64 __read_mostly host_xss; +u64 __read_mostly supported_xss; +EXPORT_SYMBOL_GPL(supported_xss); struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, @@ -2897,7 +2899,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT * MSRs. */ - if (data != 0) + if (data & ~supported_xss) return 1; vcpu->arch.ia32_xss = data; break; @@ -9551,10 +9553,16 @@ int kvm_arch_hardware_setup(void) { int r; + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + r = kvm_x86_ops->hardware_setup(); if (r != 0) return r; + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) + supported_xss = 0; + cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data); if (kvm_has_tsc_control) { @@ -9571,9 +9579,6 @@ int kvm_arch_hardware_setup(void) kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; } - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - kvm_init_msr_list(); return 0; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f7d85cc5027c..860c6d0bb559 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -294,6 +294,8 @@ extern u64 host_xcr0; extern u64 kvm_supported_xcr0(void); +extern u64 supported_xss; + extern unsigned int min_timer_period_us; extern bool enable_vmware_backdoor; -- Gitee From d381a748603016d00c00270e34438755e5673f6a Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 4 Jul 2022 17:43:49 +0800 Subject: [PATCH 1836/2161] KVM: SVM: Provide support for SEV-ES vCPU loading commit 861377730aa9db4cbaa0f3bd3f4d295c152732c4 upstream. Just merge changes for host_xss. 
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 3 ++- arch/x86/kvm/x86.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5b7fa70e0dbb..a1692bb9c47d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -184,7 +184,8 @@ struct kvm_shared_msrs { static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; static struct kvm_shared_msrs __percpu *shared_msrs; -static u64 __read_mostly host_xss; +u64 __read_mostly host_xss; +EXPORT_SYMBOL_GPL(host_xss); u64 __read_mostly supported_xss; EXPORT_SYMBOL_GPL(supported_xss); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 860c6d0bb559..94d5164b95a5 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -294,6 +294,7 @@ extern u64 host_xcr0; extern u64 kvm_supported_xcr0(void); +extern u64 host_xss; extern u64 supported_xss; extern unsigned int min_timer_period_us; -- Gitee From 6fd4cb70a4287271e36ee57ce6550251a49baace Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 5 Jul 2022 10:59:42 +0800 Subject: [PATCH 1837/2161] KVM: x86: Report XSS as an MSR to be saved if there are supported features commit dc56d6e175100467be475dc9adc6e1bd5b59ac8f Intel-BKC. Add MSR_IA32_XSS to the list of MSRs reported to userspace if supported_xss is non-zero, i.e. KVM supports at least one XSS based feature. Signed-off-by: Sean Christopherson Co-developed-by: Yang Weijiang Signed-off-by: Yang Weijiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1692bb9c47d..6f1bbde6d211 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1270,6 +1270,8 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + + MSR_IA32_XSS, MSR_IA32_XFD, MSR_IA32_XFD_ERR, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, @@ -5433,6 +5435,10 @@ static void kvm_init_msr_list(void) continue; break; } + case MSR_IA32_XSS: + if (!supported_xss) + continue; + break; case MSR_IA32_XFD: case MSR_IA32_XFD_ERR: if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) @@ -9563,6 +9569,8 @@ int kvm_arch_hardware_setup(void) if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) supported_xss = 0; + else + supported_xss &= host_xss; cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data); -- Gitee From 315747c9688e99b793ae7dfa6bb44a40ef77e223 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 5 Jul 2022 14:36:10 +0800 Subject: [PATCH 1838/2161] kvm: x86: Cleanup vcpu->arch.guest_xstate_size commit a71936ab46f1da1539d97a98dfb2f94ee383d687 upstream. vcpu->arch.guest_xstate_size lost its only user since commit df1daba7d1cb ("KVM: x86: support XSAVES usage in the host"), so clean it up. 
Signed-off-by: Xiaoyao Li Message-Id: <20200429154312.1411-1-xiaoyao.li@intel.com> Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/cpuid.c | 4 +--- arch/x86/kvm/x86.c | 2 -- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 85b65e8b0073..64f6dfbfd2da 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -627,7 +627,6 @@ struct kvm_vcpu_arch { u64 xcr0; u64 guest_supported_xcr0; - u32 guest_xstate_size; struct kvm_pio_request pio; void *pio_data; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index eeba492fad5e..aa0c5048861a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -108,13 +108,11 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 0xD, 0); if (!best) { vcpu->arch.guest_supported_xcr0 = 0; - vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; } else { vcpu->arch.guest_supported_xcr0 = (best->eax | ((u64)best->edx << 32)) & kvm_supported_xcr0(); - vcpu->arch.guest_xstate_size = best->ebx = - xstate_required_size(vcpu->arch.xcr0, false); + best->ebx = xstate_required_size(vcpu->arch.xcr0, false); } best = kvm_find_cpuid_entry(vcpu, 0xD, 1); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f1bbde6d211..6236df855b9d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9664,8 +9664,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) fx_init(vcpu); - vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; - vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; -- Gitee From 78c51f5fcf5bd577ec20015281fd5f25d5266590 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Mar 2020 18:18:32 -0700 Subject: [PATCH 1839/2161] KVM: x86: Add guest_supported_xss placholder commit 79fd4ff08bb7e2695826ec8c037f85c85e97aa48 Intel-BKC. Add a per-vcpu placeholder for the support XSS of the guest so that the TDX configuration code doesn't need to hack in manual computation of the supported XSS. KVM XSS enabling is currently being upstreamed, i.e. guest_supported_xss will no longer be a placeholder by the time TDX is ready for upstreaming (hopefully). Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 64f6dfbfd2da..0c1629c45dac 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -627,6 +627,7 @@ struct kvm_vcpu_arch { u64 xcr0; u64 guest_supported_xcr0; + u64 guest_supported_xss; struct kvm_pio_request pio; void *pio_data; -- Gitee From 307eb8d99c4fc59df05f51ccf3218348c4f1bba3 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Thu, 16 Jul 2020 11:16:21 +0800 Subject: [PATCH 1840/2161] KVM: x86: Refresh CPUID on writes to MSR_IA32_XSS commit 5d9a42a43d509b7f35561b7c94a0c4dcc8e75576 Intel-BKC. Updated CPUID.0xD.0x1, which reports the current required storage size of all features enabled via XCR0 | XSS, when the guest's XSS is modified. Note, KVM does not yet support any XSS based features, i.e. supported_xss is guaranteed to be zero at this time. 
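This mirrors real hardware, where CPUID.(EAX=0DH,ECX=1):EBX reports the XSAVES save-area size for whatever is currently enabled in XCR0 | IA32_XSS. A minimal userspace sketch of the value a guest would read back (illustration only, not part of this patch):

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* CPUID.(EAX=0xD,ECX=1):EBX = bytes XSAVES needs for the state
             * components currently enabled in XCR0 | IA32_XSS. */
            if (!__get_cpuid_count(0xD, 1, &eax, &ebx, &ecx, &edx))
                    return 1;
            printf("XSAVES area size for current XCR0|XSS: %u bytes\n", ebx);
            return 0;
    }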
Co-developed-by: Zhang Yi Z Signed-off-by: Zhang Yi Z Signed-off-by: Yang Weijiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 21 ++++++++++++++++++--- arch/x86/kvm/x86.c | 7 +++++-- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index aa0c5048861a..6c64e9a60f32 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -116,9 +116,24 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) } best = kvm_find_cpuid_entry(vcpu, 0xD, 1); - if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || - cpuid_entry_has(best, X86_FEATURE_XSAVEC))) - best->ebx = xstate_required_size(vcpu->arch.xcr0, true); + if (best) { + if (cpuid_entry_has(best, X86_FEATURE_XSAVES) || + cpuid_entry_has(best, X86_FEATURE_XSAVEC)) { + u64 xstate = vcpu->arch.xcr0 | vcpu->arch.ia32_xss; + + best->ebx = xstate_required_size(xstate, true); + } + + if (!cpuid_entry_has(best, X86_FEATURE_XSAVES)) { + best->ecx = 0; + best->edx = 0; + } + vcpu->arch.guest_supported_xss = + (((u64)best->edx << 32) | best->ecx) & supported_xss; + + } else { + vcpu->arch.guest_supported_xss = 0; + } /* * The existing code assumes virtual address is 48-bit or 57-bit in the diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6236df855b9d..c4ed0e2816e4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2902,9 +2902,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT * MSRs. */ - if (data & ~supported_xss) + if (data & ~vcpu->arch.guest_supported_xss) return 1; - vcpu->arch.ia32_xss = data; + if (vcpu->arch.ia32_xss != data) { + vcpu->arch.ia32_xss = data; + kvm_update_cpuid(vcpu); + } break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) -- Gitee From c6e1d55596ff3fb2cde5d58f4137d2ccf793ed0f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 16 Jul 2020 11:16:22 +0800 Subject: [PATCH 1841/2161] KVM: x86: Load guest fpu state when accessing MSRs managed by XSAVES commit a383630686ccc2d98c1914f00ae356b626d483bb Intel-BKC. If new feature MSRs are supported in XSS and passed through to the guest they are saved and restored by XSAVES/XRSTORS, i.e. in the guest's FPU state. Load the guest's FPU state if userspace is accessing MSRs whose values are managed by XSAVES so that the MSR helper, e.g. vmx_{get,set}_xsave_msr(), can simply do {RD,WR}MSR to access the guest's value. Because is also used for the KVM_GET_MSRS device ioctl(), explicitly check that @vcpu is non-null before attempting to load guest state. The XSS supporting MSRs cannot be retrieved via the device ioctl() without loading guest FPU state (which doesn't exist). MSRs that are switched through XSAVES are especially annoying due to the possibility of the kernel's FPU being used in IRQ context. Disable IRQs and ensure the guest's FPU state is loaded when accessing such MSRs. Note that guest_cpuid_has() is not queried as host userspace is allowed to access MSRs that have not been exposed to the guest, e.g. it might do KVM_SET_MSRS prior to KVM_SET_CPUID2. 
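These accesses arrive via the KVM_GET_MSRS/KVM_SET_MSRS vCPU ioctls; a minimal userspace sketch of that path, assuming vcpu_fd is an already-created KVM vCPU descriptor (the nested-struct buffer is a common userspace idiom, shown for illustration only):

    #include <linux/kvm.h>
    #include <string.h>
    #include <sys/ioctl.h>

    /* Read one MSR from a vCPU.  For an XSAVES-managed MSR, KVM first loads
     * the guest FPU state so a plain RDMSR in the kernel sees the guest value. */
    static int read_vcpu_msr(int vcpu_fd, unsigned int index, unsigned long long *val)
    {
            struct {
                    struct kvm_msrs hdr;
                    struct kvm_msr_entry entry;
            } req;

            memset(&req, 0, sizeof(req));
            req.hdr.nmsrs = 1;
            req.entry.index = index;
            if (ioctl(vcpu_fd, KVM_GET_MSRS, &req) != 1)
                    return -1;
            *val = req.entry.data;
            return 0;
    }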
Signed-off-by: Sean Christopherson Co-developed-by: Yang Weijiang Signed-off-by: Yang Weijiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmx.c | 18 ++++++++++++++++++ arch/x86/kvm/x86.c | 30 +++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c7bb66d42b1d..5a1f6a7796c5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1750,6 +1750,24 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, return !(val & ~valid_bits); } +static void __maybe_unused vmx_get_xsave_msr(struct msr_data *msr_info) +{ + local_irq_disable(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); + rdmsrl(msr_info->index, msr_info->data); + local_irq_enable(); +} + +static void __maybe_unused vmx_set_xsave_msr(struct msr_data *msr_info) +{ + local_irq_disable(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); + wrmsrl(msr_info->index, msr_info->data); + local_irq_enable(); +} + static int vmx_get_msr_feature(struct kvm_msr_entry *msr) { switch (msr->index) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c4ed0e2816e4..fb9e66dc08e3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -113,6 +113,9 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); +static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); +static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); + struct kvm_x86_ops *kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -3357,6 +3360,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_GPL(kvm_get_msr_common); +/* + * If new features passthrough XSS managed MSRs to guest, it's required to + * add separate checks here so as to load feature dependent guest MSRs before + * access them. + */ +static bool is_xsaves_msr(u32 index) +{ + + /* + * Add the check for your feature like this: + * return index == MSR_IA32_U_CET; + */ + + return false; +} + /* * Read or write a bunch of msrs. All parameters are kernel addresses. * @@ -3367,11 +3386,20 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { + bool fpu_loaded = false; int i; - for (i = 0; i < msrs->nmsrs; ++i) + for (i = 0; i < msrs->nmsrs; ++i) { + if (vcpu && !fpu_loaded && supported_xss && + is_xsaves_msr(entries[i].index)) { + kvm_load_guest_fpu(vcpu); + fpu_loaded = true; + } if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; + } + if (fpu_loaded) + kvm_put_guest_fpu(vcpu); return i; } -- Gitee From 3d6871cfd5bce7c203b28ca514a62dd79882bfc1 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 5 Jul 2022 17:09:30 +0800 Subject: [PATCH 1842/2161] kvm: x86: enqcmd: pass through MSR_IA32_PASID commit f67ec7d480cbd32166b8fde3d7cd987fea3b7f84 Intel-BKC. Each execution of ENQCMD results in an enqueue store with a PASID value. ENQCMD obtains the PASID value from MSR_IA32_PASID. When PASID translation is enabled, this PASID value from MSR_IA32_PASID is interpreted as a guest PASID, and the guest PASID is converted to a host PASID, and the enqueue store uses the host PASID for the command data that it writes. This enables direct pass through MSR_IA32_PASID to guest, so that user can program guest PASID values into MSR_IA32_PASID directly. 
This patch enables the virtualization support for XSAVE extension of PASID supervisor state component, so guest can use this extension to save/restore PASID state (MSR_IA32_PASID) per context switch. v3: merge XSAVES PASID state component patch rebase on top of msr filter changes Signed-off-by: Wu Hao Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmx.c | 4 ++++ arch/x86/kvm/x86.c | 3 +++ arch/x86/kvm/x86.h | 5 +++++ 3 files changed, 12 insertions(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5a1f6a7796c5..ca620792cdf9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6829,6 +6829,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); } + if (cpu_has_vmx_pasid_trans()) + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_IA32_PASID, MSR_TYPE_RW); vmx->msr_bitmap_mode = 0; vmx->loaded_vmcs = &vmx->vmcs01; @@ -7471,6 +7473,8 @@ static __init void vmx_set_cpu_caps(void) if (!cpu_has_vmx_pasid_trans()) kvm_cpu_cap_clear(X86_FEATURE_ENQCMD); + else if (kvm_cpu_cap_has(X86_FEATURE_ENQCMD)) + supported_xss |= XFEATURE_MASK_PASID; } static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fb9e66dc08e3..b67229ea8ba4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9603,6 +9603,9 @@ int kvm_arch_hardware_setup(void) else supported_xss &= host_xss; + if (!kvm_pasid_supported()) + kvm_cpu_cap_clear(X86_FEATURE_ENQCMD); + cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data); if (kvm_has_tsc_control) { diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 94d5164b95a5..29bbb0aaf125 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -297,6 +297,11 @@ extern u64 kvm_supported_xcr0(void); extern u64 host_xss; extern u64 supported_xss; +static inline bool kvm_pasid_supported(void) +{ + return supported_xss & XFEATURE_MASK_PASID; +} + extern unsigned int min_timer_period_us; extern bool enable_vmware_backdoor; -- Gitee From 8ce2f79a259dc6f5c0cec3534f969f42f6bb0c4d Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 29 Nov 2021 17:23:10 -0700 Subject: [PATCH 1843/2161] vfio: vfio fixups for idxd mdev commit b7ee3b100a8681e753ed8b3c599e6f6fe8980f49 Intel-BKC. 
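This change reworks vfio_pci_dma_fault_init() into an exported helper with a register_fault flag: the core vfio-pci enable path continues to pass true, while a vendor driver that reports faults through its own handler can pass false and still have the DMA fault region set up; vfio_pci_set_ext_irq_trigger() is exported for the same reason. A rough sketch of a caller of the new flag (the function name here is illustrative; the idxd mdev code later in this series uses the helper the same way):

/* Sketch only: a vendor mdev-style caller of the helper exported below.
 * The function name is illustrative, not from the patch.
 */
#include <linux/vfio_pci_core.h>

static int example_mdev_fault_setup(struct vfio_pci_core_device *core_vdev)
{
	int rc;

	/*
	 * Create the DMA fault region but skip
	 * iommu_register_device_fault_handler(); the caller injects
	 * faults into the region through its own path.
	 */
	rc = vfio_pci_dma_fault_init(core_vdev, false);
	if (rc)
		return rc;

	return 0;
}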
Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_core.c | 17 ++++++++++------- drivers/vfio/pci/vfio_pci_intrs.c | 3 ++- include/linux/vfio_pci_core.h | 4 ++++ 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 2b9d369ca97c..5c8f94b477fa 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -373,7 +373,7 @@ int vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, void *data) #define DMA_FAULT_RING_LENGTH 512 -static int vfio_pci_dma_fault_init(struct vfio_pci_core_device *vdev) +int vfio_pci_dma_fault_init(struct vfio_pci_core_device *vdev, bool register_fault) { struct vfio_region_dma_fault *header; size_t size; @@ -407,11 +407,13 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_core_device *vdev) header->nb_entries = DMA_FAULT_RING_LENGTH; header->offset = PAGE_SIZE; - ret = iommu_register_device_fault_handler(&vdev->pdev->dev, - vfio_pci_iommu_dev_fault_handler, - vdev); - if (ret) /* the dma fault region is freed in vfio_pci_disable() */ - goto out; + if (register_fault) { + ret = iommu_register_device_fault_handler(&vdev->pdev->dev, + vfio_pci_iommu_dev_fault_handler, + vdev); + if (ret) /* the dma fault region is freed in vfio_pci_disable() */ + goto out; + } ret = vfio_pci_register_irq(vdev, VFIO_IRQ_TYPE_NESTED, VFIO_IRQ_SUBTYPE_DMA_FAULT, @@ -424,6 +426,7 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_core_device *vdev) kfree(vdev->fault_pages); return ret; } +EXPORT_SYMBOL_GPL(vfio_pci_dma_fault_init); int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) { @@ -494,7 +497,7 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) vdev->has_vga = true; - ret = vfio_pci_dma_fault_init(vdev); + ret = vfio_pci_dma_fault_init(vdev, true); if (ret) { pci_disable_device(pdev); return ret; diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 05e021096f22..b7bb1ae21b72 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -636,7 +636,7 @@ static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev, count, flags, data); } -static int vfio_pci_set_ext_irq_trigger(struct vfio_pci_core_device *vdev, +int vfio_pci_set_ext_irq_trigger(struct vfio_pci_core_device *vdev, unsigned int index, unsigned int start, unsigned int count, uint32_t flags, void *data) @@ -653,6 +653,7 @@ static int vfio_pci_set_ext_irq_trigger(struct vfio_pci_core_device *vdev, return vfio_pci_set_ctx_trigger_single(&vdev->ext_irqs[i].trigger, count, flags, data); } +EXPORT_SYMBOL_GPL(vfio_pci_set_ext_irq_trigger); int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 76a874c6e886..10182d1ae9ac 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -256,4 +256,8 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; } +int vfio_pci_dma_fault_init(struct vfio_pci_core_device *vdev, bool register_fault); +int vfio_pci_set_ext_irq_trigger(struct vfio_pci_core_device *vdev, + unsigned int index, unsigned int start, + unsigned int count, uint32_t flags, void *data); #endif /* VFIO_PCI_CORE_H */ -- Gitee From bc8871c3095d81808c2303ca07794f29bbc377c9 Mon Sep 17 
00:00:00 2001 From: Dave Jiang Date: Mon, 29 Nov 2021 18:08:51 -0700 Subject: [PATCH 1844/2161] vfio: vfio migration stuff commit b5dc20a2cc2764359b9b937d4e66e898ff4b6a96 Intel-BKC. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/pci/vfio_pci_core.c | 108 +++++++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_rdwr.c | 55 ++++++++++++++++ include/linux/vfio_pci_core.h | 12 ++++ 3 files changed, 175 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 5c8f94b477fa..3cdcff7b2aa7 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -319,6 +319,114 @@ static const struct vfio_pci_regops vfio_pci_dma_fault_regops = { .add_capability = vfio_pci_dma_fault_add_capability, }; +static void vfio_pci_mregion_release(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region) +{ +} + +static int vfio_pci_mregion_mmap(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region, + struct vm_area_struct *vma) +{ + u64 phys_len, req_len, pgoff, req_start; + unsigned long long addr; + unsigned int ret; + + phys_len = region->size; + + req_len = vma->vm_end - vma->vm_start; + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + req_start = pgoff << PAGE_SHIFT; + + /* only the second page of the producer fault region is mmappable */ + if (req_start < PAGE_SIZE) + return -EINVAL; + + if (req_start + req_len > phys_len) + return -EINVAL; + + addr = virt_to_phys(vdev->mig_pages); + vma->vm_private_data = vdev; + vma->vm_pgoff = (addr >> PAGE_SHIFT) + pgoff; + + printk("%s, mmap addr %llx\n", __func__, addr); + ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + req_len, vma->vm_page_prot); + return ret; +} + +static int vfio_pci_mregion_add_capability(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region, + struct vfio_info_cap *caps) +{ + struct vfio_region_info_cap_sparse_mmap *sparse = NULL; + size_t size = sizeof(*sparse) + sizeof(*sparse->areas); + int ret; + + /* TODO: Look into re-enabling it */ + return 0; + + sparse = kzalloc(size, GFP_KERNEL); + if (!sparse) + return -ENOMEM; + + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->nr_areas = 1; + /* first page of the region is for vfio_device_migration_info */ + sparse->areas[0].offset = PAGE_SIZE; + sparse->areas[0].size = region->size - PAGE_SIZE; + + ret = vfio_info_add_capability(caps, &sparse->header, size); + if (ret) + kfree(sparse); + + return ret; +} + +static const struct vfio_pci_regops vfio_pci_mig_regops = { + .rw = vfio_pci_mregion_rw, + .release = vfio_pci_mregion_release, + .mmap = vfio_pci_mregion_mmap, + .add_capability = vfio_pci_mregion_add_capability, +}; + +int vfio_pci_migration_init(struct vfio_pci_core_device *vdev, uint32_t state_size) +{ + struct vfio_device_migration_info *mig_info; + size_t size; + int ret; + + mutex_init(&vdev->mig_lock); + + size = ALIGN(sizeof(*mig_info) + state_size, PAGE_SIZE); + + vdev->mig_pages = kzalloc(size, GFP_KERNEL); + if (!vdev->mig_pages) + return -ENOMEM; + + mig_info = (struct vfio_device_migration_info *) vdev->mig_pages; + ret = vfio_pci_register_dev_region(vdev, + VFIO_REGION_TYPE_MIGRATION, + VFIO_REGION_SUBTYPE_MIGRATION, + &vfio_pci_mig_regops, size, + VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE, + //VFIO_REGION_INFO_FLAG_MMAP, + vdev->mig_pages); + if (ret) + goto out; + + mig_info->data_offset = sizeof(*mig_info); + pr_info("%s, 
mig_info->data_offset: 0x%lx\n", __func__, (unsigned long) mig_info->data_offset); + return 0; +out: + kfree(vdev->mig_pages); + return ret; +} +EXPORT_SYMBOL_GPL(vfio_pci_migration_init); + + int vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, void *data) { struct vfio_pci_core_device *vdev = (struct vfio_pci_core_device *)data; diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index e3ff97613827..aa0b71dbbe20 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -425,6 +425,61 @@ ssize_t vfio_pci_dma_fault_rw(struct vfio_pci_core_device *vdev, char __user *bu return ret; } +ssize_t vfio_pci_mregion_rw(struct vfio_pci_core_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) +{ + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + void *base = vdev->region[i].data; + struct vfio_device_migration_info *mig_info = (struct vfio_device_migration_info *) base; + int ret = -EFAULT; + + //pr_info("%s, pos: 0x%llx, count: %lu, base: %px, index: %d wr %d\n", + //__func__, pos, count, base, i, iswrite); + + if (pos >= vdev->region[i].size) + return -EINVAL; + + count = min(count, (size_t)(vdev->region[i].size - pos)); + + mutex_lock(&vdev->mig_lock); + + if (iswrite) { + if ((pos == offsetof(struct vfio_device_migration_info, + device_state)) && vdev->migops) { + u32 new_state; + /* Call into the device specific code to handle + * the state change (before changing the state) + */ + if (count != sizeof(mig_info->device_state)) { + ret = -EINVAL; + goto unlock; + } + + if (copy_from_user((void *)&new_state, buf, count)) + goto unlock; + + ret = vdev->migops->state_change(vdev, new_state); + if (ret) + goto unlock; + mig_info->device_state = new_state; + } else { + if (copy_from_user(base + pos, buf, count)) + goto unlock; + } + } else { + if (copy_to_user(buf, base + pos, count)) + goto unlock; + if (pos >= mig_info->data_offset && + pos < mig_info->data_offset + mig_info->data_size) + mig_info->pending_bytes -= count; + } + ret = count; +unlock: + mutex_unlock(&vdev->mig_lock); + return ret; +} + static int vfio_pci_ioeventfd_handler(void *opaque, void *unused) { struct vfio_pci_ioeventfd *ioeventfd = opaque; diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 10182d1ae9ac..3f97de7f9daf 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -56,6 +56,10 @@ struct vfio_pci_irq_ctx { struct vfio_pci_core_device; struct vfio_pci_region; +struct vfio_pci_migops { + int (*state_change)(struct vfio_pci_core_device *vdev, u32 new_state); +}; + struct vfio_pci_regops { ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); @@ -148,6 +152,10 @@ struct vfio_pci_core_device { struct mutex vma_lock; struct list_head vma_list; struct rw_semaphore memory_lock; + + struct vfio_pci_migops *migops; + u8 *mig_pages; + struct mutex mig_lock; }; #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) @@ -184,6 +192,9 @@ extern long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, extern ssize_t vfio_pci_dma_fault_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); +extern ssize_t vfio_pci_mregion_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite); extern int vfio_pci_init_perm_bits(void); extern void 
vfio_pci_uninit_perm_bits(void); @@ -260,4 +271,5 @@ int vfio_pci_dma_fault_init(struct vfio_pci_core_device *vdev, bool register_fau int vfio_pci_set_ext_irq_trigger(struct vfio_pci_core_device *vdev, unsigned int index, unsigned int start, unsigned int count, uint32_t flags, void *data); +int vfio_pci_migration_init(struct vfio_pci_core_device *vdev, uint32_t size); #endif /* VFIO_PCI_CORE_H */ -- Gitee From a1156acc0ef55cb4dd257e1082c1d3152f1fb3ef Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 30 Nov 2021 17:11:39 -0700 Subject: [PATCH 1845/2161] vfio/mdev: add req_trigger commit d7d3555ace46440d833865f154545219b7175146 Intel-BKC. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/mdev.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/mdev.h b/include/linux/mdev.h index b64ad7b70d29..773d7e95ce17 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -22,6 +22,7 @@ struct mdev_device { void *iommu_fault_data; void *iommu_domain; bool active; + struct eventfd_ctx *req_trigger; }; static inline struct mdev_device *to_mdev_device(struct device *dev) -- Gitee From ec3c0411a7d4fe861b2f207c0190663e77805402 Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Date: Wed, 1 Dec 2021 17:10:21 -0700 Subject: [PATCH 1846/2161] vfio: Adding Dynamic trap flag commit a2ddcff1acbfee1da9f69251edebc208c765ab0f Intel-BKC. Signed-off-by: Sanjay Kumar Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/uapi/linux/vfio.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 03997de836c8..933df045b8c0 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -255,6 +255,7 @@ struct vfio_region_info { #define VFIO_REGION_INFO_FLAG_WRITE (1 << 1) /* Region supports write */ #define VFIO_REGION_INFO_FLAG_MMAP (1 << 2) /* Region supports mmap */ #define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ +#define VFIO_REGION_INFO_FLAG_DYNAMIC_TRAP (1 << 4) /* Region supports dynamic trap/untrap */ __u32 index; /* Region index */ __u32 cap_offset; /* Offset within info struct of first cap */ __u64 size; /* Region size (bytes) */ -- Gitee From 5b37aabee7181fe7f78e51538d912c7b0ac5c358 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 5 Jul 2022 20:41:36 +0800 Subject: [PATCH 1847/2161] vfio: idxd: wholesale copy over 5.12 PO kernel mdev driver commit 4f430647f174759f60ca81d52a231b0db01a803f Intel-BKC. 
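One convention worth noting in the copy below: the idxd WQ command helpers (idxd_wq_enable/disable/drain/abort) now take an optional u32 *status out-parameter so the mdev layer can relay the raw IDXD command status to the guest, while existing in-kernel callers pass NULL and keep relying on the errno alone. A brief sketch of the two calling styles (caller names are illustrative):

/* Sketch of the calling convention introduced below: in-kernel paths
 * pass NULL, the virtual-device path passes a status pointer so the
 * hardware command status can be forwarded to the guest.
 */
#include <linux/types.h>
#include "idxd.h"	/* for the idxd_wq_enable() prototype this patch adds */

static int example_kernel_caller(struct idxd_wq *wq)
{
	/* Existing driver paths: only the errno matters. */
	return idxd_wq_enable(wq, NULL);
}

static int example_vdcm_caller(struct idxd_wq *wq, u32 *guest_cmdsts)
{
	u32 status;
	int rc;

	rc = idxd_wq_enable(wq, &status);
	/* Relay the raw command status to the guest even on failure,
	 * e.g. IDXD_CMDSTS_ERR_WQ_ENABLED. */
	*guest_cmdsts = status;
	return rc;
}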
Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/cdev.c | 4 +- drivers/dma/idxd/device.c | 157 +- drivers/dma/idxd/idxd.h | 45 +- drivers/dma/idxd/init.c | 20 + drivers/dma/idxd/irq.c | 2 +- drivers/dma/idxd/registers.h | 28 +- drivers/dma/idxd/sysfs.c | 7 + drivers/vfio/mdev/Kconfig | 9 + drivers/vfio/mdev/Makefile | 1 + drivers/vfio/mdev/idxd/Makefile | 4 + drivers/vfio/mdev/idxd/mdev.c | 2410 ++++++++++++++++++++++++++++ drivers/vfio/mdev/idxd/mdev.h | 179 +++ drivers/vfio/mdev/idxd/mdev_host.c | 94 ++ drivers/vfio/mdev/idxd/vdev.c | 1398 ++++++++++++++++ include/uapi/linux/idxd.h | 2 + 15 files changed, 4321 insertions(+), 39 deletions(-) create mode 100644 drivers/vfio/mdev/idxd/Makefile create mode 100644 drivers/vfio/mdev/idxd/mdev.c create mode 100644 drivers/vfio/mdev/idxd/mdev.h create mode 100644 drivers/vfio/mdev/idxd/mdev_host.c create mode 100644 drivers/vfio/mdev/idxd/vdev.c diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index b9b2b4a4124e..1f6a456149b8 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -158,7 +158,7 @@ static int idxd_cdev_release(struct inode *node, struct file *filep) if (rc < 0) dev_err(dev, "wq disable pasid failed.\n"); } else { - idxd_wq_drain(wq); + idxd_wq_drain(wq, NULL); } } @@ -204,7 +204,7 @@ static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_flags |= VM_DONTCOPY; pfn = (base + idxd_get_wq_portal_full_offset(wq->id, - IDXD_PORTAL_LIMITED)) >> PAGE_SHIFT; + IDXD_PORTAL_LIMITED, IDXD_IRQ_MSIX)) >> PAGE_SHIFT; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_private_data = ctx; diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 03de61c25311..993b7fccf6a0 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -178,22 +178,28 @@ void idxd_wq_free_resources(struct idxd_wq *wq) } EXPORT_SYMBOL_GPL(idxd_wq_free_resources); -int idxd_wq_enable(struct idxd_wq *wq) +int idxd_wq_enable(struct idxd_wq *wq, u32 *status) { struct idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; - u32 status; + u32 stat; + + if (status) + *status = 0; if (wq->state == IDXD_WQ_ENABLED) { dev_dbg(dev, "WQ %d already enabled\n", wq->id); return -ENXIO; } - idxd_cmd_exec(idxd, IDXD_CMD_ENABLE_WQ, wq->id, &status); + idxd_cmd_exec(idxd, IDXD_CMD_ENABLE_WQ, wq->id, &stat); - if (status != IDXD_CMDSTS_SUCCESS && - status != IDXD_CMDSTS_ERR_WQ_ENABLED) { - dev_dbg(dev, "WQ enable failed: %#x\n", status); + if (status) + *status = stat; + + if (stat != IDXD_CMDSTS_SUCCESS && + stat != IDXD_CMDSTS_ERR_WQ_ENABLED) { + dev_dbg(dev, "WQ enable failed: %#x\n", stat); return -ENXIO; } @@ -201,14 +207,26 @@ int idxd_wq_enable(struct idxd_wq *wq) dev_dbg(dev, "WQ %d enabled\n", wq->id); return 0; } +EXPORT_SYMBOL_GPL(idxd_wq_enable); -int idxd_wq_disable(struct idxd_wq *wq, bool reset_config) +int idxd_wq_disable(struct idxd_wq *wq, bool reset_config, u32 *status) { struct idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; - u32 status, operand; + u32 stat, operand; dev_dbg(dev, "Disabling WQ %d\n", wq->id); + if (status) + *status = 0; + + /* + * When the wq is in LOCKED state, it means it is disabled but + * also at the same time is "enabled" as far as the user is + * concerned. So a call to disable the hardware can be + * skipped. 
+ */ + if (wq->state == IDXD_WQ_LOCKED) + goto out; if (wq->state != IDXD_WQ_ENABLED) { dev_dbg(dev, "WQ %d in wrong state: %d\n", wq->id, wq->state); @@ -216,35 +234,59 @@ int idxd_wq_disable(struct idxd_wq *wq, bool reset_config) } operand = BIT(wq->id % 16) | ((wq->id / 16) << 16); - idxd_cmd_exec(idxd, IDXD_CMD_DISABLE_WQ, operand, &status); + idxd_cmd_exec(idxd, IDXD_CMD_DISABLE_WQ, operand, &stat); + + if (status) + *status = stat; - if (status != IDXD_CMDSTS_SUCCESS) { - dev_dbg(dev, "WQ disable failed: %#x\n", status); + if (stat != IDXD_CMDSTS_SUCCESS) { + dev_dbg(dev, "WQ disable failed: %#x\n", stat); return -ENXIO; } - if (reset_config) - idxd_wq_disable_cleanup(wq); - wq->state = IDXD_WQ_DISABLED; +out: + if (wq_dedicated(wq) && is_idxd_wq_mdev(wq)) { + wq->state = IDXD_WQ_LOCKED; + } else { + if (reset_config && test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + idxd_wq_disable_cleanup(wq); + wq->state = IDXD_WQ_DISABLED; + } dev_dbg(dev, "WQ %d disabled\n", wq->id); return 0; } +EXPORT_SYMBOL_GPL(idxd_wq_disable); -void idxd_wq_drain(struct idxd_wq *wq) +int idxd_wq_drain(struct idxd_wq *wq, u32 *status) { struct idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; - u32 operand; + u32 operand, stat; + + if (status) + *status = 0; if (wq->state != IDXD_WQ_ENABLED) { dev_dbg(dev, "WQ %d in wrong state: %d\n", wq->id, wq->state); - return; + return 0; } dev_dbg(dev, "Draining WQ %d\n", wq->id); operand = BIT(wq->id % 16) | ((wq->id / 16) << 16); - idxd_cmd_exec(idxd, IDXD_CMD_DRAIN_WQ, operand, NULL); + idxd_cmd_exec(idxd, IDXD_CMD_DRAIN_WQ, operand, &stat); + + if (status) + *status = stat; + + if (stat != IDXD_CMDSTS_SUCCESS) { + dev_dbg(dev, "WQ drain failed: %#x\n", stat); + return -ENXIO; + } + + dev_dbg(dev, "WQ %d drained\n", wq->id); + return 0; } +EXPORT_SYMBOL_GPL(idxd_wq_drain); void idxd_wq_reset(struct idxd_wq *wq) { @@ -262,6 +304,7 @@ void idxd_wq_reset(struct idxd_wq *wq) idxd_wq_disable_cleanup(wq); wq->state = IDXD_WQ_DISABLED; } +EXPORT_SYMBOL_GPL(idxd_wq_reset); int idxd_wq_map_portal(struct idxd_wq *wq) { @@ -271,7 +314,7 @@ int idxd_wq_map_portal(struct idxd_wq *wq) resource_size_t start; start = pci_resource_start(pdev, IDXD_WQ_BAR); - start += idxd_get_wq_portal_full_offset(wq->id, IDXD_PORTAL_LIMITED); + start += idxd_get_wq_portal_full_offset(wq->id, IDXD_PORTAL_LIMITED, IDXD_IRQ_MSIX); wq->portal = devm_ioremap(dev, start, IDXD_PORTAL_SIZE); if (!wq->portal) @@ -301,6 +344,35 @@ void idxd_wqs_unmap_portal(struct idxd_device *idxd) } } +int idxd_wq_abort(struct idxd_wq *wq, u32 *status) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + u32 operand, stat; + + dev_dbg(dev, "Abort WQ %d\n", wq->id); + if (wq->state != IDXD_WQ_ENABLED) { + dev_dbg(dev, "WQ %d not active\n", wq->id); + return -ENXIO; + } + + operand = BIT(wq->id % 16) | ((wq->id / 16) << 16); + dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_ABORT_WQ, operand); + idxd_cmd_exec(idxd, IDXD_CMD_ABORT_WQ, operand, &stat); + + if (status) + *status = stat; + + if (stat != IDXD_CMDSTS_SUCCESS) { + dev_dbg(dev, "WQ abort failed: %#x\n", stat); + return -ENXIO; + } + + dev_dbg(dev, "WQ %d aborted\n", wq->id); + return 0; +} +EXPORT_SYMBOL_GPL(idxd_wq_abort); + int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid) { struct idxd_device *idxd = wq->idxd; @@ -308,7 +380,7 @@ int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid) union wqcfg wqcfg; unsigned int offset; - rc = idxd_wq_disable(wq, false); + rc = idxd_wq_disable(wq, false, NULL); if (rc < 0) 
return rc; @@ -320,12 +392,13 @@ int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid) iowrite32(wqcfg.bits[WQCFG_PASID_IDX], idxd->reg_base + offset); spin_unlock(&idxd->dev_lock); - rc = idxd_wq_enable(wq); + rc = idxd_wq_enable(wq, NULL); if (rc < 0) return rc; return 0; } +EXPORT_SYMBOL_GPL(idxd_wq_set_pasid); int idxd_wq_disable_pasid(struct idxd_wq *wq) { @@ -334,7 +407,7 @@ int idxd_wq_disable_pasid(struct idxd_wq *wq) union wqcfg wqcfg; unsigned int offset; - rc = idxd_wq_disable(wq, false); + rc = idxd_wq_disable(wq, false, NULL); if (rc < 0) return rc; @@ -346,12 +419,13 @@ int idxd_wq_disable_pasid(struct idxd_wq *wq) iowrite32(wqcfg.bits[WQCFG_PASID_IDX], idxd->reg_base + offset); spin_unlock(&idxd->dev_lock); - rc = idxd_wq_enable(wq); + rc = idxd_wq_enable(wq, NULL); if (rc < 0) return rc; return 0; } +EXPORT_SYMBOL_GPL(idxd_wq_disable_pasid); static void idxd_wq_disable_cleanup(struct idxd_wq *wq) { @@ -419,6 +493,34 @@ void idxd_wq_quiesce(struct idxd_wq *wq) } EXPORT_SYMBOL_GPL(idxd_wq_quiesce); +void idxd_wq_setup_pasid(struct idxd_wq *wq, int pasid) +{ + struct idxd_device *idxd = wq->idxd; + int offset; + + lockdep_assert_held(&idxd->dev_lock); + + /* PASID fields are 8 bytes into the WQCFG register */ + offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PASID_IDX); + wq->wqcfg->pasid = pasid; + iowrite32(wq->wqcfg->bits[WQCFG_PASID_IDX], idxd->reg_base + offset); +} +EXPORT_SYMBOL_GPL(idxd_wq_setup_pasid); + +void idxd_wq_setup_priv(struct idxd_wq *wq, int priv) +{ + struct idxd_device *idxd = wq->idxd; + int offset; + + lockdep_assert_held(&idxd->dev_lock); + + /* priv field is 8 bytes into the WQCFG register */ + offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PRIV_IDX); + wq->wqcfg->priv = !!priv; + iowrite32(wq->wqcfg->bits[WQCFG_PRIV_IDX], idxd->reg_base + offset); +} +EXPORT_SYMBOL_GPL(idxd_wq_setup_priv); + /* Device control bits */ static inline bool idxd_is_enabled(struct idxd_device *idxd) { @@ -587,6 +689,7 @@ void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid) idxd_cmd_exec(idxd, IDXD_CMD_DRAIN_PASID, operand, NULL); dev_dbg(dev, "pasid %d drained\n", pasid); } +EXPORT_SYMBOL_GPL(idxd_device_drain_pasid); int idxd_device_request_int_handle(struct idxd_device *idxd, int idx, int *handle, enum idxd_interrupt_type irq_type) @@ -1280,7 +1383,7 @@ int __drv_enable_wq(struct idxd_wq *wq) goto err; } - rc = idxd_wq_enable(wq); + rc = idxd_wq_enable(wq, NULL); if (rc < 0) { dev_dbg(dev, "wq %d enabling failed: %d\n", wq->id, rc); goto err; @@ -1297,7 +1400,7 @@ int __drv_enable_wq(struct idxd_wq *wq) return 0; err_map_portal: - rc = idxd_wq_disable(wq, false); + rc = idxd_wq_disable(wq, false, NULL); if (rc < 0) dev_dbg(dev, "wq %s disable failed\n", dev_name(wq_confdev(wq))); err: @@ -1314,6 +1417,7 @@ int drv_enable_wq(struct idxd_wq *wq) mutex_unlock(&wq->wq_lock); return rc; } +EXPORT_SYMBOL_GPL(drv_enable_wq); void __drv_disable_wq(struct idxd_wq *wq) { @@ -1328,7 +1432,7 @@ void __drv_disable_wq(struct idxd_wq *wq) idxd_wq_unmap_portal(wq); - idxd_wq_drain(wq); + idxd_wq_drain(wq, NULL); idxd_wq_reset(wq); wq->client_count = 0; @@ -1341,6 +1445,7 @@ void drv_disable_wq(struct idxd_wq *wq) __drv_disable_wq(wq); mutex_unlock(&wq->wq_lock); } +EXPORT_SYMBOL_GPL(drv_disable_wq); int idxd_device_drv_probe(struct idxd_dev *idxd_dev) { diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index e0e0314e4c96..4567ade5d334 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include 
"registers.h" @@ -52,6 +53,10 @@ enum idxd_type { #define IDXD_NAME_SIZE 128 #define IDXD_PMU_EVENT_MAX 64 +struct idxd_device_ops { + void (*notify_error)(struct idxd_wq *wq); +}; + #define IDXD_ENQCMDS_RETRIES 32 #define IDXD_ENQCMDS_MAX_RETRIES 64 @@ -61,6 +66,7 @@ struct idxd_device_driver { int (*probe)(struct idxd_dev *idxd_dev); void (*remove)(struct idxd_dev *idxd_dev); struct device_driver drv; + struct idxd_device_ops *ops; }; extern struct idxd_device_driver dsa_drv; @@ -127,17 +133,20 @@ struct idxd_pmu { enum idxd_wq_state { IDXD_WQ_DISABLED = 0, IDXD_WQ_ENABLED, + IDXD_WQ_LOCKED, }; enum idxd_wq_flag { WQ_FLAG_DEDICATED = 0, WQ_FLAG_BLOCK_ON_FAULT, + WQ_FLAG_MODE_1, }; enum idxd_wq_type { IDXD_WQT_NONE = 0, IDXD_WQT_KERNEL, IDXD_WQT_USER, + IDXD_WQT_MDEV, }; struct idxd_cdev { @@ -211,6 +220,7 @@ struct idxd_wq { bool ats_dis; void *private_data; + struct list_head vdcm_list; }; struct idxd_engine { @@ -241,6 +251,7 @@ enum idxd_device_flag { IDXD_FLAG_CONFIGURABLE = 0, IDXD_FLAG_CMD_RUNNING, IDXD_FLAG_PASID_ENABLED, + IDXD_FLAG_IMS_SUPPORTED, }; struct idxd_dma_dev { @@ -285,6 +296,7 @@ struct idxd_device { int irq_cnt; bool request_int_handles; + u32 ims_offset; u32 msix_perm_offset; u32 wqcfg_offset; u32 grpcfg_offset; @@ -292,6 +304,7 @@ struct idxd_device { u64 max_xfer_bytes; u32 max_batch_size; + int ims_size; int max_groups; int max_engines; int max_rdbufs; @@ -308,6 +321,14 @@ struct idxd_device { struct workqueue_struct *wq; struct work_struct work; + struct irq_domain *ims_domain; + struct vfio_pci_core_device vfio_pdev; + struct kref mdev_kref; + struct mutex kref_lock; + bool mdev_host_init; + int *new_handles; + struct ida vdev_ida; + struct idxd_pmu *idxd_pmu; }; @@ -419,6 +440,11 @@ extern struct device_type idxd_wq_device_type; extern struct device_type idxd_engine_device_type; extern struct device_type idxd_group_device_type; +static inline bool is_idxd_wq_mdev(struct idxd_wq *wq) +{ + return (wq->type == IDXD_WQT_MDEV); +} + static inline bool is_dsa_dev(struct idxd_dev *idxd_dev) { return idxd_dev->type == IDXD_DEV_DSA; @@ -486,15 +512,17 @@ enum idxd_interrupt_type { IDXD_IRQ_IMS, }; -static inline int idxd_get_wq_portal_offset(enum idxd_portal_prot prot) +static inline int idxd_get_wq_portal_offset(enum idxd_portal_prot prot, + enum idxd_interrupt_type irq_type) { - return prot * 0x1000; + return prot * 0x1000 + irq_type * 0x2000; } static inline int idxd_get_wq_portal_full_offset(int wq_id, - enum idxd_portal_prot prot) + enum idxd_portal_prot prot, + enum idxd_interrupt_type irq_type) { - return ((wq_id * 4) << PAGE_SHIFT) + idxd_get_wq_portal_offset(prot); + return ((wq_id * 4) << PAGE_SHIFT) + idxd_get_wq_portal_offset(prot, irq_type); } #define IDXD_PORTAL_MASK (PAGE_SIZE - 1) @@ -584,9 +612,9 @@ int idxd_device_release_int_handle(struct idxd_device *idxd, int handle, void idxd_wqs_unmap_portal(struct idxd_device *idxd); int idxd_wq_alloc_resources(struct idxd_wq *wq); void idxd_wq_free_resources(struct idxd_wq *wq); -int idxd_wq_enable(struct idxd_wq *wq); -int idxd_wq_disable(struct idxd_wq *wq, bool reset_config); -void idxd_wq_drain(struct idxd_wq *wq); +int idxd_wq_enable(struct idxd_wq *wq, u32 *status); +int idxd_wq_disable(struct idxd_wq *wq, bool reset_config, u32 *status); +int idxd_wq_drain(struct idxd_wq *wq, u32 *status); void idxd_wq_reset(struct idxd_wq *wq); int idxd_wq_map_portal(struct idxd_wq *wq); void idxd_wq_unmap_portal(struct idxd_wq *wq); @@ -597,6 +625,9 @@ void idxd_wq_quiesce(struct idxd_wq *wq); int 
idxd_wq_init_percpu_ref(struct idxd_wq *wq); void idxd_wq_free_irq(struct idxd_wq *wq); int idxd_wq_request_irq(struct idxd_wq *wq); +int idxd_wq_abort(struct idxd_wq *wq, u32 *status); +void idxd_wq_setup_pasid(struct idxd_wq *wq, int pasid); +void idxd_wq_setup_priv(struct idxd_wq *wq, int priv); /* submission */ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 993a5dcca24f..400253301c4a 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -176,6 +176,7 @@ static int idxd_setup_wqs(struct idxd_device *idxd) init_waitqueue_head(&wq->err_queue); init_completion(&wq->wq_dead); init_completion(&wq->wq_resurrect); + INIT_LIST_HEAD(&wq->vdcm_list); wq->max_xfer_bytes = WQ_DEFAULT_MAX_XFER; wq->max_batch_size = WQ_DEFAULT_MAX_BATCH; wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES; @@ -367,6 +368,24 @@ static void idxd_read_table_offsets(struct idxd_device *idxd) dev_dbg(dev, "IDXD MSIX Permission Offset: %#x\n", idxd->msix_perm_offset); idxd->perfmon_offset = offsets.perfmon * IDXD_TABLE_MULT; dev_dbg(dev, "IDXD Perfmon Offset: %#x\n", idxd->perfmon_offset); + idxd->ims_offset = offsets.ims * IDXD_TABLE_MULT; + dev_dbg(dev, "IDXD IMS Offset: %#x\n", idxd->ims_offset); +} + +static void idxd_check_ims(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + + /* verify that we have IMS vectors supported by device */ + if (idxd->hw.gen_cap.max_ims_mult) { + idxd->ims_size = idxd->hw.gen_cap.max_ims_mult * 256ULL; + dev_dbg(&pdev->dev, "IMS size: %u\n", idxd->ims_size); + set_bit(IDXD_FLAG_IMS_SUPPORTED, &idxd->flags); + dev_dbg(&pdev->dev, "IMS supported for device\n"); + return; + } + + dev_dbg(&pdev->dev, "IMS unsupported for device\n"); } static void idxd_read_caps(struct idxd_device *idxd) @@ -391,6 +410,7 @@ static void idxd_read_caps(struct idxd_device *idxd) dev_dbg(dev, "max xfer size: %llu bytes\n", idxd->max_xfer_bytes); idxd->max_batch_size = 1U << idxd->hw.gen_cap.max_batch_shift; dev_dbg(dev, "max batch size: %u\n", idxd->max_batch_size); + idxd_check_ims(idxd); if (idxd->hw.gen_cap.config_en) set_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags); diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 743ead5ebc57..eb0174ee409d 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -52,7 +52,7 @@ static void idxd_device_reinit(struct work_struct *work) struct idxd_wq *wq = idxd->wqs[i]; if (wq->state == IDXD_WQ_ENABLED) { - rc = idxd_wq_enable(wq); + rc = idxd_wq_enable(wq, NULL); if (rc < 0) { dev_warn(dev, "Unable to re-enable wq %s\n", dev_name(wq_confdev(wq))); diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index aa642aecdc0b..0666966fd519 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -90,6 +90,9 @@ struct opcap { u64 bits[4]; }; +#define OPCAP_OFS(op) (op - (0x40 * (op >> 6))) +#define OPCAP_BIT(op) (BIT_ULL(OPCAP_OFS(op))) + #define IDXD_OPCAP_OFFSET 0x40 #define IDXD_TABLE_OFFSET 0x60 @@ -170,6 +173,7 @@ union idxd_command_reg { }; u32 bits; } __packed; +#define IDXD_CMD_INT_MASK 0x80000000 enum idxd_cmd { IDXD_CMD_ENABLE_DEVICE = 1, @@ -186,6 +190,7 @@ enum idxd_cmd { IDXD_CMD_ABORT_PASID, IDXD_CMD_REQUEST_INT_HANDLE, IDXD_CMD_RELEASE_INT_HANDLE, + IDXD_CMD_REVOKED_HANDLES_PROCESSED, }; #define CMD_INT_HANDLE_IMS 0x10000 @@ -200,7 +205,8 @@ union cmdsts_reg { }; u32 bits; } __packed; -#define IDXD_CMDSTS_ACTIVE 0x80000000 +#define IDXD_CMDS_ACTIVE_BIT 31 +#define IDXD_CMDSTS_ACTIVE 
BIT(IDXD_CMDS_ACTIVE_BIT) #define IDXD_CMDSTS_ERR_MASK 0xff #define IDXD_CMDSTS_RES_SHIFT 8 @@ -232,10 +238,11 @@ enum idxd_cmdsts_err { /* disable device errors */ IDXD_CMDSTS_ERR_DIS_DEV_EN = 0x31, /* disable WQ, drain WQ, abort WQ, reset WQ */ - IDXD_CMDSTS_ERR_DEV_NOT_EN, + IDXD_CMDSTS_ERR_WQ_NOT_EN, /* request interrupt handle */ IDXD_CMDSTS_ERR_INVAL_INT_IDX = 0x41, IDXD_CMDSTS_ERR_NO_HANDLE, + IDXD_CMDSTS_ERR_INVAL_INT_IDX_RELEASE, }; #define IDXD_CMDCAP_OFFSET 0xb0 @@ -283,6 +290,11 @@ union msix_perm { u32 bits; } __packed; +#define IDXD_MSIX_PERM_MASK 0xfffff00c +#define IDXD_MSIX_PERM_IGNORE 0x3 +#define MSIX_ENTRY_MASK_INT 0x1 +#define MSIX_ENTRY_CTRL_BYTE 12 + union group_flags { struct { u32 tc_a:3; @@ -352,9 +364,19 @@ union wqcfg { u32 bits[8]; } __packed; -#define WQCFG_PASID_IDX 2 +enum idxd_wq_hw_state { + IDXD_WQ_DEV_DISABLED = 0, + IDXD_WQ_DEV_ENABLED, + IDXD_WQ_DEV_BUSY, +}; + +#define WQCFG_PASID_IDX 2 +#define WQCFG_PRIV_IDX 2 #define WQCFG_OCCUP_IDX 6 +#define WQCFG_MODE_DEDICATED 1 +#define WQCFG_MODE_SHARED 0 + #define WQCFG_OCCUP_MASK 0xffff /* diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index a48928973bd4..eb00991a0cc7 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -14,6 +14,7 @@ static char *idxd_wq_type_names[] = { [IDXD_WQT_NONE] = "none", [IDXD_WQT_KERNEL] = "kernel", [IDXD_WQT_USER] = "user", + [IDXD_WQT_MDEV] = "mdev", }; /* IDXD engine attributes */ @@ -501,6 +502,8 @@ static ssize_t wq_state_show(struct device *dev, return sysfs_emit(buf, "disabled\n"); case IDXD_WQ_ENABLED: return sysfs_emit(buf, "enabled\n"); + case IDXD_WQ_LOCKED: + return sysfs_emit(buf, "locked\n"); } return sysfs_emit(buf, "unknown\n"); @@ -781,6 +784,8 @@ static ssize_t wq_type_show(struct device *dev, return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_KERNEL]); case IDXD_WQT_USER: return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_USER]); + case IDXD_WQT_MDEV: + return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_MDEV]); case IDXD_WQT_NONE: default: return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_NONE]); @@ -806,6 +811,8 @@ static ssize_t wq_type_store(struct device *dev, wq->type = IDXD_WQT_KERNEL; else if (sysfs_streq(buf, idxd_wq_type_names[IDXD_WQT_USER])) wq->type = IDXD_WQT_USER; + else if (sysfs_streq(buf, idxd_wq_type_names[IDXD_WQT_MDEV])) + wq->type = IDXD_WQT_MDEV; else return -EINVAL; diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig index 646dbed44eb2..6b2fa8a194ef 100644 --- a/drivers/vfio/mdev/Kconfig +++ b/drivers/vfio/mdev/Kconfig @@ -8,3 +8,12 @@ config VFIO_MDEV See Documentation/driver-api/vfio-mediated-device.rst for more details. If you don't know what do here, say N. + +config VFIO_MDEV_IDXD + tristate "VFIO Mediated device driver for Intel IDXD" + depends on VFIO && VFIO_MDEV && X86_64 + select IMS_MSI_ARRAY + default n + help + VFIO based mediated device driver for + Intel Accelerator Devices driver. 
diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile index ff9ecd802125..a2660c3edf5b 100644 --- a/drivers/vfio/mdev/Makefile +++ b/drivers/vfio/mdev/Makefile @@ -3,3 +3,4 @@ mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o vfio_mdev.o obj-$(CONFIG_VFIO_MDEV) += mdev.o +obj-$(CONFIG_VFIO_MDEV_IDXD) += idxd/ diff --git a/drivers/vfio/mdev/idxd/Makefile b/drivers/vfio/mdev/idxd/Makefile new file mode 100644 index 000000000000..ab55032b5486 --- /dev/null +++ b/drivers/vfio/mdev/idxd/Makefile @@ -0,0 +1,4 @@ +ccflags-y += -I$(srctree)/drivers/dma/idxd -DDEFAULT_SYMBOL_NAMESPACE=IDXD + +obj-$(CONFIG_VFIO_MDEV_IDXD) += idxd_mdev.o +idxd_mdev-y := mdev.o vdev.o mdev_host.o diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c new file mode 100644 index 000000000000..69ba0f67052a --- /dev/null +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -0,0 +1,2410 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019,2020 Intel Corporation. All rights rsvd. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "registers.h" +#include "idxd.h" +#include "../mdev_private.h" +#include "mdev.h" + +static u64 idxd_pci_config[] = { + 0x0010000000008086ULL, + 0x0080000008800000ULL, + 0x000000000000000cULL, + 0x000000000000000cULL, + 0x0000000000000000ULL, + 0x2010808600000000ULL, + 0x0000004000000000ULL, + 0x000000ff00000000ULL, + 0x0000060000015011ULL, /* MSI-X capability, hardcoded 2 entries, Encoded as N-1 */ + 0x0000070000000000ULL, + 0x0000000000920010ULL, /* PCIe capability */ + 0x0000000000000000ULL, + 0x0000000000000000ULL, + 0x0000000000000000ULL, + 0x0070001000000000ULL, + 0x0000000000000000ULL, + 0x0000000000000000ULL, + 0x0000000000000000ULL, +}; + +static u64 idxd_pci_ext_cap[] = { + 0x000000611101000fULL, /* ATS capability */ + 0x0000000000000000ULL, + 0x8100000012010013ULL, /* Page Request capability */ + 0x0000000000000001ULL, + 0x000014040001001bULL, /* PASID capability */ + 0x0000000000000000ULL, + 0x0181808600010023ULL, /* Scalable IOV capability */ + 0x0000000100000005ULL, + 0x0000000000000001ULL, + 0x0000000000000000ULL, +}; + +static int idxd_vdcm_set_irqs(struct vdcm_idxd *vidxd, uint32_t flags, + unsigned int index, unsigned int start, + unsigned int count, void *data); +static int vidxd_register_ioasid_notifier(struct vdcm_idxd *vidxd); + +struct idxd_ioasid_work { + struct work_struct work; + struct idxd_wq *wq; + u32 guest_pasid; + u32 host_pasid; +}; + +static const char idxd_dsa_1dwq_name[] = "dsa-1dwq-v1"; +static const char idxd_iax_1dwq_name[] = "iax-1dwq-v1"; +static const char idxd_dsa_1swq_name[] = "dsa-1swq-v1"; +static const char idxd_iax_1swq_name[] = "iax-1swq-v1"; + +static int idxd_vdcm_get_irq_count(struct mdev_device *mdev, int type) +{ + struct vdcm_idxd *vidxd = mdev_get_drvdata(mdev); + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + + /* + * Even though the number of MSIX vectors supported are not tied to number of + * wqs being exported, the current design is to allow 1 vector per WQ for guest. + * So here we end up with num of wqs plus 1 that handles the misc interrupts. 
+ */ + if (type == VFIO_PCI_MSI_IRQ_INDEX || type == VFIO_PCI_MSIX_IRQ_INDEX) + return VIDXD_MAX_MSIX_VECS; + else if (type == VFIO_PCI_REQ_IRQ_INDEX) + return 1; + else if (type >= VFIO_PCI_NUM_IRQS && + type < VFIO_PCI_NUM_IRQS + vfio_pdev->num_ext_irqs) + return 1; + + return 0; +} + +static void idxd_wq_ioasid_work(struct work_struct *work) +{ + struct idxd_ioasid_work *iwork = container_of(work, struct idxd_ioasid_work, work); + struct idxd_wq *wq = iwork->wq; + + if (wq->state != IDXD_WQ_ENABLED) + return; + + idxd_device_drain_pasid(wq->idxd, iwork->guest_pasid); + ioasid_put(NULL, iwork->host_pasid); + kfree(iwork); +} + +static int idxd_mdev_ioasid_event(struct notifier_block *nb, unsigned long event, void *data) +{ + struct idxd_vdev *vdev = container_of(nb, struct idxd_vdev, pasid_nb); + struct mdev_device *mdev = vdev->mdev; + struct vdcm_idxd *vidxd = mdev_get_drvdata(mdev); + struct idxd_wq *wq = vidxd->wq; + struct ioasid_nb_args *args = (struct ioasid_nb_args *)data; + struct idxd_ioasid_work *iwork; + + if (event == IOASID_NOTIFY_FREE) { + dev_dbg(mdev_dev(mdev), "ioasid free event\n"); + + if (wq_dedicated(wq)) + return NOTIFY_DONE; + + if (wq->state != IDXD_WQ_ENABLED) + return NOTIFY_DONE; + + iwork = kmalloc(sizeof(*iwork), GFP_ATOMIC); + if (!iwork) + return notifier_from_errno(-ENOMEM); + iwork->wq = wq; + iwork->guest_pasid = args->spid; + iwork->host_pasid = args->id; + INIT_WORK(&iwork->work, idxd_wq_ioasid_work); + ioasid_queue_work(&iwork->work); + return NOTIFY_OK; + } + + return NOTIFY_OK; +} + +int idxd_mdev_get_pasid(struct mdev_device *mdev, u32 *pasid) +{ + struct vfio_group *vfio_group; + struct iommu_domain *iommu_domain; + struct device *dev = mdev_dev(mdev); + struct device *iommu_device = mdev_get_iommu_device(mdev); + struct vdcm_idxd *vidxd = mdev_get_drvdata(mdev); + int mdev_pasid; + + if (!vidxd->ivdev.vfio_group) { + dev_warn(dev, "Missing vfio_group.\n"); + return -EINVAL; + } + + vfio_group = vidxd->ivdev.vfio_group; + + iommu_domain = vfio_group_iommu_domain(vfio_group); + if (IS_ERR_OR_NULL(iommu_domain)) + goto err; + + mdev_pasid = iommu_aux_get_pasid(iommu_domain, iommu_device); + if (mdev_pasid < 0) + goto err; + + *pasid = (u32)mdev_pasid; + return 0; + + err: + vfio_group_put_external_user(vfio_group); + vidxd->ivdev.vfio_group = NULL; + return -EFAULT; +} + +int idxd_mdev_get_host_pasid(struct mdev_device *mdev, u32 gpasid, u32 *pasid) +{ + struct ioasid_set *ioasid_set; + struct mm_struct *mm; + + mm = get_task_mm(current); + if (!mm) + return -ENXIO; + + ioasid_set = ioasid_find_mm_set(mm); + if (!ioasid_set) { + mmput(mm); + return -ENXIO; + } + + *pasid = ioasid_find_by_spid(ioasid_set, gpasid, true); + mmput(mm); + if (*pasid == INVALID_IOASID) + return -ENXIO; + + return 0; +} + +static inline void reset_vconfig(struct vdcm_idxd *vidxd) +{ + u16 *devid = (u16 *)(vidxd->cfg + PCI_DEVICE_ID); + struct idxd_device *idxd = vidxd->idxd; + + memset(vidxd->cfg, 0, VIDXD_MAX_CFG_SPACE_SZ); + memcpy(vidxd->cfg, idxd_pci_config, sizeof(idxd_pci_config)); + + if (idxd->data->type == IDXD_TYPE_DSA) + *devid = PCI_DEVICE_ID_INTEL_DSA_SPR0; + else if (idxd->data->type == IDXD_TYPE_IAX) + *devid = PCI_DEVICE_ID_INTEL_IAX_SPR0; + + memcpy(vidxd->cfg + 0x100, idxd_pci_ext_cap, sizeof(idxd_pci_ext_cap)); +} + +static inline void reset_vmmio(struct vdcm_idxd *vidxd) +{ + memset(&vidxd->bar0, 0, VIDXD_MAX_MMIO_SPACE_SZ); +} + +static void idxd_vdcm_init(struct vdcm_idxd *vidxd) +{ + struct idxd_wq *wq = vidxd->wq; + + 
INIT_LIST_HEAD(&vidxd->vwq.head); + + reset_vconfig(vidxd); + reset_vmmio(vidxd); + + vidxd->bar_size[0] = VIDXD_BAR0_SIZE; + vidxd->bar_size[1] = VIDXD_BAR2_SIZE; + + vidxd_mmio_init(vidxd); + + if (wq_dedicated(wq) && wq->state == IDXD_WQ_ENABLED) { + idxd_wq_disable(wq, false, NULL); + wq->state = IDXD_WQ_LOCKED; + } +} + +static void vidxd_unregister_ioasid_notifier(struct vdcm_idxd *vidxd) +{ + struct idxd_vdev *vdev = &vidxd->ivdev; + struct ioasid_mm_entry *mm_entry, *n; + struct mm_struct *mm; + + mm = get_task_mm(current); + if (!mm) + return; + + mutex_lock(&vdev->ioasid_lock); + + list_for_each_entry_safe(mm_entry, n, &vdev->mm_list, node) { + if (mm_entry->mm == mm) { + list_del(&mm_entry->node); + kfree(mm_entry); + ioasid_unregister_notifier_mm(mm, &vidxd->ivdev.pasid_nb); + break; + } + } + + mutex_unlock(&vdev->ioasid_lock); + mmput(mm); +} + +static int vidxd_source_pause_device(struct vdcm_idxd *vidxd) +{ + int i; + int rc; + u32 status; + + if (vidxd->paused) + return 0; + + mutex_lock(&vidxd->mig_submit_lock); + /* The VMM is expected to have unmap the portals. So once we drain + * there shouldn't be any work directly submited from the VM */ + vidxd->paused = true; + mutex_unlock(&vidxd->mig_submit_lock); + + /* For DWQs, pausing the vDSA can always be done by Drain WQ command. + * For SWQs, pausing the vDSA may mean Drain PASID if the SWQ is shared + * with other VMs. We will need to do Drain PASID for each PASID + * allocated to the VM which may take a long time. As an optimization, + * we may do Drain PASID if no of PASIDs for the VM is below certain + * number and do Drain WQ otherwise. + */ + /* Drain WQ(s) to make sure no more outstanding work in the dev */ + /* TODO: Currently support for only 1 WQ per VDev */ + for (i = 0; i < vidxd->num_wqs; i++) { + rc = idxd_wq_drain(vidxd->wq, &status); + + if (rc < 0) { + pr_info("%s: failed rc %d\n", __func__, rc); + return rc; + } + } + return 0; +} + +static void vidxd_free_resources (struct vdcm_idxd *vidxd) +{ + int i; + + /* Free the queued descriptors */ + for (i = 0; i < vidxd->num_wqs; i++) { + struct idxd_wq_desc_elem *el, *tmp; + struct idxd_virtual_wq *vwq = &vidxd->vwq; + + list_for_each_entry_safe(el, tmp, &vwq->head, link) { + list_del(&el->link); + vwq->ndescs--; + kfree(el); + } + } + +} + +static void vidxd_source_prepare_for_migration(struct vdcm_idxd *vidxd) +{ + int i; + struct vfio_pci_core_device *vdev = &vidxd->vfio_pdev; + struct vfio_device_migration_info *mig_info = + (struct vfio_device_migration_info *)vdev->mig_pages; + u8 *data_ptr = (u8 *)vdev->mig_pages; + unsigned int offset = mig_info->data_offset; + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + struct idxd_virtual_wq *vwq; + + memcpy(data_ptr + offset, vidxd->cfg, sizeof(vidxd->cfg)); + offset += sizeof(vidxd->cfg); + memcpy(data_ptr + offset, (u8 *)vidxd->bar_val, sizeof(vidxd->bar_val)); + offset += sizeof(vidxd->bar_val); + memcpy(data_ptr + offset, (u8 *)vidxd->bar_size, + sizeof(vidxd->bar_size)); + offset += sizeof(vidxd->bar_size); + memcpy(data_ptr + offset, (u8 *)&vidxd->bar0, sizeof(vidxd->bar0)); + offset += sizeof(vidxd->bar0); + + /* Save the queued descriptors */ + for (i = 0; i < vidxd->num_wqs; i++) { + struct idxd_wq_desc_elem *el; + + vwq = &vidxd->vwq; + memcpy(data_ptr + offset, (u8 *)&vwq->ndescs, sizeof(vwq->ndescs)); + offset += sizeof(vwq->ndescs); + list_for_each_entry(el, &vwq->head, link) { + dev_dbg(dev, "Saving descriptor at offset %x\n", offset); + memcpy(data_ptr + 
offset, (u8 *)el, sizeof(*el)); + offset += sizeof(*el); + } + } + + /* Save int handle info */ + for (i = 1; i < VIDXD_MAX_MSIX_VECS; i++) { + u32 ims_idx = dev_msi_hwirq(dev, i - 1); + + /* Save the current handle in use */ + dev_dbg(dev, "Saving handle %d at offset %x\n", ims_idx, offset); + memcpy(data_ptr + offset, (u8 *)&ims_idx, sizeof(ims_idx)); + offset += sizeof(ims_idx); + } + + mig_info->data_size = offset - mig_info->data_offset; + mig_info->pending_bytes = offset - mig_info->data_offset; + + dev_dbg(dev, "%s, mig_info->pending_bytes: 0x%llx, data_size: 0x%llx\n", + __func__, mig_info->pending_bytes, mig_info->data_size); +} + +static void vidxd_dest_prepare_for_migration(struct vdcm_idxd *vidxd) +{ + +} + +static int vidxd_resume_wq_state(struct vdcm_idxd *vidxd) +{ + struct idxd_wq *wq; + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + struct idxd_device *idxd = vidxd->idxd; + union wqcfg *vwqcfg, *wqcfg; + bool priv; + int wq_id; + int rc = 0; + u8 *bar0 = vidxd->bar0; + + dev_dbg(dev, "%s:%d numwqs %d\n", __func__, __LINE__, vidxd->num_wqs); + /* TODO: Currently support for only 1 WQ per VDev */ + for (wq_id = 0; wq_id < vidxd->num_wqs; wq_id++) { + wq = vidxd->wq; + dev_dbg(dev, "%s:%d wq %px\n", __func__, __LINE__, wq); + vwqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET); + wqcfg = wq->wqcfg; + + if (vidxd_state(vidxd) != 1 || vwqcfg->wq_state != 1) { + /* either VDEV or vWQ is disabled */ + if (wq_dedicated(wq) && wq->state == IDXD_WQ_ENABLED) + idxd_wq_disable(wq, false, NULL); + continue; + } else { + unsigned long flags; + printk("vidxd re-enable wq %u:%u\n", wq_id, wq->id); + + /* If dedicated WQ and PASID is not enabled, program + * the default PASID in the WQ PASID register */ + if (wq_dedicated(wq) && vwqcfg->mode_support) { + int wq_pasid, gpasid = -1; + + if (vwqcfg->pasid_en) { + gpasid = vwqcfg->pasid; + priv = vwqcfg->priv; + rc = idxd_mdev_get_host_pasid(mdev, + gpasid, &wq_pasid); + } else { + rc = idxd_mdev_get_pasid(mdev, + &wq_pasid); + priv = true; + } + + if (wq_pasid >= 0) { + u32 status; + + wqcfg->bits[WQCFG_PASID_IDX] &= + ~GENMASK(29, 8); + wqcfg->priv = priv; + wqcfg->pasid_en = 1; + wqcfg->pasid = wq_pasid; + dev_dbg(dev, "pasid %d:%d in wq %d\n", + gpasid, wq_pasid, wq->id); + spin_lock_irqsave(&idxd->dev_lock, + flags); + idxd_wq_setup_pasid(wq, wq_pasid); + idxd_wq_setup_priv(wq, priv); + spin_unlock_irqrestore(&idxd->dev_lock, + flags); + idxd_wq_enable(wq, &rc); + if (status) { + dev_err(dev, "resume wq failed\n"); + break;; + } + } + } else if (!wq_dedicated(wq) && vwqcfg->mode_support) { + wqcfg->bits[WQCFG_PASID_IDX] &= ~GENMASK(29, 8); + wqcfg->pasid_en = 1; + wqcfg->mode = 0; + spin_lock_irqsave(&idxd->dev_lock, flags); + idxd_wq_setup_pasid(wq, 0); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + idxd_wq_enable(wq, &rc); + if (rc) { + dev_err(dev, "resume wq %d failed\n", + wq->id); + break; + } + } + } + } + return rc; +} + +static unsigned int vidxd_dest_load_state(struct vdcm_idxd *vidxd) +{ + struct vfio_pci_core_device *vdev = &vidxd->vfio_pdev; + struct vfio_device_migration_info *mig_info = + (struct vfio_device_migration_info *)vdev->mig_pages; + u8 *data_ptr = (u8 *)vdev->mig_pages; + unsigned int offset = mig_info->data_offset; + + pr_info("%s, data_size: %llx, data_offset: 0x%llx\n", __func__, + mig_info->data_size, mig_info->data_offset); + + /* restore the state data to device */ + memcpy(vidxd->cfg, data_ptr + offset, sizeof(vidxd->cfg)); + offset += sizeof(vidxd->cfg); + 
memcpy((u8 *)vidxd->bar_val, data_ptr + offset, sizeof(vidxd->bar_val)); + offset += sizeof(vidxd->bar_val); + memcpy((u8 *)vidxd->bar_size, data_ptr + offset, + sizeof(vidxd->bar_size)); + offset += sizeof(vidxd->bar_size); + memcpy((u8 *)&vidxd->bar0, data_ptr + offset, sizeof(vidxd->bar0)); + offset += sizeof(vidxd->bar0); + //memcpy((u8 *)ims, data_ptr + offset, sizeof(vidxd->ims)); + //offset += sizeof(vidxd->ims); + + printk("Offset %x\n", offset); + return offset; +} + +static int vidxd_dest_int_handle_revocation (struct vdcm_idxd *vidxd, + unsigned int *offset) +{ + struct vfio_pci_core_device *vdev = &vidxd->vfio_pdev; + u8 *data_ptr = (u8 *)vdev->mig_pages; + u8 *bar0 = vidxd->bar0; + int i; + int rc = 0; + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + bool int_handle_revoked = false; + + /* Restore int handle info */ + for (i = 1; i < VIDXD_MAX_MSIX_VECS; i++) { + u32 perm_val, auxval; + u32 gpasid, pasid; + bool paside; + int ims_idx = dev_msi_hwirq(dev, i - 1); + int irq = dev_msi_irq_vector(dev, i - 1); + u32 revoked_handle; + + memcpy((u8 *)&revoked_handle, data_ptr + *offset, + sizeof(revoked_handle)); + *offset += sizeof(revoked_handle); + + pr_info("%s: %d new handle %x old handle %x\n", + __func__, i, ims_idx, revoked_handle); + + if (revoked_handle != ims_idx) { + /* Int Handle Revoked */ + int_handle_revoked = true; + } + + perm_val = *(u32 *)(bar0 + VIDXD_MSIX_PERM_OFFSET + i * 8); + + paside = (perm_val >> 3) & 1; + gpasid = (perm_val >> 12) & 0xfffff; + + if (paside) + rc = idxd_mdev_get_host_pasid(vidxd->ivdev.mdev, gpasid, &pasid); + else + rc = idxd_mdev_get_pasid(vidxd->ivdev.mdev, &pasid); + if (rc < 0) + return rc; + + auxval = ims_ctrl_pasid_aux(pasid, true); + + rc = irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval); + pr_info("%s: auxval %x rc %d\n", __func__, auxval, rc); + if (rc < 0) { + pr_info("set ims pasid failed rc %d\n", rc); + break; + } + } + + if (int_handle_revoked) + vidxd_notify_revoked_handles(vidxd); + + return rc; +} + +static int vidxd_resubmit_pending_descs (struct vdcm_idxd *vidxd, + unsigned int *offset) +{ + struct vfio_pci_core_device *vdev = &vidxd->vfio_pdev; + struct mdev_device *mdev = vidxd->ivdev.mdev; + u8 *data_ptr = (u8 *)vdev->mig_pages; + struct idxd_virtual_wq *vwq; + struct idxd_wq *wq; + int i; + + /* Submit the queued descriptors. 
The WQ state + * has been resumed by this point + */ + for (i = 0; i < vidxd->num_wqs; i++) { + void __iomem *portal; + struct idxd_wq_desc_elem el; + vwq = &vidxd->vwq; + wq = vidxd->wq; + + memcpy((u8 *)&vwq->ndescs, data_ptr + *offset, sizeof(vwq->ndescs)); + *offset += sizeof(vwq->ndescs); + + for (; vwq->ndescs > 0; vwq->ndescs--) { + printk("Descriptor at offset %x\n", *offset); + + memcpy((u8 *)&el, data_ptr + *offset, sizeof(el)); + *offset += sizeof(el); + + portal = wq->portal; + pr_info("submitting a desc to WQ %d:%d ded %d\n", + i, wq->id, wq_dedicated(wq)); + if (wq_dedicated(wq)) { + iosubmit_cmds512(portal, el.work_desc, 1); + } else { + int rc; + struct dsa_hw_desc *hw = + (struct dsa_hw_desc *)el.work_desc; + int hpasid, gpasid = hw->pasid; + + /* Translate the gpasid in the descriptor */ + rc = idxd_mdev_get_host_pasid(mdev, + gpasid, &hpasid); + if (rc < 0) { + pr_info("gpasid->hpasid trans failed\n"); + continue; + } + hw->pasid = hpasid; + rc = enqcmds(portal, el.work_desc); + if (rc < 0) { + pr_info("%s: enqcmds failed\n", __func__); + continue; + } + } + } + } + + return 0; +} + +static int vidxd_dest_complete_migration(struct vdcm_idxd *vidxd) +{ + int rc = 0; + unsigned int offset; + + offset = vidxd_dest_load_state(vidxd); + + rc = vidxd_resume_wq_state(vidxd); + + if (rc) { + pr_info("vidxd resume wq state failed %d\n", rc); + return rc; + } + + rc = vidxd_resubmit_pending_descs(vidxd, &offset); + + if (rc) { + pr_info("vidxd pending descs handling failed %d\n", rc); + return rc; + } + + rc = vidxd_dest_int_handle_revocation(vidxd, &offset); + + if (rc) { + pr_info("vidxd int handle revocation handling failed %d\n", rc); + return rc; + } + + return rc; +} + +static int vidxd_migration_state_change(struct vfio_pci_core_device *vfio_vdev, + u32 new_state) +{ + struct vdcm_idxd *vidxd = container_of(vfio_vdev, struct vdcm_idxd, vfio_pdev); + struct vfio_device_migration_info *mig_info = + (struct vfio_device_migration_info *) vfio_vdev->mig_pages; + int ret = 0; + + pr_info("%s, VFIO_DEVICE_STATE_MASK: 0x%x, new_state: 0x%x\n", + __func__, VFIO_DEVICE_STATE_MASK, new_state); + if (new_state & (~(VFIO_DEVICE_STATE_MASK))) { + pr_info("%s, invalid new device state, 0x%x!!\n", __func__, new_state); + return -EINVAL; + } + + switch (new_state) { + case 0: + pr_info("%s, __STOPPED !!\n", __func__); + vidxd_free_resources(vidxd); + break; + case VFIO_DEVICE_STATE_RUNNING: + pr_info("%s, VFIO_DEVICE_STATE_RUNNING!! old state %x\n", + __func__, mig_info->device_state); + if (mig_info->device_state & VFIO_DEVICE_STATE_RESUMING) + vidxd_dest_complete_migration(vidxd); + break; + case VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING: + pr_info("%s, VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING!!\n", __func__); + + break; + case VFIO_DEVICE_STATE_SAVING: + pr_info("%s, VFIO_DEVICE_STATE_SAVING!!\n", __func__); + /* Prepared the state data for migration */ + if (!(mig_info->device_state & VFIO_DEVICE_STATE_RUNNING)) + vidxd_source_prepare_for_migration(vidxd); + + /* Pause the virtual device. The vCPUs are still running. + * This happens just before the VM is paused. 
The vDEV + * is already in slow path */ + if (mig_info->device_state & VFIO_DEVICE_STATE_RUNNING) + vidxd_source_pause_device(vidxd); + break; + case VFIO_DEVICE_STATE_RESUMING: + /* Prepared the state restore for migration */ + vidxd_dest_prepare_for_migration(vidxd); + pr_info("%s, VFIO_DEVICE_STATE_RESUMING!!\n", __func__); + break; + default: + pr_info("%s, not handled new device state: 0x%x\n", __func__, new_state); + ret = -EINVAL; + } + return ret; +} + +static struct vfio_pci_migops vidxd_migops = { + .state_change = vidxd_migration_state_change, +}; + +static struct idxd_wq *find_any_dwq(struct idxd_device *idxd, struct vdcm_idxd_type *type) +{ + int i; + struct idxd_wq *wq; + unsigned long flags; + + switch (type->type) { + case IDXD_MDEV_TYPE_DSA_1_DWQ: + if (idxd->data->type != IDXD_TYPE_DSA) + return NULL; + break; + case IDXD_MDEV_TYPE_IAX_1_DWQ: + if (idxd->data->type != IDXD_TYPE_IAX) + return NULL; + break; + default: + return NULL; + } + + spin_lock_irqsave(&idxd->dev_lock, flags); + for (i = 0; i < idxd->max_wqs; i++) { + wq = idxd->wqs[i]; + + if (wq->state != IDXD_WQ_ENABLED && wq->state != IDXD_WQ_LOCKED) + continue; + + if (!is_idxd_wq_mdev(wq)) + continue; + + if (!wq_dedicated(wq)) + continue; + + if (idxd_wq_refcount(wq) != 0) + continue; + + spin_unlock_irqrestore(&idxd->dev_lock, flags); + mutex_lock(&wq->wq_lock); + idxd_wq_get(wq); + mutex_unlock(&wq->wq_lock); + return wq; + } + + spin_unlock_irqrestore(&idxd->dev_lock, flags); + return NULL; +} + +static int swq_lowest_client_count(struct idxd_device *idxd) +{ + struct idxd_wq *wq; + int i, count = -ENODEV; + + lockdep_assert_held(&idxd->dev_lock); + for (i = 0; i < idxd->max_wqs; i++) { + wq = idxd->wqs[i]; + + if (wq->state != IDXD_WQ_ENABLED) + continue; + + if (!is_idxd_wq_mdev(wq)) + continue; + + if (wq_dedicated(wq)) + continue; + + if (count == -ENODEV) + count = idxd_wq_refcount(wq); + else if (count > idxd_wq_refcount(wq)) + count = idxd_wq_refcount(wq); + } + + return count; +} + +static struct idxd_wq *find_any_swq(struct idxd_device *idxd, struct vdcm_idxd_type *type) +{ + int i, count; + struct idxd_wq *wq; + unsigned long flags; + + switch (type->type) { + case IDXD_MDEV_TYPE_DSA_1_SWQ: + if (idxd->data->type != IDXD_TYPE_DSA) + return NULL; + break; + case IDXD_MDEV_TYPE_IAX_1_SWQ: + if (idxd->data->type != IDXD_TYPE_IAX) + return NULL; + break; + default: + return NULL; + } + + spin_lock_irqsave(&idxd->dev_lock, flags); + count = swq_lowest_client_count(idxd); + if (count < 0) + goto out; + + for (i = 0; i < idxd->max_wqs; i++) { + wq = idxd->wqs[i]; + + if (wq->state != IDXD_WQ_ENABLED) + continue; + + if (!is_idxd_wq_mdev(wq)) + continue; + + if (wq_dedicated(wq)) + continue; + + /* + * Attempt to load balance the shared wq by round robin until on the lowest + * ref count for the wq. 
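+ * (swq_lowest_client_count() determined that minimum above; the first
+ * enabled mdev swq still at that count is claimed.)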
+ */ + if (idxd_wq_refcount(wq) != count) + continue; + + spin_unlock_irqrestore(&idxd->dev_lock, flags); + mutex_lock(&wq->wq_lock); + idxd_wq_get(wq); + mutex_unlock(&wq->wq_lock); + return wq; + } + + out: + spin_unlock_irqrestore(&idxd->dev_lock, flags); + return NULL; +} + +extern const struct vfio_pci_regops vfio_pci_dma_fault_regops; + +static struct vdcm_idxd *vdcm_vidxd_create(struct idxd_device *idxd, struct mdev_device *mdev, + struct vdcm_idxd_type *type) +{ + struct vdcm_idxd *vidxd; + struct device *dev = mdev_dev(mdev); + struct idxd_wq *wq = NULL; + int rc; + + switch (type->type) { + case IDXD_MDEV_TYPE_DSA_1_DWQ: + case IDXD_MDEV_TYPE_IAX_1_DWQ: + wq = find_any_dwq(idxd, type); + break; + case IDXD_MDEV_TYPE_DSA_1_SWQ: + case IDXD_MDEV_TYPE_IAX_1_SWQ: + wq = find_any_swq(idxd, type); + break; + default: + return ERR_PTR(-ENODEV); + } + + if (!wq) + return ERR_PTR(-ENODEV); + + vidxd = kzalloc(sizeof(*vidxd), GFP_KERNEL); + if (!vidxd) { + rc = -ENOMEM; + goto err; + } + + mutex_init(&vidxd->dev_lock); + vidxd->idxd = idxd; + vidxd->ivdev.mdev = mdev; + vidxd->wq = wq; + mdev_set_drvdata(mdev, vidxd); + vidxd->type = type; + vidxd->num_wqs = VIDXD_MAX_WQS; + dev_set_msi_domain(dev, idxd->ims_domain); + mutex_init(&vidxd->ivdev.ioasid_lock); + INIT_LIST_HEAD(&vidxd->ivdev.mm_list); + + idxd_vdcm_init(vidxd); + + mutex_init(&vidxd->vfio_pdev.igate); + vidxd->vfio_pdev.pdev = idxd->pdev; + rc = vfio_pci_dma_fault_init(&vidxd->vfio_pdev, false); + if (rc < 0) { + dev_err(dev, "dma fault region init failed\n"); + kfree(vidxd); + goto err; + } + + mdev_set_iommu_fault_data(mdev, &vidxd->vfio_pdev); + + vidxd->vfio_pdev.migops = &vidxd_migops; + rc = vfio_pci_migration_init(&vidxd->vfio_pdev, VIDXD_STATE_BUFFER_SIZE); + if (rc) + pr_err("%s, idxd migration region init failed!!!\n", __func__); + else + pr_info("%s, idxd migration region init successfully!!!\n", __func__); + + return vidxd; + + err: + mutex_lock(&wq->wq_lock); + idxd_wq_put(wq); + mutex_unlock(&wq->wq_lock); + return ERR_PTR(rc); +} + +static struct vdcm_idxd_type idxd_mdev_types[IDXD_MDEV_TYPES] = { + { + .name = idxd_dsa_1dwq_name, + .type = IDXD_MDEV_TYPE_DSA_1_DWQ, + }, + { + .name = idxd_iax_1dwq_name, + .type = IDXD_MDEV_TYPE_IAX_1_DWQ, + }, + { + .name = idxd_dsa_1swq_name, + .type = IDXD_MDEV_TYPE_DSA_1_SWQ, + }, + { + .name = idxd_iax_1swq_name, + .type = IDXD_MDEV_TYPE_IAX_1_SWQ, + }, +}; + +static struct vdcm_idxd_type *idxd_vdcm_get_type(struct mdev_device *mdev) +{ + return &idxd_mdev_types[mdev_get_type_group_id(mdev)]; +} + +static const struct vfio_device_ops idxd_mdev_ops; + +static int idxd_vdcm_probe(struct mdev_device *mdev) +{ + struct vdcm_idxd *vidxd; + struct vdcm_idxd_type *type; + struct device *dev, *parent; + struct idxd_device *idxd; + struct idxd_wq *wq; + int rc; + + parent = mdev_parent_dev(mdev); + idxd = dev_get_drvdata(parent); + dev = mdev_dev(mdev); + mdev_set_iommu_device(mdev, parent); + type = idxd_vdcm_get_type(mdev); + + vidxd = vdcm_vidxd_create(idxd, mdev, type); + if (IS_ERR(vidxd)) { + dev_err(dev, "failed to create vidxd: %ld\n", PTR_ERR(vidxd)); + return PTR_ERR(vidxd); + } + + vfio_init_group_dev(&vidxd->vdev, &mdev->dev, &idxd_mdev_ops); + wq = vidxd->wq; + dev_set_drvdata(dev, vidxd); + rc = vfio_register_group_dev(&vidxd->vdev); + if (rc < 0) { + mutex_lock(&wq->wq_lock); + idxd_wq_put(wq); + mutex_unlock(&wq->wq_lock); + kfree(vidxd); + return rc; + } + + mutex_lock(&wq->wq_lock); + list_add(&vidxd->list, &wq->vdcm_list); + mutex_unlock(&wq->wq_lock); + 
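+ /* The vdcm is now registered with VFIO and linked on the wq's vdcm_list. */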
dev_dbg(dev, "mdev creation success: %s\n", dev_name(mdev_dev(mdev))); + + return 0; +} + +static void idxd_vdcm_remove(struct mdev_device *mdev) +{ + struct vdcm_idxd *vidxd = mdev_get_drvdata(mdev); + struct idxd_device *idxd = vidxd->idxd; + struct device *dev = &idxd->pdev->dev; + struct idxd_wq *wq = vidxd->wq; + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + int i; + + dev_dbg(dev, "%s: removing for wq %d\n", __func__, vidxd->wq->id); + + for (i = 0; i < vfio_pdev->num_regions; i++) + vfio_pdev->region[i].ops->release(vfio_pdev, &vfio_pdev->region[i]); + vfio_pdev->num_regions = 0; + kfree(vfio_pdev->region); + vfio_pdev->region = NULL; + + for (i = 0; i < vfio_pdev->num_ext_irqs; i++) + vfio_pci_set_ext_irq_trigger(vfio_pdev, + VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, + VFIO_PCI_NUM_IRQS + i, 0, 0, NULL); + vfio_pdev->num_ext_irqs = 0; + kfree(vfio_pdev->ext_irqs); + vfio_pdev->ext_irqs = NULL; + + mutex_lock(&wq->wq_lock); + list_del(&vidxd->list); + idxd_wq_put(wq); + mutex_unlock(&wq->wq_lock); + + vfio_unregister_group_dev(&vidxd->vdev); + + kfree(vidxd); +} + +static int idxd_vdcm_open(struct vfio_device *vdev) +{ + struct vdcm_idxd *vidxd = vdev_to_vidxd(vdev); + int rc = -EINVAL; + struct vdcm_idxd_type *type = vidxd->type; + struct device *dev = vdev->dev; + struct vfio_group *vfio_group; + + dev_dbg(dev, "%s: type: %d\n", __func__, type->type); + + vfio_group = vfio_group_get_external_user_from_dev(dev); + if (IS_ERR_OR_NULL(vfio_group)) { + rc = -EFAULT; + goto out; + } + + rc = vidxd_register_ioasid_notifier(vidxd); + if (rc < 0) + goto ioasid_err; + + mutex_lock(&vidxd->dev_lock); + if (vidxd->refcount) + goto ioasid_err; + + vidxd->ivdev.vfio_group = vfio_group; + vidxd->refcount++; + + mutex_unlock(&vidxd->dev_lock); + return 0; + + ioasid_err: + vfio_group_put_external_user(vfio_group); + out: + mutex_unlock(&vidxd->dev_lock); + return rc; +} + +static void idxd_vdcm_close(struct vfio_device *vdev) +{ + struct vdcm_idxd *vidxd = vdev_to_vidxd(vdev); + + mutex_lock(&vidxd->dev_lock); + if (!vidxd->refcount) + goto out; + + vidxd_unregister_ioasid_notifier(vidxd); + idxd_vdcm_set_irqs(vidxd, VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, + VFIO_PCI_MSIX_IRQ_INDEX, 0, 0, NULL); + + if (vidxd->ivdev.vfio_group) { + vfio_group_put_external_user(vidxd->ivdev.vfio_group); + vidxd->ivdev.vfio_group = NULL; + } + + /* Re-initialize the VIDXD to a pristine state for re-use */ + idxd_vdcm_init(vidxd); + vidxd->refcount--; + vidxd->paused = false; + out: + mutex_unlock(&vidxd->dev_lock); +} + +static int vidxd_register_ioasid_notifier(struct vdcm_idxd *vidxd) +{ + struct idxd_vdev *vdev = &vidxd->ivdev; + struct ioasid_mm_entry *mm_entry; + struct mm_struct *mm; + int rc; + + mm = get_task_mm(current); + if (!mm) + return -ENODEV; + + mutex_lock(&vdev->ioasid_lock); + list_for_each_entry(mm_entry, &vdev->mm_list, node) { + if (mm_entry->mm == mm) { + mutex_unlock(&vdev->ioasid_lock); + return 0; + } + } + + mm_entry = kzalloc(sizeof(*mm_entry), GFP_KERNEL); + if (!mm_entry) { + rc = -ENOMEM; + goto err_alloc; + } + + mm_entry->mm = mm; + + vidxd->ivdev.pasid_nb.priority = IOASID_PRIO_DEVICE; + vidxd->ivdev.pasid_nb.notifier_call = idxd_mdev_ioasid_event; + rc = ioasid_register_notifier_mm(mm, &vidxd->ivdev.pasid_nb); + mmput(mm); + if (rc < 0) + goto err_ioasid; + + list_add(&mm_entry->node, &vdev->mm_list); + mutex_unlock(&vdev->ioasid_lock); + + return 0; + + err_ioasid: + kfree(mm_entry); + err_alloc: + mutex_unlock(&vdev->ioasid_lock); + 
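+ /*
+ * FIXME: the err_ioasid path has already dropped the mm reference right
+ * after ioasid_register_notifier_mm() above, so this mmput() only
+ * balances get_task_mm() for the err_alloc path.
+ */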
mmput(mm); + return rc; +} + +static ssize_t idxd_vdcm_rw(struct vfio_device *vdev, char *buf, size_t count, loff_t *ppos, + enum idxd_vdcm_rw mode) +{ + struct vdcm_idxd *vidxd = vdev_to_vidxd(vdev); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; + struct device *dev = vdev->dev; + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + int rc = -EINVAL; + + if (index >= VFIO_PCI_NUM_REGIONS + vfio_pdev->num_regions) { + dev_err(dev, "invalid index: %u\n", index); + return -EINVAL; + } + + switch (index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + if (mode == IDXD_VDCM_WRITE) + rc = vidxd_cfg_write(vidxd, pos, buf, count); + else + rc = vidxd_cfg_read(vidxd, pos, buf, count); + break; + case VFIO_PCI_BAR0_REGION_INDEX: + case VFIO_PCI_BAR1_REGION_INDEX: + if (mode == IDXD_VDCM_WRITE) + rc = vidxd_mmio_write(vidxd, vidxd->bar_val[0] + pos, buf, count); + else + rc = vidxd_mmio_read(vidxd, vidxd->bar_val[0] + pos, buf, count); + break; + case VFIO_PCI_BAR2_REGION_INDEX: + case VFIO_PCI_BAR3_REGION_INDEX: + if (mode == IDXD_VDCM_WRITE) { + rc = vidxd_portal_mmio_write(vidxd, + vidxd->bar_val[1] + pos, buf, count); + } else { + rc = vidxd_portal_mmio_read(vidxd, + vidxd->bar_val[1] + pos, buf, count); + } + break; + + case VFIO_PCI_BAR4_REGION_INDEX: + case VFIO_PCI_BAR5_REGION_INDEX: + case VFIO_PCI_VGA_REGION_INDEX: + case VFIO_PCI_ROM_REGION_INDEX: + dev_err(dev, "unsupported region: %u\n", index); + break; + + default: + dev_dbg(dev, "vendor specific region: %u\n", index); + index -= VFIO_PCI_NUM_REGIONS; + return vfio_pdev->region[index].ops->rw(vfio_pdev, buf, count, ppos, mode); + } + + return rc == 0 ? count : rc; +} + +static ssize_t idxd_vdcm_read(struct vfio_device *vdev, char __user *buf, size_t count, + loff_t *ppos) +{ + struct vdcm_idxd *vidxd = vdev_to_vidxd(vdev); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + unsigned int done = 0; + int rc; + + switch (index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + case VFIO_PCI_BAR0_REGION_INDEX: + case VFIO_PCI_BAR1_REGION_INDEX: + case VFIO_PCI_BAR2_REGION_INDEX: + case VFIO_PCI_BAR3_REGION_INDEX: + case VFIO_PCI_BAR4_REGION_INDEX: + case VFIO_PCI_BAR5_REGION_INDEX: + case VFIO_PCI_VGA_REGION_INDEX: + case VFIO_PCI_ROM_REGION_INDEX: + break; + default: { + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + struct device *dev = vdev->dev; + + dev_dbg(dev, "vendor specific region: %u\n", index); + index -= VFIO_PCI_NUM_REGIONS; + return vfio_pdev->region[index].ops->rw(vfio_pdev, buf, count, ppos, false); + } /* end default */ + } /* end switch(index) */ + + mutex_lock(&vidxd->dev_lock); + while (count) { + size_t filled; + + if (count >= 4 && !(*ppos % 4)) { + u32 val; + + rc = idxd_vdcm_rw(vdev, (char *)&val, sizeof(val), + ppos, IDXD_VDCM_READ); + if (rc <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + rc = idxd_vdcm_rw(vdev, (char *)&val, sizeof(val), + ppos, IDXD_VDCM_READ); + if (rc <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 2; + } else { + u8 val; + + rc = idxd_vdcm_rw(vdev, &val, sizeof(val), ppos, + IDXD_VDCM_READ); + if (rc <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 1; + } + + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + mutex_unlock(&vidxd->dev_lock); + return done; + + read_err: + 
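+ /* Bytes already copied for this request are not reported back; the whole
+ * read fails with -EFAULT.
+ */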
mutex_unlock(&vidxd->dev_lock); + return -EFAULT; +} + +static ssize_t idxd_vdcm_write(struct vfio_device *vdev, const char __user *buf, size_t count, + loff_t *ppos) +{ + struct vdcm_idxd *vidxd = vdev_to_vidxd(vdev); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + unsigned int done = 0; + int rc; + + switch (index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + case VFIO_PCI_BAR0_REGION_INDEX: + case VFIO_PCI_BAR1_REGION_INDEX: + case VFIO_PCI_BAR2_REGION_INDEX: + case VFIO_PCI_BAR3_REGION_INDEX: + case VFIO_PCI_BAR4_REGION_INDEX: + case VFIO_PCI_BAR5_REGION_INDEX: + case VFIO_PCI_VGA_REGION_INDEX: + case VFIO_PCI_ROM_REGION_INDEX: + break; + default: { + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + struct device *dev = vdev->dev; + + dev_dbg(dev, "vendor specific region: %u\n", index); + index -= VFIO_PCI_NUM_REGIONS; + return vfio_pdev->region[index].ops->rw(vfio_pdev, buf, count, ppos, true); + } /* end default */ + } /* end switch(index) */ + + mutex_lock(&vidxd->dev_lock); + while (count) { + size_t filled; + + if (count >= 4 && !(*ppos % 4)) { + u32 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + rc = idxd_vdcm_rw(vdev, (char *)&val, sizeof(val), + ppos, IDXD_VDCM_WRITE); + if (rc <= 0) + goto write_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + rc = idxd_vdcm_rw(vdev, (char *)&val, + sizeof(val), ppos, IDXD_VDCM_WRITE); + if (rc <= 0) + goto write_err; + + filled = 2; + } else { + u8 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + rc = idxd_vdcm_rw(vdev, &val, sizeof(val), + ppos, IDXD_VDCM_WRITE); + if (rc <= 0) + goto write_err; + + filled = 1; + } + + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + mutex_unlock(&vidxd->dev_lock); + return done; + +write_err: + mutex_unlock(&vidxd->dev_lock); + return -EFAULT; +} + +static int idxd_vdcm_mmap(struct vfio_device *vdev, struct vm_area_struct *vma) +{ + unsigned int wq_idx, index; + unsigned long req_size, pgoff = 0, offset; + pgprot_t pg_prot; + struct vdcm_idxd *vidxd = vdev_to_vidxd(vdev); + struct idxd_device *idxd = vidxd->idxd; + struct idxd_wq *wq = vidxd->wq; + enum idxd_portal_prot virt_portal, phys_portal; + phys_addr_t base = pci_resource_start(idxd->pdev, IDXD_WQ_BAR); + struct device *dev = vdev->dev; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + if (index >= VFIO_PCI_NUM_REGIONS) { + int regnum = index - VFIO_PCI_NUM_REGIONS; + struct vfio_pci_region *region = vidxd->vfio_pdev.region + regnum; + + if (region && region->ops && region->ops->mmap && + (region->flags & VFIO_REGION_INFO_FLAG_MMAP)) + return region->ops->mmap(&vidxd->vfio_pdev, region, vma); + + return -EINVAL; + } + + pg_prot = vma->vm_page_prot; + req_size = vma->vm_end - vma->vm_start; + if (req_size > PAGE_SIZE) + return -EINVAL; + + vma->vm_flags |= VM_DONTCOPY; + + offset = (vma->vm_pgoff << PAGE_SHIFT) & + ((1ULL << VFIO_PCI_OFFSET_SHIFT) - 1); + + wq_idx = offset >> (PAGE_SHIFT + 2); + if (wq_idx >= 1) { + dev_err(dev, "mapping invalid wq %d off %lx\n", + wq_idx, offset); + return -EINVAL; + } + + /* + * Check and see if the guest wants to map to the limited or unlimited portal. + * The driver will allow mapping to unlimited portal only if the wq is a + * dedicated wq. Otherwise, it goes to limited. 
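+ * Within each WQ's four-page window, page 0 is the unlimited portal and
+ * page 1 the limited portal, matching the sparse-mmap layout advertised
+ * by VFIO_DEVICE_GET_REGION_INFO for BAR2.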
+ */ + virt_portal = ((offset >> PAGE_SHIFT) & 0x3) == 1; + phys_portal = IDXD_PORTAL_LIMITED; + if (virt_portal == IDXD_PORTAL_UNLIMITED && wq_dedicated(wq)) + phys_portal = IDXD_PORTAL_UNLIMITED; + + /* We always map IMS portals to the guest */ + pgoff = (base + idxd_get_wq_portal_full_offset(wq->id, phys_portal, + IDXD_IRQ_IMS)) >> PAGE_SHIFT; + + dev_dbg(dev, "mmap %lx %lx %lx %lx\n", vma->vm_start, pgoff, req_size, + pgprot_val(pg_prot)); + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_pgoff = pgoff; + + return remap_pfn_range(vma, vma->vm_start, pgoff, req_size, pg_prot); +} + +static void vidxd_vdcm_reset(struct vdcm_idxd *vidxd) +{ + vidxd_reset(vidxd); +} + +static irqreturn_t idxd_vdcm_msix_handler(int irq, void *arg) +{ + struct vfio_pci_irq_ctx *ctx = (struct vfio_pci_irq_ctx *)arg; + + eventfd_signal(ctx->trigger, 1); + return IRQ_HANDLED; +} + +static void idxd_vdcm_free_irq (struct vfio_pci_core_device *vfio_pdev, int vector, int irq) +{ + u32 auxval; + if (irq) { + irq_bypass_unregister_producer(&vfio_pdev->ctx[vector].producer); + free_irq(irq, &vfio_pdev->ctx[vector]); + auxval = ims_ctrl_pasid_aux(0, false); + irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval); + } + kfree(vfio_pdev->ctx[vector].name); + vfio_pdev->ctx[vector].name = NULL; +} + +static int idxd_vdcm_msix_set_vector_signal(struct vdcm_idxd *vidxd, int vector, int fd) +{ + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + struct eventfd_ctx *trigger; + char *name; + u32 pasid, auxval; + int irq, rc; + + dev_dbg(dev, "%s: set signal %d fd: %d\n", __func__, vector, fd); + + if (vector < 0 || vector >= vfio_pdev->num_ctx) { + dev_warn(dev, "%s out of boundary\n", __func__); + return -EINVAL; + } + + irq = vector ? 
dev_msi_irq_vector(dev, vector - 1) : 0; + + dev_dbg(dev, "%s: irq: %d\n", __func__, irq); + + if (vfio_pdev->ctx[vector].trigger) { + if (irq) + irq_bypass_unregister_producer(&vfio_pdev->ctx[vector].producer); + + eventfd_ctx_put(vfio_pdev->ctx[vector].trigger); + + if (fd < 0) { + dev_dbg(dev, "%s: trigger already set, freeing\n", __func__); + idxd_vdcm_free_irq(vfio_pdev, vector, irq); + return 0; + } + dev_dbg(dev, "%s: trigger already set, changing\n", __func__); + trigger = eventfd_ctx_fdget(fd); + if (IS_ERR(trigger)) { + dev_dbg(dev, "%s: trigger change failed, freeing\n", __func__); + idxd_vdcm_free_irq(vfio_pdev, vector, irq); + vfio_pdev->ctx[vector].trigger = NULL; + return PTR_ERR(trigger); + } + vfio_pdev->ctx[vector].trigger = trigger; + + if (irq) { + /* Update IRQ Bypass Setting */ + vfio_pdev->ctx[vector].producer.token = trigger; + vfio_pdev->ctx[vector].producer.irq = irq; + rc = irq_bypass_register_producer(&vfio_pdev->ctx[vector].producer); + if (unlikely(rc)) { + dev_warn(dev, "irq bypass producer (token %p) registration fails: %d\n", + vfio_pdev->ctx[vector].producer.token, rc); + vfio_pdev->ctx[vector].producer.token = NULL; + } + } + return 0; + } + + if (fd < 0) + return 0; + + name = kasprintf(GFP_KERNEL, "vfio-dev-ims[%d](%s)", vector, dev_name(dev)); + if (!name) + return -ENOMEM; + + trigger = eventfd_ctx_fdget(fd); + if (IS_ERR(trigger)) { + kfree(name); + return PTR_ERR(trigger); + } + + vfio_pdev->ctx[vector].name = name; + vfio_pdev->ctx[vector].trigger = trigger; + + dev_dbg(dev, "%s: trigger: %px\n", __func__, trigger); + + if (!irq) { + dev_dbg(dev, "Mediated vector 0 set\n"); + return 0; + } + + rc = idxd_mdev_get_pasid(mdev, &pasid); + if (rc < 0) { + dev_warn(dev, "%s unable to get pasid, failing\n", __func__); + goto err; + } + + dev_dbg(dev, "%s: pasid: %d\n", __func__, pasid); + + auxval = ims_ctrl_pasid_aux(pasid, true); + rc = irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval); + if (rc < 0) { + dev_warn(dev, "%s: set IMS aux data failed: %d\n", __func__, rc); + goto err; + } + + rc = request_irq(irq, idxd_vdcm_msix_handler, 0, name, &vfio_pdev->ctx[vector]); + if (rc < 0) { + dev_warn(dev, "%s request_irq() failed\n", __func__); + goto irq_err; + } + + vfio_pdev->ctx[vector].producer.token = trigger; + vfio_pdev->ctx[vector].producer.irq = irq; + rc = irq_bypass_register_producer(&vfio_pdev->ctx[vector].producer); + if (unlikely(rc)) { + dev_warn(dev, "irq bypass producer (token %p) registration fails: %d\n", + vfio_pdev->ctx[vector].producer.token, rc); + vfio_pdev->ctx[vector].producer.token = NULL; + } + + return 0; + + irq_err: + auxval = ims_ctrl_pasid_aux(0, false); + irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval); + err: + kfree(name); + vfio_pdev->ctx[vector].name = NULL; + eventfd_ctx_put(trigger); + vfio_pdev->ctx[vector].trigger = NULL; + return rc; +} + +static int idxd_vdcm_msix_set_vector_signals(struct vdcm_idxd *vidxd, u32 start, + u32 count, int *fds) +{ + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + int i, j, rc = 0; + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + + if (start >= vfio_pdev->num_ctx || start + count > vfio_pdev->num_ctx) { + dev_warn(dev, "%s out of boundary\n", __func__); + return -EINVAL; + } + + for (i = 0, j = start; i < count && !rc; i++, j++) { + int fd = fds ? fds[i] : -1; + + dev_dbg(dev, "%s: %s signal %d, fd: %d\n", + __func__, (fd == -1) ? 
"unset" : "set", j, fd); + rc = idxd_vdcm_msix_set_vector_signal(vidxd, j, fd); + } + + if (rc) { + dev_warn(dev, "%s: set signal failed, unwind\n", __func__); + for (--j; j >= (int)start; j--) + idxd_vdcm_msix_set_vector_signal(vidxd, j, -1); + } + + return rc; +} + +static int idxd_vdcm_msix_enable(struct vdcm_idxd *vidxd, int nvec) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + int rc; + + dev_dbg(dev, "%s: nvec: %d\n", __func__, nvec); + + /* There should be at least 1 vectors for idxd */ + if (nvec < 1) + return -EINVAL; + + dev_dbg(dev, "%s: allocating\n", __func__); + vfio_pdev->ctx = kcalloc(nvec, sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL); + if (!vfio_pdev->ctx) { + dev_warn(dev, "%s: failed to alloc VFIO irq context\n", __func__); + return -ENOMEM; + } + + if (nvec > 1) { + dev_dbg(dev, "%s: allocate %d IMS\n", __func__, nvec - 1); + rc = msi_domain_alloc_irqs(dev_get_msi_domain(dev), dev, nvec - 1); + if (rc < 0) { + dev_warn(dev, "%s failed to allocate irq on IMS domain: %d\n", + __func__, rc); + kfree(vfio_pdev->ctx); + return rc; + } + } + + vfio_pdev->num_ctx = nvec; + vfio_pdev->irq_type = VFIO_PCI_MSIX_IRQ_INDEX; + return 0; +} + +static int idxd_vdcm_msix_disable(struct vdcm_idxd *vidxd) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + struct irq_domain *irq_domain; + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + + /* Check if somebody already disabled it */ + if (vfio_pdev->num_ctx == 0) + return 0; + + idxd_vdcm_msix_set_vector_signals(vidxd, 0, vfio_pdev->num_ctx, NULL); + irq_domain = dev_get_msi_domain(dev); + if (irq_domain) + msi_domain_free_irqs(irq_domain, dev); + kfree(vfio_pdev->ctx); + vfio_pdev->num_ctx = 0; + vfio_pdev->irq_type = VFIO_PCI_NUM_IRQS; + return 0; +} + +static int idxd_vdcm_set_msix_trigger(struct vdcm_idxd *vidxd, u32 index, u32 start, + u32 count, u32 flags, void *data) +{ + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + int rc, i; + + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + + dev_dbg(dev, "%s(index: %d start: %d count: %d flags: %d data: %px\n", + __func__, index, start, count, flags, data); + + if (count > VIDXD_MAX_MSIX_VECS) + count = VIDXD_MAX_MSIX_VECS; + + if (!count && (flags & VFIO_IRQ_SET_DATA_NONE)) { + dev_dbg(dev, "%s disabling\n", __func__); + idxd_vdcm_msix_disable(vidxd); + return 0; + } + + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + int *fds = data; + + if (vfio_pdev->irq_type == index) { + dev_dbg(dev, "%s straight set signal\n", __func__); + return idxd_vdcm_msix_set_vector_signals(vidxd, start, count, fds); + } + + rc = idxd_vdcm_msix_enable(vidxd, start + count); + if (rc < 0) + return rc; + + rc = idxd_vdcm_msix_set_vector_signals(vidxd, start, count, fds); + if (rc < 0) + idxd_vdcm_msix_disable(vidxd); + + return rc; + } + + if (start + count > VIDXD_MAX_MSIX_VECS) + return -EINVAL; + + for (i = start; i < start + count; i++) { + if (!vfio_pdev->ctx[i].trigger) + continue; + if (flags & VFIO_IRQ_SET_DATA_NONE) { + eventfd_signal(vfio_pdev->ctx[i].trigger, 1); + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + u8 *bools = data; + + if (bools[i - start]) + eventfd_signal(vfio_pdev->ctx[i].trigger, 1); + } + } + return 0; +} + + +static int idxd_vdcm_set_ctx_trigger_single(struct eventfd_ctx **ctx, + unsigned int count, u32 flags, void *data) +{ + /* DATA_NONE/DATA_BOOL enables loopback testing */ + 
if (flags & VFIO_IRQ_SET_DATA_NONE) { + if (*ctx) { + if (count) { + eventfd_signal(*ctx, 1); + } else { + eventfd_ctx_put(*ctx); + *ctx = NULL; + } + return 0; + } + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + u8 trigger; + + if (!count) + return -EINVAL; + + trigger = *(u8 *)data; + if (trigger && *ctx) + eventfd_signal(*ctx, 1); + + return 0; + } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + s32 fd; + + if (!count) + return -EINVAL; + + fd = *(s32 *)data; + if (fd == -1) { + if (*ctx) + eventfd_ctx_put(*ctx); + *ctx = NULL; + } else if (fd >= 0) { + struct eventfd_ctx *efdctx; + + efdctx = eventfd_ctx_fdget(fd); + if (IS_ERR(efdctx)) + return PTR_ERR(efdctx); + + if (*ctx) + eventfd_ctx_put(*ctx); + + *ctx = efdctx; + } + return 0; + } + + return -EINVAL; +} + +static int idxd_vdcm_set_req_trigger(struct mdev_device *mdev, unsigned int index, + unsigned int start, unsigned int count, + u32 flags, void *data) +{ + if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count > 1) + return -EINVAL; + + return idxd_vdcm_set_ctx_trigger_single(&mdev->req_trigger, count, flags, data); +} + +static int idxd_vdcm_set_irqs(struct vdcm_idxd *vidxd, uint32_t flags, + unsigned int index, unsigned int start, + unsigned int count, void *data) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + + dev_dbg(dev, "%s: flags: %#x index: %#x, start: %#x, count: %#x, data: %px\n", + __func__, flags, index, start, count, data); + + switch (index) { + case VFIO_PCI_MSIX_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_MASK: + case VFIO_IRQ_SET_ACTION_UNMASK: + break; + case VFIO_IRQ_SET_ACTION_TRIGGER: + return idxd_vdcm_set_msix_trigger(vidxd, index, start, count, flags, data); + } + break; + case VFIO_PCI_REQ_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_TRIGGER: + return idxd_vdcm_set_req_trigger(mdev, index, start, count, flags, data); + } + break; + default: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_TRIGGER: + return vfio_pci_set_ext_irq_trigger(vfio_pdev, index, start, + count, flags, data); + } + break; + } + + return -ENOTTY; +} + +static long idxd_vdcm_ioctl(struct vfio_device *vdev, unsigned int cmd, + unsigned long arg) +{ + struct vdcm_idxd *vidxd = vdev_to_vidxd(vdev); + unsigned long minsz; + int rc = -EINVAL; + struct device *dev = vdev->dev; + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + struct mdev_device *mdev = vidxd->ivdev.mdev; + + dev_dbg(dev, "vidxd %p ioctl, cmd: %d\n", vidxd, cmd); + + mutex_lock(&vidxd->dev_lock); + if (cmd == VFIO_DEVICE_GET_INFO) { + struct vfio_device_info info; + + minsz = offsetofend(struct vfio_device_info, num_irqs); + + if (copy_from_user(&info, (void __user *)arg, minsz)) { + rc = -EFAULT; + goto out; + } + + if (info.argsz < minsz) { + rc = -EINVAL; + goto out; + } + + info.flags = VFIO_DEVICE_FLAGS_PCI; + info.flags |= VFIO_DEVICE_FLAGS_RESET; + info.num_regions = VFIO_PCI_NUM_REGIONS + vfio_pdev->num_regions; + info.num_irqs = VFIO_PCI_NUM_IRQS + vfio_pdev->num_ext_irqs; + + if (copy_to_user((void __user *)arg, &info, minsz)) + rc = -EFAULT; + else + rc = 0; + goto out; + } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { + struct vfio_region_info info; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + struct vfio_region_info_cap_sparse_mmap *sparse = NULL; + size_t size; + int nr_areas = 1; + int cap_type_id = 
0; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, (void __user *)arg, minsz)) { + rc = -EFAULT; + goto out; + } + + if (info.argsz < minsz) { + rc = -EINVAL; + goto out; + } + + switch (info.index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = VIDXD_MAX_CFG_SPACE_SZ; + info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR0_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vidxd->bar_size[info.index]; + if (!info.size) { + info.flags = 0; + break; + } + + info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR1_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0; + info.flags = 0; + break; + case VFIO_PCI_BAR2_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.flags = VFIO_REGION_INFO_FLAG_CAPS | VFIO_REGION_INFO_FLAG_MMAP | + VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_DYNAMIC_TRAP; + info.size = vidxd->bar_size[1]; + + /* + * Every WQ has two areas for unlimited and limited + * MSI-X portals. IMS portals are not reported. For shared + * WQ, we will only allow limited portal. + */ + nr_areas = wq_dedicated(vidxd->wq) ? 2 : 1; + + size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas)); + sparse = kzalloc(size, GFP_KERNEL); + if (!sparse) { + rc = -ENOMEM; + goto out; + } + + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->nr_areas = nr_areas; + cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + + /* Unlimited portal */ + if (wq_dedicated(vidxd->wq)) { + sparse->areas[0].offset = 0; + sparse->areas[0].size = PAGE_SIZE; + sparse->areas[1].offset = PAGE_SIZE; + sparse->areas[1].size = PAGE_SIZE; + } else { + /* Limited portal */ + sparse->areas[0].offset = PAGE_SIZE; + sparse->areas[0].size = PAGE_SIZE; + } + + break; + + case VFIO_PCI_BAR3_REGION_INDEX ... 
VFIO_PCI_BAR5_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0; + info.flags = 0; + dev_dbg(dev, "get region info bar:%d\n", info.index); + break; + + case VFIO_PCI_ROM_REGION_INDEX: + case VFIO_PCI_VGA_REGION_INDEX: + dev_dbg(dev, "get region info index:%d\n", info.index); + break; + default: { + struct vfio_region_info_cap_type cap_type = { + .header.id = VFIO_REGION_INFO_CAP_TYPE, + .header.version = 1, + }; + int i; + + if (info.index >= VFIO_PCI_NUM_REGIONS + vfio_pdev->num_regions) { + rc = -EINVAL; + goto out; + } + + info.index = array_index_nospec(info.index, + VFIO_PCI_NUM_REGIONS + + vfio_pdev->num_regions); + i = info.index - VFIO_PCI_NUM_REGIONS; + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vfio_pdev->region[i].size; + info.flags = vfio_pdev->region[i].flags; + + cap_type.type = vfio_pdev->region[i].type; + cap_type.subtype = vfio_pdev->region[i].subtype; + + rc = vfio_info_add_capability(&caps, &cap_type.header, sizeof(cap_type)); + if (rc) + goto out; + + if (vfio_pdev->region[i].ops->add_capability) { + rc = vfio_pdev->region[i].ops->add_capability(vfio_pdev, + &vfio_pdev->region[i], + &caps); + if (rc) + goto out; + } + } /* default */ + } /* info.index switch */ + + if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) { + if (cap_type_id == VFIO_REGION_INFO_CAP_SPARSE_MMAP) { + rc = vfio_info_add_capability(&caps, &sparse->header, + sizeof(*sparse) + (sparse->nr_areas * + sizeof(*sparse->areas))); + kfree(sparse); + if (rc) + goto out; + } + } + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + sizeof(info), + caps.buf, caps.size)) { + kfree(caps.buf); + rc = -EFAULT; + goto out; + } + info.cap_offset = sizeof(info); + } + + kfree(caps.buf); + } + if (copy_to_user((void __user *)arg, &info, minsz)) + rc = -EFAULT; + else + rc = 0; + goto out; + } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { + struct vfio_irq_info info; + struct vfio_info_cap caps = { + .buf = NULL, + .size = 0 + }; + unsigned long capsz; + + minsz = offsetofend(struct vfio_irq_info, count); + capsz = offsetofend(struct vfio_irq_info, cap_offset); + + if (copy_from_user(&info, (void __user *)arg, minsz)) { + rc = -EFAULT; + goto out; + } + + if (info.argsz < minsz || + info.index >= VFIO_PCI_NUM_IRQS + vfio_pdev->num_ext_irqs) { + rc = -EINVAL; + goto out; + } + + if (info.argsz >= capsz) + minsz = capsz; + + info.flags = VFIO_IRQ_INFO_EVENTFD; + + switch (info.index) { + case VFIO_PCI_INTX_IRQ_INDEX: + case VFIO_PCI_MSI_IRQ_INDEX: + case VFIO_PCI_ERR_IRQ_INDEX: + rc = -EINVAL; + goto out; + case VFIO_PCI_MSIX_IRQ_INDEX: + case VFIO_PCI_REQ_IRQ_INDEX: + info.flags |= VFIO_IRQ_INFO_NORESIZE; + break; + default: { + struct vfio_irq_info_cap_type cap_type = { + .header.id = VFIO_IRQ_INFO_CAP_TYPE, + .header.version = 1 + }; + int i; + + if (info.index >= VFIO_PCI_NUM_IRQS + vfio_pdev->num_ext_irqs) + return -EINVAL; + info.index = array_index_nospec(info.index, + VFIO_PCI_NUM_IRQS + vfio_pdev->num_ext_irqs); + i = info.index - VFIO_PCI_NUM_IRQS; + + info.flags = vfio_pdev->ext_irqs[i].flags; + cap_type.type = vfio_pdev->ext_irqs[i].type; + cap_type.subtype = vfio_pdev->ext_irqs[i].subtype; + + rc = vfio_info_add_capability(&caps, &cap_type.header, sizeof(cap_type)); + if (rc) + goto out; + break; + } + } /* switch(info.index) */ + + 
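+ /* Report the vector count for this index and append any capability
+ * chain built above.
+ */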
info.count = idxd_vdcm_get_irq_count(mdev, info.index); + if (caps.size) { + info.flags |= VFIO_IRQ_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + sizeof(info), caps.buf, + caps.size)) { + kfree(caps.buf); + return -EFAULT; + } + info.cap_offset = sizeof(info); + } + kfree(caps.buf); + } + + rc = copy_to_user((void __user *)arg, &info, minsz); + rc = rc ? -EFAULT : 0; + goto out; + } else if (cmd == VFIO_DEVICE_SET_IRQS) { + struct vfio_irq_set hdr; + u8 *data = NULL; + size_t data_size = 0; + int max; + + minsz = offsetofend(struct vfio_irq_set, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) { + rc = -EFAULT; + goto out; + } + + max = idxd_vdcm_get_irq_count(mdev, hdr.index); + rc = vfio_set_irqs_validate_and_prepare(&hdr, max, + VFIO_PCI_NUM_IRQS + + vfio_pdev->num_ext_irqs, + &data_size); + if (rc) { + dev_err(dev, "intel:vfio_set_irqs_validate_and_prepare failed\n"); + goto out; + } + + if (data_size) { + data = memdup_user((void __user *)(arg + minsz), data_size); + if (IS_ERR(data)) { + rc = PTR_ERR(data); + goto out; + } + } + mutex_lock(&vidxd->vfio_pdev.igate); + rc = idxd_vdcm_set_irqs(vidxd, hdr.flags, hdr.index, hdr.start, hdr.count, data); + mutex_unlock(&vidxd->vfio_pdev.igate); + kfree(data); + goto out; + } else if (cmd == VFIO_DEVICE_RESET) { + vidxd_vdcm_reset(vidxd); + } + + out: + mutex_unlock(&vidxd->dev_lock); + return rc; +} + +static void idxd_vdcm_mdev_request(struct vfio_device *vdev, unsigned int count) +{ + struct vdcm_idxd *vidxd = vdev_to_vidxd(vdev); + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + struct mdev_device *mdev = vidxd->ivdev.mdev; + + mutex_lock(&vfio_pdev->igate); + if (mdev->req_trigger) { + if (!(count % 10)) + dev_info_ratelimited(mdev_dev(mdev), + "Relaying device request to user (#%u)\n", + count); + eventfd_signal(mdev->req_trigger, 1); + } else if (count == 0) { + dev_warn(mdev_dev(mdev), + "No device request channel registered, blocked until released by user\n"); + } + + mutex_unlock(&vfio_pdev->igate); +} + +static ssize_t name_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", idxd_mdev_types[mtype_get_type_group_id(mtype)].name); +} +static MDEV_TYPE_ATTR_RO(name); + +static int find_available_mdev_instances(struct idxd_device *idxd, struct vdcm_idxd_type *type) +{ + int count = 0, i; + unsigned long flags; + + switch (type->type) { + case IDXD_MDEV_TYPE_DSA_1_DWQ: + case IDXD_MDEV_TYPE_DSA_1_SWQ: + if (idxd->data->type != IDXD_TYPE_DSA) + return 0; + break; + case IDXD_MDEV_TYPE_IAX_1_DWQ: + case IDXD_MDEV_TYPE_IAX_1_SWQ: + if (idxd->data->type != IDXD_TYPE_IAX) + return 0; + break; + default: + return 0; + } + + spin_lock_irqsave(&idxd->dev_lock, flags); + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq; + + wq = idxd->wqs[i]; + + if (wq->state != IDXD_WQ_ENABLED) + continue; + + if (!is_idxd_wq_mdev(wq)) + continue; + + switch (type->type) { + case IDXD_MDEV_TYPE_DSA_1_DWQ: + case IDXD_MDEV_TYPE_IAX_1_DWQ: + if (wq_dedicated(wq) && !idxd_wq_refcount(wq)) + count++; + break; + case IDXD_MDEV_TYPE_DSA_1_SWQ: + case IDXD_MDEV_TYPE_IAX_1_SWQ: + if (!wq_dedicated(wq)) + count++; + break; + default: + return 0; + } + } + spin_unlock_irqrestore(&idxd->dev_lock, flags); + + return count; +} + +static ssize_t available_instances_show(struct mdev_type *mtype, + struct 
mdev_type_attribute *attr, + char *buf) +{ + struct device *dev = mtype_get_parent_dev(mtype); + struct idxd_device *idxd = dev_get_drvdata(dev); + int count; + struct vdcm_idxd_type *type; + + type = &idxd_mdev_types[mtype_get_type_group_id(mtype)]; + count = find_available_mdev_instances(idxd, type); + + return sprintf(buf, "%d\n", count); +} +static MDEV_TYPE_ATTR_RO(available_instances); + +static ssize_t device_api_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); +} +static MDEV_TYPE_ATTR_RO(device_api); + +static struct attribute *idxd_mdev_types_attrs[] = { + &mdev_type_attr_name.attr, + &mdev_type_attr_device_api.attr, + &mdev_type_attr_available_instances.attr, + NULL, +}; + +static struct attribute_group idxd_mdev_type_dsa_group0 = { + .name = idxd_dsa_1dwq_name, + .attrs = idxd_mdev_types_attrs, +}; + +static struct attribute_group idxd_mdev_type_iax_group0 = { + .name = idxd_iax_1dwq_name, + .attrs = idxd_mdev_types_attrs, +}; + +static struct attribute_group idxd_mdev_type_dsa_group1 = { + .name = idxd_dsa_1swq_name, + .attrs = idxd_mdev_types_attrs, +}; + +static struct attribute_group idxd_mdev_type_iax_group1 = { + .name = idxd_iax_1swq_name, + .attrs = idxd_mdev_types_attrs, +}; + +static struct attribute_group *idxd_mdev_type_groups[] = { + &idxd_mdev_type_dsa_group0, + &idxd_mdev_type_iax_group0, + &idxd_mdev_type_dsa_group1, + &idxd_mdev_type_iax_group1, + NULL, +}; + +static const struct vfio_device_ops idxd_mdev_ops = { + .name = "vfio-mdev", + .open_device = idxd_vdcm_open, + .close_device = idxd_vdcm_close, + .read = idxd_vdcm_read, + .write = idxd_vdcm_write, + .mmap = idxd_vdcm_mmap, + .ioctl = idxd_vdcm_ioctl, + .request = idxd_vdcm_mdev_request, +}; + +static struct mdev_driver idxd_vdcm_driver = { + .driver = { + .name = "idxd-mdev", + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + }, + .probe = idxd_vdcm_probe, + .remove = idxd_vdcm_remove, +}; + +static const struct mdev_parent_ops idxd_parent_ops = { + .owner = THIS_MODULE, + .device_driver = &idxd_vdcm_driver, + .supported_type_groups = idxd_mdev_type_groups, +}; + +static int idxd_mdev_drv_probe(struct idxd_dev *idxd_dev) +{ + struct device *dev = &idxd_dev->conf_dev; + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + struct idxd_device *idxd = wq->idxd; + int rc; + + if (idxd->state != IDXD_DEV_ENABLED) + return -ENXIO; + + mutex_lock(&wq->wq_lock); + wq->type = IDXD_WQT_MDEV; + + rc = __drv_enable_wq(wq); + mutex_unlock(&wq->wq_lock); + if (rc < 0) + return rc; + + mutex_lock(&idxd->kref_lock); + /* + * If kref == 1, that means there are no mdev clients and mdev has + * not been registered. 
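+ * The first wq bound to this driver therefore performs the one-time
+ * idxd_mdev_host_init(); later wqs only take an extra reference.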
+ */ + if (!idxd->mdev_host_init) { + kref_init(&idxd->mdev_kref); + rc = idxd_mdev_host_init(idxd, &idxd_parent_ops); + if (rc < 0) { + mutex_unlock(&idxd->kref_lock); + drv_disable_wq(wq); + dev_warn(dev, "mdev device init failed!\n"); + return -ENXIO; + } + } else { + kref_get(&idxd->mdev_kref); + } + mutex_unlock(&idxd->kref_lock); + + get_device(dev); + dev_info(dev, "wq %s enabled\n", dev_name(dev)); + return 0; +} + +static void idxd_mdev_drv_remove(struct idxd_dev *idxd_dev) +{ + struct device *dev = &idxd_dev->conf_dev; + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + struct idxd_device *idxd = wq->idxd; + + + mutex_lock(&wq->wq_lock); + __drv_disable_wq(wq); + + if (wq->state == IDXD_WQ_DISABLED) { + mutex_unlock(&wq->wq_lock); + return; + } + + if (wq->state == IDXD_WQ_LOCKED) + wq->state = IDXD_WQ_DISABLED; + mutex_unlock(&wq->wq_lock); + + mutex_lock(&idxd->kref_lock); + if (idxd->mdev_host_init) + kref_put(&idxd->mdev_kref, idxd_mdev_host_release); + mutex_unlock(&idxd->kref_lock); + put_device(dev); + dev_info(dev, "wq %s disabled\n", dev_name(dev)); +} + +static struct idxd_device_ops mdev_wq_ops = { + .notify_error = idxd_wq_vidxd_send_errors, +}; + +static enum idxd_dev_type dev_types[] = { + IDXD_DEV_WQ, + IDXD_DEV_NONE, +}; + +static struct idxd_device_driver idxd_mdev_driver = { + .probe = idxd_mdev_drv_probe, + .remove = idxd_mdev_drv_remove, + .name = "mdev", + .type = dev_types, + .ops = &mdev_wq_ops, +}; + +static int __init idxd_mdev_init(void) +{ + int rc; + + rc = idxd_driver_register(&idxd_mdev_driver); + if (rc < 0) + return rc; + + rc = mdev_register_driver(&idxd_vdcm_driver); + if (rc < 0) { + idxd_driver_unregister(&idxd_mdev_driver); + return rc; + } + + return 0; +} + +static void __exit idxd_mdev_exit(void) +{ + mdev_unregister_driver(&idxd_vdcm_driver); + idxd_driver_unregister(&idxd_mdev_driver); +} + +module_init(idxd_mdev_init); +module_exit(idxd_mdev_exit); + +MODULE_IMPORT_NS(IDXD); +MODULE_SOFTDEP("pre: idxd"); +MODULE_SOFTDEP("pre: mdev"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); +MODULE_ALIAS_IDXD_DEVICE(0); diff --git a/drivers/vfio/mdev/idxd/mdev.h b/drivers/vfio/mdev/idxd/mdev.h new file mode 100644 index 000000000000..775f4927b2e3 --- /dev/null +++ b/drivers/vfio/mdev/idxd/mdev.h @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2020 Intel Corporation. All rights rsvd. 
*/ + +#ifndef _IDXD_MDEV_H_ +#define _IDXD_MDEV_H_ + +#include +#include + +/* two 64-bit BARs implemented */ +#define VIDXD_MAX_BARS 2 +#define VIDXD_MAX_CFG_SPACE_SZ 4096 +#define VIDXD_MAX_MMIO_SPACE_SZ 8192 +#define VIDXD_MSIX_TBL_SZ_OFFSET 0x42 +#define VIDXD_CAP_CTRL_SZ 0x100 +#define VIDXD_GRP_CTRL_SZ 0x100 +#define VIDXD_WQ_CTRL_SZ 0x100 +#define VIDXD_WQ_OCPY_INT_SZ 0x20 +#define VIDXD_MSIX_TBL_SZ 0x90 +#define VIDXD_MSIX_PERM_TBL_SZ 0x48 + +#define VIDXD_MSIX_TABLE_OFFSET 0x600 +#define VIDXD_MSIX_PERM_OFFSET 0x300 +#define VIDXD_GRPCFG_OFFSET 0x400 +#define VIDXD_WQCFG_OFFSET 0x500 +#define VIDXD_IMS_OFFSET 0x1000 + +#define VIDXD_BAR0_SIZE 0x2000 +#define VIDXD_BAR2_SIZE 0x2000 +#define VIDXD_MAX_MSIX_ENTRIES (VIDXD_MSIX_TBL_SZ / 0x10) +#define VIDXD_MAX_WQS 1 +#define VIDXD_MAX_MSIX_VECS 2 + +#define VIDXD_ATS_OFFSET 0x100 +#define VIDXD_PRS_OFFSET 0x110 +#define VIDXD_PASID_OFFSET 0x120 +#define VIDXD_MSIX_PBA_OFFSET 0x700 + +#define VIDXD_STATE_BUFFER_SIZE (4 * PAGE_SIZE) +#define VIDXD_MAX_INTS 65536 + +struct ioasid_mm_entry { + struct mm_struct *mm; + struct list_head node; +}; + +#define IDXD_DESC_SIZE sizeof(struct dsa_hw_desc) + +#define VIDXD_MAX_PORTALS 64 + +struct idxd_wq_desc_elem { + enum idxd_portal_prot portal_prot; + u8 work_desc[IDXD_DESC_SIZE]; + struct list_head link; +}; + +struct idxd_wq_portal { + u8 data[IDXD_DESC_SIZE]; + unsigned int count; +}; + +struct idxd_virtual_wq { + unsigned int ndescs; + struct list_head head; + struct idxd_wq_portal portals[VIDXD_MAX_PORTALS]; +}; + +struct idxd_vdev { + struct mdev_device *mdev; + struct vfio_group *vfio_group; + struct notifier_block pasid_nb; + struct mutex ioasid_lock; + struct list_head mm_list; +}; + +struct vdcm_idxd { + struct vfio_device vdev; + struct idxd_device *idxd; + struct idxd_wq *wq; + struct idxd_virtual_wq vwq; + struct idxd_vdev ivdev; + struct vdcm_idxd_type *type; + int num_wqs; + + /* For VM use case */ + u64 bar_val[VIDXD_MAX_BARS]; + u64 bar_size[VIDXD_MAX_BARS]; + u8 cfg[VIDXD_MAX_CFG_SPACE_SZ]; + u8 bar0[VIDXD_MAX_MMIO_SPACE_SZ]; + struct list_head list; + struct mutex dev_lock; /* lock for vidxd resources */ + struct mutex mig_submit_lock; + bool paused; + + int refcount; + struct vfio_pci_core_device vfio_pdev; +}; + +#define vdev_to_vidxd(vdev) container_of(vdev, struct vdcm_idxd, vdev) + +static inline struct vdcm_idxd *to_vidxd(struct idxd_vdev *vdev) +{ + return container_of(vdev, struct vdcm_idxd, vdev); +} + +#define IDXD_MDEV_NAME_LEN 64 + +enum idxd_mdev_type { + IDXD_MDEV_TYPE_NONE = -1, + IDXD_MDEV_TYPE_DSA_1_DWQ = 0, + IDXD_MDEV_TYPE_IAX_1_DWQ, + IDXD_MDEV_TYPE_DSA_1_SWQ, + IDXD_MDEV_TYPE_IAX_1_SWQ, +}; + +#define IDXD_MDEV_WQ_TYPES 2 +#define IDXD_MDEV_TYPES (IDXD_TYPE_MAX * IDXD_MDEV_WQ_TYPES) + +struct vdcm_idxd_type { + const char *name; + enum idxd_mdev_type type; + unsigned int avail_instance; +}; + +enum idxd_vdcm_rw { + IDXD_VDCM_READ = 0, + IDXD_VDCM_WRITE, +}; + +static inline u64 get_reg_val(void *buf, int size) +{ + u64 val = 0; + + switch (size) { + case 8: + val = *(u64 *)buf; + break; + case 4: + val = *(u32 *)buf; + break; + case 2: + val = *(u16 *)buf; + break; + case 1: + val = *(u8 *)buf; + break; + } + + return val; +} + +static inline u8 vidxd_state(struct vdcm_idxd *vidxd) +{ + union gensts_reg *gensts = (union gensts_reg *)(vidxd->bar0 + IDXD_GENSTATS_OFFSET); + + return gensts->state; +} + +int idxd_mdev_host_init(struct idxd_device *idxd, const struct mdev_parent_ops *ops); +void idxd_mdev_host_release(struct kref *kref); +int 
idxd_mdev_get_pasid(struct mdev_device *mdev, u32 *pasid); +int idxd_mdev_get_host_pasid(struct mdev_device *mdev, u32 gpasid, u32 *pasid); +int vidxd_mmio_read(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size); +int vidxd_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size); +int vidxd_cfg_read(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int count); +int vidxd_cfg_write(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int size); +void vidxd_mmio_init(struct vdcm_idxd *vidxd); +void vidxd_reset(struct vdcm_idxd *vidxd); +void vidxd_send_interrupt(struct vdcm_idxd *vidxd, int msix_idx); +void idxd_wq_vidxd_send_errors(struct idxd_wq *wq); + +int vidxd_portal_mmio_read(struct vdcm_idxd *vidxd, u64 pos, void *buf, + unsigned int size); +int vidxd_portal_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, + unsigned int size); + +void vidxd_notify_revoked_handles (struct vdcm_idxd *vidxd); + +#endif diff --git a/drivers/vfio/mdev/idxd/mdev_host.c b/drivers/vfio/mdev/idxd/mdev_host.c new file mode 100644 index 000000000000..b05858e22759 --- /dev/null +++ b/drivers/vfio/mdev/idxd/mdev_host.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) Intel Corporation. All rights rsvd. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "idxd.h" +#include "mdev.h" + +extern const struct vfio_pci_regops vfio_pci_dma_fault_regops; + +int idxd_mdev_host_init(struct idxd_device *idxd, const struct mdev_parent_ops *ops) +{ + struct device *dev = &idxd->pdev->dev; + struct ims_array_info ims_info; + int rc; + + if (!test_bit(IDXD_FLAG_IMS_SUPPORTED, &idxd->flags)) + return -EOPNOTSUPP; + + rc = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX); + if (rc < 0) { + dev_warn(dev, "Failed to enable aux-domain: %d\n", rc); + return rc; + } + + ims_info.max_slots = idxd->ims_size; + ims_info.slots = idxd->reg_base + idxd->ims_offset; + idxd->ims_domain = pci_ims_array_create_msi_irq_domain(idxd->pdev, &ims_info); + if (!idxd->ims_domain) { + dev_warn(dev, "Fail to acquire IMS domain\n"); + iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX); + return -ENODEV; + } + + rc = mdev_register_device(dev, ops); + if (rc < 0) { + dev_warn(dev, "mdev register failed\n"); + irq_domain_remove(idxd->ims_domain); + iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX); + return rc; + } + + mutex_init(&idxd->vfio_pdev.igate); + idxd->vfio_pdev.pdev = idxd->pdev; + rc = vfio_pci_dma_fault_init(&idxd->vfio_pdev, true); + if (rc < 0) { + dev_err(dev, "dma fault region init failed\n"); + irq_domain_remove(idxd->ims_domain); + iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX); + mdev_unregister_device(dev); + return rc; + } + + idxd->mdev_host_init = true; + return 0; +} + +void idxd_mdev_host_release(struct kref *kref) +{ + struct idxd_device *idxd = container_of(kref, struct idxd_device, mdev_kref); + struct device *dev = &idxd->pdev->dev; + struct vfio_pci_core_device *vfio_pdev = &idxd->vfio_pdev; + int i; + + if (!idxd->mdev_host_init) + return; + + WARN_ON(iommu_unregister_device_fault_handler(dev)); + + for (i = 0; i < vfio_pdev->num_regions; i++) + vfio_pdev->region[i].ops->release(vfio_pdev, &vfio_pdev->region[i]); + vfio_pdev->num_regions = 0; + kfree(vfio_pdev->region); + vfio_pdev->region = NULL; + iommu_unregister_device_fault_handler(&idxd->pdev->dev); + for (i = 0; i < vfio_pdev->num_ext_irqs; i++) + vfio_pci_set_ext_irq_trigger(vfio_pdev, + VFIO_IRQ_SET_DATA_NONE | 
VFIO_IRQ_SET_ACTION_TRIGGER, + VFIO_PCI_NUM_IRQS + i, 0, 0, NULL); + vfio_pdev->num_ext_irqs = 0; + kfree(vfio_pdev->ext_irqs); + vfio_pdev->ext_irqs = NULL; + + irq_domain_remove(idxd->ims_domain); + mdev_unregister_device(dev); + iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX); + idxd->mdev_host_init = false; +} diff --git a/drivers/vfio/mdev/idxd/vdev.c b/drivers/vfio/mdev/idxd/vdev.c new file mode 100644 index 000000000000..36701a1757be --- /dev/null +++ b/drivers/vfio/mdev/idxd/vdev.c @@ -0,0 +1,1398 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019,2020 Intel Corporation. All rights rsvd. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "registers.h" +#include "idxd.h" +#include "../mdev_private.h" +#include "mdev.h" + +static void vidxd_do_command(struct vdcm_idxd *vidxd, u32 val); + +void vidxd_send_interrupt(struct vdcm_idxd *vidxd, int msix_idx) +{ + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + + eventfd_signal(vfio_pdev->ctx[msix_idx].trigger, 1); +} + +static void vidxd_report_error(struct vdcm_idxd *vidxd, unsigned int error) +{ + u8 *bar0 = vidxd->bar0; + union sw_err_reg *swerr = (union sw_err_reg *)(bar0 + IDXD_SWERR_OFFSET); + union genctrl_reg *genctrl; + bool send = false; + + if (!swerr->valid) { + memset(swerr, 0, sizeof(*swerr)); + swerr->valid = 1; + swerr->error = error; + send = true; + } else if (swerr->valid && !swerr->overflow) { + swerr->overflow = 1; + } + + genctrl = (union genctrl_reg *)(bar0 + IDXD_GENCTRL_OFFSET); + if (send && genctrl->softerr_int_en) { + u32 *intcause = (u32 *)(bar0 + IDXD_INTCAUSE_OFFSET); + + *intcause |= IDXD_INTC_ERR; + vidxd_send_interrupt(vidxd, 0); + } +} + +void vidxd_notify_revoked_handles (struct vdcm_idxd *vidxd) +{ + u8 *bar0 = vidxd->bar0; + u32 *intcause = (u32 *)(bar0 + IDXD_INTCAUSE_OFFSET); + + *intcause |= IDXD_INTC_INT_HANDLE_REVOKED; + pr_info("informating guest about revoked handles\n"); + vidxd_send_interrupt(vidxd, 0); +} + +static int vidxd_set_ims_pasid(struct vdcm_idxd *vidxd, int index, bool pasid_en, u32 gpasid) +{ + struct device *dev = mdev_dev(vidxd->ivdev.mdev); + u64 auxval; + u32 pasid; + int irq; + int rc; + + irq = dev_msi_irq_vector(dev, index); + + if (pasid_en) + rc = idxd_mdev_get_host_pasid(vidxd->ivdev.mdev, gpasid, &pasid); + else + rc = idxd_mdev_get_pasid(vidxd->ivdev.mdev, &pasid); + if (rc < 0) + return rc; + auxval = ims_ctrl_pasid_aux(pasid, 1); + return irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval); + +} + +int vidxd_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size) +{ + u32 offset = pos & (vidxd->bar_size[0] - 1); + u8 *bar0 = vidxd->bar0; + struct device *dev = mdev_dev(vidxd->ivdev.mdev); + + dev_dbg(dev, "vidxd mmio W %d %x %x: %llx\n", vidxd->wq->id, size, + offset, get_reg_val(buf, size)); + + if (((size & (size - 1)) != 0) || (offset & (size - 1)) != 0) + return -EINVAL; + + /* If we don't limit this, we potentially can write out of bound */ + if (size > sizeof(u32)) + return -EINVAL; + + switch (offset) { + case IDXD_GENCFG_OFFSET ... IDXD_GENCFG_OFFSET + 3: + /* Write only when device is disabled. 
*/ + if (vidxd_state(vidxd) == IDXD_DEVICE_STATE_DISABLED) + memcpy(bar0 + offset, buf, size); + break; + + case IDXD_GENCTRL_OFFSET: + memcpy(bar0 + offset, buf, size); + break; + + case IDXD_INTCAUSE_OFFSET: + *(u32 *)&bar0[offset] &= ~(get_reg_val(buf, 4)); + break; + + case IDXD_CMD_OFFSET: { + u32 *cmdsts = (u32 *)(bar0 + IDXD_CMDSTS_OFFSET); + u32 val = get_reg_val(buf, size); + + if (size != sizeof(u32)) + return -EINVAL; + + /* Check and set command in progress */ + if (test_and_set_bit(IDXD_CMDS_ACTIVE_BIT, (unsigned long *)cmdsts) == 0) + vidxd_do_command(vidxd, val); + else + vidxd_report_error(vidxd, DSA_ERR_CMD_REG); + break; + } + + case IDXD_SWERR_OFFSET: + /* W1C */ + bar0[offset] &= ~(get_reg_val(buf, 1) & GENMASK(1, 0)); + break; + + case VIDXD_WQCFG_OFFSET ... VIDXD_WQCFG_OFFSET + VIDXD_WQ_CTRL_SZ - 1: { + union wqcfg *wqcfg; + int wq_id = (offset - VIDXD_WQCFG_OFFSET) / 0x20; + int subreg = offset & 0x1c; + u32 new_val; + + if (wq_id >= VIDXD_MAX_WQS) + break; + + /* FIXME: Need to sanitize for RO Config WQ mode 1 */ + wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET + wq_id * 0x20); + if (size >= 4) { + new_val = get_reg_val(buf, 4); + } else { + u32 tmp1, tmp2, shift, mask; + + switch (subreg) { + case 4: + tmp1 = wqcfg->bits[1]; + break; + case 8: + tmp1 = wqcfg->bits[2]; + break; + case 12: + tmp1 = wqcfg->bits[3]; + break; + case 16: + tmp1 = wqcfg->bits[4]; + break; + case 20: + tmp1 = wqcfg->bits[5]; + break; + default: + tmp1 = 0; + } + + tmp2 = get_reg_val(buf, size); + shift = (offset & 0x03U) * 8; + mask = ((1U << size * 8) - 1u) << shift; + new_val = (tmp1 & ~mask) | (tmp2 << shift); + } + + if (subreg == 8) { + if (wqcfg->wq_state == 0) { + wqcfg->bits[2] &= 0xfe; + wqcfg->bits[2] |= new_val & 0xffffff01; + } + } + + break; + } /* WQCFG */ + + case VIDXD_GRPCFG_OFFSET ... VIDXD_GRPCFG_OFFSET + VIDXD_GRP_CTRL_SZ - 1: + /* Nothing is written. Should be all RO */ + break; + + case VIDXD_MSIX_TABLE_OFFSET ... VIDXD_MSIX_TABLE_OFFSET + VIDXD_MSIX_TBL_SZ - 1: { + int index = (offset - VIDXD_MSIX_TABLE_OFFSET) / 0x10; + u8 *msix_entry = &bar0[VIDXD_MSIX_TABLE_OFFSET + index * 0x10]; + u64 *pba = (u64 *)(bar0 + VIDXD_MSIX_PBA_OFFSET); + u8 ctrl; + + ctrl = msix_entry[MSIX_ENTRY_CTRL_BYTE]; + memcpy(bar0 + offset, buf, size); + /* Handle clearing of UNMASK bit */ + if (!(msix_entry[MSIX_ENTRY_CTRL_BYTE] & MSIX_ENTRY_MASK_INT) && + ctrl & MSIX_ENTRY_MASK_INT) + if (test_and_clear_bit(index, (unsigned long *)pba)) + vidxd_send_interrupt(vidxd, index); + break; + } + + case VIDXD_MSIX_PERM_OFFSET ... VIDXD_MSIX_PERM_OFFSET + VIDXD_MSIX_PERM_TBL_SZ - 1: { + int index, rc; + u32 msix_perm; + u32 pasid, pasid_en; + + if (size != sizeof(u32) || !IS_ALIGNED(offset, sizeof(u64))) { + dev_warn(dev, "XXX unaligned MSIX PERM access\n"); + break; + } + + index = (offset - VIDXD_MSIX_PERM_OFFSET) / 8; + msix_perm = get_reg_val(buf, sizeof(u32)) & 0xfffff00d; + memcpy(bar0 + offset, buf, size); + /* + * index 0 for MSIX is emulated for misc interrupts. The MSIX indices from + * 1...N are backed by IMS. Here we would pass in index - 1, which is 0 for + * the first one + */ + if (index > 0) { + pasid_en = (msix_perm >> 3) & 1; + + /* + * When vSVA is turned on, this is the only place where the guest PASID + * can be retrieved by the host. The guest driver writes the PASID to the + * MSIX permission entry. In turn the vdcm will translate this to the + * IMS entry. 
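+ * The translated host PASID is then programmed into the backing IMS
+ * entry by vidxd_set_ims_pasid() below.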
+ */ + + if (pasid_en) { + pasid = (msix_perm >> 12) & 0xfffff; + if (!pasid) + break; + } + rc = vidxd_set_ims_pasid(vidxd, index - 1, pasid_en, pasid); + if (rc < 0) + return rc; + } + break; + } + } /* offset */ + + return 0; +} + +int vidxd_portal_mmio_read(struct vdcm_idxd *vidxd, u64 pos, void *buf, + unsigned int size) +{ + u32 offset = pos & (vidxd->bar_size[1] - 1); + struct device *dev = mdev_dev(vidxd->ivdev.mdev); + + BUG_ON((size & (size - 1)) != 0); + BUG_ON(size > 8); + BUG_ON((offset & (size - 1)) != 0); + + memset(buf, 0xff, size); + + dev_dbg(dev, "vidxd portal mmio R %d %x %x: %llx\n", + vidxd->wq->id, size, offset, get_reg_val(buf, size)); + return 0; +} + +int vidxd_portal_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, + unsigned int size) +{ + struct device *dev = mdev_dev(vidxd->ivdev.mdev); + u32 offset = pos & (vidxd->bar_size[1] - 1); + uint16_t wq_id = offset >> 14; + uint16_t portal_id, portal_offset; + struct idxd_virtual_wq *vwq; + struct idxd_wq *wq; + struct idxd_wq_portal *portal; + enum idxd_portal_prot portal_prot = IDXD_PORTAL_UNLIMITED; + + BUG_ON((size & (size - 1)) != 0); + BUG_ON(size > 64); + BUG_ON((offset & (size - 1)) != 0); + + dev_dbg(dev, "vidxd portal mmio W %d %x %x: %llx\n", vidxd->wq->id, size, + offset, get_reg_val(buf, size)); + + if (wq_id >= vidxd->num_wqs) { + printk("DSA portal write: Invalid wq %d\n", wq_id); + } + + vwq = &vidxd->vwq; + wq = vidxd->wq; + + if (!wq_dedicated(wq) || (((offset >> PAGE_SHIFT) & 0x3) == 1)) + portal_prot = IDXD_PORTAL_LIMITED; + + portal_id = (offset & 0xFFF) >> 6; + portal_offset = offset & 0x3F; + + portal = &vwq->portals[portal_id]; + + portal->count += size; + memcpy(&portal->data[portal_offset], buf, size); + + if (portal->count == IDXD_DESC_SIZE) { + struct idxd_wq_desc_elem *elem; + u64 *p = (u64 *)portal->data; + printk("desc: %016llx %016llx %016llx %016llx %016llx %016llx %016llx %016llx\n", + p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]); + + if (wq_dedicated(wq) && vwq->ndescs == wq->size) { + printk("can't submit more descriptors than WQ size. 
Dropping.\n"); + memset(&portal->data, 0, IDXD_DESC_SIZE); + portal->count = 0; + return 0; + } + + mutex_lock(&vidxd->mig_submit_lock); + if (vidxd->paused) { + elem = kmalloc(sizeof(struct idxd_wq_desc_elem), + GFP_KERNEL); + + if (elem == NULL) { + printk("kmalloc failed\n"); + return 0; + } + printk("queuing the desc\n"); + memcpy(elem->work_desc, portal->data, IDXD_DESC_SIZE); + elem->portal_prot = portal_prot; + + list_add_tail(&elem->link, &vwq->head); + vwq->ndescs++; + } else { + void __iomem *wq_portal; + + wq_portal = wq->portal; + printk("submitting a desc to WQ %d ded %d\n", wq->id, + wq_dedicated(wq)); + if (wq_dedicated(wq)) { + iosubmit_cmds512(wq_portal, (struct dsa_hw_desc *)p, 1); + } else { + int rc; + rc = enqcmds(wq_portal, (struct dsa_hw_desc *)p); + if (rc < 0) { + pr_info("%s: enqcmds failed\n", __func__); + return rc; + } + } + } + mutex_unlock(&vidxd->mig_submit_lock); + memset(&portal->data, 0, IDXD_DESC_SIZE); + portal->count = 0; + } + + return 0; +} + +int vidxd_mmio_read(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size) +{ + u32 offset = pos & (vidxd->bar_size[0] - 1); + struct device *dev = mdev_dev(vidxd->ivdev.mdev); + + memcpy(buf, vidxd->bar0 + offset, size); + + dev_dbg(dev, "vidxd mmio R %d %x %x: %llx\n", + vidxd->wq->id, size, offset, get_reg_val(buf, size)); + return 0; +} + +int vidxd_cfg_read(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int count) +{ + u32 offset = pos & 0xfff; + struct device *dev = mdev_dev(vidxd->ivdev.mdev); + + memcpy(buf, &vidxd->cfg[offset], count); + + dev_dbg(dev, "vidxd pci R %d %x %x: %llx\n", + vidxd->wq->id, count, offset, get_reg_val(buf, count)); + + return 0; +} + +/* + * Much of the emulation code has been borrowed from Intel i915 cfg space + * emulation code. + * drivers/gpu/drm/i915/gvt/cfg_space.c: + */ + +/* + * Bitmap for writable bits (RW or RW1C bits, but cannot co-exist in one + * byte) byte by byte in standard pci configuration space. (not the full + * 256 bytes.) + */ +static const u8 pci_cfg_space_rw_bmp[PCI_INTERRUPT_LINE + 4] = { + [PCI_COMMAND] = 0xff, 0x07, + [PCI_STATUS] = 0x00, 0xf9, /* the only one RW1C byte */ + [PCI_CACHE_LINE_SIZE] = 0xff, + [PCI_BASE_ADDRESS_0 ... PCI_CARDBUS_CIS - 1] = 0xff, + [PCI_ROM_ADDRESS] = 0x01, 0xf8, 0xff, 0xff, + [PCI_INTERRUPT_LINE] = 0xff, +}; + +static void _pci_cfg_mem_write(struct vdcm_idxd *vidxd, unsigned int off, u8 *src, + unsigned int bytes) +{ + u8 *cfg_base = vidxd->cfg; + u8 mask, new, old; + int i = 0; + + for (; i < bytes && (off + i < sizeof(pci_cfg_space_rw_bmp)); i++) { + mask = pci_cfg_space_rw_bmp[off + i]; + old = cfg_base[off + i]; + new = src[i] & mask; + + /** + * The PCI_STATUS high byte has RW1C bits, here + * emulates clear by writing 1 for these bits. + * Writing a 0b to RW1C bits has no effect. + */ + if (off + i == PCI_STATUS + 1) + new = (~new & old) & mask; + + cfg_base[off + i] = (old & ~mask) | new; + } + + /* For other configuration space directly copy as it is. */ + if (i < bytes) + memcpy(cfg_base + off + i, src + i, bytes - i); +} + +static inline void _write_pci_bar(struct vdcm_idxd *vidxd, u32 offset, u32 val, bool low) +{ + u32 *pval; + + /* BAR offset should be 32 bits algiend */ + offset = rounddown(offset, 4); + pval = (u32 *)(vidxd->cfg + offset); + + if (low) { + /* + * only update bit 31 - bit 4, + * leave the bit 3 - bit 0 unchanged. 
+ */ + *pval = (val & GENMASK(31, 4)) | (*pval & GENMASK(3, 0)); + } else { + *pval = val; + } +} + +static int _pci_cfg_bar_write(struct vdcm_idxd *vidxd, unsigned int offset, void *p_data, + unsigned int bytes) +{ + u32 new = *(u32 *)(p_data); + bool lo = IS_ALIGNED(offset, 8); + u64 size; + unsigned int bar_id; + + /* + * Power-up software can determine how much address + * space the device requires by writing a value of + * all 1's to the register and then reading the value + * back. The device will return 0's in all don't-care + * address bits. + */ + if (new == 0xffffffff) { + switch (offset) { + case PCI_BASE_ADDRESS_0: + case PCI_BASE_ADDRESS_1: + case PCI_BASE_ADDRESS_2: + case PCI_BASE_ADDRESS_3: + bar_id = (offset - PCI_BASE_ADDRESS_0) / 8; + size = vidxd->bar_size[bar_id]; + _write_pci_bar(vidxd, offset, size >> (lo ? 0 : 32), lo); + break; + default: + /* Unimplemented BARs */ + _write_pci_bar(vidxd, offset, 0x0, false); + } + } else { + switch (offset) { + case PCI_BASE_ADDRESS_0: + case PCI_BASE_ADDRESS_1: + case PCI_BASE_ADDRESS_2: + case PCI_BASE_ADDRESS_3: + _write_pci_bar(vidxd, offset, new, lo); + break; + default: + break; + } + } + return 0; +} + +int vidxd_cfg_write(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int size) +{ + struct device *dev = &vidxd->idxd->pdev->dev; + u8 *cfg = vidxd->cfg; + u32 offset = pos & 0xfff; + u64 val; + + if (size > 4) + return -EINVAL; + + if (pos + size > VIDXD_MAX_CFG_SPACE_SZ) + return -EINVAL; + + dev_dbg(dev, "vidxd pci W %d %x %x: %llx\n", vidxd->wq->id, size, pos, + get_reg_val(buf, size)); + + /* First check if it's PCI_COMMAND */ + if (IS_ALIGNED(pos, 2) && pos == PCI_COMMAND) { + bool new_bme; + bool bme; + + if (size > 2) + return -EINVAL; + + new_bme = !!(get_reg_val(buf, 2) & PCI_COMMAND_MASTER); + bme = !!(vidxd->cfg[pos] & PCI_COMMAND_MASTER); + _pci_cfg_mem_write(vidxd, pos, buf, size); + + /* Flag error if turning off BME while device is enabled */ + if ((bme && !new_bme) && vidxd_state(vidxd) == IDXD_DEVICE_STATE_ENABLED) + vidxd_report_error(vidxd, DSA_ERR_PCI_CFG); + return 0; + } + + switch (pos) { + case PCI_BASE_ADDRESS_0 ... PCI_BASE_ADDRESS_5: + if (!IS_ALIGNED(pos, 4)) + return -EINVAL; + return _pci_cfg_bar_write(vidxd, pos, buf, size); + + case VIDXD_ATS_OFFSET + 4: + if (size < 4) + break; + offset += 2; + buf = buf + 2; + size -= 2; + fallthrough; + + case VIDXD_ATS_OFFSET + 6: + memcpy(&cfg[offset], buf, size); + break; + + case VIDXD_PRS_OFFSET + 4: { + u8 old_val, new_val; + + val = get_reg_val(buf, 1); + old_val = cfg[VIDXD_PRS_OFFSET + 4]; + new_val = val & 1; + + cfg[offset] = new_val; + if (old_val == 0 && new_val == 1) { + /* + * Clear Stopped, Response Failure, + * and Unexpected Response. + */ + *(u16 *)&cfg[VIDXD_PRS_OFFSET + 6] &= ~(u16)(0x0103); + } + + if (size < 4) + break; + + offset += 2; + buf = (u8 *)buf + 2; + size -= 2; + fallthrough; + } + + case VIDXD_PRS_OFFSET + 6: + cfg[offset] &= ~(get_reg_val(buf, 1) & 3); + break; + + case VIDXD_PRS_OFFSET + 12 ... 
VIDXD_PRS_OFFSET + 15: + memcpy(&cfg[offset], buf, size); + break; + + case VIDXD_PASID_OFFSET + 4: + if (size < 4) + break; + offset += 2; + buf = buf + 2; + size -= 2; + fallthrough; + + case VIDXD_PASID_OFFSET + 6: + cfg[offset] = get_reg_val(buf, 1) & 5; + break; + + default: + _pci_cfg_mem_write(vidxd, pos, buf, size); + } + return 0; +} + +static void vidxd_mmio_init_grpcap(struct vdcm_idxd *vidxd) +{ + u8 *bar0 = vidxd->bar0; + union group_cap_reg *grp_cap = (union group_cap_reg *)(bar0 + IDXD_GRPCAP_OFFSET); + + /* single group for current implementation */ + grp_cap->rdbuf_ctrl = 0; + grp_cap->rdbuf_limit = 0; + grp_cap->total_rdbufs = 0; + grp_cap->num_groups = 1; +} + +static void vidxd_mmio_init_grpcfg(struct vdcm_idxd *vidxd) +{ + u8 *bar0 = vidxd->bar0; + struct grpcfg *grpcfg = (struct grpcfg *)(bar0 + VIDXD_GRPCFG_OFFSET); + struct idxd_wq *wq = vidxd->wq; + struct idxd_group *group = wq->group; + int i; + + /* + * At this point, we are only exporting a single workqueue for + * each mdev. So we need to just fake it as first workqueue + * and also mark the available engines in this group. + */ + + /* Set single workqueue and the first one */ + grpcfg->wqs[0] = BIT(0); + grpcfg->engines = 0; + for (i = 0; i < group->num_engines; i++) + grpcfg->engines |= BIT(i); + grpcfg->flags.bits = group->grpcfg.flags.bits; +} + +static void vidxd_mmio_init_wqcap(struct vdcm_idxd *vidxd) +{ + u8 *bar0 = vidxd->bar0; + struct idxd_wq *wq = vidxd->wq; + union wq_cap_reg *wq_cap = (union wq_cap_reg *)(bar0 + IDXD_WQCAP_OFFSET); + + wq_cap->occupancy_int = 0; + wq_cap->occupancy = 0; + wq_cap->priority = 0; + wq_cap->total_wq_size = wq->size; + wq_cap->num_wqs = VIDXD_MAX_WQS; + wq_cap->wq_ats_support = 0; + if (wq_dedicated(wq)) + wq_cap->dedicated_mode = 1; + else + wq_cap->shared_mode = 1; +} + +static void vidxd_mmio_init_wqcfg(struct vdcm_idxd *vidxd) +{ + struct idxd_device *idxd = vidxd->idxd; + struct idxd_wq *wq = vidxd->wq; + u8 *bar0 = vidxd->bar0; + union wqcfg *wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET); + + wqcfg->wq_size = wq->size; + wqcfg->wq_thresh = wq->threshold; + + if (wq_dedicated(wq)) + wqcfg->mode = WQCFG_MODE_DEDICATED; + else if (device_pasid_enabled(idxd)) + wqcfg->pasid_en = 1; + + wqcfg->bof = wq->wqcfg->bof; + + wqcfg->priority = wq->priority; + wqcfg->max_xfer_shift = idxd->hw.gen_cap.max_xfer_shift; + wqcfg->max_batch_shift = idxd->hw.gen_cap.max_batch_shift; + wqcfg->mode_support = 1; +} + +static void vidxd_mmio_init_engcap(struct vdcm_idxd *vidxd) +{ + u8 *bar0 = vidxd->bar0; + union engine_cap_reg *engcap = (union engine_cap_reg *)(bar0 + IDXD_ENGCAP_OFFSET); + struct idxd_wq *wq = vidxd->wq; + struct idxd_group *group = wq->group; + + engcap->num_engines = group->num_engines; +} + +static void vidxd_mmio_init_gencap(struct vdcm_idxd *vidxd) +{ + struct idxd_device *idxd = vidxd->idxd; + u8 *bar0 = vidxd->bar0; + union gen_cap_reg *gencap = (union gen_cap_reg *)(bar0 + IDXD_GENCAP_OFFSET); + + gencap->bits = idxd->hw.gen_cap.bits; + gencap->config_en = 0; + gencap->max_ims_mult = 0; + gencap->cmd_cap = 1; + if (device_pasid_enabled(idxd)) + gencap->block_on_fault = 1; +} + +static void vidxd_mmio_init_cmdcap(struct vdcm_idxd *vidxd) +{ + struct idxd_device *idxd = vidxd->idxd; + u8 *bar0 = vidxd->bar0; + u32 *cmdcap = (u32 *)(bar0 + IDXD_CMDCAP_OFFSET); + + if (idxd->hw.cmd_cap) + *cmdcap = idxd->hw.cmd_cap; + else + *cmdcap = 0x1ffe; + + *cmdcap |= BIT(IDXD_CMD_REQUEST_INT_HANDLE) | BIT(IDXD_CMD_RELEASE_INT_HANDLE) | + 
BIT(IDXD_CMD_REVOKED_HANDLES_PROCESSED); +} + +static void vidxd_mmio_init_opcap(struct vdcm_idxd *vidxd) +{ + struct idxd_device *idxd = vidxd->idxd; + u64 opcode; + u8 *bar0 = vidxd->bar0; + u64 *opcap = (u64 *)(bar0 + IDXD_OPCAP_OFFSET); + + if (idxd->data->type == IDXD_TYPE_DSA) { + opcode = BIT_ULL(DSA_OPCODE_NOOP) | BIT_ULL(DSA_OPCODE_BATCH) | + BIT_ULL(DSA_OPCODE_DRAIN) | BIT_ULL(DSA_OPCODE_MEMMOVE) | + BIT_ULL(DSA_OPCODE_MEMFILL) | BIT_ULL(DSA_OPCODE_COMPARE) | + BIT_ULL(DSA_OPCODE_COMPVAL) | BIT_ULL(DSA_OPCODE_CR_DELTA) | + BIT_ULL(DSA_OPCODE_AP_DELTA) | BIT_ULL(DSA_OPCODE_DUALCAST) | + BIT_ULL(DSA_OPCODE_CRCGEN) | BIT_ULL(DSA_OPCODE_COPY_CRC) | + BIT_ULL(DSA_OPCODE_DIF_CHECK) | BIT_ULL(DSA_OPCODE_DIF_INS) | + BIT_ULL(DSA_OPCODE_DIF_STRP) | BIT_ULL(DSA_OPCODE_DIF_UPDT) | + BIT_ULL(DSA_OPCODE_CFLUSH); + *opcap = opcode; + } else if (idxd->data->type == IDXD_TYPE_IAX) { + opcode = BIT_ULL(IAX_OPCODE_NOOP) | BIT_ULL(IAX_OPCODE_DRAIN) | + BIT_ULL(IAX_OPCODE_MEMMOVE); + *opcap = opcode; + opcap++; + opcode = OPCAP_BIT(IAX_OPCODE_DECOMPRESS) | OPCAP_BIT(IAX_OPCODE_COMPRESS) | + OPCAP_BIT(IAX_OPCODE_CRC64) | OPCAP_BIT(IAX_OPCODE_ZERO_DECOMP_32) | + OPCAP_BIT(IAX_OPCODE_ZERO_DECOMP_16) | OPCAP_BIT(IAX_OPCODE_DECOMP_32) | + OPCAP_BIT(IAX_OPCODE_DECOMP_16) | OPCAP_BIT(IAX_OPCODE_SCAN) | + OPCAP_BIT(IAX_OPCODE_SET_MEMBER) | OPCAP_BIT(IAX_OPCODE_EXTRACT) | + OPCAP_BIT(IAX_OPCODE_SELECT) | OPCAP_BIT(IAX_OPCODE_RLE_BURST) | + OPCAP_BIT(IAX_OPCDE_FIND_UNIQUE) | OPCAP_BIT(IAX_OPCODE_EXPAND); + *opcap = opcode; + } +} + +static void vidxd_mmio_init_version(struct vdcm_idxd *vidxd) +{ + struct idxd_device *idxd = vidxd->idxd; + u32 *version; + + version = (u32 *)vidxd->bar0; + *version = idxd->hw.version; +} + +static void vidxd_mmio_reset(struct vdcm_idxd *vidxd) +{ + u8 *bar0 = vidxd->bar0; + + memset(bar0 + IDXD_GENCFG_OFFSET, 0, 4); + memset(bar0 + IDXD_GENCTRL_OFFSET, 0, 4); + memset(bar0 + IDXD_GENSTATS_OFFSET, 0, 4); + memset(bar0 + IDXD_INTCAUSE_OFFSET, 0, 4); + memset(bar0 + IDXD_INTCAUSE_OFFSET, 0, 4); + memset(bar0 + VIDXD_MSIX_PBA_OFFSET, 0, 1); + memset(bar0 + VIDXD_MSIX_PERM_OFFSET, 0, VIDXD_MSIX_PERM_TBL_SZ); + + vidxd_mmio_init_grpcfg(vidxd); + vidxd_mmio_init_wqcfg(vidxd); +} + +void vidxd_mmio_init(struct vdcm_idxd *vidxd) +{ + u8 *bar0 = vidxd->bar0; + union offsets_reg *offsets; + + memset(vidxd->bar0, 0, VIDXD_BAR0_SIZE); + + vidxd_mmio_init_version(vidxd); + vidxd_mmio_init_gencap(vidxd); + vidxd_mmio_init_wqcap(vidxd); + vidxd_mmio_init_grpcap(vidxd); + vidxd_mmio_init_engcap(vidxd); + vidxd_mmio_init_opcap(vidxd); + + offsets = (union offsets_reg *)(bar0 + IDXD_TABLE_OFFSET); + offsets->grpcfg = VIDXD_GRPCFG_OFFSET / 0x100; + offsets->wqcfg = VIDXD_WQCFG_OFFSET / 0x100; + offsets->msix_perm = VIDXD_MSIX_PERM_OFFSET / 0x100; + + vidxd_mmio_init_cmdcap(vidxd); + memset(bar0 + VIDXD_MSIX_PERM_OFFSET, 0, VIDXD_MSIX_PERM_TBL_SZ); + vidxd_mmio_init_grpcfg(vidxd); + vidxd_mmio_init_wqcfg(vidxd); +} + +static void idxd_complete_command(struct vdcm_idxd *vidxd, enum idxd_cmdsts_err val) +{ + u8 *bar0 = vidxd->bar0; + u32 *cmd = (u32 *)(bar0 + IDXD_CMD_OFFSET); + u32 *cmdsts = (u32 *)(bar0 + IDXD_CMDSTS_OFFSET); + u32 *intcause = (u32 *)(bar0 + IDXD_INTCAUSE_OFFSET); + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + + *cmdsts = val; + dev_dbg(dev, "%s: cmd: %#x status: %#x\n", __func__, *cmd, val); + + if (*cmd & IDXD_CMD_INT_MASK) { + *intcause |= IDXD_INTC_CMD; + vidxd_send_interrupt(vidxd, 0); + } +} + +static void vidxd_enable(struct 
vdcm_idxd *vidxd) +{ + u8 *bar0 = vidxd->bar0; + union gensts_reg *gensts = (union gensts_reg *)(bar0 + IDXD_GENSTATS_OFFSET); + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + bool ats = (*(u16 *)&vidxd->cfg[VIDXD_ATS_OFFSET + 6]) & (1U << 15); + bool prs = (*(u16 *)&vidxd->cfg[VIDXD_PRS_OFFSET + 4]) & 1U; + bool pasid = (*(u16 *)&vidxd->cfg[VIDXD_PASID_OFFSET + 6]) & 1U; + + dev_dbg(dev, "%s\n", __func__); + if (gensts->state == IDXD_DEVICE_STATE_ENABLED) + return idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_DEV_ENABLED); + + /* Check PCI configuration */ + if (!(vidxd->cfg[PCI_COMMAND] & PCI_COMMAND_MASTER)) + return idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_BUSMASTER_EN); + + if (pasid != prs || (pasid && !ats)) + return idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_BUSMASTER_EN); + + gensts->state = IDXD_DEVICE_STATE_ENABLED; + + return idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +static void vidxd_disable(struct vdcm_idxd *vidxd) +{ + struct idxd_wq *wq; + union wqcfg *wqcfg; + u8 *bar0 = vidxd->bar0; + union gensts_reg *gensts = (union gensts_reg *)(bar0 + IDXD_GENSTATS_OFFSET); + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + u32 status; + + dev_dbg(dev, "%s\n", __func__); + if (gensts->state == IDXD_DEVICE_STATE_DISABLED) { + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_DIS_DEV_EN); + return; + } + + wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET); + wq = vidxd->wq; + + /* If it is a DWQ, need to disable the DWQ as well */ + if (wq_dedicated(wq)) { + idxd_wq_disable(wq, false, &status); + if (status) { + dev_warn(dev, "vidxd disable (wq disable) failed: %#x\n", status); + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_DIS_DEV_EN); + return; + } + } else { + idxd_wq_drain(wq, &status); + if (status) + dev_warn(dev, "vidxd disable (wq drain) failed: %#x\n", status); + } + + wqcfg->wq_state = 0; + gensts->state = IDXD_DEVICE_STATE_DISABLED; + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +static void vidxd_drain_all(struct vdcm_idxd *vidxd) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + struct idxd_wq *wq = vidxd->wq; + + dev_dbg(dev, "%s\n", __func__); + + idxd_wq_drain(wq, NULL); + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +static void vidxd_wq_drain(struct vdcm_idxd *vidxd, int val) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + u8 *bar0 = vidxd->bar0; + union wqcfg *wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET); + struct idxd_wq *wq = vidxd->wq; + u32 status; + + dev_dbg(dev, "%s\n", __func__); + if (wqcfg->wq_state != IDXD_WQ_DEV_ENABLED) { + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_WQ_NOT_EN); + return; + } + + idxd_wq_drain(wq, &status); + if (status) { + dev_dbg(dev, "wq drain failed: %#x\n", status); + idxd_complete_command(vidxd, status); + return; + } + + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +static void vidxd_abort_all(struct vdcm_idxd *vidxd) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + struct idxd_wq *wq = vidxd->wq; + + dev_dbg(dev, "%s\n", __func__); + if (wq_dedicated(wq)) + idxd_wq_abort(wq, NULL); + else + idxd_wq_drain(wq, NULL); + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +static void vidxd_wq_abort(struct vdcm_idxd *vidxd, int val) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + u8 *bar0 = vidxd->bar0; + union wqcfg *wqcfg = 
(union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET); + struct idxd_wq *wq = vidxd->wq; + u32 status; + + dev_dbg(dev, "%s\n", __func__); + if (wqcfg->wq_state != IDXD_WQ_DEV_ENABLED) { + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_WQ_NOT_EN); + return; + } + + if (wq_dedicated(wq)) + idxd_wq_abort(wq, &status); + else + idxd_wq_drain(wq, &status); + if (status) { + dev_dbg(dev, "wq abort failed: %#x\n", status); + idxd_complete_command(vidxd, status); + return; + } + + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +void vidxd_reset(struct vdcm_idxd *vidxd) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + u8 *bar0 = vidxd->bar0; + union gensts_reg *gensts = (union gensts_reg *)(bar0 + IDXD_GENSTATS_OFFSET); + struct idxd_wq *wq; + + dev_dbg(dev, "%s\n", __func__); + gensts->state = IDXD_DEVICE_STATE_DRAIN; + wq = vidxd->wq; + + if (wq->state == IDXD_WQ_ENABLED) { + if (wq_dedicated(wq)) { + idxd_wq_abort(wq, NULL); + idxd_wq_disable(wq, false, NULL); + } else { + idxd_wq_drain(wq, NULL); + } + } + + vidxd_mmio_reset(vidxd); + gensts->state = IDXD_DEVICE_STATE_DISABLED; + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +static void vidxd_wq_reset(struct vdcm_idxd *vidxd, int wq_id_mask) +{ + struct idxd_wq *wq; + u8 *bar0 = vidxd->bar0; + union wqcfg *wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET); + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + u32 status; + + wq = vidxd->wq; + dev_dbg(dev, "vidxd reset wq %u:%u\n", 0, wq->id); + + if (wqcfg->wq_state != IDXD_WQ_DEV_ENABLED) { + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_WQ_NOT_EN); + return; + } + + if (wq_dedicated(wq)) { + idxd_wq_abort(wq, &status); + if (status) { + dev_dbg(dev, "vidxd reset wq failed to abort: %#x\n", status); + idxd_complete_command(vidxd, status); + return; + } + + idxd_wq_disable(wq, false, &status); + if (status) { + dev_dbg(dev, "vidxd reset wq failed to disable: %#x\n", status); + idxd_complete_command(vidxd, status); + return; + } + } else { + idxd_wq_drain(wq, &status); + if (status) { + dev_dbg(dev, "vidxd reset wq failed to drain: %#x\n", status); + idxd_complete_command(vidxd, status); + return; + } + } + + wqcfg->wq_state = IDXD_WQ_DEV_DISABLED; + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +static void vidxd_alloc_int_handle(struct vdcm_idxd *vidxd, int operand) +{ + bool ims = !!(operand & CMD_INT_HANDLE_IMS); + u32 cmdsts; + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + int ims_idx, vidx; + + vidx = operand & GENMASK(15, 0); + + dev_dbg(dev, "allocating int handle for %d\n", vidx); + + /* vidx cannot be 0 since that's emulated and does not require IMS handle */ + if (vidx <= 0 || vidx >= VIDXD_MAX_MSIX_VECS) { + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_INVAL_INT_IDX); + return; + } + + if (ims) { + dev_warn(dev, "IMS allocation is not implemented yet\n"); + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_NO_HANDLE); + return; + } + + ims_idx = dev_msi_hwirq(dev, vidx - 1); + cmdsts = ims_idx << IDXD_CMDSTS_RES_SHIFT; + dev_dbg(dev, "requested index %d handle %d\n", vidx, ims_idx); + idxd_complete_command(vidxd, cmdsts); +} + +static void vidxd_revoked_handles_processed (struct vdcm_idxd *vidxd, + int operand) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + struct idxd_virtual_wq *vwq = &vidxd->vwq; + int idx; + u32 status; + + printk("completed revoked int handle\n"); + + idxd_complete_command(vidxd, 0); + + 
BUG_ON(!list_empty(&vwq->head)); + + /* Step 1. Drain all the WQs associated with this VM. Currently only 1 */ + idxd_wq_drain(vidxd->wq, &status); + + if (status) + dev_dbg(dev, "wq drain failed: %#x\n", status); + + /* Step 2. Generate a completion interrupt for all int handles */ + for (idx = 1; idx < VIDXD_MAX_MSIX_VECS; idx++) { + dev_dbg(dev, "revoked int handle processed idx %d\n", idx); + vidxd_send_interrupt(vidxd, idx); + } +} + +static void vidxd_release_int_handle(struct vdcm_idxd *vidxd, int operand) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + bool ims = !!(operand & CMD_INT_HANDLE_IMS); + int handle, i; + bool found = false; + + handle = operand & GENMASK(15, 0); + dev_dbg(dev, "allocating int handle %d\n", handle); + + if (ims) { + dev_warn(dev, "IMS allocation is not implemented yet\n"); + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_INVAL_INT_IDX_RELEASE); + return; + } + + /* IMS backed entry start at 1, 0 is emulated vector */ + for (i = 0; i < VIDXD_MAX_MSIX_VECS - 1; i++) { + if (dev_msi_hwirq(dev, i) == handle) { + found = true; + break; + } + } + + if (!found) { + dev_warn(dev, "Freeing unallocated int handle.\n"); + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_INVAL_INT_IDX_RELEASE); + } + + dev_dbg(dev, "int handle %d released.\n", handle); + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +static void vidxd_wq_enable(struct vdcm_idxd *vidxd, int wq_id) +{ + struct idxd_wq *wq; + u8 *bar0 = vidxd->bar0; + union wq_cap_reg *wqcap; + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + struct idxd_device *idxd; + union wqcfg *vwqcfg, *wqcfg; + int rc; + bool wq_pasid_enable; + bool pasid_enabled = (*(u16 *)&vidxd->cfg[VIDXD_PASID_OFFSET + 6]) & 1U; + + if (wq_id >= VIDXD_MAX_WQS) { + idxd_complete_command(vidxd, IDXD_CMDSTS_INVAL_WQIDX); + return; + } + + idxd = vidxd->idxd; + wq = vidxd->wq; + + dev_dbg(dev, "%s: wq %u:%u\n", __func__, wq_id, wq->id); + + vwqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET + wq_id * 32); + wqcap = (union wq_cap_reg *)(bar0 + IDXD_WQCAP_OFFSET); + wqcfg = wq->wqcfg; + + if (vidxd_state(vidxd) != IDXD_DEVICE_STATE_ENABLED) { + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_DEV_NOTEN); + return; + } + + if (vwqcfg->wq_state != IDXD_WQ_DEV_DISABLED) { + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_WQ_ENABLED); + return; + } + + if ((!wq_dedicated(wq) && wqcap->shared_mode == 0) || + (wq_dedicated(wq) && wqcap->dedicated_mode == 0)) { + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_WQ_MODE); + return; + } + + if ((!wq_dedicated(wq) && vwqcfg->pasid_en == 0) || + (vwqcfg->pasid_en && pasid_enabled == 0)) { + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_PASID_EN); + return; + } + + wq_pasid_enable = vwqcfg->pasid_en; + + if (wq_dedicated(wq)) { + u32 wq_pasid; + bool priv; + + if (wq_pasid_enable) { + u32 gpasid; + + priv = vwqcfg->priv; + gpasid = vwqcfg->pasid; + rc = idxd_mdev_get_host_pasid(mdev, gpasid, &wq_pasid); + } else { + priv = 1; + rc = idxd_mdev_get_pasid(mdev, &wq_pasid); + } + if (rc < 0) { + dev_err(dev, "idxd pasid setup failed wq %d: %d\n", wq->id, rc); + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_PASID_EN); + return; + } + + if (wq_pasid >= 0) { + u32 status; + unsigned long flags; + + wqcfg->bits[WQCFG_PASID_IDX] &= ~GENMASK(29, 8); + wqcfg->priv = priv; + wqcfg->pasid_en = 1; + wqcfg->pasid = wq_pasid; + dev_dbg(dev, "program pasid %d in wq %d\n", wq_pasid, wq->id); + spin_lock_irqsave(&idxd->dev_lock, flags); + 
idxd_wq_setup_pasid(wq, wq_pasid); + idxd_wq_setup_priv(wq, priv); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + idxd_wq_enable(wq, &status); + if (status) { + dev_err(dev, "vidxd enable wq %d failed\n", wq->id); + idxd_complete_command(vidxd, status); + return; + } + } else { + dev_err(dev, "idxd pasid setup failed wq %d wq_pasid %d\n", + wq->id, wq_pasid); + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_PASID_EN); + return; + } + } + + vwqcfg->wq_state = IDXD_WQ_DEV_ENABLED; + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +static void vidxd_wq_disable(struct vdcm_idxd *vidxd, int wq_id_mask) +{ + struct idxd_wq *wq; + union wqcfg *wqcfg, *vwqcfg; + u8 *bar0 = vidxd->bar0; + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + u32 status; + + wq = vidxd->wq; + + dev_dbg(dev, "vidxd disable wq %u:%u\n", 0, wq->id); + + wqcfg = wq->wqcfg; + vwqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET); + if (vwqcfg->wq_state != IDXD_WQ_DEV_ENABLED) { + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_WQ_NOT_EN); + return; + } + + /* If it is a DWQ, need to disable the DWQ as well */ + if (wq_dedicated(wq)) { + struct ioasid_set *ioasid_set; + struct mm_struct *mm; + + idxd_wq_disable(wq, false, &status); + if (status) { + dev_warn(dev, "vidxd disable wq failed: %#x\n", status); + idxd_complete_command(vidxd, status); + return; + } + + if (vwqcfg->pasid_en) { + mm = get_task_mm(current); + if (!mm) { + dev_dbg(dev, "Can't retrieve task mm\n"); + return; + } + + ioasid_set = ioasid_find_mm_set(mm); + if (!ioasid_set) { + dev_dbg(dev, "Unable to find ioasid_set\n"); + mmput(mm); + return; + } + mmput(mm); + if (!ioasid_put(ioasid_set, wqcfg->pasid)) + dev_warn(dev, "Unable to put ioasid\n"); + } + } else { + idxd_wq_drain(wq, &status); + if (status) { + dev_warn(dev, "vidxd disable drain wq failed: %#x\n", status); + idxd_complete_command(vidxd, status); + return; + } + } + + vwqcfg->wq_state = IDXD_WQ_DEV_DISABLED; + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +void vidxd_free_ims_entries(struct vdcm_idxd *vidxd) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + + msi_domain_free_irqs(dev_get_msi_domain(dev), dev); +} + +static bool command_supported(struct vdcm_idxd *vidxd, u32 cmd) +{ + u8 *bar0 = vidxd->bar0; + u32 *cmd_cap = (u32 *)(bar0 + IDXD_CMDCAP_OFFSET); + + return !!(*cmd_cap & BIT(cmd)); +} + +static void vidxd_do_command(struct vdcm_idxd *vidxd, u32 val) +{ + union idxd_command_reg *reg = (union idxd_command_reg *)(vidxd->bar0 + IDXD_CMD_OFFSET); + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + + reg->bits = val; + + dev_dbg(dev, "%s: cmd code: %u reg: %x\n", __func__, reg->cmd, reg->bits); + + if (!command_supported(vidxd, reg->cmd)) { + idxd_complete_command(vidxd, IDXD_CMDSTS_INVAL_CMD); + return; + } + + switch (reg->cmd) { + case IDXD_CMD_ENABLE_DEVICE: + vidxd_enable(vidxd); + break; + case IDXD_CMD_DISABLE_DEVICE: + vidxd_disable(vidxd); + break; + case IDXD_CMD_DRAIN_ALL: + vidxd_drain_all(vidxd); + break; + case IDXD_CMD_ABORT_ALL: + vidxd_abort_all(vidxd); + break; + case IDXD_CMD_RESET_DEVICE: + vidxd_reset(vidxd); + break; + case IDXD_CMD_ENABLE_WQ: + vidxd_wq_enable(vidxd, reg->operand); + break; + case IDXD_CMD_DISABLE_WQ: + vidxd_wq_disable(vidxd, reg->operand); + break; + case IDXD_CMD_DRAIN_WQ: + vidxd_wq_drain(vidxd, reg->operand); + break; + case IDXD_CMD_ABORT_WQ: + vidxd_wq_abort(vidxd, reg->operand); + break; + case 
IDXD_CMD_RESET_WQ: + vidxd_wq_reset(vidxd, reg->operand); + break; + case IDXD_CMD_REQUEST_INT_HANDLE: + vidxd_alloc_int_handle(vidxd, reg->operand); + break; + case IDXD_CMD_RELEASE_INT_HANDLE: + vidxd_release_int_handle(vidxd, reg->operand); + break; + case IDXD_CMD_REVOKED_HANDLES_PROCESSED: + vidxd_revoked_handles_processed(vidxd, reg->operand); + break; + default: + idxd_complete_command(vidxd, IDXD_CMDSTS_INVAL_CMD); + break; + } +} + +static void vidxd_send_errors(struct vdcm_idxd *vidxd) +{ + struct idxd_device *idxd = vidxd->idxd; + u8 *bar0 = vidxd->bar0; + union sw_err_reg *swerr = (union sw_err_reg *)(bar0 + IDXD_SWERR_OFFSET); + union genctrl_reg *genctrl = (union genctrl_reg *)(bar0 + IDXD_GENCTRL_OFFSET); + u32 *intcause = (u32 *)(bar0 + IDXD_INTCAUSE_OFFSET); + int i; + + lockdep_assert_held(&idxd->dev_lock); + + if (swerr->valid) { + if (!swerr->overflow) + swerr->overflow = 1; + return; + } + + for (i = 0; i < 4; i++) + swerr->bits[i] = idxd->sw_err.bits[i]; + + *intcause |= IDXD_INTC_ERR; + if (genctrl->softerr_int_en) + vidxd_send_interrupt(vidxd, 0); +} + +void idxd_wq_vidxd_send_errors(struct idxd_wq *wq) +{ + struct vdcm_idxd *vidxd; + + list_for_each_entry(vidxd, &wq->vdcm_list, list) + vidxd_send_errors(vidxd); +} diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index bce7c43657d5..19e4e532918f 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -131,6 +131,8 @@ enum dsa_completion_status { DSA_COMP_HW_ERR1, DSA_COMP_HW_ERR_DRB, DSA_COMP_TRANSLATION_FAIL, + DSA_ERR_PCI_CFG = 0x51, + DSA_ERR_CMD_REG, }; enum iax_completion_status { -- Gitee From 35f0c504afd2c64741cb0bf82a014b7e2863aec7 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 13 Jan 2022 09:41:14 -0800 Subject: [PATCH 1848/2161] iommu/vt-d: Fix PCI bus rescan device hot add commit 316f92a705a4c2bf4712135180d56f3cca09243a upstream. During PCI bus rescan, adding new devices involve two notifiers. 1. dmar_pci_bus_notifier() 2. iommu_bus_notifier() The current code sets #1 as low priority (INT_MIN) which resulted in #2 being invoked first. The result is that struct device pointer cannot be found in DRHD search for the new device's DMAR/IOMMU. Subsequently, the device is put under the "catch-all" IOMMU instead of the correct one. This could cause system hang when device TLB invalidation is sent to the wrong IOMMU. Invalidation timeout error or hard lockup can be observed. This patch fixes the issue by setting a higher priority for dmar_pci_bus_notifier. DRHD search for a new device will find the correct IOMMU. Reported-by: Zhang, Bernice Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/dmar.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 333aa51ee309..c23eeafdf51b 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -387,7 +387,7 @@ static int dmar_pci_bus_notifier(struct notifier_block *nb, static struct notifier_block dmar_pci_bus_nb = { .notifier_call = dmar_pci_bus_notifier, - .priority = INT_MIN, + .priority = INT_MAX, }; static struct dmar_drhd_unit * -- Gitee From c5fe6ee811765cc47050f52e793d3bda788fe900 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 30 Apr 2021 13:22:46 +0800 Subject: [PATCH 1849/2161] perf/x86: Avoid touching LBR_TOS MSR for Arch LBR commit 3317c26a4b413b41364f2c4b83c778c6aba1576d upstream. The Architecture LBR does not have MSR_LBR_TOS (0x000001c9). 
In a guest that should support Architecture LBR, check_msr() will be a non-related check for the architecture MSR 0x0 (IA32_P5_MC_ADDR) that is also not supported by KVM. The failure will cause x86_pmu.lbr_nr = 0, thereby preventing the initialization of the guest Arch LBR. Fix it by avoiding this extraneous check in intel_pmu_init() for Arch LBR. Fixes: 47125db27e47 ("perf/x86/intel/lbr: Support Architectural LBR") Signed-off-by: Like Xu [peterz: simpler still] Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210430052247.3079672-1-like.xu@linux.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 10183b167dfc..89795c2e3a24 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5515,7 +5515,7 @@ __init int intel_pmu_init(void) * Check all LBT MSR here. * Disable LBR access if any LBR MSRs can not be accessed. */ - if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL)) + if (x86_pmu.lbr_tos && !check_msr(x86_pmu.lbr_tos, 0x3UL)) x86_pmu.lbr_nr = 0; for (i = 0; i < x86_pmu.lbr_nr; i++) { if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) && -- Gitee From 64eb5a393438e4a6d5de816a061a3dc8345d4b98 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 21 Jan 2022 10:18:12 -0700 Subject: [PATCH 1850/2161] dmaengine: idxd: remove shared wq restriction on config load commit 684faefea881973dbada146515c3902960efed41 Intel-BKC. Shared WQ restriction accidentally left in idxd_wq_load_config(). Not needed when we support swq in VDCM. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 993b7fccf6a0..d237e1f7e98e 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -1118,14 +1118,24 @@ static int idxd_wq_load_config(struct idxd_wq *wq) wq->size = wq->wqcfg->wq_size; wq->threshold = wq->wqcfg->wq_thresh; - /* The driver does not support shared WQ mode in read-only config yet */ - if (wq->wqcfg->mode == 0 || wq->wqcfg->pasid_en) - return -EOPNOTSUPP; - - set_bit(WQ_FLAG_DEDICATED, &wq->flags); + if (wq->wqcfg->mode) + set_bit(WQ_FLAG_DEDICATED, &wq->flags); wq->priority = wq->wqcfg->priority; + if (wq->wqcfg->bof) + set_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags); + + if (device_pasid_enabled(idxd) && wq->wqcfg->mode == 1) { + wq->wqcfg->pasid_en = 1; + wq->wqcfg->pasid = idxd->pasid; + wqcfg_offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PASID_IDX); + iowrite32(wq->wqcfg->bits[WQCFG_PASID_IDX], idxd->reg_base + wqcfg_offset); + } + + if (wq->wqcfg->mode_support) + set_bit(WQ_FLAG_MODE_1, &wq->flags); + for (i = 0; i < WQCFG_STRIDES(idxd); i++) { wqcfg_offset = WQCFG_OFFSET(idxd, wq->id, i); dev_dbg(dev, "WQ[%d][%d][%#x]: %#x\n", wq->id, i, wqcfg_offset, wq->wqcfg->bits[i]); -- Gitee From eef2d2f4d3819b463ee890c2b1e17f377f22eddc Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 25 Jan 2022 16:40:49 -0700 Subject: [PATCH 1851/2161] dmaengine: idxd: fix wq reset on RO config commit d249f9c11aad36451ae9f723b727eb9a55ddec8e Intel-BKC. WQ reset clears all configuration. However, for devices that are RO, the internal data structure should not be cleared. Skip when device is not configurable (RO). 
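The change boils down to making the software-side cleanup conditional on whether the device configuration is writable, while still forcing the WQ back to the disabled state. A minimal userspace sketch of that guard follows; the structure, field names and flag are purely illustrative and are not the driver's own:

    /* Illustrative sketch only; not the idxd driver's data structures. */
    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    struct toy_wq {
        int  priority;      /* part of the loaded (possibly RO) configuration */
        bool dedicated;
        int  state;         /* 0 == disabled */
    };

    /* Reset always returns the WQ to the disabled state, but the cached
     * configuration is wiped only when the device config is writable. */
    static void toy_wq_reset(struct toy_wq *wq, bool configurable)
    {
        if (configurable)
            memset(wq, 0, sizeof(*wq));
        wq->state = 0;
    }

    int main(void)
    {
        struct toy_wq wq = { .priority = 7, .dedicated = true, .state = 1 };

        toy_wq_reset(&wq, false);   /* RO config: priority/dedicated survive */
        printf("priority=%d dedicated=%d state=%d\n",
               wq.priority, wq.dedicated, wq.state);
        return 0;
    }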
Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index d237e1f7e98e..25f8c9f79200 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -301,7 +301,8 @@ void idxd_wq_reset(struct idxd_wq *wq) operand = BIT(wq->id % 16) | ((wq->id / 16) << 16); idxd_cmd_exec(idxd, IDXD_CMD_RESET_WQ, operand, NULL); - idxd_wq_disable_cleanup(wq); + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + idxd_wq_disable_cleanup(wq); wq->state = IDXD_WQ_DISABLED; } EXPORT_SYMBOL_GPL(idxd_wq_reset); -- Gitee From 6a8cfd6bee109e85aadbdb62e55ea3df3ab76065 Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Date: Thu, 19 Aug 2021 09:28:44 -0700 Subject: [PATCH 1852/2161] IDXD/MDEV: Add support for handling 8 byte MMIO accesses. commit 4dabc644d8bbec7a8bdaa16d6e8486b1ed089f0a Intel-BKC. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/mdev.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c index 69ba0f67052a..9a8b428e0391 100644 --- a/drivers/vfio/mdev/idxd/mdev.c +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -1198,7 +1198,19 @@ static ssize_t idxd_vdcm_read(struct vfio_device *vdev, char __user *buf, size_t while (count) { size_t filled; - if (count >= 4 && !(*ppos % 4)) { + if (count >= 8 && !(*ppos % 8)) { + u64 val; + + rc = idxd_vdcm_rw(vdev, (char *)&val, sizeof(val), + ppos, IDXD_VDCM_READ); + if (rc <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 8; + } else if (count >= 4 && !(*ppos % 4)) { u32 val; rc = idxd_vdcm_rw(vdev, (char *)&val, sizeof(val), @@ -1283,7 +1295,19 @@ static ssize_t idxd_vdcm_write(struct vfio_device *vdev, const char __user *buf, while (count) { size_t filled; - if (count >= 4 && !(*ppos % 4)) { + if (count >= 8 && !(*ppos % 8)) { + u64 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + rc = idxd_vdcm_rw(vdev, (char *)&val, sizeof(val), + ppos, IDXD_VDCM_WRITE); + if (rc <= 0) + goto write_err; + + filled = 8; + } else if (count >= 4 && !(*ppos % 4)) { u32 val; if (copy_from_user(&val, buf, sizeof(val))) -- Gitee From fa8e77b8f9d92483f19ebfe90dbbe2373a0a4068 Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Date: Thu, 30 Sep 2021 03:04:53 -0700 Subject: [PATCH 1853/2161] IDXD/MDEV: Fix the code to return retries on ENQCMD and ENQCMDS instructions when the vDEV is paused commit 518ad28138575b43a6688c02b43dcc52b284fc8f Intel-BKC. Fix the code to return retries on ENQCMD and ENQCMDS instructions when the vDEV is paused. Also fix the code where we were returning without unlocking the mig_submit_lock mutex on errors. Also use the exact same portal (within the MMIO page) to re-submit the work on destination that was used to submit the work on source machine during live VM migration. 
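The fixed behaviour can be summarised as: every exit from the submission path must drop the submit lock, a paused device queues work for a dedicated WQ but asks the guest to retry for a shared WQ, and the portal offset chosen by the guest is preserved for resubmission. A small self-contained sketch of that control flow, using a pthread mutex and made-up names in place of the driver's:

    /* Illustrative sketch only; types and names are invented. */
    #include <errno.h>
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t submit_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Returns 0 on success, -EAGAIN when the caller should retry, another
     * negative errno on failure; the lock is dropped on every path. */
    static int toy_submit(bool paused, bool dedicated, unsigned int portal_id)
    {
        int rc = 0;

        pthread_mutex_lock(&submit_lock);
        if (paused) {
            if (dedicated)
                printf("queue descriptor for portal %u\n", portal_id);
            else
                rc = -EAGAIN;      /* shared WQ while paused: guest retries */
            goto out_unlock;
        }
        /* not paused: submit through the same portal offset the guest used */
        printf("submit via portal offset %#x\n", portal_id << 6);

    out_unlock:
        pthread_mutex_unlock(&submit_lock);
        return rc;
    }

    int main(void)
    {
        printf("rc=%d\n", toy_submit(true, false, 3));   /* expect -EAGAIN */
        printf("rc=%d\n", toy_submit(false, true, 3));
        return 0;
    }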
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/mdev.c | 4 ++ drivers/vfio/mdev/idxd/mdev.h | 1 + drivers/vfio/mdev/idxd/vdev.c | 69 ++++++++++++++++++++++++----------- 3 files changed, 52 insertions(+), 22 deletions(-) diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c index 9a8b428e0391..f13198604020 100644 --- a/drivers/vfio/mdev/idxd/mdev.c +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -579,6 +579,8 @@ static int vidxd_resubmit_pending_descs (struct vdcm_idxd *vidxd, *offset += sizeof(el); portal = wq->portal; + portal += (el.portal_id << 6); + pr_info("submitting a desc to WQ %d:%d ded %d\n", i, wq->id, wq_dedicated(wq)); if (wq_dedicated(wq)) { @@ -597,6 +599,8 @@ static int vidxd_resubmit_pending_descs (struct vdcm_idxd *vidxd, continue; } hw->pasid = hpasid; + /* FIXME: Allow enqcmds to retry a few times + * before failing */ rc = enqcmds(portal, el.work_desc); if (rc < 0) { pr_info("%s: enqcmds failed\n", __func__); diff --git a/drivers/vfio/mdev/idxd/mdev.h b/drivers/vfio/mdev/idxd/mdev.h index 775f4927b2e3..8af1bb8ee21a 100644 --- a/drivers/vfio/mdev/idxd/mdev.h +++ b/drivers/vfio/mdev/idxd/mdev.h @@ -50,6 +50,7 @@ struct ioasid_mm_entry { struct idxd_wq_desc_elem { enum idxd_portal_prot portal_prot; + u8 portal_id; u8 work_desc[IDXD_DESC_SIZE]; struct list_head link; }; diff --git a/drivers/vfio/mdev/idxd/vdev.c b/drivers/vfio/mdev/idxd/vdev.c index 36701a1757be..82eb4ac1aca8 100644 --- a/drivers/vfio/mdev/idxd/vdev.c +++ b/drivers/vfio/mdev/idxd/vdev.c @@ -284,6 +284,7 @@ int vidxd_portal_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, struct idxd_wq *wq; struct idxd_wq_portal *portal; enum idxd_portal_prot portal_prot = IDXD_PORTAL_UNLIMITED; + int rc = 0; BUG_ON((size & (size - 1)) != 0); BUG_ON(size > 64); @@ -316,51 +317,75 @@ int vidxd_portal_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, printk("desc: %016llx %016llx %016llx %016llx %016llx %016llx %016llx %016llx\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]); - if (wq_dedicated(wq) && vwq->ndescs == wq->size) { - printk("can't submit more descriptors than WQ size. Dropping.\n"); - memset(&portal->data, 0, IDXD_DESC_SIZE); - portal->count = 0; - return 0; - } - mutex_lock(&vidxd->mig_submit_lock); if (vidxd->paused) { - elem = kmalloc(sizeof(struct idxd_wq_desc_elem), + if (wq_dedicated(wq)) { + /* Queue the descriptor if submitted to DWQ */ + if (vwq->ndescs == wq->size) { + printk("can't submit more descriptors than WQ size. 
Dropping.\n"); + goto out_unlock; + } + + elem = kmalloc(sizeof(struct idxd_wq_desc_elem), GFP_KERNEL); - if (elem == NULL) { - printk("kmalloc failed\n"); - return 0; - } - printk("queuing the desc\n"); - memcpy(elem->work_desc, portal->data, IDXD_DESC_SIZE); - elem->portal_prot = portal_prot; + if (elem == NULL) { + printk("kmalloc failed\n"); + rc = -ENOMEM; + goto out_unlock; + } + printk("queuing the desc\n"); + memcpy(elem->work_desc, portal->data, IDXD_DESC_SIZE); + elem->portal_prot = portal_prot; + elem->portal_id = portal_id; - list_add_tail(&elem->link, &vwq->head); - vwq->ndescs++; + list_add_tail(&elem->link, &vwq->head); + vwq->ndescs++; + } else { + /* Return retry if submitted to SWQ */ + rc = -EAGAIN; + goto out_unlock; + } } else { void __iomem *wq_portal; - - wq_portal = wq->portal; + portal = wq->portal; + wq_portal += (portal_id << 6); printk("submitting a desc to WQ %d ded %d\n", wq->id, wq_dedicated(wq)); if (wq_dedicated(wq)) { iosubmit_cmds512(wq_portal, (struct dsa_hw_desc *)p, 1); } else { int rc; - rc = enqcmds(wq_portal, (struct dsa_hw_desc *)p); + struct dsa_hw_desc *hw = + (struct dsa_hw_desc *)portal->data; + int hpasid, gpasid = hw->pasid; + + /* Translate the gpasid in the descriptor */ + rc = idxd_mdev_get_host_pasid(vidxd->ivdev.mdev, + gpasid, &hpasid); + if (rc < 0) { + pr_info("gpasid->hpasid trans failed\n"); + rc = -EINVAL; + goto out_unlock; + } + hw->pasid = hpasid; + + /* FIXME: Allow enqcmds to retry a few times + * before failing */ + rc = enqcmds(wq_portal, hw); if (rc < 0) { pr_info("%s: enqcmds failed\n", __func__); - return rc; + goto out_unlock; } } } +out_unlock: mutex_unlock(&vidxd->mig_submit_lock); memset(&portal->data, 0, IDXD_DESC_SIZE); portal->count = 0; } - return 0; + return rc; } int vidxd_mmio_read(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size) -- Gitee From 5a1b3fa73090597443d0fc426cacca69715371dc Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Date: Thu, 30 Sep 2021 03:07:34 -0700 Subject: [PATCH 1854/2161] IDXD/MDEV: Initialize the portal data structures commit bc1e35fa85c94cb526920ba8fa4154e5b06f2002 Intel-BKC. Initialize the portal data structures properly before assigning the vDEV to the VM. Also initialize the mig_submit_lock mutex. 
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/mdev.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c index f13198604020..1bfae9377c7f 100644 --- a/drivers/vfio/mdev/idxd/mdev.c +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -219,12 +219,19 @@ static inline void reset_vmmio(struct vdcm_idxd *vidxd) memset(&vidxd->bar0, 0, VIDXD_MAX_MMIO_SPACE_SZ); } +static inline void vidxd_vwq_init(struct vdcm_idxd *vidxd) +{ + INIT_LIST_HEAD(&vidxd->vwq.head); + vidxd->vwq.ndescs = 0; + + memset(vidxd->vwq.portals, 0, + VIDXD_MAX_PORTALS * sizeof(struct idxd_wq_portal)); +} + static void idxd_vdcm_init(struct vdcm_idxd *vidxd) { struct idxd_wq *wq = vidxd->wq; - INIT_LIST_HEAD(&vidxd->vwq.head); - reset_vconfig(vidxd); reset_vmmio(vidxd); @@ -233,6 +240,7 @@ static void idxd_vdcm_init(struct vdcm_idxd *vidxd) vidxd_mmio_init(vidxd); + vidxd_vwq_init(vidxd); if (wq_dedicated(wq) && wq->state == IDXD_WQ_ENABLED) { idxd_wq_disable(wq, false, NULL); wq->state = IDXD_WQ_LOCKED; @@ -886,6 +894,7 @@ static struct vdcm_idxd *vdcm_vidxd_create(struct idxd_device *idxd, struct mdev mdev_set_iommu_fault_data(mdev, &vidxd->vfio_pdev); + mutex_init(&vidxd->mig_submit_lock); vidxd->vfio_pdev.migops = &vidxd_migops; rc = vfio_pci_migration_init(&vidxd->vfio_pdev, VIDXD_STATE_BUFFER_SIZE); if (rc) -- Gitee From 27b87ab295dc0b01396aa5814c6742babea41e03 Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Date: Thu, 2 Dec 2021 05:02:26 -0800 Subject: [PATCH 1855/2161] IDXD_MDEV: Fix the order of resuming IMS entries and resubmitting the queued descriptors commit e01610832e9efcfe9e83ebf2612ac087f336ee4f Intel-BKC. Fix the order of resuming IMS entries and resubmitting the queued descriptors. Before this fix, we were submitting the descriptors before setting up the IMS entries which could cause descriptor failures at DSA. 
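The reordering encodes a plain dependency: interrupt (IMS) state must be restored before any queued descriptor is replayed, otherwise a completion could target an entry that is not set up yet. A tiny sketch of enforcing that ordering with a flag, using invented names:

    /* Illustrative sketch only; function names are invented. */
    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>

    static bool ims_ready;

    static void toy_resume_ims_state(void)
    {
        /* program the interrupt entries the guest expects */
        ims_ready = true;
        printf("IMS entries restored\n");
    }

    static void toy_resubmit_pending_descs(void)
    {
        /* replaying work before the interrupt path exists would lose or
         * misroute completions, so insist on the ordering */
        assert(ims_ready);
        printf("queued descriptors resubmitted\n");
    }

    int main(void)
    {
        toy_resume_ims_state();        /* step 1 */
        toy_resubmit_pending_descs();  /* step 2 */
        return 0;
    }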
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/mdev.c | 52 +++++++++++++++++------------------ 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c index 1bfae9377c7f..3a267908d74d 100644 --- a/drivers/vfio/mdev/idxd/mdev.c +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -347,30 +347,30 @@ static void vidxd_source_prepare_for_migration(struct vdcm_idxd *vidxd) memcpy(data_ptr + offset, (u8 *)&vidxd->bar0, sizeof(vidxd->bar0)); offset += sizeof(vidxd->bar0); - /* Save the queued descriptors */ - for (i = 0; i < vidxd->num_wqs; i++) { - struct idxd_wq_desc_elem *el; - - vwq = &vidxd->vwq; - memcpy(data_ptr + offset, (u8 *)&vwq->ndescs, sizeof(vwq->ndescs)); - offset += sizeof(vwq->ndescs); - list_for_each_entry(el, &vwq->head, link) { - dev_dbg(dev, "Saving descriptor at offset %x\n", offset); - memcpy(data_ptr + offset, (u8 *)el, sizeof(*el)); - offset += sizeof(*el); - } - } - /* Save int handle info */ for (i = 1; i < VIDXD_MAX_MSIX_VECS; i++) { u32 ims_idx = dev_msi_hwirq(dev, i - 1); /* Save the current handle in use */ - dev_dbg(dev, "Saving handle %d at offset %x\n", ims_idx, offset); + pr_info("Saving handle %d at offset %x\n", ims_idx, offset); memcpy(data_ptr + offset, (u8 *)&ims_idx, sizeof(ims_idx)); offset += sizeof(ims_idx); } + /* Save the queued descriptors */ + for (i = 0; i < vidxd->num_wqs; i++) { + struct idxd_wq_desc_elem *el; + vwq = &vidxd->vwq; + + memcpy(data_ptr + offset, (u8 *)&vwq->ndescs, sizeof(vwq->ndescs)); + offset += sizeof(vwq->ndescs); + list_for_each_entry(el, &vwq->head, link) { + printk("Saving descriptor at offset %x\n", offset); + memcpy(data_ptr + offset, (u8 *)el, sizeof(*el)); + offset += sizeof(*el); + } + } + mig_info->data_size = offset - mig_info->data_offset; mig_info->pending_bytes = offset - mig_info->data_offset; @@ -497,8 +497,8 @@ static unsigned int vidxd_dest_load_state(struct vdcm_idxd *vidxd) return offset; } -static int vidxd_dest_int_handle_revocation (struct vdcm_idxd *vidxd, - unsigned int *offset) +static int vidxd_resume_ims_state (struct vdcm_idxd *vidxd, + unsigned int *offset, bool *int_handle_revoked) { struct vfio_pci_core_device *vdev = &vidxd->vfio_pdev; u8 *data_ptr = (u8 *)vdev->mig_pages; @@ -507,7 +507,6 @@ static int vidxd_dest_int_handle_revocation (struct vdcm_idxd *vidxd, int rc = 0; struct mdev_device *mdev = vidxd->ivdev.mdev; struct device *dev = mdev_dev(mdev); - bool int_handle_revoked = false; /* Restore int handle info */ for (i = 1; i < VIDXD_MAX_MSIX_VECS; i++) { @@ -527,7 +526,7 @@ static int vidxd_dest_int_handle_revocation (struct vdcm_idxd *vidxd, if (revoked_handle != ims_idx) { /* Int Handle Revoked */ - int_handle_revoked = true; + *int_handle_revoked = true; } perm_val = *(u32 *)(bar0 + VIDXD_MSIX_PERM_OFFSET + i * 8); @@ -552,9 +551,6 @@ static int vidxd_dest_int_handle_revocation (struct vdcm_idxd *vidxd, } } - if (int_handle_revoked) - vidxd_notify_revoked_handles(vidxd); - return rc; } @@ -625,6 +621,7 @@ static int vidxd_dest_complete_migration(struct vdcm_idxd *vidxd) { int rc = 0; unsigned int offset; + bool int_handle_revoked = false; offset = vidxd_dest_load_state(vidxd); @@ -635,20 +632,23 @@ static int vidxd_dest_complete_migration(struct vdcm_idxd *vidxd) return rc; } - rc = vidxd_resubmit_pending_descs(vidxd, &offset); + rc = vidxd_resume_ims_state(vidxd, &offset, &int_handle_revoked); if (rc) { - pr_info("vidxd pending descs handling failed %d\n", rc); + pr_info("vidxd int handle revocation 
handling failed %d\n", rc); return rc; } - rc = vidxd_dest_int_handle_revocation(vidxd, &offset); + rc = vidxd_resubmit_pending_descs(vidxd, &offset); if (rc) { - pr_info("vidxd int handle revocation handling failed %d\n", rc); + pr_info("vidxd pending descs handling failed %d\n", rc); return rc; } + if (int_handle_revoked) + vidxd_notify_revoked_handles(vidxd); + return rc; } -- Gitee From de42b5c9cdb9580d0d83c2acbfaa0282d133d7d1 Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Date: Fri, 3 Dec 2021 00:22:54 -0800 Subject: [PATCH 1856/2161] IDXD_MDEV: Reset the vDEV paused state when the vDEV again switches to RUNNING state commit eb5bca3449560665dcc97775dc5b422fdd3b241a Intel-BKC. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/mdev.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c index 3a267908d74d..7ff1c87cecae 100644 --- a/drivers/vfio/mdev/idxd/mdev.c +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -677,6 +677,13 @@ static int vidxd_migration_state_change(struct vfio_pci_core_device *vfio_vdev, __func__, mig_info->device_state); if (mig_info->device_state & VFIO_DEVICE_STATE_RESUMING) vidxd_dest_complete_migration(vidxd); + + mutex_lock(&vidxd->mig_submit_lock); + /* The VMM may continue the VM after pausing it. So get ready + * for normal operation */ + vidxd->paused = false; + mutex_unlock(&vidxd->mig_submit_lock); + break; case VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING: pr_info("%s, VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING!!\n", __func__); -- Gitee From d37778c4b8feedbad2063613c19cedd396537e1a Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Wed, 15 Dec 2021 23:52:18 +0800 Subject: [PATCH 1857/2161] iommu/vtd: Split super page to 4KB when enabling FL/SL AD commit 3da2c830b0fced841a0c410dba07351b1fb15669 Intel-BKC. To accelerate dirty pages tracking performance, we should split big page to 4KB. 
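The split itself is address arithmetic: one present 2MB (or 1GB) leaf is replaced by 512 (or 512*512) 4KB leaves whose physical addresses advance in 4KB steps while the attribute bits are carried over. A standalone sketch of that arithmetic, independent of the VT-d page-table code; the bit positions and values are illustrative only:

    /* Illustrative sketch only; not real VT-d PTE layout. */
    #include <inttypes.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1ULL << PAGE_SHIFT)
    #define ATTR_MASK  0xfffULL           /* low bits hold R/W/etc. attributes */
    #define LARGE_BIT  (1ULL << 7)        /* "this is a superpage" flag */

    int main(void)
    {
        uint64_t big_pte = 0x40000000ULL | 0x3 | LARGE_BIT;  /* 2MB leaf, R/W */
        uint64_t attr    = big_pte & ATTR_MASK & ~LARGE_BIT;
        uint64_t phys    = big_pte & ~ATTR_MASK;
        unsigned int n   = 512;           /* 2MB / 4KB child entries */
        uint64_t *children = calloc(n, sizeof(*children));
        unsigned int i;

        if (!children)
            return 1;
        for (i = 0; i < n; i++)
            children[i] = (phys + (uint64_t)i * PAGE_SIZE) | attr;

        printf("child[0]=%#" PRIx64 " child[511]=%#" PRIx64 "\n",
               children[0], children[n - 1]);
        free(children);
        return 0;
    }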
Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 158 ++++++++++++++++++++++++++++++++++++ drivers/iommu/iommu.c | 45 ++++++++++ include/linux/intel-iommu.h | 1 + include/linux/iommu.h | 2 + 4 files changed, 206 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 27d2fe2e77fe..8bcc2f4d23e3 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1170,6 +1170,11 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, unsigned long pfn, int *target_level) { struct dma_pte *parent, *pte; + /* + * level == 5: 5 level page table; + * level == 4: 4 level page table; + * level == 3: 3 level page table; + */ int level = agaw_to_level(domain->agaw); int offset; @@ -6145,6 +6150,158 @@ static int intel_iommu_clear_dirty_log(struct iommu_domain *domain, return ret; } +static int +__domain_split_block(struct dmar_domain *domain, struct intel_iommu *iommu, + unsigned long iova, size_t size) +{ + struct dma_pte *pte = NULL; + unsigned long nr_pages = size >> VTD_PAGE_SHIFT; + unsigned long lvl_pages = 0, child_lvl_pages = 0; + unsigned long iov_pfn = iova >> VTD_PAGE_SHIFT; + unsigned int largepage_lvl = 0; + unsigned int create_pages = 0; + int i; + unsigned long page_pfn; + unsigned long phys_pfn; + phys_addr_t pteval; + u64 attr; + bool splitted = false; + + spin_lock(&iommu->lock); + while (nr_pages > 0) { + /* + * largepage_lvl == 1: 4KB page; + * largepage_lvl == 2: 2MB page; + * largepage_lvl == 3: 1GB page; + */ + largepage_lvl = hardware_largepage_caps(domain, iov_pfn, + 0, nr_pages); + + lvl_pages = lvl_to_nr_pages(largepage_lvl); + BUG_ON(nr_pages < lvl_pages); + + if (largepage_lvl <= 1) { + /* It is not large page*/ + pr_debug("The 0x%lx pte is not super page. largepage_lvl=%d\n", + iova, largepage_lvl); + goto skip; + } + + /* Get the current level pte, e.g. if largepage_lvl == 1, + * we get "SL-PDE: 4KB page" item which is assigned to pte. + */ + pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); + if (!pte || !dma_pte_present(pte)) + return -EINVAL; + + if (!(pte->val & DMA_PTE_WRITE)) { + pr_warn("The 0x%lx pte is READ ONLY.\n", iova); + goto skip; + } + + /* If the page is super, split it. */ + if (!dma_pte_superpage(pte)) { + pr_warn("The 0x%lx pte is not super page.\n", iova); + goto skip; + } + + phys_pfn = dma_pte_addr(pte); + page_pfn = iov_pfn; + attr = pte->val & VTD_ATTR_MASK; + pteval = ((phys_addr_t)phys_pfn) | attr; + pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; + if (largepage_lvl == 2) { + /* 2MB: Create 512 4KB items, i.e. 
split 2MB to 512 4KB pages */ + create_pages = 512; + } else { + /* 1GB: Create 512*512 4KB items */ + create_pages = 512 * 512; + } + + /* Change big page level to 4KB level */ + largepage_lvl = 1; + + /* Establish page table and get the 4KB page item */ + pte = NULL; + for (i = 0; i < create_pages; i++) { + if (!pte) { + pte = pfn_to_dma_pte(domain, page_pfn, &largepage_lvl); + if (!pte) + return -ENOMEM; + } + cmpxchg64_local(&pte->val, 0ULL, pteval); + /* Always 1 here */ + child_lvl_pages = lvl_to_nr_pages(largepage_lvl); + page_pfn += child_lvl_pages; + phys_pfn += child_lvl_pages; + pteval += child_lvl_pages * VTD_PAGE_SIZE; + pte++; + pr_debug("%s: Big page splitted to 4KB pages, pteval=0x%llx, pte=0x%llx\n", + __func__, pteval, (uint64_t)pte); + if (first_pte_in_page(pte)) + pte = NULL; + } + + splitted = true; +skip: + nr_pages -= lvl_pages; + iov_pfn += lvl_pages; + iova += lvl_pages * VTD_PAGE_SIZE; + } + spin_unlock(&iommu->lock); + + if (splitted) + return 1; + + return 0; +} + +static int intel_iommu_split_block(struct iommu_domain *domain, + unsigned long iova, size_t size) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct device_domain_info *info; + struct subdev_domain_info *sinfo; + int ret = 0; + unsigned long flags; + + if (!domain_use_first_level(dmar_domain) && !slad_support()) { + pr_err("Don't support SLAD\n"); + return -EINVAL; + } + + /* Return if size is less than 2MB */ + if (size < 0x200000) + return 0; + + spin_lock_irqsave(&device_domain_lock, flags); + list_for_each_entry(info, &dmar_domain->devices, link) { + pr_debug("%s: split for %02x:%02x.%d, iova=0x%lx, size=0x%zx\n", + __func__, info->bus, PCI_SLOT(info->devfn), PCI_FUNC(info->devfn), + iova, size); + + ret = __domain_split_block(dmar_domain, info->iommu, iova, size); + if (ret && ret != 1) + goto out; + } + + list_for_each_entry(sinfo, &dmar_domain->subdevices, link_domain) { + info = get_domain_info(sinfo->pdev); + pr_debug("%s: split for subdev %02x:%02x.%d, iova=0x%lx, size=0x%zx\n", + __func__, info->bus, PCI_SLOT(info->devfn), PCI_FUNC(info->devfn), + iova, size); + + ret = __domain_split_block(dmar_domain, info->iommu, iova, size); + if (ret && ret != 1) + break; + } + +out: + spin_unlock_irqrestore(&device_domain_lock, flags); + + return ret; +} + const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .domain_alloc = intel_iommu_domain_alloc, @@ -6184,6 +6341,7 @@ const struct iommu_ops intel_iommu_ops = { .sva_get_pasid = intel_svm_get_pasid, .page_response = intel_svm_page_response, #endif + .split_block = intel_iommu_split_block, .set_hwdbm = intel_iommu_set_hwdbm, .sync_dirty_log = intel_iommu_sync_dirty_log, .clear_dirty_log = intel_iommu_clear_dirty_log, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 2c9a80db0f63..7ba8447babff 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3059,6 +3059,46 @@ static int __init iommu_init(void) } core_initcall(iommu_init); +static int iommu_split_block(struct iommu_domain *domain, unsigned long iova, + size_t size) +{ + const struct iommu_ops *ops = domain->ops; + unsigned int min_pagesz; + size_t pgsize; + int ret = 0; + + if (unlikely(!ops)) + return -ENODEV; + + if (unlikely(!ops->split_block)) { + pr_warn("don't support split_block\n"); + return ret; + } + + min_pagesz = 1 << __ffs(domain->pgsize_bitmap); + if (!IS_ALIGNED(iova | size, min_pagesz)) { + pr_err("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n", + iova, size, min_pagesz); + return -EINVAL; + } + + while 
(size) { + pgsize = iommu_pgsize(domain, iova, size); + + ret = ops->split_block(domain, iova, pgsize); + if (ret) + break; + + pr_debug("split handled: iova 0x%lx size 0x%zx\n", iova, pgsize); + + iova += pgsize; + size -= pgsize; + } + iommu_flush_iotlb_all(domain); + + return ret; +} + int iommu_domain_set_hwdbm(struct iommu_domain *domain, bool enable, unsigned long iova, size_t size) { @@ -3071,6 +3111,11 @@ int iommu_domain_set_hwdbm(struct iommu_domain *domain, bool enable, } ret = ops->set_hwdbm(domain, enable, iova, size); + if (ret) + return ret; + + if (enable) + ret = iommu_split_block(domain, iova, size); return ret; } diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 558993a08b26..e9be3f6fd56f 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -32,6 +32,7 @@ #define VTD_PAGE_SIZE (1UL << VTD_PAGE_SHIFT) #define VTD_PAGE_MASK (((u64)-1) << VTD_PAGE_SHIFT) #define VTD_PAGE_ALIGN(addr) (((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK) +#define VTD_ATTR_MASK (0xfff) #define VTD_STRIDE_SHIFT (9) #define VTD_STRIDE_MASK (((u64)-1) << VTD_STRIDE_SHIFT) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index d215bc77f4ab..2accac04d414 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -346,6 +346,8 @@ struct iommu_ops { int (*def_domain_type)(struct device *dev); + int (*split_block)(struct iommu_domain *domain, unsigned long iova, + size_t size); int (*set_hwdbm)(struct iommu_domain *domain, bool enable, unsigned long iova, size_t size); -- Gitee From ef6adb6d9a26705c267d5b5b2d5d64ea2ec4cb01 Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Tue, 21 Dec 2021 00:03:12 +0800 Subject: [PATCH 1858/2161] iommu/vtd: Merge small pages back to big page commit fc93ebbf1374573d80a47e08c3210fdfb69b33c0 Intel-BKC. After dirty log tracking, we should merge small pages back to big page. 
Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 87 +++++++++++++++++++++++++++++++++++++ drivers/iommu/iommu.c | 76 ++++++++++++++++++++++++++++++++ include/linux/iommu.h | 2 + 3 files changed, 165 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 8bcc2f4d23e3..ba65e58b97da 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -6302,6 +6302,92 @@ static int intel_iommu_split_block(struct iommu_domain *domain, return ret; } +static int +__domain_merge_pages(struct dmar_domain *domain, struct intel_iommu *iommu, + unsigned long iova, phys_addr_t paddr, size_t size) +{ + struct dma_pte *pte = NULL; + unsigned int largepage_lvl = 0; + unsigned long iov_pfn = iova >> VTD_PAGE_SHIFT; + unsigned long start_pfn = iov_pfn; + unsigned long end_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; + struct page *freelist; + int prot = 0; + int ret = 0; + + /* Construct the big page again */ + largepage_lvl = 1; + pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); + if (!pte || !dma_pte_present(pte)) + return -EINVAL; + + if (pte->val & DMA_PTE_READ) + prot |= DMA_PTE_READ; + if (pte->val & DMA_PTE_WRITE) + prot |= DMA_PTE_WRITE; + if (pte->val & DMA_PTE_SNP) + prot |= DMA_PTE_SNP; + + pr_debug("%s: start_pfn=0x%lx, end_pfn=0x%lx, prot=0x%x, size=0x%zx, paddr=0x%llx\n", + __func__, start_pfn, end_pfn, prot, size, paddr); + + /* Free the splitted 4KB pages */ + freelist = domain_unmap(domain, start_pfn, end_pfn, NULL); + dma_free_pagelist(freelist); + + ret = __domain_mapping(domain, start_pfn, paddr >> VTD_PAGE_SHIFT, + size >> VTD_PAGE_SHIFT, prot); + + return ret; +} + +static int intel_iommu_merge_pages(struct iommu_domain *domain, unsigned long iova, + phys_addr_t phys, size_t size) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct device_domain_info *info; + struct subdev_domain_info *sinfo; + int ret = 0; + unsigned long flags; + + if (!domain_use_first_level(dmar_domain) && !slad_support()) { + pr_err("Don't support SLAD\n"); + return -EINVAL; + } + + /* Return if size is less than 2MB */ + if (size < 0x200000) + return 0; + + spin_lock_irqsave(&device_domain_lock, flags); + + list_for_each_entry(info, &dmar_domain->devices, link) { + pr_debug("%s: merge for %02x:%02x.%d, iova=0x%lx, phys=0x%llx, size=0x%zx\n", + __func__, info->bus, PCI_SLOT(info->devfn), PCI_FUNC(info->devfn), + iova, phys, size); + + ret = __domain_merge_pages(dmar_domain, info->iommu, iova, phys, size); + if (ret) + goto out; + } + + list_for_each_entry(sinfo, &dmar_domain->subdevices, link_domain) { + info = get_domain_info(sinfo->pdev); + pr_debug("%s: merge for subdev %02x:%02x.%d, iova=0x%lx, phys=0x%llx, size=0x%zx\n", + __func__, info->bus, PCI_SLOT(info->devfn), PCI_FUNC(info->devfn), + iova, phys, size); + + ret = __domain_merge_pages(dmar_domain, info->iommu, iova, phys, size); + if (ret) + break; + } + +out: + spin_unlock_irqrestore(&device_domain_lock, flags); + + return ret; +} + const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .domain_alloc = intel_iommu_domain_alloc, @@ -6341,6 +6427,7 @@ const struct iommu_ops intel_iommu_ops = { .sva_get_pasid = intel_svm_get_pasid, .page_response = intel_svm_page_response, #endif + .merge_pages = intel_iommu_merge_pages, .split_block = intel_iommu_split_block, .set_hwdbm = intel_iommu_set_hwdbm, .sync_dirty_log = intel_iommu_sync_dirty_log, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 
7ba8447babff..f3d60080f62d 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3059,6 +3059,80 @@ static int __init iommu_init(void) } core_initcall(iommu_init); +static int __iommu_merge_pages(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t size) +{ + const struct iommu_ops *ops = domain->ops; + unsigned int min_pagesz; + size_t pgsize; + int ret = 0; + + if (unlikely(!ops)) + return -ENODEV; + + if (unlikely(!ops->merge_pages)) { + pr_warn("don't support merge_pages\n"); + return ret; + } + + min_pagesz = 1 << __ffs(domain->pgsize_bitmap); + if (!IS_ALIGNED(iova | paddr | size, min_pagesz)) { + pr_err("unaligned: iova 0x%lx pa %pa size 0x%zx min_pagesz 0x%x\n", + iova, &paddr, size, min_pagesz); + return -EINVAL; + } + + while (size) { + pgsize = iommu_pgsize(domain, iova | paddr, size); + + ret = ops->merge_pages(domain, iova, paddr, pgsize); + if (ret) + break; + + pr_debug("merge handled: iova 0x%lx pa %pa size 0x%zx\n", + iova, &paddr, pgsize); + + iova += pgsize; + paddr += pgsize; + size -= pgsize; + } + + return ret; +} + +static int iommu_merge_pages(struct iommu_domain *domain, unsigned long iova, + size_t size) +{ + phys_addr_t phys; + dma_addr_t p, i; + size_t cont_size; + int ret = 0; + + while (size) { + phys = iommu_iova_to_phys(domain, iova); + cont_size = PAGE_SIZE; + p = phys + cont_size; + i = iova + cont_size; + + while (cont_size < size && p == iommu_iova_to_phys(domain, i)) { + p += PAGE_SIZE; + i += PAGE_SIZE; + cont_size += PAGE_SIZE; + } + + ret = __iommu_merge_pages(domain, iova, phys, cont_size); + if (ret) + break; + + iova += cont_size; + size -= cont_size; + } + iommu_flush_iotlb_all(domain); + + return ret; +} + static int iommu_split_block(struct iommu_domain *domain, unsigned long iova, size_t size) { @@ -3116,6 +3190,8 @@ int iommu_domain_set_hwdbm(struct iommu_domain *domain, bool enable, if (enable) ret = iommu_split_block(domain, iova, size); + else + ret = iommu_merge_pages(domain, iova, size); return ret; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2accac04d414..c8b458f6e84b 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -346,6 +346,8 @@ struct iommu_ops { int (*def_domain_type)(struct device *dev); + int (*merge_pages)(struct iommu_domain *domain, unsigned long iova, + phys_addr_t phys, size_t size); int (*split_block)(struct iommu_domain *domain, unsigned long iova, size_t size); int (*set_hwdbm)(struct iommu_domain *domain, bool enable, -- Gitee From dd94790d46d5a315679c30fbf70d32ea4523dc71 Mon Sep 17 00:00:00 2001 From: Yian Chen Date: Mon, 11 Oct 2021 17:21:29 -0700 Subject: [PATCH 1859/2161] iommu/vt-d: Enable ATS for the devices in SATC table commit 97f2f2c5317f55ae3440733a090a96a251da222b upstream. Starting from Intel VT-d v3.2, Intel platform BIOS can provide additional SATC table structure. SATC table includes a list of SoC integrated devices that support ATC (Address translation cache). All Intel SoC devices in SATC table are trusted to enable and use ATS. When IOMMU is working in scalable mode, software should enable ATS for every device in SATC table. On the other hand, if IOMMU is in legacy mode, ATS of SATC capable devices can work transparently to software and be automatically enabled by IOMMU hardware. As the result, software shouldn't enable ATS on these devices, otherwise duplicate device TLB invalidation will occur. 
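As a condensed illustration of the resulting ATS policy (the real logic lives in dev_satc_state() and dmar_ats_supported() in the diff below; the enable_ats variable and the ats_allowed_by_atsr() placeholder are illustrative):

    int satc = dev_satc_state(pdev); /* 1: ATS required, 0: optional, -1: not listed */
    bool enable_ats;

    if (satc >= 0) {
            /*
             * The device is listed in SATC, so it is trusted to use ATS.
             * If ATS is required and the IOMMU runs in legacy (non-scalable)
             * mode, hardware enables ATS transparently, so software must not
             * enable it again to avoid duplicate device TLB invalidation.
             */
            enable_ats = !(satc == 1 && !sm_supported(iommu));
    } else {
            /* Not listed in SATC: fall back to the existing ATSR checks. */
            enable_ats = ats_allowed_by_atsr(pdev); /* placeholder for the legacy path */
    }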
Signed-off-by: Yian Chen Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 53 ++++++++++++++++++++++++++++++++++--- include/linux/intel-iommu.h | 2 +- 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index ba65e58b97da..566bad68235c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -934,7 +934,6 @@ static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) return false; } - struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) { struct dmar_drhd_unit *drhd = NULL; @@ -2787,7 +2786,7 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, if (ecap_dev_iotlb_support(iommu->ecap) && pci_ats_supported(pdev) && - dmar_find_matched_atsr_unit(pdev)) + dmar_ats_supported(pdev, iommu)) info->ats_supported = 1; if (sm_supported(iommu)) { @@ -4156,7 +4155,42 @@ static void intel_iommu_free_dmars(void) } } -int dmar_find_matched_atsr_unit(struct pci_dev *dev) +/* dev_satc_state - Find if dev is in a DMAR SATC table + * + * return value: + * 1: dev is in STAC table and ATS is required + * 0: dev is in SATC table and ATS is optional + * -1: dev isn't in SATC table + */ +static int dev_satc_state(struct pci_dev *dev) +{ + int i, ret = -1; + struct device *tmp; + struct dmar_satc_unit *satcu; + struct acpi_dmar_satc *satc; + + dev = pci_physfn(dev); + rcu_read_lock(); + + list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { + satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); + if (satc->segment != pci_domain_nr(dev->bus)) + continue; + for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) + if (to_pci_dev(tmp) == dev) { + if (satc->flags) + ret = 1; + else + ret = 0; + goto out; + } + } +out: + rcu_read_unlock(); + return ret; +} + +int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) { int i, ret = 1; struct pci_bus *bus; @@ -4166,6 +4200,19 @@ int dmar_find_matched_atsr_unit(struct pci_dev *dev) struct dmar_atsr_unit *atsru; dev = pci_physfn(dev); + i = dev_satc_state(dev); + if (i >= 0) { + /* This dev supports ATS as it is in SATC table! + * When IOMMU is in legacy mode, enabling ATS is done + * automatically by HW for the device that requires + * ATS, hence OS should not enable this device ATS + * to avoid duplicated TLB invalidation + */ + if (i && !sm_supported(iommu)) + ret = 0; + return ret; + } + for (bus = dev->bus; bus; bus = bus->parent) { bridge = bus->self; /* If it's an integrated device, allow ATS */ diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index e9be3f6fd56f..58a2f37ebc9b 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -719,7 +719,7 @@ static inline int nr_pte_to_next_page(struct dma_pte *pte) } extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev); -extern int dmar_find_matched_atsr_unit(struct pci_dev *dev); +extern int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu); extern int dmar_enable_qi(struct intel_iommu *iommu); extern void dmar_disable_qi(struct intel_iommu *iommu); -- Gitee From 2ebc9905ed3c0da4f256d497eb04c3843047d5a0 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 22 Oct 2021 15:25:18 -0700 Subject: [PATCH 1860/2161] iommu/vt-d: Allow IOMMU bypass for SoC integrated devices commit 517284597772e4eaec7673e05dedf898d36dd422 Intel-BKC. 
SoC integrated devices listed under ACPI SATC (SoC Integrated Address Translation Cache) should be trusted for DMA. These devices are co-designed with the CPU and can be viewed as co-processors rather than ordinary peripherals. This patch leverages the existing infrastructure to mark devices under SATC device scope as identity mapping by default. This will in turn bypass DMA map ops. Drivers that invoke the DMA map APIs will get the physical address back in the returned dma_addr_t. Devices will be in hardware pass-through mode whenever available. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 566bad68235c..0c1132dab4b2 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -185,6 +185,8 @@ static inline unsigned long virt_to_dma_pfn(void *p) return page_to_dma_pfn(virt_to_page(p)); } +static int dev_satc_state(struct pci_dev *dev); + /* global iommu list, set NULL for ignored DMAR units */ static struct intel_iommu **g_iommus; @@ -3047,6 +3049,13 @@ static bool device_is_rmrr_locked(struct device *dev) return true; } +static bool device_has_satc(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return dev_satc_state(pdev) >= 0; +} + /* * Return the required default domain type for a specific device. * @@ -3068,6 +3077,11 @@ static int device_def_domain_type(struct device *dev) if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) return IOMMU_DOMAIN_IDENTITY; + + if (device_has_satc(dev)) { + dev_info(dev, "Use identity domain for SATC devices"); + return IOMMU_DOMAIN_IDENTITY; + } } return 0; -- Gitee From 588257d8120a9ebe23a0029bc6939f6a33a59c69 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sun, 25 Jul 2021 13:25:20 -0700 Subject: [PATCH 1861/2161] ioasid: Reserve a global PASID for in-kernel DMA commit 9ee131fbaf6b93f912ec55b31edfc1c0ac1a05e2 Intel-BKC. In-kernel DMA is managed by the DMA mapping APIs, which support a per-device addressing mode for legacy DMA requests. With the introduction of Process Address Space ID (PASID), device DMA can now target a finer granularity of PASID + Requester ID (RID). However, for in-kernel DMA there is no need to differentiate between legacy DMA and DMA with PASID in terms of mapping. DMA address mapping for RID+PASID can be made identical to the RID. The benefit for the drivers is the continuation of DMA mapping APIs without change. This patch reserves a special IOASID for devices that perform in-kernel DMA requests with PASID. This global IOASID is excluded from the IOASID allocator. The analogous case is PASID #0, a special PASID reserved for DMA requests without PASID (legacy). We could have different kernel PASIDs for individual devices, but for simplicity a globally reserved one fits the bill. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 4 ++-- drivers/iommu/intel/pasid.h | 3 +-- drivers/iommu/intel/svm.c | 2 +- drivers/iommu/ioasid.c | 2 ++ include/linux/ioasid.h | 4 ++++ 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 0c1132dab4b2..dcf3fe77ec14 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3319,7 +3319,7 @@ static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) * PASID range.
Host can partition guest PASID range based on * policies but it is out of guest's control. */ - if (min < PASID_MIN || max > intel_pasid_max_id) + if (min < IOASID_ALLOC_BASE || max > intel_pasid_max_id) return INVALID_IOASID; if (vcmd_alloc_pasid(iommu, &ioasid)) @@ -4915,7 +4915,7 @@ static int aux_domain_add_dev(struct dmar_domain *domain, u32 pasid; /* No private data needed for the default pasid */ - pasid = ioasid_alloc(host_pasid_set, PASID_MIN, + pasid = ioasid_alloc(host_pasid_set, IOASID_ALLOC_BASE, pci_max_pasids(to_pci_dev(dev)) - 1, NULL); if (pasid == INVALID_IOASID) { diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 3cc8757074c9..cc9149208792 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -10,8 +10,7 @@ #ifndef __INTEL_PASID_H #define __INTEL_PASID_H -#define PASID_RID2PASID 0x0 -#define PASID_MIN 0x1 +#define PASID_RID2PASID IOASID_DMA_NO_PASID #define PASID_MAX 0x100000 #define PASID_PTE_MASK 0x3F #define PASID_PTE_PRESENT 1 diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 891180234813..e9cd03477790 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -764,7 +764,7 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, pasid_max = intel_pasid_max_id; /* Do not use PASID 0, reserved for RID to PASID */ - svm->pasid = ioasid_alloc(host_pasid_set, PASID_MIN, + svm->pasid = ioasid_alloc(host_pasid_set, IOASID_ALLOC_BASE, pasid_max - 1, svm); if (svm->pasid == INVALID_IOASID) { kfree(svm); diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 5fdbb2619cca..572ce327c59c 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -803,6 +803,8 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, data->private = private; refcount_set(&data->refs, 1); + if (min < IOASID_ALLOC_BASE) + min = IOASID_ALLOC_BASE; /* * Custom allocator needs allocator data to perform platform specific * operations. diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index af37ea4bda85..aa349873825f 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -9,6 +9,10 @@ #include #define INVALID_IOASID ((ioasid_t)-1) +#define IOASID_DMA_NO_PASID 0 /* For DMA request w/o PASID */ +#define IOASID_DMA_PASID 1 /* For in-kernel DMA w/ PASID */ +#define IOASID_ALLOC_BASE 2 /* Start of the allocation */ + typedef unsigned int ioasid_t; typedef ioasid_t (*ioasid_alloc_fn_t)(ioasid_t min, ioasid_t max, void *data); typedef void (*ioasid_free_fn_t)(ioasid_t ioasid, void *data); -- Gitee From 4ae50b05318dbe1e1afd089be8e23454655c0f36 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 7 Jul 2022 15:32:36 +0800 Subject: [PATCH 1862/2161] iommu: Add PASID support for DMA mapping API users commit e1711caf3d8c69629c9b4aa5326de2bb2272fcf6 Intel-BKC. DMA mapping API is the de facto standard for in-kernel DMA. It operates on a per device/RID basis which is not PASID-aware. Some modern devices such as Intel Data Streaming Accelerator, PASID is required for certain work submissions. To allow such devices use DMA mapping API, we need the following functionalities: 1. Provide device a way to retrive a kernel PASID for work submission 2. Enable the kernel PASID on the IOMMU 3. Establish address space for the kernel PASID that matches the default domain. Let it be IOVA or physical address in case of pass-through. This patch introduces a driver facing API that enables DMA API PASID usage. Once enabled, device drivers can continue to use DMA APIs as is. 
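A minimal sketch of the intended driver usage follows; program_descriptor() is a hypothetical device-specific step, and only iommu_enable_pasid_dma()/iommu_disable_pasid_dma() are introduced by this patch, the rest are existing DMA API calls:

    static int submit_with_kernel_pasid(struct pci_dev *pdev, void *buf, size_t len)
    {
            ioasid_t pasid;
            dma_addr_t dma;

            pasid = iommu_enable_pasid_dma(&pdev->dev);
            if (pasid == INVALID_IOASID)
                    return -ENODEV;

            /* DMA mapping is unchanged; the same dma_addr_t is valid with PASID */
            dma = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
            if (dma_mapping_error(&pdev->dev, dma)) {
                    iommu_disable_pasid_dma(&pdev->dev);
                    return -ENOMEM;
            }

            program_descriptor(pdev, dma, len, pasid); /* hypothetical */
            /* ... wait for device completion before tearing down (omitted) ... */

            dma_unmap_single(&pdev->dev, dma, len, DMA_TO_DEVICE);
            iommu_disable_pasid_dma(&pdev->dev);
            return 0;
    }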
There is no difference in dma_handle between without PASID and with PASID. To manage device IOTLB flush at PASID level, this patch also introduces a .pasid field to struct device. This also serves as a flag indicating whether PASID is being used for the device to perform in-kernel DMA. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 65 +++++++++++++++++++++++++++++++++++++++ include/linux/device.h | 1 + include/linux/dma-iommu.h | 7 +++++ include/linux/iommu.h | 4 +++ 4 files changed, 77 insertions(+) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 4c39daa72895..e602f5f516d3 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -175,6 +175,71 @@ void iommu_put_dma_cookie(struct iommu_domain *domain) } EXPORT_SYMBOL(iommu_put_dma_cookie); +/** + * iommu_enable_pasid_dma --Enable in-kernel DMA request with PASID + * @dev: Device to be enabled + * + * DMA request with PASID will be mapped the same way as the legacy DMA. + * If the device is in pass-through, PASID will also pass-through. If the + * device is in IOVA map, the supervisor PASID will point to the same IOVA + * page table. + * + * @return the kernel PASID to be used for DMA or INVALID_IOASID on failure + */ +ioasid_t iommu_enable_pasid_dma(struct device *dev) +{ + struct iommu_domain *dom; + + if (dev->pasid) { + dev_err(dev, "PASID DMA already enabled\n"); + return IOASID_DMA_PASID; + } + dom = iommu_get_domain_for_dev(dev); + /* + * Use the reserved kernel PASID for all devices. For now, + * there is no need to have different PASIDs for in-kernel use. + */ + if (!dom->ops->enable_pasid_dma || dom->ops->enable_pasid_dma(dev, IOASID_DMA_PASID)) + return INVALID_IOASID; + /* Used for device IOTLB flush */ + dev->pasid = IOASID_DMA_PASID; + + return IOASID_DMA_PASID; +} +EXPORT_SYMBOL(iommu_enable_pasid_dma); + +/** + * iommu_disable_pasid_dma --Disable in-kernel DMA request with PASID + * @dev: Device's PASID DMA to be disabled + * + * It is the device driver's responsibility to ensure no more incoming DMA + * requests with the kernel PASID before calling this function. IOMMU driver + * ensures PASID cache, IOTLBs related to the kernel PASID are cleared and + * drained. 
+ * + * @return 0 on success or error code on failure + */ +int iommu_disable_pasid_dma(struct device *dev) +{ + struct iommu_domain *dom; + int ret = 0; + + if (!dev->pasid) { + dev_err(dev, "PASID DMA not enabled\n"); + return -ENODEV; + } + dom = iommu_get_domain_for_dev(dev); + if (!dom->ops->disable_pasid_dma) + return -ENOTSUPP; + + ret = dom->ops->disable_pasid_dma(dev); + if (!ret) + dev->pasid = 0; + + return ret; +} +EXPORT_SYMBOL(iommu_disable_pasid_dma); + /** * iommu_dma_get_resv_regions - Reserved region driver helper * @dev: Device from iommu_get_resv_regions() diff --git a/include/linux/device.h b/include/linux/device.h index c6e2874b8d16..937c71dcff40 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1319,6 +1319,7 @@ struct device { void (*release)(struct device *dev); struct iommu_group *iommu_group; struct dev_iommu *iommu; + u32 pasid; /* For in-kernel DMA w/ PASID */ bool offline_disabled:1; bool offline:1; diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 24607dc3c2ac..298b31e3a007 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -18,6 +18,13 @@ int iommu_get_dma_cookie(struct iommu_domain *domain); int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base); void iommu_put_dma_cookie(struct iommu_domain *domain); +/* + * For devices that can do DMA request with PASID, setup a system PASID. + * Address modes (IOVA, PA) are selected by the platform code. + */ +ioasid_t iommu_enable_pasid_dma(struct device *dev); +int iommu_disable_pasid_dma(struct device *dev); + /* Setup call for arch DMA mapping code */ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit); int iommu_dma_init_fq(struct iommu_domain *domain); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c8b458f6e84b..cf86f7d8fd07 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -257,6 +257,8 @@ struct iommu_iotlb_gather { * - IOMMU_DOMAIN_IDENTITY: must use an identity domain * - IOMMU_DOMAIN_DMA: must use a dma domain * - 0: use the default setting + * @enable_pasid_dma: Set up PASID for in-kernel DMA + * @disable_pasid_dma: Disable in-kernel DMA with PASID on the device * @pgsize_bitmap: bitmap of all possible supported page sizes * @owner: Driver module providing these ops * @sva_suspend_pasid: stop activities related to a pasid but maintain the bond @@ -362,6 +364,8 @@ struct iommu_ops { unsigned long *bitmap, unsigned long base_iova, unsigned long bitmap_pgshift); + int (*enable_pasid_dma)(struct device *dev, u32 pasid); + int (*disable_pasid_dma)(struct device *dev); unsigned long pgsize_bitmap; struct module *owner; -- Gitee From eec5e15587866a146f6f23a616c1a062a877385c Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 22 Oct 2021 15:44:47 -0700 Subject: [PATCH 1863/2161] iommu/vt-d: Support PASID DMA for in-kernel usage commit 03e1764b813442df35884472b354b2d39150cf5a Intel-BKC. Between DMA requests with and without PASID (legacy), DMA mapping APIs are used indiscriminately on a device. Therefore, we should always match the addressing mode of the legacy DMA when enabling kernel PASID. This patch adds support for VT-d driver where the kernel PASID is programmed to match RIDPASID. i.e. if the device is in pass-through, the kernel PASID is also in pass-through; if the device is in IOVA mode, the kernel PASID will also be using the same IOVA space. There is additional handling for IOTLB and device TLB flush w.r.t. the kernel PASID. 
On VT-d, PASID-selective IOTLB flush is also on a per-domain basis; whereas device TLB flush is per device. Note that IOTLBs are used even when devices are in pass-through mode. ATS is enabled device-wide, but the device drivers can choose to manage ATS at per PASID level whenever control is available. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 102 +++++++++++++++++++++++++++++++++++- drivers/iommu/intel/pasid.c | 7 +++ include/linux/intel-iommu.h | 3 +- 3 files changed, 110 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index dcf3fe77ec14..ce4102f98b1c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1805,7 +1805,14 @@ static void domain_flush_piotlb(struct intel_iommu *iommu, if (domain->default_pasid) qi_flush_piotlb(iommu, did, domain->default_pasid, addr, npages, ih); - + if (domain->kernel_pasid && !domain_type_is_si(domain)) { + /* + * REVISIT: we only do PASID IOTLB inval for FL, we could have SL + * for PASID in the future such as vIOMMU PT. this doesn't get hit. + */ + qi_flush_piotlb(iommu, did, domain->kernel_pasid, + addr, npages, ih); + } if (!list_empty(&domain->devices)) qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih); } @@ -6032,6 +6039,68 @@ static int intel_iommu_set_hwdbm(struct iommu_domain *domain, bool enable, } out: + return ret; +} + +static int intel_enable_pasid_dma(struct device *dev, u32 pasid) +{ + struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); + struct device_domain_info *info; + unsigned long flags; + int ret = 0; + + info = get_domain_info(dev); + if (!info) + return -ENODEV; + + if (!dev_is_pci(dev) || !sm_supported(info->iommu)) + return -EINVAL; + + if (intel_iommu_enable_pasid(info->iommu, dev)) + return -ENODEV; + + spin_lock_irqsave(&device_domain_lock, flags); + spin_lock(&iommu->lock); + /* + * Store PASID for IOTLB flush, but only needed for non-passthrough + * unmap case. For passthrough, we only need to do IOTLB flush during + * PASID teardown. Flush covers all devices in the same domain as the + * domain ID is the same for the same SL. + */ + info->domain->kernel_pasid = pasid; + + /* + * Tracks how many attached devices are using the kernel PASID. Clear + * the domain kernel PASID when all users called disable_pasid_dma(). + */ + atomic_inc(&info->domain->kernel_pasid_user); + + /* + * Addressing modes (IOVA vs. PA) is a per device choice made by the + * platform code. We must treat legacy DMA (request w/o PASID) and + * DMA w/ PASID identially in terms of mapping. Here we just set up + * the kernel PASID to match the mapping of RID2PASID/PASID0. 
+ */ + if (hw_pass_through && domain_type_is_si(info->domain)) { + ret = intel_pasid_setup_pass_through(info->iommu, info->domain, + dev, pasid); + if (ret) + dev_err(dev, "Failed kernel PASID %d in BYPASS", pasid); + + } else if (domain_use_first_level(info->domain)) { + /* We are using FL for IOVA, this is the default option */ + ret = domain_setup_first_level(info->iommu, info->domain, dev, + pasid); + if (ret) + dev_err(dev, "Failed kernel PASID %d IOVA FL", pasid); + } else { + ret = intel_pasid_setup_second_level(info->iommu, info->domain, + dev, pasid); + if (ret) + dev_err(dev, "Failed kernel SPASID %d IOVA SL", pasid); + } + + spin_unlock(&iommu->lock); spin_unlock_irqrestore(&device_domain_lock, flags); return ret; @@ -6207,7 +6276,36 @@ static int intel_iommu_clear_dirty_log(struct iommu_domain *domain, ret = __domain_clear_dirty_log(dmar_domain, iova, size, bitmap, base_iova, bitmap_pgshift); + return ret; +} +static int intel_disable_pasid_dma(struct device *dev) +{ + struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); + struct device_domain_info *info; + unsigned long flags; + int ret = 0; + + info = get_domain_info(dev); + if (!info) + return -ENODEV; + + if (!dev_is_pci(dev) || !sm_supported(info->iommu)) + return -EINVAL; + + spin_lock_irqsave(&device_domain_lock, flags); + spin_lock(&iommu->lock); + + /* Tear down kernel PASID for this device */ + intel_pasid_tear_down_entry(info->iommu, info->dev, + info->domain->kernel_pasid, false, + false); + /* Clear the domain kernel PASID when there is no users */ + if (atomic_dec_and_test(&info->domain->kernel_pasid_user)) + info->domain->kernel_pasid = 0; + + spin_unlock(&iommu->lock); + spin_unlock_irqrestore(&device_domain_lock, flags); return ret; } @@ -6493,6 +6591,8 @@ const struct iommu_ops intel_iommu_ops = { .set_hwdbm = intel_iommu_set_hwdbm, .sync_dirty_log = intel_iommu_sync_dirty_log, .clear_dirty_log = intel_iommu_clear_dirty_log, + .enable_pasid_dma = intel_enable_pasid_dma, + .disable_pasid_dma = intel_disable_pasid_dma, }; static void quirk_iommu_igfx(struct pci_dev *dev) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 7d722a254c1f..5433d0328906 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -468,6 +468,13 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu, qi_flush_dev_iotlb(iommu, sid, pfsid, qdep, 0, 64 - VTD_PAGE_SHIFT); else qi_flush_dev_iotlb_pasid(iommu, sid, pfsid, pasid, qdep, 0, 64 - VTD_PAGE_SHIFT); + /* + * Flush the kernel PASID if used by the device. This is the case where + * a device driver uses IOVA via DMA map APIs for request with PASID. + */ + if (dev->pasid) + qi_flush_dev_iotlb_pasid(iommu, sid, pfsid, dev->pasid, qdep, 0, + 64 - VTD_PAGE_SHIFT); } static void diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 58a2f37ebc9b..3e1c59f2b3ce 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -569,7 +569,8 @@ struct dmar_domain { * The default pasid used for non-SVM * traffic on mediated devices. */ - + u32 kernel_pasid; /* for in-kernel DMA w/ PASID */ + atomic_t kernel_pasid_user; /* count of kernel_pasid users */ struct iommu_domain domain; /* generic domain data structure for iommu core */ }; -- Gitee From 546e294f9aae92ef7987b336ed27ab033b3040f4 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 22 Oct 2021 15:48:24 -0700 Subject: [PATCH 1864/2161] dmaengine: idxd: Use DMA API for in-kernel PASID DMA commit 4cc216bbb3fe83b1cded59fcd75f3c92ddf942de Intel-BKC. 
The current in-kernel supervisor PASID support is based on the SVM/SVA machinery in SVA lib. The binding between a kernel PASID and kernel mapping has many flaws. See discussions in the link below. This patch enables in-kernel DMA by switching from SVA lib to the standard DMA mapping APIs. Since both DMA requests with and without PASIDs are mapped identically, there is no change to how DMA APIs are used after the kernel PASID is enabled. Link: https://lore.kernel.org/linux-iommu/20210511194726.GP1002214@nvidia.com/ Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/idxd.h | 1 - drivers/dma/idxd/init.c | 53 +++++++++++++---------------------------- 2 files changed, 17 insertions(+), 37 deletions(-) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 4567ade5d334..ca1b1491fa90 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -289,7 +289,6 @@ struct idxd_device { struct idxd_wq **wqs; struct idxd_engine **engines; - struct iommu_sva *sva; unsigned int pasid; int num_groups; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 400253301c4a..d6931d56e838 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include "../dmaengine.h" @@ -486,36 +487,22 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_d static int idxd_enable_system_pasid(struct idxd_device *idxd) { - int flags; - unsigned int pasid; - struct iommu_sva *sva; + u32 pasid; + int ret; - flags = SVM_FLAG_SUPERVISOR_MODE; - - sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, &flags); - if (IS_ERR(sva)) { - dev_warn(&idxd->pdev->dev, - "iommu sva bind failed: %ld\n", PTR_ERR(sva)); - return PTR_ERR(sva); - } - - pasid = iommu_sva_get_pasid(sva); - if (pasid == IOMMU_PASID_INVALID) { - iommu_sva_unbind_device(sva); - return -ENODEV; + pasid = iommu_enable_pasid_dma(&idxd->pdev->dev); + if (pasid == INVALID_IOASID) { + dev_err(&idxd->pdev->dev, "No DMA PASID %d\n", ret); + return ret; } - - idxd->sva = sva; idxd->pasid = pasid; - dev_dbg(&idxd->pdev->dev, "system pasid: %u\n", pasid); + return 0; } static void idxd_disable_system_pasid(struct idxd_device *idxd) { - - iommu_sva_unbind_device(idxd->sva); - idxd->sva = NULL; + iommu_disable_pasid_dma(&idxd->pdev->dev); } static int idxd_probe(struct idxd_device *idxd) @@ -532,20 +519,14 @@ static int idxd_probe(struct idxd_device *idxd) dev_dbg(dev, "IDXD reset complete\n"); if (IS_ENABLED(CONFIG_INTEL_IDXD_SVM) && sva) { - rc = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA); - if (rc == 0) { - rc = idxd_enable_system_pasid(idxd); - if (rc < 0) { - iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA); - dev_warn(dev, "Failed to enable PASID. No SVA support: %d\n", rc); - } else { - set_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags); - } - } else { - dev_warn(dev, "Unable to turn on SVA feature.\n"); - } - } else if (!sva) { - dev_warn(dev, "User forced SVA off via module param.\n"); + rc = idxd_enable_system_pasid(idxd); + if (rc < 0) + dev_warn(dev, "Failed to enable PASID. 
No SVA support: %d\n", rc); + else + set_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags); + + } else { + dev_warn(dev, "Unable to turn on SVA feature.\n"); } idxd_read_caps(idxd); -- Gitee From 6e7f45a45d0a07bff7b1d8daf78e66697f48d381 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 26 Jan 2022 14:39:43 -0800 Subject: [PATCH 1865/2161] dmaengine/idxd: Enable SVA feature by default commit 4b829b56c2f4ed73f9c08282f9eabb67c8ddc7b9 Intel-BKC. User SVA is the essential feature of IDXD, enable it by default. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index d6931d56e838..f787f9197bd8 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -517,6 +517,9 @@ static int idxd_probe(struct idxd_device *idxd) return rc; dev_dbg(dev, "IDXD reset complete\n"); + rc = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA); + if (rc) + dev_warn(dev, "Unable to turn on SVA feature.\n"); if (IS_ENABLED(CONFIG_INTEL_IDXD_SVM) && sva) { rc = idxd_enable_system_pasid(idxd); -- Gitee From 8b65e7fb7fa2441d923e3b1383a8b46809dcb572 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 27 Jan 2022 19:11:14 -0800 Subject: [PATCH 1866/2161] iommu/vt-d: Always signal completion for PRQ drain commit aa5bf216c6f2e67026ed790e8a0628739a13d837 Intel-BKC. Paste change alleviates potential race condition between prq event thread and prq drain. Not for upstream, there will be a proper fix for PRQ drain. Signed-off-by: Lu Baolu Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index e9cd03477790..2c60c93daf91 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -1312,8 +1312,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) } } - if (!completion_done(&iommu->prq_complete)) - complete(&iommu->prq_complete); + complete(&iommu->prq_complete); return IRQ_RETVAL(handled); } -- Gitee From c33ccc5c76709bf27fdbf721b4e64e7fce57908d Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 27 Jan 2022 21:34:08 -0800 Subject: [PATCH 1867/2161] iommu/vt-d: Remove fake SM check commit e52a5546d2cdd3cab1383725d402676c07718ed8 Intel-BKC. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/cap_audit.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/iommu/intel/cap_audit.h b/drivers/iommu/intel/cap_audit.h index 738d07030cee..66c59d7e713c 100644 --- a/drivers/iommu/intel/cap_audit.h +++ b/drivers/iommu/intel/cap_audit.h @@ -123,7 +123,6 @@ bool intel_cap_slts_sanity(void); static inline bool scalable_mode_support(void) { - return true; return (intel_iommu_sm && intel_cap_smts_sanity()); } -- Gitee From bd5917640df3648da5aee616e5c7bc62e74050e0 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 7 Jul 2022 16:59:30 +0800 Subject: [PATCH 1868/2161] iommu/vt-d: Check model and stepping for PASID PT SRE support commit cdaf9980a697833a3bd136be6654b17af3a8ee64 Intel-BKC. Before SPR E stepping, PASID pass-through with supervisor request is not supported. Warn if used on the wrong CPUs. 
Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index ce4102f98b1c..2e04da17c624 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -6042,6 +6042,19 @@ static int intel_iommu_set_hwdbm(struct iommu_domain *domain, bool enable, return ret; } +/* Temporary check */ +static inline bool check_pasid_pt_sre(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_model == 0x8f && c->x86_stepping >= 4) { + pr_debug("SPR E0+, PASID PT SRE enabled"); + return true; + } + pr_alert("No PASID PT SRE, in-kernel PASID DMA not supported!!!"); + return false; +} + static int intel_enable_pasid_dma(struct device *dev, u32 pasid) { struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); @@ -6049,6 +6062,12 @@ static int intel_enable_pasid_dma(struct device *dev, u32 pasid) unsigned long flags; int ret = 0; + /* + * We don't bail here in that some drivers tie user SVM with + * kernel PASID support. + */ + check_pasid_pt_sre(); + info = get_domain_info(dev); if (!info) return -ENODEV; -- Gitee From a0cf9e9a4102e72e5b81b5ece7d8f1fa2399bdfc Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 10 Feb 2022 16:23:10 -0800 Subject: [PATCH 1869/2161] iommu: Validate domain before using it commit 37eaf100a730b08caad836679a1a826f92aba935 Intel-BKC. Check if a domain is valid before using it. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index e602f5f516d3..2fa12132cf26 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -195,6 +195,9 @@ ioasid_t iommu_enable_pasid_dma(struct device *dev) return IOASID_DMA_PASID; } dom = iommu_get_domain_for_dev(dev); + if (!dom || !dom->ops) + return INVALID_IOASID; + /* * Use the reserved kernel PASID for all devices. For now, * there is no need to have different PASIDs for in-kernel use. -- Gitee From 05dbb0e6d9db00768c19cacd22a411b31e5a64e2 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 11 Feb 2022 07:13:46 -0800 Subject: [PATCH 1870/2161] crypto: Fix breakage caused by SPR-BKC-PC-v3.13 commit 715d84bd20d4ae62e3ad502f8445cded45ccd2dc Intel-BKC. The patches added in SPR-BKC-PC-v3.13 broke iax_crypto, fix it. 
Signed-off-by: Tom Zanussi Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/crypto/iax/iax_crypto.h | 22 ++- drivers/crypto/iax/iax_crypto_main.c | 256 +++++++++++++++++++++++---- 2 files changed, 232 insertions(+), 46 deletions(-) diff --git a/drivers/crypto/iax/iax_crypto.h b/drivers/crypto/iax/iax_crypto.h index 30137c27aef5..fb6ebd7b18ac 100644 --- a/drivers/crypto/iax/iax_crypto.h +++ b/drivers/crypto/iax/iax_crypto.h @@ -52,16 +52,22 @@ struct iax_wq { /* Representation of IAX device with wqs, populated by probe */ struct iax_device { - struct list_head list; - struct idxd_device *idxd; + struct list_head list; + struct idxd_device *idxd; - int n_wq; - struct list_head wqs; + struct aecs_table_record *aecs_table; + dma_addr_t aecs_table_addr; - u64 comp_calls; - u64 comp_bytes; - u64 decomp_calls; - u64 decomp_bytes; + void *aecs_table_unaligned; + dma_addr_t aecs_table_addr_unaligned; + + int n_wq; + struct list_head wqs; + + u64 comp_calls; + u64 comp_bytes; + u64 decomp_calls; + u64 decomp_bytes; }; /* diff --git a/drivers/crypto/iax/iax_crypto_main.c b/drivers/crypto/iax/iax_crypto_main.c index f139a6b23981..e310bc59dd67 100644 --- a/drivers/crypto/iax/iax_crypto_main.c +++ b/drivers/crypto/iax/iax_crypto_main.c @@ -37,6 +37,31 @@ MODULE_PARM_DESC(iax_verify_compress, static LIST_HEAD(iax_devices); static DEFINE_SPINLOCK(iax_devices_lock); +static struct crypto_comp *deflate_generic_tfm; + +static bool iax_crypto_enabled = true; +static int iax_crypto_enable(const char *val, const struct kernel_param *kp) +{ + int ret = 0; + + if (val[0] == '0') + iax_crypto_enabled = true; /* for BKC, never allow disable */ + else if (val[0] == '1') + iax_crypto_enabled = true; + else + ret = -EINVAL; + + pr_info("%s: iax_crypto now %s\n", __func__, + iax_crypto_enabled ? "ENABLED" : "DISABLED"); + + return ret; +} +static const struct kernel_param_ops enable_ops = { + .set = iax_crypto_enable, +}; +module_param_cb(iax_crypto_enable, &enable_ops, &iax_crypto_enabled, 0644); +MODULE_PARM_DESC(iax_crypto_enable, "Enable (value = 1) or disable (value = 0) iax_crypto"); + int wq_stats_show(struct seq_file *m, void *v) { struct iax_device *iax_device; @@ -117,8 +142,6 @@ static unsigned int cpus_per_iax; /* Per-cpu lookup table for balanced wqs */ static struct idxd_wq * __percpu *wq_table; -static struct aecs_table_record aecs_table __aligned(IAX_AECS_ALIGN); - /* * Given a cpu, find the closest IAX instance. 
The idea is to try to * choose the most appropriate IAX instance for a caller and spread @@ -210,6 +233,45 @@ const u32 fixed_d_sym[30] = { 0x28018, 0x28019, 0x2801A, 0x2801B, 0x2801C, 0x2801D }; +static int iax_aecs_alloc(struct iax_device *iax_device) +{ + size_t size = sizeof(struct aecs_table_record) + IAX_AECS_ALIGN; + struct device *dev = &iax_device->idxd->pdev->dev; + u32 bfinal = 1; + u32 offset; + + iax_device->aecs_table_unaligned = dma_alloc_coherent(dev, size, + &iax_device->aecs_table_addr_unaligned, GFP_KERNEL); + if (!iax_device->aecs_table_unaligned) { + iax_device_free(iax_device); + return -ENOMEM; + } + iax_device->aecs_table = PTR_ALIGN(iax_device->aecs_table_unaligned, IAX_AECS_ALIGN); + iax_device->aecs_table_addr = ALIGN(iax_device->aecs_table_addr_unaligned, IAX_AECS_ALIGN); + + /* Configure aecs table using fixed Huffman table */ + iax_device->aecs_table->crc = 0; + iax_device->aecs_table->xor_checksum = 0; + offset = iax_device->aecs_table->num_output_accum_bits / 8; + iax_device->aecs_table->output_accum[offset] = DYNAMIC_HDR | bfinal; + iax_device->aecs_table->num_output_accum_bits = DYNAMIC_HDR_SIZE; + + /* Add Huffman table to aecs */ + memcpy(iax_device->aecs_table->ll_sym, fixed_ll_sym, sizeof(fixed_ll_sym)); + memcpy(iax_device->aecs_table->d_sym, fixed_d_sym, sizeof(fixed_d_sym)); + + return 0; +} + +static void iax_aecs_free(struct iax_device *iax_device) +{ + size_t size = sizeof(struct aecs_table_record) + IAX_AECS_ALIGN; + struct device *dev = &iax_device->idxd->pdev->dev; + + dma_free_coherent(dev, size, + iax_device->aecs_table_unaligned, iax_device->aecs_table_addr_unaligned); +} + static struct iax_device *add_iax_device(struct idxd_device *idxd) { struct iax_device *iax_device; @@ -220,6 +282,11 @@ static struct iax_device *add_iax_device(struct idxd_device *idxd) iax_device->idxd = idxd; + if (iax_aecs_alloc(iax_device) < 0) { + iax_device_free(iax_device); + return NULL; + } + list_add_tail(&iax_device->list, &iax_devices); nr_iax++; @@ -229,6 +296,8 @@ static struct iax_device *add_iax_device(struct idxd_device *idxd) static void del_iax_device(struct iax_device *iax_device) { + iax_aecs_free(iax_device); + list_del(&iax_device->list); iax_device_free(iax_device); @@ -322,6 +391,8 @@ static int save_iax_wq(struct idxd_wq *wq) BUG_ON(nr_iax == 0); cpus_per_iax = nr_cpus / nr_iax; + + idxd_wq_get(wq); out: spin_unlock(&iax_devices_lock); @@ -394,7 +465,6 @@ static struct idxd_wq *request_iax_wq(int iax) list_for_each_entry(iax_wq, &found_device->wqs, list) { /* Prefer unused wq but use if we can't find one */ if (idxd_wq_refcount(iax_wq->wq) > 0) { - idxd_wq_get(iax_wq->wq); bkup_wq = iax_wq->wq; cur_bkup = cur_wq; } else { @@ -407,11 +477,9 @@ static struct idxd_wq *request_iax_wq(int iax) if (bkup_wq) { pr_debug("%s: returning used wq %p (%d) from iax device %p (%d)\n", __func__, bkup_wq, cur_bkup, found_device, cur_iax); - idxd_wq_get(bkup_wq); found_wq = bkup_wq; goto out; } - out: spin_unlock(&iax_devices_lock); @@ -461,10 +529,13 @@ static int iax_compress(struct crypto_tfm *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen) { + dma_addr_t src_addr, dst_addr; struct idxd_desc *idxd_desc; struct iax_hw_desc *desc; + struct iax_wq *iax_wq; u32 compression_crc; struct idxd_wq *wq; + struct device *dev; int ret = 0; wq = *per_cpu_ptr(wq_table, smp_processor_id()); @@ -472,6 +543,9 @@ static int iax_compress(struct crypto_tfm *tfm, pr_err("%s: no wq configured for cpu=%d\n", __func__, smp_processor_id()); return -ENODEV; } + 
dev = &wq->idxd->pdev->dev; + + iax_wq = wq->private_data; pr_debug("%s: using wq for cpu=%d = wq %p\n", __func__, smp_processor_id(), wq); @@ -487,17 +561,36 @@ static int iax_compress(struct crypto_tfm *tfm, desc->flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_RD_SRC2_AECS | IDXD_OP_FLAG_CC; desc->opcode = IAX_OPCODE_COMPRESS; - desc->src2_addr = (u64)(u64 *)&aecs_table; - desc->src2_size = sizeof(struct aecs_table_record); desc->compr_flags = IAX_COMP_FLAGS; +#ifdef SPR_E0 desc->priv = 1; +#else + desc->priv = 0; +#endif + + src_addr = dma_map_single(dev, (void *)src, slen, DMA_TO_DEVICE); + pr_debug("%s: dma_map_single, src_addr %llx, dev %p, src %p, slen %d\n", __func__, src_addr, dev, src, slen); + if (unlikely(dma_mapping_error(dev, src_addr))) { + pr_debug("%s: dma_map_single err, exiting\n", __func__); + ret = -ENOMEM; + goto err_map_src; + } - desc->src1_addr = (u64)src; + dst_addr = dma_map_single(dev, (void *)dst, *dlen, DMA_FROM_DEVICE); + pr_debug("%s: dma_map_single, dst_addr %llx, dev %p, dst %p, *dlen %d\n", __func__, dst_addr, dev, dst, *dlen); + if (unlikely(dma_mapping_error(dev, dst_addr))) { + pr_debug("%s: dma_map_single err, exiting\n", __func__); + ret = -ENOMEM; + goto err_map_dst; + } + + desc->src1_addr = (u64)src_addr; desc->src1_size = slen; - desc->src2_addr = (u64)(u64 *)&aecs_table; - desc->dst_addr = (u64)dst; + desc->dst_addr = (u64)dst_addr; desc->max_dst_size = *dlen; - desc->completion_addr = (u64)idxd_desc->iax_completion; + desc->src2_addr = iax_wq->iax_device->aecs_table_addr; + desc->src2_size = sizeof(struct aecs_table_record); + desc->completion_addr = idxd_desc->compl_dma; ret = idxd_submit_desc(wq, idxd_desc); if (ret) { @@ -511,6 +604,9 @@ static int iax_compress(struct crypto_tfm *tfm, goto err; } + dma_unmap_single(dev, src_addr, slen, DMA_TO_DEVICE); + dma_unmap_single(dev, dst_addr, *dlen, DMA_FROM_DEVICE); + *dlen = idxd_desc->iax_completion->output_size; idxd_free_desc(wq, idxd_desc); @@ -541,13 +637,33 @@ static int iax_compress(struct crypto_tfm *tfm, desc->opcode = IAX_OPCODE_DECOMPRESS; desc->max_dst_size = PAGE_SIZE; desc->decompr_flags = IAX_DECOMP_FLAGS | IAX_DECOMP_SUPPRESS_OUTPUT; +#ifdef SPR_E0 desc->priv = 1; +#else + desc->priv = 0; +#endif + + src_addr = dma_map_single(dev, (void *)src, slen, DMA_TO_DEVICE); + pr_debug("%s: dma_map_single, src_addr %llx, dev %p, src %p, slen %d\n", __func__, src_addr, dev, src, slen); + if (unlikely(dma_mapping_error(dev, src_addr))) { + pr_debug("%s: dma_map_single err, exiting\n", __func__); + ret = -ENOMEM; + goto err_map_src; + } - desc->src1_addr = (u64)dst; + dst_addr = dma_map_single(dev, (void *)dst, *dlen, DMA_FROM_DEVICE); + pr_debug("%s: dma_map_single, dst_addr %llx, dev %p, dst %p, *dlen %d\n", __func__, dst_addr, dev, dst, *dlen); + if (unlikely(dma_mapping_error(dev, dst_addr))) { + pr_debug("%s: dma_map_single err, exiting\n", __func__); + ret = -ENOMEM; + goto err_map_dst; + } + + desc->src1_addr = (u64)dst_addr; desc->src1_size = *dlen; - desc->dst_addr = (u64)src; + desc->dst_addr = (u64)src_addr; desc->max_dst_size = slen; - desc->completion_addr = (u64)idxd_desc->iax_completion; + desc->completion_addr = idxd_desc->compl_dma; ret = idxd_submit_desc(wq, idxd_desc); if (ret) { @@ -569,10 +685,17 @@ static int iax_compress(struct crypto_tfm *tfm, goto err; } + dma_unmap_single(dev, src_addr, slen, DMA_TO_DEVICE); + dma_unmap_single(dev, dst_addr, *dlen, DMA_FROM_DEVICE); + idxd_free_desc(wq, idxd_desc); out: return ret; err: + dma_unmap_single(dev, dst_addr, 
*dlen, DMA_FROM_DEVICE); +err_map_dst: + dma_unmap_single(dev, src_addr, slen, DMA_TO_DEVICE); +err_map_src: idxd_free_desc(wq, idxd_desc); pr_warn("iax compress failed: ret=%d\n", ret); @@ -583,9 +706,11 @@ static int iax_decompress(struct crypto_tfm *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen) { + dma_addr_t src_addr, dst_addr; struct idxd_desc *idxd_desc; struct iax_hw_desc *desc; struct idxd_wq *wq; + struct device *dev; int ret = 0; wq = *per_cpu_ptr(wq_table, smp_processor_id()); @@ -593,6 +718,7 @@ static int iax_decompress(struct crypto_tfm *tfm, pr_err("%s: no wq configured for cpu=%d\n", __func__, smp_processor_id()); return -ENODEV; } + dev = &wq->idxd->pdev->dev; pr_debug("%s: using wq for cpu=%d = wq %p\n", __func__, smp_processor_id(), wq); @@ -609,13 +735,33 @@ static int iax_decompress(struct crypto_tfm *tfm, desc->opcode = IAX_OPCODE_DECOMPRESS; desc->max_dst_size = PAGE_SIZE; desc->decompr_flags = IAX_DECOMP_FLAGS; +#ifdef SPR_E0 desc->priv = 1; +#else + desc->priv = 0; +#endif + + src_addr = dma_map_single(dev, (void *)src, slen, DMA_TO_DEVICE); + pr_debug("%s: dma_map_single, src_addr %llx, dev %p, src %p, slen %d\n", __func__, src_addr, dev, src, slen); + if (unlikely(dma_mapping_error(dev, src_addr))) { + pr_debug("%s: dma_map_single err, exiting\n", __func__); + ret = -ENOMEM; + goto err_map_src; + } + + dst_addr = dma_map_single(dev, (void *)dst, *dlen, DMA_FROM_DEVICE); + pr_debug("%s: dma_map_single, dst_addr %llx, dev %p, dst %p, *dlen %d\n", __func__, dst_addr, dev, dst, *dlen); + if (unlikely(dma_mapping_error(dev, dst_addr))) { + pr_debug("%s: dma_map_single err, exiting\n", __func__); + ret = -ENOMEM; + goto err_map_dst; + } - desc->src1_addr = (u64)src; - desc->dst_addr = (u64)dst; + desc->src1_addr = (u64)src_addr; + desc->dst_addr = (u64)dst_addr; desc->max_dst_size = *dlen; desc->src1_size = slen; - desc->completion_addr = (u64)idxd_desc->iax_completion; + desc->completion_addr = idxd_desc->compl_dma; ret = idxd_submit_desc(wq, idxd_desc); if (ret) { @@ -629,6 +775,9 @@ static int iax_decompress(struct crypto_tfm *tfm, goto err; } + dma_unmap_single(dev, src_addr, slen, DMA_TO_DEVICE); + dma_unmap_single(dev, dst_addr, *dlen, DMA_FROM_DEVICE); + *dlen = idxd_desc->iax_completion->output_size; idxd_free_desc(wq, idxd_desc); @@ -641,6 +790,10 @@ static int iax_decompress(struct crypto_tfm *tfm, out: return ret; err: + dma_unmap_single(dev, dst_addr, *dlen, DMA_FROM_DEVICE); +err_map_dst: + dma_unmap_single(dev, src_addr, slen, DMA_TO_DEVICE); +err_map_src: idxd_free_desc(wq, idxd_desc); pr_warn("iax decompress failed: ret=%d\n", ret); @@ -652,7 +805,14 @@ static int iax_comp_compress(struct crypto_tfm *tfm, u8 *dst, unsigned int *dlen) { u64 start_time_ns; - int ret; + int ret = 0; + + if (!iax_crypto_enabled) { + pr_debug("%s: iax_crypto disabled, using deflate-generic compression\n", __func__); + ret = crypto_comp_compress(deflate_generic_tfm, + src, slen, dst, dlen); + return ret; + } pr_debug("%s: src %p, slen %d, dst %p, dlen %u\n", __func__, src, slen, dst, *dlen); @@ -671,7 +831,14 @@ static int iax_comp_decompress(struct crypto_tfm *tfm, u8 *dst, unsigned int *dlen) { u64 start_time_ns; - int ret; + int ret = 0; + + if (!iax_crypto_enabled) { + pr_debug("%s: iax_crypto disabled, using deflate-generic decompression\n", __func__); + ret = crypto_comp_decompress(deflate_generic_tfm, + src, slen, dst, dlen); + return ret; + } pr_debug("%s: src %p, slen %d, dst %p, dlen %u\n", __func__, src, slen, dst, *dlen); @@ -704,11 +871,21 
@@ static int iax_comp_acompress(struct acomp_req *req) struct crypto_tfm *tfm = req->base.tfm; u64 start_time_ns; void *src, *dst; - int ret; + int ret = 0; src = kmap_atomic(sg_page(req->src)) + req->src->offset; dst = kmap_atomic(sg_page(req->dst)) + req->dst->offset; + if (!iax_crypto_enabled) { + pr_debug("%s: iax_crypto disabled, using deflate-generic compression\n", __func__); + ret = crypto_comp_compress(deflate_generic_tfm, + src, req->slen, dst, &req->dlen); + kunmap_atomic(src); + kunmap_atomic(dst); + + return ret; + } + pr_debug("%s: src %p (offset %d), slen %d, dst %p (offset %d), dlen %u\n", __func__, src, req->src->offset, req->slen, dst, req->dst->offset, req->dlen); @@ -736,6 +913,15 @@ static int iax_comp_adecompress(struct acomp_req *req) src = kmap_atomic(sg_page(req->src)) + req->src->offset; dst = kmap_atomic(sg_page(req->dst)) + req->dst->offset; + if (!iax_crypto_enabled) { + pr_debug("%s: iax_crypto disabled, using deflate-generic decompression\n", __func__); + ret = crypto_comp_decompress(deflate_generic_tfm, + src, req->slen, dst, &req->dlen); + kunmap_atomic(src); + kunmap_atomic(dst); + return ret; + } + pr_debug("%s: src %p (offset %d), slen %d, dst %p (offset %d), dlen %u\n", __func__, src, req->src->offset, req->slen, dst, req->dst->offset, req->dlen); @@ -794,23 +980,6 @@ static void iax_unregister_compression_device(void) crypto_unregister_acomp(&iax_acomp_deflate); } -static void iax_set_aecs(void) -{ - u32 offset; - u32 bfinal = 1; - - /* Configure aecs table using fixed Huffman table */ - aecs_table.crc = 0; - aecs_table.xor_checksum = 0; - offset = aecs_table.num_output_accum_bits / 8; - aecs_table.output_accum[offset] = DYNAMIC_HDR | bfinal; - aecs_table.num_output_accum_bits = DYNAMIC_HDR_SIZE; - - /* Add Huffman table to aecs */ - memcpy(aecs_table.ll_sym, fixed_ll_sym, sizeof(fixed_ll_sym)); - memcpy(aecs_table.d_sym, fixed_d_sym, sizeof(fixed_d_sym)); -} - static void rebalance_wq_table(void) { int node, cpu, iax; @@ -889,6 +1058,7 @@ static int iax_crypto_probe(struct idxd_dev *idxd_dev) err_save: __idxd_wq_quiesce(wq); + percpu_ref_exit(&wq->wq_active); err_ref: idxd_wq_free_resources(wq); err_alloc: @@ -910,6 +1080,7 @@ static void iax_crypto_remove(struct idxd_dev *idxd_dev) __drv_disable_wq(wq); idxd_wq_free_resources(wq); wq->type = IDXD_WQT_NONE; + percpu_ref_exit(&wq->wq_active); rebalance_wq_table(); mutex_unlock(&wq->wq_lock); @@ -934,7 +1105,14 @@ static int __init iax_crypto_init_module(void) nr_cpus = num_online_cpus(); nr_nodes = num_online_nodes(); - iax_set_aecs(); + if (crypto_has_comp("deflate-generic", 0, 0)) + deflate_generic_tfm = crypto_alloc_comp("deflate-generic", 0, 0); + + if (IS_ERR_OR_NULL(deflate_generic_tfm)) { + pr_err("IAX could not alloc %s tfm: errcode = %ld\n", + "deflate-generic", PTR_ERR(deflate_generic_tfm)); + return -ENOMEM; + } wq_table = alloc_percpu(struct idxd_wq *); if (!wq_table) @@ -963,6 +1141,7 @@ static int __init iax_crypto_init_module(void) err_crypto_register: idxd_driver_unregister(&iax_crypto_driver); err_driver_register: + crypto_free_comp(deflate_generic_tfm); free_percpu(wq_table); goto out; @@ -975,6 +1154,7 @@ static void __exit iax_crypto_cleanup_module(void) iax_unregister_compression_device(); free_percpu(wq_table); free_iax_devices(); + crypto_free_comp(deflate_generic_tfm); pr_info("%s: cleaned up\n", __func__); } -- Gitee From 765176e34d4b9e4cf46248d271281d36438196d0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 15 Feb 2022 10:01:35 -0700 Subject: [PATCH 1871/2161] 
dmaengine: idxd: fix return value for idxd_enable_system_pasid() commit e75e0042dc577ec58b3d4e81415f1b6647daed7d Intel-BKC. The return value ret is returned without being initialized to an error value. This caused the failing function to return success no matter what. Remove return value and return error directly. Fixes: 253a70def285 ("dmaengine: idxd: Use DMA API for in-kernel PASID DMA") Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/init.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index f787f9197bd8..771bc18a8419 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -488,12 +488,11 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_d static int idxd_enable_system_pasid(struct idxd_device *idxd) { u32 pasid; - int ret; pasid = iommu_enable_pasid_dma(&idxd->pdev->dev); if (pasid == INVALID_IOASID) { - dev_err(&idxd->pdev->dev, "No DMA PASID %d\n", ret); - return ret; + dev_err(&idxd->pdev->dev, "No DMA PASID.\n"); + return -ENXIO; } idxd->pasid = pasid; -- Gitee From ccd136b85a8d19a2df3f101c33ef7d625f08a6c5 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 15 Feb 2022 14:13:22 -0700 Subject: [PATCH 1872/2161] ioasid: Revert range adjustment commit 819732383b78b99fb44191874c6606881adc54bf Intel-BKC. Reserving IOASID 0 and 1 for RID2PASID and kernel DMA PASID moves allocation base to 2 (IOASID_ALLOC_BASE). However, callers still expect to use ioasid refcount for reserved PASID such as the default RID2PASID for each domain. Revert this range adjustment such that ioasid refcount still works as is. Future fix should be avoid allocating reserved PASIDs. Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 572ce327c59c..5fdbb2619cca 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -803,8 +803,6 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, data->private = private; refcount_set(&data->refs, 1); - if (min < IOASID_ALLOC_BASE) - min = IOASID_ALLOC_BASE; /* * Custom allocator needs allocator data to perform platform specific * operations. -- Gitee From 996d1fdcf14af721ac84d6c9360b39f804daeeb4 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 16 Feb 2022 22:22:21 -0800 Subject: [PATCH 1873/2161] iommu/vtd: Fix vIOMMU GIOVA by avoiding reserved IOASIDs commit 7125eae9f6d964b2bee713f056f70402db7ad57f Intel-BKC. Globally reserved PASIDs are used for in-kernel DMA request. This method works on the host and conserve PASID resources. However, under guest environment all guest PASIDs should be allocated via the virtual command interface. Otherwise, the reserved PASID's in the guest has no host PASID's backing. This patch forces all DMA API PASIDs to be allocated via IOASID allocator. Additional PASIDs are consumed in that each device with PASID DMA will allocate its own PASID. 
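A minimal sketch of an in-kernel caller under the new contract (each device doing PASID DMA allocates its own PASID from the IOASID allocator) is shown below. Only iommu_enable_pasid_dma() and iommu_disable_pasid_dma() come from this patch; the example_* wrappers are hypothetical and exist purely for illustration.

	#include <linux/iommu.h>
	#include <linux/ioasid.h>

	/* Hypothetical consumer of the per-device kernel DMA PASID. */
	static int example_dev_init(struct device *dev)
	{
		ioasid_t pasid;

		pasid = iommu_enable_pasid_dma(dev);
		if (pasid == INVALID_IOASID)
			return -ENODEV;

		/* dev->pasid now carries the allocated PASID used for IOTLB flushes. */
		dev_info(dev, "kernel DMA PASID %u\n", pasid);
		return 0;
	}

	/* Hypothetical teardown: releases the PASID back to host_pasid_set. */
	static void example_dev_exit(struct device *dev)
	{
		iommu_disable_pasid_dma(dev);
	}
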
Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/dma-iommu.c | 16 +++++++++++----- include/linux/ioasid.h | 4 ++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 2fa12132cf26..4221422f8b14 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -175,6 +175,7 @@ void iommu_put_dma_cookie(struct iommu_domain *domain) } EXPORT_SYMBOL(iommu_put_dma_cookie); +#define PCI_PASID_MAX 0x100000 /* TODO: Use per dev limits */ /** * iommu_enable_pasid_dma --Enable in-kernel DMA request with PASID * @dev: Device to be enabled @@ -189,25 +190,28 @@ EXPORT_SYMBOL(iommu_put_dma_cookie); ioasid_t iommu_enable_pasid_dma(struct device *dev) { struct iommu_domain *dom; + u32 pasid; if (dev->pasid) { dev_err(dev, "PASID DMA already enabled\n"); - return IOASID_DMA_PASID; + return dev->pasid; } dom = iommu_get_domain_for_dev(dev); if (!dom || !dom->ops) return INVALID_IOASID; + pasid = ioasid_alloc(host_pasid_set, IOASID_ALLOC_BASE, PCI_PASID_MAX, dev); + dev_alert(dev, "%s: PASID %u\n", __func__, pasid); /* * Use the reserved kernel PASID for all devices. For now, * there is no need to have different PASIDs for in-kernel use. */ - if (!dom->ops->enable_pasid_dma || dom->ops->enable_pasid_dma(dev, IOASID_DMA_PASID)) + if (!dom->ops->enable_pasid_dma || dom->ops->enable_pasid_dma(dev, pasid)) return INVALID_IOASID; /* Used for device IOTLB flush */ - dev->pasid = IOASID_DMA_PASID; + dev->pasid = pasid; - return IOASID_DMA_PASID; + return pasid; } EXPORT_SYMBOL(iommu_enable_pasid_dma); @@ -235,9 +239,11 @@ int iommu_disable_pasid_dma(struct device *dev) if (!dom->ops->disable_pasid_dma) return -ENOTSUPP; + dev_alert(dev, "%s: PASID %u\n", __func__, dev->pasid); ret = dom->ops->disable_pasid_dma(dev); if (!ret) - dev->pasid = 0; + ioasid_free(host_pasid_set, dev->pasid); + dev->pasid = 0; return ret; } diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index aa349873825f..02be9292a74a 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -10,8 +10,7 @@ #define INVALID_IOASID ((ioasid_t)-1) #define IOASID_DMA_NO_PASID 0 /* For DMA request w/o PASID */ -#define IOASID_DMA_PASID 1 /* For in-kernel DMA w/ PASID */ -#define IOASID_ALLOC_BASE 2 /* Start of the allocation */ +#define IOASID_ALLOC_BASE 1 /* Start of the allocation */ typedef unsigned int ioasid_t; typedef ioasid_t (*ioasid_alloc_fn_t)(ioasid_t min, ioasid_t max, void *data); @@ -165,6 +164,7 @@ extern struct ioasid_user *ioasid_user_get_from_task(struct task_struct *task); extern void ioasid_user_put(struct ioasid_user *iuser); extern void ioasid_user_for_each_id(struct ioasid_user *iuser, void *data, void (*fn)(ioasid_t id, void *data)); +extern struct ioasid_set *host_pasid_set; #else /* CONFIG_IOASID_USER */ static inline struct ioasid_user * -- Gitee From 8a6ad8c03f47e15a0c05f571cb2bddb7b3e38447 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 10 Feb 2022 16:45:51 -0700 Subject: [PATCH 1874/2161] dmaengine: idxd: separate user and kernel pasid allocation commit 986267eb328124be087a58fb85ddd797f08b58e6 Intel-BKC. The idxd driver always gated the pasid enabling under a single knob and this assumption is incorrect. The pasid used for kernel operation can be independently toggled and has no dependency on the user pasid (and vice versa). Split the two so they are independent "enabled" flags. 
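The sketch below illustrates how code is expected to consult the two capabilities independently after this split. device_pasid_enabled() and device_user_pasid_enabled() are the helpers from the idxd.h hunk below; the reporting function itself is hypothetical.

	#include "idxd.h"

	/* Hypothetical helper: report which PASID modes survived probe. */
	static void example_report_pasid_modes(struct idxd_device *idxd)
	{
		struct device *dev = &idxd->pdev->dev;

		if (device_pasid_enabled(idxd))
			dev_dbg(dev, "kernel (system) PASID %u enabled\n", idxd->pasid);

		if (device_user_pasid_enabled(idxd))
			dev_dbg(dev, "user PASID (SVA) enabled\n");
	}
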
Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/cdev.c | 4 ++-- drivers/dma/idxd/idxd.h | 6 ++++++ drivers/dma/idxd/init.c | 17 +++++++++++------ drivers/vfio/mdev/idxd/vdev.c | 4 ++-- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 1f6a456149b8..b54c953f5c1b 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -99,7 +99,7 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) ctx->wq = wq; filp->private_data = ctx; - if (device_pasid_enabled(idxd)) { + if (device_user_pasid_enabled(idxd)) { sva = iommu_sva_bind_device(dev, current->mm, NULL); if (IS_ERR(sva)) { rc = PTR_ERR(sva); @@ -152,7 +152,7 @@ static int idxd_cdev_release(struct inode *node, struct file *filep) if (wq_shared(wq)) { idxd_device_drain_pasid(idxd, ctx->pasid); } else { - if (device_pasid_enabled(idxd)) { + if (device_user_pasid_enabled(idxd)) { /* The wq disable in the disable pasid function will drain the wq */ rc = idxd_wq_disable_pasid(wq); if (rc < 0) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index ca1b1491fa90..fd03f44dc5b8 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -251,6 +251,7 @@ enum idxd_device_flag { IDXD_FLAG_CONFIGURABLE = 0, IDXD_FLAG_CMD_RUNNING, IDXD_FLAG_PASID_ENABLED, + IDXD_FLAG_USER_PASID_ENABLED, IDXD_FLAG_IMS_SUPPORTED, }; @@ -496,6 +497,11 @@ static inline bool device_pasid_enabled(struct idxd_device *idxd) return test_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags); } +static inline bool device_user_pasid_enabled(struct idxd_device *idxd) +{ + return test_bit(IDXD_FLAG_USER_PASID_ENABLED, &idxd->flags); +} + static inline bool device_swq_supported(struct idxd_device *idxd) { return (support_enqcmd && device_pasid_enabled(idxd)); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 771bc18a8419..82f6e272ae61 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -516,11 +516,13 @@ static int idxd_probe(struct idxd_device *idxd) return rc; dev_dbg(dev, "IDXD reset complete\n"); - rc = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA); - if (rc) - dev_warn(dev, "Unable to turn on SVA feature.\n"); if (IS_ENABLED(CONFIG_INTEL_IDXD_SVM) && sva) { + rc = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA); + if (rc) + dev_warn(dev, "Failed to enable user PASID.\n"); + else + set_bit(IDXD_FLAG_USER_PASID_ENABLED, &idxd->flags); rc = idxd_enable_system_pasid(idxd); if (rc < 0) dev_warn(dev, "Failed to enable PASID. 
No SVA support: %d\n", rc); @@ -564,7 +566,8 @@ static int idxd_probe(struct idxd_device *idxd) err: if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); - iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA); + if (device_user_pasid_enabled(idxd)) + iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA); return rc; } @@ -577,7 +580,8 @@ static void idxd_cleanup(struct idxd_device *idxd) idxd_cleanup_internals(idxd); if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); - iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA); + if (device_user_pasid_enabled(idxd)) + iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA); } static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) @@ -694,7 +698,8 @@ static void idxd_remove(struct pci_dev *pdev) free_irq(irq_entry->vector, irq_entry); pci_free_irq_vectors(pdev); pci_iounmap(pdev, idxd->reg_base); - iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA); + if (device_user_pasid_enabled(idxd)) + iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA); pci_disable_device(pdev); destroy_workqueue(idxd->wq); perfmon_pmu_remove(idxd); diff --git a/drivers/vfio/mdev/idxd/vdev.c b/drivers/vfio/mdev/idxd/vdev.c index 82eb4ac1aca8..534d90ad2aef 100644 --- a/drivers/vfio/mdev/idxd/vdev.c +++ b/drivers/vfio/mdev/idxd/vdev.c @@ -691,7 +691,7 @@ static void vidxd_mmio_init_wqcfg(struct vdcm_idxd *vidxd) if (wq_dedicated(wq)) wqcfg->mode = WQCFG_MODE_DEDICATED; - else if (device_pasid_enabled(idxd)) + else if (device_user_pasid_enabled(idxd)) wqcfg->pasid_en = 1; wqcfg->bof = wq->wqcfg->bof; @@ -722,7 +722,7 @@ static void vidxd_mmio_init_gencap(struct vdcm_idxd *vidxd) gencap->config_en = 0; gencap->max_ims_mult = 0; gencap->cmd_cap = 1; - if (device_pasid_enabled(idxd)) + if (device_user_pasid_enabled(idxd)) gencap->block_on_fault = 1; } -- Gitee From e611f7e909e7b8a8a25273d4e06e9eb2fec4a969 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 25 Feb 2022 14:22:05 -0700 Subject: [PATCH 1875/2161] dmaengine: idxd: don't load pasid config until needed commit 3157dd0a366183adaea2f4d8721961637d562fee upstream. The driver currently programs the system pasid to the WQ preemptively when system pasid is enabled. Given that a dwq will reprogram the pasid and pasid_en bit, the programming is not necessary. Remove system pasid programming on device config write. Add pasid programming for kernel wq type on wq driver enable. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 22 +++++++++++++++++----- include/uapi/linux/idxd.h | 1 + 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 25f8c9f79200..a10ae8429306 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -924,11 +924,15 @@ static int idxd_wq_config_write(struct idxd_wq *wq) if (wq_dedicated(wq)) wq->wqcfg->mode = 1; - if (device_pasid_enabled(idxd)) { + /* + * Enable this for shared WQ. swq does not need to program the pasid field, + * but pasid_en needs to be set. Programming here prevents swq being disabled + * and re-enabled, which is something to avoid. + */ + if ((is_idxd_wq_kernel(wq) && device_pasid_enabled(idxd)) || + ((is_idxd_wq_user(wq) || is_idxd_wq_mdev(wq)) && + device_user_pasid_enabled(idxd))) wq->wqcfg->pasid_en = 1; - if (wq->type == IDXD_WQT_KERNEL && wq_dedicated(wq)) - wq->wqcfg->pasid = idxd->pasid; - } /* * Here the priv bit is set depending on the WQ type. 
priv = 1 if the @@ -1129,7 +1133,6 @@ static int idxd_wq_load_config(struct idxd_wq *wq) if (device_pasid_enabled(idxd) && wq->wqcfg->mode == 1) { wq->wqcfg->pasid_en = 1; - wq->wqcfg->pasid = idxd->pasid; wqcfg_offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PASID_IDX); iowrite32(wq->wqcfg->bits[WQCFG_PASID_IDX], idxd->reg_base + wqcfg_offset); } @@ -1384,6 +1387,15 @@ int __drv_enable_wq(struct idxd_wq *wq) } } + if (device_pasid_enabled(idxd) && is_idxd_wq_kernel(wq) && wq_dedicated(wq)) { + rc = idxd_wq_set_pasid(wq, idxd->pasid); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_WQ_PASID_ERR; + dev_dbg(dev, "wq set PASID failed.\n"); + goto err; + } + } + rc = 0; spin_lock(&idxd->dev_lock); if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 19e4e532918f..cc311a917b9f 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -29,6 +29,7 @@ enum idxd_scmd_stat { IDXD_SCMD_WQ_NO_SIZE = 0x800e0000, IDXD_SCMD_WQ_NO_PRIV = 0x800f0000, IDXD_SCMD_WQ_IRQ_ERR = 0x80100000, + IDXD_SCMD_WQ_PASID_ERR = 0x80200000, }; #define IDXD_SCMD_SOFTERR_MASK 0x80000000 -- Gitee From 3cc2dab6fa7221e4518d8f7aae1daea9816263ad Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 28 Feb 2022 16:44:02 -0700 Subject: [PATCH 1876/2161] vfio: mdev: idxd: fix setup pasid permission for IMS in guest scalable mode commit 36820eed1ae19777d268b452d79daed1fae55764 Intel-BKC. Change where pasid is setup for DWQ WQCFG. Have the driver only set it when WQ is being enabled. Also remove setting of IMS pasid entry in vidxd_mmio_write() to virtual MSIXPERM register. Add the IMS pasid configuration in set_signal() for both with vIOMMU and without. This is where it belongs anyhow. Also moved wq_enable_irq() to after device disable to reduce unnecessary extra code running when the wq isn't ready. 
Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 47 ++++++++++++++++++++++++++--------- drivers/dma/idxd/dma.c | 20 +++++++-------- drivers/vfio/mdev/idxd/mdev.c | 44 ++++++++++++++++++++++++++------ drivers/vfio/mdev/idxd/vdev.c | 44 +++++++++++--------------------- 4 files changed, 96 insertions(+), 59 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index a10ae8429306..45142d8ce78d 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -374,16 +374,25 @@ int idxd_wq_abort(struct idxd_wq *wq, u32 *status) } EXPORT_SYMBOL_GPL(idxd_wq_abort); -int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid) +static void __idxd_wq_set_priv_locked(struct idxd_wq *wq) { struct idxd_device *idxd = wq->idxd; - int rc; union wqcfg wqcfg; unsigned int offset; - rc = idxd_wq_disable(wq, false, NULL); - if (rc < 0) - return rc; + offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PRIV_IDX); + spin_lock(&idxd->dev_lock); + wqcfg.bits[WQCFG_PRIV_IDX] = ioread32(idxd->reg_base + offset); + wqcfg.priv = 1; + iowrite32(wqcfg.bits[WQCFG_PRIV_IDX], idxd->reg_base + offset); + spin_unlock(&idxd->dev_lock); +} + +static void __idxd_wq_set_pasid_locked(struct idxd_wq *wq, int pasid) +{ + struct idxd_device *idxd = wq->idxd; + union wqcfg wqcfg; + unsigned int offset; offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PASID_IDX); spin_lock(&idxd->dev_lock); @@ -392,6 +401,17 @@ int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid) wqcfg.pasid = pasid; iowrite32(wqcfg.bits[WQCFG_PASID_IDX], idxd->reg_base + offset); spin_unlock(&idxd->dev_lock); +} + +int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid) +{ + int rc; + + rc = idxd_wq_disable(wq, false, NULL); + if (rc < 0) + return rc; + + __idxd_wq_set_pasid_locked(wq, pasid); rc = idxd_wq_enable(wq, NULL); if (rc < 0) @@ -1263,9 +1283,11 @@ static void idxd_device_set_perm_entry(struct idxd_device *idxd, { union msix_perm mperm; + dev_dbg(&idxd->pdev->dev, "set MSIX_PERM entry for idx %d\n", ie->id); if (ie->pasid == INVALID_IOASID) return; + dev_dbg(&idxd->pdev->dev, "pasid %u for MSIX_PERM\n", idxd->pasid); mperm.bits = 0; mperm.pasid = ie->pasid; mperm.pasid_en = 1; @@ -1275,6 +1297,7 @@ static void idxd_device_set_perm_entry(struct idxd_device *idxd, static void idxd_device_clear_perm_entry(struct idxd_device *idxd, struct idxd_irq_entry *ie) { + dev_dbg(&idxd->pdev->dev, "clear MSIX_PERM entry for idx %d\n", ie->id); iowrite32(0, idxd->reg_base + idxd->msix_perm_offset + ie->id * 8); } @@ -1302,6 +1325,8 @@ int idxd_wq_request_irq(struct idxd_wq *wq) struct idxd_irq_entry *ie; int rc; + dev_dbg(dev, "wq %d enabling ie %d irq\n", wq->id, ie->id); + ie = &wq->ie; ie->vector = pci_irq_vector(pdev, ie->id); ie->pasid = device_pasid_enabled(idxd) ? 
idxd->pasid : INVALID_IOASID; @@ -1387,13 +1412,11 @@ int __drv_enable_wq(struct idxd_wq *wq) } } - if (device_pasid_enabled(idxd) && is_idxd_wq_kernel(wq) && wq_dedicated(wq)) { - rc = idxd_wq_set_pasid(wq, idxd->pasid); - if (rc < 0) { - idxd->cmd_status = IDXD_SCMD_WQ_PASID_ERR; - dev_dbg(dev, "wq set PASID failed.\n"); - goto err; - } + + if (is_idxd_wq_kernel(wq)) { + if (device_pasid_enabled(idxd) && wq_dedicated(wq)) + __idxd_wq_set_pasid_locked(wq, idxd->pasid); + __idxd_wq_set_priv_locked(wq); } rc = 0; diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index bfff59617d04..67a2b9ed09f0 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -290,13 +290,6 @@ static int idxd_dmaengine_drv_probe(struct idxd_dev *idxd_dev) mutex_lock(&wq->wq_lock); wq->type = IDXD_WQT_KERNEL; - rc = idxd_wq_request_irq(wq); - if (rc < 0) { - idxd->cmd_status = IDXD_SCMD_WQ_IRQ_ERR; - dev_dbg(dev, "WQ %d irq setup failed: %d\n", wq->id, rc); - goto err_irq; - } - rc = __drv_enable_wq(wq); if (rc < 0) { dev_dbg(dev, "Enable wq %d failed: %d\n", wq->id, rc); @@ -304,6 +297,13 @@ static int idxd_dmaengine_drv_probe(struct idxd_dev *idxd_dev) goto err; } + rc = idxd_wq_request_irq(wq); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_WQ_IRQ_ERR; + dev_dbg(dev, "WQ %d irq setup failed: %d\n", wq->id, rc); + goto err_irq; + } + rc = idxd_wq_alloc_resources(wq); if (rc < 0) { idxd->cmd_status = IDXD_SCMD_WQ_RES_ALLOC_ERR; @@ -335,10 +335,10 @@ static int idxd_dmaengine_drv_probe(struct idxd_dev *idxd_dev) err_ref: idxd_wq_free_resources(wq); err_res_alloc: - __drv_disable_wq(wq); -err: idxd_wq_free_irq(wq); err_irq: + __drv_disable_wq(wq); +err: wq->type = IDXD_WQT_NONE; mutex_unlock(&wq->wq_lock); return rc; @@ -352,9 +352,9 @@ static void idxd_dmaengine_drv_remove(struct idxd_dev *idxd_dev) __idxd_wq_quiesce(wq); idxd_unregister_dma_channel(wq); idxd_wq_free_resources(wq); + idxd_wq_free_irq(wq); __drv_disable_wq(wq); percpu_ref_exit(&wq->wq_active); - idxd_wq_free_irq(wq); wq->type = IDXD_WQT_NONE; mutex_unlock(&wq->wq_lock); } diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c index 7ff1c87cecae..39ffca5c4a38 100644 --- a/drivers/vfio/mdev/idxd/mdev.c +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -181,19 +181,24 @@ int idxd_mdev_get_host_pasid(struct mdev_device *mdev, u32 gpasid, u32 *pasid) struct mm_struct *mm; mm = get_task_mm(current); - if (!mm) + if (!mm) { + dev_warn(mdev_dev(mdev), "%s no mm!\n", __func__); return -ENXIO; + } ioasid_set = ioasid_find_mm_set(mm); if (!ioasid_set) { mmput(mm); + dev_warn(mdev_dev(mdev), "%s no ioasid_set!\n", __func__); return -ENXIO; } *pasid = ioasid_find_by_spid(ioasid_set, gpasid, true); mmput(mm); - if (*pasid == INVALID_IOASID) + if (*pasid == INVALID_IOASID) { + dev_warn(mdev_dev(mdev), "%s invalid ioasid by spid!\n", __func__); return -ENXIO; + } return 0; } @@ -1480,6 +1485,8 @@ static int idxd_vdcm_msix_set_vector_signal(struct vdcm_idxd *vidxd, int vector, char *name; u32 pasid, auxval; int irq, rc; + u8 *bar0 = vidxd->bar0; + u32 msix_perm; dev_dbg(dev, "%s: set signal %d fd: %d\n", __func__, vector, fd); @@ -1523,6 +1530,7 @@ static int idxd_vdcm_msix_set_vector_signal(struct vdcm_idxd *vidxd, int vector, vfio_pdev->ctx[vector].producer.token, rc); vfio_pdev->ctx[vector].producer.token = NULL; } + dev_dbg(dev, "%s: updated irq %d\n", __func__, irq); } return 0; } @@ -1550,13 +1558,31 @@ static int idxd_vdcm_msix_set_vector_signal(struct vdcm_idxd *vidxd, int vector, return 0; } - rc = idxd_mdev_get_pasid(mdev, 
&pasid); - if (rc < 0) { - dev_warn(dev, "%s unable to get pasid, failing\n", __func__); - goto err; - } + /* + * This only points to MSIX entry 1, which is fine for now. + */ + msix_perm = *(u32 *)(bar0 + VIDXD_MSIX_PERM_OFFSET + 8 * vector); + dev_dbg(dev, "MSIX PERM: %#x\n", msix_perm); + if (!(msix_perm & BIT(3))) { + rc = idxd_mdev_get_pasid(mdev, &pasid); + if (rc < 0) { + dev_warn(dev, "%s unable to get pasid, failing\n", __func__); + goto err; + } + + dev_dbg(dev, "%s: pasid: %d\n", __func__, pasid); + } else { + u32 gpasid; - dev_dbg(dev, "%s: pasid: %d\n", __func__, pasid); + gpasid = (msix_perm & GENMASK(31, 12)) >> 12; + rc = idxd_mdev_get_host_pasid(vidxd->ivdev.mdev, gpasid, &pasid); + if (rc < 0) { + dev_warn(dev, "%s guest pasid %u translate failure\n", __func__, gpasid); + goto err; + } + dev_dbg(dev, "%s: guest pasid: %u host pasid: %u\n", + __func__, gpasid, pasid); + } auxval = ims_ctrl_pasid_aux(pasid, true); rc = irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval); @@ -1580,6 +1606,8 @@ static int idxd_vdcm_msix_set_vector_signal(struct vdcm_idxd *vidxd, int vector, vfio_pdev->ctx[vector].producer.token = NULL; } + dev_dbg(dev, "%s: irq %d set\n", __func__, irq); + return 0; irq_err: diff --git a/drivers/vfio/mdev/idxd/vdev.c b/drivers/vfio/mdev/idxd/vdev.c index 534d90ad2aef..af9e9ae7a64c 100644 --- a/drivers/vfio/mdev/idxd/vdev.c +++ b/drivers/vfio/mdev/idxd/vdev.c @@ -84,6 +84,8 @@ static int vidxd_set_ims_pasid(struct vdcm_idxd *vidxd, int index, bool pasid_en rc = idxd_mdev_get_pasid(vidxd->ivdev.mdev, &pasid); if (rc < 0) return rc; + dev_dbg(dev, "IMS entry: %d pasid_en: %u guest pasid %u host pasid: %u\n", + index, pasid_en, gpasid, pasid); auxval = ims_ctrl_pasid_aux(pasid, 1); return irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval); @@ -98,12 +100,16 @@ int vidxd_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int s dev_dbg(dev, "vidxd mmio W %d %x %x: %llx\n", vidxd->wq->id, size, offset, get_reg_val(buf, size)); - if (((size & (size - 1)) != 0) || (offset & (size - 1)) != 0) + if (((size & (size - 1)) != 0) || (offset & (size - 1)) != 0) { + dev_warn(dev, "XXX %s out of bounds\n", __func__); return -EINVAL; + } /* If we don't limit this, we potentially can write out of bound */ - if (size > sizeof(u32)) + if (size > sizeof(u32)) { + dev_warn(dev, "XXX %s size greater than u32\n", __func__); return -EINVAL; + } switch (offset) { case IDXD_GENCFG_OFFSET ... IDXD_GENCFG_OFFSET + 3: @@ -213,9 +219,8 @@ int vidxd_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int s } case VIDXD_MSIX_PERM_OFFSET ... VIDXD_MSIX_PERM_OFFSET + VIDXD_MSIX_PERM_TBL_SZ - 1: { - int index, rc; + int index; u32 msix_perm; - u32 pasid, pasid_en; if (size != sizeof(u32) || !IS_ALIGNED(offset, sizeof(u64))) { dev_warn(dev, "XXX unaligned MSIX PERM access\n"); @@ -225,30 +230,8 @@ int vidxd_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int s index = (offset - VIDXD_MSIX_PERM_OFFSET) / 8; msix_perm = get_reg_val(buf, sizeof(u32)) & 0xfffff00d; memcpy(bar0 + offset, buf, size); - /* - * index 0 for MSIX is emulated for misc interrupts. The MSIX indices from - * 1...N are backed by IMS. Here we would pass in index - 1, which is 0 for - * the first one - */ - if (index > 0) { - pasid_en = (msix_perm >> 3) & 1; - - /* - * When vSVA is turned on, this is the only place where the guest PASID - * can be retrieved by the host. The guest driver writes the PASID to the - * MSIX permission entry. 
In turn the vdcm will translate this to the - * IMS entry. - */ - - if (pasid_en) { - pasid = (msix_perm >> 12) & 0xfffff; - if (!pasid) - break; - } - rc = vidxd_set_ims_pasid(vidxd, index - 1, pasid_en, pasid); - if (rc < 0) - return rc; - } + dev_dbg(dev, "%s writing to MSIX_PERM: %#x offset %#x index: %u\n", + __func__, msix_perm, offset, index); break; } } /* offset */ @@ -1200,7 +1183,7 @@ static void vidxd_wq_enable(struct vdcm_idxd *vidxd, int wq_id) wq_pasid_enable = vwqcfg->pasid_en; if (wq_dedicated(wq)) { - u32 wq_pasid; + u32 wq_pasid = ~0U; bool priv; if (wq_pasid_enable) { @@ -1209,9 +1192,12 @@ static void vidxd_wq_enable(struct vdcm_idxd *vidxd, int wq_id) priv = vwqcfg->priv; gpasid = vwqcfg->pasid; rc = idxd_mdev_get_host_pasid(mdev, gpasid, &wq_pasid); + dev_dbg(dev, "guest pasid enabled, translate gpasid: %d\n", gpasid); } else { priv = 1; rc = idxd_mdev_get_pasid(mdev, &wq_pasid); + dev_dbg(dev, "guest pasid disabled, using default host pasid: %u\n", + wq_pasid); } if (rc < 0) { dev_err(dev, "idxd pasid setup failed wq %d: %d\n", wq->id, rc); -- Gitee From c48abd185108881be84059274e95e990bad49d0a Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 24 Jan 2022 16:10:05 -0700 Subject: [PATCH 1877/2161] dmaengine: idxd: add wq driver name support for accel-config user tool commit e918172fa7b78571a137cb10ecc17536a913864c Intel-BKC. With the possibility of multiple wq drivers that can be bound to the wq, the user config tool accel-config needs a way to know which wq driver to bind to the wq. Introduce per wq driver_name sysfs attribute where the user can indicate the driver to be bound to the wq. This allows accel-config to just bind to the driver using wq->driver_name. Signed-off-by: Dave Jiang Conflicts: drivers/dma/idxd/dma.c include/uapi/linux/idxd.h Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../ABI/stable/sysfs-driver-dma-idxd | 6 ++++ drivers/dma/idxd/cdev.c | 8 ++++++ drivers/dma/idxd/dma.c | 7 +++++ drivers/dma/idxd/idxd.h | 6 ++++ drivers/dma/idxd/sysfs.c | 28 +++++++++++++++++++ include/uapi/linux/idxd.h | 1 + 6 files changed, 56 insertions(+) diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd index e1293bfa4187..16076301f9cb 100644 --- a/Documentation/ABI/stable/sysfs-driver-dma-idxd +++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd @@ -227,6 +227,12 @@ Contact: dmaengine@vger.kernel.org Description: Indicate the number of retires for an enqcmds submission on a sharedwq. A max value to set attribute is capped at 64. +What: /sys/bus/dsa/devices/wq./driver_name +Date: Jan 21, 2022 +KernelVersion: 5.18.0 +Contact: dmaengine@vger.kernel.org +Description: Name of driver to be bounded to the wq. 
+ What: /sys/bus/dsa/devices/engine./group_id Date: Oct 25, 2019 KernelVersion: 5.6.0 diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index b54c953f5c1b..202ee1e95464 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -305,6 +305,7 @@ void idxd_wq_del_cdev(struct idxd_wq *wq) static int idxd_user_drv_probe(struct idxd_dev *idxd_dev) { + struct device *dev = &idxd_dev->conf_dev; struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); struct idxd_device *idxd = wq->idxd; int rc; @@ -313,6 +314,12 @@ static int idxd_user_drv_probe(struct idxd_dev *idxd_dev) return -ENXIO; mutex_lock(&wq->wq_lock); + if (!idxd_wq_driver_name_match(wq, dev)) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_DRV_NAME; + rc = -ENODEV; + goto err_drv_name; + } + wq->type = IDXD_WQT_USER; rc = __drv_enable_wq(wq); if (rc < 0) @@ -331,6 +338,7 @@ static int idxd_user_drv_probe(struct idxd_dev *idxd_dev) err_cdev: __drv_disable_wq(wq); err: +err_drv_name: wq->type = IDXD_WQT_NONE; mutex_unlock(&wq->wq_lock); return rc; diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index 67a2b9ed09f0..2f8ae3cda112 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -288,6 +288,12 @@ static int idxd_dmaengine_drv_probe(struct idxd_dev *idxd_dev) return -ENXIO; mutex_lock(&wq->wq_lock); + if (!idxd_wq_driver_name_match(wq, dev)) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_DRV_NAME; + rc = -ENODEV; + goto err_drv_name; + } + wq->type = IDXD_WQT_KERNEL; rc = __drv_enable_wq(wq); @@ -339,6 +345,7 @@ static int idxd_dmaengine_drv_probe(struct idxd_dev *idxd_dev) err_irq: __drv_disable_wq(wq); err: +err_drv_name: wq->type = IDXD_WQT_NONE; mutex_unlock(&wq->wq_lock); return rc; diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index fd03f44dc5b8..da24e1a6f359 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -218,6 +218,7 @@ struct idxd_wq { u64 max_xfer_bytes; u32 max_batch_size; bool ats_dis; + char driver_name[WQ_NAME_SIZE + 1]; void *private_data; struct list_head vdcm_list; @@ -566,6 +567,11 @@ static inline int idxd_wq_refcount(struct idxd_wq *wq) #define MODULE_ALIAS_IDXD_DEVICE(type) MODULE_ALIAS("idxd:t" __stringify(type) "*") #define IDXD_DEVICES_MODALIAS_FMT "idxd:t%d" +static inline int idxd_wq_driver_name_match(struct idxd_wq *wq, struct device *dev) +{ + return (strncmp(wq->driver_name, dev->driver->name, strlen(dev->driver->name)) == 0); +} + int __must_check __idxd_driver_register(struct idxd_device_driver *idxd_drv, struct module *module, const char *mod_name); #define idxd_driver_register(driver) \ diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index eb00991a0cc7..5e4257b15f55 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1049,6 +1049,33 @@ static ssize_t wq_enqcmds_retries_store(struct device *dev, struct device_attrib static struct device_attribute dev_attr_wq_enqcmds_retries = __ATTR(enqcmds_retries, 0644, wq_enqcmds_retries_show, wq_enqcmds_retries_store); +static ssize_t wq_driver_name_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%s\n", wq->driver_name); +} + +static ssize_t wq_driver_name_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (strlen(buf) > WQ_NAME_SIZE || strlen(buf) == 0) + return -EINVAL; + + memset(wq->driver_name, 0, WQ_NAME_SIZE + 1); + 
strncpy(wq->driver_name, buf, WQ_NAME_SIZE); + strreplace(wq->name, '\n', '\0'); + return count; +} + +static struct device_attribute dev_attr_wq_driver_name = + __ATTR(driver_name, 0644, wq_driver_name_show, wq_driver_name_store); + static struct attribute *idxd_wq_attributes[] = { &dev_attr_wq_clients.attr, &dev_attr_wq_state.attr, @@ -1066,6 +1093,7 @@ static struct attribute *idxd_wq_attributes[] = { &dev_attr_wq_ats_disable.attr, &dev_attr_wq_occupancy.attr, &dev_attr_wq_enqcmds_retries.attr, + &dev_attr_wq_driver_name.attr, NULL, }; diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index cc311a917b9f..55837fa9c7ce 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -30,6 +30,7 @@ enum idxd_scmd_stat { IDXD_SCMD_WQ_NO_PRIV = 0x800f0000, IDXD_SCMD_WQ_IRQ_ERR = 0x80100000, IDXD_SCMD_WQ_PASID_ERR = 0x80200000, + IDXD_SCMD_WQ_NO_DRV_NAME = 0x80300000, }; #define IDXD_SCMD_SOFTERR_MASK 0x80000000 -- Gitee From 693395033f80b6ee19c36d164e576b25656524ca Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 17 Feb 2022 11:51:05 -0800 Subject: [PATCH 1878/2161] crypto: Enable iax_crypto_enable and iax_crypto_disable commit 7ffc3290e508cf28bacd49957e2fe4fc193d4f12 Intel-BKC. Enabling iax_crypto_enable and iax_crypto_disable allow wq resources to be pinned down only when being actively used. With this patch, in order for iax_crypto to do any compression or decompression using the iax hardware, the following must be done from the command-line after all the desired iax wqs and devices have been configured and enabled: # echo 1 > /sys/module/iax_crypto/parameters/iax_crypto_enable In order for the same iax wqs and devices to be disabled, the following must be done: # echo 0 > /sys/module/iax_crypto/parameters/iax_crypto_enable Enabling or disabling these will display a corresponding message in the kernel log: [ 393.373502] iax_crypto: iax_crypto_enable: iax_crypto now ENABLED or [ 562.694727] iax_crypto: iax_crypto_enable: iax_crypto now DISABLED Signed-off-by: Tom Zanussi Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/crypto/iax/iax_crypto_main.c | 112 ++++++++++++++++++++++----- 1 file changed, 92 insertions(+), 20 deletions(-) diff --git a/drivers/crypto/iax/iax_crypto_main.c b/drivers/crypto/iax/iax_crypto_main.c index e310bc59dd67..ffe0655a1286 100644 --- a/drivers/crypto/iax/iax_crypto_main.c +++ b/drivers/crypto/iax/iax_crypto_main.c @@ -39,17 +39,92 @@ static DEFINE_SPINLOCK(iax_devices_lock); static struct crypto_comp *deflate_generic_tfm; -static bool iax_crypto_enabled = true; +static int iax_wqs_get(struct iax_device *iax_device) +{ + struct iax_wq *iax_wq; + int n_wqs = 0; + int ret = 0; + + list_for_each_entry(iax_wq, &iax_device->wqs, list) { + mutex_lock(&iax_wq->wq->wq_lock); + ret = idxd_wq_alloc_resources(iax_wq->wq); + if (ret < 0) { + pr_err("%s: WQ resource alloc failed for iax device %d, wq %d: ret=%d\n", __func__, iax_device->idxd->id, iax_wq->wq->id, ret); + mutex_unlock(&iax_wq->wq->wq_lock); + return ret; + } + idxd_wq_get(iax_wq->wq); + mutex_unlock(&iax_wq->wq->wq_lock); + n_wqs++; + } + + return n_wqs; +} + +static void iax_wqs_put(struct iax_device *iax_device) +{ + struct iax_wq *iax_wq; + + list_for_each_entry(iax_wq, &iax_device->wqs, list) { + mutex_lock(&iax_wq->wq->wq_lock); + idxd_wq_free_resources(iax_wq->wq); + idxd_wq_put(iax_wq->wq); + mutex_unlock(&iax_wq->wq->wq_lock); + } +} + +static int iax_all_wqs_get(void) +{ + struct iax_device *iax_device; + int n_wqs = 0; + int ret; + + 
spin_lock(&iax_devices_lock); + list_for_each_entry(iax_device, &iax_devices, list) { + ret = iax_wqs_get(iax_device); + if (ret < 0) { + spin_unlock(&iax_devices_lock); + return ret; + } + n_wqs += ret; + } + spin_unlock(&iax_devices_lock); + + return n_wqs; +} + +static void iax_all_wqs_put(void) +{ + struct iax_device *iax_device; + + spin_lock(&iax_devices_lock); + list_for_each_entry(iax_device, &iax_devices, list) + iax_wqs_put(iax_device); + spin_unlock(&iax_devices_lock); +} + +static bool iax_crypto_enabled = false; static int iax_crypto_enable(const char *val, const struct kernel_param *kp) { int ret = 0; - if (val[0] == '0') - iax_crypto_enabled = true; /* for BKC, never allow disable */ - else if (val[0] == '1') - iax_crypto_enabled = true; - else - ret = -EINVAL; + if (val[0] == '0') { + iax_crypto_enabled = false; + iax_all_wqs_put(); + } else if (val[0] == '1') { + ret = iax_all_wqs_get(); + if (ret == 0) { + pr_info("%s: no wqs available, not enabling iax_crypto\n", __func__); + return ret; + } else if (ret < 0) { + pr_err("%s: iax_crypto enable failed: ret=%d\n", __func__, ret); + return ret; + } else + iax_crypto_enabled = true; + } else { + pr_err("%s: iax_crypto failed, bad enable val: ret=%d\n", __func__, -EINVAL); + return -EINVAL; + } pr_info("%s: iax_crypto now %s\n", __func__, iax_crypto_enabled ? "ENABLED" : "DISABLED"); @@ -57,7 +132,8 @@ static int iax_crypto_enable(const char *val, const struct kernel_param *kp) return ret; } static const struct kernel_param_ops enable_ops = { - .set = iax_crypto_enable, + .set = iax_crypto_enable, + .get = param_get_bool, }; module_param_cb(iax_crypto_enable, &enable_ops, &iax_crypto_enabled, 0644); MODULE_PARM_DESC(iax_crypto_enable, "Enable (value = 1) or disable (value = 0) iax_crypto"); @@ -391,8 +467,6 @@ static int save_iax_wq(struct idxd_wq *wq) BUG_ON(nr_iax == 0); cpus_per_iax = nr_cpus / nr_iax; - - idxd_wq_get(wq); out: spin_unlock(&iax_devices_lock); @@ -428,8 +502,6 @@ static void remove_iax_wq(struct idxd_wq *wq) cpus_per_iax = nr_cpus / nr_iax; else cpus_per_iax = 0; - - idxd_wq_put(wq); } static struct idxd_wq *request_iax_wq(int iax) @@ -1013,6 +1085,7 @@ static int iax_crypto_probe(struct idxd_dev *idxd_dev) struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); struct idxd_device *idxd = wq->idxd; struct idxd_driver_data *data = idxd->data; + struct device *dev = &idxd_dev->conf_dev; int ret = 0; if (idxd->state != IDXD_DEV_ENABLED) @@ -1023,6 +1096,13 @@ static int iax_crypto_probe(struct idxd_dev *idxd_dev) mutex_lock(&wq->wq_lock); + if (!idxd_wq_driver_name_match(wq, dev)) { + pr_warn("%s: wq driver_name match failed: wq driver_name %s, dev driver name %s\n", __func__, wq->driver_name, dev->driver->name); + idxd->cmd_status = IDXD_SCMD_WQ_NO_DRV_NAME; + ret = -ENODEV; + goto err; + } + wq->type = IDXD_WQT_KERNEL; ret = __drv_enable_wq(wq); @@ -1032,13 +1112,6 @@ static int iax_crypto_probe(struct idxd_dev *idxd_dev) goto err; } - ret = idxd_wq_alloc_resources(wq); - if (ret < 0) { - idxd->cmd_status = IDXD_SCMD_WQ_RES_ALLOC_ERR; - pr_warn("%s: WQ resource alloc failed: ret=%d\n", __func__, ret); - goto err_alloc; - } - ret = idxd_wq_init_percpu_ref(wq); if (ret < 0) { idxd->cmd_status = IDXD_SCMD_PERCPU_ERR; @@ -1061,7 +1134,6 @@ static int iax_crypto_probe(struct idxd_dev *idxd_dev) percpu_ref_exit(&wq->wq_active); err_ref: idxd_wq_free_resources(wq); -err_alloc: __drv_disable_wq(wq); err: wq->type = IDXD_WQT_NONE; -- Gitee From b7c53bf9ce4fcd7d37a12048e5b30aeaeef84cc2 Mon Sep 17 00:00:00 2001 From: Ben Walker 
Date: Tue, 30 Nov 2021 15:58:16 -0700 Subject: [PATCH 1879/2161] dmaengine: Remove dma_async_is_complete from client API commit caabe3641f5dbed4fe63cd366e1af91679eb59f0 Intel-BKC. This is never actually used by any existing DMA clients. It is only used, via dma_cookie_status, by providers. Signed-off-by: Ben Walker Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/driver-api/dmaengine/client.rst | 5 ++-- drivers/dma/amba-pl08x.c | 1 - drivers/dma/at_hdmac.c | 3 +- drivers/dma/dmaengine.h | 10 ++++++- include/linux/dmaengine.h | 28 ++----------------- 5 files changed, 15 insertions(+), 32 deletions(-) diff --git a/Documentation/driver-api/dmaengine/client.rst b/Documentation/driver-api/dmaengine/client.rst index 45953f171500..dfab71c637f6 100644 --- a/Documentation/driver-api/dmaengine/client.rst +++ b/Documentation/driver-api/dmaengine/client.rst @@ -249,9 +249,8 @@ Further APIs: the documentation in include/linux/dmaengine.h for a more complete description of this API. - This can be used in conjunction with dma_async_is_complete() and - the cookie returned from dmaengine_submit() to check for - completion of a specific DMA transaction. + This can be used with the cookie returned from dmaengine_submit() + to check for completion of a specific DMA transaction. .. note:: diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index 9adc7a2fa3d3..8bf8f42b8b06 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -1544,7 +1544,6 @@ static struct dma_async_tx_descriptor *pl08x_prep_dma_interrupt( } /* - * Code accessing dma_async_is_complete() in a tight loop may give problems. * If slaves are relying on interrupts to signal completion this function * must not be called with interrupts disabled. */ diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c index 303bc3e601a1..0e320a74ee45 100644 --- a/drivers/dma/at_hdmac.c +++ b/drivers/dma/at_hdmac.c @@ -1466,8 +1466,7 @@ static int atc_terminate_all(struct dma_chan *chan) * @txstate: if not %NULL updated with transaction state * * If @txstate is passed in, upon return it reflect the driver - * internal state and can be used with dma_async_is_complete() to check - * the status of multiple cookies without re-checking hardware state. + * internal state. 
*/ static enum dma_status atc_tx_status(struct dma_chan *chan, diff --git a/drivers/dma/dmaengine.h b/drivers/dma/dmaengine.h index 1bfbd64b1371..04c1ebf96640 100644 --- a/drivers/dma/dmaengine.h +++ b/drivers/dma/dmaengine.h @@ -79,7 +79,15 @@ static inline enum dma_status dma_cookie_status(struct dma_chan *chan, state->residue = 0; state->in_flight_bytes = 0; } - return dma_async_is_complete(cookie, complete, used); + + if (complete <= used) { + if ((cookie <= complete) || (cookie > used)) + return DMA_COMPLETE; + } else { + if ((cookie <= complete) && (cookie > used)) + return DMA_COMPLETE; + } + return DMA_IN_PROGRESS; } static inline void dma_set_residue(struct dma_tx_state *state, u32 residue) diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 97ad078d1288..1b7f70008f47 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1422,9 +1422,9 @@ static inline void dma_async_issue_pending(struct dma_chan *chan) * @last: returns last completed cookie, can be NULL * @used: returns last issued cookie, can be NULL * - * If @last and @used are passed in, upon return they reflect the driver - * internal state and can be used with dma_async_is_complete() to check - * the status of multiple cookies without re-checking hardware state. + * If @last and @used are passed in, upon return they reflect the most + * recently submitted (used) cookie and the most recently completed + * cookie. */ static inline enum dma_status dma_async_is_tx_complete(struct dma_chan *chan, dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used) @@ -1440,28 +1440,6 @@ static inline enum dma_status dma_async_is_tx_complete(struct dma_chan *chan, return status; } -/** - * dma_async_is_complete - test a cookie against chan state - * @cookie: transaction identifier to test status of - * @last_complete: last know completed transaction - * @last_used: last cookie value handed out - * - * dma_async_is_complete() is used in dma_async_is_tx_complete() - * the test logic is separated for lightweight testing of multiple cookies - */ -static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie, - dma_cookie_t last_complete, dma_cookie_t last_used) -{ - if (last_complete <= last_used) { - if ((cookie <= last_complete) || (cookie > last_used)) - return DMA_COMPLETE; - } else { - if ((cookie <= last_complete) && (cookie > last_used)) - return DMA_COMPLETE; - } - return DMA_IN_PROGRESS; -} - static inline void dma_set_tx_state(struct dma_tx_state *st, dma_cookie_t last, dma_cookie_t used, u32 residue) { -- Gitee From d526d3bf7dce333b512d732509903c0d48031f88 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 26 Feb 2020 12:18:39 +0200 Subject: [PATCH 1880/2161] dmaengine: Use negative condition for better readability commit 3a92063be16873a10648a81be0b1be42a9d54ee9 upstream. When negative condition is in use we may decrease indentation level and make the main part of logic better visible. 
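The refactoring pattern is easy to show in isolation. The fragment below is illustrative only (the function names are invented for the comparison); the fields come from struct dma_tx_state.

	#include <linux/dmaengine.h>

	/* Before: main logic indented under a positive test. */
	static void fill_state_nested(struct dma_tx_state *state, u32 residue)
	{
		if (state) {
			state->residue = residue;
			state->in_flight_bytes = 0;
		}
	}

	/* After: bail out early on the negative condition so the main path
	 * stays at the outer indentation level.
	 */
	static void fill_state_early_return(struct dma_tx_state *state, u32 residue)
	{
		if (!state)
			return;

		state->residue = residue;
		state->in_flight_bytes = 0;
	}
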
Signed-off-by: Andy Shevchenko Reviewed-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20200226101842.29426-2-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/dmaengine.h | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 1b7f70008f47..82a738403fd2 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -630,10 +630,11 @@ static inline void dmaengine_unmap_put(struct dmaengine_unmap_data *unmap) static inline void dma_descriptor_unmap(struct dma_async_tx_descriptor *tx) { - if (tx->unmap) { - dmaengine_unmap_put(tx->unmap); - tx->unmap = NULL; - } + if (!tx->unmap) + return; + + dmaengine_unmap_put(tx->unmap); + tx->unmap = NULL; } #ifndef CONFIG_ASYNC_TX_ENABLE_CHANNEL_SWITCH @@ -1443,11 +1444,12 @@ static inline enum dma_status dma_async_is_tx_complete(struct dma_chan *chan, static inline void dma_set_tx_state(struct dma_tx_state *st, dma_cookie_t last, dma_cookie_t used, u32 residue) { - if (st) { - st->last = last; - st->used = used; - st->residue = residue; - } + if (!st) + return; + + st->last = last; + st->used = used; + st->residue = residue; } #ifdef CONFIG_DMA_ENGINE @@ -1518,12 +1520,11 @@ static inline int dmaengine_desc_set_reuse(struct dma_async_tx_descriptor *tx) if (ret) return ret; - if (caps.descriptor_reuse) { - tx->flags |= DMA_CTRL_REUSE; - return 0; - } else { + if (!caps.descriptor_reuse) return -EPERM; - } + + tx->flags |= DMA_CTRL_REUSE; + return 0; } static inline void dmaengine_desc_clear_reuse(struct dma_async_tx_descriptor *tx) @@ -1539,10 +1540,10 @@ static inline bool dmaengine_desc_test_reuse(struct dma_async_tx_descriptor *tx) static inline int dmaengine_desc_free(struct dma_async_tx_descriptor *desc) { /* this is supported for reusable desc, so check that */ - if (dmaengine_desc_test_reuse(desc)) - return desc->desc_free(desc); - else + if (!dmaengine_desc_test_reuse(desc)) return -EPERM; + + return desc->desc_free(desc); } /* --- DMA device --- */ -- Gitee From 928e9bd6246a689edf91c7d9e06d3a878f8607d9 Mon Sep 17 00:00:00 2001 From: Ben Walker Date: Wed, 1 Dec 2021 11:03:06 -0700 Subject: [PATCH 1881/2161] dmaengine: Move dma_set_tx_state to the provider API header commit 20c56f51843bacdd98a6c0ca6744757880b8ae3a Intel-BKC. This is only used by DMA providers, not DMA clients. Move it next to the other cookie utility functions. 
Signed-off-by: Ben Walker Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.h | 11 +++++++++++ include/linux/dmaengine.h | 11 ----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/dma/dmaengine.h b/drivers/dma/dmaengine.h index 04c1ebf96640..80787829539d 100644 --- a/drivers/dma/dmaengine.h +++ b/drivers/dma/dmaengine.h @@ -90,6 +90,17 @@ static inline enum dma_status dma_cookie_status(struct dma_chan *chan, return DMA_IN_PROGRESS; } +static inline void dma_set_tx_state(struct dma_tx_state *st, + dma_cookie_t last, dma_cookie_t used, u32 residue) +{ + if (!st) + return; + + st->last = last; + st->used = used; + st->residue = residue; +} + static inline void dma_set_residue(struct dma_tx_state *state, u32 residue) { if (state) diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 82a738403fd2..d7c53830546d 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1441,17 +1441,6 @@ static inline enum dma_status dma_async_is_tx_complete(struct dma_chan *chan, return status; } -static inline void -dma_set_tx_state(struct dma_tx_state *st, dma_cookie_t last, dma_cookie_t used, u32 residue) -{ - if (!st) - return; - - st->last = last; - st->used = used; - st->residue = residue; -} - #ifdef CONFIG_DMA_ENGINE struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type); enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie); -- Gitee From acf77bf869e29881e7f7f7f6a1a84d4a3bfcbb08 Mon Sep 17 00:00:00 2001 From: Ben Walker Date: Wed, 1 Dec 2021 10:59:18 -0700 Subject: [PATCH 1882/2161] dmaengine: Remove the last, used parameters in dma_async_is_tx_complete commit 29381fe8cdd2b15561a5bea9f12eee86fdfad39d Intel-BKC. These are only used by a single driver (pxa_camera) which knows exactly which DMA engine it is dealing with and therefore knows the behavior of the cookie values. It can grab the value it needs directly from the dma_chan instead. No other DMA client assumes anything about the behavior of the cookie values, so the cookie becomes an opaque handle after this patch. Signed-off-by: Ben Walker Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/driver-api/dmaengine/client.rst | 17 +++-------------- .../driver-api/dmaengine/provider.rst | 6 +++--- drivers/crypto/stm32/stm32-hash.c | 3 +-- drivers/dma/dmaengine.c | 2 +- drivers/dma/dmatest.c | 3 +-- drivers/media/platform/omap/omap_vout_vrfb.c | 2 +- drivers/media/platform/pxa_camera.c | 13 +++++++++++-- drivers/rapidio/devices/rio_mport_cdev.c | 3 +-- include/linux/dmaengine.h | 18 +++--------------- 9 files changed, 25 insertions(+), 42 deletions(-) diff --git a/Documentation/driver-api/dmaengine/client.rst b/Documentation/driver-api/dmaengine/client.rst index dfab71c637f6..fb2f6cfd4530 100644 --- a/Documentation/driver-api/dmaengine/client.rst +++ b/Documentation/driver-api/dmaengine/client.rst @@ -162,8 +162,8 @@ The details of these operations are: dma_cookie_t dmaengine_submit(struct dma_async_tx_descriptor *desc) - This returns a cookie can be used to check the progress of DMA engine - activity via other DMA engine calls not covered in this document. + This returns a cookie that can be used to check the progress of a transaction + via dma_async_is_tx_complete(). dmaengine_submit() will not start the DMA operation, it merely adds it to the pending queue. For this, see step 5, dma_async_issue_pending. @@ -243,22 +243,11 @@ Further APIs: .. 
code-block:: c enum dma_status dma_async_is_tx_complete(struct dma_chan *chan, - dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used) - - This can be used to check the status of the channel. Please see - the documentation in include/linux/dmaengine.h for a more complete - description of this API. + dma_cookie_t cookie) This can be used with the cookie returned from dmaengine_submit() to check for completion of a specific DMA transaction. - .. note:: - - Not all DMA engine drivers can return reliable information for - a running DMA channel. It is recommended that DMA engine users - pause or stop (via dmaengine_terminate_all()) the channel before - using this API. - 5. Synchronize termination API .. code-block:: c diff --git a/Documentation/driver-api/dmaengine/provider.rst b/Documentation/driver-api/dmaengine/provider.rst index 16cf096527ee..d5b1e3de9867 100644 --- a/Documentation/driver-api/dmaengine/provider.rst +++ b/Documentation/driver-api/dmaengine/provider.rst @@ -466,10 +466,10 @@ where to put them) dma_cookie_t -- it's a DMA transaction ID that will increment over time. +- it's a DMA transaction ID. -- Not really relevant any more since the introduction of ``virt-dma`` - that abstracts it away. +- The value can be chosen by the provider, or use the helper APIs + such as dma_cookie_assign() and dma_cookie_complete(). DMA_CTRL_ACK diff --git a/drivers/crypto/stm32/stm32-hash.c b/drivers/crypto/stm32/stm32-hash.c index dcce15b55809..37ef6df6e4d5 100644 --- a/drivers/crypto/stm32/stm32-hash.c +++ b/drivers/crypto/stm32/stm32-hash.c @@ -451,8 +451,7 @@ static int stm32_hash_xmit_dma(struct stm32_hash_dev *hdev, msecs_to_jiffies(100))) err = -ETIMEDOUT; - if (dma_async_is_tx_complete(hdev->dma_lch, cookie, - NULL, NULL) != DMA_COMPLETE) + if (dma_async_is_tx_complete(hdev->dma_lch, cookie) != DMA_COMPLETE) err = -ETIMEDOUT; if (err) { diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 2cfa8458b51b..590f238a0671 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -523,7 +523,7 @@ enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie) dma_async_issue_pending(chan); do { - status = dma_async_is_tx_complete(chan, cookie, NULL, NULL); + status = dma_async_is_tx_complete(chan, cookie); if (time_after_eq(jiffies, dma_sync_wait_timeout)) { dev_err(chan->device->dev, "%s: timeout!\n", __func__); return DMA_ERROR; diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index 5a49458602d0..e1d8f6e42f93 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -813,8 +813,7 @@ static int dmatest_func(void *data) done->done, msecs_to_jiffies(params->timeout)); - status = dma_async_is_tx_complete(chan, cookie, NULL, - NULL); + status = dma_async_is_tx_complete(chan, cookie); } if (!done->done) { diff --git a/drivers/media/platform/omap/omap_vout_vrfb.c b/drivers/media/platform/omap/omap_vout_vrfb.c index 6bd672cbdb62..6750db90b450 100644 --- a/drivers/media/platform/omap/omap_vout_vrfb.c +++ b/drivers/media/platform/omap/omap_vout_vrfb.c @@ -289,7 +289,7 @@ int omap_vout_prepare_vrfb(struct omap_vout_device *vout, vout->vrfb_dma_tx.tx_status == 1, VRFB_TX_TIMEOUT); - status = dma_async_is_tx_complete(chan, cookie, NULL, NULL); + status = dma_async_is_tx_complete(chan, cookie); if (vout->vrfb_dma_tx.tx_status == 0) { pr_err("%s: Timeout while waiting for DMA\n", __func__); diff --git a/drivers/media/platform/pxa_camera.c b/drivers/media/platform/pxa_camera.c index 6e04e3ec61ba..dd3fb8d2e86f 100644 --- 
a/drivers/media/platform/pxa_camera.c +++ b/drivers/media/platform/pxa_camera.c @@ -1089,8 +1089,17 @@ static void pxa_camera_dma_irq(struct pxa_camera_dev *pcdev, last_buf = list_entry(pcdev->capture.prev, struct pxa_buffer, queue); last_status = dma_async_is_tx_complete(pcdev->dma_chans[chan], - last_buf->cookie[chan], - NULL, &last_issued); + last_buf->cookie[chan]); + /* + * Peek into the channel and read the last cookie that was issued. + * This is a layering violation - the dmaengine API does not officially + * provide this information. Since this camera driver is tightly coupled + * with a specific DMA device we know exactly how this cookie value will + * behave. Otherwise, this wouldn't be safe. + */ + last_issued = pcdev->dma_chans[chan]->cookie; + barrier(); + if (camera_status & overrun && last_status != DMA_COMPLETE) { dev_dbg(pcdev_to_dev(pcdev), "FIFO overrun! CISR: %x\n", diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 2b08fdeb87c1..41e3c3fcd925 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -599,8 +599,7 @@ static void dma_xfer_callback(void *param) struct mport_dma_req *req = (struct mport_dma_req *)param; struct mport_cdev_priv *priv = req->priv; - req->status = dma_async_is_tx_complete(priv->dmach, req->cookie, - NULL, NULL); + req->status = dma_async_is_tx_complete(priv->dmach, req->cookie); complete(&req->req_comp); kref_put(&req->refcount, dma_req_free); } diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index d7c53830546d..cf5b0bbcaad6 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1420,25 +1420,13 @@ static inline void dma_async_issue_pending(struct dma_chan *chan) * dma_async_is_tx_complete - poll for transaction completion * @chan: DMA channel * @cookie: transaction identifier to check status of - * @last: returns last completed cookie, can be NULL - * @used: returns last issued cookie, can be NULL - * - * If @last and @used are passed in, upon return they reflect the most - * recently submitted (used) cookie and the most recently completed - * cookie. */ static inline enum dma_status dma_async_is_tx_complete(struct dma_chan *chan, - dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used) + dma_cookie_t cookie) { struct dma_tx_state state; - enum dma_status status; - - status = chan->device->device_tx_status(chan, cookie, &state); - if (last) - *last = state.last; - if (used) - *used = state.used; - return status; + + return chan->device->device_tx_status(chan, cookie, &state); } #ifdef CONFIG_DMA_ENGINE -- Gitee From 3336891619d91cead89e20adf2cee5af66ae9959 Mon Sep 17 00:00:00 2001 From: Ben Walker Date: Thu, 2 Dec 2021 16:05:50 -0700 Subject: [PATCH 1883/2161] dmaengine: Remove last, used from dma_tx_state commit 08a196633ebe3fb5ddb89a6edcbef1ecd22e6c7b Intel-BKC. Nothing uses these and they don't convey usable information. 
Signed-off-by: Ben Walker Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.h | 4 ---- include/linux/dmaengine.h | 4 ---- 2 files changed, 8 deletions(-) diff --git a/drivers/dma/dmaengine.h b/drivers/dma/dmaengine.h index 80787829539d..2a695a30d334 100644 --- a/drivers/dma/dmaengine.h +++ b/drivers/dma/dmaengine.h @@ -74,8 +74,6 @@ static inline enum dma_status dma_cookie_status(struct dma_chan *chan, complete = chan->completed_cookie; barrier(); if (state) { - state->last = complete; - state->used = used; state->residue = 0; state->in_flight_bytes = 0; } @@ -96,8 +94,6 @@ static inline void dma_set_tx_state(struct dma_tx_state *st, if (!st) return; - st->last = last; - st->used = used; st->residue = residue; } diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index cf5b0bbcaad6..1ea6e5e2d176 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -698,16 +698,12 @@ static inline struct dma_async_tx_descriptor *txd_next(struct dma_async_tx_descr /** * struct dma_tx_state - filled in to report the status of * a transfer. - * @last: last completed DMA cookie - * @used: last issued DMA cookie (i.e. the one in progress) * @residue: the remaining number of bytes left to transmit * on the selected transfer for states DMA_IN_PROGRESS and * DMA_PAUSED if this is implemented in the driver, else 0 * @in_flight_bytes: amount of data in bytes cached by the DMA. */ struct dma_tx_state { - dma_cookie_t last; - dma_cookie_t used; u32 residue; u32 in_flight_bytes; }; -- Gitee From 345ba0f2933df65f7b66a65b1e002a4fae67fb96 Mon Sep 17 00:00:00 2001 From: Ben Walker Date: Mon, 6 Dec 2021 11:45:10 -0700 Subject: [PATCH 1884/2161] dmaengine: Providers should prefer dma_set_residue over dma_set_tx_state commit a87a45ce5695d96a3d415f637eb878398e973ca0 Intel-BKC. The dma_set_tx_state function will go away shortly. The two functions are functionally equivalent. 
Signed-off-by: Ben Walker Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/imx-sdma.c | 3 +-- drivers/dma/mmp_tdma.c | 3 +-- drivers/dma/mxs-dma.c | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index 67736c801f3c..5d21916b923a 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -1675,8 +1675,7 @@ static enum dma_status sdma_tx_status(struct dma_chan *chan, } spin_unlock_irqrestore(&sdmac->vc.lock, flags); - dma_set_tx_state(txstate, chan->completed_cookie, chan->cookie, - residue); + dma_set_residue(txstate, residue); return sdmac->status; } diff --git a/drivers/dma/mmp_tdma.c b/drivers/dma/mmp_tdma.c index 89d90c456c0c..b4ce2f7effff 100644 --- a/drivers/dma/mmp_tdma.c +++ b/drivers/dma/mmp_tdma.c @@ -532,8 +532,7 @@ static enum dma_status mmp_tdma_tx_status(struct dma_chan *chan, struct mmp_tdma_chan *tdmac = to_mmp_tdma_chan(chan); tdmac->pos = mmp_tdma_get_pos(tdmac); - dma_set_tx_state(txstate, chan->completed_cookie, chan->cookie, - tdmac->buf_len - tdmac->pos); + dma_set_residue(txstate, tdmac->buf_len - tdmac->pos); return tdmac->status; } diff --git a/drivers/dma/mxs-dma.c b/drivers/dma/mxs-dma.c index 3039bba0e4d5..595a65586bef 100644 --- a/drivers/dma/mxs-dma.c +++ b/drivers/dma/mxs-dma.c @@ -683,8 +683,7 @@ static enum dma_status mxs_dma_tx_status(struct dma_chan *chan, residue -= bar; } - dma_set_tx_state(txstate, chan->completed_cookie, chan->cookie, - residue); + dma_set_residue(txstate, residue); return mxs_chan->status; } -- Gitee From e51fe54c64ce33f96be65df4c4b09aff00c393e0 Mon Sep 17 00:00:00 2001 From: Ben Walker Date: Thu, 2 Dec 2021 10:50:38 -0700 Subject: [PATCH 1885/2161] dmaengine: Remove dma_set_tx_state commit 877254351db362b1c9383dcf6e5d163ce4204175 Intel-BKC. Nothing calls this anymore. Signed-off-by: Ben Walker Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/dma/dmaengine.h b/drivers/dma/dmaengine.h index 2a695a30d334..c31383c9db1d 100644 --- a/drivers/dma/dmaengine.h +++ b/drivers/dma/dmaengine.h @@ -88,15 +88,6 @@ static inline enum dma_status dma_cookie_status(struct dma_chan *chan, return DMA_IN_PROGRESS; } -static inline void dma_set_tx_state(struct dma_tx_state *st, - dma_cookie_t last, dma_cookie_t used, u32 residue) -{ - if (!st) - return; - - st->residue = residue; -} - static inline void dma_set_residue(struct dma_tx_state *state, u32 residue) { if (state) -- Gitee From eeea38851a863cb96b9e2e6382b83b649c2e6a2a Mon Sep 17 00:00:00 2001 From: Ben Walker Date: Thu, 2 Dec 2021 12:00:17 -0700 Subject: [PATCH 1886/2161] dmaengine: Add provider documentation on cookie assignment commit 7bdfd20a38852f629c6818af437a20a5a0aa7d2d Intel-BKC. Clarify the rules on assigning cookies to DMA transactions. Signed-off-by: Ben Walker Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../driver-api/dmaengine/provider.rst | 45 +++++++++++++++---- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/Documentation/driver-api/dmaengine/provider.rst b/Documentation/driver-api/dmaengine/provider.rst index d5b1e3de9867..6d237717f287 100644 --- a/Documentation/driver-api/dmaengine/provider.rst +++ b/Documentation/driver-api/dmaengine/provider.rst @@ -344,7 +344,9 @@ supported. - tx_submit: A pointer to a function you have to implement, that is supposed to push the current transaction descriptor to a - pending queue, waiting for issue_pending to be called. 
+ pending queue, waiting for issue_pending to be called. Each + descriptor is given a cookie to identify it. See the section + "Cookie Management" below. - In this structure the function pointer callback_result can be initialized in order for the submitter to be notified that a @@ -449,6 +451,40 @@ supported. - May sleep. +Cookie Management +------------------ + +When a transaction is queued for submission via tx_submit(), the provider +must assign that transaction a cookie (dma_cookie_t) to uniquely identify it. +The provider is allowed to perform this assignment however it wants, but for +convenience the following utility functions are available to create +monotonically increasing cookies + + .. code-block:: c + + void dma_cookie_init(struct dma_chan *chan); + + Called once at channel creation + + .. code-block:: c + + dma_cookie_t dma_cookie_assign(struct dma_async_tx_descriptor *tx); + + Assign a cookie to the given descriptor + + .. code-block:: c + + void dma_cookie_complete(struct dma_async_tx_descriptor *tx); + + Mark the descriptor as complete and invalidate the cookie + + .. code-block:: c + + enum dma_status dma_cookie_status(struct dma_chan *chan, + dma_cookie_t cookie, struct dma_tx_state *state); + + Report the status of the cookie and filling in state, if not NULL. + Misc notes ========== @@ -464,13 +500,6 @@ where to put them) - Makes sure that dependent operations are run before marking it as complete. -dma_cookie_t - -- it's a DMA transaction ID. - -- The value can be chosen by the provider, or use the helper APIs - such as dma_cookie_assign() and dma_cookie_complete(). - DMA_CTRL_ACK - If clear, the descriptor cannot be reused by provider until the -- Gitee From 1399a9688f708c7aac0bf6623c3915b680bc761b Mon Sep 17 00:00:00 2001 From: Ben Walker Date: Mon, 6 Dec 2021 11:22:36 -0700 Subject: [PATCH 1887/2161] dmaengine: idxd: idxd_desc.id is now a u16 commit 4e4c92933f0abc77eed2810d90034afd43ac44e6 Intel-BKC. This is going to be packed into the cookie. It does not need to be negative or larger than u16. Signed-off-by: Ben Walker Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/idxd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index da24e1a6f359..2c1c82276f1d 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -348,7 +348,7 @@ struct idxd_desc { struct dma_async_tx_descriptor txd; struct llist_node llnode; struct list_head list; - int id; + u16 id; int cpu; struct idxd_wq *wq; }; -- Gitee From 215cd40955b18a1c7da3a3def9dcf5f1a94ee4cc Mon Sep 17 00:00:00 2001 From: Ben Walker Date: Thu, 25 Nov 2021 14:53:39 -0500 Subject: [PATCH 1888/2161] dmaengine: idxd: Support device_tx_status commit a079359ae604fb3b065c1c6047457e491c65768b Intel-BKC. This can now be supported even for devices that complete operations out of order. Add support for directly polling transactions. 
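The cookie layout introduced by the diff below can be restated as a small encode/decode sketch. The EX_* names are illustrative (the driver's own DESC_* macros live in drivers/dma/idxd/dma.c): the low 16 bits carry the descriptor id, the bits above them carry a wrapping generation counter starting at 1, the bit just below the sign bit marks descriptors whose completion arrives via interrupt (so tx_status never polls them), and the sign bit stays clear so the cookie is always positive.

    #include <linux/dmaengine.h>

    #define EX_COOKIE_BITS      (sizeof(dma_cookie_t) * 8)
    #define EX_DESC_ID_BITS     16
    #define EX_DESC_ID_MASK     ((1 << EX_DESC_ID_BITS) - 1)
    #define EX_INTERRUPT_FLAG   (1 << (EX_COOKIE_BITS - 2))
    #define EX_GEN_MAX          ((1 << (EX_COOKIE_BITS - EX_DESC_ID_BITS - 2)) - 1)

    static inline dma_cookie_t ex_make_cookie(u16 gen, u16 id, bool irq_completed)
    {
            /* gen must stay in 1..EX_GEN_MAX so the cookie is never <= 0 */
            dma_cookie_t cookie = ((dma_cookie_t)gen << EX_DESC_ID_BITS) |
                                  (id & EX_DESC_ID_MASK);

            return irq_completed ? (cookie | EX_INTERRUPT_FLAG) : cookie;
    }

    static inline u16 ex_cookie_to_desc_id(dma_cookie_t cookie)
    {
            return cookie & EX_DESC_ID_MASK;
    }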
Signed-off-by: Ben Walker Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 1 + drivers/dma/idxd/dma.c | 85 ++++++++++++++++++++++++++++++++++++++- drivers/dma/idxd/idxd.h | 1 + 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 45142d8ce78d..de0f543bba2c 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -148,6 +148,7 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) desc->iax_completion = &wq->iax_compls[i]; desc->compl_dma = wq->compls_addr + idxd->data->compl_size * i; desc->id = i; + desc->gen = 1; desc->wq = wq; desc->cpu = -1; } diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index 2f8ae3cda112..fe85f6238091 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -12,6 +12,23 @@ #include "registers.h" #include "idxd.h" + +#define DMA_COOKIE_BITS (sizeof(dma_cookie_t) * 8) +/* + * The descriptor id takes the lower 16 bits of the cookie. + */ +#define DESC_ID_BITS 16 +#define DESC_ID_MASK ((1 << DESC_ID_BITS) - 1) +/* + * The 'generation' is in the upper half of the cookie. But dma_cookie_t + * is signed, so we leave the upper-most bit for the sign. Further, we + * need to flag whether a cookie corresponds to an operation that is + * being completed via interrupt to avoid polling it, which takes + * the second most upper bit. So we subtract two bits from the upper half. + */ +#define DESC_GEN_MAX ((1 << (DMA_COOKIE_BITS - DESC_ID_BITS - 2)) - 1) +#define DESC_INTERRUPT_FLAG (1 << (DMA_COOKIE_BITS - 2)) + static inline struct idxd_wq *to_idxd_wq(struct dma_chan *c) { struct idxd_dma_chan *idxd_chan; @@ -137,13 +154,67 @@ static void idxd_dma_free_chan_resources(struct dma_chan *chan) idxd_wq_refcount(wq)); } + static enum dma_status idxd_dma_tx_status(struct dma_chan *dma_chan, dma_cookie_t cookie, struct dma_tx_state *txstate) { - return DMA_OUT_OF_ORDER; + u8 status; + struct idxd_wq *wq; + struct idxd_desc *desc; + u32 idx; + + memset(txstate, 0, sizeof(*txstate)); + + if (dma_submit_error(cookie)) + return DMA_ERROR; + + wq = to_idxd_wq(dma_chan); + + idx = cookie & DESC_ID_MASK; + if (idx >= wq->num_descs) + return DMA_ERROR; + + desc = wq->descs[idx]; + + if (desc->txd.cookie != cookie) { + /* + * The user asked about an old transaction + */ + return DMA_COMPLETE; + } + + /* + * For descriptors completed via interrupt, we can't go + * look at the completion status directly because it races + * with the IRQ handler recyling the descriptor. However, + * since in this case we can rely on the interrupt handler + * to invalidate the cookie when the command completes we + * know that if we get here, the command is still in + * progress. + */ + if ((cookie & DESC_INTERRUPT_FLAG) != 0) + return DMA_IN_PROGRESS; + + status = desc->completion->status & DSA_COMP_STATUS_MASK; + + if (status) { + /* + * Check against the original status as ABORT is software defined + * and 0xff, which DSA_COMP_STATUS_MASK can mask out. + */ + if (unlikely(desc->completion->status == IDXD_COMP_DESC_ABORT)) + idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT, true); + else + idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL, true); + + return DMA_COMPLETE; + } + + return DMA_IN_PROGRESS; } + /* * issue_pending() does not need to do anything since tx_submit() does the job * already. 
@@ -160,7 +231,17 @@ static dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx) int rc; struct idxd_desc *desc = container_of(tx, struct idxd_desc, txd); - cookie = dma_cookie_assign(tx); + cookie = (desc->gen << DESC_ID_BITS) | (desc->id & DESC_ID_MASK); + + if ((desc->hw->flags & IDXD_OP_FLAG_RCI) != 0) + cookie |= DESC_INTERRUPT_FLAG; + + if (desc->gen == DESC_GEN_MAX) + desc->gen = 1; + else + desc->gen++; + + tx->cookie = cookie; rc = idxd_submit_desc(wq, desc); if (rc < 0) { diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 2c1c82276f1d..026d454cadd3 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -349,6 +349,7 @@ struct idxd_desc { struct llist_node llnode; struct list_head list; u16 id; + u16 gen; int cpu; struct idxd_wq *wq; }; -- Gitee From 01bce65bd3ed2f556055279661459229323bd6b2 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 8 Jul 2022 14:57:07 +0800 Subject: [PATCH 1889/2161] Revert "dmaengine: cookie bypass for out of order completion" commit opencloudos. This reverts commit 900807990c2065f016ceef01b692e7d5750a8209. This is no longer necessary now that all assumptions about the order of completions have been removed from the dmaengine client API. Signed-off-by: Ben Walker Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../driver-api/dmaengine/provider.rst | 19 ------------------- drivers/dma/dmatest.c | 11 +---------- drivers/dma/idxd/dma.c | 1 - include/linux/dmaengine.h | 2 -- 4 files changed, 1 insertion(+), 32 deletions(-) diff --git a/Documentation/driver-api/dmaengine/provider.rst b/Documentation/driver-api/dmaengine/provider.rst index 6d237717f287..31e140c353d3 100644 --- a/Documentation/driver-api/dmaengine/provider.rst +++ b/Documentation/driver-api/dmaengine/provider.rst @@ -262,22 +262,6 @@ Currently, the types available are: want to transfer a portion of uncompressed data directly to the display to print it -- DMA_COMPLETION_NO_ORDER - - - The device does not support in order completion. - - - The driver should return DMA_OUT_OF_ORDER for device_tx_status if - the device is setting this capability. - - - All cookie tracking and checking API should be treated as invalid if - the device exports this capability. - - - At this point, this is incompatible with polling option for dmatest. - - - If this cap is set, the user is recommended to provide an unique - identifier for each descriptor sent to the DMA device in order to - properly track the completion. - These various types will also affect how the source and destination addresses change over time. @@ -384,9 +368,6 @@ supported. - In the case of a cyclic transfer, it should only take into account the current period. - - Should return DMA_OUT_OF_ORDER if the device does not support in order - completion and is completing the operation out of order. - - This function can be called in an interrupt context. - device_config diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index e1d8f6e42f93..a184c8c50e20 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -820,10 +820,7 @@ static int dmatest_func(void *data) result("test timed out", total_tests, src->off, dst->off, len, 0); goto error_unmap_continue; - } else if (status != DMA_COMPLETE && - !(dma_has_cap(DMA_COMPLETION_NO_ORDER, - dev->cap_mask) && - status == DMA_OUT_OF_ORDER)) { + } else if (status != DMA_COMPLETE) { result(status == DMA_ERROR ? 
"completion error status" : "completion busy status", total_tests, src->off, @@ -1001,12 +998,6 @@ static int dmatest_add_channel(struct dmatest_info *info, dtc->chan = chan; INIT_LIST_HEAD(&dtc->threads); - if (dma_has_cap(DMA_COMPLETION_NO_ORDER, dma_dev->cap_mask) && - info->params.polled) { - info->params.polled = false; - pr_warn("DMA_COMPLETION_NO_ORDER, polled disabled\n"); - } - if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask)) { if (dmatest == 0) { cnt = dmatest_add_threads(info, dtc, DMA_MEMCPY); diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index fe85f6238091..cab6294371c1 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -275,7 +275,6 @@ int idxd_register_dma_device(struct idxd_device *idxd) dma->dev = dev; dma_cap_set(DMA_PRIVATE, dma->cap_mask); - dma_cap_set(DMA_COMPLETION_NO_ORDER, dma->cap_mask); dma->device_release = idxd_dma_release; if (idxd->hw.opcap.bits[0] & IDXD_OPCAP_MEMMOVE) { diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 1ea6e5e2d176..854bbcf383d7 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -39,7 +39,6 @@ enum dma_status { DMA_IN_PROGRESS, DMA_PAUSED, DMA_ERROR, - DMA_OUT_OF_ORDER, }; /** @@ -63,7 +62,6 @@ enum dma_transaction_type { DMA_SLAVE, DMA_CYCLIC, DMA_INTERLEAVE, - DMA_COMPLETION_NO_ORDER, /* last transaction type for creation of the capabilities mask */ DMA_TX_TYPE_END, }; -- Gitee From 12607f62f276cbf9a0301d0b291af6c016ada7b5 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 8 Jul 2022 15:40:01 +0800 Subject: [PATCH 1890/2161] dmaengine: Add DMA_MEMCPY_SG function commit 65af3f2329586dde23f7f669b84cf260eaa475f1 Intel-BKC. Bring back memcopy_sg for mem2mem. This was removed in commit c678fa6 Signed-off-by: Jiangbo Wu Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/dmaengine.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 854bbcf383d7..59caf248173f 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -783,6 +783,7 @@ struct dma_filter { * number of allocated descriptors * @device_free_chan_resources: release DMA channel's resources * @device_prep_dma_memcpy: prepares a memcpy operation + * @device_prep_dma_memcpy_sg: prepares a memcpy_sg operation * @device_prep_dma_xor: prepares a xor operation * @device_prep_dma_xor_val: prepares a xor validation operation * @device_prep_dma_pq: prepares a pq operation -- Gitee From 3dc0cee4b548f0d92968c4348243768a048b4421 Mon Sep 17 00:00:00 2001 From: Jiangbo Wu Date: Wed, 19 Jan 2022 16:29:58 -0700 Subject: [PATCH 1891/2161] dmaengine: idxd: Add batch descriptor support commit c9b30d5e5b33024ddf8d32705fa89ba4a6436510 Intel-BKC. The DSA device can support a group of descriptors for submission at once. The DSA spec calls this operation a 'batch' operation. The batch descriptor points to an array of descriptors where the beginning of the address list is 64-byte aligned. 
Co-developed-by: Konstantin Ananyev Signed-off-by: Jiangbo Wu Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 64 +++++++++++++++++++++++++++++++++++++-- drivers/dma/idxd/idxd.h | 18 +++++++++++ drivers/dma/idxd/submit.c | 1 + 3 files changed, 80 insertions(+), 3 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index de0f543bba2c..8fa2230eeda8 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -39,6 +39,59 @@ void idxd_mask_error_interrupts(struct idxd_device *idxd) iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET); } +static int alloc_desc_batch(struct idxd_wq *wq, struct idxd_desc *desc) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + struct idxd_batch *batch; + unsigned int size, cr_size, num; + + batch = kzalloc_node(sizeof(*batch), GFP_KERNEL, dev_to_node(dev)); + if (!batch) + return -ENOMEM; + + num = wq->max_batch_size; + size = num * sizeof(struct dsa_hw_desc); + batch->descs = dma_alloc_coherent(dev, size, &batch->dma_descs, GFP_KERNEL); + if (!batch->descs) + goto descs_err; + + cr_size = num * idxd->data->compl_size; + batch->crs = dma_alloc_coherent(dev, cr_size, &batch->dma_crs, GFP_KERNEL); + if (!batch->crs) + goto crs_err; + + desc->batch = batch; + + return 0; + +crs_err: + dma_free_coherent(dev, size, batch->descs, batch->dma_descs); +descs_err: + kfree(batch); + dev_warn(dev, "Unable to allocate memory, consider lowering max batch size.\n"); + return -ENOMEM; +} + +static void free_desc_batch(struct idxd_wq *wq, struct idxd_desc *desc) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + unsigned int size, cr_size, num; + struct idxd_batch *batch; + + batch = desc->batch; + if (!batch) + return; + + num = wq->max_batch_size; + size = num * sizeof(struct dsa_hw_desc); + cr_size = num * idxd->data->compl_size; + dma_free_coherent(dev, size, batch->descs, batch->dma_descs); + dma_free_coherent(dev, cr_size, batch->crs, batch->dma_crs); + kfree(batch); +} + static void free_hw_descs(struct idxd_wq *wq) { int i; @@ -76,8 +129,10 @@ static void free_descs(struct idxd_wq *wq) { int i; - for (i = 0; i < wq->num_descs; i++) + for (i = 0; i < wq->num_descs; i++) { + free_desc_batch(wq, wq->descs[i]); kfree(wq->descs[i]); + } kfree(wq->descs); } @@ -142,9 +197,12 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) struct idxd_desc *desc = wq->descs[i]; desc->hw = wq->hw_descs[i]; - if (idxd->data->type == IDXD_TYPE_DSA) + if (idxd->data->type == IDXD_TYPE_DSA) { desc->completion = &wq->compls[i]; - else if (idxd->data->type == IDXD_TYPE_IAX) + /* pre-allocate batch for descriptor */ + if (alloc_desc_batch(wq, desc)) + goto fail_sbitmap_init; + } else if (idxd->data->type == IDXD_TYPE_IAX) desc->iax_completion = &wq->iax_compls[i]; desc->compl_dma = wq->compls_addr + idxd->data->compl_size * i; desc->id = i; diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 026d454cadd3..d871c514dd58 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -333,6 +333,22 @@ struct idxd_device { struct idxd_pmu *idxd_pmu; }; +/** + * IDXD batch field for SW Batch descriptor + * @descs: Descriptor list address + * @dma_descs: DMA address for descs + * @crs: completion record list address + * @dma_crs: DMA address for completion records + * @num: Number of descs in batch + */ +struct idxd_batch { + struct dsa_hw_desc *descs; + dma_addr_t dma_descs; + struct dsa_completion_record 
*crs; + dma_addr_t dma_crs; + u32 num; +}; + /* IDXD software descriptor */ struct idxd_desc { union { @@ -352,6 +368,8 @@ struct idxd_desc { u16 gen; int cpu; struct idxd_wq *wq; + + struct idxd_batch *batch; }; /* diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 0536d3e74ad2..e9c494f7d5aa 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -16,6 +16,7 @@ static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu) desc = wq->descs[idx]; memset(desc->hw, 0, sizeof(struct dsa_hw_desc)); memset(desc->completion, 0, idxd->data->compl_size); + desc->batch->num = 0; desc->cpu = cpu; if (device_pasid_enabled(idxd)) -- Gitee From b977e04ce1a1681b0c3cc6d710f7e2050339c9e5 Mon Sep 17 00:00:00 2001 From: Jiangbo Wu Date: Fri, 7 Jan 2022 12:27:48 -0500 Subject: [PATCH 1892/2161] dmaengine: idxd: Add DMA_MEMCPY_SG transfer support commit ac144d03c262a134f664e72b6d6daf6e19d8a223 Intel-BKC. Add memcpy_sg support based on DSA batch descriptor. DSA batch descriptor has a 'batch' field that is a set of descriptors in the array. scatter list is filled up batch descriptor. Co-developed-by: Konstantin Ananyev Signed-off-by: Jiangbo Wu Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/dma.c | 118 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index cab6294371c1..a37721c82e0d 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -133,6 +133,119 @@ idxd_dma_submit_memcpy(struct dma_chan *c, dma_addr_t dma_dest, return &desc->txd; } +static inline int fetch_sg_and_pos(struct scatterlist **sg, size_t *remain, + unsigned int len) +{ + struct scatterlist *next = *sg; + int count = 0; + + *remain -= len; + + while (*remain == 0 && next && !sg_is_last(next)) { + next = sg_next(next); + *remain = sg_dma_len(next); + count++; + } + + *sg = next; + + return count; +} + +/* + * idxd_dma_prep_memcpy_sg - prepare descriptors for a memcpy_sg transcation + * + * @chan: DMA channel + * @dst_sg: Destination scatter list + * @dst_nents: Number of entries in destination scatter list + * @src_sg: Source scatter list + * @src_nents: Number of entries in source scatter list + * @flags: DMA transcation flags + * + * Return: Async transcation descriptor on success and NULL in failure. + * + * DSA batch descriptor and work queue depth can provide large memcpy + * operation. Combined batch descriptor with WQ depth to support scatter + * list. 
+ */ +static struct dma_async_tx_descriptor * +idxd_dma_prep_memcpy_sg(struct dma_chan *chan, + struct scatterlist *dst_sg, unsigned int dst_nents, + struct scatterlist *src_sg, unsigned int src_nents, + unsigned long flags) +{ + struct idxd_wq *wq = to_idxd_wq(chan); + struct idxd_desc *desc; + struct idxd_batch *batch; + dma_addr_t dma_dst, dma_src; + size_t dst_avail, src_avail, len; + u32 desc_flags; + int i; + + /* sanity check */ + if (unlikely(!dst_sg || !src_sg)) + return NULL; + if (unlikely(dst_nents == 0 || src_nents == 0)) + return NULL; + + if (min(dst_nents, src_nents) > wq->max_batch_size) + return NULL; + + dst_avail = sg_dma_len(dst_sg); + src_avail = sg_dma_len(src_sg); + + if (dst_nents == 1 && src_nents == 1) { + if (unlikely(dst_avail != src_avail)) + return NULL; + + return idxd_dma_submit_memcpy(chan, sg_dma_address(dst_sg), + sg_dma_address(src_sg), dst_avail, flags); + } + + desc = idxd_alloc_desc(wq, IDXD_OP_NONBLOCK); + if (IS_ERR(desc)) + return NULL; + + /* + * DSA Batch descriptor has a set of descriptors in array + * is called 'batch'. fill up 'batch' field with some + * descriptors of DSA_OPCODE_MEMMOVE until max_batch_size + * or scatter list is consumed. + */ + batch = desc->batch; + for (i = 0; i < wq->max_batch_size; i++) { + dma_dst = sg_dma_address(dst_sg) + sg_dma_len(dst_sg) - + dst_avail; + dma_src = sg_dma_address(src_sg) + sg_dma_len(src_sg) - + src_avail; + + len = min_t(size_t, dst_avail, src_avail); + len = min_t(size_t, len, wq->idxd->max_xfer_bytes); + + memset(batch->descs + i, 0, sizeof(struct dsa_hw_desc)); + idxd_prep_desc_common(wq, batch->descs + i, DSA_OPCODE_MEMMOVE, + dma_src, dma_dst, len, 0, IDXD_OP_FLAG_CC); + batch->num++; + + dst_nents -= fetch_sg_and_pos(&dst_sg, &dst_avail, len); + src_nents -= fetch_sg_and_pos(&src_sg, &src_avail, len); + + /* entries or src or dst consumed */ + if (!dst_nents || !src_nents || + !min_t(size_t, dst_avail, src_avail)) { + break; + } + } + + /* prepare DSA_OPCODE_BATCH */ + op_flag_setup(flags, &desc_flags); + idxd_prep_desc_common(wq, desc->hw, DSA_OPCODE_BATCH, + batch->dma_descs, 0, batch->num, + desc->compl_dma, desc_flags); + + return &desc->txd; +} + static int idxd_dma_alloc_chan_resources(struct dma_chan *chan) { struct idxd_wq *wq = to_idxd_wq(chan); @@ -282,6 +395,11 @@ int idxd_register_dma_device(struct idxd_device *idxd) dma->device_prep_dma_memcpy = idxd_dma_submit_memcpy; } + if (idxd->hw.opcap.bits[0] & IDXD_OPCAP_BATCH) { + dma_cap_set(DMA_MEMCPY_SG, dma->cap_mask); + dma->device_prep_dma_memcpy_sg = idxd_dma_prep_memcpy_sg; + } + dma->device_tx_status = idxd_dma_tx_status; dma->device_issue_pending = idxd_dma_issue_pending; dma->device_alloc_chan_resources = idxd_dma_alloc_chan_resources; -- Gitee From 8b02192071f28c511c827e03a10477a839eb1e9e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 10 Mar 2022 09:08:23 -0700 Subject: [PATCH 1893/2161] dmaengine: idxd: move wq pasid configuration until when needed commit 098159e3d405124cbc2020d93f529dff0eb7869e Intel-BKC. The main issue is that when we program wq with pasid of 0 but pasid_en set for shared wq, it confuses the VDCM. Remove the WQ pasid setting during configuration programming of the device as the driver does not have enough information at that point to make that decision. Program the WQCFG pasid and pasid_en at the time when it is needed. Add support in VDCM to understand what pasid_en=1 and pasid=0 means. 
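For readability, the enable-time rule that the hunk below adds to __drv_enable_wq() can be restated in condensed form. This is a restatement of the patch, not additional code; all helpers are the driver's own.

    /* kernel wq            : program pasid (dedicated -> idxd->pasid, shared -> 0),
     *                        pasid_en and the priv bit
     * user/mdev shared wq  : program pasid_en only, pasid stays 0
     * user/mdev dedicated  : leave pasid/pasid_en alone; programmed later when needed
     */
    if (is_idxd_wq_kernel(wq)) {
            if (device_pasid_enabled(idxd))
                    __idxd_wq_set_pasid_locked(wq, wq_dedicated(wq) ? idxd->pasid : 0);
            __idxd_wq_set_priv_locked(wq);
    } else if (device_user_pasid_enabled(idxd) && wq_shared(wq)) {
            __idxd_wq_set_pasid_locked(wq, 0);
    }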
Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 44 ++++++++++++++++++----------------- drivers/vfio/mdev/idxd/vdev.c | 11 +++++++-- 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 8fa2230eeda8..e38cefaf4a6e 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -443,6 +443,7 @@ static void __idxd_wq_set_priv_locked(struct idxd_wq *wq) spin_lock(&idxd->dev_lock); wqcfg.bits[WQCFG_PRIV_IDX] = ioread32(idxd->reg_base + offset); wqcfg.priv = 1; + wq->wqcfg->bits[WQCFG_PRIV_IDX] = wqcfg.bits[WQCFG_PRIV_IDX]; iowrite32(wqcfg.bits[WQCFG_PRIV_IDX], idxd->reg_base + offset); spin_unlock(&idxd->dev_lock); } @@ -458,6 +459,7 @@ static void __idxd_wq_set_pasid_locked(struct idxd_wq *wq, int pasid) wqcfg.bits[WQCFG_PASID_IDX] = ioread32(idxd->reg_base + offset); wqcfg.pasid_en = 1; wqcfg.pasid = pasid; + wq->wqcfg->bits[WQCFG_PASID_IDX] = wqcfg.bits[WQCFG_PASID_IDX]; iowrite32(wqcfg.bits[WQCFG_PASID_IDX], idxd->reg_base + offset); spin_unlock(&idxd->dev_lock); } @@ -987,7 +989,7 @@ static int idxd_wq_config_write(struct idxd_wq *wq) */ for (i = 0; i < WQCFG_STRIDES(idxd); i++) { wq_offset = WQCFG_OFFSET(idxd, wq->id, i); - wq->wqcfg->bits[i] = ioread32(idxd->reg_base + wq_offset); + wq->wqcfg->bits[i] |= ioread32(idxd->reg_base + wq_offset); } if (wq->size == 0 && wq->type != IDXD_WQT_NONE) @@ -1003,16 +1005,6 @@ static int idxd_wq_config_write(struct idxd_wq *wq) if (wq_dedicated(wq)) wq->wqcfg->mode = 1; - /* - * Enable this for shared WQ. swq does not need to program the pasid field, - * but pasid_en needs to be set. Programming here prevents swq being disabled - * and re-enabled, which is something to avoid. - */ - if ((is_idxd_wq_kernel(wq) && device_pasid_enabled(idxd)) || - ((is_idxd_wq_user(wq) || is_idxd_wq_mdev(wq)) && - device_user_pasid_enabled(idxd))) - wq->wqcfg->pasid_en = 1; - /* * Here the priv bit is set depending on the WQ type. priv = 1 if the * WQ type is kernel to indicate privileged access. This setting only @@ -1210,12 +1202,6 @@ static int idxd_wq_load_config(struct idxd_wq *wq) if (wq->wqcfg->bof) set_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags); - if (device_pasid_enabled(idxd) && wq->wqcfg->mode == 1) { - wq->wqcfg->pasid_en = 1; - wqcfg_offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PASID_IDX); - iowrite32(wq->wqcfg->bits[WQCFG_PASID_IDX], idxd->reg_base + wqcfg_offset); - } - if (wq->wqcfg->mode_support) set_bit(WQ_FLAG_MODE_1, &wq->flags); @@ -1471,11 +1457,27 @@ int __drv_enable_wq(struct idxd_wq *wq) } } + /* + * In the event that the WQ is configurable for pasid and priv bits. + * For kernel wq, the driver should setup the pasid, pasid_en, and priv bit. + * However, for non-kernel wq, the driver should only set the pasid_en bit for + * shared wq. A dedicated wq will configure pasid and pasid_en later on so + * there is no need to setup. + */ + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags) || + test_bit(WQ_FLAG_MODE_1, &wq->flags)) { + if (is_idxd_wq_kernel(wq)) { + if (device_pasid_enabled(idxd)) { + u32 pasid = wq_dedicated(wq) ? 
idxd->pasid : 0; + + __idxd_wq_set_pasid_locked(wq, pasid); + } + __idxd_wq_set_priv_locked(wq); + } else { + if (device_user_pasid_enabled(idxd) && wq_shared(wq)) + __idxd_wq_set_pasid_locked(wq, 0); + } - if (is_idxd_wq_kernel(wq)) { - if (device_pasid_enabled(idxd) && wq_dedicated(wq)) - __idxd_wq_set_pasid_locked(wq, idxd->pasid); - __idxd_wq_set_priv_locked(wq); } rc = 0; diff --git a/drivers/vfio/mdev/idxd/vdev.c b/drivers/vfio/mdev/idxd/vdev.c index af9e9ae7a64c..0090b4fca0bc 100644 --- a/drivers/vfio/mdev/idxd/vdev.c +++ b/drivers/vfio/mdev/idxd/vdev.c @@ -1191,8 +1191,15 @@ static void vidxd_wq_enable(struct vdcm_idxd *vidxd, int wq_id) priv = vwqcfg->priv; gpasid = vwqcfg->pasid; - rc = idxd_mdev_get_host_pasid(mdev, gpasid, &wq_pasid); - dev_dbg(dev, "guest pasid enabled, translate gpasid: %d\n", gpasid); + + if (gpasid == 0) { + rc = idxd_mdev_get_pasid(mdev, &wq_pasid); + dev_dbg(dev, "shared wq, pasid 0, use default host: %u\n", + wq_pasid); + } else { + rc = idxd_mdev_get_host_pasid(mdev, gpasid, &wq_pasid); + dev_dbg(dev, "guest pasid enabled, translate gpasid: %d\n", gpasid); + } } else { priv = 1; rc = idxd_mdev_get_pasid(mdev, &wq_pasid); -- Gitee From 4c31f83534750c7012b0c8c0e66bcb57c4da0731 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 8 Jul 2022 16:35:20 +0800 Subject: [PATCH 1894/2161] kvm/x86: Emulate the MOVDIR64B instruction on an EPT VMexit commit ec0b7da08df525ff4e4b3ebe612f76d3f66e10b7 Intel-BKC. Signed-off-by: Sanjay Kumar Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_emulate.h | 2 ++ arch/x86/kvm/emulate.c | 39 +++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 77cf6c11f66b..8292b0139f39 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -233,6 +233,7 @@ struct x86_emulate_ops { }; typedef u32 __attribute__((vector_size(16))) sse128_t; +typedef u32 __attribute__((vector_size(64))) sz512_t; /* Type, address-of, and value of an instruction's operand. */ struct operand { @@ -257,6 +258,7 @@ struct operand { u64 val64; char valptr[sizeof(sse128_t)]; sse128_t vec_val; + char valptr512[sizeof(sz512_t)]; u64 mm_val; void *data; }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 795be3d7f8f1..f6b7d3d8371c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3652,6 +3652,34 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_movdir64b(struct x86_emulate_ctxt *ctxt) +{ + u64 dest_va = ctxt->dst.val; + int rc = X86EMUL_CONTINUE; + + if (dest_va & 0x3f) { + printk("dest va is not 64 byte aligned\n"); + } + ctxt->op_bytes = 64; + ctxt->src.bytes = 64; + ctxt->dst.bytes = 64; + + ctxt->dst.type = OP_MEM; + ctxt->dst.addr.mem.ea = dest_va; + ctxt->dst.addr.mem.seg = ctxt->src.addr.mem.seg; + + rc = segmented_read(ctxt, ctxt->src.addr.mem, + ctxt->src.valptr512, ctxt->src.bytes); + if (rc != X86EMUL_CONTINUE) { + printk("MOVDIR64B: read src data failed rc %d\n", rc); + goto done; + } + + memcpy(ctxt->dst.valptr512, ctxt->src.valptr512, sizeof(ctxt->src.valptr512)); +done: + return rc; +} + #define FFL(x) __feature_bit(X86_FEATURE_##x) static int em_movbe(struct x86_emulate_ctxt *ctxt) @@ -4941,6 +4969,13 @@ static const struct gprefix three_byte_0f_38_f1 = { ID(0, &instr_dual_0f_38_f1), N, N, N }; +/* MOVDIR64B has alignment requirement on dest but not on source. 
+ * Not sure how to specify Unaligned only on the source addr + */ +static const struct gprefix three_byte_0f_38_f8 = { + N, I(DstReg | SrcMem | Mov | NoAccess | TwoMemOp | Unaligned, em_movdir64b), N, N +}; + /* * Insns below are selected by the prefix which indexed by the third opcode * byte. @@ -4954,7 +4989,7 @@ static const struct opcode opcode_map_0f_38[256] = { GP(EmulateOnUD | ModRM, &three_byte_0f_38_f0), GP(EmulateOnUD | ModRM, &three_byte_0f_38_f1), /* 0xf2 - 0xff */ - N, N, X4(N), X8(N) + N, N, X4(N), GP(ModRM, &three_byte_0f_38_f8), N, N, N, X4(N) }; #undef D @@ -5322,6 +5357,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) ctxt->opcode_len = 3; ctxt->b = insn_fetch(u8, ctxt); opcode = opcode_map_0f_38[ctxt->b]; + if (ctxt->b == 0xf8) + ctxt->op_bytes = 8; } } ctxt->d = opcode.flags; -- Gitee From 0d213d953b5f96fd8908fb5502295f95cbee78d8 Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Date: Thu, 22 Jul 2021 23:08:35 +0800 Subject: [PATCH 1895/2161] kvm/x86: This commit emulates ENQCMD and ENQCMDS on an EPT exit commit 2c090ef1c8a45b000ce93700a7a316006e1a9e5c Intel-BKC. It doesnt support PASID MSR and VDCM PASID translation table yet. Signed-off-by: Sanjay Kumar Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_emulate.h | 5 +- arch/x86/kvm/emulate.c | 95 +++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 42 ++++++++++--- include/linux/kvm_host.h | 4 ++ 4 files changed, 135 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 8292b0139f39..c2a5e44ae33f 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -166,7 +166,10 @@ struct x86_emulate_ops { int (*write_emulated)(struct x86_emulate_ctxt *ctxt, unsigned long addr, const void *val, unsigned int bytes, - struct x86_exception *fault); + struct x86_exception *fault, + bool non_posted); + + int (*np_write_complete)(struct x86_emulate_ctxt *ctxt, bool *retry); /* * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f6b7d3d8371c..d415435122b0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -172,6 +172,7 @@ #define NoMod ((u64)1 << 47) /* Mod field is ignored */ #define Intercept ((u64)1 << 48) /* Has valid intercept field */ #define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ +#define NonPostedWrite ((u64)1 << 50) /* Instruction does non-posted write */ #define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ #define NearBranch ((u64)1 << 52) /* Near branches */ #define No16 ((u64)1 << 53) /* No 16 bit operand */ @@ -1485,12 +1486,15 @@ static int segmented_write(struct x86_emulate_ctxt *ctxt, { int rc; ulong linear; + bool non_posted = false; rc = linearize(ctxt, addr, size, true, &linear); if (rc != X86EMUL_CONTINUE) return rc; + if (ctxt->d & NonPostedWrite) + non_posted = true; return ctxt->ops->write_emulated(ctxt, linear, data, size, - &ctxt->exception); + &ctxt->exception, non_posted); } static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt, @@ -3652,6 +3656,91 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_enqcmd(struct x86_emulate_ctxt *ctxt) +{ + u64 dest_va = ctxt->dst.val; + int rc = X86EMUL_CONTINUE; + bool retry = false; + + if (ctxt->ops->np_write_complete(ctxt, &retry)) { + /* Dont do writeback on the return path*/ + ctxt->d &= ~NoWrite; + if (retry) + 
ctxt->eflags |= X86_EFLAGS_ZF; + else + ctxt->eflags &= ~X86_EFLAGS_ZF; + + return rc; + } + + if (dest_va & 0x3f) { + printk("dest va is not 64 byte aligned\n"); + } + ctxt->op_bytes = 64; + ctxt->src.bytes = 64; + ctxt->dst.bytes = 64; + + ctxt->dst.type = OP_MEM; + ctxt->dst.addr.mem.ea = dest_va; + ctxt->dst.addr.mem.seg = ctxt->src.addr.mem.seg; + + rc = segmented_read(ctxt, ctxt->src.addr.mem, + ctxt->src.valptr512, ctxt->src.bytes); + if (rc != X86EMUL_CONTINUE) { + printk("ENQCMD: read src data failed rc %d\n", rc); + goto done; + } + + /* TODO: Get PASID from PASID MSR */ + + /* TODO: Translate PASID through VMCS PASID translation table */ + + memcpy(ctxt->dst.valptr512, ctxt->src.valptr512, sizeof(ctxt->src.valptr512)); +done: + return rc; +} + +static int em_enqcmds(struct x86_emulate_ctxt *ctxt) +{ + u64 dest_va = ctxt->dst.val; + int rc = X86EMUL_CONTINUE; + bool retry = false; + + if (ctxt->ops->np_write_complete(ctxt, &retry)) { + /* Dont do writeback on the return path*/ + ctxt->d &= ~NoWrite; + if (retry) + ctxt->eflags |= X86_EFLAGS_ZF; + else + ctxt->eflags &= ~X86_EFLAGS_ZF; + + return rc; + } + + if (dest_va & 0x3f) { + printk("dest va is not 64 byte aligned\n"); + } + ctxt->op_bytes = 64; + ctxt->src.bytes = 64; + ctxt->dst.bytes = 64; + + ctxt->dst.type = OP_MEM; + ctxt->dst.addr.mem.ea = dest_va; + ctxt->dst.addr.mem.seg = ctxt->src.addr.mem.seg; + + rc = segmented_read(ctxt, ctxt->src.addr.mem, + ctxt->src.valptr512, ctxt->src.bytes); + if (rc != X86EMUL_CONTINUE) { + printk("ENQCMDS: read src data failed rc %d\n", rc); + goto done; + } + + /* TODO: Translate PASID through VMCS PASID translation table */ + memcpy(ctxt->dst.valptr512, ctxt->src.valptr512, sizeof(ctxt->src.valptr512)); +done: + return rc; +} + static int em_movdir64b(struct x86_emulate_ctxt *ctxt) { u64 dest_va = ctxt->dst.val; @@ -4973,7 +5062,9 @@ static const struct gprefix three_byte_0f_38_f1 = { * Not sure how to specify Unaligned only on the source addr */ static const struct gprefix three_byte_0f_38_f8 = { - N, I(DstReg | SrcMem | Mov | NoAccess | TwoMemOp | Unaligned, em_movdir64b), N, N + N, I(DstReg | SrcMem | Mov | NoAccess | TwoMemOp | Unaligned, em_movdir64b), + I(DstReg | SrcMem | Mov | NoAccess | TwoMemOp | Unaligned | NonPostedWrite, em_enqcmd), + I(DstReg | SrcMem | Mov | NoAccess | TwoMemOp | Unaligned | NonPostedWrite, em_enqcmds) }; /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b67229ea8ba4..c7b21fb5c802 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5965,7 +5965,8 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val, unsigned int bytes, struct x86_exception *exception, - const struct read_write_emulator_ops *ops) + const struct read_write_emulator_ops *ops, + bool non_posted) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; @@ -6009,6 +6010,12 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len); vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; + + if (non_posted && ops->write) { + vcpu->mmio_is_write |= MMIO_NONPOSTED_WRITE; + vcpu->run->mmio.is_write = vcpu->mmio_is_write; + } + vcpu->run->exit_reason = KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = gpa; @@ -6022,17 +6029,31 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, struct x86_exception *exception) { return emulator_read_write(ctxt, addr, val, bytes, - exception, &read_emultor); + exception, &read_emultor, false); } static int 
emulator_write_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, const void *val, unsigned int bytes, - struct x86_exception *exception) + struct x86_exception *exception, + bool non_posted) { return emulator_read_write(ctxt, addr, (void *)val, bytes, - exception, &write_emultor); + exception, &write_emultor, non_posted); +} + +static int emulator_np_write_complete(struct x86_emulate_ctxt *ctxt, bool *retry) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + if (vcpu->mmio_nonposted_write_completed) { + vcpu->mmio_nonposted_write_completed = 0; + *retry = !!(vcpu->run->mmio.is_write & MMIO_NONPOSTED_DEFERRED); + return 1; + } + + return 0; } #define CMPXCHG_TYPE(t, ptr, old, new) \ @@ -6105,7 +6126,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, emul_write: printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); - return emulator_write_emulated(ctxt, addr, new, bytes, exception); + return emulator_write_emulated(ctxt, addr, new, bytes, exception, false); } static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) @@ -6507,6 +6528,7 @@ static const struct x86_emulate_ops emulate_ops = { .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, + .np_write_complete = emulator_np_write_complete, .cmpxchg_emulated = emulator_cmpxchg_emulated, .invlpg = emulator_invlpg, .pio_in_emulated = emulator_pio_in_emulated, @@ -7733,7 +7755,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->patch_hypercall(vcpu, instruction); return emulator_write_emulated(ctxt, rip, instruction, 3, - &ctxt->exception); + &ctxt->exception, false); } static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) @@ -8721,9 +8743,13 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) vcpu->mmio_needed = 0; /* FIXME: return into emulator if single-stepping. */ - if (vcpu->mmio_is_write) + if (vcpu->mmio_is_write == MMIO_WRITE) return 1; - vcpu->mmio_read_completed = 1; + if (vcpu->mmio_is_write == MMIO_NONPOSTED_WRITE) + vcpu->mmio_nonposted_write_completed = 1; + else + vcpu->mmio_read_completed = 1; + return complete_emulated_io(vcpu); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 79b4dbb3e7fc..281b64286e0b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -289,6 +289,10 @@ struct kvm_vcpu { #ifdef CONFIG_HAS_IOMEM int mmio_needed; int mmio_read_completed; + int mmio_nonposted_write_completed; +#define MMIO_WRITE 1 +#define MMIO_NONPOSTED_WRITE 3 +#define MMIO_NONPOSTED_DEFERRED 4 int mmio_is_write; int mmio_cur_fragment; int mmio_nr_fragments; -- Gitee From 47796c108ef4dcaa66ede7661f05b353bfa09c7c Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Thu, 17 Feb 2022 18:27:40 +0800 Subject: [PATCH 1896/2161] kvm: Support 512bit operand for emulated instruction commit 866b410f2ada5585fada1733dbbabd3ef7db6d26 Intel-BKC. 
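On the userspace side, the visible effect is that a single KVM_EXIT_MMIO exit can now carry up to 64 bytes of data, enough for one MOVDIR64B/ENQCMD payload. A hypothetical VMM-side handler is sketched below, using only the struct kvm_run fields touched by this patch; vmm_mmio_write()/vmm_mmio_read() are placeholders for the VMM's own device model, not existing APIs.

    #include <linux/kvm.h>

    /* placeholders, not real APIs */
    void vmm_mmio_write(__u64 gpa, const __u8 *data, __u32 len);
    void vmm_mmio_read(__u64 gpa, __u8 *data, __u32 len);

    static void handle_mmio_exit(struct kvm_run *run)
    {
            /* run->mmio.len may now be anything up to 64 bytes */
            if (run->mmio.is_write)
                    vmm_mmio_write(run->mmio.phys_addr, run->mmio.data, run->mmio.len);
            else
                    vmm_mmio_read(run->mmio.phys_addr, run->mmio.data, run->mmio.len);
    }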
Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 14 +++++++------- include/uapi/linux/kvm.h | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c7b21fb5c802..7a3599b35b07 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5507,7 +5507,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, int n; do { - n = min(len, 8); + n = min(len, 64); if (!(lapic_in_kernel(vcpu) && !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v)) && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v)) @@ -5527,7 +5527,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) int n; do { - n = min(len, 8); + n = min(len, 64); if (!(lapic_in_kernel(vcpu) && !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev, addr, n, v)) @@ -5891,7 +5891,7 @@ static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, { struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; - memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len)); + memcpy(vcpu->run->mmio.data, frag->data, min(64u, frag->len)); return X86EMUL_CONTINUE; } @@ -6008,7 +6008,7 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, vcpu->mmio_needed = 1; vcpu->mmio_cur_fragment = 0; - vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len); + vcpu->run->mmio.len = min(64u, vcpu->mmio_fragments[0].len); vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; if (non_posted && ops->write) { @@ -8724,7 +8724,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) /* Complete previous fragment */ frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment]; - len = min(8u, frag->len); + len = min(64u, frag->len); if (!vcpu->mmio_is_write) memcpy(frag->data, run->mmio.data, len); @@ -8756,8 +8756,8 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) run->exit_reason = KVM_EXIT_MMIO; run->mmio.phys_addr = frag->gpa; if (vcpu->mmio_is_write) - memcpy(run->mmio.data, frag->data, min(8u, frag->len)); - run->mmio.len = min(8u, frag->len); + memcpy(run->mmio.data, frag->data, min(64u, frag->len)); + run->mmio.len = min(64u, frag->len); run->mmio.is_write = vcpu->mmio_is_write; vcpu->arch.complete_userspace_io = complete_emulated_mmio; return 0; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 81caac385bce..3afc10fe4864 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -301,7 +301,7 @@ struct kvm_run { /* KVM_EXIT_MMIO */ struct { __u64 phys_addr; - __u8 data[8]; + __u8 data[64]; __u32 len; __u8 is_write; } mmio; @@ -437,7 +437,7 @@ struct kvm_coalesced_mmio { __u32 pad; __u32 pio; }; - __u8 data[8]; + __u8 data[64]; }; struct kvm_coalesced_mmio_ring { -- Gitee From 86b74e5d6668549bc8736ca45f60c2bfef478987 Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Tue, 15 Mar 2022 22:04:40 +0800 Subject: [PATCH 1897/2161] iommu/vtd: Fix bug introduced by porting codes commit 7f624662dcf69cfd790def75289894f4a5400093 Intel-BKC. Unlock is missed when porting codes, we can observe kernel hang issue. 
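The hang came from an error path that returned while still holding device_domain_lock. The fix restores the usual single-exit shape, sketched here in heavily simplified form (the function body is elided; only the lock/unlock pairing matters):

    static int ex_set_hwdbm_shape(void)
    {
            unsigned long flags;
            int ret = 0;

            spin_lock_irqsave(&device_domain_lock, flags);

            /* ... per-device work; error paths set ret and jump to "out" ... */
            if (ret)
                    goto out;

            /* ... more work under the lock ... */
    out:
            spin_unlock_irqrestore(&device_domain_lock, flags); /* the unlock this patch adds */
            return ret;
    }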
Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2e04da17c624..4e7553acd2c8 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -6039,6 +6039,8 @@ static int intel_iommu_set_hwdbm(struct iommu_domain *domain, bool enable, } out: + spin_unlock_irqrestore(&device_domain_lock, flags); + return ret; } -- Gitee From 95d38d03eed38191ca6aa2f6d15a1ba734068530 Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Mon, 21 Mar 2022 22:25:34 +0800 Subject: [PATCH 1898/2161] idxd: Fix kernel bug found during DSA DWQ live migration commit a1a86b18b59dd62c56fe597d2a7fe97f29818acc Intel-BKC. Found below kernel bug when doing DSA DWQ live migration [ 940.265678] BUG: kernel NULL pointer dereference, address: 0000000000000000 ... [ 940.319892] RIP: 0010:vidxd_portal_mmio_write.cold.12+0x53/0x216 [idxd_mdev] The reason is that some critical codes missed during porting codes from 5.12 to 5.15. Fix it. Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/init.c | 4 ++++ drivers/vfio/mdev/idxd/mdev.c | 4 +++- drivers/vfio/mdev/idxd/vdev.c | 4 +++- 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index d871c514dd58..7593f4103f15 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -283,6 +283,7 @@ struct idxd_device { struct pci_dev *pdev; void __iomem *reg_base; + void __iomem *portal_base; spinlock_t dev_lock; /* spinlock for device */ spinlock_t cmd_lock; /* spinlock for device commands */ diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 82f6e272ae61..017136aa165f 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -609,6 +609,10 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err_iomap; } + idxd->portal_base = pcim_iomap(pdev, IDXD_WQ_BAR, 0); + if (!idxd->portal_base) + return -ENOMEM; + dev_dbg(dev, "Set DMA masks\n"); rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); if (rc) diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c index 39ffca5c4a38..380925905809 100644 --- a/drivers/vfio/mdev/idxd/mdev.c +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -587,7 +587,9 @@ static int vidxd_resubmit_pending_descs (struct vdcm_idxd *vidxd, memcpy((u8 *)&el, data_ptr + *offset, sizeof(el)); *offset += sizeof(el); - portal = wq->portal; + portal = vidxd->idxd->portal_base + + idxd_get_wq_portal_full_offset(wq->id, + el.portal_prot, IDXD_IRQ_IMS); portal += (el.portal_id << 6); pr_info("submitting a desc to WQ %d:%d ded %d\n", diff --git a/drivers/vfio/mdev/idxd/vdev.c b/drivers/vfio/mdev/idxd/vdev.c index 0090b4fca0bc..9a554a8f9501 100644 --- a/drivers/vfio/mdev/idxd/vdev.c +++ b/drivers/vfio/mdev/idxd/vdev.c @@ -331,7 +331,9 @@ int vidxd_portal_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, } } else { void __iomem *wq_portal; - portal = wq->portal; + wq_portal = vidxd->idxd->portal_base + + idxd_get_wq_portal_full_offset(wq->id, + portal_prot, IDXD_IRQ_IMS); wq_portal += (portal_id << 6); printk("submitting a desc to WQ %d ded %d\n", wq->id, wq_dedicated(wq)); -- Gitee From 0df58cc4ad582897de6e0a9bca5e0ecd5c65ecfb Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Mon, 21 Mar 2022 23:30:03 +0800 Subject: [PATCH 1899/2161] idxd: Work around SWQ retry issue commit 
ad14b7e05f19a689a81f58f38ea4d83678afe966 Intel-BKC. SWQ may be shared to multiple uses so that the queue may be full quickly. This will block other requests. To avoid this, we design to send retry back to guest when migrating the mdev. But something wrong in the retry flow so that guest cannot correctly send retry out. To unblock the test, make this work around to queue the desc as DWQ. When final solution found, may revert this patch. Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/vdev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/vfio/mdev/idxd/vdev.c b/drivers/vfio/mdev/idxd/vdev.c index 9a554a8f9501..7f3a0e092684 100644 --- a/drivers/vfio/mdev/idxd/vdev.c +++ b/drivers/vfio/mdev/idxd/vdev.c @@ -302,7 +302,9 @@ int vidxd_portal_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, mutex_lock(&vidxd->mig_submit_lock); if (vidxd->paused) { +#if 0 if (wq_dedicated(wq)) { +#endif /* Queue the descriptor if submitted to DWQ */ if (vwq->ndescs == wq->size) { printk("can't submit more descriptors than WQ size. Dropping.\n"); @@ -324,11 +326,13 @@ int vidxd_portal_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, list_add_tail(&elem->link, &vwq->head); vwq->ndescs++; +#if 0 } else { /* Return retry if submitted to SWQ */ rc = -EAGAIN; goto out_unlock; } +#endif } else { void __iomem *wq_portal; wq_portal = vidxd->idxd->portal_base + -- Gitee From a19e9b22f0e377df0c0bd181ff184bd7b8d81d13 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 21 Mar 2022 08:47:54 -0700 Subject: [PATCH 1900/2161] dmaengine: idxd: remove trailing white space on input str for driver_name commit 81f5eb2b11ba748ee7e393e2faf32af773ae332d upstream. Add string processing with strim() in order to remove trailing white spaces that may be input by user. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 5e4257b15f55..d117ed669ab9 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1060,6 +1060,7 @@ static ssize_t wq_driver_name_store(struct device *dev, struct device_attribute const char *buf, size_t count) { struct idxd_wq *wq = confdev_to_wq(dev); + char *input, *pos; if (wq->state != IDXD_WQ_DISABLED) return -EPERM; @@ -1067,9 +1068,14 @@ static ssize_t wq_driver_name_store(struct device *dev, struct device_attribute if (strlen(buf) > WQ_NAME_SIZE || strlen(buf) == 0) return -EINVAL; + input = kstrndup(buf, count, GFP_KERNEL); + if (!input) + return -ENOMEM; + + pos = strim(input); memset(wq->driver_name, 0, WQ_NAME_SIZE + 1); - strncpy(wq->driver_name, buf, WQ_NAME_SIZE); - strreplace(wq->name, '\n', '\0'); + sprintf(wq->driver_name, "%s", pos); + kfree(input); return count; } -- Gitee From a5a8c68ba1a36c412f07f1e154f778c978d3c87c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 21 Mar 2022 09:07:59 -0700 Subject: [PATCH 1901/2161] dmaengine: idxd: remove trailing white space on input str for wq name commit 81f5eb2b11ba748ee7e393e2faf32af773ae332d upstream. Add string processing with strim() in order to remove trailing white spaces that may be input by user. 
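A note on the strim() idiom used in this patch and the next: strim() NUL-terminates the trailing whitespace in place and returns a pointer past any leading whitespace, so the duplicated buffer (not the returned pointer) is what must eventually be freed. A minimal sketch of the pattern, with the destination buffer and function name made up for illustration:

    #include <linux/slab.h>
    #include <linux/string.h>

    static int ex_store_trimmed(char *dst, size_t dst_size,
                                const char *buf, size_t count)
    {
            char *input, *pos;

            input = kstrndup(buf, count, GFP_KERNEL);
            if (!input)
                    return -ENOMEM;

            pos = strim(input);             /* "  my-wq \n" -> "my-wq" */
            if (strlen(pos) >= dst_size) {
                    kfree(input);
                    return -EINVAL;
            }
            strscpy(dst, pos, dst_size);
            kfree(input);                   /* free the allocation, not pos */
            return 0;
    }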
Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/sysfs.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index d117ed669ab9..2be5fdfa22c2 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -839,6 +839,7 @@ static ssize_t wq_name_store(struct device *dev, size_t count) { struct idxd_wq *wq = confdev_to_wq(dev); + char *input, *pos; if (wq->state != IDXD_WQ_DISABLED) return -EPERM; @@ -846,9 +847,14 @@ static ssize_t wq_name_store(struct device *dev, if (strlen(buf) > WQ_NAME_SIZE || strlen(buf) == 0) return -EINVAL; + input = kstrndup(buf, count, GFP_KERNEL); + if (!input) + return -ENOMEM; + + pos = strim(input); memset(wq->name, 0, WQ_NAME_SIZE + 1); - strncpy(wq->name, buf, WQ_NAME_SIZE); - strreplace(wq->name, '\n', '\0'); + sprintf(wq->name, "%s", pos); + kfree(input); return count; } -- Gitee From c63bee89a9afdfb5149e4cbe69e0e366a6b1b3fc Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 22 Mar 2022 10:31:30 -0700 Subject: [PATCH 1902/2161] dmaengine: idxd: clear driver_name on wq reset commit e92cef3c5e0c92e3c8f9d873153e10527f138a44 Intel-BKC. Cleanup driver_name when wq gets disabled. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index e38cefaf4a6e..bed08e428072 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -523,6 +523,7 @@ static void idxd_wq_disable_cleanup(struct idxd_wq *wq) clear_bit(WQ_FLAG_DEDICATED, &wq->flags); clear_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags); memset(wq->name, 0, WQ_NAME_SIZE); + memset(wq->driver_name, 0, WQ_NAME_SIZE); wq->max_xfer_bytes = WQ_DEFAULT_MAX_XFER; wq->max_batch_size = WQ_DEFAULT_MAX_BATCH; } -- Gitee From b2d651be713d3ef22ad4d552f1b8f9b174155f64 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 22 Mar 2022 17:09:19 -0700 Subject: [PATCH 1903/2161] dmanegine: idxd: check batch before init in __get_desc commit f4735eec028420bcd5ff3e0f0b955aa84c8f502c Intel-BKC. IAX does not have batch support. So don't attempt to init the batch structure during descriptor allocation. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/submit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index e9c494f7d5aa..64d0b17cfc28 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -16,7 +16,8 @@ static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu) desc = wq->descs[idx]; memset(desc->hw, 0, sizeof(struct dsa_hw_desc)); memset(desc->completion, 0, idxd->data->compl_size); - desc->batch->num = 0; + if (desc->batch) + desc->batch->num = 0; desc->cpu = cpu; if (device_pasid_enabled(idxd)) -- Gitee From f1bf5d4956773f7e59c935f9f480a96a1080eaf3 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 23 Mar 2022 16:30:01 -0700 Subject: [PATCH 1904/2161] vfio: mdev: idxd: fix vidxd_resume_wq_state() unint var commit eecc072b71b0d166b085d2235b1c9c369c7957ab Intel-BKC. 
Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/mdev.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c index 380925905809..67f2d5bf9959 100644 --- a/drivers/vfio/mdev/idxd/mdev.c +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -434,8 +434,6 @@ static int vidxd_resume_wq_state(struct vdcm_idxd *vidxd) } if (wq_pasid >= 0) { - u32 status; - wqcfg->bits[WQCFG_PASID_IDX] &= ~GENMASK(29, 8); wqcfg->priv = priv; @@ -449,8 +447,8 @@ static int vidxd_resume_wq_state(struct vdcm_idxd *vidxd) idxd_wq_setup_priv(wq, priv); spin_unlock_irqrestore(&idxd->dev_lock, flags); - idxd_wq_enable(wq, &rc); - if (status) { + rc = idxd_wq_enable(wq, NULL); + if (rc) { dev_err(dev, "resume wq failed\n"); break;; } @@ -462,7 +460,7 @@ static int vidxd_resume_wq_state(struct vdcm_idxd *vidxd) spin_lock_irqsave(&idxd->dev_lock, flags); idxd_wq_setup_pasid(wq, 0); spin_unlock_irqrestore(&idxd->dev_lock, flags); - idxd_wq_enable(wq, &rc); + rc = idxd_wq_enable(wq, NULL); if (rc) { dev_err(dev, "resume wq %d failed\n", wq->id); -- Gitee From ad9fd02c3d8a60b0c43167a21623d06e18eb8dd7 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 23 Mar 2022 17:18:06 -0700 Subject: [PATCH 1905/2161] crypto: iax: remove double free on iax_device commit 58417352c933b8d8a2ac724d797c2d23653fa52e Intel-BKC. In add_iax_devices(), when iax_aecs_alloc() fails, it already calls iax_device_free() internally. So there's no need to call iax_device_free() in the failure path. Cc: Tom Zanussi Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/crypto/iax/iax_crypto_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/crypto/iax/iax_crypto_main.c b/drivers/crypto/iax/iax_crypto_main.c index ffe0655a1286..313a175059d7 100644 --- a/drivers/crypto/iax/iax_crypto_main.c +++ b/drivers/crypto/iax/iax_crypto_main.c @@ -358,10 +358,8 @@ static struct iax_device *add_iax_device(struct idxd_device *idxd) iax_device->idxd = idxd; - if (iax_aecs_alloc(iax_device) < 0) { - iax_device_free(iax_device); + if (iax_aecs_alloc(iax_device) < 0) return NULL; - } list_add_tail(&iax_device->list, &iax_devices); -- Gitee From 0fe5e031a8152304d3ec88b6893efef86728b07f Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 23 Mar 2022 17:23:24 -0700 Subject: [PATCH 1906/2161] crypto: iax: fix usage of uninit ptr in iax_compress() call commit 88e12aed6bfbcab02ca1710c7036327e540fa23b Intel-BKC. 
Cc: Tom Zanussi Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/crypto/iax/iax_crypto_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/iax/iax_crypto_main.c b/drivers/crypto/iax/iax_crypto_main.c index 313a175059d7..f07151d6d142 100644 --- a/drivers/crypto/iax/iax_crypto_main.c +++ b/drivers/crypto/iax/iax_crypto_main.c @@ -695,9 +695,10 @@ static int iax_compress(struct crypto_tfm *tfm, idxd_desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK); if (IS_ERR(idxd_desc)) { pr_err("%s: idxd descriptor allocation failed\n", __func__); - pr_warn("%s: iax compress (verify) failed: ret=%ld\n", __func__, PTR_ERR(desc)); + pr_warn("%s: iax compress (verify) failed: ret=%ld\n", __func__, + PTR_ERR(idxd_desc)); - return PTR_ERR(desc); + return PTR_ERR(idxd_desc); } desc = idxd_desc->iax_hw; -- Gitee From 0caa1f71edb5adb54586e120f9cecd04b309d9d3 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 23 Mar 2022 17:39:43 -0700 Subject: [PATCH 1907/2161] vfio: mdev: idxd: fix uninit var usage in vidxd_resume_wq_state() commit 07e68bf383f345eacc900d57d3832cc7b05b14f1 Intel-BKC. wq_pasid should be filled out when passed by reference to called functions. However, to be extra paranoid, set it to -1 as init value. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/mdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c index 67f2d5bf9959..3bf46fef7c7f 100644 --- a/drivers/vfio/mdev/idxd/mdev.c +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -420,7 +420,7 @@ static int vidxd_resume_wq_state(struct vdcm_idxd *vidxd) /* If dedicated WQ and PASID is not enabled, program * the default PASID in the WQ PASID register */ if (wq_dedicated(wq) && vwqcfg->mode_support) { - int wq_pasid, gpasid = -1; + int wq_pasid = -1, gpasid = -1; if (vwqcfg->pasid_en) { gpasid = vwqcfg->pasid; -- Gitee From 6c1dc727d3e734ea78da37f17a719fa72b13975c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 23 Mar 2022 17:43:02 -0700 Subject: [PATCH 1908/2161] vfio: mdev: idxd: fix uninit var in vidxd_wq_reset() commit b9155fefe8fea8479277fbfab2a9f49201689f5e Intel-BKC. status variable uninitialized before being used. idxd_wq_abort() takes status by reference and may set it. But don't count on it. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/vdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/mdev/idxd/vdev.c b/drivers/vfio/mdev/idxd/vdev.c index 7f3a0e092684..7195cc5d4095 100644 --- a/drivers/vfio/mdev/idxd/vdev.c +++ b/drivers/vfio/mdev/idxd/vdev.c @@ -1008,7 +1008,7 @@ static void vidxd_wq_reset(struct vdcm_idxd *vidxd, int wq_id_mask) union wqcfg *wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET); struct mdev_device *mdev = vidxd->ivdev.mdev; struct device *dev = mdev_dev(mdev); - u32 status; + u32 status = 0; wq = vidxd->wq; dev_dbg(dev, "vidxd reset wq %u:%u\n", 0, wq->id); -- Gitee From 4e8d45642cf5a6adc4cb68dcd141a74a6fa057a5 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 23 Mar 2022 17:46:18 -0700 Subject: [PATCH 1909/2161] vfio: mdev: idxd: fix uninit var usage in vidxd_wq_abort() commit 07e68bf383f345eacc900d57d3832cc7b05b14f1 Intel-BKC. Init status variable in case the called functions that passes it by reference does not set it. 
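To illustrate the pattern behind these uninitialized-status fixes, a small standalone C sketch with hypothetical names (not the driver's code): when a callee can return early without writing through its out-parameter, the caller's local must be initialized before it is tested.

#include <stdio.h>

/* Callee may bail out before ever writing *status. */
static int hypothetical_abort(int enabled, unsigned int *status)
{
        if (!enabled)
                return -1;      /* early return: *status untouched */
        *status = 0x5a;         /* pretend hardware status */
        return 0;
}

int main(void)
{
        unsigned int status = 0;        /* the defensive init these patches add */

        hypothetical_abort(0, &status);
        if (status)                     /* without the init, this would read garbage */
                printf("status %#x\n", status);
        else
                printf("no status reported\n");
        return 0;
}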
Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/vdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/mdev/idxd/vdev.c b/drivers/vfio/mdev/idxd/vdev.c index 7195cc5d4095..2f7441741776 100644 --- a/drivers/vfio/mdev/idxd/vdev.c +++ b/drivers/vfio/mdev/idxd/vdev.c @@ -954,7 +954,7 @@ static void vidxd_wq_abort(struct vdcm_idxd *vidxd, int val) u8 *bar0 = vidxd->bar0; union wqcfg *wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET); struct idxd_wq *wq = vidxd->wq; - u32 status; + u32 status = 0; dev_dbg(dev, "%s\n", __func__); if (wqcfg->wq_state != IDXD_WQ_DEV_ENABLED) { -- Gitee From cdb0c4edd60c4b46e7a01c11d04b8d8afa411881 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 23 Mar 2022 17:48:10 -0700 Subject: [PATCH 1910/2161] crypto: iax: fix incorrect desc ptr being used for error out commit df094bb3096494995136d81dedda77e43d84160c Intel-BKC. Error output using the wrong ptr for error status. Change to idxd_desc which is the ptr should be checked. Cc: Tom Zanussi Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/crypto/iax/iax_crypto_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/iax/iax_crypto_main.c b/drivers/crypto/iax/iax_crypto_main.c index f07151d6d142..45d2a3c6f54e 100644 --- a/drivers/crypto/iax/iax_crypto_main.c +++ b/drivers/crypto/iax/iax_crypto_main.c @@ -796,9 +796,9 @@ static int iax_decompress(struct crypto_tfm *tfm, idxd_desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK); if (IS_ERR(idxd_desc)) { pr_err("%s: idxd descriptor allocation failed\n", __func__); - pr_warn("%s: iax decompress failed: ret=%ld\n", __func__, PTR_ERR(desc)); + pr_warn("%s: iax decompress failed: ret=%ld\n", __func__, PTR_ERR(idxd_desc)); - return PTR_ERR(desc); + return PTR_ERR(idxd_desc); } desc = idxd_desc->iax_hw; -- Gitee From 7263cac15bb68b0878422592d4cf0568daa56c78 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 24 Mar 2022 16:46:31 -0700 Subject: [PATCH 1911/2161] vfio: mdev: idxd: check return value for iommu_unregister_device_fault_handler() commit a7d35e8ca358e40c1b70e6798ee4e5c2ce8d0da3 Intel-BKC. 
Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/mdev_host.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/mdev/idxd/mdev_host.c b/drivers/vfio/mdev/idxd/mdev_host.c index b05858e22759..ef94f5994023 100644 --- a/drivers/vfio/mdev/idxd/mdev_host.c +++ b/drivers/vfio/mdev/idxd/mdev_host.c @@ -66,7 +66,7 @@ void idxd_mdev_host_release(struct kref *kref) struct idxd_device *idxd = container_of(kref, struct idxd_device, mdev_kref); struct device *dev = &idxd->pdev->dev; struct vfio_pci_core_device *vfio_pdev = &idxd->vfio_pdev; - int i; + int i, rc; if (!idxd->mdev_host_init) return; @@ -78,7 +78,10 @@ void idxd_mdev_host_release(struct kref *kref) vfio_pdev->num_regions = 0; kfree(vfio_pdev->region); vfio_pdev->region = NULL; - iommu_unregister_device_fault_handler(&idxd->pdev->dev); + rc = iommu_unregister_device_fault_handler(&idxd->pdev->dev); + if (rc) + dev_warn(dev, "iommu_unregister_device_fault_handler() failed\n"); + for (i = 0; i < vfio_pdev->num_ext_irqs; i++) vfio_pci_set_ext_irq_trigger(vfio_pdev, VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, -- Gitee From 3263257e4ea8cdf879bc0664ec2cfd3beb211e0e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 24 Mar 2022 16:49:07 -0700 Subject: [PATCH 1912/2161] vfio: mdev: idxd: make wq_pasid to be an int commit 10fd5439913a155d06067d9518ac13e47bf212cc Intel-BKC. wq_pasid is set to u32 and a compare later compares against >= 0, which always evaluate to true. Make wq_pasid an int and set default value to -1. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/vdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/mdev/idxd/vdev.c b/drivers/vfio/mdev/idxd/vdev.c index 2f7441741776..868a71f6355d 100644 --- a/drivers/vfio/mdev/idxd/vdev.c +++ b/drivers/vfio/mdev/idxd/vdev.c @@ -1189,7 +1189,7 @@ static void vidxd_wq_enable(struct vdcm_idxd *vidxd, int wq_id) wq_pasid_enable = vwqcfg->pasid_en; if (wq_dedicated(wq)) { - u32 wq_pasid = ~0U; + int wq_pasid = -1; bool priv; if (wq_pasid_enable) { -- Gitee From 351d5fe9b1027b21e427b11c24219084ce9ace8c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 24 Mar 2022 16:53:18 -0700 Subject: [PATCH 1913/2161] vfio: mdev: idxd: fix potential null dereference commit 6ab8ad7874107834950c2567d05491b31f16f6d2 Intel-BKC. Add check to data ptr for idxd_vdcm_set_ctx_trigger_single() before dereference the ptr. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/mdev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c index 3bf46fef7c7f..8ed855188554 100644 --- a/drivers/vfio/mdev/idxd/mdev.c +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -1787,6 +1787,9 @@ static int idxd_vdcm_set_ctx_trigger_single(struct eventfd_ctx **ctx, if (!count) return -EINVAL; + if (!data) + return -EINVAL; + trigger = *(u8 *)data; if (trigger && *ctx) eventfd_signal(*ctx, 1); -- Gitee From deb02d123edb3ac07f46a6ed274709a9096b43ef Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 24 Mar 2022 17:18:40 -0700 Subject: [PATCH 1914/2161] vfio: mdev: idxd: check idxd_wq_enable() return value commit 4943953b3b684933a48d6b6819ccf0308189f1c4 Intel-BKC. Check the return value of idxd_wq_enable() in vidxd_wq_enable(). 
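The wq_pasid change above hinges on a C type rule worth spelling out: an unsigned value compared against >= 0 is always true, so an "invalid" marker needs a signed type. A standalone sketch, independent of the driver:

#include <stdio.h>

int main(void)
{
        unsigned int upasid = ~0U;      /* "invalid" marker in an unsigned type */
        int pasid = -1;                 /* the same marker as a signed int */

        /* Always prints: an unsigned value can never be below zero. */
        if (upasid >= 0)
                printf("unsigned check is always true\n");

        /* Only true for genuinely valid values. */
        if (pasid >= 0)
                printf("signed check would take the enable path\n");
        else
                printf("signed check correctly skips it\n");
        return 0;
}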
Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/vdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/mdev/idxd/vdev.c b/drivers/vfio/mdev/idxd/vdev.c index 868a71f6355d..db501a0c8b2c 100644 --- a/drivers/vfio/mdev/idxd/vdev.c +++ b/drivers/vfio/mdev/idxd/vdev.c @@ -1231,8 +1231,8 @@ static void vidxd_wq_enable(struct vdcm_idxd *vidxd, int wq_id) idxd_wq_setup_pasid(wq, wq_pasid); idxd_wq_setup_priv(wq, priv); spin_unlock_irqrestore(&idxd->dev_lock, flags); - idxd_wq_enable(wq, &status); - if (status) { + rc = idxd_wq_enable(wq, &status); + if (rc < 0 || status) { dev_err(dev, "vidxd enable wq %d failed\n", wq->id); idxd_complete_command(vidxd, status); return; -- Gitee From 245af5e3527300891c0ef2af34519cb60793829d Mon Sep 17 00:00:00 2001 From: Jiangbo Wu Date: Sat, 19 Mar 2022 04:35:43 +0800 Subject: [PATCH 1915/2161] IDXD: claim device to support HugePage/THP segment size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit af9a712e3657f6af8c282d204f19d01e405dd106 Intel-BKC. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using IDXD to copy HugePage/THP pages, if DMA_API_DEBUG is enabled, DMA-API debug code would complain mapping segment size is longer than device claimed, it use 64K as default if device doesn’t claim. Claim DSA max_segment_size to HugePage/THP to avoid DMA-API debug code print warning message if copy HugePage/THP pages. Signed-off-by: Jiangbo Wu Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/dma.c | 7 +++++++ drivers/dma/idxd/idxd.h | 1 + 2 files changed, 8 insertions(+) diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index a37721c82e0d..1af0e279c549 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -387,6 +387,13 @@ int idxd_register_dma_device(struct idxd_device *idxd) INIT_LIST_HEAD(&dma->channels); dma->dev = dev; + /* + * claim device max_segment_size to HugePage/THP size, otherwise + * DMA-API debug code would complain it's longer than default. + */ + idxd_dma->dma_parms.max_segment_size = HPAGE_PMD_SIZE; + dma->dev->dma_parms = &idxd_dma->dma_parms; + dma_cap_set(DMA_PRIVATE, dma->cap_mask); dma->device_release = idxd_dma_release; diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 7593f4103f15..7b404bc171e2 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -259,6 +259,7 @@ enum idxd_device_flag { struct idxd_dma_dev { struct idxd_device *idxd; struct dma_device dma; + struct device_dma_parameters dma_parms; }; struct idxd_driver_data { -- Gitee From cf2f86e7ddaa974d8e3bbe0cda759b71a97c598e Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 1 Apr 2022 08:09:01 -0700 Subject: [PATCH 1916/2161] iommu/vt-d: Allow disabling PRQ timeout commit bc603fa1d83379e290421aa15e1c8840f4939a0c Intel-BKC. For debug purpose, if user use cmdline iommu.prq_timeout=0, let's disable PRQ timeout tracking. The downside is that we will not limit mis-behaved guests. 
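A minimal standalone sketch of the convention the patch above relies on, with invented helpers: a timeout value of zero means "no timer is ever armed", so both the arm path and the tear-down path must test it.

#include <stdio.h>

static unsigned long prq_timeout;       /* 0 = timeout tracking disabled (assumed convention) */

static void maybe_start_timer(int pending_was_empty)
{
        if (pending_was_empty && prq_timeout)
                printf("arm timer for %lu ticks\n", prq_timeout);
        else
                printf("timer left alone\n");
}

static void maybe_stop_timer(int no_more_pending, int timer_armed)
{
        if (prq_timeout && no_more_pending && timer_armed)
                printf("stop timer\n");
        else
                printf("nothing to stop\n");
}

int main(void)
{
        maybe_start_timer(1);
        maybe_stop_timer(1, 0);
        return 0;
}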
Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index f3d60080f62d..523316a5b49d 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1401,7 +1401,7 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) exp = get_jiffies_64() + prq_timeout; evt_pending->expire = exp; mutex_lock(&fparam->lock); - if (list_empty(&fparam->faults)) { + if (list_empty(&fparam->faults) && prq_timeout) { /* First pending event, start timer */ tmr = &dev->iommu->fault_param->timer; WARN_ON(timer_pending(tmr)); @@ -1555,7 +1555,7 @@ int iommu_page_response(struct iommu_domain *domain, } /* stop response timer if no more pending request */ - if (list_empty(¶m->fault_param->faults) && + if (prq_timeout && list_empty(¶m->fault_param->faults) && timer_pending(¶m->fault_param->timer)) { pr_debug("no pending PRQ, stop timer\n"); del_timer(¶m->fault_param->timer); -- Gitee From 2b5eaf9280c92aa17395a791937a7461b47ff36f Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 7 Apr 2022 10:29:12 -0700 Subject: [PATCH 1917/2161] dmaengine: idxd: set priv bit to 0 when wq is not kernel type commit c2f10b603dc626985c6d3dcce79aee52ef95be2d Intel-BKC. Set priv bit to 0 for read-only WQ setup when the WQ type is not kernel. Current code does not touch the priv bit when the WQ is not kernel type. This non-action causes problem wq reconfiguration where the WQ is configured as a kernel type, gets disabled, and then reconfigured to be a user WQ. The priv bit that was set by the kernel type config remains. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index bed08e428072..3edcdb405c10 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -433,7 +433,7 @@ int idxd_wq_abort(struct idxd_wq *wq, u32 *status) } EXPORT_SYMBOL_GPL(idxd_wq_abort); -static void __idxd_wq_set_priv_locked(struct idxd_wq *wq) +static void __idxd_wq_set_priv_locked(struct idxd_wq *wq, int priv) { struct idxd_device *idxd = wq->idxd; union wqcfg wqcfg; @@ -442,7 +442,7 @@ static void __idxd_wq_set_priv_locked(struct idxd_wq *wq) offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PRIV_IDX); spin_lock(&idxd->dev_lock); wqcfg.bits[WQCFG_PRIV_IDX] = ioread32(idxd->reg_base + offset); - wqcfg.priv = 1; + wqcfg.priv = priv; wq->wqcfg->bits[WQCFG_PRIV_IDX] = wqcfg.bits[WQCFG_PRIV_IDX]; iowrite32(wqcfg.bits[WQCFG_PRIV_IDX], idxd->reg_base + offset); spin_unlock(&idxd->dev_lock); @@ -1473,12 +1473,12 @@ int __drv_enable_wq(struct idxd_wq *wq) __idxd_wq_set_pasid_locked(wq, pasid); } - __idxd_wq_set_priv_locked(wq); + __idxd_wq_set_priv_locked(wq, 1); } else { if (device_user_pasid_enabled(idxd) && wq_shared(wq)) __idxd_wq_set_pasid_locked(wq, 0); + __idxd_wq_set_priv_locked(wq, 0); } - } rc = 0; -- Gitee From eebc96bc3df1bbb8e2f562140122f670d895f5e7 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 7 Apr 2022 16:38:15 -0700 Subject: [PATCH 1918/2161] vfio: mdev: idxd: fix max batch size for vdcm commit b761b8f3203a05f05cab1e27ca568d2f9b72a810 Intel-BKC. Set wq max batch size and max xfer size to actual wq values. Also add permission check to reject guest modification. 
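The vdcm change described above reports the wq's actual limits, which the WQCFG fields encode as log2 shifts (see the ilog2() conversions in the diff that follows). A standalone sketch of that shift/size relationship, using a portable stand-in for the kernel's ilog2() and made-up example values:

#include <stdio.h>

/* Portable stand-in for the kernel's ilog2(): floor(log2(v)) for v > 0. */
static unsigned int my_ilog2(unsigned long long v)
{
        unsigned int shift = 0;

        while (v >>= 1)
                shift++;
        return shift;
}

int main(void)
{
        unsigned long long max_xfer_bytes = 2ULL * 1024 * 1024; /* example value */
        unsigned int max_batch_size = 32;                       /* example value */

        /* Shift values derived from the wq limits, not from device caps. */
        printf("max_xfer_shift  = %u\n", my_ilog2(max_xfer_bytes));  /* 21 */
        printf("max_batch_shift = %u\n", my_ilog2(max_batch_size));  /* 5  */

        /* And the inverse direction used when loading a read-only config. */
        printf("bytes = %llu\n", 1ULL << my_ilog2(max_xfer_bytes));
        return 0;
}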
Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/vdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/mdev/idxd/vdev.c b/drivers/vfio/mdev/idxd/vdev.c index db501a0c8b2c..561674615554 100644 --- a/drivers/vfio/mdev/idxd/vdev.c +++ b/drivers/vfio/mdev/idxd/vdev.c @@ -686,8 +686,8 @@ static void vidxd_mmio_init_wqcfg(struct vdcm_idxd *vidxd) wqcfg->bof = wq->wqcfg->bof; wqcfg->priority = wq->priority; - wqcfg->max_xfer_shift = idxd->hw.gen_cap.max_xfer_shift; - wqcfg->max_batch_shift = idxd->hw.gen_cap.max_batch_shift; + wqcfg->max_xfer_shift = ilog2(wq->max_xfer_bytes); + wqcfg->max_batch_shift = ilog2(wq->max_batch_size); wqcfg->mode_support = 1; } -- Gitee From 128cfc767bcb17356e541f6de2848b5dadd09ace Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 8 Apr 2022 13:17:11 -0700 Subject: [PATCH 1919/2161] dmaengine: idxd: load wq configuration for max_xfer and max_batch commit 84a5fc5fb6891aecc3d7bd0dc520a75c8d800487 Intel-BKC. Make sure the internal wq variables for max_xfer_size and max_batch_size are updated from the WQCFG when loading a read-only device configuration. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 3edcdb405c10..1d9a13f5bfbe 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -1206,6 +1206,9 @@ static int idxd_wq_load_config(struct idxd_wq *wq) if (wq->wqcfg->mode_support) set_bit(WQ_FLAG_MODE_1, &wq->flags); + wq->max_xfer_bytes = 1ULL << wq->wqcfg->max_xfer_shift; + wq->max_batch_size = 1ULL << wq->wqcfg->max_batch_shift; + for (i = 0; i < WQCFG_STRIDES(idxd); i++) { wqcfg_offset = WQCFG_OFFSET(idxd, wq->id, i); dev_dbg(dev, "WQ[%d][%d][%#x]: %#x\n", wq->id, i, wqcfg_offset, wq->wqcfg->bits[i]); -- Gitee From 7b01b520e6d16f0d3275c4bda8b1bbb2d13e556a Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 12 Apr 2022 11:18:34 -0700 Subject: [PATCH 1920/2161] vfio: mdev: idxd: put kref when all devices are released commit 8dc1c970b4da505e9d7568d86648c675c65d54ab Intel-BKC. kref is init at value of 1. So when it reaches value of 1 on remove, call the extra put and release everything. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/mdev.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c index 8ed855188554..4d32ad10fa93 100644 --- a/drivers/vfio/mdev/idxd/mdev.c +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -2427,8 +2427,13 @@ static void idxd_mdev_drv_remove(struct idxd_dev *idxd_dev) mutex_unlock(&wq->wq_lock); mutex_lock(&idxd->kref_lock); - if (idxd->mdev_host_init) + if (idxd->mdev_host_init) { kref_put(&idxd->mdev_kref, idxd_mdev_host_release); + + /* kref init at 1, when it hits 1, no more devices and we can release */ + if (kref_read(&idxd->mdev_kref) == 1) + kref_put(&idxd->mdev_kref, idxd_mdev_host_release); + } mutex_unlock(&idxd->kref_lock); put_device(dev); dev_info(dev, "wq %s disabled\n", dev_name(dev)); -- Gitee From 3a0fa3e9ba1b907323c935f5bd32d276a24024bc Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Wed, 13 Apr 2022 08:32:22 +0200 Subject: [PATCH 1921/2161] dma-direct: avoid redundant memory sync for swiotlb commit 9e02977bfad006af328add9434c8bffa40e053bb upstream. 
When we looked into FIO performance with swiotlb enabled in VM, we found swiotlb_bounce() is always called one more time than expected for each DMA read request. It turns out that the bounce buffer is copied to original DMA buffer twice after the completion of a DMA request (one is done by in dma_direct_sync_single_for_cpu(), the other by swiotlb_tbl_unmap_single()). But the content in bounce buffer actually doesn't change between the two rounds of copy. So, one round of copy is redundant. Pass DMA_ATTR_SKIP_CPU_SYNC flag to swiotlb_tbl_unmap_single() to skip the memory copy in it. This fix increases FIO 64KB sequential read throughput in a guest with swiotlb=force by 5.6%. Fixes: 55897af63091 ("dma-direct: merge swiotlb_dma_ops into the dma_direct code") Reported-by: Wang Zhaoyang1 Reported-by: Gao Liang Signed-off-by: Chao Gao Reviewed-by: Kevin Tian Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/dma-direct.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index d63ea014a272..8af66e513212 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -182,6 +182,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, dma_direct_sync_single_for_cpu(dev, addr, size, dir); if (unlikely(is_swiotlb_buffer(phys))) - swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); + swiotlb_tbl_unmap_single(dev, phys, size, dir, + attrs | DMA_ATTR_SKIP_CPU_SYNC); } #endif /* _LINUX_DMA_DIRECT_H */ -- Gitee From 12529162c21c73d65a39ae117b3c417a973a5107 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 15 Apr 2022 17:40:36 -0700 Subject: [PATCH 1922/2161] dmaengine: idxd: fix mdev driver ->remove() unregistering mdev device commit 9a9bdbb8608cd39b70751444545fcb1f2b0197ca Intel-BKC. Remove code that skips mdev device unregister and host setup teardown in mdev idxd_driver ->remove() call. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/mdev.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c index 4d32ad10fa93..5086e1146dee 100644 --- a/drivers/vfio/mdev/idxd/mdev.c +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -2413,15 +2413,9 @@ static void idxd_mdev_drv_remove(struct idxd_dev *idxd_dev) struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); struct idxd_device *idxd = wq->idxd; - mutex_lock(&wq->wq_lock); __drv_disable_wq(wq); - if (wq->state == IDXD_WQ_DISABLED) { - mutex_unlock(&wq->wq_lock); - return; - } - if (wq->state == IDXD_WQ_LOCKED) wq->state = IDXD_WQ_DISABLED; mutex_unlock(&wq->wq_lock); -- Gitee From 78041d9393aa5a6a33d133e0b035399d71c4c44e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 20 Apr 2022 13:15:29 -0700 Subject: [PATCH 1923/2161] dmaengine: idxd: fix wq clear state when called from device disable commit 8866c56721b2e3f5468792015025deacde17affd Intel-BKC. At the time when idxd_device_wqs_clear_state() gets called, which is from device disable path, the wq may already been disabled. Given that the driver does not actually know if the wq was previously disabled, force cleanup the context of any available wq. 
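To illustrate the idea behind the forced cleanup above, a standalone sketch with invented types: write the cleanup helper so that running it on an already-disabled object is harmless, which removes any need to know the previous state.

#include <stdio.h>
#include <string.h>

struct fake_wq {
        int  state;             /* 0 = disabled, 1 = enabled */
        char name[16];
};

/* Safe to call whether or not the wq was ever enabled. */
static void wq_disable_cleanup(struct fake_wq *wq)
{
        memset(wq->name, 0, sizeof(wq->name));
        wq->state = 0;
}

int main(void)
{
        struct fake_wq wq = { .state = 0, .name = "wq0.0" };

        /* No state check before the call: harmless either way. */
        wq_disable_cleanup(&wq);
        wq_disable_cleanup(&wq);        /* second call is effectively a no-op */
        printf("state=%d name='%s'\n", wq.state, wq.name);
        return 0;
}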
Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 1d9a13f5bfbe..c9c2aaf46d38 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -889,10 +889,7 @@ static void idxd_device_wqs_clear_state(struct idxd_device *idxd) for (i = 0; i < idxd->max_wqs; i++) { struct idxd_wq *wq = idxd->wqs[i]; - if (wq->state == IDXD_WQ_ENABLED) { - idxd_wq_disable_cleanup(wq); - wq->state = IDXD_WQ_DISABLED; - } + idxd_wq_disable_cleanup(wq); idxd_wq_device_reset_cleanup(wq); } } -- Gitee From 60b3405aab24dc770c553678053f5f30c5927026 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Apr 2022 10:16:24 -0700 Subject: [PATCH 1924/2161] dmaengine: idxd: return 0 if wq already enabled instead of error commit 3a5e5ffd0d28c0c9b71f047668baca77723b6875 Intel-BKC. If wq is already enabled and idxd_wq_enable() is called, return 0 instead of error since it's not a real error. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index c9c2aaf46d38..ddde520c5e0c 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -248,7 +248,7 @@ int idxd_wq_enable(struct idxd_wq *wq, u32 *status) if (wq->state == IDXD_WQ_ENABLED) { dev_dbg(dev, "WQ %d already enabled\n", wq->id); - return -ENXIO; + return 0; } idxd_cmd_exec(idxd, IDXD_CMD_ENABLE_WQ, wq->id, &stat); -- Gitee From 3da0c63a6a13688af3903749d3eeeeb4187add5e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 22 Apr 2022 22:30:11 -0700 Subject: [PATCH 1925/2161] vfio: mdev: idxd: fix mm ref leak commit 1817cd523ae7fe5891bd3a9b14a89ec8d009ba78 Intel-BKC. Fix leaked mm reference get when registering ioasid notifier. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/mdev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c index 5086e1146dee..b071521d2158 100644 --- a/drivers/vfio/mdev/idxd/mdev.c +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -1102,6 +1102,7 @@ static int vidxd_register_ioasid_notifier(struct vdcm_idxd *vidxd) list_for_each_entry(mm_entry, &vdev->mm_list, node) { if (mm_entry->mm == mm) { mutex_unlock(&vdev->ioasid_lock); + mmput(mm); return 0; } } @@ -1117,12 +1118,12 @@ static int vidxd_register_ioasid_notifier(struct vdcm_idxd *vidxd) vidxd->ivdev.pasid_nb.priority = IOASID_PRIO_DEVICE; vidxd->ivdev.pasid_nb.notifier_call = idxd_mdev_ioasid_event; rc = ioasid_register_notifier_mm(mm, &vidxd->ivdev.pasid_nb); - mmput(mm); if (rc < 0) goto err_ioasid; list_add(&mm_entry->node, &vdev->mm_list); mutex_unlock(&vdev->ioasid_lock); + mmput(mm); return 0; -- Gitee From 289800211bb9a361a2198e254073b9936e8c61f9 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 25 Apr 2022 16:34:26 -0700 Subject: [PATCH 1926/2161] vfio: mdev: idxd: fix soft reset for guest commit 3d23391ae9901be3f18e1311ad022e20496670dc Intel-BKC. Bypass signal to guest when doing reset via vfio ioctl since the eventfd is not available. 
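The mm reference fix above is an instance of a general rule: every reference taken must be dropped exactly once on every exit path, including the early "already present" return. A standalone counter-based sketch, not the driver's code:

#include <stdio.h>

static int refcount;

static void obj_get(void) { refcount++; }
static void obj_put(void) { refcount--; }

/* Every path out of the function drops the reference it took. */
static int register_thing(int already_registered, int alloc_fails)
{
        obj_get();

        if (already_registered) {
                obj_put();              /* the kind of put a leak fix adds */
                return 0;
        }
        if (alloc_fails) {
                obj_put();
                return -1;
        }
        /* ... do the real registration ... */
        obj_put();
        return 0;
}

int main(void)
{
        register_thing(1, 0);
        register_thing(0, 1);
        register_thing(0, 0);
        printf("refcount after all paths: %d\n", refcount);     /* expect 0 */
        return 0;
}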
Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/mdev.c | 2 +- drivers/vfio/mdev/idxd/mdev.h | 2 +- drivers/vfio/mdev/idxd/vdev.c | 20 ++++++++++++++------ 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c index b071521d2158..841221434d03 100644 --- a/drivers/vfio/mdev/idxd/mdev.c +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -1453,7 +1453,7 @@ static int idxd_vdcm_mmap(struct vfio_device *vdev, struct vm_area_struct *vma) static void vidxd_vdcm_reset(struct vdcm_idxd *vidxd) { - vidxd_reset(vidxd); + vidxd_reset(vidxd, false); } static irqreturn_t idxd_vdcm_msix_handler(int irq, void *arg) diff --git a/drivers/vfio/mdev/idxd/mdev.h b/drivers/vfio/mdev/idxd/mdev.h index 8af1bb8ee21a..17b2e18f2fdd 100644 --- a/drivers/vfio/mdev/idxd/mdev.h +++ b/drivers/vfio/mdev/idxd/mdev.h @@ -166,7 +166,7 @@ int vidxd_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int s int vidxd_cfg_read(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int count); int vidxd_cfg_write(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int size); void vidxd_mmio_init(struct vdcm_idxd *vidxd); -void vidxd_reset(struct vdcm_idxd *vidxd); +void vidxd_reset(struct vdcm_idxd *vidxd, bool interrupt); void vidxd_send_interrupt(struct vdcm_idxd *vidxd, int msix_idx); void idxd_wq_vidxd_send_errors(struct idxd_wq *wq); diff --git a/drivers/vfio/mdev/idxd/vdev.c b/drivers/vfio/mdev/idxd/vdev.c index 561674615554..1a9af0ccf088 100644 --- a/drivers/vfio/mdev/idxd/vdev.c +++ b/drivers/vfio/mdev/idxd/vdev.c @@ -30,7 +30,8 @@ void vidxd_send_interrupt(struct vdcm_idxd *vidxd, int msix_idx) { struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; - eventfd_signal(vfio_pdev->ctx[msix_idx].trigger, 1); + if (vfio_pdev->ctx[msix_idx].trigger) + eventfd_signal(vfio_pdev->ctx[msix_idx].trigger, 1); } static void vidxd_report_error(struct vdcm_idxd *vidxd, unsigned int error) @@ -814,7 +815,8 @@ void vidxd_mmio_init(struct vdcm_idxd *vidxd) vidxd_mmio_init_wqcfg(vidxd); } -static void idxd_complete_command(struct vdcm_idxd *vidxd, enum idxd_cmdsts_err val) +static void __idxd_complete_command(struct vdcm_idxd *vidxd, enum idxd_cmdsts_err val, + bool interrupt) { u8 *bar0 = vidxd->bar0; u32 *cmd = (u32 *)(bar0 + IDXD_CMD_OFFSET); @@ -828,10 +830,16 @@ static void idxd_complete_command(struct vdcm_idxd *vidxd, enum idxd_cmdsts_err if (*cmd & IDXD_CMD_INT_MASK) { *intcause |= IDXD_INTC_CMD; - vidxd_send_interrupt(vidxd, 0); + if (interrupt) + vidxd_send_interrupt(vidxd, 0); } } +static void idxd_complete_command(struct vdcm_idxd *vidxd, enum idxd_cmdsts_err val) +{ + __idxd_complete_command(vidxd, val, true); +} + static void vidxd_enable(struct vdcm_idxd *vidxd) { u8 *bar0 = vidxd->bar0; @@ -975,7 +983,7 @@ static void vidxd_wq_abort(struct vdcm_idxd *vidxd, int val) idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); } -void vidxd_reset(struct vdcm_idxd *vidxd) +void vidxd_reset(struct vdcm_idxd *vidxd, bool interrupt) { struct mdev_device *mdev = vidxd->ivdev.mdev; struct device *dev = mdev_dev(mdev); @@ -998,7 +1006,7 @@ void vidxd_reset(struct vdcm_idxd *vidxd) vidxd_mmio_reset(vidxd); gensts->state = IDXD_DEVICE_STATE_DISABLED; - idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); + __idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS, interrupt); } static void vidxd_wq_reset(struct vdcm_idxd *vidxd, int wq_id_mask) @@ -1356,7 +1364,7 @@ static void vidxd_do_command(struct 
vdcm_idxd *vidxd, u32 val) vidxd_abort_all(vidxd); break; case IDXD_CMD_RESET_DEVICE: - vidxd_reset(vidxd); + vidxd_reset(vidxd, true); break; case IDXD_CMD_ENABLE_WQ: vidxd_wq_enable(vidxd, reg->operand); -- Gitee From 4cb09f00f0ee1a37c8c81f793c0b0c9a20896097 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Sat, 23 Apr 2022 20:58:04 -0700 Subject: [PATCH 1927/2161] iommu/vt-d: Fix A-B-C-A dead lock issue which results in PRQ timeout commit e6bec715e605e6027d1d5ca663bc1256d7e13fd2 Intel-BKC. A lock: param->lock B lock: fparam->lock C lock: pasid_mutex Thread #1: prq report, holds A lock, and tries to hold B lock Thread #2: page response, holds B lock, and tries to hold C lock Thread #3: unbind_gpasid (could be bind_gpasid or intel_svm_free_async_fn as well), holds C lock, and tries to hold A lock. Dead lock happens when #1 holds A lock, #2 holds B lock and #3 holds C lock. PRQ report: A lock => B lock unlock => unlock | | | | | +-------------+ | +-----------------------------------------------+ Page response: B lock => C lock unlock => unlock | | | | | +-------------+ | +-----------------------------------------------+ Unbind_gpasid: C lock => A lock unlock => unlock | | | | | +-------------+ | +-----------------------------------------------+ This fix moves the attempt of holding A lock in Thread #3 to be outside of C lock protection. To demonstrate well, also draw the bind_gpasid explicitly. After fixing, the locking sequence is as below: Bind_gpasid: {only for bind failure} A lock unlock => C lock unlock => A lock unlock | | | | | | +-----------+ +----------+ +------------+ PRQ report: A lock => B lock unlock => unlock | | | | | +-------------+ | +-------------------------------------------+ Page response: B lock => C lock unlock => unlock | | | | | +-------------+ | +-------------------------------------------+ Unbind_gpasid: C lock unlock => A lock unlock | | | | +------------+ +----------+ Signed-off-by: Yi Liu Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 56 +++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 2c60c93daf91..cccf9c72fdde 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -187,7 +187,9 @@ static DEFINE_MUTEX(pasid_mutex); static void intel_svm_free_async_fn(struct work_struct *work) { struct intel_svm *svm = container_of(work, struct intel_svm, work); - struct intel_svm_dev *sdev; + struct intel_svm_dev *sdev, *tmp; + LIST_HEAD(sdevs); + u32 pasid = svm->pasid; /* * Unbind all devices associated with this PASID which is @@ -203,11 +205,10 @@ static void intel_svm_free_async_fn(struct work_struct *work) intel_svm_drain_prq(sdev->dev, svm->pasid); spin_unlock(&sdev->iommu->lock); /* - * Partial assignment needs to delete fault data + * Record the sdev and delete device_fault_data outside pasid_mutex + * protection to avoid race with page response and prq reporting. 
*/ - if (is_aux_domain(sdev->dev, &sdev->domain->domain)) - iommu_delete_device_fault_data(sdev->dev, svm->pasid); - kfree_rcu(sdev, rcu); + list_add_tail(&sdev->list, &sdevs); } /* * We may not be the last user to drop the reference but since @@ -227,6 +228,16 @@ static void intel_svm_free_async_fn(struct work_struct *work) kfree(svm); mutex_unlock(&pasid_mutex); + + list_for_each_entry_safe(sdev, tmp, &sdevs, list) { + list_del(&sdev->list); + /* + * Partial assignment needs to delete fault data + */ + if (is_aux_domain(sdev->dev, &sdev->domain->domain)) + iommu_delete_device_fault_data(sdev->dev, pasid); + kfree_rcu(sdev, rcu); + } } @@ -470,6 +481,15 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, if (!info) return -EINVAL; + /* + * Partial assignment needs to add fault data per-pasid. + * Add the fault data in advance as per pasid entry setup it should + * be able to handle prq. And this should be outside of pasid_mutex + * to avoid race with page response and prq reporting. + */ + if (is_aux_domain(dev, domain) && fault_data) + iommu_add_device_fault_data(dev, data->hpasid, + fault_data); mutex_lock(&pasid_mutex); ret = pasid_to_svm_sdev(dev, pasid_set, data->hpasid, &svm, &sdev); @@ -501,12 +521,6 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, svm->flags |= SVM_FLAG_GUEST_PASID; if (!(data->flags & IOMMU_SVA_HPASID_DEF)) ioasid_attach_spid(data->hpasid, data->gpasid); - /* - * Partial assignment needs to add fault data per-pasid - */ - if (is_aux_domain(dev, domain) && fault_data) - iommu_add_device_fault_data(dev, data->hpasid, - fault_data); } ioasid_attach_data(data->hpasid, svm); ioasid_get(NULL, svm->pasid); @@ -580,6 +594,10 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, data->hpasid = hpasid_org; mutex_unlock(&pasid_mutex); + + if (ret && is_aux_domain(dev, domain) && fault_data) + iommu_delete_device_fault_data(dev, + (data->flags & IOMMU_SVA_HPASID_DEF) ? hpasid_org : data->hpasid); return ret; } @@ -622,12 +640,6 @@ int intel_svm_unbind_gpasid(struct iommu_domain *domain, intel_pasid_tear_down_entry(iommu, dev, svm->pasid, false, keep_pte); intel_svm_drain_prq(dev, svm->pasid); - /* - * Partial assignment needs to delete fault data - */ - if (is_aux_domain(dev, domain)) - iommu_delete_device_fault_data(dev, pasid); - kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { /* @@ -646,6 +658,16 @@ int intel_svm_unbind_gpasid(struct iommu_domain *domain, } out: mutex_unlock(&pasid_mutex); + if (sdev) { + /* + * Partial assignment needs to delete fault data, this should + * be outside of pasid_mutex protection to avoid race with + * page response and prq reporting. + */ + if (is_aux_domain(dev, domain)) + iommu_delete_device_fault_data(dev, pasid); + kfree_rcu(sdev, rcu); + } return ret; } -- Gitee From f0677651a1c1788579dce89a23ce297909764083 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 26 Apr 2022 06:40:46 -0700 Subject: [PATCH 1928/2161] iommu/vt-d: Fruther fix PASID race commit 8a4d2d4a1cc582f3b7dbcf749a2ed901713564b8 Intel-BKC. When the intel_svm_free_async_fn() is triggerd, it will call the iommu_delete_device_fault_data() to delete the fault data for the pasid. However, sync_fn() put the pasid reference before calling iommu_delete_device_fault_data(). This is bad if the reference is the last one. the pasid may be allocated to other usage, race will come. This patch just hold an extra reference on the pasid in the iommu_add_device_fault_data(), and release when the iommu_del_device_fault_data() is called. 
Signed-off-by: Yi Liu Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 10 +++++++--- drivers/iommu/iommu.c | 7 +++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index cccf9c72fdde..dfbc5c3aa43b 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -487,9 +487,13 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, * be able to handle prq. And this should be outside of pasid_mutex * to avoid race with page response and prq reporting. */ - if (is_aux_domain(dev, domain) && fault_data) - iommu_add_device_fault_data(dev, data->hpasid, - fault_data); + if (is_aux_domain(dev, domain) && fault_data) { + ret = iommu_add_device_fault_data(dev, data->hpasid, + fault_data); + if (ret) + return ret; + } + mutex_lock(&pasid_mutex); ret = pasid_to_svm_sdev(dev, pasid_set, data->hpasid, &svm, &sdev); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 523316a5b49d..01b69495a776 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1222,6 +1222,12 @@ int iommu_add_device_fault_data(struct device *dev, mutex_lock(¶m->lock); + ret = ioasid_get(NULL, vector); + if (ret) { + dev_err(dev, "Failed to get vector %d\n", vector); + goto unlock; + } + /* vector must be unique, check if we have the same vector already */ list_for_each_entry(hdata, ¶m->fault_param->data, list) { if (hdata->vector == vector) { @@ -1282,6 +1288,7 @@ void iommu_delete_device_fault_data(struct device *dev, int vector) list_del(&hdata->list); kfree(hdata); dev_dbg(dev, "Deleted IOMMU fault handler data for vector %d\n", vector); + ioasid_put(NULL, vector);//vector == pasid goto unlock; } } -- Gitee From e74f5526d3b647b61ae6a2161c52c696022bf021 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 27 Apr 2022 05:32:37 -0700 Subject: [PATCH 1929/2161] iommu/vtd-: Fix regression issue in intel_svm_free_async_fn commit c0d8d5817a0bea721972612ef905bea1ee38be44 Intel-BKC. 
Call trace [ 149.745157] BUG: spinlock bad magic on CPU#99, kworker/u384:0/8 [ 149.745249] BUG: unable to handle page fault for address: 0000001000000930 [ 149.745355] #PF: supervisor read access in kernel mode [ 149.745436] #PF: error_code(0x0000) - not-present page [ 149.745517] PGD 0 [ 149.745550] Oops: 0000 1 SMP NOPTI [ 149.745607] CPU: 99 PID: 8 Comm: kworker/u384:0 Tainted: G W 5.15.0-spr.bkc.pc.6.8.0.x86_64 #1 [ 149.745764] Hardware name: Intel Corporation ArcherCity/ArcherCity, BIOS EGSDCRB1.86B.0073.D14.2201260628 01/26/2022 [ 149.745929] Workqueue: ioasid_wq intel_svm_free_async_fn [ 149.746012] RIP: 0010:spin_dump.cold.5+0x24/0x34 [ 149.746085] Code: c3 e9 83 75 78 ff 41 83 c8 ff 48 c7 c1 32 dd 8f 88 8b 55 04 48 89 ee 48 c7 c7 80 b8 87 88 e8 81 24 00 00 5b 5d e9 87 cf 01 00 <44> 8b 83 50 09 00 00 48 8d 8b 38 0b 00 00 eb d7 48 8d 45 10 40 84 [ 149.746378] RSP: 0018:ff46f841403c7e18 EFLAGS: 00010202 [ 149.746460] RAX: 0000000000000033 RBX: 0000000fffffffe0 RCX: 0000000000000000 [ 149.746571] RDX: 0000000000000000 RSI: ff4531203f4d7c70 RDI: ff4531203f4d7c70 [ 149.746683] RBP: ff453120efa04788 R08: 0000000000000000 R09: 00000000efa04788 [ 149.746795] R10: ff46f841403c7c40 R11: ff46f841403c7c38 R12: ff45311115c7f250 [ 149.746907] R13: dead000000000122 R14: ff46f841403c7e50 R15: ff46f841403c7e50 [ 149.747019] FS: 0000000000000000(0000) GS:ff4531203f4c0000(0000) knlGS:0000000000000000 [ 149.747146] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 149.747236] CR2: 0000001000000930 CR3: 000000189720c004 CR4: 0000000001f73ee0 [ 149.747347] DR0: ffffffff89b9879b DR1: 0000000000000000 DR2: 0000000000000000 [ 149.747459] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000600 [ 149.747571] PKRU: 55555554 [ 149.747615] PKRS: 0x55555554 [ 149.747660] Call Trace: [ 149.747700] do_raw_spin_lock+0x71/0xc0 [ 149.750580] intel_svm_free_async_fn+0x9a/0x1f0 [ 149.753505] process_one_work+0x193/0x370 [ 149.756409] worker_thread+0x30/0x380 [ 149.759282] ? process_one_work+0x370/0x370 [ 149.762121] kthread+0x118/0x140 [ 149.764908] ? 
set_kthread_struct+0x40/0x40 [ 149.767656] ret_from_fork+0x1f/0x30 [ 149.770337] Modules linked in: vhost_net vhost vhost_iotlb tap tun vfio_pci nfsv3 rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs lockd grace bridge nft_ct stp llc nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink sunrpc ofpart spi_nor intel_rapl_msr mtd intel_sdsi pmt_crashlog intel_rapl_common pmt_telemetry pmt_class i10nm_edac nfit x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm isst_if_mbox_pci isst_if_mmio joydev isst_if_common idxd_mdev snd_pcm vfio_pci_core mei_me vfio_virqfd irqbypass intel_vsec mei spi_intel_pci i2c_i801 snd_timer spi_intel i2c_smbus i2c_ismt wmi acpi_power_meter acpi_pad pfr_update pfr_telemetry iax_crypto i40e crc32c_intel igb drm_vram_helper igc idxd drm_ttm_helper dca pinctrl_emmitsburg pinctrl_intel sr_mod cdrom fuse [ 149.797488] CR2: 0000001000000930 [ 149.800524] --[ end trace 160b85f4bc67d73d ]-- [ 150.111945] RIP: 0010:spin_dump.cold.5+0x24/0x34 [ 150.114947] Code: c3 e9 83 75 78 ff 41 83 c8 ff 48 c7 c1 32 dd 8f 88 8b 55 04 48 89 ee 48 c7 c7 80 b8 87 88 e8 81 24 00 00 5b 5d e9 87 cf 01 00 <44> 8b 83 50 09 00 00 48 8d 8b 38 0b 00 00 eb d7 48 8d 45 10 40 84 [ 150.124033] RSP: 0018:ff46f841403c7e18 EFLAGS: 00010202 [ 150.127065] RAX: 0000000000000033 RBX: 0000000fffffffe0 RCX: 0000000000000000 [ 150.130157] RDX: 0000000000000000 RSI: ff4531203f4d7c70 RDI: ff4531203f4d7c70 [ 150.133233] RBP: ff453120efa04788 R08: 0000000000000000 R09: 00000000efa04788 [ 150.136309] R10: ff46f841403c7c40 R11: ff46f841403c7c38 R12: ff45311115c7f250 [ 150.139411] R13: dead000000000122 R14: ff46f841403c7e50 R15: ff46f841403c7e50 [ 150.142470] FS: 0000000000000000(0000) GS:ff4531203f4c0000(0000) knlGS:0000000000000000 [ 150.145555] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 150.148611] CR2: 0000001000000930 CR3: 000000189720c004 CR4: 0000000001f73ee0 [ 150.151713] DR0: ffffffff89b9879b DR1: 0000000000000000 DR2: 0000000000000000 [ 150.154831] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000600 [ 150.157904] PKRU: 55555554 [ 150.160985] PKRS: 0x55555554 [ 150.183267] virbr0: port 2(tap0) entered disabled state [ 150.186582] device tap0 left promiscuous mode [ 150.189711] virbr0: port 2(tap0) entered disabled state [ 150.352879] iommu: iommu_sva_unbind_gpasid: FIXME need to clear all pending faults! [ 153.162117] vfio-pci 0000:6a:01.0: No device request channel registered, blocked until released by user Signed-off-by: Yi Liu Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index dfbc5c3aa43b..d8fcc18926d7 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -187,8 +187,8 @@ static DEFINE_MUTEX(pasid_mutex); static void intel_svm_free_async_fn(struct work_struct *work) { struct intel_svm *svm = container_of(work, struct intel_svm, work); - struct intel_svm_dev *sdev, *tmp; - LIST_HEAD(sdevs); + struct intel_svm_dev *sdev, *subdev, *tmp; + LIST_HEAD(subdevs); u32 pasid = svm->pasid; /* @@ -204,11 +204,21 @@ static void intel_svm_free_async_fn(struct work_struct *work) svm->pasid, true, false); intel_svm_drain_prq(sdev->dev, svm->pasid); spin_unlock(&sdev->iommu->lock); - /* - * Record the sdev and delete device_fault_data outside pasid_mutex - * protection to avoid race with page response and prq reporting. 
- */ - list_add_tail(&sdev->list, &sdevs); + if (is_aux_domain(sdev->dev, &sdev->domain->domain)) { + subdev = kzalloc(sizeof(*subdev), GFP_KERNEL); + if (!subdev) { + dev_err_ratelimited(sdev->dev, "Failed to record for fault data del %u\n", pasid); + continue; + } + subdev->dev = sdev->dev; + + kfree_rcu(sdev, rcu); + /* + * Record the sdev and delete device_fault_data outside pasid_mutex + * protection to avoid race with page response and prq reporting. + */ + list_add_tail(&subdev->list, &subdevs); + } } /* * We may not be the last user to drop the reference but since @@ -229,14 +239,14 @@ static void intel_svm_free_async_fn(struct work_struct *work) mutex_unlock(&pasid_mutex); - list_for_each_entry_safe(sdev, tmp, &sdevs, list) { - list_del(&sdev->list); + list_for_each_entry_safe(subdev, tmp, &subdevs, list) { + list_del(&subdev->list); /* * Partial assignment needs to delete fault data */ - if (is_aux_domain(sdev->dev, &sdev->domain->domain)) - iommu_delete_device_fault_data(sdev->dev, pasid); - kfree_rcu(sdev, rcu); + dev_dbg(subdev->dev, "try to del fault data for %u\n", pasid); + iommu_delete_device_fault_data(subdev->dev, pasid); + kfree(subdev); } } -- Gitee From cf5636bd4ab9f3f5df7ee4be92339d64d2a01e0e Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 11 Jul 2022 19:59:32 +0800 Subject: [PATCH 1930/2161] Revert "vfio: mdev: idxd: fix soft reset for guest" commit 8a46bdc8eb2d0fbfcd54b5daa6d6a61885947358 Intel-BKC. This reverts commit db02084b239740ec59b30c46b9793e3ad766e5de. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/mdev.c | 2 +- drivers/vfio/mdev/idxd/mdev.h | 2 +- drivers/vfio/mdev/idxd/vdev.c | 20 ++++++-------------- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c index 841221434d03..b071521d2158 100644 --- a/drivers/vfio/mdev/idxd/mdev.c +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -1453,7 +1453,7 @@ static int idxd_vdcm_mmap(struct vfio_device *vdev, struct vm_area_struct *vma) static void vidxd_vdcm_reset(struct vdcm_idxd *vidxd) { - vidxd_reset(vidxd, false); + vidxd_reset(vidxd); } static irqreturn_t idxd_vdcm_msix_handler(int irq, void *arg) diff --git a/drivers/vfio/mdev/idxd/mdev.h b/drivers/vfio/mdev/idxd/mdev.h index 17b2e18f2fdd..8af1bb8ee21a 100644 --- a/drivers/vfio/mdev/idxd/mdev.h +++ b/drivers/vfio/mdev/idxd/mdev.h @@ -166,7 +166,7 @@ int vidxd_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int s int vidxd_cfg_read(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int count); int vidxd_cfg_write(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int size); void vidxd_mmio_init(struct vdcm_idxd *vidxd); -void vidxd_reset(struct vdcm_idxd *vidxd, bool interrupt); +void vidxd_reset(struct vdcm_idxd *vidxd); void vidxd_send_interrupt(struct vdcm_idxd *vidxd, int msix_idx); void idxd_wq_vidxd_send_errors(struct idxd_wq *wq); diff --git a/drivers/vfio/mdev/idxd/vdev.c b/drivers/vfio/mdev/idxd/vdev.c index 1a9af0ccf088..561674615554 100644 --- a/drivers/vfio/mdev/idxd/vdev.c +++ b/drivers/vfio/mdev/idxd/vdev.c @@ -30,8 +30,7 @@ void vidxd_send_interrupt(struct vdcm_idxd *vidxd, int msix_idx) { struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; - if (vfio_pdev->ctx[msix_idx].trigger) - eventfd_signal(vfio_pdev->ctx[msix_idx].trigger, 1); + eventfd_signal(vfio_pdev->ctx[msix_idx].trigger, 1); } static void vidxd_report_error(struct vdcm_idxd *vidxd, unsigned int error) @@ -815,8 +814,7 @@ void 
vidxd_mmio_init(struct vdcm_idxd *vidxd) vidxd_mmio_init_wqcfg(vidxd); } -static void __idxd_complete_command(struct vdcm_idxd *vidxd, enum idxd_cmdsts_err val, - bool interrupt) +static void idxd_complete_command(struct vdcm_idxd *vidxd, enum idxd_cmdsts_err val) { u8 *bar0 = vidxd->bar0; u32 *cmd = (u32 *)(bar0 + IDXD_CMD_OFFSET); @@ -830,16 +828,10 @@ static void __idxd_complete_command(struct vdcm_idxd *vidxd, enum idxd_cmdsts_er if (*cmd & IDXD_CMD_INT_MASK) { *intcause |= IDXD_INTC_CMD; - if (interrupt) - vidxd_send_interrupt(vidxd, 0); + vidxd_send_interrupt(vidxd, 0); } } -static void idxd_complete_command(struct vdcm_idxd *vidxd, enum idxd_cmdsts_err val) -{ - __idxd_complete_command(vidxd, val, true); -} - static void vidxd_enable(struct vdcm_idxd *vidxd) { u8 *bar0 = vidxd->bar0; @@ -983,7 +975,7 @@ static void vidxd_wq_abort(struct vdcm_idxd *vidxd, int val) idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); } -void vidxd_reset(struct vdcm_idxd *vidxd, bool interrupt) +void vidxd_reset(struct vdcm_idxd *vidxd) { struct mdev_device *mdev = vidxd->ivdev.mdev; struct device *dev = mdev_dev(mdev); @@ -1006,7 +998,7 @@ void vidxd_reset(struct vdcm_idxd *vidxd, bool interrupt) vidxd_mmio_reset(vidxd); gensts->state = IDXD_DEVICE_STATE_DISABLED; - __idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS, interrupt); + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); } static void vidxd_wq_reset(struct vdcm_idxd *vidxd, int wq_id_mask) @@ -1364,7 +1356,7 @@ static void vidxd_do_command(struct vdcm_idxd *vidxd, u32 val) vidxd_abort_all(vidxd); break; case IDXD_CMD_RESET_DEVICE: - vidxd_reset(vidxd, true); + vidxd_reset(vidxd); break; case IDXD_CMD_ENABLE_WQ: vidxd_wq_enable(vidxd, reg->operand); -- Gitee From 0318c1a5007bfb28b3a7c9d74e40e3b429eb3420 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Apr 2022 12:27:48 -0700 Subject: [PATCH 1931/2161] vfio: mdev: idxd: fix vdcm_reset() for vidxd device commit 84d4279747aa9b9acaf67177ef41dca99419b3e0 Intel-BKC. Make vdcm_reset() to do full vidxd re-initialization in order for the ADI to load in guest after soft reset. Signed-off-by: Dave Jiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/mdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c index b071521d2158..4aeaf7689385 100644 --- a/drivers/vfio/mdev/idxd/mdev.c +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -1453,7 +1453,7 @@ static int idxd_vdcm_mmap(struct vfio_device *vdev, struct vm_area_struct *vma) static void vidxd_vdcm_reset(struct vdcm_idxd *vidxd) { - vidxd_reset(vidxd); + idxd_vdcm_init(vidxd); } static irqreturn_t idxd_vdcm_msix_handler(int irq, void *arg) -- Gitee From 3ea366ee4c592580a2ceba619db10d3bbdfdf3db Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 26 Apr 2022 16:27:12 -0700 Subject: [PATCH 1932/2161] iommu/vt-d: Add PRQ counter for debug commit d6872eb4d4e329816c635be6c562b607a03d8287 Intel-BKC. To debug possible lost page requests or page responses, this patch adds three counters to be used as follows: - clear counter: echo 0 > /sys/kernel/debug/iommu/intel/dmar_perf_latency - read the counters root@kbl:~# cat /sys/kernel/debug/iommu/intel/invalidation_queue | grep -e PR -e dma Invalidation queue on IOMMU: dmar1 No. of received PRQs: 0 No. of PRR SUCCESS sent: 0 No. of PRR INVALID sent: 0 Invalidation queue on IOMMU: dmar0 No. of received PRQs: 0 No. of PRR SUCCESS sent: 0 No. of PRR INVALID sent: 0 Invalidation queue on IOMMU: dmar2 No. 
of received PRQs: 0 No. of PRR SUCCESS sent: 0 No. of PRR INVALID sent: 0 The above counter need to be used with idxd perfmon. perf stat -a -e dsa0/event=0x10,event_category=0x2/,dsa0/event=0x20,event_category=0x2/,dsa0/event=0x2,event_category=0x2/,dsa0/event=0x8000,event_category=0x2/ Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/debugfs.c | 6 ++++++ drivers/iommu/intel/dmar.c | 7 +++++++ drivers/iommu/intel/svm.c | 1 + include/linux/intel-iommu.h | 4 ++++ 4 files changed, 18 insertions(+) diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index f4c6ef39068e..2c1b9e569816 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -426,6 +426,9 @@ static int invalidation_queue_show(struct seq_file *m, void *unused) (u64)virt_to_phys(qi->desc), dmar_readq(iommu->reg + DMAR_IQH_REG) >> shift, dmar_readq(iommu->reg + DMAR_IQT_REG) >> shift); + seq_printf(m, "No. of received PRQs: %ld\n", iommu->num_prqs); + seq_printf(m, "No. of PRR SUCCESS sent: %ld\n", iommu->num_prrs); + seq_printf(m, "No. of PRR INVALID sent: %ld\n", iommu->num_prri); invalidation_queue_entry_show(m, iommu); raw_spin_unlock_irqrestore(&qi->q_lock, flags); seq_putc(m, '\n'); @@ -709,6 +712,9 @@ static ssize_t dmar_perf_latency_write(struct file *filp, dmar_latency_disable(iommu, DMAR_LATENCY_INV_DEVTLB); dmar_latency_disable(iommu, DMAR_LATENCY_INV_IEC); dmar_latency_disable(iommu, DMAR_LATENCY_PRQ); + iommu->num_prqs = 0; + iommu->num_prrs = 0; + iommu->num_prri = 0; } rcu_read_unlock(); break; diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index c23eeafdf51b..659f0cfc6680 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1413,6 +1413,13 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, wait_desc.qw2 = 0; wait_desc.qw3 = 0; + if ((desc[count - 1].qw0 & 0xF) == 9) { + if ((desc[count - 1].qw0 & 0xF000) == 0) + iommu->num_prrs++; + else if ((desc[count - 1].qw0 & 0xF000) == 0x1000) + iommu->num_prri++; + } + offset = wait_index << shift; memcpy(qi->desc + offset, &wait_desc, 1 << shift); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index d8fcc18926d7..ba7591bf30d0 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -1199,6 +1199,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) vm_fault_t ret; u64 address; + iommu->num_prqs++; handled = 1; req = &iommu->prq[head / sizeof(*req)]; result = QI_RESP_INVALID; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 3e1c59f2b3ce..8c66c3e42fe5 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -604,6 +604,10 @@ struct intel_iommu { unsigned char prq_name[16]; /* Name for PRQ interrupt */ struct completion prq_complete; struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */ + unsigned long num_prqs; /* Page Req */ + unsigned long num_prrs; /* Page Resp SUCCESS */ + unsigned long num_prri; /* Page Resp INVALID */ + #endif struct iopf_queue *iopf_queue; unsigned char iopfq_name[16]; -- Gitee From 3ccf9c4985f4af2e740a9e9a6f652eab9f0d3464 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 20 Apr 2022 19:36:00 +0800 Subject: [PATCH 1933/2161] iommu/ioasid: Fix possible lock race commit 1493db38c706bc56f00fa0fb5aa6e5a2ff238b96 Intel-BKC. 
Fix a A-B/B-A locking issue in the ioasid.c Reported-by: Liu Yi L Signed-off-by: Lu Baolu Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 5fdbb2619cca..587a1b8c5a6d 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -955,6 +955,19 @@ void ioasid_free_all_in_set(struct ioasid_set *set) } EXPORT_SYMBOL_GPL(ioasid_free_all_in_set); +static struct ioasid_set *ioasid_find_mm_set_locked(struct mm_struct *token) +{ + struct ioasid_set *set; + unsigned long index; + + xa_for_each(&ioasid_sets, index, set) { + if (set->type == IOASID_SET_TYPE_MM && set->token == token) + return set; + } + + return NULL; +} + /* * ioasid_find_mm_set - Retrieve IOASID set with mm token * Take a reference of the set if found. @@ -962,17 +975,11 @@ EXPORT_SYMBOL_GPL(ioasid_free_all_in_set); struct ioasid_set *ioasid_find_mm_set(struct mm_struct *token) { struct ioasid_set *set; - unsigned long index; spin_lock(&ioasid_allocator_lock); - - xa_for_each(&ioasid_sets, index, set) { - if (set->type == IOASID_SET_TYPE_MM && set->token == token) - goto exit_unlock; - } - set = NULL; -exit_unlock: + set = ioasid_find_mm_set_locked(token); spin_unlock(&ioasid_allocator_lock); + return set; } EXPORT_SYMBOL_GPL(ioasid_find_mm_set); @@ -1245,6 +1252,7 @@ int ioasid_register_notifier_mm(struct mm_struct *mm, struct notifier_block *nb) struct ioasid_set *set; int ret = 0; + spin_lock(&ioasid_allocator_lock); spin_lock(&ioasid_nb_lock); /* Check for duplicates, nb is unique per set */ list_for_each_entry(curr, &ioasid_nb_pending_list, list) { @@ -1264,7 +1272,7 @@ int ioasid_register_notifier_mm(struct mm_struct *mm, struct notifier_block *nb) curr->nb = nb; /* Check if the token has an existing set */ - set = ioasid_find_mm_set(mm); + set = ioasid_find_mm_set_locked(mm); if (!set) { /* Add to the rsvd list as inactive */ curr->active = false; @@ -1291,6 +1299,7 @@ int ioasid_register_notifier_mm(struct mm_struct *mm, struct notifier_block *nb) kfree(curr); exit_unlock: spin_unlock(&ioasid_nb_lock); + spin_unlock(&ioasid_allocator_lock); return ret; } EXPORT_SYMBOL_GPL(ioasid_register_notifier_mm); -- Gitee From 257928b5a36b6e3f0b2c8c33e772829bb97c5cff Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 21 Apr 2022 16:47:11 +0800 Subject: [PATCH 1934/2161] iommu/ioasid: Simplify spin locks in ioasid.c commit 6d3ec3316961c41e34c1b10672cab134ecbebafc Intel-BKC. Two spin locks are usded in ioasid.c. As all paths are slow ones, use a global one to make things simple. Make sure that the spin lock is held when a notifier is registered/unregistered/notified. 
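The single-lock rework above also adds assert_spin_locked() checks to helpers that now rely on the caller holding the global allocator lock (visible in the diff that follows). A standalone sketch of that convention, with a plain flag and assert() standing in for the real lock:

#include <assert.h>
#include <stdio.h>

static int lock_held;   /* stand-in for the single allocator lock */

static void lock(void)   { lock_held = 1; }
static void unlock(void) { lock_held = 0; }

/* Internal helper: documents and enforces its locking requirement. */
static void notify_locked(const char *what)
{
        assert(lock_held);      /* mirrors the assert_spin_locked() pattern */
        printf("notify: %s\n", what);
}

int main(void)
{
        lock();
        notify_locked("alloc");         /* correct: caller holds the lock */
        unlock();
        /* notify_locked("free"); */    /* would trip the assert */
        return 0;
}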
Signed-off-by: Lu Baolu Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 587a1b8c5a6d..c8eed393d593 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -24,7 +24,6 @@ static ATOMIC_NOTIFIER_HEAD(ioasid_notifier); /* List to hold pending notification block registrations */ static LIST_HEAD(ioasid_nb_pending_list); -static DEFINE_SPINLOCK(ioasid_nb_lock); /* Default to PCIe standard 20 bit PASID */ #define PCI_PASID_MAX 0x100000 @@ -459,6 +458,7 @@ static int ioasid_notify(struct ioasid_data *data, struct ioasid_nb_args args = { 0 }; int ret = 0; + assert_spin_locked(&ioasid_allocator_lock); if (flags & ~(IOASID_NOTIFY_FLAG_ALL | IOASID_NOTIFY_FLAG_SET)) return -EINVAL; @@ -584,13 +584,14 @@ static void ioasid_add_pending_nb(struct ioasid_set *set) { struct ioasid_set_nb *curr; + assert_spin_locked(&ioasid_allocator_lock); + if (set->type != IOASID_SET_TYPE_MM) return; /* * Check if there are any pending nb requests for the given token, if so * add them to the notifier chain. */ - spin_lock(&ioasid_nb_lock); list_for_each_entry(curr, &ioasid_nb_pending_list, list) { if (curr->token == set->token && !curr->active) { atomic_notifier_chain_register(&set->nh, curr->nb); @@ -598,7 +599,6 @@ static void ioasid_add_pending_nb(struct ioasid_set *set) curr->active = true; } } - spin_unlock(&ioasid_nb_lock); } /** @@ -755,9 +755,7 @@ int ioasid_set_free(struct ioasid_set *set) int ret = 0; spin_lock(&ioasid_allocator_lock); - spin_lock(&ioasid_nb_lock); ret = ioasid_set_free_locked(set); - spin_unlock(&ioasid_nb_lock); spin_unlock(&ioasid_allocator_lock); return ret; } @@ -944,13 +942,11 @@ void ioasid_free_all_in_set(struct ioasid_set *set) if (!atomic_read(&set->nr_ioasids)) return; spin_lock(&ioasid_allocator_lock); - spin_lock(&ioasid_nb_lock); xa_for_each(&set->xa, index, entry) { ioasid_free_locked(set, index); /* Free from per set private pool */ xa_erase(&set->xa, index); } - spin_unlock(&ioasid_nb_lock); spin_unlock(&ioasid_allocator_lock); } EXPORT_SYMBOL_GPL(ioasid_free_all_in_set); @@ -1201,10 +1197,16 @@ EXPORT_SYMBOL_GPL(ioasid_find); int ioasid_register_notifier(struct ioasid_set *set, struct notifier_block *nb) { + int ret; + + spin_lock(&ioasid_allocator_lock); if (set) - return atomic_notifier_chain_register(&set->nh, nb); + ret = atomic_notifier_chain_register(&set->nh, nb); else - return atomic_notifier_chain_register(&ioasid_notifier, nb); + ret = atomic_notifier_chain_register(&ioasid_notifier, nb); + spin_unlock(&ioasid_allocator_lock); + + return ret; } EXPORT_SYMBOL_GPL(ioasid_register_notifier); @@ -1213,7 +1215,7 @@ void ioasid_unregister_notifier(struct ioasid_set *set, { struct ioasid_set_nb *curr; - spin_lock(&ioasid_nb_lock); + spin_lock(&ioasid_allocator_lock); /* * Pending list is registered with a token without an ioasid_set, * therefore should not be unregistered directly. 
@@ -1221,16 +1223,16 @@ void ioasid_unregister_notifier(struct ioasid_set *set, list_for_each_entry(curr, &ioasid_nb_pending_list, list) { if (curr->nb == nb) { pr_warn("Cannot unregister NB from pending list\n"); - spin_unlock(&ioasid_nb_lock); - return; + goto out_unlock; } } - spin_unlock(&ioasid_nb_lock); if (set) atomic_notifier_chain_unregister(&set->nh, nb); else atomic_notifier_chain_unregister(&ioasid_notifier, nb); +out_unlock: + spin_unlock(&ioasid_allocator_lock); } EXPORT_SYMBOL_GPL(ioasid_unregister_notifier); @@ -1253,7 +1255,6 @@ int ioasid_register_notifier_mm(struct mm_struct *mm, struct notifier_block *nb) int ret = 0; spin_lock(&ioasid_allocator_lock); - spin_lock(&ioasid_nb_lock); /* Check for duplicates, nb is unique per set */ list_for_each_entry(curr, &ioasid_nb_pending_list, list) { if (curr->token == mm && curr->nb == nb) { @@ -1298,7 +1299,6 @@ int ioasid_register_notifier_mm(struct mm_struct *mm, struct notifier_block *nb) exit_free: kfree(curr); exit_unlock: - spin_unlock(&ioasid_nb_lock); spin_unlock(&ioasid_allocator_lock); return ret; } @@ -1314,21 +1314,19 @@ void ioasid_unregister_notifier_mm(struct mm_struct *mm, struct notifier_block * { struct ioasid_set_nb *curr; - spin_lock(&ioasid_nb_lock); + spin_lock(&ioasid_allocator_lock); list_for_each_entry(curr, &ioasid_nb_pending_list, list) { if (curr->token == mm && curr->nb == nb) { list_del(&curr->list); - spin_unlock(&ioasid_nb_lock); if (curr->active) { atomic_notifier_chain_unregister(&curr->set->nh, nb); } kfree(curr); - return; + break; } } - pr_warn("No ioasid set found for mm token %llx\n", (u64)mm); - spin_unlock(&ioasid_nb_lock); + spin_unlock(&ioasid_allocator_lock); } EXPORT_SYMBOL_GPL(ioasid_unregister_notifier_mm); -- Gitee From b8e302bd2202e7dc53c88c95a6ca0867f8db2138 Mon Sep 17 00:00:00 2001 From: Baolu Lu Date: Thu, 28 Apr 2022 14:22:31 +0800 Subject: [PATCH 1935/2161] iommu/ioasid: Avoid atomic_notifier_chain_unregister() with spinlock commit 0669c81851f6322eaddd1c81515b354e63b2c193 Intel-BKC. atomic_notifier_chain_unregister() will eventually call synchronize_rcu() which is not suitable in a lock critical area. Release the lock before calling atomic_notifier_chain_unregister() and reaquire it after it. The set might be released once we put the lock down when call into atomic_notifier_chain_unregister(). This will result in set->nh an invalid reference. Keep a reference of set before unlock and drop it afterward. Signed-off-by: Lu Baolu Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid.c | 44 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index c8eed393d593..047ad250b441 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -1210,6 +1210,39 @@ int ioasid_register_notifier(struct ioasid_set *set, struct notifier_block *nb) } EXPORT_SYMBOL_GPL(ioasid_register_notifier); +/* + * atomic_notifier_chain_unregister() will eventually call synchronize_rcu() + * which is not suitable in a lock critical area. Release the lock before + * calling atomic_notifier_chain_unregister() and reaquire it after it. + */ +static void unregister_notifier_locked(struct ioasid_set *set, + struct notifier_block *nb) +{ + assert_spin_locked(&ioasid_allocator_lock); + + /* + * The set might be released once we put the lock down and result in + * set->nh an invalid reference. Keep a reference of set before + * unlock and drop it afterward. 
+ */ + if (set) { + atomic_inc(&set->nr_ioasids); + + spin_unlock(&ioasid_allocator_lock); + atomic_notifier_chain_unregister(&set->nh, nb); + spin_lock(&ioasid_allocator_lock); + + if (atomic_dec_and_test(&set->nr_ioasids)) + ioasid_set_free_locked(set); + + return; + } + + spin_unlock(&ioasid_allocator_lock); + atomic_notifier_chain_unregister(&ioasid_notifier, nb); + spin_lock(&ioasid_allocator_lock); +} + void ioasid_unregister_notifier(struct ioasid_set *set, struct notifier_block *nb) { @@ -1227,10 +1260,7 @@ void ioasid_unregister_notifier(struct ioasid_set *set, } } - if (set) - atomic_notifier_chain_unregister(&set->nh, nb); - else - atomic_notifier_chain_unregister(&ioasid_notifier, nb); + unregister_notifier_locked(set, nb); out_unlock: spin_unlock(&ioasid_allocator_lock); } @@ -1318,10 +1348,8 @@ void ioasid_unregister_notifier_mm(struct mm_struct *mm, struct notifier_block * list_for_each_entry(curr, &ioasid_nb_pending_list, list) { if (curr->token == mm && curr->nb == nb) { list_del(&curr->list); - if (curr->active) { - atomic_notifier_chain_unregister(&curr->set->nh, - nb); - } + if (curr->active) + unregister_notifier_locked(curr->set, nb); kfree(curr); break; } -- Gitee From 508df306050f198e31939bf9e6a59753edcbf535 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 28 Apr 2022 15:50:13 -0700 Subject: [PATCH 1936/2161] ioasid: Reject null notifier call registration commit 888f44bd1b53913f7d809369489fd7daae5ae23c Intel-BKC. Booting multiple VMs causes Invalid notifier called! call trace. Error message below observed during boot: [ 638.774647] Invalid notifier called! [ 638.774669] WARNING: CPU: 319 PID: 46882 at kernel/notifier.c:78 notifier_call_chain+0x79/0xb0 [ 638.870074] RIP: 0010:notifier_call_chain+0x79/0xb0 [ 638.972742] atomic_notifier_call_chain+0x17/0x30 [ 638.989967] ioasid_notify+0x77/0xe0 [ 638.989987] ioasid_alloc+0x19e/0x230 [ 639.003162] ioasid_fops_unl_ioctl+0xb4/0x1b0 This is due to early registration of NULL callback function pointer. [ 613.023815] ioasid_add_pending_nb: nh ff37c3c3913de380 b call ffffffffc115a770, nr_ioasids 0 [ 613.023883] ioasid_add_pending_nb: nh ff37c3c3913de380 b call 0, It is unclear who registered the invalid callback, this patch fixes the issue by rejecting the registration. Dump stack should show the culprit. It is also possible that nb got cleared somehow outside ioasid core. 
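The guard itself is small; a simplified sketch of the check this patch adds at registration time (the wrapper function is hypothetical, the test mirrors the hunk below):

  #include <linux/kernel.h>
  #include <linux/errno.h>
  #include <linux/notifier.h>
  #include <asm/sections.h>   /* func_ptr_is_kernel_text(), assumed header */

  static int ioasid_nb_callback_ok(struct notifier_block *nb)
  {
          if (unlikely(!nb->notifier_call ||
                       !func_ptr_is_kernel_text(nb->notifier_call))) {
                  pr_warn("Registering invalid callback!\n");
                  dump_stack();           /* show who tried to register it */
                  return -EINVAL;
          }
          return 0;
  }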
Signed-off-by: Jacob Pan Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/ioasid.c | 10 ++++++++++ kernel/notifier.c | 1 + 2 files changed, 11 insertions(+) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 047ad250b441..31030330df94 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -594,6 +594,10 @@ static void ioasid_add_pending_nb(struct ioasid_set *set) */ list_for_each_entry(curr, &ioasid_nb_pending_list, list) { if (curr->token == set->token && !curr->active) { + if (unlikely(!func_ptr_is_kernel_text(curr->nb->notifier_call))) { + pr_warn("Invalid notifier callback"); + continue; + } atomic_notifier_chain_register(&set->nh, curr->nb); curr->set = set; curr->active = true; @@ -1293,6 +1297,12 @@ int ioasid_register_notifier_mm(struct mm_struct *mm, struct notifier_block *nb) } } + if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { + pr_warn("Registering invalid callback!"); + dump_stack(); + ret = -EINVAL; + goto exit_unlock; + } curr = kzalloc(sizeof(*curr), GFP_ATOMIC); if (!curr) { ret = -ENOMEM; diff --git a/kernel/notifier.c b/kernel/notifier.c index f6d5ffe4e72e..c6352021ca84 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -91,6 +91,7 @@ static int notifier_call_chain(struct notifier_block **nl, #ifdef CONFIG_DEBUG_NOTIFIERS if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { WARN(1, "Invalid notifier called!"); + pr_alert("%s: nb call %llx", __func__, (u64)nb->notifier_call); nb = next_nb; continue; } -- Gitee From ff0755fb557f382ae9110a94c67644a2f50d16c4 Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Wed, 11 May 2022 22:28:38 +0800 Subject: [PATCH 1937/2161] kvm: Refine codes to make ABI compatible commit b2e2b01ca3e9de1fdd1f4d7a79451b2ea12b1f4c Intel-BKC. 
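Judging from the hunks below, the ABI concern appears to be layout: the data field goes back to the standard 8 bytes at its original offset, and fragments larger than 8 bytes are carried in a new np_data[] array appended after is_write, so the front of struct kvm_run's mmio exit stays binary compatible. The resulting KVM_EXIT_MMIO layout:

  struct {                        /* KVM_EXIT_MMIO */
          __u64 phys_addr;
          __u8  data[8];          /* legacy field, offset and size unchanged */
          __u32 len;
          __u8  is_write;
          __u8  np_data[64];      /* new: payloads larger than 8 bytes */
  } mmio;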
Signed-off-by: Yi Sun Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 15 ++++++++++++--- include/uapi/linux/kvm.h | 5 +++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7a3599b35b07..59edbcc6a9d3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5891,7 +5891,11 @@ static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, { struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; - memcpy(vcpu->run->mmio.data, frag->data, min(64u, frag->len)); + if (frag->len > 8) { + memcpy(vcpu->run->mmio.np_data, frag->data, min(64u, frag->len)); + } else { + memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len)); + } return X86EMUL_CONTINUE; } @@ -8755,8 +8759,13 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) run->exit_reason = KVM_EXIT_MMIO; run->mmio.phys_addr = frag->gpa; - if (vcpu->mmio_is_write) - memcpy(run->mmio.data, frag->data, min(64u, frag->len)); + if (vcpu->mmio_is_write) { + if (frag->len > 8) { + memcpy(run->mmio.np_data, frag->data, min(64u, frag->len)); + } else { + memcpy(run->mmio.data, frag->data, min(8u, frag->len)); + } + } run->mmio.len = min(64u, frag->len); run->mmio.is_write = vcpu->mmio_is_write; vcpu->arch.complete_userspace_io = complete_emulated_mmio; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 3afc10fe4864..c49bf3a938f0 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -301,9 +301,10 @@ struct kvm_run { /* KVM_EXIT_MMIO */ struct { __u64 phys_addr; - __u8 data[64]; + __u8 data[8]; __u32 len; __u8 is_write; + __u8 np_data[64]; } mmio; /* KVM_EXIT_HYPERCALL */ struct { @@ -437,7 +438,7 @@ struct kvm_coalesced_mmio { __u32 pad; __u32 pio; }; - __u8 data[64]; + __u8 data[8]; }; struct kvm_coalesced_mmio_ring { -- Gitee From 28459e07f783c05b04594f62fc905db21f6ebc8a Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 26 May 2022 12:36:59 -0700 Subject: [PATCH 1938/2161] crypto: iax: Break out of loop after list del in del_iax_wq() commit b0610a9b853a1d891af76c2fb36b1ad66042d606 Intel-BKC. Once the element has been deleted, there's no reason to continue iterating the list, just break out. Signed-off-by: Tom Zanussi Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/crypto/iax/iax_crypto_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/iax/iax_crypto_main.c b/drivers/crypto/iax/iax_crypto_main.c index 45d2a3c6f54e..be5810b14e7b 100644 --- a/drivers/crypto/iax/iax_crypto_main.c +++ b/drivers/crypto/iax/iax_crypto_main.c @@ -411,10 +411,9 @@ static void del_iax_wq(struct iax_device *iax_device, struct idxd_wq *wq) pr_debug("%s: removed wq %p from iax_device %p, n_wq %d, nr_iax %d\n", __func__, wq, iax_device, iax_device->n_wq, nr_iax); - if (iax_device->n_wq == 0) { + if (iax_device->n_wq == 0) del_iax_device(iax_device); - break; - } + break; } } } -- Gitee From 4ef0bcb1fa7a3f50952286e19f82cb1988a6543f Mon Sep 17 00:00:00 2001 From: Dong Xiaocheng Date: Tue, 31 May 2022 05:33:07 -0400 Subject: [PATCH 1939/2161] kvm/x86: Fix MMIO data mismatch issue commit 5df07d3721abfc59329905e7693b0c6a76f8910e Intel-BKC. The commit `b2e2b01ca3e9` introduced mmio.np_data for the data length > 8 and covered write operations. 
There are some non write operations should also be covered, otherwise, `Unhandled exception 13 #GP` will be thrown when some SIMD instructions try to access MMIO Signed-off-by: Dong Xiaocheng Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 59edbcc6a9d3..f98ada15d43b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8729,8 +8729,13 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) /* Complete previous fragment */ frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment]; len = min(64u, frag->len); - if (!vcpu->mmio_is_write) - memcpy(frag->data, run->mmio.data, len); + if (!vcpu->mmio_is_write) { + if (len > 8) { + memcpy(frag->data, run->mmio.np_data, len); + } else { + memcpy(frag->data, run->mmio.data, len); + } + } if (frag->len <= 8) { /* Switch to the next fragment. */ -- Gitee From 0437a3f4d654e62e6643ed02773d931d74660f25 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 15 Jun 2022 09:01:07 +0800 Subject: [PATCH 1940/2161] iommu/vt-d: Size Page Request Queue to avoid overflow condition commit ea661ad6e1573d5b08c27444ff2ed403bf39ff66 upstream. PRQ overflow may cause I/O throughput congestion, resulting in unnecessary degradation of I/O performance. Appropriately increasing the length of PRQ can greatly reduce the occurrence of PRQ overflow. The count of maximum page requests that can be generated in parallel by a PCIe device is statically defined in the Outstanding Page Request Capacity field of the PCIe ATS configure space. The new length of PRQ is calculated by summing up the value of Outstanding Page Request Capacity register across all devices where Page Requests are supported on the real PR-capable platform (Intel Sapphire Rapids). The result is round to the nearest higher power of 2. The PRQ length is also double sized as the VT-d IOMMU driver only updates the Page Request Queue Head Register (PQH_REG) after processing the entire queue. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20220421113558.3504874-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20220510023407.2759143-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 4e7553acd2c8..135c0867fd3b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -65,7 +65,7 @@ static int prs_allocation = 32; * in total to prevent PRQ overflow in the worst case. * For default 32 entries per dev,8 KB should be enough. */ -int prq_size_page_order = 1; +int prq_size_page_order = 4; #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) -- Gitee From edd4e2248f6d402a8a3b6d71627af34a35832232 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 1 Feb 2021 17:06:23 -0800 Subject: [PATCH 1941/2161] iommu: Properly pass gfp_t in _iommu_map() to avoid atomic sleeping commit b8437a3ef8c485903d05d1f261328aaf0c0a6cc2 upstream. Sleeping while atomic = bad. Let's fix an obvious typo to try to avoid it. 
The warning that was seen (on a downstream kernel with the problematic patch backported): BUG: sleeping function called from invalid context at mm/page_alloc.c:4726 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 9, name: ksoftirqd/0 CPU: 0 PID: 9 Comm: ksoftirqd/0 Not tainted 5.4.93-12508-gc10c93e28e39 #1 Call trace: dump_backtrace+0x0/0x154 show_stack+0x20/0x2c dump_stack+0xa0/0xfc ___might_sleep+0x11c/0x12c __might_sleep+0x50/0x84 __alloc_pages_nodemask+0xf8/0x2bc __arm_lpae_alloc_pages+0x48/0x1b4 __arm_lpae_map+0x124/0x274 __arm_lpae_map+0x1cc/0x274 arm_lpae_map+0x140/0x170 arm_smmu_map+0x78/0xbc __iommu_map+0xd4/0x210 _iommu_map+0x4c/0x84 iommu_map_atomic+0x44/0x58 __iommu_dma_map+0x8c/0xc4 iommu_dma_map_page+0xac/0xf0 Fixes: d8c1df02ac7f ("iommu: Move iotlb_sync_map out from __iommu_map") Signed-off-by: Douglas Anderson Reviewed-by: Yong Wu Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210201170611.1.I64a7b62579287d668d7c89e105dcedf45d641063@changeid Signed-off-by: Joerg Roedel Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 01b69495a776..4b54ac63ad03 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2843,7 +2843,7 @@ static int _iommu_map(struct iommu_domain *domain, unsigned long iova, const struct iommu_ops *ops = domain->ops; int ret; - ret = __iommu_map(domain, iova, paddr, size, prot, GFP_KERNEL); + ret = __iommu_map(domain, iova, paddr, size, prot, gfp); if (ret == 0 && ops->iotlb_sync_map) ops->iotlb_sync_map(domain, iova, size); -- Gitee From 52fcced248876551c2a19b3d6bb144276e0dd2be Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 20 Jul 2022 11:05:45 +0800 Subject: [PATCH 1942/2161] Revert "iommu: remove DOMAIN_ATTR_GEOMETRY" commit opencloudos. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 20 +++++++++++++++++--- include/linux/iommu.h | 1 + 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 4b54ac63ad03..b5ecafe0ed76 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3207,9 +3207,23 @@ EXPORT_SYMBOL_GPL(iommu_domain_set_hwdbm); int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr attr, void *data) { - if (!domain->ops->domain_get_attr) - return -EINVAL; - return domain->ops->domain_get_attr(domain, attr, data); + struct iommu_domain_geometry *geometry; + int ret = 0; + + switch (attr) { + case DOMAIN_ATTR_GEOMETRY: + geometry = data; + *geometry = domain->geometry; + + break; + default: + if (!domain->ops->domain_get_attr) + return -EINVAL; + + ret = domain->ops->domain_get_attr(domain, attr, data); + } + + return ret; } EXPORT_SYMBOL_GPL(iommu_domain_get_attr); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index cf86f7d8fd07..3e8dc880fd44 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -132,6 +132,7 @@ enum iommu_cap { */ enum iommu_attr { + DOMAIN_ATTR_GEOMETRY, DOMAIN_ATTR_WINDOWS, DOMAIN_ATTR_FSL_PAMU_STASH, DOMAIN_ATTR_FSL_PAMU_ENABLE, -- Gitee From b1fdf696afd0a60e88617384f8baecec98aa89a9 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 20 Jul 2022 11:13:21 +0800 Subject: [PATCH 1943/2161] Revert "iommu: remove DOMAIN_ATTR_PAGING" commit opencloudos. 
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/iommu.c | 5 +++++ include/linux/iommu.h | 1 + 2 files changed, 6 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b5ecafe0ed76..56f6d8e80799 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3208,6 +3208,7 @@ int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr attr, void *data) { struct iommu_domain_geometry *geometry; + bool *paging; int ret = 0; switch (attr) { @@ -3215,6 +3216,10 @@ int iommu_domain_get_attr(struct iommu_domain *domain, geometry = data; *geometry = domain->geometry; + break; + case DOMAIN_ATTR_PAGING: + paging = data; + *paging = (domain->pgsize_bitmap != 0UL); break; default: if (!domain->ops->domain_get_attr) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3e8dc880fd44..4fb3bbe07f2e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -133,6 +133,7 @@ enum iommu_cap { enum iommu_attr { DOMAIN_ATTR_GEOMETRY, + DOMAIN_ATTR_PAGING, DOMAIN_ATTR_WINDOWS, DOMAIN_ATTR_FSL_PAMU_STASH, DOMAIN_ATTR_FSL_PAMU_ENABLE, -- Gitee From 032d459aa7bd6051dcd5c5de082985871c80146a Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 20 Jul 2022 15:43:11 +0800 Subject: [PATCH 1944/2161] iommu: Switch gather->end to the inclusive end commit 862c3715de8f3e5350489240c951d697f04bd8c9 upstream. Currently gather->end is "unsigned long" which may be overflow in arch32 in the corner case: 0xfff00000 + 0x100000(iova + size). Although it doesn't affect the size(end - start), it affects the checking "gather->end < end" This patch changes this "end" to the real end address (end = start + size - 1). Correspondingly, update the length to "end - start + 1". Fixes: a7d20dc19d9e ("iommu: Introduce struct iommu_iotlb_gather for batching TLB flushes") Signed-off-by: Yong Wu Reviewed-by: Robin Murphy Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210107122909.16317-5-yong.wu@mediatek.com Signed-off-by: Will Deacon Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/tegra-gart.c | 2 +- include/linux/iommu.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c index a446a6aee71d..a52ed43591ea 100644 --- a/drivers/iommu/tegra-gart.c +++ b/drivers/iommu/tegra-gart.c @@ -282,7 +282,7 @@ static void gart_iommu_sync_map(struct iommu_domain *domain, unsigned long iova, static void gart_iommu_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { - size_t length = gather->end - gather->start; + size_t length = gather->end - gather->start + 1; gart_iommu_sync_map(domain, gather->start, length); } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 4fb3bbe07f2e..17f9776f9ada 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -192,7 +192,7 @@ enum iommu_dev_features { * struct iommu_iotlb_gather - Range information for a pending IOTLB flush * * @start: IOVA representing the start of the range to be flushed - * @end: IOVA representing the end of the range to be flushed (exclusive) + * @end: IOVA representing the end of the range to be flushed (inclusive) * @pgsize: The interval at which to perform the flush * @freelist: Removed pages to free after sync * @queued: Indicates that the flush will be queued @@ -635,7 +635,7 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, struct iommu_iotlb_gather *gather, unsigned long iova, size_t size) { - unsigned long start = iova, end = start + size; + 
unsigned long start = iova, end = start + size - 1; /* * If the new page is disjoint from the current range or is mapped at -- Gitee From 275943862e65caed92df3939b79d5458231af611 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 20 Jul 2022 17:34:39 +0800 Subject: [PATCH 1945/2161] iommu: Remove unused functions in /drivers/iommu/intel/svm.c commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 74 --------------------------------------- 1 file changed, 74 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index ba7591bf30d0..f27732863c68 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -28,44 +28,9 @@ static irqreturn_t prq_event_thread(int irq, void *d); static void intel_svm_drain_prq(struct device *dev, u32 pasid); -#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva) extern int prq_size_page_order; -static DEFINE_XARRAY_ALLOC(pasid_private_array); -static int pasid_private_add(ioasid_t pasid, void *priv) -{ - return xa_alloc(&pasid_private_array, &pasid, priv, - XA_LIMIT(pasid, pasid), GFP_ATOMIC); -} - -static void pasid_private_remove(ioasid_t pasid) -{ - xa_erase(&pasid_private_array, pasid); -} - -static void *pasid_private_find(ioasid_t pasid) -{ - return xa_load(&pasid_private_array, pasid); -} - -static struct intel_svm_dev * -svm_lookup_device_by_sid(struct intel_svm *svm, u16 sid) -{ - struct intel_svm_dev *sdev = NULL, *t; - - rcu_read_lock(); - list_for_each_entry_rcu(t, &svm->devs, list) { - if (t->sid == sid) { - sdev = t; - break; - } - } - rcu_read_unlock(); - - return sdev; -} - static struct intel_svm_dev * svm_lookup_device_by_dev(struct intel_svm *svm, struct device *dev) { @@ -1134,45 +1099,6 @@ intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc) return iommu_report_device_fault(dev, &event); } -static void handle_bad_prq_event(struct intel_iommu *iommu, - struct page_req_dsc *req, int result) -{ - struct qi_desc desc; - - pr_err("%s: Invalid page request: %08llx %08llx\n", - iommu->name, ((unsigned long long *)req)[0], - ((unsigned long long *)req)[1]); - - /* - * Per VT-d spec. v3.0 ch7.7, system software must - * respond with page group response if private data - * is present (PDP) or last page in group (LPIG) bit - * is set. This is an additional VT-d feature beyond - * PCI ATS spec. - */ - if (!req->lpig && !req->priv_data_present) - return; - - desc.qw0 = QI_PGRP_PASID(req->pasid) | - QI_PGRP_DID(req->rid) | - QI_PGRP_PASID_P(req->pasid_present) | - QI_PGRP_PDP(req->priv_data_present) | - QI_PGRP_RESP_CODE(result) | - QI_PGRP_RESP_TYPE; - desc.qw1 = QI_PGRP_IDX(req->prg_index) | - QI_PGRP_LPIG(req->lpig); - - if (req->priv_data_present) { - desc.qw2 = req->priv_data[0]; - desc.qw3 = req->priv_data[1]; - } else { - desc.qw2 = 0; - desc.qw3 = 0; - } - - qi_submit_sync(iommu, &desc, 1, 0); -} - static irqreturn_t prq_event_thread(int irq, void *d) { struct intel_svm_dev *sdev = NULL; -- Gitee From 12e7b208e14eac2d6d7f09081c62ac09ae6da4eb Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 20 Jul 2022 17:44:21 +0800 Subject: [PATCH 1946/2161] iommu: Resolve merge conflict for pasid_to_svm_sdev() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. 
Resolve merge conflict for pasid_to_svm_sdev(), Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index f27732863c68..dbf536b8c021 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -31,23 +31,6 @@ static void intel_svm_drain_prq(struct device *dev, u32 pasid); extern int prq_size_page_order; -static struct intel_svm_dev * -svm_lookup_device_by_dev(struct intel_svm *svm, struct device *dev) -{ - struct intel_svm_dev *sdev = NULL, *t; - - rcu_read_lock(); - list_for_each_entry_rcu(t, &svm->devs, list) { - if (t->dev == dev) { - sdev = t; - break; - } - } - rcu_read_unlock(); - - return sdev; -} - int intel_svm_enable_prq(struct intel_iommu *iommu) { struct page *pages; @@ -360,7 +343,7 @@ static int pasid_to_svm_sdev(struct device *dev, struct intel_svm **rsvm, struct intel_svm_dev **rsdev) { - struct intel_svm_dev *sdev = NULL; + struct intel_svm_dev *d, *sdev = NULL; struct intel_svm *svm; /* The caller should hold the pasid_mutex lock */ @@ -388,7 +371,15 @@ static int pasid_to_svm_sdev(struct device *dev, */ if (WARN_ON(list_empty(&svm->devs))) return -EINVAL; - sdev = svm_lookup_device_by_dev(svm, dev); + + rcu_read_lock(); + list_for_each_entry_rcu(d, &svm->devs, list) { + if (d->dev == dev) { + sdev = d; + break; + } + } + rcu_read_unlock(); out: *rsvm = svm; -- Gitee From b2237a287d42fe1ca902f2025f322e3b8df81ef2 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 31 Aug 2022 20:35:17 +0800 Subject: [PATCH 1947/2161] iommu: Resolve merge conflicts for VCMD registers commit opencloudos. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.h | 10 +++++----- include/linux/intel-iommu.h | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index cc9149208792..71057dc74813 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -27,12 +27,12 @@ #define VCMD_CMD_ALLOC 0x1 #define VCMD_CMD_FREE 0x2 #define VCMD_VRSP_IP 0x1 -#define VCMD_VRSP_SC(e) (((e) & 0xff) >> 1) +#define VCMD_VRSP_SC(e) (((e) >> 1) & 0x3) #define VCMD_VRSP_SC_SUCCESS 0 -#define VCMD_VRSP_SC_NO_PASID_AVAIL 16 -#define VCMD_VRSP_SC_INVALID_PASID 16 -#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 16) & 0xfffff) -#define VCMD_CMD_OPERAND(e) ((e) << 16) +#define VCMD_VRSP_SC_NO_PASID_AVAIL 2 +#define VCMD_VRSP_SC_INVALID_PASID 2 +#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 8) & 0xfffff) +#define VCMD_CMD_OPERAND(e) ((e) << 8) /* * Domain ID reserved for pasid entries programmed for first-level * only and pass-through transfer modes. 
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 8c66c3e42fe5..8af9abb73b6c 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -125,9 +125,9 @@ #define DMAR_MTRR_PHYSMASK8_REG 0x208 #define DMAR_MTRR_PHYSBASE9_REG 0x210 #define DMAR_MTRR_PHYSMASK9_REG 0x218 -#define DMAR_VCCAP_REG 0xe30 /* Virtual command capability register */ -#define DMAR_VCMD_REG 0xe00 /* Virtual command register */ -#define DMAR_VCRSP_REG 0xe10 /* Virtual command response register */ +#define DMAR_VCCAP_REG 0xe00 /* Virtual command capability register */ +#define DMAR_VCMD_REG 0xe10 /* Virtual command register */ +#define DMAR_VCRSP_REG 0xe20 /* Virtual command response register */ #define DMAR_IQER_REG_IQEI(reg) FIELD_GET(GENMASK_ULL(3, 0), reg) #define DMAR_IQER_REG_ITESID(reg) FIELD_GET(GENMASK_ULL(47, 32), reg) -- Gitee From 6bc45b0b40470f067b806cb9f01f83a53f6ddd76 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 21 Jul 2022 17:30:37 +0800 Subject: [PATCH 1948/2161] iommu: Restore map/unmap() callbacks for intel_iommu_ops commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 37 ++----------------------------------- include/linux/iommu.h | 9 --------- 2 files changed, 2 insertions(+), 44 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 135c0867fd3b..aef23cbfd508 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5353,28 +5353,6 @@ static int intel_iommu_map(struct iommu_domain *domain, hpa >> VTD_PAGE_SHIFT, size, prot); } -static int intel_iommu_map_pages(struct iommu_domain *domain, - unsigned long iova, phys_addr_t paddr, - size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) -{ - unsigned long pgshift = __ffs(pgsize); - size_t size = pgcount << pgshift; - int ret; - - if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) - return -EINVAL; - - if (!IS_ALIGNED(iova | paddr, pgsize)) - return -EINVAL; - - ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); - if (!ret && mapped) - *mapped = size; - - return ret; -} - static size_t intel_iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *gather) @@ -5404,17 +5382,6 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain, return size; } -static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, - unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - unsigned long pgshift = __ffs(pgsize); - size_t size = pgcount << pgshift; - - return intel_iommu_unmap(domain, iova, size, gather); -} - static void intel_iommu_tlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { @@ -6579,9 +6546,9 @@ const struct iommu_ops intel_iommu_ops = { .aux_attach_dev = intel_iommu_aux_attach_device, .aux_detach_dev = intel_iommu_aux_detach_device, .aux_get_pasid = intel_iommu_aux_get_pasid, - .map_pages = intel_iommu_map_pages, - .unmap_pages = intel_iommu_unmap_pages, + .map = intel_iommu_map, .iotlb_sync_map = intel_iommu_iotlb_sync_map, + .unmap = intel_iommu_unmap, .flush_iotlb_all = intel_flush_iotlb_all, .iotlb_sync = intel_iommu_tlb_sync, .iova_to_phys = intel_iommu_iova_to_phys, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 17f9776f9ada..51e0ebfa9639 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -220,10 +220,7 @@ struct iommu_iotlb_gather { * @attach_dev: attach device to an 
iommu domain * @detach_dev: detach device from an iommu domain * @map: map a physically contiguous memory region to an iommu domain - * @map_pages: map a physically contiguous set of pages of the same size to - * an iommu domain. * @unmap: unmap a physically contiguous memory region from an iommu domain - * @unmap_pages: unmap a number of pages of the same size from an iommu domain * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain * @iotlb_sync_map: Sync mappings created recently using @map to the hardware * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush @@ -277,14 +274,8 @@ struct iommu_ops { void (*detach_dev)(struct iommu_domain *domain, struct device *dev); int (*map)(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp); - int (*map_pages)(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped); size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); - size_t (*unmap_pages)(struct iommu_domain *domain, unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *iotlb_gather); void (*flush_iotlb_all)(struct iommu_domain *domain); void (*iotlb_sync_map)(struct iommu_domain *domain, unsigned long iova, size_t size); -- Gitee From 24e6fa8689fbdaba71e4869ef390662cbb9117bc Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 26 Jul 2022 17:11:29 +0800 Subject: [PATCH 1949/2161] iommu/vt-d: Fix RID2PASID setup failure based on BKC commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/pasid.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 5433d0328906..42bd49b119af 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -577,10 +577,6 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, if (WARN_ON(!pte)) return -EINVAL; - /* Caller must ensure PASID entry is not in use. */ - if (pasid_pte_is_present(pte)) - return -EBUSY; - pasid_clear_entry(pte); /* Setup the first level page table pointer: */ @@ -680,10 +676,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return -ENODEV; } - /* Caller must ensure PASID entry is not in use. */ - if (pasid_pte_is_present(pte)) - return -EBUSY; - pasid_clear_entry(pte); pasid_set_domain_id(pte, did); pasid_set_slptr(pte, pgd_val); -- Gitee From 8a3dfc86ae88c5268a5e9792f80a3e0a194c9d02 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 27 Jul 2022 16:32:06 +0800 Subject: [PATCH 1950/2161] iommu: Restore domain_get_pasid() commit 77d7c99a6b30e30d436b01b1af302d9e56b985f8 Intel-BKC. Restore domain_get_pasid(). 
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/iommu/intel/iommu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index aef23cbfd508..5d06cc2b740c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5736,10 +5736,12 @@ intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) int domain_get_pasid(struct iommu_domain *domain, struct device *dev) { + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + if (is_aux_domain(dev, domain)) return intel_iommu_aux_get_pasid(domain, dev); - return PASID_RID2PASID; + return dmar_domain->default_pasid; } static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain, -- Gitee From e8e02e0ff97534f71e15829927e7445f13006c3a Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 28 Jul 2022 11:30:30 +0800 Subject: [PATCH 1951/2161] dmaengine: idxd: Fix module order for idxd_bus commit opencloudos. Bring module idxd_bus to front of idxd, otherwise idxd will register failed. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/dma/idxd/Makefile b/drivers/dma/idxd/Makefile index a1e9f2b3a37c..817ffa95a9b1 100644 --- a/drivers/dma/idxd/Makefile +++ b/drivers/dma/idxd/Makefile @@ -1,12 +1,12 @@ ccflags-y += -DDEFAULT_SYMBOL_NAMESPACE=IDXD +obj-$(CONFIG_INTEL_IDXD_BUS) += idxd_bus.o +idxd_bus-y := bus.o + obj-$(CONFIG_INTEL_IDXD) += idxd.o idxd-y := init.o irq.o device.o sysfs.o submit.o dma.o cdev.o idxd-$(CONFIG_INTEL_IDXD_PERFMON) += perfmon.o -obj-$(CONFIG_INTEL_IDXD_BUS) += idxd_bus.o -idxd_bus-y := bus.o - obj-$(CONFIG_INTEL_IDXD_COMPAT) += idxd_compat.o idxd_compat-y := compat.o -- Gitee From 8f7710427aaaeb7c2739506235283d70bd41c916 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 11 Aug 2022 11:20:29 +0800 Subject: [PATCH 1952/2161] crypto: enable CONFIG_CRYPTO_DEFLATE commit opencloudos. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/configs/oc.config | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/configs/oc.config b/arch/x86/configs/oc.config index f37fc4e31fed..568e482a4a5b 100644 --- a/arch/x86/configs/oc.config +++ b/arch/x86/configs/oc.config @@ -1443,6 +1443,7 @@ CONFIG_SECURITY_YAMA=y # CONFIG_INTEGRITY is not set CONFIG_LSM="lockdown,yama,loadpin,safesetid,integrity,selinux" CONFIG_CRYPTO_USER=m +CONFIG_CRYPTO_DEFLATE=y CONFIG_CRYPTO_PCRYPT=m CONFIG_CRYPTO_TEST=m CONFIG_CRYPTO_RSA=y -- Gitee From 89e6d9092a86ee6ee32994af878c201da69b78ca Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 12 Aug 2022 15:41:12 +0800 Subject: [PATCH 1953/2161] x86: get rid of get_user_ex() in restore_sigcontext() commit 978727ca331ebd8b479f6a7afd27bb2e6504b2e3 upstream. Just do copyin into a local struct and be done with that - we are on a shallow stack here. 
[reworked by tglx, removing the macro horrors while we are touching that] Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/ia32/ia32_signal.c | 105 ++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 60 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 97c4a29ed1f4..57c58375b737 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -35,70 +35,57 @@ #include #include +static inline void reload_segments(struct sigcontext_32 *sc) +{ + unsigned int cur; + + savesegment(gs, cur); + if ((sc->gs | 0x03) != cur) + load_gs_index(sc->gs | 0x03); + savesegment(fs, cur); + if ((sc->fs | 0x03) != cur) + loadsegment(fs, sc->fs | 0x03); + savesegment(ds, cur); + if ((sc->ds | 0x03) != cur) + loadsegment(ds, sc->ds | 0x03); + savesegment(es, cur); + if ((sc->es | 0x03) != cur) + loadsegment(es, sc->es | 0x03); +} + /* * Do a signal return; undo the signal stack. */ -#define loadsegment_gs(v) load_gs_index(v) -#define loadsegment_fs(v) loadsegment(fs, v) -#define loadsegment_ds(v) loadsegment(ds, v) -#define loadsegment_es(v) loadsegment(es, v) - -#define get_user_seg(seg) ({ unsigned int v; savesegment(seg, v); v; }) -#define set_user_seg(seg, v) loadsegment_##seg(v) - -#define COPY(x) { \ - get_user_ex(regs->x, &sc->x); \ -} - -#define GET_SEG(seg) ({ \ - unsigned short tmp; \ - get_user_ex(tmp, &sc->seg); \ - tmp; \ -}) - -#define COPY_SEG_CPL3(seg) do { \ - regs->seg = GET_SEG(seg) | 3; \ -} while (0) - -#define RELOAD_SEG(seg) { \ - unsigned int pre = (seg) | 3; \ - unsigned int cur = get_user_seg(seg); \ - if (pre != cur) \ - set_user_seg(seg, pre); \ -} - static int ia32_restore_sigcontext(struct pt_regs *regs, - struct sigcontext_32 __user *sc) + struct sigcontext_32 __user *usc) { - unsigned int tmpflags, err = 0; - u16 gs, fs, es, ds; - void __user *buf; - u32 tmp; + struct sigcontext_32 sc; + unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; - get_user_try { - gs = GET_SEG(gs); - fs = GET_SEG(fs); - ds = GET_SEG(ds); - es = GET_SEG(es); - - COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); - COPY(dx); COPY(cx); COPY(ip); COPY(ax); - /* Don't touch extended registers */ - - COPY_SEG_CPL3(cs); - COPY_SEG_CPL3(ss); - - get_user_ex(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - /* disable syscall checks */ - regs->orig_ax = -1; + if (unlikely(copy_from_user(&sc, usc, sizeof(sc)))) + return -EFAULT; - get_user_ex(tmp, &sc->fpstate); - buf = compat_ptr(tmp); - } get_user_catch(err); + /* Get only the ia32 registers. */ + regs->bx = sc.bx; + regs->cx = sc.cx; + regs->dx = sc.dx; + regs->si = sc.si; + regs->di = sc.di; + regs->bp = sc.bp; + regs->ax = sc.ax; + regs->sp = sc.sp; + regs->ip = sc.ip; + + /* Get CS/SS and force CPL3 */ + regs->cs = sc.cs | 0x03; + regs->ss = sc.ss | 0x03; + + regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); + /* disable syscall checks */ + regs->orig_ax = -1; /* * Reload fs and gs if they have changed in the signal @@ -106,12 +93,8 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, * the handler, but does not clobber them at least in the * normal case. 
*/ - RELOAD_SEG(gs); - RELOAD_SEG(fs); - RELOAD_SEG(ds); - RELOAD_SEG(es); - - err |= fpu__restore_sig(buf, 1); + reload_segments(&sc); + err = fpu__restore_sig(compat_ptr(sc.fpstate), 1); force_iret(); @@ -176,6 +159,8 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) * Set up a signal frame. */ +#define get_user_seg(seg) ({ unsigned int v; savesegment(seg, v); v; }) + static int ia32_setup_sigcontext(struct sigcontext_32 __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned int mask) -- Gitee From fdcdefb43ad386691172d9198feaf282e75529ee Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 12 Aug 2022 15:23:08 +0800 Subject: [PATCH 1954/2161] x86/signal: Change return type of restore_sigcontext() to boolean commit ee4ecdfbd28954086a09740dc931c10c93e39370 upstream. Append changes for arch/x86/ia32/ia32_signal.c. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/ia32/ia32_signal.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 57c58375b737..a299fbab1058 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -56,8 +56,8 @@ static inline void reload_segments(struct sigcontext_32 *sc) /* * Do a signal return; undo the signal stack. */ -static int ia32_restore_sigcontext(struct pt_regs *regs, - struct sigcontext_32 __user *usc) +static bool ia32_restore_sigcontext(struct pt_regs *regs, + struct sigcontext_32 __user *usc) { struct sigcontext_32 sc; unsigned int err = 0; @@ -66,7 +66,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, current->restart_block.fn = do_no_restart_syscall; if (unlikely(copy_from_user(&sc, usc, sizeof(sc)))) - return -EFAULT; + return false; /* Get only the ia32 registers. */ regs->bx = sc.bx; @@ -94,7 +94,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, * normal case. */ reload_segments(&sc); - err = fpu__restore_sig(compat_ptr(sc.fpstate), 1); + err = !fpu__restore_sig(compat_ptr(sc.fpstate), 1); force_iret(); @@ -118,7 +118,7 @@ COMPAT_SYSCALL_DEFINE0(sigreturn) set_current_blocked(&set); - if (ia32_restore_sigcontext(regs, &frame->sc)) + if (!ia32_restore_sigcontext(regs, &frame->sc)) goto badframe; return regs->ax; @@ -142,7 +142,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) set_current_blocked(&set); - if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext)) + if (!ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) -- Gitee From 0c7585d1c0777c25b6fc45ba42f1d17a2d30a014 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Fri, 12 Aug 2022 15:31:07 +0800 Subject: [PATCH 1955/2161] x86/fpu/signal: Change return type of fpu__restore_sig() to boolean commit 052adee668284b67105375c0a524f16a423f1424 upstream. Append changes for arch/x86/ia32/ia32_signal.c Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/ia32/ia32_signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index a299fbab1058..119cc0818f3b 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -94,7 +94,7 @@ static bool ia32_restore_sigcontext(struct pt_regs *regs, * normal case. 
*/ reload_segments(&sc); - err = !fpu__restore_sig(compat_ptr(sc.fpstate), 1); + err = fpu__restore_sig(compat_ptr(sc.fpstate), 1); force_iret(); -- Gitee From f1a8d3223c2fde3d5f6da90bae6a06ed4bf3fa36 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 15 Aug 2022 16:01:34 +0800 Subject: [PATCH 1956/2161] x86/cpu: Add new VMX feature, Tertiary VM-Execution control commit 465932db25f3664893b66152c7b190afd28c32db upstream. Ignore parts of vmx capabilities. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/msr-index.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 2ee1ea7e3284..1cc4ddcb9200 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -896,6 +896,7 @@ #define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f #define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490 #define MSR_IA32_VMX_VMFUNC 0x00000491 +#define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492 /* VMX_BASIC bits and bitmasks */ #define VMX_BASIC_VMCS_SIZE_SHIFT 32 -- Gitee From a7d1d5bb14103aa2eec0417968d683a1f1ffe06b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Aug 2021 10:19:50 -0700 Subject: [PATCH 1957/2161] KVM: nVMX: Pull KVM L0's desired controls directly from vmcs01 commit 389ab25216c9d09e0d335e764eeeb84c2089614f upstream. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmx.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index df25f361c020..123e468a4263 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -436,9 +436,13 @@ static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \ vmx->loaded_vmcs->controls_shadow.lname = val; \ } \ } \ +static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs) \ +{ \ + return vmcs->controls_shadow.lname; \ +} \ static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \ { \ - return vmx->loaded_vmcs->controls_shadow.lname; \ + return __##lname##_controls_get(vmx->loaded_vmcs); \ } \ static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \ { \ -- Gitee From 2b3ac214bbf6bb5f502faf1309b3dda2ca0cea94 Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Tue, 19 Apr 2022 23:33:18 +0800 Subject: [PATCH 1958/2161] KVM: VMX: Extend BUILD_CONTROLS_SHADOW macro to support 64-bit variation commit ed3905ba60384ab8c73b421c3618375e58080a9a upstream. The Tertiary VM-Exec Control, different from previous control fields, is 64 bit. So extend BUILD_CONTROLS_SHADOW() by adding a 'bit' parameter, to support both 32 bit and 64 bit fields' auxiliary functions building. 
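As a usage example, a 64-bit control field is now declared with the same macro as the 32-bit ones; the tertiary-control shadow added by a later patch in this series does exactly that:

  /* 32-bit field: accessors are generated around vmcs_write32() and u32 */
  BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32)

  /* 64-bit field: same macro, accessors use vmcs_write64() and u64 */
  BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64)

With that, tertiary_exec_controls_set(vmx, val) performs a vmcs_write64() and caches the value in the loaded_vmcs controls shadow, exactly like its 32-bit counterparts.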
Suggested-by: Sean Christopherson Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Robert Hoo Signed-off-by: Zeng Guang Message-Id: <20220419153318.11595-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmx.h | 56 +++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 123e468a4263..2e5348daadc4 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -428,35 +428,35 @@ static inline u8 vmx_get_rvi(void) return vmcs_read16(GUEST_INTR_STATUS) & 0xff; } -#define BUILD_CONTROLS_SHADOW(lname, uname) \ -static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \ -{ \ - if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ - vmcs_write32(uname, val); \ - vmx->loaded_vmcs->controls_shadow.lname = val; \ - } \ -} \ -static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs) \ -{ \ - return vmcs->controls_shadow.lname; \ -} \ -static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \ -{ \ - return __##lname##_controls_get(vmx->loaded_vmcs); \ -} \ -static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \ -{ \ - lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ -} \ -static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u32 val) \ -{ \ - lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ +#define BUILD_CONTROLS_SHADOW(lname, uname, bits) \ +static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ + vmcs_write##bits(uname, val); \ + vmx->loaded_vmcs->controls_shadow.lname = val; \ + } \ +} \ +static inline u##bits __##lname##_controls_get(struct loaded_vmcs *vmcs) \ +{ \ + return vmcs->controls_shadow.lname; \ +} \ +static inline u##bits lname##_controls_get(struct vcpu_vmx *vmx) \ +{ \ + return __##lname##_controls_get(vmx->loaded_vmcs); \ +} \ +static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ +} \ +static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ } -BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS) -BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS) -BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) -BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL) -BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL) +BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32) +BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32) +BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL, 32) static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) { -- Gitee From 302d36238b033877f4bb7b2c96d169283b3fa002 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 15 Aug 2022 17:38:39 +0800 Subject: [PATCH 1959/2161] KVM: VMX: Detect Tertiary VM-Execution control when setup VMCS config commit 1ad4e5438c67a01620ed67cea959de89f4430515 upstream. Check VMX features on tertiary execution control in VMCS config setup. Sub-features in tertiary execution control to be enabled are adjusted according to hardware capabilities although no sub-feature is enabled in this patch. 
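Unlike the 32-bit execution controls, the tertiary controls default to 0 and MSR_IA32_VMX_PROCBASED_CTLS3 reports only allowed-1 settings, so sanitizing them reduces to masking the wanted bits against the MSR value. That is all the small helper added below does:

  static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
  {
          u64 allowed;

          rdmsrl(msr, allowed);       /* allowed-1 settings reported by the CPU */
          return ctl_opt & allowed;   /* keep only the supported optional bits */
  }

and setup_vmcs_config() only calls it when the primary controls accepted CPU_BASED_ACTIVATE_TERTIARY_CONTROLS.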
EVMCSv1 doesn't support tertiary VM-execution control, so disable it when EVMCSv1 is in use. And define the auxiliary functions for Tertiary control field here, using the new BUILD_CONTROLS_SHADOW(). Reviewed-by: Maxim Levitsky Signed-off-by: Robert Hoo Signed-off-by: Zeng Guang Message-Id: <20220419153400.11642-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/vmx.h | 3 +++ arch/x86/kvm/vmx/capabilities.h | 7 +++++++ arch/x86/kvm/vmx/evmcs.c | 2 ++ arch/x86/kvm/vmx/evmcs.h | 1 + arch/x86/kvm/vmx/vmcs.h | 1 + arch/x86/kvm/vmx/vmx.c | 28 +++++++++++++++++++++++++++- arch/x86/kvm/vmx/vmx.h | 1 + 7 files changed, 42 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index d32f830b924e..50b07c9d2c7b 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -28,6 +28,7 @@ #define CPU_BASED_RDTSC_EXITING 0x00001000 #define CPU_BASED_CR3_LOAD_EXITING 0x00008000 #define CPU_BASED_CR3_STORE_EXITING 0x00010000 +#define CPU_BASED_ACTIVATE_TERTIARY_CONTROLS 0x00020000 #define CPU_BASED_CR8_LOAD_EXITING 0x00080000 #define CPU_BASED_CR8_STORE_EXITING 0x00100000 #define CPU_BASED_TPR_SHADOW 0x00200000 @@ -215,6 +216,8 @@ enum vmcs_field { ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, TSC_MULTIPLIER = 0x00002032, TSC_MULTIPLIER_HIGH = 0x00002033, + TERTIARY_VM_EXEC_CONTROL = 0x00002034, + TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035, PASID_DIR0 = 0x00002038, PASID_DIR0_HIGH = 0x00002039, PASID_DIR1 = 0x0000203a, diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 776e473b5850..510142b30494 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -55,6 +55,7 @@ struct vmcs_config { u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; u32 cpu_based_2nd_exec_ctrl; + u64 cpu_based_3rd_exec_ctrl; u32 vmexit_ctrl; u32 vmentry_ctrl; struct nested_vmx_msrs nested; @@ -128,6 +129,12 @@ static inline bool cpu_has_secondary_exec_ctrls(void) CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; } +static inline bool cpu_has_tertiary_exec_ctrls(void) +{ + return vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; +} + static inline bool cpu_has_vmx_virtualize_apic_accesses(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 72359709cdc1..f4d77f4786fc 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -305,8 +305,10 @@ const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) { + vmcs_conf->cpu_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_EXEC_CTRL; vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; + vmcs_conf->cpu_based_3rd_exec_ctrl = 0; vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 07ebf6882a45..892504b1e965 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -49,6 +49,7 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); */ #define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ PIN_BASED_VMX_PREEMPTION_TIMER) +#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) #define EVMCS1_UNSUPPORTED_2NDEXEC \ (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ diff --git 
a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 052828914440..962806a4c478 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -48,6 +48,7 @@ struct vmcs_controls_shadow { u32 pin; u32 exec; u32 secondary_exec; + u64 tertiary_exec; }; /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ca620792cdf9..486d622093fb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2331,6 +2331,15 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } +static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) +{ + u64 allowed; + + rdmsrl(msr, allowed); + + return ctl_opt & allowed; +} + static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { @@ -2339,6 +2348,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; + u64 _cpu_based_3rd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; @@ -2360,7 +2370,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) return -EIO; @@ -2434,6 +2445,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, "1-setting enable VPID VM-execution control\n"); } + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) { + u64 opt3 = 0; + + _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3, + MSR_IA32_VMX_PROCBASED_CTLS3); + } + min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; @@ -2521,6 +2539,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; + vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; @@ -4015,6 +4034,10 @@ u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } +static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) +{ + return vmcs_config.cpu_based_3rd_exec_ctrl; +} static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) { @@ -4205,6 +4228,9 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) secondary_exec_controls_set(vmx, vmx->secondary_exec_control); } + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); + if (kvm_vcpu_apicv_active(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 2e5348daadc4..0f77b44a59eb 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -457,6 +457,7 @@ BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32) BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL, 32) BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32) BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64) static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) { -- Gitee From 54ee02c6461a8dd429791b637cda2ef2d325411a Mon Sep 17 00:00:00 
2001 From: Zhuo Chen Date: Mon, 15 Aug 2022 19:29:09 +0800 Subject: [PATCH 1960/2161] KVM: VMX: Report tertiary_exec_control field in dump_vmcs() commit 0b85baa5f46de1c6ad6e4b987905df041f2f80f0 upstream. Add tertiary_exec_control field report in dump_vmcs(). Meanwhile, reorganize the dump output of VMCS category as follows. Before change: *** Control State *** PinBased=0x000000ff CPUBased=0xb5a26dfa SecondaryExec=0x061037eb EntryControls=0000d1ff ExitControls=002befff After change: *** Control State *** CPUBased=0xb5a26dfa SecondaryExec=0x061037eb TertiaryExec=0x0000000000000010 PinBased=0x000000ff EntryControls=0000d1ff ExitControls=002befff Reviewed-by: Maxim Levitsky Signed-off-by: Robert Hoo Signed-off-by: Zeng Guang Message-Id: <20220419153441.11687-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmx.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 486d622093fb..38757ffc0ed4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5780,6 +5780,7 @@ void dump_vmcs(void) { u32 vmentry_ctl, vmexit_ctl; u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; + u64 tertiary_exec_control; unsigned long cr4; u64 efer; int i, n; @@ -5795,9 +5796,16 @@ void dump_vmcs(void) pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); cr4 = vmcs_readl(GUEST_CR4); efer = vmcs_read64(GUEST_IA32_EFER); - secondary_exec_control = 0; + if (cpu_has_secondary_exec_ctrls()) secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + else + secondary_exec_control = 0; + + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); + else + tertiary_exec_control = 0; pr_err("*** Guest State ***\n"); pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", @@ -5881,9 +5889,10 @@ void dump_vmcs(void) vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); pr_err("*** Control State ***\n"); - pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", - pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); - pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); + pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", + cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); + pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", + pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", vmcs_read32(EXCEPTION_BITMAP), vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), -- Gitee From 17198031316f57c226e9d3b9b51d0f5c6bce59f3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 4 Feb 2022 21:41:56 +0000 Subject: [PATCH 1961/2161] KVM: VMX: Handle APIC-write offset wrangling in VMX code commit b5ede3df79b7274db0aa6acbdb1185c659f81636 upstream. Move the vAPIC offset adjustments done in the APIC-write trap path from common x86 to VMX in anticipation of using the nodecode path for SVM's AVIC. The adjustment reflects hardware behavior, i.e. it's technically a property of VMX, no common x86. SVM's AVIC behavior is identical, so it's a bit of a moot point, the goal is purely to make it easier to understand why the adjustment is ok. No functional change intended. 
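Concretely, the APIC-write exit qualification carries the page offset of the trapped access, and the adjustment is simply masking it down to the 16-byte register chunk, which now happens in the VMX handler instead of common lapic code; roughly:

  unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
  u32 offset = exit_qualification & 0xff0;   /* 16-byte aligned register offset */

  /* e.g. a write reported at offset 0x308 is handled as register 0x300 (ICR) */
  kvm_apic_write_nodecode(vcpu, offset);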
Signed-off-by: Sean Christopherson Message-Id: <20220204214205.3306634-3-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/lapic.c | 3 --- arch/x86/kvm/vmx/vmx.c | 11 +++++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 74044fecfccb..7e0ebcf2d351 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2028,9 +2028,6 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { u32 val = 0; - /* hw has done the conditional check and inst decode */ - offset &= 0xff0; - kvm_lapic_reg_read(vcpu->arch.apic, offset, 4, &val); /* TODO: optimize to just emulate side effect w/o one more write */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 38757ffc0ed4..8cd04f3edfa5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5155,9 +5155,16 @@ static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) static int handle_apic_write(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 offset = exit_qualification & 0xfff; - /* APIC-write VM exit is trap-like and thus no need to adjust IP */ + /* + * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and + * hardware has done any necessary aliasing, offset adjustments, etc... + * for the access. I.e. the correct value has already been written to + * the vAPIC page for the correct 16-byte chunk. KVM needs only to + * retrieve the register value and emulate the access. + */ + u32 offset = exit_qualification & 0xff0; + kvm_apic_write_nodecode(vcpu, offset); return 1; } -- Gitee From 1e15b79f1f31ec5f873ffac4241a5eb75848788c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 4 Feb 2022 21:41:57 +0000 Subject: [PATCH 1962/2161] KVM: x86: Use "raw" APIC register read for handling APIC-write VM-Exit commit b031f1043583b957e5572f158a31d453d67114de upstream. Use the "raw" helper to read the vAPIC register after an APIC-write trap VM-Exit. Hardware is responsible for vetting the write, and the caller is responsible for sanitizing the offset. This is a functional change, as it means KVM will consume whatever happens to be in the vAPIC page if the write was dropped by hardware. But, unless userspace deliberately wrote garbage into the vAPIC page via KVM_SET_LAPIC, the value should be zero since it's not writable by the guest. This aligns common x86 with SVM's AVIC logic, i.e. paves the way for using the nodecode path to handle APIC-write traps when AVIC is enabled. 
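As a rough sketch of what the "raw" read amounts to (simplified, not the literal kernel helper; apic_page stands in for vcpu->arch.apic->regs and the offset is assumed to be pre-sanitized by the caller):

/* Return the 32-bit register straight from the vAPIC backing page, with no
 * filtering of reserved or read-as-zero registers. */
static inline u32 vapic_raw_read(const void *apic_page, unsigned int offset)
{
        return *(const u32 *)((const char *)apic_page + offset);
}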
Signed-off-by: Sean Christopherson Message-Id: <20220204214205.3306634-4-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/lapic.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 7e0ebcf2d351..f982a928368c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2026,9 +2026,7 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { - u32 val = 0; - - kvm_lapic_reg_read(vcpu->arch.apic, offset, 4, &val); + u32 val = kvm_lapic_get_reg(vcpu->arch.apic, offset); /* TODO: optimize to just emulate side effect w/o one more write */ kvm_lapic_reg_write(vcpu->arch.apic, offset, val); -- Gitee From 0005478042b4ce6334162c2efacc2558babc1eb2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 4 Feb 2022 21:42:02 +0000 Subject: [PATCH 1963/2161] KVM: x86: Add helpers to handle 64-bit APIC MSR read/writes commit 5429478d038fc945919419cfa290313463a21141 upstream. Add helpers to handle 64-bit APIC read/writes via MSRs to deduplicate the x2APIC and Hyper-V code needed to service reads/writes to ICR. Future support for IPI virtualization will add yet another path where KVM must handle 64-bit APIC MSR reads/write (to ICR). Opportunistically fix the comment in the write path; ICR2 holds the destination (if there's no shorthand), not the vector. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220204214205.3306634-9-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/lapic.c | 59 ++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index f982a928368c..be0ed06d82ca 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2595,6 +2595,30 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) return 0; } +static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data) +{ + u32 low, high = 0; + + if (kvm_lapic_reg_read(apic, reg, 4, &low)) + return 1; + + if (reg == APIC_ICR && + WARN_ON_ONCE(kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high))) + return 1; + + *data = (((u64)high) << 32) | low; + + return 0; +} + +static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data) +{ + /* For 64-bit ICR writes, set ICR2 (dest) before ICR (command). 
*/ + if (reg == APIC_ICR) + kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return kvm_lapic_reg_write(apic, reg, (u32)data); +} + int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -2606,16 +2630,13 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (reg == APIC_ICR2) return 1; - /* if this is ICR write vector before command */ - if (reg == APIC_ICR) - kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); - return kvm_lapic_reg_write(apic, reg, (u32)data); + return kvm_lapic_msr_write(apic, reg, data); } int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) { struct kvm_lapic *apic = vcpu->arch.apic; - u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; + u32 reg = (msr - APIC_BASE_MSR) << 4; if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; @@ -2623,45 +2644,23 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) if (reg == APIC_DFR || reg == APIC_ICR2) return 1; - if (kvm_lapic_reg_read(apic, reg, 4, &low)) - return 1; - if (reg == APIC_ICR) - kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high); - - *data = (((u64)high) << 32) | low; - - return 0; + return kvm_lapic_msr_read(apic, reg, data); } int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) { - struct kvm_lapic *apic = vcpu->arch.apic; - if (!lapic_in_kernel(vcpu)) return 1; - /* if this is ICR write vector before command */ - if (reg == APIC_ICR) - kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); - return kvm_lapic_reg_write(apic, reg, (u32)data); + return kvm_lapic_msr_write(vcpu->arch.apic, reg, data); } int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) { - struct kvm_lapic *apic = vcpu->arch.apic; - u32 low, high = 0; - if (!lapic_in_kernel(vcpu)) return 1; - if (kvm_lapic_reg_read(apic, reg, 4, &low)) - return 1; - if (reg == APIC_ICR) - kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high); - - *data = (((u64)high) << 32) | low; - - return 0; + return kvm_lapic_msr_read(vcpu->arch.apic, reg, data); } int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) -- Gitee From f687caa27313433c2848c158e9a3fb8b58cc661b Mon Sep 17 00:00:00 2001 From: Zeng Guang Date: Tue, 19 Apr 2022 23:35:16 +0800 Subject: [PATCH 1964/2161] KVM: x86: Add support for vICR APIC-write VM-Exits in x2APIC mode commit 5413bcba7ed57206178d60ee03dd5bb3a460e645 upstream. Upcoming Intel CPUs will support virtual x2APIC MSR writes to the vICR, i.e. will trap and generate an APIC-write VM-Exit instead of intercepting the WRMSR. Add support for handling "nodecode" x2APIC writes, which were previously impossible. Note, x2APIC MSR writes are 64 bits wide. 
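Because the write is a single 64-bit MSR access, the value that reaches the vAPIC page packs the command in the low dword and the destination in the high dword. A small, self-contained sketch of that layout (the IPI parameters are made up; the field split follows the x2APIC ICR format):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Hypothetical fixed-mode IPI: vector 0xec sent to x2APIC ID 3. */
        uint64_t icr = ((uint64_t)3 << 32) | 0xec;

        printf("ICR low (command)      : 0x%08x\n", (uint32_t)icr);
        printf("ICR high (destination) : 0x%08x\n", (uint32_t)(icr >> 32));
        return 0;
}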
Signed-off-by: Zeng Guang Message-Id: <20220419153516.11739-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/lapic.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index be0ed06d82ca..8b3019e08809 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -66,6 +66,7 @@ static bool lapic_timer_advance_dynamic __read_mostly; #define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 +static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); static inline int apic_test_vector(int vec, void *bitmap) { @@ -2026,10 +2027,27 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { - u32 val = kvm_lapic_get_reg(vcpu->arch.apic, offset); + struct kvm_lapic *apic = vcpu->arch.apic; + u64 val; + + if (apic_x2apic_mode(apic)) { + /* + * When guest APIC is in x2APIC mode and IPI virtualization + * is enabled, accessing APIC_ICR may cause trap-like VM-exit + * on Intel hardware. Other offsets are not possible. + */ + if (WARN_ON_ONCE(offset != APIC_ICR)) + return; - /* TODO: optimize to just emulate side effect w/o one more write */ - kvm_lapic_reg_write(vcpu->arch.apic, offset, val); + kvm_lapic_msr_read(apic, offset, &val); + kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32)); + trace_kvm_apic_write(APIC_ICR, val); + } else { + val = kvm_lapic_get_reg(apic, offset); + + /* TODO: optimize to just emulate side effect w/o one more write */ + kvm_lapic_reg_write(apic, offset, (u32)val); + } } EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); -- Gitee From 10374838c3c07492afc0b1bdea6fcf2cf025d8c6 Mon Sep 17 00:00:00 2001 From: Zeng Guang Date: Tue, 19 Apr 2022 23:36:04 +0800 Subject: [PATCH 1965/2161] KVM: VMX: Clean up vmx_refresh_apicv_exec_ctrl() commit f08a06c9a35706349f74b7a18deefe3f89f73e8e upstream. Remove the condition check cpu_has_secondary_exec_ctrls(). Calling vmx_refresh_apicv_exec_ctrl() premises secondary controls activated and VMCS fields related to APICv valid as well. If it's invoked in wrong circumstance at the worst case, VMX operation will report VMfailValid error without further harmful impact and just functions as if all the secondary controls were 0. 
Suggested-by: Sean Christopherson Signed-off-by: Zeng Guang Message-Id: <20220419153604.11786-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmx.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8cd04f3edfa5..96e5219ea14b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3993,16 +3993,15 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - if (cpu_has_secondary_exec_ctrls()) { - if (kvm_vcpu_apicv_active(vcpu)) - secondary_exec_controls_setbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - else - secondary_exec_controls_clearbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - } + + if (kvm_vcpu_apicv_active(vcpu)) + secondary_exec_controls_setbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + else + secondary_exec_controls_clearbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); if (cpu_has_vmx_msr_bitmap()) vmx_update_msr_bitmap(vcpu); -- Gitee From a7e8c1cb6cf634632f1b8b9aa00729bc7fc17053 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 16 Aug 2022 10:33:15 +0800 Subject: [PATCH 1966/2161] KVM: Add kvm_arch_vcpu_precreate() to handle pre-allocation issues commit 897cc38eaab96d006ab17edd0f50a2f432f584cf upstream. Add a pre-allocation arch hook to handle checks that are currently done by arch specific code prior to allocating the vCPU object. This paves the way for moving the allocation to common KVM code. Acked-by: Christoffer Dall Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 14 +++++++++----- include/linux/kvm_host.h | 1 + virt/kvm/arm/arm.c | 21 +++++++++++---------- virt/kvm/kvm_main.c | 4 ++++ 4 files changed, 25 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f98ada15d43b..5267f9febc9b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9390,16 +9390,20 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) free_cpumask_var(wbinvd_dirty_mask); } +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + pr_warn_once("kvm: SMP vm created on host with unstable TSC; " + "guest TSC will not be reliable\n"); + + return 0; +} + struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; - if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) - printk_once(KERN_WARNING - "kvm: SMP vm created on host with unstable TSC; " - "guest TSC will not be reliable\n"); - vcpu = kvm_x86_ops->vcpu_create(kvm, id); return vcpu; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 281b64286e0b..72dd3d1620ad 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -864,6 +864,7 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id); struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); int 
kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 2e7d2b3f2907..4d2f141a804a 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -253,21 +253,22 @@ void kvm_arch_free_vm(struct kvm *kvm) vfree(kvm); } +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) + return -EBUSY; + + if (id >= kvm->arch.max_vcpus) + return -EINVAL; + + return 0; +} + struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { int err; struct kvm_vcpu *vcpu; - if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) { - err = -EBUSY; - goto out; - } - - if (id >= kvm->arch.max_vcpus) { - err = -EINVAL; - goto out; - } - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) { err = -ENOMEM; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 238734754260..1a7c8ad7b760 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2817,6 +2817,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) kvm->created_vcpus++; mutex_unlock(&kvm->lock); + r = kvm_arch_vcpu_precreate(kvm, id); + if (r) + goto vcpu_decrement; + vcpu = kvm_arch_vcpu_create(kvm, id); if (IS_ERR(vcpu)) { r = PTR_ERR(vcpu); -- Gitee From 580d742e2096ac440e6576af79554c23e3a167ab Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 16 Aug 2022 10:51:09 +0800 Subject: [PATCH 1967/2161] KVM: Move kvm_arch_vcpu_precreate() under kvm->lock commit 1d5e740d518e02cea46325b3d37135bf9c08982a upstream. kvm_arch_vcpu_precreate() handles arch-specific VM resources that must be prepared prior to the actual creation of a vCPU. For example, the x86 platform may need to do a per-VM allocation based on max_vcpu_ids at the first vCPU creation. That allocation needs concurrency control, as multiple vCPU creations can happen simultaneously. From the architectural point of view, it's necessary to execute kvm_arch_vcpu_precreate() under the protection of kvm->lock. Currently only arm64, x86 and s390 have non-nop implementations at the stage of vCPU pre-creation. Remove the lock acquisition in s390's implementation and make sure all architectures can run kvm_arch_vcpu_precreate() safely under kvm->lock without a recursive locking issue.
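The resulting ordering in kvm_vm_ioctl_create_vcpu() is condensed below (a simplified sketch of the hunk that follows; error labels and limit checks are trimmed):

        mutex_lock(&kvm->lock);
        /* ... reject the request if created_vcpus is already at the limit ... */

        /* The arch hook now runs with kvm->lock held, so a per-VM allocation
         * keyed off max_vcpu_ids cannot race between concurrent vCPU creations. */
        r = kvm_arch_vcpu_precreate(kvm, id);
        if (r) {
                mutex_unlock(&kvm->lock);
                return r;
        }

        kvm->created_vcpus++;
        mutex_unlock(&kvm->lock);

        vcpu = kvm_arch_vcpu_create(kvm, id);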
Suggested-by: Sean Christopherson Signed-off-by: Zeng Guang Message-Id: <20220419154409.11842-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 2 +- virt/kvm/kvm_main.c | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5267f9febc9b..c52bc82567f0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9392,7 +9392,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { - if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + if (kvm_check_tsc_unstable() && kvm->created_vcpus) pr_warn_once("kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1a7c8ad7b760..1814a3fdcfab 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2814,13 +2814,15 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) return -EINVAL; } + r = kvm_arch_vcpu_precreate(kvm, id); + if (r) { + mutex_unlock(&kvm->lock); + return r; + } + kvm->created_vcpus++; mutex_unlock(&kvm->lock); - r = kvm_arch_vcpu_precreate(kvm, id); - if (r) - goto vcpu_decrement; - vcpu = kvm_arch_vcpu_create(kvm, id); if (IS_ERR(vcpu)) { r = PTR_ERR(vcpu); -- Gitee From 0c9ca4bd1706f22ec0c3a9fc4f6b700da516cf53 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 16 Aug 2022 11:14:05 +0800 Subject: [PATCH 1968/2161] KVM: x86: Allow userspace to set maximum VCPU id for VM commit 35875316384b71d23dc2a45a969732fc8cab16af upstream. Introduce new max_vcpu_ids in KVM for x86 architecture. Userspace can assign maximum possible vcpu id for current VM session using KVM_CAP_MAX_VCPU_ID of KVM_ENABLE_CAP ioctl(). This is done for x86 only because the sole use case is to guide memory allocation for PID-pointer table, a structure needed to enable VMX IPI. By default, max_vcpu_ids set as KVM_MAX_VCPU_IDS. Suggested-by: Sean Christopherson Reviewed-by: Maxim Levitsky Signed-off-by: Zeng Guang Message-Id: <20220419154444.11888-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/virt/kvm/api.txt | 21 +++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/x86.c | 20 ++++++++++++++++++++ 3 files changed, 47 insertions(+) diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt index 57bafd45137b..3a2cee9af7f0 100644 --- a/Documentation/virt/kvm/api.txt +++ b/Documentation/virt/kvm/api.txt @@ -5112,6 +5112,27 @@ it hard or impossible to use it correctly. The availability of KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 signals that those bugs are fixed. Userspace should not try to use KVM_CAP_MANUAL_DIRTY_LOG_PROTECT. +7.32 KVM_CAP_MAX_VCPU_ID +------------------------ + +:Architectures: x86 +:Target: VM +:Parameters: args[0] - maximum APIC ID value set for current VM +:Returns: 0 on success, -EINVAL if args[0] is beyond KVM_MAX_VCPU_IDS + supported in KVM or if it has been set. + +This capability allows userspace to specify maximum possible APIC ID +assigned for current VM session prior to the creation of vCPUs, saving +memory for data structures indexed by the APIC ID. Userspace is able +to calculate the limit to APIC ID values from designated +CPU topology. + +The value can be changed only until KVM_ENABLE_CAP is set to a nonzero +value or until a vCPU is created. 
Upon creation of the first vCPU, +if the value was set to zero or KVM_ENABLE_CAP was not invoked, KVM +uses the return value of KVM_CHECK_EXTENSION(KVM_CAP_MAX_VCPU_ID) as +the maximum APIC ID. + 8. Other capabilities. ---------------------- diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0c1629c45dac..12fa5bf8f9bd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -945,6 +945,12 @@ struct kvm_arch { struct kvm_pmu_event_filter *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; + /* + * VM-scope maximum vCPU ID. Used to determine the size of structures + * that increase along with the maximum vCPU ID, in which case, using + * the global KVM_MAX_VCPU_IDS may lead to significant memory waste. + */ + u32 max_vcpu_ids; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c52bc82567f0..8dc7323b4189 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5076,6 +5076,20 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_MAX_VCPU_ID: + r = -EINVAL; + if (cap->args[0] > KVM_MAX_VCPU_IDS) + break; + + mutex_lock(&kvm->lock); + if (kvm->arch.max_vcpu_ids == cap->args[0]) { + r = 0; + } else if (!kvm->arch.max_vcpu_ids) { + kvm->arch.max_vcpu_ids = cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; @@ -9396,6 +9410,12 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) pr_warn_once("kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); + if (!kvm->arch.max_vcpu_ids) + kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS; + + if (id >= kvm->arch.max_vcpu_ids) + return -EINVAL; + return 0; } -- Gitee From b6865e65aa917a20909b6088a1fa6d20cac6d564 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 16 Aug 2022 11:24:33 +0800 Subject: [PATCH 1969/2161] kvm: selftests: Add KVM_CAP_MAX_VCPU_ID cap test commit 753dcf7a8686a750fa6aa4b4ca42c6945fc75ac1 upstream. Basic test coverage of KVM_CAP_MAX_VCPU_ID cap. This capability can be enabled before vCPU creation and only allowed to set once. if assigned vcpu id is beyond KVM_CAP_MAX_VCPU_ID capability, vCPU creation will fail. 
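To make the memory argument concrete: the IPI-virtualization patch later in this series allocates one 8-byte PID-pointer entry per possible APIC ID, so the table scales directly with the value set through KVM_CAP_MAX_VCPU_ID. A small sketch with illustrative values (the 8-byte entry size comes from the u64 pid_table added later; the chosen IDs are examples):

#include <stdio.h>

int main(void)
{
        unsigned int max_vcpu_ids[] = { 4, 256, 4096 };

        for (int i = 0; i < 3; i++)
                printf("max_vcpu_ids=%4u -> PID-pointer table of %6u bytes\n",
                       max_vcpu_ids[i], max_vcpu_ids[i] * 8);
        return 0;
}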
Signed-off-by: Zeng Guang Message-Id: <20220422134456.26655-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../kvm/x86_64/max_vcpuid_cap_test.c | 54 +++++++++++++++++++ 3 files changed, 56 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 409c1fa75e03..8a0ba9018077 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -3,6 +3,7 @@ /x86_64/cr4_cpuid_sync_test /x86_64/evmcs_test /x86_64/hyperv_cpuid +/x86_64/max_vcpuid_cap_test /x86_64/mmio_warning_test /x86_64/platform_info_test /x86_64/set_sregs_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index df8494d5a468..9da33dd0432b 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -26,6 +26,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test TEST_GEN_PROGS_x86_64 += x86_64/amx_test +TEST_GEN_PROGS_x86_64 += x86_64/max_vcpuid_cap_test TEST_GEN_PROGS_x86_64 += clear_dirty_log_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus diff --git a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c new file mode 100644 index 000000000000..3f6c1ad86cc6 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * maximum APIC ID capability tests + * + * Copyright (C) 2022, Intel, Inc. + * + * Tests for getting/setting maximum APIC ID capability + */ + +#include "kvm_util.h" +#include "../lib/kvm_util_internal.h" + +#define MAX_VCPU_ID 2 + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_enable_cap cap = { 0 }; + int ret; + + vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + + /* Get KVM_CAP_MAX_VCPU_ID cap supported in KVM */ + ret = vm_check_cap(vm, KVM_CAP_MAX_VCPU_ID); + + /* Try to set KVM_CAP_MAX_VCPU_ID beyond KVM cap */ + cap.cap = KVM_CAP_MAX_VCPU_ID; + cap.args[0] = ret + 1; + ret = ioctl(vm->fd, KVM_ENABLE_CAP, &cap); + TEST_ASSERT(ret < 0, + "Unexpected success to enable KVM_CAP_MAX_VCPU_ID" + "beyond KVM cap!\n"); + + /* Set KVM_CAP_MAX_VCPU_ID */ + cap.cap = KVM_CAP_MAX_VCPU_ID; + cap.args[0] = MAX_VCPU_ID; + ret = ioctl(vm->fd, KVM_ENABLE_CAP, &cap); + TEST_ASSERT(ret == 0, + "Unexpected failure to enable KVM_CAP_MAX_VCPU_ID!\n"); + + /* Try to set KVM_CAP_MAX_VCPU_ID again */ + cap.args[0] = MAX_VCPU_ID + 1; + ret = ioctl(vm->fd, KVM_ENABLE_CAP, &cap); + TEST_ASSERT(ret < 0, + "Unexpected success to enable KVM_CAP_MAX_VCPU_ID again\n"); + + /* Create vCPU with id beyond KVM_CAP_MAX_VCPU_ID cap*/ + ret = ioctl(vm->fd, KVM_CREATE_VCPU, MAX_VCPU_ID); + TEST_ASSERT(ret < 0, + "Unexpected success in creating a vCPU with VCPU ID out of range\n"); + + kvm_vm_free(vm); + return 0; +} -- Gitee From 9628f998f84210975c6f478fb051686f841469f5 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 17 Aug 2022 11:34:45 +0800 Subject: [PATCH 1970/2161] KVM: VMX: enable IPI virtualization commit d588bb9be1da6aa750aa64875fe57369db983d8b upstream. 
With IPI virtualization enabled, the processor emulates writes to APIC registers that would send IPIs. The processor sets the bit corresponding to the vector in target vCPU's PIR and may send a notification (IPI) specified by NDST and NV fields in target vCPU's Posted-Interrupt Descriptor (PID). It is similar to what IOMMU engine does when dealing with posted interrupt from devices. A PID-pointer table is used by the processor to locate the PID of a vCPU with the vCPU's APIC ID. The table size depends on maximum APIC ID assigned for current VM session from userspace. Allocating memory for PID-pointer table is deferred to vCPU creation, because irqchip mode and VM-scope maximum APIC ID is settled at that point. KVM can skip PID-pointer table allocation if !irqchip_in_kernel(). Like VT-d PI, if a vCPU goes to blocked state, VMM needs to switch its notification vector to wakeup vector. This can ensure that when an IPI for blocked vCPUs arrives, VMM can get control and wake up blocked vCPUs. And if a VCPU is preempted, its posted interrupt notification is suppressed. Note that IPI virtualization can only virualize physical-addressing, flat mode, unicast IPIs. Sending other IPIs would still cause a trap-like APIC-write VM-exit and need to be handled by VMM. Signed-off-by: Chao Gao Signed-off-by: Zeng Guang Message-Id: <20220419154510.11938-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/vmx.h | 8 +++ arch/x86/kvm/vmx/capabilities.h | 6 ++ arch/x86/kvm/vmx/vmx.c | 98 +++++++++++++++++++++++++++++++-- arch/x86/kvm/vmx/vmx.h | 9 +++ arch/x86/kvm/x86.c | 2 +- 6 files changed, 117 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 12fa5bf8f9bd..aa2d7b950b06 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1035,6 +1035,7 @@ struct kvm_x86_ops { void (*vm_destroy)(struct kvm *kvm); /* Create, but do not attach this VCPU */ + int (*vcpu_precreate)(struct kvm *kvm); struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 50b07c9d2c7b..083910dc5011 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -73,6 +73,11 @@ #define SECONDARY_EXEC_TSC_SCALING 0x02000000 #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE 0x04000000 +/* + * Definitions of Tertiary Processor-Based VM-Execution Controls. 
+ */ +#define TERTIARY_EXEC_IPI_VIRT BIT((3*32+4) & 0x1f) + #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 #define PIN_BASED_VIRTUAL_NMIS 0x00000020 @@ -153,6 +158,7 @@ static inline int vmx_misc_mseg_revid(u64 vmx_misc) enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, POSTED_INTR_NV = 0x00000002, + LAST_PID_POINTER_INDEX = 0x00000008, GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, @@ -222,6 +228,8 @@ enum vmcs_field { PASID_DIR0_HIGH = 0x00002039, PASID_DIR1 = 0x0000203a, PASID_DIR1_HIGH = 0x0000203b, + PID_POINTER_TABLE = 0x00002042, + PID_POINTER_TABLE_HIGH = 0x00002043, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 510142b30494..6ccf08a2d9fb 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -12,6 +12,7 @@ extern bool __read_mostly enable_ept; extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; extern bool __read_mostly enable_pml; +extern bool __read_mostly enable_ipiv; extern bool __read_mostly enable_apicv; extern int __read_mostly pt_mode; @@ -280,6 +281,11 @@ static inline bool cpu_has_vmx_apicv(void) cpu_has_vmx_posted_intr(); } +static inline bool cpu_has_vmx_ipiv(void) +{ + return vmcs_config.cpu_based_3rd_exec_ctrl & TERTIARY_EXEC_IPI_VIRT; +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() && diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 96e5219ea14b..8e169eadaeff 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -101,6 +101,9 @@ module_param(fasteoi, bool, S_IRUGO); bool __read_mostly enable_apicv = 1; module_param(enable_apicv, bool, S_IRUGO); +bool __read_mostly enable_ipiv = true; +module_param(enable_ipiv, bool, 0444); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -1386,10 +1389,26 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmx->host_debugctlmsr = get_debugctlmsr(); } +static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) +{ + /* + * The default posted interrupt vector does nothing when + * invoked outside guest mode. Return whether a blocked vCPU + * can be the target of posted interrupts, as is the case when + * using either IPI virtualization or VT-d PI, so that the + * notification vector is switched to the one that calls + * back to the pi_wakeup_handler() function. 
+ */ + return vmx_can_use_ipiv(vcpu); +} + static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + if (!vmx_needs_pi_wakeup(vcpu)) + return; + if (!kvm_arch_has_assigned_device(vcpu->kvm) || !irq_remapping_cap(IRQ_POSTING_CAP) || !kvm_vcpu_apicv_active(vcpu)) @@ -2446,7 +2465,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, } if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) { - u64 opt3 = 0; + u64 opt3 = TERTIARY_EXEC_IPI_VIRT; _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3, MSR_IA32_VMX_PROCBASED_CTLS3); @@ -3744,6 +3763,8 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); + if (enable_ipiv) + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); } } } @@ -3994,14 +4015,19 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - if (kvm_vcpu_apicv_active(vcpu)) + if (kvm_vcpu_apicv_active(vcpu)) { secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - else + if (enable_ipiv) + tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); + } else { secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (enable_ipiv) + tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); + } if (cpu_has_vmx_msr_bitmap()) vmx_update_msr_bitmap(vcpu); @@ -4035,7 +4061,16 @@ u32 vmx_exec_control(struct vcpu_vmx *vmx) static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) { - return vmcs_config.cpu_based_3rd_exec_ctrl; + u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; + + /* + * IPI virtualization relies on APICv. Disable IPI virtualization if + * APICv is inhibited. 
+ */ + if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) + exec_control &= ~TERTIARY_EXEC_IPI_VIRT; + + return exec_control; } static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) @@ -4199,6 +4234,35 @@ static void ept_set_mmio_spte_mask(void) VMX_EPT_MISCONFIG_WX_VALUE, 0); } +static inline int vmx_get_pid_table_order(struct kvm *kvm) +{ + return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); +} + +static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) +{ + struct page *pages; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + if (!irqchip_in_kernel(kvm) || !enable_ipiv) + return 0; + + if (kvm_vmx->pid_table) + return 0; + + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm)); + if (!pages) + return -ENOMEM; + + kvm_vmx->pid_table = (void *)page_address(pages); + return 0; +} + +static int vmx_vcpu_precreate(struct kvm *kvm) +{ + return vmx_alloc_ipiv_pid_table(kvm); +} + #define VMX_XSS_EXIT_BITMAP 0 /* @@ -4206,6 +4270,8 @@ static void ept_set_mmio_spte_mask(void) */ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) { + struct kvm *kvm = vmx->vcpu.kvm; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); int i; if (nested) @@ -4242,7 +4308,12 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } - if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { + if (vmx_can_use_ipiv(&vmx->vcpu)) { + vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); + vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); + } + + if (!kvm_pause_in_guest(kvm)) { vmcs_write32(PLE_GAP, ple_gap); vmx->ple_window = ple_window; vmx->ple_window_dirty = true; @@ -6809,6 +6880,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; struct vcpu_vmx *vmx; + struct kvm_vcpu *vcpu; unsigned long *msr_bitmap; int cpu; @@ -6913,6 +6985,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->ept_pointer = INVALID_PAGE; + vcpu = &vmx->vcpu; + + if (vmx_can_use_ipiv(vcpu)) + WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], + __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + return &vmx->vcpu; free_vmcs: @@ -7213,8 +7291,12 @@ static int vmx_vm_init(struct kvm *kvm) static void vmx_vm_destroy(struct kvm *kvm) { + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + if (cpu_has_vmx_pasid_trans()) - vmx_vm_pasid_trans_destroy(to_kvm_vmx(kvm)); + vmx_vm_pasid_trans_destroy(kvm_vmx); + + free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); } static int __init vmx_check_processor_compat(void) @@ -8111,6 +8193,9 @@ static __init int hardware_setup(void) kvm_x86_ops->sync_pir_to_irr = NULL; } + if (!enable_apicv || !cpu_has_vmx_ipiv()) + enable_ipiv = false; + if (cpu_has_vmx_tsc_scaling()) { kvm_has_tsc_control = true; kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; @@ -8218,6 +8303,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .vm_alloc = vmx_vm_alloc, .vm_free = vmx_vm_free, + .vcpu_precreate = vmx_vcpu_precreate, .vcpu_create = vmx_create_vcpu, .vcpu_free = vmx_free_vcpu, .vcpu_reset = vmx_vcpu_reset, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 0f77b44a59eb..965d4dc31674 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -327,6 +327,8 @@ struct kvm_vmx { spinlock_t pasid_lock; struct notifier_block pasid_nb; struct mm_struct *mm; + /* Posted Interrupt Descriptor (PID) table for IPI virtualization */ + u64 *pid_table; }; bool nested_vmx_allowed(struct kvm_vcpu 
*vcpu); @@ -365,6 +367,8 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 +#define PID_TABLE_ENTRY_VALID 1 + static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) { return test_and_set_bit(POSTED_INTR_ON, @@ -551,4 +555,9 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) void dump_vmcs(void); +static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) +{ + return lapic_in_kernel(vcpu) && enable_ipiv; +} + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8dc7323b4189..cbcfb588014b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9416,7 +9416,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) if (id >= kvm->arch.max_vcpu_ids) return -EINVAL; - return 0; + return kvm_x86_ops->vcpu_precreate(kvm); } struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, -- Gitee From 87ab117708ef0d40f819ff81b3960546da02bb10 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Fri, 3 Sep 2021 17:15:58 -0400 Subject: [PATCH 1971/2161] kvm: x86: Set KVM_MAX_VCPU_ID to 4*KVM_MAX_VCPUS commit opencloudos. Instead of requiring KVM_MAX_VCPU_ID to be manually increased every time we increase KVM_MAX_VCPUS, set it to 4*KVM_MAX_VCPUS. This should be enough for CPU topologies where Cores-per-Package and Packages-per-Socket are not powers of 2. In practice, this increases KVM_MAX_VCPU_ID from 1023 to 1152. The only side effect of this change is making some fields in struct kvm_ioapic larger, increasing the struct size from 1628 to 1780 bytes (in x86_64). Signed-off-by: Eduardo Habkost Message-Id: <20210903211600.2002377-2-ehabkost@redhat.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index aa2d7b950b06..c68e94a2b084 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -39,7 +39,19 @@ #define KVM_MAX_VCPUS 288 #define KVM_SOFT_MAX_VCPUS 240 -#define KVM_MAX_VCPU_ID 1023 + +/* + * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs + * might be larger than the actual number of VCPUs because the + * APIC ID encodes CPU topology information. + * + * In the worst case, we'll need less than one extra bit for the + * Core ID, and less than one extra bit for the Package (Die) ID, + * so ratio of 4 should be enough. + */ +#define KVM_VCPU_ID_RATIO 4 +#define KVM_MAX_VCPU_ID (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO) + #define KVM_USER_MEM_SLOTS 509 /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 -- Gitee From 6af9d48a24fa33e39e9bff2f905a51473add2ff0 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Fri, 3 Sep 2021 17:15:59 -0400 Subject: [PATCH 1972/2161] kvm: x86: Increase MAX_VCPUS to 1024 commit 074c82c8f7cf8a46c3b81965f122599e3a133450 upstream. Increase KVM_MAX_VCPUS to 1024, so we can test larger VMs. I'm not changing KVM_SOFT_MAX_VCPUS yet because I'm afraid it might involve complicated questions around the meaning of "supported" and "recommended" in the upstream tree. KVM_SOFT_MAX_VCPUS will be changed in a separate patch. 
For reference, visible effects of this change are: - KVM_CAP_MAX_VCPUS will now return 1024 (of course) - Default value for CPUID[HYPERV_CPUID_IMPLEMENT_LIMITS (00x40000005)].EAX will now be 1024 - KVM_MAX_VCPU_ID will change from 1151 to 4096 - Size of struct kvm will increase from 19328 to 22272 bytes (in x86_64) - Size of struct kvm_ioapic will increase from 1780 to 5084 bytes (in x86_64) - Bitmap stack variables that will grow: - At kvm_hv_flush_tlb() kvm_hv_send_ipi(), vp_bitmap[] and vcpu_bitmap[] will now be 128 bytes long - vcpu_bitmap at bioapic_write_indirect() will be 128 bytes long once patch "KVM: x86: Fix stack-out-of-bounds memory access from ioapic_write_indirect()" is applied Signed-off-by: Eduardo Habkost Message-Id: <20210903211600.2002377-3-ehabkost@redhat.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c68e94a2b084..c6b3544f8ecc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -37,7 +37,7 @@ #define __KVM_HAVE_ARCH_VCPU_DEBUGFS -#define KVM_MAX_VCPUS 288 +#define KVM_MAX_VCPUS 1024 #define KVM_SOFT_MAX_VCPUS 240 /* -- Gitee From 1eb67485861c9474b53c1feebdaa3bbc5cf9e2cc Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 17 Aug 2022 10:49:47 +0800 Subject: [PATCH 1973/2161] kvm: rename KVM_MAX_VCPU_ID to KVM_MAX_VCPU_IDS commit a1c42ddedf35dbf5f25ea0982ed6e226eef7a78c upstream. KVM_MAX_VCPU_ID is not specifying the highest allowed vcpu-id, but the number of allowed vcpu-ids. This has already led to confusion, so rename KVM_MAX_VCPU_ID to KVM_MAX_VCPU_IDS to make its semantics more clear Suggested-by: Eduardo Habkost Signed-off-by: Juergen Gross Signed-off-by: Paolo Bonzini Message-Id: <20210913135745.13944-3-jgross@suse.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/ioapic.c | 2 +- arch/x86/kvm/ioapic.h | 4 ++-- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 4 ++-- tools/testing/selftests/kvm/kvm_create_max_vcpus.c | 2 +- virt/kvm/kvm_main.c | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c6b3544f8ecc..b5985ad8b807 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -50,7 +50,7 @@ * so ratio of 4 should be enough. 
*/ #define KVM_VCPU_ID_RATIO 4 -#define KVM_MAX_VCPU_ID (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO) +#define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO) #define KVM_USER_MEM_SLOTS 509 /* memory slots that are not exposed to userspace */ diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 24a6905d60ee..d7723e5d728a 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -91,7 +91,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) { ioapic->rtc_status.pending_eoi = 0; - bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID); + bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_IDS); } static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index ea1a4e0297da..85a60351e16f 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -43,13 +43,13 @@ struct kvm_vcpu; struct dest_map { /* vcpu bitmap where IRQ has been sent */ - DECLARE_BITMAP(map, KVM_MAX_VCPU_ID); + DECLARE_BITMAP(map, KVM_MAX_VCPU_IDS); /* * Vector sent to a given vcpu, only valid when * the vcpu's bit in map is set */ - u8 vectors[KVM_MAX_VCPU_ID]; + u8 vectors[KVM_MAX_VCPU_IDS]; }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cbcfb588014b..41315b69cfeb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3548,7 +3548,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_MAX_VCPUS; break; case KVM_CAP_MAX_VCPU_ID: - r = KVM_MAX_VCPU_ID; + r = KVM_MAX_VCPU_IDS; break; case KVM_CAP_PV_MMU: /* obsolete */ r = 0; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 72dd3d1620ad..f8a34fdf45d2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -35,8 +35,8 @@ #include -#ifndef KVM_MAX_VCPU_ID -#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS +#ifndef KVM_MAX_VCPU_IDS +#define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS #endif /* diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c index 231d79e57774..7220e0cf95cf 100644 --- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -56,7 +56,7 @@ int main(int argc, char *argv[]) kvm_max_vcpu_id = kvm_max_vcpus; TEST_ASSERT(kvm_max_vcpu_id >= kvm_max_vcpus, - "KVM_MAX_VCPU_ID (%d) must be at least as large as KVM_MAX_VCPUS (%d).", + "KVM_MAX_VCPU_IDS (%d) must be at least as large as KVM_MAX_VCPUS (%d).", kvm_max_vcpu_id, kvm_max_vcpus); test_vcpu_creation(0, kvm_max_vcpus); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1814a3fdcfab..6a85f7be61b8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2805,7 +2805,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) int r; struct kvm_vcpu *vcpu; - if (id >= KVM_MAX_VCPU_ID) + if (id >= KVM_MAX_VCPU_IDS) return -EINVAL; mutex_lock(&kvm->lock); -- Gitee From 1fd9c1ca802ee8623e2638bc6ae92f1af5b86fc8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 8 Nov 2021 16:28:18 +0100 Subject: [PATCH 1974/2161] KVM: x86: Rename kvm_lapic_enable_pv_eoi() commit 77c3323f487512fd587074280e7fb46089cb50b4 upstream. kvm_lapic_enable_pv_eoi() is a misnomer as the function is also used to disable PV EOI. Rename it to kvm_lapic_set_pv_eoi(). No functional change intended. 
Signed-off-by: Vitaly Kuznetsov Message-Id: <20211108152819.12485-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/hyperv.c | 4 ++-- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/lapic.h | 2 +- arch/x86/kvm/x86.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 26408434b9bc..0fa3e8105330 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1113,7 +1113,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) { hv_vcpu->hv_vapic = data; - if (kvm_lapic_enable_pv_eoi(vcpu, 0, 0)) + if (kvm_lapic_set_pv_eoi(vcpu, 0, 0)) return 1; break; } @@ -1131,7 +1131,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; hv_vcpu->hv_vapic = data; kvm_vcpu_mark_page_dirty(vcpu, gfn); - if (kvm_lapic_enable_pv_eoi(vcpu, + if (kvm_lapic_set_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED, sizeof(struct hv_vp_assist_page))) return 1; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8b3019e08809..df561f5acfa8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2681,7 +2681,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) return kvm_lapic_msr_read(vcpu->arch.apic, reg, data); } -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) +int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) { u64 addr = data & ~KVM_MSR_ENABLED; struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index f086829b87fe..ff8fc6cab834 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -130,7 +130,7 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; } -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); +int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); void kvm_lapic_init(void); void kvm_lapic_exit(void); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 41315b69cfeb..e92a810b1d0a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2971,7 +2971,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_PV_EOI_EN: - if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) + if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8))) return 1; break; -- Gitee From 3ff2997dceb7f9e23c527fa73c6cecac5012309c Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 17 Aug 2022 15:14:48 +0800 Subject: [PATCH 1975/2161] KVM: x86: Make kvm_lapic_reg_{read,write}() static commit 70180052354c894d04af07e56cf13ecf1891904f upstream. Make the low level read/write lapic helpers static, any accesses to the local APIC from vendor code or non-APIC code should be routed through proper helpers. No functional change intended. 
Signed-off-by: Sean Christopherson Message-Id: <20220204214205.3306634-8-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/lapic.c | 8 +++----- arch/x86/kvm/lapic.h | 3 --- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index df561f5acfa8..078dc36c9f56 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1293,8 +1293,8 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) #define APIC_REGS_MASK(first, count) \ (APIC_REG_MASK(first) * ((1ull << (count)) - 1)) -int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, - void *data) +static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, + void *data) { unsigned char alignment = offset & 0xf; u32 result; @@ -1347,7 +1347,6 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, } return 0; } -EXPORT_SYMBOL_GPL(kvm_lapic_reg_read); static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) { @@ -1842,7 +1841,7 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) } } -int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) +static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) { int ret = 0; @@ -1983,7 +1982,6 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) return ret; } -EXPORT_SYMBOL_GPL(kvm_lapic_reg_write); static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t address, int len, const void *data) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index ff8fc6cab834..61c2e4b3bfc0 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -85,9 +85,6 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); -int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val); -int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, - void *data); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); -- Gitee From 1cf4d25fdb67126b7117ea9402866f161ef1c9e5 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 17 Aug 2022 17:26:47 +0800 Subject: [PATCH 1976/2161] KVM: SVM: Don't rewrite guest ICR on AVIC IPI virtualization failure commit b51818afdc1d3c7cc269e295953685558d3af71c upstream. Don't bother rewriting the ICR value into the vAPIC page on an AVIC IPI virtualization failure, the access is a trap, i.e. the value has already been written to the vAPIC page. The one caveat is if hardware left the BUSY flag set (which appears to happen somewhat arbitrarily), in which case go through the "nodecode" APIC-write path in order to clear the BUSY flag. 
Signed-off-by: Sean Christopherson Message-Id: <20220204214205.3306634-6-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/lapic.c | 1 + arch/x86/kvm/svm.c | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 078dc36c9f56..47530601a3d9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1210,6 +1210,7 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); } +EXPORT_SYMBOL_GPL(kvm_apic_send_ipi); static u32 apic_get_tmcct(struct kvm_lapic *apic) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 663f510a5af0..e8d7bcec455d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4553,18 +4553,18 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) switch (id) { case AVIC_IPI_FAILURE_INVALID_INT_TYPE: /* - * AVIC hardware handles the generation of - * IPIs when the specified Message Type is Fixed - * (also known as fixed delivery mode) and - * the Trigger Mode is edge-triggered. The hardware - * also supports self and broadcast delivery modes - * specified via the Destination Shorthand(DSH) - * field of the ICRL. Logical and physical APIC ID - * formats are supported. All other IPI types cause - * a #VMEXIT, which needs to emulated. + * Emulate IPIs that are not handled by AVIC hardware, which + * only virtualizes Fixed, Edge-Triggered INTRs. The exit is + * a trap, e.g. ICR holds the correct value and RIP has been + * advanced, KVM is responsible only for emulating the IPI. + * Sadly, hardware may sometimes leave the BUSY flag set, in + * which case KVM needs to emulate the ICR write as well in + * order to clear the BUSY flag. */ - kvm_lapic_reg_write(apic, APIC_ICR2, icrh); - kvm_lapic_reg_write(apic, APIC_ICR, icrl); + if (icrl & APIC_ICR_BUSY) + kvm_apic_write_nodecode(&svm->vcpu, APIC_ICR); + else + kvm_apic_send_ipi(apic, icrl, icrh); break; case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: { int i; -- Gitee From 7eadd2f2617d351b1f6854b4216ca2f39feb40c2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 4 Feb 2022 21:41:58 +0000 Subject: [PATCH 1977/2161] KVM: SVM: Use common kvm_apic_write_nodecode() for AVIC write traps commit ed60920efe73c8bf19e9f40b786111a520272787 upstream. Use the common kvm_apic_write_nodecode() to handle AVIC/APIC-write traps instead of open coding the same exact code. This will allow making the low level lapic helpers inaccessible outside of lapic.c code. Opportunistically clean up the params to eliminate a bunch of svm=>vcpu reflection. No functional change intended. 
Signed-off-by: Sean Christopherson Message-Id: <20220204214205.3306634-5-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/svm.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e8d7bcec455d..ac5a32320c96 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4722,30 +4722,28 @@ static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) svm->dfr_reg = dfr; } -static int avic_unaccel_trap_write(struct vcpu_svm *svm) +static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = svm->vcpu.arch.apic; - u32 offset = svm->vmcb->control.exit_info_1 & + u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 & AVIC_UNACCEL_ACCESS_OFFSET_MASK; switch (offset) { case APIC_ID: - if (avic_handle_apic_id_update(&svm->vcpu)) + if (avic_handle_apic_id_update(vcpu)) return 0; break; case APIC_LDR: - if (avic_handle_ldr_update(&svm->vcpu)) + if (avic_handle_ldr_update(vcpu)) return 0; break; case APIC_DFR: - avic_handle_dfr_update(&svm->vcpu); + avic_handle_dfr_update(vcpu); break; default: break; } - kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); - + kvm_apic_write_nodecode(vcpu, offset); return 1; } @@ -4794,7 +4792,7 @@ static int avic_unaccelerated_access_interception(struct vcpu_svm *svm) if (trap) { /* Handling Trap */ WARN_ONCE(!write, "svm: Handling trap read.\n"); - ret = avic_unaccel_trap_write(svm); + ret = avic_unaccel_trap_write(&svm->vcpu); } else { /* Handling Fault */ ret = kvm_emulate_instruction(&svm->vcpu, 0); -- Gitee From 728adee9b04f661ba8d3c05b72bc433cdf4b9f93 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 17 Aug 2022 16:15:34 +0800 Subject: [PATCH 1978/2161] KVM: LAPIC: Recalculate apic map in batch commit 4abaffce4d25aa41392d2e81835592726d757857 upstream. In the vCPU reset and set APIC_BASE MSR path, the apic map will be recalculated several times, each time it will consume 10+ us observed by ftrace in my non-overcommit environment since the expensive memory allocate/mutex/rcu etc operations. This patch optimizes it by recaluating apic map in batch, I hope this can benefit the serverless scenario which can frequently create/destroy VMs. Before patch: kvm_lapic_reset ~27us After patch: kvm_lapic_reset ~14us Observed by ftrace, improve ~48%. 
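The scheme condensed from the hunks that follow (writers only mark the map dirty, the rebuild happens once under the mutex, and the barriers pair the map publish with the dirty-flag clear); this is a sketch, not the literal kernel code:

static void mark_map_dirty(struct kvm *kvm)
{
        kvm->arch.apic_map_dirty = true;        /* cheap, done on every relevant APIC write */
}

void kvm_recalculate_apic_map(struct kvm *kvm)
{
        if (!kvm->arch.apic_map_dirty) {
                smp_rmb();                      /* read apic_map_dirty before apic_map */
                return;
        }

        mutex_lock(&kvm->arch.apic_map_lock);
        if (kvm->arch.apic_map_dirty) {         /* recheck: another caller may have rebuilt it */
                /* ... allocate the new map and publish it with rcu_assign_pointer() ... */
                smp_wmb();                      /* publish apic_map before clearing the flag */
                kvm->arch.apic_map_dirty = false;
        }
        mutex_unlock(&kvm->arch.apic_map_lock);
}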
Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/lapic.c | 46 ++++++++++++++++++++++++++------- arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/x86.c | 1 + 4 files changed, 39 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b5985ad8b807..c3948b936db7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -898,6 +898,7 @@ struct kvm_arch { atomic_t vapics_in_nmi_mode; struct mutex apic_map_lock; struct kvm_apic_map *apic_map; + bool apic_map_dirty; bool apic_access_page_done; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 47530601a3d9..a01393ab3071 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -162,14 +162,28 @@ static void kvm_apic_map_free(struct rcu_head *rcu) kvfree(map); } -static void recalculate_apic_map(struct kvm *kvm) +void kvm_recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; int i; u32 max_id = 255; /* enough space for any xAPIC ID */ + if (!kvm->arch.apic_map_dirty) { + /* + * Read kvm->arch.apic_map_dirty before + * kvm->arch.apic_map + */ + smp_rmb(); + return; + } + mutex_lock(&kvm->arch.apic_map_lock); + if (!kvm->arch.apic_map_dirty) { + /* Someone else has updated the map. */ + mutex_unlock(&kvm->arch.apic_map_lock); + return; + } kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_apic_present(vcpu)) @@ -234,6 +248,12 @@ static void recalculate_apic_map(struct kvm *kvm) old = rcu_dereference_protected(kvm->arch.apic_map, lockdep_is_held(&kvm->arch.apic_map_lock)); rcu_assign_pointer(kvm->arch.apic_map, new); + /* + * Write kvm->arch.apic_map before + * clearing apic->apic_map_dirty + */ + smp_wmb(); + kvm->arch.apic_map_dirty = false; mutex_unlock(&kvm->arch.apic_map_lock); if (old) @@ -255,20 +275,20 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) else static_key_slow_inc(&apic_sw_disabled.key); - recalculate_apic_map(apic->vcpu->kvm); + apic->vcpu->kvm->arch.apic_map_dirty = true; } } static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id) { kvm_lapic_set_reg(apic, APIC_ID, id << 24); - recalculate_apic_map(apic->vcpu->kvm); + apic->vcpu->kvm->arch.apic_map_dirty = true; } static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) { kvm_lapic_set_reg(apic, APIC_LDR, id); - recalculate_apic_map(apic->vcpu->kvm); + apic->vcpu->kvm->arch.apic_map_dirty = true; } static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) @@ -284,7 +304,7 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); - recalculate_apic_map(apic->vcpu->kvm); + apic->vcpu->kvm->arch.apic_map_dirty = true; } static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) @@ -1875,7 +1895,7 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_DFR: if (!apic_x2apic_mode(apic)) { kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); - recalculate_apic_map(apic->vcpu->kvm); + apic->vcpu->kvm->arch.apic_map_dirty = true; } else ret = 1; break; @@ -1981,6 +2001,8 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; } + kvm_recalculate_apic_map(apic->vcpu->kvm); + return ret; } @@ -2140,7 +2162,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) static_key_slow_dec_deferred(&apic_hw_disabled); } else { 
static_key_slow_inc(&apic_hw_disabled.key); - recalculate_apic_map(vcpu->kvm); + vcpu->kvm->arch.apic_map_dirty = true; } } @@ -2166,6 +2188,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) if (!apic) return; + vcpu->kvm->arch.apic_map_dirty = false; /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); @@ -2218,6 +2241,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; + + kvm_recalculate_apic_map(vcpu->kvm); } /* @@ -2440,17 +2465,18 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) struct kvm_lapic *apic = vcpu->arch.apic; int r; - kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); /* set SPIV separately to get count of SW disabled APICs right */ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); r = kvm_apic_state_fixup(vcpu, s, true); - if (r) + if (r) { + kvm_recalculate_apic_map(vcpu->kvm); return r; + } memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s)); - recalculate_apic_map(vcpu->kvm); + kvm_recalculate_apic_map(vcpu->kvm); kvm_apic_set_version(vcpu); apic_update_ppr(apic); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 61c2e4b3bfc0..8ce10520cfff 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -84,6 +84,7 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); +void kvm_recalculate_apic_map(struct kvm *kvm); void kvm_apic_set_version(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e92a810b1d0a..7e628fcbb2fb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -368,6 +368,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } kvm_lapic_set_base(vcpu, msr_info->data); + kvm_recalculate_apic_map(vcpu->kvm); return 0; } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -- Gitee From d00162006496b0697b39357c7df750bd9ad2e8f0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 4 Feb 2022 21:42:00 +0000 Subject: [PATCH 1979/2161] KVM: x86: WARN if KVM emulates an IPI without clearing the BUSY flag commit bd17f417c07d4e1cbc85d7b69cd663725d12e8d6 upstream. WARN if KVM emulates an IPI without clearing the BUSY flag, failure to do so could hang the guest if it waits for the IPI be sent. Opportunistically use APIC_ICR_BUSY macro instead of open coding the magic number, and add a comment to clarify why kvm_recalculate_apic_map() is unconditionally invoked (it's really, really confusing for IPIs due to the existence of fast paths that don't trigger a potential recalc). Signed-off-by: Sean Christopherson Message-Id: <20220204214205.3306634-7-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/lapic.c | 10 +++++++++- arch/x86/kvm/x86.c | 9 ++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a01393ab3071..50a121323faf 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1214,6 +1214,9 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) { struct kvm_lapic_irq irq; + /* KVM has no delay and should always clear the BUSY/PENDING flag. 
*/ + WARN_ON_ONCE(icr_low & APIC_ICR_BUSY); + irq.vector = icr_low & APIC_VECTOR_MASK; irq.delivery_mode = icr_low & APIC_MODE_MASK; irq.dest_mode = icr_low & APIC_DEST_MASK; @@ -1923,7 +1926,7 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) } case APIC_ICR: /* No delay here, so we always clear the pending bit */ - val &= ~(1 << 12); + val &= ~APIC_ICR_BUSY; kvm_apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2)); kvm_lapic_set_reg(apic, APIC_ICR, val); break; @@ -2001,6 +2004,11 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; } + /* + * Recalculate APIC maps if necessary, e.g. if the software enable bit + * was toggled, the APIC ID changed, etc... The maps are marked dirty + * on relevant changes, i.e. this is a nop for most writes. + */ kvm_recalculate_apic_map(apic->vcpu->kvm); return ret; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7e628fcbb2fb..dadcabb9361e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1661,11 +1661,10 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data return 1; if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && - ((data & KVM_APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && - ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && - ((u32)(data >> 32) != X2APIC_BROADCAST)) { - - data &= ~(1 << 12); + ((data & KVM_APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && + ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && + ((u32)(data >> 32) != X2APIC_BROADCAST)) { + data &= ~APIC_ICR_BUSY; kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32)); kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32)); kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data); -- Gitee From 754b86d5a2c11e5aea8a4bd6f695d3991d543b67 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 4 Feb 2022 21:42:03 +0000 Subject: [PATCH 1980/2161] KVM: x86: Treat x2APIC's ICR as a 64-bit register, not two 32-bit regs commit a57a31684d7be2d4427292db21ef622ea9591f5b upstream. Emulate the x2APIC ICR as a single 64-bit register, as opposed to forking it across ICR and ICR2 as two 32-bit registers. This mirrors hardware behavior for Intel's upcoming IPI virtualization support, which does not split the access. Previous versions of Intel's SDM and AMD's APM don't explicitly state exactly how ICR is reflected in the vAPIC page for x2APIC, KVM just happened to speculate incorrectly. Handling the upcoming behavior is necessary in order to maintain backwards compatibility with KVM_{G,S}ET_LAPIC, e.g. failure to shuffle the 64-bit ICR to ICR+ICR2 and vice versa would break live migration if IPI virtualization support isn't symmetrical across the source and dest. 
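A minimal, stand-alone sketch of the ICR shuffle that the compatibility note above requires; the register offsets and helper names below are illustrative, while the real fixup operates on the KVM_{G,S}ET_LAPIC register page:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical offsets mirroring APIC_ICR (0x300) and APIC_ICR2 (0x310). */
#define REG_ICR   0x300
#define REG_ICR2  0x310

/* Set path: userspace handed us ICR/ICR2 as two 32-bit registers, fold them
 * into the single 64-bit value the vAPIC page stores internally. */
static uint64_t fold_icr(const uint8_t *regs)
{
        uint32_t lo, hi;

        memcpy(&lo, regs + REG_ICR, sizeof(lo));
        memcpy(&hi, regs + REG_ICR2, sizeof(hi));
        return (uint64_t)hi << 32 | lo;
}

/* Get path: split the internal 64-bit ICR back into ICR + ICR2 so existing
 * userspace and older destination kernels keep seeing the old layout. */
static void split_icr(uint8_t *regs, uint64_t icr)
{
        uint32_t lo = (uint32_t)icr;
        uint32_t hi = (uint32_t)(icr >> 32);

        memcpy(regs + REG_ICR, &lo, sizeof(lo));
        memcpy(regs + REG_ICR2, &hi, sizeof(hi));
}

int main(void)
{
        uint8_t regs[0x400] = { 0 };
        uint64_t icr = (uint64_t)0xff << 32 | 0x000040fe; /* dest 0xff, fixed vector 0xfe */

        split_icr(regs, icr);
        assert(fold_icr(regs) == icr);  /* the round trip must be lossless */
        printf("ICR round-trips: 0x%016llx\n", (unsigned long long)fold_icr(regs));
        return 0;
}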
Cc: Zeng Guang Cc: Chao Gao Cc: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20220204214205.3306634-10-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/lapic.c | 107 +++++++++++++++++++++++++++++++++---------- 1 file changed, 84 insertions(+), 23 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 50a121323faf..1843e7224573 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -68,6 +68,29 @@ static bool lapic_timer_advance_dynamic __read_mostly; #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); +static __always_inline u64 __kvm_lapic_get_reg64(char *regs, int reg) +{ + BUILD_BUG_ON(reg != APIC_ICR); + return *((u64 *) (regs + reg)); +} + +static __always_inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg) +{ + return __kvm_lapic_get_reg64(apic->regs, reg); +} + +static __always_inline void __kvm_lapic_set_reg64(char *regs, int reg, u64 val) +{ + BUILD_BUG_ON(reg != APIC_ICR); + *((u64 *) (regs + reg)) = val; +} + +static __always_inline void kvm_lapic_set_reg64(struct kvm_lapic *apic, + int reg, u64 val) +{ + __kvm_lapic_set_reg64(apic->regs, reg, val); +} + static inline int apic_test_vector(int vec, void *bitmap) { return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -1336,7 +1359,6 @@ static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, APIC_REGS_MASK(APIC_IRR, APIC_ISR_NR) | APIC_REG_MASK(APIC_ESR) | APIC_REG_MASK(APIC_ICR) | - APIC_REG_MASK(APIC_ICR2) | APIC_REG_MASK(APIC_LVTT) | APIC_REG_MASK(APIC_LVTTHMR) | APIC_REG_MASK(APIC_LVTPC) | @@ -1347,9 +1369,16 @@ static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, APIC_REG_MASK(APIC_TMCCT) | APIC_REG_MASK(APIC_TDCR); - /* ARBPRI is not valid on x2APIC */ + /* + * ARBPRI and ICR2 are not valid in x2APIC mode. WARN if KVM reads ICR + * in x2APIC mode as it's an 8-byte register in x2APIC and needs to be + * manually handled by the caller. 
+ */ if (!apic_x2apic_mode(apic)) - valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI); + valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) | + APIC_REG_MASK(APIC_ICR2); + else + WARN_ON_ONCE(offset == APIC_ICR); if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset))) return 1; @@ -1925,16 +1954,18 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; } case APIC_ICR: + WARN_ON_ONCE(apic_x2apic_mode(apic)); + /* No delay here, so we always clear the pending bit */ val &= ~APIC_ICR_BUSY; kvm_apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2)); kvm_lapic_set_reg(apic, APIC_ICR, val); break; - case APIC_ICR2: - if (!apic_x2apic_mode(apic)) - val &= 0xff000000; - kvm_lapic_set_reg(apic, APIC_ICR2, val); + if (apic_x2apic_mode(apic)) + ret = 1; + else + kvm_lapic_set_reg(apic, APIC_ICR2, val & 0xff000000); break; case APIC_LVT0: @@ -2222,8 +2253,12 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) if (!apic_x2apic_mode(apic)) kvm_apic_set_ldr(apic, 0); kvm_lapic_set_reg(apic, APIC_ESR, 0); - kvm_lapic_set_reg(apic, APIC_ICR, 0); - kvm_lapic_set_reg(apic, APIC_ICR2, 0); + if (!apic_x2apic_mode(apic)) { + kvm_lapic_set_reg(apic, APIC_ICR, 0); + kvm_lapic_set_reg(apic, APIC_ICR2, 0); + } else { + kvm_lapic_set_reg64(apic, APIC_ICR, 0); + } kvm_lapic_set_reg(apic, APIC_TDCR, 0); kvm_lapic_set_reg(apic, APIC_TMICT, 0); for (i = 0; i < 8; i++) { @@ -2443,6 +2478,7 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, if (apic_x2apic_mode(vcpu->arch.apic)) { u32 *id = (u32 *)(s->regs + APIC_ID); u32 *ldr = (u32 *)(s->regs + APIC_LDR); + u64 icr; if (vcpu->kvm->arch.x2apic_format) { if (*id != vcpu->vcpu_id) @@ -2454,9 +2490,21 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, *id <<= 24; } - /* In x2APIC mode, the LDR is fixed and based on the id */ - if (set) + /* + * In x2APIC mode, the LDR is fixed and based on the id. And + * ICR is internally a single 64-bit register, but needs to be + * split to ICR+ICR2 in userspace for backwards compatibility. + */ + if (set) { *ldr = kvm_apic_calc_x2apic_ldr(*id); + + icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | + (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; + __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); + } else { + icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); + __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); + } } return 0; @@ -2646,27 +2694,43 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) return 0; } +int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) +{ + data &= ~APIC_ICR_BUSY; + + kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); + kvm_lapic_set_reg64(apic, APIC_ICR, data); + trace_kvm_apic_write(APIC_ICR, data); + return 0; +} + static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data) { - u32 low, high = 0; + u32 low; - if (kvm_lapic_reg_read(apic, reg, 4, &low)) - return 1; + if (reg == APIC_ICR) { + *data = kvm_lapic_get_reg64(apic, APIC_ICR); + return 0; + } - if (reg == APIC_ICR && - WARN_ON_ONCE(kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high))) + if (kvm_lapic_reg_read(apic, reg, 4, &low)) return 1; - *data = (((u64)high) << 32) | low; + *data = low; return 0; } static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data) { - /* For 64-bit ICR writes, set ICR2 (dest) before ICR (command). */ + /* + * ICR is a 64-bit register in x2APIC mode (and Hyper'v PV vAPIC) and + * can be written as such, all other registers remain accessible only + * through 32-bit reads/writes. 
+ */ if (reg == APIC_ICR) - kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return kvm_x2apic_icr_write(apic, data); + return kvm_lapic_reg_write(apic, reg, (u32)data); } @@ -2678,9 +2742,6 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; - if (reg == APIC_ICR2) - return 1; - return kvm_lapic_msr_write(apic, reg, data); } @@ -2692,7 +2753,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; - if (reg == APIC_DFR || reg == APIC_ICR2) + if (reg == APIC_DFR) return 1; return kvm_lapic_msr_read(apic, reg, data); -- Gitee From 74806599fdc90a6b8b63381d641624f2b99cbf33 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Wed, 17 Aug 2022 17:00:59 +0800 Subject: [PATCH 1981/2161] KVM: x86: Return updated timer current count register from KVM_GET_LAPIC commit 24647e0a39b6c5ef007fa2f9a91ac8b581e093b9 upstream. kvm_vcpu_ioctl_get_lapic (implements KVM_GET_LAPIC ioctl) does a bulk copy of the LAPIC registers but must take into account that the one-shot and periodic timer current count register is computed upon reads and is not present in register state. When restoring LAPIC state (e.g. after migration), restart timers from their their current count values at time of save. Note: When a one-shot timer expires, the code in arch/x86/kvm/lapic.c does not zero the value of the LAPIC initial count register (emulating HW behavior). If no other timer is run and pending prior to a subsequent KVM_GET_LAPIC call, the returned register set will include the expired one-shot initial count. On a subsequent KVM_SET_LAPIC call the code will see a non-zero initial count and start a new one-shot timer using the expired timer's count. This is a prior existing bug and will be addressed in a separate patch. Thanks to jmattson@google.com for this find. 
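The restore path described above is essentially a units conversion from a count register to nanoseconds. A small stand-alone sketch, assuming the 1 ns bus-cycle constant that lapic.c uses, shows why restarting from the saved current count rather than the initial count matters after migration:

#include <stdint.h>
#include <stdio.h>

#define APIC_BUS_CYCLE_NS 1     /* assumed to match the lapic.c constant */

/* Remaining time represented by a timer count register, mirroring the
 * tmict_to_ns() helper added by this patch (names here are illustrative). */
static uint64_t count_to_ns(uint32_t count, uint32_t divide_count)
{
        return (uint64_t)count * APIC_BUS_CYCLE_NS * divide_count;
}

int main(void)
{
        uint32_t tmict = 1000000;  /* initial count programmed by the guest      */
        uint32_t tmcct = 250000;   /* current count captured at KVM_GET_LAPIC    */
        uint32_t div   = 16;       /* divide configuration                       */

        /* Old behaviour: restart from the full initial count (too long). */
        printf("restart from TMICT: %llu ns\n",
               (unsigned long long)count_to_ns(tmict, div));

        /* New behaviour: restart from the remaining count saved at save time. */
        printf("restart from TMCCT: %llu ns\n",
               (unsigned long long)count_to_ns(tmcct, div));
        return 0;
}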
Signed-off-by: Peter Shier Reviewed-by: Jim Mattson Reviewed-by: Wanpeng Li Message-Id: <20181010225653.238911-1-pshier@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/lapic.c | 61 ++++++++++++++++++++++++++++++++++++-------- arch/x86/kvm/lapic.h | 7 ++++- 2 files changed, 57 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1843e7224573..2edbc690d155 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1646,13 +1646,18 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) local_irq_restore(flags); } +static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict) +{ + return (u64)tmict * APIC_BUS_CYCLE_NS * (u64)apic->divide_count; +} + static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor) { ktime_t now, remaining; u64 ns_remaining_old, ns_remaining_new; - apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) - * APIC_BUS_CYCLE_NS * apic->divide_count; + apic->lapic_timer.period = + tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT)); limit_periodic_timer_frequency(apic); now = ktime_get(); @@ -1670,14 +1675,15 @@ static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_diviso apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new); } -static bool set_target_expiration(struct kvm_lapic *apic) +static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg) { ktime_t now; u64 tscl = rdtsc(); + s64 deadline; now = ktime_get(); - apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) - * APIC_BUS_CYCLE_NS * apic->divide_count; + apic->lapic_timer.period = + tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT)); if (!apic->lapic_timer.period) { apic->lapic_timer.tscdeadline = 0; @@ -1685,10 +1691,32 @@ static bool set_target_expiration(struct kvm_lapic *apic) } limit_periodic_timer_frequency(apic); + deadline = apic->lapic_timer.period; + + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { + if (unlikely(count_reg != APIC_TMICT)) { + deadline = tmict_to_ns(apic, + kvm_lapic_get_reg(apic, count_reg)); + if (unlikely(deadline <= 0)) + deadline = apic->lapic_timer.period; + else if (unlikely(deadline > apic->lapic_timer.period)) { + pr_info_ratelimited( + "kvm: vcpu %i: requested lapic timer restore with " + "starting count register %#x=%u (%lld ns) > initial count (%lld ns). 
" + "Using initial count to start timer.\n", + apic->vcpu->vcpu_id, + count_reg, + kvm_lapic_get_reg(apic, count_reg), + deadline, apic->lapic_timer.period); + kvm_lapic_set_reg(apic, count_reg, 0); + deadline = apic->lapic_timer.period; + } + } + } apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + - nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); - apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period); + nsec_to_cycles(apic->vcpu, deadline); + apic->lapic_timer.target_expiration = ktime_add_ns(now, deadline); return true; } @@ -1870,17 +1898,22 @@ void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu) restart_apic_timer(apic); } -static void start_apic_timer(struct kvm_lapic *apic) +static void __start_apic_timer(struct kvm_lapic *apic, u32 count_reg) { atomic_set(&apic->lapic_timer.pending, 0); if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) - && !set_target_expiration(apic)) + && !set_target_expiration(apic, count_reg)) return; restart_apic_timer(apic); } +static void start_apic_timer(struct kvm_lapic *apic) +{ + __start_apic_timer(apic, APIC_TMICT); +} + static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) { bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val); @@ -2513,6 +2546,14 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s)); + + /* + * Get calculated timer current count for remaining timer period (if + * any) and store it in the returned register set. + */ + __kvm_lapic_set_reg(s->regs, APIC_TMCCT, + __apic_read(vcpu->arch.apic, APIC_TMCCT)); + return kvm_apic_state_fixup(vcpu, s, false); } @@ -2540,7 +2581,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) apic_update_lvtt(apic); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); - start_apic_timer(apic); + __start_apic_timer(apic, APIC_TMCCT); apic->irr_pending = true; apic->isr_count = vcpu->arch.apicv_active ? 1 : count_vectors(apic->regs + APIC_ISR); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 8ce10520cfff..9d858ac4ed93 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -160,9 +160,14 @@ static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off) return *((u32 *) (apic->regs + reg_off)); } +static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val) +{ + *((u32 *) (regs + reg_off)) = val; +} + static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) { - *((u32 *) (apic->regs + reg_off)) = val; + __kvm_lapic_set_reg(apic->regs, reg_off, val); } extern struct static_key kvm_no_apic_vcpu; -- Gitee From cdc10b1bae80709b26a558a4e87353bb1592f9b7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 4 Feb 2022 21:42:04 +0000 Subject: [PATCH 1982/2161] KVM: x86: Make kvm_lapic_set_reg() a "private" xAPIC helper commit b9964ee36bdf2ffcbe9084510874c2ff97b3cb46 upstream. Hide the lapic's "raw" write helper inside lapic.c to force non-APIC code to go through proper helpers when modification the vAPIC state. Keep the read helper visible to outsiders for now, refactoring KVM to hide it too is possible, it will just take more work to do so. No functional change intended. 
Signed-off-by: Sean Christopherson Message-Id: <20220204214205.3306634-11-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/lapic.c | 10 ++++++++++ arch/x86/kvm/lapic.h | 14 +++++--------- arch/x86/kvm/trace.h | 6 +++--- arch/x86/kvm/x86.c | 10 ++-------- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2edbc690d155..aa46aad54cf0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -68,6 +68,16 @@ static bool lapic_timer_advance_dynamic __read_mostly; #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); +static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val) +{ + *((u32 *) (regs + reg_off)) = val; +} + +static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) +{ + __kvm_lapic_set_reg(apic->regs, reg_off, val); +} + static __always_inline u64 __kvm_lapic_get_reg64(char *regs, int reg) { BUILD_BUG_ON(reg != APIC_ICR); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 9d858ac4ed93..7b5bb9bc9e5b 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -116,6 +116,7 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); +int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data); void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high); int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); @@ -155,19 +156,14 @@ static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic) apic->irr_pending = true; } -static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off) +static inline u32 __kvm_lapic_get_reg(char *regs, int reg_off) { - return *((u32 *) (apic->regs + reg_off)); + return *((u32 *) (regs + reg_off)); } -static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val) -{ - *((u32 *) (regs + reg_off)) = val; -} - -static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) +static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off) { - __kvm_lapic_set_reg(apic->regs, reg_off, val); + return __kvm_lapic_get_reg(apic->regs, reg_off); } extern struct static_key kvm_no_apic_vcpu; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 7c741a0c5f80..af14b0953b9b 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -192,13 +192,13 @@ TRACE_EVENT(kvm_cpuid, * Tracepoint for apic access. */ TRACE_EVENT(kvm_apic, - TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val), + TP_PROTO(unsigned int rw, unsigned int reg, u64 val), TP_ARGS(rw, reg, val), TP_STRUCT__entry( __field( unsigned int, rw ) __field( unsigned int, reg ) - __field( unsigned int, val ) + __field( u64, val ) ), TP_fast_assign( @@ -207,7 +207,7 @@ TRACE_EVENT(kvm_apic, __entry->val = val; ), - TP_printk("apic_%s %s = 0x%x", + TP_printk("apic_%s %s = 0x%llx", __entry->rw ? 
"write" : "read", __print_symbolic(__entry->reg, kvm_trace_symbol_apic), __entry->val) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dadcabb9361e..36bf0fdcdd29 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1663,14 +1663,8 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && ((data & KVM_APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && - ((u32)(data >> 32) != X2APIC_BROADCAST)) { - data &= ~APIC_ICR_BUSY; - kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32)); - kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32)); - kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data); - trace_kvm_apic_write(APIC_ICR, (u32)data); - return 0; - } + ((u32)(data >> 32) != X2APIC_BROADCAST)) + return kvm_x2apic_icr_write(vcpu->arch.apic, data); return 1; } -- Gitee From 453905859a0b5c0f8c0ed2463641c76453b9b3dd Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 16 Sep 2019 15:39:56 -0700 Subject: [PATCH 1983/2161] x86/cpu: Align cpu_caps_cleared and cpu_caps_set to unsigned long commit f6a892ddd53e555362dbf64d31b47fde0f550ec4 upstream. cpu_caps_cleared[] and cpu_caps_set[] are arrays of type u32 and therefore naturally aligned to 4 bytes, which is also unsigned long aligned on 32-bit, but not on 64-bit. The array pointer is handed into atomic bit operations. If the access not aligned to unsigned long then the atomic bit operations can end up crossing a cache line boundary, which causes the CPU to do a full bus lock as it can't lock both cache lines at once. The bus lock operation is heavy weight and can cause severe performance degradation. The upcoming #AC split lock detection mechanism will issue warnings for this kind of access. Force the alignment of these arrays to unsigned long. This avoids the massive code changes which would be required when converting the array data type to unsigned long. [ tglx: Rewrote changelog ] Signed-off-by: Fenghua Yu Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20190916223958.27048-2-tony.luck@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/common.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 37787f7f36d3..77b4f1f3c943 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -582,8 +582,9 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; -__u32 cpu_caps_set[NCAPINTS + NBUGINTS]; +/* Aligned to unsigned long to avoid split lock in atomic bitmap ops */ +__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); +__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); void load_percpu_segment(int cpu) { -- Gitee From 35451a2e6c041420286ec79b13459ae89ce6e74c Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 16 Sep 2019 15:39:58 -0700 Subject: [PATCH 1984/2161] x86/cpu: Align the x86_capability array to size of unsigned long commit db8c33f8b5bea59d00ca12dcd6b65d01b1ea98ef upstream. The x86_capability array in cpuinfo_x86 is of type u32 and thus is naturally aligned to 4 bytes. But, set_bit() and clear_bit() require the array to be aligned to size of unsigned long (i.e. 8 bytes on 64-bit systems). The array pointer is handed into atomic bit operations. 
If the access is not aligned to unsigned long then the atomic bit operations can end up crossing a cache line boundary, which causes the CPU to do a full bus lock as it can't lock both cache lines at once. The bus lock operation is heavy weight and can cause severe performance degradation. The upcoming #AC split lock detection mechanism will issue warnings for this kind of access. Force the alignment of the array to unsigned long. This avoids the massive code changes which would be required when converting the array data type to unsigned long. [ tglx: Rewrote changelog so it contains information WHY this is required ] Suggested-by: David Laight Suggested-by: Thomas Gleixner Signed-off-by: Fenghua Yu Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190916223958.27048-4-tony.luck@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/processor.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 584414a798d5..00389a69f807 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -93,7 +93,15 @@ struct cpuinfo_x86 { __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ int cpuid_level; - __u32 x86_capability[NCAPINTS + NBUGINTS]; + /* + * Align to size of unsigned long because the x86_capability array + * is passed to bitops which require the alignment. Use unnamed + * union to enforce the array is aligned to size of unsigned long. + */ + union { + __u32 x86_capability[NCAPINTS + NBUGINTS]; + unsigned long x86_capability_alignment; + }; char x86_vendor_id[16]; char x86_model_id[64]; /* in KB - valid for CPUS which support this call: */ -- Gitee From 3f907414fa93c2acf10cb3b3eab062682535b519 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Sun, 26 Jan 2020 12:05:35 -0800 Subject: [PATCH 1985/2161] x86/split_lock: Enable split lock detection by kernel commit 6650cdd9a8ccf00555dbbe743d58541ad8feb6a7 upstream. A split-lock occurs when an atomic instruction operates on data that spans two cache lines. In order to maintain atomicity the core takes a global bus lock. This is typically >1000 cycles slower than an atomic operation within a cache line. It also disrupts performance on other cores (which must wait for the bus lock to be released before their memory operations can complete). For real-time systems this may mean missing deadlines. For other systems it may just be very annoying. Some CPUs have the capability to raise an #AC trap when a split lock is attempted. Provide a command line option to give the user choices on how to handle this: split_lock_detect= off - not enabled (no traps for split locks) warn - warn once when an application does a split lock, but allow it to continue running. fatal - Send SIGBUS to applications that cause split lock On systems that support split lock detection the default is "warn". Note that if the kernel hits a split lock in any mode other than "off" it will OOPs. One implementation wrinkle is that the MSR to control the split lock detection is per-core, not per thread. This might result in some short lived races on HT systems in "warn" mode if Linux tries to enable on one thread while disabling on the other. Race analysis by Sean Christopherson: - Toggling of split-lock is only done in "warn" mode. Worst case scenario of a race is that a misbehaving task will generate multiple #AC exceptions on the same instruction. 
And this race will only occur if both siblings are running tasks that generate split-lock #ACs, e.g. a race where sibling threads are writing different values will only occur if CPUx is disabling split-lock after an #AC and CPUy is re-enabling split-lock after *its* previous task generated an #AC. - Transitioning between off/warn/fatal modes at runtime isn't supported and disabling is tracked per task, so hardware will always reach a steady state that matches the configured mode. I.e. split-lock is guaranteed to be enabled in hardware once all _TIF_SLD threads have been scheduled out. Signed-off-by: Peter Zijlstra (Intel) Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Co-developed-by: Tony Luck Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20200126200535.GB30377@agluck-desk2.amr.corp.intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../admin-guide/kernel-parameters.txt | 22 +++ arch/x86/include/asm/cpu.h | 12 ++ arch/x86/include/asm/cpufeatures.h | 2 + arch/x86/include/asm/msr-index.h | 9 + arch/x86/include/asm/thread_info.h | 4 +- arch/x86/kernel/cpu/common.c | 2 + arch/x86/kernel/cpu/intel.c | 175 ++++++++++++++++++ arch/x86/kernel/process.c | 3 + arch/x86/kernel/traps.c | 24 ++- 9 files changed, 250 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index cf2c0aa40f18..c2dbe1378dc1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4633,6 +4633,28 @@ qspinlock.numa_spinlock_threshold_ns= [NUMA, PV_OPS] off: Disable mitigation and remove performance impact to RDRAND and RDSEED + split_lock_detect= + [X86] Enable split lock detection + + When enabled (and if hardware support is present), atomic + instructions that access data across cache line + boundaries will result in an alignment check exception. + + off - not enabled + + warn - the kernel will emit rate limited warnings + about applications triggering the #AC + exception. This mode is the default on CPUs + that supports split lock detection. + + fatal - the kernel will send SIGBUS to applications + that trigger the #AC exception. + + If an #AC exception is hit in the kernel or in + firmware (i.e. not while executing in user mode) + the kernel will oops in either "warn" or "fatal" + mode. 
+ srcutree.counter_wrap_check [KNL] Specifies how frequently to check for grace-period sequence counter wrap for the diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index adc6cc86b062..ff6f3ca649b3 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -40,4 +40,16 @@ int mwait_usable(const struct cpuinfo_x86 *); unsigned int x86_family(unsigned int sig); unsigned int x86_model(unsigned int sig); unsigned int x86_stepping(unsigned int sig); +#ifdef CONFIG_CPU_SUP_INTEL +extern void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c); +extern void switch_to_sld(unsigned long tifn); +extern bool handle_user_split_lock(struct pt_regs *regs, long error_code); +#else +static inline void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) {} +static inline void switch_to_sld(unsigned long tifn) {} +static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code) +{ + return false; +} +#endif #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 3772f818d378..44210a33b1e0 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -286,6 +286,7 @@ #define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ +#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ @@ -379,6 +380,7 @@ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ +#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ /* diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1cc4ddcb9200..f59cd500a5d3 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -41,6 +41,10 @@ /* Intel MSRs. 
Some also available on other CPUs */ +#define MSR_TEST_CTRL 0x00000033 +#define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT 29 +#define MSR_TEST_CTRL_SPLIT_LOCK_DETECT BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT) + #define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ #define SPEC_CTRL_IBRS BIT(0) /* Indirect Branch Restricted Speculation */ #define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */ @@ -70,6 +74,11 @@ */ #define MSR_IA32_UMWAIT_CONTROL_TIME_MASK (~0x03U) +/* Abbreviated from Intel SDM name IA32_CORE_CAPABILITIES */ +#define MSR_IA32_CORE_CAPS 0x000000cf +#define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT 5 +#define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT BIT(MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT) + #define MSR_PKG_CST_CONFIG_CONTROL 0x000000e2 #define NHM_C3_AUTO_DEMOTE (1UL << 25) #define NHM_C1_AUTO_DEMOTE (1UL << 26) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a4de7aa7500f..20a1462810b1 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -92,6 +92,7 @@ struct thread_info { #define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ +#define TIF_SLD 18 /* Restore split lock detection on context switch */ #define TIF_NOHZ 19 /* in adaptive nohz mode */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ @@ -122,6 +123,7 @@ struct thread_info { #define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) +#define _TIF_SLD (1 << TIF_SLD) #define _TIF_NOHZ (1 << TIF_NOHZ) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) @@ -145,7 +147,7 @@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW_BASE \ (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP| \ - _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE) + _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE | _TIF_SLD) /* * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated. diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 77b4f1f3c943..a27f556248fd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1322,6 +1322,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_set_bug_bits(c); + cpu_set_core_cap_bits(c); + fpu__init_system(c); init_sigframe_size(); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 91fd4b7c25f8..087bb215aeb4 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #ifdef CONFIG_X86_64 @@ -32,6 +34,19 @@ #include #endif +enum split_lock_detect_state { + sld_off = 0, + sld_warn, + sld_fatal, +}; + +/* + * Default to sld_off because most systems do not support split lock detection + * split_lock_setup() will switch this to sld_warn on systems that support + * split lock detect, unless there is a command line override. + */ +static enum split_lock_detect_state sld_state = sld_off; + /* * Just in case our CPU detection goes bad, or you have a weird system, * allow a way to override the automatic disabling of MPX. 
@@ -658,6 +673,8 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); } +static void split_lock_init(void); + static void init_intel(struct cpuinfo_x86 *c) { early_init_intel(c); @@ -775,6 +792,8 @@ static void init_intel(struct cpuinfo_x86 *c) tsx_enable(); if (tsx_ctrl_state == TSX_CTRL_DISABLE) tsx_disable(); + + split_lock_init(); } #ifdef CONFIG_X86_32 @@ -1037,3 +1056,159 @@ static const struct cpu_dev intel_cpu_dev = { }; cpu_dev_register(intel_cpu_dev); + +#undef pr_fmt +#define pr_fmt(fmt) "x86/split lock detection: " fmt + +static const struct { + const char *option; + enum split_lock_detect_state state; +} sld_options[] __initconst = { + { "off", sld_off }, + { "warn", sld_warn }, + { "fatal", sld_fatal }, +}; + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt); + + return len == arglen && !strncmp(arg, opt, len); +} + +static void __init split_lock_setup(void) +{ + char arg[20]; + int i, ret; + + setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); + sld_state = sld_warn; + + ret = cmdline_find_option(boot_command_line, "split_lock_detect", + arg, sizeof(arg)); + if (ret >= 0) { + for (i = 0; i < ARRAY_SIZE(sld_options); i++) { + if (match_option(arg, ret, sld_options[i].option)) { + sld_state = sld_options[i].state; + break; + } + } + } + + switch (sld_state) { + case sld_off: + pr_info("disabled\n"); + break; + + case sld_warn: + pr_info("warning about user-space split_locks\n"); + break; + + case sld_fatal: + pr_info("sending SIGBUS on user-space split_locks\n"); + break; + } +} + +/* + * Locking is not required at the moment because only bit 29 of this + * MSR is implemented and locking would not prevent that the operation + * of one thread is immediately undone by the sibling thread. + * Use the "safe" versions of rdmsr/wrmsr here because although code + * checks CPUID and MSR bits to make sure the TEST_CTRL MSR should + * exist, there may be glitches in virtualization that leave a guest + * with an incorrect view of real h/w capabilities. + */ +static bool __sld_msr_set(bool on) +{ + u64 test_ctrl_val; + + if (rdmsrl_safe(MSR_TEST_CTRL, &test_ctrl_val)) + return false; + + if (on) + test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + else + test_ctrl_val &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + + return !wrmsrl_safe(MSR_TEST_CTRL, test_ctrl_val); +} + +static void split_lock_init(void) +{ + if (sld_state == sld_off) + return; + + if (__sld_msr_set(true)) + return; + + /* + * If this is anything other than the boot-cpu, you've done + * funny things and you get to keep whatever pieces. + */ + pr_warn("MSR fail -- disabled\n"); + sld_state = sld_off; +} + +bool handle_user_split_lock(struct pt_regs *regs, long error_code) +{ + if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) + return false; + + pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", + current->comm, current->pid, regs->ip); + + /* + * Disable the split lock detection for this task so it can make + * progress and set TIF_SLD so the detection is re-enabled via + * switch_to_sld() when the task is scheduled out. + */ + __sld_msr_set(false); + set_tsk_thread_flag(current, TIF_SLD); + return true; +} + +/* + * This function is called only when switching between tasks with + * different split-lock detection modes. It sets the MSR for the + * mode of the new task. 
This is right most of the time, but since + * the MSR is shared by hyperthreads on a physical core there can + * be glitches when the two threads need different modes. + */ +void switch_to_sld(unsigned long tifn) +{ + __sld_msr_set(!(tifn & _TIF_SLD)); +} + +#define SPLIT_LOCK_CPU(model) {X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY} + +/* + * The following processors have the split lock detection feature. But + * since they don't have the IA32_CORE_CAPABILITIES MSR, the feature cannot + * be enumerated. Enable it by family and model matching on these + * processors. + */ +static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { + SPLIT_LOCK_CPU(INTEL_FAM6_ICELAKE_X), + SPLIT_LOCK_CPU(INTEL_FAM6_ICELAKE_L), + {} +}; + +void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) +{ + u64 ia32_core_caps = 0; + + if (c->x86_vendor != X86_VENDOR_INTEL) + return; + if (cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) { + /* Enumerate features reported in IA32_CORE_CAPABILITIES MSR. */ + rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); + } else if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + /* Enumerate split lock detection by family and model. */ + if (x86_match_cpu(split_lock_cpu_ids)) + ia32_core_caps |= MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT; + } + + if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) + split_lock_setup(); +} diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 976d24f03b65..a3fb6e2480b6 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -653,6 +653,9 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) /* Enforce MSR update to ensure consistent state */ __speculation_ctrl_update(~tifn, tifn); } + + if ((tifp ^ tifn) & _TIF_SLD) + switch_to_sld(tifn); } /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9b741cd9f778..7bddac52e146 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -181,7 +182,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, { struct task_struct *tsk = current; - if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) return; @@ -242,9 +242,29 @@ DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overru DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS) DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present) DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment) -DO_ERROR(X86_TRAP_AC, SIGBUS, BUS_ADRALN, NULL, "alignment check", alignment_check) #undef IP +dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code) +{ + char *str = "alignment check"; + + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); + + if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP) + return; + + if (!user_mode(regs)) + die("Split lock detected\n", regs, error_code); + + local_irq_enable(); + + if (handle_user_split_lock(regs, error_code)) + return; + + do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs, + error_code, BUS_ADRALN, NULL); +} + #ifdef CONFIG_VMAP_STACK __visible void __noreturn handle_stack_overflow(const char *message, struct pt_regs *regs, -- Gitee From 84e96c333ed1813e51ba4b1bec9dcb62eb4400f2 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 25 Mar 2020 11:09:23 +0800 Subject: [PATCH 1986/2161] x86/split_lock: Rework the initialization flow of split lock detection commit dbaba47085b0c2aa793ce849750164bd3765e163 
upstream. Current initialization flow of split lock detection has following issues: 1. It assumes the initial value of MSR_TEST_CTRL.SPLIT_LOCK_DETECT to be zero. However, it's possible that BIOS/firmware has set it. 2. X86_FEATURE_SPLIT_LOCK_DETECT flag is unconditionally set even if there is a virtualization flaw that FMS indicates the existence while it's actually not supported. Rework the initialization flow to solve above issues. In detail, explicitly clear and set split_lock_detect bit to verify MSR_TEST_CTRL can be accessed, and rdmsr after wrmsr to ensure bit is cleared/set successfully. X86_FEATURE_SPLIT_LOCK_DETECT flag is set only when the feature does exist and the feature is not disabled with kernel param "split_lock_detect=off" On each processor, explicitly updating the SPLIT_LOCK_DETECT bit based on sld_sate in split_lock_init() since BIOS/firmware may touch it. Originally-by: Thomas Gleixner Signed-off-by: Xiaoyao Li Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200325030924.132881-2-xiaoyao.li@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/intel.c | 75 +++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 087bb215aeb4..5454182eb931 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -45,7 +45,7 @@ enum split_lock_detect_state { * split_lock_setup() will switch this to sld_warn on systems that support * split lock detect, unless there is a command line override. */ -static enum split_lock_detect_state sld_state = sld_off; +static enum split_lock_detect_state sld_state __ro_after_init = sld_off; /* * Just in case our CPU detection goes bad, or you have a weird system, @@ -1076,78 +1076,87 @@ static inline bool match_option(const char *arg, int arglen, const char *opt) return len == arglen && !strncmp(arg, opt, len); } +static bool split_lock_verify_msr(bool on) +{ + u64 ctrl, tmp; + + if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) + return false; + if (on) + ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + else + ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) + return false; + rdmsrl(MSR_TEST_CTRL, tmp); + return ctrl == tmp; +} + static void __init split_lock_setup(void) { + enum split_lock_detect_state state = sld_warn; char arg[20]; int i, ret; - setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); - sld_state = sld_warn; + if (!split_lock_verify_msr(false)) { + pr_info("MSR access failed: Disabled\n"); + return; + } ret = cmdline_find_option(boot_command_line, "split_lock_detect", arg, sizeof(arg)); if (ret >= 0) { for (i = 0; i < ARRAY_SIZE(sld_options); i++) { if (match_option(arg, ret, sld_options[i].option)) { - sld_state = sld_options[i].state; + state = sld_options[i].state; break; } } } - switch (sld_state) { + switch (state) { case sld_off: pr_info("disabled\n"); - break; - + return; case sld_warn: pr_info("warning about user-space split_locks\n"); break; - case sld_fatal: pr_info("sending SIGBUS on user-space split_locks\n"); break; } + + if (!split_lock_verify_msr(true)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + sld_state = state; + setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); } /* - * Locking is not required at the moment because only bit 29 of this - * MSR is implemented and locking would not prevent that the operation - * of one thread is immediately undone by the sibling thread. 
- * Use the "safe" versions of rdmsr/wrmsr here because although code - * checks CPUID and MSR bits to make sure the TEST_CTRL MSR should - * exist, there may be glitches in virtualization that leave a guest - * with an incorrect view of real h/w capabilities. + * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking + * is not implemented as one thread could undo the setting of the other + * thread immediately after dropping the lock anyway. */ -static bool __sld_msr_set(bool on) +static void sld_update_msr(bool on) { u64 test_ctrl_val; - if (rdmsrl_safe(MSR_TEST_CTRL, &test_ctrl_val)) - return false; + rdmsrl(MSR_TEST_CTRL, test_ctrl_val); if (on) test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; else test_ctrl_val &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - return !wrmsrl_safe(MSR_TEST_CTRL, test_ctrl_val); + wrmsrl(MSR_TEST_CTRL, test_ctrl_val); } static void split_lock_init(void) { - if (sld_state == sld_off) - return; - - if (__sld_msr_set(true)) - return; - - /* - * If this is anything other than the boot-cpu, you've done - * funny things and you get to keep whatever pieces. - */ - pr_warn("MSR fail -- disabled\n"); - sld_state = sld_off; + split_lock_verify_msr(sld_state != sld_off); } bool handle_user_split_lock(struct pt_regs *regs, long error_code) @@ -1163,7 +1172,7 @@ bool handle_user_split_lock(struct pt_regs *regs, long error_code) * progress and set TIF_SLD so the detection is re-enabled via * switch_to_sld() when the task is scheduled out. */ - __sld_msr_set(false); + sld_update_msr(false); set_tsk_thread_flag(current, TIF_SLD); return true; } @@ -1177,7 +1186,7 @@ bool handle_user_split_lock(struct pt_regs *regs, long error_code) */ void switch_to_sld(unsigned long tifn) { - __sld_msr_set(!(tifn & _TIF_SLD)); + sld_update_msr(!(tifn & _TIF_SLD)); } #define SPLIT_LOCK_CPU(model) {X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY} -- Gitee From 6a3de55a7b1515b00606ca7582d53fdba67c4074 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 25 Mar 2020 11:09:24 +0800 Subject: [PATCH 1987/2161] x86/split_lock: Avoid runtime reads of the TEST_CTRL MSR commit a6a60741035bb48ca8d9f92a138958818148064c upstream. In a context switch from a task that is detecting split locks to one that is not (or vice versa) we need to update the TEST_CTRL MSR. Currently this is done with the common sequence: read the MSR flip the bit write the MSR in order to avoid changing the value of any reserved bits in the MSR. Cache unused and reserved bits of TEST_CTRL MSR with SPLIT_LOCK_DETECT bit cleared during initialization, so we can avoid an expensive RDMSR instruction during context switch. Suggested-by: Sean Christopherson Originally-by: Tony Luck Signed-off-by: Xiaoyao Li Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200325030924.132881-3-xiaoyao.li@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/intel.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 5454182eb931..c95efe5ddb60 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -46,6 +46,7 @@ enum split_lock_detect_state { * split lock detect, unless there is a command line override. 
*/ static enum split_lock_detect_state sld_state __ro_after_init = sld_off; +static u64 msr_test_ctrl_cache __ro_after_init; /* * Just in case our CPU detection goes bad, or you have a weird system, @@ -1126,6 +1127,8 @@ static void __init split_lock_setup(void) break; } + rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + if (!split_lock_verify_msr(true)) { pr_info("MSR access failed: Disabled\n"); return; @@ -1142,14 +1145,10 @@ static void __init split_lock_setup(void) */ static void sld_update_msr(bool on) { - u64 test_ctrl_val; - - rdmsrl(MSR_TEST_CTRL, test_ctrl_val); + u64 test_ctrl_val = msr_test_ctrl_cache; if (on) test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - else - test_ctrl_val &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; wrmsrl(MSR_TEST_CTRL, test_ctrl_val); } -- Gitee From 9ca1a3925a192d882d2589ef465342833eec148c Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 16 Apr 2020 13:57:52 -0700 Subject: [PATCH 1988/2161] x86/split_lock: Update to use X86_MATCH_INTEL_FAM6_MODEL() commit 3ab0762d1edfda6ccbc08f636acab42c103c299f upstream. The SPLIT_LOCK_CPU() macro escaped the tree-wide sweep for old-style initialization. Update to use X86_MATCH_INTEL_FAM6_MODEL(). Fixes: 6650cdd9a8cc ("x86/split_lock: Enable split lock detection by kernel") Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200416205754.21177-2-tony.luck@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/intel.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c95efe5ddb60..33264650f219 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1188,8 +1188,6 @@ void switch_to_sld(unsigned long tifn) sld_update_msr(!(tifn & _TIF_SLD)); } -#define SPLIT_LOCK_CPU(model) {X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY} - /* * The following processors have the split lock detection feature. But * since they don't have the IA32_CORE_CAPABILITIES MSR, the feature cannot @@ -1197,8 +1195,8 @@ void switch_to_sld(unsigned long tifn) * processors. */ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { - SPLIT_LOCK_CPU(INTEL_FAM6_ICELAKE_X), - SPLIT_LOCK_CPU(INTEL_FAM6_ICELAKE_L), + INTEL_CPU_FAM6(ICELAKE_X, 0), + INTEL_CPU_FAM6(ICELAKE_L, 0), {} }; -- Gitee From 28bc4c7d0f002ecf4afbc574e8bb10f5e0235949 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 16 Apr 2020 13:57:53 -0700 Subject: [PATCH 1989/2161] x86/split_lock: Bits in IA32_CORE_CAPABILITIES are not architectural commit 48fd5b5ee714714f4cf9f9e1cba3b49b1fd40ed6 upstream. The Intel Software Developers' Manual erroneously listed bit 5 of the IA32_CORE_CAPABILITIES register as an architectural feature. It is not. Features enumerated by IA32_CORE_CAPABILITIES are model specific and implementation details may vary in different cpu models. Thus it is only safe to trust features after checking the CPU model. 
Icelake client and server models are known to implement the split lock detect feature even though they don't enumerate IA32_CORE_CAPABILITIES [ tglx: Use switch() for readability and massage comments ] Fixes: 6650cdd9a8cc ("x86/split_lock: Enable split lock detection by kernel") Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200416205754.21177-3-tony.luck@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/intel.c | 45 +++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 33264650f219..91b87134e4a9 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1189,10 +1189,17 @@ void switch_to_sld(unsigned long tifn) } /* - * The following processors have the split lock detection feature. But - * since they don't have the IA32_CORE_CAPABILITIES MSR, the feature cannot - * be enumerated. Enable it by family and model matching on these - * processors. + * Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should + * only be trusted if it is confirmed that a CPU model implements a + * specific feature at a particular bit position. + * + * The possible driver data field values: + * + * - 0: CPU models that are known to have the per-core split-lock detection + * feature even though they do not enumerate IA32_CORE_CAPABILITIES. + * + * - 1: CPU models which may enumerate IA32_CORE_CAPABILITIES and if so use + * bit 5 to enumerate the per-core split-lock detection feature. */ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { INTEL_CPU_FAM6(ICELAKE_X, 0), @@ -1202,19 +1209,29 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) { - u64 ia32_core_caps = 0; + const struct x86_cpu_id *m; + u64 ia32_core_caps; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; - if (c->x86_vendor != X86_VENDOR_INTEL) + m = x86_match_cpu(split_lock_cpu_ids); + if (!m) return; - if (cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) { - /* Enumerate features reported in IA32_CORE_CAPABILITIES MSR. */ + + switch (m->driver_data) { + case 0: + break; + case 1: + if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) + return; rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); - } else if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) { - /* Enumerate split lock detection by family and model. */ - if (x86_match_cpu(split_lock_cpu_ids)) - ia32_core_caps |= MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT; + if (!(ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)) + return; + break; + default: + return; } - if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) - split_lock_setup(); + split_lock_setup(); } -- Gitee From 8d83fd3c0492ef39e0d584db75fad0661acc7c4a Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 18 Aug 2022 17:24:32 +0800 Subject: [PATCH 1990/2161] x86/split_lock: Add Tremont family CPU models commit 8b9a18a9f2494144fe23fe630d0734310fa65301 upstream. Tremont CPUs support IA32_CORE_CAPABILITIES bits to indicate whether specific SKUs have support for split lock detection. 
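For reference, the enumeration can be spot-checked from user space with the generic msr driver (a sketch only; it needs root and "modprobe msr", the MSR number 0xcf and bit 5 come from the definitions added earlier in this series, and as noted above the bit is only meaningful on models known to implement it):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_IA32_CORE_CAPS              0xcf
#define CORE_CAPS_SPLIT_LOCK_DETECT_BIT 5

int main(void)
{
        uint64_t caps = 0;
        int fd = open("/dev/cpu/0/msr", O_RDONLY);  /* needs root + msr module */

        if (fd < 0) {
                perror("open /dev/cpu/0/msr");
                return 1;
        }
        /* The msr driver maps the file offset to the MSR index. */
        if (pread(fd, &caps, sizeof(caps), MSR_IA32_CORE_CAPS) != sizeof(caps)) {
                perror("rdmsr IA32_CORE_CAPABILITIES");
                close(fd);
                return 1;
        }
        close(fd);

        printf("IA32_CORE_CAPABILITIES = %#llx, split lock detect bit: %s\n",
               (unsigned long long)caps,
               caps & (1ULL << CORE_CAPS_SPLIT_LOCK_DETECT_BIT) ? "set" : "clear");
        return 0;
}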
Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200416205754.21177-4-tony.luck@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/intel.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 91b87134e4a9..e3806fb62db8 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1204,6 +1204,9 @@ void switch_to_sld(unsigned long tifn) static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { INTEL_CPU_FAM6(ICELAKE_X, 0), INTEL_CPU_FAM6(ICELAKE_L, 0), + INTEL_CPU_FAM6(ATOM_TREMONT, 1), + INTEL_CPU_FAM6(ATOM_TREMONT_D, 1), + INTEL_CPU_FAM6(ATOM_TREMONT_L, 1), {} }; -- Gitee From 88833a3ef70b3f0e51121e8e139fb26b86ceaa5e Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 18 Aug 2022 17:35:40 +0800 Subject: [PATCH 1991/2161] x86/split_lock: Add Icelake microserver and Tigerlake CPU models commit 429ac8b75a0b1c3478ffd584de8a63075cbe25e7 upstream. Icelake microserver CPU supports split lock detection while it doesn't have the split lock enumeration bit in IA32_CORE_CAPABILITIES. Tigerlake CPUs do enumerate the MSR. [ bp: Merge the two model-adding patches into one. ] Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/1588290395-2677-1-git-send-email-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/intel.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index e3806fb62db8..c5e3033a4143 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1204,9 +1204,12 @@ void switch_to_sld(unsigned long tifn) static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { INTEL_CPU_FAM6(ICELAKE_X, 0), INTEL_CPU_FAM6(ICELAKE_L, 0), + INTEL_CPU_FAM6(ICELAKE_D, 0), INTEL_CPU_FAM6(ATOM_TREMONT, 1), INTEL_CPU_FAM6(ATOM_TREMONT_D, 1), INTEL_CPU_FAM6(ATOM_TREMONT_L, 1), + INTEL_CPU_FAM6(TIGERLAKE_L, 1), + INTEL_CPU_FAM6(TIGERLAKE, 1), {} }; -- Gitee From 2512e9cc2cee522c4533aadf90071dfd2d8c5f9c Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 18 Aug 2022 17:38:01 +0800 Subject: [PATCH 1992/2161] x86/split_lock: Enable the split lock feature on Sapphire Rapids and Alder Lake CPUs commit 3aae57f0c3ba57715cf89201b5a5f290684078a5 upstream. Add Sapphire Rapids and Alder Lake processors to CPU list to enumerate and enable the split lock feature. Signed-off-by: Fenghua Yu Signed-off-by: Ingo Molnar Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/1595634320-79689-1-git-send-email-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/intel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c5e3033a4143..d506b704bd9d 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1210,6 +1210,8 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { INTEL_CPU_FAM6(ATOM_TREMONT_L, 1), INTEL_CPU_FAM6(TIGERLAKE_L, 1), INTEL_CPU_FAM6(TIGERLAKE, 1), + INTEL_CPU_FAM6(SAPPHIRERAPIDS_X, 1), + INTEL_CPU_FAM6(ALDERLAKE, 1), {} }; -- Gitee From f3e5b0fad872068dc1da707e7a090dd4d49cf15c Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 16 Dec 2019 16:33:44 +0800 Subject: [PATCH 1993/2161] x86/cpu: Add Jasper Lake to Intel family commit b2d32af0bff402b4c1fce28311759dd1f6af058a upstream. Japser Lake is an Atom family processor. 
It uses Tremont cores and is targeted at mobile platforms. Reviewed-by: Tony Luck Signed-off-by: Zhang Rui Signed-off-by: Rafael J. Wysocki Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- arch/x86/include/asm/intel-family.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 1433791d9e6d..a113c16f7efd 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -120,6 +120,7 @@ #define INTEL_FAM6_ATOM_TREMONT_D 0x86 /* Jacobsville */ #define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */ +#define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */ /* Xeon Phi */ -- Gitee From 049732028e89da84be08560a79cf5ab234ba2c4d Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Thu, 18 Aug 2022 18:02:50 +0800 Subject: [PATCH 1994/2161] x86/split_lock: Fix compile error for macro INTEL_CPU_FAM6() commit opencloudos. Macro INTEL_CPU_FAM6() could not accept constant as .driver_data Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/intel.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index d506b704bd9d..6aef27e40b3a 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1202,16 +1202,16 @@ void switch_to_sld(unsigned long tifn) * bit 5 to enumerate the per-core split-lock detection feature. */ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { - INTEL_CPU_FAM6(ICELAKE_X, 0), - INTEL_CPU_FAM6(ICELAKE_L, 0), - INTEL_CPU_FAM6(ICELAKE_D, 0), - INTEL_CPU_FAM6(ATOM_TREMONT, 1), - INTEL_CPU_FAM6(ATOM_TREMONT_D, 1), - INTEL_CPU_FAM6(ATOM_TREMONT_L, 1), - INTEL_CPU_FAM6(TIGERLAKE_L, 1), - INTEL_CPU_FAM6(TIGERLAKE, 1), - INTEL_CPU_FAM6(SAPPHIRERAPIDS_X, 1), - INTEL_CPU_FAM6(ALDERLAKE, 1), + {X86_VENDOR_INTEL, 6, INTEL_FAM6_ICELAKE_X, X86_FEATURE_ANY, 0}, + {X86_VENDOR_INTEL, 6, INTEL_FAM6_ICELAKE_L, X86_FEATURE_ANY, 0}, + {X86_VENDOR_INTEL, 6, INTEL_FAM6_ICELAKE_D, X86_FEATURE_ANY, 0}, + {X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_TREMONT, X86_FEATURE_ANY, 1}, + {X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_TREMONT_D, X86_FEATURE_ANY, 1}, + {X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_TREMONT_L, X86_FEATURE_ANY, 1}, + {X86_VENDOR_INTEL, 6, INTEL_FAM6_TIGERLAKE_L, X86_FEATURE_ANY, 1}, + {X86_VENDOR_INTEL, 6, INTEL_FAM6_TIGERLAKE, X86_FEATURE_ANY, 1}, + {X86_VENDOR_INTEL, 6, INTEL_FAM6_SAPPHIRERAPIDS_X, X86_FEATURE_ANY, 1}, + {X86_VENDOR_INTEL, 6, INTEL_FAM6_ALDERLAKE, X86_FEATURE_ANY, 1}, {} }; -- Gitee From 4a3be718d945fd2a733293d013979f2ed0b22419 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 22 Mar 2021 13:53:23 +0000 Subject: [PATCH 1995/2161] x86/cpufeatures: Enumerate #DB for bus lock detection commit f21d4d3b97a8603567e5d4250bd75e8ebbd520af upstream. A bus lock is acquired through either a split locked access to writeback (WB) memory or any locked access to non-WB memory. This is typically >1000 cycles slower than an atomic operation within a cache line. It also disrupts performance on other cores. Some CPUs have the ability to notify the kernel by a #DB trap after a user instruction acquires a bus lock and is executed. This allows the kernel to enforce user application throttling or mitigation. Both breakpoint and bus lock can trigger the #DB trap in the same instruction and the ordering of handling them is the kernel #DB handler's choice. 
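For concreteness, the first of these cases, a split locked access to WB memory, can be produced from ordinary user space by a locked read-modify-write whose operand straddles a cache-line boundary. A minimal sketch, assuming 64-byte cache lines and a compiler that lowers the builtin to a lock-prefixed instruction on x86:

	#include <stdint.h>
	#include <stdlib.h>

	int main(void)
	{
		/* 128 bytes starting exactly at a 64-byte cache-line boundary. */
		uint8_t *buf = aligned_alloc(64, 128);

		if (!buf)
			return 1;

		/*
		 * A 4-byte operand placed at offset 62 covers bytes 62..65 and
		 * therefore straddles the boundary between the first and second
		 * cache line; the misalignment is deliberate.
		 */
		uint32_t *p = (uint32_t *)(buf + 62);

		/*
		 * On x86 this builtin becomes a lock-prefixed add, i.e. a split
		 * locked access to WB memory as described above.
		 */
		__atomic_fetch_add(p, 1, __ATOMIC_SEQ_CST);

		free(buf);
		return 0;
	}

Whether such an access raises #AC or #DB, and what the kernel then does with it, is what the split_lock_detect handling in the rest of this series decides.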
The CPU feature flag to be shown in /proc/cpuinfo will be "bus_lock_detect". Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/20210322135325.682257-2-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 44210a33b1e0..0778e487c780 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -349,6 +349,7 @@ #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ +#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* Bus Lock detect */ #define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ #define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ #define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ -- Gitee From a5e9197e54428240b56ffca14ea3b9b9fd7679af Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Apr 2020 13:54:00 +0200 Subject: [PATCH 1996/2161] x86/split_lock: Provide handle_guest_split_lock() commit d7e94dbdac1a40924626b0efc7ff530c8baf5e4a upstream. Without at least minimal handling for split lock detection induced #AC, VMX will just run into the same problem as the VMWare hypervisor, which was reported by Kenneth. It will inject the #AC blindly into the guest whether the guest is prepared or not. Provide a function for guest mode which acts depending on the host SLD mode. If mode == sld_warn, treat it like user space, i.e. emit a warning, disable SLD and mark the task accordingly. Otherwise force SIGBUS. [ bp: Add a !CPU_SUP_INTEL stub for handle_guest_split_lock(). 
] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Paolo Bonzini Link: https://lkml.kernel.org/r/20200410115516.978037132@linutronix.de Link: https://lkml.kernel.org/r/20200402123258.895628824@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/cpu.h | 6 ++++++ arch/x86/kernel/cpu/intel.c | 33 ++++++++++++++++++++++++++++----- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index ff6f3ca649b3..dd17c2da1af5 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -44,6 +44,7 @@ unsigned int x86_stepping(unsigned int sig); extern void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c); extern void switch_to_sld(unsigned long tifn); extern bool handle_user_split_lock(struct pt_regs *regs, long error_code); +extern bool handle_guest_split_lock(unsigned long ip); #else static inline void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) {} static inline void switch_to_sld(unsigned long tifn) {} @@ -51,5 +52,10 @@ static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code) { return false; } + +static inline bool handle_guest_split_lock(unsigned long ip) +{ + return false; +} #endif #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 6aef27e40b3a..ec52546e45fd 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #ifdef CONFIG_X86_64 @@ -1158,13 +1159,10 @@ static void split_lock_init(void) split_lock_verify_msr(sld_state != sld_off); } -bool handle_user_split_lock(struct pt_regs *regs, long error_code) +static void split_lock_warn(unsigned long ip) { - if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) - return false; - pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", - current->comm, current->pid, regs->ip); + current->comm, current->pid, ip); /* * Disable the split lock detection for this task so it can make @@ -1173,6 +1171,31 @@ bool handle_user_split_lock(struct pt_regs *regs, long error_code) */ sld_update_msr(false); set_tsk_thread_flag(current, TIF_SLD); +} + +bool handle_guest_split_lock(unsigned long ip) +{ + if (sld_state == sld_warn) { + split_lock_warn(ip); + return true; + } + + pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n", + current->comm, current->pid, + sld_state == sld_fatal ? "fatal" : "bogus", ip); + + current->thread.error_code = 0; + current->thread.trap_nr = X86_TRAP_AC; + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + return false; +} +EXPORT_SYMBOL_GPL(handle_guest_split_lock); + +bool handle_user_split_lock(struct pt_regs *regs, long error_code) +{ + if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) + return false; + split_lock_warn(regs->ip); return true; } -- Gitee From d377260ebfdf1120453b324d2d24afe1d85d58a2 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Fri, 10 Apr 2020 13:54:01 +0200 Subject: [PATCH 1997/2161] KVM: x86: Emulate split-lock access as a write in emulator commit 9de6fe3c28d6d8feadfad907961f1f31b85c6985 upstream. Emulate split-lock accesses as writes if split lock detection is on to avoid #AC during emulation, which will result in a panic(). This should never occur for a well-behaved guest, but a malicious guest can manipulate the TLB to trigger emulation of a locked instruction[1]. More discussion can be found at [2][3]. 
[1] https://lkml.kernel.org/r/8c5b11c9-58df-38e7-a514-dc12d687b198@redhat.com [2] https://lkml.kernel.org/r/20200131200134.GD18946@linux.intel.com [3] https://lkml.kernel.org/r/20200227001117.GX9940@linux.intel.com Suggested-by: Sean Christopherson Signed-off-by: Xiaoyao Li Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Paolo Bonzini Link: https://lkml.kernel.org/r/20200410115517.084300242@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/x86.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 36bf0fdcdd29..b3e051858689 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6087,6 +6087,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, { struct kvm_host_map map; struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + u64 page_line_mask; gpa_t gpa; char *kaddr; bool exchanged; @@ -6101,7 +6102,16 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto emul_write; - if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) + /* + * Emulate the atomic as a straight write to avoid #AC if SLD is + * enabled in the host and the access splits a cache line. + */ + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + page_line_mask = ~(cache_line_size() - 1); + else + page_line_mask = PAGE_MASK; + + if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask)) goto emul_write; if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map)) -- Gitee From 9bff53565eae4c68d28d6db4313a688a8b2b41b2 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Fri, 10 Apr 2020 13:54:02 +0200 Subject: [PATCH 1998/2161] KVM: VMX: Extend VMXs #AC interceptor to handle split lock #AC in guest commit e6f8b6c12f03818baacc5f504fe83fa5e20771d6 upstream. Two types of #AC can be generated in Intel CPUs: 1. legacy alignment check #AC 2. split lock #AC Reflect #AC back into the guest if the guest has legacy alignment checks enabled or if split lock detection is disabled. If the #AC is not a legacy one and split lock detection is enabled, then invoke handle_guest_split_lock() which will either warn and disable split lock detection for this task or force SIGBUS on it. [ tglx: Switch it to handle_guest_split_lock() and rename the misnamed helper function. ] Suggested-by: Sean Christopherson Signed-off-by: Xiaoyao Li Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Paolo Bonzini Link: https://lkml.kernel.org/r/20200410115517.176308876@linutronix.de Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmx.c | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8e169eadaeff..e09c531a1ebe 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4725,6 +4725,26 @@ static int handle_machine_check(struct kvm_vcpu *vcpu) return 1; } +/* + * If the host has split lock detection disabled, then #AC is + * unconditionally injected into the guest, which is the pre split lock + * detection behaviour. 
+ * + * If the host has split lock detection enabled then #AC is + * only injected into the guest when: + * - Guest CPL == 3 (user mode) + * - Guest has #AC detection enabled in CR0 + * - Guest EFLAGS has AC bit set + */ +static inline bool guest_inject_ac(struct kvm_vcpu *vcpu) +{ + if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + return true; + + return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) && + (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); +} + static int handle_exception_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4801,9 +4821,6 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) return handle_rmode_exception(vcpu, ex_no, error_code); switch (ex_no) { - case AC_VECTOR: - kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); - return 1; case DB_VECTOR: dr6 = vmcs_readl(EXIT_QUALIFICATION); if (!(vcpu->guest_debug & @@ -4832,6 +4849,20 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; break; + case AC_VECTOR: + if (guest_inject_ac(vcpu)) { + kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); + return 1; + } + + /* + * Handle split lock. Depending on detection mode this will + * either warn and disable split lock detection for this + * task or force SIGBUS on it. + */ + if (handle_guest_split_lock(kvm_rip_read(vcpu))) + return 1; + fallthrough; default: kvm_run->exit_reason = KVM_EXIT_EXCEPTION; kvm_run->ex.exception = ex_no; -- Gitee From f6ab6b1fe8e84e6e422e81f1f93d1bac0f73858d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Jun 2020 12:26:05 -0700 Subject: [PATCH 1999/2161] x86/split_lock: Don't write MSR_TEST_CTRL on CPUs that aren't whitelisted commit 009bce1df0bb5eb970b9eb98d963861f7fe353c7 upstream. Choo! Choo! All aboard the Split Lock Express, with direct service to Wreckage! Skip split_lock_verify_msr() if the CPU isn't whitelisted as a possible SLD-enabled CPU model to avoid writing MSR_TEST_CTRL. MSR_TEST_CTRL exists, and is writable, on many generations of CPUs. Writing the MSR, even with '0', can result in bizarre, undocumented behavior. This fixes a crash on Haswell when resuming from suspend with a live KVM guest. Because APs use the standard SMP boot flow for resume, they will go through split_lock_init() and the subsequent RDMSR/WRMSR sequence, which runs even when sld_state==sld_off to ensure SLD is disabled. On Haswell (at least, my Haswell), writing MSR_TEST_CTRL with '0' will succeed and _may_ take the SMT _sibling_ out of VMX root mode. When KVM has an active guest, KVM performs VMXON as part of CPU onlining (see kvm_starting_cpu()). Because SMP boot is serialized, the resulting flow is effectively: on_each_ap_cpu() { WRMSR(MSR_TEST_CTRL, 0) VMXON } As a result, the WRMSR can disable VMX on a different CPU that has already done VMXON. This ultimately results in a #UD on VMPTRLD when KVM regains control and attempt run its vCPUs. The above voodoo was confirmed by reworking KVM's VMXON flow to write MSR_TEST_CTRL prior to VMXON, and to serialize the sequence as above. Further verification of the insanity was done by redoing VMXON on all APs after the initial WRMSR->VMXON sequence. The additional VMXON, which should VM-Fail, occasionally succeeded, and also eliminated the unexpected #UD on VMPTRLD. The damage done by writing MSR_TEST_CTRL doesn't appear to be limited to VMX, e.g. 
after suspend with an active KVM guest, subsequent reboots almost always hang (even when fudging VMXON), a #UD on a random Jcc was observed, suspend/resume stability is qualitatively poor, and so on and so forth. kernel BUG at arch/x86/kvm/x86.c:386! CPU: 1 PID: 2592 Comm: CPU 6/KVM Tainted: G D Hardware name: ASUS Q87M-E/Q87M-E, BIOS 1102 03/03/2014 RIP: 0010:kvm_spurious_fault+0xf/0x20 Call Trace: vmx_vcpu_load_vmcs+0x1fb/0x2b0 vmx_vcpu_load+0x3e/0x160 kvm_arch_vcpu_load+0x48/0x260 finish_task_switch+0x140/0x260 __schedule+0x460/0x720 _cond_resched+0x2d/0x40 kvm_arch_vcpu_ioctl_run+0x82e/0x1ca0 kvm_vcpu_ioctl+0x363/0x5c0 ksys_ioctl+0x88/0xa0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x4c/0x170 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: dbaba47085b0c ("x86/split_lock: Rework the initialization flow of split lock detection") Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200605192605.7439-1-sean.j.christopherson@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/intel.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index ec52546e45fd..69409c6f9e68 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -49,6 +49,13 @@ enum split_lock_detect_state { static enum split_lock_detect_state sld_state __ro_after_init = sld_off; static u64 msr_test_ctrl_cache __ro_after_init; +/* + * With a name like MSR_TEST_CTL it should go without saying, but don't touch + * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it + * on CPUs that do not support SLD can cause fireworks, even when writing '0'. + */ +static bool cpu_model_supports_sld __ro_after_init; + /* * Just in case our CPU detection goes bad, or you have a weird system, * allow a way to override the automatic disabling of MPX. @@ -1156,7 +1163,8 @@ static void sld_update_msr(bool on) static void split_lock_init(void) { - split_lock_verify_msr(sld_state != sld_off); + if (cpu_model_supports_sld) + split_lock_verify_msr(sld_state != sld_off); } static void split_lock_warn(unsigned long ip) @@ -1264,5 +1272,6 @@ void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) return; } + cpu_model_supports_sld = true; split_lock_setup(); } -- Gitee From b537c709c87fd211ee70e77eb232e728ec5b6a5e Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Mon, 22 Aug 2022 11:27:57 +0800 Subject: [PATCH 2000/2161] x86/traps: Handle #DB for bus lock commit ebb1064e7c2e90b56e4d40ab154ef9796060a1c3 upstream. Bus locks degrade performance for the whole system, not just for the CPU that requested the bus lock. Two CPU features "#AC for split lock" and "#DB for bus lock" provide hooks so that the operating system may choose one of several mitigation strategies. bus lock feature to cover additional situations with new options to mitigate. split_lock_detect= #AC for split lock #DB for bus lock off Do nothing Do nothing warn Kernel OOPs Warn once per task and Warn once per task and and continues to run. disable future checking When both features are supported, warn in #AC fatal Kernel OOPs Send SIGBUS to user. Send SIGBUS to user When both features are supported, fatal in #AC ratelimit:N Do nothing Limit bus lock rate to N per second in the current non-root user. Default option is "warn". Hardware only generates #DB for bus lock detect when CPL>0 to avoid nested #DB from multiple bus locks while the first #DB is being handled. 
So no need to handle #DB for bus lock detected in the kernel. while #AC for split lock is enabled by split lock detection bit 29 in TEST_CTRL MSR. Both breakpoint and bus lock in the same instruction can trigger one #DB. The bus lock is handled before the breakpoint in the #DB handler. Delivery of #DB for bus lock in userspace clears DR6[11], which is set by the #DB handler right after reading DR6. Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/20210322135325.682257-3-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/cpu.h | 7 +- arch/x86/include/asm/msr-index.h | 1 + arch/x86/include/uapi/asm/debugreg.h | 1 + arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/intel.c | 111 ++++++++++++++++++++++----- arch/x86/kernel/traps.c | 5 ++ 6 files changed, 105 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index dd17c2da1af5..fc18a4817f16 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -41,12 +41,13 @@ unsigned int x86_family(unsigned int sig); unsigned int x86_model(unsigned int sig); unsigned int x86_stepping(unsigned int sig); #ifdef CONFIG_CPU_SUP_INTEL -extern void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c); +extern void __init sld_setup(struct cpuinfo_x86 *c); extern void switch_to_sld(unsigned long tifn); extern bool handle_user_split_lock(struct pt_regs *regs, long error_code); extern bool handle_guest_split_lock(unsigned long ip); +extern void handle_bus_lock(struct pt_regs *regs); #else -static inline void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) {} +static inline void __init sld_setup(struct cpuinfo_x86 *c) {} static inline void switch_to_sld(unsigned long tifn) {} static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code) { @@ -57,5 +58,7 @@ static inline bool handle_guest_split_lock(unsigned long ip) { return false; } + +static inline void handle_bus_lock(struct pt_regs *regs) {} #endif #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f59cd500a5d3..8c5f11299c68 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -260,6 +260,7 @@ #define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ #define DEBUGCTLMSR_BTF_SHIFT 1 #define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ +#define DEBUGCTLMSR_BUS_LOCK_DETECT (1UL << 2) #define DEBUGCTLMSR_TR (1UL << 6) #define DEBUGCTLMSR_BTS (1UL << 7) #define DEBUGCTLMSR_BTINT (1UL << 8) diff --git a/arch/x86/include/uapi/asm/debugreg.h b/arch/x86/include/uapi/asm/debugreg.h index d95d080b30e3..0007ba077c0c 100644 --- a/arch/x86/include/uapi/asm/debugreg.h +++ b/arch/x86/include/uapi/asm/debugreg.h @@ -24,6 +24,7 @@ #define DR_TRAP3 (0x8) /* db3 */ #define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3) +#define DR_BUS_LOCK (0x800) /* bus_lock */ #define DR_STEP (0x4000) /* single-step */ #define DR_SWITCH (0x8000) /* task switch */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a27f556248fd..4ec0c7dc250d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1322,7 +1322,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_set_bug_bits(c); - cpu_set_core_cap_bits(c); + sld_setup(c); fpu__init_system(c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 69409c6f9e68..6a9bf2313ce5 100644 --- 
a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -42,9 +42,9 @@ enum split_lock_detect_state { }; /* - * Default to sld_off because most systems do not support split lock detection - * split_lock_setup() will switch this to sld_warn on systems that support - * split lock detect, unless there is a command line override. + * Default to sld_off because most systems do not support split lock detection. + * sld_state_setup() will switch this to sld_warn on systems that support + * split lock/bus lock detect, unless there is a command line override. */ static enum split_lock_detect_state sld_state __ro_after_init = sld_off; static u64 msr_test_ctrl_cache __ro_after_init; @@ -683,6 +683,7 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) } static void split_lock_init(void); +static void bus_lock_init(void); static void init_intel(struct cpuinfo_x86 *c) { @@ -803,6 +804,7 @@ static void init_intel(struct cpuinfo_x86 *c) tsx_disable(); split_lock_init(); + bus_lock_init(); } #ifdef CONFIG_X86_32 @@ -1101,16 +1103,15 @@ static bool split_lock_verify_msr(bool on) return ctrl == tmp; } -static void __init split_lock_setup(void) +static void __init sld_state_setup(void) { enum split_lock_detect_state state = sld_warn; char arg[20]; int i, ret; - if (!split_lock_verify_msr(false)) { - pr_info("MSR access failed: Disabled\n"); + if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) return; - } ret = cmdline_find_option(boot_command_line, "split_lock_detect", arg, sizeof(arg)); @@ -1122,17 +1123,14 @@ static void __init split_lock_setup(void) } } } + sld_state = state; +} - switch (state) { - case sld_off: - pr_info("disabled\n"); +static void __init __split_lock_setup(void) +{ + if (!split_lock_verify_msr(false)) { + pr_info("MSR access failed: Disabled\n"); return; - case sld_warn: - pr_info("warning about user-space split_locks\n"); - break; - case sld_fatal: - pr_info("sending SIGBUS on user-space split_locks\n"); - break; } rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); @@ -1142,7 +1140,9 @@ static void __init split_lock_setup(void) return; } - sld_state = state; + /* Restore the MSR to its cached value. */ + wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); } @@ -1199,6 +1199,29 @@ bool handle_guest_split_lock(unsigned long ip) } EXPORT_SYMBOL_GPL(handle_guest_split_lock); +static void bus_lock_init(void) +{ + u64 val; + + /* + * Warn and fatal are handled by #AC for split lock if #AC for + * split lock is supported. + */ + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) || + (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + (sld_state == sld_warn || sld_state == sld_fatal)) || + sld_state == sld_off) + return; + + /* + * Enable #DB for bus lock. All bus locks are handled in #DB except + * split locks are handled in #AC in the fatal case. 
+ */ + rdmsrl(MSR_IA32_DEBUGCTLMSR, val); + val |= DEBUGCTLMSR_BUS_LOCK_DETECT; + wrmsrl(MSR_IA32_DEBUGCTLMSR, val); +} + bool handle_user_split_lock(struct pt_regs *regs, long error_code) { if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) @@ -1207,6 +1230,21 @@ bool handle_user_split_lock(struct pt_regs *regs, long error_code) return true; } +void handle_bus_lock(struct pt_regs *regs) +{ + switch (sld_state) { + case sld_off: + break; + case sld_warn: + pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", + current->comm, current->pid, regs->ip); + break; + case sld_fatal: + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + break; + } +} + /* * This function is called only when switching between tasks with * different split-lock detection modes. It sets the MSR for the @@ -1246,7 +1284,7 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { {} }; -void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) +static void __init split_lock_setup(struct cpuinfo_x86 *c) { const struct x86_cpu_id *m; u64 ia32_core_caps; @@ -1273,5 +1311,40 @@ void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) } cpu_model_supports_sld = true; - split_lock_setup(); + __split_lock_setup(); +} + +static void sld_state_show(void) +{ + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + return; + + switch (sld_state) { + case sld_off: + pr_info("disabled\n"); + break; + case sld_warn: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); + else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + pr_info("#DB: warning on user-space bus_locks\n"); + break; + case sld_fatal: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n", + boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ? + " from non-WB" : ""); + } + break; + } +} + +void __init sld_setup(struct cpuinfo_x86 *c) +{ + split_lock_setup(c); + sld_state_setup(); + sld_state_show(); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7bddac52e146..2309527e5f5f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -817,6 +817,11 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; } + + /* #DB for bus lock can only be triggered from userspace. */ + if ((dr6 & DR_BUS_LOCK) && user_mode(regs)) + handle_bus_lock(regs); + si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(regs, error_code, si_code); -- Gitee From 65f7625096255b64cd41625623e227ec429fc8a3 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 22 Mar 2021 13:53:25 +0000 Subject: [PATCH 2001/2161] Documentation/admin-guide: Change doc for split_lock_detect parameter commit ebca17707e38f2050b188d837bd4646b29a1b0c2 upstream. Since #DB for bus lock detect changes the split_lock_detect parameter, update the documentation for the changes. 
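In practice the parameter documented here is passed on the kernel command line, e.g.:

	split_lock_detect=warn

One common way to make that persistent is appending it to GRUB_CMDLINE_LINUX in /etc/default/grub and regenerating the boot configuration; that step is an assumption about the boot loader, not something this patch prescribes. The other documented values, off and fatal (and, later in the series, ratelimit:N), are selected the same way.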
Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/20210322135325.682257-4-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../admin-guide/kernel-parameters.txt | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index c2dbe1378dc1..ea30d75a74bf 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4634,27 +4634,37 @@ qspinlock.numa_spinlock_threshold_ns= [NUMA, PV_OPS] performance impact to RDRAND and RDSEED split_lock_detect= - [X86] Enable split lock detection + [X86] Enable split lock detection or bus lock detection When enabled (and if hardware support is present), atomic instructions that access data across cache line - boundaries will result in an alignment check exception. + boundaries will result in an alignment check exception + for split lock detection or a debug exception for + bus lock detection. off - not enabled - warn - the kernel will emit rate limited warnings + warn - the kernel will emit rate-limited warnings about applications triggering the #AC - exception. This mode is the default on CPUs - that supports split lock detection. + exception or the #DB exception. This mode is + the default on CPUs that support split lock + detection or bus lock detection. Default + behavior is by #AC if both features are + enabled in hardware. fatal - the kernel will send SIGBUS to applications - that trigger the #AC exception. + that trigger the #AC exception or the #DB + exception. Default behavior is by #AC if + both features are enabled in hardware. If an #AC exception is hit in the kernel or in firmware (i.e. not while executing in user mode) the kernel will oops in either "warn" or "fatal" mode. + #DB exception for bus lock is triggered only when + CPL > 0. + srcutree.counter_wrap_check [KNL] Specifies how frequently to check for grace-period sequence counter wrap for the -- Gitee From e125e5f3705be05ed4ba5ae525adfd00f600b7ae Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 19 Apr 2021 21:49:55 +0000 Subject: [PATCH 2002/2161] Documentation/x86: Add buslock.rst commit 1897907cca5aa22cdfcdb7fb8f0644a6add0877d upstream. Add buslock.rst to explain bus lock problem and how to detect and handle it. [ tglx: Included it into index.rst and added the missing include ... ] Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/20210419214958.4035512-2-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/x86/buslock.rst | 104 ++++++++++++++++++++++++++++++++++ Documentation/x86/index.rst | 1 + 2 files changed, 105 insertions(+) create mode 100644 Documentation/x86/buslock.rst diff --git a/Documentation/x86/buslock.rst b/Documentation/x86/buslock.rst new file mode 100644 index 000000000000..159ff6ba830e --- /dev/null +++ b/Documentation/x86/buslock.rst @@ -0,0 +1,104 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. include:: + +=============================== +Bus lock detection and handling +=============================== + +:Copyright: |copy| 2021 Intel Corporation +:Authors: - Fenghua Yu + - Tony Luck + +Problem +======= + +A split lock is any atomic operation whose operand crosses two cache lines. 
+Since the operand spans two cache lines and the operation must be atomic, +the system locks the bus while the CPU accesses the two cache lines. + +A bus lock is acquired through either split locked access to writeback (WB) +memory or any locked access to non-WB memory. This is typically thousands of +cycles slower than an atomic operation within a cache line. It also disrupts +performance on other cores and brings the whole system to its knees. + +Detection +========= + +Intel processors may support either or both of the following hardware +mechanisms to detect split locks and bus locks. + +#AC exception for split lock detection +-------------------------------------- + +Beginning with the Tremont Atom CPU split lock operations may raise an +Alignment Check (#AC) exception when a split lock operation is attemped. + +#DB exception for bus lock detection +------------------------------------ + +Some CPUs have the ability to notify the kernel by an #DB trap after a user +instruction acquires a bus lock and is executed. This allows the kernel to +terminate the application or to enforce throttling. + +Software handling +================= + +The kernel #AC and #DB handlers handle bus lock based on the kernel +parameter "split_lock_detect". Here is a summary of different options: + ++------------------+----------------------------+-----------------------+ +|split_lock_detect=|#AC for split lock |#DB for bus lock | ++------------------+----------------------------+-----------------------+ +|off |Do nothing |Do nothing | ++------------------+----------------------------+-----------------------+ +|warn |Kernel OOPs |Warn once per task and | +|(default) |Warn once per task and |and continues to run. | +| |disable future checking | | +| |When both features are | | +| |supported, warn in #AC | | ++------------------+----------------------------+-----------------------+ +|fatal |Kernel OOPs |Send SIGBUS to user. | +| |Send SIGBUS to user | | +| |When both features are | | +| |supported, fatal in #AC | | ++------------------+----------------------------+-----------------------+ + +Usages +====== + +Detecting and handling bus lock may find usages in various areas: + +It is critical for real time system designers who build consolidated real +time systems. These systems run hard real time code on some cores and run +"untrusted" user processes on other cores. The hard real time cannot afford +to have any bus lock from the untrusted processes to hurt real time +performance. To date the designers have been unable to deploy these +solutions as they have no way to prevent the "untrusted" user code from +generating split lock and bus lock to block the hard real time code to +access memory during bus locking. + +It's also useful for general computing to prevent guests or user +applications from slowing down the overall system by executing instructions +with bus lock. + + +Guidance +======== +off +--- + +Disable checking for split lock and bus lock. This option can be useful if +there are legacy applications that trigger these events at a low rate so +that mitigation is not needed. + +warn +---- + +A warning is emitted when a bus lock is detected which allows to identify +the offending application. This is the default behavior. + +fatal +----- + +In this case, the bus lock is not tolerated and the process is killed. 
diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst index 8c748ee1726b..747b4c6a6f8e 100644 --- a/Documentation/x86/index.rst +++ b/Documentation/x86/index.rst @@ -28,6 +28,7 @@ x86-specific Documentation microcode resctrl tsx_async_abort + buslock usb-legacy-support i386/index x86_64/index -- Gitee From b1f3940356091398fca54d2bd89544eecebeb14d Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 19 Apr 2021 21:49:56 +0000 Subject: [PATCH 2003/2161] x86/bus_lock: Set rate limit for bus lock commit ef4ae6e4413159d2329a172c12e9274e2cb0a3a8 upstream. A bus lock can be thousands of cycles slower than atomic operation within one cache line. It also disrupts performance on other cores. Malicious users can generate multiple bus locks to degrade the whole system performance. The current mitigation is to kill the offending process, but for certain scenarios it's desired to identify and throttle the offending application. Add a system wide rate limit for bus locks. When the system detects bus locks at a rate higher than N/sec (where N can be set by the kernel boot argument in the range [1..1000]) any task triggering a bus lock will be forced to sleep for at least 20ms until the overall system rate of bus locks drops below the threshold. Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/20210419214958.4035512-3-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/cpu/intel.c | 42 +++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 6a9bf2313ce5..468783c45cb5 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -39,6 +40,7 @@ enum split_lock_detect_state { sld_off = 0, sld_warn, sld_fatal, + sld_ratelimit, }; /* @@ -1078,13 +1080,30 @@ static const struct { { "off", sld_off }, { "warn", sld_warn }, { "fatal", sld_fatal }, + { "ratelimit:", sld_ratelimit }, }; +static struct ratelimit_state bld_ratelimit; + static inline bool match_option(const char *arg, int arglen, const char *opt) { - int len = strlen(opt); + int len = strlen(opt), ratelimit; + + if (strncmp(arg, opt, len)) + return false; + + /* + * Min ratelimit is 1 bus lock/sec. + * Max ratelimit is 1000 bus locks/sec. + */ + if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 && + ratelimit > 0 && ratelimit <= 1000) { + ratelimit_state_init(&bld_ratelimit, HZ, ratelimit); + ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE); + return true; + } - return len == arglen && !strncmp(arg, opt, len); + return len == arglen; } static bool split_lock_verify_msr(bool on) @@ -1163,6 +1182,15 @@ static void sld_update_msr(bool on) static void split_lock_init(void) { + /* + * #DB for bus lock handles ratelimit and #AC for split lock is + * disabled. + */ + if (sld_state == sld_ratelimit) { + split_lock_verify_msr(false); + return; + } + if (cpu_model_supports_sld) split_lock_verify_msr(sld_state != sld_off); } @@ -1235,6 +1263,12 @@ void handle_bus_lock(struct pt_regs *regs) switch (sld_state) { case sld_off: break; + case sld_ratelimit: + /* Enforce no more than bld_ratelimit bus locks/sec. */ + while (!__ratelimit(&bld_ratelimit)) + msleep(20); + /* Warn on the bus lock. 
*/ + fallthrough; case sld_warn: pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", current->comm, current->pid, regs->ip); @@ -1339,6 +1373,10 @@ static void sld_state_show(void) " from non-WB" : ""); } break; + case sld_ratelimit: + if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst); + break; } } -- Gitee From 18ec23d97e50fef0e22a0d61b25e0c812136a49f Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 19 Apr 2021 21:49:57 +0000 Subject: [PATCH 2004/2161] Documentation/admin-guide: Add bus lock ratelimit commit 9d839c280b64817345c2fa462c0027a9bd742361 upstream. Since bus lock rate limit changes the split_lock_detect parameter, update the documentation for the change. Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/20210419214958.4035512-4-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/admin-guide/kernel-parameters.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index ea30d75a74bf..dc9b0d3c898c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4657,6 +4657,14 @@ qspinlock.numa_spinlock_threshold_ns= [NUMA, PV_OPS] exception. Default behavior is by #AC if both features are enabled in hardware. + ratelimit:N - + Set system wide rate limit to N bus locks + per second for bus lock detection. + 0 < N <= 1000. + + N/A for split lock detection. + + If an #AC exception is hit in the kernel or in firmware (i.e. not while executing in user mode) the kernel will oops in either "warn" or "fatal" -- Gitee From 691a968abc970a7a2533f03bc15f0ab4c0e48f78 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 19 Apr 2021 21:49:58 +0000 Subject: [PATCH 2005/2161] Documentation/x86: Add ratelimit in buslock.rst commit d28397eaf4c27947a1ffc720d42e8b3a33ae1e2a upstream. ratelimit is a new command line option for bus lock handling. Add proper documentation. [ tglx: Massaged documentation ] Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/20210419214958.4035512-5-fenghua.yu@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/x86/buslock.rst | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/Documentation/x86/buslock.rst b/Documentation/x86/buslock.rst index 159ff6ba830e..7c051e714943 100644 --- a/Documentation/x86/buslock.rst +++ b/Documentation/x86/buslock.rst @@ -63,6 +63,11 @@ parameter "split_lock_detect". Here is a summary of different options: | |When both features are | | | |supported, fatal in #AC | | +------------------+----------------------------+-----------------------+ +|ratelimit:N |Do nothing |Limit bus lock rate to | +|(0 < N <= 1000) | |N bus locks per second | +| | |system wide and warn on| +| | |bus locks. | ++------------------+----------------------------+-----------------------+ Usages ====== @@ -102,3 +107,20 @@ fatal ----- In this case, the bus lock is not tolerated and the process is killed. + +ratelimit +--------- + +A system wide bus lock rate limit N is specified where 0 < N <= 1000. This +allows a bus lock rate up to N bus locks per second. 
When the bus lock rate +is exceeded then any task which is caught via the buslock #DB exception is +throttled by enforced sleeps until the rate goes under the limit again. + +This is an effective mitigation in cases where a minimal impact can be +tolerated, but an eventual Denial of Service attack has to be prevented. It +allows to identify the offending processes and analyze whether they are +malicious or just badly written. + +Selecting a rate limit of 1000 allows the bus to be locked for up to about +seven million cycles each second (assuming 7000 cycles for each bus +lock). On a 2 GHz processor that would be about 0.35% system slowdown. -- Gitee From 2d4ae79e506777065870dcabadb83ac6ddd91cd4 Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 23 Aug 2022 16:40:22 +0800 Subject: [PATCH 2006/2161] config: update configs to support Sapphire Rapids commit opencloudos. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/arm64/configs/oc.config | 1 + arch/x86/configs/oc.config | 16 ++++++++++++++++ package/default/config.default_kasan | 3 ++- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/arm64/configs/oc.config b/arch/arm64/configs/oc.config index 9157c78e3899..c91e389cd8fe 100644 --- a/arch/arm64/configs/oc.config +++ b/arch/arm64/configs/oc.config @@ -407,6 +407,7 @@ CONFIG_IP_DCCP=m CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_RDMA=m +CONFIG_DMATEST=m CONFIG_RDS_TCP=m CONFIG_TIPC=m CONFIG_TIPC_MEDIA_IB=y diff --git a/arch/x86/configs/oc.config b/arch/x86/configs/oc.config index 568e482a4a5b..fb2a1b0b87d6 100644 --- a/arch/x86/configs/oc.config +++ b/arch/x86/configs/oc.config @@ -22,6 +22,7 @@ CONFIG_BLK_CGROUP=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_IOASIDS=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_HUGETLB=y CONFIG_CPUSETS=y @@ -106,6 +107,9 @@ CONFIG_PCIE_EDR=y CONFIG_ACPI_APEI_MEMORY_FAILURE=y CONFIG_ACPI_APEI_EINJ=m CONFIG_ACPI_APEI_ERST_DEBUG=m +CONFIG_ACPI_PFRU=y +CONFIG_ACPI_PFRU_TELEMETRY=y +CONFIG_ACPI_PRMT=y CONFIG_ACPI_EXTLOG=m CONFIG_CPU_FREQ_GOV_POWERSAVE=y CONFIG_CPU_FREQ_GOV_USERSPACE=y @@ -1541,3 +1545,15 @@ CONFIG_EARLY_PRINTK_DBGP=y CONFIG_X86_DECODER_SELFTEST=y CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_UNWINDER_FRAME_POINTER=y +# Intel SPR +CONFIG_MFD_INTEL_PMT=y +CONFIG_INTEL_IDXD_BUS=y +CONFIG_INTEL_IDXD=y +CONFIG_INTEL_IDXD_COMPAT=y +CONFIG_INTEL_IDXD_SVM=y +CONFIG_INTEL_IDXD_PERFMON=y +CONFIG_VFIO_MDEV_IDXD=m +CONFIG_IOMMU_DEFAULT_PASSTHROUGH=y +CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON=y +CONFIG_CRYPTO_DEV_IAX_CRYPTO=y +CONFIG_CRYPTO_DEV_IAX_CRYPTO_STATS=y diff --git a/package/default/config.default_kasan b/package/default/config.default_kasan index 3bf8deb99f58..dcfabd5d2b73 100644 --- a/package/default/config.default_kasan +++ b/package/default/config.default_kasan @@ -3795,6 +3795,7 @@ CONFIG_HSU_DMA=y # # CONFIG_ASYNC_TX_DMA is not set # CONFIG_DMATEST is not set +CONFIG_DMATEST=m CONFIG_DMA_ENGINE_RAID=y # @@ -4603,7 +4604,7 @@ CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m # # Compression # -CONFIG_CRYPTO_DEFLATE=m +CONFIG_CRYPTO_DEFLATE=y CONFIG_CRYPTO_LZO=y # CONFIG_CRYPTO_842 is not set # CONFIG_CRYPTO_LZ4 is not set -- Gitee From 19ccb25285eaf38be74e26baaf46132d49667afb Mon Sep 17 00:00:00 2001 From: Zhuo Chen Date: Tue, 23 Aug 2022 20:14:14 +0800 Subject: [PATCH 2007/2161] config: enable CONFIG_JUMP_LABEL commit opencloudos. Fill the gaps left by history. 
Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/configs/oc.config | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/configs/oc.config b/arch/x86/configs/oc.config index fb2a1b0b87d6..7726e996f62f 100644 --- a/arch/x86/configs/oc.config +++ b/arch/x86/configs/oc.config @@ -140,6 +140,7 @@ CONFIG_VHOST_VSOCK=m CONFIG_OPROFILE=m CONFIG_OPROFILE_EVENT_MULTIPLEX=y CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODVERSIONS=y -- Gitee From 7f174bf5d8529732780ca0aa75b50b001424cb6f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 May 2021 12:01:19 -0700 Subject: [PATCH 2008/2161] clocksource: Retry clock read if long delays detected commit db3a34e17433de2390eb80d436970edcebd0ca3e upstream. When the clocksource watchdog marks a clock as unstable, this might be due to that clock being unstable or it might be due to delays that happen to occur between the reads of the two clocks. Yes, interrupts are disabled across those two reads, but there are no shortage of things that can delay interrupts-disabled regions of code ranging from SMI handlers to vCPU preemption. It would be good to have some indication as to why the clock was marked unstable. Therefore, re-read the watchdog clock on either side of the read from the clock under test. If the watchdog clock shows an excessive time delta between its pair of reads, the reads are retried. The maximum number of retries is specified by a new kernel boot parameter clocksource.max_cswd_read_retries, which defaults to three, that is, up to four reads, one initial and up to three retries. If more than one retry was required, a message is printed on the console (the occasional single retry is expected behavior, especially in guest OSes). If the maximum number of retries is exceeded, the clock under test will be marked unstable. However, the probability of this happening due to various sorts of delays is quite small. In addition, the reason (clock-read delays) for the unstable marking will be apparent. Reported-by: Chris Mason Signed-off-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Acked-by: Feng Tang Link: https://lore.kernel.org/r/20210527190124.440372-1-paulmck@kernel.org Signed-off-by: Xinghui Li --- .../admin-guide/kernel-parameters.txt | 6 +++ kernel/time/clocksource.c | 53 ++++++++++++++++--- 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index dc9b0d3c898c..1c204ab35fd3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -564,6 +564,12 @@ loops can be debugged more effectively on production systems. + clocksource.max_cswd_read_retries= [KNL] + Number of clocksource_watchdog() retries due to + external delays before the clock will be marked + unstable. Defaults to three retries, that is, + four attempts to read the clock under test. + clearcpuid=BITNUM[,BITNUM...] [X86] Disable CPUID feature X for the kernel. 
See arch/x86/include/asm/cpufeatures.h for the valid bit diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 428beb69426a..6863a054c970 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -124,6 +124,13 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating); #define WATCHDOG_INTERVAL (HZ >> 1) #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) +/* + * Maximum permissible delay between two readouts of the watchdog + * clocksource surrounding a read of the clocksource being validated. + * This delay could be due to SMIs, NMIs, or to VCPU preemptions. + */ +#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC) + static void clocksource_watchdog_work(struct work_struct *work) { /* @@ -184,12 +191,45 @@ void clocksource_mark_unstable(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } +static ulong max_cswd_read_retries = 3; +module_param(max_cswd_read_retries, ulong, 0644); + +static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) +{ + unsigned int nretries; + u64 wd_end, wd_delta; + int64_t wd_delay; + + for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) { + local_irq_disable(); + *wdnow = watchdog->read(watchdog); + *csnow = cs->read(cs); + wd_end = watchdog->read(watchdog); + local_irq_enable(); + + wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask); + wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, + watchdog->shift); + if (wd_delay <= WATCHDOG_MAX_SKEW) { + if (nretries > 1 || nretries >= max_cswd_read_retries) { + pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", + smp_processor_id(), watchdog->name, nretries); + } + return true; + } + } + + pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n", + smp_processor_id(), watchdog->name, wd_delay, nretries); + return false; +} + static void clocksource_watchdog(struct timer_list *unused) { - struct clocksource *cs; u64 csnow, wdnow, cslast, wdlast, delta; - int64_t wd_nsec, cs_nsec; int next_cpu, reset_pending; + int64_t wd_nsec, cs_nsec; + struct clocksource *cs; spin_lock(&watchdog_lock); if (!watchdog_running) @@ -206,10 +246,11 @@ static void clocksource_watchdog(struct timer_list *unused) continue; } - local_irq_disable(); - csnow = cs->read(cs); - wdnow = watchdog->read(watchdog); - local_irq_enable(); + if (!cs_watchdog_read(cs, &csnow, &wdnow)) { + /* Clock readout unreliable, so give it up. */ + __clocksource_unstable(cs); + continue; + } /* Clocksource initialized ? */ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || -- Gitee From 68af6e1e348181255f17cda5708d7f79a092a2e2 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 18 Nov 2021 14:14:36 -0500 Subject: [PATCH 2009/2161] clocksource: Avoid accidental unstable marking of clocksources commit c86ff8c55b8ae68837b2fa59dc0c203907e9a15f upstream. Since commit db3a34e17433 ("clocksource: Retry clock read if long delays detected") and commit 2e27e793e280 ("clocksource: Reduce clocksource-skew threshold"), it is found that tsc clocksource fallback to hpet can sometimes happen on both Intel and AMD systems especially when they are running stressful benchmarking workloads. Of the 23 systems tested with a v5.14 kernel, 10 of them have switched to hpet clock source during the test run. The result of falling back to hpet is a drastic reduction of performance when running benchmarks. For example, the fio performance tests can drop up to 70% whereas the iperf3 performance can drop up to 80%. 
4 hpet fallbacks happened during bootup. They were: [ 8.749399] clocksource: timekeeping watchdog on CPU13: hpet read-back delay of 263750ns, attempt 4, marking unstable [ 12.044610] clocksource: timekeeping watchdog on CPU19: hpet read-back delay of 186166ns, attempt 4, marking unstable [ 17.336941] clocksource: timekeeping watchdog on CPU28: hpet read-back delay of 182291ns, attempt 4, marking unstable [ 17.518565] clocksource: timekeeping watchdog on CPU34: hpet read-back delay of 252196ns, attempt 4, marking unstable Other fallbacks happen when the systems were running stressful benchmarks. For example: [ 2685.867873] clocksource: timekeeping watchdog on CPU117: hpet read-back delay of 57269ns, attempt 4, marking unstable [46215.471228] clocksource: timekeeping watchdog on CPU8: hpet read-back delay of 61460ns, attempt 4, marking unstable Commit 2e27e793e280 ("clocksource: Reduce clocksource-skew threshold"), changed the skew margin from 100us to 50us. I think this is too small and can easily be exceeded when running some stressful workloads on a thermally stressed system. So it is switched back to 100us. Even a maximum skew margin of 100us may be too small in for some systems when booting up especially if those systems are under thermal stress. To eliminate the case that the large skew is due to the system being too busy slowing down the reading of both the watchdog and the clocksource, an extra consecutive read of watchdog clock is being done to check this. The consecutive watchdog read delay is compared against WATCHDOG_MAX_SKEW/2. If the delay exceeds the limit, we assume that the system is just too busy. A warning will be printed to the console and the clock skew check is skipped for this round. Fixes: db3a34e17433 ("clocksource: Retry clock read if long delays detected") Fixes: 2e27e793e280 ("clocksource: Reduce clocksource-skew threshold") Signed-off-by: Waiman Long Signed-off-by: Paul E. 
McKenney Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- kernel/time/clocksource.c | 48 ++++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6863a054c970..208802722e3d 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -194,17 +194,24 @@ void clocksource_mark_unstable(struct clocksource *cs) static ulong max_cswd_read_retries = 3; module_param(max_cswd_read_retries, ulong, 0644); -static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) +enum wd_read_status { + WD_READ_SUCCESS, + WD_READ_UNSTABLE, + WD_READ_SKIP +}; + +static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) { unsigned int nretries; - u64 wd_end, wd_delta; - int64_t wd_delay; + u64 wd_end, wd_end2, wd_delta; + int64_t wd_delay, wd_seq_delay; for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) { local_irq_disable(); *wdnow = watchdog->read(watchdog); *csnow = cs->read(cs); wd_end = watchdog->read(watchdog); + wd_end2 = watchdog->read(watchdog); local_irq_enable(); wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask); @@ -215,13 +222,34 @@ static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", smp_processor_id(), watchdog->name, nretries); } - return true; + return WD_READ_SUCCESS; } + + /* + * Now compute delay in consecutive watchdog read to see if + * there is too much external interferences that cause + * significant delay in reading both clocksource and watchdog. + * + * If consecutive WD read-back delay > WATCHDOG_MAX_SKEW/2, + * report system busy, reinit the watchdog and skip the current + * watchdog test. + */ + wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask); + wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift); + if (wd_seq_delay > WATCHDOG_MAX_SKEW/2) + goto skip_test; } pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n", smp_processor_id(), watchdog->name, wd_delay, nretries); - return false; + return WD_READ_UNSTABLE; + +skip_test: + pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n", + smp_processor_id(), watchdog->name, wd_seq_delay); + pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n", + cs->name, wd_delay); + return WD_READ_SKIP; } static void clocksource_watchdog(struct timer_list *unused) @@ -230,6 +258,7 @@ static void clocksource_watchdog(struct timer_list *unused) int next_cpu, reset_pending; int64_t wd_nsec, cs_nsec; struct clocksource *cs; + enum wd_read_status read_ret; spin_lock(&watchdog_lock); if (!watchdog_running) @@ -246,9 +275,12 @@ static void clocksource_watchdog(struct timer_list *unused) continue; } - if (!cs_watchdog_read(cs, &csnow, &wdnow)) { - /* Clock readout unreliable, so give it up. */ - __clocksource_unstable(cs); + read_ret = cs_watchdog_read(cs, &csnow, &wdnow); + + if (read_ret != WD_READ_SUCCESS) { + if (read_ret == WD_READ_UNSTABLE) + /* Clock readout unreliable, so give it up. */ + __clocksource_unstable(cs); continue; } -- Gitee From 254a2ece7a8e44981837074edfc729a7c78c8fae Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 2 Sep 2022 17:40:48 +0800 Subject: [PATCH 2010/2161] clocksource: Reduce the default clocksource_watchdog() retries to 2 commit 1a5620671a1b6fd9cc08761677d050f1702f910c upstream. 
With the previous patch, there is an extra watchdog read in each retry. Now the total number of clocksource reads is increased to 4 per iteration. In order to avoid increasing the clock skew check overhead, the default maximum number of retries is reduced from 3 to 2 to maintain the same 12 clocksource reads in the worst case. Suggested-by: Paul E. McKenney Signed-off-by: Waiman Long Signed-off-by: Paul E. McKenney Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/admin-guide/kernel-parameters.txt | 4 ++-- kernel/time/clocksource.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1c204ab35fd3..e909389d411f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -567,8 +567,8 @@ clocksource.max_cswd_read_retries= [KNL] Number of clocksource_watchdog() retries due to external delays before the clock will be marked - unstable. Defaults to three retries, that is, - four attempts to read the clock under test. + unstable. Defaults to two retries, that is, + three attempts to read the clock under test. clearcpuid=BITNUM[,BITNUM...] [X86] Disable CPUID feature X for the kernel. See diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 208802722e3d..dc9fbf42ed34 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -191,7 +191,7 @@ void clocksource_mark_unstable(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } -static ulong max_cswd_read_retries = 3; +static ulong max_cswd_read_retries = 2; module_param(max_cswd_read_retries, ulong, 0644); enum wd_read_status { -- Gitee From 3654cd8360a3cd7039f3c6a6e491fc5aca20eff6 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 27 Apr 2022 09:08:53 +0300 Subject: [PATCH 2011/2161] intel_idle: Fix SPR C6 optimization commit 7eac3bd38d18cd3317756649921b8264ddfee692 upstream. The Sapphire Rapids (SPR) C6 optimization was added to the end of the 'spr_idle_state_table_update()' function. However, the function has a 'return' which may happen before the optimization has a chance to run. And this may prevent the optimization from happening. This is an unlikely scenario, but possible if user boots with, say, the 'intel_idle.preferred_cstates=6' kernel boot option. This patch fixes the issue by eliminating the problematic 'return' statement. Fixes: 3a9cf77b60dc ("intel_idle: add core C6 optimization for SPR") Suggested-by: Jan Beulich Reported-by: Jan Beulich Signed-off-by: Artem Bityutskiy [ rjw: Minor changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index b43a785bfc04..1938ec019207 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1579,11 +1579,9 @@ static void __init spr_idle_state_table_update(void) unsigned long long msr; /* Check if user prefers C1E over C1. */ - if (preferred_states_mask & BIT(2)) { - if (preferred_states_mask & BIT(1)) - /* Both can't be enabled, stick to the defaults. */ - return; - + if ((preferred_states_mask & BIT(2)) && + !(preferred_states_mask & BIT(1))) { + /* Disable C1 and enable C1E. 
*/ spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE; spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE; -- Gitee From 9b24ef7277897e5f66d739080a0c6beda14598dc Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 27 Apr 2022 09:08:52 +0300 Subject: [PATCH 2012/2161] intel_idle: Fix the 'preferred_cstates' module parameter commit 39c184a6a9a7a99950b321d55fe713175cf1d404 upstream. Problem description. When user boots kernel up with the 'intel_idle.preferred_cstates=4' option, we enable C1E and disable C1 states on Sapphire Rapids Xeon (SPR). In order for C1E to work on SPR, we have to enable the C1E promotion bit on all CPUs. However, we enable it only on one CPU. Fix description. The 'intel_idle' driver already has the infrastructure for disabling C1E promotion on every CPU. This patch uses the same infrastructure for enabling C1E promotion on every CPU. It changes the boolean 'disable_promotion_to_c1e' variable to a tri-state 'c1e_promotion' variable. Tested on a 2-socket SPR system. I verified the following combinations: * C1E promotion enabled and disabled in BIOS. * Booted with and without the 'intel_idle.preferred_cstates=4' kernel argument. In all 4 cases C1E promotion was correctly set on all CPUs. Also tested on an old Broadwell system, just to make sure it does not cause a regression. C1E promotion was correctly disabled on that system, both C1 and C1E were exposed (as expected). Fixes: da0e58c038e6 ("intel_idle: add 'preferred_cstates' module argument") Reported-by: Jan Beulich Signed-off-by: Artem Bityutskiy [ rjw: Minor changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 1938ec019207..a7075d5bb16d 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -69,7 +69,12 @@ static unsigned int preferred_states_mask; static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; static unsigned long auto_demotion_disable_flags; -static bool disable_promotion_to_c1e; + +static enum { + C1E_PROMOTION_PRESERVE, + C1E_PROMOTION_ENABLE, + C1E_PROMOTION_DISABLE +} c1e_promotion = C1E_PROMOTION_PRESERVE; struct idle_cpu { struct cpuidle_state *state_table; @@ -1399,8 +1404,6 @@ static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { } static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; } #endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */ -static void c1e_promotion_enable(void); - /** * ivt_idle_state_table_update - Tune the idle states table for Ivy Town. * @@ -1586,8 +1589,7 @@ static void __init spr_idle_state_table_update(void) spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE; /* Enable C1E using the "C1E promotion" bit. 
*/ - c1e_promotion_enable(); - disable_promotion_to_c1e = false; + c1e_promotion = C1E_PROMOTION_ENABLE; } /* @@ -1753,7 +1755,9 @@ static int intel_idle_cpu_init(unsigned int cpu) if (auto_demotion_disable_flags) auto_demotion_disable(); - if (disable_promotion_to_c1e) + if (c1e_promotion == C1E_PROMOTION_ENABLE) + c1e_promotion_enable(); + else if (c1e_promotion == C1E_PROMOTION_DISABLE) c1e_promotion_disable(); return 0; @@ -1832,7 +1836,8 @@ static int __init intel_idle_init(void) if (icpu) { cpuidle_state_table = icpu->state_table; auto_demotion_disable_flags = icpu->auto_demotion_disable_flags; - disable_promotion_to_c1e = icpu->disable_promotion_to_c1e; + if (icpu->disable_promotion_to_c1e) + c1e_promotion = C1E_PROMOTION_DISABLE; if (icpu->use_acpi || force_use_acpi) intel_idle_acpi_cst_extract(); } else if (!intel_idle_acpi_cst_extract()) { -- Gitee From f375e440cc185a0f155c1dd06eca9241fc9cd669 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sat, 16 Jul 2022 09:26:55 +0300 Subject: [PATCH 2013/2161] intel_idle: make SPR C1 and C1E be independent commit 1548fac47a114b42063def551eb152a536ed9697 upstream. This patch partially reverts the changes made by the following commit: da0e58c038e6 intel_idle: add 'preferred_cstates' module argument As that commit describes, on early Sapphire Rapids Xeon platforms the C1 and C1E states were mutually exclusive, so that users could only have either C1 and C6, or C1E and C6. However, Intel firmware engineers managed to remove this limitation and make C1 and C1E to be completely independent, just like on previous Xeon platforms. Therefore, this patch: * Removes commentary describing the old, and now non-existing SPR C1E limitation. * Marks SPR C1E as available by default. * Removes the 'preferred_cstates' parameter handling for SPR. Both C1 and C1E will be available regardless of 'preferred_cstates' value. We expect that all SPR systems are shipping with new firmware, which includes the C1/C1E improvement. Cc: v5.18+ # v5.18+ Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index a7075d5bb16d..7bbeb88518c5 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -762,16 +762,6 @@ static struct cpuidle_state icx_cstates[] __initdata = { .enter = NULL } }; -/* - * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice - * versa. On SPR C1E is enabled only if "C1E promotion" bit is set in - * MSR_IA32_POWER_CTL. But in this case there effectively no C1, because C1 - * requests are promoted to C1E. If the "C1E promotion" bit is cleared, then - * both C1 and C1E requests end up with C1, so there is effectively no C1E. - * - * By default we enable C1 and disable C1E by marking it with - * 'CPUIDLE_FLAG_UNUSABLE'. - */ static struct cpuidle_state spr_cstates[] __initdata = { { .name = "C1", @@ -784,8 +774,7 @@ static struct cpuidle_state spr_cstates[] __initdata = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE | - CPUIDLE_FLAG_UNUSABLE, + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 2, .target_residency = 4, .enter = &intel_idle, @@ -1581,17 +1570,6 @@ static void __init spr_idle_state_table_update(void) { unsigned long long msr; - /* Check if user prefers C1E over C1. 
*/ - if ((preferred_states_mask & BIT(2)) && - !(preferred_states_mask & BIT(1))) { - /* Disable C1 and enable C1E. */ - spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE; - spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE; - - /* Enable C1E using the "C1E promotion" bit. */ - c1e_promotion = C1E_PROMOTION_ENABLE; - } - /* * By default, the C6 state assumes the worst-case scenario of package * C6. However, if PC6 is disabled, we update the numbers to match -- Gitee From 66c982ce0fe815d1d8cf8981f8e7680c198b3327 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Fri, 23 Sep 2022 17:18:26 +0800 Subject: [PATCH 2014/2161] intel_idle: Add a new flag to initialize the AMX state commit 9f01129382774d98ec21526f13da26a0630ee3d8 upstream. The non-initialized AMX state can be the cause of C-state demotion from C6 to C1E. This low-power idle state may improve power savings and thus result in a higher available turbo frequency budget. This behavior is implementation-specific. Initialize the state for the C6 entrance of Sapphire Rapids as needed. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Dave Hansen Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Acked-by: Rafael J. Wysocki Tested-by: Zhang Rui Link: https://lkml.kernel.org/r/20220614164116.5196-1-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 7bbeb88518c5..d413ec9d0dfa 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -54,6 +54,7 @@ #include #include #include +#include #define INTEL_IDLE_VERSION "0.5.1" @@ -105,6 +106,11 @@ static unsigned int mwait_substates __initdata; */ #define CPUIDLE_FLAG_ALWAYS_ENABLE BIT(15) +/* + * Initialize large xstate for the C6-state entrance. + */ +#define CPUIDLE_FLAG_INIT_XSTATE BIT(17) + /* * MWAIT takes an 8-bit "hint" in EAX "suggesting" * the C-state (top nibble) and sub-state (bottom nibble) @@ -144,6 +150,13 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, return index; } +static __cpuidle int intel_idle_xstate(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + fpu_idle_fpregs(); + return __intel_idle(dev, drv, index); +} + /** * intel_idle_s2idle - Ask the processor to enter the given idle state. * @dev: cpuidle device of the target CPU. @@ -159,8 +172,12 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, static __cpuidle void intel_idle_s2idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - unsigned long eax = flg2MWAIT(drv->states[index].flags); unsigned long ecx = 1; /* break on interrupt flag */ + struct cpuidle_state *state = &drv->states[index]; + unsigned long eax = flg2MWAIT(state->flags); + + if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) + fpu_idle_fpregs(); mwait_idle_with_hints(eax, ecx); } @@ -782,7 +799,8 @@ static struct cpuidle_state spr_cstates[] __initdata = { { .name = "C6", .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | + CPUIDLE_FLAG_INIT_XSTATE, .exit_latency = 290, .target_residency = 800, .enter = &intel_idle, @@ -1647,6 +1665,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) /* Structure copy. 
*/ drv->states[drv->state_count] = cpuidle_state_table[cstate]; + if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_INIT_XSTATE) + drv->states[drv->state_count].enter = intel_idle_xstate; + if ((disabled_states_mask & BIT(drv->state_count)) || ((icpu->use_acpi || force_use_acpi) && intel_idle_off_by_default(mwait_hint) && -- Gitee From 6ec2e5266e26bda1a37fd525397832ca1577432c Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 3 Dec 2020 17:57:44 -0800 Subject: [PATCH 2015/2161] platform/x86: ISST: Check for unaligned mmio address commit a552f204b050b213b1e41a5134a0d2726c9a2ec1 upstream. The address should be aligned to 4 byte boundary. So send an error for unaligned address. Signed-off-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20201204015746.1168941-1-srinivas.pandruvada@linux.intel.com Signed-off-by: Hans de Goede Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c b/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c index aa17fd7817f8..e7e9808a1aed 100644 --- a/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c @@ -42,6 +42,9 @@ static long isst_if_mmio_rd_wr(u8 *cmd_ptr, int *write_only, int resume) if (io_reg->reg < 0x04 || io_reg->reg > 0xD0) return -EINVAL; + if (io_reg->reg % 4) + return -EINVAL; + if (io_reg->read_write && !capable(CAP_SYS_ADMIN)) return -EPERM; -- Gitee From c1a0f7671e7eec026b789ee4c832eae924f6dca2 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 3 Dec 2020 17:57:45 -0800 Subject: [PATCH 2016/2161] platform/x86: ISST: Allow configurable offset range commit 761f0ee0e84b4c18535c6d17890ccc9f5c617e8d upstream. The mmio offset range can be different based on the PCI device id. Here for INTEL_RAPL_PRIO_DEVID_1, the range is increased from 45 to 64. Pass the range as the driver_data. Also account for different ranges during save/restore via suspend/resume callbacks. 
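For illustration, a minimal standalone model of the scheme described above: the per-device register range rides along as driver_data and the access path checks 4-byte alignment plus the device-specific bound. The range values mirror the patch; the lookup table, struct names and main() here are hypothetical, not the driver's code. Keeping the range next to the PCI ID avoids a second device-ID keyed lookup in the ioctl fast path.

/*
 * Standalone model (userspace, illustrative only): the register range
 * travels with the device ID, and the access path checks alignment plus
 * the device-specific bounds. Range values mirror the patch; the lookup
 * table and main() are hypothetical.
 */
#include <stdio.h>

struct isst_mmio_range { int beg; int end; };

static struct isst_mmio_range mmio_range_devid_0[] = { {0x04, 0x14}, {0x20, 0x0D0} };
static struct isst_mmio_range mmio_range_devid_1[] = { {0x04, 0x14}, {0x20, 0x11C} };

struct punit_dev_model {
	unsigned int devid;
	struct isst_mmio_range *mmio_range;	/* what driver_data carries */
};

static const struct punit_dev_model devs[] = {
	{ 0x3451, mmio_range_devid_0 },
	{ 0x3251, mmio_range_devid_1 },
};

static int reg_offset_valid(const struct punit_dev_model *d, int reg)
{
	if (reg % 4)				/* must be 4-byte aligned */
		return 0;
	return reg >= d->mmio_range[0].beg && reg <= d->mmio_range[1].end;
}

int main(void)
{
	/* 0xD4 is past the 0xD0 limit of the first device but inside the
	 * extended 0x11C range of the second one. */
	printf("0xD4 on 0x3451: %d\n", reg_offset_valid(&devs[0], 0xD4));
	printf("0xD4 on 0x3251: %d\n", reg_offset_valid(&devs[1], 0xD4));
	return 0;
}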
Signed-off-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20201204015746.1168941-2-srinivas.pandruvada@linux.intel.com Signed-off-by: Hans de Goede Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../x86/intel_speed_select_if/isst_if_mmio.c | 50 +++++++++++++------ 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c b/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c index e7e9808a1aed..c4bf8dea32ca 100644 --- a/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c @@ -20,15 +20,21 @@ struct isst_mmio_range { int end; }; -struct isst_mmio_range mmio_range[] = { +struct isst_mmio_range mmio_range_devid_0[] = { {0x04, 0x14}, {0x20, 0xD0}, }; +struct isst_mmio_range mmio_range_devid_1[] = { + {0x04, 0x14}, + {0x20, 0x11C}, +}; + struct isst_if_device { void __iomem *punit_mmio; u32 range_0[5]; - u32 range_1[45]; + u32 range_1[64]; + struct isst_mmio_range *mmio_range; struct mutex mutex; }; @@ -39,8 +45,6 @@ static long isst_if_mmio_rd_wr(u8 *cmd_ptr, int *write_only, int resume) struct pci_dev *pdev; io_reg = (struct isst_if_io_reg *)cmd_ptr; - if (io_reg->reg < 0x04 || io_reg->reg > 0xD0) - return -EINVAL; if (io_reg->reg % 4) return -EINVAL; @@ -56,6 +60,10 @@ static long isst_if_mmio_rd_wr(u8 *cmd_ptr, int *write_only, int resume) if (!punit_dev) return -EINVAL; + if (io_reg->reg < punit_dev->mmio_range[0].beg || + io_reg->reg > punit_dev->mmio_range[1].end) + return -EINVAL; + /* * Ensure that operation is complete on a PCI device to avoid read * write race by using per PCI device mutex. @@ -74,8 +82,10 @@ static long isst_if_mmio_rd_wr(u8 *cmd_ptr, int *write_only, int resume) } static const struct pci_device_id isst_if_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, INTEL_RAPL_PRIO_DEVID_0)}, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, INTEL_RAPL_PRIO_DEVID_1)}, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, INTEL_RAPL_PRIO_DEVID_0), + .driver_data = (kernel_ulong_t)&mmio_range_devid_0}, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, INTEL_RAPL_PRIO_DEVID_1), + .driver_data = (kernel_ulong_t)&mmio_range_devid_1}, { 0 }, }; MODULE_DEVICE_TABLE(pci, isst_if_ids); @@ -112,6 +122,7 @@ static int isst_if_probe(struct pci_dev *pdev, const struct pci_device_id *ent) mutex_init(&punit_dev->mutex); pci_set_drvdata(pdev, punit_dev); + punit_dev->mmio_range = (struct isst_mmio_range *) ent->driver_data; memset(&cb, 0, sizeof(cb)); cb.cmd_size = sizeof(struct isst_if_io_reg); @@ -141,10 +152,15 @@ static int __maybe_unused isst_if_suspend(struct device *device) for (i = 0; i < ARRAY_SIZE(punit_dev->range_0); ++i) punit_dev->range_0[i] = readl(punit_dev->punit_mmio + - mmio_range[0].beg + 4 * i); - for (i = 0; i < ARRAY_SIZE(punit_dev->range_1); ++i) - punit_dev->range_1[i] = readl(punit_dev->punit_mmio + - mmio_range[1].beg + 4 * i); + punit_dev->mmio_range[0].beg + 4 * i); + for (i = 0; i < ARRAY_SIZE(punit_dev->range_1); ++i) { + u32 addr; + + addr = punit_dev->mmio_range[1].beg + 4 * i; + if (addr > punit_dev->mmio_range[1].end) + break; + punit_dev->range_1[i] = readl(punit_dev->punit_mmio + addr); + } return 0; } @@ -156,10 +172,16 @@ static int __maybe_unused isst_if_resume(struct device *device) for (i = 0; i < ARRAY_SIZE(punit_dev->range_0); ++i) writel(punit_dev->range_0[i], punit_dev->punit_mmio + - mmio_range[0].beg + 4 * i); - for (i = 0; i < ARRAY_SIZE(punit_dev->range_1); ++i) - writel(punit_dev->range_1[i], punit_dev->punit_mmio + - mmio_range[1].beg + 4 * 
i); + punit_dev->mmio_range[0].beg + 4 * i); + for (i = 0; i < ARRAY_SIZE(punit_dev->range_1); ++i) { + u32 addr; + + addr = punit_dev->mmio_range[1].beg + 4 * i; + if (addr > punit_dev->mmio_range[1].end) + break; + + writel(punit_dev->range_1[i], punit_dev->punit_mmio + addr); + } return 0; } -- Gitee From cf5c36c280a71d3024c691bd0c5e09be4199eb53 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 3 Dec 2020 17:57:46 -0800 Subject: [PATCH 2017/2161] platform/x86: ISST: Change PCI device macros commit 7c88ab5715a265d5dde06e4e1b0dd4370d911372 upstream. Use PCI_VDEVICE and PCI_DEVICE_DATA macros. No functional changes are expected. Signed-off-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20201204015746.1168941-3-srinivas.pandruvada@linux.intel.com Signed-off-by: Hans de Goede Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../platform/x86/intel_speed_select_if/isst_if_common.h | 8 ++++---- .../platform/x86/intel_speed_select_if/isst_if_mbox_pci.c | 4 ++-- drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c | 6 ++---- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.h b/drivers/platform/x86/intel_speed_select_if/isst_if_common.h index 4f6f7f0761fc..fdecdae248d7 100644 --- a/drivers/platform/x86/intel_speed_select_if/isst_if_common.h +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.h @@ -10,11 +10,11 @@ #ifndef __ISST_IF_COMMON_H #define __ISST_IF_COMMON_H -#define INTEL_RAPL_PRIO_DEVID_0 0x3451 -#define INTEL_CFG_MBOX_DEVID_0 0x3459 +#define PCI_DEVICE_ID_INTEL_RAPL_PRIO_DEVID_0 0x3451 +#define PCI_DEVICE_ID_INTEL_CFG_MBOX_DEVID_0 0x3459 -#define INTEL_RAPL_PRIO_DEVID_1 0x3251 -#define INTEL_CFG_MBOX_DEVID_1 0x3259 +#define PCI_DEVICE_ID_INTEL_RAPL_PRIO_DEVID_1 0x3251 +#define PCI_DEVICE_ID_INTEL_CFG_MBOX_DEVID_1 0x3259 /* * Validate maximum commands in a single request. 
diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_mbox_pci.c b/drivers/platform/x86/intel_speed_select_if/isst_if_mbox_pci.c index 95f01e7a87d5..a2a2d923e60c 100644 --- a/drivers/platform/x86/intel_speed_select_if/isst_if_mbox_pci.c +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_mbox_pci.c @@ -146,8 +146,8 @@ static long isst_if_mbox_proc_cmd(u8 *cmd_ptr, int *write_only, int resume) } static const struct pci_device_id isst_if_mbox_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, INTEL_CFG_MBOX_DEVID_0)}, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, INTEL_CFG_MBOX_DEVID_1)}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_CFG_MBOX_DEVID_0)}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_CFG_MBOX_DEVID_1)}, { 0 }, }; MODULE_DEVICE_TABLE(pci, isst_if_mbox_ids); diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c b/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c index c4bf8dea32ca..2906cfee5d9c 100644 --- a/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c @@ -82,10 +82,8 @@ static long isst_if_mmio_rd_wr(u8 *cmd_ptr, int *write_only, int resume) } static const struct pci_device_id isst_if_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, INTEL_RAPL_PRIO_DEVID_0), - .driver_data = (kernel_ulong_t)&mmio_range_devid_0}, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, INTEL_RAPL_PRIO_DEVID_1), - .driver_data = (kernel_ulong_t)&mmio_range_devid_1}, + { PCI_DEVICE_DATA(INTEL, RAPL_PRIO_DEVID_0, &mmio_range_devid_0)}, + { PCI_DEVICE_DATA(INTEL, RAPL_PRIO_DEVID_1, &mmio_range_devid_1)}, { 0 }, }; MODULE_DEVICE_TABLE(pci, isst_if_ids); -- Gitee From 629ca94e61b345692e277d23ce1c316663bdf100 Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Tue, 8 Dec 2020 20:28:09 +0800 Subject: [PATCH 2018/2161] platform/x86: ISST: Mark mmio_range_devid_0 and mmio_range_devid_1 with static keyword commit 0cd3f561efa9adce840140720e0581355db3e554 upstream. Fix the following sparse warnings: drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c:23:24: warning: symbol 'mmio_range_devid_0' was not declared. Should it be static? drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c:28:24: warning: symbol 'mmio_range_devid_1' was not declared. Should it be static? Signed-off-by: Zou Wei Link: https://lore.kernel.org/r/1607430489-116200-1-git-send-email-zou_wei@huawei.com Signed-off-by: Hans de Goede Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c b/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c index 2906cfee5d9c..ff49025ec085 100644 --- a/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c @@ -20,12 +20,12 @@ struct isst_mmio_range { int end; }; -struct isst_mmio_range mmio_range_devid_0[] = { +static struct isst_mmio_range mmio_range_devid_0[] = { {0x04, 0x14}, {0x20, 0xD0}, }; -struct isst_mmio_range mmio_range_devid_1[] = { +static struct isst_mmio_range mmio_range_devid_1[] = { {0x04, 0x14}, {0x20, 0x11C}, }; -- Gitee From bffe6c02c8114bf7543b02d31bfacd5c2bd1714a Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 16 Jun 2021 15:13:28 -0700 Subject: [PATCH 2019/2161] platform/x86: ISST: Optimize CPU to PCI device mapping commit 1e42de8e53d32bbd7a732df49d872a30b4f888b4 upstream. 
It was observed that some of the high performance benchmarks are spending more time in kernel depending on which CPU package they are executing. The difference is significant and benchmark scores varies more than 10%. These benchmarks adjust class of service to improve thread performance which run in parallel. This class of service change causes access to MMIO region of Intel Speed Select PCI devices depending on the CPU package they are executing. This mapping from CPU to PCI device instance uses a standard Linux PCI interface "pci_get_domain_bus_and_slot()". This function does a linear search to get to a PCI device. Since these platforms have 100+ PCI devices, this search can be expensive in fast path for benchmarks. Since the device and function of PCI device is fixed for Intel Speed Select PCI devices, the CPU to PCI device information can be cached at the same time when bus number for the CPU is read. In this way during runtime the cached information can be used. This improves performance of these benchmarks significantly. Signed-off-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20210616221329.1909276-1-srinivas.pandruvada@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../intel_speed_select_if/isst_if_common.c | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c index 0c2aa22c7a12..bbd46b1d9c10 100644 --- a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c @@ -281,11 +281,27 @@ static int isst_if_get_platform_info(void __user *argp) struct isst_if_cpu_info { /* For BUS 0 and BUS 1 only, which we need for PUNIT interface */ int bus_info[2]; + struct pci_dev *pci_dev[2]; int punit_cpu_id; }; static struct isst_if_cpu_info *isst_cpu_info; +static struct pci_dev *_isst_if_get_pci_dev(int cpu, int bus_no, int dev, int fn) +{ + int bus_number; + + if (bus_no < 0 || bus_no > 1 || cpu < 0 || cpu >= nr_cpu_ids || + cpu >= num_possible_cpus()) + return NULL; + + bus_number = isst_cpu_info[cpu].bus_info[bus_no]; + if (bus_number < 0) + return NULL; + + return pci_get_domain_bus_and_slot(0, bus_number, PCI_DEVFN(dev, fn)); +} + /** * isst_if_get_pci_dev() - Get the PCI device instance for a CPU * @cpu: Logical CPU number. 
@@ -300,17 +316,18 @@ static struct isst_if_cpu_info *isst_cpu_info; */ struct pci_dev *isst_if_get_pci_dev(int cpu, int bus_no, int dev, int fn) { - int bus_number; + struct pci_dev *pci_dev; if (bus_no < 0 || bus_no > 1 || cpu < 0 || cpu >= nr_cpu_ids || cpu >= num_possible_cpus()) return NULL; - bus_number = isst_cpu_info[cpu].bus_info[bus_no]; - if (bus_number < 0) - return NULL; + pci_dev = isst_cpu_info[cpu].pci_dev[bus_no]; - return pci_get_domain_bus_and_slot(0, bus_number, PCI_DEVFN(dev, fn)); + if (pci_dev && pci_dev->devfn == PCI_DEVFN(dev, fn)) + return pci_dev; + + return _isst_if_get_pci_dev(cpu, bus_no, dev, fn); } EXPORT_SYMBOL_GPL(isst_if_get_pci_dev); @@ -327,6 +344,8 @@ static int isst_if_cpu_online(unsigned int cpu) } else { isst_cpu_info[cpu].bus_info[0] = data & 0xff; isst_cpu_info[cpu].bus_info[1] = (data >> 8) & 0xff; + isst_cpu_info[cpu].pci_dev[0] = _isst_if_get_pci_dev(cpu, 0, 0, 1); + isst_cpu_info[cpu].pci_dev[1] = _isst_if_get_pci_dev(cpu, 1, 30, 1); } ret = rdmsrl_safe(MSR_THREAD_ID_INFO, &data); -- Gitee From 9c8d90664a6323beffda6552b38cb823dac20a78 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 16 Jun 2021 15:13:29 -0700 Subject: [PATCH 2020/2161] platform/x86: ISST: Use numa node id for cpu pci dev mapping commit aa2ddd24257213bdfd2f65058531810ac57455dc upstream. There is a problem in mapping CPU to a PCI device instance when the bus numbers are reused in different packages. This was observed on some Sapphire Rapids systems. The current implementation reads bus number assigned to a CPU package via MSR 0x128. This allows to establish relationship between a CPU and a PCI device. This allows to update power related parameters to a MMIO offset in a PCI device space which is unique to a CPU. But if two packages uses same bus number then this mapping will not be unique. When bus number is reused, PCI device will use different domain number or segment number. So we need to be aware of this domain information while matching CPU to PCI bus number. This domain information is not available via any MSR. So need to use ACPI numa node information. There is an interface already available in the Linux to read numa node for a CPU and a PCI device. This change uses this interface to check the numa node of a match PCI device with bus number. If the bus number and numa node matches with the CPU's assigned bus number and numa node, the matched PCI device instance will be returned to the caller. It is possible that before Sapphire Rapids, the numa node is not defined for the Speed Select PCI device in some OEM systems. In this case to restore old behavior, return the last matched PCI device for domain 0 unlsess there are more than one matches. 
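As a standalone sketch of that matching policy (simplified types, all names illustrative, not the driver's): prefer the PCI device whose NUMA node matches the CPU's node, and fall back to a node-less match only when the bus number was unambiguous.

/*
 * Standalone sketch of the matching policy: among devices that share the
 * CPU's bus number, pick the one whose NUMA node matches the CPU's node;
 * if no device exposes node information, fall back to the match only
 * when it was unique.
 */
#include <stddef.h>

#define NODE_UNKNOWN	(-1)

struct pci_dev_model {
	int bus;
	int node;
};

static struct pci_dev_model *
pick_punit_dev(struct pci_dev_model *devs, int ndevs, int bus, int cpu_node)
{
	struct pci_dev_model *first_match = NULL, *picked = NULL;
	int matches = 0;
	int i;

	for (i = 0; i < ndevs; i++) {
		if (devs[i].bus != bus)
			continue;

		matches++;
		if (!first_match)
			first_match = &devs[i];

		if (devs[i].node != NODE_UNKNOWN && devs[i].node == cpu_node) {
			picked = &devs[i];	/* NUMA-qualified match wins */
			break;
		}
	}

	/* No node match: safe only if the bus number was unique overall. */
	if (!picked && matches == 1)
		picked = first_match;

	return picked;
}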
Signed-off-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20210616221329.1909276-2-srinivas.pandruvada@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- .../intel_speed_select_if/isst_if_common.c | 48 ++++++++++++++++++- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c index bbd46b1d9c10..6f0cc679c8e5 100644 --- a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c @@ -283,13 +283,18 @@ struct isst_if_cpu_info { int bus_info[2]; struct pci_dev *pci_dev[2]; int punit_cpu_id; + int numa_node; }; static struct isst_if_cpu_info *isst_cpu_info; +#define ISST_MAX_PCI_DOMAINS 8 static struct pci_dev *_isst_if_get_pci_dev(int cpu, int bus_no, int dev, int fn) { - int bus_number; + struct pci_dev *matched_pci_dev = NULL; + struct pci_dev *pci_dev = NULL; + int no_matches = 0; + int i, bus_number; if (bus_no < 0 || bus_no > 1 || cpu < 0 || cpu >= nr_cpu_ids || cpu >= num_possible_cpus()) @@ -299,7 +304,45 @@ static struct pci_dev *_isst_if_get_pci_dev(int cpu, int bus_no, int dev, int fn if (bus_number < 0) return NULL; - return pci_get_domain_bus_and_slot(0, bus_number, PCI_DEVFN(dev, fn)); + for (i = 0; i < ISST_MAX_PCI_DOMAINS; ++i) { + struct pci_dev *_pci_dev; + int node; + + _pci_dev = pci_get_domain_bus_and_slot(i, bus_number, PCI_DEVFN(dev, fn)); + if (!_pci_dev) + continue; + + ++no_matches; + if (!matched_pci_dev) + matched_pci_dev = _pci_dev; + + node = dev_to_node(&_pci_dev->dev); + if (node == NUMA_NO_NODE) { + pr_info("Fail to get numa node for CPU:%d bus:%d dev:%d fn:%d\n", + cpu, bus_no, dev, fn); + continue; + } + + if (node == isst_cpu_info[cpu].numa_node) { + pci_dev = _pci_dev; + break; + } + } + + /* + * If there is no numa matched pci_dev, then there can be following cases: + * 1. CONFIG_NUMA is not defined: In this case if there is only single device + * match, then we don't need numa information. Simply return last match. + * Othewise return NULL. + * 2. NUMA information is not exposed via _SEG method. In this case it is similar + * to case 1. + * 3. Numa information doesn't match with CPU numa node and more than one match + * return NULL. + */ + if (!pci_dev && no_matches == 1) + pci_dev = matched_pci_dev; + + return pci_dev; } /** @@ -354,6 +397,7 @@ static int isst_if_cpu_online(unsigned int cpu) return ret; } isst_cpu_info[cpu].punit_cpu_id = data; + isst_cpu_info[cpu].numa_node = cpu_to_node(cpu); isst_restore_msr_local(cpu); -- Gitee From 4acca21c6f34dba784b13eab3c9f35eddbbc5dde Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Aug 2021 10:23:57 +0300 Subject: [PATCH 2021/2161] platform/x86: ISST: use semi-colons instead of commas commit 55879dc4d095232609fe81498c1b43f042708eef upstream. The code works the same either way, but it's better to use semi-colons to separate statements. 
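A short aside on why the comma form compiled at all: the comma operator folds the assignments into a single expression statement evaluated left to right, so behaviour is unchanged; one statement per assignment simply reads better and is friendlier to static checkers. The struct below is illustrative, not the driver's.

/*
 * Why the comma form worked: the comma operator turns the three
 * assignments into one expression statement, evaluated left to right,
 * so the result is identical to the semicolon form.
 */
struct platform_info_model {
	int api_version;
	int driver_version;
	int max_cmds_per_ioctl;
};

static void fill_info(struct platform_info_model *info)
{
	info->api_version = 1,		/* legal C, but easy to misread */
	info->driver_version = 1,
	info->max_cmds_per_ioctl = 64;

	info->api_version = 1;		/* preferred: one statement each */
	info->driver_version = 1;
	info->max_cmds_per_ioctl = 64;
}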
Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20210825072357.GA12957@kili Signed-off-by: Hans de Goede Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/platform/x86/intel_speed_select_if/isst_if_common.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c index 6f0cc679c8e5..61ddf9f769af 100644 --- a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c @@ -265,9 +265,9 @@ static int isst_if_get_platform_info(void __user *argp) { struct isst_if_platform_info info; - info.api_version = ISST_IF_API_VERSION, - info.driver_version = ISST_IF_DRIVER_VERSION, - info.max_cmds_per_ioctl = ISST_IF_CMD_LIMIT, + info.api_version = ISST_IF_API_VERSION; + info.driver_version = ISST_IF_DRIVER_VERSION; + info.max_cmds_per_ioctl = ISST_IF_CMD_LIMIT; info.mbox_supported = punit_callbacks[ISST_IF_DEV_MBOX].registered; info.mmio_supported = punit_callbacks[ISST_IF_DEV_MMIO].registered; -- Gitee From b406ccb2344a821e077678035b898da33e9689a4 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 7 Dec 2021 21:17:34 +0800 Subject: [PATCH 2022/2161] powercap: intel_rapl: support new layout of Psys PowerLimit Register on SPR commit 931da6a0de5d620425af4425344259e6ff46b654 upstream. On Sapphire Rapids, the layout of the Psys domain Power Limit Register is different from from what it was before. Enhance the code to support the new Psys PL register layout. Signed-off-by: Zhang Rui Reported-and-tested-by: Alkattan Dana [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/powercap/intel_rapl_common.c | 61 +++++++++++++++++++++++++++- include/linux/intel_rapl.h | 6 +++ 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index a242c3480296..8566bf72f3fb 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -62,6 +62,20 @@ #define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff #define PP_POLICY_MASK 0x1F +/* + * SPR has different layout for Psys Domain PowerLimit registers. + * There are 17 bits of PL1 and PL2 instead of 15 bits. + * The Enable bits and TimeWindow bits are also shifted as a result. 
+ */ +#define PSYS_POWER_LIMIT1_MASK 0x1FFFF +#define PSYS_POWER_LIMIT1_ENABLE BIT(17) + +#define PSYS_POWER_LIMIT2_MASK (0x1FFFFULL<<32) +#define PSYS_POWER_LIMIT2_ENABLE BIT_ULL(49) + +#define PSYS_TIME_WINDOW1_MASK (0x7FULL<<19) +#define PSYS_TIME_WINDOW2_MASK (0x7FULL<<51) + /* Non HW constants */ #define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */ #define RAPL_PRIMITIVE_DUMMY BIT(2) @@ -97,6 +111,7 @@ struct rapl_defaults { bool to_raw); unsigned int dram_domain_energy_unit; unsigned int psys_domain_energy_unit; + bool spr_psys_bits; }; static struct rapl_defaults *rapl_defaults; @@ -628,12 +643,51 @@ static struct rapl_primitive_info rpi[] = { RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0), PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0, RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0), + PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT1, PSYS_POWER_LIMIT1_MASK, 0, + RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), + PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT2, PSYS_POWER_LIMIT2_MASK, 32, + RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), + PRIMITIVE_INFO_INIT(PSYS_PL1_ENABLE, PSYS_POWER_LIMIT1_ENABLE, 17, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + PRIMITIVE_INFO_INIT(PSYS_PL2_ENABLE, PSYS_POWER_LIMIT2_ENABLE, 49, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW1, PSYS_TIME_WINDOW1_MASK, 19, + RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), + PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW2, PSYS_TIME_WINDOW2_MASK, 51, + RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), /* non-hardware */ PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT, RAPL_PRIMITIVE_DERIVED), {NULL, 0, 0, 0}, }; +static enum rapl_primitives +prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim) +{ + if (!rapl_defaults->spr_psys_bits) + return prim; + + if (rd->id != RAPL_DOMAIN_PLATFORM) + return prim; + + switch (prim) { + case POWER_LIMIT1: + return PSYS_POWER_LIMIT1; + case POWER_LIMIT2: + return PSYS_POWER_LIMIT2; + case PL1_ENABLE: + return PSYS_PL1_ENABLE; + case PL2_ENABLE: + return PSYS_PL2_ENABLE; + case TIME_WINDOW1: + return PSYS_TIME_WINDOW1; + case TIME_WINDOW2: + return PSYS_TIME_WINDOW2; + default: + return prim; + } +} + /* Read primitive data based on its related struct rapl_primitive_info. * if xlate flag is set, return translated data based on data units, i.e. * time, energy, and power. 
@@ -651,7 +705,8 @@ static int rapl_read_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, bool xlate, u64 *data) { u64 value; - struct rapl_primitive_info *rp = &rpi[prim]; + enum rapl_primitives prim_fixed = prim_fixups(rd, prim); + struct rapl_primitive_info *rp = &rpi[prim_fixed]; struct reg_action ra; int cpu; @@ -697,7 +752,8 @@ static int rapl_write_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, unsigned long long value) { - struct rapl_primitive_info *rp = &rpi[prim]; + enum rapl_primitives prim_fixed = prim_fixups(rd, prim); + struct rapl_primitive_info *rp = &rpi[prim_fixed]; int cpu; u64 bits; struct reg_action ra; @@ -940,6 +996,7 @@ static const struct rapl_defaults rapl_defaults_spr_server = { .compute_time_window = rapl_compute_time_window_core, .dram_domain_energy_unit = 15300, .psys_domain_energy_unit = 1000000000, + .spr_psys_bits = true, }; static const struct rapl_defaults rapl_defaults_byt = { diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index efb3ce892c20..a12fca5b401d 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -55,6 +55,12 @@ enum rapl_primitives { THROTTLED_TIME, PRIORITY_LEVEL, + PSYS_POWER_LIMIT1, + PSYS_POWER_LIMIT2, + PSYS_PL1_ENABLE, + PSYS_PL2_ENABLE, + PSYS_TIME_WINDOW1, + PSYS_TIME_WINDOW2, /* below are not raw primitive data */ AVERAGE_POWER, NR_RAPL_PRIMITIVES, -- Gitee From 0ac9b6dd5d0e6fb4f27c223b786f6b9164ea2a3c Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Wed, 8 Jun 2022 09:47:47 -0700 Subject: [PATCH 2023/2161] x86/fpu: Add a helper to prepare AMX state for low-power CPU idle commit f17b168734c0fe47343a7502d012266a051f9942 upstream. When a CPU enters an idle state, a non-initialized AMX register state may be the cause of preventing a deeper low-power state. Other extended register states whether initialized or not do not impact the CPU idle state. The new helper can ensure the AMX state is initialized before the CPU is idle, and it will be used by the intel idle driver. Check the AMX_TILE feature bit before using XGETBV1 as a chain of dependencies was established via cpuid_deps[]: AMX->XFD->XGETBV1. Signed-off-by: Chang S. Bae Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20220608164748.11864-2-chang.seok.bae@intel.com Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/fpu/api.h | 2 ++ arch/x86/include/asm/special_insns.h | 9 +++++++++ arch/x86/kernel/fpu/core.c | 14 ++++++++++++++ 3 files changed, 25 insertions(+) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index ad26131af8bb..18c0d76b5565 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -159,4 +159,6 @@ static inline bool fpstate_is_confidential(struct fpu_guest *gfpu) struct task_struct; extern long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2); +extern void fpu_idle_fpregs(void); + #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 70578c8def0d..00c667f63126 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -278,6 +278,15 @@ static inline int enqcmds(void __iomem *dst, const void *src) return 0; } +static inline void tile_release(void) +{ + /* + * Instruction opcode for TILERELEASE; supported in binutils + * version >= 2.36. 
+ */ + asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0"); +} + #endif /* __KERNEL__ */ #endif /* _ASM_X86_SPECIAL_INSNS_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1d9a52ae2b83..ff41bd6a458c 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -844,3 +844,17 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) */ return 0; } + +/* + * Initialize register state that may prevent from entering low-power idle. + * This function will be invoked from the cpuidle driver only when needed. + */ +void fpu_idle_fpregs(void) +{ + /* Note: AMX_TILE being enabled implies XGETBV1 support */ + if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) && + (xfeatures_in_use() & XFEATURE_MASK_XTILE)) { + tile_release(); + fpregs_deactivate(¤t->thread.fpu); + } +} -- Gitee From a1d1900807b2a77f9d47e4d951aadd95086a3314 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Jun 2022 16:27:27 +0200 Subject: [PATCH 2024/2161] cpuidle,intel_idle: Fix CPUIDLE_FLAG_IRQ_ENABLE commit 32d4fd5751eadbe1823a37eb38df85ec5c8e6207 upstream. Commit c227233ad64c ("intel_idle: enable interrupts before C1 on Xeons") wrecked intel_idle in two ways: - must not have tracing in idle functions - must return with IRQs disabled Additionally, it added a branch for no good reason. Fixes: c227233ad64c ("intel_idle: enable interrupts before C1 on Xeons") Signed-off-by: Peter Zijlstra (Intel) [ rjw: Moved the intel_idle() kerneldoc comment next to the function ] Cc: 5.16+ # 5.16+ Signed-off-by: Rafael J. Wysocki Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/idle/intel_idle.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index d413ec9d0dfa..0cc535b63b94 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -121,6 +121,18 @@ static unsigned int mwait_substates __initdata; #define flg2MWAIT(flags) (((flags) >> 24) & 0xFF) #define MWAIT2flg(eax) ((eax & 0xFF) << 24) +static __always_inline int __intel_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + struct cpuidle_state *state = &drv->states[index]; + unsigned long eax = flg2MWAIT(state->flags); + unsigned long ecx = 1; /* break on interrupt flag */ + + mwait_idle_with_hints(eax, ecx); + + return index; +} + /** * intel_idle - Ask the processor to enter the given idle state. * @dev: cpuidle device of the target CPU. @@ -138,16 +150,19 @@ static unsigned int mwait_substates __initdata; static __cpuidle int intel_idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - struct cpuidle_state *state = &drv->states[index]; - unsigned long eax = flg2MWAIT(state->flags); - unsigned long ecx = 1; /* break on interrupt flag */ + return __intel_idle(dev, drv, index); +} - if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) - local_irq_enable(); +static __cpuidle int intel_idle_irq(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + int ret; - mwait_idle_with_hints(eax, ecx); + raw_local_irq_enable(); + ret = __intel_idle(dev, drv, index); + raw_local_irq_disable(); - return index; + return ret; } static __cpuidle int intel_idle_xstate(struct cpuidle_device *dev, @@ -1665,6 +1680,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) /* Structure copy. 
*/ drv->states[drv->state_count] = cpuidle_state_table[cstate]; + if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IRQ_ENABLE) + drv->states[drv->state_count].enter = intel_idle_irq; + if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_INIT_XSTATE) drv->states[drv->state_count].enter = intel_idle_xstate; -- Gitee From cc4bf9d791967761841b0461d86f316d65df3754 Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Tue, 6 Sep 2022 10:59:22 +0800 Subject: [PATCH 2025/2161] PCI: vmd: Use Shadow MEMBAR registers for QEMU/KVM guests commit 51f939b11cb1c47ed2a8d56b23f25483b7363f8e upstream. Backport Reason: fix spr fio softlock issue causing by vmd driver. Conflict: No VMD device 28C0 natively assists guest passthrough of the VMD endpoint through the use of shadow registers that provide Host Physical Addresses to correctly assign bridge windows. These shadow registers are only available if VMD config space register 0x70, bit 1 is set. In order to support this mode in existing VMD devices which don't natively support the shadow register, it was decided that the hypervisor could offer the shadow registers in a vendor-specific PCI capability. QEMU has been modified to create this vendor-specific capability and supply the shadow membar registers for VMDs which don't natively support this feature. This patch adds this mode and updates the supported device list to allow this feature to be used on these VMDs. Link: https://lore.kernel.org/r/20200528030240.16024-4-jonathan.derrick@intel.com Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Signed-off-by: Xinghui Li --- drivers/pci/controller/vmd.c | 44 +++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index fc2fcf63948e..7bfb987ed2ed 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -39,13 +39,19 @@ enum vmd_features { * membars, in order to allow proper address translation during * resource assignment to enable guest virtualization */ - VMD_FEAT_HAS_MEMBAR_SHADOW = (1 << 0), + VMD_FEAT_HAS_MEMBAR_SHADOW = (1 << 0), /* * Device may provide root port configuration information which limits * bus numbering */ - VMD_FEAT_HAS_BUS_RESTRICTIONS = (1 << 1), + VMD_FEAT_HAS_BUS_RESTRICTIONS = (1 << 1), + + /* + * Device contains physical location shadow registers in + * vendor-specific capability space + */ + VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP = (1 << 2), }; /* @@ -453,6 +459,28 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) } } + if (features & VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP) { + int pos = pci_find_capability(vmd->dev, PCI_CAP_ID_VNDR); + u32 reg, regu; + + pci_read_config_dword(vmd->dev, pos + 4, ®); + + /* "SHDW" */ + if (pos && reg == 0x53484457) { + pci_read_config_dword(vmd->dev, pos + 8, ®); + pci_read_config_dword(vmd->dev, pos + 12, ®u); + offset[0] = vmd->dev->resource[VMD_MEMBAR1].start - + (((u64) regu << 32 | reg) & + PCI_BASE_ADDRESS_MEM_MASK); + + pci_read_config_dword(vmd->dev, pos + 16, ®); + pci_read_config_dword(vmd->dev, pos + 20, ®u); + offset[1] = vmd->dev->resource[VMD_MEMBAR2].start - + (((u64) regu << 32 | reg) & + PCI_BASE_ADDRESS_MEM_MASK); + } + } + /* * Certain VMD devices may have a root port configuration option which * limits the bus range to between 0-127 or 128-255 @@ -712,16 +740,20 @@ static int vmd_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(vmd_dev_pm_ops, vmd_suspend, vmd_resume); static const struct pci_device_id vmd_ids[] = { - 
{PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_201D),}, + {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_201D), + .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP,}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_28C0), .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW | VMD_FEAT_HAS_BUS_RESTRICTIONS,}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_9A0B), - .driver_data = VMD_FEAT_HAS_BUS_RESTRICTIONS,}, + .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP | + VMD_FEAT_HAS_BUS_RESTRICTIONS,}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x467f), - .driver_data = VMD_FEAT_HAS_BUS_RESTRICTIONS,}, + .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP | + VMD_FEAT_HAS_BUS_RESTRICTIONS,}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x4c3d), - .driver_data = VMD_FEAT_HAS_BUS_RESTRICTIONS,}, + .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP | + VMD_FEAT_HAS_BUS_RESTRICTIONS,}, {0,} }; MODULE_DEVICE_TABLE(pci, vmd_ids); -- Gitee From 4b9baa14947cf9ee3a07f1ac9e57dd8d9e016bbb Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Tue, 6 Sep 2022 11:02:02 +0800 Subject: [PATCH 2026/2161] PCI: vmd: Create physical offset helper commit 030109c0376c8f34e95f8b88d170a47900b3ad07 upstream. Backport Reason: fix spr fio softlock issue causing by vmd driver. Conflict: No Move the guest-passthrough physical offset discovery code to a new helper. No functional changes. Link: https://lore.kernel.org/r/20200728194945.14126-2-jonathan.derrick@intel.com Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Signed-off-by: Xinghui Li --- drivers/pci/controller/vmd.c | 105 +++++++++++++++++++++-------------- 1 file changed, 62 insertions(+), 43 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 7bfb987ed2ed..159f08343592 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -416,6 +416,60 @@ static int vmd_find_free_domain(void) return domain + 1; } +static int vmd_get_phys_offsets(struct vmd_dev *vmd, bool native_hint, + resource_size_t *offset1, + resource_size_t *offset2) +{ + struct pci_dev *dev = vmd->dev; + u64 phys1, phys2; + + if (native_hint) { + u32 vmlock; + int ret; + + ret = pci_read_config_dword(dev, PCI_REG_VMLOCK, &vmlock); + if (ret || vmlock == ~0) + return -ENODEV; + + if (MB2_SHADOW_EN(vmlock)) { + void __iomem *membar2; + + membar2 = pci_iomap(dev, VMD_MEMBAR2, 0); + if (!membar2) + return -ENOMEM; + phys1 = readq(membar2 + MB2_SHADOW_OFFSET); + phys2 = readq(membar2 + MB2_SHADOW_OFFSET + 8); + pci_iounmap(dev, membar2); + } else + return 0; + } else { + /* Hypervisor-Emulated Vendor-Specific Capability */ + int pos = pci_find_capability(dev, PCI_CAP_ID_VNDR); + u32 reg, regu; + + pci_read_config_dword(dev, pos + 4, ®); + + /* "SHDW" */ + if (pos && reg == 0x53484457) { + pci_read_config_dword(dev, pos + 8, ®); + pci_read_config_dword(dev, pos + 12, ®u); + phys1 = (u64) regu << 32 | reg; + + pci_read_config_dword(dev, pos + 16, ®); + pci_read_config_dword(dev, pos + 20, ®u); + phys2 = (u64) regu << 32 | reg; + } else + return 0; + } + + *offset1 = dev->resource[VMD_MEMBAR1].start - + (phys1 & PCI_BASE_ADDRESS_MEM_MASK); + *offset2 = dev->resource[VMD_MEMBAR2].start - + (phys2 & PCI_BASE_ADDRESS_MEM_MASK); + + return 0; +} + static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) { struct pci_sysdata *sd = &vmd->sysdata; @@ -427,6 +481,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) resource_size_t offset[2] = {0}; 
resource_size_t membar2_offset = 0x2000; struct pci_bus *child; + int ret; /* * Shadow registers may exist in certain VMD device ids which allow @@ -435,50 +490,14 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) * or 0, depending on an enable bit in the VMD device. */ if (features & VMD_FEAT_HAS_MEMBAR_SHADOW) { - u32 vmlock; - int ret; - membar2_offset = MB2_SHADOW_OFFSET + MB2_SHADOW_SIZE; - ret = pci_read_config_dword(vmd->dev, PCI_REG_VMLOCK, &vmlock); - if (ret || vmlock == ~0) - return -ENODEV; - - if (MB2_SHADOW_EN(vmlock)) { - void __iomem *membar2; - - membar2 = pci_iomap(vmd->dev, VMD_MEMBAR2, 0); - if (!membar2) - return -ENOMEM; - offset[0] = vmd->dev->resource[VMD_MEMBAR1].start - - (readq(membar2 + MB2_SHADOW_OFFSET) & - PCI_BASE_ADDRESS_MEM_MASK); - offset[1] = vmd->dev->resource[VMD_MEMBAR2].start - - (readq(membar2 + MB2_SHADOW_OFFSET + 8) & - PCI_BASE_ADDRESS_MEM_MASK); - pci_iounmap(vmd->dev, membar2); - } - } - - if (features & VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP) { - int pos = pci_find_capability(vmd->dev, PCI_CAP_ID_VNDR); - u32 reg, regu; - - pci_read_config_dword(vmd->dev, pos + 4, ®); - - /* "SHDW" */ - if (pos && reg == 0x53484457) { - pci_read_config_dword(vmd->dev, pos + 8, ®); - pci_read_config_dword(vmd->dev, pos + 12, ®u); - offset[0] = vmd->dev->resource[VMD_MEMBAR1].start - - (((u64) regu << 32 | reg) & - PCI_BASE_ADDRESS_MEM_MASK); - - pci_read_config_dword(vmd->dev, pos + 16, ®); - pci_read_config_dword(vmd->dev, pos + 20, ®u); - offset[1] = vmd->dev->resource[VMD_MEMBAR2].start - - (((u64) regu << 32 | reg) & - PCI_BASE_ADDRESS_MEM_MASK); - } + ret = vmd_get_phys_offsets(vmd, true, &offset[0], &offset[1]); + if (ret) + return ret; + } else if (features & VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP) { + ret = vmd_get_phys_offsets(vmd, false, &offset[0], &offset[1]); + if (ret) + return ret; } /* -- Gitee From 9cf44387b57aedaa0a98fb0406bddb1d5c2a9257 Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Tue, 6 Sep 2022 11:03:04 +0800 Subject: [PATCH 2027/2161] PCI: vmd: Add bus 224-255 restriction decode commit 08bcdd22ecdb01dd60d9284a55f8220a8a40150e upstream. Backport Reason: fix spr fio softlock issue causing by vmd driver. Conflict: No VMD bus restrictions are required when IO fabric is multiplexed such that VMD cannot use the entire bus range. This patch adds another bus restriction decode bit that can be set by firmware to restrict the VMD bus range to between 224-255. 
Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Signed-off-by: Xinghui Li --- drivers/pci/controller/vmd.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 159f08343592..a6a2ea3fd370 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -502,16 +502,30 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) /* * Certain VMD devices may have a root port configuration option which - * limits the bus range to between 0-127 or 128-255 + * limits the bus range to between 0-127, 128-255, or 224-255 */ if (features & VMD_FEAT_HAS_BUS_RESTRICTIONS) { - u32 vmcap, vmconfig; - - pci_read_config_dword(vmd->dev, PCI_REG_VMCAP, &vmcap); - pci_read_config_dword(vmd->dev, PCI_REG_VMCONFIG, &vmconfig); - if (BUS_RESTRICT_CAP(vmcap) && - (BUS_RESTRICT_CFG(vmconfig) == 0x1)) - vmd->busn_start = 128; + u16 reg16; + + pci_read_config_word(vmd->dev, PCI_REG_VMCAP, ®16); + if (BUS_RESTRICT_CAP(reg16)) { + pci_read_config_word(vmd->dev, PCI_REG_VMCONFIG, + ®16); + + switch (BUS_RESTRICT_CFG(reg16)) { + case 1: + vmd->busn_start = 128; + break; + case 2: + vmd->busn_start = 224; + break; + case 3: + pci_err(vmd->dev, "Unknown Bus Offset Setting\n"); + return -ENODEV; + default: + break; + } + } } res = &vmd->dev->resource[VMD_CFGBAR]; -- Gitee From 55377cb48448e8ed026b32e03b83c7d05f8b6462 Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Tue, 6 Sep 2022 11:03:41 +0800 Subject: [PATCH 2028/2161] PCI: vmd: Create bus offset configuration helper commit 2e1224183b1bec6610e01be80f007ee9ab0af9c0 upstream. Backport Reason: fix spr fio softlock issue causing by vmd driver. Conflict: No Move the bus offset configuration discovery code to a new helper. Modify the bus offset 2-bit decode switch to have a 0 case and a default error case, just in case the field is expanded in future hardware. 
Link: https://lore.kernel.org/r/20200728194945.14126-3-jonathan.derrick@intel.com Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Signed-off-by: Xinghui Li --- drivers/pci/controller/vmd.c | 53 ++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index a6a2ea3fd370..790a85f8e6eb 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -470,6 +470,35 @@ static int vmd_get_phys_offsets(struct vmd_dev *vmd, bool native_hint, return 0; } +static int vmd_get_bus_number_start(struct vmd_dev *vmd) +{ + struct pci_dev *dev = vmd->dev; + u16 reg; + + pci_read_config_word(dev, PCI_REG_VMCAP, ®); + if (BUS_RESTRICT_CAP(reg)) { + pci_read_config_word(dev, PCI_REG_VMCONFIG, ®); + + switch (BUS_RESTRICT_CFG(reg)) { + case 0: + vmd->busn_start = 0; + break; + case 1: + vmd->busn_start = 128; + break; + case 2: + vmd->busn_start = 224; + break; + default: + pci_err(dev, "Unknown Bus Offset Setting (%d)\n", + BUS_RESTRICT_CFG(reg)); + return -ENODEV; + } + } + + return 0; +} + static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) { struct pci_sysdata *sd = &vmd->sysdata; @@ -505,27 +534,9 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) * limits the bus range to between 0-127, 128-255, or 224-255 */ if (features & VMD_FEAT_HAS_BUS_RESTRICTIONS) { - u16 reg16; - - pci_read_config_word(vmd->dev, PCI_REG_VMCAP, ®16); - if (BUS_RESTRICT_CAP(reg16)) { - pci_read_config_word(vmd->dev, PCI_REG_VMCONFIG, - ®16); - - switch (BUS_RESTRICT_CFG(reg16)) { - case 1: - vmd->busn_start = 128; - break; - case 2: - vmd->busn_start = 224; - break; - case 3: - pci_err(vmd->dev, "Unknown Bus Offset Setting\n"); - return -ENODEV; - default: - break; - } - } + ret = vmd_get_bus_number_start(vmd); + if (ret) + return ret; } res = &vmd->dev->resource[VMD_CFGBAR]; -- Gitee From b92650d787c797d7afcb2bee2f9ef6f104de4df5 Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Tue, 6 Sep 2022 11:11:24 +0800 Subject: [PATCH 2029/2161] VMD: Use x86_vector_domain as parent domain commit e382dffc904d14cb6e2c31e2eefebdca41343943 Intel-BKC. Backport Reason: fix spr fio softlock issue causing by vmd driver. Conflict: No Upstream commit: e382dffc904d14cb6e2c31e2eefebdca41343943 Author: Keith Busch Date: Mon Jun 20 09:39:52 2016 -0600 x86/PCI: VMD: Use x86_vector_domain as parent domain Otherwise APIC code assumes VMD's IRQ domain can be managed by the APIC, resulting in an invalid cast of irq_data during irq_force_complete_move(). 
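To picture the fix: passing x86_vector_domain as the parent gives every VMD MSI a two-level irq_data chain, so x86 core code that needs vector-domain state (as irq_force_complete_move() does) can reach it through the parent instead of casting VMD's own irq_data. A rough sketch of that relationship follows; the example_ helper is invented for illustration, and parent_data only exists with IRQ domain hierarchy support, which the x86 vector domain relies on.

static void *example_vector_level_data(struct irq_data *vmd_irqd)
{
        /* VMD MSI level -> x86 vector level */
        struct irq_data *vec_irqd = vmd_irqd->parent_data;

        /* chip_data at the vector level is the apic data the core code expects */
        return vec_irqd ? vec_irqd->chip_data : NULL;
}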
Signed-off-by: Jon Derrick Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas Signed-off-by: Xinghui Li --- drivers/pci/controller/vmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 790a85f8e6eb..dfd47e799078 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -602,7 +602,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) return -ENODEV; vmd->irq_domain = pci_msi_create_irq_domain(fn, &vmd_msi_domain_info, - NULL); + x86_vector_domain); if (!vmd->irq_domain) { irq_domain_free_fwnode(fn); -- Gitee From 635436043cd94363436202569f339f1c96d39a59 Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Tue, 6 Sep 2022 11:13:30 +0800 Subject: [PATCH 2030/2161] PCI: vmd: Create IRQ Domain configuration helper commit 1552b11ba15e93ee550c4672aa5acd8c1c5e8a82 upstream. Backport Reason: fix spr fio softlock issue causing by vmd driver. Conflict: No Move the IRQ and MSI Domain configuration code to new helpers. No functional changes. Link: https://lore.kernel.org/r/20200728194945.14126-4-jonathan.derrick@intel.com Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Signed-off-by: Xinghui Li --- drivers/pci/controller/vmd.c | 53 ++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index dfd47e799078..0bf1949918c1 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -297,6 +297,34 @@ static struct msi_domain_info vmd_msi_domain_info = { .chip = &vmd_msi_controller, }; +static int vmd_create_irq_domain(struct vmd_dev *vmd) +{ + struct fwnode_handle *fn; + + fn = irq_domain_alloc_named_id_fwnode("VMD-MSI", vmd->sysdata.domain); + if (!fn) + return -ENODEV; + + vmd->irq_domain = pci_msi_create_irq_domain(fn, &vmd_msi_domain_info, + x86_vector_domain); + if (!vmd->irq_domain) { + irq_domain_free_fwnode(fn); + return -ENODEV; + } + + return 0; +} + +static void vmd_remove_irq_domain(struct vmd_dev *vmd) +{ + if (vmd->irq_domain) { + struct fwnode_handle *fn = vmd->irq_domain->fwnode; + + irq_domain_remove(vmd->irq_domain); + irq_domain_free_fwnode(fn); + } +} + static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus, unsigned int devfn, int reg, int len) { @@ -502,7 +530,6 @@ static int vmd_get_bus_number_start(struct vmd_dev *vmd) static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) { struct pci_sysdata *sd = &vmd->sysdata; - struct fwnode_handle *fn; struct resource *res; u32 upper_bits; unsigned long flags; @@ -597,17 +624,9 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) sd->node = pcibus_to_node(vmd->dev->bus); - fn = irq_domain_alloc_named_id_fwnode("VMD-MSI", vmd->sysdata.domain); - if (!fn) - return -ENODEV; - - vmd->irq_domain = pci_msi_create_irq_domain(fn, &vmd_msi_domain_info, - x86_vector_domain); - - if (!vmd->irq_domain) { - irq_domain_free_fwnode(fn); - return -ENODEV; - } + ret = vmd_create_irq_domain(vmd); + if (ret) + return ret; /* * Override the irq domain bus token so the domain can be distinguished @@ -623,13 +642,13 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) &vmd_ops, sd, &resources); if (!vmd->bus) { pci_free_resource_list(&resources); - irq_domain_remove(vmd->irq_domain); - irq_domain_free_fwnode(fn); + vmd_remove_irq_domain(vmd); return -ENODEV; } 
vmd_attach_resources(vmd); - dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain); + if (vmd->irq_domain) + dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain); pci_scan_child_bus(vmd->bus); pci_assign_unassigned_bus_resources(vmd->bus); @@ -738,15 +757,13 @@ static void vmd_cleanup_srcu(struct vmd_dev *vmd) static void vmd_remove(struct pci_dev *dev) { struct vmd_dev *vmd = pci_get_drvdata(dev); - struct fwnode_handle *fn = vmd->irq_domain->fwnode; sysfs_remove_link(&vmd->dev->dev.kobj, "domain"); pci_stop_root_bus(vmd->bus); pci_remove_root_bus(vmd->bus); vmd_cleanup_srcu(vmd); vmd_detach_resources(vmd); - irq_domain_remove(vmd->irq_domain); - irq_domain_free_fwnode(fn); + vmd_remove_irq_domain(vmd); } #ifdef CONFIG_PM_SLEEP -- Gitee From 39423a05f32c4427fb712caa48781ab01e99c379 Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Tue, 6 Sep 2022 11:14:37 +0800 Subject: [PATCH 2031/2161] PCI: vmd: Create IRQ allocation helper commit 875b4e2a93bb2c3ddefb4dcb292c4bfd46ccc464 upstream. Backport Reason: fix spr fio softlock issue causing by vmd driver. Conflict: No Move the IRQ allocation and SRCU initialization code to a new helper. No functional changes. Link: https://lore.kernel.org/r/20200728194945.14126-5-jonathan.derrick@intel.com Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Signed-off-by: Xinghui Li --- drivers/pci/controller/vmd.c | 94 ++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 41 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 0bf1949918c1..98bc7634aca2 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -527,6 +527,55 @@ static int vmd_get_bus_number_start(struct vmd_dev *vmd) return 0; } +static irqreturn_t vmd_irq(int irq, void *data) +{ + struct vmd_irq_list *irqs = data; + struct vmd_irq *vmdirq; + int idx; + + idx = srcu_read_lock(&irqs->srcu); + list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node) + generic_handle_irq(vmdirq->virq); + srcu_read_unlock(&irqs->srcu, idx); + + return IRQ_HANDLED; +} + +static int vmd_alloc_irqs(struct vmd_dev *vmd) +{ + struct pci_dev *dev = vmd->dev; + int i, err; + + vmd->msix_count = pci_msix_vec_count(dev); + if (vmd->msix_count < 0) + return -ENODEV; + + vmd->msix_count = pci_alloc_irq_vectors(dev, 1, vmd->msix_count, + PCI_IRQ_MSIX); + if (vmd->msix_count < 0) + return vmd->msix_count; + + vmd->irqs = devm_kcalloc(&dev->dev, vmd->msix_count, sizeof(*vmd->irqs), + GFP_KERNEL); + if (!vmd->irqs) + return -ENOMEM; + + for (i = 0; i < vmd->msix_count; i++) { + err = init_srcu_struct(&vmd->irqs[i].srcu); + if (err) + return err; + + INIT_LIST_HEAD(&vmd->irqs[i].irq_list); + err = devm_request_irq(&dev->dev, pci_irq_vector(dev, i), + vmd_irq, IRQF_NO_THREAD, + "vmd", &vmd->irqs[i]); + if (err) + return err; + } + + return 0; +} + static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) { struct pci_sysdata *sd = &vmd->sysdata; @@ -668,24 +717,10 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) return 0; } -static irqreturn_t vmd_irq(int irq, void *data) -{ - struct vmd_irq_list *irqs = data; - struct vmd_irq *vmdirq; - int idx; - - idx = srcu_read_lock(&irqs->srcu); - list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node) - generic_handle_irq(vmdirq->virq); - srcu_read_unlock(&irqs->srcu, idx); - - return IRQ_HANDLED; -} - static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) { struct vmd_dev *vmd; - int i, err; + 
int err; if (resource_size(&dev->resource[VMD_CFGBAR]) < (1 << 20)) return -ENOMEM; @@ -708,32 +743,9 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32))) return -ENODEV; - vmd->msix_count = pci_msix_vec_count(dev); - if (vmd->msix_count < 0) - return -ENODEV; - - vmd->msix_count = pci_alloc_irq_vectors(dev, 1, vmd->msix_count, - PCI_IRQ_MSIX); - if (vmd->msix_count < 0) - return vmd->msix_count; - - vmd->irqs = devm_kcalloc(&dev->dev, vmd->msix_count, sizeof(*vmd->irqs), - GFP_KERNEL); - if (!vmd->irqs) - return -ENOMEM; - - for (i = 0; i < vmd->msix_count; i++) { - err = init_srcu_struct(&vmd->irqs[i].srcu); - if (err) - return err; - - INIT_LIST_HEAD(&vmd->irqs[i].irq_list); - err = devm_request_irq(&dev->dev, pci_irq_vector(dev, i), - vmd_irq, IRQF_NO_THREAD, - "vmd", &vmd->irqs[i]); - if (err) - return err; - } + err = vmd_alloc_irqs(vmd); + if (err) + return err; spin_lock_init(&vmd->cfg_lock); pci_set_drvdata(dev, vmd); -- Gitee From 6d7e63ed38027db02e8ec146588e8b1b87a240df Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Tue, 6 Sep 2022 11:15:26 +0800 Subject: [PATCH 2032/2161] vmd: Update VMD PM to correctly use generic PCI PM commit 93c9fce7d5f3db87f67718a2e5cf09b02c2c2c6a upstream. Backport Reason: fix spr fio softlock issue causing by vmd driver. Conflict: No The pci_save_state() call in vmd_suspend() can be performed by pci_pm_suspend_irq(). This also allows VMD to benefit from the call into pci_prepare_to_sleep(). The pci_restore_state() call in vmd_resume() was restoring state after pci_pm_resume()::pci_restore_standard_config() had already restored state. It's also been suspected that the config state should have been restored before re-requesting IRQs instead of afterwards. Remove the pci_save_state()/pci_restore_state() calls in vmd_suspend()/vmd_resume() to allow proper flow through generic PCI core Power Management code. Link: https://lore.kernel.org/r/20200806210017.5654-1-jonathan.derrick@intel.com Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Cc: Kai-Heng Feng Cc: You-Sheng Yang Signed-off-by: Xinghui Li --- drivers/pci/controller/vmd.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 98bc7634aca2..fad3d7374f0a 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -788,7 +788,6 @@ static int vmd_suspend(struct device *dev) for (i = 0; i < vmd->msix_count; i++) devm_free_irq(dev, pci_irq_vector(pdev, i), &vmd->irqs[i]); - pci_save_state(pdev); return 0; } @@ -806,7 +805,6 @@ static int vmd_resume(struct device *dev) return err; } - pci_restore_state(pdev); return 0; } #endif -- Gitee From 8dc7f1e94c7413295b4a5b449711e5b890699781 Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Tue, 6 Sep 2022 11:16:30 +0800 Subject: [PATCH 2033/2161] PCI: vmd: Assign VMD IRQ domain before enumeration commit 886e67100b904cb1b106ed1dfa8a60696aff519a upstream. Backport Reason: fix spr fio softlock issue causing by vmd driver. Conflict: No During the boot process all the PCI devices are assigned default PCI-MSI IRQ domain including VMD endpoint devices. If interrupt-remapping is enabled by IOMMU, the PCI devices except VMD get new INTEL-IR-MSI IRQ domain. And VMD is supposed to create and assign a separate VMD-MSI IRQ domain for its child devices in order to support MSI-X remapping capabilities. 
Now when MSI-X remapping in VMD is disabled in order to improve performance, VMD skips VMD-MSI IRQ domain assignment process to its child devices. Thus the devices behind VMD get default PCI-MSI IRQ domain instead of INTEL-IR-MSI IRQ domain when VMD creates root bus and configures child devices. As a result host OS fails to boot and DMAR errors were observed when interrupt remapping was enabled on Intel Icelake CPUs. For instance: DMAR: DRHD: handling fault status reg 2 DMAR: [INTR-REMAP] Request device [0xe2:0x00.0] fault index 0xa00 [fault reason 0x25] Blocked a compatibility format interrupt request To fix this issue, dev_msi_info struct in dev struct maintains correct value of IRQ domain. VMD will use this information to assign proper IRQ domain to its child devices when it doesn't create a separate IRQ domain. Link: https://lore.kernel.org/r/20220511095707.25403-2-nirmal.patel@linux.intel.com Signed-off-by: Nirmal Patel Signed-off-by: Lorenzo Pieralisi Signed-off-by: Xinghui Li --- drivers/pci/controller/vmd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index fad3d7374f0a..eb80349ce915 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -698,6 +698,9 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) vmd_attach_resources(vmd); if (vmd->irq_domain) dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain); + else + dev_set_msi_domain(&vmd->bus->dev, + dev_get_msi_domain(&vmd->dev->dev)); pci_scan_child_bus(vmd->bus); pci_assign_unassigned_bus_resources(vmd->bus); -- Gitee From 02ecafc59b8e5a5d7ccab8394e111c3a92db37c9 Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Tue, 6 Sep 2022 11:17:49 +0800 Subject: [PATCH 2034/2161] PCI: vmd: Offset Client VMD MSI-X vectors commit f6b7bb847ca821a8aaa1b6da10ee65311e6f15bf upstream. Backport Reason: fix spr fio softlock issue causing by vmd driver. Conflict: No Client VMD platforms have a software-triggered MSI-X vector 0 that will not forward hardware-remapped MSI from the sub-device domain. This causes an issue with VMD platforms that use AHCI behind VMD and have a single MSI-X vector remapped to VMD vector 0. Add a VMD MSI-X vector offset for these platforms. 
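The effect on vector selection can be summarized with a small sketch, simplified from vmd_next_irq() in the diff that follows; the example_ name is invented, and the storage-class fast path, per-vector reference counting under the list lock, and locking of the real function are left out. The point is simply that child interrupts only ever land on vectors first_vec and above, leaving vector 0 to the software-triggered use.

static unsigned int example_pick_vector(const unsigned int *use_count,
                                        unsigned int msix_count,
                                        unsigned int first_vec)
{
        unsigned int i, best = first_vec + 1;

        if (msix_count == first_vec + 1)
                return first_vec;       /* only one usable vector left */

        /* pick the least loaded of the remaining vectors */
        for (i = best; i < msix_count; i++)
                if (use_count[i] < use_count[best])
                        best = i;

        return best;
}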
Link: https://lore.kernel.org/r/20201102222223.92978-1-jonathan.derrick@intel.com Tested-by: Jian-Hong Pan Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Signed-off-by: Xinghui Li --- drivers/pci/controller/vmd.c | 37 +++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index eb80349ce915..d6abbf0f6881 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -52,6 +52,12 @@ enum vmd_features { * vendor-specific capability space */ VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP = (1 << 2), + + /* + * Device may use MSI-X vector 0 for software triggering and will not + * be used for MSI remapping + */ + VMD_FEAT_OFFSET_FIRST_VECTOR = (1 << 3), }; /* @@ -103,6 +109,7 @@ struct vmd_dev { struct irq_domain *irq_domain; struct pci_bus *bus; u8 busn_start; + u8 first_vec; }; static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus) @@ -198,11 +205,11 @@ static irq_hw_number_t vmd_get_hwirq(struct msi_domain_info *info, */ static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *desc) { - int i, best = 1; unsigned long flags; + int i, best; - if (vmd->msix_count == 1) - return &vmd->irqs[0]; + if (vmd->msix_count == 1 + vmd->first_vec) + return &vmd->irqs[vmd->first_vec]; /* * White list for fast-interrupt handlers. All others will share the @@ -212,11 +219,12 @@ static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *d case PCI_CLASS_STORAGE_EXPRESS: break; default: - return &vmd->irqs[0]; + return &vmd->irqs[vmd->first_vec]; } raw_spin_lock_irqsave(&list_lock, flags); - for (i = 1; i < vmd->msix_count; i++) + best = vmd->first_vec + 1; + for (i = best; i < vmd->msix_count; i++) if (vmd->irqs[i].count < vmd->irqs[best].count) best = i; vmd->irqs[best].count++; @@ -550,8 +558,8 @@ static int vmd_alloc_irqs(struct vmd_dev *vmd) if (vmd->msix_count < 0) return -ENODEV; - vmd->msix_count = pci_alloc_irq_vectors(dev, 1, vmd->msix_count, - PCI_IRQ_MSIX); + vmd->msix_count = pci_alloc_irq_vectors(dev, vmd->first_vec + 1, + vmd->msix_count, PCI_IRQ_MSIX); if (vmd->msix_count < 0) return vmd->msix_count; @@ -722,6 +730,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) { + unsigned long features = (unsigned long) id->driver_data; struct vmd_dev *vmd; int err; @@ -746,13 +755,16 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32))) return -ENODEV; + if (features & VMD_FEAT_OFFSET_FIRST_VECTOR) + vmd->first_vec = 1; + err = vmd_alloc_irqs(vmd); if (err) return err; spin_lock_init(&vmd->cfg_lock); pci_set_drvdata(dev, vmd); - err = vmd_enable_domain(vmd, (unsigned long) id->driver_data); + err = vmd_enable_domain(vmd, features); if (err) return err; @@ -821,13 +833,16 @@ static const struct pci_device_id vmd_ids[] = { VMD_FEAT_HAS_BUS_RESTRICTIONS,}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_9A0B), .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP | - VMD_FEAT_HAS_BUS_RESTRICTIONS,}, + VMD_FEAT_HAS_BUS_RESTRICTIONS | + VMD_FEAT_OFFSET_FIRST_VECTOR,}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x467f), .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP | - VMD_FEAT_HAS_BUS_RESTRICTIONS,}, + VMD_FEAT_HAS_BUS_RESTRICTIONS | + VMD_FEAT_OFFSET_FIRST_VECTOR,}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x4c3d), .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP | - 
VMD_FEAT_HAS_BUS_RESTRICTIONS,}, + VMD_FEAT_HAS_BUS_RESTRICTIONS | + VMD_FEAT_OFFSET_FIRST_VECTOR,}, {0,} }; MODULE_DEVICE_TABLE(pci, vmd_ids); -- Gitee From 6a7870cd04b1fefd3ed5c45308436ca42f244911 Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Tue, 6 Sep 2022 11:19:35 +0800 Subject: [PATCH 2035/2161] PCI: vmd: Disable MSI-X remapping when possible commit ee81ee84f8739e584c9ccf113ba3c796187b7080 upstream. Backport Reason: fix spr fio softlock issue causing by vmd driver. Conflict: No VMD will retransmit child device MSI-X using its own MSI-X table and requester-id. This limits the number of MSI-X available to the whole child device domain to the number of VMD MSI-X interrupts. Some VMD devices have a mode where this remapping can be disabled, allowing child device interrupts to bypass processing with the VMD MSI-X domain interrupt handler and going straight the child device interrupt handler, allowing for better performance and scaling. The requester-id still gets changed to the VMD endpoint's requester-id, and the interrupt remapping handlers have been updated to properly set IRTE for child device interrupts to the VMD endpoint's context. Some VMD platforms have existing production BIOS which rely on MSI-X remapping and won't explicitly program the MSI-X remapping bit. This re-enables MSI-X remapping on unload. Link: https://lore.kernel.org/r/20210210161315.316097-3-jonathan.derrick@intel.com Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Reviewed-by: Krzysztof Wilczy?ski Acked-by: Joerg Roedel Signed-off-by: Xinghui Li --- drivers/pci/controller/vmd.c | 63 +++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index d6abbf0f6881..74f2c7f32351 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -27,6 +27,7 @@ #define BUS_RESTRICT_CAP(vmcap) (vmcap & 0x1) #define PCI_REG_VMCONFIG 0x44 #define BUS_RESTRICT_CFG(vmcfg) ((vmcfg >> 8) & 0x3) +#define VMCONFIG_MSI_REMAP 0x2 #define PCI_REG_VMLOCK 0x70 #define MB2_SHADOW_EN(vmlock) (vmlock & 0x2) @@ -58,6 +59,13 @@ enum vmd_features { * be used for MSI remapping */ VMD_FEAT_OFFSET_FIRST_VECTOR = (1 << 3), + + /* + * Device can bypass remapping MSI-X transactions into its MSI-X table, + * avoiding the requirement of a VMD MSI domain for child device + * interrupt handling. + */ + VMD_FEAT_CAN_BYPASS_MSI_REMAP = (1 << 4), }; /* @@ -305,6 +313,16 @@ static struct msi_domain_info vmd_msi_domain_info = { .chip = &vmd_msi_controller, }; +static void vmd_set_msi_remapping(struct vmd_dev *vmd, bool enable) +{ + u16 reg; + + pci_read_config_word(vmd->dev, PCI_REG_VMCONFIG, ®); + reg = enable ? (reg & ~VMCONFIG_MSI_REMAP) : + (reg | VMCONFIG_MSI_REMAP); + pci_write_config_word(vmd->dev, PCI_REG_VMCONFIG, reg); +} + static int vmd_create_irq_domain(struct vmd_dev *vmd) { struct fwnode_handle *fn; @@ -325,6 +343,13 @@ static int vmd_create_irq_domain(struct vmd_dev *vmd) static void vmd_remove_irq_domain(struct vmd_dev *vmd) { + /* + * Some production BIOS won't enable remapping between soft reboots. + * Ensure remapping is restored before unloading the driver. 
+ */ + if (!vmd->msix_count) + vmd_set_msi_remapping(vmd, true); + if (vmd->irq_domain) { struct fwnode_handle *fn = vmd->irq_domain->fwnode; @@ -681,15 +706,32 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) sd->node = pcibus_to_node(vmd->dev->bus); - ret = vmd_create_irq_domain(vmd); - if (ret) - return ret; - /* - * Override the irq domain bus token so the domain can be distinguished - * from a regular PCI/MSI domain. + * Currently MSI remapping must be enabled in guest passthrough mode + * due to some missing interrupt remapping plumbing. This is probably + * acceptable because the guest is usually CPU-limited and MSI + * remapping doesn't become a performance bottleneck. */ - irq_domain_update_bus_token(vmd->irq_domain, DOMAIN_BUS_VMD_MSI); + if (!(features & VMD_FEAT_CAN_BYPASS_MSI_REMAP) || + offset[0] || offset[1]) { + ret = vmd_alloc_irqs(vmd); + if (ret) + return ret; + + vmd_set_msi_remapping(vmd, true); + + ret = vmd_create_irq_domain(vmd); + if (ret) + return ret; + + /* + * Override the IRQ domain bus token so the domain can be + * distinguished from a regular PCI/MSI domain. + */ + irq_domain_update_bus_token(vmd->irq_domain, DOMAIN_BUS_VMD_MSI); + } else { + vmd_set_msi_remapping(vmd, false); + } pci_add_resource(&resources, &vmd->resources[0]); pci_add_resource_offset(&resources, &vmd->resources[1], offset[0]); @@ -758,10 +800,6 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) if (features & VMD_FEAT_OFFSET_FIRST_VECTOR) vmd->first_vec = 1; - err = vmd_alloc_irqs(vmd); - if (err) - return err; - spin_lock_init(&vmd->cfg_lock); pci_set_drvdata(dev, vmd); err = vmd_enable_domain(vmd, features); @@ -830,7 +868,8 @@ static const struct pci_device_id vmd_ids[] = { .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP,}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_28C0), .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW | - VMD_FEAT_HAS_BUS_RESTRICTIONS,}, + VMD_FEAT_HAS_BUS_RESTRICTIONS | + VMD_FEAT_CAN_BYPASS_MSI_REMAP,}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_9A0B), .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP | VMD_FEAT_HAS_BUS_RESTRICTIONS | -- Gitee From 014f730f94b76b3e98fd9957733dbcf5cce43a0f Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Tue, 6 Sep 2022 11:25:45 +0800 Subject: [PATCH 2036/2161] PCI: vmd: Do not disable MSI-X remapping if interrupt remapping is enabled by IOMMU commit 2565e5b69c44b4e42469afea3cc5a97e74d1ed45 upstream. Backport Reason: fix spr fio softlock issue causing by vmd driver. Conflict: No When enabling VMD in BIOS setup (Ice Lake Processor: Whitley platform), the host OS cannot boot successfully with the following error message: nvme nvme0: I/O 12 QID 0 timeout, completion polled nvme nvme0: Shutdown timeout set to 6 seconds DMAR: DRHD: handling fault status reg 2 DMAR: [INTR-REMAP] Request device [0x00:0x00.5] fault index 0xa00 [fault reason 0x25] Blocked a compatibility format interrupt request The request device is the VMD controller: # lspci -s 0000:00.5 -nn 0000:00:00.5 RAID bus controller [0104]: Intel Corporation Volume Management Device NVMe RAID Controller [8086:28c0] (rev 04) `git bisect` points to this offending commit ee81ee84f873 ("PCI: vmd: Disable MSI-X remapping when possible"), which disables VMD MSI remapping. The IOMMU hardware blocks the compatibility format interrupt request because Interrupt Remapping Enable Status (IRES) and Extended Interrupt Mode Enable (EIME) are enabled. 
Please refer to section "5.1.4 Interrupt-Remapping Hardware Operation" in Intel VT-d spec. To fix the issue, VMD driver still enables the interrupt remapping irrespective of VMD_FEAT_CAN_BYPASS_MSI_REMAP if the IOMMU subsystem enables the interrupt remapping. Test configuration is shown as follows: * Two VMD controllers 1. 8086:28c0 (Whitley's VMD) 2. 8086:201d (Purley's VMD: The issue does not appear in this controller. Just make sure if any side effect occurs.) * w/wo intremap=off Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=214219 Link: https://lore.kernel.org/r/20210901124047.1615-1-adrianhuang0701@gmail.com Signed-off-by: Adrian Huang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Jon Derrick Cc: Jon Derrick Cc: Nirmal Patel Cc: Joerg Roedel Signed-off-by: Xinghui Li --- drivers/pci/controller/vmd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 74f2c7f32351..33bcbb172d9e 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -712,7 +713,8 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) * acceptable because the guest is usually CPU-limited and MSI * remapping doesn't become a performance bottleneck. */ - if (!(features & VMD_FEAT_CAN_BYPASS_MSI_REMAP) || + if (iommu_capable(vmd->dev->dev.bus, IOMMU_CAP_INTR_REMAP) || + !(features & VMD_FEAT_CAN_BYPASS_MSI_REMAP) || offset[0] || offset[1]) { ret = vmd_alloc_irqs(vmd); if (ret) -- Gitee From 609f91cc92f63aa2a2bbdb3f5b153fc14ee83f8a Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Tue, 6 Sep 2022 11:27:41 +0800 Subject: [PATCH 2037/2161] PCI: vmd: Revert 2565e5b69c44 ("PCI: vmd: Do not disable MSI-X remapping if interrupt remapping is enabled by IOMMU.") commit c94f732e8001a860b42aa740b0a178a29907463c upstream. Backport Reason: fix spr fio softlock issue causing by vmd driver. Revert 2565e5b69c44 ("PCI: vmd: Do not disable MSI-X remapping if interrupt remapping is enabled by IOMMU.") The commit 2565e5b69c44 was added as a workaround to keep MSI-X remapping enabled if IOMMU enables interrupt remapping. VMD would keep running in low performance mode. There is no dependency between MSI-X remapping by VMD and interrupt remapping by IOMMU. Link: https://lore.kernel.org/r/20220511095707.25403-3-nirmal.patel@linux.intel.com Signed-off-by: Nirmal Patel Signed-off-by: Lorenzo Pieralisi Signed-off-by: Xinghui Li --- drivers/pci/controller/vmd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 33bcbb172d9e..74f2c7f32351 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -6,7 +6,6 @@ #include #include -#include #include #include #include @@ -713,8 +712,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) * acceptable because the guest is usually CPU-limited and MSI * remapping doesn't become a performance bottleneck. 
*/ - if (iommu_capable(vmd->dev->dev.bus, IOMMU_CAP_INTR_REMAP) || - !(features & VMD_FEAT_CAN_BYPASS_MSI_REMAP) || + if (!(features & VMD_FEAT_CAN_BYPASS_MSI_REMAP) || offset[0] || offset[1]) { ret = vmd_alloc_irqs(vmd); if (ret) -- Gitee From a21c55cd00da4e771797a1881bf4480a758b1feb Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Mon, 3 Oct 2022 22:53:29 +0800 Subject: [PATCH 2038/2161] Revert "dmaengine: Remove the last, used parameters in dma_async_is_tx_complete" commit opencloudos. This reverts commit 11b6822fb65944e91f9690202bc9d75b214e8ee3. Signed-off-by: Xinghui Li --- Documentation/driver-api/dmaengine/client.rst | 17 ++++++++++++++--- .../driver-api/dmaengine/provider.rst | 7 +++++++ drivers/crypto/stm32/stm32-hash.c | 3 ++- drivers/dma/dmaengine.c | 2 +- drivers/dma/dmatest.c | 3 ++- drivers/media/platform/omap/omap_vout_vrfb.c | 2 +- drivers/media/platform/pxa_camera.c | 13 ++----------- drivers/rapidio/devices/rio_mport_cdev.c | 3 ++- include/linux/dmaengine.h | 18 +++++++++++++++--- 9 files changed, 46 insertions(+), 22 deletions(-) diff --git a/Documentation/driver-api/dmaengine/client.rst b/Documentation/driver-api/dmaengine/client.rst index fb2f6cfd4530..dfab71c637f6 100644 --- a/Documentation/driver-api/dmaengine/client.rst +++ b/Documentation/driver-api/dmaengine/client.rst @@ -162,8 +162,8 @@ The details of these operations are: dma_cookie_t dmaengine_submit(struct dma_async_tx_descriptor *desc) - This returns a cookie that can be used to check the progress of a transaction - via dma_async_is_tx_complete(). + This returns a cookie can be used to check the progress of DMA engine + activity via other DMA engine calls not covered in this document. dmaengine_submit() will not start the DMA operation, it merely adds it to the pending queue. For this, see step 5, dma_async_issue_pending. @@ -243,11 +243,22 @@ Further APIs: .. code-block:: c enum dma_status dma_async_is_tx_complete(struct dma_chan *chan, - dma_cookie_t cookie) + dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used) + + This can be used to check the status of the channel. Please see + the documentation in include/linux/dmaengine.h for a more complete + description of this API. This can be used with the cookie returned from dmaengine_submit() to check for completion of a specific DMA transaction. + .. note:: + + Not all DMA engine drivers can return reliable information for + a running DMA channel. It is recommended that DMA engine users + pause or stop (via dmaengine_terminate_all()) the channel before + using this API. + 5. Synchronize termination API .. code-block:: c diff --git a/Documentation/driver-api/dmaengine/provider.rst b/Documentation/driver-api/dmaengine/provider.rst index 31e140c353d3..ceec2374add6 100644 --- a/Documentation/driver-api/dmaengine/provider.rst +++ b/Documentation/driver-api/dmaengine/provider.rst @@ -481,6 +481,13 @@ where to put them) - Makes sure that dependent operations are run before marking it as complete. +dma_cookie_t + +- it's a DMA transaction ID that will increment over time. + +- Not really relevant any more since the introduction of ``virt-dma`` + that abstracts it away. 
+ DMA_CTRL_ACK - If clear, the descriptor cannot be reused by provider until the diff --git a/drivers/crypto/stm32/stm32-hash.c b/drivers/crypto/stm32/stm32-hash.c index 37ef6df6e4d5..dcce15b55809 100644 --- a/drivers/crypto/stm32/stm32-hash.c +++ b/drivers/crypto/stm32/stm32-hash.c @@ -451,7 +451,8 @@ static int stm32_hash_xmit_dma(struct stm32_hash_dev *hdev, msecs_to_jiffies(100))) err = -ETIMEDOUT; - if (dma_async_is_tx_complete(hdev->dma_lch, cookie) != DMA_COMPLETE) + if (dma_async_is_tx_complete(hdev->dma_lch, cookie, + NULL, NULL) != DMA_COMPLETE) err = -ETIMEDOUT; if (err) { diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 590f238a0671..2cfa8458b51b 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -523,7 +523,7 @@ enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie) dma_async_issue_pending(chan); do { - status = dma_async_is_tx_complete(chan, cookie); + status = dma_async_is_tx_complete(chan, cookie, NULL, NULL); if (time_after_eq(jiffies, dma_sync_wait_timeout)) { dev_err(chan->device->dev, "%s: timeout!\n", __func__); return DMA_ERROR; diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index a184c8c50e20..238936e2dfe2 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -813,7 +813,8 @@ static int dmatest_func(void *data) done->done, msecs_to_jiffies(params->timeout)); - status = dma_async_is_tx_complete(chan, cookie); + status = dma_async_is_tx_complete(chan, cookie, NULL, + NULL); } if (!done->done) { diff --git a/drivers/media/platform/omap/omap_vout_vrfb.c b/drivers/media/platform/omap/omap_vout_vrfb.c index 6750db90b450..6bd672cbdb62 100644 --- a/drivers/media/platform/omap/omap_vout_vrfb.c +++ b/drivers/media/platform/omap/omap_vout_vrfb.c @@ -289,7 +289,7 @@ int omap_vout_prepare_vrfb(struct omap_vout_device *vout, vout->vrfb_dma_tx.tx_status == 1, VRFB_TX_TIMEOUT); - status = dma_async_is_tx_complete(chan, cookie); + status = dma_async_is_tx_complete(chan, cookie, NULL, NULL); if (vout->vrfb_dma_tx.tx_status == 0) { pr_err("%s: Timeout while waiting for DMA\n", __func__); diff --git a/drivers/media/platform/pxa_camera.c b/drivers/media/platform/pxa_camera.c index dd3fb8d2e86f..6e04e3ec61ba 100644 --- a/drivers/media/platform/pxa_camera.c +++ b/drivers/media/platform/pxa_camera.c @@ -1089,17 +1089,8 @@ static void pxa_camera_dma_irq(struct pxa_camera_dev *pcdev, last_buf = list_entry(pcdev->capture.prev, struct pxa_buffer, queue); last_status = dma_async_is_tx_complete(pcdev->dma_chans[chan], - last_buf->cookie[chan]); - /* - * Peek into the channel and read the last cookie that was issued. - * This is a layering violation - the dmaengine API does not officially - * provide this information. Since this camera driver is tightly coupled - * with a specific DMA device we know exactly how this cookie value will - * behave. Otherwise, this wouldn't be safe. - */ - last_issued = pcdev->dma_chans[chan]->cookie; - barrier(); - + last_buf->cookie[chan], + NULL, &last_issued); if (camera_status & overrun && last_status != DMA_COMPLETE) { dev_dbg(pcdev_to_dev(pcdev), "FIFO overrun! 
CISR: %x\n", diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 41e3c3fcd925..2b08fdeb87c1 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -599,7 +599,8 @@ static void dma_xfer_callback(void *param) struct mport_dma_req *req = (struct mport_dma_req *)param; struct mport_cdev_priv *priv = req->priv; - req->status = dma_async_is_tx_complete(priv->dmach, req->cookie); + req->status = dma_async_is_tx_complete(priv->dmach, req->cookie, + NULL, NULL); complete(&req->req_comp); kref_put(&req->refcount, dma_req_free); } diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 59caf248173f..b70bdaf38514 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1415,13 +1415,25 @@ static inline void dma_async_issue_pending(struct dma_chan *chan) * dma_async_is_tx_complete - poll for transaction completion * @chan: DMA channel * @cookie: transaction identifier to check status of + * @last: returns last completed cookie, can be NULL + * @used: returns last issued cookie, can be NULL + * + * If @last and @used are passed in, upon return they reflect the most + * recently submitted (used) cookie and the most recently completed + * cookie. */ static inline enum dma_status dma_async_is_tx_complete(struct dma_chan *chan, - dma_cookie_t cookie) + dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used) { struct dma_tx_state state; - - return chan->device->device_tx_status(chan, cookie, &state); + enum dma_status status; + + status = chan->device->device_tx_status(chan, cookie, &state); + if (last) + *last = state.last; + if (used) + *used = state.used; + return status; } #ifdef CONFIG_DMA_ENGINE -- Gitee From 244dcc76c857b7059fcd1061e73d08e0df62ab8c Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Sat, 8 Oct 2022 15:52:56 +0800 Subject: [PATCH 2039/2161] Revert "dmaengine: Remove dma_set_tx_state" commit opencloudos. This reverts commit ad8bbc00006ab663655f4720ce69288bd3026956. Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/dma/dmaengine.h b/drivers/dma/dmaengine.h index c31383c9db1d..2a695a30d334 100644 --- a/drivers/dma/dmaengine.h +++ b/drivers/dma/dmaengine.h @@ -88,6 +88,15 @@ static inline enum dma_status dma_cookie_status(struct dma_chan *chan, return DMA_IN_PROGRESS; } +static inline void dma_set_tx_state(struct dma_tx_state *st, + dma_cookie_t last, dma_cookie_t used, u32 residue) +{ + if (!st) + return; + + st->residue = residue; +} + static inline void dma_set_residue(struct dma_tx_state *state, u32 residue) { if (state) -- Gitee From 03918e9889e0c5936e175361a5d0a6b63e5f90a2 Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Sat, 8 Oct 2022 15:53:03 +0800 Subject: [PATCH 2040/2161] Revert "dmaengine: Remove last, used from dma_tx_state" commit opencloudos. This reverts commit 8aabc8207cd8c897b5891fd32f8bc41b699d4078. 
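Together with dma_async_is_complete(), restored two reverts further down, this brings back the classic provider-side reporting pattern. The sketch below shows what a driver's device_tx_status() typically did with these helpers; it is illustrative only, not lifted from a particular driver, and residue reporting and locking are omitted.

static enum dma_status example_tx_status(struct dma_chan *chan,
                                         dma_cookie_t cookie,
                                         struct dma_tx_state *txstate)
{
        dma_cookie_t last_complete = chan->completed_cookie;
        dma_cookie_t last_used = chan->cookie;

        /* publish the completion window so the client can test other cookies itself */
        dma_set_tx_state(txstate, last_complete, last_used, 0);

        return dma_async_is_complete(cookie, last_complete, last_used);
}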
Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.h | 4 ++++ include/linux/dmaengine.h | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/drivers/dma/dmaengine.h b/drivers/dma/dmaengine.h index 2a695a30d334..80787829539d 100644 --- a/drivers/dma/dmaengine.h +++ b/drivers/dma/dmaengine.h @@ -74,6 +74,8 @@ static inline enum dma_status dma_cookie_status(struct dma_chan *chan, complete = chan->completed_cookie; barrier(); if (state) { + state->last = complete; + state->used = used; state->residue = 0; state->in_flight_bytes = 0; } @@ -94,6 +96,8 @@ static inline void dma_set_tx_state(struct dma_tx_state *st, if (!st) return; + st->last = last; + st->used = used; st->residue = residue; } diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index b70bdaf38514..7a4e8a51ce6f 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -696,12 +696,16 @@ static inline struct dma_async_tx_descriptor *txd_next(struct dma_async_tx_descr /** * struct dma_tx_state - filled in to report the status of * a transfer. + * @last: last completed DMA cookie + * @used: last issued DMA cookie (i.e. the one in progress) * @residue: the remaining number of bytes left to transmit * on the selected transfer for states DMA_IN_PROGRESS and * DMA_PAUSED if this is implemented in the driver, else 0 * @in_flight_bytes: amount of data in bytes cached by the DMA. */ struct dma_tx_state { + dma_cookie_t last; + dma_cookie_t used; u32 residue; u32 in_flight_bytes; }; -- Gitee From 00571d0d1a4385ab5f11f3e973ea32b20e5ae69c Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Sat, 8 Oct 2022 15:59:33 +0800 Subject: [PATCH 2041/2161] Revert "dmaengine: Move dma_set_tx_state to the provider API header" commit opencloudos. This reverts commit 57fa93b96dfecdfd2b57f924a64ac88fa717a5e4. Signed-off-by: Xinghui Li --- drivers/dma/dmaengine.h | 11 ----------- include/linux/dmaengine.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/dma/dmaengine.h b/drivers/dma/dmaengine.h index 80787829539d..04c1ebf96640 100644 --- a/drivers/dma/dmaengine.h +++ b/drivers/dma/dmaengine.h @@ -90,17 +90,6 @@ static inline enum dma_status dma_cookie_status(struct dma_chan *chan, return DMA_IN_PROGRESS; } -static inline void dma_set_tx_state(struct dma_tx_state *st, - dma_cookie_t last, dma_cookie_t used, u32 residue) -{ - if (!st) - return; - - st->last = last; - st->used = used; - st->residue = residue; -} - static inline void dma_set_residue(struct dma_tx_state *state, u32 residue) { if (state) diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 7a4e8a51ce6f..a439ca03b450 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1440,6 +1440,17 @@ static inline enum dma_status dma_async_is_tx_complete(struct dma_chan *chan, return status; } +static inline void +dma_set_tx_state(struct dma_tx_state *st, dma_cookie_t last, dma_cookie_t used, u32 residue) +{ + if (!st) + return; + + st->last = last; + st->used = used; + st->residue = residue; +} + #ifdef CONFIG_DMA_ENGINE struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type); enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie); -- Gitee From 2c0d7396a6710e06a7cb95fcfca92da77a0da023 Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Sat, 8 Oct 2022 15:59:39 +0800 Subject: [PATCH 2042/2161] Revert "dmaengine: Remove dma_async_is_complete from client API" commit opencloudos. This reverts commit e63ff28e3ac9042eeda7a3a9a0576d6c2d2ab4ad. 
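On the client side, the restored four-argument signature is used roughly as in the sketch below: submit a descriptor, then poll its cookie, optionally retrieving the last completed and last issued cookies. This is a minimal example with no timeout or error handling (note the in-tree cautions about polling completion in a tight loop), and the example_ name is invented.

static enum dma_status example_wait_for_desc(struct dma_chan *chan,
                                             struct dma_async_tx_descriptor *desc)
{
        dma_cookie_t cookie, last, used;
        enum dma_status status;

        cookie = dmaengine_submit(desc);
        dma_async_issue_pending(chan);

        do {
                status = dma_async_is_tx_complete(chan, cookie, &last, &used);
        } while (status == DMA_IN_PROGRESS);

        /*
         * Other cookies issued on this channel can now be tested against the
         * same (last, used) window with dma_async_is_complete() without
         * touching the hardware again.
         */
        return status;
}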
Signed-off-by: Xinghui Li --- Documentation/driver-api/dmaengine/client.rst | 5 ++-- drivers/dma/amba-pl08x.c | 1 + drivers/dma/at_hdmac.c | 3 +- drivers/dma/dmaengine.h | 10 +------ include/linux/dmaengine.h | 28 +++++++++++++++++-- 5 files changed, 32 insertions(+), 15 deletions(-) diff --git a/Documentation/driver-api/dmaengine/client.rst b/Documentation/driver-api/dmaengine/client.rst index dfab71c637f6..45953f171500 100644 --- a/Documentation/driver-api/dmaengine/client.rst +++ b/Documentation/driver-api/dmaengine/client.rst @@ -249,8 +249,9 @@ Further APIs: the documentation in include/linux/dmaengine.h for a more complete description of this API. - This can be used with the cookie returned from dmaengine_submit() - to check for completion of a specific DMA transaction. + This can be used in conjunction with dma_async_is_complete() and + the cookie returned from dmaengine_submit() to check for + completion of a specific DMA transaction. .. note:: diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index 8bf8f42b8b06..9adc7a2fa3d3 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -1544,6 +1544,7 @@ static struct dma_async_tx_descriptor *pl08x_prep_dma_interrupt( } /* + * Code accessing dma_async_is_complete() in a tight loop may give problems. * If slaves are relying on interrupts to signal completion this function * must not be called with interrupts disabled. */ diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c index 0e320a74ee45..303bc3e601a1 100644 --- a/drivers/dma/at_hdmac.c +++ b/drivers/dma/at_hdmac.c @@ -1466,7 +1466,8 @@ static int atc_terminate_all(struct dma_chan *chan) * @txstate: if not %NULL updated with transaction state * * If @txstate is passed in, upon return it reflect the driver - * internal state. + * internal state and can be used with dma_async_is_complete() to check + * the status of multiple cookies without re-checking hardware state. */ static enum dma_status atc_tx_status(struct dma_chan *chan, diff --git a/drivers/dma/dmaengine.h b/drivers/dma/dmaengine.h index 04c1ebf96640..1bfbd64b1371 100644 --- a/drivers/dma/dmaengine.h +++ b/drivers/dma/dmaengine.h @@ -79,15 +79,7 @@ static inline enum dma_status dma_cookie_status(struct dma_chan *chan, state->residue = 0; state->in_flight_bytes = 0; } - - if (complete <= used) { - if ((cookie <= complete) || (cookie > used)) - return DMA_COMPLETE; - } else { - if ((cookie <= complete) && (cookie > used)) - return DMA_COMPLETE; - } - return DMA_IN_PROGRESS; + return dma_async_is_complete(cookie, complete, used); } static inline void dma_set_residue(struct dma_tx_state *state, u32 residue) diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index a439ca03b450..6529dd736106 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1422,9 +1422,9 @@ static inline void dma_async_issue_pending(struct dma_chan *chan) * @last: returns last completed cookie, can be NULL * @used: returns last issued cookie, can be NULL * - * If @last and @used are passed in, upon return they reflect the most - * recently submitted (used) cookie and the most recently completed - * cookie. + * If @last and @used are passed in, upon return they reflect the driver + * internal state and can be used with dma_async_is_complete() to check + * the status of multiple cookies without re-checking hardware state. 
*/ static inline enum dma_status dma_async_is_tx_complete(struct dma_chan *chan, dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used) @@ -1440,6 +1440,28 @@ static inline enum dma_status dma_async_is_tx_complete(struct dma_chan *chan, return status; } +/** + * dma_async_is_complete - test a cookie against chan state + * @cookie: transaction identifier to test status of + * @last_complete: last know completed transaction + * @last_used: last cookie value handed out + * + * dma_async_is_complete() is used in dma_async_is_tx_complete() + * the test logic is separated for lightweight testing of multiple cookies + */ +static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie, + dma_cookie_t last_complete, dma_cookie_t last_used) +{ + if (last_complete <= last_used) { + if ((cookie <= last_complete) || (cookie > last_used)) + return DMA_COMPLETE; + } else { + if ((cookie <= last_complete) && (cookie > last_used)) + return DMA_COMPLETE; + } + return DMA_IN_PROGRESS; +} + static inline void dma_set_tx_state(struct dma_tx_state *st, dma_cookie_t last, dma_cookie_t used, u32 residue) { -- Gitee From 262a261f6d283d2e7d47192ac605afcfc2fe2948 Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Sun, 9 Oct 2022 15:34:21 +0800 Subject: [PATCH 2043/2161] update config.default_nosign for tmanger make spr support kernel. commit opencloudos. Signed-off-by: Xinghui Li --- package/default/config.default_kasan | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/package/default/config.default_kasan b/package/default/config.default_kasan index dcfabd5d2b73..381593d4e0bc 100644 --- a/package/default/config.default_kasan +++ b/package/default/config.default_kasan @@ -157,6 +157,7 @@ CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_IOASIDS=y # CONFIG_CGROUP_RDMA is not set CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_HUGETLB=y @@ -455,6 +456,7 @@ CONFIG_LEGACY_VSYSCALL_EMULATE=y # CONFIG_LEGACY_VSYSCALL_NONE is not set # CONFIG_CMDLINE_BOOL is not set CONFIG_MODIFY_LDT_SYSCALL=y +# CONFIG_STRICT_SIGALTSTACK_SIZE is not set CONFIG_HAVE_LIVEPATCH=y CONFIG_LIVEPATCH=y # end of Processor type and features @@ -536,11 +538,14 @@ CONFIG_ACPI_APEI_MEMORY_FAILURE=y CONFIG_ACPI_APEI_EINJ=m CONFIG_ACPI_APEI_ERST_DEBUG=m # CONFIG_DPTF_POWER is not set +CONFIG_ACPI_PFRU=y +CONFIG_ACPI_PFRU_TELEMETRY=y CONFIG_ACPI_EXTLOG=m CONFIG_ACPI_ADXL=y # CONFIG_PMIC_OPREGION is not set # CONFIG_ACPI_CONFIGFS is not set CONFIG_X86_PM_TIMER=y +CONFIG_ACPI_PRMT=y # CONFIG_SFI is not set # @@ -698,7 +703,8 @@ CONFIG_OPROFILE_EVENT_MULTIPLEX=y CONFIG_HAVE_OPROFILE=y CONFIG_OPROFILE_NMI_TIMER=y CONFIG_KPROBES=y -# CONFIG_JUMP_LABEL is not set +CONFIG_JUMP_LABEL=y +# CONFIG_STATIC_KEYS_SELFTEST is not set CONFIG_OPTPROBES=y CONFIG_KPROBES_ON_FTRACE=y CONFIG_UPROBES=y @@ -3100,6 +3106,7 @@ CONFIG_BCMA_POSSIBLE=y # CONFIG_LPC_SCH is not set # CONFIG_MFD_INTEL_LPSS_ACPI is not set # CONFIG_MFD_INTEL_LPSS_PCI is not set +CONFIG_MFD_INTEL_PMT=y # CONFIG_MFD_JANZ_CMODIO is not set # CONFIG_MFD_KEMPLD is not set # CONFIG_MFD_88PM800 is not set @@ -3780,6 +3787,11 @@ CONFIG_DMA_VIRTUAL_CHANNELS=y CONFIG_DMA_ACPI=y # CONFIG_ALTERA_MSGDMA is not set CONFIG_INTEL_IDMA64=y +CONFIG_INTEL_IDXD_BUS=y +CONFIG_INTEL_IDXD=y +CONFIG_INTEL_IDXD_COMPAT=y +CONFIG_INTEL_IDXD_SVM=y +CONFIG_INTEL_IDXD_PERFMON=y CONFIG_INTEL_IOATDMA=y # CONFIG_QCOM_HIDMA_MGMT is not set # CONFIG_QCOM_HIDMA is not set @@ -3829,6 +3841,7 @@ CONFIG_VFIO_PCI_MMAP=y CONFIG_VFIO_PCI_INTX=y 
CONFIG_VFIO_PCI_IGD=y CONFIG_VFIO_MDEV=m +CONFIG_VFIO_MDEV_IDXD=m CONFIG_VFIO_MDEV_DEVICE=m CONFIG_IRQ_BYPASS_MANAGER=m CONFIG_VIRT_DRIVERS=y @@ -3931,7 +3944,9 @@ CONFIG_IOMMU_SUPPORT=y # end of Generic IOMMU Pagetable Support # CONFIG_IOMMU_DEBUGFS is not set -# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set +# CONFIG_IOMMU_DEFAULT_DMA_STRICT is not set +# CONFIG_IOMMU_DEFAULT_DMA_LAZY is not set +CONFIG_IOMMU_DEFAULT_PASSTHROUGH=y CONFIG_AMD_IOMMU=y CONFIG_AMD_IOMMU_V2=m CONFIG_DMAR_TABLE=y @@ -3939,6 +3954,7 @@ CONFIG_INTEL_IOMMU=y CONFIG_INTEL_IOMMU_SVM=y # CONFIG_INTEL_IOMMU_DEFAULT_ON is not set CONFIG_INTEL_IOMMU_FLOPPY_WA=y +CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON=y CONFIG_IRQ_REMAP=y # CONFIG_SMMU_BYPASS_DEV is not set @@ -4643,6 +4659,8 @@ CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m CONFIG_CRYPTO_DEV_QAT_C62XVF=m # CONFIG_CRYPTO_DEV_NITROX_CNN55XX is not set +CONFIG_CRYPTO_DEV_IAX_CRYPTO=y +CONFIG_CRYPTO_DEV_IAX_CRYPTO_STATS=y # CONFIG_CRYPTO_DEV_CHELSIO is not set # CONFIG_CRYPTO_DEV_CHELSIO_TLS is not set CONFIG_CRYPTO_DEV_VIRTIO=m -- Gitee From 85fe6d6d599d00c4e36ed9a90c930670991c6e1c Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Wed, 19 Oct 2022 15:49:40 +0800 Subject: [PATCH 2044/2161] Fix some compile error for the kasan version. commit opencloudos. Signed-off-by: Xinghui Li --- arch/x86/include/asm/pgtable.h | 1 + include/linux/lockdep.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 515787ace19d..a5c2dbf21b19 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -25,6 +25,7 @@ #include #include #include +#include extern pgd_t early_top_pgt[PTRS_PER_PGD]; int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 42c7255d030f..a3e607abb040 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -652,7 +652,7 @@ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ debug_locks && \ (preempt_count() != 0 || \ - !this_cpu_read(hardirqs_enabled))); \ + !current->hardirqs_enabled)); \ } while (0) #define lockdep_assert_preemption_disabled() \ @@ -660,7 +660,7 @@ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ debug_locks && \ (preempt_count() == 0 && \ - this_cpu_read(hardirqs_enabled))); \ + current->hardirqs_enabled)); \ } while (0) #else -- Gitee From effe56489e031df82782174fdd22292d5580b824 Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Fri, 2 Sep 2022 10:27:44 +0800 Subject: [PATCH 2045/2161] iommu/vt-d: Only enable SLT if guest is passthrough mode commit opencloudos. Add new support for guest's configuration due to DMA API PASID introduction. When guest is passthrough mode, the translation is gPA to hPA. So, only is SLT required. Signed-off-by: Yi Sun Signed-off-by: Xinghui Li --- drivers/iommu/intel/svm.c | 12 ++++++++---- drivers/iommu/iommu.c | 2 +- include/uapi/linux/iommu.h | 1 + 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index dbf536b8c021..a932e0a8cc0f 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -533,10 +533,14 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, * call the nested mode setup function here. 
*/ spin_lock_irqsave(&iommu->lock, iflags); - ret = intel_pasid_setup_nested(iommu, dev, - (pgd_t *)(uintptr_t)data->gpgd, - data->hpasid, &data->vendor.vtd, dmar_domain, - data->addr_width); + if (data->flags & IOMMU_SVA_SL_ONLY) { + ret = intel_pasid_setup_second_level(iommu, dmar_domain, dev, data->hpasid); + } else { + ret = intel_pasid_setup_nested(iommu, dev, + (pgd_t *)(uintptr_t)data->gpgd, + data->hpasid, &data->vendor.vtd, dmar_domain, + data->addr_width); + } spin_unlock_irqrestore(&iommu->lock, iflags); if (ret) { dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n", diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 56f6d8e80799..fbf8e17b07f9 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2434,7 +2434,7 @@ static int iommu_check_bind_data(struct iommu_gpasid_bind_data *data) return -EINVAL; /* Check all flags */ - mask = IOMMU_SVA_GPASID_VAL | IOMMU_SVA_HPASID_DEF; + mask = IOMMU_SVA_GPASID_VAL | IOMMU_SVA_HPASID_DEF | IOMMU_SVA_SL_ONLY; if (data->flags & ~mask) return -EINVAL; diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 8696459e1b9e..e68a5ae67ebd 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -329,6 +329,7 @@ struct iommu_gpasid_bind_data { __u32 addr_width; #define IOMMU_SVA_GPASID_VAL (1 << 0) /* guest PASID valid */ #define IOMMU_SVA_HPASID_DEF (1 << 1) /* use default host PASID */ +#define IOMMU_SVA_SL_ONLY (1 << 2) /* only setup SLT */ __u64 flags; __u64 gpgd; __u64 hpasid; -- Gitee From 1b86e36dbb4c3926865ea0e66c3533acf42a2d25 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Wed, 12 Oct 2022 19:27:05 -0400 Subject: [PATCH 2046/2161] iommu/vt-d: Retain PASID entry if guest switched remapping mode commit opencloudos. Keep PASID SST entry in case guest changed scalable mode to legacy mode. New entry will be built during vIOMMU reinitialization. 
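The decision comes down to the PGTT (PASID Granular Translation Type) field, bits 8:6 of the entry's first quadword: entries set up as nested or second-level-only carry a second-level table worth preserving across the guest's mode switch. A condensed sketch of the check added below; the example_ name is invented and the constants are the driver's own.

static inline bool example_keep_second_level(u64 pasid_entry_val0)
{
        u8 pgtt = (pasid_entry_val0 >> 6) & 0x7;

        return pgtt == PASID_ENTRY_PGTT_NESTED ||
               pgtt == PASID_ENTRY_PGTT_SL_ONLY;
}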
Signed-off-by: Yang Weijiang Signed-off-by: Xinghui Li --- arch/arm64/configs/defconfig | 5 ++++- arch/x86/configs/oc.config | 7 +++---- drivers/iommu/intel/pasid.c | 9 ++++++--- package/default/config.default_kasan | 8 ++++---- 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index c9a867ac32d4..b39b520ee465 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -858,4 +858,7 @@ CONFIG_DEBUG_KERNEL=y # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_PREEMPT is not set # CONFIG_FTRACE is not set -CONFIG_MEMTEST=y +CONFIG_INTEL_IDXD_BUS=m +CONFIG_INTEL_IDXD=m +CONFIG_CRYPTO_DEV_IAX_CRYPTO=m +# CONFIG_INTEL_IDXD_COMPAT is not set diff --git a/arch/x86/configs/oc.config b/arch/x86/configs/oc.config index 7726e996f62f..26bd155d4d60 100644 --- a/arch/x86/configs/oc.config +++ b/arch/x86/configs/oc.config @@ -1548,13 +1548,12 @@ CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_UNWINDER_FRAME_POINTER=y # Intel SPR CONFIG_MFD_INTEL_PMT=y -CONFIG_INTEL_IDXD_BUS=y -CONFIG_INTEL_IDXD=y -CONFIG_INTEL_IDXD_COMPAT=y +CONFIG_INTEL_IDXD_BUS=m +CONFIG_INTEL_IDXD=m CONFIG_INTEL_IDXD_SVM=y CONFIG_INTEL_IDXD_PERFMON=y CONFIG_VFIO_MDEV_IDXD=m CONFIG_IOMMU_DEFAULT_PASSTHROUGH=y CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON=y -CONFIG_CRYPTO_DEV_IAX_CRYPTO=y +CONFIG_CRYPTO_DEV_IAX_CRYPTO=m CONFIG_CRYPTO_DEV_IAX_CRYPTO_STATS=y diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 42bd49b119af..6ca39da14c91 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -251,8 +251,9 @@ intel_pasid_clear_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool fault_ignore, bool keep_pte) { struct pasid_entry *pe; + bool keep_slt = false; u64 pe_val; - bool nested; + u8 pgtt = 0; pe = intel_pasid_get_entry(dev, pasid); if (WARN_ON(!pe)) @@ -264,8 +265,10 @@ intel_pasid_clear_entry(struct intel_iommu *iommu, struct device *dev, * from NESTED to SL and keep other bits when unbind gpasid is executed. */ pe_val = READ_ONCE(pe->val[0]); - nested = (((pe_val >> 6) & 0x7) == PASID_ENTRY_PGTT_NESTED) ? 
true : false; - if (nested && keep_pte) { + pgtt = (pe_val >> 6) & 0x7; + keep_slt = (pgtt == PASID_ENTRY_PGTT_NESTED || + pgtt == PASID_ENTRY_PGTT_SL_ONLY); + if (keep_slt && keep_pte) { pe_val &= 0xfffffffffffffebf; WRITE_ONCE(pe->val[0], pe_val); return; diff --git a/package/default/config.default_kasan b/package/default/config.default_kasan index 381593d4e0bc..2ff9d4f5d415 100644 --- a/package/default/config.default_kasan +++ b/package/default/config.default_kasan @@ -3787,9 +3787,9 @@ CONFIG_DMA_VIRTUAL_CHANNELS=y CONFIG_DMA_ACPI=y # CONFIG_ALTERA_MSGDMA is not set CONFIG_INTEL_IDMA64=y -CONFIG_INTEL_IDXD_BUS=y -CONFIG_INTEL_IDXD=y -CONFIG_INTEL_IDXD_COMPAT=y +CONFIG_INTEL_IDXD_BUS=m +CONFIG_INTEL_IDXD=m +# CONFIG_INTEL_IDXD_COMPAT is not set CONFIG_INTEL_IDXD_SVM=y CONFIG_INTEL_IDXD_PERFMON=y CONFIG_INTEL_IOATDMA=y @@ -4659,7 +4659,7 @@ CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m CONFIG_CRYPTO_DEV_QAT_C62XVF=m # CONFIG_CRYPTO_DEV_NITROX_CNN55XX is not set -CONFIG_CRYPTO_DEV_IAX_CRYPTO=y +CONFIG_CRYPTO_DEV_IAX_CRYPTO=m CONFIG_CRYPTO_DEV_IAX_CRYPTO_STATS=y # CONFIG_CRYPTO_DEV_CHELSIO is not set # CONFIG_CRYPTO_DEV_CHELSIO_TLS is not set -- Gitee From 1261f36e8fec89d4b1555da2160872881e4dd7fc Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Thu, 27 Oct 2022 00:42:40 -0400 Subject: [PATCH 2047/2161] drivers/idxd: Fixup errors reported during DSA device migration commit opencloudos. When no MSI has been generated during live migration, the device's dev_msi_list is actually empty. Add an empty-list check before processing the list to avoid returning an invalid entry pointer, which prevents parsing invalid memory and eliminates the resulting error messages. Signed-off-by: Yang Weijiang Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/vfio/mdev/idxd/mdev.c | 18 +++++++++++------- drivers/vfio/mdev/idxd/vdev.c | 2 +- kernel/irq/msi.c | 19 ++++++++++++++----- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c index 4aeaf7689385..5ff6de0cae75 100644 --- a/drivers/vfio/mdev/idxd/mdev.c +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -356,8 +356,8 @@ static void vidxd_source_prepare_for_migration(struct vdcm_idxd *vidxd) for (i = 1; i < VIDXD_MAX_MSIX_VECS; i++) { u32 ims_idx = dev_msi_hwirq(dev, i - 1); - /* Save the current handle in use */ - pr_info("Saving handle %d at offset %x\n", ims_idx, offset); + /* Save active ims index, -1 means no entry is in use. */ + pr_info("%s: saving handle %d at offset 0x%x\n", __func__, ims_idx, offset); memcpy(data_ptr + offset, (u8 *)&ims_idx, sizeof(ims_idx)); offset += sizeof(ims_idx); } @@ -496,7 +496,6 @@ static unsigned int vidxd_dest_load_state(struct vdcm_idxd *vidxd) //memcpy((u8 *)ims, data_ptr + offset, sizeof(vidxd->ims)); //offset += sizeof(vidxd->ims); - printk("Offset %x\n", offset); return offset; } @@ -522,16 +521,21 @@ static int vidxd_resume_ims_state (struct vdcm_idxd *vidxd, memcpy((u8 *)&revoked_handle, data_ptr + *offset, sizeof(revoked_handle)); - *offset += sizeof(revoked_handle); - pr_info("%s: %d new handle %x old handle %x\n", - __func__, i, ims_idx, revoked_handle); + /* ims_idx == -1 means no handle is active.
*/ + pr_info("%s: [%d] offset = 0x%x, new handle: %d, old handle: %d\n", + __func__, i, *offset, ims_idx, revoked_handle); + + *offset += sizeof(revoked_handle); if (revoked_handle != ims_idx) { /* Int Handle Revoked */ *int_handle_revoked = true; } + if (ims_idx < 0 || irq < 0) + return 0; + perm_val = *(u32 *)(bar0 + VIDXD_MSIX_PERM_OFFSET + i * 8); paside = (perm_val >> 3) & 1; @@ -547,7 +551,7 @@ static int vidxd_resume_ims_state (struct vdcm_idxd *vidxd, auxval = ims_ctrl_pasid_aux(pasid, true); rc = irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval); - pr_info("%s: auxval %x rc %d\n", __func__, auxval, rc); + pr_info("%s: auxval 0x%x rc %d\n", __func__, auxval, rc); if (rc < 0) { pr_info("set ims pasid failed rc %d\n", rc); break; diff --git a/drivers/vfio/mdev/idxd/vdev.c b/drivers/vfio/mdev/idxd/vdev.c index 561674615554..ed5901e45f35 100644 --- a/drivers/vfio/mdev/idxd/vdev.c +++ b/drivers/vfio/mdev/idxd/vdev.c @@ -64,7 +64,7 @@ void vidxd_notify_revoked_handles (struct vdcm_idxd *vidxd) u32 *intcause = (u32 *)(bar0 + IDXD_INTCAUSE_OFFSET); *intcause |= IDXD_INTC_INT_HANDLE_REVOKED; - pr_info("informating guest about revoked handles\n"); + pr_info("Signaling guest about revoked handles\n"); vidxd_send_interrupt(vidxd, 0); } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index b934f7a04d8f..0b7d91a75100 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -802,13 +802,16 @@ struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain) * @dev: device to operate on * @nr: device-relative interrupt vector index (0-based). * - * Return the nth dev_msi entry + * Return NULL or the nth dev_msi entry */ static struct msi_desc *get_dev_msi_entry(struct device *dev, unsigned int nr) { struct msi_desc *entry; int i = 0; + if (list_empty(dev_to_dev_msi_list(dev))) + return NULL; + for_each_dev_msi_entry(entry, dev) { if (i == nr) return entry; @@ -824,11 +827,14 @@ static struct msi_desc *get_dev_msi_entry(struct device *dev, unsigned int nr) * @dev: device to operate on * @nr: device-relative interrupt vector index (0-based). * - * Returns the Linux IRQ number of a device vector. + * Returns the Linux IRQ number of a device vector, or -1 if the + * entry is not found. */ int dev_msi_irq_vector(struct device *dev, unsigned int nr) { - return get_dev_msi_entry(dev, nr)->irq; + struct msi_desc *entry = get_dev_msi_entry(dev, nr); + + return entry ? entry->irq : -1; } EXPORT_SYMBOL_GPL(dev_msi_irq_vector); @@ -837,11 +843,14 @@ EXPORT_SYMBOL_GPL(dev_msi_irq_vector); * @dev: device to operate on * @nr: device-relative interrupt vector index (0-based). * - * Return the dev_msi hw IRQ number of a device vector. + * Return the dev_msi hw IRQ number of a device vector, or -1 if the + * entry is not found. */ int dev_msi_hwirq(struct device *dev, unsigned int nr) { - return get_dev_msi_entry(dev, nr)->device_msi.hwirq; + struct msi_desc *entry = get_dev_msi_entry(dev, nr); + + return entry ? entry->device_msi.hwirq : -1; } EXPORT_SYMBOL_GPL(dev_msi_hwirq); -- Gitee From e1ba220daf7cd0b5c5b38d9320ec45252cbdbfa6 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Thu, 3 Nov 2022 20:11:28 +0800 Subject: [PATCH 2048/2161] PCI: Move pci_match_device() ahead of new_id_store() commit 1f40704bb01b9fba9925006662a37373d514f26b upstream. Move pci_match_device() and its dependencies (pci_match_id() and pci_device_id_any) ahead of new_id_store(). This is preparation work for calling pci_match_device() in new_id_store(). No functional changes. 
[bhelgaas: update function comments] Link: https://lore.kernel.org/r/20201117054409.3428-2-zhenzhong.duan@gmail.com Signed-off-by: Zhenzhong Duan Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pci-driver.c | 172 +++++++++++++++++++-------------------- 1 file changed, 86 insertions(+), 86 deletions(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 2308f13fc9d8..92f5c22ce451 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -87,6 +87,92 @@ static void pci_free_dynids(struct pci_driver *drv) spin_unlock(&drv->dynids.lock); } +/** + * pci_match_id - See if a pci device matches a given pci_id table + * @ids: array of PCI device id structures to search in + * @dev: the PCI device structure to match against. + * + * Used by a driver to check whether a PCI device present in the + * system is in its list of supported devices. Returns the matching + * pci_device_id structure or %NULL if there is no match. + * + * Deprecated, don't use this as it will not catch any dynamic ids + * that a driver might want to check for. + */ +const struct pci_device_id *pci_match_id(const struct pci_device_id *ids, + struct pci_dev *dev) +{ + if (ids) { + while (ids->vendor || ids->subvendor || ids->class_mask) { + if (pci_match_one_device(ids, dev)) + return ids; + ids++; + } + } + return NULL; +} +EXPORT_SYMBOL(pci_match_id); + +static const struct pci_device_id pci_device_id_any = { + .vendor = PCI_ANY_ID, + .device = PCI_ANY_ID, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, +}; + +/** + * pci_match_device - Tell if a PCI device structure has a matching PCI device id structure + * @drv: the PCI driver to match against + * @dev: the PCI device structure to match against + * + * Used by a driver to check whether a PCI device present in the + * system is in its list of supported devices. Returns the matching + * pci_device_id structure or %NULL if there is no match. + */ +static const struct pci_device_id *pci_match_device(struct pci_driver *drv, + struct pci_dev *dev) +{ + struct pci_dynid *dynid; + const struct pci_device_id *found_id = NULL, *ids; + + /* When driver_override is set, only bind to the matching driver */ + if (dev->driver_override && strcmp(dev->driver_override, drv->name)) + return NULL; + + /* Look at the dynamic ids first, before the static ones */ + spin_lock(&drv->dynids.lock); + list_for_each_entry(dynid, &drv->dynids.list, node) { + if (pci_match_one_device(&dynid->id, dev)) { + found_id = &dynid->id; + break; + } + } + spin_unlock(&drv->dynids.lock); + + if (found_id) + return found_id; + + for (ids = drv->id_table; (found_id = pci_match_id(ids, dev)); + ids = found_id + 1) { + /* + * The match table is split based on driver_override. + * In case override_only was set, enforce driver_override + * matching. + */ + if (found_id->override_only) { + if (dev->driver_override) + return found_id; + } else { + return found_id; + } + } + + /* driver_override will always match, send a dummy id */ + if (dev->driver_override) + return &pci_device_id_any; + return NULL; +} + /** * store_new_id - sysfs frontend to pci_add_dynid() * @driver: target device driver @@ -206,92 +292,6 @@ static struct attribute *pci_drv_attrs[] = { }; ATTRIBUTE_GROUPS(pci_drv); -/** - * pci_match_id - See if a pci device matches a given pci_id table - * @ids: array of PCI device id structures to search in - * @dev: the PCI device structure to match against. 
- * - * Used by a driver to check whether a PCI device present in the - * system is in its list of supported devices. Returns the matching - * pci_device_id structure or %NULL if there is no match. - * - * Deprecated, don't use this as it will not catch any dynamic ids - * that a driver might want to check for. - */ -const struct pci_device_id *pci_match_id(const struct pci_device_id *ids, - struct pci_dev *dev) -{ - if (ids) { - while (ids->vendor || ids->subvendor || ids->class_mask) { - if (pci_match_one_device(ids, dev)) - return ids; - ids++; - } - } - return NULL; -} -EXPORT_SYMBOL(pci_match_id); - -static const struct pci_device_id pci_device_id_any = { - .vendor = PCI_ANY_ID, - .device = PCI_ANY_ID, - .subvendor = PCI_ANY_ID, - .subdevice = PCI_ANY_ID, -}; - -/** - * pci_match_device - Tell if a PCI device structure has a matching PCI device id structure - * @drv: the PCI driver to match against - * @dev: the PCI device structure to match against - * - * Used by a driver to check whether a PCI device present in the - * system is in its list of supported devices. Returns the matching - * pci_device_id structure or %NULL if there is no match. - */ -static const struct pci_device_id *pci_match_device(struct pci_driver *drv, - struct pci_dev *dev) -{ - struct pci_dynid *dynid; - const struct pci_device_id *found_id = NULL, *ids; - - /* When driver_override is set, only bind to the matching driver */ - if (dev->driver_override && strcmp(dev->driver_override, drv->name)) - return NULL; - - /* Look at the dynamic ids first, before the static ones */ - spin_lock(&drv->dynids.lock); - list_for_each_entry(dynid, &drv->dynids.list, node) { - if (pci_match_one_device(&dynid->id, dev)) { - found_id = &dynid->id; - break; - } - } - spin_unlock(&drv->dynids.lock); - - if (found_id) - return found_id; - - for (ids = drv->id_table; (found_id = pci_match_id(ids, dev)); - ids = found_id + 1) { - /* - * The match table is split based on driver_override. - * In case override_only was set, enforce driver_override - * matching. - */ - if (found_id->override_only) { - if (dev->driver_override) - return found_id; - } else { - return found_id; - } - } - - /* driver_override will always match, send a dummy id */ - if (dev->driver_override) - return &pci_device_id_any; - return NULL; -} - struct drv_dev_and_id { struct pci_driver *drv; struct pci_dev *dev; -- Gitee From 6cfb255ca9df07bbebdc27d2ce86a219a2ef1589 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 17 Nov 2020 13:44:09 +0800 Subject: [PATCH 2049/2161] PCI: Avoid duplicate IDs in driver dynamic IDs list commit 3853f9123c185eb4018f5ccd3cdda5968efb5e10 upstream. When a device ID is written to /sys/bus/pci/drivers/.../new_id, we previously only checked the driver's static ID table for duplicates. Writing the same ID several times added it to the dynamic IDs list several times. This doesn't cause user-visible broken behavior, but remove_id_store() only removes one of the duplicate IDs, so if we add an ID several times, we would have to remove it the same number of times before it's completely gone. Fix it by calling pci_match_device(), which checks both dynamic and static IDs to avoid inserting duplicate IDs in dynamic IDs list. 
After fix, attempts to add an ID more than once cause an error: # echo "1af4 1041" > /sys/bus/pci/drivers/vfio-pci/new_id # echo "1af4 1041" > /sys/bus/pci/drivers/vfio-pci/new_id bash: echo: write error: File exists Link: https://lore.kernel.org/r/20201117054409.3428-3-zhenzhong.duan@gmail.com Signed-off-by: Zhenzhong Duan Signed-off-by: Bjorn Helgaas Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/pci/pci-driver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 92f5c22ce451..5f90a68e61fc 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -209,7 +209,7 @@ static ssize_t new_id_store(struct device_driver *driver, const char *buf, pdev->subsystem_device = subdevice; pdev->class = class; - if (pci_match_id(pdrv->id_table, pdev)) + if (pci_match_device(pdrv, pdev)) retval = -EEXIST; kfree(pdev); -- Gitee From a4be0254b70783b647707f7a78bdd463b8f3ca8b Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 16 Nov 2022 16:19:08 +0800 Subject: [PATCH 2050/2161] KVM: x86: Use a stable condition around all VT-d PI paths commit 53b7ca1a359389276c76fbc9e1009d8626a17e40 upstream. Currently, checks for whether VT-d PI can be used refer to the current status of the feature in the current vCPU; or they more or less pick vCPU 0 in case a specific vCPU is not available. However, these checks do not attempt to synchronize with changes to the IRTE. In particular, there is no path that updates the IRTE when APICv is re-activated on vCPU 0; and there is no path to wakeup a CPU that has APICv disabled, if the wakeup occurs because of an IRTE that points to a posted interrupt. To fix this, always go through the VT-d PI path as long as there are assigned devices and APICv is available on both the host and the VM side. Since the relevant condition was copied over three times, take the hint and factor it into a separate function. 
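The vmx.c hunk below factors the three copies into vmx_can_use_vtd_pi(); the important property is that every term of the condition is VM-wide and stable (in-kernel irqchip, module-wide enable_apicv, device assignment, IOMMU posted-interrupt capability) rather than the per-vCPU, dynamically toggled kvm_vcpu_apicv_active() state. A condensed view of the helper follows; the code is taken from the hunk and the comments are editorial annotations, not part of the patch.

	static bool vmx_can_use_vtd_pi(struct kvm *kvm)
	{
		return irqchip_in_kernel(kvm) &&		/* in-kernel irqchip present */
		       enable_apicv &&				/* APICv enabled module-wide */
		       kvm_arch_has_assigned_device(kvm) &&	/* an assigned (VFIO) device exists */
		       irq_remapping_cap(IRQ_POSTING_CAP);	/* IOMMU supports posted interrupts */
	}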
Suggested-by: Sean Christopherson Cc: stable@vger.kernel.org Reviewed-by: Sean Christopherson Reviewed-by: Maxim Levitsky Reviewed-by: David Matlack Message-Id: <20211123004311.2954158-5-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmx.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e09c531a1ebe..74eb6ad79dbe 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -54,6 +54,7 @@ #include "irq.h" #include "kvm_cache_regs.h" #include "lapic.h" +#include "irq.h" #include "mmu.h" #include "nested.h" #include "ops.h" @@ -1389,6 +1390,13 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmx->host_debugctlmsr = get_debugctlmsr(); } +static bool vmx_can_use_vtd_pi(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm) && enable_apicv && + kvm_arch_has_assigned_device(kvm) && + irq_remapping_cap(IRQ_POSTING_CAP); +} + static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) { /* @@ -1409,9 +1417,7 @@ static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) if (!vmx_needs_pi_wakeup(vcpu)) return; - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (!vmx_can_use_vtd_pi(vcpu->kvm)) return; /* Set SN when the vCPU is preempted */ @@ -7904,9 +7910,7 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (!vmx_can_use_vtd_pi(vcpu->kvm)) return 0; WARN_ON(irqs_disabled()); @@ -8004,9 +8008,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, struct vcpu_data vcpu_info; int idx, ret = 0; - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(kvm->vcpus[0])) + if (!vmx_can_use_vtd_pi(kvm)) return 0; idx = srcu_read_lock(&kvm->irq_srcu); -- Gitee From 0891b94bc6f44d03b50e85dc7ec0a377370fa895 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 16 Nov 2022 16:27:43 +0800 Subject: [PATCH 2051/2161] KVM: VMX: restore check of vmx_can_use_vtd_pi() in vmx_needs_pi_wakeup() commit opencloudos. The previous implementation of vmx_needs_pi_wakeup() dropped the vmx_can_use_vtd_pi() check, on the grounds that vmx_vcpu_pi_put() performs a similar check right after calling vmx_needs_pi_wakeup(). That can cause vmx_vcpu_pi_put() to return early erroneously. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmx.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 74eb6ad79dbe..c4359767ad78 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1407,7 +1407,7 @@ static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) * notification vector is switched to the one that calls * back to the pi_wakeup_handler() function.
*/ - return vmx_can_use_ipiv(vcpu); + return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm); } static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) @@ -1417,9 +1417,6 @@ static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) if (!vmx_needs_pi_wakeup(vcpu)) return; - if (!vmx_can_use_vtd_pi(vcpu->kvm)) - return; - /* Set SN when the vCPU is preempted */ if (vcpu->preempted) pi_set_sn(pi_desc); -- Gitee From 28b58d082b1a8e1dfe7289d7356bbca2e560f59a Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 16 Nov 2022 17:59:52 +0800 Subject: [PATCH 2052/2161] KVM: VMX: Fix vcpu could not be waken up if enable IPI virtualization commit opencloudos. With IPI virtualization enabled, a vCPU that goes from HLT to blocked will not be woken up if no VFIO device is assigned. In this case, kvm_arch_has_assigned_device() always returns false, so pi_pre_block() returns immediately after checking kvm_arch_has_assigned_device() and the vCPU is never added to the blocked_vcpu_on_cpu list. Following upstream commit d76fb40637fc0e84b27bf431cd72cf8fe3f813ef, perform this check in vmx_vcpu_pi_put() instead. This patch is interrelated with the previous two commits and 'KVM: VMX: enable IPI virtualization'; if it needs to be reverted, revert it together with those three commits. Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/vmx/vmx.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c4359767ad78..6c79798d2da2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7907,9 +7907,6 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!vmx_can_use_vtd_pi(vcpu->kvm)) - return 0; - WARN_ON(irqs_disabled()); local_irq_disable(); if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { -- Gitee From 01b2630f5150ee162c422b41440efc06e03b0bbe Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 6 Dec 2022 19:31:12 +0800 Subject: [PATCH 2053/2161] Revert "x86/extable: Fix _ASM_EXTABLE_HANDLE() was removed" commit opencloudos. This reverts commit 1f12414046b4a296fd55d700a62d3acfe75e0ee2. Signed-off-by: Xinghui Li --- arch/x86/include/asm/asm.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 2d4eb66100c8..254f8c8c5429 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -123,13 +123,6 @@ /* Exception table entry */ #ifdef __ASSEMBLY__ -# define _ASM_EXTABLE_HANDLE(from, to, handler) \ - .pushsection "__ex_table","a" ; \ - .balign 4 ; \ - .long (from) - . ; \ - .long (to) - . ; \ - .long (handler) - . ; \ - .popsection # define _ASM_EXTABLE_TYPE(from, to, type) \ .pushsection "__ex_table","a" ; \ @@ -152,14 +145,6 @@ .popsection #else -# define _EXPAND_EXTABLE_HANDLE(x) #x -# define _ASM_EXTABLE_HANDLE(from, to, handler) \ - " .pushsection \"__ex_table\",\"a\"\n" \ - " .balign 4\n" \ - " .long (" #from ") - .\n" \ - " .long (" #to ") - .\n" \ - " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \ - " .popsection\n" # define _ASM_EXTABLE_TYPE(from, to, type) \ " .pushsection \"__ex_table\",\"a\"\n" \ -- Gitee From a13739141f1a13899b2a51cf05af4ca0752f9cd4 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 21 Nov 2019 11:58:53 +0000 Subject: [PATCH 2054/2161] locking/refcount: Define constants for saturation and max refcount values commit 23e6b169c9917fbd77534f8c5f378cb073f548bd upstream.
The REFCOUNT_FULL implementation uses a different saturation point than the x86 implementation, which means that the shared refcount code in lib/refcount.c (e.g. refcount_dec_not_one()) needs to be aware of the difference. Rather than duplicate the definitions from the lkdtm driver, instead move them into and update all references accordingly. Signed-off-by: Will Deacon Reviewed-by: Ard Biesheuvel Reviewed-by: Kees Cook Tested-by: Hanjun Guo Cc: Ard Biesheuvel Cc: Elena Reshetova Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191121115902.2551-2-will@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/misc/lkdtm/refcount.c | 8 -------- include/linux/refcount.h | 10 +++++++++- lib/refcount.c | 37 +++++++++++++++++++---------------- 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/drivers/misc/lkdtm/refcount.c b/drivers/misc/lkdtm/refcount.c index 0a146b32da13..abf3b7c1f686 100644 --- a/drivers/misc/lkdtm/refcount.c +++ b/drivers/misc/lkdtm/refcount.c @@ -6,14 +6,6 @@ #include "lkdtm.h" #include -#ifdef CONFIG_REFCOUNT_FULL -#define REFCOUNT_MAX (UINT_MAX - 1) -#define REFCOUNT_SATURATED UINT_MAX -#else -#define REFCOUNT_MAX INT_MAX -#define REFCOUNT_SATURATED (INT_MIN / 2) -#endif - static void overflow_check(refcount_t *ref) { switch (refcount_read(ref)) { diff --git a/include/linux/refcount.h b/include/linux/refcount.h index e28cce21bad6..79f62e8d2256 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -4,6 +4,7 @@ #include #include +#include #include struct mutex; @@ -12,7 +13,7 @@ struct mutex; * struct refcount_t - variant of atomic_t specialized for reference counts * @refs: atomic_t counter field * - * The counter saturates at UINT_MAX and will not move once + * The counter saturates at REFCOUNT_SATURATED and will not move once * there. This avoids wrapping the counter and causing 'spurious' * use-after-free bugs. */ @@ -56,6 +57,9 @@ extern void refcount_dec_checked(refcount_t *r); #ifdef CONFIG_REFCOUNT_FULL +#define REFCOUNT_MAX (UINT_MAX - 1) +#define REFCOUNT_SATURATED UINT_MAX + #define refcount_add_not_zero refcount_add_not_zero_checked #define refcount_add refcount_add_checked @@ -68,6 +72,10 @@ extern void refcount_dec_checked(refcount_t *r); #define refcount_dec refcount_dec_checked #else + +#define REFCOUNT_MAX INT_MAX +#define REFCOUNT_SATURATED (INT_MIN / 2) + # ifdef CONFIG_ARCH_HAS_REFCOUNT # include # else diff --git a/lib/refcount.c b/lib/refcount.c index 6e904af0fb3e..48b78a423d7d 100644 --- a/lib/refcount.c +++ b/lib/refcount.c @@ -5,8 +5,8 @@ * The interface matches the atomic_t interface (to aid in porting) but only * provides the few functions one should use for reference counting. * - * It differs in that the counter saturates at UINT_MAX and will not move once - * there. This avoids wrapping the counter and causing 'spurious' + * It differs in that the counter saturates at REFCOUNT_SATURATED and will not + * move once there. This avoids wrapping the counter and causing 'spurious' * use-after-free issues. * * Memory ordering rules are slightly relaxed wrt regular atomic_t functions @@ -48,7 +48,7 @@ * @i: the value to add to the refcount * @r: the refcount * - * Will saturate at UINT_MAX and WARN. + * Will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). 
It does provide a control dependency @@ -69,16 +69,17 @@ bool refcount_add_not_zero_checked(unsigned int i, refcount_t *r) if (!val) return false; - if (unlikely(val == UINT_MAX)) + if (unlikely(val == REFCOUNT_SATURATED)) return true; new = val + i; if (new < val) - new = UINT_MAX; + new = REFCOUNT_SATURATED; } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new)); - WARN_ONCE(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n"); + WARN_ONCE(new == REFCOUNT_SATURATED, + "refcount_t: saturated; leaking memory.\n"); return true; } @@ -89,7 +90,7 @@ EXPORT_SYMBOL(refcount_add_not_zero_checked); * @i: the value to add to the refcount * @r: the refcount * - * Similar to atomic_add(), but will saturate at UINT_MAX and WARN. + * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency @@ -110,7 +111,8 @@ EXPORT_SYMBOL(refcount_add_checked); * refcount_inc_not_zero_checked - increment a refcount unless it is 0 * @r: the refcount to increment * - * Similar to atomic_inc_not_zero(), but will saturate at UINT_MAX and WARN. + * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED + * and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency @@ -133,7 +135,8 @@ bool refcount_inc_not_zero_checked(refcount_t *r) } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new)); - WARN_ONCE(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n"); + WARN_ONCE(new == REFCOUNT_SATURATED, + "refcount_t: saturated; leaking memory.\n"); return true; } @@ -143,7 +146,7 @@ EXPORT_SYMBOL(refcount_inc_not_zero_checked); * refcount_inc_checked - increment a refcount * @r: the refcount to increment * - * Similar to atomic_inc(), but will saturate at UINT_MAX and WARN. + * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller already has a * reference on the object. @@ -164,7 +167,7 @@ EXPORT_SYMBOL(refcount_inc_checked); * * Similar to atomic_dec_and_test(), but it will WARN, return false and * ultimately leak on underflow and will fail to decrement when saturated - * at UINT_MAX. + * at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() @@ -182,7 +185,7 @@ bool refcount_sub_and_test_checked(unsigned int i, refcount_t *r) unsigned int new, val = atomic_read(&r->refs); do { - if (unlikely(val == UINT_MAX)) + if (unlikely(val == REFCOUNT_SATURATED)) return false; new = val - i; @@ -207,7 +210,7 @@ EXPORT_SYMBOL(refcount_sub_and_test_checked); * @r: the refcount * * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to - * decrement when saturated at UINT_MAX. + * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() @@ -226,7 +229,7 @@ EXPORT_SYMBOL(refcount_dec_and_test_checked); * @r: the refcount * * Similar to atomic_dec(), it will WARN on underflow and fail to decrement - * when saturated at UINT_MAX. + * when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before. 
@@ -277,7 +280,7 @@ bool refcount_dec_not_one(refcount_t *r) unsigned int new, val = atomic_read(&r->refs); do { - if (unlikely(val == UINT_MAX)) + if (unlikely(val == REFCOUNT_SATURATED)) return true; if (val == 1) @@ -302,7 +305,7 @@ EXPORT_SYMBOL(refcount_dec_not_one); * @lock: the mutex to be locked * * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail - * to decrement when saturated at UINT_MAX. + * to decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. @@ -333,7 +336,7 @@ EXPORT_SYMBOL(refcount_dec_and_mutex_lock); * @lock: the spinlock to be locked * * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to - * decrement when saturated at UINT_MAX. + * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. -- Gitee From 34a87f26d55f0bf43358b4f8fd724fd2561a24d3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 21 Nov 2019 11:58:54 +0000 Subject: [PATCH 2055/2161] locking/refcount: Ensure integer operands are treated as signed commit 97a1420adf0cdf0cf6f41bab0b2acf658c96b94b upstream. In preparation for changing the saturation point of REFCOUNT_FULL to INT_MIN/2, change the type of integer operands passed into the API from 'unsigned int' to 'int' so that we can avoid casting during comparisons when we don't want to fall foul of C integral conversion rules for signed and unsigned types. Since the kernel is compiled with '-fno-strict-overflow', we don't need to worry about the UB introduced by signed overflow here. Furthermore, we're already making heavy use of the atomic_t API, which operates exclusively on signed types. 
Signed-off-by: Will Deacon Reviewed-by: Ard Biesheuvel Reviewed-by: Kees Cook Tested-by: Hanjun Guo Cc: Ard Biesheuvel Cc: Elena Reshetova Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191121115902.2551-3-will@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/refcount.h | 14 +++++++------- lib/refcount.c | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 79f62e8d2256..89066a1471dd 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -28,7 +28,7 @@ typedef struct refcount_struct { * @r: the refcount * @n: value to which the refcount will be set */ -static inline void refcount_set(refcount_t *r, unsigned int n) +static inline void refcount_set(refcount_t *r, int n) { atomic_set(&r->refs, n); } @@ -44,13 +44,13 @@ static inline unsigned int refcount_read(const refcount_t *r) return atomic_read(&r->refs); } -extern __must_check bool refcount_add_not_zero_checked(unsigned int i, refcount_t *r); -extern void refcount_add_checked(unsigned int i, refcount_t *r); +extern __must_check bool refcount_add_not_zero_checked(int i, refcount_t *r); +extern void refcount_add_checked(int i, refcount_t *r); extern __must_check bool refcount_inc_not_zero_checked(refcount_t *r); extern void refcount_inc_checked(refcount_t *r); -extern __must_check bool refcount_sub_and_test_checked(unsigned int i, refcount_t *r); +extern __must_check bool refcount_sub_and_test_checked(int i, refcount_t *r); extern __must_check bool refcount_dec_and_test_checked(refcount_t *r); extern void refcount_dec_checked(refcount_t *r); @@ -79,12 +79,12 @@ extern void refcount_dec_checked(refcount_t *r); # ifdef CONFIG_ARCH_HAS_REFCOUNT # include # else -static inline __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r) +static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) { return atomic_add_unless(&r->refs, i, 0); } -static inline void refcount_add(unsigned int i, refcount_t *r) +static inline void refcount_add(int i, refcount_t *r) { atomic_add(i, &r->refs); } @@ -99,7 +99,7 @@ static inline void refcount_inc(refcount_t *r) atomic_inc(&r->refs); } -static inline __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r) +static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) { return atomic_sub_and_test(i, &r->refs); } diff --git a/lib/refcount.c b/lib/refcount.c index 48b78a423d7d..719b0bc42ab1 100644 --- a/lib/refcount.c +++ b/lib/refcount.c @@ -61,7 +61,7 @@ * * Return: false if the passed refcount is 0, true otherwise */ -bool refcount_add_not_zero_checked(unsigned int i, refcount_t *r) +bool refcount_add_not_zero_checked(int i, refcount_t *r) { unsigned int new, val = atomic_read(&r->refs); @@ -101,7 +101,7 @@ EXPORT_SYMBOL(refcount_add_not_zero_checked); * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. 
*/ -void refcount_add_checked(unsigned int i, refcount_t *r) +void refcount_add_checked(int i, refcount_t *r) { WARN_ONCE(!refcount_add_not_zero_checked(i, r), "refcount_t: addition on 0; use-after-free.\n"); } @@ -180,7 +180,7 @@ EXPORT_SYMBOL(refcount_inc_checked); * * Return: true if the resulting refcount is 0, false otherwise */ -bool refcount_sub_and_test_checked(unsigned int i, refcount_t *r) +bool refcount_sub_and_test_checked(int i, refcount_t *r) { unsigned int new, val = atomic_read(&r->refs); -- Gitee From b736cce8d1538b506f48117436366837d9e928bd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 21 Nov 2019 11:58:55 +0000 Subject: [PATCH 2056/2161] locking/refcount: Remove unused refcount_*_checked() variants commit 7221762c48c6bbbcc6cc51d8b803c06930215e34 upstream. The full-fat refcount implementation is exposed via a set of functions suffixed with "_checked()", the idea being that code can choose to use the more expensive, yet more secure implementation on a case-by-case basis. In reality, this hasn't happened, so with a grand total of zero users, let's remove the checked variants for now by simply dropping the suffix and predicating the out-of-line functions on CONFIG_REFCOUNT_FULL=y. Signed-off-by: Will Deacon Reviewed-by: Ard Biesheuvel Reviewed-by: Kees Cook Tested-by: Hanjun Guo Cc: Ard Biesheuvel Cc: Elena Reshetova Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191121115902.2551-4-will@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/refcount.h | 25 ++++++------------- lib/refcount.c | 54 +++++++++++++++++++++------------------- 2 files changed, 36 insertions(+), 43 deletions(-) diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 89066a1471dd..edd505d1a23b 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -44,32 +44,21 @@ static inline unsigned int refcount_read(const refcount_t *r) return atomic_read(&r->refs); } -extern __must_check bool refcount_add_not_zero_checked(int i, refcount_t *r); -extern void refcount_add_checked(int i, refcount_t *r); - -extern __must_check bool refcount_inc_not_zero_checked(refcount_t *r); -extern void refcount_inc_checked(refcount_t *r); - -extern __must_check bool refcount_sub_and_test_checked(int i, refcount_t *r); - -extern __must_check bool refcount_dec_and_test_checked(refcount_t *r); -extern void refcount_dec_checked(refcount_t *r); - #ifdef CONFIG_REFCOUNT_FULL #define REFCOUNT_MAX (UINT_MAX - 1) #define REFCOUNT_SATURATED UINT_MAX -#define refcount_add_not_zero refcount_add_not_zero_checked -#define refcount_add refcount_add_checked +extern __must_check bool refcount_add_not_zero(int i, refcount_t *r); +extern void refcount_add(int i, refcount_t *r); -#define refcount_inc_not_zero refcount_inc_not_zero_checked -#define refcount_inc refcount_inc_checked +extern __must_check bool refcount_inc_not_zero(refcount_t *r); +extern void refcount_inc(refcount_t *r); -#define refcount_sub_and_test refcount_sub_and_test_checked +extern __must_check bool refcount_sub_and_test(int i, refcount_t *r); -#define refcount_dec_and_test refcount_dec_and_test_checked -#define refcount_dec refcount_dec_checked +extern __must_check bool refcount_dec_and_test(refcount_t *r); +extern void refcount_dec(refcount_t *r); #else diff --git a/lib/refcount.c b/lib/refcount.c index 719b0bc42ab1..a2f670998cee 100644 --- a/lib/refcount.c +++ b/lib/refcount.c @@ -43,8 +43,10 @@ #include #include +#ifdef 
CONFIG_REFCOUNT_FULL + /** - * refcount_add_not_zero_checked - add a value to a refcount unless it is 0 + * refcount_add_not_zero - add a value to a refcount unless it is 0 * @i: the value to add to the refcount * @r: the refcount * @@ -61,7 +63,7 @@ * * Return: false if the passed refcount is 0, true otherwise */ -bool refcount_add_not_zero_checked(int i, refcount_t *r) +bool refcount_add_not_zero(int i, refcount_t *r) { unsigned int new, val = atomic_read(&r->refs); @@ -83,10 +85,10 @@ bool refcount_add_not_zero_checked(int i, refcount_t *r) return true; } -EXPORT_SYMBOL(refcount_add_not_zero_checked); +EXPORT_SYMBOL(refcount_add_not_zero); /** - * refcount_add_checked - add a value to a refcount + * refcount_add - add a value to a refcount * @i: the value to add to the refcount * @r: the refcount * @@ -101,14 +103,14 @@ EXPORT_SYMBOL(refcount_add_not_zero_checked); * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. */ -void refcount_add_checked(int i, refcount_t *r) +void refcount_add(int i, refcount_t *r) { - WARN_ONCE(!refcount_add_not_zero_checked(i, r), "refcount_t: addition on 0; use-after-free.\n"); + WARN_ONCE(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n"); } -EXPORT_SYMBOL(refcount_add_checked); +EXPORT_SYMBOL(refcount_add); /** - * refcount_inc_not_zero_checked - increment a refcount unless it is 0 + * refcount_inc_not_zero - increment a refcount unless it is 0 * @r: the refcount to increment * * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED @@ -120,7 +122,7 @@ EXPORT_SYMBOL(refcount_add_checked); * * Return: true if the increment was successful, false otherwise */ -bool refcount_inc_not_zero_checked(refcount_t *r) +bool refcount_inc_not_zero(refcount_t *r) { unsigned int new, val = atomic_read(&r->refs); @@ -140,10 +142,10 @@ bool refcount_inc_not_zero_checked(refcount_t *r) return true; } -EXPORT_SYMBOL(refcount_inc_not_zero_checked); +EXPORT_SYMBOL(refcount_inc_not_zero); /** - * refcount_inc_checked - increment a refcount + * refcount_inc - increment a refcount * @r: the refcount to increment * * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN. @@ -154,14 +156,14 @@ EXPORT_SYMBOL(refcount_inc_not_zero_checked); * Will WARN if the refcount is 0, as this represents a possible use-after-free * condition. 
*/ -void refcount_inc_checked(refcount_t *r) +void refcount_inc(refcount_t *r) { - WARN_ONCE(!refcount_inc_not_zero_checked(r), "refcount_t: increment on 0; use-after-free.\n"); + WARN_ONCE(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n"); } -EXPORT_SYMBOL(refcount_inc_checked); +EXPORT_SYMBOL(refcount_inc); /** - * refcount_sub_and_test_checked - subtract from a refcount and test if it is 0 + * refcount_sub_and_test - subtract from a refcount and test if it is 0 * @i: amount to subtract from the refcount * @r: the refcount * @@ -180,7 +182,7 @@ EXPORT_SYMBOL(refcount_inc_checked); * * Return: true if the resulting refcount is 0, false otherwise */ -bool refcount_sub_and_test_checked(int i, refcount_t *r) +bool refcount_sub_and_test(int i, refcount_t *r) { unsigned int new, val = atomic_read(&r->refs); @@ -203,10 +205,10 @@ bool refcount_sub_and_test_checked(int i, refcount_t *r) return false; } -EXPORT_SYMBOL(refcount_sub_and_test_checked); +EXPORT_SYMBOL(refcount_sub_and_test); /** - * refcount_dec_and_test_checked - decrement a refcount and test if it is 0 + * refcount_dec_and_test - decrement a refcount and test if it is 0 * @r: the refcount * * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to @@ -218,14 +220,14 @@ EXPORT_SYMBOL(refcount_sub_and_test_checked); * * Return: true if the resulting refcount is 0, false otherwise */ -bool refcount_dec_and_test_checked(refcount_t *r) +bool refcount_dec_and_test(refcount_t *r) { - return refcount_sub_and_test_checked(1, r); + return refcount_sub_and_test(1, r); } -EXPORT_SYMBOL(refcount_dec_and_test_checked); +EXPORT_SYMBOL(refcount_dec_and_test); /** - * refcount_dec_checked - decrement a refcount + * refcount_dec - decrement a refcount * @r: the refcount * * Similar to atomic_dec(), it will WARN on underflow and fail to decrement @@ -234,11 +236,13 @@ EXPORT_SYMBOL(refcount_dec_and_test_checked); * Provides release memory ordering, such that prior loads and stores are done * before. */ -void refcount_dec_checked(refcount_t *r) +void refcount_dec(refcount_t *r) { - WARN_ONCE(refcount_dec_and_test_checked(r), "refcount_t: decrement hit 0; leaking memory.\n"); + WARN_ONCE(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n"); } -EXPORT_SYMBOL(refcount_dec_checked); +EXPORT_SYMBOL(refcount_dec); + +#endif /* CONFIG_REFCOUNT_FULL */ /** * refcount_dec_if_one - decrement a refcount if it is 1 -- Gitee From c7119b1d23a77ce56397d56e30fde02dcf00d7b0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 21 Nov 2019 11:58:56 +0000 Subject: [PATCH 2057/2161] locking/refcount: Move the bulk of the REFCOUNT_FULL implementation into the header commit 77e9971c79c29542ab7dd4140f9343bf2ff36158 upstream. In an effort to improve performance of the REFCOUNT_FULL implementation, move the bulk of its functions into linux/refcount.h. This allows them to be inlined in the same way as if they had been provided via CONFIG_ARCH_HAS_REFCOUNT. 
Signed-off-by: Will Deacon Reviewed-by: Ard Biesheuvel Reviewed-by: Kees Cook Tested-by: Hanjun Guo Cc: Ard Biesheuvel Cc: Elena Reshetova Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191121115902.2551-5-will@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/refcount.h | 237 ++++++++++++++++++++++++++++++++++++-- lib/refcount.c | 238 +-------------------------------------- 2 files changed, 229 insertions(+), 246 deletions(-) diff --git a/include/linux/refcount.h b/include/linux/refcount.h index edd505d1a23b..e719b5b1220e 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -45,22 +45,241 @@ static inline unsigned int refcount_read(const refcount_t *r) } #ifdef CONFIG_REFCOUNT_FULL +#include #define REFCOUNT_MAX (UINT_MAX - 1) #define REFCOUNT_SATURATED UINT_MAX -extern __must_check bool refcount_add_not_zero(int i, refcount_t *r); -extern void refcount_add(int i, refcount_t *r); +/* + * Variant of atomic_t specialized for reference counts. + * + * The interface matches the atomic_t interface (to aid in porting) but only + * provides the few functions one should use for reference counting. + * + * It differs in that the counter saturates at REFCOUNT_SATURATED and will not + * move once there. This avoids wrapping the counter and causing 'spurious' + * use-after-free issues. + * + * Memory ordering rules are slightly relaxed wrt regular atomic_t functions + * and provide only what is strictly required for refcounts. + * + * The increments are fully relaxed; these will not provide ordering. The + * rationale is that whatever is used to obtain the object we're increasing the + * reference count on will provide the ordering. For locked data structures, + * its the lock acquire, for RCU/lockless data structures its the dependent + * load. + * + * Do note that inc_not_zero() provides a control dependency which will order + * future stores against the inc, this ensures we'll never modify the object + * if we did not in fact acquire a reference. + * + * The decrements will provide release order, such that all the prior loads and + * stores will be issued before, it also provides a control dependency, which + * will order us against the subsequent free(). + * + * The control dependency is against the load of the cmpxchg (ll/sc) that + * succeeded. This means the stores aren't fully ordered, but this is fine + * because the 1->0 transition indicates no concurrency. + * + * Note that the allocator is responsible for ordering things between free() + * and alloc(). + * + * The decrements dec_and_test() and sub_and_test() also provide acquire + * ordering on success. + * + */ + +/** + * refcount_add_not_zero - add a value to a refcount unless it is 0 + * @i: the value to add to the refcount + * @r: the refcount + * + * Will saturate at REFCOUNT_SATURATED and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc(), or one of its variants, should instead be used to + * increment a reference count. 
+ * + * Return: false if the passed refcount is 0, true otherwise + */ +static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) +{ + unsigned int new, val = atomic_read(&r->refs); + + do { + if (!val) + return false; + + if (unlikely(val == REFCOUNT_SATURATED)) + return true; + + new = val + i; + if (new < val) + new = REFCOUNT_SATURATED; + + } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new)); + + WARN_ONCE(new == REFCOUNT_SATURATED, + "refcount_t: saturated; leaking memory.\n"); + + return true; +} + +/** + * refcount_add - add a value to a refcount + * @i: the value to add to the refcount + * @r: the refcount + * + * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc(), or one of its variants, should instead be used to + * increment a reference count. + */ +static inline void refcount_add(int i, refcount_t *r) +{ + WARN_ONCE(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n"); +} + +/** + * refcount_inc_not_zero - increment a refcount unless it is 0 + * @r: the refcount to increment + * + * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED + * and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Return: true if the increment was successful, false otherwise + */ +static inline __must_check bool refcount_inc_not_zero(refcount_t *r) +{ + unsigned int new, val = atomic_read(&r->refs); + + do { + new = val + 1; -extern __must_check bool refcount_inc_not_zero(refcount_t *r); -extern void refcount_inc(refcount_t *r); + if (!val) + return false; -extern __must_check bool refcount_sub_and_test(int i, refcount_t *r); + if (unlikely(!new)) + return true; -extern __must_check bool refcount_dec_and_test(refcount_t *r); -extern void refcount_dec(refcount_t *r); + } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new)); + + WARN_ONCE(new == REFCOUNT_SATURATED, + "refcount_t: saturated; leaking memory.\n"); + + return true; +} + +/** + * refcount_inc - increment a refcount + * @r: the refcount to increment + * + * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN. + * + * Provides no memory ordering, it is assumed the caller already has a + * reference on the object. + * + * Will WARN if the refcount is 0, as this represents a possible use-after-free + * condition. + */ +static inline void refcount_inc(refcount_t *r) +{ + WARN_ONCE(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n"); +} + +/** + * refcount_sub_and_test - subtract from a refcount and test if it is 0 + * @i: amount to subtract from the refcount + * @r: the refcount + * + * Similar to atomic_dec_and_test(), but it will WARN, return false and + * ultimately leak on underflow and will fail to decrement when saturated + * at REFCOUNT_SATURATED. 
+ * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides an acquire ordering on success such that free() + * must come after. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_dec(), or one of its variants, should instead be used to + * decrement a reference count. + * + * Return: true if the resulting refcount is 0, false otherwise + */ +static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) +{ + unsigned int new, val = atomic_read(&r->refs); + + do { + if (unlikely(val == REFCOUNT_SATURATED)) + return false; + + new = val - i; + if (new > val) { + WARN_ONCE(new > val, "refcount_t: underflow; use-after-free.\n"); + return false; + } + + } while (!atomic_try_cmpxchg_release(&r->refs, &val, new)); + + if (!new) { + smp_acquire__after_ctrl_dep(); + return true; + } + return false; + +} + +/** + * refcount_dec_and_test - decrement a refcount and test if it is 0 + * @r: the refcount + * + * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to + * decrement when saturated at REFCOUNT_SATURATED. + * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides an acquire ordering on success such that free() + * must come after. + * + * Return: true if the resulting refcount is 0, false otherwise + */ +static inline __must_check bool refcount_dec_and_test(refcount_t *r) +{ + return refcount_sub_and_test(1, r); +} + +/** + * refcount_dec - decrement a refcount + * @r: the refcount + * + * Similar to atomic_dec(), it will WARN on underflow and fail to decrement + * when saturated at REFCOUNT_SATURATED. + * + * Provides release memory ordering, such that prior loads and stores are done + * before. + */ +static inline void refcount_dec(refcount_t *r) +{ + WARN_ONCE(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n"); +} -#else +#else /* CONFIG_REFCOUNT_FULL */ #define REFCOUNT_MAX INT_MAX #define REFCOUNT_SATURATED (INT_MIN / 2) @@ -103,7 +322,7 @@ static inline void refcount_dec(refcount_t *r) atomic_dec(&r->refs); } # endif /* !CONFIG_ARCH_HAS_REFCOUNT */ -#endif /* CONFIG_REFCOUNT_FULL */ +#endif /* !CONFIG_REFCOUNT_FULL */ extern __must_check bool refcount_dec_if_one(refcount_t *r); extern __must_check bool refcount_dec_not_one(refcount_t *r); diff --git a/lib/refcount.c b/lib/refcount.c index a2f670998cee..3a534fbebdcc 100644 --- a/lib/refcount.c +++ b/lib/refcount.c @@ -1,41 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Variant of atomic_t specialized for reference counts. - * - * The interface matches the atomic_t interface (to aid in porting) but only - * provides the few functions one should use for reference counting. - * - * It differs in that the counter saturates at REFCOUNT_SATURATED and will not - * move once there. This avoids wrapping the counter and causing 'spurious' - * use-after-free issues. - * - * Memory ordering rules are slightly relaxed wrt regular atomic_t functions - * and provide only what is strictly required for refcounts. - * - * The increments are fully relaxed; these will not provide ordering. The - * rationale is that whatever is used to obtain the object we're increasing the - * reference count on will provide the ordering. For locked data structures, - * its the lock acquire, for RCU/lockless data structures its the dependent - * load. 
- * - * Do note that inc_not_zero() provides a control dependency which will order - * future stores against the inc, this ensures we'll never modify the object - * if we did not in fact acquire a reference. - * - * The decrements will provide release order, such that all the prior loads and - * stores will be issued before, it also provides a control dependency, which - * will order us against the subsequent free(). - * - * The control dependency is against the load of the cmpxchg (ll/sc) that - * succeeded. This means the stores aren't fully ordered, but this is fine - * because the 1->0 transition indicates no concurrency. - * - * Note that the allocator is responsible for ordering things between free() - * and alloc(). - * - * The decrements dec_and_test() and sub_and_test() also provide acquire - * ordering on success. - * + * Out-of-line refcount functions common to all refcount implementations. */ #include @@ -43,207 +8,6 @@ #include #include -#ifdef CONFIG_REFCOUNT_FULL - -/** - * refcount_add_not_zero - add a value to a refcount unless it is 0 - * @i: the value to add to the refcount - * @r: the refcount - * - * Will saturate at REFCOUNT_SATURATED and WARN. - * - * Provides no memory ordering, it is assumed the caller has guaranteed the - * object memory to be stable (RCU, etc.). It does provide a control dependency - * and thereby orders future stores. See the comment on top. - * - * Use of this function is not recommended for the normal reference counting - * use case in which references are taken and released one at a time. In these - * cases, refcount_inc(), or one of its variants, should instead be used to - * increment a reference count. - * - * Return: false if the passed refcount is 0, true otherwise - */ -bool refcount_add_not_zero(int i, refcount_t *r) -{ - unsigned int new, val = atomic_read(&r->refs); - - do { - if (!val) - return false; - - if (unlikely(val == REFCOUNT_SATURATED)) - return true; - - new = val + i; - if (new < val) - new = REFCOUNT_SATURATED; - - } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new)); - - WARN_ONCE(new == REFCOUNT_SATURATED, - "refcount_t: saturated; leaking memory.\n"); - - return true; -} -EXPORT_SYMBOL(refcount_add_not_zero); - -/** - * refcount_add - add a value to a refcount - * @i: the value to add to the refcount - * @r: the refcount - * - * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. - * - * Provides no memory ordering, it is assumed the caller has guaranteed the - * object memory to be stable (RCU, etc.). It does provide a control dependency - * and thereby orders future stores. See the comment on top. - * - * Use of this function is not recommended for the normal reference counting - * use case in which references are taken and released one at a time. In these - * cases, refcount_inc(), or one of its variants, should instead be used to - * increment a reference count. - */ -void refcount_add(int i, refcount_t *r) -{ - WARN_ONCE(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n"); -} -EXPORT_SYMBOL(refcount_add); - -/** - * refcount_inc_not_zero - increment a refcount unless it is 0 - * @r: the refcount to increment - * - * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED - * and WARN. - * - * Provides no memory ordering, it is assumed the caller has guaranteed the - * object memory to be stable (RCU, etc.). It does provide a control dependency - * and thereby orders future stores. See the comment on top. 
- * - * Return: true if the increment was successful, false otherwise - */ -bool refcount_inc_not_zero(refcount_t *r) -{ - unsigned int new, val = atomic_read(&r->refs); - - do { - new = val + 1; - - if (!val) - return false; - - if (unlikely(!new)) - return true; - - } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new)); - - WARN_ONCE(new == REFCOUNT_SATURATED, - "refcount_t: saturated; leaking memory.\n"); - - return true; -} -EXPORT_SYMBOL(refcount_inc_not_zero); - -/** - * refcount_inc - increment a refcount - * @r: the refcount to increment - * - * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN. - * - * Provides no memory ordering, it is assumed the caller already has a - * reference on the object. - * - * Will WARN if the refcount is 0, as this represents a possible use-after-free - * condition. - */ -void refcount_inc(refcount_t *r) -{ - WARN_ONCE(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n"); -} -EXPORT_SYMBOL(refcount_inc); - -/** - * refcount_sub_and_test - subtract from a refcount and test if it is 0 - * @i: amount to subtract from the refcount - * @r: the refcount - * - * Similar to atomic_dec_and_test(), but it will WARN, return false and - * ultimately leak on underflow and will fail to decrement when saturated - * at REFCOUNT_SATURATED. - * - * Provides release memory ordering, such that prior loads and stores are done - * before, and provides an acquire ordering on success such that free() - * must come after. - * - * Use of this function is not recommended for the normal reference counting - * use case in which references are taken and released one at a time. In these - * cases, refcount_dec(), or one of its variants, should instead be used to - * decrement a reference count. - * - * Return: true if the resulting refcount is 0, false otherwise - */ -bool refcount_sub_and_test(int i, refcount_t *r) -{ - unsigned int new, val = atomic_read(&r->refs); - - do { - if (unlikely(val == REFCOUNT_SATURATED)) - return false; - - new = val - i; - if (new > val) { - WARN_ONCE(new > val, "refcount_t: underflow; use-after-free.\n"); - return false; - } - - } while (!atomic_try_cmpxchg_release(&r->refs, &val, new)); - - if (!new) { - smp_acquire__after_ctrl_dep(); - return true; - } - return false; - -} -EXPORT_SYMBOL(refcount_sub_and_test); - -/** - * refcount_dec_and_test - decrement a refcount and test if it is 0 - * @r: the refcount - * - * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to - * decrement when saturated at REFCOUNT_SATURATED. - * - * Provides release memory ordering, such that prior loads and stores are done - * before, and provides an acquire ordering on success such that free() - * must come after. - * - * Return: true if the resulting refcount is 0, false otherwise - */ -bool refcount_dec_and_test(refcount_t *r) -{ - return refcount_sub_and_test(1, r); -} -EXPORT_SYMBOL(refcount_dec_and_test); - -/** - * refcount_dec - decrement a refcount - * @r: the refcount - * - * Similar to atomic_dec(), it will WARN on underflow and fail to decrement - * when saturated at REFCOUNT_SATURATED. - * - * Provides release memory ordering, such that prior loads and stores are done - * before. 
- */ -void refcount_dec(refcount_t *r) -{ - WARN_ONCE(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n"); -} -EXPORT_SYMBOL(refcount_dec); - -#endif /* CONFIG_REFCOUNT_FULL */ - /** * refcount_dec_if_one - decrement a refcount if it is 1 * @r: the refcount -- Gitee From 893c313214d1356cf0b6ee645fce954fda67f357 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 21 Nov 2019 11:58:57 +0000 Subject: [PATCH 2058/2161] locking/refcount: Improve performance of generic REFCOUNT_FULL code commit dcb786493f3e48da3272b710028d42ec608cfda1 upstream. Rewrite the generic REFCOUNT_FULL implementation so that the saturation point is moved to INT_MIN / 2. This allows us to defer the sanity checks until after the atomic operation, which removes many uses of cmpxchg() in favour of atomic_fetch_{add,sub}(). Some crude perf results obtained from lkdtm show substantially less overhead, despite the checking: $ perf stat -r 3 -B -- echo {ATOMIC,REFCOUNT}_TIMING >/sys/kernel/debug/provoke-crash/DIRECT # arm64 ATOMIC_TIMING: 46.50451 +- 0.00134 seconds time elapsed ( +- 0.00% ) REFCOUNT_TIMING (REFCOUNT_FULL, mainline): 77.57522 +- 0.00982 seconds time elapsed ( +- 0.01% ) REFCOUNT_TIMING (REFCOUNT_FULL, this series): 48.7181 +- 0.0256 seconds time elapsed ( +- 0.05% ) # x86 ATOMIC_TIMING: 31.6225 +- 0.0776 seconds time elapsed ( +- 0.25% ) REFCOUNT_TIMING (!REFCOUNT_FULL, mainline/x86 asm): 31.6689 +- 0.0901 seconds time elapsed ( +- 0.28% ) REFCOUNT_TIMING (REFCOUNT_FULL, mainline): 53.203 +- 0.138 seconds time elapsed ( +- 0.26% ) REFCOUNT_TIMING (REFCOUNT_FULL, this series): 31.7408 +- 0.0486 seconds time elapsed ( +- 0.15% ) Signed-off-by: Will Deacon Reviewed-by: Ard Biesheuvel Reviewed-by: Kees Cook Tested-by: Hanjun Guo Tested-by: Jan Glauber Cc: Ard Biesheuvel Cc: Elena Reshetova Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191121115902.2551-6-will@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/refcount.h | 131 ++++++++++++++++++++++----------------- 1 file changed, 75 insertions(+), 56 deletions(-) diff --git a/include/linux/refcount.h b/include/linux/refcount.h index e719b5b1220e..e3b218d669ce 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -47,8 +47,8 @@ static inline unsigned int refcount_read(const refcount_t *r) #ifdef CONFIG_REFCOUNT_FULL #include -#define REFCOUNT_MAX (UINT_MAX - 1) -#define REFCOUNT_SATURATED UINT_MAX +#define REFCOUNT_MAX INT_MAX +#define REFCOUNT_SATURATED (INT_MIN / 2) /* * Variant of atomic_t specialized for reference counts. @@ -56,9 +56,47 @@ static inline unsigned int refcount_read(const refcount_t *r) * The interface matches the atomic_t interface (to aid in porting) but only * provides the few functions one should use for reference counting. * - * It differs in that the counter saturates at REFCOUNT_SATURATED and will not - * move once there. This avoids wrapping the counter and causing 'spurious' - * use-after-free issues. + * Saturation semantics + * ==================== + * + * refcount_t differs from atomic_t in that the counter saturates at + * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the + * counter and causing 'spurious' use-after-free issues. 
In order to avoid the + * cost associated with introducing cmpxchg() loops into all of the saturating + * operations, we temporarily allow the counter to take on an unchecked value + * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow + * or overflow has occurred. Although this is racy when multiple threads + * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly + * equidistant from 0 and INT_MAX we minimise the scope for error: + * + * INT_MAX REFCOUNT_SATURATED UINT_MAX + * 0 (0x7fff_ffff) (0xc000_0000) (0xffff_ffff) + * +--------------------------------+----------------+----------------+ + * <---------- bad value! ----------> + * + * (in a signed view of the world, the "bad value" range corresponds to + * a negative counter value). + * + * As an example, consider a refcount_inc() operation that causes the counter + * to overflow: + * + * int old = atomic_fetch_add_relaxed(r); + * // old is INT_MAX, refcount now INT_MIN (0x8000_0000) + * if (old < 0) + * atomic_set(r, REFCOUNT_SATURATED); + * + * If another thread also performs a refcount_inc() operation between the two + * atomic operations, then the count will continue to edge closer to 0. If it + * reaches a value of 1 before /any/ of the threads reset it to the saturated + * value, then a concurrent refcount_dec_and_test() may erroneously free the + * underlying object. Given the precise timing details involved with the + * round-robin scheduling of each thread manipulating the refcount and the need + * to hit the race multiple times in succession, there doesn't appear to be a + * practical avenue of attack even if using refcount_add() operations with + * larger increments. + * + * Memory ordering + * =============== * * Memory ordering rules are slightly relaxed wrt regular atomic_t functions * and provide only what is strictly required for refcounts. 
@@ -109,25 +147,19 @@ static inline unsigned int refcount_read(const refcount_t *r) */ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) { - unsigned int new, val = atomic_read(&r->refs); + int old = refcount_read(r); do { - if (!val) - return false; - - if (unlikely(val == REFCOUNT_SATURATED)) - return true; - - new = val + i; - if (new < val) - new = REFCOUNT_SATURATED; + if (!old) + break; + } while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i)); - } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new)); - - WARN_ONCE(new == REFCOUNT_SATURATED, - "refcount_t: saturated; leaking memory.\n"); + if (unlikely(old < 0 || old + i < 0)) { + refcount_set(r, REFCOUNT_SATURATED); + WARN_ONCE(1, "refcount_t: saturated; leaking memory.\n"); + } - return true; + return old; } /** @@ -148,7 +180,13 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) */ static inline void refcount_add(int i, refcount_t *r) { - WARN_ONCE(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n"); + int old = atomic_fetch_add_relaxed(i, &r->refs); + + WARN_ONCE(!old, "refcount_t: addition on 0; use-after-free.\n"); + if (unlikely(old <= 0 || old + i <= 0)) { + refcount_set(r, REFCOUNT_SATURATED); + WARN_ONCE(old, "refcount_t: saturated; leaking memory.\n"); + } } /** @@ -166,23 +204,7 @@ static inline void refcount_add(int i, refcount_t *r) */ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) { - unsigned int new, val = atomic_read(&r->refs); - - do { - new = val + 1; - - if (!val) - return false; - - if (unlikely(!new)) - return true; - - } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new)); - - WARN_ONCE(new == REFCOUNT_SATURATED, - "refcount_t: saturated; leaking memory.\n"); - - return true; + return refcount_add_not_zero(1, r); } /** @@ -199,7 +221,7 @@ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) */ static inline void refcount_inc(refcount_t *r) { - WARN_ONCE(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n"); + refcount_add(1, r); } /** @@ -224,26 +246,19 @@ static inline void refcount_inc(refcount_t *r) */ static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) { - unsigned int new, val = atomic_read(&r->refs); - - do { - if (unlikely(val == REFCOUNT_SATURATED)) - return false; + int old = atomic_fetch_sub_release(i, &r->refs); - new = val - i; - if (new > val) { - WARN_ONCE(new > val, "refcount_t: underflow; use-after-free.\n"); - return false; - } - - } while (!atomic_try_cmpxchg_release(&r->refs, &val, new)); - - if (!new) { + if (old == i) { smp_acquire__after_ctrl_dep(); return true; } - return false; + if (unlikely(old < 0 || old - i < 0)) { + refcount_set(r, REFCOUNT_SATURATED); + WARN_ONCE(1, "refcount_t: underflow; use-after-free.\n"); + } + + return false; } /** @@ -276,9 +291,13 @@ static inline __must_check bool refcount_dec_and_test(refcount_t *r) */ static inline void refcount_dec(refcount_t *r) { - WARN_ONCE(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n"); -} + int old = atomic_fetch_sub_release(1, &r->refs); + if (unlikely(old <= 1)) { + refcount_set(r, REFCOUNT_SATURATED); + WARN_ONCE(1, "refcount_t: decrement hit 0; leaking memory.\n"); + } +} #else /* CONFIG_REFCOUNT_FULL */ #define REFCOUNT_MAX INT_MAX -- Gitee From a46033d56c8532378a1006322fb089f1f3f24e48 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 21 Nov 2019 11:58:58 +0000 Subject: [PATCH 2059/2161] locking/refcount: Move 
saturation warnings out of line commit 1eb085d94256aaa69b00cf5a86e3c5f5bb2bc460 upstream. Having the refcount saturation and warnings inline bloats the text, despite the fact that these paths should never be executed in normal operation. Move the refcount saturation and warnings out of line to reduce the image size when refcount_t checking is enabled. Relative to an x86_64 defconfig, the sizes reported by bloat-o-meter are: # defconfig+REFCOUNT_FULL, inline saturation (i.e. before this patch) Total: Before=14762076, After=14915442, chg +1.04% # defconfig+REFCOUNT_FULL, out-of-line saturation (i.e. after this patch) Total: Before=14762076, After=14835497, chg +0.50% A side-effect of this change is that we now only get one warning per refcount saturation type, rather than one per problematic call-site. Signed-off-by: Will Deacon Reviewed-by: Ard Biesheuvel Reviewed-by: Kees Cook Tested-by: Hanjun Guo Cc: Ard Biesheuvel Cc: Elena Reshetova Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191121115902.2551-7-will@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/refcount.h | 39 ++++++++++++++++++++------------------- lib/refcount.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 19 deletions(-) diff --git a/include/linux/refcount.h b/include/linux/refcount.h index e3b218d669ce..1cd0a876a789 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -23,6 +23,16 @@ typedef struct refcount_struct { #define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } +enum refcount_saturation_type { + REFCOUNT_ADD_NOT_ZERO_OVF, + REFCOUNT_ADD_OVF, + REFCOUNT_ADD_UAF, + REFCOUNT_SUB_UAF, + REFCOUNT_DEC_LEAK, +}; + +void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t); + /** * refcount_set - set a refcount's value * @r: the refcount @@ -154,10 +164,8 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) break; } while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i)); - if (unlikely(old < 0 || old + i < 0)) { - refcount_set(r, REFCOUNT_SATURATED); - WARN_ONCE(1, "refcount_t: saturated; leaking memory.\n"); - } + if (unlikely(old < 0 || old + i < 0)) + refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); return old; } @@ -182,11 +190,10 @@ static inline void refcount_add(int i, refcount_t *r) { int old = atomic_fetch_add_relaxed(i, &r->refs); - WARN_ONCE(!old, "refcount_t: addition on 0; use-after-free.\n"); - if (unlikely(old <= 0 || old + i <= 0)) { - refcount_set(r, REFCOUNT_SATURATED); - WARN_ONCE(old, "refcount_t: saturated; leaking memory.\n"); - } + if (unlikely(!old)) + refcount_warn_saturate(r, REFCOUNT_ADD_UAF); + else if (unlikely(old < 0 || old + i < 0)) + refcount_warn_saturate(r, REFCOUNT_ADD_OVF); } /** @@ -253,10 +260,8 @@ static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) return true; } - if (unlikely(old < 0 || old - i < 0)) { - refcount_set(r, REFCOUNT_SATURATED); - WARN_ONCE(1, "refcount_t: underflow; use-after-free.\n"); - } + if (unlikely(old < 0 || old - i < 0)) + refcount_warn_saturate(r, REFCOUNT_SUB_UAF); return false; } @@ -291,12 +296,8 @@ static inline __must_check bool refcount_dec_and_test(refcount_t *r) */ static inline void refcount_dec(refcount_t *r) { - int old = atomic_fetch_sub_release(1, &r->refs); - - if (unlikely(old <= 1)) { - refcount_set(r, REFCOUNT_SATURATED); - WARN_ONCE(1, "refcount_t: decrement hit 0; leaking memory.\n"); - } + if 
(unlikely(atomic_fetch_sub_release(1, &r->refs) <= 1)) + refcount_warn_saturate(r, REFCOUNT_DEC_LEAK); } #else /* CONFIG_REFCOUNT_FULL */ diff --git a/lib/refcount.c b/lib/refcount.c index 3a534fbebdcc..8b7e249c0e10 100644 --- a/lib/refcount.c +++ b/lib/refcount.c @@ -8,6 +8,34 @@ #include #include +#define REFCOUNT_WARN(str) WARN_ONCE(1, "refcount_t: " str ".\n") + +void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) +{ + refcount_set(r, REFCOUNT_SATURATED); + + switch (t) { + case REFCOUNT_ADD_NOT_ZERO_OVF: + REFCOUNT_WARN("saturated; leaking memory"); + break; + case REFCOUNT_ADD_OVF: + REFCOUNT_WARN("saturated; leaking memory"); + break; + case REFCOUNT_ADD_UAF: + REFCOUNT_WARN("addition on 0; use-after-free"); + break; + case REFCOUNT_SUB_UAF: + REFCOUNT_WARN("underflow; use-after-free"); + break; + case REFCOUNT_DEC_LEAK: + REFCOUNT_WARN("decrement hit 0; leaking memory"); + break; + default: + REFCOUNT_WARN("unknown saturation event!?"); + } +} +EXPORT_SYMBOL(refcount_warn_saturate); + /** * refcount_dec_if_one - decrement a refcount if it is 1 * @r: the refcount -- Gitee From f0d1d9682b13f5cbd4d032162a791a90c3c0d6d8 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 21 Nov 2019 11:58:59 +0000 Subject: [PATCH 2060/2161] locking/refcount: Consolidate REFCOUNT_{MAX,SATURATED} definitions commit 65b008552469f1c37f5e06e0016924502e40b4f5 upstream. The definitions of REFCOUNT_MAX and REFCOUNT_SATURATED are the same, regardless of CONFIG_REFCOUNT_FULL, so consolidate them into a single pair of definitions. Signed-off-by: Will Deacon Reviewed-by: Ard Biesheuvel Reviewed-by: Kees Cook Tested-by: Hanjun Guo Cc: Ard Biesheuvel Cc: Elena Reshetova Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191121115902.2551-8-will@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/refcount.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 1cd0a876a789..757d4630115c 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -22,6 +22,8 @@ typedef struct refcount_struct { } refcount_t; #define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } +#define REFCOUNT_MAX INT_MAX +#define REFCOUNT_SATURATED (INT_MIN / 2) enum refcount_saturation_type { REFCOUNT_ADD_NOT_ZERO_OVF, @@ -57,9 +59,6 @@ static inline unsigned int refcount_read(const refcount_t *r) #ifdef CONFIG_REFCOUNT_FULL #include -#define REFCOUNT_MAX INT_MAX -#define REFCOUNT_SATURATED (INT_MIN / 2) - /* * Variant of atomic_t specialized for reference counts. * @@ -300,10 +299,6 @@ static inline void refcount_dec(refcount_t *r) refcount_warn_saturate(r, REFCOUNT_DEC_LEAK); } #else /* CONFIG_REFCOUNT_FULL */ - -#define REFCOUNT_MAX INT_MAX -#define REFCOUNT_SATURATED (INT_MIN / 2) - # ifdef CONFIG_ARCH_HAS_REFCOUNT # include # else -- Gitee From 6cc4a1373ce9801ccb5e935ae612841a21fad517 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Tue, 6 Dec 2022 20:02:56 +0800 Subject: [PATCH 2061/2161] locking/refcount: Consolidate implementations of refcount_t commit fb041bb7c0a918b95c6889fc965cdc4a75b4c0ca upstream. The generic implementation of refcount_t should be good enough for everybody, so remove ARCH_HAS_REFCOUNT and REFCOUNT_FULL entirely, leaving the generic implementation enabled unconditionally. 
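
As a rough illustration of the post-hoc saturation check these refcount patches introduce, the following user-space C11 sketch mirrors the idea; it is not the kernel code itself, and the function names are invented:

    #include <limits.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define REFCOUNT_SATURATED (INT_MIN / 2)

    /* Sketch of an inc-not-zero operation under the new scheme: do the
     * update first, then look at the old value to detect overflow and pin
     * the counter at REFCOUNT_SATURATED instead of letting it wrap to 0.
     */
    static bool sketch_inc_not_zero(atomic_int *refs)
    {
            int old = atomic_load_explicit(refs, memory_order_relaxed);

            do {
                    if (!old)
                            return false;   /* no references left */
            } while (!atomic_compare_exchange_weak_explicit(refs, &old, old + 1,
                                                            memory_order_relaxed,
                                                            memory_order_relaxed));

            /* Checked after the fact: already negative, or about to cross
             * INT_MAX, means saturate (the kernel writes this as old + i < 0).
             */
            if (old < 0 || old == INT_MAX)
                    atomic_store(refs, REFCOUNT_SATURATED);

            return true;
    }

    int main(void)
    {
            atomic_int refs = 1;

            printf("inc_not_zero: %d, refs now %d\n",
                   (int)sketch_inc_not_zero(&refs), atomic_load(&refs));
            return 0;
    }
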
Signed-off-by: Will Deacon Reviewed-by: Ard Biesheuvel Acked-by: Kees Cook Tested-by: Hanjun Guo Cc: Ard Biesheuvel Cc: Elena Reshetova Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191121115902.2551-9-will@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Xinghui Li --- arch/Kconfig | 21 ---- arch/arm/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/s390/configs/debug_defconfig | 1 - arch/x86/Kconfig | 1 - arch/x86/include/asm/asm.h | 6 -- arch/x86/include/asm/refcount.h | 126 ----------------------- arch/x86/mm/extable.c | 49 --------- drivers/gpu/drm/i915/Kconfig.debug | 1 - include/linux/refcount.h | 158 +++++++++++------------------ lib/refcount.c | 2 +- 11 files changed, 59 insertions(+), 308 deletions(-) delete mode 100644 arch/x86/include/asm/refcount.h diff --git a/arch/Kconfig b/arch/Kconfig index eaa0738e0539..6cb98cd17acf 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -922,27 +922,6 @@ config STRICT_MODULE_RWX config ARCH_HAS_PHYS_TO_DMA bool -config ARCH_HAS_REFCOUNT - bool - help - An architecture selects this when it has implemented refcount_t - using open coded assembly primitives that provide an optimized - refcount_t implementation, possibly at the expense of some full - refcount state checks of CONFIG_REFCOUNT_FULL=y. - - The refcount overflow check behavior, however, must be retained. - Catching overflows is the primary security concern for protecting - against bugs in reference counts. - -config REFCOUNT_FULL - bool "Perform full reference count validation at the expense of speed" - help - Enabling this switches the refcounting infrastructure from a fast - unchecked atomic_t implementation to a fully state checked - implementation, which can be (slightly) slower but provides protections - against various use-after-free conditions that can be used in - security flaw exploits. 
- config HAVE_ARCH_COMPILER_H bool help diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 63cafb433eba..1905d58b7920 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -117,7 +117,6 @@ config ARM select OLD_SIGSUSPEND3 select PCI_SYSCALL if PCI select PERF_USE_VMALLOC - select REFCOUNT_FULL select RTC_LIB select SYS_SUPPORTS_APM_EMULATION # Above selects are sorted alphabetically; please add new ones diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 8a6cd40cc966..c5d482b4392d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -182,7 +182,6 @@ config ARM64 select PCI_SYSCALL if PCI select POWER_RESET select POWER_SUPPLY - select REFCOUNT_FULL select SPARSE_IRQ select SWIOTLB select SYSCTL_EXCEPTION_TRACE diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 38d64030aacf..2e60c80395ab 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -62,7 +62,6 @@ CONFIG_OPROFILE=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_STATIC_KEYS_SELFTEST=y -CONFIG_REFCOUNT_FULL=y CONFIG_LOCK_EVENT_COUNTS=y CONFIG_MODULES=y CONFIG_MODULE_FORCE_LOAD=y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 18cdd5f87c82..cb1cc0ecbd1a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -73,7 +73,6 @@ config X86 select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_PTE_DEVMAP if X86_64 select ARCH_HAS_PTE_SPECIAL - select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE select ARCH_HAS_SET_MEMORY diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 254f8c8c5429..03cbd219f00a 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -135,9 +135,6 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) -# define _ASM_EXTABLE_REFCOUNT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) - # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ _ASM_ALIGN ; \ @@ -157,9 +154,6 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) -# define _ASM_EXTABLE_REFCOUNT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) - /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h deleted file mode 100644 index 232f856e0db0..000000000000 --- a/arch/x86/include/asm/refcount.h +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef __ASM_X86_REFCOUNT_H -#define __ASM_X86_REFCOUNT_H -/* - * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from - * PaX/grsecurity. - */ -#include -#include - -/* - * This is the first portion of the refcount error handling, which lives in - * .text.unlikely, and is jumped to from the CPU flag check (in the - * following macros). This saves the refcount value location into CX for - * the exception handler to use (in mm/extable.c), and then triggers the - * central refcount exception. The fixup address for the exception points - * back to the regular execution flow in .text. - */ -#define _REFCOUNT_EXCEPTION \ - ".pushsection .text..refcount\n" \ - "111:\tlea %[var], %%" _ASM_CX "\n" \ - "112:\t" ASM_UD2 "\n" \ - ASM_UNREACHABLE \ - ".popsection\n" \ - "113:\n" \ - _ASM_EXTABLE_REFCOUNT(112b, 113b) - -/* Trigger refcount exception if refcount result is negative. */ -#define REFCOUNT_CHECK_LT_ZERO \ - "js 111f\n\t" \ - _REFCOUNT_EXCEPTION - -/* Trigger refcount exception if refcount result is zero or negative. 
*/ -#define REFCOUNT_CHECK_LE_ZERO \ - "jz 111f\n\t" \ - REFCOUNT_CHECK_LT_ZERO - -/* Trigger refcount exception unconditionally. */ -#define REFCOUNT_ERROR \ - "jmp 111f\n\t" \ - _REFCOUNT_EXCEPTION - -static __always_inline void refcount_add(unsigned int i, refcount_t *r) -{ - asm volatile(LOCK_PREFIX "addl %1,%0\n\t" - REFCOUNT_CHECK_LT_ZERO - : [var] "+m" (r->refs.counter) - : "ir" (i) - : "cc", "cx"); -} - -static __always_inline void refcount_inc(refcount_t *r) -{ - asm volatile(LOCK_PREFIX "incl %0\n\t" - REFCOUNT_CHECK_LT_ZERO - : [var] "+m" (r->refs.counter) - : : "cc", "cx"); -} - -static __always_inline void refcount_dec(refcount_t *r) -{ - asm volatile(LOCK_PREFIX "decl %0\n\t" - REFCOUNT_CHECK_LE_ZERO - : [var] "+m" (r->refs.counter) - : : "cc", "cx"); -} - -static __always_inline __must_check -bool refcount_sub_and_test(unsigned int i, refcount_t *r) -{ - bool ret = GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", - REFCOUNT_CHECK_LT_ZERO, - r->refs.counter, e, "er", i, "cx"); - - if (ret) { - smp_acquire__after_ctrl_dep(); - return true; - } - - return false; -} - -static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r) -{ - bool ret = GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", - REFCOUNT_CHECK_LT_ZERO, - r->refs.counter, e, "cx"); - - if (ret) { - smp_acquire__after_ctrl_dep(); - return true; - } - - return false; -} - -static __always_inline __must_check -bool refcount_add_not_zero(unsigned int i, refcount_t *r) -{ - int c, result; - - c = atomic_read(&(r->refs)); - do { - if (unlikely(c == 0)) - return false; - - result = c + i; - - /* Did we try to increment from/to an undesirable state? */ - if (unlikely(c < 0 || c == INT_MAX || result < c)) { - asm volatile(REFCOUNT_ERROR - : : [var] "m" (r->refs.counter) - : "cc", "cx"); - break; - } - - } while (!atomic_try_cmpxchg(&(r->refs), &c, result)); - - return c != 0; -} - -static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r) -{ - return refcount_add_not_zero(1, r); -} - -#endif diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 11a7d67e6725..26f01015239f 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -28,55 +28,6 @@ static bool ex_handler_fault(const struct exception_table_entry *fixup, return ex_handler_default(fixup, regs); } -/* - * Handler for UD0 exception following a failed test against the - * result of a refcount inc/dec/add/sub. - */ -__visible bool ex_handler_refcount(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) -{ - /* First unconditionally saturate the refcount. */ - *(int *)regs->cx = INT_MIN / 2; - - /* - * Strictly speaking, this reports the fixup destination, not - * the fault location, and not the actually overflowing - * instruction, which is the instruction before the "js", but - * since that instruction could be a variety of lengths, just - * report the location after the overflow, which should be close - * enough for finding the overflow, as it's at least back in - * the function, having returned from .text.unlikely. - */ - regs->ip = ex_fixup_addr(fixup); - - /* - * This function has been called because either a negative refcount - * value was seen by any of the refcount functions, or a zero - * refcount value was seen by refcount_dec(). - * - * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result - * wrapped around) will be set. Additionally, seeing the refcount - * reach 0 will set ZF (Zero Flag: result was zero). 
In each of - * these cases we want a report, since it's a boundary condition. - * The SF case is not reported since it indicates post-boundary - * manipulations below zero or above INT_MAX. And if none of the - * flags are set, something has gone very wrong, so report it. - */ - if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) { - bool zero = regs->flags & X86_EFLAGS_ZF; - - refcount_error_report(regs, zero ? "hit zero" : "overflow"); - } else if ((regs->flags & X86_EFLAGS_SF) == 0) { - /* Report if none of OF, ZF, nor SF are set. */ - refcount_error_report(regs, "unexpected saturation"); - } - - return true; -} -EXPORT_SYMBOL(ex_handler_refcount); - /* * Handler for when we fail to restore a task's FPU state. We should never get * here because the FPU state of a task using the FPU (task->thread.fpu.state) diff --git a/drivers/gpu/drm/i915/Kconfig.debug b/drivers/gpu/drm/i915/Kconfig.debug index 41c8e39a73ba..e4f03fcb125e 100644 --- a/drivers/gpu/drm/i915/Kconfig.debug +++ b/drivers/gpu/drm/i915/Kconfig.debug @@ -21,7 +21,6 @@ config DRM_I915_DEBUG depends on DRM_I915 select DEBUG_FS select PREEMPT_COUNT - select REFCOUNT_FULL select I2C_CHARDEV select STACKDEPOT select DRM_DP_AUX_CHARDEV diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 757d4630115c..0ac50cf62d06 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -1,64 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_REFCOUNT_H -#define _LINUX_REFCOUNT_H - -#include -#include -#include -#include - -struct mutex; - -/** - * struct refcount_t - variant of atomic_t specialized for reference counts - * @refs: atomic_t counter field - * - * The counter saturates at REFCOUNT_SATURATED and will not move once - * there. This avoids wrapping the counter and causing 'spurious' - * use-after-free bugs. - */ -typedef struct refcount_struct { - atomic_t refs; -} refcount_t; - -#define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } -#define REFCOUNT_MAX INT_MAX -#define REFCOUNT_SATURATED (INT_MIN / 2) - -enum refcount_saturation_type { - REFCOUNT_ADD_NOT_ZERO_OVF, - REFCOUNT_ADD_OVF, - REFCOUNT_ADD_UAF, - REFCOUNT_SUB_UAF, - REFCOUNT_DEC_LEAK, -}; - -void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t); - -/** - * refcount_set - set a refcount's value - * @r: the refcount - * @n: value to which the refcount will be set - */ -static inline void refcount_set(refcount_t *r, int n) -{ - atomic_set(&r->refs, n); -} - -/** - * refcount_read - get a refcount's value - * @r: the refcount - * - * Return: the refcount's value - */ -static inline unsigned int refcount_read(const refcount_t *r) -{ - return atomic_read(&r->refs); -} - -#ifdef CONFIG_REFCOUNT_FULL -#include - /* * Variant of atomic_t specialized for reference counts. * @@ -136,6 +76,64 @@ static inline unsigned int refcount_read(const refcount_t *r) * */ +#ifndef _LINUX_REFCOUNT_H +#define _LINUX_REFCOUNT_H + +#include +#include +#include +#include +#include + +struct mutex; + +/** + * struct refcount_t - variant of atomic_t specialized for reference counts + * @refs: atomic_t counter field + * + * The counter saturates at REFCOUNT_SATURATED and will not move once + * there. This avoids wrapping the counter and causing 'spurious' + * use-after-free bugs. 
+ */ +typedef struct refcount_struct { + atomic_t refs; +} refcount_t; + +#define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } +#define REFCOUNT_MAX INT_MAX +#define REFCOUNT_SATURATED (INT_MIN / 2) + +enum refcount_saturation_type { + REFCOUNT_ADD_NOT_ZERO_OVF, + REFCOUNT_ADD_OVF, + REFCOUNT_ADD_UAF, + REFCOUNT_SUB_UAF, + REFCOUNT_DEC_LEAK, +}; + +void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t); + +/** + * refcount_set - set a refcount's value + * @r: the refcount + * @n: value to which the refcount will be set + */ +static inline void refcount_set(refcount_t *r, int n) +{ + atomic_set(&r->refs, n); +} + +/** + * refcount_read - get a refcount's value + * @r: the refcount + * + * Return: the refcount's value + */ +static inline unsigned int refcount_read(const refcount_t *r) +{ + return atomic_read(&r->refs); +} + /** * refcount_add_not_zero - add a value to a refcount unless it is 0 * @i: the value to add to the refcount @@ -298,46 +296,6 @@ static inline void refcount_dec(refcount_t *r) if (unlikely(atomic_fetch_sub_release(1, &r->refs) <= 1)) refcount_warn_saturate(r, REFCOUNT_DEC_LEAK); } -#else /* CONFIG_REFCOUNT_FULL */ -# ifdef CONFIG_ARCH_HAS_REFCOUNT -# include -# else -static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) -{ - return atomic_add_unless(&r->refs, i, 0); -} - -static inline void refcount_add(int i, refcount_t *r) -{ - atomic_add(i, &r->refs); -} - -static inline __must_check bool refcount_inc_not_zero(refcount_t *r) -{ - return atomic_add_unless(&r->refs, 1, 0); -} - -static inline void refcount_inc(refcount_t *r) -{ - atomic_inc(&r->refs); -} - -static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) -{ - return atomic_sub_and_test(i, &r->refs); -} - -static inline __must_check bool refcount_dec_and_test(refcount_t *r) -{ - return atomic_dec_and_test(&r->refs); -} - -static inline void refcount_dec(refcount_t *r) -{ - atomic_dec(&r->refs); -} -# endif /* !CONFIG_ARCH_HAS_REFCOUNT */ -#endif /* !CONFIG_REFCOUNT_FULL */ extern __must_check bool refcount_dec_if_one(refcount_t *r); extern __must_check bool refcount_dec_not_one(refcount_t *r); diff --git a/lib/refcount.c b/lib/refcount.c index 8b7e249c0e10..ebac8b7d15a7 100644 --- a/lib/refcount.c +++ b/lib/refcount.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Out-of-line refcount functions common to all refcount implementations. + * Out-of-line refcount functions. */ #include -- Gitee From 72910bd7cd15f9758739361d8477a00c26932f11 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 21 Nov 2019 11:59:01 +0000 Subject: [PATCH 2062/2161] locking/refcount: Remove unused 'refcount_error_report()' function commit 7221762c48c6bbbcc6cc51d8b803c06930215e34 upstream. 'refcount_error_report()' has no callers. Remove it. 
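
With the unchecked fallback variants removed, every caller sees the same checked API. A minimal usage sketch — struct foo and its helpers are invented for illustration, only the refcount_t calls are real:

    #include <linux/refcount.h>
    #include <linux/slab.h>

    struct foo {
            refcount_t refs;
            /* ... payload ... */
    };

    static struct foo *foo_alloc(void)
    {
            struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

            if (f)
                    refcount_set(&f->refs, 1);   /* caller owns the first reference */
            return f;
    }

    /* Caller must already guarantee the object memory is stable (RCU, lock, ...). */
    static struct foo *foo_get(struct foo *f)
    {
            return refcount_inc_not_zero(&f->refs) ? f : NULL;
    }

    static void foo_put(struct foo *f)
    {
            /* Saturation means a buggy extra put warns and leaks rather than
             * freeing an object that is still in use.
             */
            if (refcount_dec_and_test(&f->refs))
                    kfree(f);
    }
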
Signed-off-by: Will Deacon Reviewed-by: Ard Biesheuvel Acked-by: Kees Cook Tested-by: Hanjun Guo Cc: Ard Biesheuvel Cc: Elena Reshetova Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191121115902.2551-10-will@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- include/linux/kernel.h | 7 ------- kernel/panic.c | 11 ----------- 2 files changed, 18 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d83d403dac2e..09f759228e3f 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -328,13 +328,6 @@ extern int oops_may_print(void); void do_exit(long error_code) __noreturn; void complete_and_exit(struct completion *, long) __noreturn; -#ifdef CONFIG_ARCH_HAS_REFCOUNT -void refcount_error_report(struct pt_regs *regs, const char *err); -#else -static inline void refcount_error_report(struct pt_regs *regs, const char *err) -{ } -#endif - /* Internal, do not use. */ int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); int __must_check _kstrtol(const char *s, unsigned int base, long *res); diff --git a/kernel/panic.c b/kernel/panic.c index f470a038b05b..b69ee9e76cb2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -671,17 +671,6 @@ EXPORT_SYMBOL(__stack_chk_fail); #endif -#ifdef CONFIG_ARCH_HAS_REFCOUNT -void refcount_error_report(struct pt_regs *regs, const char *err) -{ - WARN_RATELIMIT(1, "refcount_t %s at %pB in %s[%d], uid/euid: %u/%u\n", - err, (void *)instruction_pointer(regs), - current->comm, task_pid_nr(current), - from_kuid_munged(&init_user_ns, current_uid()), - from_kuid_munged(&init_user_ns, current_euid())); -} -#endif - core_param(panic, panic_timeout, int, 0644); core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); -- Gitee From e998cff0f8bbd54e49402160531cf9402aae17bb Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 21 Nov 2019 11:59:02 +0000 Subject: [PATCH 2063/2161] lkdtm: Remove references to CONFIG_REFCOUNT_FULL commit 500543c53a54134ced386aed85cd93cf1363f981 upstream. CONFIG_REFCOUNT_FULL no longer exists, so remove all references to it. Signed-off-by: Will Deacon Reviewed-by: Ard Biesheuvel Acked-by: Kees Cook Tested-by: Hanjun Guo Cc: Ard Biesheuvel Cc: Elena Reshetova Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191121115902.2551-11-will@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/misc/lkdtm/refcount.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/misc/lkdtm/refcount.c b/drivers/misc/lkdtm/refcount.c index abf3b7c1f686..de7c5ab528d9 100644 --- a/drivers/misc/lkdtm/refcount.c +++ b/drivers/misc/lkdtm/refcount.c @@ -119,7 +119,7 @@ void lkdtm_REFCOUNT_DEC_ZERO(void) static void check_negative(refcount_t *ref, int start) { /* - * CONFIG_REFCOUNT_FULL refuses to move a refcount at all on an + * refcount_t refuses to move a refcount at all on an * over-sub, so we have to track our starting position instead of * looking only at zero-pinning. */ @@ -202,7 +202,6 @@ static void check_from_zero(refcount_t *ref) /* * A refcount_inc() from zero should pin to zero or saturate and may WARN. - * Only CONFIG_REFCOUNT_FULL provides this protection currently. 
*/ void lkdtm_REFCOUNT_INC_ZERO(void) { -- Gitee From 85d4190c86f1e746aae3ea4147124b111fd5489e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 11:28:09 -0500 Subject: [PATCH 2064/2161] x86 user stack frame reads: switch to explicit __get_user() commit c8e3dd86600a1a7b165478cc626d69bf07967c15 upstream. rather than relying upon the magic in raw_copy_from_user() Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/events/core.c | 27 +++++++-------------------- arch/x86/include/asm/uaccess.h | 9 --------- arch/x86/kernel/stacktrace.c | 6 ++++-- 3 files changed, 11 insertions(+), 31 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 3f5bf76bf711..39bb48bd722a 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2540,7 +2540,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent /* 32-bit process in 64-bit kernel. */ unsigned long ss_base, cs_base; struct stack_frame_ia32 frame; - const void __user *fp; + const struct stack_frame_ia32 __user *fp; if (!test_thread_flag(TIF_IA32)) return 0; @@ -2551,18 +2551,12 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); while (entry->nr < entry->max_stack) { - unsigned long bytes; - frame.next_frame = 0; - frame.return_address = 0; - if (!valid_user_frame(fp, sizeof(frame))) break; - bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4); - if (bytes != 0) + if (__get_user(frame.next_frame, &fp->next_frame)) break; - bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4); - if (bytes != 0) + if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, cs_base + frame.return_address); @@ -2583,7 +2577,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stack_frame frame; - const unsigned long __user *fp; + const struct stack_frame __user *fp; if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ @@ -2596,7 +2590,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) return; - fp = (unsigned long __user *)regs->bp; + fp = (void __user *)regs->bp; perf_callchain_store(entry, regs->ip); @@ -2608,19 +2602,12 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs pagefault_disable(); while (entry->nr < entry->max_stack) { - unsigned long bytes; - - frame.next_frame = NULL; - frame.return_address = 0; - if (!valid_user_frame(fp, sizeof(frame))) break; - bytes = __copy_from_user_nmi(&frame.next_frame, fp, sizeof(*fp)); - if (bytes != 0) + if (__get_user(frame.next_frame, &fp->next_frame)) break; - bytes = __copy_from_user_nmi(&frame.return_address, fp + 1, sizeof(*fp)); - if (bytes != 0) + if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, frame.return_address); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 61d93f062a36..ab8eab43a8a2 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -694,15 +694,6 @@ extern struct movsl_mask { # include #endif -/* - * We rely on the nested NMI work to allow atomic faults from the NMI path; the - * nested NMI paths are careful to preserve CR2. 
- * - * Caller must use pagefault_enable/disable, or run in interrupt context, - * and also do a uaccess_ok() check - */ -#define __copy_from_user_nmi __copy_from_user_inatomic - /* * The "unsafe" user accesses aren't really "unsafe", but the naming * is a big fat warning: you have to not only do the access_ok() diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 6d83b4b857e6..2fd698e28e4d 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -91,7 +91,8 @@ struct stack_frame_user { }; static int -copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) +copy_stack_frame(const struct stack_frame_user __user *fp, + struct stack_frame_user *frame) { int ret; @@ -100,7 +101,8 @@ copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) ret = 1; pagefault_disable(); - if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + if (__get_user(frame->next_fp, &fp->next_fp) || + __get_user(frame->ret_addr, &fp->ret_addr)) ret = 0; pagefault_enable(); -- Gitee From b14d0cb4d112ed2b8783b1f48ffa0b08a35d0091 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 11:29:04 -0500 Subject: [PATCH 2065/2161] x86 kvm page table walks: switch to explicit __get_user() commit a4814443993c7c8686036bb8ca0df6abc499d63f upstream. Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a20fc1ba607f..6092ffa649af 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -385,7 +385,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, goto error; ptep_user = (pt_element_t __user *)((void *)host_addr + offset); - if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) + if (unlikely(__get_user(pte, ptep_user))) goto error; walker->ptep_user[walker->level - 1] = ptep_user; -- Gitee From 5fe896f376585cbfc52190bca04b981d2c8c6e84 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 11:43:18 -0500 Subject: [PATCH 2066/2161] x86: switch sigframe sigset handling to explict __get_user()/__put_user() commit 71c3313a38aa09339a2442809e658fd233ab0757 upstream. ... and consolidate the definition of sigframe_ia32->extramask - it's always a 1-element array of 32bit unsigned. 
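
A hypothetical helper in the same spirit as the conversions above — reading a small, fixed-layout user structure with explicit __get_user() calls instead of a constant-sized __copy_from_user(); the struct and function names are invented:

    #include <linux/errno.h>
    #include <linux/uaccess.h>

    struct user_frame {
            unsigned long next_fp;
            unsigned long ret_addr;
    };

    static int read_user_frame(const struct user_frame __user *fp,
                               struct user_frame *out)
    {
            if (!access_ok(fp, sizeof(*fp)))
                    return -EFAULT;

            /* Two explicit word-sized reads; no size-dispatch magic needed. */
            if (__get_user(out->next_fp, &fp->next_fp) ||
                __get_user(out->ret_addr, &fp->ret_addr))
                    return -EFAULT;

            return 0;
    }
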
Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/ia32/ia32_signal.c | 16 +++++----------- arch/x86/include/asm/sigframe.h | 6 +----- arch/x86/kernel/signal.c | 20 ++++++++------------ 3 files changed, 14 insertions(+), 28 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 119cc0818f3b..62e7fa8526e4 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -110,10 +110,7 @@ COMPAT_SYSCALL_DEFINE0(sigreturn) if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], &frame->sc.oldmask) - || (_COMPAT_NSIG_WORDS > 1 - && __copy_from_user((((char *) &set.sig) + 4), - &frame->extramask, - sizeof(frame->extramask)))) + || __get_user(((__u32 *)&set)[1], &frame->extramask[0])) goto badframe; set_current_blocked(&set); @@ -137,7 +134,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; set_current_blocked(&set); @@ -263,11 +260,8 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) return -EFAULT; - if (_COMPAT_NSIG_WORDS > 1) { - if (__copy_to_user(frame->extramask, &set->sig[1], - sizeof(frame->extramask))) - return -EFAULT; - } + if (__put_user(set->sig[1], &frame->extramask[0])) + return -EFAULT; if (ksig->ka.sa.sa_flags & SA_RESTORER) { restorer = ksig->ka.sa.sa_restorer; @@ -367,7 +361,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, err |= __copy_siginfo_to_user32(&frame->info, &ksig->info, false); err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + err |= __put_user(*(__u64 *)set, (__u64 __user *)&frame->uc.uc_sigmask); if (err) return -EFAULT; diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index e30aa05905f5..5b1ed650b124 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -33,11 +33,7 @@ struct sigframe_ia32 { * legacy application accessing/modifying it. 
*/ struct _fpstate_32 fpstate_unused; -#ifdef CONFIG_IA32_EMULATION - unsigned int extramask[_COMPAT_NSIG_WORDS-1]; -#else /* !CONFIG_IA32_EMULATION */ - unsigned long extramask[_NSIG_WORDS-1]; -#endif /* CONFIG_IA32_EMULATION */ + unsigned int extramask[1]; char retcode[8]; /* fp state follows here */ }; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index dbf28eeea3cc..28f4a528b61f 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -334,11 +334,8 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) return -EFAULT; - if (_NSIG_WORDS > 1) { - if (__copy_to_user(&frame->extramask, &set->sig[1], - sizeof(frame->extramask))) - return -EFAULT; - } + if (__put_user(set->sig[1], &frame->extramask[0])) + return -EFAULT; if (current->mm->context.vdso) restorer = current->mm->context.vdso + @@ -497,7 +494,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, } put_user_catch(err); err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + err |= __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); if (err) return -EFAULT; @@ -583,7 +580,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + err |= __put_user(*(__u64 *)set, (__u64 __user *)&frame->uc.uc_sigmask); if (err) return -EFAULT; @@ -621,9 +618,8 @@ SYSCALL_DEFINE0(sigreturn) if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1 - && __copy_from_user(&set.sig[1], &frame->extramask, - sizeof(frame->extramask)))) + if (__get_user(set.sig[0], &frame->sc.oldmask) || + __get_user(set.sig[1], &frame->extramask[0])) goto badframe; set_current_blocked(&set); @@ -653,7 +649,7 @@ SYSCALL_DEFINE0(rt_sigreturn) frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; if (__get_user(uc_flags, &frame->uc.uc_flags)) goto badframe; @@ -970,7 +966,7 @@ asmlinkage long sys32_x32_rt_sigreturn(void) if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; if (__get_user(uc_flags, &frame->uc.uc_flags)) goto badframe; -- Gitee From d3c097eefe59995b749ebbd2e7d15fad27a43165 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 11:46:30 -0500 Subject: [PATCH 2067/2161] x86: get rid of small constant size cases in raw_copy_{to,from}_user() commit 4b842e4e25b12951fa10dedb4bc16bc47e3b850c upstream. Very few call sites where that would be triggered remain, and none of those is anywhere near hot enough to bother. 
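
The caller-side rule of thumb after this change, sketched with invented names: a single scalar goes through get_user()/put_user(), anything larger goes through copy_from_user(), which on x86-64 now always lands in copy_user_generic() even when the size is a compile-time constant:

    #include <linux/errno.h>
    #include <linux/types.h>
    #include <linux/uaccess.h>

    struct ioctl_args {
            u32 flags;
            u64 addr;
            u64 len;
    };

    static int fetch_flags(const u32 __user *uptr, u32 *flags)
    {
            /* single scalar: get_user() is the right tool */
            return get_user(*flags, uptr);
    }

    static int fetch_args(const struct ioctl_args __user *uptr,
                          struct ioctl_args *args)
    {
            /* whole structure: copy_from_user(), with no constant-size
             * special casing left underneath it
             */
            return copy_from_user(args, uptr, sizeof(*args)) ? -EFAULT : 0;
    }
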
Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/uaccess.h | 12 ---- arch/x86/include/asm/uaccess_32.h | 27 -------- arch/x86/include/asm/uaccess_64.h | 108 +----------------------------- 3 files changed, 2 insertions(+), 145 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index ab8eab43a8a2..1cfa33b94a1a 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -378,18 +378,6 @@ do { \ : "=r" (err), ltype(x) \ : "m" (__m(addr)), "i" (errret), "0" (err)) -#define __get_user_asm_nozero(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile("\n" \ - "1: mov"itype" %2,%"rtype"1\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: mov %3,%0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_UA(1b, 3b) \ - : "=r" (err), ltype(x) \ - : "m" (__m(addr)), "i" (errret), "0" (err)) - /* * This doesn't do __uaccess_begin/end - the exception handling * around it must do that. diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index ba2dc1930630..388a40660c7b 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -23,33 +23,6 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n) static __always_inline unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) { - if (__builtin_constant_p(n)) { - unsigned long ret; - - switch (n) { - case 1: - ret = 0; - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u8 *)to, from, ret, - "b", "b", "=q", 1); - __uaccess_end(); - return ret; - case 2: - ret = 0; - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u16 *)to, from, ret, - "w", "w", "=r", 2); - __uaccess_end(); - return ret; - case 4: - ret = 0; - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u32 *)to, from, ret, - "l", "k", "=r", 4); - __uaccess_end(); - return ret; - } - } return __copy_user_ll(to, (__force const void *)from, n); } diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 5cd1caa8bc65..bc10e3dc64fe 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -65,117 +65,13 @@ copy_to_user_mcsafe(void *to, const void *from, unsigned len) static __always_inline __must_check unsigned long raw_copy_from_user(void *dst, const void __user *src, unsigned long size) { - int ret = 0; - - if (!__builtin_constant_p(size)) - return copy_user_generic(dst, (__force void *)src, size); - switch (size) { - case 1: - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u8 *)dst, (u8 __user *)src, - ret, "b", "b", "=q", 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u16 *)dst, (u16 __user *)src, - ret, "w", "w", "=r", 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u32 *)dst, (u32 __user *)src, - ret, "l", "k", "=r", 4); - __uaccess_end(); - return ret; - case 8: - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src, - ret, "q", "", "=r", 8); - __uaccess_end(); - return ret; - case 10: - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src, - ret, "q", "", "=r", 10); - if (likely(!ret)) - __get_user_asm_nozero(*(u16 *)(8 + (char *)dst), - (u16 __user *)(8 + (char __user *)src), - ret, "w", "w", "=r", 2); - __uaccess_end(); - return ret; - case 16: - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src, - ret, "q", "", 
"=r", 16); - if (likely(!ret)) - __get_user_asm_nozero(*(u64 *)(8 + (char *)dst), - (u64 __user *)(8 + (char __user *)src), - ret, "q", "", "=r", 8); - __uaccess_end(); - return ret; - default: - return copy_user_generic(dst, (__force void *)src, size); - } + return copy_user_generic(dst, (__force void *)src, size); } static __always_inline __must_check unsigned long raw_copy_to_user(void __user *dst, const void *src, unsigned long size) { - int ret = 0; - - if (!__builtin_constant_p(size)) - return copy_user_generic((__force void *)dst, src, size); - switch (size) { - case 1: - __uaccess_begin(); - __put_user_asm(*(u8 *)src, (u8 __user *)dst, - ret, "b", "b", "iq", 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin(); - __put_user_asm(*(u16 *)src, (u16 __user *)dst, - ret, "w", "w", "ir", 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin(); - __put_user_asm(*(u32 *)src, (u32 __user *)dst, - ret, "l", "k", "ir", 4); - __uaccess_end(); - return ret; - case 8: - __uaccess_begin(); - __put_user_asm(*(u64 *)src, (u64 __user *)dst, - ret, "q", "", "er", 8); - __uaccess_end(); - return ret; - case 10: - __uaccess_begin(); - __put_user_asm(*(u64 *)src, (u64 __user *)dst, - ret, "q", "", "er", 10); - if (likely(!ret)) { - asm("":::"memory"); - __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, - ret, "w", "w", "ir", 2); - } - __uaccess_end(); - return ret; - case 16: - __uaccess_begin(); - __put_user_asm(*(u64 *)src, (u64 __user *)dst, - ret, "q", "", "er", 16); - if (likely(!ret)) { - asm("":::"memory"); - __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, - ret, "q", "", "er", 8); - } - __uaccess_end(); - return ret; - default: - return copy_user_generic((__force void *)dst, src, size); - } + return copy_user_generic((__force void *)dst, src, size); } static __always_inline __must_check -- Gitee From 8557912c6e16969937db2ab21736034d6c8b7056 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 12:09:14 -0500 Subject: [PATCH 2068/2161] vm86: get rid of get_user_ex() use commit c63aad695dceb08290be7845052d7d3f1d457416 upstream. Just do a copyin of what we want into a local variable and be done with that. We are guaranteed to be on shallow stack here... Note that conditional expression for range passed to access_ok() in mainline had been pointless all along - the only difference between vm86plus_struct and vm86_struct is that the former has one extra field in the end and when we get to copyin of that field (conditional upon 'plus' argument), we use copy_from_user(). Moreover, all fields starting with ->int_revectored are copied that way, so we only need that check (be it done by access_ok() or by user_access_begin()) only on the beginning of the structure - the fields that used to be covered by that get_user_try() block. 
Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/vm86_32.c | 54 +++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index a76c12b38e92..151f751db10c 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -243,6 +243,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) struct kernel_vm86_regs vm86regs; struct pt_regs *regs = current_pt_regs(); unsigned long err = 0; + struct vm86_struct v; err = security_mmap_addr(0); if (err) { @@ -278,39 +279,32 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) if (vm86->saved_sp0) return -EPERM; - if (!access_ok(user_vm86, plus ? - sizeof(struct vm86_struct) : - sizeof(struct vm86plus_struct))) + if (copy_from_user(&v, user_vm86, + offsetof(struct vm86_struct, int_revectored))) return -EFAULT; memset(&vm86regs, 0, sizeof(vm86regs)); - get_user_try { - unsigned short seg; - get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx); - get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx); - get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx); - get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi); - get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi); - get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp); - get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax); - get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip); - get_user_ex(seg, &user_vm86->regs.cs); - vm86regs.pt.cs = seg; - get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags); - get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp); - get_user_ex(seg, &user_vm86->regs.ss); - vm86regs.pt.ss = seg; - get_user_ex(vm86regs.es, &user_vm86->regs.es); - get_user_ex(vm86regs.ds, &user_vm86->regs.ds); - get_user_ex(vm86regs.fs, &user_vm86->regs.fs); - get_user_ex(vm86regs.gs, &user_vm86->regs.gs); - - get_user_ex(vm86->flags, &user_vm86->flags); - get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap); - get_user_ex(vm86->cpu_type, &user_vm86->cpu_type); - } get_user_catch(err); - if (err) - return err; + + vm86regs.pt.bx = v.regs.ebx; + vm86regs.pt.cx = v.regs.ecx; + vm86regs.pt.dx = v.regs.edx; + vm86regs.pt.si = v.regs.esi; + vm86regs.pt.di = v.regs.edi; + vm86regs.pt.bp = v.regs.ebp; + vm86regs.pt.ax = v.regs.eax; + vm86regs.pt.ip = v.regs.eip; + vm86regs.pt.cs = v.regs.cs; + vm86regs.pt.flags = v.regs.eflags; + vm86regs.pt.sp = v.regs.esp; + vm86regs.pt.ss = v.regs.ss; + vm86regs.es = v.regs.es; + vm86regs.ds = v.regs.ds; + vm86regs.fs = v.regs.fs; + vm86regs.gs = v.regs.gs; + + vm86->flags = v.flags; + vm86->screen_bitmap = v.screen_bitmap; + vm86->cpu_type = v.cpu_type; if (copy_from_user(&vm86->int_revectored, &user_vm86->int_revectored, -- Gitee From 73bf1ee1aa887e863fc97307e30c592bf34b8eea Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 13:26:51 -0500 Subject: [PATCH 2069/2161] x86: kill get_user_{try,catch,ex} commit 77f3c6166ddc7567455b244074b3ebb63862b56f upstream. 
no users left Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/uaccess.h | 54 ---------------------------------- 1 file changed, 54 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 1cfa33b94a1a..4dc5accdd826 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -335,12 +335,9 @@ do { \ "i" (errret), "0" (retval)); \ }) -#define __get_user_asm_ex_u64(x, ptr) (x) = __get_user_bad() #else #define __get_user_asm_u64(x, ptr, retval, errret) \ __get_user_asm(x, ptr, retval, "q", "", "=r", errret) -#define __get_user_asm_ex_u64(x, ptr) \ - __get_user_asm_ex(x, ptr, "q", "", "=r") #endif #define __get_user_size(x, ptr, size, retval, errret) \ @@ -378,41 +375,6 @@ do { \ : "=r" (err), ltype(x) \ : "m" (__m(addr)), "i" (errret), "0" (err)) -/* - * This doesn't do __uaccess_begin/end - the exception handling - * around it must do that. - */ -#define __get_user_size_ex(x, ptr, size) \ -do { \ - __chk_user_ptr(ptr); \ - switch (size) { \ - case 1: \ - __get_user_asm_ex(x, ptr, "b", "b", "=q"); \ - break; \ - case 2: \ - __get_user_asm_ex(x, ptr, "w", "w", "=r"); \ - break; \ - case 4: \ - __get_user_asm_ex(x, ptr, "l", "k", "=r"); \ - break; \ - case 8: \ - __get_user_asm_ex_u64(x, ptr); \ - break; \ - default: \ - (x) = __get_user_bad(); \ - } \ -} while (0) - -#define __get_user_asm_ex(x, addr, itype, rtype, ltype) \ - asm volatile("1: mov"itype" %1,%"rtype"0\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3:xor"itype" %"rtype"0,%"rtype"0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_EX(1b, 3b) \ - : ltype(x) : "m" (__m(addr))) - #define __put_user_nocheck(x, ptr, size) \ ({ \ __label__ __pu_label; \ @@ -540,22 +502,6 @@ struct __large_struct { unsigned long buf[100]; }; #define __put_user(x, ptr) \ __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) -/* - * {get|put}_user_try and catch - * - * get_user_try { - * get_user_ex(...); - * } get_user_catch(err) - */ -#define get_user_try uaccess_try_nospec -#define get_user_catch(err) uaccess_catch(err) - -#define get_user_ex(x, ptr) do { \ - unsigned long __gue_val; \ - __get_user_size_ex((__gue_val), (ptr), (sizeof(*(ptr)))); \ - (x) = (__force __typeof__(*(ptr)))__gue_val; \ -} while (0) - #define put_user_try uaccess_try #define put_user_catch(err) uaccess_catch(err) -- Gitee From 30f4f429b80d33f3100518f06cf3b6f58e3f6dac Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 13:38:18 -0500 Subject: [PATCH 2070/2161] x86: switch save_v86_state() to unsafe_put_user() commit a37d01ead405e3aa14d72d284721fe46422b3b63 upstream. Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/vm86_32.c | 61 +++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 151f751db10c..f747021024ca 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -98,7 +98,6 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) struct task_struct *tsk = current; struct vm86plus_struct __user *user; struct vm86 *vm86 = current->thread.vm86; - long err = 0; /* * This gets called from entry.S with interrupts disabled, but @@ -114,37 +113,30 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask); user = vm86->user_vm86; - if (!access_ok(user, vm86->vm86plus.is_vm86pus ? 
+ if (!user_access_begin(user, vm86->vm86plus.is_vm86pus ? sizeof(struct vm86plus_struct) : - sizeof(struct vm86_struct))) { - pr_alert("could not access userspace vm86 info\n"); - do_exit(SIGSEGV); - } - - put_user_try { - put_user_ex(regs->pt.bx, &user->regs.ebx); - put_user_ex(regs->pt.cx, &user->regs.ecx); - put_user_ex(regs->pt.dx, &user->regs.edx); - put_user_ex(regs->pt.si, &user->regs.esi); - put_user_ex(regs->pt.di, &user->regs.edi); - put_user_ex(regs->pt.bp, &user->regs.ebp); - put_user_ex(regs->pt.ax, &user->regs.eax); - put_user_ex(regs->pt.ip, &user->regs.eip); - put_user_ex(regs->pt.cs, &user->regs.cs); - put_user_ex(regs->pt.flags, &user->regs.eflags); - put_user_ex(regs->pt.sp, &user->regs.esp); - put_user_ex(regs->pt.ss, &user->regs.ss); - put_user_ex(regs->es, &user->regs.es); - put_user_ex(regs->ds, &user->regs.ds); - put_user_ex(regs->fs, &user->regs.fs); - put_user_ex(regs->gs, &user->regs.gs); - - put_user_ex(vm86->screen_bitmap, &user->screen_bitmap); - } put_user_catch(err); - if (err) { - pr_alert("could not access userspace vm86 info\n"); - do_exit(SIGSEGV); - } + sizeof(struct vm86_struct))) + goto Efault; + + unsafe_put_user(regs->pt.bx, &user->regs.ebx, Efault_end); + unsafe_put_user(regs->pt.cx, &user->regs.ecx, Efault_end); + unsafe_put_user(regs->pt.dx, &user->regs.edx, Efault_end); + unsafe_put_user(regs->pt.si, &user->regs.esi, Efault_end); + unsafe_put_user(regs->pt.di, &user->regs.edi, Efault_end); + unsafe_put_user(regs->pt.bp, &user->regs.ebp, Efault_end); + unsafe_put_user(regs->pt.ax, &user->regs.eax, Efault_end); + unsafe_put_user(regs->pt.ip, &user->regs.eip, Efault_end); + unsafe_put_user(regs->pt.cs, &user->regs.cs, Efault_end); + unsafe_put_user(regs->pt.flags, &user->regs.eflags, Efault_end); + unsafe_put_user(regs->pt.sp, &user->regs.esp, Efault_end); + unsafe_put_user(regs->pt.ss, &user->regs.ss, Efault_end); + unsafe_put_user(regs->es, &user->regs.es, Efault_end); + unsafe_put_user(regs->ds, &user->regs.ds, Efault_end); + unsafe_put_user(regs->fs, &user->regs.fs, Efault_end); + unsafe_put_user(regs->gs, &user->regs.gs, Efault_end); + unsafe_put_user(vm86->screen_bitmap, &user->screen_bitmap, Efault_end); + + user_access_end(); preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; @@ -159,6 +151,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) lazy_load_gs(vm86->regs32.gs); regs->pt.ax = retval; + return; + +Efault_end: + user_access_end(); +Efault: + pr_alert("could not access userspace vm86 info\n"); + do_exit(SIGSEGV); } static void mark_screen_rdonly(struct mm_struct *mm) -- Gitee From 17ae44e9e5eddea4c73cd5bc1d3224b36554aed0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 17:25:27 -0500 Subject: [PATCH 2071/2161] x86: switch setup_sigcontext() to unsafe_put_user() commit 9f855c085fb150ccc208497b39b5d3542bac5322 upstream. 
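The diff that follows repeats the same shape as the save_v86_state() conversion above. As a standalone illustration (placeholder struct and field names; a sketch, not code from the tree), the pattern is: open one user-access section, perform every store with unsafe_put_user() jumping to a local label on fault, and close the section on both the success and the fault path.

#include <linux/uaccess.h>

/* Illustrative only -- not a real kernel structure. */
struct demo_frame {
	unsigned long a, b, c;
};

static int demo_fill_frame(struct demo_frame __user *uf,
			   unsigned long a, unsigned long b, unsigned long c)
{
	if (!user_access_begin(uf, sizeof(*uf)))
		return -EFAULT;

	unsafe_put_user(a, &uf->a, Efault);
	unsafe_put_user(b, &uf->b, Efault);
	unsafe_put_user(c, &uf->c, Efault);

	user_access_end();
	return 0;

Efault:
	user_access_end();
	return -EFAULT;
}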
Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/include/asm/sighandling.h | 3 - arch/x86/kernel/signal.c | 88 +++++++++++++++--------------- 2 files changed, 45 insertions(+), 46 deletions(-) diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 2fcbd6f33ef7..35e0b579ffcb 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -14,9 +14,6 @@ X86_EFLAGS_CF | X86_EFLAGS_RF) void signal_fault(struct pt_regs *regs, void __user *frame, char *where); -int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, - struct pt_regs *regs, unsigned long mask); - #ifdef CONFIG_X86_X32_ABI asmlinkage long sys32_x32_rt_sigreturn(void); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 28f4a528b61f..d9c2129879b4 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -143,63 +143,65 @@ static bool restore_sigcontext(struct pt_regs *regs, IS_ENABLED(CONFIG_X86_32)); } -int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, +static int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { - int err = 0; - - put_user_try { + if (!user_access_begin(sc, sizeof(struct sigcontext))) + return -EFAULT; #ifdef CONFIG_X86_32 - put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs); - put_user_ex(regs->fs, (unsigned int __user *)&sc->fs); - put_user_ex(regs->es, (unsigned int __user *)&sc->es); - put_user_ex(regs->ds, (unsigned int __user *)&sc->ds); + unsafe_put_user(get_user_gs(regs), + (unsigned int __user *)&sc->gs, Efault); + unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault); + unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault); + unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault); #endif /* CONFIG_X86_32 */ - put_user_ex(regs->di, &sc->di); - put_user_ex(regs->si, &sc->si); - put_user_ex(regs->bp, &sc->bp); - put_user_ex(regs->sp, &sc->sp); - put_user_ex(regs->bx, &sc->bx); - put_user_ex(regs->dx, &sc->dx); - put_user_ex(regs->cx, &sc->cx); - put_user_ex(regs->ax, &sc->ax); + unsafe_put_user(regs->di, &sc->di, Efault); + unsafe_put_user(regs->si, &sc->si, Efault); + unsafe_put_user(regs->bp, &sc->bp, Efault); + unsafe_put_user(regs->sp, &sc->sp, Efault); + unsafe_put_user(regs->bx, &sc->bx, Efault); + unsafe_put_user(regs->dx, &sc->dx, Efault); + unsafe_put_user(regs->cx, &sc->cx, Efault); + unsafe_put_user(regs->ax, &sc->ax, Efault); #ifdef CONFIG_X86_64 - put_user_ex(regs->r8, &sc->r8); - put_user_ex(regs->r9, &sc->r9); - put_user_ex(regs->r10, &sc->r10); - put_user_ex(regs->r11, &sc->r11); - put_user_ex(regs->r12, &sc->r12); - put_user_ex(regs->r13, &sc->r13); - put_user_ex(regs->r14, &sc->r14); - put_user_ex(regs->r15, &sc->r15); + unsafe_put_user(regs->r8, &sc->r8, Efault); + unsafe_put_user(regs->r9, &sc->r9, Efault); + unsafe_put_user(regs->r10, &sc->r10, Efault); + unsafe_put_user(regs->r11, &sc->r11, Efault); + unsafe_put_user(regs->r12, &sc->r12, Efault); + unsafe_put_user(regs->r13, &sc->r13, Efault); + unsafe_put_user(regs->r14, &sc->r14, Efault); + unsafe_put_user(regs->r15, &sc->r15, Efault); #endif /* CONFIG_X86_64 */ - put_user_ex(current->thread.trap_nr, &sc->trapno); - put_user_ex(current->thread.error_code, &sc->err); - put_user_ex(regs->ip, &sc->ip); + unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); + unsafe_put_user(current->thread.error_code, &sc->err, Efault); + 
unsafe_put_user(regs->ip, &sc->ip, Efault); #ifdef CONFIG_X86_32 - put_user_ex(regs->cs, (unsigned int __user *)&sc->cs); - put_user_ex(regs->flags, &sc->flags); - put_user_ex(regs->sp, &sc->sp_at_signal); - put_user_ex(regs->ss, (unsigned int __user *)&sc->ss); + unsafe_put_user(regs->cs, (unsigned int __user *)&sc->cs, Efault); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault); + unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault); #else /* !CONFIG_X86_32 */ - put_user_ex(regs->flags, &sc->flags); - put_user_ex(regs->cs, &sc->cs); - put_user_ex(0, &sc->gs); - put_user_ex(0, &sc->fs); - put_user_ex(regs->ss, &sc->ss); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->cs, &sc->cs, Efault); + unsafe_put_user(0, &sc->gs, Efault); + unsafe_put_user(0, &sc->fs, Efault); + unsafe_put_user(regs->ss, &sc->ss, Efault); #endif /* CONFIG_X86_32 */ - put_user_ex(fpstate, (unsigned long __user *)&sc->fpstate); + unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault); - /* non-iBCS2 extensions.. */ - put_user_ex(mask, &sc->oldmask); - put_user_ex(current->thread.cr2, &sc->cr2); - } put_user_catch(err); - - return err; + /* non-iBCS2 extensions.. */ + unsafe_put_user(mask, &sc->oldmask, Efault); + unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); + user_access_end(); + return 0; +Efault: + user_access_end(); + return -EFAULT; } /* -- Gitee From 4f688c3c235432616dc2e70c9dc3e1a755c08009 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 17:41:04 -0500 Subject: [PATCH 2072/2161] x86: switch ia32_setup_sigcontext() to unsafe_put_user() commit d2d2728d161cbc52739d823a7fb76f3ba2fb3519 upstream. Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/ia32/ia32_signal.c | 64 +++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 62e7fa8526e4..f1c5ada6ef6d 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -162,38 +162,40 @@ static int ia32_setup_sigcontext(struct sigcontext_32 __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned int mask) { - int err = 0; - - put_user_try { - put_user_ex(get_user_seg(gs), (unsigned int __user *)&sc->gs); - put_user_ex(get_user_seg(fs), (unsigned int __user *)&sc->fs); - put_user_ex(get_user_seg(ds), (unsigned int __user *)&sc->ds); - put_user_ex(get_user_seg(es), (unsigned int __user *)&sc->es); - - put_user_ex(regs->di, &sc->di); - put_user_ex(regs->si, &sc->si); - put_user_ex(regs->bp, &sc->bp); - put_user_ex(regs->sp, &sc->sp); - put_user_ex(regs->bx, &sc->bx); - put_user_ex(regs->dx, &sc->dx); - put_user_ex(regs->cx, &sc->cx); - put_user_ex(regs->ax, &sc->ax); - put_user_ex(current->thread.trap_nr, &sc->trapno); - put_user_ex(current->thread.error_code, &sc->err); - put_user_ex(regs->ip, &sc->ip); - put_user_ex(regs->cs, (unsigned int __user *)&sc->cs); - put_user_ex(regs->flags, &sc->flags); - put_user_ex(regs->sp, &sc->sp_at_signal); - put_user_ex(regs->ss, (unsigned int __user *)&sc->ss); - - put_user_ex(ptr_to_compat(fpstate), &sc->fpstate); - - /* non-iBCS2 extensions.. 
*/ - put_user_ex(mask, &sc->oldmask); - put_user_ex(current->thread.cr2, &sc->cr2); - } put_user_catch(err); + if (!user_access_begin(sc, sizeof(struct sigcontext_32))) + return -EFAULT; - return err; + unsafe_put_user(get_user_seg(gs), (unsigned int __user *)&sc->gs, Efault); + unsafe_put_user(get_user_seg(fs), (unsigned int __user *)&sc->fs, Efault); + unsafe_put_user(get_user_seg(ds), (unsigned int __user *)&sc->ds, Efault); + unsafe_put_user(get_user_seg(es), (unsigned int __user *)&sc->es, Efault); + + unsafe_put_user(regs->di, &sc->di, Efault); + unsafe_put_user(regs->si, &sc->si, Efault); + unsafe_put_user(regs->bp, &sc->bp, Efault); + unsafe_put_user(regs->sp, &sc->sp, Efault); + unsafe_put_user(regs->bx, &sc->bx, Efault); + unsafe_put_user(regs->dx, &sc->dx, Efault); + unsafe_put_user(regs->cx, &sc->cx, Efault); + unsafe_put_user(regs->ax, &sc->ax, Efault); + unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); + unsafe_put_user(current->thread.error_code, &sc->err, Efault); + unsafe_put_user(regs->ip, &sc->ip, Efault); + unsafe_put_user(regs->cs, (unsigned int __user *)&sc->cs, Efault); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault); + unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault); + + unsafe_put_user(ptr_to_compat(fpstate), &sc->fpstate, Efault); + + /* non-iBCS2 extensions.. */ + unsafe_put_user(mask, &sc->oldmask, Efault); + unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); + user_access_end(); + return 0; +Efault: + user_access_end(); + return -EFAULT; } /* -- Gitee From b7e3643c09ea5005f3f87951681d561da10318ad Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 18:39:17 -0500 Subject: [PATCH 2073/2161] x86: get rid of put_user_try in {ia32,x32}_setup_rt_frame() commit 39f16c1c0f14e9794545dbf6a64c909d5e16a2ea upstream. Straightforward, except for compat_save_altstack_ex() stuck in those. Replace that thing with an analogue that would use unsafe_put_user() instead of put_user_ex() (called unsafe_compat_save_altstack()) and be done with that... Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/ia32/ia32_signal.c | 50 +++++++++++++++++++------------------ arch/x86/kernel/signal.c | 33 +++++++++++++----------- include/linux/compat.h | 9 ++++--- 3 files changed, 49 insertions(+), 43 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index f1c5ada6ef6d..de2e3a493cee 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -330,35 +330,34 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate); - if (!access_ok(frame, sizeof(*frame))) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; - put_user_try { - put_user_ex(sig, &frame->sig); - put_user_ex(ptr_to_compat(&frame->info), &frame->pinfo); - put_user_ex(ptr_to_compat(&frame->uc), &frame->puc); + unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_user(ptr_to_compat(&frame->info), &frame->pinfo, Efault); + unsafe_put_user(ptr_to_compat(&frame->uc), &frame->puc, Efault); - /* Create the ucontext. */ - if (static_cpu_has(X86_FEATURE_XSAVE)) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); + /* Create the ucontext. 
*/ + if (static_cpu_has(X86_FEATURE_XSAVE)) + unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault); + else + unsafe_put_user(0, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); - if (ksig->ka.sa.sa_flags & SA_RESTORER) - restorer = ksig->ka.sa.sa_restorer; - else - restorer = current->mm->context.vdso + - vdso_image_32.sym___kernel_rt_sigreturn; - put_user_ex(ptr_to_compat(restorer), &frame->pretcode); + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + else + restorer = current->mm->context.vdso + + vdso_image_32.sym___kernel_rt_sigreturn; + unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault); - /* - * Not actually used anymore, but left because some gdb - * versions need it. - */ - put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode); - } put_user_catch(err); + /* + * Not actually used anymore, but left because some gdb + * versions need it. + */ + unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault); + user_access_end(); err |= __copy_siginfo_to_user32(&frame->info, &ksig->info, false); err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, @@ -384,4 +383,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, regs->ss = __USER32_DS; return 0; +Efault: + user_access_end(); + return -EFAULT; } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d9c2129879b4..2afe9970effc 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -551,6 +551,9 @@ static int x32_setup_rt_frame(struct ksignal *ksig, int err = 0; void __user *fpstate = NULL; + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) + return -EFAULT; + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); if (!access_ok(frame, sizeof(*frame))) @@ -563,22 +566,17 @@ static int x32_setup_rt_frame(struct ksignal *ksig, uc_flags = frame_uc_flags(regs); - put_user_try { - /* Create the ucontext. */ - put_user_ex(uc_flags, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); - put_user_ex(0, &frame->uc.uc__pad0); + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; - if (ksig->ka.sa.sa_flags & SA_RESTORER) { - restorer = ksig->ka.sa.sa_restorer; - } else { - /* could use a vstub here */ - restorer = NULL; - err |= -EFAULT; - } - put_user_ex(restorer, (unsigned long __user *)&frame->pretcode); - } put_user_catch(err); + /* Create the ucontext. 
*/ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + unsafe_put_user(0, &frame->uc.uc__pad0, Efault); + restorer = ksig->ka.sa.sa_restorer; + unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); + user_access_end(); err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, regs, set->sig[0]); @@ -604,6 +602,11 @@ static int x32_setup_rt_frame(struct ksignal *ksig, #endif /* CONFIG_X86_X32_ABI */ return 0; +#ifdef CONFIG_X86_X32_ABI +Efault: + user_access_end(); + return -EFAULT; +#endif } /* diff --git a/include/linux/compat.h b/include/linux/compat.h index c4c389c7e1b4..13f845a56f49 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -490,12 +490,13 @@ extern void __user *compat_alloc_user_space(unsigned long len); int compat_restore_altstack(const compat_stack_t __user *uss); int __compat_save_altstack(compat_stack_t __user *, unsigned long); -#define compat_save_altstack_ex(uss, sp) do { \ +#define unsafe_compat_save_altstack(uss, sp, label) do { \ compat_stack_t __user *__uss = uss; \ struct task_struct *t = current; \ - put_user_ex(ptr_to_compat((void __user *)t->sas_ss_sp), &__uss->ss_sp); \ - put_user_ex(t->sas_ss_flags, &__uss->ss_flags); \ - put_user_ex(t->sas_ss_size, &__uss->ss_size); \ + unsafe_put_user(ptr_to_compat((void __user *)t->sas_ss_sp), \ + &__uss->ss_sp, label); \ + unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \ + unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \ if (t->sas_ss_flags & SS_AUTODISARM) \ sas_ss_reset(t); \ } while (0); -- Gitee From fef66de7ff5de15629b515ecc642cac8dd59d4dc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 18:46:02 -0500 Subject: [PATCH 2074/2161] x86: ia32_setup_sigcontext(): lift user_access_{begin,end}() into the callers commit 44a1d996325982025eefcdc50b636ab83e813372 upstream. What's left is just a sequence of stores to userland addresses, with all error handling, etc. done out of line. Calling that from user_access block is safe, but rather than teaching objtool to recognize it as such we can just make it always_inline - it is small enough and has few enough callers, for the space savings not to be an issue. Rename the sucker to __unsafe_setup_sigcontext32() and provide unsafe_put_sigcontext32() with usual kind of semantics. 
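The shape introduced here can be reduced to a small sketch (the name and the single field are invented; the real helper and macro are in the diff below): an __always_inline helper that returns 0 or -EFAULT, kept inline so objtool still sees one contiguous user-access region in the caller, plus a wrapper macro that turns a failure into a jump to the caller's label, matching the usual unsafe_*(..., label) convention.

#include <linux/compiler.h>
#include <linux/uaccess.h>

/* Illustrative only -- a stand-in for the sigcontext layout. */
struct demo_sc {
	unsigned long val;
};

static __always_inline int __demo_setup_sc(struct demo_sc __user *sc, unsigned long v)
{
	unsafe_put_user(v, &sc->val, Efault);
	return 0;
Efault:
	return -EFAULT;
}

/* Usual unsafe_*() semantics: on failure, jump to the label supplied by
 * the caller, which is expected to run user_access_end() there.
 */
#define demo_put_sc(sc, v, label)			\
do {							\
	if (__demo_setup_sc(sc, v))			\
		goto label;				\
} while (0)

As with unsafe_put_user() itself, a caller would only invoke demo_put_sc() between user_access_begin() and user_access_end().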
Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/ia32/ia32_signal.c | 44 +++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index de2e3a493cee..85ed190600fe 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -158,13 +158,11 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) #define get_user_seg(seg) ({ unsigned int v; savesegment(seg, v); v; }) -static int ia32_setup_sigcontext(struct sigcontext_32 __user *sc, - void __user *fpstate, - struct pt_regs *regs, unsigned int mask) +static __always_inline int +__unsafe_setup_sigcontext32(struct sigcontext_32 __user *sc, + void __user *fpstate, + struct pt_regs *regs, unsigned int mask) { - if (!user_access_begin(sc, sizeof(struct sigcontext_32))) - return -EFAULT; - unsafe_put_user(get_user_seg(gs), (unsigned int __user *)&sc->gs, Efault); unsafe_put_user(get_user_seg(fs), (unsigned int __user *)&sc->fs, Efault); unsafe_put_user(get_user_seg(ds), (unsigned int __user *)&sc->ds, Efault); @@ -191,13 +189,18 @@ static int ia32_setup_sigcontext(struct sigcontext_32 __user *sc, /* non-iBCS2 extensions.. */ unsafe_put_user(mask, &sc->oldmask, Efault); unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); - user_access_end(); return 0; + Efault: - user_access_end(); return -EFAULT; } +#define unsafe_put_sigcontext32(sc, fp, regs, set, label) \ +do { \ + if (__unsafe_setup_sigcontext32(sc, fp, regs, set->sig[0])) \ + goto label; \ +} while(0) + /* * Determine which stack to use.. */ @@ -238,7 +241,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, struct sigframe_ia32 __user *frame; void __user *restorer; int err = 0; - void __user *fpstate = NULL; + void __user *fp = NULL; /* copy_to_user optimizes that into a single 8 byte store */ static const struct { @@ -251,7 +254,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, 0x80cd, /* int $0x80 */ }; - frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(ksig, regs, sizeof(*frame), &fp); if (!access_ok(frame, sizeof(*frame))) return -EFAULT; @@ -259,9 +262,12 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, if (__put_user(sig, &frame->sig)) return -EFAULT; - if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) + if (!user_access_begin(&frame->sc, sizeof(struct sigcontext_32))) return -EFAULT; + unsafe_put_sigcontext32(&frame->sc, fp, regs, set, Efault); + user_access_end(); + if (__put_user(set->sig[1], &frame->extramask[0])) return -EFAULT; @@ -305,6 +311,9 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, regs->ss = __USER32_DS; return 0; +Efault: + user_access_end(); + return -EFAULT; } int ia32_setup_rt_frame(int sig, struct ksignal *ksig, @@ -313,7 +322,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, struct rt_sigframe_ia32 __user *frame; void __user *restorer; int err = 0; - void __user *fpstate = NULL; + void __user *fp = NULL; /* __copy_to_user optimizes that into a single 8 byte store */ static const struct { @@ -328,7 +337,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, 0, }; - frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(ksig, regs, sizeof(*frame), &fp); if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; @@ -359,9 +368,12 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault); user_access_end(); - err |= 
__copy_siginfo_to_user32(&frame->info, &ksig->info, false); - err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); + if (__copy_siginfo_to_user32(&frame->info, &ksig->info, false)) + return -EFAULT; + if (!user_access_begin(&frame->uc.uc_mcontext, sizeof(struct sigcontext_32))) + return -EFAULT; + unsafe_put_sigcontext32(&frame->uc.uc_mcontext, fp, regs, set, Efault); + user_access_end(); err |= __put_user(*(__u64 *)set, (__u64 __user *)&frame->uc.uc_sigmask); if (err) -- Gitee From a3f4485e9089c2ce27ff36e1c25b089c4eaed08e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 19:36:40 -0500 Subject: [PATCH 2075/2161] x86: ia32_setup_frame(): consolidate uaccess areas commit e2390741053e4931841650b5eadac32697aa88aa upstream. Currently we have user_access block, followed by __put_user(), deciding what the restorer will be and finally a put_user_try block. Moving the calculation of restorer first allows the rest (actual copyout work) to coalesce into a single user_access block. Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/ia32/ia32_signal.c | 39 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 85ed190600fe..3ececc7571fe 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -240,7 +240,6 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, { struct sigframe_ia32 __user *frame; void __user *restorer; - int err = 0; void __user *fp = NULL; /* copy_to_user optimizes that into a single 8 byte store */ @@ -256,21 +255,6 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, frame = get_sigframe(ksig, regs, sizeof(*frame), &fp); - if (!access_ok(frame, sizeof(*frame))) - return -EFAULT; - - if (__put_user(sig, &frame->sig)) - return -EFAULT; - - if (!user_access_begin(&frame->sc, sizeof(struct sigcontext_32))) - return -EFAULT; - - unsafe_put_sigcontext32(&frame->sc, fp, regs, set, Efault); - user_access_end(); - - if (__put_user(set->sig[1], &frame->extramask[0])) - return -EFAULT; - if (ksig->ka.sa.sa_flags & SA_RESTORER) { restorer = ksig->ka.sa.sa_restorer; } else { @@ -282,19 +266,20 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, restorer = &frame->retcode; } - put_user_try { - put_user_ex(ptr_to_compat(restorer), &frame->pretcode); - - /* - * These are actually not used anymore, but left because some - * gdb versions depend on them as a marker. - */ - put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode); - } put_user_catch(err); - - if (err) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; + unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_sigcontext32(&frame->sc, fp, regs, set, Efault); + unsafe_put_user(set->sig[1], &frame->extramask[0], Efault); + unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault); + /* + * These are actually not used anymore, but left because some + * gdb versions depend on them as a marker. + */ + unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault); + user_access_end(); + /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; regs->ip = (unsigned long) ksig->ka.sa.sa_handler; -- Gitee From e1d6214f1baf879f571eb4c990ca3f3518fc49ea Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 19:42:40 -0500 Subject: [PATCH 2076/2161] x86: ia32_setup_rt_frame(): consolidate uaccess areas commit 57d563c8292569f2849569853e846bf740df5032 upstream. 
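This and the next few patches share one refactoring idea: hoist every computation that cannot fault (choosing the restorer, computing flags) above user_access_begin(), so the actual copyout collapses into a single begin/end section. A condensed sketch with a hypothetical function and placeholder fields (the real versions are in the diffs):

#include <linux/types.h>
#include <linux/uaccess.h>

/* Illustrative only. */
struct demo_sigframe {
	unsigned long flags;
	unsigned long pretcode;
};

static int demo_setup_frame(struct demo_sigframe __user *frame, unsigned long uc_flags,
			    unsigned long default_restorer, unsigned long sa_restorer,
			    bool has_restorer)
{
	unsigned long restorer;

	/* Decisions first -- nothing here touches userspace. */
	restorer = has_restorer ? sa_restorer : default_restorer;

	/* Then one contiguous user-access section for all the stores. */
	if (!user_access_begin(frame, sizeof(*frame)))
		return -EFAULT;
	unsafe_put_user(uc_flags, &frame->flags, Efault);
	unsafe_put_user(restorer, &frame->pretcode, Efault);
	user_access_end();
	return 0;

Efault:
	user_access_end();
	return -EFAULT;
}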
__copy_siginfo_to_user32() call reordered a bit. The rest folds nicely. Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/ia32/ia32_signal.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 3ececc7571fe..55c38af27465 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -306,10 +306,9 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, { struct rt_sigframe_ia32 __user *frame; void __user *restorer; - int err = 0; void __user *fp = NULL; - /* __copy_to_user optimizes that into a single 8 byte store */ + /* unsafe_put_user optimizes that into a single 8 byte store */ static const struct { u8 movl; u32 val; @@ -351,17 +350,11 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, * versions need it. */ unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault); - user_access_end(); - - if (__copy_siginfo_to_user32(&frame->info, &ksig->info, false)) - return -EFAULT; - if (!user_access_begin(&frame->uc.uc_mcontext, sizeof(struct sigcontext_32))) - return -EFAULT; unsafe_put_sigcontext32(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_user(*(__u64 *)set, (__u64 *)&frame->uc.uc_sigmask, Efault); user_access_end(); - err |= __put_user(*(__u64 *)set, (__u64 __user *)&frame->uc.uc_sigmask); - if (err) + if (__copy_siginfo_to_user32(&frame->info, &ksig->info, false)) return -EFAULT; /* Set up registers for signal handler */ -- Gitee From e02a30368bced16abc52102b9981565ccf8305af Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 19:54:56 -0500 Subject: [PATCH 2077/2161] x86: get rid of put_user_try in __setup_rt_frame() (both 32bit and 64bit) commit 119cd59fcfbe70fb3fcab4e64cd232bcc3807585 upstream. Straightforward, except for save_altstack_ex() stuck in those. Replace that thing with an analogue that would use unsafe_put_user() instead of put_user_ex() (called compat_save_altstack()) and be done with that. Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/signal.c | 91 +++++++++++++++++++++------------------- include/linux/signal.h | 8 ++-- 2 files changed, 52 insertions(+), 47 deletions(-) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 2afe9970effc..d3aa0f567ead 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -387,38 +387,37 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); - if (!access_ok(frame, sizeof(*frame))) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; - put_user_try { - put_user_ex(sig, &frame->sig); - put_user_ex(&frame->info, &frame->pinfo); - put_user_ex(&frame->uc, &frame->puc); + unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_user(&frame->info, &frame->pinfo, Efault); + unsafe_put_user(&frame->uc, &frame->puc, Efault); - /* Create the ucontext. */ - if (static_cpu_has(X86_FEATURE_XSAVE)) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - save_altstack_ex(&frame->uc.uc_stack, regs->sp); + /* Create the ucontext. 
*/ + if (static_cpu_has(X86_FEATURE_XSAVE)) + unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault); + else + unsafe_put_user(0, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); - /* Set up to return from userspace. */ - restorer = current->mm->context.vdso + - vdso_image_32.sym___kernel_rt_sigreturn; - if (ksig->ka.sa.sa_flags & SA_RESTORER) - restorer = ksig->ka.sa.sa_restorer; - put_user_ex(restorer, &frame->pretcode); + /* Set up to return from userspace. */ + restorer = current->mm->context.vdso + + vdso_image_32.sym___kernel_rt_sigreturn; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + unsafe_put_user(restorer, &frame->pretcode, Efault); - /* - * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. - */ - put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode); - } put_user_catch(err); + /* + * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + unsafe_put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode, Efault); + user_access_end(); err |= copy_siginfo_to_user(&frame->info, &ksig->info); err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, @@ -441,6 +440,9 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, regs->cs = __USER_CS; return 0; +Efault: + user_access_end(); + return -EFAULT; } #else /* !CONFIG_X86_32 */ static unsigned long frame_uc_flags(struct pt_regs *regs) @@ -466,6 +468,10 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, unsigned long uc_flags; int err = 0; + /* x86-64 should always use SA_RESTORER. */ + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) + return -EFAULT; + frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); if (!access_ok(frame, sizeof(*frame))) @@ -477,23 +483,18 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, } uc_flags = frame_uc_flags(regs); + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; - put_user_try { - /* Create the ucontext. */ - put_user_ex(uc_flags, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - save_altstack_ex(&frame->uc.uc_stack, regs->sp); - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - /* x86-64 should always use SA_RESTORER. */ - if (ksig->ka.sa.sa_flags & SA_RESTORER) { - put_user_ex(ksig->ka.sa.sa_restorer, &frame->pretcode); - } else { - /* could use a vstub here */ - err |= -EFAULT; - } - } put_user_catch(err); + /* Create the ucontext. */ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + + /* Set up to return from userspace. If provided, use a stub + already in userspace. 
*/ + unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); + user_access_end(); err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); err |= __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); @@ -537,6 +538,10 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, force_valid_ss(regs); return 0; + +Efault: + user_access_end(); + return -EFAULT; } #endif /* CONFIG_X86_32 */ diff --git a/include/linux/signal.h b/include/linux/signal.h index 2123902d60ff..b4973c9d1d7b 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -444,12 +444,12 @@ void signals_init(void); int restore_altstack(const stack_t __user *); int __save_altstack(stack_t __user *, unsigned long); -#define save_altstack_ex(uss, sp) do { \ +#define unsafe_save_altstack(uss, sp, label) do { \ stack_t __user *__uss = uss; \ struct task_struct *t = current; \ - put_user_ex((void __user *)t->sas_ss_sp, &__uss->ss_sp); \ - put_user_ex(t->sas_ss_flags, &__uss->ss_flags); \ - put_user_ex(t->sas_ss_size, &__uss->ss_size); \ + unsafe_put_user((void __user *)t->sas_ss_sp, &__uss->ss_sp, label); \ + unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \ + unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \ if (t->sas_ss_flags & SS_AUTODISARM) \ sas_ss_reset(t); \ } while (0); -- Gitee From 49821a70ae1419b1de749629c23775f908ebf96e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 21:12:26 -0500 Subject: [PATCH 2078/2161] x86: setup_sigcontext(): list user_access_{begin,end}() into callers commit b00d8f8f0b2b39223c3fd6713d318aba95420264 upstream. Similar to ia32_setup_sigcontext() change several commits ago, make it __always_inline. In cases when there is a user_access_{begin,end}() section nearby, just move the call over there. Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/signal.c | 45 +++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d3aa0f567ead..f79b0743b8de 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -143,12 +143,10 @@ static bool restore_sigcontext(struct pt_regs *regs, IS_ENABLED(CONFIG_X86_32)); } -static int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, +static __always_inline int +__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { - if (!user_access_begin(sc, sizeof(struct sigcontext))) - return -EFAULT; - #ifdef CONFIG_X86_32 unsafe_put_user(get_user_gs(regs), (unsigned int __user *)&sc->gs, Efault); @@ -197,13 +195,17 @@ static int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, /* non-iBCS2 extensions.. */ unsafe_put_user(mask, &sc->oldmask, Efault); unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); - user_access_end(); return 0; Efault: - user_access_end(); return -EFAULT; } +#define unsafe_put_sigcontext(sc, fp, regs, set, label) \ +do { \ + if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \ + goto label; \ +} while(0); + /* * Set up a signal frame. 
*/ @@ -323,9 +325,9 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, struct sigframe __user *frame; void __user *restorer; int err = 0; - void __user *fpstate = NULL; + void __user *fp = NULL; - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); if (!access_ok(frame, sizeof(*frame))) return -EFAULT; @@ -333,8 +335,10 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, if (__put_user(sig, &frame->sig)) return -EFAULT; - if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) + if (!user_access_begin(&frame->sc, sizeof(struct sigcontext))) return -EFAULT; + unsafe_put_sigcontext(&frame->sc, fp, regs, set, Efault); + user_access_end(); if (__put_user(set->sig[1], &frame->extramask[0])) return -EFAULT; @@ -375,6 +379,10 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, regs->cs = __USER_CS; return 0; + +Efault: + user_access_end(); + return -EFAULT; } static int __setup_rt_frame(int sig, struct ksignal *ksig, @@ -383,9 +391,9 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, struct rt_sigframe __user *frame; void __user *restorer; int err = 0; - void __user *fpstate = NULL; + void __user *fp = NULL; - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; @@ -417,13 +425,11 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, * signal handler stack frames. */ unsafe_put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); user_access_end(); err |= copy_siginfo_to_user(&frame->info, &ksig->info); - err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - if (err) return -EFAULT; @@ -494,9 +500,8 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, /* Set up to return from userspace. If provided, use a stub already in userspace. 
*/ unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); user_access_end(); - - err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); err |= __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); if (err) @@ -554,12 +559,12 @@ static int x32_setup_rt_frame(struct ksignal *ksig, unsigned long uc_flags; void __user *restorer; int err = 0; - void __user *fpstate = NULL; + void __user *fp = NULL; if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) return -EFAULT; - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); if (!access_ok(frame, sizeof(*frame))) return -EFAULT; @@ -581,10 +586,8 @@ static int x32_setup_rt_frame(struct ksignal *ksig, unsafe_put_user(0, &frame->uc.uc__pad0, Efault); restorer = ksig->ka.sa.sa_restorer; unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); user_access_end(); - - err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); err |= __put_user(*(__u64 *)set, (__u64 __user *)&frame->uc.uc_sigmask); if (err) -- Gitee From bbfa88fae04ebf078c21e2c3434311bd5bb8099e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 21:18:02 -0500 Subject: [PATCH 2079/2161] x86: __setup_frame(): consolidate uaccess areas commit 5c1f178094631e8b9acc67e4a9b6e03abfbc2529 upstream. Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/signal.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index f79b0743b8de..b140703c6976 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -324,25 +324,16 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, { struct sigframe __user *frame; void __user *restorer; - int err = 0; void __user *fp = NULL; frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - if (!access_ok(frame, sizeof(*frame))) - return -EFAULT; - - if (__put_user(sig, &frame->sig)) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; - if (!user_access_begin(&frame->sc, sizeof(struct sigcontext))) - return -EFAULT; + unsafe_put_user(sig, &frame->sig, Efault); unsafe_put_sigcontext(&frame->sc, fp, regs, set, Efault); - user_access_end(); - - if (__put_user(set->sig[1], &frame->extramask[0])) - return -EFAULT; - + unsafe_put_user(set->sig[1], &frame->extramask[0], Efault); if (current->mm->context.vdso) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_sigreturn; @@ -352,7 +343,7 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, restorer = ksig->ka.sa.sa_restorer; /* Set up to return from userspace. */ - err |= __put_user(restorer, &frame->pretcode); + unsafe_put_user(restorer, &frame->pretcode, Efault); /* * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 @@ -361,10 +352,8 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, * reasons and because gdb uses it as a signature to notice * signal handler stack frames. 
*/ - err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode); - - if (err) - return -EFAULT; + unsafe_put_user(*((u64 *)&retcode), (u64 *)frame->retcode, Efault); + user_access_end(); /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; -- Gitee From 4964679dd70285e85a73babcd8b7a5598cfe8983 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 21:22:39 -0500 Subject: [PATCH 2080/2161] x86: __setup_rt_frame(): consolidate uaccess areas commit ead8e4e7e2c75ced6fcd9a53d3e9a2ecd7368553 upstream. reorder copy_siginfo_to_user() calls a bit Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/signal.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b140703c6976..8e3c0a775de5 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -379,7 +379,6 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, { struct rt_sigframe __user *frame; void __user *restorer; - int err = 0; void __user *fp = NULL; frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); @@ -415,11 +414,11 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, */ unsafe_put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode, Efault); unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_user(*(__u64 *)set, + (__u64 __user *)&frame->uc.uc_sigmask, Efault); user_access_end(); - err |= copy_siginfo_to_user(&frame->info, &ksig->info); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - if (err) + if (copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; /* Set up registers for signal handler */ @@ -461,23 +460,14 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, struct rt_sigframe __user *frame; void __user *fp = NULL; unsigned long uc_flags; - int err = 0; /* x86-64 should always use SA_RESTORER. */ if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) return -EFAULT; frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); - - if (!access_ok(frame, sizeof(*frame))) - return -EFAULT; - - if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (copy_siginfo_to_user(&frame->info, &ksig->info)) - return -EFAULT; - } - uc_flags = frame_uc_flags(regs); + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; @@ -490,11 +480,13 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, already in userspace. */ unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0], Efault); user_access_end(); - err |= __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); - if (err) - return -EFAULT; + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user(&frame->info, &ksig->info)) + return -EFAULT; + } /* Set up registers for signal handler */ regs->di = sig; -- Gitee From a6c298b2229261d37e480d69ccfddbc85da3feb1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 21:25:14 -0500 Subject: [PATCH 2081/2161] x86: x32_setup_rt_frame(): consolidate uaccess areas commit 791612e9668cecbf5dd24d13400ac74e099f005c upstream. 
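One detail recurs in this and the following patches: copy_siginfo_to_user() and similar out-of-line helpers manage their own user access, so they are kept outside the user_access_begin()/user_access_end() section and merely reordered before or after it. A sketch with a hypothetical frame layout (the real reordering is in the diffs):

#include <linux/signal.h>
#include <linux/types.h>
#include <linux/uaccess.h>

/* Illustrative only. */
struct demo_rt_frame {
	struct siginfo info;
	u64 uc_sigmask;
};

static int demo_setup_rt(struct demo_rt_frame __user *frame,
			 const kernel_siginfo_t *info, u64 sigmask)
{
	if (!user_access_begin(frame, sizeof(*frame)))
		return -EFAULT;
	unsafe_put_user(sigmask, &frame->uc_sigmask, Efault);
	user_access_end();

	/* Self-contained user access: stays outside the section above. */
	if (copy_siginfo_to_user(&frame->info, info))
		return -EFAULT;
	return 0;

Efault:
	user_access_end();
	return -EFAULT;
}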
Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/signal.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 8e3c0a775de5..a910f07c6f16 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -539,7 +539,6 @@ static int x32_setup_rt_frame(struct ksignal *ksig, struct rt_sigframe_x32 __user *frame; unsigned long uc_flags; void __user *restorer; - int err = 0; void __user *fp = NULL; if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) @@ -547,14 +546,6 @@ static int x32_setup_rt_frame(struct ksignal *ksig, frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - if (!access_ok(frame, sizeof(*frame))) - return -EFAULT; - - if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true)) - return -EFAULT; - } - uc_flags = frame_uc_flags(regs); if (!user_access_begin(frame, sizeof(*frame))) @@ -568,11 +559,13 @@ static int x32_setup_rt_frame(struct ksignal *ksig, restorer = ksig->ka.sa.sa_restorer; unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_user(*(__u64 *)set, (__u64 __user *)&frame->uc.uc_sigmask, Efault); user_access_end(); - err |= __put_user(*(__u64 *)set, (__u64 __user *)&frame->uc.uc_sigmask); - if (err) - return -EFAULT; + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true)) + return -EFAULT; + } /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; -- Gitee From f67a978aea6491d3f5ea5dd4a6e23026f861a97a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Feb 2020 21:36:52 -0500 Subject: [PATCH 2082/2161] x86: unsafe_put-style macro for sigmask commit b87df6594486626a9ae5944807307f2604cea3e2 upstream. regularizes things a bit Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- arch/x86/kernel/signal.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index a910f07c6f16..9fe6b025f4bd 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -206,6 +206,11 @@ do { \ goto label; \ } while(0); +#define unsafe_put_sigmask(set, frame, label) \ + unsafe_put_user(*(__u64 *)(set), \ + (__u64 __user *)&(frame)->uc.uc_sigmask, \ + label) + /* * Set up a signal frame. */ @@ -414,8 +419,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, */ unsafe_put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode, Efault); unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); - unsafe_put_user(*(__u64 *)set, - (__u64 __user *)&frame->uc.uc_sigmask, Efault); + unsafe_put_sigmask(set, frame, Efault); user_access_end(); if (copy_siginfo_to_user(&frame->info, &ksig->info)) @@ -480,7 +484,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, already in userspace. 
*/ unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); - unsafe_put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0], Efault); + unsafe_put_sigmask(set, frame, Efault); user_access_end(); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { @@ -559,7 +563,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, restorer = ksig->ka.sa.sa_restorer; unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); - unsafe_put_user(*(__u64 *)set, (__u64 __user *)&frame->uc.uc_sigmask, Efault); + unsafe_put_sigmask(set, frame, Efault); user_access_end(); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { -- Gitee From 8f526d7d5858bc189114e09b37f735944365a6e4 Mon Sep 17 00:00:00 2001 From: Chen Zhuo Date: Wed, 7 Dec 2022 16:23:32 +0800 Subject: [PATCH 2083/2161] kill uaccess_try() commit cf122cfba5b1d9daf64009d143f51dfec4b1705a upstream. finally Signed-off-by: Al Viro Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- Documentation/x86/exception-tables.rst | 6 --- arch/x86/include/asm/asm.h | 6 --- arch/x86/include/asm/processor.h | 1 - arch/x86/include/asm/uaccess.h | 65 -------------------------- arch/x86/mm/extable.c | 12 ----- 5 files changed, 90 deletions(-) diff --git a/Documentation/x86/exception-tables.rst b/Documentation/x86/exception-tables.rst index ed6d4b0cf62c..514f51829da7 100644 --- a/Documentation/x86/exception-tables.rst +++ b/Documentation/x86/exception-tables.rst @@ -337,10 +337,4 @@ pointer which points to one of: entry->insn. It is used to distinguish page faults from machine check. -3) ``int ex_handler_ext(const struct exception_table_entry *fixup)`` - This case is used for uaccess_err ... we need to set a flag - in the task structure. Before the handler functions existed this - case was handled by adding a large offset to the fixup to tag - it as special. - More functions can easily be added. diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 03cbd219f00a..13dd803c7438 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -132,9 +132,6 @@ .long type ; \ .popsection -# define _ASM_EXTABLE_EX(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) - # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ _ASM_ALIGN ; \ @@ -151,9 +148,6 @@ " .long " __stringify(type) " \n" \ " .popsection\n" -# define _ASM_EXTABLE_EX(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) - /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 00389a69f807..e125879d8b3e 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -492,7 +492,6 @@ struct thread_struct { mm_segment_t addr_limit; unsigned int sig_on_uaccess_err:1; - unsigned int uaccess_err:1; /* uaccess failed */ /* * Protection Keys Register for Userspace. 
Loaded immediately on diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 4dc5accdd826..d11662f753d2 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -193,23 +193,12 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) : : "A" (x), "r" (addr) \ : : label) -#define __put_user_asm_ex_u64(x, addr) \ - asm volatile("\n" \ - "1: movl %%eax,0(%1)\n" \ - "2: movl %%edx,4(%1)\n" \ - "3:" \ - _ASM_EXTABLE_EX(1b, 2b) \ - _ASM_EXTABLE_EX(2b, 3b) \ - : : "A" (x), "r" (addr)) - #define __put_user_x8(x, ptr, __ret_pu) \ asm volatile("call __put_user_8" : "=a" (__ret_pu) \ : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") #else #define __put_user_goto_u64(x, ptr, label) \ __put_user_goto(x, ptr, "q", "", "er", label) -#define __put_user_asm_ex_u64(x, addr) \ - __put_user_asm_ex(x, addr, "q", "", "er") #define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) #endif @@ -289,31 +278,6 @@ do { \ } \ } while (0) -/* - * This doesn't do __uaccess_begin/end - the exception handling - * around it must do that. - */ -#define __put_user_size_ex(x, ptr, size) \ -do { \ - __chk_user_ptr(ptr); \ - switch (size) { \ - case 1: \ - __put_user_asm_ex(x, ptr, "b", "b", "iq"); \ - break; \ - case 2: \ - __put_user_asm_ex(x, ptr, "w", "w", "ir"); \ - break; \ - case 4: \ - __put_user_asm_ex(x, ptr, "l", "k", "ir"); \ - break; \ - case 8: \ - __put_user_asm_ex_u64((__typeof__(*ptr))(x), ptr); \ - break; \ - default: \ - __put_user_bad(); \ - } \ -} while (0) - #ifdef CONFIG_X86_32 #define __get_user_asm_u64(x, ptr, retval, errret) \ ({ \ @@ -430,29 +394,6 @@ struct __large_struct { unsigned long buf[100]; }; retval = __put_user_failed(x, addr, itype, rtype, ltype, errret); \ } while (0) -#define __put_user_asm_ex(x, addr, itype, rtype, ltype) \ - asm volatile("1: mov"itype" %"rtype"0,%1\n" \ - "2:\n" \ - _ASM_EXTABLE_EX(1b, 2b) \ - : : ltype(x), "m" (__m(addr))) - -/* - * uaccess_try and catch - */ -#define uaccess_try do { \ - current->thread.uaccess_err = 0; \ - __uaccess_begin(); \ - barrier(); - -#define uaccess_try_nospec do { \ - current->thread.uaccess_err = 0; \ - __uaccess_begin_nospec(); \ - -#define uaccess_catch(err) \ - __uaccess_end(); \ - (err) |= (current->thread.uaccess_err ? -EFAULT : 0); \ -} while (0) - /** * __get_user - Get a simple variable from user space, with less checking. * @x: Variable to store result. 
@@ -502,12 +443,6 @@ struct __large_struct { unsigned long buf[100]; }; #define __put_user(x, ptr) \ __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) -#define put_user_try uaccess_try -#define put_user_catch(err) uaccess_catch(err) - -#define put_user_ex(x, ptr) \ - __put_user_size_ex((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) - extern unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n); extern __must_check long diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 26f01015239f..ce6213294912 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -57,18 +57,6 @@ static bool ex_handler_uaccess(const struct exception_table_entry *fixup, return ex_handler_default(fixup, regs); } -__visible bool ex_handler_ext(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) -{ - /* Special hack for uaccess_err */ - current->thread.uaccess_err = 1; - regs->ip = ex_fixup_addr(fixup); - return true; -} -EXPORT_SYMBOL(ex_handler_ext); - static bool ex_handler_copy(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { -- Gitee From 82210dbcf76e351ce85ed6f85493798d24b53d96 Mon Sep 17 00:00:00 2001 From: "Kun(llfl)" Date: Mon, 17 Oct 2022 11:05:11 +0800 Subject: [PATCH 2084/2161] anolis: crypto: iax: Fix deflate crash caused by global shared tfm commit openanolis. ANBZ: #2423 When iax deflate calls fallback to generic deflate, they use the same global shared tfm: deflate_generic_tfm. This shared tfm makes a deflate fallback call crash. So fix it. Signed-off-by: Kun(llfl) Link: https://gitee.com/anolis/cloud-kernel/pulls/774 Reviewed-by: Xiaochen Shen Reviewed-by: Zelin Deng Signed-off-by: Xinghui Li --- drivers/crypto/iax/iax_crypto_main.c | 57 ++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/drivers/crypto/iax/iax_crypto_main.c b/drivers/crypto/iax/iax_crypto_main.c index be5810b14e7b..d2e2c3843c40 100644 --- a/drivers/crypto/iax/iax_crypto_main.c +++ b/drivers/crypto/iax/iax_crypto_main.c @@ -37,7 +37,33 @@ MODULE_PARM_DESC(iax_verify_compress, static LIST_HEAD(iax_devices); static DEFINE_SPINLOCK(iax_devices_lock); -static struct crypto_comp *deflate_generic_tfm; +struct deflate_generic_ctx { + struct crypto_comp *deflate_generic_tfm; +}; + +static int iax_deflate_generic_init(struct crypto_tfm *tfm) +{ + struct deflate_generic_ctx *ctx = crypto_tfm_ctx(tfm); + struct crypto_comp *deflate_tfm; + + if (crypto_has_comp("deflate-generic", 0, 0)) + deflate_tfm = crypto_alloc_comp("deflate-generic", 0, 0); + + if (IS_ERR_OR_NULL(deflate_tfm)) { + pr_err("IAX could not alloc %s tfm: errcode = %ld\n", + "deflate-generic", PTR_ERR(deflate_tfm)); + return -ENOMEM; + } + ctx->deflate_generic_tfm = deflate_tfm; + return 0; +} + +static void iax_deflate_generic_exit(struct crypto_tfm *tfm) +{ + struct deflate_generic_ctx *ctx = crypto_tfm_ctx(tfm); + + crypto_free_comp(ctx->deflate_generic_tfm); +} static int iax_wqs_get(struct iax_device *iax_device) { @@ -874,12 +900,13 @@ static int iax_comp_compress(struct crypto_tfm *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen) { + struct deflate_generic_ctx *ctx = crypto_tfm_ctx(tfm); u64 start_time_ns; int ret = 0; if (!iax_crypto_enabled) { pr_debug("%s: iax_crypto disabled, using deflate-generic compression\n", __func__); - ret = crypto_comp_compress(deflate_generic_tfm, + ret = crypto_comp_compress(ctx->deflate_generic_tfm, src, 
slen, dst, dlen); return ret; } @@ -900,12 +927,13 @@ static int iax_comp_decompress(struct crypto_tfm *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen) { + struct deflate_generic_ctx *ctx = crypto_tfm_ctx(tfm); u64 start_time_ns; int ret = 0; if (!iax_crypto_enabled) { pr_debug("%s: iax_crypto disabled, using deflate-generic decompression\n", __func__); - ret = crypto_comp_decompress(deflate_generic_tfm, + ret = crypto_comp_decompress(ctx->deflate_generic_tfm, src, slen, dst, dlen); return ret; } @@ -928,6 +956,9 @@ static struct crypto_alg iax_comp_deflate = { .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, .cra_priority = IAX_ALG_PRIORITY, .cra_module = THIS_MODULE, + .cra_ctxsize = sizeof(struct deflate_generic_ctx), + .cra_init = iax_deflate_generic_init, + .cra_exit = iax_deflate_generic_exit, .cra_u = { .compress = { .coa_compress = iax_comp_compress, @@ -939,6 +970,7 @@ static struct crypto_alg iax_comp_deflate = { static int iax_comp_acompress(struct acomp_req *req) { struct crypto_tfm *tfm = req->base.tfm; + struct deflate_generic_ctx *ctx = crypto_tfm_ctx(tfm); u64 start_time_ns; void *src, *dst; int ret = 0; @@ -948,7 +980,7 @@ static int iax_comp_acompress(struct acomp_req *req) if (!iax_crypto_enabled) { pr_debug("%s: iax_crypto disabled, using deflate-generic compression\n", __func__); - ret = crypto_comp_compress(deflate_generic_tfm, + ret = crypto_comp_compress(ctx->deflate_generic_tfm, src, req->slen, dst, &req->dlen); kunmap_atomic(src); kunmap_atomic(dst); @@ -976,6 +1008,7 @@ static int iax_comp_acompress(struct acomp_req *req) static int iax_comp_adecompress(struct acomp_req *req) { struct crypto_tfm *tfm = req->base.tfm; + struct deflate_generic_ctx *ctx = crypto_tfm_ctx(tfm); u64 start_time_ns; void *src, *dst; int ret; @@ -985,7 +1018,7 @@ static int iax_comp_adecompress(struct acomp_req *req) if (!iax_crypto_enabled) { pr_debug("%s: iax_crypto disabled, using deflate-generic decompression\n", __func__); - ret = crypto_comp_decompress(deflate_generic_tfm, + ret = crypto_comp_decompress(ctx->deflate_generic_tfm, src, req->slen, dst, &req->dlen); kunmap_atomic(src); kunmap_atomic(dst); @@ -1016,6 +1049,9 @@ static struct acomp_alg iax_acomp_deflate = { .cra_name = "deflate", .cra_driver_name = "iax_crypto", .cra_module = THIS_MODULE, + .cra_ctxsize = sizeof(struct deflate_generic_ctx), + .cra_init = iax_deflate_generic_init, + .cra_exit = iax_deflate_generic_exit, .cra_priority = IAX_ALG_PRIORITY, } }; @@ -1175,15 +1211,6 @@ static int __init iax_crypto_init_module(void) nr_cpus = num_online_cpus(); nr_nodes = num_online_nodes(); - if (crypto_has_comp("deflate-generic", 0, 0)) - deflate_generic_tfm = crypto_alloc_comp("deflate-generic", 0, 0); - - if (IS_ERR_OR_NULL(deflate_generic_tfm)) { - pr_err("IAX could not alloc %s tfm: errcode = %ld\n", - "deflate-generic", PTR_ERR(deflate_generic_tfm)); - return -ENOMEM; - } - wq_table = alloc_percpu(struct idxd_wq *); if (!wq_table) return -ENOMEM; @@ -1211,7 +1238,6 @@ static int __init iax_crypto_init_module(void) err_crypto_register: idxd_driver_unregister(&iax_crypto_driver); err_driver_register: - crypto_free_comp(deflate_generic_tfm); free_percpu(wq_table); goto out; @@ -1224,7 +1250,6 @@ static void __exit iax_crypto_cleanup_module(void) iax_unregister_compression_device(); free_percpu(wq_table); free_iax_devices(); - crypto_free_comp(deflate_generic_tfm); pr_info("%s: cleaned up\n", __func__); } -- Gitee From 16377e021ca8162ce2f49b18c5ed5d1af566b35e Mon Sep 17 00:00:00 2001 From: Aubrey Li Date: Fri, 15 
Apr 2022 21:53:13 +0800 Subject: [PATCH 2085/2161] ACPI/PRM: move mutex under PRM_CMD_RUN_SERVICE commit opencloudos. PRM_CMD_START_TRANSACTION and PRM_CMD_END_TRANSACTION may not be invoked by _DSM method when run PRM service, so move mutex under PRM_CMD_RUN_SERVICE to protect PRM handler. Signed-off-by: Aubrey Li --- drivers/acpi/prmt.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c index 9dc59c741696..3b3022b94ae3 100644 --- a/drivers/acpi/prmt.c +++ b/drivers/acpi/prmt.c @@ -312,10 +312,13 @@ static acpi_status acpi_platformrt_space_handler(u32 function, switch (buffer->prm_cmd) { case PRM_CMD_RUN_SERVICE: + mutex_lock(&prm_mutex); handler = find_prm_handler(&buffer->handler_guid); module = find_prm_module(&buffer->handler_guid); - if (!handler || !module) + if (!handler || !module) { + mutex_unlock(&prm_mutex); goto invalid_guid; + } ACPI_COPY_NAMESEG(context.signature, "PRMC"); context.revision = 0x0; @@ -327,6 +330,7 @@ static acpi_status acpi_platformrt_space_handler(u32 function, status = efi_call_virt_pointer(handler, handler_addr, handler->acpi_param_buffer_addr, &context); + mutex_unlock(&prm_mutex); if (status == EFI_SUCCESS) { buffer->prm_status = PRM_HANDLER_SUCCESS; } else { @@ -337,7 +341,6 @@ static acpi_status acpi_platformrt_space_handler(u32 function, case PRM_CMD_START_TRANSACTION: - mutex_lock(&prm_mutex); module = find_prm_module(&buffer->handler_guid); if (!module) goto invalid_guid; @@ -358,7 +361,6 @@ static acpi_status acpi_platformrt_space_handler(u32 function, buffer->prm_status = UPDATE_UNLOCK_WITHOUT_LOCK; else module->updatable = true; - mutex_unlock(&prm_mutex); break; default: -- Gitee From a028c1006a30fcdcac72d2a62c76a5019bc8b818 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 24 Feb 2020 13:13:31 +0100 Subject: [PATCH 2086/2161] printk: Prepare for nested printk_nmi_enter() commit 2ade0d60939bcd54197c133b03b460fe62a4ec47 upstream. There is plenty of space in the printk_context variable. Reserve one byte there for the NMI context to be on the safe side. It should never overflow. The BUG_ON(in_nmi() == NMI_MASK) in nmi_enter() will trigger much earlier. Intel-SIG: commit 2ade0d60939b printk: Prepare for nested printk_nmi_enter(). Backport to kernel 5.4 to enhance MCA-R. 
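To make the counting scheme concrete, here is a minimal standalone C sketch of the idea; the field width and constants below are purely illustrative and are not the kernel's actual PRINTK_* values. Nesting is tracked by adding and subtracting one unit of a reserved byte in the context word, so a nested enter/exit pair no longer clears the outer NMI state the way a single flag bit would.

#include <stdio.h>

/*
 * Illustrative layout only -- NOT the kernel's PRINTK_* constants:
 * one byte of a 32-bit context word is reserved as an NMI nesting
 * counter, and enter/exit add/subtract one unit of that field.
 */
#define NMI_CONTEXT_MASK	0xff000000u
#define NMI_CONTEXT_OFFSET	0x01000000u

static unsigned int printk_context;	/* stands in for the per-CPU variable */

static void nmi_enter(void) { printk_context += NMI_CONTEXT_OFFSET; }
static void nmi_exit(void)  { printk_context -= NMI_CONTEXT_OFFSET; }

static unsigned int nmi_depth(void)
{
	return (printk_context & NMI_CONTEXT_MASK) >> 24;
}

int main(void)
{
	nmi_enter();				/* outer NMI */
	nmi_enter();				/* nested NMI */
	printf("depth=%u\n", nmi_depth());	/* 2 */
	nmi_exit();
	printf("depth=%u\n", nmi_depth());	/* 1 -- still in NMI context */
	nmi_exit();
	printf("depth=%u\n", nmi_depth());	/* 0 */
	return 0;
}

With a single flag bit that is OR'd on enter and cleared on exit, the inner exit would already report depth 0; the counter keeps the outer NMI context intact until the matching exit, which is what the mask/offset change above enables.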
Signed-off-by: Petr Mladek Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134100.681374113@linutronix.de [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- kernel/printk/internal.h | 8 +++++--- kernel/printk/printk_safe.c | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index b2b0f526f249..660f9a6bf73a 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -6,9 +6,11 @@ #ifdef CONFIG_PRINTK -#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff -#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x40000000 -#define PRINTK_NMI_CONTEXT_MASK 0x80000000 +#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff +#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000 +#define PRINTK_NMI_CONTEXT_MASK 0xff0000000 + +#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000 extern raw_spinlock_t logbuf_lock; diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 6cfc5a00c67d..8078205f5b10 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -303,12 +303,12 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) void notrace printk_nmi_enter(void) { - this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); + this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET); } void notrace printk_nmi_exit(void) { - this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); + this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET); } /* -- Gitee From b4666c06a2d6cff004dbb24780b5b75e10c25187 Mon Sep 17 00:00:00 2001 From: jschoenh Date: Fri, 3 Jan 2020 16:07:17 +0100 Subject: [PATCH 2087/2161] x86/mce: Take action on UCNA/Deferred errors again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8438b84ab42d9a67df33633258e0865c5761d2d4 upstream. Commit fa92c5869426 ("x86, mce: Support memory error recovery for both UCNA and Deferred error in machine_check_poll") added handling of UCNA and Deferred errors by adding them to the ring for SRAO errors. Later, commit fd4cf79fcc4b ("x86/mce: Remove the MCE ring for Action Optional errors") switched storage from the SRAO ring to the unified pool that is still in use today. In order to only act on the intended errors, a filter for MCE_AO_SEVERITY is used -- effectively removing handling of UCNA/Deferred errors again. Extend the severity filter to include UCNA/Deferred errors again. Also, generalize the naming of the notifier from SRAO to UC to capture the extended scope. Note, that this change may cause a message like the following to appear, as the same address may be reported as SRAO and as UCNA: Memory failure: 0x5fe3284: already hardware poisoned Technically, this is a return to previous behavior. Intel-SIG: commit 8438b84ab42d x86/mce: Take action on UCNA/Deferred errors again. Backport to kernel 5.4 to enhance MCA-R. Signed-off-by: Jan H. 
Schönherr Signed-off-by: Borislav Petkov Acked-by: Tony Luck Sink: https://lkml.kernel.org/r/20200103150722.20313-2-jschoenh@amazon.de [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mce/core.c | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index b0b54e18aa8b..dd2e21b130eb 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -172,7 +172,7 @@ enum mce_notifier_prios { MCE_PRIO_EDAC, MCE_PRIO_NFIT, MCE_PRIO_EXTLOG, - MCE_PRIO_SRAO, + MCE_PRIO_UC, MCE_PRIO_EARLY, MCE_PRIO_CEC }; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index e202cc618cd1..cdca07a43e39 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -168,8 +168,6 @@ void mce_inject_log(struct mce *m) } EXPORT_SYMBOL_GPL(mce_inject_log); -static struct notifier_block mce_srao_nb; - void mce_register_decode_chain(struct notifier_block *nb) { if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC)) @@ -615,28 +613,30 @@ static struct notifier_block early_nb = { .priority = MCE_PRIO_EARLY, }; -static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, +static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct mce *mce = (struct mce *)data; unsigned long pfn; - if (!mce) + if (!mce || !mce_usable_address(mce)) return NOTIFY_DONE; - if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) { - pfn = mce->addr >> PAGE_SHIFT; - if (!memory_failure(pfn, 0)) { - set_mce_nospec(pfn, whole_page(mce)); - mce->kflags |= MCE_HANDLED_UC; - } + if (mce->severity != MCE_AO_SEVERITY && + mce->severity != MCE_DEFERRED_SEVERITY) + return NOTIFY_DONE; + + pfn = mce->addr >> PAGE_SHIFT; + if (!memory_failure(pfn, 0)) { + set_mce_nospec(pfn, whole_page(mce)); + mce->kflags |= MCE_HANDLED_UC; } return NOTIFY_OK; } -static struct notifier_block mce_srao_nb = { - .notifier_call = srao_decode_notifier, - .priority = MCE_PRIO_SRAO, +static struct notifier_block mce_uc_nb = { + .notifier_call = uc_decode_notifier, + .priority = MCE_PRIO_UC, }; static int mce_default_notifier(struct notifier_block *nb, unsigned long val, @@ -2048,7 +2048,7 @@ int __init mcheck_init(void) { mcheck_intel_therm_init(); mce_register_decode_chain(&early_nb); - mce_register_decode_chain(&mce_srao_nb); + mce_register_decode_chain(&mce_uc_nb); mce_register_decode_chain(&mce_default_nb); mcheck_vendor_init_severity(); -- Gitee From f78e0dad8905af5f6ba1d6fd55e7610206611774 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 24 Jun 2021 18:39:55 -0700 Subject: [PATCH 2088/2161] mm/memory-failure: use a mutex to avoid memory_failure() races commit 171936ddaf97e6f4e1264f4128bb5cf15691339c upstream. Patch series "mm,hwpoison: fix sending SIGBUS for Action Required MCE", v5. I wrote this patchset to materialize what I think is the current allowable solution mentioned by the previous discussion [1]. I simply borrowed Tony's mutex patch and Aili's return code patch, then I queued another one to find error virtual address in the best effort manner. I know that this is not a perfect solution, but should work for some typical case. [1]: https://lore.kernel.org/linux-mm/20210331192540.2141052f@alex-virtual-machine/ This patch (of 2): There can be races when multiple CPUs consume poison from the same page. 
The first into memory_failure() atomically sets the HWPoison page flag and begins hunting for tasks that map this page. Eventually it invalidates those mappings and may send a SIGBUS to the affected tasks. But while all that work is going on, other CPUs see a "success" return code from memory_failure() and so they believe the error has been handled and continue executing. Fix by wrapping most of the internal parts of memory_failure() in a mutex. Intel-SIG: commit 171936ddaf97 mm/memory-failure: use a mutex to avoid memory_failure() races. Backport to kernel 5.4 to enhance MCA-R. [akpm@linux-foundation.org: make mf_mutex local to memory_failure()] Link: https://lkml.kernel.org/r/20210521030156.2612074-1-nao.horiguchi@gmail.com Link: https://lkml.kernel.org/r/20210521030156.2612074-2-nao.horiguchi@gmail.com Signed-off-by: Tony Luck Signed-off-by: Naoya Horiguchi Reviewed-by: Borislav Petkov Reviewed-by: Oscar Salvador Cc: Aili Yao Cc: Andy Lutomirski Cc: Borislav Petkov Cc: David Hildenbrand Cc: Jue Wang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- mm/memory-failure.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 63e188a7d40b..eabede98e8ee 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1251,8 +1251,9 @@ int memory_failure(unsigned long pfn, int flags) struct page *hpage; struct page *orig_head; struct dev_pagemap *pgmap; - int res; + int res = 0; unsigned long page_flags; + static DEFINE_MUTEX(mf_mutex); if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); @@ -1270,12 +1271,17 @@ int memory_failure(unsigned long pfn, int flags) return -ENXIO; } - if (PageHuge(p)) - return memory_failure_hugetlb(pfn, flags); + mutex_lock(&mf_mutex); + + if (PageHuge(p)) { + res = memory_failure_hugetlb(pfn, flags); + goto unlock_mutex; + } + if (TestSetPageHWPoison(p)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); - return 0; + goto unlock_mutex; } orig_head = hpage = compound_head(p); @@ -1298,8 +1304,9 @@ int memory_failure(unsigned long pfn, int flags) return 0; } else { action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); - return -EBUSY; + res = -EBUSY; } + goto unlock_mutex; } if (PageTransHuge(hpage)) { @@ -1315,7 +1322,8 @@ int memory_failure(unsigned long pfn, int flags) if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); put_hwpoison_page(p); - return -EBUSY; + res = -EBUSY; + goto unlock_mutex; } unlock_page(p); VM_BUG_ON_PAGE(!page_count(p), p); @@ -1337,7 +1345,8 @@ int memory_failure(unsigned long pfn, int flags) action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); else action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED); - return 0; + res = 0; + goto unlock_mutex; } lock_page(p); @@ -1349,7 +1358,7 @@ int memory_failure(unsigned long pfn, int flags) if (PageCompound(p) && compound_head(p) != orig_head) { action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); res = -EBUSY; - goto out; + goto unlock_page; } /* @@ -1372,14 +1381,14 @@ int memory_failure(unsigned long pfn, int flags) num_poisoned_pages_dec(); unlock_page(p); put_hwpoison_page(p); - return 0; + goto unlock_mutex; } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); unlock_page(p); put_hwpoison_page(p); - return 0; + goto unlock_mutex; } if (!PageTransTail(p) && !PageLRU(p)) @@ -1401,7 +1410,7 @@ int memory_failure(unsigned long pfn, int flags) if 
(!hwpoison_user_mappings(p, pfn, flags, &hpage)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; - goto out; + goto unlock_page; } /* @@ -1410,13 +1419,15 @@ int memory_failure(unsigned long pfn, int flags) if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); res = -EBUSY; - goto out; + goto unlock_page; } identify_page_state: res = identify_page_state(pfn, p, page_flags); -out: +unlock_page: unlock_page(p); +unlock_mutex: + mutex_unlock(&mf_mutex); return res; } EXPORT_SYMBOL_GPL(memory_failure); -- Gitee From 99b38689049a61f2aca085e79bc78dc54ff08873 Mon Sep 17 00:00:00 2001 From: Aili Yao Date: Thu, 24 Jun 2021 18:39:58 -0700 Subject: [PATCH 2089/2161] mm,hwpoison: return -EHWPOISON to denote that the page has already been poisoned commit 47af12bae17f99b5e77f8651cb7f3e1877610acf upstream. When memory_failure() is called with MF_ACTION_REQUIRED on the page that has already been hwpoisoned, memory_failure() could fail to send SIGBUS to the affected process, which results in infinite loop of MCEs. Currently memory_failure() returns 0 if it's called for already hwpoisoned page, then the caller, kill_me_maybe(), could return without sending SIGBUS to current process. An action required MCE is raised when the current process accesses to the broken memory, so no SIGBUS means that the current process continues to run and access to the error page again soon, so running into MCE loop. This issue can arise for example in the following scenarios: - Two or more threads access to the poisoned page concurrently. If local MCE is enabled, MCE handler independently handles the MCE events. So there's a race among MCE events, and the second or latter threads fall into the situation in question. - If there was a precedent memory error event and memory_failure() for the event failed to unmap the error page for some reason, the subsequent memory access to the error page triggers the MCE loop situation. To fix the issue, make memory_failure() return an error code when the error page has already been hwpoisoned. This allows memory error handler to control how it sends signals to userspace. And make sure that any process touching a hwpoisoned page should get a SIGBUS even in "already hwpoisoned" path of memory_failure() as is done in page fault path. Intel-SIG: commit 47af12bae17f mm,hwpoison: return -EHWPOISON to denote that the page has already been poisoned. Backport to kernel 5.4 to enhance MCA-R. 
Link: https://lkml.kernel.org/r/20210521030156.2612074-3-nao.horiguchi@gmail.com Signed-off-by: Aili Yao Signed-off-by: Naoya Horiguchi Reviewed-by: Oscar Salvador Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: David Hildenbrand Cc: Jue Wang Cc: Tony Luck Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- mm/memory-failure.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index eabede98e8ee..92a2126e6318 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1091,7 +1091,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) if (TestSetPageHWPoison(head)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); - return 0; + return -EHWPOISON; } num_poisoned_pages_inc(); @@ -1281,6 +1281,7 @@ int memory_failure(unsigned long pfn, int flags) if (TestSetPageHWPoison(p)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); + res = -EHWPOISON; goto unlock_mutex; } -- Gitee From 6ad85a30d19748afa2b5f04010b19f985901f2b8 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 24 Jun 2021 18:40:01 -0700 Subject: [PATCH 2090/2161] mm/hwpoison: do not lock page again when me_huge_page() successfully recovers commit ea6d0630100b285f059d0a8d8e86f38a46407536 upstream. Currently me_huge_page() temporary unlocks page to perform some actions then locks it again later. My testcase (which calls hard-offline on some tail page in a hugetlb, then accesses the address of the hugetlb range) showed that page allocation code detects this page lock on buddy page and printed out "BUG: Bad page state" message. check_new_page_bad() does not consider a page with __PG_HWPOISON as bad page, so this flag works as kind of filter, but this filtering doesn't work in this case because the "bad page" is not the actual hwpoisoned page. So stop locking page again. Actions to be taken depend on the page type of the error, so page unlocking should be done in ->action() callbacks. So let's make it assumed and change all existing callbacks that way. Intel-SIG: commit ea6d0630100b mm/hwpoison: do not lock page again when me_huge_page() successfully recovers. Backport to kernel 5.4 to enhance MCA-R. 
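The unlock convention being adopted here can be sketched in a few lines of standalone C (hypothetical names, not the kernel's page_state table): the dispatcher takes the lock, and every callback owns releasing it before it returns, so no callback has to drop and then re-acquire a lock around its own work.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

/* Convention: each handler releases page_lock itself before returning. */
static int handle_clean(void) { puts("clean page"); pthread_mutex_unlock(&page_lock); return 0; }
static int handle_huge(void)  { puts("huge page");  pthread_mutex_unlock(&page_lock); return 0; }

static int (*const handlers[])(void) = { handle_clean, handle_huge };

static int dispatch(unsigned int kind)
{
	pthread_mutex_lock(&page_lock);
	/* The callback, not the dispatcher, is responsible for unlocking. */
	return handlers[kind % 2]();
}

int main(void)
{
	dispatch(0);
	dispatch(1);
	return 0;
}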
Link: https://lkml.kernel.org/r/20210609072029.74645-1-nao.horiguchi@gmail.com Fixes: commit 78bb920344b8 ("mm: hwpoison: dissolve in-use hugepage in unrecoverable memory error") Signed-off-by: Naoya Horiguchi Cc: Oscar Salvador Cc: Michal Hocko Cc: Tony Luck Cc: "Aneesh Kumar K.V" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- mm/memory-failure.c | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 92a2126e6318..e621e5001d46 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -630,6 +630,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn, */ static int me_kernel(struct page *p, unsigned long pfn) { + unlock_page(p); return MF_IGNORED; } @@ -639,6 +640,7 @@ static int me_kernel(struct page *p, unsigned long pfn) static int me_unknown(struct page *p, unsigned long pfn) { pr_err("Memory failure: %#lx: Unknown page state\n", pfn); + unlock_page(p); return MF_FAILED; } @@ -647,6 +649,7 @@ static int me_unknown(struct page *p, unsigned long pfn) */ static int me_pagecache_clean(struct page *p, unsigned long pfn) { + int ret; struct address_space *mapping; delete_from_lru_cache(p); @@ -655,8 +658,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * For anonymous pages we're done the only reference left * should be the one m_f() holds. */ - if (PageAnon(p)) - return MF_RECOVERED; + if (PageAnon(p)) { + ret = MF_RECOVERED; + goto out; + } /* * Now truncate the page in the page cache. This is really @@ -670,7 +675,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) /* * Page has been teared down in the meanwhile */ - return MF_FAILED; + ret = MF_FAILED; + goto out; } /* @@ -678,7 +684,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * * Open: to take i_mutex or not for this? Right now we don't. */ - return truncate_error_page(p, pfn, mapping); + ret = truncate_error_page(p, pfn, mapping); +out: + unlock_page(p); + return ret; } /* @@ -754,24 +763,26 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) */ static int me_swapcache_dirty(struct page *p, unsigned long pfn) { + int ret; + ClearPageDirty(p); /* Trigger EIO in shmem: */ ClearPageUptodate(p); - if (!delete_from_lru_cache(p)) - return MF_DELAYED; - else - return MF_FAILED; + ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED; + unlock_page(p); + return ret; } static int me_swapcache_clean(struct page *p, unsigned long pfn) { + int ret; + delete_from_swap_cache(p); - if (!delete_from_lru_cache(p)) - return MF_RECOVERED; - else - return MF_FAILED; + ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; + unlock_page(p); + return ret; } /* @@ -792,6 +803,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) mapping = page_mapping(hpage); if (mapping) { res = truncate_error_page(hpage, pfn, mapping); + unlock_page(hpage); } else { unlock_page(hpage); /* @@ -803,7 +815,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) put_page(hpage); dissolve_free_huge_page(p); res = MF_RECOVERED; - lock_page(hpage); } return res; @@ -836,6 +847,8 @@ static struct page_state { unsigned long mask; unsigned long res; enum mf_action_page_type type; + + /* Callback ->action() has to unlock the relevant page inside it. 
*/ int (*action)(struct page *p, unsigned long pfn); } error_states[] = { { reserved, reserved, MF_MSG_KERNEL, me_kernel }, @@ -900,6 +913,7 @@ static int page_action(struct page_state *ps, struct page *p, int result; int count; + /* page p should be unlocked after returning from ps->action(). */ result = ps->action(p, pfn); count = page_count(p) - 1; @@ -1147,7 +1161,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) goto out; } - res = identify_page_state(pfn, p, page_flags); + return identify_page_state(pfn, p, page_flags); out: unlock_page(head); return res; @@ -1425,6 +1439,8 @@ int memory_failure(unsigned long pfn, int flags) identify_page_state: res = identify_page_state(pfn, p, page_flags); + mutex_unlock(&mf_mutex); + return res; unlock_page: unlock_page(p); unlock_mutex: -- Gitee From 0791afd5c3758ebcb28843de78d97e8a5124cc18 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 28 Jun 2021 19:43:14 -0700 Subject: [PATCH 2091/2161] mm,hwpoison: send SIGBUS with error virutal address commit a3f5d80ea401ac857f2910e28b15f35b2cf902f4 upstream. Now an action required MCE in already hwpoisoned address surely sends a SIGBUS to current process, but the SIGBUS doesn't convey error virtual address. That's not optimal for hwpoison-aware applications. To fix the issue, make memory_failure() call kill_accessing_process(), that does pagetable walk to find the error virtual address. It could find multiple virtual addresses for the same error page, and it seems hard to tell which virtual address is correct one. But that's rare and sending incorrect virtual address could be better than no address. So let's report the first found virtual address for now. Intel-SIG: commit a3f5d80ea401 mm,hwpoison: send SIGBUS with error virutal address. Backport to kernel 5.4 to enhance MCA-R. [naoya.horiguchi@nec.com: fix walk_page_range() return] Link: https://lkml.kernel.org/r/20210603051055.GA244241@hori.linux.bs1.fc.nec.co.jp Link: https://lkml.kernel.org/r/20210521030156.2612074-4-nao.horiguchi@gmail.com Signed-off-by: Naoya Horiguchi Cc: Tony Luck Cc: Aili Yao Cc: Oscar Salvador Cc: David Hildenbrand Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Jue Wang Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- arch/x86/kernel/cpu/mce/core.c | 13 ++- include/linux/swapops.h | 5 ++ mm/memory-failure.c | 150 ++++++++++++++++++++++++++++++++- 3 files changed, 165 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index cdca07a43e39..bf5bb75622b2 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1219,17 +1219,26 @@ static void kill_me_maybe(struct callback_head *cb) { struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); int flags = MF_ACTION_REQUIRED; + int ret; pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); if (!p->mce_ripv) flags |= MF_MUST_KILL; - if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) && - !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { + ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); + if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); return; } + /* + * -EHWPOISON from memory_failure() means that it already sent SIGBUS + * to the current process with the proper error info, so no need to + * send SIGBUS here again. 
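For context, the consumer of that virtual address is a hwpoison-aware application. A minimal userspace sketch of the receiving side follows (illustrative only; provoking a real BUS_MCEERR_AR needs error injection or MADV_HWPOISON with privileges, which is not shown):

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static void sigbus_handler(int sig, siginfo_t *si, void *uctx)
{
	(void)sig; (void)uctx;
	/* fprintf() is not async-signal-safe; acceptable for an illustration. */
	if (si->si_code == BUS_MCEERR_AR || si->si_code == BUS_MCEERR_AO)
		fprintf(stderr, "hardware memory error at %p (lsb %d)\n",
			si->si_addr, (int)si->si_addr_lsb);
	_exit(EXIT_FAILURE);
}

int main(void)
{
	struct sigaction sa = { 0 };

	sa.sa_sigaction = sigbus_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGBUS, &sa, NULL);
	/* ... real work; a poisoned access would land in the handler,
	 * with si_addr carrying the virtual address found by the walk ... */
	return 0;
}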
+ */ + if (ret == -EHWPOISON) + return; + if (p->mce_vaddr != (void __user *)-1l) { force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); } else { diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 3208a520d0be..8c595cea2811 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -321,6 +321,11 @@ static inline int is_hwpoison_entry(swp_entry_t entry) return swp_type(entry) == SWP_HWPOISON; } +static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry) +{ + return swp_offset(entry); +} + static inline void num_poisoned_pages_inc(void) { atomic_long_inc(&num_poisoned_pages); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e621e5001d46..42e9ad2f2289 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "internal.h" #include "ras/ras_event.h" @@ -527,6 +528,148 @@ static void collect_procs(struct page *page, struct list_head *tokill, kfree(tk); } +struct hwp_walk { + struct to_kill tk; + unsigned long pfn; + int flags; +}; + +static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift) +{ + tk->addr = addr; + tk->size_shift = shift; +} + +static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, + unsigned long poisoned_pfn, struct to_kill *tk) +{ + unsigned long pfn = 0; + + if (pte_present(pte)) { + pfn = pte_pfn(pte); + } else { + swp_entry_t swp = pte_to_swp_entry(pte); + + if (is_hwpoison_entry(swp)) + pfn = hwpoison_entry_to_pfn(swp); + } + + if (!pfn || pfn != poisoned_pfn) + return 0; + + set_to_kill(tk, addr, shift); + return 1; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, + struct hwp_walk *hwp) +{ + pmd_t pmd = *pmdp; + unsigned long pfn; + unsigned long hwpoison_vaddr; + + if (!pmd_present(pmd)) + return 0; + pfn = pmd_pfn(pmd); + if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) { + hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT); + set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT); + return 1; + } + return 0; +} +#else +static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, + struct hwp_walk *hwp) +{ + return 0; +} +#endif + +static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct hwp_walk *hwp = (struct hwp_walk *)walk->private; + int ret = 0; + pte_t *ptep; + spinlock_t *ptl; + + ptl = pmd_trans_huge_lock(pmdp, walk->vma); + if (ptl) { + ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp); + spin_unlock(ptl); + goto out; + } + + if (pmd_trans_unstable(pmdp)) + goto out; + + ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl); + for (; addr != end; ptep++, addr += PAGE_SIZE) { + ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT, + hwp->pfn, &hwp->tk); + if (ret == 1) + break; + } + pte_unmap_unlock(ptep - 1, ptl); +out: + cond_resched(); + return ret; +} + +#ifdef CONFIG_HUGETLB_PAGE +static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hwp_walk *hwp = (struct hwp_walk *)walk->private; + pte_t pte = huge_ptep_get(ptep); + struct hstate *h = hstate_vma(walk->vma); + + return check_hwpoisoned_entry(pte, addr, huge_page_shift(h), + hwp->pfn, &hwp->tk); +} +#else +#define hwpoison_hugetlb_range NULL +#endif + +static struct mm_walk_ops hwp_walk_ops = { + .pmd_entry = hwpoison_pte_range, + .hugetlb_entry = hwpoison_hugetlb_range, +}; + +/* + * Sends SIGBUS to the 
current process with error info. + * + * This function is intended to handle "Action Required" MCEs on already + * hardware poisoned pages. They could happen, for example, when + * memory_failure() failed to unmap the error page at the first call, or + * when multiple local machine checks happened on different CPUs. + * + * MCE handler currently has no easy access to the error virtual address, + * so this function walks page table to find it. The returned virtual address + * is proper in most cases, but it could be wrong when the application + * process has multiple entries mapping the error page. + */ +static int kill_accessing_process(struct task_struct *p, unsigned long pfn, + int flags) +{ + int ret; + struct hwp_walk priv = { + .pfn = pfn, + }; + priv.tk.tsk = p; + + down_read(&(p->mm->mmap_sem)); + ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops, + (void *)&priv); + if (ret == 1 && priv.tk.addr) + kill_proc(&priv.tk, pfn, flags); + up_read(&(p->mm->mmap_sem)); + return ret ? -EFAULT : -EHWPOISON; +} + static const char *action_name[] = { [MF_IGNORED] = "Ignored", [MF_FAILED] = "Failed", @@ -1105,7 +1248,10 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) if (TestSetPageHWPoison(head)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); - return -EHWPOISON; + res = -EHWPOISON; + if (flags & MF_ACTION_REQUIRED) + res = kill_accessing_process(current, page_to_pfn(head), flags); + return res; } num_poisoned_pages_inc(); @@ -1296,6 +1442,8 @@ int memory_failure(unsigned long pfn, int flags) pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); res = -EHWPOISON; + if (flags & MF_ACTION_REQUIRED) + res = kill_accessing_process(current, pfn, flags); goto unlock_mutex; } -- Gitee From c06e8462407202f5228784674bc1b103ba08afcf Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 13 Sep 2021 14:52:39 -0700 Subject: [PATCH 2092/2161] Intel: x86/mce: Avoid infinite loop for copy from user recovery commit 81065b35e2486c024c7aa86caed452e1f01a59d4 upstream. There are two cases for machine check recovery: 1) The machine check was triggered by ring3 (application) code. This is the simpler case. The machine check handler simply queues work to be executed on return to user. That code unmaps the page from all users and arranges to send a SIGBUS to the task that triggered the poison. 2) The machine check was triggered in kernel code that is covered by an exception table entry. In this case the machine check handler still queues a work entry to unmap the page, etc. but this will not be called right away because the #MC handler returns to the fix up code address in the exception table entry. Problems occur if the kernel triggers another machine check before the return to user processes the first queued work item. Specifically, the work is queued using the ->mce_kill_me callback structure in the task struct for the current thread. Attempting to queue a second work item using this same callback results in a loop in the linked list of work functions to call. So when the kernel does return to user, it enters an infinite loop processing the same entry for ever. There are some legitimate scenarios where the kernel may take a second machine check before returning to the user. 1) Some code (e.g. futex) first tries a get_user() with page faults disabled. If this fails, the code retries with page faults enabled expecting that this will resolve the page fault. 
2) Copy from user code retries a copy in byte-at-time mode to check whether any additional bytes can be copied. On the other side of the fence are some bad drivers that do not check the return value from individual get_user() calls and may access multiple user addresses without noticing that some/all calls have failed. Fix by adding a counter (current->mce_count) to keep track of repeated machine checks before task_work() is called. First machine check saves the address information and calls task_work_add(). Subsequent machine checks before that task_work call back is executed check that the address is in the same page as the first machine check (since the callback will offline exactly one page). Expected worst case is four machine checks before moving on (e.g. one user access with page faults disabled, then a repeat to the same address with page faults enabled ... repeat in copy tail bytes). Just in case there is some code that loops forever enforce a limit of 10. [ bp: Massage commit message, drop noinstr, fix typo, extend panic messages. ] Intel-SIG: commit 81065b35e248 Intel: x86/mce: Avoid infinite loop for copy from user recovery. Backport to kernel 5.4 to enhance MCA-R. Fixes: 5567d11c21a1 ("x86/mce: Send #MC singal from task work") Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Cc: Link: https://lkml.kernel.org/r/YT/IJ9ziLqmtqEPu@agluck-desk2.amr.corp.intel.com [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- arch/x86/kernel/cpu/mce/core.c | 43 +++++++++++++++++++++++++--------- include/linux/sched.h | 1 + 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index bf5bb75622b2..8538a97b0cb3 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1212,6 +1212,9 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin static void kill_me_now(struct callback_head *ch) { + struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me); + + p->mce_count = 0; force_sig(SIGBUS); } @@ -1221,6 +1224,7 @@ static void kill_me_maybe(struct callback_head *cb) int flags = MF_ACTION_REQUIRED; int ret; + p->mce_count = 0; pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); if (!p->mce_ripv) flags |= MF_MUST_KILL; @@ -1247,17 +1251,34 @@ static void kill_me_maybe(struct callback_head *cb) } } -static void queue_task_work(struct mce *m, int kill_it) +static void queue_task_work(struct mce *m, char *msg, int kill_current_task) { - current->mce_addr = m->addr; - current->mce_kflags = m->kflags; - current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); - current->mce_whole_page = whole_page(m); + int count = ++current->mce_count; - if (kill_it) - current->mce_kill_me.func = kill_me_now; - else - current->mce_kill_me.func = kill_me_maybe; + /* First call, save all the details */ + if (count == 1) { + current->mce_addr = m->addr; + current->mce_kflags = m->kflags; + current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); + current->mce_whole_page = whole_page(m); + + if (kill_current_task) + current->mce_kill_me.func = kill_me_now; + else + current->mce_kill_me.func = kill_me_maybe; + } + + /* Ten is likely overkill. 
Don't expect more than two faults before task_work() */ + if (count > 10) + mce_panic("Too many consecutive machine checks while accessing user data", m, msg); + + /* Second or later call, make sure page address matches the one from first call */ + if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT)) + mce_panic("Consecutive machine checks to different user pages", m, msg); + + /* Do not call task_work_add() more than once */ + if (count > 1) + return; task_work_add(current, ¤t->mce_kill_me, true); } @@ -1408,7 +1429,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - queue_task_work(&m, kill_it); + queue_task_work(&m, msg, kill_it); } else { /* @@ -1426,7 +1447,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) } if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, kill_it); + queue_task_work(&m, msg, kill_it); } out_ist: diff --git a/include/linux/sched.h b/include/linux/sched.h index ae223f2470a7..0ef728f0a300 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1298,6 +1298,7 @@ struct task_struct { __mce_reserved : 62; struct callback_head mce_kill_me; + int mce_count; #endif /* -- Gitee From 78422c6befe1f8367c39755bd91fad343f997b99 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 17 Aug 2021 17:29:41 -0700 Subject: [PATCH 2093/2161] Intel: x86/mce: Change to not send SIGBUS error during copy from user commit a6e3cf70b772541c2388abdb86e5a562cfe18e63 upstream. Sending a SIGBUS for a copy from user is not the correct semantic. System calls should return -EFAULT (or a short count for write(2)). Intel-SIG: commit a6e3cf70b772 Intel: x86/mce: Change to not send SIGBUS error during copy from user. Backport to kernel 5.4 to enhance MCA-R. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210818002942.1607544-3-tony.luck@intel.com [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- arch/x86/kernel/cpu/mce/core.c | 36 +++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 8538a97b0cb3..4529f1103bef 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1230,7 +1230,7 @@ static void kill_me_maybe(struct callback_head *cb) flags |= MF_MUST_KILL; ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); - if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { + if (!ret) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); return; } @@ -1243,15 +1243,21 @@ static void kill_me_maybe(struct callback_head *cb) if (ret == -EHWPOISON) return; - if (p->mce_vaddr != (void __user *)-1l) { - force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); - } else { - pr_err("Memory error not recovered"); - kill_me_now(cb); - } + pr_err("Memory error not recovered"); + kill_me_now(cb); } -static void queue_task_work(struct mce *m, char *msg, int kill_current_task) +static void kill_me_never(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + + p->mce_count = 0; + pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) + set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); +} + +static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) { int count = ++current->mce_count; @@ -1261,11 +1267,7 @@ static void queue_task_work(struct mce *m, char *msg, int kill_current_task) current->mce_kflags = m->kflags; current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); current->mce_whole_page = whole_page(m); - - if (kill_current_task) - current->mce_kill_me.func = kill_me_now; - else - current->mce_kill_me.func = kill_me_maybe; + current->mce_kill_me.func = func; } /* Ten is likely overkill. Don't expect more than two faults before task_work() */ @@ -1429,8 +1431,10 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - queue_task_work(&m, msg, kill_it); - + if (kill_it) + queue_task_work(&m, msg, kill_me_now); + else + queue_task_work(&m, msg, kill_me_maybe); } else { /* * Handle an MCE which has happened in kernel space but from @@ -1447,7 +1451,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) } if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, msg, kill_it); + queue_task_work(&m, msg, kill_me_never); } out_ist: -- Gitee From 6da4067049601e4a79da1131f98e698ca089ee6a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 17 Aug 2021 17:29:42 -0700 Subject: [PATCH 2094/2161] Intel: x86/mce: Drop copyin special case for #MC commit 690658471b5f28d306e6492c4585d748cb5304e8 upstream. Fixes to the iterator code to handle faults that are not on page boundaries mean that the special case for machine check during copy from user is no longer needed. For a full list of those fixes, see the output of: git log --oneline v5.14 ^v5.13 -- lib/iov_iter.c Intel-SIG: commit 690658471b5f Intel: x86/mce: Drop copyin special case for #MC. Backport to kernel 5.4 to enhance MCA-R. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210818002942.1607544-4-tony.luck@intel.com [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- arch/x86/lib/copy_user_64.S | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 497b4a8414b3..0f2e33169f8c 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -234,24 +234,11 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) ALIGN; .Lcopy_user_handle_tail: movl %edx,%ecx - cmp $18,%eax /* check if X86_TRAP_MC */ - je 3f 1: rep movsb 2: mov %ecx,%eax ASM_CLAC ret - /* - * Return zero to pretend that this copy succeeded. This - * is counter-intuitive, but needed to prevent the code - * in lib/iov_iter.c from retrying and running back into - * the poison cache line again. The machine check handler - * will ensure that a SIGBUS is sent to the task. - */ -3: xorl %eax,%eax - ASM_CLAC - ret - _ASM_EXTABLE_CPY(1b, 2b) END(.Lcopy_user_handle_tail) -- Gitee From e954b45617154c7b6b1409e51bd398f960fbf997 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 31 May 2021 00:32:44 -0400 Subject: [PATCH 2095/2161] Intel: generic_perform_write()/iomap_write_actor(): saner logics for short copy commit bc1bb416bbb9203e250f5c49aaf1d11b5d9c8adb upstream. if we run into a short copy and ->write_end() refuses to advance at all, use the amount we'd managed to copy for the next iteration to handle. Intel-SIG: commit bc1bb416bbb9 Intel: generic_perform_write()/iomap_write_actor(): saner logics for short copy. Backport to kernel 5.4 to enhance MCA-R. Signed-off-by: Al Viro [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- fs/iomap/buffered-io.c | 24 +++++++++--------------- mm/filemap.c | 24 +++++++++--------------- 2 files changed, 18 insertions(+), 30 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 80867a1a94f2..f678d9f7181c 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -762,10 +762,6 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. */ if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; @@ -788,24 +784,22 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, iomap); if (unlikely(status < 0)) break; - copied = status; cond_resched(); - iov_iter_advance(i, copied); - if (unlikely(copied == 0)) { + if (unlikely(status == 0)) { /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. + * A short copy made iomap_write_end() reject the + * thing entirely. Might be memory poisoning + * halfway through, might be a race with munmap, + * might be severe memory pressure. 
*/ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(i)); + if (copied) + bytes = copied; goto again; } + copied = status; + iov_iter_advance(i, copied); pos += copied; written += copied; length -= copied; diff --git a/mm/filemap.c b/mm/filemap.c index 4a830bc940d1..006d1186ec50 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3297,10 +3297,6 @@ ssize_t generic_perform_write(struct file *file, * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. */ if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; @@ -3327,24 +3323,22 @@ ssize_t generic_perform_write(struct file *file, page, fsdata); if (unlikely(status < 0)) break; - copied = status; cond_resched(); - iov_iter_advance(i, copied); - if (unlikely(copied == 0)) { + if (unlikely(status == 0)) { /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. + * A short copy made ->write_end() reject the + * thing entirely. Might be memory poisoning + * halfway through, might be a race with munmap, + * might be severe memory pressure. */ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(i)); + if (copied) + bytes = copied; goto again; } + copied = status; + iov_iter_advance(i, copied); pos += copied; written += copied; -- Gitee From f0c178424c310d0cab0697cb25c468336dc3d33f Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Thu, 23 Dec 2021 12:07:01 -0800 Subject: [PATCH 2096/2161] x86/mce: Reduce number of machine checks taken during recovery commit 3376136300a00df9a864b88fa969177d6c3be8e5 upstream. When any of the copy functions in arch/x86/lib/copy_user_64.S take a fault, the fixup code copies the remaining byte count from %ecx to %edx and unconditionally jumps to .Lcopy_user_handle_tail to continue the copy in case any more bytes can be copied. If the fault was #PF this may copy more bytes (because the page fault handler might have fixed the fault). But when the fault is a machine check the original copy code will have copied all the way to the poisoned cache line. So .Lcopy_user_handle_tail will just take another machine check for no good reason. Every code path to .Lcopy_user_handle_tail comes from an exception fixup path, so add a check there to check the trap type (in %eax) and simply return the count of remaining bytes if the trap was a machine check. Doing this reduces the number of machine checks taken during synthetic tests from four to three. As well as reducing the number of machine checks, this also allows Skylake generation Xeons to recover some cases that currently fail. The is because REP; MOVSB is only recoverable when source and destination are well aligned and the byte count is large. That useless call to .Lcopy_user_handle_tail may violate one or more of these conditions and generate a fatal machine check. Intel-SIG: commit 3376136300a0 x86/mce: Reduce number of machine checks taken during recovery. Backport to kernel 5.4 to enhance MCA-R. [ Tony: Add more details to commit message. ] [ bp: Fixup comment. 
Also, another tip patchset which is adding straight-line speculation mitigation changes the "ret" instruction to an all-caps macro "RET". But, since gas is case-insensitive, use "RET" in the newly added asm block already in order to simplify tip branch merging on its way upstream. ] Signed-off-by: Youquan Song Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/YcTW5dh8yTGucDd+@agluck-desk2.amr.corp.intel.com [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- arch/x86/lib/copy_user_64.S | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 0f2e33169f8c..6a53e579d683 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -224,6 +224,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * Don't try to copy the tail if machine check happened * * Input: + * eax trap number written by ex_handler_copy() * rdi destination * rsi source * rdx count @@ -233,12 +234,20 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) */ ALIGN; .Lcopy_user_handle_tail: + cmp $18,%eax + je 3f + movl %edx,%ecx 1: rep movsb 2: mov %ecx,%eax ASM_CLAC ret +3: + movl %edx,%eax + ASM_CLAC + ret + _ASM_EXTABLE_CPY(1b, 2b) END(.Lcopy_user_handle_tail) -- Gitee From 389be113b456199eb4675479301e71b9c833fe07 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Sun, 6 Feb 2022 15:10:14 +0800 Subject: [PATCH 2097/2161] mm/hwpoison: fix error page recovered but reported "not recovered" commit 046545a661af2beec21de7b90ca0e35f05088a81 upstream. When an uncorrected memory error is consumed there is a race between the CMCI from the memory controller reporting an uncorrected error with a UCNA signature, and the core reporting and SRAR signature machine check when the data is about to be consumed. If the CMCI wins that race, the page is marked poisoned when uc_decode_notifier() calls memory_failure() and the machine check processing code finds the page already poisoned. It calls kill_accessing_process() to make sure a SIGBUS is sent. But returns the wrong error code. Console log looks like this: [34775.674296] mce: Uncorrected hardware memory error in user-access at 3710b3400 [34775.675413] Memory failure: 0x3710b3: recovery action for dirty LRU page: Recovered [34775.690310] Memory failure: 0x3710b3: already hardware poisoned [34775.696247] Memory failure: 0x3710b3: Sending SIGBUS to einj_mem_uc:361438 due to hardware memory corruption [34775.706072] mce: Memory error not recovered kill_accessing_process() is supposed to return -EHWPOISON to notify that SIGBUS is already set to the process and kill_me_maybe() doesn't have to send it again. But current code simply fails to do this, so fix it to make sure to work as intended. This change avoids the noise message "Memory error not recovered" and skips duplicate SIGBUSs. Intel-SIG: commit 046545a661af mm/hwpoison: fix error page recovered but reported "not recovered". Backport to kernel 5.4 to enhance MCA-R. 
[tony.luck@intel.com: reword some parts of commit message] Link: https://lkml.kernel.org/r/20220113231117.1021405-1-naoya.horiguchi@linux.dev Fixes: a3f5d80ea401 ("mm,hwpoison: send SIGBUS with error virutal address") Signed-off-by: Naoya Horiguchi Reported-by: Youquan Song Cc: Tony Luck Signed-off-by: Andrew Morton [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- mm/memory-failure.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 42e9ad2f2289..e715dd67ce05 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -666,8 +666,10 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn, (void *)&priv); if (ret == 1 && priv.tk.addr) kill_proc(&priv.tk, pfn, flags); + else + ret = 0; up_read(&(p->mm->mmap_sem)); - return ret ? -EFAULT : -EHWPOISON; + return ret > 0 ? -EHWPOISON : -EFAULT; } static const char *action_name[] = { -- Gitee From f338a9ff8105cb8e32a6155efbe82253320c6645 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Tue, 17 Nov 2020 20:49:52 +0800 Subject: [PATCH 2098/2161] EDAC: Add DDR5 new memory type commit bc1c99a5971aa7571e8b9731c28fa32abe12cab8 upstream. Add a new entry to 'enum mem_type' and a new string to 'edac_mem_types[]' for DDR5 new memory type. Intel_SIG: commit bc1c99a5971a EDAC: Add DDR5 new memory type. backport for SPR EDAC support Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/edac/edac_mc.c | 1 + include/linux/edac.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index e73ca303f1a7..4b0a33a674fd 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -214,6 +214,7 @@ const char * const edac_mem_types[] = { [MEM_DDR4] = "Unbuffered-DDR4", [MEM_RDDR4] = "Registered-DDR4", [MEM_LRDDR4] = "Load-Reduced-DDR4-RAM", + [MEM_DDR5] = "Unbuffered-DDR5", [MEM_NVDIMM] = "Non-volatile-RAM", }; EXPORT_SYMBOL_GPL(edac_mem_types); diff --git a/include/linux/edac.h b/include/linux/edac.h index c19483b90079..ad56bfa55a0b 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -187,6 +187,7 @@ static inline char *mc_event_error_type(const unsigned int err_type) * @MEM_RDDR4: Registered DDR4 RAM * This is a variant of the DDR4 memories. * @MEM_LRDDR4: Load-Reduced DDR4 memory. + * @MEM_DDR5: Unbuffered DDR5 RAM * @MEM_NVDIMM: Non-volatile RAM */ enum mem_type { @@ -211,6 +212,7 @@ enum mem_type { MEM_DDR4, MEM_RDDR4, MEM_LRDDR4, + MEM_DDR5, MEM_NVDIMM, }; @@ -234,6 +236,7 @@ enum mem_type { #define MEM_FLAG_DDR4 BIT(MEM_DDR4) #define MEM_FLAG_RDDR4 BIT(MEM_RDDR4) #define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) +#define MEM_FLAG_DDR5 BIT(MEM_DDR5) #define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) /** -- Gitee From 26aa2247aa913dda2ffe7cf7d068033c91b9ccd2 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Wed, 18 Aug 2021 10:57:00 -0700 Subject: [PATCH 2099/2161] EDAC/i10nm: Fix NVDIMM detection commit 2294a7299f5e51667b841f63c6d69474491753fb upstream. MCDDRCFG is a per-channel register and uses bit{0,1} to indicate the NVDIMM presence on DIMM slot{0,1}. Current i10nm_edac driver wrongly uses MCDDRCFG as per-DIMM register and fails to detect the NVDIMM. Fix it by reading MCDDRCFG as per-channel register and using its bit{0,1} to check whether the NVDIMM is populated on DIMM slot{0,1}. 
Fixes: d4dc89d069aa ("EDAC, i10nm: Add a driver for Intel 10nm server processors") Reported-by: Fan Du Tested-by: Wen Jin Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210818175701.1611513-2-tony.luck@intel.com Signed-off-by: Sasha Levin Signed-off-by: Xinghui Li --- drivers/edac/i10nm_base.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 8a1913860d08..7999ffea32fd 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -26,8 +26,8 @@ pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg)) #define I10NM_GET_DIMMMTR(m, i, j) \ readl((m)->mbase + 0x2080c + (i) * 0x4000 + (j) * 4) -#define I10NM_GET_MCDDRTCFG(m, i, j) \ - readl((m)->mbase + 0x20970 + (i) * 0x4000 + (j) * 4) +#define I10NM_GET_MCDDRTCFG(m, i) \ + readl((m)->mbase + 0x20970 + (i) * 0x4000) #define I10NM_GET_MCMTR(m, i) \ readl((m)->mbase + 0x20ef8 + (i) * 0x4000) @@ -168,11 +168,11 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci) continue; ndimms = 0; + mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i); for (j = 0; j < I10NM_NUM_DIMMS; j++) { dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, j, 0); mtr = I10NM_GET_DIMMMTR(imc, i, j); - mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i, j); edac_dbg(1, "dimmmtr 0x%x mcddrtcfg 0x%x (mc%d ch%d dimm%d)\n", mtr, mcddrtcfg, imc->mc, i, j); -- Gitee From b696bc590b4298b75d0fbf8928e3c450f2875758 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Tue, 17 Nov 2020 20:49:53 +0800 Subject: [PATCH 2100/2161] EDAC/i10nm: Add Intel Sapphire Rapids server support commit 479f58dda25bb46daeb937f124718e8b4aea6781 upstream. The Sapphire Rapids CPU model shares the same memory controller architecture with Ice Lake server. There are some configurations different from Ice Lake server as below: - The device ID for configuration agent. - The size for per channel memory-mapped I/O. - The DDR5 memory support. So add the above configurations and the Sapphire Rapids CPU model ID for EDAC support. Intel_SIG: commit 479f58dda25b EDAC/i10nm: Add Intel Sapphire Rapids server support. Backport for SPR EDAC support. 
Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 34 +++++++++++++++++++++++++--------- drivers/edac/skx_base.c | 6 +++--- drivers/edac/skx_common.c | 23 ++++++++++++++++++----- drivers/edac/skx_common.h | 16 ++++++++++++---- 4 files changed, 58 insertions(+), 21 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 7999ffea32fd..7ee553c6b72c 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -13,7 +13,7 @@ #include "edac_module.h" #include "skx_common.h" -#define I10NM_REVISION "v0.0.3" +#define I10NM_REVISION "v0.0.4" #define EDAC_MOD_STR "i10nm_edac" /* Debug macros */ @@ -25,11 +25,13 @@ #define I10NM_GET_IMC_BAR(d, i, reg) \ pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg)) #define I10NM_GET_DIMMMTR(m, i, j) \ - readl((m)->mbase + 0x2080c + (i) * 0x4000 + (j) * 4) + readl((m)->mbase + 0x2080c + (i) * (m)->chan_mmio_sz + (j) * 4) #define I10NM_GET_MCDDRTCFG(m, i) \ - readl((m)->mbase + 0x20970 + (i) * 0x4000) + readl((m)->mbase + 0x20970 + (i) * (m)->chan_mmio_sz) #define I10NM_GET_MCMTR(m, i) \ - readl((m)->mbase + 0x20ef8 + (i) * 0x4000) + readl((m)->mbase + 0x20ef8 + (i) * (m)->chan_mmio_sz) +#define I10NM_GET_AMAP(m, i) \ + readl((m)->mbase + 0x20814 + (i) * (m)->chan_mmio_sz) #define I10NM_GET_SCK_MMIO_BASE(reg) (GET_BITFIELD(reg, 0, 28) << 23) #define I10NM_GET_IMC_MMIO_OFFSET(reg) (GET_BITFIELD(reg, 0, 10) << 12) @@ -129,18 +131,29 @@ static struct res_config i10nm_cfg0 = { .type = I10NM, .decs_did = 0x3452, .busno_cfg_offset = 0xcc, + .ddr_chan_mmio_sz = 0x4000, }; static struct res_config i10nm_cfg1 = { .type = I10NM, .decs_did = 0x3452, .busno_cfg_offset = 0xd0, + .ddr_chan_mmio_sz = 0x4000, +}; + +static struct res_config spr_cfg = { + .type = SPR, + .decs_did = 0x3252, + .busno_cfg_offset = 0xd0, + .ddr_chan_mmio_sz = 0x8000, + .support_ddr5 = true, }; static const struct x86_cpu_id i10nm_cpuids[] = { { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_TREMONT_D, 0, (kernel_ulong_t)&i10nm_cfg0 }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ICELAKE_X, 0, (kernel_ulong_t)&i10nm_cfg0 }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ICELAKE_D, 0, (kernel_ulong_t)&i10nm_cfg1 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_SAPPHIRERAPIDS_X, 0, (kernel_ulong_t)&spr_cfg }, { } }; MODULE_DEVICE_TABLE(x86cpu, i10nm_cpuids); @@ -155,12 +168,13 @@ static bool i10nm_check_ecc(struct skx_imc *imc, int chan) return !!GET_BITFIELD(mcmtr, 2, 2); } -static int i10nm_get_dimm_config(struct mem_ctl_info *mci) +static int i10nm_get_dimm_config(struct mem_ctl_info *mci, + struct res_config *cfg) { struct skx_pvt *pvt = mci->pvt_info; struct skx_imc *imc = pvt->imc; + u32 mtr, amap, mcddrtcfg; struct dimm_info *dimm; - u32 mtr, mcddrtcfg; int i, j, ndimms; for (i = 0; i < I10NM_NUM_CHANNELS; i++) { @@ -168,6 +182,7 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci) continue; ndimms = 0; + amap = I10NM_GET_AMAP(imc, i); mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i); for (j = 0; j < I10NM_NUM_DIMMS; j++) { dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, @@ -177,8 +192,8 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci) mtr, mcddrtcfg, imc->mc, i, j); if (IS_DIMM_PRESENT(mtr)) - ndimms += skx_get_dimm_info(mtr, 0, 0, dimm, - imc, i, j); + ndimms += skx_get_dimm_info(mtr, 0, amap, dimm, + imc, i, j, cfg); else if (IS_NVDIMM_PRESENT(mcddrtcfg, j)) ndimms += skx_get_nvdimm_info(dimm, imc, i, j, EDAC_MOD_STR); @@ -306,10 +321,11 @@ static int __init i10nm_init(void) 
d->imc[i].lmc = i; d->imc[i].src_id = src_id; d->imc[i].node_id = node_id; + d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz; rc = skx_register_mci(&d->imc[i], d->imc[i].mdev, "Intel_10nm Socket", EDAC_MOD_STR, - i10nm_get_dimm_config); + i10nm_get_dimm_config, cfg); if (rc < 0) goto fail; } diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 49899108cb00..416f429d1b97 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -174,7 +174,7 @@ static bool skx_check_ecc(u32 mcmtr) return !!GET_BITFIELD(mcmtr, 2, 2); } -static int skx_get_dimm_config(struct mem_ctl_info *mci) +static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) { struct skx_pvt *pvt = mci->pvt_info; u32 mtr, mcmtr, amap, mcddrtcfg; @@ -196,7 +196,7 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci) pci_read_config_dword(imc->chan[i].cdev, 0x80 + 4 * j, &mtr); if (IS_DIMM_PRESENT(mtr)) { - ndimms += skx_get_dimm_info(mtr, mcmtr, amap, dimm, imc, i, j); + ndimms += skx_get_dimm_info(mtr, mcmtr, amap, dimm, imc, i, j, cfg); } else if (IS_NVDIMM_PRESENT(mcddrtcfg, j)) { ndimms += skx_get_nvdimm_info(dimm, imc, i, j, EDAC_MOD_STR); @@ -703,7 +703,7 @@ static int __init skx_init(void) d->imc[i].node_id = node_id; rc = skx_register_mci(&d->imc[i], d->imc[i].chan[0].cdev, "Skylake Socket", EDAC_MOD_STR, - skx_get_dimm_config); + skx_get_dimm_config, cfg); if (rc < 0) goto fail; } diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 8ceb993295ce..969cb5261a0d 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -304,15 +304,25 @@ static int skx_get_dimm_attr(u32 reg, int lobit, int hibit, int add, #define numcol(reg) skx_get_dimm_attr(reg, 0, 1, 10, 0, 2, "cols") int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, - struct skx_imc *imc, int chan, int dimmno) + struct skx_imc *imc, int chan, int dimmno, + struct res_config *cfg) { - int banks = 16, ranks, rows, cols, npages; + int banks, ranks, rows, cols, npages; + enum mem_type mtype; u64 size; ranks = numrank(mtr); rows = numrow(mtr); cols = numcol(mtr); + if (cfg->support_ddr5 && (amap & 0x8)) { + banks = 32; + mtype = MEM_DDR5; + } else { + banks = 16; + mtype = MEM_DDR4; + } + /* * Compute size in 8-byte (2^3) words, then shift to MiB (2^20) */ @@ -332,7 +342,7 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, dimm->nr_pages = npages; dimm->grain = 32; dimm->dtype = get_width(mtr); - dimm->mtype = MEM_DDR4; + dimm->mtype = mtype; dimm->edac_mode = EDAC_SECDED; /* likely better than this */ snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", imc->src_id, imc->lmc, chan, dimmno); @@ -390,7 +400,8 @@ int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, const char *ctl_name, const char *mod_str, - get_dimm_config_f get_dimm_config) + get_dimm_config_f get_dimm_config, + struct res_config *cfg) { struct mem_ctl_info *mci; struct edac_mc_layer layers[2]; @@ -425,13 +436,15 @@ int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, } mci->mtype_cap = MEM_FLAG_DDR4 | MEM_FLAG_NVDIMM; + if (cfg->support_ddr5) + mci->mtype_cap |= MEM_FLAG_DDR5; mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; mci->mod_name = mod_str; mci->dev_name = pci_name(pdev); mci->ctl_page_to_phys = NULL; - rc = get_dimm_config(mci); + rc = get_dimm_config(mci, cfg); if (rc < 0) goto fail; diff --git 
a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 78f8c1de0b71..bf56bebff138 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -59,6 +59,7 @@ struct skx_dev { struct mem_ctl_info *mci; struct pci_dev *mdev; /* for i10nm CPU */ void __iomem *mbase; /* for i10nm CPU */ + int chan_mmio_sz; /* for i10nm CPU */ u8 mc; /* system wide mc# */ u8 lmc; /* socket relative mc# */ u8 src_id, node_id; @@ -82,7 +83,8 @@ struct skx_pvt { enum type { SKX, - I10NM + I10NM, + SPR }; enum { @@ -118,9 +120,13 @@ struct res_config { unsigned int decs_did; /* Default bus number configuration register offset */ int busno_cfg_offset; + /* Per DDR channel memory-mapped I/O size */ + int ddr_chan_mmio_sz; + bool support_ddr5; }; -typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci); +typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci, + struct res_config *cfg); typedef bool (*skx_decode_f)(struct decoded_addr *res); typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len); @@ -136,14 +142,16 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list); int skx_get_hi_lo(unsigned int did, int off[], u64 *tolm, u64 *tohm); int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, - struct skx_imc *imc, int chan, int dimmno); + struct skx_imc *imc, int chan, int dimmno, + struct res_config *cfg); int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, int chan, int dimmno, const char *mod_str); int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, const char *ctl_name, const char *mod_str, - get_dimm_config_f get_dimm_config); + get_dimm_config_f get_dimm_config, + struct res_config *cfg); int skx_mce_check_error(struct notifier_block *nb, unsigned long val, void *data); -- Gitee From 2024e430b23576bdb2e2b9c3f4ed1fb031bcc34d Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Wed, 18 Aug 2021 10:57:01 -0700 Subject: [PATCH 2101/2161] EDAC/i10nm: Retrieve and print retry_rd_err_log registers commit cf4e6d52f58399c777276172ec250502e19d5e63 upstream. Retrieve and print retry_rd_err_log registers like the earlier change: commit e80634a75aba ("EDAC, skx: Retrieve and print retry_rd_err_log registers") This is a little trickier than on Skylake because of potential interference with BIOS use of the same registers. The default behavior is to ignore these registers. A module parameter retry_rd_err_log(default=0) controls the mode of operation: - 0=off : Default. - 1=bios : Linux doesn't reset any control bits, but just reports values. This is "no harm" mode, but it may miss reporting some data. - 2=linux: Linux tries to take control and resets mode bits, clears valid/UC bits after reading. This should be more reliable (especially if BIOS interference is reduced by disabling eMCA reporting mode in BIOS setup). Intel-SIG: commit cf4e6d52f583 EDAC/i10nm: Retrieve and print retry_rd_err_log registers. backport for enhance MCA-R. 
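As a rough illustration of how the three modes gate behaviour at module load and unload, here is a schematic kernel-module fragment (not the patch's code; the two helpers are hypothetical stand-ins for hooking the report path and flipping the EN/NOOVER/UC control bits):

#include <linux/module.h>
#include <linux/init.h>

static int retry_rd_err_log;	/* 0=off, 1=bios (report only), 2=linux (take control) */
module_param(retry_rd_err_log, int, 0444);
MODULE_PARM_DESC(retry_rd_err_log, "0=off(default), 1=report only, 2=reset control bits");

static void hook_retry_log_reporting(int enable) { /* hypothetical helper */ }
static void take_control_of_log_regs(int enable) { /* hypothetical helper */ }

static int __init sketch_init(void)
{
	if (retry_rd_err_log) {
		hook_retry_log_reporting(1);		/* modes 1 and 2: print the log registers */
		if (retry_rd_err_log == 2)
			take_control_of_log_regs(1);	/* mode 2 only: set EN, clear NOOVER/UC */
	}
	return 0;
}

static void __exit sketch_exit(void)
{
	if (retry_rd_err_log == 2)
		take_control_of_log_regs(0);		/* restore the firmware's configuration */
}

module_init(sketch_init);
module_exit(sketch_exit);
MODULE_LICENSE("GPL v2");
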
Co-developed-by: Qiuxu Zhuo Signed-off-by: Qiuxu Zhuo Signed-off-by: Youquan Song Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210818175701.1611513-3-tony.luck@intel.com [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 146 ++++++++++++++++++++++++++++++++++++++ drivers/edac/skx_base.c | 3 +- drivers/edac/skx_common.c | 4 +- drivers/edac/skx_common.h | 7 +- 4 files changed, 157 insertions(+), 3 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 7ee553c6b72c..0582a43c600f 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -32,14 +32,137 @@ readl((m)->mbase + 0x20ef8 + (i) * (m)->chan_mmio_sz) #define I10NM_GET_AMAP(m, i) \ readl((m)->mbase + 0x20814 + (i) * (m)->chan_mmio_sz) +#define I10NM_GET_REG32(m, i, offset) \ + readl((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) +#define I10NM_GET_REG64(m, i, offset) \ + readq((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) +#define I10NM_SET_REG32(m, i, offset, v) \ + writel(v, (m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) #define I10NM_GET_SCK_MMIO_BASE(reg) (GET_BITFIELD(reg, 0, 28) << 23) #define I10NM_GET_IMC_MMIO_OFFSET(reg) (GET_BITFIELD(reg, 0, 10) << 12) #define I10NM_GET_IMC_MMIO_SIZE(reg) ((GET_BITFIELD(reg, 13, 23) - \ GET_BITFIELD(reg, 0, 10) + 1) << 12) +#define RETRY_RD_ERR_LOG_UC BIT(1) +#define RETRY_RD_ERR_LOG_NOOVER BIT(14) +#define RETRY_RD_ERR_LOG_EN BIT(15) +#define RETRY_RD_ERR_LOG_NOOVER_UC (BIT(14) | BIT(1)) +#define RETRY_RD_ERR_LOG_OVER_UC_V (BIT(2) | BIT(1) | BIT(0)) + static struct list_head *i10nm_edac_list; +static struct res_config *res_cfg; +static int retry_rd_err_log; + +static u32 offsets_scrub_icx[] = {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8}; +static u32 offsets_scrub_spr[] = {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8}; +static u32 offsets_demand_icx[] = {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0}; +static u32 offsets_demand_spr[] = {0x22e54, 0x22e60, 0x22f10, 0x22e58, 0x22e5c, 0x20ee0}; + +static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable) +{ + u32 s, d; + + if (!imc->mbase) + return; + + s = I10NM_GET_REG32(imc, chan, res_cfg->offsets_scrub[0]); + d = I10NM_GET_REG32(imc, chan, res_cfg->offsets_demand[0]); + + if (enable) { + /* Save default configurations */ + imc->chan[chan].retry_rd_err_log_s = s; + imc->chan[chan].retry_rd_err_log_d = d; + + s &= ~RETRY_RD_ERR_LOG_NOOVER_UC; + s |= RETRY_RD_ERR_LOG_EN; + d &= ~RETRY_RD_ERR_LOG_NOOVER_UC; + d |= RETRY_RD_ERR_LOG_EN; + } else { + /* Restore default configurations */ + if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_UC) + s |= RETRY_RD_ERR_LOG_UC; + if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_NOOVER) + s |= RETRY_RD_ERR_LOG_NOOVER; + if (!(imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_EN)) + s &= ~RETRY_RD_ERR_LOG_EN; + if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_UC) + d |= RETRY_RD_ERR_LOG_UC; + if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_NOOVER) + d |= RETRY_RD_ERR_LOG_NOOVER; + if (!(imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_EN)) + d &= ~RETRY_RD_ERR_LOG_EN; + } + + I10NM_SET_REG32(imc, chan, res_cfg->offsets_scrub[0], s); + I10NM_SET_REG32(imc, chan, res_cfg->offsets_demand[0], d); +} + +static void enable_retry_rd_err_log(bool enable) +{ + struct skx_dev *d; + int i, j; + + edac_dbg(2, "\n"); + + list_for_each_entry(d, i10nm_edac_list, list) + for (i = 0; i < I10NM_NUM_IMC; i++) + for (j = 0; j < 
I10NM_NUM_CHANNELS; j++) + __enable_retry_rd_err_log(&d->imc[i], j, enable); +} + +static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, + int len, bool scrub_err) +{ + struct skx_imc *imc = &res->dev->imc[res->imc]; + u32 log0, log1, log2, log3, log4; + u32 corr0, corr1, corr2, corr3; + u64 log2a, log5; + u32 *offsets; + int n; + + if (!imc->mbase) + return; + + offsets = scrub_err ? res_cfg->offsets_scrub : res_cfg->offsets_demand; + + log0 = I10NM_GET_REG32(imc, res->channel, offsets[0]); + log1 = I10NM_GET_REG32(imc, res->channel, offsets[1]); + log3 = I10NM_GET_REG32(imc, res->channel, offsets[3]); + log4 = I10NM_GET_REG32(imc, res->channel, offsets[4]); + log5 = I10NM_GET_REG64(imc, res->channel, offsets[5]); + + if (res_cfg->type == SPR) { + log2a = I10NM_GET_REG64(imc, res->channel, offsets[2]); + n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.16llx %.8x %.8x %.16llx]", + log0, log1, log2a, log3, log4, log5); + } else { + log2 = I10NM_GET_REG32(imc, res->channel, offsets[2]); + n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.8x %.8x %.8x %.16llx]", + log0, log1, log2, log3, log4, log5); + } + + corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18); + corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c); + corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20); + corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24); + + if (len - n > 0) + snprintf(msg + n, len - n, + " correrrcnt[%.4x %.4x %.4x %.4x %.4x %.4x %.4x %.4x]", + corr0 & 0xffff, corr0 >> 16, + corr1 & 0xffff, corr1 >> 16, + corr2 & 0xffff, corr2 >> 16, + corr3 & 0xffff, corr3 >> 16); + + /* Clear status bits */ + if (retry_rd_err_log == 2 && (log0 & RETRY_RD_ERR_LOG_OVER_UC_V)) { + log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V; + I10NM_SET_REG32(imc, res->channel, offsets[0], log0); + } +} + static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus, unsigned int dev, unsigned int fun) { @@ -132,6 +255,8 @@ static struct res_config i10nm_cfg0 = { .decs_did = 0x3452, .busno_cfg_offset = 0xcc, .ddr_chan_mmio_sz = 0x4000, + .offsets_scrub = offsets_scrub_icx, + .offsets_demand = offsets_demand_icx, }; static struct res_config i10nm_cfg1 = { @@ -139,6 +264,8 @@ static struct res_config i10nm_cfg1 = { .decs_did = 0x3452, .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x4000, + .offsets_scrub = offsets_scrub_icx, + .offsets_demand = offsets_demand_icx, }; static struct res_config spr_cfg = { @@ -147,6 +274,8 @@ static struct res_config spr_cfg = { .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x8000, .support_ddr5 = true, + .offsets_scrub = offsets_scrub_spr, + .offsets_demand = offsets_demand_spr, }; static const struct x86_cpu_id i10nm_cpuids[] = { @@ -282,6 +411,7 @@ static int __init i10nm_init(void) return -ENODEV; cfg = (struct res_config *)id->driver_data; + res_cfg = cfg; /* Newer steppings have different offset for ATOM_TREMONT_D/ICELAKE_X */ if (boot_cpu_data.x86_stepping >= 4) @@ -339,6 +469,12 @@ static int __init i10nm_init(void) mce_register_decode_chain(&i10nm_mce_dec); setup_i10nm_debug(); + if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { + skx_set_decode(NULL, show_retry_rd_err_log); + if (retry_rd_err_log == 2) + enable_retry_rd_err_log(true); + } + i10nm_printk(KERN_INFO, "%s\n", I10NM_REVISION); return 0; @@ -350,6 +486,13 @@ static int __init i10nm_init(void) static void __exit i10nm_exit(void) { edac_dbg(2, "\n"); + + if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { + skx_set_decode(NULL, NULL); + if (retry_rd_err_log == 2) + 
enable_retry_rd_err_log(false); + } + teardown_i10nm_debug(); mce_unregister_decode_chain(&i10nm_mce_dec); skx_adxl_put(); @@ -359,5 +502,8 @@ static void __exit i10nm_exit(void) module_init(i10nm_init); module_exit(i10nm_exit); +module_param(retry_rd_err_log, int, 0444); +MODULE_PARM_DESC(retry_rd_err_log, "retry_rd_err_log: 0=off(default), 1=bios(Linux doesn't reset any control bits, but just reports values.), 2=linux(Linux tries to take control and resets mode bits, clear valid/UC bits after reading.)"); + MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("MC Driver for Intel 10nm server processors"); diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 416f429d1b97..9d9132fb09fb 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -231,7 +231,8 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) #define SKX_ILV_TARGET(tgt) ((tgt) & 7) static void skx_show_retry_rd_err_log(struct decoded_addr *res, - char *msg, int len) + char *msg, int len, + bool scrub_err) { u32 log0, log1, log2, log3, log4; u32 corr0, corr1, corr2, corr3; diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 969cb5261a0d..7100f947687e 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -494,6 +494,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); bool overflow = GET_BITFIELD(m->status, 62, 62); bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); + bool scrub_err = false; bool recoverable; int len; u32 core_err_cnt = GET_BITFIELD(m->status, 38, 52); @@ -545,6 +546,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, break; case 4: optype = "memory scrubbing error"; + scrub_err = true; break; default: optype = "reserved"; @@ -567,7 +569,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, } if (skx_show_retry_rd_err_log) - skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len); + skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len, scrub_err); edac_dbg(0, "%s\n", skx_msg); diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index bf56bebff138..22e174c740be 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -66,6 +66,8 @@ struct skx_dev { struct skx_channel { struct pci_dev *cdev; struct pci_dev *edev; + u32 retry_rd_err_log_s; + u32 retry_rd_err_log_d; struct skx_dimm { u8 close_pg; u8 bank_xor_enable; @@ -123,12 +125,15 @@ struct res_config { /* Per DDR channel memory-mapped I/O size */ int ddr_chan_mmio_sz; bool support_ddr5; + /* Offsets of retry_rd_err_log registers */ + u32 *offsets_scrub; + u32 *offsets_demand; }; typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci, struct res_config *cfg); typedef bool (*skx_decode_f)(struct decoded_addr *res); -typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len); +typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len, bool scrub_err); int __init skx_adxl_get(void); void __exit skx_adxl_put(void); -- Gitee From 659afbae7f513d4f13d1745edb67ea3fb6c70107 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 11 Jun 2021 10:01:18 -0700 Subject: [PATCH 2102/2161] EDAC/skx_common: Add new ADXL components for 2-level memory commit 2f4348e5a86198704368a699a7c4cdeb21d569f5 upstream. Some Intel servers may configure memory in 2 levels, using fast "near" memory (e.g. DDR) as a cache for larger, slower, "far" memory (e.g. 3D X-point). 
In these configurations the BIOS ADXL address translation for an address in a 2-level memory range will provide details of both the "near" and far components. Current exported ADXL components are only for 1-level memory system or for 2nd level memory of 2-level memory system. So add new ADXL components for 1st level memory of 2-level memory system to fully support 2-level memory system and the detection of memory error source(1st level memory or 2nd level memory). Intel-SIG: commit 2f4348e5a861 EDAC/skx_common: Add new ADXL components for 2-level memory. Backport to add EDAC 2LM support. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210611170123.1057025-2-tony.luck@intel.com [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- drivers/edac/skx_common.c | 67 ++++++++++++++++++++++++++++++++------- drivers/edac/skx_common.h | 11 +++++++ 2 files changed, 67 insertions(+), 11 deletions(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 7100f947687e..daa20f3ee3f2 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -23,10 +23,13 @@ #include "skx_common.h" static const char * const component_names[] = { - [INDEX_SOCKET] = "ProcessorSocketId", - [INDEX_MEMCTRL] = "MemoryControllerId", - [INDEX_CHANNEL] = "ChannelId", - [INDEX_DIMM] = "DimmSlotId", + [INDEX_SOCKET] = "ProcessorSocketId", + [INDEX_MEMCTRL] = "MemoryControllerId", + [INDEX_CHANNEL] = "ChannelId", + [INDEX_DIMM] = "DimmSlotId", + [INDEX_NM_MEMCTRL] = "NmMemoryControllerId", + [INDEX_NM_CHANNEL] = "NmChannelId", + [INDEX_NM_DIMM] = "NmDimmSlotId", }; static int component_indices[ARRAY_SIZE(component_names)]; @@ -34,12 +37,14 @@ static int adxl_component_count; static const char * const *adxl_component_names; static u64 *adxl_values; static char *adxl_msg; +static unsigned long adxl_nm_bitmap; static char skx_msg[MSG_SIZE]; static skx_decode_f skx_decode; static skx_show_retry_log_f skx_show_retry_rd_err_log; static u64 skx_tolm, skx_tohm; static LIST_HEAD(dev_edac_list); +static bool skx_mem_cfg_2lm; int __init skx_adxl_get(void) { @@ -56,14 +61,25 @@ int __init skx_adxl_get(void) for (j = 0; names[j]; j++) { if (!strcmp(component_names[i], names[j])) { component_indices[i] = j; + + if (i >= INDEX_NM_FIRST) + adxl_nm_bitmap |= 1 << i; + break; } } - if (!names[j]) + if (!names[j] && i < INDEX_NM_FIRST) goto err; } + if (skx_mem_cfg_2lm) { + if (!adxl_nm_bitmap) + skx_printk(KERN_NOTICE, "Not enough ADXL components for 2-level memory.\n"); + else + edac_dbg(2, "adxl_nm_bitmap: 0x%lx\n", adxl_nm_bitmap); + } + adxl_component_names = names; while (*names++) adxl_component_count++; @@ -99,7 +115,7 @@ void __exit skx_adxl_put(void) kfree(adxl_msg); } -static bool skx_adxl_decode(struct decoded_addr *res) +static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_mem) { struct skx_dev *d; int i, len = 0; @@ -116,11 +132,20 @@ static bool skx_adxl_decode(struct decoded_addr *res) } res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]]; - res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; - res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]]; - res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]]; + if (error_in_1st_level_mem) { + res->imc = (adxl_nm_bitmap & BIT_NM_MEMCTRL) ? + (int)adxl_values[component_indices[INDEX_NM_MEMCTRL]] : -1; + res->channel = (adxl_nm_bitmap & BIT_NM_CHANNEL) ? 
+ (int)adxl_values[component_indices[INDEX_NM_CHANNEL]] : -1; + res->dimm = (adxl_nm_bitmap & BIT_NM_DIMM) ? + (int)adxl_values[component_indices[INDEX_NM_DIMM]] : -1; + } else { + res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; + res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]]; + res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]]; + } - if (res->imc > NUM_IMC - 1) { + if (res->imc > NUM_IMC - 1 || res->imc < 0) { skx_printk(KERN_ERR, "Bad imc %d\n", res->imc); return false; } @@ -151,6 +176,11 @@ static bool skx_adxl_decode(struct decoded_addr *res) return true; } +void skx_set_mem_cfg(bool mem_cfg_2lm) +{ + skx_mem_cfg_2lm = mem_cfg_2lm; +} + void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log) { skx_decode = decode; @@ -580,6 +610,21 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, optype, skx_msg); } +static bool skx_error_in_1st_level_mem(const struct mce *m) +{ + u32 errcode; + + if (!skx_mem_cfg_2lm) + return false; + + errcode = GET_BITFIELD(m->status, 0, 15); + + if ((errcode & 0xef80) != 0x280) + return false; + + return true; +} + int skx_mce_check_error(struct notifier_block *nb, unsigned long val, void *data) { @@ -602,7 +647,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, res.addr = mce->addr; if (adxl_component_count) { - if (!skx_adxl_decode(&res)) + if (!skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce))) return NOTIFY_DONE; } else if (!skx_decode || !skx_decode(&res)) { return NOTIFY_DONE; diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 22e174c740be..de9f82977f0e 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -9,6 +9,8 @@ #ifndef _SKX_COMM_EDAC_H #define _SKX_COMM_EDAC_H +#include + #define MSG_SIZE 1024 /* @@ -94,9 +96,17 @@ enum { INDEX_MEMCTRL, INDEX_CHANNEL, INDEX_DIMM, + INDEX_NM_FIRST, + INDEX_NM_MEMCTRL = INDEX_NM_FIRST, + INDEX_NM_CHANNEL, + INDEX_NM_DIMM, INDEX_MAX }; +#define BIT_NM_MEMCTRL BIT_ULL(INDEX_NM_MEMCTRL) +#define BIT_NM_CHANNEL BIT_ULL(INDEX_NM_CHANNEL) +#define BIT_NM_DIMM BIT_ULL(INDEX_NM_DIMM) + struct decoded_addr { struct skx_dev *dev; u64 addr; @@ -138,6 +148,7 @@ typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int le int __init skx_adxl_get(void); void __exit skx_adxl_put(void); void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log); +void skx_set_mem_cfg(bool mem_cfg_2lm); int skx_get_src_id(struct skx_dev *d, int off, u8 *id); int skx_get_node_id(struct skx_dev *d, u8 *id); -- Gitee From 92c5e5a2eba4f78b2335822e6909bd0815cb163c Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 11 Jun 2021 10:01:19 -0700 Subject: [PATCH 2103/2161] EDAC/i10nm: Add detection of memory levels for ICX/SPR servers commit 4bd4d32e9a38d7ffb091b4109ab63c8f601e5678 upstream. Current i10nm_edac driver is only for system configured in 1-level memory. If the system is configured in 2-level memory, the driver doesn't report the 1st level memory DIMM for the error address, even if the error occurs in the 1st level memory. Both Ice Lake servers and Sapphire Rapids servers can be configured in 2-level memory. Add detection of memory levels to i10nm_edac for the two kinds of servers so that the driver can report the 2nd level memory DIMM or the 1st level memory DIMM according to error source. Intel-SIG: commit 4bd4d32e9a38 EDAC/i10nm: Add detection of memory levels for ICX/SPR servers. Backport to add EDAC 2LM support. 
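The detection itself reduces to scanning the SAD (system address decoder) entries for one that is both enabled and marked near-memory cacheable. A small standalone sketch of that check follows; register values are passed in directly rather than read from PCI config space, and bit 0 (enable) and bit 5 (NM cacheable) are the bits the patch tests:

/* Sketch only: 2-level-memory check as a pure function over SAD register values. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define GET_BITFIELD(v, lo, hi)	(((v) >> (lo)) & ((1u << ((hi) - (lo) + 1)) - 1))

#define SAD_ENABLE(reg)		GET_BITFIELD(reg, 0, 0)
#define SAD_NM_CACHEABLE(reg)	GET_BITFIELD(reg, 5, 5)

static bool is_2lm(const uint32_t *sad, int nr_sad)
{
	for (int i = 0; i < nr_sad; i++)
		if (SAD_ENABLE(sad[i]) && SAD_NM_CACHEABLE(sad[i]))
			return true;	/* some range is cached in near memory */
	return false;
}

int main(void)
{
	uint32_t sad[2] = { 0x1, 0x21 };	/* second entry: enabled + NM cacheable */

	printf("2-level memory configured: %s\n", is_2lm(sad, 2) ? "yes" : "no");
	return 0;
}
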
Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210611170123.1057025-3-tony.luck@intel.com [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 39 +++++++++++++++++++++++++++++++++++++++ drivers/edac/skx_common.h | 3 +++ 2 files changed, 42 insertions(+) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 0582a43c600f..ee996419aa65 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -24,6 +24,8 @@ pci_read_config_dword((d)->uracu, 0xd0, &(reg)) #define I10NM_GET_IMC_BAR(d, i, reg) \ pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg)) +#define I10NM_GET_SAD(d, offset, i, reg)\ + pci_read_config_dword((d)->sad_all, (offset) + (i) * 8, &(reg)) #define I10NM_GET_DIMMMTR(m, i, j) \ readl((m)->mbase + 0x2080c + (i) * (m)->chan_mmio_sz + (j) * 4) #define I10NM_GET_MCDDRTCFG(m, i) \ @@ -50,6 +52,10 @@ #define RETRY_RD_ERR_LOG_NOOVER_UC (BIT(14) | BIT(1)) #define RETRY_RD_ERR_LOG_OVER_UC_V (BIT(2) | BIT(1) | BIT(0)) +#define I10NM_MAX_SAD 16 +#define I10NM_SAD_ENABLE(reg) GET_BITFIELD(reg, 0, 0) +#define I10NM_SAD_NM_CACHEABLE(reg) GET_BITFIELD(reg, 5, 5) + static struct list_head *i10nm_edac_list; static struct res_config *res_cfg; @@ -186,6 +192,31 @@ static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus, return pdev; } +static bool i10nm_check_2lm(struct res_config *cfg) +{ + struct skx_dev *d; + u32 reg; + int i; + + list_for_each_entry(d, i10nm_edac_list, list) { + d->sad_all = pci_get_dev_wrapper(d->seg, d->bus[1], + PCI_SLOT(cfg->sad_all_devfn), + PCI_FUNC(cfg->sad_all_devfn)); + if (!d->sad_all) + continue; + + for (i = 0; i < I10NM_MAX_SAD; i++) { + I10NM_GET_SAD(d, cfg->sad_all_offset, i, reg); + if (I10NM_SAD_ENABLE(reg) && I10NM_SAD_NM_CACHEABLE(reg)) { + edac_dbg(2, "2-level memory configuration.\n"); + return true; + } + } + } + + return false; +} + static int i10nm_get_all_munits(void) { struct pci_dev *mdev; @@ -255,6 +286,8 @@ static struct res_config i10nm_cfg0 = { .decs_did = 0x3452, .busno_cfg_offset = 0xcc, .ddr_chan_mmio_sz = 0x4000, + .sad_all_devfn = PCI_DEVFN(29, 0), + .sad_all_offset = 0x108, .offsets_scrub = offsets_scrub_icx, .offsets_demand = offsets_demand_icx, }; @@ -264,6 +297,8 @@ static struct res_config i10nm_cfg1 = { .decs_did = 0x3452, .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x4000, + .sad_all_devfn = PCI_DEVFN(29, 0), + .sad_all_offset = 0x108, .offsets_scrub = offsets_scrub_icx, .offsets_demand = offsets_demand_icx, }; @@ -274,6 +309,8 @@ static struct res_config spr_cfg = { .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x8000, .support_ddr5 = true, + .sad_all_devfn = PCI_DEVFN(10, 0), + .sad_all_offset = 0x300, .offsets_scrub = offsets_scrub_spr, .offsets_demand = offsets_demand_spr, }; @@ -429,6 +466,8 @@ static int __init i10nm_init(void) return -ENODEV; } + skx_set_mem_cfg(i10nm_check_2lm(cfg)); + rc = i10nm_get_all_munits(); if (rc < 0) goto fail; diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index de9f82977f0e..1fb7540a7092 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -135,6 +135,9 @@ struct res_config { /* Per DDR channel memory-mapped I/O size */ int ddr_chan_mmio_sz; bool support_ddr5; + /* SAD device number and function number */ + unsigned int sad_all_devfn; + int sad_all_offset; /* Offsets of retry_rd_err_log registers */ u32 *offsets_scrub; u32 *offsets_demand; -- Gitee From 81740bf45a9f15329806b85be9e730cb32d92fdf Mon Sep 17 00:00:00 2001 From: Qiuxu 
Zhuo Date: Thu, 21 Jul 2022 13:25:07 +0800 Subject: [PATCH 2104/2161] EDAC/i10nm: Add support for high bandwidth memory commit c945088384d00e6eb61535cc4ba25bc062090909 upstream. A future Xeon processor will include in-package HBM (high bandwidth memory). The in-package HBM memory controller shares the same architecture with the regular DDR memory controller. Add the HBM memory controller devices for EDAC support. Intel-SIG: commit c945088384d0 EDAC/i10nm: Add support for high bandwidth memory. Backport to add EDAC HBM support. Tested-by: Hongyu Ning Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210611170123.1057025-4-tony.luck@intel.com [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 132 ++++++++++++++++++++++++++++++++++---- drivers/edac/skx_common.c | 15 +++-- drivers/edac/skx_common.h | 20 +++++- 3 files changed, 148 insertions(+), 19 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index ee996419aa65..0c8b6d080ca0 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -13,7 +13,7 @@ #include "edac_module.h" #include "skx_common.h" -#define I10NM_REVISION "v0.0.4" +#define I10NM_REVISION "v0.0.5" #define EDAC_MOD_STR "i10nm_edac" /* Debug macros */ @@ -26,14 +26,22 @@ pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg)) #define I10NM_GET_SAD(d, offset, i, reg)\ pci_read_config_dword((d)->sad_all, (offset) + (i) * 8, &(reg)) +#define I10NM_GET_HBM_IMC_BAR(d, reg) \ + pci_read_config_dword((d)->uracu, 0xd4, &(reg)) +#define I10NM_GET_CAPID3_CFG(d, reg) \ + pci_read_config_dword((d)->pcu_cr3, 0x90, &(reg)) #define I10NM_GET_DIMMMTR(m, i, j) \ - readl((m)->mbase + 0x2080c + (i) * (m)->chan_mmio_sz + (j) * 4) + readl((m)->mbase + ((m)->hbm_mc ? 0x80c : 0x2080c) + \ + (i) * (m)->chan_mmio_sz + (j) * 4) #define I10NM_GET_MCDDRTCFG(m, i) \ - readl((m)->mbase + 0x20970 + (i) * (m)->chan_mmio_sz) + readl((m)->mbase + ((m)->hbm_mc ? 0x970 : 0x20970) + \ + (i) * (m)->chan_mmio_sz) #define I10NM_GET_MCMTR(m, i) \ - readl((m)->mbase + 0x20ef8 + (i) * (m)->chan_mmio_sz) + readl((m)->mbase + ((m)->hbm_mc ? 0xef8 : 0x20ef8) + \ + (i) * (m)->chan_mmio_sz) #define I10NM_GET_AMAP(m, i) \ - readl((m)->mbase + 0x20814 + (i) * (m)->chan_mmio_sz) + readl((m)->mbase + ((m)->hbm_mc ? 
0x814 : 0x20814) + \ + (i) * (m)->chan_mmio_sz) #define I10NM_GET_REG32(m, i, offset) \ readl((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) #define I10NM_GET_REG64(m, i, offset) \ @@ -45,6 +53,12 @@ #define I10NM_GET_IMC_MMIO_OFFSET(reg) (GET_BITFIELD(reg, 0, 10) << 12) #define I10NM_GET_IMC_MMIO_SIZE(reg) ((GET_BITFIELD(reg, 13, 23) - \ GET_BITFIELD(reg, 0, 10) + 1) << 12) +#define I10NM_GET_HBM_IMC_MMIO_OFFSET(reg) \ + ((GET_BITFIELD(reg, 0, 10) << 12) + 0x140000) + +#define I10NM_HBM_IMC_MMIO_SIZE 0x9000 +#define I10NM_IS_HBM_PRESENT(reg) GET_BITFIELD(reg, 27, 30) +#define I10NM_IS_HBM_IMC(reg) GET_BITFIELD(reg, 29, 29) #define RETRY_RD_ERR_LOG_UC BIT(1) #define RETRY_RD_ERR_LOG_NOOVER BIT(14) @@ -217,7 +231,7 @@ static bool i10nm_check_2lm(struct res_config *cfg) return false; } -static int i10nm_get_all_munits(void) +static int i10nm_get_ddr_munits(void) { struct pci_dev *mdev; void __iomem *mbase; @@ -245,7 +259,7 @@ static int i10nm_get_all_munits(void) edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n", j++, base, reg); - for (i = 0; i < I10NM_NUM_IMC; i++) { + for (i = 0; i < I10NM_NUM_DDR_IMC; i++) { mdev = pci_get_dev_wrapper(d->seg, d->bus[0], 12 + i, 0); if (i == 0 && !mdev) { @@ -281,6 +295,90 @@ static int i10nm_get_all_munits(void) return 0; } +static bool i10nm_check_hbm_imc(struct skx_dev *d) +{ + u32 reg; + + if (I10NM_GET_CAPID3_CFG(d, reg)) { + i10nm_printk(KERN_ERR, "Failed to get capid3_cfg\n"); + return false; + } + + return I10NM_IS_HBM_PRESENT(reg) != 0; +} + +static int i10nm_get_hbm_munits(void) +{ + struct pci_dev *mdev; + void __iomem *mbase; + u32 reg, off, mcmtr; + struct skx_dev *d; + int i, lmc; + u64 base; + + list_for_each_entry(d, i10nm_edac_list, list) { + d->pcu_cr3 = pci_get_dev_wrapper(d->seg, d->bus[1], 30, 3); + if (!d->pcu_cr3) + return -ENODEV; + + if (!i10nm_check_hbm_imc(d)) { + i10nm_printk(KERN_DEBUG, "No hbm memory\n"); + return -ENODEV; + } + + if (I10NM_GET_SCK_BAR(d, reg)) { + i10nm_printk(KERN_ERR, "Failed to get socket bar\n"); + return -ENODEV; + } + base = I10NM_GET_SCK_MMIO_BASE(reg); + + if (I10NM_GET_HBM_IMC_BAR(d, reg)) { + i10nm_printk(KERN_ERR, "Failed to get hbm mc bar\n"); + return -ENODEV; + } + base += I10NM_GET_HBM_IMC_MMIO_OFFSET(reg); + + lmc = I10NM_NUM_DDR_IMC; + + for (i = 0; i < I10NM_NUM_HBM_IMC; i++) { + mdev = pci_get_dev_wrapper(d->seg, d->bus[0], + 12 + i / 4, 1 + i % 4); + if (i == 0 && !mdev) { + i10nm_printk(KERN_ERR, "No hbm mc found\n"); + return -ENODEV; + } + if (!mdev) + continue; + + d->imc[lmc].mdev = mdev; + off = i * I10NM_HBM_IMC_MMIO_SIZE; + + edac_dbg(2, "hbm mc%d mmio base 0x%llx size 0x%x\n", + lmc, base + off, I10NM_HBM_IMC_MMIO_SIZE); + + mbase = ioremap(base + off, I10NM_HBM_IMC_MMIO_SIZE); + if (!mbase) { + i10nm_printk(KERN_ERR, "Failed to ioremap for hbm mc 0x%llx\n", + base + off); + return -ENOMEM; + } + + d->imc[lmc].mbase = mbase; + d->imc[lmc].hbm_mc = true; + + mcmtr = I10NM_GET_MCMTR(&d->imc[lmc], 0); + if (!I10NM_IS_HBM_IMC(mcmtr)) { + i10nm_printk(KERN_ERR, "This isn't an hbm mc!\n"); + return -ENODEV; + } + + lmc++; + } + } + + return 0; +} + static struct res_config i10nm_cfg0 = { .type = I10NM, .decs_did = 0x3452, @@ -308,6 +406,7 @@ static struct res_config spr_cfg = { .decs_did = 0x3252, .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x8000, + .hbm_chan_mmio_sz = 0x4000, .support_ddr5 = true, .sad_all_devfn = PCI_DEVFN(10, 0), .sad_all_offset = 0x300, @@ -343,14 +442,14 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci, struct dimm_info *dimm; int i, j, ndimms; 
- for (i = 0; i < I10NM_NUM_CHANNELS; i++) { + for (i = 0; i < imc->num_channels; i++) { if (!imc->mbase) continue; ndimms = 0; amap = I10NM_GET_AMAP(imc, i); mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i); - for (j = 0; j < I10NM_NUM_DIMMS; j++) { + for (j = 0; j < imc->num_dimms; j++) { dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, j, 0); mtr = I10NM_GET_DIMMMTR(imc, i, j); @@ -468,8 +567,9 @@ static int __init i10nm_init(void) skx_set_mem_cfg(i10nm_check_2lm(cfg)); - rc = i10nm_get_all_munits(); - if (rc < 0) + rc = i10nm_get_ddr_munits(); + + if (i10nm_get_hbm_munits() && rc) goto fail; list_for_each_entry(d, i10nm_edac_list, list) { @@ -490,7 +590,15 @@ static int __init i10nm_init(void) d->imc[i].lmc = i; d->imc[i].src_id = src_id; d->imc[i].node_id = node_id; - d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz; + if (d->imc[i].hbm_mc) { + d->imc[i].chan_mmio_sz = cfg->hbm_chan_mmio_sz; + d->imc[i].num_channels = I10NM_NUM_HBM_CHANNELS; + d->imc[i].num_dimms = I10NM_NUM_HBM_DIMMS; + } else { + d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz; + d->imc[i].num_channels = I10NM_NUM_DDR_CHANNELS; + d->imc[i].num_dimms = I10NM_NUM_DDR_DIMMS; + } rc = skx_register_mci(&d->imc[i], d->imc[i].mdev, "Intel_10nm Socket", EDAC_MOD_STR, diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index daa20f3ee3f2..4a0c94b897bb 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -343,9 +343,9 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, ranks = numrank(mtr); rows = numrow(mtr); - cols = numcol(mtr); + cols = imc->hbm_mc ? 6 : numcol(mtr); - if (cfg->support_ddr5 && (amap & 0x8)) { + if (cfg->support_ddr5 && ((amap & 0x8) || imc->hbm_mc)) { banks = 32; mtype = MEM_DDR5; } else { @@ -374,8 +374,13 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, dimm->dtype = get_width(mtr); dimm->mtype = mtype; dimm->edac_mode = EDAC_SECDED; /* likely better than this */ - snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", - imc->src_id, imc->lmc, chan, dimmno); + + if (imc->hbm_mc) + snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_HBMC#%u_Chan#%u", + imc->src_id, imc->lmc, chan); + else + snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", + imc->src_id, imc->lmc, chan, dimmno); return 1; } @@ -708,6 +713,8 @@ void skx_remove(void) } if (d->util_all) pci_dev_put(d->util_all); + if (d->pcu_cr3) + pci_dev_put(d->pcu_cr3); if (d->sad_all) pci_dev_put(d->sad_all); if (d->uracu) diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 1fb7540a7092..03ac067a80b9 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -32,9 +32,17 @@ #define SKX_NUM_CHANNELS 3 /* Channels per memory controller */ #define SKX_NUM_DIMMS 2 /* Max DIMMS per channel */ -#define I10NM_NUM_IMC 4 -#define I10NM_NUM_CHANNELS 2 -#define I10NM_NUM_DIMMS 2 +#define I10NM_NUM_DDR_IMC 4 +#define I10NM_NUM_DDR_CHANNELS 2 +#define I10NM_NUM_DDR_DIMMS 2 + +#define I10NM_NUM_HBM_IMC 16 +#define I10NM_NUM_HBM_CHANNELS 2 +#define I10NM_NUM_HBM_DIMMS 1 + +#define I10NM_NUM_IMC (I10NM_NUM_DDR_IMC + I10NM_NUM_HBM_IMC) +#define I10NM_NUM_CHANNELS MAX(I10NM_NUM_DDR_CHANNELS, I10NM_NUM_HBM_CHANNELS) +#define I10NM_NUM_DIMMS MAX(I10NM_NUM_DDR_DIMMS, I10NM_NUM_HBM_DIMMS) #define MAX(a, b) ((a) > (b) ? 
(a) : (b)) #define NUM_IMC MAX(SKX_NUM_IMC, I10NM_NUM_IMC) @@ -56,12 +64,16 @@ struct skx_dev { struct pci_dev *sad_all; struct pci_dev *util_all; struct pci_dev *uracu; /* for i10nm CPU */ + struct pci_dev *pcu_cr3; /* for HBM memory detection */ u32 mcroute; struct skx_imc { struct mem_ctl_info *mci; struct pci_dev *mdev; /* for i10nm CPU */ void __iomem *mbase; /* for i10nm CPU */ int chan_mmio_sz; /* for i10nm CPU */ + int num_channels; /* channels per memory controller */ + int num_dimms; /* dimms per channel */ + bool hbm_mc; u8 mc; /* system wide mc# */ u8 lmc; /* socket relative mc# */ u8 src_id, node_id; @@ -134,6 +146,8 @@ struct res_config { int busno_cfg_offset; /* Per DDR channel memory-mapped I/O size */ int ddr_chan_mmio_sz; + /* Per HBM channel memory-mapped I/O size */ + int hbm_chan_mmio_sz; bool support_ddr5; /* SAD device number and function number */ unsigned int sad_all_devfn; -- Gitee From 9544554bef89ef667392af2752cf449009726d13 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 1 Sep 2022 12:43:08 -0700 Subject: [PATCH 2105/2161] EDAC/skx_common: Use driver decoder first commit fe32f366931a950889e8d72be86fafc867dab777 upstream. The performance of driver decoder[1] is better than the performance of firmware decoder[2], especially on frequent correctable errors. So use the driver decoder first, fall back to firmware decoder if the driver decoder is unavailable. Also rename the function pointer skx_decode to driver_decode (better name to contrast with adxl_decode). [1] Decode errors by extracting error information from registers of memory controllers and/or MCA bank registers. [2] Decode errors by calling ACPI DSM methods. Intel-SIG: commit fe32f366931a EDAC/skx_common: Use driver decoder first. Backport to decode DDR error by MCA bank registers in replace of firmware. 
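The policy is simply an ordered attempt over two optional decode callbacks. A standalone sketch of the pattern (types and stub decoders here are illustrative, not the driver's):

/* Sketch only: driver decoder first, firmware (ADXL) decoder as fallback. */
#include <stdbool.h>
#include <stdio.h>

struct decoded_addr {
	unsigned long long addr;
	int socket, imc, channel, dimm;
	bool decoded_by_adxl;
};

typedef bool (*decode_f)(struct decoded_addr *res);

static bool driver_decode_stub(struct decoded_addr *res)
{
	(void)res;
	return false;			/* pretend the registers were not conclusive */
}

static bool adxl_decode_stub(struct decoded_addr *res)
{
	res->decoded_by_adxl = true;	/* firmware (ACPI DSM) path succeeded */
	return true;
}

static bool decode_addr(struct decoded_addr *res, decode_f drv, decode_f fw)
{
	if (drv && drv(res))		/* fast path: decode from MC/MCA registers */
		return true;
	return fw && fw(res);		/* fallback: ask the BIOS via ADXL */
}

int main(void)
{
	struct decoded_addr res = { .addr = 0x123456789ull };
	bool ok = decode_addr(&res, driver_decode_stub, adxl_decode_stub);

	printf("decoded: %d (by adxl: %d)\n", ok, res.decoded_by_adxl);
	return 0;
}
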
Co-developed-by: Youquan Song Signed-off-by: Youquan Song Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/all/20220901194310.115427-1-tony.luck@intel.com/ [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- drivers/edac/skx_base.c | 9 +++++++-- drivers/edac/skx_common.c | 16 +++++++++------- drivers/edac/skx_common.h | 1 + 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 9d9132fb09fb..1d078c37e856 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -712,8 +712,13 @@ static int __init skx_init(void) skx_set_decode(skx_decode, skx_show_retry_rd_err_log); - if (nvdimm_count && skx_adxl_get() == -ENODEV) - skx_printk(KERN_NOTICE, "Only decoding DDR4 address!\n"); + if (nvdimm_count && skx_adxl_get() != -ENODEV) { + skx_set_decode(NULL, skx_show_retry_rd_err_log); + } else { + if (nvdimm_count) + skx_printk(KERN_NOTICE, "Only decoding DDR4 address!\n"); + skx_set_decode(skx_decode, skx_show_retry_rd_err_log); + } /* Ensure that the OPSTATE is set correctly for POLL or NMI */ opstate_init(); diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 4a0c94b897bb..94fbffde174c 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -40,7 +40,7 @@ static char *adxl_msg; static unsigned long adxl_nm_bitmap; static char skx_msg[MSG_SIZE]; -static skx_decode_f skx_decode; +static skx_decode_f driver_decode; static skx_show_retry_log_f skx_show_retry_rd_err_log; static u64 skx_tolm, skx_tohm; static LIST_HEAD(dev_edac_list); @@ -173,6 +173,8 @@ static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_me break; } + res->decoded_by_adxl = true; + return true; } @@ -183,7 +185,7 @@ void skx_set_mem_cfg(bool mem_cfg_2lm) void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log) { - skx_decode = decode; + driver_decode = decode; skx_show_retry_rd_err_log = show_retry_log; } @@ -588,7 +590,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, break; } } - if (adxl_component_count) { + if (res->decoded_by_adxl) { len = snprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x %s", overflow ? " OVERFLOW" : "", (uncorrected_error && recoverable) ? " recoverable" : "", @@ -651,11 +653,11 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, memset(&res, 0, sizeof(res)); res.addr = mce->addr; - if (adxl_component_count) { - if (!skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce))) + /* Try driver decoder first */ + if (!(driver_decode && driver_decode(&res))) { + /* Then try firmware decoder (ACPI DSM methods) */ + if (!(adxl_component_count && skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce)))) return NOTIFY_DONE; - } else if (!skx_decode || !skx_decode(&res)) { - return NOTIFY_DONE; } mci = res.dev->imc[res.imc].mci; diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 03ac067a80b9..880ecd15ca42 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -136,6 +136,7 @@ struct decoded_addr { int column; int bank_address; int bank_group; + bool decoded_by_adxl; }; struct res_config { -- Gitee From 2a3ba6b94236570a04054407ea5340eae3581303 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 1 Sep 2022 12:43:09 -0700 Subject: [PATCH 2106/2161] EDAC/skx_common: Make output format similar commit 627d551a9e75ef81525822ba2a0d9d5a64791d89 upstream. 
The decoded output format of driver decoder is different from the output format of firmware decoder. Make output format similar regardless of decode function (Align driver decoder's to firmware decoder's). Intel-SIG: commit 627d551a9e75 EDAC/skx_common: Make output format similar. Backport to decode DDR error by MCA bank registers in replace of firmware. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/all/20220901194310.115427-1-tony.luck@intel.com/ [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- drivers/edac/skx_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 94fbffde174c..28ee46628202 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -597,12 +597,12 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, mscod, errcode, adxl_msg); } else { len = snprintf(skx_msg, MSG_SIZE, - "%s%s err_code:0x%04x:0x%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:0x%x col:0x%x", + "%s%s err_code:0x%04x:0x%04x ProcessorSocketId:0x%x MemoryControllerId:0x%x PhysicalRankId:0x%x Row:0x%x Column:0x%x Bank:0x%x BankGroup:0x%x", overflow ? " OVERFLOW" : "", (uncorrected_error && recoverable) ? " recoverable" : "", mscod, errcode, res->socket, res->imc, res->rank, - res->bank_group, res->bank_address, res->row, res->column); + res->row, res->column, res->bank_address, res->bank_group); } if (skx_show_retry_rd_err_log) -- Gitee From 6b6f1d506f354446b1544da3eb5d3bc6fcd7f434 Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Thu, 1 Sep 2022 12:43:10 -0700 Subject: [PATCH 2107/2161] EDAC/i10nm: Add driver decoder for Ice Lake and Tremont CPUs commit 2738c69a8813453b35549465867ae591f8598eb0 upstream. Current i10nm_edac only supports firmware decoder (ACPI DSM methods). MCA bank registers of Ice Lake or Tremont CPUs contain the information to decode DDR memory errors. To get better decoding performance, add the driver decoder (decoding DDR memory errors via extracting error information from MCA bank registers) for Ice Lake and Tremont CPUs. Intel-SIG: commit 2738c69a8813 EDAC/i10nm: Add driver decoder for Ice Lake and Tremont CPUs. Backport to decode DDR error by MCA bank registers in replace of firmware. 
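The driver decoder is essentially bit-field extraction from IA32_MCi_MISC. A standalone sketch using the bit positions this patch applies to Ice Lake / Tremont memory-controller banks (the GET_BITFIELD helper here is a simplified local definition, not the kernel's):

#include <stdint.h>
#include <stdio.h>

#define GET_BITFIELD(v, lo, hi) \
	(((v) >> (lo)) & ((1ull << ((hi) - (lo) + 1)) - 1))

struct dimm_loc {
	unsigned int column, row, bank_group, bank_address, rank, dimm;
};

static struct dimm_loc decode_misc(uint64_t misc)
{
	struct dimm_loc loc;

	loc.column       = GET_BITFIELD(misc, 9, 18) << 2;
	loc.row          = GET_BITFIELD(misc, 19, 39);
	loc.bank_group   = GET_BITFIELD(misc, 40, 41) |
			   (GET_BITFIELD(misc, 44, 44) << 2);
	loc.bank_address = GET_BITFIELD(misc, 42, 43);
	loc.rank         = GET_BITFIELD(misc, 56, 58);
	loc.dimm         = loc.rank >> 2;	/* ranks 0-3 -> DIMM 0, ranks 4-7 -> DIMM 1 */
	loc.rank        %= 4;
	return loc;
}

int main(void)
{
	struct dimm_loc loc = decode_misc(0x0123456789abcdefull);

	printf("row 0x%x col 0x%x bg %u ba %u dimm %u rank %u\n",
	       loc.row, loc.column, loc.bank_group, loc.bank_address,
	       loc.dimm, loc.rank);
	return 0;
}
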
Co-developed-by: Qiuxu Zhuo Signed-off-by: Qiuxu Zhuo Signed-off-by: Youquan Song Signed-off-by: Tony Luck Link: https://lore.kernel.org/all/20220901194310.115427-1-tony.luck@intel.com/ [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- arch/x86/include/asm/mce.h | 1 + drivers/edac/i10nm_base.c | 134 ++++++++++++++++++++++++++++++++++++- drivers/edac/skx_common.c | 1 + drivers/edac/skx_common.h | 5 ++ 4 files changed, 139 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index dd2e21b130eb..124b27c2cf69 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -42,6 +42,7 @@ #define MCI_STATUS_CEC_SHIFT 38 /* Corrected Error Count */ #define MCI_STATUS_CEC_MASK GENMASK_ULL(52,38) #define MCI_STATUS_CEC(c) (((c) & MCI_STATUS_CEC_MASK) >> MCI_STATUS_CEC_SHIFT) +#define MCI_STATUS_MSCOD(m) (((m) >> 16) & 0xffff) /* AMD-specific bits */ #define MCI_STATUS_TCC BIT_ULL(55) /* Task context corrupt */ diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 0c8b6d080ca0..6ffd6b7c7262 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -74,6 +74,8 @@ static struct list_head *i10nm_edac_list; static struct res_config *res_cfg; static int retry_rd_err_log; +static int decoding_via_mca; +static bool mem_cfg_2lm; static u32 offsets_scrub_icx[] = {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8}; static u32 offsets_scrub_spr[] = {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8}; @@ -231,6 +233,103 @@ static bool i10nm_check_2lm(struct res_config *cfg) return false; } +/* + * Check whether the error comes from DDRT by ICX/Tremont model specific error code. + * Refer to SDM vol3B 16.11.3 Intel IMC MC error codes for IA32_MCi_STATUS. + */ +static bool i10nm_mscod_is_ddrt(u32 mscod) +{ + switch (mscod) { + case 0x0106: case 0x0107: + case 0x0800: case 0x0804: + case 0x0806 ... 0x0808: + case 0x080a ... 
0x080e: + case 0x0810: case 0x0811: + case 0x0816: case 0x081e: + case 0x081f: + return true; + } + + return false; +} + +static bool i10nm_mc_decode_available(struct mce *mce) +{ + u8 bank; + + if (!decoding_via_mca || mem_cfg_2lm) + return false; + + if ((mce->status & (MCI_STATUS_MISCV | MCI_STATUS_ADDRV)) + != (MCI_STATUS_MISCV | MCI_STATUS_ADDRV)) + return false; + + bank = mce->bank; + + switch (res_cfg->type) { + case I10NM: + if (bank < 13 || bank > 26) + return false; + + /* DDRT errors can't be decoded from MCA bank registers */ + if (MCI_MISC_ECC_MODE(mce->misc) == MCI_MISC_ECC_DDRT) + return false; + + if (i10nm_mscod_is_ddrt(MCI_STATUS_MSCOD(mce->status))) + return false; + + /* Check whether one of {13,14,17,18,21,22,25,26} */ + return ((bank - 13) & BIT(1)) == 0; + default: + return false; + } +} + +static bool i10nm_mc_decode(struct decoded_addr *res) +{ + struct mce *m = res->mce; + struct skx_dev *d; + u8 bank; + + if (!i10nm_mc_decode_available(m)) + return false; + + list_for_each_entry(d, i10nm_edac_list, list) { + if (d->imc[0].src_id == m->socketid) { + res->socket = m->socketid; + res->dev = d; + break; + } + } + + switch (res_cfg->type) { + case I10NM: + bank = m->bank - 13; + res->imc = bank / 4; + res->channel = bank % 2; + break; + default: + return false; + } + + if (!res->dev) { + skx_printk(KERN_ERR, "No device for src_id %d imc %d\n", + m->socketid, res->imc); + return false; + } + + res->column = GET_BITFIELD(m->misc, 9, 18) << 2; + res->row = GET_BITFIELD(m->misc, 19, 39); + res->bank_group = GET_BITFIELD(m->misc, 40, 41); + res->bank_address = GET_BITFIELD(m->misc, 42, 43); + res->bank_group |= GET_BITFIELD(m->misc, 44, 44) << 2; + res->rank = GET_BITFIELD(m->misc, 56, 58); + res->dimm = res->rank >> 2; + res->rank = res->rank % 4; + + return true; +} + static int i10nm_get_ddr_munits(void) { struct pci_dev *mdev; @@ -565,7 +664,8 @@ static int __init i10nm_init(void) return -ENODEV; } - skx_set_mem_cfg(i10nm_check_2lm(cfg)); + mem_cfg_2lm = i10nm_check_2lm(cfg); + skx_set_mem_cfg(mem_cfg_2lm); rc = i10nm_get_ddr_munits(); @@ -617,9 +717,11 @@ static int __init i10nm_init(void) setup_i10nm_debug(); if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { - skx_set_decode(NULL, show_retry_rd_err_log); + skx_set_decode(i10nm_mc_decode, show_retry_rd_err_log); if (retry_rd_err_log == 2) enable_retry_rd_err_log(true); + } else { + skx_set_decode(i10nm_mc_decode, NULL); } i10nm_printk(KERN_INFO, "%s\n", I10NM_REVISION); @@ -649,6 +751,34 @@ static void __exit i10nm_exit(void) module_init(i10nm_init); module_exit(i10nm_exit); +static int set_decoding_via_mca(const char *buf, const struct kernel_param *kp) +{ + unsigned long val; + int ret; + + ret = kstrtoul(buf, 0, &val); + + if (ret || val > 1) + return -EINVAL; + + if (val && mem_cfg_2lm) { + i10nm_printk(KERN_NOTICE, "Decoding errors via MCA banks for 2LM isn't supported yet\n"); + return -EIO; + } + + ret = param_set_int(buf, kp); + + return ret; +} + +static const struct kernel_param_ops decoding_via_mca_param_ops = { + .set = set_decoding_via_mca, + .get = param_get_int, +}; + +module_param_cb(decoding_via_mca, &decoding_via_mca_param_ops, &decoding_via_mca, 0644); +MODULE_PARM_DESC(decoding_via_mca, "decoding_via_mca: 0=off(default), 1=enable"); + module_param(retry_rd_err_log, int, 0444); MODULE_PARM_DESC(retry_rd_err_log, "retry_rd_err_log: 0=off(default), 1=bios(Linux doesn't reset any control bits, but just reports values.), 2=linux(Linux tries to take control and resets mode 
bits, clear valid/UC bits after reading.)"); diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 28ee46628202..b9a9b4a274ab 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -651,6 +651,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, return NOTIFY_DONE; memset(&res, 0, sizeof(res)); + res.mce = mce; res.addr = mce->addr; /* Try driver decoder first */ diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 880ecd15ca42..c542f1562825 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -10,6 +10,7 @@ #define _SKX_COMM_EDAC_H #include +#include #define MSG_SIZE 1024 @@ -52,6 +53,9 @@ #define IS_DIMM_PRESENT(r) GET_BITFIELD(r, 15, 15) #define IS_NVDIMM_PRESENT(r, i) GET_BITFIELD(r, i, i) +#define MCI_MISC_ECC_MODE(m) (((m) >> 59) & 15) +#define MCI_MISC_ECC_DDRT 8 /* read from DDRT */ + /* * Each cpu socket contains some pci devices that provide global * information, and also some that are local to each of the two @@ -120,6 +124,7 @@ enum { #define BIT_NM_DIMM BIT_ULL(INDEX_NM_DIMM) struct decoded_addr { + struct mce *mce; struct skx_dev *dev; u64 addr; int socket; -- Gitee From d2aeda874ca2b54c94c424bf8fb1142dd112a83a Mon Sep 17 00:00:00 2001 From: Naveen Krishna Chatradhi Date: Wed, 30 Jun 2021 20:58:24 +0530 Subject: [PATCH 2108/2161] EDAC/mc: Add new HBM2 memory type commit e1ca90b7cc5cb5d3a38321cbb65ad36a59fcb574 upstream. Add a new entry to 'enum mem_type' and a new string to 'edac_mem_types[]' for HBM2 (High Bandwidth Memory Gen 2) new memory type. Intel-SIG: commit e1ca90b7cc5c EDAC/mc: Add new HBM2 memory type. Backport to add EDAC HBM support. Reviewed-by: Yazen Ghannam Signed-off-by: Muralidhara M K Signed-off-by: Naveen Krishna Chatradhi Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210630152828.162659-4-nchatrad@amd.com [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- drivers/edac/edac_mc.c | 1 + include/linux/edac.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 4b0a33a674fd..7dbf4935d580 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -216,6 +216,7 @@ const char * const edac_mem_types[] = { [MEM_LRDDR4] = "Load-Reduced-DDR4-RAM", [MEM_DDR5] = "Unbuffered-DDR5", [MEM_NVDIMM] = "Non-volatile-RAM", + [MEM_HBM2] = "High-bandwidth-memory-Gen2", }; EXPORT_SYMBOL_GPL(edac_mem_types); diff --git a/include/linux/edac.h b/include/linux/edac.h index ad56bfa55a0b..47c1f57b31d9 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -189,6 +189,7 @@ static inline char *mc_event_error_type(const unsigned int err_type) * @MEM_LRDDR4: Load-Reduced DDR4 memory. * @MEM_DDR5: Unbuffered DDR5 RAM * @MEM_NVDIMM: Non-volatile RAM + * @MEM_HBM2: High bandwidth Memory Gen 2. */ enum mem_type { MEM_EMPTY = 0, @@ -214,6 +215,7 @@ enum mem_type { MEM_LRDDR4, MEM_DDR5, MEM_NVDIMM, + MEM_HBM2, }; #define MEM_FLAG_EMPTY BIT(MEM_EMPTY) @@ -238,6 +240,7 @@ enum mem_type { #define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) #define MEM_FLAG_DDR5 BIT(MEM_DDR5) #define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) +#define MEM_FLAG_HBM2 BIT(MEM_HBM2) /** * enum edac-type - Error Detection and Correction capabilities and mode -- Gitee From abe327724f18ff4fbf3e56bb07e26e33339e4275 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Tue, 20 Jul 2021 09:30:09 -0700 Subject: [PATCH 2109/2161] EDAC/skx_common: Set the memory type correctly for HBM memory commit fd07a4a0d30b5468a1f4a0739e34f5f014df7d44 upstream. 
Set the memory type to MEM_HBM2 if it's managed by the HBM2 memory controller. Intel-SIG: commit fd07a4a0d30b EDAC/skx_common: Set the memory type correctly for HBM memory. Backport to add EDAC HBM support. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210720163009.GA1417532@agluck-desk2.amr.corp.intel.com [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- drivers/edac/skx_common.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index b9a9b4a274ab..338936760c41 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -347,7 +347,10 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, rows = numrow(mtr); cols = imc->hbm_mc ? 6 : numcol(mtr); - if (cfg->support_ddr5 && ((amap & 0x8) || imc->hbm_mc)) { + if (imc->hbm_mc) { + banks = 32; + mtype = MEM_HBM2; + } else if (cfg->support_ddr5 && (amap & 0x8)) { banks = 32; mtype = MEM_DDR5; } else { -- Gitee From e3d0982a323f9c27abf59e77169749ca08cae5db Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 24 Dec 2021 04:11:26 -0500 Subject: [PATCH 2110/2161] EDAC/i10nm: Release mdev/mbase when failing to detect HBM commit c370baa328022cbd46c59c821d1b467a97f047be upstream. On systems without HBM (High Bandwidth Memory) mdev/mbase are not released/unmapped. Add the code to release mdev/mbase when failing to detect HBM. [Tony: re-word commit message] Intel-SIG: commit c370baa32802 EDAC/i10nm: Release mdev/mbase when failing to detect HBM. Backport to add EDAC HBM support. Cc: Fixes: c945088384d0 ("EDAC/i10nm: Add support for high bandwidth memory") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20211224091126.1246-1-qiuxu.zhuo@intel.com [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 6ffd6b7c7262..c234177b31a1 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -457,6 +457,9 @@ static int i10nm_get_hbm_munits(void) mbase = ioremap(base + off, I10NM_HBM_IMC_MMIO_SIZE); if (!mbase) { + pci_dev_put(d->imc[lmc].mdev); + d->imc[lmc].mdev = NULL; + i10nm_printk(KERN_ERR, "Failed to ioremap for hbm mc 0x%llx\n", base + off); return -ENOMEM; @@ -467,6 +470,12 @@ static int i10nm_get_hbm_munits(void) mcmtr = I10NM_GET_MCMTR(&d->imc[lmc], 0); if (!I10NM_IS_HBM_IMC(mcmtr)) { + iounmap(d->imc[lmc].mbase); + d->imc[lmc].mbase = NULL; + d->imc[lmc].hbm_mc = false; + pci_dev_put(d->imc[lmc].mdev); + d->imc[lmc].mdev = NULL; + i10nm_printk(KERN_ERR, "This isn't an hbm mc!\n"); return -ENODEV; } -- Gitee From f84098144e06d0bc4a94b1960d1122cdab4971b8 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 22 Jul 2022 16:33:35 -0700 Subject: [PATCH 2111/2161] EDAC/skx_common: Add ChipSelect ADXL component commit 14646de48bd77947cd6a325b42eecddcec5a35c7 upstream. Each pseudo channel of HBM has its own retry_rd_err_log registers. The bit 0 of ChipSelect ADXL component encodes the pseudo channel number of HBM memory. So add ChipSelect ADXL component to get HBM pseudo channel number. Intel-SIG: commit 14646de48bd7 EDAC/skx_common: Add ChipSelect ADXL component. Backport to add retry_rd_err_log for SPR HBM. 
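In other words, once the BIOS reports the ChipSelect component, the pseudo channel is just its lowest bit; a trivial sketch:

#include <stdio.h>

/* Sketch only: bit 0 of the ADXL ChipSelect value selects the HBM pseudo channel. */
static int hbm_pseudo_channel(int chip_select)
{
	return chip_select & 1;
}

int main(void)
{
	printf("ChipSelect 5 -> pseudo channel %d\n", hbm_pseudo_channel(5));
	return 0;
}
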
Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/all/20220722233338.341567-1-tony.luck@intel.com [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- drivers/edac/skx_common.c | 5 +++++ drivers/edac/skx_common.h | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 338936760c41..cc3b9e586031 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -27,9 +27,11 @@ static const char * const component_names[] = { [INDEX_MEMCTRL] = "MemoryControllerId", [INDEX_CHANNEL] = "ChannelId", [INDEX_DIMM] = "DimmSlotId", + [INDEX_CS] = "ChipSelect", [INDEX_NM_MEMCTRL] = "NmMemoryControllerId", [INDEX_NM_CHANNEL] = "NmChannelId", [INDEX_NM_DIMM] = "NmDimmSlotId", + [INDEX_NM_CS] = "NmChipSelect", }; static int component_indices[ARRAY_SIZE(component_names)]; @@ -139,10 +141,13 @@ static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_me (int)adxl_values[component_indices[INDEX_NM_CHANNEL]] : -1; res->dimm = (adxl_nm_bitmap & BIT_NM_DIMM) ? (int)adxl_values[component_indices[INDEX_NM_DIMM]] : -1; + res->cs = (adxl_nm_bitmap & BIT_NM_CS) ? + (int)adxl_values[component_indices[INDEX_NM_CS]] : -1; } else { res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]]; res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]]; + res->cs = (int)adxl_values[component_indices[INDEX_CS]]; } if (res->imc > NUM_IMC - 1 || res->imc < 0) { diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index c542f1562825..167760fd75ba 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -112,16 +112,19 @@ enum { INDEX_MEMCTRL, INDEX_CHANNEL, INDEX_DIMM, + INDEX_CS, INDEX_NM_FIRST, INDEX_NM_MEMCTRL = INDEX_NM_FIRST, INDEX_NM_CHANNEL, INDEX_NM_DIMM, + INDEX_NM_CS, INDEX_MAX }; #define BIT_NM_MEMCTRL BIT_ULL(INDEX_NM_MEMCTRL) #define BIT_NM_CHANNEL BIT_ULL(INDEX_NM_CHANNEL) #define BIT_NM_DIMM BIT_ULL(INDEX_NM_DIMM) +#define BIT_NM_CS BIT_ULL(INDEX_NM_CS) struct decoded_addr { struct mce *mce; @@ -134,6 +137,7 @@ struct decoded_addr { int sktways; int chanways; int dimm; + int cs; int rank; int channel_rank; u64 rank_address; -- Gitee From c1ca24e2b523aa707d3d6231971eb152b57160bc Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 22 Jul 2022 16:33:36 -0700 Subject: [PATCH 2112/2161] EDAC/i10nm: Retrieve and print retry_rd_err_log registers for HBM commit acd4cf68fefe70138926056e3137c9ea99ef7ebf upstream. An HBM memory channel is divided into two pseudo channels. Each pseudo channel has its own retry_rd_err_log registers. Retrieve and print retry_rd_err_log registers of the HBM pseudo channel if the memory error is from HBM. Intel-SIG: commit acd4cf68fefe EDAC/i10nm: Retrieve and print retry_rd_err_log registers for HBM. Backport to add retry_rd_err_log for SPR HBM. 
Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/all/20220722233338.341567-1-tony.luck@intel.com [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 84 +++++++++++++++++++++++++++++++-------- drivers/edac/skx_common.h | 4 ++ 2 files changed, 71 insertions(+), 17 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index c234177b31a1..45aefc971833 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -79,18 +79,20 @@ static bool mem_cfg_2lm; static u32 offsets_scrub_icx[] = {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8}; static u32 offsets_scrub_spr[] = {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8}; +static u32 offsets_scrub_spr_hbm0[] = {0x2860, 0x2854, 0x2b08, 0x2858, 0x2828, 0x0ed8}; +static u32 offsets_scrub_spr_hbm1[] = {0x2c60, 0x2c54, 0x2f08, 0x2c58, 0x2c28, 0x0fa8}; static u32 offsets_demand_icx[] = {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0}; static u32 offsets_demand_spr[] = {0x22e54, 0x22e60, 0x22f10, 0x22e58, 0x22e5c, 0x20ee0}; +static u32 offsets_demand_spr_hbm0[] = {0x2a54, 0x2a60, 0x2b10, 0x2a58, 0x2a5c, 0x0ee0}; +static u32 offsets_demand_spr_hbm1[] = {0x2e54, 0x2e60, 0x2f10, 0x2e58, 0x2e5c, 0x0fb0}; -static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable) +static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable, + u32 *offsets_scrub, u32 *offsets_demand) { u32 s, d; - if (!imc->mbase) - return; - - s = I10NM_GET_REG32(imc, chan, res_cfg->offsets_scrub[0]); - d = I10NM_GET_REG32(imc, chan, res_cfg->offsets_demand[0]); + s = I10NM_GET_REG32(imc, chan, offsets_scrub[0]); + d = I10NM_GET_REG32(imc, chan, offsets_demand[0]); if (enable) { /* Save default configurations */ @@ -117,21 +119,39 @@ static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable d &= ~RETRY_RD_ERR_LOG_EN; } - I10NM_SET_REG32(imc, chan, res_cfg->offsets_scrub[0], s); - I10NM_SET_REG32(imc, chan, res_cfg->offsets_demand[0], d); + I10NM_SET_REG32(imc, chan, offsets_scrub[0], s); + I10NM_SET_REG32(imc, chan, offsets_demand[0], d); } static void enable_retry_rd_err_log(bool enable) { + struct skx_imc *imc; struct skx_dev *d; int i, j; edac_dbg(2, "\n"); list_for_each_entry(d, i10nm_edac_list, list) - for (i = 0; i < I10NM_NUM_IMC; i++) - for (j = 0; j < I10NM_NUM_CHANNELS; j++) - __enable_retry_rd_err_log(&d->imc[i], j, enable); + for (i = 0; i < I10NM_NUM_IMC; i++) { + imc = &d->imc[i]; + if (!imc->mbase) + continue; + + for (j = 0; j < I10NM_NUM_CHANNELS; j++) { + if (imc->hbm_mc) { + __enable_retry_rd_err_log(imc, j, enable, + res_cfg->offsets_scrub_hbm0, + res_cfg->offsets_demand_hbm0); + __enable_retry_rd_err_log(imc, j, enable, + res_cfg->offsets_scrub_hbm1, + res_cfg->offsets_demand_hbm1); + } else { + __enable_retry_rd_err_log(imc, j, enable, + res_cfg->offsets_scrub, + res_cfg->offsets_demand); + } + } + } } static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, @@ -142,12 +162,24 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, u32 corr0, corr1, corr2, corr3; u64 log2a, log5; u32 *offsets; - int n; + int n, pch; if (!imc->mbase) return; - offsets = scrub_err ? res_cfg->offsets_scrub : res_cfg->offsets_demand; + if (imc->hbm_mc) { + pch = res->cs & 1; + + if (pch) + offsets = scrub_err ? res_cfg->offsets_scrub_hbm1 : + res_cfg->offsets_demand_hbm1; + else + offsets = scrub_err ? 
res_cfg->offsets_scrub_hbm0 : + res_cfg->offsets_demand_hbm0; + } else { + offsets = scrub_err ? res_cfg->offsets_scrub : + res_cfg->offsets_demand; + } log0 = I10NM_GET_REG32(imc, res->channel, offsets[0]); log1 = I10NM_GET_REG32(imc, res->channel, offsets[1]); @@ -165,10 +197,24 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, log0, log1, log2, log3, log4, log5); } - corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18); - corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c); - corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20); - corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24); + if (imc->hbm_mc) { + if (pch) { + corr0 = I10NM_GET_REG32(imc, res->channel, 0x2c18); + corr1 = I10NM_GET_REG32(imc, res->channel, 0x2c1c); + corr2 = I10NM_GET_REG32(imc, res->channel, 0x2c20); + corr3 = I10NM_GET_REG32(imc, res->channel, 0x2c24); + } else { + corr0 = I10NM_GET_REG32(imc, res->channel, 0x2818); + corr1 = I10NM_GET_REG32(imc, res->channel, 0x281c); + corr2 = I10NM_GET_REG32(imc, res->channel, 0x2820); + corr3 = I10NM_GET_REG32(imc, res->channel, 0x2824); + } + } else { + corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18); + corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c); + corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20); + corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24); + } if (len - n > 0) snprintf(msg + n, len - n, @@ -519,7 +565,11 @@ static struct res_config spr_cfg = { .sad_all_devfn = PCI_DEVFN(10, 0), .sad_all_offset = 0x300, .offsets_scrub = offsets_scrub_spr, + .offsets_scrub_hbm0 = offsets_scrub_spr_hbm0, + .offsets_scrub_hbm1 = offsets_scrub_spr_hbm1, .offsets_demand = offsets_demand_spr, + .offsets_demand_hbm0 = offsets_demand_spr_hbm0, + .offsets_demand_hbm1 = offsets_demand_spr_hbm1, }; static const struct x86_cpu_id i10nm_cpuids[] = { diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 167760fd75ba..455e652c0e46 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -164,7 +164,11 @@ struct res_config { int sad_all_offset; /* Offsets of retry_rd_err_log registers */ u32 *offsets_scrub; + u32 *offsets_scrub_hbm0; + u32 *offsets_scrub_hbm1; u32 *offsets_demand; + u32 *offsets_demand_hbm0; + u32 *offsets_demand_hbm1; }; typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci, -- Gitee From 44d698294b93affb81e42517b05d43ded5ad6980 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 22 Jul 2022 16:33:37 -0700 Subject: [PATCH 2113/2161] EDAC/i10nm: Print an extra register set of retry_rd_err_log commit d5f5e49953f68bb7b15afd6e32ad176b987c6525 upstream. Sapphire Rapids server adds an extra register set for logging more retry_rd_err_log data. So add code to print the extra register set. Intel-SIG: commit d5f5e49953f6 EDAC/i10nm: Print an extra register set of retry_rd_err_log. Backport to add retry_rd_err_log for SPR HBM. 
Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/all/20220722233338.341567-1-tony.luck@intel.com [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 81 +++++++++++++++++++++++++++++++++------ drivers/edac/skx_common.h | 2 + 2 files changed, 72 insertions(+), 11 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 45aefc971833..5c5f3b54f689 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -83,26 +83,38 @@ static u32 offsets_scrub_spr_hbm0[] = {0x2860, 0x2854, 0x2b08, 0x2858, 0x2828, static u32 offsets_scrub_spr_hbm1[] = {0x2c60, 0x2c54, 0x2f08, 0x2c58, 0x2c28, 0x0fa8}; static u32 offsets_demand_icx[] = {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0}; static u32 offsets_demand_spr[] = {0x22e54, 0x22e60, 0x22f10, 0x22e58, 0x22e5c, 0x20ee0}; +static u32 offsets_demand2_spr[] = {0x22c70, 0x22d80, 0x22f18, 0x22d58, 0x22c64, 0x20f10}; static u32 offsets_demand_spr_hbm0[] = {0x2a54, 0x2a60, 0x2b10, 0x2a58, 0x2a5c, 0x0ee0}; static u32 offsets_demand_spr_hbm1[] = {0x2e54, 0x2e60, 0x2f10, 0x2e58, 0x2e5c, 0x0fb0}; static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable, - u32 *offsets_scrub, u32 *offsets_demand) + u32 *offsets_scrub, u32 *offsets_demand, + u32 *offsets_demand2) { - u32 s, d; + u32 s, d, d2; s = I10NM_GET_REG32(imc, chan, offsets_scrub[0]); d = I10NM_GET_REG32(imc, chan, offsets_demand[0]); + if (offsets_demand2) + d2 = I10NM_GET_REG32(imc, chan, offsets_demand2[0]); if (enable) { /* Save default configurations */ imc->chan[chan].retry_rd_err_log_s = s; imc->chan[chan].retry_rd_err_log_d = d; + if (offsets_demand2) + imc->chan[chan].retry_rd_err_log_d2 = d2; s &= ~RETRY_RD_ERR_LOG_NOOVER_UC; s |= RETRY_RD_ERR_LOG_EN; d &= ~RETRY_RD_ERR_LOG_NOOVER_UC; d |= RETRY_RD_ERR_LOG_EN; + + if (offsets_demand2) { + d2 &= ~RETRY_RD_ERR_LOG_UC; + d2 |= RETRY_RD_ERR_LOG_NOOVER; + d2 |= RETRY_RD_ERR_LOG_EN; + } } else { /* Restore default configurations */ if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_UC) @@ -117,10 +129,21 @@ static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable d |= RETRY_RD_ERR_LOG_NOOVER; if (!(imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_EN)) d &= ~RETRY_RD_ERR_LOG_EN; + + if (offsets_demand2) { + if (imc->chan[chan].retry_rd_err_log_d2 & RETRY_RD_ERR_LOG_UC) + d2 |= RETRY_RD_ERR_LOG_UC; + if (!(imc->chan[chan].retry_rd_err_log_d2 & RETRY_RD_ERR_LOG_NOOVER)) + d2 &= ~RETRY_RD_ERR_LOG_NOOVER; + if (!(imc->chan[chan].retry_rd_err_log_d2 & RETRY_RD_ERR_LOG_EN)) + d2 &= ~RETRY_RD_ERR_LOG_EN; + } } I10NM_SET_REG32(imc, chan, offsets_scrub[0], s); I10NM_SET_REG32(imc, chan, offsets_demand[0], d); + if (offsets_demand2) + I10NM_SET_REG32(imc, chan, offsets_demand2[0], d2); } static void enable_retry_rd_err_log(bool enable) @@ -141,14 +164,17 @@ static void enable_retry_rd_err_log(bool enable) if (imc->hbm_mc) { __enable_retry_rd_err_log(imc, j, enable, res_cfg->offsets_scrub_hbm0, - res_cfg->offsets_demand_hbm0); + res_cfg->offsets_demand_hbm0, + NULL); __enable_retry_rd_err_log(imc, j, enable, res_cfg->offsets_scrub_hbm1, - res_cfg->offsets_demand_hbm1); + res_cfg->offsets_demand_hbm1, + NULL); } else { __enable_retry_rd_err_log(imc, j, enable, res_cfg->offsets_scrub, - res_cfg->offsets_demand); + res_cfg->offsets_demand, + res_cfg->offsets_demand2); } } } @@ -160,7 +186,10 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, struct skx_imc *imc = 
&res->dev->imc[res->imc]; u32 log0, log1, log2, log3, log4; u32 corr0, corr1, corr2, corr3; + u32 lxg0, lxg1, lxg3, lxg4; + u32 *xffsets = NULL; u64 log2a, log5; + u64 lxg2a, lxg5; u32 *offsets; int n, pch; @@ -177,8 +206,12 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, offsets = scrub_err ? res_cfg->offsets_scrub_hbm0 : res_cfg->offsets_demand_hbm0; } else { - offsets = scrub_err ? res_cfg->offsets_scrub : - res_cfg->offsets_demand; + if (scrub_err) { + offsets = res_cfg->offsets_scrub; + } else { + offsets = res_cfg->offsets_demand; + xffsets = res_cfg->offsets_demand2; + } } log0 = I10NM_GET_REG32(imc, res->channel, offsets[0]); @@ -187,10 +220,28 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, log4 = I10NM_GET_REG32(imc, res->channel, offsets[4]); log5 = I10NM_GET_REG64(imc, res->channel, offsets[5]); + if (xffsets) { + lxg0 = I10NM_GET_REG32(imc, res->channel, xffsets[0]); + lxg1 = I10NM_GET_REG32(imc, res->channel, xffsets[1]); + lxg3 = I10NM_GET_REG32(imc, res->channel, xffsets[3]); + lxg4 = I10NM_GET_REG32(imc, res->channel, xffsets[4]); + lxg5 = I10NM_GET_REG64(imc, res->channel, xffsets[5]); + } + if (res_cfg->type == SPR) { log2a = I10NM_GET_REG64(imc, res->channel, offsets[2]); - n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.16llx %.8x %.8x %.16llx]", + n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.16llx %.8x %.8x %.16llx", log0, log1, log2a, log3, log4, log5); + + if (len - n > 0) { + if (xffsets) { + lxg2a = I10NM_GET_REG64(imc, res->channel, xffsets[2]); + n += snprintf(msg + n, len - n, " %.8x %.8x %.16llx %.8x %.8x %.16llx]", + lxg0, lxg1, lxg2a, lxg3, lxg4, lxg5); + } else { + n += snprintf(msg + n, len - n, "]"); + } + } } else { log2 = I10NM_GET_REG32(imc, res->channel, offsets[2]); n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.8x %.8x %.8x %.16llx]", @@ -225,9 +276,16 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, corr3 & 0xffff, corr3 >> 16); /* Clear status bits */ - if (retry_rd_err_log == 2 && (log0 & RETRY_RD_ERR_LOG_OVER_UC_V)) { - log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V; - I10NM_SET_REG32(imc, res->channel, offsets[0], log0); + if (retry_rd_err_log == 2) { + if (log0 & RETRY_RD_ERR_LOG_OVER_UC_V) { + log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V; + I10NM_SET_REG32(imc, res->channel, offsets[0], log0); + } + + if (xffsets && (lxg0 & RETRY_RD_ERR_LOG_OVER_UC_V)) { + lxg0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V; + I10NM_SET_REG32(imc, res->channel, xffsets[0], lxg0); + } } } @@ -568,6 +626,7 @@ static struct res_config spr_cfg = { .offsets_scrub_hbm0 = offsets_scrub_spr_hbm0, .offsets_scrub_hbm1 = offsets_scrub_spr_hbm1, .offsets_demand = offsets_demand_spr, + .offsets_demand2 = offsets_demand2_spr, .offsets_demand_hbm0 = offsets_demand_spr_hbm0, .offsets_demand_hbm1 = offsets_demand_spr_hbm1, }; diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 455e652c0e46..0cbadd3d2cd3 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -86,6 +86,7 @@ struct skx_dev { struct pci_dev *edev; u32 retry_rd_err_log_s; u32 retry_rd_err_log_d; + u32 retry_rd_err_log_d2; struct skx_dimm { u8 close_pg; u8 bank_xor_enable; @@ -167,6 +168,7 @@ struct res_config { u32 *offsets_scrub_hbm0; u32 *offsets_scrub_hbm1; u32 *offsets_demand; + u32 *offsets_demand2; u32 *offsets_demand_hbm0; u32 *offsets_demand_hbm1; }; -- Gitee From 2183a0e782e9f10549420f1e125284d969fcfa96 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 13 Jan 2023 11:27:58 +0800 Subject: [PATCH 
2114/2161] EDAC/skx_common: Enable EDAC support for the "near" memory commit 6e8746cb735166eaf3ceb086b31bb0431f5e3532 upstream. The current {skx,i10nm}_edac miss the EDAC support to decode errors from the 1st level memory (the fast "near" memory as cache) of the 2-level memory system. Introduce a helper function skx_error_in_mem() to check whether errors are from memory at the beginning of skx_mce_check_error(). As long as the errors are from memory (either the 1-level memory system or the 2-level memory system), decode the errors. Intel-SIG: commit 258bf6c34817 EDAC/skx_common: Enable EDAC support for the "near" memory Backport to add retry_rd_err_log for SPR HBM 2LM. Reported-and-tested-by: Youquan Song Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/all/20230113032802.41752-1-qiuxu.zhuo@intel.com Signed-off-by: Youquan Song --- drivers/edac/skx_common.c | 18 ++++++++++++------ drivers/edac/skx_common.h | 24 ++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index cc3b9e586031..296f8b68ded2 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -632,12 +632,18 @@ static bool skx_error_in_1st_level_mem(const struct mce *m) if (!skx_mem_cfg_2lm) return false; - errcode = GET_BITFIELD(m->status, 0, 15); + errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK; - if ((errcode & 0xef80) != 0x280) - return false; + return errcode == MCACOD_EXT_MEM_ERR; +} - return true; +static bool skx_error_in_mem(const struct mce *m) +{ + u32 errcode; + + errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK; + + return (errcode == MCACOD_MEM_CTL_ERR || errcode == MCACOD_EXT_MEM_ERR); } int skx_mce_check_error(struct notifier_block *nb, unsigned long val, @@ -654,8 +660,8 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, if (mce->kflags & MCE_HANDLED_CEC) return NOTIFY_DONE; - /* ignore unless this is memory related with an address */ - if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV)) + /* Ignore unless this is memory related with an address */ + if (!skx_error_in_mem(mce) || !(mce->status & MCI_STATUS_ADDRV)) return NOTIFY_DONE; memset(&res, 0, sizeof(res)); diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 0cbadd3d2cd3..312032657264 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -56,6 +56,30 @@ #define MCI_MISC_ECC_MODE(m) (((m) >> 59) & 15) #define MCI_MISC_ECC_DDRT 8 /* read from DDRT */ +/* + * According to Intel Architecture spec vol 3B, + * Table 15-10 "IA32_MCi_Status [15:0] Compound Error Code Encoding" + * memory errors should fit one of these masks: + * 000f 0000 1mmm cccc (binary) + * 000f 0010 1mmm cccc (binary) [RAM used as cache] + * where: + * f = Correction Report Filtering Bit. If 1, subsequent errors + * won't be shown + * mmm = error type + * cccc = channel + */ +#define MCACOD_MEM_ERR_MASK 0xef80 +/* + * Errors from either the memory of the 1-level memory system or the + * 2nd level memory (the slow "far" memory) of the 2-level memory system. + */ +#define MCACOD_MEM_CTL_ERR 0x80 +/* + * Errors from the 1st level memory (the fast "near" memory as cache) + * of the 2-level memory system. 
+ */ +#define MCACOD_EXT_MEM_ERR 0x280 + /* * Each cpu socket contains some pci devices that provide global * information, and also some that are local to each of the two -- Gitee From 7157bba17da06e6add21943a88b202f7bdf6774a Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Mon, 6 Feb 2023 09:14:23 +0800 Subject: [PATCH 2115/2161] EDAC/i10nm: Add driver decoder for Sapphire Rapids server commit 221aa03fb4e0740b1f4d9ecbf3d9af9fd9f7f85e upstream. Intel SDM (December 2022) vol3B 17.13.2 contains IMC MC error codes for Sapphire Rapids. Current i10nm_edac only supports firmware decoder (ACPI DSM methods) for Sapphire Rapids. So add the driver decoder (decoding DDR memory errors via extracting error information from the IMC MC error codes) for Sapphire Rapids for better decoding performance. Intel-SIG: commit 221aa03fb4e0 EDAC/i10nm: Add driver decoder for Sapphire Rapids server Backport to decode DDR error by MCA bank registers in replace of firmware. Co-developed-by: Qiuxu Zhuo Signed-off-by: Qiuxu Zhuo Signed-off-by: Youquan Song Signed-off-by: Tony Luck [ Youquan Song: amend commit log ] Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 102 ++++++++++++++++++++++++++------------ 1 file changed, 69 insertions(+), 33 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 5c5f3b54f689..07e443b69d60 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -338,20 +338,39 @@ static bool i10nm_check_2lm(struct res_config *cfg) } /* - * Check whether the error comes from DDRT by ICX/Tremont model specific error code. - * Refer to SDM vol3B 16.11.3 Intel IMC MC error codes for IA32_MCi_STATUS. + * Check whether the error comes from DDRT by ICX/Tremont/SPR model specific error code. + * Refer to SDM vol3B 17.11.3/17.13.2 Intel IMC MC error codes for IA32_MCi_STATUS. */ static bool i10nm_mscod_is_ddrt(u32 mscod) { - switch (mscod) { - case 0x0106: case 0x0107: - case 0x0800: case 0x0804: - case 0x0806 ... 0x0808: - case 0x080a ... 0x080e: - case 0x0810: case 0x0811: - case 0x0816: case 0x081e: - case 0x081f: - return true; + switch (res_cfg->type) { + case I10NM: + switch (mscod) { + case 0x0106: case 0x0107: + case 0x0800: case 0x0804: + case 0x0806 ... 0x0808: + case 0x080a ... 0x080e: + case 0x0810: case 0x0811: + case 0x0816: case 0x081e: + case 0x081f: + return true; + } + + break; + case SPR: + switch (mscod) { + case 0x0800: case 0x0804: + case 0x0806 ... 0x0808: + case 0x080a ... 
0x080e: + case 0x0810: case 0x0811: + case 0x0816: case 0x081e: + case 0x081f: + return true; + } + + break; + default: + return false; } return false; @@ -359,6 +378,7 @@ static bool i10nm_mscod_is_ddrt(u32 mscod) static bool i10nm_mc_decode_available(struct mce *mce) { +#define ICX_IMCx_CHy 0x06666000 u8 bank; if (!decoding_via_mca || mem_cfg_2lm) @@ -372,21 +392,26 @@ static bool i10nm_mc_decode_available(struct mce *mce) switch (res_cfg->type) { case I10NM: - if (bank < 13 || bank > 26) - return false; - - /* DDRT errors can't be decoded from MCA bank registers */ - if (MCI_MISC_ECC_MODE(mce->misc) == MCI_MISC_ECC_DDRT) + /* Check whether the bank is one of {13,14,17,18,21,22,25,26} */ + if (!(ICX_IMCx_CHy & (1 << bank))) return false; - - if (i10nm_mscod_is_ddrt(MCI_STATUS_MSCOD(mce->status))) + break; + case SPR: + if (bank < 13 || bank > 20) return false; - - /* Check whether one of {13,14,17,18,21,22,25,26} */ - return ((bank - 13) & BIT(1)) == 0; + break; default: return false; } + + /* DDRT errors can't be decoded from MCA bank registers */ + if (MCI_MISC_ECC_MODE(mce->misc) == MCI_MISC_ECC_DDRT) + return false; + + if (i10nm_mscod_is_ddrt(MCI_STATUS_MSCOD(mce->status))) + return false; + + return true; } static bool i10nm_mc_decode(struct decoded_addr *res) @@ -408,9 +433,29 @@ static bool i10nm_mc_decode(struct decoded_addr *res) switch (res_cfg->type) { case I10NM: - bank = m->bank - 13; - res->imc = bank / 4; - res->channel = bank % 2; + bank = m->bank - 13; + res->imc = bank / 4; + res->channel = bank % 2; + res->column = GET_BITFIELD(m->misc, 9, 18) << 2; + res->row = GET_BITFIELD(m->misc, 19, 39); + res->bank_group = GET_BITFIELD(m->misc, 40, 41); + res->bank_address = GET_BITFIELD(m->misc, 42, 43); + res->bank_group |= GET_BITFIELD(m->misc, 44, 44) << 2; + res->rank = GET_BITFIELD(m->misc, 56, 58); + res->dimm = res->rank >> 2; + res->rank = res->rank % 4; + break; + case SPR: + bank = m->bank - 13; + res->imc = bank / 2; + res->channel = bank % 2; + res->column = GET_BITFIELD(m->misc, 9, 18) << 2; + res->row = GET_BITFIELD(m->misc, 19, 36); + res->bank_group = GET_BITFIELD(m->misc, 37, 38); + res->bank_address = GET_BITFIELD(m->misc, 39, 40); + res->bank_group |= GET_BITFIELD(m->misc, 41, 41) << 2; + res->rank = GET_BITFIELD(m->misc, 57, 57); + res->dimm = GET_BITFIELD(m->misc, 58, 58); break; default: return false; @@ -422,15 +467,6 @@ static bool i10nm_mc_decode(struct decoded_addr *res) return false; } - res->column = GET_BITFIELD(m->misc, 9, 18) << 2; - res->row = GET_BITFIELD(m->misc, 19, 39); - res->bank_group = GET_BITFIELD(m->misc, 40, 41); - res->bank_address = GET_BITFIELD(m->misc, 42, 43); - res->bank_group |= GET_BITFIELD(m->misc, 44, 44) << 2; - res->rank = GET_BITFIELD(m->misc, 56, 58); - res->dimm = res->rank >> 2; - res->rank = res->rank % 4; - return true; } -- Gitee From 25608f0643152314397c376d6ddc9789c4dbd786 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 20 Nov 2020 16:10:23 -0800 Subject: [PATCH 2116/2161] PCI/ERR: Bind RCEC devices to the Root Port driver commit c9d659b60770db94b898f94947192a94bbf95c5c upstream. If a Root Complex Integrated Endpoint (RCiEP) is implemented, it may signal errors through a Root Complex Event Collector (RCEC). Each RCiEP must be associated with no more than one RCEC. 
For an RCEC (which is technically not a Bridge), error messages "received" from associated RCiEPs must be enabled for "transmission" in order to cause a System Error via the Root Control register or (when the Advanced Error Reporting Capability is present) reporting via the Root Error Command register and logging in the Root Error Status register and Error Source Identification register. Given the commonality with Root Ports and the need to also support AER and PME services for RCECs, extend the Root Port driver to support RCEC devices by adding the RCEC Class ID to the driver structure. Intel-SIG: commit c9d659b60770 PCI/ERR: Bind RCEC devices to the Root Port driver. Backport for enable PCIe/RCEC. Co-developed-by: Sean V Kelley Link: https://lore.kernel.org/r/20201121001036.8560-3-sean.v.kelley@intel.com Tested-by: Jonathan Cameron # non-native/no RCEC Signed-off-by: Sean V Kelley Signed-off-by: Qiuxu Zhuo Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Reviewed-by: Kuppuswamy Sathyanarayanan [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/pci/pcie/portdrv_pci.c | 5 ++++- include/linux/pci_ids.h | 1 + include/uapi/linux/pci_regs.h | 7 +++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index 160d67c59310..46e360faf35d 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -106,7 +106,8 @@ static int pcie_portdrv_probe(struct pci_dev *dev, if (!pci_is_pcie(dev) || ((pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) && (pci_pcie_type(dev) != PCI_EXP_TYPE_UPSTREAM) && - (pci_pcie_type(dev) != PCI_EXP_TYPE_DOWNSTREAM))) + (pci_pcie_type(dev) != PCI_EXP_TYPE_DOWNSTREAM) && + (pci_pcie_type(dev) != PCI_EXP_TYPE_RC_EC))) return -ENODEV; status = pcie_port_device_register(dev); @@ -195,6 +196,8 @@ static const struct pci_device_id port_pci_ids[] = { { PCI_DEVICE_CLASS(((PCI_CLASS_BRIDGE_PCI << 8) | 0x00), ~0) }, /* subtractive decode PCI-to-PCI bridge, class type is 060401h */ { PCI_DEVICE_CLASS(((PCI_CLASS_BRIDGE_PCI << 8) | 0x01), ~0) }, + /* handle any Root Complex Event Collector */ + { PCI_DEVICE_CLASS(((PCI_CLASS_SYSTEM_RCEC << 8) | 0x00), ~0) }, { }, }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index f3166b1425ca..e49aa4f5b620 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -81,6 +81,7 @@ #define PCI_CLASS_SYSTEM_RTC 0x0803 #define PCI_CLASS_SYSTEM_PCI_HOTPLUG 0x0804 #define PCI_CLASS_SYSTEM_SDHCI 0x0805 +#define PCI_CLASS_SYSTEM_RCEC 0x0807 #define PCI_CLASS_SYSTEM_OTHER 0x0880 #define PCI_BASE_CLASS_INPUT 0x09 diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 4d02ca9da6eb..725e671fe7f2 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -824,6 +824,13 @@ #define PCI_PWR_CAP_BUDGET(x) ((x) & 1) /* Included in system budget */ #define PCI_EXT_CAP_PWR_SIZEOF 16 +/* Root Complex Event Collector Endpoint Association */ +#define PCI_RCEC_RCIEP_BITMAP 4 /* Associated Bitmap for RCiEPs */ +#define PCI_RCEC_BUSN 8 /* RCEC Associated Bus Numbers */ +#define PCI_RCEC_BUSN_REG_VER 0x02 /* Least version with BUSN present */ +#define PCI_RCEC_BUSN_NEXT(x) (((x) >> 8) & 0xff) +#define PCI_RCEC_BUSN_LAST(x) (((x) >> 16) & 0xff) + /* Vendor-Specific (VSEC, PCI_EXT_CAP_ID_VNDR) */ #define PCI_VNDR_HEADER 4 /* Vendor-Specific Header */ #define PCI_VNDR_HEADER_ID(x) ((x) & 0xffff) -- Gitee From e84119c0a750f306e29a37bb942b42992e1689f8 Mon Sep 17 00:00:00 2001 
From: Sean V Kelley Date: Fri, 20 Nov 2020 16:10:24 -0800 Subject: [PATCH 2117/2161] PCI/ERR: Cache RCEC EA Capability offset in pci_init_capabilities() commit 90655631988f8f501529e6de5f13614389717ead upstream. Extend support for Root Complex Event Collectors by decoding and caching the RCEC Endpoint Association Extended Capabilities when enumerating. Use that cached information for later error source reporting. See PCIe r5.0, sec 7.9.10. Intel-SIG: commit 90655631988f PCI/ERR: Cache RCEC EA Capability offset in pci_init_capabilities(). Backport for enable PCIe/RCEC. Co-developed-by: Qiuxu Zhuo Link: https://lore.kernel.org/r/20201121001036.8560-4-sean.v.kelley@intel.com Tested-by: Jonathan Cameron # non-native/no RCEC Signed-off-by: Qiuxu Zhuo Signed-off-by: Sean V Kelley Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/pci/pci.h | 17 +++++++++++ drivers/pci/pcie/Makefile | 2 +- drivers/pci/pcie/rcec.c | 59 +++++++++++++++++++++++++++++++++++++++ drivers/pci/probe.c | 2 ++ include/linux/pci.h | 4 +++ 5 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 drivers/pci/pcie/rcec.c diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 16645c124e9b..5b6cffdeaa4c 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -454,6 +454,15 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info); void aer_print_error(struct pci_dev *dev, struct aer_err_info *info); #endif /* CONFIG_PCIEAER */ +#ifdef CONFIG_PCIEPORTBUS +/* Cached RCEC Endpoint Association */ +struct rcec_ea { + u8 nextbusn; + u8 lastbusn; + u32 bitmap; +}; +#endif + #ifdef CONFIG_PCIE_DPC void pci_save_dpc_state(struct pci_dev *dev); void pci_restore_dpc_state(struct pci_dev *dev); @@ -468,6 +477,14 @@ static inline void pci_dpc_init(struct pci_dev *pdev) {} static inline bool pci_dpc_recovered(struct pci_dev *pdev) { return false; } #endif +#ifdef CONFIG_PCIEPORTBUS +void pci_rcec_init(struct pci_dev *dev); +void pci_rcec_exit(struct pci_dev *dev); +#else +static inline void pci_rcec_init(struct pci_dev *dev) {} +static inline void pci_rcec_exit(struct pci_dev *dev) {} +#endif + #ifdef CONFIG_PCI_ATS /* Address Translation Service */ void pci_ats_init(struct pci_dev *dev); diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile index 68da9280ff11..d9697892fa3e 100644 --- a/drivers/pci/pcie/Makefile +++ b/drivers/pci/pcie/Makefile @@ -2,7 +2,7 @@ # # Makefile for PCI Express features and port driver -pcieportdrv-y := portdrv_core.o portdrv_pci.o err.o +pcieportdrv-y := portdrv_core.o portdrv_pci.o err.o rcec.o obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o diff --git a/drivers/pci/pcie/rcec.c b/drivers/pci/pcie/rcec.c new file mode 100644 index 000000000000..038e9d706d5f --- /dev/null +++ b/drivers/pci/pcie/rcec.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Root Complex Event Collector Support + * + * Authors: + * Sean V Kelley + * Qiuxu Zhuo + * + * Copyright (C) 2020 Intel Corp. 
+ */ + +#include +#include +#include + +#include "../pci.h" + +void pci_rcec_init(struct pci_dev *dev) +{ + struct rcec_ea *rcec_ea; + u32 rcec, hdr, busn; + u8 ver; + + /* Only for Root Complex Event Collectors */ + if (pci_pcie_type(dev) != PCI_EXP_TYPE_RC_EC) + return; + + rcec = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_RCEC); + if (!rcec) + return; + + rcec_ea = kzalloc(sizeof(*rcec_ea), GFP_KERNEL); + if (!rcec_ea) + return; + + pci_read_config_dword(dev, rcec + PCI_RCEC_RCIEP_BITMAP, + &rcec_ea->bitmap); + + /* Check whether RCEC BUSN register is present */ + pci_read_config_dword(dev, rcec, &hdr); + ver = PCI_EXT_CAP_VER(hdr); + if (ver >= PCI_RCEC_BUSN_REG_VER) { + pci_read_config_dword(dev, rcec + PCI_RCEC_BUSN, &busn); + rcec_ea->nextbusn = PCI_RCEC_BUSN_NEXT(busn); + rcec_ea->lastbusn = PCI_RCEC_BUSN_LAST(busn); + } else { + /* Avoid later ver check by setting nextbusn */ + rcec_ea->nextbusn = 0xff; + rcec_ea->lastbusn = 0x00; + } + + dev->rcec_ea = rcec_ea; +} + +void pci_rcec_exit(struct pci_dev *dev) +{ + kfree(dev->rcec_ea); + dev->rcec_ea = NULL; +} diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 8a96b027ab86..f6ab3847c051 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2139,6 +2139,7 @@ static void pci_configure_device(struct pci_dev *dev) static void pci_release_capabilities(struct pci_dev *dev) { pci_aer_exit(dev); + pci_rcec_exit(dev); pci_vpd_release(dev); pci_iov_release(dev); pci_free_cap_save_buffers(dev); @@ -2355,6 +2356,7 @@ static void pci_init_capabilities(struct pci_dev *dev) pci_ptm_init(dev); /* Precision Time Measurement */ pci_aer_init(dev); /* Advanced Error Reporting */ pci_dpc_init(dev); /* Downstream Port Containment */ + pci_rcec_init(dev); /* Root Complex Event Collector */ pcie_report_downtraining(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index af7e9277acf2..f01b62c93dde 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -287,6 +287,7 @@ struct pci_vpd; struct pci_sriov; struct pci_ats; struct pci_p2pdma; +struct rcec_ea; /* The pci_dev structure describes PCI devices */ struct pci_dev { @@ -309,6 +310,9 @@ struct pci_dev { #ifdef CONFIG_PCIEAER u16 aer_cap; /* AER capability offset */ struct aer_stats *aer_stats; /* AER stats for this device */ +#endif +#ifdef CONFIG_PCIEPORTBUS + struct rcec_ea *rcec_ea; /* RCEC cached endpoint association */ #endif u8 pcie_cap; /* PCIe capability offset */ u8 msi_cap; /* MSI capability offset */ -- Gitee From fcabaaad4497071bbb53d6ba536d7b23d37d654a Mon Sep 17 00:00:00 2001 From: Sean V Kelley Date: Fri, 20 Nov 2020 16:10:25 -0800 Subject: [PATCH 2118/2161] PCI/ERR: Rename reset_link() to reset_subordinates() commit 8f1bbfbc3596d401b60d1562b27ec28c2724f60d upstream. reset_link() appears to be misnamed. The point is to reset any devices below a given bridge, so rename it to reset_subordinates() to make it clear that we are passing a bridge with the intent to reset the devices below it. Intel-SIG: commit 8f1bbfbc3596 PCI/ERR: Rename reset_link() to reset_subordinates(). Backport for enable PCIe/RCEC. 
Link: https://lore.kernel.org/r/20201121001036.8560-5-sean.v.kelley@intel.com Tested-by: Jonathan Cameron # non-native/no RCEC Signed-off-by: Sean V Kelley Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Jonathan Cameron [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/pci/pci.h | 4 ++-- drivers/pci/pcie/err.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 5b6cffdeaa4c..b2bddbd58501 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -576,8 +576,8 @@ static inline int pci_dev_specific_disable_acs_redir(struct pci_dev *dev) /* PCI error reporting and recovery */ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, - enum pci_channel_state state, - pci_ers_result_t (*reset_link)(struct pci_dev *pdev)); + pci_channel_state_t state, + pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev)); bool pcie_wait_for_link(struct pci_dev *pdev, bool active); #ifdef CONFIG_PCIEASPM diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index fb86b1803cd1..a0cb82350548 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -147,8 +147,8 @@ static int report_resume(struct pci_dev *dev, void *data) } pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, - enum pci_channel_state state, - pci_ers_result_t (*reset_link)(struct pci_dev *pdev)) + pci_channel_state_t state, + pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev)) { pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER; struct pci_bus *bus; @@ -165,8 +165,8 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, pci_dbg(dev, "broadcast error_detected message\n"); if (state == pci_channel_io_frozen) { pci_walk_bus(bus, report_frozen_detected, &status); - if (reset_link(dev) != PCI_ERS_RESULT_RECOVERED) { - pci_warn(dev, "link reset failed\n"); + if (reset_subordinates(dev) != PCI_ERS_RESULT_RECOVERED) { + pci_warn(dev, "subordinate device reset failed\n"); goto failed; } } else { -- Gitee From ca0a99699fb67699dbb5bf09e2c4ad1fb05f8cce Mon Sep 17 00:00:00 2001 From: Sean V Kelley Date: Fri, 20 Nov 2020 16:10:26 -0800 Subject: [PATCH 2119/2161] PCI/ERR: Simplify by using pci_upstream_bridge() commit 5d69dcc9f839bd2d5cac7a098712f52149e1673f upstream. Use pci_upstream_bridge() in place of dev->bus->self. No functional change intended. Intel-SIG: commit 5d69dcc9f839 PCI/ERR: Simplify by using pci_upstream_bridge(). Backport for enable PCIe/RCEC. 
Link: https://lore.kernel.org/r/20201121001036.8560-6-sean.v.kelley@intel.com Tested-by: Jonathan Cameron # non-native/no RCEC Signed-off-by: Sean V Kelley Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Jonathan Cameron [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/pci/pcie/err.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index a0cb82350548..bae1e3b007ab 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -159,7 +159,7 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, */ if (!(pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM)) - dev = dev->bus->self; + dev = pci_upstream_bridge(dev); bus = dev->subordinate; pci_dbg(dev, "broadcast error_detected message\n"); -- Gitee From bbe571b860f678278d41a9fadd0fcca0f004a802 Mon Sep 17 00:00:00 2001 From: Sean V Kelley Date: Fri, 20 Nov 2020 16:10:27 -0800 Subject: [PATCH 2120/2161] PCI/ERR: Simplify by computing pci_pcie_type() once commit 480ef7cb9fcebda7b28cbed4f6cdcf0a02f4a6ca upstream. Instead of calling pci_pcie_type(dev) twice, call it once and save the result. No functional change intended. Intel-SIG: commit 480ef7cb9fce PCI/ERR: Simplify by computing pci_pcie_type() once. Backport for enable PCIe/RCEC. Link: https://lore.kernel.org/r/20201121001036.8560-7-sean.v.kelley@intel.com Tested-by: Jonathan Cameron # non-native/no RCEC Signed-off-by: Sean V Kelley Signed-off-by: Bjorn Helgaas Acked-by: Jonathan Cameron [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/pci/pcie/aer.c | 5 +++-- drivers/pci/pcie/err.c | 5 +++-- drivers/pci/pcie/portdrv_pci.c | 9 +++++---- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 6325f8399d41..ff096d10a58e 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1031,6 +1031,7 @@ EXPORT_SYMBOL_GPL(aer_recover_queue); int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) { int pos, temp; + int type = pci_pcie_type(dev); /* Must reset in this function */ info->status = 0; @@ -1049,8 +1050,8 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) &info->mask); if (!(info->status & ~info->mask)) return 0; - } else if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || - pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM || + } else if (type == PCI_EXP_TYPE_ROOT_PORT || + type == PCI_EXP_TYPE_DOWNSTREAM || info->severity == AER_NONFATAL) { /* Link is still healthy for IO reads */ diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index bae1e3b007ab..a72fee4d2600 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -150,6 +150,7 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, pci_channel_state_t state, pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev)) { + int type = pci_pcie_type(dev); pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER; struct pci_bus *bus; @@ -157,8 +158,8 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, * Error recovery runs on all subordinates of the first downstream port. * If the downstream port detected the error, it is cleared at the end. 
*/ - if (!(pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || - pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM)) + if (!(type == PCI_EXP_TYPE_ROOT_PORT || + type == PCI_EXP_TYPE_DOWNSTREAM)) dev = pci_upstream_bridge(dev); bus = dev->subordinate; diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index 46e360faf35d..82e1866d61d2 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -101,13 +101,14 @@ static const struct dev_pm_ops pcie_portdrv_pm_ops = { static int pcie_portdrv_probe(struct pci_dev *dev, const struct pci_device_id *id) { + int type = pci_pcie_type(dev); int status; if (!pci_is_pcie(dev) || - ((pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) && - (pci_pcie_type(dev) != PCI_EXP_TYPE_UPSTREAM) && - (pci_pcie_type(dev) != PCI_EXP_TYPE_DOWNSTREAM) && - (pci_pcie_type(dev) != PCI_EXP_TYPE_RC_EC))) + ((type != PCI_EXP_TYPE_ROOT_PORT) && + (type != PCI_EXP_TYPE_UPSTREAM) && + (type != PCI_EXP_TYPE_DOWNSTREAM) && + (type != PCI_EXP_TYPE_RC_EC))) return -ENODEV; status = pcie_port_device_register(dev); -- Gitee From 0f6be2b46008f965fbf20943d1e01178e5a7d700 Mon Sep 17 00:00:00 2001 From: Sean V Kelley Date: Fri, 20 Nov 2020 16:10:28 -0800 Subject: [PATCH 2121/2161] PCI/ERR: Use "bridge" for clarity in pcie_do_recovery() commit 0791721d800790e6e533bd8467df67f0dc4f2fec upstream. pcie_do_recovery() may be called with "dev" being either a bridge (Root Port or Switch Downstream Port) or an Endpoint. The bulk of the function deals with the bridge, so if we start with an Endpoint, we reset "dev" to be the bridge leading to it. For clarity, replace "dev" in the body of the function with "bridge". No functional change intended. Intel-SIG: commit 0791721d8007 PCI/ERR: Use "bridge" for clarity in pcie_do_recovery(). Backport for enable PCIe/RCEC. Link: https://lore.kernel.org/r/20201121001036.8560-8-sean.v.kelley@intel.com Tested-by: Jonathan Cameron # non-native/no RCEC Signed-off-by: Sean V Kelley Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Jonathan Cameron [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/pci/pcie/err.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index a72fee4d2600..515714868524 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -151,23 +151,26 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev)) { int type = pci_pcie_type(dev); - pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER; + struct pci_dev *bridge; struct pci_bus *bus; + pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER; /* - * Error recovery runs on all subordinates of the first downstream port. - * If the downstream port detected the error, it is cleared at the end. + * Error recovery runs on all subordinates of the bridge. If the + * bridge detected the error, it is cleared at the end. 
*/ if (!(type == PCI_EXP_TYPE_ROOT_PORT || type == PCI_EXP_TYPE_DOWNSTREAM)) - dev = pci_upstream_bridge(dev); - bus = dev->subordinate; + bridge = pci_upstream_bridge(dev); + else + bridge = dev; - pci_dbg(dev, "broadcast error_detected message\n"); + bus = bridge->subordinate; + pci_dbg(bridge, "broadcast error_detected message\n"); if (state == pci_channel_io_frozen) { pci_walk_bus(bus, report_frozen_detected, &status); if (reset_subordinates(dev) != PCI_ERS_RESULT_RECOVERED) { - pci_warn(dev, "subordinate device reset failed\n"); + pci_warn(bridge, "subordinate device reset failed\n"); goto failed; } } else { @@ -176,7 +179,7 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, if (status == PCI_ERS_RESULT_CAN_RECOVER) { status = PCI_ERS_RESULT_RECOVERED; - pci_dbg(dev, "broadcast mmio_enabled message\n"); + pci_dbg(bridge, "broadcast mmio_enabled message\n"); pci_walk_bus(bus, report_mmio_enabled, &status); } @@ -187,27 +190,27 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, * drivers' slot_reset callbacks? */ status = PCI_ERS_RESULT_RECOVERED; - pci_dbg(dev, "broadcast slot_reset message\n"); + pci_dbg(bridge, "broadcast slot_reset message\n"); pci_walk_bus(bus, report_slot_reset, &status); } if (status != PCI_ERS_RESULT_RECOVERED) goto failed; - pci_dbg(dev, "broadcast resume message\n"); + pci_dbg(bridge, "broadcast resume message\n"); pci_walk_bus(bus, report_resume, &status); - if (pcie_aer_is_native(dev)) - pcie_clear_device_status(dev); - pci_aer_clear_nonfatal_status(dev); - pci_info(dev, "device recovery successful\n"); + if (pcie_aer_is_native(bridge)) + pcie_clear_device_status(bridge); + pci_aer_clear_nonfatal_status(bridge); + pci_info(bridge, "device recovery successful\n"); return status; failed: - pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT); + pci_uevent_ers(bridge, PCI_ERS_RESULT_DISCONNECT); /* TODO: Should kernel panic here? */ - pci_info(dev, "device recovery failed\n"); + pci_info(bridge, "device recovery failed\n"); return status; } -- Gitee From 37c56b06db322a9da454e7105ad065fc7e2715b0 Mon Sep 17 00:00:00 2001 From: Sean V Kelley Date: Fri, 20 Nov 2020 16:10:29 -0800 Subject: [PATCH 2122/2161] PCI/ERR: Avoid negated conditional for clarity commit 3d7d8fc78f4b504819882278fcfe10784eb985fa upstream. Reverse the sense of the Root Port/Downstream Port conditional for clarity. No functional change intended. Intel-SIG: commit 3d7d8fc78f4b PCI/ERR: Avoid negated conditional for clarity. Backport for enable PCIe/RCEC. Link: https://lore.kernel.org/r/20201121001036.8560-9-sean.v.kelley@intel.com Tested-by: Jonathan Cameron # non-native/no RCEC Signed-off-by: Sean V Kelley Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Jonathan Cameron [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/pci/pcie/err.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 515714868524..889d791e23b8 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -159,11 +159,11 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, * Error recovery runs on all subordinates of the bridge. If the * bridge detected the error, it is cleared at the end. 
*/ - if (!(type == PCI_EXP_TYPE_ROOT_PORT || - type == PCI_EXP_TYPE_DOWNSTREAM)) - bridge = pci_upstream_bridge(dev); - else + if (type == PCI_EXP_TYPE_ROOT_PORT || + type == PCI_EXP_TYPE_DOWNSTREAM) bridge = dev; + else + bridge = pci_upstream_bridge(dev); bus = bridge->subordinate; pci_dbg(bridge, "broadcast error_detected message\n"); -- Gitee From a27d438969b64d1df9d2cd29e922f20a8b30ed43 Mon Sep 17 00:00:00 2001 From: Sean V Kelley Date: Fri, 20 Nov 2020 16:10:30 -0800 Subject: [PATCH 2123/2161] PCI/ERR: Add pci_walk_bridge() to pcie_do_recovery() commit 05e9ae19ab83881a0f33025bd1288e41e552a34b upstream. Consolidate subordinate bus checks with pci_walk_bus() into pci_walk_bridge() for walking below potentially AER affected bridges. Intel-SIG: commit 05e9ae19ab83 PCI/ERR: Add pci_walk_bridge() to pcie_do_recovery(). Backport for enable PCIe/RCEC. Link: https://lore.kernel.org/r/20201121001036.8560-10-sean.v.kelley@intel.com Tested-by: Jonathan Cameron # non-native/no RCEC Signed-off-by: Sean V Kelley Signed-off-by: Bjorn Helgaas [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/pci/pcie/err.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 889d791e23b8..9af71b240ca4 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -146,13 +146,30 @@ static int report_resume(struct pci_dev *dev, void *data) return 0; } +/** + * pci_walk_bridge - walk bridges potentially AER affected + * @bridge: bridge which may be a Port + * @cb: callback to be called for each device found + * @userdata: arbitrary pointer to be passed to callback + * + * If the device provided is a bridge, walk the subordinate bus, including + * any bridged devices on buses under this bus. Call the provided callback + * on each device found. 
+ */ +static void pci_walk_bridge(struct pci_dev *bridge, + int (*cb)(struct pci_dev *, void *), + void *userdata) +{ + if (bridge->subordinate) + pci_walk_bus(bridge->subordinate, cb, userdata); +} + pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, pci_channel_state_t state, pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev)) { int type = pci_pcie_type(dev); struct pci_dev *bridge; - struct pci_bus *bus; pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER; /* @@ -165,22 +182,21 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, else bridge = pci_upstream_bridge(dev); - bus = bridge->subordinate; pci_dbg(bridge, "broadcast error_detected message\n"); if (state == pci_channel_io_frozen) { - pci_walk_bus(bus, report_frozen_detected, &status); + pci_walk_bridge(bridge, report_frozen_detected, &status); if (reset_subordinates(dev) != PCI_ERS_RESULT_RECOVERED) { pci_warn(bridge, "subordinate device reset failed\n"); goto failed; } } else { - pci_walk_bus(bus, report_normal_detected, &status); + pci_walk_bridge(bridge, report_normal_detected, &status); } if (status == PCI_ERS_RESULT_CAN_RECOVER) { status = PCI_ERS_RESULT_RECOVERED; pci_dbg(bridge, "broadcast mmio_enabled message\n"); - pci_walk_bus(bus, report_mmio_enabled, &status); + pci_walk_bridge(bridge, report_mmio_enabled, &status); } if (status == PCI_ERS_RESULT_NEED_RESET) { @@ -191,14 +207,14 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, */ status = PCI_ERS_RESULT_RECOVERED; pci_dbg(bridge, "broadcast slot_reset message\n"); - pci_walk_bus(bus, report_slot_reset, &status); + pci_walk_bridge(bridge, report_slot_reset, &status); } if (status != PCI_ERS_RESULT_RECOVERED) goto failed; pci_dbg(bridge, "broadcast resume message\n"); - pci_walk_bus(bus, report_resume, &status); + pci_walk_bridge(bridge, report_resume, &status); if (pcie_aer_is_native(bridge)) pcie_clear_device_status(bridge); -- Gitee From 85dfe522109ed513d521af126cfad0bf3a7e0090 Mon Sep 17 00:00:00 2001 From: Sean V Kelley Date: Tue, 24 Nov 2020 10:55:30 -0600 Subject: [PATCH 2124/2161] PCI/ERR: Clear AER status only when we control AER commit aa344bc8b727b47b4350b59d8166216a3f351e55 upstream. In some cases a bridge may not exist as the hardware controlling may be handled only by firmware and so is not visible to the OS. This scenario is also possible in future use cases involving non-native use of RCECs by firmware. In this scenario, we expect the platform to retain control of the bridge and to clear error status itself. Clear error status only when the OS has native control of AER. Intel-SIG: commit aa344bc8b727 PCI/ERR: Clear AER status only when we control AER. Backport for enable PCIe/RCEC. Signed-off-by: Sean V Kelley Signed-off-by: Bjorn Helgaas [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/pci/pcie/err.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 9af71b240ca4..2f3ebad99098 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -171,6 +171,7 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, int type = pci_pcie_type(dev); struct pci_dev *bridge; pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER; + struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); /* * Error recovery runs on all subordinates of the bridge. 
If the @@ -216,9 +217,17 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, pci_dbg(bridge, "broadcast resume message\n"); pci_walk_bridge(bridge, report_resume, &status); - if (pcie_aer_is_native(bridge)) + /* + * If we have native control of AER, clear error status in the Root + * Port or Downstream Port that signaled the error. If the + * platform retained control of AER, it is responsible for clearing + * this status. In that case, the signaling device may not even be + * visible to the OS. + */ + if (host->native_aer || pcie_ports_native) { pcie_clear_device_status(bridge); - pci_aer_clear_nonfatal_status(bridge); + pci_aer_clear_nonfatal_status(bridge); + } pci_info(bridge, "device recovery successful\n"); return status; -- Gitee From 7461a03460666a12c7042066eb2b9144da47b6e1 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 29 May 2020 17:56:09 -0500 Subject: [PATCH 2125/2161] PCI/AER: Use "aer" variable for capability offset commit 07b2fbb565e2df7ccc41e5c977b19f5f1f9fe013 upstream. Previously we used "pos" or "aer_pos" for the offset of the AER Capability. Use "aer" consistently and initialize it the same way everywhere. No functional change intended. Intel-SIG: commit aa344bc8b727 PCI/ERR: Clear AER status only when we control AER. Backport for enable PCIe/RCEC. Link: https://lore.kernel.org/r/20200529230915.GA479883@bjorn-Precision-5520 Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/pci/pcie/aer.c | 196 ++++++++++++++++++----------------------- 1 file changed, 84 insertions(+), 112 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index ff096d10a58e..af1bc2e47fc7 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -136,22 +136,18 @@ static const char * const ecrc_policy_str[] = { */ static int enable_ecrc_checking(struct pci_dev *dev) { - int pos; + int aer = dev->aer_cap; u32 reg32; - if (!pci_is_pcie(dev)) + if (!aer) return -ENODEV; - pos = dev->aer_cap; - if (!pos) - return -ENODEV; - - pci_read_config_dword(dev, pos + PCI_ERR_CAP, ®32); + pci_read_config_dword(dev, aer + PCI_ERR_CAP, ®32); if (reg32 & PCI_ERR_CAP_ECRC_GENC) reg32 |= PCI_ERR_CAP_ECRC_GENE; if (reg32 & PCI_ERR_CAP_ECRC_CHKC) reg32 |= PCI_ERR_CAP_ECRC_CHKE; - pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32); + pci_write_config_dword(dev, aer + PCI_ERR_CAP, reg32); return 0; } @@ -164,19 +160,15 @@ static int enable_ecrc_checking(struct pci_dev *dev) */ static int disable_ecrc_checking(struct pci_dev *dev) { - int pos; + int aer = dev->aer_cap; u32 reg32; - if (!pci_is_pcie(dev)) - return -ENODEV; - - pos = dev->aer_cap; - if (!pos) + if (!aer) return -ENODEV; - pci_read_config_dword(dev, pos + PCI_ERR_CAP, ®32); + pci_read_config_dword(dev, aer + PCI_ERR_CAP, ®32); reg32 &= ~(PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE); - pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32); + pci_write_config_dword(dev, aer + PCI_ERR_CAP, reg32); return 0; } @@ -251,22 +243,18 @@ EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting); int pci_aer_clear_nonfatal_status(struct pci_dev *dev) { - int pos; + int aer = dev->aer_cap; u32 status, sev; - pos = dev->aer_cap; - if (!pos) - return -EIO; - if (!pcie_aer_is_native(dev)) return -EIO; /* Clear status bits for ERR_NONFATAL errors only */ - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &sev); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, 
&status); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_SEVER, &sev); status &= ~sev; if (status) - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status); + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status); return 0; } @@ -274,22 +262,18 @@ EXPORT_SYMBOL_GPL(pci_aer_clear_nonfatal_status); void pci_aer_clear_fatal_status(struct pci_dev *dev) { - int pos; + int aer = dev->aer_cap; u32 status, sev; - pos = dev->aer_cap; - if (!pos) - return; - if (!pcie_aer_is_native(dev)) return; /* Clear status bits for ERR_FATAL errors only */ - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &sev); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, &status); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_SEVER, &sev); status &= sev; if (status) - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status); + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status); } /** @@ -303,28 +287,24 @@ void pci_aer_clear_fatal_status(struct pci_dev *dev) */ int pci_aer_raw_clear_status(struct pci_dev *dev) { - int pos; + int aer = dev->aer_cap; u32 status; int port_type; - if (!pci_is_pcie(dev)) - return -ENODEV; - - pos = dev->aer_cap; - if (!pos) + if (!aer) return -EIO; port_type = pci_pcie_type(dev); if (port_type == PCI_EXP_TYPE_ROOT_PORT) { - pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, &status); - pci_write_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, status); + pci_read_config_dword(dev, aer + PCI_ERR_ROOT_STATUS, &status); + pci_write_config_dword(dev, aer + PCI_ERR_ROOT_STATUS, status); } - pci_read_config_dword(dev, pos + PCI_ERR_COR_STATUS, &status); - pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS, status); + pci_read_config_dword(dev, aer + PCI_ERR_COR_STATUS, &status); + pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS, status); - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, &status); + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status); return 0; } @@ -339,12 +319,11 @@ int pci_aer_clear_status(struct pci_dev *dev) void pci_save_aer_state(struct pci_dev *dev) { + int aer = dev->aer_cap; struct pci_cap_saved_state *save_state; u32 *cap; - int pos; - pos = dev->aer_cap; - if (!pos) + if (!aer) return; save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_ERR); @@ -352,22 +331,21 @@ void pci_save_aer_state(struct pci_dev *dev) return; cap = &save_state->cap.data[0]; - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, cap++); - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, cap++); - pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, cap++); - pci_read_config_dword(dev, pos + PCI_ERR_CAP, cap++); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, cap++); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_SEVER, cap++); + pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, cap++); + pci_read_config_dword(dev, aer + PCI_ERR_CAP, cap++); if (pcie_cap_has_rtctl(dev)) - pci_read_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, cap++); + pci_read_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, cap++); } void pci_restore_aer_state(struct pci_dev *dev) { + int aer = dev->aer_cap; struct pci_cap_saved_state *save_state; u32 *cap; - int pos; - pos = dev->aer_cap; - if (!pos) + if (!aer) return; save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_ERR); @@ -375,12 +353,12 @@ void pci_restore_aer_state(struct 
pci_dev *dev) return; cap = &save_state->cap.data[0]; - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, *cap++); - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, *cap++); - pci_write_config_dword(dev, pos + PCI_ERR_COR_MASK, *cap++); - pci_write_config_dword(dev, pos + PCI_ERR_CAP, *cap++); + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, *cap++); + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_SEVER, *cap++); + pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, *cap++); + pci_write_config_dword(dev, aer + PCI_ERR_CAP, *cap++); if (pcie_cap_has_rtctl(dev)) - pci_write_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, *cap++); + pci_write_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, *cap++); } void pci_aer_init(struct pci_dev *dev) @@ -813,7 +791,7 @@ static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev) */ static bool is_error_source(struct pci_dev *dev, struct aer_err_info *e_info) { - int pos; + int aer = dev->aer_cap; u32 status, mask; u16 reg16; @@ -848,17 +826,16 @@ static bool is_error_source(struct pci_dev *dev, struct aer_err_info *e_info) if (!(reg16 & PCI_EXP_AER_FLAGS)) return false; - pos = dev->aer_cap; - if (!pos) + if (!aer) return false; /* Check if error is recorded */ if (e_info->severity == AER_CORRECTABLE) { - pci_read_config_dword(dev, pos + PCI_ERR_COR_STATUS, &status); - pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, &mask); + pci_read_config_dword(dev, aer + PCI_ERR_COR_STATUS, &status); + pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &mask); } else { - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, &mask); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, &status); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask); } if (status & ~mask) return true; @@ -929,16 +906,15 @@ static bool find_source_device(struct pci_dev *parent, */ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info) { - int pos; + int aer = dev->aer_cap; if (info->severity == AER_CORRECTABLE) { /* * Correctable error does not need software intervention. * No need to go through error recovery process. 
*/ - pos = dev->aer_cap; - if (pos) - pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS, + if (aer) + pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS, info->status); if (pcie_aer_is_native(dev)) pcie_clear_device_status(dev); @@ -1030,23 +1006,22 @@ EXPORT_SYMBOL_GPL(aer_recover_queue); */ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) { - int pos, temp; + int aer = dev->aer_cap; + int temp; int type = pci_pcie_type(dev); /* Must reset in this function */ info->status = 0; info->tlp_header_valid = 0; - pos = dev->aer_cap; - /* The device might not support AER */ - if (!pos) + if (!aer) return 0; if (info->severity == AER_CORRECTABLE) { - pci_read_config_dword(dev, pos + PCI_ERR_COR_STATUS, + pci_read_config_dword(dev, aer + PCI_ERR_COR_STATUS, &info->status); - pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, + pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &info->mask); if (!(info->status & ~info->mask)) return 0; @@ -1055,27 +1030,27 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) info->severity == AER_NONFATAL) { /* Link is still healthy for IO reads */ - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, &info->status); - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &info->mask); if (!(info->status & ~info->mask)) return 0; /* Get First Error Pointer */ - pci_read_config_dword(dev, pos + PCI_ERR_CAP, &temp); + pci_read_config_dword(dev, aer + PCI_ERR_CAP, &temp); info->first_error = PCI_ERR_CAP_FEP(temp); if (info->status & AER_LOG_TLP_MASKS) { info->tlp_header_valid = 1; pci_read_config_dword(dev, - pos + PCI_ERR_HEADER_LOG, &info->tlp.dw0); + aer + PCI_ERR_HEADER_LOG, &info->tlp.dw0); pci_read_config_dword(dev, - pos + PCI_ERR_HEADER_LOG + 4, &info->tlp.dw1); + aer + PCI_ERR_HEADER_LOG + 4, &info->tlp.dw1); pci_read_config_dword(dev, - pos + PCI_ERR_HEADER_LOG + 8, &info->tlp.dw2); + aer + PCI_ERR_HEADER_LOG + 8, &info->tlp.dw2); pci_read_config_dword(dev, - pos + PCI_ERR_HEADER_LOG + 12, &info->tlp.dw3); + aer + PCI_ERR_HEADER_LOG + 12, &info->tlp.dw3); } } @@ -1181,15 +1156,15 @@ static irqreturn_t aer_irq(int irq, void *context) struct pcie_device *pdev = (struct pcie_device *)context; struct aer_rpc *rpc = get_service_data(pdev); struct pci_dev *rp = rpc->rpd; + int aer = rp->aer_cap; struct aer_err_source e_src = {}; - int pos = rp->aer_cap; - pci_read_config_dword(rp, pos + PCI_ERR_ROOT_STATUS, &e_src.status); + pci_read_config_dword(rp, aer + PCI_ERR_ROOT_STATUS, &e_src.status); if (!(e_src.status & (PCI_ERR_ROOT_UNCOR_RCV|PCI_ERR_ROOT_COR_RCV))) return IRQ_NONE; - pci_read_config_dword(rp, pos + PCI_ERR_ROOT_ERR_SRC, &e_src.id); - pci_write_config_dword(rp, pos + PCI_ERR_ROOT_STATUS, e_src.status); + pci_read_config_dword(rp, aer + PCI_ERR_ROOT_ERR_SRC, &e_src.id); + pci_write_config_dword(rp, aer + PCI_ERR_ROOT_STATUS, e_src.status); if (!kfifo_put(&rpc->aer_fifo, e_src)) return IRQ_HANDLED; @@ -1241,7 +1216,7 @@ static void set_downstream_devices_error_reporting(struct pci_dev *dev, static void aer_enable_rootport(struct aer_rpc *rpc) { struct pci_dev *pdev = rpc->rpd; - int aer_pos; + int aer = pdev->aer_cap; u16 reg16; u32 reg32; @@ -1253,14 +1228,13 @@ static void aer_enable_rootport(struct aer_rpc *rpc) pcie_capability_clear_word(pdev, PCI_EXP_RTCTL, SYSTEM_ERROR_INTR_ON_MESG_MASK); - aer_pos = pdev->aer_cap; /* Clear error status */ - pci_read_config_dword(pdev, aer_pos 
+ PCI_ERR_ROOT_STATUS, ®32); - pci_write_config_dword(pdev, aer_pos + PCI_ERR_ROOT_STATUS, reg32); - pci_read_config_dword(pdev, aer_pos + PCI_ERR_COR_STATUS, ®32); - pci_write_config_dword(pdev, aer_pos + PCI_ERR_COR_STATUS, reg32); - pci_read_config_dword(pdev, aer_pos + PCI_ERR_UNCOR_STATUS, ®32); - pci_write_config_dword(pdev, aer_pos + PCI_ERR_UNCOR_STATUS, reg32); + pci_read_config_dword(pdev, aer + PCI_ERR_ROOT_STATUS, ®32); + pci_write_config_dword(pdev, aer + PCI_ERR_ROOT_STATUS, reg32); + pci_read_config_dword(pdev, aer + PCI_ERR_COR_STATUS, ®32); + pci_write_config_dword(pdev, aer + PCI_ERR_COR_STATUS, reg32); + pci_read_config_dword(pdev, aer + PCI_ERR_UNCOR_STATUS, ®32); + pci_write_config_dword(pdev, aer + PCI_ERR_UNCOR_STATUS, reg32); /* * Enable error reporting for the root port device and downstream port @@ -1269,9 +1243,9 @@ static void aer_enable_rootport(struct aer_rpc *rpc) set_downstream_devices_error_reporting(pdev, true); /* Enable Root Port's interrupt in response to error messages */ - pci_read_config_dword(pdev, aer_pos + PCI_ERR_ROOT_COMMAND, ®32); + pci_read_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, ®32); reg32 |= ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(pdev, aer_pos + PCI_ERR_ROOT_COMMAND, reg32); + pci_write_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, reg32); } /** @@ -1283,8 +1257,8 @@ static void aer_enable_rootport(struct aer_rpc *rpc) static void aer_disable_rootport(struct aer_rpc *rpc) { struct pci_dev *pdev = rpc->rpd; + int aer = pdev->aer_cap; u32 reg32; - int pos; /* * Disable error reporting for the root port device and downstream port @@ -1292,15 +1266,14 @@ static void aer_disable_rootport(struct aer_rpc *rpc) */ set_downstream_devices_error_reporting(pdev, false); - pos = pdev->aer_cap; /* Disable Root's interrupt in response to error messages */ - pci_read_config_dword(pdev, pos + PCI_ERR_ROOT_COMMAND, ®32); + pci_read_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, ®32); reg32 &= ~ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(pdev, pos + PCI_ERR_ROOT_COMMAND, reg32); + pci_write_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, reg32); /* Clear Root's error status reg */ - pci_read_config_dword(pdev, pos + PCI_ERR_ROOT_STATUS, ®32); - pci_write_config_dword(pdev, pos + PCI_ERR_ROOT_STATUS, reg32); + pci_read_config_dword(pdev, aer + PCI_ERR_ROOT_STATUS, ®32); + pci_write_config_dword(pdev, aer + PCI_ERR_ROOT_STATUS, reg32); } /** @@ -1357,28 +1330,27 @@ static int aer_probe(struct pcie_device *dev) */ static pci_ers_result_t aer_root_reset(struct pci_dev *dev) { + int aer = dev->aer_cap; u32 reg32; - int pos; int rc; - pos = dev->aer_cap; /* Disable Root's interrupt in response to error messages */ - pci_read_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, ®32); + pci_read_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, ®32); reg32 &= ~ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, reg32); + pci_write_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, reg32); rc = pci_bus_error_reset(dev); pci_info(dev, "Root Port link has been reset\n"); /* Clear Root Error Status */ - pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, ®32); - pci_write_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, reg32); + pci_read_config_dword(dev, aer + PCI_ERR_ROOT_STATUS, ®32); + pci_write_config_dword(dev, aer + PCI_ERR_ROOT_STATUS, reg32); /* Enable Root Port's interrupt in response to error messages */ - pci_read_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, ®32); + pci_read_config_dword(dev, aer + 
PCI_ERR_ROOT_COMMAND, ®32); reg32 |= ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, reg32); + pci_write_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, reg32); return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; } -- Gitee From bfd1e2f67fc6df7be408c1585cc91efd2aac9f5b Mon Sep 17 00:00:00 2001 From: Sean V Kelley Date: Fri, 20 Nov 2020 16:10:22 -0800 Subject: [PATCH 2126/2161] PCI/AER: Write AER Capability only when we control it commit 50cc18fcd3053fb46a09db5a39e6516e9560f765 upstream. If an OS has not been granted AER control via _OSC, it should not make changes to PCI_ERR_ROOT_COMMAND and PCI_ERR_ROOT_STATUS related registers. Per section 4.5.1 of the System Firmware Intermediary (SFI) _OSC and DPC Updates ECN [1], this bit also covers these aspects of the PCI Express Advanced Error Reporting. Based on the above and earlier discussion [2], make the following changes: Add a check for the native case (i.e., AER control via _OSC) Note that the previous "clear, reset, enable" order suggests that the reset might cause errors that we should ignore. After this commit, those errors (if any) will remain logged in the PCI_ERR_ROOT_STATUS register. [1] System Firmware Intermediary (SFI) _OSC and DPC Updates ECN, Feb 24, 2020, affecting PCI Firmware Specification, Rev. 3.2 https://members.pcisig.com/wg/PCI-SIG/document/14076 [2] https://lore.kernel.org/linux-pci/20201020162820.GA370938@bjorn-Precision-5520/ Intel-SIG: commit 50cc18fcd305 PCI/AER: Write AER Capability only when we control it. Backport for enable PCIe/RCEC. Link: https://lore.kernel.org/r/20201121001036.8560-2-sean.v.kelley@intel.com Tested-by: Jonathan Cameron # non-native/no RCEC Signed-off-by: Sean V Kelley Signed-off-by: Bjorn Helgaas [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/pci/pcie/aer.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index af1bc2e47fc7..6a92d96e1f1c 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1334,23 +1334,26 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev) u32 reg32; int rc; - - /* Disable Root's interrupt in response to error messages */ - pci_read_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, ®32); - reg32 &= ~ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, reg32); + if (pcie_aer_is_native(dev)) { + /* Disable Root's interrupt in response to error messages */ + pci_read_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, ®32); + reg32 &= ~ROOT_PORT_INTR_ON_MESG_MASK; + pci_write_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, reg32); + } rc = pci_bus_error_reset(dev); - pci_info(dev, "Root Port link has been reset\n"); + pci_info(dev, "Root Port link has been reset (%d)\n", rc); - /* Clear Root Error Status */ - pci_read_config_dword(dev, aer + PCI_ERR_ROOT_STATUS, ®32); - pci_write_config_dword(dev, aer + PCI_ERR_ROOT_STATUS, reg32); + if (pcie_aer_is_native(dev)) { + /* Clear Root Error Status */ + pci_read_config_dword(dev, aer + PCI_ERR_ROOT_STATUS, ®32); + pci_write_config_dword(dev, aer + PCI_ERR_ROOT_STATUS, reg32); - /* Enable Root Port's interrupt in response to error messages */ - pci_read_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, ®32); - reg32 |= ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, reg32); + /* Enable Root Port's interrupt in response to error messages */ + pci_read_config_dword(dev, aer + 
PCI_ERR_ROOT_COMMAND, ®32); + reg32 |= ROOT_PORT_INTR_ON_MESG_MASK; + pci_write_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, reg32); + } return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; } -- Gitee From cb30c1415c62e0bb053070932fe185f649b56e6b Mon Sep 17 00:00:00 2001 From: Sean V Kelley Date: Wed, 2 Dec 2020 11:26:29 -0600 Subject: [PATCH 2127/2161] PCI/ERR: Recover from RCEC AER errors commit a175102b0a82fc57853a9e611c42d1d6172e5180 upstream. A Root Complex Event Collector (RCEC) collects and signals AER errors that were detected by Root Complex Integrated Endpoints (RCiEPs), but it may also signal errors it detects itself. This is analogous to errors detected and signaled by a Root Port. Update the AER service driver to claim RCECs in addition to Root Ports. Add support for handling RCEC-detected AER errors. This does not include handling RCiEP-detected errors that are signaled by the RCEC. Note that we expect these errors only from the native AER and APEI paths, not from DPC or EDR. Intel-SIG: commit a175102b0a82 PCI/ERR: Recover from RCEC AER errors. Backport for enable PCIe/RCEC. [bhelgaas: split from combined RCEC/RCiEP patch, commit log] Signed-off-by: Sean V Kelley Signed-off-by: Bjorn Helgaas [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/pci/pcie/aer.c | 58 +++++++++++++++++++++++++++++------------- drivers/pci/pcie/err.c | 19 +++++++++++--- 2 files changed, 56 insertions(+), 21 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 6a92d96e1f1c..dbf933903917 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -295,7 +295,8 @@ int pci_aer_raw_clear_status(struct pci_dev *dev) return -EIO; port_type = pci_pcie_type(dev); - if (port_type == PCI_EXP_TYPE_ROOT_PORT) { + if (port_type == PCI_EXP_TYPE_ROOT_PORT || + port_type == PCI_EXP_TYPE_RC_EC) { pci_read_config_dword(dev, aer + PCI_ERR_ROOT_STATUS, &status); pci_write_config_dword(dev, aer + PCI_ERR_ROOT_STATUS, status); } @@ -571,7 +572,8 @@ static umode_t aer_stats_attrs_are_visible(struct kobject *kobj, if ((a == &dev_attr_aer_rootport_total_err_cor.attr || a == &dev_attr_aer_rootport_total_err_fatal.attr || a == &dev_attr_aer_rootport_total_err_nonfatal.attr) && - pci_pcie_type(pdev) != PCI_EXP_TYPE_ROOT_PORT) + ((pci_pcie_type(pdev) != PCI_EXP_TYPE_ROOT_PORT) && + (pci_pcie_type(pdev) != PCI_EXP_TYPE_RC_EC))) return 0; return a->mode; @@ -1178,6 +1180,7 @@ static int set_device_error_reporting(struct pci_dev *dev, void *data) int type = pci_pcie_type(dev); if ((type == PCI_EXP_TYPE_ROOT_PORT) || + (type == PCI_EXP_TYPE_RC_EC) || (type == PCI_EXP_TYPE_UPSTREAM) || (type == PCI_EXP_TYPE_DOWNSTREAM)) { if (enable) @@ -1302,6 +1305,11 @@ static int aer_probe(struct pcie_device *dev) struct device *device = &dev->device; struct pci_dev *port = dev->port; + /* Limit to Root Ports or Root Complex Event Collectors */ + if ((pci_pcie_type(port) != PCI_EXP_TYPE_RC_EC) && + (pci_pcie_type(port) != PCI_EXP_TYPE_ROOT_PORT)) + return -ENODEV; + rpc = devm_kzalloc(device, sizeof(struct aer_rpc), GFP_KERNEL); if (!rpc) return -ENOMEM; @@ -1323,36 +1331,52 @@ static int aer_probe(struct pcie_device *dev) } /** - * aer_root_reset - reset link on Root Port - * @dev: pointer to Root Port's pci_dev data structure + * aer_root_reset - reset Root Port hierarchy or RCEC + * @dev: pointer to Root Port or RCEC * - * Invoked by Port Bus driver when performing link reset at Root Port. + * Invoked by Port Bus driver when performing reset. 
*/ static pci_ers_result_t aer_root_reset(struct pci_dev *dev) { - int aer = dev->aer_cap; + int type = pci_pcie_type(dev); + struct pci_dev *root; + int aer; + struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); u32 reg32; int rc; - if (pcie_aer_is_native(dev)) { + root = dev; /* device with Root Error registers */ + aer = root->aer_cap; + + if ((host->native_aer || pcie_ports_native) && aer) { /* Disable Root's interrupt in response to error messages */ - pci_read_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, ®32); + pci_read_config_dword(root, aer + PCI_ERR_ROOT_COMMAND, ®32); reg32 &= ~ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, reg32); + pci_write_config_dword(root, aer + PCI_ERR_ROOT_COMMAND, reg32); } - rc = pci_bus_error_reset(dev); - pci_info(dev, "Root Port link has been reset (%d)\n", rc); + if (type == PCI_EXP_TYPE_RC_EC) { + if (pcie_has_flr(dev)) { + rc = pcie_flr(dev); + pci_info(dev, "has been reset (%d)\n", rc); + } else { + pci_info(dev, "not reset (no FLR support)\n"); + rc = -ENOTTY; + } + } else { + rc = pci_bus_error_reset(dev); + pci_info(dev, "Root Port link has been reset (%d)\n", rc); + } - if (pcie_aer_is_native(dev)) { + if ((host->native_aer || pcie_ports_native) && aer) { /* Clear Root Error Status */ - pci_read_config_dword(dev, aer + PCI_ERR_ROOT_STATUS, ®32); - pci_write_config_dword(dev, aer + PCI_ERR_ROOT_STATUS, reg32); + pci_read_config_dword(root, aer + PCI_ERR_ROOT_STATUS, ®32); + pci_write_config_dword(root, aer + PCI_ERR_ROOT_STATUS, reg32); /* Enable Root Port's interrupt in response to error messages */ - pci_read_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, ®32); + pci_read_config_dword(root, aer + PCI_ERR_ROOT_COMMAND, ®32); reg32 |= ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, reg32); + pci_write_config_dword(root, aer + PCI_ERR_ROOT_COMMAND, reg32); } return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; @@ -1360,7 +1384,7 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev) static struct pcie_port_service_driver aerdriver = { .name = "aer", - .port_type = PCI_EXP_TYPE_ROOT_PORT, + .port_type = PCIE_ANY_PORT, .service = PCIE_PORT_SERVICE_AER, .probe = aer_probe, diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 2f3ebad99098..adc92d1c6d6f 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -148,13 +148,16 @@ static int report_resume(struct pci_dev *dev, void *data) /** * pci_walk_bridge - walk bridges potentially AER affected - * @bridge: bridge which may be a Port + * @bridge: bridge which may be a Port or an RCEC * @cb: callback to be called for each device found * @userdata: arbitrary pointer to be passed to callback * * If the device provided is a bridge, walk the subordinate bus, including * any bridged devices on buses under this bus. Call the provided callback * on each device found. + * + * If the device provided has no subordinate bus, e.g., an RCEC, call the + * callback on the device itself. 
*/ static void pci_walk_bridge(struct pci_dev *bridge, int (*cb)(struct pci_dev *, void *), @@ -162,6 +165,8 @@ static void pci_walk_bridge(struct pci_dev *bridge, { if (bridge->subordinate) pci_walk_bus(bridge->subordinate, cb, userdata); + else + cb(bridge, userdata); } pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, @@ -174,11 +179,17 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); /* - * Error recovery runs on all subordinates of the bridge. If the - * bridge detected the error, it is cleared at the end. + * If the error was detected by a Root Port, Downstream Port, or + * RCEC, recovery runs on the device itself. For Ports, that also + * includes any subordinate devices. + * + * If it was detected by another device (Endpoint, etc), recovery + * runs on the device and anything else under the same Port, i.e., + * everything under "bridge". */ if (type == PCI_EXP_TYPE_ROOT_PORT || - type == PCI_EXP_TYPE_DOWNSTREAM) + type == PCI_EXP_TYPE_DOWNSTREAM || + type == PCI_EXP_TYPE_RC_EC) bridge = dev; else bridge = pci_upstream_bridge(dev); -- Gitee From 3045f27c63250982e88a1748f94fb845c6669836 Mon Sep 17 00:00:00 2001 From: Sean V Kelley Date: Fri, 20 Nov 2020 16:10:32 -0800 Subject: [PATCH 2128/2161] PCI/ERR: Add pcie_link_rcec() to associate RCiEPs commit 507b460f814458605c47b0ed03c11e49a712fc08 upstream. A Root Complex Event Collector terminates error and PME messages from associated RCiEPs. Use the RCEC Endpoint Association Extended Capability to identify associated RCiEPs. Link the associated RCiEPs as the RCECs are enumerated. Intel-SIG: commit 507b460f8144 PCI/ERR: Add pcie_link_rcec() to associate RCiEPs. Backport for enable PCIe/RCEC. Co-developed-by: Qiuxu Zhuo Link: https://lore.kernel.org/r/20201121001036.8560-12-sean.v.kelley@intel.com Tested-by: Jonathan Cameron # non-native/no RCEC Signed-off-by: Qiuxu Zhuo Signed-off-by: Sean V Kelley Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/pci/pci.h | 2 + drivers/pci/pcie/portdrv_pci.c | 3 ++ drivers/pci/pcie/rcec.c | 94 ++++++++++++++++++++++++++++++++++ include/linux/pci.h | 1 + 4 files changed, 100 insertions(+) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index b2bddbd58501..fdd1be462a50 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -480,9 +480,11 @@ static inline bool pci_dpc_recovered(struct pci_dev *pdev) { return false; } #ifdef CONFIG_PCIEPORTBUS void pci_rcec_init(struct pci_dev *dev); void pci_rcec_exit(struct pci_dev *dev); +void pcie_link_rcec(struct pci_dev *rcec); #else static inline void pci_rcec_init(struct pci_dev *dev) {} static inline void pci_rcec_exit(struct pci_dev *dev) {} +static inline void pcie_link_rcec(struct pci_dev *rcec) {} #endif #ifdef CONFIG_PCI_ATS diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index 82e1866d61d2..d622024a41bc 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -111,6 +111,9 @@ static int pcie_portdrv_probe(struct pci_dev *dev, (type != PCI_EXP_TYPE_RC_EC))) return -ENODEV; + if (type == PCI_EXP_TYPE_RC_EC) + pcie_link_rcec(dev); + status = pcie_port_device_register(dev); if (status) return status; diff --git a/drivers/pci/pcie/rcec.c b/drivers/pci/pcie/rcec.c index 038e9d706d5f..cdec277cbd62 100644 --- a/drivers/pci/pcie/rcec.c +++ b/drivers/pci/pcie/rcec.c @@ -15,6 +15,100 @@ #include "../pci.h" +struct walk_rcec_data { + struct pci_dev 
*rcec; + int (*user_callback)(struct pci_dev *dev, void *data); + void *user_data; +}; + +static bool rcec_assoc_rciep(struct pci_dev *rcec, struct pci_dev *rciep) +{ + unsigned long bitmap = rcec->rcec_ea->bitmap; + unsigned int devn; + + /* An RCiEP found on a different bus in range */ + if (rcec->bus->number != rciep->bus->number) + return true; + + /* Same bus, so check bitmap */ + for_each_set_bit(devn, &bitmap, 32) + if (devn == rciep->devfn) + return true; + + return false; +} + +static int link_rcec_helper(struct pci_dev *dev, void *data) +{ + struct walk_rcec_data *rcec_data = data; + struct pci_dev *rcec = rcec_data->rcec; + + if ((pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) && + rcec_assoc_rciep(rcec, dev)) { + dev->rcec = rcec; + pci_dbg(dev, "PME & error events signaled via %s\n", + pci_name(rcec)); + } + + return 0; +} + +static void walk_rcec(int (*cb)(struct pci_dev *dev, void *data), + void *userdata) +{ + struct walk_rcec_data *rcec_data = userdata; + struct pci_dev *rcec = rcec_data->rcec; + u8 nextbusn, lastbusn; + struct pci_bus *bus; + unsigned int bnr; + + if (!rcec->rcec_ea) + return; + + /* Walk own bus for bitmap based association */ + pci_walk_bus(rcec->bus, cb, rcec_data); + + nextbusn = rcec->rcec_ea->nextbusn; + lastbusn = rcec->rcec_ea->lastbusn; + + /* All RCiEP devices are on the same bus as the RCEC */ + if (nextbusn == 0xff && lastbusn == 0x00) + return; + + for (bnr = nextbusn; bnr <= lastbusn; bnr++) { + /* No association indicated (PCIe 5.0-1, 7.9.10.3) */ + if (bnr == rcec->bus->number) + continue; + + bus = pci_find_bus(pci_domain_nr(rcec->bus), bnr); + if (!bus) + continue; + + /* Find RCiEP devices on the given bus ranges */ + pci_walk_bus(bus, cb, rcec_data); + } +} + +/** + * pcie_link_rcec - Link RCiEP devices associated with RCEC. + * @rcec: RCEC whose RCiEP devices should be linked. + * + * Link the given RCEC to each RCiEP device found. + */ +void pcie_link_rcec(struct pci_dev *rcec) +{ + struct walk_rcec_data rcec_data; + + if (!rcec->rcec_ea) + return; + + rcec_data.rcec = rcec; + rcec_data.user_callback = NULL; + rcec_data.user_data = NULL; + + walk_rcec(link_rcec_helper, &rcec_data); +} + void pci_rcec_init(struct pci_dev *dev) { struct rcec_ea *rcec_ea; diff --git a/include/linux/pci.h b/include/linux/pci.h index f01b62c93dde..815d5f9c7957 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -313,6 +313,7 @@ struct pci_dev { #endif #ifdef CONFIG_PCIEPORTBUS struct rcec_ea *rcec_ea; /* RCEC cached endpoint association */ + struct pci_dev *rcec; /* Associated RCEC device */ #endif u8 pcie_cap; /* PCIe capability offset */ u8 msi_cap; /* MSI capability offset */ -- Gitee From 4f162428a57cfac0055e139d5218f6d383cbd516 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 20 Nov 2020 16:10:33 -0800 Subject: [PATCH 2129/2161] PCI/ERR: Recover from RCiEP AER errors commit 5790862255028c831761e13014ee87a06df828f1 upstream. Add support for handling AER errors detected by Root Complex Integrated Endpoints (RCiEPs). These errors are signaled to software natively via a Root Complex Event Collector (RCEC) or non-natively via ACPI APEI if the platform retains control of AER or uses a non-standard RCEC-like device. When recovering from RCiEP errors, the Root Error Command and Status registers are in the AER Capability of an associated RCEC (if any), not in a Root Port. In the non-native case, the platform is responsible for those registers and we can't touch them. Intel-SIG: commit 579086225502 PCI/ERR: Recover from RCiEP AER errors. 
Backport for enable PCIe/RCEC. [bhelgaas: commit log, etc] Co-developed-by: Sean V Kelley Link: https://lore.kernel.org/r/20201121001036.8560-13-sean.v.kelley@intel.com Signed-off-by: Sean V Kelley Signed-off-by: Qiuxu Zhuo Signed-off-by: Bjorn Helgaas [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/pci/pcie/aer.c | 24 +++++++++++++++++++----- drivers/pci/pcie/err.c | 15 ++++++++------- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index dbf933903917..dfc552dbd9a5 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1331,8 +1331,8 @@ static int aer_probe(struct pcie_device *dev) } /** - * aer_root_reset - reset Root Port hierarchy or RCEC - * @dev: pointer to Root Port or RCEC + * aer_root_reset - reset Root Port hierarchy, RCEC, or RCiEP + * @dev: pointer to Root Port, RCEC, or RCiEP * * Invoked by Port Bus driver when performing reset. */ @@ -1345,8 +1345,22 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev) u32 reg32; int rc; - root = dev; /* device with Root Error registers */ - aer = root->aer_cap; + /* + * Only Root Ports and RCECs have AER Root Command and Root Status + * registers. If "dev" is an RCiEP, the relevant registers are in + * the RCEC. + */ + if (type == PCI_EXP_TYPE_RC_END) + root = dev->rcec; + else + root = dev; + + /* + * If the platform retained control of AER, an RCiEP may not have + * an RCEC visible to us, so dev->rcec ("root") may be NULL. In + * that case, firmware is responsible for these registers. + */ + aer = root ? root->aer_cap : 0; if ((host->native_aer || pcie_ports_native) && aer) { /* Disable Root's interrupt in response to error messages */ @@ -1355,7 +1369,7 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev) pci_write_config_dword(root, aer + PCI_ERR_ROOT_COMMAND, reg32); } - if (type == PCI_EXP_TYPE_RC_EC) { + if (type == PCI_EXP_TYPE_RC_EC || type == PCI_EXP_TYPE_RC_END) { if (pcie_has_flr(dev)) { rc = pcie_flr(dev); pci_info(dev, "has been reset (%d)\n", rc); diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index adc92d1c6d6f..168674bfbec5 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -148,7 +148,7 @@ static int report_resume(struct pci_dev *dev, void *data) /** * pci_walk_bridge - walk bridges potentially AER affected - * @bridge: bridge which may be a Port or an RCEC + * @bridge: bridge which may be a Port, an RCEC, or an RCiEP * @cb: callback to be called for each device found * @userdata: arbitrary pointer to be passed to callback * @@ -156,8 +156,8 @@ static int report_resume(struct pci_dev *dev, void *data) * any bridged devices on buses under this bus. Call the provided callback * on each device found. * - * If the device provided has no subordinate bus, e.g., an RCEC, call the - * callback on the device itself. + * If the device provided has no subordinate bus, e.g., an RCEC or RCiEP, + * call the callback on the device itself. */ static void pci_walk_bridge(struct pci_dev *bridge, int (*cb)(struct pci_dev *, void *), @@ -179,9 +179,9 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); /* - * If the error was detected by a Root Port, Downstream Port, or - * RCEC, recovery runs on the device itself. For Ports, that also - * includes any subordinate devices. + * If the error was detected by a Root Port, Downstream Port, RCEC, + * or RCiEP, recovery runs on the device itself. 
For Ports, that + * also includes any subordinate devices. * * If it was detected by another device (Endpoint, etc), recovery * runs on the device and anything else under the same Port, i.e., @@ -189,7 +189,8 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, */ if (type == PCI_EXP_TYPE_ROOT_PORT || type == PCI_EXP_TYPE_DOWNSTREAM || - type == PCI_EXP_TYPE_RC_EC) + type == PCI_EXP_TYPE_RC_EC || + type == PCI_EXP_TYPE_RC_END) bridge = dev; else bridge = pci_upstream_bridge(dev); -- Gitee From 19d5a5885560aeba6525914ebb5ef3a349013742 Mon Sep 17 00:00:00 2001 From: Sean V Kelley Date: Fri, 20 Nov 2020 16:10:34 -0800 Subject: [PATCH 2130/2161] PCI/AER: Add pcie_walk_rcec() to RCEC AER handling commit af113553d9610b2d811d05da96263b4f666f44f0 upstream. Root Complex Event Collectors (RCEC) appear as peers to Root Ports and also have the AER capability. In addition, actions need to be taken for associated RCiEPs. In such cases the RCECs will need to be walked in order to find and act upon their respective RCiEPs. Extend the existing ability to link the RCECs with a walking function pcie_walk_rcec(). Add RCEC support to the current AER service driver and attach the AER service driver to the RCEC device. Intel-SIG: commit af113553d961 PCI/AER: Add pcie_walk_rcec() to RCEC AER handling. Backport for enable PCIe/RCEC. Co-developed-by: Qiuxu Zhuo Link: https://lore.kernel.org/r/20201121001036.8560-14-sean.v.kelley@intel.com Tested-by: Jonathan Cameron # non-native/no RCEC Signed-off-by: Qiuxu Zhuo Signed-off-by: Sean V Kelley Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Reviewed-by: Kuppuswamy Sathyanarayanan [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/pci/pci.h | 6 ++++++ drivers/pci/pcie/aer.c | 15 +++++++++++---- drivers/pci/pcie/rcec.c | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 4 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index fdd1be462a50..74a9f61bf057 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -481,10 +481,16 @@ static inline bool pci_dpc_recovered(struct pci_dev *pdev) { return false; } void pci_rcec_init(struct pci_dev *dev); void pci_rcec_exit(struct pci_dev *dev); void pcie_link_rcec(struct pci_dev *rcec); +void pcie_walk_rcec(struct pci_dev *rcec, + int (*cb)(struct pci_dev *, void *), + void *userdata); #else static inline void pci_rcec_init(struct pci_dev *dev) {} static inline void pci_rcec_exit(struct pci_dev *dev) {} static inline void pcie_link_rcec(struct pci_dev *rcec) {} +static inline void pcie_walk_rcec(struct pci_dev *rcec, + int (*cb)(struct pci_dev *, void *), + void *userdata) {} #endif #ifdef CONFIG_PCI_ATS diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index dfc552dbd9a5..0bc2f1ec893e 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -890,7 +890,10 @@ static bool find_source_device(struct pci_dev *parent, if (result) return true; - pci_walk_bus(parent->subordinate, find_device_iter, e_info); + if (pci_pcie_type(parent) == PCI_EXP_TYPE_RC_EC) + pcie_walk_rcec(parent, find_device_iter, e_info); + else + pci_walk_bus(parent->subordinate, find_device_iter, e_info); if (!e_info->error_dev_num) { pci_info(parent, "can't find device of ID%04x\n", e_info->id); @@ -1028,6 +1031,7 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) if (!(info->status & ~info->mask)) return 0; } else if (type == PCI_EXP_TYPE_ROOT_PORT || + type == PCI_EXP_TYPE_RC_EC || type == PCI_EXP_TYPE_DOWNSTREAM || info->severity == 
AER_NONFATAL) { @@ -1205,9 +1209,12 @@ static void set_downstream_devices_error_reporting(struct pci_dev *dev, { set_device_error_reporting(dev, &enable); - if (!dev->subordinate) - return; - pci_walk_bus(dev->subordinate, set_device_error_reporting, &enable); + if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC) + pcie_walk_rcec(dev, set_device_error_reporting, &enable); + else if (dev->subordinate) + pci_walk_bus(dev->subordinate, set_device_error_reporting, + &enable); + } /** diff --git a/drivers/pci/pcie/rcec.c b/drivers/pci/pcie/rcec.c index cdec277cbd62..2c5c552994e4 100644 --- a/drivers/pci/pcie/rcec.c +++ b/drivers/pci/pcie/rcec.c @@ -53,6 +53,18 @@ static int link_rcec_helper(struct pci_dev *dev, void *data) return 0; } +static int walk_rcec_helper(struct pci_dev *dev, void *data) +{ + struct walk_rcec_data *rcec_data = data; + struct pci_dev *rcec = rcec_data->rcec; + + if ((pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) && + rcec_assoc_rciep(rcec, dev)) + rcec_data->user_callback(dev, rcec_data->user_data); + + return 0; +} + static void walk_rcec(int (*cb)(struct pci_dev *dev, void *data), void *userdata) { @@ -109,6 +121,31 @@ void pcie_link_rcec(struct pci_dev *rcec) walk_rcec(link_rcec_helper, &rcec_data); } +/** + * pcie_walk_rcec - Walk RCiEP devices associating with RCEC and call callback. + * @rcec: RCEC whose RCiEP devices should be walked + * @cb: Callback to be called for each RCiEP device found + * @userdata: Arbitrary pointer to be passed to callback + * + * Walk the given RCEC. Call the callback on each RCiEP found. + * + * If @cb returns anything other than 0, break out. + */ +void pcie_walk_rcec(struct pci_dev *rcec, int (*cb)(struct pci_dev *, void *), + void *userdata) +{ + struct walk_rcec_data rcec_data; + + if (!rcec->rcec_ea) + return; + + rcec_data.rcec = rcec; + rcec_data.user_callback = cb; + rcec_data.user_data = userdata; + + walk_rcec(walk_rcec_helper, &rcec_data); +} + void pci_rcec_init(struct pci_dev *dev) { struct rcec_ea *rcec_ea; -- Gitee From 0e0da8255db53b5c7c63b7c71b484e53dc1bb7d9 Mon Sep 17 00:00:00 2001 From: Sean V Kelley Date: Fri, 20 Nov 2020 16:10:35 -0800 Subject: [PATCH 2131/2161] PCI/PME: Add pcie_walk_rcec() to RCEC PME handling commit 9a2f604f44979e0effa8cf067e5a8ecda729f23b upstream. Root Complex Event Collectors (RCEC) appear as peers of Root Ports and also have the PME capability. As with AER, there is a need to be able to walk the RCiEPs associated with their RCEC for purposes of acting upon them with callbacks. Add RCEC support through the use of pcie_walk_rcec() to the current PME service driver and attach the PME service driver to the RCEC device. Intel-SIG: commit 9a2f604f4497 PCI/PME: Add pcie_walk_rcec() to RCEC PME handling. Backport for enable PCIe/RCEC. 
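To illustrate the dispatch pattern this patch applies (a sketch only, not part of the diff below; the wrapper name mark_assoc_devices() is hypothetical): a port service that previously walked port->subordinate can treat an RCEC the same way by walking its associated RCiEPs via pcie_walk_rcec(), using the same callback signature as pci_walk_bus().

        /*
         * Sketch only: run a per-device callback either across a Port's
         * secondary bus or across the RCiEPs associated with an RCEC.
         * The callback signature matches pci_walk_bus()/pcie_walk_rcec().
         */
        static void mark_assoc_devices(struct pci_dev *port,
                                       int (*cb)(struct pci_dev *, void *),
                                       void *data)
        {
                if (pci_pcie_type(port) == PCI_EXP_TYPE_RC_EC)
                        pcie_walk_rcec(port, cb, data); /* visit associated RCiEPs */
                else if (port->subordinate)
                        pci_walk_bus(port->subordinate, cb, data);
        }

This is the same shape pcie_pme_mark_devices() takes in the hunk further down.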
Co-developed-by: Qiuxu Zhuo Link: https://lore.kernel.org/r/20201121001036.8560-15-sean.v.kelley@intel.com Tested-by: Jonathan Cameron # non-native/no RCEC Signed-off-by: Qiuxu Zhuo Signed-off-by: Sean V Kelley Signed-off-by: Bjorn Helgaas [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/pci/pcie/pme.c | 16 ++++++++++++---- drivers/pci/pcie/portdrv_core.c | 9 +++------ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/pci/pcie/pme.c b/drivers/pci/pcie/pme.c index f38e6c19dd50..2d25c047e1c4 100644 --- a/drivers/pci/pcie/pme.c +++ b/drivers/pci/pcie/pme.c @@ -310,7 +310,10 @@ static int pcie_pme_can_wakeup(struct pci_dev *dev, void *ign) static void pcie_pme_mark_devices(struct pci_dev *port) { pcie_pme_can_wakeup(port, NULL); - if (port->subordinate) + + if (pci_pcie_type(port) == PCI_EXP_TYPE_RC_EC) + pcie_walk_rcec(port, pcie_pme_can_wakeup, NULL); + else if (port->subordinate) pci_walk_bus(port->subordinate, pcie_pme_can_wakeup, NULL); } @@ -320,10 +323,16 @@ static void pcie_pme_mark_devices(struct pci_dev *port) */ static int pcie_pme_probe(struct pcie_device *srv) { - struct pci_dev *port; + struct pci_dev *port = srv->port; struct pcie_pme_service_data *data; + int type = pci_pcie_type(port); int ret; + /* Limit to Root Ports or Root Complex Event Collectors */ + if (type != PCI_EXP_TYPE_RC_EC && + type != PCI_EXP_TYPE_ROOT_PORT) + return -ENODEV; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; @@ -333,7 +342,6 @@ static int pcie_pme_probe(struct pcie_device *srv) data->srv = srv; set_service_data(srv, data); - port = srv->port; pcie_pme_interrupt_enable(port, false); pcie_clear_root_pme_status(port); @@ -445,7 +453,7 @@ static void pcie_pme_remove(struct pcie_device *srv) static struct pcie_port_service_driver pcie_pme_driver = { .name = "pcie_pme", - .port_type = PCI_EXP_TYPE_ROOT_PORT, + .port_type = PCIE_ANY_PORT, .service = PCIE_PORT_SERVICE_PME, .probe = pcie_pme_probe, diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 50a9522ab07d..e1fed6649c41 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -233,12 +233,9 @@ static int get_port_device_capability(struct pci_dev *dev) } #endif - /* - * Root ports are capable of generating PME too. Root Complex - * Event Collectors can also generate PMEs, but we don't handle - * those yet. - */ - if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT && + /* Root Ports and Root Complex Event Collectors may generate PMEs */ + if ((pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || + pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC) && (pcie_ports_native || host->native_pme)) { services |= PCIE_PORT_SERVICE_PME; -- Gitee From c3960acfce6cb5feddf90501916326f7e3319bf7 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 20 Nov 2020 16:10:36 -0800 Subject: [PATCH 2132/2161] PCI/AER: Add RCEC AER error injection support commit d292dd0eb3ac6ce6ea66715bb9f6b8e2ae70747c upstream. Root Complex Event Collectors (RCEC) appear as peers to Root Ports and may also have the AER capability. Add RCEC support to the AER error injection driver. Intel-SIG: commit d292dd0eb3ac PCI/AER: Add RCEC AER error injection support. Backport for enable PCIe/RCEC. 
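As a sketch of the lookup the injector now performs (the helper name error_target() is hypothetical, not part of the patch): the injected error must land in the device that owns the Root Error registers, which is the Root Port when one exists and otherwise the RCEC recorded in dev->rcec by pcie_link_rcec(); that pointer may still be NULL when firmware retains control of AER.

        /*
         * Sketch only: resolve the device whose Root Error registers
         * receive the injected error.  dev->rcec is filled in by
         * pcie_link_rcec() and may be NULL on non-native configurations.
         */
        static struct pci_dev *error_target(struct pci_dev *dev)
        {
                struct pci_dev *rpdev = pcie_find_root_port(dev);

                if (!rpdev)
                        rpdev = dev->rcec;      /* RCiEP signaled via an RCEC */

                return rpdev;                   /* NULL: neither Root Port nor RCEC */
        }
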
Co-developed-by: Sean V Kelley Link: https://lore.kernel.org/r/20201121001036.8560-16-sean.v.kelley@intel.com Tested-by: Jonathan Cameron # non-native/no RCEC Signed-off-by: Qiuxu Zhuo Signed-off-by: Sean V Kelley Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/pci/pcie/aer_inject.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pcie/aer_inject.c b/drivers/pci/pcie/aer_inject.c index 21cc3d3387f7..f7e452728d1d 100644 --- a/drivers/pci/pcie/aer_inject.c +++ b/drivers/pci/pcie/aer_inject.c @@ -333,8 +333,11 @@ static int aer_inject(struct aer_error_inj *einj) if (!dev) return -ENODEV; rpdev = pcie_find_root_port(dev); + /* If Root Port not found, try to find an RCEC */ + if (!rpdev) + rpdev = dev->rcec; if (!rpdev) { - pci_err(dev, "Root port not found\n"); + pci_err(dev, "Neither Root Port nor RCEC found\n"); ret = -ENODEV; goto out_put; } -- Gitee From aafbb7a11e22fe469e993585afd2fde4de5dcdaa Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Mon, 22 Feb 2021 09:17:17 +0800 Subject: [PATCH 2133/2161] PCI/RCEC: Fix RCiEP device to RCEC association commit a175102b0a82fc57853a9e611c42d1d6172e5180 upstream. rcec_assoc_rciep() used "rciep->devfn" (a single byte encoding both the device and function number) as the device number to check whether the corresponding bit was set in the RCEC's Association Bitmap for RCiEPs. But per PCIe r5.0, sec 7.9.10.2, "Association Bitmap for RCiEPs", the 32-bit bitmap contains one bit per device. That bit applies to all functions of the device. Fix rcec_assoc_rciep() to convert the value of "rciep->devfn" to a device number to ensure that RCiEP devices are correctly associated with the RCEC. Intel-SIG: commit a175102b0a82 PCI/ERR: Recover from RCEC AER errors. Backport for enable PCIe/RCEC. Reported-and-tested-by: Wen Jin Fixes: 507b460f8144 ("PCI/ERR: Add pcie_link_rcec() to associate RCiEPs") Link: https://lore.kernel.org/r/20210222011717.43266-1-qiuxu.zhuo@intel.com Signed-off-by: Qiuxu Zhuo Signed-off-by: Bjorn Helgaas Reviewed-by: Sean V Kelley [ Youquan Song amend commit log ] Signed-off-by: Youquan Song --- drivers/pci/pcie/rcec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pcie/rcec.c b/drivers/pci/pcie/rcec.c index 2c5c552994e4..d0bcd141ac9c 100644 --- a/drivers/pci/pcie/rcec.c +++ b/drivers/pci/pcie/rcec.c @@ -32,7 +32,7 @@ static bool rcec_assoc_rciep(struct pci_dev *rcec, struct pci_dev *rciep) /* Same bus, so check bitmap */ for_each_set_bit(devn, &bitmap, 32) - if (devn == rciep->devfn) + if (devn == PCI_SLOT(rciep->devfn)) return true; return false; -- Gitee From 9548fb4429dc1bb5a4376e1f94e013a4f1c66e80 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:42:55 -0800 Subject: [PATCH 2134/2161] ACPI: NUMA: Establish a new drivers/acpi/numa/ directory commit c710fcc5d95a5e0d1648c40c0b101e198bfc3459 upstream. Currently hmat.c lives under an "hmat" directory which does not enhance the description of the file. The initial motivation for giving hmat.c its own directory was to delineate it as mm functionality in contrast to ACPI device driver functionality. As ACPI continues to play an increasing role in conveying memory location and performance topology information to the OS take the opportunity to co-locate these NUMA relevant tables in a combined directory. numa.c is renamed to srat.c and moved to drivers/acpi/numa/ along with hmat.c. 
Intel-SIG: commit c710fcc5d95a ACPI: NUMA: Establish a new drivers/acpi/ numa/directory. Backport for "EFI Specific Purpose Memory Support". Signed-off-by: Dan Williams Reviewed-by: Dave Hansen Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki [ Huaisheng Ye amend commit log ] Signed-off-by: Huaisheng Ye --- drivers/acpi/Kconfig | 9 +-------- drivers/acpi/Makefile | 3 +-- drivers/acpi/{hmat => numa}/Kconfig | 6 ++++++ drivers/acpi/{hmat => numa}/Makefile | 3 ++- drivers/acpi/{hmat => numa}/hmat.c | 0 drivers/acpi/{numa.c => numa/srat.c} | 0 6 files changed, 10 insertions(+), 11 deletions(-) rename drivers/acpi/{hmat => numa}/Kconfig (77%) rename drivers/acpi/{hmat => numa}/Makefile (37%) rename drivers/acpi/{hmat => numa}/hmat.c (100%) rename drivers/acpi/{numa.c => numa/srat.c} (100%) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 0bd025566712..8140180fcdaa 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -320,12 +320,6 @@ config ACPI_THERMAL To compile this driver as a module, choose M here: the module will be called thermal. -config ACPI_NUMA - bool "NUMA support" - depends on NUMA - depends on (X86 || IA64 || ARM64) - default y if IA64 || ARM64 - config ACPI_CUSTOM_DSDT_FILE string "Custom DSDT Table file to include" default "" @@ -474,8 +468,7 @@ config ACPI_REDUCED_HARDWARE_ONLY If you are unsure what to do, do not enable this option. source "drivers/acpi/nfit/Kconfig" -source "drivers/acpi/hmat/Kconfig" - +source "drivers/acpi/numa/Kconfig" source "drivers/acpi/apei/Kconfig" source "drivers/acpi/dptf/Kconfig" source "drivers/acpi/pfru/Kconfig" diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index d129670daea3..87006b9fe017 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -55,7 +55,6 @@ acpi-$(CONFIG_X86) += acpi_cmos_rtc.o acpi-$(CONFIG_X86) += x86/apple.o acpi-$(CONFIG_X86) += x86/utils.o acpi-$(CONFIG_DEBUG_FS) += debugfs.o -acpi-$(CONFIG_ACPI_NUMA) += numa.o acpi-$(CONFIG_ACPI_PROCFS_POWER) += cm_sbs.o acpi-y += acpi_lpat.o acpi-$(CONFIG_ACPI_LPIT) += acpi_lpit.o @@ -81,7 +80,7 @@ obj-$(CONFIG_ACPI_PROCESSOR) += processor.o obj-$(CONFIG_ACPI) += container.o obj-$(CONFIG_ACPI_THERMAL) += thermal.o obj-$(CONFIG_ACPI_NFIT) += nfit/ -obj-$(CONFIG_ACPI_HMAT) += hmat/ +obj-$(CONFIG_ACPI_NUMA) += numa/ obj-$(CONFIG_ACPI) += acpi_memhotplug.o obj-$(CONFIG_ACPI_HOTPLUG_IOAPIC) += ioapic.o obj-$(CONFIG_ACPI_BATTERY) += battery.o diff --git a/drivers/acpi/hmat/Kconfig b/drivers/acpi/numa/Kconfig similarity index 77% rename from drivers/acpi/hmat/Kconfig rename to drivers/acpi/numa/Kconfig index 95a29964dbea..acbd5aa76e40 100644 --- a/drivers/acpi/hmat/Kconfig +++ b/drivers/acpi/numa/Kconfig @@ -1,4 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 +config ACPI_NUMA + bool "NUMA support" + depends on NUMA + depends on (X86 || IA64 || ARM64) + default y if IA64 || ARM64 + config ACPI_HMAT bool "ACPI Heterogeneous Memory Attribute Table Support" depends on ACPI_NUMA diff --git a/drivers/acpi/hmat/Makefile b/drivers/acpi/numa/Makefile similarity index 37% rename from drivers/acpi/hmat/Makefile rename to drivers/acpi/numa/Makefile index 1c20ef36a385..517a6c689a94 100644 --- a/drivers/acpi/hmat/Makefile +++ b/drivers/acpi/numa/Makefile @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_ACPI_HMAT) := hmat.o +obj-$(CONFIG_ACPI_NUMA) += srat.o +obj-$(CONFIG_ACPI_HMAT) += hmat.o diff --git a/drivers/acpi/hmat/hmat.c b/drivers/acpi/numa/hmat.c similarity index 100% rename from drivers/acpi/hmat/hmat.c rename to 
drivers/acpi/numa/hmat.c diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa/srat.c similarity index 100% rename from drivers/acpi/numa.c rename to drivers/acpi/numa/srat.c -- Gitee From 581662163426e1ea6ca654c35f277795154ce3c0 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:00 -0800 Subject: [PATCH 2135/2161] efi: Enumerate EFI_MEMORY_SP commit fe3e5e65c06edb1c56e64e567f053e243142001f upstream. UEFI 2.8 defines an EFI_MEMORY_SP attribute bit to augment the interpretation of the EFI Memory Types as "reserved for a specific purpose". The intent of this bit is to allow the OS to identify precious or scarce memory resources and optionally manage it separately from EfiConventionalMemory. As defined older OSes that do not know about this attribute are permitted to ignore it and the memory will be handled according to the OS default policy for the given memory type. In other words, this "specific purpose" hint is deliberately weaker than EfiReservedMemoryType in that the system continues to operate if the OS takes no action on the attribute. The risk of taking no action is potentially unwanted / unmovable kernel allocations from the designated resource that prevent the full realization of the "specific purpose". For example, consider a system with a high-bandwidth memory pool. Older kernels are permitted to boot and consume that memory as conventional "System-RAM" newer kernels may arrange for that memory to be set aside (soft reserved) by the system administrator for a dedicated high-bandwidth memory aware application to consume. Specifically, this mechanism allows for the elimination of scenarios where platform firmware tries to game OS policy by lying about ACPI SLIT values, i.e. claiming that a precious memory resource has a high distance to trigger the OS to avoid it by default. This reservation hint allows platform-firmware to instead tell the truth about performance characteristics by indicate to OS memory management to put immovable allocations elsewhere. Implement simple detection of the bit for EFI memory table dumps and save the kernel policy for a follow-on change. Intel-SIG: commit fe3e5e65c06e efi: Enumerate EFI_MEMORY_SP. Backport for "EFI Specific Purpose Memory Support". Reviewed-by: Ard Biesheuvel Reviewed-by: Dave Hansen Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki [ Huaisheng Ye amend commit log ] Signed-off-by: Huaisheng Ye --- drivers/firmware/efi/efi.c | 5 +++-- include/linux/efi.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 3a2b60736915..aab8fd400485 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -844,15 +844,16 @@ char * __init efi_md_typeattr_format(char *buf, size_t size, if (attr & ~(EFI_MEMORY_UC | EFI_MEMORY_WC | EFI_MEMORY_WT | EFI_MEMORY_WB | EFI_MEMORY_UCE | EFI_MEMORY_RO | EFI_MEMORY_WP | EFI_MEMORY_RP | EFI_MEMORY_XP | - EFI_MEMORY_NV | + EFI_MEMORY_NV | EFI_MEMORY_SP | EFI_MEMORY_RUNTIME | EFI_MEMORY_MORE_RELIABLE)) snprintf(pos, size, "|attr=0x%016llx]", (unsigned long long)attr); else snprintf(pos, size, - "|%3s|%2s|%2s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]", + "|%3s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]", attr & EFI_MEMORY_RUNTIME ? "RUN" : "", attr & EFI_MEMORY_MORE_RELIABLE ? "MR" : "", + attr & EFI_MEMORY_SP ? "SP" : "", attr & EFI_MEMORY_NV ? "NV" : "", attr & EFI_MEMORY_XP ? "XP" : "", attr & EFI_MEMORY_RP ? 
"RP" : "", diff --git a/include/linux/efi.h b/include/linux/efi.h index 401e98551376..60cc4d9eaa31 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -114,6 +114,7 @@ typedef struct { #define EFI_MEMORY_MORE_RELIABLE \ ((u64)0x0000000000010000ULL) /* higher reliability */ #define EFI_MEMORY_RO ((u64)0x0000000000020000ULL) /* read-only */ +#define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */ #define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ #define EFI_MEMORY_DESCRIPTOR_VERSION 1 -- Gitee From 717117be60c2b5ecf94e3ca47c6102572a6492bd Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:05 -0800 Subject: [PATCH 2136/2161] x86/efi: Push EFI_MEMMAP check into leaf routines commit 6950e31b35fdf4588cbbdec1813091bb02cf8871 upstream. In preparation for adding another EFI_MEMMAP dependent call that needs to occur before e820__memblock_setup() fixup the existing efi calls to check for EFI_MEMMAP internally. This ends up being cleaner than the alternative of checking EFI_MEMMAP multiple times in setup_arch(). Intel-SIG: commit 6950e31b35fd x86/efi: Push EFI_MEMMAP check into leaf routines. Backport for "EFI Specific Purpose Memory Support". Reviewed-by: Dave Hansen Reviewed-by: Ard Biesheuvel Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki [ Huaisheng Ye amend commit log ] Signed-off-by: Huaisheng Ye --- arch/x86/include/asm/efi.h | 9 ++++++++- arch/x86/kernel/setup.c | 18 ++++++++---------- arch/x86/platform/efi/efi.c | 3 +++ arch/x86/platform/efi/quirks.c | 3 +++ drivers/firmware/efi/esrt.c | 3 +++ drivers/firmware/efi/fake_mem.c | 2 +- include/linux/efi.h | 1 - 7 files changed, 26 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 43a82e59c59d..45f853bce869 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -140,7 +140,6 @@ extern void efi_delete_dummy_variable(void); extern void efi_switch_mm(struct mm_struct *mm); extern void efi_recover_from_page_fault(unsigned long phys_addr); extern void efi_free_boot_services(void); -extern void efi_reserve_boot_services(void); struct efi_setup_data { u64 fw_vendor; @@ -244,6 +243,8 @@ static inline bool efi_is_64bit(void) extern bool efi_reboot_required(void); extern bool efi_is_table_address(unsigned long phys_addr); +extern void efi_find_mirror(void); +extern void efi_reserve_boot_services(void); #else static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {} static inline bool efi_reboot_required(void) @@ -254,6 +255,12 @@ static inline bool efi_is_table_address(unsigned long phys_addr) { return false; } +static inline void efi_find_mirror(void) +{ +} +static inline void efi_reserve_boot_services(void) +{ +} #endif /* CONFIG_EFI */ #endif /* _ASM_X86_EFI_H */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c89294189679..1e331a0868ac 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1129,17 +1129,15 @@ void __init setup_arch(char **cmdline_p) reserve_bios_regions(); - if (efi_enabled(EFI_MEMMAP)) { - efi_fake_memmap(); - efi_find_mirror(); - efi_esrt_init(); + efi_fake_memmap(); + efi_find_mirror(); + efi_esrt_init(); - /* - * The EFI specification says that boot service code won't be - * called after ExitBootServices(). This is, in fact, a lie. - */ - efi_reserve_boot_services(); - } + /* + * The EFI specification says that boot service code won't be + * called after ExitBootServices(). 
This is, in fact, a lie. + */ + efi_reserve_boot_services(); /* preallocate 4k for mptable mpc */ e820__memblock_alloc_reserved_mpc_new(); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index ae1c5baf27cd..4659c065d840 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -130,6 +130,9 @@ void __init efi_find_mirror(void) efi_memory_desc_t *md; u64 mirror_size = 0, total_size = 0; + if (!efi_enabled(EFI_MEMMAP)) + return; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index aefe845dff59..f8f0220b6a66 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -318,6 +318,9 @@ void __init efi_reserve_boot_services(void) { efi_memory_desc_t *md; + if (!efi_enabled(EFI_MEMMAP)) + return; + for_each_efi_memory_desc(md) { u64 start = md->phys_addr; u64 size = md->num_pages << EFI_PAGE_SHIFT; diff --git a/drivers/firmware/efi/esrt.c b/drivers/firmware/efi/esrt.c index e8f71a50ba89..9761f303bcb9 100644 --- a/drivers/firmware/efi/esrt.c +++ b/drivers/firmware/efi/esrt.c @@ -246,6 +246,9 @@ void __init efi_esrt_init(void) int rc; phys_addr_t end; + if (!efi_enabled(EFI_MEMMAP)) + return; + pr_debug("esrt-init: loading.\n"); if (!esrt_table_exists()) return; diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c index 9501edc0fcfb..526b45331d96 100644 --- a/drivers/firmware/efi/fake_mem.c +++ b/drivers/firmware/efi/fake_mem.c @@ -44,7 +44,7 @@ void __init efi_fake_memmap(void) void *new_memmap; int i; - if (!nr_fake_mem) + if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem) return; /* count up the number of EFI memory descriptor */ diff --git a/include/linux/efi.h b/include/linux/efi.h index 60cc4d9eaa31..285f7dbeae3a 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1101,7 +1101,6 @@ static inline void efi_enter_virtual_mode (void) {} extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size, bool nonblocking); -extern void efi_find_mirror(void); #else static inline efi_status_t efi_query_variable_store(u32 attributes, -- Gitee From 2a8d8ad8bd8cec6ba8273ae28568a942a8a57c03 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:11 -0800 Subject: [PATCH 2137/2161] efi: Common enable/disable infrastructure for EFI soft reservation commit b617c5266eedbef2ccbb90931bb9175faa4ae0bc upstream. UEFI 2.8 defines an EFI_MEMORY_SP attribute bit to augment the interpretation of the EFI Memory Types as "reserved for a specific purpose". The proposed Linux behavior for specific purpose memory is that it is reserved for direct-access (device-dax) by default and not available for any kernel usage, not even as an OOM fallback. Later, through udev scripts or another init mechanism, these device-dax claimed ranges can be reconfigured and hot-added to the available System-RAM with a unique node identifier. This device-dax management scheme implements "soft" in the "soft reserved" designation by allowing some or all of the reservation to be recovered as typical memory. This policy can be disabled at compile-time with CONFIG_EFI_SOFT_RESERVE=n, or runtime with efi=nosoftreserve. As for this patch, define the common helpers to determine if the EFI_MEMORY_SP attribute should be honored. 
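As a sketch of how such a helper is consumed (assumed shape, mirroring the libstub hunks later in this patch; the helper name md_usable_for_alloc() is hypothetical): allocation paths keep skipping non-conventional memory, and additionally skip conventional ranges that carry EFI_MEMORY_SP unless soft reservation was disabled with efi=nosoftreserve.

        /*
         * Sketch only: a descriptor is eligible for general-purpose
         * allocation when it is conventional memory and either carries no
         * EFI_MEMORY_SP attribute or soft reservation has been disabled.
         */
        static bool md_usable_for_alloc(const efi_memory_desc_t *md)
        {
                if (md->type != EFI_CONVENTIONAL_MEMORY)
                        return false;

                if (efi_soft_reserve_enabled() && (md->attribute & EFI_MEMORY_SP))
                        return false;

                return true;
        }
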
The determination needs to be made early to prevent the kernel from being loaded into soft-reserved memory, or otherwise allowing early allocations to land there. Follow-on changes are needed per architecture to leverage these helpers in their respective mem-init paths. Intel-SIG: commit b617c5266eed efi: Common enable/disable infrastructure for EFI soft reservation. Backport for "EFI Specific Purpose Memory Support". Reviewed-by: Ard Biesheuvel Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki [ Huaisheng Ye amend commit log ] Signed-off-by: Huaisheng Ye --- .../admin-guide/kernel-parameters.txt | 9 +++++++- drivers/firmware/efi/Kconfig | 21 +++++++++++++++++++ drivers/firmware/efi/efi.c | 8 +++++++ .../firmware/efi/libstub/efi-stub-helper.c | 19 +++++++++++++++++ include/linux/efi.h | 14 +++++++++++++ 5 files changed, 70 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e909389d411f..7f32edf5b29e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1175,7 +1175,8 @@ Format: {"off" | "on" | "skip[mbr]"} efi= [EFI] - Format: { "old_map", "nochunk", "noruntime", "debug" } + Format: { "old_map", "nochunk", "noruntime", "debug", + "nosoftreserve" } old_map [X86-64]: switch to the old ioremap-based EFI runtime services mapping. 32-bit still uses this one by default. @@ -1184,6 +1185,12 @@ firmware implementations. noruntime : disable EFI runtime services support debug: enable misc debug output + nosoftreserve: The EFI_MEMORY_SP (Specific Purpose) + attribute may cause the kernel to reserve the + memory range for a memory mapping driver to + claim. Specify efi=nosoftreserve to disable this + reservation and treat the memory by its base type + (i.e. EFI_CONVENTIONAL_MEMORY / "System RAM"). efi_no_storage_paranoia [EFI; X86] Using this parameter you can use more than 50% of diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index 3222645c95b3..4a3355c75dbd 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -75,6 +75,27 @@ config EFI_MAX_FAKE_MEM Ranges can be set up to this value using comma-separated list. The default value is 8. +config EFI_SOFT_RESERVE + bool "Reserve EFI Specific Purpose Memory" + depends on EFI && EFI_STUB && ACPI_HMAT + default ACPI_HMAT + help + On systems that have mixed performance classes of memory EFI + may indicate specific purpose memory with an attribute (See + EFI_MEMORY_SP in UEFI 2.8). A memory range tagged with this + attribute may have unique performance characteristics compared + to the system's general purpose "System RAM" pool. On the + expectation that such memory has application specific usage, + and its base EFI memory type is "conventional" answer Y to + arrange for the kernel to reserve it as a "Soft Reserved" + resource, and set aside for direct-access (device-dax) by + default. The memory range can later be optionally assigned to + the page allocator by system administrator policy via the + device-dax kmem facility. Say N to have the kernel treat this + memory as "System RAM" by default. + + If unsure, say Y. 
+ config EFI_PARAMS_FROM_FDT bool help diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index aab8fd400485..acde8985862a 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -81,6 +81,11 @@ bool efi_runtime_disabled(void) return disable_runtime; } +bool __pure __efi_soft_reserve_enabled(void) +{ + return !efi_enabled(EFI_MEM_NO_SOFT_RESERVE); +} + static int __init parse_efi_cmdline(char *str) { if (!str) { @@ -94,6 +99,9 @@ static int __init parse_efi_cmdline(char *str) if (parse_option_str(str, "noruntime")) disable_runtime = true; + if (parse_option_str(str, "nosoftreserve")) + set_bit(EFI_MEM_NO_SOFT_RESERVE, &efi.flags); + return 0; } early_param("efi", parse_efi_cmdline); diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 35dbc2791c97..e02579907f2e 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -32,6 +32,7 @@ static unsigned long __chunk_size = EFI_READ_CHUNK_SIZE; static int __section(.data) __nokaslr; static int __section(.data) __quiet; static int __section(.data) __novamap; +static bool __section(.data) efi_nosoftreserve; int __pure nokaslr(void) { @@ -45,6 +46,10 @@ int __pure novamap(void) { return __novamap; } +bool __pure __efi_soft_reserve_enabled(void) +{ + return !efi_nosoftreserve; +} #define EFI_MMAP_NR_SLACK_SLOTS 8 @@ -211,6 +216,10 @@ efi_status_t efi_high_alloc(efi_system_table_t *sys_table_arg, if (desc->type != EFI_CONVENTIONAL_MEMORY) continue; + if (efi_soft_reserve_enabled() && + (desc->attribute & EFI_MEMORY_SP)) + continue; + if (desc->num_pages < nr_pages) continue; @@ -305,6 +314,10 @@ efi_status_t efi_low_alloc_above(efi_system_table_t *sys_table_arg, if (desc->type != EFI_CONVENTIONAL_MEMORY) continue; + if (efi_soft_reserve_enabled() && + (desc->attribute & EFI_MEMORY_SP)) + continue; + if (desc->num_pages < nr_pages) continue; @@ -484,6 +497,12 @@ efi_status_t efi_parse_options(char const *cmdline) __novamap = 1; } + if (IS_ENABLED(CONFIG_EFI_SOFT_RESERVE) && + !strncmp(str, "nosoftreserve", 7)) { + str += strlen("nosoftreserve"); + efi_nosoftreserve = 1; + } + /* Group words together, delimited by "," */ while (*str && *str != ' ' && *str != ',') str++; diff --git a/include/linux/efi.h b/include/linux/efi.h index 285f7dbeae3a..96090fb1df0c 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1258,6 +1258,7 @@ extern int __init efi_setup_pcdp_console(char *); #define EFI_DBG 8 /* Print additional debug info at runtime */ #define EFI_NX_PE_DATA 9 /* Can runtime data regions be mapped non-executable? */ #define EFI_MEM_ATTR 10 /* Did firmware publish an EFI_MEMORY_ATTRIBUTES table? */ +#define EFI_MEM_NO_SOFT_RESERVE 11 /* Is the kernel configured to ignore soft reservations? 
*/ #ifdef CONFIG_EFI /* @@ -1268,6 +1269,14 @@ static inline bool efi_enabled(int feature) return test_bit(feature, &efi.flags) != 0; } extern void efi_reboot(enum reboot_mode reboot_mode, const char *__unused); + +bool __pure __efi_soft_reserve_enabled(void); + +static inline bool __pure efi_soft_reserve_enabled(void) +{ + return IS_ENABLED(CONFIG_EFI_SOFT_RESERVE) + && __efi_soft_reserve_enabled(); +} #else static inline bool efi_enabled(int feature) { @@ -1281,6 +1290,11 @@ efi_capsule_pending(int *reset_type) { return false; } + +static inline bool efi_soft_reserve_enabled(void) +{ + return false; +} #endif extern int efi_status_to_err(efi_status_t status); -- Gitee From b452f27fe394a3a6da64540803daf61051f320f6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:16 -0800 Subject: [PATCH 2138/2161] x86/efi: EFI soft reservation to E820 enumeration commit 262b45ae3ab4bf8e2caf1fcfd0d8307897519630 upstream. UEFI 2.8 defines an EFI_MEMORY_SP attribute bit to augment the interpretation of the EFI Memory Types as "reserved for a specific purpose". The proposed Linux behavior for specific purpose memory is that it is reserved for direct-access (device-dax) by default and not available for any kernel usage, not even as an OOM fallback. Later, through udev scripts or another init mechanism, these device-dax claimed ranges can be reconfigured and hot-added to the available System-RAM with a unique node identifier. This device-dax management scheme implements "soft" in the "soft reserved" designation by allowing some or all of the reservation to be recovered as typical memory. This policy can be disabled at compile-time with CONFIG_EFI_SOFT_RESERVE=n, or runtime with efi=nosoftreserve. This patch introduces 2 new concepts at once given the entanglement between early boot enumeration relative to memory that can optionally be reserved from the kernel page allocator by default. The new concepts are: - E820_TYPE_SOFT_RESERVED: Upon detecting the EFI_MEMORY_SP attribute on EFI_CONVENTIONAL memory, update the E820 map with this new type. Only perform this classification if the CONFIG_EFI_SOFT_RESERVE=y policy is enabled, otherwise treat it as typical ram. - IORES_DESC_SOFT_RESERVED: Add a new I/O resource descriptor for a device driver to search iomem resources for application specific memory. Teach the iomem code to identify such ranges as "Soft Reserved". Note that the comment for do_add_efi_memmap() needed refreshing since it seemed to imply that the efi map might overflow the e820 table, but that is not an issue as of commit 7b6e4ba3cb1f "x86/boot/e820: Clean up the E820_X_MAX definition" that removed the 128 entry limit for e820__range_add(). A follow-on change integrates parsing of the ACPI HMAT to identify the node and sub-range boundaries of EFI_MEMORY_SP designated memory. For now, just identify and reserve memory of this type. Intel-SIG: commit 262b45ae3ab4 x86/efi: EFI soft reservation to E820 enumeration. Backport for "EFI Specific Purpose Memory Support". Acked-by: Ard Biesheuvel Reported-by: kbuild test robot Reviewed-by: Dave Hansen Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. 
Wysocki [ Huaisheng Ye amend commit log ] Signed-off-by: Huaisheng Ye --- arch/x86/boot/compressed/eboot.c | 6 +++- arch/x86/boot/compressed/kaslr.c | 4 +++ arch/x86/include/asm/e820/types.h | 8 +++++ arch/x86/kernel/e820.c | 12 ++++++-- arch/x86/platform/efi/efi.c | 49 ++++++++++++++++++++++++++++--- include/linux/ioport.h | 1 + 6 files changed, 73 insertions(+), 7 deletions(-) diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 82bc60c8acb2..f2db8c5e4b06 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -554,7 +554,11 @@ setup_e820(struct boot_params *params, struct setup_data *e820ext, u32 e820ext_s case EFI_BOOT_SERVICES_CODE: case EFI_BOOT_SERVICES_DATA: case EFI_CONVENTIONAL_MEMORY: - e820_type = E820_TYPE_RAM; + if (efi_soft_reserve_enabled() && + (d->attribute & EFI_MEMORY_SP)) + e820_type = E820_TYPE_SOFT_RESERVED; + else + e820_type = E820_TYPE_RAM; break; case EFI_ACPI_MEMORY_NVS: diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 2e53c056ba20..ff6fa81949cd 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -760,6 +760,10 @@ process_efi_entries(unsigned long minimum, unsigned long image_size) if (md->type != EFI_CONVENTIONAL_MEMORY) continue; + if (efi_soft_reserve_enabled() && + (md->attribute & EFI_MEMORY_SP)) + continue; + if (efi_mirror_found && !(md->attribute & EFI_MEMORY_MORE_RELIABLE)) continue; diff --git a/arch/x86/include/asm/e820/types.h b/arch/x86/include/asm/e820/types.h index c3aa4b5e49e2..314f75d886d0 100644 --- a/arch/x86/include/asm/e820/types.h +++ b/arch/x86/include/asm/e820/types.h @@ -28,6 +28,14 @@ enum e820_type { */ E820_TYPE_PRAM = 12, + /* + * Special-purpose memory is indicated to the system via the + * EFI_MEMORY_SP attribute. Define an e820 translation of this + * memory type for the purpose of reserving this range and + * marking it with the IORES_DESC_SOFT_RESERVED designation. 
+ */ + E820_TYPE_SOFT_RESERVED = 0xefffffff, + /* * Reserved RAM used by the kernel itself if * CONFIG_INTEL_TXT=y is enabled, memory of this type diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7da2bcd2b8eb..9976106b57ec 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -190,6 +190,7 @@ static void __init e820_print_type(enum e820_type type) case E820_TYPE_RAM: /* Fall through: */ case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break; case E820_TYPE_RESERVED: pr_cont("reserved"); break; + case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break; case E820_TYPE_ACPI: pr_cont("ACPI data"); break; case E820_TYPE_NVS: pr_cont("ACPI NVS"); break; case E820_TYPE_UNUSABLE: pr_cont("unusable"); break; @@ -1037,6 +1038,7 @@ static const char *__init e820_type_to_string(struct e820_entry *entry) case E820_TYPE_PRAM: return "Persistent Memory (legacy)"; case E820_TYPE_PMEM: return "Persistent Memory"; case E820_TYPE_RESERVED: return "Reserved"; + case E820_TYPE_SOFT_RESERVED: return "Soft Reserved"; default: return "Unknown E820 type"; } } @@ -1052,6 +1054,7 @@ static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) case E820_TYPE_PRAM: /* Fall-through: */ case E820_TYPE_PMEM: /* Fall-through: */ case E820_TYPE_RESERVED: /* Fall-through: */ + case E820_TYPE_SOFT_RESERVED: /* Fall-through: */ default: return IORESOURCE_MEM; } } @@ -1064,6 +1067,7 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY; case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; case E820_TYPE_RESERVED: return IORES_DESC_RESERVED; + case E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED; case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: /* Fall-through: */ case E820_TYPE_UNUSABLE: /* Fall-through: */ @@ -1078,11 +1082,12 @@ static bool __init do_mark_busy(enum e820_type type, struct resource *res) return true; /* - * Treat persistent memory like device memory, i.e. reserve it - * for exclusive use of a driver + * Treat persistent memory and other special memory ranges like + * device memory, i.e. reserve it for exclusive use of a driver */ switch (type) { case E820_TYPE_RESERVED: + case E820_TYPE_SOFT_RESERVED: case E820_TYPE_PRAM: case E820_TYPE_PMEM: return false; @@ -1285,6 +1290,9 @@ void __init e820__memblock_setup(void) if (end != (resource_size_t)end) continue; + if (entry->type == E820_TYPE_SOFT_RESERVED) + memblock_reserve(entry->addr, entry->size); + if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) continue; diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 4659c065d840..9ea196fa5aaf 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -150,14 +150,18 @@ void __init efi_find_mirror(void) /* * Tell the kernel about the EFI memory map. This might include - * more than the max 128 entries that can fit in the e820 legacy - * (zeropage) memory map. + * more than the max 128 entries that can fit in the passed in e820 + * legacy (zeropage) memory map, but the kernel's e820 table can hold + * E820_MAX_ENTRIES. 
*/ static void __init do_add_efi_memmap(void) { efi_memory_desc_t *md; + if (!efi_enabled(EFI_MEMMAP)) + return; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; @@ -169,7 +173,10 @@ static void __init do_add_efi_memmap(void) case EFI_BOOT_SERVICES_CODE: case EFI_BOOT_SERVICES_DATA: case EFI_CONVENTIONAL_MEMORY: - if (md->attribute & EFI_MEMORY_WB) + if (efi_soft_reserve_enabled() + && (md->attribute & EFI_MEMORY_SP)) + e820_type = E820_TYPE_SOFT_RESERVED; + else if (md->attribute & EFI_MEMORY_WB) e820_type = E820_TYPE_RAM; else e820_type = E820_TYPE_RESERVED; @@ -195,11 +202,36 @@ static void __init do_add_efi_memmap(void) e820_type = E820_TYPE_RESERVED; break; } + e820__range_add(start, size, e820_type); } e820__update_table(e820_table); } +/* + * Given add_efi_memmap defaults to 0 and there there is no alternative + * e820 mechanism for soft-reserved memory, import the full EFI memory + * map if soft reservations are present and enabled. Otherwise, the + * mechanism to disable the kernel's consideration of EFI_MEMORY_SP is + * the efi=nosoftreserve option. + */ +static bool do_efi_soft_reserve(void) +{ + efi_memory_desc_t *md; + + if (!efi_enabled(EFI_MEMMAP)) + return false; + + if (!efi_soft_reserve_enabled()) + return false; + + for_each_efi_memory_desc(md) + if (md->type == EFI_CONVENTIONAL_MEMORY && + (md->attribute & EFI_MEMORY_SP)) + return true; + return false; +} + int __init efi_memblock_x86_reserve_range(void) { struct efi_info *e = &boot_params.efi_info; @@ -229,7 +261,7 @@ int __init efi_memblock_x86_reserve_range(void) if (rv) return rv; - if (add_efi_memmap) + if (add_efi_memmap || do_efi_soft_reserve()) do_add_efi_memmap(); WARN(efi.memmap.desc_version != 1, @@ -784,6 +816,15 @@ static bool should_map_region(efi_memory_desc_t *md) if (IS_ENABLED(CONFIG_X86_32)) return false; + /* + * EFI specific purpose memory may be reserved by default + * depending on kernel config and boot options. + */ + if (md->type == EFI_CONVENTIONAL_MEMORY && + efi_soft_reserve_enabled() && + (md->attribute & EFI_MEMORY_SP)) + return false; + /* * Map all of RAM so that we can access arguments in the 1:1 * mapping when making EFI runtime calls. diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 982f87a4f52f..06f5c7bb405b 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -141,6 +141,7 @@ enum { IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5, IORES_DESC_DEVICE_PRIVATE_MEMORY = 6, IORES_DESC_RESERVED = 7, + IORES_DESC_SOFT_RESERVED = 8, }; /* -- Gitee From 4666963fcc240cf61c7149c639825d3206694d9b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:21 -0800 Subject: [PATCH 2139/2161] arm/efi: EFI soft reservation to memblock commit 16993c0f0a43213e23666ea40e9163887f593ac7 upstream. UEFI 2.8 defines an EFI_MEMORY_SP attribute bit to augment the interpretation of the EFI Memory Types as "reserved for a specific purpose". The proposed Linux behavior for specific purpose memory is that it is reserved for direct-access (device-dax) by default and not available for any kernel usage, not even as an OOM fallback. Later, through udev scripts or another init mechanism, these device-dax claimed ranges can be reconfigured and hot-added to the available System-RAM with a unique node identifier. This device-dax management scheme implements "soft" in the "soft reserved" designation by allowing some or all of the reservation to be recovered as typical memory. 
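For illustration only (a sketch, not code from this patch; the helper name is made up), once a range has been published with the IORES_DESC_SOFT_RESERVED descriptor introduced by this series, a consumer can probe for it before claiming the memory, using the same region_intersects() test the HMAT patch later in the series performs:

	#include <linux/ioport.h>
	#include <linux/mm.h>

	/* true if the range overlaps only iomem resources marked IORES_DESC_SOFT_RESERVED */
	static bool range_is_soft_reserved(resource_size_t start, resource_size_t size)
	{
		return region_intersects(start, size, IORESOURCE_MEM,
					 IORES_DESC_SOFT_RESERVED) == REGION_INTERSECTS;
	}
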
This policy can be disabled at compile-time with CONFIG_EFI_SOFT_RESERVE=n, or runtime with efi=nosoftreserve. For this patch, update the ARM paths that consider EFI_CONVENTIONAL_MEMORY to optionally take the EFI_MEMORY_SP attribute into account as a reservation indicator. Publish the soft reservation as IORES_DESC_SOFT_RESERVED memory, similar to x86. (Based on an original patch by Ard) Intel-SIG: commit 16993c0f0a43 arm/efi: EFI soft reservation to memblock. Backport for "EFI Specific Purpose Memory Support". Reviewed-by: Ard Biesheuvel Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki [ Huaisheng Ye amend commit log ] Signed-off-by: Huaisheng Ye --- arch/arm64/mm/mmu.c | 2 ++ drivers/firmware/efi/arm-init.c | 9 +++++++++ drivers/firmware/efi/arm-runtime.c | 24 +++++++++++++++++++++++ drivers/firmware/efi/libstub/arm32-stub.c | 5 +++++ drivers/firmware/efi/libstub/random.c | 4 ++++ 5 files changed, 44 insertions(+) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 99bc0289ab2b..05fbeb896502 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1061,6 +1061,8 @@ int arch_add_memory(int nid, u64 start, u64 size, __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), size, PAGE_KERNEL, __pgd_pgtable_alloc, flags); + memblock_clear_nomap(start, size); + return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, restrictions); } diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index 311cd349a862..904fa09e6a6b 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -163,6 +163,15 @@ static __init int is_usable_memory(efi_memory_desc_t *md) case EFI_BOOT_SERVICES_DATA: case EFI_CONVENTIONAL_MEMORY: case EFI_PERSISTENT_MEMORY: + /* + * Special purpose memory is 'soft reserved', which means it + * is set aside initially, but can be hotplugged back in or + * be assigned to the dax driver after boot. + */ + if (efi_soft_reserve_enabled() && + (md->attribute & EFI_MEMORY_SP)) + return false; + /* * According to the spec, these regions are no longer reserved * after calling ExitBootServices(). 
However, we can only use diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index e2ac5fa5531b..899b803842bb 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -121,6 +121,30 @@ static int __init arm_enable_runtime_services(void) return 0; } + if (efi_soft_reserve_enabled()) { + efi_memory_desc_t *md; + + for_each_efi_memory_desc(md) { + int md_size = md->num_pages << EFI_PAGE_SHIFT; + struct resource *res; + + if (!(md->attribute & EFI_MEMORY_SP)) + continue; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (WARN_ON(!res)) + break; + + res->start = md->phys_addr; + res->end = md->phys_addr + md_size - 1; + res->name = "Soft Reserved"; + res->flags = IORESOURCE_MEM; + res->desc = IORES_DESC_SOFT_RESERVED; + + insert_resource(&iomem_resource, res); + } + } + if (efi_runtime_disabled()) { pr_info("EFI runtime services will be disabled.\n"); return 0; diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c index 41213bf5fcf5..4566640de650 100644 --- a/drivers/firmware/efi/libstub/arm32-stub.c +++ b/drivers/firmware/efi/libstub/arm32-stub.c @@ -146,6 +146,11 @@ static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg, continue; case EFI_CONVENTIONAL_MEMORY: + /* Skip soft reserved conventional memory */ + if (efi_soft_reserve_enabled() && + (desc->attribute & EFI_MEMORY_SP)) + continue; + /* * Reserve the intersection between this entry and the * region. diff --git a/drivers/firmware/efi/libstub/random.c b/drivers/firmware/efi/libstub/random.c index b4b1d1dcb5fd..6c188695e730 100644 --- a/drivers/firmware/efi/libstub/random.c +++ b/drivers/firmware/efi/libstub/random.c @@ -46,6 +46,10 @@ static unsigned long get_entry_num_slots(efi_memory_desc_t *md, if (md->type != EFI_CONVENTIONAL_MEMORY) return 0; + if (efi_soft_reserve_enabled() && + (md->attribute & EFI_MEMORY_SP)) + return 0; + region_end = min((u64)ULONG_MAX, md->phys_addr + md->num_pages*EFI_PAGE_SIZE - 1); first_slot = round_up(md->phys_addr, align); -- Gitee From e7dccb8ea8a9300dbcc87940c3527f8678e11579 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:26 -0800 Subject: [PATCH 2140/2161] x86/efi: Add efi_fake_mem support for EFI_MEMORY_SP commit 199c8471761273b7e287914cee968ddf21dfbfe0 upstream. Given that EFI_MEMORY_SP is platform BIOS policy decision for marking memory ranges as "reserved for a specific purpose" there will inevitably be scenarios where the BIOS omits the attribute in situations where it is desired. Unlike other attributes if the OS wants to reserve this memory from the kernel the reservation needs to happen early in init. So early, in fact, that it needs to happen before e820__memblock_setup() which is a pre-requisite for efi_fake_memmap() that wants to allocate memory for the updated table. Introduce an x86 specific efi_fake_memmap_early() that can search for attempts to set EFI_MEMORY_SP via efi_fake_mem and update the e820 table accordingly. The KASLR code that scans the command line looking for user-directed memory reservations also needs to be updated to consider "efi_fake_mem=nn@ss:0x40000" requests. Intel-SIG: commit 199c84717612 x86/efi: Add efi_fake_mem support for EFI_MEMORY_SP. Backport for "EFI Specific Purpose Memory Support". Acked-by: Ard Biesheuvel Reviewed-by: Dave Hansen Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. 
Wysocki [ Huaisheng Ye amend commit log ] Signed-off-by: Huaisheng Ye --- .../admin-guide/kernel-parameters.txt | 10 ++- arch/x86/boot/compressed/kaslr.c | 42 +++++++++-- arch/x86/include/asm/efi.h | 8 +++ arch/x86/platform/efi/efi.c | 2 + drivers/firmware/efi/Makefile | 5 +- drivers/firmware/efi/fake_mem.c | 24 +++---- drivers/firmware/efi/fake_mem.h | 10 +++ drivers/firmware/efi/x86_fake_mem.c | 69 +++++++++++++++++++ 8 files changed, 147 insertions(+), 23 deletions(-) create mode 100644 drivers/firmware/efi/fake_mem.h create mode 100644 drivers/firmware/efi/x86_fake_mem.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 7f32edf5b29e..6314798b3df6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1203,15 +1203,21 @@ updating original EFI memory map. Region of memory which aa attribute is added to is from ss to ss+nn. + If efi_fake_mem=2G@4G:0x10000,2G@0x10a0000000:0x10000 is specified, EFI_MEMORY_MORE_RELIABLE(0x10000) attribute is added to range 0x100000000-0x180000000 and 0x10a0000000-0x1120000000. + If efi_fake_mem=8G@9G:0x40000 is specified, the + EFI_MEMORY_SP(0x40000) attribute is added to + range 0x240000000-0x43fffffff. + Using this parameter you can do debugging of EFI memmap - related feature. For example, you can do debugging of + related features. For example, you can do debugging of Address Range Mirroring feature even if your box - doesn't support it. + doesn't support it, or mark specific memory as + "soft reserved". efivar_ssdt= [EFI; X86] Name of an EFI variable that contains an SSDT that is to be dynamically loaded by Linux. If there are diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index ff6fa81949cd..da0eedd5635d 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -132,8 +132,14 @@ char *skip_spaces(const char *str) #include "../../../../lib/ctype.c" #include "../../../../lib/cmdline.c" +enum parse_mode { + PARSE_MEMMAP, + PARSE_EFI, +}; + static int -parse_memmap(char *p, unsigned long long *start, unsigned long long *size) +parse_memmap(char *p, unsigned long long *start, unsigned long long *size, + enum parse_mode mode) { char *oldp; @@ -156,8 +162,29 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size) *start = memparse(p + 1, &p); return 0; case '@': - /* memmap=nn@ss specifies usable region, should be skipped */ - *size = 0; + if (mode == PARSE_MEMMAP) { + /* + * memmap=nn@ss specifies usable region, should + * be skipped + */ + *size = 0; + } else { + unsigned long long flags; + + /* + * efi_fake_mem=nn@ss:attr the attr specifies + * flags that might imply a soft-reservation. 
+ */ + *start = memparse(p + 1, &p); + if (p && *p == ':') { + p++; + if (kstrtoull(p, 0, &flags) < 0) + *size = 0; + else if (flags & EFI_MEMORY_SP) + return 0; + } + *size = 0; + } /* Fall through */ default: /* @@ -172,7 +199,7 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size) return -EINVAL; } -static void mem_avoid_memmap(char *str) +static void mem_avoid_memmap(enum parse_mode mode, char *str) { static int i; @@ -187,7 +214,7 @@ static void mem_avoid_memmap(char *str) if (k) *k++ = 0; - rc = parse_memmap(str, &start, &size); + rc = parse_memmap(str, &start, &size, mode); if (rc < 0) break; str = k; @@ -238,7 +265,6 @@ static void parse_gb_huge_pages(char *param, char *val) } } - static void handle_mem_options(void) { char *args = (char *)get_cmd_line_ptr(); @@ -271,7 +297,7 @@ static void handle_mem_options(void) } if (!strcmp(param, "memmap")) { - mem_avoid_memmap(val); + mem_avoid_memmap(PARSE_MEMMAP, val); } else if (strstr(param, "hugepages")) { parse_gb_huge_pages(param, val); } else if (!strcmp(param, "mem")) { @@ -284,6 +310,8 @@ static void handle_mem_options(void) goto out; mem_limit = mem_size; + } else if (!strcmp(param, "efi_fake_mem")) { + mem_avoid_memmap(PARSE_EFI, val); } } diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 45f853bce869..d028e9acdf1c 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -263,4 +263,12 @@ static inline void efi_reserve_boot_services(void) } #endif /* CONFIG_EFI */ +#ifdef CONFIG_EFI_FAKE_MEMMAP +extern void __init efi_fake_memmap_early(void); +#else +static inline void efi_fake_memmap_early(void) +{ +} +#endif + #endif /* _ASM_X86_EFI_H */ diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 9ea196fa5aaf..89ae6adfc4c4 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -264,6 +264,8 @@ int __init efi_memblock_x86_reserve_range(void) if (add_efi_memmap || do_efi_soft_reserve()) do_add_efi_memmap(); + efi_fake_memmap_early(); + WARN(efi.memmap.desc_version != 1, "Unexpected EFI_MEMORY_DESCRIPTOR version %ld", efi.memmap.desc_version); diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index 4ac2de4dfa72..554d795270d9 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -20,13 +20,16 @@ obj-$(CONFIG_UEFI_CPER) += cper.o obj-$(CONFIG_EFI_RUNTIME_MAP) += runtime-map.o obj-$(CONFIG_EFI_RUNTIME_WRAPPERS) += runtime-wrappers.o obj-$(CONFIG_EFI_STUB) += libstub/ -obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_mem.o +obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_map.o obj-$(CONFIG_EFI_BOOTLOADER_CONTROL) += efibc.o obj-$(CONFIG_EFI_TEST) += test/ obj-$(CONFIG_EFI_DEV_PATH_PARSER) += dev-path-parser.o obj-$(CONFIG_APPLE_PROPERTIES) += apple-properties.o obj-$(CONFIG_EFI_RCI2_TABLE) += rci2-table.o +fake_map-y += fake_mem.o +fake_map-$(CONFIG_X86) += x86_fake_mem.o + arm-obj-$(CONFIG_EFI) := arm-init.o arm-runtime.o obj-$(CONFIG_ARM) += $(arm-obj-y) obj-$(CONFIG_ARM64) += $(arm-obj-y) diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c index 526b45331d96..bb9fc70d0cfa 100644 --- a/drivers/firmware/efi/fake_mem.c +++ b/drivers/firmware/efi/fake_mem.c @@ -17,12 +17,10 @@ #include #include #include -#include +#include "fake_mem.h" -#define EFI_MAX_FAKEMEM CONFIG_EFI_MAX_FAKE_MEM - -static struct efi_mem_range fake_mems[EFI_MAX_FAKEMEM]; -static int nr_fake_mem; +struct efi_mem_range efi_fake_mems[EFI_MAX_FAKEMEM]; +int nr_fake_mem; static int __init 
cmp_fake_mem(const void *x1, const void *x2) { @@ -50,7 +48,7 @@ void __init efi_fake_memmap(void) /* count up the number of EFI memory descriptor */ for (i = 0; i < nr_fake_mem; i++) { for_each_efi_memory_desc(md) { - struct range *r = &fake_mems[i].range; + struct range *r = &efi_fake_mems[i].range; new_nr_map += efi_memmap_split_count(md, r); } @@ -70,7 +68,7 @@ void __init efi_fake_memmap(void) } for (i = 0; i < nr_fake_mem; i++) - efi_memmap_insert(&efi.memmap, new_memmap, &fake_mems[i]); + efi_memmap_insert(&efi.memmap, new_memmap, &efi_fake_mems[i]); /* swap into new EFI memmap */ early_memunmap(new_memmap, efi.memmap.desc_size * new_nr_map); @@ -104,22 +102,22 @@ static int __init setup_fake_mem(char *p) if (nr_fake_mem >= EFI_MAX_FAKEMEM) break; - fake_mems[nr_fake_mem].range.start = start; - fake_mems[nr_fake_mem].range.end = start + mem_size - 1; - fake_mems[nr_fake_mem].attribute = attribute; + efi_fake_mems[nr_fake_mem].range.start = start; + efi_fake_mems[nr_fake_mem].range.end = start + mem_size - 1; + efi_fake_mems[nr_fake_mem].attribute = attribute; nr_fake_mem++; if (*p == ',') p++; } - sort(fake_mems, nr_fake_mem, sizeof(struct efi_mem_range), + sort(efi_fake_mems, nr_fake_mem, sizeof(struct efi_mem_range), cmp_fake_mem, NULL); for (i = 0; i < nr_fake_mem; i++) pr_info("efi_fake_mem: add attr=0x%016llx to [mem 0x%016llx-0x%016llx]", - fake_mems[i].attribute, fake_mems[i].range.start, - fake_mems[i].range.end); + efi_fake_mems[i].attribute, efi_fake_mems[i].range.start, + efi_fake_mems[i].range.end); return *p == '\0' ? 0 : -EINVAL; } diff --git a/drivers/firmware/efi/fake_mem.h b/drivers/firmware/efi/fake_mem.h new file mode 100644 index 000000000000..d52791af4b18 --- /dev/null +++ b/drivers/firmware/efi/fake_mem.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __EFI_FAKE_MEM_H__ +#define __EFI_FAKE_MEM_H__ +#include + +#define EFI_MAX_FAKEMEM CONFIG_EFI_MAX_FAKE_MEM + +extern struct efi_mem_range efi_fake_mems[EFI_MAX_FAKEMEM]; +extern int nr_fake_mem; +#endif /* __EFI_FAKE_MEM_H__ */ diff --git a/drivers/firmware/efi/x86_fake_mem.c b/drivers/firmware/efi/x86_fake_mem.c new file mode 100644 index 000000000000..e5d6d5a1b240 --- /dev/null +++ b/drivers/firmware/efi/x86_fake_mem.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights reserved. */ +#include +#include +#include "fake_mem.h" + +void __init efi_fake_memmap_early(void) +{ + int i; + + /* + * The late efi_fake_mem() call can handle all requests if + * EFI_MEMORY_SP support is disabled. + */ + if (!efi_soft_reserve_enabled()) + return; + + if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem) + return; + + /* + * Given that efi_fake_memmap() needs to perform memblock + * allocations it needs to run after e820__memblock_setup(). + * However, if efi_fake_mem specifies EFI_MEMORY_SP for a given + * address range that potentially needs to mark the memory as + * reserved prior to e820__memblock_setup(). Update e820 + * directly if EFI_MEMORY_SP is specified for an + * EFI_CONVENTIONAL_MEMORY descriptor. 
+ */ + for (i = 0; i < nr_fake_mem; i++) { + struct efi_mem_range *mem = &efi_fake_mems[i]; + efi_memory_desc_t *md; + u64 m_start, m_end; + + if ((mem->attribute & EFI_MEMORY_SP) == 0) + continue; + + m_start = mem->range.start; + m_end = mem->range.end; + for_each_efi_memory_desc(md) { + u64 start, end; + + if (md->type != EFI_CONVENTIONAL_MEMORY) + continue; + + start = md->phys_addr; + end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1; + + if (m_start <= end && m_end >= start) + /* fake range overlaps descriptor */; + else + continue; + + /* + * Trim the boundary of the e820 update to the + * descriptor in case the fake range overlaps + * !EFI_CONVENTIONAL_MEMORY + */ + start = max(start, m_start); + end = min(end, m_end); + + if (end <= start) + continue; + e820__range_update(start, end - start + 1, E820_TYPE_RAM, + E820_TYPE_SOFT_RESERVED); + e820__update_table(e820_table); + } + } +} -- Gitee From 7a67a1650301c3788a6dff579cfdcb6b0c85e53a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:31 -0800 Subject: [PATCH 2141/2161] lib: Uplevel the pmem "region" ida to a global allocator commit 33dd70752cd76f4d883a165a674f13121a4155ed upstream. In preparation for handling platform differentiated memory types beyond persistent memory, uplevel the "region" identifier to a global number space. This enables a device-dax instance to be registered to any memory type with guaranteed unique names. Intel-SIG: commit 33dd70752cd7 lib: Uplevel the pmem "region" ida to a global allocator. Backport for "EFI Specific Purpose Memory Support". Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki [ Huaisheng Ye amend commit log ] Signed-off-by: Huaisheng Ye --- drivers/nvdimm/Kconfig | 1 + drivers/nvdimm/core.c | 1 - drivers/nvdimm/nd-core.h | 1 - drivers/nvdimm/region_devs.c | 13 ++++--------- include/linux/memregion.h | 19 +++++++++++++++++++ lib/Kconfig | 3 +++ lib/Makefile | 1 + lib/memregion.c | 18 ++++++++++++++++++ 8 files changed, 46 insertions(+), 11 deletions(-) create mode 100644 include/linux/memregion.h create mode 100644 lib/memregion.c diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index 36af7af6b7cf..b7d1eb38b27d 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -4,6 +4,7 @@ menuconfig LIBNVDIMM depends on PHYS_ADDR_T_64BIT depends on HAS_IOMEM depends on BLK_DEV + select MEMREGION help Generic support for non-volatile memory devices including ACPI-6-NFIT defined resources. 
On platforms that define an diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 9204f1e9fd14..e592c4964674 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -455,7 +455,6 @@ static __exit void libnvdimm_exit(void) nd_region_exit(); nvdimm_exit(); nvdimm_bus_exit(); - nd_region_devs_exit(); nvdimm_devs_exit(); } diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 25fa121104d0..aa059439fca0 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -114,7 +114,6 @@ struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev); int __init nvdimm_bus_init(void); void nvdimm_bus_exit(void); void nvdimm_devs_exit(void); -void nd_region_devs_exit(void); struct nd_region; void nd_region_advance_seeds(struct nd_region *nd_region, struct device *dev); void nd_region_create_ns_seed(struct nd_region *nd_region); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index b8236a9e8750..c28974d14170 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -3,6 +3,7 @@ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. */ #include +#include #include #include #include @@ -19,7 +20,6 @@ */ #include -static DEFINE_IDA(region_ida); static DEFINE_PER_CPU(int, flush_idx); static int nvdimm_map_flush(struct device *dev, struct nvdimm *nvdimm, int dimm, @@ -133,7 +133,7 @@ static void nd_region_release(struct device *dev) put_device(&nvdimm->dev); } free_percpu(nd_region->lane); - ida_simple_remove(®ion_ida, nd_region->id); + memregion_free(nd_region->id); if (is_nd_blk(dev)) kfree(to_nd_blk_region(dev)); else @@ -985,7 +985,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, if (!region_buf) return NULL; - nd_region->id = ida_simple_get(®ion_ida, 0, 0, GFP_KERNEL); + nd_region->id = memregion_alloc(GFP_KERNEL); if (nd_region->id < 0) goto err_id; @@ -1044,7 +1044,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, return nd_region; err_percpu: - ida_simple_remove(®ion_ida, nd_region->id); + memregion_free(nd_region->id); err_id: kfree(region_buf); return NULL; @@ -1221,8 +1221,3 @@ int nd_region_conflict(struct nd_region *nd_region, resource_size_t start, return device_for_each_child(&nvdimm_bus->dev, &ctx, region_conflict); } - -void __exit nd_region_devs_exit(void) -{ - ida_destroy(®ion_ida); -} diff --git a/include/linux/memregion.h b/include/linux/memregion.h new file mode 100644 index 000000000000..7de7c0a1444e --- /dev/null +++ b/include/linux/memregion.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MEMREGION_H_ +#define _MEMREGION_H_ +#include +#include + +#ifdef CONFIG_MEMREGION +int memregion_alloc(gfp_t gfp); +void memregion_free(int id); +#else +static inline int memregion_alloc(gfp_t gfp) +{ + return -ENOMEM; +} +void memregion_free(int id) +{ +} +#endif +#endif /* _MEMREGION_H_ */ diff --git a/lib/Kconfig b/lib/Kconfig index 3321d04dfa5a..681b7e50490e 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -605,6 +605,9 @@ config ARCH_NO_SG_CHAIN config ARCH_HAS_PMEM_API bool +config MEMREGION + bool + # use memcpy to implement user copies for nommu architectures config UACCESS_MEMCPY bool diff --git a/lib/Makefile b/lib/Makefile index 3b8977aed1b4..e4f42e71b325 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -212,6 +212,7 @@ obj-$(CONFIG_GENERIC_NET_UTILS) += net_utils.o obj-$(CONFIG_SG_SPLIT) += sg_split.o obj-$(CONFIG_SG_POOL) += sg_pool.o +obj-$(CONFIG_MEMREGION) += memregion.o obj-$(CONFIG_STMP_DEVICE) += 
stmp_device.o obj-$(CONFIG_IRQ_POLL) += irq_poll.o diff --git a/lib/memregion.c b/lib/memregion.c new file mode 100644 index 000000000000..77c85b5251da --- /dev/null +++ b/lib/memregion.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* identifiers for device / performance-differentiated memory regions */ +#include +#include + +static DEFINE_IDA(memregion_ids); + +int memregion_alloc(gfp_t gfp) +{ + return ida_alloc(&memregion_ids, gfp); +} +EXPORT_SYMBOL(memregion_alloc); + +void memregion_free(int id) +{ + ida_free(&memregion_ids, id); +} +EXPORT_SYMBOL(memregion_free); -- Gitee From 14be75b5c9599e9089189482087ee4bac0c68972 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:43 -0800 Subject: [PATCH 2142/2161] device-dax: Add a driver for "hmem" devices commit a6c7f4c6aea5f4ca6056b06cec7ebd79f8c23e33 upstream. Platform firmware like EFI/ACPI may publish "hmem" platform devices. Such a device is a performance differentiated memory range likely reserved for an application specific use case. The driver gives access to 100% of the capacity via a device-dax mmap instance by default. However, if over-subscription and other kernel memory management is desired the resulting dax device can be assigned to the core-mm via the kmem driver. This consumes "hmem" devices the producer of "hmem" devices is saved for a follow-on patch so that it can reference the new CONFIG_DEV_DAX_HMEM symbol to gate performing the enumeration work. Intel-SIG: commit a6c7f4c6aea5 device-dax: Add a driver for "hmem" devices. Backport for "EFI Specific Purpose Memory Support". Reported-by: kbuild test robot Reviewed-by: Dave Hansen Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki [ Huaisheng Ye amend commit log ] Signed-off-by: Huaisheng Ye --- drivers/dax/Kconfig | 27 +++++++++++++++---- drivers/dax/Makefile | 2 ++ drivers/dax/hmem.c | 56 +++++++++++++++++++++++++++++++++++++++ include/linux/memregion.h | 4 +++ 4 files changed, 84 insertions(+), 5 deletions(-) create mode 100644 drivers/dax/hmem.c diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index f33c73e4af41..3b6c06f07326 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -32,19 +32,36 @@ config DEV_DAX_PMEM Say M if unsure +config DEV_DAX_HMEM + tristate "HMEM DAX: direct access to 'specific purpose' memory" + depends on EFI_SOFT_RESERVE + default DEV_DAX + help + EFI 2.8 platforms, and others, may advertise 'specific purpose' + memory. For example, a high bandwidth memory pool. The + indication from platform firmware is meant to reserve the + memory from typical usage by default. This driver creates + device-dax instances for these memory ranges, and that also + enables the possibility to assign them to the DEV_DAX_KMEM + driver to override the reservation and add them to kernel + "System RAM" pool. + + Say M if unsure. + config DEV_DAX_KMEM tristate "KMEM DAX: volatile-use of persistent memory" default DEV_DAX depends on DEV_DAX depends on MEMORY_HOTPLUG # for add_memory() and friends help - Support access to persistent memory as if it were RAM. This - allows easier use of persistent memory by unmodified - applications. + Support access to persistent, or other performance + differentiated memory as if it were System RAM. This allows + easier use of persistent memory by unmodified applications, or + adds core kernel memory services to heterogeneous memory types + (HMEM) marked "reserved" by platform firmware. 
To use this feature, a DAX device must be unbound from the - device_dax driver (PMEM DAX) and bound to this kmem driver - on each boot. + device_dax driver and bound to this kmem driver on each boot. Say N if unsure. diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile index 81f7d54dadfb..80065b38b3c4 100644 --- a/drivers/dax/Makefile +++ b/drivers/dax/Makefile @@ -2,9 +2,11 @@ obj-$(CONFIG_DAX) += dax.o obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o +obj-$(CONFIG_DEV_DAX_HMEM) += dax_hmem.o dax-y := super.o dax-y += bus.o device_dax-y := device.o +dax_hmem-y := hmem.o obj-y += pmem/ diff --git a/drivers/dax/hmem.c b/drivers/dax/hmem.c new file mode 100644 index 000000000000..fe7214daf62e --- /dev/null +++ b/drivers/dax/hmem.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include "bus.h" + +static int dax_hmem_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct dev_pagemap pgmap = { }; + struct dax_region *dax_region; + struct memregion_info *mri; + struct dev_dax *dev_dax; + struct resource *res; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -ENOMEM; + + mri = dev->platform_data; + memcpy(&pgmap.res, res, sizeof(*res)); + + dax_region = alloc_dax_region(dev, pdev->id, res, mri->target_node, + PMD_SIZE, PFN_DEV|PFN_MAP); + if (!dax_region) + return -ENOMEM; + + dev_dax = devm_create_dev_dax(dax_region, 0, &pgmap); + if (IS_ERR(dev_dax)) + return PTR_ERR(dev_dax); + + /* child dev_dax instances now own the lifetime of the dax_region */ + dax_region_put(dax_region); + return 0; +} + +static int dax_hmem_remove(struct platform_device *pdev) +{ + /* devm handles teardown */ + return 0; +} + +static struct platform_driver dax_hmem_driver = { + .probe = dax_hmem_probe, + .remove = dax_hmem_remove, + .driver = { + .name = "hmem", + }, +}; + +module_platform_driver(dax_hmem_driver); + +MODULE_ALIAS("platform:hmem*"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); diff --git a/include/linux/memregion.h b/include/linux/memregion.h index 7de7c0a1444e..e11595256cac 100644 --- a/include/linux/memregion.h +++ b/include/linux/memregion.h @@ -4,6 +4,10 @@ #include #include +struct memregion_info { + int target_node; +}; + #ifdef CONFIG_MEMREGION int memregion_alloc(gfp_t gfp); void memregion_free(int id); -- Gitee From 620a191ce8ed9cf8a9d52bd046eeeb1407a1ad8b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:49 -0800 Subject: [PATCH 2143/2161] ACPI: NUMA: HMAT: Register HMAT at device_initcall level commit 0f847f8c0813c8ad7df5174c8f27bcba5926b972 upstream. In preparation for registering device-dax instances for accessing EFI specific-purpose memory, arrange for the HMAT registration to occur later in the init process. Critically HMAT initialization needs to occur after e820__reserve_resources_late() which is the point at which the iomem resource tree is populated with "Application Reserved" (IORES_DESC_APPLICATION_RESERVED). e820__reserve_resources_late() happens at subsys_initcall time. Intel-SIG: commit 0f847f8c0813 ACPI: NUMA: HMAT: Register HMAT at device_initcall level. Backport for "EFI Specific Purpose Memory Support". Reviewed-by: Dave Hansen Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. 
Wysocki [ Huaisheng Ye amend commit log ] Signed-off-by: Huaisheng Ye --- drivers/acpi/numa/hmat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 0f1c939b7e90..47e87cc394a6 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -749,4 +749,4 @@ static __init int hmat_init(void) acpi_put_table(tbl); return 0; } -subsys_initcall(hmat_init); +device_initcall(hmat_init); -- Gitee From e0da0c2d631f14f46c6bc53e9c528cde756d5d4a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:55 -0800 Subject: [PATCH 2144/2161] ACPI: NUMA: HMAT: Register "soft reserved" memory as an "hmem" device commit cf8741ac57ed48613e49559d3e5ae43f56291e4c upstream. Memory that has been tagged EFI_MEMORY_SP, and has performance properties described by the ACPI HMAT is expected to have an application specific consumer. Those consumers may want 100% of the memory capacity to be reserved from any usage by the kernel. By default, with this enabling, a platform device is created to represent this differentiated resource. The device-dax "hmem" driver claims these devices by default and provides an mmap interface for the target application. If the administrator prefers, the hmem resource range can be made available to the core-mm via the device-dax hotplug facility, kmem, to online the memory with its own numa node. This was tested with an emulated HMAT produced by qemu (with the pending HMAT enabling patches), and "efi_fake_mem=8G@9G:0x40000" on the kernel command line to mark the memory ranges associated with node2 and node3 as EFI_MEMORY_SP. qemu numa configuration options: -numa node,mem=4G,cpus=0-19,nodeid=0 -numa node,mem=4G,cpus=20-39,nodeid=1 -numa node,mem=4G,nodeid=2 -numa node,mem=4G,nodeid=3 -numa dist,src=0,dst=0,val=10 -numa dist,src=0,dst=1,val=21 -numa dist,src=0,dst=2,val=21 -numa dist,src=0,dst=3,val=21 -numa dist,src=1,dst=0,val=21 -numa dist,src=1,dst=1,val=10 -numa dist,src=1,dst=2,val=21 -numa dist,src=1,dst=3,val=21 -numa dist,src=2,dst=0,val=21 -numa dist,src=2,dst=1,val=21 -numa dist,src=2,dst=2,val=10 -numa dist,src=2,dst=3,val=21 -numa dist,src=3,dst=0,val=21 -numa dist,src=3,dst=1,val=21 -numa dist,src=3,dst=2,val=21 -numa dist,src=3,dst=3,val=10 -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,base-lat=10,latency=5 -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=5 -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,base-lat=10,latency=10 -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=10 -numa hmat-lb,initiator=0,target=2,hierarchy=memory,data-type=access-latency,base-lat=10,latency=15 -numa hmat-lb,initiator=0,target=2,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=15 -numa hmat-lb,initiator=0,target=3,hierarchy=memory,data-type=access-latency,base-lat=10,latency=20 -numa hmat-lb,initiator=0,target=3,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=20 -numa hmat-lb,initiator=1,target=0,hierarchy=memory,data-type=access-latency,base-lat=10,latency=10 -numa hmat-lb,initiator=1,target=0,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=10 -numa hmat-lb,initiator=1,target=1,hierarchy=memory,data-type=access-latency,base-lat=10,latency=5 -numa hmat-lb,initiator=1,target=1,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=5 -numa 
hmat-lb,initiator=1,target=2,hierarchy=memory,data-type=access-latency,base-lat=10,latency=15 -numa hmat-lb,initiator=1,target=2,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=15 -numa hmat-lb,initiator=1,target=3,hierarchy=memory,data-type=access-latency,base-lat=10,latency=20 -numa hmat-lb,initiator=1,target=3,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=20 Result: [ { "path":"\/platform\/hmem.1", "id":1, "size":"4.00 GiB (4.29 GB)", "align":2097152, "devices":[ { "chardev":"dax1.0", "size":"4.00 GiB (4.29 GB)" } ] }, { "path":"\/platform\/hmem.0", "id":0, "size":"4.00 GiB (4.29 GB)", "align":2097152, "devices":[ { "chardev":"dax0.0", "size":"4.00 GiB (4.29 GB)" } ] } ] [..] 240000000-43fffffff : Soft Reserved 240000000-33fffffff : hmem.0 240000000-33fffffff : dax0.0 340000000-43fffffff : hmem.1 340000000-43fffffff : dax1.0 Intel-SIG: commit cf8741ac57ed ACPI: NUMA: HMAT: Register "soft reserved" memory as an "hmem" device. Backport for "EFI Specific Purpose Memory Support". Reviewed-by: Dave Hansen Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki [ Huaisheng Ye amend commit log ] Signed-off-by: Huaisheng Ye --- drivers/acpi/numa/Kconfig | 1 + drivers/acpi/numa/hmat.c | 136 ++++++++++++++++++++++++++++++++++---- 2 files changed, 125 insertions(+), 12 deletions(-) diff --git a/drivers/acpi/numa/Kconfig b/drivers/acpi/numa/Kconfig index acbd5aa76e40..fcf2e556d69d 100644 --- a/drivers/acpi/numa/Kconfig +++ b/drivers/acpi/numa/Kconfig @@ -9,6 +9,7 @@ config ACPI_HMAT bool "ACPI Heterogeneous Memory Attribute Table Support" depends on ACPI_NUMA select HMEM_REPORTING + select MEMREGION help If set, this option has the kernel parse and report the platform's ACPI HMAT (Heterogeneous Memory Attributes Table), diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 47e87cc394a6..f632b9bec7fc 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -8,12 +8,18 @@ * the applicable attributes with the node's interfaces. 
*/ +#define pr_fmt(fmt) "acpi/hmat: " fmt +#define dev_fmt(fmt) "acpi/hmat: " fmt + #include #include #include #include #include +#include +#include #include +#include #include #include #include @@ -49,6 +55,7 @@ struct memory_target { struct list_head node; unsigned int memory_pxm; unsigned int processor_pxm; + struct resource memregions; struct node_hmem_attrs hmem_attrs; struct list_head caches; struct node_cache_attrs cache_attrs; @@ -104,22 +111,36 @@ static __init void alloc_memory_initiator(unsigned int cpu_pxm) list_add_tail(&initiator->node, &initiators); } -static __init void alloc_memory_target(unsigned int mem_pxm) +static __init void alloc_memory_target(unsigned int mem_pxm, + resource_size_t start, resource_size_t len) { struct memory_target *target; target = find_mem_target(mem_pxm); - if (target) - return; - - target = kzalloc(sizeof(*target), GFP_KERNEL); - if (!target) - return; + if (!target) { + target = kzalloc(sizeof(*target), GFP_KERNEL); + if (!target) + return; + target->memory_pxm = mem_pxm; + target->processor_pxm = PXM_INVAL; + target->memregions = (struct resource) { + .name = "ACPI mem", + .start = 0, + .end = -1, + .flags = IORESOURCE_MEM, + }; + list_add_tail(&target->node, &targets); + INIT_LIST_HEAD(&target->caches); + } - target->memory_pxm = mem_pxm; - target->processor_pxm = PXM_INVAL; - list_add_tail(&target->node, &targets); - INIT_LIST_HEAD(&target->caches); + /* + * There are potentially multiple ranges per PXM, so record each + * in the per-target memregions resource tree. + */ + if (!__request_region(&target->memregions, start, len, "memory target", + IORESOURCE_MEM)) + pr_warn("failed to reserve %#llx - %#llx in pxm: %d\n", + start, start + len, mem_pxm); } static __init const char *hmat_data_type(u8 type) @@ -453,7 +474,7 @@ static __init int srat_parse_mem_affinity(union acpi_subtable_headers *header, return -EINVAL; if (!(ma->flags & ACPI_SRAT_MEM_ENABLED)) return 0; - alloc_memory_target(ma->proximity_domain); + alloc_memory_target(ma->proximity_domain, ma->base_address, ma->length); return 0; } @@ -614,10 +635,91 @@ static void hmat_register_target_perf(struct memory_target *target) node_set_perf_attrs(mem_nid, &target->hmem_attrs, 0); } +static void hmat_register_target_device(struct memory_target *target, + struct resource *r) +{ + /* define a clean / non-busy resource for the platform device */ + struct resource res = { + .start = r->start, + .end = r->end, + .flags = IORESOURCE_MEM, + }; + struct platform_device *pdev; + struct memregion_info info; + int rc, id; + + rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM, + IORES_DESC_SOFT_RESERVED); + if (rc != REGION_INTERSECTS) + return; + + id = memregion_alloc(GFP_KERNEL); + if (id < 0) { + pr_err("memregion allocation failure for %pr\n", &res); + return; + } + + pdev = platform_device_alloc("hmem", id); + if (!pdev) { + pr_err("hmem device allocation failure for %pr\n", &res); + goto out_pdev; + } + + pdev->dev.numa_node = acpi_map_pxm_to_online_node(target->memory_pxm); + info = (struct memregion_info) { + .target_node = acpi_map_pxm_to_node(target->memory_pxm), + }; + rc = platform_device_add_data(pdev, &info, sizeof(info)); + if (rc < 0) { + pr_err("hmem memregion_info allocation failure for %pr\n", &res); + goto out_pdev; + } + + rc = platform_device_add_resources(pdev, &res, 1); + if (rc < 0) { + pr_err("hmem resource allocation failure for %pr\n", &res); + goto out_resource; + } + + rc = platform_device_add(pdev); + if (rc < 0) { + dev_err(&pdev->dev, "device 
add failed for %pr\n", &res); + goto out_resource; + } + + return; + +out_resource: + put_device(&pdev->dev); +out_pdev: + memregion_free(id); +} + +static __init void hmat_register_target_devices(struct memory_target *target) +{ + struct resource *res; + + /* + * Do not bother creating devices if no driver is available to + * consume them. + */ + if (!IS_ENABLED(CONFIG_DEV_DAX_HMEM)) + return; + + for (res = target->memregions.child; res; res = res->sibling) + hmat_register_target_device(target, res); +} + static void hmat_register_target(struct memory_target *target) { int nid = pxm_to_node(target->memory_pxm); + /* + * Devices may belong to either an offline or online + * node, so unconditionally add them. + */ + hmat_register_target_devices(target); + /* * Skip offline nodes. This can happen when memory * marked EFI_MEMORY_SP, "specific purpose", is applied @@ -678,11 +780,21 @@ static __init void hmat_free_structures(void) struct target_cache *tcache, *cnext; list_for_each_entry_safe(target, tnext, &targets, node) { + struct resource *res, *res_next; + list_for_each_entry_safe(tcache, cnext, &target->caches, node) { list_del(&tcache->node); kfree(tcache); } + list_del(&target->node); + res = target->memregions.child; + while (res) { + res_next = res->sibling; + __release_region(&target->memregions, res->start, + resource_size(res)); + res = res_next; + } kfree(target); } -- Gitee From 67fdb6355c1e2cbb7ab71d2e12c3aea032c45dc7 Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Mon, 28 Oct 2019 10:11:18 +0100 Subject: [PATCH 2145/2161] ACPI: HMAT: don't mix pxm and nid when setting memory target processor_pxm commit 4caa525b783b0abe7bc06e41220b337ba311bbf7 upstream. On systems where PXMs and nids are in different order, memory initiators exposed in sysfs could be wrong: On dual-socket CLX with SNC enabled (4 nodes, 1 and 2 swapped between PXMs and nids), node1 would only get node2 as initiator, and node2 would only get node1. With this patch, we get node1 as the only initiator of itself, and node2 as the only initiator of itself, as expected. This should likely go to stable up to 5.2. Intel-SIG: commit 4caa525b783b ACPI: HMAT: don't mix pxm and nid when setting memory target processor_pxm. Backport for "EFI Specific Purpose Memory Support". Signed-off-by: Brice Goglin Signed-off-by: Rafael J. Wysocki [ Huaisheng Ye amend commit log ] Signed-off-by: Huaisheng Ye --- drivers/acpi/numa/hmat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index f632b9bec7fc..ea28b7fa64b5 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -439,7 +439,7 @@ static int __init hmat_parse_proximity_domain(union acpi_subtable_headers *heade pr_debug("HMAT: Invalid Processor Domain\n"); return -EINVAL; } - target->processor_pxm = p_node; + target->processor_pxm = p->processor_PD; } return 0; -- Gitee From bdd5dfeda8d2a48c9b3a05e98fb046d62f38412c Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 11 Nov 2019 16:34:26 -0500 Subject: [PATCH 2146/2161] ACPI: NUMA: HMAT: fix a section mismatch commit 59b2c5b63587a9ed2292ccce32fd69d8de815036 upstream. Commit cf8741ac57ed ("ACPI: NUMA: HMAT: Register "soft reserved" memory as an "hmem" device") introduced a linker warning, WARNING: vmlinux.o(.text+0x64ec3c): Section mismatch in reference from the function hmat_register_target() to the function .init.text:hmat_register_target_devices() The function hmat_register_target() references the function __init hmat_register_target_devices(). 
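In general terms (a minimal sketch with made-up names, not code from this patch), this is the pattern modpost warns about: .init.text is discarded once boot completes, so a function that may still run afterwards must not call into it:

	#include <linux/init.h>

	static void __init boot_only_helper(void)
	{
		/* placed in .init.text and freed once boot completes */
	}

	static void runtime_path(void)
	{
		/* modpost flags this call: runtime_path() may execute after
		 * .init.text has already been discarded */
		boot_only_helper();
	}
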
Since hmat_register_target() is also called from hmat_callback(), and then register_hotmemory_notifier(), where it should not be freed when hmat_init() is done, it indicates that the __init annotation of hmat_register_target_devices() is incorrect. Intel-SIG: commit 59b2c5b63587 ACPI: NUMA: HMAT: fix a section mismatch. Backport for "EFI Specific Purpose Memory Support". Fixes: cf8741ac57ed ("ACPI: NUMA: HMAT: Register "soft reserved" memory as an "hmem" device") Signed-off-by: Qian Cai Signed-off-by: Rafael J. Wysocki [ Huaisheng Ye amend commit log ] Signed-off-by: Huaisheng Ye --- drivers/acpi/numa/hmat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index ea28b7fa64b5..928865dfa937 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -695,7 +695,7 @@ static void hmat_register_target_device(struct memory_target *target, memregion_free(id); } -static __init void hmat_register_target_devices(struct memory_target *target) +static void hmat_register_target_devices(struct memory_target *target) { struct resource *res; -- Gitee From e71cdd1b521bbdb72214287c86b3ea288c33a880 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Wed, 30 Oct 2019 14:34:03 +0800 Subject: [PATCH 2147/2161] ACPI: HMAT: use %u instead of %d to print u32 values commit 0f1839d0888700389e3062b4787046d61780d6d9 upstream. Use %u instead of %d to print u32 values to expand the value range, especially when latency or bandwidth value is bigger than INT_MAX. Then HMAT latency can support up to 4.29s and bandwidth can support up to 4PB/s. Intel-SIG: commit 0f1839d08887 ACPI: HMAT: use %u instead of %d to print u32 values. Backport for "EFI Specific Purpose Memory Support". Reviewed-by: Dan Williams Reviewed-by: Jingqi Liu Signed-off-by: Tao Xu Signed-off-by: Rafael J. 
Wysocki [ Huaisheng Ye amend commit log ] Signed-off-by: Huaisheng Ye --- drivers/acpi/numa/hmat.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 928865dfa937..63b04721dafa 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -293,7 +293,7 @@ static __init int hmat_parse_locality(union acpi_subtable_headers *header, u8 type, mem_hier; if (hmat_loc->header.length < sizeof(*hmat_loc)) { - pr_notice("HMAT: Unexpected locality header length: %d\n", + pr_notice("HMAT: Unexpected locality header length: %u\n", hmat_loc->header.length); return -EINVAL; } @@ -305,12 +305,12 @@ static __init int hmat_parse_locality(union acpi_subtable_headers *header, total_size = sizeof(*hmat_loc) + sizeof(*entries) * ipds * tpds + sizeof(*inits) * ipds + sizeof(*targs) * tpds; if (hmat_loc->header.length < total_size) { - pr_notice("HMAT: Unexpected locality header length:%d, minimum required:%d\n", + pr_notice("HMAT: Unexpected locality header length:%u, minimum required:%u\n", hmat_loc->header.length, total_size); return -EINVAL; } - pr_info("HMAT: Locality: Flags:%02x Type:%s Initiator Domains:%d Target Domains:%d Base:%lld\n", + pr_info("HMAT: Locality: Flags:%02x Type:%s Initiator Domains:%u Target Domains:%u Base:%lld\n", hmat_loc->flags, hmat_data_type(type), ipds, tpds, hmat_loc->entry_base_unit); @@ -323,7 +323,7 @@ static __init int hmat_parse_locality(union acpi_subtable_headers *header, value = hmat_normalize(entries[init * tpds + targ], hmat_loc->entry_base_unit, type); - pr_info(" Initiator-Target[%d-%d]:%d%s\n", + pr_info(" Initiator-Target[%u-%u]:%u%s\n", inits[init], targs[targ], value, hmat_data_type_suffix(type)); @@ -350,13 +350,13 @@ static __init int hmat_parse_cache(union acpi_subtable_headers *header, u32 attrs; if (cache->header.length < sizeof(*cache)) { - pr_notice("HMAT: Unexpected cache header length: %d\n", + pr_notice("HMAT: Unexpected cache header length: %u\n", cache->header.length); return -EINVAL; } attrs = cache->cache_attributes; - pr_info("HMAT: Cache: Domain:%d Size:%llu Attrs:%08x SMBIOS Handles:%d\n", + pr_info("HMAT: Cache: Domain:%u Size:%llu Attrs:%08x SMBIOS Handles:%d\n", cache->memory_PD, cache->cache_size, attrs, cache->number_of_SMBIOShandles); @@ -411,17 +411,17 @@ static int __init hmat_parse_proximity_domain(union acpi_subtable_headers *heade struct memory_target *target = NULL; if (p->header.length != sizeof(*p)) { - pr_notice("HMAT: Unexpected address range header length: %d\n", + pr_notice("HMAT: Unexpected address range header length: %u\n", p->header.length); return -EINVAL; } if (hmat_revision == 1) - pr_info("HMAT: Memory (%#llx length %#llx) Flags:%04x Processor Domain:%d Memory Domain:%d\n", + pr_info("HMAT: Memory (%#llx length %#llx) Flags:%04x Processor Domain:%d Memory Domain:%u\n", p->reserved3, p->reserved4, p->flags, p->processor_PD, p->memory_PD); else - pr_info("HMAT: Memory Flags:%04x Processor Domain:%d Memory Domain:%d\n", + pr_info("HMAT: Memory Flags:%04x Processor Domain:%d Memory Domain:%u\n", p->flags, p->processor_PD, p->memory_PD); if ((hmat_revision == 1 && p->flags & ACPI_HMAT_MEMORY_PD_VALID) || -- Gitee From f485c85aac3958c605deb7306885be0287c02818 Mon Sep 17 00:00:00 2001 From: Zhenyu Wang Date: Mon, 23 Mar 2020 17:22:36 +0800 Subject: [PATCH 2148/2161] KVM: x86: Expose fast short REP MOV for supported cpuid commit e3747407c4d52ee3f82afdad235866e815362197 upstream. 
For CPU supporting fast short REP MOV (XF86_FEATURE_FSRM) e.g Icelake, Tigerlake, expose it in KVM supported cpuid as well. Signed-off-by: Zhenyu Wang Message-Id: <20200323092236.3703-1-zhenyuw@linux.intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Xinghui Li --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6c64e9a60f32..4d90f3cb4b46 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -418,7 +418,7 @@ void kvm_set_cpu_caps(void) F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | F(MD_CLEAR) | - F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) + F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FSRM) ); kvm_cpu_cap_mask(CPUID_7_1_EAX, -- Gitee From f712f79e4da5d769f5b06e7b21208089c560026c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 5 Jan 2021 11:33:00 -0800 Subject: [PATCH 2149/2161] mm: make wait_on_page_writeback() wait for multiple pending writebacks commit c2407cf7d22d0c0d94cf20342b3b8f06f1d904e7 upstream. Ever since commit 2a9127fcf229 ("mm: rewrite wait_on_page_bit_common() logic") we've had some very occasional reports of BUG_ON(PageWriteback) in write_cache_pages(), which we thought we already fixed in commit 073861ed77b6 ("mm: fix VM_BUG_ON(PageTail) and BUG_ON(PageWriteback)"). But syzbot just reported another one, even with that commit in place. And it turns out that there's a simpler way to trigger the BUG_ON() than the one Hugh found with page re-use. It all boils down to the fact that the page writeback is ostensibly serialized by the page lock, but that isn't actually really true. Yes, the people _setting_ writeback all do so under the page lock, but the actual clearing of the bit - and waking up any waiters - happens without any page lock. This gives us this fairly simple race condition: CPU1 = end previous writeback CPU2 = start new writeback under page lock CPU3 = write_cache_pages() CPU1 CPU2 CPU3 ---- ---- ---- end_page_writeback() test_clear_page_writeback(page) ... delayed... lock_page(); set_page_writeback() unlock_page() lock_page() wait_on_page_writeback(); wake_up_page(page, PG_writeback); .. wakes up CPU3 .. BUG_ON(PageWriteback(page)); where the BUG_ON() happens because we woke up the PG_writeback bit becasue of the _previous_ writeback, but a new one had already been started because the clearing of the bit wasn't actually atomic wrt the actual wakeup or serialized by the page lock. The reason this didn't use to happen was that the old logic in waiting on a page bit would just loop if it ever saw the bit set again. The nice proper fix would probably be to get rid of the whole "wait for writeback to clear, and then set it" logic in the writeback path, and replace it with an atomic "wait-to-set" (ie the same as we have for page locking: we set the page lock bit with a single "lock_page()", not with "wait for lock bit to clear and then set it"). However, out current model for writeback is that the waiting for the writeback bit is done by the generic VFS code (ie write_cache_pages()), but the actual setting of the writeback bit is done much later by the filesystem ".writepages()" function. IOW, to make the writeback bit have that same kind of "wait-to-set" behavior as we have for page locking, we'd have to change our roughly ~50 different writeback functions. Painful. 
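[ Editorial illustration -- not part of the patch. The race described above, reduced to the two sides involved; this is a simplification of the CPU1/CPU2/CPU3 diagram, not the actual mm code. ]

    /* Writer side, under the page lock ("wait for clear, then set"): */
    wait_on_page_writeback(page);     /* returns once the bit is seen clear    */
    set_page_writeback(page);         /* starts the next writeback             */

    /* Completion side, NOT under the page lock: */
    test_clear_page_writeback(page);  /* clears the bit ...                    */
    wake_up_page(page, PG_writeback); /* ... and wakes waiters some time later */

Because the clear and the wakeup on the completion side are not one atomic step, a waiter can be woken for the previous writeback after a new one has already set the bit again, which is the window in which write_cache_pages() hits BUG_ON(PageWriteback(page)).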
Instead, just make "wait_on_page_writeback()" loop on the very unlikely situation that the PG_writeback bit is still set, basically re-instating the old behavior. This is very non-optimal in case of contention, but since we only ever set the bit under the page lock, that situation is controlled. Reported-by: syzbot+2fc0712f8f8b8b8fa0ef@syzkaller.appspotmail.com Fixes: 2a9127fcf229 ("mm: rewrite wait_on_page_bit_common() logic") Acked-by: Hugh Dickins Cc: Andrew Morton Cc: Matthew Wilcox Cc: stable@kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Bin Lai Signed-off-by: Xinghui Li --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index daeae783ddf5..3feb9ac606d0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2817,7 +2817,7 @@ EXPORT_SYMBOL(__test_set_page_writeback); */ void wait_on_page_writeback(struct page *page) { - if (PageWriteback(page)) { + while (PageWriteback(page)) { trace_wait_on_page_writeback(page, page_mapping(page)); wait_on_page_bit(page, PG_writeback); } -- Gitee From 81417a948037ef552f64de803fe0792f7e90174a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Nov 2020 16:37:51 +0100 Subject: [PATCH 2150/2161] perf/intel: Remove Perfmon-v4 counter_freezing support commit 3daa96d67274653b7c461b30ef9581d68e905fe1 upstream. Perfmon-v4 counter freezing is fundamentally broken; remove this default disabled code to make sure nobody uses it. The feature is called Freeze-on-PMI in the SDM, and if it would do that, there wouldn't actually be a problem, *however* it does something subtly different. It globally disables the whole PMU when it raises the PMI, not when the PMI hits. This means there's a window between the PMI getting raised and the PMI actually getting served where we loose events and this violates the perf counter independence. That is, a counting event should not result in a different event count when there is a sampling event co-scheduled. This is known to break existing software (RR). Intel-SIG: commit 3daa96d67274 perf/intel: Remove Perfmon-v4 counter_freezing support Backport as a dependency for Sapphire Rapids core PMU support. Signed-off-by: Peter Zijlstra (Intel) [ Yunying Sun: amend commit log ] Signed-off-by: Yunying Sun Signed-off-by: Xinghui Li --- .../admin-guide/kernel-parameters.txt | 6 - arch/x86/events/intel/core.c | 152 ------------------ arch/x86/events/perf_event.h | 2 +- 3 files changed, 1 insertion(+), 159 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6314798b3df6..87de86346d31 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -882,12 +882,6 @@ causing system reset or hang due to sending INIT from AP to BSP. - perf_v4_pmi= [X86,INTEL] - Format: - Disable Intel PMU counter freezing feature. - The feature only exists starting from - Arch Perfmon v4 (Skylake and newer). - disable_ddw [PPC/PSERIES] Disable Dynamic DMA Window support. Use this if to workaround buggy firmware. 
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 89795c2e3a24..f3fb46b3027b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2114,18 +2114,6 @@ static void intel_tfa_pmu_enable_all(int added) intel_pmu_enable_all(added); } -static void enable_counter_freeze(void) -{ - update_debugctlmsr(get_debugctlmsr() | - DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI); -} - -static void disable_counter_freeze(void) -{ - update_debugctlmsr(get_debugctlmsr() & - ~DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI); -} - static inline u64 intel_pmu_get_status(void) { u64 status; @@ -2677,95 +2665,6 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) return handled; } -static bool disable_counter_freezing = true; -static int __init intel_perf_counter_freezing_setup(char *s) -{ - bool res; - - if (kstrtobool(s, &res)) - return -EINVAL; - - disable_counter_freezing = !res; - return 1; -} -__setup("perf_v4_pmi=", intel_perf_counter_freezing_setup); - -/* - * Simplified handler for Arch Perfmon v4: - * - We rely on counter freezing/unfreezing to enable/disable the PMU. - * This is done automatically on PMU ack. - * - Ack the PMU only after the APIC. - */ - -static int intel_pmu_handle_irq_v4(struct pt_regs *regs) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int handled = 0; - bool bts = false; - u64 status; - int pmu_enabled = cpuc->enabled; - int loops = 0; - - /* PMU has been disabled because of counter freezing */ - cpuc->enabled = 0; - if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - bts = true; - intel_bts_disable_local(); - handled = intel_pmu_drain_bts_buffer(); - handled += intel_bts_interrupt(); - } - status = intel_pmu_get_status(); - if (!status) - goto done; -again: - intel_pmu_lbr_read(); - if (++loops > 100) { - static bool warned; - - if (!warned) { - WARN(1, "perfevents: irq loop stuck!\n"); - perf_event_print_debug(); - warned = true; - } - intel_pmu_reset(); - goto done; - } - - - handled += handle_pmi_common(regs, status); -done: - /* Ack the PMI in the APIC */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - - /* - * The counters start counting immediately while ack the status. - * Make it as close as possible to IRET. This avoids bogus - * freezing on Skylake CPUs. - */ - if (status) { - intel_pmu_ack_status(status); - } else { - /* - * CPU may issues two PMIs very close to each other. - * When the PMI handler services the first one, the - * GLOBAL_STATUS is already updated to reflect both. - * When it IRETs, the second PMI is immediately - * handled and it sees clear status. At the meantime, - * there may be a third PMI, because the freezing bit - * isn't set since the ack in first PMI handlers. - * Double check if there is more work to be done. - */ - status = intel_pmu_get_status(); - if (status) - goto again; - } - - if (bts) - intel_bts_enable_local(); - cpuc->enabled = pmu_enabled; - return handled; -} - /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -4050,9 +3949,6 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.version > 1) flip_smm_bit(&x86_pmu.attr_freeze_on_smi); - if (x86_pmu.counter_freezing) - enable_counter_freeze(); - /* Disable perf metrics if any added CPU doesn't support it. 
*/ if (x86_pmu.intel_cap.perf_metrics) { union perf_capabilities perf_cap; @@ -4123,9 +4019,6 @@ static void free_excl_cntrs(struct cpu_hw_events *cpuc) static void intel_pmu_cpu_dying(int cpu) { fini_debug_store_on_cpu(cpu); - - if (x86_pmu.counter_freezing) - disable_counter_freeze(); } void intel_cpuc_finish(struct cpu_hw_events *cpuc) @@ -4520,39 +4413,6 @@ static __init void intel_nehalem_quirk(void) } } -static const struct x86_cpu_desc counter_freezing_ucodes[] = { - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 2, 0x0000000e), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 9, 0x0000002e), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 10, 0x00000008), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_D, 1, 0x00000028), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 1, 0x00000028), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 8, 0x00000006), - {} -}; - -static bool intel_counter_freezing_broken(void) -{ - return !x86_cpu_has_min_microcode_rev(counter_freezing_ucodes); -} - -static __init void intel_counter_freezing_quirk(void) -{ - /* Check if it's already disabled */ - if (disable_counter_freezing) - return; - - /* - * If the system starts with the wrong ucode, leave the - * counter-freezing feature permanently disabled. - */ - if (intel_counter_freezing_broken()) { - pr_info("PMU counter freezing disabled due to CPU errata," - "please upgrade microcode\n"); - x86_pmu.counter_freezing = false; - x86_pmu.handle_irq = intel_pmu_handle_irq; - } -} - /* * enable software workaround for errata: * SNB: BJ122 @@ -4938,9 +4798,6 @@ __init int intel_pmu_init(void) max((int)edx.split.num_counters_fixed, assume); } - if (version >= 4) - x86_pmu.counter_freezing = !disable_counter_freezing; - if (boot_cpu_has(X86_FEATURE_PDCM)) { u64 capabilities; @@ -5062,7 +4919,6 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ATOM_GOLDMONT: case INTEL_FAM6_ATOM_GOLDMONT_D: - x86_add_quirk(intel_counter_freezing_quirk); memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs, @@ -5089,7 +4945,6 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - x86_add_quirk(intel_counter_freezing_quirk); memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs, @@ -5547,13 +5402,6 @@ __init int intel_pmu_init(void) pr_cont("full-width counters, "); } - /* - * For arch perfmon 4 use counter freezing to avoid - * several MSR accesses in the PMI. - */ - if (x86_pmu.counter_freezing) - x86_pmu.handle_irq = intel_pmu_handle_irq_v4; - if (x86_pmu.intel_cap.perf_metrics) x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index fd391a7da62c..1af4caedf10f 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -681,7 +681,7 @@ struct x86_pmu { /* PMI handler bits */ unsigned int late_ack :1, - counter_freezing :1; + enabled_ack :1; /* * sysfs attrs */ -- Gitee From e44bee08e5a59a443ab736580e8e2a57812fc7e6 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 28 Jan 2021 14:40:07 -0800 Subject: [PATCH 2151/2161] perf/core: Add PERF_SAMPLE_WEIGHT_STRUCT commit 2a6c6b7d7ad346f0679d0963cb19b3f0ea7ef32c upstream. Current PERF_SAMPLE_WEIGHT sample type is very useful to expresses the cost of an action represented by the sample. This allows the profiler to scale the samples to be more informative to the programmer. 
It could also help to locate a hotspot, e.g., when profiling by memory latencies, the expensive load appear higher up in the histograms. But current PERF_SAMPLE_WEIGHT sample type is solely determined by one factor. This could be a problem, if users want two or more factors to contribute to the weight. For example, Golden Cove core PMU can provide both the instruction latency and the cache Latency information as factors for the memory profiling. For current X86 platforms, although meminfo::latency is defined as a u64, only the lower 32 bits include the valid data in practice (No memory access could last than 4G cycles). The higher 32 bits can be used to store new factors. Add a new sample type, PERF_SAMPLE_WEIGHT_STRUCT, to indicate the new sample weight structure. It shares the same space as the PERF_SAMPLE_WEIGHT sample type. Users can apply either the PERF_SAMPLE_WEIGHT sample type or the PERF_SAMPLE_WEIGHT_STRUCT sample type to retrieve the sample weight, but they cannot apply both sample types simultaneously. Currently, only X86 and PowerPC use the PERF_SAMPLE_WEIGHT sample type. - For PowerPC, there is nothing changed for the PERF_SAMPLE_WEIGHT sample type. There is no effect for the new PERF_SAMPLE_WEIGHT_STRUCT sample type. PowerPC can re-struct the weight field similarly later. - For X86, the same value will be dumped for the PERF_SAMPLE_WEIGHT sample type or the PERF_SAMPLE_WEIGHT_STRUCT sample type for now. The following patches will apply the new factors for the PERF_SAMPLE_WEIGHT_STRUCT sample type. The field in the union perf_sample_weight should be shared among different architectures. A generic name is required, but it's hard to abstract a name that applies to all architectures. For example, on X86, the fields are to store all kinds of latency. While on PowerPC, it stores MMCRA[TECX/TECM], which should not be latency. So a general name prefix 'var$NUM' is used here. Intel-SIG: commit 2a6c6b7d7ad3 perf/core: Add PERF_SAMPLE_WEIGHT_STRUCT Backport for Sapphire Rapids core PMU support. Note: This backported patch has some deviations from upstream version. To avoid enum hole in perf_event_sample_format struct, we added PERF_SAMPLE_{AUX,CGROUP,DATA_PAGE_SIZE,CODE_PAGE_SIZE} to file include/uapi/linux/perf_event.h, but didn't backport the full patchsets that introducing these enumeration values. To avoid mishandling of these sampling formats, we added check to perf_copy_attr() in kernel/events/core.c, to make sure -EINVAL always being returned for these lack-of-kernel-support sampling formats. 
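[ Editorial illustration -- not part of the patch. A minimal user-space sketch of requesting the structured weight; the union layout is the one this patch adds to the uapi header, while the event type, the sample flags beyond the weight, and the field interpretation (taken from the Sapphire Rapids patch later in this series) are placeholders. ]

    #include <linux/perf_event.h>
    #include <string.h>

    static void init_attr(struct perf_event_attr *attr)
    {
            memset(attr, 0, sizeof(*attr));
            attr->size        = sizeof(*attr);
            attr->type        = PERF_TYPE_RAW;      /* placeholder event type */
            attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_WEIGHT_STRUCT;
            /* Setting both PERF_SAMPLE_WEIGHT and PERF_SAMPLE_WEIGHT_STRUCT
             * in one attr is rejected by perf_copy_attr() with -EINVAL. */
    }

    static void parse_weight(union perf_sample_weight w)
    {
            __u32 var1 = w.var1_dw;   /* e.g. cache access latency on SPR */
            __u16 var2 = w.var2_w;    /* e.g. instruction latency on SPR  */
            (void)var1; (void)var2;
    }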
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1611873611-156687-2-git-send-email-kan.liang@linux.intel.com [ Yunying Sun: amend commit log ] Signed-off-by: Yunying Sun Signed-off-by: Xinghui Li --- arch/powerpc/perf/core-book3s.c | 2 +- arch/x86/events/intel/ds.c | 17 ++++++------ include/linux/perf_event.h | 4 +-- include/uapi/linux/perf_event.h | 46 +++++++++++++++++++++++++++++++-- kernel/events/core.c | 17 +++++++++--- 5 files changed, 69 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 58de6cec907b..f5f973077c28 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2136,7 +2136,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, if (event->attr.sample_type & PERF_SAMPLE_WEIGHT && ppmu->get_mem_weight) - ppmu->get_mem_weight(&data.weight); + ppmu->get_mem_weight(&data.weight.full); if (perf_event_overflow(event, &data, regs)) power_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 384ccdb7e572..8642e10aef2e 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -957,7 +957,8 @@ static void adaptive_pebs_record_size_update(void) } #define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ - PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \ + PERF_SAMPLE_PHYS_ADDR | \ + PERF_SAMPLE_WEIGHT_TYPE | \ PERF_SAMPLE_TRANSACTION) static u64 pebs_update_adaptive_cfg(struct perf_event *event) @@ -983,7 +984,7 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event) gprs = (sample_type & PERF_SAMPLE_REGS_INTR) && (attr->sample_regs_intr & PEBS_GP_REGS); - tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) && + tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT_TYPE) && ((attr->config & INTEL_ARCH_EVENT_MASK) == x86_pmu.rtm_abort_event); @@ -1361,8 +1362,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * Use latency for weight (only avail with PEBS-LL) */ - if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) - data->weight = pebs->lat; + if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) + data->weight.full = pebs->lat; /* * data.data_src encodes the data source @@ -1454,8 +1455,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. 
*/ - if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) - data->weight = intel_get_tsx_weight(pebs->tsx_tuning); + if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) + data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); if (sample_type & PERF_SAMPLE_TRANSACTION) data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, @@ -1569,8 +1570,8 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } if (format_size & PEBS_DATACFG_MEMINFO) { - if (sample_type & PERF_SAMPLE_WEIGHT) - data->weight = meminfo->latency ?: + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + data->weight.full = meminfo->latency ?: intel_get_tsx_weight(meminfo->tsx_tuning); if (sample_type & PERF_SAMPLE_DATA_SRC) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2f73da9936cf..892e178ab089 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -973,7 +973,7 @@ struct perf_sample_data { struct perf_raw_record *raw; struct perf_branch_stack *br_stack; u64 period; - u64 weight; + union perf_sample_weight weight; u64 txn; union perf_mem_data_src data_src; @@ -1024,7 +1024,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->raw = NULL; data->br_stack = NULL; data->period = period; - data->weight = 0; + data->weight.full = 0; data->data_src.val = PERF_MEM_NA; data->txn = 0; } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 507eddef8715..2f705e5ba20e 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -141,12 +141,18 @@ enum perf_event_sample_format { PERF_SAMPLE_TRANSACTION = 1U << 17, PERF_SAMPLE_REGS_INTR = 1U << 18, PERF_SAMPLE_PHYS_ADDR = 1U << 19, + PERF_SAMPLE_AUX = 1U << 20, + PERF_SAMPLE_CGROUP = 1U << 21, + PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, + PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, + PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, - PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; +#define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) /* * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set * @@ -864,7 +870,24 @@ enum perf_event_type { * char data[size]; * u64 dyn_size; } && PERF_SAMPLE_STACK_USER * - * { u64 weight; } && PERF_SAMPLE_WEIGHT + * { union perf_sample_weight + * { + * u64 full; && PERF_SAMPLE_WEIGHT + * #if defined(__LITTLE_ENDIAN_BITFIELD) + * struct { + * u32 var1_dw; + * u16 var2_w; + * u16 var3_w; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #elif defined(__BIG_ENDIAN_BITFIELD) + * struct { + * u16 var3_w; + * u16 var2_w; + * u32 var1_dw; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #endif + * } + * } * { u64 data_src; } && PERF_SAMPLE_DATA_SRC * { u64 transaction; } && PERF_SAMPLE_TRANSACTION * { u64 abi; # enum perf_sample_regs_abi @@ -1185,4 +1208,23 @@ struct perf_branch_entry { reserved:40; }; +union perf_sample_weight { + __u64 full; +#if defined(__LITTLE_ENDIAN_BITFIELD) + struct { + __u32 var1_dw; + __u16 var2_w; + __u16 var3_w; + }; +#elif defined(__BIG_ENDIAN_BITFIELD) + struct { + __u16 var3_w; + __u16 var2_w; + __u32 var1_dw; + }; +#else +#error "Unknown endianness" +#endif +}; + #endif /* _UAPI_LINUX_PERF_EVENT_H */ diff --git a/kernel/events/core.c b/kernel/events/core.c index c07e264d170e..0f8cdcb69c4f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1758,8 +1758,8 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & 
PERF_SAMPLE_PERIOD) size += sizeof(data->period); - if (sample_type & PERF_SAMPLE_WEIGHT) - size += sizeof(data->weight); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + size += sizeof(data->weight.full); if (sample_type & PERF_SAMPLE_READ) size += event->read_size; @@ -6547,8 +6547,8 @@ void perf_output_sample(struct perf_output_handle *handle, data->regs_user.regs); } - if (sample_type & PERF_SAMPLE_WEIGHT) - perf_output_put(handle, data->weight); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + perf_output_put(handle, data->weight.full); if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); @@ -10802,6 +10802,15 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->sample_type & PERF_SAMPLE_REGS_INTR) ret = perf_reg_validate(attr->sample_regs_intr); + + if (attr->sample_type & (PERF_SAMPLE_AUX | PERF_SAMPLE_CGROUP + | PERF_SAMPLE_DATA_PAGE_SIZE | PERF_SAMPLE_CODE_PAGE_SIZE)) + return -EINVAL; + + if ((attr->sample_type & PERF_SAMPLE_WEIGHT) && + (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) + return -EINVAL; + out: return ret; -- Gitee From 75127a649782ffce42bd8a5b2e4c73801a2649cd Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 28 Jan 2021 14:40:08 -0800 Subject: [PATCH 2152/2161] perf/x86/intel: Factor out intel_update_topdown_event() commit 628d923a3c464db98c1c98bb1e0cd50804caf681 upstream. Similar to Ice Lake, Intel Sapphire Rapids server also supports the topdown performance metrics feature. The difference is that Intel Sapphire Rapids server extends the PERF_METRICS MSR to feature TMA method level two metrics, which will introduce 8 metrics events. Current icl_update_topdown_event() only check 4 level one metrics events. Factor out intel_update_topdown_event() to facilitate the code sharing between Ice Lake and Sapphire Rapids. Intel-SIG: commit 628d923a3c46 perf/x86/intel: Factor out intel_update_topdown_event() Backport for Sapphire Rapids core PMU support. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1611873611-156687-3-git-send-email-kan.liang@linux.intel.com [ Yunying Sun: amend commit log ] Signed-off-by: Yunying Sun Signed-off-by: Xinghui Li --- arch/x86/events/intel/core.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index f3fb46b3027b..ff32a3d3cc0b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2305,8 +2305,8 @@ static void __icl_update_topdown_event(struct perf_event *event, } } -static void update_saved_topdown_regs(struct perf_event *event, - u64 slots, u64 metrics) +static void update_saved_topdown_regs(struct perf_event *event, u64 slots, + u64 metrics, int metric_end) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *other; @@ -2315,7 +2315,7 @@ static void update_saved_topdown_regs(struct perf_event *event, event->hw.saved_slots = slots; event->hw.saved_metric = metrics; - for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) { + for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) { if (!is_topdown_idx(idx)) continue; other = cpuc->events[idx]; @@ -2330,7 +2330,8 @@ static void update_saved_topdown_regs(struct perf_event *event, * The PERF_METRICS and Fixed counter 3 are read separately. The values may be * modify by a NMI. PMU has to be disabled before calling this function. 
*/ -static u64 icl_update_topdown_event(struct perf_event *event) + +static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *other; @@ -2346,7 +2347,7 @@ static u64 icl_update_topdown_event(struct perf_event *event) /* read PERF_METRICS */ rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); - for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) { + for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) { if (!is_topdown_idx(idx)) continue; other = cpuc->events[idx]; @@ -2372,7 +2373,7 @@ static u64 icl_update_topdown_event(struct perf_event *event) * Don't need to reset the PERF_METRICS and Fixed counter 3. * Because the values will be restored in next schedule in. */ - update_saved_topdown_regs(event, slots, metrics); + update_saved_topdown_regs(event, slots, metrics, metric_end); reset = false; } @@ -2381,12 +2382,17 @@ static u64 icl_update_topdown_event(struct perf_event *event) wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0); wrmsrl(MSR_PERF_METRICS, 0); if (event) - update_saved_topdown_regs(event, 0, 0); + update_saved_topdown_regs(event, 0, 0, metric_end); } return slots; } +static u64 icl_update_topdown_event(struct perf_event *event) +{ + return intel_update_topdown_event(event, INTEL_PMC_IDX_TD_BE_BOUND); +} + static void intel_pmu_read_topdown_event(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); -- Gitee From 147956b19ef3eb33ae7e3531e0adc37bef8d21d7 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 28 Jan 2021 14:40:09 -0800 Subject: [PATCH 2153/2161] perf/x86/intel: Filter unsupported Topdown metrics event commit 1ab5f235c176e93adc4f75000aae6c50fea9db00 upstream. Intel Sapphire Rapids server will introduce 8 metrics events. Intel Ice Lake only supports 4 metrics events. A perf tool user may mistakenly use the unsupported events via RAW format on Ice Lake. The user can still get a value from the unsupported Topdown metrics event once the following Sapphire Rapids enabling patch is applied. To enable the 8 metrics events on Intel Sapphire Rapids, the INTEL_TD_METRIC_MAX has to be updated, which impacts the is_metric_event(). The is_metric_event() is a generic function. On Ice Lake, the newly added SPR metrics events will be mistakenly accepted as metric events on creation. At runtime, the unsupported Topdown metrics events will be updated. Add a variable num_topdown_events in x86_pmu to indicate the available number of the Topdown metrics event on the platform. Apply the number into is_metric_event(). Only the supported Topdown metrics events should be created as metrics events. Apply the num_topdown_events in icl_update_topdown_event() as well. The function can be reused by the following patch. Intel-SIG: commit 1ab5f235c176 perf/x86/intel: Filter unsupported Topdown metrics event Backport for Sapphire Rapids core PMU support. 
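[ Editorial note -- a worked example of the bound introduced here, using the metric encodings and num_topdown_events values from the hunks in this and the following patch. ]

    /* INTEL_TD_METRIC_AVAILABLE_MAX = INTEL_TD_METRIC_RETIRING
     *                                 + ((x86_pmu.num_topdown_events - 1) << 8)
     *
     * Ice Lake        (num_topdown_events = 4): 0x8000 + (3 << 8) = 0x8300
     *                  -> topdown-be-bound is the last accepted metric event
     * Sapphire Rapids (num_topdown_events = 8): 0x8000 + (7 << 8) = 0x8700
     *                  -> topdown-mem-bound is the last accepted metric event
     *
     * A raw metric config above the per-model maximum is no longer created
     * as a metric event, so the SPR level-2 events cannot be opened on
     * Ice Lake.
     */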
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1611873611-156687-4-git-send-email-kan.liang@linux.intel.com [ Yunying Sun: amend commit log ] Signed-off-by: Yunying Sun Signed-off-by: Xinghui Li --- arch/x86/events/intel/core.c | 15 +++++++++++++-- arch/x86/events/perf_event.h | 1 + arch/x86/include/asm/perf_event.h | 10 ++++++++-- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index ff32a3d3cc0b..78cbacf8113a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2390,7 +2390,8 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) static u64 icl_update_topdown_event(struct perf_event *event) { - return intel_update_topdown_event(event, INTEL_PMC_IDX_TD_BE_BOUND); + return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE + + x86_pmu.num_topdown_events - 1); } static void intel_pmu_read_topdown_event(struct perf_event *event) @@ -3436,6 +3437,15 @@ static int core_pmu_hw_config(struct perf_event *event) return intel_pmu_bts_config(event); } +#define INTEL_TD_METRIC_AVAILABLE_MAX (INTEL_TD_METRIC_RETIRING + \ + ((x86_pmu.num_topdown_events - 1) << 8)) + +static bool is_available_metric_event(struct perf_event *event) +{ + return is_metric_event(event) && + event->attr.config <= INTEL_TD_METRIC_AVAILABLE_MAX; +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -3509,7 +3519,7 @@ static int intel_pmu_hw_config(struct perf_event *event) if (event->attr.config & X86_ALL_EVENT_FLAGS) return -EINVAL; - if (is_metric_event(event)) { + if (is_available_metric_event(event)) { struct perf_event *leader = event->group_leader; /* The metric events don't support sampling. 
*/ @@ -5294,6 +5304,7 @@ __init int intel_pmu_init(void) x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); x86_pmu.lbr_pt_coexist = true; intel_pmu_pebs_data_source_skl(pmem); + x86_pmu.num_topdown_events = 4; x86_pmu.update_topdown_event = icl_update_topdown_event; x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; pr_cont("Icelake events, "); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 1af4caedf10f..df4c1c2ad44f 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -774,6 +774,7 @@ struct x86_pmu { /* * Intel perf metrics */ + int num_topdown_events; u64 (*update_topdown_event)(struct perf_event *event); int (*set_topdown_event_period)(struct perf_event *event); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 65c338ef434c..6c83db8a4a1e 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -278,8 +278,14 @@ struct x86_pmu_capability { #define INTEL_TD_METRIC_BAD_SPEC 0x8100 /* Bad speculation metric */ #define INTEL_TD_METRIC_FE_BOUND 0x8200 /* FE bound metric */ #define INTEL_TD_METRIC_BE_BOUND 0x8300 /* BE bound metric */ -#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_BE_BOUND -#define INTEL_TD_METRIC_NUM 4 +/* Level 2 metrics */ +#define INTEL_TD_METRIC_HEAVY_OPS 0x8400 /* Heavy Operations metric */ +#define INTEL_TD_METRIC_BR_MISPREDICT 0x8500 /* Branch Mispredict metric */ +#define INTEL_TD_METRIC_FETCH_LAT 0x8600 /* Fetch Latency metric */ +#define INTEL_TD_METRIC_MEM_BOUND 0x8700 /* Memory bound metric */ + +#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND +#define INTEL_TD_METRIC_NUM 8 static inline bool is_metric_idx(int idx) { -- Gitee From 8b7cf3eea3c4c01f465c8c0beef8322e857d3f6e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 28 Jan 2021 14:40:10 -0800 Subject: [PATCH 2154/2161] perf/x86/intel: Add perf core PMU support for Sapphire Rapids commit 61b985e3e775a3a75fda04ce7ef1b1aefc4758bc upstream. Add perf core PMU support for the Intel Sapphire Rapids server, which is the successor of the Intel Ice Lake server. The enabling code is based on Ice Lake, but there are several new features introduced. The event encoding is changed and simplified, e.g., the event codes which are below 0x90 are restricted to counters 0-3. The event codes which above 0x90 are likely to have no restrictions. The event constraints, extra_regs(), and hardware cache events table are changed accordingly. A new Precise Distribution (PDist) facility is introduced, which further minimizes the skid when a precise event is programmed on the GP counter 0. Enable the Precise Distribution (PDist) facility with :ppp event. For this facility to work, the period must be initialized with a value larger than 127. Add spr_limit_period() to apply the limit for :ppp event. Two new data source fields, data block & address block, are added in the PEBS Memory Info Record for the load latency event. To enable the feature, - An auxiliary event has to be enabled together with the load latency event on Sapphire Rapids. A new flag PMU_FL_MEM_LOADS_AUX is introduced to indicate the case. A new event, mem-loads-aux, is exposed to sysfs for the user tool. Add a check in hw_config(). If the auxiliary event is not detected, return an unique error -ENODATA. - The union perf_mem_data_src is extended to support the new fields. - Ice Lake and earlier models do not support block information, but the fields may be set by HW on some machines. 
Add pebs_no_block to explicitly indicate the previous platforms which don't support the new block fields. Accessing the new block fields are ignored on those platforms. A new store Latency facility is introduced, which leverages the PEBS facility where it can provide additional information about sampled stores. The additional information includes the data address, memory auxiliary info (e.g. Data Source, STLB miss) and the latency of the store access. To enable the facility, the new event (0x02cd) has to be programed on the GP counter 0. A new flag PERF_X86_EVENT_PEBS_STLAT is introduced to indicate the event. The store_latency_data() is introduced to parse the memory auxiliary info. The layout of access latency field of PEBS Memory Info Record has been changed. Two latency, instruction latency (bit 15:0) and cache access latency (bit 47:32) are recorded. - The cache access latency is similar to previous memory access latency. For loads, the latency starts by the actual cache access until the data is returned by the memory subsystem. For stores, the latency starts when the demand write accesses the L1 data cache and lasts until the cacheline write is completed in the memory subsystem. The cache access latency is stored in low 32bits of the sample type PERF_SAMPLE_WEIGHT_STRUCT. - The instruction latency starts by the dispatch of the load operation for execution and lasts until completion of the instruction it belongs to. Add a new flag PMU_FL_INSTR_LATENCY to indicate the instruction latency support. The instruction latency is stored in the bit 47:32 of the sample type PERF_SAMPLE_WEIGHT_STRUCT. Extends the PERF_METRICS MSR to feature TMA method level 2 metrics. The lower half of the register is the TMA level 1 metrics (legacy). The upper half is also divided into four 8-bit fields for the new level 2 metrics. Expose all eight Topdown metrics events to user space. The full description for the SPR features can be found at Intel Architecture Instruction Set Extensions and Future Features Programming Reference, 319433-041. Intel-SIG: commit 61b985e3e775 perf/x86/intel: Add perf core PMU support for Sapphire Rapids Backport for Sapphire Rapids core PMU support. 
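[ Editorial illustration -- not part of the patch. A sketch of the group shape the new hw_config() check expects on Sapphire Rapids when the load latency event requests PERF_SAMPLE_DATA_SRC; the raw encodings come from the sysfs aliases added below (mem-loads-aux = event 0x03, umask 0x82; load latency = event 0xcd, umask 0x01), while the ldlat threshold, period, and error handling are placeholders. ]

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static long perf_open(struct perf_event_attr *a, int group_fd)
    {
            return syscall(__NR_perf_event_open, a, 0, -1, group_fd, 0);
    }

    static void open_spr_load_latency_group(void)
    {
            struct perf_event_attr aux = {
                    .type = PERF_TYPE_RAW, .size = sizeof(aux),
                    .config = 0x8203,               /* event 0x03, umask 0x82 */
            };
            struct perf_event_attr load = {
                    .type = PERF_TYPE_RAW, .size = sizeof(load),
                    .config = 0x01cd,               /* event 0xcd, umask 0x01 */
                    .config1 = 3,                   /* placeholder ldlat threshold */
                    .sample_type = PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_WEIGHT_STRUCT,
                    .precise_ip = 3, .sample_period = 10007,
            };
            int leader = perf_open(&aux, -1);       /* auxiliary event leads the group */
            perf_open(&load, leader);               /* load latency joins behind it    */
            /* Without the leading mem-loads-aux event, the load latency event is
             * rejected with -ENODATA by the hw_config() check added below. */
    }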
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1611873611-156687-5-git-send-email-kan.liang@linux.intel.com [ Yunying Sun: amend commit log ] Signed-off-by: Yunying Sun Signed-off-by: Xinghui Li --- arch/x86/events/intel/core.c | 307 +++++++++++++++++++++++++++++- arch/x86/events/intel/ds.c | 118 +++++++++++- arch/x86/events/perf_event.h | 12 +- arch/x86/include/asm/perf_event.h | 8 +- include/uapi/linux/perf_event.h | 12 +- 5 files changed, 443 insertions(+), 14 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 78cbacf8113a..0e2653245725 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -275,6 +275,55 @@ static struct extra_reg intel_icl_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct extra_reg intel_spr_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_spr_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_HEAVY_OPS, 4), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BR_MISPREDICT, 5), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7), + + INTEL_EVENT_CONSTRAINT(0x2e, 0xff), + INTEL_EVENT_CONSTRAINT(0x3c, 0xff), + /* + * Generally event codes < 0x90 are restricted to counters 0-3. + * The 0x2E and 0x3C are exception, which has no restriction. + */ + INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf), + + INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x08a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x02cd, 0x1), + INTEL_EVENT_CONSTRAINT(0xce, 0x1), + INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf), + /* + * Generally event codes >= 0x90 are likely to have no restrictions. + * The exception are defined as above. 
+ */ + INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0xff), + + EVENT_CONSTRAINT_END +}; + + EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); @@ -314,11 +363,15 @@ EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles, EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale, "4", "2"); -EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4"); -EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80"); -EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81"); -EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82"); -EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83"); +EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4"); +EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80"); +EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81"); +EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82"); +EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83"); +EVENT_ATTR_STR(topdown-heavy-ops, td_heavy_ops, "event=0x00,umask=0x84"); +EVENT_ATTR_STR(topdown-br-mispredict, td_br_mispredict, "event=0x00,umask=0x85"); +EVENT_ATTR_STR(topdown-fetch-lat, td_fetch_lat, "event=0x00,umask=0x86"); +EVENT_ATTR_STR(topdown-mem-bound, td_mem_bound, "event=0x00,umask=0x87"); static struct attribute *snb_events_attrs[] = { EVENT_PTR(td_slots_issued), @@ -384,6 +437,108 @@ static u64 intel_pmu_event_map(int hw_event) return intel_perfmon_event_map[hw_event]; } +static __initconst const u64 spr_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, + [ C(RESULT_MISS) ] = 0xe124, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_MISS) ] = 0xe424, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x12a, + [ C(RESULT_MISS) ] = 0x12a, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x12a, + [ C(RESULT_MISS) ] = 0x12a, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, + [ C(RESULT_MISS) ] = 0xe12, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, + [ C(RESULT_MISS) ] = 0xe13, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = 0xe11, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4c4, + [ C(RESULT_MISS) ] = 0x4c5, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x12a, + [ C(RESULT_MISS) ] = 0x12a, + }, + }, +}; + +static __initconst const u64 spr_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x10001, + [ C(RESULT_MISS) ] = 0x3fbfc00001, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x3f3ffc0002, + [ 
C(RESULT_MISS) ] = 0x3f3fc00002, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x10c000001, + [ C(RESULT_MISS) ] = 0x3fb3000001, + }, + }, +}; + /* * Notes on the events: * - data reads do not include code reads (comparable to earlier tables) @@ -3446,6 +3601,17 @@ static bool is_available_metric_event(struct perf_event *event) event->attr.config <= INTEL_TD_METRIC_AVAILABLE_MAX; } +static inline bool is_mem_loads_event(struct perf_event *event) +{ + return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0xcd, .umask=0x01); +} + +static inline bool is_mem_loads_aux_event(struct perf_event *event) +{ + return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0x03, .umask=0x82); +} + + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -3548,6 +3714,33 @@ static int intel_pmu_hw_config(struct perf_event *event) } } + /* + * The load latency event X86_CONFIG(.event=0xcd, .umask=0x01) on SPR + * doesn't function quite right. As a work-around it needs to always be + * co-scheduled with a auxiliary event X86_CONFIG(.event=0x03, .umask=0x82). + * The actual count of this second event is irrelevant it just needs + * to be active to make the first event function correctly. + * + * In a group, the auxiliary event must be in front of the load latency + * event. The rule is to simplify the implementation of the check. + * That's because perf cannot have a complete group at the moment. + */ + if (x86_pmu.flags & PMU_FL_MEM_LOADS_AUX && + (event->attr.sample_type & PERF_SAMPLE_DATA_SRC) && + is_mem_loads_event(event)) { + struct perf_event *leader = event->group_leader; + struct perf_event *sibling = NULL; + + if (!is_mem_loads_aux_event(leader)) { + for_each_sibling_event(sibling, leader) { + if (is_mem_loads_aux_event(sibling)) + break; + } + if (list_entry_is_head(sibling, &leader->sibling_list, sibling_list)) + return -ENODATA; + } + } + if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY)) return 0; @@ -3735,6 +3928,29 @@ icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx, return hsw_get_event_constraints(cpuc, idx, event); } +static struct event_constraint * +spr_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + c = icl_get_event_constraints(cpuc, idx, event); + + /* + * The :ppp indicates the Precise Distribution (PDist) facility, which + * is only supported on the GP counter 0. If a :ppp event which is not + * available on the GP counter 0, error out. 
+ */ + if (event->attr.precise_ip == 3) { + if (c->idxmsk64 & BIT_ULL(0)) + return &counter0_constraint; + + return &emptyconstraint; + } + + return c; +} + static struct event_constraint * glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) @@ -3824,6 +4040,14 @@ static u64 nhm_limit_period(struct perf_event *event, u64 left) return max(left, 32ULL); } +static u64 spr_limit_period(struct perf_event *event, u64 left) +{ + if (event->attr.precise_ip == 3) + return max(left, 128ULL); + + return left; +} + PMU_FORMAT_ATTR(event, "config:0-7" ); PMU_FORMAT_ATTR(umask, "config:8-15" ); PMU_FORMAT_ATTR(edge, "config:18" ); @@ -4538,6 +4762,42 @@ static struct attribute *icl_tsx_events_attrs[] = { NULL, }; + +EVENT_ATTR_STR(mem-stores, mem_st_spr, "event=0xcd,umask=0x2"); +EVENT_ATTR_STR(mem-loads-aux, mem_ld_aux, "event=0x03,umask=0x82"); + +static struct attribute *spr_events_attrs[] = { + EVENT_PTR(mem_ld_hsw), + EVENT_PTR(mem_st_spr), + EVENT_PTR(mem_ld_aux), + NULL, +}; + +static struct attribute *spr_td_events_attrs[] = { + EVENT_PTR(slots), + EVENT_PTR(td_retiring), + EVENT_PTR(td_bad_spec), + EVENT_PTR(td_fe_bound), + EVENT_PTR(td_be_bound), + EVENT_PTR(td_heavy_ops), + EVENT_PTR(td_br_mispredict), + EVENT_PTR(td_fetch_lat), + EVENT_PTR(td_mem_bound), + NULL, +}; + +static struct attribute *spr_tsx_events_attrs[] = { + EVENT_PTR(tx_start), + EVENT_PTR(tx_abort), + EVENT_PTR(tx_commit), + EVENT_PTR(tx_capacity_read), + EVENT_PTR(tx_capacity_write), + EVENT_PTR(tx_conflict), + EVENT_PTR(cycles_t), + EVENT_PTR(cycles_ct), + NULL, +}; + static ssize_t freeze_on_smi_show(struct device *cdev, struct device_attribute *attr, char *buf) @@ -5311,6 +5571,43 @@ __init int intel_pmu_init(void) name = "icelake"; break; + case INTEL_FAM6_SAPPHIRERAPIDS_X: + pmem = true; + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + + x86_pmu.event_constraints = intel_spr_event_constraints; + x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints; + x86_pmu.extra_regs = intel_spr_extra_regs; + x86_pmu.limit_period = spr_limit_period; + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.pebs_block = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + x86_pmu.flags |= PMU_FL_PEBS_ALL; + x86_pmu.flags |= PMU_FL_INSTR_LATENCY; + x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = spr_get_event_constraints; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? 
+ hsw_format_attr : nhm_format_attr; + extra_skl_attr = skl_format_attr; + mem_attr = spr_events_attrs; + td_attr = spr_td_events_attrs; + tsx_attr = spr_tsx_events_attrs; + x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); + x86_pmu.lbr_pt_coexist = true; + intel_pmu_pebs_data_source_skl(pmem); + x86_pmu.num_topdown_events = 8; + x86_pmu.update_topdown_event = icl_update_topdown_event; + x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; + pr_cont("Sapphire Rapids events, "); + name = "sapphire_rapids"; + break; + default: switch (x86_pmu.version) { case 1: diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 8642e10aef2e..4bb139db1a7b 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -35,7 +35,9 @@ union intel_x86_pebs_dse { unsigned int ld_dse:4; unsigned int ld_stlb_miss:1; unsigned int ld_locked:1; - unsigned int ld_reserved:26; + unsigned int ld_data_blk:1; + unsigned int ld_addr_blk:1; + unsigned int ld_reserved:24; }; struct { unsigned int st_l1d_hit:1; @@ -44,6 +46,12 @@ union intel_x86_pebs_dse { unsigned int st_locked:1; unsigned int st_reserved2:26; }; + struct { + unsigned int st_lat_dse:4; + unsigned int st_lat_stlb_miss:1; + unsigned int st_lat_locked:1; + unsigned int ld_reserved3:26; + }; }; @@ -197,6 +205,63 @@ static u64 load_latency_data(u64 status) if (dse.ld_locked) val |= P(LOCK, LOCKED); + /* + * Ice Lake and earlier models do not support block infos. + */ + if (!x86_pmu.pebs_block) { + val |= P(BLK, NA); + return val; + } + /* + * bit 6: load was blocked since its data could not be forwarded + * from a preceding store + */ + if (dse.ld_data_blk) + val |= P(BLK, DATA); + + /* + * bit 7: load was blocked due to potential address conflict with + * a preceding store + */ + if (dse.ld_addr_blk) + val |= P(BLK, ADDR); + + if (!dse.ld_data_blk && !dse.ld_addr_blk) + val |= P(BLK, NA); + + return val; +} + +static u64 store_latency_data(u64 status) +{ + union intel_x86_pebs_dse dse; + u64 val; + + dse.val = status; + + /* + * use the mapping table for bit 0-3 + */ + val = pebs_data_source[dse.st_lat_dse]; + + /* + * bit 4: TLB access + * 0 = did not miss 2nd level TLB + * 1 = missed 2nd level TLB + */ + if (dse.st_lat_stlb_miss) + val |= P(TLB, MISS) | P(TLB, L2); + else + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + /* + * bit 5: locked prefix + */ + if (dse.st_lat_locked) + val |= P(LOCK, LOCKED); + + val |= P(BLK, NA); + return val; } @@ -867,6 +932,28 @@ struct event_constraint intel_icl_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_spr_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), + + INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe), + INTEL_PLD_CONSTRAINT(0x1cd, 0xfe), + INTEL_PSD_CONSTRAINT(0x2cd, 0x1), + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf), + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf), + + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), + + INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf), + + /* + * Everything else is handled by PMU_FL_PEBS_ALL, because we + * need the full constraints from the main table. 
+ */ + + EVENT_CONSTRAINT_END +}; + struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *c; @@ -1328,6 +1415,8 @@ static u64 get_data_src(struct perf_event *event, u64 aux) if (fl & PERF_X86_EVENT_PEBS_LDLAT) val = load_latency_data(aux); + else if (fl & PERF_X86_EVENT_PEBS_STLAT) + val = store_latency_data(aux); else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) val = precise_datala_hsw(event, aux); else if (fst) @@ -1500,6 +1589,9 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs, #endif } +#define PEBS_LATENCY_MASK 0xffff +#define PEBS_CACHE_LATENCY_OFFSET 32 + /* * With adaptive PEBS the layout depends on what fields are configured. */ @@ -1570,9 +1662,27 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } if (format_size & PEBS_DATACFG_MEMINFO) { - if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) - data->weight.full = meminfo->latency ?: - intel_get_tsx_weight(meminfo->tsx_tuning); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { + u64 weight = meminfo->latency; + + if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) { + data->weight.var2_w = weight & PEBS_LATENCY_MASK; + weight >>= PEBS_CACHE_LATENCY_OFFSET; + } + + /* + * Although meminfo::latency is defined as a u64, + * only the lower 32 bits include the valid data + * in practice on Ice Lake and earlier platforms. + */ + if (sample_type & PERF_SAMPLE_WEIGHT) { + data->weight.full = weight ?: + intel_get_tsx_weight(meminfo->tsx_tuning); + } else { + data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?: + intel_get_tsx_weight(meminfo->tsx_tuning); + } + } if (sample_type & PERF_SAMPLE_DATA_SRC) data->data_src.val = get_data_src(event, meminfo->aux); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index df4c1c2ad44f..3c1b2f5428de 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -81,6 +81,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) #define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ #define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */ #define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */ +#define PERF_X86_EVENT_PEBS_STLAT 0x8000 /* st+stlat data address sampling */ static inline bool is_topdown_count(struct perf_event *event) { @@ -443,6 +444,10 @@ struct cpu_hw_events { __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) +#define INTEL_PSD_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_STLAT) + #define INTEL_PST_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) @@ -722,7 +727,8 @@ struct x86_pmu { pebs_broken :1, pebs_prec_dist :1, pebs_no_tlb :1, - pebs_no_isolation :1; + pebs_no_isolation :1, + pebs_block :1; int pebs_record_size; int pebs_buffer_size; int max_pebs_events; @@ -870,6 +876,8 @@ do { \ #define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */ #define PMU_FL_TFA 0x20 /* deal with TSX force abort */ #define PMU_FL_PAIR 0x40 /* merge counters for large incr. 
events */ +#define PMU_FL_INSTR_LATENCY 0x80 /* Support Instruction Latency in PEBS Memory Info Record */ +#define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr @@ -1156,6 +1164,8 @@ extern struct event_constraint intel_skl_pebs_event_constraints[]; extern struct event_constraint intel_icl_pebs_event_constraints[]; +extern struct event_constraint intel_spr_pebs_event_constraints[]; + struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_add(struct perf_event *event); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 6c83db8a4a1e..abb5bb10e032 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -259,8 +259,12 @@ struct x86_pmu_capability { #define INTEL_PMC_IDX_TD_BAD_SPEC (INTEL_PMC_IDX_METRIC_BASE + 1) #define INTEL_PMC_IDX_TD_FE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 2) #define INTEL_PMC_IDX_TD_BE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 3) -#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_BE_BOUND -#define INTEL_PMC_MSK_TOPDOWN ((0xfull << INTEL_PMC_IDX_METRIC_BASE) | \ +#define INTEL_PMC_IDX_TD_HEAVY_OPS (INTEL_PMC_IDX_METRIC_BASE + 4) +#define INTEL_PMC_IDX_TD_BR_MISPREDICT (INTEL_PMC_IDX_METRIC_BASE + 5) +#define INTEL_PMC_IDX_TD_FETCH_LAT (INTEL_PMC_IDX_METRIC_BASE + 6) +#define INTEL_PMC_IDX_TD_MEM_BOUND (INTEL_PMC_IDX_METRIC_BASE + 7) +#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_MEM_BOUND +#define INTEL_PMC_MSK_TOPDOWN ((0xffull << INTEL_PMC_IDX_METRIC_BASE) | \ INTEL_PMC_MSK_FIXED_SLOTS) /* diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 2f705e5ba20e..a49001fa86e2 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1087,14 +1087,16 @@ union perf_mem_data_src { mem_lvl_num:4, /* memory hierarchy level number */ mem_remote:1, /* remote */ mem_snoopx:2, /* snoop mode, ext */ - mem_rsvd:24; + mem_blk:3, /* access blocked */ + mem_rsvd:21; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd:24, + __u64 mem_rsvd:21, + mem_blk:3, /* access blocked */ mem_snoopx:2, /* snoop mode, ext */ mem_remote:1, /* remote */ mem_lvl_num:4, /* memory hierarchy level number */ @@ -1177,6 +1179,12 @@ union perf_mem_data_src { #define PERF_MEM_TLB_OS 0x40 /* OS fault handler */ #define PERF_MEM_TLB_SHIFT 26 +/* Access blocked */ +#define PERF_MEM_BLK_NA 0x01 /* not available */ +#define PERF_MEM_BLK_DATA 0x02 /* data could not be forwarded */ +#define PERF_MEM_BLK_ADDR 0x04 /* address conflict */ +#define PERF_MEM_BLK_SHIFT 40 + #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) -- Gitee From f9d007e842f4e4cc00c4d3cb09c688cfaafde701 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 28 Jan 2021 14:40:11 -0800 Subject: [PATCH 2155/2161] perf/x86/intel: Support CPUID 10.ECX to disable fixed counters commit 32451614da2a9cf4296f90d3606ac77814fb519d upstream. With Architectural Performance Monitoring Version 5, CPUID 10.ECX cpu leaf indicates the fixed counter enumeration. This extends the previous count to a bitmap which allows disabling even lower fixed counters. It could be used by a Hypervisor. The existing intel_ctrl variable is used to remember the bitmask of the counters. All code that reads all counters is fixed to check this extra bitmask. 
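For reference, a minimal user-space sketch of the enumeration this change keys off (assuming a GCC/clang <cpuid.h> that provides __get_cpuid_count; this is editorial illustration, not part of the patch): before version 5, leaf 0xA only reports a count of fixed counters in EDX[4:0], while version 5 adds a bitmap in ECX that may legitimately contain holes.

  #include <cpuid.h>
  #include <stdio.h>

  int main(void)
  {
          unsigned int eax, ebx, ecx, edx;

          if (!__get_cpuid_count(0xa, 0, &eax, &ebx, &ecx, &edx))
                  return 1;

          unsigned int version = eax & 0xff;   /* architectural perfmon version */
          unsigned int num_fixed = edx & 0x1f; /* fixed counter count (v2+) */
          unsigned int fixed_mask = ecx;       /* fixed counter bitmap (v5+) */

          if (version >= 5)
                  printf("fixed counters from ECX bitmap: %#x\n", fixed_mask);
          else if (version > 1)
                  printf("fixed counters assumed contiguous: %#x\n",
                         (1u << num_fixed) - 1);
          return 0;
  }

This mirrors the version >= 5 branch below, where the driver derives num_counters_fixed from fls(fixed_mask) and folds the bitmap into intel_ctrl so that fixed_counter_disabled() can skip holes.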
Intel-SIG: commit 32451614da2a perf/x86/intel: Support CPUID 10.ECX to disable fixed counters Backport for Sapphire Rapids core PMU support. Suggested-by: Peter Zijlstra (Intel) Originally-by: Andi Kleen Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1611873611-156687-6-git-send-email-kan.liang@linux.intel.com [ Yunying Sun: amend commit log ] Signed-off-by: Yunying Sun Signed-off-by: Xinghui Li --- arch/x86/events/core.c | 8 +++++++- arch/x86/events/intel/core.c | 34 ++++++++++++++++++++++++---------- arch/x86/events/perf_event.h | 5 +++++ 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 39bb48bd722a..af598e2f362c 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -223,6 +223,8 @@ static bool check_hw_exists(void) if (ret) goto msr_fail; for (i = 0; i < x86_pmu.num_counters_fixed; i++) { + if (fixed_counter_disabled(i)) + continue; if (val & (0x03 << i*4)) { bios_fail = 1; val_fail = val; @@ -1500,6 +1502,8 @@ void perf_event_print_debug(void) cpu, idx, prev_left); } for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + if (fixed_counter_disabled(idx)) + continue; rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -1938,7 +1942,9 @@ static int __init init_hw_perf_events(void) pr_info("... generic registers: %d\n", x86_pmu.num_counters); pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); + pr_info("... fixed-purpose events: %lu\n", + hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1) + << INTEL_PMC_IDX_FIXED) & x86_pmu.intel_ctrl)); pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); /* diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 0e2653245725..4a67c42ac7a9 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2703,8 +2703,11 @@ static void intel_pmu_reset(void) wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + if (fixed_counter_disabled(idx)) + continue; wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } if (ds) ds->bts_index = ds->bts_buffer_base; @@ -5021,7 +5024,7 @@ __init int intel_pmu_init(void) union cpuid10_eax eax; union cpuid10_ebx ebx; struct event_constraint *c; - unsigned int unused; + unsigned int fixed_mask; struct extra_reg *er; bool pmem = false; int version, i; @@ -5043,7 +5046,7 @@ __init int intel_pmu_init(void) * Check whether the Architectural PerfMon supports * Branch Misses Retired hw_event or not. 
*/ - cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + cpuid(10, &eax.full, &ebx.full, &fixed_mask, &edx.full); if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) return -ENODEV; @@ -5067,12 +5070,15 @@ __init int intel_pmu_init(void) * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events, when not running in a hypervisor: */ - if (version > 1) { + if (version > 1 && version < 5) { int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR); x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, assume); - } + + fixed_mask = (1L << x86_pmu.num_counters_fixed) - 1; + } else if (version >= 5) + x86_pmu.num_counters_fixed = fls(fixed_mask); if (boot_cpu_has(X86_FEATURE_PDCM)) { u64 capabilities; @@ -5650,8 +5656,7 @@ __init int intel_pmu_init(void) x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; } - x86_pmu.intel_ctrl |= - ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + x86_pmu.intel_ctrl |= (u64)fixed_mask << INTEL_PMC_IDX_FIXED; if (x86_pmu.event_constraints) { /* @@ -5664,13 +5669,22 @@ __init int intel_pmu_init(void) * events to the generic counters. */ if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) { + /* + * Disable topdown slots and metrics events, + * if slots event is not in CPUID. + */ + if (!(INTEL_PMC_MSK_FIXED_SLOTS & x86_pmu.intel_ctrl)) + c->idxmsk64 = 0; c->weight = hweight64(c->idxmsk64); continue; } - if (c->cmask == FIXED_EVENT_FLAGS - && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) { - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + if (c->cmask == FIXED_EVENT_FLAGS) { + /* Disabled fixed counters which are not in CPUID */ + c->idxmsk64 &= x86_pmu.intel_ctrl; + + if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; } c->idxmsk64 &= ~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 3c1b2f5428de..325c74704fa8 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1067,6 +1067,11 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); +static inline bool fixed_counter_disabled(int i) +{ + return !(x86_pmu.intel_ctrl >> (i + INTEL_PMC_IDX_FIXED)); +} + #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); -- Gitee From 93be7d014f8d402e985aa4c85fd3a3891f4233e3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 15 Oct 2020 20:11:31 -0700 Subject: [PATCH 2156/2161] include/linux/list.h: add a macro to test if entry is pointing to the head commit e130816164e244b692921de49771eeb28205152d upstream. Add a macro to test if entry is pointing to the head of the list which is useful in cases like: list_for_each_entry(pos, &head, member) { if (cond) break; } if (list_entry_is_head(pos, &head, member)) return -ERRNO; that allows to avoid additional variable to be added to track if loop has not been stopped in the middle. While here, convert list_for_each_entry*() family of macros to use a new one. 
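As an editorial illustration of the intended usage (the struct and helper below are hypothetical, not part of this patch), the macro lets a lookup loop report "not found" without carrying an extra flag variable:

  #include <linux/list.h>

  struct item {
          int id;
          struct list_head node;
  };

  static struct item *find_item(struct list_head *head, int id)
  {
          struct item *pos;

          list_for_each_entry(pos, head, node) {
                  if (pos->id == id)
                          break;
          }

          /* without the macro this check had to be open-coded as
           * &pos->node == head, or tracked via a separate variable */
          if (list_entry_is_head(pos, head, node))
                  return NULL;

          return pos;
  }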
Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Reviewed-by: Cezary Rojewski Link: https://lkml.kernel.org/r/20200929134342.51489-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Torvalds Signed-off-by: Xinghui Li --- include/linux/list.h | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index 85c92555e31f..ce19c6b632a5 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -567,6 +567,15 @@ static inline void list_splice_tail_init(struct list_head *list, pos != (head); \ pos = n, n = pos->prev) +/** + * list_entry_is_head - test if the entry points to the head of the list + * @pos: the type * to cursor + * @head: the head for your list. + * @member: the name of the list_head within the struct. + */ +#define list_entry_is_head(pos, head, member) \ + (&pos->member == (head)) + /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. @@ -575,7 +584,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -586,7 +595,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -611,7 +620,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -625,7 +634,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -637,7 +646,7 @@ static inline void list_splice_tail_init(struct list_head *list, * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ - for (; &pos->member != (head); \ + for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -650,7 +659,7 @@ static inline void list_splice_tail_init(struct list_head *list, * Iterate backwards over list of given type, continuing from current position. 
*/ #define list_for_each_entry_from_reverse(pos, head, member) \ - for (; &pos->member != (head); \ + for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -663,7 +672,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -679,7 +688,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -694,7 +703,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -710,7 +719,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** -- Gitee From f96a27033189ca18f71c36a766df2ebf53402ad1 Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Sat, 1 Oct 2022 04:15:27 +0800 Subject: [PATCH 2157/2161] dmaengine: idxd: Fix max batch size for Intel IAA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e8dbd6445dd6b38c4c50410a86f13158486ee99a upstream. >From Intel IAA spec [1], Intel IAA does not support batch processing. Two batch related default values for IAA are incorrect in current code: (1) The max batch size of device is set during device initialization, that indicates batch is supported. It should be always 0 on IAA. (2) The max batch size of work queue is set to WQ_DEFAULT_MAX_BATCH (32) as the default value regardless of Intel DSA or IAA device during work queue setup and cleanup. It should be always 0 on IAA. Fix the issues by setting the max batch size of device and max batch size of work queue to 0 on IAA device, that means batch is not supported. 
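Stated as a plain-C model (the enum and helper here are simplified stand-ins for illustration; the real driver stores the value in struct idxd_device/idxd_wq as the hunks below show), the rule is that the enumerated batch capability is ignored whenever the device is an IAA:

  enum idxd_dev_type { TYPE_DSA, TYPE_IAX };  /* simplified stand-ins */

  static unsigned int effective_max_batch(enum idxd_dev_type type,
                                          unsigned int hw_max_batch)
  {
          /* IAA has no batch descriptors, so never advertise a batch size */
          return type == TYPE_IAX ? 0 : hw_max_batch;
  }

Applying that rule at device init, work queue setup/cleanup and the sysfs store path keeps max_batch_size from ever becoming non-zero on IAA.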
[1]: https://cdrdv2.intel.com/v1/dl/getContent/721858 [SPR kernel development] Merge the SPR patch series into kernel+kvm: http://tapd.oa.com/Virtualization/prong/stories/view/1020422237869387499 --story=869387499 Fixes: 23084545dbb0 ("dmaengine: idxd: set max_xfer and max_batch for RO device") Fixes: 92452a72ebdf ("dmaengine: idxd: set defaults for wq configs") Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Signed-off-by: Xiaochen Shen Reviewed-by: Dave Jiang Reviewed-by: Fenghua Yu Link: https://lore.kernel.org/r/20220930201528.18621-2-xiaochen.shen@intel.com Signed-off-by: Vinod Koul Signed-off-by: Chen Zhuo Signed-off-by: Xinghui Li --- drivers/dma/idxd/device.c | 6 +++--- drivers/dma/idxd/idxd.h | 32 ++++++++++++++++++++++++++++++++ drivers/dma/idxd/init.c | 4 ++-- drivers/dma/idxd/sysfs.c | 2 +- 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index ddde520c5e0c..328b2a498a57 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -525,7 +525,7 @@ static void idxd_wq_disable_cleanup(struct idxd_wq *wq) memset(wq->name, 0, WQ_NAME_SIZE); memset(wq->driver_name, 0, WQ_NAME_SIZE); wq->max_xfer_bytes = WQ_DEFAULT_MAX_XFER; - wq->max_batch_size = WQ_DEFAULT_MAX_BATCH; + idxd_wq_set_max_batch_size(idxd->data->type, wq, WQ_DEFAULT_MAX_BATCH); } static void idxd_wq_device_reset_cleanup(struct idxd_wq *wq) @@ -1033,7 +1033,7 @@ static int idxd_wq_config_write(struct idxd_wq *wq) /* bytes 12-15 */ wq->wqcfg->max_xfer_shift = ilog2(wq->max_xfer_bytes); - wq->wqcfg->max_batch_shift = ilog2(wq->max_batch_size); + idxd_wqcfg_set_max_batch_shift(idxd->data->type, wq->wqcfg, ilog2(wq->max_batch_size)); dev_dbg(dev, "WQ %d CFGs\n", wq->id); for (i = 0; i < WQCFG_STRIDES(idxd); i++) { @@ -1204,7 +1204,7 @@ static int idxd_wq_load_config(struct idxd_wq *wq) set_bit(WQ_FLAG_MODE_1, &wq->flags); wq->max_xfer_bytes = 1ULL << wq->wqcfg->max_xfer_shift; - wq->max_batch_size = 1ULL << wq->wqcfg->max_batch_shift; + idxd_wq_set_max_batch_size(idxd->data->type, wq, 1U << wq->wqcfg->max_batch_shift); for (i = 0; i < WQCFG_STRIDES(idxd); i++) { wqcfg_offset = WQCFG_OFFSET(idxd, wq->id, i); diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 7b404bc171e2..4a2bac349b4b 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -585,6 +585,38 @@ static inline int idxd_wq_refcount(struct idxd_wq *wq) return wq->client_count; }; +/* + * Intel IAA does not support batch processing. + * The max batch size of device, max batch size of wq and + * max batch shift of wqcfg should be always 0 on IAA.
+ */ +static inline void idxd_set_max_batch_size(int idxd_type, struct idxd_device *idxd, + u32 max_batch_size) +{ + if (idxd_type == IDXD_TYPE_IAX) + idxd->max_batch_size = 0; + else + idxd->max_batch_size = max_batch_size; +} + +static inline void idxd_wq_set_max_batch_size(int idxd_type, struct idxd_wq *wq, + u32 max_batch_size) +{ + if (idxd_type == IDXD_TYPE_IAX) + wq->max_batch_size = 0; + else + wq->max_batch_size = max_batch_size; +} + +static inline void idxd_wqcfg_set_max_batch_shift(int idxd_type, union wqcfg *wqcfg, + u32 max_batch_shift) +{ + if (idxd_type == IDXD_TYPE_IAX) + wqcfg->max_batch_shift = 0; + else + wqcfg->max_batch_shift = max_batch_shift; +} + #define MODULE_ALIAS_IDXD_DEVICE(type) MODULE_ALIAS("idxd:t" __stringify(type) "*") #define IDXD_DEVICES_MODALIAS_FMT "idxd:t%d" diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 017136aa165f..bd44a998195b 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -179,7 +179,7 @@ static int idxd_setup_wqs(struct idxd_device *idxd) init_completion(&wq->wq_resurrect); INIT_LIST_HEAD(&wq->vdcm_list); wq->max_xfer_bytes = WQ_DEFAULT_MAX_XFER; - wq->max_batch_size = WQ_DEFAULT_MAX_BATCH; + idxd_wq_set_max_batch_size(idxd->data->type, wq, WQ_DEFAULT_MAX_BATCH); wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES; wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev)); if (!wq->wqcfg) { @@ -409,7 +409,7 @@ static void idxd_read_caps(struct idxd_device *idxd) idxd->max_xfer_bytes = 1ULL << idxd->hw.gen_cap.max_xfer_shift; dev_dbg(dev, "max xfer size: %llu bytes\n", idxd->max_xfer_bytes); - idxd->max_batch_size = 1U << idxd->hw.gen_cap.max_batch_shift; + idxd_set_max_batch_size(idxd->data->type, idxd, 1U << idxd->hw.gen_cap.max_batch_shift); dev_dbg(dev, "max batch size: %u\n", idxd->max_batch_size); idxd_check_ims(idxd); if (idxd->hw.gen_cap.config_en) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 2be5fdfa22c2..0c955ecaa50d 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -961,7 +961,7 @@ static ssize_t wq_max_batch_size_store(struct device *dev, struct device_attribu if (batch_size > idxd->max_batch_size) return -EINVAL; - wq->max_batch_size = (u32)batch_size; + idxd_wq_set_max_batch_size(idxd->data->type, wq, (u32)batch_size); return count; } -- Gitee From 718472a7ca78f59de1d93d11db90ff2c4eaa429f Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Wed, 10 Mar 2021 11:02:33 -0800 Subject: [PATCH 2158/2161] x86, sched: Treat Intel SNC topology as default, COD as exception commit 2c88d45edbb89029c1190bb3b136d2602f057c98 upstream. Commit 1340ccfa9a9a ("x86,sched: Allow topologies where NUMA nodes share an LLC") added a vendor and model specific check to never call topology_sane() for Intel Skylake Server systems where NUMA nodes share an LLC. Intel Ice Lake and Sapphire Rapids CPUs also enumerate an LLC that is shared by multiple NUMA nodes. The LLC on these CPUs is shared for off-package data access but private to the NUMA node for on-package access. Rather than managing a list of allowable SNC topologies, make this SNC topology the default, and treat Intel's Cluster-On-Die (COD) topology as the exception. In SNC mode, Sky Lake, Ice Lake, and Sapphire Rapids servers do not emit this warning: sched: CPU #3's llc-sibling CPU #0 is not on the same node! [node: 1 != 0]. Ignoring dependency. Intel-SIG: commit 2c88d45edbb8 x86, sched: Treat Intel SNC topology as default, COD as exception. Backport for avoiding smpboot warning with SNC mode.
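The shape of the new policy can be sketched stand-alone (the model numbers and helpers are illustrative only, not the kernel's x86_match_cpu() table handling): the listed Cluster-on-Die parts keep the legacy treatment, and any other Intel package spanning multiple NUMA nodes is presumed to be SNC.

  #include <stdbool.h>
  #include <stddef.h>

  /* Haswell-X (0x3f) and Broadwell-X (0x4f) are the COD exceptions */
  static const unsigned int intel_cod_models[] = { 0x3f, 0x4f };

  static bool is_cod(unsigned int model)
  {
          size_t i;

          for (i = 0; i < sizeof(intel_cod_models) / sizeof(intel_cod_models[0]); i++)
                  if (intel_cod_models[i] == model)
                          return true;
          return false;
  }

  /* classification used when one package contains more than one NUMA node */
  static const char *numa_in_package_topology(unsigned int model)
  {
          return is_cod(model) ? "COD" : "SNC (default)";
  }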
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Alison Schofield Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dave Hansen Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210310190233.31752-1-alison.schofield@intel.com [ Huaisheng Ye amend commit log ] Signed-off-by: Huaisheng Ye Signed-off-by: Xinghui Li --- arch/x86/kernel/smpboot.c | 90 ++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 44 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 1c48a520754a..8ee307fe07ed 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -458,29 +458,52 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } +static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (c->phys_proc_id == o->phys_proc_id && + c->cpu_die_id == o->cpu_die_id) + return true; + return false; +} + /* - * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs. + * Unlike the other levels, we do not enforce keeping a + * multicore group inside a NUMA node. If this happens, we will + * discard the MC level of the topology later. + */ +static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (c->phys_proc_id == o->phys_proc_id) + return true; + return false; +} + +/* + * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs. * - * These are Intel CPUs that enumerate an LLC that is shared by - * multiple NUMA nodes. The LLC on these systems is shared for - * off-package data access but private to the NUMA node (half - * of the package) for on-package access. + * Any Intel CPU that has multiple nodes per package and does not + * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology. * - * CPUID (the source of the information about the LLC) can only - * enumerate the cache as being shared *or* unshared, but not - * this particular configuration. The CPU in this case enumerates - * the cache to be shared across the entire package (spanning both - * NUMA nodes). + * When in SNC mode, these CPUs enumerate an LLC that is shared + * by multiple NUMA nodes. The LLC is shared for off-package data + * access but private to the NUMA node (half of the package) for + * on-package access. CPUID (the source of the information about + * the LLC) can only enumerate the cache as shared or unshared, + * but not this particular configuration. */ -static const struct x86_cpu_id snc_cpu[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X }, +static const struct x86_cpu_id intel_cod_cpu[] = { + { X86_VENDOR_INTEL, 6, INTEL_FAM6_HASWELL_X, 0, 0, 0}, /* COD */ + { X86_VENDOR_INTEL, 6, INTEL_FAM6_BROADWELL_X, 0, 0, 0}, /* COD */ + { X86_VENDOR_INTEL, 6, 0, 0, 1, 0}, /* SNC */ {} }; static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { + const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu); int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + bool intel_snc = id && id->driver_data; /* Do not match if we do not have a valid APICID for cpu: */ if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID) @@ -495,32 +518,12 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) * means 'c' does not share the LLC of 'o'. This will be * reflected to userspace. */ - if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu)) + if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc) return false; return topology_sane(c, o, "llc"); } -/* - * Unlike the other levels, we do not enforce keeping a - * multicore group inside a NUMA node. 
If this happens, we will - * discard the MC level of the topology later. - */ -static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) -{ - if (c->phys_proc_id == o->phys_proc_id) - return true; - return false; -} - -static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) -{ - if ((c->phys_proc_id == o->phys_proc_id) && - (c->cpu_die_id == o->cpu_die_id)) - return true; - return false; -} - #if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) static inline int x86_sched_itmt_flags(void) @@ -592,14 +595,23 @@ void set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { o = &cpu_data(i); + if (match_pkg(c, o) && !topology_same_node(c, o)) + x86_has_numa_in_package = true; + if ((i == cpu) || (has_smt && match_smt(c, o))) link_mask(topology_sibling_cpumask, cpu, i); if ((i == cpu) || (has_mp && match_llc(c, o))) link_mask(cpu_llc_shared_mask, cpu, i); + if ((i == cpu) || (has_mp && match_die(c, o))) + link_mask(topology_die_cpumask, cpu, i); } + threads = cpumask_weight(topology_sibling_cpumask(cpu)); + if (threads > __max_smt_threads) + __max_smt_threads = threads; + /* * This needs a separate iteration over the cpus because we rely on all * topology_sibling_cpumask links to be set-up. @@ -613,8 +625,7 @@ void set_cpu_sibling_map(int cpu) /* * Does this new cpu bringup a new core? */ - if (cpumask_weight( - topology_sibling_cpumask(cpu)) == 1) { + if (threads == 1) { /* * for each core in package, increment * the booted_cores for this new cpu @@ -631,16 +642,7 @@ void set_cpu_sibling_map(int cpu) } else if (i != cpu && !c->booted_cores) c->booted_cores = cpu_data(i).booted_cores; } - if (match_pkg(c, o) && !topology_same_node(c, o)) - x86_has_numa_in_package = true; - - if ((i == cpu) || (has_mp && match_die(c, o))) - link_mask(topology_die_cpumask, cpu, i); } - - threads = cpumask_weight(topology_sibling_cpumask(cpu)); - if (threads > __max_smt_threads) - __max_smt_threads = threads; } /* maps the cpu to the sched domain representing multi-core */ -- Gitee From 3433fa5e0b1788557147de05182d317a6b9d2de6 Mon Sep 17 00:00:00 2001 From: Sun Ke Date: Wed, 12 May 2021 19:43:30 +0800 Subject: [PATCH 2159/2161] nbd: Fix NULL pointer in flush_workqueue commit 79ebe9110fa458d58f1fceb078e2068d7ad37390 upstream. Open /dev/nbdX first, the config_refs will be 1 and the pointers in nbd_device are still null. Disconnect /dev/nbdX, then reference a null recv_workq. The protection by config_refs in nbd_genl_disconnect is useless. 
[ 656.366194] BUG: kernel NULL pointer dereference, address: 0000000000000020 [ 656.368943] #PF: supervisor write access in kernel mode [ 656.369844] #PF: error_code(0x0002) - not-present page [ 656.370717] PGD 10cc87067 P4D 10cc87067 PUD 1074b4067 PMD 0 [ 656.371693] Oops: 0002 [#1] SMP [ 656.372242] CPU: 5 PID: 7977 Comm: nbd-client Not tainted 5.11.0-rc5-00040-g76c057c84d28 #1 [ 656.373661] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 [ 656.375904] RIP: 0010:mutex_lock+0x29/0x60 [ 656.376627] Code: 00 0f 1f 44 00 00 55 48 89 fd 48 83 05 6f d7 fe 08 01 e8 7a c3 ff ff 48 83 05 6a d7 fe 08 01 31 c0 65 48 8b 14 25 00 6d 01 00 48 0f b1 55 d [ 656.378934] RSP: 0018:ffffc900005eb9b0 EFLAGS: 00010246 [ 656.379350] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [ 656.379915] RDX: ffff888104cf2600 RSI: ffffffffaae8f452 RDI: 0000000000000020 [ 656.380473] RBP: 0000000000000020 R08: 0000000000000000 R09: ffff88813bd6b318 [ 656.381039] R10: 00000000000000c7 R11: fefefefefefefeff R12: ffff888102710b40 [ 656.381599] R13: ffffc900005eb9e0 R14: ffffffffb2930680 R15: ffff88810770ef00 [ 656.382166] FS: 00007fdf117ebb40(0000) GS:ffff88813bd40000(0000) knlGS:0000000000000000 [ 656.382806] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 656.383261] CR2: 0000000000000020 CR3: 0000000100c84000 CR4: 00000000000006e0 [ 656.383819] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 656.384370] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 656.384927] Call Trace: [ 656.385111] flush_workqueue+0x92/0x6c0 [ 656.385395] nbd_disconnect_and_put+0x81/0xd0 [ 656.385716] nbd_genl_disconnect+0x125/0x2a0 [ 656.386034] genl_family_rcv_msg_doit.isra.0+0x102/0x1b0 [ 656.386422] genl_rcv_msg+0xfc/0x2b0 [ 656.386685] ? nbd_ioctl+0x490/0x490 [ 656.386954] ? genl_family_rcv_msg_doit.isra.0+0x1b0/0x1b0 [ 656.387354] netlink_rcv_skb+0x62/0x180 [ 656.387638] genl_rcv+0x34/0x60 [ 656.387874] netlink_unicast+0x26d/0x590 [ 656.388162] netlink_sendmsg+0x398/0x6c0 [ 656.388451] ? netlink_rcv_skb+0x180/0x180 [ 656.388750] ____sys_sendmsg+0x1da/0x320 [ 656.389038] ? ____sys_recvmsg+0x130/0x220 [ 656.389334] ___sys_sendmsg+0x8e/0xf0 [ 656.389605] ? ___sys_recvmsg+0xa2/0xf0 [ 656.389889] ? handle_mm_fault+0x1671/0x21d0 [ 656.390201] __sys_sendmsg+0x6d/0xe0 [ 656.390464] __x64_sys_sendmsg+0x23/0x30 [ 656.390751] do_syscall_64+0x45/0x70 [ 656.391017] entry_SYSCALL_64_after_hwframe+0x44/0xa9 To fix it, just add if (nbd->recv_workq) to nbd_disconnect_and_put(). Fixes: e9e006f5fcf2 ("nbd: fix max number of supported devs") Signed-off-by: Sun Ke Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210512114331.1233964-2-sunke32@huawei.com Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin Signed-off-by: Xinghui Li --- drivers/block/nbd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 950dbc5635fc..751bdff249e3 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -2021,7 +2021,8 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd) * config ref and try to destroy the workqueue from inside the work * queue. 
*/ - flush_workqueue(nbd->recv_workq); + if (nbd->recv_workq) + flush_workqueue(nbd->recv_workq); if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); -- Gitee From 3d1d3ab29c8a2362ae0ec655fb34b299719acb73 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Mon, 12 Jul 2021 16:55:44 +0800 Subject: [PATCH 2160/2161] fbmem: Do not delete the mode that is still in use commit 0af778269a522c988ef0b4188556aba97fb420cc upstream. The execution of fb_delete_videomode() is not based on the result of the previous fbcon_mode_deleted(). As a result, the mode is directly deleted, regardless of whether it is still in use, which may cause UAF. ================================================================== BUG: KASAN: use-after-free in fb_mode_is_equal+0x36e/0x5e0 \ drivers/video/fbdev/core/modedb.c:924 Read of size 4 at addr ffff88807e0ddb1c by task syz-executor.0/18962 CPU: 2 PID: 18962 Comm: syz-executor.0 Not tainted 5.10.45-rc1+ #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ... Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x137/0x1be lib/dump_stack.c:118 print_address_description+0x6c/0x640 mm/kasan/report.c:385 __kasan_report mm/kasan/report.c:545 [inline] kasan_report+0x13d/0x1e0 mm/kasan/report.c:562 fb_mode_is_equal+0x36e/0x5e0 drivers/video/fbdev/core/modedb.c:924 fbcon_mode_deleted+0x16a/0x220 drivers/video/fbdev/core/fbcon.c:2746 fb_set_var+0x1e1/0xdb0 drivers/video/fbdev/core/fbmem.c:975 do_fb_ioctl+0x4d9/0x6e0 drivers/video/fbdev/core/fbmem.c:1108 vfs_ioctl fs/ioctl.c:48 [inline] __do_sys_ioctl fs/ioctl.c:753 [inline] __se_sys_ioctl+0xfb/0x170 fs/ioctl.c:739 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 18960: kasan_save_stack mm/kasan/common.c:48 [inline] kasan_set_track+0x3d/0x70 mm/kasan/common.c:56 kasan_set_free_info+0x17/0x30 mm/kasan/generic.c:355 __kasan_slab_free+0x108/0x140 mm/kasan/common.c:422 slab_free_hook mm/slub.c:1541 [inline] slab_free_freelist_hook+0xd6/0x1a0 mm/slub.c:1574 slab_free mm/slub.c:3139 [inline] kfree+0xca/0x3d0 mm/slub.c:4121 fb_delete_videomode+0x56a/0x820 drivers/video/fbdev/core/modedb.c:1104 fb_set_var+0x1f3/0xdb0 drivers/video/fbdev/core/fbmem.c:978 do_fb_ioctl+0x4d9/0x6e0 drivers/video/fbdev/core/fbmem.c:1108 vfs_ioctl fs/ioctl.c:48 [inline] __do_sys_ioctl fs/ioctl.c:753 [inline] __se_sys_ioctl+0xfb/0x170 fs/ioctl.c:739 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 13ff178ccd6d ("fbcon: Call fbcon_mode_deleted/new_modelist directly") Signed-off-by: Zhen Lei Cc: # v5.3+ Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20210712085544.2828-1-thunder.leizhen@huawei.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xinghui Li --- drivers/video/fbdev/core/fbmem.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index e3807846e548..7e1d00de070b 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -965,13 +965,11 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var) fb_var_to_videomode(&mode2, &info->var); /* make sure we don't delete the videomode of current var */ ret = fb_mode_is_equal(&mode1, &mode2); - - if (!ret) - fbcon_mode_deleted(info, &mode1); - - if (!ret) - fb_delete_videomode(&mode1, &info->modelist); - + if (!ret) { + ret = fbcon_mode_deleted(info, &mode1); + if (!ret) + 
fb_delete_videomode(&mode1, &info->modelist); + } return ret ? -EINVAL : 0; } -- Gitee From 3e5e53d7d447919ebe999476f820f82161d4805b Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Wed, 24 May 2023 17:02:34 +0800 Subject: [PATCH 2161/2161] fuse: fix one null ptr issue in fuse_dev_ioctl commit opencloudos. When cmd is FUSE_DEV_IOC_RECOVERY, fud or fud->fc may be NULL, which would lead to a NULL pointer dereference. Check both pointers before dereferencing them. Fixes: e1c207b3e7cdfd98 ("fuse: add a dev ioctl for recovery") Signed-off-by: Xinghui Li --- fs/fuse/dev.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index adc8e6b5ebab..8f1f0548f4f1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2257,12 +2257,18 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, } if (cmd == FUSE_DEV_IOC_RECOVERY) { struct fuse_dev *fud = fuse_get_dev(file); - struct fuse_iqueue *fiq = &fud->fc->iq; - struct fuse_pqueue *fpq = &fud->pq; + struct fuse_iqueue *fiq = NULL; + struct fuse_pqueue *fpq = NULL; struct fuse_req *req, *next; LIST_HEAD(recovery); unsigned int i; + if (fud && fud->fc) { + fiq = &fud->fc->iq; + fpq = &fud->pq; + } else + return -ENOMEM; + spin_lock(&fpq->lock); for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) list_splice_tail_init(&fpq->processing[i], @@ -2278,7 +2284,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, list_splice(&recovery, &fiq->pending); spin_unlock(&fiq->lock); err = 0; - } + } return err; } -- Gitee